OpenBLAS-0.2.20/000077500000000000000000000000001313527062700131445ustar00rootroot00000000000000OpenBLAS-0.2.20/.gitignore000066400000000000000000000027221313527062700151370ustar00rootroot00000000000000*.obj *.lib *.dll *.dylib *.def *.o *.out lapack-3.1.1 lapack-3.1.1.tgz lapack-3.4.1 lapack-3.4.1.tgz lapack-3.4.2 lapack-3.4.2.tgz lapack-netlib/make.inc lapack-netlib/lapacke/include/lapacke_mangling.h lapack-netlib/TESTING/testing_results.txt lapack-netlib/INSTALL/test* lapack-netlib/TESTING/xeigtstc lapack-netlib/TESTING/xeigtstd lapack-netlib/TESTING/xeigtsts lapack-netlib/TESTING/xeigtstz lapack-netlib/TESTING/xlintstc lapack-netlib/TESTING/xlintstd lapack-netlib/TESTING/xlintstds lapack-netlib/TESTING/xlintstrfc lapack-netlib/TESTING/xlintstrfd lapack-netlib/TESTING/xlintstrfs lapack-netlib/TESTING/xlintstrfz lapack-netlib/TESTING/xlintsts lapack-netlib/TESTING/xlintstz lapack-netlib/TESTING/xlintstzc *.so *.so.* *.a .svn *~ lib.grd nohup.out config.h config_kernel.h Makefile.conf Makefile.conf_last Makefile_kernel.conf config_last.h getarch getarch_2nd utest/openblas_utest ctest/xccblat1 ctest/xccblat2 ctest/xccblat3 ctest/xdcblat1 ctest/xdcblat2 ctest/xdcblat3 ctest/xscblat1 ctest/xscblat2 ctest/xscblat3 ctest/xzcblat1 ctest/xzcblat2 ctest/xzcblat3 exports/linktest.c exports/linux.def kernel/setparam_*.c kernel/kernel_*.h test/CBLAT2.SUMM test/CBLAT3.SUMM test/DBLAT2.SUMM test/DBLAT3.SUMM test/SBLAT2.SUMM test/SBLAT3.SUMM test/ZBLAT2.SUMM test/ZBLAT3.SUMM test/cblat1 test/cblat2 test/cblat3 test/dblat1 test/dblat2 test/dblat3 test/sblat1 test/sblat2 test/sblat3 test/zblat1 test/zblat2 test/zblat3 build build.* *.swp benchmark/*.goto benchmark/smallscaling OpenBLAS-0.2.20/.travis.yml000066400000000000000000000030441313527062700152560ustar00rootroot00000000000000language: c notifications: webhooks: urls: - https://webhooks.gitter.im/e/8a6e4470a0cebd090344 on_success: change # options: [always|never|change] default: always on_failure: always # options: [always|never|change] default: always on_start: never # options: [always|never|change] default: always compiler: - gcc env: - TARGET_BOX=LINUX64 BTYPE="BINARY=64" - TARGET_BOX=LINUX64 BTYPE="BINARY=64 USE_OPENMP=1" - TARGET_BOX=LINUX64 BTYPE="BINARY=64 INTERFACE64=1" - TARGET_BOX=LINUX32 BTYPE="BINARY=32" - TARGET_BOX=WIN64 BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" before_install: - sudo apt-get update -qq - sudo apt-get install -qq gfortran - if [[ "$TARGET_BOX" == "WIN64" ]]; then sudo apt-get install -qq binutils-mingw-w64-x86-64 gcc-mingw-w64-x86-64 gfortran-mingw-w64-x86-64; fi - if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi script: - set -e - make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C test DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C ctest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C utest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi # whitelist branches: only: - master - developOpenBLAS-0.2.20/BACKERS.md000066400000000000000000000011751313527062700145440ustar00rootroot00000000000000Thank you for the support. 
### [2013.8] [Testbed for OpenBLAS project](https://www.bountysource.com/fundraisers/443-testbed-for-openblas-project) https://www.bountysource.com/fundraisers/443-testbed-for-openblas-project/pledges In chronological order: * aeberspaecher * fmolina * saullocastro * xianyi * cuda * carter * StefanKarpinski * staticfloat * sebastien-villemot * JeffBezanson * ihnorton * simonp0420 * andrioni * Tim Holy * ivarne * johnmyleswhite * traz * Jean-Francis Roy * bkalpert * Anirban * pgermain * alexandre.lacoste.18 * foges * ssam * WestleyArgentum * daniebmariani * pjpuglia * albarrentine * Alexander Vogt OpenBLAS-0.2.20/CMakeLists.txt000066400000000000000000000232441313527062700157110ustar00rootroot00000000000000## ## Author: Hank Anderson ## cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 2) set(OpenBLAS_PATCH_VERSION 20) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") enable_language(ASM) enable_language(C) # Adhere to GNU filesystem layout conventions include(GNUInstallDirs) if(MSVC) set(OpenBLAS_LIBNAME libopenblas) else() set(OpenBLAS_LIBNAME openblas) endif() ####### if(MSVC) option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) endif() option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF) option(BUILD_DEBUG "Build Debug Version" OFF) ####### if(BUILD_WITHOUT_LAPACK) set(NO_LAPACK 1) set(NO_LAPACKE 1) endif() if(CMAKE_CONFIGURATION_TYPES) # multiconfig generator? set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "" FORCE) set(CMAKE_BUILD_TYPE Debug Debug Release Release ) else() if( NOT CMAKE_BUILD_TYPE ) if(BUILD_DEBUG) set(CMAKE_BUILD_TYPE Debug) else() set(CMAKE_BUILD_TYPE Release) endif() endif() endif() if(BUILD_WITHOUT_CBLAS) set(NO_CBLAS 1) endif() ####### message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.") include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") set(BLASDIRS interface driver/level2 driver/level3 driver/others) if (NOT DYNAMIC_ARCH) list(APPEND BLASDIRS kernel) endif () if (DEFINED SANITY_CHECK) list(APPEND BLASDIRS reference) endif () set(SUBDIRS ${BLASDIRS}) if (NOT NO_LAPACK) list(APPEND SUBDIRS lapack) endif () # set which float types we want to build for if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) # if none are defined, build for all set(BUILD_SINGLE true) set(BUILD_DOUBLE true) set(BUILD_COMPLEX true) set(BUILD_COMPLEX16 true) endif () set(FLOAT_TYPES "") if (BUILD_SINGLE) message(STATUS "Building Single Precision") list(APPEND FLOAT_TYPES "SINGLE") # defines nothing endif () if (BUILD_DOUBLE) message(STATUS "Building Double Precision") list(APPEND FLOAT_TYPES "DOUBLE") # defines DOUBLE endif () if (BUILD_COMPLEX) message(STATUS "Building Complex Precision") list(APPEND FLOAT_TYPES "COMPLEX") # defines COMPLEX endif () if (BUILD_COMPLEX16) message(STATUS "Building Double Complex Precision") list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE endif () set(SUBDIRS_ALL ${SUBDIRS} test ctest utest exports benchmark ../laswp ../bench) # all :: libs netlib tests shared # libs : if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") message(FATAL_ERROR "Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. 
Please read README for details.") endif () if (${NO_STATIC} AND ${NO_SHARED}) message(FATAL_ERROR "Neither static nor shared are enabled.") endif () #Set default output directory set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) if(MSVC) set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug) set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release) endif () # get obj vars into format that add_library likes: $ (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html) set(TARGET_OBJS "") foreach (SUBDIR ${SUBDIRS}) add_subdirectory(${SUBDIR}) string(REPLACE "/" "_" subdir_obj ${SUBDIR}) list(APPEND TARGET_OBJS "$") endforeach () # netlib: # Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke. # Not using add_subdirectory here because lapack-netlib already has its own CMakeLists.txt. Instead include a cmake script with the sources we want. if (NOT NOFORTRAN AND NOT NO_LAPACK) include("${PROJECT_SOURCE_DIR}/cmake/lapack.cmake") if (NOT NO_LAPACKE) include("${PROJECT_SOURCE_DIR}/cmake/lapacke.cmake") endif () endif () # Only generate .def for dll on MSVC and always produce pdb files for debug and release if(MSVC) set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi") set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF") endif() # add objects to the openblas lib add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") # Set output for libopenblas set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG ) set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) endforeach() enable_testing() add_subdirectory(utest) if (NOT MSVC) #only build shared library for MSVC add_library(${OpenBLAS_LIBNAME}_static STATIC ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS}) set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME}) set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) if(SMP) target_link_libraries(${OpenBLAS_LIBNAME} pthread) target_link_libraries(${OpenBLAS_LIBNAME}_static pthread) endif() #build test and ctest add_subdirectory(test) if(NOT NO_CBLAS) add_subdirectory(ctest) endif() endif() set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION} SOVERSION ${OpenBLAS_MAJOR_VERSION} ) # TODO: Why is the config saved here? Is this necessary with CMake? 
#Save the config files for installation # @cp Makefile.conf Makefile.conf_last # @cp config.h config_last.h #ifdef QUAD_PRECISION # @echo "#define QUAD_PRECISION">> config_last.h #endif #ifeq ($(EXPRECISION), 1) # @echo "#define EXPRECISION">> config_last.h #endif ### #ifeq ($(DYNAMIC_ARCH), 1) # @$(MAKE) -C kernel commonlibs || exit 1 # @for d in $(DYNAMIC_CORE) ; \ # do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ # done # @echo DYNAMIC_ARCH=1 >> Makefile.conf_last #endif #ifdef USE_THREAD # @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last #endif # @touch lib.grd # Install project # Install libraries install(TARGETS ${OpenBLAS_LIBNAME} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) # Install include files set (GENCONFIG_BIN ${CMAKE_BINARY_DIR}/gen_config_h${CMAKE_EXECUTABLE_SUFFIX}) ADD_CUSTOM_COMMAND( OUTPUT ${CMAKE_BINARY_DIR}/openblas_config.h DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h ) ADD_CUSTOM_TARGET(genconfig ALL DEPENDS openblas_config.h ) add_dependencies(genconfig ${OpenBLAS_LIBNAME}) install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") ADD_CUSTOM_TARGET(genf77blas ALL COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h ) add_dependencies(genf77blas ${OpenBLAS_LIBNAME}) install (FILES ${CMAKE_BINARY_DIR}/f77blas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) if(NOT NO_CBLAS) message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") ADD_CUSTOM_TARGET(gencblas ALL COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp" COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h" DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h ) add_dependencies(gencblas ${OpenBLAS_LIBNAME}) install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() if(NOT NO_LAPACKE) message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}") add_dependencies( ${OpenBLAS_LIBNAME} genlapacke) FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h") install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) ADD_CUSTOM_TARGET(genlapacke COMMAND cp ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" ) install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() if(NOT MSVC) install (TARGETS ${OpenBLAS_LIBNAME}_static DESTINATION ${CMAKE_INSTALL_LIBDIR}) endif() include(FindPkgConfig QUIET) if(PKG_CONFIG_FOUND) configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY) install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) endif() OpenBLAS-0.2.20/CONTRIBUTORS.md000066400000000000000000000146051313527062700154310ustar00rootroot00000000000000# Contributions to the OpenBLAS project ## Creator & Maintainer * Zhang 
Xianyi ## Active Developers * Wang Qian * Optimize BLAS3 on ICT Loongson 3A. * Optimize BLAS3 on Intel Sandy Bridge. * Werner Saar * [2013-03-04] Optimize AVX and FMA4 DGEMM on AMD Bulldozer * [2013-04-27] Optimize AVX and FMA4 TRSM on AMD Bulldozer * [2013-06-09] Optimize AVX and FMA4 SGEMM on AMD Bulldozer * [2013-06-11] Optimize AVX and FMA4 ZGEMM on AMD Bulldozer * [2013-06-12] Optimize AVX and FMA4 CGEMM on AMD Bulldozer * [2013-06-16] Optimize dgemv_n kernel on AMD Bulldozer * [2013-06-20] Optimize ddot, daxpy kernel on AMD Bulldozer * [2013-06-21] Optimize dcopy kernel on AMD Bulldozer * Porting and Optimization on ARM Cortex-A9 * Optimization on AMD Piledriver * Optimization on Intel Haswell ## Previous Developers * Zaheer Chothia * Improve the compatibility about complex number * Build LAPACKE: C interface to LAPACK * Improve the windows build. * Chen Shaohu * Optimize GEMV on the Loongson 3A processor. * Luo Wen * Intern. Test Level-2 BLAS. ## Contributors In chronological order: * pipping * [2011-06-11] Make USE_OPENMP=0 disable openmp. * Stefan Karpinski * [2011-12-28] Fix a bug about SystemStubs on Mac OS X. * Alexander Eberspächer * [2012-05-02] Add note on patch for segfaults on Linux kernel 2.6.32. * Mike Nolta * [2012-05-19] Fix building bug on FreeBSD and NetBSD. * Sylvestre Ledru * [2012-07-01] Improve the detection of sparc. Fix building bug under Hurd and kfreebsd. * Jameson Nash * [2012-08-20] Provide support for passing CFLAGS, FFLAGS, PFLAGS, FPFLAGS to make on the command line. * Alexander Nasonov * [2012-11-10] Fix NetBSD build. * Sébastien Villemot * [2012-11-14] Fix compilation with TARGET=GENERIC. Patch applied to Debian package. * [2013-08-28] Avoid failure on qemu guests declaring an Athlon CPU without 3dnow! * Kang-Che Sung * [2013-05-17] Fix typo in the document. Re-order the architecture list in getarch.c. * Kenneth Hoste * [2013-05-22] Adjust Makefile about downloading LAPACK source files. * Lei WANG * [2013-05-22] Fix a bug about wget. * Dan Luu * [2013-06-30] Add Intel Haswell support (using sandybridge optimizations). * grisuthedragon * [2013-07-11] create openblas_get_parallel to retrieve information which parallelization model is used by OpenBLAS. * Elliot Saba * [2013-07-22] Add in return value for `interface/trtri.c` * Sébastien Fabbro * [2013-07-24] Modify makefile to respect user's LDFLAGS * [2013-07-24] Add stack markings for GNU as arch-independent for assembler files * Viral B. Shah * [2013-08-21] Patch LAPACK XLASD4.f as discussed in JuliaLang/julia#2340 * Lars Buitinck * [2013-08-28] get rid of the generated cblas_noconst.h file * [2013-08-28] Missing threshold in gemm.c * [2013-08-28] fix default prefix handling in makefiles * yieldthought * [2013-10-08] Remove -Wl,--retain-symbols-file from dynamic link line to fix tool support * Keno Fischer * [2013-10-23] Use FC instead of CC to link the dynamic library on OS X * Christopher Meng * [2013-12-09] Add DESTDIR support for easier building on RPM based distros. Use install command instead of cp to install files with permissions control. * Lucas Beyer * [2013-12-10] Added support for NO_SHARED in make install. * carlkl * [2013-12-13] Fixed LAPACKE building bug on Windows * Isaac Dunham * [2014-08-03] Fixed link error on Linux/musl * Dave Nuechterlein * [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1). ARMv8 support. 
* Jerome Robert * [2015-01-01] Speed-up small `ger` and `gemv` using stack allocation (bug #478) * [2015-12-23] `stack_check` in `gemv.c` (bug #722) * [2015-12-28] Allow to force the number of parallel make job * [2015-12-28] Fix detection of AMD E2-3200 detection * [2015-12-31] Let `make MAX_STACK_ALLOC=0` do what expected * [2016-01-19] Disable multi-threading in `ger` and `swap` for small matrices (bug #731) * [2016-01-24] Use `GEMM_MULTITHREAD_THRESHOLD` as a number of ops (bug #742) * [2016-01-26] Let `openblas_get_num_threads` return the number of active threads (bug #760) * [2016-01-30] Speed-up small `zger`, `zgemv`, `ztrmv` using stack allocation (bug #727) * Dan Kortschak * [2015-01-07] Added test for drotmg bug #484. * Ton van den Heuvel * [2015-03-18] Fix race condition during shutdown causing a crash in gotoblas_set_affinity(). * Martin Koehler * [2015-09-07] Improved imatcopy * Ashwin Sekhar T K * [2015-11-09] Assembly kernels for Cortex-A57 (ARMv8) * [2015-11-20] lapack-test fixes for Cortex-A57 * [2016-03-14] Additional functional Assembly Kernels for Cortex-A57 * [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57 * theoractice * [2016-03-20] Fix compiler error in VisualStudio with CMake * [2016-03-22] Fix access violation on Windows while static linking * Paul Mustière * [2016-02-04] Fix Android build on ARMV7 * [2016-04-26] Android build with LAPACK for ARMV7 & ARMV8 * Shivraj Patil * [2016-05-03] DGEMM optimization for MIPS P5600 and I6400 using MSA * Kaustubh Raste * [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA * [2016-05-20] STRSM optimization for MIPS P5600 and I6400 using MSA * Abdelrauf * [2017-01-01] dgemm and dtrmm kernels for IBM z13 * [2017-02-26] ztrmm kernel for IBM z13 * [2017-03-13] strmm and ctrmm kernel for IBM z13 OpenBLAS-0.2.20/Changelog.txt000066400000000000000000000534361313527062700156070ustar00rootroot00000000000000OpenBLAS ChangeLog ==================================================================== Version 0.2.20 24-Jul-2017 common: * Improved CMake support * Fixed several thread race and locking bugs * Fixed default LAPACK optimization level * Updated LAPACK to 3.7.0 * Added ReLAPACK (https://github.com/HPAC/ReLAPACK, make BUILD_RELAPACK=1) POWER: * Optimizations for Power9 * Fixed several Power8 assembly bugs ARM: * New optimized Vulcan and ThunderX2T99 targets * Support for ARMV7 SOFT_FP ABI (make ARM_SOFTFP_ABI=1) * Detect all cpu cores including offline ones * Fix compilation with CLANG * Support building a shared library for Android MIPS: * Fixed several threading issues * Fix compilation with CLANG x86_64: * Detect Intel Bay Trail and Apollo Lake * Detect Intel Sky Lake and Kaby Lake * Detect Intel Knights Landing * Detect AMD A8, A10, A12 and Ryzen * Support 64bit builds with Visual Studio * Fix building with Intel and PGI compilers * Fix building with MINGW and TDM-GCC * Fix cmake builds for Haswell and related cpus * Fix building for Sandybridge with CLANG 3.9 * Add support for the FLANG compiler IBM Z: * New target z13 with BLAS3 optimizations ==================================================================== Version 0.2.19 1-Sep-2016 common: * Improved cross compiling. * Fix the bug on musl libc. POWER: * Optimize BLAS on Power8 * Fixed Julia+OpenBLAS bugs on Power8 MIPS: * Optimize BLAS on MIPS P5600 and I6400 (Thanks, Shivraj Patil, Kaustubh Raste) ARM: * Improved on ARM Cortex-A57. 
(Thanks, Ashwin Sekhar T K) ==================================================================== Version 0.2.18 12-Apr-2016 common: * If you set MAKE_NB_JOBS flag less or equal than zero, make will be without -j. x86/x86_64: * Support building Visual Studio static library. (#813, Thanks, theoractice) * Fix bugs to pass buidbot CI tests (http://build.openblas.net) ARM: * Provide DGEMM 8x4 kernel for Cortex-A57 (Thanks, Ashwin Sekhar T K) POWER: * Optimize S and C BLAS3 on Power8 * Optimize BLAS2/1 on Power8 ==================================================================== Version 0.2.17 20-Mar-2016 common: * Enable BUILD_LAPACK_DEPRECATED=1 by default. ==================================================================== Version 0.2.16 15-Mar-2016 common: * Avoid potential getenv segfault. (#716) * Import LAPACK svn bugfix #142-#147,#150-#155 x86/x86_64: * Optimize c/zgemv for AMD Bulldozer, Piledriver, Steamroller * Fix bug with scipy linalg test. ARM: * Improve DGEMM for ARM Cortex-A57. (Thanks, Ashwin Sekhar T K) POWER: * Optimize D and Z BLAS3 functions for Power8. ==================================================================== Version 0.2.16.rc1 23-Feb-2016 common: * Upgrade LAPACK to 3.6.0 version. Add BUILD_LAPACK_DEPRECATED option in Makefile.rule to build LAPACK deprecated functions. * Add MAKE_NB_JOBS option in Makefile. Force number of make jobs.This is particularly useful when using distcc. (#735. Thanks, Jerome Robert.) * Redesign unit test. Run unit/regression test at every build (Travis-CI and Appveyor). * Disable multi-threading for small size swap and ger. (#744. Thanks, Jerome Robert) * Improve small zger, zgemv, ztrmv using stack alloction (#727. Thanks, Jerome Robert) * Let openblas_get_num_threads return the number of active threads. (#760. Thanks, Jerome Robert) * Support illumos(OmniOS). (#749. Thanks, Lauri Tirkkonen) * Fix LAPACK Dormbr, Dormlq bug. (#711, #713. Thanks, Brendan Tracey) * Update scipy benchmark script. (#745. Thanks, John Kirkham) x86/x86_64: * Optimize trsm kernels for AMD Bulldozer, Piledriver, Steamroller. * Detect Intel Avoton. * Detect AMD Trinity, Richland, E2-3200. * Fix gemv performance bug on Mac OSX Intel Haswell. * Fix some bugs with CMake and Visual Studio ARM: * Support and optimize Cortex-A57 AArch64. (#686. Thanks, Ashwin Sekhar TK) * Fix Android build on ARMV7 (#778. Thanks, Paul Mustiere) * Update ARMV6 kernels. POWER: * Fix detection of POWER architecture (#684. Thanks, Sebastien Villemot) ==================================================================== Version 0.2.15 27-Oct-2015 common: * Support cmake on x86/x86-64. Natively compiling on MS Visual Studio. (experimental. Thank Hank Anderson for the initial cmake porting work.) On Linux and Mac OSX, OpenBLAS cmake supports assembly kernels. e.g. cmake . make make test (Optional) On Windows MS Visual Studio, OpenBLAS cmake only support C kernels. (OpenBLAS uses AT&T style assembly, which is not supported by MSVC.) e.g. cmake -G "Visual Studio 12 Win64" . Open OpenBLAS.sln and build. * Enable MAX_STACK_ALLOC flags by default. Improve ger and gemv for small matrices. * Improve gemv parallel with small m and large n case. * Improve ?imatcopy when lda==ldb (#633. Thanks, Martin Koehler) * Add vecLib benchmarks (#565. Thanks, Andreas Noack.) * Fix LAPACK lantr for row major matrices (#634. Thanks, Dan Kortschak) * Fix LAPACKE lansy (#640. Thanks, Dan Kortschak) * Import bug fixes for LAPACKE s/dormlq, c/zunmlq * Raise the signal when pthread_create fails (#668. Thanks, James K. 
Lowden) * Remove g77 from compiler list. * Enable AppVeyor Windows CI. x86/x86-64: * Support pure C generic kernels for x86/x86-64. * Support Intel Boardwell and Skylake by Haswell kernels. * Support AMD Excavator by Steamroller kernels. * Optimize s/d/c/zdot for Intel SandyBridge and Haswell. * Optimize s/d/c/zdot for AMD Piledriver and Steamroller. * Optimize s/d/c/zapxy for Intel SandyBridge and Haswell. * Optimize s/d/c/zapxy for AMD Piledriver and Steamroller. * Optimize d/c/zscal for Intel Haswell, dscal for Intel SandyBridge. * Optimize d/c/zscal for AMD Bulldozer, Piledriver and Steamroller. * Optimize s/dger for Intel SandyBridge. * Optimize s/dsymv for Intel SandyBridge. * Optimize ssymv for Intel Haswell. * Optimize dgemv for Intel Nehalem and Haswell. * Optimize dtrmm for Intel Haswell. ARM: * Support Android NDK armeabi-v7a-hard ABI (-mfloat-abi=hard) e.g. make HOSTCC=gcc CC=arm-linux-androideabi-gcc NO_LAPACK=1 TARGET=ARMV7 * Fix lock, rpcc bugs (#616, #617. Thanks, Grazvydas Ignotas) POWER: * Support ppc64le platform (ELF ABI v2. #612. Thanks, Matthew Brandyberry.) * Support POWER7/8 by POWER6 kernels. (#612. Thanks, Fábio Perez.) ==================================================================== Version 0.2.14 24-Mar-2015 common: * Improve OpenBLASConfig.cmake. (#474, #475. Thanks, xantares.) * Improve ger and gemv for small matrices by stack allocation. e.g. make -DMAX_STACK_ALLOC=2048 (#482. Thanks, Jerome Robert.) * Introduce openblas_get_num_threads and openblas_get_num_procs. (#497. Thanks, Erik Schnetter.) * Add ATLAS-style ?geadd function. (#509. Thanks, Martin Köhler.) * Fix c/zsyr bug with negative incx. (#492.) * Fix race condition during shutdown causing a crash in gotoblas_set_affinity(). (#508. Thanks, Ton van den Heuvel.) x86/x86-64: * Support AMD Streamroller. ARM: * Add Cortex-A9 and Cortex-A15 targets. ==================================================================== Version 0.2.13 3-Dec-2014 common: * Add SYMBOLPREFIX and SYMBOLSUFFIX makefile options for adding a prefix or suffix to all exported symbol names in the shared library.(#459, Thanks Tony Kelman) * Provide OpenBLASConfig.cmake at installation. * Fix Fortran compiler detection on FreeBSD. (#470, Thanks Mike Nolta) x86/x86-64: * Add generic kernel files for x86-64. make TARGET=GENERIC * Fix a bug of sgemm kernel on Intel Sandy Bridge. * Fix c_check bug on some amd64 systems. (#471, Thanks Mike Nolta) ARM: * Support APM's X-Gene 1 AArch64 processors. Optimize trmm and sgemm. (#465, Thanks Dave Nuechterlein) ==================================================================== Version 0.2.12 13-Oct-2014 common: * Added CBLAS interface for ?omatcopy and ?imatcopy. * Enable ?gemm3m functions. * Added benchmark for ?gemm3m. * Optimized multithreading lower limits. * Disabled SYMM3M and HEMM3M functions because of segment violations. x86/x86-64: * Improved axpy and symv performance on AMD Bulldozer. * Improved gemv performance on modern Intel and AMD CPUs. ==================================================================== Version 0.2.11 18-Aug-2014 common: * Added some benchmark codes. * Fix link error on Linux/musl.(Thanks Isaac Dunham) x86/x86-64: * Improved s/c/zgemm performance for Intel Haswell. * Improved s/d/c/zgemv performance. * Support the big numa machine.(EXPERIMENT) ARM: * Fix detection when cpuinfo uses "Processor". (Thanks Isaiah) ==================================================================== Version 0.2.10 16-Jul-2014 common: * Added BLAS extensions as following. 
s/d/c/zaxpby, s/d/c/zimatcopy, s/d/c/zomatcopy. * Added OPENBLAS_CORETYPE environment for dynamic_arch. (a86d34) * Added NO_AVX2 flag for old binutils. (#401) * Support outputing the CPU corename on runtime.(#407) * Patched LAPACK to fix bug 114, 117, 118. (http://www.netlib.org/lapack/bug_list.html) * Disabled ?gemm3m for a work-around fix. (#400) x86/x86-64: * Fixed lots of bugs for optimized kernels on sandybridge,Haswell, bulldozer, and piledriver. https://github.com/xianyi/OpenBLAS/wiki/Fixed-optimized-kernels-To-do-List ARM: * Improved LAPACK testing. ==================================================================== Version 0.2.9 10-Jun-2014 common: * Improved the result for LAPACK testing. (#372) * Installed DLL to prefix/bin instead of prefix/lib. (#366) * Build import library on Windows.(#374) x86/x86-64: * To improve LAPACK testing, we fallback some kernels. (#372) https://github.com/xianyi/OpenBLAS/wiki/Fixed-optimized-kernels-To-do-List ==================================================================== Version 0.2.9.rc2 06-Mar-2014 common: * Added OPENBLAS_VERBOSE environment variable.(#338) * Make OpenBLAS thread-pool resilient to fork via pthread_atfork. (#294, Thank Olivier Grisel) * Rewrote rotmg * Fixed sdsdot bug. x86/x86-64: * Detect Intel Haswell for new Macbook. ==================================================================== Version 0.2.9.rc1 13-Jan-2013 common: * Update LAPACK to 3.5.0 version * Fixed compatiable issues with Clang and Pathscale compilers. x86/x86-64: * Optimization on Intel Haswell. * Enable optimization kernels on AMD Bulldozer and Piledriver. ARM: * Support ARMv6 and ARMv7 ISA. * Optimization on ARM Cortex-A9. ==================================================================== Version 0.2.8 01-Aug-2013 common: * Support Open64 5.0. (#266) * Add executable stack markings. (#262, Thank Sébastien Fabbro) * Respect user's LDFLAGS (Thank Sébastien Fabbro) x86/x86-64: * Rollback bulldozer and piledriver kernels to barcelona kernels (#263) We will fix the compuational error bug in bulldozer and piledriver kernels. ==================================================================== Version 0.2.7 20-Jul-2013 common: * Support LSB (Linux Standard Base) 4.1. e.g. make CC=lsbcc * Include LAPACK 3.4.2 source codes to the repo. Avoid downloading at compile time. * Add NO_PARALLEL_MAKE flag to disable parallel make. * Create openblas_get_parallel to retrieve information which parallelization model is used by OpenBLAS. (Thank grisuthedragon) * Detect LLVM/Clang compiler. The default compiler is Clang on Mac OS X. * Change LIBSUFFIX from .lib to .a on windows. * A work-around for dtrti_U single thread bug. Replace it with LAPACK codes. (#191) x86/x86-64: * Optimize c/zgemm, trsm, dgemv_n, ddot, daxpy, dcopy on AMD Bulldozer. (Thank Werner Saar) * Add Intel Haswell support (using Sandybridge optimizations). (Thank Dan Luu) * Add AMD Piledriver support (using Bulldozer optimizations). * Fix the computational error in zgemm avx kernel on Sandybridge. (#237) * Fix the overflow bug in gemv. * Fix the overflow bug in multi-threaded BLAS3, getrf when NUM_THREADS is very large.(#214, #221, #246). MIPS64: * Support loongcc (Open64 based) compiler for ICT Loongson 3A/B. Power: * Support Power7 by old Power6 kernels. (#220) ==================================================================== Version 0.2.6 2-Mar-2013 common: * Improved OpenMP performance slightly. 
(d744c9) * Improved cblas.h compatibility with Intel MKL.(#185) * Fixed the overflowing bug in single thread cholesky factorization. * Fixed the overflowing buffer bug of multithreading hbmv and sbmv.(#174) x86/x86-64: * Added AMD Bulldozer x86-64 S/DGEMM AVX kernels. (Thank Werner Saar) We will tune the performance in future. * Auto-detect Intel Xeon E7540. * Fixed the overflowing buffer bug of gemv. (#173) * Fixed the bug of s/cdot about invalid reading NAN on x86_64. (#189) MIPS64: ==================================================================== Version 0.2.5 26-Nov-2012 common: * Added NO_SHARED flag to disable generating the shared library. * Compile LAPACKE with ILP64 modle when INTERFACE64=1 (#158) * Export LAPACK 3.4.2 symbols in shared library. (#147) * Only detect the number of physical CPU cores on Mac OSX. (#157) * Fixed NetBSD build. (#155) * Fixed compilation with TARGET=GENERIC. (#160) x86/x86-64: * Restore the original CPU affinity when calling openblas_set_num_threads(1) (#153) * Fixed a SEGFAULT bug in dgemv_t when m is very large.(#154) MIPS64: ==================================================================== Version 0.2.4 8-Oct-2012 common: * Upgraded LAPACK to 3.4.2 version. (#145) * Provided support for passing CFLAGS, FFLAGS, PFLAGS, FPFLAGS to make. (#137) * f77blas.h:compatibility for compilers without C99 complex number support. (#141) x86/x86-64: * Added NO_AVX flag. Check OS supporting AVX on runtime. (#139) * Fixed zdot incompatibility ABI issue with GCC 4.7 on Windows 32-bit. (#140) MIPS64: * Fixed the generation of shared library bug. * Fixed the detection bug on the Loongson 3A server. ==================================================================== Version 0.2.3 20-Aug-2012 common: * Fixed LAPACK unstable bug about ?laswp. (#130) * Fixed the shared library bug about unloading the library on Linux (#132). * Fixed the compilation failure on BlueGene/P (TARGET=PPC440FP2) Please use gcc and IBM xlf. (#134) x86/x86-64: * Supported goto_set_num_threads and openblas_set_num_threads APIs in Windows. They can set the number of threads on runtime. ==================================================================== Version 0.2.2 6-July-2012 common: * Fixed exporting DLL functions bug on Windows/MingW * Support GNU Hurd (Thank Sylvestre Ledru) * Support kfreebsd kernel (Thank Sylvestre Ledru) x86/x86-64: * Support Intel Sandy Bridge 22nm desktop/mobile CPU SPARC: * Improve the detection of SPARC (Thank Sylvestre Ledru) ==================================================================== Version 0.2.1 30-Jun-2012 common: x86/x86-64: * Fixed the SEGFAULT bug about hyper-theading * Support AMD Bulldozer by using GotoBLAS2 AMD Barcelona codes ==================================================================== Version 0.2.0 26-Jun-2012 common: * Removed the limitation (64) of numbers of CPU cores. Now, it supports 256 cores at max. * Supported clang compiler. * Fixed some build bugs on FreeBSD x86/x86-64: * Optimized Level-3 BLAS on Intel Sandy Bridge x86-64 by AVX instructions. Please use gcc >= 4.6 or clang >=3.1. * Support AMD Bobcat by using GotoBLAS2 AMD Barcelona codes. ==================================================================== Version 0.1.1 29-Apr-2012 common: * Upgraded LAPACK to 3.4.1 version. (Thank Zaheer Chothia) * Supported LAPACKE, a C interface to LAPACKE. (Thank Zaheer Chothia) * Fixed the build bug (MD5 and download) on Mac OSX. * Auto download CUnit 2.1.2-2 from SF.net with UTEST_CHECK=1. 
* Fxied the compatibility issue for compilers without C99 complex number (e.g. Visual Studio) x86/x86_64: * Auto-detect Intel Sandy Bridge Core i7-3xxx & Xeon E7 Westmere-EX. * Test alpha=Nan in dscale. * Fixed a SEGFAULT bug in samax on x86 windows. ==================================================================== Version 0.1.0 23-Mar-2012 common: * Set soname of shared library on Linux. * Added LIBNAMESUFFIX flag in Makefile.rule. The user can use this flag to control the library name, e.g. libopenblas.a, libopenblas_ifort.a or libopenblas_omp.a. * Added GEMM_MULTITHREAD_THRESHOLD flag in Makefile.rule. The lib use single thread in GEMM function with small matrices. x86/x86_64: * Used GEMV SSE/SSE2 kernels on x86 32-bit. * Exported CBLAS functions in Windows DLL. MIPS64: * Completed Level-3 BLAS optimization on Loongson 3A CPU. * Improved GEMV performance on Loongson 3A CPU. * Improved Level-3 BLAS performance on Loongson 3B CPU. (EXPERIMENT) ==================================================================== Version 0.1 alpha2.5 19-Feb-2012 common: * Fixed missing "#include " bug on Mac OS X. Thank Mike Nolta for the patch. * Upgraded LAPACK to 3.4.0 version * Fixed a bug on Mac OS X. Don't require SystemStubs on OS X. SystemStubs does not exist on Lion. Thank Stefan Karpinski. * Improved README with using OpenMP. Check the internal threads count less than or equal to omp_get_max_threads() x86/x86_64: * Auto-detect Intel Core i6/i7 (Sandy Bridge) CPU with Nehalem assembly kernels * Fixed some bugs on MingW 64-bit including zgemv, cdot, zdot. ==================================================================== Version 0.1 alpha2.4 18-Sep-2011 common: * Fixed a bug about installation. The header file "fblas77.h" works fine now. * Fixed #61 a building bug about setting TARGET and DYNAMIC_ARCH. * Try to handle absolute path of shared library in OSX. (#57) Thank Dr Kane O'Donnell. * Changed the installation folder layout to $(PREFIX)/include and $(PREFIX)/lib x86/x86_64: * Fixed #58 zdot/xdot SEGFAULT bug with GCC-4.6 on x86. According to i386 calling convention, The callee should remove the first hidden parameter.Thank Mr. John for this patch. ==================================================================== Version 0.1 alpha2.3 5-Sep-2011 x86/x86_64: * Added DTB_ENTRIES into dynamic arch setting parameters. Now, it can read DTB_ENTRIES on runtime. (Refs issue #55 on github) ==================================================================== Version 0.1 alpha2.2 14-Jul-2011 common: * Fixed a building bug when DYNAMIC_ARCH=1 & INTERFACE64=1. (Refs issue #44 on github) ==================================================================== Version 0.1 alpha2.1 28-Jun-2011 common: * Stop the build and output the error message when detecting fortran compiler failed. (Refs issue #42 on github) ==================================================================== Version 0.1 alpha2 23-Jun-2011 common: * Fixed blasint undefined bug in file. Other software could include this header successfully(Refs issue #13 on github) * Fixed the SEGFAULT bug on 64 cores. On SMP server, the number of CPUs or cores should be less than or equal to 64.(Refs issue #14 on github) * Support "void goto_set_num_threads(int num_threads)" and "void openblas_set_num_threads(int num_threads)" when USE_OPENMP=1 * Added extern "C" to support C++. Thank Tasio for the patch(Refs issue #21 on github) * Provided an error message when the arch is not supported.(Refs issue #19 on github) * Fixed issue #23. 
Fixed a bug of f_check script about generating link flags. * Added openblas_set_num_threads for Fortran. * Fixed #25 a wrong result of rotmg. * Fixed a bug about detecting underscore prefix in c_check. * Print the wall time (cycles) with enabling FUNCTION_PROFILE * Fixed #35 a build bug with NO_LAPACK=1 & DYNAMIC_ARCH=1 * Added install target. You can use "make install". (Refs #20) x86/x86_64: * Fixed #28 a wrong result of dsdot on x86_64. * Fixed #32 a SEGFAULT bug of zdotc with gcc-4.6. * Fixed #33 ztrmm bug on Nehalem. * Work-around #27 the low performance axpy issue with small imput size & multithreads. MIPS64: * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64. * Optimized single/double precision BLAS Level3 on Loongson3A/MIPS64. (Refs #2) * Optimized single/double precision axpy function on Loongson3A/MIPS64. (Refs #3) ==================================================================== Version 0.1 alpha1 20-Mar-2011 common: * Support "make NO_LAPACK=1" to build the library without LAPACK functions. * Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34. Thank Mr.Ei-ji Nakama providing this patch. (Refs issue #12 on github) * Added DEBUG=1 rule in Makefile.rule to build debug version. * Disable compiling quad precision in reference BLAS library(netlib BLAS). * Added unit testcases in utest/ subdir. Used CUnit framework. * Supported OPENBLAS_* & GOTO_* environment variables (Pleas see README) * Imported GotoBLAS2 1.13 BSD version x86/x86_64: * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would casue zdotu & zdotc failures. Instead, work-around it. (Refs issue #8 #9 on github) * Modified ?axpy functions to return same netlib BLAS results when incx==0 or incy==0 (Refs issue #7 on github) * Modified ?swap functions to return same netlib BLAS results when incx==0 or incy==0 (Refs issue #6 on github) * Modified ?rot functions to return same netlib BLAS results when incx==0 or incy==0 (Refs issue #4 on github) * Detect Intel Westmere,Intel Clarkdale and Intel Arrandale to use Nehalem codes. * Fixed a typo bug about compiling dynamic ARCH library. MIPS64: * Improve daxpy performance on ICT Loongson 3A. * Supported ICT Loongson 3A CPU (Refs issue #1 on github) ==================================================================== OpenBLAS-0.2.20/GotoBLAS_00License.txt000066400000000000000000000031101313527062700170540ustar00rootroot00000000000000 Copyright 2009, 2010 The University of Texas at Austin. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT AUSTIN OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

The views and conclusions contained in the software and documentation are
those of the authors and should not be interpreted as representing official
policies, either expressed or implied, of The University of Texas at Austin.

OpenBLAS-0.2.20/GotoBLAS_01Readme.txt

         Optimized GotoBLAS2 libraries version 1.13
         By Kazushige Goto

# This is the last update; it was made on 5th Feb. 2010.

0. License

   See 00TACC_Research_License.txt.

1. Supported OS

   Linux
   FreeBSD (it may also work on NetBSD)
   OSX
   Solaris
   Windows 2000, XP, Server 2003 and 2008 (both 32bit and 64bit)
   AIX
   Tru64 UNIX

2. Supported Architecture

   X86      : Pentium3 Katmai, Coppermine
              Athlon (not well optimized, though)
              PentiumM Banias, Yonah
              Pentium4 Northwood, Nocona (Prescott)
              Core 2 Woodcrest
              Core 2 Penryn
              Nehalem-EP Core i{3,5,7}
              Atom
              AMD Opteron
              AMD Barcelona, Shanghai, Istanbul
              VIA NANO
   X86_64   : Pentium4 Nocona
              Core 2 Woodcrest
              Core 2 Penryn
              Nehalem
              Atom
              AMD Opteron
              AMD Barcelona, Shanghai, Istanbul
              VIA NANO
   IA64     : Itanium2
   Alpha    : EV4, EV5, EV6
   POWER    : POWER4
              PPC970/PPC970FX
              PPC970MP
              CELL (PPU only)
              POWER5
              PPC440    (QCDOC)
              PPC440FP2 (BG/L)
              POWERPC G4 (PPC7450)
              POWER6
   SPARC    : SPARC IV
              SPARC VI, VII (Fujitsu chip)
   MIPS64/32: Sicortex

3. Supported compiler

   C compiler       : GNU CC
                      Cygwin, MinGW
                      Other commercial compilers (especially for x86/x86_64)
   Fortran Compiler : GNU G77, GFORTRAN
                      G95
                      Open64
                      Compaq
                      F2C
                      IBM
                      Intel
                      PathScale
                      PGI
                      SUN
                      Fujitsu

4. Supported precision

   The x86/x86_64 version now supports 80bit FP precision in addition to
   normal double precision and single precision. Currently only gfortran
   supports 80bit FP with "REAL*10".

5. How to build the library?

   Please see 02QuickInstall.txt or just type "make".

OpenBLAS-0.2.20/GotoBLAS_02QuickInstall.txt

                   Quick installation for GotoBLAS2

 ***************************************************************************
 **                                                                       **
 **   Just type "make".                                                   **
 **                                                                       **
 **   If you're not satisfied with this library,                          **
 **   please read the following instructions and customize it.            **
 **                                                                       **
 ***************************************************************************

1. REALLY REALLY quick way to build the library

   Type "make" or "gmake".

     $shell> make

   The script will detect the Fortran compiler, the number of cores and the
   architecture you're using. If the default gcc binary type is 64bit, a
   64bit library will be created; otherwise a 32bit library will be created.
   After the compile finishes, you'll find various information about the
   generated library.

=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
 GotoBLAS2 build complete.

 OS               ... Linux
 Architecture     ... x86_64
 BINARY           ... 64bit
 C compiler       ... GCC  (command line : gcc)
 Fortran compiler ... PATHSCALE  (command line : pathf90)
 Library Name     ...
                      libgoto_barcelonap-r1.27.a  (Multi threaded; Max num-threads is 16)
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=

2. Specifying a 32bit or 64bit library

   If you need a 32bit binary,

     $shell> make BINARY=32

   If you need a 64bit binary,

     $shell> make BINARY=64

3. Specifying the target architecture

   If you need a library for a different architecture, you can use the
   TARGET option. You can find the currently available options at the top
   of getarch.c. For example, if you need a library for the Intel Core 2
   architecture, you'll find the FORCE_CORE2 option in getarch.c, so you
   can specify TARGET=CORE2 (drop the FORCE_ prefix) with make.

     $shell> make TARGET=CORE2

   Also, if you want GotoBLAS2 to support multiple architectures,

     $shell> make DYNAMIC_ARCH=1

   All kernels will be included in the library, and the best one for the
   architecture is selected dynamically at run time.

4. Enabling multi-threading

   The script will detect the number of cores and enable the multi-threaded
   library if the number of cores is more than two. If you still want to
   create a single-threaded library,

     $shell> make USE_THREAD=0

   Or if you want to force a threaded library,

     $shell> make USE_THREAD=1

5. Specifying the target OS

   The target architecture is determined by CC. If you specify a cross
   compiler for MIPS, you can create a library for the MIPS architecture.

     $shell> make CC=mips64el-linux-gcc TARGET=SICORTEX

   Or you can specify your favorite C compiler with an absolute path.

     $shell> make CC=/opt/intel/cc/32/10.0.026/bin/icc TARGET=BARCELONA

   The binary type (32bit/64bit) is determined by checking CC, so you can
   also control the binary type with this option.

     $shell> make CC="pathcc -m32"

   In this case, a 32bit library will be created.

6. Specifying the Fortran compiler

   If you need to support another Fortran compiler, you can specify it with
   the FC option.

     $shell> make FC=gfortran

7. Other useful options

   You'll find other useful options in Makefile.rule.

OpenBLAS-0.2.20/GotoBLAS_03FAQ.txt

                             GotoBLAS2 FAQ

1. General

1.1 Q Can I find useful papers about GotoBLAS2?

    A You may check the following URL.

      http://www.cs.utexas.edu/users/flame/Publications/index.htm

      11. Kazushige Goto and Robert A. van de Geijn, "Anatomy of
          High-Performance Matrix Multiplication," ACM Transactions on
          Mathematical Software, accepted.

      15. Kazushige Goto and Robert van de Geijn, "High-Performance
          Implementation of the Level-3 BLAS," ACM Transactions on
          Mathematical Software, submitted.

1.2 Q Does GotoBLAS2 work with Hyperthreading (SMT)?

    A Yes, it will work. GotoBLAS2 detects Hyperthreading and avoids
      scheduling on the same core.

1.3 Q When I type "make", the following error occurs. What's wrong?

      $shell> make
      "./Makefile.rule", line 58: Missing dependency operator
      "./Makefile.rule", line 61: Need an operator
      ...

    A This error occurs because you didn't use GNU make. Some binary
      packages install GNU make as "gmake", so that is worth a try.

1.4 Q Function "xxx" is slow. Why?

    A Generally GotoBLAS2 has many well-optimized functions, but it's far
      from perfect. Level 1/2 function performance in particular depends on
      how you call BLAS. You should understand what happens between your
      function and GotoBLAS2 by using the profile-enabled version or a
      hardware performance counter. Again, please don't regard GotoBLAS2 as
      a black box.

1.5 Q I have a commercial C compiler and want to compile GotoBLAS2 with it.
      Is it possible?

    A All functions that affect performance are written in assembler; C
      code is just used as a wrapper for the assembler functions or for
      complicated functions.
      Also, I use many inline assembler functions, and unfortunately most
      commercial compilers can't handle inline assembler. Therefore you
      should use gcc.

1.6 Q I use an OpenMP compiler. How can I use GotoBLAS2 with it?

    A Please understand that OpenMP is a compromise approach to using
      threads. If you want to use OpenMP-based code with GotoBLAS2, you
      should enable "USE_OPENMP=1" in Makefile.rule.

1.7 Q Could you tell me how to use the profiled library?

    A You need to build and link your application with the -pg option.
      After executing your application, "gmon.out" is generated in your
      current directory.

      $shell> gprof gmon.out

      Each sample counts as 0.01 seconds.
        %   cumulative     self               self     total
       time    seconds   seconds    calls  Ks/call  Ks/call  name
      89.86     975.02    975.02    79317     0.00     0.00  .dgemm_kernel
       4.19    1020.47     45.45       40     0.00     0.00  .dlaswp00N
       2.28    1045.16     24.69     2539     0.00     0.00  .dtrsm_kernel_LT
       1.19    1058.03     12.87    79317     0.00     0.00  .dgemm_otcopy
       1.05    1069.40     11.37     4999     0.00     0.00  .dgemm_oncopy
      ....

      I think the profiled BLAS library is really useful for your research.
      Please find the bottleneck of your application and improve it.

1.8 Q Is the number of threads limited?

    A Basically, there is no limit on the number of threads. You can
      specify as many threads as you want, but a larger number of threads
      will consume extra resources. I recommend you specify the minimum
      number of threads you need.

1.9 Q I get segfaults when I compile with USE_OPENMP=1. What's wrong?

    A This may be related to a bug in the Linux kernel 2.6.32. Try applying
      the patch segfaults.patch using

        patch < segfaults.patch

      and see if the crashes persist. Note that this patch will lead to
      many compiler warnings.

2. Architecture-specific issues or implementation

2.1 Q GotoBLAS2 seems to support any combination of OS and architecture.
      Is that possible?

    A The combinations are limited by the current OSes and architectures.
      For example, the combination of OSX with SPARC is impossible. But it
      would become possible with slight modifications if such a combination
      appeared in front of us.

2.2 Q I have POWER architecture systems. Do I need extra work?

    A Although the POWER architecture defines a special instruction (like
      CPUID) to detect the correct architecture, it's privileged and can't
      be accessed by a user process. So you have to set the architecture
      that you have manually in getarch.c.

2.3 Q I can't create a DLL on Cygwin (Error 53). What's wrong?

    A You have to make sure that lib.exe and mspdb80.dll are in the
      Microsoft Visual Studio PATH. The easiest way is to use the 'which'
      command.

      $shell> which lib.exe
      /cygdrive/c/Program Files/Microsoft Visual Studio/VC98/bin/lib.exe

OpenBLAS-0.2.20/GotoBLAS_04FAQ.txt

 Quick guide to build the library for Windows 64bit.

1. What you need

   a. Windows Server 2003 or later
   b. Cygwin environment (make, gcc, g77, perl, sed, wget)
   c. MinGW64 compiler
   d. Microsoft Visual Studio (lib.exe and mspdb80.dll are required to
      create the dll)

2. Do

     ./quickbuild.win64

   Good luck

OpenBLAS-0.2.20/GotoBLAS_05LargePage.txt

To enhance performance, I'd recommend enabling large pages on your OS
(a root account is required).

A) Linux

   x86 32bit      ... (number of cores) * 4 pages
   x86 64bit      ... (number of cores) * 8 pages
   POWER 32/64bit ...
                      (number of cores) * 1 page

   If you want to allocate 64 large pages,

     $shell> echo 0  > /proc/sys/vm/nr_hugepages       # needs to be reset
     $shell> echo 65 > /proc/sys/vm/nr_hugepages       # add 1 extra page
     $shell> echo 3355443200 > /proc/sys/kernel/shmmax # just a large number
     $shell> echo 3355443200 > /proc/sys/kernel/shmall

   You may also have to add a few lines to the /etc/security/limits.conf
   file.

     *    hard    memlock    unlimited
     *    soft    memlock    unlimited

   Then restart sshd (/etc/init.d/sshd restart).

B) Solaris

   You don't have to set up anything.

C) Windows (Windows Server 2003 or later, XP 64bit)

   You have to assign the memory-lock right to your account.

   Control Panel -> Administrative Tools -> Local Security Policy
     -> Local Policies -> User Rights Assignment -> Lock pages in memory

D) AIX

   Ask your administrator.

E) Tru64 UNIX

   Assign shared memory at boot time.

F) Other architectures which don't have the Large TLB enhancement

   If you have root permission, please install the device driver located in
   drivers/mapper.

     $shell> cd drivers/mapper
     $shell> make
     $shell> insmod mapper.ko
     $shell> ./device_setup

   Then enable DEVICEDRIVER_ALLOCATION = 1 in Makefile.rule.

OpenBLAS-0.2.20/GotoBLAS_06WeirdPerformance.txt

                           Weird Performance

1. If you see a serious performance loss (extremely low performance), you
   probably created too many threads or processes. Basically, GotoBLAS
   assumes that the available cores you specify are exclusively for BLAS
   computation. If even one small thread or process conflicts with the BLAS
   threads, performance will become worse. The best solution is to reduce
   your number of threads, or to insert some synchronization mechanism and
   suspend your threads until the BLAS operation is finished.

2. A similar problem may happen under a virtual machine. If the hypervisor
   allocates different cores on each scheduling pass, BLAS performance will
   be bad. This is because BLAS also utilizes all of the cache, so an
   unexpected re-schedule onto a different core may result in a heavy
   performance loss.

Anyway, if you see any weird performance loss, it means your code or
algorithm is not optimal.

OpenBLAS-0.2.20/LICENSE

Copyright (c) 2011-2014, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

   1. Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

   2. Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

   3. Neither the name of the OpenBLAS project nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OpenBLAS-0.2.20/Makefile000066400000000000000000000233011313527062700146030ustar00rootroot00000000000000TOPDIR = . include ./Makefile.system BLASDIRS = interface driver/level2 driver/level3 driver/others ifneq ($(DYNAMIC_ARCH), 1) BLASDIRS += kernel endif ifdef SANITY_CHECK BLASDIRS += reference endif SUBDIRS = $(BLASDIRS) ifneq ($(NO_LAPACK), 1) SUBDIRS += lapack endif RELA = ifeq ($(BUILD_RELAPACK), 1) RELA = re_lapack endif LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench .PHONY : all libs netlib $(RELA) test ctest shared install .NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test all :: libs netlib $(RELA) tests shared @echo @echo " OpenBLAS build complete. ($(LIB_COMPONENTS))" @echo @echo " OS ... $(OSNAME) " @echo " Architecture ... $(ARCH) " ifndef BINARY64 @echo " BINARY ... 32bit " else @echo " BINARY ... 64bit " endif ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) @echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) " endif endif @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" ifndef NOFORTRAN @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" endif ifneq ($(OSNAME), AIX) @echo -n " Library Name ... $(LIBNAME)" else @echo " Library Name ... $(LIBNAME)" endif ifndef SMP @echo " (Single threaded) " else @echo " (Multi threaded; Max num-threads is $(NUM_THREADS))" endif ifeq ($(USE_OPENMP), 1) @echo @echo " Use OpenMP in the multithreading. Because of ignoring OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS flags, " @echo " you should use OMP_NUM_THREADS environment variable to control the number of threads." @echo endif ifeq ($(OSNAME), Darwin) @echo "WARNING: If you plan to use the dynamic library $(LIBDYNNAME), you must run:" @echo @echo "\"make PREFIX=/your_installation_path/ install\"." @echo @echo "(or set PREFIX in Makefile.rule and run make install." @echo "If you want to move the .dylib to a new location later, make sure you change" @echo "the internal name of the dylib with:" @echo @echo "install_name_tool -id /new/absolute/path/to/$(LIBDYNNAME) $(LIBDYNNAME)" endif @echo @echo "To install the library, you can run \"make PREFIX=/path/to/your/installation install\"." 
@echo shared : ifndef NO_SHARED ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) @$(MAKE) -C exports so @ln -fs $(LIBSONAME) $(LIBPREFIX).so @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif ifeq ($(OSNAME), FreeBSD) @$(MAKE) -C exports so @ln -fs $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), NetBSD) @$(MAKE) -C exports so @ln -fs $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), Darwin) @$(MAKE) -C exports dyn @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib endif ifeq ($(OSNAME), WINNT) @$(MAKE) -C exports dll endif ifeq ($(OSNAME), CYGWIN_NT) @$(MAKE) -C exports dll endif endif tests : ifndef NOFORTRAN touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all $(MAKE) -C utest all endif ifndef NO_CBLAS $(MAKE) -C ctest all endif endif libs : ifeq ($(CORE), UNKOWN) $(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.) endif ifeq ($(NOFORTRAN), 1) $(info OpenBLAS: Detecting fortran compiler failed. Cannot compile LAPACK. Only compile BLAS.) endif ifeq ($(NO_STATIC), 1) ifeq ($(NO_SHARED), 1) $(error OpenBLAS: neither static nor shared are enabled.) endif endif @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) @for d in $(SUBDIRS) ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ fi; \ done #Save the config files for installation @cp Makefile.conf Makefile.conf_last @cp config.h config_last.h ifdef QUAD_PRECISION @echo "#define QUAD_PRECISION">> config_last.h endif ifeq ($(EXPRECISION), 1) @echo "#define EXPRECISION">> config_last.h endif ## ifeq ($(DYNAMIC_ARCH), 1) @$(MAKE) -C kernel commonlibs || exit 1 @for d in $(DYNAMIC_CORE) ; \ do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ done @echo DYNAMIC_ARCH=1 >> Makefile.conf_last endif ifdef USE_THREAD @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last endif @touch lib.grd prof : prof_blas prof_lapack prof_blas : ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) for d in $(SUBDIRS) ; \ do if test -d $$d; then \ $(MAKE) -C $$d prof || exit 1 ; \ fi; \ done ifeq ($(DYNAMIC_ARCH), 1) $(MAKE) -C kernel commonprof || exit 1 endif blas : ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) for d in $(BLASDIRS) ; \ do if test -d $$d; then \ $(MAKE) -C $$d libs || exit 1 ; \ fi; \ done hpl : ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) for d in $(BLASDIRS) ../laswp exports ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ fi; \ done ifeq ($(DYNAMIC_ARCH), 1) $(MAKE) -C kernel commonlibs || exit 1 for d in $(DYNAMIC_CORE) ; \ do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ done endif hpl_p : ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) for d in $(SUBDIRS) ../laswp exports ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ fi; \ done ifeq ($(NO_LAPACK), 1) netlib : else netlib : lapack_prebuild ifndef NOFORTRAN @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib endif ifndef NO_LAPACKE @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib endif endif ifeq ($(NO_LAPACK), 1) re_lapack : else re_lapack : @$(MAKE) -C relapack endif prof_lapack : lapack_prebuild @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof lapack_prebuild : ifndef NOFORTRAN -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> 
$(NETLIB_LAPACK_DIR)/make.inc -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "TMGLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LAPACKELIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc ifeq ($(F_COMPILER), GFORTRAN) -@echo "TIMER = INT_ETIME" >> $(NETLIB_LAPACK_DIR)/make.inc ifdef SMP ifeq ($(OSNAME), WINNT) -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc else -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc endif else -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc endif else -@echo "TIMER = NONE" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc endif ifeq ($(BUILD_LAPACK_DEPRECATED), 1) -@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc endif -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc endif large.tgz : ifndef NOFORTRAN if [ ! -a $< ]; then -wget http://www.netlib.org/lapack/timing/large.tgz; fi endif timing.tgz : ifndef NOFORTRAN if [ ! -a $< ]; then -wget http://www.netlib.org/lapack/timing/timing.tgz; fi endif lapack-timing : large.tgz timing.tgz ifndef NOFORTRAN (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING endif lapack-test : (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc ifneq ($(CROSS), 1) ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ ./testsecond; ./testdsecnd; ./testieee; ./testversion ) (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) endif lapack-runtest: ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ ./testsecond; ./testdsecnd; ./testieee; ./testversion ) (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) blas-test: (cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out) $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing (cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out) dummy : install : $(MAKE) -f Makefile.install install clean :: @for d in $(SUBDIRS_ALL) ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ fi; \ done #ifdef DYNAMIC_ARCH @$(MAKE) -C kernel clean #endif @$(MAKE) -C reference clean @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h ifeq ($(OSNAME), Darwin) @rm -rf getarch.dSYM getarch_2nd.dSYM endif @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib @touch $(NETLIB_LAPACK_DIR)/make.inc @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean @rm -f $(NETLIB_LAPACK_DIR)/make.inc 
$(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h @$(MAKE) -C relapack clean @rm -f *.grd Makefile.conf_last config_last.h @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) @echo Done. OpenBLAS-0.2.20/Makefile.alpha000066400000000000000000000021261313527062700156710ustar00rootroot00000000000000CPP = $(CC) -E RANLIB = ranlib ifeq ($(LIBSUBARCH), EV4) LIBNAME = $(LIBPREFIX)_ev4.a LIBNAME_P = $(LIBPREFIX)_ev4_p.a endif ifeq ($(LIBSUBARCH), EV5) LIBNAME = $(LIBPREFIX)_ev5.a LIBNAME_P = $(LIBPREFIX)_ev5_p.a endif ifeq ($(LIBSUBARCH), EV6) LIBNAME = $(LIBPREFIX)_ev6.a LIBNAME_P = $(LIBPREFIX)_ev6_p.a endif ifneq ($(COMPILER), NATIVE) # GCC User ifeq ($(LIBSUBARCH), EV4) OPTION += -DEV4 -mcpu=ev4 endif ifeq ($(LIBSUBARCH), EV5) OPTION += -DEV5 -mcpu=ev5 endif ifeq ($(LIBSUBARCH), EV6) OPTION += -DEV6 -mcpu=ev6 endif else # Compaq Compiler User ifeq ($(LIBSUBARCH), EV4) OPTION += -DEV4 -tune ev4 -arch ev4 endif ifeq ($(LIBSUBARCH), EV5) OPTION += -DEV5 -tune ev5 -arch ev5 endif ifeq ($(LIBSUBARCH), EV6) OPTION += -DEV6 -tune ev6 -arch ev6 endif endif ifeq ($(F_COMPILER), GFORTRAN) FCOMMON_OPT += -mieee endif ifeq ($(F_COMPILER), G77) FCOMMON_OPT += -mieee endif ifndef SMP LIBCXML = -lcxml -lots -lm LIBATLAS = -L/usr/lib/atlas3.7.8 -lf77blas -latlas -lm else LIBCXML = -lcxmlp -lots -lm LIBATLAS = -L/usr/lib/atlas3.7.8p -llapack -lptcblas -lptf77blas -latlas -lpthread -lm endif OpenBLAS-0.2.20/Makefile.arm000066400000000000000000000007141313527062700153640ustar00rootroot00000000000000ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) ifeq ($(OSNAME), Android) CCOMMON_OPT += -mfpu=neon -march=armv7-a FCOMMON_OPT += -mfpu=neon -march=armv7-a else CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a endif endif ifeq ($(CORE), ARMV6) CCOMMON_OPT += -mfpu=vfp -march=armv6 FCOMMON_OPT += -mfpu=vfp -march=armv6 endif ifeq ($(CORE), ARMV5) CCOMMON_OPT += -march=armv5 FCOMMON_OPT += -march=armv5 endif OpenBLAS-0.2.20/Makefile.arm64000066400000000000000000000011761313527062700155410ustar00rootroot00000000000000 ifeq ($(CORE), ARMV8) CCOMMON_OPT += -march=armv8-a FCOMMON_OPT += -march=armv8-a endif ifeq ($(CORE), CORTEXA57) CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 endif ifeq ($(CORE), VULCAN) CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan endif ifeq ($(CORE), THUNDERX) CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx endif ifeq ($(CORE), THUNDERX2T99) CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 endif OpenBLAS-0.2.20/Makefile.generic000066400000000000000000000000361313527062700162160ustar00rootroot00000000000000COPT = -Wall -O2 # -DGEMMTEST OpenBLAS-0.2.20/Makefile.ia64000066400000000000000000000015371313527062700153540ustar00rootroot00000000000000CCOMMON_COPT += # -DUSE64BITINT # -DGEMMTEST # CCOMMON_OPT += -DPARAMTEST FLAMEPATH = $(HOME)/flame/lib/ia64 ifndef SMP LIBMKL = -L$(MKLPATH)/64 -Wl,-rpath,$(MKLPATH)/64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lguide -lpthread -lm else LIBMKL = -L$(MKLPATH)/64 -Wl,-rpath,$(MKLPATH)/64 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lguide -lpthread -lm endif LIBFLAME = -L$(FLAMEPATH) -llapack2flame -lflame $(TOPDIR)/$(LIBNAME) -lgfortran -lpthread -lm LIBMLIB = ../../level1/others/libmisc.a -L/opt/intel/fc/ia64/9.1.040/lib -L/opt/mlib/lib \ -llapack -lguide 
-lifcore -lm -lpthread LIBSCSL = -L/opt/scsl/1.4.1.0/lib -Wl,-rpath,/opt/scsl/1.4.1.0/lib -lscs ifndef SMP LIBATLAS = -L/usr/lib/atlas3.6.0 -lf77blas -latlas -lm else LIBATLAS = -L$(HOME)/misc/lib -L/usr/lib/atlas3.6.0p -llapack -lptcblas -lptf77blas -latlas -lpthread -lm endif OpenBLAS-0.2.20/Makefile.install000066400000000000000000000157211313527062700162570ustar00rootroot00000000000000TOPDIR = . export GOTOBLAS_MAKEFILE = 1 -include $(TOPDIR)/Makefile.conf_last include ./Makefile.system PREFIX ?= /opt/OpenBLAS OPENBLAS_INCLUDE_DIR := $(PREFIX)/include OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib OPENBLAS_BINARY_DIR := $(PREFIX)/bin OPENBLAS_BUILD_DIR := $(CURDIR) OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig .PHONY : install .NOTPARALLEL : install lib.grd : $(error OpenBLAS: Please run "make" firstly) install : lib.grd @-mkdir -p "$(DESTDIR)$(PREFIX)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) #for inc @echo \#ifndef OPENBLAS_CONFIG_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" @echo \#define OPENBLAS_CONFIG_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" @$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" @cat openblas_config_template.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" @echo Generating f77blas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @echo \#ifndef OPENBLAS_F77BLAS_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" @echo \#define OPENBLAS_F77BLAS_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" @echo \#include \"openblas_config.h\" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" @cat common_interface.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" @echo \#endif >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h" ifndef NO_CBLAS @echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" endif ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" endif #for install static library ifndef NO_STATIC @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif #for install shared library ifndef NO_SHARED @echo Copying the shared library to 
$(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif ifeq ($(OSNAME), FreeBSD) @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), NetBSD) @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), Darwin) @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib endif ifeq ($(OSNAME), WINNT) @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" @-cp $(LIBDLLNAME).a "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" endif ifeq ($(OSNAME), CYGWIN_NT) @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" endif endif #Generating openblas.pc @echo Generating openblas.pc in $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR) @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc @echo 'version='$(VERSION) >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc @echo 'extralib='$(EXTRALIB) >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc @cat openblas.pc.in >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc #Generating OpenBLASConfig.cmake @echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" ifndef NO_SHARED #ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), Darwin) @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif else #only static @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif #Generating OpenBLASConfigVersion.cmake @echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) @echo "set (PACKAGE_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "else ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> 
"$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo " set (PACKAGE_VERSION_EXACT TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! OpenBLAS-0.2.20/Makefile.mips000066400000000000000000000000321313527062700155460ustar00rootroot00000000000000ifdef BINARY64 else endif OpenBLAS-0.2.20/Makefile.mips64000066400000000000000000000000321313527062700157200ustar00rootroot00000000000000ifdef BINARY64 else endif OpenBLAS-0.2.20/Makefile.power000066400000000000000000000054741313527062700157510ustar00rootroot00000000000000 ifdef USE_THREAD ifeq ($(USE_THREAD), 0) USE_OPENMP = 0 else USE_OPENMP = 1 endif else USE_OPENMP = 1 endif ifeq ($(CORE), POWER8) ifeq ($(USE_OPENMP), 1) COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp else COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math endif endif FLAMEPATH = $(HOME)/flame/lib #ifeq ($(CORE), CELL) #CELL_SDK_ROOT = /opt/IBM/cell-sdk-1.1/sysroot/usr #SPU_CC = spu-gcc #EXTRALIB += -lspe #endif ifeq ($(OSNAME), Linux) ifdef BINARY64 # COMPILER_PREFIX = powerpc64-linux- else # COMPILER_PREFIX = powerpc-linux- endif endif #Either uncomment below line or run make with `USE_MASS=1` to enable support of MASS library #USE_MASS = 1 ifeq ($(USE_MASS), 1) # Path to MASS libs, change it if the libs are installed at any other location MASSPATH = /opt/ibm/xlmass/8.1.5/lib COMMON_OPT += -mveclibabi=mass -ftree-vectorize -funsafe-math-optimizations -DUSE_MASS EXTRALIB += -L$(MASSPATH) -lmass -lmassvp8 -lmass_simdp8 endif ifdef BINARY64 ifeq ($(OSNAME), AIX) CCOMMON_OPT += -mpowerpc64 -maix64 ifeq ($(COMPILER_F77), g77) FCOMMON_OPT += -mpowerpc64 -maix64 endif ifeq ($(COMPILER_F77), xlf) FCOMMON_OPT += -q64 endif ARFLAGS = -X 64 ASFLAGS = -a64 endif else ifeq ($(OSNAME), AIX) CCOMMON_OPT += -Wa,-a32 ARFLAGS = -X 32 ASFLAGS = -a32 endif endif # CCOMMON_OPT += -maltivec -mabi=altivec LIBFLAME = -L$(FLAMEPATH) -llapack2flame -lflame-lapack -lflame-base $(LIBS) ifeq ($(OSNAME), Darwin) CCOMMON_OPT += -force_cpusubtype_ALL endif ifndef BINARY64 ifeq ($(OSNAME), Linux) ESSLPATH = -L/opt/ibmcmp/lib -L/opt/ibmcmp/xlf/11.1/lib -Wl,-rpath,/opt/ibmcmp/lib -Wl,-rpath,/opt/ibmcmp/xlf/11.1/lib -lxlf90_r -lxlomp_ser -lxlfmath -lxl -lpthread else ESSLPATH = -lxlf90_r endif LIBVECLIB = -framework VecLib ifndef SMP LIBATLAS = -L/usr/lib/atlas3.7.11 -lf77blas -latlas -lg2c -lm LIBESSL = -lessl $(ESSLPATH) ../../level1/others/libmisc.a -lm else LIBATLAS = -L/usr/lib/atlas3.7.11p -lptf77blas -latlas -lm -lpthread LIBESSL = -lesslsmp $(ESSLPATH) ../../level1/others/libmisc.a -lm endif else ifeq ($(OSNAME), Linux) ESSLPATH = -L/opt/ibmcmp/lib64 -Wl,-rpath,/opt/ibmcmp/lib64 -L/opt/ibmcmp/xlf/11.1/lib64 -Wl,-rpath,/opt/ibmcmp/xlf/11.1/lib64 -lxlf90_r -lxlomp_ser else ESSLPATH = -lxlf90_r endif LIBVECLIB = /System/Library/Frameworks/vecLib.framework/Versions/Current/vecLib ifndef SMP LIBATLAS = -L/usr/lib64/atlas3.7.11 -lf77blas -latlas -lg2c -lm LIBESSL = -lessl $(ESSLPATH) -lm else LIBATLAS = -L/usr/lib64/atlas3.7.11p -lptf77blas -latlas -lm -lpthread LIBESSL = -lesslsmp $(ESSLPATH) 
-lxlsmp -lm endif endif OpenBLAS-0.2.20/Makefile.prebuild000066400000000000000000000027211313527062700164130ustar00rootroot00000000000000# This is triggered by Makefile.system and runs before any of the code is built. export BINARY export USE_OPENMP ifdef TARGET_CORE TARGET_MAKE = Makefile_kernel.conf TARGET_CONF = config_kernel.h else TARGET_MAKE = Makefile.conf TARGET_CONF = config.h endif # CPUIDEMU = ../../cpuid/table.o ifdef CPUIDEMU EXFLAGS = -DCPUIDEMU -DVENDOR=99 endif ifeq ($(TARGET), P5600) TARGET_FLAGS = -mips32r5 endif ifeq ($(TARGET), I6400) TARGET_FLAGS = -mips64r6 endif ifeq ($(TARGET), P6600) TARGET_FLAGS = -mips64r6 endif all: getarch_2nd ./getarch_2nd 0 >> $(TARGET_MAKE) ./getarch_2nd 1 >> $(TARGET_CONF) config.h : c_check f_check getarch perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) ifneq ($(ONLY_CBLAS), 1) perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS) else #When we only build CBLAS, we set NOFORTRAN=2 echo "NOFORTRAN=2" >> $(TARGET_MAKE) echo "NO_FBLAS=1" >> $(TARGET_MAKE) echo "F_COMPILER=GFORTRAN" >> $(TARGET_MAKE) echo "BU=_" >> $(TARGET_MAKE) echo "#define BUNDERSCORE _" >> $(TARGET_CONF) echo "#define NEEDBUNDERSCORE 1" >> $(TARGET_CONF) endif ./getarch 0 >> $(TARGET_MAKE) ./getarch 1 >> $(TARGET_CONF) getarch : getarch.c cpuid.S dummy $(CPUIDEMU) $(HOSTCC) $(CFLAGS) $(EXFLAGS) -o $(@F) getarch.c cpuid.S $(CPUIDEMU) getarch_2nd : getarch_2nd.c config.h dummy ifndef TARGET_CORE $(HOSTCC) -I. $(CFLAGS) -o $(@F) getarch_2nd.c else $(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c endif dummy: OpenBLAS-0.2.20/Makefile.rule000066400000000000000000000141741313527062700155610ustar00rootroot00000000000000# # Beginning of user configuration # # This library's version VERSION = 0.2.20 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library # is libopenblas_$(LIBNAMESUFFIX).so.0. # LIBNAMESUFFIX = omp # You can specify the target architecture, otherwise it's # automatically detected. # TARGET = PENRYN # If you want to support multiple architecture in one binary # DYNAMIC_ARCH = 1 # C compiler including binary type(32bit / 64bit). Default is gcc. # Don't use Intel Compiler or PGI, it won't generate right codes as I expect. # CC = gcc # Fortran compiler. Default is g77. # FC = gfortran # Even you can specify cross compiler. Meanwhile, please set HOSTCC. # cross compiler for Windows # CC = x86_64-w64-mingw32-gcc # FC = x86_64-w64-mingw32-gfortran # cross compiler for 32bit ARM # CC = arm-linux-gnueabihf-gcc # FC = arm-linux-gnueabihf-gfortran # cross compiler for 64bit ARM # CC = aarch64-linux-gnu-gcc # FC = aarch64-linux-gnu-gfortran # If you use the cross compiler, please set this host compiler. # HOSTCC = gcc # If you need 32bit binary, define BINARY=32, otherwise define BINARY=64 # BINARY=64 # About threaded BLAS. It will be automatically detected if you don't # specify it. # For force setting for single threaded, specify USE_THREAD = 0 # For force setting for multi threaded, specify USE_THREAD = 1 # USE_THREAD = 0 # If you're going to use this library with OpenMP, please comment it in. # This flag is always set for POWER8. Don't modify the flag # USE_OPENMP = 1 # You can define maximum number of threads. Basically it should be # less than actual number of cores. If you don't specify one, it's # automatically detected by the the script. 
# NUM_THREADS = 24 # if you don't need to install the static library, please comment it in. # NO_STATIC = 1 # if you don't need to generate the shared library, please comment it in. # NO_SHARED = 1 # If you don't need the CBLAS interface, please comment it in. # NO_CBLAS = 1 # If you only want the CBLAS interface without installing a Fortran compiler, # please comment it in. # ONLY_CBLAS = 1 # If you don't need LAPACK, please comment it in. # If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1. # NO_LAPACK = 1 # If you don't need LAPACKE (C Interface to LAPACK), please comment it in. # NO_LAPACKE = 1 # Build LAPACK Deprecated functions since LAPACK 3.6.0 BUILD_LAPACK_DEPRECATED = 1 # Build RecursiveLAPACK on top of LAPACK # BUILD_RELAPACK = 1 # If you want to use the legacy threaded Level 3 implementation. # USE_SIMPLE_THREADED_LEVEL3 = 1 # If you want to drive the whole 64bit region by BLAS. Not all Fortran # compilers support this. It's safe to keep it commented out if you # are not sure (equivalent to the "-i8" option). # INTERFACE64 = 1 # Unfortunately most kernels won't give us a high quality buffer. # BLAS tries to find the best region before entering the main function, # but it will consume time. If you don't like it, you can disable it. NO_WARMUP = 1 # If you want to disable CPU/Memory affinity on Linux. #NO_AFFINITY = 1 # if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus # BIGNUMA = 1 # Don't use AVX kernels on Sandy Bridge. This is compatible with old compilers # and OSes. However, the performance is low. # NO_AVX = 1 # Don't use Haswell optimizations if binutils is too old (e.g. RHEL6) # NO_AVX2 = 1 # Don't use parallel make. # NO_PARALLEL_MAKE = 1 # Force the number of make jobs. The default is the number of logical CPUs of the host. # This is particularly useful when using distcc. # A negative value will disable adding a -j flag to make, allowing a parent # make -j value to be used. This is useful when calling the OpenBLAS make from another project's # makefile # MAKE_NB_JOBS = 2 # If you would like a detailed performance report of GotoBLAS. # FUNCTION_PROFILE = 1 # Support for IEEE quad precision (it's *real* REAL*16) (under testing) # QUAD_PRECISION = 1 # Threads keep working for a while after a BLAS operation finishes # to reduce thread activate/deactivate overhead. You can determine # the time out to improve performance. This number should be from 4 to 30, # which corresponds to (1 << n) cycles. For example, if you set it to 26, # a thread will keep running for (1 << 26) cycles (about 25ms on a 3.0GHz # system). You can also control this number with THREAD_TIMEOUT # CCOMMON_OPT += -DTHREAD_TIMEOUT=26 # Use a special device driver for mapping physically contiguous memory # to user space. If bigphysarea is enabled, it will be used. # DEVICEDRIVER_ALLOCATION = 1 # If you need to synchronize the FP CSR between threads (for x86/x86_64 only). # CONSISTENT_FPCSR = 1 # If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed # with a single thread. You can use this flag to avoid the overhead of multi-threading # for small matrix sizes. The default value is 4. # GEMM_MULTITHREAD_THRESHOLD = 4 # If you need a sanity check by comparing against the reference BLAS. It'll be very # slow (not implemented yet). # SANITY_CHECK = 1 # The installation directory. # PREFIX = /opt/OpenBLAS # Common Optimization Flag; # The default -O2 is enough. # Flags for POWER8 are defined in Makefile.power. 
Don't modify COMMON_OPT # COMMON_OPT = -O2 # gfortran option for LAPACK # enable this flag only on 64bit Linux and if you need a thread safe lapack library # Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT # FCOMMON_OPT = -frecursive # Profiling flags COMMON_PROF = -pg # Build Debug version # DEBUG = 1 # Set maximum stack allocation. # The default value is 2048. 0 disable stack allocation a may reduce GER and GEMV # performance. For details, https://github.com/xianyi/OpenBLAS/pull/482 # # MAX_STACK_ALLOC = 0 # Add a prefix or suffix to all exported symbol names in the shared library. # Avoid conflicts with other BLAS libraries, especially when using # 64 bit integer interfaces in OpenBLAS. # For details, https://github.com/xianyi/OpenBLAS/pull/459 # # The same prefix and suffix are also added to the library name, # i.e. you get lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX) rather than libopenblas # # SYMBOLPREFIX= # SYMBOLSUFFIX= # # End of user configuration # OpenBLAS-0.2.20/Makefile.sparc000066400000000000000000000014631313527062700157170ustar00rootroot00000000000000CPP = $(CC) -E RANLIB = ranlib ifdef BINARY64 CCOMMON_OPT += -mcpu=v9 -m64 ifeq ($(COMPILER_F77), g77) FCOMMON_OPT += -mcpu=v9 -m64 endif ifeq ($(COMPILER_F77), f90) FCOMMON_OPT += -xarch=v9 endif else CCOMMON_OPT += -mcpu=v9 ifeq ($(COMPILER_F77), g77) FCOMMON_OPT += -mcpu=v9 endif ifeq ($(COMPILER_F77), f90) FCOMMON_OPT += -xarch=v8plusb endif endif LIBNAME = $(LIBPREFIX).a ifndef SMP LIBCXML = -L/opt/SUNWspro/lib/v9 LIBATLAS = -L$(HOME)/misc/lib -lf77blas -latlas -lm else LIBCXML = -lcxmlp -lots -lm endif ifdef BINARY64 LIBSUNPERF = -L/opt/SUNWspro/lib/v9 -L/opt/SUNWspro/prod/lib/v9 \ -Wl,-R,/opt/SUNWspro/lib/v9 -lsunperf -lompstubs -lfui -lfsu -lsunmath else LIBSUNPERF = -L/opt/SUNWspro/lib -L/opt/SUNWspro/prod/lib \ -Wl,-R,/opt/SUNWspro/lib -lsunperf -lompstubs -lfui -lfsu -lsunmath endifOpenBLAS-0.2.20/Makefile.system000066400000000000000000000536631313527062700161440ustar00rootroot00000000000000# # Include user definition # # TO suppress recursive includes INCLUDED = 1 ifndef TOPDIR TOPDIR = . endif NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib # Default C compiler # - Only set if not specified on the command line or inherited from the environment. # - CC is an implicit variable so neither '?=' or 'ifndef' can be used. # http://stackoverflow.com/questions/4029274/mingw-and-make-variables # - Default value is 'cc' which is not always a valid command (e.g. MinGW). ifeq ($(origin CC),default) CC = gcc # Change the default compile to clang on Mac OSX. # http://stackoverflow.com/questions/714100/os-detecting-makefile UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) CC = clang # EXTRALIB += -Wl,-no_compact_unwind endif endif # Default Fortran compiler (FC) is selected by f_check. 
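# To make the override behaviour above concrete: because CC is only replaced
# when it still carries make's built-in default, a build can simply pass the
# compilers on the command line (mirroring the cross-compiler example
# documented in Makefile.rule), e.g.
#
#   make CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran HOSTCC=gcc
#
# Any value supplied this way, or inherited from the environment, wins over
# the gcc/clang defaults chosen here.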
ifndef MAKEFILE_RULE include $(TOPDIR)/Makefile.rule else include $(TOPDIR)/$(MAKEFILE_RULE) endif # # Beginning of system configuration # ifndef HOSTCC HOSTCC = $(CC) endif ifdef TARGET GETARCH_FLAGS := -DFORCE_$(TARGET) endif # Force fallbacks for 32bit ifeq ($(BINARY), 32) ifeq ($(TARGET), HASWELL) GETARCH_FLAGS := -DFORCE_NEHALEM endif ifeq ($(TARGET), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif ifeq ($(TARGET), BULLDOZER) GETARCH_FLAGS := -DFORCE_BARCELONA endif ifeq ($(TARGET), PILEDRIVER) GETARCH_FLAGS := -DFORCE_BARCELONA endif ifeq ($(TARGET), STEAMROLLER) GETARCH_FLAGS := -DFORCE_BARCELONA endif ifeq ($(TARGET), EXCAVATOR) GETARCH_FLAGS := -DFORCE_BARCELONA endif ifeq ($(TARGET), ZEN) GETARCH_FLAGS := -DFORCE_BARCELONA endif endif #TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. # ifdef TARGET_CORE GETARCH_FLAGS := -DFORCE_$(TARGET_CORE) endif # Force fallbacks for 32bit ifeq ($(BINARY), 32) ifeq ($(TARGET_CORE), HASWELL) GETARCH_FLAGS := -DFORCE_NEHALEM endif ifeq ($(TARGET_CORE), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif ifeq ($(TARGET_CORE), BULLDOZER) GETARCH_FLAGS := -DFORCE_BARCELONA endif ifeq ($(TARGET_CORE), PILEDRIVER) GETARCH_FLAGS := -DFORCE_BARCELONA endif ifeq ($(TARGET_CORE), STEAMROLLER) GETARCH_FLAGS := -DFORCE_BARCELONA endif ifeq ($(TARGET_CORE), EXCAVATOR) GETARCH_FLAGS := -DFORCE_BARCELONA endif ifeq ($(TARGET_CORE), ZEN) GETARCH_FLAGS := -DFORCE_BARCELONA endif endif ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) GETARCH_FLAGS += -DUSE64BITINT endif endif ifndef GEMM_MULTITHREAD_THRESHOLD GEMM_MULTITHREAD_THRESHOLD=4 endif GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) ifeq ($(NO_AVX), 1) GETARCH_FLAGS += -DNO_AVX endif ifeq ($(BINARY), 32) GETARCH_FLAGS += -DNO_AVX endif ifeq ($(NO_AVX2), 1) GETARCH_FLAGS += -DNO_AVX2 endif ifeq ($(DEBUG), 1) GETARCH_FLAGS += -g endif ifeq ($(QUIET_MAKE), 1) MAKE += -s endif ifndef NO_PARALLEL_MAKE NO_PARALLEL_MAKE=0 endif GETARCH_FLAGS += -DNO_PARALLEL_MAKE=$(NO_PARALLEL_MAKE) ifdef MAKE_NB_JOBS GETARCH_FLAGS += -DMAKE_NB_JOBS=$(MAKE_NB_JOBS) endif ifeq ($(HOSTCC), loongcc) GETARCH_FLAGS += -static endif #if don't use Fortran, it will only compile CBLAS. ifeq ($(ONLY_CBLAS), 1) NO_LAPACK = 1 else ONLY_CBLAS = 0 endif # This operation is expensive, so execution should be once. ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 # Generating Makefile.conf and config.h DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf else include $(TOPDIR)/Makefile_kernel.conf endif endif ifndef NUM_THREADS NUM_THREADS = $(NUM_CORES) endif ifeq ($(NUM_THREADS), 1) override USE_THREAD = 0 endif ifdef USE_THREAD ifeq ($(USE_THREAD), 0) SMP = else SMP = 1 endif else ifeq ($(NUM_THREAD), 1) SMP = else SMP = 1 endif endif ifndef NEED_PIC NEED_PIC = 1 endif ARFLAGS = CPP = $(COMPILER) -E AR = $(CROSS_SUFFIX)ar AS = $(CROSS_SUFFIX)as LD = $(CROSS_SUFFIX)ld RANLIB = $(CROSS_SUFFIX)ranlib NM = $(CROSS_SUFFIX)nm DLLWRAP = $(CROSS_SUFFIX)dllwrap OBJCOPY = $(CROSS_SUFFIX)objcopy OBJCONV = $(CROSS_SUFFIX)objconv # For detect fortran failed, only build BLAS. 
ifeq ($(NOFORTRAN), 1) NO_LAPACK = 1 endif # # OS dependent settings # ifeq ($(OSNAME), Darwin) ifndef MACOSX_DEPLOYMENT_TARGET export MACOSX_DEPLOYMENT_TARGET=10.6 endif MD5SUM = md5 -r endif ifeq ($(OSNAME), FreeBSD) MD5SUM = md5 -r endif ifeq ($(OSNAME), NetBSD) MD5SUM = md5 -n endif ifeq ($(OSNAME), Linux) EXTRALIB += -lm NO_EXPRECISION = 1 endif ifeq ($(OSNAME), Android) EXTRALIB += -lm endif ifeq ($(OSNAME), AIX) EXTRALIB += -lm endif ifeq ($(OSNAME), WINNT) NEED_PIC = 0 NO_EXPRECISION = 1 EXTRALIB += -defaultlib:advapi32 SUFFIX = obj PSUFFIX = pobj LIBSUFFIX = a ifeq ($(C_COMPILER), CLANG) CCOMMON_OPT += -DMS_ABI endif ifeq ($(C_COMPILER), GCC) #Test for supporting MS_ABI GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) ifeq ($(GCCVERSIONGT4), 1) # GCC Majar version > 4 # It is compatible with MSVC ABI. CCOMMON_OPT += -DMS_ABI endif ifeq ($(GCCVERSIONGTEQ4), 1) ifeq ($(GCCMINORVERSIONGTEQ7), 1) # GCC Version >=4.7 # It is compatible with MSVC ABI. CCOMMON_OPT += -DMS_ABI endif endif endif # Ensure the correct stack alignment on Win32 # http://permalink.gmane.org/gmane.comp.lib.openblas.general/97 ifeq ($(ARCH), x86) CCOMMON_OPT += -mincoming-stack-boundary=2 FCOMMON_OPT += -mincoming-stack-boundary=2 endif endif ifeq ($(OSNAME), Interix) NEED_PIC = 0 NO_EXPRECISION = 1 INTERIX_TOOL_DIR = /opt/gcc.3.3/i586-pc-interix3/bin endif ifeq ($(OSNAME), CYGWIN_NT) NEED_PIC = 0 NO_EXPRECISION = 1 endif ifneq ($(OSNAME), WINNT) ifneq ($(OSNAME), CYGWIN_NT) ifneq ($(OSNAME), Interix) ifneq ($(OSNAME), Android) ifdef SMP EXTRALIB += -lpthread endif endif endif endif endif # ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT Interix)) OS_WINDOWS=1 endif ifdef QUAD_PRECISION CCOMMON_OPT += -DQUAD_PRECISION NO_EXPRECISION = 1 endif ifneq ($(ARCH), x86) ifneq ($(ARCH), x86_64) NO_EXPRECISION = 1 endif endif ifdef UTEST_CHECK CCOMMON_OPT += -DUTEST_CHECK SANITY_CHECK = 1 endif ifdef SANITY_CHECK CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU) endif MAX_STACK_ALLOC ?= 2048 ifneq ($(MAX_STACK_ALLOC), 0) CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC) endif # # Architecture dependent settings # ifeq ($(ARCH), x86) ifndef BINARY NO_BINARY_MODE = 1 endif ifeq ($(CORE), generic) NO_EXPRECISION = 1 endif ifndef NO_EXPRECISION ifeq ($(F_COMPILER), GFORTRAN) # ifeq logical or. GCC or LSB ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB)) EXPRECISION = 1 CCOMMON_OPT += -DEXPRECISION -m128bit-long-double FCOMMON_OPT += -m128bit-long-double endif ifeq ($(C_COMPILER), CLANG) EXPRECISION = 1 CCOMMON_OPT += -DEXPRECISION FCOMMON_OPT += -m128bit-long-double endif endif endif endif ifeq ($(ARCH), x86_64) ifeq ($(CORE), generic) NO_EXPRECISION = 1 endif ifndef NO_EXPRECISION ifeq ($(F_COMPILER), GFORTRAN) # ifeq logical or. GCC or LSB ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB)) EXPRECISION = 1 CCOMMON_OPT += -DEXPRECISION -m128bit-long-double FCOMMON_OPT += -m128bit-long-double endif ifeq ($(C_COMPILER), CLANG) EXPRECISION = 1 CCOMMON_OPT += -DEXPRECISION FCOMMON_OPT += -m128bit-long-double endif endif endif endif ifeq ($(C_COMPILER), INTEL) CCOMMON_OPT += -wd981 endif ifeq ($(USE_OPENMP), 1) #check ifeq ($(USE_THREAD), 0) $(error OpenBLAS: Cannot set both USE_OPENMP=1 and USE_THREAD=0. The USE_THREAD=0 is only for building single thread version.) endif # ifeq logical or. 
GCC or LSB ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB)) CCOMMON_OPT += -fopenmp endif ifeq ($(C_COMPILER), CLANG) CCOMMON_OPT += -fopenmp endif ifeq ($(C_COMPILER), INTEL) CCOMMON_OPT += -openmp endif ifeq ($(C_COMPILER), PGI) CCOMMON_OPT += -mp endif ifeq ($(C_COMPILER), OPEN64) CCOMMON_OPT += -mp CEXTRALIB += -lstdc++ endif ifeq ($(C_COMPILER), PATHSCALE) CCOMMON_OPT += -mp endif endif ifeq ($(DYNAMIC_ARCH), 1) ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO endif ifeq ($(ARCH), x86_64) DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR endif ifneq ($(NO_AVX2), 1) DYNAMIC_CORE += HASWELL ZEN endif endif # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty ifndef DYNAMIC_CORE override DYNAMIC_ARCH= endif endif ifeq ($(ARCH), ia64) NO_BINARY_MODE = 1 BINARY_DEFINED = 1 ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(C_COMPILER), GCC) # EXPRECISION = 1 # CCOMMON_OPT += -DEXPRECISION endif endif endif ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) NO_BINARY_MODE = 1 endif ifeq ($(ARCH), alpha) NO_BINARY_MODE = 1 BINARY_DEFINED = 1 endif ifeq ($(ARCH), arm) NO_BINARY_MODE = 1 BINARY_DEFINED = 1 CCOMMON_OPT += -marm FCOMMON_OPT += -marm # If softfp abi is mentioned on the command line, force it. ifeq ($(ARM_SOFTFP_ABI), 1) CCOMMON_OPT += -mfloat-abi=softfp FCOMMON_OPT += -mfloat-abi=softfp endif ifeq ($(OSNAME), Android) ifeq ($(ARM_SOFTFP_ABI), 1) EXTRALIB += -lm else EXTRALIB += -Wl,-lm_hard endif endif endif ifeq ($(ARCH), arm64) NO_BINARY_MODE = 1 BINARY_DEFINED = 1 endif # # C Compiler dependent settings # # ifeq logical or. 
GCC or CLANG or LSB # http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG LSB)) CCOMMON_OPT += -Wall COMMON_PROF += -fno-inline NO_UNINITIALIZED_WARN = -Wno-uninitialized ifeq ($(QUIET_MAKE), 1) CCOMMON_OPT += $(NO_UNINITIALIZED_WARN) -Wno-unused endif ifdef NO_BINARY_MODE ifeq ($(ARCH), $(filter $(ARCH),mips64)) ifdef BINARY64 CCOMMON_OPT += -mabi=64 else CCOMMON_OPT += -mabi=n32 endif BINARY_DEFINED = 1 else ifeq ($(ARCH), $(filter $(ARCH),mips)) CCOMMON_OPT += -mabi=32 BINARY_DEFINED = 1 endif ifeq ($(CORE), LOONGSON3A) CCOMMON_OPT += -march=mips64 FCOMMON_OPT += -march=mips64 endif ifeq ($(CORE), LOONGSON3B) CCOMMON_OPT += -march=mips64 FCOMMON_OPT += -march=mips64 endif ifeq ($(CORE), P5600) CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) endif ifeq ($(CORE), I6400) CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS) endif ifeq ($(CORE), P6600) CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS) FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS) endif ifeq ($(OSNAME), AIX) BINARY_DEFINED = 1 endif endif ifndef BINARY_DEFINED ifdef BINARY64 CCOMMON_OPT += -m64 else CCOMMON_OPT += -m32 endif endif endif ifeq ($(C_COMPILER), PGI) ifdef BINARY64 CCOMMON_OPT += -tp p7-64 else CCOMMON_OPT += -tp p7 endif endif ifeq ($(C_COMPILER), PATHSCALE) ifdef BINARY64 CCOMMON_OPT += -m64 else CCOMMON_OPT += -m32 endif endif # # Fortran Compiler dependent settings # ifeq ($(F_COMPILER), FLANG) CCOMMON_OPT += -DF_INTERFACE_FLANG ifdef BINARY64 ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -i8 endif endif FCOMMON_OPT += -Wall else FCOMMON_OPT += -Wall endif ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -fopenmp endif endif ifeq ($(F_COMPILER), G77) CCOMMON_OPT += -DF_INTERFACE_G77 FCOMMON_OPT += -Wall ifndef NO_BINARY_MODE ifdef BINARY64 FCOMMON_OPT += -m64 else FCOMMON_OPT += -m32 endif endif endif ifeq ($(F_COMPILER), G95) CCOMMON_OPT += -DF_INTERFACE_G95 FCOMMON_OPT += -Wall ifndef NO_BINARY_MODE ifdef BINARY64 FCOMMON_OPT += -m64 else FCOMMON_OPT += -m32 endif endif endif ifeq ($(F_COMPILER), GFORTRAN) CCOMMON_OPT += -DF_INTERFACE_GFORT FCOMMON_OPT += -Wall #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran endif ifdef NO_BINARY_MODE ifeq ($(ARCH), $(filter $(ARCH),mips64)) ifdef BINARY64 FCOMMON_OPT += -mabi=64 else FCOMMON_OPT += -mabi=n32 endif else ifeq ($(ARCH), $(filter $(ARCH),mips)) FCOMMON_OPT += -mabi=32 endif else ifdef BINARY64 FCOMMON_OPT += -m64 ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -fdefault-integer-8 endif endif else FCOMMON_OPT += -m32 endif endif ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -fopenmp endif endif ifeq ($(F_COMPILER), INTEL) CCOMMON_OPT += -DF_INTERFACE_INTEL ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -i8 endif endif ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -openmp endif endif ifeq ($(F_COMPILER), FUJITSU) CCOMMON_OPT += -DF_INTERFACE_FUJITSU ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -openmp endif endif ifeq ($(F_COMPILER), IBM) CCOMMON_OPT += -DF_INTERFACE_IBM # FCOMMON_OPT += -qarch=440 ifdef BINARY64 FCOMMON_OPT += -q64 ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -qintsize=8 endif endif else FCOMMON_OPT += -q32 endif ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -openmp endif endif ifeq ($(F_COMPILER), PGI) CCOMMON_OPT += -DF_INTERFACE_PGI 
COMMON_PROF += -DPGICOMPILER ifdef BINARY64 ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -i8 endif endif FCOMMON_OPT += -tp p7-64 else FCOMMON_OPT += -tp p7 endif ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -mp endif endif ifeq ($(F_COMPILER), PATHSCALE) CCOMMON_OPT += -DF_INTERFACE_PATHSCALE ifdef BINARY64 ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -i8 endif endif endif ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -mp endif endif ifeq ($(F_COMPILER), OPEN64) CCOMMON_OPT += -DF_INTERFACE_OPEN64 ifdef BINARY64 ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -i8 endif endif endif ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) ifndef BINARY64 FCOMMON_OPT += -n32 else FCOMMON_OPT += -n64 endif ifeq ($(CORE), LOONGSON3A) FCOMMON_OPT += -loongson3 -static endif ifeq ($(CORE), LOONGSON3B) FCOMMON_OPT += -loongson3 -static endif else ifndef BINARY64 FCOMMON_OPT += -m32 else FCOMMON_OPT += -m64 endif endif ifeq ($(USE_OPENMP), 1) FEXTRALIB += -lstdc++ FCOMMON_OPT += -mp endif endif ifeq ($(C_COMPILER), OPEN64) ifeq ($(ARCH), $(filter $(ARCH),mips64 mips)) ifndef BINARY64 CCOMMON_OPT += -n32 else CCOMMON_OPT += -n64 endif ifeq ($(CORE), LOONGSON3A) CCOMMON_OPT += -loongson3 -static endif ifeq ($(CORE), LOONGSON3B) CCOMMON_OPT += -loongson3 -static endif else ifndef BINARY64 CCOMMON_OPT += -m32 else CCOMMON_OPT += -m64 endif endif endif ifeq ($(C_COMPILER), SUN) CCOMMON_OPT += -w ifeq ($(ARCH), x86) CCOMMON_OPT += -m32 else FCOMMON_OPT += -m64 endif endif ifeq ($(F_COMPILER), SUN) CCOMMON_OPT += -DF_INTERFACE_SUN ifeq ($(ARCH), x86) FCOMMON_OPT += -m32 else FCOMMON_OPT += -m64 endif ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -xopenmp=parallel endif endif ifeq ($(F_COMPILER), COMPAQ) CCOMMON_OPT += -DF_INTERFACE_COMPAQ ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -openmp endif endif ifdef BINARY64 ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) CCOMMON_OPT += #-DUSE64BITINT endif endif endif ifeq ($(NEED_PIC), 1) ifeq ($(C_COMPILER), IBM) CCOMMON_OPT += -qpic=large else CCOMMON_OPT += -fPIC endif ifeq ($(F_COMPILER), SUN) FCOMMON_OPT += -pic else FCOMMON_OPT += -fPIC endif endif ifeq ($(DYNAMIC_ARCH), 1) CCOMMON_OPT += -DDYNAMIC_ARCH endif ifeq ($(NO_LAPACK), 1) CCOMMON_OPT += -DNO_LAPACK #Disable LAPACK C interface NO_LAPACKE = 1 endif ifeq ($(NO_LAPACKE), 1) CCOMMON_OPT += -DNO_LAPACKE endif ifeq ($(NO_AVX), 1) CCOMMON_OPT += -DNO_AVX endif ifeq ($(ARCH), x86) CCOMMON_OPT += -DNO_AVX endif ifeq ($(NO_AVX2), 1) CCOMMON_OPT += -DNO_AVX2 endif ifdef SMP CCOMMON_OPT += -DSMP_SERVER ifeq ($(ARCH), mips64) ifneq ($(CORE), LOONGSON3B) USE_SIMPLE_THREADED_LEVEL3 = 1 endif endif ifeq ($(USE_OPENMP), 1) # USE_SIMPLE_THREADED_LEVEL3 = 1 # NO_AFFINITY = 1 CCOMMON_OPT += -DUSE_OPENMP endif ifeq ($(BIGNUMA), 1) CCOMMON_OPT += -DBIGNUMA endif endif ifeq ($(NO_WARMUP), 1) CCOMMON_OPT += -DNO_WARMUP endif ifeq ($(CONSISTENT_FPCSR), 1) CCOMMON_OPT += -DCONSISTENT_FPCSR endif # Only for development # CCOMMON_OPT += -DPARAMTEST # CCOMMON_OPT += -DPREFETCHTEST # CCOMMON_OPT += -DNO_SWITCHING # USE_PAPI = 1 ifdef USE_PAPI CCOMMON_OPT += -DUSE_PAPI EXTRALIB += -lpapi -lperfctr endif ifdef DYNAMIC_THREADS CCOMMON_OPT += -DDYNAMIC_THREADS endif CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS) ifdef USE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 endif ifndef SYMBOLPREFIX SYMBOLPREFIX = endif ifndef SYMBOLSUFFIX SYMBOLSUFFIX = endif ifndef LIBNAMESUFFIX LIBPREFIX = lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX) else LIBPREFIX = lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX) endif 
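# Worked example of the name composition above (illustrative values only):
# with SYMBOLPREFIX empty, SYMBOLSUFFIX=64_ and LIBNAMESUFFIX unset,
# LIBPREFIX expands to "libopenblas64_", so the resulting libraries are
# named libopenblas64_*.a / libopenblas64_*.so; setting LIBNAMESUFFIX=omp
# instead gives "libopenblas_omp", as described in Makefile.rule.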
KERNELDIR = $(TOPDIR)/kernel/$(ARCH) include $(TOPDIR)/Makefile.$(ARCH) CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" ifeq ($(CORE), PPC440) CCOMMON_OPT += -DALLOC_QALLOC endif ifeq ($(CORE), PPC440FP2) STATIC_ALLOCATION = 1 endif ifneq ($(OSNAME), Linux) NO_AFFINITY = 1 endif ifneq ($(ARCH), x86_64) ifneq ($(ARCH), x86) ifneq ($(CORE), LOONGSON3B) NO_AFFINITY = 1 endif endif endif ifdef NO_AFFINITY CCOMMON_OPT += -DNO_AFFINITY endif ifdef FUNCTION_PROFILE CCOMMON_OPT += -DFUNCTION_PROFILE endif ifdef HUGETLB_ALLOCATION CCOMMON_OPT += -DALLOC_HUGETLB endif ifdef HUGETLBFILE_ALLOCATION CCOMMON_OPT += -DALLOC_HUGETLBFILE -DHUGETLB_FILE_NAME=$(HUGETLBFILE_ALLOCATION) endif ifdef STATIC_ALLOCATION CCOMMON_OPT += -DALLOC_STATIC endif ifdef DEVICEDRIVER_ALLOCATION CCOMMON_OPT += -DALLOC_DEVICEDRIVER -DDEVICEDRIVER_NAME=\"/dev/mapper\" endif ifdef MIXED_MEMORY_ALLOCATION CCOMMON_OPT += -DMIXED_MEMORY_ALLOCATION endif ifeq ($(OSNAME), SunOS) TAR = gtar PATCH = gpatch GREP = ggrep AWK = nawk else TAR = tar PATCH = patch GREP = grep AWK = awk endif ifndef MD5SUM MD5SUM = md5sum endif REVISION = -r$(VERSION) MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION))) ifeq ($(DEBUG), 1) COMMON_OPT += -g endif ifeq ($(DEBUG), 1) FCOMMON_OPT += -g endif ifndef COMMON_OPT COMMON_OPT = -O2 endif ifndef FCOMMON_OPT FCOMMON_OPT = -O2 -frecursive endif override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) #MAKEOVERRIDES = #For LAPACK Fortran codes. #Disable -fopenmp for LAPACK Fortran codes on Windows. 
ifdef OS_WINDOWS LAPACK_FFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FFLAGS)) LAPACK_FPFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FPFLAGS)) else LAPACK_FFLAGS := $(FFLAGS) LAPACK_FPFLAGS := $(FPFLAGS) endif LAPACK_CFLAGS = $(CFLAGS) LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) LAPACK_CFLAGS += -DLAPACK_ILP64 endif endif ifdef OS_WINDOWS LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS endif ifeq ($(C_COMPILER), LSB) LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE endif ifndef SUFFIX SUFFIX = o endif ifndef PSUFFIX PSUFFIX = po endif ifndef LIBSUFFIX LIBSUFFIX = a endif ifneq ($(DYNAMIC_ARCH), 1) ifndef SMP LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX) LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX) else LIBNAME = $(LIBPREFIX)_$(LIBCORE)p$(REVISION).$(LIBSUFFIX) LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)p$(REVISION)_p.$(LIBSUFFIX) endif else ifndef SMP LIBNAME = $(LIBPREFIX)$(REVISION).$(LIBSUFFIX) LIBNAME_P = $(LIBPREFIX)$(REVISION)_p.$(LIBSUFFIX) else LIBNAME = $(LIBPREFIX)p$(REVISION).$(LIBSUFFIX) LIBNAME_P = $(LIBPREFIX)p$(REVISION)_p.$(LIBSUFFIX) endif endif LIBDLLNAME = $(LIBPREFIX).dll LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) LIBZIPNAME = $(LIBNAME:.$(LIBSUFFIX)=.zip) LIBS = $(TOPDIR)/$(LIBNAME) LIBS_P = $(TOPDIR)/$(LIBNAME_P) LIB_COMPONENTS = BLAS ifneq ($(NO_CBLAS), 1) LIB_COMPONENTS += CBLAS endif ifneq ($(NO_LAPACK), 1) LIB_COMPONENTS += LAPACK ifneq ($(NO_LAPACKE), 1) LIB_COMPONENTS += LAPACKE endif ifeq ($(BUILD_RELAPACK), 1) LIB_COMPONENTS += ReLAPACK endif endif ifeq ($(ONLY_CBLAS), 1) LIB_COMPONENTS = CBLAS endif export OSNAME export ARCH export CORE export LIBCORE export PGCPATH export CONFIG export CC export FC export BU export FU export NEED2UNDERSCORES export USE_THREAD export NUM_THREADS export NUM_CORES export SMP export MAKEFILE_RULE export NEED_PIC export BINARY export BINARY32 export BINARY64 export F_COMPILER export C_COMPILER export USE_OPENMP export CROSS export CROSS_SUFFIX export NOFORTRAN export NO_FBLAS export EXTRALIB export CEXTRALIB export FEXTRALIB export HAVE_SSE export HAVE_SSE2 export HAVE_SSE3 export HAVE_SSSE3 export HAVE_SSE4_1 export HAVE_SSE4_2 export HAVE_SSE4A export HAVE_SSE5 export HAVE_AVX export HAVE_VFP export HAVE_VFPV3 export HAVE_VFPV4 export HAVE_NEON export HAVE_MSA export MSA_FLAGS export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE export SGEMM_UNROLL_M export SGEMM_UNROLL_N export DGEMM_UNROLL_M export DGEMM_UNROLL_N export QGEMM_UNROLL_M export QGEMM_UNROLL_N export CGEMM_UNROLL_M export CGEMM_UNROLL_N export ZGEMM_UNROLL_M export ZGEMM_UNROLL_N export XGEMM_UNROLL_M export XGEMM_UNROLL_N export CGEMM3M_UNROLL_M export CGEMM3M_UNROLL_N export ZGEMM3M_UNROLL_M export ZGEMM3M_UNROLL_N export XGEMM3M_UNROLL_M export XGEMM3M_UNROLL_N ifdef USE_CUDA export CUDADIR export CUCC export CUFLAGS export CULIB endif .SUFFIXES: .$(PSUFFIX) .$(SUFFIX) .f .f.$(SUFFIX): $(FC) $(FFLAGS) -c $< -o $(@F) .f.$(PSUFFIX): $(FC) $(FPFLAGS) -pg -c $< -o $(@F) ifdef BINARY64 PATHSCALEPATH = /opt/pathscale/lib/3.1 PGIPATH = /opt/pgi/linux86-64/7.1-5/lib else PATHSCALEPATH = /opt/pathscale/lib/3.1/32 PGIPATH = /opt/pgi/linux86/7.1-5/lib endif ACMLPATH = /opt/acml/4.3.0 ifneq ($(OSNAME), Darwin) MKLPATH = /opt/intel/mkl/10.2.2.025/lib else MKLPATH = /Library/Frameworks/Intel_MKL.framework/Versions/10.0.1.014/lib endif ATLASPATH = 
/opt/atlas/3.9.17/opteron FLAMEPATH = $(HOME)/flame/lib ifneq ($(OSNAME), SunOS) SUNPATH = /opt/sunstudio12.1 else SUNPATH = /opt/SUNWspro endif OpenBLAS-0.2.20/Makefile.tail000066400000000000000000000517521313527062700155460ustar00rootroot00000000000000SBLASOBJS_P = $(SBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) DBLASOBJS_P = $(DBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) QBLASOBJS_P = $(QBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) CBLASOBJS_P = $(CBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) ZBLASOBJS_P = $(ZBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) XBLASOBJS_P = $(XBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX)) HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX)) BLASOBJS = $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) BLASOBJS_P = $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) ifdef EXPRECISION BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) endif ifdef QUAD_PRECISION BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) endif $(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX $(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX $(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX $(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX $(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX $(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX $(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(DBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(QBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(CBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(ZBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(XBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) libs :: $(BLASOBJS) $(COMMONOBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ prof :: $(BLASOBJS_P) $(COMMONOBJS_P) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME_P) $^ hpl :: $(HPLOBJS) $(COMMONOBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ hpl_p :: $(HPLOBJS_P) $(COMMONOBJS_P) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME_P) $^ kernel :: $(BLASOBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ commonlibs :: $(COMMONOBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ commonprof :: $(COMMONOBJS_P) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME_P) $^ quick : $(MAKE) -C $(TOPDIR) libs bms.$(SUFFIX):bm.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -c $< -o $(@F) bmd.$(SUFFIX):bm.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) bmd-k.$(SUFFIX):bm-k.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) ifdef QUAD_PRECISION bmq.$(SUFFIX):bmq.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) bmx.$(SUFFIX):bmx.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -DXDOUBLE -DCOMPLEX -c $< -o $(@F) else bmq.$(SUFFIX):bm.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) bmx.$(SUFFIX):bmz.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -DXDOUBLE -DCOMPLEX -c $< -o $(@F) endif bmc.$(SUFFIX):bmz.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -UDOUBLE -DCOMPLEX -c $< -o $(@F) bmz.$(SUFFIX):bmz.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -DDOUBLE -DCOMPLEX -c $< -o $(@F) bmd_nn.$(SUFFIX):bm_special.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -DNN -c $< -o $(@F) bmd_nt.$(SUFFIX):bm_special.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -DNT -c $< -o $(@F) bmd_tn.$(SUFFIX):bm_special.c 
$(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -DTN -c $< -o $(@F) bmd_tt.$(SUFFIX):bm_special.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -DTT -c $< -o $(@F) bm-phy.$(SUFFIX):bm-phy.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) bms.$(PSUFFIX):bm.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(PFLAGS) -UDOUBLE -UCOMPLEX -c $< -o $(@F) bmd.$(PSUFFIX):bm.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(PFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) ifdef QUAD_PRECISION bmq.$(PSUFFIX):bmq.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(PFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) bmx.$(PSUFFIX):bmx.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(PFLAGS) -DXDOUBLE -DCOMPLEX -c $< -o $(@F) else bmq.$(PSUFFIX):bm.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(PFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) bmx.$(PSUFFIX):bmz.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(PFLAGS) -DXDOUBLE -DCOMPLEX -c $< -o $(@F) endif bmc.$(PSUFFIX):bmz.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(PFLAGS) -UDOUBLE -DCOMPLEX -c $< -o $(@F) bmz.$(PSUFFIX):bmz.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(PFLAGS) -DDOUBLE -DCOMPLEX -c $< -o $(@F) bms : bms.$(SUFFIX) $(SBLASOBJS) $(COMMONOBJS) $(SOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) bmd : bmd.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) -lm bmd-k : bmd-k.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) -lm bmq : bmq.$(SUFFIX) $(QBLASOBJS) $(COMMONOBJS) $(QOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) bmc : bmc.$(SUFFIX) $(CBLASOBJS) $(COMMONOBJS) $(COBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) $(FEXTRALIB) bmz : bmz.$(SUFFIX) $(ZBLASOBJS) $(COMMONOBJS) $(ZOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) bmx : bmx.$(SUFFIX) $(XBLASOBJS) $(COMMONOBJS) $(XOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) bmd_nn : bmd_nn.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) bmd_nt : bmd_nt.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) bmd_tn : bmd_tn.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) bmd_tt : bmd_tt.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) bm-phy:bm-phy.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) bmcc : bmcc.$(SUFFIX) $(CBLASOBJS) $(COMMONOBJS) $(COBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) bmzc : bmzc.$(SUFFIX) $(ZBLASOBJS) $(COMMONOBJS) $(ZOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) bms.prof : bms.$(PSUFFIX) $(SBLASOBJS_P) $(COMMONOBJS_P) $(SOBJS) $(OBJS) $(LIBS_P) $(CC) -o $(@F) $(PFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) bmd.prof : bmd.$(PSUFFIX) $(DBLASOBJS_P) $(COMMONOBJS_P) $(DOBJS) $(OBJS) $(LIBS_P) $(CC) -o $(@F) $(PFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) bmq.prof : bmq.$(PSUFFIX) $(QBLASOBJS_P) $(COMMONOBJS_P) $(QOBJS) $(OBJS) $(LIBS_P) $(CC) -o $(@F) $(PFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) bmc.prof : bmc.$(PSUFFIX) $(CBLASOBJS_P) $(COMMONOBJS) $(COBJS) $(OBJS) $(LIBS_P) $(CC) -o $(@F) $(PFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) bmz.prof : bmz.$(PSUFFIX) $(ZBLASOBJS_P) $(COMMONOBJS) 
$(ZOBJS) $(OBJS) $(LIBS_P) $(CC) -o $(@F) $(PFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) bmx.prof : bmz.$(PSUFFIX) $(XBLASOBJS_P) $(COMMONOBJS) $(XOBJS) $(OBJS) $(LIBS_P) $(CC) -o $(@F) $(PFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) bms.cxml : bms.$(SUFFIX) $(SOBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBCXML) bmd.cxml : bmd.$(SUFFIX) $(DOBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBCXML) bmc.cxml : bmc.$(SUFFIX) $(COBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBCXML) bmz.cxml : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBCXML) bms.scsl : bms.$(SUFFIX) $(SOBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSCSL) bmd.scsl : bmd.$(SUFFIX) $(DOBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSCSL) bmc.scsl : bmc.$(SUFFIX) $(COBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSCSL) bmz.scsl : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSCSL) bms.acml : bms.$(SUFFIX) $(SOBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) bmd.acml : bmd.$(SUFFIX) $(DOBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) bmc.acml : bmc.$(SUFFIX) $(COBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) bmz.acml : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) bms.sun : bms.$(SUFFIX) $(SOBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) $(EXTRALIB) $(CEXTRALIB) bmd.sun : bmd.$(SUFFIX) $(DOBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) $(EXTRALIB) $(CEXTRALIB) bmc.sun : bmc.$(SUFFIX) $(COBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) $(EXTRALIB) $(CEXTRALIB) bmz.sun : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) $(EXTRALIB) $(CEXTRALIB) bms.atlas : bms.$(SUFFIX) $(SOBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) bmd.atlas : bmd.$(SUFFIX) $(DOBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) bmc.atlas : bmc.$(SUFFIX) $(COBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) bmz.atlas : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) bms.essl : bms.$(SUFFIX) $(SOBJS) $(OBJS) $(CC) $(FCOMMON_OPT) -o $(@F) $^ $(LIBESSL) bmd.essl : bmd.$(SUFFIX) $(DOBJS) $(OBJS) $(CC) $(CCOMMON_OPT) -o $(@F) $^ $(LIBESSL) bmc.essl : bmc.$(SUFFIX) $(COBJS) $(OBJS) $(F77) $(CCOMMON_OPT) -o $(@F) $^ $(LIBESSL) bmz.essl : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) $(CC) $(CCOMMON_OPT) -o $(@F) $^ $(LIBESSL) bms.flame : bms.$(SUFFIX) $(SOBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) bmd.flame : bmd.$(SUFFIX) $(DOBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) bmc.flame : bmc.$(SUFFIX) $(COBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) bmz.flame : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) bms.flame.prof : bms.$(SUFFIX) $(SOBJS) $(OBJS_P) $(F77) -o $(@F) $(PFLAGS) $^ $(LIBFLAME) bmd.flame.prof : bmd.$(SUFFIX) $(DOBJS) $(OBJS_P) $(F77) -o $(@F) $(PFLAGS) $^ $(LIBFLAME) bmc.flame.prof : bmc.$(SUFFIX) $(COBJS) $(OBJS_P) $(F77) -o $(@F) $(PFLAGS) $^ $(LIBFLAME) bmz.flame.prof : bmz.$(SUFFIX) $(ZOBJS) $(OBJS_P) $(F77) -o $(@F) $(PFLAGS) $^ $(LIBFLAME) bms.mkl : bms.$(SUFFIX) $(SOBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) bmd.mkl : bmd.$(SUFFIX) $(DOBJS) $(OBJS) $(CC) -static -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) bmc.mkl : bmc.$(SUFFIX) $(COBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) bmz.mkl : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) bmq.mkl : bmq.$(SUFFIX) $(QOBJS) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) bms.mkl.prof : bms.$(PSUFFIX) 
$(SOBJS) $(OBJS) $(CC) -o $(@F) $(PFLAGS) $^ $(LIBMKL) bmd.mkl.prof : bmd.$(PSUFFIX) $(DOBJS) $(OBJS) $(CC) -o $(@F) $(PFLAGS) $^ $(LIBMKL) bmc.mkl.prof : bmc.$(PSUFFIX) $(COBJS) $(OBJS) $(CC) -o $(@F) $(PFLAGS) $^ $(LIBMKL) bmz.mkl.prof : bmz.$(PSUFFIX) $(ZOBJS) $(OBJS) $(CC) -o $(@F) $(PFLAGS) $^ $(LIBMKL) bms.mlib : bms.$(SUFFIX) $(SOBJS) $(OBJS) $(F77) -o $(@F) $(CFLAGS) $^ $(LIBMLIB) bmd.mlib : bmd.$(SUFFIX) $(DOBJS) $(OBJS) $(F77) -o $(@F) $(CFLAGS) $^ $(LIBMLIB) bmc.mlib : bmc.$(SUFFIX) $(COBJS) $(OBJS) $(F77) -o $(@F) $(CFLAGS) $^ $(LIBMLIB) bmz.mlib : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) $(F77) -o $(@F) $(CFLAGS) $^ $(LIBMLIB) bms.veclib : bms.$(SUFFIX) $(SOBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBVECLIB) bmd.veclib : bmd.$(SUFFIX) $(DOBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBVECLIB) bmc.veclib : bmc.$(SUFFIX) $(COBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBVECLIB) bmz.veclib : bmz.$(SUFFIX) $(ZOBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBVECLIB) bms.fuji : bms.$(SUFFIX) $(SOBJS) ifndef SMP fcc -KV9FMADD -SSL2 -o $(@F) $^ else fcc -KV9FMADD -SSL2BLAMP -o $(@F) $^ endif bmd.fuji : bmd.$(SUFFIX) $(DOBJS) ifndef SMP fcc -KV9FMADD -SSL2 -o $(@F) $^ else fcc -KV9FMADD -SSL2BLAMP -o $(@F) $^ endif bmc.fuji : bmc.$(SUFFIX) $(COBJS) ifndef SMP fcc -KV9FMADD -SSL2 -o $(@F) $^ else fcc -KV9FMADD -SSL2BLAMP -o $(@F) $^ endif bmz.fuji : bmz.$(SUFFIX) $(ZOBJS) ifndef SMP fcc -KV9FMADD -SSL2 -o $(@F) $^ else fcc -KV9FMADD -SSL2BLAMP -o $(@F) $^ endif bench: bench.$(SUFFIX) $(BLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) bench.$(SUFFIX): bench.c $(CC) -c -o $(@F) $(CFLAGS) $^ bench_old: bench_old.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) kbench: kbench.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) prebench: prebench.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) kbench_rank_k: kbench_rank_k.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) smallbench: smallbench.$(SUFFIX) $(BLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) smallbench.mkl: smallbench.$(SUFFIX) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) bench.sun: bench.$(SUFFIX) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) $(EXTRALIB) $(CEXTRALIB) bench.cxml: bench.$(SUFFIX) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBCXML) bench.atlas: bench.$(SUFFIX) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) bench.essl: bench.$(SUFFIX) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBESSL) ../../level1/others/libmisc.$(LIBSUFFIX) bench.scsl: bench.$(SUFFIX) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSCSL) $(EXTRALIB) $(CEXTRALIB) bench.acml: bench.$(SUFFIX) $(OBJS) $(CC) -static -o $(@F) $(CFLAGS) $^ $(LIBACML) $(EXTRALIB) $(CEXTRALIB) bench.flame: bench.$(SUFFIX) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) $(EXTRALIB) $(CEXTRALIB) kbench.mkl: kbench.$(SUFFIX) $(OBJS) $(CC) -static -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) bench.mkl: bench.$(SUFFIX) $(OBJS) $(CC) -static -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) bench_old.mkl: bench_old.$(SUFFIX) $(OBJS) $(CC) -static -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) bench.mlib: bench.$(SUFFIX) $(OBJS) $(F77) -o $(@F) $(CFLAGS) $^ $(LIBMLIB) bench.veclib: bench.$(SUFFIX) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBVECLIB) params : params.$(SUFFIX) 
$(SBLASOBJS) $(COMMONOBJS) $(SOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) paramd : paramd.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) paramq : paramq.$(SUFFIX) $(QBLASOBJS) $(COMMONOBJS) $(QOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) paramc : paramc.$(SUFFIX) $(CBLASOBJS) $(COMMONOBJS) $(COBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) paramz : paramz.$(SUFFIX) $(ZBLASOBJS) $(COMMONOBJS) $(ZOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) paramx : paramx.$(SUFFIX) $(XBLASOBJS) $(COMMONOBJS) $(XOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) params-ex : params-ex.$(SUFFIX) $(SBLASOBJS) $(COMMONOBJS) $(SOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) paramd-ex : paramd-ex.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) paramq-ex : paramq-ex.$(SUFFIX) $(QBLASOBJS) $(COMMONOBJS) $(QOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) paramc-ex : paramc-ex.$(SUFFIX) $(CBLASOBJS) $(COMMONOBJS) $(COBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) paramz-ex : paramz-ex.$(SUFFIX) $(ZBLASOBJS) $(COMMONOBJS) $(ZOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) paramx-ex : paramx-ex.$(SUFFIX) $(XBLASOBJS) $(COMMONOBJS) $(XOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) params.atlas : params.$(SUFFIX) $(OBJS) $(SOBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) paramd.atlas : paramd.$(SUFFIX) $(OBJS) $(DOBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) paramc.atlas : paramc.$(SUFFIX) $(OBJS) $(COBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) paramz.atlas : paramz.$(SUFFIX) $(OBJS) $(ZOBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) params.sun : params.$(SUFFIX) $(OBJS) $(SOBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) paramd.sun : paramd.$(SUFFIX) $(OBJS) $(DOBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) paramc.sun : paramc.$(SUFFIX) $(OBJS) $(COBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) paramz.sun : paramz.$(SUFFIX) $(OBJS) $(ZOBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) params.essl : params.$(SUFFIX) $(OBJS) $(SOBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBESSL) paramd.essl : paramd.$(SUFFIX) $(OBJS) $(DOBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBESSL) paramc.essl : paramc.$(SUFFIX) $(OBJS) $(COBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBESSL) paramz.essl : paramz.$(SUFFIX) $(OBJS) $(ZOBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBESSL) params.mkl : params.$(SUFFIX) $(OBJS) $(SOBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) paramd.mkl : paramd.$(SUFFIX) $(OBJS) $(DOBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) paramc.mkl : paramc.$(SUFFIX) $(OBJS) $(COBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) paramz.mkl : paramz.$(SUFFIX) $(OBJS) $(ZOBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) params.acml : params.$(SUFFIX) $(OBJS) $(SOBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) paramd.acml : paramd.$(SUFFIX) $(OBJS) $(DOBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) paramc.acml : paramc.$(SUFFIX) $(OBJS) $(COBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) paramz.acml : paramz.$(SUFFIX) $(OBJS) $(ZOBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) params.flame : params.$(SUFFIX) $(OBJS) $(SOBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) $(EXTRALIB) $(CEXTRALIB) paramd.flame : paramd.$(SUFFIX) $(OBJS) $(DOBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) $(EXTRALIB) 
$(CEXTRALIB) paramc.flame : paramc.$(SUFFIX) $(OBJS) $(COBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) $(EXTRALIB) $(CEXTRALIB) paramz.flame : paramz.$(SUFFIX) $(OBJS) $(ZOBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) $(EXTRALIB) $(CEXTRALIB) params.$(SUFFIX):param.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -c $< -o $(@F) paramd.$(SUFFIX):param.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) paramq.$(SUFFIX):param.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) paramc.$(SUFFIX):paramz.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -UDOUBLE -DCOMPLEX -c $< -o $(@F) paramz.$(SUFFIX):paramz.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -DDOUBLE -DCOMPLEX -c $< -o $(@F) paramx.$(SUFFIX):paramz.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -DXDOUBLE -DCOMPLEX -c $< -o $(@F) params-ex.$(SUFFIX):param-ex.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -c $< -o $(@F) paramd-ex.$(SUFFIX):param-ex.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) paramq-ex.$(SUFFIX):param-ex.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) paramc-ex.$(SUFFIX):paramz-ex.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -UDOUBLE -DCOMPLEX -c $< -o $(@F) paramz-ex.$(SUFFIX):paramz-ex.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -DDOUBLE -DCOMPLEX -c $< -o $(@F) paramx-ex.$(SUFFIX):paramz-ex.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -DXDOUBLE -DCOMPLEX -c $< -o $(@F) gen_insn_flash.c : echo '#include ' > gen_insn_flash.c echo '#include ' >> gen_insn_flash.c echo '#define ICACHE_SIZE ( 256 << 10)' >> gen_insn_flash.c echo 'int main(void){' >> gen_insn_flash.c echo 'int i;' >> gen_insn_flash.c echo '#ifdef __alpha' >> gen_insn_flash.c echo 'printf(".set noat;.set noreorder;\n");' >> gen_insn_flash.c echo 'printf(".arch ev6;.text;.align 5\n");' >> gen_insn_flash.c echo 'printf(".globl insn_flash\n");' >> gen_insn_flash.c echo 'printf(".ent insn_flash\n");' >> gen_insn_flash.c echo 'printf("insn_flash:\n");' >> gen_insn_flash.c echo 'for (i = 0; i < ICACHE_SIZE / 4; i++)' >> gen_insn_flash.c echo 'printf("br 1f\n 1:\n");' >> gen_insn_flash.c echo 'printf(".align 5;ret;.end insn_flash\n");'>> gen_insn_flash.c echo '#else' >> gen_insn_flash.c echo 'printf(".text;.align 32\n");' >> gen_insn_flash.c echo 'printf(".globl insn_flash\n");' >> gen_insn_flash.c echo 'printf("insn_flash:\n");' >> gen_insn_flash.c echo 'for (i = 0; i < ICACHE_SIZE / 2; i++)' >> gen_insn_flash.c echo 'printf("jmp 1f\n 1:\n");' >> gen_insn_flash.c echo 'printf(".align 32;ret\n");' >> gen_insn_flash.c echo '#endif' >> gen_insn_flash.c echo 'return 0;' >> gen_insn_flash.c echo '}' >> gen_insn_flash.c insn_flash.$(SUFFIX) : gen_insn_flash ./gen_insn_flash > temp.s $(AS) -o $(@F) temp.s rm -f temp.s dummy : clean :: @if test -d $(ARCH); then \ (cd $(ARCH) && $(MAKE) clean) \ fi @find . 
-name '*.o' | xargs rm -rf @rm -rf *.a *.s *.po *.obj *.i *.so core core.* gmon.out *.cso \ *.csx *.is *~ *.exe *.flame *.pdb *.dwf \ gen_insn_flash.c gen_insn_flash *.stackdump *.dll *.exp *.lib \ *.pc *.pcl *.def *.i *.prof linktest.c \ bms bmd bmc bmz bmq bmx \ params paramd paramc paramz paramq paramx \ params-ex paramd-ex paramc-ex paramz-ex paramq-ex paramx-ex \ bench tpp kbench kbench2 \ *.mkl *.sun *.acml *.cxml *.essl *.atlas *.scsl *.mlib *.veclib *.fuji OpenBLAS-0.2.20/Makefile.x86000066400000000000000000000034541313527062700152360ustar00rootroot00000000000000# COMPILER_PREFIX = mingw32- ifeq ($(OSNAME), Interix) ARFLAGS = -m x86 endif ifndef SMP LIBMKL = -L$(MKLPATH)/32 -Wl,-rpath,$(MKLPATH)/32 -lmkl_intel -lmkl_sequential -lmkl_core -lguide -lpthread -lm else LIBMKL = -L$(MKLPATH)/32 -Wl,-rpath,$(MKLPATH)/32 -lmkl_intel -lmkl_intel_thread -lmkl_core -lguide -lpthread -lm endif # LIBMKL = -L$(MKLPATH)/32 -lmkl_lapack -lmkl_ia32 -lguide -lpthread -lm ifndef SMP LIBATLAS = -L$(ATLAS) -lf77blas -latlas -lg2c -lm else LIBATLAS = -L$(ATLAS) -lptf77blas -latlas -lpthread -lg2c -lm endif ifeq ($(COMPILER_F77), g77) LIBACML = -L$(ACMLPATH)/gnu32/lib -Wl,-rpath,$(ACMLPATH)/gnu32/lib -lacml -lg2c endif LIBFLAME = -L$(FLAMEPATH) -llapack2flame -lflame-lapack -lflame-base $(LIBS) ifeq ($(F_COMPILER), GFORTRAN) ifndef SMP LIBACML = -L$(ACMLPATH)/gfortran32/lib -Wl,-rpath,$(ACMLPATH)/gfortran32/lib -lacml -lgfortran -lm else LIBACML = -L$(ACMLPATH)/gfortran32_mp/lib -Wl,-rpath,$(ACMLPATH)/gfortran32_mp/lib -lacml_mp -lgfortran -lgomp -lm endif endif ifeq ($(COMPILER_F77), pgf77) LIBACML = -L$(ACMLPATH)/pgi32/lib -lacml -L/opt/pgi/linux86-64/5.2/lib -lpgftnrtl -lnspgc -lpgc endif ifeq ($(F_COMPILER), PATHSCALE) ifndef SMP LIBACML = -L$(ACMLPATH)/pathscale32/lib -Wl,-rpath,$(ACMLPATH)/pathscale32/lib -lacml -Wl,-rpath,$(PATHSCALEPATH) -L$(PATHSCALEPATH) -lpathfortran -lm else LIBACML = -L$(ACMLPATH)/pathscale32_mp/lib -Wl,-rpath,$(ACMLPATH)/pathscale32_mp/lib -lacml_mp -Wl,-rpath,$(PATHSCALEPATH) -L$(PATHSCALEPATH) -lopenmp -lpathfortran -lm endif endif LIBSUNPERF = -L/opt/SUNWspro/lib/sse2 -Wl,-R,/opt/SUNWspro/lib/sse2 -lsunperf LIBVECLIB = /System/Library/Frameworks/vecLib.framework/Versions/Current/vecLib ifndef SMP LIBATLAS = -L$(ATLASPATH)/32 -lcblas -lf77blas -latlas -lm else LIBATLAS = -L$(ATLASPATH)/32 -lptf77blas -lptatlas -lpthread -lm endif OpenBLAS-0.2.20/Makefile.x86_64000066400000000000000000000063371313527062700155520ustar00rootroot00000000000000# CCOMMON_OPT += -DFASTCPU ifeq ($(OSNAME), SunOS) ifdef BINARY64 ifeq ($(F_COMPILER), SUN) FCOMMON_OPT += -m64 endif endif endif ifeq ($(OSNAME), Interix) ARFLAGS = -m x64 endif ifeq ($(OSNAME), Darwin) ifndef SMP LIBMKL = -L$(MKLPATH)/em64t -Wl,-rpath,$(MKLPATH)/em64t -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lguide -lpthread -lm else LIBMKL = -L$(MKLPATH)/em64t -Wl,-rpath,$(MKLPATH)/em64t -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lguide -lpthread -lm endif else ifndef SMP LIBMKL = -L$(MKLPATH)/em64t -Wl,-rpath,$(MKLPATH)/em64t -Wl,--start-group -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -Wl,--end-group -lguide -lpthread -lm else LIBMKL = -L$(MKLPATH)/em64t -Wl,-rpath,$(MKLPATH)/em64t -Wl,--start-group -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -Wl,--end-group -lguide -lpthread -lm endif endif ifndef SMP LIBATLAS = -L$(ATLASPATH)64 -llapack -lcblas -lf77blas -latlas -lm else LIBATLAS = -L$(ATLASPATH)64 -llapack -lptcblas -lptf77blas -latlas -lpthread -lm endif LIBFLAME = -L$(FLAMEPATH) -llapack2flame -lflame 
$(TOPDIR)/$(LIBNAME) -lgfortran -lpthread -lm ifeq ($(F_COMPILER), g77) LIBACML = -L$(ACMLPATH)/gnu64/lib -Wl,-rpath,$(ACMLPATH)/gnu64/lib -lacml -lacml_mv -lg2c -lm endif ifeq ($(F_COMPILER), GFORTRAN) ifndef SMP LIBACML = -L$(ACMLPATH)/gfortran64/lib -Wl,-rpath,$(ACMLPATH)/gfortran64/lib -lacml -lacml_mv -lgfortran -lm else LIBACML = -L$(ACMLPATH)/gfortran64_mp/lib -Wl,-rpath,$(ACMLPATH)/gfortran64_mp/lib -lacml_mp -lacml_mv -lgfortran -lgomp -lm endif endif ifeq ($(F_COMPILER), INTEL) ifndef SMP LIBACML = -L$(ACMLPATH)/ifort64/lib -Wl,-rpath,$(ACMLPATH)/ifort64/lib -lacml -lacml_mv -lifcoremt_pic -lirc -lm -lpthread -ldl else LIBACML = -L$(ACMLPATH)/ifort64_mp/lib -Wl,-rpath,$(ACMLPATH)/ifort64_mp/lib -lacml_mp -lacml_mv -lifcoremt_pic -liomp5 -lirc -lm -lpthread -ldl endif endif ifeq ($(F_COMPILER), OPEN64) ifndef SMP LIBACML = -L$(ACMLPATH)/open64/lib -Wl,-rpath,$(ACMLPATH)/open64/lib -lacml -lacml_mv -lm else LIBACML = -L$(ACMLPATH)/open64_mp/lib -Wl,-rpath,$(ACMLPATH)/open64_mp/lib -lacml_mp -lacml_mv -lm -lpthread endif endif ifeq ($(F_COMPILER), pgf77) ifndef SMP LIBACML = -L$(ACMLPATH)/pgi64/lib -Wl,-rpath,$(ACMLPATH)/pgi64/lib -lacml -lacml_mv -L$(PGIPATH) -Wl,-rpath,$(PGIPATH) -lpgftnrtl -lnspgc -lpgmp -lpgc else LIBACML = -L$(ACMLPATH)/pgi64_mp/lib -Wl,-rpath,$(ACMLPATH)/pgi64_mp/lib -lacml -lacml_mv -L$(PGIPATH) -Wl,-rpath,$(PGIPATH) -lpgftnrtl -lnspgc -lpgmp -lpgc endif endif ifeq ($(F_COMPILER), PATHSCALE) ifndef SMP LIBACML = -L$(ACMLPATH)/pathscale64/lib -Wl,-rpath,$(ACMLPATH)/pathscale64/lib -lacml -lacml_mv -Wl,-rpath,$(PATHSCALEPATH) -L$(PATHSCALEPATH) -lpathfortran -lm else LIBACML = -L$(ACMLPATH)/pathscale64_mp/lib -Wl,-rpath,$(ACMLPATH)/pathscale64_mp/lib -lacml_mp -lacml_mv -Wl,-rpath,$(PATHSCALEPATH) -L$(PATHSCALEPATH) -lopenmp -lpathfortran -lm endif endif ifeq ($(F_COMPILER), f90) LIBACML = -L$(ACMLPATH)/sun64/lib -Wl,-R,$(ACMLPATH)/sun64/lib -L$(SUNPATH)/lib/amd64 -Wl,-R,$(SUNPATH)/lib/amd64 -lacml -lacml_mv -lfsu endif LIBSUNPERF = -L$(SUNPATH)/lib/amd64 -L$(SUNPATH)/rtlibs/amd64 -Wl,-R,$(SUNPATH)/lib/amd64 -Wl,-R,$(SUNPATH)/rtlibs/amd64 -lsunperf -lfui -lfsu -lmtsk LIBVECLIB = /System/Library/Frameworks/vecLib.framework/Versions/Current/vecLib OpenBLAS-0.2.20/Makefile.zarch000066400000000000000000000001441313527062700157110ustar00rootroot00000000000000 ifeq ($(CORE), Z13) CCOMMON_OPT += -march=z13 -mzvector FCOMMON_OPT += -march=z13 -mzvector endif OpenBLAS-0.2.20/README.md000066400000000000000000000166721313527062700144370ustar00rootroot00000000000000# OpenBLAS [![Join the chat at https://gitter.im/xianyi/OpenBLAS](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.png?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS) AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) ## Introduction OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. Please read the documents on OpenBLAS wiki pages . ## Binary Packages We provide binary packages for the following platform. * Windows x86/x86_64 You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/). ## Installation from Source Download from project homepage. 
http://xianyi.github.com/OpenBLAS/

Or, check out the source code from git://github.com/xianyi/OpenBLAS.git

### Normal compile

* type "make" to detect the CPU automatically, or
* type "make TARGET=xxx" to set the target CPU explicitly, e.g. "make TARGET=NEHALEM". The full target list is in the file TargetList.txt.

### Cross compile

Set CC and FC to the cross toolchain compilers, set HOSTCC to your host C compiler, and finally set TARGET explicitly.

Examples:

On an x86 box, compile this library for a LOONGSON3A CPU:

make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A

On an x86 box, compile this library for a LOONGSON3A CPU with the loongcc compiler (based on Open64):

make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32

### Debug version

make DEBUG=1

### Compile with MASS support on Power CPU (optional dependency)

The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and Fortran-language applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER. The library can be installed as shown below:

* On Ubuntu: wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list
sudo apt-get update
sudo apt-get install libxlmass-devel.8.1.5
* On RHEL/CentOS: wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key
sudo rpm --import repomd.xml.key
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/
sudo yum install libxlmass-devel.8.1.5
After installing the MASS library, compile OpenBLAS with USE_MASS=1.

Example: compiling on POWER8 with MASS support:

make USE_MASS=1 TARGET=POWER8

### Install to the directory (optional)

Example:

make install PREFIX=your_installation_directory

The default directory is /opt/OpenBLAS.

## Support CPU & OS

Please read GotoBLAS_01Readme.txt

### Additional supported CPUs:

#### x86/x86-64:
- **Intel Xeon 56xx (Westmere)**: Uses the GotoBLAS2 Nehalem codes.
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
- **AMD Bobcat**: Uses the GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses the Bulldozer codes with some optimizations.
- **AMD STEAMROLLER**: Uses the Bulldozer codes with some optimizations.

#### MIPS64:
- **ICT Loongson 3A**: Optimized Level-3 BLAS and parts of Level-1 and Level-2.
- **ICT Loongson 3B**: Experimental

#### ARM:
- **ARMV6**: Optimized BLAS for vfpv2 and vfpv3-d16 (e.g. BCM2835, Cortex M0+)
- **ARMV7**: Optimized BLAS for vfpv3-d32 (e.g. Cortex A8, A9 and A15)

#### ARM64:
- **ARMV8**: Experimental
- **ARM Cortex-A57**: Experimental

#### IBM zEnterprise System:
- **Z13**: Optimized Level-3 BLAS

### Supported OS:
- **GNU/Linux**
- **MinGW or Visual Studio (CMake)/Windows**: Please read .
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are still beginners on Mac OS X.
- **FreeBSD**: Supported by the community. We did not test the library on this OS.
- **Android**: Supported by the community. Please read .

## Usages

Link with libopenblas.a, or use -lopenblas for the shared library.

### Set the number of threads with environment variables.

Examples:

export OPENBLAS_NUM_THREADS=4

or

export GOTO_NUM_THREADS=4

or

export OMP_NUM_THREADS=4

The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.

If you compile this library with USE_OPENMP=1, you should set the OMP_NUM_THREADS environment variable; OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS when USE_OPENMP=1.

### Set the number of threads at runtime.

We provide the following functions to control the number of threads at runtime:

void goto_set_num_threads(int num_threads);

void openblas_set_num_threads(int num_threads);

If you compile this library with USE_OPENMP=1, you should use the above functions as well.

## Report Bugs

Please open an issue at https://github.com/xianyi/OpenBLAS/issues

## Contact

* OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users
* OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev

## ChangeLog

Please see Changelog.txt for the differences from the GotoBLAS2 1.13 BSD version.

## Troubleshooting

* Please read the [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) first.
* Please use gcc version 4.6 and above to compile the Sandy Bridge AVX kernels on Linux/MinGW/BSD.
* Please use Clang version 3.1 and above to compile the library on the Sandy Bridge microarchitecture. Clang 3.0 generates incorrect AVX binary code.
* The number of CPUs/cores should be less than or equal to 256. On Linux x86_64 (amd64), there is experimental support for up to 1024 CPUs/cores and 128 NUMA nodes if you build the library with BIGNUMA=1.
* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting out the line NO_AFFINITY=1 in Makefile.rule.
But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). * On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. ## Contributing 1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug. 1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes. 1. Write a test which shows that the bug was fixed or that the feature works as expected. 1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`. ## Donation Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation). OpenBLAS-0.2.20/TargetList.txt000066400000000000000000000013461313527062700157730ustar00rootroot00000000000000Force Target Examples: make TARGET=NEHALEM make TARGET=LOONGSON3A BINARY=64 make TARGET=ISTANBUL Supported List: 1.X86/X86_64 a)Intel CPU: P2 KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS YONAH CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE HASWELL ATOM b)AMD CPU: ATHLON OPTERON OPTERON_SSE3 BARCELONA SHANGHAI ISTANBUL BOBCAT BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR ZEN c)VIA CPU: SSE_GENERIC VIAC3 NANO 2.Power CPU: POWER4 POWER5 POWER6 POWER7 POWER8 PPCG4 PPC970 PPC970MP PPC440 PPC440FP2 CELL 3.MIPS CPU: P5600 4.MIPS64 CPU: SICORTEX LOONGSON3A LOONGSON3B I6400 P6600 5.IA64 CPU: ITANIUM2 6.SPARC CPU: SPARC SPARCV7 7.ARM CPU: CORTEXA15 CORTEXA9 ARMV7 ARMV6 ARMV5 8.ARM 64-bit CPU: ARMV8 CORTEXA57 VULCAN THUNDERX THUNDERX2T99 OpenBLAS-0.2.20/USAGE.md000066400000000000000000000151601313527062700143350ustar00rootroot00000000000000# Notes on OpenBLAS usage ## Usage #### Program is Terminated. Because you tried to allocate too many memory regions In OpenBLAS, we mange a pool of memory buffers and allocate the number of buffers as the following. ``` #define NUM_BUFFERS (MAX_CPU_NUMBER * 2) ``` This error indicates that the program exceeded the number of buffers. Please build OpenBLAS with larger `NUM_THREADS`. For example, `make NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set `MAX_CPU_NUMBER=NUM_THREADS`. #### How can I use OpenBLAS in multi-threaded applications? If your application is already multi-threaded, it will conflict with OpenBLAS multi-threading. Thus, you must set OpenBLAS to use single thread in any of the following ways: * `export OPENBLAS_NUM_THREADS=1` in the environment variables. * Call `openblas_set_num_threads(1)` in the application on runtime. * Build OpenBLAS single thread version, e.g. `make USE_THREAD=0` If the application is parallelized by OpenMP, please use OpenBLAS built with `USE_OPENMP=1` #### How to choose TARGET manually at runtime when compiled with DYNAMIC_ARCH The environment variable which control the kernel selection is `OPENBLAS_CORETYPE` (see `driver/others/dynamic.c`) e.g. `export OPENBLAS_CORETYPE=Haswell` and the function `char* openblas_get_corename()` returns the used target. #### How could I disable OpenBLAS threading affinity on runtime? You can define the `OPENBLAS_MAIN_FREE` or `GOTOBLAS_MAIN_FREE` environment variable to disable threading affinity on runtime. For example, before the running, ``` export OPENBLAS_MAIN_FREE=1 ``` Alternatively, you can disable affinity feature with enabling `NO_AFFINITY=1` in `Makefile.rule`. 
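To make the runtime controls described above concrete, here is a minimal sketch that queries which kernel target a DYNAMIC_ARCH build selected and pins OpenBLAS to a single thread. It assumes the OpenBLAS extension prototypes (`openblas_get_corename()`, `openblas_set_num_threads()`) are available through `cblas.h`; the file name `query_openblas.c` is only an example.

```
/* query_openblas.c - minimal sketch (hypothetical file name).
 * Queries which kernel target a DYNAMIC_ARCH build selected and
 * forces single-threaded operation, as described above. */
#include <stdio.h>
#include <cblas.h>   /* declares the openblas_* extension functions */

int main(void) {
    /* Reports the core type chosen at load time; OPENBLAS_CORETYPE
     * set in the environment beforehand overrides the detection. */
    printf("OpenBLAS core type: %s\n", openblas_get_corename());

    /* Use a single BLAS thread, e.g. when the caller is already threaded. */
    openblas_set_num_threads(1);

    return 0;
}
```

Build it like the linking examples in the next section, e.g. `gcc query_openblas.c -I/opt/OpenBLAS/include -L/opt/OpenBLAS/lib -lopenblas -lpthread`, and run it as `OPENBLAS_CORETYPE=Haswell ./a.out` on a DYNAMIC_ARCH build to see the override take effect.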
## Linking with the library * Link with shared library `gcc -o test test.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas` If the library is multithreaded, please add `-lpthread`. If the library contains LAPACK functions, please add `-lgfortran` or other Fortran libs. * Link with static library `gcc -o test test.c /your/path/libopenblas.a` You can download `test.c` from https://gist.github.com/xianyi/5780018 On Linux, if OpenBLAS was compiled with threading support (`USE_THREAD=1` by default), custom programs statically linked against `libopenblas.a` should also link with the pthread library e.g.: ``` gcc -static -I/opt/OpenBLAS/include -L/opt/OpenBLAS/lib -o my_program my_program.c -lopenblas -lpthread ``` Failing to add the `-lpthread` flag will cause errors such as: ``` /opt/OpenBLAS/libopenblas.a(memory.o): In function `_touch_memory': memory.c:(.text+0x15): undefined reference to `pthread_mutex_lock' memory.c:(.text+0x41): undefined reference to `pthread_mutex_unlock' ... ``` ## Code examples #### Call CBLAS interface This example shows calling cblas_dgemm in C. https://gist.github.com/xianyi/6930656 ``` #include #include void main() { int i=0; double A[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0}; double B[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0}; double C[9] = {.5,.5,.5,.5,.5,.5,.5,.5,.5}; cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans,3,3,2,1,A, 3, B, 3,2,C,3); for(i=0; i<9; i++) printf("%lf ", C[i]); printf("\n"); } ``` `gcc -o test_cblas_open test_cblas_dgemm.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas -lpthread -lgfortran` #### Call BLAS Fortran interface This example shows calling dgemm Fortran interface in C. https://gist.github.com/xianyi/5780018 ``` #include "stdio.h" #include "stdlib.h" #include "sys/time.h" #include "time.h" extern void dgemm_(char*, char*, int*, int*,int*, double*, double*, int*, double*, int*, double*, double*, int*); int main(int argc, char* argv[]) { int i; printf("test!\n"); if(argc<4){ printf("Input Error\n"); return 1; } int m = atoi(argv[1]); int n = atoi(argv[2]); int k = atoi(argv[3]); int sizeofa = m * k; int sizeofb = k * n; int sizeofc = m * n; char ta = 'N'; char tb = 'N'; double alpha = 1.2; double beta = 0.001; struct timeval start,finish; double duration; double* A = (double*)malloc(sizeof(double) * sizeofa); double* B = (double*)malloc(sizeof(double) * sizeofb); double* C = (double*)malloc(sizeof(double) * sizeofc); srand((unsigned)time(NULL)); for (i=0; i ` ## Troubleshooting * Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first. * Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. * The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1. * OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). * On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. 
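As a complement to the code examples above, the Fortran `dgemm_` interface can also be exercised with a much smaller program than the timing example. The following is a minimal sketch, assuming only the `dgemm_` prototype shown earlier (column-major data, every argument passed by reference) and reusing the small matrices from the CBLAS example; the file name `test_dgemm_f77.c` is only an example.

```
/* test_dgemm_f77.c - minimal sketch (hypothetical file name) of calling
 * the Fortran BLAS symbol dgemm_ directly from C. */
#include <stdio.h>

/* Prototype as shown in the example above. */
extern void dgemm_(char *transa, char *transb, int *m, int *n, int *k,
                   double *alpha, double *A, int *lda, double *B, int *ldb,
                   double *beta, double *C, int *ldc);

int main(void) {
    int i;
    int m = 3, n = 3, k = 2;
    int lda = 3, ldb = 3, ldc = 3;
    char ta = 'N', tb = 'T';          /* C = alpha*A*B^T + beta*C */
    double alpha = 1.0, beta = 2.0;
    /* Column-major data, same values as the CBLAS example above. */
    double A[6] = {1.0, 2.0, 1.0, -3.0, 4.0, -1.0};
    double B[6] = {1.0, 2.0, 1.0, -3.0, 4.0, -1.0};
    double C[9] = {0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5};

    dgemm_(&ta, &tb, &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc);

    for (i = 0; i < 9; i++)
        printf("%lf ", C[i]);
    printf("\n");
    return 0;
}
```

Link it the same way as the static-library example, e.g. `gcc -o test_dgemm_f77 test_dgemm_f77.c /your/path/libopenblas.a -lpthread -lgfortran`. Note that a library built with `INTERFACE64=1` expects 64-bit integers in place of `int` here.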
## BLAS reference manual If you want to understand every BLAS function and definition, please read [Intel MKL reference manual](https://software.intel.com/sites/products/documentation/doclib/iss/2013/mkl/mklman/GUID-F7ED9FB8-6663-4F44-A62B-61B63C4F0491.htm) or [netlib.org](http://netlib.org/blas/) Here are [OpenBLAS extension functions](https://github.com/xianyi/OpenBLAS/wiki/OpenBLAS-Extensions) ## How to reference OpenBLAS. You can reference our [papers](https://github.com/xianyi/OpenBLAS/wiki/publications). Alternatively, you can cite the OpenBLAS homepage http://www.openblas.net directly. OpenBLAS-0.2.20/appveyor.yml000066400000000000000000000011461313527062700155360ustar00rootroot00000000000000version: 0.2.19.{build} #environment: platform: - x64 configuration: Release clone_folder: c:\projects\OpenBLAS init: - git config --global core.autocrlf input build: project: OpenBLAS.sln clone_depth: 5 #branches to build branches: only: - master - develop - cmake skip_tags: true matrix: fast_finish: true skip_commits: # Add [av skip] to commit messages message: /\[av skip\]/ before_build: - echo Running cmake... - cd c:\projects\OpenBLAS - cmake -G "Visual Studio 12 Win64" . test_script: - echo Running Test - cd c:\projects\OpenBLAS\utest - openblas_utest OpenBLAS-0.2.20/benchmark/000077500000000000000000000000001313527062700150765ustar00rootroot00000000000000OpenBLAS-0.2.20/benchmark/Make_exe.sh000077500000000000000000000001661313527062700171560ustar00rootroot00000000000000#!/bin/bash for f in *.goto *.acml *.mkl *.atlas do if [ -f "$f" ]; then mv $f `echo $f|tr '.' '_'`.exe fi done OpenBLAS-0.2.20/benchmark/Makefile000066400000000000000000002573441313527062700165550ustar00rootroot00000000000000TOPDIR = .. include $(TOPDIR)/Makefile.system # ACML standard #ACML=/opt/acml5.3.1/gfortran64_mp/lib #LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm # ACML custom #ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib #LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm # ACML 6.1 custom ACML=/home/saar/acml6.1/gfortran64_mp/lib LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm # Atlas Ubuntu #ATLAS=/usr/lib/atlas-base #LIBATLAS = -fopenmp $(ATLAS)/liblapack_atlas.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm # Atlas RHEL and Fedora ATLAS=/usr/lib64/atlas LIBATLAS = -fopenmp $(ATLAS)/liblapack.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm # Intel standard # MKL=/opt/intel/mkl/lib/intel64 # LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm # Intel custom MKL=/home/saar/intel_mkl LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm # Apple vecLib LIBVECLIB = -framework Accelerate ESSL=/opt/ibm/lib #LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a ifneq ($(NO_LAPACK), 1) GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ csymv.goto zsymv.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto else GOTO_LAPACK_TARGETS= endif ifeq ($(OSNAME), WINNT) goto :: slinpack.goto dlinpack.goto clinpack.goto 
zlinpack.goto \ scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto \ srot.goto drot.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \ sscal.goto dscal.goto cscal.goto zscal.goto \ sasum.goto dasum.goto casum.goto zasum.goto \ ssymv.goto dsymv.goto csymv.goto zsymv.goto \ chemv.goto zhemv.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ sdot.acml ddot.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ scopy.acml dcopy.acml ccopy.acml zcopy.acml \ sswap.acml dswap.acml cswap.acml zswap.acml \ sscal.acml dscal.acml cscal.acml zscal.acml \ sasum.acml dasum.acml casum.acml zasum.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemv.acml zhemv.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ ssymm.acml dsymm.acml csymm.acml zsymm.acml atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ sdot.atlas ddot.atlas \ saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ sasum.atlas dasum.atlas casum.atlas zasum.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemv.atlas zhemv.atlas \ chemm.acml zhemm.acml \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl 
ctrsm.mkl ztrsm.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ sdot.mkl ddot.mkl \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ sasum.mkl dasum.mkl casum.mkl zasum.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemv.mkl zhemv.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl else goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto cdot.goto zdot.goto \ srot.goto drot.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \ sscal.goto dscal.goto cscal.goto zscal.goto \ sasum.goto dasum.goto casum.goto zasum.goto \ ssymv.goto dsymv.goto \ chemv.goto zhemv.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ smallscaling \ isamax.goto idamax.goto icamax.goto izamax.goto \ snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ sdot.acml ddot.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ scopy.acml dcopy.acml ccopy.acml zcopy.acml \ sswap.acml dswap.acml cswap.acml zswap.acml \ sscal.acml dscal.acml cscal.acml zscal.acml \ sasum.acml dasum.acml casum.acml zasum.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemv.acml zhemv.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ ssymm.acml dsymm.acml csymm.acml zsymm.acml atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ sdot.atlas ddot.atlas \ saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ sasum.atlas dasum.atlas casum.atlas zasum.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemv.atlas 
zhemv.atlas \ chemm.acml zhemm.acml \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ isamax.atlas idamax.atlas icamax.atlas izamax.atlas \ snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ sasum.mkl dasum.mkl casum.mkl zasum.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemv.mkl zhemv.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl endif essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \ scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl \ strsm.essl dtrsm.essl ctrsm.essl ztrsm.essl veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ sgemm.veclib dgemm.veclib cgemm.veclib zgemm.veclib \ strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ sger.veclib dger.veclib cger.veclib zger.veclib \ sdot.veclib ddot.veclib cdot.veclib zdot.veclib \ saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ scopy.veclib dcopy.veclib ccopy.veclib zcopy.veclib \ sswap.veclib dswap.veclib cswap.veclib zswap.veclib \ sscal.veclib dscal.veclib cscal.veclib zscal.veclib \ sasum.veclib dasum.veclib casum.veclib zasum.veclib \ ssymv.veclib dsymv.veclib csymv.veclib zsymv.veclib \ chemv.veclib zhemv.veclib \ chemm.veclib zhemm.veclib \ cherk.veclib zherk.veclib \ cher2k.veclib zher2k.veclib \ sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ spotrf.veclib dpotrf.veclib cpotrf.veclib zpotrf.veclib \ ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib goto_3m :: cgemm3m.goto zgemm3m.goto mkl_3m :: cgemm3m.mkl zgemm3m.mkl all :: goto mkl atlas acml veclib exe : @./Make_exe.sh ##################################### Slinpack #################################################### slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm slinpack.acml : slinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) 
slinpack.atlas : slinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) slinpack.mkl : slinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) slinpack.veclib : slinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) slinpack.essl : slinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Dlinpack #################################################### dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dlinpack.acml : dlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dlinpack.atlas : dlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dlinpack.mkl : dlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dlinpack.veclib : dlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dlinpack.essl : dlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Clinpack #################################################### clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm clinpack.acml : clinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) clinpack.atlas : clinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) clinpack.mkl : clinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) clinpack.veclib : clinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) clinpack.essl : clinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zlinpack #################################################### zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zlinpack.acml : zlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zlinpack.atlas : zlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zlinpack.mkl : zlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zlinpack.veclib : zlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zlinpack.essl : zlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Scholesky ################################################### scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm scholesky.acml : scholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) scholesky.atlas : scholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) scholesky.mkl : scholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) scholesky.veclib : scholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) scholesky.essl : scholesky.$(SUFFIX) -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Dcholesky ################################################### dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dcholesky.acml : dcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dcholesky.atlas : dcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dcholesky.mkl : dcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dcholesky.veclib : dcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dcholesky.essl : dcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ccholesky ################################################### ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ccholesky.acml : ccholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ccholesky.atlas : ccholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ccholesky.mkl : ccholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ccholesky.veclib : ccholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ccholesky.essl : ccholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zcholesky ################################################### zcholesky.goto : zcholesky.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zcholesky.acml : zcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zcholesky.atlas : zcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zcholesky.mkl : zcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zcholesky.veclib : zcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zcholesky.essl : zcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sgemm #################################################### sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgemm.acml : sgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sgemm.atlas : sgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sgemm.mkl : sgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sgemm.veclib : sgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sgemm.essl : sgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Dgemm #################################################### dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dgemm.acml : dgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dgemm.atlas : 
dgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dgemm.mkl : dgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dgemm.veclib : dgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dgemm.essl : dgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Cgemm #################################################### cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgemm.acml : cgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cgemm.atlas : cgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cgemm.mkl : cgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cgemm.veclib : cgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cgemm.essl : cgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zgemm #################################################### zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgemm.acml : zgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zgemm.atlas : zgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zgemm.mkl : zgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zgemm.veclib : zgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zgemm.essl : zgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ssymm #################################################### ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ssymm.acml : ssymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ssymm.atlas : ssymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ssymm.mkl : ssymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ssymm.veclib : ssymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Dsymm #################################################### dsymm.goto : dsymm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dsymm.acml : dsymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dsymm.atlas : dsymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dsymm.mkl : dsymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dsymm.veclib : dsymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Csymm #################################################### csymm.goto : csymm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm csymm.acml : csymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) csymm.atlas : csymm.$(SUFFIX) -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) csymm.mkl : csymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) csymm.veclib : csymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zsymm #################################################### zsymm.goto : zsymm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zsymm.acml : zsymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zsymm.atlas : zsymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zsymm.mkl : zsymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zsymm.veclib : zsymm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Strmm #################################################### strmm.goto : strmm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm strmm.acml : strmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) strmm.atlas : strmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) strmm.mkl : strmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) strmm.veclib : strmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) strmm.essl : strmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Dtrmm #################################################### dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dtrmm.acml : dtrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dtrmm.atlas : dtrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dtrmm.mkl : dtrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dtrmm.veclib : dtrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dtrmm.essl : dtrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ctrmm #################################################### ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ctrmm.acml : ctrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ctrmm.atlas : ctrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ctrmm.mkl : ctrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ctrmm.veclib : ctrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ctrmm.essl : ctrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ztrmm #################################################### ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ztrmm.acml : ztrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ztrmm.atlas : ztrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ztrmm.mkl : ztrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ztrmm.veclib : ztrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ztrmm.essl : ztrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Strsm #################################################### strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm strsm.acml : strsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) strsm.atlas : strsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) strsm.mkl : strsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) strsm.veclib : strsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) strsm.essl : strsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Dtrsm #################################################### dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dtrsm.acml : dtrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dtrsm.atlas : dtrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dtrsm.mkl : dtrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dtrsm.veclib : dtrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dtrsm.essl : dtrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ctrsm #################################################### ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ctrsm.acml : ctrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ctrsm.atlas : ctrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ctrsm.mkl : ctrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ctrsm.veclib : ctrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ctrsm.essl : ctrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ztrsm #################################################### ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ztrsm.acml : ztrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ztrsm.atlas : ztrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ztrsm.mkl : ztrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ztrsm.veclib : ztrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ztrsm.essl : ztrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ssyrk #################################################### ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) -lm ssyrk.acml : ssyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ssyrk.atlas : ssyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ssyrk.mkl : ssyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ssyrk.veclib : ssyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Dsyrk #################################################### dsyrk.goto : dsyrk.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dsyrk.acml : dsyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dsyrk.atlas : dsyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dsyrk.mkl : dsyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dsyrk.veclib : dsyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Csyrk #################################################### csyrk.goto : csyrk.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm csyrk.acml : csyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) csyrk.atlas : csyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) csyrk.mkl : csyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) csyrk.veclib : csyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zsyrk #################################################### zsyrk.goto : zsyrk.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zsyrk.acml : zsyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zsyrk.atlas : zsyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zsyrk.mkl : zsyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zsyrk.veclib : zsyrk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ssyr2k #################################################### ssyr2k.goto : ssyr2k.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ssyr2k.acml : ssyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ssyr2k.atlas : ssyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ssyr2k.mkl : ssyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ssyr2k.veclib : ssyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Dsyr2k #################################################### dsyr2k.goto : dsyr2k.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dsyr2k.acml : dsyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dsyr2k.atlas : dsyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dsyr2k.mkl : dsyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) dsyr2k.veclib : dsyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Csyr2k #################################################### csyr2k.goto : csyr2k.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm csyr2k.acml : csyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) csyr2k.atlas : csyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) csyr2k.mkl : csyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) csyr2k.veclib : csyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zsyr2k #################################################### zsyr2k.goto : zsyr2k.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zsyr2k.acml : zsyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zsyr2k.atlas : zsyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zsyr2k.mkl : zsyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zsyr2k.veclib : zsyr2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Chemm #################################################### chemm.goto : chemm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm chemm.acml : chemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) chemm.atlas : chemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) chemm.mkl : chemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) chemm.veclib : chemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zhemm #################################################### zhemm.goto : zhemm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zhemm.acml : zhemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zhemm.atlas : zhemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zhemm.mkl : zhemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zhemm.veclib : zhemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Cherk #################################################### cherk.goto : cherk.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cherk.acml : cherk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cherk.atlas : cherk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cherk.mkl : cherk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cherk.veclib : cherk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zherk #################################################### zherk.goto : zherk.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) -lm zherk.acml : zherk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zherk.atlas : zherk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zherk.mkl : zherk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zherk.veclib : zherk.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Cher2k #################################################### cher2k.goto : cher2k.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cher2k.acml : cher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cher2k.atlas : cher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cher2k.mkl : cher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cher2k.veclib : cher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zher2k #################################################### zher2k.goto : zher2k.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zher2k.acml : zher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zher2k.atlas : zher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zher2k.mkl : zher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zher2k.veclib : zher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sgemv #################################################### sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgemv.acml : sgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sgemv.atlas : sgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sgemv.mkl : sgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sgemv.veclib : sgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Dgemv #################################################### dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dgemv.acml : dgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dgemv.atlas : dgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dgemv.mkl : dgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dgemv.veclib : dgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Cgemv #################################################### cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgemv.acml : cgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cgemv.atlas : cgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cgemv.mkl : cgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cgemv.veclib : cgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zgemv #################################################### zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgemv.acml : zgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zgemv.atlas : zgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zgemv.mkl : zgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zgemv.veclib : zgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sger #################################################### sger.goto : sger.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sger.acml : sger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sger.atlas : sger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sger.mkl : sger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sger.veclib : sger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Dger #################################################### dger.goto : dger.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dger.acml : dger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dger.atlas : dger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dger.mkl : dger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dger.veclib : dger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Cger #################################################### cger.goto : cger.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cger.acml : cger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cger.atlas : cger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cger.mkl : cger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cger.veclib : cger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zger #################################################### zger.goto : zger.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zger.acml : zger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zger.atlas : zger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zger.mkl : zger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zger.veclib : zger.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ssymv #################################################### ssymv.goto : ssymv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ssymv.acml : ssymv.$(SUFFIX) 
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ssymv.atlas : ssymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ssymv.mkl : ssymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ssymv.veclib : ssymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Dsymv #################################################### dsymv.goto : dsymv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dsymv.acml : dsymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dsymv.atlas : dsymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dsymv.mkl : dsymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dsymv.veclib : dsymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Csymv #################################################### csymv.goto : csymv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm csymv.acml : csymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) csymv.atlas : csymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) csymv.mkl : csymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) csymv.veclib : csymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Dsymv #################################################### zsymv.goto : zsymv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zsymv.acml : zsymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zsymv.atlas : zsymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zsymv.mkl : zsymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zsymv.veclib : zsymv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sgeev #################################################### sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgeev.acml : sgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sgeev.atlas : sgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sgeev.mkl : sgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sgeev.veclib : sgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Dgeev #################################################### dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dgeev.acml : dgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dgeev.atlas : dgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dgeev.mkl : dgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dgeev.veclib : dgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o 
$(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Cgeev #################################################### cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgeev.acml : cgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cgeev.atlas : cgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cgeev.mkl : cgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cgeev.veclib : cgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zgeev #################################################### zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgeev.acml : zgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zgeev.atlas : zgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zgeev.mkl : zgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zgeev.veclib : zgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sgetri #################################################### sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgetri.acml : sgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sgetri.atlas : sgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sgetri.mkl : sgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sgetri.veclib : sgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Dgetri #################################################### dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dgetri.acml : dgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dgetri.atlas : dgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dgetri.mkl : dgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dgetri.veclib : dgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Cgetri #################################################### cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgetri.acml : cgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cgetri.atlas : cgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cgetri.mkl : cgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cgetri.veclib : cgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zgetri #################################################### zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgetri.acml : zgetri.$(SUFFIX) -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zgetri.atlas : zgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zgetri.mkl : zgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zgetri.veclib : zgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Spotrf #################################################### spotrf.goto : spotrf.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm spotrf.acml : spotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) spotrf.atlas : spotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) spotrf.mkl : spotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) spotrf.veclib : spotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Dpotrf #################################################### dpotrf.goto : dpotrf.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dpotrf.acml : dpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dpotrf.atlas : dpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dpotrf.mkl : dpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dpotrf.veclib : dpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Cpotrf #################################################### cpotrf.goto : cpotrf.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cpotrf.acml : cpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cpotrf.atlas : cpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cpotrf.mkl : cpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cpotrf.veclib : cpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zpotrf #################################################### zpotrf.goto : zpotrf.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zpotrf.acml : zpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zpotrf.atlas : zpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zpotrf.mkl : zpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zpotrf.veclib : zpotrf.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Chemv #################################################### chemv.goto : chemv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm chemv.acml : chemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) chemv.atlas : chemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) chemv.mkl : chemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) 
chemv.veclib : chemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zhemv #################################################### zhemv.goto : zhemv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zhemv.acml : zhemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zhemv.atlas : zhemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zhemv.mkl : zhemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zhemv.veclib : zhemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sdot #################################################### sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sdot.acml : sdot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sdot.atlas : sdot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sdot.mkl : sdot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sdot.veclib : sdot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ddot #################################################### ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ddot.acml : ddot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ddot.atlas : ddot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ddot.mkl : ddot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ddot.veclib : ddot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Cdot #################################################### cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cdot.acml : cdot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cdot.atlas : cdot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cdot.mkl : cdot-intel.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cdot.veclib : cdot-intel.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zdot #################################################### zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zdot.acml : zdot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zdot.atlas : zdot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zdot.mkl : zdot-intel.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zdot.veclib : zdot-intel.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Srot #################################################### srot.goto : srot.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm srot.acml : srot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) srot.atlas : srot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) srot.mkl : srot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) srot.veclib : srot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Drot #################################################### drot.goto : drot.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm drot.acml : drot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) drot.atlas : drot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) drot.mkl : drot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) drot.veclib : drot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Saxpy #################################################### saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm saxpy.acml : saxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) saxpy.atlas : saxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) saxpy.mkl : saxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) saxpy.veclib : saxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Daxpy #################################################### daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm daxpy.acml : daxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) daxpy.atlas : daxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) daxpy.mkl : daxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) daxpy.veclib : daxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Caxpy #################################################### caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm caxpy.acml : caxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) caxpy.atlas : caxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) caxpy.mkl : caxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) caxpy.veclib : caxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zaxpy #################################################### zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zaxpy.acml : zaxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zaxpy.atlas : zaxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zaxpy.mkl : zaxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zaxpy.veclib : zaxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) ##################################### Scopy #################################################### scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm scopy.acml : scopy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) scopy.atlas : scopy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) scopy.mkl : scopy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) scopy.veclib : scopy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Dcopy #################################################### dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dcopy.acml : dcopy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dcopy.atlas : dcopy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dcopy.mkl : dcopy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dcopy.veclib : dcopy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ccopy #################################################### ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ccopy.acml : ccopy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ccopy.atlas : ccopy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ccopy.mkl : ccopy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ccopy.veclib : ccopy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zcopy #################################################### zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zcopy.acml : zcopy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zcopy.atlas : zcopy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zcopy.mkl : zcopy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zcopy.veclib : zcopy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sscal #################################################### sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sscal.acml : sscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sscal.atlas : sscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sscal.mkl : sscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sscal.veclib : sscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Dscal #################################################### dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dscal.acml : dscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dscal.atlas : 
dscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dscal.mkl : dscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dscal.veclib : dscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Cscal #################################################### cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cscal.acml : cscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cscal.atlas : cscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cscal.mkl : cscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cscal.veclib : cscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zscal #################################################### zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zscal.acml : zscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zscal.atlas : zscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zscal.mkl : zscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zscal.veclib : zscal.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sasum #################################################### sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sasum.acml : sasum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sasum.atlas : sasum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sasum.mkl : sasum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sasum.veclib : sasum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Dasum #################################################### dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dasum.acml : dasum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dasum.atlas : dasum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dasum.mkl : dasum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dasum.veclib : dasum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Casum #################################################### casum.goto : casum.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm casum.acml : casum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) casum.atlas : casum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) casum.mkl : casum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) casum.veclib : casum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) 
##################################### Zasum #################################################### zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zasum.acml : zasum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zasum.atlas : zasum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zasum.mkl : zasum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zasum.veclib : zasum.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sswap #################################################### sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sswap.acml : sswap.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sswap.atlas : sswap.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sswap.mkl : sswap.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sswap.veclib : sswap.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Dswap #################################################### dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dswap.acml : dswap.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dswap.atlas : dswap.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dswap.mkl : dswap.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dswap.veclib : dswap.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Cswap #################################################### cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cswap.acml : cswap.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cswap.atlas : cswap.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cswap.mkl : cswap.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cswap.veclib : cswap.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zswap #################################################### zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zswap.acml : zswap.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zswap.atlas : zswap.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zswap.mkl : zswap.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zswap.veclib : zswap.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sgesv #################################################### sgesv.goto : sgesv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgesv.acml : sgesv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sgesv.atlas : 
sgesv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sgesv.mkl : sgesv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) sgesv.veclib : sgesv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Dgesv #################################################### dgesv.goto : dgesv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dgesv.acml : dgesv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dgesv.atlas : dgesv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dgesv.mkl : dgesv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) dgesv.veclib : dgesv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Cgesv #################################################### cgesv.goto : cgesv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgesv.acml : cgesv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cgesv.atlas : cgesv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cgesv.mkl : cgesv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cgesv.veclib : cgesv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zgesv #################################################### zgesv.goto : zgesv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgesv.acml : zgesv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zgesv.atlas : zgesv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zgesv.mkl : zgesv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zgesv.veclib : zgesv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Cgemm3m #################################################### cgemm3m.goto : cgemm3m.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgemm3m.mkl : cgemm3m.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) cgemm3m.veclib : cgemm3m.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Zgemm3m #################################################### zgemm3m.goto : zgemm3m.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgemm3m.mkl : zgemm3m.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) zgemm3m.veclib : zgemm3m.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ############################################## ISAMAX ############################################## isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm isamax.atlas : isamax.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ############################################## IDAMAX ############################################## 
idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm idamax.atlas : idamax.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ############################################## ICAMAX ############################################## icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm icamax.atlas : icamax.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ############################################## IZAMAX ############################################## izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm izamax.atlas : izamax.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ############################################## SNRM2 ############################################## snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm snrm2.atlas : snrm2.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ############################################## DNRM2 ############################################## dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dnrm2.atlas : dnrm2.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ############################################## Sscnrm2 ############################################## scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm scnrm2.atlas : scnrm2.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ############################################## Ddznrm2 ############################################## dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dznrm2.atlas : dznrm2.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ################################################################################################### slinpack.$(SUFFIX) : linpack.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ dlinpack.$(SUFFIX) : linpack.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ clinpack.$(SUFFIX) : linpack.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zlinpack.$(SUFFIX) : linpack.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ scholesky.$(SUFFIX) : cholesky.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ dcholesky.$(SUFFIX) : cholesky.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ ccholesky.$(SUFFIX) : cholesky.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zcholesky.$(SUFFIX) : cholesky.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ sgemm.$(SUFFIX) : gemm.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ dgemm.$(SUFFIX) : gemm.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ cgemm.$(SUFFIX) : gemm.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zgemm.$(SUFFIX) : gemm.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ ssymm.$(SUFFIX) : symm.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ dsymm.$(SUFFIX) : symm.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ csymm.$(SUFFIX) : symm.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zsymm.$(SUFFIX) : symm.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ strmm.$(SUFFIX) : 
trmm.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ dtrmm.$(SUFFIX) : trmm.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ ctrmm.$(SUFFIX) : trmm.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ ztrmm.$(SUFFIX) : trmm.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ strsm.$(SUFFIX) : trsm.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ dtrsm.$(SUFFIX) : trsm.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ ctrsm.$(SUFFIX) : trsm.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ ztrsm.$(SUFFIX) : trsm.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ ssyrk.$(SUFFIX) : syrk.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ dsyrk.$(SUFFIX) : syrk.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ csyrk.$(SUFFIX) : syrk.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zsyrk.$(SUFFIX) : syrk.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ ssyr2k.$(SUFFIX) : syr2k.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ dsyr2k.$(SUFFIX) : syr2k.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ csyr2k.$(SUFFIX) : syr2k.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zsyr2k.$(SUFFIX) : syr2k.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ chemm.$(SUFFIX) : hemm.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zhemm.$(SUFFIX) : hemm.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ cherk.$(SUFFIX) : herk.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zherk.$(SUFFIX) : herk.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ cher2k.$(SUFFIX) : her2k.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zher2k.$(SUFFIX) : her2k.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ sgemv.$(SUFFIX) : gemv.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ dgemv.$(SUFFIX) : gemv.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ cgemv.$(SUFFIX) : gemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zgemv.$(SUFFIX) : gemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ sger.$(SUFFIX) : ger.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ dger.$(SUFFIX) : ger.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ cger.$(SUFFIX) : ger.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zger.$(SUFFIX) : ger.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ ssymv.$(SUFFIX) : symv.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ dsymv.$(SUFFIX) : symv.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ csymv.$(SUFFIX) : symv.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zsymv.$(SUFFIX) : symv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ sgeev.$(SUFFIX) : geev.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ dgeev.$(SUFFIX) : geev.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ cgeev.$(SUFFIX) : geev.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zgeev.$(SUFFIX) : geev.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ sgetri.$(SUFFIX) : getri.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ dgetri.$(SUFFIX) : getri.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ cgetri.$(SUFFIX) : getri.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zgetri.$(SUFFIX) : getri.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ spotrf.$(SUFFIX) : potrf.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ dpotrf.$(SUFFIX) : potrf.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ cpotrf.$(SUFFIX) : potrf.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zpotrf.$(SUFFIX) : potrf.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ chemv.$(SUFFIX) : hemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zhemv.$(SUFFIX) : hemv.c 
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ sdot.$(SUFFIX) : dot.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ ddot.$(SUFFIX) : dot.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ cdot.$(SUFFIX) : zdot.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zdot.$(SUFFIX) : zdot.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ cdot-intel.$(SUFFIX) : zdot-intel.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zdot-intel.$(SUFFIX) : zdot-intel.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ saxpy.$(SUFFIX) : axpy.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ daxpy.$(SUFFIX) : axpy.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ caxpy.$(SUFFIX) : axpy.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zaxpy.$(SUFFIX) : axpy.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ scopy.$(SUFFIX) : copy.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ dcopy.$(SUFFIX) : copy.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ ccopy.$(SUFFIX) : copy.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zcopy.$(SUFFIX) : copy.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ sswap.$(SUFFIX) : swap.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ dswap.$(SUFFIX) : swap.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ cswap.$(SUFFIX) : swap.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zswap.$(SUFFIX) : swap.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ sscal.$(SUFFIX) : scal.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ dscal.$(SUFFIX) : scal.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ cscal.$(SUFFIX) : scal.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zscal.$(SUFFIX) : scal.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ sasum.$(SUFFIX) : asum.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ dasum.$(SUFFIX) : asum.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ casum.$(SUFFIX) : asum.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zasum.$(SUFFIX) : asum.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ sgesv.$(SUFFIX) : gesv.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ dgesv.$(SUFFIX) : gesv.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ cgesv.$(SUFFIX) : gesv.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zgesv.$(SUFFIX) : gesv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ srot.$(SUFFIX) : rot.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ drot.$(SUFFIX) : rot.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ cgemm3m.$(SUFFIX) : gemm3m.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ zgemm3m.$(SUFFIX) : gemm3m.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ isamax.$(SUFFIX) : iamax.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ idamax.$(SUFFIX) : iamax.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ icamax.$(SUFFIX) : iamax.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ izamax.$(SUFFIX) : iamax.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ snrm2.$(SUFFIX) : nrm2.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ dnrm2.$(SUFFIX) : nrm2.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ scnrm2.$(SUFFIX) : nrm2.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ dznrm2.$(SUFFIX) : nrm2.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ smallscaling: smallscaling.c ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread clean :: @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling include $(TOPDIR)/Makefile.tail 
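Every benchmark built by the rules above follows the same measurement recipe: fill the operands with random data, wall-clock the call with gettimeofday(), and report flops/time as MFlops, with the flop count scaled by COMPSIZE*COMPSIZE for complex data where appropriate. Below is a minimal, self-contained sketch of that pattern for dasum, deliberately simpler than the sources that follow; the Fortran-style symbol name dasum_, the link line, and the one-flop-per-element count are assumptions of the sketch, not something the Makefile fixes.

/* Minimal sketch of the timing pattern used by the benchmark sources below.
 * Assumes a Fortran-style dasum_ symbol; link against the built library
 * (the same one the .goto targets link). */
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

extern double dasum_(const int *n, const double *x, const int *incx);

int main(void) {
  int n = 1000000, inc = 1;
  double *x = malloc(sizeof(double) * (size_t)n);
  if (x == NULL) { fprintf(stderr, "Out of Memory!!\n"); return 1; }
  for (int i = 0; i < n; i++) x[i] = (double)rand() / RAND_MAX - 0.5;

  struct timeval start, stop;
  gettimeofday(&start, NULL);               /* wall-clock one call */
  double s = dasum_(&n, x, &inc);
  gettimeofday(&stop, NULL);

  double t = (double)(stop.tv_sec - start.tv_sec)
           + (double)(stop.tv_usec - start.tv_usec) * 1.e-6;
  /* count one flop per element for the |x| accumulation */
  printf("sum = %g  %10.2f MFlops\n", s, (double)n / t * 1.e-6);
  free(x);
  return 0;
}

The real benchmarks extend this with the from/to/step size sweep, the OPENBLAS_* environment switches, and averaging over OPENBLAS_LOOPS repetitions per size.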
OpenBLAS-0.2.20/benchmark/asum.c000066400000000000000000000116351313527062700162150ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef ASUM #ifdef COMPLEX #ifdef DOUBLE #define ASUM BLASFUNC(dzasum) #else #define ASUM BLASFUNC(scasum) #endif #else #ifdef DOUBLE #define ASUM BLASFUNC(dasum) #else #define ASUM BLASFUNC(sasum) #endif #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *x; FLOAT result; blasint m, i; blasint inc_x=1; int loops = 1; int l; char *p; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1,timeg; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} 
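/* Command-line convention shared by these benchmarks: an optional
 * "from to step" triple selects the problem sizes to sweep, with
 * defaults of 1 200 1; the OPENBLAS_LOOPS / OPENBLAS_INCX environment
 * variables read just below choose the repetition count per size and
 * the vector stride. */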
if (argc > 0) { step = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { timeg=0; fprintf(stderr, " %6d : ", (int)m); for (l=0; l #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef AXPY #ifdef COMPLEX #ifdef DOUBLE #define AXPY BLASFUNC(zaxpy) #else #define AXPY BLASFUNC(caxpy) #endif #else #ifdef DOUBLE #define AXPY BLASFUNC(daxpy) #else #define AXPY BLASFUNC(saxpy) #endif #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *x, *y; FLOAT alpha[2] = { 2.0, 2.0 }; blasint m, i; blasint inc_x=1,inc_y=1; int loops = 1; int l; char *p; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1,timeg; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { timeg=0; fprintf(stderr, " %6d : ", (int)m); for (l=0; l #include #ifdef __CYGWIN32__ #include #endif #include "common.h" double fabs(double); #undef POTRF #ifndef COMPLEX #ifdef XDOUBLE #define POTRF BLASFUNC(qpotrf) #define SYRK BLASFUNC(qsyrk) #elif defined(DOUBLE) #define POTRF BLASFUNC(dpotrf) #define SYRK BLASFUNC(dsyrk) #else #define POTRF BLASFUNC(spotrf) #define SYRK BLASFUNC(ssyrk) #endif #else #ifdef XDOUBLE #define POTRF BLASFUNC(xpotrf) #define SYRK BLASFUNC(xherk) 
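/* SYRK/HERK is defined here because the driver below uses it to build the
 * test matrix: a random triangular factor a is multiplied into
 * b = a^T*a or a*a^T (conjugate-transposed in the complex case), which is
 * symmetric/Hermitian positive definite by construction, so POTRF must
 * succeed and its factor can be compared element-wise against a;
 * getmflops() then charges roughly the textbook n^3/3 Cholesky operation
 * count, scaled for complex arithmetic. */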
#elif defined(DOUBLE) #define POTRF BLASFUNC(zpotrf) #define SYRK BLASFUNC(zherk) #else #define POTRF BLASFUNC(cpotrf) #define SYRK BLASFUNC(cherk) #endif #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif static __inline double getmflops(int ratio, int m, double secs){ double mm = (double)m; double mulflops, addflops; if (secs==0.) return 0.; mulflops = mm * (1./3. + mm * (1./2. + mm * 1./6.)); addflops = 1./6. * mm * (mm * mm - 1); if (ratio == 1) { return (mulflops + addflops) / secs * 1.e-6; } else { return (2. * mulflops + 6. * addflops) / secs * 1.e-6; } } int main(int argc, char *argv[]){ #ifndef COMPLEX char *trans[] = {"T", "N"}; #else char *trans[] = {"C", "N"}; #endif char *uplo[] = {"U", "L"}; FLOAT alpha[] = {1.0, 0.0}; FLOAT beta [] = {0.0, 0.0}; FLOAT *a, *b; blasint m, i, j, info, uplos; int from = 1; int to = 200; int step = 1; FLOAT maxerr; struct timeval start, stop; double time1; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } for(m = from; m <= to; m += step){ fprintf(stderr, "M = %6d : ", (int)m); for (uplos = 0; uplos < 2; uplos ++) { #ifndef COMPLEX if (uplos & 1) { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) a[i + j * m] = 0.; a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.; for(i = j + 1; i < m; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5; } } else { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5; a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.; for(i = j + 1; i < m; i++) a[i + j * m] = 0.; } } #else if (uplos & 1) { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) { a[(i + j * m) * 2 + 0] = 0.; a[(i + j * m) * 2 + 1] = 0.; } a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; a[(j + j * m) * 2 + 1] = 0.; for(i = j + 1; i < m; i++) { a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; } } } else { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) { a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; } a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; a[(j + j * m) * 2 + 1] = 0.; for(i = j + 1; i < m; i++) { a[(i + j * m) * 2 + 0] = 0.; a[(i + j * m) * 2 + 1] = 0.; } } } #endif SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m); gettimeofday( &start, (struct timezone *)0); POTRF(uplo[uplos], &m, b, &m, &info); gettimeofday( &stop, (struct timezone *)0); if (info != 
0) { fprintf(stderr, "Info = %d\n", info); exit(1); } time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; maxerr = 0.; if (!(uplos & 1)) { for (j = 0; j < m; j++) { for(i = 0; i <= j; i++) { #ifndef COMPLEX if (maxerr < fabs(a[i + j * m] - b[i + j * m])) maxerr = fabs(a[i + j * m] - b[i + j * m]); #else if (maxerr < fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0])) maxerr = fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0]); if (maxerr < fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1])) maxerr = fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1]); #endif } } } else { for (j = 0; j < m; j++) { for(i = j; i < m; i++) { #ifndef COMPLEX if (maxerr < fabs(a[i + j * m] - b[i + j * m])) maxerr = fabs(a[i + j * m] - b[i + j * m]); #else if (maxerr < fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0])) maxerr = fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0]); if (maxerr < fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1])) maxerr = fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1]); #endif } } } fprintf(stderr, #ifdef XDOUBLE " %Le %10.3f MFlops", maxerr, #else " %e %10.3f MFlops", maxerr, #endif getmflops(COMPSIZE * COMPSIZE, m, time1)); if (maxerr > 1.e-3) { fprintf(stderr, "Hmm, probably it has bug.\n"); exit(1); } } fprintf(stderr, "\n"); } return 0; } // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); OpenBLAS-0.2.20/benchmark/copy.c000066400000000000000000000122511313527062700162150ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef COPY #ifdef COMPLEX #ifdef DOUBLE #define COPY BLASFUNC(zcopy) #else #define COPY BLASFUNC(ccopy) #endif #else #ifdef DOUBLE #define COPY BLASFUNC(dcopy) #else #define COPY BLASFUNC(scopy) #endif #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *x, *y; FLOAT alpha[2] = { 2.0, 2.0 }; blasint m, i; blasint inc_x=1,inc_y=1; int loops = 1; int l; char *p; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1,timeg; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { timeg=0; fprintf(stderr, " %6d : ", (int)m); for (l=0; l #include "culapack.h" static int initialized = 0; int sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, int *info) { if (!initialized) { culaInitialize(); initialized = 1; } *info = culaSgetrf(*m, *m, a, *lda, ipiv); return 0; } int cgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, int *info) { if (!initialized) { culaInitialize(); initialized = 1; } *info = culaCgetrf(*m, *m, (culaFloatComplex *)a, *lda, ipiv); return 0; } OpenBLAS-0.2.20/benchmark/dot.c000066400000000000000000000120431313527062700160300ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef DOT #ifdef DOUBLE #define DOT BLASFUNC(ddot) #else #define DOT BLASFUNC(sdot) #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *x, *y; FLOAT result; blasint m, i; blasint inc_x=1,inc_y=1; int loops = 1; int l; char *p; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1,timeg; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * 
abs(inc_y) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { timeg=0; fprintf(stderr, " %6d : ", (int)m); for (l=0; l #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef GEEV #ifndef COMPLEX #ifdef XDOUBLE #define GEEV BLASFUNC(qgeev) #elif defined(DOUBLE) #define GEEV BLASFUNC(dgeev) #else #define GEEV BLASFUNC(sgeev) #endif #else #ifdef XDOUBLE #define GEEV BLASFUNC(xgeev) #elif defined(DOUBLE) #define GEEV BLASFUNC(zgeev) #else #define GEEV BLASFUNC(cgeev) #endif #endif #ifndef COMPLEX extern void GEEV( char* jobvl, char* jobvr, blasint* n, FLOAT* a, blasint* lda, FLOAT* wr, FLOAT* wi, FLOAT* vl, blasint* ldvl, FLOAT* vr, blasint* ldvr, FLOAT* work, blasint* lwork, blasint* info ); #else extern void GEEV( char* jobvl, char* jobvr, blasint* n, FLOAT* a, blasint* lda, FLOAT* wr, FLOAT* vl, blasint* ldvl, FLOAT* vr, blasint* ldvr, FLOAT* work, blasint* lwork, FLOAT *rwork, blasint* info ); #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork; FLOAT wkopt[4]; char job='V'; char jobr='N'; char *p; blasint m, i, j, info,lwork; double factor = 26.33; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_JOB"))) job=*p; if ( job == 'N' ) factor = 10.0; fprintf(stderr, "From : %3d To : %3d Step = %3d Job=%c\n", from, to, step,job); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( vl = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( vr = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( wr = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( wi = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( rwork = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } for(j = 
0; j < to; j++){ for(i = 0; i < to * COMPSIZE; i++){ a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } lwork = -1; m=to; #ifndef COMPLEX GEEV (&job, &jobr, &m, a, &m, wr, wi, vl, &m, vr, &m, wkopt, &lwork, &info); #else GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, wkopt, &lwork,rwork, &info); #endif lwork = (blasint)wkopt[0]; if (( work = (FLOAT *)malloc(sizeof(FLOAT) * lwork * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE FLops Time Lwork\n"); for(m = from; m <= to; m += step){ fprintf(stderr, " %6d : ", (int)m); gettimeofday( &start, (struct timezone *)0); lwork = -1; #ifndef COMPLEX GEEV (&job, &jobr, &m, a, &m, wr, wi, vl, &m, vr, &m, wkopt, &lwork, &info); #else GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, wkopt, &lwork,rwork, &info); #endif lwork = (blasint)wkopt[0]; #ifndef COMPLEX GEEV (&job, &jobr, &m, a, &m, wr, wi, vl, &m, vr, &m, work, &lwork, &info); #else GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, work, &lwork,rwork, &info); #endif gettimeofday( &stop, (struct timezone *)0); if (info) { fprintf(stderr, "failed to compute eigenvalues .. %d\n", info); exit(1); } time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; fprintf(stderr, " %10.2f MFlops : %10.2f Sec : %d\n", COMPSIZE * COMPSIZE * factor * (double)m * (double)m * (double)m / time1 * 1.e-6,time1,lwork); } return 0; } // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); OpenBLAS-0.2.20/benchmark/gemm.c000066400000000000000000000130651313527062700161740ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef GEMM #ifndef COMPLEX #ifdef DOUBLE #define GEMM BLASFUNC(dgemm) #else #define GEMM BLASFUNC(sgemm) #endif #else #ifdef DOUBLE #define GEMM BLASFUNC(zgemm) #else #define GEMM BLASFUNC(cgemm) #endif #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *a, *b, *c; FLOAT alpha[] = {1.0, 1.0}; FLOAT beta [] = {0.0, 0.0}; char trans='N'; blasint m, n, i, j; int loops = 1; int has_param_n=0; int l; char *p; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1,timeg; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; fprintf(stderr, "From : %3d To : %3d Step=%d : Trans=%c\n", from, to, step, trans); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } p = getenv("OPENBLAS_LOOPS"); if ( p != NULL ) loops = atoi(p); if ((p = getenv("OPENBLAS_PARAM_N"))) { n = atoi(p); has_param_n=1; } #ifdef linux srandom(getpid()); #endif for(j = 0; j < to; j++){ for(i = 0; i < to * COMPSIZE; i++){ a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; b[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; c[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } fprintf(stderr, " SIZE Flops Time\n"); for(m = from; m <= to; m += step) { timeg=0; if ( has_param_n == 1 && n <= m ) n=n; else n=m; fprintf(stderr, " %6dx%d : ", (int)m, (int)n); gettimeofday( &start, (struct timezone *)0); for (l=0; l #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef GEMM #ifndef COMPLEX #ifdef DOUBLE #define GEMM BLASFUNC(dgemm) #else #define GEMM BLASFUNC(sgemm) #endif #else #ifdef DOUBLE #define GEMM BLASFUNC(zgemm3m) #else #define GEMM BLASFUNC(cgemm3m) #endif #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define 
DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *a, *b, *c; FLOAT alpha[] = {1.0, 1.0}; FLOAT beta [] = {1.0, 1.0}; char trans='N'; blasint m, i, j; int loops = 1; int l; char *p; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1,timeg; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; fprintf(stderr, "From : %3d To : %3d Step=%d : Trans=%c\n", from, to, step, trans); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } p = getenv("OPENBLAS_LOOPS"); if ( p != NULL ) loops = atoi(p); #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { timeg=0; fprintf(stderr, " %6d : ", (int)m); for (l=0; l #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef GEMV #ifndef COMPLEX #ifdef DOUBLE #define GEMV BLASFUNC(dgemv) #else #define GEMV BLASFUNC(sgemv) #endif #else #ifdef DOUBLE #define GEMV BLASFUNC(zgemv) #else #define GEMV BLASFUNC(cgemv) #endif #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address 
== -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *a, *x, *y; FLOAT alpha[] = {1.0, 1.0}; FLOAT beta [] = {1.0, 1.0}; char trans='N'; blasint m, i, j; blasint inc_x=1,inc_y=1; blasint n=0; int has_param_n = 0; int has_param_m = 0; int loops = 1; int l; char *p; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1,timeg; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} int tomax = to; if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; if ((p = getenv("OPENBLAS_PARAM_N"))) { n = atoi(p); if ((n>0)) has_param_n = 1; if ( n > tomax ) tomax = n; } if ( has_param_n == 0 ) if ((p = getenv("OPENBLAS_PARAM_M"))) { m = atoi(p); if ((m>0)) has_param_m = 1; if ( m > tomax ) tomax = m; } fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); if (has_param_m == 0) { for(m = from; m <= to; m += step) { timeg=0; if ( has_param_n == 0 ) n = m; fprintf(stderr, " %6dx%d : ", (int)m,(int)n); for(j = 0; j < m; j++){ for(i = 0; i < n * COMPSIZE; i++){ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } for (l=0; l #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef GER #ifdef COMPLEX #ifdef DOUBLE #define GER BLASFUNC(zgeru) #else #define GER BLASFUNC(cgeru) #endif #else #ifdef DOUBLE #define GER BLASFUNC(dger) #else #define GER BLASFUNC(sger) #endif #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *a, *x, *y; FLOAT alpha[] = {1.0, 1.0}; blasint 
m, i, j; blasint inc_x=1,inc_y=1; blasint n=0; int has_param_n = 0; int loops = 1; int l; char *p; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1,timeg; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); if ((p = getenv("OPENBLAS_PARAM_N"))) { n = atoi(p); if ((n>0) && (n<=to)) has_param_n = 1; } if ( has_param_n == 1 ) fprintf(stderr, "From : %3d To : %3d Step = %3d N = %d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,n,inc_x,inc_y,loops); else fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { timeg=0; if ( has_param_n == 0 ) n = m; fprintf(stderr, " %6dx%d : ", (int)m,(int)n); for(j = 0; j < m; j++){ for(i = 0; i < n * COMPSIZE; i++){ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } for (l=0; l #include #ifdef __CYGWIN32__ #include #endif #include "common.h" double fabs(double); #undef GESV #undef GETRS #ifndef COMPLEX #ifdef XDOUBLE #define GESV BLASFUNC(qgesv) #elif defined(DOUBLE) #define GESV BLASFUNC(dgesv) #else #define GESV BLASFUNC(sgesv) #endif #else #ifdef XDOUBLE #define GESV BLASFUNC(xgesv) #elif defined(DOUBLE) #define GESV BLASFUNC(zgesv) #else #define GESV BLASFUNC(cgesv) #endif #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *a, *b; blasint *ipiv; blasint m, i, j, info; int from = 1; int to = 200; int step = 1; struct timeval start, 
stop; double time1; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( ipiv = (blasint *)malloc(sizeof(blasint) * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops Time\n"); for(m = from; m <= to; m += step){ fprintf(stderr, " %dx%d : ", (int)m, (int)m); for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ b[i + j * m * COMPSIZE] = 0.0; } } for (j = 0; j < m; ++j) { for (i = 0; i < m * COMPSIZE; ++i) { b[i] += a[i + j * m * COMPSIZE]; } } gettimeofday( &start, (struct timezone *)0); GESV (&m, &m, a, &m, ipiv, b, &m, &info); gettimeofday( &stop, (struct timezone *)0); time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; fprintf(stderr, "%10.2f MFlops %10.6f s\n", COMPSIZE * COMPSIZE * (2. / 3. * (double)m * (double)m * (double)m + 2. * (double)m * (double)m * (double)m ) / (time1) * 1.e-6 , time1); } return 0; } // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); OpenBLAS-0.2.20/benchmark/getri.c000066400000000000000000000155561313527062700163700ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef GETRF #undef GETRI #ifndef COMPLEX #ifdef XDOUBLE #define GETRF BLASFUNC(qgetrf) #define GETRI BLASFUNC(qgetri) #elif defined(DOUBLE) #define GETRF BLASFUNC(dgetrf) #define GETRI BLASFUNC(dgetri) #else #define GETRF BLASFUNC(sgetrf) #define GETRI BLASFUNC(sgetri) #endif #else #ifdef XDOUBLE #define GETRF BLASFUNC(xgetrf) #define GETRI BLASFUNC(xgetri) #elif defined(DOUBLE) #define GETRF BLASFUNC(zgetrf) #define GETRI BLASFUNC(zgetri) #else #define GETRF BLASFUNC(cgetrf) #define GETRI BLASFUNC(cgetri) #endif #endif extern void GETRI(blasint *m, FLOAT *a, blasint *lda, blasint *ipiv, FLOAT *work, blasint *lwork, blasint *info); #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *a,*work; FLOAT wkopt[4]; blasint *ipiv; blasint m, i, j, info,lwork; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( ipiv = (blasint *)malloc(sizeof(blasint) * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } for(j = 0; j < to; j++){ for(i = 0; i < to * COMPSIZE; i++){ a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } lwork = -1; m=to; GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info); lwork = (blasint)wkopt[0]; if (( work = (FLOAT *)malloc(sizeof(FLOAT) * lwork * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE FLops Time Lwork\n"); for(m = from; m <= to; m += step){ fprintf(stderr, " %6d : ", (int)m); GETRF (&m, &m, a, &m, ipiv, &info); if (info) { fprintf(stderr, "Matrix is not singular .. 
%d\n", info); exit(1); } gettimeofday( &start, (struct timezone *)0); lwork = -1; GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info); lwork = (blasint)wkopt[0]; GETRI(&m, a, &m, ipiv, work, &lwork, &info); gettimeofday( &stop, (struct timezone *)0); if (info) { fprintf(stderr, "failed compute inverse matrix .. %d\n", info); exit(1); } time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; fprintf(stderr, " %10.2f MFlops : %10.2f Sec : %d\n", COMPSIZE * COMPSIZE * (4.0/3.0 * (double)m * (double)m *(double)m - (double)m *(double)m + 5.0/3.0* (double)m) / time1 * 1.e-6,time1,lwork); } return 0; } // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); OpenBLAS-0.2.20/benchmark/hemm.c000066400000000000000000000122131313527062700161670ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef HEMM #ifdef DOUBLE #define HEMM BLASFUNC(zhemm) #else #define HEMM BLASFUNC(chemm) #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *a, *b, *c; FLOAT alpha[] = {1.0, 1.0}; FLOAT beta [] = {1.0, 1.0}; char *p; char side='L'; char uplo='U'; if ((p = getenv("OPENBLAS_SIDE"))) side=*p; if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; blasint m, i, j; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} fprintf(stderr, "From : %3d To : %3d Step = %3d Side = %c Uplo = %c\n", from, to, step,side,uplo); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { fprintf(stderr, " %6d : ", (int)m); for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } gettimeofday( &start, (struct timezone *)0); HEMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); gettimeofday( &stop, (struct timezone *)0); time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; gettimeofday( &start, (struct timezone *)0); fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. 
* (double)m * (double)m * (double)m / time1 * 1.e-6); } return 0; } // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); OpenBLAS-0.2.20/benchmark/hemv.c000066400000000000000000000127441313527062700162110ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
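Usage sketch for this hemv benchmark: besides the from/to/step size arguments it honours OPENBLAS_LOOPS (repetitions per size), OPENBLAS_INCX / OPENBLAS_INCY (vector strides) and OPENBLAS_UPLO, calling chemv/zhemv on an m x m matrix for every size in the sweep. Assuming the same ".goto" naming convention as the other benchmarks in this directory, for example:

    OPENBLAS_LOOPS=10 OPENBLAS_INCX=2 ./zhemv.goto 100 2000 100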
*****************************************************************************/ #include #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef HEMV #ifdef DOUBLE #define HEMV BLASFUNC(zhemv) #else #define HEMV BLASFUNC(chemv) #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *a, *x, *y; FLOAT alpha[] = {1.0, 1.0}; FLOAT beta [] = {1.0, 1.0}; char uplo='L'; blasint m, i, j; blasint inc_x=1,inc_y=1; int loops = 1; int l; char *p; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1,timeg; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { timeg=0; fprintf(stderr, " %6dx%d : ", (int)m,(int)m); for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } for (l=0; l #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef HER2K #ifdef DOUBLE #define HER2K BLASFUNC(zher2k) #else #define HER2K BLASFUNC(cher2k) #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= 
DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *a, *b, *c; FLOAT alpha[] = {1.0, 1.0}; FLOAT beta [] = {1.0, 1.0}; char *p; char uplo='U'; char trans='N'; if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; blasint m, i, j; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { fprintf(stderr, " %6d : ", (int)m); for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } gettimeofday( &start, (struct timezone *)0); HER2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); gettimeofday( &stop, (struct timezone *)0); time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; gettimeofday( &start, (struct timezone *)0); fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / time1 * 1.e-6); } return 0; } // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); OpenBLAS-0.2.20/benchmark/herk.c000066400000000000000000000117001313527062700161720ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef HERK #ifdef DOUBLE #define HERK BLASFUNC(zherk) #else #define HERK BLASFUNC(cherk) #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *a, *c; FLOAT alpha[] = {1.0, 1.0}; FLOAT beta [] = {1.0, 1.0}; char *p; char uplo='U'; char trans='N'; if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; blasint m, i, j; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { fprintf(stderr, " %6d : ", (int)m); for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } gettimeofday( &start, (struct timezone *)0); HERK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m ); gettimeofday( &stop, (struct timezone *)0); time1 = 
(double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; gettimeofday( &start, (struct timezone *)0); fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6); } return 0; } // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); OpenBLAS-0.2.20/benchmark/iamax.c000066400000000000000000000114711313527062700163450ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef IAMAX #ifdef COMPLEX #ifdef DOUBLE #define IAMAX BLASFUNC(izamax) #else #define IAMAX BLASFUNC(icamax) #endif #else #ifdef DOUBLE #define IAMAX BLASFUNC(idamax) #else #define IAMAX BLASFUNC(isamax) #endif #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *x; blasint m, i; blasint inc_x=1; int loops = 1; int l; char *p; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1,timeg; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { timeg=0; fprintf(stderr, " %6d : ", (int)m); for (l=0; l #include #ifdef __CYGWIN32__ #include #endif #include "common.h" double fabs(double); #undef GETRF #undef GETRS #ifndef COMPLEX #ifdef XDOUBLE #define GETRF BLASFUNC(qgetrf) #define GETRS BLASFUNC(qgetrs) #elif defined(DOUBLE) #define GETRF BLASFUNC(dgetrf) #define GETRS BLASFUNC(dgetrs) #else #define GETRF BLASFUNC(sgetrf) #define GETRS BLASFUNC(sgetrs) #endif #else #ifdef XDOUBLE #define GETRF BLASFUNC(xgetrf) #define GETRS BLASFUNC(xgetrs) #elif defined(DOUBLE) #define GETRF BLASFUNC(zgetrf) #define GETRS BLASFUNC(zgetrs) #else #define GETRF BLASFUNC(cgetrf) #define GETRS BLASFUNC(cgetrs) #endif #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 
1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *a, *b; blasint *ipiv; blasint m, i, j, info; blasint unit = 1; int from = 1; int to = 200; int step = 1; FLOAT maxerr; struct timeval start, stop; double time1, time2; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( ipiv = (blasint *)malloc(sizeof(blasint) * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Residual Decompose Solve Total\n"); for(m = from; m <= to; m += step){ fprintf(stderr, " %6d : ", (int)m); for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } for (i = 0; i < m * COMPSIZE; ++i) b[i] = 0.; for (j = 0; j < m; ++j) { for (i = 0; i < m * COMPSIZE; ++i) { b[i] += a[i + j * m * COMPSIZE]; } } gettimeofday( &start, (struct timezone *)0); GETRF (&m, &m, a, &m, ipiv, &info); gettimeofday( &stop, (struct timezone *)0); if (info) { fprintf(stderr, "Matrix is not singular .. %d\n", info); exit(1); } time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; gettimeofday( &start, (struct timezone *)0); GETRS("N", &m, &unit, a, &m, ipiv, b, &m, &info); gettimeofday( &stop, (struct timezone *)0); if (info) { fprintf(stderr, "Matrix is not singular .. %d\n", info); exit(1); } time2 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; maxerr = 0.; for(i = 0; i < m; i++){ #ifndef XDOUBLE if (maxerr < fabs(b[i * COMPSIZE] - 1.0)) maxerr = fabs(b[i * COMPSIZE] - 1.0); #ifdef COMPLEX if (maxerr < fabs(b[i * COMPSIZE] + 1)) maxerr = fabs(b[i * COMPSIZE + 1]); #endif #else if (maxerr < fabsl(b[i * COMPSIZE] - 1.0L)) maxerr = fabsl(b[i * COMPSIZE] - 1.0L); #ifdef COMPLEX if (maxerr < fabsl(b[i * COMPSIZE] + 1)) maxerr = fabsl(b[i * COMPSIZE + 1]); #endif #endif } #ifdef XDOUBLE fprintf(stderr," %Le ", maxerr); #else fprintf(stderr," %e ", maxerr); #endif fprintf(stderr, " %10.2f MFlops %10.2f MFlops %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. / 3. * (double)m * (double)m * (double)m / time1 * 1.e-6, COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time2 * 1.e-6, COMPSIZE * COMPSIZE * (2. / 3. * (double)m * (double)m * (double)m + 2. 
* (double)m * (double)m) / (time1 + time2) * 1.e-6); #if 0 if ( #ifdef DOUBLE maxerr > 1.e-8 #else maxerr > 1.e-1 #endif ) { fprintf(stderr, "Error is too large.\n"); exit(1); } #endif } return 0; } // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); OpenBLAS-0.2.20/benchmark/nrm2.c000066400000000000000000000114551313527062700161260ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef NRM2 #ifdef COMPLEX #ifdef DOUBLE #define NRM2 BLASFUNC(dznrm2) #else #define NRM2 BLASFUNC(scnrm2) #endif #else #ifdef DOUBLE #define NRM2 BLASFUNC(dnrm2) #else #define NRM2 BLASFUNC(snrm2) #endif #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *x; blasint m, i; blasint inc_x=1; int loops = 1; int l; char *p; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1,timeg; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { timeg=0; fprintf(stderr, " %6d : ", (int)m); for (l=0; l&1|./plotfilter.sh >OpenBLAS # ************************************************************************ if [ $# -eq 1 ] then arg1=$1 else arg1=0 fi case $arg1 in L) # Linpack Benchmark awk '/MFlops/ { print $1,int($8) }'|tail --lines=+2 ;; C) # Cholesky Benchmark awk '/MFlops/ { print $3,int($9) }'|tail --lines=+2 ;; B) # Copy Benchmark awk '/MBytes/ { print $1,int($3) }'|tail --lines=+2 ;; *) awk '/MFlops/ { print $1,int($3) }'|tail --lines=+2 ;; esac OpenBLAS-0.2.20/benchmark/plot-header000066400000000000000000000040161313527062700172260ustar00rootroot00000000000000# ********************************************************************************** # Copyright (c) 2014, The OpenBLAS Project # All rights reserved. # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. 
Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # 3. Neither the name of the OpenBLAS project nor the names of # its contributors may be used to endorse or promote products # derived from this software without specific prior written permission. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE # USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ********************************************************************************** set term x11 font sans; set ylabel "MFlops"; set xlabel "Size"; set grid xtics; set grid ytics; set key left; set timestamp "generated on %Y-%m-%d by `whoami`" set title "Dtrsm\nUPLO=U TRANS=N SIDE=L\nBulldozer 1 Thread" plot 'OpenBLAS' smooth bezier, 'ACML' smooth bezier, 'MKL' smooth bezier; set output "print.png"; show title; show plot; show output; OpenBLAS-0.2.20/benchmark/potrf.c000066400000000000000000000217621313527062700164040ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #ifdef __CYGWIN32__ #include #endif #include "common.h" double fabs(double); #undef POTRF #ifndef COMPLEX #ifdef XDOUBLE #define POTRF BLASFUNC(qpotrf) #define POTRS BLASFUNC(qpotrs) #define POTRI BLASFUNC(qpotri) #define SYRK BLASFUNC(qsyrk) #elif defined(DOUBLE) #define POTRF BLASFUNC(dpotrf) #define POTRS BLASFUNC(dpotrs) #define POTRI BLASFUNC(dpotri) #define SYRK BLASFUNC(dsyrk) #else #define POTRF BLASFUNC(spotrf) #define POTRS BLASFUNC(spotrs) #define POTRI BLASFUNC(spotri) #define SYRK BLASFUNC(ssyrk) #endif #else #ifdef XDOUBLE #define POTRF BLASFUNC(xpotrf) #define POTRS BLASFUNC(xpotrs) #define POTRI BLASFUNC(xpotri) #define SYRK BLASFUNC(xherk) #elif defined(DOUBLE) #define POTRF BLASFUNC(zpotrf) #define POTRS BLASFUNC(zpotrs) #define POTRI BLASFUNC(zpotri) #define SYRK BLASFUNC(zherk) #else #define POTRF BLASFUNC(cpotrf) #define POTRS BLASFUNC(cpotrs) #define POTRI BLASFUNC(cpotri) #define SYRK BLASFUNC(cherk) #endif #endif // extern void POTRI(char *uplo, blasint *m, FLOAT *a, blasint *lda, blasint *info); // extern void POTRS(char *uplo, blasint *m, blasint *n, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb, blasint *info); #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif int main(int argc, char *argv[]){ #ifndef COMPLEX char *trans[] = {"T", "N"}; #else char *trans[] = {"C", "N"}; #endif char *uplo[] = {"U", "L"}; FLOAT alpha[] = {1.0, 0.0}; FLOAT beta [] = {0.0, 0.0}; FLOAT *a, *b; char *p; char btest = 'F'; blasint m, i, j, info, uplos=0; double flops; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_UPLO"))) if (*p == 'L') uplos=1; if ((p = getenv("OPENBLAS_TEST"))) btest=*p; fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } for(m = from; m <= to; m += step){ #ifndef COMPLEX if (uplos & 1) { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) a[i + j * m] = 0.; a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.; for(i = j + 1; i < m; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5; } } else { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5; a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.; 
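/* The +8 shift keeps the diagonal of the random triangular factor well away from
   zero; the SYRK/HERK call below multiplies this factor with its own (conjugate)
   transpose, so the matrix b handed to POTRF is symmetric/Hermitian positive
   definite by construction and the timed factorization should never hit the
   info error path. */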
for(i = j + 1; i < m; i++) a[i + j * m] = 0.; } } #else if (uplos & 1) { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) { a[(i + j * m) * 2 + 0] = 0.; a[(i + j * m) * 2 + 1] = 0.; } a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; a[(j + j * m) * 2 + 1] = 0.; for(i = j + 1; i < m; i++) { a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; } } } else { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) { a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; } a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; a[(j + j * m) * 2 + 1] = 0.; for(i = j + 1; i < m; i++) { a[(i + j * m) * 2 + 0] = 0.; a[(i + j * m) * 2 + 1] = 0.; } } } #endif SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m); gettimeofday( &start, (struct timezone *)0); POTRF(uplo[uplos], &m, b, &m, &info); gettimeofday( &stop, (struct timezone *)0); if (info != 0) { fprintf(stderr, "Potrf info = %d\n", info); exit(1); } time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6; if ( btest == 'S' ) { for(j = 0; j < to; j++){ for(i = 0; i < to * COMPSIZE; i++){ a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } gettimeofday( &start, (struct timezone *)0); POTRS(uplo[uplos], &m, &m, b, &m, a, &m, &info); gettimeofday( &stop, (struct timezone *)0); if (info != 0) { fprintf(stderr, "Potrs info = %d\n", info); exit(1); } time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6; } if ( btest == 'I' ) { gettimeofday( &start, (struct timezone *)0); POTRI(uplo[uplos], &m, b, &m, &info); gettimeofday( &stop, (struct timezone *)0); if (info != 0) { fprintf(stderr, "Potri info = %d\n", info); exit(1); } time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6; } fprintf(stderr, "%8d : %10.2f MFlops : %10.3f Sec : Test=%c\n",m,flops ,time1,btest); } return 0; } // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); OpenBLAS-0.2.20/benchmark/rot.c000066400000000000000000000121221313527062700160440ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef DOT #ifdef DOUBLE #define ROT BLASFUNC(drot) #else #define ROT BLASFUNC(srot) #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *x, *y; // FLOAT result; blasint m, i; blasint inc_x=1,inc_y=1; FLOAT c[1] = { 2.0 }; FLOAT s[1] = { 2.0 }; int loops = 1; int l; char *p; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1,timeg; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { timeg=0; fprintf(stderr, " %6d : ", (int)m); for (l=0; l #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef SCAL #ifdef COMPLEX #ifdef DOUBLE #define SCAL BLASFUNC(zscal) #else #define SCAL BLASFUNC(cscal) #endif #else #ifdef DOUBLE #define SCAL BLASFUNC(dscal) 
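/* One source file covers sscal, dscal, cscal and zscal: the COMPLEX and DOUBLE
   macros (presumably defined per target by the benchmark Makefile) pick the
   matching BLAS symbol at compile time, and COMPSIZE (1 for real, 2 for complex)
   scales the allocations so the buffers hold both components. */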
#else #define SCAL BLASFUNC(sscal) #endif #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *x, *y; FLOAT alpha[2] = { 2.0, 2.0 }; blasint m, i; blasint inc_x=1,inc_y=1; int loops = 1; int l; char *p; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1,timeg; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { timeg=0; fprintf(stderr, " %6d : ", (int)m); for (l=0; l 0) { for (z in 1:length(argv)) { if (z == 1) { nfrom <- as.numeric(argv[z]) } else if (z == 2) { nto <- as.numeric(argv[z]) } else if (z == 3) { nstep <- as.numeric(argv[z]) } else if (z == 4) { loops <- as.numeric(argv[z]) } } } p <- Sys.getenv("OPENBLAS_LOOPS") if (p != "") { loops <- as.numeric(p) } cat(sprintf( "From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops )) cat(sprintf(" SIZE Flops Time\n")) n <- nfrom while (n <= nto) { A <- matrix(rnorm(n * n), ncol = n, nrow = n) ev <- 0 z <- system.time(for (l in 1:loops) { ev <- eigen(A) }) mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep } OpenBLAS-0.2.20/benchmark/scripts/R/dgemm.R000077500000000000000000000022131313527062700202030ustar00rootroot00000000000000#!/usr/bin/Rscript argv <- commandArgs(trailingOnly = TRUE) nfrom <- 128 nto <- 2048 nstep <- 128 loops <- 1 if (length(argv) > 0) { for (z in 1:length(argv)) { if (z == 1) { nfrom <- as.numeric(argv[z]) } else if (z == 2) { nto <- as.numeric(argv[z]) } else if (z == 3) { nstep <- as.numeric(argv[z]) } else if (z == 4) { loops <- as.numeric(argv[z]) } } } p <- Sys.getenv("OPENBLAS_LOOPS") if (p != 
"") { loops <- as.numeric(p) } cat(sprintf( "From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops )) cat(sprintf(" SIZE Flops Time\n")) n <- nfrom while (n <= nto) { A <- matrix(runif(n * n), ncol = n, nrow = n, byrow = TRUE) B <- matrix(runif(n * n), ncol = n, nrow = n, byrow = TRUE) C <- 1 z <- system.time(for (l in 1:loops) { C <- A %*% B l <- l + 1 }) mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep } OpenBLAS-0.2.20/benchmark/scripts/R/dsolve.R000077500000000000000000000020361313527062700204110ustar00rootroot00000000000000#!/usr/bin/Rscript argv <- commandArgs(trailingOnly = TRUE) nfrom <- 128 nto <- 2048 nstep <- 128 loops <- 1 if (length(argv) > 0) { for (z in 1:length(argv)) { if (z == 1) { nfrom <- as.numeric(argv[z]) } else if (z == 2) { nto <- as.numeric(argv[z]) } else if (z == 3) { nstep <- as.numeric(argv[z]) } else if (z == 4) { loops <- as.numeric(argv[z]) } } } p <- Sys.getenv("OPENBLAS_LOOPS") if (p != "") { loops <- as.numeric(p) } cat(sprintf( "From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops )) cat(sprintf(" SIZE Flops Time\n")) n <- nfrom while (n <= nto) { A <- matrix(rnorm(n * n), ncol = n, nrow = n) B <- matrix(rnorm(n * n), ncol = n, nrow = n) z <- system.time(for (l in 1:loops) { solve(A, B) }) mflops <- (2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep } OpenBLAS-0.2.20/benchmark/scripts/SCIPY/000077500000000000000000000000001313527062700174545ustar00rootroot00000000000000OpenBLAS-0.2.20/benchmark/scripts/SCIPY/dsyrk.py000077500000000000000000000023171313527062700211700ustar00rootroot00000000000000#!/usr/bin/env python import os import sys import time import numpy from numpy import zeros from numpy.random import randn from scipy.linalg import blas def run_dsyrk(N, l): A = randn(N, N).astype('float64', order='F') C = zeros((N, N), dtype='float64', order='F') start = time.time() for i in range(0, l): blas.dsyrk(1.0, A, c=C, overwrite_c=True) end = time.time() timediff = (end - start) mflops = (N * N * N) * l / timediff mflops *= 1e-6 size = "%dx%d" % (N, N) print("%14s :\t%20f MFlops\t%20f sec" % (size, mflops, timediff)) if __name__ == "__main__": N = 128 NMAX = 2048 NINC = 128 LOOPS = 1 z = 0 for arg in sys.argv: if z == 1: N = int(arg) elif z == 2: NMAX = int(arg) elif z == 3: NINC = int(arg) elif z == 4: LOOPS = int(arg) z = z + 1 if 'OPENBLAS_LOOPS' in os.environ: p = os.environ['OPENBLAS_LOOPS'] if p: LOOPS = int(p) print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS)) print("\tSIZE\t\t\tFlops\t\t\t\t\tTime") for i in range(N, NMAX + NINC, NINC): run_dsyrk(i, LOOPS) OpenBLAS-0.2.20/benchmark/scripts/SCIPY/ssyrk.py000077500000000000000000000023171313527062700212070ustar00rootroot00000000000000#!/usr/bin/env python import os import sys import time import numpy from numpy import zeros from numpy.random import randn from scipy.linalg import blas def run_ssyrk(N, l): A = randn(N, N).astype('float32', order='F') C = zeros((N, N), dtype='float32', order='F') start = time.time() for i in range(0, l): blas.ssyrk(1.0, A, c=C, overwrite_c=True) end = time.time() timediff = (end - start) mflops = (N * N * N) * l / timediff mflops *= 1e-6 size = "%dx%d" % (N, N) print("%14s :\t%20f MFlops\t%20f sec" % (size, mflops, timediff)) if __name__ == "__main__": N = 128 NMAX = 2048 
NINC = 128 LOOPS = 1 z = 0 for arg in sys.argv: if z == 1: N = int(arg) elif z == 2: NMAX = int(arg) elif z == 3: NINC = int(arg) elif z == 4: LOOPS = int(arg) z = z + 1 if 'OPENBLAS_LOOPS' in os.environ: p = os.environ['OPENBLAS_LOOPS'] if p: LOOPS = int(p) print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS)) print("\tSIZE\t\t\tFlops\t\t\t\t\tTime") for i in range(N, NMAX + NINC, NINC): run_ssyrk(i, LOOPS) OpenBLAS-0.2.20/benchmark/smallscaling.c000066400000000000000000000123771313527062700177250ustar00rootroot00000000000000// run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n #include #include #include #include #include #include #include #define MIN_SIZE 5 #define MAX_SIZE 60 #define NB_SIZE 10 // number of loop for a 1x1 matrix. Lower it if the test is // too slow on you computer. #define NLOOP 2e7 typedef struct { int matrix_size; int n_loop; void (* bench_func)(); void (* blas_func)(); void * (* create_matrix)(int size); } BenchParam; void * s_create_matrix(int size) { float * r = malloc(size * sizeof(double)); int i; for(i = 0; i < size; i++) r[i] = 1e3 * i / size; return r; } void * c_create_matrix(int size) { float * r = malloc(size * 2 * sizeof(double)); int i; for(i = 0; i < 2 * size; i++) r[i] = 1e3 * i / size; return r; } void * z_create_matrix(int size) { double * r = malloc(size * 2 * sizeof(double)); int i; for(i = 0; i < 2 * size; i++) r[i] = 1e3 * i / size; return r; } void * d_create_matrix(int size) { double * r = malloc(size * sizeof(double)); int i; for(i = 0; i < size; i++) r[i] = 1e3 * i / size; return r; } void trmv_bench(BenchParam * param) { int i, n; int size = param->matrix_size; n = param->n_loop / size; int one = 1; void * A = param->create_matrix(size * size); void * y = param->create_matrix(size); for(i = 0; i < n; i++) { param->blas_func("U", "N", "N", &size, A, &size, y, &one); } free(A); free(y); } void gemv_bench(BenchParam * param) { int i, n; int size = param->matrix_size; n = param->n_loop / size; double v = 1.01; int one = 1; void * A = param->create_matrix(size * size); void * y = param->create_matrix(size); for(i = 0; i < n; i++) { param->blas_func("N", &size, &size, &v, A, &size, y, &one, &v, y, &one); } free(A); free(y); } void ger_bench(BenchParam * param) { int i, n; int size = param->matrix_size; n = param->n_loop / size; double v = 1.01; int one = 1; void * A = param->create_matrix(size * size); void * y = param->create_matrix(size); for(i = 0; i < n; i++) { param->blas_func(&size, &size, &v, y, &one, y, &one, A, &size); } free(A); free(y); } #ifndef _WIN32 void * pthread_func_wrapper(void * param) { ((BenchParam *)param)->bench_func(param); pthread_exit(NULL); } #endif #define NB_TESTS 5 void * TESTS[4 * NB_TESTS] = { trmv_bench, ztrmv_, z_create_matrix, "ztrmv", gemv_bench, dgemv_, d_create_matrix, "dgemv", gemv_bench, zgemv_, z_create_matrix, "zgemv", ger_bench, dger_, d_create_matrix, "dger", ger_bench, zgerc_, z_create_matrix, "zgerc", }; inline static double delta_time(struct timespec tick) { struct timespec tock; clock_gettime(CLOCK_MONOTONIC, &tock); return (tock.tv_sec - tick.tv_sec) + (tock.tv_nsec - tick.tv_nsec) / 1e9; } double pthread_bench(BenchParam * param, int nb_threads) { #ifdef _WIN32 return 0; #else BenchParam threaded_param = *param; pthread_t threads[nb_threads]; int t, rc; struct timespec tick; threaded_param.n_loop /= nb_threads; clock_gettime(CLOCK_MONOTONIC, &tick); for(t=0; tbench_func(param); return delta_time(tick); } double omp_bench(BenchParam * param) { BenchParam threaded_param = *param; 
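/* omp_bench divides the fixed total loop count evenly across the available OpenMP
   threads and runs one bench_func call per thread inside the parallel for, so the
   total amount of BLAS work matches seq_bench and the ratio of the two wall-clock
   times is what main() reports as the OpenMP speedup. */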
struct timespec tick; int t; int nb_threads = omp_get_max_threads(); threaded_param.n_loop /= nb_threads; clock_gettime(CLOCK_MONOTONIC, &tick); #pragma omp parallel for for(t = 0; t < nb_threads; t ++){ param->bench_func(&threaded_param); } return delta_time(tick); } int main(int argc, char * argv[]) { double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE); BenchParam param; int test_id; printf ("Running on %d threads\n", omp_get_max_threads()); for(test_id = 0; test_id < NB_TESTS; test_id ++) { double size = MIN_SIZE; param.bench_func = TESTS[test_id * 4]; param.blas_func = TESTS[test_id * 4 + 1]; param.create_matrix = TESTS[test_id * 4 + 2]; printf("\nBenchmark of %s\n", (char*)TESTS[test_id * 4 + 3]); param.n_loop = NLOOP; while(size <= MAX_SIZE) { param.matrix_size = (int)(size + 0.5); double seq_time = seq_bench(¶m); double omp_time = omp_bench(¶m); double pthread_time = pthread_bench(¶m, omp_get_max_threads()); printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, " "pthread %gs, speedup %g\n", param.matrix_size, seq_time, omp_time, seq_time / omp_time, pthread_time, seq_time / pthread_time); size *= inc_factor; } } return(0); } OpenBLAS-0.2.20/benchmark/swap.c000066400000000000000000000122511313527062700162150ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above swapright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above swapright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE SWAPRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef SWAP #ifdef COMPLEX #ifdef DOUBLE #define SWAP BLASFUNC(zswap) #else #define SWAP BLASFUNC(cswap) #endif #else #ifdef DOUBLE #define SWAP BLASFUNC(dswap) #else #define SWAP BLASFUNC(sswap) #endif #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *x, *y; FLOAT alpha[2] = { 2.0, 2.0 }; blasint m, i; blasint inc_x=1,inc_y=1; int loops = 1; int l; char *p; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1,timeg; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { timeg=0; fprintf(stderr, " %6d : ", (int)m); for (l=0; l #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef SYMM #ifndef COMPLEX #ifdef DOUBLE #define SYMM BLASFUNC(dsymm) #else #define SYMM BLASFUNC(ssymm) #endif #else #ifdef DOUBLE #define SYMM BLASFUNC(zsymm) #else #define SYMM BLASFUNC(csymm) #endif #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && 
!defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *a, *b, *c; FLOAT alpha[] = {1.0, 1.0}; FLOAT beta [] = {1.0, 1.0}; char *p; char side='L'; char uplo='U'; if ((p = getenv("OPENBLAS_SIDE"))) side=*p; if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; blasint m, i, j; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} fprintf(stderr, "From : %3d To : %3d Step = %3d Side = %c Uplo = %c\n", from, to, step,side,uplo); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { fprintf(stderr, " %6d : ", (int)m); for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } gettimeofday( &start, (struct timezone *)0); SYMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); gettimeofday( &stop, (struct timezone *)0); time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; gettimeofday( &start, (struct timezone *)0); fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / time1 * 1.e-6); } return 0; } // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); OpenBLAS-0.2.20/benchmark/symv.c000066400000000000000000000131341313527062700162420ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef SYMV #ifndef COMPLEX #ifdef DOUBLE #define SYMV BLASFUNC(dsymv) #else #define SYMV BLASFUNC(ssymv) #endif #else #ifdef DOUBLE #define SYMV BLASFUNC(zsymv) #else #define SYMV BLASFUNC(csymv) #endif #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *a, *x, *y; FLOAT alpha[] = {1.0, 1.0}; FLOAT beta [] = {1.0, 1.0}; char uplo='L'; blasint m, i, j; blasint inc_x=1,inc_y=1; int loops = 1; int l; char *p; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1,timeg; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m 
= from; m <= to; m += step) { timeg=0; fprintf(stderr, " %6dx%d : ", (int)m,(int)m); for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } for (l=0; l #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef SYR2K #ifndef COMPLEX #ifdef DOUBLE #define SYR2K BLASFUNC(dsyr2k) #else #define SYR2K BLASFUNC(ssyr2k) #endif #else #ifdef DOUBLE #define SYR2K BLASFUNC(zsyr2k) #else #define SYR2K BLASFUNC(csyr2k) #endif #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *a, *b, *c; FLOAT alpha[] = {1.0, 1.0}; FLOAT beta [] = {1.0, 1.0}; char *p; char uplo='U'; char trans='N'; if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; blasint m, i, j; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { fprintf(stderr, " %6d : ", (int)m); for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } gettimeofday( &start, (struct timezone *)0); SYR2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); gettimeofday( &stop, (struct timezone *)0); time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; gettimeofday( &start, (struct timezone *)0); fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. 
* (double)m * (double)m * (double)m / time1 * 1.e-6); } return 0; } // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); OpenBLAS-0.2.20/benchmark/syrk.c000066400000000000000000000120701313527062700162320ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef SYRK #ifndef COMPLEX #ifdef DOUBLE #define SYRK BLASFUNC(dsyrk) #else #define SYRK BLASFUNC(ssyrk) #endif #else #ifdef DOUBLE #define SYRK BLASFUNC(zsyrk) #else #define SYRK BLASFUNC(csyrk) #endif #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *a, *c; FLOAT alpha[] = {1.0, 1.0}; FLOAT beta [] = {1.0, 1.0}; char *p; char uplo='U'; char trans='N'; if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; blasint m, i, j; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { fprintf(stderr, " %6d : ", (int)m); for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } gettimeofday( &start, (struct timezone *)0); SYRK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m ); gettimeofday( &stop, (struct timezone *)0); time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; gettimeofday( &start, (struct timezone *)0); fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6); } return 0; } // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); OpenBLAS-0.2.20/benchmark/tplot-header000066400000000000000000000040021313527062700174050ustar00rootroot00000000000000# ********************************************************************************** # Copyright (c) 2014, The OpenBLAS Project # All rights reserved. 
# Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # 3. Neither the name of the OpenBLAS project nor the names of # its contributors may be used to endorse or promote products # derived from this software without specific prior written permission. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE # USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ********************************************************************************** set term x11 font sans; set ylabel "MFlops"; set xlabel "Size"; set grid xtics; set grid ytics; set key left; set timestamp "generated on %Y-%m-%d by `whoami`" set title "Sgemv\nTRANS=T\nBulldozer" plot '1-THREAD' smooth bezier, '2-THREADS' smooth bezier, '4-THREADS' smooth bezier; set output "print.png"; show title; show plot; show output; OpenBLAS-0.2.20/benchmark/trmm.c000066400000000000000000000123611313527062700162240ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef TRMM #ifndef COMPLEX #ifdef DOUBLE #define TRMM BLASFUNC(dtrmm) #else #define TRMM BLASFUNC(strmm) #endif #else #ifdef DOUBLE #define TRMM BLASFUNC(ztrmm) #else #define TRMM BLASFUNC(ctrmm) #endif #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *a, *b; FLOAT alpha[] = {1.0, 1.0}; FLOAT beta [] = {1.0, 1.0}; char *p; char side ='L'; char uplo ='U'; char trans='N'; char diag ='U'; if ((p = getenv("OPENBLAS_SIDE"))) side=*p; if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; if ((p = getenv("OPENBLAS_DIAG"))) diag=*p; blasint m, i, j; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} fprintf(stderr, "From : %3d To : %3d Step = %3d Side = %c Uplo = %c Trans = %c Diag = %c\n", from, to, step,side,uplo,trans,diag); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { fprintf(stderr, " %6d : ", (int)m); for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } gettimeofday( &start, (struct timezone *)0); TRMM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m); gettimeofday( &stop, (struct 
timezone *)0); time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; gettimeofday( &start, (struct timezone *)0); fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6, time1); } return 0; } // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); OpenBLAS-0.2.20/benchmark/trsm.c000066400000000000000000000126621313527062700162360ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef TRSM #ifndef COMPLEX #ifdef DOUBLE #define TRSM BLASFUNC(dtrsm) #else #define TRSM BLASFUNC(strsm) #endif #else #ifdef DOUBLE #define TRSM BLASFUNC(ztrsm) #else #define TRSM BLASFUNC(ctrsm) #endif #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *a, *b; FLOAT alpha[] = {1.0, 1.0}; FLOAT beta [] = {1.0, 1.0}; char *p; char side ='L'; char uplo ='U'; char trans='N'; char diag ='U'; int l; int loops = 1; double timeg; if ((p = getenv("OPENBLAS_SIDE"))) side=*p; if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; if ((p = getenv("OPENBLAS_DIAG"))) diag=*p; p = getenv("OPENBLAS_LOOPS"); if ( p != NULL ) loops = atoi(p); blasint m, i, j; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} fprintf(stderr, "From : %3d To : %3d Step = %3d Side = %c Uplo = %c Trans = %c Diag = %c Loops = %d\n", from, to, step,side,uplo,trans,diag,loops); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { timeg=0.0; fprintf(stderr, " %6d : ", (int)m); for (l=0; l #include #ifdef __CYGWIN32__ #include #endif #define RETURN_BY_STACK 1 #include "common.h" #undef DOT #ifdef DOUBLE #define DOT BLASFUNC(zdotu) #else #define DOT BLASFUNC(cdotu) #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); 
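/* Descriptive note (editor's comment, not upstream): tmpres holds microseconds since the Unix epoch at this point; the division above yields the whole-second part, and the modulo on the next line keeps the sub-second microsecond remainder. */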
tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *x, *y; FLOAT _Complex result; blasint m, i; blasint inc_x=1,inc_y=1; int loops = 1; int l; char *p; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1,timeg; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { timeg=0; fprintf(stderr, " %6d : ", (int)m); for (l=0; l #include #ifdef __CYGWIN32__ #include #endif #include "common.h" #undef DOT #ifdef DOUBLE #define DOT BLASFUNC(zdotu) #else #define DOT BLASFUNC(cdotu) #endif #if defined(__WIN32__) || defined(__WIN64__) #ifndef DELTA_EPOCH_IN_MICROSECS #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; if (NULL != tv) { GetSystemTimeAsFileTime(&ft); tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } return 0; } #endif #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 static void *huge_malloc(BLASLONG size){ int shmid; void *address; #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif if ((shmid =shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), SHM_HUGETLB | IPC_CREAT |0600)) < 0) { printf( "Memory allocation failed(shmget).\n"); exit(1); } address = shmat(shmid, NULL, SHM_RND); if ((BLASLONG)address == -1){ printf( "Memory allocation failed(shmat).\n"); exit(1); } shmctl(shmid, IPC_RMID, 0); return address; } #define malloc huge_malloc #endif int main(int argc, char *argv[]){ FLOAT *x, *y; FLOAT _Complex result; blasint m, i; blasint inc_x=1,inc_y=1; int loops = 1; int l; char *p; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1,timeg; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_LOOPS"))) loops = 
atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { timeg=0; fprintf(stderr, " %6d : ", (int)m); for (l=0; l 1 ); $binary = $ENV{"BINARY"}; $makefile = shift(@ARGV); $config = shift(@ARGV); $compiler_name = join(" ", @ARGV); # First, we need to know the target OS and compiler name $data = `$compiler_name -E ctest.c`; if ($?) { printf STDERR "C Compiler ($compiler_name) is something wrong.\n"; die 1; } $cross_suffix = ""; if (dirname($compiler_name) ne ".") { $cross_suffix .= dirname($compiler_name) . "/"; } if (basename($compiler_name) =~ /([^\s]*-)(.*)/) { $cross_suffix .= $1; } $compiler = ""; $compiler = LSB if ($data =~ /COMPILER_LSB/); $compiler = CLANG if ($data =~ /COMPILER_CLANG/); $compiler = PGI if ($data =~ /COMPILER_PGI/); $compiler = PATHSCALE if ($data =~ /COMPILER_PATHSCALE/); $compiler = INTEL if ($data =~ /COMPILER_INTEL/); $compiler = OPEN64 if ($data =~ /COMPILER_OPEN64/); $compiler = SUN if ($data =~ /COMPILER_SUN/); $compiler = IBM if ($data =~ /COMPILER_IBM/); $compiler = DEC if ($data =~ /COMPILER_DEC/); $compiler = GCC if ($compiler eq ""); $os = Linux if ($data =~ /OS_LINUX/); $os = FreeBSD if ($data =~ /OS_FREEBSD/); $os = NetBSD if ($data =~ /OS_NETBSD/); $os = Darwin if ($data =~ /OS_DARWIN/); $os = SunOS if ($data =~ /OS_SUNOS/); $os = AIX if ($data =~ /OS_AIX/); $os = osf if ($data =~ /OS_OSF/); $os = WINNT if ($data =~ /OS_WINNT/); $os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/); $os = Interix if ($data =~ /OS_INTERIX/); $os = Android if ($data =~ /OS_ANDROID/); $architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/); $architecture = power if ($data =~ /ARCH_POWER/); $architecture = mips if ($data =~ /ARCH_MIPS/); $architecture = mips64 if ($data =~ /ARCH_MIPS64/); $architecture = alpha if ($data =~ /ARCH_ALPHA/); $architecture = sparc if ($data =~ /ARCH_SPARC/); $architecture = ia64 if ($data =~ /ARCH_IA64/); $architecture = arm if ($data =~ /ARCH_ARM/); $architecture = arm64 if ($data =~ /ARCH_ARM64/); $architecture = zarch if ($data =~ /ARCH_ZARCH/); $defined = 0; if ($os eq "AIX") { $compiler_name .= " -maix32" if ($binary eq "32"); $compiler_name .= " -maix64" if ($binary eq "64"); $defined = 1; } if ($architecture eq "mips") { $compiler_name .= " -mabi=32"; $defined = 1; } if ($architecture eq "mips64") { $compiler_name .= " -mabi=n32" if ($binary eq "32"); $compiler_name .= " -mabi=64" if ($binary eq "64"); $defined = 1; } if (($architecture eq "arm") || ($architecture eq "arm64")) { $defined = 1; } if ($architecture eq "zarch") { $defined = 1; $binary = 64; } if ($architecture eq "alpha") { $defined = 1; $binary = 64; } if ($architecture eq "ia64") { $defined = 1; $binary = 64; } if (($architecture eq "x86") && ($os ne Darwin) && ($os ne SunOS)) { $defined = 1; $binary =32; } if ($compiler eq "PGI") { $compiler_name .= " -tp p7" if ($binary eq "32"); $compiler_name .= " -tp p7-64" if ($binary eq "64"); $openmp = "-mp"; $defined = 1; } if ($compiler eq "IBM") { $compiler_name .= 
" -q32" if ($binary eq "32"); $compiler_name .= " -q64" if ($binary eq "64"); $openmp = "-qsmp=omp"; $defined = 1; } if ($compiler eq "INTEL") { $openmp = "-openmp"; } if ($compiler eq "PATHSCALE") { $openmp = "-mp"; } if ($compiler eq "OPEN64") { $openmp = "-mp"; } if ($compiler eq "CLANG") { $openmp = "-fopenmp"; } if ($compiler eq "GCC" || $compiler eq "LSB") { $openmp = "-fopenmp"; } if ($defined == 0) { $compiler_name .= " -m32" if ($binary eq "32"); $compiler_name .= " -m64" if ($binary eq "64"); } # Do again $data = `$compiler_name -E ctest.c`; if ($?) { printf STDERR "C Compiler ($compiler_name) is something wrong.\n"; die 1; } $have_msa = 0; if (($architecture eq "mips") || ($architecture eq "mips64")) { $code = '"addvi.b $w0, $w1, 1"'; $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; print $tmpf "#include \n\n"; print $tmpf "void main(void){ __asm__ volatile($code); }\n"; $args = "$msa_flags -o $tmpf.o -x c $tmpf"; my @cmd = ("$compiler_name $args"); system(@cmd) == 0; if ($? != 0) { $have_msa = 0; } else { $have_msa = 1; } unlink("$tmpf.o"); } $architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/); $architecture = power if ($data =~ /ARCH_POWER/); $architecture = mips if ($data =~ /ARCH_MIPS/); $architecture = mips64 if ($data =~ /ARCH_MIPS64/); $architecture = alpha if ($data =~ /ARCH_ALPHA/); $architecture = sparc if ($data =~ /ARCH_SPARC/); $architecture = ia64 if ($data =~ /ARCH_IA64/); $architecture = arm if ($data =~ /ARCH_ARM/); $architecture = arm64 if ($data =~ /ARCH_ARM64/); $architecture = zarch if ($data =~ /ARCH_ZARCH/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); $data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; $data =~ /globl\s([_\.]*)(.*)/; $need_fu = $1; $cross = 0; $cross = 1 if ($os ne $hostos); if ($architecture ne $hostarch) { $cross = 1; $cross = 0 if (($hostarch eq "x86_64") && ($architecture eq "x86")); $cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips")); } $openmp = "" if $ENV{USE_OPENMP} != 1; $linker_L = ""; $linker_l = ""; $linker_a = ""; { $link = `$compiler_name -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`; $link =~ s/\-Y\sP\,/\-Y/g; @flags = split(/[\s\,\n]/, $link); # remove leading and trailing quotes from each flag. @flags = map {s/^['"]|['"]$//g; $_} @flags; foreach $flags (@flags) { if ( ($flags =~ /^\-L/) && ($flags !~ /^-LIST:/) && ($flags !~ /^-LANG:/) ) { $linker_L .= $flags . " " } if ($flags =~ /^\-Y/) { $linker_L .= "-Wl,". $flags . " " } if ($flags =~ /^\--exclude-libs/) { $linker_L .= "-Wl,". $flags . " "; $flags=""; } if ( ($flags =~ /^\-l/) && ($flags !~ /gfortranbegin/) && ($flags !~ /frtbegin/) && ($flags !~ /pathfstart/) && ($flags !~ /numa/) && ($flags !~ /crt[0-9]/) && ($flags !~ /gcc/) && ($flags !~ /user32/) && ($flags !~ /kernel32/) && ($flags !~ /advapi32/) && ($flags !~ /shell32/) ) { $linker_l .= $flags . " " } $linker_a .= $flags . 
" " if $flags =~ /\.a$/; } } open(MAKEFILE, "> $makefile") || die "Can't create $makefile"; open(CONFFILE, "> $config" ) || die "Can't create $config"; # print $data, "\n"; print MAKEFILE "OSNAME=$os\n"; print MAKEFILE "ARCH=$architecture\n"; print MAKEFILE "C_COMPILER=$compiler\n"; print MAKEFILE "BINARY32=\n" if $binformat ne bin32; print MAKEFILE "BINARY64=\n" if $binformat ne bin64; print MAKEFILE "BINARY32=1\n" if $binformat eq bin32; print MAKEFILE "BINARY64=1\n" if $binformat eq bin64; print MAKEFILE "FU=$need_fu\n" if $need_fu ne ""; print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne ""; print MAKEFILE "CROSS=1\n" if $cross != 0; print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; $compiler =~ tr/[a-z]/[A-Z]/; print CONFFILE "#define OS_$os\t1\n"; print CONFFILE "#define ARCH_$architecture\t1\n"; print CONFFILE "#define C_$compiler\t1\n"; print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32; print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; if ($os eq "LINUX") { # @pthread = split(/\s+/, `nm /lib/libpthread.so* | grep _pthread_create`); # if ($pthread[2] ne "") { # print CONFFILE "#define PTHREAD_CREATE_FUNC $pthread[2]\n"; # } else { print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n"; # } } else { print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n"; } close(MAKEFILE); close(CONFFILE); OpenBLAS-0.2.20/cblas.h000066400000000000000000001311071313527062700144040ustar00rootroot00000000000000#ifndef CBLAS_H #define CBLAS_H #include #include "common.h" #ifdef __cplusplus extern "C" { /* Assume C declarations for C++ */ #endif /* __cplusplus */ /*Set the number of threads on runtime.*/ void openblas_set_num_threads(int num_threads); void goto_set_num_threads(int num_threads); /*Get the number of threads on runtime.*/ int openblas_get_num_threads(void); /*Get the number of physical processors (cores).*/ int openblas_get_num_procs(void); /*Get the build configure on runtime.*/ char* openblas_get_config(void); /*Get the CPU corename on runtime.*/ char* openblas_get_corename(void); /* Get the parallelization type which is used by OpenBLAS */ int openblas_get_parallel(void); /* OpenBLAS is compiled for sequential use */ #define OPENBLAS_SEQUENTIAL 0 /* OpenBLAS is compiled using normal threading model */ #define OPENBLAS_THREAD 1 /* OpenBLAS is compiled using OpenMP threading model */ #define OPENBLAS_OPENMP 2 /* * Since all of GotoBlas was written without const, * we disable it at build time. 
*/ #ifndef OPENBLAS_CONST # define OPENBLAS_CONST const #endif #define CBLAS_INDEX size_t typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); float cblas_sdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); double cblas_ddot(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST double *y, OPENBLAS_CONST blasint incy); openblas_complex_float cblas_cdotu(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); openblas_complex_float cblas_cdotc(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); openblas_complex_double cblas_zdotu(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST double *y, OPENBLAS_CONST blasint incy); openblas_complex_double cblas_zdotc(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST double *y, OPENBLAS_CONST blasint incy); void cblas_cdotu_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy, openblas_complex_float *ret); void cblas_cdotc_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy, openblas_complex_float *ret); void cblas_zdotu_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST double *y, OPENBLAS_CONST blasint incy, openblas_complex_double *ret); void cblas_zdotc_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST double *y, OPENBLAS_CONST blasint incy, openblas_complex_double *ret); float cblas_sasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX); double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX); float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX); double cblas_dznrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX); CBLAS_INDEX cblas_isamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST 
double *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_zcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_sswap(OPENBLAS_CONST blasint n, float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_dswap(OPENBLAS_CONST blasint n, double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_cswap(OPENBLAS_CONST blasint n, float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_zswap(OPENBLAS_CONST blasint n, double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); void cblas_srotg(float *a, float *b, float *c, float *s); void cblas_drotg(double *a, double *b, double *c, double *s); void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P); void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P); void cblas_srotmg(float *d1, float *d2, float *b1, OPENBLAS_CONST float b2, float *P); void cblas_drotmg(double *d1, double *d2, double *b1, OPENBLAS_CONST double b2, double *P); void cblas_sscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, float *X, OPENBLAS_CONST blasint incX); void cblas_dscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, double *X, OPENBLAS_CONST blasint incX); void cblas_cscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST float *alpha, float *X, OPENBLAS_CONST blasint incX); void cblas_zscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST double *alpha, double *X, OPENBLAS_CONST blasint incX); void cblas_csscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, float *X, OPENBLAS_CONST blasint incX); void cblas_zdscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, double *X, 
OPENBLAS_CONST blasint incX); void cblas_sgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy); void cblas_dgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST double beta, double *y, OPENBLAS_CONST blasint incy); void cblas_cgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *beta, float *y, OPENBLAS_CONST blasint incy); void cblas_zgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST double *beta, double *y, OPENBLAS_CONST blasint incy); void cblas_sger (OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST blasint incY, float *A, OPENBLAS_CONST blasint lda); void cblas_dger (OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A, OPENBLAS_CONST blasint lda); void cblas_cgeru(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST blasint incY, float *A, OPENBLAS_CONST blasint lda); void cblas_cgerc(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST blasint incY, float *A, OPENBLAS_CONST blasint lda); void cblas_zgeru(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A, OPENBLAS_CONST blasint lda); void cblas_zgerc(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A, OPENBLAS_CONST blasint lda); void cblas_strsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX); void cblas_dtrsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO 
Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX); void cblas_ctrsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX); void cblas_ztrsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX); void cblas_strmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX); void cblas_dtrmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX); void cblas_ctrmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX); void cblas_ztrmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX); void cblas_ssyr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, float *A, OPENBLAS_CONST blasint lda); void cblas_dsyr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, double *A, OPENBLAS_CONST blasint lda); void cblas_cher(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, float *A, OPENBLAS_CONST blasint lda); void cblas_zher(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, double *A, OPENBLAS_CONST blasint lda); void cblas_ssyr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo,OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST blasint incY, float *A, OPENBLAS_CONST blasint lda); void cblas_dsyr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A, OPENBLAS_CONST blasint lda); void 
cblas_cher2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST blasint incY, float *A, OPENBLAS_CONST blasint lda); void cblas_zher2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A, OPENBLAS_CONST blasint lda); void cblas_sgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY); void cblas_dgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY); void cblas_cgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *beta, float *Y, OPENBLAS_CONST blasint incY); void cblas_zgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *beta, double *Y, OPENBLAS_CONST blasint incY); void cblas_ssbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY); void cblas_dsbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY); void cblas_stbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX); void cblas_dtbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, 
OPENBLAS_CONST blasint incX); void cblas_ctbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX); void cblas_ztbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX); void cblas_stbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX); void cblas_dtbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX); void cblas_ctbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX); void cblas_ztbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX); void cblas_stpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *Ap, float *X, OPENBLAS_CONST blasint incX); void cblas_dtpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *Ap, double *X, OPENBLAS_CONST blasint incX); void cblas_ctpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *Ap, float *X, OPENBLAS_CONST blasint incX); void cblas_ztpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *Ap, double *X, OPENBLAS_CONST blasint incX); void cblas_stpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *Ap, float *X, OPENBLAS_CONST blasint incX); void cblas_dtpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *Ap, double *X, OPENBLAS_CONST blasint incX); 
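/*
 * Illustrative example (added for this document; not part of the upstream header):
 * the triangular-matrix routines above all take the same leading arguments --
 * storage order, which triangle of A is stored, an optional transpose, and
 * whether the diagonal is implicitly unit -- followed by the matrix, its leading
 * dimension, and a strided vector that is updated in place.  A minimal sketch
 * using the double-precision solver cblas_dtrsv declared above (it assumes the
 * standard CBLAS enumerators CblasRowMajor, CblasLower, CblasNoTrans and
 * CblasNonUnit defined earlier in this header):
 *
 *   double A[9] = { 2.0, 0.0, 0.0,
 *                   1.0, 3.0, 0.0,
 *                   4.0, 5.0, 6.0 };   // lower triangle, row-major, lda = 3
 *   double x[3] = { 2.0, 4.0, 15.0 };  // right-hand side b, overwritten in place
 *   cblas_dtrsv(CblasRowMajor, CblasLower, CblasNoTrans, CblasNonUnit,
 *               3, A, 3, x, 1);        // x now holds the solution {1, 1, 1}
 */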
void cblas_ctpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *Ap, float *X, OPENBLAS_CONST blasint incX); void cblas_ztpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *Ap, double *X, OPENBLAS_CONST blasint incX); void cblas_ssymv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY); void cblas_dsymv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY); void cblas_chemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *beta, float *Y, OPENBLAS_CONST blasint incY); void cblas_zhemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *beta, double *Y, OPENBLAS_CONST blasint incY); void cblas_sspmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *Ap, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY); void cblas_dspmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *Ap, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY); void cblas_sspr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, float *Ap); void cblas_dspr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, double *Ap); void cblas_chpr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, float *A); void cblas_zhpr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X,OPENBLAS_CONST blasint incX, double *A); void cblas_sspr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST blasint incY, float *A); void 
cblas_dspr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A); void cblas_chpr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST blasint incY, float *Ap); void cblas_zhpr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *Ap); void cblas_chbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *beta, float *Y, OPENBLAS_CONST blasint incY); void cblas_zhbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *beta, double *Y, OPENBLAS_CONST blasint incY); void cblas_chpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *Ap, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *beta, float *Y, OPENBLAS_CONST blasint incY); void cblas_zhpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *Ap, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *beta, double *Y, OPENBLAS_CONST blasint incY); void cblas_sgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); void cblas_dgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc); void cblas_cgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta, float *C, OPENBLAS_CONST blasint ldc); void cblas_cgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, 
OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta, float *C, OPENBLAS_CONST blasint ldc); void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double *beta, double *C, OPENBLAS_CONST blasint ldc); void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double *beta, double *C, OPENBLAS_CONST blasint ldc); void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); void cblas_dsymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc); void cblas_csymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta, float *C, OPENBLAS_CONST blasint ldc); void cblas_zsymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double *beta, double *C, OPENBLAS_CONST blasint ldc); void cblas_ssyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); void cblas_dsyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc); void cblas_csyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *alpha, 
OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *beta, float *C, OPENBLAS_CONST blasint ldc); void cblas_zsyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *beta, double *C, OPENBLAS_CONST blasint ldc); void cblas_ssyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); void cblas_dsyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc); void cblas_csyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta, float *C, OPENBLAS_CONST blasint ldc); void cblas_zsyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double *beta, double *C, OPENBLAS_CONST blasint ldc); void cblas_strmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *B, OPENBLAS_CONST blasint ldb); void cblas_dtrmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *B, OPENBLAS_CONST blasint ldb); void cblas_ctrmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *B, OPENBLAS_CONST blasint ldb); void cblas_ztrmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *B, 
OPENBLAS_CONST blasint ldb); void cblas_strsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *B, OPENBLAS_CONST blasint ldb); void cblas_dtrsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *B, OPENBLAS_CONST blasint ldb); void cblas_ctrsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *B, OPENBLAS_CONST blasint ldb); void cblas_ztrsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *B, OPENBLAS_CONST blasint ldb); void cblas_chemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta, float *C, OPENBLAS_CONST blasint ldc); void cblas_zhemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double *beta, double *C, OPENBLAS_CONST blasint ldc); void cblas_cherk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); void cblas_zherk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc); void cblas_cher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); void cblas_zher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, 
OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc); void cblas_xerbla(blasint p, char *rout, char *form, ...); /*** BLAS extensions ***/ void cblas_saxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy); void cblas_daxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST double beta, double *y, OPENBLAS_CONST blasint incy); void cblas_caxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST float *beta, float *y, OPENBLAS_CONST blasint incy); void cblas_zaxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST double *beta, double *y, OPENBLAS_CONST blasint incy); void cblas_somatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint clda, float *b, OPENBLAS_CONST blasint cldb); void cblas_domatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint clda, double *b, OPENBLAS_CONST blasint cldb); void cblas_comatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, OPENBLAS_CONST float* a, OPENBLAS_CONST blasint clda, float*b, OPENBLAS_CONST blasint cldb); void cblas_zomatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, OPENBLAS_CONST double* a, OPENBLAS_CONST blasint clda, double *b, OPENBLAS_CONST blasint cldb); void cblas_simatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); void cblas_dimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, float* a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta, 
float *c, OPENBLAS_CONST blasint cldc); void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta, double *c, OPENBLAS_CONST blasint cldc); void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta, float *c, OPENBLAS_CONST blasint cldc); void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, double *c, OPENBLAS_CONST blasint cldc); #ifdef __cplusplus } #endif /* __cplusplus */ #endif OpenBLAS-0.2.20/cmake/000077500000000000000000000000001313527062700142245ustar00rootroot00000000000000OpenBLAS-0.2.20/cmake/arch.cmake000066400000000000000000000060611313527062700161460ustar00rootroot00000000000000## ## Author: Hank Anderson ## Description: Ported from portion of OpenBLAS/Makefile.system ## Sets various variables based on architecture. if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64") if (${ARCH} STREQUAL "x86") if (NOT BINARY) set(NO_BINARY_MODE 1) endif () endif () if (NOT NO_EXPRECISION) if (${F_COMPILER} MATCHES "GFORTRAN") # N.B. I'm not sure if CMake differentiates between GCC and LSB -hpa if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB") set(EXPRECISION 1) set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION -m128bit-long-double") set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double") endif () if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang") set(EXPRECISION 1) set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION") set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double") endif () endif () endif () endif () if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel") set(CCOMMON_OPT "${CCOMMON_OPT} -wd981") endif () if (USE_OPENMP) if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB") set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp") endif () if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang") message(WARNING "Clang doesn't support OpenMP yet.") set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp") endif () if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel") set(CCOMMON_OPT "${CCOMMON_OPT} -openmp") endif () if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI") set(CCOMMON_OPT "${CCOMMON_OPT} -mp") endif () if (${CMAKE_C_COMPILER_ID} STREQUAL "OPEN64") set(CCOMMON_OPT "${CCOMMON_OPT} -mp") set(CEXTRALIB "${CEXTRALIB} -lstdc++") endif () if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE") set(CCOMMON_OPT "${CCOMMON_OPT} -mp") endif () endif () if (DYNAMIC_ARCH) if (${ARCH} STREQUAL "x86") set(DYNAMIC_CORE "KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO") endif () if (${ARCH} STREQUAL "x86_64") set(DYNAMIC_CORE "PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO") if (NOT NO_AVX) set(DYNAMIC_CORE "${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER") endif () if (NOT NO_AVX2) set(DYNAMIC_CORE "${DYNAMIC_CORE} HASWELL ZEN") endif () endif () if (NOT DYNAMIC_CORE) unset(DYNAMIC_ARCH) endif () endif () if (${ARCH} STREQUAL "ia64") set(NO_BINARY_MODE 1) set(BINARY_DEFINED 1) if (${F_COMPILER} MATCHES "GFORTRAN") if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") # EXPRECISION = 1 # CCOMMON_OPT += -DEXPRECISION endif 
() endif () endif () if (${ARCH} STREQUAL "mips64") set(NO_BINARY_MODE 1) endif () if (${ARCH} STREQUAL "alpha") set(NO_BINARY_MODE 1) set(BINARY_DEFINED 1) endif () if (${ARCH} STREQUAL "arm") set(NO_BINARY_MODE 1) set(BINARY_DEFINED 1) endif () if (${ARCH} STREQUAL "arm64") set(NO_BINARY_MODE 1) set(BINARY_DEFINED 1) endif () OpenBLAS-0.2.20/cmake/c_check.cmake000066400000000000000000000043031313527062700166050ustar00rootroot00000000000000## ## Author: Hank Anderson ## Description: Ported from the OpenBLAS/c_check perl script. ## This is triggered by prebuild.cmake and runs before any of the code is built. ## Creates config.h and Makefile.conf. # CMake vars set by this file: # OSNAME (use CMAKE_SYSTEM_NAME) # ARCH # C_COMPILER (use CMAKE_C_COMPILER) # BINARY32 # BINARY64 # FU # CROSS_SUFFIX # CROSS # CEXTRALIB # Defines set by this file: # OS_ # ARCH_ # C_ # __32BIT__ # __64BIT__ # FUNDERSCORE # PTHREAD_CREATE_FUNC # N.B. c_check (and ctest.c) is not cross-platform, so instead try to use CMake variables. set(FU "") if(APPLE) set(FU "_") elseif(MSVC) set(FU "_") elseif(UNIX) set(FU "") endif() # Convert CMake vars into the format that OpenBLAS expects string(TOUPPER ${CMAKE_SYSTEM_NAME} HOST_OS) if (${HOST_OS} STREQUAL "WINDOWS") set(HOST_OS WINNT) endif () # added by hpa - check size of void ptr to detect 64-bit compile if (NOT DEFINED BINARY) set(BINARY 32) if (CMAKE_SIZEOF_VOID_P EQUAL 8) set(BINARY 64) endif () endif () if (BINARY EQUAL 64) set(BINARY64 1) else () set(BINARY32 1) endif () # CMake docs define these: # CMAKE_SYSTEM_PROCESSOR - The name of the CPU CMake is building for. # CMAKE_HOST_SYSTEM_PROCESSOR - The name of the CPU CMake is running on. # # TODO: CMAKE_SYSTEM_PROCESSOR doesn't seem to be correct - instead get it from the compiler a la c_check set(ARCH ${CMAKE_SYSTEM_PROCESSOR}) if (${ARCH} STREQUAL "AMD64") set(ARCH "x86_64") endif () # If you are using a 32-bit compiler on a 64-bit system CMAKE_SYSTEM_PROCESSOR will be wrong if (${ARCH} STREQUAL "x86_64" AND BINARY EQUAL 32) set(ARCH x86) endif () if (${ARCH} STREQUAL "X86") set(ARCH x86) endif () if (${ARCH} MATCHES "ppc") set(ARCH power) endif () set(COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) if (${COMPILER_ID} STREQUAL "GNU") set(COMPILER_ID "GCC") endif () string(TOUPPER ${ARCH} UC_ARCH) file(WRITE ${TARGET_CONF} "#define OS_${HOST_OS}\t1\n" "#define ARCH_${UC_ARCH}\t1\n" "#define C_${COMPILER_ID}\t1\n" "#define __${BINARY}BIT__\t1\n" "#define FUNDERSCORE\t${FU}\n") if (${HOST_OS} STREQUAL "WINDOWSSTORE") file(APPEND ${TARGET_CONF} "#define OS_WINNT\t1\n") endif () OpenBLAS-0.2.20/cmake/cc.cmake000066400000000000000000000047671313527062700156310ustar00rootroot00000000000000## ## Author: Hank Anderson ## Description: Ported from portion of OpenBLAS/Makefile.system ## Sets C related variables. 
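# Illustration (added for this document, not in the upstream sources): on a
# hypothetical 64-bit Linux host compiled with GCC, the file(WRITE ${TARGET_CONF} ...)
# block in c_check.cmake above would emit a config header along these lines
# (FU, and hence FUNDERSCORE, is empty on non-Apple Unix):
#
#define OS_LINUX       1
#define ARCH_X86_64    1
#define C_GCC          1
#define __64BIT__      1
#define FUNDERSCORE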
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_C_COMPILER} STREQUAL "Clang") set(CCOMMON_OPT "${CCOMMON_OPT} -Wall") set(COMMON_PROF "${COMMON_PROF} -fno-inline") set(NO_UNINITIALIZED_WARN "-Wno-uninitialized") if (QUIET_MAKE) set(CCOMMON_OPT "${CCOMMON_OPT} ${NO_UNINITIALIZED_WARN} -Wno-unused") endif () if (NO_BINARY_MODE) if (${ARCH} STREQUAL "mips64") if (BINARY64) set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=64") else () set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=n32") endif () set(BINARY_DEFINED 1) endif () if (${CORE} STREQUAL "LOONGSON3A") set(CCOMMON_OPT "${CCOMMON_OPT} -march=mips64") set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64") endif () if (${CORE} STREQUAL "LOONGSON3B") set(CCOMMON_OPT "${CCOMMON_OPT} -march=mips64") set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64") endif () if (${OSNAME} STREQUAL "AIX") set(BINARY_DEFINED 1) endif () endif () if (NOT BINARY_DEFINED) if (BINARY64) set(CCOMMON_OPT "${CCOMMON_OPT} -m64") else () set(CCOMMON_OPT "${CCOMMON_OPT} -m32") endif () endif () endif () if (${CMAKE_C_COMPILER} STREQUAL "PGI") if (BINARY64) set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7-64") else () set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7") endif () endif () if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE") if (BINARY64) set(CCOMMON_OPT "${CCOMMON_OPT} -m64") else () set(CCOMMON_OPT "${CCOMMON_OPT} -m32") endif () endif () if (${CMAKE_C_COMPILER} STREQUAL "OPEN64") if (${ARCH} STREQUAL "mips64") if (NOT BINARY64) set(CCOMMON_OPT "${CCOMMON_OPT} -n32") else () set(CCOMMON_OPT "${CCOMMON_OPT} -n64") endif () if (${CORE} STREQUAL "LOONGSON3A") set(CCOMMON_OPT "${CCOMMON_OPT} -loongson3 -static") endif () if (${CORE} STREQUAL "LOONGSON3B") set(CCOMMON_OPT "${CCOMMON_OPT} -loongson3 -static") endif () else () if (BINARY64) set(CCOMMON_OPT "${CCOMMON_OPT} -m32") else () set(CCOMMON_OPT "${CCOMMON_OPT} -m64") endif () endif () endif () if (${CMAKE_C_COMPILER} STREQUAL "SUN") set(CCOMMON_OPT "${CCOMMON_OPT} -w") if (${ARCH} STREQUAL "x86") set(CCOMMON_OPT "${CCOMMON_OPT} -m32") else () set(FCOMMON_OPT "${FCOMMON_OPT} -m64") endif () endif () OpenBLAS-0.2.20/cmake/export.cmake000066400000000000000000000024301313527062700165460ustar00rootroot00000000000000 #Only generate .def for dll on MSVC if(MSVC) set_source_files_properties(${OpenBLAS_DEF_FILE} PROPERTIES GENERATED 1) if (NOT DEFINED ARCH) set(ARCH_IN "x86_64") else() set(ARCH_IN ${ARCH}) endif() if (${CORE} STREQUAL "generic") set(ARCH_IN "GENERIC") endif () if (NOT DEFINED EXPRECISION) set(EXPRECISION_IN 0) else() set(EXPRECISION_IN ${EXPRECISION}) endif() if (NOT DEFINED NO_CBLAS) set(NO_CBLAS_IN 0) else() set(NO_CBLAS_IN ${NO_CBLAS}) endif() if (NOT DEFINED NO_LAPACK) set(NO_LAPACK_IN 0) else() set(NO_LAPACK_IN ${NO_LAPACK}) endif() if (NOT DEFINED NO_LAPACKE) set(NO_LAPACKE_IN 0) else() set(NO_LAPACKE_IN ${NO_LAPACKE}) endif() if (NOT DEFINED NEED2UNDERSCORES) set(NEED2UNDERSCORES_IN 0) else() set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES}) endif() if (NOT DEFINED ONLY_CBLAS) set(ONLY_CBLAS_IN 0) else() set(ONLY_CBLAS_IN ${ONLY_CBLAS}) endif() add_custom_command( TARGET ${OpenBLAS_LIBNAME} PRE_LINK COMMAND perl ARGS "${PROJECT_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def" COMMENT "Create openblas.def file" VERBATIM) 
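  # Descriptive note (added for this document): the PRE_LINK command above runs the
  # exports/gensymbol perl script with the configuration flags gathered above and
  # redirects its output to ${PROJECT_BINARY_DIR}/openblas.def, giving the MSVC
  # linker an explicit export list when the OpenBLAS DLL is built.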
endif()OpenBLAS-0.2.20/cmake/f_check.cmake000066400000000000000000000034721313527062700166160ustar00rootroot00000000000000## ## Author: Hank Anderson ## Copyright: (c) Stat-Ease, Inc. ## Created: 12/29/14 ## Last Modified: 12/29/14 ## Description: Ported from the OpenBLAS/f_check perl script. ## This is triggered by prebuild.cmake and runs before any of the code is built. ## Appends Fortran information to config.h and Makefile.conf. # CMake vars set by this file: # F_COMPILER # FC # BU # NOFORTRAN # NEED2UNDERSCORES # FEXTRALIB # Defines set by this file: # BUNDERSCORE # NEEDBUNDERSCORE # NEED2UNDERSCORES if (MSVC) # had to do this for MSVC, else CMake automatically assumes I have ifort... -hpa include(CMakeForceCompiler) CMAKE_FORCE_Fortran_COMPILER(gfortran GNU) endif () if (NOT NO_LAPACK) enable_language(Fortran) else() include(CMakeForceCompiler) CMAKE_FORCE_Fortran_COMPILER(gfortran GNU) endif() if (NOT ONLY_CBLAS) # N.B. f_check is not cross-platform, so instead try to use CMake variables # run f_check (appends to TARGET files) # message(STATUS "Running f_check...") # execute_process(COMMAND perl f_check ${TARGET_MAKE} ${TARGET_CONF} ${CMAKE_Fortran_COMPILER} # WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) # TODO: detect whether underscore needed, set #defines and BU appropriately - use try_compile # TODO: set FEXTRALIB flags a la f_check? set(BU "_") file(APPEND ${TARGET_CONF} "#define BUNDERSCORE _\n" "#define NEEDBUNDERSCORE 1\n" "#define NEED2UNDERSCORES 0\n") else () #When we only build CBLAS, we set NOFORTRAN=2 set(NOFORTRAN 2) set(NO_FBLAS 1) #set(F_COMPILER GFORTRAN) # CMake handles the fortran compiler set(BU "_") file(APPEND ${TARGET_CONF} "#define BUNDERSCORE _\n" "#define NEEDBUNDERSCORE 1\n") endif() get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE) string(TOUPPER ${F_COMPILER} F_COMPILER) OpenBLAS-0.2.20/cmake/fc.cmake000066400000000000000000000125111313527062700156160ustar00rootroot00000000000000## ## Author: Hank Anderson ## Description: Ported from portion of OpenBLAS/Makefile.system ## Sets Fortran related variables. 
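# Illustration (added for this document, not in the upstream sources): with the
# BUNDERSCORE/NEEDBUNDERSCORE values hard-coded by f_check.cmake above, a typical
# gfortran-based build exposes the Fortran BLAS entry points with a single
# trailing underscore.  C code calling that Fortran interface directly would
# therefore look roughly like the hypothetical sketch below (blasint is the
# OpenBLAS integer typedef; every argument is passed by reference):
#
#   extern void dgemm_(char *transa, char *transb, blasint *m, blasint *n,
#                      blasint *k, double *alpha, double *a, blasint *lda,
#                      double *b, blasint *ldb, double *beta, double *c,
#                      blasint *ldc);
#   /* ... */
#   dgemm_("N", "N", &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);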
if (${F_COMPILER} STREQUAL "FLANG") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") if (BINARY64) if (INTERFACE64) set(FCOMMON_OPT "${FCOMMON_OPT} -i8") endif () set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") else () set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") endif () if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") endif () endif () if (${F_COMPILER} STREQUAL "G77") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G77") set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") if (NOT NO_BINARY_MODE) if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -m64") else () set(FCOMMON_OPT "${FCOMMON_OPT} -m32") endif () endif () endif () if (${F_COMPILER} STREQUAL "G95") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G95") set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") if (NOT NO_BINARY_MODE) if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -m64") else () set(FCOMMON_OPT "${FCOMMON_OPT} -m32") endif () endif () endif () if (${F_COMPILER} STREQUAL "GFORTRAN") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc if (NOT NO_LAPACK) set(EXTRALIB "{EXTRALIB} -lgfortran") endif () if (NO_BINARY_MODE) if (${ARCH} STREQUAL "mips64") if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64") else () set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") endif () endif () else () if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -m64") if (INTERFACE64) set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") endif () else () set(FCOMMON_OPT "${FCOMMON_OPT} -m32") endif () endif () if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") endif () endif () if (${F_COMPILER} STREQUAL "INTEL") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_INTEL") if (INTERFACE64) set(FCOMMON_OPT "${FCOMMON_OPT} -i8") endif () if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") endif () endif () if (${F_COMPILER} STREQUAL "FUJITSU") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FUJITSU") if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") endif () endif () if (${F_COMPILER} STREQUAL "IBM") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM") # FCOMMON_OPT += -qarch=440 if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -q64") if (INTERFACE64) set(FCOMMON_OPT "${FCOMMON_OPT} -qintsize=8") endif () else () set(FCOMMON_OPT "${FCOMMON_OPT} -q32") endif () if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") endif () endif () if (${F_COMPILER} STREQUAL "PGI") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PGI") set(COMMON_PROF "${COMMON_PROF} -DPGICOMPILER") if (BINARY64) if (INTERFACE64) set(FCOMMON_OPT "${FCOMMON_OPT} -i8") endif () set(FCOMMON_OPT "${FCOMMON_OPT} -tp p7-64") else () set(FCOMMON_OPT "${FCOMMON_OPT} -tp p7") endif () if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -mp") endif () endif () if (${F_COMPILER} STREQUAL "PATHSCALE") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PATHSCALE") if (BINARY64) if (INTERFACE64) set(FCOMMON_OPT "${FCOMMON_OPT} -i8") endif () endif () if (NOT ${ARCH} STREQUAL "mips64") if (NOT BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -m32") else () set(FCOMMON_OPT "${FCOMMON_OPT} -m64") endif () else () if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64") else () set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") endif () endif () if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -mp") endif () endif () if (${F_COMPILER} STREQUAL "OPEN64") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_OPEN64") if (BINARY64) if (INTERFACE64) set(FCOMMON_OPT "${FCOMMON_OPT} -i8") endif () endif () if (${ARCH} STREQUAL "mips64") if (NOT BINARY64) set(FCOMMON_OPT 
"${FCOMMON_OPT} -n32") else () set(FCOMMON_OPT "${FCOMMON_OPT} -n64") endif () if (${CORE} STREQUAL "LOONGSON3A") set(FCOMMON_OPT "${FCOMMON_OPT} -loongson3 -static") endif () if (${CORE} STREQUAL "LOONGSON3B") set(FCOMMON_OPT "${FCOMMON_OPT} -loongson3 -static") endif () else () if (NOT BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -m32") else () set(FCOMMON_OPT "${FCOMMON_OPT} -m64") endif () endif () if (USE_OPENMP) set(FEXTRALIB "${FEXTRALIB} -lstdc++") set(FCOMMON_OPT "${FCOMMON_OPT} -mp") endif () endif () if (${F_COMPILER} STREQUAL "SUN") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_SUN") if (${ARCH} STREQUAL "x86") set(FCOMMON_OPT "${FCOMMON_OPT} -m32") else () set(FCOMMON_OPT "${FCOMMON_OPT} -m64") endif () if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -xopenmp=parallel") endif () endif () if (${F_COMPILER} STREQUAL "COMPAQ") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_COMPAQ") if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") endif () endif () # from the root Makefile - this is for lapack-netlib to compile the correct secnd file. if (${F_COMPILER} STREQUAL "GFORTRAN") set(TIMER "INT_ETIME") else () set(TIMER "NONE") endif () OpenBLAS-0.2.20/cmake/kernel.cmake000066400000000000000000000115741313527062700165160ustar00rootroot00000000000000# helper functions for the kernel CMakeLists.txt # Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file. macro(SetDefaultL1) set(SAMAXKERNEL amax.S) set(DAMAXKERNEL amax.S) set(QAMAXKERNEL amax.S) set(CAMAXKERNEL zamax.S) set(ZAMAXKERNEL zamax.S) set(XAMAXKERNEL zamax.S) set(SAMINKERNEL amin.S) set(DAMINKERNEL amin.S) set(QAMINKERNEL amin.S) set(CAMINKERNEL zamin.S) set(ZAMINKERNEL zamin.S) set(XAMINKERNEL zamin.S) set(SMAXKERNEL max.S) set(DMAXKERNEL max.S) set(QMAXKERNEL max.S) set(SMINKERNEL min.S) set(DMINKERNEL min.S) set(QMINKERNEL min.S) set(ISAMAXKERNEL iamax.S) set(IDAMAXKERNEL iamax.S) set(IQAMAXKERNEL iamax.S) set(ICAMAXKERNEL izamax.S) set(IZAMAXKERNEL izamax.S) set(IXAMAXKERNEL izamax.S) set(ISAMINKERNEL iamin.S) set(IDAMINKERNEL iamin.S) set(IQAMINKERNEL iamin.S) set(ICAMINKERNEL izamin.S) set(IZAMINKERNEL izamin.S) set(IXAMINKERNEL izamin.S) set(ISMAXKERNEL iamax.S) set(IDMAXKERNEL iamax.S) set(IQMAXKERNEL iamax.S) set(ISMINKERNEL iamin.S) set(IDMINKERNEL iamin.S) set(IQMINKERNEL iamin.S) set(SASUMKERNEL asum.S) set(DASUMKERNEL asum.S) set(CASUMKERNEL zasum.S) set(ZASUMKERNEL zasum.S) set(QASUMKERNEL asum.S) set(XASUMKERNEL zasum.S) set(SAXPYKERNEL axpy.S) set(DAXPYKERNEL axpy.S) set(CAXPYKERNEL zaxpy.S) set(ZAXPYKERNEL zaxpy.S) set(QAXPYKERNEL axpy.S) set(XAXPYKERNEL zaxpy.S) set(SCOPYKERNEL copy.S) set(DCOPYKERNEL copy.S) set(CCOPYKERNEL zcopy.S) set(ZCOPYKERNEL zcopy.S) set(QCOPYKERNEL copy.S) set(XCOPYKERNEL zcopy.S) set(SDOTKERNEL dot.S) set(DDOTKERNEL dot.S) set(CDOTKERNEL zdot.S) set(ZDOTKERNEL zdot.S) set(QDOTKERNEL dot.S) set(XDOTKERNEL zdot.S) set(SNRM2KERNEL nrm2.S) set(DNRM2KERNEL nrm2.S) set(QNRM2KERNEL nrm2.S) set(CNRM2KERNEL znrm2.S) set(ZNRM2KERNEL znrm2.S) set(XNRM2KERNEL znrm2.S) set(SROTKERNEL rot.S) set(DROTKERNEL rot.S) set(QROTKERNEL rot.S) set(CROTKERNEL zrot.S) set(ZROTKERNEL zrot.S) set(XROTKERNEL zrot.S) set(SSCALKERNEL scal.S) set(DSCALKERNEL scal.S) set(CSCALKERNEL zscal.S) set(ZSCALKERNEL zscal.S) set(QSCALKERNEL scal.S) set(XSCALKERNEL zscal.S) set(SSWAPKERNEL swap.S) set(DSWAPKERNEL swap.S) set(CSWAPKERNEL zswap.S) set(ZSWAPKERNEL zswap.S) set(QSWAPKERNEL swap.S) set(XSWAPKERNEL zswap.S) set(SGEMVNKERNEL gemv_n.S) set(SGEMVTKERNEL gemv_t.S) 
set(DGEMVNKERNEL gemv_n.S) set(DGEMVTKERNEL gemv_t.S) set(CGEMVNKERNEL zgemv_n.S) set(CGEMVTKERNEL zgemv_t.S) set(ZGEMVNKERNEL zgemv_n.S) set(ZGEMVTKERNEL zgemv_t.S) set(QGEMVNKERNEL gemv_n.S) set(QGEMVTKERNEL gemv_t.S) set(XGEMVNKERNEL zgemv_n.S) set(XGEMVTKERNEL zgemv_t.S) set(SCABS_KERNEL ../generic/cabs.c) set(DCABS_KERNEL ../generic/cabs.c) set(QCABS_KERNEL ../generic/cabs.c) set(LSAME_KERNEL ../generic/lsame.c) set(SAXPBYKERNEL ../arm/axpby.c) set(DAXPBYKERNEL ../arm/axpby.c) set(CAXPBYKERNEL ../arm/zaxpby.c) set(ZAXPBYKERNEL ../arm/zaxpby.c) endmacro () macro(SetDefaultL2) set(SGEMVNKERNEL gemv_n.S) set(SGEMVTKERNEL gemv_t.S) set(DGEMVNKERNEL gemv_n.S) set(DGEMVTKERNEL gemv_t.S) set(CGEMVNKERNEL zgemv_n.S) set(CGEMVTKERNEL zgemv_t.S) set(ZGEMVNKERNEL zgemv_n.S) set(ZGEMVTKERNEL zgemv_t.S) set(QGEMVNKERNEL gemv_n.S) set(QGEMVTKERNEL gemv_t.S) set(XGEMVNKERNEL zgemv_n.S) set(XGEMVTKERNEL zgemv_t.S) set(SGERKERNEL ../generic/ger.c) set(DGERKERNEL ../generic/ger.c) set(QGERKERNEL ../generic/ger.c) set(CGERUKERNEL ../generic/zger.c) set(CGERCKERNEL ../generic/zger.c) set(ZGERUKERNEL ../generic/zger.c) set(ZGERCKERNEL ../generic/zger.c) set(XGERUKERNEL ../generic/zger.c) set(XGERCKERNEL ../generic/zger.c) set(SSYMV_U_KERNEL ../generic/symv_k.c) set(SSYMV_L_KERNEL ../generic/symv_k.c) set(DSYMV_U_KERNEL ../generic/symv_k.c) set(DSYMV_L_KERNEL ../generic/symv_k.c) set(QSYMV_U_KERNEL ../generic/symv_k.c) set(QSYMV_L_KERNEL ../generic/symv_k.c) set(CSYMV_U_KERNEL ../generic/zsymv_k.c) set(CSYMV_L_KERNEL ../generic/zsymv_k.c) set(ZSYMV_U_KERNEL ../generic/zsymv_k.c) set(ZSYMV_L_KERNEL ../generic/zsymv_k.c) set(XSYMV_U_KERNEL ../generic/zsymv_k.c) set(XSYMV_L_KERNEL ../generic/zsymv_k.c) set(CHEMV_U_KERNEL ../generic/zhemv_k.c) set(CHEMV_L_KERNEL ../generic/zhemv_k.c) set(CHEMV_V_KERNEL ../generic/zhemv_k.c) set(CHEMV_M_KERNEL ../generic/zhemv_k.c) set(ZHEMV_U_KERNEL ../generic/zhemv_k.c) set(ZHEMV_L_KERNEL ../generic/zhemv_k.c) set(ZHEMV_V_KERNEL ../generic/zhemv_k.c) set(ZHEMV_M_KERNEL ../generic/zhemv_k.c) set(XHEMV_U_KERNEL ../generic/zhemv_k.c) set(XHEMV_L_KERNEL ../generic/zhemv_k.c) set(XHEMV_V_KERNEL ../generic/zhemv_k.c) set(XHEMV_M_KERNEL ../generic/zhemv_k.c) endmacro () macro(SetDefaultL3) set(SGEADD_KERNEL ../generic/geadd.c) set(DGEADD_KERNEL ../generic/geadd.c) set(CGEADD_KERNEL ../generic/zgeadd.c) set(ZGEADD_KERNEL ../generic/zgeadd.c) endmacro ()OpenBLAS-0.2.20/cmake/lapack.cmake000066400000000000000000000466471313527062700165020ustar00rootroot00000000000000# Sources for compiling lapack-netlib. Can't use CMakeLists.txt because lapack-netlib already has its own cmake files. 
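# Descriptive note (added for this document): the variables below mirror the source
# groupings of lapack-netlib's SRC/Makefile -- ALLAUX holds auxiliary routines shared
# by every precision, SCLAUX and DZLAUX the single/complex and double/double-complex
# auxiliary routines, SLASRC/DLASRC/CLASRC/ZLASRC the computational routines for the
# four precisions, and DSLASRC/ZCLASRC the extra single/complex files needed by the
# mixed-precision solvers (dsgesv/dsposv and zcgesv/zcposv).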
set(ALLAUX ilaenv.f ieeeck.f lsamen.f xerbla_array.f iparmq.f ilaprec.f ilatrans.f ilauplo.f iladiag.f iparam2stage.F chla_transtype.f ../INSTALL/ilaver.f ../INSTALL/slamch.f ) set(SCLAUX sbdsdc.f sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f slaed7.f slaed8.f slaed9.f slaeda.f slaev2.f slagtf.f slagts.f slamrg.f slanst.f slapy2.f slapy3.f slarnv.f slarra.f slarrb.f slarrc.f slarrd.f slarre.f slarrf.f slarrj.f slarrk.f slarrr.f slaneg.f slartg.f slaruv.f slas2.f slascl.f slasd0.f slasd1.f slasd2.f slasd3.f slasd4.f slasd5.f slasd6.f slasd7.f slasd8.f slasda.f slasdq.f slasdt.f slaset.f slasq1.f slasq2.f slasq3.f slasq4.f slasq5.f slasq6.f slasr.f slasrt.f slassq.f slasv2.f spttrf.f sstebz.f sstedc.f ssteqr.f ssterf.f slaisnan.f sisnan.f slartgp.f slartgs.f ../INSTALL/second_${TIMER}.f ) set(DZLAUX dbdsdc.f dbdsvdx.f dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f dlagts.f dlamrg.f dlanst.f dlapy2.f dlapy3.f dlarnv.f dlarra.f dlarrb.f dlarrc.f dlarrd.f dlarre.f dlarrf.f dlarrj.f dlarrk.f dlarrr.f dlaneg.f dlartg.f dlaruv.f dlas2.f dlascl.f dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f dlaset.f dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f dlasr.f dlasrt.f dlassq.f dlasv2.f dpttrf.f dstebz.f dstedc.f dsteqr.f dsterf.f dlaisnan.f disnan.f dlartgp.f dlartgs.f ../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f dgelq.f dgelqt.f dgelqt3.f dgemlq.f dgemlqt.f dgemqr.f dgeqr.f dgetsls.f dlamswlq.f dlamtsqr.f dlaswlq.f dlatsqr.f dtplqt.f dtplqt2.f dtpmlqt.f dsysv_aa.f dsytrf_aa.f dsytrs_aa.f dlasyf_aa.f dsytf2_rk.f dlasyf_rk.f dsytrf_rk.f dsytrs_3.f dsycon_3.f dsytri_3.f dsytri_3x.f dsysv_rk.f dsb2st_kernels.f dsbev_2stage.f dsbevd_2stage.f dsbevx_2stage.f dsyev_2stage.f dsyevd_2stage.f dsyevr_2stage.f dsyevx_2stage.f dsygv_2stage.f dsytrd_2stage.f dsytrd_sb2st.F dsytrd_sy2sb.f dlarfy.f ) set(SLASRC sbdsvdx.f sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f DEPRECATED/sgegs.f DEPRECATED/sgegv.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f sgels.f sgelsd.f sgelss.f DEPRECATED/sgelsx.f sgelsy.f sgeql2.f sgeqlf.f sgeqp3.f DEPRECATED/sgeqpf.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f sgesc2.f sgesdd.f sgesvd.f sgesvdx.f sgesvx.f sgetc2.f sgetri.f sgetrf2.f sggbak.f sggbal.f sgghd3.f sgges.f sgges3.f sggesx.f sggev.f sggev3.f sggevx.f sggglm.f sgghrd.f sgglse.f sggqrf.f sggrqf.f DEPRECATED/sggsvd.f sggsvd3.f DEPRECATED/sggsvp.f sggsvp3.f sgtcon.f sgtrfs.f sgtsv.f sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f shsein.f shseqr.f slabrd.f slacon.f slacn2.f slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f DEPRECATED/slahrd.f slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f slansy.f slantb.f slantp.f slantr.f slanv2.f slapll.f slapmt.f slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f slarf.f slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slargv.f slarrv.f slartv.f slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f DEPRECATED/slatzm.f sopgtr.f sopmtr.f sorg2l.f sorg2r.f 
sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f sorgrq.f sorgtr.f sorm2l.f sorm2r.f sorm22.f sormbr.f sormhr.f sorml2.f sormlq.f sormql.f sormqr.f sormr2.f sormr3.f sormrq.f sormrz.f sormtr.f spbcon.f spbequ.f spbrfs.f spbstf.f spbsv.f spbsvx.f spbtf2.f spbtrf.f spbtrs.f spocon.f spoequ.f sporfs.f sposv.f sposvx.f spstrf.f spstf2.f sppcon.f sppequ.f spprfs.f sppsv.f sppsvx.f spptrf.f spptri.f spptrs.f sptcon.f spteqr.f sptrfs.f sptsv.f sptsvx.f spttrs.f sptts2.f srscl.f ssbev.f ssbevd.f ssbevx.f ssbgst.f ssbgv.f ssbgvd.f ssbgvx.f ssbtrd.f sspcon.f sspev.f sspevd.f sspevx.f sspgst.f sspgv.f sspgvd.f sspgvx.f ssprfs.f sspsv.f sspsvx.f ssptrd.f ssptrf.f ssptri.f ssptrs.f sstegr.f sstein.f sstev.f sstevd.f sstevr.f sstevx.f ssycon.f ssyev.f ssyevd.f ssyevr.f ssyevx.f ssygs2.f ssygst.f ssygv.f ssygvd.f ssygvx.f ssyrfs.f ssysv.f ssysvx.f ssytd2.f ssytf2.f ssytrd.f ssytrf.f ssytri.f ssytri2.f ssytri2x.f ssyswapr.f ssytrs.f ssytrs2.f ssyconv.f ssytf2_rook.f ssytrf_rook.f ssytrs_rook.f ssytri_rook.f ssycon_rook.f ssysv_rook.f stbcon.f stbrfs.f stbtrs.f stgevc.f stgex2.f stgexc.f stgsen.f stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f stptrs.f strcon.f strevc.f strevc3.f strexc.f strrfs.f strsen.f strsna.f strsyl.f strtrs.f DEPRECATED/stzrqf.f stzrzf.f sstemr.f slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f stfttr.f stpttf.f stpttr.f strttf.f strttp.f sgejsv.f sgesvj.f sgsvj0.f sgsvj1.f sgeequb.f ssyequb.f spoequb.f sgbequb.f sbbcsd.f slapmr.f sorbdb.f sorbdb1.f sorbdb2.f sorbdb3.f sorbdb4.f sorbdb5.f sorbdb6.f sorcsd.f sorcsd2by1.f sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f stpqrt.f stpqrt2.f stpmqrt.f stprfb.f spotri.f sgelq.f sgelqt.f sgelqt3.f sgemlq.f sgemlqt.f sgemqr.f sgeqr.f sgetsls.f slamswlq.f slamtsqr.f slaswlq.f slatsqr.f stplqt.f stplqt2.f stpmlqt.f ssysv_aa.f ssytrf_aa.f ssytrs_aa.f slasyf_aa.f ssytf2_rk.f slasyf_rk.f ssytrf_rk.f ssytrs_3.f ssycon_3.f ssytri_3.f ssytri_3x.f ssysv_rk.f ssb2st_kernels.f ssbev_2stage.f ssbevd_2stage.f ssbevx_2stage.f ssyev_2stage.f ssyevd_2stage.f ssyevr_2stage.f ssyevx_2stage.f ssygv_2stage.f ssytrd_2stage.f ssytrd_sb2st.F ssytrd_sy2sb.f slarfy.f ) set(DSLASRC spotrs.f spotrf2.f) set(CLASRC cbdsqr.f cgbbrd.f cgbcon.f cgbequ.f cgbrfs.f cgbsv.f cgbsvx.f cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f DEPRECATED/cgegs.f DEPRECATED/cgegv.f cgehd2.f cgehrd.f cgelq2.f cgelqf.f cgels.f cgelsd.f cgelss.f DEPRECATED/cgelsx.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f DEPRECATED/cgeqpf.f cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f cgerq2.f cgerqf.f cgesc2.f cgesdd.f cgesvd.f cgesvx.f cgetc2.f cgetri.f cggbak.f cggbal.f cgges.f cggesx.f cggev.f cggevx.f cggglm.f cgghrd.f cgglse.f cggqrf.f cggrqf.f DEPRECATED/cggsvd.f DEPRECATED/cggsvp.f cgtcon.f cgtrfs.f cgtsv.f cgtsvx.f cgttrf.f cgttrs.f cgtts2.f chbev.f chbevd.f chbevx.f chbgst.f chbgv.f chbgvd.f chbgvx.f chbtrd.f checon.f cheev.f cheevd.f cheevr.f cheevx.f chegs2.f chegst.f chegv.f chegvd.f chegvx.f cherfs.f chesv.f chesvx.f chetd2.f chetf2.f chetrd.f chetrf.f chetri.f chetri2.f chetri2x.f cheswapr.f chetrs.f chetrs2.f chetf2_rook.f chetrf_rook.f chetri_rook.f chetrs_rook.f checon_rook.f chesv_rook.f chgeqz.f chpcon.f chpev.f chpevd.f chpevx.f chpgst.f chpgv.f chpgvd.f chpgvx.f chprfs.f chpsv.f chpsvx.f chptrd.f chptrf.f chptri.f chptrs.f chsein.f chseqr.f clabrd.f clacgv.f clacon.f clacn2.f clacp2.f clacpy.f clacrm.f clacrt.f cladiv.f claed0.f claed7.f claed8.f claein.f claesy.f claev2.f clags2.f 
clagtm.f clahef.f clahef_rook.f clahqr.f DEPRECATED/clahrd.f clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f clanhb.f clanhe.f clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f clarf.f clarfb.f clarfg.f clarft.f clarfgp.f clarfx.f clargv.f clarnv.f clarrv.f clartg.f clartv.f clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f clasyf.f clasyf_rook.f clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrz.f DEPRECATED/clatzm.f cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f cpbsvx.f cpbtf2.f cpbtrf.f cpbtrs.f cpocon.f cpoequ.f cporfs.f cposv.f cposvx.f cpstrf.f cpstf2.f cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f cptcon.f cpteqr.f cptrfs.f cptsv.f cptsvx.f cpttrf.f cpttrs.f cptts2.f crot.f cspcon.f csprfs.f cspsv.f cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f cstegr.f cstein.f csteqr.f csycon.f csyrfs.f csysv.f csysvx.f csytf2.f csytrf.f csytri.f csytri2.f csytri2x.f csyswapr.f csytrs.f csytrs2.f csyconv.f csytf2_rook.f csytrf_rook.f csytrs_rook.f csytri_rook.f csycon_rook.f csysv_rook.f ctbcon.f ctbrfs.f ctbtrs.f ctgevc.f ctgex2.f ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f ctprfs.f ctptri.f ctptrs.f ctrcon.f ctrevc.f ctrevc3.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f ctrsyl.f ctrtrs.f DEPRECATED/ctzrqf.f ctzrzf.f cung2l.f cung2r.f cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f cunmlq.f cunmql.f cunmqr.f cunmr2.f cunmr3.f cunmrq.f cunmrz.f cunmtr.f cupgtr.f cupmtr.f icmax1.f scsum1.f cstemr.f chfrk.f ctfttp.f clanhf.f cpftrf.f cpftri.f cpftrs.f ctfsm.f ctftri.f ctfttr.f ctpttf.f ctpttr.f ctrttf.f ctrttp.f cgeequb.f cgbequb.f csyequb.f cpoequb.f cheequb.f cbbcsd.f clapmr.f cunbdb.f cunbdb1.f cunbdb2.f cunbdb3.f cunbdb4.f cunbdb5.f cunbdb6.f cuncsd.f cuncsd2by1.f cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f cpotri.f cgelq.f cgelqt.f cgelqt3.f cgemlq.f cgemlqt.f cgemqr.f cgeqr.f cgetsls.f clamswlq.f clamtsqr.f claswlq.f clatsqr.f ctplqt.f ctplqt2.f ctpmlqt.f chesv_aa.f chetrf_aa.f chetrs_aa.f clahef_aa.f csytf2_rk.f clasyf_rk.f csytrf_rk.f csytrs_3.f csycon_3.f csytri_3.f csytri_3x.f csysv_rk.f chetf2_rk.f clahef_rk.f chetrf_rk.f chetrs_3.f checon_3.f chetri_3.f chetri_3x.f chesv_rk.f chb2st_kernels.f chbev_2stage.f chbevd_2stage.f chbevx_2stage.f cheev_2stage.f cheevd_2stage.f cheevr_2stage.f cheevx_2stage.f chegv_2stage.f chetrd_2stage.f chetrd_hb2st.F chetrd_he2hb.f clarfy.f ) set(ZCLASRC cpotrs.f) set(DLASRC dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f DEPRECATED/dgegs.f DEPRECATED/dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f dgels.f dgelsd.f dgelss.f DEPRECATED/dgelsx.f dgelsy.f dgeql2.f dgeqlf.f dgeqp3.f DEPRECATED/dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvdx.f dgesvx.f dgetc2.f dgetri.f dgetrf2.f dggbak.f dggbal.f dgges.f dgges3.f dggesx.f dggev.f dggev3.f dggevx.f dggglm.f dgghd3.f dgghrd.f dgglse.f dggqrf.f dggrqf.f dggsvd3.f dggsvp3.f DEPRECATED/dggsvd.f DEPRECATED/dggsvp.f dgtcon.f dgtrfs.f dgtsv.f dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f dhsein.f dhseqr.f dlabrd.f dlacon.f 
dlacn2.f dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f DEPRECATED/dlahrd.f dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f dlapll.f dlapmt.f dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlargv.f dlarrv.f dlartv.f dlarz.f dlarzb.f dlarzt.f dlasy2.f dlasyf.f dlasyf_rook.f dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f DEPRECATED/dlatzm.f dopgtr.f dopmtr.f dorg2l.f dorg2r.f dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f dorgrq.f dorgtr.f dorm2l.f dorm2r.f dorm22.f dormbr.f dormhr.f dorml2.f dormlq.f dormql.f dormqr.f dormr2.f dormr3.f dormrq.f dormrz.f dormtr.f dpbcon.f dpbequ.f dpbrfs.f dpbstf.f dpbsv.f dpbsvx.f dpbtf2.f dpbtrf.f dpbtrs.f dpocon.f dpoequ.f dporfs.f dposv.f dposvx.f dpotrf2.f dpotrs.f dpstrf.f dpstf2.f dppcon.f dppequ.f dpprfs.f dppsv.f dppsvx.f dpptrf.f dpptri.f dpptrs.f dptcon.f dpteqr.f dptrfs.f dptsv.f dptsvx.f dpttrs.f dptts2.f drscl.f dsbev.f dsbevd.f dsbevx.f dsbgst.f dsbgv.f dsbgvd.f dsbgvx.f dsbtrd.f dspcon.f dspev.f dspevd.f dspevx.f dspgst.f dspgv.f dspgvd.f dspgvx.f dsprfs.f dspsv.f dspsvx.f dsptrd.f dsptrf.f dsptri.f dsptrs.f dstegr.f dstein.f dstev.f dstevd.f dstevr.f dstevx.f dsycon.f dsyev.f dsyevd.f dsyevr.f dsyevx.f dsygs2.f dsygst.f dsygv.f dsygvd.f dsygvx.f dsyrfs.f dsysv.f dsysvx.f dsytd2.f dsytf2.f dsytrd.f dsytrf.f dsytri.f dsytri2.f dsytri2x.f dsyswapr.f dsytrs.f dsytrs2.f dsyconv.f dsytf2_rook.f dsytrf_rook.f dsytrs_rook.f dsytri_rook.f dsycon_rook.f dsysv_rook.f dtbcon.f dtbrfs.f dtbtrs.f dtgevc.f dtgex2.f dtgexc.f dtgsen.f dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f dtptrs.f dtrcon.f dtrevc.f dtrevc3.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f dtrtrs.f DEPRECATED/dtzrqf.f dtzrzf.f dstemr.f dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f dtfttr.f dtpttf.f dtpttr.f dtrttf.f dtrttp.f dgejsv.f dgesvj.f dgsvj0.f dgsvj1.f dgeequb.f dsyequb.f dpoequb.f dgbequb.f dbbcsd.f dlapmr.f dorbdb.f dorbdb1.f dorbdb2.f dorbdb3.f dorbdb4.f dorbdb5.f dorbdb6.f dorcsd.f dorcsd2by1.f dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f dpotri.f dgelq.f dgelqt.f dgelqt3.f dgemlq.f dgemlqt.f dgemqr.f dgeqr.f dgetsls.f dlamswlq.f dlamtsqr.f dlaswlq.f dlatsqr.f dtplqt.f dtplqt2.f dtpmlqt.f dsysv_aa.f dsytrf_aa.f dsytrs_aa.f dlasyf_aa.f dsytf2_rk.f dlasyf_rk.f dsytrf_rk.f dsytrs_3.f dsycon_3.f dsytri_3.f dsytri_3x.f dsysv_rk.f dsb2st_kernels.f dsbev_2stage.f dsbevd_2stage.f dsbevx_2stage.f dsyev_2stage.f dsyevd_2stage.f dsyevr_2stage.f dsyevx_2stage.f dsygv_2stage.f dsytrd_2stage.f dsytrd_sb2st.F dsytrd_sy2sb.f dlarfy.f ) set(ZLASRC zbdsqr.f zgbbrd.f zgbcon.f zgbequ.f zgbrfs.f zgbsv.f zgbsvx.f zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f DEPRECATED/zgegs.f DEPRECATED/zgegv.f zgehd2.f zgehrd.f zgejsv.f zgelq2.f zgelqf.f zgels.f zgelsd.f zgelss.f DEPRECATED/zgelsx.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f DEPRECATED/zgeqpf.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f zgesc2.f zgesdd.f zgesvd.f zgesvdx.f zgesvj.f zgesvx.f zgetc2.f zgetri.f zgetrf2.f zggbak.f zggbal.f zgges.f zgges3.f zggesx.f zggev.f zggev3.f zggevx.f zggglm.f zgghd3.f zgghrd.f zgglse.f zggqrf.f zggrqf.f 
DEPRECATED/zggsvd.f zggsvd3.f DEPRECATED/zggsvp.f zggsvp3.f zgsvj0.f zgsvj1.f zgtcon.f zgtrfs.f zgtsv.f zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f zhbevd.f zhbevx.f zhbgst.f zhbgv.f zhbgvd.f zhbgvx.f zhbtrd.f zhecon.f zheev.f zheevd.f zheevr.f zheevx.f zhegs2.f zhegst.f zhegv.f zhegvd.f zhegvx.f zherfs.f zhesv.f zhesvx.f zhetd2.f zhetf2.f zhetrd.f zhetrf.f zhetri.f zhetri2.f zhetri2x.f zheswapr.f zhetrs.f zhetrs2.f zhetf2_rook.f zhetrf_rook.f zhetri_rook.f zhetrs_rook.f zhecon_rook.f zhesv_rook.f zhgeqz.f zhpcon.f zhpev.f zhpevd.f zhpevx.f zhpgst.f zhpgv.f zhpgvd.f zhpgvx.f zhprfs.f zhpsv.f zhpsvx.f zhptrd.f zhptrf.f zhptri.f zhptrs.f zhsein.f zhseqr.f zlabrd.f zlacgv.f zlacon.f zlacn2.f zlacp2.f zlacpy.f zlacrm.f zlacrt.f zladiv.f zlaed0.f zlaed7.f zlaed8.f zlaein.f zlaesy.f zlaev2.f zlags2.f zlagtm.f zlahef.f zlahef_rook.f zlahqr.f DEPRECATED/zlahrd.f zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f zlangt.f zlanhb.f zlanhe.f zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f zlantp.f zlantr.f zlapll.f zlapmt.f zlaqgb.f zlaqge.f zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f zlarcm.f zlarf.f zlarfb.f zlarfg.f zlarft.f zlarfgp.f zlarfx.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f zlassq.f zlasyf.f zlasyf_rook.f zlasyf_aa.f zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f DEPRECATED/zlatzm.f zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f zposv.f zposvx.f zpotrf2.f zpotrs.f zpstrf.f zpstf2.f zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f zrot.f zspcon.f zsprfs.f zspsv.f zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f zstegr.f zstein.f zsteqr.f zsycon.f zsysv_aa.f zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f zsytri2.f zsytri2x.f zsyswapr.f zsytrs.f zsytrs_aa.f zsytrs2.f zsyconv.f zsytf2_rook.f zsytrf_rook.f zsytrs_rook.f zsytri_rook.f zsycon_rook.f zsysv_rook.f ztbcon.f ztbrfs.f ztbtrs.f ztgevc.f ztgex2.f ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f ztprfs.f ztptri.f ztptrs.f ztrcon.f ztrevc.f ztrevc3.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f ztrsyl.f ztrtrs.f DEPRECATED/ztzrqf.f ztzrzf.f zung2l.f zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunm22.f zunml2.f zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f zunmtr.f zupgtr.f zupmtr.f izmax1.f dzsum1.f zstemr.f zcgesv.f zcposv.f zlag2c.f clag2z.f zlat2c.f zhfrk.f ztfttp.f zlanhf.f zpftrf.f zpftri.f zpftrs.f ztfsm.f ztftri.f ztfttr.f ztpttf.f ztpttr.f ztrttf.f ztrttp.f zgeequb.f zgbequb.f zsyequb.f zpoequb.f zheequb.f zbbcsd.f zlapmr.f zunbdb.f zunbdb1.f zunbdb2.f zunbdb3.f zunbdb4.f zunbdb5.f zunbdb6.f zuncsd.f zuncsd2by1.f zgeqrt.f zgeqrt2.f zgeqrt3.f zgemqrt.f ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f zpotri.f zgelq.f zgelqt.f zgelqt3.f zgemlq.f zgemlqt.f zgemqr.f zgeqr.f zgetsls.f zlamswlq.f zlamtsqr.f zlaswlq.f zlatsqr.f ztplqt.f ztplqt2.f ztpmlqt.f zhesv_aa.f zhetrf_aa.f zhetrs_aa.f zlahef_aa.f zsytf2_rk.f zlasyf_rk.f zsytrf_aa.f zsytrf_rk.f zsytrs_3.f zsycon_3.f zsytri_3.f zsytri_3x.f zsysv_rk.f zhetf2_rk.f zlahef_rk.f zhetrf_rk.f zhetrs_3.f zhecon_3.f zhetri_3.f zhetri_3x.f zhesv_rk.f zhb2st_kernels.f zhbev_2stage.f zhbevd_2stage.f zhbevx_2stage.f zheev_2stage.f 
zheevd_2stage.f zheevr_2stage.f zheevx_2stage.f zhegv_2stage.f zhetrd_2stage.f zhetrd_hb2st.F zhetrd_he2hb.f zlarfy.f ) set(LA_REL_SRC ${ALLAUX}) if (BUILD_SINGLE) list(APPEND LA_REL_SRC ${SLASRC} ${DSLASRC} ${SCLAUX}) endif () if (BUILD_DOUBLE) list(APPEND LA_REL_SRC ${DLASRC} ${DSLASRC} ${DZLAUX}) endif () if (BUILD_COMPLEX) list(APPEND LA_REL_SRC ${CLASRC} ${ZCLASRC} ${SCLAUX}) endif () if (BUILD_COMPLEX16) list(APPEND LA_REL_SRC ${ZLASRC} ${ZCLASRC} ${DZLAUX}) endif () # add lapack-netlib folder to the sources set(LA_SOURCES "") foreach (LA_FILE ${LA_REL_SRC}) list(APPEND LA_SOURCES "${NETLIB_LAPACK_DIR}/SRC/${LA_FILE}") endforeach () set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") OpenBLAS-0.2.20/cmake/lapacke.cmake000066400000000000000000001561521313527062700166400ustar00rootroot00000000000000 set(C_SRC lapacke_cbbcsd.c lapacke_cbbcsd_work.c lapacke_cbdsqr.c lapacke_cbdsqr_work.c lapacke_cgbbrd.c lapacke_cgbbrd_work.c lapacke_cgbcon.c lapacke_cgbcon_work.c lapacke_cgbequ.c lapacke_cgbequ_work.c lapacke_cgbequb.c lapacke_cgbequb_work.c lapacke_cgbrfs.c lapacke_cgbrfs_work.c lapacke_cgbsv.c lapacke_cgbsv_work.c lapacke_cgbsvx.c lapacke_cgbsvx_work.c lapacke_cgbtrf.c lapacke_cgbtrf_work.c lapacke_cgbtrs.c lapacke_cgbtrs_work.c lapacke_cgebak.c lapacke_cgebak_work.c lapacke_cgebal.c lapacke_cgebal_work.c lapacke_cgebrd.c lapacke_cgebrd_work.c lapacke_cgecon.c lapacke_cgecon_work.c lapacke_cgeequ.c lapacke_cgeequ_work.c lapacke_cgeequb.c lapacke_cgeequb_work.c lapacke_cgees.c lapacke_cgees_work.c lapacke_cgeesx.c lapacke_cgeesx_work.c lapacke_cgeev.c lapacke_cgeev_work.c lapacke_cgeevx.c lapacke_cgeevx_work.c lapacke_cgehrd.c lapacke_cgehrd_work.c lapacke_cgejsv.c lapacke_cgejsv_work.c lapacke_cgelq2.c lapacke_cgelq2_work.c lapacke_cgelqf.c lapacke_cgelqf_work.c lapacke_cgels.c lapacke_cgels_work.c lapacke_cgelsd.c lapacke_cgelsd_work.c lapacke_cgelss.c lapacke_cgelss_work.c lapacke_cgelsy.c lapacke_cgelsy_work.c lapacke_cgemqr.c lapacke_cgemqr_work.c lapacke_cgemqrt.c lapacke_cgemqrt_work.c lapacke_cgeqlf.c lapacke_cgeqlf_work.c lapacke_cgeqp3.c lapacke_cgeqp3_work.c lapacke_cgeqr2.c lapacke_cgeqr2_work.c lapacke_cgeqrf.c lapacke_cgeqrf_work.c lapacke_cgeqrfp.c lapacke_cgeqrfp_work.c lapacke_cgeqrt.c lapacke_cgeqrt2.c lapacke_cgeqrt2_work.c lapacke_cgeqrt3.c lapacke_cgeqrt3_work.c lapacke_cgeqrt_work.c lapacke_cgerfs.c lapacke_cgerfs_work.c lapacke_cgerqf.c lapacke_cgerqf_work.c lapacke_cgesdd.c lapacke_cgesdd_work.c lapacke_cgesv.c lapacke_cgesv_work.c lapacke_cgesvd.c lapacke_cgesvd_work.c lapacke_cgesvdx.c lapacke_cgesvdx_work.c lapacke_cgesvj.c lapacke_cgesvj_work.c lapacke_cgesvx.c lapacke_cgesvx_work.c lapacke_cgetf2.c lapacke_cgetf2_work.c lapacke_cgetrf.c lapacke_cgetrf_work.c lapacke_cgetrf2.c lapacke_cgetrf2_work.c lapacke_cgetri.c lapacke_cgetri_work.c lapacke_cgetrs.c lapacke_cgetrs_work.c lapacke_cgetsls.c lapacke_cgetsls_work.c lapacke_cggbak.c lapacke_cggbak_work.c lapacke_cggbal.c lapacke_cggbal_work.c lapacke_cgges.c lapacke_cgges_work.c lapacke_cgges3.c lapacke_cgges3_work.c lapacke_cggesx.c lapacke_cggesx_work.c lapacke_cggev.c lapacke_cggev_work.c lapacke_cggev3.c lapacke_cggev3_work.c lapacke_cggevx.c lapacke_cggevx_work.c lapacke_cggglm.c lapacke_cggglm_work.c lapacke_cgghrd.c lapacke_cgghrd_work.c lapacke_cgghd3.c lapacke_cgghd3_work.c lapacke_cgglse.c lapacke_cgglse_work.c lapacke_cggqrf.c lapacke_cggqrf_work.c lapacke_cggrqf.c lapacke_cggrqf_work.c lapacke_cggsvd3.c lapacke_cggsvd3_work.c lapacke_cggsvp3.c 
lapacke_cggsvp3_work.c lapacke_cgtcon.c lapacke_cgtcon_work.c lapacke_cgtrfs.c lapacke_cgtrfs_work.c lapacke_cgtsv.c lapacke_cgtsv_work.c lapacke_cgtsvx.c lapacke_cgtsvx_work.c lapacke_cgttrf.c lapacke_cgttrf_work.c lapacke_cgttrs.c lapacke_cgttrs_work.c lapacke_chbev.c lapacke_chbev_work.c lapacke_chbevd.c lapacke_chbevd_work.c lapacke_chbevx.c lapacke_chbevx_work.c lapacke_chbev_2stage.c lapacke_chbev_2stage_work.c lapacke_chbevd_2stage.c lapacke_chbevd_2stage_work.c lapacke_chbevx_2stage.c lapacke_chbevx_2stage_work.c lapacke_chbgst.c lapacke_chbgst_work.c lapacke_chbgv.c lapacke_chbgv_work.c lapacke_chbgvd.c lapacke_chbgvd_work.c lapacke_chbgvx.c lapacke_chbgvx_work.c lapacke_chbtrd.c lapacke_chbtrd_work.c lapacke_checon.c lapacke_checon_work.c lapacke_checon_3.c lapacke_checon_3_work.c lapacke_cheequb.c lapacke_cheequb_work.c lapacke_cheev.c lapacke_cheev_work.c lapacke_cheevd.c lapacke_cheevd_work.c lapacke_cheevr.c lapacke_cheevr_work.c lapacke_cheevx.c lapacke_cheevx_work.c lapacke_cheev_2stage.c lapacke_cheev_2stage_work.c lapacke_cheevd_2stage.c lapacke_cheevd_2stage_work.c lapacke_cheevr_2stage.c lapacke_cheevr_2stage_work.c lapacke_cheevx_2stage.c lapacke_cheevx_2stage_work.c lapacke_chegst.c lapacke_chegst_work.c lapacke_chegv.c lapacke_chegv_work.c lapacke_chegv_2stage.c lapacke_chegv_2stage_work.c lapacke_chegvd.c lapacke_chegvd_work.c lapacke_chegvx.c lapacke_chegvx_work.c lapacke_cherfs.c lapacke_cherfs_work.c lapacke_chesv.c lapacke_chesv_work.c lapacke_chesv_aa.c lapacke_chesv_aa_work.c lapacke_chesv_rk.c lapacke_chesv_rk_work.c lapacke_chesvx.c lapacke_chesvx_work.c lapacke_cheswapr.c lapacke_cheswapr_work.c lapacke_chetrd.c lapacke_chetrd_work.c lapacke_chetrf.c lapacke_chetrf_rook.c lapacke_chetrf_work.c lapacke_chetrf_rook_work.c lapacke_chetrf_aa.c lapacke_chetrf_aa_work.c lapacke_chetrf_rk.c lapacke_chetrf_rk_work.c lapacke_chetri.c lapacke_chetri2.c lapacke_chetri2_work.c lapacke_chetri_3.c lapacke_chetri_3_work.c lapacke_chetri2x.c lapacke_chetri2x_work.c lapacke_chetri_work.c lapacke_chetrs.c lapacke_chetrs_rook.c lapacke_chetrs2.c lapacke_chetrs2_work.c lapacke_chetrs_work.c lapacke_chetrs_rook_work.c lapacke_chetrs_aa.c lapacke_chetrs_aa_work.c lapacke_chetrs_3.c lapacke_chetrs_3_work.c lapacke_chfrk.c lapacke_chfrk_work.c lapacke_chgeqz.c lapacke_chgeqz_work.c lapacke_chpcon.c lapacke_chpcon_work.c lapacke_chpev.c lapacke_chpev_work.c lapacke_chpevd.c lapacke_chpevd_work.c lapacke_chpevx.c lapacke_chpevx_work.c lapacke_chpgst.c lapacke_chpgst_work.c lapacke_chpgv.c lapacke_chpgv_work.c lapacke_chpgvd.c lapacke_chpgvd_work.c lapacke_chpgvx.c lapacke_chpgvx_work.c lapacke_chprfs.c lapacke_chprfs_work.c lapacke_chpsv.c lapacke_chpsv_work.c lapacke_chpsvx.c lapacke_chpsvx_work.c lapacke_chptrd.c lapacke_chptrd_work.c lapacke_chptrf.c lapacke_chptrf_work.c lapacke_chptri.c lapacke_chptri_work.c lapacke_chptrs.c lapacke_chptrs_work.c lapacke_chsein.c lapacke_chsein_work.c lapacke_chseqr.c lapacke_chseqr_work.c lapacke_clacgv.c lapacke_clacgv_work.c lapacke_clacn2.c lapacke_clacn2_work.c lapacke_clacp2.c lapacke_clacp2_work.c lapacke_clacpy.c lapacke_clacpy_work.c lapacke_clag2z.c lapacke_clag2z_work.c lapacke_clange.c lapacke_clange_work.c lapacke_clanhe.c lapacke_clanhe_work.c lapacke_clansy.c lapacke_clansy_work.c lapacke_clantr.c lapacke_clantr_work.c lapacke_clapmr.c lapacke_clapmr_work.c lapacke_clapmt.c lapacke_clapmt_work.c lapacke_clarfb.c lapacke_clarfb_work.c lapacke_clarfg.c lapacke_clarfg_work.c lapacke_clarft.c lapacke_clarft_work.c lapacke_clarfx.c 
lapacke_clarfx_work.c lapacke_clarnv.c lapacke_clarnv_work.c lapacke_clascl.c lapacke_clascl_work.c lapacke_claset.c lapacke_claset_work.c lapacke_claswp.c lapacke_claswp_work.c lapacke_clauum.c lapacke_clauum_work.c lapacke_cpbcon.c lapacke_cpbcon_work.c lapacke_cpbequ.c lapacke_cpbequ_work.c lapacke_cpbrfs.c lapacke_cpbrfs_work.c lapacke_cpbstf.c lapacke_cpbstf_work.c lapacke_cpbsv.c lapacke_cpbsv_work.c lapacke_cpbsvx.c lapacke_cpbsvx_work.c lapacke_cpbtrf.c lapacke_cpbtrf_work.c lapacke_cpbtrs.c lapacke_cpbtrs_work.c lapacke_cpftrf.c lapacke_cpftrf_work.c lapacke_cpftri.c lapacke_cpftri_work.c lapacke_cpftrs.c lapacke_cpftrs_work.c lapacke_cpocon.c lapacke_cpocon_work.c lapacke_cpoequ.c lapacke_cpoequ_work.c lapacke_cpoequb.c lapacke_cpoequb_work.c lapacke_cporfs.c lapacke_cporfs_work.c lapacke_cposv.c lapacke_cposv_work.c lapacke_cposvx.c lapacke_cposvx_work.c lapacke_cpotrf.c lapacke_cpotrf_work.c lapacke_cpotrf2.c lapacke_cpotrf2_work.c lapacke_cpotri.c lapacke_cpotri_work.c lapacke_cpotrs.c lapacke_cpotrs_work.c lapacke_cppcon.c lapacke_cppcon_work.c lapacke_cppequ.c lapacke_cppequ_work.c lapacke_cpprfs.c lapacke_cpprfs_work.c lapacke_cppsv.c lapacke_cppsv_work.c lapacke_cppsvx.c lapacke_cppsvx_work.c lapacke_cpptrf.c lapacke_cpptrf_work.c lapacke_cpptri.c lapacke_cpptri_work.c lapacke_cpptrs.c lapacke_cpptrs_work.c lapacke_cpstrf.c lapacke_cpstrf_work.c lapacke_cptcon.c lapacke_cptcon_work.c lapacke_cpteqr.c lapacke_cpteqr_work.c lapacke_cptrfs.c lapacke_cptrfs_work.c lapacke_cptsv.c lapacke_cptsv_work.c lapacke_cptsvx.c lapacke_cptsvx_work.c lapacke_cpttrf.c lapacke_cpttrf_work.c lapacke_cpttrs.c lapacke_cpttrs_work.c lapacke_cspcon.c lapacke_cspcon_work.c lapacke_csprfs.c lapacke_csprfs_work.c lapacke_cspsv.c lapacke_cspsv_work.c lapacke_cspsvx.c lapacke_cspsvx_work.c lapacke_csptrf.c lapacke_csptrf_work.c lapacke_csptri.c lapacke_csptri_work.c lapacke_csptrs.c lapacke_csptrs_work.c lapacke_cstedc.c lapacke_cstedc_work.c lapacke_cstegr.c lapacke_cstegr_work.c lapacke_cstein.c lapacke_cstein_work.c lapacke_cstemr.c lapacke_cstemr_work.c lapacke_csteqr.c lapacke_csteqr_work.c lapacke_csycon.c lapacke_csycon_work.c lapacke_csycon_3.c lapacke_csycon_3_work.c lapacke_csyconv.c lapacke_csyconv_work.c lapacke_csyequb.c lapacke_csyequb_work.c lapacke_csyrfs.c lapacke_csyrfs_work.c lapacke_csysv.c lapacke_csysv_rook.c lapacke_csysv_rook_work.c lapacke_csysv_work.c lapacke_csysv_aa.c lapacke_csysv_aa_work.c lapacke_csysv_rk.c lapacke_csysv_rk_work.c lapacke_csysvx.c lapacke_csysvx_work.c lapacke_csyswapr.c lapacke_csyswapr_work.c lapacke_csytrf.c lapacke_csytrf_work.c lapacke_csytrf_rook.c lapacke_csytrf_rook_work.c lapacke_csytrf_aa.c lapacke_csytrf_aa_work.c lapacke_csytrf_rk.c lapacke_csytrf_rk_work.c lapacke_csytri.c lapacke_csytri2.c lapacke_csytri2_work.c lapacke_csytri_3.c lapacke_csytri_3_work.c lapacke_csytri2x.c lapacke_csytri2x_work.c lapacke_csytri_work.c lapacke_csytrs.c lapacke_csytrs_rook.c lapacke_csytrs2.c lapacke_csytrs2_work.c lapacke_csytrs_work.c lapacke_csytrs_rook_work.c lapacke_csytrs_aa.c lapacke_csytrs_aa_work.c lapacke_csytrs_3.c lapacke_csytrs_3_work.c lapacke_ctbcon.c lapacke_ctbcon_work.c lapacke_ctbrfs.c lapacke_ctbrfs_work.c lapacke_ctbtrs.c lapacke_ctbtrs_work.c lapacke_ctfsm.c lapacke_ctfsm_work.c lapacke_ctftri.c lapacke_ctftri_work.c lapacke_ctfttp.c lapacke_ctfttp_work.c lapacke_ctfttr.c lapacke_ctfttr_work.c lapacke_ctgevc.c lapacke_ctgevc_work.c lapacke_ctgexc.c lapacke_ctgexc_work.c lapacke_ctgsen.c lapacke_ctgsen_work.c lapacke_ctgsja.c 
lapacke_ctgsja_work.c lapacke_ctgsna.c lapacke_ctgsna_work.c lapacke_ctgsyl.c lapacke_ctgsyl_work.c lapacke_ctpcon.c lapacke_ctpcon_work.c lapacke_ctpmqrt.c lapacke_ctpmqrt_work.c lapacke_ctpqrt.c lapacke_ctpqrt2.c lapacke_ctpqrt2_work.c lapacke_ctpqrt_work.c lapacke_ctprfb.c lapacke_ctprfb_work.c lapacke_ctprfs.c lapacke_ctprfs_work.c lapacke_ctptri.c lapacke_ctptri_work.c lapacke_ctptrs.c lapacke_ctptrs_work.c lapacke_ctpttf.c lapacke_ctpttf_work.c lapacke_ctpttr.c lapacke_ctpttr_work.c lapacke_ctrcon.c lapacke_ctrcon_work.c lapacke_ctrevc.c lapacke_ctrevc_work.c lapacke_ctrexc.c lapacke_ctrexc_work.c lapacke_ctrrfs.c lapacke_ctrrfs_work.c lapacke_ctrsen.c lapacke_ctrsen_work.c lapacke_ctrsna.c lapacke_ctrsna_work.c lapacke_ctrsyl.c lapacke_ctrsyl_work.c lapacke_ctrtri.c lapacke_ctrtri_work.c lapacke_ctrtrs.c lapacke_ctrtrs_work.c lapacke_ctrttf.c lapacke_ctrttf_work.c lapacke_ctrttp.c lapacke_ctrttp_work.c lapacke_ctzrzf.c lapacke_ctzrzf_work.c lapacke_cunbdb.c lapacke_cunbdb_work.c lapacke_cuncsd.c lapacke_cuncsd_work.c lapacke_cuncsd2by1.c lapacke_cuncsd2by1_work.c lapacke_cungbr.c lapacke_cungbr_work.c lapacke_cunghr.c lapacke_cunghr_work.c lapacke_cunglq.c lapacke_cunglq_work.c lapacke_cungql.c lapacke_cungql_work.c lapacke_cungqr.c lapacke_cungqr_work.c lapacke_cungrq.c lapacke_cungrq_work.c lapacke_cungtr.c lapacke_cungtr_work.c lapacke_cunmbr.c lapacke_cunmbr_work.c lapacke_cunmhr.c lapacke_cunmhr_work.c lapacke_cunmlq.c lapacke_cunmlq_work.c lapacke_cunmql.c lapacke_cunmql_work.c lapacke_cunmqr.c lapacke_cunmqr_work.c lapacke_cunmrq.c lapacke_cunmrq_work.c lapacke_cunmrz.c lapacke_cunmrz_work.c lapacke_cunmtr.c lapacke_cunmtr_work.c lapacke_cupgtr.c lapacke_cupgtr_work.c lapacke_cupmtr.c lapacke_cupmtr_work.c ) set(DSRC lapacke_dbbcsd.c lapacke_dbbcsd_work.c lapacke_dbdsdc.c lapacke_dbdsdc_work.c lapacke_dbdsvdx.c lapacke_dbdsvdx_work.c lapacke_dbdsqr.c lapacke_dbdsqr_work.c lapacke_ddisna.c lapacke_ddisna_work.c lapacke_dgbbrd.c lapacke_dgbbrd_work.c lapacke_dgbcon.c lapacke_dgbcon_work.c lapacke_dgbequ.c lapacke_dgbequ_work.c lapacke_dgbequb.c lapacke_dgbequb_work.c lapacke_dgbrfs.c lapacke_dgbrfs_work.c lapacke_dgbsv.c lapacke_dgbsv_work.c lapacke_dgbsvx.c lapacke_dgbsvx_work.c lapacke_dgbtrf.c lapacke_dgbtrf_work.c lapacke_dgbtrs.c lapacke_dgbtrs_work.c lapacke_dgebak.c lapacke_dgebak_work.c lapacke_dgebal.c lapacke_dgebal_work.c lapacke_dgebrd.c lapacke_dgebrd_work.c lapacke_dgecon.c lapacke_dgecon_work.c lapacke_dgeequ.c lapacke_dgeequ_work.c lapacke_dgeequb.c lapacke_dgeequb_work.c lapacke_dgees.c lapacke_dgees_work.c lapacke_dgeesx.c lapacke_dgeesx_work.c lapacke_dgeev.c lapacke_dgeev_work.c lapacke_dgeevx.c lapacke_dgeevx_work.c lapacke_dgehrd.c lapacke_dgehrd_work.c lapacke_dgejsv.c lapacke_dgejsv_work.c lapacke_dgelq2.c lapacke_dgelq2_work.c lapacke_dgelqf.c lapacke_dgelqf_work.c lapacke_dgels.c lapacke_dgels_work.c lapacke_dgelsd.c lapacke_dgelsd_work.c lapacke_dgelss.c lapacke_dgelss_work.c lapacke_dgelsy.c lapacke_dgelsy_work.c lapacke_dgemqr.c lapacke_dgemqr_work.c lapacke_dgemqrt.c lapacke_dgemqrt_work.c lapacke_dgeqlf.c lapacke_dgeqlf_work.c lapacke_dgeqp3.c lapacke_dgeqp3_work.c lapacke_dgeqr2.c lapacke_dgeqr2_work.c lapacke_dgeqrf.c lapacke_dgeqrf_work.c lapacke_dgeqrfp.c lapacke_dgeqrfp_work.c lapacke_dgeqrt.c lapacke_dgeqrt2.c lapacke_dgeqrt2_work.c lapacke_dgeqrt3.c lapacke_dgeqrt3_work.c lapacke_dgeqrt_work.c lapacke_dgerfs.c lapacke_dgerfs_work.c lapacke_dgerqf.c lapacke_dgerqf_work.c lapacke_dgesdd.c lapacke_dgesdd_work.c lapacke_dgesv.c 
lapacke_dgesv_work.c lapacke_dgesvd.c lapacke_dgesvd_work.c lapacke_dgesvdx.c lapacke_dgesvdx_work.c lapacke_dgesvj.c lapacke_dgesvj_work.c lapacke_dgesvx.c lapacke_dgesvx_work.c lapacke_dgetf2.c lapacke_dgetf2_work.c lapacke_dgetrf.c lapacke_dgetrf_work.c lapacke_dgetrf2.c lapacke_dgetrf2_work.c lapacke_dgetri.c lapacke_dgetri_work.c lapacke_dgetrs.c lapacke_dgetrs_work.c lapacke_dgetsls.c lapacke_dgetsls_work.c lapacke_dggbak.c lapacke_dggbak_work.c lapacke_dggbal.c lapacke_dggbal_work.c lapacke_dgges.c lapacke_dgges_work.c lapacke_dgges3.c lapacke_dgges3_work.c lapacke_dggesx.c lapacke_dggesx_work.c lapacke_dggev.c lapacke_dggev_work.c lapacke_dggev3.c lapacke_dggev3_work.c lapacke_dggevx.c lapacke_dggevx_work.c lapacke_dggglm.c lapacke_dggglm_work.c lapacke_dgghrd.c lapacke_dgghrd_work.c lapacke_dgghd3.c lapacke_dgghd3_work.c lapacke_dgglse.c lapacke_dgglse_work.c lapacke_dggqrf.c lapacke_dggqrf_work.c lapacke_dggrqf.c lapacke_dggrqf_work.c lapacke_dggsvd3.c lapacke_dggsvd3_work.c lapacke_dggsvp3.c lapacke_dggsvp3_work.c lapacke_dgtcon.c lapacke_dgtcon_work.c lapacke_dgtrfs.c lapacke_dgtrfs_work.c lapacke_dgtsv.c lapacke_dgtsv_work.c lapacke_dgtsvx.c lapacke_dgtsvx_work.c lapacke_dgttrf.c lapacke_dgttrf_work.c lapacke_dgttrs.c lapacke_dgttrs_work.c lapacke_dhgeqz.c lapacke_dhgeqz_work.c lapacke_dhsein.c lapacke_dhsein_work.c lapacke_dhseqr.c lapacke_dhseqr_work.c lapacke_dlacn2.c lapacke_dlacn2_work.c lapacke_dlacpy.c lapacke_dlacpy_work.c lapacke_dlag2s.c lapacke_dlag2s_work.c lapacke_dlamch.c lapacke_dlamch_work.c lapacke_dlange.c lapacke_dlange_work.c lapacke_dlansy.c lapacke_dlansy_work.c lapacke_dlantr.c lapacke_dlantr_work.c lapacke_dlapmr.c lapacke_dlapmr_work.c lapacke_dlapmt.c lapacke_dlapmt_work.c lapacke_dlapy2.c lapacke_dlapy2_work.c lapacke_dlapy3.c lapacke_dlapy3_work.c lapacke_dlarfb.c lapacke_dlarfb_work.c lapacke_dlarfg.c lapacke_dlarfg_work.c lapacke_dlarft.c lapacke_dlarft_work.c lapacke_dlarfx.c lapacke_dlarfx_work.c lapacke_dlarnv.c lapacke_dlarnv_work.c lapacke_dlartgp.c lapacke_dlartgp_work.c lapacke_dlartgs.c lapacke_dlartgs_work.c lapacke_dlascl.c lapacke_dlascl_work.c lapacke_dlaset.c lapacke_dlaset_work.c lapacke_dlasrt.c lapacke_dlasrt_work.c lapacke_dlaswp.c lapacke_dlaswp_work.c lapacke_dlauum.c lapacke_dlauum_work.c lapacke_dopgtr.c lapacke_dopgtr_work.c lapacke_dopmtr.c lapacke_dopmtr_work.c lapacke_dorbdb.c lapacke_dorbdb_work.c lapacke_dorcsd2by1.c lapacke_dorcsd2by1_work.c lapacke_dorcsd.c lapacke_dorcsd_work.c lapacke_dorgbr.c lapacke_dorgbr_work.c lapacke_dorghr.c lapacke_dorghr_work.c lapacke_dorglq.c lapacke_dorglq_work.c lapacke_dorgql.c lapacke_dorgql_work.c lapacke_dorgqr.c lapacke_dorgqr_work.c lapacke_dorgrq.c lapacke_dorgrq_work.c lapacke_dorgtr.c lapacke_dorgtr_work.c lapacke_dormbr.c lapacke_dormbr_work.c lapacke_dormhr.c lapacke_dormhr_work.c lapacke_dormlq.c lapacke_dormlq_work.c lapacke_dormql.c lapacke_dormql_work.c lapacke_dormqr.c lapacke_dormqr_work.c lapacke_dormrq.c lapacke_dormrq_work.c lapacke_dormrz.c lapacke_dormrz_work.c lapacke_dormtr.c lapacke_dormtr_work.c lapacke_dpbcon.c lapacke_dpbcon_work.c lapacke_dpbequ.c lapacke_dpbequ_work.c lapacke_dpbrfs.c lapacke_dpbrfs_work.c lapacke_dpbstf.c lapacke_dpbstf_work.c lapacke_dpbsv.c lapacke_dpbsv_work.c lapacke_dpbsvx.c lapacke_dpbsvx_work.c lapacke_dpbtrf.c lapacke_dpbtrf_work.c lapacke_dpbtrs.c lapacke_dpbtrs_work.c lapacke_dpftrf.c lapacke_dpftrf_work.c lapacke_dpftri.c lapacke_dpftri_work.c lapacke_dpftrs.c lapacke_dpftrs_work.c lapacke_dpocon.c lapacke_dpocon_work.c 
lapacke_dpoequ.c lapacke_dpoequ_work.c lapacke_dpoequb.c lapacke_dpoequb_work.c lapacke_dporfs.c lapacke_dporfs_work.c lapacke_dposv.c lapacke_dposv_work.c lapacke_dposvx.c lapacke_dposvx_work.c lapacke_dpotrf.c lapacke_dpotrf_work.c lapacke_dpotrf2.c lapacke_dpotrf2_work.c lapacke_dpotri.c lapacke_dpotri_work.c lapacke_dpotrs.c lapacke_dpotrs_work.c lapacke_dppcon.c lapacke_dppcon_work.c lapacke_dppequ.c lapacke_dppequ_work.c lapacke_dpprfs.c lapacke_dpprfs_work.c lapacke_dppsv.c lapacke_dppsv_work.c lapacke_dppsvx.c lapacke_dppsvx_work.c lapacke_dpptrf.c lapacke_dpptrf_work.c lapacke_dpptri.c lapacke_dpptri_work.c lapacke_dpptrs.c lapacke_dpptrs_work.c lapacke_dpstrf.c lapacke_dpstrf_work.c lapacke_dptcon.c lapacke_dptcon_work.c lapacke_dpteqr.c lapacke_dpteqr_work.c lapacke_dptrfs.c lapacke_dptrfs_work.c lapacke_dptsv.c lapacke_dptsv_work.c lapacke_dptsvx.c lapacke_dptsvx_work.c lapacke_dpttrf.c lapacke_dpttrf_work.c lapacke_dpttrs.c lapacke_dpttrs_work.c lapacke_dsbev.c lapacke_dsbev_work.c lapacke_dsbevd.c lapacke_dsbevd_work.c lapacke_dsbevx.c lapacke_dsbevx_work.c lapacke_dsbev_2stage.c lapacke_dsbev_2stage_work.c lapacke_dsbevd_2stage.c lapacke_dsbevd_2stage_work.c lapacke_dsbevx_2stage.c lapacke_dsbevx_2stage_work.c lapacke_dsbgst.c lapacke_dsbgst_work.c lapacke_dsbgv.c lapacke_dsbgv_work.c lapacke_dsbgvd.c lapacke_dsbgvd_work.c lapacke_dsbgvx.c lapacke_dsbgvx_work.c lapacke_dsbtrd.c lapacke_dsbtrd_work.c lapacke_dsfrk.c lapacke_dsfrk_work.c lapacke_dsgesv.c lapacke_dsgesv_work.c lapacke_dspcon.c lapacke_dspcon_work.c lapacke_dspev.c lapacke_dspev_work.c lapacke_dspevd.c lapacke_dspevd_work.c lapacke_dspevx.c lapacke_dspevx_work.c lapacke_dspgst.c lapacke_dspgst_work.c lapacke_dspgv.c lapacke_dspgv_work.c lapacke_dspgvd.c lapacke_dspgvd_work.c lapacke_dspgvx.c lapacke_dspgvx_work.c lapacke_dsposv.c lapacke_dsposv_work.c lapacke_dsprfs.c lapacke_dsprfs_work.c lapacke_dspsv.c lapacke_dspsv_work.c lapacke_dspsvx.c lapacke_dspsvx_work.c lapacke_dsptrd.c lapacke_dsptrd_work.c lapacke_dsptrf.c lapacke_dsptrf_work.c lapacke_dsptri.c lapacke_dsptri_work.c lapacke_dsptrs.c lapacke_dsptrs_work.c lapacke_dstebz.c lapacke_dstebz_work.c lapacke_dstedc.c lapacke_dstedc_work.c lapacke_dstegr.c lapacke_dstegr_work.c lapacke_dstein.c lapacke_dstein_work.c lapacke_dstemr.c lapacke_dstemr_work.c lapacke_dsteqr.c lapacke_dsteqr_work.c lapacke_dsterf.c lapacke_dsterf_work.c lapacke_dstev.c lapacke_dstev_work.c lapacke_dstevd.c lapacke_dstevd_work.c lapacke_dstevr.c lapacke_dstevr_work.c lapacke_dstevx.c lapacke_dstevx_work.c lapacke_dsycon.c lapacke_dsycon_work.c lapacke_dsycon_3.c lapacke_dsycon_3_work.c lapacke_dsyconv.c lapacke_dsyconv_work.c lapacke_dsyequb.c lapacke_dsyequb_work.c lapacke_dsyev.c lapacke_dsyev_work.c lapacke_dsyevd.c lapacke_dsyevd_work.c lapacke_dsyevr.c lapacke_dsyevr_work.c lapacke_dsyevx.c lapacke_dsyevx_work.c lapacke_dsyev_2stage.c lapacke_dsyev_2stage_work.c lapacke_dsyevd_2stage.c lapacke_dsyevd_2stage_work.c lapacke_dsyevr_2stage.c lapacke_dsyevr_2stage_work.c lapacke_dsyevx_2stage.c lapacke_dsyevx_2stage_work.c lapacke_dsygst.c lapacke_dsygst_work.c lapacke_dsygv.c lapacke_dsygv_work.c lapacke_dsygv_2stage.c lapacke_dsygv_2stage_work.c lapacke_dsygvd.c lapacke_dsygvd_work.c lapacke_dsygvx.c lapacke_dsygvx_work.c lapacke_dsyrfs.c lapacke_dsyrfs_work.c lapacke_dsysv.c lapacke_dsysv_rook.c lapacke_dsysv_rook_work.c lapacke_dsysv_work.c lapacke_dsysv_aa.c lapacke_dsysv_aa_work.c lapacke_dsysv_rk.c lapacke_dsysv_rk_work.c lapacke_dsysvx.c lapacke_dsysvx_work.c 
lapacke_dsyswapr.c lapacke_dsyswapr_work.c lapacke_dsytrd.c lapacke_dsytrd_work.c lapacke_dsytrf.c lapacke_dsytrf_work.c lapacke_dsytrf_rook.c lapacke_dsytrf_rook_work.c lapacke_dsytrf_aa.c lapacke_dsytrf_aa_work.c lapacke_dsytrf_rk.c lapacke_dsytrf_rk_work.c lapacke_dsytri.c lapacke_dsytri2.c lapacke_dsytri2_work.c lapacke_dsytri_3.c lapacke_dsytri_3_work.c lapacke_dsytri2x.c lapacke_dsytri2x_work.c lapacke_dsytri_work.c lapacke_dsytrs.c lapacke_dsytrs_rook.c lapacke_dsytrs2.c lapacke_dsytrs2_work.c lapacke_dsytrs_aa.c lapacke_dsytrs_aa_work.c lapacke_dsytrs_3.c lapacke_dsytrs_3_work.c lapacke_dsytrs_work.c lapacke_dsytrs_rook_work.c lapacke_dtbcon.c lapacke_dtbcon_work.c lapacke_dtbrfs.c lapacke_dtbrfs_work.c lapacke_dtbtrs.c lapacke_dtbtrs_work.c lapacke_dtfsm.c lapacke_dtfsm_work.c lapacke_dtftri.c lapacke_dtftri_work.c lapacke_dtfttp.c lapacke_dtfttp_work.c lapacke_dtfttr.c lapacke_dtfttr_work.c lapacke_dtgevc.c lapacke_dtgevc_work.c lapacke_dtgexc.c lapacke_dtgexc_work.c lapacke_dtgsen.c lapacke_dtgsen_work.c lapacke_dtgsja.c lapacke_dtgsja_work.c lapacke_dtgsna.c lapacke_dtgsna_work.c lapacke_dtgsyl.c lapacke_dtgsyl_work.c lapacke_dtpcon.c lapacke_dtpcon_work.c lapacke_dtpmqrt.c lapacke_dtpmqrt_work.c lapacke_dtpqrt.c lapacke_dtpqrt2.c lapacke_dtpqrt2_work.c lapacke_dtpqrt_work.c lapacke_dtprfb.c lapacke_dtprfb_work.c lapacke_dtprfs.c lapacke_dtprfs_work.c lapacke_dtptri.c lapacke_dtptri_work.c lapacke_dtptrs.c lapacke_dtptrs_work.c lapacke_dtpttf.c lapacke_dtpttf_work.c lapacke_dtpttr.c lapacke_dtpttr_work.c lapacke_dtrcon.c lapacke_dtrcon_work.c lapacke_dtrevc.c lapacke_dtrevc_work.c lapacke_dtrexc.c lapacke_dtrexc_work.c lapacke_dtrrfs.c lapacke_dtrrfs_work.c lapacke_dtrsen.c lapacke_dtrsen_work.c lapacke_dtrsna.c lapacke_dtrsna_work.c lapacke_dtrsyl.c lapacke_dtrsyl_work.c lapacke_dtrtri.c lapacke_dtrtri_work.c lapacke_dtrtrs.c lapacke_dtrtrs_work.c lapacke_dtrttf.c lapacke_dtrttf_work.c lapacke_dtrttp.c lapacke_dtrttp_work.c lapacke_dtzrzf.c lapacke_dtzrzf_work.c ) set(SSRC lapacke_sbbcsd.c lapacke_sbbcsd_work.c lapacke_sbdsdc.c lapacke_sbdsdc_work.c lapacke_sbdsvdx.c lapacke_sbdsvdx_work.c lapacke_sbdsqr.c lapacke_sbdsqr_work.c lapacke_sdisna.c lapacke_sdisna_work.c lapacke_sgbbrd.c lapacke_sgbbrd_work.c lapacke_sgbcon.c lapacke_sgbcon_work.c lapacke_sgbequ.c lapacke_sgbequ_work.c lapacke_sgbequb.c lapacke_sgbequb_work.c lapacke_sgbrfs.c lapacke_sgbrfs_work.c lapacke_sgbsv.c lapacke_sgbsv_work.c lapacke_sgbsvx.c lapacke_sgbsvx_work.c lapacke_sgbtrf.c lapacke_sgbtrf_work.c lapacke_sgbtrs.c lapacke_sgbtrs_work.c lapacke_sgebak.c lapacke_sgebak_work.c lapacke_sgebal.c lapacke_sgebal_work.c lapacke_sgebrd.c lapacke_sgebrd_work.c lapacke_sgecon.c lapacke_sgecon_work.c lapacke_sgeequ.c lapacke_sgeequ_work.c lapacke_sgeequb.c lapacke_sgeequb_work.c lapacke_sgees.c lapacke_sgees_work.c lapacke_sgeesx.c lapacke_sgeesx_work.c lapacke_sgeev.c lapacke_sgeev_work.c lapacke_sgeevx.c lapacke_sgeevx_work.c lapacke_sgehrd.c lapacke_sgehrd_work.c lapacke_sgejsv.c lapacke_sgejsv_work.c lapacke_sgelq2.c lapacke_sgelq2_work.c lapacke_sgelqf.c lapacke_sgelqf_work.c lapacke_sgels.c lapacke_sgels_work.c lapacke_sgelsd.c lapacke_sgelsd_work.c lapacke_sgelss.c lapacke_sgelss_work.c lapacke_sgelsy.c lapacke_sgelsy_work.c lapacke_sgemqr.c lapacke_sgemqr_work.c lapacke_sgemqrt.c lapacke_sgemqrt_work.c lapacke_sgeqlf.c lapacke_sgeqlf_work.c lapacke_sgeqp3.c lapacke_sgeqp3_work.c lapacke_sgeqr2.c lapacke_sgeqr2_work.c lapacke_sgeqrf.c lapacke_sgeqrf_work.c lapacke_sgeqrfp.c lapacke_sgeqrfp_work.c 
lapacke_sgeqrt.c lapacke_sgeqrt2.c lapacke_sgeqrt2_work.c lapacke_sgeqrt3.c lapacke_sgeqrt3_work.c lapacke_sgeqrt_work.c lapacke_sgerfs.c lapacke_sgerfs_work.c lapacke_sgerqf.c lapacke_sgerqf_work.c lapacke_sgesdd.c lapacke_sgesdd_work.c lapacke_sgesv.c lapacke_sgesv_work.c lapacke_sgesvd.c lapacke_sgesvd_work.c lapacke_sgesvdx.c lapacke_sgesvdx_work.c lapacke_sgesvj.c lapacke_sgesvj_work.c lapacke_sgesvx.c lapacke_sgesvx_work.c lapacke_sgetf2.c lapacke_sgetf2_work.c lapacke_sgetrf.c lapacke_sgetrf_work.c lapacke_sgetrf2.c lapacke_sgetrf2_work.c lapacke_sgetri.c lapacke_sgetri_work.c lapacke_sgetrs.c lapacke_sgetrs_work.c lapacke_sgetsls.c lapacke_sgetsls_work.c lapacke_sggbak.c lapacke_sggbak_work.c lapacke_sggbal.c lapacke_sggbal_work.c lapacke_sgges.c lapacke_sgges_work.c lapacke_sgges3.c lapacke_sgges3_work.c lapacke_sggesx.c lapacke_sggesx_work.c lapacke_sggev.c lapacke_sggev_work.c lapacke_sggev3.c lapacke_sggev3_work.c lapacke_sggevx.c lapacke_sggevx_work.c lapacke_sggglm.c lapacke_sggglm_work.c lapacke_sgghrd.c lapacke_sgghrd_work.c lapacke_sgghd3.c lapacke_sgghd3_work.c lapacke_sgglse.c lapacke_sgglse_work.c lapacke_sggqrf.c lapacke_sggqrf_work.c lapacke_sggrqf.c lapacke_sggrqf_work.c lapacke_sggsvd3.c lapacke_sggsvd3_work.c lapacke_sggsvp3.c lapacke_sggsvp3_work.c lapacke_sgtcon.c lapacke_sgtcon_work.c lapacke_sgtrfs.c lapacke_sgtrfs_work.c lapacke_sgtsv.c lapacke_sgtsv_work.c lapacke_sgtsvx.c lapacke_sgtsvx_work.c lapacke_sgttrf.c lapacke_sgttrf_work.c lapacke_sgttrs.c lapacke_sgttrs_work.c lapacke_shgeqz.c lapacke_shgeqz_work.c lapacke_shsein.c lapacke_shsein_work.c lapacke_shseqr.c lapacke_shseqr_work.c lapacke_slacn2.c lapacke_slacn2_work.c lapacke_slacpy.c lapacke_slacpy_work.c lapacke_slag2d.c lapacke_slag2d_work.c lapacke_slamch.c lapacke_slamch_work.c lapacke_slange.c lapacke_slange_work.c lapacke_slansy.c lapacke_slansy_work.c lapacke_slantr.c lapacke_slantr_work.c lapacke_slapmr.c lapacke_slapmr_work.c lapacke_slapmt.c lapacke_slapmt_work.c lapacke_slapy2.c lapacke_slapy2_work.c lapacke_slapy3.c lapacke_slapy3_work.c lapacke_slarfb.c lapacke_slarfb_work.c lapacke_slarfg.c lapacke_slarfg_work.c lapacke_slarft.c lapacke_slarft_work.c lapacke_slarfx.c lapacke_slarfx_work.c lapacke_slarnv.c lapacke_slarnv_work.c lapacke_slartgp.c lapacke_slartgp_work.c lapacke_slartgs.c lapacke_slartgs_work.c lapacke_slascl.c lapacke_slascl_work.c lapacke_slaset.c lapacke_slaset_work.c lapacke_slasrt.c lapacke_slasrt_work.c lapacke_slaswp.c lapacke_slaswp_work.c lapacke_slauum.c lapacke_slauum_work.c lapacke_sopgtr.c lapacke_sopgtr_work.c lapacke_sopmtr.c lapacke_sopmtr_work.c lapacke_sorbdb.c lapacke_sorbdb_work.c lapacke_sorcsd.c lapacke_sorcsd_work.c lapacke_sorcsd2by1.c lapacke_sorcsd2by1_work.c lapacke_sorgbr.c lapacke_sorgbr_work.c lapacke_sorghr.c lapacke_sorghr_work.c lapacke_sorglq.c lapacke_sorglq_work.c lapacke_sorgql.c lapacke_sorgql_work.c lapacke_sorgqr.c lapacke_sorgqr_work.c lapacke_sorgrq.c lapacke_sorgrq_work.c lapacke_sorgtr.c lapacke_sorgtr_work.c lapacke_sormbr.c lapacke_sormbr_work.c lapacke_sormhr.c lapacke_sormhr_work.c lapacke_sormlq.c lapacke_sormlq_work.c lapacke_sormql.c lapacke_sormql_work.c lapacke_sormqr.c lapacke_sormqr_work.c lapacke_sormrq.c lapacke_sormrq_work.c lapacke_sormrz.c lapacke_sormrz_work.c lapacke_sormtr.c lapacke_sormtr_work.c lapacke_spbcon.c lapacke_spbcon_work.c lapacke_spbequ.c lapacke_spbequ_work.c lapacke_spbrfs.c lapacke_spbrfs_work.c lapacke_spbstf.c lapacke_spbstf_work.c lapacke_spbsv.c lapacke_spbsv_work.c lapacke_spbsvx.c 
lapacke_spbsvx_work.c lapacke_spbtrf.c lapacke_spbtrf_work.c lapacke_spbtrs.c lapacke_spbtrs_work.c lapacke_spftrf.c lapacke_spftrf_work.c lapacke_spftri.c lapacke_spftri_work.c lapacke_spftrs.c lapacke_spftrs_work.c lapacke_spocon.c lapacke_spocon_work.c lapacke_spoequ.c lapacke_spoequ_work.c lapacke_spoequb.c lapacke_spoequb_work.c lapacke_sporfs.c lapacke_sporfs_work.c lapacke_sposv.c lapacke_sposv_work.c lapacke_sposvx.c lapacke_sposvx_work.c lapacke_spotrf.c lapacke_spotrf_work.c lapacke_spotrf2.c lapacke_spotrf2_work.c lapacke_spotri.c lapacke_spotri_work.c lapacke_spotrs.c lapacke_spotrs_work.c lapacke_sppcon.c lapacke_sppcon_work.c lapacke_sppequ.c lapacke_sppequ_work.c lapacke_spprfs.c lapacke_spprfs_work.c lapacke_sppsv.c lapacke_sppsv_work.c lapacke_sppsvx.c lapacke_sppsvx_work.c lapacke_spptrf.c lapacke_spptrf_work.c lapacke_spptri.c lapacke_spptri_work.c lapacke_spptrs.c lapacke_spptrs_work.c lapacke_spstrf.c lapacke_spstrf_work.c lapacke_sptcon.c lapacke_sptcon_work.c lapacke_spteqr.c lapacke_spteqr_work.c lapacke_sptrfs.c lapacke_sptrfs_work.c lapacke_sptsv.c lapacke_sptsv_work.c lapacke_sptsvx.c lapacke_sptsvx_work.c lapacke_spttrf.c lapacke_spttrf_work.c lapacke_spttrs.c lapacke_spttrs_work.c lapacke_ssbev.c lapacke_ssbev_work.c lapacke_ssbevd.c lapacke_ssbevd_work.c lapacke_ssbevx.c lapacke_ssbevx_work.c lapacke_ssbev_2stage.c lapacke_ssbev_2stage_work.c lapacke_ssbevd_2stage.c lapacke_ssbevd_2stage_work.c lapacke_ssbevx_2stage.c lapacke_ssbevx_2stage_work.c lapacke_ssbgst.c lapacke_ssbgst_work.c lapacke_ssbgv.c lapacke_ssbgv_work.c lapacke_ssbgvd.c lapacke_ssbgvd_work.c lapacke_ssbgvx.c lapacke_ssbgvx_work.c lapacke_ssbtrd.c lapacke_ssbtrd_work.c lapacke_ssfrk.c lapacke_ssfrk_work.c lapacke_sspcon.c lapacke_sspcon_work.c lapacke_sspev.c lapacke_sspev_work.c lapacke_sspevd.c lapacke_sspevd_work.c lapacke_sspevx.c lapacke_sspevx_work.c lapacke_sspgst.c lapacke_sspgst_work.c lapacke_sspgv.c lapacke_sspgv_work.c lapacke_sspgvd.c lapacke_sspgvd_work.c lapacke_sspgvx.c lapacke_sspgvx_work.c lapacke_ssprfs.c lapacke_ssprfs_work.c lapacke_sspsv.c lapacke_sspsv_work.c lapacke_sspsvx.c lapacke_sspsvx_work.c lapacke_ssptrd.c lapacke_ssptrd_work.c lapacke_ssptrf.c lapacke_ssptrf_work.c lapacke_ssptri.c lapacke_ssptri_work.c lapacke_ssptrs.c lapacke_ssptrs_work.c lapacke_sstebz.c lapacke_sstebz_work.c lapacke_sstedc.c lapacke_sstedc_work.c lapacke_sstegr.c lapacke_sstegr_work.c lapacke_sstein.c lapacke_sstein_work.c lapacke_sstemr.c lapacke_sstemr_work.c lapacke_ssteqr.c lapacke_ssteqr_work.c lapacke_ssterf.c lapacke_ssterf_work.c lapacke_sstev.c lapacke_sstev_work.c lapacke_sstevd.c lapacke_sstevd_work.c lapacke_sstevr.c lapacke_sstevr_work.c lapacke_sstevx.c lapacke_sstevx_work.c lapacke_ssycon.c lapacke_ssycon_work.c lapacke_ssycon_3.c lapacke_ssycon_3_work.c lapacke_ssyconv.c lapacke_ssyconv_work.c lapacke_ssyequb.c lapacke_ssyequb_work.c lapacke_ssyev.c lapacke_ssyev_work.c lapacke_ssyevd.c lapacke_ssyevd_work.c lapacke_ssyevr.c lapacke_ssyevr_work.c lapacke_ssyevx.c lapacke_ssyevx_work.c lapacke_ssyev_2stage.c lapacke_ssyev_2stage_work.c lapacke_ssyevd_2stage.c lapacke_ssyevd_2stage_work.c lapacke_ssyevr_2stage.c lapacke_ssyevr_2stage_work.c lapacke_ssyevx_2stage.c lapacke_ssyevx_2stage_work.c lapacke_ssygst.c lapacke_ssygst_work.c lapacke_ssygv.c lapacke_ssygv_work.c lapacke_ssygv_2stage.c lapacke_ssygv_2stage_work.c lapacke_ssygvd.c lapacke_ssygvd_work.c lapacke_ssygvx.c lapacke_ssygvx_work.c lapacke_ssyrfs.c lapacke_ssyrfs_work.c lapacke_ssysv.c lapacke_ssysv_rook.c 
lapacke_ssysv_rook_work.c lapacke_ssysv_work.c lapacke_ssysv_aa.c lapacke_ssysv_aa_work.c lapacke_ssysv_rk.c lapacke_ssysv_rk_work.c lapacke_ssysvx.c lapacke_ssysvx_work.c lapacke_ssyswapr.c lapacke_ssyswapr_work.c lapacke_ssytrd.c lapacke_ssytrd_work.c lapacke_ssytrf.c lapacke_ssytrf_work.c lapacke_ssytrf_rook.c lapacke_ssytrf_rook_work.c lapacke_ssytrf_aa.c lapacke_ssytrf_aa_work.c lapacke_ssytrf_rk.c lapacke_ssytrf_rk_work.c lapacke_ssytri.c lapacke_ssytri2.c lapacke_ssytri2_work.c lapacke_ssytri_3.c lapacke_ssytri_3_work.c lapacke_ssytri2x.c lapacke_ssytri2x_work.c lapacke_ssytri_work.c lapacke_ssytrs.c lapacke_ssytrs_rook.c lapacke_ssytrs2.c lapacke_ssytrs2_work.c lapacke_ssytrs_aa.c lapacke_ssytrs_aa_work.c lapacke_ssytrs_3.c lapacke_ssytrs_3_work.c lapacke_ssytrs_work.c lapacke_ssytrs_rook_work.c lapacke_stbcon.c lapacke_stbcon_work.c lapacke_stbrfs.c lapacke_stbrfs_work.c lapacke_stbtrs.c lapacke_stbtrs_work.c lapacke_stfsm.c lapacke_stfsm_work.c lapacke_stftri.c lapacke_stftri_work.c lapacke_stfttp.c lapacke_stfttp_work.c lapacke_stfttr.c lapacke_stfttr_work.c lapacke_stgevc.c lapacke_stgevc_work.c lapacke_stgexc.c lapacke_stgexc_work.c lapacke_stgsen.c lapacke_stgsen_work.c lapacke_stgsja.c lapacke_stgsja_work.c lapacke_stgsna.c lapacke_stgsna_work.c lapacke_stgsyl.c lapacke_stgsyl_work.c lapacke_stpcon.c lapacke_stpcon_work.c lapacke_stpmqrt.c lapacke_stpmqrt_work.c lapacke_stpqrt.c lapacke_stpqrt_work.c lapacke_stpqrt2.c lapacke_stpqrt2_work.c lapacke_stprfb.c lapacke_stprfb_work.c lapacke_stprfs.c lapacke_stprfs_work.c lapacke_stptri.c lapacke_stptri_work.c lapacke_stptrs.c lapacke_stptrs_work.c lapacke_stpttf.c lapacke_stpttf_work.c lapacke_stpttr.c lapacke_stpttr_work.c lapacke_strcon.c lapacke_strcon_work.c lapacke_strevc.c lapacke_strevc_work.c lapacke_strexc.c lapacke_strexc_work.c lapacke_strrfs.c lapacke_strrfs_work.c lapacke_strsen.c lapacke_strsen_work.c lapacke_strsna.c lapacke_strsna_work.c lapacke_strsyl.c lapacke_strsyl_work.c lapacke_strtri.c lapacke_strtri_work.c lapacke_strtrs.c lapacke_strtrs_work.c lapacke_strttf.c lapacke_strttf_work.c lapacke_strttp.c lapacke_strttp_work.c lapacke_stzrzf.c lapacke_stzrzf_work.c ) set(ZSRC lapacke_zbbcsd.c lapacke_zbbcsd_work.c lapacke_zbdsqr.c lapacke_zbdsqr_work.c lapacke_zcgesv.c lapacke_zcgesv_work.c lapacke_zcposv.c lapacke_zcposv_work.c lapacke_zgbbrd.c lapacke_zgbbrd_work.c lapacke_zgbcon.c lapacke_zgbcon_work.c lapacke_zgbequ.c lapacke_zgbequ_work.c lapacke_zgbequb.c lapacke_zgbequb_work.c lapacke_zgbrfs.c lapacke_zgbrfs_work.c lapacke_zgbsv.c lapacke_zgbsv_work.c lapacke_zgbsvx.c lapacke_zgbsvx_work.c lapacke_zgbtrf.c lapacke_zgbtrf_work.c lapacke_zgbtrs.c lapacke_zgbtrs_work.c lapacke_zgebak.c lapacke_zgebak_work.c lapacke_zgebal.c lapacke_zgebal_work.c lapacke_zgebrd.c lapacke_zgebrd_work.c lapacke_zgecon.c lapacke_zgecon_work.c lapacke_zgeequ.c lapacke_zgeequ_work.c lapacke_zgeequb.c lapacke_zgeequb_work.c lapacke_zgees.c lapacke_zgees_work.c lapacke_zgeesx.c lapacke_zgeesx_work.c lapacke_zgeev.c lapacke_zgeev_work.c lapacke_zgeevx.c lapacke_zgeevx_work.c lapacke_zgehrd.c lapacke_zgehrd_work.c lapacke_zgejsv.c lapacke_zgejsv_work.c lapacke_zgelq2.c lapacke_zgelq2_work.c lapacke_zgelqf.c lapacke_zgelqf_work.c lapacke_zgels.c lapacke_zgels_work.c lapacke_zgelsd.c lapacke_zgelsd_work.c lapacke_zgelss.c lapacke_zgelss_work.c lapacke_zgelsy.c lapacke_zgelsy_work.c lapacke_zgemqr.c lapacke_zgemqr_work.c lapacke_zgemqrt.c lapacke_zgemqrt_work.c lapacke_zgeqlf.c lapacke_zgeqlf_work.c lapacke_zgeqp3.c 
lapacke_zgeqp3_work.c lapacke_zgeqr2.c lapacke_zgeqr2_work.c lapacke_zgeqrf.c lapacke_zgeqrf_work.c lapacke_zgeqrfp.c lapacke_zgeqrfp_work.c lapacke_zgeqrt.c lapacke_zgeqrt2.c lapacke_zgeqrt2_work.c lapacke_zgeqrt3.c lapacke_zgeqrt3_work.c lapacke_zgeqrt_work.c lapacke_zgerfs.c lapacke_zgerfs_work.c lapacke_zgerqf.c lapacke_zgerqf_work.c lapacke_zgesdd.c lapacke_zgesdd_work.c lapacke_zgesv.c lapacke_zgesv_work.c lapacke_zgesvd.c lapacke_zgesvd_work.c lapacke_zgesvdx.c lapacke_zgesvdx_work.c lapacke_zgesvj.c lapacke_zgesvj_work.c lapacke_zgesvx.c lapacke_zgesvx_work.c lapacke_zgetf2.c lapacke_zgetf2_work.c lapacke_zgetrf.c lapacke_zgetrf_work.c lapacke_zgetrf2.c lapacke_zgetrf2_work.c lapacke_zgetri.c lapacke_zgetri_work.c lapacke_zgetrs.c lapacke_zgetrs_work.c lapacke_zgetsls.c lapacke_zgetsls_work.c lapacke_zggbak.c lapacke_zggbak_work.c lapacke_zggbal.c lapacke_zggbal_work.c lapacke_zgges.c lapacke_zgges_work.c lapacke_zgges3.c lapacke_zgges3_work.c lapacke_zggesx.c lapacke_zggesx_work.c lapacke_zggev.c lapacke_zggev_work.c lapacke_zggev3.c lapacke_zggev3_work.c lapacke_zggevx.c lapacke_zggevx_work.c lapacke_zggglm.c lapacke_zggglm_work.c lapacke_zgghrd.c lapacke_zgghrd_work.c lapacke_zgghd3.c lapacke_zgghd3_work.c lapacke_zgglse.c lapacke_zgglse_work.c lapacke_zggqrf.c lapacke_zggqrf_work.c lapacke_zggrqf.c lapacke_zggrqf_work.c lapacke_zggsvd3.c lapacke_zggsvd3_work.c lapacke_zggsvp3.c lapacke_zggsvp3_work.c lapacke_zgtcon.c lapacke_zgtcon_work.c lapacke_zgtrfs.c lapacke_zgtrfs_work.c lapacke_zgtsv.c lapacke_zgtsv_work.c lapacke_zgtsvx.c lapacke_zgtsvx_work.c lapacke_zgttrf.c lapacke_zgttrf_work.c lapacke_zgttrs.c lapacke_zgttrs_work.c lapacke_zhbev.c lapacke_zhbev_work.c lapacke_zhbevd.c lapacke_zhbevd_work.c lapacke_zhbevx.c lapacke_zhbevx_work.c lapacke_zhbgst.c lapacke_zhbgst_work.c lapacke_zhbgv.c lapacke_zhbgv_work.c lapacke_zhbgvd.c lapacke_zhbgvd_work.c lapacke_zhbgvx.c lapacke_zhbgvx_work.c lapacke_zhbtrd.c lapacke_zhbtrd_work.c lapacke_zhecon.c lapacke_zhecon_work.c lapacke_zhecon_3.c lapacke_zhecon_3_work.c lapacke_zheequb.c lapacke_zheequb_work.c lapacke_zheev.c lapacke_zheev_work.c lapacke_zheevd.c lapacke_zheevd_work.c lapacke_zheevr.c lapacke_zheevr_work.c lapacke_zheevx.c lapacke_zheevx_work.c lapacke_zheev_2stage.c lapacke_zheev_2stage_work.c lapacke_zheevd_2stage.c lapacke_zheevd_2stage_work.c lapacke_zheevr_2stage.c lapacke_zheevr_2stage_work.c lapacke_zheevx_2stage.c lapacke_zheevx_2stage_work.c lapacke_zhegst.c lapacke_zhegst_work.c lapacke_zhegv.c lapacke_zhegv_work.c lapacke_zhegv_2stage.c lapacke_zhegv_2stage_work.c lapacke_zhegvd.c lapacke_zhegvd_work.c lapacke_zhegvx.c lapacke_zhegvx_work.c lapacke_zherfs.c lapacke_zherfs_work.c lapacke_zhesv.c lapacke_zhesv_work.c lapacke_zhesv_aa.c lapacke_zhesv_aa_work.c lapacke_zhesv_rk.c lapacke_zhesv_rk_work.c lapacke_zhesvx.c lapacke_zhesvx_work.c lapacke_zheswapr.c lapacke_zheswapr_work.c lapacke_zhetrd.c lapacke_zhetrd_work.c lapacke_zhetrf.c lapacke_zhetrf_rook.c lapacke_zhetrf_work.c lapacke_zhetrf_rook_work.c lapacke_zhetrf_aa.c lapacke_zhetrf_aa_work.c lapacke_zhetrf_rk.c lapacke_zhetrf_rk_work.c lapacke_zhetri.c lapacke_zhetri2.c lapacke_zhetri2_work.c lapacke_zhetri_3.c lapacke_zhetri_3_work.c lapacke_zhetri2x.c lapacke_zhetri2x_work.c lapacke_zhetri_work.c lapacke_zhetrs.c lapacke_zhetrs_rook.c lapacke_zhetrs2.c lapacke_zhetrs2_work.c lapacke_zhetrs_work.c lapacke_zhetrs_aa.c lapacke_zhetrs_aa_work.c lapacke_zhetrs_3.c lapacke_zhetrs_3_work.c lapacke_zhetrs_rook_work.c lapacke_zhfrk.c lapacke_zhfrk_work.c 
lapacke_zhgeqz.c lapacke_zhgeqz_work.c lapacke_zhpcon.c lapacke_zhpcon_work.c lapacke_zhpev.c lapacke_zhpev_work.c lapacke_zhpevd.c lapacke_zhpevd_work.c lapacke_zhpevx.c lapacke_zhpevx_work.c lapacke_zhpgst.c lapacke_zhpgst_work.c lapacke_zhpgv.c lapacke_zhpgv_work.c lapacke_zhpgvd.c lapacke_zhpgvd_work.c lapacke_zhpgvx.c lapacke_zhpgvx_work.c lapacke_zhprfs.c lapacke_zhprfs_work.c lapacke_zhpsv.c lapacke_zhpsv_work.c lapacke_zhpsvx.c lapacke_zhpsvx_work.c lapacke_zhptrd.c lapacke_zhptrd_work.c lapacke_zhptrf.c lapacke_zhptrf_work.c lapacke_zhptri.c lapacke_zhptri_work.c lapacke_zhptrs.c lapacke_zhptrs_work.c lapacke_zhsein.c lapacke_zhsein_work.c lapacke_zhseqr.c lapacke_zhseqr_work.c lapacke_zlacgv.c lapacke_zlacgv_work.c lapacke_zlacn2.c lapacke_zlacn2_work.c lapacke_zlacp2.c lapacke_zlacp2_work.c lapacke_zlacpy.c lapacke_zlacpy_work.c lapacke_zlag2c.c lapacke_zlag2c_work.c lapacke_zlange.c lapacke_zlange_work.c lapacke_zlanhe.c lapacke_zlanhe_work.c lapacke_zlansy.c lapacke_zlansy_work.c lapacke_zlantr.c lapacke_zlantr_work.c lapacke_zlapmr.c lapacke_zlapmr_work.c lapacke_zlapmt.c lapacke_zlapmt_work.c lapacke_zlarfb.c lapacke_zlarfb_work.c lapacke_zlarfg.c lapacke_zlarfg_work.c lapacke_zlarft.c lapacke_zlarft_work.c lapacke_zlarfx.c lapacke_zlarfx_work.c lapacke_zlarnv.c lapacke_zlarnv_work.c lapacke_zlascl.c lapacke_zlascl_work.c lapacke_zlaset.c lapacke_zlaset_work.c lapacke_zlaswp.c lapacke_zlaswp_work.c lapacke_zlauum.c lapacke_zlauum_work.c lapacke_zpbcon.c lapacke_zpbcon_work.c lapacke_zpbequ.c lapacke_zpbequ_work.c lapacke_zpbrfs.c lapacke_zpbrfs_work.c lapacke_zpbstf.c lapacke_zpbstf_work.c lapacke_zpbsv.c lapacke_zpbsv_work.c lapacke_zpbsvx.c lapacke_zpbsvx_work.c lapacke_zpbtrf.c lapacke_zpbtrf_work.c lapacke_zpbtrs.c lapacke_zpbtrs_work.c lapacke_zpftrf.c lapacke_zpftrf_work.c lapacke_zpftri.c lapacke_zpftri_work.c lapacke_zpftrs.c lapacke_zpftrs_work.c lapacke_zpocon.c lapacke_zpocon_work.c lapacke_zpoequ.c lapacke_zpoequ_work.c lapacke_zpoequb.c lapacke_zpoequb_work.c lapacke_zporfs.c lapacke_zporfs_work.c lapacke_zposv.c lapacke_zposv_work.c lapacke_zposvx.c lapacke_zposvx_work.c lapacke_zpotrf.c lapacke_zpotrf_work.c lapacke_zpotrf2.c lapacke_zpotrf2_work.c lapacke_zpotri.c lapacke_zpotri_work.c lapacke_zpotrs.c lapacke_zpotrs_work.c lapacke_zppcon.c lapacke_zppcon_work.c lapacke_zppequ.c lapacke_zppequ_work.c lapacke_zpprfs.c lapacke_zpprfs_work.c lapacke_zppsv.c lapacke_zppsv_work.c lapacke_zppsvx.c lapacke_zppsvx_work.c lapacke_zpptrf.c lapacke_zpptrf_work.c lapacke_zpptri.c lapacke_zpptri_work.c lapacke_zpptrs.c lapacke_zpptrs_work.c lapacke_zpstrf.c lapacke_zpstrf_work.c lapacke_zptcon.c lapacke_zptcon_work.c lapacke_zpteqr.c lapacke_zpteqr_work.c lapacke_zptrfs.c lapacke_zptrfs_work.c lapacke_zptsv.c lapacke_zptsv_work.c lapacke_zptsvx.c lapacke_zptsvx_work.c lapacke_zpttrf.c lapacke_zpttrf_work.c lapacke_zpttrs.c lapacke_zpttrs_work.c lapacke_zspcon.c lapacke_zspcon_work.c lapacke_zsprfs.c lapacke_zsprfs_work.c lapacke_zspsv.c lapacke_zspsv_work.c lapacke_zspsvx.c lapacke_zspsvx_work.c lapacke_zsptrf.c lapacke_zsptrf_work.c lapacke_zsptri.c lapacke_zsptri_work.c lapacke_zsptrs.c lapacke_zsptrs_work.c lapacke_zstedc.c lapacke_zstedc_work.c lapacke_zstegr.c lapacke_zstegr_work.c lapacke_zstein.c lapacke_zstein_work.c lapacke_zstemr.c lapacke_zstemr_work.c lapacke_zsteqr.c lapacke_zsteqr_work.c lapacke_zsycon.c lapacke_zsycon_work.c lapacke_zsycon_3.c lapacke_zsycon_3_work.c lapacke_zsyconv.c lapacke_zsyconv_work.c lapacke_zsyequb.c lapacke_zsyequb_work.c 
lapacke_zsyrfs.c lapacke_zsyrfs_work.c lapacke_zsysv.c lapacke_zsysv_rook.c lapacke_zsysv_rook_work.c lapacke_zsysv_work.c lapacke_zsysv_aa.c lapacke_zsysv_aa_work.c lapacke_zsysv_rk.c lapacke_zsysv_rk_work.c lapacke_zsysvx.c lapacke_zsysvx_work.c lapacke_zsyswapr.c lapacke_zsyswapr_work.c lapacke_zsytrf.c lapacke_zsytrf_work.c lapacke_zsytrf_rook.c lapacke_zsytrf_rook_work.c lapacke_zsytrf_aa.c lapacke_zsytrf_aa_work.c lapacke_zsytrf_rk.c lapacke_zsytrf_rk_work.c lapacke_zsytri.c lapacke_zsytri2.c lapacke_zsytri2_work.c lapacke_zsytri_3.c lapacke_zsytri_3_work.c lapacke_zsytri2x.c lapacke_zsytri2x_work.c lapacke_zsytri_work.c lapacke_zsytrs.c lapacke_zsytrs_rook.c lapacke_zsytrs2.c lapacke_zsytrs2_work.c lapacke_zsytrs_work.c lapacke_zsytrs_rook_work.c lapacke_zsytrs_aa.c lapacke_zsytrs_aa_work.c lapacke_zsytrs_3.c lapacke_zsytrs_3_work.c lapacke_ztbcon.c lapacke_ztbcon_work.c lapacke_ztbrfs.c lapacke_ztbrfs_work.c lapacke_ztbtrs.c lapacke_ztbtrs_work.c lapacke_ztfsm.c lapacke_ztfsm_work.c lapacke_ztftri.c lapacke_ztftri_work.c lapacke_ztfttp.c lapacke_ztfttp_work.c lapacke_ztfttr.c lapacke_ztfttr_work.c lapacke_ztgevc.c lapacke_ztgevc_work.c lapacke_ztgexc.c lapacke_ztgexc_work.c lapacke_ztgsen.c lapacke_ztgsen_work.c lapacke_ztgsja.c lapacke_ztgsja_work.c lapacke_ztgsna.c lapacke_ztgsna_work.c lapacke_ztgsyl.c lapacke_ztgsyl_work.c lapacke_ztpcon.c lapacke_ztpcon_work.c lapacke_ztpmqrt.c lapacke_ztpmqrt_work.c lapacke_ztpqrt.c lapacke_ztpqrt2.c lapacke_ztpqrt2_work.c lapacke_ztpqrt_work.c lapacke_ztprfb.c lapacke_ztprfb_work.c lapacke_ztprfs.c lapacke_ztprfs_work.c lapacke_ztptri.c lapacke_ztptri_work.c lapacke_ztptrs.c lapacke_ztptrs_work.c lapacke_ztpttf.c lapacke_ztpttf_work.c lapacke_ztpttr.c lapacke_ztpttr_work.c lapacke_ztrcon.c lapacke_ztrcon_work.c lapacke_ztrevc.c lapacke_ztrevc_work.c lapacke_ztrexc.c lapacke_ztrexc_work.c lapacke_ztrrfs.c lapacke_ztrrfs_work.c lapacke_ztrsen.c lapacke_ztrsen_work.c lapacke_ztrsna.c lapacke_ztrsna_work.c lapacke_ztrsyl.c lapacke_ztrsyl_work.c lapacke_ztrtri.c lapacke_ztrtri_work.c lapacke_ztrtrs.c lapacke_ztrtrs_work.c lapacke_ztrttf.c lapacke_ztrttf_work.c lapacke_ztrttp.c lapacke_ztrttp_work.c lapacke_ztzrzf.c lapacke_ztzrzf_work.c lapacke_zunbdb.c lapacke_zunbdb_work.c lapacke_zuncsd.c lapacke_zuncsd_work.c lapacke_zuncsd2by1.c lapacke_zuncsd2by1_work.c lapacke_zungbr.c lapacke_zungbr_work.c lapacke_zunghr.c lapacke_zunghr_work.c lapacke_zunglq.c lapacke_zunglq_work.c lapacke_zungql.c lapacke_zungql_work.c lapacke_zungqr.c lapacke_zungqr_work.c lapacke_zungrq.c lapacke_zungrq_work.c lapacke_zungtr.c lapacke_zungtr_work.c lapacke_zunmbr.c lapacke_zunmbr_work.c lapacke_zunmhr.c lapacke_zunmhr_work.c lapacke_zunmlq.c lapacke_zunmlq_work.c lapacke_zunmql.c lapacke_zunmql_work.c lapacke_zunmqr.c lapacke_zunmqr_work.c lapacke_zunmrq.c lapacke_zunmrq_work.c lapacke_zunmrz.c lapacke_zunmrz_work.c lapacke_zunmtr.c lapacke_zunmtr_work.c lapacke_zupgtr.c lapacke_zupgtr_work.c lapacke_zupmtr.c lapacke_zupmtr_work.c lapacke_zsyr.c lapacke_csyr.c lapacke_zsyr_work.c lapacke_csyr_work.c lapacke_ilaver.c ) set(SRCX lapacke_cgbrfsx.c lapacke_cporfsx.c lapacke_dgerfsx.c lapacke_sgbrfsx.c lapacke_ssyrfsx.c lapacke_zherfsx.c lapacke_cgbrfsx_work.c lapacke_cporfsx_work.c lapacke_dgerfsx_work.c lapacke_sgbrfsx_work.c lapacke_ssyrfsx_work.c lapacke_zherfsx_work.c lapacke_cgerfsx.c lapacke_csyrfsx.c lapacke_dporfsx.c lapacke_sgerfsx.c lapacke_zgbrfsx.c lapacke_zporfsx.c lapacke_cgerfsx_work.c lapacke_csyrfsx_work.c lapacke_dporfsx_work.c lapacke_sgerfsx_work.c 
lapacke_zgbrfsx_work.c lapacke_zporfsx_work.c lapacke_cherfsx.c lapacke_dgbrfsx.c lapacke_dsyrfsx.c lapacke_sporfsx.c lapacke_zgerfsx.c lapacke_zsyrfsx.c lapacke_cherfsx_work.c lapacke_dgbrfsx_work.c lapacke_dsyrfsx_work.c lapacke_sporfsx_work.c lapacke_zgerfsx_work.c lapacke_zsyrfsx_work.c lapacke_cgbsvxx.c lapacke_cposvxx.c lapacke_dgesvxx.c lapacke_sgbsvxx.c lapacke_ssysvxx.c lapacke_zhesvxx.c lapacke_cgbsvxx_work.c lapacke_cposvxx_work.c lapacke_dgesvxx_work.c lapacke_sgbsvxx_work.c lapacke_ssysvxx_work.c lapacke_zhesvxx_work.c lapacke_cgesvxx.c lapacke_csysvxx.c lapacke_dposvxx.c lapacke_sgesvxx.c lapacke_zgbsvxx.c lapacke_zposvxx.c lapacke_cgesvxx_work.c lapacke_csysvxx_work.c lapacke_dposvxx_work.c lapacke_sgesvxx_work.c lapacke_zgbsvxx_work.c lapacke_zposvxx_work.c lapacke_chesvxx.c lapacke_dgbsvxx.c lapacke_dsysvxx.c lapacke_sposvxx.c lapacke_zgesvxx.c lapacke_zsysvxx.c lapacke_chesvxx_work.c lapacke_dgbsvxx_work.c lapacke_dsysvxx_work.c lapacke_sposvxx_work.c lapacke_zgesvxx_work.c lapacke_zsysvxx_work.c ) # FILE PARTS OF TMGLIB set(MATGEN lapacke_clatms.c lapacke_clatms_work.c lapacke_dlatms.c lapacke_dlatms_work.c lapacke_slatms.c lapacke_slatms_work.c lapacke_zlatms.c lapacke_zlatms_work.c lapacke_clagge.c lapacke_clagge_work.c lapacke_dlagge.c lapacke_dlagge_work.c lapacke_slagge.c lapacke_slagge_work.c lapacke_zlagge.c lapacke_zlagge_work.c lapacke_claghe.c lapacke_claghe_work.c lapacke_zlaghe.c lapacke_zlaghe_work.c lapacke_clagsy.c lapacke_clagsy_work.c lapacke_dlagsy.c lapacke_dlagsy_work.c lapacke_slagsy.c lapacke_slagsy_work.c lapacke_zlagsy.c lapacke_zlagsy_work.c ) set(Utils_SRC lapacke_cgb_nancheck.c lapacke_dpf_nancheck.c lapacke_ssy_trans.c lapacke_cgb_trans.c lapacke_dpf_trans.c lapacke_stb_nancheck.c lapacke_cge_nancheck.c lapacke_dpo_nancheck.c lapacke_stb_trans.c lapacke_cge_trans.c lapacke_dpo_trans.c lapacke_stf_nancheck.c lapacke_cgg_nancheck.c lapacke_dpp_nancheck.c lapacke_stf_trans.c lapacke_cgg_trans.c lapacke_dpp_trans.c lapacke_stp_nancheck.c lapacke_cgt_nancheck.c lapacke_dpt_nancheck.c lapacke_stp_trans.c lapacke_chb_nancheck.c lapacke_dsb_nancheck.c lapacke_str_nancheck.c lapacke_chb_trans.c lapacke_dsb_trans.c lapacke_str_trans.c lapacke_che_nancheck.c lapacke_dsp_nancheck.c lapacke_xerbla.c lapacke_che_trans.c lapacke_dsp_trans.c lapacke_zgb_nancheck.c lapacke_chp_nancheck.c lapacke_dst_nancheck.c lapacke_zgb_trans.c lapacke_chp_trans.c lapacke_dsy_nancheck.c lapacke_zge_nancheck.c lapacke_chs_nancheck.c lapacke_dsy_trans.c lapacke_zge_trans.c lapacke_chs_trans.c lapacke_dtb_nancheck.c lapacke_zgg_nancheck.c lapacke_c_nancheck.c lapacke_dtb_trans.c lapacke_zgg_trans.c lapacke_cpb_nancheck.c lapacke_dtf_nancheck.c lapacke_zgt_nancheck.c lapacke_cpb_trans.c lapacke_dtf_trans.c lapacke_zhb_nancheck.c lapacke_cpf_nancheck.c lapacke_dtp_nancheck.c lapacke_zhb_trans.c lapacke_cpf_trans.c lapacke_dtp_trans.c lapacke_zhe_nancheck.c lapacke_cpo_nancheck.c lapacke_dtr_nancheck.c lapacke_zhe_trans.c lapacke_cpo_trans.c lapacke_dtr_trans.c lapacke_zhp_nancheck.c lapacke_cpp_nancheck.c lapacke_lsame.c lapacke_zhp_trans.c lapacke_cpp_trans.c lapacke_make_complex_double.c lapacke_zhs_nancheck.c lapacke_cpt_nancheck.c lapacke_make_complex_float.c lapacke_zhs_trans.c lapacke_csp_nancheck.c lapacke_sgb_nancheck.c lapacke_z_nancheck.c lapacke_csp_trans.c lapacke_sgb_trans.c lapacke_zpb_nancheck.c lapacke_cst_nancheck.c lapacke_sge_nancheck.c lapacke_zpb_trans.c lapacke_csy_nancheck.c lapacke_sge_trans.c lapacke_zpf_nancheck.c lapacke_csy_trans.c 
lapacke_sgg_nancheck.c lapacke_zpf_trans.c lapacke_ctb_nancheck.c lapacke_sgg_trans.c lapacke_zpo_nancheck.c lapacke_ctb_trans.c lapacke_sgt_nancheck.c lapacke_zpo_trans.c lapacke_ctf_nancheck.c lapacke_shs_nancheck.c lapacke_zpp_nancheck.c lapacke_ctf_trans.c lapacke_shs_trans.c lapacke_zpp_trans.c lapacke_ctp_nancheck.c lapacke_s_nancheck.c lapacke_zpt_nancheck.c lapacke_ctp_trans.c lapacke_spb_nancheck.c lapacke_zsp_nancheck.c lapacke_ctr_nancheck.c lapacke_spb_trans.c lapacke_zsp_trans.c lapacke_ctr_trans.c lapacke_spf_nancheck.c lapacke_zst_nancheck.c lapacke_dgb_nancheck.c lapacke_spf_trans.c lapacke_zsy_nancheck.c lapacke_dgb_trans.c lapacke_spo_nancheck.c lapacke_zsy_trans.c lapacke_dge_nancheck.c lapacke_spo_trans.c lapacke_ztb_nancheck.c lapacke_dge_trans.c lapacke_spp_nancheck.c lapacke_ztb_trans.c lapacke_dgg_nancheck.c lapacke_spp_trans.c lapacke_ztf_nancheck.c lapacke_dgg_trans.c lapacke_spt_nancheck.c lapacke_ztf_trans.c lapacke_dgt_nancheck.c lapacke_ssb_nancheck.c lapacke_ztp_nancheck.c lapacke_dhs_nancheck.c lapacke_ssb_trans.c lapacke_ztp_trans.c lapacke_dhs_trans.c lapacke_ssp_nancheck.c lapacke_ztr_nancheck.c lapacke_d_nancheck.c lapacke_ssp_trans.c lapacke_ztr_trans.c lapacke_dpb_nancheck.c lapacke_sst_nancheck.c lapacke_dpb_trans.c lapacke_ssy_nancheck.c ) set(LAPACKE_REL_SRC "") if (BUILD_SINGLE) list(APPEND LAPACKE_REL_SRC ${SSRC}) endif () if (BUILD_DOUBLE) list(APPEND LAPACKE_REL_SRC ${DSRC}) endif () if (BUILD_COMPLEX) list(APPEND LAPACKE_REL_SRC ${CSRC}) endif () if (BUILD_COMPLEX16) list(APPEND LAPACKE_REL_SRC ${ZSRC}) endif () # add lapack-netlib folder to the sources set(LAPACKE_SOURCES "") foreach (LAE_FILE ${LAPACKE_REL_SRC}) list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/LAPACKE/src/${LAE_FILE}") endforeach () foreach (Utils_FILE ${Utils_SRC}) list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/LAPACKE/utils/${Utils_FILE}") endforeach () set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include") execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${lapacke_include_dir}/lapacke_mangling_with_flags.h.in" "${lapacke_include_dir}/lapacke_mangling.h") include_directories(${lapacke_include_dir}) set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") OpenBLAS-0.2.20/cmake/openblas.pc.in000066400000000000000000000004561313527062700167650ustar00rootroot00000000000000libdir=@CMAKE_INSTALL_FULL_LIBDIR@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: @OPENBLAS_VERSION@ URL: https://github.com/xianyi/OpenBLAS Libs: -L${libdir} -lopenblas Cflags: -I${includedir} OpenBLAS-0.2.20/cmake/os.cmake000066400000000000000000000054111313527062700156500ustar00rootroot00000000000000## ## Author: Hank Anderson ## Description: Ported from portion of OpenBLAS/Makefile.system ## Detects the OS and sets appropriate variables. 
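# ---------------------------------------------------------------------------
# (Aside, not part of the original file.)  The openblas.pc.in template that
# precedes this file in the archive is what downstream projects consume via
# pkg-config once OpenBLAS is installed.  A hypothetical consumer
# CMakeLists.txt -- the target and source names "demo"/"demo.c" are made up --
# might use it roughly like this; the commands and result variables
# (OPENBLAS_INCLUDE_DIRS and friends) are standard CMake FindPkgConfig
# behaviour, shown here only as a sketch:
#
#   find_package(PkgConfig REQUIRED)
#   pkg_check_modules(OPENBLAS REQUIRED openblas)
#   include_directories(${OPENBLAS_INCLUDE_DIRS})
#   link_directories(${OPENBLAS_LIBRARY_DIRS})
#   add_executable(demo demo.c)
#   target_link_libraries(demo ${OPENBLAS_LIBRARIES})
# ---------------------------------------------------------------------------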
if (${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") set(ENV{MACOSX_DEPLOYMENT_TARGET} "10.2") # TODO: should be exported as an env var set(MD5SUM "md5 -r") endif () if (${CMAKE_SYSTEM_NAME} STREQUAL "FreeBSD") set(MD5SUM "md5 -r") endif () if (${CMAKE_SYSTEM_NAME} STREQUAL "NetBSD") set(MD5SUM "md5 -n") endif () if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") set(EXTRALIB "${EXTRALIB} -lm") set(NO_EXPRECISION 1) endif () if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX") set(EXTRALIB "${EXTRALIB} -lm") endif () # TODO: this is probably meant for mingw, not other windows compilers if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") set(NEED_PIC 0) set(NO_EXPRECISION 1) set(EXTRALIB "${EXTRALIB} -defaultlib:advapi32") # probably not going to use these set(SUFFIX "obj") set(PSUFFIX "pobj") set(LIBSUFFIX "a") if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang") set(CCOMMON_OPT "${CCOMMON_OPT} -DMS_ABI") endif () if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") # Test for supporting MS_ABI # removed string parsing in favor of CMake's version comparison -hpa execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) # GCC Version >=4.7 # It is compatible with MSVC ABI. set(CCOMMON_OPT "${CCOMMON_OPT} -DMS_ABI") endif () endif () # Ensure the correct stack alignment on Win32 # http://permalink.gmane.org/gmane.comp.lib.openblas.general/97 if (${ARCH} STREQUAL "x86") if (NOT MSVC AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") set(CCOMMON_OPT "${CCOMMON_OPT} -mincoming-stack-boundary=2") endif () set(FCOMMON_OPT "${FCOMMON_OPT} -mincoming-stack-boundary=2") endif () endif () if (${CMAKE_SYSTEM_NAME} STREQUAL "Interix") set(NEED_PIC 0) set(NO_EXPRECISION 1) set(INTERIX_TOOL_DIR STREQUAL "/opt/gcc.3.3/i586-pc-interix3/bin") endif () if (CYGWIN) set(NEED_PIC 0) set(NO_EXPRECISION 1) endif () if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Android") if (SMP) set(EXTRALIB "${EXTRALIB} -lpthread") endif () endif () if (QUAD_PRECISION) set(CCOMMON_OPT "${CCOMMON_OPT} -DQUAD_PRECISION") set(NO_EXPRECISION 1) endif () if (${ARCH} STREQUAL "x86") set(NO_EXPRECISION 1) endif () if (UTEST_CHECK) set(CCOMMON_OPT "${CCOMMON_OPT} -DUTEST_CHECK") set(SANITY_CHECK 1) endif () if (SANITY_CHECK) # TODO: need some way to get $(*F) (target filename) set(CCOMMON_OPT "${CCOMMON_OPT} -DSANITY_CHECK -DREFNAME=$(*F)f${BU}") endif () OpenBLAS-0.2.20/cmake/prebuild.cmake000066400000000000000000000113571313527062700170430ustar00rootroot00000000000000## ## Author: Hank Anderson ## Description: Ported from OpenBLAS/Makefile.prebuild ## This is triggered by system.cmake and runs before any of the code is built. ## Creates config.h and Makefile.conf by first running the c_check perl script (which creates those files). ## Next it runs f_check and appends some fortran information to the files. ## Then it runs getarch and getarch_2nd for even more environment information. ## Finally it builds gen_config_h for use at build time to generate config.h. 
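##
## For reference, getarch prints Makefile-style "VARNAME=VALUE" lines that
## ParseGetArchVars() (see cmake/utils.cmake) turns into CMake variables of
## the same names.  A hedged illustration; the values are examples only and
## depend on the CPU detected at configure time:
##
##   CORE=NEHALEM
##   LIBCORE=nehalem
##   NUM_CORES=8
##   SGEMM_UNROLL_M=4
##   SGEMM_UNROLL_N=4
##
## After parsing, these are available as ${CORE}, ${LIBCORE}, ${NUM_CORES}
## and so on for the rest of the configuration.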
# CMake vars set by this file: # CORE # LIBCORE # NUM_CORES # HAVE_MMX # HAVE_SSE # HAVE_SSE2 # HAVE_SSE3 # MAKE # SGEMM_UNROLL_M # SGEMM_UNROLL_N # DGEMM_UNROLL_M # DGEMM_UNROLL_M # QGEMM_UNROLL_N # QGEMM_UNROLL_N # CGEMM_UNROLL_M # CGEMM_UNROLL_M # ZGEMM_UNROLL_N # ZGEMM_UNROLL_N # XGEMM_UNROLL_M # XGEMM_UNROLL_N # CGEMM3M_UNROLL_M # CGEMM3M_UNROLL_N # ZGEMM3M_UNROLL_M # ZGEMM3M_UNROLL_M # XGEMM3M_UNROLL_N # XGEMM3M_UNROLL_N # CPUIDEMU = ../../cpuid/table.o if (DEFINED CPUIDEMU) set(EXFLAGS "-DCPUIDEMU -DVENDOR=99") endif () if (DEFINED TARGET_CORE) # set the C flags for just this file set(GETARCH2_FLAGS "-DBUILD_KERNEL") set(TARGET_MAKE "Makefile_kernel.conf") set(TARGET_CONF "config_kernel.h") else() set(TARGET_MAKE "Makefile.conf") set(TARGET_CONF "config.h") endif () include("${PROJECT_SOURCE_DIR}/cmake/c_check.cmake") if (NOT NOFORTRAN) include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake") endif () # compile getarch set(GETARCH_SRC ${PROJECT_SOURCE_DIR}/getarch.c ${CPUIDEMO} ) if (NOT MSVC) list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S) endif () if (MSVC) #Use generic for MSVC now set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) endif() if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") # disable WindowsStore strict CRT checks set(GETARCH_FLAGS ${GETARCH_FLAGS} -D_CRT_SECURE_NO_WARNINGS) endif () set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build") set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") file(MAKE_DIRECTORY ${GETARCH_DIR}) if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") try_compile(GETARCH_RESULT ${GETARCH_DIR} SOURCES ${GETARCH_SRC} COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} OUTPUT_VARIABLE GETARCH_LOG COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} ) if (NOT ${GETARCH_RESULT}) MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") endif () endif () message(STATUS "Running getarch") # use the cmake binary w/ the -E param to run a shell command in a cross-platform way execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH_BIN} 0 OUTPUT_VARIABLE GETARCH_MAKE_OUT) execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH_BIN} 1 OUTPUT_VARIABLE GETARCH_CONF_OUT) message(STATUS "GETARCH results:\n${GETARCH_MAKE_OUT}") # append config data from getarch to the TARGET file and read in CMake vars file(APPEND ${TARGET_CONF} ${GETARCH_CONF_OUT}) ParseGetArchVars(${GETARCH_MAKE_OUT}) set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build") set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") file(MAKE_DIRECTORY ${GETARCH2_DIR}) if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") try_compile(GETARCH2_RESULT ${GETARCH2_DIR} SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} OUTPUT_VARIABLE GETARCH2_LOG COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} ) if (NOT ${GETARCH2_RESULT}) MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}") endif () endif () # use the cmake binary w/ the -E param to run a shell command in a cross-platform way execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 0 OUTPUT_VARIABLE GETARCH2_MAKE_OUT) execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 1 OUTPUT_VARIABLE GETARCH2_CONF_OUT) # append config data from getarch_2nd to the TARGET file and read in CMake vars file(APPEND ${TARGET_CONF} ${GETARCH2_CONF_OUT}) ParseGetArchVars(${GETARCH2_MAKE_OUT}) # compile get_config_h set(GEN_CONFIG_H_DIR "${PROJECT_BINARY_DIR}/genconfig_h_build") set(GEN_CONFIG_H_BIN 
"gen_config_h${CMAKE_EXECUTABLE_SUFFIX}") set(GEN_CONFIG_H_FLAGS "-DVERSION=\"${OpenBLAS_VERSION}\"") file(MAKE_DIRECTORY ${GEN_CONFIG_H_DIR}) if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR} SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR} OUTPUT_VARIABLE GEN_CONFIG_H_LOG COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN} ) if (NOT ${GEN_CONFIG_H_RESULT}) MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}") endif () endif ()OpenBLAS-0.2.20/cmake/system.cmake000066400000000000000000000314361313527062700165610ustar00rootroot00000000000000## ## Author: Hank Anderson ## Description: Ported from OpenBLAS/Makefile.system ## set(NETLIB_LAPACK_DIR "${PROJECT_SOURCE_DIR}/lapack-netlib") # TODO: Makefile.system detects Darwin (mac) and switches to clang here -hpa # http://stackoverflow.com/questions/714100/os-detecting-makefile # TODO: Makefile.system sets HOSTCC = $(CC) here if not already set -hpa # TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. if (DEFINED TARGET_CORE) set(TARGET ${TARGET_CORE}) endif () # Force fallbacks for 32bit if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) message(STATUS "Compiling a ${BINARY}-bit binary.") set(NO_AVX 1) if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE") set(TARGET "NEHALEM") endif () if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") set(TARGET "BARCELONA") endif () endif () if (DEFINED TARGET) message(STATUS "Targetting the ${TARGET} architecture.") set(GETARCH_FLAGS "-DFORCE_${TARGET}") endif () if (INTERFACE64) message(STATUS "Using 64-bit integers.") set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT") endif () if (NOT DEFINED GEMM_MULTITHREAD_THRESHOLD) set(GEMM_MULTITHREAD_THRESHOLD 4) endif () message(STATUS "GEMM multithread threshold set to ${GEMM_MULTITHREAD_THRESHOLD}.") set(GETARCH_FLAGS "${GETARCH_FLAGS} -DGEMM_MULTITHREAD_THRESHOLD=${GEMM_MULTITHREAD_THRESHOLD}") if (NO_AVX) message(STATUS "Disabling Advanced Vector Extensions (AVX).") set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX") endif () if (NO_AVX2) message(STATUS "Disabling Advanced Vector Extensions 2 (AVX2).") set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX2") endif () if (CMAKE_BUILD_TYPE STREQUAL Debug) set(GETARCH_FLAGS "${GETARCH_FLAGS} -g") endif () # TODO: let CMake handle this? -hpa #if (${QUIET_MAKE}) # set(MAKE "${MAKE} -s") #endif() if (NOT DEFINED NO_PARALLEL_MAKE) set(NO_PARALLEL_MAKE 0) endif () set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_PARALLEL_MAKE=${NO_PARALLEL_MAKE}") if (CMAKE_CXX_COMPILER STREQUAL loongcc) set(GETARCH_FLAGS "${GETARCH_FLAGS} -static") endif () #if don't use Fortran, it will only compile CBLAS. if (ONLY_CBLAS) set(NO_LAPACK 1) else () set(ONLY_CBLAS 0) endif () include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") if (NOT DEFINED NUM_THREADS) set(NUM_THREADS ${NUM_CORES}) endif () if (${NUM_THREADS} EQUAL 1) set(USE_THREAD 0) endif () if (DEFINED USE_THREAD) if (NOT ${USE_THREAD}) unset(SMP) else () set(SMP 1) endif () else () # N.B. 
this is NUM_THREAD in Makefile.system which is probably a bug -hpa if (${NUM_THREADS} EQUAL 1) unset(SMP) else () set(SMP 1) endif () endif () if (${SMP}) message(STATUS "SMP enabled.") endif () if (NOT DEFINED NEED_PIC) set(NEED_PIC 1) endif () # TODO: I think CMake should be handling all this stuff -hpa unset(ARFLAGS) set(CPP "${COMPILER} -E") set(AR "${CROSS_SUFFIX}ar") set(AS "${CROSS_SUFFIX}as") set(LD "${CROSS_SUFFIX}ld") set(RANLIB "${CROSS_SUFFIX}ranlib") set(NM "${CROSS_SUFFIX}nm") set(DLLWRAP "${CROSS_SUFFIX}dllwrap") set(OBJCOPY "${CROSS_SUFFIX}objcopy") set(OBJCONV "${CROSS_SUFFIX}objconv") # OS dependent settings include("${PROJECT_SOURCE_DIR}/cmake/os.cmake") # Architecture dependent settings include("${PROJECT_SOURCE_DIR}/cmake/arch.cmake") # C Compiler dependent settings include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake") if (NOT NOFORTRAN) # Fortran Compiler dependent settings include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") endif () if (BINARY64) if (INTERFACE64) # CCOMMON_OPT += -DUSE64BITINT endif () endif () if (NEED_PIC) if (${CMAKE_C_COMPILER} STREQUAL "IBM") set(CCOMMON_OPT "${CCOMMON_OPT} -qpic=large") else () set(CCOMMON_OPT "${CCOMMON_OPT} -fPIC") endif () if (${F_COMPILER} STREQUAL "SUN") set(FCOMMON_OPT "${FCOMMON_OPT} -pic") else () set(FCOMMON_OPT "${FCOMMON_OPT} -fPIC") endif () endif () if (DYNAMIC_ARCH) set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") endif () if (NO_LAPACK) set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK") #Disable LAPACK C interface set(NO_LAPACKE 1) endif () if (NO_LAPACKE) set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACKE") endif () if (NO_AVX) set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX") endif () if (${ARCH} STREQUAL "x86") set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX") endif () if (NO_AVX2) set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX2") endif () if (SMP) set(CCOMMON_OPT "${CCOMMON_OPT} -DSMP_SERVER") if (${ARCH} STREQUAL "mips64") if (NOT ${CORE} STREQUAL "LOONGSON3B") set(USE_SIMPLE_THREADED_LEVEL3 1) endif () endif () if (USE_OPENMP) # USE_SIMPLE_THREADED_LEVEL3 = 1 # NO_AFFINITY = 1 set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_OPENMP") endif () if (BIGNUMA) set(CCOMMON_OPT "${CCOMMON_OPT} -DBIGNUMA") endif () endif () if (NO_WARMUP) set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_WARMUP") endif () if (CONSISTENT_FPCSR) set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR") endif () # Only for development # set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST") # set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST") # set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_SWITCHING") # set(USE_PAPI 1) if (USE_PAPI) set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_PAPI") set(EXTRALIB "${EXTRALIB} -lpapi -lperfctr") endif () if (DYNAMIC_THREADS) set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_THREADS") endif () set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}") if (USE_SIMPLE_THREADED_LEVEL3) set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3") endif () if (DEFINED LIBNAMESUFFIX) set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") else () set(LIBPREFIX "libopenblas") endif () if (NOT DEFINED SYMBOLPREFIX) set(SYMBOLPREFIX "") endif () if (NOT DEFINED SYMBOLSUFFIX) set(SYMBOLSUFFIX "") endif () set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}") # TODO: nead to convert these Makefiles # include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake if (${CORE} STREQUAL "PPC440") set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_QALLOC") endif () if (${CORE} STREQUAL "PPC440FP2") set(STATIC_ALLOCATION 1) endif () if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Linux") set(NO_AFFINITY 1) endif () if (NOT ${ARCH} STREQUAL "x86_64" AND NOT 
${ARCH} STREQUAL "x86" AND NOT ${CORE} STREQUAL "LOONGSON3B") set(NO_AFFINITY 1) endif () if (NO_AFFINITY) set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AFFINITY") endif () if (FUNCTION_PROFILE) set(CCOMMON_OPT "${CCOMMON_OPT} -DFUNCTION_PROFILE") endif () if (HUGETLB_ALLOCATION) set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_HUGETLB") endif () if (DEFINED HUGETLBFILE_ALLOCATION) set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_HUGETLBFILE -DHUGETLB_FILE_NAME=${HUGETLBFILE_ALLOCATION})") endif () if (STATIC_ALLOCATION) set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_STATIC") endif () if (DEVICEDRIVER_ALLOCATION) set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_DEVICEDRIVER -DDEVICEDRIVER_NAME=\"/dev/mapper\"") endif () if (MIXED_MEMORY_ALLOCATION) set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION") endif () if (${CMAKE_SYSTEM_NAME} STREQUAL "SunOS") set(TAR gtar) set(PATCH gpatch) set(GREP ggrep) else () set(TAR tar) set(PATCH patch) set(GREP grep) endif () if (NOT DEFINED MD5SUM) set(MD5SUM md5sum) endif () set(AWK awk) set(SED sed) set(REVISION "-r${OpenBLAS_VERSION}") set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION}) if (DEBUG) set(COMMON_OPT "${COMMON_OPT} -g") endif () if (NOT DEFINED COMMON_OPT) set(COMMON_OPT "-O2") endif () #For x86 32-bit if (DEFINED BINARY AND BINARY EQUAL 32) if (NOT MSVC) set(COMMON_OPT "${COMMON_OPT} -m32") endif() endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_OPT} ${CCOMMON_OPT}") if(NOT MSVC) set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${COMMON_OPT} ${CCOMMON_OPT}") endif() # TODO: not sure what PFLAGS is -hpa set(PFLAGS "${PFLAGS} ${COMMON_OPT} ${CCOMMON_OPT} -I${TOPDIR} -DPROFILE ${COMMON_PROF}") set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} ${COMMON_OPT} ${FCOMMON_OPT}") # TODO: not sure what FPFLAGS is -hpa set(FPFLAGS "${FPFLAGS} ${COMMON_OPT} ${FCOMMON_OPT} ${COMMON_PROF}") #For LAPACK Fortran codes. set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}") set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}") #Disable -fopenmp for LAPACK Fortran codes on Windows. 
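# A hedged illustration of the filtering below: with a hypothetical
# LAPACK_FFLAGS of "-O2 -fopenmp -frecursive", the string(REPLACE ...) loop
# strips the OpenMP switch and leaves "-O2  -frecursive"; the surrounding
# flags are untouched.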
if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") set(FILTER_FLAGS "-fopenmp;-mp;-openmp;-xopenmp=parralel") foreach (FILTER_FLAG ${FILTER_FLAGS}) string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS}) string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS}) endforeach () endif () if ("${F_COMPILER}" STREQUAL "GFORTRAN") # lapack-netlib is rife with uninitialized warnings -hpa set(LAPACK_FFLAGS "${LAPACK_FFLAGS} -Wno-maybe-uninitialized") endif () set(LAPACK_CFLAGS "${CMAKE_C_CFLAGS} -DHAVE_LAPACK_CONFIG_H") if (INTERFACE64) set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_ILP64") endif () if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DOPENBLAS_OS_WINDOWS") endif () if (${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE") endif () if (NOT DEFINED SUFFIX) set(SUFFIX o) endif () if (NOT DEFINED PSUFFIX) set(PSUFFIX po) endif () if (NOT DEFINED LIBSUFFIX) set(LIBSUFFIX a) endif () if (DYNAMIC_ARCH) if (DEFINED SMP) set(LIBNAME "${LIBPREFIX}p${REVISION}.${LIBSUFFIX}") set(LIBNAME_P "${LIBPREFIX}p${REVISION}_p.${LIBSUFFIX}") else () set(LIBNAME "${LIBPREFIX}${REVISION}.${LIBSUFFIX}") set(LIBNAME_P "${LIBPREFIX}${REVISION}_p.${LIBSUFFIX}") endif () else () if (DEFINED SMP) set(LIBNAME "${LIBPREFIX}_${LIBCORE}p${REVISION}.${LIBSUFFIX}") set(LIBNAME_P "${LIBPREFIX}_${LIBCORE}p${REVISION}_p.${LIBSUFFIX}") else () set(LIBNAME "${LIBPREFIX}_${LIBCORE}${REVISION}.${LIBSUFFIX}") set(LIBNAME_P "${LIBPREFIX}_${LIBCORE}${REVISION}_p.${LIBSUFFIX}") endif () endif () set(LIBDLLNAME "${LIBPREFIX}.dll") set(LIBSONAME "${LIBNAME}.${LIBSUFFIX}.so") set(LIBDYNNAME "${LIBNAME}.${LIBSUFFIX}.dylib") set(LIBDEFNAME "${LIBNAME}.${LIBSUFFIX}.def") set(LIBEXPNAME "${LIBNAME}.${LIBSUFFIX}.exp") set(LIBZIPNAME "${LIBNAME}.${LIBSUFFIX}.zip") set(LIBS "${PROJECT_SOURCE_DIR}/${LIBNAME}") set(LIBS_P "${PROJECT_SOURCE_DIR}/${LIBNAME_P}") set(LIB_COMPONENTS BLAS) if (NOT NO_CBLAS) set(LIB_COMPONENTS "${LIB_COMPONENTS} CBLAS") endif () if (NOT NO_LAPACK) set(LIB_COMPONENTS "${LIB_COMPONENTS} LAPACK") if (NOT NO_LAPACKE) set(LIB_COMPONENTS "${LIB_COMPONENTS} LAPACKE") endif () endif () if (ONLY_CBLAS) set(LIB_COMPONENTS CBLAS) endif () # For GEMM3M set(USE_GEMM3M 0) if (DEFINED ARCH) if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64" OR ${ARCH} STREQUAL "ia64" OR ${ARCH} STREQUAL "MIPS") set(USE_GEMM3M 1) endif () if (${CORE} STREQUAL "generic") set(USE_GEMM3M 0) endif () endif () #export OSNAME #export ARCH #export CORE #export LIBCORE #export PGCPATH #export CONFIG #export CC #export FC #export BU #export FU #export NEED2UNDERSCORES #export USE_THREAD #export NUM_THREADS #export NUM_CORES #export SMP #export MAKEFILE_RULE #export NEED_PIC #export BINARY #export BINARY32 #export BINARY64 #export F_COMPILER #export C_COMPILER #export USE_OPENMP #export CROSS #export CROSS_SUFFIX #export NOFORTRAN #export NO_FBLAS #export EXTRALIB #export CEXTRALIB #export FEXTRALIB #export HAVE_SSE #export HAVE_SSE2 #export HAVE_SSE3 #export HAVE_SSSE3 #export HAVE_SSE4_1 #export HAVE_SSE4_2 #export HAVE_SSE4A #export HAVE_SSE5 #export HAVE_AVX #export HAVE_VFP #export HAVE_VFPV3 #export HAVE_VFPV4 #export HAVE_NEON #export KERNELDIR #export FUNCTION_PROFILE #export TARGET_CORE # #export SGEMM_UNROLL_M #export SGEMM_UNROLL_N #export DGEMM_UNROLL_M #export DGEMM_UNROLL_N #export QGEMM_UNROLL_M #export QGEMM_UNROLL_N #export CGEMM_UNROLL_M #export CGEMM_UNROLL_N #export ZGEMM_UNROLL_M #export ZGEMM_UNROLL_N 
#export XGEMM_UNROLL_M #export XGEMM_UNROLL_N #export CGEMM3M_UNROLL_M #export CGEMM3M_UNROLL_N #export ZGEMM3M_UNROLL_M #export ZGEMM3M_UNROLL_N #export XGEMM3M_UNROLL_M #export XGEMM3M_UNROLL_N #if (USE_CUDA) # export CUDADIR # export CUCC # export CUFLAGS # export CULIB #endif #.SUFFIXES: .$(PSUFFIX) .$(SUFFIX) .f # #.f.$(SUFFIX): # $(FC) $(FFLAGS) -c $< -o $(@F) # #.f.$(PSUFFIX): # $(FC) $(FPFLAGS) -pg -c $< -o $(@F) # these are not cross-platform #ifdef BINARY64 #PATHSCALEPATH = /opt/pathscale/lib/3.1 #PGIPATH = /opt/pgi/linux86-64/7.1-5/lib #else #PATHSCALEPATH = /opt/pathscale/lib/3.1/32 #PGIPATH = /opt/pgi/linux86/7.1-5/lib #endif #ACMLPATH = /opt/acml/4.3.0 #ifneq ($(OSNAME), Darwin) #MKLPATH = /opt/intel/mkl/10.2.2.025/lib #else #MKLPATH = /Library/Frameworks/Intel_MKL.framework/Versions/10.0.1.014/lib #endif #ATLASPATH = /opt/atlas/3.9.17/opteron #FLAMEPATH = $(HOME)/flame/lib #ifneq ($(OSNAME), SunOS) #SUNPATH = /opt/sunstudio12.1 #else #SUNPATH = /opt/SUNWspro #endif OpenBLAS-0.2.20/cmake/utils.cmake000066400000000000000000000346431313527062700164000ustar00rootroot00000000000000# Functions to help with the OpenBLAS build # Reads string from getarch into CMake vars. Format of getarch vars is VARNAME=VALUE function(ParseGetArchVars GETARCH_IN) string(REGEX MATCHALL "[0-9_a-zA-Z]+=[0-9_a-zA-Z]+" GETARCH_RESULT_LIST "${GETARCH_IN}") foreach (GETARCH_LINE ${GETARCH_RESULT_LIST}) # split the line into var and value, then assign the value to a CMake var string(REGEX MATCHALL "[0-9_a-zA-Z]+" SPLIT_VAR "${GETARCH_LINE}") list(GET SPLIT_VAR 0 VAR_NAME) list(GET SPLIT_VAR 1 VAR_VALUE) set(${VAR_NAME} ${VAR_VALUE} PARENT_SCOPE) endforeach () endfunction () # Reads a Makefile into CMake vars. macro(ParseMakefileVars MAKEFILE_IN) message(STATUS "Reading vars from ${MAKEFILE_IN}...") file(STRINGS ${MAKEFILE_IN} makefile_contents) foreach (makefile_line ${makefile_contents}) string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") set(var_name ${CMAKE_MATCH_1}) set(var_value ${CMAKE_MATCH_2}) # check for Makefile variables in the string, e.g. $(TSUFFIX) string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value}) foreach (make_var ${make_var_matches}) # strip out Makefile $() markup string(REGEX REPLACE "\\$\\(([0-9_a-zA-Z]+)\\)" "\\1" make_var ${make_var}) # now replace the instance of the Makefile variable with the value of the CMake variable (note the double quote) string(REPLACE "$(${make_var})" "${${make_var}}" var_value ${var_value}) endforeach () set(${var_name} ${var_value}) else () string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) endif () endif () endforeach () endmacro () # Returns all combinations of the input list, as a list with colon-separated combinations # E.g. input of A B C returns A B C A:B A:C B:C # N.B. The input is meant to be a list, and to past a list to a function in CMake you must quote it (e.g. AllCombinations("${LIST_VAR}")). # #param absent_codes codes to use when an element is absent from a combination. For example, if you have TRANS;UNIT;UPPER you may want the code to be NNL when nothing is present. 
# @returns LIST_OUT a list of combinations # CODES_OUT a list of codes corresponding to each combination, with N meaning the item is not present, and the first letter of the list item meaning it is presen function(AllCombinations list_in absent_codes_in) list(LENGTH list_in list_count) set(num_combos 1) # subtract 1 since we will iterate from 0 to num_combos math(EXPR num_combos "(${num_combos} << ${list_count}) - 1") set(LIST_OUT "") set(CODES_OUT "") foreach (c RANGE 0 ${num_combos}) set(current_combo "") set(current_code "") # this is a little ridiculous just to iterate through a list w/ indices math(EXPR last_list_index "${list_count} - 1") foreach (list_index RANGE 0 ${last_list_index}) math(EXPR bit "1 << ${list_index}") math(EXPR combo_has_bit "${c} & ${bit}") list(GET list_in ${list_index} list_elem) if (combo_has_bit) if (current_combo) set(current_combo "${current_combo}:${list_elem}") else () set(current_combo ${list_elem}) endif () string(SUBSTRING ${list_elem} 0 1 code_char) else () list(GET absent_codes_in ${list_index} code_char) endif () set(current_code "${current_code}${code_char}") endforeach () if (current_combo STREQUAL "") list(APPEND LIST_OUT " ") # Empty set is a valid combination, but CMake isn't appending the empty string for some reason, use a space else () list(APPEND LIST_OUT ${current_combo}) endif () list(APPEND CODES_OUT ${current_code}) endforeach () set(LIST_OUT ${LIST_OUT} PARENT_SCOPE) set(CODES_OUT ${CODES_OUT} PARENT_SCOPE) endfunction () # generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition # @param sources_in the source files to build from # @param defines_in (optional) preprocessor definitions that will be applied to all objects # @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended. # e.g. with DOUBLE set, "i*max" will generate the name "idmax", and "max" will be "dmax" # @param replace_last_with replaces the last character in the filename with this string (e.g. symm_k should be symm_TU) # @param append_with appends the filename with this string (e.g. trmm_R should be trmm_RTUU or some other combination of characters) # @param no_float_type turns off the float type define for this build (e.g. SINGLE/DOUBLE/etc) # @param complex_filename_scheme some routines have separate source files for complex and non-complex float types. # 0 - compiles for all types # 1 - compiles the sources for non-complex types only (SINGLE/DOUBLE) # 2 - compiles for complex types only (COMPLEX/DOUBLE COMPLEX) # 3 - compiles for all types, but changes source names for complex by prepending z (e.g. axpy.c becomes zaxpy.c) # 4 - compiles for complex types only, but changes source names for complex by prepending z (e.g. hemv.c becomes zhemv.c) # STRING - compiles only the given type (e.g. DOUBLE) function(GenerateNamedObjects sources_in) if (DEFINED ARGV1) set(defines_in ${ARGV1}) endif () if (DEFINED ARGV2 AND NOT "${ARGV2}" STREQUAL "") set(name_in ${ARGV2}) # strip off extension for kernel files that pass in the object name. 
get_filename_component(name_in ${name_in} NAME_WE) endif () if (DEFINED ARGV3) set(use_cblas ${ARGV3}) else () set(use_cblas false) endif () if (DEFINED ARGV4) set(replace_last_with ${ARGV4}) endif () if (DEFINED ARGV5) set(append_with ${ARGV5}) endif () if (DEFINED ARGV6) set(no_float_type ${ARGV6}) else () set(no_float_type false) endif () if (no_float_type) set(float_list "DUMMY") # still need to loop once else () set(float_list "${FLOAT_TYPES}") endif () set(real_only false) set(complex_only false) set(mangle_complex_sources false) if (DEFINED ARGV7 AND NOT "${ARGV7}" STREQUAL "") if (${ARGV7} EQUAL 1) set(real_only true) elseif (${ARGV7} EQUAL 2) set(complex_only true) elseif (${ARGV7} EQUAL 3) set(mangle_complex_sources true) elseif (${ARGV7} EQUAL 4) set(mangle_complex_sources true) set(complex_only true) elseif (NOT ${ARGV7} EQUAL 0) set(float_list ${ARGV7}) endif () endif () if (complex_only) list(REMOVE_ITEM float_list "SINGLE") list(REMOVE_ITEM float_list "DOUBLE") elseif (real_only) list(REMOVE_ITEM float_list "COMPLEX") list(REMOVE_ITEM float_list "ZCOMPLEX") endif () set(float_char "") set(OBJ_LIST_OUT "") foreach (float_type ${float_list}) foreach (source_file ${sources_in}) if (NOT no_float_type) string(SUBSTRING ${float_type} 0 1 float_char) string(TOLOWER ${float_char} float_char) endif () if (NOT name_in) get_filename_component(source_name ${source_file} NAME_WE) set(obj_name "${float_char}${source_name}") else () # replace * with float_char if (${name_in} MATCHES "\\*") string(REPLACE "*" ${float_char} obj_name ${name_in}) else () set(obj_name "${float_char}${name_in}") endif () endif () if (replace_last_with) string(REGEX REPLACE ".$" ${replace_last_with} obj_name ${obj_name}) else () set(obj_name "${obj_name}${append_with}") endif () # now add the object and set the defines set(obj_defines ${defines_in}) if (use_cblas) set(obj_name "cblas_${obj_name}") list(APPEND obj_defines "CBLAS") endif () list(APPEND obj_defines "ASMNAME=${FU}${obj_name};ASMFNAME=${FU}${obj_name}${BU};NAME=${obj_name}${BU};CNAME=${obj_name};CHAR_NAME=\"${obj_name}${BU}\";CHAR_CNAME=\"${obj_name}\"") if (${float_type} STREQUAL "DOUBLE" OR ${float_type} STREQUAL "ZCOMPLEX") list(APPEND obj_defines "DOUBLE") endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") list(APPEND obj_defines "COMPLEX") if (mangle_complex_sources) # add a z to the filename get_filename_component(source_name ${source_file} NAME) get_filename_component(source_dir ${source_file} DIRECTORY) string(REPLACE ${source_name} "z${source_name}" source_file ${source_file}) endif () endif () if (VERBOSE_GEN) message(STATUS "${obj_name}:${source_file}") message(STATUS "${obj_defines}") endif () # create a copy of the source to avoid duplicate obj filename problem with ar.exe get_filename_component(source_extension ${source_file} EXT) set(new_source_file "${CMAKE_CURRENT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/${obj_name}${source_extension}") if (IS_ABSOLUTE ${source_file}) set(old_source_file ${source_file}) else () set(old_source_file "${CMAKE_CURRENT_LIST_DIR}/${source_file}") endif () string(REPLACE ";" "\n#define " define_source "${obj_defines}") string(REPLACE "=" " " define_source "${define_source}") file(WRITE ${new_source_file} "#define ${define_source}\n#include \"${old_source_file}\"") list(APPEND SRC_LIST_OUT ${new_source_file}) endforeach () endforeach () list(APPEND OPENBLAS_SRC ${SRC_LIST_OUT}) set(OPENBLAS_SRC ${OPENBLAS_SRC} PARENT_SCOPE) endfunction () # generates object files for each of the 
sources for each of the combinations of the preprocessor definitions passed in # @param sources_in the source files to build from # @param defines_in the preprocessor definitions that will be combined to create the object files # @param all_defines_in (optional) preprocessor definitions that will be applied to all objects # @param replace_scheme If 1, replace the "k" in the filename with the define combo letters. E.g. symm_k.c with TRANS and UNIT defined will be symm_TU. # If 0, it will simply append the code, e.g. symm_L.c with TRANS and UNIT will be symm_LTU. # If 2, it will append the code with an underscore, e.g. symm.c with TRANS and UNIT will be symm_TU. # If 3, it will insert the code *around* the last character with an underscore, e.g. symm_L.c with TRANS and UNIT will be symm_TLU (required by BLAS level2 objects). # If 4, it will insert the code before the last underscore. E.g. trtri_U_parallel with TRANS will be trtri_UT_parallel # @param alternate_name replaces the source name as the object name (define codes are still appended) # @param no_float_type turns off the float type define for this build (e.g. SINGLE/DOUBLE/etc) # @param complex_filename_scheme see GenerateNamedObjects function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_defines_in replace_scheme) set(alternate_name_in "") if (DEFINED ARGV5) set(alternate_name_in ${ARGV5}) endif () set(no_float_type false) if (DEFINED ARGV6) set(no_float_type ${ARGV6}) endif () set(complex_filename_scheme "") if (DEFINED ARGV7) set(complex_filename_scheme ${ARGV7}) endif () AllCombinations("${defines_in}" "${absent_codes_in}") set(define_combos ${LIST_OUT}) set(define_codes ${CODES_OUT}) list(LENGTH define_combos num_combos) math(EXPR num_combos "${num_combos} - 1") foreach (c RANGE 0 ${num_combos}) list(GET define_combos ${c} define_combo) list(GET define_codes ${c} define_code) foreach (source_file ${sources_in}) set(alternate_name ${alternate_name_in}) # replace colon separated list with semicolons, this turns it into a CMake list that we can use foreach with string(REPLACE ":" ";" define_combo ${define_combo}) # now add the object and set the defines set(cur_defines ${define_combo}) if ("${cur_defines}" STREQUAL " ") set(cur_defines ${all_defines_in}) else () list(APPEND cur_defines ${all_defines_in}) endif () set(replace_code "") set(append_code "") if (replace_scheme EQUAL 1) set(replace_code ${define_code}) else () if (replace_scheme EQUAL 2) set(append_code "_${define_code}") elseif (replace_scheme EQUAL 3) if ("${alternate_name}" STREQUAL "") string(REGEX MATCH "[a-zA-Z]\\." last_letter ${source_file}) else () string(REGEX MATCH "[a-zA-Z]$" last_letter ${alternate_name}) endif () # first extract the last letter string(SUBSTRING ${last_letter} 0 1 last_letter) # remove period from match # break the code up into the first letter and the remaining (should only be 2 anyway) string(SUBSTRING ${define_code} 0 1 define_code_first) string(SUBSTRING ${define_code} 1 -1 define_code_second) set(replace_code "${define_code_first}${last_letter}${define_code_second}") elseif (replace_scheme EQUAL 4) # insert code before the last underscore and pass that in as the alternate_name if ("${alternate_name}" STREQUAL "") get_filename_component(alternate_name ${source_file} NAME_WE) endif () set(extra_underscore "") # check if filename has two underscores, insert another if not (e.g. 
getrs_parallel needs to become getrs_U_parallel not getrsU_parallel) string(REGEX MATCH "_[a-zA-Z]+_" underscores ${alternate_name}) string(LENGTH "${underscores}" underscores) if (underscores EQUAL 0) set(extra_underscore "_") endif () string(REGEX REPLACE "(.+)(_[^_]+)$" "\\1${extra_underscore}${define_code}\\2" alternate_name ${alternate_name}) else() set(append_code ${define_code}) # replace_scheme should be 0 endif () endif () GenerateNamedObjects("${source_file}" "${cur_defines}" "${alternate_name}" false "${replace_code}" "${append_code}" "${no_float_type}" "${complex_filename_scheme}") endforeach () endforeach () set(OPENBLAS_SRC ${OPENBLAS_SRC} PARENT_SCOPE) endfunction () OpenBLAS-0.2.20/common.h000066400000000000000000000465641313527062700146240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #ifndef COMMON_H #define COMMON_H #ifdef __cplusplus extern "C" { /* Assume C declarations for C++ */ #endif /* __cplusplus */ #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #ifndef __USE_XOPEN #define __USE_XOPEN #endif #ifndef __USE_SVID #define __USE_SVID #endif #ifdef BUILD_KERNEL #include "config_kernel.h" #else #include "config.h" #endif #undef ENABLE_SSE_EXCEPTION #if defined(SMP_SERVER) || defined(SMP_ONDEMAND) #define SMP #endif #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) #define WINDOWS_ABI #define OS_WINDOWS #ifdef DOUBLE #define DOUBLE_DEFINED DOUBLE #undef DOUBLE #endif #endif #if !defined(NOINCLUDE) && !defined(ASSEMBLER) #include #include #include #if !defined(_MSC_VER) #include #endif #include #ifdef OS_LINUX #include #include #endif #if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_ANDROID) #include #endif #ifdef OS_ANDROID #define NO_SYSV_IPC //Android NDK only supports complex.h since Android 5.0 #if __ANDROID_API__ < 21 #define FORCE_OPENBLAS_COMPLEX_STRUCT #endif #endif #ifdef OS_WINDOWS #ifdef ATOM #define GOTO_ATOM ATOM #undef ATOM #endif #include #include #ifdef GOTO_ATOM #define ATOM GOTO_ATOM #undef GOTO_ATOM #endif #else #include #ifndef NO_SYSV_IPC #include #endif #include #include #include #include #ifdef SMP #include #endif #endif #if defined(OS_SUNOS) #include #endif #ifdef __DECC #include #include #endif #if defined(ARCH_IA64) && defined(ENABLE_SSE_EXCEPTION) #include #endif #endif #if defined(OS_WINDOWS) && defined(DOUBLE_DEFINED) #define DOUBLE DOUBLE_DEFINED #undef DOUBLE_DEFINED #endif #undef DEBUG_INFO #define SMP_DEBUG #undef MALLOC_DEBUG #undef SMP_ALLOC_DEBUG #ifndef ZERO #ifdef XDOUBLE #define ZERO 0.e0L #elif defined DOUBLE #define ZERO 0.e0 #else #define ZERO 0.e0f #endif #endif #ifndef ONE #ifdef XDOUBLE #define ONE 1.e0L #elif defined DOUBLE #define ONE 1.e0 #else #define ONE 1.e0f #endif #endif #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #define ALLOCA_ALIGN 63UL #define NUM_BUFFERS (MAX_CPU_NUMBER * 2) #ifdef NEEDBUNDERSCORE #define BLASFUNC(FUNC) FUNC##_ #else #define BLASFUNC(FUNC) FUNC #endif #undef USE_PTHREAD_LOCK #undef USE_PTHREAD_SPINLOCK #if defined(USE_PTHREAD_LOCK) && defined(USE_PTHREAD_SPINLOCK) #error "You can't specify both LOCK operation!" 
#endif #ifdef SMP #define USE_PTHREAD_LOCK #undef USE_PTHREAD_SPINLOCK #endif #ifdef OS_WINDOWS #undef USE_PTHREAD_LOCK #undef USE_PTHREAD_SPINLOCK #endif #if defined(USE_PTHREAD_LOCK) #define LOCK_COMMAND(x) pthread_mutex_lock(x) #define UNLOCK_COMMAND(x) pthread_mutex_unlock(x) #elif defined(USE_PTHREAD_SPINLOCK) #ifndef ASSEMBLER typedef volatile int pthread_spinlock_t; int pthread_spin_lock (pthread_spinlock_t *__lock); int pthread_spin_unlock (pthread_spinlock_t *__lock); #endif #define LOCK_COMMAND(x) pthread_spin_lock(x) #define UNLOCK_COMMAND(x) pthread_spin_unlock(x) #else #define LOCK_COMMAND(x) blas_lock(x) #define UNLOCK_COMMAND(x) blas_unlock(x) #endif #define GOTO_SHMID 0x510510 #if 0 #ifndef __CUDACC__ #define __global__ #define __device__ #define __host__ #define __shared__ #endif #endif #ifndef ASSEMBLER #ifdef QUAD_PRECISION typedef struct { unsigned long x[2]; } xdouble; #elif defined EXPRECISION #define xdouble long double #else #define xdouble double #endif #if defined(OS_WINDOWS) && defined(__64BIT__) typedef long long BLASLONG; typedef unsigned long long BLASULONG; #else typedef long BLASLONG; typedef unsigned long BLASULONG; #endif #ifdef USE64BITINT typedef BLASLONG blasint; #else typedef int blasint; #endif #else #ifdef USE64BITINT #define INTSHIFT 3 #define INTSIZE 8 #else #define INTSHIFT 2 #define INTSIZE 4 #endif #endif #ifdef XDOUBLE #define FLOAT xdouble #ifdef QUAD_PRECISION #define XFLOAT xidouble #endif #ifdef QUAD_PRECISION #define SIZE 32 #define BASE_SHIFT 5 #define ZBASE_SHIFT 6 #else #define SIZE 16 #define BASE_SHIFT 4 #define ZBASE_SHIFT 5 #endif #elif defined(DOUBLE) #define FLOAT double #define SIZE 8 #define BASE_SHIFT 3 #define ZBASE_SHIFT 4 #else #define FLOAT float #define SIZE 4 #define BASE_SHIFT 2 #define ZBASE_SHIFT 3 #endif #ifndef XFLOAT #define XFLOAT FLOAT #endif #ifndef COMPLEX #define COMPSIZE 1 #else #define COMPSIZE 2 #endif #define Address_H(x) (((x)+(1<<15))>>16) #define Address_L(x) ((x)-((Address_H(x))<<16)) #ifndef MAX_CPU_NUMBER #define MAX_CPU_NUMBER 2 #endif #if defined(OS_SUNOS) #define YIELDING thr_yield() #endif #if defined(OS_WINDOWS) #if defined(_MSC_VER) && !defined(__clang__) #define YIELDING YieldProcessor() #else #define YIELDING SwitchToThread() #endif #endif #if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) || defined(ARMV5) #define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); #endif #ifdef BULLDOZER #ifndef YIELDING #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); #endif #endif #ifdef POWER8 #ifndef YIELDING #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); #endif #endif /* #ifdef PILEDRIVER #ifndef YIELDING #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); #endif #endif */ /* #ifdef STEAMROLLER #ifndef YIELDING #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); #endif #endif */ #ifndef YIELDING #define YIELDING sched_yield() #endif /*** To alloc job_t on heap or statck. 
please https://github.com/xianyi/OpenBLAS/issues/246 ***/ #if defined(OS_WINDOWS) #define GETRF_MEM_ALLOC_THRESHOLD 32 #define BLAS3_MEM_ALLOC_THRESHOLD 32 #endif #ifndef GETRF_MEM_ALLOC_THRESHOLD #define GETRF_MEM_ALLOC_THRESHOLD 80 #endif #ifndef BLAS3_MEM_ALLOC_THRESHOLD #define BLAS3_MEM_ALLOC_THRESHOLD 160 #endif #ifdef QUAD_PRECISION #include "common_quad.h" #endif #ifdef ARCH_ALPHA #include "common_alpha.h" #endif #ifdef ARCH_X86 #include "common_x86.h" #endif #ifdef ARCH_X86_64 #include "common_x86_64.h" #endif #ifdef ARCH_IA64 #include "common_ia64.h" #endif #ifdef ARCH_POWER #include "common_power.h" #endif #ifdef sparc #include "common_sparc.h" #endif #ifdef ARCH_MIPS #include "common_mips.h" #endif #ifdef ARCH_MIPS64 #include "common_mips64.h" #endif #ifdef ARCH_ARM #include "common_arm.h" #endif #ifdef ARCH_ARM64 #include "common_arm64.h" #endif #ifdef ARCH_ZARCH #include "common_zarch.h" #endif #ifndef ASSEMBLER #ifdef OS_WINDOWSSTORE typedef char env_var_t[MAX_PATH]; #define readenv(p, n) 0 #else #ifdef OS_WINDOWS typedef char env_var_t[MAX_PATH]; #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) #else typedef char* env_var_t; #define readenv(p, n) ((p)=getenv(n)) #endif #endif #if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS) #ifdef _POSIX_MONOTONIC_CLOCK #if defined(__GLIBC_PREREQ) // cut the if condition if two lines, otherwise will fail at __GLIBC_PREREQ(2, 17) #if __GLIBC_PREREQ(2, 17) // don't require -lrt #define USE_MONOTONIC #endif #elif defined(OS_ANDROID) #define USE_MONOTONIC #endif #endif /* use similar scale as x86 rdtsc for timeouts to work correctly */ static inline unsigned long long rpcc(void){ #ifdef USE_MONOTONIC struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec; #else struct timeval tv; gettimeofday(&tv,NULL); return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000; #endif } #define RPCC_DEFINED #define RPCC64BIT #endif // !RPCC_DEFINED #if !defined(BLAS_LOCK_DEFINED) && defined(__GNUC__) static void __inline blas_lock(volatile BLASULONG *address){ do { while (*address) {YIELDING;}; } while (!__sync_bool_compare_and_swap(address, 0, 1)); } #define BLAS_LOCK_DEFINED #endif #ifndef RPCC_DEFINED #error "rpcc() implementation is missing for your platform" #endif #ifndef BLAS_LOCK_DEFINED #error "blas_lock() implementation is missing for your platform" #endif #endif // !ASSEMBLER #ifdef OS_LINUX #include "common_linux.h" #endif #define MMAP_ACCESS (PROT_READ | PROT_WRITE) #ifdef __NetBSD__ #define MMAP_POLICY (MAP_PRIVATE | MAP_ANON) #else #define MMAP_POLICY (MAP_PRIVATE | MAP_ANONYMOUS) #endif #include "param.h" #include "common_param.h" #ifndef STDERR #define STDERR stderr #endif #ifndef MASK #define MASK(a, b) (((a) + ((b) - 1)) & ~((b) - 1)) #endif #if defined(XDOUBLE) || defined(DOUBLE) #define FLOATRET FLOAT #else #ifdef NEED_F2CCONV #define FLOATRET double #else #define FLOATRET float #endif #endif #ifndef ASSEMBLER #ifndef NOINCLUDE /* Inclusion of a standard header file is needed for definition of __STDC_* predefined macros with some compilers (e.g. GCC 4.7 on Linux). This occurs as a side effect of including either or . */ #include #endif // NOINCLUDE /* C99 supports complex floating numbers natively, which GCC also offers as an extension since version 3.0. If neither are available, use a compatible structure as fallback (see Clause 6.2.5.13 of the C99 standard). 
*/ #if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ (__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) #define OPENBLAS_COMPLEX_C99 #ifndef __cplusplus #include #endif typedef float _Complex openblas_complex_float; typedef double _Complex openblas_complex_double; typedef xdouble _Complex openblas_complex_xdouble; #define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I)) #define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I)) #define openblas_make_complex_xdouble(real, imag) ((real) + ((imag) * _Complex_I)) #else #define OPENBLAS_COMPLEX_STRUCT typedef struct { float real, imag; } openblas_complex_float; typedef struct { double real, imag; } openblas_complex_double; typedef struct { xdouble real, imag; } openblas_complex_xdouble; #define openblas_make_complex_float(real, imag) {(real), (imag)} #define openblas_make_complex_double(real, imag) {(real), (imag)} #define openblas_make_complex_xdouble(real, imag) {(real), (imag)} #endif #ifdef XDOUBLE #define OPENBLAS_COMPLEX_FLOAT openblas_complex_xdouble #define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_xdouble(r,i) #elif defined(DOUBLE) #define OPENBLAS_COMPLEX_FLOAT openblas_complex_double #define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_double(r,i) #else #define OPENBLAS_COMPLEX_FLOAT openblas_complex_float #define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_float(r,i) #endif #if defined(C_PGI) || defined(C_SUN) #if defined(__STDC_IEC_559_COMPLEX__) #define CREAL(X) creal(X) #define CIMAG(X) cimag(X) #else #define CREAL(X) (*((FLOAT *)&X + 0)) #define CIMAG(X) (*((FLOAT *)&X + 1)) #endif #else #ifdef OPENBLAS_COMPLEX_STRUCT #define CREAL(Z) ((Z).real) #define CIMAG(Z) ((Z).imag) #else #define CREAL __real__ #define CIMAG __imag__ #endif #endif #endif // ASSEMBLER #ifndef IFLUSH #define IFLUSH #endif #ifndef IFLUSH_HALF #define IFLUSH_HALF #endif #if defined(C_GCC) && (( __GNUC__ <= 3) || ((__GNUC__ == 4) && (__GNUC_MINOR__ < 2))) #ifdef USE_OPENMP #undef USE_OPENMP #endif #endif #if defined(C_MSVC) #define inline __inline #endif #ifndef ASSEMBLER #ifndef MIN #define MIN(a,b) (a>b? 
b:a) #endif #ifndef MAX #define MAX(a,b) (a 0x60) (a) -= 0x20;} #if defined(__FreeBSD__) || defined(__APPLE__) #define MAP_ANONYMOUS MAP_ANON #endif /* Common Memory Management Routine */ void blas_set_parameter(void); int blas_get_cpu_number(void); void *blas_memory_alloc (int); void blas_memory_free (void *); void *blas_memory_alloc_nolock (int); //use malloc without blas_lock void blas_memory_free_nolock (void *); int get_num_procs (void); #if defined(OS_LINUX) && defined(SMP) && !defined(NO_AFFINITY) int get_num_nodes (void); int get_num_proc (int); int get_node_equal (void); #endif void goto_set_num_threads(int); void gotoblas_affinity_init(void); void gotoblas_affinity_quit(void); void gotoblas_dynamic_init(void); void gotoblas_dynamic_quit(void); void gotoblas_profile_init(void); void gotoblas_profile_quit(void); #ifdef USE_OPENMP #ifndef C_MSVC int omp_in_parallel(void); int omp_get_num_procs(void); #else __declspec(dllimport) int __cdecl omp_in_parallel(void); __declspec(dllimport) int __cdecl omp_get_num_procs(void); #endif #else #ifdef __ELF__ int omp_in_parallel (void) __attribute__ ((weak)); int omp_get_num_procs(void) __attribute__ ((weak)); #endif #endif static __inline void blas_unlock(volatile BLASULONG *address){ MB; *address = 0; } #ifdef OS_WINDOWSSTORE static __inline int readenv_atoi(char *env) { return 0; } #else #ifdef OS_WINDOWS static __inline int readenv_atoi(char *env) { env_var_t p; return readenv(p,env) ? 0 : atoi(p); } #else static __inline int readenv_atoi(char *env) { char *p; if (( p = getenv(env) )) return (atoi(p)); else return(0); } #endif #endif #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) static __inline void compinv(FLOAT *b, FLOAT ar, FLOAT ai){ #ifndef UNIT FLOAT ratio, den; if ( #ifdef XDOUBLE (fabsl(ar)) >= (fabsl(ai)) #elif defined DOUBLE (fabs (ar)) >= (fabs (ai)) #else (fabsf(ar)) >= (fabsf(ai)) #endif ) { ratio = ai / ar; den = (FLOAT)(ONE / (ar * (ONE + ratio * ratio))); ar = den; ai = -ratio * den; } else { ratio = ar / ai; den = (FLOAT)(ONE /(ai * (ONE + ratio * ratio))); ar = ratio * den; ai = -den; } b[0] = ar; b[1] = ai; #else b[0] = ONE; b[1] = ZERO; #endif } #endif #ifdef MALLOC_DEBUG void *blas_debug_alloc(int); void *blas_debug_free(void *); #undef malloc #undef free #define malloc(a) blas_debug_alloc(a) #define free(a) blas_debug_free (a) #endif #ifndef COPYOVERHEAD #define GEMMRETTYPE int #else typedef struct { double outercopy; double innercopy; double kernel; double mflops; } copyoverhead_t; #define GEMMRETTYPE copyoverhead_t #endif #endif #ifndef BUILD_KERNEL #define KNAME(A, B) A #else #define KNAME(A, B) A##B #endif #include "common_interface.h" #ifdef SANITY_CHECK #include "common_reference.h" #endif #include "common_macro.h" #include "common_level1.h" #include "common_level2.h" #include "common_level3.h" #include "common_lapack.h" #ifdef CBLAS # define OPENBLAS_CONST /* see comment in cblas.h */ # include "cblas.h" #endif #ifndef ASSEMBLER #include "common_stackalloc.h" #if 0 #include "symcopy.h" #endif #if defined(SMP_SERVER) && defined(SMP_ONDEMAND) #error Both SMP_SERVER and SMP_ONDEMAND are specified. 
#endif #if defined(SMP_SERVER) || defined(SMP_ONDEMAND) #include "common_thread.h" #endif #endif #define INFO_NUM 99 #ifndef DEFAULT_CPU_NUMBER #define DEFAULT_CPU_NUMBER 4 #endif #ifndef IDEBUG_START #define IDEBUG_START #endif #ifndef IDEBUG_END #define IDEBUG_END #endif #if !defined(ASSEMBLER) && defined(FUNCTION_PROFILE) typedef struct { int func; unsigned long long calls, fops, area, cycles, tcycles; } func_profile_t; extern func_profile_t function_profile_table[]; extern int gotoblas_profile; #ifdef XDOUBLE #define NUMOPT QNUMOPT #elif defined DOUBLE #define NUMOPT DNUMOPT #else #define NUMOPT SNUMOPT #endif #define FUNCTION_PROFILE_START() { unsigned long long profile_start = rpcc(), profile_end; #ifdef SMP #define FUNCTION_PROFILE_END(COMP, AREA, OPS) \ if (gotoblas_profile) { \ profile_end = rpcc(); \ function_profile_table[PROFILE_FUNC_NAME].calls ++; \ function_profile_table[PROFILE_FUNC_NAME].area += SIZE * COMPSIZE * (AREA); \ function_profile_table[PROFILE_FUNC_NAME].fops += (COMP) * (OPS) / NUMOPT; \ function_profile_table[PROFILE_FUNC_NAME].cycles += (profile_end - profile_start); \ function_profile_table[PROFILE_FUNC_NAME].tcycles += blas_cpu_number * (profile_end - profile_start); \ } \ } #else #define FUNCTION_PROFILE_END(COMP, AREA, OPS) \ if (gotoblas_profile) { \ profile_end = rpcc(); \ function_profile_table[PROFILE_FUNC_NAME].calls ++; \ function_profile_table[PROFILE_FUNC_NAME].area += SIZE * COMPSIZE * (AREA); \ function_profile_table[PROFILE_FUNC_NAME].fops += (COMP) * (OPS) / NUMOPT; \ function_profile_table[PROFILE_FUNC_NAME].cycles += (profile_end - profile_start); \ function_profile_table[PROFILE_FUNC_NAME].tcycles += (profile_end - profile_start); \ } \ } #endif #else #define FUNCTION_PROFILE_START() #define FUNCTION_PROFILE_END(COMP, AREA, OPS) #endif #if 1 #define PRINT_DEBUG_CNAME #define PRINT_DEBUG_NAME #else #define PRINT_DEBUG_CNAME if (readenv_atoi("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_CNAME) #define PRINT_DEBUG_NAME if (readenv_atoi("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_NAME) #endif #ifdef __cplusplus } #endif /* __cplusplus */ #endif OpenBLAS-0.2.20/common_alpha.h000066400000000000000000000125571313527062700157640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #ifndef COMMON_ALPHA #define COMMON_ALPHA #ifndef ASSEMBLER #define MB asm("mb") #define WMB asm("wmb") static void __inline blas_lock(unsigned long *address){ #ifndef __DECC unsigned long tmp1, tmp2; asm volatile( "1: ldq %1, %0\n" " bne %1, 2f\n" " ldq_l %1, %0\n" " bne %1, 2f\n" " or %1, 1, %2\n" " stq_c %2, %0\n" " beq %2, 2f\n" " mb\n " " br $31, 3f\n" "2: br $31, 1b\n" "3:\n" : "=m"(*address), "=&r"(tmp1), "=&r"(tmp2) : : "memory"); #else asm ( "10:" " ldq %t0, 0(%a0); " " bne %t0, 20f; " " ldq_l %t0, 0(%a0); " " bne %t0, 20f; " " or %t0, 1, %t1;" " stq_c %t1, 0(%a0); " " beq %t1, 20f; " " mb; " " br %r31,30f; " "20: " " br %r31,10b; " "30:", address); #endif } #define BLAS_LOCK_DEFINED static __inline unsigned int rpcc(void){ unsigned int r0; #ifndef __DECC asm __volatile__("rpcc %0" : "=r"(r0) : : "memory"); #else r0 = asm("rpcc %v0"); #endif return r0; } #define RPCC_DEFINED #define HALT ldq $0, 0($0) #ifndef __DECC #define GET_IMAGE(res) asm __volatile__("fmov $f1, %0" : "=f"(res) : : "memory") #else #define GET_IMAGE(res) res = dasm("fmov $f1, %f0") #endif #ifdef SMP #ifdef USE64BITINT static __inline long blas_quickdivide(long x, long y){ return x/y; } #else extern unsigned int blas_quick_divide_table[]; static __inline int blas_quickdivide(unsigned int x, unsigned int y){ if (y <= 1) return x; return (int)((x * (unsigned long)blas_quick_divide_table[y]) >> 32); } #endif #endif #define BASE_ADDRESS ((0x1b0UL << 33) | (0x1c0UL << 23) | (0x000UL << 13)) #ifndef PAGESIZE #define PAGESIZE ( 8UL << 10) #define HUGE_PAGESIZE ( 4 << 20) #endif #define BUFFER_SIZE (32UL << 20) #else #ifndef F_INTERFACE #define REALNAME ASMNAME #else #define REALNAME ASMFNAME #endif #define PROLOGUE \ .arch ev6; \ .set noat; \ .set noreorder; \ .text; \ .align 5; \ .globl REALNAME; \ .ent REALNAME; \ REALNAME: #ifdef PROFILE #define PROFCODE \ ldgp $gp, 0($27); \ lda $28, _mcount; \ jsr $28, ($28), _mcount; \ .prologue 1 #else #define PROFCODE .prologue 0 #endif #if defined(__linux__) && defined(__ELF__) #define GNUSTACK .section .note.GNU-stack,"",@progbits #else #define GNUSTACK #endif #define EPILOGUE \ .end REALNAME; \ .ident VERSION; \ GNUSTACK #endif #ifdef DOUBLE #define SXADDQ s8addq #define SXSUBL s8subl #define LD ldt #define ST stt #define STQ stq #define ADD addt/su #define SUB subt/su #define MUL mult/su #define DIV divt/su #else #define SXADDQ s4addq #define SXSUBL s4subl #define LD lds #define ST sts #define STQ stl #define ADD adds/su #define SUB subs/su #define MUL muls/su #define DIV divs/su #endif #endif 
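/*
 * Hedged sketch of the multiply-and-shift division behind the Alpha
 * blas_quickdivide() above: blas_quick_divide_table[y] is assumed to hold a
 * scaled reciprocal close to 2^32 / y, so x / y turns into one multiply and
 * one shift.  The table entry here is computed locally for illustration
 * only; the real table is defined elsewhere in OpenBLAS and may differ.
 */
#include <stdio.h>

static unsigned int quick_divide_entry(unsigned int y) {
  /* ceil(2^32 / y), the assumed form of the table entries */
  return (unsigned int)((0xFFFFFFFFull + y) / y);
}

static unsigned int quickdivide_sketch(unsigned int x, unsigned int y) {
  if (y <= 1) return x;   /* mirrors the y <= 1 fast path in common_alpha.h */
  return (unsigned int)(((unsigned long long)x * quick_divide_entry(y)) >> 32);
}

int main(void) {
  printf("%u %u\n", quickdivide_sketch(100, 7), quickdivide_sketch(96, 8));
  /* prints "14 12" */
  return 0;
}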
OpenBLAS-0.2.20/common_arm.h000066400000000000000000000071011313527062700154430ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #ifndef COMMON_ARM #define COMMON_ARM #if defined(ARMV5) || defined(ARMV6) #define MB #define WMB #else #define MB __asm__ __volatile__ ("dmb ish" : : : "memory") #define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory") #endif #define INLINE inline #define RETURN_BY_COMPLEX #ifndef ASSEMBLER #if defined(ARMV6) || defined(ARMV7) || defined(ARMV8) static void __inline blas_lock(volatile BLASULONG *address){ int register ret; do { while (*address) {YIELDING;}; __asm__ __volatile__( "ldrex r2, [%1] \n\t" "strex %0, %2, [%1] \n\t" "orr %0, r2 \n\t" : "=&r"(ret) : "r"(address), "r"(1) : "memory", "r2" ); } while (ret); MB; } #define BLAS_LOCK_DEFINED #endif static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } #if !defined(HAVE_VFP) /* no FPU, soft float */ #define GET_IMAGE(res) #elif defined(DOUBLE) #define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory") #else #define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory") #endif #define GET_IMAGE_CANCEL #endif #ifndef F_INTERFACE #define REALNAME ASMNAME #else #define REALNAME ASMFNAME #endif #if defined(ASSEMBLER) && !defined(NEEDPARAM) #define PROLOGUE \ .arm ;\ .global REALNAME ;\ REALNAME: #define EPILOGUE #define PROFCODE #endif #define SEEK_ADDRESS #ifndef PAGESIZE #define PAGESIZE ( 4 << 10) #endif #define HUGE_PAGESIZE ( 4 << 20) #define BUFFER_SIZE (16 << 20) #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) #ifndef MAP_ANONYMOUS #define MAP_ANONYMOUS MAP_ANON #endif #if !defined(ARMV5) && !defined(ARMV6) && !defined(ARMV7) && !defined(ARMV8) #error "you must define ARMV5, ARMV6, ARMV7 or ARMV8" #endif #endif 
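/*
 * Hedged, self-contained illustration of the acquire/release pattern that
 * blas_lock()/blas_unlock() implement: the ARM version above uses
 * ldrex/strex, while the generic GCC fallback in common.h spins and then
 * uses __sync_bool_compare_and_swap; blas_unlock() issues a memory barrier
 * (MB) before storing 0.  All names below are illustrative only.
 */
#include <stdio.h>

static volatile unsigned long demo_lock = 0;
static int demo_counter = 0;

static void demo_acquire(volatile unsigned long *address) {
  do {
    while (*address) { /* spin; OpenBLAS yields here via YIELDING */ }
  } while (!__sync_bool_compare_and_swap(address, 0UL, 1UL));
}

static void demo_release(volatile unsigned long *address) {
  __sync_synchronize();   /* full barrier, the role of MB in blas_unlock() */
  *address = 0;
}

int main(void) {
  demo_acquire(&demo_lock);
  demo_counter++;               /* critical section */
  demo_release(&demo_lock);
  printf("%d\n", demo_counter); /* prints 1 */
  return 0;
}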
OpenBLAS-0.2.20/common_arm64.h000066400000000000000000000075461313527062700156320ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #ifndef COMMON_ARM64 #define COMMON_ARM64 #define MB __asm__ __volatile__ ("dmb ish" : : : "memory") #define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory") #define INLINE inline #ifdef F_INTERFACE_FLANG #define RETURN_BY_STACK #else #define RETURN_BY_COMPLEX #endif #ifndef ASSEMBLER static void __inline blas_lock(volatile BLASULONG *address){ BLASULONG ret; do { while (*address) {YIELDING;}; __asm__ __volatile__( "mov x4, #1 \n\t" "1: \n\t" "ldaxr x2, [%1] \n\t" "cbnz x2, 1b \n\t" "2: \n\t" "stxr w3, x4, [%1] \n\t" "cbnz w3, 1b \n\t" "mov %0, #0 \n\t" : "=r"(ret), "=r"(address) : "1"(address) : "memory", "x2" , "x3", "x4" ); } while (ret); } #define BLAS_LOCK_DEFINED static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } #if defined(DOUBLE) #define GET_IMAGE(res) __asm__ __volatile__("str d1, %0" : "=m"(res) : : "memory") #else #define GET_IMAGE(res) __asm__ __volatile__("str s1, %0" : "=m"(res) : : "memory") #endif #define GET_IMAGE_CANCEL #endif #ifndef F_INTERFACE #define REALNAME ASMNAME #else #define REALNAME ASMFNAME #endif #if defined(ASSEMBLER) && !defined(NEEDPARAM) #define PROLOGUE \ .text ;\ .align 4 ;\ .global REALNAME ;\ .type REALNAME, %function ;\ REALNAME: #define EPILOGUE #define PROFCODE #endif #define SEEK_ADDRESS #ifndef PAGESIZE #define PAGESIZE ( 4 << 10) #endif #define HUGE_PAGESIZE ( 4 << 20) #if defined(CORTEXA57) #define BUFFER_SIZE (20 << 20) #else #define BUFFER_SIZE (16 << 20) #endif #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) #ifndef MAP_ANONYMOUS #define MAP_ANONYMOUS MAP_ANON #endif #endif OpenBLAS-0.2.20/common_c.h000066400000000000000000000534421313527062700151170ustar00rootroot00000000000000#ifndef COMMON_C_H #define COMMON_C_H #ifndef DYNAMIC_ARCH #define CAMAX_K camax_k 
#define CAMIN_K camin_k #define CMAX_K cmax_k #define CMIN_K cmin_k #define ICAMAX_K icamax_k #define ICAMIN_K icamin_k #define ICMAX_K icmax_k #define ICMIN_K icmin_k #define CASUM_K casum_k #define CAXPYU_K caxpy_k #define CAXPYC_K caxpyc_k #define CCOPY_K ccopy_k #define CDOTU_K cdotu_k #define CDOTC_K cdotc_k #define CNRM2_K cnrm2_k #define CSCAL_K cscal_k #define CSWAP_K cswap_k #define CROT_K csrot_k #define CGEMV_N cgemv_n #define CGEMV_T cgemv_t #define CGEMV_R cgemv_r #define CGEMV_C cgemv_c #define CGEMV_O cgemv_o #define CGEMV_U cgemv_u #define CGEMV_S cgemv_s #define CGEMV_D cgemv_d #define CGERU_K cgeru_k #define CGERC_K cgerc_k #define CGERV_K cgerv_k #define CGERD_K cgerd_k #define CSYMV_U csymv_U #define CSYMV_L csymv_L #define CHEMV_U chemv_U #define CHEMV_L chemv_L #define CHEMV_V chemv_V #define CHEMV_M chemv_M #define CSYMV_THREAD_U csymv_thread_U #define CSYMV_THREAD_L csymv_thread_L #define CHEMV_THREAD_U chemv_thread_U #define CHEMV_THREAD_L chemv_thread_L #define CHEMV_THREAD_V chemv_thread_V #define CHEMV_THREAD_M chemv_thread_M #define CGEMM_ONCOPY cgemm_oncopy #define CGEMM_OTCOPY cgemm_otcopy #if CGEMM_DEFAULT_UNROLL_M == CGEMM_DEFAULT_UNROLL_N #define CGEMM_INCOPY cgemm_oncopy #define CGEMM_ITCOPY cgemm_otcopy #else #define CGEMM_INCOPY cgemm_incopy #define CGEMM_ITCOPY cgemm_itcopy #endif #define CTRMM_OUNUCOPY ctrmm_ounucopy #define CTRMM_OUNNCOPY ctrmm_ounncopy #define CTRMM_OUTUCOPY ctrmm_outucopy #define CTRMM_OUTNCOPY ctrmm_outncopy #define CTRMM_OLNUCOPY ctrmm_olnucopy #define CTRMM_OLNNCOPY ctrmm_olnncopy #define CTRMM_OLTUCOPY ctrmm_oltucopy #define CTRMM_OLTNCOPY ctrmm_oltncopy #define CTRSM_OUNUCOPY ctrsm_ounucopy #define CTRSM_OUNNCOPY ctrsm_ounncopy #define CTRSM_OUTUCOPY ctrsm_outucopy #define CTRSM_OUTNCOPY ctrsm_outncopy #define CTRSM_OLNUCOPY ctrsm_olnucopy #define CTRSM_OLNNCOPY ctrsm_olnncopy #define CTRSM_OLTUCOPY ctrsm_oltucopy #define CTRSM_OLTNCOPY ctrsm_oltncopy #if CGEMM_DEFAULT_UNROLL_M == CGEMM_DEFAULT_UNROLL_N #define CTRMM_IUNUCOPY ctrmm_ounucopy #define CTRMM_IUNNCOPY ctrmm_ounncopy #define CTRMM_IUTUCOPY ctrmm_outucopy #define CTRMM_IUTNCOPY ctrmm_outncopy #define CTRMM_ILNUCOPY ctrmm_olnucopy #define CTRMM_ILNNCOPY ctrmm_olnncopy #define CTRMM_ILTUCOPY ctrmm_oltucopy #define CTRMM_ILTNCOPY ctrmm_oltncopy #define CTRSM_IUNUCOPY ctrsm_ounucopy #define CTRSM_IUNNCOPY ctrsm_ounncopy #define CTRSM_IUTUCOPY ctrsm_outucopy #define CTRSM_IUTNCOPY ctrsm_outncopy #define CTRSM_ILNUCOPY ctrsm_olnucopy #define CTRSM_ILNNCOPY ctrsm_olnncopy #define CTRSM_ILTUCOPY ctrsm_oltucopy #define CTRSM_ILTNCOPY ctrsm_oltncopy #else #define CTRMM_IUNUCOPY ctrmm_iunucopy #define CTRMM_IUNNCOPY ctrmm_iunncopy #define CTRMM_IUTUCOPY ctrmm_iutucopy #define CTRMM_IUTNCOPY ctrmm_iutncopy #define CTRMM_ILNUCOPY ctrmm_ilnucopy #define CTRMM_ILNNCOPY ctrmm_ilnncopy #define CTRMM_ILTUCOPY ctrmm_iltucopy #define CTRMM_ILTNCOPY ctrmm_iltncopy #define CTRSM_IUNUCOPY ctrsm_iunucopy #define CTRSM_IUNNCOPY ctrsm_iunncopy #define CTRSM_IUTUCOPY ctrsm_iutucopy #define CTRSM_IUTNCOPY ctrsm_iutncopy #define CTRSM_ILNUCOPY ctrsm_ilnucopy #define CTRSM_ILNNCOPY ctrsm_ilnncopy #define CTRSM_ILTUCOPY ctrsm_iltucopy #define CTRSM_ILTNCOPY ctrsm_iltncopy #endif #define CGEMM_BETA cgemm_beta #define CGEMM_KERNEL_N cgemm_kernel_n #define CGEMM_KERNEL_L cgemm_kernel_l #define CGEMM_KERNEL_R cgemm_kernel_r #define CGEMM_KERNEL_B cgemm_kernel_b #define CTRMM_KERNEL_LN ctrmm_kernel_LN #define CTRMM_KERNEL_LT ctrmm_kernel_LT #define CTRMM_KERNEL_LR ctrmm_kernel_LR #define 
CTRMM_KERNEL_LC ctrmm_kernel_LC #define CTRMM_KERNEL_RN ctrmm_kernel_RN #define CTRMM_KERNEL_RT ctrmm_kernel_RT #define CTRMM_KERNEL_RR ctrmm_kernel_RR #define CTRMM_KERNEL_RC ctrmm_kernel_RC #define CTRSM_KERNEL_LN ctrsm_kernel_LN #define CTRSM_KERNEL_LT ctrsm_kernel_LT #define CTRSM_KERNEL_LR ctrsm_kernel_LR #define CTRSM_KERNEL_LC ctrsm_kernel_LC #define CTRSM_KERNEL_RN ctrsm_kernel_RN #define CTRSM_KERNEL_RT ctrsm_kernel_RT #define CTRSM_KERNEL_RR ctrsm_kernel_RR #define CTRSM_KERNEL_RC ctrsm_kernel_RC #define CSYMM_OUTCOPY csymm_outcopy #define CSYMM_OLTCOPY csymm_oltcopy #if CGEMM_DEFAULT_UNROLL_M == CGEMM_DEFAULT_UNROLL_N #define CSYMM_IUTCOPY csymm_outcopy #define CSYMM_ILTCOPY csymm_oltcopy #else #define CSYMM_IUTCOPY csymm_iutcopy #define CSYMM_ILTCOPY csymm_iltcopy #endif #define CHEMM_OUTCOPY chemm_outcopy #define CHEMM_OLTCOPY chemm_oltcopy #if CGEMM_DEFAULT_UNROLL_M == CGEMM_DEFAULT_UNROLL_N #define CHEMM_IUTCOPY chemm_outcopy #define CHEMM_ILTCOPY chemm_oltcopy #else #define CHEMM_IUTCOPY chemm_iutcopy #define CHEMM_ILTCOPY chemm_iltcopy #endif #define CGEMM3M_ONCOPYB cgemm3m_oncopyb #define CGEMM3M_ONCOPYR cgemm3m_oncopyr #define CGEMM3M_ONCOPYI cgemm3m_oncopyi #define CGEMM3M_OTCOPYB cgemm3m_otcopyb #define CGEMM3M_OTCOPYR cgemm3m_otcopyr #define CGEMM3M_OTCOPYI cgemm3m_otcopyi #define CGEMM3M_INCOPYB cgemm3m_incopyb #define CGEMM3M_INCOPYR cgemm3m_incopyr #define CGEMM3M_INCOPYI cgemm3m_incopyi #define CGEMM3M_ITCOPYB cgemm3m_itcopyb #define CGEMM3M_ITCOPYR cgemm3m_itcopyr #define CGEMM3M_ITCOPYI cgemm3m_itcopyi #define CSYMM3M_ILCOPYB csymm3m_ilcopyb #define CSYMM3M_IUCOPYB csymm3m_iucopyb #define CSYMM3M_ILCOPYR csymm3m_ilcopyr #define CSYMM3M_IUCOPYR csymm3m_iucopyr #define CSYMM3M_ILCOPYI csymm3m_ilcopyi #define CSYMM3M_IUCOPYI csymm3m_iucopyi #define CSYMM3M_OLCOPYB csymm3m_olcopyb #define CSYMM3M_OUCOPYB csymm3m_oucopyb #define CSYMM3M_OLCOPYR csymm3m_olcopyr #define CSYMM3M_OUCOPYR csymm3m_oucopyr #define CSYMM3M_OLCOPYI csymm3m_olcopyi #define CSYMM3M_OUCOPYI csymm3m_oucopyi #define CHEMM3M_ILCOPYB chemm3m_ilcopyb #define CHEMM3M_IUCOPYB chemm3m_iucopyb #define CHEMM3M_ILCOPYR chemm3m_ilcopyr #define CHEMM3M_IUCOPYR chemm3m_iucopyr #define CHEMM3M_ILCOPYI chemm3m_ilcopyi #define CHEMM3M_IUCOPYI chemm3m_iucopyi #define CHEMM3M_OLCOPYB chemm3m_olcopyb #define CHEMM3M_OUCOPYB chemm3m_oucopyb #define CHEMM3M_OLCOPYR chemm3m_olcopyr #define CHEMM3M_OUCOPYR chemm3m_oucopyr #define CHEMM3M_OLCOPYI chemm3m_olcopyi #define CHEMM3M_OUCOPYI chemm3m_oucopyi #define CGEMM3M_KERNEL cgemm3m_kernel #define CNEG_TCOPY cneg_tcopy #define CLASWP_NCOPY claswp_ncopy #define CAXPBY_K caxpby_k #define COMATCOPY_K_CN comatcopy_k_cn #define COMATCOPY_K_RN comatcopy_k_rn #define COMATCOPY_K_CT comatcopy_k_ct #define COMATCOPY_K_RT comatcopy_k_rt #define COMATCOPY_K_CNC comatcopy_k_cnc #define COMATCOPY_K_RNC comatcopy_k_rnc #define COMATCOPY_K_CTC comatcopy_k_ctc #define COMATCOPY_K_RTC comatcopy_k_rtc #define CIMATCOPY_K_CN cimatcopy_k_cn #define CIMATCOPY_K_RN cimatcopy_k_rn #define CIMATCOPY_K_CT cimatcopy_k_ct #define CIMATCOPY_K_RT cimatcopy_k_rt #define CIMATCOPY_K_CNC cimatcopy_k_cnc #define CIMATCOPY_K_RNC cimatcopy_k_rnc #define CIMATCOPY_K_CTC cimatcopy_k_ctc #define CIMATCOPY_K_RTC cimatcopy_k_rtc #define CGEADD_K cgeadd_k #else #define CAMAX_K gotoblas -> camax_k #define CAMIN_K gotoblas -> camin_k #define CMAX_K gotoblas -> cmax_k #define CMIN_K gotoblas -> cmin_k #define ICAMAX_K gotoblas -> icamax_k #define ICAMIN_K gotoblas -> icamin_k #define ICMAX_K gotoblas -> icmax_k 
#define ICMIN_K gotoblas -> icmin_k #define CASUM_K gotoblas -> casum_k #define CAXPYU_K gotoblas -> caxpy_k #define CAXPYC_K gotoblas -> caxpyc_k #define CCOPY_K gotoblas -> ccopy_k #define CDOTU_K gotoblas -> cdotu_k #define CDOTC_K gotoblas -> cdotc_k #define CNRM2_K gotoblas -> cnrm2_k #define CSCAL_K gotoblas -> cscal_k #define CSWAP_K gotoblas -> cswap_k #define CROT_K gotoblas -> csrot_k #define CGEMV_N gotoblas -> cgemv_n #define CGEMV_T gotoblas -> cgemv_t #define CGEMV_R gotoblas -> cgemv_r #define CGEMV_C gotoblas -> cgemv_c #define CGEMV_O gotoblas -> cgemv_o #define CGEMV_U gotoblas -> cgemv_u #define CGEMV_S gotoblas -> cgemv_s #define CGEMV_D gotoblas -> cgemv_d #define CGERU_K gotoblas -> cgeru_k #define CGERC_K gotoblas -> cgerc_k #define CGERV_K gotoblas -> cgerv_k #define CGERD_K gotoblas -> cgerd_k #define CSYMV_U gotoblas -> csymv_U #define CSYMV_L gotoblas -> csymv_L #define CHEMV_U gotoblas -> chemv_U #define CHEMV_L gotoblas -> chemv_L #define CHEMV_V gotoblas -> chemv_V #define CHEMV_M gotoblas -> chemv_M #define CSYMV_THREAD_U csymv_thread_U #define CSYMV_THREAD_L csymv_thread_L #define CHEMV_THREAD_U chemv_thread_U #define CHEMV_THREAD_L chemv_thread_L #define CHEMV_THREAD_V chemv_thread_V #define CHEMV_THREAD_M chemv_thread_M #define CGEMM_ONCOPY gotoblas -> cgemm_oncopy #define CGEMM_OTCOPY gotoblas -> cgemm_otcopy #define CGEMM_INCOPY gotoblas -> cgemm_incopy #define CGEMM_ITCOPY gotoblas -> cgemm_itcopy #define CTRMM_OUNUCOPY gotoblas -> ctrmm_ounucopy #define CTRMM_OUTUCOPY gotoblas -> ctrmm_outucopy #define CTRMM_OLNUCOPY gotoblas -> ctrmm_olnucopy #define CTRMM_OLTUCOPY gotoblas -> ctrmm_oltucopy #define CTRSM_OUNUCOPY gotoblas -> ctrsm_ounucopy #define CTRSM_OUTUCOPY gotoblas -> ctrsm_outucopy #define CTRSM_OLNUCOPY gotoblas -> ctrsm_olnucopy #define CTRSM_OLTUCOPY gotoblas -> ctrsm_oltucopy #define CTRMM_IUNUCOPY gotoblas -> ctrmm_iunucopy #define CTRMM_IUTUCOPY gotoblas -> ctrmm_iutucopy #define CTRMM_ILNUCOPY gotoblas -> ctrmm_ilnucopy #define CTRMM_ILTUCOPY gotoblas -> ctrmm_iltucopy #define CTRSM_IUNUCOPY gotoblas -> ctrsm_iunucopy #define CTRSM_IUTUCOPY gotoblas -> ctrsm_iutucopy #define CTRSM_ILNUCOPY gotoblas -> ctrsm_ilnucopy #define CTRSM_ILTUCOPY gotoblas -> ctrsm_iltucopy #define CTRMM_OUNNCOPY gotoblas -> ctrmm_ounncopy #define CTRMM_OUTNCOPY gotoblas -> ctrmm_outncopy #define CTRMM_OLNNCOPY gotoblas -> ctrmm_olnncopy #define CTRMM_OLTNCOPY gotoblas -> ctrmm_oltncopy #define CTRSM_OUNNCOPY gotoblas -> ctrsm_ounncopy #define CTRSM_OUTNCOPY gotoblas -> ctrsm_outncopy #define CTRSM_OLNNCOPY gotoblas -> ctrsm_olnncopy #define CTRSM_OLTNCOPY gotoblas -> ctrsm_oltncopy #define CTRMM_IUNNCOPY gotoblas -> ctrmm_iunncopy #define CTRMM_IUTNCOPY gotoblas -> ctrmm_iutncopy #define CTRMM_ILNNCOPY gotoblas -> ctrmm_ilnncopy #define CTRMM_ILTNCOPY gotoblas -> ctrmm_iltncopy #define CTRSM_IUNNCOPY gotoblas -> ctrsm_iunncopy #define CTRSM_IUTNCOPY gotoblas -> ctrsm_iutncopy #define CTRSM_ILNNCOPY gotoblas -> ctrsm_ilnncopy #define CTRSM_ILTNCOPY gotoblas -> ctrsm_iltncopy #define CGEMM_BETA gotoblas -> cgemm_beta #define CGEMM_KERNEL_N gotoblas -> cgemm_kernel_n #define CGEMM_KERNEL_L gotoblas -> cgemm_kernel_l #define CGEMM_KERNEL_R gotoblas -> cgemm_kernel_r #define CGEMM_KERNEL_B gotoblas -> cgemm_kernel_b #define CTRMM_KERNEL_LN gotoblas -> ctrmm_kernel_LN #define CTRMM_KERNEL_LT gotoblas -> ctrmm_kernel_LT #define CTRMM_KERNEL_LR gotoblas -> ctrmm_kernel_LR #define CTRMM_KERNEL_LC gotoblas -> ctrmm_kernel_LC #define CTRMM_KERNEL_RN gotoblas -> 
ctrmm_kernel_RN #define CTRMM_KERNEL_RT gotoblas -> ctrmm_kernel_RT #define CTRMM_KERNEL_RR gotoblas -> ctrmm_kernel_RR #define CTRMM_KERNEL_RC gotoblas -> ctrmm_kernel_RC #define CTRSM_KERNEL_LN gotoblas -> ctrsm_kernel_LN #define CTRSM_KERNEL_LT gotoblas -> ctrsm_kernel_LT #define CTRSM_KERNEL_LR gotoblas -> ctrsm_kernel_LR #define CTRSM_KERNEL_LC gotoblas -> ctrsm_kernel_LC #define CTRSM_KERNEL_RN gotoblas -> ctrsm_kernel_RN #define CTRSM_KERNEL_RT gotoblas -> ctrsm_kernel_RT #define CTRSM_KERNEL_RR gotoblas -> ctrsm_kernel_RR #define CTRSM_KERNEL_RC gotoblas -> ctrsm_kernel_RC #define CSYMM_IUTCOPY gotoblas -> csymm_iutcopy #define CSYMM_ILTCOPY gotoblas -> csymm_iltcopy #define CSYMM_OUTCOPY gotoblas -> csymm_outcopy #define CSYMM_OLTCOPY gotoblas -> csymm_oltcopy #define CHEMM_OUTCOPY gotoblas -> chemm_outcopy #define CHEMM_OLTCOPY gotoblas -> chemm_oltcopy #define CHEMM_IUTCOPY gotoblas -> chemm_iutcopy #define CHEMM_ILTCOPY gotoblas -> chemm_iltcopy #define CGEMM3M_ONCOPYB gotoblas -> cgemm3m_oncopyb #define CGEMM3M_ONCOPYR gotoblas -> cgemm3m_oncopyr #define CGEMM3M_ONCOPYI gotoblas -> cgemm3m_oncopyi #define CGEMM3M_OTCOPYB gotoblas -> cgemm3m_otcopyb #define CGEMM3M_OTCOPYR gotoblas -> cgemm3m_otcopyr #define CGEMM3M_OTCOPYI gotoblas -> cgemm3m_otcopyi #define CGEMM3M_INCOPYB gotoblas -> cgemm3m_incopyb #define CGEMM3M_INCOPYR gotoblas -> cgemm3m_incopyr #define CGEMM3M_INCOPYI gotoblas -> cgemm3m_incopyi #define CGEMM3M_ITCOPYB gotoblas -> cgemm3m_itcopyb #define CGEMM3M_ITCOPYR gotoblas -> cgemm3m_itcopyr #define CGEMM3M_ITCOPYI gotoblas -> cgemm3m_itcopyi #define CSYMM3M_ILCOPYB gotoblas -> csymm3m_ilcopyb #define CSYMM3M_IUCOPYB gotoblas -> csymm3m_iucopyb #define CSYMM3M_ILCOPYR gotoblas -> csymm3m_ilcopyr #define CSYMM3M_IUCOPYR gotoblas -> csymm3m_iucopyr #define CSYMM3M_ILCOPYI gotoblas -> csymm3m_ilcopyi #define CSYMM3M_IUCOPYI gotoblas -> csymm3m_iucopyi #define CSYMM3M_OLCOPYB gotoblas -> csymm3m_olcopyb #define CSYMM3M_OUCOPYB gotoblas -> csymm3m_oucopyb #define CSYMM3M_OLCOPYR gotoblas -> csymm3m_olcopyr #define CSYMM3M_OUCOPYR gotoblas -> csymm3m_oucopyr #define CSYMM3M_OLCOPYI gotoblas -> csymm3m_olcopyi #define CSYMM3M_OUCOPYI gotoblas -> csymm3m_oucopyi #define CHEMM3M_ILCOPYB gotoblas -> chemm3m_ilcopyb #define CHEMM3M_IUCOPYB gotoblas -> chemm3m_iucopyb #define CHEMM3M_ILCOPYR gotoblas -> chemm3m_ilcopyr #define CHEMM3M_IUCOPYR gotoblas -> chemm3m_iucopyr #define CHEMM3M_ILCOPYI gotoblas -> chemm3m_ilcopyi #define CHEMM3M_IUCOPYI gotoblas -> chemm3m_iucopyi #define CHEMM3M_OLCOPYB gotoblas -> chemm3m_olcopyb #define CHEMM3M_OUCOPYB gotoblas -> chemm3m_oucopyb #define CHEMM3M_OLCOPYR gotoblas -> chemm3m_olcopyr #define CHEMM3M_OUCOPYR gotoblas -> chemm3m_oucopyr #define CHEMM3M_OLCOPYI gotoblas -> chemm3m_olcopyi #define CHEMM3M_OUCOPYI gotoblas -> chemm3m_oucopyi #define CGEMM3M_KERNEL gotoblas -> cgemm3m_kernel #define CNEG_TCOPY gotoblas -> cneg_tcopy #define CLASWP_NCOPY gotoblas -> claswp_ncopy #define CAXPBY_K gotoblas -> caxpby_k #define COMATCOPY_K_CN gotoblas -> comatcopy_k_cn #define COMATCOPY_K_RN gotoblas -> comatcopy_k_rn #define COMATCOPY_K_CT gotoblas -> comatcopy_k_ct #define COMATCOPY_K_RT gotoblas -> comatcopy_k_rt #define COMATCOPY_K_CNC gotoblas -> comatcopy_k_cnc #define COMATCOPY_K_RNC gotoblas -> comatcopy_k_rnc #define COMATCOPY_K_CTC gotoblas -> comatcopy_k_ctc #define COMATCOPY_K_RTC gotoblas -> comatcopy_k_rtc #define CIMATCOPY_K_CN gotoblas -> cimatcopy_k_cn #define CIMATCOPY_K_RN gotoblas -> cimatcopy_k_rn #define CIMATCOPY_K_CT 
gotoblas -> cimatcopy_k_ct #define CIMATCOPY_K_RT gotoblas -> cimatcopy_k_rt #define CIMATCOPY_K_CNC gotoblas -> cimatcopy_k_cnc #define CIMATCOPY_K_RNC gotoblas -> cimatcopy_k_rnc #define CIMATCOPY_K_CTC gotoblas -> cimatcopy_k_ctc #define CIMATCOPY_K_RTC gotoblas -> cimatcopy_k_rtc #define CGEADD_K gotoblas -> cgeadd_k #endif #define CGEMM_NN cgemm_nn #define CGEMM_CN cgemm_cn #define CGEMM_TN cgemm_tn #define CGEMM_NC cgemm_nc #define CGEMM_NT cgemm_nt #define CGEMM_CC cgemm_cc #define CGEMM_CT cgemm_ct #define CGEMM_TC cgemm_tc #define CGEMM_TT cgemm_tt #define CGEMM_NR cgemm_nr #define CGEMM_TR cgemm_tr #define CGEMM_CR cgemm_cr #define CGEMM_RN cgemm_rn #define CGEMM_RT cgemm_rt #define CGEMM_RC cgemm_rc #define CGEMM_RR cgemm_rr #define CSYMM_LU csymm_LU #define CSYMM_LL csymm_LL #define CSYMM_RU csymm_RU #define CSYMM_RL csymm_RL #define CHEMM_LU chemm_LU #define CHEMM_LL chemm_LL #define CHEMM_RU chemm_RU #define CHEMM_RL chemm_RL #define CSYRK_UN csyrk_UN #define CSYRK_UT csyrk_UT #define CSYRK_LN csyrk_LN #define CSYRK_LT csyrk_LT #define CSYRK_UR csyrk_UN #define CSYRK_UC csyrk_UT #define CSYRK_LR csyrk_LN #define CSYRK_LC csyrk_LT #define CSYRK_KERNEL_U csyrk_kernel_U #define CSYRK_KERNEL_L csyrk_kernel_L #define CHERK_UN cherk_UN #define CHERK_LN cherk_LN #define CHERK_UC cherk_UC #define CHERK_LC cherk_LC #define CHER2K_UN cher2k_UN #define CHER2K_LN cher2k_LN #define CHER2K_UC cher2k_UC #define CHER2K_LC cher2k_LC #define CSYR2K_UN csyr2k_UN #define CSYR2K_UT csyr2k_UT #define CSYR2K_LN csyr2k_LN #define CSYR2K_LT csyr2k_LT #define CSYR2K_UR csyr2k_UN #define CSYR2K_UC csyr2k_UT #define CSYR2K_LR csyr2k_LN #define CSYR2K_LC csyr2k_LT #define CSYR2K_KERNEL_U csyr2k_kernel_U #define CSYR2K_KERNEL_L csyr2k_kernel_L #define CTRMM_LNUU ctrmm_LNUU #define CTRMM_LNUN ctrmm_LNUN #define CTRMM_LNLU ctrmm_LNLU #define CTRMM_LNLN ctrmm_LNLN #define CTRMM_LTUU ctrmm_LTUU #define CTRMM_LTUN ctrmm_LTUN #define CTRMM_LTLU ctrmm_LTLU #define CTRMM_LTLN ctrmm_LTLN #define CTRMM_LRUU ctrmm_LRUU #define CTRMM_LRUN ctrmm_LRUN #define CTRMM_LRLU ctrmm_LRLU #define CTRMM_LRLN ctrmm_LRLN #define CTRMM_LCUU ctrmm_LCUU #define CTRMM_LCUN ctrmm_LCUN #define CTRMM_LCLU ctrmm_LCLU #define CTRMM_LCLN ctrmm_LCLN #define CTRMM_RNUU ctrmm_RNUU #define CTRMM_RNUN ctrmm_RNUN #define CTRMM_RNLU ctrmm_RNLU #define CTRMM_RNLN ctrmm_RNLN #define CTRMM_RTUU ctrmm_RTUU #define CTRMM_RTUN ctrmm_RTUN #define CTRMM_RTLU ctrmm_RTLU #define CTRMM_RTLN ctrmm_RTLN #define CTRMM_RRUU ctrmm_RRUU #define CTRMM_RRUN ctrmm_RRUN #define CTRMM_RRLU ctrmm_RRLU #define CTRMM_RRLN ctrmm_RRLN #define CTRMM_RCUU ctrmm_RCUU #define CTRMM_RCUN ctrmm_RCUN #define CTRMM_RCLU ctrmm_RCLU #define CTRMM_RCLN ctrmm_RCLN #define CTRSM_LNUU ctrsm_LNUU #define CTRSM_LNUN ctrsm_LNUN #define CTRSM_LNLU ctrsm_LNLU #define CTRSM_LNLN ctrsm_LNLN #define CTRSM_LTUU ctrsm_LTUU #define CTRSM_LTUN ctrsm_LTUN #define CTRSM_LTLU ctrsm_LTLU #define CTRSM_LTLN ctrsm_LTLN #define CTRSM_LRUU ctrsm_LRUU #define CTRSM_LRUN ctrsm_LRUN #define CTRSM_LRLU ctrsm_LRLU #define CTRSM_LRLN ctrsm_LRLN #define CTRSM_LCUU ctrsm_LCUU #define CTRSM_LCUN ctrsm_LCUN #define CTRSM_LCLU ctrsm_LCLU #define CTRSM_LCLN ctrsm_LCLN #define CTRSM_RNUU ctrsm_RNUU #define CTRSM_RNUN ctrsm_RNUN #define CTRSM_RNLU ctrsm_RNLU #define CTRSM_RNLN ctrsm_RNLN #define CTRSM_RTUU ctrsm_RTUU #define CTRSM_RTUN ctrsm_RTUN #define CTRSM_RTLU ctrsm_RTLU #define CTRSM_RTLN ctrsm_RTLN #define CTRSM_RRUU ctrsm_RRUU #define CTRSM_RRUN ctrsm_RRUN #define CTRSM_RRLU ctrsm_RRLU #define CTRSM_RRLN 
ctrsm_RRLN #define CTRSM_RCUU ctrsm_RCUU #define CTRSM_RCUN ctrsm_RCUN #define CTRSM_RCLU ctrsm_RCLU #define CTRSM_RCLN ctrsm_RCLN #define CGEMM_THREAD_NN cgemm_thread_nn #define CGEMM_THREAD_CN cgemm_thread_cn #define CGEMM_THREAD_TN cgemm_thread_tn #define CGEMM_THREAD_NC cgemm_thread_nc #define CGEMM_THREAD_NT cgemm_thread_nt #define CGEMM_THREAD_CC cgemm_thread_cc #define CGEMM_THREAD_CT cgemm_thread_ct #define CGEMM_THREAD_TC cgemm_thread_tc #define CGEMM_THREAD_TT cgemm_thread_tt #define CGEMM_THREAD_NR cgemm_thread_nr #define CGEMM_THREAD_TR cgemm_thread_tr #define CGEMM_THREAD_CR cgemm_thread_cr #define CGEMM_THREAD_RN cgemm_thread_rn #define CGEMM_THREAD_RT cgemm_thread_rt #define CGEMM_THREAD_RC cgemm_thread_rc #define CGEMM_THREAD_RR cgemm_thread_rr #define CSYMM_THREAD_LU csymm_thread_LU #define CSYMM_THREAD_LL csymm_thread_LL #define CSYMM_THREAD_RU csymm_thread_RU #define CSYMM_THREAD_RL csymm_thread_RL #define CHEMM_THREAD_LU chemm_thread_LU #define CHEMM_THREAD_LL chemm_thread_LL #define CHEMM_THREAD_RU chemm_thread_RU #define CHEMM_THREAD_RL chemm_thread_RL #define CSYRK_THREAD_UN csyrk_thread_UN #define CSYRK_THREAD_UT csyrk_thread_UT #define CSYRK_THREAD_LN csyrk_thread_LN #define CSYRK_THREAD_LT csyrk_thread_LT #define CSYRK_THREAD_UR csyrk_thread_UN #define CSYRK_THREAD_UC csyrk_thread_UT #define CSYRK_THREAD_LR csyrk_thread_LN #define CSYRK_THREAD_LC csyrk_thread_LT #define CHERK_THREAD_UN cherk_thread_UN #define CHERK_THREAD_UT cherk_thread_UT #define CHERK_THREAD_LN cherk_thread_LN #define CHERK_THREAD_LT cherk_thread_LT #define CHERK_THREAD_UR cherk_thread_UR #define CHERK_THREAD_UC cherk_thread_UC #define CHERK_THREAD_LR cherk_thread_LR #define CHERK_THREAD_LC cherk_thread_LC #define CGEMM3M_NN cgemm3m_nn #define CGEMM3M_CN cgemm3m_cn #define CGEMM3M_TN cgemm3m_tn #define CGEMM3M_NC cgemm3m_nc #define CGEMM3M_NT cgemm3m_nt #define CGEMM3M_CC cgemm3m_cc #define CGEMM3M_CT cgemm3m_ct #define CGEMM3M_TC cgemm3m_tc #define CGEMM3M_TT cgemm3m_tt #define CGEMM3M_NR cgemm3m_nr #define CGEMM3M_TR cgemm3m_tr #define CGEMM3M_CR cgemm3m_cr #define CGEMM3M_RN cgemm3m_rn #define CGEMM3M_RT cgemm3m_rt #define CGEMM3M_RC cgemm3m_rc #define CGEMM3M_RR cgemm3m_rr #define CGEMM3M_THREAD_NN cgemm3m_thread_nn #define CGEMM3M_THREAD_CN cgemm3m_thread_cn #define CGEMM3M_THREAD_TN cgemm3m_thread_tn #define CGEMM3M_THREAD_NC cgemm3m_thread_nc #define CGEMM3M_THREAD_NT cgemm3m_thread_nt #define CGEMM3M_THREAD_CC cgemm3m_thread_cc #define CGEMM3M_THREAD_CT cgemm3m_thread_ct #define CGEMM3M_THREAD_TC cgemm3m_thread_tc #define CGEMM3M_THREAD_TT cgemm3m_thread_tt #define CGEMM3M_THREAD_NR cgemm3m_thread_nr #define CGEMM3M_THREAD_TR cgemm3m_thread_tr #define CGEMM3M_THREAD_CR cgemm3m_thread_cr #define CGEMM3M_THREAD_RN cgemm3m_thread_rn #define CGEMM3M_THREAD_RT cgemm3m_thread_rt #define CGEMM3M_THREAD_RC cgemm3m_thread_rc #define CGEMM3M_THREAD_RR cgemm3m_thread_rr #define CSYMM3M_LU csymm3m_LU #define CSYMM3M_LL csymm3m_LL #define CSYMM3M_RU csymm3m_RU #define CSYMM3M_RL csymm3m_RL #define CSYMM3M_THREAD_LU csymm3m_thread_LU #define CSYMM3M_THREAD_LL csymm3m_thread_LL #define CSYMM3M_THREAD_RU csymm3m_thread_RU #define CSYMM3M_THREAD_RL csymm3m_thread_RL #define CHEMM3M_LU chemm3m_LU #define CHEMM3M_LL chemm3m_LL #define CHEMM3M_RU chemm3m_RU #define CHEMM3M_RL chemm3m_RL #define CHEMM3M_THREAD_LU chemm3m_thread_LU #define CHEMM3M_THREAD_LL chemm3m_thread_LL #define CHEMM3M_THREAD_RU chemm3m_thread_RU #define CHEMM3M_THREAD_RL chemm3m_thread_RL #endif 
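Editorial note: common_c.h above maps every single-precision complex kernel name either to a plain symbol (static build) or, under DYNAMIC_ARCH, to a member of the gotoblas structure, i.e. a function-pointer table selected at run time for the detected CPU. The sketch below shows only that dispatch pattern; demo_param_t, demo_gotoblas, camax_generic and DEMO_CAMAX_K are invented for illustration and do not reflect the real gotoblas_t layout or the real kernel signatures.

#include <stdio.h>

typedef struct {
  float (*camax_k)(int n, const float *x, int incx);  /* one slot per kernel */
} demo_param_t;

static float camax_generic(int n, const float *x, int incx) {
  /* simplified reference kernel standing in for a tuned implementation */
  float m = 0.0f;
  for (int i = 0; i < n; i++) {
    float a = x[i * incx] < 0 ? -x[i * incx] : x[i * incx];
    if (a > m) m = a;
  }
  return m;
}

static demo_param_t demo_generic_table = { camax_generic };
static demo_param_t *demo_gotoblas;                 /* set once at library init */

#ifndef DYNAMIC_ARCH
#define DEMO_CAMAX_K camax_generic                  /* direct call, bound at link time */
#else
#define DEMO_CAMAX_K (demo_gotoblas->camax_k)       /* indirect call through the table */
#endif

int main(void) {
  float x[4] = { 1.0f, -3.5f, 2.0f, -0.5f };
  demo_gotoblas = &demo_generic_table;              /* real code picks a CPU-specific table */
  printf("%g\n", (double)DEMO_CAMAX_K(4, x, 1));
  return 0;
}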
OpenBLAS-0.2.20/common_d.h000066400000000000000000000344701313527062700151200ustar00rootroot00000000000000#ifndef COMMON_D_H #define COMMON_D_H #ifndef DYNAMIC_ARCH #define DAMAX_K damax_k #define DAMIN_K damin_k #define DMAX_K dmax_k #define DMIN_K dmin_k #define IDAMAX_K idamax_k #define IDAMIN_K idamin_k #define IDMAX_K idmax_k #define IDMIN_K idmin_k #define DASUM_K dasum_k #define DAXPYU_K daxpy_k #define DAXPYC_K daxpy_k #define DCOPY_K dcopy_k #define DDOTU_K ddot_k #define DDOTC_K ddot_k #define DNRM2_K dnrm2_k #define DSCAL_K dscal_k #define DSWAP_K dswap_k #define DROT_K drot_k #define DGEMV_N dgemv_n #define DGEMV_T dgemv_t #define DGEMV_R dgemv_n #define DGEMV_C dgemv_t #define DGEMV_O dgemv_n #define DGEMV_U dgemv_t #define DGEMV_S dgemv_n #define DGEMV_D dgemv_t #define DGERU_K dger_k #define DGERC_K dger_k #define DGERV_K dger_k #define DGERD_K dger_k #define DSYMV_U dsymv_U #define DSYMV_L dsymv_L #define DSYMV_THREAD_U dsymv_thread_U #define DSYMV_THREAD_L dsymv_thread_L #define DGEMM_ONCOPY dgemm_oncopy #define DGEMM_OTCOPY dgemm_otcopy #if DGEMM_DEFAULT_UNROLL_M == DGEMM_DEFAULT_UNROLL_N #define DGEMM_INCOPY dgemm_oncopy #define DGEMM_ITCOPY dgemm_otcopy #else #define DGEMM_INCOPY dgemm_incopy #define DGEMM_ITCOPY dgemm_itcopy #endif #define DTRMM_OUNUCOPY dtrmm_ounucopy #define DTRMM_OUNNCOPY dtrmm_ounncopy #define DTRMM_OUTUCOPY dtrmm_outucopy #define DTRMM_OUTNCOPY dtrmm_outncopy #define DTRMM_OLNUCOPY dtrmm_olnucopy #define DTRMM_OLNNCOPY dtrmm_olnncopy #define DTRMM_OLTUCOPY dtrmm_oltucopy #define DTRMM_OLTNCOPY dtrmm_oltncopy #define DTRSM_OUNUCOPY dtrsm_ounucopy #define DTRSM_OUNNCOPY dtrsm_ounncopy #define DTRSM_OUTUCOPY dtrsm_outucopy #define DTRSM_OUTNCOPY dtrsm_outncopy #define DTRSM_OLNUCOPY dtrsm_olnucopy #define DTRSM_OLNNCOPY dtrsm_olnncopy #define DTRSM_OLTUCOPY dtrsm_oltucopy #define DTRSM_OLTNCOPY dtrsm_oltncopy #if DGEMM_DEFAULT_UNROLL_M == DGEMM_DEFAULT_UNROLL_N #define DTRMM_IUNUCOPY dtrmm_ounucopy #define DTRMM_IUNNCOPY dtrmm_ounncopy #define DTRMM_IUTUCOPY dtrmm_outucopy #define DTRMM_IUTNCOPY dtrmm_outncopy #define DTRMM_ILNUCOPY dtrmm_olnucopy #define DTRMM_ILNNCOPY dtrmm_olnncopy #define DTRMM_ILTUCOPY dtrmm_oltucopy #define DTRMM_ILTNCOPY dtrmm_oltncopy #define DTRSM_IUNUCOPY dtrsm_ounucopy #define DTRSM_IUNNCOPY dtrsm_ounncopy #define DTRSM_IUTUCOPY dtrsm_outucopy #define DTRSM_IUTNCOPY dtrsm_outncopy #define DTRSM_ILNUCOPY dtrsm_olnucopy #define DTRSM_ILNNCOPY dtrsm_olnncopy #define DTRSM_ILTUCOPY dtrsm_oltucopy #define DTRSM_ILTNCOPY dtrsm_oltncopy #else #define DTRMM_IUNUCOPY dtrmm_iunucopy #define DTRMM_IUNNCOPY dtrmm_iunncopy #define DTRMM_IUTUCOPY dtrmm_iutucopy #define DTRMM_IUTNCOPY dtrmm_iutncopy #define DTRMM_ILNUCOPY dtrmm_ilnucopy #define DTRMM_ILNNCOPY dtrmm_ilnncopy #define DTRMM_ILTUCOPY dtrmm_iltucopy #define DTRMM_ILTNCOPY dtrmm_iltncopy #define DTRSM_IUNUCOPY dtrsm_iunucopy #define DTRSM_IUNNCOPY dtrsm_iunncopy #define DTRSM_IUTUCOPY dtrsm_iutucopy #define DTRSM_IUTNCOPY dtrsm_iutncopy #define DTRSM_ILNUCOPY dtrsm_ilnucopy #define DTRSM_ILNNCOPY dtrsm_ilnncopy #define DTRSM_ILTUCOPY dtrsm_iltucopy #define DTRSM_ILTNCOPY dtrsm_iltncopy #endif #define DGEMM_BETA dgemm_beta #define DGEMM_KERNEL dgemm_kernel #define DTRMM_KERNEL_LN dtrmm_kernel_LN #define DTRMM_KERNEL_LT dtrmm_kernel_LT #define DTRMM_KERNEL_LR dtrmm_kernel_LN #define DTRMM_KERNEL_LC dtrmm_kernel_LT #define DTRMM_KERNEL_RN dtrmm_kernel_RN #define DTRMM_KERNEL_RT dtrmm_kernel_RT #define DTRMM_KERNEL_RR dtrmm_kernel_RN #define DTRMM_KERNEL_RC dtrmm_kernel_RT #define 
DTRSM_KERNEL_LN dtrsm_kernel_LN #define DTRSM_KERNEL_LT dtrsm_kernel_LT #define DTRSM_KERNEL_LR dtrsm_kernel_LN #define DTRSM_KERNEL_LC dtrsm_kernel_LT #define DTRSM_KERNEL_RN dtrsm_kernel_RN #define DTRSM_KERNEL_RT dtrsm_kernel_RT #define DTRSM_KERNEL_RR dtrsm_kernel_RN #define DTRSM_KERNEL_RC dtrsm_kernel_RT #define DSYMM_OUTCOPY dsymm_outcopy #define DSYMM_OLTCOPY dsymm_oltcopy #if DGEMM_DEFAULT_UNROLL_M == DGEMM_DEFAULT_UNROLL_N #define DSYMM_IUTCOPY dsymm_outcopy #define DSYMM_ILTCOPY dsymm_oltcopy #else #define DSYMM_IUTCOPY dsymm_iutcopy #define DSYMM_ILTCOPY dsymm_iltcopy #endif #define DNEG_TCOPY dneg_tcopy #define DLASWP_NCOPY dlaswp_ncopy #define DAXPBY_K daxpby_k #define DOMATCOPY_K_CN domatcopy_k_cn #define DOMATCOPY_K_RN domatcopy_k_rn #define DOMATCOPY_K_CT domatcopy_k_ct #define DOMATCOPY_K_RT domatcopy_k_rt #define DIMATCOPY_K_CN dimatcopy_k_cn #define DIMATCOPY_K_RN dimatcopy_k_rn #define DIMATCOPY_K_CT dimatcopy_k_ct #define DIMATCOPY_K_RT dimatcopy_k_rt #define DGEADD_K dgeadd_k #else #define DAMAX_K gotoblas -> damax_k #define DAMIN_K gotoblas -> damin_k #define DMAX_K gotoblas -> dmax_k #define DMIN_K gotoblas -> dmin_k #define IDAMAX_K gotoblas -> idamax_k #define IDAMIN_K gotoblas -> idamin_k #define IDMAX_K gotoblas -> idmax_k #define IDMIN_K gotoblas -> idmin_k #define DASUM_K gotoblas -> dasum_k #define DAXPYU_K gotoblas -> daxpy_k #define DAXPYC_K gotoblas -> daxpy_k #define DCOPY_K gotoblas -> dcopy_k #define DDOTU_K gotoblas -> ddot_k #define DDOTC_K gotoblas -> ddot_k #define DNRM2_K gotoblas -> dnrm2_k #define DSCAL_K gotoblas -> dscal_k #define DSWAP_K gotoblas -> dswap_k #define DROT_K gotoblas -> drot_k #define DGEMV_N gotoblas -> dgemv_n #define DGEMV_T gotoblas -> dgemv_t #define DGEMV_R gotoblas -> dgemv_n #define DGEMV_C gotoblas -> dgemv_t #define DGEMV_O gotoblas -> dgemv_n #define DGEMV_U gotoblas -> dgemv_t #define DGEMV_S gotoblas -> dgemv_n #define DGEMV_D gotoblas -> dgemv_t #define DGERU_K gotoblas -> dger_k #define DGERC_K gotoblas -> dger_k #define DGERV_K gotoblas -> dger_k #define DGERD_K gotoblas -> dger_k #define DSYMV_U gotoblas -> dsymv_U #define DSYMV_L gotoblas -> dsymv_L #define DSYMV_THREAD_U dsymv_thread_U #define DSYMV_THREAD_L dsymv_thread_L #define DGEMM_ONCOPY gotoblas -> dgemm_oncopy #define DGEMM_OTCOPY gotoblas -> dgemm_otcopy #define DGEMM_INCOPY gotoblas -> dgemm_incopy #define DGEMM_ITCOPY gotoblas -> dgemm_itcopy #define DTRMM_OUNUCOPY gotoblas -> dtrmm_ounucopy #define DTRMM_OUTUCOPY gotoblas -> dtrmm_outucopy #define DTRMM_OLNUCOPY gotoblas -> dtrmm_olnucopy #define DTRMM_OLTUCOPY gotoblas -> dtrmm_oltucopy #define DTRSM_OUNUCOPY gotoblas -> dtrsm_ounucopy #define DTRSM_OUTUCOPY gotoblas -> dtrsm_outucopy #define DTRSM_OLNUCOPY gotoblas -> dtrsm_olnucopy #define DTRSM_OLTUCOPY gotoblas -> dtrsm_oltucopy #define DTRMM_IUNUCOPY gotoblas -> dtrmm_iunucopy #define DTRMM_IUTUCOPY gotoblas -> dtrmm_iutucopy #define DTRMM_ILNUCOPY gotoblas -> dtrmm_ilnucopy #define DTRMM_ILTUCOPY gotoblas -> dtrmm_iltucopy #define DTRSM_IUNUCOPY gotoblas -> dtrsm_iunucopy #define DTRSM_IUTUCOPY gotoblas -> dtrsm_iutucopy #define DTRSM_ILNUCOPY gotoblas -> dtrsm_ilnucopy #define DTRSM_ILTUCOPY gotoblas -> dtrsm_iltucopy #define DTRMM_OUNNCOPY gotoblas -> dtrmm_ounncopy #define DTRMM_OUTNCOPY gotoblas -> dtrmm_outncopy #define DTRMM_OLNNCOPY gotoblas -> dtrmm_olnncopy #define DTRMM_OLTNCOPY gotoblas -> dtrmm_oltncopy #define DTRSM_OUNNCOPY gotoblas -> dtrsm_ounncopy #define DTRSM_OUTNCOPY gotoblas -> dtrsm_outncopy #define DTRSM_OLNNCOPY 
gotoblas -> dtrsm_olnncopy #define DTRSM_OLTNCOPY gotoblas -> dtrsm_oltncopy #define DTRMM_IUNNCOPY gotoblas -> dtrmm_iunncopy #define DTRMM_IUTNCOPY gotoblas -> dtrmm_iutncopy #define DTRMM_ILNNCOPY gotoblas -> dtrmm_ilnncopy #define DTRMM_ILTNCOPY gotoblas -> dtrmm_iltncopy #define DTRSM_IUNNCOPY gotoblas -> dtrsm_iunncopy #define DTRSM_IUTNCOPY gotoblas -> dtrsm_iutncopy #define DTRSM_ILNNCOPY gotoblas -> dtrsm_ilnncopy #define DTRSM_ILTNCOPY gotoblas -> dtrsm_iltncopy #define DGEMM_BETA gotoblas -> dgemm_beta #define DGEMM_KERNEL gotoblas -> dgemm_kernel #define DTRMM_KERNEL_LN gotoblas -> dtrmm_kernel_LN #define DTRMM_KERNEL_LT gotoblas -> dtrmm_kernel_LT #define DTRMM_KERNEL_LR gotoblas -> dtrmm_kernel_LN #define DTRMM_KERNEL_LC gotoblas -> dtrmm_kernel_LT #define DTRMM_KERNEL_RN gotoblas -> dtrmm_kernel_RN #define DTRMM_KERNEL_RT gotoblas -> dtrmm_kernel_RT #define DTRMM_KERNEL_RR gotoblas -> dtrmm_kernel_RN #define DTRMM_KERNEL_RC gotoblas -> dtrmm_kernel_RT #define DTRSM_KERNEL_LN gotoblas -> dtrsm_kernel_LN #define DTRSM_KERNEL_LT gotoblas -> dtrsm_kernel_LT #define DTRSM_KERNEL_LR gotoblas -> dtrsm_kernel_LN #define DTRSM_KERNEL_LC gotoblas -> dtrsm_kernel_LT #define DTRSM_KERNEL_RN gotoblas -> dtrsm_kernel_RN #define DTRSM_KERNEL_RT gotoblas -> dtrsm_kernel_RT #define DTRSM_KERNEL_RR gotoblas -> dtrsm_kernel_RN #define DTRSM_KERNEL_RC gotoblas -> dtrsm_kernel_RT #define DSYMM_IUTCOPY gotoblas -> dsymm_iutcopy #define DSYMM_ILTCOPY gotoblas -> dsymm_iltcopy #define DSYMM_OUTCOPY gotoblas -> dsymm_outcopy #define DSYMM_OLTCOPY gotoblas -> dsymm_oltcopy #define DNEG_TCOPY gotoblas -> dneg_tcopy #define DLASWP_NCOPY gotoblas -> dlaswp_ncopy #define DAXPBY_K gotoblas -> daxpby_k #define DOMATCOPY_K_CN gotoblas -> domatcopy_k_cn #define DOMATCOPY_K_RN gotoblas -> domatcopy_k_rn #define DOMATCOPY_K_CT gotoblas -> domatcopy_k_ct #define DOMATCOPY_K_RT gotoblas -> domatcopy_k_rt #define DIMATCOPY_K_CN gotoblas -> dimatcopy_k_cn #define DIMATCOPY_K_RN gotoblas -> dimatcopy_k_rn #define DIMATCOPY_K_CT gotoblas -> dimatcopy_k_ct #define DIMATCOPY_K_RT gotoblas -> dimatcopy_k_rt #define DGEADD_K gotoblas -> dgeadd_k #endif #define DGEMM_NN dgemm_nn #define DGEMM_CN dgemm_tn #define DGEMM_TN dgemm_tn #define DGEMM_NC dgemm_nt #define DGEMM_NT dgemm_nt #define DGEMM_CC dgemm_tt #define DGEMM_CT dgemm_tt #define DGEMM_TC dgemm_tt #define DGEMM_TT dgemm_tt #define DGEMM_NR dgemm_nn #define DGEMM_TR dgemm_tn #define DGEMM_CR dgemm_tn #define DGEMM_RN dgemm_nn #define DGEMM_RT dgemm_nt #define DGEMM_RC dgemm_nt #define DGEMM_RR dgemm_nn #define DSYMM_LU dsymm_LU #define DSYMM_LL dsymm_LL #define DSYMM_RU dsymm_RU #define DSYMM_RL dsymm_RL #define DHEMM_LU dhemm_LU #define DHEMM_LL dhemm_LL #define DHEMM_RU dhemm_RU #define DHEMM_RL dhemm_RL #define DSYRK_UN dsyrk_UN #define DSYRK_UT dsyrk_UT #define DSYRK_LN dsyrk_LN #define DSYRK_LT dsyrk_LT #define DSYRK_UR dsyrk_UN #define DSYRK_UC dsyrk_UT #define DSYRK_LR dsyrk_LN #define DSYRK_LC dsyrk_LT #define DSYRK_KERNEL_U dsyrk_kernel_U #define DSYRK_KERNEL_L dsyrk_kernel_L #define DHERK_UN dsyrk_UN #define DHERK_LN dsyrk_LN #define DHERK_UC dsyrk_UT #define DHERK_LC dsyrk_LT #define DHER2K_UN dsyr2k_UN #define DHER2K_LN dsyr2k_LN #define DHER2K_UC dsyr2k_UT #define DHER2K_LC dsyr2k_LT #define DSYR2K_UN dsyr2k_UN #define DSYR2K_UT dsyr2k_UT #define DSYR2K_LN dsyr2k_LN #define DSYR2K_LT dsyr2k_LT #define DSYR2K_UR dsyr2k_UN #define DSYR2K_UC dsyr2k_UT #define DSYR2K_LR dsyr2k_LN #define DSYR2K_LC dsyr2k_LT #define DSYR2K_KERNEL_U dsyr2k_kernel_U 
#define DSYR2K_KERNEL_L dsyr2k_kernel_L #define DTRMM_LNUU dtrmm_LNUU #define DTRMM_LNUN dtrmm_LNUN #define DTRMM_LNLU dtrmm_LNLU #define DTRMM_LNLN dtrmm_LNLN #define DTRMM_LTUU dtrmm_LTUU #define DTRMM_LTUN dtrmm_LTUN #define DTRMM_LTLU dtrmm_LTLU #define DTRMM_LTLN dtrmm_LTLN #define DTRMM_LRUU dtrmm_LNUU #define DTRMM_LRUN dtrmm_LNUN #define DTRMM_LRLU dtrmm_LNLU #define DTRMM_LRLN dtrmm_LNLN #define DTRMM_LCUU dtrmm_LTUU #define DTRMM_LCUN dtrmm_LTUN #define DTRMM_LCLU dtrmm_LTLU #define DTRMM_LCLN dtrmm_LTLN #define DTRMM_RNUU dtrmm_RNUU #define DTRMM_RNUN dtrmm_RNUN #define DTRMM_RNLU dtrmm_RNLU #define DTRMM_RNLN dtrmm_RNLN #define DTRMM_RTUU dtrmm_RTUU #define DTRMM_RTUN dtrmm_RTUN #define DTRMM_RTLU dtrmm_RTLU #define DTRMM_RTLN dtrmm_RTLN #define DTRMM_RRUU dtrmm_RNUU #define DTRMM_RRUN dtrmm_RNUN #define DTRMM_RRLU dtrmm_RNLU #define DTRMM_RRLN dtrmm_RNLN #define DTRMM_RCUU dtrmm_RTUU #define DTRMM_RCUN dtrmm_RTUN #define DTRMM_RCLU dtrmm_RTLU #define DTRMM_RCLN dtrmm_RTLN #define DTRSM_LNUU dtrsm_LNUU #define DTRSM_LNUN dtrsm_LNUN #define DTRSM_LNLU dtrsm_LNLU #define DTRSM_LNLN dtrsm_LNLN #define DTRSM_LTUU dtrsm_LTUU #define DTRSM_LTUN dtrsm_LTUN #define DTRSM_LTLU dtrsm_LTLU #define DTRSM_LTLN dtrsm_LTLN #define DTRSM_LRUU dtrsm_LNUU #define DTRSM_LRUN dtrsm_LNUN #define DTRSM_LRLU dtrsm_LNLU #define DTRSM_LRLN dtrsm_LNLN #define DTRSM_LCUU dtrsm_LTUU #define DTRSM_LCUN dtrsm_LTUN #define DTRSM_LCLU dtrsm_LTLU #define DTRSM_LCLN dtrsm_LTLN #define DTRSM_RNUU dtrsm_RNUU #define DTRSM_RNUN dtrsm_RNUN #define DTRSM_RNLU dtrsm_RNLU #define DTRSM_RNLN dtrsm_RNLN #define DTRSM_RTUU dtrsm_RTUU #define DTRSM_RTUN dtrsm_RTUN #define DTRSM_RTLU dtrsm_RTLU #define DTRSM_RTLN dtrsm_RTLN #define DTRSM_RRUU dtrsm_RNUU #define DTRSM_RRUN dtrsm_RNUN #define DTRSM_RRLU dtrsm_RNLU #define DTRSM_RRLN dtrsm_RNLN #define DTRSM_RCUU dtrsm_RTUU #define DTRSM_RCUN dtrsm_RTUN #define DTRSM_RCLU dtrsm_RTLU #define DTRSM_RCLN dtrsm_RTLN #define DGEMM_THREAD_NN dgemm_thread_nn #define DGEMM_THREAD_CN dgemm_thread_tn #define DGEMM_THREAD_TN dgemm_thread_tn #define DGEMM_THREAD_NC dgemm_thread_nt #define DGEMM_THREAD_NT dgemm_thread_nt #define DGEMM_THREAD_CC dgemm_thread_tt #define DGEMM_THREAD_CT dgemm_thread_tt #define DGEMM_THREAD_TC dgemm_thread_tt #define DGEMM_THREAD_TT dgemm_thread_tt #define DGEMM_THREAD_NR dgemm_thread_nn #define DGEMM_THREAD_TR dgemm_thread_tn #define DGEMM_THREAD_CR dgemm_thread_tn #define DGEMM_THREAD_RN dgemm_thread_nn #define DGEMM_THREAD_RT dgemm_thread_nt #define DGEMM_THREAD_RC dgemm_thread_nt #define DGEMM_THREAD_RR dgemm_thread_nn #define DSYMM_THREAD_LU dsymm_thread_LU #define DSYMM_THREAD_LL dsymm_thread_LL #define DSYMM_THREAD_RU dsymm_thread_RU #define DSYMM_THREAD_RL dsymm_thread_RL #define DHEMM_THREAD_LU dhemm_thread_LU #define DHEMM_THREAD_LL dhemm_thread_LL #define DHEMM_THREAD_RU dhemm_thread_RU #define DHEMM_THREAD_RL dhemm_thread_RL #define DSYRK_THREAD_UN dsyrk_thread_UN #define DSYRK_THREAD_UT dsyrk_thread_UT #define DSYRK_THREAD_LN dsyrk_thread_LN #define DSYRK_THREAD_LT dsyrk_thread_LT #define DSYRK_THREAD_UR dsyrk_thread_UN #define DSYRK_THREAD_UC dsyrk_thread_UT #define DSYRK_THREAD_LR dsyrk_thread_LN #define DSYRK_THREAD_LC dsyrk_thread_LT #define DHERK_THREAD_UN dsyrk_thread_UN #define DHERK_THREAD_UT dsyrk_thread_UT #define DHERK_THREAD_LN dsyrk_thread_LN #define DHERK_THREAD_LT dsyrk_thread_LT #define DHERK_THREAD_UR dsyrk_thread_UN #define DHERK_THREAD_UC dsyrk_thread_UT #define DHERK_THREAD_LR dsyrk_thread_LN #define DHERK_THREAD_LC 
dsyrk_thread_LT #endif OpenBLAS-0.2.20/common_ia64.h000066400000000000000000000237451313527062700154430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #ifndef COMMON_IA64 #define COMMON_IA64 #ifndef ASSEMBLER #ifndef MAP_WRITECOMBINED #define MAP_WRITECOMBINED 0x10000 #endif #define MB #define WMB #ifdef __ECC #include #endif #define RPCC64BIT #ifndef __ECC static __inline void blas_lock(volatile unsigned long *address){ unsigned long ret; do { while (*address) {YIELDING;}; __asm__ __volatile__ ("mov ar.ccv=r0\n;;\n" "cmpxchg4.acq %0=[%2],%1,ar.ccv\n" : "=r"(ret) : "r"(1), "r"(address) : "ar.ccv", "memory"); } while (ret); } #define BLAS_LOCK_DEFINED static __inline unsigned long rpcc(void) { unsigned long clocks; __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(clocks)); return clocks; } #define RPCC_DEFINED static __inline unsigned long stmxcsr(void){ unsigned long fp; __asm__ __volatile__ ("mov.m %0=ar.fpsr" : "=r" (fp)); return fp; } static __inline void ldmxcsr(unsigned long fp) { __asm__ __volatile__ ("mov.m ar.fpsr=%0" :: "r" (fp)); } #define GET_IMAGE(res) asm __volatile__("mov %0 = f9" : "=f"(res) : : "memory") #else static __inline void blas_lock(volatile unsigned long *address){ while (*address || _InterlockedCompareExchange((volatile int *) address,1,0)) ; } #define BLAS_LOCK_DEFINED static __inline unsigned int rpcc(void) { return __getReg(_IA64_REG_AR_ITC); } #define RPCC_DEFINED static __inline unsigned int stmxcsr(void) { return __getReg(_IA64_REG_AR_FPSR); } static __inline void ldmxcsr(unsigned long fp) { return __setReg(_IA64_REG_AR_FPSR, fp); } #ifdef DOUBLE #define GET_IMAGE(res) __stfd(&res, 9) #else #define GET_IMAGE(res) __stfs(&res, 9) #endif #endif #define GET_IMAGE_CANCEL #ifdef ENABLE_SSE_EXCEPTION #define IDEBUG_START \ { \ unsigned long fp_sse_mode, new_fp_mode; \ fp_sse_mode = stmxcsr();\ new_fp_mode = (fp_sse_mode & ~(FE_UNDERFLOW | FE_OVERFLOW | FE_UNNORMAL | FE_INVALID));\ ldmxcsr(new_fp_mode); #define IDEBUG_END \ ldmxcsr(fp_sse_mode); \ } #endif #ifdef SMP #ifdef USE64BITINT /* 64bit version */ extern unsigned long blas_quick_divide_table[]; #ifndef __ECC static __inline long blas_quickdivide(unsigned long int x, unsigned long int y){ unsigned long ret; if (y <= 1) return x; __asm__ __volatile__("setf.sig f6 = %1\n\t" "ldf8 f7 = [%2];;\n\t" "xmpy.hu f6= f6, f7;;\n\t" "getf.sig %0 = f6;;\n" : "=r"(ret) : "r"(x), "r"(&blas_quick_divide_table[y]) : "f6", "f7" ); return ret; } #else /* Using Intel Compiler */ static __inline long blas_quickdivide(unsigned long int x, unsigned long int y){ if (y <= 1) return x; return _m64_xmahu(x, blas_quick_divide_table[y], 0); } #endif #else /* 32bit version */ extern unsigned int blas_quick_divide_table[]; static __inline int blas_quickdivide(unsigned int x, unsigned int y){ if (y <= 1) return x; return (int)((x * (unsigned long)blas_quick_divide_table[y]) >> 32); } #endif #endif #endif #if 0 #ifdef DOUBLE #define GEMM_NCOPY dgemm_ncopy #define GEMM_TCOPY dgemm_tcopy #define ZGEMM_NCOPY zgemm_ncopy #define ZGEMM_TCOPY zgemm_tcopy #define GEMM_KERNEL dgemm_kernel #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ZGEMM_KERNEL zgemm_kernel_n #endif #if defined(CN) || defined(CT) || defined(RN) || defined(RT) #define ZGEMM_KERNEL zgemm_kernel_l #endif #if defined(NC) || defined(TC) || defined(NR) || defined(TR) #define ZGEMM_KERNEL zgemm_kernel_r #endif #if defined(CC) || defined(CR) || defined(RC) || defined(RR) #define ZGEMM_KERNEL zgemm_kernel_b #endif #else #define GEMM_NCOPY sgemm_ncopy #define GEMM_TCOPY sgemm_tcopy #define ZGEMM_NCOPY cgemm_ncopy #define ZGEMM_TCOPY cgemm_tcopy 
#define GEMM_KERNEL sgemm_kernel #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ZGEMM_KERNEL cgemm_kernel_n #endif #if defined(CN) || defined(CT) || defined(RN) || defined(RT) #define ZGEMM_KERNEL cgemm_kernel_l #endif #if defined(NC) || defined(TC) || defined(NR) || defined(TR) #define ZGEMM_KERNEL cgemm_kernel_r #endif #if defined(CC) || defined(CR) || defined(RC) || defined(RR) #define ZGEMM_KERNEL cgemm_kernel_b #endif #endif #endif #ifdef USE64BITINT #define LDINT ld8 #define INTSIZE 8 #define CMP4GE cmp.ge #define CMP4NE cmp.ge #define CMP4EQ cmp.eq #else #define LDINT ld4 #define INTSIZE 4 #define CMP4GE cmp4.ge #define CMP4NE cmp4.ne #define CMP4EQ cmp4.eq #endif #define HALT mov r0 = 0 #ifdef XDOUBLE #define LD8 ld8 #define ST8 st8 #define LDFD ldfe #define LDFPD ldfpe #define LDFD_T1 ldfe.t1 #define LDFD_NT1 ldfe.nt1 #define LDFD_NT2 ldfe.nt2 #define LDFD_NTA ldfe.nta #define LDFPD_NT1 ldfpe.nt1 #define LDFPD_NT2 ldfpe.nt2 #define LDFPD_NTA ldfpe.nta #define STFD stfe #define STFD_NTA stfe.nta #define FADD fadd #define FSUB fsub #define FMPY fmpy #define FMA fma #define FMS fms #define FNMA fnma #define FPMA fpma #define SETF setf.d #elif defined(DOUBLE) #define LD8 ld8 #define ST8 st8 #define LDF8 ldf8 #define LDF8_NT1 ldf8.nt1 #define LDF8_NTA ldf8.nta #define STF8 stf8 #define STF8_NTA stf8.nta #define LDFD ldfd #define LDFPD ldfpd #define LDFD_T1 ldfd.t1 #define LDFD_NT1 ldfd.nt1 #define LDFD_NT2 ldfd.nt2 #define LDFD_NTA ldfd.nta #define LDFPD_NT1 ldfpd.nt1 #define LDFPD_NT2 ldfpd.nt2 #define LDFPD_NTA ldfpd.nta #define STFD stfd #define STFD_NTA stfd.nta #define FADD fadd.d #define FSUB fsub.d #define FMPY fmpy.d #define FMA fma.d #define FMS fms.d #define FNMA fnma.d #define FPMA fpma.d #define SETF setf.d #else #define LD8 ld4 #define ST8 st4 #define LDF8 ldfs #define LDF8_NT1 ldfs.nt1 #define LDF8_NTA ldfs.nta #define STF8 stfs #define STF8_NTA stfs.nta #define LDFD ldfs #define LDFPD ldfps #define LDFD_T1 ldfs.t1 #define LDFD_NT1 ldfs.nt1 #define LDFD_NT2 ldfs.nt2 #define LDFD_NTA ldfs.nta #define LDFPD_NT1 ldfps.nt1 #define LDFPD_NT2 ldfps.nt2 #define LDFPD_NTA ldfps.nta #define STFD stfs #define STFD_NTA stfs.nta #if 0 #define FADD fadd.s #define FSUB fsub.s #define FMPY fmpy.s #define FMA fma.s #define FMS fms.s #define FNMA fnma.s #define FPMA fpma.s #else #define FADD fadd #define FSUB fsub #define FMPY fmpy #define FMA fma #define FMS fms #define FNMA fnma #define FPMA fpma #endif #define SETF setf.s #endif #ifndef F_INTERFACE #define REALNAME ASMNAME #else #define REALNAME ASMFNAME #endif #ifdef F_INTERFACE_G77 #define RETURN_BY_STACK #endif #ifdef F_INTERFACE_G95 #define RETURN_BY_STACK #endif #ifdef F_INTERFACE_GFORT #define RETURN_BY_REGS #endif #ifdef F_INTERFACE_INTEL #define RETURN_BY_STACK #endif #define PROLOGUE \ .explicit; \ .text; \ .align 128; \ .global REALNAME; \ .proc REALNAME; \ REALNAME: #ifdef PROFILE #define PROFCODE \ .data; \ .align 8; \ .LP0:; \ data8 0; \ .text; \ alloc out0 = ar.pfs, 8, 0, 4, 0; \ mov out1 = r1; \ mov out2 = b0; \ addl out3 = @ltoff(.LP0), r1;;; \ br.call.sptk.many b0 = _mcount;; #else #define PROFCODE #endif #if defined(__linux__) && defined(__ELF__) #define GNUSTACK .section .note.GNU-stack,"",@progbits #else #define GNUSTACK #endif #define EPILOGUE \ .endp REALNAME ; \ GNUSTACK #define START_ADDRESS 0x20000fc800000000UL #undef SEEK_ADDRESS #if 0 #ifdef CONFIG_IA64_PAGE_SIZE_4KB #define SEEK_ADDRESS #endif #ifdef CONFIG_IA64_PAGE_SIZE_8KB #define SEEK_ADDRESS #endif #endif #define BUFFER_SIZE (128 
<< 20) #ifndef PAGESIZE #define PAGESIZE (16UL << 10) #endif #define HUGE_PAGESIZE ( 4 << 20) #define BASE_ADDRESS (START_ADDRESS - (BLASULONG)BUFFER_SIZE * MAX_CPU_NUMBER) #endif OpenBLAS-0.2.20/common_interface.h000066400000000000000000001342421313527062700166330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #ifndef ASSEMBLER #ifdef __cplusplus extern "C" { /* Assume C declarations for C++ */ #endif /* __cplusplus */ int BLASFUNC(xerbla)(char *, blasint *info, blasint); void openblas_set_num_threads_(int *); FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); double BLASFUNC(dsdot) (blasint *, float *, blasint *, float *, blasint *); double BLASFUNC(ddot) (blasint *, double *, blasint *, double *, blasint *); xdouble BLASFUNC(qdot) (blasint *, xdouble *, blasint *, xdouble *, blasint *); #ifdef RETURN_BY_STRUCT typedef struct { float r, i; } myccomplex_t; typedef struct { double r, i; } myzcomplex_t; typedef struct { xdouble r, i; } myxcomplex_t; myccomplex_t BLASFUNC(cdotu) (blasint *, float *, blasint *, float *, blasint *); myccomplex_t BLASFUNC(cdotc) (blasint *, float *, blasint *, float *, blasint *); myzcomplex_t BLASFUNC(zdotu) (blasint *, double *, blasint *, double *, blasint *); myzcomplex_t BLASFUNC(zdotc) (blasint *, double *, blasint *, double *, blasint *); myxcomplex_t BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, blasint *); myxcomplex_t BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *); #elif defined RETURN_BY_STACK void BLASFUNC(cdotu) (openblas_complex_float *, blasint *, float * , blasint *, float *, blasint *); void BLASFUNC(cdotc) (openblas_complex_float *, blasint *, float *, blasint *, float *, blasint *); void BLASFUNC(zdotu) (openblas_complex_double *, blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(zdotc) (openblas_complex_double *, blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(xdotu) (openblas_complex_xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(xdotc) (openblas_complex_xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); #else openblas_complex_float BLASFUNC(cdotu) (blasint *, float *, blasint *, float *, blasint *); openblas_complex_float BLASFUNC(cdotc) (blasint *, float *, blasint *, float *, blasint *); openblas_complex_double BLASFUNC(zdotu) (blasint *, double *, blasint *, double *, blasint *); openblas_complex_double BLASFUNC(zdotc) (blasint *, double *, blasint *, double *, blasint *); openblas_complex_xdouble BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, blasint *); openblas_complex_xdouble BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *); #endif void BLASFUNC(saxpy) (blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC(daxpy) (blasint *, double *, double *, blasint *, double *, blasint *); void BLASFUNC(qaxpy) (blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(caxpy) (blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC(zaxpy) (blasint *, double *, double *, blasint *, double *, blasint *); void BLASFUNC(xaxpy) (blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(caxpyc)(blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC(zaxpyc)(blasint *, double *, double *, blasint *, double *, blasint *); void BLASFUNC(xaxpyc)(blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(scopy) (blasint *, float *, blasint *, float *, blasint *); void BLASFUNC(dcopy) (blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(qcopy) (blasint *, xdouble *, blasint *, 
xdouble *, blasint *); void BLASFUNC(ccopy) (blasint *, float *, blasint *, float *, blasint *); void BLASFUNC(zcopy) (blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(xcopy) (blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(sswap) (blasint *, float *, blasint *, float *, blasint *); void BLASFUNC(dswap) (blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(qswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(cswap) (blasint *, float *, blasint *, float *, blasint *); void BLASFUNC(zswap) (blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(xswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *); FLOATRET BLASFUNC(sasum) (blasint *, float *, blasint *); FLOATRET BLASFUNC(scasum)(blasint *, float *, blasint *); double BLASFUNC(dasum) (blasint *, double *, blasint *); xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *); double BLASFUNC(dzasum)(blasint *, double *, blasint *); xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *); blasint BLASFUNC(isamax)(blasint *, float *, blasint *); blasint BLASFUNC(idamax)(blasint *, double *, blasint *); blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *); blasint BLASFUNC(icamax)(blasint *, float *, blasint *); blasint BLASFUNC(izamax)(blasint *, double *, blasint *); blasint BLASFUNC(ixamax)(blasint *, xdouble *, blasint *); blasint BLASFUNC(ismax) (blasint *, float *, blasint *); blasint BLASFUNC(idmax) (blasint *, double *, blasint *); blasint BLASFUNC(iqmax) (blasint *, xdouble *, blasint *); blasint BLASFUNC(icmax) (blasint *, float *, blasint *); blasint BLASFUNC(izmax) (blasint *, double *, blasint *); blasint BLASFUNC(ixmax) (blasint *, xdouble *, blasint *); blasint BLASFUNC(isamin)(blasint *, float *, blasint *); blasint BLASFUNC(idamin)(blasint *, double *, blasint *); blasint BLASFUNC(iqamin)(blasint *, xdouble *, blasint *); blasint BLASFUNC(icamin)(blasint *, float *, blasint *); blasint BLASFUNC(izamin)(blasint *, double *, blasint *); blasint BLASFUNC(ixamin)(blasint *, xdouble *, blasint *); blasint BLASFUNC(ismin)(blasint *, float *, blasint *); blasint BLASFUNC(idmin)(blasint *, double *, blasint *); blasint BLASFUNC(iqmin)(blasint *, xdouble *, blasint *); blasint BLASFUNC(icmin)(blasint *, float *, blasint *); blasint BLASFUNC(izmin)(blasint *, double *, blasint *); blasint BLASFUNC(ixmin)(blasint *, xdouble *, blasint *); FLOATRET BLASFUNC(samax) (blasint *, float *, blasint *); double BLASFUNC(damax) (blasint *, double *, blasint *); xdouble BLASFUNC(qamax) (blasint *, xdouble *, blasint *); FLOATRET BLASFUNC(scamax)(blasint *, float *, blasint *); double BLASFUNC(dzamax)(blasint *, double *, blasint *); xdouble BLASFUNC(qxamax)(blasint *, xdouble *, blasint *); FLOATRET BLASFUNC(samin) (blasint *, float *, blasint *); double BLASFUNC(damin) (blasint *, double *, blasint *); xdouble BLASFUNC(qamin) (blasint *, xdouble *, blasint *); FLOATRET BLASFUNC(scamin)(blasint *, float *, blasint *); double BLASFUNC(dzamin)(blasint *, double *, blasint *); xdouble BLASFUNC(qxamin)(blasint *, xdouble *, blasint *); FLOATRET BLASFUNC(smax) (blasint *, float *, blasint *); double BLASFUNC(dmax) (blasint *, double *, blasint *); xdouble BLASFUNC(qmax) (blasint *, xdouble *, blasint *); FLOATRET BLASFUNC(scmax) (blasint *, float *, blasint *); double BLASFUNC(dzmax) (blasint *, double *, blasint *); xdouble BLASFUNC(qxmax) (blasint *, xdouble *, blasint *); FLOATRET BLASFUNC(smin) (blasint *, float *, blasint *); double 
BLASFUNC(dmin) (blasint *, double *, blasint *); xdouble BLASFUNC(qmin) (blasint *, xdouble *, blasint *); FLOATRET BLASFUNC(scmin) (blasint *, float *, blasint *); double BLASFUNC(dzmin) (blasint *, double *, blasint *); xdouble BLASFUNC(qxmin) (blasint *, xdouble *, blasint *); void BLASFUNC(sscal) (blasint *, float *, float *, blasint *); void BLASFUNC(dscal) (blasint *, double *, double *, blasint *); void BLASFUNC(qscal) (blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(cscal) (blasint *, float *, float *, blasint *); void BLASFUNC(zscal) (blasint *, double *, double *, blasint *); void BLASFUNC(xscal) (blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(csscal)(blasint *, float *, float *, blasint *); void BLASFUNC(zdscal)(blasint *, double *, double *, blasint *); void BLASFUNC(xqscal)(blasint *, xdouble *, xdouble *, blasint *); FLOATRET BLASFUNC(snrm2) (blasint *, float *, blasint *); FLOATRET BLASFUNC(scnrm2)(blasint *, float *, blasint *); double BLASFUNC(dnrm2) (blasint *, double *, blasint *); xdouble BLASFUNC(qnrm2) (blasint *, xdouble *, blasint *); double BLASFUNC(dznrm2)(blasint *, double *, blasint *); xdouble BLASFUNC(qxnrm2)(blasint *, xdouble *, blasint *); void BLASFUNC(srot) (blasint *, float *, blasint *, float *, blasint *, float *, float *); void BLASFUNC(drot) (blasint *, double *, blasint *, double *, blasint *, double *, double *); void BLASFUNC(qrot) (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *); void BLASFUNC(csrot) (blasint *, float *, blasint *, float *, blasint *, float *, float *); void BLASFUNC(zdrot) (blasint *, double *, blasint *, double *, blasint *, double *, double *); void BLASFUNC(xqrot) (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *); void BLASFUNC(srotg) (float *, float *, float *, float *); void BLASFUNC(drotg) (double *, double *, double *, double *); void BLASFUNC(qrotg) (xdouble *, xdouble *, xdouble *, xdouble *); void BLASFUNC(crotg) (float *, float *, float *, float *); void BLASFUNC(zrotg) (double *, double *, double *, double *); void BLASFUNC(xrotg) (xdouble *, xdouble *, xdouble *, xdouble *); void BLASFUNC(srotmg)(float *, float *, float *, float *, float *); void BLASFUNC(drotmg)(double *, double *, double *, double *, double *); void BLASFUNC(srotm) (blasint *, float *, blasint *, float *, blasint *, float *); void BLASFUNC(drotm) (blasint *, double *, blasint *, double *, blasint *, double *); void BLASFUNC(qrotm) (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *); /* Level 2 routines */ void BLASFUNC(sger)(blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *); void BLASFUNC(dger)(blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(qger)(blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(cgeru)(blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *); void BLASFUNC(cgerc)(blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *); void BLASFUNC(zgeru)(blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(zgerc)(blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(xgeru)(blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(xgerc)(blasint *, 
blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(sgemv)(char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(dgemv)(char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(qgemv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(cgemv)(char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(zgemv)(char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(xgemv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(strsv) (char *, char *, char *, blasint *, float *, blasint *, float *, blasint *); void BLASFUNC(dtrsv) (char *, char *, char *, blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(qtrsv) (char *, char *, char *, blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(ctrsv) (char *, char *, char *, blasint *, float *, blasint *, float *, blasint *); void BLASFUNC(ztrsv) (char *, char *, char *, blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(xtrsv) (char *, char *, char *, blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(strmv) (char *, char *, char *, blasint *, float *, blasint *, float *, blasint *); void BLASFUNC(dtrmv) (char *, char *, char *, blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(qtrmv) (char *, char *, char *, blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(ctrmv) (char *, char *, char *, blasint *, float *, blasint *, float *, blasint *); void BLASFUNC(ztrmv) (char *, char *, char *, blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(xtrmv) (char *, char *, char *, blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(stpsv) (char *, char *, char *, blasint *, float *, float *, blasint *); void BLASFUNC(dtpsv) (char *, char *, char *, blasint *, double *, double *, blasint *); void BLASFUNC(qtpsv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(ctpsv) (char *, char *, char *, blasint *, float *, float *, blasint *); void BLASFUNC(ztpsv) (char *, char *, char *, blasint *, double *, double *, blasint *); void BLASFUNC(xtpsv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(stpmv) (char *, char *, char *, blasint *, float *, float *, blasint *); void BLASFUNC(dtpmv) (char *, char *, char *, blasint *, double *, double *, blasint *); void BLASFUNC(qtpmv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(ctpmv) (char *, char *, char *, blasint *, float *, float *, blasint *); void BLASFUNC(ztpmv) (char *, char *, char *, blasint *, double *, double *, blasint *); void BLASFUNC(xtpmv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(stbmv) (char *, char *, char *, blasint *, blasint *, float *, blasint *, float *, blasint *); void BLASFUNC(dtbmv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(qtbmv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(ctbmv) (char *, char *, char *, blasint 
*, blasint *, float *, blasint *, float *, blasint *); void BLASFUNC(ztbmv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(xtbmv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(stbsv) (char *, char *, char *, blasint *, blasint *, float *, blasint *, float *, blasint *); void BLASFUNC(dtbsv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(qtbsv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(ctbsv) (char *, char *, char *, blasint *, blasint *, float *, blasint *, float *, blasint *); void BLASFUNC(ztbsv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(xtbsv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(ssymv) (char *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(dsymv) (char *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(qsymv) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(csymv) (char *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(zsymv) (char *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(xsymv) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(sspmv) (char *, blasint *, float *, float *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(dspmv) (char *, blasint *, double *, double *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(qspmv) (char *, blasint *, xdouble *, xdouble *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(cspmv) (char *, blasint *, float *, float *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(zspmv) (char *, blasint *, double *, double *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(xspmv) (char *, blasint *, xdouble *, xdouble *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(ssyr) (char *, blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC(dsyr) (char *, blasint *, double *, double *, blasint *, double *, blasint *); void BLASFUNC(qsyr) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(csyr) (char *, blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC(zsyr) (char *, blasint *, double *, double *, blasint *, double *, blasint *); void BLASFUNC(xsyr) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(ssyr2) (char *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *); void BLASFUNC(dsyr2) (char *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(qsyr2) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(csyr2) (char *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *); void BLASFUNC(zsyr2) (char *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(xsyr2) (char *, blasint *, 
xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(sspr) (char *, blasint *, float *, float *, blasint *, float *); void BLASFUNC(dspr) (char *, blasint *, double *, double *, blasint *, double *); void BLASFUNC(qspr) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *); void BLASFUNC(cspr) (char *, blasint *, float *, float *, blasint *, float *); void BLASFUNC(zspr) (char *, blasint *, double *, double *, blasint *, double *); void BLASFUNC(xspr) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *); void BLASFUNC(sspr2) (char *, blasint *, float *, float *, blasint *, float *, blasint *, float *); void BLASFUNC(dspr2) (char *, blasint *, double *, double *, blasint *, double *, blasint *, double *); void BLASFUNC(qspr2) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *); void BLASFUNC(cspr2) (char *, blasint *, float *, float *, blasint *, float *, blasint *, float *); void BLASFUNC(zspr2) (char *, blasint *, double *, double *, blasint *, double *, blasint *, double *); void BLASFUNC(xspr2) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *); void BLASFUNC(cher) (char *, blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC(zher) (char *, blasint *, double *, double *, blasint *, double *, blasint *); void BLASFUNC(xher) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(chpr) (char *, blasint *, float *, float *, blasint *, float *); void BLASFUNC(zhpr) (char *, blasint *, double *, double *, blasint *, double *); void BLASFUNC(xhpr) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *); void BLASFUNC(cher2) (char *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *); void BLASFUNC(zher2) (char *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(xher2) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(chpr2) (char *, blasint *, float *, float *, blasint *, float *, blasint *, float *); void BLASFUNC(zhpr2) (char *, blasint *, double *, double *, blasint *, double *, blasint *, double *); void BLASFUNC(xhpr2) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *); void BLASFUNC(chemv) (char *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(zhemv) (char *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(xhemv) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(chpmv) (char *, blasint *, float *, float *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(zhpmv) (char *, blasint *, double *, double *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(xhpmv) (char *, blasint *, xdouble *, xdouble *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); int BLASFUNC(snorm)(char *, blasint *, blasint *, float *, blasint *); int BLASFUNC(dnorm)(char *, blasint *, blasint *, double *, blasint *); int BLASFUNC(cnorm)(char *, blasint *, blasint *, float *, blasint *); int BLASFUNC(znorm)(char *, blasint *, blasint *, double *, blasint *); void BLASFUNC(sgbmv)(char *, blasint *, blasint *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void 
BLASFUNC(dgbmv)(char *, blasint *, blasint *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(qgbmv)(char *, blasint *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(cgbmv)(char *, blasint *, blasint *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(zgbmv)(char *, blasint *, blasint *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(xgbmv)(char *, blasint *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(ssbmv)(char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(dsbmv)(char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(qsbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(csbmv)(char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(zsbmv)(char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(xsbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(chbmv)(char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(zhbmv)(char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); /* Level 3 routines */ void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(dgemm)(char *, char *, blasint *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(qgemm)(char *, char *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(cgemm)(char *, char *, blasint *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(zgemm)(char *, char *, blasint *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(xgemm)(char *, char *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(cgemm3m)(char *, char *, blasint *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, 
blasint *); int BLASFUNC(dge2mm)(char *, char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); int BLASFUNC(cge2mm)(char *, char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); int BLASFUNC(zge2mm)(char *, char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(strsm)(char *, char *, char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC(dtrsm)(char *, char *, char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *); void BLASFUNC(qtrsm)(char *, char *, char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(ctrsm)(char *, char *, char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC(ztrsm)(char *, char *, char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *); void BLASFUNC(xtrsm)(char *, char *, char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(strmm)(char *, char *, char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC(dtrmm)(char *, char *, char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *); void BLASFUNC(qtrmm)(char *, char *, char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(ctrmm)(char *, char *, char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC(ztrmm)(char *, char *, char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *); void BLASFUNC(xtrmm)(char *, char *, char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(ssymm)(char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(dsymm)(char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(qsymm)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(csymm)(char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(zsymm)(char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(xsymm)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(csymm3m)(char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(zsymm3m)(char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(xsymm3m)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(ssyrk)(char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(dsyrk)(char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(qsyrk)(char 
*, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(csyrk)(char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(zsyrk)(char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(xsyrk)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(ssyr2k)(char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(dsyr2k)(char *, char *, blasint *, blasint *, double *, double *, blasint *, double*, blasint *, double *, double *, blasint *); void BLASFUNC(qsyr2k)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble*, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(csyr2k)(char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(zsyr2k)(char *, char *, blasint *, blasint *, double *, double *, blasint *, double*, blasint *, double *, double *, blasint *); void BLASFUNC(xsyr2k)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble*, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(chemm)(char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(zhemm)(char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(xhemm)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(chemm3m)(char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(zhemm3m)(char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(xhemm3m)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(cherk)(char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(zherk)(char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(xherk)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(cher2k)(char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(zher2k)(char *, char *, blasint *, blasint *, double *, double *, blasint *, double*, blasint *, double *, double *, blasint *); void BLASFUNC(xher2k)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble*, blasint *, xdouble *, xdouble *, blasint *); int BLASFUNC(cher2m)(char *, char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); int BLASFUNC(zher2m)(char *, char *, char *, blasint *, blasint *, double *, double *, blasint *, double*, blasint *, double *, double *, blasint *); int BLASFUNC(xher2m)(char *, char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble*, blasint *, xdouble *, xdouble *, blasint *); int BLASFUNC(sgemt)(char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *); int 
BLASFUNC(dgemt)(char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *); int BLASFUNC(cgemt)(char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *); int BLASFUNC(zgemt)(char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *); int BLASFUNC(sgema)(char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint *, float *, blasint *); int BLASFUNC(dgema)(char *, char *, blasint *, blasint *, double *, double *, blasint *, double*, double *, blasint *, double*, blasint *); int BLASFUNC(cgema)(char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint *, float *, blasint *); int BLASFUNC(zgema)(char *, char *, blasint *, blasint *, double *, double *, blasint *, double*, double *, blasint *, double*, blasint *); int BLASFUNC(sgems)(char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint *, float *, blasint *); int BLASFUNC(dgems)(char *, char *, blasint *, blasint *, double *, double *, blasint *, double*, double *, blasint *, double*, blasint *); int BLASFUNC(cgems)(char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint *, float *, blasint *); int BLASFUNC(zgems)(char *, char *, blasint *, blasint *, double *, double *, blasint *, double*, double *, blasint *, double*, blasint *); int BLASFUNC(sgemc)(char *, char *, blasint *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *, float *, float *, blasint *); int BLASFUNC(dgemc)(char *, char *, blasint *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *, double *, double *, blasint *); int BLASFUNC(qgemc)(char *, char *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); int BLASFUNC(cgemc)(char *, char *, blasint *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *, float *, float *, blasint *); int BLASFUNC(zgemc)(char *, char *, blasint *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *, double *, double *, blasint *); int BLASFUNC(xgemc)(char *, char *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); /* Lapack routines */ int BLASFUNC(sgetf2)(blasint *, blasint *, float *, blasint *, blasint *, blasint *); int BLASFUNC(dgetf2)(blasint *, blasint *, double *, blasint *, blasint *, blasint *); int BLASFUNC(qgetf2)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *); int BLASFUNC(cgetf2)(blasint *, blasint *, float *, blasint *, blasint *, blasint *); int BLASFUNC(zgetf2)(blasint *, blasint *, double *, blasint *, blasint *, blasint *); int BLASFUNC(xgetf2)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *); int BLASFUNC(sgetrf)(blasint *, blasint *, float *, blasint *, blasint *, blasint *); int BLASFUNC(dgetrf)(blasint *, blasint *, double *, blasint *, blasint *, blasint *); int BLASFUNC(qgetrf)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *); int BLASFUNC(cgetrf)(blasint *, blasint *, float *, blasint *, blasint *, blasint *); int BLASFUNC(zgetrf)(blasint *, blasint *, double *, blasint *, blasint *, blasint *); int BLASFUNC(xgetrf)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *); int BLASFUNC(slaswp)(blasint *, 
float *, blasint *, blasint *, blasint *, blasint *, blasint *); int BLASFUNC(dlaswp)(blasint *, double *, blasint *, blasint *, blasint *, blasint *, blasint *); int BLASFUNC(qlaswp)(blasint *, xdouble *, blasint *, blasint *, blasint *, blasint *, blasint *); int BLASFUNC(claswp)(blasint *, float *, blasint *, blasint *, blasint *, blasint *, blasint *); int BLASFUNC(zlaswp)(blasint *, double *, blasint *, blasint *, blasint *, blasint *, blasint *); int BLASFUNC(xlaswp)(blasint *, xdouble *, blasint *, blasint *, blasint *, blasint *, blasint *); int BLASFUNC(sgetrs)(char *, blasint *, blasint *, float *, blasint *, blasint *, float *, blasint *, blasint *); int BLASFUNC(dgetrs)(char *, blasint *, blasint *, double *, blasint *, blasint *, double *, blasint *, blasint *); int BLASFUNC(qgetrs)(char *, blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble *, blasint *, blasint *); int BLASFUNC(cgetrs)(char *, blasint *, blasint *, float *, blasint *, blasint *, float *, blasint *, blasint *); int BLASFUNC(zgetrs)(char *, blasint *, blasint *, double *, blasint *, blasint *, double *, blasint *, blasint *); int BLASFUNC(xgetrs)(char *, blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble *, blasint *, blasint *); int BLASFUNC(sgesv)(blasint *, blasint *, float *, blasint *, blasint *, float *, blasint *, blasint *); int BLASFUNC(dgesv)(blasint *, blasint *, double *, blasint *, blasint *, double*, blasint *, blasint *); int BLASFUNC(qgesv)(blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble*, blasint *, blasint *); int BLASFUNC(cgesv)(blasint *, blasint *, float *, blasint *, blasint *, float *, blasint *, blasint *); int BLASFUNC(zgesv)(blasint *, blasint *, double *, blasint *, blasint *, double*, blasint *, blasint *); int BLASFUNC(xgesv)(blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble*, blasint *, blasint *); int BLASFUNC(spotf2)(char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(dpotf2)(char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(qpotf2)(char *, blasint *, xdouble *, blasint *, blasint *); int BLASFUNC(cpotf2)(char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(zpotf2)(char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(xpotf2)(char *, blasint *, xdouble *, blasint *, blasint *); int BLASFUNC(spotrf)(char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(dpotrf)(char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(qpotrf)(char *, blasint *, xdouble *, blasint *, blasint *); int BLASFUNC(cpotrf)(char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *); int BLASFUNC(spotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *); int BLASFUNC(dpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *); int BLASFUNC(qpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); int BLASFUNC(cpotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *); int BLASFUNC(zpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *); int BLASFUNC(xpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); int BLASFUNC(slauu2)(char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(dlauu2)(char *, blasint *, double *, blasint *, blasint *); int 
BLASFUNC(qlauu2)(char *, blasint *, xdouble *, blasint *, blasint *); int BLASFUNC(clauu2)(char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(zlauu2)(char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(xlauu2)(char *, blasint *, xdouble *, blasint *, blasint *); int BLASFUNC(slauum)(char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(dlauum)(char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(qlauum)(char *, blasint *, xdouble *, blasint *, blasint *); int BLASFUNC(clauum)(char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(zlauum)(char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(xlauum)(char *, blasint *, xdouble *, blasint *, blasint *); int BLASFUNC(strti2)(char *, char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(dtrti2)(char *, char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(qtrti2)(char *, char *, blasint *, xdouble *, blasint *, blasint *); int BLASFUNC(ctrti2)(char *, char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(ztrti2)(char *, char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(xtrti2)(char *, char *, blasint *, xdouble *, blasint *, blasint *); int BLASFUNC(strtri)(char *, char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(dtrtri)(char *, char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(qtrtri)(char *, char *, blasint *, xdouble *, blasint *, blasint *); int BLASFUNC(ctrtri)(char *, char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(ztrtri)(char *, char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(xtrtri)(char *, char *, blasint *, xdouble *, blasint *, blasint *); FLOATRET BLASFUNC(slamch)(char *); double BLASFUNC(dlamch)(char *); xdouble BLASFUNC(qlamch)(char *); FLOATRET BLASFUNC(slamc3)(float *, float *); double BLASFUNC(dlamc3)(double *, double *); xdouble BLASFUNC(qlamc3)(xdouble *, xdouble *); /* BLAS extensions */ void BLASFUNC(saxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(daxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(caxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(zaxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *); void BLASFUNC(comatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC(zomatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *); void BLASFUNC(simatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, blasint *); void BLASFUNC(dimatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, blasint *); void BLASFUNC(cimatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, blasint *); void BLASFUNC(zimatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, blasint *); void BLASFUNC(sgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*); void BLASFUNC(dgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*); void BLASFUNC(cgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*); void 
BLASFUNC(zgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*); #ifdef __cplusplus } #endif /* __cplusplus */ #endif OpenBLAS-0.2.20/common_lapack.h000066400000000000000000000575501313527062700161340ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #ifndef ASSEMBLER /* Lapack Library */ blasint sgetf2_k(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint dgetf2_k(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint qgetf2_k(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint cgetf2_k(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint zgetf2_k(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint xgetf2_k(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint sgetrf_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint dgetrf_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint qgetrf_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint cgetrf_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint zgetrf_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint xgetrf_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint sgetrf_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint dgetrf_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint qgetrf_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint cgetrf_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint zgetrf_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint xgetrf_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int slaswp_plus (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, blasint *, BLASLONG); int slaswp_minus(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, blasint *, BLASLONG); int dlaswp_plus (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, blasint *, BLASLONG); int dlaswp_minus(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, blasint *, BLASLONG); int qlaswp_plus (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, blasint *, BLASLONG); int qlaswp_minus(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, blasint *, BLASLONG); int claswp_plus (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, blasint *, BLASLONG); int claswp_minus(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, blasint *, BLASLONG); int zlaswp_plus (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, blasint *, BLASLONG); int zlaswp_minus(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, blasint *, BLASLONG); int xlaswp_plus (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, blasint *, BLASLONG); int xlaswp_minus(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, blasint *, BLASLONG); int slaswp_ncopy(BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); int dlaswp_ncopy(BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); int qlaswp_ncopy(BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); int claswp_ncopy(BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); int 
zlaswp_ncopy(BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); int xlaswp_ncopy(BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); blasint sgetrs_N_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint sgetrs_T_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint dgetrs_N_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint dgetrs_T_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint qgetrs_N_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint qgetrs_T_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint cgetrs_N_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint cgetrs_T_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint cgetrs_R_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint cgetrs_C_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint zgetrs_N_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint zgetrs_T_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint zgetrs_R_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint zgetrs_C_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint xgetrs_N_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xgetrs_T_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xgetrs_R_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xgetrs_C_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint sgetrs_N_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint sgetrs_T_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint dgetrs_N_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint dgetrs_T_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint qgetrs_N_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint qgetrs_T_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint cgetrs_N_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint cgetrs_T_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint cgetrs_R_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint cgetrs_C_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint zgetrs_N_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint zgetrs_T_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint zgetrs_R_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint zgetrs_C_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint xgetrs_N_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xgetrs_T_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xgetrs_R_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xgetrs_C_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, 
BLASLONG); blasint spotf2_U(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint spotf2_L(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint dpotf2_U(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint dpotf2_L(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint qpotf2_U(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint qpotf2_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint cpotf2_U(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint cpotf2_L(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint zpotf2_U(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint zpotf2_L(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint xpotf2_U(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xpotf2_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint spotrf_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint spotrf_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint dpotrf_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint dpotrf_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint qpotrf_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint qpotrf_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint cpotrf_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint cpotrf_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint zpotrf_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint zpotrf_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint xpotrf_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xpotrf_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint spotrf_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint spotrf_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint dpotrf_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint dpotrf_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint qpotrf_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint qpotrf_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint cpotrf_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint cpotrf_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint zpotrf_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint zpotrf_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint xpotrf_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xpotrf_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint slauu2_U(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint slauu2_L(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint dlauu2_U(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); 
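/* Editorial note (not part of the original header): judging by their names, the
 * *_single / *_parallel kernels declared here are the internal single-threaded
 * and multi-threaded back ends behind the Fortran-style entry points declared
 * in the interface header earlier in this archive (e.g. BLASFUNC(dpotrf)); they
 * are not meant to be called directly.  A minimal usage sketch of the public
 * interface follows, guarded out with #if 0 so the header itself is unchanged.
 * It assumes the usual BLASFUNC name mangling (trailing underscore, dpotrf_)
 * and a 32-bit blasint (no INTERFACE64); both depend on the build
 * configuration, so treat this as an illustration only. */
#if 0
#include <stdio.h>

/* Mirrors the BLASFUNC(dpotrf) prototype above, with blasint assumed == int. */
extern int dpotrf_(char *uplo, int *n, double *a, int *lda, int *info);

int main(void)
{
    /* Column-major 2x2 symmetric positive definite matrix [[4,2],[2,3]]. */
    double a[4] = { 4.0, 2.0, 2.0, 3.0 };
    int n = 2, lda = 2, info = -1;
    char uplo = 'L';                      /* factor the lower triangle       */

    dpotrf_(&uplo, &n, a, &lda, &info);   /* overwrites a with L, A = L*L^T  */

    /* Expect info == 0 and a[0] == 2.0 (since 4 = 2*2). */
    printf("info = %d, L(1,1) = %g\n", info, a[0]);
    return info;
}
#endif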
blasint dlauu2_L(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint qlauu2_U(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint qlauu2_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint clauu2_U(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint clauu2_L(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint zlauu2_U(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint zlauu2_L(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint xlauu2_U(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xlauu2_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint slauum_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint slauum_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint dlauum_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint dlauum_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint qlauum_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint qlauum_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint clauum_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint clauum_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint zlauum_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint zlauum_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint xlauum_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xlauum_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint slauum_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint slauum_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint dlauum_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint dlauum_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint qlauum_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint qlauum_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint clauum_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint clauum_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint zlauum_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint zlauum_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint xlauum_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xlauum_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint strti2_UU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint strti2_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint strti2_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint strti2_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint dtrti2_UU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint dtrti2_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); 
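/* Editorial note (not part of the original header): the suffixes in this block
 * follow a consistent pattern that mirrors the LAPACK character arguments --
 * the leading letter is the precision (s/d/q/c/z/x), _U/_L select the UPLO
 * triangle, the two-letter _UU/_UN/_LU/_LN variants of trti2/trtri combine
 * UPLO with DIAG (unit vs. non-unit diagonal), and _single/_parallel appear to
 * be the single-threaded and multi-threaded drivers.  For example,
 * dtrtri_UN_parallel would correspond to DTRTRI with UPLO='U', DIAG='N' run on
 * multiple threads.  This reading is inferred from the names and from the
 * Fortran-style prototypes in the interface header earlier in this archive; it
 * is not stated in this header itself. */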
blasint dtrti2_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint dtrti2_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint qtrti2_UU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint qtrti2_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint qtrti2_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint qtrti2_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint ctrti2_UU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint ctrti2_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint ctrti2_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint ctrti2_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint ztrti2_UU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint ztrti2_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint ztrti2_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint ztrti2_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint xtrti2_UU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xtrti2_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xtrti2_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xtrti2_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint strtri_UU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint strtri_UN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint strtri_LU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint strtri_LN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint dtrtri_UU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint dtrtri_UN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint dtrtri_LU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint dtrtri_LN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint qtrtri_UU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint qtrtri_UN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint qtrtri_LU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint qtrtri_LN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint ctrtri_UU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint ctrtri_UN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint ctrtri_LU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint ctrtri_LN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint ztrtri_UU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint ztrtri_UN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint ztrtri_LU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint ztrtri_LN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint xtrtri_UU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint 
xtrtri_UN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xtrtri_LU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xtrtri_LN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint strtri_UU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint strtri_UN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint strtri_LU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint strtri_LN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint dtrtri_UU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint dtrtri_UN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint dtrtri_LU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint dtrtri_LN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint qtrtri_UU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint qtrtri_UN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint qtrtri_LU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint qtrtri_LN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint ctrtri_UU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint ctrtri_UN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint ctrtri_LU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint ctrtri_LN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint ztrtri_UU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint ztrtri_UN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint ztrtri_LU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint ztrtri_LN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint xtrtri_UU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xtrtri_UN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xtrtri_LU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xtrtri_LN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int sneg_tcopy(BLASLONG, BLASLONG, float *, BLASLONG, float *); int dneg_tcopy(BLASLONG, BLASLONG, double *, BLASLONG, double *); int qneg_tcopy(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int cneg_tcopy(BLASLONG, BLASLONG, float *, BLASLONG, float *); int zneg_tcopy(BLASLONG, BLASLONG, double *, BLASLONG, double *); int xneg_tcopy(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); blasint slarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint slarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint dlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint dlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint qlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint qlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint clarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, float *, 
float *, BLASLONG); blasint clarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); blasint zlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint zlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); blasint xlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); #endif OpenBLAS-0.2.20/common_level1.h000066400000000000000000000265631313527062700160710ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #ifndef ASSEMBLER #ifdef __CUDACC__ extern "C" { #endif float sdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); double dsdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); double ddot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); xdouble qdot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); openblas_complex_float cdotc_k (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float cdotu_k (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_double zdotc_k (BLASLONG, double *, BLASLONG, double *, BLASLONG); openblas_complex_double zdotu_k (BLASLONG, double *, BLASLONG, double *, BLASLONG); openblas_complex_xdouble xdotc_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); openblas_complex_xdouble xdotu_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int saxpy_k (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int daxpy_k (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int qaxpy_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int caxpy_k (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int zaxpy_k (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int xaxpy_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int caxpyc_k (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int zaxpyc_k (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int xaxpyc_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int scopy_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); int dcopy_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); int qcopy_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int ccopy_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); int zcopy_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); int xcopy_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int sswap_k (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int dswap_k (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double*, BLASLONG); int qswap_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble*, BLASLONG); int cswap_k (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int zswap_k (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double*, BLASLONG); int xswap_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble*, BLASLONG); float sasum_k (BLASLONG, float *, BLASLONG); double dasum_k (BLASLONG, double *, BLASLONG); xdouble qasum_k (BLASLONG, xdouble *, BLASLONG); float casum_k (BLASLONG, float *, BLASLONG); double zasum_k (BLASLONG, double *, BLASLONG); xdouble xasum_k (BLASLONG, xdouble *, BLASLONG); float samax_k (BLASLONG, float *, BLASLONG); double damax_k (BLASLONG, double *, BLASLONG); xdouble qamax_k (BLASLONG, xdouble *, BLASLONG); float camax_k (BLASLONG, float *, BLASLONG); double zamax_k (BLASLONG, double *, BLASLONG); xdouble xamax_k 
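/* Editorial annotation, not part of the upstream header: common_level1.h
   declares the optimized Level-1 compute kernels.  The _k suffix marks a
   low-level kernel chosen per CPU at build time (or at run time with
   DYNAMIC_ARCH); the interface layer wraps these behind the standard BLAS and
   CBLAS entry points.  Precision prefixes follow the usual scheme: s/d/q =
   real single/double/extended (xdouble), c/z/x = the complex counterparts.

   The block below is an illustrative sketch of how an application reaches
   these kernels through the public CBLAS interface rather than by calling
   them directly; it assumes an installed OpenBLAS (link with -lopenblas) and
   is kept under #if 0 so it is never compiled as part of this header. */
#if 0
#include <stdio.h>
#include <cblas.h>

int main(void) {
  double x[3] = {1.0, 2.0, 3.0};
  double y[3] = {4.0, 5.0, 6.0};
  /* cblas_ddot(n, x, incx, y, incy); the interface layer ultimately
     dispatches to the ddot_k kernel declared in this file. */
  double d = cblas_ddot(3, x, 1, y, 1);
  printf("dot = %f\n", d);   /* prints 32.000000 */
  return 0;
}
#endif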
(BLASLONG, xdouble *, BLASLONG); float samin_k (BLASLONG, float *, BLASLONG); double damin_k (BLASLONG, double *, BLASLONG); xdouble qamin_k (BLASLONG, xdouble *, BLASLONG); float camin_k (BLASLONG, float *, BLASLONG); double zamin_k (BLASLONG, double *, BLASLONG); xdouble xamin_k (BLASLONG, xdouble *, BLASLONG); BLASLONG isamax_k(BLASLONG, float *, BLASLONG); BLASLONG idamax_k(BLASLONG, double *, BLASLONG); BLASLONG iqamax_k(BLASLONG, xdouble *, BLASLONG); BLASLONG icamax_k(BLASLONG, float *, BLASLONG); BLASLONG izamax_k(BLASLONG, double *, BLASLONG); BLASLONG ixamax_k(BLASLONG, xdouble *, BLASLONG); BLASLONG isamin_k(BLASLONG, float *, BLASLONG); BLASLONG idamin_k(BLASLONG, double *, BLASLONG); BLASLONG iqamin_k(BLASLONG, xdouble *, BLASLONG); BLASLONG icamin_k(BLASLONG, float *, BLASLONG); BLASLONG izamin_k(BLASLONG, double *, BLASLONG); BLASLONG ixamin_k(BLASLONG, xdouble *, BLASLONG); float smax_k (BLASLONG, float *, BLASLONG); double dmax_k (BLASLONG, double *, BLASLONG); xdouble qmax_k (BLASLONG, xdouble *, BLASLONG); float cmax_k (BLASLONG, float *, BLASLONG); double zmax_k (BLASLONG, double *, BLASLONG); xdouble xmax_k (BLASLONG, xdouble *, BLASLONG); float smin_k (BLASLONG, float *, BLASLONG); double dmin_k (BLASLONG, double *, BLASLONG); xdouble qmin_k (BLASLONG, xdouble *, BLASLONG); float cmin_k (BLASLONG, float *, BLASLONG); double zmin_k (BLASLONG, double *, BLASLONG); xdouble xmin_k (BLASLONG, xdouble *, BLASLONG); BLASLONG ismax_k(BLASLONG, float *, BLASLONG); BLASLONG idmax_k(BLASLONG, double *, BLASLONG); BLASLONG iqmax_k(BLASLONG, xdouble *, BLASLONG); BLASLONG icmax_k(BLASLONG, float *, BLASLONG); BLASLONG izmax_k(BLASLONG, double *, BLASLONG); BLASLONG ixmax_k(BLASLONG, xdouble *, BLASLONG); BLASLONG ismin_k(BLASLONG, float *, BLASLONG); BLASLONG idmin_k(BLASLONG, double *, BLASLONG); BLASLONG iqmin_k(BLASLONG, xdouble *, BLASLONG); BLASLONG icmin_k(BLASLONG, float *, BLASLONG); BLASLONG izmin_k(BLASLONG, double *, BLASLONG); BLASLONG ixmin_k(BLASLONG, xdouble *, BLASLONG); int sscal_k(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int dscal_k(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int qscal_k(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int cscal_k(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int zscal_k(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int xscal_k(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int csscal_k(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int zdscal_k(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int xqscal_k(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); float snrm2_k(BLASLONG, float *, BLASLONG); double dnrm2_k(BLASLONG, double *, BLASLONG); xdouble qnrm2_k(BLASLONG, xdouble *, BLASLONG); float cnrm2_k(BLASLONG, float *, BLASLONG); double znrm2_k(BLASLONG, double *, BLASLONG); xdouble xnrm2_k(BLASLONG, xdouble *, BLASLONG); int srot_k (BLASLONG, float *, BLASLONG, float *, BLASLONG, float , float ); int drot_k (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); int qrot_k (BLASLONG, xdouble *, 
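/* Editorial annotation, not part of the upstream header: the reduction kernels
   around this point follow one pattern: asum sums absolute values, amax/amin
   and max/min return the extreme absolute or signed value, nrm2 returns the
   Euclidean norm, and the i-prefixed variants (isamax_k and friends) return
   the position of that element rather than its value. */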
BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); int csrot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG, float , float ); int zdrot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); int xqrot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); int srotg_k(float *, float *, float *, float *); int drotg_k(double *, double *, double *, double *); int qrotg_k(xdouble *, xdouble *, xdouble *, xdouble *); int csrotg_k(float *, float *, float *, float *); int zdrotg_k(double *, double *, double *, double *); int xqrotg_k(xdouble *, xdouble *, xdouble *, xdouble *); int srotmg_k(float *, float *, float *, float *, float *); int drotmg_k(double *, double *, double *, double *, double *); int qrotmg_k(xdouble *, xdouble *, xdouble *, xdouble *, xdouble *); int srotm_k (BLASLONG, float, BLASLONG, float, BLASLONG, float); int drotm_k (BLASLONG, double, BLASLONG, double, BLASLONG, double); int qrotm_k (BLASLONG, xdouble, BLASLONG, xdouble, BLASLONG, xdouble); int saxpby_k (BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); int daxpby_k (BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); int caxpby_k (BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); int zaxpby_k (BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); #ifdef __CUDACC__ } #endif #endif OpenBLAS-0.2.20/common_level2.h000066400000000000000000003240541313527062700160660ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #ifndef ASSEMBLER /* Level 2 Blas routines */ #ifdef __CUDACC__ extern "C" { #endif int sger_k (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int dger_k (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int qger_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int cgeru_k(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int cgerc_k(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int cgerv_k(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int cgerd_k(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int zgeru_k(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int zgerc_k(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int zgerv_k(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int zgerd_k(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int xgeru_k(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xgerc_k(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xgerv_k(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xgerd_k(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int sger_thread (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int dger_thread (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int qger_thread (BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int cger_thread_U(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int cger_thread_C(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int cger_thread_V(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int cger_thread_D(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int zger_thread_U(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int zger_thread_C(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int zger_thread_V(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int zger_thread_D(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int xger_thread_U(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int 
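/* Editorial annotation, not part of the upstream header: common_level2.h
   declares the Level-2 kernels.  The *ger_k entries are the rank-1 update
   kernels (A := alpha*x*y' + A); for the complex precisions, geru is the
   unconjugated form and gerc conjugates y, while the remaining single-letter
   variants appear to cover the extra conjugation combinations used internally.
   The *ger_thread_* entries are the drivers that split the update across
   threads, with the final int argument selecting the number of threads. */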
xger_thread_C(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xger_thread_V(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xger_thread_D(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int sgemv_n(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); int sgemv_t(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); int dgemv_n(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); int dgemv_t(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); int qgemv_n(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); int qgemv_t(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); int cgemv_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); int cgemv_t(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); int cgemv_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); int cgemv_c(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); int cgemv_o(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); int cgemv_u(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); int cgemv_s(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); int cgemv_d(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); int zgemv_n(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); int zgemv_t(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); int zgemv_r(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); int zgemv_c(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); int zgemv_o(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); int zgemv_u(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); int zgemv_s(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); int zgemv_d(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); int xgemv_n(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); int xgemv_t(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); int 
xgemv_r(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); int xgemv_c(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); int xgemv_o(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); int xgemv_u(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); int xgemv_s(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); int xgemv_d(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); int sgemv_thread_n(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int sgemv_thread_t(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int dgemv_thread_n(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); int dgemv_thread_t(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); int qgemv_thread_n(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); int qgemv_thread_t(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); int cgemv_thread_n(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int cgemv_thread_t(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int cgemv_thread_r(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int cgemv_thread_c(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int cgemv_thread_o(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int cgemv_thread_u(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int cgemv_thread_s(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int cgemv_thread_d(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int zgemv_thread_n(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); int zgemv_thread_t(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); int zgemv_thread_r(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); int zgemv_thread_c(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); int zgemv_thread_o(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); int zgemv_thread_u(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); int zgemv_thread_s(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); int 
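/* Editorial annotation, not part of the upstream header: the *gemv_* entries
   here are the matrix-vector multiply kernels.  The real precisions provide
   _n (non-transposed) and _t (transposed) forms; the complex precisions add
   further single-letter forms (_r, _c, _o, _u, _s, _d) which appear to cover
   the extra conjugation combinations the complex interfaces need.  The
   trailing "buffer" argument is scratch workspace, and the *gemv_thread_*
   drivers additionally take the thread count as the final int argument.

   The block below is an illustrative sketch of the public entry point these
   kernels serve; it assumes an installed OpenBLAS (link with -lopenblas) and
   is kept under #if 0 so it is never compiled as part of this header. */
#if 0
#include <stdio.h>
#include <cblas.h>

int main(void) {
  /* Column-major 2x2 matrix A = [1 3; 2 4] and vector x = [1, 1]. */
  double a[4] = {1.0, 2.0, 3.0, 4.0};
  double x[2] = {1.0, 1.0};
  double y[2] = {0.0, 0.0};
  /* y := 1.0*A*x + 0.0*y; with CblasNoTrans this is served, roughly
     speaking, by dgemv_n or its threaded driver declared in this file. */
  cblas_dgemv(CblasColMajor, CblasNoTrans, 2, 2, 1.0, a, 2, x, 1, 0.0, y, 1);
  printf("y = [%f, %f]\n", y[0], y[1]);   /* prints [4.000000, 6.000000] */
  return 0;
}
#endif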
zgemv_thread_d(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); int xgemv_thread_n(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); int xgemv_thread_t(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); int xgemv_thread_r(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); int xgemv_thread_c(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); int xgemv_thread_o(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); int xgemv_thread_u(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); int xgemv_thread_s(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); int xgemv_thread_d(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); int strsv_NUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int strsv_NUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int strsv_NLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int strsv_NLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int strsv_TUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int strsv_TUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int strsv_TLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int strsv_TLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int dtrsv_NUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dtrsv_NUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dtrsv_NLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dtrsv_NLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dtrsv_TUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dtrsv_TUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dtrsv_TLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dtrsv_TLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int qtrsv_NUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qtrsv_NUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qtrsv_NLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qtrsv_NLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qtrsv_TUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qtrsv_TUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qtrsv_TLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qtrsv_TLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int ctrsv_NUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctrsv_NUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctrsv_NLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctrsv_NLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctrsv_TUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctrsv_TUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctrsv_TLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctrsv_TLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int 
ctrsv_RUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctrsv_RUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctrsv_RLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctrsv_RLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctrsv_CUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctrsv_CUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctrsv_CLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctrsv_CLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ztrsv_NUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztrsv_NUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztrsv_NLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztrsv_NLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztrsv_TUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztrsv_TUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztrsv_TLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztrsv_TLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztrsv_RUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztrsv_RUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztrsv_RLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztrsv_RLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztrsv_CUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztrsv_CUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztrsv_CLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztrsv_CLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int xtrsv_NUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtrsv_NUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtrsv_NLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtrsv_NLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtrsv_TUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtrsv_TUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtrsv_TLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtrsv_TLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtrsv_RUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtrsv_RUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtrsv_RLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtrsv_RLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtrsv_CUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtrsv_CUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtrsv_CLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtrsv_CLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int strmv_NUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int strmv_NUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int strmv_NLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int strmv_NLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int strmv_TUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int strmv_TUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int strmv_TLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int strmv_TLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int dtrmv_NUU(BLASLONG, double *, BLASLONG, 
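/* Editorial annotation, not part of the upstream header: the *trsv_* and
   *trmv_* kernels (triangular solve and triangular multiply) encode their
   variant in a three-letter suffix: the first letter is the transpose mode
   (N = none, T = transpose; the complex precisions add R and C, which appear
   to be the conjugated and conjugate-transposed forms), the second letter is
   the referenced triangle (U = upper, L = lower), and the third is the
   diagonal (U = unit, N = non-unit). */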
double *, BLASLONG, double *); int dtrmv_NUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int dtrmv_NLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int dtrmv_NLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int dtrmv_TUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int dtrmv_TUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int dtrmv_TLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int dtrmv_TLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int qtrmv_NUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int qtrmv_NUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int qtrmv_NLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int qtrmv_NLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int qtrmv_TUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int qtrmv_TUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int qtrmv_TLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int qtrmv_TLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int ctrmv_NUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int ctrmv_NUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int ctrmv_NLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int ctrmv_NLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int ctrmv_TUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int ctrmv_TUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int ctrmv_TLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int ctrmv_TLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int ctrmv_RUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int ctrmv_RUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int ctrmv_RLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int ctrmv_RLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int ctrmv_CUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int ctrmv_CUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int ctrmv_CLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int ctrmv_CLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int ztrmv_NUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int ztrmv_NUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int ztrmv_NLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int ztrmv_NLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int ztrmv_TUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int ztrmv_TUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int ztrmv_TLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int ztrmv_TLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int ztrmv_RUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int ztrmv_RUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int ztrmv_RLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int ztrmv_RLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int ztrmv_CUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int ztrmv_CUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int ztrmv_CLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int ztrmv_CLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int 
xtrmv_NUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xtrmv_NUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xtrmv_NLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xtrmv_NLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xtrmv_TUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xtrmv_TUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xtrmv_TLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xtrmv_TLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xtrmv_RUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xtrmv_RUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xtrmv_RLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xtrmv_RLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xtrmv_CUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xtrmv_CUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xtrmv_CLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xtrmv_CLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int strmv_thread_NUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int strmv_thread_NUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int strmv_thread_NLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int strmv_thread_NLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int strmv_thread_TUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int strmv_thread_TUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int strmv_thread_TLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int strmv_thread_TLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int dtrmv_thread_NUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int dtrmv_thread_NUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int dtrmv_thread_NLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int dtrmv_thread_NLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int dtrmv_thread_TUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int dtrmv_thread_TUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int dtrmv_thread_TLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int dtrmv_thread_TLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int qtrmv_thread_NUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int qtrmv_thread_NUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int qtrmv_thread_NLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int qtrmv_thread_NLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int qtrmv_thread_TUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int qtrmv_thread_TUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int qtrmv_thread_TLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int qtrmv_thread_TLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int ctrmv_thread_NUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctrmv_thread_NUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctrmv_thread_NLU(BLASLONG, float *, BLASLONG, 
float *, BLASLONG, float *, int); int ctrmv_thread_NLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctrmv_thread_TUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctrmv_thread_TUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctrmv_thread_TLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctrmv_thread_TLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctrmv_thread_RUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctrmv_thread_RUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctrmv_thread_RLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctrmv_thread_RLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctrmv_thread_CUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctrmv_thread_CUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctrmv_thread_CLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctrmv_thread_CLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ztrmv_thread_NUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztrmv_thread_NUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztrmv_thread_NLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztrmv_thread_NLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztrmv_thread_TUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztrmv_thread_TUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztrmv_thread_TLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztrmv_thread_TLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztrmv_thread_RUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztrmv_thread_RUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztrmv_thread_RLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztrmv_thread_RLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztrmv_thread_CUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztrmv_thread_CUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztrmv_thread_CLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztrmv_thread_CLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int xtrmv_thread_NUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtrmv_thread_NUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtrmv_thread_NLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtrmv_thread_NLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtrmv_thread_TUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtrmv_thread_TUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtrmv_thread_TLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtrmv_thread_TLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtrmv_thread_RUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtrmv_thread_RUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtrmv_thread_RLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtrmv_thread_RLN(BLASLONG, xdouble *, 
BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtrmv_thread_CUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtrmv_thread_CUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtrmv_thread_CLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtrmv_thread_CLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int stpsv_NUU(BLASLONG, float *, float *, BLASLONG, void *); int stpsv_NUN(BLASLONG, float *, float *, BLASLONG, void *); int stpsv_NLU(BLASLONG, float *, float *, BLASLONG, void *); int stpsv_NLN(BLASLONG, float *, float *, BLASLONG, void *); int stpsv_TUU(BLASLONG, float *, float *, BLASLONG, void *); int stpsv_TUN(BLASLONG, float *, float *, BLASLONG, void *); int stpsv_TLU(BLASLONG, float *, float *, BLASLONG, void *); int stpsv_TLN(BLASLONG, float *, float *, BLASLONG, void *); int dtpsv_NUU(BLASLONG, double *, double *, BLASLONG, void *); int dtpsv_NUN(BLASLONG, double *, double *, BLASLONG, void *); int dtpsv_NLU(BLASLONG, double *, double *, BLASLONG, void *); int dtpsv_NLN(BLASLONG, double *, double *, BLASLONG, void *); int dtpsv_TUU(BLASLONG, double *, double *, BLASLONG, void *); int dtpsv_TUN(BLASLONG, double *, double *, BLASLONG, void *); int dtpsv_TLU(BLASLONG, double *, double *, BLASLONG, void *); int dtpsv_TLN(BLASLONG, double *, double *, BLASLONG, void *); int qtpsv_NUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int qtpsv_NUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int qtpsv_NLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int qtpsv_NLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int qtpsv_TUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int qtpsv_TUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int qtpsv_TLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int qtpsv_TLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int ctpsv_NUU(BLASLONG, float *, float *, BLASLONG, void *); int ctpsv_NUN(BLASLONG, float *, float *, BLASLONG, void *); int ctpsv_NLU(BLASLONG, float *, float *, BLASLONG, void *); int ctpsv_NLN(BLASLONG, float *, float *, BLASLONG, void *); int ctpsv_TUU(BLASLONG, float *, float *, BLASLONG, void *); int ctpsv_TUN(BLASLONG, float *, float *, BLASLONG, void *); int ctpsv_TLU(BLASLONG, float *, float *, BLASLONG, void *); int ctpsv_TLN(BLASLONG, float *, float *, BLASLONG, void *); int ctpsv_RUU(BLASLONG, float *, float *, BLASLONG, void *); int ctpsv_RUN(BLASLONG, float *, float *, BLASLONG, void *); int ctpsv_RLU(BLASLONG, float *, float *, BLASLONG, void *); int ctpsv_RLN(BLASLONG, float *, float *, BLASLONG, void *); int ctpsv_CUU(BLASLONG, float *, float *, BLASLONG, void *); int ctpsv_CUN(BLASLONG, float *, float *, BLASLONG, void *); int ctpsv_CLU(BLASLONG, float *, float *, BLASLONG, void *); int ctpsv_CLN(BLASLONG, float *, float *, BLASLONG, void *); int ztpsv_NUU(BLASLONG, double *, double *, BLASLONG, void *); int ztpsv_NUN(BLASLONG, double *, double *, BLASLONG, void *); int ztpsv_NLU(BLASLONG, double *, double *, BLASLONG, void *); int ztpsv_NLN(BLASLONG, double *, double *, BLASLONG, void *); int ztpsv_TUU(BLASLONG, double *, double *, BLASLONG, void *); int ztpsv_TUN(BLASLONG, double *, double *, BLASLONG, void *); int ztpsv_TLU(BLASLONG, double *, double *, BLASLONG, void *); int ztpsv_TLN(BLASLONG, double *, double *, BLASLONG, void *); int ztpsv_RUU(BLASLONG, double *, double *, BLASLONG, void *); int ztpsv_RUN(BLASLONG, double *, double *, BLASLONG, void *); int ztpsv_RLU(BLASLONG, 
double *, double *, BLASLONG, void *); int ztpsv_RLN(BLASLONG, double *, double *, BLASLONG, void *); int ztpsv_CUU(BLASLONG, double *, double *, BLASLONG, void *); int ztpsv_CUN(BLASLONG, double *, double *, BLASLONG, void *); int ztpsv_CLU(BLASLONG, double *, double *, BLASLONG, void *); int ztpsv_CLN(BLASLONG, double *, double *, BLASLONG, void *); int xtpsv_NUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpsv_NUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpsv_NLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpsv_NLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpsv_TUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpsv_TUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpsv_TLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpsv_TLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpsv_RUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpsv_RUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpsv_RLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpsv_RLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpsv_CUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpsv_CUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpsv_CLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpsv_CLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int stpmv_NUU(BLASLONG, float *, float *, BLASLONG, void *); int stpmv_NUN(BLASLONG, float *, float *, BLASLONG, void *); int stpmv_NLU(BLASLONG, float *, float *, BLASLONG, void *); int stpmv_NLN(BLASLONG, float *, float *, BLASLONG, void *); int stpmv_TUU(BLASLONG, float *, float *, BLASLONG, void *); int stpmv_TUN(BLASLONG, float *, float *, BLASLONG, void *); int stpmv_TLU(BLASLONG, float *, float *, BLASLONG, void *); int stpmv_TLN(BLASLONG, float *, float *, BLASLONG, void *); int dtpmv_NUU(BLASLONG, double *, double *, BLASLONG, void *); int dtpmv_NUN(BLASLONG, double *, double *, BLASLONG, void *); int dtpmv_NLU(BLASLONG, double *, double *, BLASLONG, void *); int dtpmv_NLN(BLASLONG, double *, double *, BLASLONG, void *); int dtpmv_TUU(BLASLONG, double *, double *, BLASLONG, void *); int dtpmv_TUN(BLASLONG, double *, double *, BLASLONG, void *); int dtpmv_TLU(BLASLONG, double *, double *, BLASLONG, void *); int dtpmv_TLN(BLASLONG, double *, double *, BLASLONG, void *); int qtpmv_NUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int qtpmv_NUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int qtpmv_NLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int qtpmv_NLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int qtpmv_TUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int qtpmv_TUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int qtpmv_TLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int qtpmv_TLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int ctpmv_NUU(BLASLONG, float *, float *, BLASLONG, void *); int ctpmv_NUN(BLASLONG, float *, float *, BLASLONG, void *); int ctpmv_NLU(BLASLONG, float *, float *, BLASLONG, void *); int ctpmv_NLN(BLASLONG, float *, float *, BLASLONG, void *); int ctpmv_TUU(BLASLONG, float *, float *, BLASLONG, void *); int ctpmv_TUN(BLASLONG, float *, float *, BLASLONG, void *); int ctpmv_TLU(BLASLONG, float *, float *, BLASLONG, void *); int ctpmv_TLN(BLASLONG, float *, float *, BLASLONG, void *); int ctpmv_RUU(BLASLONG, float *, float *, BLASLONG, void *); int ctpmv_RUN(BLASLONG, float *, float *, BLASLONG, void *); int ctpmv_RLU(BLASLONG, float *, float 
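/* Editorial annotation, not part of the upstream header: the *tpsv_* and
   *tpmv_* kernels are the packed-storage counterparts of trsv/trmv; the
   triangular matrix is passed as a packed array with no leading dimension,
   and the same three-letter transpose/uplo/diag suffix scheme applies. */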
*, BLASLONG, void *); int ctpmv_RLN(BLASLONG, float *, float *, BLASLONG, void *); int ctpmv_CUU(BLASLONG, float *, float *, BLASLONG, void *); int ctpmv_CUN(BLASLONG, float *, float *, BLASLONG, void *); int ctpmv_CLU(BLASLONG, float *, float *, BLASLONG, void *); int ctpmv_CLN(BLASLONG, float *, float *, BLASLONG, void *); int ztpmv_NUU(BLASLONG, double *, double *, BLASLONG, void *); int ztpmv_NUN(BLASLONG, double *, double *, BLASLONG, void *); int ztpmv_NLU(BLASLONG, double *, double *, BLASLONG, void *); int ztpmv_NLN(BLASLONG, double *, double *, BLASLONG, void *); int ztpmv_TUU(BLASLONG, double *, double *, BLASLONG, void *); int ztpmv_TUN(BLASLONG, double *, double *, BLASLONG, void *); int ztpmv_TLU(BLASLONG, double *, double *, BLASLONG, void *); int ztpmv_TLN(BLASLONG, double *, double *, BLASLONG, void *); int ztpmv_RUU(BLASLONG, double *, double *, BLASLONG, void *); int ztpmv_RUN(BLASLONG, double *, double *, BLASLONG, void *); int ztpmv_RLU(BLASLONG, double *, double *, BLASLONG, void *); int ztpmv_RLN(BLASLONG, double *, double *, BLASLONG, void *); int ztpmv_CUU(BLASLONG, double *, double *, BLASLONG, void *); int ztpmv_CUN(BLASLONG, double *, double *, BLASLONG, void *); int ztpmv_CLU(BLASLONG, double *, double *, BLASLONG, void *); int ztpmv_CLN(BLASLONG, double *, double *, BLASLONG, void *); int xtpmv_NUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpmv_NUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpmv_NLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpmv_NLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpmv_TUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpmv_TUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpmv_TLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpmv_TLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpmv_RUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpmv_RUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpmv_RLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpmv_RLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpmv_CUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpmv_CUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpmv_CLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int xtpmv_CLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); int stpmv_thread_NUU(BLASLONG, float *, float *, BLASLONG, float *, int); int stpmv_thread_NUN(BLASLONG, float *, float *, BLASLONG, float *, int); int stpmv_thread_NLU(BLASLONG, float *, float *, BLASLONG, float *, int); int stpmv_thread_NLN(BLASLONG, float *, float *, BLASLONG, float *, int); int stpmv_thread_TUU(BLASLONG, float *, float *, BLASLONG, float *, int); int stpmv_thread_TUN(BLASLONG, float *, float *, BLASLONG, float *, int); int stpmv_thread_TLU(BLASLONG, float *, float *, BLASLONG, float *, int); int stpmv_thread_TLN(BLASLONG, float *, float *, BLASLONG, float *, int); int dtpmv_thread_NUU(BLASLONG, double *, double *, BLASLONG, double *, int); int dtpmv_thread_NUN(BLASLONG, double *, double *, BLASLONG, double *, int); int dtpmv_thread_NLU(BLASLONG, double *, double *, BLASLONG, double *, int); int dtpmv_thread_NLN(BLASLONG, double *, double *, BLASLONG, double *, int); int dtpmv_thread_TUU(BLASLONG, double *, double *, BLASLONG, double *, int); int dtpmv_thread_TUN(BLASLONG, double *, double *, BLASLONG, double *, int); int dtpmv_thread_TLU(BLASLONG, double *, double *, BLASLONG, double *, int); int dtpmv_thread_TLN(BLASLONG, double *, double 
*, BLASLONG, double *, int); int qtpmv_thread_NUU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int qtpmv_thread_NUN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int qtpmv_thread_NLU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int qtpmv_thread_NLN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int qtpmv_thread_TUU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int qtpmv_thread_TUN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int qtpmv_thread_TLU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int qtpmv_thread_TLN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int ctpmv_thread_NUU(BLASLONG, float *, float *, BLASLONG, float *, int); int ctpmv_thread_NUN(BLASLONG, float *, float *, BLASLONG, float *, int); int ctpmv_thread_NLU(BLASLONG, float *, float *, BLASLONG, float *, int); int ctpmv_thread_NLN(BLASLONG, float *, float *, BLASLONG, float *, int); int ctpmv_thread_TUU(BLASLONG, float *, float *, BLASLONG, float *, int); int ctpmv_thread_TUN(BLASLONG, float *, float *, BLASLONG, float *, int); int ctpmv_thread_TLU(BLASLONG, float *, float *, BLASLONG, float *, int); int ctpmv_thread_TLN(BLASLONG, float *, float *, BLASLONG, float *, int); int ctpmv_thread_RUU(BLASLONG, float *, float *, BLASLONG, float *, int); int ctpmv_thread_RUN(BLASLONG, float *, float *, BLASLONG, float *, int); int ctpmv_thread_RLU(BLASLONG, float *, float *, BLASLONG, float *, int); int ctpmv_thread_RLN(BLASLONG, float *, float *, BLASLONG, float *, int); int ctpmv_thread_CUU(BLASLONG, float *, float *, BLASLONG, float *, int); int ctpmv_thread_CUN(BLASLONG, float *, float *, BLASLONG, float *, int); int ctpmv_thread_CLU(BLASLONG, float *, float *, BLASLONG, float *, int); int ctpmv_thread_CLN(BLASLONG, float *, float *, BLASLONG, float *, int); int ztpmv_thread_NUU(BLASLONG, double *, double *, BLASLONG, double *, int); int ztpmv_thread_NUN(BLASLONG, double *, double *, BLASLONG, double *, int); int ztpmv_thread_NLU(BLASLONG, double *, double *, BLASLONG, double *, int); int ztpmv_thread_NLN(BLASLONG, double *, double *, BLASLONG, double *, int); int ztpmv_thread_TUU(BLASLONG, double *, double *, BLASLONG, double *, int); int ztpmv_thread_TUN(BLASLONG, double *, double *, BLASLONG, double *, int); int ztpmv_thread_TLU(BLASLONG, double *, double *, BLASLONG, double *, int); int ztpmv_thread_TLN(BLASLONG, double *, double *, BLASLONG, double *, int); int ztpmv_thread_RUU(BLASLONG, double *, double *, BLASLONG, double *, int); int ztpmv_thread_RUN(BLASLONG, double *, double *, BLASLONG, double *, int); int ztpmv_thread_RLU(BLASLONG, double *, double *, BLASLONG, double *, int); int ztpmv_thread_RLN(BLASLONG, double *, double *, BLASLONG, double *, int); int ztpmv_thread_CUU(BLASLONG, double *, double *, BLASLONG, double *, int); int ztpmv_thread_CUN(BLASLONG, double *, double *, BLASLONG, double *, int); int ztpmv_thread_CLU(BLASLONG, double *, double *, BLASLONG, double *, int); int ztpmv_thread_CLN(BLASLONG, double *, double *, BLASLONG, double *, int); int xtpmv_thread_NUU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int xtpmv_thread_NUN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int xtpmv_thread_NLU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int xtpmv_thread_NLN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int xtpmv_thread_TUU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int xtpmv_thread_TUN(BLASLONG, xdouble *, xdouble *, BLASLONG, 
xdouble *, int); int xtpmv_thread_TLU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int xtpmv_thread_TLN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int xtpmv_thread_RUU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int xtpmv_thread_RUN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int xtpmv_thread_RLU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int xtpmv_thread_RLN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int xtpmv_thread_CUU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int xtpmv_thread_CUN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int xtpmv_thread_CLU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int xtpmv_thread_CLN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); int ssymv_L(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int ssymv_U(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int dsymv_L(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int dsymv_U(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int qsymv_L(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int qsymv_U(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int csymv_L(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int csymv_U(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int zsymv_L(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int zsymv_U(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int xsymv_L(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xsymv_U(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int ssymv_thread_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ssymv_thread_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int dsymv_thread_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int dsymv_thread_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int qsymv_thread_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int qsymv_thread_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int csymv_thread_L(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int csymv_thread_U(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int zsymv_thread_L(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int zsymv_thread_U(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int xsymv_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xsymv_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, 
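/* Editorial annotation, not part of the upstream header: the *symv_*, *hemv_*
   and *spmv_* kernels around this point are the symmetric, Hermitian and
   packed symmetric matrix-vector products; _L/_U selects which triangle of
   the matrix is referenced, the extra letters on the hemv thread drivers
   appear to cover further internal variants, and the *_thread_* drivers take
   the thread count as the final int argument. */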
xdouble *, int); int chemv_thread_L(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int chemv_thread_U(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int chemv_thread_M(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int chemv_thread_V(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int zhemv_thread_L(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int zhemv_thread_U(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int zhemv_thread_M(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int zhemv_thread_V(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int xhemv_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xhemv_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xhemv_thread_M(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xhemv_thread_V(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int sspmv_L(BLASLONG, float, float *, float *, BLASLONG, float *, BLASLONG, void *); int sspmv_U(BLASLONG, float, float *, float *, BLASLONG, float *, BLASLONG, void *); int dspmv_L(BLASLONG, double, double *, double *, BLASLONG, double *, BLASLONG, void *); int dspmv_U(BLASLONG, double, double *, double *, BLASLONG, double *, BLASLONG, void *); int qspmv_L(BLASLONG, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qspmv_U(BLASLONG, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int cspmv_L(BLASLONG, float, float, float *, float *, BLASLONG, float *, BLASLONG, void *); int cspmv_U(BLASLONG, float, float, float *, float *, BLASLONG, float *, BLASLONG, void *); int zspmv_L(BLASLONG, double, double, double *, double *, BLASLONG, double *, BLASLONG, void *); int zspmv_U(BLASLONG, double, double, double *, double *, BLASLONG, double *, BLASLONG, void *); int xspmv_L(BLASLONG, xdouble, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xspmv_U(BLASLONG, xdouble, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int sspmv_thread_L(BLASLONG, float, float *, float *, BLASLONG, float *, BLASLONG, float *, int); int sspmv_thread_U(BLASLONG, float, float *, float *, BLASLONG, float *, BLASLONG, float *, int); int dspmv_thread_L(BLASLONG, double, double *, double *, BLASLONG, double *, BLASLONG, double *, int); int dspmv_thread_U(BLASLONG, double, double *, double *, BLASLONG, double *, BLASLONG, double *, int); int qspmv_thread_L(BLASLONG, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int qspmv_thread_U(BLASLONG, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int cspmv_thread_L(BLASLONG, float *, float *, float *, BLASLONG, float *, BLASLONG, float *, int); int cspmv_thread_U(BLASLONG, float *, float *, float *, BLASLONG, float *, BLASLONG, float *, int); int zspmv_thread_L(BLASLONG, double *, double *, double *, BLASLONG, double *, BLASLONG, double *, int); int zspmv_thread_U(BLASLONG, double *, double *, double *, BLASLONG, double *, BLASLONG, double *, int); int 
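/* Editorial annotation (not part of the original OpenBLAS header): the
 * prototypes in this file follow a naming scheme that can be read off the
 * declarations themselves.  The leading letter selects the data type
 * (s = float, d = double, c = single complex, z = double complex,
 * q/x = xdouble real/complex), a trailing _L or _U picks the lower or
 * upper triangle, and the *_thread_* forms are the multithreaded drivers,
 * which take one extra trailing int (presumably the number of threads).
 * Illustrative call only, with argument meanings inferred from the
 * dspmv_U prototype above rather than documented here:
 *   dspmv_U(n, alpha, ap, x, incx, y, incy, buffer);
 */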
xspmv_thread_L(BLASLONG, xdouble *, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xspmv_thread_U(BLASLONG, xdouble *, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int ssyr_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *); int ssyr_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *); int dsyr_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *); int dsyr_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *); int qsyr_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int qsyr_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int csyr_L(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *); int csyr_U(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *); int zsyr_L(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *); int zsyr_U(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *); int xsyr_L(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xsyr_U(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int ssyr_thread_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, int); int ssyr_thread_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, int); int dsyr_thread_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, int); int dsyr_thread_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, int); int qsyr_thread_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int qsyr_thread_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int csyr_thread_L(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, int); int csyr_thread_U(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, int); int zsyr_thread_L(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, int); int zsyr_thread_U(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, int); int xsyr_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xsyr_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int ssyr2_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int ssyr2_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int dsyr2_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int dsyr2_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int qsyr2_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int qsyr2_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int csyr2_L(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int csyr2_U(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int zsyr2_L(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int zsyr2_U(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int xsyr2_L(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xsyr2_U(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble 
*, BLASLONG, xdouble *, BLASLONG, xdouble *); int ssyr2_thread_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ssyr2_thread_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int dsyr2_thread_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int dsyr2_thread_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int qsyr2_thread_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int qsyr2_thread_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int csyr2_thread_L(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int csyr2_thread_U(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int zsyr2_thread_L(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int zsyr2_thread_U(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int xsyr2_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xsyr2_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int sspr_L(BLASLONG, float, float *, BLASLONG, float *, float *); int sspr_U(BLASLONG, float, float *, BLASLONG, float *, float *); int dspr_L(BLASLONG, double, double *, BLASLONG, double *, double *); int dspr_U(BLASLONG, double, double *, BLASLONG, double *, double *); int qspr_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); int qspr_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); int cspr_L(BLASLONG, float, float, float *, BLASLONG, float *, float *); int cspr_U(BLASLONG, float, float, float *, BLASLONG, float *, float *); int zspr_L(BLASLONG, double, double, double *, BLASLONG, double *, double *); int zspr_U(BLASLONG, double, double, double *, BLASLONG, double *, double *); int xspr_L(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); int xspr_U(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); int sspr_thread_L(BLASLONG, float, float *, BLASLONG, float *, float *, int); int sspr_thread_U(BLASLONG, float, float *, BLASLONG, float *, float *, int); int dspr_thread_L(BLASLONG, double, double *, BLASLONG, double *, double *, int); int dspr_thread_U(BLASLONG, double, double *, BLASLONG, double *, double *, int); int qspr_thread_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *, int); int qspr_thread_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *, int); int cspr_thread_L(BLASLONG, float *, float *, BLASLONG, float *, float *, int); int cspr_thread_U(BLASLONG, float *, float *, BLASLONG, float *, float *, int); int zspr_thread_L(BLASLONG, double *, double *, BLASLONG, double *, double *, int); int zspr_thread_U(BLASLONG, double *, double *, BLASLONG, double *, double *, int); int xspr_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, xdouble *, int); int xspr_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, xdouble *, int); int sspr2_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, float *); int sspr2_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, float *); int dspr2_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, 
double *); int dspr2_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, double *); int qspr2_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); int qspr2_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); int cspr2_L(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, float *); int cspr2_U(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, float *); int zspr2_L(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, double *); int zspr2_U(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, double *); int xspr2_L(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); int xspr2_U(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); int sspr2_thread_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, float *, int); int sspr2_thread_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, float *, int); int dspr2_thread_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, double *, int); int dspr2_thread_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, double *, int); int qspr2_thread_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); int qspr2_thread_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); int cspr2_thread_L(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, float *, int); int cspr2_thread_U(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, float *, int); int zspr2_thread_L(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, double *, int); int zspr2_thread_U(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, double *, int); int xspr2_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); int xspr2_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); int cher_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *); int cher_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *); int cher_V(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *); int cher_M(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *); int zher_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *); int zher_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *); int zher_V(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *); int zher_M(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *); int xher_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xher_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xher_V(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xher_M(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int cher_thread_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, int); int cher_thread_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, int); int cher_thread_V(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, int); int cher_thread_M(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, int); int zher_thread_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, int); int 
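/* Editorial annotation: the Hermitian rank-1 update (her) kernels around
 * this point carry four suffixes, L/U/V/M.  L and U are clearly the
 * lower/upper-triangle variants; V and M appear to be their conjugated
 * counterparts, but that reading is inferred from the naming pattern and
 * is not stated anywhere in this header.
 */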
zher_thread_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, int); int zher_thread_V(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, int); int zher_thread_M(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, int); int xher_thread_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xher_thread_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xher_thread_V(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xher_thread_M(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int cher2_L(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int cher2_U(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int cher2_M(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int cher2_V(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int zher2_L(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int zher2_U(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int zher2_M(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int zher2_V(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int xher2_L(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xher2_U(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xher2_M(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xher2_V(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int cher2_thread_L(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int cher2_thread_U(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int cher2_thread_M(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int cher2_thread_V(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int zher2_thread_L(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int zher2_thread_U(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int zher2_thread_M(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int zher2_thread_V(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int xher2_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xher2_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xher2_thread_M(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xher2_thread_V(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int chpr_L(BLASLONG, float, float *, BLASLONG, float *, float *); int chpr_U(BLASLONG, float, float *, BLASLONG, float *, float *); int chpr_M(BLASLONG, float, float *, BLASLONG, float *, float *); int 
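/* Editorial annotation: the hpr routines are the packed-storage analogues
 * of the her kernels above.  Compared with cher_L and friends they take one
 * BLASLONG argument fewer, presumably because packed storage has no leading
 * dimension; the L/U/V/M suffix pattern matches the note given for her.
 */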
chpr_V(BLASLONG, float, float *, BLASLONG, float *, float *); int zhpr_L(BLASLONG, double, double *, BLASLONG, double *, double *); int zhpr_U(BLASLONG, double, double *, BLASLONG, double *, double *); int zhpr_M(BLASLONG, double, double *, BLASLONG, double *, double *); int zhpr_V(BLASLONG, double, double *, BLASLONG, double *, double *); int xhpr_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); int xhpr_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); int xhpr_M(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); int xhpr_V(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); int chpr_thread_L(BLASLONG, float, float *, BLASLONG, float *, float *, int); int chpr_thread_U(BLASLONG, float, float *, BLASLONG, float *, float *, int); int chpr_thread_M(BLASLONG, float, float *, BLASLONG, float *, float *, int); int chpr_thread_V(BLASLONG, float, float *, BLASLONG, float *, float *, int); int zhpr_thread_L(BLASLONG, double, double *, BLASLONG, double *, double *, int); int zhpr_thread_U(BLASLONG, double, double *, BLASLONG, double *, double *, int); int zhpr_thread_M(BLASLONG, double, double *, BLASLONG, double *, double *, int); int zhpr_thread_V(BLASLONG, double, double *, BLASLONG, double *, double *, int); int xhpr_thread_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *, int); int xhpr_thread_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *, int); int xhpr_thread_M(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *, int); int xhpr_thread_V(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *, int); int chpr2_L(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, float *); int chpr2_U(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, float *); int chpr2_M(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, float *); int chpr2_V(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, float *); int zhpr2_L(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, double *); int zhpr2_U(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, double *); int zhpr2_M(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, double *); int zhpr2_V(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, double *); int xhpr2_L(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); int xhpr2_U(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); int xhpr2_M(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); int xhpr2_V(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); int chpr2_thread_L(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, float *, int); int chpr2_thread_U(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, float *, int); int chpr2_thread_M(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, float *, int); int chpr2_thread_V(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, float *, int); int zhpr2_thread_L(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, double *, int); int zhpr2_thread_U(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, double *, int); int zhpr2_thread_M(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, double *, int); int zhpr2_thread_V(BLASLONG, 
double *, double *, BLASLONG, double *, BLASLONG, double *, double *, int); int xhpr2_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); int xhpr2_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); int xhpr2_thread_M(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); int xhpr2_thread_V(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); int chemv_L(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int chemv_U(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int chemv_M(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int chemv_V(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int zhemv_L(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int zhemv_U(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int zhemv_M(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int zhemv_V(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int xhemv_L(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xhemv_U(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xhemv_M(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int xhemv_V(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int chpmv_L(BLASLONG, float, float, float *, float *, BLASLONG, float *, BLASLONG, void *); int chpmv_U(BLASLONG, float, float, float *, float *, BLASLONG, float *, BLASLONG, void *); int chpmv_M(BLASLONG, float, float, float *, float *, BLASLONG, float *, BLASLONG, void *); int chpmv_V(BLASLONG, float, float, float *, float *, BLASLONG, float *, BLASLONG, void *); int zhpmv_L(BLASLONG, double, double, double *, double *, BLASLONG, double *, BLASLONG, void *); int zhpmv_U(BLASLONG, double, double, double *, double *, BLASLONG, double *, BLASLONG, void *); int zhpmv_M(BLASLONG, double, double, double *, double *, BLASLONG, double *, BLASLONG, void *); int zhpmv_V(BLASLONG, double, double, double *, double *, BLASLONG, double *, BLASLONG, void *); int xhpmv_L(BLASLONG, xdouble, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xhpmv_U(BLASLONG, xdouble, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xhpmv_M(BLASLONG, xdouble, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xhpmv_V(BLASLONG, xdouble, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int chpmv_thread_L(BLASLONG, float *, float *, float *, BLASLONG, float *, BLASLONG, float *, int); int chpmv_thread_U(BLASLONG, float *, float *, float *, BLASLONG, float *, BLASLONG, float *, int); int chpmv_thread_M(BLASLONG, float *, float *, float *, BLASLONG, float *, BLASLONG, float *, int); int chpmv_thread_V(BLASLONG, float *, float *, float *, BLASLONG, float *, BLASLONG, float *, int); int zhpmv_thread_L(BLASLONG, double *, double 
*, double *, BLASLONG, double *, BLASLONG, double *, int); int zhpmv_thread_U(BLASLONG, double *, double *, double *, BLASLONG, double *, BLASLONG, double *, int); int zhpmv_thread_M(BLASLONG, double *, double *, double *, BLASLONG, double *, BLASLONG, double *, int); int zhpmv_thread_V(BLASLONG, double *, double *, double *, BLASLONG, double *, BLASLONG, double *, int); int xhpmv_thread_L(BLASLONG, xdouble *, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xhpmv_thread_U(BLASLONG, xdouble *, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xhpmv_thread_M(BLASLONG, xdouble *, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xhpmv_thread_V(BLASLONG, xdouble *, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int ssbmv_L(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ssbmv_U(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int dsbmv_L(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dsbmv_U(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int qsbmv_L(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qsbmv_U(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int csbmv_L(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int csbmv_U(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int zsbmv_L(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int zsbmv_U(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int xsbmv_L(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xsbmv_U(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int chbmv_L(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int chbmv_U(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int chbmv_M(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int chbmv_V(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int zhbmv_L(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int zhbmv_U(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int zhbmv_M(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int zhbmv_V(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int xhbmv_L(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xhbmv_U(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xhbmv_M(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xhbmv_V(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, 
BLASLONG, void *); int ssbmv_thread_L(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ssbmv_thread_U(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int dsbmv_thread_L(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int dsbmv_thread_U(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int qsbmv_thread_L(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int qsbmv_thread_U(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int csbmv_thread_L(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int csbmv_thread_U(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int zsbmv_thread_L(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int zsbmv_thread_U(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int xsbmv_thread_L(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xsbmv_thread_U(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int chbmv_thread_L(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int chbmv_thread_U(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int chbmv_thread_M(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int chbmv_thread_V(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int zhbmv_thread_L(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int zhbmv_thread_U(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int zhbmv_thread_M(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int zhbmv_thread_V(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int xhbmv_thread_L(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xhbmv_thread_U(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xhbmv_thread_M(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xhbmv_thread_V(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int snorm_n(BLASLONG, BLASLONG, float *a, BLASLONG); int snorm_t(BLASLONG, BLASLONG, float *a, BLASLONG); int dnorm_n(BLASLONG, BLASLONG, double *a, BLASLONG); int dnorm_t(BLASLONG, BLASLONG, double *a, BLASLONG); int cnorm_n(BLASLONG, BLASLONG, float *a, BLASLONG); int cnorm_t(BLASLONG, BLASLONG, float *a, BLASLONG); int znorm_n(BLASLONG, BLASLONG, double *a, BLASLONG); int znorm_t(BLASLONG, BLASLONG, double *a, BLASLONG); void sgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, 
void *buffer); void sgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); void dgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); void dgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); void qgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); void qgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); void cgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); void cgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); void cgbmv_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); void cgbmv_c(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); void cgbmv_o(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); void cgbmv_u(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); void cgbmv_s(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); void cgbmv_d(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); void zgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); void zgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); void zgbmv_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); void zgbmv_c(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); void zgbmv_o(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); void zgbmv_u(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); void zgbmv_s(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); void zgbmv_d(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); void xgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); void xgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); void xgbmv_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); void xgbmv_c(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void 
*buffer); void xgbmv_o(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); void xgbmv_u(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); void xgbmv_s(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); void xgbmv_d(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); int sgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int sgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int dgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); int dgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); int qgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); int qgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); int cgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int cgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int cgbmv_thread_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int cgbmv_thread_c(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int cgbmv_thread_o(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int cgbmv_thread_u(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int cgbmv_thread_s(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int cgbmv_thread_d(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int zgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); int zgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); int zgbmv_thread_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); int zgbmv_thread_c(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); int zgbmv_thread_o(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); int zgbmv_thread_u(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, 
double *buffer, int); int zgbmv_thread_s(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); int zgbmv_thread_d(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); int xgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); int xgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); int xgbmv_thread_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); int xgbmv_thread_c(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); int xgbmv_thread_o(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); int xgbmv_thread_u(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); int xgbmv_thread_s(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); int xgbmv_thread_d(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); int stbmv_NUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int stbmv_NUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int stbmv_NLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int stbmv_NLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int stbmv_TUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int stbmv_TUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int stbmv_TLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int stbmv_TLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int dtbmv_NUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dtbmv_NUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dtbmv_NLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dtbmv_NLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dtbmv_TUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dtbmv_TUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dtbmv_TLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dtbmv_TLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int qtbmv_NUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qtbmv_NUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qtbmv_NLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qtbmv_NLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qtbmv_TUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qtbmv_TUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qtbmv_TLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qtbmv_TLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, 
void *); int ctbmv_NUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbmv_NUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbmv_NLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbmv_NLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbmv_TUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbmv_TUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbmv_TLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbmv_TLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbmv_RUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbmv_RUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbmv_RLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbmv_RLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbmv_CUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbmv_CUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbmv_CLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbmv_CLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ztbmv_NUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbmv_NUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbmv_NLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbmv_NLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbmv_TUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbmv_TUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbmv_TLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbmv_TLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbmv_RUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbmv_RUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbmv_RLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbmv_RLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbmv_CUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbmv_CUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbmv_CLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbmv_CLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int xtbmv_NUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbmv_NUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbmv_NLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbmv_NLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbmv_TUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbmv_TUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbmv_TLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbmv_TLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbmv_RUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbmv_RUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbmv_RLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, 
BLASLONG, void *); int xtbmv_RLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbmv_CUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbmv_CUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbmv_CLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbmv_CLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int stbmv_thread_NUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int stbmv_thread_NUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int stbmv_thread_NLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int stbmv_thread_NLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int stbmv_thread_TUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int stbmv_thread_TUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int stbmv_thread_TLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int stbmv_thread_TLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int dtbmv_thread_NUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int dtbmv_thread_NUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int dtbmv_thread_NLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int dtbmv_thread_NLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int dtbmv_thread_TUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int dtbmv_thread_TUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int dtbmv_thread_TLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int dtbmv_thread_TLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int qtbmv_thread_NUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int qtbmv_thread_NUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int qtbmv_thread_NLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int qtbmv_thread_NLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int qtbmv_thread_TUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int qtbmv_thread_TUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int qtbmv_thread_TLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int qtbmv_thread_TLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int ctbmv_thread_NUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctbmv_thread_NUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctbmv_thread_NLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctbmv_thread_NLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctbmv_thread_TUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctbmv_thread_TUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctbmv_thread_TLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctbmv_thread_TLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctbmv_thread_RUU(BLASLONG, 
BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctbmv_thread_RUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctbmv_thread_RLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctbmv_thread_RLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctbmv_thread_CUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctbmv_thread_CUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctbmv_thread_CLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ctbmv_thread_CLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); int ztbmv_thread_NUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztbmv_thread_NUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztbmv_thread_NLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztbmv_thread_NLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztbmv_thread_TUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztbmv_thread_TUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztbmv_thread_TLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztbmv_thread_TLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztbmv_thread_RUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztbmv_thread_RUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztbmv_thread_RLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztbmv_thread_RLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztbmv_thread_CUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztbmv_thread_CUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztbmv_thread_CLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int ztbmv_thread_CLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); int xtbmv_thread_NUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtbmv_thread_NUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtbmv_thread_NLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtbmv_thread_NLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtbmv_thread_TUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtbmv_thread_TUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtbmv_thread_TLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtbmv_thread_TLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtbmv_thread_RUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtbmv_thread_RUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtbmv_thread_RLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtbmv_thread_RLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtbmv_thread_CUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, 
BLASLONG, xdouble *, int); int xtbmv_thread_CUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtbmv_thread_CLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int xtbmv_thread_CLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); int stbsv_NUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int stbsv_NUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int stbsv_NLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int stbsv_NLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int stbsv_TUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int stbsv_TUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int stbsv_TLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int stbsv_TLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int dtbsv_NUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dtbsv_NUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dtbsv_NLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dtbsv_NLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dtbsv_TUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dtbsv_TUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dtbsv_TLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int dtbsv_TLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int qtbsv_NUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qtbsv_NUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qtbsv_NLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qtbsv_NLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qtbsv_TUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qtbsv_TUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qtbsv_TLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int qtbsv_TLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int ctbsv_NUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbsv_NUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbsv_NLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbsv_NLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbsv_TUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbsv_TUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbsv_TLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbsv_TLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbsv_RUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbsv_RUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbsv_RLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbsv_RLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbsv_CUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbsv_CUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ctbsv_CLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int 
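/* Editorial annotation: the tbmv/tbsv (triangular band multiply/solve)
 * kernels use a three-letter suffix.  Judging from the real-valued cases,
 * the first letter is the transpose option (N/T, with R/C apparently the
 * conjugate forms for complex data), the second is the U/L triangle, and
 * the third is U/N for unit or non-unit diagonal.  This is an inference
 * from the names, not something documented in the header itself.
 */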
ctbsv_CLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); int ztbsv_NUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbsv_NUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbsv_NLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbsv_NLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbsv_TUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbsv_TUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbsv_TLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbsv_TLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbsv_RUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbsv_RUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbsv_RLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbsv_RLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbsv_CUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbsv_CUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbsv_CLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int ztbsv_CLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); int xtbsv_NUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbsv_NUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbsv_NLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbsv_NLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbsv_TUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbsv_TUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbsv_TLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbsv_TLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbsv_RUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbsv_RUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbsv_RLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbsv_RLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbsv_CUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbsv_CUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbsv_CLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); int xtbsv_CLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); #ifdef __CUDACC__ } #endif #endif OpenBLAS-0.2.20/common_level3.h000066400000000000000000004377241313527062700161000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #ifndef ASSEMBLER #ifdef __CUDACC__ __global__ void cuda_sgemm_kernel(int, int, int, float *, float *, float *); __global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *); #endif #ifdef __CUDACC__ extern "C" { #endif int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int cgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int zgemm_beta(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); #ifdef EXPRECISION int qgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); #else int qgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); #endif int sgemm_incopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int sgemm_itcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int sgemm_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int sgemm_otcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int dgemm_incopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); int dgemm_itcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); int dgemm_oncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); int dgemm_otcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); int cgemm_incopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int cgemm_itcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int cgemm_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int cgemm_otcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int 
zgemm_incopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); int zgemm_itcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); int zgemm_oncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); int zgemm_otcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); #ifdef QUAD_PRECISION int qgemm_incopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); int qgemm_itcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); int qgemm_oncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); int qgemm_otcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); int xgemm_incopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); int xgemm_itcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); int xgemm_oncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); int xgemm_otcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); #else int qgemm_incopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); int qgemm_itcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); int qgemm_oncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); int qgemm_otcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); int xgemm_incopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); int xgemm_itcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); int xgemm_oncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); int xgemm_otcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); #endif int strsm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int strsm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int strsm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int strsm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int dtrsm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int dtrsm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int dtrsm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int dtrsm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int qtrsm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int qtrsm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int qtrsm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int qtrsm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int ctrsm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int ctrsm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int ctrsm_kernel_LR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int ctrsm_kernel_LC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int ctrsm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int ctrsm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, 
float, float, float *, float *, float *, BLASLONG, BLASLONG); int ctrsm_kernel_RR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int ctrsm_kernel_RC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int ztrsm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int ztrsm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int ztrsm_kernel_LR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int ztrsm_kernel_LC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int ztrsm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int ztrsm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int ztrsm_kernel_RR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int ztrsm_kernel_RC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int xtrsm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int xtrsm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int xtrsm_kernel_LR(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int xtrsm_kernel_LC(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int xtrsm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int xtrsm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int xtrsm_kernel_RR(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int xtrsm_kernel_RC(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int strmm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int strmm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int strmm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int strmm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int dtrmm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int dtrmm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int dtrmm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int dtrmm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int qtrmm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int qtrmm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int qtrmm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int qtrmm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int ctrmm_kernel_RN(BLASLONG, BLASLONG, 
BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int ctrmm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int ctrmm_kernel_RR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int ctrmm_kernel_RC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int ctrmm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int ctrmm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int ctrmm_kernel_LR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int ctrmm_kernel_LC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int ztrmm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int ztrmm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int ztrmm_kernel_RR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int ztrmm_kernel_RC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int ztrmm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int ztrmm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int ztrmm_kernel_LR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int ztrmm_kernel_LC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int xtrmm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int xtrmm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int xtrmm_kernel_RR(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int xtrmm_kernel_RC(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int xtrmm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int xtrmm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int xtrmm_kernel_LR(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int xtrmm_kernel_LC(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int strmm_iunucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int strmm_iunncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int strmm_iutucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int strmm_iutncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int strmm_ounucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int strmm_ounncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int strmm_outucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int strmm_outncopy(BLASLONG m, 
BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int strmm_ilnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int strmm_ilnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int strmm_iltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int strmm_iltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int strmm_olnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int strmm_olnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int strmm_oltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int strmm_oltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int dtrmm_iunucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dtrmm_iunncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dtrmm_iutucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dtrmm_iutncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dtrmm_ounucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dtrmm_ounncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dtrmm_outucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dtrmm_outncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dtrmm_ilnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dtrmm_ilnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dtrmm_iltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dtrmm_iltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dtrmm_olnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dtrmm_olnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dtrmm_oltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dtrmm_oltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int qtrmm_iunucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int qtrmm_iunncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int qtrmm_iutucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int qtrmm_iutncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int qtrmm_ounucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int qtrmm_ounncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int qtrmm_outucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int qtrmm_outncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, 
BLASLONG posX, BLASLONG posY, xdouble *b); int qtrmm_ilnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int qtrmm_ilnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int qtrmm_iltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int qtrmm_iltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int qtrmm_olnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int qtrmm_olnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int qtrmm_oltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int qtrmm_oltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int ctrmm_iunucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ctrmm_iunncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ctrmm_iutucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ctrmm_iutncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ctrmm_ounucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ctrmm_ounncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ctrmm_outucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ctrmm_outncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ctrmm_ilnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ctrmm_ilnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ctrmm_iltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ctrmm_iltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ctrmm_olnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ctrmm_olnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ctrmm_oltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ctrmm_oltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ztrmm_iunucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int ztrmm_iunncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int ztrmm_iutucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int ztrmm_iutncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int ztrmm_ounucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int ztrmm_ounncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int ztrmm_outucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int ztrmm_outncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int 
ztrmm_ilnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int ztrmm_ilnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int ztrmm_iltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int ztrmm_iltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int ztrmm_olnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int ztrmm_olnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int ztrmm_oltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int ztrmm_oltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int xtrmm_iunucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int xtrmm_iunncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int xtrmm_iutucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int xtrmm_iutncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int xtrmm_ounucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int xtrmm_ounncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int xtrmm_outucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int xtrmm_outncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int xtrmm_ilnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int xtrmm_ilnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int xtrmm_iltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int xtrmm_iltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int xtrmm_olnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int xtrmm_olnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int xtrmm_oltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int xtrmm_oltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int strsm_iunucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int strsm_iunncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int strsm_iutucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int strsm_iutncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int strsm_ounucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int strsm_ounncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int strsm_outucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int strsm_outncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int strsm_ilnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int strsm_ilnncopy(BLASLONG m, 
BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int strsm_iltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int strsm_iltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int strsm_olnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int strsm_olnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int strsm_oltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int strsm_oltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int dtrsm_iunucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int dtrsm_iunncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int dtrsm_iutucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int dtrsm_iutncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int dtrsm_ounucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int dtrsm_ounncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int dtrsm_outucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int dtrsm_outncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int dtrsm_ilnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int dtrsm_ilnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int dtrsm_iltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int dtrsm_iltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int dtrsm_olnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int dtrsm_olnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int dtrsm_oltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int dtrsm_oltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int qtrsm_iunucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int qtrsm_iunncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int qtrsm_iutucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int qtrsm_iutncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int qtrsm_ounucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int qtrsm_ounncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int qtrsm_outucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int qtrsm_outncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int qtrsm_ilnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int qtrsm_ilnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int qtrsm_iltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int qtrsm_iltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int qtrsm_olnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int qtrsm_olnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG 
offset, xdouble *b); int qtrsm_oltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int qtrsm_oltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int ctrsm_iunucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int ctrsm_iunncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int ctrsm_iutucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int ctrsm_iutncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int ctrsm_ounucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int ctrsm_ounncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int ctrsm_outucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int ctrsm_outncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int ctrsm_ilnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int ctrsm_ilnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int ctrsm_iltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int ctrsm_iltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int ctrsm_olnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int ctrsm_olnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int ctrsm_oltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int ctrsm_oltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); int ztrsm_iunucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int ztrsm_iunncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int ztrsm_iutucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int ztrsm_iutncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int ztrsm_ounucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int ztrsm_ounncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int ztrsm_outucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int ztrsm_outncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int ztrsm_ilnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int ztrsm_ilnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int ztrsm_iltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int ztrsm_iltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int ztrsm_olnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int ztrsm_olnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int ztrsm_oltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int ztrsm_oltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); int xtrsm_iunucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int xtrsm_iunncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int xtrsm_iutucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG 
lda, BLASLONG offset, xdouble *b); int xtrsm_iutncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int xtrsm_ounucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int xtrsm_ounncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int xtrsm_outucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int xtrsm_outncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int xtrsm_ilnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int xtrsm_ilnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int xtrsm_iltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int xtrsm_iltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int xtrsm_olnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int xtrsm_olnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int xtrsm_oltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int xtrsm_oltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); int ssymm_iutcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ssymm_outcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ssymm_iltcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ssymm_oltcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int dsymm_iutcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dsymm_outcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dsymm_iltcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dsymm_oltcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int qsymm_iutcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int qsymm_outcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int qsymm_iltcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int qsymm_oltcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int csymm_iutcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int csymm_outcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int csymm_iltcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int csymm_oltcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int zsymm_iutcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int zsymm_outcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int zsymm_iltcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int zsymm_oltcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int xsymm_iutcopy(BLASLONG m, BLASLONG n, xdouble *a, 
BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int xsymm_outcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int xsymm_iltcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int xsymm_oltcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int chemm_iutcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int chemm_outcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int chemm_iltcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int chemm_oltcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int zhemm_iutcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int zhemm_outcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int zhemm_iltcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int zhemm_oltcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int xhemm_iutcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int xhemm_outcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int xhemm_iltcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int xhemm_oltcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); int ssyrk_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); int ssyrk_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); int dsyrk_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); int dsyrk_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); int qsyrk_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); int qsyrk_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); int csyrk_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); int csyrk_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); int zsyrk_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); int zsyrk_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); int xsyrk_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); int xsyrk_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); int ssyr2k_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); 
int ssyr2k_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); int dsyr2k_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); int dsyr2k_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); int qsyr2k_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); int qsyr2k_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); int csyr2k_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); int csyr2k_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); int zsyr2k_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); int zsyr2k_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); int xsyr2k_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); int xsyr2k_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); int cherk_kernel_UN(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); int cherk_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); int cherk_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); int cherk_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); int zherk_kernel_UN(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); int zherk_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); int zherk_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); int zherk_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); int xherk_kernel_UN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); int xherk_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); int xherk_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); int xherk_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); int cher2k_kernel_UN(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); int cher2k_kernel_UC(BLASLONG m, BLASLONG n, 
BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); int cher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); int cher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); int zher2k_kernel_UN(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); int zher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); int zher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); int zher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); int xher2k_kernel_UN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); int xher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); int xher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); int xher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); #ifdef QUAD_PRECISION int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xidouble *, xidouble *, xidouble *, xdouble *, BLASLONG); #else int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); #endif int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int cgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int cgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int cgemm_kernel_b(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int zgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int zgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int zgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int zgemm_kernel_b(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int xgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); int xgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); int xgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); int xgemm_kernel_b(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); int cgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, 
BLASLONG); int zgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int xgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); int sgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int dgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); #ifdef QUAD_PRECISION int qgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); int qgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); int qgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); int qgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); #else int qgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); #endif int cgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_nr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_nc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_tr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_tc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_rn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_rt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_rr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_rc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_cn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_ct(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int zgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_nr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_nc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_tr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_tc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_rn(blas_arg_t *, 
BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_rt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_rr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_rc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_cn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_ct(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); #ifdef QUAD_PRECISION int xgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); int xgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); int xgemm_nr(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); int xgemm_nc(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); int xgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); int xgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); int xgemm_tr(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); int xgemm_tc(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); int xgemm_rn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); int xgemm_rt(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); int xgemm_rr(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); int xgemm_rc(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); int xgemm_cn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); int xgemm_ct(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); int xgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); #else int xgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_nr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_nc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_tr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_tc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_rn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_rt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_rr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_rc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_cn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_ct(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); #endif int sgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, 
BLASLONG); int sgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int dgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); #ifdef QUAD_PRECISION int qgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); int qgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); int qgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); int qgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); #else int qgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); #endif int cgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_thread_nr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_thread_nc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_thread_tr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_thread_tc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_thread_rn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_thread_rt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_thread_rr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_thread_rc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_thread_cn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_thread_ct(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_thread_cr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm_thread_cc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int zgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_thread_nr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_thread_nc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_thread_tr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_thread_tc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_thread_rn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_thread_rt(blas_arg_t *, BLASLONG *, 
BLASLONG *, double *, double *, BLASLONG); int zgemm_thread_rr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_thread_rc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_thread_cn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_thread_ct(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_thread_cr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm_thread_cc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int xgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_thread_nr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_thread_nc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_thread_tr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_thread_tc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_thread_rn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_thread_rt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_thread_rr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_thread_rc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_thread_cn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_thread_ct(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_thread_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_thread_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int cgemm3m_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_nr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_nc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_tr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_tc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_rn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_rt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_rr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_rc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_cn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_ct(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_cr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_cc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int zgemm3m_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_nr(blas_arg_t *, BLASLONG *, 
BLASLONG *, double *, double *, BLASLONG); int zgemm3m_nc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_tr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_tc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_rn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_rt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_rr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_rc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_cn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_ct(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_cr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_cc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int xgemm3m_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_nr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_nc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_tr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_tc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_rn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_rt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_rr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_rc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_cn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_ct(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int cgemm3m_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_thread_nr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_thread_nc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_thread_tr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_thread_tc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_thread_rn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_thread_rt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_thread_rr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_thread_rc(blas_arg_t *, BLASLONG *, BLASLONG *, 
float *, float *, BLASLONG); int cgemm3m_thread_cn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_thread_ct(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_thread_cr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemm3m_thread_cc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int zgemm3m_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_thread_nr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_thread_nc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_thread_tr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_thread_tc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_thread_rn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_thread_rt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_thread_rr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_thread_rc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_thread_cn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_thread_ct(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_thread_cr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemm3m_thread_cc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int xgemm3m_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_thread_nr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_thread_nc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_thread_tr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_thread_tc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_thread_rn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_thread_rt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_thread_rr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_thread_rc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_thread_cn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_thread_ct(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_thread_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm3m_thread_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int cher2m_LNN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_LNT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, 
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_LNR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_LNC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_LTN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_LTT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_LTR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_LTC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_LRN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_LRT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_LRR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_LRC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_LCN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_LCT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_LCR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_LCC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_UNN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_UNT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_UNR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_UNC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_UTN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_UTT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_UTR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_UTC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_URN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_URT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, 
BLASLONG, BLASLONG); int cher2m_URR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_URC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_UCN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_UCT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_UCR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int cher2m_UCC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); int zher2m_LNN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_LNT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_LNR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_LNC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_LTN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_LTT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_LTR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_LTC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_LRN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_LRT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_LRR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_LRC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_LCN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_LCT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_LCR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_LCC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_UNN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_UNT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, 
BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_UNR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_UNC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_UTN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_UTT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_UTR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_UTC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_URN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_URT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_URR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_URC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_UCN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_UCT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_UCR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int zher2m_UCC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); int strsm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strsm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strsm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strsm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strsm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strsm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strsm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strsm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strsm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strsm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strsm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strsm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strsm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strsm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strsm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strsm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int dtrsm_LNUU(blas_arg_t *, 
BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrsm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrsm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrsm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrsm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrsm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrsm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrsm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrsm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrsm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrsm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrsm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrsm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrsm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrsm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrsm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int qtrsm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrsm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrsm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrsm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrsm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrsm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrsm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrsm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrsm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrsm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrsm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrsm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrsm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrsm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrsm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrsm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int ctrsm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_LRUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_LRUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_LRLU(blas_arg_t *, BLASLONG *, 
BLASLONG *, float *, float *, BLASLONG); int ctrsm_LRLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_LCUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_LCUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_LCLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_LCLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_RRUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_RRUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_RRLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_RRLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_RCUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_RCUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_RCLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrsm_RCLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ztrsm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_LRUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_LRUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_LRLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_LRLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_LCUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_LCUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_LCLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_LCLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_RTUN(blas_arg_t 
*, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_RRUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_RRUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_RRLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_RRLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_RCUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_RCUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_RCLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrsm_RCLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int xtrsm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_LRUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_LRUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_LRLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_LRLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_LCUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_LCUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_LCLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_LCLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_RRUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_RRUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_RRLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_RRLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_RCUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_RCUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrsm_RCLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, 
BLASLONG); int xtrsm_RCLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int strmm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strmm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strmm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strmm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strmm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strmm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strmm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strmm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strmm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strmm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strmm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strmm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strmm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strmm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strmm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int strmm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int dtrmm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrmm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrmm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrmm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrmm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrmm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrmm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrmm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrmm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrmm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrmm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrmm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrmm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrmm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrmm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dtrmm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int qtrmm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrmm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrmm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrmm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrmm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrmm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrmm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrmm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrmm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrmm_RNUN(blas_arg_t 
*, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrmm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrmm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrmm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrmm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrmm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qtrmm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int ctrmm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_LRUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_LRUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_LRLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_LRLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_LCUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_LCUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_LCLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_LCLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_RRUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_RRUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_RRLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_RRLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_RCUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_RCUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_RCLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ctrmm_RCLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ztrmm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int 
ztrmm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_LRUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_LRUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_LRLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_LRLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_LCUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_LCUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_LCLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_LCLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_RRUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_RRUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_RRLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_RRLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_RCUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_RCUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_RCLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int ztrmm_RCLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int xtrmm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_LRUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_LRUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_LRLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_LRLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_LCUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_LCUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); 
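/*
 * Editor's illustrative sketch (not part of the original header): the TRSM/TRMM
 * driver names above encode their options in a four-letter suffix: side (L/R),
 * op(A) (N = no transpose, T = transpose; in the complex variants R appears to
 * denote conjugation without transpose and C the conjugate transpose), uplo (U/L)
 * and diag (U = unit, N = non-unit), combined with the usual s/d/q/c/z/x precision
 * prefixes.  The table below shows how such a suffix could be decoded into one of
 * the sixteen real single-precision TRSM entry points declared above; the table
 * name, its index order and the 0/1 encoding of the flags are assumptions made
 * only for this example.
 */
typedef int (*strsm_driver_example_t)(blas_arg_t *, BLASLONG *, BLASLONG *,
                                      float *, float *, BLASLONG);

static strsm_driver_example_t strsm_driver_example[2][2][2][2] = {
  /* indexed as [side][trans][uplo][diag]; 0 = L/N/U/unit, 1 = R/T/L/non-unit */
  { { { strsm_LNUU, strsm_LNUN }, { strsm_LNLU, strsm_LNLN } },
    { { strsm_LTUU, strsm_LTUN }, { strsm_LTLU, strsm_LTLN } } },
  { { { strsm_RNUU, strsm_RNUN }, { strsm_RNLU, strsm_RNLN } },
    { { strsm_RTUU, strsm_RTUN }, { strsm_RTLU, strsm_RTLN } } },
};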
int xtrmm_LCLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_LCLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_RRUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_RRUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_RRLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_RRLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_RCUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_RCUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_RCLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xtrmm_RCLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int ssymm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ssymm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ssymm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ssymm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int dsymm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dsymm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dsymm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dsymm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int qsymm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qsymm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qsymm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qsymm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int csymm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int csymm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int csymm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int csymm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int zsymm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zsymm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zsymm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zsymm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int xsymm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xsymm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xsymm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xsymm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int csymm3m_LU(blas_arg_t *, BLASLONG 
*, BLASLONG *, float *, float *, BLASLONG); int csymm3m_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int csymm3m_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int csymm3m_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int zsymm3m_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zsymm3m_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zsymm3m_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zsymm3m_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int xsymm3m_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xsymm3m_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xsymm3m_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xsymm3m_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int csymm3m_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int csymm3m_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int csymm3m_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int csymm3m_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int zsymm3m_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zsymm3m_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zsymm3m_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zsymm3m_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int xsymm3m_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xsymm3m_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xsymm3m_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xsymm3m_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int chemm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int chemm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int chemm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int chemm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int zhemm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zhemm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zhemm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zhemm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int xhemm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xhemm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xhemm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xhemm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int chemm3m_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int chemm3m_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int chemm3m_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int chemm3m_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int zhemm3m_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zhemm3m_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int 
zhemm3m_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zhemm3m_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int xhemm3m_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xhemm3m_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xhemm3m_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xhemm3m_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int chemm3m_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int chemm3m_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int chemm3m_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int chemm3m_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int zhemm3m_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zhemm3m_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zhemm3m_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zhemm3m_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int xhemm3m_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xhemm3m_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xhemm3m_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xhemm3m_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int ssymm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ssymm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ssymm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ssymm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int dsymm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dsymm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dsymm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dsymm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int qsymm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qsymm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qsymm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qsymm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int csymm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int csymm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int csymm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int csymm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int zsymm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zsymm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zsymm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zsymm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int xsymm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xsymm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); 
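/*
 * Editor's illustrative sketch (not part of the original header): the *_thread_*
 * declarations above are the SMP counterparts of the serial level-3 drivers and
 * share the same argument list, so a caller holding a prepared blas_arg_t and the
 * packing buffers sa/sb can pick either entry point at the call site.  The wrapper
 * name, the explicit nthreads parameter and the <= 1 threshold below are
 * hypothetical and serve only to show that the two interfaces are interchangeable.
 */
static int ssymm_LU_dispatch_example(blas_arg_t *args, BLASLONG *range_m,
                                     BLASLONG *range_n, float *sa, float *sb,
                                     BLASLONG mypos, int nthreads) {
  if (nthreads <= 1)
    return ssymm_LU(args, range_m, range_n, sa, sb, mypos);        /* serial driver */
  return ssymm_thread_LU(args, range_m, range_n, sa, sb, mypos);   /* SMP driver */
}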
int xsymm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xsymm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int chemm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int chemm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int chemm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int chemm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int zhemm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zhemm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zhemm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zhemm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int xhemm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xhemm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xhemm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xhemm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int ssyrk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ssyrk_UT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ssyrk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ssyrk_LT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int dsyrk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dsyrk_UT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dsyrk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dsyrk_LT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int qsyrk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qsyrk_UT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qsyrk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qsyrk_LT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int csyrk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int csyrk_UT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int csyrk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int csyrk_LT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int zsyrk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zsyrk_UT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zsyrk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zsyrk_LT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int xsyrk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xsyrk_UT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xsyrk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xsyrk_LT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int ssyrk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ssyrk_thread_UT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ssyrk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ssyrk_thread_LT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, 
BLASLONG); int dsyrk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dsyrk_thread_UT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dsyrk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dsyrk_thread_LT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int qsyrk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qsyrk_thread_UT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qsyrk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qsyrk_thread_LT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int csyrk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int csyrk_thread_UT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int csyrk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int csyrk_thread_LT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int zsyrk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zsyrk_thread_UT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zsyrk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zsyrk_thread_LT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int xsyrk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xsyrk_thread_UT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xsyrk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xsyrk_thread_LT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int ssyr2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ssyr2k_UT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ssyr2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int ssyr2k_LT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int dsyr2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dsyr2k_UT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dsyr2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dsyr2k_LT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int qsyr2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qsyr2k_UT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qsyr2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qsyr2k_LT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int csyr2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int csyr2k_UT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int csyr2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int csyr2k_LT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int zsyr2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zsyr2k_UT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zsyr2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zsyr2k_LT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int xsyr2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int 
xsyr2k_UT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xsyr2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xsyr2k_LT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int cherk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cherk_UC(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cherk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cherk_LC(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int zherk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zherk_UC(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zherk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zherk_LC(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int xherk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xherk_UC(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xherk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xherk_LC(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int cherk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cherk_thread_UC(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cherk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cherk_thread_LC(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int zherk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zherk_thread_UC(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zherk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zherk_thread_LC(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int xherk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xherk_thread_UC(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xherk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xherk_thread_LC(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int cher2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cher2k_UC(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cher2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cher2k_LC(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int zher2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zher2k_UC(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zher2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zher2k_LC(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int xher2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xher2k_UC(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xher2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xher2k_LC(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int sgemt_n(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, int); int sgemt_t(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, int); int dgemt_n(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, int); int dgemt_t(BLASLONG, BLASLONG, 
double, double *, BLASLONG, double *, int); int cgemt_n(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, int); int cgemt_t(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, int); int cgemt_r(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, int); int cgemt_c(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, int); int zgemt_n(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, int); int zgemt_t(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, int); int zgemt_r(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, int); int zgemt_c(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, int); int sgema_n(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); int sgema_t(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); int dgema_n(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); int dgema_t(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); int cgema_n(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int cgema_t(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int cgema_r(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int cgema_c(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int zgema_n(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zgema_t(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zgema_r(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zgema_c(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int cgemm3m_incopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int cgemm3m_incopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int cgemm3m_incopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int cgemm3m_itcopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int cgemm3m_itcopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int cgemm3m_itcopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int cgemm3m_oncopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float alpha_r, float alpha_i, float *b); int cgemm3m_oncopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float alpha_r, float alpha_i, float *b); int cgemm3m_oncopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float alpha_r, float alpha_i, float *b); int cgemm3m_otcopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float alpha_r, float alpha_i, float *b); int cgemm3m_otcopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float alpha_r, float alpha_i, float *b); int cgemm3m_otcopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float alpha_r, float alpha_i, float *b); int zgemm3m_incopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); int zgemm3m_incopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); int zgemm3m_incopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); int zgemm3m_itcopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); int zgemm3m_itcopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); int zgemm3m_itcopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); int zgemm3m_oncopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double alpha_r, double alpha_i, double *b); int zgemm3m_oncopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double alpha_r, double alpha_i, double *b); 
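/*
 * Note on the ?gemm3m pack routines declared here: the "3m" kernels build a
 * complex matrix product from three real multiplications instead of four,
 * which is why each packing routine comes in three flavours.  Reading the
 * b/r/i suffixes as "both/real/imaginary part" panels is an assumption; the
 * arithmetic identity itself is standard.  A minimal standalone sketch of
 * that identity, for illustration only:
 */
#include <stdio.h>

/* (ar + i*ai) * (br + i*bi) with three real multiplications. */
static void cmul_3m(float ar, float ai, float br, float bi,
                    float *cr, float *ci) {
  float t1 = ar * br;               /* product of real parts      */
  float t2 = ai * bi;               /* product of imaginary parts */
  float t3 = (ar + ai) * (br + bi); /* product of the sums        */
  *cr = t1 - t2;                    /* real part                  */
  *ci = t3 - t1 - t2;               /* imaginary part             */
}

int main(void) {
  float cr, ci;
  cmul_3m(1.0f, 2.0f, 3.0f, 4.0f, &cr, &ci);
  printf("%g %+gi\n", cr, ci);      /* (1+2i)(3+4i) = -5 +10i */
  return 0;
}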
int zgemm3m_oncopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double alpha_r, double alpha_i, double *b); int zgemm3m_otcopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double alpha_r, double alpha_i, double *b); int zgemm3m_otcopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double alpha_r, double alpha_i, double *b); int zgemm3m_otcopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double alpha_r, double alpha_i, double *b); int xgemm3m_incopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); int xgemm3m_incopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); int xgemm3m_incopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); int xgemm3m_itcopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); int xgemm3m_itcopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); int xgemm3m_itcopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); int xgemm3m_oncopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble alpha_r, xdouble alpha_i, xdouble *b); int xgemm3m_oncopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble alpha_r, xdouble alpha_i, xdouble *b); int xgemm3m_oncopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble alpha_r, xdouble alpha_i, xdouble *b); int xgemm3m_otcopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble alpha_r, xdouble alpha_i, xdouble *b); int xgemm3m_otcopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble alpha_r, xdouble alpha_i, xdouble *b); int xgemm3m_otcopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble alpha_r, xdouble alpha_i, xdouble *b); int csymm3m_iucopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); int csymm3m_ilcopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); int csymm3m_iucopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); int csymm3m_ilcopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); int csymm3m_iucopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); int csymm3m_ilcopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); int csymm3m_oucopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); int csymm3m_olcopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); int csymm3m_oucopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); int csymm3m_olcopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); int csymm3m_oucopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); int csymm3m_olcopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); int zsymm3m_iucopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); int zsymm3m_ilcopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); int zsymm3m_iucopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); int zsymm3m_ilcopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); int zsymm3m_iucopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG 
lda, BLASLONG x, BLASLONG y, double *b); int zsymm3m_ilcopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); int zsymm3m_oucopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); int zsymm3m_olcopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); int zsymm3m_oucopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); int zsymm3m_olcopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); int zsymm3m_oucopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); int zsymm3m_olcopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); int xsymm3m_iucopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); int xsymm3m_ilcopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); int xsymm3m_iucopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); int xsymm3m_ilcopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); int xsymm3m_iucopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); int xsymm3m_ilcopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); int xsymm3m_oucopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); int xsymm3m_olcopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); int xsymm3m_oucopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); int xsymm3m_olcopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); int xsymm3m_oucopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); int xsymm3m_olcopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); int chemm3m_iucopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); int chemm3m_ilcopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); int chemm3m_iucopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); int chemm3m_ilcopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); int chemm3m_iucopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); int chemm3m_ilcopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); int chemm3m_oucopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); int chemm3m_olcopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); int chemm3m_oucopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); int chemm3m_olcopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, 
BLASLONG y, float alpha_r, float alpha_i, float *b); int chemm3m_oucopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); int chemm3m_olcopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); int zhemm3m_iucopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); int zhemm3m_ilcopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); int zhemm3m_iucopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); int zhemm3m_ilcopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); int zhemm3m_iucopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); int zhemm3m_ilcopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); int zhemm3m_oucopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); int zhemm3m_olcopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); int zhemm3m_oucopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); int zhemm3m_olcopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); int zhemm3m_oucopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); int zhemm3m_olcopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); int xhemm3m_iucopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); int xhemm3m_ilcopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); int xhemm3m_iucopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); int xhemm3m_ilcopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); int xhemm3m_iucopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); int xhemm3m_ilcopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); int xhemm3m_oucopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); int xhemm3m_olcopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); int xhemm3m_oucopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); int xhemm3m_olcopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); int xhemm3m_oucopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); int xhemm3m_olcopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); int sgemc_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemc_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemc_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemc_tt(blas_arg_t *, BLASLONG *, BLASLONG *, 
float *, float *, BLASLONG); int dgemc_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dgemc_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dgemc_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int dgemc_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int qgemc_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qgemc_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qgemc_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int qgemc_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int cgemc_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemc_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemc_nr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemc_nc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemc_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemc_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemc_tr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemc_tc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemc_rn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemc_rt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemc_rr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemc_rc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemc_cn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemc_ct(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemc_cr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int cgemc_cc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int zgemc_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemc_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemc_nr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemc_nc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemc_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemc_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemc_tr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemc_tc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemc_rn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemc_rt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemc_rr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemc_rc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemc_cn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemc_ct(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemc_cr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int zgemc_cc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); int xgemc_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemc_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemc_nr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int 
xgemc_nc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemc_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemc_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemc_tr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemc_tc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemc_rn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemc_rt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemc_rr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemc_rc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemc_cn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemc_ct(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemc_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemc_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int sgemc_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b, BLASLONG ldb, float *c); int sgemc_otcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b, BLASLONG ldb, float *c); int dgemc_oncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b, BLASLONG ldb, double *c); int dgemc_otcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b, BLASLONG ldb, double *c); int qgemc_oncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b, BLASLONG ldb, xdouble *c); int qgemc_otcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b, BLASLONG ldb, xdouble *c); int cgemc_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b, BLASLONG ldb, float *c); int cgemc_otcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b, BLASLONG ldb, float *c); int zgemc_oncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b, BLASLONG ldb, double *c); int zgemc_otcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b, BLASLONG ldb, double *c); int xgemc_oncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b, BLASLONG ldb, xdouble *c); int xgemc_otcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b, BLASLONG ldb, xdouble *c); int somatcopy_k_cn(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); int somatcopy_k_rn(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); int somatcopy_k_ct(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); int somatcopy_k_rt(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); int simatcopy_k_cn(BLASLONG, BLASLONG, float, float *, BLASLONG); int simatcopy_k_rn(BLASLONG, BLASLONG, float, float *, BLASLONG); int simatcopy_k_ct(BLASLONG, BLASLONG, float, float *, BLASLONG); int simatcopy_k_rt(BLASLONG, BLASLONG, float, float *, BLASLONG); int domatcopy_k_cn(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); int domatcopy_k_rn(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); int domatcopy_k_ct(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); int domatcopy_k_rt(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); int dimatcopy_k_cn(BLASLONG, BLASLONG, double, double *, BLASLONG); int dimatcopy_k_rn(BLASLONG, BLASLONG, double, double *, BLASLONG); int dimatcopy_k_ct(BLASLONG, BLASLONG, double, double *, BLASLONG); int dimatcopy_k_rt(BLASLONG, BLASLONG, double, double *, BLASLONG); 
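/*
 * The ?omatcopy_k_* kernels declared above back the omatcopy extension,
 * B := alpha * op(A); the ?imatcopy_k_* kernels are the in-place variant.
 * Reading the suffix as <c|r><n|t> = column-/row-major storage and
 * op(A) = A or A^T is an assumption.  A plain-C reference of what the
 * column-major transposed case computes, for illustration only (this is
 * not the tuned kernel):
 */
#include <stdio.h>

/* B (n x m, leading dimension ldb) := alpha * A^T, where A is m x n with
 * leading dimension lda; both matrices are stored column-major. */
static void omatcopy_ct_ref(int m, int n, float alpha,
                            const float *a, int lda,
                            float *b, int ldb) {
  for (int j = 0; j < n; j++)          /* column index of A */
    for (int i = 0; i < m; i++)        /* row index of A    */
      b[j + (size_t)i * ldb] = alpha * a[i + (size_t)j * lda];
}

int main(void) {
  float a[4] = {1, 2, 3, 4};           /* A = [1 3; 2 4], column-major */
  float b[4];
  omatcopy_ct_ref(2, 2, 2.0f, a, 2, b, 2);
  printf("%g %g %g %g\n", b[0], b[1], b[2], b[3]);  /* 2 6 4 8 */
  return 0;
}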
int comatcopy_k_cn(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_rn(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_ct(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_rt(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int cimatcopy_k_cn(BLASLONG, BLASLONG, float, float, float *, BLASLONG); int cimatcopy_k_rn(BLASLONG, BLASLONG, float, float, float *, BLASLONG); int cimatcopy_k_ct(BLASLONG, BLASLONG, float, float, float *, BLASLONG); int cimatcopy_k_rt(BLASLONG, BLASLONG, float, float, float *, BLASLONG); int comatcopy_k_cnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_rnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_ctc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_rtc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int cimatcopy_k_cnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG); int cimatcopy_k_rnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG); int cimatcopy_k_ctc(BLASLONG, BLASLONG, float, float, float *, BLASLONG); int cimatcopy_k_rtc(BLASLONG, BLASLONG, float, float, float *, BLASLONG); int zomatcopy_k_cn(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rn(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_ct(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rt(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zimatcopy_k_cn(BLASLONG, BLASLONG, double, double, double *, BLASLONG); int zimatcopy_k_rn(BLASLONG, BLASLONG, double, double, double *, BLASLONG); int zimatcopy_k_ct(BLASLONG, BLASLONG, double, double, double *, BLASLONG); int zimatcopy_k_rt(BLASLONG, BLASLONG, double, double, double *, BLASLONG); int zomatcopy_k_cnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_ctc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zimatcopy_k_cnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG); int zimatcopy_k_rnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG); int zimatcopy_k_ctc(BLASLONG, BLASLONG, double, double, double *, BLASLONG); int zimatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, BLASLONG); int sgeadd_k(BLASLONG, BLASLONG, float, float*, BLASLONG, float, float *, BLASLONG); int dgeadd_k(BLASLONG, BLASLONG, double, double*, BLASLONG, double, double *, BLASLONG); int cgeadd_k(BLASLONG, BLASLONG, float, float, float*, BLASLONG, float, float, float *, BLASLONG); int zgeadd_k(BLASLONG, BLASLONG, double,double, double*, BLASLONG, double, double, double *, BLASLONG); #ifdef __CUDACC__ } #endif #endif OpenBLAS-0.2.20/common_linux.h000066400000000000000000000104201313527062700160210ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #ifndef COMMON_LINUX_H #define COMMON_LINUX_H #ifndef ASSEMBLER #include extern long int syscall (long int __sysno, ...); #ifndef MPOL_PREFERRED #define MPOL_PREFERRED 1 #endif #ifndef MPOL_INTERLEAVE #define MPOL_INTERLEAVE 3 #endif #if defined(ARCH_IA64) && defined(__ECC) #ifndef __NR_mbind #define __NR_mbind 1259 #endif #ifndef __NR_get_mempolicy #define __NR_get_mempolicy 1260 #endif #ifndef __NR_set_mempolicy #define __NR_set_mempolicy 1261 #endif #endif static inline int my_mbind(void *addr, unsigned long len, int mode, unsigned long *nodemask, unsigned long maxnode, unsigned flags) { #if defined (__LSB_VERSION__) || defined(ARCH_ZARCH) // So far, LSB (Linux Standard Base) does not support syscall(). // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 return 0; #else #if defined (LOONGSON3B) #if defined (__64BIT__) return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); #else return 0; // Null implementation on 32-bit Loongson 3B. #endif #else // Fixes a random SEGFAULT when nodemask==NULL on Linux 2.6.34 and above // unsigned long null_nodemask=0; return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); #endif #endif } static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { #if defined (__LSB_VERSION__) || defined(ARCH_ZARCH) // So far, LSB (Linux Standard Base) does not support syscall().
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 return 0; #else return syscall(SYS_set_mempolicy, mode, addr, flag); #endif } static inline int my_gettid(void) { #ifdef SYS_gettid return syscall(SYS_gettid); #else return getpid(); #endif } #endif #endif OpenBLAS-0.2.20/common_macro.h000066400000000000000000002436201313527062700157750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #ifndef COMMON_MACRO #define COMMON_MACRO #include "common_s.h" #include "common_d.h" #include "common_q.h" #include "common_c.h" #include "common_z.h" #include "common_x.h" #ifndef COMPLEX #ifdef XDOUBLE #define AMAX_K QAMAX_K #define AMIN_K QAMIN_K #define MAX_K QMAX_K #define MIN_K QMIN_K #define IAMAX_K IQAMAX_K #define IAMIN_K IQAMIN_K #define IMAX_K IQMAX_K #define IMIN_K IQMIN_K #define ASUM_K QASUM_K #define AXPYU_K QAXPYU_K #define AXPYC_K QAXPYC_K #define COPY_K QCOPY_K #define DOTU_K QDOTU_K #define DOTC_K QDOTC_K #define NRM2_K QNRM2_K #define SCAL_K QSCAL_K #define SWAP_K QSWAP_K #define ROT_K QROT_K #define GEMV_N QGEMV_N #define GEMV_T QGEMV_T #define GEMV_R QGEMV_R #define GEMV_C QGEMV_C #define GEMV_O QGEMV_O #define GEMV_U QGEMV_U #define GEMV_S QGEMV_S #define GEMV_D QGEMV_D #define GERU_K QGERU_K #define GERC_K QGERC_K #define GERV_K QGERV_K #define GERD_K QGERD_K #define SYMV_U QSYMV_U #define SYMV_L QSYMV_L #define SYMV_THREAD_U QSYMV_THREAD_U #define SYMV_THREAD_L QSYMV_THREAD_L #define GEMM_ONCOPY QGEMM_ONCOPY #define GEMM_OTCOPY QGEMM_OTCOPY #define GEMM_INCOPY QGEMM_INCOPY #define GEMM_ITCOPY QGEMM_ITCOPY #ifdef UNIT #define TRMM_OUNCOPY QTRMM_OUNUCOPY #define TRMM_OUTCOPY QTRMM_OUTUCOPY #define TRMM_OLNCOPY QTRMM_OLNUCOPY #define TRMM_OLTCOPY QTRMM_OLTUCOPY #define TRSM_OUNCOPY QTRSM_OUNUCOPY #define TRSM_OUTCOPY QTRSM_OUTUCOPY #define TRSM_OLNCOPY QTRSM_OLNUCOPY #define TRSM_OLTCOPY QTRSM_OLTUCOPY #define TRMM_IUNCOPY QTRMM_IUNUCOPY #define TRMM_IUTCOPY QTRMM_IUTUCOPY #define TRMM_ILNCOPY QTRMM_ILNUCOPY #define TRMM_ILTCOPY QTRMM_ILTUCOPY #define TRSM_IUNCOPY QTRSM_IUNUCOPY #define TRSM_IUTCOPY QTRSM_IUTUCOPY #define TRSM_ILNCOPY QTRSM_ILNUCOPY #define TRSM_ILTCOPY QTRSM_ILTUCOPY #else #define TRMM_OUNCOPY QTRMM_OUNNCOPY #define TRMM_OUTCOPY QTRMM_OUTNCOPY #define TRMM_OLNCOPY QTRMM_OLNNCOPY #define TRMM_OLTCOPY QTRMM_OLTNCOPY #define TRSM_OUNCOPY QTRSM_OUNNCOPY #define TRSM_OUTCOPY QTRSM_OUTNCOPY #define TRSM_OLNCOPY QTRSM_OLNNCOPY #define TRSM_OLTCOPY QTRSM_OLTNCOPY #define TRMM_IUNCOPY QTRMM_IUNNCOPY #define TRMM_IUTCOPY QTRMM_IUTNCOPY #define TRMM_ILNCOPY QTRMM_ILNNCOPY #define TRMM_ILTCOPY QTRMM_ILTNCOPY #define TRSM_IUNCOPY QTRSM_IUNNCOPY #define TRSM_IUTCOPY QTRSM_IUTNCOPY #define TRSM_ILNCOPY QTRSM_ILNNCOPY #define TRSM_ILTCOPY QTRSM_ILTNCOPY #endif #define GEMM_BETA QGEMM_BETA #define GEMM_KERNEL_N QGEMM_KERNEL #define GEMM_KERNEL_L QGEMM_KERNEL #define GEMM_KERNEL_R QGEMM_KERNEL #define GEMM_KERNEL_B QGEMM_KERNEL #define TRMM_KERNEL_LN QTRMM_KERNEL_LN #define TRMM_KERNEL_LT QTRMM_KERNEL_LT #define TRMM_KERNEL_LR QTRMM_KERNEL_LN #define TRMM_KERNEL_LC QTRMM_KERNEL_LT #define TRMM_KERNEL_RN QTRMM_KERNEL_RN #define TRMM_KERNEL_RT QTRMM_KERNEL_RT #define TRMM_KERNEL_RR QTRMM_KERNEL_RN #define TRMM_KERNEL_RC QTRMM_KERNEL_RT #define TRSM_KERNEL_LN QTRSM_KERNEL_LN #define TRSM_KERNEL_LT QTRSM_KERNEL_LT #define TRSM_KERNEL_LR QTRSM_KERNEL_LN #define TRSM_KERNEL_LC QTRSM_KERNEL_LT #define TRSM_KERNEL_RN QTRSM_KERNEL_RN #define TRSM_KERNEL_RT QTRSM_KERNEL_RT #define TRSM_KERNEL_RR QTRSM_KERNEL_RN #define TRSM_KERNEL_RC QTRSM_KERNEL_RT #define SYMM_IUTCOPY QSYMM_IUTCOPY #define SYMM_ILTCOPY QSYMM_ILTCOPY #define SYMM_OUTCOPY QSYMM_OUTCOPY #define SYMM_OLTCOPY QSYMM_OLTCOPY #define GEMM_NN QGEMM_NN #define GEMM_CN QGEMM_TN #define GEMM_TN QGEMM_TN #define GEMM_NC QGEMM_NT #define GEMM_NT QGEMM_NT #define GEMM_CC QGEMM_TT #define GEMM_CT QGEMM_TT #define GEMM_TC QGEMM_TT #define GEMM_TT 
QGEMM_TT #define GEMM_NR QGEMM_NN #define GEMM_TR QGEMM_TN #define GEMM_CR QGEMM_TN #define GEMM_RN QGEMM_NN #define GEMM_RT QGEMM_NT #define GEMM_RC QGEMM_NT #define GEMM_RR QGEMM_NN #define SYMM_LU QSYMM_LU #define SYMM_LL QSYMM_LL #define SYMM_RU QSYMM_RU #define SYMM_RL QSYMM_RL #define HEMM_LU QHEMM_LU #define HEMM_LL QHEMM_LL #define HEMM_RU QHEMM_RU #define HEMM_RL QHEMM_RL #define SYRK_UN QSYRK_UN #define SYRK_UT QSYRK_UT #define SYRK_LN QSYRK_LN #define SYRK_LT QSYRK_LT #define SYRK_UR QSYRK_UN #define SYRK_UC QSYRK_UT #define SYRK_LR QSYRK_LN #define SYRK_LC QSYRK_LT #define SYRK_KERNEL_U QSYRK_KERNEL_U #define SYRK_KERNEL_L QSYRK_KERNEL_L #define HERK_UN QSYRK_UN #define HERK_LN QSYRK_LN #define HERK_UC QSYRK_UT #define HERK_LC QSYRK_LT #define HER2K_UN QSYR2K_UN #define HER2K_LN QSYR2K_LN #define HER2K_UC QSYR2K_UT #define HER2K_LC QSYR2K_LT #define SYR2K_UN QSYR2K_UN #define SYR2K_UT QSYR2K_UT #define SYR2K_LN QSYR2K_LN #define SYR2K_LT QSYR2K_LT #define SYR2K_UR QSYR2K_UN #define SYR2K_UC QSYR2K_UT #define SYR2K_LR QSYR2K_LN #define SYR2K_LC QSYR2K_LT #define SYR2K_KERNEL_U QSYR2K_KERNEL_U #define SYR2K_KERNEL_L QSYR2K_KERNEL_L #define TRMM_LNUU QTRMM_LNUU #define TRMM_LNUN QTRMM_LNUN #define TRMM_LNLU QTRMM_LNLU #define TRMM_LNLN QTRMM_LNLN #define TRMM_LTUU QTRMM_LTUU #define TRMM_LTUN QTRMM_LTUN #define TRMM_LTLU QTRMM_LTLU #define TRMM_LTLN QTRMM_LTLN #define TRMM_LRUU QTRMM_LNUU #define TRMM_LRUN QTRMM_LNUN #define TRMM_LRLU QTRMM_LNLU #define TRMM_LRLN QTRMM_LNLN #define TRMM_LCUU QTRMM_LTUU #define TRMM_LCUN QTRMM_LTUN #define TRMM_LCLU QTRMM_LTLU #define TRMM_LCLN QTRMM_LTLN #define TRMM_RNUU QTRMM_RNUU #define TRMM_RNUN QTRMM_RNUN #define TRMM_RNLU QTRMM_RNLU #define TRMM_RNLN QTRMM_RNLN #define TRMM_RTUU QTRMM_RTUU #define TRMM_RTUN QTRMM_RTUN #define TRMM_RTLU QTRMM_RTLU #define TRMM_RTLN QTRMM_RTLN #define TRMM_RRUU QTRMM_RNUU #define TRMM_RRUN QTRMM_RNUN #define TRMM_RRLU QTRMM_RNLU #define TRMM_RRLN QTRMM_RNLN #define TRMM_RCUU QTRMM_RTUU #define TRMM_RCUN QTRMM_RTUN #define TRMM_RCLU QTRMM_RTLU #define TRMM_RCLN QTRMM_RTLN #define TRSM_LNUU QTRSM_LNUU #define TRSM_LNUN QTRSM_LNUN #define TRSM_LNLU QTRSM_LNLU #define TRSM_LNLN QTRSM_LNLN #define TRSM_LTUU QTRSM_LTUU #define TRSM_LTUN QTRSM_LTUN #define TRSM_LTLU QTRSM_LTLU #define TRSM_LTLN QTRSM_LTLN #define TRSM_LRUU QTRSM_LNUU #define TRSM_LRUN QTRSM_LNUN #define TRSM_LRLU QTRSM_LNLU #define TRSM_LRLN QTRSM_LNLN #define TRSM_LCUU QTRSM_LTUU #define TRSM_LCUN QTRSM_LTUN #define TRSM_LCLU QTRSM_LTLU #define TRSM_LCLN QTRSM_LTLN #define TRSM_RNUU QTRSM_RNUU #define TRSM_RNUN QTRSM_RNUN #define TRSM_RNLU QTRSM_RNLU #define TRSM_RNLN QTRSM_RNLN #define TRSM_RTUU QTRSM_RTUU #define TRSM_RTUN QTRSM_RTUN #define TRSM_RTLU QTRSM_RTLU #define TRSM_RTLN QTRSM_RTLN #define TRSM_RRUU QTRSM_RNUU #define TRSM_RRUN QTRSM_RNUN #define TRSM_RRLU QTRSM_RNLU #define TRSM_RRLN QTRSM_RNLN #define TRSM_RCUU QTRSM_RTUU #define TRSM_RCUN QTRSM_RTUN #define TRSM_RCLU QTRSM_RTLU #define TRSM_RCLN QTRSM_RTLN #define GEMM_THREAD_NN QGEMM_THREAD_NN #define GEMM_THREAD_CN QGEMM_THREAD_TN #define GEMM_THREAD_TN QGEMM_THREAD_TN #define GEMM_THREAD_NC QGEMM_THREAD_NT #define GEMM_THREAD_NT QGEMM_THREAD_NT #define GEMM_THREAD_CC QGEMM_THREAD_TT #define GEMM_THREAD_CT QGEMM_THREAD_TT #define GEMM_THREAD_TC QGEMM_THREAD_TT #define GEMM_THREAD_TT QGEMM_THREAD_TT #define GEMM_THREAD_NR QGEMM_THREAD_NN #define GEMM_THREAD_TR QGEMM_THREAD_TN #define GEMM_THREAD_CR QGEMM_THREAD_TN #define GEMM_THREAD_RN QGEMM_THREAD_NN #define GEMM_THREAD_RT 
QGEMM_THREAD_NT #define GEMM_THREAD_RC QGEMM_THREAD_NT #define GEMM_THREAD_RR QGEMM_THREAD_NN #define SYMM_THREAD_LU QSYMM_THREAD_LU #define SYMM_THREAD_LL QSYMM_THREAD_LL #define SYMM_THREAD_RU QSYMM_THREAD_RU #define SYMM_THREAD_RL QSYMM_THREAD_RL #define HEMM_THREAD_LU QHEMM_THREAD_LU #define HEMM_THREAD_LL QHEMM_THREAD_LL #define HEMM_THREAD_RU QHEMM_THREAD_RU #define HEMM_THREAD_RL QHEMM_THREAD_RL #define SYRK_THREAD_UN QSYRK_THREAD_UN #define SYRK_THREAD_UT QSYRK_THREAD_UT #define SYRK_THREAD_LN QSYRK_THREAD_LN #define SYRK_THREAD_LT QSYRK_THREAD_LT #define SYRK_THREAD_UR QSYRK_THREAD_UR #define SYRK_THREAD_UC QSYRK_THREAD_UC #define SYRK_THREAD_LR QSYRK_THREAD_LN #define SYRK_THREAD_LC QSYRK_THREAD_LT #define HERK_THREAD_UN QSYRK_THREAD_UN #define HERK_THREAD_UT QSYRK_THREAD_UT #define HERK_THREAD_LN QSYRK_THREAD_LN #define HERK_THREAD_LT QSYRK_THREAD_LT #define HERK_THREAD_UR QSYRK_THREAD_UR #define HERK_THREAD_UC QSYRK_THREAD_UC #define HERK_THREAD_LR QSYRK_THREAD_LN #define HERK_THREAD_LC QSYRK_THREAD_LT #elif defined(DOUBLE) #define AMAX_K DAMAX_K #define AMIN_K DAMIN_K #define MAX_K DMAX_K #define MIN_K DMIN_K #define IAMAX_K IDAMAX_K #define IAMIN_K IDAMIN_K #define IMAX_K IDMAX_K #define IMIN_K IDMIN_K #define ASUM_K DASUM_K #define AXPYU_K DAXPYU_K #define AXPYC_K DAXPYC_K #define COPY_K DCOPY_K #define DOTU_K DDOTU_K #define DOTC_K DDOTC_K #define NRM2_K DNRM2_K #define SCAL_K DSCAL_K #define SWAP_K DSWAP_K #define ROT_K DROT_K #define GEMV_N DGEMV_N #define GEMV_T DGEMV_T #define GEMV_R DGEMV_R #define GEMV_C DGEMV_C #define GEMV_O DGEMV_O #define GEMV_U DGEMV_U #define GEMV_S DGEMV_S #define GEMV_D DGEMV_D #define GERU_K DGERU_K #define GERC_K DGERC_K #define GERV_K DGERV_K #define GERD_K DGERD_K #define SYMV_U DSYMV_U #define SYMV_L DSYMV_L #define SYMV_THREAD_U DSYMV_THREAD_U #define SYMV_THREAD_L DSYMV_THREAD_L #define GEMM_ONCOPY DGEMM_ONCOPY #define GEMM_OTCOPY DGEMM_OTCOPY #define GEMM_INCOPY DGEMM_INCOPY #define GEMM_ITCOPY DGEMM_ITCOPY #ifdef UNIT #define TRMM_OUNCOPY DTRMM_OUNUCOPY #define TRMM_OUTCOPY DTRMM_OUTUCOPY #define TRMM_OLNCOPY DTRMM_OLNUCOPY #define TRMM_OLTCOPY DTRMM_OLTUCOPY #define TRSM_OUNCOPY DTRSM_OUNUCOPY #define TRSM_OUTCOPY DTRSM_OUTUCOPY #define TRSM_OLNCOPY DTRSM_OLNUCOPY #define TRSM_OLTCOPY DTRSM_OLTUCOPY #define TRMM_IUNCOPY DTRMM_IUNUCOPY #define TRMM_IUTCOPY DTRMM_IUTUCOPY #define TRMM_ILNCOPY DTRMM_ILNUCOPY #define TRMM_ILTCOPY DTRMM_ILTUCOPY #define TRSM_IUNCOPY DTRSM_IUNUCOPY #define TRSM_IUTCOPY DTRSM_IUTUCOPY #define TRSM_ILNCOPY DTRSM_ILNUCOPY #define TRSM_ILTCOPY DTRSM_ILTUCOPY #else #define TRMM_OUNCOPY DTRMM_OUNNCOPY #define TRMM_OUTCOPY DTRMM_OUTNCOPY #define TRMM_OLNCOPY DTRMM_OLNNCOPY #define TRMM_OLTCOPY DTRMM_OLTNCOPY #define TRSM_OUNCOPY DTRSM_OUNNCOPY #define TRSM_OUTCOPY DTRSM_OUTNCOPY #define TRSM_OLNCOPY DTRSM_OLNNCOPY #define TRSM_OLTCOPY DTRSM_OLTNCOPY #define TRMM_IUNCOPY DTRMM_IUNNCOPY #define TRMM_IUTCOPY DTRMM_IUTNCOPY #define TRMM_ILNCOPY DTRMM_ILNNCOPY #define TRMM_ILTCOPY DTRMM_ILTNCOPY #define TRSM_IUNCOPY DTRSM_IUNNCOPY #define TRSM_IUTCOPY DTRSM_IUTNCOPY #define TRSM_ILNCOPY DTRSM_ILNNCOPY #define TRSM_ILTCOPY DTRSM_ILTNCOPY #endif #define GEMM_BETA DGEMM_BETA #define GEMM_KERNEL_N DGEMM_KERNEL #define GEMM_KERNEL_L DGEMM_KERNEL #define GEMM_KERNEL_R DGEMM_KERNEL #define GEMM_KERNEL_B DGEMM_KERNEL #define TRMM_KERNEL_LN DTRMM_KERNEL_LN #define TRMM_KERNEL_LT DTRMM_KERNEL_LT #define TRMM_KERNEL_LR DTRMM_KERNEL_LN #define TRMM_KERNEL_LC DTRMM_KERNEL_LT #define TRMM_KERNEL_RN DTRMM_KERNEL_RN #define 
TRMM_KERNEL_RT DTRMM_KERNEL_RT #define TRMM_KERNEL_RR DTRMM_KERNEL_RN #define TRMM_KERNEL_RC DTRMM_KERNEL_RT #define TRSM_KERNEL_LN DTRSM_KERNEL_LN #define TRSM_KERNEL_LT DTRSM_KERNEL_LT #define TRSM_KERNEL_LR DTRSM_KERNEL_LN #define TRSM_KERNEL_LC DTRSM_KERNEL_LT #define TRSM_KERNEL_RN DTRSM_KERNEL_RN #define TRSM_KERNEL_RT DTRSM_KERNEL_RT #define TRSM_KERNEL_RR DTRSM_KERNEL_RN #define TRSM_KERNEL_RC DTRSM_KERNEL_RT #define SYMM_IUTCOPY DSYMM_IUTCOPY #define SYMM_ILTCOPY DSYMM_ILTCOPY #define SYMM_OUTCOPY DSYMM_OUTCOPY #define SYMM_OLTCOPY DSYMM_OLTCOPY #define GEMM_NN DGEMM_NN #define GEMM_CN DGEMM_TN #define GEMM_TN DGEMM_TN #define GEMM_NC DGEMM_NT #define GEMM_NT DGEMM_NT #define GEMM_CC DGEMM_TT #define GEMM_CT DGEMM_TT #define GEMM_TC DGEMM_TT #define GEMM_TT DGEMM_TT #define GEMM_NR DGEMM_NN #define GEMM_TR DGEMM_TN #define GEMM_CR DGEMM_TN #define GEMM_RN DGEMM_NN #define GEMM_RT DGEMM_NT #define GEMM_RC DGEMM_NT #define GEMM_RR DGEMM_NN #define SYMM_LU DSYMM_LU #define SYMM_LL DSYMM_LL #define SYMM_RU DSYMM_RU #define SYMM_RL DSYMM_RL #define HEMM_LU DHEMM_LU #define HEMM_LL DHEMM_LL #define HEMM_RU DHEMM_RU #define HEMM_RL DHEMM_RL #define SYRK_UN DSYRK_UN #define SYRK_UT DSYRK_UT #define SYRK_LN DSYRK_LN #define SYRK_LT DSYRK_LT #define SYRK_UR DSYRK_UN #define SYRK_UC DSYRK_UT #define SYRK_LR DSYRK_LN #define SYRK_LC DSYRK_LT #define SYRK_KERNEL_U DSYRK_KERNEL_U #define SYRK_KERNEL_L DSYRK_KERNEL_L #define HERK_UN DSYRK_UN #define HERK_LN DSYRK_LN #define HERK_UC DSYRK_UT #define HERK_LC DSYRK_LT #define HER2K_UN DSYR2K_UN #define HER2K_LN DSYR2K_LN #define HER2K_UC DSYR2K_UT #define HER2K_LC DSYR2K_LT #define SYR2K_UN DSYR2K_UN #define SYR2K_UT DSYR2K_UT #define SYR2K_LN DSYR2K_LN #define SYR2K_LT DSYR2K_LT #define SYR2K_UR DSYR2K_UN #define SYR2K_UC DSYR2K_UT #define SYR2K_LR DSYR2K_LN #define SYR2K_LC DSYR2K_LT #define SYR2K_KERNEL_U DSYR2K_KERNEL_U #define SYR2K_KERNEL_L DSYR2K_KERNEL_L #define TRMM_LNUU DTRMM_LNUU #define TRMM_LNUN DTRMM_LNUN #define TRMM_LNLU DTRMM_LNLU #define TRMM_LNLN DTRMM_LNLN #define TRMM_LTUU DTRMM_LTUU #define TRMM_LTUN DTRMM_LTUN #define TRMM_LTLU DTRMM_LTLU #define TRMM_LTLN DTRMM_LTLN #define TRMM_LRUU DTRMM_LNUU #define TRMM_LRUN DTRMM_LNUN #define TRMM_LRLU DTRMM_LNLU #define TRMM_LRLN DTRMM_LNLN #define TRMM_LCUU DTRMM_LTUU #define TRMM_LCUN DTRMM_LTUN #define TRMM_LCLU DTRMM_LTLU #define TRMM_LCLN DTRMM_LTLN #define TRMM_RNUU DTRMM_RNUU #define TRMM_RNUN DTRMM_RNUN #define TRMM_RNLU DTRMM_RNLU #define TRMM_RNLN DTRMM_RNLN #define TRMM_RTUU DTRMM_RTUU #define TRMM_RTUN DTRMM_RTUN #define TRMM_RTLU DTRMM_RTLU #define TRMM_RTLN DTRMM_RTLN #define TRMM_RRUU DTRMM_RNUU #define TRMM_RRUN DTRMM_RNUN #define TRMM_RRLU DTRMM_RNLU #define TRMM_RRLN DTRMM_RNLN #define TRMM_RCUU DTRMM_RTUU #define TRMM_RCUN DTRMM_RTUN #define TRMM_RCLU DTRMM_RTLU #define TRMM_RCLN DTRMM_RTLN #define TRSM_LNUU DTRSM_LNUU #define TRSM_LNUN DTRSM_LNUN #define TRSM_LNLU DTRSM_LNLU #define TRSM_LNLN DTRSM_LNLN #define TRSM_LTUU DTRSM_LTUU #define TRSM_LTUN DTRSM_LTUN #define TRSM_LTLU DTRSM_LTLU #define TRSM_LTLN DTRSM_LTLN #define TRSM_LRUU DTRSM_LNUU #define TRSM_LRUN DTRSM_LNUN #define TRSM_LRLU DTRSM_LNLU #define TRSM_LRLN DTRSM_LNLN #define TRSM_LCUU DTRSM_LTUU #define TRSM_LCUN DTRSM_LTUN #define TRSM_LCLU DTRSM_LTLU #define TRSM_LCLN DTRSM_LTLN #define TRSM_RNUU DTRSM_RNUU #define TRSM_RNUN DTRSM_RNUN #define TRSM_RNLU DTRSM_RNLU #define TRSM_RNLN DTRSM_RNLN #define TRSM_RTUU DTRSM_RTUU #define TRSM_RTUN DTRSM_RTUN #define TRSM_RTLU DTRSM_RTLU #define TRSM_RTLN 
DTRSM_RTLN #define TRSM_RRUU DTRSM_RNUU #define TRSM_RRUN DTRSM_RNUN #define TRSM_RRLU DTRSM_RNLU #define TRSM_RRLN DTRSM_RNLN #define TRSM_RCUU DTRSM_RTUU #define TRSM_RCUN DTRSM_RTUN #define TRSM_RCLU DTRSM_RTLU #define TRSM_RCLN DTRSM_RTLN #define GEMM_THREAD_NN DGEMM_THREAD_NN #define GEMM_THREAD_CN DGEMM_THREAD_TN #define GEMM_THREAD_TN DGEMM_THREAD_TN #define GEMM_THREAD_NC DGEMM_THREAD_NT #define GEMM_THREAD_NT DGEMM_THREAD_NT #define GEMM_THREAD_CC DGEMM_THREAD_TT #define GEMM_THREAD_CT DGEMM_THREAD_TT #define GEMM_THREAD_TC DGEMM_THREAD_TT #define GEMM_THREAD_TT DGEMM_THREAD_TT #define GEMM_THREAD_NR DGEMM_THREAD_NN #define GEMM_THREAD_TR DGEMM_THREAD_TN #define GEMM_THREAD_CR DGEMM_THREAD_TN #define GEMM_THREAD_RN DGEMM_THREAD_NN #define GEMM_THREAD_RT DGEMM_THREAD_NT #define GEMM_THREAD_RC DGEMM_THREAD_NT #define GEMM_THREAD_RR DGEMM_THREAD_NN #define SYMM_THREAD_LU DSYMM_THREAD_LU #define SYMM_THREAD_LL DSYMM_THREAD_LL #define SYMM_THREAD_RU DSYMM_THREAD_RU #define SYMM_THREAD_RL DSYMM_THREAD_RL #define HEMM_THREAD_LU DHEMM_THREAD_LU #define HEMM_THREAD_LL DHEMM_THREAD_LL #define HEMM_THREAD_RU DHEMM_THREAD_RU #define HEMM_THREAD_RL DHEMM_THREAD_RL #define SYRK_THREAD_UN DSYRK_THREAD_UN #define SYRK_THREAD_UT DSYRK_THREAD_UT #define SYRK_THREAD_LN DSYRK_THREAD_LN #define SYRK_THREAD_LT DSYRK_THREAD_LT #define SYRK_THREAD_UR DSYRK_THREAD_UR #define SYRK_THREAD_UC DSYRK_THREAD_UC #define SYRK_THREAD_LR DSYRK_THREAD_LN #define SYRK_THREAD_LC DSYRK_THREAD_LT #define HERK_THREAD_UN DSYRK_THREAD_UN #define HERK_THREAD_UT DSYRK_THREAD_UT #define HERK_THREAD_LN DSYRK_THREAD_LN #define HERK_THREAD_LT DSYRK_THREAD_LT #define HERK_THREAD_UR DSYRK_THREAD_UR #define HERK_THREAD_UC DSYRK_THREAD_UC #define HERK_THREAD_LR DSYRK_THREAD_LN #define HERK_THREAD_LC DSYRK_THREAD_LT #define AXPBY_K DAXPBY_K #define OMATCOPY_K_CN DOMATCOPY_K_CN #define OMATCOPY_K_RN DOMATCOPY_K_RN #define OMATCOPY_K_CT DOMATCOPY_K_CT #define OMATCOPY_K_RT DOMATCOPY_K_RT #define IMATCOPY_K_CN DIMATCOPY_K_CN #define IMATCOPY_K_RN DIMATCOPY_K_RN #define IMATCOPY_K_CT DIMATCOPY_K_CT #define IMATCOPY_K_RT DIMATCOPY_K_RT #define GEADD_K DGEADD_K #else #define AMAX_K SAMAX_K #define AMIN_K SAMIN_K #define MAX_K SMAX_K #define MIN_K SMIN_K #define IAMAX_K ISAMAX_K #define IAMIN_K ISAMIN_K #define IMAX_K ISMAX_K #define IMIN_K ISMIN_K #define ASUM_K SASUM_K #define AXPYU_K SAXPYU_K #define AXPYC_K SAXPYU_K #define COPY_K SCOPY_K #define DOTU_K SDOTU_K #define DOTC_K SDOTC_K #define NRM2_K SNRM2_K #define SCAL_K SSCAL_K #define SWAP_K SSWAP_K #define ROT_K SROT_K #define GEMV_N SGEMV_N #define GEMV_T SGEMV_T #define GEMV_R SGEMV_R #define GEMV_C SGEMV_C #define GEMV_O SGEMV_O #define GEMV_U SGEMV_U #define GEMV_S SGEMV_S #define GEMV_D SGEMV_D #define GERU_K SGERU_K #define GERC_K SGERC_K #define GERV_K SGERV_K #define GERD_K SGERD_K #define SYMV_U SSYMV_U #define SYMV_L SSYMV_L #define SYMV_THREAD_U SSYMV_THREAD_U #define SYMV_THREAD_L SSYMV_THREAD_L #define GEMM_ONCOPY SGEMM_ONCOPY #define GEMM_OTCOPY SGEMM_OTCOPY #define GEMM_INCOPY SGEMM_INCOPY #define GEMM_ITCOPY SGEMM_ITCOPY #ifdef UNIT #define TRMM_OUNCOPY STRMM_OUNUCOPY #define TRMM_OUTCOPY STRMM_OUTUCOPY #define TRMM_OLNCOPY STRMM_OLNUCOPY #define TRMM_OLTCOPY STRMM_OLTUCOPY #define TRSM_OUNCOPY STRSM_OUNUCOPY #define TRSM_OUTCOPY STRSM_OUTUCOPY #define TRSM_OLNCOPY STRSM_OLNUCOPY #define TRSM_OLTCOPY STRSM_OLTUCOPY #define TRMM_IUNCOPY STRMM_IUNUCOPY #define TRMM_IUTCOPY STRMM_IUTUCOPY #define TRMM_ILNCOPY STRMM_ILNUCOPY #define TRMM_ILTCOPY STRMM_ILTUCOPY #define 
TRSM_IUNCOPY STRSM_IUNUCOPY #define TRSM_IUTCOPY STRSM_IUTUCOPY #define TRSM_ILNCOPY STRSM_ILNUCOPY #define TRSM_ILTCOPY STRSM_ILTUCOPY #else #define TRMM_OUNCOPY STRMM_OUNNCOPY #define TRMM_OUTCOPY STRMM_OUTNCOPY #define TRMM_OLNCOPY STRMM_OLNNCOPY #define TRMM_OLTCOPY STRMM_OLTNCOPY #define TRSM_OUNCOPY STRSM_OUNNCOPY #define TRSM_OUTCOPY STRSM_OUTNCOPY #define TRSM_OLNCOPY STRSM_OLNNCOPY #define TRSM_OLTCOPY STRSM_OLTNCOPY #define TRMM_IUNCOPY STRMM_IUNNCOPY #define TRMM_IUTCOPY STRMM_IUTNCOPY #define TRMM_ILNCOPY STRMM_ILNNCOPY #define TRMM_ILTCOPY STRMM_ILTNCOPY #define TRSM_IUNCOPY STRSM_IUNNCOPY #define TRSM_IUTCOPY STRSM_IUTNCOPY #define TRSM_ILNCOPY STRSM_ILNNCOPY #define TRSM_ILTCOPY STRSM_ILTNCOPY #endif #define GEMM_BETA SGEMM_BETA #define GEMM_KERNEL_N SGEMM_KERNEL #define GEMM_KERNEL_L SGEMM_KERNEL #define GEMM_KERNEL_R SGEMM_KERNEL #define GEMM_KERNEL_B SGEMM_KERNEL #define TRMM_KERNEL_LN STRMM_KERNEL_LN #define TRMM_KERNEL_LT STRMM_KERNEL_LT #define TRMM_KERNEL_LR STRMM_KERNEL_LN #define TRMM_KERNEL_LC STRMM_KERNEL_LT #define TRMM_KERNEL_RN STRMM_KERNEL_RN #define TRMM_KERNEL_RT STRMM_KERNEL_RT #define TRMM_KERNEL_RR STRMM_KERNEL_RN #define TRMM_KERNEL_RC STRMM_KERNEL_RT #define TRSM_KERNEL_LN STRSM_KERNEL_LN #define TRSM_KERNEL_LT STRSM_KERNEL_LT #define TRSM_KERNEL_LR STRSM_KERNEL_LN #define TRSM_KERNEL_LC STRSM_KERNEL_LT #define TRSM_KERNEL_RN STRSM_KERNEL_RN #define TRSM_KERNEL_RT STRSM_KERNEL_RT #define TRSM_KERNEL_RR STRSM_KERNEL_RN #define TRSM_KERNEL_RC STRSM_KERNEL_RT #define SYMM_IUTCOPY SSYMM_IUTCOPY #define SYMM_ILTCOPY SSYMM_ILTCOPY #define SYMM_OUTCOPY SSYMM_OUTCOPY #define SYMM_OLTCOPY SSYMM_OLTCOPY #define GEMM_NN SGEMM_NN #define GEMM_CN SGEMM_TN #define GEMM_TN SGEMM_TN #define GEMM_NC SGEMM_NT #define GEMM_NT SGEMM_NT #define GEMM_CC SGEMM_TT #define GEMM_CT SGEMM_TT #define GEMM_TC SGEMM_TT #define GEMM_TT SGEMM_TT #define GEMM_NR SGEMM_NN #define GEMM_TR SGEMM_TN #define GEMM_CR SGEMM_TN #define GEMM_RN SGEMM_NN #define GEMM_RT SGEMM_NT #define GEMM_RC SGEMM_NT #define GEMM_RR SGEMM_NN #define SYMM_LU SSYMM_LU #define SYMM_LL SSYMM_LL #define SYMM_RU SSYMM_RU #define SYMM_RL SSYMM_RL #define HEMM_LU SHEMM_LU #define HEMM_LL SHEMM_LL #define HEMM_RU SHEMM_RU #define HEMM_RL SHEMM_RL #define SYRK_UN SSYRK_UN #define SYRK_UT SSYRK_UT #define SYRK_LN SSYRK_LN #define SYRK_LT SSYRK_LT #define SYRK_UR SSYRK_UN #define SYRK_UC SSYRK_UT #define SYRK_LR SSYRK_LN #define SYRK_LC SSYRK_LT #define SYRK_KERNEL_U SSYRK_KERNEL_U #define SYRK_KERNEL_L SSYRK_KERNEL_L #define HERK_UN SSYRK_UN #define HERK_LN SSYRK_LN #define HERK_UC SSYRK_UT #define HERK_LC SSYRK_LT #define HER2K_UN SSYR2K_UN #define HER2K_LN SSYR2K_LN #define HER2K_UC SSYR2K_UT #define HER2K_LC SSYR2K_LT #define SYR2K_UN SSYR2K_UN #define SYR2K_UT SSYR2K_UT #define SYR2K_LN SSYR2K_LN #define SYR2K_LT SSYR2K_LT #define SYR2K_UR SSYR2K_UN #define SYR2K_UC SSYR2K_UT #define SYR2K_LR SSYR2K_LN #define SYR2K_LC SSYR2K_LT #define SYR2K_KERNEL_U SSYR2K_KERNEL_U #define SYR2K_KERNEL_L SSYR2K_KERNEL_L #define TRMM_LNUU STRMM_LNUU #define TRMM_LNUN STRMM_LNUN #define TRMM_LNLU STRMM_LNLU #define TRMM_LNLN STRMM_LNLN #define TRMM_LTUU STRMM_LTUU #define TRMM_LTUN STRMM_LTUN #define TRMM_LTLU STRMM_LTLU #define TRMM_LTLN STRMM_LTLN #define TRMM_LRUU STRMM_LNUU #define TRMM_LRUN STRMM_LNUN #define TRMM_LRLU STRMM_LNLU #define TRMM_LRLN STRMM_LNLN #define TRMM_LCUU STRMM_LTUU #define TRMM_LCUN STRMM_LTUN #define TRMM_LCLU STRMM_LTLU #define TRMM_LCLN STRMM_LTLN #define TRMM_RNUU STRMM_RNUU #define TRMM_RNUN 
STRMM_RNUN #define TRMM_RNLU STRMM_RNLU #define TRMM_RNLN STRMM_RNLN #define TRMM_RTUU STRMM_RTUU #define TRMM_RTUN STRMM_RTUN #define TRMM_RTLU STRMM_RTLU #define TRMM_RTLN STRMM_RTLN #define TRMM_RRUU STRMM_RNUU #define TRMM_RRUN STRMM_RNUN #define TRMM_RRLU STRMM_RNLU #define TRMM_RRLN STRMM_RNLN #define TRMM_RCUU STRMM_RTUU #define TRMM_RCUN STRMM_RTUN #define TRMM_RCLU STRMM_RTLU #define TRMM_RCLN STRMM_RTLN #define TRSM_LNUU STRSM_LNUU #define TRSM_LNUN STRSM_LNUN #define TRSM_LNLU STRSM_LNLU #define TRSM_LNLN STRSM_LNLN #define TRSM_LTUU STRSM_LTUU #define TRSM_LTUN STRSM_LTUN #define TRSM_LTLU STRSM_LTLU #define TRSM_LTLN STRSM_LTLN #define TRSM_LRUU STRSM_LNUU #define TRSM_LRUN STRSM_LNUN #define TRSM_LRLU STRSM_LNLU #define TRSM_LRLN STRSM_LNLN #define TRSM_LCUU STRSM_LTUU #define TRSM_LCUN STRSM_LTUN #define TRSM_LCLU STRSM_LTLU #define TRSM_LCLN STRSM_LTLN #define TRSM_RNUU STRSM_RNUU #define TRSM_RNUN STRSM_RNUN #define TRSM_RNLU STRSM_RNLU #define TRSM_RNLN STRSM_RNLN #define TRSM_RTUU STRSM_RTUU #define TRSM_RTUN STRSM_RTUN #define TRSM_RTLU STRSM_RTLU #define TRSM_RTLN STRSM_RTLN #define TRSM_RRUU STRSM_RNUU #define TRSM_RRUN STRSM_RNUN #define TRSM_RRLU STRSM_RNLU #define TRSM_RRLN STRSM_RNLN #define TRSM_RCUU STRSM_RTUU #define TRSM_RCUN STRSM_RTUN #define TRSM_RCLU STRSM_RTLU #define TRSM_RCLN STRSM_RTLN #define GEMM_THREAD_NN SGEMM_THREAD_NN #define GEMM_THREAD_CN SGEMM_THREAD_TN #define GEMM_THREAD_TN SGEMM_THREAD_TN #define GEMM_THREAD_NC SGEMM_THREAD_NT #define GEMM_THREAD_NT SGEMM_THREAD_NT #define GEMM_THREAD_CC SGEMM_THREAD_TT #define GEMM_THREAD_CT SGEMM_THREAD_TT #define GEMM_THREAD_TC SGEMM_THREAD_TT #define GEMM_THREAD_TT SGEMM_THREAD_TT #define GEMM_THREAD_NR SGEMM_THREAD_NN #define GEMM_THREAD_TR SGEMM_THREAD_TN #define GEMM_THREAD_CR SGEMM_THREAD_TN #define GEMM_THREAD_RN SGEMM_THREAD_NN #define GEMM_THREAD_RT SGEMM_THREAD_NT #define GEMM_THREAD_RC SGEMM_THREAD_NT #define GEMM_THREAD_RR SGEMM_THREAD_NN #define SYMM_THREAD_LU SSYMM_THREAD_LU #define SYMM_THREAD_LL SSYMM_THREAD_LL #define SYMM_THREAD_RU SSYMM_THREAD_RU #define SYMM_THREAD_RL SSYMM_THREAD_RL #define HEMM_THREAD_LU SHEMM_THREAD_LU #define HEMM_THREAD_LL SHEMM_THREAD_LL #define HEMM_THREAD_RU SHEMM_THREAD_RU #define HEMM_THREAD_RL SHEMM_THREAD_RL #define SYRK_THREAD_UN SSYRK_THREAD_UN #define SYRK_THREAD_UT SSYRK_THREAD_UT #define SYRK_THREAD_LN SSYRK_THREAD_LN #define SYRK_THREAD_LT SSYRK_THREAD_LT #define SYRK_THREAD_UR SSYRK_THREAD_UR #define SYRK_THREAD_UC SSYRK_THREAD_UC #define SYRK_THREAD_LR SSYRK_THREAD_LN #define SYRK_THREAD_LC SSYRK_THREAD_LT #define HERK_THREAD_UN SSYRK_THREAD_UN #define HERK_THREAD_UT SSYRK_THREAD_UT #define HERK_THREAD_LN SSYRK_THREAD_LN #define HERK_THREAD_LT SSYRK_THREAD_LT #define HERK_THREAD_UR SSYRK_THREAD_UR #define HERK_THREAD_UC SSYRK_THREAD_UC #define HERK_THREAD_LR SSYRK_THREAD_LN #define HERK_THREAD_LC SSYRK_THREAD_LT #define AXPBY_K SAXPBY_K #define OMATCOPY_K_CN SOMATCOPY_K_CN #define OMATCOPY_K_RN SOMATCOPY_K_RN #define OMATCOPY_K_CT SOMATCOPY_K_CT #define OMATCOPY_K_RT SOMATCOPY_K_RT #define IMATCOPY_K_CN SIMATCOPY_K_CN #define IMATCOPY_K_RN SIMATCOPY_K_RN #define IMATCOPY_K_CT SIMATCOPY_K_CT #define IMATCOPY_K_RT SIMATCOPY_K_RT #define GEADD_K SGEADD_K #endif #else #ifdef XDOUBLE #define AMAX_K XAMAX_K #define AMIN_K XAMIN_K #define MAX_K XMAX_K #define MIN_K XMIN_K #define IAMAX_K IXAMAX_K #define IAMIN_K IXAMIN_K #define IMAX_K IXMAX_K #define IMIN_K IXMIN_K #define ASUM_K XASUM_K #define AXPYU_K XAXPYU_K #define AXPYC_K XAXPYC_K #define COPY_K 
XCOPY_K #define DOTU_K XDOTU_K #define DOTC_K XDOTC_K #define NRM2_K XNRM2_K #define SCAL_K XSCAL_K #define SWAP_K XSWAP_K #define ROT_K XROT_K #define GEMV_N XGEMV_N #define GEMV_T XGEMV_T #define GEMV_R XGEMV_R #define GEMV_C XGEMV_C #define GEMV_O XGEMV_O #define GEMV_U XGEMV_U #define GEMV_S XGEMV_S #define GEMV_D XGEMV_D #define GERU_K XGERU_K #define GERC_K XGERC_K #define GERV_K XGERV_K #define GERD_K XGERD_K #define SYMV_U XSYMV_U #define SYMV_L XSYMV_L #define HEMV_U XHEMV_U #define HEMV_L XHEMV_L #define HEMV_V XHEMV_V #define HEMV_M XHEMV_M #define SYMV_THREAD_U XSYMV_THREAD_U #define SYMV_THREAD_L XSYMV_THREAD_L #define HEMV_THREAD_U XHEMV_THREAD_U #define HEMV_THREAD_L XHEMV_THREAD_L #define HEMV_THREAD_V XHEMV_THREAD_V #define HEMV_THREAD_M XHEMV_THREAD_M #define GEMM_ONCOPY XGEMM_ONCOPY #define GEMM_OTCOPY XGEMM_OTCOPY #define GEMM_INCOPY XGEMM_INCOPY #define GEMM_ITCOPY XGEMM_ITCOPY #define GEMM3M_ONCOPYB XGEMM3M_ONCOPYB #define GEMM3M_ONCOPYR XGEMM3M_ONCOPYR #define GEMM3M_ONCOPYI XGEMM3M_ONCOPYI #define GEMM3M_OTCOPYB XGEMM3M_OTCOPYB #define GEMM3M_OTCOPYR XGEMM3M_OTCOPYR #define GEMM3M_OTCOPYI XGEMM3M_OTCOPYI #define GEMM3M_INCOPYB XGEMM3M_INCOPYB #define GEMM3M_INCOPYR XGEMM3M_INCOPYR #define GEMM3M_INCOPYI XGEMM3M_INCOPYI #define GEMM3M_ITCOPYB XGEMM3M_ITCOPYB #define GEMM3M_ITCOPYR XGEMM3M_ITCOPYR #define GEMM3M_ITCOPYI XGEMM3M_ITCOPYI #ifdef UNIT #define TRMM_OUNCOPY XTRMM_OUNUCOPY #define TRMM_OUTCOPY XTRMM_OUTUCOPY #define TRMM_OLNCOPY XTRMM_OLNUCOPY #define TRMM_OLTCOPY XTRMM_OLTUCOPY #define TRSM_OUNCOPY XTRSM_OUNUCOPY #define TRSM_OUTCOPY XTRSM_OUTUCOPY #define TRSM_OLNCOPY XTRSM_OLNUCOPY #define TRSM_OLTCOPY XTRSM_OLTUCOPY #define TRMM_IUNCOPY XTRMM_IUNUCOPY #define TRMM_IUTCOPY XTRMM_IUTUCOPY #define TRMM_ILNCOPY XTRMM_ILNUCOPY #define TRMM_ILTCOPY XTRMM_ILTUCOPY #define TRSM_IUNCOPY XTRSM_IUNUCOPY #define TRSM_IUTCOPY XTRSM_IUTUCOPY #define TRSM_ILNCOPY XTRSM_ILNUCOPY #define TRSM_ILTCOPY XTRSM_ILTUCOPY #else #define TRMM_OUNCOPY XTRMM_OUNNCOPY #define TRMM_OUTCOPY XTRMM_OUTNCOPY #define TRMM_OLNCOPY XTRMM_OLNNCOPY #define TRMM_OLTCOPY XTRMM_OLTNCOPY #define TRSM_OUNCOPY XTRSM_OUNNCOPY #define TRSM_OUTCOPY XTRSM_OUTNCOPY #define TRSM_OLNCOPY XTRSM_OLNNCOPY #define TRSM_OLTCOPY XTRSM_OLTNCOPY #define TRMM_IUNCOPY XTRMM_IUNNCOPY #define TRMM_IUTCOPY XTRMM_IUTNCOPY #define TRMM_ILNCOPY XTRMM_ILNNCOPY #define TRMM_ILTCOPY XTRMM_ILTNCOPY #define TRSM_IUNCOPY XTRSM_IUNNCOPY #define TRSM_IUTCOPY XTRSM_IUTNCOPY #define TRSM_ILNCOPY XTRSM_ILNNCOPY #define TRSM_ILTCOPY XTRSM_ILTNCOPY #endif #define SYMM3M_ILCOPYB XSYMM3M_ILCOPYB #define SYMM3M_IUCOPYB XSYMM3M_IUCOPYB #define SYMM3M_ILCOPYR XSYMM3M_ILCOPYR #define SYMM3M_IUCOPYR XSYMM3M_IUCOPYR #define SYMM3M_ILCOPYI XSYMM3M_ILCOPYI #define SYMM3M_IUCOPYI XSYMM3M_IUCOPYI #define SYMM3M_OLCOPYB XSYMM3M_OLCOPYB #define SYMM3M_OUCOPYB XSYMM3M_OUCOPYB #define SYMM3M_OLCOPYR XSYMM3M_OLCOPYR #define SYMM3M_OUCOPYR XSYMM3M_OUCOPYR #define SYMM3M_OLCOPYI XSYMM3M_OLCOPYI #define SYMM3M_OUCOPYI XSYMM3M_OUCOPYI #define HEMM3M_ILCOPYB XHEMM3M_ILCOPYB #define HEMM3M_IUCOPYB XHEMM3M_IUCOPYB #define HEMM3M_ILCOPYR XHEMM3M_ILCOPYR #define HEMM3M_IUCOPYR XHEMM3M_IUCOPYR #define HEMM3M_ILCOPYI XHEMM3M_ILCOPYI #define HEMM3M_IUCOPYI XHEMM3M_IUCOPYI #define HEMM3M_OLCOPYB XHEMM3M_OLCOPYB #define HEMM3M_OUCOPYB XHEMM3M_OUCOPYB #define HEMM3M_OLCOPYR XHEMM3M_OLCOPYR #define HEMM3M_OUCOPYR XHEMM3M_OUCOPYR #define HEMM3M_OLCOPYI XHEMM3M_OLCOPYI #define HEMM3M_OUCOPYI XHEMM3M_OUCOPYI #define GEMM_BETA XGEMM_BETA #define GEMM_KERNEL_N 
XGEMM_KERNEL_N #define GEMM_KERNEL_L XGEMM_KERNEL_L #define GEMM_KERNEL_R XGEMM_KERNEL_R #define GEMM_KERNEL_B XGEMM_KERNEL_B #define GEMM3M_KERNEL XGEMM3M_KERNEL #define TRMM_KERNEL_LN XTRMM_KERNEL_LN #define TRMM_KERNEL_LT XTRMM_KERNEL_LT #define TRMM_KERNEL_LR XTRMM_KERNEL_LR #define TRMM_KERNEL_LC XTRMM_KERNEL_LC #define TRMM_KERNEL_RN XTRMM_KERNEL_RN #define TRMM_KERNEL_RT XTRMM_KERNEL_RT #define TRMM_KERNEL_RR XTRMM_KERNEL_RR #define TRMM_KERNEL_RC XTRMM_KERNEL_RC #define TRSM_KERNEL_LN XTRSM_KERNEL_LN #define TRSM_KERNEL_LT XTRSM_KERNEL_LT #define TRSM_KERNEL_LR XTRSM_KERNEL_LR #define TRSM_KERNEL_LC XTRSM_KERNEL_LC #define TRSM_KERNEL_RN XTRSM_KERNEL_RN #define TRSM_KERNEL_RT XTRSM_KERNEL_RT #define TRSM_KERNEL_RR XTRSM_KERNEL_RR #define TRSM_KERNEL_RC XTRSM_KERNEL_RC #define GEMM_NN XGEMM_NN #define GEMM_CN XGEMM_CN #define GEMM_TN XGEMM_TN #define GEMM_NC XGEMM_NC #define GEMM_NT XGEMM_NT #define GEMM_CC XGEMM_CC #define GEMM_CT XGEMM_CT #define GEMM_TC XGEMM_TC #define GEMM_TT XGEMM_TT #define GEMM_NR XGEMM_NR #define GEMM_TR XGEMM_TR #define GEMM_CR XGEMM_CR #define GEMM_RN XGEMM_RN #define GEMM_RT XGEMM_RT #define GEMM_RC XGEMM_RC #define GEMM_RR XGEMM_RR #define SYMM_LU XSYMM_LU #define SYMM_LL XSYMM_LL #define SYMM_RU XSYMM_RU #define SYMM_RL XSYMM_RL #define HEMM_LU XHEMM_LU #define HEMM_LL XHEMM_LL #define HEMM_RU XHEMM_RU #define HEMM_RL XHEMM_RL #define HEMM_IUTCOPY XHEMM_IUTCOPY #define HEMM_ILTCOPY XHEMM_ILTCOPY #define HEMM_OUTCOPY XHEMM_OUTCOPY #define HEMM_OLTCOPY XHEMM_OLTCOPY #define SYRK_UN XSYRK_UN #define SYRK_UT XSYRK_UT #define SYRK_LN XSYRK_LN #define SYRK_LT XSYRK_LT #define SYRK_UR XSYRK_UN #define SYRK_UC XSYRK_UT #define SYRK_LR XSYRK_LN #define SYRK_LC XSYRK_LT #define SYRK_KERNEL_U XSYRK_KERNEL_U #define SYRK_KERNEL_L XSYRK_KERNEL_L #define HERK_UN XHERK_UN #define HERK_LN XHERK_LN #define HERK_UC XHERK_UC #define HERK_LC XHERK_LC #define HER2K_UN XHER2K_UN #define HER2K_LN XHER2K_LN #define HER2K_UC XHER2K_UC #define HER2K_LC XHER2K_LC #define SYR2K_UN XSYR2K_UN #define SYR2K_UT XSYR2K_UT #define SYR2K_LN XSYR2K_LN #define SYR2K_LT XSYR2K_LT #define SYR2K_UR XSYR2K_UN #define SYR2K_UC XSYR2K_UT #define SYR2K_LR XSYR2K_LN #define SYR2K_LC XSYR2K_LT #define SYR2K_KERNEL_U XSYR2K_KERNEL_U #define SYR2K_KERNEL_L XSYR2K_KERNEL_L #define TRMM_LNUU XTRMM_LNUU #define TRMM_LNUN XTRMM_LNUN #define TRMM_LNLU XTRMM_LNLU #define TRMM_LNLN XTRMM_LNLN #define TRMM_LTUU XTRMM_LTUU #define TRMM_LTUN XTRMM_LTUN #define TRMM_LTLU XTRMM_LTLU #define TRMM_LTLN XTRMM_LTLN #define TRMM_LRUU XTRMM_LRUU #define TRMM_LRUN XTRMM_LRUN #define TRMM_LRLU XTRMM_LRLU #define TRMM_LRLN XTRMM_LRLN #define TRMM_LCUU XTRMM_LCUU #define TRMM_LCUN XTRMM_LCUN #define TRMM_LCLU XTRMM_LCLU #define TRMM_LCLN XTRMM_LCLN #define TRMM_RNUU XTRMM_RNUU #define TRMM_RNUN XTRMM_RNUN #define TRMM_RNLU XTRMM_RNLU #define TRMM_RNLN XTRMM_RNLN #define TRMM_RTUU XTRMM_RTUU #define TRMM_RTUN XTRMM_RTUN #define TRMM_RTLU XTRMM_RTLU #define TRMM_RTLN XTRMM_RTLN #define TRMM_RRUU XTRMM_RRUU #define TRMM_RRUN XTRMM_RRUN #define TRMM_RRLU XTRMM_RRLU #define TRMM_RRLN XTRMM_RRLN #define TRMM_RCUU XTRMM_RCUU #define TRMM_RCUN XTRMM_RCUN #define TRMM_RCLU XTRMM_RCLU #define TRMM_RCLN XTRMM_RCLN #define TRSM_LNUU XTRSM_LNUU #define TRSM_LNUN XTRSM_LNUN #define TRSM_LNLU XTRSM_LNLU #define TRSM_LNLN XTRSM_LNLN #define TRSM_LTUU XTRSM_LTUU #define TRSM_LTUN XTRSM_LTUN #define TRSM_LTLU XTRSM_LTLU #define TRSM_LTLN XTRSM_LTLN #define TRSM_LRUU XTRSM_LRUU #define TRSM_LRUN XTRSM_LRUN #define TRSM_LRLU XTRSM_LRLU 
#define TRSM_LRLN XTRSM_LRLN #define TRSM_LCUU XTRSM_LCUU #define TRSM_LCUN XTRSM_LCUN #define TRSM_LCLU XTRSM_LCLU #define TRSM_LCLN XTRSM_LCLN #define TRSM_RNUU XTRSM_RNUU #define TRSM_RNUN XTRSM_RNUN #define TRSM_RNLU XTRSM_RNLU #define TRSM_RNLN XTRSM_RNLN #define TRSM_RTUU XTRSM_RTUU #define TRSM_RTUN XTRSM_RTUN #define TRSM_RTLU XTRSM_RTLU #define TRSM_RTLN XTRSM_RTLN #define TRSM_RRUU XTRSM_RRUU #define TRSM_RRUN XTRSM_RRUN #define TRSM_RRLU XTRSM_RRLU #define TRSM_RRLN XTRSM_RRLN #define TRSM_RCUU XTRSM_RCUU #define TRSM_RCUN XTRSM_RCUN #define TRSM_RCLU XTRSM_RCLU #define TRSM_RCLN XTRSM_RCLN #define GEMM_THREAD_NN XGEMM_THREAD_NN #define GEMM_THREAD_CN XGEMM_THREAD_CN #define GEMM_THREAD_TN XGEMM_THREAD_TN #define GEMM_THREAD_NC XGEMM_THREAD_NC #define GEMM_THREAD_NT XGEMM_THREAD_NT #define GEMM_THREAD_CC XGEMM_THREAD_CC #define GEMM_THREAD_CT XGEMM_THREAD_CT #define GEMM_THREAD_TC XGEMM_THREAD_TC #define GEMM_THREAD_TT XGEMM_THREAD_TT #define GEMM_THREAD_NR XGEMM_THREAD_NR #define GEMM_THREAD_TR XGEMM_THREAD_TR #define GEMM_THREAD_CR XGEMM_THREAD_CR #define GEMM_THREAD_RN XGEMM_THREAD_RN #define GEMM_THREAD_RT XGEMM_THREAD_RT #define GEMM_THREAD_RC XGEMM_THREAD_RC #define GEMM_THREAD_RR XGEMM_THREAD_RR #define SYMM_THREAD_LU XSYMM_THREAD_LU #define SYMM_THREAD_LL XSYMM_THREAD_LL #define SYMM_THREAD_RU XSYMM_THREAD_RU #define SYMM_THREAD_RL XSYMM_THREAD_RL #define HEMM_THREAD_LU XHEMM_THREAD_LU #define HEMM_THREAD_LL XHEMM_THREAD_LL #define HEMM_THREAD_RU XHEMM_THREAD_RU #define HEMM_THREAD_RL XHEMM_THREAD_RL #define SYRK_THREAD_UN XSYRK_THREAD_UN #define SYRK_THREAD_UT XSYRK_THREAD_UT #define SYRK_THREAD_LN XSYRK_THREAD_LN #define SYRK_THREAD_LT XSYRK_THREAD_LT #define SYRK_THREAD_UR XSYRK_THREAD_UR #define SYRK_THREAD_UC XSYRK_THREAD_UC #define SYRK_THREAD_LR XSYRK_THREAD_LR #define SYRK_THREAD_LC XSYRK_THREAD_LC #define HERK_THREAD_UN XHERK_THREAD_UN #define HERK_THREAD_UT XHERK_THREAD_UT #define HERK_THREAD_LN XHERK_THREAD_LN #define HERK_THREAD_LT XHERK_THREAD_LT #define HERK_THREAD_UR XHERK_THREAD_UR #define HERK_THREAD_UC XHERK_THREAD_UC #define HERK_THREAD_LR XHERK_THREAD_LR #define HERK_THREAD_LC XHERK_THREAD_LC #define GEMM3M_NN XGEMM3M_NN #define GEMM3M_CN XGEMM3M_CN #define GEMM3M_TN XGEMM3M_TN #define GEMM3M_NC XGEMM3M_NC #define GEMM3M_NT XGEMM3M_NT #define GEMM3M_CC XGEMM3M_CC #define GEMM3M_CT XGEMM3M_CT #define GEMM3M_TC XGEMM3M_TC #define GEMM3M_TT XGEMM3M_TT #define GEMM3M_NR XGEMM3M_NR #define GEMM3M_TR XGEMM3M_TR #define GEMM3M_CR XGEMM3M_CR #define GEMM3M_RN XGEMM3M_RN #define GEMM3M_RT XGEMM3M_RT #define GEMM3M_RC XGEMM3M_RC #define GEMM3M_RR XGEMM3M_RR #define GEMM3M_THREAD_NN XGEMM3M_THREAD_NN #define GEMM3M_THREAD_CN XGEMM3M_THREAD_CN #define GEMM3M_THREAD_TN XGEMM3M_THREAD_TN #define GEMM3M_THREAD_NC XGEMM3M_THREAD_NC #define GEMM3M_THREAD_NT XGEMM3M_THREAD_NT #define GEMM3M_THREAD_CC XGEMM3M_THREAD_CC #define GEMM3M_THREAD_CT XGEMM3M_THREAD_CT #define GEMM3M_THREAD_TC XGEMM3M_THREAD_TC #define GEMM3M_THREAD_TT XGEMM3M_THREAD_TT #define GEMM3M_THREAD_NR XGEMM3M_THREAD_NR #define GEMM3M_THREAD_TR XGEMM3M_THREAD_TR #define GEMM3M_THREAD_CR XGEMM3M_THREAD_CR #define GEMM3M_THREAD_RN XGEMM3M_THREAD_RN #define GEMM3M_THREAD_RT XGEMM3M_THREAD_RT #define GEMM3M_THREAD_RC XGEMM3M_THREAD_RC #define GEMM3M_THREAD_RR XGEMM3M_THREAD_RR #define SYMM3M_LU XSYMM3M_LU #define SYMM3M_LL XSYMM3M_LL #define SYMM3M_RU XSYMM3M_RU #define SYMM3M_RL XSYMM3M_RL #define SYMM3M_THREAD_LU XSYMM3M_THREAD_LU #define SYMM3M_THREAD_LL XSYMM3M_THREAD_LL #define SYMM3M_THREAD_RU 
XSYMM3M_THREAD_RU #define SYMM3M_THREAD_RL XSYMM3M_THREAD_RL #define HEMM3M_LU XHEMM3M_LU #define HEMM3M_LL XHEMM3M_LL #define HEMM3M_RU XHEMM3M_RU #define HEMM3M_RL XHEMM3M_RL #define HEMM3M_THREAD_LU XHEMM3M_THREAD_LU #define HEMM3M_THREAD_LL XHEMM3M_THREAD_LL #define HEMM3M_THREAD_RU XHEMM3M_THREAD_RU #define HEMM3M_THREAD_RL XHEMM3M_THREAD_RL #define SYMM_IUTCOPY XSYMM_IUTCOPY #define SYMM_ILTCOPY XSYMM_ILTCOPY #define SYMM_OUTCOPY XSYMM_OUTCOPY #define SYMM_OLTCOPY XSYMM_OLTCOPY #elif defined(DOUBLE) #define AMAX_K ZAMAX_K #define AMIN_K ZAMIN_K #define MAX_K ZMAX_K #define MIN_K ZMIN_K #define IAMAX_K IZAMAX_K #define IAMIN_K IZAMIN_K #define IMAX_K IZMAX_K #define IMIN_K IZMIN_K #define ASUM_K ZASUM_K #define AXPYU_K ZAXPYU_K #define AXPYC_K ZAXPYC_K #define COPY_K ZCOPY_K #define DOTU_K ZDOTU_K #define DOTC_K ZDOTC_K #define NRM2_K ZNRM2_K #define SCAL_K ZSCAL_K #define SWAP_K ZSWAP_K #define ROT_K ZROT_K #define GEMV_N ZGEMV_N #define GEMV_T ZGEMV_T #define GEMV_R ZGEMV_R #define GEMV_C ZGEMV_C #define GEMV_O ZGEMV_O #define GEMV_U ZGEMV_U #define GEMV_S ZGEMV_S #define GEMV_D ZGEMV_D #define GERU_K ZGERU_K #define GERC_K ZGERC_K #define GERV_K ZGERV_K #define GERD_K ZGERD_K #define SYMV_U ZSYMV_U #define SYMV_L ZSYMV_L #define HEMV_U ZHEMV_U #define HEMV_L ZHEMV_L #define HEMV_V ZHEMV_V #define HEMV_M ZHEMV_M #define SYMV_THREAD_U ZSYMV_THREAD_U #define SYMV_THREAD_L ZSYMV_THREAD_L #define HEMV_THREAD_U ZHEMV_THREAD_U #define HEMV_THREAD_L ZHEMV_THREAD_L #define HEMV_THREAD_V ZHEMV_THREAD_V #define HEMV_THREAD_M ZHEMV_THREAD_M #define GEMM_ONCOPY ZGEMM_ONCOPY #define GEMM_OTCOPY ZGEMM_OTCOPY #define GEMM_INCOPY ZGEMM_INCOPY #define GEMM_ITCOPY ZGEMM_ITCOPY #define GEMM3M_ONCOPYB ZGEMM3M_ONCOPYB #define GEMM3M_ONCOPYR ZGEMM3M_ONCOPYR #define GEMM3M_ONCOPYI ZGEMM3M_ONCOPYI #define GEMM3M_OTCOPYB ZGEMM3M_OTCOPYB #define GEMM3M_OTCOPYR ZGEMM3M_OTCOPYR #define GEMM3M_OTCOPYI ZGEMM3M_OTCOPYI #define GEMM3M_INCOPYB ZGEMM3M_INCOPYB #define GEMM3M_INCOPYR ZGEMM3M_INCOPYR #define GEMM3M_INCOPYI ZGEMM3M_INCOPYI #define GEMM3M_ITCOPYB ZGEMM3M_ITCOPYB #define GEMM3M_ITCOPYR ZGEMM3M_ITCOPYR #define GEMM3M_ITCOPYI ZGEMM3M_ITCOPYI #ifdef UNIT #define TRMM_OUNCOPY ZTRMM_OUNUCOPY #define TRMM_OUTCOPY ZTRMM_OUTUCOPY #define TRMM_OLNCOPY ZTRMM_OLNUCOPY #define TRMM_OLTCOPY ZTRMM_OLTUCOPY #define TRSM_OUNCOPY ZTRSM_OUNUCOPY #define TRSM_OUTCOPY ZTRSM_OUTUCOPY #define TRSM_OLNCOPY ZTRSM_OLNUCOPY #define TRSM_OLTCOPY ZTRSM_OLTUCOPY #define TRMM_IUNCOPY ZTRMM_IUNUCOPY #define TRMM_IUTCOPY ZTRMM_IUTUCOPY #define TRMM_ILNCOPY ZTRMM_ILNUCOPY #define TRMM_ILTCOPY ZTRMM_ILTUCOPY #define TRSM_IUNCOPY ZTRSM_IUNUCOPY #define TRSM_IUTCOPY ZTRSM_IUTUCOPY #define TRSM_ILNCOPY ZTRSM_ILNUCOPY #define TRSM_ILTCOPY ZTRSM_ILTUCOPY #else #define TRMM_OUNCOPY ZTRMM_OUNNCOPY #define TRMM_OUTCOPY ZTRMM_OUTNCOPY #define TRMM_OLNCOPY ZTRMM_OLNNCOPY #define TRMM_OLTCOPY ZTRMM_OLTNCOPY #define TRSM_OUNCOPY ZTRSM_OUNNCOPY #define TRSM_OUTCOPY ZTRSM_OUTNCOPY #define TRSM_OLNCOPY ZTRSM_OLNNCOPY #define TRSM_OLTCOPY ZTRSM_OLTNCOPY #define TRMM_IUNCOPY ZTRMM_IUNNCOPY #define TRMM_IUTCOPY ZTRMM_IUTNCOPY #define TRMM_ILNCOPY ZTRMM_ILNNCOPY #define TRMM_ILTCOPY ZTRMM_ILTNCOPY #define TRSM_IUNCOPY ZTRSM_IUNNCOPY #define TRSM_IUTCOPY ZTRSM_IUTNCOPY #define TRSM_ILNCOPY ZTRSM_ILNNCOPY #define TRSM_ILTCOPY ZTRSM_ILTNCOPY #endif #define SYMM3M_ILCOPYB ZSYMM3M_ILCOPYB #define SYMM3M_IUCOPYB ZSYMM3M_IUCOPYB #define SYMM3M_ILCOPYR ZSYMM3M_ILCOPYR #define SYMM3M_IUCOPYR ZSYMM3M_IUCOPYR #define SYMM3M_ILCOPYI ZSYMM3M_ILCOPYI #define 
SYMM3M_IUCOPYI ZSYMM3M_IUCOPYI #define SYMM3M_OLCOPYB ZSYMM3M_OLCOPYB #define SYMM3M_OUCOPYB ZSYMM3M_OUCOPYB #define SYMM3M_OLCOPYR ZSYMM3M_OLCOPYR #define SYMM3M_OUCOPYR ZSYMM3M_OUCOPYR #define SYMM3M_OLCOPYI ZSYMM3M_OLCOPYI #define SYMM3M_OUCOPYI ZSYMM3M_OUCOPYI #define HEMM3M_ILCOPYB ZHEMM3M_ILCOPYB #define HEMM3M_IUCOPYB ZHEMM3M_IUCOPYB #define HEMM3M_ILCOPYR ZHEMM3M_ILCOPYR #define HEMM3M_IUCOPYR ZHEMM3M_IUCOPYR #define HEMM3M_ILCOPYI ZHEMM3M_ILCOPYI #define HEMM3M_IUCOPYI ZHEMM3M_IUCOPYI #define HEMM3M_OLCOPYB ZHEMM3M_OLCOPYB #define HEMM3M_OUCOPYB ZHEMM3M_OUCOPYB #define HEMM3M_OLCOPYR ZHEMM3M_OLCOPYR #define HEMM3M_OUCOPYR ZHEMM3M_OUCOPYR #define HEMM3M_OLCOPYI ZHEMM3M_OLCOPYI #define HEMM3M_OUCOPYI ZHEMM3M_OUCOPYI #define GEMM_BETA ZGEMM_BETA #define GEMM_KERNEL_N ZGEMM_KERNEL_N #define GEMM_KERNEL_L ZGEMM_KERNEL_L #define GEMM_KERNEL_R ZGEMM_KERNEL_R #define GEMM_KERNEL_B ZGEMM_KERNEL_B #define GEMM3M_KERNEL ZGEMM3M_KERNEL #define TRMM_KERNEL_LN ZTRMM_KERNEL_LN #define TRMM_KERNEL_LT ZTRMM_KERNEL_LT #define TRMM_KERNEL_LR ZTRMM_KERNEL_LR #define TRMM_KERNEL_LC ZTRMM_KERNEL_LC #define TRMM_KERNEL_RN ZTRMM_KERNEL_RN #define TRMM_KERNEL_RT ZTRMM_KERNEL_RT #define TRMM_KERNEL_RR ZTRMM_KERNEL_RR #define TRMM_KERNEL_RC ZTRMM_KERNEL_RC #define TRSM_KERNEL_LN ZTRSM_KERNEL_LN #define TRSM_KERNEL_LT ZTRSM_KERNEL_LT #define TRSM_KERNEL_LR ZTRSM_KERNEL_LR #define TRSM_KERNEL_LC ZTRSM_KERNEL_LC #define TRSM_KERNEL_RN ZTRSM_KERNEL_RN #define TRSM_KERNEL_RT ZTRSM_KERNEL_RT #define TRSM_KERNEL_RR ZTRSM_KERNEL_RR #define TRSM_KERNEL_RC ZTRSM_KERNEL_RC #define GEMM_NN ZGEMM_NN #define GEMM_CN ZGEMM_CN #define GEMM_TN ZGEMM_TN #define GEMM_NC ZGEMM_NC #define GEMM_NT ZGEMM_NT #define GEMM_CC ZGEMM_CC #define GEMM_CT ZGEMM_CT #define GEMM_TC ZGEMM_TC #define GEMM_TT ZGEMM_TT #define GEMM_NR ZGEMM_NR #define GEMM_TR ZGEMM_TR #define GEMM_CR ZGEMM_CR #define GEMM_RN ZGEMM_RN #define GEMM_RT ZGEMM_RT #define GEMM_RC ZGEMM_RC #define GEMM_RR ZGEMM_RR #define SYMM_LU ZSYMM_LU #define SYMM_LL ZSYMM_LL #define SYMM_RU ZSYMM_RU #define SYMM_RL ZSYMM_RL #define HEMM_LU ZHEMM_LU #define HEMM_LL ZHEMM_LL #define HEMM_RU ZHEMM_RU #define HEMM_RL ZHEMM_RL #define HEMM_IUTCOPY ZHEMM_IUTCOPY #define HEMM_ILTCOPY ZHEMM_ILTCOPY #define HEMM_OUTCOPY ZHEMM_OUTCOPY #define HEMM_OLTCOPY ZHEMM_OLTCOPY #define SYRK_UN ZSYRK_UN #define SYRK_UT ZSYRK_UT #define SYRK_LN ZSYRK_LN #define SYRK_LT ZSYRK_LT #define SYRK_UR ZSYRK_UN #define SYRK_UC ZSYRK_UT #define SYRK_LR ZSYRK_LN #define SYRK_LC ZSYRK_LT #define SYRK_KERNEL_U ZSYRK_KERNEL_U #define SYRK_KERNEL_L ZSYRK_KERNEL_L #define HERK_UN ZHERK_UN #define HERK_LN ZHERK_LN #define HERK_UC ZHERK_UC #define HERK_LC ZHERK_LC #define HER2K_UN ZHER2K_UN #define HER2K_LN ZHER2K_LN #define HER2K_UC ZHER2K_UC #define HER2K_LC ZHER2K_LC #define SYR2K_UN ZSYR2K_UN #define SYR2K_UT ZSYR2K_UT #define SYR2K_LN ZSYR2K_LN #define SYR2K_LT ZSYR2K_LT #define SYR2K_UR ZSYR2K_UN #define SYR2K_UC ZSYR2K_UT #define SYR2K_LR ZSYR2K_LN #define SYR2K_LC ZSYR2K_LT #define SYR2K_KERNEL_U ZSYR2K_KERNEL_U #define SYR2K_KERNEL_L ZSYR2K_KERNEL_L #define TRMM_LNUU ZTRMM_LNUU #define TRMM_LNUN ZTRMM_LNUN #define TRMM_LNLU ZTRMM_LNLU #define TRMM_LNLN ZTRMM_LNLN #define TRMM_LTUU ZTRMM_LTUU #define TRMM_LTUN ZTRMM_LTUN #define TRMM_LTLU ZTRMM_LTLU #define TRMM_LTLN ZTRMM_LTLN #define TRMM_LRUU ZTRMM_LRUU #define TRMM_LRUN ZTRMM_LRUN #define TRMM_LRLU ZTRMM_LRLU #define TRMM_LRLN ZTRMM_LRLN #define TRMM_LCUU ZTRMM_LCUU #define TRMM_LCUN ZTRMM_LCUN #define TRMM_LCLU ZTRMM_LCLU #define TRMM_LCLN ZTRMM_LCLN 
#define TRMM_RNUU ZTRMM_RNUU #define TRMM_RNUN ZTRMM_RNUN #define TRMM_RNLU ZTRMM_RNLU #define TRMM_RNLN ZTRMM_RNLN #define TRMM_RTUU ZTRMM_RTUU #define TRMM_RTUN ZTRMM_RTUN #define TRMM_RTLU ZTRMM_RTLU #define TRMM_RTLN ZTRMM_RTLN #define TRMM_RRUU ZTRMM_RRUU #define TRMM_RRUN ZTRMM_RRUN #define TRMM_RRLU ZTRMM_RRLU #define TRMM_RRLN ZTRMM_RRLN #define TRMM_RCUU ZTRMM_RCUU #define TRMM_RCUN ZTRMM_RCUN #define TRMM_RCLU ZTRMM_RCLU #define TRMM_RCLN ZTRMM_RCLN #define TRSM_LNUU ZTRSM_LNUU #define TRSM_LNUN ZTRSM_LNUN #define TRSM_LNLU ZTRSM_LNLU #define TRSM_LNLN ZTRSM_LNLN #define TRSM_LTUU ZTRSM_LTUU #define TRSM_LTUN ZTRSM_LTUN #define TRSM_LTLU ZTRSM_LTLU #define TRSM_LTLN ZTRSM_LTLN #define TRSM_LRUU ZTRSM_LRUU #define TRSM_LRUN ZTRSM_LRUN #define TRSM_LRLU ZTRSM_LRLU #define TRSM_LRLN ZTRSM_LRLN #define TRSM_LCUU ZTRSM_LCUU #define TRSM_LCUN ZTRSM_LCUN #define TRSM_LCLU ZTRSM_LCLU #define TRSM_LCLN ZTRSM_LCLN #define TRSM_RNUU ZTRSM_RNUU #define TRSM_RNUN ZTRSM_RNUN #define TRSM_RNLU ZTRSM_RNLU #define TRSM_RNLN ZTRSM_RNLN #define TRSM_RTUU ZTRSM_RTUU #define TRSM_RTUN ZTRSM_RTUN #define TRSM_RTLU ZTRSM_RTLU #define TRSM_RTLN ZTRSM_RTLN #define TRSM_RRUU ZTRSM_RRUU #define TRSM_RRUN ZTRSM_RRUN #define TRSM_RRLU ZTRSM_RRLU #define TRSM_RRLN ZTRSM_RRLN #define TRSM_RCUU ZTRSM_RCUU #define TRSM_RCUN ZTRSM_RCUN #define TRSM_RCLU ZTRSM_RCLU #define TRSM_RCLN ZTRSM_RCLN #define GEMM_THREAD_NN ZGEMM_THREAD_NN #define GEMM_THREAD_CN ZGEMM_THREAD_CN #define GEMM_THREAD_TN ZGEMM_THREAD_TN #define GEMM_THREAD_NC ZGEMM_THREAD_NC #define GEMM_THREAD_NT ZGEMM_THREAD_NT #define GEMM_THREAD_CC ZGEMM_THREAD_CC #define GEMM_THREAD_CT ZGEMM_THREAD_CT #define GEMM_THREAD_TC ZGEMM_THREAD_TC #define GEMM_THREAD_TT ZGEMM_THREAD_TT #define GEMM_THREAD_NR ZGEMM_THREAD_NR #define GEMM_THREAD_TR ZGEMM_THREAD_TR #define GEMM_THREAD_CR ZGEMM_THREAD_CR #define GEMM_THREAD_RN ZGEMM_THREAD_RN #define GEMM_THREAD_RT ZGEMM_THREAD_RT #define GEMM_THREAD_RC ZGEMM_THREAD_RC #define GEMM_THREAD_RR ZGEMM_THREAD_RR #define SYMM_THREAD_LU ZSYMM_THREAD_LU #define SYMM_THREAD_LL ZSYMM_THREAD_LL #define SYMM_THREAD_RU ZSYMM_THREAD_RU #define SYMM_THREAD_RL ZSYMM_THREAD_RL #define HEMM_THREAD_LU ZHEMM_THREAD_LU #define HEMM_THREAD_LL ZHEMM_THREAD_LL #define HEMM_THREAD_RU ZHEMM_THREAD_RU #define HEMM_THREAD_RL ZHEMM_THREAD_RL #define SYRK_THREAD_UN ZSYRK_THREAD_UN #define SYRK_THREAD_UT ZSYRK_THREAD_UT #define SYRK_THREAD_LN ZSYRK_THREAD_LN #define SYRK_THREAD_LT ZSYRK_THREAD_LT #define SYRK_THREAD_UR ZSYRK_THREAD_UR #define SYRK_THREAD_UC ZSYRK_THREAD_UC #define SYRK_THREAD_LR ZSYRK_THREAD_LR #define SYRK_THREAD_LC ZSYRK_THREAD_LC #define HERK_THREAD_UN ZHERK_THREAD_UN #define HERK_THREAD_UT ZHERK_THREAD_UT #define HERK_THREAD_LN ZHERK_THREAD_LN #define HERK_THREAD_LT ZHERK_THREAD_LT #define HERK_THREAD_UR ZHERK_THREAD_UR #define HERK_THREAD_UC ZHERK_THREAD_UC #define HERK_THREAD_LR ZHERK_THREAD_LR #define HERK_THREAD_LC ZHERK_THREAD_LC #define GEMM3M_NN ZGEMM3M_NN #define GEMM3M_CN ZGEMM3M_CN #define GEMM3M_TN ZGEMM3M_TN #define GEMM3M_NC ZGEMM3M_NC #define GEMM3M_NT ZGEMM3M_NT #define GEMM3M_CC ZGEMM3M_CC #define GEMM3M_CT ZGEMM3M_CT #define GEMM3M_TC ZGEMM3M_TC #define GEMM3M_TT ZGEMM3M_TT #define GEMM3M_NR ZGEMM3M_NR #define GEMM3M_TR ZGEMM3M_TR #define GEMM3M_CR ZGEMM3M_CR #define GEMM3M_RN ZGEMM3M_RN #define GEMM3M_RT ZGEMM3M_RT #define GEMM3M_RC ZGEMM3M_RC #define GEMM3M_RR ZGEMM3M_RR #define GEMM3M_THREAD_NN ZGEMM3M_THREAD_NN #define GEMM3M_THREAD_CN ZGEMM3M_THREAD_CN #define GEMM3M_THREAD_TN ZGEMM3M_THREAD_TN #define 
GEMM3M_THREAD_NC ZGEMM3M_THREAD_NC #define GEMM3M_THREAD_NT ZGEMM3M_THREAD_NT #define GEMM3M_THREAD_CC ZGEMM3M_THREAD_CC #define GEMM3M_THREAD_CT ZGEMM3M_THREAD_CT #define GEMM3M_THREAD_TC ZGEMM3M_THREAD_TC #define GEMM3M_THREAD_TT ZGEMM3M_THREAD_TT #define GEMM3M_THREAD_NR ZGEMM3M_THREAD_NR #define GEMM3M_THREAD_TR ZGEMM3M_THREAD_TR #define GEMM3M_THREAD_CR ZGEMM3M_THREAD_CR #define GEMM3M_THREAD_RN ZGEMM3M_THREAD_RN #define GEMM3M_THREAD_RT ZGEMM3M_THREAD_RT #define GEMM3M_THREAD_RC ZGEMM3M_THREAD_RC #define GEMM3M_THREAD_RR ZGEMM3M_THREAD_RR #define SYMM3M_LU ZSYMM3M_LU #define SYMM3M_LL ZSYMM3M_LL #define SYMM3M_RU ZSYMM3M_RU #define SYMM3M_RL ZSYMM3M_RL #define SYMM3M_THREAD_LU ZSYMM3M_THREAD_LU #define SYMM3M_THREAD_LL ZSYMM3M_THREAD_LL #define SYMM3M_THREAD_RU ZSYMM3M_THREAD_RU #define SYMM3M_THREAD_RL ZSYMM3M_THREAD_RL #define HEMM3M_LU ZHEMM3M_LU #define HEMM3M_LL ZHEMM3M_LL #define HEMM3M_RU ZHEMM3M_RU #define HEMM3M_RL ZHEMM3M_RL #define HEMM3M_THREAD_LU ZHEMM3M_THREAD_LU #define HEMM3M_THREAD_LL ZHEMM3M_THREAD_LL #define HEMM3M_THREAD_RU ZHEMM3M_THREAD_RU #define HEMM3M_THREAD_RL ZHEMM3M_THREAD_RL #define SYMM_IUTCOPY ZSYMM_IUTCOPY #define SYMM_ILTCOPY ZSYMM_ILTCOPY #define SYMM_OUTCOPY ZSYMM_OUTCOPY #define SYMM_OLTCOPY ZSYMM_OLTCOPY #define AXPBY_K ZAXPBY_K #define OMATCOPY_K_CN ZOMATCOPY_K_CN #define OMATCOPY_K_RN ZOMATCOPY_K_RN #define OMATCOPY_K_CT ZOMATCOPY_K_CT #define OMATCOPY_K_RT ZOMATCOPY_K_RT #define OMATCOPY_K_CNC ZOMATCOPY_K_CNC #define OMATCOPY_K_RNC ZOMATCOPY_K_RNC #define OMATCOPY_K_CTC ZOMATCOPY_K_CTC #define OMATCOPY_K_RTC ZOMATCOPY_K_RTC #define IMATCOPY_K_CN ZIMATCOPY_K_CN #define IMATCOPY_K_RN ZIMATCOPY_K_RN #define IMATCOPY_K_CT ZIMATCOPY_K_CT #define IMATCOPY_K_RT ZIMATCOPY_K_RT #define IMATCOPY_K_CNC ZIMATCOPY_K_CNC #define IMATCOPY_K_RNC ZIMATCOPY_K_RNC #define IMATCOPY_K_CTC ZIMATCOPY_K_CTC #define IMATCOPY_K_RTC ZIMATCOPY_K_RTC #define GEADD_K ZGEADD_K #else #define AMAX_K CAMAX_K #define AMIN_K CAMIN_K #define MAX_K CMAX_K #define MIN_K CMIN_K #define IAMAX_K ICAMAX_K #define IAMIN_K ICAMIN_K #define IMAX_K ICMAX_K #define IMIN_K ICMIN_K #define ASUM_K CASUM_K #define AXPYU_K CAXPYU_K #define AXPYC_K CAXPYC_K #define COPY_K CCOPY_K #define DOTU_K CDOTU_K #define DOTC_K CDOTC_K #define NRM2_K CNRM2_K #define SCAL_K CSCAL_K #define SWAP_K CSWAP_K #define ROT_K CROT_K #define GEMV_N CGEMV_N #define GEMV_T CGEMV_T #define GEMV_R CGEMV_R #define GEMV_C CGEMV_C #define GEMV_O CGEMV_O #define GEMV_U CGEMV_U #define GEMV_S CGEMV_S #define GEMV_D CGEMV_D #define GERU_K CGERU_K #define GERC_K CGERC_K #define GERV_K CGERV_K #define GERD_K CGERD_K #define SYMV_U CSYMV_U #define SYMV_L CSYMV_L #define HEMV_U CHEMV_U #define HEMV_L CHEMV_L #define HEMV_V CHEMV_V #define HEMV_M CHEMV_M #define SYMV_THREAD_U CSYMV_THREAD_U #define SYMV_THREAD_L CSYMV_THREAD_L #define HEMV_THREAD_U CHEMV_THREAD_U #define HEMV_THREAD_L CHEMV_THREAD_L #define HEMV_THREAD_V CHEMV_THREAD_V #define HEMV_THREAD_M CHEMV_THREAD_M #define GEMM_ONCOPY CGEMM_ONCOPY #define GEMM_OTCOPY CGEMM_OTCOPY #define GEMM_INCOPY CGEMM_INCOPY #define GEMM_ITCOPY CGEMM_ITCOPY #define GEMM3M_ONCOPYB CGEMM3M_ONCOPYB #define GEMM3M_ONCOPYR CGEMM3M_ONCOPYR #define GEMM3M_ONCOPYI CGEMM3M_ONCOPYI #define GEMM3M_OTCOPYB CGEMM3M_OTCOPYB #define GEMM3M_OTCOPYR CGEMM3M_OTCOPYR #define GEMM3M_OTCOPYI CGEMM3M_OTCOPYI #define GEMM3M_INCOPYB CGEMM3M_INCOPYB #define GEMM3M_INCOPYR CGEMM3M_INCOPYR #define GEMM3M_INCOPYI CGEMM3M_INCOPYI #define GEMM3M_ITCOPYB CGEMM3M_ITCOPYB #define GEMM3M_ITCOPYR CGEMM3M_ITCOPYR #define 
GEMM3M_ITCOPYI CGEMM3M_ITCOPYI #ifdef UNIT #define TRMM_OUNCOPY CTRMM_OUNUCOPY #define TRMM_OUTCOPY CTRMM_OUTUCOPY #define TRMM_OLNCOPY CTRMM_OLNUCOPY #define TRMM_OLTCOPY CTRMM_OLTUCOPY #define TRSM_OUNCOPY CTRSM_OUNUCOPY #define TRSM_OUTCOPY CTRSM_OUTUCOPY #define TRSM_OLNCOPY CTRSM_OLNUCOPY #define TRSM_OLTCOPY CTRSM_OLTUCOPY #define TRMM_IUNCOPY CTRMM_IUNUCOPY #define TRMM_IUTCOPY CTRMM_IUTUCOPY #define TRMM_ILNCOPY CTRMM_ILNUCOPY #define TRMM_ILTCOPY CTRMM_ILTUCOPY #define TRSM_IUNCOPY CTRSM_IUNUCOPY #define TRSM_IUTCOPY CTRSM_IUTUCOPY #define TRSM_ILNCOPY CTRSM_ILNUCOPY #define TRSM_ILTCOPY CTRSM_ILTUCOPY #else #define TRMM_OUNCOPY CTRMM_OUNNCOPY #define TRMM_OUTCOPY CTRMM_OUTNCOPY #define TRMM_OLNCOPY CTRMM_OLNNCOPY #define TRMM_OLTCOPY CTRMM_OLTNCOPY #define TRSM_OUNCOPY CTRSM_OUNNCOPY #define TRSM_OUTCOPY CTRSM_OUTNCOPY #define TRSM_OLNCOPY CTRSM_OLNNCOPY #define TRSM_OLTCOPY CTRSM_OLTNCOPY #define TRMM_IUNCOPY CTRMM_IUNNCOPY #define TRMM_IUTCOPY CTRMM_IUTNCOPY #define TRMM_ILNCOPY CTRMM_ILNNCOPY #define TRMM_ILTCOPY CTRMM_ILTNCOPY #define TRSM_IUNCOPY CTRSM_IUNNCOPY #define TRSM_IUTCOPY CTRSM_IUTNCOPY #define TRSM_ILNCOPY CTRSM_ILNNCOPY #define TRSM_ILTCOPY CTRSM_ILTNCOPY #endif #define SYMM3M_ILCOPYB CSYMM3M_ILCOPYB #define SYMM3M_IUCOPYB CSYMM3M_IUCOPYB #define SYMM3M_ILCOPYR CSYMM3M_ILCOPYR #define SYMM3M_IUCOPYR CSYMM3M_IUCOPYR #define SYMM3M_ILCOPYI CSYMM3M_ILCOPYI #define SYMM3M_IUCOPYI CSYMM3M_IUCOPYI #define SYMM3M_OLCOPYB CSYMM3M_OLCOPYB #define SYMM3M_OUCOPYB CSYMM3M_OUCOPYB #define SYMM3M_OLCOPYR CSYMM3M_OLCOPYR #define SYMM3M_OUCOPYR CSYMM3M_OUCOPYR #define SYMM3M_OLCOPYI CSYMM3M_OLCOPYI #define SYMM3M_OUCOPYI CSYMM3M_OUCOPYI #define HEMM3M_ILCOPYB CHEMM3M_ILCOPYB #define HEMM3M_IUCOPYB CHEMM3M_IUCOPYB #define HEMM3M_ILCOPYR CHEMM3M_ILCOPYR #define HEMM3M_IUCOPYR CHEMM3M_IUCOPYR #define HEMM3M_ILCOPYI CHEMM3M_ILCOPYI #define HEMM3M_IUCOPYI CHEMM3M_IUCOPYI #define HEMM3M_OLCOPYB CHEMM3M_OLCOPYB #define HEMM3M_OUCOPYB CHEMM3M_OUCOPYB #define HEMM3M_OLCOPYR CHEMM3M_OLCOPYR #define HEMM3M_OUCOPYR CHEMM3M_OUCOPYR #define HEMM3M_OLCOPYI CHEMM3M_OLCOPYI #define HEMM3M_OUCOPYI CHEMM3M_OUCOPYI #define GEMM_BETA CGEMM_BETA #define GEMM_KERNEL_N CGEMM_KERNEL_N #define GEMM_KERNEL_L CGEMM_KERNEL_L #define GEMM_KERNEL_R CGEMM_KERNEL_R #define GEMM_KERNEL_B CGEMM_KERNEL_B #define GEMM3M_KERNEL CGEMM3M_KERNEL #define TRMM_KERNEL_LN CTRMM_KERNEL_LN #define TRMM_KERNEL_LT CTRMM_KERNEL_LT #define TRMM_KERNEL_LR CTRMM_KERNEL_LR #define TRMM_KERNEL_LC CTRMM_KERNEL_LC #define TRMM_KERNEL_RN CTRMM_KERNEL_RN #define TRMM_KERNEL_RT CTRMM_KERNEL_RT #define TRMM_KERNEL_RR CTRMM_KERNEL_RR #define TRMM_KERNEL_RC CTRMM_KERNEL_RC #define TRSM_KERNEL_LN CTRSM_KERNEL_LN #define TRSM_KERNEL_LT CTRSM_KERNEL_LT #define TRSM_KERNEL_LR CTRSM_KERNEL_LR #define TRSM_KERNEL_LC CTRSM_KERNEL_LC #define TRSM_KERNEL_RN CTRSM_KERNEL_RN #define TRSM_KERNEL_RT CTRSM_KERNEL_RT #define TRSM_KERNEL_RR CTRSM_KERNEL_RR #define TRSM_KERNEL_RC CTRSM_KERNEL_RC #define GEMM_NN CGEMM_NN #define GEMM_CN CGEMM_CN #define GEMM_TN CGEMM_TN #define GEMM_NC CGEMM_NC #define GEMM_NT CGEMM_NT #define GEMM_CC CGEMM_CC #define GEMM_CT CGEMM_CT #define GEMM_TC CGEMM_TC #define GEMM_TT CGEMM_TT #define GEMM_NR CGEMM_NR #define GEMM_TR CGEMM_TR #define GEMM_CR CGEMM_CR #define GEMM_RN CGEMM_RN #define GEMM_RT CGEMM_RT #define GEMM_RC CGEMM_RC #define GEMM_RR CGEMM_RR #define SYMM_LU CSYMM_LU #define SYMM_LL CSYMM_LL #define SYMM_RU CSYMM_RU #define SYMM_RL CSYMM_RL #define HEMM_LU CHEMM_LU #define HEMM_LL CHEMM_LL #define HEMM_RU 
CHEMM_RU #define HEMM_RL CHEMM_RL #define HEMM_IUTCOPY CHEMM_IUTCOPY #define HEMM_ILTCOPY CHEMM_ILTCOPY #define HEMM_OUTCOPY CHEMM_OUTCOPY #define HEMM_OLTCOPY CHEMM_OLTCOPY #define SYRK_UN CSYRK_UN #define SYRK_UT CSYRK_UT #define SYRK_LN CSYRK_LN #define SYRK_LT CSYRK_LT #define SYRK_UR CSYRK_UN #define SYRK_UC CSYRK_UT #define SYRK_LR CSYRK_LN #define SYRK_LC CSYRK_LT #define SYRK_KERNEL_U CSYRK_KERNEL_U #define SYRK_KERNEL_L CSYRK_KERNEL_L #define HERK_UN CHERK_UN #define HERK_LN CHERK_LN #define HERK_UC CHERK_UC #define HERK_LC CHERK_LC #define HER2K_UN CHER2K_UN #define HER2K_LN CHER2K_LN #define HER2K_UC CHER2K_UC #define HER2K_LC CHER2K_LC #define SYR2K_UN CSYR2K_UN #define SYR2K_UT CSYR2K_UT #define SYR2K_LN CSYR2K_LN #define SYR2K_LT CSYR2K_LT #define SYR2K_UR CSYR2K_UN #define SYR2K_UC CSYR2K_UT #define SYR2K_LR CSYR2K_LN #define SYR2K_LC CSYR2K_LT #define SYR2K_KERNEL_U CSYR2K_KERNEL_U #define SYR2K_KERNEL_L CSYR2K_KERNEL_L #define TRMM_LNUU CTRMM_LNUU #define TRMM_LNUN CTRMM_LNUN #define TRMM_LNLU CTRMM_LNLU #define TRMM_LNLN CTRMM_LNLN #define TRMM_LTUU CTRMM_LTUU #define TRMM_LTUN CTRMM_LTUN #define TRMM_LTLU CTRMM_LTLU #define TRMM_LTLN CTRMM_LTLN #define TRMM_LRUU CTRMM_LRUU #define TRMM_LRUN CTRMM_LRUN #define TRMM_LRLU CTRMM_LRLU #define TRMM_LRLN CTRMM_LRLN #define TRMM_LCUU CTRMM_LCUU #define TRMM_LCUN CTRMM_LCUN #define TRMM_LCLU CTRMM_LCLU #define TRMM_LCLN CTRMM_LCLN #define TRMM_RNUU CTRMM_RNUU #define TRMM_RNUN CTRMM_RNUN #define TRMM_RNLU CTRMM_RNLU #define TRMM_RNLN CTRMM_RNLN #define TRMM_RTUU CTRMM_RTUU #define TRMM_RTUN CTRMM_RTUN #define TRMM_RTLU CTRMM_RTLU #define TRMM_RTLN CTRMM_RTLN #define TRMM_RRUU CTRMM_RRUU #define TRMM_RRUN CTRMM_RRUN #define TRMM_RRLU CTRMM_RRLU #define TRMM_RRLN CTRMM_RRLN #define TRMM_RCUU CTRMM_RCUU #define TRMM_RCUN CTRMM_RCUN #define TRMM_RCLU CTRMM_RCLU #define TRMM_RCLN CTRMM_RCLN #define TRSM_LNUU CTRSM_LNUU #define TRSM_LNUN CTRSM_LNUN #define TRSM_LNLU CTRSM_LNLU #define TRSM_LNLN CTRSM_LNLN #define TRSM_LTUU CTRSM_LTUU #define TRSM_LTUN CTRSM_LTUN #define TRSM_LTLU CTRSM_LTLU #define TRSM_LTLN CTRSM_LTLN #define TRSM_LRUU CTRSM_LRUU #define TRSM_LRUN CTRSM_LRUN #define TRSM_LRLU CTRSM_LRLU #define TRSM_LRLN CTRSM_LRLN #define TRSM_LCUU CTRSM_LCUU #define TRSM_LCUN CTRSM_LCUN #define TRSM_LCLU CTRSM_LCLU #define TRSM_LCLN CTRSM_LCLN #define TRSM_RNUU CTRSM_RNUU #define TRSM_RNUN CTRSM_RNUN #define TRSM_RNLU CTRSM_RNLU #define TRSM_RNLN CTRSM_RNLN #define TRSM_RTUU CTRSM_RTUU #define TRSM_RTUN CTRSM_RTUN #define TRSM_RTLU CTRSM_RTLU #define TRSM_RTLN CTRSM_RTLN #define TRSM_RRUU CTRSM_RRUU #define TRSM_RRUN CTRSM_RRUN #define TRSM_RRLU CTRSM_RRLU #define TRSM_RRLN CTRSM_RRLN #define TRSM_RCUU CTRSM_RCUU #define TRSM_RCUN CTRSM_RCUN #define TRSM_RCLU CTRSM_RCLU #define TRSM_RCLN CTRSM_RCLN #define GEMM_THREAD_NN CGEMM_THREAD_NN #define GEMM_THREAD_CN CGEMM_THREAD_CN #define GEMM_THREAD_TN CGEMM_THREAD_TN #define GEMM_THREAD_NC CGEMM_THREAD_NC #define GEMM_THREAD_NT CGEMM_THREAD_NT #define GEMM_THREAD_CC CGEMM_THREAD_CC #define GEMM_THREAD_CT CGEMM_THREAD_CT #define GEMM_THREAD_TC CGEMM_THREAD_TC #define GEMM_THREAD_TT CGEMM_THREAD_TT #define GEMM_THREAD_NR CGEMM_THREAD_NR #define GEMM_THREAD_TR CGEMM_THREAD_TR #define GEMM_THREAD_CR CGEMM_THREAD_CR #define GEMM_THREAD_RN CGEMM_THREAD_RN #define GEMM_THREAD_RT CGEMM_THREAD_RT #define GEMM_THREAD_RC CGEMM_THREAD_RC #define GEMM_THREAD_RR CGEMM_THREAD_RR #define SYMM_THREAD_LU CSYMM_THREAD_LU #define SYMM_THREAD_LL CSYMM_THREAD_LL #define SYMM_THREAD_RU CSYMM_THREAD_RU #define 
SYMM_THREAD_RL CSYMM_THREAD_RL #define HEMM_THREAD_LU CHEMM_THREAD_LU #define HEMM_THREAD_LL CHEMM_THREAD_LL #define HEMM_THREAD_RU CHEMM_THREAD_RU #define HEMM_THREAD_RL CHEMM_THREAD_RL #define SYRK_THREAD_UN CSYRK_THREAD_UN #define SYRK_THREAD_UT CSYRK_THREAD_UT #define SYRK_THREAD_LN CSYRK_THREAD_LN #define SYRK_THREAD_LT CSYRK_THREAD_LT #define SYRK_THREAD_UR CSYRK_THREAD_UR #define SYRK_THREAD_UC CSYRK_THREAD_UC #define SYRK_THREAD_LR CSYRK_THREAD_LR #define SYRK_THREAD_LC CSYRK_THREAD_LC #define HERK_THREAD_UN CHERK_THREAD_UN #define HERK_THREAD_UT CHERK_THREAD_UT #define HERK_THREAD_LN CHERK_THREAD_LN #define HERK_THREAD_LT CHERK_THREAD_LT #define HERK_THREAD_UR CHERK_THREAD_UR #define HERK_THREAD_UC CHERK_THREAD_UC #define HERK_THREAD_LR CHERK_THREAD_LR #define HERK_THREAD_LC CHERK_THREAD_LC #define GEMM3M_NN CGEMM3M_NN #define GEMM3M_CN CGEMM3M_CN #define GEMM3M_TN CGEMM3M_TN #define GEMM3M_NC CGEMM3M_NC #define GEMM3M_NT CGEMM3M_NT #define GEMM3M_CC CGEMM3M_CC #define GEMM3M_CT CGEMM3M_CT #define GEMM3M_TC CGEMM3M_TC #define GEMM3M_TT CGEMM3M_TT #define GEMM3M_NR CGEMM3M_NR #define GEMM3M_TR CGEMM3M_TR #define GEMM3M_CR CGEMM3M_CR #define GEMM3M_RN CGEMM3M_RN #define GEMM3M_RT CGEMM3M_RT #define GEMM3M_RC CGEMM3M_RC #define GEMM3M_RR CGEMM3M_RR #define GEMM3M_THREAD_NN CGEMM3M_THREAD_NN #define GEMM3M_THREAD_CN CGEMM3M_THREAD_CN #define GEMM3M_THREAD_TN CGEMM3M_THREAD_TN #define GEMM3M_THREAD_NC CGEMM3M_THREAD_NC #define GEMM3M_THREAD_NT CGEMM3M_THREAD_NT #define GEMM3M_THREAD_CC CGEMM3M_THREAD_CC #define GEMM3M_THREAD_CT CGEMM3M_THREAD_CT #define GEMM3M_THREAD_TC CGEMM3M_THREAD_TC #define GEMM3M_THREAD_TT CGEMM3M_THREAD_TT #define GEMM3M_THREAD_NR CGEMM3M_THREAD_NR #define GEMM3M_THREAD_TR CGEMM3M_THREAD_TR #define GEMM3M_THREAD_CR CGEMM3M_THREAD_CR #define GEMM3M_THREAD_RN CGEMM3M_THREAD_RN #define GEMM3M_THREAD_RT CGEMM3M_THREAD_RT #define GEMM3M_THREAD_RC CGEMM3M_THREAD_RC #define GEMM3M_THREAD_RR CGEMM3M_THREAD_RR #define SYMM3M_LU CSYMM3M_LU #define SYMM3M_LL CSYMM3M_LL #define SYMM3M_RU CSYMM3M_RU #define SYMM3M_RL CSYMM3M_RL #define SYMM3M_THREAD_LU CSYMM3M_THREAD_LU #define SYMM3M_THREAD_LL CSYMM3M_THREAD_LL #define SYMM3M_THREAD_RU CSYMM3M_THREAD_RU #define SYMM3M_THREAD_RL CSYMM3M_THREAD_RL #define HEMM3M_LU CHEMM3M_LU #define HEMM3M_LL CHEMM3M_LL #define HEMM3M_RU CHEMM3M_RU #define HEMM3M_RL CHEMM3M_RL #define HEMM3M_THREAD_LU CHEMM3M_THREAD_LU #define HEMM3M_THREAD_LL CHEMM3M_THREAD_LL #define HEMM3M_THREAD_RU CHEMM3M_THREAD_RU #define HEMM3M_THREAD_RL CHEMM3M_THREAD_RL #define SYMM_IUTCOPY CSYMM_IUTCOPY #define SYMM_ILTCOPY CSYMM_ILTCOPY #define SYMM_OUTCOPY CSYMM_OUTCOPY #define SYMM_OLTCOPY CSYMM_OLTCOPY #define AXPBY_K CAXPBY_K #define OMATCOPY_K_CN COMATCOPY_K_CN #define OMATCOPY_K_RN COMATCOPY_K_RN #define OMATCOPY_K_CT COMATCOPY_K_CT #define OMATCOPY_K_RT COMATCOPY_K_RT #define OMATCOPY_K_CNC COMATCOPY_K_CNC #define OMATCOPY_K_RNC COMATCOPY_K_RNC #define OMATCOPY_K_CTC COMATCOPY_K_CTC #define OMATCOPY_K_RTC COMATCOPY_K_RTC #define IMATCOPY_K_CN CIMATCOPY_K_CN #define IMATCOPY_K_RN CIMATCOPY_K_RN #define IMATCOPY_K_CT CIMATCOPY_K_CT #define IMATCOPY_K_RT CIMATCOPY_K_RT #define IMATCOPY_K_CNC CIMATCOPY_K_CNC #define IMATCOPY_K_RNC CIMATCOPY_K_RNC #define IMATCOPY_K_CTC CIMATCOPY_K_CTC #define IMATCOPY_K_RTC CIMATCOPY_K_RTC #define GEADD_K CGEADD_K #endif #endif #ifndef ASSEMBLER #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_b; extern 
BLASLONG sgemm_p; extern BLASLONG sgemm_q; extern BLASLONG sgemm_r; extern BLASLONG dgemm_p; extern BLASLONG dgemm_q; extern BLASLONG dgemm_r; extern BLASLONG qgemm_p; extern BLASLONG qgemm_q; extern BLASLONG qgemm_r; extern BLASLONG cgemm_p; extern BLASLONG cgemm_q; extern BLASLONG cgemm_r; extern BLASLONG zgemm_p; extern BLASLONG zgemm_q; extern BLASLONG zgemm_r; extern BLASLONG xgemm_p; extern BLASLONG xgemm_q; extern BLASLONG xgemm_r; #endif typedef struct { void *a, *b, *c, *d, *alpha, *beta; BLASLONG m, n, k, lda, ldb, ldc, ldd; #ifdef SMP void *common; BLASLONG nthreads; #endif #ifdef PARAMTEST BLASLONG gemm_p, gemm_q, gemm_r; #endif #ifdef PREFETCHTEST BLASLONG prea, preb, prec, pred; #endif } blas_arg_t; #endif #ifdef XDOUBLE #define TRSV_NUU qtrsv_NUU #define TRSV_NUN qtrsv_NUN #define TRSV_NLU qtrsv_NLU #define TRSV_NLN qtrsv_NLN #define TRSV_TUU qtrsv_TUU #define TRSV_TUN qtrsv_TUN #define TRSV_TLU qtrsv_TLU #define TRSV_TLN qtrsv_TLN #define ZTRSV_NUU xtrsv_NUU #define ZTRSV_NUN xtrsv_NUN #define ZTRSV_NLU xtrsv_NLU #define ZTRSV_NLN xtrsv_NLN #define ZTRSV_TUU xtrsv_TUU #define ZTRSV_TUN xtrsv_TUN #define ZTRSV_TLU xtrsv_TLU #define ZTRSV_TLN xtrsv_TLN #define ZTRSV_RUU xtrsv_RUU #define ZTRSV_RUN xtrsv_RUN #define ZTRSV_RLU xtrsv_RLU #define ZTRSV_RLN xtrsv_RLN #define ZTRSV_CUU xtrsv_CUU #define ZTRSV_CUN xtrsv_CUN #define ZTRSV_CLU xtrsv_CLU #define ZTRSV_CLN xtrsv_CLN #define TRMV_NUU qtrmv_NUU #define TRMV_NUN qtrmv_NUN #define TRMV_NLU qtrmv_NLU #define TRMV_NLN qtrmv_NLN #define TRMV_TUU qtrmv_TUU #define TRMV_TUN qtrmv_TUN #define TRMV_TLU qtrmv_TLU #define TRMV_TLN qtrmv_TLN #define TRMV_THREAD_NUU qtrmv_thread_NUU #define TRMV_THREAD_NUN qtrmv_thread_NUN #define TRMV_THREAD_NLU qtrmv_thread_NLU #define TRMV_THREAD_NLN qtrmv_thread_NLN #define TRMV_THREAD_TUU qtrmv_thread_TUU #define TRMV_THREAD_TUN qtrmv_thread_TUN #define TRMV_THREAD_TLU qtrmv_thread_TLU #define TRMV_THREAD_TLN qtrmv_thread_TLN #define ZTRMV_NUU xtrmv_NUU #define ZTRMV_NUN xtrmv_NUN #define ZTRMV_NLU xtrmv_NLU #define ZTRMV_NLN xtrmv_NLN #define ZTRMV_TUU xtrmv_TUU #define ZTRMV_TUN xtrmv_TUN #define ZTRMV_TLU xtrmv_TLU #define ZTRMV_TLN xtrmv_TLN #define ZTRMV_RUU xtrmv_RUU #define ZTRMV_RUN xtrmv_RUN #define ZTRMV_RLU xtrmv_RLU #define ZTRMV_RLN xtrmv_RLN #define ZTRMV_CUU xtrmv_CUU #define ZTRMV_CUN xtrmv_CUN #define ZTRMV_CLU xtrmv_CLU #define ZTRMV_CLN xtrmv_CLN #define ZTRMV_THREAD_NUU xtrmv_thread_NUU #define ZTRMV_THREAD_NUN xtrmv_thread_NUN #define ZTRMV_THREAD_NLU xtrmv_thread_NLU #define ZTRMV_THREAD_NLN xtrmv_thread_NLN #define ZTRMV_THREAD_TUU xtrmv_thread_TUU #define ZTRMV_THREAD_TUN xtrmv_thread_TUN #define ZTRMV_THREAD_TLU xtrmv_thread_TLU #define ZTRMV_THREAD_TLN xtrmv_thread_TLN #define ZTRMV_THREAD_RUU xtrmv_thread_RUU #define ZTRMV_THREAD_RUN xtrmv_thread_RUN #define ZTRMV_THREAD_RLU xtrmv_thread_RLU #define ZTRMV_THREAD_RLN xtrmv_thread_RLN #define ZTRMV_THREAD_CUU xtrmv_thread_CUU #define ZTRMV_THREAD_CUN xtrmv_thread_CUN #define ZTRMV_THREAD_CLU xtrmv_thread_CLU #define ZTRMV_THREAD_CLN xtrmv_thread_CLN #elif defined(DOUBLE) #define TRSV_NUU dtrsv_NUU #define TRSV_NUN dtrsv_NUN #define TRSV_NLU dtrsv_NLU #define TRSV_NLN dtrsv_NLN #define TRSV_TUU dtrsv_TUU #define TRSV_TUN dtrsv_TUN #define TRSV_TLU dtrsv_TLU #define TRSV_TLN dtrsv_TLN #define ZTRSV_NUU ztrsv_NUU #define ZTRSV_NUN ztrsv_NUN #define ZTRSV_NLU ztrsv_NLU #define ZTRSV_NLN ztrsv_NLN #define ZTRSV_TUU ztrsv_TUU #define ZTRSV_TUN ztrsv_TUN #define ZTRSV_TLU ztrsv_TLU #define ZTRSV_TLN ztrsv_TLN #define ZTRSV_RUU 
ztrsv_RUU #define ZTRSV_RUN ztrsv_RUN #define ZTRSV_RLU ztrsv_RLU #define ZTRSV_RLN ztrsv_RLN #define ZTRSV_CUU ztrsv_CUU #define ZTRSV_CUN ztrsv_CUN #define ZTRSV_CLU ztrsv_CLU #define ZTRSV_CLN ztrsv_CLN #define TRMV_NUU dtrmv_NUU #define TRMV_NUN dtrmv_NUN #define TRMV_NLU dtrmv_NLU #define TRMV_NLN dtrmv_NLN #define TRMV_TUU dtrmv_TUU #define TRMV_TUN dtrmv_TUN #define TRMV_TLU dtrmv_TLU #define TRMV_TLN dtrmv_TLN #define TRMV_THREAD_NUU dtrmv_thread_NUU #define TRMV_THREAD_NUN dtrmv_thread_NUN #define TRMV_THREAD_NLU dtrmv_thread_NLU #define TRMV_THREAD_NLN dtrmv_thread_NLN #define TRMV_THREAD_TUU dtrmv_thread_TUU #define TRMV_THREAD_TUN dtrmv_thread_TUN #define TRMV_THREAD_TLU dtrmv_thread_TLU #define TRMV_THREAD_TLN dtrmv_thread_TLN #define ZTRMV_NUU ztrmv_NUU #define ZTRMV_NUN ztrmv_NUN #define ZTRMV_NLU ztrmv_NLU #define ZTRMV_NLN ztrmv_NLN #define ZTRMV_TUU ztrmv_TUU #define ZTRMV_TUN ztrmv_TUN #define ZTRMV_TLU ztrmv_TLU #define ZTRMV_TLN ztrmv_TLN #define ZTRMV_RUU ztrmv_RUU #define ZTRMV_RUN ztrmv_RUN #define ZTRMV_RLU ztrmv_RLU #define ZTRMV_RLN ztrmv_RLN #define ZTRMV_CUU ztrmv_CUU #define ZTRMV_CUN ztrmv_CUN #define ZTRMV_CLU ztrmv_CLU #define ZTRMV_CLN ztrmv_CLN #define ZTRMV_THREAD_NUU ztrmv_thread_NUU #define ZTRMV_THREAD_NUN ztrmv_thread_NUN #define ZTRMV_THREAD_NLU ztrmv_thread_NLU #define ZTRMV_THREAD_NLN ztrmv_thread_NLN #define ZTRMV_THREAD_TUU ztrmv_thread_TUU #define ZTRMV_THREAD_TUN ztrmv_thread_TUN #define ZTRMV_THREAD_TLU ztrmv_thread_TLU #define ZTRMV_THREAD_TLN ztrmv_thread_TLN #define ZTRMV_THREAD_RUU ztrmv_thread_RUU #define ZTRMV_THREAD_RUN ztrmv_thread_RUN #define ZTRMV_THREAD_RLU ztrmv_thread_RLU #define ZTRMV_THREAD_RLN ztrmv_thread_RLN #define ZTRMV_THREAD_CUU ztrmv_thread_CUU #define ZTRMV_THREAD_CUN ztrmv_thread_CUN #define ZTRMV_THREAD_CLU ztrmv_thread_CLU #define ZTRMV_THREAD_CLN ztrmv_thread_CLN #else #define TRSV_NUU strsv_NUU #define TRSV_NUN strsv_NUN #define TRSV_NLU strsv_NLU #define TRSV_NLN strsv_NLN #define TRSV_TUU strsv_TUU #define TRSV_TUN strsv_TUN #define TRSV_TLU strsv_TLU #define TRSV_TLN strsv_TLN #define ZTRSV_NUU ctrsv_NUU #define ZTRSV_NUN ctrsv_NUN #define ZTRSV_NLU ctrsv_NLU #define ZTRSV_NLN ctrsv_NLN #define ZTRSV_TUU ctrsv_TUU #define ZTRSV_TUN ctrsv_TUN #define ZTRSV_TLU ctrsv_TLU #define ZTRSV_TLN ctrsv_TLN #define ZTRSV_RUU ctrsv_RUU #define ZTRSV_RUN ctrsv_RUN #define ZTRSV_RLU ctrsv_RLU #define ZTRSV_RLN ctrsv_RLN #define ZTRSV_CUU ctrsv_CUU #define ZTRSV_CUN ctrsv_CUN #define ZTRSV_CLU ctrsv_CLU #define ZTRSV_CLN ctrsv_CLN #define TRMV_NUU strmv_NUU #define TRMV_NUN strmv_NUN #define TRMV_NLU strmv_NLU #define TRMV_NLN strmv_NLN #define TRMV_TUU strmv_TUU #define TRMV_TUN strmv_TUN #define TRMV_TLU strmv_TLU #define TRMV_TLN strmv_TLN #define TRMV_THREAD_NUU strmv_thread_NUU #define TRMV_THREAD_NUN strmv_thread_NUN #define TRMV_THREAD_NLU strmv_thread_NLU #define TRMV_THREAD_NLN strmv_thread_NLN #define TRMV_THREAD_TUU strmv_thread_TUU #define TRMV_THREAD_TUN strmv_thread_TUN #define TRMV_THREAD_TLU strmv_thread_TLU #define TRMV_THREAD_TLN strmv_thread_TLN #define ZTRMV_NUU ctrmv_NUU #define ZTRMV_NUN ctrmv_NUN #define ZTRMV_NLU ctrmv_NLU #define ZTRMV_NLN ctrmv_NLN #define ZTRMV_TUU ctrmv_TUU #define ZTRMV_TUN ctrmv_TUN #define ZTRMV_TLU ctrmv_TLU #define ZTRMV_TLN ctrmv_TLN #define ZTRMV_RUU ctrmv_RUU #define ZTRMV_RUN ctrmv_RUN #define ZTRMV_RLU ctrmv_RLU #define ZTRMV_RLN ctrmv_RLN #define ZTRMV_CUU ctrmv_CUU #define ZTRMV_CUN ctrmv_CUN #define ZTRMV_CLU ctrmv_CLU #define ZTRMV_CLN ctrmv_CLN #define 
ZTRMV_THREAD_NUU ctrmv_thread_NUU #define ZTRMV_THREAD_NUN ctrmv_thread_NUN #define ZTRMV_THREAD_NLU ctrmv_thread_NLU #define ZTRMV_THREAD_NLN ctrmv_thread_NLN #define ZTRMV_THREAD_TUU ctrmv_thread_TUU #define ZTRMV_THREAD_TUN ctrmv_thread_TUN #define ZTRMV_THREAD_TLU ctrmv_thread_TLU #define ZTRMV_THREAD_TLN ctrmv_thread_TLN #define ZTRMV_THREAD_RUU ctrmv_thread_RUU #define ZTRMV_THREAD_RUN ctrmv_thread_RUN #define ZTRMV_THREAD_RLU ctrmv_thread_RLU #define ZTRMV_THREAD_RLN ctrmv_thread_RLN #define ZTRMV_THREAD_CUU ctrmv_thread_CUU #define ZTRMV_THREAD_CUN ctrmv_thread_CUN #define ZTRMV_THREAD_CLU ctrmv_thread_CLU #define ZTRMV_THREAD_CLN ctrmv_thread_CLN #endif #define SGETF2 sgetf2_k #define DGETF2 dgetf2_k #define QGETF2 qgetf2_k #define CGETF2 cgetf2_k #define ZGETF2 zgetf2_k #define XGETF2 xgetf2_k #define SLASWP_PLUS slaswp_plus #define SLASWP_MINUS slaswp_minus #define DLASWP_PLUS dlaswp_plus #define DLASWP_MINUS dlaswp_minus #define QLASWP_PLUS qlaswp_plus #define QLASWP_MINUS qlaswp_minus #define CLASWP_PLUS claswp_plus #define CLASWP_MINUS claswp_minus #define ZLASWP_PLUS zlaswp_plus #define ZLASWP_MINUS zlaswp_minus #define XLASWP_PLUS xlaswp_plus #define XLASWP_MINUS xlaswp_minus #define SLARF_L slarf_L #define SLARF_R slarf_R #define DLARF_L dlarf_L #define DLARF_R dlarf_R #define QLARF_L qlarf_L #define QLARF_R qlarf_R #define CLARF_L clarf_L #define CLARF_R clarf_R #define ZLARF_L zlarf_L #define ZLARF_R zlarf_R #define XLARF_L xlarf_L #define XLARF_R xlarf_R #ifndef COMPLEX #ifdef XDOUBLE #define GETF2 QGETF2 #define GETRF QGETRF #define GETRS_N_SINGLE qgetrs_N_single #define GETRS_T_SINGLE qgetrs_T_single #define GETRS_R_SINGLE qgetrs_N_single #define GETRS_C_SINGLE qgetrs_T_single #define GETRS_N_PARALLEL qgetrs_N_parallel #define GETRS_T_PARALLEL qgetrs_T_parallel #define GETRS_R_PARALLEL qgetrs_N_parallel #define GETRS_C_PARALLEL qgetrs_T_parallel #define LASWP_PLUS QLASWP_PLUS #define LASWP_MINUS QLASWP_MINUS #define LASWP_NCOPY QLASWP_NCOPY #define GETRS_N QGETRS_N #define GETRS_T QGETRS_T #define GETRF_SINGLE qgetrf_single #define GETRF_PARALLEL qgetrf_parallel #define NEG_TCOPY QNEG_TCOPY #define LARF_L QLARF_L #define LARF_R QLARF_R #elif defined(DOUBLE) #define GETF2 DGETF2 #define GETRF DGETRF #define GETRS_N_SINGLE dgetrs_N_single #define GETRS_T_SINGLE dgetrs_T_single #define GETRS_R_SINGLE dgetrs_N_single #define GETRS_C_SINGLE dgetrs_T_single #define GETRS_N_PARALLEL dgetrs_N_parallel #define GETRS_T_PARALLEL dgetrs_T_parallel #define GETRS_R_PARALLEL dgetrs_N_parallel #define GETRS_C_PARALLEL dgetrs_T_parallel #define LASWP_PLUS DLASWP_PLUS #define LASWP_MINUS DLASWP_MINUS #define LASWP_NCOPY DLASWP_NCOPY #define GETRS_N DGETRS_N #define GETRS_T DGETRS_T #define GETRF_SINGLE dgetrf_single #define GETRF_PARALLEL dgetrf_parallel #define NEG_TCOPY DNEG_TCOPY #define LARF_L DLARF_L #define LARF_R DLARF_R #else #define GETF2 SGETF2 #define GETRF SGETRF #define GETRS_N_SINGLE sgetrs_N_single #define GETRS_T_SINGLE sgetrs_T_single #define GETRS_R_SINGLE sgetrs_N_single #define GETRS_C_SINGLE sgetrs_T_single #define GETRS_N_PARALLEL sgetrs_N_parallel #define GETRS_T_PARALLEL sgetrs_T_parallel #define GETRS_R_PARALLEL sgetrs_N_parallel #define GETRS_C_PARALLEL sgetrs_T_parallel #define LASWP_PLUS SLASWP_PLUS #define LASWP_MINUS SLASWP_MINUS #define LASWP_NCOPY SLASWP_NCOPY #define GETRS_N SGETRS_N #define GETRS_T SGETRS_T #define GETRF_SINGLE sgetrf_single #define GETRF_PARALLEL sgetrf_parallel #define NEG_TCOPY SNEG_TCOPY #define LARF_L SLARF_L #define LARF_R 
SLARF_R #endif #else #ifdef XDOUBLE #define GETF2 XGETF2 #define GETRF XGETRF #define GETRS_N_SINGLE xgetrs_N_single #define GETRS_T_SINGLE xgetrs_T_single #define GETRS_R_SINGLE xgetrs_R_single #define GETRS_C_SINGLE xgetrs_C_single #define GETRS_N_PARALLEL xgetrs_N_parallel #define GETRS_T_PARALLEL xgetrs_T_parallel #define GETRS_R_PARALLEL xgetrs_R_parallel #define GETRS_C_PARALLEL xgetrs_C_parallel #define LASWP_PLUS XLASWP_PLUS #define LASWP_MINUS XLASWP_MINUS #define LASWP_NCOPY XLASWP_NCOPY #define GETRS_N XGETRS_N #define GETRS_T XGETRS_T #define GETRF_SINGLE xgetrf_single #define GETRF_PARALLEL xgetrf_parallel #define NEG_TCOPY XNEG_TCOPY #define LARF_L XLARF_L #define LARF_R XLARF_R #elif defined(DOUBLE) #define GETF2 ZGETF2 #define GETRF ZGETRF #define GETRS_N_SINGLE zgetrs_N_single #define GETRS_T_SINGLE zgetrs_T_single #define GETRS_R_SINGLE zgetrs_R_single #define GETRS_C_SINGLE zgetrs_C_single #define GETRS_N_PARALLEL zgetrs_N_parallel #define GETRS_T_PARALLEL zgetrs_T_parallel #define GETRS_R_PARALLEL zgetrs_R_parallel #define GETRS_C_PARALLEL zgetrs_C_parallel #define LASWP_PLUS ZLASWP_PLUS #define LASWP_MINUS ZLASWP_MINUS #define LASWP_NCOPY ZLASWP_NCOPY #define GETRS_N ZGETRS_N #define GETRS_T ZGETRS_T #define GETRF_SINGLE zgetrf_single #define GETRF_PARALLEL zgetrf_parallel #define NEG_TCOPY ZNEG_TCOPY #define LARF_L ZLARF_L #define LARF_R ZLARF_R #else #define GETF2 CGETF2 #define GETRF CGETRF #define GETRS_N_SINGLE cgetrs_N_single #define GETRS_T_SINGLE cgetrs_T_single #define GETRS_R_SINGLE cgetrs_R_single #define GETRS_C_SINGLE cgetrs_C_single #define GETRS_N_PARALLEL cgetrs_N_parallel #define GETRS_T_PARALLEL cgetrs_T_parallel #define GETRS_R_PARALLEL cgetrs_R_parallel #define GETRS_C_PARALLEL cgetrs_C_parallel #define LASWP_PLUS CLASWP_PLUS #define LASWP_MINUS CLASWP_MINUS #define LASWP_NCOPY CLASWP_NCOPY #define GETRS_N CGETRS_N #define GETRS_T CGETRS_T #define GETRF_SINGLE cgetrf_single #define GETRF_PARALLEL cgetrf_parallel #define NEG_TCOPY CNEG_TCOPY #define LARF_L CLARF_L #define LARF_R CLARF_R #endif #endif #ifndef COMPLEX #ifdef XDOUBLE #define POTF2_U qpotf2_U #define POTF2_L qpotf2_L #define LAUU2_U qlauu2_U #define LAUU2_L qlauu2_L #define POTRF_U_SINGLE qpotrf_U_single #define POTRF_L_SINGLE qpotrf_L_single #define POTRF_U_PARALLEL qpotrf_U_parallel #define POTRF_L_PARALLEL qpotrf_L_parallel #define LAUUM_U_SINGLE qlauum_U_single #define LAUUM_L_SINGLE qlauum_L_single #define LAUUM_U_PARALLEL qlauum_U_parallel #define LAUUM_L_PARALLEL qlauum_L_parallel #define TRTI2_UU qtrti2_UU #define TRTI2_UN qtrti2_UN #define TRTI2_LU qtrti2_LU #define TRTI2_LN qtrti2_LN #define TRTRI_UU_SINGLE qtrtri_UU_single #define TRTRI_UN_SINGLE qtrtri_UN_single #define TRTRI_LU_SINGLE qtrtri_LU_single #define TRTRI_LN_SINGLE qtrtri_LN_single #define TRTRI_UU_PARALLEL qtrtri_UU_parallel #define TRTRI_UN_PARALLEL qtrtri_UN_parallel #define TRTRI_LU_PARALLEL qtrtri_LU_parallel #define TRTRI_LN_PARALLEL qtrtri_LN_parallel #elif defined(DOUBLE) #define POTF2_U dpotf2_U #define POTF2_L dpotf2_L #define LAUU2_U dlauu2_U #define LAUU2_L dlauu2_L #define POTRF_U_SINGLE dpotrf_U_single #define POTRF_L_SINGLE dpotrf_L_single #define POTRF_U_PARALLEL dpotrf_U_parallel #define POTRF_L_PARALLEL dpotrf_L_parallel #define LAUUM_U_SINGLE dlauum_U_single #define LAUUM_L_SINGLE dlauum_L_single #define LAUUM_U_PARALLEL dlauum_U_parallel #define LAUUM_L_PARALLEL dlauum_L_parallel #define TRTI2_UU dtrti2_UU #define TRTI2_UN dtrti2_UN #define TRTI2_LU dtrti2_LU #define TRTI2_LN dtrti2_LN #define 
TRTRI_UU_SINGLE dtrtri_UU_single #define TRTRI_UN_SINGLE dtrtri_UN_single #define TRTRI_LU_SINGLE dtrtri_LU_single #define TRTRI_LN_SINGLE dtrtri_LN_single #define TRTRI_UU_PARALLEL dtrtri_UU_parallel #define TRTRI_UN_PARALLEL dtrtri_UN_parallel #define TRTRI_LU_PARALLEL dtrtri_LU_parallel #define TRTRI_LN_PARALLEL dtrtri_LN_parallel #else #define POTF2_U spotf2_U #define POTF2_L spotf2_L #define LAUU2_U slauu2_U #define LAUU2_L slauu2_L #define POTRF_U_SINGLE spotrf_U_single #define POTRF_L_SINGLE spotrf_L_single #define POTRF_U_PARALLEL spotrf_U_parallel #define POTRF_L_PARALLEL spotrf_L_parallel #define LAUUM_U_SINGLE slauum_U_single #define LAUUM_L_SINGLE slauum_L_single #define LAUUM_U_PARALLEL slauum_U_parallel #define LAUUM_L_PARALLEL slauum_L_parallel #define TRTI2_UU strti2_UU #define TRTI2_UN strti2_UN #define TRTI2_LU strti2_LU #define TRTI2_LN strti2_LN #define TRTRI_UU_SINGLE strtri_UU_single #define TRTRI_UN_SINGLE strtri_UN_single #define TRTRI_LU_SINGLE strtri_LU_single #define TRTRI_LN_SINGLE strtri_LN_single #define TRTRI_UU_PARALLEL strtri_UU_parallel #define TRTRI_UN_PARALLEL strtri_UN_parallel #define TRTRI_LU_PARALLEL strtri_LU_parallel #define TRTRI_LN_PARALLEL strtri_LN_parallel #endif #else #ifdef XDOUBLE #define POTF2_U xpotf2_U #define POTF2_L xpotf2_L #define LAUU2_U xlauu2_U #define LAUU2_L xlauu2_L #define POTRF_U_SINGLE xpotrf_U_single #define POTRF_L_SINGLE xpotrf_L_single #define POTRF_U_PARALLEL xpotrf_U_parallel #define POTRF_L_PARALLEL xpotrf_L_parallel #define LAUUM_U_SINGLE xlauum_U_single #define LAUUM_L_SINGLE xlauum_L_single #define LAUUM_U_PARALLEL xlauum_U_parallel #define LAUUM_L_PARALLEL xlauum_L_parallel #define TRTI2_UU xtrti2_UU #define TRTI2_UN xtrti2_UN #define TRTI2_LU xtrti2_LU #define TRTI2_LN xtrti2_LN #define TRTRI_UU_SINGLE xtrtri_UU_single #define TRTRI_UN_SINGLE xtrtri_UN_single #define TRTRI_LU_SINGLE xtrtri_LU_single #define TRTRI_LN_SINGLE xtrtri_LN_single #define TRTRI_UU_PARALLEL xtrtri_UU_parallel #define TRTRI_UN_PARALLEL xtrtri_UN_parallel #define TRTRI_LU_PARALLEL xtrtri_LU_parallel #define TRTRI_LN_PARALLEL xtrtri_LN_parallel #elif defined(DOUBLE) #define POTF2_U zpotf2_U #define POTF2_L zpotf2_L #define LAUU2_U zlauu2_U #define LAUU2_L zlauu2_L #define POTRF_U_SINGLE zpotrf_U_single #define POTRF_L_SINGLE zpotrf_L_single #define POTRF_U_PARALLEL zpotrf_U_parallel #define POTRF_L_PARALLEL zpotrf_L_parallel #define LAUUM_U_SINGLE zlauum_U_single #define LAUUM_L_SINGLE zlauum_L_single #define LAUUM_U_PARALLEL zlauum_U_parallel #define LAUUM_L_PARALLEL zlauum_L_parallel #define TRTI2_UU ztrti2_UU #define TRTI2_UN ztrti2_UN #define TRTI2_LU ztrti2_LU #define TRTI2_LN ztrti2_LN #define TRTRI_UU_SINGLE ztrtri_UU_single #define TRTRI_UN_SINGLE ztrtri_UN_single #define TRTRI_LU_SINGLE ztrtri_LU_single #define TRTRI_LN_SINGLE ztrtri_LN_single #define TRTRI_UU_PARALLEL ztrtri_UU_parallel #define TRTRI_UN_PARALLEL ztrtri_UN_parallel #define TRTRI_LU_PARALLEL ztrtri_LU_parallel #define TRTRI_LN_PARALLEL ztrtri_LN_parallel #else #define POTF2_U cpotf2_U #define POTF2_L cpotf2_L #define LAUU2_U clauu2_U #define LAUU2_L clauu2_L #define POTRF_U_SINGLE cpotrf_U_single #define POTRF_L_SINGLE cpotrf_L_single #define POTRF_U_PARALLEL cpotrf_U_parallel #define POTRF_L_PARALLEL cpotrf_L_parallel #define LAUUM_U_SINGLE clauum_U_single #define LAUUM_L_SINGLE clauum_L_single #define LAUUM_U_PARALLEL clauum_U_parallel #define LAUUM_L_PARALLEL clauum_L_parallel #define TRTI2_UU ctrti2_UU #define TRTI2_UN ctrti2_UN #define TRTI2_LU ctrti2_LU #define 
TRTI2_LN ctrti2_LN #define TRTRI_UU_SINGLE ctrtri_UU_single #define TRTRI_UN_SINGLE ctrtri_UN_single #define TRTRI_LU_SINGLE ctrtri_LU_single #define TRTRI_LN_SINGLE ctrtri_LN_single #define TRTRI_UU_PARALLEL ctrtri_UU_parallel #define TRTRI_UN_PARALLEL ctrtri_UN_parallel #define TRTRI_LU_PARALLEL ctrtri_LU_parallel #define TRTRI_LN_PARALLEL ctrtri_LN_parallel #endif #endif #endif OpenBLAS-0.2.20/common_mips.h000066400000000000000000000053231313527062700156400ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #ifndef COMMON_MIPS #define COMMON_MIPS #define MB __sync_synchronize() #define WMB __sync_synchronize() #define INLINE inline #define RETURN_BY_COMPLEX #ifndef ASSEMBLER static inline unsigned int rpcc(void){ unsigned long ret; __asm__ __volatile__(".set push \n" "rdhwr %0, $30 \n" ".set pop" : "=r"(ret) : : "memory"); return ret; } #define RPCC_DEFINED static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } #define GET_IMAGE(res) #define GET_IMAGE_CANCEL #endif #ifndef F_INTERFACE #define REALNAME ASMNAME #else #define REALNAME ASMFNAME #endif #if defined(ASSEMBLER) && !defined(NEEDPARAM) #define PROLOGUE \ .arm ;\ .global REALNAME ;\ REALNAME: #define EPILOGUE #define PROFCODE #endif #define SEEK_ADDRESS #ifndef PAGESIZE #define PAGESIZE ( 4 << 10) #endif #define HUGE_PAGESIZE ( 4 << 20) #define BUFFER_SIZE (16 << 20) #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) #ifndef MAP_ANONYMOUS #define MAP_ANONYMOUS MAP_ANON #endif #endif OpenBLAS-0.2.20/common_mips64.h000066400000000000000000000173631313527062700160210ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #ifndef COMMON_MIPS64 #define COMMON_MIPS64 #define MB __sync_synchronize() #define WMB __sync_synchronize() #define INLINE inline #ifndef ASSEMBLER static inline unsigned int rpcc(void){ unsigned long ret; // unsigned long long tmp; //__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); //ret=tmp; __asm__ __volatile__(".set push \n" ".set mips32r2\n" "rdhwr %0, $2\n" ".set pop": "=r"(ret):: "memory"); return ret; } #define RPCC_DEFINED #ifndef NO_AFFINITY #define WHEREAMI static inline int WhereAmI(void){ int ret=0; __asm__ __volatile__(".set push \n" ".set mips32r2\n" "rdhwr %0, $0\n" ".set pop": "=r"(ret):: "memory"); return ret; } #endif static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } #ifdef DOUBLE #define GET_IMAGE(res) __asm__ __volatile__("mov.d %0, $f2" : "=f"(res) : : "memory") #else #define GET_IMAGE(res) __asm__ __volatile__("mov.s %0, $f2" : "=f"(res) : : "memory") #endif #define GET_IMAGE_CANCEL #endif #ifdef ASSEMBLER #define HALT teq $0, $0 #define NOP move $0, $0 #ifdef DOUBLE #define LD ldc1 #define ST sdc1 #define MADD madd.d #define NMADD nmadd.d #define MSUB msub.d #define NMSUB nmsub.d #define ADD add.d #define SUB sub.d #define MUL mul.d #define MOV mov.d #define CMOVF movf.d #define CMOVT movt.d #define MTC dmtc1 #define FABS abs.d #define CMPEQ c.eq.d #define CMPLE c.le.d #define CMPLT c.lt.d #define NEG neg.d #else #define LD lwc1 #define ST swc1 #define MADD madd.s #define NMADD nmadd.s #define MSUB msub.s #define NMSUB nmsub.s #define ADD add.s #define SUB sub.s #define MUL mul.s #define MOV mov.s #define CMOVF movf.s #define CMOVT movt.s #define MTC mtc1 #define FABS abs.s #define CMPEQ c.eq.s #define CMPLE c.le.s #define CMPLT c.lt.s #define PLU plu.ps #define PLL pll.ps #define PUU puu.ps #define PUL pul.ps #define MADPS madd.ps #define CVTU cvt.s.pu #define CVTL cvt.s.pl #define NEG neg.s #endif #if defined(__64BIT__) && defined(USE64BITINT) #define LDINT ld #define LDARG ld #define SDARG sd #elif defined(__64BIT__) && !defined(USE64BITINT) #define LDINT lw #define LDARG ld #define SDARG sd #else #define LDINT lw #define LDARG lw #define SDARG sw #endif #ifndef F_INTERFACE #define REALNAME ASMNAME #else #define REALNAME ASMFNAME #endif #if defined(ASSEMBLER) && !defined(NEEDPARAM) #define PROLOGUE \ .text ;\ .set mips64 ;\ .align 5 ;\ .globl REALNAME ;\ .ent REALNAME ;\ .type REALNAME, @function ;\ REALNAME: ;\ .set noreorder ;\ .set nomacro #if defined(__linux__) && defined(__ELF__) #define GNUSTACK .section .note.GNU-stack,"",@progbits #else #define GNUSTACK #endif #define EPILOGUE \ .set macro ;\ .set reorder ;\ .end REALNAME ;\ GNUSTACK #define PROFCODE #endif #endif #define SEEK_ADDRESS #define BUFFER_SIZE ( 32 << 20) #if defined(LOONGSON3A) #define PAGESIZE (16UL << 10) #define FIXED_PAGESIZE (16UL << 10) #endif #if defined(LOONGSON3B) #define PAGESIZE (16UL << 10) #define FIXED_PAGESIZE (16UL << 10) #endif #ifndef PAGESIZE #define PAGESIZE (64UL << 10) #endif #define HUGE_PAGESIZE ( 2 << 20) #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) #ifndef MAP_ANONYMOUS #define MAP_ANONYMOUS MAP_ANON #endif #if defined(LOONGSON3A) || defined(LOONGSON3B) #define PREFETCHD_(x) ld $0, x #define PREFETCHD(x) PREFETCHD_(x) #else #define PREFETCHD(x) #endif #endif 
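The MIPS64 header above exposes two small helpers that the portable driver code relies on: rpcc(), which reads the hardware counter through rdhwr for coarse cycle timing, and blas_quickdivide(), a plain integer division used when partitioning work across threads. A minimal sketch of how such helpers could be exercised on their own is given below; it is not part of OpenBLAS, the function name time_scale_chunk and the 4-way split are illustrative assumptions, and it presumes a MIPS64 build where "common.h" pulls in the definitions above.

/* Hypothetical sketch (not OpenBLAS source): timing a scaling loop with
 * rpcc() and splitting work with blas_quickdivide(), both provided by
 * common_mips64.h when "common.h" is included in a MIPS64 build. */
#include <stdio.h>
#include "common.h"   /* assumed to define blasint and the inline helpers above */

static void time_scale_chunk(blasint n, double *x) {
  unsigned int start, stop;
  blasint i, chunk;

  chunk = blas_quickdivide(n, 4);   /* one quarter of the work, as an example split */

  start = rpcc();                   /* read the hardware counter before the loop */
  for (i = 0; i < chunk; i++)
    x[i] *= 2.0;
  stop = rpcc();                    /* and again after it */

  printf("scaled %d elements in %u counter ticks\n", (int)chunk, stop - start);
}

Note that rpcc() here returns an unsigned 32-bit value, so a sketch like this only measures short intervals before the counter wraps; the real driver code uses it for relative comparisons, not absolute timing.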
OpenBLAS-0.2.20/common_param.h000066400000000000000000002430151313527062700157720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #ifndef COMMON_PARAM_H #define COMMON_PARAM_H #ifndef ASSEMBLER #ifdef DYNAMIC_ARCH typedef struct { int dtb_entries; int offsetA, offsetB, align; int sgemm_p, sgemm_q, sgemm_r; int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; int exclusive_cache; float (*samax_k) (BLASLONG, float *, BLASLONG); float (*samin_k) (BLASLONG, float *, BLASLONG); float (*smax_k) (BLASLONG, float *, BLASLONG); float (*smin_k) (BLASLONG, float *, BLASLONG); BLASLONG (*isamax_k)(BLASLONG, float *, BLASLONG); BLASLONG (*isamin_k)(BLASLONG, float *, BLASLONG); BLASLONG (*ismax_k) (BLASLONG, float *, BLASLONG); BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); float (*snrm2_k) (BLASLONG, float *, BLASLONG); float (*sasum_k) (BLASLONG, float *, BLASLONG); int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_iltncopy)(BLASLONG, BLASLONG, float *, 
BLASLONG, BLASLONG, float *); int (*strsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ssymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ssymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ssymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ssymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*sneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); int dgemm_p, dgemm_q, dgemm_r; int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn; double (*damax_k) (BLASLONG, double *, BLASLONG); double (*damin_k) (BLASLONG, double *, BLASLONG); double (*dmax_k) (BLASLONG, double *, BLASLONG); double (*dmin_k) (BLASLONG, double *, BLASLONG); BLASLONG (*idamax_k)(BLASLONG, double *, BLASLONG); BLASLONG (*idamin_k)(BLASLONG, double *, BLASLONG); BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG); BLASLONG (*idmin_k) (BLASLONG, 
double *, BLASLONG); double (*dnrm2_k) (BLASLONG, double *, BLASLONG); double (*dasum_k) (BLASLONG, double *, BLASLONG); int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dgemm_incopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*dtrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*dtrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*dtrsm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*dtrsm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*dtrsm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*dtrsm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*dtrsm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*dtrsm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*dtrsm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*dtrsm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*dtrsm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*dtrsm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*dtrsm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*dtrsm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*dtrsm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, 
BLASLONG, double *); int (*dtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dsymm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dsymm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); #ifdef EXPRECISION int qgemm_p, qgemm_q, qgemm_r; int qgemm_unroll_m, qgemm_unroll_n, qgemm_unroll_mn; xdouble (*qamax_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*qamin_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*qmax_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*qmin_k) (BLASLONG, xdouble *, BLASLONG); BLASLONG (*iqamax_k)(BLASLONG, xdouble *, BLASLONG); BLASLONG (*iqamin_k)(BLASLONG, xdouble *, BLASLONG); BLASLONG (*iqmax_k) (BLASLONG, xdouble *, BLASLONG); BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); int (*qaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int (*qscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, 
BLASLONG, xdouble *, BLASLONG); int (*qswap_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int (*qgemv_n) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*qgemv_t) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*qger_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*qsymv_L) (BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*qsymv_U) (BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*qgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); int (*qgemm_beta )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int (*qgemm_incopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*qgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*qgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*qgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*qtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*qtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*qtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*qtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*qtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*qtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*qtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*qtrsm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*qtrsm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*qtrsm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*qtrsm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*qtrsm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*qtrsm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*qtrsm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*qtrsm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*qtrsm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*qtrsm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*qtrsm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*qtrsm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*qtrsm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*qtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*qtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*qtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*qtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble 
*, xdouble *, BLASLONG, BLASLONG); int (*qtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qtrmm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qtrmm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qtrmm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qtrmm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qtrmm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qtrmm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qtrmm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qtrmm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qtrmm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qtrmm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qsymm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qneg_tcopy) (BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*qlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); #endif int cgemm_p, cgemm_q, cgemm_r; int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn; float (*camax_k) (BLASLONG, float *, BLASLONG); float (*camin_k) (BLASLONG, float *, BLASLONG); BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG); BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); float (*cnrm2_k) (BLASLONG, float *, BLASLONG); float (*casum_k) (BLASLONG, float *, BLASLONG); int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); float _Complex (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); float _Complex (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*cswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*cgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_r) (BLASLONG, BLASLONG, BLASLONG, 
float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_c) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_o) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_u) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_s) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_d) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgeru_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgerc_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgerv_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgerd_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*csymv_L) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*csymv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*chemv_L) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*chemv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*chemv_M) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*chemv_V) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*cgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); 
int (*ctrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_olnncopy)(BLASLONG, 
BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*csymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*csymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*csymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*csymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*chemm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*chemm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*chemm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*chemm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int cgemm3m_p, cgemm3m_q, cgemm3m_r; int cgemm3m_unroll_m, cgemm3m_unroll_n, cgemm3m_unroll_mn; int (*cgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm3m_incopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm3m_incopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm3m_incopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm3m_itcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm3m_itcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm3m_itcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm3m_oncopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); int (*cgemm3m_oncopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); int (*cgemm3m_oncopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); int (*cgemm3m_otcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); int (*cgemm3m_otcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); int (*cgemm3m_otcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); int (*csymm3m_iucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*csymm3m_ilcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*csymm3m_iucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*csymm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*csymm3m_iucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*csymm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*csymm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*csymm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*csymm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*csymm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*csymm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*csymm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_iucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*chemm3m_ilcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*chemm3m_iucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, 
BLASLONG, float *); int (*chemm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*chemm3m_iucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*chemm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*chemm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*cneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); int zgemm_p, zgemm_q, zgemm_r; int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn; double (*zamax_k) (BLASLONG, double *, BLASLONG); double (*zamin_k) (BLASLONG, double *, BLASLONG); BLASLONG (*izamax_k)(BLASLONG, double *, BLASLONG); BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); double (*znrm2_k) (BLASLONG, double *, BLASLONG); double (*zasum_k) (BLASLONG, double *, BLASLONG); int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); double _Complex (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); double _Complex (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*zdrot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); int (*zaxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*zaxpyc_k)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*zscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*zswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*zgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*zgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*zgemv_r) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*zgemv_c) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*zgemv_o) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*zgemv_u) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*zgemv_s) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*zgemv_d) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*zgeru_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*zgerc_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, 
BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*zgerv_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*zgerd_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*zsymv_L) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*zsymv_U) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*zhemv_L) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*zhemv_U) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*zhemv_M) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*zhemv_V) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*zgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int (*zgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int (*zgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int (*zgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int (*zgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*zgemm_incopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*ztrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*ztrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*ztrsm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*ztrsm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*ztrsm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*ztrsm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*ztrsm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int 
(*ztrsm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*ztrsm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*ztrsm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*ztrsm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*ztrsm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*ztrsm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*ztrsm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*ztrsm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*ztrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zsymm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zsymm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zhemm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, 
BLASLONG, double *); int (*zhemm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zhemm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zhemm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int zgemm3m_p, zgemm3m_q, zgemm3m_r; int zgemm3m_unroll_m, zgemm3m_unroll_n, zgemm3m_unroll_mn; int (*zgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int (*zgemm3m_incopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zgemm3m_incopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zgemm3m_incopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zgemm3m_itcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zgemm3m_itcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zgemm3m_itcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zgemm3m_oncopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); int (*zgemm3m_oncopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); int (*zgemm3m_oncopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); int (*zgemm3m_otcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); int (*zgemm3m_otcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); int (*zgemm3m_otcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); int (*zsymm3m_iucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zsymm3m_ilcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zsymm3m_iucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zsymm3m_ilcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zsymm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zsymm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zsymm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zsymm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zsymm3m_oucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zsymm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zsymm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zsymm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zhemm3m_iucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zhemm3m_ilcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zhemm3m_iucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zhemm3m_ilcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zhemm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zhemm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zhemm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zhemm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zhemm3m_oucopyr)(BLASLONG, BLASLONG, double *, 
BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zhemm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zhemm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zhemm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); #ifdef EXPRECISION int xgemm_p, xgemm_q, xgemm_r; int xgemm_unroll_m, xgemm_unroll_n, xgemm_unroll_mn; xdouble (*xamax_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*xamin_k) (BLASLONG, xdouble *, BLASLONG); BLASLONG (*ixamax_k)(BLASLONG, xdouble *, BLASLONG); BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); xdouble _Complex (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); xdouble _Complex (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int (*xqrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); int (*xaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int (*xaxpyc_k)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int (*xscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int (*xswap_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int (*xgemv_n) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemv_t) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemv_r) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemv_c) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemv_o) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemv_u) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemv_s) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemv_d) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgeru_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgerc_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgerv_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgerd_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xsymv_L) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, 
BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xsymv_U) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xhemv_L) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xhemv_U) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xhemv_M) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xhemv_V) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); int (*xgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); int (*xgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); int (*xgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); int (*xgemm_beta )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int (*xgemm_incopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*xtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*xtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*xtrsm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*xtrsm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*xtrsm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*xtrsm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*xtrsm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*xtrsm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*xtrsm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*xtrsm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*xtrsm_outncopy)(BLASLONG, BLASLONG, 
xdouble *, BLASLONG, BLASLONG, xdouble *); int (*xtrsm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*xtrsm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*xtrsm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*xtrsm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*xtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xtrmm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xtrmm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xtrmm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xtrmm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xtrmm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xtrmm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xtrmm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xtrmm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xtrmm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xtrmm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xsymm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xhemm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xhemm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xhemm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, 
xdouble *); int (*xhemm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int xgemm3m_p, xgemm3m_q, xgemm3m_r; int xgemm3m_unroll_m, xgemm3m_unroll_n, xgemm3m_unroll_mn; int (*xgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); int (*xgemm3m_incopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemm3m_incopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemm3m_incopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemm3m_itcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemm3m_itcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemm3m_itcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemm3m_oncopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); int (*xgemm3m_oncopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); int (*xgemm3m_oncopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); int (*xgemm3m_otcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); int (*xgemm3m_otcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); int (*xgemm3m_otcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); int (*xsymm3m_iucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xsymm3m_ilcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xsymm3m_iucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xsymm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xsymm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xsymm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xsymm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); int (*xsymm3m_olcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); int (*xsymm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); int (*xsymm3m_olcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); int (*xsymm3m_oucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); int (*xsymm3m_olcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); int (*xhemm3m_iucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xhemm3m_ilcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xhemm3m_iucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xhemm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xhemm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xhemm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xhemm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); int (*xhemm3m_olcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); int (*xhemm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); int (*xhemm3m_olcopyr)(BLASLONG, 
BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); int (*xhemm3m_oucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); int (*xhemm3m_olcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); int (*xneg_tcopy) (BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); #endif void (*init)(void); int snum_opt, dnum_opt, qnum_opt; int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG); int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG); int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, float*, BLASLONG); int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, BLASLONG); int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); int 
(*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); int (*zgeadd_k) (BLASLONG, BLASLONG, float, double, double *, BLASLONG, double, double, double *, BLASLONG); } gotoblas_t; extern gotoblas_t *gotoblas; #define DTB_ENTRIES gotoblas -> dtb_entries #define GEMM_OFFSET_A gotoblas -> offsetA #define GEMM_OFFSET_B gotoblas -> offsetB #define GEMM_ALIGN gotoblas -> align #define HAVE_EX_L2 gotoblas -> exclusive_cache #define SGEMM_P gotoblas -> sgemm_p #define SGEMM_Q gotoblas -> sgemm_q #define SGEMM_R gotoblas -> sgemm_r #define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m #define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n #define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn #define DGEMM_P gotoblas -> dgemm_p #define DGEMM_Q gotoblas -> dgemm_q #define DGEMM_R gotoblas -> dgemm_r #define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m #define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n #define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn #define QGEMM_P gotoblas -> qgemm_p #define QGEMM_Q gotoblas -> qgemm_q #define QGEMM_R gotoblas -> qgemm_r #define QGEMM_UNROLL_M gotoblas -> qgemm_unroll_m #define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n #define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn #define CGEMM_P gotoblas -> cgemm_p #define CGEMM_Q gotoblas -> cgemm_q #define CGEMM_R gotoblas -> cgemm_r #define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m #define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n #define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn #define ZGEMM_P gotoblas -> zgemm_p #define ZGEMM_Q gotoblas -> zgemm_q #define ZGEMM_R gotoblas -> zgemm_r #define ZGEMM_UNROLL_M gotoblas -> zgemm_unroll_m #define ZGEMM_UNROLL_N gotoblas -> zgemm_unroll_n #define ZGEMM_UNROLL_MN gotoblas -> zgemm_unroll_mn #define XGEMM_P gotoblas -> xgemm_p #define XGEMM_Q gotoblas -> xgemm_q #define XGEMM_R gotoblas -> xgemm_r #define XGEMM_UNROLL_M gotoblas -> xgemm_unroll_m #define XGEMM_UNROLL_N gotoblas -> xgemm_unroll_n #define 
XGEMM_UNROLL_MN gotoblas -> xgemm_unroll_mn #define CGEMM3M_P gotoblas -> cgemm3m_p #define CGEMM3M_Q gotoblas -> cgemm3m_q #define CGEMM3M_R gotoblas -> cgemm3m_r #define CGEMM3M_UNROLL_M gotoblas -> cgemm3m_unroll_m #define CGEMM3M_UNROLL_N gotoblas -> cgemm3m_unroll_n #define CGEMM3M_UNROLL_MN gotoblas -> cgemm3m_unroll_mn #define ZGEMM3M_P gotoblas -> zgemm3m_p #define ZGEMM3M_Q gotoblas -> zgemm3m_q #define ZGEMM3M_R gotoblas -> zgemm3m_r #define ZGEMM3M_UNROLL_M gotoblas -> zgemm3m_unroll_m #define ZGEMM3M_UNROLL_N gotoblas -> zgemm3m_unroll_n #define ZGEMM3M_UNROLL_MN gotoblas -> zgemm3m_unroll_mn #define XGEMM3M_P gotoblas -> xgemm3m_p #define XGEMM3M_Q gotoblas -> xgemm3m_q #define XGEMM3M_R gotoblas -> xgemm3m_r #define XGEMM3M_UNROLL_M gotoblas -> xgemm3m_unroll_m #define XGEMM3M_UNROLL_N gotoblas -> xgemm3m_unroll_n #define XGEMM3M_UNROLL_MN gotoblas -> xgemm3m_unroll_mn #else #define DTB_ENTRIES DTB_DEFAULT_ENTRIES #define GEMM_OFFSET_A GEMM_DEFAULT_OFFSET_A #define GEMM_OFFSET_B GEMM_DEFAULT_OFFSET_B #define GEMM_ALIGN GEMM_DEFAULT_ALIGN #ifdef HAVE_EXCLUSIVE_CACHE #define HAVE_EX_L2 1 #else #define HAVE_EX_L2 0 #endif #define SGEMM_P SGEMM_DEFAULT_P #define SGEMM_Q SGEMM_DEFAULT_Q #define SGEMM_R SGEMM_DEFAULT_R #define SGEMM_UNROLL_M SGEMM_DEFAULT_UNROLL_M #define SGEMM_UNROLL_N SGEMM_DEFAULT_UNROLL_N #ifdef SGEMM_DEFAULT_UNROLL_MN #define SGEMM_UNROLL_MN SGEMM_DEFAULT_UNROLL_MN #else #define SGEMM_UNROLL_MN MAX((SGEMM_UNROLL_M), (SGEMM_UNROLL_N)) #endif #define DGEMM_P DGEMM_DEFAULT_P #define DGEMM_Q DGEMM_DEFAULT_Q #define DGEMM_R DGEMM_DEFAULT_R #define DGEMM_UNROLL_M DGEMM_DEFAULT_UNROLL_M #define DGEMM_UNROLL_N DGEMM_DEFAULT_UNROLL_N #ifdef DGEMM_DEFAULT_UNROLL_MN #define DGEMM_UNROLL_MN DGEMM_DEFAULT_UNROLL_MN #else #define DGEMM_UNROLL_MN MAX((DGEMM_UNROLL_M), (DGEMM_UNROLL_N)) #endif #define QGEMM_P QGEMM_DEFAULT_P #define QGEMM_Q QGEMM_DEFAULT_Q #define QGEMM_R QGEMM_DEFAULT_R #define QGEMM_UNROLL_M QGEMM_DEFAULT_UNROLL_M #define QGEMM_UNROLL_N QGEMM_DEFAULT_UNROLL_N #define QGEMM_UNROLL_MN MAX((QGEMM_UNROLL_M), (QGEMM_UNROLL_N)) #define CGEMM_P CGEMM_DEFAULT_P #define CGEMM_Q CGEMM_DEFAULT_Q #define CGEMM_R CGEMM_DEFAULT_R #define CGEMM_UNROLL_M CGEMM_DEFAULT_UNROLL_M #define CGEMM_UNROLL_N CGEMM_DEFAULT_UNROLL_N #ifdef CGEMM_DEFAULT_UNROLL_MN #define CGEMM_UNROLL_MN CGEMM_DEFAULT_UNROLL_MN #else #define CGEMM_UNROLL_MN MAX((CGEMM_UNROLL_M), (CGEMM_UNROLL_N)) #endif #define ZGEMM_P ZGEMM_DEFAULT_P #define ZGEMM_Q ZGEMM_DEFAULT_Q #define ZGEMM_R ZGEMM_DEFAULT_R #define ZGEMM_UNROLL_M ZGEMM_DEFAULT_UNROLL_M #define ZGEMM_UNROLL_N ZGEMM_DEFAULT_UNROLL_N #ifdef ZGEMM_DEFAULT_UNROLL_MN #define ZGEMM_UNROLL_MN ZGEMM_DEFAULT_UNROLL_MN #else #define ZGEMM_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) #endif #define XGEMM_P XGEMM_DEFAULT_P #define XGEMM_Q XGEMM_DEFAULT_Q #define XGEMM_R XGEMM_DEFAULT_R #define XGEMM_UNROLL_M XGEMM_DEFAULT_UNROLL_M #define XGEMM_UNROLL_N XGEMM_DEFAULT_UNROLL_N #define XGEMM_UNROLL_MN MAX((XGEMM_UNROLL_M), (XGEMM_UNROLL_N)) #ifdef CGEMM3M_DEFAULT_UNROLL_N #define CGEMM3M_P CGEMM3M_DEFAULT_P #define CGEMM3M_Q CGEMM3M_DEFAULT_Q #define CGEMM3M_R CGEMM3M_DEFAULT_R #define CGEMM3M_UNROLL_M CGEMM3M_DEFAULT_UNROLL_M #define CGEMM3M_UNROLL_N CGEMM3M_DEFAULT_UNROLL_N #define CGEMM3M_UNROLL_MN MAX((CGEMM3M_UNROLL_M), (CGEMM3M_UNROLL_N)) #else #define CGEMM3M_P SGEMM_DEFAULT_P #define CGEMM3M_Q SGEMM_DEFAULT_Q #define CGEMM3M_R SGEMM_DEFAULT_R #define CGEMM3M_UNROLL_M SGEMM_DEFAULT_UNROLL_M #define CGEMM3M_UNROLL_N SGEMM_DEFAULT_UNROLL_N #define 
CGEMM3M_UNROLL_MN MAX((CGEMM_UNROLL_M), (CGEMM_UNROLL_N)) #endif #ifdef ZGEMM3M_DEFAULT_UNROLL_N #define ZGEMM3M_P ZGEMM3M_DEFAULT_P #define ZGEMM3M_Q ZGEMM3M_DEFAULT_Q #define ZGEMM3M_R ZGEMM3M_DEFAULT_R #define ZGEMM3M_UNROLL_M ZGEMM3M_DEFAULT_UNROLL_M #define ZGEMM3M_UNROLL_N ZGEMM3M_DEFAULT_UNROLL_N #define ZGEMM3M_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) #else #define ZGEMM3M_P DGEMM_DEFAULT_P #define ZGEMM3M_Q DGEMM_DEFAULT_Q #define ZGEMM3M_R DGEMM_DEFAULT_R #define ZGEMM3M_UNROLL_M DGEMM_DEFAULT_UNROLL_M #define ZGEMM3M_UNROLL_N DGEMM_DEFAULT_UNROLL_N #define ZGEMM3M_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) #endif #define XGEMM3M_P QGEMM_DEFAULT_P #define XGEMM3M_Q QGEMM_DEFAULT_Q #define XGEMM3M_R QGEMM_DEFAULT_R #define XGEMM3M_UNROLL_M QGEMM_DEFAULT_UNROLL_M #define XGEMM3M_UNROLL_N QGEMM_DEFAULT_UNROLL_N #define XGEMM3M_UNROLL_MN MAX((QGEMM_UNROLL_M), (QGEMM_UNROLL_N)) #endif #endif #ifndef COMPLEX #if defined(XDOUBLE) #define GEMM_P QGEMM_P #define GEMM_Q QGEMM_Q #define GEMM_R QGEMM_R #define GEMM_UNROLL_M QGEMM_UNROLL_M #define GEMM_UNROLL_N QGEMM_UNROLL_N #define GEMM_UNROLL_MN QGEMM_UNROLL_MN #define GEMM_DEFAULT_P QGEMM_DEFAULT_P #define GEMM_DEFAULT_Q QGEMM_DEFAULT_Q #define GEMM_DEFAULT_R QGEMM_DEFAULT_R #define GEMM_DEFAULT_UNROLL_M QGEMM_DEFAULT_UNROLL_M #define GEMM_DEFAULT_UNROLL_N QGEMM_DEFAULT_UNROLL_N #elif defined(DOUBLE) #define GEMM_P DGEMM_P #define GEMM_Q DGEMM_Q #define GEMM_R DGEMM_R #define GEMM_UNROLL_M DGEMM_UNROLL_M #define GEMM_UNROLL_N DGEMM_UNROLL_N #define GEMM_UNROLL_MN DGEMM_UNROLL_MN #define GEMM_DEFAULT_P DGEMM_DEFAULT_P #define GEMM_DEFAULT_Q DGEMM_DEFAULT_Q #define GEMM_DEFAULT_R DGEMM_DEFAULT_R #define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M #define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N #else #define GEMM_P SGEMM_P #define GEMM_Q SGEMM_Q #define GEMM_R SGEMM_R #define GEMM_UNROLL_M SGEMM_UNROLL_M #define GEMM_UNROLL_N SGEMM_UNROLL_N #define GEMM_UNROLL_MN SGEMM_UNROLL_MN #define GEMM_DEFAULT_P SGEMM_DEFAULT_P #define GEMM_DEFAULT_Q SGEMM_DEFAULT_Q #define GEMM_DEFAULT_R SGEMM_DEFAULT_R #define GEMM_DEFAULT_UNROLL_M SGEMM_DEFAULT_UNROLL_M #define GEMM_DEFAULT_UNROLL_N SGEMM_DEFAULT_UNROLL_N #endif #else #if defined(XDOUBLE) #define GEMM_P XGEMM_P #define GEMM_Q XGEMM_Q #define GEMM_R XGEMM_R #define GEMM_UNROLL_M XGEMM_UNROLL_M #define GEMM_UNROLL_N XGEMM_UNROLL_N #define GEMM_UNROLL_MN XGEMM_UNROLL_MN #define GEMM_DEFAULT_P XGEMM_DEFAULT_P #define GEMM_DEFAULT_Q XGEMM_DEFAULT_Q #define GEMM_DEFAULT_R XGEMM_DEFAULT_R #define GEMM_DEFAULT_UNROLL_M XGEMM_DEFAULT_UNROLL_M #define GEMM_DEFAULT_UNROLL_N XGEMM_DEFAULT_UNROLL_N #elif defined(DOUBLE) #define GEMM_P ZGEMM_P #define GEMM_Q ZGEMM_Q #define GEMM_R ZGEMM_R #define GEMM_UNROLL_M ZGEMM_UNROLL_M #define GEMM_UNROLL_N ZGEMM_UNROLL_N #define GEMM_UNROLL_MN ZGEMM_UNROLL_MN #define GEMM_DEFAULT_P ZGEMM_DEFAULT_P #define GEMM_DEFAULT_Q ZGEMM_DEFAULT_Q #define GEMM_DEFAULT_R ZGEMM_DEFAULT_R #define GEMM_DEFAULT_UNROLL_M ZGEMM_DEFAULT_UNROLL_M #define GEMM_DEFAULT_UNROLL_N ZGEMM_DEFAULT_UNROLL_N #else #define GEMM_P CGEMM_P #define GEMM_Q CGEMM_Q #define GEMM_R CGEMM_R #define GEMM_UNROLL_M CGEMM_UNROLL_M #define GEMM_UNROLL_N CGEMM_UNROLL_N #define GEMM_UNROLL_MN CGEMM_UNROLL_MN #define GEMM_DEFAULT_P CGEMM_DEFAULT_P #define GEMM_DEFAULT_Q CGEMM_DEFAULT_Q #define GEMM_DEFAULT_R CGEMM_DEFAULT_R #define GEMM_DEFAULT_UNROLL_M CGEMM_DEFAULT_UNROLL_M #define GEMM_DEFAULT_UNROLL_N CGEMM_DEFAULT_UNROLL_N #endif #endif #ifdef XDOUBLE #define GEMM3M_UNROLL_M XGEMM3M_UNROLL_M 
#define GEMM3M_UNROLL_N XGEMM3M_UNROLL_N #elif defined(DOUBLE) #define GEMM3M_UNROLL_M ZGEMM3M_UNROLL_M #define GEMM3M_UNROLL_N ZGEMM3M_UNROLL_N #else #define GEMM3M_UNROLL_M CGEMM3M_UNROLL_M #define GEMM3M_UNROLL_N CGEMM3M_UNROLL_N #endif #ifndef QGEMM_DEFAULT_UNROLL_M #define QGEMM_DEFAULT_UNROLL_M 2 #endif #ifndef QGEMM_DEFAULT_UNROLL_N #define QGEMM_DEFAULT_UNROLL_N 2 #endif #ifndef XGEMM_DEFAULT_UNROLL_M #define XGEMM_DEFAULT_UNROLL_M 2 #endif #ifndef XGEMM_DEFAULT_UNROLL_N #define XGEMM_DEFAULT_UNROLL_N 2 #endif #ifndef GEMM_THREAD #define GEMM_THREAD gemm_thread_n #endif #ifndef SGEMM_DEFAULT_R #define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15) #endif #ifndef DGEMM_DEFAULT_R #define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15) #endif #ifndef QGEMM_DEFAULT_R #define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15) #endif #ifndef CGEMM_DEFAULT_R #define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15) #endif #ifndef ZGEMM_DEFAULT_R #define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15) #endif #ifndef XGEMM_DEFAULT_R #define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15) #endif #ifndef SNUMOPT #define SNUMOPT 2 #endif #ifndef DNUMOPT #define DNUMOPT 2 #endif #ifndef QNUMOPT #define QNUMOPT 1 #endif #ifndef GEMM3M_P #ifdef XDOUBLE #define GEMM3M_P XGEMM3M_P #elif defined(DOUBLE) #define GEMM3M_P ZGEMM3M_P #else #define GEMM3M_P CGEMM3M_P #endif #endif #ifndef GEMM3M_Q #ifdef XDOUBLE #define GEMM3M_Q XGEMM3M_Q #elif defined(DOUBLE) #define GEMM3M_Q ZGEMM3M_Q #else #define GEMM3M_Q CGEMM3M_Q #endif #endif #ifndef GEMM3M_R #ifdef XDOUBLE #define GEMM3M_R XGEMM3M_R #elif defined(DOUBLE) #define GEMM3M_R ZGEMM3M_R #else #define GEMM3M_R CGEMM3M_R #endif #endif #endif OpenBLAS-0.2.20/common_power.h000066400000000000000000000412621313527062700160260ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #ifndef COMMON_POWER #define COMMON_POWER #if defined(POWER8) #define MB __asm__ __volatile__ ("eieio":::"memory") #define WMB __asm__ __volatile__ ("eieio":::"memory") #else #define MB __asm__ __volatile__ ("sync") #define WMB __asm__ __volatile__ ("sync") #endif #define INLINE inline #ifdef PPC440 #define STDERR stdout #define QNONCACHE 0x1 #define QCOMMS 0x2 #define QFAST 0x4 #endif #ifndef ASSEMBLER void *qalloc(int flags, size_t bytes); static void INLINE blas_lock(volatile unsigned long *address){ long int ret, val = 1; do { while (*address) {YIELDING;}; #if defined(OS_LINUX) || defined(OS_DARWIN) __asm__ __volatile__ ( "0: lwarx %0, 0, %1\n" " cmpwi %0, 0\n" " bne- 1f\n" " stwcx. %2,0, %1\n" " bne- 0b\n" "1: " : "=&r"(ret) : "r"(address), "r" (val) : "cr0", "memory"); #else __asm__ __volatile__ ( ".machine \"any\"\n" " lwarx %0, 0, %1\n" " cmpwi %0, 0\n" " bne- $+12\n" " stwcx. 
%2,0, %1\n" " bne- $-16\n" : "=&r"(ret) : "r"(address), "r" (val) : "cr0", "memory"); #endif } while (ret); } #define BLAS_LOCK_DEFINED static inline unsigned long rpcc(void){ unsigned long ret; #ifdef OS_AIX __asm__ __volatile__(".machine \"any\" ;"); #endif __asm__ __volatile__ ("mftb %0" : "=r" (ret) : ); #if defined(POWER5) || defined(PPC970) return (ret << 6); #else return (ret << 3); #endif } #define RPCC_DEFINED #ifdef __64BIT__ #define RPCC64BIT #endif static inline unsigned long getstackaddr(void){ unsigned long addr; __asm__ __volatile__ ("mr %0, 1" : "=r"(addr) : : "memory"); return addr; }; #if defined(OS_LINUX) || defined(OS_AIX) #define GET_IMAGE(res) __asm__ __volatile__("fmr %0, 2" : "=f"(res) : : "memory") #else #define GET_IMAGE(res) __asm__ __volatile__("fmr %0, f2" : "=f"(res) : : "memory") #define GET_IMAGE_CANCEL #endif #ifdef SMP static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } #endif #endif #ifdef ASSEMBLER #ifdef DOUBLE #define LFD lfd #define LFDX lfdx #define LFPDX lfpdx #define LFSDX lfsdx #define LFXDX lfxdx #define LFDU lfdu #define LFDUX lfdux #define LFPDUX lfpdux #define LFSDUX lfsdux #define LFXDUX lfxdux #define STFD stfd #define STFDX stfdx #define STFPDX stfpdx #define STFSDX stfsdx #define STFXDX stfxdx #define STFDU stfdu #define STFDUX stfdux #define STFPDUX stfpdux #define STFSDUX stfsdux #define STFXDUX stfxdux #define FMADD fmadd #define FMSUB fmsub #define FNMADD fnmadd #define FNMSUB fnmsub #define FMUL fmul #define FADD fadd #define FSUB fsub #else #define LFD lfs #define LFDX lfsx #define LFPDX lfpsx #define LFSDX lfssx #define LFXDX lfxsx #define LFDU lfsu #define LFDUX lfsux #define LFPDUX lfpsux #define LFSDUX lfssux #define LFXDUX lfxsux #define STFD stfs #define STFDX stfsx #define STFPDX stfpsx #define STFSDX stfssx #define STFXDX stfxsx #define STFDU stfsu #define STFDUX stfsux #define STFPDUX stfpsux #define STFSDUX stfssux #define STFXDUX stfxsux #define FMADD fmadds #define FMSUB fmsubs #define FNMADD fnmadds #define FNMSUB fnmsubs #define FMUL fmuls #define FADD fadds #define FSUB fsubs #endif #ifdef __64BIT__ #define LDLONG ld #else #define LDLONG lwz #endif #ifdef OS_DARWIN #define LL(x) L##x #endif #ifdef OS_LINUX #define LL(x) .L##x #endif #ifndef LL #define LL(x) __L##x #endif #if defined(__64BIT__) && defined(USE64BITINT) #define LDINT ld #elif defined(__64BIT__) && !defined(USE64BITINT) #define LDINT lwa #else #define LDINT lwz #endif /* #define DCBT(REGA, REGB, NUM) .long (0x7c00022c | (REGA << 16) | (REGB << 11) | ((NUM) << 21)) #define DCBTST(REGA, REGB, NUM) .long (0x7c0001ec | (REGA << 16) | (REGB << 11) | ((NUM) << 21)) */ #define DSTATTR_H(SIZE, COUNT, STRIDE) ((SIZE << 8) | (COUNT)) #define DSTATTR_L(SIZE, COUNT, STRIDE) (STRIDE) #if defined(PPC970) || defined(POWER3) || defined(POWER4) || defined(POWER5) || defined(PPCG4) #define HAVE_PREFETCH #endif #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) #define DCBT_ARG 0 #else #define DCBT_ARG 8 #endif #ifdef CELL #define L1_DUALFETCH #define L1_PREFETCHSIZE (64 + 128 * 13) #endif #if defined(POWER3) || defined(POWER4) || defined(POWER5) #define L1_DUALFETCH #define L1_PREFETCHSIZE (96 + 128 * 12) #endif #if defined(POWER6) #define L1_DUALFETCH #define L1_PREFETCHSIZE (16 + 128 * 100) #define L1_PREFETCH dcbtst #endif #if defined(POWER8) #define L1_DUALFETCH #define L1_PREFETCHSIZE (16 + 128 * 100) #define L1_PREFETCH dcbtst #endif # #ifndef L1_PREFETCH #define L1_PREFETCH dcbt #endif #ifndef 
L1_PREFETCHW #define L1_PREFETCHW dcbtst #endif #if DCBT_ARG == 0 #define DCBT(REGA, REGB) L1_PREFETCH REGB, REGA #define DCBTST(REGA, REGB) L1_PREFETCHW REGB, REGA #else #define DCBT(REGA, REGB) L1_PREFETCH DCBT_ARG, REGB, REGA #define DCBTST(REGA, REGB) L1_PREFETCHW DCBT_ARG, REGB, REGA #endif #ifndef L1_PREFETCHSIZE #define L1_PREFETCHSIZE (96 + 128 * 12) #endif #if !defined(OS_DARWIN) || defined(NEEDPARAM) #define f0 0 #define f1 1 #define f2 2 #define f3 3 #define f4 4 #define f5 5 #define f6 6 #define f7 7 #define f8 8 #define f9 9 #define f10 10 #define f11 11 #define f12 12 #define f13 13 #define f14 14 #define f15 15 #define f16 16 #define f17 17 #define f18 18 #define f19 19 #define f20 20 #define f21 21 #define f22 22 #define f23 23 #define f24 24 #define f25 25 #define f26 26 #define f27 27 #define f28 28 #define f29 29 #define f30 30 #define f31 31 #define r0 0 #define r1 1 #define r2 2 #define r3 3 #define r4 4 #define r5 5 #define r6 6 #define r7 7 #define r8 8 #define r9 9 #define r10 10 #define r11 11 #define r12 12 #define r13 13 #define r14 14 #define r15 15 #define r16 16 #define r17 17 #define r18 18 #define r19 19 #define r20 20 #define r21 21 #define r22 22 #define r23 23 #define r24 24 #define r25 25 #define r26 26 #define r27 27 #define r28 28 #define r29 29 #define r30 30 #define r31 31 #define v0 0 #define v1 1 #define v2 2 #define v3 3 #define v4 4 #define v5 5 #define v6 6 #define v7 7 #define v8 8 #define v9 9 #define v10 10 #define v11 11 #define v12 12 #define v13 13 #define v14 14 #define v15 15 #define v16 16 #define v17 17 #define v18 18 #define v19 19 #define v20 20 #define v21 21 #define v22 22 #define v23 23 #define v24 24 #define v25 25 #define v26 26 #define v27 27 #define v28 28 #define v29 29 #define v30 30 #define v31 31 #define BO_dCTR_NZERO_AND_NOT 0 #define BO_dCTR_NZERO_AND_NOT_1 1 #define BO_dCTR_ZERO_AND_NOT 2 #define BO_dCTR_ZERO_AND_NOT_1 3 #define BO_IF_NOT 4 #define BO_IF_NOT_1 5 #define BO_IF_NOT_2 6 #define BO_IF_NOT_3 7 #define BO_dCTR_NZERO_AND 8 #define BO_dCTR_NZERO_AND_1 9 #define BO_dCTR_ZERO_AND 10 #define BO_dCTR_ZERO_AND_1 11 #define BO_IF 12 #define BO_IF_1 13 #define BO_IF_2 14 #define BO_IF_3 15 #define BO_dCTR_NZERO 16 #define BO_dCTR_NZERO_1 17 #define BO_dCTR_ZERO 18 #define BO_dCTR_ZERO_1 19 #define BO_ALWAYS 20 #define BO_ALWAYS_1 21 #define BO_ALWAYS_2 22 #define BO_ALWAYS_3 23 #define BO_dCTR_NZERO_8 24 #define BO_dCTR_NZERO_9 25 #define BO_dCTR_ZERO_8 26 #define BO_dCTR_ZERO_9 27 #define BO_ALWAYS_8 28 #define BO_ALWAYS_9 29 #define BO_ALWAYS_10 30 #define BO_ALWAYS_11 31 #define CR0_LT 0 #define CR0_GT 1 #define CR0_EQ 2 #define CR0_SO 3 #define CR1_FX 4 #define CR1_FEX 5 #define CR1_VX 6 #define CR1_OX 7 #define CR2_LT 8 #define CR2_GT 9 #define CR2_EQ 10 #define CR2_SO 11 #define CR3_LT 12 #define CR3_GT 13 #define CR3_EQ 14 #define CR3_SO 15 #define CR4_LT 16 #define CR4_GT 17 #define CR4_EQ 18 #define CR4_SO 19 #define CR5_LT 20 #define CR5_GT 21 #define CR5_EQ 22 #define CR5_SO 23 #define CR6_LT 24 #define CR6_GT 25 #define CR6_EQ 26 #define CR6_SO 27 #define CR7_LT 28 #define CR7_GT 29 #define CR7_EQ 30 #define CR7_SO 31 #define TO_LT 16 #define TO_GT 8 #define TO_EQ 4 #define TO_LLT 2 #define TO_LGT 1 #define CR0 0 #define CR1 1 #define CR2 2 #define CR3 3 #define CR4 4 #define CR5 5 #define CR6 6 #define CR7 7 #define cr0 0 #define cr1 1 #define cr2 2 #define cr3 3 #define cr4 4 #define cr5 5 #define cr6 6 #define cr7 7 #define VRsave 256 #endif #define CTR 9 #define SP r1 #ifdef __64BIT__ #define slwi sldi 
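/* Descriptive note: under __64BIT__ the 32-bit PowerPC mnemonics used by the
 * shared assembly kernels are remapped to their doubleword equivalents
 * (slwi -> sldi, cmpwi -> cmpdi, srawi -> sradi, mullw -> mulld), so the same
 * kernel sources assemble correctly for both 32- and 64-bit targets. */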
#define cmpwi cmpdi #define srawi sradi #define mullw mulld #endif #ifndef F_INTERFACE #define REALNAME ASMNAME #else #define REALNAME ASMFNAME #endif #if defined(ASSEMBLER) && !defined(NEEDPARAM) #ifdef OS_LINUX #ifndef __64BIT__ #define PROLOGUE \ .section .text;\ .align 6;\ .globl REALNAME;\ .type REALNAME, @function;\ REALNAME: #define EPILOGUE .size REALNAME, .-REALNAME #else #if _CALL_ELF == 2 #define PROLOGUE \ .section .text;\ .align 6;\ .globl REALNAME;\ .type REALNAME, @function;\ REALNAME: #define EPILOGUE .size REALNAME, .-REALNAME #else #define PROLOGUE \ .section .text;\ .align 5;\ .globl REALNAME;\ .section ".opd","aw";\ .align 3;\ REALNAME:;\ .quad .REALNAME, .TOC.@tocbase, 0;\ .previous;\ .size REALNAME, 24;\ .type .REALNAME, @function;\ .globl .REALNAME;\ .REALNAME: #define EPILOGUE \ .long 0 ; \ .byte 0,0,0,1,128,0,0,0 ; \ .size .REALNAME, .-.REALNAME; \ .section .note.GNU-stack,"",@progbits #endif #endif #ifdef PROFILE #ifndef __64BIT__ #define PROFCODE ;\ .section ".data";\ .align 2;\ .LP3:;\ .long 0;\ .section ".text";\ mflr r0;\ stw r0, 4(SP);\ lis r12, .LP3@ha;\ la r0, .LP3@l(r12);\ bl _mcount;\ lwz r0, 4(SP);\ mtlr r0 #else #define PROFCODE \ .globl _mcount; \ mflr r0; \ std r0, 16(SP); \ mr r11, SP; \ addi SP, SP, -256; \ std r11, 0(SP); \ std r3, 128(SP); \ std r4, 136(SP); \ std r5, 144(SP); \ std r6, 152(SP); \ std r7, 160(SP); \ std r8, 168(SP); \ std r9, 176(SP); \ std r10, 184(SP); \ stfd f3, 192(SP); \ stfd f4, 200(SP); \ bl ._mcount; \ nop; \ ld r3, 128(SP);\ ld r4, 136(SP);\ ld r5, 144(SP);\ ld r6, 152(SP);\ ld r7, 160(SP);\ ld r8, 168(SP);\ ld r9, 176(SP);\ ld r10, 184(SP);\ lfd f3, 192(SP);\ lfd f4, 200(SP);\ addi SP, SP, 256;\ ld r0, 16(SP);\ mtlr r0 #endif #else #define PROFCODE #endif #endif #if OS_AIX #ifndef __64BIT__ #define PROLOGUE \ .machine "any";\ .globl .REALNAME;\ .csect .text[PR],5;\ .REALNAME:; #define EPILOGUE \ _section_.text:;\ .csect .data[RW],4;\ .long _section_.text; #else #define PROLOGUE \ .machine "any";\ .globl .REALNAME;\ .csect .text[PR], 5;\ .REALNAME:; #define EPILOGUE \ _section_.text:;\ .csect .data[RW],4;\ .llong _section_.text; #endif #define PROFCODE #endif #ifdef OS_DARWIN #ifndef __64BIT__ .macro PROLOGUE .section __TEXT,__text,regular,pure_instructions .section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32 .machine ppc .text .align 4 .globl REALNAME REALNAME: .endmacro #else .macro PROLOGUE .section __TEXT,__text,regular,pure_instructions .section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32 .machine ppc64 .text .align 4 .globl REALNAME REALNAME: .endmacro #endif #ifndef PROFILE #define PROFCODE #define EPILOGUE .subsections_via_symbols #else #ifndef __64BIT__ .macro PROFCODE mflr r0 stw r0, 8(SP) addi SP, SP, -64 stw SP, 0(SP) stw r3, 12(SP) stw r4, 16(SP) stw r5, 20(SP) stw r6, 24(SP) stw r7, 28(SP) stw r8, 32(SP) stw r9, 36(SP) stw r10, 40(SP) stfd f1, 48(SP) stfd f2, 56(SP) mr r3, r0 bl Lmcount$stub nop lwz r3, 12(SP) lwz r4, 16(SP) lwz r5, 20(SP) lwz r6, 24(SP) lwz r7, 28(SP) lwz r8, 32(SP) lwz r9, 36(SP) lwz r10, 40(SP) lfd f1, 48(SP) lfd f2, 56(SP) addi SP, SP, 64 lwz r0, 8(SP) mtlr r0 .endmacro .macro EPILOGUE .section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32 .align 5 Lmcount$stub: .indirect_symbol mcount mflr r0 bcl 20,31,L00000000001$spb L00000000001$spb: mflr r11 addis r11,r11,ha16(Lmcount$lazy_ptr-L00000000001$spb) mtlr r0 lwzu r12,lo16(Lmcount$lazy_ptr-L00000000001$spb)(r11) mtctr r12 bctr .lazy_symbol_pointer Lmcount$lazy_ptr: .indirect_symbol mcount .long 
dyld_stub_binding_helper .subsections_via_symbols .endmacro #else .macro PROFCODE mflr r0 std r0, 16(SP) addi SP, SP, -128 std SP, 0(SP) std r3, 24(SP) std r4, 32(SP) std r5, 40(SP) std r6, 48(SP) std r7, 56(SP) std r8, 64(SP) std r9, 72(SP) std r10, 80(SP) stfd f1, 88(SP) stfd f2, 96(SP) mr r3, r0 bl Lmcount$stub nop ld r3, 24(SP) ld r4, 32(SP) ld r5, 40(SP) ld r6, 48(SP) ld r7, 56(SP) ld r8, 64(SP) ld r9, 72(SP) ld r10, 80(SP) lfd f1, 88(SP) lfd f2, 86(SP) addi SP, SP, 128 ld r0, 16(SP) mtlr r0 .endmacro .macro EPILOGUE .data .section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32 .align 5 Lmcount$stub: .indirect_symbol mcount mflr r0 bcl 20,31,L00000000001$spb L00000000001$spb: mflr r11 addis r11,r11,ha16(Lmcount$lazy_ptr-L00000000001$spb) mtlr r0 ld r12,lo16(Lmcount$lazy_ptr-L00000000001$spb)(r11) mtctr r12 bctr .lazy_symbol_pointer Lmcount$lazy_ptr: .indirect_symbol mcount .quad dyld_stub_binding_helper .subsections_via_symbols .endmacro #endif #endif #endif #endif #endif #define HALT mfspr r0, 1023 #ifdef OS_LINUX #if defined(PPC440) || defined(PPC440FP2) #undef MAX_CPU_NUMBER #define MAX_CPU_NUMBER 1 #endif #if !defined(__64BIT__) && !defined(PROFILE) && !defined(PPC440) && !defined(PPC440FP2) #define START_ADDRESS (0x0b000000UL) #else #define SEEK_ADDRESS #endif #endif #ifdef OS_AIX #ifndef __64BIT__ #define START_ADDRESS (0xf0000000UL) #else #define SEEK_ADDRESS #endif #endif #ifdef OS_DARWIN #define SEEK_ADDRESS #endif #if defined(PPC440) #define BUFFER_SIZE ( 2 << 20) #elif defined(PPC440FP2) #define BUFFER_SIZE ( 16 << 20) #elif defined(POWER8) #define BUFFER_SIZE ( 64 << 20) #else #define BUFFER_SIZE ( 16 << 20) #endif #ifndef PAGESIZE #define PAGESIZE ( 4 << 10) #endif #define HUGE_PAGESIZE (16 << 20) #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) #ifndef MAP_ANONYMOUS #define MAP_ANONYMOUS MAP_ANON #endif #ifdef OS_LINUX #ifndef __64BIT__ #define FRAMESLOT(X) (((X) * 4) + 8) #else #if _CALL_ELF == 2 #define FRAMESLOT(X) (((X) * 8) + 96) #else #define FRAMESLOT(X) (((X) * 8) + 112) #endif #endif #endif #if defined(OS_AIX) || defined(OS_DARWIN) #ifndef __64BIT__ #define FRAMESLOT(X) (((X) * 4) + 56) #else #define FRAMESLOT(X) (((X) * 8) + 112) #endif #endif #endif OpenBLAS-0.2.20/common_q.h000066400000000000000000000326711313527062700151360ustar00rootroot00000000000000#ifndef COMMON_Q_H #define COMMON_Q_H #ifndef DYNAMIC_ARCH #define QAMAX_K qamax_k #define QAMIN_K qamin_k #define QMAX_K qmax_k #define QMIN_K qmin_k #define IQAMAX_K iqamax_k #define IQAMIN_K iqamin_k #define IQMAX_K iqmax_k #define IQMIN_K iqmin_k #define QASUM_K qasum_k #define QAXPYU_K qaxpy_k #define QAXPYC_K qaxpy_k #define QCOPY_K qcopy_k #define QDOTU_K qdot_k #define QDOTC_K qdot_k #define QNRM2_K qnrm2_k #define QSCAL_K qscal_k #define QSWAP_K qswap_k #define QROT_K qrot_k #define QGEMV_N qgemv_n #define QGEMV_T qgemv_t #define QGEMV_R qgemv_n #define QGEMV_C qgemv_t #define QGEMV_O qgemv_n #define QGEMV_U qgemv_t #define QGEMV_S qgemv_n #define QGEMV_D qgemv_t #define QGERU_K qger_k #define QGERC_K qger_k #define QGERV_K qger_k #define QGERD_K qger_k #define QSYMV_U qsymv_U #define QSYMV_L qsymv_L #define QSYMV_THREAD_U qsymv_thread_U #define QSYMV_THREAD_L qsymv_thread_L #define QGEMM_ONCOPY qgemm_oncopy #define QGEMM_OTCOPY qgemm_otcopy #if QGEMM_DEFAULT_UNROLL_M == QGEMM_DEFAULT_UNROLL_N #define QGEMM_INCOPY qgemm_oncopy #define QGEMM_ITCOPY qgemm_otcopy #else #define QGEMM_INCOPY qgemm_incopy #define QGEMM_ITCOPY qgemm_itcopy #endif #define QTRMM_OUNUCOPY 
qtrmm_ounucopy #define QTRMM_OUNNCOPY qtrmm_ounncopy #define QTRMM_OUTUCOPY qtrmm_outucopy #define QTRMM_OUTNCOPY qtrmm_outncopy #define QTRMM_OLNUCOPY qtrmm_olnucopy #define QTRMM_OLNNCOPY qtrmm_olnncopy #define QTRMM_OLTUCOPY qtrmm_oltucopy #define QTRMM_OLTNCOPY qtrmm_oltncopy #define QTRSM_OUNUCOPY qtrsm_ounucopy #define QTRSM_OUNNCOPY qtrsm_ounncopy #define QTRSM_OUTUCOPY qtrsm_outucopy #define QTRSM_OUTNCOPY qtrsm_outncopy #define QTRSM_OLNUCOPY qtrsm_olnucopy #define QTRSM_OLNNCOPY qtrsm_olnncopy #define QTRSM_OLTUCOPY qtrsm_oltucopy #define QTRSM_OLTNCOPY qtrsm_oltncopy #if QGEMM_DEFAULT_UNROLL_M == QGEMM_DEFAULT_UNROLL_N #define QTRMM_IUNUCOPY qtrmm_ounucopy #define QTRMM_IUNNCOPY qtrmm_ounncopy #define QTRMM_IUTUCOPY qtrmm_outucopy #define QTRMM_IUTNCOPY qtrmm_outncopy #define QTRMM_ILNUCOPY qtrmm_olnucopy #define QTRMM_ILNNCOPY qtrmm_olnncopy #define QTRMM_ILTUCOPY qtrmm_oltucopy #define QTRMM_ILTNCOPY qtrmm_oltncopy #define QTRSM_IUNUCOPY qtrsm_ounucopy #define QTRSM_IUNNCOPY qtrsm_ounncopy #define QTRSM_IUTUCOPY qtrsm_outucopy #define QTRSM_IUTNCOPY qtrsm_outncopy #define QTRSM_ILNUCOPY qtrsm_olnucopy #define QTRSM_ILNNCOPY qtrsm_olnncopy #define QTRSM_ILTUCOPY qtrsm_oltucopy #define QTRSM_ILTNCOPY qtrsm_oltncopy #else #define QTRMM_IUNUCOPY qtrmm_iunucopy #define QTRMM_IUNNCOPY qtrmm_iunncopy #define QTRMM_IUTUCOPY qtrmm_iutucopy #define QTRMM_IUTNCOPY qtrmm_iutncopy #define QTRMM_ILNUCOPY qtrmm_ilnucopy #define QTRMM_ILNNCOPY qtrmm_ilnncopy #define QTRMM_ILTUCOPY qtrmm_iltucopy #define QTRMM_ILTNCOPY qtrmm_iltncopy #define QTRSM_IUNUCOPY qtrsm_iunucopy #define QTRSM_IUNNCOPY qtrsm_iunncopy #define QTRSM_IUTUCOPY qtrsm_iutucopy #define QTRSM_IUTNCOPY qtrsm_iutncopy #define QTRSM_ILNUCOPY qtrsm_ilnucopy #define QTRSM_ILNNCOPY qtrsm_ilnncopy #define QTRSM_ILTUCOPY qtrsm_iltucopy #define QTRSM_ILTNCOPY qtrsm_iltncopy #endif #define QGEMM_BETA qgemm_beta #define QGEMM_KERNEL qgemm_kernel #define QTRMM_KERNEL_LN qtrmm_kernel_LN #define QTRMM_KERNEL_LT qtrmm_kernel_LT #define QTRMM_KERNEL_LR qtrmm_kernel_LN #define QTRMM_KERNEL_LC qtrmm_kernel_LT #define QTRMM_KERNEL_RN qtrmm_kernel_RN #define QTRMM_KERNEL_RT qtrmm_kernel_RT #define QTRMM_KERNEL_RR qtrmm_kernel_RN #define QTRMM_KERNEL_RC qtrmm_kernel_RT #define QTRSM_KERNEL_LN qtrsm_kernel_LN #define QTRSM_KERNEL_LT qtrsm_kernel_LT #define QTRSM_KERNEL_LR qtrsm_kernel_LN #define QTRSM_KERNEL_LC qtrsm_kernel_LT #define QTRSM_KERNEL_RN qtrsm_kernel_RN #define QTRSM_KERNEL_RT qtrsm_kernel_RT #define QTRSM_KERNEL_RR qtrsm_kernel_RN #define QTRSM_KERNEL_RC qtrsm_kernel_RT #define QSYMM_OUTCOPY qsymm_outcopy #define QSYMM_OLTCOPY qsymm_oltcopy #if QGEMM_DEFAULT_UNROLL_M == QGEMM_DEFAULT_UNROLL_N #define QSYMM_IUTCOPY qsymm_outcopy #define QSYMM_ILTCOPY qsymm_oltcopy #else #define QSYMM_IUTCOPY qsymm_iutcopy #define QSYMM_ILTCOPY qsymm_iltcopy #endif #define QNEG_TCOPY qneg_tcopy #define QLASWP_NCOPY qlaswp_ncopy #else #define QAMAX_K gotoblas -> qamax_k #define QAMIN_K gotoblas -> qamin_k #define QMAX_K gotoblas -> qmax_k #define QMIN_K gotoblas -> qmin_k #define IQAMAX_K gotoblas -> iqamax_k #define IQAMIN_K gotoblas -> iqamin_k #define IQMAX_K gotoblas -> iqmax_k #define IQMIN_K gotoblas -> iqmin_k #define QASUM_K gotoblas -> qasum_k #define QAXPYU_K gotoblas -> qaxpy_k #define QAXPYC_K gotoblas -> qaxpy_k #define QCOPY_K gotoblas -> qcopy_k #define QDOTU_K gotoblas -> qdot_k #define QDOTC_K gotoblas -> qdot_k #define QNRM2_K gotoblas -> qnrm2_k #define QSCAL_K gotoblas -> qscal_k #define QSWAP_K gotoblas -> qswap_k #define QROT_K 
gotoblas -> qrot_k #define QGEMV_N gotoblas -> qgemv_n #define QGEMV_T gotoblas -> qgemv_t #define QGEMV_R gotoblas -> qgemv_n #define QGEMV_C gotoblas -> qgemv_t #define QGEMV_O gotoblas -> qgemv_n #define QGEMV_U gotoblas -> qgemv_t #define QGEMV_S gotoblas -> qgemv_n #define QGEMV_D gotoblas -> qgemv_t #define QGERU_K gotoblas -> qger_k #define QGERC_K gotoblas -> qger_k #define QGERV_K gotoblas -> qger_k #define QGERD_K gotoblas -> qger_k #define QSYMV_U gotoblas -> qsymv_U #define QSYMV_L gotoblas -> qsymv_L #define QSYMV_THREAD_U qsymv_thread_U #define QSYMV_THREAD_L qsymv_thread_L #define QGEMM_ONCOPY gotoblas -> qgemm_oncopy #define QGEMM_OTCOPY gotoblas -> qgemm_otcopy #define QGEMM_INCOPY gotoblas -> qgemm_incopy #define QGEMM_ITCOPY gotoblas -> qgemm_itcopy #define QTRMM_OUNUCOPY gotoblas -> qtrmm_ounucopy #define QTRMM_OUTUCOPY gotoblas -> qtrmm_outucopy #define QTRMM_OLNUCOPY gotoblas -> qtrmm_olnucopy #define QTRMM_OLTUCOPY gotoblas -> qtrmm_oltucopy #define QTRSM_OUNUCOPY gotoblas -> qtrsm_ounucopy #define QTRSM_OUTUCOPY gotoblas -> qtrsm_outucopy #define QTRSM_OLNUCOPY gotoblas -> qtrsm_olnucopy #define QTRSM_OLTUCOPY gotoblas -> qtrsm_oltucopy #define QTRMM_IUNUCOPY gotoblas -> qtrmm_iunucopy #define QTRMM_IUTUCOPY gotoblas -> qtrmm_iutucopy #define QTRMM_ILNUCOPY gotoblas -> qtrmm_ilnucopy #define QTRMM_ILTUCOPY gotoblas -> qtrmm_iltucopy #define QTRSM_IUNUCOPY gotoblas -> qtrsm_iunucopy #define QTRSM_IUTUCOPY gotoblas -> qtrsm_iutucopy #define QTRSM_ILNUCOPY gotoblas -> qtrsm_ilnucopy #define QTRSM_ILTUCOPY gotoblas -> qtrsm_iltucopy #define QTRMM_OUNNCOPY gotoblas -> qtrmm_ounncopy #define QTRMM_OUTNCOPY gotoblas -> qtrmm_outncopy #define QTRMM_OLNNCOPY gotoblas -> qtrmm_olnncopy #define QTRMM_OLTNCOPY gotoblas -> qtrmm_oltncopy #define QTRSM_OUNNCOPY gotoblas -> qtrsm_ounncopy #define QTRSM_OUTNCOPY gotoblas -> qtrsm_outncopy #define QTRSM_OLNNCOPY gotoblas -> qtrsm_olnncopy #define QTRSM_OLTNCOPY gotoblas -> qtrsm_oltncopy #define QTRMM_IUNNCOPY gotoblas -> qtrmm_iunncopy #define QTRMM_IUTNCOPY gotoblas -> qtrmm_iutncopy #define QTRMM_ILNNCOPY gotoblas -> qtrmm_ilnncopy #define QTRMM_ILTNCOPY gotoblas -> qtrmm_iltncopy #define QTRSM_IUNNCOPY gotoblas -> qtrsm_iunncopy #define QTRSM_IUTNCOPY gotoblas -> qtrsm_iutncopy #define QTRSM_ILNNCOPY gotoblas -> qtrsm_ilnncopy #define QTRSM_ILTNCOPY gotoblas -> qtrsm_iltncopy #define QGEMM_BETA gotoblas -> qgemm_beta #define QGEMM_KERNEL gotoblas -> qgemm_kernel #define QTRMM_KERNEL_LN gotoblas -> qtrmm_kernel_LN #define QTRMM_KERNEL_LT gotoblas -> qtrmm_kernel_LT #define QTRMM_KERNEL_LR gotoblas -> qtrmm_kernel_LN #define QTRMM_KERNEL_LC gotoblas -> qtrmm_kernel_LT #define QTRMM_KERNEL_RN gotoblas -> qtrmm_kernel_RN #define QTRMM_KERNEL_RT gotoblas -> qtrmm_kernel_RT #define QTRMM_KERNEL_RR gotoblas -> qtrmm_kernel_RN #define QTRMM_KERNEL_RC gotoblas -> qtrmm_kernel_RT #define QTRSM_KERNEL_LN gotoblas -> qtrsm_kernel_LN #define QTRSM_KERNEL_LT gotoblas -> qtrsm_kernel_LT #define QTRSM_KERNEL_LR gotoblas -> qtrsm_kernel_LN #define QTRSM_KERNEL_LC gotoblas -> qtrsm_kernel_LT #define QTRSM_KERNEL_RN gotoblas -> qtrsm_kernel_RN #define QTRSM_KERNEL_RT gotoblas -> qtrsm_kernel_RT #define QTRSM_KERNEL_RR gotoblas -> qtrsm_kernel_RN #define QTRSM_KERNEL_RC gotoblas -> qtrsm_kernel_RT #define QSYMM_IUTCOPY gotoblas -> qsymm_iutcopy #define QSYMM_ILTCOPY gotoblas -> qsymm_iltcopy #define QSYMM_OUTCOPY gotoblas -> qsymm_outcopy #define QSYMM_OLTCOPY gotoblas -> qsymm_oltcopy #define QNEG_TCOPY gotoblas -> qneg_tcopy #define QLASWP_NCOPY 
gotoblas -> qlaswp_ncopy #endif #define QGEMM_NN qgemm_nn #define QGEMM_CN qgemm_tn #define QGEMM_TN qgemm_tn #define QGEMM_NC qgemm_nt #define QGEMM_NT qgemm_nt #define QGEMM_CC qgemm_tt #define QGEMM_CT qgemm_tt #define QGEMM_TC qgemm_tt #define QGEMM_TT qgemm_tt #define QGEMM_NR qgemm_nn #define QGEMM_TR qgemm_tn #define QGEMM_CR qgemm_tn #define QGEMM_RN qgemm_nn #define QGEMM_RT qgemm_nt #define QGEMM_RC qgemm_nt #define QGEMM_RR qgemm_nn #define QSYMM_LU qsymm_LU #define QSYMM_LL qsymm_LL #define QSYMM_RU qsymm_RU #define QSYMM_RL qsymm_RL #define QHEMM_LU qhemm_LU #define QHEMM_LL qhemm_LL #define QHEMM_RU qhemm_RU #define QHEMM_RL qhemm_RL #define QSYRK_UN qsyrk_UN #define QSYRK_UT qsyrk_UT #define QSYRK_LN qsyrk_LN #define QSYRK_LT qsyrk_LT #define QSYRK_UR qsyrk_UN #define QSYRK_UC qsyrk_UT #define QSYRK_LR qsyrk_LN #define QSYRK_LC qsyrk_LT #define QSYRK_KERNEL_U qsyrk_kernel_U #define QSYRK_KERNEL_L qsyrk_kernel_L #define QHERK_UN qsyrk_UN #define QHERK_LN qsyrk_LN #define QHERK_UC qsyrk_UT #define QHERK_LC qsyrk_LT #define QHER2K_UN qsyr2k_UN #define QHER2K_LN qsyr2k_LN #define QHER2K_UC qsyr2k_UT #define QHER2K_LC qsyr2k_LT #define QSYR2K_UN qsyr2k_UN #define QSYR2K_UT qsyr2k_UT #define QSYR2K_LN qsyr2k_LN #define QSYR2K_LT qsyr2k_LT #define QSYR2K_UR qsyr2k_UN #define QSYR2K_UC qsyr2k_UT #define QSYR2K_LR qsyr2k_LN #define QSYR2K_LC qsyr2k_LT #define QSYR2K_KERNEL_U qsyr2k_kernel_U #define QSYR2K_KERNEL_L qsyr2k_kernel_L #define QTRMM_LNUU qtrmm_LNUU #define QTRMM_LNUN qtrmm_LNUN #define QTRMM_LNLU qtrmm_LNLU #define QTRMM_LNLN qtrmm_LNLN #define QTRMM_LTUU qtrmm_LTUU #define QTRMM_LTUN qtrmm_LTUN #define QTRMM_LTLU qtrmm_LTLU #define QTRMM_LTLN qtrmm_LTLN #define QTRMM_LRUU qtrmm_LNUU #define QTRMM_LRUN qtrmm_LNUN #define QTRMM_LRLU qtrmm_LNLU #define QTRMM_LRLN qtrmm_LNLN #define QTRMM_LCUU qtrmm_LTUU #define QTRMM_LCUN qtrmm_LTUN #define QTRMM_LCLU qtrmm_LTLU #define QTRMM_LCLN qtrmm_LTLN #define QTRMM_RNUU qtrmm_RNUU #define QTRMM_RNUN qtrmm_RNUN #define QTRMM_RNLU qtrmm_RNLU #define QTRMM_RNLN qtrmm_RNLN #define QTRMM_RTUU qtrmm_RTUU #define QTRMM_RTUN qtrmm_RTUN #define QTRMM_RTLU qtrmm_RTLU #define QTRMM_RTLN qtrmm_RTLN #define QTRMM_RRUU qtrmm_RNUU #define QTRMM_RRUN qtrmm_RNUN #define QTRMM_RRLU qtrmm_RNLU #define QTRMM_RRLN qtrmm_RNLN #define QTRMM_RCUU qtrmm_RTUU #define QTRMM_RCUN qtrmm_RTUN #define QTRMM_RCLU qtrmm_RTLU #define QTRMM_RCLN qtrmm_RTLN #define QTRSM_LNUU qtrsm_LNUU #define QTRSM_LNUN qtrsm_LNUN #define QTRSM_LNLU qtrsm_LNLU #define QTRSM_LNLN qtrsm_LNLN #define QTRSM_LTUU qtrsm_LTUU #define QTRSM_LTUN qtrsm_LTUN #define QTRSM_LTLU qtrsm_LTLU #define QTRSM_LTLN qtrsm_LTLN #define QTRSM_LRUU qtrsm_LNUU #define QTRSM_LRUN qtrsm_LNUN #define QTRSM_LRLU qtrsm_LNLU #define QTRSM_LRLN qtrsm_LNLN #define QTRSM_LCUU qtrsm_LTUU #define QTRSM_LCUN qtrsm_LTUN #define QTRSM_LCLU qtrsm_LTLU #define QTRSM_LCLN qtrsm_LTLN #define QTRSM_RNUU qtrsm_RNUU #define QTRSM_RNUN qtrsm_RNUN #define QTRSM_RNLU qtrsm_RNLU #define QTRSM_RNLN qtrsm_RNLN #define QTRSM_RTUU qtrsm_RTUU #define QTRSM_RTUN qtrsm_RTUN #define QTRSM_RTLU qtrsm_RTLU #define QTRSM_RTLN qtrsm_RTLN #define QTRSM_RRUU qtrsm_RNUU #define QTRSM_RRUN qtrsm_RNUN #define QTRSM_RRLU qtrsm_RNLU #define QTRSM_RRLN qtrsm_RNLN #define QTRSM_RCUU qtrsm_RTUU #define QTRSM_RCUN qtrsm_RTUN #define QTRSM_RCLU qtrsm_RTLU #define QTRSM_RCLN qtrsm_RTLN #define QGEMM_THREAD_NN qgemm_thread_nn #define QGEMM_THREAD_CN qgemm_thread_tn #define QGEMM_THREAD_TN qgemm_thread_tn #define QGEMM_THREAD_NC qgemm_thread_nt #define 
QGEMM_THREAD_NT qgemm_thread_nt #define QGEMM_THREAD_CC qgemm_thread_tt #define QGEMM_THREAD_CT qgemm_thread_tt #define QGEMM_THREAD_TC qgemm_thread_tt #define QGEMM_THREAD_TT qgemm_thread_tt #define QGEMM_THREAD_NR qgemm_thread_nn #define QGEMM_THREAD_TR qgemm_thread_tn #define QGEMM_THREAD_CR qgemm_thread_tn #define QGEMM_THREAD_RN qgemm_thread_nn #define QGEMM_THREAD_RT qgemm_thread_nt #define QGEMM_THREAD_RC qgemm_thread_nt #define QGEMM_THREAD_RR qgemm_thread_nn #define QSYMM_THREAD_LU qsymm_thread_LU #define QSYMM_THREAD_LL qsymm_thread_LL #define QSYMM_THREAD_RU qsymm_thread_RU #define QSYMM_THREAD_RL qsymm_thread_RL #define QHEMM_THREAD_LU qhemm_thread_LU #define QHEMM_THREAD_LL qhemm_thread_LL #define QHEMM_THREAD_RU qhemm_thread_RU #define QHEMM_THREAD_RL qhemm_thread_RL #define QSYRK_THREAD_UN qsyrk_thread_UN #define QSYRK_THREAD_UT qsyrk_thread_UT #define QSYRK_THREAD_LN qsyrk_thread_LN #define QSYRK_THREAD_LT qsyrk_thread_LT #define QSYRK_THREAD_UR qsyrk_thread_UN #define QSYRK_THREAD_UC qsyrk_thread_UT #define QSYRK_THREAD_LR qsyrk_thread_LN #define QSYRK_THREAD_LC qsyrk_thread_LT #define QHERK_THREAD_UN qsyrk_thread_UN #define QHERK_THREAD_UT qsyrk_thread_UT #define QHERK_THREAD_LN qsyrk_thread_LN #define QHERK_THREAD_LT qsyrk_thread_LT #define QHERK_THREAD_UR qsyrk_thread_UN #define QHERK_THREAD_UC qsyrk_thread_UT #define QHERK_THREAD_LR qsyrk_thread_LN #define QHERK_THREAD_LC qsyrk_thread_LT #endif OpenBLAS-0.2.20/common_reference.h000066400000000000000000000075651313527062700166400ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ #ifndef ASSEMBLER #define REF_BU f #define BLASFUNC_REF_2(x,y) BLASFUNC(x## y) #define BLASFUNC_REF_1(x,y) BLASFUNC_REF_2(x,y) #define BLASFUNC_REF(x) BLASFUNC_REF_1(x,REF_BU) void BLASFUNC_REF(srot) (blasint *, float *, blasint *, float *, blasint *, float *, float *); void BLASFUNC_REF(drot) (blasint *, double *, blasint *, double *, blasint *, double *, double *); void BLASFUNC_REF(qrot) (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *); void BLASFUNC_REF(csrot) (blasint *, float *, blasint *, float *, blasint *, float *, float *); void BLASFUNC_REF(zdrot) (blasint *, double *, blasint *, double *, blasint *, double *, double *); void BLASFUNC_REF(xqrot) (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *); void BLASFUNC_REF(sswap) (blasint *, float *, blasint *, float *, blasint *); void BLASFUNC_REF(dswap) (blasint *, double *, blasint *, double *, blasint *); void BLASFUNC_REF(qswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC_REF(cswap) (blasint *, float *, blasint *, float *, blasint *); void BLASFUNC_REF(zswap) (blasint *, double *, blasint *, double *, blasint *); void BLASFUNC_REF(xswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC_REF(saxpy) (blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC_REF(daxpy) (blasint *, double *, double *, blasint *, double *, blasint *); void BLASFUNC_REF(caxpy) (blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC_REF(zaxpy) (blasint *, double *, double *, blasint *, double *, blasint *); float _Complex BLASFUNC_REF(cdotu) (blasint *, float *, blasint *, float *, blasint *); float _Complex BLASFUNC_REF(cdotc) (blasint *, float *, blasint *, float *, blasint *); double _Complex BLASFUNC_REF(zdotu) (blasint *, double *, blasint *, double *, blasint *); double _Complex BLASFUNC_REF(zdotc) (blasint *, double *, blasint *, double *, blasint *); void BLASFUNC_REF(drotmg)(double *, double *, double *, double *, double *); double BLASFUNC_REF(dsdot)(blasint *, float *, blasint *, float *, blasint*); FLOATRET BLASFUNC_REF(samax) (blasint *, float *, blasint *); #endif OpenBLAS-0.2.20/common_s.h000066400000000000000000000351161313527062700151350ustar00rootroot00000000000000#ifndef COMMON_S_H #define COMMON_S_H #ifndef DYNAMIC_ARCH #define SAMAX_K samax_k #define SAMIN_K samin_k #define SMAX_K smax_k #define SMIN_K smin_k #define ISAMAX_K isamax_k #define ISAMIN_K isamin_k #define ISMAX_K ismax_k #define ISMIN_K ismin_k #define SASUM_K sasum_k #define SAXPYU_K saxpy_k #define SAXPYC_K saxpy_k #define SCOPY_K scopy_k #define SDOTU_K sdot_k #define SDOTC_K sdot_k #define SDSDOT_K dsdot_k #define DSDOT_K dsdot_k #define SNRM2_K snrm2_k #define SSCAL_K sscal_k #define SSWAP_K sswap_k #define SROT_K srot_k #define SGEMV_N sgemv_n #define SGEMV_T sgemv_t #define SGEMV_R sgemv_n #define SGEMV_C sgemv_t #define SGEMV_O sgemv_n #define SGEMV_U sgemv_t #define SGEMV_S sgemv_n #define SGEMV_D sgemv_t #define SGERU_K sger_k #define SGERC_K sger_k #define SGERV_K sger_k #define SGERD_K sger_k #define SSYMV_U ssymv_U #define SSYMV_L ssymv_L #define SSYMV_THREAD_U ssymv_thread_U #define SSYMV_THREAD_L ssymv_thread_L #define SGEMM_ONCOPY sgemm_oncopy #define SGEMM_OTCOPY sgemm_otcopy #if SGEMM_DEFAULT_UNROLL_M == SGEMM_DEFAULT_UNROLL_N #define SGEMM_INCOPY sgemm_oncopy #define SGEMM_ITCOPY sgemm_otcopy #else #define SGEMM_INCOPY sgemm_incopy 
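/* Descriptive note on the packing-routine macros: GEMM_INCOPY/ITCOPY pack
 * panels of A at UNROLL_M granularity, while GEMM_ONCOPY/OTCOPY pack panels
 * of B at UNROLL_N granularity.  When SGEMM_DEFAULT_UNROLL_M equals
 * SGEMM_DEFAULT_UNROLL_N (the branch above), the B-side copy kernels are
 * reused for A and only one pair of packing routines has to be provided;
 * otherwise dedicated sgemm_incopy/sgemm_itcopy kernels are selected, as in
 * this branch. */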
#define SGEMM_ITCOPY sgemm_itcopy #endif #define STRMM_OUNUCOPY strmm_ounucopy #define STRMM_OUNNCOPY strmm_ounncopy #define STRMM_OUTUCOPY strmm_outucopy #define STRMM_OUTNCOPY strmm_outncopy #define STRMM_OLNUCOPY strmm_olnucopy #define STRMM_OLNNCOPY strmm_olnncopy #define STRMM_OLTUCOPY strmm_oltucopy #define STRMM_OLTNCOPY strmm_oltncopy #define STRSM_OUNUCOPY strsm_ounucopy #define STRSM_OUNNCOPY strsm_ounncopy #define STRSM_OUTUCOPY strsm_outucopy #define STRSM_OUTNCOPY strsm_outncopy #define STRSM_OLNUCOPY strsm_olnucopy #define STRSM_OLNNCOPY strsm_olnncopy #define STRSM_OLTUCOPY strsm_oltucopy #define STRSM_OLTNCOPY strsm_oltncopy #if SGEMM_DEFAULT_UNROLL_M == SGEMM_DEFAULT_UNROLL_N #define STRMM_IUNUCOPY strmm_ounucopy #define STRMM_IUNNCOPY strmm_ounncopy #define STRMM_IUTUCOPY strmm_outucopy #define STRMM_IUTNCOPY strmm_outncopy #define STRMM_ILNUCOPY strmm_olnucopy #define STRMM_ILNNCOPY strmm_olnncopy #define STRMM_ILTUCOPY strmm_oltucopy #define STRMM_ILTNCOPY strmm_oltncopy #define STRSM_IUNUCOPY strsm_ounucopy #define STRSM_IUNNCOPY strsm_ounncopy #define STRSM_IUTUCOPY strsm_outucopy #define STRSM_IUTNCOPY strsm_outncopy #define STRSM_ILNUCOPY strsm_olnucopy #define STRSM_ILNNCOPY strsm_olnncopy #define STRSM_ILTUCOPY strsm_oltucopy #define STRSM_ILTNCOPY strsm_oltncopy #else #define STRMM_IUNUCOPY strmm_iunucopy #define STRMM_IUNNCOPY strmm_iunncopy #define STRMM_IUTUCOPY strmm_iutucopy #define STRMM_IUTNCOPY strmm_iutncopy #define STRMM_ILNUCOPY strmm_ilnucopy #define STRMM_ILNNCOPY strmm_ilnncopy #define STRMM_ILTUCOPY strmm_iltucopy #define STRMM_ILTNCOPY strmm_iltncopy #define STRSM_IUNUCOPY strsm_iunucopy #define STRSM_IUNNCOPY strsm_iunncopy #define STRSM_IUTUCOPY strsm_iutucopy #define STRSM_IUTNCOPY strsm_iutncopy #define STRSM_ILNUCOPY strsm_ilnucopy #define STRSM_ILNNCOPY strsm_ilnncopy #define STRSM_ILTUCOPY strsm_iltucopy #define STRSM_ILTNCOPY strsm_iltncopy #endif #define SGEMM_BETA sgemm_beta #define SGEMM_KERNEL sgemm_kernel #define STRMM_KERNEL_LN strmm_kernel_LN #define STRMM_KERNEL_LT strmm_kernel_LT #define STRMM_KERNEL_LR strmm_kernel_LN #define STRMM_KERNEL_LC strmm_kernel_LT #define STRMM_KERNEL_RN strmm_kernel_RN #define STRMM_KERNEL_RT strmm_kernel_RT #define STRMM_KERNEL_RR strmm_kernel_RN #define STRMM_KERNEL_RC strmm_kernel_RT #define STRSM_KERNEL_LN strsm_kernel_LN #define STRSM_KERNEL_LT strsm_kernel_LT #define STRSM_KERNEL_LR strsm_kernel_LN #define STRSM_KERNEL_LC strsm_kernel_LT #define STRSM_KERNEL_RN strsm_kernel_RN #define STRSM_KERNEL_RT strsm_kernel_RT #define STRSM_KERNEL_RR strsm_kernel_RN #define STRSM_KERNEL_RC strsm_kernel_RT #define SSYMM_OUTCOPY ssymm_outcopy #define SSYMM_OLTCOPY ssymm_oltcopy #if SGEMM_DEFAULT_UNROLL_M == SGEMM_DEFAULT_UNROLL_N #define SSYMM_IUTCOPY ssymm_outcopy #define SSYMM_ILTCOPY ssymm_oltcopy #else #define SSYMM_IUTCOPY ssymm_iutcopy #define SSYMM_ILTCOPY ssymm_iltcopy #endif #define SNEG_TCOPY sneg_tcopy #define SLASWP_NCOPY slaswp_ncopy #define SAXPBY_K saxpby_k #define SOMATCOPY_K_CN somatcopy_k_cn #define SOMATCOPY_K_RN somatcopy_k_rn #define SOMATCOPY_K_CT somatcopy_k_ct #define SOMATCOPY_K_RT somatcopy_k_rt #define SIMATCOPY_K_CN simatcopy_k_cn #define SIMATCOPY_K_RN simatcopy_k_rn #define SIMATCOPY_K_CT simatcopy_k_ct #define SIMATCOPY_K_RT simatcopy_k_rt #define SGEADD_K sgeadd_k #else #define SAMAX_K gotoblas -> samax_k #define SAMIN_K gotoblas -> samin_k #define SMAX_K gotoblas -> smax_k #define SMIN_K gotoblas -> smin_k #define ISAMAX_K gotoblas -> isamax_k #define ISAMIN_K gotoblas -> 
isamin_k #define ISMAX_K gotoblas -> ismax_k #define ISMIN_K gotoblas -> ismin_k #define SASUM_K gotoblas -> sasum_k #define SAXPYU_K gotoblas -> saxpy_k #define SAXPYC_K gotoblas -> saxpy_k #define SCOPY_K gotoblas -> scopy_k #define SDOTU_K gotoblas -> sdot_k #define SDOTC_K gotoblas -> sdot_k #define SDSDOT_K gotoblas -> dsdot_k #define DSDOT_K gotoblas -> dsdot_k #define SNRM2_K gotoblas -> snrm2_k #define SSCAL_K gotoblas -> sscal_k #define SSWAP_K gotoblas -> sswap_k #define SROT_K gotoblas -> srot_k #define SGEMV_N gotoblas -> sgemv_n #define SGEMV_T gotoblas -> sgemv_t #define SGEMV_R gotoblas -> sgemv_n #define SGEMV_C gotoblas -> sgemv_t #define SGEMV_O gotoblas -> sgemv_n #define SGEMV_U gotoblas -> sgemv_t #define SGEMV_S gotoblas -> sgemv_n #define SGEMV_D gotoblas -> sgemv_t #define SGERU_K gotoblas -> sger_k #define SGERC_K gotoblas -> sger_k #define SGERV_K gotoblas -> sger_k #define SGERD_K gotoblas -> sger_k #define SSYMV_U gotoblas -> ssymv_U #define SSYMV_L gotoblas -> ssymv_L #define SSYMV_THREAD_U ssymv_thread_U #define SSYMV_THREAD_L ssymv_thread_L #define SGEMM_ONCOPY gotoblas -> sgemm_oncopy #define SGEMM_OTCOPY gotoblas -> sgemm_otcopy #define SGEMM_INCOPY gotoblas -> sgemm_incopy #define SGEMM_ITCOPY gotoblas -> sgemm_itcopy #define STRMM_OUNUCOPY gotoblas -> strmm_ounucopy #define STRMM_OUTUCOPY gotoblas -> strmm_outucopy #define STRMM_OLNUCOPY gotoblas -> strmm_olnucopy #define STRMM_OLTUCOPY gotoblas -> strmm_oltucopy #define STRSM_OUNUCOPY gotoblas -> strsm_ounucopy #define STRSM_OUTUCOPY gotoblas -> strsm_outucopy #define STRSM_OLNUCOPY gotoblas -> strsm_olnucopy #define STRSM_OLTUCOPY gotoblas -> strsm_oltucopy #define STRMM_IUNUCOPY gotoblas -> strmm_iunucopy #define STRMM_IUTUCOPY gotoblas -> strmm_iutucopy #define STRMM_ILNUCOPY gotoblas -> strmm_ilnucopy #define STRMM_ILTUCOPY gotoblas -> strmm_iltucopy #define STRSM_IUNUCOPY gotoblas -> strsm_iunucopy #define STRSM_IUTUCOPY gotoblas -> strsm_iutucopy #define STRSM_ILNUCOPY gotoblas -> strsm_ilnucopy #define STRSM_ILTUCOPY gotoblas -> strsm_iltucopy #define STRMM_OUNNCOPY gotoblas -> strmm_ounncopy #define STRMM_OUTNCOPY gotoblas -> strmm_outncopy #define STRMM_OLNNCOPY gotoblas -> strmm_olnncopy #define STRMM_OLTNCOPY gotoblas -> strmm_oltncopy #define STRSM_OUNNCOPY gotoblas -> strsm_ounncopy #define STRSM_OUTNCOPY gotoblas -> strsm_outncopy #define STRSM_OLNNCOPY gotoblas -> strsm_olnncopy #define STRSM_OLTNCOPY gotoblas -> strsm_oltncopy #define STRMM_IUNNCOPY gotoblas -> strmm_iunncopy #define STRMM_IUTNCOPY gotoblas -> strmm_iutncopy #define STRMM_ILNNCOPY gotoblas -> strmm_ilnncopy #define STRMM_ILTNCOPY gotoblas -> strmm_iltncopy #define STRSM_IUNNCOPY gotoblas -> strsm_iunncopy #define STRSM_IUTNCOPY gotoblas -> strsm_iutncopy #define STRSM_ILNNCOPY gotoblas -> strsm_ilnncopy #define STRSM_ILTNCOPY gotoblas -> strsm_iltncopy #define SGEMM_BETA gotoblas -> sgemm_beta #define SGEMM_KERNEL gotoblas -> sgemm_kernel #define STRMM_KERNEL_LN gotoblas -> strmm_kernel_LN #define STRMM_KERNEL_LT gotoblas -> strmm_kernel_LT #define STRMM_KERNEL_LR gotoblas -> strmm_kernel_LN #define STRMM_KERNEL_LC gotoblas -> strmm_kernel_LT #define STRMM_KERNEL_RN gotoblas -> strmm_kernel_RN #define STRMM_KERNEL_RT gotoblas -> strmm_kernel_RT #define STRMM_KERNEL_RR gotoblas -> strmm_kernel_RN #define STRMM_KERNEL_RC gotoblas -> strmm_kernel_RT #define STRSM_KERNEL_LN gotoblas -> strsm_kernel_LN #define STRSM_KERNEL_LT gotoblas -> strsm_kernel_LT #define STRSM_KERNEL_LR gotoblas -> strsm_kernel_LN #define 
STRSM_KERNEL_LC gotoblas -> strsm_kernel_LT #define STRSM_KERNEL_RN gotoblas -> strsm_kernel_RN #define STRSM_KERNEL_RT gotoblas -> strsm_kernel_RT #define STRSM_KERNEL_RR gotoblas -> strsm_kernel_RN #define STRSM_KERNEL_RC gotoblas -> strsm_kernel_RT #define SSYMM_IUTCOPY gotoblas -> ssymm_iutcopy #define SSYMM_ILTCOPY gotoblas -> ssymm_iltcopy #define SSYMM_OUTCOPY gotoblas -> ssymm_outcopy #define SSYMM_OLTCOPY gotoblas -> ssymm_oltcopy #define SNEG_TCOPY gotoblas -> sneg_tcopy #define SLASWP_NCOPY gotoblas -> slaswp_ncopy #define SAXPBY_K gotoblas -> saxpby_k #define SOMATCOPY_K_CN gotoblas -> somatcopy_k_cn #define SOMATCOPY_K_RN gotoblas -> somatcopy_k_rn #define SOMATCOPY_K_CT gotoblas -> somatcopy_k_ct #define SOMATCOPY_K_RT gotoblas -> somatcopy_k_rt #define SIMATCOPY_K_CN gotoblas -> simatcopy_k_cn #define SIMATCOPY_K_RN gotoblas -> simatcopy_k_rn #define SIMATCOPY_K_CT gotoblas -> simatcopy_k_ct #define SIMATCOPY_K_RT gotoblas -> simatcopy_k_rt #define SGEADD_K gotoblas -> sgeadd_k #endif #define SGEMM_NN sgemm_nn #define SGEMM_CN sgemm_tn #define SGEMM_TN sgemm_tn #define SGEMM_NC sgemm_nt #define SGEMM_NT sgemm_nt #define SGEMM_CC sgemm_tt #define SGEMM_CT sgemm_tt #define SGEMM_TC sgemm_tt #define SGEMM_TT sgemm_tt #define SGEMM_NR sgemm_nn #define SGEMM_TR sgemm_tn #define SGEMM_CR sgemm_tn #define SGEMM_RN sgemm_nn #define SGEMM_RT sgemm_nt #define SGEMM_RC sgemm_nt #define SGEMM_RR sgemm_nn #define SSYMM_LU ssymm_LU #define SSYMM_LL ssymm_LL #define SSYMM_RU ssymm_RU #define SSYMM_RL ssymm_RL #define SHEMM_LU shemm_LU #define SHEMM_LL shemm_LL #define SHEMM_RU shemm_RU #define SHEMM_RL shemm_RL #define SSYRK_UN ssyrk_UN #define SSYRK_UT ssyrk_UT #define SSYRK_LN ssyrk_LN #define SSYRK_LT ssyrk_LT #define SSYRK_UR ssyrk_UN #define SSYRK_UC ssyrk_UT #define SSYRK_LR ssyrk_LN #define SSYRK_LC ssyrk_LT #define SSYRK_KERNEL_U ssyrk_kernel_U #define SSYRK_KERNEL_L ssyrk_kernel_L #define SHERK_UN ssyrk_UN #define SHERK_LN ssyrk_LN #define SHERK_UC ssyrk_UT #define SHERK_LC ssyrk_LT #define SHER2K_UN ssyr2k_UN #define SHER2K_LN ssyr2k_LN #define SHER2K_UC ssyr2k_UT #define SHER2K_LC ssyr2k_LT #define SSYR2K_UN ssyr2k_UN #define SSYR2K_UT ssyr2k_UT #define SSYR2K_LN ssyr2k_LN #define SSYR2K_LT ssyr2k_LT #define SSYR2K_UR ssyr2k_UN #define SSYR2K_UC ssyr2k_UT #define SSYR2K_LR ssyr2k_LN #define SSYR2K_LC ssyr2k_LT #define SSYR2K_KERNEL_U ssyr2k_kernel_U #define SSYR2K_KERNEL_L ssyr2k_kernel_L #define STRMM_LNUU strmm_LNUU #define STRMM_LNUN strmm_LNUN #define STRMM_LNLU strmm_LNLU #define STRMM_LNLN strmm_LNLN #define STRMM_LTUU strmm_LTUU #define STRMM_LTUN strmm_LTUN #define STRMM_LTLU strmm_LTLU #define STRMM_LTLN strmm_LTLN #define STRMM_LRUU strmm_LNUU #define STRMM_LRUN strmm_LNUN #define STRMM_LRLU strmm_LNLU #define STRMM_LRLN strmm_LNLN #define STRMM_LCUU strmm_LTUU #define STRMM_LCUN strmm_LTUN #define STRMM_LCLU strmm_LTLU #define STRMM_LCLN strmm_LTLN #define STRMM_RNUU strmm_RNUU #define STRMM_RNUN strmm_RNUN #define STRMM_RNLU strmm_RNLU #define STRMM_RNLN strmm_RNLN #define STRMM_RTUU strmm_RTUU #define STRMM_RTUN strmm_RTUN #define STRMM_RTLU strmm_RTLU #define STRMM_RTLN strmm_RTLN #define STRMM_RRUU strmm_RNUU #define STRMM_RRUN strmm_RNUN #define STRMM_RRLU strmm_RNLU #define STRMM_RRLN strmm_RNLN #define STRMM_RCUU strmm_RTUU #define STRMM_RCUN strmm_RTUN #define STRMM_RCLU strmm_RTLU #define STRMM_RCLN strmm_RTLN #define STRSM_LNUU strsm_LNUU #define STRSM_LNUN strsm_LNUN #define STRSM_LNLU strsm_LNLU #define STRSM_LNLN strsm_LNLN #define STRSM_LTUU strsm_LTUU 
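/*
 * Illustrative sketch, not taken from the archived source: the shape of the
 * DYNAMIC_ARCH branch above.  Each kernel macro either names a routine
 * directly (static build) or reads a function pointer out of the gotoblas
 * table that runtime CPU detection has pointed at the matching kernel set.
 * All ex_* / EX_* identifiers and the simplified signature below are
 * hypothetical; only the dispatch pattern itself is being shown.
 */
#if 0   /* example only, never compiled */
typedef struct {
  float (*samax_k)(long n, float *x, long incx);   /* one slot per kernel */
} ex_kernel_table_t;

extern ex_kernel_table_t *ex_gotoblas;             /* selected once at startup */

#ifdef EX_DYNAMIC_ARCH
#define EX_SAMAX_K (ex_gotoblas->samax_k)          /* indirect call through the table */
#else
#define EX_SAMAX_K ex_samax_k                      /* direct call, fixed at link time */
#endif
#endif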
#define STRSM_LTUN strsm_LTUN #define STRSM_LTLU strsm_LTLU #define STRSM_LTLN strsm_LTLN #define STRSM_LRUU strsm_LNUU #define STRSM_LRUN strsm_LNUN #define STRSM_LRLU strsm_LNLU #define STRSM_LRLN strsm_LNLN #define STRSM_LCUU strsm_LTUU #define STRSM_LCUN strsm_LTUN #define STRSM_LCLU strsm_LTLU #define STRSM_LCLN strsm_LTLN #define STRSM_RNUU strsm_RNUU #define STRSM_RNUN strsm_RNUN #define STRSM_RNLU strsm_RNLU #define STRSM_RNLN strsm_RNLN #define STRSM_RTUU strsm_RTUU #define STRSM_RTUN strsm_RTUN #define STRSM_RTLU strsm_RTLU #define STRSM_RTLN strsm_RTLN #define STRSM_RRUU strsm_RNUU #define STRSM_RRUN strsm_RNUN #define STRSM_RRLU strsm_RNLU #define STRSM_RRLN strsm_RNLN #define STRSM_RCUU strsm_RTUU #define STRSM_RCUN strsm_RTUN #define STRSM_RCLU strsm_RTLU #define STRSM_RCLN strsm_RTLN #define SGEMM_THREAD_NN sgemm_thread_nn #define SGEMM_THREAD_CN sgemm_thread_tn #define SGEMM_THREAD_TN sgemm_thread_tn #define SGEMM_THREAD_NC sgemm_thread_nt #define SGEMM_THREAD_NT sgemm_thread_nt #define SGEMM_THREAD_CC sgemm_thread_tt #define SGEMM_THREAD_CT sgemm_thread_tt #define SGEMM_THREAD_TC sgemm_thread_tt #define SGEMM_THREAD_TT sgemm_thread_tt #define SGEMM_THREAD_NR sgemm_thread_nn #define SGEMM_THREAD_TR sgemm_thread_tn #define SGEMM_THREAD_CR sgemm_thread_tn #define SGEMM_THREAD_RN sgemm_thread_nn #define SGEMM_THREAD_RT sgemm_thread_nt #define SGEMM_THREAD_RC sgemm_thread_nt #define SGEMM_THREAD_RR sgemm_thread_nn #define SSYMM_THREAD_LU ssymm_thread_LU #define SSYMM_THREAD_LL ssymm_thread_LL #define SSYMM_THREAD_RU ssymm_thread_RU #define SSYMM_THREAD_RL ssymm_thread_RL #define SHEMM_THREAD_LU shemm_thread_LU #define SHEMM_THREAD_LL shemm_thread_LL #define SHEMM_THREAD_RU shemm_thread_RU #define SHEMM_THREAD_RL shemm_thread_RL #define SSYRK_THREAD_UN ssyrk_thread_UN #define SSYRK_THREAD_UT ssyrk_thread_UT #define SSYRK_THREAD_LN ssyrk_thread_LN #define SSYRK_THREAD_LT ssyrk_thread_LT #define SSYRK_THREAD_UR ssyrk_thread_UN #define SSYRK_THREAD_UC ssyrk_thread_UT #define SSYRK_THREAD_LR ssyrk_thread_LN #define SSYRK_THREAD_LC ssyrk_thread_LT #define SHERK_THREAD_UN ssyrk_thread_UN #define SHERK_THREAD_UT ssyrk_thread_UT #define SHERK_THREAD_LN ssyrk_thread_LN #define SHERK_THREAD_LT ssyrk_thread_LT #define SHERK_THREAD_UR ssyrk_thread_UN #define SHERK_THREAD_UC ssyrk_thread_UT #define SHERK_THREAD_LR ssyrk_thread_LN #define SHERK_THREAD_LC ssyrk_thread_LT #endif OpenBLAS-0.2.20/common_sparc.h000066400000000000000000000160241313527062700160000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #ifndef COMMON_POWER #define COMMON_POWER #define MB __asm__ __volatile__ ("nop") #define WMB __asm__ __volatile__ ("nop") #ifndef ASSEMBLER static void __inline blas_lock(volatile unsigned long *address){ long int ret = 1; do { while (*address) {YIELDING;}; __asm__ __volatile__( "ldstub [%1], %0" : "=&r"(ret) : "r" (address) : "memory"); } while (ret); } #define BLAS_LOCK_DEFINED static __inline unsigned long rpcc(void){ unsigned long clocks; __asm__ __volatile__ ("rd %%tick, %0" : "=r" (clocks)); return clocks; }; #define RPCC_DEFINED #ifdef __64BIT__ #define RPCC64BIT #endif #ifndef __BIG_ENDIAN__ #define __BIG_ENDIAN__ #endif #ifdef DOUBLE #define GET_IMAGE(res) __asm__ __volatile__("fmovd %%f2, %0" : "=f"(res) : : "memory") #else #define GET_IMAGE(res) __asm__ __volatile__("fmovs %%f1, %0" : "=f"(res) : : "memory") #endif #define GET_IMAGE_CANCEL #ifdef SMP static __inline int blas_quickdivide(blasint x, blasint y){ return x / y; } #endif #endif #ifdef ASSEMBLER #ifndef __64BIT__ #define STACK_START 128 #define SAVESP save %sp, -64, %sp #else #define STACK_START 2423 #define SAVESP save %sp, -256, %sp #endif #define NOP or %g1, %g1, %g1 #ifdef DOUBLE #define LDF ldd #define STF std #define FADD faddd #define FMUL fmuld #define FMOV fmovd #define FABS fabsd #define FSUB fsubd #define FCMP fcmpd #define FMOVG fmovdg #define FMOVL fmovdl #define FSQRT fsqrtd #define FDIV fdivd #else #define LDF ld #define STF st #define FADD fadds #define FMUL fmuls #define FMOV fmovs #define FABS fabss #define FSUB fsubs #define FCMP fcmps #define FMOVG fmovsg #define FMOVL fmovsl #define FSQRT fsqrts #define FDIV fdivs #endif #define HALT prefetch [%g0], 5 #define FMADDS(rs1, rs2, rs3, rd) \ .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | ( 1 << 5) | (rs2)) #define FMADDD(rs1, rs2, rs3, rd) \ .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | ( 2 << 5) | (rs2)) #define FMSUBS(rs1, rs2, rs3, rd) \ .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | ( 5 << 5) | (rs2)) #define FMSUBD(rs1, rs2, rs3, rd) \ .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | ( 6 << 5) | (rs2)) #define FNMSUBS(rs1, rs2, rs3, rd) \ .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | ( 9 << 5) | (rs2)) #define FNMSUBD(rs1, rs2, rs3, rd) \ .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | (10 << 5) | (rs2)) #define FNMADDS(rs1, rs2, rs3, rd) \ .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | (13 << 5) | (rs2)) #define 
FNMADDD(rs1, rs2, rs3, rd) \ .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | (14 << 5) | (rs2)) #define FCLRS(rd) \ .word ((2 << 30) | ((rd) << 25) | ( 0x36 << 19) | ( 0x61 << 5)) #define FCLRD(rd) \ .word ((2 << 30) | ((rd) << 25) | ( 0x36 << 19) | ( 0x60 << 5)) #define FONES(rd) \ .word ((2 << 30) | ((rd) << 25) | ( 0x36 << 19) | ( 0x7f << 5)) #define FONED(rd) \ .word ((2 << 30) | ((rd) << 25) | ( 0x36 << 19) | ( 0x7e << 5)) #ifndef DOUBLE #define FCLR(a) FCLRS(a) #define FONE(a) FONES(a) #define FMADD(a, b, c, d) FMADDS(a, b, c, d) #define FMSUB(a, b, c, d) FMSUBS(a, b, c, d) #define FNMADD(a, b, c, d) FNMADDS(a, b, c, d) #define FNMSUB(a, b, c, d) FNMSUBS(a, b, c, d) #else #define FCLR(a) FCLRD(a) #define FONE(a) FONED(a) #define FMADD(a, b, c, d) FMADDD(a, b, c, d) #define FMSUB(a, b, c, d) FMSUBD(a, b, c, d) #define FNMADD(a, b, c, d) FNMADDD(a, b, c, d) #define FNMSUB(a, b, c, d) FNMSUBD(a, b, c, d) #endif #ifndef F_INTERFACE #define REALNAME ASMNAME #else #define REALNAME ASMFNAME #endif #ifdef sparc #define PROLOGUE \ .section ".text"; \ .align 32; \ .global REALNAME;\ .type REALNAME, #function; \ .proc 07; \ REALNAME:; #if defined(__linux__) && defined(__ELF__) #define GNUSTACK .section .note.GNU-stack,"",@progbits #else #define GNUSTACK #endif #define EPILOGUE \ .size REALNAME, .-REALNAME; \ GNUSTACK #endif #endif #ifdef sparc #define SEEK_ADDRESS #endif #define BUFFER_SIZE (32 << 20) #ifndef PAGESIZE #define PAGESIZE ( 8 << 10) #endif #define HUGE_PAGESIZE ( 4 << 20) #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) #ifndef MAP_ANONYMOUS #define MAP_ANONYMOUS MAP_ANON #endif #endif OpenBLAS-0.2.20/common_stackalloc.h000066400000000000000000000067351313527062700170200ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #define STACK_ALLOC_PROTECT #ifdef STACK_ALLOC_PROTECT // Try to detect stack smashing #include #define STACK_ALLOC_PROTECT_SET volatile int stack_check = 0x7fc01234; #define STACK_ALLOC_PROTECT_CHECK assert(stack_check == 0x7fc01234); #else #define STACK_ALLOC_PROTECT_SET #define STACK_ALLOC_PROTECT_CHECK #endif #if defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0 /* * Allocate a buffer on the stack if the size is smaller than MAX_STACK_ALLOC. * Stack allocation is much faster than blas_memory_alloc or malloc, particularly * when OpenBLAS is used from a multi-threaded application. * SIZE must be carefully chosen to be: * - as small as possible to maximize the number of stack allocation * - large enough to support all architectures and kernel * Chosing a too small SIZE will lead to a stack smashing. */ #define STACK_ALLOC(SIZE, TYPE, BUFFER) \ /* make it volatile because some function (ex: dgemv_n.S) */ \ /* do not restore all register */ \ volatile int stack_alloc_size = SIZE; \ if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \ stack_alloc_size = 0; \ STACK_ALLOC_PROTECT_SET \ TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \ BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1); #else //Original OpenBLAS/GotoBLAS codes. #define STACK_ALLOC(SIZE, TYPE, BUFFER) BUFFER = (TYPE *)blas_memory_alloc(1) #endif #if defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0 #define STACK_FREE(BUFFER) \ STACK_ALLOC_PROTECT_CHECK \ if(!stack_alloc_size) \ blas_memory_free(BUFFER); #else #define STACK_FREE(BUFFER) blas_memory_free(BUFFER) #endif OpenBLAS-0.2.20/common_thread.h000066400000000000000000000152471313527062700161450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #ifndef COMMON_THREAD #define COMMON_THREAD #ifdef USE_OPENMP #include extern void goto_set_num_threads(int nthreads); #endif /* Basic Thread Debugging */ #undef SMP_DEBUG /* Thread Timing Debugging */ #undef TIMING_DEBUG /* Global Parameter */ extern int blas_cpu_number; extern int blas_num_threads; extern int blas_omp_linked; #define BLAS_LEGACY 0x8000U #define BLAS_PTHREAD 0x4000U #define BLAS_NODE 0x2000U #define BLAS_PREC 0x0003U #define BLAS_SINGLE 0x0000U #define BLAS_DOUBLE 0x0001U #define BLAS_XDOUBLE 0x0002U #define BLAS_REAL 0x0000U #define BLAS_COMPLEX 0x0004U #define BLAS_TRANSA 0x0030U /* 2bit */ #define BLAS_TRANSA_N 0x0000U #define BLAS_TRANSA_T 0x0010U #define BLAS_TRANSA_R 0x0020U #define BLAS_TRANSA_C 0x0030U #define BLAS_TRANSA_SHIFT 4 #define BLAS_TRANSB 0x0300U /* 2bit */ #define BLAS_TRANSB_N 0x0000U #define BLAS_TRANSB_T 0x0100U #define BLAS_TRANSB_R 0x0200U #define BLAS_TRANSB_C 0x0300U #define BLAS_TRANSB_SHIFT 8 #define BLAS_RSIDE 0x0400U #define BLAS_RSIDE_SHIFT 10 #define BLAS_UPLO 0x0800U #define BLAS_UPLO_SHIFT 11 #define BLAS_STATUS_NOTYET 0 #define BLAS_STATUS_QUEUED 1 #define BLAS_STATUS_RUNNING 2 #define BLAS_STATUS_FINISHED 4 typedef struct blas_queue { void *routine; BLASLONG position; BLASLONG assigned; blas_arg_t *args; void *range_m; void *range_n; void *sa, *sb; struct blas_queue *next; #if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__) CRITICAL_SECTION lock; HANDLE finish; #else pthread_mutex_t lock; pthread_cond_t finished; #endif int mode, status; #ifdef CONSISTENT_FPCSR unsigned int sse_mode, x87_mode; #endif #ifdef SMP_DEBUG int num; #endif #ifdef TIMING_DEBUG unsigned int clocks; #endif } blas_queue_t; #ifdef SMP_SERVER extern int blas_server_avail; static __inline int num_cpu_avail(int level) { #ifdef USE_OPENMP int openmp_nthreads=0; #endif if (blas_cpu_number == 1 #ifdef USE_OPENMP || omp_in_parallel() #endif ) return 1; #ifdef USE_OPENMP openmp_nthreads=omp_get_max_threads(); if (blas_cpu_number != openmp_nthreads) { goto_set_num_threads(openmp_nthreads); } #endif return blas_cpu_number; } static __inline void blas_queue_init(blas_queue_t *queue){ queue -> sa = NULL; queue -> sb = NULL; queue-> next = NULL; } int blas_thread_init(void); int BLASFUNC(blas_thread_shutdown)(void); int exec_blas(BLASLONG, blas_queue_t *); int exec_blas_async(BLASLONG, blas_queue_t *); int exec_blas_async_wait(BLASLONG, blas_queue_t *); #else int exec_blas_async(BLASLONG num_cpu, blas_param_t *param, pthread_t *); int exec_blas_async_wait(BLASLONG num_cpu, pthread_t *blas_threads); int exec_blas(BLASLONG num_cpu, blas_param_t *param, void *buffer); #endif #ifndef ASSEMBLER int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int threads); int gemm_thread_m (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); int gemm_thread_n (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); int gemm_thread_mn(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); int 
gemm_thread_variable(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG, BLASLONG); int trsm_thread(int mode, BLASLONG m, BLASLONG n, double alpha_r, double alpha_i, void *a, BLASLONG lda, void *c, BLASLONG ldc, int (*function)(), void *buffer); int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); int beta_thread(int mode, BLASLONG m, BLASLONG n, double alpha_r, double alpha_i, void *c, BLASLONG ldc, int (*fuction)()); int getrf_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *offsetA, BLASLONG lda, void *offsetB, BLASLONG jb, void *ipiv, BLASLONG offset, int (*function)(), void *buffer); #endif /* ENDIF ASSEMBLER */ #endif OpenBLAS-0.2.20/common_x.h000066400000000000000000000476541313527062700151540ustar00rootroot00000000000000#ifndef COMMON_X_H #define COMMON_X_H #ifndef DYNAMIC_ARCH #define XAMAX_K xamax_k #define XAMIN_K xamin_k #define XMAX_K xmax_k #define XMIN_K xmin_k #define IXAMAX_K ixamax_k #define IXAMIN_K ixamin_k #define IXMAX_K ixmax_k #define IXMIN_K ixmin_k #define XASUM_K xasum_k #define XAXPYU_K xaxpy_k #define XAXPYC_K xaxpyc_k #define XCOPY_K xcopy_k #define XDOTU_K xdotu_k #define XDOTC_K xdotc_k #define XNRM2_K xnrm2_k #define XSCAL_K xscal_k #define XSWAP_K xswap_k #define XROT_K xqrot_k #define XGEMV_N xgemv_n #define XGEMV_T xgemv_t #define XGEMV_R xgemv_r #define XGEMV_C xgemv_c #define XGEMV_O xgemv_o #define XGEMV_U xgemv_u #define XGEMV_S xgemv_s #define XGEMV_D xgemv_d #define XGERU_K xgeru_k #define XGERC_K xgerc_k #define XGERV_K xgerv_k #define XGERD_K xgerd_k #define XSYMV_U xsymv_U #define XSYMV_L xsymv_L #define XHEMV_U xhemv_U #define XHEMV_L xhemv_L #define XHEMV_V xhemv_V #define XHEMV_M xhemv_M #define XSYMV_THREAD_U xsymv_thread_U #define XSYMV_THREAD_L xsymv_thread_L #define XHEMV_THREAD_U xhemv_thread_U #define XHEMV_THREAD_L xhemv_thread_L #define XHEMV_THREAD_V xhemv_thread_V #define XHEMV_THREAD_M xhemv_thread_M #define XGEMM_ONCOPY xgemm_oncopy #define XGEMM_OTCOPY xgemm_otcopy #if XGEMM_DEFAULT_UNROLL_M == XGEMM_DEFAULT_UNROLL_N #define XGEMM_INCOPY xgemm_oncopy #define XGEMM_ITCOPY xgemm_otcopy #else #define XGEMM_INCOPY xgemm_incopy #define XGEMM_ITCOPY xgemm_itcopy #endif #define XTRMM_OUNUCOPY xtrmm_ounucopy #define XTRMM_OUNNCOPY xtrmm_ounncopy #define XTRMM_OUTUCOPY xtrmm_outucopy #define XTRMM_OUTNCOPY xtrmm_outncopy #define XTRMM_OLNUCOPY xtrmm_olnucopy #define XTRMM_OLNNCOPY xtrmm_olnncopy #define XTRMM_OLTUCOPY xtrmm_oltucopy #define XTRMM_OLTNCOPY xtrmm_oltncopy #define XTRSM_OUNUCOPY xtrsm_ounucopy #define XTRSM_OUNNCOPY xtrsm_ounncopy #define XTRSM_OUTUCOPY xtrsm_outucopy #define XTRSM_OUTNCOPY xtrsm_outncopy #define XTRSM_OLNUCOPY xtrsm_olnucopy #define XTRSM_OLNNCOPY xtrsm_olnncopy #define XTRSM_OLTUCOPY xtrsm_oltucopy #define XTRSM_OLTNCOPY xtrsm_oltncopy #if XGEMM_DEFAULT_UNROLL_M == XGEMM_DEFAULT_UNROLL_N #define XTRMM_IUNUCOPY xtrmm_ounucopy #define XTRMM_IUNNCOPY xtrmm_ounncopy #define XTRMM_IUTUCOPY xtrmm_outucopy #define XTRMM_IUTNCOPY xtrmm_outncopy #define XTRMM_ILNUCOPY xtrmm_olnucopy #define XTRMM_ILNNCOPY xtrmm_olnncopy #define XTRMM_ILTUCOPY xtrmm_oltucopy #define XTRMM_ILTNCOPY xtrmm_oltncopy #define XTRSM_IUNUCOPY xtrsm_ounucopy #define XTRSM_IUNNCOPY xtrsm_ounncopy #define XTRSM_IUTUCOPY xtrsm_outucopy #define XTRSM_IUTNCOPY xtrsm_outncopy #define XTRSM_ILNUCOPY xtrsm_olnucopy #define XTRSM_ILNNCOPY xtrsm_olnncopy #define XTRSM_ILTUCOPY xtrsm_oltucopy #define XTRSM_ILTNCOPY xtrsm_oltncopy #else #define 
XTRMM_IUNUCOPY xtrmm_iunucopy #define XTRMM_IUNNCOPY xtrmm_iunncopy #define XTRMM_IUTUCOPY xtrmm_iutucopy #define XTRMM_IUTNCOPY xtrmm_iutncopy #define XTRMM_ILNUCOPY xtrmm_ilnucopy #define XTRMM_ILNNCOPY xtrmm_ilnncopy #define XTRMM_ILTUCOPY xtrmm_iltucopy #define XTRMM_ILTNCOPY xtrmm_iltncopy #define XTRSM_IUNUCOPY xtrsm_iunucopy #define XTRSM_IUNNCOPY xtrsm_iunncopy #define XTRSM_IUTUCOPY xtrsm_iutucopy #define XTRSM_IUTNCOPY xtrsm_iutncopy #define XTRSM_ILNUCOPY xtrsm_ilnucopy #define XTRSM_ILNNCOPY xtrsm_ilnncopy #define XTRSM_ILTUCOPY xtrsm_iltucopy #define XTRSM_ILTNCOPY xtrsm_iltncopy #endif #define XGEMM_BETA xgemm_beta #define XGEMM_KERNEL_N xgemm_kernel_n #define XGEMM_KERNEL_L xgemm_kernel_l #define XGEMM_KERNEL_R xgemm_kernel_r #define XGEMM_KERNEL_B xgemm_kernel_b #define XTRMM_KERNEL_LN xtrmm_kernel_LN #define XTRMM_KERNEL_LT xtrmm_kernel_LT #define XTRMM_KERNEL_LR xtrmm_kernel_LR #define XTRMM_KERNEL_LC xtrmm_kernel_LC #define XTRMM_KERNEL_RN xtrmm_kernel_RN #define XTRMM_KERNEL_RT xtrmm_kernel_RT #define XTRMM_KERNEL_RR xtrmm_kernel_RR #define XTRMM_KERNEL_RC xtrmm_kernel_RC #define XTRSM_KERNEL_LN xtrsm_kernel_LN #define XTRSM_KERNEL_LT xtrsm_kernel_LT #define XTRSM_KERNEL_LR xtrsm_kernel_LR #define XTRSM_KERNEL_LC xtrsm_kernel_LC #define XTRSM_KERNEL_RN xtrsm_kernel_RN #define XTRSM_KERNEL_RT xtrsm_kernel_RT #define XTRSM_KERNEL_RR xtrsm_kernel_RR #define XTRSM_KERNEL_RC xtrsm_kernel_RC #define XSYMM_OUTCOPY xsymm_outcopy #define XSYMM_OLTCOPY xsymm_oltcopy #if XGEMM_DEFAULT_UNROLL_M == XGEMM_DEFAULT_UNROLL_N #define XSYMM_IUTCOPY xsymm_outcopy #define XSYMM_ILTCOPY xsymm_oltcopy #else #define XSYMM_IUTCOPY xsymm_iutcopy #define XSYMM_ILTCOPY xsymm_iltcopy #endif #define XHEMM_OUTCOPY xhemm_outcopy #define XHEMM_OLTCOPY xhemm_oltcopy #if XGEMM_DEFAULT_UNROLL_M == XGEMM_DEFAULT_UNROLL_N #define XHEMM_IUTCOPY xhemm_outcopy #define XHEMM_ILTCOPY xhemm_oltcopy #else #define XHEMM_IUTCOPY xhemm_iutcopy #define XHEMM_ILTCOPY xhemm_iltcopy #endif #define XGEMM3M_ONCOPYB xgemm3m_oncopyb #define XGEMM3M_ONCOPYR xgemm3m_oncopyr #define XGEMM3M_ONCOPYI xgemm3m_oncopyi #define XGEMM3M_OTCOPYB xgemm3m_otcopyb #define XGEMM3M_OTCOPYR xgemm3m_otcopyr #define XGEMM3M_OTCOPYI xgemm3m_otcopyi #define XGEMM3M_INCOPYB xgemm3m_incopyb #define XGEMM3M_INCOPYR xgemm3m_incopyr #define XGEMM3M_INCOPYI xgemm3m_incopyi #define XGEMM3M_ITCOPYB xgemm3m_itcopyb #define XGEMM3M_ITCOPYR xgemm3m_itcopyr #define XGEMM3M_ITCOPYI xgemm3m_itcopyi #define XSYMM3M_ILCOPYB xsymm3m_ilcopyb #define XSYMM3M_IUCOPYB xsymm3m_iucopyb #define XSYMM3M_ILCOPYR xsymm3m_ilcopyr #define XSYMM3M_IUCOPYR xsymm3m_iucopyr #define XSYMM3M_ILCOPYI xsymm3m_ilcopyi #define XSYMM3M_IUCOPYI xsymm3m_iucopyi #define XSYMM3M_OLCOPYB xsymm3m_olcopyb #define XSYMM3M_OUCOPYB xsymm3m_oucopyb #define XSYMM3M_OLCOPYR xsymm3m_olcopyr #define XSYMM3M_OUCOPYR xsymm3m_oucopyr #define XSYMM3M_OLCOPYI xsymm3m_olcopyi #define XSYMM3M_OUCOPYI xsymm3m_oucopyi #define XHEMM3M_ILCOPYB xhemm3m_ilcopyb #define XHEMM3M_IUCOPYB xhemm3m_iucopyb #define XHEMM3M_ILCOPYR xhemm3m_ilcopyr #define XHEMM3M_IUCOPYR xhemm3m_iucopyr #define XHEMM3M_ILCOPYI xhemm3m_ilcopyi #define XHEMM3M_IUCOPYI xhemm3m_iucopyi #define XHEMM3M_OLCOPYB xhemm3m_olcopyb #define XHEMM3M_OUCOPYB xhemm3m_oucopyb #define XHEMM3M_OLCOPYR xhemm3m_olcopyr #define XHEMM3M_OUCOPYR xhemm3m_oucopyr #define XHEMM3M_OLCOPYI xhemm3m_olcopyi #define XHEMM3M_OUCOPYI xhemm3m_oucopyi #define XGEMM3M_KERNEL xgemm3m_kernel #define XNEG_TCOPY xneg_tcopy #define XLASWP_NCOPY xlaswp_ncopy #else #define 
XAMAX_K gotoblas -> xamax_k #define XAMIN_K gotoblas -> xamin_k #define XMAX_K gotoblas -> xmax_k #define XMIN_K gotoblas -> xmin_k #define IXAMAX_K gotoblas -> ixamax_k #define IXAMIN_K gotoblas -> ixamin_k #define IXMAX_K gotoblas -> ixmax_k #define IXMIN_K gotoblas -> ixmin_k #define XASUM_K gotoblas -> xasum_k #define XAXPYU_K gotoblas -> xaxpy_k #define XAXPYC_K gotoblas -> xaxpyc_k #define XCOPY_K gotoblas -> xcopy_k #define XDOTU_K gotoblas -> xdotu_k #define XDOTC_K gotoblas -> xdotc_k #define XNRM2_K gotoblas -> xnrm2_k #define XSCAL_K gotoblas -> xscal_k #define XSWAP_K gotoblas -> xswap_k #define XROT_K gotoblas -> xqrot_k #define XGEMV_N gotoblas -> xgemv_n #define XGEMV_T gotoblas -> xgemv_t #define XGEMV_R gotoblas -> xgemv_r #define XGEMV_C gotoblas -> xgemv_c #define XGEMV_O gotoblas -> xgemv_o #define XGEMV_U gotoblas -> xgemv_u #define XGEMV_S gotoblas -> xgemv_s #define XGEMV_D gotoblas -> xgemv_d #define XGERU_K gotoblas -> xgeru_k #define XGERC_K gotoblas -> xgerc_k #define XGERV_K gotoblas -> xgerv_k #define XGERD_K gotoblas -> xgerd_k #define XSYMV_U gotoblas -> xsymv_U #define XSYMV_L gotoblas -> xsymv_L #define XHEMV_U gotoblas -> xhemv_U #define XHEMV_L gotoblas -> xhemv_L #define XHEMV_V gotoblas -> xhemv_V #define XHEMV_M gotoblas -> xhemv_M #define XSYMV_THREAD_U xsymv_thread_U #define XSYMV_THREAD_L xsymv_thread_L #define XHEMV_THREAD_U xhemv_thread_U #define XHEMV_THREAD_L xhemv_thread_L #define XHEMV_THREAD_V xhemv_thread_V #define XHEMV_THREAD_M xhemv_thread_M #define XGEMM_ONCOPY gotoblas -> xgemm_oncopy #define XGEMM_OTCOPY gotoblas -> xgemm_otcopy #define XGEMM_INCOPY gotoblas -> xgemm_incopy #define XGEMM_ITCOPY gotoblas -> xgemm_itcopy #define XTRMM_OUNUCOPY gotoblas -> xtrmm_ounucopy #define XTRMM_OUTUCOPY gotoblas -> xtrmm_outucopy #define XTRMM_OLNUCOPY gotoblas -> xtrmm_olnucopy #define XTRMM_OLTUCOPY gotoblas -> xtrmm_oltucopy #define XTRSM_OUNUCOPY gotoblas -> xtrsm_ounucopy #define XTRSM_OUTUCOPY gotoblas -> xtrsm_outucopy #define XTRSM_OLNUCOPY gotoblas -> xtrsm_olnucopy #define XTRSM_OLTUCOPY gotoblas -> xtrsm_oltucopy #define XTRMM_IUNUCOPY gotoblas -> xtrmm_iunucopy #define XTRMM_IUTUCOPY gotoblas -> xtrmm_iutucopy #define XTRMM_ILNUCOPY gotoblas -> xtrmm_ilnucopy #define XTRMM_ILTUCOPY gotoblas -> xtrmm_iltucopy #define XTRSM_IUNUCOPY gotoblas -> xtrsm_iunucopy #define XTRSM_IUTUCOPY gotoblas -> xtrsm_iutucopy #define XTRSM_ILNUCOPY gotoblas -> xtrsm_ilnucopy #define XTRSM_ILTUCOPY gotoblas -> xtrsm_iltucopy #define XTRMM_OUNNCOPY gotoblas -> xtrmm_ounncopy #define XTRMM_OUTNCOPY gotoblas -> xtrmm_outncopy #define XTRMM_OLNNCOPY gotoblas -> xtrmm_olnncopy #define XTRMM_OLTNCOPY gotoblas -> xtrmm_oltncopy #define XTRSM_OUNNCOPY gotoblas -> xtrsm_ounncopy #define XTRSM_OUTNCOPY gotoblas -> xtrsm_outncopy #define XTRSM_OLNNCOPY gotoblas -> xtrsm_olnncopy #define XTRSM_OLTNCOPY gotoblas -> xtrsm_oltncopy #define XTRMM_IUNNCOPY gotoblas -> xtrmm_iunncopy #define XTRMM_IUTNCOPY gotoblas -> xtrmm_iutncopy #define XTRMM_ILNNCOPY gotoblas -> xtrmm_ilnncopy #define XTRMM_ILTNCOPY gotoblas -> xtrmm_iltncopy #define XTRSM_IUNNCOPY gotoblas -> xtrsm_iunncopy #define XTRSM_IUTNCOPY gotoblas -> xtrsm_iutncopy #define XTRSM_ILNNCOPY gotoblas -> xtrsm_ilnncopy #define XTRSM_ILTNCOPY gotoblas -> xtrsm_iltncopy #define XGEMM_BETA gotoblas -> xgemm_beta #define XGEMM_KERNEL_N gotoblas -> xgemm_kernel_n #define XGEMM_KERNEL_L gotoblas -> xgemm_kernel_l #define XGEMM_KERNEL_R gotoblas -> xgemm_kernel_r #define XGEMM_KERNEL_B gotoblas -> xgemm_kernel_b #define 
XTRMM_KERNEL_LN gotoblas -> xtrmm_kernel_LN #define XTRMM_KERNEL_LT gotoblas -> xtrmm_kernel_LT #define XTRMM_KERNEL_LR gotoblas -> xtrmm_kernel_LR #define XTRMM_KERNEL_LC gotoblas -> xtrmm_kernel_LC #define XTRMM_KERNEL_RN gotoblas -> xtrmm_kernel_RN #define XTRMM_KERNEL_RT gotoblas -> xtrmm_kernel_RT #define XTRMM_KERNEL_RR gotoblas -> xtrmm_kernel_RR #define XTRMM_KERNEL_RC gotoblas -> xtrmm_kernel_RC #define XTRSM_KERNEL_LN gotoblas -> xtrsm_kernel_LN #define XTRSM_KERNEL_LT gotoblas -> xtrsm_kernel_LT #define XTRSM_KERNEL_LR gotoblas -> xtrsm_kernel_LR #define XTRSM_KERNEL_LC gotoblas -> xtrsm_kernel_LC #define XTRSM_KERNEL_RN gotoblas -> xtrsm_kernel_RN #define XTRSM_KERNEL_RT gotoblas -> xtrsm_kernel_RT #define XTRSM_KERNEL_RR gotoblas -> xtrsm_kernel_RR #define XTRSM_KERNEL_RC gotoblas -> xtrsm_kernel_RC #define XSYMM_IUTCOPY gotoblas -> xsymm_iutcopy #define XSYMM_ILTCOPY gotoblas -> xsymm_iltcopy #define XSYMM_OUTCOPY gotoblas -> xsymm_outcopy #define XSYMM_OLTCOPY gotoblas -> xsymm_oltcopy #define XHEMM_OUTCOPY gotoblas -> xhemm_outcopy #define XHEMM_OLTCOPY gotoblas -> xhemm_oltcopy #define XHEMM_IUTCOPY gotoblas -> xhemm_iutcopy #define XHEMM_ILTCOPY gotoblas -> xhemm_iltcopy #define XGEMM3M_ONCOPYB gotoblas -> xgemm3m_oncopyb #define XGEMM3M_ONCOPYR gotoblas -> xgemm3m_oncopyr #define XGEMM3M_ONCOPYI gotoblas -> xgemm3m_oncopyi #define XGEMM3M_OTCOPYB gotoblas -> xgemm3m_otcopyb #define XGEMM3M_OTCOPYR gotoblas -> xgemm3m_otcopyr #define XGEMM3M_OTCOPYI gotoblas -> xgemm3m_otcopyi #define XGEMM3M_INCOPYB gotoblas -> xgemm3m_incopyb #define XGEMM3M_INCOPYR gotoblas -> xgemm3m_incopyr #define XGEMM3M_INCOPYI gotoblas -> xgemm3m_incopyi #define XGEMM3M_ITCOPYB gotoblas -> xgemm3m_itcopyb #define XGEMM3M_ITCOPYR gotoblas -> xgemm3m_itcopyr #define XGEMM3M_ITCOPYI gotoblas -> xgemm3m_itcopyi #define XSYMM3M_ILCOPYB gotoblas -> xsymm3m_ilcopyb #define XSYMM3M_IUCOPYB gotoblas -> xsymm3m_iucopyb #define XSYMM3M_ILCOPYR gotoblas -> xsymm3m_ilcopyr #define XSYMM3M_IUCOPYR gotoblas -> xsymm3m_iucopyr #define XSYMM3M_ILCOPYI gotoblas -> xsymm3m_ilcopyi #define XSYMM3M_IUCOPYI gotoblas -> xsymm3m_iucopyi #define XSYMM3M_OLCOPYB gotoblas -> xsymm3m_olcopyb #define XSYMM3M_OUCOPYB gotoblas -> xsymm3m_oucopyb #define XSYMM3M_OLCOPYR gotoblas -> xsymm3m_olcopyr #define XSYMM3M_OUCOPYR gotoblas -> xsymm3m_oucopyr #define XSYMM3M_OLCOPYI gotoblas -> xsymm3m_olcopyi #define XSYMM3M_OUCOPYI gotoblas -> xsymm3m_oucopyi #define XHEMM3M_ILCOPYB gotoblas -> xhemm3m_ilcopyb #define XHEMM3M_IUCOPYB gotoblas -> xhemm3m_iucopyb #define XHEMM3M_ILCOPYR gotoblas -> xhemm3m_ilcopyr #define XHEMM3M_IUCOPYR gotoblas -> xhemm3m_iucopyr #define XHEMM3M_ILCOPYI gotoblas -> xhemm3m_ilcopyi #define XHEMM3M_IUCOPYI gotoblas -> xhemm3m_iucopyi #define XHEMM3M_OLCOPYB gotoblas -> xhemm3m_olcopyb #define XHEMM3M_OUCOPYB gotoblas -> xhemm3m_oucopyb #define XHEMM3M_OLCOPYR gotoblas -> xhemm3m_olcopyr #define XHEMM3M_OUCOPYR gotoblas -> xhemm3m_oucopyr #define XHEMM3M_OLCOPYI gotoblas -> xhemm3m_olcopyi #define XHEMM3M_OUCOPYI gotoblas -> xhemm3m_oucopyi #define XGEMM3M_KERNEL gotoblas -> xgemm3m_kernel #define XNEG_TCOPY gotoblas -> xneg_tcopy #define XLASWP_NCOPY gotoblas -> xlaswp_ncopy #endif #define XGEMM_NN xgemm_nn #define XGEMM_CN xgemm_cn #define XGEMM_TN xgemm_tn #define XGEMM_NC xgemm_nc #define XGEMM_NT xgemm_nt #define XGEMM_CC xgemm_cc #define XGEMM_CT xgemm_ct #define XGEMM_TC xgemm_tc #define XGEMM_TT xgemm_tt #define XGEMM_NR xgemm_nr #define XGEMM_TR xgemm_tr #define XGEMM_CR xgemm_cr #define XGEMM_RN 
xgemm_rn #define XGEMM_RT xgemm_rt #define XGEMM_RC xgemm_rc #define XGEMM_RR xgemm_rr #define XSYMM_LU xsymm_LU #define XSYMM_LL xsymm_LL #define XSYMM_RU xsymm_RU #define XSYMM_RL xsymm_RL #define XHEMM_LU xhemm_LU #define XHEMM_LL xhemm_LL #define XHEMM_RU xhemm_RU #define XHEMM_RL xhemm_RL #define XSYRK_UN xsyrk_UN #define XSYRK_UT xsyrk_UT #define XSYRK_LN xsyrk_LN #define XSYRK_LT xsyrk_LT #define XSYRK_UR xsyrk_UN #define XSYRK_UC xsyrk_UT #define XSYRK_LR xsyrk_LN #define XSYRK_LC xsyrk_LT #define XSYRK_KERNEL_U xsyrk_kernel_U #define XSYRK_KERNEL_L xsyrk_kernel_L #define XHERK_UN xherk_UN #define XHERK_LN xherk_LN #define XHERK_UC xherk_UC #define XHERK_LC xherk_LC #define XHER2K_UN xher2k_UN #define XHER2K_LN xher2k_LN #define XHER2K_UC xher2k_UC #define XHER2K_LC xher2k_LC #define XSYR2K_UN xsyr2k_UN #define XSYR2K_UT xsyr2k_UT #define XSYR2K_LN xsyr2k_LN #define XSYR2K_LT xsyr2k_LT #define XSYR2K_UR xsyr2k_UN #define XSYR2K_UC xsyr2k_UT #define XSYR2K_LR xsyr2k_LN #define XSYR2K_LC xsyr2k_LT #define XSYR2K_KERNEL_U xsyr2k_kernel_U #define XSYR2K_KERNEL_L xsyr2k_kernel_L #define XTRMM_LNUU xtrmm_LNUU #define XTRMM_LNUN xtrmm_LNUN #define XTRMM_LNLU xtrmm_LNLU #define XTRMM_LNLN xtrmm_LNLN #define XTRMM_LTUU xtrmm_LTUU #define XTRMM_LTUN xtrmm_LTUN #define XTRMM_LTLU xtrmm_LTLU #define XTRMM_LTLN xtrmm_LTLN #define XTRMM_LRUU xtrmm_LRUU #define XTRMM_LRUN xtrmm_LRUN #define XTRMM_LRLU xtrmm_LRLU #define XTRMM_LRLN xtrmm_LRLN #define XTRMM_LCUU xtrmm_LCUU #define XTRMM_LCUN xtrmm_LCUN #define XTRMM_LCLU xtrmm_LCLU #define XTRMM_LCLN xtrmm_LCLN #define XTRMM_RNUU xtrmm_RNUU #define XTRMM_RNUN xtrmm_RNUN #define XTRMM_RNLU xtrmm_RNLU #define XTRMM_RNLN xtrmm_RNLN #define XTRMM_RTUU xtrmm_RTUU #define XTRMM_RTUN xtrmm_RTUN #define XTRMM_RTLU xtrmm_RTLU #define XTRMM_RTLN xtrmm_RTLN #define XTRMM_RRUU xtrmm_RRUU #define XTRMM_RRUN xtrmm_RRUN #define XTRMM_RRLU xtrmm_RRLU #define XTRMM_RRLN xtrmm_RRLN #define XTRMM_RCUU xtrmm_RCUU #define XTRMM_RCUN xtrmm_RCUN #define XTRMM_RCLU xtrmm_RCLU #define XTRMM_RCLN xtrmm_RCLN #define XTRSM_LNUU xtrsm_LNUU #define XTRSM_LNUN xtrsm_LNUN #define XTRSM_LNLU xtrsm_LNLU #define XTRSM_LNLN xtrsm_LNLN #define XTRSM_LTUU xtrsm_LTUU #define XTRSM_LTUN xtrsm_LTUN #define XTRSM_LTLU xtrsm_LTLU #define XTRSM_LTLN xtrsm_LTLN #define XTRSM_LRUU xtrsm_LRUU #define XTRSM_LRUN xtrsm_LRUN #define XTRSM_LRLU xtrsm_LRLU #define XTRSM_LRLN xtrsm_LRLN #define XTRSM_LCUU xtrsm_LCUU #define XTRSM_LCUN xtrsm_LCUN #define XTRSM_LCLU xtrsm_LCLU #define XTRSM_LCLN xtrsm_LCLN #define XTRSM_RNUU xtrsm_RNUU #define XTRSM_RNUN xtrsm_RNUN #define XTRSM_RNLU xtrsm_RNLU #define XTRSM_RNLN xtrsm_RNLN #define XTRSM_RTUU xtrsm_RTUU #define XTRSM_RTUN xtrsm_RTUN #define XTRSM_RTLU xtrsm_RTLU #define XTRSM_RTLN xtrsm_RTLN #define XTRSM_RRUU xtrsm_RRUU #define XTRSM_RRUN xtrsm_RRUN #define XTRSM_RRLU xtrsm_RRLU #define XTRSM_RRLN xtrsm_RRLN #define XTRSM_RCUU xtrsm_RCUU #define XTRSM_RCUN xtrsm_RCUN #define XTRSM_RCLU xtrsm_RCLU #define XTRSM_RCLN xtrsm_RCLN #define XGEMM_THREAD_NN xgemm_thread_nn #define XGEMM_THREAD_CN xgemm_thread_cn #define XGEMM_THREAD_TN xgemm_thread_tn #define XGEMM_THREAD_NC xgemm_thread_nc #define XGEMM_THREAD_NT xgemm_thread_nt #define XGEMM_THREAD_CC xgemm_thread_cc #define XGEMM_THREAD_CT xgemm_thread_ct #define XGEMM_THREAD_TC xgemm_thread_tc #define XGEMM_THREAD_TT xgemm_thread_tt #define XGEMM_THREAD_NR xgemm_thread_nr #define XGEMM_THREAD_TR xgemm_thread_tr #define XGEMM_THREAD_CR xgemm_thread_cr #define XGEMM_THREAD_RN xgemm_thread_rn #define 
XGEMM_THREAD_RT xgemm_thread_rt #define XGEMM_THREAD_RC xgemm_thread_rc #define XGEMM_THREAD_RR xgemm_thread_rr #define XSYMM_THREAD_LU xsymm_thread_LU #define XSYMM_THREAD_LL xsymm_thread_LL #define XSYMM_THREAD_RU xsymm_thread_RU #define XSYMM_THREAD_RL xsymm_thread_RL #define XHEMM_THREAD_LU xhemm_thread_LU #define XHEMM_THREAD_LL xhemm_thread_LL #define XHEMM_THREAD_RU xhemm_thread_RU #define XHEMM_THREAD_RL xhemm_thread_RL #define XSYRK_THREAD_UN xsyrk_thread_UN #define XSYRK_THREAD_UT xsyrk_thread_UT #define XSYRK_THREAD_LN xsyrk_thread_LN #define XSYRK_THREAD_LT xsyrk_thread_LT #define XSYRK_THREAD_UR xsyrk_thread_UN #define XSYRK_THREAD_UC xsyrk_thread_UT #define XSYRK_THREAD_LR xsyrk_thread_LN #define XSYRK_THREAD_LC xsyrk_thread_LT #define XHERK_THREAD_UN xherk_thread_UN #define XHERK_THREAD_UT xherk_thread_UT #define XHERK_THREAD_LN xherk_thread_LN #define XHERK_THREAD_LT xherk_thread_LT #define XHERK_THREAD_UR xherk_thread_UR #define XHERK_THREAD_UC xherk_thread_UC #define XHERK_THREAD_LR xherk_thread_LR #define XHERK_THREAD_LC xherk_thread_LC #define XGEMM3M_NN xgemm3m_nn #define XGEMM3M_CN xgemm3m_cn #define XGEMM3M_TN xgemm3m_tn #define XGEMM3M_NC xgemm3m_nc #define XGEMM3M_NT xgemm3m_nt #define XGEMM3M_CC xgemm3m_cc #define XGEMM3M_CT xgemm3m_ct #define XGEMM3M_TC xgemm3m_tc #define XGEMM3M_TT xgemm3m_tt #define XGEMM3M_NR xgemm3m_nr #define XGEMM3M_TR xgemm3m_tr #define XGEMM3M_CR xgemm3m_cr #define XGEMM3M_RN xgemm3m_rn #define XGEMM3M_RT xgemm3m_rt #define XGEMM3M_RC xgemm3m_rc #define XGEMM3M_RR xgemm3m_rr #define XGEMM3M_THREAD_NN xgemm3m_thread_nn #define XGEMM3M_THREAD_CN xgemm3m_thread_cn #define XGEMM3M_THREAD_TN xgemm3m_thread_tn #define XGEMM3M_THREAD_NC xgemm3m_thread_nc #define XGEMM3M_THREAD_NT xgemm3m_thread_nt #define XGEMM3M_THREAD_CC xgemm3m_thread_cc #define XGEMM3M_THREAD_CT xgemm3m_thread_ct #define XGEMM3M_THREAD_TC xgemm3m_thread_tc #define XGEMM3M_THREAD_TT xgemm3m_thread_tt #define XGEMM3M_THREAD_NR xgemm3m_thread_nr #define XGEMM3M_THREAD_TR xgemm3m_thread_tr #define XGEMM3M_THREAD_CR xgemm3m_thread_cr #define XGEMM3M_THREAD_RN xgemm3m_thread_rn #define XGEMM3M_THREAD_RT xgemm3m_thread_rt #define XGEMM3M_THREAD_RC xgemm3m_thread_rc #define XGEMM3M_THREAD_RR xgemm3m_thread_rr #define XSYMM3M_LU xsymm3m_LU #define XSYMM3M_LL xsymm3m_LL #define XSYMM3M_RU xsymm3m_RU #define XSYMM3M_RL xsymm3m_RL #define XSYMM3M_THREAD_LU xsymm3m_thread_LU #define XSYMM3M_THREAD_LL xsymm3m_thread_LL #define XSYMM3M_THREAD_RU xsymm3m_thread_RU #define XSYMM3M_THREAD_RL xsymm3m_thread_RL #define XHEMM3M_LU xhemm3m_LU #define XHEMM3M_LL xhemm3m_LL #define XHEMM3M_RU xhemm3m_RU #define XHEMM3M_RL xhemm3m_RL #define XHEMM3M_THREAD_LU xhemm3m_thread_LU #define XHEMM3M_THREAD_LL xhemm3m_thread_LL #define XHEMM3M_THREAD_RU xhemm3m_thread_RU #define XHEMM3M_THREAD_RL xhemm3m_thread_RL #endif OpenBLAS-0.2.20/common_x86.h000066400000000000000000000235141313527062700153170ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #ifndef COMMON_X86 #define COMMON_X86 #ifndef ASSEMBLER #ifdef C_MSVC #include #endif #define MB #define WMB #ifdef C_SUN #define __asm__ __asm #define __volatile__ #endif static void __inline blas_lock(volatile BLASULONG *address){ int ret; do { while (*address) {YIELDING;}; #if defined(_MSC_VER) && !defined(__clang__) // use intrinsic instead of inline assembly ret = _InterlockedExchange((volatile LONG *)address, 1); // inline assembly /*__asm { mov eax, address mov ebx, 1 xchg [eax], ebx mov ret, ebx }*/ #else __asm__ __volatile__( "xchgl %0, %1\n" : "=r"(ret), "=m"(*address) : "0"(1), "m"(*address) : "memory"); #endif } while (ret); } #define BLAS_LOCK_DEFINED static __inline unsigned long long rpcc(void){ #if defined(_MSC_VER) && !defined(__clang__) return __rdtsc(); // use MSVC intrinsic #else unsigned int a, d; __asm__ __volatile__ ("rdtsc" : "=a" (a), "=d" (d)); return ((unsigned long long)a + ((unsigned long long)d << 32)); #endif }; #define RPCC_DEFINED static __inline unsigned long getstackaddr(void){ #if defined(_MSC_VER) && !defined(__clang__) return (unsigned long)_ReturnAddress(); // use MSVC intrinsic #else unsigned long addr; __asm__ __volatile__ ("mov %%esp, %0" : "=r"(addr) : : "memory"); return addr; #endif }; static __inline long double sqrt_long(long double val) { #if defined(_MSC_VER) && !defined(__clang__) return sqrt(val); // not sure if this will use fsqrt #else long double result; __asm__ __volatile__ ("fldt %1\n" "fsqrt\n" "fstpt %0\n" : "=m" (result) : "m"(val)); return result; #endif } #define SQRT(a) sqrt_long(a) /* This is due to gcc's bug */ void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx); #define WHEREAMI static __inline int WhereAmI(void){ int eax, ebx, ecx, edx; int apicid; cpuid(1, &eax, &ebx, &ecx, &edx); apicid = BITMASK(ebx, 24, 0xff); return apicid; } #ifdef ENABLE_SSE_EXCEPTION #define IDEBUG_START \ { \ unsigned int fp_sse_mode, new_fp_mode; \ __asm__ __volatile__ ("stmxcsr %0" : "=m" (fp_sse_mode) : ); \ new_fp_mode = fp_sse_mode & ~0xd00; \ __asm__ __volatile__ ("ldmxcsr %0" : : "m" (new_fp_mode) ); #define IDEBUG_END \ __asm__ __volatile__ ("ldmxcsr 
%0" : : "m" (fp_sse_mode) ); \ } #endif #ifdef XDOUBLE #define GET_IMAGE(res) __asm__ __volatile__("fstpt %0" : "=m"(res) : : "memory") #elif defined(DOUBLE) #define GET_IMAGE(res) __asm__ __volatile__("fstpl %0" : "=m"(res) : : "memory") #else #define GET_IMAGE(res) __asm__ __volatile__("fstps %0" : "=m"(res) : : "memory"); #endif #define GET_IMAGE_CANCEL __asm__ __volatile__ ("ffree %st") #ifdef SMP extern unsigned int blas_quick_divide_table[]; static __inline int blas_quickdivide(unsigned int x, unsigned int y){ unsigned int result; if (y <= 1) return x; #if defined(_MSC_VER) && !defined(__clang__) result = x/y; return result; #else y = blas_quick_divide_table[y]; __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); return result; #endif } #endif #endif #ifndef PAGESIZE #define PAGESIZE ( 4 << 10) #endif #define HUGE_PAGESIZE ( 4 << 20) #define BUFFER_SIZE (16 << 20) #define SEEK_ADDRESS #if defined(DOUBLE) || defined(XDOUBLE) #define MMXLOAD movq #define MMXSTORE movq #else #define MMXLOAD movd #define MMXSTORE movd #endif #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) //Enable some optimazation for barcelona. #define BARCELONA_OPTIMIZATION #endif #if defined(HAVE_3DNOW) #define EMMS femms #elif defined(HAVE_MMX) #define EMMS emms #endif #ifndef EMMS #define EMMS #endif #if defined(CORE2) || defined(PENTIUM4) #define movapd movaps #endif #define BRANCH .byte 0x3e #define NOBRANCH .byte 0x2e #define PADDING .byte 0x66; #define HALT hlt #ifndef COMPLEX #ifdef XDOUBLE #define LOCAL_BUFFER_SIZE QLOCAL_BUFFER_SIZE #elif defined DOUBLE #define LOCAL_BUFFER_SIZE DLOCAL_BUFFER_SIZE #else #define LOCAL_BUFFER_SIZE SLOCAL_BUFFER_SIZE #endif #else #ifdef XDOUBLE #define LOCAL_BUFFER_SIZE XLOCAL_BUFFER_SIZE #elif defined DOUBLE #define LOCAL_BUFFER_SIZE ZLOCAL_BUFFER_SIZE #else #define LOCAL_BUFFER_SIZE CLOCAL_BUFFER_SIZE #endif #endif #if defined(OS_WINDOWS) #if LOCAL_BUFFER_SIZE > 16384 #define STACK_TOUCHING \ movl $0, 4096 * 4(%esp);\ movl $0, 4096 * 3(%esp);\ movl $0, 4096 * 2(%esp);\ movl $0, 4096 * 1(%esp); #elif LOCAL_BUFFER_SIZE > 12288 #define STACK_TOUCHING \ movl $0, 4096 * 3(%esp);\ movl $0, 4096 * 2(%esp);\ movl $0, 4096 * 1(%esp); #elif LOCAL_BUFFER_SIZE > 8192 #define STACK_TOUCHING \ movl $0, 4096 * 2(%esp);\ movl $0, 4096 * 1(%esp); #elif LOCAL_BUFFER_SIZE > 4096 #define STACK_TOUCHING \ movl $0, 4096 * 1(%esp); #else #define STACK_TOUCHING #endif #else #define STACK_TOUCHING #endif #ifndef F_INTERFACE #define REALNAME ASMNAME #else #define REALNAME ASMFNAME #endif #if defined(F_INTERFACE_PATHSCALE) || defined(F_INTERFACE_OPEN64) #define RETURN_BY_STRUCT #elif defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) #define RETURN_BY_COMPLEX #else #define RETURN_BY_STACK #endif #ifdef OS_DARWIN #define PROLOGUE .text;.align 5; .globl REALNAME; REALNAME: #define EPILOGUE .subsections_via_symbols #define PROFCODE #endif #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) #define SAVEREGISTERS \ subl $32, %esp;\ movups %xmm6, 0(%esp);\ movups %xmm7, 16(%esp) #define RESTOREREGISTERS \ movups 0(%esp), %xmm6;\ movups 16(%esp), %xmm7;\ addl $32, %esp #else #define SAVEREGISTERS #define RESTOREREGISTERS #endif #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) #define PROLOGUE \ .text; \ .align 16; \ .globl REALNAME ;\ .def REALNAME;.scl 2;.type 32;.endef; \ REALNAME: #define PROFCODE #ifdef __clang__ #define EPILOGUE .end #else #define EPILOGUE .end REALNAME #endif #endif #if defined(OS_LINUX) 
|| defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) #define PROLOGUE \ .text; \ .align 16; \ .globl REALNAME ;\ .type REALNAME, @function; \ REALNAME: #ifdef PROFILE #define PROFCODE call mcount #else #define PROFCODE #endif #define EPILOGUE \ .size REALNAME, .-REALNAME; \ .section .note.GNU-stack,"",@progbits #endif #ifdef XDOUBLE #define FLD fldt #define FST fstpt #define FSTU fstt #define FMUL fmult #define FADD faddt #define FSUB fsubt #define FSUBR fsubrt #elif defined(DOUBLE) #define FLD fldl #define FST fstpl #define FSTU fstl #define FMUL fmull #define FADD faddl #define FSUB fsubl #define FSUBR fsubrl #else #define FLD flds #define FST fstps #define FSTU fsts #define FMUL fmuls #define FADD fadds #define FSUB fsubs #define FSUBR fsubrs #endif #endif #ifdef C_SUN #define ffreep fstp #endif #ifdef __APPLE__ #define ALIGN_2 .align 2 #define ALIGN_3 .align 3 #define ALIGN_4 .align 4 #define ALIGN_5 .align 5 #define ffreep fstp #endif #ifndef ALIGN_2 #define ALIGN_2 .align 4 #endif #ifndef ALIGN_3 #define ALIGN_3 .align 8 #endif #ifndef ALIGN_4 #define ALIGN_4 .align 16 #endif #ifndef ALIGN_5 #define ALIGN_5 .align 32 #endif #ifndef ALIGN_6 #define ALIGN_6 .align 64 #endif // ffreep %st(0). // Because Clang didn't support ffreep, we directly use the opcode. // Please check out http://www.sandpile.org/x86/opc_fpu.htm #ifndef ffreep #define ffreep .byte 0xdf, 0xc0 # #endif OpenBLAS-0.2.20/common_x86_64.h000066400000000000000000000261371313527062700156340ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #ifndef COMMON_X86 #define COMMON_X86 #ifndef ASSEMBLER #ifdef C_MSVC #include #endif #ifdef C_SUN #define __asm__ __asm #define __volatile__ #endif /* #ifdef HAVE_SSE2 #define MB __asm__ __volatile__ ("mfence"); #define WMB __asm__ __volatile__ ("sfence"); #else #define MB #define WMB #endif */ #define MB #define WMB static void __inline blas_lock(volatile BLASULONG *address){ #ifndef C_MSVC int ret; #else BLASULONG ret; #endif do { while (*address) {YIELDING;}; #ifndef C_MSVC __asm__ __volatile__( "xchgl %0, %1\n" : "=r"(ret), "=m"(*address) : "0"(1), "m"(*address) : "memory"); #else ret=InterlockedExchange64((volatile LONG64 *)(address), 1); #endif } while (ret); } #define BLAS_LOCK_DEFINED static __inline BLASULONG rpcc(void){ #ifdef C_MSVC return __rdtsc(); #else BLASULONG a, d; __asm__ __volatile__ ("rdtsc" : "=a" (a), "=d" (d)); return ((BLASULONG)a + ((BLASULONG)d << 32)); #endif } #define RPCC_DEFINED #define RPCC64BIT #ifndef C_MSVC static __inline BLASULONG getstackaddr(void){ BLASULONG addr; __asm__ __volatile__ ("movq %%rsp, %0" : "=r"(addr) : : "memory"); return addr; } #endif static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ #ifdef C_MSVC int cpuinfo[4]; __cpuid(cpuinfo, op); *eax=cpuinfo[0]; *ebx=cpuinfo[1]; *ecx=cpuinfo[2]; *edx=cpuinfo[3]; #else __asm__ __volatile__("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (op)); #endif } /* #define WHEREAMI */ static __inline int WhereAmI(void){ int eax, ebx, ecx, edx; int apicid; cpuid(1, &eax, &ebx, &ecx, &edx); apicid = BITMASK(ebx, 24, 0xff); return apicid; } #ifdef CORE_BARCELONA #define IFLUSH gotoblas_iflush() #define IFLUSH_HALF gotoblas_iflush_half() #endif #ifdef ENABLE_SSE_EXCEPTION #define IDEBUG_START \ { \ unsigned int fp_sse_mode, new_fp_mode; \ __asm__ __volatile__ ("stmxcsr %0" : "=m" (fp_sse_mode) : ); \ new_fp_mode = fp_sse_mode & ~0xd00; \ __asm__ __volatile__ ("ldmxcsr %0" : : "m" (new_fp_mode) ); #define IDEBUG_END \ __asm__ __volatile__ ("ldmxcsr %0" : : "m" (fp_sse_mode) ); \ } #endif #ifdef XDOUBLE #define GET_IMAGE(res) __asm__ __volatile__("fstpt %0" : "=m"(res) : : "memory") #elif defined(DOUBLE) #define GET_IMAGE(res) __asm__ __volatile__("movsd %%xmm1, %0" : "=m"(res) : : "memory") #else #define GET_IMAGE(res) __asm__ __volatile__("movss %%xmm1, %0" : "=m"(res) : : "memory") #endif #define GET_IMAGE_CANCEL #ifdef SMP #if defined(USE64BITINT) static __inline blasint blas_quickdivide(blasint x, blasint y){ return x / y; } #elif defined (C_MSVC) static __inline BLASLONG blas_quickdivide(BLASLONG x, BLASLONG y){ return x / y; } #else extern unsigned int blas_quick_divide_table[]; static __inline int blas_quickdivide(unsigned int x, unsigned int y){ unsigned int result; if (y <= 1) return x; y = blas_quick_divide_table[y]; __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); return result; } #endif #endif #endif #ifndef PAGESIZE #define PAGESIZE ( 4 << 10) #endif #define HUGE_PAGESIZE ( 2 << 20) #define BUFFER_SIZE (32 << 20) #define SEEK_ADDRESS #ifdef F_INTERFACE_G77 #define RETURN_BY_STACK #define NEED_F2CCONV #endif #ifdef F_INTERFACE_G95 #define RETURN_BY_PACKED #endif #ifdef F_INTERFACE_GFORT #ifdef OS_WINDOWS #ifndef DOUBLE #define RETURN_BY_REGS #else #define RETURN_BY_STACK #endif #else #define RETURN_BY_PACKED #endif #endif #ifdef F_INTERFACE_INTEL #define RETURN_BY_STACK #endif #ifdef F_INTERFACE_FUJITSU #define RETURN_BY_STACK #endif #ifdef F_INTERFACE_FLANG 
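/*
 * Illustrative sketch, not part of the archived file: what the mull-based
 * blas_quickdivide earlier in this header computes.  The inline asm forms the
 * 64-bit product of x with the table entry and keeps the high 32 bits, so a
 * division by a small y costs a single multiply.  The assumption that the
 * table entry approximates 2^32 / y is mine for the sketch; the real table is
 * generated elsewhere in the tree, and the ex_* names are hypothetical.
 */
#if 0   /* example only, never compiled */
static inline unsigned int ex_quickdivide(unsigned int x, unsigned int y,
                                          const unsigned int *ex_table) {
  if (y <= 1) return x;                            /* same early-out as above */
  unsigned long long p = (unsigned long long)x * ex_table[y];
  return (unsigned int)(p >> 32);                  /* "mull": high half of the product */
}
#endif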
#define RETURN_BY_STACK #endif #ifdef F_INTERFACE_PGI #define RETURN_BY_STACK #endif #ifdef F_INTERFACE_PATHSCALE #define RETURN_BY_PACKED #endif #ifdef F_INTERFACE_SUN #define RETURN_BY_PACKED #endif #ifdef ASSEMBLER #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) //Enable some optimazation for barcelona. #define BARCELONA_OPTIMIZATION #endif #if defined(HAVE_3DNOW) #define EMMS femms #elif defined(HAVE_MMX) #define EMMS emms #endif #ifndef EMMS #define EMMS #endif #define BRANCH .byte 0x3e #define NOBRANCH .byte 0x2e #define PADDING .byte 0x66 #ifdef OS_WINDOWS #define ARG1 %rcx #define ARG2 %rdx #define ARG3 %r8 #define ARG4 %r9 #else #define ARG1 %rdi #define ARG2 %rsi #define ARG3 %rdx #define ARG4 %rcx #define ARG5 %r8 #define ARG6 %r9 #endif #ifndef COMPLEX #ifdef XDOUBLE #define LOCAL_BUFFER_SIZE QLOCAL_BUFFER_SIZE #elif defined DOUBLE #define LOCAL_BUFFER_SIZE DLOCAL_BUFFER_SIZE #else #define LOCAL_BUFFER_SIZE SLOCAL_BUFFER_SIZE #endif #else #ifdef XDOUBLE #define LOCAL_BUFFER_SIZE XLOCAL_BUFFER_SIZE #elif defined DOUBLE #define LOCAL_BUFFER_SIZE ZLOCAL_BUFFER_SIZE #else #define LOCAL_BUFFER_SIZE CLOCAL_BUFFER_SIZE #endif #endif #if defined(OS_WINDOWS) #if LOCAL_BUFFER_SIZE > 16384 #define STACK_TOUCHING \ movl $0, 4096 * 4(%rsp);\ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif LOCAL_BUFFER_SIZE > 12288 #define STACK_TOUCHING \ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif LOCAL_BUFFER_SIZE > 8192 #define STACK_TOUCHING \ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif LOCAL_BUFFER_SIZE > 4096 #define STACK_TOUCHING \ movl $0, 4096 * 1(%rsp); #else #define STACK_TOUCHING #endif #else #define STACK_TOUCHING #endif #if defined(CORE2) #define movapd movaps #define andpd andps #define movlpd movlps #define movhpd movhps #endif #ifndef F_INTERFACE #define REALNAME ASMNAME #else #define REALNAME ASMFNAME #endif #ifdef OS_DARWIN #define PROLOGUE .text;.align 5; .globl REALNAME; REALNAME: #define EPILOGUE .subsections_via_symbols #define PROFCODE #endif #ifdef OS_WINDOWS #define SAVEREGISTERS \ subq $256, %rsp;\ movups %xmm6, 0(%rsp);\ movups %xmm7, 16(%rsp);\ movups %xmm8, 32(%rsp);\ movups %xmm9, 48(%rsp);\ movups %xmm10, 64(%rsp);\ movups %xmm11, 80(%rsp);\ movups %xmm12, 96(%rsp);\ movups %xmm13, 112(%rsp);\ movups %xmm14, 128(%rsp);\ movups %xmm15, 144(%rsp) #define RESTOREREGISTERS \ movups 0(%rsp), %xmm6;\ movups 16(%rsp), %xmm7;\ movups 32(%rsp), %xmm8;\ movups 48(%rsp), %xmm9;\ movups 64(%rsp), %xmm10;\ movups 80(%rsp), %xmm11;\ movups 96(%rsp), %xmm12;\ movups 112(%rsp), %xmm13;\ movups 128(%rsp), %xmm14;\ movups 144(%rsp), %xmm15;\ addq $256, %rsp #else #define SAVEREGISTERS #define RESTOREREGISTERS #endif #if defined(OS_WINDOWS) && !defined(C_PGI) #define PROLOGUE \ .text; \ .align 16; \ .globl REALNAME ;\ .def REALNAME;.scl 2;.type 32;.endef; \ REALNAME: #define PROFCODE #define EPILOGUE .end #endif #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI) #define PROLOGUE \ .text; \ .align 512; \ .globl REALNAME ;\ .type REALNAME, @function; \ REALNAME: #ifdef PROFILE #define PROFCODE call *mcount@GOTPCREL(%rip) #else #define PROFCODE #endif #define EPILOGUE \ .size REALNAME, .-REALNAME; \ .section .note.GNU-stack,"",@progbits #endif #endif #ifdef XDOUBLE #define FLD fldt #define FST fstpt #define MOVQ movq #elif defined(DOUBLE) #define FLD fldl #define FST fstpl #define FSTU fstl #define FMUL fmull 
#define FADD faddl #define MOVSD movsd #define MULSD mulsd #define MULPD mulpd #define CMPEQPD cmpeqpd #define COMISD comisd #define PSRLQ psrlq #define ANDPD andpd #define ADDPD addpd #define ADDSD addsd #define SUBPD subpd #define SUBSD subsd #define MOVQ movq #define MOVUPD movupd #define XORPD xorpd #else #define FLD flds #define FST fstps #define FSTU fsts #define FMUL fmuls #define FADD fadds #define MOVSD movss #define MULSD mulss #define MULPD mulps #define CMPEQPD cmpeqps #define COMISD comiss #define PSRLQ psrld #define ANDPD andps #define ADDPD addps #define ADDSD addss #define SUBPD subps #define SUBSD subss #define MOVQ movd #define MOVUPD movups #define XORPD xorps #endif #define HALT hlt #ifdef OS_DARWIN #define ALIGN_2 .align 2 #define ALIGN_3 .align 3 #define ALIGN_4 .align 4 #define ALIGN_5 .align 5 #define ffreep fstp #endif #ifndef ALIGN_2 #define ALIGN_2 .align 4 #endif #ifndef ALIGN_3 #define ALIGN_3 .align 8 #endif #ifndef ALIGN_4 #define ALIGN_4 .align 16 #endif #ifndef ALIGN_5 #define ALIGN_5 .align 32 #endif #ifndef ALIGN_6 #define ALIGN_6 .align 64 #endif // ffreep %st(0). // Because Clang didn't support ffreep, we directly use the opcode. // Please check out http://www.sandpile.org/x86/opc_fpu.htm #ifndef ffreep #define ffreep .byte 0xdf, 0xc0 # #endif #endif OpenBLAS-0.2.20/common_z.h000066400000000000000000000534411313527062700151450ustar00rootroot00000000000000#ifndef COMMON_Z_H #define COMMON_Z_H #ifndef DYNAMIC_ARCH #define ZAMAX_K zamax_k #define ZAMIN_K zamin_k #define ZMAX_K zmax_k #define ZMIN_K zmin_k #define IZAMAX_K izamax_k #define IZAMIN_K izamin_k #define IZMAX_K izmax_k #define IZMIN_K izmin_k #define ZASUM_K zasum_k #define ZAXPYU_K zaxpy_k #define ZAXPYC_K zaxpyc_k #define ZCOPY_K zcopy_k #define ZDOTU_K zdotu_k #define ZDOTC_K zdotc_k #define ZNRM2_K znrm2_k #define ZSCAL_K zscal_k #define ZSWAP_K zswap_k #define ZROT_K zdrot_k #define ZGEMV_N zgemv_n #define ZGEMV_T zgemv_t #define ZGEMV_R zgemv_r #define ZGEMV_C zgemv_c #define ZGEMV_O zgemv_o #define ZGEMV_U zgemv_u #define ZGEMV_S zgemv_s #define ZGEMV_D zgemv_d #define ZGERU_K zgeru_k #define ZGERC_K zgerc_k #define ZGERV_K zgerv_k #define ZGERD_K zgerd_k #define ZSYMV_U zsymv_U #define ZSYMV_L zsymv_L #define ZHEMV_U zhemv_U #define ZHEMV_L zhemv_L #define ZHEMV_V zhemv_V #define ZHEMV_M zhemv_M #define ZSYMV_THREAD_U zsymv_thread_U #define ZSYMV_THREAD_L zsymv_thread_L #define ZHEMV_THREAD_U zhemv_thread_U #define ZHEMV_THREAD_L zhemv_thread_L #define ZHEMV_THREAD_V zhemv_thread_V #define ZHEMV_THREAD_M zhemv_thread_M #define ZGEMM_ONCOPY zgemm_oncopy #define ZGEMM_OTCOPY zgemm_otcopy #if ZGEMM_DEFAULT_UNROLL_M == ZGEMM_DEFAULT_UNROLL_N #define ZGEMM_INCOPY zgemm_oncopy #define ZGEMM_ITCOPY zgemm_otcopy #else #define ZGEMM_INCOPY zgemm_incopy #define ZGEMM_ITCOPY zgemm_itcopy #endif #define ZTRMM_OUNUCOPY ztrmm_ounucopy #define ZTRMM_OUNNCOPY ztrmm_ounncopy #define ZTRMM_OUTUCOPY ztrmm_outucopy #define ZTRMM_OUTNCOPY ztrmm_outncopy #define ZTRMM_OLNUCOPY ztrmm_olnucopy #define ZTRMM_OLNNCOPY ztrmm_olnncopy #define ZTRMM_OLTUCOPY ztrmm_oltucopy #define ZTRMM_OLTNCOPY ztrmm_oltncopy #define ZTRSM_OUNUCOPY ztrsm_ounucopy #define ZTRSM_OUNNCOPY ztrsm_ounncopy #define ZTRSM_OUTUCOPY ztrsm_outucopy #define ZTRSM_OUTNCOPY ztrsm_outncopy #define ZTRSM_OLNUCOPY ztrsm_olnucopy #define ZTRSM_OLNNCOPY ztrsm_olnncopy #define ZTRSM_OLTUCOPY ztrsm_oltucopy #define ZTRSM_OLTNCOPY ztrsm_oltncopy #if ZGEMM_DEFAULT_UNROLL_M == ZGEMM_DEFAULT_UNROLL_N #define ZTRMM_IUNUCOPY ztrmm_ounucopy #define 
ZTRMM_IUNNCOPY ztrmm_ounncopy #define ZTRMM_IUTUCOPY ztrmm_outucopy #define ZTRMM_IUTNCOPY ztrmm_outncopy #define ZTRMM_ILNUCOPY ztrmm_olnucopy #define ZTRMM_ILNNCOPY ztrmm_olnncopy #define ZTRMM_ILTUCOPY ztrmm_oltucopy #define ZTRMM_ILTNCOPY ztrmm_oltncopy #define ZTRSM_IUNUCOPY ztrsm_ounucopy #define ZTRSM_IUNNCOPY ztrsm_ounncopy #define ZTRSM_IUTUCOPY ztrsm_outucopy #define ZTRSM_IUTNCOPY ztrsm_outncopy #define ZTRSM_ILNUCOPY ztrsm_olnucopy #define ZTRSM_ILNNCOPY ztrsm_olnncopy #define ZTRSM_ILTUCOPY ztrsm_oltucopy #define ZTRSM_ILTNCOPY ztrsm_oltncopy #else #define ZTRMM_IUNUCOPY ztrmm_iunucopy #define ZTRMM_IUNNCOPY ztrmm_iunncopy #define ZTRMM_IUTUCOPY ztrmm_iutucopy #define ZTRMM_IUTNCOPY ztrmm_iutncopy #define ZTRMM_ILNUCOPY ztrmm_ilnucopy #define ZTRMM_ILNNCOPY ztrmm_ilnncopy #define ZTRMM_ILTUCOPY ztrmm_iltucopy #define ZTRMM_ILTNCOPY ztrmm_iltncopy #define ZTRSM_IUNUCOPY ztrsm_iunucopy #define ZTRSM_IUNNCOPY ztrsm_iunncopy #define ZTRSM_IUTUCOPY ztrsm_iutucopy #define ZTRSM_IUTNCOPY ztrsm_iutncopy #define ZTRSM_ILNUCOPY ztrsm_ilnucopy #define ZTRSM_ILNNCOPY ztrsm_ilnncopy #define ZTRSM_ILTUCOPY ztrsm_iltucopy #define ZTRSM_ILTNCOPY ztrsm_iltncopy #endif #define ZGEMM_BETA zgemm_beta #define ZGEMM_KERNEL_N zgemm_kernel_n #define ZGEMM_KERNEL_L zgemm_kernel_l #define ZGEMM_KERNEL_R zgemm_kernel_r #define ZGEMM_KERNEL_B zgemm_kernel_b #define ZTRMM_KERNEL_LN ztrmm_kernel_LN #define ZTRMM_KERNEL_LT ztrmm_kernel_LT #define ZTRMM_KERNEL_LR ztrmm_kernel_LR #define ZTRMM_KERNEL_LC ztrmm_kernel_LC #define ZTRMM_KERNEL_RN ztrmm_kernel_RN #define ZTRMM_KERNEL_RT ztrmm_kernel_RT #define ZTRMM_KERNEL_RR ztrmm_kernel_RR #define ZTRMM_KERNEL_RC ztrmm_kernel_RC #define ZTRSM_KERNEL_LN ztrsm_kernel_LN #define ZTRSM_KERNEL_LT ztrsm_kernel_LT #define ZTRSM_KERNEL_LR ztrsm_kernel_LR #define ZTRSM_KERNEL_LC ztrsm_kernel_LC #define ZTRSM_KERNEL_RN ztrsm_kernel_RN #define ZTRSM_KERNEL_RT ztrsm_kernel_RT #define ZTRSM_KERNEL_RR ztrsm_kernel_RR #define ZTRSM_KERNEL_RC ztrsm_kernel_RC #define ZSYMM_OUTCOPY zsymm_outcopy #define ZSYMM_OLTCOPY zsymm_oltcopy #if ZGEMM_DEFAULT_UNROLL_M == ZGEMM_DEFAULT_UNROLL_N #define ZSYMM_IUTCOPY zsymm_outcopy #define ZSYMM_ILTCOPY zsymm_oltcopy #else #define ZSYMM_IUTCOPY zsymm_iutcopy #define ZSYMM_ILTCOPY zsymm_iltcopy #endif #define ZHEMM_OUTCOPY zhemm_outcopy #define ZHEMM_OLTCOPY zhemm_oltcopy #if ZGEMM_DEFAULT_UNROLL_M == ZGEMM_DEFAULT_UNROLL_N #define ZHEMM_IUTCOPY zhemm_outcopy #define ZHEMM_ILTCOPY zhemm_oltcopy #else #define ZHEMM_IUTCOPY zhemm_iutcopy #define ZHEMM_ILTCOPY zhemm_iltcopy #endif #define ZGEMM3M_ONCOPYB zgemm3m_oncopyb #define ZGEMM3M_ONCOPYR zgemm3m_oncopyr #define ZGEMM3M_ONCOPYI zgemm3m_oncopyi #define ZGEMM3M_OTCOPYB zgemm3m_otcopyb #define ZGEMM3M_OTCOPYR zgemm3m_otcopyr #define ZGEMM3M_OTCOPYI zgemm3m_otcopyi #define ZGEMM3M_INCOPYB zgemm3m_incopyb #define ZGEMM3M_INCOPYR zgemm3m_incopyr #define ZGEMM3M_INCOPYI zgemm3m_incopyi #define ZGEMM3M_ITCOPYB zgemm3m_itcopyb #define ZGEMM3M_ITCOPYR zgemm3m_itcopyr #define ZGEMM3M_ITCOPYI zgemm3m_itcopyi #define ZSYMM3M_ILCOPYB zsymm3m_ilcopyb #define ZSYMM3M_IUCOPYB zsymm3m_iucopyb #define ZSYMM3M_ILCOPYR zsymm3m_ilcopyr #define ZSYMM3M_IUCOPYR zsymm3m_iucopyr #define ZSYMM3M_ILCOPYI zsymm3m_ilcopyi #define ZSYMM3M_IUCOPYI zsymm3m_iucopyi #define ZSYMM3M_OLCOPYB zsymm3m_olcopyb #define ZSYMM3M_OUCOPYB zsymm3m_oucopyb #define ZSYMM3M_OLCOPYR zsymm3m_olcopyr #define ZSYMM3M_OUCOPYR zsymm3m_oucopyr #define ZSYMM3M_OLCOPYI zsymm3m_olcopyi #define ZSYMM3M_OUCOPYI zsymm3m_oucopyi #define ZHEMM3M_ILCOPYB 
zhemm3m_ilcopyb #define ZHEMM3M_IUCOPYB zhemm3m_iucopyb #define ZHEMM3M_ILCOPYR zhemm3m_ilcopyr #define ZHEMM3M_IUCOPYR zhemm3m_iucopyr #define ZHEMM3M_ILCOPYI zhemm3m_ilcopyi #define ZHEMM3M_IUCOPYI zhemm3m_iucopyi #define ZHEMM3M_OLCOPYB zhemm3m_olcopyb #define ZHEMM3M_OUCOPYB zhemm3m_oucopyb #define ZHEMM3M_OLCOPYR zhemm3m_olcopyr #define ZHEMM3M_OUCOPYR zhemm3m_oucopyr #define ZHEMM3M_OLCOPYI zhemm3m_olcopyi #define ZHEMM3M_OUCOPYI zhemm3m_oucopyi #define ZGEMM3M_KERNEL zgemm3m_kernel #define ZNEG_TCOPY zneg_tcopy #define ZLASWP_NCOPY zlaswp_ncopy #define ZAXPBY_K zaxpby_k #define ZOMATCOPY_K_CN zomatcopy_k_cn #define ZOMATCOPY_K_RN zomatcopy_k_rn #define ZOMATCOPY_K_CT zomatcopy_k_ct #define ZOMATCOPY_K_RT zomatcopy_k_rt #define ZOMATCOPY_K_CNC zomatcopy_k_cnc #define ZOMATCOPY_K_RNC zomatcopy_k_rnc #define ZOMATCOPY_K_CTC zomatcopy_k_ctc #define ZOMATCOPY_K_RTC zomatcopy_k_rtc #define ZIMATCOPY_K_CN zimatcopy_k_cn #define ZIMATCOPY_K_RN zimatcopy_k_rn #define ZIMATCOPY_K_CT zimatcopy_k_ct #define ZIMATCOPY_K_RT zimatcopy_k_rt #define ZIMATCOPY_K_CNC zimatcopy_k_cnc #define ZIMATCOPY_K_RNC zimatcopy_k_rnc #define ZIMATCOPY_K_CTC zimatcopy_k_ctc #define ZIMATCOPY_K_RTC zimatcopy_k_rtc #define ZGEADD_K zgeadd_k #else #define ZAMAX_K gotoblas -> zamax_k #define ZAMIN_K gotoblas -> zamin_k #define ZMAX_K gotoblas -> zmax_k #define ZMIN_K gotoblas -> zmin_k #define IZAMAX_K gotoblas -> izamax_k #define IZAMIN_K gotoblas -> izamin_k #define IZMAX_K gotoblas -> izmax_k #define IZMIN_K gotoblas -> izmin_k #define ZASUM_K gotoblas -> zasum_k #define ZAXPYU_K gotoblas -> zaxpy_k #define ZAXPYC_K gotoblas -> zaxpyc_k #define ZCOPY_K gotoblas -> zcopy_k #define ZDOTU_K gotoblas -> zdotu_k #define ZDOTC_K gotoblas -> zdotc_k #define ZNRM2_K gotoblas -> znrm2_k #define ZSCAL_K gotoblas -> zscal_k #define ZSWAP_K gotoblas -> zswap_k #define ZROT_K gotoblas -> zdrot_k #define ZGEMV_N gotoblas -> zgemv_n #define ZGEMV_T gotoblas -> zgemv_t #define ZGEMV_R gotoblas -> zgemv_r #define ZGEMV_C gotoblas -> zgemv_c #define ZGEMV_O gotoblas -> zgemv_o #define ZGEMV_U gotoblas -> zgemv_u #define ZGEMV_S gotoblas -> zgemv_s #define ZGEMV_D gotoblas -> zgemv_d #define ZGERU_K gotoblas -> zgeru_k #define ZGERC_K gotoblas -> zgerc_k #define ZGERV_K gotoblas -> zgerv_k #define ZGERD_K gotoblas -> zgerd_k #define ZSYMV_U gotoblas -> zsymv_U #define ZSYMV_L gotoblas -> zsymv_L #define ZHEMV_U gotoblas -> zhemv_U #define ZHEMV_L gotoblas -> zhemv_L #define ZHEMV_V gotoblas -> zhemv_V #define ZHEMV_M gotoblas -> zhemv_M #define ZSYMV_THREAD_U zsymv_thread_U #define ZSYMV_THREAD_L zsymv_thread_L #define ZHEMV_THREAD_U zhemv_thread_U #define ZHEMV_THREAD_L zhemv_thread_L #define ZHEMV_THREAD_V zhemv_thread_V #define ZHEMV_THREAD_M zhemv_thread_M #define ZGEMM_ONCOPY gotoblas -> zgemm_oncopy #define ZGEMM_OTCOPY gotoblas -> zgemm_otcopy #define ZGEMM_INCOPY gotoblas -> zgemm_incopy #define ZGEMM_ITCOPY gotoblas -> zgemm_itcopy #define ZTRMM_OUNUCOPY gotoblas -> ztrmm_ounucopy #define ZTRMM_OUTUCOPY gotoblas -> ztrmm_outucopy #define ZTRMM_OLNUCOPY gotoblas -> ztrmm_olnucopy #define ZTRMM_OLTUCOPY gotoblas -> ztrmm_oltucopy #define ZTRSM_OUNUCOPY gotoblas -> ztrsm_ounucopy #define ZTRSM_OUTUCOPY gotoblas -> ztrsm_outucopy #define ZTRSM_OLNUCOPY gotoblas -> ztrsm_olnucopy #define ZTRSM_OLTUCOPY gotoblas -> ztrsm_oltucopy #define ZTRMM_IUNUCOPY gotoblas -> ztrmm_iunucopy #define ZTRMM_IUTUCOPY gotoblas -> ztrmm_iutucopy #define ZTRMM_ILNUCOPY gotoblas -> ztrmm_ilnucopy #define ZTRMM_ILTUCOPY gotoblas -> ztrmm_iltucopy 
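/* In a DYNAMIC_ARCH build these names stop being fixed symbols: each macro
   expands to a member of the gotoblas parameter table, which is filled in
   once at library start-up for the CPU that was detected (the table layout
   lives in common_param.h).  Illustrative expansion only:

       ZAMAX_K(n, x, incx)   ->   gotoblas -> zamax_k(n, x, incx)

   so a single shared library can carry kernels for several x86-64 cores and
   dispatch to the right set without recompiling. */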
#define ZTRSM_IUNUCOPY gotoblas -> ztrsm_iunucopy #define ZTRSM_IUTUCOPY gotoblas -> ztrsm_iutucopy #define ZTRSM_ILNUCOPY gotoblas -> ztrsm_ilnucopy #define ZTRSM_ILTUCOPY gotoblas -> ztrsm_iltucopy #define ZTRMM_OUNNCOPY gotoblas -> ztrmm_ounncopy #define ZTRMM_OUTNCOPY gotoblas -> ztrmm_outncopy #define ZTRMM_OLNNCOPY gotoblas -> ztrmm_olnncopy #define ZTRMM_OLTNCOPY gotoblas -> ztrmm_oltncopy #define ZTRSM_OUNNCOPY gotoblas -> ztrsm_ounncopy #define ZTRSM_OUTNCOPY gotoblas -> ztrsm_outncopy #define ZTRSM_OLNNCOPY gotoblas -> ztrsm_olnncopy #define ZTRSM_OLTNCOPY gotoblas -> ztrsm_oltncopy #define ZTRMM_IUNNCOPY gotoblas -> ztrmm_iunncopy #define ZTRMM_IUTNCOPY gotoblas -> ztrmm_iutncopy #define ZTRMM_ILNNCOPY gotoblas -> ztrmm_ilnncopy #define ZTRMM_ILTNCOPY gotoblas -> ztrmm_iltncopy #define ZTRSM_IUNNCOPY gotoblas -> ztrsm_iunncopy #define ZTRSM_IUTNCOPY gotoblas -> ztrsm_iutncopy #define ZTRSM_ILNNCOPY gotoblas -> ztrsm_ilnncopy #define ZTRSM_ILTNCOPY gotoblas -> ztrsm_iltncopy #define ZGEMM_BETA gotoblas -> zgemm_beta #define ZGEMM_KERNEL_N gotoblas -> zgemm_kernel_n #define ZGEMM_KERNEL_L gotoblas -> zgemm_kernel_l #define ZGEMM_KERNEL_R gotoblas -> zgemm_kernel_r #define ZGEMM_KERNEL_B gotoblas -> zgemm_kernel_b #define ZTRMM_KERNEL_LN gotoblas -> ztrmm_kernel_LN #define ZTRMM_KERNEL_LT gotoblas -> ztrmm_kernel_LT #define ZTRMM_KERNEL_LR gotoblas -> ztrmm_kernel_LR #define ZTRMM_KERNEL_LC gotoblas -> ztrmm_kernel_LC #define ZTRMM_KERNEL_RN gotoblas -> ztrmm_kernel_RN #define ZTRMM_KERNEL_RT gotoblas -> ztrmm_kernel_RT #define ZTRMM_KERNEL_RR gotoblas -> ztrmm_kernel_RR #define ZTRMM_KERNEL_RC gotoblas -> ztrmm_kernel_RC #define ZTRSM_KERNEL_LN gotoblas -> ztrsm_kernel_LN #define ZTRSM_KERNEL_LT gotoblas -> ztrsm_kernel_LT #define ZTRSM_KERNEL_LR gotoblas -> ztrsm_kernel_LR #define ZTRSM_KERNEL_LC gotoblas -> ztrsm_kernel_LC #define ZTRSM_KERNEL_RN gotoblas -> ztrsm_kernel_RN #define ZTRSM_KERNEL_RT gotoblas -> ztrsm_kernel_RT #define ZTRSM_KERNEL_RR gotoblas -> ztrsm_kernel_RR #define ZTRSM_KERNEL_RC gotoblas -> ztrsm_kernel_RC #define ZSYMM_IUTCOPY gotoblas -> zsymm_iutcopy #define ZSYMM_ILTCOPY gotoblas -> zsymm_iltcopy #define ZSYMM_OUTCOPY gotoblas -> zsymm_outcopy #define ZSYMM_OLTCOPY gotoblas -> zsymm_oltcopy #define ZHEMM_OUTCOPY gotoblas -> zhemm_outcopy #define ZHEMM_OLTCOPY gotoblas -> zhemm_oltcopy #define ZHEMM_IUTCOPY gotoblas -> zhemm_iutcopy #define ZHEMM_ILTCOPY gotoblas -> zhemm_iltcopy #define ZGEMM3M_ONCOPYB gotoblas -> zgemm3m_oncopyb #define ZGEMM3M_ONCOPYR gotoblas -> zgemm3m_oncopyr #define ZGEMM3M_ONCOPYI gotoblas -> zgemm3m_oncopyi #define ZGEMM3M_OTCOPYB gotoblas -> zgemm3m_otcopyb #define ZGEMM3M_OTCOPYR gotoblas -> zgemm3m_otcopyr #define ZGEMM3M_OTCOPYI gotoblas -> zgemm3m_otcopyi #define ZGEMM3M_INCOPYB gotoblas -> zgemm3m_incopyb #define ZGEMM3M_INCOPYR gotoblas -> zgemm3m_incopyr #define ZGEMM3M_INCOPYI gotoblas -> zgemm3m_incopyi #define ZGEMM3M_ITCOPYB gotoblas -> zgemm3m_itcopyb #define ZGEMM3M_ITCOPYR gotoblas -> zgemm3m_itcopyr #define ZGEMM3M_ITCOPYI gotoblas -> zgemm3m_itcopyi #define ZSYMM3M_ILCOPYB gotoblas -> zsymm3m_ilcopyb #define ZSYMM3M_IUCOPYB gotoblas -> zsymm3m_iucopyb #define ZSYMM3M_ILCOPYR gotoblas -> zsymm3m_ilcopyr #define ZSYMM3M_IUCOPYR gotoblas -> zsymm3m_iucopyr #define ZSYMM3M_ILCOPYI gotoblas -> zsymm3m_ilcopyi #define ZSYMM3M_IUCOPYI gotoblas -> zsymm3m_iucopyi #define ZSYMM3M_OLCOPYB gotoblas -> zsymm3m_olcopyb #define ZSYMM3M_OUCOPYB gotoblas -> zsymm3m_oucopyb #define ZSYMM3M_OLCOPYR gotoblas -> zsymm3m_olcopyr 
#define ZSYMM3M_OUCOPYR gotoblas -> zsymm3m_oucopyr #define ZSYMM3M_OLCOPYI gotoblas -> zsymm3m_olcopyi #define ZSYMM3M_OUCOPYI gotoblas -> zsymm3m_oucopyi #define ZHEMM3M_ILCOPYB gotoblas -> zhemm3m_ilcopyb #define ZHEMM3M_IUCOPYB gotoblas -> zhemm3m_iucopyb #define ZHEMM3M_ILCOPYR gotoblas -> zhemm3m_ilcopyr #define ZHEMM3M_IUCOPYR gotoblas -> zhemm3m_iucopyr #define ZHEMM3M_ILCOPYI gotoblas -> zhemm3m_ilcopyi #define ZHEMM3M_IUCOPYI gotoblas -> zhemm3m_iucopyi #define ZHEMM3M_OLCOPYB gotoblas -> zhemm3m_olcopyb #define ZHEMM3M_OUCOPYB gotoblas -> zhemm3m_oucopyb #define ZHEMM3M_OLCOPYR gotoblas -> zhemm3m_olcopyr #define ZHEMM3M_OUCOPYR gotoblas -> zhemm3m_oucopyr #define ZHEMM3M_OLCOPYI gotoblas -> zhemm3m_olcopyi #define ZHEMM3M_OUCOPYI gotoblas -> zhemm3m_oucopyi #define ZGEMM3M_KERNEL gotoblas -> zgemm3m_kernel #define ZNEG_TCOPY gotoblas -> zneg_tcopy #define ZLASWP_NCOPY gotoblas -> zlaswp_ncopy #define ZAXPBY_K gotoblas -> zaxpby_k #define ZOMATCOPY_K_CN gotoblas -> zomatcopy_k_cn #define ZOMATCOPY_K_RN gotoblas -> zomatcopy_k_rn #define ZOMATCOPY_K_CT gotoblas -> zomatcopy_k_ct #define ZOMATCOPY_K_RT gotoblas -> zomatcopy_k_rt #define ZOMATCOPY_K_CNC gotoblas -> zomatcopy_k_cnc #define ZOMATCOPY_K_RNC gotoblas -> zomatcopy_k_rnc #define ZOMATCOPY_K_CTC gotoblas -> zomatcopy_k_ctc #define ZOMATCOPY_K_RTC gotoblas -> zomatcopy_k_rtc #define ZIMATCOPY_K_CN gotoblas -> zimatcopy_k_cn #define ZIMATCOPY_K_RN gotoblas -> zimatcopy_k_rn #define ZIMATCOPY_K_CT gotoblas -> zimatcopy_k_ct #define ZIMATCOPY_K_RT gotoblas -> zimatcopy_k_rt #define ZIMATCOPY_K_CNC gotoblas -> zimatcopy_k_cnc #define ZIMATCOPY_K_RNC gotoblas -> zimatcopy_k_rnc #define ZIMATCOPY_K_CTC gotoblas -> zimatcopy_k_ctc #define ZIMATCOPY_K_RTC gotoblas -> zimatcopy_k_rtc #define ZGEADD_K gotoblas -> zgeadd_k #endif #define ZGEMM_NN zgemm_nn #define ZGEMM_CN zgemm_cn #define ZGEMM_TN zgemm_tn #define ZGEMM_NC zgemm_nc #define ZGEMM_NT zgemm_nt #define ZGEMM_CC zgemm_cc #define ZGEMM_CT zgemm_ct #define ZGEMM_TC zgemm_tc #define ZGEMM_TT zgemm_tt #define ZGEMM_NR zgemm_nr #define ZGEMM_TR zgemm_tr #define ZGEMM_CR zgemm_cr #define ZGEMM_RN zgemm_rn #define ZGEMM_RT zgemm_rt #define ZGEMM_RC zgemm_rc #define ZGEMM_RR zgemm_rr #define ZSYMM_LU zsymm_LU #define ZSYMM_LL zsymm_LL #define ZSYMM_RU zsymm_RU #define ZSYMM_RL zsymm_RL #define ZHEMM_LU zhemm_LU #define ZHEMM_LL zhemm_LL #define ZHEMM_RU zhemm_RU #define ZHEMM_RL zhemm_RL #define ZSYRK_UN zsyrk_UN #define ZSYRK_UT zsyrk_UT #define ZSYRK_LN zsyrk_LN #define ZSYRK_LT zsyrk_LT #define ZSYRK_UR zsyrk_UN #define ZSYRK_UC zsyrk_UT #define ZSYRK_LR zsyrk_LN #define ZSYRK_LC zsyrk_LT #define ZSYRK_KERNEL_U zsyrk_kernel_U #define ZSYRK_KERNEL_L zsyrk_kernel_L #define ZHERK_UN zherk_UN #define ZHERK_LN zherk_LN #define ZHERK_UC zherk_UC #define ZHERK_LC zherk_LC #define ZHER2K_UN zher2k_UN #define ZHER2K_LN zher2k_LN #define ZHER2K_UC zher2k_UC #define ZHER2K_LC zher2k_LC #define ZSYR2K_UN zsyr2k_UN #define ZSYR2K_UT zsyr2k_UT #define ZSYR2K_LN zsyr2k_LN #define ZSYR2K_LT zsyr2k_LT #define ZSYR2K_UR zsyr2k_UN #define ZSYR2K_UC zsyr2k_UT #define ZSYR2K_LR zsyr2k_LN #define ZSYR2K_LC zsyr2k_LT #define ZSYR2K_KERNEL_U zsyr2k_kernel_U #define ZSYR2K_KERNEL_L zsyr2k_kernel_L #define ZTRMM_LNUU ztrmm_LNUU #define ZTRMM_LNUN ztrmm_LNUN #define ZTRMM_LNLU ztrmm_LNLU #define ZTRMM_LNLN ztrmm_LNLN #define ZTRMM_LTUU ztrmm_LTUU #define ZTRMM_LTUN ztrmm_LTUN #define ZTRMM_LTLU ztrmm_LTLU #define ZTRMM_LTLN ztrmm_LTLN #define ZTRMM_LRUU ztrmm_LRUU #define ZTRMM_LRUN ztrmm_LRUN #define 
ZTRMM_LRLU ztrmm_LRLU #define ZTRMM_LRLN ztrmm_LRLN #define ZTRMM_LCUU ztrmm_LCUU #define ZTRMM_LCUN ztrmm_LCUN #define ZTRMM_LCLU ztrmm_LCLU #define ZTRMM_LCLN ztrmm_LCLN #define ZTRMM_RNUU ztrmm_RNUU #define ZTRMM_RNUN ztrmm_RNUN #define ZTRMM_RNLU ztrmm_RNLU #define ZTRMM_RNLN ztrmm_RNLN #define ZTRMM_RTUU ztrmm_RTUU #define ZTRMM_RTUN ztrmm_RTUN #define ZTRMM_RTLU ztrmm_RTLU #define ZTRMM_RTLN ztrmm_RTLN #define ZTRMM_RRUU ztrmm_RRUU #define ZTRMM_RRUN ztrmm_RRUN #define ZTRMM_RRLU ztrmm_RRLU #define ZTRMM_RRLN ztrmm_RRLN #define ZTRMM_RCUU ztrmm_RCUU #define ZTRMM_RCUN ztrmm_RCUN #define ZTRMM_RCLU ztrmm_RCLU #define ZTRMM_RCLN ztrmm_RCLN #define ZTRSM_LNUU ztrsm_LNUU #define ZTRSM_LNUN ztrsm_LNUN #define ZTRSM_LNLU ztrsm_LNLU #define ZTRSM_LNLN ztrsm_LNLN #define ZTRSM_LTUU ztrsm_LTUU #define ZTRSM_LTUN ztrsm_LTUN #define ZTRSM_LTLU ztrsm_LTLU #define ZTRSM_LTLN ztrsm_LTLN #define ZTRSM_LRUU ztrsm_LRUU #define ZTRSM_LRUN ztrsm_LRUN #define ZTRSM_LRLU ztrsm_LRLU #define ZTRSM_LRLN ztrsm_LRLN #define ZTRSM_LCUU ztrsm_LCUU #define ZTRSM_LCUN ztrsm_LCUN #define ZTRSM_LCLU ztrsm_LCLU #define ZTRSM_LCLN ztrsm_LCLN #define ZTRSM_RNUU ztrsm_RNUU #define ZTRSM_RNUN ztrsm_RNUN #define ZTRSM_RNLU ztrsm_RNLU #define ZTRSM_RNLN ztrsm_RNLN #define ZTRSM_RTUU ztrsm_RTUU #define ZTRSM_RTUN ztrsm_RTUN #define ZTRSM_RTLU ztrsm_RTLU #define ZTRSM_RTLN ztrsm_RTLN #define ZTRSM_RRUU ztrsm_RRUU #define ZTRSM_RRUN ztrsm_RRUN #define ZTRSM_RRLU ztrsm_RRLU #define ZTRSM_RRLN ztrsm_RRLN #define ZTRSM_RCUU ztrsm_RCUU #define ZTRSM_RCUN ztrsm_RCUN #define ZTRSM_RCLU ztrsm_RCLU #define ZTRSM_RCLN ztrsm_RCLN #define ZGEMM_THREAD_NN zgemm_thread_nn #define ZGEMM_THREAD_CN zgemm_thread_cn #define ZGEMM_THREAD_TN zgemm_thread_tn #define ZGEMM_THREAD_NC zgemm_thread_nc #define ZGEMM_THREAD_NT zgemm_thread_nt #define ZGEMM_THREAD_CC zgemm_thread_cc #define ZGEMM_THREAD_CT zgemm_thread_ct #define ZGEMM_THREAD_TC zgemm_thread_tc #define ZGEMM_THREAD_TT zgemm_thread_tt #define ZGEMM_THREAD_NR zgemm_thread_nr #define ZGEMM_THREAD_TR zgemm_thread_tr #define ZGEMM_THREAD_CR zgemm_thread_cr #define ZGEMM_THREAD_RN zgemm_thread_rn #define ZGEMM_THREAD_RT zgemm_thread_rt #define ZGEMM_THREAD_RC zgemm_thread_rc #define ZGEMM_THREAD_RR zgemm_thread_rr #define ZSYMM_THREAD_LU zsymm_thread_LU #define ZSYMM_THREAD_LL zsymm_thread_LL #define ZSYMM_THREAD_RU zsymm_thread_RU #define ZSYMM_THREAD_RL zsymm_thread_RL #define ZHEMM_THREAD_LU zhemm_thread_LU #define ZHEMM_THREAD_LL zhemm_thread_LL #define ZHEMM_THREAD_RU zhemm_thread_RU #define ZHEMM_THREAD_RL zhemm_thread_RL #define ZSYRK_THREAD_UN zsyrk_thread_UN #define ZSYRK_THREAD_UT zsyrk_thread_UT #define ZSYRK_THREAD_LN zsyrk_thread_LN #define ZSYRK_THREAD_LT zsyrk_thread_LT #define ZSYRK_THREAD_UR zsyrk_thread_UN #define ZSYRK_THREAD_UC zsyrk_thread_UT #define ZSYRK_THREAD_LR zsyrk_thread_LN #define ZSYRK_THREAD_LC zsyrk_thread_LT #define ZHERK_THREAD_UN zherk_thread_UN #define ZHERK_THREAD_UT zherk_thread_UT #define ZHERK_THREAD_LN zherk_thread_LN #define ZHERK_THREAD_LT zherk_thread_LT #define ZHERK_THREAD_UR zherk_thread_UR #define ZHERK_THREAD_UC zherk_thread_UC #define ZHERK_THREAD_LR zherk_thread_LR #define ZHERK_THREAD_LC zherk_thread_LC #define ZGEMM3M_NN zgemm3m_nn #define ZGEMM3M_CN zgemm3m_cn #define ZGEMM3M_TN zgemm3m_tn #define ZGEMM3M_NC zgemm3m_nc #define ZGEMM3M_NT zgemm3m_nt #define ZGEMM3M_CC zgemm3m_cc #define ZGEMM3M_CT zgemm3m_ct #define ZGEMM3M_TC zgemm3m_tc #define ZGEMM3M_TT zgemm3m_tt #define ZGEMM3M_NR zgemm3m_nr #define ZGEMM3M_TR zgemm3m_tr #define 
ZGEMM3M_CR zgemm3m_cr #define ZGEMM3M_RN zgemm3m_rn #define ZGEMM3M_RT zgemm3m_rt #define ZGEMM3M_RC zgemm3m_rc #define ZGEMM3M_RR zgemm3m_rr #define ZGEMM3M_THREAD_NN zgemm3m_thread_nn #define ZGEMM3M_THREAD_CN zgemm3m_thread_cn #define ZGEMM3M_THREAD_TN zgemm3m_thread_tn #define ZGEMM3M_THREAD_NC zgemm3m_thread_nc #define ZGEMM3M_THREAD_NT zgemm3m_thread_nt #define ZGEMM3M_THREAD_CC zgemm3m_thread_cc #define ZGEMM3M_THREAD_CT zgemm3m_thread_ct #define ZGEMM3M_THREAD_TC zgemm3m_thread_tc #define ZGEMM3M_THREAD_TT zgemm3m_thread_tt #define ZGEMM3M_THREAD_NR zgemm3m_thread_nr #define ZGEMM3M_THREAD_TR zgemm3m_thread_tr #define ZGEMM3M_THREAD_CR zgemm3m_thread_cr #define ZGEMM3M_THREAD_RN zgemm3m_thread_rn #define ZGEMM3M_THREAD_RT zgemm3m_thread_rt #define ZGEMM3M_THREAD_RC zgemm3m_thread_rc #define ZGEMM3M_THREAD_RR zgemm3m_thread_rr #define ZSYMM3M_LU zsymm3m_LU #define ZSYMM3M_LL zsymm3m_LL #define ZSYMM3M_RU zsymm3m_RU #define ZSYMM3M_RL zsymm3m_RL #define ZSYMM3M_THREAD_LU zsymm3m_thread_LU #define ZSYMM3M_THREAD_LL zsymm3m_thread_LL #define ZSYMM3M_THREAD_RU zsymm3m_thread_RU #define ZSYMM3M_THREAD_RL zsymm3m_thread_RL #define ZHEMM3M_LU zhemm3m_LU #define ZHEMM3M_LL zhemm3m_LL #define ZHEMM3M_RU zhemm3m_RU #define ZHEMM3M_RL zhemm3m_RL #define ZHEMM3M_THREAD_LU zhemm3m_thread_LU #define ZHEMM3M_THREAD_LL zhemm3m_thread_LL #define ZHEMM3M_THREAD_RU zhemm3m_thread_RU #define ZHEMM3M_THREAD_RL zhemm3m_thread_RL #endif OpenBLAS-0.2.20/common_zarch.h000066400000000000000000000074741313527062700160100ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ #ifndef COMMON_ZARCH #define COMMON_ZARCH #define MB //__asm__ __volatile__ ("dmb ish" : : : "memory") #define WMB //__asm__ __volatile__ ("dmb ishst" : : : "memory") #define INLINE inline #define RETURN_BY_COMPLEX #ifndef ASSEMBLER /* static void __inline blas_lock(volatile BLASULONG *address){ BLASULONG ret; do { while (*address) {YIELDING;}; __asm__ __volatile__( "mov x4, #1 \n\t" "1: \n\t" "ldaxr x2, [%1] \n\t" "cbnz x2, 1b \n\t" "2: \n\t" "stxr w3, x4, [%1] \n\t" "cbnz w3, 1b \n\t" "mov %0, #0 \n\t" : "=r"(ret), "=r"(address) : "1"(address) : "memory", "x2" , "x3", "x4" ); } while (ret); } */ //#define BLAS_LOCK_DEFINED static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } #if defined(DOUBLE) #define GET_IMAGE(res) __asm__ __volatile__("str d1, %0" : "=m"(res) : : "memory") #else #define GET_IMAGE(res) __asm__ __volatile__("str s1, %0" : "=m"(res) : : "memory") #endif #define GET_IMAGE_CANCEL #endif #ifndef F_INTERFACE #define REALNAME ASMNAME #else #define REALNAME ASMFNAME #endif #if defined(ASSEMBLER) && !defined(NEEDPARAM) #define PROLOGUE \ .text ;\ .align 256 ;\ .global REALNAME ;\ .type REALNAME, %function ;\ REALNAME: #define EPILOGUE #define PROFCODE #endif #define SEEK_ADDRESS #ifndef PAGESIZE #define PAGESIZE ( 4 << 10) #endif #define HUGE_PAGESIZE ( 4 << 20) #if defined(CORTEXA57) #define BUFFER_SIZE (20 << 20) #else #define BUFFER_SIZE (16 << 20) #endif #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) #ifndef MAP_ANONYMOUS #define MAP_ANONYMOUS MAP_ANON #endif #endif OpenBLAS-0.2.20/cpuid.S000066400000000000000000000057521313527062700144050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #if defined(__APPLE__) && defined(__i386__) /* Quick hack for Darwin/x86 */ .text .globl _cpuid _cpuid: pushl %esi pushl %ebx movl 12(%esp), %eax cpuid movl 16(%esp), %esi movl %eax, (%esi) movl 20(%esp), %esi movl %ebx, (%esi) movl 24(%esp), %esi movl %ecx, (%esi) movl 28(%esp), %esi movl %edx, (%esi) popl %ebx popl %esi ret .subsections_via_symbols #endif OpenBLAS-0.2.20/cpuid.h000066400000000000000000000161171313527062700144270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #ifndef CPUID_H #define CPUID_H #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) #define INTEL_AMD #endif #define VENDOR_INTEL 1 #define VENDOR_UMC 2 #define VENDOR_AMD 3 #define VENDOR_CYRIX 4 #define VENDOR_NEXGEN 5 #define VENDOR_CENTAUR 6 #define VENDOR_RISE 7 #define VENDOR_SIS 8 #define VENDOR_TRANSMETA 9 #define VENDOR_NSC 10 #define VENDOR_UNKNOWN 99 #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #define FAMILY_80486 4 #define FAMILY_P5 5 #define FAMILY_P6 6 #define FAMILY_PM 7 #define FAMILY_IA64 8 #ifdef INTEL_AMD #define GET_EXFAMILY 1 #define GET_EXMODEL 2 #define GET_TYPE 3 #define GET_FAMILY 4 #define GET_MODEL 5 #define GET_APICID 6 #define GET_LCOUNT 7 #define GET_CHUNKS 8 #define GET_STEPPING 9 #define GET_BLANDID 10 #define GET_FEATURE 11 #define GET_NUMSHARE 12 #define GET_NUMCORES 13 #endif #ifdef __ia64__ #define GET_ARCHREV 1 #define GET_FAMILY 2 #define GET_MODEL 3 #define GET_REVISION 4 #define GET_NUMBER 5 #endif #define CORE_UNKNOWN 0 #define CORE_80486 1 #define CORE_P5 2 #define CORE_P6 3 #define CORE_KATMAI 4 #define CORE_COPPERMINE 5 #define CORE_NORTHWOOD 6 #define CORE_PRESCOTT 7 #define CORE_BANIAS 8 #define CORE_ATHLON 9 #define CORE_OPTERON 10 #define CORE_BARCELONA 11 #define CORE_VIAC3 12 #define CORE_YONAH 13 #define CORE_CORE2 14 #define CORE_PENRYN 15 #define CORE_DUNNINGTON 16 #define CORE_NEHALEM 17 #define CORE_ATOM 18 #define CORE_NANO 19 #define CORE_SANDYBRIDGE 20 #define CORE_BOBCAT 21 #define CORE_BULLDOZER 22 #define CORE_PILEDRIVER 23 #define CORE_HASWELL 24 #define CORE_STEAMROLLER 25 #define CORE_EXCAVATOR 26 #define CORE_ZEN 27 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) #define HAVE_SSE3 (1 << 2) #define HAVE_SSSE3 (1 << 3) #define HAVE_SSE4_1 (1 << 4) #define HAVE_SSE4_2 (1 << 5) #define HAVE_SSE4A (1 << 6) #define HAVE_SSE5 (1 << 7) #define HAVE_MMX (1 << 8) #define HAVE_3DNOW (1 << 9) #define HAVE_3DNOWEX (1 << 10) #define HAVE_CMOV (1 << 11) #define HAVE_PSE (1 << 12) #define HAVE_CFLUSH (1 << 13) #define HAVE_HIT (1 << 14) #define HAVE_MISALIGNSSE (1 << 15) #define HAVE_128BITFPU (1 << 16) #define HAVE_FASTMOVU (1 << 17) #define HAVE_AVX (1 << 18) #define HAVE_FMA4 (1 << 19) #define HAVE_FMA3 (1 << 20) #define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_D 2 #define CACHE_INFO_L2 3 #define CACHE_INFO_L3 4 #define CACHE_INFO_L1_ITB 5 #define CACHE_INFO_L1_DTB 6 #define CACHE_INFO_L1_LITB 7 #define CACHE_INFO_L1_LDTB 8 #define CACHE_INFO_L2_ITB 9 #define CACHE_INFO_L2_DTB 10 #define CACHE_INFO_L2_LITB 11 #define CACHE_INFO_L2_LDTB 12 typedef struct { int size; int associative; int linesize; int shared; } cache_info_t; #define CPUTYPE_UNKNOWN 0 #define CPUTYPE_INTEL_UNKNOWN 1 #define CPUTYPE_UMC_UNKNOWN 2 #define CPUTYPE_AMD_UNKNOWN 3 #define CPUTYPE_CYRIX_UNKNOWN 4 #define CPUTYPE_NEXGEN_UNKNOWN 5 #define CPUTYPE_CENTAUR_UNKNOWN 6 #define CPUTYPE_RISE_UNKNOWN 7 #define CPUTYPE_SIS_UNKNOWN 8 #define CPUTYPE_TRANSMETA_UNKNOWN 9 #define CPUTYPE_NSC_UNKNOWN 10 #define CPUTYPE_80386 11 #define CPUTYPE_80486 12 #define CPUTYPE_PENTIUM 13 #define CPUTYPE_PENTIUM2 14 #define CPUTYPE_PENTIUM3 15 #define CPUTYPE_PENTIUMM 16 #define CPUTYPE_PENTIUM4 17 #define CPUTYPE_CORE2 18 #define CPUTYPE_PENRYN 19 #define CPUTYPE_DUNNINGTON 20 #define CPUTYPE_NEHALEM 21 #define CPUTYPE_ATOM 22 #define CPUTYPE_ITANIUM 23 #define CPUTYPE_ITANIUM2 24 #define CPUTYPE_AMD5X86 25 #define CPUTYPE_AMDK6 26 #define CPUTYPE_ATHLON 27 #define 
CPUTYPE_DURON 28 #define CPUTYPE_OPTERON 29 #define CPUTYPE_BARCELONA 30 #define CPUTYPE_SHANGHAI 31 #define CPUTYPE_ISTANBUL 32 #define CPUTYPE_CYRIX5X86 33 #define CPUTYPE_CYRIXM1 34 #define CPUTYPE_CYRIXM2 35 #define CPUTYPE_NEXGENNX586 36 #define CPUTYPE_CENTAURC6 37 #define CPUTYPE_RISEMP6 38 #define CPUTYPE_SYS55X 39 #define CPUTYPE_CRUSOETM3X 40 #define CPUTYPE_NSGEODE 41 #define CPUTYPE_VIAC3 42 #define CPUTYPE_NANO 43 #define CPUTYPE_SANDYBRIDGE 44 #define CPUTYPE_BOBCAT 45 #define CPUTYPE_BULLDOZER 46 #define CPUTYPE_PILEDRIVER 47 #define CPUTYPE_HASWELL 48 #define CPUTYPE_STEAMROLLER 49 #define CPUTYPE_EXCAVATOR 50 #define CPUTYPE_ZEN 51 #endif OpenBLAS-0.2.20/cpuid_alpha.c000066400000000000000000000077401313527062700155710ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #if defined(__alpha) && defined(__DECC) #include #endif int implver(void){ int arch; #ifndef __DECC asm __volatile__("implver %0" : "=r"(arch) : : "memory"); #else arch = asm("implver %v0"); #endif return arch; } void get_architecture(void){ printf("ALPHA"); } void get_subarchitecture(void){ printf("ev%d", implver() + 4); } void get_subdirname(void){ printf("alpha"); } void get_cpuconfig(void){ printf("#define EV%d\n", implver() + 4); switch (implver()){ case 0: printf("#define L1_DATA_SIZE 16384\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 2097152\n"); printf("#define L2_LINESIZE 32\n"); printf("#define DTB_DEFAULT_ENTRIES 32\n"); printf("#define DTB_SIZE 8192\n"); break; case 1: printf("#define L1_DATA_SIZE 16384\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 2097152\n"); printf("#define L2_LINESIZE 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 8192\n"); break; case 2: printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_LINESIZE 64\n"); printf("#define L2_SIZE 4194304\n"); printf("#define L2_LINESIZE 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 8192\n"); break; } } void get_libname(void){ printf("ev%d\n", implver() + 4); } OpenBLAS-0.2.20/cpuid_arm.c000066400000000000000000000163031313527062700152560ustar00rootroot00000000000000/************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include #define CPU_UNKNOWN 0 #define CPU_ARMV6 1 #define CPU_ARMV7 2 #define CPU_CORTEXA9 3 #define CPU_CORTEXA15 4 static char *cpuname[] = { "UNKOWN", "ARMV6", "ARMV7", "CORTEXA9", "CORTEXA15" }; static char *cpuname_lower[] = { "unknown", "armv6", "armv7", "cortexa9", "cortexa15" }; int get_feature(char *search) { #ifdef linux FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; infile = fopen("/proc/cpuinfo", "r"); while (fgets(buffer, sizeof(buffer), infile)) { if (!strncmp("Features", buffer, 8)) { p = strchr(buffer, ':') + 2; break; } } fclose(infile); if( p == NULL ) return 0; t = strtok(p," "); while( t = strtok(NULL," ")) { if (!strcmp(t, search)) { return(1); } } #endif return(0); } int detect(void) { #ifdef linux FILE *infile; char buffer[512], *p; p = (char *) NULL ; infile = fopen("/proc/cpuinfo", "r"); while (fgets(buffer, sizeof(buffer), infile)) { if (!strncmp("CPU part", buffer, 8)) { p = strchr(buffer, ':') + 2; break; } } fclose(infile); if(p != NULL) { if (strstr(p, "0xc09")) { return CPU_CORTEXA9; } if (strstr(p, "0xc0f")) { return CPU_CORTEXA15; } if (strstr(p, "0xd07")) { return CPU_ARMV7; //ARMV8 on 32-bit } } p = (char *) NULL ; infile = fopen("/proc/cpuinfo", "r"); while (fgets(buffer, sizeof(buffer), infile)) { if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9))) { p = strchr(buffer, ':') + 2; break; } } fclose(infile); if(p != NULL) { if (strstr(p, "ARMv7")) { if ( get_feature("vfpv4")) return CPU_ARMV7; if ( get_feature("vfpv3")) return CPU_ARMV7; if ( get_feature("vfp")) return CPU_ARMV6; } if (strstr(p, "ARMv6")) { if ( get_feature("vfp")) return CPU_ARMV6; } } p = (char *) NULL ; infile = fopen("/proc/cpuinfo", "r"); while (fgets(buffer, sizeof(buffer), infile)) { if ((!strncmp("CPU architecture", buffer, 16))) { p = strchr(buffer, ':') + 2; break; } } fclose(infile); if(p != NULL) { if (strstr(p, "8")) { return CPU_ARMV7; //ARMV8 on 32-bit } } #endif return CPU_UNKNOWN; } char *get_corename(void) { return cpuname[detect()]; } void get_architecture(void) { printf("ARM"); } void get_subarchitecture(void) { int d = detect(); printf("%s", cpuname[d]); } void get_subdirname(void) { printf("arm"); } void get_cpuconfig(void) { int d = detect(); switch (d) { case CPU_CORTEXA9: printf("#define CORTEXA9\n"); printf("#define ARMV7\n"); printf("#define HAVE_VFP\n"); printf("#define HAVE_VFPV3\n"); if ( get_feature("neon")) printf("#define HAVE_NEON\n"); if ( get_feature("vfpv4")) printf("#define HAVE_VFPV4\n"); printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 1048576\n"); printf("#define L2_LINESIZE 32\n"); printf("#define DTB_DEFAULT_ENTRIES 128\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); break; case CPU_CORTEXA15: printf("#define CORTEXA15\n"); printf("#define ARMV7\n"); printf("#define HAVE_VFP\n"); printf("#define HAVE_VFPV3\n"); if ( get_feature("neon")) printf("#define HAVE_NEON\n"); if ( get_feature("vfpv4")) printf("#define HAVE_VFPV4\n"); printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 1048576\n"); printf("#define L2_LINESIZE 32\n"); printf("#define DTB_DEFAULT_ENTRIES 128\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); break; case CPU_ARMV7: printf("#define ARMV7\n"); printf("#define HAVE_VFP\n"); printf("#define HAVE_VFPV3\n"); if ( get_feature("neon")) printf("#define 
HAVE_NEON\n"); if ( get_feature("vfpv4")) printf("#define HAVE_VFPV4\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); printf("#define L2_LINESIZE 32\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); break; case CPU_ARMV6: printf("#define ARMV6\n"); printf("#define HAVE_VFP\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); printf("#define L2_LINESIZE 32\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); break; } } void get_libname(void) { int d = detect(); printf("%s", cpuname_lower[d]); } void get_features(void) { #ifdef linux FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; infile = fopen("/proc/cpuinfo", "r"); while (fgets(buffer, sizeof(buffer), infile)) { if (!strncmp("Features", buffer, 8)) { p = strchr(buffer, ':') + 2; break; } } fclose(infile); if( p == NULL ) return; t = strtok(p," "); while( t = strtok(NULL," ")) { if (!strcmp(t, "vfp")) { printf("HAVE_VFP=1\n"); continue; } if (!strcmp(t, "vfpv3")) { printf("HAVE_VFPV3=1\n"); continue; } if (!strcmp(t, "vfpv4")) { printf("HAVE_VFPV4=1\n"); continue; } if (!strcmp(t, "neon")) { printf("HAVE_NEON=1\n"); continue; } } #endif return; } OpenBLAS-0.2.20/cpuid_arm64.c000066400000000000000000000205271313527062700154330ustar00rootroot00000000000000/************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include #define CPU_UNKNOWN 0 #define CPU_ARMV8 1 #define CPU_CORTEXA57 2 #define CPU_VULCAN 3 #define CPU_THUNDERX 4 #define CPU_THUNDERX2T99 5 static char *cpuname[] = { "UNKNOWN", "ARMV8" , "CORTEXA57", "VULCAN", "THUNDERX", "THUNDERX2T99" }; static char *cpuname_lower[] = { "unknown", "armv8" , "cortexa57", "vulcan", "thunderx", "thunderx2t99" }; int get_feature(char *search) { #ifdef linux FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; infile = fopen("/proc/cpuinfo", "r"); while (fgets(buffer, sizeof(buffer), infile)) { if (!strncmp("Features", buffer, 8)) { p = strchr(buffer, ':') + 2; break; } } fclose(infile); if( p == NULL ) return 0; t = strtok(p," "); while( t = strtok(NULL," ")) { if (!strcmp(t, search)) { return(1); } } #endif return(0); } int detect(void) { #ifdef linux FILE *infile; char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL; p = (char *) NULL ; infile = fopen("/proc/cpuinfo", "r"); while (fgets(buffer, sizeof(buffer), infile)) { if ((cpu_part != NULL) && (cpu_implementer != NULL)) { break; } if ((cpu_part == NULL) && !strncmp("CPU part", buffer, 8)) { cpu_part = strchr(buffer, ':') + 2; cpu_part = strdup(cpu_part); } else if ((cpu_implementer == NULL) && !strncmp("CPU implementer", buffer, 15)) { cpu_implementer = strchr(buffer, ':') + 2; cpu_implementer = strdup(cpu_implementer); } } fclose(infile); if(cpu_part != NULL && cpu_implementer != NULL) { if (strstr(cpu_part, "0xd07") && strstr(cpu_implementer, "0x41")) return CPU_CORTEXA57; else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42")) return CPU_VULCAN; else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43")) return CPU_THUNDERX; else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */ return CPU_THUNDERX2T99; } p = (char *) NULL ; infile = fopen("/proc/cpuinfo", "r"); while (fgets(buffer, sizeof(buffer), infile)) { if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)) || (!strncmp("CPU architecture", buffer, 16))) { p = strchr(buffer, ':') + 2; break; } } fclose(infile); if(p != NULL) { if (strstr(p, "AArch64")) { return CPU_ARMV8; } } #endif return CPU_UNKNOWN; } char *get_corename(void) { return cpuname[detect()]; } void get_architecture(void) { printf("ARM64"); } void get_subarchitecture(void) { int d = detect(); printf("%s", cpuname[d]); } void get_subdirname(void) { printf("arm64"); } void get_cpuconfig(void) { int d = detect(); switch (d) { case CPU_ARMV8: printf("#define ARMV8\n"); printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_LINESIZE 64\n"); printf("#define L2_SIZE 262144\n"); printf("#define L2_LINESIZE 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); break; case CPU_VULCAN: printf("#define VULCAN \n"); printf("#define HAVE_VFP \n"); printf("#define HAVE_VFPV3 \n"); printf("#define HAVE_NEON \n"); printf("#define HAVE_VFPV4 \n"); printf("#define L1_CODE_SIZE 32768 \n"); printf("#define L1_CODE_LINESIZE 64 \n"); printf("#define L1_CODE_ASSOCIATIVE 8 \n"); printf("#define L1_DATA_SIZE 32768 \n"); printf("#define L1_DATA_LINESIZE 64 \n"); printf("#define L1_DATA_ASSOCIATIVE 8 \n"); printf("#define L2_SIZE 262144 \n"); printf("#define L2_LINESIZE 64 \n"); printf("#define L2_ASSOCIATIVE 8 \n"); printf("#define L3_SIZE 33554432 \n"); printf("#define L3_LINESIZE 64 \n"); printf("#define L3_ASSOCIATIVE 32 \n"); printf("#define 
DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_SIZE 4096 \n"); break; case CPU_CORTEXA57: printf("#define CORTEXA57\n"); printf("#define HAVE_VFP\n"); printf("#define HAVE_VFPV3\n"); printf("#define HAVE_NEON\n"); printf("#define HAVE_VFPV4\n"); printf("#define L1_CODE_SIZE 49152\n"); printf("#define L1_CODE_LINESIZE 64\n"); printf("#define L1_CODE_ASSOCIATIVE 3\n"); printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_LINESIZE 64\n"); printf("#define L1_DATA_ASSOCIATIVE 2\n"); printf("#define L2_SIZE 2097152\n"); printf("#define L2_LINESIZE 64\n"); printf("#define L2_ASSOCIATIVE 16\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); break; case CPU_THUNDERX: printf("#define ARMV8\n"); printf("#define THUNDERX\n"); printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_LINESIZE 128\n"); printf("#define L2_SIZE 16777216\n"); printf("#define L2_LINESIZE 128\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 16\n"); break; case CPU_THUNDERX2T99: printf("#define VULCAN \n"); printf("#define HAVE_VFP \n"); printf("#define HAVE_VFPV3 \n"); printf("#define HAVE_NEON \n"); printf("#define HAVE_VFPV4 \n"); printf("#define L1_CODE_SIZE 32768 \n"); printf("#define L1_CODE_LINESIZE 64 \n"); printf("#define L1_CODE_ASSOCIATIVE 8 \n"); printf("#define L1_DATA_SIZE 32768 \n"); printf("#define L1_DATA_LINESIZE 64 \n"); printf("#define L1_DATA_ASSOCIATIVE 8 \n"); printf("#define L2_SIZE 262144 \n"); printf("#define L2_LINESIZE 64 \n"); printf("#define L2_ASSOCIATIVE 8 \n"); printf("#define L3_SIZE 33554432 \n"); printf("#define L3_LINESIZE 64 \n"); printf("#define L3_ASSOCIATIVE 32 \n"); printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_SIZE 4096 \n"); break; } } void get_libname(void) { int d = detect(); printf("%s", cpuname_lower[d]); } void get_features(void) { #ifdef linux FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; infile = fopen("/proc/cpuinfo", "r"); while (fgets(buffer, sizeof(buffer), infile)) { if (!strncmp("Features", buffer, 8)) { p = strchr(buffer, ':') + 2; break; } } fclose(infile); if( p == NULL ) return; t = strtok(p," "); while( t = strtok(NULL," ")) { } #endif return; } OpenBLAS-0.2.20/cpuid_ia64.c000066400000000000000000000113051313527062700152370ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include #include "cpuid.h" #ifdef __ECC #include #endif static inline unsigned long cpuid(unsigned long regnum){ unsigned long value; #ifdef __ECC value = __getIndReg(_IA64_REG_INDR_CPUID, regnum); #else asm ("mov %0=cpuid[%r1]" : "=r"(value) : "rO"(regnum)); #endif return value; } int have_cpuid(void){ return 1;} int get_vendor(void){ unsigned long cpuid0, cpuid1; char vendor[18]; cpuid0 = cpuid(0); cpuid1 = cpuid(1); *(unsigned long *)(&vendor[0]) = cpuid0; *(unsigned long *)(&vendor[8]) = cpuid1; vendor[17] = (char)0; if (!strcmp(vendor, "GenuineIntel")) return VENDOR_INTEL; return VENDOR_UNKNOWN; } int get_cputype(int gettype){ unsigned long cpuid3; cpuid3 = cpuid(3); switch (gettype) { case GET_ARCHREV : return BITMASK(cpuid3, 32, 0xff); case GET_FAMILY : return BITMASK(cpuid3, 24, 0xff); case GET_MODEL : return BITMASK(cpuid3, 16, 0xff); case GET_REVISION : return BITMASK(cpuid3, 8, 0xff); case GET_NUMBER : return BITMASK(cpuid3, 0, 0xff); } return 0; } char *get_cpunamechar(void){ if (get_cputype(GET_FAMILY) == 0x07) return "ITANIUM"; if (get_cputype(GET_FAMILY) == 0x1f) return "ITANIUM2"; if (get_cputype(GET_FAMILY) == 0x20) return "ITANIUM2"; return "UNKNOWN"; } char *get_libname(void){ if (get_cputype(GET_FAMILY) == 0x07) { printf("itanium"); return NULL;} if (get_cputype(GET_FAMILY) == 0x1f) { printf("itanium2"); return NULL;} if (get_cputype(GET_FAMILY) == 0x20) { printf("itanium2"); return NULL;} printf("UNKNOWN"); return NULL; } void get_architecture(void){ printf("IA64"); } void get_subarchitecture(void){ printf("%s", get_cpunamechar()); } void get_subdirname(void){ printf("ia64"); } void get_cpuconfig(void){ printf("#define %s\n", get_cpunamechar()); printf("#define L1_DATA_SIZE 262144\n"); printf("#define L1_DATA_LINESIZE 128\n"); printf("#define L2_SIZE 1572864\n"); printf("#define L2_LINESIZE 128\n"); printf("#define DTB_SIZE 16384\n"); printf("#define DTB_DEFAULT_ENTRIES 128\n"); } OpenBLAS-0.2.20/cpuid_mips.c000066400000000000000000000147111313527062700154500ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define CPU_UNKNOWN 0 #define CPU_P5600 1 static char *cpuname[] = { "UNKOWN", "P5600" }; int detect(void){ #ifdef linux FILE *infile; char buffer[512], *p; p = (char *)NULL; infile = fopen("/proc/cpuinfo", "r"); while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("cpu", buffer, 3)){ p = strchr(buffer, ':') + 2; #if 0 fprintf(stderr, "%s\n", p); #endif break; } } fclose(infile); if(p != NULL){ if (strstr(p, "Loongson-3A")){ return CPU_LOONGSON3A; }else if(strstr(p, "Loongson-3B")){ return CPU_LOONGSON3B; }else if (strstr(p, "Loongson-3")){ infile = fopen("/proc/cpuinfo", "r"); p = (char *)NULL; while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("system type", buffer, 11)){ p = strchr(buffer, ':') + 2; break; } } fclose(infile); if (strstr(p, "loongson3a")) return CPU_LOONGSON3A; }else{ return CPU_UNKNOWN; } } //Check model name for Loongson3 infile = fopen("/proc/cpuinfo", "r"); p = (char *)NULL; while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("model name", buffer, 10)){ p = strchr(buffer, ':') + 2; break; } } fclose(infile); if(p != NULL){ if (strstr(p, "Loongson-3A")){ return CPU_LOONGSON3A; }else if(strstr(p, "Loongson-3B")){ return CPU_LOONGSON3B; } } #endif return CPU_UNKNOWN; } char *get_corename(void){ return cpuname[detect()]; } void get_architecture(void){ printf("MIPS"); } void get_subarchitecture(void){ if(detect()==CPU_P5600){ printf("P5600"); }else{ printf("UNKNOWN"); } } void get_subdirname(void){ printf("mips"); } void get_cpuconfig(void){ if(detect()==CPU_P5600){ printf("#define P5600\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 1048576\n"); printf("#define L2_LINESIZE 32\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 8\n"); }else{ printf("#define UNKNOWN\n"); } } void get_libname(void){ if(detect()==CPU_P5600) { printf("p5600\n"); }else{ printf("mips\n"); } } OpenBLAS-0.2.20/cpuid_mips64.c000066400000000000000000000205471313527062700156260ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define CPU_UNKNOWN 0 #define CPU_SICORTEX 1 #define CPU_LOONGSON3A 2 #define CPU_LOONGSON3B 3 #define CPU_I6400 4 #define CPU_P6600 5 static char *cpuname[] = { "UNKOWN", "SICORTEX", "LOONGSON3A", "LOONGSON3B", "I6400", "P6600" }; int detect(void){ #ifdef linux FILE *infile; char buffer[512], *p; p = (char *)NULL; infile = fopen("/proc/cpuinfo", "r"); while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("cpu", buffer, 3)){ p = strchr(buffer, ':') + 2; #if 0 fprintf(stderr, "%s\n", p); #endif break; } } fclose(infile); if(p != NULL){ if (strstr(p, "Loongson-3A")){ return CPU_LOONGSON3A; }else if(strstr(p, "Loongson-3B")){ return CPU_LOONGSON3B; }else if (strstr(p, "Loongson-3")){ infile = fopen("/proc/cpuinfo", "r"); p = (char *)NULL; while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("system type", buffer, 11)){ p = strchr(buffer, ':') + 2; break; } } fclose(infile); if (strstr(p, "loongson3a")) return CPU_LOONGSON3A; }else{ return CPU_SICORTEX; } } //Check model name for Loongson3 infile = fopen("/proc/cpuinfo", "r"); p = (char *)NULL; while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("model name", buffer, 10)){ p = strchr(buffer, ':') + 2; break; } } fclose(infile); if(p != NULL){ if (strstr(p, "Loongson-3A")){ return CPU_LOONGSON3A; }else if(strstr(p, "Loongson-3B")){ return CPU_LOONGSON3B; } } #endif return CPU_UNKNOWN; } char *get_corename(void){ return cpuname[detect()]; } void get_architecture(void){ printf("MIPS64"); } void get_subarchitecture(void){ if(detect()==CPU_LOONGSON3A) { printf("LOONGSON3A"); }else if(detect()==CPU_LOONGSON3B){ printf("LOONGSON3B"); }else if(detect()==CPU_I6400){ printf("I6400"); }else if(detect()==CPU_P6600){ printf("P6600"); }else{ printf("SICORTEX"); } } void get_subdirname(void){ printf("mips64"); } void get_cpuconfig(void){ if(detect()==CPU_LOONGSON3A) { printf("#define LOONGSON3A\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); printf("#define L2_LINESIZE 32\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); }else if(detect()==CPU_LOONGSON3B){ printf("#define LOONGSON3B\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); printf("#define L2_LINESIZE 32\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); }else if(detect()==CPU_I6400){ printf("#define I6400\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 1048576\n"); printf("#define L2_LINESIZE 32\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 8\n"); }else if(detect()==CPU_P6600){ printf("#define P6600\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 1048576\n"); printf("#define L2_LINESIZE 32\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 8\n"); }else{ printf("#define SICORTEX\n"); printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); printf("#define L2_LINESIZE 32\n"); printf("#define DTB_DEFAULT_ENTRIES 32\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 8\n"); } } void 
get_libname(void){ if(detect()==CPU_LOONGSON3A) { printf("loongson3a\n"); }else if(detect()==CPU_LOONGSON3B) { printf("loongson3b\n"); }else if(detect()==CPU_I6400) { printf("i6400\n"); }else if(detect()==CPU_P6600) { printf("p6600\n"); }else{ printf("mips64\n"); } } OpenBLAS-0.2.20/cpuid_power.c000066400000000000000000000132461313527062700156360ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #ifdef _AIX #include #endif #ifdef __APPLE__ #include #include #include #include #endif #define CPUTYPE_UNKNOWN 0 #define CPUTYPE_POWER3 1 #define CPUTYPE_POWER4 2 #define CPUTYPE_PPC970 3 #define CPUTYPE_POWER5 4 #define CPUTYPE_POWER6 5 #define CPUTYPE_CELL 6 #define CPUTYPE_PPCG4 7 #define CPUTYPE_POWER8 8 char *cpuname[] = { "UNKNOWN", "POWER3", "POWER4", "PPC970", "POWER5", "POWER6", "CELL", "PPCG4", "POWER8" }; char *lowercpuname[] = { "unknown", "power3", "power4", "ppc970", "power5", "power6", "cell", "ppcg4", "power8" }; char *corename[] = { "UNKNOWN", "POWER3", "POWER4", "POWER4", "POWER4", "POWER6", "CELL", "PPCG4", "POWER8" }; int detect(void){ #ifdef linux FILE *infile; char buffer[512], *p; p = (char *)NULL; infile = fopen("/proc/cpuinfo", "r"); while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("cpu", buffer, 3)){ p = strchr(buffer, ':') + 2; #if 0 fprintf(stderr, "%s\n", p); #endif break; } } fclose(infile); if (!strncasecmp(p, "POWER3", 6)) return CPUTYPE_POWER3; if (!strncasecmp(p, "POWER4", 6)) return CPUTYPE_POWER4; if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970; if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; return CPUTYPE_UNKNOWN; #endif #ifdef _AIX return CPUTYPE_POWER5; #endif #ifdef __APPLE__ host_basic_info_data_t hostInfo; mach_msg_type_number_t infoCount; infoCount = HOST_BASIC_INFO_COUNT; host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, &infoCount); if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7450) return CPUTYPE_PPCG4; if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_970) return CPUTYPE_PPC970; return CPUTYPE_PPC970; #endif } void get_architecture(void){ printf("POWER"); } void get_subdirname(void){ printf("power"); } void get_subarchitecture(void){ printf("%s", cpuname[detect()]); } void get_cpuconfig(void){ #if 0 #ifdef _AIX struct vminfo info; #endif #endif printf("#define %s\n", cpuname[detect()]); printf("#define CORE_%s\n", corename[detect()]); printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_LINESIZE 128\n"); printf("#define L2_SIZE 524288\n"); printf("#define L2_LINESIZE 128 \n"); printf("#define DTB_DEFAULT_ENTRIES 128\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 8\n"); #if 0 #ifdef _AIX if (vmgetinfo(&info, VMINFO, 0) == 0) { if ((info.lgpg_size >> 20) >= 1024) { printf("#define ALLOC_HUGETLB\n"); } } #endif #endif } void get_libname(void){ printf("%s", lowercpuname[detect()]); } char *get_corename(void){ return cpuname[detect()]; } OpenBLAS-0.2.20/cpuid_sparc.c000066400000000000000000000056211313527062700156100ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ void get_architecture(void){ printf("SPARC"); } void get_subarchitecture(void){ printf("v9"); } void get_subdirname(void){ printf("sparc"); } void get_cpuconfig(void){ printf("#define V9\n"); printf("#define DTB_DEFAULT_ENTRIES 32\n"); } void get_libname(void){ printf("v9\n"); } OpenBLAS-0.2.20/cpuid_x86.c000066400000000000000000001323521313527062700151270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "cpuid.h" #if defined(_MSC_VER) && !defined(__clang__) #define C_INLINE __inline #else #define C_INLINE inline #endif /* #ifdef NO_AVX #define CPUTYPE_HASWELL CPUTYPE_NEHALEM #define CORE_HASWELL CORE_NEHALEM #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM #define CORE_SANDYBRIDGE CORE_NEHALEM #define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA #define CORE_BULLDOZER CORE_BARCELONA #define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA #define CORE_PILEDRIVER CORE_BARCELONA #endif */ #if defined(_MSC_VER) && !defined(__clang__) void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx) { int cpuInfo[4] = {-1}; __cpuid(cpuInfo, op); *eax = cpuInfo[0]; *ebx = cpuInfo[1]; *ecx = cpuInfo[2]; *edx = cpuInfo[3]; } #else #ifndef CPUIDEMU #if defined(__APPLE__) && defined(__i386__) void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx); #else static C_INLINE void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ #if defined(__i386__) && defined(__PIC__) __asm__ __volatile__ ("mov %%ebx, %%edi;" "cpuid;" "xchgl %%ebx, %%edi;" : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); #else __asm__ __volatile__ ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); #endif } #endif #else typedef struct { unsigned int id, a, b, c, d; } idlist_t; typedef struct { char *vendor; char *name; int start, stop; } vendor_t; extern idlist_t idlist[]; extern vendor_t vendor[]; static int cv = VENDOR; void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx){ static int current = 0; int start = vendor[cv].start; int stop = vendor[cv].stop; int count = stop - start; if ((current < start) || (current > stop)) current = start; while ((count > 0) && (idlist[current].id != op)) { current ++; if (current > stop) current = start; count --; } *eax = idlist[current].a; *ebx = idlist[current].b; *ecx = idlist[current].c; *edx = idlist[current].d; } #endif #endif // _MSC_VER static C_INLINE int have_cpuid(void){ int eax, ebx, ecx, edx; cpuid(0, &eax, &ebx, &ecx, &edx); return eax; } static C_INLINE int have_excpuid(void){ int eax, ebx, ecx, edx; cpuid(0x80000000, &eax, &ebx, &ecx, &edx); return eax & 0xffff; } #ifndef NO_AVX static C_INLINE void xgetbv(int op, int * eax, int * edx){ //Use binary code for xgetbv #if defined(_MSC_VER) && !defined(__clang__) *eax = __xgetbv(op); #else __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); #endif } #endif int support_avx(){ #ifndef NO_AVX int eax, ebx, ecx, edx; int ret=0; cpuid(1, &eax, &ebx, &ecx, &edx); if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){ xgetbv(0, &eax, &edx); if((eax & 6) == 6){ ret=1; //OS support AVX } } return ret; #else return 0; #endif } int get_vendor(void){ int eax, ebx, ecx, edx; char vendor[13]; cpuid(0, &eax, &ebx, &ecx, &edx); *(int *)(&vendor[0]) = ebx; *(int *)(&vendor[4]) = edx; *(int *)(&vendor[8]) = ecx; vendor[12] = (char)0; if (!strcmp(vendor, "GenuineIntel")) return VENDOR_INTEL; if (!strcmp(vendor, " UMC UMC UMC")) return VENDOR_UMC; if (!strcmp(vendor, "AuthenticAMD")) return VENDOR_AMD; if (!strcmp(vendor, "CyrixInstead")) return VENDOR_CYRIX; 
if (!strcmp(vendor, "NexGenDriven")) return VENDOR_NEXGEN; if (!strcmp(vendor, "CentaurHauls")) return VENDOR_CENTAUR; if (!strcmp(vendor, "RiseRiseRise")) return VENDOR_RISE; if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC; if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; return VENDOR_UNKNOWN; } int get_cputype(int gettype){ int eax, ebx, ecx, edx; int extend_family, family; int extend_model, model; int type, stepping; int feature = 0; cpuid(1, &eax, &ebx, &ecx, &edx); switch (gettype) { case GET_EXFAMILY : return BITMASK(eax, 20, 0xff); case GET_EXMODEL : return BITMASK(eax, 16, 0x0f); case GET_TYPE : return BITMASK(eax, 12, 0x03); case GET_FAMILY : return BITMASK(eax, 8, 0x0f); case GET_MODEL : return BITMASK(eax, 4, 0x0f); case GET_APICID : return BITMASK(ebx, 24, 0x0f); case GET_LCOUNT : return BITMASK(ebx, 16, 0x0f); case GET_CHUNKS : return BITMASK(ebx, 8, 0x0f); case GET_STEPPING : return BITMASK(eax, 0, 0x0f); case GET_BLANDID : return BITMASK(ebx, 0, 0xff); case GET_NUMSHARE : if (have_cpuid() < 4) return 0; cpuid(4, &eax, &ebx, &ecx, &edx); return BITMASK(eax, 14, 0xfff); case GET_NUMCORES : if (have_cpuid() < 4) return 0; cpuid(4, &eax, &ebx, &ecx, &edx); return BITMASK(eax, 26, 0x3f); case GET_FEATURE : if ((edx & (1 << 3)) != 0) feature |= HAVE_PSE; if ((edx & (1 << 15)) != 0) feature |= HAVE_CMOV; if ((edx & (1 << 19)) != 0) feature |= HAVE_CFLUSH; if ((edx & (1 << 23)) != 0) feature |= HAVE_MMX; if ((edx & (1 << 25)) != 0) feature |= HAVE_SSE; if ((edx & (1 << 26)) != 0) feature |= HAVE_SSE2; if ((edx & (1 << 27)) != 0) { if (BITMASK(ebx, 16, 0x0f) > 0) feature |= HAVE_HIT; } if ((ecx & (1 << 0)) != 0) feature |= HAVE_SSE3; if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3; if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1; if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; #ifndef NO_AVX if (support_avx()) feature |= HAVE_AVX; if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; #endif if (have_excpuid() >= 0x01) { cpuid(0x80000001, &eax, &ebx, &ecx, &edx); if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A; if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE; #ifndef NO_AVX if ((ecx & (1 << 16)) != 0) feature |= HAVE_FMA4; #endif if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX; if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW; } if (have_excpuid() >= 0x1a) { cpuid(0x8000001a, &eax, &ebx, &ecx, &edx); if ((eax & (1 << 0)) != 0) feature |= HAVE_128BITFPU; if ((eax & (1 << 1)) != 0) feature |= HAVE_FASTMOVU; } } return feature; } int get_cacheinfo(int type, cache_info_t *cacheinfo){ int eax, ebx, ecx, edx, cpuid_level; int info[15]; int i; cache_info_t LC1, LD1, L2, L3, ITB, DTB, LITB, LDTB, L2ITB, L2DTB, L2LITB, L2LDTB; LC1.size = 0; LC1.associative = 0; LC1.linesize = 0; LC1.shared = 0; LD1.size = 0; LD1.associative = 0; LD1.linesize = 0; LD1.shared = 0; L2.size = 0; L2.associative = 0; L2.linesize = 0; L2.shared = 0; L3.size = 0; L3.associative = 0; L3.linesize = 0; L3.shared = 0; ITB.size = 0; ITB.associative = 0; ITB.linesize = 0; ITB.shared = 0; DTB.size = 0; DTB.associative = 0; DTB.linesize = 0; DTB.shared = 0; LITB.size = 0; LITB.associative = 0; LITB.linesize = 0; LITB.shared = 0; LDTB.size = 0; LDTB.associative = 0; LDTB.linesize = 0; LDTB.shared = 0; L2ITB.size = 0; L2ITB.associative = 0; L2ITB.linesize = 0; L2ITB.shared = 0; L2DTB.size = 0; L2DTB.associative = 0; L2DTB.linesize = 0; L2DTB.shared = 0; L2LITB.size = 0; 
L2LITB.associative = 0; L2LITB.linesize = 0; L2LITB.shared = 0; L2LDTB.size = 0; L2LDTB.associative = 0; L2LDTB.linesize = 0; L2LDTB.shared = 0; cpuid(0, &cpuid_level, &ebx, &ecx, &edx); if (cpuid_level > 1) { cpuid(2, &eax, &ebx, &ecx, &edx); info[ 0] = BITMASK(eax, 8, 0xff); info[ 1] = BITMASK(eax, 16, 0xff); info[ 2] = BITMASK(eax, 24, 0xff); info[ 3] = BITMASK(ebx, 0, 0xff); info[ 4] = BITMASK(ebx, 8, 0xff); info[ 5] = BITMASK(ebx, 16, 0xff); info[ 6] = BITMASK(ebx, 24, 0xff); info[ 7] = BITMASK(ecx, 0, 0xff); info[ 8] = BITMASK(ecx, 8, 0xff); info[ 9] = BITMASK(ecx, 16, 0xff); info[10] = BITMASK(ecx, 24, 0xff); info[11] = BITMASK(edx, 0, 0xff); info[12] = BITMASK(edx, 8, 0xff); info[13] = BITMASK(edx, 16, 0xff); info[14] = BITMASK(edx, 24, 0xff); for (i = 0; i < 15; i++){ switch (info[i]){ /* This table is from http://www.sandpile.org/ia32/cpuid.htm */ case 0x01 : ITB.size = 4; ITB.associative = 4; ITB.linesize = 32; break; case 0x02 : LITB.size = 4096; LITB.associative = 0; LITB.linesize = 2; break; case 0x03 : DTB.size = 4; DTB.associative = 4; DTB.linesize = 64; break; case 0x04 : LDTB.size = 4096; LDTB.associative = 4; LDTB.linesize = 8; break; case 0x05 : LDTB.size = 4096; LDTB.associative = 4; LDTB.linesize = 32; break; case 0x06 : LC1.size = 8; LC1.associative = 4; LC1.linesize = 32; break; case 0x08 : LC1.size = 16; LC1.associative = 4; LC1.linesize = 32; break; case 0x09 : LC1.size = 32; LC1.associative = 4; LC1.linesize = 64; break; case 0x0a : LD1.size = 8; LD1.associative = 2; LD1.linesize = 32; break; case 0x0c : LD1.size = 16; LD1.associative = 4; LD1.linesize = 32; break; case 0x0d : LD1.size = 16; LD1.associative = 4; LD1.linesize = 64; break; case 0x0e : LD1.size = 24; LD1.associative = 6; LD1.linesize = 64; break; case 0x10 : LD1.size = 16; LD1.associative = 4; LD1.linesize = 32; break; case 0x15 : LC1.size = 16; LC1.associative = 4; LC1.linesize = 32; break; case 0x1a : L2.size = 96; L2.associative = 6; L2.linesize = 64; break; case 0x21 : L2.size = 256; L2.associative = 8; L2.linesize = 64; break; case 0x22 : L3.size = 512; L3.associative = 4; L3.linesize = 64; break; case 0x23 : L3.size = 1024; L3.associative = 8; L3.linesize = 64; break; case 0x25 : L3.size = 2048; L3.associative = 8; L3.linesize = 64; break; case 0x29 : L3.size = 4096; L3.associative = 8; L3.linesize = 64; break; case 0x2c : LD1.size = 32; LD1.associative = 8; LD1.linesize = 64; break; case 0x30 : LC1.size = 32; LC1.associative = 8; LC1.linesize = 64; break; case 0x39 : L2.size = 128; L2.associative = 4; L2.linesize = 64; break; case 0x3a : L2.size = 192; L2.associative = 6; L2.linesize = 64; break; case 0x3b : L2.size = 128; L2.associative = 2; L2.linesize = 64; break; case 0x3c : L2.size = 256; L2.associative = 4; L2.linesize = 64; break; case 0x3d : L2.size = 384; L2.associative = 6; L2.linesize = 64; break; case 0x3e : L2.size = 512; L2.associative = 4; L2.linesize = 64; break; case 0x41 : L2.size = 128; L2.associative = 4; L2.linesize = 32; break; case 0x42 : L2.size = 256; L2.associative = 4; L2.linesize = 32; break; case 0x43 : L2.size = 512; L2.associative = 4; L2.linesize = 32; break; case 0x44 : L2.size = 1024; L2.associative = 4; L2.linesize = 32; break; case 0x45 : L2.size = 2048; L2.associative = 4; L2.linesize = 32; break; case 0x46 : L3.size = 4096; L3.associative = 4; L3.linesize = 64; break; case 0x47 : L3.size = 8192; L3.associative = 8; L3.linesize = 64; break; case 0x48 : L2.size = 3184; L2.associative = 12; L2.linesize = 64; break; case 0x49 : if ((get_cputype(GET_FAMILY) == 
0x0f) && (get_cputype(GET_MODEL) == 0x06)) { L3.size = 4096; L3.associative = 16; L3.linesize = 64; } else { L2.size = 4096; L2.associative = 16; L2.linesize = 64; } break; case 0x4a : L3.size = 6144; L3.associative = 12; L3.linesize = 64; break; case 0x4b : L3.size = 8192; L3.associative = 16; L3.linesize = 64; break; case 0x4c : L3.size = 12280; L3.associative = 12; L3.linesize = 64; break; case 0x4d : L3.size = 16384; L3.associative = 16; L3.linesize = 64; break; case 0x4e : L2.size = 6144; L2.associative = 24; L2.linesize = 64; break; case 0x4f : ITB.size = 4; ITB.associative = 0; ITB.linesize = 32; break; case 0x50 : ITB.size = 4; ITB.associative = 0; ITB.linesize = 64; LITB.size = 4096; LITB.associative = 0; LITB.linesize = 64; LITB.shared = 1; break; case 0x51 : ITB.size = 4; ITB.associative = 0; ITB.linesize = 128; LITB.size = 4096; LITB.associative = 0; LITB.linesize = 128; LITB.shared = 1; break; case 0x52 : ITB.size = 4; ITB.associative = 0; ITB.linesize = 256; LITB.size = 4096; LITB.associative = 0; LITB.linesize = 256; LITB.shared = 1; break; case 0x55 : LITB.size = 4096; LITB.associative = 0; LITB.linesize = 7; LITB.shared = 1; break; case 0x56 : LDTB.size = 4096; LDTB.associative = 4; LDTB.linesize = 16; break; case 0x57 : LDTB.size = 4096; LDTB.associative = 4; LDTB.linesize = 16; break; case 0x5b : DTB.size = 4; DTB.associative = 0; DTB.linesize = 64; LDTB.size = 4096; LDTB.associative = 0; LDTB.linesize = 64; LDTB.shared = 1; break; case 0x5c : DTB.size = 4; DTB.associative = 0; DTB.linesize = 128; LDTB.size = 4096; LDTB.associative = 0; LDTB.linesize = 128; LDTB.shared = 1; break; case 0x5d : DTB.size = 4; DTB.associative = 0; DTB.linesize = 256; LDTB.size = 4096; LDTB.associative = 0; LDTB.linesize = 256; LDTB.shared = 1; break; case 0x60 : LD1.size = 16; LD1.associative = 8; LD1.linesize = 64; break; case 0x63 : DTB.size = 2048; DTB.associative = 4; DTB.linesize = 32; LDTB.size = 4096; LDTB.associative= 4; LDTB.linesize = 32; case 0x66 : LD1.size = 8; LD1.associative = 4; LD1.linesize = 64; break; case 0x67 : LD1.size = 16; LD1.associative = 4; LD1.linesize = 64; break; case 0x68 : LD1.size = 32; LD1.associative = 4; LD1.linesize = 64; break; case 0x70 : LC1.size = 12; LC1.associative = 8; break; case 0x71 : LC1.size = 16; LC1.associative = 8; break; case 0x72 : LC1.size = 32; LC1.associative = 8; break; case 0x73 : LC1.size = 64; LC1.associative = 8; break; case 0x76 : ITB.size = 2048; ITB.associative = 0; ITB.linesize = 8; LITB.size = 4096; LITB.associative= 0; LITB.linesize = 8; case 0x77 : LC1.size = 16; LC1.associative = 4; LC1.linesize = 64; break; case 0x78 : L2.size = 1024; L2.associative = 4; L2.linesize = 64; break; case 0x79 : L2.size = 128; L2.associative = 8; L2.linesize = 64; break; case 0x7a : L2.size = 256; L2.associative = 8; L2.linesize = 64; break; case 0x7b : L2.size = 512; L2.associative = 8; L2.linesize = 64; break; case 0x7c : L2.size = 1024; L2.associative = 8; L2.linesize = 64; break; case 0x7d : L2.size = 2048; L2.associative = 8; L2.linesize = 64; break; case 0x7e : L2.size = 256; L2.associative = 8; L2.linesize = 128; break; case 0x7f : L2.size = 512; L2.associative = 2; L2.linesize = 64; break; case 0x81 : L2.size = 128; L2.associative = 8; L2.linesize = 32; break; case 0x82 : L2.size = 256; L2.associative = 8; L2.linesize = 32; break; case 0x83 : L2.size = 512; L2.associative = 8; L2.linesize = 32; break; case 0x84 : L2.size = 1024; L2.associative = 8; L2.linesize = 32; break; case 0x85 : L2.size = 2048; L2.associative = 8; L2.linesize = 
32; break; case 0x86 : L2.size = 512; L2.associative = 4; L2.linesize = 64; break; case 0x87 : L2.size = 1024; L2.associative = 8; L2.linesize = 64; break; case 0x88 : L3.size = 2048; L3.associative = 4; L3.linesize = 64; break; case 0x89 : L3.size = 4096; L3.associative = 4; L3.linesize = 64; break; case 0x8a : L3.size = 8192; L3.associative = 4; L3.linesize = 64; break; case 0x8d : L3.size = 3096; L3.associative = 12; L3.linesize = 128; break; case 0x90 : ITB.size = 4; ITB.associative = 0; ITB.linesize = 64; break; case 0x96 : DTB.size = 4; DTB.associative = 0; DTB.linesize = 32; break; case 0x9b : L2DTB.size = 4; L2DTB.associative = 0; L2DTB.linesize = 96; break; case 0xb0 : ITB.size = 4; ITB.associative = 4; ITB.linesize = 128; break; case 0xb1 : LITB.size = 4096; LITB.associative = 4; LITB.linesize = 4; break; case 0xb2 : ITB.size = 4; ITB.associative = 4; ITB.linesize = 64; break; case 0xb3 : DTB.size = 4; DTB.associative = 4; DTB.linesize = 128; break; case 0xb4 : DTB.size = 4; DTB.associative = 4; DTB.linesize = 256; break; case 0xba : DTB.size = 4; DTB.associative = 4; DTB.linesize = 64; break; case 0xd0 : L3.size = 512; L3.associative = 4; L3.linesize = 64; break; case 0xd1 : L3.size = 1024; L3.associative = 4; L3.linesize = 64; break; case 0xd2 : L3.size = 2048; L3.associative = 4; L3.linesize = 64; break; case 0xd6 : L3.size = 1024; L3.associative = 8; L3.linesize = 64; break; case 0xd7 : L3.size = 2048; L3.associative = 8; L3.linesize = 64; break; case 0xd8 : L3.size = 4096; L3.associative = 8; L3.linesize = 64; break; case 0xdc : L3.size = 2048; L3.associative = 12; L3.linesize = 64; break; case 0xdd : L3.size = 4096; L3.associative = 12; L3.linesize = 64; break; case 0xde : L3.size = 8192; L3.associative = 12; L3.linesize = 64; break; case 0xe2 : L3.size = 2048; L3.associative = 16; L3.linesize = 64; break; case 0xe3 : L3.size = 4096; L3.associative = 16; L3.linesize = 64; break; case 0xe4 : L3.size = 8192; L3.associative = 16; L3.linesize = 64; break; } } } if (get_vendor() == VENDOR_INTEL) { cpuid(0x80000000, &cpuid_level, &ebx, &ecx, &edx); if (cpuid_level >= 0x80000006) { if(L2.size<=0){ //If we didn't detect L2 correctly before, cpuid(0x80000006, &eax, &ebx, &ecx, &edx); L2.size = BITMASK(ecx, 16, 0xffff); L2.associative = BITMASK(ecx, 12, 0x0f); switch (L2.associative){ case 0x06: L2.associative = 8; break; case 0x08: L2.associative = 16; break; } L2.linesize = BITMASK(ecx, 0, 0xff); } } } if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_CENTAUR)) { cpuid(0x80000005, &eax, &ebx, &ecx, &edx); LDTB.size = 4096; LDTB.associative = BITMASK(eax, 24, 0xff); if (LDTB.associative == 0xff) LDTB.associative = 0; LDTB.linesize = BITMASK(eax, 16, 0xff); LITB.size = 4096; LITB.associative = BITMASK(eax, 8, 0xff); if (LITB.associative == 0xff) LITB.associative = 0; LITB.linesize = BITMASK(eax, 0, 0xff); DTB.size = 4; DTB.associative = BITMASK(ebx, 24, 0xff); if (DTB.associative == 0xff) DTB.associative = 0; DTB.linesize = BITMASK(ebx, 16, 0xff); ITB.size = 4; ITB.associative = BITMASK(ebx, 8, 0xff); if (ITB.associative == 0xff) ITB.associative = 0; ITB.linesize = BITMASK(ebx, 0, 0xff); LD1.size = BITMASK(ecx, 24, 0xff); LD1.associative = BITMASK(ecx, 16, 0xff); if (LD1.associative == 0xff) LD1.associative = 0; LD1.linesize = BITMASK(ecx, 0, 0xff); LC1.size = BITMASK(ecx, 24, 0xff); LC1.associative = BITMASK(ecx, 16, 0xff); if (LC1.associative == 0xff) LC1.associative = 0; LC1.linesize = BITMASK(ecx, 0, 0xff); cpuid(0x80000006, &eax, &ebx, &ecx, &edx); L2LDTB.size = 4096; 
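  /* Per AMD's CPUID convention, the extended leaves encode cache geometry
     directly rather than through descriptor bytes: leaf 0x80000005 reports
     the L1 caches (bits 31:24 size in KB, 23:16 associativity, 7:0 line size
     in each register), and leaf 0x80000006, decoded below, reports L2 in ECX
     (bits 31:16 size in KB, 15:12 an associativity code, 7:0 line size) and
     L3 in EDX (bits 31:18 size in 512 KB units).  An associativity byte of
     0xff means fully associative (stored here as 0), and the codes 0x06 and
     0x08 stand for 8-way and 16-way, which is what the switch statements on
     the associativity field translate. */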
L2LDTB.associative = BITMASK(eax, 24, 0xff); if (L2LDTB.associative == 0xff) L2LDTB.associative = 0; L2LDTB.linesize = BITMASK(eax, 16, 0xff); L2LITB.size = 4096; L2LITB.associative = BITMASK(eax, 8, 0xff); if (L2LITB.associative == 0xff) L2LITB.associative = 0; L2LITB.linesize = BITMASK(eax, 0, 0xff); L2DTB.size = 4; L2DTB.associative = BITMASK(ebx, 24, 0xff); if (L2DTB.associative == 0xff) L2DTB.associative = 0; L2DTB.linesize = BITMASK(ebx, 16, 0xff); L2ITB.size = 4; L2ITB.associative = BITMASK(ebx, 8, 0xff); if (L2ITB.associative == 0xff) L2ITB.associative = 0; L2ITB.linesize = BITMASK(ebx, 0, 0xff); if(L2.size <= 0){ //If we didn't detect L2 correctly before, L2.size = BITMASK(ecx, 16, 0xffff); L2.associative = BITMASK(ecx, 12, 0xf); switch (L2.associative){ case 0x06: L2.associative = 8; break; case 0x08: L2.associative = 16; break; } if (L2.associative == 0xff) L2.associative = 0; L2.linesize = BITMASK(ecx, 0, 0xff); } L3.size = BITMASK(edx, 18, 0x3fff) * 512; L3.associative = BITMASK(edx, 12, 0xf); if (L3.associative == 0xff) L2.associative = 0; L3.linesize = BITMASK(edx, 0, 0xff); } switch (type) { case CACHE_INFO_L1_I : *cacheinfo = LC1; break; case CACHE_INFO_L1_D : *cacheinfo = LD1; break; case CACHE_INFO_L2 : *cacheinfo = L2; break; case CACHE_INFO_L3 : *cacheinfo = L3; break; case CACHE_INFO_L1_DTB : *cacheinfo = DTB; break; case CACHE_INFO_L1_ITB : *cacheinfo = ITB; break; case CACHE_INFO_L1_LDTB : *cacheinfo = LDTB; break; case CACHE_INFO_L1_LITB : *cacheinfo = LITB; break; case CACHE_INFO_L2_DTB : *cacheinfo = L2DTB; break; case CACHE_INFO_L2_ITB : *cacheinfo = L2ITB; break; case CACHE_INFO_L2_LDTB : *cacheinfo = L2LDTB; break; case CACHE_INFO_L2_LITB : *cacheinfo = L2LITB; break; } return 0; } int get_cpuname(void){ int family, exfamily, model, vendor, exmodel; if (!have_cpuid()) return CPUTYPE_80386; family = get_cputype(GET_FAMILY); exfamily = get_cputype(GET_EXFAMILY); model = get_cputype(GET_MODEL); exmodel = get_cputype(GET_EXMODEL); vendor = get_vendor(); if (vendor == VENDOR_INTEL){ switch (family) { case 0x4: return CPUTYPE_80486; case 0x5: return CPUTYPE_PENTIUM; case 0x6: switch (exmodel) { case 0: switch (model) { case 1: case 3: case 5: case 6: return CPUTYPE_PENTIUM2; case 7: case 8: case 10: case 11: return CPUTYPE_PENTIUM3; case 9: case 13: case 14: return CPUTYPE_PENTIUMM; case 15: return CPUTYPE_CORE2; } break; case 1: switch (model) { case 6: return CPUTYPE_CORE2; case 7: return CPUTYPE_PENRYN; case 10: case 11: case 14: case 15: return CPUTYPE_NEHALEM; case 12: return CPUTYPE_ATOM; case 13: return CPUTYPE_DUNNINGTON; } break; case 2: switch (model) { case 5: //Intel Core (Clarkdale) / Core (Arrandale) // Pentium (Clarkdale) / Pentium Mobile (Arrandale) // Xeon (Clarkdale), 32nm return CPUTYPE_NEHALEM; case 10: //Intel Core i5-2000 /i7-2000 (Sandy Bridge) if(support_avx()) return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; //OS doesn't support AVX case 12: //Xeon Processor 5600 (Westmere-EP) return CPUTYPE_NEHALEM; case 13: //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) if(support_avx()) return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; case 14: // Xeon E7540 case 15: //Xeon Processor E7 (Westmere-EX) return CPUTYPE_NEHALEM; } break; case 3: switch (model) { case 7: // Bay Trail return CPUTYPE_ATOM; case 10: case 14: // Ivy Bridge if(support_avx()) return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; case 12: case 15: if(support_avx()) #ifndef NO_AVX2 return CPUTYPE_HASWELL; #else return CPUTYPE_SANDYBRIDGE; #endif else return 
CPUTYPE_NEHALEM; case 13: //Broadwell if(support_avx()) #ifndef NO_AVX2 return CPUTYPE_HASWELL; #else return CPUTYPE_SANDYBRIDGE; #endif else return CPUTYPE_NEHALEM; } break; case 4: switch (model) { case 5: case 6: if(support_avx()) #ifndef NO_AVX2 return CPUTYPE_HASWELL; #else return CPUTYPE_SANDYBRIDGE; #endif else return CPUTYPE_NEHALEM; case 7: case 15: //Broadwell if(support_avx()) #ifndef NO_AVX2 return CPUTYPE_HASWELL; #else return CPUTYPE_SANDYBRIDGE; #endif else return CPUTYPE_NEHALEM; case 14: //Skylake if(support_avx()) #ifndef NO_AVX2 return CPUTYPE_HASWELL; #else return CPUTYPE_SANDYBRIDGE; #endif else return CPUTYPE_NEHALEM; case 12: // Braswell case 13: // Avoton return CPUTYPE_NEHALEM; } break; case 5: switch (model) { case 6: //Broadwell if(support_avx()) #ifndef NO_AVX2 return CPUTYPE_HASWELL; #else return CPUTYPE_SANDYBRIDGE; #endif else return CPUTYPE_NEHALEM; case 5: case 14: // Skylake if(support_avx()) #ifndef NO_AVX2 return CPUTYPE_HASWELL; #else return CPUTYPE_SANDYBRIDGE; #endif else return CPUTYPE_NEHALEM; case 7: // Xeon Phi Knights Landing if(support_avx()) #ifndef NO_AVX2 return CPUTYPE_HASWELL; #else return CPUTYPE_SANDYBRIDGE; #endif else return CPUTYPE_NEHALEM; case 12: // Apollo Lake return CPUTYPE_NEHALEM; } break; case 9: case 8: switch (model) { case 14: // Kaby Lake if(support_avx()) #ifndef NO_AVX2 return CPUTYPE_HASWELL; #else return CPUTYPE_SANDYBRIDGE; #endif else return CPUTYPE_NEHALEM; } break; } break; case 0x7: return CPUTYPE_ITANIUM; case 0xf: switch (exfamily) { case 0 : return CPUTYPE_PENTIUM4; case 1 : return CPUTYPE_ITANIUM; } break; } return CPUTYPE_INTEL_UNKNOWN; } if (vendor == VENDOR_AMD){ switch (family) { case 0x4: return CPUTYPE_AMD5X86; case 0x5: return CPUTYPE_AMDK6; case 0x6: return CPUTYPE_ATHLON; case 0xf: switch (exfamily) { case 0: case 2: return CPUTYPE_OPTERON; case 1: case 3: case 7: case 10: return CPUTYPE_BARCELONA; case 5: return CPUTYPE_BOBCAT; case 6: switch (model) { case 1: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series if(support_avx()) return CPUTYPE_BULLDOZER; else return CPUTYPE_BARCELONA; //OS don't support AVX. case 2: //AMD Piledriver case 3: //AMD Richland if(support_avx()) return CPUTYPE_PILEDRIVER; else return CPUTYPE_BARCELONA; //OS don't support AVX. case 5: // New EXCAVATOR CPUS if(support_avx()) return CPUTYPE_EXCAVATOR; else return CPUTYPE_BARCELONA; //OS don't support AVX. case 0: case 8: switch(exmodel){ case 1: //AMD Trinity if(support_avx()) return CPUTYPE_PILEDRIVER; else return CPUTYPE_BARCELONA; //OS don't support AVX. case 3: if(support_avx()) return CPUTYPE_STEAMROLLER; else return CPUTYPE_BARCELONA; //OS don't support AVX. case 6: if(support_avx()) return CPUTYPE_EXCAVATOR; else return CPUTYPE_BARCELONA; //OS don't support AVX. 
} break; } break; case 8: switch (model) { case 1: // AMD Ryzen if(support_avx()) #ifndef NO_AVX2 return CPUTYPE_ZEN; #else return CPUTYPE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator #endif else return CPUTYPE_BARCELONA; } } break; } return CPUTYPE_AMD_UNKNOWN; } if (vendor == VENDOR_CYRIX){ switch (family) { case 0x4: return CPUTYPE_CYRIX5X86; case 0x5: return CPUTYPE_CYRIXM1; case 0x6: return CPUTYPE_CYRIXM2; } return CPUTYPE_CYRIX_UNKNOWN; } if (vendor == VENDOR_NEXGEN){ switch (family) { case 0x5: return CPUTYPE_NEXGENNX586; } return CPUTYPE_NEXGEN_UNKNOWN; } if (vendor == VENDOR_CENTAUR){ switch (family) { case 0x5: return CPUTYPE_CENTAURC6; break; case 0x6: return CPUTYPE_NANO; break; } return CPUTYPE_VIAC3; } if (vendor == VENDOR_RISE){ switch (family) { case 0x5: return CPUTYPE_RISEMP6; } return CPUTYPE_RISE_UNKNOWN; } if (vendor == VENDOR_SIS){ switch (family) { case 0x5: return CPUTYPE_SYS55X; } return CPUTYPE_SIS_UNKNOWN; } if (vendor == VENDOR_TRANSMETA){ switch (family) { case 0x5: return CPUTYPE_CRUSOETM3X; } return CPUTYPE_TRANSMETA_UNKNOWN; } if (vendor == VENDOR_NSC){ switch (family) { case 0x5: return CPUTYPE_NSGEODE; } return CPUTYPE_NSC_UNKNOWN; } return CPUTYPE_UNKNOWN; } static char *cpuname[] = { "UNKNOWN", "INTEL_UNKNOWN", "UMC_UNKNOWN", "AMD_UNKNOWN", "CYRIX_UNKNOWN", "NEXGEN_UNKNOWN", "CENTAUR_UNKNOWN", "RISE_UNKNOWN", "SIS_UNKNOWN", "TRANSMETA_UNKNOWN", "NSC_UNKNOWN", "80386", "80486", "PENTIUM", "PENTIUM2", "PENTIUM3", "PENTIUMM", "PENTIUM4", "CORE2", "PENRYN", "DUNNINGTON", "NEHALEM", "ATOM", "ITANIUM", "ITANIUM2", "5X86", "K6", "ATHLON", "DURON", "OPTERON", "BARCELONA", "SHANGHAI", "ISTANBUL", "CYRIX5X86", "CYRIXM1", "CYRIXM2", "NEXGENNX586", "CENTAURC6", "RISEMP6", "SYS55X", "TM3X00", "NSGEODE", "VIAC3", "NANO", "SANDYBRIDGE", "BOBCAT", "BULLDOZER", "PILEDRIVER", "HASWELL", "STEAMROLLER", "EXCAVATOR", "ZEN", }; static char *lowercpuname[] = { "unknown", "intel_unknown", "umc_unknown", "amd_unknown", "cyrix_unknown", "nexgen_unknown", "centaur_unknown", "rise_unknown", "sis_unknown", "transmeta_unknown", "nsc_unknown", "80386", "80486", "pentium", "pentium2", "pentium3", "pentiumm", "pentium4", "core2", "penryn", "dunnington", "nehalem", "atom", "itanium", "itanium2", "5x86", "k6", "athlon", "duron", "opteron", "barcelona", "shanghai", "istanbul", "cyrix5x86", "cyrixm1", "cyrixm2", "nexgennx586", "centaurc6", "risemp6", "sys55x", "tms3x00", "nsgeode", "nano", "sandybridge", "bobcat", "bulldozer", "piledriver", "haswell", "steamroller", "excavator", "zen", }; static char *corename[] = { "UNKOWN", "80486", "P5", "P6", "KATMAI", "COPPERMINE", "NORTHWOOD", "PRESCOTT", "BANIAS", "ATHLON", "OPTERON", "BARCELONA", "VIAC3", "YONAH", "CORE2", "PENRYN", "DUNNINGTON", "NEHALEM", "ATOM", "NANO", "SANDYBRIDGE", "BOBCAT", "BULLDOZER", "PILEDRIVER", "HASWELL", "STEAMROLLER", "EXCAVATOR", "ZEN", }; static char *corename_lower[] = { "unknown", "80486", "p5", "p6", "katmai", "coppermine", "northwood", "prescott", "banias", "athlon", "opteron", "barcelona", "viac3", "yonah", "core2", "penryn", "dunnington", "nehalem", "atom", "nano", "sandybridge", "bobcat", "bulldozer", "piledriver", "haswell", "steamroller", "excavator", "zen", }; char *get_cpunamechar(void){ return cpuname[get_cpuname()]; } char *get_lower_cpunamechar(void){ return lowercpuname[get_cpuname()]; } int get_coretype(void){ int family, exfamily, model, exmodel, vendor; if (!have_cpuid()) return CORE_80486; family = get_cputype(GET_FAMILY); exfamily = get_cputype(GET_EXFAMILY); 
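  /* The family/model split mirrors get_cputype(): CPUID leaf 1 packs the
     signature into EAX as stepping[3:0], model[7:4], family[11:8],
     extended model[19:16] and extended family[27:20].  For example, a part
     reporting EAX = 0x000306C3 decodes to family 6, extended model 3,
     model 12 (0x3C), stepping 3, which the switches below resolve to
     CORE_HASWELL (or CORE_SANDYBRIDGE when built with NO_AVX2, or
     CORE_NEHALEM when the OS does not enable AVX state). */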
model = get_cputype(GET_MODEL); exmodel = get_cputype(GET_EXMODEL); vendor = get_vendor(); if (vendor == VENDOR_INTEL){ switch (family) { case 4: return CORE_80486; case 5: return CORE_P5; case 6: switch (exmodel) { case 0: switch (model) { case 0: case 1: case 2: case 3: case 4: case 5: case 6: return CORE_P6; case 7: return CORE_KATMAI; case 8: case 10: case 11: return CORE_COPPERMINE; case 9: case 13: case 14: return CORE_BANIAS; case 15: return CORE_CORE2; } break; case 1: switch (model) { case 6: return CORE_CORE2; case 7: return CORE_PENRYN; case 10: case 11: case 14: case 15: return CORE_NEHALEM; case 12: return CORE_ATOM; case 13: return CORE_DUNNINGTON; } break; case 2: switch (model) { case 5: //Intel Core (Clarkdale) / Core (Arrandale) // Pentium (Clarkdale) / Pentium Mobile (Arrandale) // Xeon (Clarkdale), 32nm return CORE_NEHALEM; case 10: //Intel Core i5-2000 /i7-2000 (Sandy Bridge) if(support_avx()) return CORE_SANDYBRIDGE; else return CORE_NEHALEM; //OS doesn't support AVX case 12: //Xeon Processor 5600 (Westmere-EP) return CORE_NEHALEM; case 13: //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) if(support_avx()) return CORE_SANDYBRIDGE; else return CORE_NEHALEM; //OS doesn't support AVX case 14: //Xeon E7540 case 15: //Xeon Processor E7 (Westmere-EX) return CORE_NEHALEM; } break; case 3: switch (model) { case 10: case 14: if(support_avx()) return CORE_SANDYBRIDGE; else return CORE_NEHALEM; //OS doesn't support AVX case 12: case 15: if(support_avx()) #ifndef NO_AVX2 return CORE_HASWELL; #else return CORE_SANDYBRIDGE; #endif else return CORE_NEHALEM; case 13: //broadwell if(support_avx()) #ifndef NO_AVX2 return CORE_HASWELL; #else return CORE_SANDYBRIDGE; #endif else return CORE_NEHALEM; } break; case 4: switch (model) { case 5: case 6: if(support_avx()) #ifndef NO_AVX2 return CORE_HASWELL; #else return CORE_SANDYBRIDGE; #endif else return CORE_NEHALEM; case 7: case 15: //broadwell if(support_avx()) #ifndef NO_AVX2 return CORE_HASWELL; #else return CORE_SANDYBRIDGE; #endif else return CORE_NEHALEM; case 14: //Skylake if(support_avx()) #ifndef NO_AVX2 return CORE_HASWELL; #else return CORE_SANDYBRIDGE; #endif else return CORE_NEHALEM; case 12: // Braswell case 13: // Avoton return CORE_NEHALEM; } break; case 5: switch (model) { case 6: //broadwell if(support_avx()) #ifndef NO_AVX2 return CORE_HASWELL; #else return CORE_SANDYBRIDGE; #endif else return CORE_NEHALEM; case 5: case 14: // Skylake if(support_avx()) #ifndef NO_AVX2 return CORE_HASWELL; #else return CORE_SANDYBRIDGE; #endif else return CORE_NEHALEM; case 7: // Phi Knights Landing if(support_avx()) #ifndef NO_AVX2 return CORE_HASWELL; #else return CORE_SANDYBRIDGE; #endif else return CORE_NEHALEM; case 12: // Apollo Lake return CORE_NEHALEM; } break; case 9: case 8: if (model == 14) { // Kaby Lake if(support_avx()) #ifndef NO_AVX2 return CORE_HASWELL; #else return CORE_SANDYBRIDGE; #endif else return CORE_NEHALEM; } } break; case 15: if (model <= 0x2) return CORE_NORTHWOOD; else return CORE_PRESCOTT; } } if (vendor == VENDOR_AMD){ if (family <= 0x5) return CORE_80486; if (family <= 0xe) return CORE_ATHLON; if (family == 0xf){ if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else if (exfamily == 5) return CORE_BOBCAT; else if (exfamily == 6) { switch (model) { case 1: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series if(support_avx()) return CORE_BULLDOZER; else return CORE_BARCELONA; //OS don't support AVX. 
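  /* Family 0xf combined with extended family 6 covers the Bulldozer-derived
     15h line; the model/exmodel cases in this switch pick the matching kernel
     set (Bulldozer, Piledriver, Steamroller, Excavator) and drop back to the
     Barcelona kernels whenever the OS does not enable AVX state. */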
case 2: //AMD Piledriver case 3: //AMD Richland if(support_avx()) return CORE_PILEDRIVER; else return CORE_BARCELONA; //OS don't support AVX. case 5: // New EXCAVATOR if(support_avx()) return CORE_EXCAVATOR; else return CORE_BARCELONA; //OS don't support AVX. case 0: case 8: switch(exmodel){ case 1: //AMD Trinity if(support_avx()) return CORE_PILEDRIVER; else return CORE_BARCELONA; //OS don't support AVX. case 3: if(support_avx()) return CORE_STEAMROLLER; else return CORE_BARCELONA; //OS don't support AVX. case 6: if(support_avx()) return CORE_EXCAVATOR; else return CORE_BARCELONA; //OS don't support AVX. } break; } } else if (exfamily == 8) { switch (model) { case 1: // AMD Ryzen if(support_avx()) #ifndef NO_AVX2 return CORE_ZEN; #else return CORE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator #endif else return CORE_BARCELONA; } } else { return CORE_BARCELONA; } } } if (vendor == VENDOR_CENTAUR) { switch (family) { case 0x6: return CORE_NANO; break; } return CORE_VIAC3; } return CORE_UNKNOWN; } void get_cpuconfig(void){ cache_info_t info; int features; printf("#define %s\n", cpuname[get_cpuname()]); if (get_coretype() != CORE_P5) { get_cacheinfo(CACHE_INFO_L1_I, &info); if (info.size > 0) { printf("#define L1_CODE_SIZE %d\n", info.size * 1024); printf("#define L1_CODE_ASSOCIATIVE %d\n", info.associative); printf("#define L1_CODE_LINESIZE %d\n", info.linesize); } get_cacheinfo(CACHE_INFO_L1_D, &info); if (info.size > 0) { printf("#define L1_DATA_SIZE %d\n", info.size * 1024); printf("#define L1_DATA_ASSOCIATIVE %d\n", info.associative); printf("#define L1_DATA_LINESIZE %d\n", info.linesize); } get_cacheinfo(CACHE_INFO_L2, &info); if (info.size > 0) { printf("#define L2_SIZE %d\n", info.size * 1024); printf("#define L2_ASSOCIATIVE %d\n", info.associative); printf("#define L2_LINESIZE %d\n", info.linesize); } else { //fall back for some virtual machines. printf("#define L2_SIZE 1048576\n"); printf("#define L2_ASSOCIATIVE 6\n"); printf("#define L2_LINESIZE 64\n"); } get_cacheinfo(CACHE_INFO_L3, &info); if (info.size > 0) { printf("#define L3_SIZE %d\n", info.size * 1024); printf("#define L3_ASSOCIATIVE %d\n", info.associative); printf("#define L3_LINESIZE %d\n", info.linesize); } get_cacheinfo(CACHE_INFO_L1_ITB, &info); if (info.size > 0) { printf("#define ITB_SIZE %d\n", info.size * 1024); printf("#define ITB_ASSOCIATIVE %d\n", info.associative); printf("#define ITB_ENTRIES %d\n", info.linesize); } get_cacheinfo(CACHE_INFO_L1_DTB, &info); if (info.size > 0) { printf("#define DTB_SIZE %d\n", info.size * 1024); printf("#define DTB_ASSOCIATIVE %d\n", info.associative); printf("#define DTB_DEFAULT_ENTRIES %d\n", info.linesize); } else { //fall back for some virtual machines. 
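  /* Guests under virtualization often report empty TLB descriptors, so a
     conservative 32-entry DTB default is emitted to keep the generated
     configuration header usable; the same reasoning applies to the L2
     fallback above. */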
printf("#define DTB_DEFAULT_ENTRIES 32\n"); } features = get_cputype(GET_FEATURE); if (features & HAVE_CMOV ) printf("#define HAVE_CMOV\n"); if (features & HAVE_MMX ) printf("#define HAVE_MMX\n"); if (features & HAVE_SSE ) printf("#define HAVE_SSE\n"); if (features & HAVE_SSE2 ) printf("#define HAVE_SSE2\n"); if (features & HAVE_SSE3 ) printf("#define HAVE_SSE3\n"); if (features & HAVE_SSSE3) printf("#define HAVE_SSSE3\n"); if (features & HAVE_SSE4_1) printf("#define HAVE_SSE4_1\n"); if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n"); if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); if (features & HAVE_FMA3 ) printf("#define HAVE_FMA3\n"); if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n"); if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n"); if (features & HAVE_128BITFPU) printf("#define HAVE_128BITFPU\n"); if (features & HAVE_FASTMOVU) printf("#define HAVE_FASTMOVU\n"); printf("#define NUM_SHAREDCACHE %d\n", get_cputype(GET_NUMSHARE) + 1); printf("#define NUM_CORES %d\n", get_cputype(GET_NUMCORES) + 1); features = get_coretype(); if (features > 0) printf("#define CORE_%s\n", corename[features]); } else { printf("#define DTB_DEFAULT_ENTRIES 16\n"); printf("#define L1_CODE_SIZE 8192\n"); printf("#define L1_DATA_SIZE 8192\n"); printf("#define L2_SIZE 0\n"); } } void get_architecture(void){ #ifndef __64BIT__ printf("X86"); #else printf("X86_64"); #endif } void get_subarchitecture(void){ printf("%s", get_cpunamechar()); } void get_subdirname(void){ #ifndef __64BIT__ printf("x86"); #else printf("x86_64"); #endif } char *get_corename(void){ return corename[get_coretype()]; } void get_libname(void){ printf("%s", corename_lower[get_coretype()]); } /* This if for Makefile */ void get_sse(void){ int features; features = get_cputype(GET_FEATURE); if (features & HAVE_MMX ) printf("HAVE_MMX=1\n"); if (features & HAVE_SSE ) printf("HAVE_SSE=1\n"); if (features & HAVE_SSE2 ) printf("HAVE_SSE2=1\n"); if (features & HAVE_SSE3 ) printf("HAVE_SSE3=1\n"); if (features & HAVE_SSSE3) printf("HAVE_SSSE3=1\n"); if (features & HAVE_SSE4_1) printf("HAVE_SSE4_1=1\n"); if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n"); if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n"); } OpenBLAS-0.2.20/cpuid_zarch.c000066400000000000000000000055701313527062700156120ustar00rootroot00000000000000/************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include #define CPU_GENERIC 0 #define CPU_Z13 1 static char *cpuname[] = { "ZARCH_GENERIC", "Z13" }; static char *cpuname_lower[] = { "zarch_generic", "z13" }; int detect(void) { FILE *infile; char buffer[512], *p; p = (char *)NULL; infile = fopen("/proc/sysinfo", "r"); while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("Type", buffer, 4)){ p = strchr(buffer, ':') + 2; #if 0 fprintf(stderr, "%s\n", p); #endif break; } } fclose(infile); if (strstr(p, "2964")) return CPU_Z13; if (strstr(p, "2965")) return CPU_Z13; return CPU_GENERIC; } void get_libname(void) { int d = detect(); printf("%s", cpuname_lower[d]); } char *get_corename(void) { return cpuname[detect()]; } void get_architecture(void) { printf("ZARCH"); } void get_subarchitecture(void) { int d = detect(); printf("%s", cpuname[d]); } void get_subdirname(void) { printf("zarch"); } void get_cpuconfig(void) { int d = detect(); switch (d){ case CPU_GENERIC: printf("#define ZARCH_GENERIC\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); break; case CPU_Z13: printf("#define Z13\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); break; } } OpenBLAS-0.2.20/ctest.c000066400000000000000000000041161313527062700144340ustar00rootroot00000000000000//LSB (Linux Standard Base) compiler //only support lsbc++ #if defined (__LSB_VERSION__) #if !defined (__cplusplus) COMPILER_LSB #else #error "OpenBLAS only supports lsbcc." 
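/*
 * Note: the bare COMPILER_xxx, OS_xxx, ARCH_xxx and BINARY_xx tokens in this
 * file are markers, not compilable code.  The build scripts run only the C
 * preprocessor over ctest.c and scan which tokens survive, which is how the
 * compiler, operating system, architecture and word size are detected.
 */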
#endif #endif #if defined(__clang__) COMPILER_CLANG #endif #if defined(__PGI) || defined(__PGIC__) COMPILER_PGI #endif #if defined(__PATHSCALE__) || defined(__PATHCC__) COMPILER_PATHSCALE #endif #if defined(__INTEL_COMPILER) || defined(__ICC) || defined(__ECC) COMPILER_INTEL #endif #if defined(__OPENCC__) COMPILER_OPEN64 #endif #if defined(__SUNPRO_C) COMPILER_SUN #endif #if defined(__IBMC__) || defined(__xlc__) COMPILER_IBM #endif #if defined(__DECCC__) COMPILER_DEC #endif #if defined(__GNUC__) COMPILER_GNU #endif #if defined(__ANDROID__) OS_ANDROID #endif #if defined(__linux__) OS_LINUX #endif #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) OS_FREEBSD #endif #if defined(__NetBSD__) OS_NETBSD #endif #if defined(__sun) OS_SUNOS #endif #if defined(__APPLE__) OS_DARWIN #endif #if defined(_AIX) OS_AIX #endif #if defined(__OSF) OS_OSF #endif #if defined(__WIN32) || defined(__WIN64) || defined(__WINNT) OS_WINNT #endif #if defined(__CYGWIN__) OS_CYGWIN_NT #endif #if defined(__INTERIX) OS_INTERIX #endif #if defined(__gnu_hurd__) /* Hurd is very similar to GNU/Linux, it should work out of the box */ OS_LINUX #endif #if defined(__i386) || defined(_X86) ARCH_X86 #endif #if defined(__x86_64__) || defined(__amd64__) ARCH_X86_64 #endif #if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) ARCH_POWER #endif #if defined(__s390x__) || defined(__zarch__) ARCH_ZARCH #endif #ifdef __mips64 ARCH_MIPS64 #endif #if defined(__mips32) || defined(__mips) ARCH_MIPS #endif #ifdef __alpha ARCH_ALPHA #endif #if defined(__sparc) || defined(__sparc__) ARCH_SPARC #endif #if defined(__ia64__) || defined(__ia64) ARCH_IA64 #endif #if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) BINARY_64 #endif #if defined(__ARM_ARCH) || defined(__ARM_ARCH_7A__) || defined(__arm__) ARCH_ARM #endif #if defined(__aarch64__) ARCH_ARM64 #endif OpenBLAS-0.2.20/ctest/000077500000000000000000000000001313527062700142665ustar00rootroot00000000000000OpenBLAS-0.2.20/ctest/CMakeLists.txt000066400000000000000000000030071313527062700170260ustar00rootroot00000000000000include_directories(${PROJECT_SOURCE_DIR}) enable_language(Fortran) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh "$1 < $2\n" ) foreach(float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char_upper) string(TOLOWER ${float_char_upper} float_char) #level1 add_executable(x${float_char}cblat1 c_${float_char}blat1.f c_${float_char}blas1.c) target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}_static) add_test(NAME "x${float_char}cblat1" COMMAND "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat1") #level2 add_executable(x${float_char}cblat2 c_${float_char}blat2.f c_${float_char}blas2.c c_${float_char}2chke.c auxiliary.c c_xerbla.c constant.c) target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}_static) add_test(NAME "x${float_char}cblat2" COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat2" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2") #level3 add_executable(x${float_char}cblat3 c_${float_char}blat3.f c_${float_char}blas3.c c_${float_char}3chke.c auxiliary.c c_xerbla.c constant.c) target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}_static) add_test(NAME "x${float_char}cblat3" COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat3" 
"${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") endforeach() OpenBLAS-0.2.20/ctest/LICENSE000066400000000000000000000017031313527062700152740ustar00rootroot00000000000000This directory contains the reference implementation of BLAS which is obtainable at: http://netlib.org/blas/ The license, obtained from http://netlib.org/blas/faq.html#2 on November 3, 2010, is as follows: 2) Are there legal restrictions on the use of BLAS reference implementation software? The reference BLAS is a freely-available software package. It is available from netlib via anonymous ftp and the World Wide Web. Thus, it can be included in commercial software packages (and has been). We only ask that proper credit be given to the authors. Like all software, it is copyrighted. It is not trademarked, but we do ask the following: If you modify the source for these routines we ask that you change the name of the routine and comment the changes made to the original. We will gladly answer any questions regarding the software. If a modification is done, however, it is the responsibility of the person who modified the routine to provide support. OpenBLAS-0.2.20/ctest/Makefile000066400000000000000000000107661313527062700157400ustar00rootroot00000000000000# # The Makefile compiles c wrappers and testers for CBLAS. # TOPDIR = .. include $(TOPDIR)/Makefile.system override CFLAGS += -DADD$(BU) -DCBLAS LIB = $(TOPDIR)/$(LIBNAME) stestl1o = c_sblas1.o stestl2o = c_sblas2.o c_s2chke.o auxiliary.o c_xerbla.o constant.o stestl3o = c_sblas3.o c_s3chke.o auxiliary.o c_xerbla.o constant.o dtestl1o = c_dblas1.o dtestl2o = c_dblas2.o c_d2chke.o auxiliary.o c_xerbla.o constant.o dtestl3o = c_dblas3.o c_d3chke.o auxiliary.o c_xerbla.o constant.o ctestl1o = c_cblas1.o ctestl2o = c_cblas2.o c_c2chke.o auxiliary.o c_xerbla.o constant.o ctestl3o = c_cblas3.o c_c3chke.o auxiliary.o c_xerbla.o constant.o ctestl3o_3m = c_cblas3_3m.o c_c3chke_3m.o auxiliary.o c_xerbla.o constant.o ztestl1o = c_zblas1.o ztestl2o = c_zblas2.o c_z2chke.o auxiliary.o c_xerbla.o constant.o ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o all :: all1 all2 all3 all1: xscblat1 xdcblat1 xccblat1 xzcblat1 ifndef CROSS ifeq ($(USE_OPENMP), 1) OMP_NUM_THREADS=2 ./xscblat1 OMP_NUM_THREADS=2 ./xdcblat1 OMP_NUM_THREADS=2 ./xccblat1 OMP_NUM_THREADS=2 ./xzcblat1 else OPENBLAS_NUM_THREADS=2 ./xscblat1 OPENBLAS_NUM_THREADS=2 ./xdcblat1 OPENBLAS_NUM_THREADS=2 ./xccblat1 OPENBLAS_NUM_THREADS=2 ./xzcblat1 endif endif all2: xscblat2 xdcblat2 xccblat2 xzcblat2 ifndef CROSS ifeq ($(USE_OPENMP), 1) OMP_NUM_THREADS=2 ./xscblat2 < sin2 OMP_NUM_THREADS=2 ./xdcblat2 < din2 OMP_NUM_THREADS=2 ./xccblat2 < cin2 OMP_NUM_THREADS=2 ./xzcblat2 < zin2 else OPENBLAS_NUM_THREADS=2 ./xscblat2 < sin2 OPENBLAS_NUM_THREADS=2 ./xdcblat2 < din2 OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2 OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2 endif endif all3: xscblat3 xdcblat3 xccblat3 xzcblat3 ifndef CROSS ifeq ($(USE_OPENMP), 1) OMP_NUM_THREADS=2 ./xscblat3 < sin3 OMP_NUM_THREADS=2 ./xdcblat3 < din3 OMP_NUM_THREADS=2 ./xccblat3 < cin3 OMP_NUM_THREADS=2 ./xzcblat3 < zin3 else OPENBLAS_NUM_THREADS=2 ./xscblat3 < sin3 OPENBLAS_NUM_THREADS=2 ./xdcblat3 < din3 OPENBLAS_NUM_THREADS=2 ./xccblat3 < cin3 OPENBLAS_NUM_THREADS=2 ./xzcblat3 < zin3 endif all3_3m: xzcblat3_3m xccblat3_3m ifeq ($(USE_OPENMP), 1) OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m OMP_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m else OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m 
OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m endif endif clean :: rm -f x* FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) CEXTRALIB = # Single real xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xscblat1 c_sblat1.o $(stestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xscblat2: $(stestl2o) c_sblat2.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xscblat2 c_sblat2.o $(stestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xscblat3: $(stestl3o) c_sblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xscblat3 c_sblat3.o $(stestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) # Double real xdcblat1: $(dtestl1o) c_dblat1.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xdcblat1 c_dblat1.o $(dtestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xdcblat2: $(dtestl2o) c_dblat2.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xdcblat2 c_dblat2.o $(dtestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xdcblat3: $(dtestl3o) c_dblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xdcblat3 c_dblat3.o $(dtestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) # Single complex xccblat1: $(ctestl1o) c_cblat1.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat1 c_cblat1.o $(ctestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) # Double complex xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat1 c_zblat1.o $(ztestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) include $(TOPDIR)/Makefile.tail OpenBLAS-0.2.20/ctest/auxiliary.c000066400000000000000000000024621313527062700164450ustar00rootroot00000000000000/* * Written by T. H. Do, 1/23/98, SGI/CRAY Research. 
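 *
 * Shared helpers for the CBLAS testers: they translate the option characters
 * handed down from the Fortran test drivers ("N"/"T"/"C" for transpose,
 * "U"/"L" for uplo, "U"/"N" for diag, "L"/"R" for side) into the matching
 * CBLAS enum values, or UNDEFINED when the character is not recognised.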
*/ #include #include "common.h" #include "cblas_test.h" void get_transpose_type(char *type, enum CBLAS_TRANSPOSE *trans) { if( (strncmp( type,"n",1 )==0)||(strncmp( type,"N",1 )==0) ) *trans = CblasNoTrans; else if( (strncmp( type,"t",1 )==0)||(strncmp( type,"T",1 )==0) ) *trans = CblasTrans; else if( (strncmp( type,"c",1 )==0)||(strncmp( type,"C",1 )==0) ) *trans = CblasConjTrans; else *trans = UNDEFINED; } void get_uplo_type(char *type, enum CBLAS_UPLO *uplo) { if( (strncmp( type,"u",1 )==0)||(strncmp( type,"U",1 )==0) ) *uplo = CblasUpper; else if( (strncmp( type,"l",1 )==0)||(strncmp( type,"L",1 )==0) ) *uplo = CblasLower; else *uplo = UNDEFINED; } void get_diag_type(char *type, enum CBLAS_DIAG *diag) { if( (strncmp( type,"u",1 )==0)||(strncmp( type,"U",1 )==0) ) *diag = CblasUnit; else if( (strncmp( type,"n",1 )==0)||(strncmp( type,"N",1 )==0) ) *diag = CblasNonUnit; else *diag = UNDEFINED; } void get_side_type(char *type, enum CBLAS_SIDE *side) { if( (strncmp( type,"l",1 )==0)||(strncmp( type,"L",1 )==0) ) *side = CblasLeft; else if( (strncmp( type,"r",1 )==0)||(strncmp( type,"R",1 )==0) ) *side = CblasRight; else *side = UNDEFINED; } OpenBLAS-0.2.20/ctest/c_c2chke.c000066400000000000000000001014301313527062700160720ustar00rootroot00000000000000#include #include #include "common.h" #include "cblas_test.h" int cblas_ok, cblas_lerr, cblas_info; int link_xerbla=TRUE; char *cblas_rout; #ifdef F77_Char void F77_xerbla(F77_Char F77_srname, void *vinfo); #else void F77_xerbla(char *srname, void *vinfo); #endif void chkxer(void) { extern int cblas_ok, cblas_lerr, cblas_info; extern int link_xerbla; extern char *cblas_rout; if (cblas_lerr == 1 ) { printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); cblas_ok = 0 ; } cblas_lerr = 1 ; } void F77_c2chke(char *rout) { char *sf = ( rout ) ; float A[2] = {0.0,0.0}, X[2] = {0.0,0.0}, Y[2] = {0.0,0.0}, ALPHA[2] = {0.0,0.0}, BETA[2] = {0.0,0.0}, RALPHA = 0.0; extern int cblas_info, cblas_lerr, cblas_ok; extern int RowMajorStrg; extern char *cblas_rout; if (link_xerbla) /* call these first to link */ { cblas_xerbla(cblas_info,cblas_rout,""); F77_xerbla(cblas_rout,&cblas_info); } cblas_ok = TRUE ; cblas_lerr = PASSED ; if (strncmp( sf,"cblas_cgemv",11)==0) { cblas_rout = "cblas_cgemv"; cblas_info = 1; cblas_cgemv(INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_cgemv(CblasColMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_cgemv(CblasColMajor, CblasNoTrans, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cgemv(CblasColMajor, CblasNoTrans, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_cgemv(CblasColMajor, CblasNoTrans, 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_cgemv(CblasColMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_cgemv(CblasColMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE; cblas_cgemv(CblasRowMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_cgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 
INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_cgbmv",11)==0) { cblas_rout = "cblas_cgbmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_cgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_cgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_cgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_cgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_cgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_cgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_cgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_chemv",11)==0) { cblas_rout = "cblas_chemv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_chemv(INVALID, CblasUpper, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_chemv(CblasColMajor, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_chemv(CblasColMajor, CblasUpper, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_chemv(CblasColMajor, CblasUpper, 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_chemv(CblasColMajor, CblasUpper, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; 
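   /* Same scheme as every case in this file: cblas_info is set to the
      1-based position of the argument that has deliberately been made
      invalid, the routine is called, and chkxer() checks that the test's
      xerbla stub was triggered for exactly that argument. */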
cblas_chemv(CblasColMajor, CblasUpper, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_chemv(CblasRowMajor, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_chemv(CblasRowMajor, CblasUpper, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_chemv(CblasRowMajor, CblasUpper, 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_chemv(CblasRowMajor, CblasUpper, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_chemv(CblasRowMajor, CblasUpper, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_chbmv",11)==0) { cblas_rout = "cblas_chbmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_chbmv(INVALID, CblasUpper, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_chbmv(CblasColMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_chbmv(CblasColMajor, CblasUpper, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_chbmv(CblasColMajor, CblasUpper, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_chbmv(CblasColMajor, CblasUpper, 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_chbmv(CblasColMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_chbmv(CblasColMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_chbmv(CblasRowMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_chbmv(CblasRowMajor, CblasUpper, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_chbmv(CblasRowMajor, CblasUpper, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_chbmv(CblasRowMajor, CblasUpper, 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_chbmv(CblasRowMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_chbmv(CblasRowMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_chpmv",11)==0) { cblas_rout = "cblas_chpmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_chpmv(INVALID, CblasUpper, 0, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_chpmv(CblasColMajor, INVALID, 0, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_chpmv(CblasColMajor, CblasUpper, INVALID, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_chpmv(CblasColMajor, CblasUpper, 0, ALPHA, A, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_chpmv(CblasColMajor, CblasUpper, 0, ALPHA, A, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_chpmv(CblasRowMajor, INVALID, 0, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_chpmv(CblasRowMajor, CblasUpper, INVALID, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_chpmv(CblasRowMajor, CblasUpper, 0, ALPHA, A, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_chpmv(CblasRowMajor, CblasUpper, 0, ALPHA, A, X, 1, BETA, 
Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ctrmv",11)==0) { cblas_rout = "cblas_ctrmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_ctrmv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ctrmv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ctrmv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_ctrmv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_ctrmv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ctbmv",11)==0) { cblas_rout = "cblas_ctbmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_ctbmv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ctbmv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ctbmv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_ctbmv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_ctbmv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, 
CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ctpmv",11)==0) { cblas_rout = "cblas_ctpmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_ctpmv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ctpmv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ctpmv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_ctpmv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_ctpmv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ctrsv",11)==0) { cblas_rout = "cblas_ctrsv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_ctrsv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ctrsv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ctrsv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_ctrsv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_ctrsv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); } 
else if (strncmp( sf,"cblas_ctbsv",11)==0) { cblas_rout = "cblas_ctbsv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_ctbsv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ctbsv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ctbsv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_ctbsv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_ctbsv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ctpsv",11)==0) { cblas_rout = "cblas_ctpsv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_ctpsv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ctpsv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ctpsv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_ctpsv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_ctpsv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); 
} else if (strncmp( sf,"cblas_cgeru",10)==0) { cblas_rout = "cblas_cgeru"; cblas_info = 1; RowMajorStrg = FALSE; cblas_cgeru(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_cgeru(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_cgeru(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_cgeru(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_cgeru(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_cgeru(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_cgeru(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_cgeru(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_cgeru(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_cgeru(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_cgeru(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); } else if (strncmp( sf,"cblas_cgerc",10)==0) { cblas_rout = "cblas_cgerc"; cblas_info = 1; RowMajorStrg = FALSE; cblas_cgerc(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_cgerc(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_cgerc(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_cgerc(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_cgerc(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_cgerc(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_cgerc(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_cgerc(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_cgerc(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_cgerc(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_cgerc(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); } else if (strncmp( sf,"cblas_cher2",11)==0) { cblas_rout = "cblas_cher2"; cblas_info = 1; RowMajorStrg = FALSE; cblas_cher2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_cher2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_cher2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_cher2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_cher2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_cher2(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_cher2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_cher2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 
); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_cher2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_cher2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_cher2(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); } else if (strncmp( sf,"cblas_chpr2",11)==0) { cblas_rout = "cblas_chpr2"; cblas_info = 1; RowMajorStrg = FALSE; cblas_chpr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_chpr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_chpr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_chpr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_chpr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_chpr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_chpr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_chpr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_chpr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); chkxer(); } else if (strncmp( sf,"cblas_cher",10)==0) { cblas_rout = "cblas_cher"; cblas_info = 1; RowMajorStrg = FALSE; cblas_cher(INVALID, CblasUpper, 0, RALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_cher(CblasColMajor, INVALID, 0, RALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_cher(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_cher(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_cher(CblasColMajor, CblasUpper, 2, RALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_cher(CblasRowMajor, INVALID, 0, RALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_cher(CblasRowMajor, CblasUpper, INVALID, RALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_cher(CblasRowMajor, CblasUpper, 0, RALPHA, X, 0, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_cher(CblasRowMajor, CblasUpper, 2, RALPHA, X, 1, A, 1 ); chkxer(); } else if (strncmp( sf,"cblas_chpr",10)==0) { cblas_rout = "cblas_chpr"; cblas_info = 1; RowMajorStrg = FALSE; cblas_chpr(INVALID, CblasUpper, 0, RALPHA, X, 1, A ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_chpr(CblasColMajor, INVALID, 0, RALPHA, X, 1, A ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_chpr(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_chpr(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_chpr(CblasColMajor, INVALID, 0, RALPHA, X, 1, A ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_chpr(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_chpr(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A ); chkxer(); } if (cblas_ok == TRUE) printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); else printf("******* %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); } 
OpenBLAS-0.2.20/ctest/c_c3chke.c000066400000000000000000002251761313527062700161110ustar00rootroot00000000000000#include #include #include "common.h" #include "cblas_test.h" int cblas_ok, cblas_lerr, cblas_info; int link_xerbla=TRUE; char *cblas_rout; #ifdef F77_Char void F77_xerbla(F77_Char F77_srname, void *vinfo); #else void F77_xerbla(char *srname, void *vinfo); #endif void chkxer(void) { extern int cblas_ok, cblas_lerr, cblas_info; extern int link_xerbla; extern char *cblas_rout; if (cblas_lerr == 1 ) { printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); cblas_ok = 0 ; } cblas_lerr = 1 ; } void F77_c3chke(char * rout) { char *sf = ( rout ) ; float A[4] = {0.0,0.0,0.0,0.0}, B[4] = {0.0,0.0,0.0,0.0}, C[4] = {0.0,0.0,0.0,0.0}, ALPHA[2] = {0.0,0.0}, BETA[2] = {0.0,0.0}, RALPHA = 0.0, RBETA = 0.0; extern int cblas_info, cblas_lerr, cblas_ok; extern int RowMajorStrg; extern char *cblas_rout; cblas_ok = TRUE ; cblas_lerr = PASSED ; if (link_xerbla) /* call these first to link */ { cblas_xerbla(cblas_info,cblas_rout,""); F77_xerbla(cblas_rout,&cblas_info); } if (strncmp( sf,"cblas_cgemm" ,11)==0) { cblas_rout = "cblas_cgemm" ; cblas_info = 1; cblas_cgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_cgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_cgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_cgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, 
C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); 
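   /* The CblasRowMajor passes below re-check the leading-dimension
      arguments with row-major geometry: an untransposed row-major A is
      stored with lda as its row stride, so lda must be at least K rather
      than M, and likewise for B and C. */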
cblas_info = 6; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_chemm" ,11)==0) { cblas_rout = "cblas_chemm" ; cblas_info = 1; cblas_chemm( INVALID, CblasRight, CblasLower, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, INVALID, CblasUpper, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasLeft, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_chemm( 
CblasColMajor, CblasLeft, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasRight, CblasLower, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasRight, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasRight, 
CblasLower, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_csymm" ,11)==0) { cblas_rout = "cblas_csymm" ; cblas_info = 1; cblas_csymm( INVALID, CblasRight, CblasLower, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, INVALID, CblasUpper, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, 
CblasRight, CblasUpper, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasRight, CblasLower, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasRight, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, ALPHA, A, 1, B, 2, 
BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_ctrmm" ,11)==0) { cblas_rout = "cblas_ctrmm" ; cblas_info = 1; cblas_ctrmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = 
FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); 
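/* The remaining row-major ctrmm checks probe M and N (error codes 6 and 7), then the leading dimensions lda and ldb (codes 10 and 12), mirroring the column-major block above. */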
cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 
1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); } else if (strncmp( sf,"cblas_ctrsm" ,11)==0) { cblas_rout = "cblas_ctrsm" ; cblas_info = 1; cblas_ctrsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, INVALID, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg 
= FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); 
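/* Row-major ctrsm follows the same pattern as ctrmm: codes 6 and 7 cover M and N, codes 10 and 12 cover lda and ldb. */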
cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, 
ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); } else if (strncmp( sf,"cblas_cherk" ,11)==0) { cblas_rout = "cblas_cherk" ; cblas_info = 1; cblas_cherk(INVALID, CblasUpper, CblasNoTrans, 0, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasUpper, CblasTrans, 0, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_cherk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, RALPHA, A, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_cherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_cherk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, RALPHA, A, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_cherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, RALPHA, A, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, RALPHA, A, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_cherk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_cherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, RALPHA, A, 2, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_cherk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, 
RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_cherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, RALPHA, A, 2, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, RALPHA, A, 2, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, RALPHA, A, 2, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_csyrk" ,11)==0) { cblas_rout = "cblas_csyrk" ; cblas_info = 1; cblas_csyrk(INVALID, CblasUpper, CblasNoTrans, 0, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csyrk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, 0, 2, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, 0, 2, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_csyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_csyrk(CblasRowMajor, 
CblasUpper, CblasTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_csyrk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_csyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_cher2k" ,12)==0) { cblas_rout = "cblas_cher2k" ; cblas_info = 1; cblas_cher2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasUpper, CblasTrans, 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 1, B, 2, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, ALPHA, A, 1, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 1, B, 2, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, ALPHA, A, 1, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; 
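/* cher2k leading-dimension checks: code 8 flags lda, code 10 flags ldb, and code 13 flags ldc; each offending call passes a leading dimension of 1 where at least 2 is required. */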
RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_csyr2k" ,12)==0) { cblas_rout = "cblas_csyr2k" ; cblas_info = 1; cblas_csyr2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, 
CblasUpper, CblasNoTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 2, 
0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); } if (cblas_ok == 1 ) printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); else printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); } OpenBLAS-0.2.20/ctest/c_c3chke_3m.c000066400000000000000000002514461313527062700165070ustar00rootroot00000000000000#include #include #include "common.h" #include "cblas_test.h" int cblas_ok, cblas_lerr, cblas_info; int link_xerbla=TRUE; char *cblas_rout; #ifdef F77_Char void F77_xerbla(F77_Char F77_srname, void *vinfo); #else void F77_xerbla(char *srname, void *vinfo); #endif void chkxer(void) { extern int cblas_ok, cblas_lerr, cblas_info; extern int link_xerbla; extern char *cblas_rout; if (cblas_lerr == 1 ) { printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); cblas_ok = 0 ; } cblas_lerr = 1 ; } void F77_c3chke(char * rout) { char *sf = ( rout ) ; float A[4] = {0.0,0.0,0.0,0.0}, B[4] = {0.0,0.0,0.0,0.0}, C[4] = {0.0,0.0,0.0,0.0}, ALPHA[2] = {0.0,0.0}, BETA[2] = {0.0,0.0}, RALPHA = 0.0, RBETA = 0.0; extern int cblas_info, cblas_lerr, cblas_ok; extern int RowMajorStrg; extern char *cblas_rout; cblas_ok = TRUE ; cblas_lerr = PASSED ; if (link_xerbla) /* call these first to link */ { cblas_xerbla(cblas_info,cblas_rout,""); F77_xerbla(cblas_rout,&cblas_info); } if (strncmp( sf,"cblas_cgemm3m" ,13)==0) { cblas_rout = "cblas_cgemm3" ; cblas_info = 1; cblas_cgemm3m( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_cgemm3m( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_cgemm3m( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_cgemm3m( INVALID, CblasTrans, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cgemm3m( 
CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg 
= TRUE; cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_cgemm" ,11)==0) { cblas_rout = "cblas_cgemm" ; cblas_info = 1; cblas_cgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_cgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_cgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_cgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, 
ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, ALPHA, A, 2, B, 1, BETA, 
C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; 
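/* cgemm code 14 flags an ldc that is too small: less than N for row-major calls, less than M for column-major calls. */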
RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_chemm" ,11)==0) { cblas_rout = "cblas_chemm" ; cblas_info = 1; cblas_chemm( INVALID, CblasRight, CblasLower, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, INVALID, CblasUpper, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasLeft, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasRight, CblasLower, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_chemm( CblasColMajor, CblasRight, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); 
cblas_info = 4; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_csymm" ,11)==0) { cblas_rout = "cblas_csymm" ; cblas_info = 1; cblas_csymm( INVALID, CblasRight, CblasLower, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, INVALID, CblasUpper, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); 
cblas_info = 4; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasRight, CblasLower, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_csymm( CblasColMajor, CblasRight, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 
5; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_ctrmm" ,11)==0) { cblas_rout = "cblas_ctrmm" ; cblas_info = 1; cblas_ctrmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); 
cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, 
CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; 
cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); } else if (strncmp( sf,"cblas_ctrsm" ,11)==0) { cblas_rout = "cblas_ctrsm" ; cblas_info = 1; cblas_ctrsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, INVALID, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); 
chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, 
CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = 
TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); } else if (strncmp( sf,"cblas_cherk" ,11)==0) { cblas_rout = "cblas_cherk" ; cblas_info = 1; cblas_cherk(INVALID, CblasUpper, CblasNoTrans, 0, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasUpper, CblasTrans, 0, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, RALPHA, A, 1, RBETA, C, 
1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_cherk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, RALPHA, A, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_cherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_cherk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, RALPHA, A, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_cherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, RALPHA, A, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, RALPHA, A, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_cherk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_cherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, RALPHA, A, 2, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_cherk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_cherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, RALPHA, A, 2, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, RALPHA, A, 2, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, RALPHA, A, 2, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_csyrk" ,11)==0) { cblas_rout = "cblas_csyrk" ; cblas_info = 1; cblas_csyrk(INVALID, CblasUpper, CblasNoTrans, 0, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; 
cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csyrk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, 0, 2, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, 0, 2, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_csyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_csyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_csyrk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_csyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_cher2k" ,12)==0) { cblas_rout = "cblas_cher2k" ; cblas_info = 1; cblas_cher2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasUpper, CblasTrans, 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, ALPHA, A, 1, B, 1, 
RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 1, B, 2, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, ALPHA, A, 1, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 1, B, 2, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, ALPHA, A, 1, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_cher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, 
CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_csyr2k" ,12)==0) { cblas_rout = "cblas_csyr2k" ; cblas_info = 1; cblas_csyr2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); 
cblas_info = 10; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_csyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); } if (cblas_ok == 1 ) printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); else printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); } OpenBLAS-0.2.20/ctest/c_cblas1.c000066400000000000000000000032271313527062700161050ustar00rootroot00000000000000/* * c_cblas1.c * * The program is a C wrapper for ccblat1. * * Written by Keita Teranishi. 
2/11/1998 * */ #include "common.h" #include "cblas_test.h" void F77_caxpy(const int *N, const void *alpha, void *X, const int *incX, void *Y, const int *incY) { cblas_caxpy(*N, alpha, X, *incX, Y, *incY); return; } void F77_ccopy(const int *N, void *X, const int *incX, void *Y, const int *incY) { cblas_ccopy(*N, X, *incX, Y, *incY); return; } void F77_cdotc(const int *N, void *X, const int *incX, void *Y, const int *incY, void *dotc) { cblas_cdotc_sub(*N, X, *incX, Y, *incY, dotc); return; } void F77_cdotu(const int *N, void *X, const int *incX, void *Y, const int *incY,void *dotu) { cblas_cdotu_sub(*N, X, *incX, Y, *incY, dotu); return; } void F77_cscal(const int *N, const void * *alpha, void *X, const int *incX) { cblas_cscal(*N, alpha, X, *incX); return; } void F77_csscal(const int *N, const float *alpha, void *X, const int *incX) { cblas_csscal(*N, *alpha, X, *incX); return; } void F77_cswap( const int *N, void *X, const int *incX, void *Y, const int *incY) { cblas_cswap(*N,X,*incX,Y,*incY); return; } int F77_icamax(const int *N, const void *X, const int *incX) { if (*N < 1 || *incX < 1) return(0); return (cblas_icamax(*N, X, *incX)+1); } float F77_scnrm2(const int *N, const void *X, const int *incX) { return cblas_scnrm2(*N, X, *incX); } float F77_scasum(const int *N, void *X, const int *incX) { return cblas_scasum(*N, X, *incX); } OpenBLAS-0.2.20/ctest/c_cblas2.c000066400000000000000000000636771313527062700161210ustar00rootroot00000000000000/* * Written by D.P. Manley, Digital Equipment Corporation. * Prefixed "C_" to BLAS routines and their declarations. * * Modified by T. H. Do, 4/08/98, SGI/CRAY Research. */ #include <stdlib.h> #include "common.h" #include "cblas_test.h" void F77_cgemv(int *order, char *transp, int *m, int *n, const void *alpha, CBLAS_TEST_COMPLEX *a, int *lda, const void *x, int *incx, const void *beta, void *y, int *incy) { CBLAS_TEST_COMPLEX *A; int i,j,LDA; enum CBLAS_TRANSPOSE trans; get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *n+1; A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_COMPLEX) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; } cblas_cgemv( CblasRowMajor, trans, *m, *n, alpha, A, LDA, x, *incx, beta, y, *incy ); free(A); } else if (*order == TEST_COL_MJR) cblas_cgemv( CblasColMajor, trans, *m, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); else cblas_cgemv( UNDEFINED, trans, *m, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); } void F77_cgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *y, int *incy) { CBLAS_TEST_COMPLEX *A; int i,j,irow,jcol,LDA; enum CBLAS_TRANSPOSE trans; get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *ku+*kl+2; A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*ku; i++ ){ irow=*ku+*kl-i; jcol=(*ku)-i; for( j=jcol; j<*n; j++ ){ A[ LDA*(j-jcol)+irow ].real=a[ (*lda)*j+i ].real; A[ LDA*(j-jcol)+irow ].imag=a[ (*lda)*j+i ].imag; } } i=*ku; irow=*ku+*kl-i; for( j=0; j<*n; j++ ){ A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; } for( i=*ku+1; i<*ku+*kl+1; i++ ){ irow=*ku+*kl-i; jcol=i-(*ku); for( j=jcol; j<(*n+*kl); j++ ){ A[ LDA*j+irow ].real=a[ (*lda)*(j-jcol)+i ].real; A[ LDA*j+irow ].imag=a[ (*lda)*(j-jcol)+i ].imag; } } cblas_cgbmv( CblasRowMajor, trans, *m, *n, *kl, 
*ku, alpha, A, LDA, x, *incx, beta, y, *incy ); free(A); } else if (*order == TEST_COL_MJR) cblas_cgbmv( CblasColMajor, trans, *m, *n, *kl, *ku, alpha, a, *lda, x, *incx, beta, y, *incy ); else cblas_cgbmv( UNDEFINED, trans, *m, *n, *kl, *ku, alpha, a, *lda, x, *incx, beta, y, *incy ); } void F77_cgeru(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *y, int *incy, CBLAS_TEST_COMPLEX *a, int *lda){ CBLAS_TEST_COMPLEX *A; int i,j,LDA; if (*order == TEST_ROW_MJR) { LDA = *n+1; A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; } cblas_cgeru( CblasRowMajor, *m, *n, alpha, x, *incx, y, *incy, A, LDA ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ a[ (*lda)*j+i ].real=A[ LDA*i+j ].real; a[ (*lda)*j+i ].imag=A[ LDA*i+j ].imag; } free(A); } else if (*order == TEST_COL_MJR) cblas_cgeru( CblasColMajor, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); else cblas_cgeru( UNDEFINED, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); } void F77_cgerc(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *y, int *incy, CBLAS_TEST_COMPLEX *a, int *lda) { CBLAS_TEST_COMPLEX *A; int i,j,LDA; if (*order == TEST_ROW_MJR) { LDA = *n+1; A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; } cblas_cgerc( CblasRowMajor, *m, *n, alpha, x, *incx, y, *incy, A, LDA ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ a[ (*lda)*j+i ].real=A[ LDA*i+j ].real; a[ (*lda)*j+i ].imag=A[ LDA*i+j ].imag; } free(A); } else if (*order == TEST_COL_MJR) cblas_cgerc( CblasColMajor, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); else cblas_cgerc( UNDEFINED, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); } void F77_chemv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *y, int *incy){ CBLAS_TEST_COMPLEX *A; int i,j,LDA; enum CBLAS_UPLO uplo; get_uplo_type(uplow,&uplo); if (*order == TEST_ROW_MJR) { LDA = *n+1; A = (CBLAS_TEST_COMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; } cblas_chemv( CblasRowMajor, uplo, *n, alpha, A, LDA, x, *incx, beta, y, *incy ); free(A); } else if (*order == TEST_COL_MJR) cblas_chemv( CblasColMajor, uplo, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); else cblas_chemv( UNDEFINED, uplo, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); } void F77_chbmv(int *order, char *uplow, int *n, int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *y, int *incy){ CBLAS_TEST_COMPLEX *A; int i,irow,j,jcol,LDA; enum CBLAS_UPLO uplo; get_uplo_type(uplow,&uplo); if (*order == TEST_ROW_MJR) { if (uplo != CblasUpper && uplo != CblasLower ) cblas_chbmv(CblasRowMajor, UNDEFINED, *n, *k, alpha, a, *lda, x, *incx, beta, y, *incy ); else { LDA = *k+2; A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; jcol=(*k)-i; for( j=jcol; j<*n; j++ ) { A[ LDA*(j-jcol)+irow ].real=a[ (*lda)*j+i ].real; A[ LDA*(j-jcol)+irow ].imag=a[ (*lda)*j+i 
].imag; } } i=*k; irow=*k-i; for( j=0; j<*n; j++ ) { A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; } } else { i=0; irow=*k-i; for( j=0; j<*n; j++ ) { A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; } for( i=1; i<*k+1; i++ ){ irow=*k-i; jcol=i; for( j=jcol; j<(*n+*k); j++ ) { A[ LDA*j+irow ].real=a[ (*lda)*(j-jcol)+i ].real; A[ LDA*j+irow ].imag=a[ (*lda)*(j-jcol)+i ].imag; } } } cblas_chbmv( CblasRowMajor, uplo, *n, *k, alpha, A, LDA, x, *incx, beta, y, *incy ); free(A); } } else if (*order == TEST_COL_MJR) cblas_chbmv(CblasColMajor, uplo, *n, *k, alpha, a, *lda, x, *incx, beta, y, *incy ); else cblas_chbmv(UNDEFINED, uplo, *n, *k, alpha, a, *lda, x, *incx, beta, y, *incy ); } void F77_chpmv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *ap, CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *y, int *incy){ CBLAS_TEST_COMPLEX *A, *AP; int i,j,k,LDA; enum CBLAS_UPLO uplo; get_uplo_type(uplow,&uplo); if (*order == TEST_ROW_MJR) { if (uplo != CblasUpper && uplo != CblasLower ) cblas_chpmv(CblasRowMajor, UNDEFINED, *n, alpha, ap, x, *incx, beta, y, *incy); else { LDA = *n; A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX )); AP = (CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)* sizeof( CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) for( i=0; i #include "common.h" #include "cblas_test.h" #define TEST_COL_MJR 0 #define TEST_ROW_MJR 1 #define UNDEFINED -1 void F77_cgemm(int *order, char *transpa, char *transpb, int *m, int *n, int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { CBLAS_TEST_COMPLEX *A, *B, *C; int i,j,LDA, LDB, LDC; enum CBLAS_TRANSPOSE transa, transb; get_transpose_type(transpa, &transa); get_transpose_type(transpb, &transb); if (*order == TEST_ROW_MJR) { if (transa == CblasNoTrans) { LDA = *k+1; A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else { LDA = *m+1; A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*k; i++ ) for( j=0; j<*m; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } if (transb == CblasNoTrans) { LDB = *n+1; B=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_COMPLEX) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } else { LDB = *k+1; B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX)); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_cgemm( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_cgemm( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else cblas_cgemm( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } 
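/* Aside: every F77_* wrapper in these ctest files handles the row-major case the
 * same way: the column-major Fortran argument (leading dimension *lda) is copied
 * into a freshly malloc'd row-major buffer whose leading dimension is padded to the
 * column count plus one, the cblas_* routine is called with CblasRowMajor, any
 * output matrix is copied back element by element, and the buffer is freed. Below
 * is a minimal sketch of that copy step in plain C; the helper name and the float
 * element type are illustrative only -- the complex wrappers above do the same
 * thing field by field on the .real/.imag members of CBLAS_TEST_COMPLEX. */

#include <stdlib.h>

/* Hypothetical helper mirroring the copy the F77_* wrappers perform: take an
 * m-by-n matrix stored column-major with leading dimension lda and return a
 * newly allocated row-major copy whose leading dimension is n + 1 (the extra
 * column matches the LDA = *n+1 padding used in the wrappers). */
static float *colmajor_to_rowmajor(const float *a, int m, int n, int lda)
{
    int i, j;
    int ldr = n + 1;                          /* padded row-major leading dimension */
    float *r = malloc((size_t)m * ldr * sizeof(float));
    if (r == NULL)
        return NULL;
    for (i = 0; i < m; i++)
        for (j = 0; j < n; j++)
            r[i * ldr + j] = a[j * lda + i];  /* element (i,j) in both layouts */
    return r;                                 /* caller frees, just as the wrappers do */
}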
void F77_chemm(int *order, char *rtlf, char *uplow, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { CBLAS_TEST_COMPLEX *A, *B, *C; int i,j,LDA, LDB, LDC; enum CBLAS_UPLO uplo; enum CBLAS_SIDE side; get_uplo_type(uplow,&uplo); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A= (CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else{ LDA = *n+1; A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } LDB = *n+1; B=(CBLAS_TEST_COMPLEX* )malloc( (*m)*LDB*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } LDC = *n+1; C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_chemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_chemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else cblas_chemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } void F77_csymm(int *order, char *rtlf, char *uplow, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { CBLAS_TEST_COMPLEX *A, *B, *C; int i,j,LDA, LDB, LDC; enum CBLAS_UPLO uplo; enum CBLAS_SIDE side; get_uplo_type(uplow,&uplo); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } else{ LDA = *n+1; A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } LDB = *n+1; B=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_COMPLEX )); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) B[i*LDB+j]=b[j*(*ldb)+i]; LDC = *n+1; C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX)); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) C[i*LDC+j]=c[j*(*ldc)+i]; cblas_csymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) c[j*(*ldc)+i]=C[i*LDC+j]; free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_csymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else cblas_csymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } void F77_cherk(int *order, char *uplow, char *transp, int *n, int *k, float *alpha, CBLAS_TEST_COMPLEX *a, int *lda, float *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { int i,j,LDA,LDC; CBLAS_TEST_COMPLEX *A, *C; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); if (*order == TEST_ROW_MJR) { if (trans == CblasNoTrans) { LDA = *k+1; 
A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else{ LDA = *n+1; A=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_cherk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(C); } else if (*order == TEST_COL_MJR) cblas_cherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, c, *ldc ); else cblas_cherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, c, *ldc ); } void F77_csyrk(int *order, char *uplow, char *transp, int *n, int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { int i,j,LDA,LDC; CBLAS_TEST_COMPLEX *A, *C; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); if (*order == TEST_ROW_MJR) { if (trans == CblasNoTrans) { LDA = *k+1; A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else{ LDA = *n+1; A=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_csyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(C); } else if (*order == TEST_COL_MJR) cblas_csyrk(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, beta, c, *ldc ); else cblas_csyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta, c, *ldc ); } void F77_cher2k(int *order, char *uplow, char *transp, int *n, int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *b, int *ldb, float *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { int i,j,LDA,LDB,LDC; CBLAS_TEST_COMPLEX *A, *B, *C; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); if (*order == TEST_ROW_MJR) { if (trans == CblasNoTrans) { LDA = *k+1; LDB = *k+1; A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX )); B=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_COMPLEX )); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } else { LDA = *n+1; LDB = *n+1; A=(CBLAS_TEST_COMPLEX* )malloc( LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX ) ); B=(CBLAS_TEST_COMPLEX* )malloc( LDB*(*k)*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ){ A[i*LDA+j].real=a[j*(*lda)+i].real; 
A[i*LDA+j].imag=a[j*(*lda)+i].imag; B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_COMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_cher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, B, LDB, *beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_cher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, *beta, c, *ldc ); else cblas_cher2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, *beta, c, *ldc ); } void F77_csyr2k(int *order, char *uplow, char *transp, int *n, int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { int i,j,LDA,LDB,LDC; CBLAS_TEST_COMPLEX *A, *B, *C; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); if (*order == TEST_ROW_MJR) { if (trans == CblasNoTrans) { LDA = *k+1; LDB = *k+1; A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); B=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } else { LDA = *n+1; LDB = *n+1; A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX)); B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*k)*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ){ A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_COMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_csyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_csyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else cblas_csyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } void F77_ctrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *b, int *ldb) { int i,j,LDA,LDB; CBLAS_TEST_COMPLEX *A, *B; enum CBLAS_SIDE side; enum CBLAS_DIAG diag; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); get_diag_type(diagn,&diag); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else{ LDA = *n+1; A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } LDB = *n+1; B=(CBLAS_TEST_COMPLEX* 
)malloc((*m)*LDB*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } cblas_ctrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, A, LDA, B, LDB ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { b[j*(*ldb)+i].real=B[i*LDB+j].real; b[j*(*ldb)+i].imag=B[i*LDB+j].imag; } free(A); free(B); } else if (*order == TEST_COL_MJR) cblas_ctrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); else cblas_ctrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); } void F77_ctrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *b, int *ldb) { int i,j,LDA,LDB; CBLAS_TEST_COMPLEX *A, *B; enum CBLAS_SIDE side; enum CBLAS_DIAG diag; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); get_diag_type(diagn,&diag); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A=(CBLAS_TEST_COMPLEX* )malloc( (*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else{ LDA = *n+1; A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } LDB = *n+1; B=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } cblas_ctrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, A, LDA, B, LDB ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { b[j*(*ldb)+i].real=B[i*LDB+j].real; b[j*(*ldb)+i].imag=B[i*LDB+j].imag; } free(A); free(B); } else if (*order == TEST_COL_MJR) cblas_ctrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); else cblas_ctrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); } OpenBLAS-0.2.20/ctest/c_cblas3_3m.c000066400000000000000000000514101313527062700165030ustar00rootroot00000000000000/* * Written by D.P. Manley, Digital Equipment Corporation. * Prefixed "C_" to BLAS routines and their declarations. * * Modified by T. H. Do, 4/15/98, SGI/CRAY Research. 
*/ #include #include "common.h" #include "cblas_test.h" #define TEST_COL_MJR 0 #define TEST_ROW_MJR 1 #define UNDEFINED -1 void F77_cgemm(int *order, char *transpa, char *transpb, int *m, int *n, int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { CBLAS_TEST_COMPLEX *A, *B, *C; int i,j,LDA, LDB, LDC; enum CBLAS_TRANSPOSE transa, transb; get_transpose_type(transpa, &transa); get_transpose_type(transpb, &transb); if (*order == TEST_ROW_MJR) { if (transa == CblasNoTrans) { LDA = *k+1; A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else { LDA = *m+1; A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*k; i++ ) for( j=0; j<*m; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } if (transb == CblasNoTrans) { LDB = *n+1; B=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_COMPLEX) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } else { LDB = *k+1; B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX)); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_cgemm( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_cgemm( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else cblas_cgemm( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } void F77_chemm(int *order, char *rtlf, char *uplow, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { CBLAS_TEST_COMPLEX *A, *B, *C; int i,j,LDA, LDB, LDC; enum CBLAS_UPLO uplo; enum CBLAS_SIDE side; get_uplo_type(uplow,&uplo); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A= (CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else{ LDA = *n+1; A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } LDB = *n+1; B=(CBLAS_TEST_COMPLEX* )malloc( (*m)*LDB*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } LDC = *n+1; C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_chemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; 
c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_chemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else cblas_chemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } void F77_csymm(int *order, char *rtlf, char *uplow, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { CBLAS_TEST_COMPLEX *A, *B, *C; int i,j,LDA, LDB, LDC; enum CBLAS_UPLO uplo; enum CBLAS_SIDE side; get_uplo_type(uplow,&uplo); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } else{ LDA = *n+1; A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } LDB = *n+1; B=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_COMPLEX )); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) B[i*LDB+j]=b[j*(*ldb)+i]; LDC = *n+1; C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX)); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) C[i*LDC+j]=c[j*(*ldc)+i]; cblas_csymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) c[j*(*ldc)+i]=C[i*LDC+j]; free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_csymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else cblas_csymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } void F77_cherk(int *order, char *uplow, char *transp, int *n, int *k, float *alpha, CBLAS_TEST_COMPLEX *a, int *lda, float *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { int i,j,LDA,LDC; CBLAS_TEST_COMPLEX *A, *C; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); if (*order == TEST_ROW_MJR) { if (trans == CblasNoTrans) { LDA = *k+1; A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else{ LDA = *n+1; A=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_cherk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(C); } else if (*order == TEST_COL_MJR) cblas_cherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, c, *ldc ); else cblas_cherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, c, *ldc ); } void F77_csyrk(int *order, char *uplow, char *transp, int *n, int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { int i,j,LDA,LDC; CBLAS_TEST_COMPLEX *A, *C; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); if (*order == TEST_ROW_MJR) { if (trans == 
CblasNoTrans) { LDA = *k+1; A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else{ LDA = *n+1; A=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_csyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(C); } else if (*order == TEST_COL_MJR) cblas_csyrk(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, beta, c, *ldc ); else cblas_csyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta, c, *ldc ); } void F77_cher2k(int *order, char *uplow, char *transp, int *n, int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *b, int *ldb, float *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { int i,j,LDA,LDB,LDC; CBLAS_TEST_COMPLEX *A, *B, *C; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); if (*order == TEST_ROW_MJR) { if (trans == CblasNoTrans) { LDA = *k+1; LDB = *k+1; A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX )); B=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_COMPLEX )); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } else { LDA = *n+1; LDB = *n+1; A=(CBLAS_TEST_COMPLEX* )malloc( LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX ) ); B=(CBLAS_TEST_COMPLEX* )malloc( LDB*(*k)*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ){ A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_COMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_cher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, B, LDB, *beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_cher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, *beta, c, *ldc ); else cblas_cher2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, *beta, c, *ldc ); } void F77_csyr2k(int *order, char *uplow, char *transp, int *n, int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { int i,j,LDA,LDB,LDC; CBLAS_TEST_COMPLEX *A, *B, *C; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); if (*order == TEST_ROW_MJR) { if (trans == CblasNoTrans) { LDA = *k+1; LDB = *k+1; A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); B=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { 
A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } else { LDA = *n+1; LDB = *n+1; A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX)); B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*k)*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ){ A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_COMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_csyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_csyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else cblas_csyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } void F77_ctrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *b, int *ldb) { int i,j,LDA,LDB; CBLAS_TEST_COMPLEX *A, *B; enum CBLAS_SIDE side; enum CBLAS_DIAG diag; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); get_diag_type(diagn,&diag); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else{ LDA = *n+1; A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } LDB = *n+1; B=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } cblas_ctrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, A, LDA, B, LDB ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { b[j*(*ldb)+i].real=B[i*LDB+j].real; b[j*(*ldb)+i].imag=B[i*LDB+j].imag; } free(A); free(B); } else if (*order == TEST_COL_MJR) cblas_ctrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); else cblas_ctrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); } void F77_ctrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *b, int *ldb) { int i,j,LDA,LDB; CBLAS_TEST_COMPLEX *A, *B; enum CBLAS_SIDE side; enum CBLAS_DIAG diag; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); get_diag_type(diagn,&diag); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A=(CBLAS_TEST_COMPLEX* )malloc( (*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else{ LDA = *n+1; A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { 
A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } LDB = *n+1; B=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } cblas_ctrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, A, LDA, B, LDB ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { b[j*(*ldb)+i].real=B[i*LDB+j].real; b[j*(*ldb)+i].imag=B[i*LDB+j].imag; } free(A); free(B); } else if (*order == TEST_COL_MJR) cblas_ctrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); else cblas_ctrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); } void F77_cgemm3m(int *order, char *transpa, char *transpb, int *m, int *n, int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { CBLAS_TEST_COMPLEX *A, *B, *C; int i,j,LDA, LDB, LDC; enum CBLAS_TRANSPOSE transa, transb; get_transpose_type(transpa, &transa); get_transpose_type(transpb, &transb); if (*order == TEST_ROW_MJR) { if (transa == CblasNoTrans) { LDA = *k+1; A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else { LDA = *m+1; A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*k; i++ ) for( j=0; j<*m; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } if (transb == CblasNoTrans) { LDB = *n+1; B=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_COMPLEX) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } else { LDB = *k+1; B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX)); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_cgemm3m( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_cgemm3m( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else cblas_cgemm3m( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } OpenBLAS-0.2.20/ctest/c_cblat1.f000066400000000000000000000752701313527062700161200ustar00rootroot00000000000000 PROGRAM CCBLAT1 * Test program for the COMPLEX Level 1 CBLAS. * Based upon the original CBLAS test routine together with: * F06GAF Example Program Text * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. REAL SFAC INTEGER IC * .. External Subroutines .. EXTERNAL CHECK1, CHECK2, HEADER * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA SFAC/9.765625E-4/ * .. Executable Statements .. WRITE (NOUT,99999) DO 20 IC = 1, 10 ICASE = IC CALL HEADER * * Initialize PASS, INCX, INCY, and MODE for a new case. 
* The value 9999 for INCX, INCY or MODE will appear in the * detailed output, if any, for cases that do not involve * these parameters. * PASS = .TRUE. INCX = 9999 INCY = 9999 MODE = 9999 IF (ICASE.LE.5) THEN CALL CHECK2(SFAC) ELSE IF (ICASE.GE.6) THEN CALL CHECK1(SFAC) END IF * -- Print IF (PASS) WRITE (NOUT,99998) 20 CONTINUE STOP * 99999 FORMAT (' Complex CBLAS Test Program Results',/1X) 99998 FORMAT (' ----- PASS -----') END SUBROUTINE HEADER * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Arrays .. CHARACTER*15 L(10) * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA L(1)/'CBLAS_CDOTC'/ DATA L(2)/'CBLAS_CDOTU'/ DATA L(3)/'CBLAS_CAXPY'/ DATA L(4)/'CBLAS_CCOPY'/ DATA L(5)/'CBLAS_CSWAP'/ DATA L(6)/'CBLAS_SCNRM2'/ DATA L(7)/'CBLAS_SCASUM'/ DATA L(8)/'CBLAS_CSCAL'/ DATA L(9)/'CBLAS_CSSCAL'/ DATA L(10)/'CBLAS_ICAMAX'/ * .. Executable Statements .. WRITE (NOUT,99999) ICASE, L(ICASE) RETURN * 99999 FORMAT (/' Test of subprogram number',I3,9X,A15) END SUBROUTINE CHECK1(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. REAL SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. COMPLEX CA REAL SA INTEGER I, J, LEN, NP1 * .. Local Arrays .. COMPLEX CTRUE5(8,5,2), CTRUE6(8,5,2), CV(8,5,2), CX(8), + MWPCS(5), MWPCT(5) REAL STRUE2(5), STRUE4(5) INTEGER ITRUE3(5) * .. External Functions .. REAL SCASUMTEST, SCNRM2TEST INTEGER ICAMAXTEST EXTERNAL SCASUMTEST, SCNRM2TEST, ICAMAXTEST * .. External Subroutines .. EXTERNAL CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1 * .. Intrinsic Functions .. INTRINSIC MAX * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. 
DATA SA, CA/0.3E0, (0.4E0,-0.7E0)/ DATA ((CV(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (0.3E0,-0.4E0), (3.0E0,4.0E0), + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + (0.1E0,-0.3E0), (0.5E0,-0.1E0), (5.0E0,6.0E0), + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + (5.0E0,6.0E0), (5.0E0,6.0E0), (0.1E0,0.1E0), + (-0.6E0,0.1E0), (0.1E0,-0.3E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (0.3E0,0.1E0), (0.1E0,0.4E0), + (0.4E0,0.1E0), (0.1E0,0.2E0), (2.0E0,3.0E0), + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + (4.0E0,5.0E0), (0.3E0,-0.4E0), (6.0E0,7.0E0), + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + (0.1E0,-0.3E0), (8.0E0,9.0E0), (0.5E0,-0.1E0), + (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + (2.0E0,5.0E0), (2.0E0,5.0E0), (0.1E0,0.1E0), + (3.0E0,6.0E0), (-0.6E0,0.1E0), (4.0E0,7.0E0), + (0.1E0,-0.3E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + (7.0E0,2.0E0), (0.3E0,0.1E0), (5.0E0,8.0E0), + (0.1E0,0.4E0), (6.0E0,9.0E0), (0.4E0,0.1E0), + (8.0E0,3.0E0), (0.1E0,0.2E0), (9.0E0,4.0E0)/ DATA STRUE2/0.0E0, 0.5E0, 0.6E0, 0.7E0, 0.7E0/ DATA STRUE4/0.0E0, 0.7E0, 1.0E0, 1.3E0, 1.7E0/ DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (-0.16E0,-0.37E0), (3.0E0,4.0E0), + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + (-0.17E0,-0.19E0), (0.13E0,-0.39E0), + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + (0.11E0,-0.03E0), (-0.17E0,0.46E0), + (-0.17E0,-0.19E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + (0.19E0,-0.17E0), (0.32E0,0.09E0), + (0.23E0,-0.24E0), (0.18E0,0.01E0), + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0), + (2.0E0,3.0E0)/ DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + (4.0E0,5.0E0), (-0.16E0,-0.37E0), (6.0E0,7.0E0), + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + (-0.17E0,-0.19E0), (8.0E0,9.0E0), + (0.13E0,-0.39E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + (0.11E0,-0.03E0), (3.0E0,6.0E0), + (-0.17E0,0.46E0), (4.0E0,7.0E0), + (-0.17E0,-0.19E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + (7.0E0,2.0E0), (0.19E0,-0.17E0), (5.0E0,8.0E0), + (0.32E0,0.09E0), (6.0E0,9.0E0), + (0.23E0,-0.24E0), (8.0E0,3.0E0), + (0.18E0,0.01E0), (9.0E0,4.0E0)/ DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (0.09E0,-0.12E0), (3.0E0,4.0E0), + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + (0.03E0,-0.09E0), (0.15E0,-0.03E0), + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + (0.03E0,0.03E0), (-0.18E0,0.03E0), + (0.03E0,-0.09E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + (0.09E0,0.03E0), (0.03E0,0.12E0), + (0.12E0,0.03E0), (0.03E0,0.06E0), (2.0E0,3.0E0), + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ DATA 
((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + (4.0E0,5.0E0), (0.09E0,-0.12E0), (6.0E0,7.0E0), + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + (0.03E0,-0.09E0), (8.0E0,9.0E0), + (0.15E0,-0.03E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + (0.03E0,0.03E0), (3.0E0,6.0E0), + (-0.18E0,0.03E0), (4.0E0,7.0E0), + (0.03E0,-0.09E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + (7.0E0,2.0E0), (0.09E0,0.03E0), (5.0E0,8.0E0), + (0.03E0,0.12E0), (6.0E0,9.0E0), (0.12E0,0.03E0), + (8.0E0,3.0E0), (0.03E0,0.06E0), (9.0E0,4.0E0)/ DATA ITRUE3/0, 1, 2, 2, 2/ * .. Executable Statements .. DO 60 INCX = 1, 2 DO 40 NP1 = 1, 5 N = NP1 - 1 LEN = 2*MAX(N,1) * .. Set vector arguments .. DO 20 I = 1, LEN CX(I) = CV(I,NP1,INCX) 20 CONTINUE IF (ICASE.EQ.6) THEN * .. SCNRM2TEST .. CALL STEST1(SCNRM2TEST(N,CX,INCX),STRUE2(NP1), + STRUE2(NP1), SFAC) ELSE IF (ICASE.EQ.7) THEN * .. SCASUMTEST .. CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1), + STRUE4(NP1),SFAC) ELSE IF (ICASE.EQ.8) THEN * .. CSCAL .. CALL CSCAL(N,CA,CX,INCX) CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), + SFAC) ELSE IF (ICASE.EQ.9) THEN * .. CSSCALTEST .. CALL CSSCALTEST(N,SA,CX,INCX) CALL CTEST(LEN,CX,CTRUE6(1,NP1,INCX),CTRUE6(1,NP1,INCX), + SFAC) ELSE IF (ICASE.EQ.10) THEN * .. ICAMAXTEST .. CALL ITEST1(ICAMAXTEST(N,CX,INCX),ITRUE3(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' STOP END IF * 40 CONTINUE 60 CONTINUE * INCX = 1 IF (ICASE.EQ.8) THEN * CSCAL * Add a test for alpha equal to zero. CA = (0.0E0,0.0E0) DO 80 I = 1, 5 MWPCT(I) = (0.0E0,0.0E0) MWPCS(I) = (1.0E0,1.0E0) 80 CONTINUE CALL CSCAL(5,CA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) ELSE IF (ICASE.EQ.9) THEN * CSSCALTEST * Add a test for alpha equal to zero. SA = 0.0E0 DO 100 I = 1, 5 MWPCT(I) = (0.0E0,0.0E0) MWPCS(I) = (1.0E0,1.0E0) 100 CONTINUE CALL CSSCALTEST(5,SA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) * Add a test for alpha equal to one. SA = 1.0E0 DO 120 I = 1, 5 MWPCT(I) = CX(I) MWPCS(I) = CX(I) 120 CONTINUE CALL CSSCALTEST(5,SA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) * Add a test for alpha equal to minus one. SA = -1.0E0 DO 140 I = 1, 5 MWPCT(I) = -CX(I) MWPCS(I) = -CX(I) 140 CONTINUE CALL CSSCALTEST(5,SA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) END IF RETURN END SUBROUTINE CHECK2(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. REAL SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. COMPLEX CA,CTEMP INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY * .. Local Arrays .. COMPLEX CDOT(1), CSIZE1(4), CSIZE2(7,2), CSIZE3(14), + CT10X(7,4,4), CT10Y(7,4,4), CT6(4,4), CT7(4,4), + CT8(7,4,4), CX(7), CX1(7), CY(7), CY1(7) INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) * .. External Functions .. EXTERNAL CDOTCTEST, CDOTUTEST * .. External Subroutines .. EXTERNAL CAXPYTEST, CCOPYTEST, CSWAPTEST, CTEST * .. Intrinsic Functions .. INTRINSIC ABS, MIN * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. 
DATA CA/(0.4E0,-0.7E0)/ DATA INCXS/1, 2, -2, -1/ DATA INCYS/1, -2, 1, -2/ DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ DATA NS/0, 1, 2, 4/ DATA CX1/(0.7E0,-0.8E0), (-0.4E0,-0.7E0), + (-0.1E0,-0.9E0), (0.2E0,-0.8E0), + (-0.9E0,-0.4E0), (0.1E0,0.4E0), (-0.6E0,0.6E0)/ DATA CY1/(0.6E0,-0.6E0), (-0.9E0,0.5E0), + (0.7E0,-0.6E0), (0.1E0,-0.5E0), (-0.1E0,-0.2E0), + (-0.5E0,-0.3E0), (0.8E0,-0.7E0)/ DATA ((CT8(I,J,1),I=1,7),J=1,4)/(0.6E0,-0.6E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.32E0,-1.41E0), + (-1.55E0,0.5E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.32E0,-1.41E0), (-1.55E0,0.5E0), + (0.03E0,-0.89E0), (-0.38E0,-0.96E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ DATA ((CT8(I,J,2),I=1,7),J=1,4)/(0.6E0,-0.6E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (-0.07E0,-0.89E0), + (-0.9E0,0.5E0), (0.42E0,-1.41E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.78E0,0.06E0), (-0.9E0,0.5E0), + (0.06E0,-0.13E0), (0.1E0,-0.5E0), + (-0.77E0,-0.49E0), (-0.5E0,-0.3E0), + (0.52E0,-1.51E0)/ DATA ((CT8(I,J,3),I=1,7),J=1,4)/(0.6E0,-0.6E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (-0.07E0,-0.89E0), + (-1.18E0,-0.31E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.78E0,0.06E0), (-1.54E0,0.97E0), + (0.03E0,-0.89E0), (-0.18E0,-1.31E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ DATA ((CT8(I,J,4),I=1,7),J=1,4)/(0.6E0,-0.6E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.32E0,-1.41E0), (-0.9E0,0.5E0), + (0.05E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.32E0,-1.41E0), + (-0.9E0,0.5E0), (0.05E0,-0.6E0), (0.1E0,-0.5E0), + (-0.77E0,-0.49E0), (-0.5E0,-0.3E0), + (0.32E0,-1.16E0)/ DATA CT7/(0.0E0,0.0E0), (-0.06E0,-0.90E0), + (0.65E0,-0.47E0), (-0.34E0,-1.22E0), + (0.0E0,0.0E0), (-0.06E0,-0.90E0), + (-0.59E0,-1.46E0), (-1.04E0,-0.04E0), + (0.0E0,0.0E0), (-0.06E0,-0.90E0), + (-0.83E0,0.59E0), (0.07E0,-0.37E0), + (0.0E0,0.0E0), (-0.06E0,-0.90E0), + (-0.76E0,-1.15E0), (-1.33E0,-1.82E0)/ DATA CT6/(0.0E0,0.0E0), (0.90E0,0.06E0), + (0.91E0,-0.77E0), (1.80E0,-0.10E0), + (0.0E0,0.0E0), (0.90E0,0.06E0), (1.45E0,0.74E0), + (0.20E0,0.90E0), (0.0E0,0.0E0), (0.90E0,0.06E0), + (-0.55E0,0.23E0), (0.83E0,-0.39E0), + (0.0E0,0.0E0), (0.90E0,0.06E0), (1.04E0,0.79E0), + (1.95E0,1.22E0)/ DATA ((CT10X(I,J,1),I=1,7),J=1,4)/(0.7E0,-0.8E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.6E0,-0.6E0), (-0.9E0,0.5E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.6E0,-0.6E0), + (-0.9E0,0.5E0), (0.7E0,-0.6E0), (0.1E0,-0.5E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ DATA ((CT10X(I,J,2),I=1,7),J=1,4)/(0.7E0,-0.8E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + 
(0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.7E0,-0.6E0), (-0.4E0,-0.7E0), + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.8E0,-0.7E0), + (-0.4E0,-0.7E0), (-0.1E0,-0.2E0), + (0.2E0,-0.8E0), (0.7E0,-0.6E0), (0.1E0,0.4E0), + (0.6E0,-0.6E0)/ DATA ((CT10X(I,J,3),I=1,7),J=1,4)/(0.7E0,-0.8E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (-0.9E0,0.5E0), (-0.4E0,-0.7E0), + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.1E0,-0.5E0), + (-0.4E0,-0.7E0), (0.7E0,-0.6E0), (0.2E0,-0.8E0), + (-0.9E0,0.5E0), (0.1E0,0.4E0), (0.6E0,-0.6E0)/ DATA ((CT10X(I,J,4),I=1,7),J=1,4)/(0.7E0,-0.8E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.6E0,-0.6E0), (0.7E0,-0.6E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.6E0,-0.6E0), + (0.7E0,-0.6E0), (-0.1E0,-0.2E0), (0.8E0,-0.7E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ DATA ((CT10Y(I,J,1),I=1,7),J=1,4)/(0.6E0,-0.6E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.7E0,-0.8E0), (-0.4E0,-0.7E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.7E0,-0.8E0), + (-0.4E0,-0.7E0), (-0.1E0,-0.9E0), + (0.2E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0)/ DATA ((CT10Y(I,J,2),I=1,7),J=1,4)/(0.6E0,-0.6E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (-0.1E0,-0.9E0), (-0.9E0,0.5E0), + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (-0.6E0,0.6E0), + (-0.9E0,0.5E0), (-0.9E0,-0.4E0), (0.1E0,-0.5E0), + (-0.1E0,-0.9E0), (-0.5E0,-0.3E0), + (0.7E0,-0.8E0)/ DATA ((CT10Y(I,J,3),I=1,7),J=1,4)/(0.6E0,-0.6E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (-0.1E0,-0.9E0), (0.7E0,-0.8E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (-0.6E0,0.6E0), + (-0.9E0,-0.4E0), (-0.1E0,-0.9E0), + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0)/ DATA ((CT10Y(I,J,4),I=1,7),J=1,4)/(0.6E0,-0.6E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.7E0,-0.8E0), (-0.9E0,0.5E0), + (-0.4E0,-0.7E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.7E0,-0.8E0), + (-0.9E0,0.5E0), (-0.4E0,-0.7E0), (0.1E0,-0.5E0), + (-0.1E0,-0.9E0), (-0.5E0,-0.3E0), + (0.2E0,-0.8E0)/ DATA CSIZE1/(0.0E0,0.0E0), (0.9E0,0.9E0), + (1.63E0,1.73E0), (2.90E0,2.78E0)/ DATA CSIZE3/(0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (1.17E0,1.17E0), + (1.17E0,1.17E0), 
(1.17E0,1.17E0), + (1.17E0,1.17E0), (1.17E0,1.17E0), + (1.17E0,1.17E0), (1.17E0,1.17E0)/ DATA CSIZE2/(0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (1.54E0,1.54E0), + (1.54E0,1.54E0), (1.54E0,1.54E0), + (1.54E0,1.54E0), (1.54E0,1.54E0), + (1.54E0,1.54E0), (1.54E0,1.54E0)/ * .. Executable Statements .. DO 60 KI = 1, 4 INCX = INCXS(KI) INCY = INCYS(KI) MX = ABS(INCX) MY = ABS(INCY) * DO 40 KN = 1, 4 N = NS(KN) KSIZE = MIN(2,KN) LENX = LENS(KN,MX) LENY = LENS(KN,MY) * .. initialize all argument arrays .. DO 20 I = 1, 7 CX(I) = CX1(I) CY(I) = CY1(I) 20 CONTINUE IF (ICASE.EQ.1) THEN * .. CDOTCTEST .. CALL CDOTCTEST(N,CX,INCX,CY,INCY,CTEMP) CDOT(1) = CTEMP CALL CTEST(1,CDOT,CT6(KN,KI),CSIZE1(KN),SFAC) ELSE IF (ICASE.EQ.2) THEN * .. CDOTUTEST .. CALL CDOTUTEST(N,CX,INCX,CY,INCY,CTEMP) CDOT(1) = CTEMP CALL CTEST(1,CDOT,CT7(KN,KI),CSIZE1(KN),SFAC) ELSE IF (ICASE.EQ.3) THEN * .. CAXPYTEST .. CALL CAXPYTEST(N,CA,CX,INCX,CY,INCY) CALL CTEST(LENY,CY,CT8(1,KN,KI),CSIZE2(1,KSIZE),SFAC) ELSE IF (ICASE.EQ.4) THEN * .. CCOPYTEST .. CALL CCOPYTEST(N,CX,INCX,CY,INCY) CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0) ELSE IF (ICASE.EQ.5) THEN * .. CSWAPTEST .. CALL CSWAPTEST(N,CX,INCX,CY,INCY) CALL CTEST(LENX,CX,CT10X(1,KN,KI),CSIZE3,1.0E0) CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' STOP END IF * 40 CONTINUE 60 CONTINUE RETURN END SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) * ********************************* STEST ************************** * * THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO * SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE * NEGLIGIBLE. * * C. L. LAWSON, JPL, 1974 DEC 10 * * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. REAL SFAC INTEGER LEN * .. Array Arguments .. REAL SCOMP(LEN), SSIZE(LEN), STRUE(LEN) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. REAL SD INTEGER I * .. External Functions .. REAL SDIFF EXTERNAL SDIFF * .. Intrinsic Functions .. INTRINSIC ABS * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Executable Statements .. * DO 40 I = 1, LEN SD = SCOMP(I) - STRUE(I) IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0E0) + GO TO 40 * * HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). * IF ( .NOT. PASS) GO TO 20 * PRINT FAIL MESSAGE AND HEADER. PASS = .FALSE. WRITE (NOUT,99999) WRITE (NOUT,99998) 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + STRUE(I), SD, SSIZE(I) 40 CONTINUE RETURN * 99999 FORMAT (' FAIL') 99998 FORMAT (/' CASE N INCX INCY MODE I ', + ' COMP(I) TRUE(I) DIFFERENCE', + ' SIZE(I)',/1X) 99997 FORMAT (1X,I4,I3,3I5,I3,2E36.8,2E12.4) END SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * * THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * * C.L. LAWSON, JPL, 1978 DEC 6 * * .. Scalar Arguments .. REAL SCOMP1, SFAC, STRUE1 * .. Array Arguments .. REAL SSIZE(*) * .. Local Arrays .. REAL SCOMP(1), STRUE(1) * .. External Subroutines .. EXTERNAL STEST * .. Executable Statements .. * SCOMP(1) = SCOMP1 STRUE(1) = STRUE1 CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) * RETURN END REAL FUNCTION SDIFF(SA,SB) * ********************************* SDIFF ************************** * COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. 
LAWSON, JPL 1974 FEB 15 * * .. Scalar Arguments .. REAL SA, SB * .. Executable Statements .. SDIFF = SA - SB RETURN END SUBROUTINE CTEST(LEN,CCOMP,CTRUE,CSIZE,SFAC) * **************************** CTEST ***************************** * * C.L. LAWSON, JPL, 1978 DEC 6 * * .. Scalar Arguments .. REAL SFAC INTEGER LEN * .. Array Arguments .. COMPLEX CCOMP(LEN), CSIZE(LEN), CTRUE(LEN) * .. Local Scalars .. INTEGER I * .. Local Arrays .. REAL SCOMP(20), SSIZE(20), STRUE(20) * .. External Subroutines .. EXTERNAL STEST * .. Intrinsic Functions .. INTRINSIC AIMAG, REAL * .. Executable Statements .. DO 20 I = 1, LEN SCOMP(2*I-1) = REAL(CCOMP(I)) SCOMP(2*I) = AIMAG(CCOMP(I)) STRUE(2*I-1) = REAL(CTRUE(I)) STRUE(2*I) = AIMAG(CTRUE(I)) SSIZE(2*I-1) = REAL(CSIZE(I)) SSIZE(2*I) = AIMAG(CSIZE(I)) 20 CONTINUE * CALL STEST(2*LEN,SCOMP,STRUE,SSIZE,SFAC) RETURN END SUBROUTINE ITEST1(ICOMP,ITRUE) * ********************************* ITEST1 ************************* * * THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR * EQUALITY. * C. L. LAWSON, JPL, 1974 DEC 10 * * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. INTEGER ICOMP, ITRUE * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. INTEGER ID * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Executable Statements .. IF (ICOMP.EQ.ITRUE) GO TO 40 * * HERE ICOMP IS NOT EQUAL TO ITRUE. * IF ( .NOT. PASS) GO TO 20 * PRINT FAIL MESSAGE AND HEADER. PASS = .FALSE. WRITE (NOUT,99999) WRITE (NOUT,99998) 20 ID = ICOMP - ITRUE WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID 40 CONTINUE RETURN * 99999 FORMAT (' FAIL') 99998 FORMAT (/' CASE N INCX INCY MODE ', + ' COMP TRUE DIFFERENCE', + /1X) 99997 FORMAT (1X,I4,I3,3I5,2I36,I12) END OpenBLAS-0.2.20/ctest/c_cblat2.f000066400000000000000000003145771313527062700161270ustar00rootroot00000000000000 PROGRAM CBLAT2 * * Test program for the COMPLEX Level 2 Blas. * * The program must be driven by a short data file. The first 17 records * of the file are read using list-directed input, the last 17 records * are read using the format ( A12, L2 ). An annotated example of a data * file can be obtained by deleting the first 3 characters from the * following 34 lines: * 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO * 6 NUMBER OF VALUES OF N * 0 1 2 3 5 9 VALUES OF N * 4 NUMBER OF VALUES OF K * 0 1 2 4 VALUES OF K * 4 NUMBER OF VALUES OF INCX AND INCY * 1 2 -1 -2 VALUES OF INCX AND INCY * 3 NUMBER OF VALUES OF ALPHA * (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA * 3 NUMBER OF VALUES OF BETA * (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA * cblas_cgemv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_cgbmv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_chemv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_chbmv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_chpmv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ctrmv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ctbmv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ctpmv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ctrsv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ctbsv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ctpsv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_cgerc T PUT F FOR NO TEST. SAME COLUMNS. 
* cblas_cgeru T PUT F FOR NO TEST. SAME COLUMNS. * cblas_cher T PUT F FOR NO TEST. SAME COLUMNS. * cblas_chpr T PUT F FOR NO TEST. SAME COLUMNS. * cblas_cher2 T PUT F FOR NO TEST. SAME COLUMNS. * cblas_chpr2 T PUT F FOR NO TEST. SAME COLUMNS. * * See: * * Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. * An extended set of Fortran Basic Linear Algebra Subprograms. * * Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics * and Computer Science Division, Argonne National Laboratory, * 9700 South Cass Avenue, Argonne, Illinois 60439, US. * * Or * * NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms * Group Ltd., NAG Central Office, 256 Banbury Road, Oxford * OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st * Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. * * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. INTEGER NIN, NOUT PARAMETER ( NIN = 5, NOUT = 6 ) INTEGER NSUBS PARAMETER ( NSUBS = 17 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) REAL RZERO, RHALF, RONE PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, $ NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. REAL EPS, ERR, THRESH INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, $ NTRA, LAYOUT LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, $ TSTERR, CORDER, RORDER CHARACTER*1 TRANS CHARACTER*12 SNAMET CHARACTER*32 SNAPS * .. Local Arrays .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), $ X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( 2*NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) LOGICAL LTEST( NSUBS ) CHARACTER*12 SNAMES( NSUBS ) * .. External Functions .. REAL SDIFF LOGICAL LCE EXTERNAL SDIFF, LCE * .. External Subroutines .. EXTERNAL CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CCHK6, $ CC2CHKE, CMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK CHARACTER*12 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK COMMON /SRNAMC/SRNAMT * .. Data statements .. DATA SNAMES/'cblas_cgemv ', 'cblas_cgbmv ', $ 'cblas_chemv ','cblas_chbmv ','cblas_chpmv ', $ 'cblas_ctrmv ','cblas_ctbmv ','cblas_ctpmv ', $ 'cblas_ctrsv ','cblas_ctbsv ','cblas_ctpsv ', $ 'cblas_cgerc ','cblas_cgeru ','cblas_cher ', $ 'cblas_chpr ','cblas_cher2 ','cblas_chpr2 '/ * .. Executable Statements .. * NOUTC = NOUT * * Read name and unit number for summary output file and open file. * READ( NIN, FMT = * )SNAPS READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN OPEN( NTRA, FILE = SNAPS ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI REWI = REWI.AND.TRACE * Read the flag that directs stopping on any failure. READ( NIN, FMT = * )SFATAL * Read the flag that indicates whether error exits are to be tested. READ( NIN, FMT = * )TSTERR * Read the flag that indicates whether row-major data layout to be tested. READ( NIN, FMT = * )LAYOUT * Read the threshold value of the test ratio READ( NIN, FMT = * )THRESH * * Read and check the parameter values for the tests. 
* * Values of N READ( NIN, FMT = * )NIDIM IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN WRITE( NOUT, FMT = 9997 )'N', NIDMAX GO TO 230 END IF READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) DO 10 I = 1, NIDIM IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN WRITE( NOUT, FMT = 9996 )NMAX GO TO 230 END IF 10 CONTINUE * Values of K READ( NIN, FMT = * )NKB IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN WRITE( NOUT, FMT = 9997 )'K', NKBMAX GO TO 230 END IF READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) DO 20 I = 1, NKB IF( KB( I ).LT.0 )THEN WRITE( NOUT, FMT = 9995 ) GO TO 230 END IF 20 CONTINUE * Values of INCX and INCY READ( NIN, FMT = * )NINC IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX GO TO 230 END IF READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) DO 30 I = 1, NINC IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN WRITE( NOUT, FMT = 9994 )INCMAX GO TO 230 END IF 30 CONTINUE * Values of ALPHA READ( NIN, FMT = * )NALF IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX GO TO 230 END IF READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) * Values of BETA READ( NIN, FMT = * )NBET IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX GO TO 230 END IF READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) * * Report values of parameters. * WRITE( NOUT, FMT = 9993 ) WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) IF( .NOT.TSTERR )THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )THRESH WRITE( NOUT, FMT = * ) RORDER = .FALSE. CORDER = .FALSE. IF (LAYOUT.EQ.2) THEN RORDER = .TRUE. CORDER = .TRUE. WRITE( *, FMT = 10002 ) ELSE IF (LAYOUT.EQ.1) THEN RORDER = .TRUE. WRITE( *, FMT = 10001 ) ELSE IF (LAYOUT.EQ.0) THEN CORDER = .TRUE. WRITE( *, FMT = 10000 ) END IF WRITE( *, FMT = * ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 40 I = 1, NSUBS LTEST( I ) = .FALSE. 40 CONTINUE 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT DO 60 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET STOP 70 LTEST( I ) = LTESTT GO TO 50 * 80 CONTINUE CLOSE ( NIN ) * * Compute EPS (the machine precision). * EPS = RONE 90 CONTINUE IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) $ GO TO 100 EPS = RHALF*EPS GO TO 90 100 CONTINUE EPS = EPS + EPS WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of CMVCH using exact data. * N = MIN( 32, NMAX ) DO 120 J = 1, N DO 110 I = 1, N A( I, J ) = MAX( I - J + 1, 0 ) 110 CONTINUE X( J ) = J Y( J ) = ZERO 120 CONTINUE DO 130 J = 1, N YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 130 CONTINUE * YY holds the exact result. On exit from CMVCH YT holds * the result computed by CMVCH. TRANS = 'N' CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LCE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR STOP END IF TRANS = 'T' CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LCE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR STOP END IF * * Test each subroutine in turn. 
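*     The computed GO TO below maps the 17 CBLAS routine names onto
*     the six checkers CCHK1-CCHK6 (CGEMV/CGBMV -> CCHK1, ...,
*     CHER2/CHPR2 -> CCHK6).  Each checker is invoked once per storage
*     order selected in the data file: IORDER = 0 exercises the
*     column-major interface, IORDER = 1 the row-major interface.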
* DO 210 ISNUM = 1, NSUBS WRITE( NOUT, FMT = * ) IF( .NOT.LTEST( ISNUM ) )THEN * Subprogram is not to be tested. WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) ELSE SRNAMT = SNAMES( ISNUM ) * Test error exits. IF( TSTERR )THEN CALL CC2CHKE( SNAMES( ISNUM ) ) WRITE( NOUT, FMT = * ) END IF * Test computations. INFOT = 0 OK = .TRUE. FATAL = .FALSE. GO TO ( 140, 140, 150, 150, 150, 160, 160, $ 160, 160, 160, 160, 170, 170, 180, $ 180, 190, 190 )ISNUM * Test CGEMV, 01, and CGBMV, 02. 140 IF (CORDER) THEN CALL CCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G, 0 ) END IF IF (RORDER) THEN CALL CCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G, 1 ) END IF GO TO 200 * Test CHEMV, 03, CHBMV, 04, and CHPMV, 05. 150 IF (CORDER) THEN CALL CCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G, 0 ) END IF IF (RORDER) THEN CALL CCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G, 1 ) END IF GO TO 200 * Test CTRMV, 06, CTBMV, 07, CTPMV, 08, * CTRSV, 09, CTBSV, 10, and CTPSV, 11. 160 IF (CORDER) THEN CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, $ 0 ) END IF IF (RORDER) THEN CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, $ 1 ) END IF GO TO 200 * Test CGERC, 12, CGERU, 13. 170 IF (CORDER) THEN CALL CCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 0 ) END IF IF (RORDER) THEN CALL CCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 1 ) END IF GO TO 200 * Test CHER, 14, and CHPR, 15. 180 IF (CORDER) THEN CALL CCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 0 ) END IF IF (RORDER) THEN CALL CCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 1 ) END IF GO TO 200 * Test CHER2, 16, and CHPR2, 17. 
190 IF (CORDER) THEN CALL CCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 0 ) END IF IF (RORDER) THEN CALL CCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 1 ) END IF * 200 IF( FATAL.AND.SFATAL ) $ GO TO 220 END IF 210 CONTINUE WRITE( NOUT, FMT = 9982 ) GO TO 240 * 220 CONTINUE WRITE( NOUT, FMT = 9981 ) GO TO 240 * 230 CONTINUE WRITE( NOUT, FMT = 9987 ) * 240 CONTINUE IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) STOP * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) 10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', $ 'S THAN', F8.2 ) 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) 9997 FORMAT(' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', $ 'THAN ', I2 ) 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', $ I2 ) 9993 FORMAT(' TESTS OF THE COMPLEX LEVEL 2 BLAS', //' THE F', $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) 9992 FORMAT( ' FOR N ', 9I6 ) 9991 FORMAT( ' FOR K ', 7I6 ) 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) 9989 FORMAT( ' FOR ALPHA ', $ 7('(', F4.1, ',', F4.1, ') ', : ) ) 9988 FORMAT( ' FOR BETA ', $ 7('(', F4.1, ',', F4.1, ') ', : ) ) 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', $ /' ******* TESTS ABANDONED *******' ) 9986 FORMAT(' SUBPROGRAM NAME ',A12, ' NOT RECOGNIZED', /' ******* T', $ 'ESTS ABANDONED *******' ) 9985 FORMAT(' ERROR IN CMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', $ 'ATED WRONGLY.', /' CMVCH WAS CALLED WITH TRANS = ', A1, $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' $ , /' ******* TESTS ABANDONED *******' ) 9984 FORMAT(A12, L2 ) 9983 FORMAT( 1X,A12, ' WAS NOT TESTED' ) 9982 FORMAT( /' END OF TESTS' ) 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) * * End of CBLAT2. * END SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, $ XS, Y, YY, YS, YT, G, IORDER ) * * Tests CGEMV and CGBMV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX ZERO, HALF PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, $ NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. 
COMPLEX ALPHA, ALS, BETA, BLS, TRANSL REAL ERR, ERRMAX INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, $ NL, NS LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN CHARACTER*1 TRANS, TRANSS CHARACTER*14 CTRANS CHARACTER*3 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CCGBMV, CCGEMV, CMAKE, CMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICH/'NTC'/ * .. Executable Statements .. FULL = SNAME( 9: 9 ).EQ.'e' BANDED = SNAME( 9: 9 ).EQ.'b' * Define the number of arguments. IF( FULL )THEN NARGS = 11 ELSE IF( BANDED )THEN NARGS = 13 END IF * NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 120 IN = 1, NIDIM N = IDIM( IN ) ND = N/2 + 1 * DO 110 IM = 1, 2 IF( IM.EQ.1 ) $ M = MAX( N - ND, 0 ) IF( IM.EQ.2 ) $ M = MIN( N + ND, NMAX ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IKU = 1, NK IF( BANDED )THEN KU = KB( IKU ) KL = MAX( KU - 1, 0 ) ELSE KU = N - 1 KL = M - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = KL + KU + 1 ELSE LDA = M END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 LAA = LDA*N NULL = N.LE.0.OR.M.LE.0 * * Generate the matrix A. * TRANSL = ZERO CALL CMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, AA, $ LDA, KL, KU, RESET, TRANSL ) * DO 90 IC = 1, 3 TRANS = ICH( IC: IC ) IF (TRANS.EQ.'N')THEN CTRANS = ' CblasNoTrans' ELSE IF (TRANS.EQ.'T')THEN CTRANS = ' CblasTrans' ELSE CTRANS = 'CblasConjTrans' END IF TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' * IF( TRAN )THEN ML = N NL = M ELSE ML = M NL = N END IF * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*NL * * Generate the vector X. * TRANSL = HALF CALL CMAKE( 'ge', ' ', ' ', 1, NL, X, 1, XX, $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) IF( NL.GT.1 )THEN X( NL/2 ) = ZERO XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO END IF * DO 70 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*ML * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the vector Y. * TRANSL = ZERO CALL CMAKE( 'ge', ' ', ' ', 1, ML, Y, 1, $ YY, ABS( INCY ), 0, ML - 1, $ RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * TRANSS = TRANS MS = M NS = N KLS = KL KUS = KU ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX BLS = BETA DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ CTRANS, M, N, ALPHA, LDA, INCX, BETA, $ INCY IF( REWI ) $ REWIND NTRA CALL CCGEMV( IORDER, TRANS, M, N, $ ALPHA, AA, LDA, XX, INCX, $ BETA, YY, INCY ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ CTRANS, M, N, KL, KU, ALPHA, LDA, $ INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL CCGBMV( IORDER, TRANS, M, N, KL, $ KU, ALPHA, AA, LDA, XX, $ INCX, BETA, YY, INCY ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 130 END IF * * See what data changed inside subroutines. * * IF(TRANS .NE. 'C' .OR. (INCX .GT. 0 .AND. INCY .GT. 
0)) THEN ISAME( 1 ) = TRANS.EQ.TRANSS ISAME( 2 ) = MS.EQ.M ISAME( 3 ) = NS.EQ.N IF( FULL )THEN ISAME( 4 ) = ALS.EQ.ALPHA ISAME( 5 ) = LCE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA ISAME( 7 ) = LCE( XS, XX, LX ) ISAME( 8 ) = INCXS.EQ.INCX ISAME( 9 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 10 ) = LCE( YS, YY, LY ) ELSE ISAME( 10 ) = LCERES( 'ge', ' ', 1, $ ML, YS, YY, $ ABS( INCY ) ) END IF ISAME( 11 ) = INCYS.EQ.INCY ELSE IF( BANDED )THEN ISAME( 4 ) = KLS.EQ.KL ISAME( 5 ) = KUS.EQ.KU ISAME( 6 ) = ALS.EQ.ALPHA ISAME( 7 ) = LCE( AS, AA, LAA ) ISAME( 8 ) = LDAS.EQ.LDA ISAME( 9 ) = LCE( XS, XX, LX ) ISAME( 10 ) = INCXS.EQ.INCX ISAME( 11 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 12 ) = LCE( YS, YY, LY ) ELSE ISAME( 12 ) = LCERES( 'ge', ' ', 1, $ ML, YS, YY, $ ABS( INCY ) ) END IF ISAME( 13 ) = INCYS.EQ.INCY END IF * * If data was incorrectly changed, report * and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 130 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL CMVCH( TRANS, M, N, ALPHA, A, $ NMAX, X, INCX, BETA, Y, $ INCY, YT, G, YY, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 130 ELSE * Avoid repeating tests with M.le.0 or * N.le.0. GO TO 110 END IF * END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 140 * 130 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, CTRANS, M, N, ALPHA, LDA, $ INCX, BETA, INCY ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, CTRANS, M, N, KL, KU, $ ALPHA, LDA, INCX, BETA, INCY END IF * 140 CONTINUE RETURN * 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 4( I3, ',' ), '(', $ F4.1, ',', F4.1, '), A,',/ 10x, I3, ', X,', I2, ',(', $ F4.1, ',', F4.1, '), Y,', I2, ') .' ) 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), '(', $ F4.1, ',', F4.1, '), A,',/ 10x, I3, ', X,', I2, ',(', $ F4.1, ',', F4.1, '), Y,', I2, ') .' ) 9993 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK1. * END SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, $ XS, Y, YY, YS, YT, G, IORDER ) * * Tests CHEMV, CHBMV and CHPMV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX ZERO, HALF PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, $ NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. 
COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. COMPLEX ALPHA, ALS, BETA, BLS, TRANSL REAL ERR, ERRMAX INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, $ N, NARGS, NC, NK, NS LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME CHARACTER*1 UPLO, UPLOS CHARACTER*14 CUPLO CHARACTER*2 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CCHBMV, CCHEMV, CCHPMV, CMAKE, CMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 9: 9 ).EQ.'e' BANDED = SNAME( 9: 9 ).EQ.'b' PACKED = SNAME( 9: 9 ).EQ.'p' * Define the number of arguments. IF( FULL )THEN NARGS = 10 ELSE IF( BANDED )THEN NARGS = 11 ELSE IF( PACKED )THEN NARGS = 9 END IF * NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 110 IN = 1, NIDIM N = IDIM( IN ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IK = 1, NK IF( BANDED )THEN K = KB( IK ) ELSE K = N - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = K + 1 ELSE LDA = N END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF NULL = N.LE.0 * DO 90 IC = 1, 2 UPLO = ICH( IC: IC ) IF (UPLO.EQ.'U')THEN CUPLO = ' CblasUpper' ELSE CUPLO = ' CblasLower' END IF * * Generate the matrix A. * TRANSL = ZERO CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, AA, $ LDA, K, K, RESET, TRANSL ) * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL CMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 70 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the vector Y. * TRANSL = ZERO CALL CMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, $ TRANSL ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * UPLOS = UPLO NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX BLS = BETA DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ CUPLO, N, ALPHA, LDA, INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL CCHEMV( IORDER, UPLO, N, ALPHA, AA, $ LDA, XX, INCX, BETA, YY, $ INCY ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ CUPLO, N, K, ALPHA, LDA, INCX, BETA, $ INCY IF( REWI ) $ REWIND NTRA CALL CCHBMV( IORDER, UPLO, N, K, ALPHA, $ AA, LDA, XX, INCX, BETA, $ YY, INCY ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ CUPLO, N, ALPHA, INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL CCHPMV( IORDER, UPLO, N, ALPHA, AA, $ XX, INCX, BETA, YY, INCY ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. 
GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N IF( FULL )THEN ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LCE( AS, AA, LAA ) ISAME( 5 ) = LDAS.EQ.LDA ISAME( 6 ) = LCE( XS, XX, LX ) ISAME( 7 ) = INCXS.EQ.INCX ISAME( 8 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 9 ) = LCE( YS, YY, LY ) ELSE ISAME( 9 ) = LCERES( 'ge', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 10 ) = INCYS.EQ.INCY ELSE IF( BANDED )THEN ISAME( 3 ) = KS.EQ.K ISAME( 4 ) = ALS.EQ.ALPHA ISAME( 5 ) = LCE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA ISAME( 7 ) = LCE( XS, XX, LX ) ISAME( 8 ) = INCXS.EQ.INCX ISAME( 9 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 10 ) = LCE( YS, YY, LY ) ELSE ISAME( 10 ) = LCERES( 'ge', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 11 ) = INCYS.EQ.INCY ELSE IF( PACKED )THEN ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LCE( AS, AA, LAA ) ISAME( 5 ) = LCE( XS, XX, LX ) ISAME( 6 ) = INCXS.EQ.INCX ISAME( 7 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 8 ) = LCE( YS, YY, LY ) ELSE ISAME( 8 ) = LCERES( 'ge', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 9 ) = INCYS.EQ.INCY END IF * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL CMVCH( 'N', N, N, ALPHA, A, NMAX, X, $ INCX, BETA, Y, INCY, YT, G, $ YY, EPS, ERR, FATAL, NOUT, $ .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 120 ELSE * Avoid repeating tests with N.le.0 GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, LDA, INCX, $ BETA, INCY ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, K, ALPHA, LDA, $ INCX, BETA, INCY ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, N, ALPHA, INCX, $ BETA, INCY END IF * 130 CONTINUE RETURN * 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', $ F4.1, '), AP, X,',/ 10x, I2, ',(', F4.1, ',', F4.1, $ '), Y,', I2, ') .' ) 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), '(', $ F4.1, ',', F4.1, '), A,', I3, ', X,',/ 10x, I2, ',(', $ F4.1, ',', F4.1, '), Y,', I2, ') .' ) 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', $ F4.1, '), A,', I3, ', X,',/ 10x, I2, ',(', F4.1, ',', $ F4.1, '), ', 'Y,', I2, ') .' ) 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK2. * END SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z, IORDER ) * * Tests CTRMV, CTBMV, CTPMV, CTRSV, CTBSV and CTPSV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. 
* Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX ZERO, HALF, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), $ ONE = ( 1.0, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA, $ IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), $ XT( NMAX ), XX( NMAX*INCMAX ), Z( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. COMPLEX TRANSL REAL ERR, ERRMAX INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS CHARACTER*14 CUPLO,CTRANS,CDIAG CHARACTER*2 ICHD, ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CMAKE, CMVCH, CCTBMV, CCTBSV, CCTPMV, $ CCTPSV, CCTRMV, CCTRSV * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ * .. Executable Statements .. FULL = SNAME( 9: 9 ).EQ.'r' BANDED = SNAME( 9: 9 ).EQ.'b' PACKED = SNAME( 9: 9 ).EQ.'p' * Define the number of arguments. IF( FULL )THEN NARGS = 8 ELSE IF( BANDED )THEN NARGS = 9 ELSE IF( PACKED )THEN NARGS = 7 END IF * NC = 0 RESET = .TRUE. ERRMAX = RZERO * Set up zero vector for CMVCH. DO 10 I = 1, NMAX Z( I ) = ZERO 10 CONTINUE * DO 110 IN = 1, NIDIM N = IDIM( IN ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IK = 1, NK IF( BANDED )THEN K = KB( IK ) ELSE K = N - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = K + 1 ELSE LDA = N END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF NULL = N.LE.0 * DO 90 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) IF (UPLO.EQ.'U')THEN CUPLO = ' CblasUpper' ELSE CUPLO = ' CblasLower' END IF * DO 80 ICT = 1, 3 TRANS = ICHT( ICT: ICT ) IF (TRANS.EQ.'N')THEN CTRANS = ' CblasNoTrans' ELSE IF (TRANS.EQ.'T')THEN CTRANS = ' CblasTrans' ELSE CTRANS = 'CblasConjTrans' END IF * DO 70 ICD = 1, 2 DIAG = ICHD( ICD: ICD ) IF (DIAG.EQ.'N')THEN CDIAG = ' CblasNonUnit' ELSE CDIAG = ' CblasUnit' END IF * * Generate the matrix A. * TRANSL = ZERO CALL CMAKE( SNAME( 8: 9 ), UPLO, DIAG, N, N, A, $ NMAX, AA, LDA, K, K, RESET, TRANSL ) * DO 60 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL CMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, $ ABS( INCX ), 0, N - 1, RESET, $ TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS DIAGS = DIAG NS = N KS = K DO 20 I = 1, LAA AS( I ) = AA( I ) 20 CONTINUE LDAS = LDA DO 30 I = 1, LX XS( I ) = XX( I ) 30 CONTINUE INCXS = INCX * * Call the subroutine. 
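*     Characters 10:11 of the CBLAS name select the operation: the
*     'mv' routines (CTRMV/CTBMV/CTPMV) multiply X by op(A), while the
*     'sv' routines (CTRSV/CTBSV/CTPSV) solve a triangular system, so
*     their result is verified below by multiplying the computed
*     solution back by A and comparing with the original vector.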
* IF( SNAME( 10: 11 ).EQ.'mv' )THEN IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, LDA, INCX IF( REWI ) $ REWIND NTRA CALL CCTRMV( IORDER, UPLO, TRANS, DIAG, $ N, AA, LDA, XX, INCX ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX IF( REWI ) $ REWIND NTRA CALL CCTBMV( IORDER, UPLO, TRANS, DIAG, $ N, K, AA, LDA, XX, INCX ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, INCX IF( REWI ) $ REWIND NTRA CALL CCTPMV( IORDER, UPLO, TRANS, DIAG, $ N, AA, XX, INCX ) END IF ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, LDA, INCX IF( REWI ) $ REWIND NTRA CALL CCTRSV( IORDER, UPLO, TRANS, DIAG, $ N, AA, LDA, XX, INCX ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX IF( REWI ) $ REWIND NTRA CALL CCTBSV( IORDER, UPLO, TRANS, DIAG, $ N, K, AA, LDA, XX, INCX ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, INCX IF( REWI ) $ REWIND NTRA CALL CCTPSV( IORDER, UPLO, TRANS, DIAG, $ N, AA, XX, INCX ) END IF END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = TRANS.EQ.TRANSS ISAME( 3 ) = DIAG.EQ.DIAGS ISAME( 4 ) = NS.EQ.N IF( FULL )THEN ISAME( 5 ) = LCE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 7 ) = LCE( XS, XX, LX ) ELSE ISAME( 7 ) = LCERES( 'ge', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 8 ) = INCXS.EQ.INCX ELSE IF( BANDED )THEN ISAME( 5 ) = KS.EQ.K ISAME( 6 ) = LCE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 8 ) = LCE( XS, XX, LX ) ELSE ISAME( 8 ) = LCERES( 'ge', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 9 ) = INCXS.EQ.INCX ELSE IF( PACKED )THEN ISAME( 5 ) = LCE( AS, AA, LAA ) IF( NULL )THEN ISAME( 6 ) = LCE( XS, XX, LX ) ELSE ISAME( 6 ) = LCERES( 'ge', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 7 ) = INCXS.EQ.INCX END IF * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN IF( SNAME( 10: 11 ).EQ.'mv' )THEN * * Check the result. * CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, $ INCX, ZERO, Z, INCX, XT, G, $ XX, EPS, ERR, FATAL, NOUT, $ .TRUE. ) ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN * * Compute approximation to original vector. * DO 50 I = 1, N Z( I ) = XX( 1 + ( I - 1 )* $ ABS( INCX ) ) XX( 1 + ( I - 1 )*ABS( INCX ) ) $ = X( I ) 50 CONTINUE CALL CMVCH( TRANS, N, N, ONE, A, NMAX, Z, $ INCX, ZERO, X, INCX, XT, G, $ XX, EPS, ERR, FATAL, NOUT, $ .FALSE. ) END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 120 ELSE * Avoid repeating tests with N.le.0. GO TO 110 END IF * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, $ LDA, INCX ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, K, $ LDA, INCX ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, $ INCX END IF * 130 CONTINUE RETURN * 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT(1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, I3, ', AP, ', $ 'X,', I2, ') .' ) 9994 FORMAT(1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, 2( I3, ',' ), $ ' A,', I3, ', X,', I2, ') .' ) 9993 FORMAT( 1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, I3, ', A,', $ I3, ', X,', I2, ') .' ) 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK3. * END SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z, IORDER ) * * Tests CGERC and CGERU. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX ZERO, HALF, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), $ ONE = ( 1.0, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, $ IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. COMPLEX ALPHA, ALS, TRANSL REAL ERR, ERRMAX INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, $ NC, ND, NS LOGICAL CONJ, NULL, RESET, SAME * .. Local Arrays .. COMPLEX W( 1 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CCGERC, CCGERU, CMAKE, CMVCH * .. Intrinsic Functions .. INTRINSIC ABS, CONJG, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Executable Statements .. CONJ = SNAME( 11: 11 ).EQ.'c' * Define the number of arguments. NARGS = 9 * NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 120 IN = 1, NIDIM N = IDIM( IN ) ND = N/2 + 1 * DO 110 IM = 1, 2 IF( IM.EQ.1 ) $ M = MAX( N - ND, 0 ) IF( IM.EQ.2 ) $ M = MIN( N + ND, NMAX ) * * Set LDA to 1 more than minimum value if room. LDA = M IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 110 LAA = LDA*N NULL = N.LE.0.OR.M.LE.0 * DO 100 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*M * * Generate the vector X. 
* TRANSL = HALF CALL CMAKE( 'ge', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), $ 0, M - 1, RESET, TRANSL ) IF( M.GT.1 )THEN X( M/2 ) = ZERO XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO END IF * DO 90 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * * Generate the vector Y. * TRANSL = ZERO CALL CMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN Y( N/2 ) = ZERO YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO END IF * DO 80 IA = 1, NALF ALPHA = ALF( IA ) * * Generate the matrix A. * TRANSL = ZERO CALL CMAKE(SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * MS = M NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, $ ALPHA, INCX, INCY, LDA IF( CONJ )THEN IF( REWI ) $ REWIND NTRA CALL CCGERC( IORDER, M, N, ALPHA, XX, INCX, $ YY, INCY, AA, LDA ) ELSE IF( REWI ) $ REWIND NTRA CALL CCGERU( IORDER, M, N, ALPHA, XX, INCX, $ YY, INCY, AA, LDA ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 140 END IF * * See what data changed inside subroutine. * ISAME( 1 ) = MS.EQ.M ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LCE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX ISAME( 6 ) = LCE( YS, YY, LY ) ISAME( 7 ) = INCYS.EQ.INCY IF( NULL )THEN ISAME( 8 ) = LCE( AS, AA, LAA ) ELSE ISAME( 8 ) = LCERES( 'ge', ' ', M, N, AS, AA, $ LDA ) END IF ISAME( 9 ) = LDAS.EQ.LDA * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 140 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( INCX.GT.0 )THEN DO 50 I = 1, M Z( I ) = X( I ) 50 CONTINUE ELSE DO 60 I = 1, M Z( I ) = X( M - I + 1 ) 60 CONTINUE END IF DO 70 J = 1, N IF( INCY.GT.0 )THEN W( 1 ) = Y( J ) ELSE W( 1 ) = Y( N - J + 1 ) END IF IF( CONJ ) $ W( 1 ) = CONJG( W( 1 ) ) CALL CMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, $ ONE, A( 1, J ), 1, YT, G, $ AA( 1 + ( J - 1 )*LDA ), EPS, $ ERR, FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 130 70 CONTINUE ELSE * Avoid repeating tests with M.le.0 or N.le.0. GO TO 110 END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 150 * 130 CONTINUE WRITE( NOUT, FMT = 9995 )J * 140 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA * 150 CONTINUE RETURN * 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT(1X, I6, ': ',A12, '(', 2( I3, ',' ), '(', F4.1, ',', F4.1, $ '), X,', I2, ', Y,', I2, ', A,', I3, ') .' 
) 9993 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK4. * END SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z, IORDER ) * * Tests CHER and CHPR. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX ZERO, HALF, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), $ ONE = ( 1.0, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, $ IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. COMPLEX ALPHA, TRANSL REAL ERR, ERRMAX, RALPHA, RALS INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER CHARACTER*1 UPLO, UPLOS CHARACTER*14 CUPLO CHARACTER*2 ICH * .. Local Arrays .. COMPLEX W( 1 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CCHER, CCHPR, CMAKE, CMVCH * .. Intrinsic Functions .. INTRINSIC ABS, CMPLX, CONJG, MAX, REAL * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 9: 9 ).EQ.'e' PACKED = SNAME( 9: 9 ).EQ.'p' * Define the number of arguments. IF( FULL )THEN NARGS = 7 ELSE IF( PACKED )THEN NARGS = 6 END IF * NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDA to 1 more than minimum value if room. LDA = N IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF * DO 90 IC = 1, 2 UPLO = ICH( IC: IC ) IF (UPLO.EQ.'U')THEN CUPLO = ' CblasUpper' ELSE CUPLO = ' CblasLower' END IF UPPER = UPLO.EQ.'U' * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL CMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), $ 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 70 IA = 1, NALF RALPHA = REAL( ALF( IA ) ) ALPHA = CMPLX( RALPHA, RZERO ) NULL = N.LE.0.OR.RALPHA.EQ.RZERO * * Generate the matrix A. * TRANSL = ZERO CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO NS = N RALS = RALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, $ RALPHA, INCX, LDA IF( REWI ) $ REWIND NTRA CALL CCHER( IORDER, UPLO, N, RALPHA, XX, $ INCX, AA, LDA ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, $ RALPHA, INCX IF( REWI ) $ REWIND NTRA CALL CCHPR( IORDER, UPLO, N, RALPHA, $ XX, INCX, AA ) END IF * * Check if error-exit was taken incorrectly. 
* IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = RALS.EQ.RALPHA ISAME( 4 ) = LCE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX IF( NULL )THEN ISAME( 6 ) = LCE( AS, AA, LAA ) ELSE ISAME( 6 ) = LCERES( SNAME( 8: 9 ), UPLO, N, N, AS, $ AA, LDA ) END IF IF( .NOT.PACKED )THEN ISAME( 7 ) = LDAS.EQ.LDA END IF * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 30 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 30 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( INCX.GT.0 )THEN DO 40 I = 1, N Z( I ) = X( I ) 40 CONTINUE ELSE DO 50 I = 1, N Z( I ) = X( N - I + 1 ) 50 CONTINUE END IF JA = 1 DO 60 J = 1, N W( 1 ) = CONJG( Z( J ) ) IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF CALL CMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, $ 1, ONE, A( JJ, J ), 1, YT, G, $ AA( JA ), EPS, ERR, FATAL, NOUT, $ .TRUE. ) IF( FULL )THEN IF( UPPER )THEN JA = JA + LDA ELSE JA = JA + LDA + 1 END IF ELSE JA = JA + LJ END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 110 60 CONTINUE ELSE * Avoid repeating tests if N.le.0. IF( N.LE.0 ) $ GO TO 100 END IF * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 110 CONTINUE WRITE( NOUT, FMT = 9995 )J * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, RALPHA, INCX, LDA ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, RALPHA, INCX END IF * 130 CONTINUE RETURN * 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', $ I2, ', AP) .' ) 9993 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', $ I2, ', A,', I3, ') .' ) 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK5. * END SUBROUTINE CCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z, IORDER ) * * Tests CHER2 and CHPR2. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX ZERO, HALF, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), $ ONE = ( 1.0, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, $ IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. 
COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) REAL G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. COMPLEX ALPHA, ALS, TRANSL REAL ERR, ERRMAX INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, $ NARGS, NC, NS LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER CHARACTER*1 UPLO, UPLOS CHARACTER*14 CUPLO CHARACTER*2 ICH * .. Local Arrays .. COMPLEX W( 2 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CCHER2, CCHPR2, CMAKE, CMVCH * .. Intrinsic Functions .. INTRINSIC ABS, CONJG, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 9: 9 ).EQ.'e' PACKED = SNAME( 9: 9 ).EQ.'p' * Define the number of arguments. IF( FULL )THEN NARGS = 9 ELSE IF( PACKED )THEN NARGS = 8 END IF * NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 140 IN = 1, NIDIM N = IDIM( IN ) * Set LDA to 1 more than minimum value if room. LDA = N IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 140 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF * DO 130 IC = 1, 2 UPLO = ICH( IC: IC ) IF (UPLO.EQ.'U')THEN CUPLO = ' CblasUpper' ELSE CUPLO = ' CblasLower' END IF UPPER = UPLO.EQ.'U' * DO 120 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL CMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), $ 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 110 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * * Generate the vector Y. * TRANSL = ZERO CALL CMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN Y( N/2 ) = ZERO YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO END IF * DO 100 IA = 1, NALF ALPHA = ALF( IA ) NULL = N.LE.0.OR.ALPHA.EQ.ZERO * * Generate the matrix A. * TRANSL = ZERO CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, $ NMAX, AA, LDA, N - 1, N - 1, RESET, $ TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, $ ALPHA, INCX, INCY, LDA IF( REWI ) $ REWIND NTRA CALL CCHER2( IORDER, UPLO, N, ALPHA, XX, INCX, $ YY, INCY, AA, LDA ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, $ ALPHA, INCX, INCY IF( REWI ) $ REWIND NTRA CALL CCHPR2( IORDER, UPLO, N, ALPHA, XX, INCX, $ YY, INCY, AA ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 160 END IF * * See what data changed inside subroutines. 
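*     The ISAME flags record, argument by argument, whether an input
*     was left unchanged by the call.  Only the matrix AA may change;
*     its updated entries are verified column by column further below.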
* ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LCE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX ISAME( 6 ) = LCE( YS, YY, LY ) ISAME( 7 ) = INCYS.EQ.INCY IF( NULL )THEN ISAME( 8 ) = LCE( AS, AA, LAA ) ELSE ISAME( 8 ) = LCERES( SNAME( 8: 9 ), UPLO, N, N, $ AS, AA, LDA ) END IF IF( .NOT.PACKED )THEN ISAME( 9 ) = LDAS.EQ.LDA END IF * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 160 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( INCX.GT.0 )THEN DO 50 I = 1, N Z( I, 1 ) = X( I ) 50 CONTINUE ELSE DO 60 I = 1, N Z( I, 1 ) = X( N - I + 1 ) 60 CONTINUE END IF IF( INCY.GT.0 )THEN DO 70 I = 1, N Z( I, 2 ) = Y( I ) 70 CONTINUE ELSE DO 80 I = 1, N Z( I, 2 ) = Y( N - I + 1 ) 80 CONTINUE END IF JA = 1 DO 90 J = 1, N W( 1 ) = ALPHA*CONJG( Z( J, 2 ) ) W( 2 ) = CONJG( ALPHA )*CONJG( Z( J, 1 ) ) IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF CALL CMVCH( 'N', LJ, 2, ONE, Z( JJ, 1 ), $ NMAX, W, 1, ONE, A( JJ, J ), 1, $ YT, G, AA( JA ), EPS, ERR, FATAL, $ NOUT, .TRUE. ) IF( FULL )THEN IF( UPPER )THEN JA = JA + LDA ELSE JA = JA + LDA + 1 END IF ELSE JA = JA + LJ END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 150 90 CONTINUE ELSE * Avoid repeating tests with N.le.0. IF( N.LE.0 ) $ GO TO 140 END IF * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * 140 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 170 * 150 CONTINUE WRITE( NOUT, FMT = 9995 )J * 160 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, $ INCY, LDA ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX, INCY END IF * 170 CONTINUE RETURN * 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', $ F4.1, '), X,', I2, ', Y,', I2, ', AP) .' ) 9993 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', $ F4.1, '), X,', I2, ', Y,', I2, ', A,', I3, ') .' ) 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK6. * END SUBROUTINE CMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) * * Checks the results of the computational tests. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0, 0.0 ) ) REAL RZERO, RONE PARAMETER ( RZERO = 0.0, RONE = 1.0 ) * .. Scalar Arguments .. COMPLEX ALPHA, BETA REAL EPS, ERR INTEGER INCX, INCY, M, N, NMAX, NOUT LOGICAL FATAL, MV CHARACTER*1 TRANS * .. Array Arguments .. COMPLEX A( NMAX, * ), X( * ), Y( * ), YT( * ), YY( * ) REAL G( * ) * .. Local Scalars .. 
COMPLEX C REAL ERRI INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL LOGICAL CTRAN, TRAN * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CONJG, MAX, REAL, SQRT * .. Statement Functions .. REAL ABS1 * .. Statement Function definitions .. ABS1( C ) = ABS( REAL( C ) ) + ABS( AIMAG( C ) ) * .. Executable Statements .. TRAN = TRANS.EQ.'T' CTRAN = TRANS.EQ.'C' IF( TRAN.OR.CTRAN )THEN ML = N NL = M ELSE ML = M NL = N END IF IF( INCX.LT.0 )THEN KX = NL INCXL = -1 ELSE KX = 1 INCXL = 1 END IF IF( INCY.LT.0 )THEN KY = ML INCYL = -1 ELSE KY = 1 INCYL = 1 END IF * * Compute expected result in YT using data in A, X and Y. * Compute gauges in G. * IY = KY DO 40 I = 1, ML YT( IY ) = ZERO G( IY ) = RZERO JX = KX IF( TRAN )THEN DO 10 J = 1, NL YT( IY ) = YT( IY ) + A( J, I )*X( JX ) G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) JX = JX + INCXL 10 CONTINUE ELSE IF( CTRAN )THEN DO 20 J = 1, NL YT( IY ) = YT( IY ) + CONJG( A( J, I ) )*X( JX ) G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) JX = JX + INCXL 20 CONTINUE ELSE DO 30 J = 1, NL YT( IY ) = YT( IY ) + A( I, J )*X( JX ) G( IY ) = G( IY ) + ABS1( A( I, J ) )*ABS1( X( JX ) ) JX = JX + INCXL 30 CONTINUE END IF YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) G( IY ) = ABS1( ALPHA )*G( IY ) + ABS1( BETA )*ABS1( Y( IY ) ) IY = IY + INCYL 40 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO DO 50 I = 1, ML ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS IF( G( I ).NE.RZERO ) $ ERRI = ERRI/G( I ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ GO TO 60 50 CONTINUE * If the loop completes, all results are at least half accurate. GO TO 80 * * Report fatal error. * 60 FATAL = .TRUE. WRITE( NOUT, FMT = 9999 ) DO 70 I = 1, ML IF( MV )THEN WRITE( NOUT, FMT = 9998 )I, YT( I ), $ YY( 1 + ( I - 1 )*ABS( INCY ) ) ELSE WRITE( NOUT, FMT = 9998 )I, $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT( I ) END IF 70 CONTINUE * 80 CONTINUE RETURN * 9999 FORMAT(' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', $ 'F ACCURATE *******', /' EXPECTED RE', $ 'SULT COMPUTED RESULT' ) 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) * * End of CMVCH. * END LOGICAL FUNCTION LCE( RI, RJ, LR ) * * Tests if two arrays are identical. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER LR * .. Array Arguments .. COMPLEX RI( * ), RJ( * ) * .. Local Scalars .. INTEGER I * .. Executable Statements .. DO 10 I = 1, LR IF( RI( I ).NE.RJ( I ) ) $ GO TO 20 10 CONTINUE LCE = .TRUE. GO TO 30 20 CONTINUE LCE = .FALSE. 30 RETURN * * End of LCE. * END LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA ) * * Tests if selected elements in two arrays are equal. * * TYPE is 'ge', 'he' or 'hp'. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER LDA, M, N CHARACTER*1 UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX AA( LDA, * ), AS( LDA, * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL UPPER * .. Executable Statements .. 
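*     LCERES looks only at elements the routine under test must not
*     modify: for 'ge' the padding rows M+1..LDA of each column, and
*     for 'he' the entries outside the referenced triangle.  The part
*     of the matrix that does change is checked separately via CMVCH,
*     so equality here shows no out-of-range element was overwritten.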
UPPER = UPLO.EQ.'U' IF( TYPE.EQ.'ge' )THEN DO 20 J = 1, N DO 10 I = M + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 10 CONTINUE 20 CONTINUE ELSE IF( TYPE.EQ.'he' )THEN DO 50 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 30 I = 1, IBEG - 1 IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 30 CONTINUE DO 40 I = IEND + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 40 CONTINUE 50 CONTINUE END IF * 60 CONTINUE LCERES = .TRUE. GO TO 80 70 CONTINUE LCERES = .FALSE. 80 RETURN * * End of LCERES. * END COMPLEX FUNCTION CBEG( RESET ) * * Generates complex numbers as pairs of random numbers uniformly * distributed between -0.5 and 0.5. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. LOGICAL RESET * .. Local Scalars .. INTEGER I, IC, J, MI, MJ * .. Save statement .. SAVE I, IC, J, MI, MJ * .. Intrinsic Functions .. INTRINSIC CMPLX * .. Executable Statements .. IF( RESET )THEN * Initialize local variables. MI = 891 MJ = 457 I = 7 J = 7 IC = 0 RESET = .FALSE. END IF * * The sequence of values of I or J is bounded between 1 and 999. * If initial I or J = 1,2,3,6,7 or 9, the period will be 50. * If initial I or J = 4 or 8, the period will be 25. * If initial I or J = 5, the period will be 10. * IC is used to break up the period by skipping 1 value of I or J * in 6. * IC = IC + 1 10 I = I*MI J = J*MJ I = I - 1000*( I/1000 ) J = J - 1000*( J/1000 ) IF( IC.GE.5 )THEN IC = 0 GO TO 10 END IF CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) RETURN * * End of CBEG. * END REAL FUNCTION SDIFF( X, Y ) * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * * .. Scalar Arguments .. REAL X, Y * .. Executable Statements .. SDIFF = X - Y RETURN * * End of SDIFF. * END SUBROUTINE CMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, $ KU, RESET, TRANSL ) * * Generates values for an M by N matrix A within the bandwidth * defined by KL and KU. * Stores the values in the array AA in the data structure required * by the routine, with unwanted elements set to rogue value. * * TYPE is 'ge', 'gb', 'he', 'hb', 'hp', 'tr', 'tb' OR 'tp'. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) COMPLEX ROGUE PARAMETER ( ROGUE = ( -1.0E10, 1.0E10 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) REAL RROGUE PARAMETER ( RROGUE = -1.0E10 ) * .. Scalar Arguments .. COMPLEX TRANSL INTEGER KL, KU, LDA, M, N, NMAX LOGICAL RESET CHARACTER*1 DIAG, UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX A( NMAX, * ), AA( * ) * .. Local Scalars .. INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, JJ, KK LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER * .. External Functions .. COMPLEX CBEG EXTERNAL CBEG * .. Intrinsic Functions .. INTRINSIC CMPLX, CONJG, MAX, MIN, REAL * .. Executable Statements .. GEN = TYPE( 1: 1 ).EQ.'g' SYM = TYPE( 1: 1 ).EQ.'h' TRI = TYPE( 1: 1 ).EQ.'t' UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' UNIT = TRI.AND.DIAG.EQ.'U' * * Generate data in array A. * DO 20 J = 1, N DO 10 I = 1, M IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) $ THEN IF( ( I.LE.J.AND.J - I.LE.KU ).OR. 
$ ( I.GE.J.AND.I - J.LE.KL ) )THEN A( I, J ) = CBEG( RESET ) + TRANSL ELSE A( I, J ) = ZERO END IF IF( I.NE.J )THEN IF( SYM )THEN A( J, I ) = CONJG( A( I, J ) ) ELSE IF( TRI )THEN A( J, I ) = ZERO END IF END IF END IF 10 CONTINUE IF( SYM ) $ A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO ) IF( TRI ) $ A( J, J ) = A( J, J ) + ONE IF( UNIT ) $ A( J, J ) = ONE 20 CONTINUE * * Store elements in array AS in data structure required by routine. * IF( TYPE.EQ.'ge' )THEN DO 50 J = 1, N DO 30 I = 1, M AA( I + ( J - 1 )*LDA ) = A( I, J ) 30 CONTINUE DO 40 I = M + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 40 CONTINUE 50 CONTINUE ELSE IF( TYPE.EQ.'gb' )THEN DO 90 J = 1, N DO 60 I1 = 1, KU + 1 - J AA( I1 + ( J - 1 )*LDA ) = ROGUE 60 CONTINUE DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) 70 CONTINUE DO 80 I3 = I2, LDA AA( I3 + ( J - 1 )*LDA ) = ROGUE 80 CONTINUE 90 CONTINUE ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'tr' )THEN DO 130 J = 1, N IF( UPPER )THEN IBEG = 1 IF( UNIT )THEN IEND = J - 1 ELSE IEND = J END IF ELSE IF( UNIT )THEN IBEG = J + 1 ELSE IBEG = J END IF IEND = N END IF DO 100 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 100 CONTINUE DO 110 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I, J ) 110 CONTINUE DO 120 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 120 CONTINUE IF( SYM )THEN JJ = J + ( J - 1 )*LDA AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) END IF 130 CONTINUE ELSE IF( TYPE.EQ.'hb'.OR.TYPE.EQ.'tb' )THEN DO 170 J = 1, N IF( UPPER )THEN KK = KL + 1 IBEG = MAX( 1, KL + 2 - J ) IF( UNIT )THEN IEND = KL ELSE IEND = KL + 1 END IF ELSE KK = 1 IF( UNIT )THEN IBEG = 2 ELSE IBEG = 1 END IF IEND = MIN( KL + 1, 1 + M - J ) END IF DO 140 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 140 CONTINUE DO 150 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) 150 CONTINUE DO 160 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 160 CONTINUE IF( SYM )THEN JJ = KK + ( J - 1 )*LDA AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) END IF 170 CONTINUE ELSE IF( TYPE.EQ.'hp'.OR.TYPE.EQ.'tp' )THEN IOFF = 0 DO 190 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 180 I = IBEG, IEND IOFF = IOFF + 1 AA( IOFF ) = A( I, J ) IF( I.EQ.J )THEN IF( UNIT ) $ AA( IOFF ) = ROGUE IF( SYM ) $ AA( IOFF ) = CMPLX( REAL( AA( IOFF ) ), RROGUE ) END IF 180 CONTINUE 190 CONTINUE END IF RETURN * * End of CMAKE. * END OpenBLAS-0.2.20/ctest/c_cblat3.f000066400000000000000000003037111313527062700161140ustar00rootroot00000000000000 PROGRAM CBLAT3 * * Test program for the COMPLEX Level 3 Blas. * * The program must be driven by a short data file. The first 13 records * of the file are read using list-directed input, the last 9 records * are read using the format ( A12, L2 ). An annotated example of a data * file can be obtained by deleting the first 3 characters from the * following 22 lines: * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO * 6 NUMBER OF VALUES OF N * 0 1 2 3 5 9 VALUES OF N * 3 NUMBER OF VALUES OF ALPHA * (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA * 3 NUMBER OF VALUES OF BETA * (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA * cblas_cgemm T PUT F FOR NO TEST. SAME COLUMNS. * cblas_chemm T PUT F FOR NO TEST. SAME COLUMNS. 
* cblas_csymm T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ctrmm T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ctrsm T PUT F FOR NO TEST. SAME COLUMNS. * cblas_cherk T PUT F FOR NO TEST. SAME COLUMNS. * cblas_csyrk T PUT F FOR NO TEST. SAME COLUMNS. * cblas_cher2k T PUT F FOR NO TEST. SAME COLUMNS. * cblas_csyr2k T PUT F FOR NO TEST. SAME COLUMNS. * * See: * * Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. * A Set of Level 3 Basic Linear Algebra Subprograms. * * Technical Memorandum No.88 (Revision 1), Mathematics and * Computer Science Division, Argonne National Laboratory, 9700 * South Cass Avenue, Argonne, Illinois 60439, US. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. INTEGER NIN, NOUT PARAMETER ( NIN = 5, NOUT = 6 ) INTEGER NSUBS PARAMETER ( NSUBS = 9 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) REAL RZERO, RHALF, RONE PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. REAL EPS, ERR, THRESH INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA, $ LAYOUT LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, $ TSTERR, CORDER, RORDER CHARACTER*1 TRANSA, TRANSB CHARACTER*12 SNAMET CHARACTER*32 SNAPS * .. Local Arrays .. COMPLEX AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), $ ALF( NALMAX ), AS( NMAX*NMAX ), $ BB( NMAX*NMAX ), BET( NBEMAX ), $ BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ W( 2*NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDMAX ) LOGICAL LTEST( NSUBS ) CHARACTER*12 SNAMES( NSUBS ) * .. External Functions .. REAL SDIFF LOGICAL LCE EXTERNAL SDIFF, LCE * .. External Subroutines .. EXTERNAL CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CMMCH * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK CHARACTER*12 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR COMMON /SRNAMC/SRNAMT * .. Data statements .. DATA SNAMES/'cblas_cgemm ', 'cblas_chemm ', $ 'cblas_csymm ', 'cblas_ctrmm ', 'cblas_ctrsm ', $ 'cblas_cherk ', 'cblas_csyrk ', 'cblas_cher2k', $ 'cblas_csyr2k'/ * .. Executable Statements .. * NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. * READ( NIN, FMT = * )SNAPS READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN OPEN( NTRA, FILE = SNAPS ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI REWI = REWI.AND.TRACE * Read the flag that directs stopping on any failure. READ( NIN, FMT = * )SFATAL * Read the flag that indicates whether error exits are to be tested. READ( NIN, FMT = * )TSTERR * Read the flag that indicates whether row-major data layout to be tested. READ( NIN, FMT = * )LAYOUT * Read the threshold value of the test ratio READ( NIN, FMT = * )THRESH * * Read and check the parameter values for the tests. 
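*     The reads below follow the annotated data file shown at the top
*     of this program: each count (NIDIM, NALF, NBET) must lie between
*     1 and its corresponding maximum, and each dimension in IDIM
*     between 0 and NMAX; otherwise control jumps to label 220 and the
*     tests are abandoned with the 'AMEND DATA FILE' message.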
* * Values of N READ( NIN, FMT = * )NIDIM IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN WRITE( NOUT, FMT = 9997 )'N', NIDMAX GO TO 220 END IF READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) DO 10 I = 1, NIDIM IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN WRITE( NOUT, FMT = 9996 )NMAX GO TO 220 END IF 10 CONTINUE * Values of ALPHA READ( NIN, FMT = * )NALF IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX GO TO 220 END IF READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) * Values of BETA READ( NIN, FMT = * )NBET IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX GO TO 220 END IF READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) * * Report values of parameters. * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) IF( .NOT.TSTERR )THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9984 ) END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )THRESH WRITE( NOUT, FMT = * ) RORDER = .FALSE. CORDER = .FALSE. IF (LAYOUT.EQ.2) THEN RORDER = .TRUE. CORDER = .TRUE. WRITE( *, FMT = 10002 ) ELSE IF (LAYOUT.EQ.1) THEN RORDER = .TRUE. WRITE( *, FMT = 10001 ) ELSE IF (LAYOUT.EQ.0) THEN CORDER = .TRUE. WRITE( *, FMT = 10000 ) END IF WRITE( *, FMT = * ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 20 I = 1, NSUBS LTEST( I ) = .FALSE. 20 CONTINUE 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT DO 40 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET STOP 50 LTEST( I ) = LTESTT GO TO 30 * 60 CONTINUE CLOSE ( NIN ) * * Compute EPS (the machine precision). * EPS = RONE 70 CONTINUE IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) $ GO TO 80 EPS = RHALF*EPS GO TO 70 80 CONTINUE EPS = EPS + EPS WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of CMMCH using exact data. * N = MIN( 32, NMAX ) DO 100 J = 1, N DO 90 I = 1, N AB( I, J ) = MAX( I - J + 1, 0 ) 90 CONTINUE AB( J, NMAX + 1 ) = J AB( 1, NMAX + J ) = J C( J, 1 ) = ZERO 100 CONTINUE DO 110 J = 1, N CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 110 CONTINUE * CC holds the exact result. On exit from CMMCH CT holds * the result computed by CMMCH. TRANSA = 'N' TRANSB = 'N' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 AB( 1, NMAX + J ) = N - J + 1 120 CONTINUE DO 130 J = 1, N CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - $ ( ( J + 1 )*J*( J - 1 ) )/3 130 CONTINUE TRANSA = 'C' TRANSB = 'N' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. 
) SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF * * Test each subroutine in turn. * DO 200 ISNUM = 1, NSUBS WRITE( NOUT, FMT = * ) IF( .NOT.LTEST( ISNUM ) )THEN * Subprogram is not to be tested. WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) ELSE SRNAMT = SNAMES( ISNUM ) * Test error exits. IF( TSTERR )THEN CALL CC3CHKE( SNAMES( ISNUM ) ) WRITE( NOUT, FMT = * ) END IF * Test computations. INFOT = 0 OK = .TRUE. FATAL = .FALSE. GO TO ( 140, 150, 150, 160, 160, 170, 170, $ 180, 180 )ISNUM * Test CGEMM, 01. 140 IF (CORDER) THEN CALL CCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 0 ) END IF IF (RORDER) THEN CALL CCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 1 ) END IF GO TO 190 * Test CHEMM, 02, CSYMM, 03. 150 IF (CORDER) THEN CALL CCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 0 ) END IF IF (RORDER) THEN CALL CCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 1 ) END IF GO TO 190 * Test CTRMM, 04, CTRSM, 05. 160 IF (CORDER) THEN CALL CCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, $ 0 ) END IF IF (RORDER) THEN CALL CCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, $ 1 ) END IF GO TO 190 * Test CHERK, 06, CSYRK, 07. 170 IF (CORDER) THEN CALL CCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 0 ) END IF IF (RORDER) THEN CALL CCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 1 ) END IF GO TO 190 * Test CHER2K, 08, CSYR2K, 09. 
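*     As for the other checks, CCHK5 is run up to twice below: with a
*     final argument of 0 (column-major) when CORDER is set and of 1
*     (row-major) when RORDER is set, as selected by the LAYOUT value
*     read from the data file.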
180 IF (CORDER) THEN CALL CCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, $ 0 ) END IF IF (RORDER) THEN CALL CCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, $ 1 ) END IF GO TO 190 * 190 IF( FATAL.AND.SFATAL ) $ GO TO 210 END IF 200 CONTINUE WRITE( NOUT, FMT = 9986 ) GO TO 230 * 210 CONTINUE WRITE( NOUT, FMT = 9985 ) GO TO 230 * 220 CONTINUE WRITE( NOUT, FMT = 9991 ) * 230 CONTINUE IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) STOP * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) 10000 FORMAT(' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', $ 'S THAN', F8.2 ) 9998 FORMAT(' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) 9997 FORMAT(' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', $ 'THAN ', I2 ) 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) 9995 FORMAT(' TESTS OF THE COMPLEX LEVEL 3 BLAS', //' THE F', $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) 9994 FORMAT( ' FOR N ', 9I6 ) 9993 FORMAT( ' FOR ALPHA ', $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) 9992 FORMAT( ' FOR BETA ', $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', $ /' ******* TESTS ABANDONED *******' ) 9990 FORMAT(' SUBPROGRAM NAME ', A12,' NOT RECOGNIZED', /' ******* T', $ 'ESTS ABANDONED *******' ) 9989 FORMAT(' ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', $ 'ATED WRONGLY.', /' CMMCH WAS CALLED WITH TRANSA = ', A1, $ 'AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', $ ' ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', $ '*******' ) 9988 FORMAT( A12,L2 ) 9987 FORMAT( 1X, A12,' WAS NOT TESTED' ) 9986 FORMAT( /' END OF TESTS' ) 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) * * End of CBLAT3. * END SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, $ IORDER ) * * Tests CGEMM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX ALPHA, ALS, BETA, BLS REAL ERR, ERRMAX INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, $ MA, MB, MS, N, NA, NARGS, NB, NC, NS LOGICAL NULL, RESET, SAME, TRANA, TRANB CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB CHARACTER*3 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. 
LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CCGEMM, CMAKE, CMMCH * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'NTC'/ * .. Executable Statements .. * NARGS = 13 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 110 IM = 1, NIDIM M = IDIM( IM ) * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICA = 1, 3 TRANSA = ICH( ICA: ICA ) TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' * IF( TRANA )THEN MA = K NA = M ELSE MA = M NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. * CALL CMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICB = 1, 3 TRANSB = ICH( ICB: ICB ) TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' * IF( TRANB )THEN MB = N NB = K ELSE MB = K NB = N END IF * Set LDB to 1 more than minimum value if room. LDB = MB IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 70 LBB = LDB*NB * * Generate the matrix B. * CALL CMAKE( 'ge', ' ', ' ', MB, NB, B, NMAX, BB, $ LDB, RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL CMAKE( 'ge', ' ', ' ', M, N, C, NMAX, $ CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * TRANAS = TRANSA TRANBS = TRANSB MS = M NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ CALL CPRCN1(NTRA, NC, SNAME, IORDER, $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, $ LDB, BETA, LDC) IF( REWI ) $ REWIND NTRA CALL CCGEMM( IORDER, TRANSA, TRANSB, M, N, $ K, ALPHA, AA, LDA, BB, LDB, $ BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = TRANSA.EQ.TRANAS ISAME( 2 ) = TRANSB.EQ.TRANBS ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = KS.EQ.K ISAME( 6 ) = ALS.EQ.ALPHA ISAME( 7 ) = LCE( AS, AA, LAA ) ISAME( 8 ) = LDAS.EQ.LDA ISAME( 9 ) = LCE( BS, BB, LBB ) ISAME( 10 ) = LDBS.EQ.LDB ISAME( 11 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 12 ) = LCE( CS, CC, LCC ) ELSE ISAME( 12 ) = LCERES( 'ge', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 13 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report * and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL CMMCH( TRANSA, TRANSB, M, N, K, $ ALPHA, A, NMAX, B, NMAX, BETA, $ C, NMAX, CT, G, CC, LDC, EPS, $ ERR, FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 120 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. 
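*     ERRMAX is the largest test ratio returned by CMMCH, that is
*        abs( expected - computed ) / ( EPS * gauge ),
*     taken over all results; the run passes when it stays below
*     THRESH (16.0 in the sample data file).  CMMCH itself reports a
*     fatal error as soon as a ratio reaches 1/SQRT( EPS ), i.e. the
*     result is less than half accurate.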
* IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME CALL CPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) * 130 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK1. * END * SUBROUTINE CPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N, $ K, ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC COMPLEX ALPHA, BETA CHARACTER*1 TRANSA, TRANSB CHARACTER*12 SNAME CHARACTER*14 CRC, CTA,CTB IF (TRANSA.EQ.'N')THEN CTA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CTA = ' CblasTrans' ELSE CTA = 'CblasConjTrans' END IF IF (TRANSB.EQ.'N')THEN CTB = ' CblasNoTrans' ELSE IF (TRANSB.EQ.'T')THEN CTB = ' CblasTrans' ELSE CTB = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') 9994 FORMAT( 10X, 3( I3, ',' ) ,' (', F4.1,',',F4.1,') , A,', $ I3, ', B,', I3, ', (', F4.1,',',F4.1,') , C,', I3, ').' ) END * SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, $ IORDER ) * * Tests CHEMM and CSYMM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. 
COMPLEX ALPHA, ALS, BETA, BLS REAL ERR, ERRMAX INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, $ NARGS, NC, NS LOGICAL CONJ, LEFT, NULL, RESET, SAME CHARACTER*1 SIDE, SIDES, UPLO, UPLOS CHARACTER*2 ICHS, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CCHEMM, CMAKE, CMMCH, CCSYMM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHS/'LR'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 8: 9 ).EQ.'he' * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 100 IM = 1, NIDIM M = IDIM( IM ) * DO 90 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 90 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 90 LBB = LDB*N * * Generate the matrix B. * CALL CMAKE( 'ge', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, $ ZERO ) * DO 80 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' * IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * * Generate the hermitian or symmetric matrix A. * CALL CMAKE(SNAME( 8: 9 ), UPLO, ' ', NA, NA, A, NMAX, $ AA, LDA, RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL CMAKE( 'ge', ' ', ' ', M, N, C, NMAX, CC, $ LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO MS = M NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ CALL CPRCN2(NTRA, NC, SNAME, IORDER, $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, $ BETA, LDC) IF( REWI ) $ REWIND NTRA IF( CONJ )THEN CALL CCHEMM( IORDER, SIDE, UPLO, M, N, $ ALPHA, AA, LDA, BB, LDB, BETA, $ CC, LDC ) ELSE CALL CCSYMM( IORDER, SIDE, UPLO, M, N, $ ALPHA, AA, LDA, BB, LDB, BETA, $ CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 110 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LCE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LCE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB ISAME( 10 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 11 ) = LCE( CS, CC, LCC ) ELSE ISAME( 11 ) = LCERES( 'ge', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 110 END IF * IF( .NOT.NULL )THEN * * Check the result. 
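*     The reference result is recomputed by CMMCH from the full
*     (unpacked) matrices: alpha*A*B + beta*C when SIDE = 'L' and
*     alpha*B*A + beta*C when SIDE = 'R', and compared column by
*     column against the output returned in CC.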
* IF( LEFT )THEN CALL CMMCH( 'N', 'N', M, N, M, ALPHA, A, $ NMAX, B, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL CMMCH( 'N', 'N', M, N, N, ALPHA, B, $ NMAX, A, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 120 * 110 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME CALL CPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, $ LDB, BETA, LDC) * 120 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) 9995 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, $ ',', F4.1, '), C,', I3, ') .' ) 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK2. * END * SUBROUTINE CPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, $ ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC COMPLEX ALPHA, BETA CHARACTER*1 SIDE, UPLO CHARACTER*12 SNAME CHARACTER*14 CRC, CS,CU IF (SIDE.EQ.'L')THEN CS = ' CblasLeft' ELSE CS = ' CblasRight' END IF IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') 9994 FORMAT( 10X, 2( I3, ',' ),' (',F4.1,',',F4.1, '), A,', I3, $ ', B,', I3, ', (',F4.1,',',F4.1, '), ', 'C,', I3, ').' ) END * SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, $ B, BB, BS, CT, G, C, IORDER ) * * Tests CTRMM and CTRSM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. 
COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CT( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX ALPHA, ALS REAL ERR, ERRMAX INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, $ NS LOGICAL LEFT, NULL, RESET, SAME CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, $ UPLOS CHARACTER*2 ICHD, ICHS, ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CMAKE, CMMCH, CCTRMM, CCTRSM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ * .. Executable Statements .. * NARGS = 11 NC = 0 RESET = .TRUE. ERRMAX = RZERO * Set up zero matrix for CMMCH. DO 20 J = 1, NMAX DO 10 I = 1, NMAX C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE * DO 140 IM = 1, NIDIM M = IDIM( IM ) * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 130 LBB = LDB*N NULL = M.LE.0.OR.N.LE.0 * DO 120 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 130 LAA = LDA*NA * DO 110 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * DO 100 ICT = 1, 3 TRANSA = ICHT( ICT: ICT ) * DO 90 ICD = 1, 2 DIAG = ICHD( ICD: ICD ) * DO 80 IA = 1, NALF ALPHA = ALF( IA ) * * Generate the matrix A. * CALL CMAKE( 'tr', UPLO, DIAG, NA, NA, A, $ NMAX, AA, LDA, RESET, ZERO ) * * Generate the matrix B. * CALL CMAKE( 'ge', ' ', ' ', M, N, B, NMAX, $ BB, LDB, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO TRANAS = TRANSA DIAGS = DIAG MS = M NS = N ALS = ALPHA DO 30 I = 1, LAA AS( I ) = AA( I ) 30 CONTINUE LDAS = LDA DO 40 I = 1, LBB BS( I ) = BB( I ) 40 CONTINUE LDBS = LDB * * Call the subroutine. * IF( SNAME( 10: 11 ).EQ.'mm' )THEN IF( TRACE ) $ CALL CPRCN3( NTRA, NC, SNAME, IORDER, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB) IF( REWI ) $ REWIND NTRA CALL CCTRMM(IORDER, SIDE, UPLO, TRANSA, $ DIAG, M, N, ALPHA, AA, LDA, $ BB, LDB ) ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN IF( TRACE ) $ CALL CPRCN3( NTRA, NC, SNAME, IORDER, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB) IF( REWI ) $ REWIND NTRA CALL CCTRSM(IORDER, SIDE, UPLO, TRANSA, $ DIAG, M, N, ALPHA, AA, LDA, $ BB, LDB ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = TRANAS.EQ.TRANSA ISAME( 4 ) = DIAGS.EQ.DIAG ISAME( 5 ) = MS.EQ.M ISAME( 6 ) = NS.EQ.N ISAME( 7 ) = ALS.EQ.ALPHA ISAME( 8 ) = LCE( AS, AA, LAA ) ISAME( 9 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 10 ) = LCE( BS, BB, LBB ) ELSE ISAME( 10 ) = LCERES( 'ge', ' ', M, N, BS, $ BB, LDB ) END IF ISAME( 11 ) = LDBS.EQ.LDB * * If data was incorrectly changed, report and * return. * SAME = .TRUE. 
DO 50 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 50 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN IF( SNAME( 10: 11 ).EQ.'mm' )THEN * * Check the result. * IF( LEFT )THEN CALL CMMCH( TRANSA, 'N', M, N, M, $ ALPHA, A, NMAX, B, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL CMMCH( 'N', TRANSA, M, N, N, $ ALPHA, B, NMAX, A, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN * * Compute approximation to original * matrix. * DO 70 J = 1, N DO 60 I = 1, M C( I, J ) = BB( I + ( J - 1 )* $ LDB ) BB( I + ( J - 1 )*LDB ) = ALPHA* $ B( I, J ) 60 CONTINUE 70 CONTINUE * IF( LEFT )THEN CALL CMMCH( TRANSA, 'N', M, N, M, $ ONE, A, NMAX, C, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) ELSE CALL CMMCH( 'N', TRANSA, M, N, N, $ ONE, C, NMAX, A, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) END IF END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 150 END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * 140 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 160 * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( TRACE ) $ CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, $ M, N, ALPHA, LDA, LDB) * 160 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT(' ******* ', A12,' FAILED ON CALL NUMBER:' ) 9995 FORMAT(1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', $ ' .' ) 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK3. 
* END * SUBROUTINE CPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, $ DIAG, M, N, ALPHA, LDA, LDB) INTEGER NOUT, NC, IORDER, M, N, LDA, LDB COMPLEX ALPHA CHARACTER*1 SIDE, UPLO, TRANSA, DIAG CHARACTER*12 SNAME CHARACTER*14 CRC, CS, CU, CA, CD IF (SIDE.EQ.'L')THEN CS = ' CblasLeft' ELSE CS = ' CblasRight' END IF IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (DIAG.EQ.'N')THEN CD = ' CblasNonUnit' ELSE CD = ' CblasUnit' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') 9994 FORMAT( 10X, 2( A14, ',') , 2( I3, ',' ), ' (', F4.1, ',', $ F4.1, '), A,', I3, ', B,', I3, ').' ) END * SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, $ IORDER ) * * Tests CHERK and CSYRK. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0, 0.0 ) ) REAL RONE, RZERO PARAMETER ( RONE = 1.0, RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX ALPHA, ALS, BETA, BETS REAL ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, $ NARGS, NC, NS LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS CHARACTER*2 ICHT, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CCHERK, CMAKE, CMMCH, CCSYRK * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, REAL * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHT/'NC'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 8: 9 ).EQ.'he' * NARGS = 10 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICT = 1, 2 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'C' IF( TRAN.AND..NOT.CONJ ) $ TRANS = 'T' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. 
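*     CMAKE fills A with pseudo-random values from CBEG and copies
*     them into AA in the storage scheme under test; every position
*     the routine may not reference is set to the rogue value
*     (-1.0E10, 1.0E10), which makes accidental reads or writes of
*     those elements easy to detect in the checks that follow.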
* CALL CMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 60 IA = 1, NALF ALPHA = ALF( IA ) IF( CONJ )THEN RALPHA = REAL( ALPHA ) ALPHA = CMPLX( RALPHA, RZERO ) END IF * DO 50 IB = 1, NBET BETA = BET( IB ) IF( CONJ )THEN RBETA = REAL( BETA ) BETA = CMPLX( RBETA, RZERO ) END IF NULL = N.LE.0 IF( CONJ ) $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. $ RZERO ).AND.RBETA.EQ.RONE ) * * Generate the matrix C. * CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, $ NMAX, CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K IF( CONJ )THEN RALS = RALPHA ELSE ALS = ALPHA END IF DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA IF( CONJ )THEN RBETS = RBETA ELSE BETS = BETA END IF DO 20 I = 1, LCC CS( I ) = CC( I ) 20 CONTINUE LDCS = LDC * * Call the subroutine. * IF( CONJ )THEN IF( TRACE ) $ CALL CPRCN6( NTRA, NC, SNAME, IORDER, $ UPLO, TRANS, N, K, RALPHA, LDA, RBETA, $ LDC) IF( REWI ) $ REWIND NTRA CALL CCHERK( IORDER, UPLO, TRANS, N, K, $ RALPHA, AA, LDA, RBETA, CC, $ LDC ) ELSE IF( TRACE ) $ CALL CPRCN4( NTRA, NC, SNAME, IORDER, $ UPLO, TRANS, N, K, ALPHA, LDA, BETA, LDC) IF( REWI ) $ REWIND NTRA CALL CCSYRK( IORDER, UPLO, TRANS, N, K, $ ALPHA, AA, LDA, BETA, CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K IF( CONJ )THEN ISAME( 5 ) = RALS.EQ.RALPHA ELSE ISAME( 5 ) = ALS.EQ.ALPHA END IF ISAME( 6 ) = LCE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA IF( CONJ )THEN ISAME( 8 ) = RBETS.EQ.RBETA ELSE ISAME( 8 ) = BETS.EQ.BETA END IF IF( NULL )THEN ISAME( 9 ) = LCE( CS, CC, LCC ) ELSE ISAME( 9 ) = LCERES( SNAME( 8: 9 ), UPLO, N, $ N, CS, CC, LDC ) END IF ISAME( 10 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 30 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 30 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( CONJ )THEN TRANST = 'C' ELSE TRANST = 'T' END IF JC = 1 DO 40 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN CALL CMMCH( TRANST, 'N', LJ, 1, K, $ ALPHA, A( 1, JJ ), NMAX, $ A( 1, J ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL CMMCH( 'N', TRANST, LJ, 1, K, $ ALPHA, A( JJ, 1 ), NMAX, $ A( J, 1 ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 40 CONTINUE END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 130 * 110 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( CONJ )THEN CALL CPRCN6( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, RALPHA, $ LDA, rBETA, LDC) ELSE CALL CPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, $ LDA, BETA, LDC) END IF * 130 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', $ ' .' ) 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, $ '), C,', I3, ') .' ) 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK4. * END * SUBROUTINE CPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, $ N, K, ALPHA, LDA, BETA, LDC) INTEGER NOUT, NC, IORDER, N, K, LDA, LDC COMPLEX ALPHA, BETA CHARACTER*1 UPLO, TRANSA CHARACTER*12 SNAME CHARACTER*14 CRC, CU, CA IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1 ,'), A,', $ I3, ', (', F4.1,',', F4.1, '), C,', I3, ').' ) END * * SUBROUTINE CPRCN6(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, $ N, K, ALPHA, LDA, BETA, LDC) INTEGER NOUT, NC, IORDER, N, K, LDA, LDC REAL ALPHA, BETA CHARACTER*1 UPLO, TRANSA CHARACTER*12 SNAME CHARACTER*14 CRC, CU, CA IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) 9994 FORMAT( 10X, 2( I3, ',' ), $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' ) END * SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, $ IORDER ) * * Tests CHER2K and CSYR2K. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. 
* Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) REAL RONE, RZERO PARAMETER ( RONE = 1.0, RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. COMPLEX AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ W( 2*NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX ALPHA, ALS, BETA, BETS REAL ERR, ERRMAX, RBETA, RBETS INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS CHARACTER*2 ICHT, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CCHER2K, CMAKE, CMMCH, CCSYR2K * .. Intrinsic Functions .. INTRINSIC CMPLX, CONJG, MAX, REAL * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHT/'NC'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 8: 9 ).EQ.'he' * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 130 LCC = LDC*N * DO 120 IK = 1, NIDIM K = IDIM( IK ) * DO 110 ICT = 1, 2 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'C' IF( TRAN.AND..NOT.CONJ ) $ TRANS = 'T' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 110 LAA = LDA*NA * * Generate the matrix A. * IF( TRAN )THEN CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB, 2*NMAX, AA, $ LDA, RESET, ZERO ) ELSE CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, $ RESET, ZERO ) END IF * * Generate the matrix B. * LDB = LDA LBB = LAA IF( TRAN )THEN CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB( K + 1 ), $ 2*NMAX, BB, LDB, RESET, ZERO ) ELSE CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), $ NMAX, BB, LDB, RESET, ZERO ) END IF * DO 100 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 90 IA = 1, NALF ALPHA = ALF( IA ) * DO 80 IB = 1, NBET BETA = BET( IB ) IF( CONJ )THEN RBETA = REAL( BETA ) BETA = CMPLX( RBETA, RZERO ) END IF NULL = N.LE.0 IF( CONJ ) $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. $ ZERO ).AND.RBETA.EQ.RONE ) * * Generate the matrix C. * CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, $ NMAX, CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB IF( CONJ )THEN RBETS = RBETA ELSE BETS = BETA END IF DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. 
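*     Note that CCHER2K and CCSYR2K (like the other CC* names used in
*     this program) are not the reference BLAS routines: they take the
*     storage order IORDER as an extra leading argument and presumably
*     forward to cblas_cher2k / cblas_csyr2k via the C wrappers that
*     accompany this test suite.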
* IF( CONJ )THEN IF( TRACE ) $ CALL CPRCN7( NTRA, NC, SNAME, IORDER, $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, $ RBETA, LDC) IF( REWI ) $ REWIND NTRA CALL CCHER2K( IORDER, UPLO, TRANS, N, K, $ ALPHA, AA, LDA, BB, LDB, RBETA, $ CC, LDC ) ELSE IF( TRACE ) $ CALL CPRCN5( NTRA, NC, SNAME, IORDER, $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, $ BETA, LDC) IF( REWI ) $ REWIND NTRA CALL CCSYR2K( IORDER, UPLO, TRANS, N, K, $ ALPHA, AA, LDA, BB, LDB, BETA, $ CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LCE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LCE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB IF( CONJ )THEN ISAME( 10 ) = RBETS.EQ.RBETA ELSE ISAME( 10 ) = BETS.EQ.BETA END IF IF( NULL )THEN ISAME( 11 ) = LCE( CS, CC, LCC ) ELSE ISAME( 11 ) = LCERES( 'he', UPLO, N, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( CONJ )THEN TRANST = 'C' ELSE TRANST = 'T' END IF JJAB = 1 JC = 1 DO 70 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN DO 50 I = 1, K W( I ) = ALPHA*AB( ( J - 1 )*2* $ NMAX + K + I ) IF( CONJ )THEN W( K + I ) = CONJG( ALPHA )* $ AB( ( J - 1 )*2* $ NMAX + I ) ELSE W( K + I ) = ALPHA* $ AB( ( J - 1 )*2* $ NMAX + I ) END IF 50 CONTINUE CALL CMMCH( TRANST, 'N', LJ, 1, 2*K, $ ONE, AB( JJAB ), 2*NMAX, W, $ 2*NMAX, BETA, C( JJ, J ), $ NMAX, CT, G, CC( JC ), LDC, $ EPS, ERR, FATAL, NOUT, $ .TRUE. ) ELSE DO 60 I = 1, K IF( CONJ )THEN W( I ) = ALPHA*CONJG( AB( ( K + $ I - 1 )*NMAX + J ) ) W( K + I ) = CONJG( ALPHA* $ AB( ( I - 1 )*NMAX + $ J ) ) ELSE W( I ) = ALPHA*AB( ( K + I - 1 )* $ NMAX + J ) W( K + I ) = ALPHA* $ AB( ( I - 1 )*NMAX + $ J ) END IF 60 CONTINUE CALL CMMCH( 'N', 'N', LJ, 1, 2*K, ONE, $ AB( JJ ), NMAX, W, 2*NMAX, $ BETA, C( JJ, J ), NMAX, CT, $ G, CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 IF( TRAN ) $ JJAB = JJAB + 2*NMAX END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 140 70 CONTINUE END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 160 * 140 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( CONJ )THEN CALL CPRCN7( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, $ ALPHA, LDA, LDB, RBETA, LDC) ELSE CALL CPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, $ ALPHA, LDA, LDB, BETA, LDC) END IF * 160 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, $ ', C,', I3, ') .' ) 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, $ ',', F4.1, '), C,', I3, ') .' ) 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK5. * END * SUBROUTINE CPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, $ N, K, ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC COMPLEX ALPHA, BETA CHARACTER*1 UPLO, TRANSA CHARACTER*12 SNAME CHARACTER*14 CRC, CU, CA IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', $ I3, ', B', I3, ', (', F4.1, ',', F4.1, '), C,', I3, ').' ) END * * SUBROUTINE CPRCN7(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, $ N, K, ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC COMPLEX ALPHA REAL BETA CHARACTER*1 UPLO, TRANSA CHARACTER*12 SNAME CHARACTER*14 CRC, CU, CA IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', $ I3, ', B', I3, ',', F4.1, ', C,', I3, ').' ) END * SUBROUTINE CMAKE(TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, $ TRANSL ) * * Generates values for an M by N matrix A. 
* Stores the values in the array AA in the data structure required * by the routine, with unwanted elements set to rogue value. * * TYPE is 'ge', 'he', 'sy' or 'tr'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) COMPLEX ROGUE PARAMETER ( ROGUE = ( -1.0E10, 1.0E10 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) REAL RROGUE PARAMETER ( RROGUE = -1.0E10 ) * .. Scalar Arguments .. COMPLEX TRANSL INTEGER LDA, M, N, NMAX LOGICAL RESET CHARACTER*1 DIAG, UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX A( NMAX, * ), AA( * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J, JJ LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER * .. External Functions .. COMPLEX CBEG EXTERNAL CBEG * .. Intrinsic Functions .. INTRINSIC CMPLX, CONJG, REAL * .. Executable Statements .. GEN = TYPE.EQ.'ge' HER = TYPE.EQ.'he' SYM = TYPE.EQ.'sy' TRI = TYPE.EQ.'tr' UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' UNIT = TRI.AND.DIAG.EQ.'U' * * Generate data in array A. * DO 20 J = 1, N DO 10 I = 1, M IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) $ THEN A( I, J ) = CBEG( RESET ) + TRANSL IF( I.NE.J )THEN * Set some elements to zero IF( N.GT.3.AND.J.EQ.N/2 ) $ A( I, J ) = ZERO IF( HER )THEN A( J, I ) = CONJG( A( I, J ) ) ELSE IF( SYM )THEN A( J, I ) = A( I, J ) ELSE IF( TRI )THEN A( J, I ) = ZERO END IF END IF END IF 10 CONTINUE IF( HER ) $ A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO ) IF( TRI ) $ A( J, J ) = A( J, J ) + ONE IF( UNIT ) $ A( J, J ) = ONE 20 CONTINUE * * Store elements in array AS in data structure required by routine. * IF( TYPE.EQ.'ge' )THEN DO 50 J = 1, N DO 30 I = 1, M AA( I + ( J - 1 )*LDA ) = A( I, J ) 30 CONTINUE DO 40 I = M + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 40 CONTINUE 50 CONTINUE ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN DO 90 J = 1, N IF( UPPER )THEN IBEG = 1 IF( UNIT )THEN IEND = J - 1 ELSE IEND = J END IF ELSE IF( UNIT )THEN IBEG = J + 1 ELSE IBEG = J END IF IEND = N END IF DO 60 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 60 CONTINUE DO 70 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I, J ) 70 CONTINUE DO 80 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 80 CONTINUE IF( HER )THEN JJ = J + ( J - 1 )*LDA AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) END IF 90 CONTINUE END IF RETURN * * End of CMAKE. * END SUBROUTINE CMMCH(TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, $ NOUT, MV ) * * Checks the results of the computational tests. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0, 0.0 ) ) REAL RZERO, RONE PARAMETER ( RZERO = 0.0, RONE = 1.0 ) * .. Scalar Arguments .. COMPLEX ALPHA, BETA REAL EPS, ERR INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT LOGICAL FATAL, MV CHARACTER*1 TRANSA, TRANSB * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ), $ CC( LDCC, * ), CT( * ) REAL G( * ) * .. Local Scalars .. COMPLEX CL REAL ERRI INTEGER I, J, K LOGICAL CTRANA, CTRANB, TRANA, TRANB * .. 
Intrinsic Functions .. INTRINSIC ABS, AIMAG, CONJG, MAX, REAL, SQRT * .. Statement Functions .. REAL ABS1 * .. Statement Function definitions .. ABS1( CL ) = ABS( REAL( CL ) ) + ABS( AIMAG( CL ) ) * .. Executable Statements .. TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' CTRANA = TRANSA.EQ.'C' CTRANB = TRANSB.EQ.'C' * * Compute expected result, one column at a time, in CT using data * in A, B and C. * Compute gauges in G. * DO 220 J = 1, N * DO 10 I = 1, M CT( I ) = ZERO G( I ) = RZERO 10 CONTINUE IF( .NOT.TRANA.AND..NOT.TRANB )THEN DO 30 K = 1, KK DO 20 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( K, J ) G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRANA.AND..NOT.TRANB )THEN IF( CTRANA )THEN DO 50 K = 1, KK DO 40 I = 1, M CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( K, J ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( K, J ) ) 40 CONTINUE 50 CONTINUE ELSE DO 70 K = 1, KK DO 60 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( K, J ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( K, J ) ) 60 CONTINUE 70 CONTINUE END IF ELSE IF( .NOT.TRANA.AND.TRANB )THEN IF( CTRANB )THEN DO 90 K = 1, KK DO 80 I = 1, M CT( I ) = CT( I ) + A( I, K )*CONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( I, K ) )* $ ABS1( B( J, K ) ) 80 CONTINUE 90 CONTINUE ELSE DO 110 K = 1, KK DO 100 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( J, K ) G( I ) = G( I ) + ABS1( A( I, K ) )* $ ABS1( B( J, K ) ) 100 CONTINUE 110 CONTINUE END IF ELSE IF( TRANA.AND.TRANB )THEN IF( CTRANA )THEN IF( CTRANB )THEN DO 130 K = 1, KK DO 120 I = 1, M CT( I ) = CT( I ) + CONJG( A( K, I ) )* $ CONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 120 CONTINUE 130 CONTINUE ELSE DO 150 K = 1, KK DO 140 I = 1, M CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( J, K ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 140 CONTINUE 150 CONTINUE END IF ELSE IF( CTRANB )THEN DO 170 K = 1, KK DO 160 I = 1, M CT( I ) = CT( I ) + A( K, I )*CONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 160 CONTINUE 170 CONTINUE ELSE DO 190 K = 1, KK DO 180 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( J, K ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 180 CONTINUE 190 CONTINUE END IF END IF END IF DO 200 I = 1, M CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) G( I ) = ABS1( ALPHA )*G( I ) + $ ABS1( BETA )*ABS1( C( I, J ) ) 200 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO DO 210 I = 1, M ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS IF( G( I ).NE.RZERO ) $ ERRI = ERRI/G( I ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ GO TO 230 210 CONTINUE * 220 CONTINUE * * If the loop completes, all results are at least half accurate. GO TO 250 * * Report fatal error. * 230 FATAL = .TRUE. WRITE( NOUT, FMT = 9999 ) DO 240 I = 1, M IF( MV )THEN WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) ELSE WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) END IF 240 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9997 )J * 250 CONTINUE RETURN * 9999 FORMAT(' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', $ 'F ACCURATE *******', /' EXPECTED RE', $ 'SULT COMPUTED RESULT' ) 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) * * End of CMMCH. * END LOGICAL FUNCTION LCE( RI, RJ, LR ) * * Tests if two arrays are identical. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. 
* Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LR * .. Array Arguments .. COMPLEX RI( * ), RJ( * ) * .. Local Scalars .. INTEGER I * .. Executable Statements .. DO 10 I = 1, LR IF( RI( I ).NE.RJ( I ) ) $ GO TO 20 10 CONTINUE LCE = .TRUE. GO TO 30 20 CONTINUE LCE = .FALSE. 30 RETURN * * End of LCE. * END LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA ) * * Tests if selected elements in two arrays are equal. * * TYPE is 'ge' or 'he' or 'sy'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LDA, M, N CHARACTER*1 UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX AA( LDA, * ), AS( LDA, * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL UPPER * .. Executable Statements .. UPPER = UPLO.EQ.'U' IF( TYPE.EQ.'ge' )THEN DO 20 J = 1, N DO 10 I = M + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 10 CONTINUE 20 CONTINUE ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy' )THEN DO 50 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 30 I = 1, IBEG - 1 IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 30 CONTINUE DO 40 I = IEND + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 40 CONTINUE 50 CONTINUE END IF * 60 CONTINUE LCERES = .TRUE. GO TO 80 70 CONTINUE LCERES = .FALSE. 80 RETURN * * End of LCERES. * END COMPLEX FUNCTION CBEG( RESET ) * * Generates complex numbers as pairs of random numbers uniformly * distributed between -0.5 and 0.5. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. LOGICAL RESET * .. Local Scalars .. INTEGER I, IC, J, MI, MJ * .. Save statement .. SAVE I, IC, J, MI, MJ * .. Intrinsic Functions .. INTRINSIC CMPLX * .. Executable Statements .. IF( RESET )THEN * Initialize local variables. MI = 891 MJ = 457 I = 7 J = 7 IC = 0 RESET = .FALSE. END IF * * The sequence of values of I or J is bounded between 1 and 999. * If initial I or J = 1,2,3,6,7 or 9, the period will be 50. * If initial I or J = 4 or 8, the period will be 25. * If initial I or J = 5, the period will be 10. * IC is used to break up the period by skipping 1 value of I or J * in 6. * IC = IC + 1 10 I = I*MI J = J*MJ I = I - 1000*( I/1000 ) J = J - 1000*( J/1000 ) IF( IC.GE.5 )THEN IC = 0 GO TO 10 END IF CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) RETURN * * End of CBEG. * END REAL FUNCTION SDIFF( X, Y ) * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. REAL X, Y * .. Executable Statements .. SDIFF = X - Y RETURN * * End of SDIFF. * END OpenBLAS-0.2.20/ctest/c_cblat3_3m.f000066400000000000000000003037171313527062700165210ustar00rootroot00000000000000 PROGRAM CBLAT3 * * Test program for the COMPLEX Level 3 Blas. * * The program must be driven by a short data file. 
The first 13 records * of the file are read using list-directed input, the last 9 records * are read using the format ( A13, L2 ). An annotated example of a data * file can be obtained by deleting the first 3 characters from the * following 22 lines: * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO * 6 NUMBER OF VALUES OF N * 0 1 2 3 5 9 VALUES OF N * 3 NUMBER OF VALUES OF ALPHA * (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA * 3 NUMBER OF VALUES OF BETA * (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA * cblas_cgemm T PUT F FOR NO TEST. SAME COLUMNS. * cblas_chemm T PUT F FOR NO TEST. SAME COLUMNS. * cblas_csymm T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ctrmm T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ctrsm T PUT F FOR NO TEST. SAME COLUMNS. * cblas_cherk T PUT F FOR NO TEST. SAME COLUMNS. * cblas_csyrk T PUT F FOR NO TEST. SAME COLUMNS. * cblas_cher2k T PUT F FOR NO TEST. SAME COLUMNS. * cblas_csyr2k T PUT F FOR NO TEST. SAME COLUMNS. * * See: * * Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. * A Set of Level 3 Basic Linear Algebra Subprograms. * * Technical Memorandum No.88 (Revision 1), Mathematics and * Computer Science Division, Argonne National Laboratory, 9700 * South Cass Avenue, Argonne, Illinois 60439, US. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. INTEGER NIN, NOUT PARAMETER ( NIN = 5, NOUT = 6 ) INTEGER NSUBS PARAMETER ( NSUBS = 9 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) REAL RZERO, RHALF, RONE PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. REAL EPS, ERR, THRESH INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA, $ LAYOUT LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, $ TSTERR, CORDER, RORDER CHARACTER*1 TRANSA, TRANSB CHARACTER*13 SNAMET CHARACTER*32 SNAPS * .. Local Arrays .. COMPLEX AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), $ ALF( NALMAX ), AS( NMAX*NMAX ), $ BB( NMAX*NMAX ), BET( NBEMAX ), $ BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ W( 2*NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDMAX ) LOGICAL LTEST( NSUBS ) CHARACTER*13 SNAMES( NSUBS ) * .. External Functions .. REAL SDIFF LOGICAL LCE EXTERNAL SDIFF, LCE * .. External Subroutines .. EXTERNAL CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CMMCH * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK CHARACTER*13 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR COMMON /SRNAMC/SRNAMT * .. Data statements .. DATA SNAMES/'cblas_cgemm3m ', 'cblas_chemm ', $ 'cblas_csymm ', 'cblas_ctrmm ', 'cblas_ctrsm ', $ 'cblas_cherk ', 'cblas_csyrk ', 'cblas_cher2k', $ 'cblas_csyr2k'/ * .. Executable Statements .. * NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. * READ( NIN, FMT = * )SNAPS READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN OPEN( NTRA, FILE = SNAPS ) END IF * Read the flag that directs rewinding of the snapshot file. 
READ( NIN, FMT = * )REWI REWI = REWI.AND.TRACE * Read the flag that directs stopping on any failure. READ( NIN, FMT = * )SFATAL * Read the flag that indicates whether error exits are to be tested. READ( NIN, FMT = * )TSTERR * Read the flag that indicates whether row-major data layout to be tested. READ( NIN, FMT = * )LAYOUT * Read the threshold value of the test ratio READ( NIN, FMT = * )THRESH * * Read and check the parameter values for the tests. * * Values of N READ( NIN, FMT = * )NIDIM IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN WRITE( NOUT, FMT = 9997 )'N', NIDMAX GO TO 220 END IF READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) DO 10 I = 1, NIDIM IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN WRITE( NOUT, FMT = 9996 )NMAX GO TO 220 END IF 10 CONTINUE * Values of ALPHA READ( NIN, FMT = * )NALF IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX GO TO 220 END IF READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) * Values of BETA READ( NIN, FMT = * )NBET IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX GO TO 220 END IF READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) * * Report values of parameters. * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) IF( .NOT.TSTERR )THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9984 ) END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )THRESH WRITE( NOUT, FMT = * ) RORDER = .FALSE. CORDER = .FALSE. IF (LAYOUT.EQ.2) THEN RORDER = .TRUE. CORDER = .TRUE. WRITE( *, FMT = 10002 ) ELSE IF (LAYOUT.EQ.1) THEN RORDER = .TRUE. WRITE( *, FMT = 10001 ) ELSE IF (LAYOUT.EQ.0) THEN CORDER = .TRUE. WRITE( *, FMT = 10000 ) END IF WRITE( *, FMT = * ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 20 I = 1, NSUBS LTEST( I ) = .FALSE. 20 CONTINUE 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT DO 40 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET STOP 50 LTEST( I ) = LTESTT GO TO 30 * 60 CONTINUE CLOSE ( NIN ) * * Compute EPS (the machine precision). * EPS = RONE 70 CONTINUE IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) $ GO TO 80 EPS = RHALF*EPS GO TO 70 80 CONTINUE EPS = EPS + EPS WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of CMMCH using exact data. * N = MIN( 32, NMAX ) DO 100 J = 1, N DO 90 I = 1, N AB( I, J ) = MAX( I - J + 1, 0 ) 90 CONTINUE AB( J, NMAX + 1 ) = J AB( 1, NMAX + J ) = J C( J, 1 ) = ZERO 100 CONTINUE DO 110 J = 1, N CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 110 CONTINUE * CC holds the exact result. On exit from CMMCH CT holds * the result computed by CMMCH. TRANSA = 'N' TRANSB = 'N' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. 
) SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 AB( 1, NMAX + J ) = N - J + 1 120 CONTINUE DO 130 J = 1, N CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - $ ( ( J + 1 )*J*( J - 1 ) )/3 130 CONTINUE TRANSA = 'C' TRANSB = 'N' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF * * Test each subroutine in turn. * DO 200 ISNUM = 1, NSUBS WRITE( NOUT, FMT = * ) IF( .NOT.LTEST( ISNUM ) )THEN * Subprogram is not to be tested. WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) ELSE SRNAMT = SNAMES( ISNUM ) * Test error exits. IF( TSTERR )THEN CALL CC3CHKE( SNAMES( ISNUM ) ) WRITE( NOUT, FMT = * ) END IF * Test computations. INFOT = 0 OK = .TRUE. FATAL = .FALSE. GO TO ( 140, 150, 150, 160, 160, 170, 170, $ 180, 180 )ISNUM * Test CGEMM, 01. 140 IF (CORDER) THEN CALL CCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 0 ) END IF IF (RORDER) THEN CALL CCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 1 ) END IF GO TO 190 * Test CHEMM, 02, CSYMM, 03. 150 IF (CORDER) THEN CALL CCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 0 ) END IF IF (RORDER) THEN CALL CCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 1 ) END IF GO TO 190 * Test CTRMM, 04, CTRSM, 05. 160 IF (CORDER) THEN CALL CCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, $ 0 ) END IF IF (RORDER) THEN CALL CCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, $ 1 ) END IF GO TO 190 * Test CHERK, 06, CSYRK, 07. 170 IF (CORDER) THEN CALL CCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 0 ) END IF IF (RORDER) THEN CALL CCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 1 ) END IF GO TO 190 * Test CHER2K, 08, CSYR2K, 09. 
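*     CCHK5 is run once with IORDER = 0 when column-major testing
*     is enabled (CORDER) and once with IORDER = 1 when row-major
*     testing is enabled (RORDER), following the same pattern as
*     the checks above.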
180 IF (CORDER) THEN CALL CCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, $ 0 ) END IF IF (RORDER) THEN CALL CCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, $ 1 ) END IF GO TO 190 * 190 IF( FATAL.AND.SFATAL ) $ GO TO 210 END IF 200 CONTINUE WRITE( NOUT, FMT = 9986 ) GO TO 230 * 210 CONTINUE WRITE( NOUT, FMT = 9985 ) GO TO 230 * 220 CONTINUE WRITE( NOUT, FMT = 9991 ) * 230 CONTINUE IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) STOP * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) 10000 FORMAT(' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', $ 'S THAN', F8.2 ) 9998 FORMAT(' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) 9997 FORMAT(' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', $ 'THAN ', I2 ) 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) 9995 FORMAT(' TESTS OF THE COMPLEX LEVEL 3 BLAS', //' THE F', $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) 9994 FORMAT( ' FOR N ', 9I6 ) 9993 FORMAT( ' FOR ALPHA ', $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) 9992 FORMAT( ' FOR BETA ', $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', $ /' ******* TESTS ABANDONED *******' ) 9990 FORMAT(' SUBPROGRAM NAME ', A13,' NOT RECOGNIZED', /' ******* T', $ 'ESTS ABANDONED *******' ) 9989 FORMAT(' ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', $ 'ATED WRONGLY.', /' CMMCH WAS CALLED WITH TRANSA = ', A1, $ 'AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', $ ' ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', $ '*******' ) 9988 FORMAT( A13,L2 ) 9987 FORMAT( 1X, A13,' WAS NOT TESTED' ) 9986 FORMAT( /' END OF TESTS' ) 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) * * End of CBLAT3. * END SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, $ IORDER ) * * Tests CGEMM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*13 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX ALPHA, ALS, BETA, BLS REAL ERR, ERRMAX INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, $ MA, MB, MS, N, NA, NARGS, NB, NC, NS LOGICAL NULL, RESET, SAME, TRANA, TRANB CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB CHARACTER*3 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. 
LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CCGEMM3M, CMAKE, CMMCH * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'NTC'/ * .. Executable Statements .. * NARGS = 13 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 110 IM = 1, NIDIM M = IDIM( IM ) * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICA = 1, 3 TRANSA = ICH( ICA: ICA ) TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' * IF( TRANA )THEN MA = K NA = M ELSE MA = M NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. * CALL CMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICB = 1, 3 TRANSB = ICH( ICB: ICB ) TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' * IF( TRANB )THEN MB = N NB = K ELSE MB = K NB = N END IF * Set LDB to 1 more than minimum value if room. LDB = MB IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 70 LBB = LDB*NB * * Generate the matrix B. * CALL CMAKE( 'ge', ' ', ' ', MB, NB, B, NMAX, BB, $ LDB, RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL CMAKE( 'ge', ' ', ' ', M, N, C, NMAX, $ CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * TRANAS = TRANSA TRANBS = TRANSB MS = M NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ CALL CPRCN1(NTRA, NC, SNAME, IORDER, $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, $ LDB, BETA, LDC) IF( REWI ) $ REWIND NTRA CALL CCGEMM3M( IORDER, TRANSA, TRANSB, M, N, $ K, ALPHA, AA, LDA, BB, LDB, $ BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = TRANSA.EQ.TRANAS ISAME( 2 ) = TRANSB.EQ.TRANBS ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = KS.EQ.K ISAME( 6 ) = ALS.EQ.ALPHA ISAME( 7 ) = LCE( AS, AA, LAA ) ISAME( 8 ) = LDAS.EQ.LDA ISAME( 9 ) = LCE( BS, BB, LBB ) ISAME( 10 ) = LDBS.EQ.LDB ISAME( 11 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 12 ) = LCE( CS, CC, LCC ) ELSE ISAME( 12 ) = LCERES( 'ge', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 13 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report * and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL CMMCH( TRANSA, TRANSB, M, N, K, $ ALPHA, A, NMAX, B, NMAX, BETA, $ C, NMAX, CT, G, CC, LDC, EPS, $ ERR, FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 120 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. 
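*
*     At this point the CCGEMM3M wrapper (cblas_cgemm3m) has been
*     called for every combination of TRANSA and TRANSB in 'N',
*     'T', 'C', every M, N and K taken from IDIM, and every
*     ALPHA/BETA pair; NC counts the calls and ERRMAX holds the
*     largest test ratio reported by CMMCH.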
* IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME CALL CPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) * 130 CONTINUE RETURN * 10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A13,'(''', A1, ''',''', A1, ''',', $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK1. * END * SUBROUTINE CPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N, $ K, ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC COMPLEX ALPHA, BETA CHARACTER*1 TRANSA, TRANSB CHARACTER*13 SNAME CHARACTER*14 CRC, CTA,CTB IF (TRANSA.EQ.'N')THEN CTA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CTA = ' CblasTrans' ELSE CTA = 'CblasConjTrans' END IF IF (TRANSB.EQ.'N')THEN CTB = ' CblasNoTrans' ELSE IF (TRANSB.EQ.'T')THEN CTB = ' CblasTrans' ELSE CTB = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A13,'(', A14, ',', A14, ',', A14, ',') 9994 FORMAT( 10X, 3( I3, ',' ) ,' (', F4.1,',',F4.1,') , A,', $ I3, ', B,', I3, ', (', F4.1,',',F4.1,') , C,', I3, ').' ) END * SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, $ IORDER ) * * Tests CHEMM and CSYMM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*13 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. 
COMPLEX ALPHA, ALS, BETA, BLS REAL ERR, ERRMAX INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, $ NARGS, NC, NS LOGICAL CONJ, LEFT, NULL, RESET, SAME CHARACTER*1 SIDE, SIDES, UPLO, UPLOS CHARACTER*2 ICHS, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CCHEMM, CMAKE, CMMCH, CCSYMM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHS/'LR'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 8: 9 ).EQ.'he' * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 100 IM = 1, NIDIM M = IDIM( IM ) * DO 90 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 90 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 90 LBB = LDB*N * * Generate the matrix B. * CALL CMAKE( 'ge', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, $ ZERO ) * DO 80 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' * IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * * Generate the hermitian or symmetric matrix A. * CALL CMAKE(SNAME( 8: 9 ), UPLO, ' ', NA, NA, A, NMAX, $ AA, LDA, RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL CMAKE( 'ge', ' ', ' ', M, N, C, NMAX, CC, $ LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO MS = M NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ CALL CPRCN2(NTRA, NC, SNAME, IORDER, $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, $ BETA, LDC) IF( REWI ) $ REWIND NTRA IF( CONJ )THEN CALL CCHEMM( IORDER, SIDE, UPLO, M, N, $ ALPHA, AA, LDA, BB, LDB, BETA, $ CC, LDC ) ELSE CALL CCSYMM( IORDER, SIDE, UPLO, M, N, $ ALPHA, AA, LDA, BB, LDB, BETA, $ CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 110 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LCE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LCE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB ISAME( 10 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 11 ) = LCE( CS, CC, LCC ) ELSE ISAME( 11 ) = LCERES( 'ge', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 110 END IF * IF( .NOT.NULL )THEN * * Check the result. 
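*
*                       For SIDE = 'L' the reference result is
*                       ALPHA*A*B + BETA*C, so CMMCH multiplies A
*                       (inner dimension M) by B; for SIDE = 'R'
*                       it is ALPHA*B*A + BETA*C, so the operands
*                       are swapped and the inner dimension is N.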
* IF( LEFT )THEN CALL CMMCH( 'N', 'N', M, N, M, ALPHA, A, $ NMAX, B, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL CMMCH( 'N', 'N', M, N, N, ALPHA, B, $ NMAX, A, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 120 * 110 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME CALL CPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, $ LDB, BETA, LDC) * 120 CONTINUE RETURN * 10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' ) 9995 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, $ ',', F4.1, '), C,', I3, ') .' ) 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK2. * END * SUBROUTINE CPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, $ ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC COMPLEX ALPHA, BETA CHARACTER*1 SIDE, UPLO CHARACTER*13 SNAME CHARACTER*14 CRC, CS,CU IF (SIDE.EQ.'L')THEN CS = ' CblasLeft' ELSE CS = ' CblasRight' END IF IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A13,'(', A14, ',', A14, ',', A14, ',') 9994 FORMAT( 10X, 2( I3, ',' ),' (',F4.1,',',F4.1, '), A,', I3, $ ', B,', I3, ', (',F4.1,',',F4.1, '), ', 'C,', I3, ').' ) END * SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, $ B, BB, BS, CT, G, C, IORDER ) * * Tests CTRMM and CTRSM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*13 SNAME * .. Array Arguments .. 
COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CT( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX ALPHA, ALS REAL ERR, ERRMAX INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, $ NS LOGICAL LEFT, NULL, RESET, SAME CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, $ UPLOS CHARACTER*2 ICHD, ICHS, ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CMAKE, CMMCH, CCTRMM, CCTRSM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ * .. Executable Statements .. * NARGS = 11 NC = 0 RESET = .TRUE. ERRMAX = RZERO * Set up zero matrix for CMMCH. DO 20 J = 1, NMAX DO 10 I = 1, NMAX C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE * DO 140 IM = 1, NIDIM M = IDIM( IM ) * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 130 LBB = LDB*N NULL = M.LE.0.OR.N.LE.0 * DO 120 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 130 LAA = LDA*NA * DO 110 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * DO 100 ICT = 1, 3 TRANSA = ICHT( ICT: ICT ) * DO 90 ICD = 1, 2 DIAG = ICHD( ICD: ICD ) * DO 80 IA = 1, NALF ALPHA = ALF( IA ) * * Generate the matrix A. * CALL CMAKE( 'tr', UPLO, DIAG, NA, NA, A, $ NMAX, AA, LDA, RESET, ZERO ) * * Generate the matrix B. * CALL CMAKE( 'ge', ' ', ' ', M, N, B, NMAX, $ BB, LDB, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO TRANAS = TRANSA DIAGS = DIAG MS = M NS = N ALS = ALPHA DO 30 I = 1, LAA AS( I ) = AA( I ) 30 CONTINUE LDAS = LDA DO 40 I = 1, LBB BS( I ) = BB( I ) 40 CONTINUE LDBS = LDB * * Call the subroutine. * IF( SNAME( 10: 11 ).EQ.'mm' )THEN IF( TRACE ) $ CALL CPRCN3( NTRA, NC, SNAME, IORDER, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB) IF( REWI ) $ REWIND NTRA CALL CCTRMM(IORDER, SIDE, UPLO, TRANSA, $ DIAG, M, N, ALPHA, AA, LDA, $ BB, LDB ) ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN IF( TRACE ) $ CALL CPRCN3( NTRA, NC, SNAME, IORDER, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB) IF( REWI ) $ REWIND NTRA CALL CCTRSM(IORDER, SIDE, UPLO, TRANSA, $ DIAG, M, N, ALPHA, AA, LDA, $ BB, LDB ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = TRANAS.EQ.TRANSA ISAME( 4 ) = DIAGS.EQ.DIAG ISAME( 5 ) = MS.EQ.M ISAME( 6 ) = NS.EQ.N ISAME( 7 ) = ALS.EQ.ALPHA ISAME( 8 ) = LCE( AS, AA, LAA ) ISAME( 9 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 10 ) = LCE( BS, BB, LBB ) ELSE ISAME( 10 ) = LCERES( 'ge', ' ', M, N, BS, $ BB, LDB ) END IF ISAME( 11 ) = LDBS.EQ.LDB * * If data was incorrectly changed, report and * return. * SAME = .TRUE. 
DO 50 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 50 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN IF( SNAME( 10: 11 ).EQ.'mm' )THEN * * Check the result. * IF( LEFT )THEN CALL CMMCH( TRANSA, 'N', M, N, M, $ ALPHA, A, NMAX, B, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL CMMCH( 'N', TRANSA, M, N, N, $ ALPHA, B, NMAX, A, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN * * Compute approximation to original * matrix. * DO 70 J = 1, N DO 60 I = 1, M C( I, J ) = BB( I + ( J - 1 )* $ LDB ) BB( I + ( J - 1 )*LDB ) = ALPHA* $ B( I, J ) 60 CONTINUE 70 CONTINUE * IF( LEFT )THEN CALL CMMCH( TRANSA, 'N', M, N, M, $ ONE, A, NMAX, C, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) ELSE CALL CMMCH( 'N', TRANSA, M, N, N, $ ONE, C, NMAX, A, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) END IF END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 150 END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * 140 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 160 * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( TRACE ) $ CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, $ M, N, ALPHA, LDA, LDB) * 160 CONTINUE RETURN * 10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT(' ******* ', A13,' FAILED ON CALL NUMBER:' ) 9995 FORMAT(1X, I6, ': ', A13,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', $ ' .' ) 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK3. 
* END * SUBROUTINE CPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, $ DIAG, M, N, ALPHA, LDA, LDB) INTEGER NOUT, NC, IORDER, M, N, LDA, LDB COMPLEX ALPHA CHARACTER*1 SIDE, UPLO, TRANSA, DIAG CHARACTER*13 SNAME CHARACTER*14 CRC, CS, CU, CA, CD IF (SIDE.EQ.'L')THEN CS = ' CblasLeft' ELSE CS = ' CblasRight' END IF IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (DIAG.EQ.'N')THEN CD = ' CblasNonUnit' ELSE CD = ' CblasUnit' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB 9995 FORMAT( 1X, I6, ': ', A13,'(', A14, ',', A14, ',', A14, ',') 9994 FORMAT( 10X, 2( A14, ',') , 2( I3, ',' ), ' (', F4.1, ',', $ F4.1, '), A,', I3, ', B,', I3, ').' ) END * SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, $ IORDER ) * * Tests CHERK and CSYRK. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0, 0.0 ) ) REAL RONE, RZERO PARAMETER ( RONE = 1.0, RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*13 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX ALPHA, ALS, BETA, BETS REAL ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, $ NARGS, NC, NS LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS CHARACTER*2 ICHT, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CCHERK, CMAKE, CMMCH, CCSYRK * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, REAL * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHT/'NC'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 8: 9 ).EQ.'he' * NARGS = 10 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICT = 1, 2 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'C' IF( TRAN.AND..NOT.CONJ ) $ TRANS = 'T' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. 
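*
*           A is set up as a general MA by NA matrix: MA = K and
*           NA = N when TRANS selects the transposed product
*           (C := alpha*A**H*A + beta*C, or A**T*A for the
*           symmetric routine), and MA = N, NA = K otherwise, so
*           the updated matrix C is always N by N.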
* CALL CMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 60 IA = 1, NALF ALPHA = ALF( IA ) IF( CONJ )THEN RALPHA = REAL( ALPHA ) ALPHA = CMPLX( RALPHA, RZERO ) END IF * DO 50 IB = 1, NBET BETA = BET( IB ) IF( CONJ )THEN RBETA = REAL( BETA ) BETA = CMPLX( RBETA, RZERO ) END IF NULL = N.LE.0 IF( CONJ ) $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. $ RZERO ).AND.RBETA.EQ.RONE ) * * Generate the matrix C. * CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, $ NMAX, CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K IF( CONJ )THEN RALS = RALPHA ELSE ALS = ALPHA END IF DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA IF( CONJ )THEN RBETS = RBETA ELSE BETS = BETA END IF DO 20 I = 1, LCC CS( I ) = CC( I ) 20 CONTINUE LDCS = LDC * * Call the subroutine. * IF( CONJ )THEN IF( TRACE ) $ CALL CPRCN6( NTRA, NC, SNAME, IORDER, $ UPLO, TRANS, N, K, RALPHA, LDA, RBETA, $ LDC) IF( REWI ) $ REWIND NTRA CALL CCHERK( IORDER, UPLO, TRANS, N, K, $ RALPHA, AA, LDA, RBETA, CC, $ LDC ) ELSE IF( TRACE ) $ CALL CPRCN4( NTRA, NC, SNAME, IORDER, $ UPLO, TRANS, N, K, ALPHA, LDA, BETA, LDC) IF( REWI ) $ REWIND NTRA CALL CCSYRK( IORDER, UPLO, TRANS, N, K, $ ALPHA, AA, LDA, BETA, CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K IF( CONJ )THEN ISAME( 5 ) = RALS.EQ.RALPHA ELSE ISAME( 5 ) = ALS.EQ.ALPHA END IF ISAME( 6 ) = LCE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA IF( CONJ )THEN ISAME( 8 ) = RBETS.EQ.RBETA ELSE ISAME( 8 ) = BETS.EQ.BETA END IF IF( NULL )THEN ISAME( 9 ) = LCE( CS, CC, LCC ) ELSE ISAME( 9 ) = LCERES( SNAME( 8: 9 ), UPLO, N, $ N, CS, CC, LDC ) END IF ISAME( 10 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 30 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 30 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( CONJ )THEN TRANST = 'C' ELSE TRANST = 'T' END IF JC = 1 DO 40 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN CALL CMMCH( TRANST, 'N', LJ, 1, K, $ ALPHA, A( 1, JJ ), NMAX, $ A( 1, J ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL CMMCH( 'N', TRANST, LJ, 1, K, $ ALPHA, A( JJ, 1 ), NMAX, $ A( J, 1 ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 40 CONTINUE END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 130 * 110 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( CONJ )THEN CALL CPRCN6( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, RALPHA, $ LDA, rBETA, LDC) ELSE CALL CPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, $ LDA, BETA, LDC) END IF * 130 CONTINUE RETURN * 10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', $ ' .' ) 9993 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, $ '), C,', I3, ') .' ) 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK4. * END * SUBROUTINE CPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, $ N, K, ALPHA, LDA, BETA, LDC) INTEGER NOUT, NC, IORDER, N, K, LDA, LDC COMPLEX ALPHA, BETA CHARACTER*1 UPLO, TRANSA CHARACTER*13 SNAME CHARACTER*14 CRC, CU, CA IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A14, ',') ) 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1 ,'), A,', $ I3, ', (', F4.1,',', F4.1, '), C,', I3, ').' ) END * * SUBROUTINE CPRCN6(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, $ N, K, ALPHA, LDA, BETA, LDC) INTEGER NOUT, NC, IORDER, N, K, LDA, LDC REAL ALPHA, BETA CHARACTER*1 UPLO, TRANSA CHARACTER*13 SNAME CHARACTER*14 CRC, CU, CA IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A14, ',') ) 9994 FORMAT( 10X, 2( I3, ',' ), $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' ) END * SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, $ IORDER ) * * Tests CHER2K and CSYR2K. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. 
* Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) REAL RONE, RZERO PARAMETER ( RONE = 1.0, RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*13 SNAME * .. Array Arguments .. COMPLEX AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ W( 2*NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX ALPHA, ALS, BETA, BETS REAL ERR, ERRMAX, RBETA, RBETS INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS CHARACTER*2 ICHT, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CCHER2K, CMAKE, CMMCH, CCSYR2K * .. Intrinsic Functions .. INTRINSIC CMPLX, CONJG, MAX, REAL * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHT/'NC'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 8: 9 ).EQ.'he' * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 130 LCC = LDC*N * DO 120 IK = 1, NIDIM K = IDIM( IK ) * DO 110 ICT = 1, 2 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'C' IF( TRAN.AND..NOT.CONJ ) $ TRANS = 'T' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 110 LAA = LDA*NA * * Generate the matrix A. * IF( TRAN )THEN CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB, 2*NMAX, AA, $ LDA, RESET, ZERO ) ELSE CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, $ RESET, ZERO ) END IF * * Generate the matrix B. * LDB = LDA LBB = LAA IF( TRAN )THEN CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB( K + 1 ), $ 2*NMAX, BB, LDB, RESET, ZERO ) ELSE CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), $ NMAX, BB, LDB, RESET, ZERO ) END IF * DO 100 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 90 IA = 1, NALF ALPHA = ALF( IA ) * DO 80 IB = 1, NBET BETA = BET( IB ) IF( CONJ )THEN RBETA = REAL( BETA ) BETA = CMPLX( RBETA, RZERO ) END IF NULL = N.LE.0 IF( CONJ ) $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. $ ZERO ).AND.RBETA.EQ.RONE ) * * Generate the matrix C. * CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, $ NMAX, CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB IF( CONJ )THEN RBETS = RBETA ELSE BETS = BETA END IF DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. 
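*
*                       The conjugate case calls CCHER2K
*                       (cblas_cher2k), whose BETA argument is
*                       REAL (RBETA); the symmetric case calls
*                       CCSYR2K (cblas_csyr2k) with the full
*                       COMPLEX BETA, as in the HER2K and SYR2K
*                       definitions.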
* IF( CONJ )THEN IF( TRACE ) $ CALL CPRCN7( NTRA, NC, SNAME, IORDER, $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, $ RBETA, LDC) IF( REWI ) $ REWIND NTRA CALL CCHER2K( IORDER, UPLO, TRANS, N, K, $ ALPHA, AA, LDA, BB, LDB, RBETA, $ CC, LDC ) ELSE IF( TRACE ) $ CALL CPRCN5( NTRA, NC, SNAME, IORDER, $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, $ BETA, LDC) IF( REWI ) $ REWIND NTRA CALL CCSYR2K( IORDER, UPLO, TRANS, N, K, $ ALPHA, AA, LDA, BB, LDB, BETA, $ CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LCE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LCE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB IF( CONJ )THEN ISAME( 10 ) = RBETS.EQ.RBETA ELSE ISAME( 10 ) = BETS.EQ.BETA END IF IF( NULL )THEN ISAME( 11 ) = LCE( CS, CC, LCC ) ELSE ISAME( 11 ) = LCERES( 'he', UPLO, N, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( CONJ )THEN TRANST = 'C' ELSE TRANST = 'T' END IF JJAB = 1 JC = 1 DO 70 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN DO 50 I = 1, K W( I ) = ALPHA*AB( ( J - 1 )*2* $ NMAX + K + I ) IF( CONJ )THEN W( K + I ) = CONJG( ALPHA )* $ AB( ( J - 1 )*2* $ NMAX + I ) ELSE W( K + I ) = ALPHA* $ AB( ( J - 1 )*2* $ NMAX + I ) END IF 50 CONTINUE CALL CMMCH( TRANST, 'N', LJ, 1, 2*K, $ ONE, AB( JJAB ), 2*NMAX, W, $ 2*NMAX, BETA, C( JJ, J ), $ NMAX, CT, G, CC( JC ), LDC, $ EPS, ERR, FATAL, NOUT, $ .TRUE. ) ELSE DO 60 I = 1, K IF( CONJ )THEN W( I ) = ALPHA*CONJG( AB( ( K + $ I - 1 )*NMAX + J ) ) W( K + I ) = CONJG( ALPHA* $ AB( ( I - 1 )*NMAX + $ J ) ) ELSE W( I ) = ALPHA*AB( ( K + I - 1 )* $ NMAX + J ) W( K + I ) = ALPHA* $ AB( ( I - 1 )*NMAX + $ J ) END IF 60 CONTINUE CALL CMMCH( 'N', 'N', LJ, 1, 2*K, ONE, $ AB( JJ ), NMAX, W, 2*NMAX, $ BETA, C( JJ, J ), NMAX, CT, $ G, CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 IF( TRAN ) $ JJAB = JJAB + 2*NMAX END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 140 70 CONTINUE END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * * Report result. 
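* IORDER selects the message set used below: 0 reports the
* column-major formats (labels 10000/10002) and 1 the row-major
* formats (labels 10001/10003).  ERRMAX below THRESH counts as a
* pass; otherwise the maximum test ratio is printed as suspect.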
* IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 160 * 140 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( CONJ )THEN CALL CPRCN7( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, $ ALPHA, LDA, LDB, RBETA, LDC) ELSE CALL CPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, $ ALPHA, LDA, LDB, BETA, LDC) END IF * 160 CONTINUE RETURN * 10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, $ ', C,', I3, ') .' ) 9993 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, $ ',', F4.1, '), C,', I3, ') .' ) 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK5. * END * SUBROUTINE CPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, $ N, K, ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC COMPLEX ALPHA, BETA CHARACTER*1 UPLO, TRANSA CHARACTER*13 SNAME CHARACTER*14 CRC, CU, CA IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A14, ',') ) 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', $ I3, ', B', I3, ', (', F4.1, ',', F4.1, '), C,', I3, ').' ) END * * SUBROUTINE CPRCN7(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, $ N, K, ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC COMPLEX ALPHA REAL BETA CHARACTER*1 UPLO, TRANSA CHARACTER*13 SNAME CHARACTER*14 CRC, CU, CA IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A14, ',') ) 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', $ I3, ', B', I3, ',', F4.1, ', C,', I3, ').' ) END * SUBROUTINE CMAKE(TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, $ TRANSL ) * * Generates values for an M by N matrix A. 
* Stores the values in the array AA in the data structure required * by the routine, with unwanted elements set to rogue value. * * TYPE is 'ge', 'he', 'sy' or 'tr'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) COMPLEX ROGUE PARAMETER ( ROGUE = ( -1.0E10, 1.0E10 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) REAL RROGUE PARAMETER ( RROGUE = -1.0E10 ) * .. Scalar Arguments .. COMPLEX TRANSL INTEGER LDA, M, N, NMAX LOGICAL RESET CHARACTER*1 DIAG, UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX A( NMAX, * ), AA( * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J, JJ LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER * .. External Functions .. COMPLEX CBEG EXTERNAL CBEG * .. Intrinsic Functions .. INTRINSIC CMPLX, CONJG, REAL * .. Executable Statements .. GEN = TYPE.EQ.'ge' HER = TYPE.EQ.'he' SYM = TYPE.EQ.'sy' TRI = TYPE.EQ.'tr' UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' UNIT = TRI.AND.DIAG.EQ.'U' * * Generate data in array A. * DO 20 J = 1, N DO 10 I = 1, M IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) $ THEN A( I, J ) = CBEG( RESET ) + TRANSL IF( I.NE.J )THEN * Set some elements to zero IF( N.GT.3.AND.J.EQ.N/2 ) $ A( I, J ) = ZERO IF( HER )THEN A( J, I ) = CONJG( A( I, J ) ) ELSE IF( SYM )THEN A( J, I ) = A( I, J ) ELSE IF( TRI )THEN A( J, I ) = ZERO END IF END IF END IF 10 CONTINUE IF( HER ) $ A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO ) IF( TRI ) $ A( J, J ) = A( J, J ) + ONE IF( UNIT ) $ A( J, J ) = ONE 20 CONTINUE * * Store elements in array AS in data structure required by routine. * IF( TYPE.EQ.'ge' )THEN DO 50 J = 1, N DO 30 I = 1, M AA( I + ( J - 1 )*LDA ) = A( I, J ) 30 CONTINUE DO 40 I = M + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 40 CONTINUE 50 CONTINUE ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN DO 90 J = 1, N IF( UPPER )THEN IBEG = 1 IF( UNIT )THEN IEND = J - 1 ELSE IEND = J END IF ELSE IF( UNIT )THEN IBEG = J + 1 ELSE IBEG = J END IF IEND = N END IF DO 60 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 60 CONTINUE DO 70 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I, J ) 70 CONTINUE DO 80 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 80 CONTINUE IF( HER )THEN JJ = J + ( J - 1 )*LDA AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) END IF 90 CONTINUE END IF RETURN * * End of CMAKE. * END SUBROUTINE CMMCH(TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, $ NOUT, MV ) * * Checks the results of the computational tests. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0, 0.0 ) ) REAL RZERO, RONE PARAMETER ( RZERO = 0.0, RONE = 1.0 ) * .. Scalar Arguments .. COMPLEX ALPHA, BETA REAL EPS, ERR INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT LOGICAL FATAL, MV CHARACTER*1 TRANSA, TRANSB * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ), $ CC( LDCC, * ), CT( * ) REAL G( * ) * .. Local Scalars .. COMPLEX CL REAL ERRI INTEGER I, J, K LOGICAL CTRANA, CTRANB, TRANA, TRANB * .. 
Intrinsic Functions .. INTRINSIC ABS, AIMAG, CONJG, MAX, REAL, SQRT * .. Statement Functions .. REAL ABS1 * .. Statement Function definitions .. ABS1( CL ) = ABS( REAL( CL ) ) + ABS( AIMAG( CL ) ) * .. Executable Statements .. TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' CTRANA = TRANSA.EQ.'C' CTRANB = TRANSB.EQ.'C' * * Compute expected result, one column at a time, in CT using data * in A, B and C. * Compute gauges in G. * DO 220 J = 1, N * DO 10 I = 1, M CT( I ) = ZERO G( I ) = RZERO 10 CONTINUE IF( .NOT.TRANA.AND..NOT.TRANB )THEN DO 30 K = 1, KK DO 20 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( K, J ) G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRANA.AND..NOT.TRANB )THEN IF( CTRANA )THEN DO 50 K = 1, KK DO 40 I = 1, M CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( K, J ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( K, J ) ) 40 CONTINUE 50 CONTINUE ELSE DO 70 K = 1, KK DO 60 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( K, J ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( K, J ) ) 60 CONTINUE 70 CONTINUE END IF ELSE IF( .NOT.TRANA.AND.TRANB )THEN IF( CTRANB )THEN DO 90 K = 1, KK DO 80 I = 1, M CT( I ) = CT( I ) + A( I, K )*CONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( I, K ) )* $ ABS1( B( J, K ) ) 80 CONTINUE 90 CONTINUE ELSE DO 110 K = 1, KK DO 100 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( J, K ) G( I ) = G( I ) + ABS1( A( I, K ) )* $ ABS1( B( J, K ) ) 100 CONTINUE 110 CONTINUE END IF ELSE IF( TRANA.AND.TRANB )THEN IF( CTRANA )THEN IF( CTRANB )THEN DO 130 K = 1, KK DO 120 I = 1, M CT( I ) = CT( I ) + CONJG( A( K, I ) )* $ CONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 120 CONTINUE 130 CONTINUE ELSE DO 150 K = 1, KK DO 140 I = 1, M CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( J, K ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 140 CONTINUE 150 CONTINUE END IF ELSE IF( CTRANB )THEN DO 170 K = 1, KK DO 160 I = 1, M CT( I ) = CT( I ) + A( K, I )*CONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 160 CONTINUE 170 CONTINUE ELSE DO 190 K = 1, KK DO 180 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( J, K ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 180 CONTINUE 190 CONTINUE END IF END IF END IF DO 200 I = 1, M CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) G( I ) = ABS1( ALPHA )*G( I ) + $ ABS1( BETA )*ABS1( C( I, J ) ) 200 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO DO 210 I = 1, M ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS IF( G( I ).NE.RZERO ) $ ERRI = ERRI/G( I ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ GO TO 230 210 CONTINUE * 220 CONTINUE * * If the loop completes, all results are at least half accurate. GO TO 250 * * Report fatal error. * 230 FATAL = .TRUE. WRITE( NOUT, FMT = 9999 ) DO 240 I = 1, M IF( MV )THEN WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) ELSE WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) END IF 240 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9997 )J * 250 CONTINUE RETURN * 9999 FORMAT(' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', $ 'F ACCURATE *******', /' EXPECTED RE', $ 'SULT COMPUTED RESULT' ) 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) * * End of CMMCH. * END LOGICAL FUNCTION LCE( RI, RJ, LR ) * * Tests if two arrays are identical. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. 
* Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LR * .. Array Arguments .. COMPLEX RI( * ), RJ( * ) * .. Local Scalars .. INTEGER I * .. Executable Statements .. DO 10 I = 1, LR IF( RI( I ).NE.RJ( I ) ) $ GO TO 20 10 CONTINUE LCE = .TRUE. GO TO 30 20 CONTINUE LCE = .FALSE. 30 RETURN * * End of LCE. * END LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA ) * * Tests if selected elements in two arrays are equal. * * TYPE is 'ge' or 'he' or 'sy'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LDA, M, N CHARACTER*1 UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX AA( LDA, * ), AS( LDA, * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL UPPER * .. Executable Statements .. UPPER = UPLO.EQ.'U' IF( TYPE.EQ.'ge' )THEN DO 20 J = 1, N DO 10 I = M + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 10 CONTINUE 20 CONTINUE ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy' )THEN DO 50 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 30 I = 1, IBEG - 1 IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 30 CONTINUE DO 40 I = IEND + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 40 CONTINUE 50 CONTINUE END IF * 60 CONTINUE LCERES = .TRUE. GO TO 80 70 CONTINUE LCERES = .FALSE. 80 RETURN * * End of LCERES. * END COMPLEX FUNCTION CBEG( RESET ) * * Generates complex numbers as pairs of random numbers uniformly * distributed between -0.5 and 0.5. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. LOGICAL RESET * .. Local Scalars .. INTEGER I, IC, J, MI, MJ * .. Save statement .. SAVE I, IC, J, MI, MJ * .. Intrinsic Functions .. INTRINSIC CMPLX * .. Executable Statements .. IF( RESET )THEN * Initialize local variables. MI = 891 MJ = 457 I = 7 J = 7 IC = 0 RESET = .FALSE. END IF * * The sequence of values of I or J is bounded between 1 and 999. * If initial I or J = 1,2,3,6,7 or 9, the period will be 50. * If initial I or J = 4 or 8, the period will be 25. * If initial I or J = 5, the period will be 10. * IC is used to break up the period by skipping 1 value of I or J * in 6. * IC = IC + 1 10 I = I*MI J = J*MJ I = I - 1000*( I/1000 ) J = J - 1000*( J/1000 ) IF( IC.GE.5 )THEN IC = 0 GO TO 10 END IF CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) RETURN * * End of CBEG. * END REAL FUNCTION SDIFF( X, Y ) * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. REAL X, Y * .. Executable Statements .. SDIFF = X - Y RETURN * * End of SDIFF. 
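* (SDIFF routes the subtraction through a function call; in the
* reference BLAS testers this device is used when estimating the
* machine precision so that the compiler cannot optimize the
* difference away.)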
* END OpenBLAS-0.2.20/ctest/c_d2chke.c000066400000000000000000000763651313527062700161150ustar00rootroot00000000000000#include #include #include "common.h" #include "cblas_test.h" int cblas_ok, cblas_lerr, cblas_info; int link_xerbla=TRUE; char *cblas_rout; #ifdef F77_Char void F77_xerbla(F77_Char F77_srname, void *vinfo); #else void F77_xerbla(char *srname, void *vinfo); #endif void chkxer(void) { extern int cblas_ok, cblas_lerr, cblas_info; extern int link_xerbla; extern char *cblas_rout; if (cblas_lerr == 1 ) { printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); cblas_ok = 0 ; } cblas_lerr = 1 ; } void F77_d2chke(char *rout) { char *sf = ( rout ) ; double A[2] = {0.0,0.0}, X[2] = {0.0,0.0}, Y[2] = {0.0,0.0}, ALPHA=0.0, BETA=0.0; extern int cblas_info, cblas_lerr, cblas_ok; extern int RowMajorStrg; extern char *cblas_rout; if (link_xerbla) /* call these first to link */ { cblas_xerbla(cblas_info,cblas_rout,""); F77_xerbla(cblas_rout,&cblas_info); } cblas_ok = TRUE ; cblas_lerr = PASSED ; if (strncmp( sf,"cblas_dgemv",11)==0) { cblas_rout = "cblas_dgemv"; cblas_info = 1; cblas_dgemv(INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dgemv(CblasColMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dgemv(CblasColMajor, CblasNoTrans, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dgemv(CblasColMajor, CblasNoTrans, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_dgemv(CblasColMajor, CblasNoTrans, 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_dgemv(CblasColMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_dgemv(CblasColMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE; cblas_dgemv(CblasRowMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_dgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dgbmv",11)==0) { cblas_rout = "cblas_dgbmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_dgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); 
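/*
 * Each check in this file follows the same pattern: cblas_info is set
 * to the 1-based position of the argument expected to be flagged,
 * RowMajorStrg records whether the row-major entry point is being
 * exercised, the routine is called with INVALID (or an inconsistent
 * dimension) in that position, and chkxer() then reports a failure if
 * the error handler did not fire (cblas_lerr is assumed to be cleared
 * by cblas_xerbla / F77_xerbla when they are invoked).
 */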
cblas_info = 6; RowMajorStrg = FALSE; cblas_dgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_dgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_dgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dsymv",11)==0) { cblas_rout = "cblas_dsymv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_dsymv(INVALID, CblasUpper, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dsymv(CblasColMajor, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dsymv(CblasColMajor, CblasUpper, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dsymv(CblasColMajor, CblasUpper, 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_dsymv(CblasColMajor, CblasUpper, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_dsymv(CblasColMajor, CblasUpper, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_dsymv(CblasRowMajor, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_dsymv(CblasRowMajor, CblasUpper, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dsymv(CblasRowMajor, CblasUpper, 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_dsymv(CblasRowMajor, CblasUpper, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_dsymv(CblasRowMajor, CblasUpper, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dsbmv",11)==0) { cblas_rout = "cblas_dsbmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_dsbmv(INVALID, CblasUpper, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dsbmv(CblasColMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dsbmv(CblasColMajor, CblasUpper, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; 
cblas_dsbmv(CblasColMajor, CblasUpper, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_dsbmv(CblasColMajor, CblasUpper, 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_dsbmv(CblasColMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_dsbmv(CblasColMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_dsbmv(CblasRowMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_dsbmv(CblasRowMajor, CblasUpper, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_dsbmv(CblasRowMajor, CblasUpper, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dspmv",11)==0) { cblas_rout = "cblas_dspmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_dspmv(INVALID, CblasUpper, 0, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dspmv(CblasColMajor, INVALID, 0, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dspmv(CblasColMajor, CblasUpper, INVALID, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_dspmv(CblasColMajor, CblasUpper, 0, ALPHA, A, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dspmv(CblasColMajor, CblasUpper, 0, ALPHA, A, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_dspmv(CblasRowMajor, INVALID, 0, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_dspmv(CblasRowMajor, CblasUpper, INVALID, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_dspmv(CblasRowMajor, CblasUpper, 0, ALPHA, A, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dspmv(CblasRowMajor, CblasUpper, 0, ALPHA, A, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dtrmv",11)==0) { cblas_rout = "cblas_dtrmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_dtrmv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dtrmv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dtrmv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_dtrmv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; 
cblas_dtrmv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dtbmv",11)==0) { cblas_rout = "cblas_dtbmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_dtbmv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dtbmv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dtbmv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_dtbmv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_dtbmv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dtpmv",11)==0) { cblas_rout = "cblas_dtpmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_dtpmv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dtpmv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dtpmv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); cblas_info = 2; 
RowMajorStrg = TRUE; cblas_dtpmv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_dtpmv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dtrsv",11)==0) { cblas_rout = "cblas_dtrsv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_dtrsv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dtrsv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dtrsv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_dtrsv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_dtrsv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dtbsv",11)==0) { cblas_rout = "cblas_dtbsv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_dtbsv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dtbsv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dtbsv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; 
RowMajorStrg = TRUE; cblas_dtbsv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_dtbsv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dtpsv",11)==0) { cblas_rout = "cblas_dtpsv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_dtpsv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dtpsv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dtpsv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_dtpsv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_dtpsv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dger",10)==0) { cblas_rout = "cblas_dger"; cblas_info = 1; RowMajorStrg = FALSE; cblas_dger(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dger(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dger(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dger(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_dger(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dger(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_dger(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_dger(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dger(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_dger(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); chkxer(); 
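/*
 * In the row-major groups the calls that provoke the dimension and
 * leading-dimension checks swap the roles of the two size arguments
 * relative to the column-major groups (for dger below, (M,N) = (0,2)
 * instead of (2,0)): the row-major interface effectively works on the
 * transposed problem, so the constraint on the leading dimension
 * follows the other extent while the expected parameter number stays
 * the same.
 */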
cblas_info = 10; RowMajorStrg = TRUE; cblas_dger(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); } else if (strncmp( sf,"cblas_dsyr2",11)==0) { cblas_rout = "cblas_dsyr2"; cblas_info = 1; RowMajorStrg = FALSE; cblas_dsyr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dsyr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dsyr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dsyr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_dsyr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dsyr2(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_dsyr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_dsyr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dsyr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_dsyr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dsyr2(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); } else if (strncmp( sf,"cblas_dspr2",11)==0) { cblas_rout = "cblas_dspr2"; cblas_info = 1; RowMajorStrg = FALSE; cblas_dspr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dspr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dspr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dspr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_dspr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_dspr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_dspr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dspr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_dspr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); chkxer(); } else if (strncmp( sf,"cblas_dsyr",10)==0) { cblas_rout = "cblas_dsyr"; cblas_info = 1; RowMajorStrg = FALSE; cblas_dsyr(INVALID, CblasUpper, 0, ALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dsyr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dsyr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dsyr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_dsyr(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_dsyr(CblasRowMajor, INVALID, 0, ALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_dsyr(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dsyr(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; 
cblas_dsyr(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, A, 1 ); chkxer(); } else if (strncmp( sf,"cblas_dspr",10)==0) { cblas_rout = "cblas_dspr"; cblas_info = 1; RowMajorStrg = FALSE; cblas_dspr(INVALID, CblasUpper, 0, ALPHA, X, 1, A ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dspr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dspr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dspr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dspr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dspr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dspr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A ); chkxer(); } if (cblas_ok == TRUE) printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); else printf("******* %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); } OpenBLAS-0.2.20/ctest/c_d3chke.c000066400000000000000000001600261313527062700161020ustar00rootroot00000000000000#include #include #include "common.h" #include "cblas_test.h" int cblas_ok, cblas_lerr, cblas_info; int link_xerbla=TRUE; char *cblas_rout; #ifdef F77_Char void F77_xerbla(F77_Char F77_srname, void *vinfo); #else void F77_xerbla(char *srname, void *vinfo); #endif void chkxer(void) { extern int cblas_ok, cblas_lerr, cblas_info; extern int link_xerbla; extern char *cblas_rout; if (cblas_lerr == 1 ) { printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); cblas_ok = 0 ; } cblas_lerr = 1 ; } void F77_d3chke(char *rout) { char *sf = ( rout ) ; double A[2] = {0.0,0.0}, B[2] = {0.0,0.0}, C[2] = {0.0,0.0}, ALPHA=0.0, BETA=0.0; extern int cblas_info, cblas_lerr, cblas_ok; extern int RowMajorStrg; extern char *cblas_rout; if (link_xerbla) /* call these first to link */ { cblas_xerbla(cblas_info,cblas_rout,""); F77_xerbla(cblas_rout,&cblas_info); } cblas_ok = TRUE ; cblas_lerr = PASSED ; if (strncmp( sf,"cblas_dgemm" ,11)==0) { cblas_rout = "cblas_dgemm" ; cblas_info = 1; cblas_dgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_dgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_dgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_dgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; 
cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_dgemm( 
CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_dsymm" ,11)==0) { cblas_rout = "cblas_dsymm" ; cblas_info = 1; cblas_dsymm( INVALID, CblasRight, CblasLower, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, INVALID, CblasUpper, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, 
CblasLeft, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_dsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_dsymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_dsymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_dsymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_dsymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_dsymm( CblasRowMajor, CblasLeft, 
CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_dsymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_dsymm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_dsymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_dsymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_dsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_dsymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_dsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dsymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_dsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_dsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_dsymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_dsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_dtrmm" ,11)==0) { cblas_rout = "cblas_dtrmm" ; cblas_info = 1; cblas_dtrmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 
INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_dtrmm( 
CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 
); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); } else if (strncmp( sf,"cblas_dtrsm" ,11)==0) { cblas_rout = "cblas_dtrsm" ; cblas_info = 1; cblas_dtrsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, INVALID, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 
INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; 
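/*
 * Error-exit convention used throughout this checker: cblas_info is set to the
 * 1-based position of the argument that the next call is expected to reject
 * (e.g. position 12 here is ldb in the cblas_dtrsm argument list),
 * RowMajorStrg records whether the row-major interface is being exercised,
 * and chkxer() is expected to confirm that the error handler flagged exactly
 * that argument for the routine named in cblas_rout.
 */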
cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, 
A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); } else if (strncmp( sf,"cblas_dsyrk" ,11)==0) { cblas_rout = "cblas_dsyrk" ; cblas_info = 1; cblas_dsyrk( INVALID, CblasUpper, CblasNoTrans, 0, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dsyrk( CblasColMajor, INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dsyrk( CblasColMajor, CblasUpper, INVALID, 0, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dsyrk( CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dsyrk( CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dsyrk( CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dsyrk( CblasColMajor, CblasLower, CblasTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dsyrk( CblasColMajor, 
CblasUpper, CblasNoTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dsyrk( CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dsyrk( CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dsyrk( CblasColMajor, CblasLower, CblasTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_dsyrk( CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_dsyrk( CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_dsyrk( CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_dsyrk( CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_dsyrk( CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_dsyrk( CblasColMajor, CblasUpper, CblasTrans, 0, 2, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_dsyrk( CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_dsyrk( CblasColMajor, CblasLower, CblasTrans, 0, 2, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_dsyrk( CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_dsyrk( CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_dsyrk( CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_dsyrk( CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_dsyrk( CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_dsyrk( CblasColMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_dsyrk( CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_dsyrk( CblasColMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_dsyr2k" ,12)==0) { cblas_rout = "cblas_dsyr2k" ; cblas_info = 1; cblas_dsyr2k( INVALID, CblasUpper, CblasNoTrans, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, CblasUpper, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); 
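/*
 * For cblas_dsyr2k the argument list is (order, uplo, trans, n, k, alpha, a,
 * lda, b, ldb, beta, c, ldc), so the tests below use cblas_info = 4 and 5 for
 * an INVALID n or k, and 8, 10 and 13 for a leading dimension (lda, ldb, ldc)
 * that is too small for the chosen order and transpose option.
 */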
cblas_info = 4; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, CblasLower, CblasTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, CblasLower, CblasTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_dsyr2k( CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_dsyr2k( CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, CblasUpper, CblasTrans, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, CblasLower, CblasTrans, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dsyr2k( CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_dsyr2k( CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, CblasUpper, CblasTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, CblasLower, CblasTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_dsyr2k( CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_dsyr2k( CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); 
cblas_info = 13; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_dsyr2k( CblasColMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); } if (cblas_ok == TRUE ) printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); else printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); } OpenBLAS-0.2.20/ctest/c_dblas1.c000066400000000000000000000034741313527062700161120ustar00rootroot00000000000000/* * c_dblas1.c * * The program is a C wrapper for dcblat1. * * Written by Keita Teranishi. 2/11/1998 * */ #include "common.h" #include "cblas_test.h" double F77_dasum(const int *N, double *X, const int *incX) { return cblas_dasum(*N, X, *incX); } void F77_daxpy(const int *N, const double *alpha, const double *X, const int *incX, double *Y, const int *incY) { cblas_daxpy(*N, *alpha, X, *incX, Y, *incY); return; } void F77_dcopy(const int *N, double *X, const int *incX, double *Y, const int *incY) { cblas_dcopy(*N, X, *incX, Y, *incY); return; } double F77_ddot(const int *N, const double *X, const int *incX, const double *Y, const int *incY) { return cblas_ddot(*N, X, *incX, Y, *incY); } double F77_dnrm2(const int *N, const double *X, const int *incX) { return cblas_dnrm2(*N, X, *incX); } void F77_drotg( double *a, double *b, double *c, double *s) { cblas_drotg(a,b,c,s); return; } void F77_drot( const int *N, double *X, const int *incX, double *Y, const int *incY, const double *c, const double *s) { cblas_drot(*N,X,*incX,Y,*incY,*c,*s); return; } void F77_dscal(const int *N, const double *alpha, double *X, const int *incX) { cblas_dscal(*N, *alpha, X, *incX); return; } void F77_dswap( const int *N, double *X, const int *incX, double *Y, const int *incY) { cblas_dswap(*N,X,*incX,Y,*incY); return; } double F77_dzasum(const int *N, void *X, const int *incX) { return cblas_dzasum(*N, X, *incX); } double F77_dznrm2(const int *N, const void *X, const int *incX) { return cblas_dznrm2(*N, X, *incX); } int F77_idamax(const int *N, const double *X, const int *incX) { if (*N < 1 || *incX < 1) return(0); return (cblas_idamax(*N, X, *incX)+1); } OpenBLAS-0.2.20/ctest/c_dblas2.c000066400000000000000000000405741313527062700161150ustar00rootroot00000000000000/* * Written by D.P. Manley, Digital Equipment Corporation. * Prefixed "C_" to BLAS routines and their declarations. * * Modified by T. H. Do, 1/23/98, SGI/CRAY Research. 
*/ #include #include "common.h" #include "cblas_test.h" void F77_dgemv(int *order, char *transp, int *m, int *n, double *alpha, double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy ) { double *A; int i,j,LDA; enum CBLAS_TRANSPOSE trans; get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *n+1; A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; cblas_dgemv( CblasRowMajor, trans, *m, *n, *alpha, A, LDA, x, *incx, *beta, y, *incy ); free(A); } else if (*order == TEST_COL_MJR) cblas_dgemv( CblasColMajor, trans, *m, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy ); else cblas_dgemv( UNDEFINED, trans, *m, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy ); } void F77_dger(int *order, int *m, int *n, double *alpha, double *x, int *incx, double *y, int *incy, double *a, int *lda ) { double *A; int i,j,LDA; if (*order == TEST_ROW_MJR) { LDA = *n+1; A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); for( i=0; i<*m; i++ ) { for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; } cblas_dger(CblasRowMajor, *m, *n, *alpha, x, *incx, y, *incy, A, LDA ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) a[ (*lda)*j+i ]=A[ LDA*i+j ]; free(A); } else cblas_dger( CblasColMajor, *m, *n, *alpha, x, *incx, y, *incy, a, *lda ); } void F77_dtrmv(int *order, char *uplow, char *transp, char *diagn, int *n, double *a, int *lda, double *x, int *incx) { double *A; int i,j,LDA; enum CBLAS_TRANSPOSE trans; enum CBLAS_UPLO uplo; enum CBLAS_DIAG diag; get_transpose_type(transp,&trans); get_uplo_type(uplow,&uplo); get_diag_type(diagn,&diag); if (*order == TEST_ROW_MJR) { LDA = *n+1; A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; cblas_dtrmv(CblasRowMajor, uplo, trans, diag, *n, A, LDA, x, *incx); free(A); } else if (*order == TEST_COL_MJR) cblas_dtrmv(CblasColMajor, uplo, trans, diag, *n, a, *lda, x, *incx); else { cblas_dtrmv(UNDEFINED, uplo, trans, diag, *n, a, *lda, x, *incx); } } void F77_dtrsv(int *order, char *uplow, char *transp, char *diagn, int *n, double *a, int *lda, double *x, int *incx ) { double *A; int i,j,LDA; enum CBLAS_TRANSPOSE trans; enum CBLAS_UPLO uplo; enum CBLAS_DIAG diag; get_transpose_type(transp,&trans); get_uplo_type(uplow,&uplo); get_diag_type(diagn,&diag); if (*order == TEST_ROW_MJR) { LDA = *n+1; A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; cblas_dtrsv(CblasRowMajor, uplo, trans, diag, *n, A, LDA, x, *incx ); free(A); } else cblas_dtrsv(CblasColMajor, uplo, trans, diag, *n, a, *lda, x, *incx ); } void F77_dsymv(int *order, char *uplow, int *n, double *alpha, double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy) { double *A; int i,j,LDA; enum CBLAS_UPLO uplo; get_uplo_type(uplow,&uplo); if (*order == TEST_ROW_MJR) { LDA = *n+1; A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; cblas_dsymv(CblasRowMajor, uplo, *n, *alpha, A, LDA, x, *incx, *beta, y, *incy ); free(A); } else cblas_dsymv(CblasColMajor, uplo, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy ); } void F77_dsyr(int *order, char *uplow, int *n, double *alpha, double *x, int *incx, double *a, int *lda) { double *A; int i,j,LDA; enum CBLAS_UPLO uplo; get_uplo_type(uplow,&uplo); if (*order == TEST_ROW_MJR) { LDA = *n+1; A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); 
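/*
 * Row-major path shared by these F77_* wrappers: the Fortran driver always
 * hands over a column-major matrix, so the wrapper transposes it into a
 * freshly allocated scratch buffer (here with leading dimension *n+1), calls
 * the CBLAS routine with CblasRowMajor, and, for routines that update the
 * matrix (dger, dsyr, dsyr2), copies the result back into the caller's
 * storage before freeing the buffer.
 */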
for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; cblas_dsyr(CblasRowMajor, uplo, *n, *alpha, x, *incx, A, LDA); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) a[ (*lda)*j+i ]=A[ LDA*i+j ]; free(A); } else cblas_dsyr(CblasColMajor, uplo, *n, *alpha, x, *incx, a, *lda); } void F77_dsyr2(int *order, char *uplow, int *n, double *alpha, double *x, int *incx, double *y, int *incy, double *a, int *lda) { double *A; int i,j,LDA; enum CBLAS_UPLO uplo; get_uplo_type(uplow,&uplo); if (*order == TEST_ROW_MJR) { LDA = *n+1; A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; cblas_dsyr2(CblasRowMajor, uplo, *n, *alpha, x, *incx, y, *incy, A, LDA); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) a[ (*lda)*j+i ]=A[ LDA*i+j ]; free(A); } else cblas_dsyr2(CblasColMajor, uplo, *n, *alpha, x, *incx, y, *incy, a, *lda); } void F77_dgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, double *alpha, double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy ) { double *A; int i,irow,j,jcol,LDA; enum CBLAS_TRANSPOSE trans; get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *ku+*kl+2; A = ( double* )malloc( (*n+*kl)*LDA*sizeof( double ) ); for( i=0; i<*ku; i++ ){ irow=*ku+*kl-i; jcol=(*ku)-i; for( j=jcol; j<*n; j++ ) A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; } i=*ku; irow=*ku+*kl-i; for( j=0; j<*n; j++ ) A[ LDA*j+irow ]=a[ (*lda)*j+i ]; for( i=*ku+1; i<*ku+*kl+1; i++ ){ irow=*ku+*kl-i; jcol=i-(*ku); for( j=jcol; j<(*n+*kl); j++ ) A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; } cblas_dgbmv( CblasRowMajor, trans, *m, *n, *kl, *ku, *alpha, A, LDA, x, *incx, *beta, y, *incy ); free(A); } else cblas_dgbmv( CblasColMajor, trans, *m, *n, *kl, *ku, *alpha, a, *lda, x, *incx, *beta, y, *incy ); } void F77_dtbmv(int *order, char *uplow, char *transp, char *diagn, int *n, int *k, double *a, int *lda, double *x, int *incx) { double *A; int irow, jcol, i, j, LDA; enum CBLAS_TRANSPOSE trans; enum CBLAS_UPLO uplo; enum CBLAS_DIAG diag; get_transpose_type(transp,&trans); get_uplo_type(uplow,&uplo); get_diag_type(diagn,&diag); if (*order == TEST_ROW_MJR) { LDA = *k+1; A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; jcol=(*k)-i; for( j=jcol; j<*n; j++ ) A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; } i=*k; irow=*k-i; for( j=0; j<*n; j++ ) A[ LDA*j+irow ]=a[ (*lda)*j+i ]; } else { i=0; irow=*k-i; for( j=0; j<*n; j++ ) A[ LDA*j+irow ]=a[ (*lda)*j+i ]; for( i=1; i<*k+1; i++ ){ irow=*k-i; jcol=i; for( j=jcol; j<(*n+*k); j++ ) A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; } } cblas_dtbmv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, *incx); free(A); } else cblas_dtbmv(CblasColMajor, uplo, trans, diag, *n, *k, a, *lda, x, *incx); } void F77_dtbsv(int *order, char *uplow, char *transp, char *diagn, int *n, int *k, double *a, int *lda, double *x, int *incx) { double *A; int irow, jcol, i, j, LDA; enum CBLAS_TRANSPOSE trans; enum CBLAS_UPLO uplo; enum CBLAS_DIAG diag; get_transpose_type(transp,&trans); get_uplo_type(uplow,&uplo); get_diag_type(diagn,&diag); if (*order == TEST_ROW_MJR) { LDA = *k+1; A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; jcol=(*k)-i; for( j=jcol; j<*n; j++ ) A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; } i=*k; irow=*k-i; for( j=0; j<*n; j++ ) A[ LDA*j+irow ]=a[ (*lda)*j+i ]; } else { i=0; irow=*k-i; for( j=0; j<*n; j++ ) A[ LDA*j+irow ]=a[ (*lda)*j+i ]; for( 
i=1; i<*k+1; i++ ){ irow=*k-i; jcol=i; for( j=jcol; j<(*n+*k); j++ ) A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; } } cblas_dtbsv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, *incx); free(A); } else cblas_dtbsv(CblasColMajor, uplo, trans, diag, *n, *k, a, *lda, x, *incx); } void F77_dsbmv(int *order, char *uplow, int *n, int *k, double *alpha, double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy) { double *A; int i,j,irow,jcol,LDA; enum CBLAS_UPLO uplo; get_uplo_type(uplow,&uplo); if (*order == TEST_ROW_MJR) { LDA = *k+1; A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; jcol=(*k)-i; for( j=jcol; j<*n; j++ ) A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; } i=*k; irow=*k-i; for( j=0; j<*n; j++ ) A[ LDA*j+irow ]=a[ (*lda)*j+i ]; } else { i=0; irow=*k-i; for( j=0; j<*n; j++ ) A[ LDA*j+irow ]=a[ (*lda)*j+i ]; for( i=1; i<*k+1; i++ ){ irow=*k-i; jcol=i; for( j=jcol; j<(*n+*k); j++ ) A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; } } cblas_dsbmv(CblasRowMajor, uplo, *n, *k, *alpha, A, LDA, x, *incx, *beta, y, *incy ); free(A); } else cblas_dsbmv(CblasColMajor, uplo, *n, *k, *alpha, a, *lda, x, *incx, *beta, y, *incy ); } void F77_dspmv(int *order, char *uplow, int *n, double *alpha, double *ap, double *x, int *incx, double *beta, double *y, int *incy) { double *A,*AP; int i,j,k,LDA; enum CBLAS_UPLO uplo; get_uplo_type(uplow,&uplo); if (*order == TEST_ROW_MJR) { LDA = *n; A = ( double* )malloc( LDA*LDA*sizeof( double ) ); AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) ); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) for( i=0; i #include "common.h" #include "cblas_test.h" #define TEST_COL_MJR 0 #define TEST_ROW_MJR 1 #define UNDEFINED -1 void F77_dgemm(int *order, char *transpa, char *transpb, int *m, int *n, int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc ) { double *A, *B, *C; int i,j,LDA, LDB, LDC; enum CBLAS_TRANSPOSE transa, transb; get_transpose_type(transpa, &transa); get_transpose_type(transpb, &transb); if (*order == TEST_ROW_MJR) { if (transa == CblasNoTrans) { LDA = *k+1; A = (double *)malloc( (*m)*LDA*sizeof( double ) ); for( i=0; i<*m; i++ ) for( j=0; j<*k; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } else { LDA = *m+1; A = ( double* )malloc( LDA*(*k)*sizeof( double ) ); for( i=0; i<*k; i++ ) for( j=0; j<*m; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } if (transb == CblasNoTrans) { LDB = *n+1; B = ( double* )malloc( (*k)*LDB*sizeof( double ) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ) B[i*LDB+j]=b[j*(*ldb)+i]; } else { LDB = *k+1; B = ( double* )malloc( LDB*(*n)*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) B[i*LDB+j]=b[j*(*ldb)+i]; } LDC = *n+1; C = ( double* )malloc( (*m)*LDC*sizeof( double ) ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) C[i*LDC+j]=c[j*(*ldc)+i]; cblas_dgemm( CblasRowMajor, transa, transb, *m, *n, *k, *alpha, A, LDA, B, LDB, *beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) c[j*(*ldc)+i]=C[i*LDC+j]; free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_dgemm( CblasColMajor, transa, transb, *m, *n, *k, *alpha, a, *lda, b, *ldb, *beta, c, *ldc ); else cblas_dgemm( UNDEFINED, transa, transb, *m, *n, *k, *alpha, a, *lda, b, *ldb, *beta, c, *ldc ); } void F77_dsymm(int *order, char *rtlf, char *uplow, int *m, int *n, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc ) { double *A, *B, *C; int i,j,LDA, LDB, LDC; enum CBLAS_UPLO uplo; enum CBLAS_SIDE side; 
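/*
 * The get_*_type() helpers map the character flags passed down from the
 * Fortran tester (uplo, side, transpose and diag, presumably in the usual
 * BLAS 'U'/'L', 'L'/'R', 'N'/'T'/'C' and 'U'/'N' conventions) onto the CBLAS
 * enums; *order then selects between the TEST_ROW_MJR transpose-and-copy
 * path and a direct column-major call.
 */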
get_uplo_type(uplow,&uplo); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } else{ LDA = *n+1; A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } LDB = *n+1; B = ( double* )malloc( (*m)*LDB*sizeof( double ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) B[i*LDB+j]=b[j*(*ldb)+i]; LDC = *n+1; C = ( double* )malloc( (*m)*LDC*sizeof( double ) ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) C[i*LDC+j]=c[j*(*ldc)+i]; cblas_dsymm( CblasRowMajor, side, uplo, *m, *n, *alpha, A, LDA, B, LDB, *beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) c[j*(*ldc)+i]=C[i*LDC+j]; free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_dsymm( CblasColMajor, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb, *beta, c, *ldc ); else cblas_dsymm( UNDEFINED, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb, *beta, c, *ldc ); } void F77_dsyrk(int *order, char *uplow, char *transp, int *n, int *k, double *alpha, double *a, int *lda, double *beta, double *c, int *ldc ) { int i,j,LDA,LDC; double *A, *C; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); if (*order == TEST_ROW_MJR) { if (trans == CblasNoTrans) { LDA = *k+1; A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } else{ LDA = *n+1; A = ( double* )malloc( (*k)*LDA*sizeof( double ) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } LDC = *n+1; C = ( double* )malloc( (*n)*LDC*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) C[i*LDC+j]=c[j*(*ldc)+i]; cblas_dsyrk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) c[j*(*ldc)+i]=C[i*LDC+j]; free(A); free(C); } else if (*order == TEST_COL_MJR) cblas_dsyrk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, c, *ldc ); else cblas_dsyrk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, c, *ldc ); } void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc ) { int i,j,LDA,LDB,LDC; double *A, *B, *C; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); if (*order == TEST_ROW_MJR) { if (trans == CblasNoTrans) { LDA = *k+1; LDB = *k+1; A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); B = ( double* )malloc( (*n)*LDB*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j]=a[j*(*lda)+i]; B[i*LDB+j]=b[j*(*ldb)+i]; } } else { LDA = *n+1; LDB = *n+1; A = ( double* )malloc( LDA*(*k)*sizeof( double ) ); B = ( double* )malloc( LDB*(*k)*sizeof( double ) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ){ A[i*LDA+j]=a[j*(*lda)+i]; B[i*LDB+j]=b[j*(*ldb)+i]; } } LDC = *n+1; C = ( double* )malloc( (*n)*LDC*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) C[i*LDC+j]=c[j*(*ldc)+i]; cblas_dsyr2k(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, B, LDB, *beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) c[j*(*ldc)+i]=C[i*LDC+j]; free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_dsyr2k(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, b, *ldb, *beta, c, *ldc ); else cblas_dsyr2k(UNDEFINED, uplo, trans, *n, *k, *alpha, a, 
*lda, b, *ldb, *beta, c, *ldc ); } void F77_dtrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, int *m, int *n, double *alpha, double *a, int *lda, double *b, int *ldb) { int i,j,LDA,LDB; double *A, *B; enum CBLAS_SIDE side; enum CBLAS_DIAG diag; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); get_diag_type(diagn,&diag); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } else{ LDA = *n+1; A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } LDB = *n+1; B = ( double* )malloc( (*m)*LDB*sizeof( double ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) B[i*LDB+j]=b[j*(*ldb)+i]; cblas_dtrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha, A, LDA, B, LDB ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) b[j*(*ldb)+i]=B[i*LDB+j]; free(A); free(B); } else if (*order == TEST_COL_MJR) cblas_dtrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha, a, *lda, b, *ldb); else cblas_dtrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha, a, *lda, b, *ldb); } void F77_dtrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, int *m, int *n, double *alpha, double *a, int *lda, double *b, int *ldb) { int i,j,LDA,LDB; double *A, *B; enum CBLAS_SIDE side; enum CBLAS_DIAG diag; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); get_diag_type(diagn,&diag); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } else{ LDA = *n+1; A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } LDB = *n+1; B = ( double* )malloc( (*m)*LDB*sizeof( double ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) B[i*LDB+j]=b[j*(*ldb)+i]; cblas_dtrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha, A, LDA, B, LDB ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) b[j*(*ldb)+i]=B[i*LDB+j]; free(A); free(B); } else if (*order == TEST_COL_MJR) cblas_dtrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha, a, *lda, b, *ldb); else cblas_dtrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha, a, *lda, b, *ldb); } OpenBLAS-0.2.20/ctest/c_dblat1.f000066400000000000000000000700131313527062700161070ustar00rootroot00000000000000 PROGRAM DCBLAT1 * Test program for the DOUBLE PRECISION Level 1 CBLAS. * Based upon the original CBLAS test routine together with: * F06EAF Example Program Text * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. DOUBLE PRECISION SFAC INTEGER IC * .. External Subroutines .. EXTERNAL CHECK0, CHECK1, CHECK2, CHECK3, HEADER * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA SFAC/9.765625D-4/ * .. Executable Statements .. WRITE (NOUT,99999) DO 20 IC = 1, 10 ICASE = IC CALL HEADER * * .. Initialize PASS, INCX, INCY, and MODE for a new case. .. * .. the value 9999 for INCX, INCY or MODE will appear in the .. * .. detailed output, if any, for cases that do not involve .. * .. these parameters .. * PASS = .TRUE. 
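*     .. ICASE selects the routine printed by HEADER: case 3 (DROTG) is ..
*     .. handled by CHECK0, cases 7 to 10 (DNRM2, DASUM, DSCAL, IDAMAX) ..
*     .. by CHECK1, cases 1, 2, 5 and 6 (DDOT, DAXPY, DCOPY, DSWAP) by ..
*     .. CHECK2, and case 4 (DROT) by CHECK3 ..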
INCX = 9999 INCY = 9999 MODE = 9999 IF (ICASE.EQ.3) THEN CALL CHECK0(SFAC) ELSE IF (ICASE.EQ.7 .OR. ICASE.EQ.8 .OR. ICASE.EQ.9 .OR. + ICASE.EQ.10) THEN CALL CHECK1(SFAC) ELSE IF (ICASE.EQ.1 .OR. ICASE.EQ.2 .OR. ICASE.EQ.5 .OR. + ICASE.EQ.6) THEN CALL CHECK2(SFAC) ELSE IF (ICASE.EQ.4) THEN CALL CHECK3(SFAC) END IF * -- Print IF (PASS) WRITE (NOUT,99998) 20 CONTINUE STOP * 99999 FORMAT (' Real CBLAS Test Program Results',/1X) 99998 FORMAT (' ----- PASS -----') END SUBROUTINE HEADER * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Arrays .. CHARACTER*15 L(10) * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA L(1)/'CBLAS_DDOT'/ DATA L(2)/'CBLAS_DAXPY '/ DATA L(3)/'CBLAS_DROTG '/ DATA L(4)/'CBLAS_DROT '/ DATA L(5)/'CBLAS_DCOPY '/ DATA L(6)/'CBLAS_DSWAP '/ DATA L(7)/'CBLAS_DNRM2 '/ DATA L(8)/'CBLAS_DASUM '/ DATA L(9)/'CBLAS_DSCAL '/ DATA L(10)/'CBLAS_IDAMAX'/ * .. Executable Statements .. WRITE (NOUT,99999) ICASE, L(ICASE) RETURN * 99999 FORMAT (/' Test of subprogram number',I3,9X,A15) END SUBROUTINE CHECK0(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. DOUBLE PRECISION SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. DOUBLE PRECISION SA, SB, SC, SS INTEGER K * .. Local Arrays .. DOUBLE PRECISION DA1(8), DATRUE(8), DB1(8), DBTRUE(8), DC1(8), + DS1(8) * .. External Subroutines .. EXTERNAL DROTGTEST, STEST1 * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA DA1/0.3D0, 0.4D0, -0.3D0, -0.4D0, -0.3D0, 0.0D0, + 0.0D0, 1.0D0/ DATA DB1/0.4D0, 0.3D0, 0.4D0, 0.3D0, -0.4D0, 0.0D0, + 1.0D0, 0.0D0/ DATA DC1/0.6D0, 0.8D0, -0.6D0, 0.8D0, 0.6D0, 1.0D0, + 0.0D0, 1.0D0/ DATA DS1/0.8D0, 0.6D0, 0.8D0, -0.6D0, 0.8D0, 0.0D0, + 1.0D0, 0.0D0/ DATA DATRUE/0.5D0, 0.5D0, 0.5D0, -0.5D0, -0.5D0, + 0.0D0, 1.0D0, 1.0D0/ DATA DBTRUE/0.0D0, 0.6D0, 0.0D0, -0.6D0, 0.0D0, + 0.0D0, 1.0D0, 0.0D0/ * .. Executable Statements .. * * Compute true values which cannot be prestored * in decimal notation * DBTRUE(1) = 1.0D0/0.6D0 DBTRUE(3) = -1.0D0/0.6D0 DBTRUE(5) = 1.0D0/0.6D0 * DO 20 K = 1, 8 * .. Set N=K for identification in output if any .. N = K IF (ICASE.EQ.3) THEN * .. DROTGTEST .. IF (K.GT.8) GO TO 40 SA = DA1(K) SB = DB1(K) CALL DROTGTEST(SA,SB,SC,SS) CALL STEST1(SA,DATRUE(K),DATRUE(K),SFAC) CALL STEST1(SB,DBTRUE(K),DBTRUE(K),SFAC) CALL STEST1(SC,DC1(K),DC1(K),SFAC) CALL STEST1(SS,DS1(K),DS1(K),SFAC) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' STOP END IF 20 CONTINUE 40 RETURN END SUBROUTINE CHECK1(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. DOUBLE PRECISION SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. INTEGER I, LEN, NP1 * .. Local Arrays .. DOUBLE PRECISION DTRUE1(5), DTRUE3(5), DTRUE5(8,5,2), DV(8,5,2), + SA(10), STEMP(1), STRUE(8), SX(8) INTEGER ITRUE2(5) * .. External Functions .. DOUBLE PRECISION DASUMTEST, DNRM2TEST INTEGER IDAMAXTEST EXTERNAL DASUMTEST, DNRM2TEST, IDAMAXTEST * .. External Subroutines .. EXTERNAL ITEST1, DSCALTEST, STEST, STEST1 * .. Intrinsic Functions .. INTRINSIC MAX * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. 
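*     .. DV(I,NP1,INCX) holds the input vector for length N = NP1-1 and ..
*     .. stride INCX; DTRUE1/DTRUE3 are the expected DNRM2/DASUM results, ..
*     .. DTRUE5 the expected vector after DSCAL, and ITRUE2 the expected ..
*     .. IDAMAX index ..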
DATA SA/0.3D0, -1.0D0, 0.0D0, 1.0D0, 0.3D0, 0.3D0, + 0.3D0, 0.3D0, 0.3D0, 0.3D0/ DATA DV/0.1D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, + 2.0D0, 2.0D0, 0.3D0, 3.0D0, 3.0D0, 3.0D0, 3.0D0, + 3.0D0, 3.0D0, 3.0D0, 0.3D0, -0.4D0, 4.0D0, + 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, 0.2D0, + -0.6D0, 0.3D0, 5.0D0, 5.0D0, 5.0D0, 5.0D0, + 5.0D0, 0.1D0, -0.3D0, 0.5D0, -0.1D0, 6.0D0, + 6.0D0, 6.0D0, 6.0D0, 0.1D0, 8.0D0, 8.0D0, 8.0D0, + 8.0D0, 8.0D0, 8.0D0, 8.0D0, 0.3D0, 9.0D0, 9.0D0, + 9.0D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, 0.3D0, 2.0D0, + -0.4D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, + 0.2D0, 3.0D0, -0.6D0, 5.0D0, 0.3D0, 2.0D0, + 2.0D0, 2.0D0, 0.1D0, 4.0D0, -0.3D0, 6.0D0, + -0.5D0, 7.0D0, -0.1D0, 3.0D0/ DATA DTRUE1/0.0D0, 0.3D0, 0.5D0, 0.7D0, 0.6D0/ DATA DTRUE3/0.0D0, 0.3D0, 0.7D0, 1.1D0, 1.0D0/ DATA DTRUE5/0.10D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, + 2.0D0, 2.0D0, 2.0D0, -0.3D0, 3.0D0, 3.0D0, + 3.0D0, 3.0D0, 3.0D0, 3.0D0, 3.0D0, 0.0D0, 0.0D0, + 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, + 0.20D0, -0.60D0, 0.30D0, 5.0D0, 5.0D0, 5.0D0, + 5.0D0, 5.0D0, 0.03D0, -0.09D0, 0.15D0, -0.03D0, + 6.0D0, 6.0D0, 6.0D0, 6.0D0, 0.10D0, 8.0D0, + 8.0D0, 8.0D0, 8.0D0, 8.0D0, 8.0D0, 8.0D0, + 0.09D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, + 9.0D0, 9.0D0, 0.09D0, 2.0D0, -0.12D0, 2.0D0, + 2.0D0, 2.0D0, 2.0D0, 2.0D0, 0.06D0, 3.0D0, + -0.18D0, 5.0D0, 0.09D0, 2.0D0, 2.0D0, 2.0D0, + 0.03D0, 4.0D0, -0.09D0, 6.0D0, -0.15D0, 7.0D0, + -0.03D0, 3.0D0/ DATA ITRUE2/0, 1, 2, 2, 3/ * .. Executable Statements .. DO 80 INCX = 1, 2 DO 60 NP1 = 1, 5 N = NP1 - 1 LEN = 2*MAX(N,1) * .. Set vector arguments .. DO 20 I = 1, LEN SX(I) = DV(I,NP1,INCX) 20 CONTINUE * IF (ICASE.EQ.7) THEN * .. DNRM2TEST .. STEMP(1) = DTRUE1(NP1) CALL STEST1(DNRM2TEST(N,SX,INCX),STEMP,STEMP,SFAC) ELSE IF (ICASE.EQ.8) THEN * .. DASUMTEST .. STEMP(1) = DTRUE3(NP1) CALL STEST1(DASUMTEST(N,SX,INCX),STEMP,STEMP,SFAC) ELSE IF (ICASE.EQ.9) THEN * .. DSCALTEST .. CALL DSCALTEST(N,SA((INCX-1)*5+NP1),SX,INCX) DO 40 I = 1, LEN STRUE(I) = DTRUE5(I,NP1,INCX) 40 CONTINUE CALL STEST(LEN,SX,STRUE,STRUE,SFAC) ELSE IF (ICASE.EQ.10) THEN * .. IDAMAXTEST .. CALL ITEST1(IDAMAXTEST(N,SX,INCX),ITRUE2(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' STOP END IF 60 CONTINUE 80 CONTINUE RETURN END SUBROUTINE CHECK2(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. DOUBLE PRECISION SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. DOUBLE PRECISION SA INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY * .. Local Arrays .. DOUBLE PRECISION DT10X(7,4,4), DT10Y(7,4,4), DT7(4,4), + DT8(7,4,4), DX1(7), + DY1(7), SSIZE1(4), SSIZE2(14,2), STX(7), STY(7), + SX(7), SY(7) INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) * .. External Functions .. EXTERNAL DDOTTEST DOUBLE PRECISION DDOTTEST * .. External Subroutines .. EXTERNAL DAXPYTEST, DCOPYTEST, DSWAPTEST, STEST, STEST1 * .. Intrinsic Functions .. INTRINSIC ABS, MIN * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. 
DATA SA/0.3D0/ DATA INCXS/1, 2, -2, -1/ DATA INCYS/1, -2, 1, -2/ DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ DATA NS/0, 1, 2, 4/ DATA DX1/0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.9D0, -0.3D0, + -0.4D0/ DATA DY1/0.5D0, -0.9D0, 0.3D0, 0.7D0, -0.6D0, 0.2D0, + 0.8D0/ DATA DT7/0.0D0, 0.30D0, 0.21D0, 0.62D0, 0.0D0, + 0.30D0, -0.07D0, 0.85D0, 0.0D0, 0.30D0, -0.79D0, + -0.74D0, 0.0D0, 0.30D0, 0.33D0, 1.27D0/ DATA DT8/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.68D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.68D0, -0.87D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.68D0, -0.87D0, 0.15D0, + 0.94D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.68D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.35D0, -0.9D0, 0.48D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.38D0, -0.9D0, 0.57D0, 0.7D0, -0.75D0, + 0.2D0, 0.98D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.68D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.35D0, -0.72D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.38D0, + -0.63D0, 0.15D0, 0.88D0, 0.0D0, 0.0D0, 0.0D0, + 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.68D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.68D0, -0.9D0, 0.33D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.68D0, -0.9D0, 0.33D0, 0.7D0, + -0.75D0, 0.2D0, 1.04D0/ DATA DT10X/0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.5D0, -0.9D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.5D0, -0.9D0, 0.3D0, 0.7D0, + 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.3D0, 0.1D0, 0.5D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.8D0, 0.1D0, -0.6D0, + 0.8D0, 0.3D0, -0.3D0, 0.5D0, 0.6D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.9D0, + 0.1D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.7D0, + 0.1D0, 0.3D0, 0.8D0, -0.9D0, -0.3D0, 0.5D0, + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.5D0, 0.3D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.5D0, 0.3D0, -0.6D0, 0.8D0, 0.0D0, 0.0D0, + 0.0D0/ DATA DT10Y/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.6D0, 0.1D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.0D0, + 0.0D0, 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, -0.5D0, -0.9D0, 0.6D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, -0.4D0, -0.9D0, 0.9D0, + 0.7D0, -0.5D0, 0.2D0, 0.6D0, 0.5D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.5D0, + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + -0.4D0, 0.9D0, -0.5D0, 0.6D0, 0.0D0, 0.0D0, + 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.6D0, -0.9D0, 0.1D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.6D0, -0.9D0, 0.1D0, 0.7D0, + -0.5D0, 0.2D0, 0.8D0/ DATA SSIZE1/0.0D0, 0.3D0, 1.6D0, 3.2D0/ DATA SSIZE2/0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + 1.17D0, 1.17D0, 1.17D0/ * .. Executable Statements .. * DO 120 KI = 1, 4 INCX = INCXS(KI) INCY = INCYS(KI) MX = ABS(INCX) MY = ABS(INCY) * DO 100 KN = 1, 4 N = NS(KN) KSIZE = MIN(2,KN) LENX = LENS(KN,MX) LENY = LENS(KN,MY) * .. Initialize all argument arrays .. DO 20 I = 1, 7 SX(I) = DX1(I) SY(I) = DY1(I) 20 CONTINUE * IF (ICASE.EQ.1) THEN * .. DDOTTEST .. 
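*           .. compare DDOT against the prestored value DT7(KN,KI) for ..
*           .. this combination of length NS(KN) and strides INCXS(KI), ..
*           .. INCYS(KI), with tolerance scaled by SFAC ..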
CALL STEST1(DDOTTEST(N,SX,INCX,SY,INCY),DT7(KN,KI), + SSIZE1(KN),SFAC) ELSE IF (ICASE.EQ.2) THEN * .. DAXPYTEST .. CALL DAXPYTEST(N,SA,SX,INCX,SY,INCY) DO 40 J = 1, LENY STY(J) = DT8(J,KN,KI) 40 CONTINUE CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) ELSE IF (ICASE.EQ.5) THEN * .. DCOPYTEST .. DO 60 I = 1, 7 STY(I) = DT10Y(I,KN,KI) 60 CONTINUE CALL DCOPYTEST(N,SX,INCX,SY,INCY) CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0) ELSE IF (ICASE.EQ.6) THEN * .. DSWAPTEST .. CALL DSWAPTEST(N,SX,INCX,SY,INCY) DO 80 I = 1, 7 STX(I) = DT10X(I,KN,KI) STY(I) = DT10Y(I,KN,KI) 80 CONTINUE CALL STEST(LENX,SX,STX,SSIZE2(1,1),1.0D0) CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' STOP END IF 100 CONTINUE 120 CONTINUE RETURN END SUBROUTINE CHECK3(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. DOUBLE PRECISION SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. DOUBLE PRECISION SC, SS INTEGER I, K, KI, KN, KSIZE, LENX, LENY, MX, MY * .. Local Arrays .. DOUBLE PRECISION COPYX(5), COPYY(5), DT9X(7,4,4), DT9Y(7,4,4), + DX1(7), DY1(7), MWPC(11), MWPS(11), MWPSTX(5), + MWPSTY(5), MWPTX(11,5), MWPTY(11,5), MWPX(5), + MWPY(5), SSIZE2(14,2), STX(7), STY(7), SX(7), + SY(7) INTEGER INCXS(4), INCYS(4), LENS(4,2), MWPINX(11), + MWPINY(11), MWPN(11), NS(4) * .. External Subroutines .. EXTERNAL STEST,DROTTEST * .. Intrinsic Functions .. INTRINSIC ABS, MIN * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA INCXS/1, 2, -2, -1/ DATA INCYS/1, -2, 1, -2/ DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ DATA NS/0, 1, 2, 4/ DATA DX1/0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.9D0, -0.3D0, + -0.4D0/ DATA DY1/0.5D0, -0.9D0, 0.3D0, 0.7D0, -0.6D0, 0.2D0, + 0.8D0/ DATA SC, SS/0.8D0, 0.6D0/ DATA DT9X/0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.78D0, -0.46D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.78D0, -0.46D0, -0.22D0, + 1.06D0, 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.78D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.66D0, 0.1D0, -0.1D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.96D0, 0.1D0, -0.76D0, 0.8D0, 0.90D0, + -0.3D0, -0.02D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.78D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.06D0, 0.1D0, + -0.1D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.90D0, + 0.1D0, -0.22D0, 0.8D0, 0.18D0, -0.3D0, -0.02D0, + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.78D0, 0.26D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.78D0, 0.26D0, -0.76D0, 1.12D0, + 0.0D0, 0.0D0, 0.0D0/ DATA DT9Y/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.54D0, + 0.08D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.04D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.7D0, + -0.9D0, -0.12D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.64D0, -0.9D0, -0.30D0, 0.7D0, -0.18D0, 0.2D0, + 0.28D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.7D0, -1.08D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.64D0, -1.26D0, + 0.54D0, 0.20D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.7D0, + -0.18D0, 0.2D0, 0.16D0/ 
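* SC = 0.8 and SS = 0.6 define the plane rotation applied by
* DROTTEST; DT9X and DT9Y hold the expected X and Y afterwards,
* indexed by element, N-case and increment-case.  For example,
* with INCX = INCY = 1 and N = 1 the rotated values are
*    c*x + s*y =  0.8*0.6 + 0.6*0.5 = 0.78  and
*   -s*x + c*y = -0.6*0.6 + 0.8*0.5 = 0.04,
* matching DT9X(1,2,1) and DT9Y(1,2,1).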
DATA SSIZE2/0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + 1.17D0, 1.17D0, 1.17D0/ * .. Executable Statements .. * DO 60 KI = 1, 4 INCX = INCXS(KI) INCY = INCYS(KI) MX = ABS(INCX) MY = ABS(INCY) * DO 40 KN = 1, 4 N = NS(KN) KSIZE = MIN(2,KN) LENX = LENS(KN,MX) LENY = LENS(KN,MY) * IF (ICASE.EQ.4) THEN * .. DROTTEST .. DO 20 I = 1, 7 SX(I) = DX1(I) SY(I) = DY1(I) STX(I) = DT9X(I,KN,KI) STY(I) = DT9Y(I,KN,KI) 20 CONTINUE CALL DROTTEST(N,SX,INCX,SY,INCY,SC,SS) CALL STEST(LENX,SX,STX,SSIZE2(1,KSIZE),SFAC) CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' STOP END IF 40 CONTINUE 60 CONTINUE * MWPC(1) = 1 DO 80 I = 2, 11 MWPC(I) = 0 80 CONTINUE MWPS(1) = 0.0 DO 100 I = 2, 6 MWPS(I) = 1.0 100 CONTINUE DO 120 I = 7, 11 MWPS(I) = -1.0 120 CONTINUE MWPINX(1) = 1 MWPINX(2) = 1 MWPINX(3) = 1 MWPINX(4) = -1 MWPINX(5) = 1 MWPINX(6) = -1 MWPINX(7) = 1 MWPINX(8) = 1 MWPINX(9) = -1 MWPINX(10) = 1 MWPINX(11) = -1 MWPINY(1) = 1 MWPINY(2) = 1 MWPINY(3) = -1 MWPINY(4) = -1 MWPINY(5) = 2 MWPINY(6) = 1 MWPINY(7) = 1 MWPINY(8) = -1 MWPINY(9) = -1 MWPINY(10) = 2 MWPINY(11) = 1 DO 140 I = 1, 11 MWPN(I) = 5 140 CONTINUE MWPN(5) = 3 MWPN(10) = 3 DO 160 I = 1, 5 MWPX(I) = I MWPY(I) = I MWPTX(1,I) = I MWPTY(1,I) = I MWPTX(2,I) = I MWPTY(2,I) = -I MWPTX(3,I) = 6 - I MWPTY(3,I) = I - 6 MWPTX(4,I) = I MWPTY(4,I) = -I MWPTX(6,I) = 6 - I MWPTY(6,I) = I - 6 MWPTX(7,I) = -I MWPTY(7,I) = I MWPTX(8,I) = I - 6 MWPTY(8,I) = 6 - I MWPTX(9,I) = -I MWPTY(9,I) = I MWPTX(11,I) = I - 6 MWPTY(11,I) = 6 - I 160 CONTINUE MWPTX(5,1) = 1 MWPTX(5,2) = 3 MWPTX(5,3) = 5 MWPTX(5,4) = 4 MWPTX(5,5) = 5 MWPTY(5,1) = -1 MWPTY(5,2) = 2 MWPTY(5,3) = -2 MWPTY(5,4) = 4 MWPTY(5,5) = -3 MWPTX(10,1) = -1 MWPTX(10,2) = -3 MWPTX(10,3) = -5 MWPTX(10,4) = 4 MWPTX(10,5) = 5 MWPTY(10,1) = 1 MWPTY(10,2) = 2 MWPTY(10,3) = 2 MWPTY(10,4) = 4 MWPTY(10,5) = 3 DO 200 I = 1, 11 INCX = MWPINX(I) INCY = MWPINY(I) DO 180 K = 1, 5 COPYX(K) = MWPX(K) COPYY(K) = MWPY(K) MWPSTX(K) = MWPTX(I,K) MWPSTY(K) = MWPTY(I,K) 180 CONTINUE CALL DROTTEST(MWPN(I),COPYX,INCX,COPYY,INCY,MWPC(I),MWPS(I)) CALL STEST(5,COPYX,MWPSTX,MWPSTX,SFAC) CALL STEST(5,COPYY,MWPSTY,MWPSTY,SFAC) 200 CONTINUE RETURN END SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) * ********************************* STEST ************************** * * THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO * SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE * NEGLIGIBLE. * * C. L. LAWSON, JPL, 1974 DEC 10 * * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. DOUBLE PRECISION SFAC INTEGER LEN * .. Array Arguments .. DOUBLE PRECISION SCOMP(LEN), SSIZE(LEN), STRUE(LEN) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. DOUBLE PRECISION SD INTEGER I * .. External Functions .. DOUBLE PRECISION SDIFF EXTERNAL SDIFF * .. Intrinsic Functions .. INTRINSIC ABS * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Executable Statements .. * DO 40 I = 1, LEN SD = SCOMP(I) - STRUE(I) IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0D0) + GO TO 40 * * HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). * IF ( .NOT. PASS) GO TO 20 * PRINT FAIL MESSAGE AND HEADER. PASS = .FALSE. 
WRITE (NOUT,99999) WRITE (NOUT,99998) 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + STRUE(I), SD, SSIZE(I) 40 CONTINUE RETURN * 99999 FORMAT (' FAIL') 99998 FORMAT (/' CASE N INCX INCY MODE I ', + ' COMP(I) TRUE(I) DIFFERENCE', + ' SIZE(I)',/1X) 99997 FORMAT (1X,I4,I3,3I5,I3,2D36.8,2D12.4) END SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * * THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * * C.L. LAWSON, JPL, 1978 DEC 6 * * .. Scalar Arguments .. DOUBLE PRECISION SCOMP1, SFAC, STRUE1 * .. Array Arguments .. DOUBLE PRECISION SSIZE(*) * .. Local Arrays .. DOUBLE PRECISION SCOMP(1), STRUE(1) * .. External Subroutines .. EXTERNAL STEST * .. Executable Statements .. * SCOMP(1) = SCOMP1 STRUE(1) = STRUE1 CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) * RETURN END DOUBLE PRECISION FUNCTION SDIFF(SA,SB) * ********************************* SDIFF ************************** * COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 * * .. Scalar Arguments .. DOUBLE PRECISION SA, SB * .. Executable Statements .. SDIFF = SA - SB RETURN END SUBROUTINE ITEST1(ICOMP,ITRUE) * ********************************* ITEST1 ************************* * * THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR * EQUALITY. * C. L. LAWSON, JPL, 1974 DEC 10 * * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. INTEGER ICOMP, ITRUE * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. INTEGER ID * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Executable Statements .. * IF (ICOMP.EQ.ITRUE) GO TO 40 * * HERE ICOMP IS NOT EQUAL TO ITRUE. * IF ( .NOT. PASS) GO TO 20 * PRINT FAIL MESSAGE AND HEADER. PASS = .FALSE. WRITE (NOUT,99999) WRITE (NOUT,99998) 20 ID = ICOMP - ITRUE WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID 40 CONTINUE RETURN * 99999 FORMAT (' FAIL') 99998 FORMAT (/' CASE N INCX INCY MODE ', + ' COMP TRUE DIFFERENCE', + /1X) 99997 FORMAT (1X,I4,I3,3I5,2I36,I12) END OpenBLAS-0.2.20/ctest/c_dblat2.f000066400000000000000000003150761313527062700161230ustar00rootroot00000000000000 PROGRAM DBLAT2 * * Test program for the DOUBLE PRECISION Level 2 Blas. * * The program must be driven by a short data file. The first 17 records * of the file are read using list-directed input, the last 16 records * are read using the format ( A12, L2 ). An annotated example of a data * file can be obtained by deleting the first 3 characters from the * following 33 lines: * 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO * 6 NUMBER OF VALUES OF N * 0 1 2 3 5 9 VALUES OF N * 4 NUMBER OF VALUES OF K * 0 1 2 4 VALUES OF K * 4 NUMBER OF VALUES OF INCX AND INCY * 1 2 -1 -2 VALUES OF INCX AND INCY * 3 NUMBER OF VALUES OF ALPHA * 0.0 1.0 0.7 VALUES OF ALPHA * 3 NUMBER OF VALUES OF BETA * 0.0 1.0 0.9 VALUES OF BETA * cblas_dgemv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_dgbmv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_dsymv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_dsbmv T PUT F FOR NO TEST. SAME COLUMNS. 
* cblas_dspmv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_dtrmv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_dtbmv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_dtpmv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_dtrsv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_dtbsv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_dtpsv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_dger T PUT F FOR NO TEST. SAME COLUMNS. * cblas_dsyr T PUT F FOR NO TEST. SAME COLUMNS. * cblas_dspr T PUT F FOR NO TEST. SAME COLUMNS. * cblas_dsyr2 T PUT F FOR NO TEST. SAME COLUMNS. * cblas_dspr2 T PUT F FOR NO TEST. SAME COLUMNS. * * See: * * Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. * An extended set of Fortran Basic Linear Algebra Subprograms. * * Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics * and Computer Science Division, Argonne National Laboratory, * 9700 South Cass Avenue, Argonne, Illinois 60439, US. * * Or * * NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms * Group Ltd., NAG Central Office, 256 Banbury Road, Oxford * OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st * Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. * * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. INTEGER NIN, NOUT PARAMETER ( NIN = 5, NOUT = 6 ) INTEGER NSUBS PARAMETER ( NSUBS = 16 ) DOUBLE PRECISION ZERO, HALF, ONE PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, $ NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. DOUBLE PRECISION EPS, ERR, THRESH INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, $ NTRA, LAYOUT LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, $ TSTERR, CORDER, RORDER CHARACTER*1 TRANS CHARACTER*12 SNAMET CHARACTER*32 SNAPS * .. Local Arrays .. DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), $ G( NMAX ), X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( 2*NMAX ) INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) LOGICAL LTEST( NSUBS ) CHARACTER*12 SNAMES( NSUBS ) * .. External Functions .. DOUBLE PRECISION DDIFF LOGICAL LDE EXTERNAL DDIFF, LDE * .. External Subroutines .. EXTERNAL DCHK1, DCHK2, DCHK3, DCHK4, DCHK5, DCHK6, $ CD2CHKE, DMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK CHARACTER*12 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK COMMON /SRNAMC/SRNAMT * .. Data statements .. DATA SNAMES/'cblas_dgemv ', 'cblas_dgbmv ', $ 'cblas_dsymv ','cblas_dsbmv ','cblas_dspmv ', $ 'cblas_dtrmv ','cblas_dtbmv ','cblas_dtpmv ', $ 'cblas_dtrsv ','cblas_dtbsv ','cblas_dtpsv ', $ 'cblas_dger ','cblas_dsyr ','cblas_dspr ', $ 'cblas_dsyr2 ','cblas_dspr2 '/ * .. Executable Statements .. * NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. * READ( NIN, FMT = * )SNAPS READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN OPEN( NTRA, FILE = SNAPS ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI REWI = REWI.AND.TRACE * Read the flag that directs stopping on any failure. READ( NIN, FMT = * )SFATAL * Read the flag that indicates whether error exits are to be tested. READ( NIN, FMT = * )TSTERR * Read the flag that indicates whether row-major data layout to be tested. 
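* LAYOUT selects the storage orders exercised by the CBLAS
* wrappers: 0 tests column-major only, 1 row-major only, 2 both.
* It sets the CORDER and RORDER flags used when dispatching to
* the check routines.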
READ( NIN, FMT = * )LAYOUT * Read the threshold value of the test ratio READ( NIN, FMT = * )THRESH * * Read and check the parameter values for the tests. * * Values of N READ( NIN, FMT = * )NIDIM IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN WRITE( NOUT, FMT = 9997 )'N', NIDMAX GO TO 230 END IF READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) DO 10 I = 1, NIDIM IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN WRITE( NOUT, FMT = 9996 )NMAX GO TO 230 END IF 10 CONTINUE * Values of K READ( NIN, FMT = * )NKB IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN WRITE( NOUT, FMT = 9997 )'K', NKBMAX GO TO 230 END IF READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) DO 20 I = 1, NKB IF( KB( I ).LT.0 )THEN WRITE( NOUT, FMT = 9995 ) GO TO 230 END IF 20 CONTINUE * Values of INCX and INCY READ( NIN, FMT = * )NINC IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX GO TO 230 END IF READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) DO 30 I = 1, NINC IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN WRITE( NOUT, FMT = 9994 )INCMAX GO TO 230 END IF 30 CONTINUE * Values of ALPHA READ( NIN, FMT = * )NALF IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX GO TO 230 END IF READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) * Values of BETA READ( NIN, FMT = * )NBET IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX GO TO 230 END IF READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) * * Report values of parameters. * WRITE( NOUT, FMT = 9993 ) WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) IF( .NOT.TSTERR )THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )THRESH WRITE( NOUT, FMT = * ) RORDER = .FALSE. CORDER = .FALSE. IF (LAYOUT.EQ.2) THEN RORDER = .TRUE. CORDER = .TRUE. WRITE( *, FMT = 10002 ) ELSE IF (LAYOUT.EQ.1) THEN RORDER = .TRUE. WRITE( *, FMT = 10001 ) ELSE IF (LAYOUT.EQ.0) THEN CORDER = .TRUE. WRITE( *, FMT = 10000 ) END IF WRITE( *, FMT = * ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 40 I = 1, NSUBS LTEST( I ) = .FALSE. 40 CONTINUE 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT DO 60 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET STOP 70 LTEST( I ) = LTESTT GO TO 50 * 80 CONTINUE CLOSE ( NIN ) * * Compute EPS (the machine precision). * EPS = ONE 90 CONTINUE IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) $ GO TO 100 EPS = HALF*EPS GO TO 90 100 CONTINUE EPS = EPS + EPS WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of DMVCH using exact data. * N = MIN( 32, NMAX ) DO 120 J = 1, N DO 110 I = 1, N A( I, J ) = MAX( I - J + 1, 0 ) 110 CONTINUE X( J ) = J Y( J ) = ZERO 120 CONTINUE DO 130 J = 1, N YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 130 CONTINUE * YY holds the exact result. On exit from DMVCH YT holds * the result computed by DMVCH. TRANS = 'N' CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LDE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR STOP END IF TRANS = 'T' CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, $ YY, EPS, ERR, FATAL, NOUT, .TRUE. 
) SAME = LDE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR STOP END IF * * Test each subroutine in turn. * DO 210 ISNUM = 1, NSUBS WRITE( NOUT, FMT = * ) IF( .NOT.LTEST( ISNUM ) )THEN * Subprogram is not to be tested. WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) ELSE SRNAMT = SNAMES( ISNUM ) * Test error exits. IF( TSTERR )THEN CALL CD2CHKE( SNAMES( ISNUM ) ) WRITE( NOUT, FMT = * ) END IF * Test computations. INFOT = 0 OK = .TRUE. FATAL = .FALSE. GO TO ( 140, 140, 150, 150, 150, 160, 160, $ 160, 160, 160, 160, 170, 180, 180, $ 190, 190 )ISNUM * Test DGEMV, 01, and DGBMV, 02. 140 IF (CORDER) THEN CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G, 0 ) END IF IF (RORDER) THEN CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G, 1 ) END IF GO TO 200 * Test DSYMV, 03, DSBMV, 04, and DSPMV, 05. 150 IF (CORDER) THEN CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G, 0 ) END IF IF (RORDER) THEN CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G, 1 ) END IF GO TO 200 * Test DTRMV, 06, DTBMV, 07, DTPMV, 08, * DTRSV, 09, DTBSV, 10, and DTPSV, 11. 160 IF (CORDER) THEN CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, $ 0 ) END IF IF (RORDER) THEN CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, $ 1 ) END IF GO TO 200 * Test DGER, 12. 170 IF (CORDER) THEN CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 0 ) END IF IF (RORDER) THEN CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 1 ) END IF GO TO 200 * Test DSYR, 13, and DSPR, 14. 180 IF (CORDER) THEN CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 0 ) END IF IF (RORDER) THEN CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 1 ) END IF GO TO 200 * Test DSYR2, 15, and DSPR2, 16. 
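* Statement 190 handles the rank-2 updates DSYR2 and DSPR2 via
* DCHK6; like the preceding branches of the computed GO TO it is
* run once with IORDER = 0 (column-major) and/or IORDER = 1
* (row-major), depending on the CORDER and RORDER flags.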
190 IF (CORDER) THEN CALL DCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 0 ) END IF IF (RORDER) THEN CALL DCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 1 ) END IF * 200 IF( FATAL.AND.SFATAL ) $ GO TO 220 END IF 210 CONTINUE WRITE( NOUT, FMT = 9982 ) GO TO 240 * 220 CONTINUE WRITE( NOUT, FMT = 9981 ) GO TO 240 * 230 CONTINUE WRITE( NOUT, FMT = 9987 ) * 240 CONTINUE IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) STOP * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) 10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', $ 'S THAN', F8.2 ) 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 ) 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', $ 'THAN ', I2 ) 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', $ I2 ) 9993 FORMAT( ' TESTS OF THE DOUBLE PRECISION LEVEL 2 BLAS', //' THE F', $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) 9992 FORMAT( ' FOR N ', 9I6 ) 9991 FORMAT( ' FOR K ', 7I6 ) 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) 9989 FORMAT( ' FOR ALPHA ', 7F6.1 ) 9988 FORMAT( ' FOR BETA ', 7F6.1 ) 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', $ /' ******* TESTS ABANDONED *******' ) 9986 FORMAT( ' SUBPROGRAM NAME ',A12, ' NOT RECOGNIZED', /' ******* T', $ 'ESTS ABANDONED *******' ) 9985 FORMAT( ' ERROR IN DMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', $ 'ATED WRONGLY.', /' DMVCH WAS CALLED WITH TRANS = ', A1, $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' $ , /' ******* TESTS ABANDONED *******' ) 9984 FORMAT(A12, L2 ) 9983 FORMAT( 1X,A12, ' WAS NOT TESTED' ) 9982 FORMAT( /' END OF TESTS' ) 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) * * End of DBLAT2. * END SUBROUTINE DCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, $ XS, Y, YY, YS, YT, G, IORDER ) * * Tests DGEMV and DGBMV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. DOUBLE PRECISION ZERO, HALF PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, $ NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), $ X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. 
DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, $ NL, NS LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN CHARACTER*1 TRANS, TRANSS CHARACTER*14 CTRANS CHARACTER*3 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL CDGBMV, CDGEMV, DMAKE, DMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICH/'NTC'/ * .. Executable Statements .. FULL = SNAME( 9: 9 ).EQ.'e' BANDED = SNAME( 9: 9 ).EQ.'b' * Define the number of arguments. IF( FULL )THEN NARGS = 11 ELSE IF( BANDED )THEN NARGS = 13 END IF * NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 120 IN = 1, NIDIM N = IDIM( IN ) ND = N/2 + 1 * DO 110 IM = 1, 2 IF( IM.EQ.1 ) $ M = MAX( N - ND, 0 ) IF( IM.EQ.2 ) $ M = MIN( N + ND, NMAX ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IKU = 1, NK IF( BANDED )THEN KU = KB( IKU ) KL = MAX( KU - 1, 0 ) ELSE KU = N - 1 KL = M - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = KL + KU + 1 ELSE LDA = M END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 LAA = LDA*N NULL = N.LE.0.OR.M.LE.0 * * Generate the matrix A. * TRANSL = ZERO CALL DMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, AA, $ LDA, KL, KU, RESET, TRANSL ) * DO 90 IC = 1, 3 TRANS = ICH( IC: IC ) IF (TRANS.EQ.'N')THEN CTRANS = ' CblasNoTrans' ELSE IF (TRANS.EQ.'T')THEN CTRANS = ' CblasTrans' ELSE CTRANS = 'CblasConjTrans' END IF TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' * IF( TRAN )THEN ML = N NL = M ELSE ML = M NL = N END IF * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*NL * * Generate the vector X. * TRANSL = HALF CALL DMAKE( 'ge', ' ', ' ', 1, NL, X, 1, XX, $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) IF( NL.GT.1 )THEN X( NL/2 ) = ZERO XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO END IF * DO 70 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*ML * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the vector Y. * TRANSL = ZERO CALL DMAKE( 'ge', ' ', ' ', 1, ML, Y, 1, $ YY, ABS( INCY ), 0, ML - 1, $ RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * TRANSS = TRANS MS = M NS = N KLS = KL KUS = KU ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX BLS = BETA DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ CTRANS, M, N, ALPHA, LDA, INCX, $ BETA, INCY IF( REWI ) $ REWIND NTRA CALL CDGEMV( IORDER, TRANS, M, N, $ ALPHA, AA, LDA, XX, INCX, $ BETA, YY, INCY ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ CTRANS, M, N, KL, KU, ALPHA, LDA, $ INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL CDGBMV( IORDER, TRANS, M, N, KL, $ KU, ALPHA, AA, LDA, XX, $ INCX, BETA, YY, INCY ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 130 END IF * * See what data changed inside subroutines. 
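* Every argument was copied to a shadow (TRANSS, MS, NS, KLS,
* KUS, ALS, AS, LDAS, XS, INCXS, BLS, YS, INCYS) before the call;
* each ISAME(I) below records whether argument I is unchanged.
* Only the output vector Y may differ, and LDERES checks that
* only its referenced elements were touched.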
* ISAME( 1 ) = TRANS.EQ.TRANSS ISAME( 2 ) = MS.EQ.M ISAME( 3 ) = NS.EQ.N IF( FULL )THEN ISAME( 4 ) = ALS.EQ.ALPHA ISAME( 5 ) = LDE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA ISAME( 7 ) = LDE( XS, XX, LX ) ISAME( 8 ) = INCXS.EQ.INCX ISAME( 9 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 10 ) = LDE( YS, YY, LY ) ELSE ISAME( 10 ) = LDERES( 'ge', ' ', 1, $ ML, YS, YY, $ ABS( INCY ) ) END IF ISAME( 11 ) = INCYS.EQ.INCY ELSE IF( BANDED )THEN ISAME( 4 ) = KLS.EQ.KL ISAME( 5 ) = KUS.EQ.KU ISAME( 6 ) = ALS.EQ.ALPHA ISAME( 7 ) = LDE( AS, AA, LAA ) ISAME( 8 ) = LDAS.EQ.LDA ISAME( 9 ) = LDE( XS, XX, LX ) ISAME( 10 ) = INCXS.EQ.INCX ISAME( 11 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 12 ) = LDE( YS, YY, LY ) ELSE ISAME( 12 ) = LDERES( 'ge', ' ', 1, $ ML, YS, YY, $ ABS( INCY ) ) END IF ISAME( 13 ) = INCYS.EQ.INCY END IF * * If data was incorrectly changed, report * and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 130 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL DMVCH( TRANS, M, N, ALPHA, A, $ NMAX, X, INCX, BETA, Y, $ INCY, YT, G, YY, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 130 ELSE * Avoid repeating tests with M.le.0 or * N.le.0. GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 140 * 130 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, CTRANS, M, N, ALPHA, LDA, $ INCX, BETA, INCY ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, CTRANS, M, N, KL, KU, $ ALPHA, LDA, INCX, BETA, INCY END IF * 140 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 4( I3, ',' ), F4.1, $ ', A,', I3, ',',/ 10x,'X,', I2, ',', F4.1, ', Y,', $ I2, ') .' ) 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), F4.1, $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, $ ') .' ) 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK1. * END SUBROUTINE DCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, $ XS, Y, YY, YS, YT, G, IORDER ) * * Tests DSYMV, DSBMV and DSPMV. 
* * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. DOUBLE PRECISION ZERO, HALF PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, $ NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), $ X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, $ N, NARGS, NC, NK, NS LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME CHARACTER*1 UPLO, UPLOS CHARACTER*14 CUPLO CHARACTER*2 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL DMAKE, DMVCH, CDSBMV, CDSPMV, CDSYMV * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 9: 9 ).EQ.'y' BANDED = SNAME( 9: 9 ).EQ.'b' PACKED = SNAME( 9: 9 ).EQ.'p' * Define the number of arguments. IF( FULL )THEN NARGS = 10 ELSE IF( BANDED )THEN NARGS = 11 ELSE IF( PACKED )THEN NARGS = 9 END IF * NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 110 IN = 1, NIDIM N = IDIM( IN ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IK = 1, NK IF( BANDED )THEN K = KB( IK ) ELSE K = N - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = K + 1 ELSE LDA = N END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF NULL = N.LE.0 * DO 90 IC = 1, 2 UPLO = ICH( IC: IC ) IF (UPLO.EQ.'U')THEN CUPLO = ' CblasUpper' ELSE CUPLO = ' CblasLower' END IF * * Generate the matrix A. * TRANSL = ZERO CALL DMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, AA, $ LDA, K, K, RESET, TRANSL ) * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL DMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 70 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the vector Y. * TRANSL = ZERO CALL DMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, $ TRANSL ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * UPLOS = UPLO NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX BLS = BETA DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. 
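* FULL, BANDED and PACKED are derived from character 9 of the
* CBLAS name ('y', 'b' or 'p') and select which wrapper is called
* below: CDSYMV, CDSBMV or CDSPMV, with NARGS = 10, 11 or 9
* arguments checked afterwards.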
* IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ CUPLO, N, ALPHA, LDA, INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL CDSYMV( IORDER, UPLO, N, ALPHA, AA, $ LDA, XX, INCX, BETA, YY, INCY ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ CUPLO, N, K, ALPHA, LDA, INCX, BETA, $ INCY IF( REWI ) $ REWIND NTRA CALL CDSBMV( IORDER, UPLO, N, K, ALPHA, $ AA, LDA, XX, INCX, BETA, YY, $ INCY ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ CUPLO, N, ALPHA, INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL CDSPMV( IORDER, UPLO, N, ALPHA, AA, $ XX, INCX, BETA, YY, INCY ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N IF( FULL )THEN ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LDE( AS, AA, LAA ) ISAME( 5 ) = LDAS.EQ.LDA ISAME( 6 ) = LDE( XS, XX, LX ) ISAME( 7 ) = INCXS.EQ.INCX ISAME( 8 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 9 ) = LDE( YS, YY, LY ) ELSE ISAME( 9 ) = LDERES( 'ge', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 10 ) = INCYS.EQ.INCY ELSE IF( BANDED )THEN ISAME( 3 ) = KS.EQ.K ISAME( 4 ) = ALS.EQ.ALPHA ISAME( 5 ) = LDE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA ISAME( 7 ) = LDE( XS, XX, LX ) ISAME( 8 ) = INCXS.EQ.INCX ISAME( 9 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 10 ) = LDE( YS, YY, LY ) ELSE ISAME( 10 ) = LDERES( 'ge', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 11 ) = INCYS.EQ.INCY ELSE IF( PACKED )THEN ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LDE( AS, AA, LAA ) ISAME( 5 ) = LDE( XS, XX, LX ) ISAME( 6 ) = INCXS.EQ.INCX ISAME( 7 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 8 ) = LDE( YS, YY, LY ) ELSE ISAME( 8 ) = LDERES( 'ge', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 9 ) = INCYS.EQ.INCY END IF * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL DMVCH( 'N', N, N, ALPHA, A, NMAX, X, $ INCX, BETA, Y, INCY, YT, G, $ YY, EPS, ERR, FATAL, NOUT, $ .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 120 ELSE * Avoid repeating tests with N.le.0 GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, LDA, INCX, $ BETA, INCY ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, K, ALPHA, LDA, $ INCX, BETA, INCY ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, N, ALPHA, INCX, $ BETA, INCY END IF * 130 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', AP', $ ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), F4.1, $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, $ ') .' ) 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', A,', $ I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK2. * END SUBROUTINE DCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z, IORDER ) * * Tests DTRMV, DTBMV, DTPMV, DTRSV, DTBSV and DTPSV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. DOUBLE PRECISION ZERO, HALF, ONE PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA, $ IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), $ XS( NMAX*INCMAX ), XT( NMAX ), $ XX( NMAX*INCMAX ), Z( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. DOUBLE PRECISION ERR, ERRMAX, TRANSL INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS CHARACTER*14 CUPLO,CTRANS,CDIAG CHARACTER*2 ICHD, ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL DMAKE, DMVCH, CDTBMV, CDTBSV, CDTPMV, $ CDTPSV, CDTRMV, CDTRSV * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. 
COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ * .. Executable Statements .. FULL = SNAME( 9: 9 ).EQ.'r' BANDED = SNAME( 9: 9 ).EQ.'b' PACKED = SNAME( 9: 9 ).EQ.'p' * Define the number of arguments. IF( FULL )THEN NARGS = 8 ELSE IF( BANDED )THEN NARGS = 9 ELSE IF( PACKED )THEN NARGS = 7 END IF * NC = 0 RESET = .TRUE. ERRMAX = ZERO * Set up zero vector for DMVCH. DO 10 I = 1, NMAX Z( I ) = ZERO 10 CONTINUE * DO 110 IN = 1, NIDIM N = IDIM( IN ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IK = 1, NK IF( BANDED )THEN K = KB( IK ) ELSE K = N - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = K + 1 ELSE LDA = N END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF NULL = N.LE.0 * DO 90 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) IF (UPLO.EQ.'U')THEN CUPLO = ' CblasUpper' ELSE CUPLO = ' CblasLower' END IF * DO 80 ICT = 1, 3 TRANS = ICHT( ICT: ICT ) IF (TRANS.EQ.'N')THEN CTRANS = ' CblasNoTrans' ELSE IF (TRANS.EQ.'T')THEN CTRANS = ' CblasTrans' ELSE CTRANS = 'CblasConjTrans' END IF * DO 70 ICD = 1, 2 DIAG = ICHD( ICD: ICD ) IF (DIAG.EQ.'N')THEN CDIAG = ' CblasNonUnit' ELSE CDIAG = ' CblasUnit' END IF * * Generate the matrix A. * TRANSL = ZERO CALL DMAKE( SNAME( 8: 9 ), UPLO, DIAG, N, N, A, $ NMAX, AA, LDA, K, K, RESET, TRANSL ) * DO 60 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL DMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, $ ABS( INCX ), 0, N - 1, RESET, $ TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS DIAGS = DIAG NS = N KS = K DO 20 I = 1, LAA AS( I ) = AA( I ) 20 CONTINUE LDAS = LDA DO 30 I = 1, LX XS( I ) = XX( I ) 30 CONTINUE INCXS = INCX * * Call the subroutine. * IF( SNAME( 10: 11 ).EQ.'mv' )THEN IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, LDA, INCX IF( REWI ) $ REWIND NTRA CALL CDTRMV( IORDER, UPLO, TRANS, DIAG, $ N, AA, LDA, XX, INCX ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX IF( REWI ) $ REWIND NTRA CALL CDTBMV( IORDER, UPLO, TRANS, DIAG, $ N, K, AA, LDA, XX, INCX ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, INCX IF( REWI ) $ REWIND NTRA CALL CDTPMV( IORDER, UPLO, TRANS, DIAG, $ N, AA, XX, INCX ) END IF ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, LDA, INCX IF( REWI ) $ REWIND NTRA CALL CDTRSV( IORDER, UPLO, TRANS, DIAG, $ N, AA, LDA, XX, INCX ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX IF( REWI ) $ REWIND NTRA CALL CDTBSV( IORDER, UPLO, TRANS, DIAG, $ N, K, AA, LDA, XX, INCX ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, INCX IF( REWI ) $ REWIND NTRA CALL CDTPSV( IORDER, UPLO, TRANS, DIAG, $ N, AA, XX, INCX ) END IF END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. 
* ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = TRANS.EQ.TRANSS ISAME( 3 ) = DIAG.EQ.DIAGS ISAME( 4 ) = NS.EQ.N IF( FULL )THEN ISAME( 5 ) = LDE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 7 ) = LDE( XS, XX, LX ) ELSE ISAME( 7 ) = LDERES( 'ge', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 8 ) = INCXS.EQ.INCX ELSE IF( BANDED )THEN ISAME( 5 ) = KS.EQ.K ISAME( 6 ) = LDE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 8 ) = LDE( XS, XX, LX ) ELSE ISAME( 8 ) = LDERES( 'ge', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 9 ) = INCXS.EQ.INCX ELSE IF( PACKED )THEN ISAME( 5 ) = LDE( AS, AA, LAA ) IF( NULL )THEN ISAME( 6 ) = LDE( XS, XX, LX ) ELSE ISAME( 6 ) = LDERES( 'ge', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 7 ) = INCXS.EQ.INCX END IF * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN IF( SNAME( 10: 11 ).EQ.'mv' )THEN * * Check the result. * CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, $ INCX, ZERO, Z, INCX, XT, G, $ XX, EPS, ERR, FATAL, NOUT, $ .TRUE. ) ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN * * Compute approximation to original vector. * DO 50 I = 1, N Z( I ) = XX( 1 + ( I - 1 )* $ ABS( INCX ) ) XX( 1 + ( I - 1 )*ABS( INCX ) ) $ = X( I ) 50 CONTINUE CALL DMVCH( TRANS, N, N, ONE, A, NMAX, Z, $ INCX, ZERO, X, INCX, XT, G, $ XX, EPS, ERR, FATAL, NOUT, $ .FALSE. ) END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 120 ELSE * Avoid repeating tests with N.le.0. GO TO 110 END IF * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, $ LDA, INCX ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, K, $ LDA, INCX ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, $ INCX END IF * 130 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, I3, ', AP, ', $ 'X,', I2, ') .' ) 9994 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, 2( I3, ',' ), $ ' A,', I3, ', X,', I2, ') .' 
) 9993 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, I3, ', A,', $ I3, ', X,', I2, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK3. * END SUBROUTINE DCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z, IORDER ) * * Tests DGER. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. DOUBLE PRECISION ZERO, HALF, ONE PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, $ IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX, TRANSL INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, $ NC, ND, NS LOGICAL NULL, RESET, SAME * .. Local Arrays .. DOUBLE PRECISION W( 1 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL DGER, DMAKE, DMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Executable Statements .. * Define the number of arguments. NARGS = 9 * NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 120 IN = 1, NIDIM N = IDIM( IN ) ND = N/2 + 1 * DO 110 IM = 1, 2 IF( IM.EQ.1 ) $ M = MAX( N - ND, 0 ) IF( IM.EQ.2 ) $ M = MIN( N + ND, NMAX ) * * Set LDA to 1 more than minimum value if room. LDA = M IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 110 LAA = LDA*N NULL = N.LE.0.OR.M.LE.0 * DO 100 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*M * * Generate the vector X. * TRANSL = HALF CALL DMAKE( 'ge', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), $ 0, M - 1, RESET, TRANSL ) IF( M.GT.1 )THEN X( M/2 ) = ZERO XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO END IF * DO 90 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * * Generate the vector Y. * TRANSL = ZERO CALL DMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN Y( N/2 ) = ZERO YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO END IF * DO 80 IA = 1, NALF ALPHA = ALF( IA ) * * Generate the matrix A. * TRANSL = ZERO CALL DMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * MS = M NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, $ ALPHA, INCX, INCY, LDA IF( REWI ) $ REWIND NTRA CALL CDGER( IORDER, M, N, ALPHA, XX, INCX, YY, $ INCY, AA, LDA ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 140 END IF * * See what data changed inside subroutine. 
* ISAME( 1 ) = MS.EQ.M ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LDE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX ISAME( 6 ) = LDE( YS, YY, LY ) ISAME( 7 ) = INCYS.EQ.INCY IF( NULL )THEN ISAME( 8 ) = LDE( AS, AA, LAA ) ELSE ISAME( 8 ) = LDERES( 'ge', ' ', M, N, AS, AA, $ LDA ) END IF ISAME( 9 ) = LDAS.EQ.LDA * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 140 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( INCX.GT.0 )THEN DO 50 I = 1, M Z( I ) = X( I ) 50 CONTINUE ELSE DO 60 I = 1, M Z( I ) = X( M - I + 1 ) 60 CONTINUE END IF DO 70 J = 1, N IF( INCY.GT.0 )THEN W( 1 ) = Y( J ) ELSE W( 1 ) = Y( N - J + 1 ) END IF CALL DMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, $ ONE, A( 1, J ), 1, YT, G, $ AA( 1 + ( J - 1 )*LDA ), EPS, $ ERR, FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 130 70 CONTINUE ELSE * Avoid repeating tests with M.le.0 or N.le.0. GO TO 110 END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 150 * 130 CONTINUE WRITE( NOUT, FMT = 9995 )J * 140 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA * 150 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ',A12, '(', 2( I3, ',' ), F4.1, ', X,', I2, $ ', Y,', I2, ', A,', I3, ') .' ) 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK4. * END SUBROUTINE DCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z, IORDER ) * * Tests DSYR and DSPR. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. DOUBLE PRECISION ZERO, HALF, ONE PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, $ IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. 
DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX, TRANSL INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER CHARACTER*1 UPLO, UPLOS CHARACTER*14 CUPLO CHARACTER*2 ICH * .. Local Arrays .. DOUBLE PRECISION W( 1 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL DMAKE, DMVCH, CDSPR, CDSYR * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 9: 9 ).EQ.'y' PACKED = SNAME( 9: 9 ).EQ.'p' * Define the number of arguments. IF( FULL )THEN NARGS = 7 ELSE IF( PACKED )THEN NARGS = 6 END IF * NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDA to 1 more than minimum value if room. LDA = N IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF * DO 90 IC = 1, 2 UPLO = ICH( IC: IC ) IF (UPLO.EQ.'U')THEN CUPLO = ' CblasUpper' ELSE CUPLO = ' CblasLower' END IF UPPER = UPLO.EQ.'U' * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL DMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), $ 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 70 IA = 1, NALF ALPHA = ALF( IA ) NULL = N.LE.0.OR.ALPHA.EQ.ZERO * * Generate the matrix A. * TRANSL = ZERO CALL DMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, $ ALPHA, INCX, LDA IF( REWI ) $ REWIND NTRA CALL CDSYR( IORDER, UPLO, N, ALPHA, XX, INCX, $ AA, LDA ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, $ ALPHA, INCX IF( REWI ) $ REWIND NTRA CALL CDSPR( IORDER, UPLO, N, ALPHA, XX, INCX, AA ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LDE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX IF( NULL )THEN ISAME( 6 ) = LDE( AS, AA, LAA ) ELSE ISAME( 6 ) = LDERES( SNAME( 8: 9 ), UPLO, N, N, AS, $ AA, LDA ) END IF IF( .NOT.PACKED )THEN ISAME( 7 ) = LDAS.EQ.LDA END IF * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 30 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 30 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. 
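* The rank-1 update is verified one column at a time: Z holds X
* (reversed when INCX is negative), and for column J the stored
* LJ elements starting at row JJ are checked by DMVCH against
* A(JJ:JJ+LJ-1,J) + ALPHA*Z(JJ:JJ+LJ-1)*Z(J), with JJ = 1 and
* LJ = J for the upper triangle, JJ = J and LJ = N - J + 1 for
* the lower triangle.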
* IF( INCX.GT.0 )THEN DO 40 I = 1, N Z( I ) = X( I ) 40 CONTINUE ELSE DO 50 I = 1, N Z( I ) = X( N - I + 1 ) 50 CONTINUE END IF JA = 1 DO 60 J = 1, N W( 1 ) = Z( J ) IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF CALL DMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, $ 1, ONE, A( JJ, J ), 1, YT, G, $ AA( JA ), EPS, ERR, FATAL, NOUT, $ .TRUE. ) IF( FULL )THEN IF( UPPER )THEN JA = JA + LDA ELSE JA = JA + LDA + 1 END IF ELSE JA = JA + LJ END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 110 60 CONTINUE ELSE * Avoid repeating tests if N.le.0. IF( N.LE.0 ) $ GO TO 100 END IF * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 130 * 110 CONTINUE WRITE( NOUT, FMT = 9995 )J * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, LDA ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX END IF * 130 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', $ I2, ', AP) .' ) 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', $ I2, ', A,', I3, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK5. * END SUBROUTINE DCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z, IORDER ) * * Tests DSYR2 and DSPR2. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. DOUBLE PRECISION ZERO, HALF, ONE PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, $ IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. 
DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX, TRANSL INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, $ NARGS, NC, NS LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER CHARACTER*1 UPLO, UPLOS CHARACTER*14 CUPLO CHARACTER*2 ICH * .. Local Arrays .. DOUBLE PRECISION W( 2 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL DMAKE, DMVCH, CDSPR2, CDSYR2 * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 9: 9 ).EQ.'y' PACKED = SNAME( 9: 9 ).EQ.'p' * Define the number of arguments. IF( FULL )THEN NARGS = 9 ELSE IF( PACKED )THEN NARGS = 8 END IF * NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 140 IN = 1, NIDIM N = IDIM( IN ) * Set LDA to 1 more than minimum value if room. LDA = N IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 140 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF * DO 130 IC = 1, 2 UPLO = ICH( IC: IC ) IF (UPLO.EQ.'U')THEN CUPLO = ' CblasUpper' ELSE CUPLO = ' CblasLower' END IF UPPER = UPLO.EQ.'U' * DO 120 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL DMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), $ 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 110 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * * Generate the vector Y. * TRANSL = ZERO CALL DMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN Y( N/2 ) = ZERO YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO END IF * DO 100 IA = 1, NALF ALPHA = ALF( IA ) NULL = N.LE.0.OR.ALPHA.EQ.ZERO * * Generate the matrix A. * TRANSL = ZERO CALL DMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, $ NMAX, AA, LDA, N - 1, N - 1, RESET, $ TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, $ ALPHA, INCX, INCY, LDA IF( REWI ) $ REWIND NTRA CALL CDSYR2( IORDER, UPLO, N, ALPHA, XX, INCX, $ YY, INCY, AA, LDA ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, $ ALPHA, INCX, INCY IF( REWI ) $ REWIND NTRA CALL CDSPR2( IORDER, UPLO, N, ALPHA, XX, INCX, $ YY, INCY, AA ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 160 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LDE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX ISAME( 6 ) = LDE( YS, YY, LY ) ISAME( 7 ) = INCYS.EQ.INCY IF( NULL )THEN ISAME( 8 ) = LDE( AS, AA, LAA ) ELSE ISAME( 8 ) = LDERES( SNAME( 8: 9 ), UPLO, N, N, $ AS, AA, LDA ) END IF IF( .NOT.PACKED )THEN ISAME( 9 ) = LDAS.EQ.LDA END IF * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. 
GO TO 160 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( INCX.GT.0 )THEN DO 50 I = 1, N Z( I, 1 ) = X( I ) 50 CONTINUE ELSE DO 60 I = 1, N Z( I, 1 ) = X( N - I + 1 ) 60 CONTINUE END IF IF( INCY.GT.0 )THEN DO 70 I = 1, N Z( I, 2 ) = Y( I ) 70 CONTINUE ELSE DO 80 I = 1, N Z( I, 2 ) = Y( N - I + 1 ) 80 CONTINUE END IF JA = 1 DO 90 J = 1, N W( 1 ) = Z( J, 2 ) W( 2 ) = Z( J, 1 ) IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF CALL DMVCH( 'N', LJ, 2, ALPHA, Z( JJ, 1 ), $ NMAX, W, 1, ONE, A( JJ, J ), 1, $ YT, G, AA( JA ), EPS, ERR, FATAL, $ NOUT, .TRUE. ) IF( FULL )THEN IF( UPPER )THEN JA = JA + LDA ELSE JA = JA + LDA + 1 END IF ELSE JA = JA + LJ END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 150 90 CONTINUE ELSE * Avoid repeating tests with N.le.0. IF( N.LE.0 ) $ GO TO 140 END IF * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * 140 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 170 * 150 CONTINUE WRITE( NOUT, FMT = 9995 )J * 160 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, $ INCY, LDA ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX, INCY END IF * 170 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', $ I2, ', Y,', I2, ', AP) .' ) 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', $ I2, ', Y,', I2, ', A,', I3, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK6. * END SUBROUTINE DMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, $ KU, RESET, TRANSL ) * * Generates values for an M by N matrix A within the bandwidth * defined by KL and KU. * Stores the values in the array AA in the data structure required * by the routine, with unwanted elements set to rogue value. * * TYPE is 'ge', 'gb', 'sy', 'sb', 'sp', 'tr', 'tb' OR 'tp'. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) DOUBLE PRECISION ROGUE PARAMETER ( ROGUE = -1.0D10 ) * .. Scalar Arguments .. 
DOUBLE PRECISION TRANSL INTEGER KL, KU, LDA, M, N, NMAX LOGICAL RESET CHARACTER*1 DIAG, UPLO CHARACTER*2 TYPE * .. Array Arguments .. DOUBLE PRECISION A( NMAX, * ), AA( * ) * .. Local Scalars .. INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, KK LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER * .. External Functions .. DOUBLE PRECISION DBEG EXTERNAL DBEG * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. Executable Statements .. GEN = TYPE( 1: 1 ).EQ.'g' SYM = TYPE( 1: 1 ).EQ.'s' TRI = TYPE( 1: 1 ).EQ.'t' UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' UNIT = TRI.AND.DIAG.EQ.'U' * * Generate data in array A. * DO 20 J = 1, N DO 10 I = 1, M IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) $ THEN IF( ( I.LE.J.AND.J - I.LE.KU ).OR. $ ( I.GE.J.AND.I - J.LE.KL ) )THEN A( I, J ) = DBEG( RESET ) + TRANSL ELSE A( I, J ) = ZERO END IF IF( I.NE.J )THEN IF( SYM )THEN A( J, I ) = A( I, J ) ELSE IF( TRI )THEN A( J, I ) = ZERO END IF END IF END IF 10 CONTINUE IF( TRI ) $ A( J, J ) = A( J, J ) + ONE IF( UNIT ) $ A( J, J ) = ONE 20 CONTINUE * * Store elements in array AS in data structure required by routine. * IF( TYPE.EQ.'ge' )THEN DO 50 J = 1, N DO 30 I = 1, M AA( I + ( J - 1 )*LDA ) = A( I, J ) 30 CONTINUE DO 40 I = M + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 40 CONTINUE 50 CONTINUE ELSE IF( TYPE.EQ.'gb' )THEN DO 90 J = 1, N DO 60 I1 = 1, KU + 1 - J AA( I1 + ( J - 1 )*LDA ) = ROGUE 60 CONTINUE DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) 70 CONTINUE DO 80 I3 = I2, LDA AA( I3 + ( J - 1 )*LDA ) = ROGUE 80 CONTINUE 90 CONTINUE ELSE IF( TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN DO 130 J = 1, N IF( UPPER )THEN IBEG = 1 IF( UNIT )THEN IEND = J - 1 ELSE IEND = J END IF ELSE IF( UNIT )THEN IBEG = J + 1 ELSE IBEG = J END IF IEND = N END IF DO 100 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 100 CONTINUE DO 110 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I, J ) 110 CONTINUE DO 120 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 120 CONTINUE 130 CONTINUE ELSE IF( TYPE.EQ.'sb'.OR.TYPE.EQ.'tb' )THEN DO 170 J = 1, N IF( UPPER )THEN KK = KL + 1 IBEG = MAX( 1, KL + 2 - J ) IF( UNIT )THEN IEND = KL ELSE IEND = KL + 1 END IF ELSE KK = 1 IF( UNIT )THEN IBEG = 2 ELSE IBEG = 1 END IF IEND = MIN( KL + 1, 1 + M - J ) END IF DO 140 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 140 CONTINUE DO 150 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) 150 CONTINUE DO 160 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 160 CONTINUE 170 CONTINUE ELSE IF( TYPE.EQ.'sp'.OR.TYPE.EQ.'tp' )THEN IOFF = 0 DO 190 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 180 I = IBEG, IEND IOFF = IOFF + 1 AA( IOFF ) = A( I, J ) IF( I.EQ.J )THEN IF( UNIT ) $ AA( IOFF ) = ROGUE END IF 180 CONTINUE 190 CONTINUE END IF RETURN * * End of DMAKE. * END SUBROUTINE DMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) * * Checks the results of the computational tests. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION ALPHA, BETA, EPS, ERR INTEGER INCX, INCY, M, N, NMAX, NOUT LOGICAL FATAL, MV CHARACTER*1 TRANS * .. Array Arguments .. DOUBLE PRECISION A( NMAX, * ), G( * ), X( * ), Y( * ), YT( * ), $ YY( * ) * .. Local Scalars .. 
DOUBLE PRECISION ERRI INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL LOGICAL TRAN * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT * .. Executable Statements .. TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' IF( TRAN )THEN ML = N NL = M ELSE ML = M NL = N END IF IF( INCX.LT.0 )THEN KX = NL INCXL = -1 ELSE KX = 1 INCXL = 1 END IF IF( INCY.LT.0 )THEN KY = ML INCYL = -1 ELSE KY = 1 INCYL = 1 END IF * * Compute expected result in YT using data in A, X and Y. * Compute gauges in G. * IY = KY DO 30 I = 1, ML YT( IY ) = ZERO G( IY ) = ZERO JX = KX IF( TRAN )THEN DO 10 J = 1, NL YT( IY ) = YT( IY ) + A( J, I )*X( JX ) G( IY ) = G( IY ) + ABS( A( J, I )*X( JX ) ) JX = JX + INCXL 10 CONTINUE ELSE DO 20 J = 1, NL YT( IY ) = YT( IY ) + A( I, J )*X( JX ) G( IY ) = G( IY ) + ABS( A( I, J )*X( JX ) ) JX = JX + INCXL 20 CONTINUE END IF YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) G( IY ) = ABS( ALPHA )*G( IY ) + ABS( BETA*Y( IY ) ) IY = IY + INCYL 30 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO DO 40 I = 1, ML ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS IF( G( I ).NE.ZERO ) $ ERRI = ERRI/G( I ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ GO TO 50 40 CONTINUE * If the loop completes, all results are at least half accurate. GO TO 70 * * Report fatal error. * 50 FATAL = .TRUE. WRITE( NOUT, FMT = 9999 ) DO 60 I = 1, ML IF( MV )THEN WRITE( NOUT, FMT = 9998 )I, YT( I ), $ YY( 1 + ( I - 1 )*ABS( INCY ) ) ELSE WRITE( NOUT, FMT = 9998 )I, $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT(I) END IF 60 CONTINUE * 70 CONTINUE RETURN * 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', $ 'TED RESULT' ) 9998 FORMAT( 1X, I7, 2G18.6 ) * * End of DMVCH. * END LOGICAL FUNCTION LDE( RI, RJ, LR ) * * Tests if two arrays are identical. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER LR * .. Array Arguments .. DOUBLE PRECISION RI( * ), RJ( * ) * .. Local Scalars .. INTEGER I * .. Executable Statements .. DO 10 I = 1, LR IF( RI( I ).NE.RJ( I ) ) $ GO TO 20 10 CONTINUE LDE = .TRUE. GO TO 30 20 CONTINUE LDE = .FALSE. 30 RETURN * * End of LDE. * END LOGICAL FUNCTION LDERES( TYPE, UPLO, M, N, AA, AS, LDA ) * * Tests if selected elements in two arrays are equal. * * TYPE is 'ge', 'sy' or 'sp'. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER LDA, M, N CHARACTER*1 UPLO CHARACTER*2 TYPE * .. Array Arguments .. DOUBLE PRECISION AA( LDA, * ), AS( LDA, * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL UPPER * .. Executable Statements .. UPPER = UPLO.EQ.'U' IF( TYPE.EQ.'ge' )THEN DO 20 J = 1, N DO 10 I = M + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 10 CONTINUE 20 CONTINUE ELSE IF( TYPE.EQ.'sy' )THEN DO 50 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 30 I = 1, IBEG - 1 IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 30 CONTINUE DO 40 I = IEND + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 40 CONTINUE 50 CONTINUE END IF * 60 CONTINUE LDERES = .TRUE. GO TO 80 70 CONTINUE LDERES = .FALSE. 80 RETURN * * End of LDERES. * END DOUBLE PRECISION FUNCTION DBEG( RESET ) * * Generates random numbers uniformly distributed between -0.5 and 0.5. * * Auxiliary routine for test program for Level 2 Blas. 
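*
*  The generator below is a small multiplicative congruential scheme:
*  I is updated as I = MOD( 891*I, 1000 ) and the value returned is
*  DBLE( I - 500 )/1001.0D0, so every result lies strictly inside the
*  interval ( -0.5, 0.5 ).  IC forces an extra update of I at regular
*  intervals to break up the short period of the recurrence.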
* * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. LOGICAL RESET * .. Local Scalars .. INTEGER I, IC, MI * .. Save statement .. SAVE I, IC, MI * .. Intrinsic Functions .. INTRINSIC DBLE * .. Executable Statements .. IF( RESET )THEN * Initialize local variables. MI = 891 I = 7 IC = 0 RESET = .FALSE. END IF * * The sequence of values of I is bounded between 1 and 999. * If initial I = 1,2,3,6,7 or 9, the period will be 50. * If initial I = 4 or 8, the period will be 25. * If initial I = 5, the period will be 10. * IC is used to break up the period by skipping 1 value of I in 6. * IC = IC + 1 10 I = I*MI I = I - 1000*( I/1000 ) IF( IC.GE.5 )THEN IC = 0 GO TO 10 END IF DBEG = DBLE( I - 500 )/1001.0D0 RETURN * * End of DBEG. * END DOUBLE PRECISION FUNCTION DDIFF( X, Y ) * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * * .. Scalar Arguments .. DOUBLE PRECISION X, Y * .. Executable Statements .. DDIFF = X - Y RETURN * * End of DDIFF. * END OpenBLAS-0.2.20/ctest/c_dblat3.f000066400000000000000000002527011313527062700161170ustar00rootroot00000000000000 PROGRAM DBLAT3 * * Test program for the DOUBLE PRECISION Level 3 Blas. * * The program must be driven by a short data file. The first 13 records * of the file are read using list-directed input, the last 6 records * are read using the format ( A12, L2 ). An annotated example of a data * file can be obtained by deleting the first 3 characters from the * following 19 lines: * 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO * 6 NUMBER OF VALUES OF N * 0 1 2 3 5 9 VALUES OF N * 3 NUMBER OF VALUES OF ALPHA * 0.0 1.0 0.7 VALUES OF ALPHA * 3 NUMBER OF VALUES OF BETA * 0.0 1.0 1.3 VALUES OF BETA * cblas_dgemm T PUT F FOR NO TEST. SAME COLUMNS. * cblas_dsymm T PUT F FOR NO TEST. SAME COLUMNS. * cblas_dtrmm T PUT F FOR NO TEST. SAME COLUMNS. * cblas_dtrsm T PUT F FOR NO TEST. SAME COLUMNS. * cblas_dsyrk T PUT F FOR NO TEST. SAME COLUMNS. * cblas_dsyr2k T PUT F FOR NO TEST. SAME COLUMNS. * * See: * * Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. * A Set of Level 3 Basic Linear Algebra Subprograms. * * Technical Memorandum No.88 (Revision 1), Mathematics and * Computer Science Division, Argonne National Laboratory, 9700 * South Cass Avenue, Argonne, Illinois 60439, US. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. INTEGER NIN, NOUT PARAMETER ( NIN = 5, NOUT = 6 ) INTEGER NSUBS PARAMETER ( NSUBS = 6 ) DOUBLE PRECISION ZERO, HALF, ONE PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. DOUBLE PRECISION EPS, ERR, THRESH INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA, $ LAYOUT LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, $ TSTERR, CORDER, RORDER CHARACTER*1 TRANSA, TRANSB CHARACTER*12 SNAMET CHARACTER*32 SNAPS * .. Local Arrays .. 
DOUBLE PRECISION AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), $ ALF( NALMAX ), AS( NMAX*NMAX ), $ BB( NMAX*NMAX ), BET( NBEMAX ), $ BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ G( NMAX ), W( 2*NMAX ) INTEGER IDIM( NIDMAX ) LOGICAL LTEST( NSUBS ) CHARACTER*12 SNAMES( NSUBS ) * .. External Functions .. DOUBLE PRECISION DDIFF LOGICAL LDE EXTERNAL DDIFF, LDE * .. External Subroutines .. EXTERNAL DCHK1, DCHK2, DCHK3, DCHK4, DCHK5, CD3CHKE, $ DMMCH * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK CHARACTER*12 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK COMMON /SRNAMC/SRNAMT * .. Data statements .. DATA SNAMES/'cblas_dgemm ', 'cblas_dsymm ', $ 'cblas_dtrmm ', 'cblas_dtrsm ','cblas_dsyrk ', $ 'cblas_dsyr2k'/ * .. Executable Statements .. * * Read name and unit number for summary output file and open file. * NOUTC = NOUT * Read name and unit number for snapshot output file and open file. * READ( NIN, FMT = * )SNAPS READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI REWI = REWI.AND.TRACE * Read the flag that directs stopping on any failure. READ( NIN, FMT = * )SFATAL * Read the flag that indicates whether error exits are to be tested. READ( NIN, FMT = * )TSTERR * Read the flag that indicates whether row-major data layout to be tested. READ( NIN, FMT = * )LAYOUT * Read the threshold value of the test ratio READ( NIN, FMT = * )THRESH * * Read and check the parameter values for the tests. * * Values of N READ( NIN, FMT = * )NIDIM IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN WRITE( NOUT, FMT = 9997 )'N', NIDMAX GO TO 220 END IF READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) DO 10 I = 1, NIDIM IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN WRITE( NOUT, FMT = 9996 )NMAX GO TO 220 END IF 10 CONTINUE * Values of ALPHA READ( NIN, FMT = * )NALF IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX GO TO 220 END IF READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) * Values of BETA READ( NIN, FMT = * )NBET IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX GO TO 220 END IF READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) * * Report values of parameters. * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) IF( .NOT.TSTERR )THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9984 ) END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )THRESH WRITE( NOUT, FMT = * ) RORDER = .FALSE. CORDER = .FALSE. IF (LAYOUT.EQ.2) THEN RORDER = .TRUE. CORDER = .TRUE. WRITE( *, FMT = 10002 ) ELSE IF (LAYOUT.EQ.1) THEN RORDER = .TRUE. WRITE( *, FMT = 10001 ) ELSE IF (LAYOUT.EQ.0) THEN CORDER = .TRUE. WRITE( *, FMT = 10000 ) END IF WRITE( *, FMT = * ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 20 I = 1, NSUBS LTEST( I ) = .FALSE. 20 CONTINUE 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT DO 40 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET STOP 50 LTEST( I ) = LTESTT GO TO 30 * 60 CONTINUE CLOSE ( NIN ) * * Compute EPS (the machine precision). 
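*     EPS is obtained by repeated halving: starting from ONE it is
*     halved until DDIFF( ONE + EPS, ONE ) compares equal to ZERO,
*     then doubled back once, which gives the machine epsilon of the
*     working precision (roughly 2.2D-16 for IEEE double precision).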
* EPS = ONE 70 CONTINUE IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) $ GO TO 80 EPS = HALF*EPS GO TO 70 80 CONTINUE EPS = EPS + EPS WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of DMMCH using exact data. * N = MIN( 32, NMAX ) DO 100 J = 1, N DO 90 I = 1, N AB( I, J ) = MAX( I - J + 1, 0 ) 90 CONTINUE AB( J, NMAX + 1 ) = J AB( 1, NMAX + J ) = J C( J, 1 ) = ZERO 100 CONTINUE DO 110 J = 1, N CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 110 CONTINUE * CC holds the exact result. On exit from DMMCH CT holds * the result computed by DMMCH. TRANSA = 'N' TRANSB = 'N' CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'T' CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 AB( 1, NMAX + J ) = N - J + 1 120 CONTINUE DO 130 J = 1, N CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - $ ( ( J + 1 )*J*( J - 1 ) )/3 130 CONTINUE TRANSA = 'T' TRANSB = 'N' CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'T' CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF * * Test each subroutine in turn. * DO 200 ISNUM = 1, NSUBS WRITE( NOUT, FMT = * ) IF( .NOT.LTEST( ISNUM ) )THEN * Subprogram is not to be tested. WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) ELSE SRNAMT = SNAMES( ISNUM ) * Test error exits. IF( TSTERR )THEN CALL CD3CHKE( SNAMES( ISNUM ) ) WRITE( NOUT, FMT = * ) END IF * Test computations. INFOT = 0 OK = .TRUE. FATAL = .FALSE. GO TO ( 140, 150, 160, 160, 170, 180 )ISNUM * Test DGEMM, 01. 140 IF (CORDER) THEN CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 0 ) END IF IF (RORDER) THEN CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 1 ) END IF GO TO 190 * Test DSYMM, 02. 150 IF (CORDER) THEN CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 0 ) END IF IF (RORDER) THEN CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 1 ) END IF GO TO 190 * Test DTRMM, 03, DTRSM, 04. 
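*        As with the other routines, DTRMM and DTRSM are exercised by
*        DCHK3 once per selected data layout: IORDER = 0 drives the
*        CblasColMajor interface and IORDER = 1 the CblasRowMajor one,
*        according to the CORDER/RORDER flags read from the data file.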
160 IF (CORDER) THEN CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, $ 0 ) END IF IF (RORDER) THEN CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, $ 1 ) END IF GO TO 190 * Test DSYRK, 05. 170 IF (CORDER) THEN CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 0 ) END IF IF (RORDER) THEN CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 1 ) END IF GO TO 190 * Test DSYR2K, 06. 180 IF (CORDER) THEN CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, $ 0 ) END IF IF (RORDER) THEN CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, $ 1 ) END IF GO TO 190 * 190 IF( FATAL.AND.SFATAL ) $ GO TO 210 END IF 200 CONTINUE WRITE( NOUT, FMT = 9986 ) GO TO 230 * 210 CONTINUE WRITE( NOUT, FMT = 9985 ) GO TO 230 * 220 CONTINUE WRITE( NOUT, FMT = 9991 ) * 230 CONTINUE IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) STOP * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) 10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', $ 'S THAN', F8.2 ) 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 ) 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', $ 'THAN ', I2 ) 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) 9995 FORMAT( ' TESTS OF THE DOUBLE PRECISION LEVEL 3 BLAS', //' THE F', $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) 9994 FORMAT( ' FOR N ', 9I6 ) 9993 FORMAT( ' FOR ALPHA ', 7F6.1 ) 9992 FORMAT( ' FOR BETA ', 7F6.1 ) 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', $ /' ******* TESTS ABANDONED *******' ) 9990 FORMAT( ' SUBPROGRAM NAME ', A12,' NOT RECOGNIZED', /' ******* T', $ 'ESTS ABANDONED *******' ) 9989 FORMAT( ' ERROR IN DMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', $ 'ATED WRONGLY.', /' DMMCH WAS CALLED WITH TRANSA = ', A1, $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', $ '*******' ) 9988 FORMAT( A12,L2 ) 9987 FORMAT( 1X, A12,' WAS NOT TESTED' ) 9986 FORMAT( /' END OF TESTS' ) 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) * * End of DBLAT3. * END SUBROUTINE DCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, IORDER) * * Tests DGEMM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D0 ) * .. Scalar Arguments .. 
DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, $ MA, MB, MS, N, NA, NARGS, NB, NC, NS LOGICAL NULL, RESET, SAME, TRANA, TRANB CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB CHARACTER*3 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL CDGEMM, DMAKE, DMMCH * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICH/'NTC'/ * .. Executable Statements .. * NARGS = 13 NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 110 IM = 1, NIDIM M = IDIM( IM ) * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICA = 1, 3 TRANSA = ICH( ICA: ICA ) TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' * IF( TRANA )THEN MA = K NA = M ELSE MA = M NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. * CALL DMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICB = 1, 3 TRANSB = ICH( ICB: ICB ) TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' * IF( TRANB )THEN MB = N NB = K ELSE MB = K NB = N END IF * Set LDB to 1 more than minimum value if room. LDB = MB IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 70 LBB = LDB*NB * * Generate the matrix B. * CALL DMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, $ LDB, RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL DMAKE( 'GE', ' ', ' ', M, N, C, NMAX, $ CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * TRANAS = TRANSA TRANBS = TRANSB MS = M NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ CALL DPRCN1(NTRA, NC, SNAME, IORDER, $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, $ LDB, BETA, LDC) IF( REWI ) $ REWIND NTRA CALL CDGEMM( IORDER, TRANSA, TRANSB, M, N, $ K, ALPHA, AA, LDA, BB, LDB, $ BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. 
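*                       Every input argument saved above must compare
*                       equal after the call; only the output array CC
*                       may change, and then only inside its leading
*                       M by N block.  LDERES( 'GE', ... ) confirms
*                       that the padding rows between M and LDC in
*                       each column of CC were left untouched.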
* ISAME( 1 ) = TRANSA.EQ.TRANAS ISAME( 2 ) = TRANSB.EQ.TRANBS ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = KS.EQ.K ISAME( 6 ) = ALS.EQ.ALPHA ISAME( 7 ) = LDE( AS, AA, LAA ) ISAME( 8 ) = LDAS.EQ.LDA ISAME( 9 ) = LDE( BS, BB, LBB ) ISAME( 10 ) = LDBS.EQ.LDB ISAME( 11 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 12 ) = LDE( CS, CC, LCC ) ELSE ISAME( 12 ) = LDERES( 'GE', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 13 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report * and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL DMMCH( TRANSA, TRANSB, M, N, K, $ ALPHA, A, NMAX, B, NMAX, BETA, $ C, NMAX, CT, G, CC, LDC, EPS, $ ERR, FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 120 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME CALL DPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) * 130 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', $ 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', ', $ 'C,', I3, ').' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK1. * END SUBROUTINE DPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N, $ K, ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC DOUBLE PRECISION ALPHA, BETA CHARACTER*1 TRANSA, TRANSB CHARACTER*12 SNAME CHARACTER*14 CRC, CTA,CTB IF (TRANSA.EQ.'N')THEN CTA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CTA = ' CblasTrans' ELSE CTA = 'CblasConjTrans' END IF IF (TRANSB.EQ.'N')THEN CTB = ' CblasNoTrans' ELSE IF (TRANSB.EQ.'T')THEN CTB = ' CblasTrans' ELSE CTB = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') 9994 FORMAT( 20X, 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', $ F4.1, ', ', 'C,', I3, ').' ) END * SUBROUTINE DCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, IORDER) * * Tests DSYMM. 
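*  A is generated as a full symmetric matrix, so the expected result
*  can be formed by DMMCH as an ordinary matrix product: A*B when
*  SIDE is 'L' and B*A when SIDE is 'R'.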
* * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, $ NARGS, NC, NS LOGICAL LEFT, NULL, RESET, SAME CHARACTER*1 SIDE, SIDES, UPLO, UPLOS CHARACTER*2 ICHS, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL DMAKE, DMMCH, CDSYMM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICHS/'LR'/, ICHU/'UL'/ * .. Executable Statements .. * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 100 IM = 1, NIDIM M = IDIM( IM ) * DO 90 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 90 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 90 LBB = LDB*N * * Generate the matrix B. * CALL DMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, $ ZERO ) * DO 80 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' * IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * * Generate the symmetric matrix A. * CALL DMAKE( 'SY', UPLO, ' ', NA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL DMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, $ LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO MS = M NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ CALL DPRCN2(NTRA, NC, SNAME, IORDER, $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, $ BETA, LDC) IF( REWI ) $ REWIND NTRA CALL CDSYMM( IORDER, SIDE, UPLO, M, N, ALPHA, $ AA, LDA, BB, LDB, BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 110 END IF * * See what data changed inside subroutines. 
* ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LDE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LDE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB ISAME( 10 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 11 ) = LDE( CS, CC, LCC ) ELSE ISAME( 11 ) = LDERES( 'GE', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 110 END IF * IF( .NOT.NULL )THEN * * Check the result. * IF( LEFT )THEN CALL DMMCH( 'N', 'N', M, N, M, ALPHA, A, $ NMAX, B, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL DMMCH( 'N', 'N', M, N, N, ALPHA, B, $ NMAX, A, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 120 * 110 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME CALL DPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, $ LDB, BETA, LDC) * 120 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', $ ' .' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK2. * END * SUBROUTINE DPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, $ ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC DOUBLE PRECISION ALPHA, BETA CHARACTER*1 SIDE, UPLO CHARACTER*12 SNAME CHARACTER*14 CRC, CS,CU IF (SIDE.EQ.'L')THEN CS = ' CblasLeft' ELSE CS = ' CblasRight' END IF IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') 9994 FORMAT( 20X, 2( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', $ F4.1, ', ', 'C,', I3, ').' ) END * SUBROUTINE DCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, $ B, BB, BS, CT, G, C, IORDER ) * * Tests DTRMM and DTRSM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. 
* Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CT( NMAX ), G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, $ NS LOGICAL LEFT, NULL, RESET, SAME CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, $ UPLOS CHARACTER*2 ICHD, ICHS, ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL DMAKE, DMMCH, CDTRMM, CDTRSM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ * .. Executable Statements .. * NARGS = 11 NC = 0 RESET = .TRUE. ERRMAX = ZERO * Set up zero matrix for DMMCH. DO 20 J = 1, NMAX DO 10 I = 1, NMAX C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE * DO 140 IM = 1, NIDIM M = IDIM( IM ) * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 130 LBB = LDB*N NULL = M.LE.0.OR.N.LE.0 * DO 120 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 130 LAA = LDA*NA * DO 110 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * DO 100 ICT = 1, 3 TRANSA = ICHT( ICT: ICT ) * DO 90 ICD = 1, 2 DIAG = ICHD( ICD: ICD ) * DO 80 IA = 1, NALF ALPHA = ALF( IA ) * * Generate the matrix A. * CALL DMAKE( 'TR', UPLO, DIAG, NA, NA, A, $ NMAX, AA, LDA, RESET, ZERO ) * * Generate the matrix B. * CALL DMAKE( 'GE', ' ', ' ', M, N, B, NMAX, $ BB, LDB, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO TRANAS = TRANSA DIAGS = DIAG MS = M NS = N ALS = ALPHA DO 30 I = 1, LAA AS( I ) = AA( I ) 30 CONTINUE LDAS = LDA DO 40 I = 1, LBB BS( I ) = BB( I ) 40 CONTINUE LDBS = LDB * * Call the subroutine. * IF( SNAME( 10: 11 ).EQ.'mm' )THEN IF( TRACE ) $ CALL DPRCN3( NTRA, NC, SNAME, IORDER, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB) IF( REWI ) $ REWIND NTRA CALL CDTRMM( IORDER, SIDE, UPLO, TRANSA, $ DIAG, M, N, ALPHA, AA, LDA, $ BB, LDB ) ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN IF( TRACE ) $ CALL DPRCN3( NTRA, NC, SNAME, IORDER, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB) IF( REWI ) $ REWIND NTRA CALL CDTRSM( IORDER, SIDE, UPLO, TRANSA, $ DIAG, M, N, ALPHA, AA, LDA, $ BB, LDB ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. 
* ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = TRANAS.EQ.TRANSA ISAME( 4 ) = DIAGS.EQ.DIAG ISAME( 5 ) = MS.EQ.M ISAME( 6 ) = NS.EQ.N ISAME( 7 ) = ALS.EQ.ALPHA ISAME( 8 ) = LDE( AS, AA, LAA ) ISAME( 9 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 10 ) = LDE( BS, BB, LBB ) ELSE ISAME( 10 ) = LDERES( 'GE', ' ', M, N, BS, $ BB, LDB ) END IF ISAME( 11 ) = LDBS.EQ.LDB * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 50 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 50 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN IF( SNAME( 10: 11 ).EQ.'mm' )THEN * * Check the result. * IF( LEFT )THEN CALL DMMCH( TRANSA, 'N', M, N, M, $ ALPHA, A, NMAX, B, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL DMMCH( 'N', TRANSA, M, N, N, $ ALPHA, B, NMAX, A, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN * * Compute approximation to original * matrix. * DO 70 J = 1, N DO 60 I = 1, M C( I, J ) = BB( I + ( J - 1 )* $ LDB ) BB( I + ( J - 1 )*LDB ) = ALPHA* $ B( I, J ) 60 CONTINUE 70 CONTINUE * IF( LEFT )THEN CALL DMMCH( TRANSA, 'N', M, N, M, $ ONE, A, NMAX, C, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) ELSE CALL DMMCH( 'N', TRANSA, M, N, N, $ ONE, C, NMAX, A, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) END IF END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 150 END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * 140 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 160 * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( TRACE ) $ CALL DPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, $ M, N, ALPHA, LDA, LDB) * 160 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ', B,', I3, ') .' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK3. 
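*  Note that DTRSM was verified above by a residual test: the computed
*  solution X overwrites B, so the checker rebuilds alpha*B from X
*  ( op( A )*X on the left, X*op( A ) on the right ) and lets DMMCH
*  compare it with the saved, scaled right-hand side.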
* END * SUBROUTINE DPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, $ DIAG, M, N, ALPHA, LDA, LDB) INTEGER NOUT, NC, IORDER, M, N, LDA, LDB DOUBLE PRECISION ALPHA CHARACTER*1 SIDE, UPLO, TRANSA, DIAG CHARACTER*12 SNAME CHARACTER*14 CRC, CS, CU, CA, CD IF (SIDE.EQ.'L')THEN CS = ' CblasLeft' ELSE CS = ' CblasRight' END IF IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (DIAG.EQ.'N')THEN CD = ' CblasNonUnit' ELSE CD = ' CblasUnit' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') 9994 FORMAT( 22X, 2( A14, ',') , 2( I3, ',' ), $ F4.1, ', A,', I3, ', B,', I3, ').' ) END * SUBROUTINE DCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, IORDER) * * Tests DSYRK. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, ALS, BETA, BETS, ERR, ERRMAX INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, $ NARGS, NC, NS LOGICAL NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS CHARACTER*2 ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL DMAKE, DMMCH, CDSYRK * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICHT/'NTC'/, ICHU/'UL'/ * .. Executable Statements .. * NARGS = 10 NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N NULL = N.LE.0 * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICT = 1, 3 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. * CALL DMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. 
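*                    C is generated as a symmetric matrix, but DSYRK
*                    only references the triangle selected by UPLO;
*                    the LDERES( 'SY', UPLO, ... ) comparison further
*                    below checks that the part of CC outside that
*                    triangle was left unchanged.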
* CALL DMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, $ LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA BETS = BETA DO 20 I = 1, LCC CS( I ) = CC( I ) 20 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ CALL DPRCN4( NTRA, NC, SNAME, IORDER, UPLO, $ TRANS, N, K, ALPHA, LDA, BETA, LDC) IF( REWI ) $ REWIND NTRA CALL CDSYRK( IORDER, UPLO, TRANS, N, K, ALPHA, $ AA, LDA, BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LDE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = BETS.EQ.BETA IF( NULL )THEN ISAME( 9 ) = LDE( CS, CC, LCC ) ELSE ISAME( 9 ) = LDERES( 'SY', UPLO, N, N, CS, $ CC, LDC ) END IF ISAME( 10 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 30 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 30 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * JC = 1 DO 40 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN CALL DMMCH( 'T', 'N', LJ, 1, K, ALPHA, $ A( 1, JJ ), NMAX, $ A( 1, J ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL DMMCH( 'N', 'T', LJ, 1, K, ALPHA, $ A( JJ, 1 ), NMAX, $ A( J, 1 ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 40 CONTINUE END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 130 * 110 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME CALL DPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, $ LDA, BETA, LDC) * 130 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') .' 
) 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK4. * END * SUBROUTINE DPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, $ N, K, ALPHA, LDA, BETA, LDC) INTEGER NOUT, NC, IORDER, N, K, LDA, LDC DOUBLE PRECISION ALPHA, BETA CHARACTER*1 UPLO, TRANSA CHARACTER*12 SNAME CHARACTER*14 CRC, CU, CA IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) 9994 FORMAT( 20X, 2( I3, ',' ), $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' ) END * SUBROUTINE DCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, $ IORDER ) * * Tests DSYR2K. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. DOUBLE PRECISION AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ G( NMAX ), W( 2*NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, ALS, BETA, BETS, ERR, ERRMAX INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS LOGICAL NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS CHARACTER*2 ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL DMAKE, DMMCH, CDSYR2K * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICHT/'NTC'/, ICHU/'UL'/ * .. Executable Statements .. * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 130 LCC = LDC*N NULL = N.LE.0 * DO 120 IK = 1, NIDIM K = IDIM( IK ) * DO 110 ICT = 1, 3 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 110 LAA = LDA*NA * * Generate the matrix A. * IF( TRAN )THEN CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, $ LDA, RESET, ZERO ) ELSE CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, $ RESET, ZERO ) END IF * * Generate the matrix B. 
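*                 Note: A and B share the workspace AB.  In the
*                 transposed cases B is stored K rows below A (starting
*                 at AB( K + 1 ) with leading dimension 2*NMAX);
*                 otherwise it is stored K columns to the right
*                 (starting at AB( K*NMAX + 1 )).  This layout lets the
*                 DSYR2K result be checked further down with a single
*                 DMMCH call on a 2*K-term product, W holding the
*                 matching column built from B and A.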
* LDB = LDA LBB = LAA IF( TRAN )THEN CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), $ 2*NMAX, BB, LDB, RESET, ZERO ) ELSE CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), $ NMAX, BB, LDB, RESET, ZERO ) END IF * DO 100 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 90 IA = 1, NALF ALPHA = ALF( IA ) * DO 80 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL DMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, $ LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BETS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ CALL DPRCN5( NTRA, NC, SNAME, IORDER, UPLO, $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC) IF( REWI ) $ REWIND NTRA CALL CDSYR2K( IORDER, UPLO, TRANS, N, K, $ ALPHA, AA, LDA, BB, LDB, BETA, $ CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LDE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LDE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB ISAME( 10 ) = BETS.EQ.BETA IF( NULL )THEN ISAME( 11 ) = LDE( CS, CC, LCC ) ELSE ISAME( 11 ) = LDERES( 'SY', UPLO, N, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * JJAB = 1 JC = 1 DO 70 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN DO 50 I = 1, K W( I ) = AB( ( J - 1 )*2*NMAX + K + $ I ) W( K + I ) = AB( ( J - 1 )*2*NMAX + $ I ) 50 CONTINUE CALL DMMCH( 'T', 'N', LJ, 1, 2*K, $ ALPHA, AB( JJAB ), 2*NMAX, $ W, 2*NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE DO 60 I = 1, K W( I ) = AB( ( K + I - 1 )*NMAX + $ J ) W( K + I ) = AB( ( I - 1 )*NMAX + $ J ) 60 CONTINUE CALL DMMCH( 'N', 'N', LJ, 1, 2*K, $ ALPHA, AB( JJ ), NMAX, W, $ 2*NMAX, BETA, C( JJ, J ), $ NMAX, CT, G, CC( JC ), LDC, $ EPS, ERR, FATAL, NOUT, $ .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 IF( TRAN ) $ JJAB = JJAB + 2*NMAX END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 140 70 CONTINUE END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 160 * 140 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME CALL DPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, $ LDA, LDB, BETA, LDC) * 160 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', $ ' .' ) 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK5. * END * SUBROUTINE DPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, $ N, K, ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC DOUBLE PRECISION ALPHA, BETA CHARACTER*1 UPLO, TRANSA CHARACTER*12 SNAME CHARACTER*14 CRC, CU, CA IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) 9994 FORMAT( 20X, 2( I3, ',' ), $ F4.1, ', A,', I3, ', B', I3, ',', F4.1, ', C,', I3, ').' ) END * SUBROUTINE DMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, $ TRANSL ) * * Generates values for an M by N matrix A. * Stores the values in the array AA in the data structure required * by the routine, with unwanted elements set to rogue value. * * TYPE is 'GE', 'SY' or 'TR'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) DOUBLE PRECISION ROGUE PARAMETER ( ROGUE = -1.0D10 ) * .. Scalar Arguments .. DOUBLE PRECISION TRANSL INTEGER LDA, M, N, NMAX LOGICAL RESET CHARACTER*1 DIAG, UPLO CHARACTER*2 TYPE * .. Array Arguments .. DOUBLE PRECISION A( NMAX, * ), AA( * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER * .. External Functions .. DOUBLE PRECISION DBEG EXTERNAL DBEG * .. Executable Statements .. GEN = TYPE.EQ.'GE' SYM = TYPE.EQ.'SY' TRI = TYPE.EQ.'TR' UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' UNIT = TRI.AND.DIAG.EQ.'U' * * Generate data in array A. 
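*     Note: in the second half of this routine the matrix is copied
*     into the storage layout expected by the routine under test, and
*     every location that must not be referenced (rows beyond M for
*     'GE', the opposite triangle for 'SY' and 'TR') is set to the
*     rogue value -1.0D10.  Any accidental use of those locations then
*     shows up in the numerical results, and LDERES checks afterwards
*     that they were not overwritten.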
* DO 20 J = 1, N DO 10 I = 1, M IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) $ THEN A( I, J ) = DBEG( RESET ) + TRANSL IF( I.NE.J )THEN * Set some elements to zero IF( N.GT.3.AND.J.EQ.N/2 ) $ A( I, J ) = ZERO IF( SYM )THEN A( J, I ) = A( I, J ) ELSE IF( TRI )THEN A( J, I ) = ZERO END IF END IF END IF 10 CONTINUE IF( TRI ) $ A( J, J ) = A( J, J ) + ONE IF( UNIT ) $ A( J, J ) = ONE 20 CONTINUE * * Store elements in array AS in data structure required by routine. * IF( TYPE.EQ.'GE' )THEN DO 50 J = 1, N DO 30 I = 1, M AA( I + ( J - 1 )*LDA ) = A( I, J ) 30 CONTINUE DO 40 I = M + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 40 CONTINUE 50 CONTINUE ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN DO 90 J = 1, N IF( UPPER )THEN IBEG = 1 IF( UNIT )THEN IEND = J - 1 ELSE IEND = J END IF ELSE IF( UNIT )THEN IBEG = J + 1 ELSE IBEG = J END IF IEND = N END IF DO 60 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 60 CONTINUE DO 70 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I, J ) 70 CONTINUE DO 80 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 80 CONTINUE 90 CONTINUE END IF RETURN * * End of DMAKE. * END SUBROUTINE DMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, $ NOUT, MV ) * * Checks the results of the computational tests. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION ALPHA, BETA, EPS, ERR INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT LOGICAL FATAL, MV CHARACTER*1 TRANSA, TRANSB * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), B( LDB, * ), C( LDC, * ), $ CC( LDCC, * ), CT( * ), G( * ) * .. Local Scalars .. DOUBLE PRECISION ERRI INTEGER I, J, K LOGICAL TRANA, TRANB * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT * .. Executable Statements .. TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' * * Compute expected result, one column at a time, in CT using data * in A, B and C. * Compute gauges in G. * DO 120 J = 1, N * DO 10 I = 1, M CT( I ) = ZERO G( I ) = ZERO 10 CONTINUE IF( .NOT.TRANA.AND..NOT.TRANB )THEN DO 30 K = 1, KK DO 20 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( K, J ) G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( K, J ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRANA.AND..NOT.TRANB )THEN DO 50 K = 1, KK DO 40 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( K, J ) G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( K, J ) ) 40 CONTINUE 50 CONTINUE ELSE IF( .NOT.TRANA.AND.TRANB )THEN DO 70 K = 1, KK DO 60 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( J, K ) G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( J, K ) ) 60 CONTINUE 70 CONTINUE ELSE IF( TRANA.AND.TRANB )THEN DO 90 K = 1, KK DO 80 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( J, K ) G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( J, K ) ) 80 CONTINUE 90 CONTINUE END IF DO 100 I = 1, M CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) G( I ) = ABS( ALPHA )*G( I ) + ABS( BETA )*ABS( C( I, J ) ) 100 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO DO 110 I = 1, M ERRI = ABS( CT( I ) - CC( I, J ) )/EPS IF( G( I ).NE.ZERO ) $ ERRI = ERRI/G( I ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ GO TO 130 110 CONTINUE * 120 CONTINUE * * If the loop completes, all results are at least half accurate. GO TO 150 * * Report fatal error. 
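*     Note: the quantity tested above for element I of column J is
*        ERRI = ABS( CT( I ) - CC( I, J ) )/( EPS*G( I ) )
*     (the division by G( I ) is skipped when the gauge is zero),
*     where G( I ) accumulates ABS( ALPHA )*|A|*|B| + ABS( BETA )*|C|.
*     The result is accepted while ERR*SQRT( EPS ) stays below ONE,
*     i.e. while computed and recomputed values agree to at least half
*     the working precision.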
* 130 FATAL = .TRUE. WRITE( NOUT, FMT = 9999 ) DO 140 I = 1, M IF( MV )THEN WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) ELSE WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) END IF 140 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9997 )J * 150 CONTINUE RETURN * 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', $ 'TED RESULT' ) 9998 FORMAT( 1X, I7, 2G18.6 ) 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) * * End of DMMCH. * END LOGICAL FUNCTION LDE( RI, RJ, LR ) * * Tests if two arrays are identical. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LR * .. Array Arguments .. DOUBLE PRECISION RI( * ), RJ( * ) * .. Local Scalars .. INTEGER I * .. Executable Statements .. DO 10 I = 1, LR IF( RI( I ).NE.RJ( I ) ) $ GO TO 20 10 CONTINUE LDE = .TRUE. GO TO 30 20 CONTINUE LDE = .FALSE. 30 RETURN * * End of LDE. * END LOGICAL FUNCTION LDERES( TYPE, UPLO, M, N, AA, AS, LDA ) * * Tests if selected elements in two arrays are equal. * * TYPE is 'GE' or 'SY'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LDA, M, N CHARACTER*1 UPLO CHARACTER*2 TYPE * .. Array Arguments .. DOUBLE PRECISION AA( LDA, * ), AS( LDA, * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL UPPER * .. Executable Statements .. UPPER = UPLO.EQ.'U' IF( TYPE.EQ.'GE' )THEN DO 20 J = 1, N DO 10 I = M + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 10 CONTINUE 20 CONTINUE ELSE IF( TYPE.EQ.'SY' )THEN DO 50 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 30 I = 1, IBEG - 1 IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 30 CONTINUE DO 40 I = IEND + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 40 CONTINUE 50 CONTINUE END IF * 60 CONTINUE LDERES = .TRUE. GO TO 80 70 CONTINUE LDERES = .FALSE. 80 RETURN * * End of LDERES. * END DOUBLE PRECISION FUNCTION DBEG( RESET ) * * Generates random numbers uniformly distributed between -0.5 and 0.5. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. LOGICAL RESET * .. Local Scalars .. INTEGER I, IC, MI * .. Save statement .. SAVE I, IC, MI * .. Executable Statements .. IF( RESET )THEN * Initialize local variables. MI = 891 I = 7 IC = 0 RESET = .FALSE. END IF * * The sequence of values of I is bounded between 1 and 999. * If initial I = 1,2,3,6,7 or 9, the period will be 50. * If initial I = 4 or 8, the period will be 25. * If initial I = 5, the period will be 10. * IC is used to break up the period by skipping 1 value of I in 6. * IC = IC + 1 10 I = I*MI I = I - 1000*( I/1000 ) IF( IC.GE.5 )THEN IC = 0 GO TO 10 END IF DBEG = ( I - 500 )/1001.0D0 RETURN * * End of DBEG. * END DOUBLE PRECISION FUNCTION DDIFF( X, Y ) * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. 
* Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. DOUBLE PRECISION X, Y * .. Executable Statements .. DDIFF = X - Y RETURN * * End of DDIFF. * END OpenBLAS-0.2.20/ctest/c_s2chke.c000066400000000000000000000763651313527062700161340ustar00rootroot00000000000000#include #include #include "common.h" #include "cblas_test.h" int cblas_ok, cblas_lerr, cblas_info; int link_xerbla=TRUE; char *cblas_rout; #ifdef F77_Char void F77_xerbla(F77_Char F77_srname, void *vinfo); #else void F77_xerbla(char *srname, void *vinfo); #endif void chkxer(void) { extern int cblas_ok, cblas_lerr, cblas_info; extern int link_xerbla; extern char *cblas_rout; if (cblas_lerr == 1 ) { printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); cblas_ok = 0 ; } cblas_lerr = 1 ; } void F77_s2chke(char *rout) { char *sf = ( rout ) ; float A[2] = {0.0,0.0}, X[2] = {0.0,0.0}, Y[2] = {0.0,0.0}, ALPHA=0.0, BETA=0.0; extern int cblas_info, cblas_lerr, cblas_ok; extern int RowMajorStrg; extern char *cblas_rout; if (link_xerbla) /* call these first to link */ { cblas_xerbla(cblas_info,cblas_rout,""); F77_xerbla(cblas_rout,&cblas_info); } cblas_ok = TRUE ; cblas_lerr = PASSED ; if (strncmp( sf,"cblas_sgemv",11)==0) { cblas_rout = "cblas_sgemv"; cblas_info = 1; cblas_sgemv(INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_sgemv(CblasColMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_sgemv(CblasColMajor, CblasNoTrans, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_sgemv(CblasColMajor, CblasNoTrans, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_sgemv(CblasColMajor, CblasNoTrans, 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_sgemv(CblasColMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_sgemv(CblasColMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE; cblas_sgemv(CblasRowMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_sgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_sgbmv",11)==0) { cblas_rout = "cblas_sgbmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_sgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_sgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_sgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; 
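   /*
    * Note on the pattern used throughout this file: before each call,
    * cblas_info is set to the (1-based) position of the argument that is
    * deliberately wrong, and the routine is invoked with INVALID (or an
    * inconsistent dimension/increment) in that position.  The substitute
    * xerbla supplied by the test harness is expected to be entered with
    * that parameter number and to clear cblas_lerr; chkxer() then reports
    * the routine and parameter number and clears cblas_ok if the illegal
    * value went undetected.
    */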
cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_sgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_sgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_sgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_sgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ssymv",11)==0) { cblas_rout = "cblas_ssymv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_ssymv(INVALID, CblasUpper, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ssymv(CblasColMajor, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ssymv(CblasColMajor, CblasUpper, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ssymv(CblasColMajor, CblasUpper, 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ssymv(CblasColMajor, CblasUpper, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_ssymv(CblasColMajor, CblasUpper, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_ssymv(CblasRowMajor, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_ssymv(CblasRowMajor, CblasUpper, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ssymv(CblasRowMajor, CblasUpper, 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ssymv(CblasRowMajor, CblasUpper, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_ssymv(CblasRowMajor, CblasUpper, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ssbmv",11)==0) { cblas_rout = "cblas_ssbmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_ssbmv(INVALID, CblasUpper, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; 
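   /*
    * Each routine's block runs essentially the same checks twice, first
    * for CblasColMajor and then for CblasRowMajor.  RowMajorStrg mirrors
    * the Order argument of the call that follows, presumably so that the
    * substitute xerbla can map the expected parameter number onto the
    * row-major form of the call.
    */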
cblas_ssbmv(CblasColMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ssbmv(CblasColMajor, CblasUpper, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ssbmv(CblasColMajor, CblasUpper, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ssbmv(CblasColMajor, CblasUpper, 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_ssbmv(CblasColMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ssbmv(CblasColMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_ssbmv(CblasRowMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_ssbmv(CblasRowMajor, CblasUpper, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_ssbmv(CblasRowMajor, CblasUpper, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ssbmv(CblasRowMajor, CblasUpper, 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_ssbmv(CblasRowMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ssbmv(CblasRowMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_sspmv",11)==0) { cblas_rout = "cblas_sspmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_sspmv(INVALID, CblasUpper, 0, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_sspmv(CblasColMajor, INVALID, 0, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_sspmv(CblasColMajor, CblasUpper, INVALID, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_sspmv(CblasColMajor, CblasUpper, 0, ALPHA, A, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_sspmv(CblasColMajor, CblasUpper, 0, ALPHA, A, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_sspmv(CblasRowMajor, INVALID, 0, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_sspmv(CblasRowMajor, CblasUpper, INVALID, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_sspmv(CblasRowMajor, CblasUpper, 0, ALPHA, A, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_sspmv(CblasRowMajor, CblasUpper, 0, ALPHA, A, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_strmv",11)==0) { cblas_rout = "cblas_strmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_strmv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_strmv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_strmv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; 
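   /*
    * For arguments that are not enumerations the illegal value is an
    * inconsistent size rather than INVALID: a vector increment of 0 (as
    * in the call below) or a leading dimension of 1 combined with an
    * order of 2 is used to provoke the corresponding parameter error.
    */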
cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_strmv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_strmv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_stbmv",11)==0) { cblas_rout = "cblas_stbmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_stbmv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_stbmv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_stbmv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_stbmv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_stbmv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_stpmv",11)==0) { cblas_rout = "cblas_stpmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_stpmv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_stpmv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_stpmv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_stpmv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; 
RowMajorStrg = FALSE; cblas_stpmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_stpmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_stpmv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_stpmv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_stpmv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_stpmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_stpmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_strsv",11)==0) { cblas_rout = "cblas_strsv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_strsv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_strsv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_strsv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_strsv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_strsv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_stbsv",11)==0) { cblas_rout = "cblas_stbsv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_stbsv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_stbsv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_stbsv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = 
FALSE; cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_stbsv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_stbsv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_stpsv",11)==0) { cblas_rout = "cblas_stpsv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_stpsv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_stpsv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_stpsv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_stpsv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_stpsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_stpsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_stpsv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_stpsv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_stpsv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_stpsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_stpsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_sger",10)==0) { cblas_rout = "cblas_sger"; cblas_info = 1; RowMajorStrg = FALSE; cblas_sger(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_sger(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_sger(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_sger(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_sger(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_sger(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_sger(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_sger(CblasRowMajor, 0, 
INVALID, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_sger(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_sger(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_sger(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); } else if (strncmp( sf,"cblas_ssyr2",11)==0) { cblas_rout = "cblas_ssyr2"; cblas_info = 1; RowMajorStrg = FALSE; cblas_ssyr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ssyr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ssyr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ssyr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ssyr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ssyr2(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_ssyr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_ssyr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ssyr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ssyr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ssyr2(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); } else if (strncmp( sf,"cblas_sspr2",11)==0) { cblas_rout = "cblas_sspr2"; cblas_info = 1; RowMajorStrg = FALSE; cblas_sspr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_sspr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_sspr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_sspr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_sspr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_sspr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_sspr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_sspr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_sspr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); chkxer(); } else if (strncmp( sf,"cblas_ssyr",10)==0) { cblas_rout = "cblas_ssyr"; cblas_info = 1; RowMajorStrg = FALSE; cblas_ssyr(INVALID, CblasUpper, 0, ALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ssyr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ssyr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ssyr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ssyr(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_ssyr(CblasRowMajor, INVALID, 0, ALPHA, X, 1, A, 1 ); chkxer(); 
cblas_info = 3; RowMajorStrg = TRUE; cblas_ssyr(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ssyr(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ssyr(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, A, 1 ); chkxer(); } else if (strncmp( sf,"cblas_sspr",10)==0) { cblas_rout = "cblas_sspr"; cblas_info = 1; RowMajorStrg = FALSE; cblas_sspr(INVALID, CblasUpper, 0, ALPHA, X, 1, A ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_sspr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_sspr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_sspr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_sspr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_sspr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_sspr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A ); chkxer(); } if (cblas_ok == TRUE) printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); else printf("******* %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); } OpenBLAS-0.2.20/ctest/c_s3chke.c000066400000000000000000001600301313527062700161140ustar00rootroot00000000000000#include #include #include "common.h" #include "cblas_test.h" int cblas_ok, cblas_lerr, cblas_info; int link_xerbla=TRUE; char *cblas_rout; #ifdef F77_Char void F77_xerbla(F77_Char F77_srname, void *vinfo); #else void F77_xerbla(char *srname, void *vinfo); #endif void chkxer(void) { extern int cblas_ok, cblas_lerr, cblas_info; extern int link_xerbla; extern char *cblas_rout; if (cblas_lerr == 1 ) { printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); cblas_ok = 0 ; } cblas_lerr = 1 ; } void F77_s3chke(char *rout) { char *sf = ( rout ) ; float A[2] = {0.0,0.0}, B[2] = {0.0,0.0}, C[2] = {0.0,0.0}, ALPHA=0.0, BETA=0.0; extern int cblas_info, cblas_lerr, cblas_ok; extern int RowMajorStrg; extern char *cblas_rout; if (link_xerbla) /* call these first to link */ { cblas_xerbla(cblas_info,cblas_rout,""); F77_xerbla(cblas_rout,&cblas_info); } cblas_ok = TRUE ; cblas_lerr = PASSED ; if (strncmp( sf,"cblas_sgemm" ,11)==0) { cblas_rout = "cblas_sgemm" ; cblas_info = 1; cblas_sgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_sgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_sgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_sgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasNoTrans, 
CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 
INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_ssymm" ,11)==0) { cblas_rout = "cblas_ssymm" ; cblas_info = 1; cblas_ssymm( INVALID, CblasRight, 
CblasLower, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, INVALID, CblasUpper, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, CblasLeft, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, CblasRight, CblasLower, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_ssymm( CblasColMajor, CblasRight, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_ssymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_ssymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, ALPHA, A, 
1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_ssymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_ssymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_ssymm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_ssymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ssymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ssymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ssymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ssymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ssymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ssymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_ssymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_ssymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_ssymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_strmm" ,11)==0) { cblas_rout = "cblas_strmm" ; cblas_info = 1; cblas_strmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; 
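   /*
    * In this Level 3 tester each illegal argument position is re-checked
    * for every combination of the option arguments (transpose flags,
    * side and uplo), which is why these blocks are considerably longer
    * than the corresponding Level 2 checks in c_s2chke.c.
    */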
RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, 
CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = 
TRUE; cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); } else if (strncmp( sf,"cblas_strsm" ,11)==0) { cblas_rout = "cblas_strsm" ; cblas_info = 1; cblas_strsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, INVALID, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); 
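/* The cblas_strsm checks mirror the cblas_strmm ones above: each bad
   argument position is exercised for every side/uplo/transpose
   combination, first in column-major and then in row-major order. */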
cblas_info = 6; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasRight, 
CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; 
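/* Position 7 is the N argument of cblas_strsm; the remaining
   row-major calls pass INVALID there before moving on to the
   lda/ldb consistency checks (positions 10 and 12). */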
RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); } else if (strncmp( sf,"cblas_ssyrk" ,11)==0) { cblas_rout = "cblas_ssyrk" ; cblas_info = 1; cblas_ssyrk( INVALID, CblasUpper, CblasNoTrans, 0, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ssyrk( CblasColMajor, INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ssyrk( CblasColMajor, CblasUpper, INVALID, 0, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ssyrk( CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ssyrk( CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ssyrk( CblasColMajor, 
CblasLower, CblasNoTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ssyrk( CblasColMajor, CblasLower, CblasTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ssyrk( CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ssyrk( CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ssyrk( CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ssyrk( CblasColMajor, CblasLower, CblasTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ssyrk( CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ssyrk( CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ssyrk( CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ssyrk( CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ssyrk( CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ssyrk( CblasColMajor, CblasUpper, CblasTrans, 0, 2, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ssyrk( CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ssyrk( CblasColMajor, CblasLower, CblasTrans, 0, 2, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_ssyrk( CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_ssyrk( CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_ssyrk( CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_ssyrk( CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_ssyrk( CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_ssyrk( CblasColMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_ssyrk( CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_ssyrk( CblasColMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_ssyr2k" ,12)==0) { cblas_rout = "cblas_ssyr2k" ; cblas_info = 1; cblas_ssyr2k( INVALID, CblasUpper, CblasNoTrans, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, CblasUpper, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; 
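/* For cblas_ssyr2k, position 4 is N and position 5 is K; the calls
   below pass INVALID for each in turn before checking the lda, ldb
   and ldc arguments (positions 8, 10 and 13). */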
RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, CblasLower, CblasTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, CblasLower, CblasTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ssyr2k( CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ssyr2k( CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, CblasUpper, CblasTrans, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, CblasLower, CblasTrans, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ssyr2k( CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ssyr2k( CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, CblasUpper, CblasTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, CblasLower, CblasTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); 
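/* cblas_ok stays TRUE only if every check above passed; it selects
   the PASSED/FAILED summary printed at the end of this routine. */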
cblas_info = 13; RowMajorStrg = TRUE; cblas_ssyr2k( CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_ssyr2k( CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_ssyr2k( CblasColMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); } if (cblas_ok == TRUE ) printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); else printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); } OpenBLAS-0.2.20/ctest/c_sblas1.c000066400000000000000000000034031313527062700161210ustar00rootroot00000000000000/* * c_sblas1.c * * The program is a C wrapper for scblat1. * * Written by Keita Teranishi. 2/11/1998 * */ #include "common.h" #include "cblas_test.h" float F77_sasum(blasint *N, float *X, blasint *incX) { return cblas_sasum(*N, X, *incX); } void F77_saxpy(blasint *N, const float *alpha, const float *X, blasint *incX, float *Y, blasint *incY) { cblas_saxpy(*N, *alpha, X, *incX, Y, *incY); return; } float F77_scasum(blasint *N, float *X, blasint *incX) { return cblas_scasum(*N, X, *incX); } float F77_scnrm2(blasint *N, const float *X, blasint *incX) { return cblas_scnrm2(*N, X, *incX); } void F77_scopy(blasint *N, const float *X, blasint *incX, float *Y, blasint *incY) { cblas_scopy(*N, X, *incX, Y, *incY); return; } float F77_sdot(blasint *N, const float *X, blasint *incX, const float *Y, blasint *incY) { return cblas_sdot(*N, X, *incX, Y, *incY); } float F77_snrm2(blasint *N, const float *X, blasint *incX) { return cblas_snrm2(*N, X, *incX); } void F77_srotg( float *a, float *b, float *c, float *s) { cblas_srotg(a,b,c,s); return; } void F77_srot( blasint *N, float *X, blasint *incX, float *Y, blasint *incY, const float *c, const float *s) { cblas_srot(*N,X,*incX,Y,*incY,*c,*s); return; } void F77_sscal(blasint *N, const float *alpha, float *X, blasint *incX) { cblas_sscal(*N, *alpha, X, *incX); return; } void F77_sswap( blasint *N, float *X, blasint *incX, float *Y, blasint *incY) { cblas_sswap(*N,X,*incX,Y,*incY); return; } int F77_isamax(blasint *N, const float *X, blasint *incX) { if (*N < 1 || *incX < 1) return(0); return (cblas_isamax(*N, X, *incX)+1); } OpenBLAS-0.2.20/ctest/c_sblas2.c000066400000000000000000000403671313527062700161340ustar00rootroot00000000000000/* * Written by D.P. Manley, Digital Equipment Corporation. * Prefixed "C_" to BLAS routines and their declarations. * * Modified by T. H. Do, 1/23/98, SGI/CRAY Research. 
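 *
 * These F77_* wrappers adapt the Fortran test drivers to the CBLAS
 * interface: arguments arrive by reference, and when the test order
 * is row-major each wrapper copies the column-major input into a
 * scratch row-major array before calling the cblas_ routine.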
*/ #include #include "common.h" #include "cblas_test.h" void F77_sgemv(int *order, char *transp, int *m, int *n, float *alpha, float *a, int *lda, float *x, int *incx, float *beta, float *y, int *incy ) { float *A; int i,j,LDA; enum CBLAS_TRANSPOSE trans; get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *n+1; A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; cblas_sgemv( CblasRowMajor, trans, *m, *n, *alpha, A, LDA, x, *incx, *beta, y, *incy ); free(A); } else if (*order == TEST_COL_MJR) cblas_sgemv( CblasColMajor, trans, *m, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy ); else cblas_sgemv( UNDEFINED, trans, *m, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy ); } void F77_sger(int *order, int *m, int *n, float *alpha, float *x, int *incx, float *y, int *incy, float *a, int *lda ) { float *A; int i,j,LDA; if (*order == TEST_ROW_MJR) { LDA = *n+1; A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); for( i=0; i<*m; i++ ) { for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; } cblas_sger(CblasRowMajor, *m, *n, *alpha, x, *incx, y, *incy, A, LDA ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) a[ (*lda)*j+i ]=A[ LDA*i+j ]; free(A); } else cblas_sger( CblasColMajor, *m, *n, *alpha, x, *incx, y, *incy, a, *lda ); } void F77_strmv(int *order, char *uplow, char *transp, char *diagn, int *n, float *a, int *lda, float *x, int *incx) { float *A; int i,j,LDA; enum CBLAS_TRANSPOSE trans; enum CBLAS_UPLO uplo; enum CBLAS_DIAG diag; get_transpose_type(transp,&trans); get_uplo_type(uplow,&uplo); get_diag_type(diagn,&diag); if (*order == TEST_ROW_MJR) { LDA = *n+1; A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; cblas_strmv(CblasRowMajor, uplo, trans, diag, *n, A, LDA, x, *incx); free(A); } else if (*order == TEST_COL_MJR) cblas_strmv(CblasColMajor, uplo, trans, diag, *n, a, *lda, x, *incx); else { cblas_strmv(UNDEFINED, uplo, trans, diag, *n, a, *lda, x, *incx); } } void F77_strsv(int *order, char *uplow, char *transp, char *diagn, int *n, float *a, int *lda, float *x, int *incx ) { float *A; int i,j,LDA; enum CBLAS_TRANSPOSE trans; enum CBLAS_UPLO uplo; enum CBLAS_DIAG diag; get_transpose_type(transp,&trans); get_uplo_type(uplow,&uplo); get_diag_type(diagn,&diag); if (*order == TEST_ROW_MJR) { LDA = *n+1; A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; cblas_strsv(CblasRowMajor, uplo, trans, diag, *n, A, LDA, x, *incx ); free(A); } else cblas_strsv(CblasColMajor, uplo, trans, diag, *n, a, *lda, x, *incx ); } void F77_ssymv(int *order, char *uplow, int *n, float *alpha, float *a, int *lda, float *x, int *incx, float *beta, float *y, int *incy) { float *A; int i,j,LDA; enum CBLAS_UPLO uplo; get_uplo_type(uplow,&uplo); if (*order == TEST_ROW_MJR) { LDA = *n+1; A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; cblas_ssymv(CblasRowMajor, uplo, *n, *alpha, A, LDA, x, *incx, *beta, y, *incy ); free(A); } else cblas_ssymv(CblasColMajor, uplo, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy ); } void F77_ssyr(int *order, char *uplow, int *n, float *alpha, float *x, int *incx, float *a, int *lda) { float *A; int i,j,LDA; enum CBLAS_UPLO uplo; get_uplo_type(uplow,&uplo); if (*order == TEST_ROW_MJR) { LDA = *n+1; A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ 
) A[ LDA*i+j ]=a[ (*lda)*j+i ]; cblas_ssyr(CblasRowMajor, uplo, *n, *alpha, x, *incx, A, LDA); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) a[ (*lda)*j+i ]=A[ LDA*i+j ]; free(A); } else cblas_ssyr(CblasColMajor, uplo, *n, *alpha, x, *incx, a, *lda); } void F77_ssyr2(int *order, char *uplow, int *n, float *alpha, float *x, int *incx, float *y, int *incy, float *a, int *lda) { float *A; int i,j,LDA; enum CBLAS_UPLO uplo; get_uplo_type(uplow,&uplo); if (*order == TEST_ROW_MJR) { LDA = *n+1; A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; cblas_ssyr2(CblasRowMajor, uplo, *n, *alpha, x, *incx, y, *incy, A, LDA); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) a[ (*lda)*j+i ]=A[ LDA*i+j ]; free(A); } else cblas_ssyr2(CblasColMajor, uplo, *n, *alpha, x, *incx, y, *incy, a, *lda); } void F77_sgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, float *alpha, float *a, int *lda, float *x, int *incx, float *beta, float *y, int *incy ) { float *A; int i,irow,j,jcol,LDA; enum CBLAS_TRANSPOSE trans; get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *ku+*kl+2; A = ( float* )malloc( (*n+*kl)*LDA*sizeof( float ) ); for( i=0; i<*ku; i++ ){ irow=*ku+*kl-i; jcol=(*ku)-i; for( j=jcol; j<*n; j++ ) A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; } i=*ku; irow=*ku+*kl-i; for( j=0; j<*n; j++ ) A[ LDA*j+irow ]=a[ (*lda)*j+i ]; for( i=*ku+1; i<*ku+*kl+1; i++ ){ irow=*ku+*kl-i; jcol=i-(*ku); for( j=jcol; j<(*n+*kl); j++ ) A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; } cblas_sgbmv( CblasRowMajor, trans, *m, *n, *kl, *ku, *alpha, A, LDA, x, *incx, *beta, y, *incy ); free(A); } else cblas_sgbmv( CblasColMajor, trans, *m, *n, *kl, *ku, *alpha, a, *lda, x, *incx, *beta, y, *incy ); } void F77_stbmv(int *order, char *uplow, char *transp, char *diagn, int *n, int *k, float *a, int *lda, float *x, int *incx) { float *A; int irow, jcol, i, j, LDA; enum CBLAS_TRANSPOSE trans; enum CBLAS_UPLO uplo; enum CBLAS_DIAG diag; get_transpose_type(transp,&trans); get_uplo_type(uplow,&uplo); get_diag_type(diagn,&diag); if (*order == TEST_ROW_MJR) { LDA = *k+1; A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; jcol=(*k)-i; for( j=jcol; j<*n; j++ ) A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; } i=*k; irow=*k-i; for( j=0; j<*n; j++ ) A[ LDA*j+irow ]=a[ (*lda)*j+i ]; } else { i=0; irow=*k-i; for( j=0; j<*n; j++ ) A[ LDA*j+irow ]=a[ (*lda)*j+i ]; for( i=1; i<*k+1; i++ ){ irow=*k-i; jcol=i; for( j=jcol; j<(*n+*k); j++ ) A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; } } cblas_stbmv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, *incx); free(A); } else cblas_stbmv(CblasColMajor, uplo, trans, diag, *n, *k, a, *lda, x, *incx); } void F77_stbsv(int *order, char *uplow, char *transp, char *diagn, int *n, int *k, float *a, int *lda, float *x, int *incx) { float *A; int irow, jcol, i, j, LDA; enum CBLAS_TRANSPOSE trans; enum CBLAS_UPLO uplo; enum CBLAS_DIAG diag; get_transpose_type(transp,&trans); get_uplo_type(uplow,&uplo); get_diag_type(diagn,&diag); if (*order == TEST_ROW_MJR) { LDA = *k+1; A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; jcol=(*k)-i; for( j=jcol; j<*n; j++ ) A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; } i=*k; irow=*k-i; for( j=0; j<*n; j++ ) A[ LDA*j+irow ]=a[ (*lda)*j+i ]; } else { i=0; irow=*k-i; for( j=0; j<*n; j++ ) A[ LDA*j+irow ]=a[ (*lda)*j+i ]; for( i=1; i<*k+1; i++ ){ irow=*k-i; jcol=i; for( j=jcol; j<(*n+*k); j++ 
) A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; } } cblas_stbsv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, *incx); free(A); } else cblas_stbsv(CblasColMajor, uplo, trans, diag, *n, *k, a, *lda, x, *incx); } void F77_ssbmv(int *order, char *uplow, int *n, int *k, float *alpha, float *a, int *lda, float *x, int *incx, float *beta, float *y, int *incy) { float *A; int i,j,irow,jcol,LDA; enum CBLAS_UPLO uplo; get_uplo_type(uplow,&uplo); if (*order == TEST_ROW_MJR) { LDA = *k+1; A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; jcol=(*k)-i; for( j=jcol; j<*n; j++ ) A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; } i=*k; irow=*k-i; for( j=0; j<*n; j++ ) A[ LDA*j+irow ]=a[ (*lda)*j+i ]; } else { i=0; irow=*k-i; for( j=0; j<*n; j++ ) A[ LDA*j+irow ]=a[ (*lda)*j+i ]; for( i=1; i<*k+1; i++ ){ irow=*k-i; jcol=i; for( j=jcol; j<(*n+*k); j++ ) A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; } } cblas_ssbmv(CblasRowMajor, uplo, *n, *k, *alpha, A, LDA, x, *incx, *beta, y, *incy ); free(A); } else cblas_ssbmv(CblasColMajor, uplo, *n, *k, *alpha, a, *lda, x, *incx, *beta, y, *incy ); } void F77_sspmv(int *order, char *uplow, int *n, float *alpha, float *ap, float *x, int *incx, float *beta, float *y, int *incy) { float *A,*AP; int i,j,k,LDA; enum CBLAS_UPLO uplo; get_uplo_type(uplow,&uplo); if (*order == TEST_ROW_MJR) { LDA = *n; A = ( float* )malloc( LDA*LDA*sizeof( float ) ); AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) ); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) for( i=0; i #include #include "common.h" #include "cblas_test.h" void F77_sgemm(int *order, char *transpa, char *transpb, int *m, int *n, int *k, float *alpha, float *a, int *lda, float *b, int *ldb, float *beta, float *c, int *ldc ) { float *A, *B, *C; int i,j,LDA, LDB, LDC; enum CBLAS_TRANSPOSE transa, transb; get_transpose_type(transpa, &transa); get_transpose_type(transpb, &transb); if (*order == TEST_ROW_MJR) { if (transa == CblasNoTrans) { LDA = *k+1; A = (float *)malloc( (*m)*LDA*sizeof( float ) ); for( i=0; i<*m; i++ ) for( j=0; j<*k; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } else { LDA = *m+1; A = ( float* )malloc( LDA*(*k)*sizeof( float ) ); for( i=0; i<*k; i++ ) for( j=0; j<*m; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } if (transb == CblasNoTrans) { LDB = *n+1; B = ( float* )malloc( (*k)*LDB*sizeof( float ) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ) B[i*LDB+j]=b[j*(*ldb)+i]; } else { LDB = *k+1; B = ( float* )malloc( LDB*(*n)*sizeof( float ) ); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) B[i*LDB+j]=b[j*(*ldb)+i]; } LDC = *n+1; C = ( float* )malloc( (*m)*LDC*sizeof( float ) ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) C[i*LDC+j]=c[j*(*ldc)+i]; cblas_sgemm( CblasRowMajor, transa, transb, *m, *n, *k, *alpha, A, LDA, B, LDB, *beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) c[j*(*ldc)+i]=C[i*LDC+j]; free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_sgemm( CblasColMajor, transa, transb, *m, *n, *k, *alpha, a, *lda, b, *ldb, *beta, c, *ldc ); else cblas_sgemm( UNDEFINED, transa, transb, *m, *n, *k, *alpha, a, *lda, b, *ldb, *beta, c, *ldc ); } void F77_ssymm(int *order, char *rtlf, char *uplow, int *m, int *n, float *alpha, float *a, int *lda, float *b, int *ldb, float *beta, float *c, int *ldc ) { float *A, *B, *C; int i,j,LDA, LDB, LDC; enum CBLAS_UPLO uplo; enum CBLAS_SIDE side; get_uplo_type(uplow,&uplo); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A = ( float* )malloc( (*m)*LDA*sizeof( float ) 
); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } else{ LDA = *n+1; A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } LDB = *n+1; B = ( float* )malloc( (*m)*LDB*sizeof( float ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) B[i*LDB+j]=b[j*(*ldb)+i]; LDC = *n+1; C = ( float* )malloc( (*m)*LDC*sizeof( float ) ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) C[i*LDC+j]=c[j*(*ldc)+i]; cblas_ssymm( CblasRowMajor, side, uplo, *m, *n, *alpha, A, LDA, B, LDB, *beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) c[j*(*ldc)+i]=C[i*LDC+j]; free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_ssymm( CblasColMajor, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb, *beta, c, *ldc ); else cblas_ssymm( UNDEFINED, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb, *beta, c, *ldc ); } void F77_ssyrk(int *order, char *uplow, char *transp, int *n, int *k, float *alpha, float *a, int *lda, float *beta, float *c, int *ldc ) { int i,j,LDA,LDC; float *A, *C; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); if (*order == TEST_ROW_MJR) { if (trans == CblasNoTrans) { LDA = *k+1; A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } else{ LDA = *n+1; A = ( float* )malloc( (*k)*LDA*sizeof( float ) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } LDC = *n+1; C = ( float* )malloc( (*n)*LDC*sizeof( float ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) C[i*LDC+j]=c[j*(*ldc)+i]; cblas_ssyrk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) c[j*(*ldc)+i]=C[i*LDC+j]; free(A); free(C); } else if (*order == TEST_COL_MJR) cblas_ssyrk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, c, *ldc ); else cblas_ssyrk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, c, *ldc ); } void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k, float *alpha, float *a, int *lda, float *b, int *ldb, float *beta, float *c, int *ldc ) { int i,j,LDA,LDB,LDC; float *A, *B, *C; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); if (*order == TEST_ROW_MJR) { if (trans == CblasNoTrans) { LDA = *k+1; LDB = *k+1; A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); B = ( float* )malloc( (*n)*LDB*sizeof( float ) ); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j]=a[j*(*lda)+i]; B[i*LDB+j]=b[j*(*ldb)+i]; } } else { LDA = *n+1; LDB = *n+1; A = ( float* )malloc( LDA*(*k)*sizeof( float ) ); B = ( float* )malloc( LDB*(*k)*sizeof( float ) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ){ A[i*LDA+j]=a[j*(*lda)+i]; B[i*LDB+j]=b[j*(*ldb)+i]; } } LDC = *n+1; C = ( float* )malloc( (*n)*LDC*sizeof( float ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) C[i*LDC+j]=c[j*(*ldc)+i]; cblas_ssyr2k(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, B, LDB, *beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) c[j*(*ldc)+i]=C[i*LDC+j]; free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_ssyr2k(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, b, *ldb, *beta, c, *ldc ); else cblas_ssyr2k(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, b, *ldb, *beta, c, *ldc ); } void F77_strmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, int *m, int *n, float *alpha, float *a, int *lda, float *b, int *ldb) { int i,j,LDA,LDB; 
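/* As in the other level-3 wrappers in this file, a row-major test
   order is handled by transposing A (and B) into scratch copies with
   a leading dimension one larger than the matrix dimension, calling
   the CBLAS routine with CblasRowMajor, and copying B back. */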
float *A, *B; enum CBLAS_SIDE side; enum CBLAS_DIAG diag; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); get_diag_type(diagn,&diag); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } else{ LDA = *n+1; A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } LDB = *n+1; B = ( float* )malloc( (*m)*LDB*sizeof( float ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) B[i*LDB+j]=b[j*(*ldb)+i]; cblas_strmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha, A, LDA, B, LDB ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) b[j*(*ldb)+i]=B[i*LDB+j]; free(A); free(B); } else if (*order == TEST_COL_MJR) cblas_strmm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha, a, *lda, b, *ldb); else cblas_strmm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha, a, *lda, b, *ldb); } void F77_strsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, int *m, int *n, float *alpha, float *a, int *lda, float *b, int *ldb) { int i,j,LDA,LDB; float *A, *B; enum CBLAS_SIDE side; enum CBLAS_DIAG diag; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); get_diag_type(diagn,&diag); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } else{ LDA = *n+1; A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } LDB = *n+1; B = ( float* )malloc( (*m)*LDB*sizeof( float ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) B[i*LDB+j]=b[j*(*ldb)+i]; cblas_strsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha, A, LDA, B, LDB ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) b[j*(*ldb)+i]=B[i*LDB+j]; free(A); free(B); } else if (*order == TEST_COL_MJR) cblas_strsm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha, a, *lda, b, *ldb); else cblas_strsm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha, a, *lda, b, *ldb); } OpenBLAS-0.2.20/ctest/c_sblat1.f000066400000000000000000000700071313527062700161310ustar00rootroot00000000000000 PROGRAM SCBLAT1 * Test program for the REAL Level 1 CBLAS. * Based upon the original CBLAS test routine together with: * F06EAF Example Program Text * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. REAL SFAC INTEGER IC * .. External Subroutines .. EXTERNAL CHECK0, CHECK1, CHECK2, CHECK3, HEADER * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA SFAC/9.765625E-4/ * .. Executable Statements .. WRITE (NOUT,99999) DO 20 IC = 1, 10 ICASE = IC CALL HEADER * * .. Initialize PASS, INCX, INCY, and MODE for a new case. .. * .. the value 9999 for INCX, INCY or MODE will appear in the .. * .. detailed output, if any, for cases that do not involve .. * .. these parameters .. * PASS = .TRUE. INCX = 9999 INCY = 9999 MODE = 9999 IF (ICASE.EQ.3) THEN CALL CHECK0(SFAC) ELSE IF (ICASE.EQ.7 .OR. ICASE.EQ.8 .OR. ICASE.EQ.9 .OR. + ICASE.EQ.10) THEN CALL CHECK1(SFAC) ELSE IF (ICASE.EQ.1 .OR. ICASE.EQ.2 .OR. ICASE.EQ.5 .OR. 
+ ICASE.EQ.6) THEN CALL CHECK2(SFAC) ELSE IF (ICASE.EQ.4) THEN CALL CHECK3(SFAC) END IF * -- Print IF (PASS) WRITE (NOUT,99998) 20 CONTINUE STOP * 99999 FORMAT (' Real CBLAS Test Program Results',/1X) 99998 FORMAT (' ----- PASS -----') END SUBROUTINE HEADER * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Arrays .. CHARACTER*15 L(10) * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA L(1)/'CBLAS_SDOT '/ DATA L(2)/'CBLAS_SAXPY '/ DATA L(3)/'CBLAS_SROTG '/ DATA L(4)/'CBLAS_SROT '/ DATA L(5)/'CBLAS_SCOPY '/ DATA L(6)/'CBLAS_SSWAP '/ DATA L(7)/'CBLAS_SNRM2 '/ DATA L(8)/'CBLAS_SASUM '/ DATA L(9)/'CBLAS_SSCAL '/ DATA L(10)/'CBLAS_ISAMAX'/ * .. Executable Statements .. WRITE (NOUT,99999) ICASE, L(ICASE) RETURN * 99999 FORMAT (/' Test of subprogram number',I3,9X,A15) END SUBROUTINE CHECK0(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. REAL SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. REAL SA, SB, SC, SS INTEGER K * .. Local Arrays .. REAL DA1(8), DATRUE(8), DB1(8), DBTRUE(8), DC1(8), + DS1(8) * .. External Subroutines .. EXTERNAL SROTGTEST, STEST1 * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA DA1/0.3E0, 0.4E0, -0.3E0, -0.4E0, -0.3E0, 0.0E0, + 0.0E0, 1.0E0/ DATA DB1/0.4E0, 0.3E0, 0.4E0, 0.3E0, -0.4E0, 0.0E0, + 1.0E0, 0.0E0/ DATA DC1/0.6E0, 0.8E0, -0.6E0, 0.8E0, 0.6E0, 1.0E0, + 0.0E0, 1.0E0/ DATA DS1/0.8E0, 0.6E0, 0.8E0, -0.6E0, 0.8E0, 0.0E0, + 1.0E0, 0.0E0/ DATA DATRUE/0.5E0, 0.5E0, 0.5E0, -0.5E0, -0.5E0, + 0.0E0, 1.0E0, 1.0E0/ DATA DBTRUE/0.0E0, 0.6E0, 0.0E0, -0.6E0, 0.0E0, + 0.0E0, 1.0E0, 0.0E0/ * .. Executable Statements .. * * Compute true values which cannot be prestored * in decimal notation * DBTRUE(1) = 1.0E0/0.6E0 DBTRUE(3) = -1.0E0/0.6E0 DBTRUE(5) = 1.0E0/0.6E0 * DO 20 K = 1, 8 * .. Set N=K for identification in output if any .. N = K IF (ICASE.EQ.3) THEN * .. SROTGTEST .. IF (K.GT.8) GO TO 40 SA = DA1(K) SB = DB1(K) CALL SROTGTEST(SA,SB,SC,SS) CALL STEST1(SA,DATRUE(K),DATRUE(K),SFAC) CALL STEST1(SB,DBTRUE(K),DBTRUE(K),SFAC) CALL STEST1(SC,DC1(K),DC1(K),SFAC) CALL STEST1(SS,DS1(K),DS1(K),SFAC) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' STOP END IF 20 CONTINUE 40 RETURN END SUBROUTINE CHECK1(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. REAL SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. INTEGER I, LEN, NP1 * .. Local Arrays .. REAL DTRUE1(5), DTRUE3(5), DTRUE5(8,5,2), DV(8,5,2), + SA(10), STEMP(1), STRUE(8), SX(8) INTEGER ITRUE2(5) * .. External Functions .. REAL SASUMTEST, SNRM2TEST INTEGER ISAMAXTEST EXTERNAL SASUMTEST, SNRM2TEST, ISAMAXTEST * .. External Subroutines .. EXTERNAL ITEST1, SSCALTEST, STEST, STEST1 * .. Intrinsic Functions .. INTRINSIC MAX * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. 
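*     DV holds the input vectors for both increments; DTRUE1, DTRUE3,
*     DTRUE5 and ITRUE2 are the expected SNRM2, SASUM, SSCAL and
*     ISAMAX results checked in the loop below.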
DATA SA/0.3E0, -1.0E0, 0.0E0, 1.0E0, 0.3E0, 0.3E0, + 0.3E0, 0.3E0, 0.3E0, 0.3E0/ DATA DV/0.1E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, + 2.0E0, 2.0E0, 0.3E0, 3.0E0, 3.0E0, 3.0E0, 3.0E0, + 3.0E0, 3.0E0, 3.0E0, 0.3E0, -0.4E0, 4.0E0, + 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, 0.2E0, + -0.6E0, 0.3E0, 5.0E0, 5.0E0, 5.0E0, 5.0E0, + 5.0E0, 0.1E0, -0.3E0, 0.5E0, -0.1E0, 6.0E0, + 6.0E0, 6.0E0, 6.0E0, 0.1E0, 8.0E0, 8.0E0, 8.0E0, + 8.0E0, 8.0E0, 8.0E0, 8.0E0, 0.3E0, 9.0E0, 9.0E0, + 9.0E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, 0.3E0, 2.0E0, + -0.4E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, + 0.2E0, 3.0E0, -0.6E0, 5.0E0, 0.3E0, 2.0E0, + 2.0E0, 2.0E0, 0.1E0, 4.0E0, -0.3E0, 6.0E0, + -0.5E0, 7.0E0, -0.1E0, 3.0E0/ DATA DTRUE1/0.0E0, 0.3E0, 0.5E0, 0.7E0, 0.6E0/ DATA DTRUE3/0.0E0, 0.3E0, 0.7E0, 1.1E0, 1.0E0/ DATA DTRUE5/0.10E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, + 2.0E0, 2.0E0, 2.0E0, -0.3E0, 3.0E0, 3.0E0, + 3.0E0, 3.0E0, 3.0E0, 3.0E0, 3.0E0, 0.0E0, 0.0E0, + 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, + 0.20E0, -0.60E0, 0.30E0, 5.0E0, 5.0E0, 5.0E0, + 5.0E0, 5.0E0, 0.03E0, -0.09E0, 0.15E0, -0.03E0, + 6.0E0, 6.0E0, 6.0E0, 6.0E0, 0.10E0, 8.0E0, + 8.0E0, 8.0E0, 8.0E0, 8.0E0, 8.0E0, 8.0E0, + 0.09E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, + 9.0E0, 9.0E0, 0.09E0, 2.0E0, -0.12E0, 2.0E0, + 2.0E0, 2.0E0, 2.0E0, 2.0E0, 0.06E0, 3.0E0, + -0.18E0, 5.0E0, 0.09E0, 2.0E0, 2.0E0, 2.0E0, + 0.03E0, 4.0E0, -0.09E0, 6.0E0, -0.15E0, 7.0E0, + -0.03E0, 3.0E0/ DATA ITRUE2/0, 1, 2, 2, 3/ * .. Executable Statements .. DO 80 INCX = 1, 2 DO 60 NP1 = 1, 5 N = NP1 - 1 LEN = 2*MAX(N,1) * .. Set vector arguments .. DO 20 I = 1, LEN SX(I) = DV(I,NP1,INCX) 20 CONTINUE * IF (ICASE.EQ.7) THEN * .. SNRM2TEST .. STEMP(1) = DTRUE1(NP1) CALL STEST1(SNRM2TEST(N,SX,INCX),STEMP,STEMP,SFAC) ELSE IF (ICASE.EQ.8) THEN * .. SASUMTEST .. STEMP(1) = DTRUE3(NP1) CALL STEST1(SASUMTEST(N,SX,INCX),STEMP,STEMP,SFAC) ELSE IF (ICASE.EQ.9) THEN * .. SSCALTEST .. CALL SSCALTEST(N,SA((INCX-1)*5+NP1),SX,INCX) DO 40 I = 1, LEN STRUE(I) = DTRUE5(I,NP1,INCX) 40 CONTINUE CALL STEST(LEN,SX,STRUE,STRUE,SFAC) ELSE IF (ICASE.EQ.10) THEN * .. ISAMAXTEST .. CALL ITEST1(ISAMAXTEST(N,SX,INCX),ITRUE2(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' STOP END IF 60 CONTINUE 80 CONTINUE RETURN END SUBROUTINE CHECK2(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. REAL SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. REAL SA INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY * .. Local Arrays .. REAL DT10X(7,4,4), DT10Y(7,4,4), DT7(4,4), + DT8(7,4,4), DX1(7), + DY1(7), SSIZE1(4), SSIZE2(14,2), STX(7), STY(7), + SX(7), SY(7) INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) * .. External Functions .. REAL SDOTTEST EXTERNAL SDOTTEST * .. External Subroutines .. EXTERNAL SAXPYTEST, SCOPYTEST, SSWAPTEST, STEST, STEST1 * .. Intrinsic Functions .. INTRINSIC ABS, MIN * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. 
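*     CHECK2 drives the two-vector routines (SDOT, SAXPY, SCOPY and
*     SSWAP) over the increment pairs in INCXS/INCYS and the lengths
*     in NS, comparing against DT7, DT8, DT10X and DT10Y.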
DATA SA/0.3E0/ DATA INCXS/1, 2, -2, -1/ DATA INCYS/1, -2, 1, -2/ DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ DATA NS/0, 1, 2, 4/ DATA DX1/0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.9E0, -0.3E0, + -0.4E0/ DATA DY1/0.5E0, -0.9E0, 0.3E0, 0.7E0, -0.6E0, 0.2E0, + 0.8E0/ DATA DT7/0.0E0, 0.30E0, 0.21E0, 0.62E0, 0.0E0, + 0.30E0, -0.07E0, 0.85E0, 0.0E0, 0.30E0, -0.79E0, + -0.74E0, 0.0E0, 0.30E0, 0.33E0, 1.27E0/ DATA DT8/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.68E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.68E0, -0.87E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.68E0, -0.87E0, 0.15E0, + 0.94E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.68E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.35E0, -0.9E0, 0.48E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.38E0, -0.9E0, 0.57E0, 0.7E0, -0.75E0, + 0.2E0, 0.98E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.68E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.35E0, -0.72E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.38E0, + -0.63E0, 0.15E0, 0.88E0, 0.0E0, 0.0E0, 0.0E0, + 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.68E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.68E0, -0.9E0, 0.33E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.68E0, -0.9E0, 0.33E0, 0.7E0, + -0.75E0, 0.2E0, 1.04E0/ DATA DT10X/0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.5E0, -0.9E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.5E0, -0.9E0, 0.3E0, 0.7E0, + 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.3E0, 0.1E0, 0.5E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.8E0, 0.1E0, -0.6E0, + 0.8E0, 0.3E0, -0.3E0, 0.5E0, 0.6E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.9E0, + 0.1E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.7E0, + 0.1E0, 0.3E0, 0.8E0, -0.9E0, -0.3E0, 0.5E0, + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.5E0, 0.3E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.5E0, 0.3E0, -0.6E0, 0.8E0, 0.0E0, 0.0E0, + 0.0E0/ DATA DT10Y/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.6E0, 0.1E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.0E0, + 0.0E0, 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, -0.5E0, -0.9E0, 0.6E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, -0.4E0, -0.9E0, 0.9E0, + 0.7E0, -0.5E0, 0.2E0, 0.6E0, 0.5E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.5E0, + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + -0.4E0, 0.9E0, -0.5E0, 0.6E0, 0.0E0, 0.0E0, + 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.6E0, -0.9E0, 0.1E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.6E0, -0.9E0, 0.1E0, 0.7E0, + -0.5E0, 0.2E0, 0.8E0/ DATA SSIZE1/0.0E0, 0.3E0, 1.6E0, 3.2E0/ DATA SSIZE2/0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + 1.17E0, 1.17E0, 1.17E0/ * .. Executable Statements .. * DO 120 KI = 1, 4 INCX = INCXS(KI) INCY = INCYS(KI) MX = ABS(INCX) MY = ABS(INCY) * DO 100 KN = 1, 4 N = NS(KN) KSIZE = MIN(2,KN) LENX = LENS(KN,MX) LENY = LENS(KN,MY) * .. Initialize all argument arrays .. DO 20 I = 1, 7 SX(I) = DX1(I) SY(I) = DY1(I) 20 CONTINUE * IF (ICASE.EQ.1) THEN * .. SDOTTEST .. 
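*           STEST1 verifies that SFAC times the difference between the
*           computed dot product and DT7(KN,KI) is negligible relative
*           to SSIZE1(KN).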
CALL STEST1(SDOTTEST(N,SX,INCX,SY,INCY),DT7(KN,KI), + SSIZE1(KN),SFAC) ELSE IF (ICASE.EQ.2) THEN * .. SAXPYTEST .. CALL SAXPYTEST(N,SA,SX,INCX,SY,INCY) DO 40 J = 1, LENY STY(J) = DT8(J,KN,KI) 40 CONTINUE CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) ELSE IF (ICASE.EQ.5) THEN * .. SCOPYTEST .. DO 60 I = 1, 7 STY(I) = DT10Y(I,KN,KI) 60 CONTINUE CALL SCOPYTEST(N,SX,INCX,SY,INCY) CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0) ELSE IF (ICASE.EQ.6) THEN * .. SSWAPTEST .. CALL SSWAPTEST(N,SX,INCX,SY,INCY) DO 80 I = 1, 7 STX(I) = DT10X(I,KN,KI) STY(I) = DT10Y(I,KN,KI) 80 CONTINUE CALL STEST(LENX,SX,STX,SSIZE2(1,1),1.0E0) CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' STOP END IF 100 CONTINUE 120 CONTINUE RETURN END SUBROUTINE CHECK3(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. REAL SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. REAL SC, SS INTEGER I, K, KI, KN, KSIZE, LENX, LENY, MX, MY * .. Local Arrays .. REAL COPYX(5), COPYY(5), DT9X(7,4,4), DT9Y(7,4,4), + DX1(7), DY1(7), MWPC(11), MWPS(11), MWPSTX(5), + MWPSTY(5), MWPTX(11,5), MWPTY(11,5), MWPX(5), + MWPY(5), SSIZE2(14,2), STX(7), STY(7), SX(7), + SY(7) INTEGER INCXS(4), INCYS(4), LENS(4,2), MWPINX(11), + MWPINY(11), MWPN(11), NS(4) * .. External Subroutines .. EXTERNAL SROTTEST, STEST * .. Intrinsic Functions .. INTRINSIC ABS, MIN * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA INCXS/1, 2, -2, -1/ DATA INCYS/1, -2, 1, -2/ DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ DATA NS/0, 1, 2, 4/ DATA DX1/0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.9E0, -0.3E0, + -0.4E0/ DATA DY1/0.5E0, -0.9E0, 0.3E0, 0.7E0, -0.6E0, 0.2E0, + 0.8E0/ DATA SC, SS/0.8E0, 0.6E0/ DATA DT9X/0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.78E0, -0.46E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.78E0, -0.46E0, -0.22E0, + 1.06E0, 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.78E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.66E0, 0.1E0, -0.1E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.96E0, 0.1E0, -0.76E0, 0.8E0, 0.90E0, + -0.3E0, -0.02E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.78E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.06E0, 0.1E0, + -0.1E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.90E0, + 0.1E0, -0.22E0, 0.8E0, 0.18E0, -0.3E0, -0.02E0, + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.78E0, 0.26E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.78E0, 0.26E0, -0.76E0, 1.12E0, + 0.0E0, 0.0E0, 0.0E0/ DATA DT9Y/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.54E0, + 0.08E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.04E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.7E0, + -0.9E0, -0.12E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.64E0, -0.9E0, -0.30E0, 0.7E0, -0.18E0, 0.2E0, + 0.28E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.7E0, -1.08E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.64E0, -1.26E0, + 0.54E0, 0.20E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.7E0, + -0.18E0, 0.2E0, 0.16E0/ DATA SSIZE2/0.0E0, 0.0E0, 0.0E0, 
0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + 1.17E0, 1.17E0, 1.17E0/ * .. Executable Statements .. * DO 60 KI = 1, 4 INCX = INCXS(KI) INCY = INCYS(KI) MX = ABS(INCX) MY = ABS(INCY) * DO 40 KN = 1, 4 N = NS(KN) KSIZE = MIN(2,KN) LENX = LENS(KN,MX) LENY = LENS(KN,MY) * IF (ICASE.EQ.4) THEN * .. SROTTEST .. DO 20 I = 1, 7 SX(I) = DX1(I) SY(I) = DY1(I) STX(I) = DT9X(I,KN,KI) STY(I) = DT9Y(I,KN,KI) 20 CONTINUE CALL SROTTEST(N,SX,INCX,SY,INCY,SC,SS) CALL STEST(LENX,SX,STX,SSIZE2(1,KSIZE),SFAC) CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' STOP END IF 40 CONTINUE 60 CONTINUE * MWPC(1) = 1 DO 80 I = 2, 11 MWPC(I) = 0 80 CONTINUE MWPS(1) = 0 DO 100 I = 2, 6 MWPS(I) = 1 100 CONTINUE DO 120 I = 7, 11 MWPS(I) = -1 120 CONTINUE MWPINX(1) = 1 MWPINX(2) = 1 MWPINX(3) = 1 MWPINX(4) = -1 MWPINX(5) = 1 MWPINX(6) = -1 MWPINX(7) = 1 MWPINX(8) = 1 MWPINX(9) = -1 MWPINX(10) = 1 MWPINX(11) = -1 MWPINY(1) = 1 MWPINY(2) = 1 MWPINY(3) = -1 MWPINY(4) = -1 MWPINY(5) = 2 MWPINY(6) = 1 MWPINY(7) = 1 MWPINY(8) = -1 MWPINY(9) = -1 MWPINY(10) = 2 MWPINY(11) = 1 DO 140 I = 1, 11 MWPN(I) = 5 140 CONTINUE MWPN(5) = 3 MWPN(10) = 3 DO 160 I = 1, 5 MWPX(I) = I MWPY(I) = I MWPTX(1,I) = I MWPTY(1,I) = I MWPTX(2,I) = I MWPTY(2,I) = -I MWPTX(3,I) = 6 - I MWPTY(3,I) = I - 6 MWPTX(4,I) = I MWPTY(4,I) = -I MWPTX(6,I) = 6 - I MWPTY(6,I) = I - 6 MWPTX(7,I) = -I MWPTY(7,I) = I MWPTX(8,I) = I - 6 MWPTY(8,I) = 6 - I MWPTX(9,I) = -I MWPTY(9,I) = I MWPTX(11,I) = I - 6 MWPTY(11,I) = 6 - I 160 CONTINUE MWPTX(5,1) = 1 MWPTX(5,2) = 3 MWPTX(5,3) = 5 MWPTX(5,4) = 4 MWPTX(5,5) = 5 MWPTY(5,1) = -1 MWPTY(5,2) = 2 MWPTY(5,3) = -2 MWPTY(5,4) = 4 MWPTY(5,5) = -3 MWPTX(10,1) = -1 MWPTX(10,2) = -3 MWPTX(10,3) = -5 MWPTX(10,4) = 4 MWPTX(10,5) = 5 MWPTY(10,1) = 1 MWPTY(10,2) = 2 MWPTY(10,3) = 2 MWPTY(10,4) = 4 MWPTY(10,5) = 3 DO 200 I = 1, 11 INCX = MWPINX(I) INCY = MWPINY(I) DO 180 K = 1, 5 COPYX(K) = MWPX(K) COPYY(K) = MWPY(K) MWPSTX(K) = MWPTX(I,K) MWPSTY(K) = MWPTY(I,K) 180 CONTINUE CALL SROTTEST(MWPN(I),COPYX,INCX,COPYY,INCY,MWPC(I),MWPS(I)) CALL STEST(5,COPYX,MWPSTX,MWPSTX,SFAC) CALL STEST(5,COPYY,MWPSTY,MWPSTY,SFAC) 200 CONTINUE RETURN END SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) * ********************************* STEST ************************** * * THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO * SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE * NEGLIGIBLE. * * C. L. LAWSON, JPL, 1974 DEC 10 * * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. REAL SFAC INTEGER LEN * .. Array Arguments .. REAL SCOMP(LEN), SSIZE(LEN), STRUE(LEN) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. REAL SD INTEGER I * .. External Functions .. REAL SDIFF EXTERNAL SDIFF * .. Intrinsic Functions .. INTRINSIC ABS * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Executable Statements .. * DO 40 I = 1, LEN SD = SCOMP(I) - STRUE(I) IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0E0) + GO TO 40 * * HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). * IF ( .NOT. PASS) GO TO 20 * PRINT FAIL MESSAGE AND HEADER. PASS = .FALSE. 
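The negligibility criterion coded just above can be restated compactly in C. This is only an illustrative sketch: sdiff mirrors the Fortran SDIFF helper defined further down, and the function name negligible is hypothetical, not part of the suite.

#include <math.h>

/* A difference sd = scomp - strue, scaled by sfac, is accepted when
   adding |sfac*sd| to |ssize| does not change |ssize| in working
   precision -- the same SDIFF(...) == 0 test that STEST performs. */
float sdiff(float a, float b) { return a - b; }

int negligible(float ssize, float sfac, float sd) {
    return sdiff(fabsf(ssize) + fabsf(sfac * sd), fabsf(ssize)) == 0.0f;
}

int main(void) {
    /* A 1e-10 scaled difference against a unit-sized reference is
       negligible in single precision. */
    return negligible(1.0f, 1.0f, 1e-10f) ? 0 : 1;
}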
WRITE (NOUT,99999) WRITE (NOUT,99998) 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + STRUE(I), SD, SSIZE(I) 40 CONTINUE RETURN * 99999 FORMAT (' FAIL') 99998 FORMAT (/' CASE N INCX INCY MODE I ', + ' COMP(I) TRUE(I) DIFFERENCE', + ' SIZE(I)',/1X) 99997 FORMAT (1X,I4,I3,3I5,I3,2E36.8,2E12.4) END SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * * THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * * C.L. LAWSON, JPL, 1978 DEC 6 * * .. Scalar Arguments .. REAL SCOMP1, SFAC, STRUE1 * .. Array Arguments .. REAL SSIZE(*) * .. Local Arrays .. REAL SCOMP(1), STRUE(1) * .. External Subroutines .. EXTERNAL STEST * .. Executable Statements .. * SCOMP(1) = SCOMP1 STRUE(1) = STRUE1 CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) * RETURN END REAL FUNCTION SDIFF(SA,SB) * ********************************* SDIFF ************************** * COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 * * .. Scalar Arguments .. REAL SA, SB * .. Executable Statements .. SDIFF = SA - SB RETURN END SUBROUTINE ITEST1(ICOMP,ITRUE) * ********************************* ITEST1 ************************* * * THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR * EQUALITY. * C. L. LAWSON, JPL, 1974 DEC 10 * * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. INTEGER ICOMP, ITRUE * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. INTEGER ID * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Executable Statements .. * IF (ICOMP.EQ.ITRUE) GO TO 40 * * HERE ICOMP IS NOT EQUAL TO ITRUE. * IF ( .NOT. PASS) GO TO 20 * PRINT FAIL MESSAGE AND HEADER. PASS = .FALSE. WRITE (NOUT,99999) WRITE (NOUT,99998) 20 ID = ICOMP - ITRUE WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID 40 CONTINUE RETURN * 99999 FORMAT (' FAIL') 99998 FORMAT (/' CASE N INCX INCY MODE ', + ' COMP TRUE DIFFERENCE', + /1X) 99997 FORMAT (1X,I4,I3,3I5,2I36,I12) END OpenBLAS-0.2.20/ctest/c_sblat2.f000066400000000000000000003147661313527062700161470ustar00rootroot00000000000000 PROGRAM SBLAT2 * * Test program for the REAL Level 2 Blas. * * The program must be driven by a short data file. The first 17 records * of the file are read using list-directed input, the last 16 records * are read using the format ( A12, L2 ). An annotated example of a data * file can be obtained by deleting the first 3 characters from the * following 33 lines: * 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO * 6 NUMBER OF VALUES OF N * 0 1 2 3 5 9 VALUES OF N * 4 NUMBER OF VALUES OF K * 0 1 2 4 VALUES OF K * 4 NUMBER OF VALUES OF INCX AND INCY * 1 2 -1 -2 VALUES OF INCX AND INCY * 3 NUMBER OF VALUES OF ALPHA * 0.0 1.0 0.7 VALUES OF ALPHA * 3 NUMBER OF VALUES OF BETA * 0.0 1.0 0.9 VALUES OF BETA * cblas_sgemv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_sgbmv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ssymv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ssbmv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_sspmv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_strmv T PUT F FOR NO TEST. SAME COLUMNS. 
* cblas_stbmv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_stpmv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_strsv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_stbsv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_stpsv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_sger T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ssyr T PUT F FOR NO TEST. SAME COLUMNS. * cblas_sspr T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ssyr2 T PUT F FOR NO TEST. SAME COLUMNS. * cblas_sspr2 T PUT F FOR NO TEST. SAME COLUMNS. * * See: * * Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. * An extended set of Fortran Basic Linear Algebra Subprograms. * * Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics * and Computer Science Division, Argonne National Laboratory, * 9700 South Cass Avenue, Argonne, Illinois 60439, US. * * Or * * NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms * Group Ltd., NAG Central Office, 256 Banbury Road, Oxford * OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st * Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. * * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. INTEGER NIN, NOUT PARAMETER ( NIN = 5, NOUT = 6 ) INTEGER NSUBS PARAMETER ( NSUBS = 16 ) REAL ZERO, HALF, ONE PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, $ NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. REAL EPS, ERR, THRESH INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, $ NTRA, LAYOUT LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, $ TSTERR, CORDER, RORDER CHARACTER*1 TRANS CHARACTER*12 SNAMET CHARACTER*32 SNAPS * .. Local Arrays .. REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), $ G( NMAX ), X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( 2*NMAX ) INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) LOGICAL LTEST( NSUBS ) CHARACTER*12 SNAMES( NSUBS ) * .. External Functions .. REAL SDIFF LOGICAL LSE EXTERNAL SDIFF, LSE * .. External Subroutines .. EXTERNAL SCHK1, SCHK2, SCHK3, SCHK4, SCHK5, SCHK6, $ CS2CHKE, SMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK CHARACTER*12 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK COMMON /SRNAMC/SRNAMT * .. Data statements .. DATA SNAMES/'cblas_sgemv ', 'cblas_sgbmv ', $ 'cblas_ssymv ','cblas_ssbmv ','cblas_sspmv ', $ 'cblas_strmv ','cblas_stbmv ','cblas_stpmv ', $ 'cblas_strsv ','cblas_stbsv ','cblas_stpsv ', $ 'cblas_sger ','cblas_ssyr ','cblas_sspr ', $ 'cblas_ssyr2 ','cblas_sspr2 '/ * .. Executable Statements .. * NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. * READ( NIN, FMT = * )SNAPS READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN OPEN( NTRA, FILE = SNAPS ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI REWI = REWI.AND.TRACE * Read the flag that directs stopping on any failure. READ( NIN, FMT = * )SFATAL * Read the flag that indicates whether error exits are to be tested. READ( NIN, FMT = * )TSTERR * Read the flag that indicates whether row-major data layout to be tested. READ( NIN, FMT = * )LAYOUT * Read the threshold value of the test ratio READ( NIN, FMT = * )THRESH * * Read and check the parameter values for the tests. 
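The LAYOUT flag read above (0, 1 or 2) selects whether the CBLAS routines are exercised with column-major storage, row-major storage, or both. A minimal C sketch of what that choice means at a cblas_sgemv call site; the matrix values are purely illustrative and the only assumption is the standard CBLAS prototype:

#include <cblas.h>

int main(void) {
    /* The same 2x3 matrix stored both ways. */
    float a_col[] = {1.0f, 4.0f,  2.0f, 5.0f,  3.0f, 6.0f}; /* column-major, lda = 2 */
    float a_row[] = {1.0f, 2.0f, 3.0f,  4.0f, 5.0f, 6.0f};  /* row-major,    lda = 3 */
    float x[] = {1.0f, 1.0f, 1.0f};
    float y_col[2] = {0.0f, 0.0f}, y_row[2] = {0.0f, 0.0f};

    /* y := 1*A*x + 0*y under each layout; the results must agree. */
    cblas_sgemv(CblasColMajor, CblasNoTrans, 2, 3, 1.0f, a_col, 2, x, 1, 0.0f, y_col, 1);
    cblas_sgemv(CblasRowMajor, CblasNoTrans, 2, 3, 1.0f, a_row, 3, x, 1, 0.0f, y_row, 1);
    return (y_col[0] == y_row[0] && y_col[1] == y_row[1]) ? 0 : 1;
}

The IORDER argument that the SCHK routines below receive (0 for the column-major pass, 1 for the row-major pass) plays the same role; the C wrappers presumably translate it into these CBLAS order constants.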
* * Values of N READ( NIN, FMT = * )NIDIM IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN WRITE( NOUT, FMT = 9997 )'N', NIDMAX GO TO 230 END IF READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) DO 10 I = 1, NIDIM IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN WRITE( NOUT, FMT = 9996 )NMAX GO TO 230 END IF 10 CONTINUE * Values of K READ( NIN, FMT = * )NKB IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN WRITE( NOUT, FMT = 9997 )'K', NKBMAX GO TO 230 END IF READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) DO 20 I = 1, NKB IF( KB( I ).LT.0 )THEN WRITE( NOUT, FMT = 9995 ) GO TO 230 END IF 20 CONTINUE * Values of INCX and INCY READ( NIN, FMT = * )NINC IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX GO TO 230 END IF READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) DO 30 I = 1, NINC IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN WRITE( NOUT, FMT = 9994 )INCMAX GO TO 230 END IF 30 CONTINUE * Values of ALPHA READ( NIN, FMT = * )NALF IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX GO TO 230 END IF READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) * Values of BETA READ( NIN, FMT = * )NBET IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX GO TO 230 END IF READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) * * Report values of parameters. * WRITE( NOUT, FMT = 9993 ) WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) IF( .NOT.TSTERR )THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )THRESH WRITE( NOUT, FMT = * ) RORDER = .FALSE. CORDER = .FALSE. IF (LAYOUT.EQ.2) THEN RORDER = .TRUE. CORDER = .TRUE. WRITE( *, FMT = 10002 ) ELSE IF (LAYOUT.EQ.1) THEN RORDER = .TRUE. WRITE( *, FMT = 10001 ) ELSE IF (LAYOUT.EQ.0) THEN CORDER = .TRUE. WRITE( *, FMT = 10000 ) END IF WRITE( *, FMT = * ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 40 I = 1, NSUBS LTEST( I ) = .FALSE. 40 CONTINUE 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT DO 60 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET STOP 70 LTEST( I ) = LTESTT GO TO 50 * 80 CONTINUE CLOSE ( NIN ) * * Compute EPS (the machine precision). * EPS = ONE 90 CONTINUE IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) $ GO TO 100 EPS = HALF*EPS GO TO 90 100 CONTINUE EPS = EPS + EPS WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of SMVCH using exact data. * N = MIN( 32, NMAX ) DO 120 J = 1, N DO 110 I = 1, N A( I, J ) = MAX( I - J + 1, 0 ) 110 CONTINUE X( J ) = J Y( J ) = ZERO 120 CONTINUE DO 130 J = 1, N YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 130 CONTINUE * YY holds the exact result. On exit from SMVCH YT holds * the result computed by SMVCH. TRANS = 'N' CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LSE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR STOP END IF TRANS = 'T' CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LSE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR STOP END IF * * Test each subroutine in turn. * DO 210 ISNUM = 1, NSUBS WRITE( NOUT, FMT = * ) IF( .NOT.LTEST( ISNUM ) )THEN * Subprogram is not to be tested. 
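Two computations above are worth restating: EPS is obtained by halving until 1+EPS becomes indistinguishable from 1 (then doubled once), and the exact-data check hands SMVCH the matrix A(i,j) = max(i-j+1, 0) with x(j) = j, whose product has the closed form y(i) = i(i+1)(i+2)/6, the same value the YY(J) expression produces. A small C sketch of both, where sdiff mirrors the Fortran SDIFF:

#include <stdio.h>

float sdiff(float a, float b) { return a - b; }

int main(void) {
    /* Halve EPS until 1 + EPS is indistinguishable from 1, then double once. */
    float eps = 1.0f;
    while (sdiff(1.0f + eps, 1.0f) != 0.0f) eps *= 0.5f;
    eps += eps;
    printf("eps ~ %g\n", eps);

    /* Exact reference used to sanity-check SMVCH: y(i) = i*(i+1)*(i+2)/6. */
    for (int i = 1; i <= 5; i++)
        printf("y(%d) = %d\n", i, i * (i + 1) * (i + 2) / 6);
    return 0;
}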
WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) ELSE SRNAMT = SNAMES( ISNUM ) * Test error exits. IF( TSTERR )THEN CALL CS2CHKE( SNAMES( ISNUM ) ) WRITE( NOUT, FMT = * ) END IF * Test computations. INFOT = 0 OK = .TRUE. FATAL = .FALSE. GO TO ( 140, 140, 150, 150, 150, 160, 160, $ 160, 160, 160, 160, 170, 180, 180, $ 190, 190 )ISNUM * Test SGEMV, 01, and SGBMV, 02. 140 IF (CORDER) THEN CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G, 0 ) END IF IF (RORDER) THEN CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G, 1 ) END IF GO TO 200 * Test SSYMV, 03, SSBMV, 04, and SSPMV, 05. 150 IF (CORDER) THEN CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G, 0 ) END IF IF (RORDER) THEN CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G, 1 ) END IF GO TO 200 * Test STRMV, 06, STBMV, 07, STPMV, 08, * STRSV, 09, STBSV, 10, and STPSV, 11. 160 IF (CORDER) THEN CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, $ 0 ) END IF IF (RORDER) THEN CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, $ 1 ) END IF GO TO 200 * Test SGER, 12. 170 IF (CORDER) THEN CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 0 ) END IF IF (RORDER) THEN CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 1 ) END IF GO TO 200 * Test SSYR, 13, and SSPR, 14. 180 IF (CORDER) THEN CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 0 ) END IF IF (RORDER) THEN CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 1 ) END IF GO TO 200 * Test SSYR2, 15, and SSPR2, 16. 
190 IF (CORDER) THEN CALL SCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 0 ) END IF IF (RORDER) THEN CALL SCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 1 ) END IF * 200 IF( FATAL.AND.SFATAL ) $ GO TO 220 END IF 210 CONTINUE WRITE( NOUT, FMT = 9982 ) GO TO 240 * 220 CONTINUE WRITE( NOUT, FMT = 9981 ) GO TO 240 * 230 CONTINUE WRITE( NOUT, FMT = 9987 ) * 240 CONTINUE IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) STOP * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) 10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', $ 'S THAN', F8.2 ) 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', $ 'THAN ', I2 ) 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', $ I2 ) 9993 FORMAT( ' TESTS OF THE REAL LEVEL 2 BLAS', //' THE F', $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) 9992 FORMAT( ' FOR N ', 9I6 ) 9991 FORMAT( ' FOR K ', 7I6 ) 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) 9989 FORMAT( ' FOR ALPHA ', 7F6.1 ) 9988 FORMAT( ' FOR BETA ', 7F6.1 ) 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', $ /' ******* TESTS ABANDONED *******' ) 9986 FORMAT( ' SUBPROGRAM NAME ',A12, ' NOT RECOGNIZED', /' ******* T', $ 'ESTS ABANDONED *******' ) 9985 FORMAT( ' ERROR IN SMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', $ 'ATED WRONGLY.', /' SMVCH WAS CALLED WITH TRANS = ', A1, $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' $ , /' ******* TESTS ABANDONED *******' ) 9984 FORMAT(A12, L2 ) 9983 FORMAT( 1X,A12, ' WAS NOT TESTED' ) 9982 FORMAT( /' END OF TESTS' ) 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) * * End of SBLAT2. * END SUBROUTINE SCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, $ XS, Y, YY, YS, YT, G, IORDER ) * * Tests SGEMV and SGBMV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. REAL ZERO, HALF PARAMETER ( ZERO = 0.0, HALF = 0.5 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, $ NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), $ X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, $ NL, NS LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN CHARACTER*1 TRANS, TRANSS CHARACTER*14 CTRANS CHARACTER*3 ICH * .. Local Arrays .. 
LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL CSGBMV, CSGEMV, SMAKE, SMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICH/'NTC'/ * .. Executable Statements .. FULL = SNAME( 9: 9 ).EQ.'e' BANDED = SNAME( 9: 9 ).EQ.'b' * Define the number of arguments. IF( FULL )THEN NARGS = 11 ELSE IF( BANDED )THEN NARGS = 13 END IF * NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 120 IN = 1, NIDIM N = IDIM( IN ) ND = N/2 + 1 * DO 110 IM = 1, 2 IF( IM.EQ.1 ) $ M = MAX( N - ND, 0 ) IF( IM.EQ.2 ) $ M = MIN( N + ND, NMAX ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IKU = 1, NK IF( BANDED )THEN KU = KB( IKU ) KL = MAX( KU - 1, 0 ) ELSE KU = N - 1 KL = M - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = KL + KU + 1 ELSE LDA = M END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 LAA = LDA*N NULL = N.LE.0.OR.M.LE.0 * * Generate the matrix A. * TRANSL = ZERO CALL SMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, AA, $ LDA, KL, KU, RESET, TRANSL ) * DO 90 IC = 1, 3 TRANS = ICH( IC: IC ) IF (TRANS.EQ.'N')THEN CTRANS = ' CblasNoTrans' ELSE IF (TRANS.EQ.'T')THEN CTRANS = ' CblasTrans' ELSE CTRANS = 'CblasConjTrans' END IF TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' * IF( TRAN )THEN ML = N NL = M ELSE ML = M NL = N END IF * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*NL * * Generate the vector X. * TRANSL = HALF CALL SMAKE( 'ge', ' ', ' ', 1, NL, X, 1, XX, $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) IF( NL.GT.1 )THEN X( NL/2 ) = ZERO XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO END IF * DO 70 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*ML * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the vector Y. * TRANSL = ZERO CALL SMAKE( 'ge', ' ', ' ', 1, ML, Y, 1, $ YY, ABS( INCY ), 0, ML - 1, $ RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * TRANSS = TRANS MS = M NS = N KLS = KL KUS = KU ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX BLS = BETA DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ CTRANS, M, N, ALPHA, LDA, INCX, $ BETA, INCY IF( REWI ) $ REWIND NTRA CALL CSGEMV( IORDER, TRANS, M, N, $ ALPHA, AA, LDA, XX, INCX, $ BETA, YY, INCY ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ CTRANS, M, N, KL, KU, ALPHA, LDA, $ INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL CSGBMV( IORDER, TRANS, M, N, KL, $ KU, ALPHA, AA, LDA, XX, $ INCX, BETA, YY, INCY ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 130 END IF * * See what data changed inside subroutines. 
* ISAME( 1 ) = TRANS.EQ.TRANSS ISAME( 2 ) = MS.EQ.M ISAME( 3 ) = NS.EQ.N IF( FULL )THEN ISAME( 4 ) = ALS.EQ.ALPHA ISAME( 5 ) = LSE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA ISAME( 7 ) = LSE( XS, XX, LX ) ISAME( 8 ) = INCXS.EQ.INCX ISAME( 9 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 10 ) = LSE( YS, YY, LY ) ELSE ISAME( 10 ) = LSERES( 'ge', ' ', 1, $ ML, YS, YY, $ ABS( INCY ) ) END IF ISAME( 11 ) = INCYS.EQ.INCY ELSE IF( BANDED )THEN ISAME( 4 ) = KLS.EQ.KL ISAME( 5 ) = KUS.EQ.KU ISAME( 6 ) = ALS.EQ.ALPHA ISAME( 7 ) = LSE( AS, AA, LAA ) ISAME( 8 ) = LDAS.EQ.LDA ISAME( 9 ) = LSE( XS, XX, LX ) ISAME( 10 ) = INCXS.EQ.INCX ISAME( 11 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 12 ) = LSE( YS, YY, LY ) ELSE ISAME( 12 ) = LSERES( 'ge', ' ', 1, $ ML, YS, YY, $ ABS( INCY ) ) END IF ISAME( 13 ) = INCYS.EQ.INCY END IF * * If data was incorrectly changed, report * and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 130 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL SMVCH( TRANS, M, N, ALPHA, A, $ NMAX, X, INCX, BETA, Y, $ INCY, YT, G, YY, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 130 ELSE * Avoid repeating tests with M.le.0 or * N.le.0. GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 140 * 130 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, CTRANS, M, N, ALPHA, LDA, $ INCX, BETA, INCY ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, CTRANS, M, N, KL, KU, $ ALPHA, LDA, INCX, BETA, INCY END IF * 140 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 4( I3, ',' ), F4.1, $ ', A,', I3, ',',/ 10x, 'X,', I2, ',', F4.1, ', Y,', $ I2, ') .' ) 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), F4.1, $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, $ ') .' ) 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK1. * END SUBROUTINE SCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, $ XS, Y, YY, YS, YT, G, IORDER ) * * Tests SSYMV, SSBMV and SSPMV. 
* * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. REAL ZERO, HALF PARAMETER ( ZERO = 0.0, HALF = 0.5 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, $ NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), $ X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, $ N, NARGS, NC, NK, NS LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME CHARACTER*1 UPLO, UPLOS CHARACTER*14 CUPLO CHARACTER*2 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL SMAKE, SMVCH, CSSBMV, CSSPMV, CSSYMV * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 9: 9 ).EQ.'y' BANDED = SNAME( 9: 9 ).EQ.'b' PACKED = SNAME( 9: 9 ).EQ.'p' * Define the number of arguments. IF( FULL )THEN NARGS = 10 ELSE IF( BANDED )THEN NARGS = 11 ELSE IF( PACKED )THEN NARGS = 9 END IF * NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 110 IN = 1, NIDIM N = IDIM( IN ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IK = 1, NK IF( BANDED )THEN K = KB( IK ) ELSE K = N - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = K + 1 ELSE LDA = N END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF NULL = N.LE.0 * DO 90 IC = 1, 2 UPLO = ICH( IC: IC ) IF (UPLO.EQ.'U')THEN CUPLO = ' CblasUpper' ELSE CUPLO = ' CblasLower' END IF * * Generate the matrix A. * TRANSL = ZERO CALL SMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, AA, $ LDA, K, K, RESET, TRANSL ) * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL SMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 70 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the vector Y. * TRANSL = ZERO CALL SMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, $ TRANSL ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * UPLOS = UPLO NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX BLS = BETA DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. 
* IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ CUPLO, N, ALPHA, LDA, INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL CSSYMV( IORDER, UPLO, N, ALPHA, AA, $ LDA, XX, INCX, BETA, YY, INCY ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ CUPLO, N, K, ALPHA, LDA, INCX, BETA, $ INCY IF( REWI ) $ REWIND NTRA CALL CSSBMV( IORDER, UPLO, N, K, ALPHA, $ AA, LDA, XX, INCX, BETA, YY, $ INCY ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ CUPLO, N, ALPHA, INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL CSSPMV( IORDER, UPLO, N, ALPHA, AA, $ XX, INCX, BETA, YY, INCY ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N IF( FULL )THEN ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LSE( AS, AA, LAA ) ISAME( 5 ) = LDAS.EQ.LDA ISAME( 6 ) = LSE( XS, XX, LX ) ISAME( 7 ) = INCXS.EQ.INCX ISAME( 8 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 9 ) = LSE( YS, YY, LY ) ELSE ISAME( 9 ) = LSERES( 'ge', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 10 ) = INCYS.EQ.INCY ELSE IF( BANDED )THEN ISAME( 3 ) = KS.EQ.K ISAME( 4 ) = ALS.EQ.ALPHA ISAME( 5 ) = LSE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA ISAME( 7 ) = LSE( XS, XX, LX ) ISAME( 8 ) = INCXS.EQ.INCX ISAME( 9 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 10 ) = LSE( YS, YY, LY ) ELSE ISAME( 10 ) = LSERES( 'ge', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 11 ) = INCYS.EQ.INCY ELSE IF( PACKED )THEN ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LSE( AS, AA, LAA ) ISAME( 5 ) = LSE( XS, XX, LX ) ISAME( 6 ) = INCXS.EQ.INCX ISAME( 7 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 8 ) = LSE( YS, YY, LY ) ELSE ISAME( 8 ) = LSERES( 'ge', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 9 ) = INCYS.EQ.INCY END IF * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL SMVCH( 'N', N, N, ALPHA, A, NMAX, X, $ INCX, BETA, Y, INCY, YT, G, $ YY, EPS, ERR, FATAL, NOUT, $ .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 120 ELSE * Avoid repeating tests with N.le.0 GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, LDA, $ INCX, BETA, INCY ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, K, ALPHA, LDA, $ INCX, BETA, INCY ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, N, ALPHA, INCX, $ BETA, INCY END IF * 130 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', AP', $ ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), F4.1, $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, $ ') .' ) 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', A,', $ I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK2. * END SUBROUTINE SCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z, IORDER ) * * Tests STRMV, STBMV, STPMV, STRSV, STBSV and STPSV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. REAL ZERO, HALF, ONE PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA, $ IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), $ XS( NMAX*INCMAX ), XT( NMAX ), $ XX( NMAX*INCMAX ), Z( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. REAL ERR, ERRMAX, TRANSL INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS CHARACTER*14 CUPLO,CTRANS,CDIAG CHARACTER*2 ICHD, ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL SMAKE, SMVCH, CSTBMV, CSTBSV, CSTPMV, $ CSTPSV, CSTRMV, CSTRSV * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. 
DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ * .. Executable Statements .. FULL = SNAME( 9: 9 ).EQ.'r' BANDED = SNAME( 9: 9 ).EQ.'b' PACKED = SNAME( 9: 9 ).EQ.'p' * Define the number of arguments. IF( FULL )THEN NARGS = 8 ELSE IF( BANDED )THEN NARGS = 9 ELSE IF( PACKED )THEN NARGS = 7 END IF * NC = 0 RESET = .TRUE. ERRMAX = ZERO * Set up zero vector for SMVCH. DO 10 I = 1, NMAX Z( I ) = ZERO 10 CONTINUE * DO 110 IN = 1, NIDIM N = IDIM( IN ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IK = 1, NK IF( BANDED )THEN K = KB( IK ) ELSE K = N - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = K + 1 ELSE LDA = N END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF NULL = N.LE.0 * DO 90 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) IF (UPLO.EQ.'U')THEN CUPLO = ' CblasUpper' ELSE CUPLO = ' CblasLower' END IF * DO 80 ICT = 1, 3 TRANS = ICHT( ICT: ICT ) IF (TRANS.EQ.'N')THEN CTRANS = ' CblasNoTrans' ELSE IF (TRANS.EQ.'T')THEN CTRANS = ' CblasTrans' ELSE CTRANS = 'CblasConjTrans' END IF * DO 70 ICD = 1, 2 DIAG = ICHD( ICD: ICD ) IF (DIAG.EQ.'N')THEN CDIAG = ' CblasNonUnit' ELSE CDIAG = ' CblasUnit' END IF * * Generate the matrix A. * TRANSL = ZERO CALL SMAKE( SNAME( 8: 9 ), UPLO, DIAG, N, N, A, $ NMAX, AA, LDA, K, K, RESET, TRANSL ) * DO 60 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL SMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, $ ABS( INCX ), 0, N - 1, RESET, $ TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS DIAGS = DIAG NS = N KS = K DO 20 I = 1, LAA AS( I ) = AA( I ) 20 CONTINUE LDAS = LDA DO 30 I = 1, LX XS( I ) = XX( I ) 30 CONTINUE INCXS = INCX * * Call the subroutine. * IF( SNAME( 10: 11 ).EQ.'mv' )THEN IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, LDA, INCX IF( REWI ) $ REWIND NTRA CALL CSTRMV( IORDER, UPLO, TRANS, DIAG, $ N, AA, LDA, XX, INCX ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX IF( REWI ) $ REWIND NTRA CALL CSTBMV( IORDER, UPLO, TRANS, DIAG, $ N, K, AA, LDA, XX, INCX ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, INCX IF( REWI ) $ REWIND NTRA CALL CSTPMV( IORDER, UPLO, TRANS, DIAG, $ N, AA, XX, INCX ) END IF ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, LDA, INCX IF( REWI ) $ REWIND NTRA CALL CSTRSV( IORDER, UPLO, TRANS, DIAG, $ N, AA, LDA, XX, INCX ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX IF( REWI ) $ REWIND NTRA CALL CSTBSV( IORDER, UPLO, TRANS, DIAG, $ N, K, AA, LDA, XX, INCX ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, INCX IF( REWI ) $ REWIND NTRA CALL CSTPSV( IORDER, UPLO, TRANS, DIAG, $ N, AA, XX, INCX ) END IF END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. 
* ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = TRANS.EQ.TRANSS ISAME( 3 ) = DIAG.EQ.DIAGS ISAME( 4 ) = NS.EQ.N IF( FULL )THEN ISAME( 5 ) = LSE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 7 ) = LSE( XS, XX, LX ) ELSE ISAME( 7 ) = LSERES( 'ge', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 8 ) = INCXS.EQ.INCX ELSE IF( BANDED )THEN ISAME( 5 ) = KS.EQ.K ISAME( 6 ) = LSE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 8 ) = LSE( XS, XX, LX ) ELSE ISAME( 8 ) = LSERES( 'ge', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 9 ) = INCXS.EQ.INCX ELSE IF( PACKED )THEN ISAME( 5 ) = LSE( AS, AA, LAA ) IF( NULL )THEN ISAME( 6 ) = LSE( XS, XX, LX ) ELSE ISAME( 6 ) = LSERES( 'ge', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 7 ) = INCXS.EQ.INCX END IF * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN IF( SNAME( 10: 11 ).EQ.'mv' )THEN * * Check the result. * CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, $ INCX, ZERO, Z, INCX, XT, G, $ XX, EPS, ERR, FATAL, NOUT, $ .TRUE. ) ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN * * Compute approximation to original vector. * DO 50 I = 1, N Z( I ) = XX( 1 + ( I - 1 )* $ ABS( INCX ) ) XX( 1 + ( I - 1 )*ABS( INCX ) ) $ = X( I ) 50 CONTINUE CALL SMVCH( TRANS, N, N, ONE, A, NMAX, Z, $ INCX, ZERO, X, INCX, XT, G, $ XX, EPS, ERR, FATAL, NOUT, $ .FALSE. ) END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 120 ELSE * Avoid repeating tests with N.le.0. GO TO 110 END IF * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, $ LDA, INCX ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, $ K, LDA, INCX ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, $ INCX END IF * 130 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, I3, ', AP, ', $ 'X,', I2, ') .' ) 9994 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, 2( I3, ',' ), $ ' A,', I3, ', X,', I2, ') .' 
) 9993 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, I3, ', A,', $ I3, ', X,', I2, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK3. * END SUBROUTINE SCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z, IORDER ) * * Tests SGER. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. REAL ZERO, HALF, ONE PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, $ IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. REAL ALPHA, ALS, ERR, ERRMAX, TRANSL INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, $ NC, ND, NS LOGICAL NULL, RESET, SAME * .. Local Arrays .. REAL W( 1 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL CSGER, SMAKE, SMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Executable Statements .. * Define the number of arguments. NARGS = 9 * NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 120 IN = 1, NIDIM N = IDIM( IN ) ND = N/2 + 1 * DO 110 IM = 1, 2 IF( IM.EQ.1 ) $ M = MAX( N - ND, 0 ) IF( IM.EQ.2 ) $ M = MIN( N + ND, NMAX ) * * Set LDA to 1 more than minimum value if room. LDA = M IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 110 LAA = LDA*N NULL = N.LE.0.OR.M.LE.0 * DO 100 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*M * * Generate the vector X. * TRANSL = HALF CALL SMAKE( 'ge', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), $ 0, M - 1, RESET, TRANSL ) IF( M.GT.1 )THEN X( M/2 ) = ZERO XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO END IF * DO 90 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * * Generate the vector Y. * TRANSL = ZERO CALL SMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN Y( N/2 ) = ZERO YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO END IF * DO 80 IA = 1, NALF ALPHA = ALF( IA ) * * Generate the matrix A. * TRANSL = ZERO CALL SMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * MS = M NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, $ ALPHA, INCX, INCY, LDA IF( REWI ) $ REWIND NTRA CALL CSGER( IORDER, M, N, ALPHA, XX, INCX, YY, $ INCY, AA, LDA ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 140 END IF * * See what data changed inside subroutine. 
* ISAME( 1 ) = MS.EQ.M ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LSE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX ISAME( 6 ) = LSE( YS, YY, LY ) ISAME( 7 ) = INCYS.EQ.INCY IF( NULL )THEN ISAME( 8 ) = LSE( AS, AA, LAA ) ELSE ISAME( 8 ) = LSERES( 'ge', ' ', M, N, AS, AA, $ LDA ) END IF ISAME( 9 ) = LDAS.EQ.LDA * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 140 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( INCX.GT.0 )THEN DO 50 I = 1, M Z( I ) = X( I ) 50 CONTINUE ELSE DO 60 I = 1, M Z( I ) = X( M - I + 1 ) 60 CONTINUE END IF DO 70 J = 1, N IF( INCY.GT.0 )THEN W( 1 ) = Y( J ) ELSE W( 1 ) = Y( N - J + 1 ) END IF CALL SMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, $ ONE, A( 1, J ), 1, YT, G, $ AA( 1 + ( J - 1 )*LDA ), EPS, $ ERR, FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 130 70 CONTINUE ELSE * Avoid repeating tests with M.le.0 or N.le.0. GO TO 110 END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 150 * 130 CONTINUE WRITE( NOUT, FMT = 9995 )J * 140 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA * 150 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ',A12, '(', 2( I3, ',' ), F4.1, ', X,', I2, $ ', Y,', I2, ', A,', I3, ') .' ) 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK4. * END SUBROUTINE SCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z, IORDER ) * * Tests SSYR and SSPR. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. REAL ZERO, HALF, ONE PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, $ IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. 
REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. REAL ALPHA, ALS, ERR, ERRMAX, TRANSL INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER CHARACTER*1 UPLO, UPLOS CHARACTER*14 CUPLO CHARACTER*2 ICH * .. Local Arrays .. REAL W( 1 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL SMAKE, SMVCH, CSSPR, CSSYR * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 9: 9 ).EQ.'y' PACKED = SNAME( 9: 9 ).EQ.'p' * Define the number of arguments. IF( FULL )THEN NARGS = 7 ELSE IF( PACKED )THEN NARGS = 6 END IF * NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDA to 1 more than minimum value if room. LDA = N IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF * DO 90 IC = 1, 2 UPLO = ICH( IC: IC ) IF (UPLO.EQ.'U')THEN CUPLO = ' CblasUpper' ELSE CUPLO = ' CblasLower' END IF UPPER = UPLO.EQ.'U' * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL SMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), $ 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 70 IA = 1, NALF ALPHA = ALF( IA ) NULL = N.LE.0.OR.ALPHA.EQ.ZERO * * Generate the matrix A. * TRANSL = ZERO CALL SMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, $ ALPHA, INCX, LDA IF( REWI ) $ REWIND NTRA CALL CSSYR( IORDER, UPLO, N, ALPHA, XX, INCX, $ AA, LDA ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, $ ALPHA, INCX IF( REWI ) $ REWIND NTRA CALL CSSPR( IORDER, UPLO, N, ALPHA, XX, INCX, AA ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LSE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX IF( NULL )THEN ISAME( 6 ) = LSE( AS, AA, LAA ) ELSE ISAME( 6 ) = LSERES( SNAME( 8: 9 ), UPLO, N, N, AS, $ AA, LDA ) END IF IF( .NOT.PACKED )THEN ISAME( 7 ) = LDAS.EQ.LDA END IF * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 30 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 30 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. 
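The update verified here is the rank-1 operation A := alpha*x*x' + A, so column j of the stored triangle changes only by alpha*x(j) times the relevant part of x; that is why each column can be checked as a one-column matrix-vector product through SMVCH. A rough C illustration of such a per-column reference for the full (non-packed) upper-triangular, column-major case; the helper name and tolerance handling are hypothetical and not part of this suite:

#include <math.h>

/* Expected column j (0-based) after cblas_ssyr with CblasUpper and
   column-major storage: a_new(i,j) = a_old(i,j) + alpha*x(i)*x(j)
   for the stored rows i = 0..j.  Returns 1 when every entry matches
   to within tol. */
int check_ssyr_column(int j, float alpha, const float *x,
                      const float *a_old, const float *a_new,
                      int lda, float tol) {
    for (int i = 0; i <= j; i++) {
        float expected = a_old[i + j * lda] + alpha * x[i] * x[j];
        if (fabsf(a_new[i + j * lda] - expected) > tol)
            return 0;
    }
    return 1;
}

SCHK6 below applies the same idea to the rank-2 update A := alpha*(x*y' + y*x') + A, passing a two-column Z and a two-element W to SMVCH for each column.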
* IF( INCX.GT.0 )THEN DO 40 I = 1, N Z( I ) = X( I ) 40 CONTINUE ELSE DO 50 I = 1, N Z( I ) = X( N - I + 1 ) 50 CONTINUE END IF JA = 1 DO 60 J = 1, N W( 1 ) = Z( J ) IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF CALL SMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, $ 1, ONE, A( JJ, J ), 1, YT, G, $ AA( JA ), EPS, ERR, FATAL, NOUT, $ .TRUE. ) IF( FULL )THEN IF( UPPER )THEN JA = JA + LDA ELSE JA = JA + LDA + 1 END IF ELSE JA = JA + LJ END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 110 60 CONTINUE ELSE * Avoid repeating tests if N.le.0. IF( N.LE.0 ) $ GO TO 100 END IF * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 130 * 110 CONTINUE WRITE( NOUT, FMT = 9995 )J * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, LDA ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX END IF * 130 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', $ I2, ', AP) .' ) 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', $ I2, ', A,', I3, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK5. * END SUBROUTINE SCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z, IORDER ) * * Tests SSYR2 and SSPR2. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. REAL ZERO, HALF, ONE PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, $ IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. 
REAL ALPHA, ALS, ERR, ERRMAX, TRANSL INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, $ NARGS, NC, NS LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER CHARACTER*1 UPLO, UPLOS CHARACTER*14 CUPLO CHARACTER*2 ICH * .. Local Arrays .. REAL W( 2 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL SMAKE, SMVCH, CSSPR2, CSSYR2 * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 9: 9 ).EQ.'y' PACKED = SNAME( 9: 9 ).EQ.'p' * Define the number of arguments. IF( FULL )THEN NARGS = 9 ELSE IF( PACKED )THEN NARGS = 8 END IF * NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 140 IN = 1, NIDIM N = IDIM( IN ) * Set LDA to 1 more than minimum value if room. LDA = N IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 140 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF * DO 130 IC = 1, 2 UPLO = ICH( IC: IC ) IF (UPLO.EQ.'U')THEN CUPLO = ' CblasUpper' ELSE CUPLO = ' CblasLower' END IF UPPER = UPLO.EQ.'U' * DO 120 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL SMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), $ 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 110 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * * Generate the vector Y. * TRANSL = ZERO CALL SMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN Y( N/2 ) = ZERO YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO END IF * DO 100 IA = 1, NALF ALPHA = ALF( IA ) NULL = N.LE.0.OR.ALPHA.EQ.ZERO * * Generate the matrix A. * TRANSL = ZERO CALL SMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, $ NMAX, AA, LDA, N - 1, N - 1, RESET, $ TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, $ ALPHA, INCX, INCY, LDA IF( REWI ) $ REWIND NTRA CALL CSSYR2( IORDER, UPLO, N, ALPHA, XX, INCX, $ YY, INCY, AA, LDA ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, $ ALPHA, INCX, INCY IF( REWI ) $ REWIND NTRA CALL CSSPR2( IORDER, UPLO, N, ALPHA, XX, INCX, $ YY, INCY, AA ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 160 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LSE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX ISAME( 6 ) = LSE( YS, YY, LY ) ISAME( 7 ) = INCYS.EQ.INCY IF( NULL )THEN ISAME( 8 ) = LSE( AS, AA, LAA ) ELSE ISAME( 8 ) = LSERES( SNAME( 8: 9 ), UPLO, N, N, $ AS, AA, LDA ) END IF IF( .NOT.PACKED )THEN ISAME( 9 ) = LDAS.EQ.LDA END IF * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 160 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. 
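*
*              Restating the rank-2 check that follows: the routine
*              under test performs
*                 A := ALPHA*( X*Y**T + Y*X**T ) + A,
*              so for column J only the stored triangle, rows JJ to
*              JJ + LJ - 1, is examined.  SMVCH verifies
*                 ALPHA*( Y( J )*X( JJ:JJ+LJ-1 )
*                       + X( J )*Y( JJ:JJ+LJ-1 ) ) + A( JJ:JJ+LJ-1, J )
*              with W( 1 ) = Y( J ) and W( 2 ) = X( J ), against the
*              data returned in AA( JA ).
*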
* IF( INCX.GT.0 )THEN DO 50 I = 1, N Z( I, 1 ) = X( I ) 50 CONTINUE ELSE DO 60 I = 1, N Z( I, 1 ) = X( N - I + 1 ) 60 CONTINUE END IF IF( INCY.GT.0 )THEN DO 70 I = 1, N Z( I, 2 ) = Y( I ) 70 CONTINUE ELSE DO 80 I = 1, N Z( I, 2 ) = Y( N - I + 1 ) 80 CONTINUE END IF JA = 1 DO 90 J = 1, N W( 1 ) = Z( J, 2 ) W( 2 ) = Z( J, 1 ) IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF CALL SMVCH( 'N', LJ, 2, ALPHA, Z( JJ, 1 ), $ NMAX, W, 1, ONE, A( JJ, J ), 1, $ YT, G, AA( JA ), EPS, ERR, FATAL, $ NOUT, .TRUE. ) IF( FULL )THEN IF( UPPER )THEN JA = JA + LDA ELSE JA = JA + LDA + 1 END IF ELSE JA = JA + LJ END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 150 90 CONTINUE ELSE * Avoid repeating tests with N.le.0. IF( N.LE.0 ) $ GO TO 140 END IF * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * 140 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 170 * 150 CONTINUE WRITE( NOUT, FMT = 9995 )J * 160 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, $ INCY, LDA ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX, INCY END IF * 170 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', $ I2, ', Y,', I2, ', AP) .' ) 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', $ I2, ', Y,', I2, ', A,', I3, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK6. * END SUBROUTINE SMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, $ KU, RESET, TRANSL ) * * Generates values for an M by N matrix A within the bandwidth * defined by KL and KU. * Stores the values in the array AA in the data structure required * by the routine, with unwanted elements set to rogue value. * * TYPE is 'ge', 'gb', 'sy', 'sb', 'sp', 'tr', 'tb' OR 'tp'. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. REAL ZERO, ONE PARAMETER ( ZERO = 0.0, ONE = 1.0 ) REAL ROGUE PARAMETER ( ROGUE = -1.0E10 ) * .. Scalar Arguments .. REAL TRANSL INTEGER KL, KU, LDA, M, N, NMAX LOGICAL RESET CHARACTER*1 DIAG, UPLO CHARACTER*2 TYPE * .. Array Arguments .. REAL A( NMAX, * ), AA( * ) * .. Local Scalars .. 
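*
*     A note on the checking scheme used below: the values are
*     generated in the full array A and copied into AA in the storage
*     format of the routine under test; every element of AA that the
*     routine must not reference is set to the rogue value ROGUE.
*     For general band storage ('gb') an in-band element A( i, j ) is
*     stored at AA( KU + 1 + i - j + ( j - 1 )*LDA ).  LSE and LSERES
*     later confirm that the rogue entries are still intact, so any
*     access outside the declared part of the matrix shows up as an
*     incorrectly changed argument.
*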
INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, KK LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER * .. External Functions .. REAL SBEG EXTERNAL SBEG * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. Executable Statements .. GEN = TYPE( 1: 1 ).EQ.'g' SYM = TYPE( 1: 1 ).EQ.'s' TRI = TYPE( 1: 1 ).EQ.'t' UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' UNIT = TRI.AND.DIAG.EQ.'U' * * Generate data in array A. * DO 20 J = 1, N DO 10 I = 1, M IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) $ THEN IF( ( I.LE.J.AND.J - I.LE.KU ).OR. $ ( I.GE.J.AND.I - J.LE.KL ) )THEN A( I, J ) = SBEG( RESET ) + TRANSL ELSE A( I, J ) = ZERO END IF IF( I.NE.J )THEN IF( SYM )THEN A( J, I ) = A( I, J ) ELSE IF( TRI )THEN A( J, I ) = ZERO END IF END IF END IF 10 CONTINUE IF( TRI ) $ A( J, J ) = A( J, J ) + ONE IF( UNIT ) $ A( J, J ) = ONE 20 CONTINUE * * Store elements in array AS in data structure required by routine. * IF( TYPE.EQ.'ge' )THEN DO 50 J = 1, N DO 30 I = 1, M AA( I + ( J - 1 )*LDA ) = A( I, J ) 30 CONTINUE DO 40 I = M + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 40 CONTINUE 50 CONTINUE ELSE IF( TYPE.EQ.'gb' )THEN DO 90 J = 1, N DO 60 I1 = 1, KU + 1 - J AA( I1 + ( J - 1 )*LDA ) = ROGUE 60 CONTINUE DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) 70 CONTINUE DO 80 I3 = I2, LDA AA( I3 + ( J - 1 )*LDA ) = ROGUE 80 CONTINUE 90 CONTINUE ELSE IF( TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN DO 130 J = 1, N IF( UPPER )THEN IBEG = 1 IF( UNIT )THEN IEND = J - 1 ELSE IEND = J END IF ELSE IF( UNIT )THEN IBEG = J + 1 ELSE IBEG = J END IF IEND = N END IF DO 100 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 100 CONTINUE DO 110 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I, J ) 110 CONTINUE DO 120 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 120 CONTINUE 130 CONTINUE ELSE IF( TYPE.EQ.'sb'.OR.TYPE.EQ.'tb' )THEN DO 170 J = 1, N IF( UPPER )THEN KK = KL + 1 IBEG = MAX( 1, KL + 2 - J ) IF( UNIT )THEN IEND = KL ELSE IEND = KL + 1 END IF ELSE KK = 1 IF( UNIT )THEN IBEG = 2 ELSE IBEG = 1 END IF IEND = MIN( KL + 1, 1 + M - J ) END IF DO 140 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 140 CONTINUE DO 150 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) 150 CONTINUE DO 160 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 160 CONTINUE 170 CONTINUE ELSE IF( TYPE.EQ.'sp'.OR.TYPE.EQ.'tp' )THEN IOFF = 0 DO 190 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 180 I = IBEG, IEND IOFF = IOFF + 1 AA( IOFF ) = A( I, J ) IF( I.EQ.J )THEN IF( UNIT ) $ AA( IOFF ) = ROGUE END IF 180 CONTINUE 190 CONTINUE END IF RETURN * * End of SMAKE. * END SUBROUTINE SMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) * * Checks the results of the computational tests. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. REAL ZERO, ONE PARAMETER ( ZERO = 0.0, ONE = 1.0 ) * .. Scalar Arguments .. REAL ALPHA, BETA, EPS, ERR INTEGER INCX, INCY, M, N, NMAX, NOUT LOGICAL FATAL, MV CHARACTER*1 TRANS * .. Array Arguments .. REAL A( NMAX, * ), G( * ), X( * ), Y( * ), YT( * ), $ YY( * ) * .. Local Scalars .. REAL ERRI INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL LOGICAL TRAN * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT * .. Executable Statements .. 
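*
*     Summary of the error measure computed below: the expected
*     result YT and a gauge G, the sum of the absolute values of the
*     terms contributing to each element, are accumulated in line.
*     For element I the difference
*        ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )
*     is divided by EPS and, when G( I ) is nonzero, by G( I ); the
*     largest such ratio is returned in ERR.  The result is declared
*     less than half accurate as soon as ERR*SQRT( EPS ) reaches ONE,
*     and the calling checker compares the accumulated ERRMAX with
*     the THRESH value read from the data file.
*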
TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' IF( TRAN )THEN ML = N NL = M ELSE ML = M NL = N END IF IF( INCX.LT.0 )THEN KX = NL INCXL = -1 ELSE KX = 1 INCXL = 1 END IF IF( INCY.LT.0 )THEN KY = ML INCYL = -1 ELSE KY = 1 INCYL = 1 END IF * * Compute expected result in YT using data in A, X and Y. * Compute gauges in G. * IY = KY DO 30 I = 1, ML YT( IY ) = ZERO G( IY ) = ZERO JX = KX IF( TRAN )THEN DO 10 J = 1, NL YT( IY ) = YT( IY ) + A( J, I )*X( JX ) G( IY ) = G( IY ) + ABS( A( J, I )*X( JX ) ) JX = JX + INCXL 10 CONTINUE ELSE DO 20 J = 1, NL YT( IY ) = YT( IY ) + A( I, J )*X( JX ) G( IY ) = G( IY ) + ABS( A( I, J )*X( JX ) ) JX = JX + INCXL 20 CONTINUE END IF YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) G( IY ) = ABS( ALPHA )*G( IY ) + ABS( BETA*Y( IY ) ) IY = IY + INCYL 30 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO DO 40 I = 1, ML ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS IF( G( I ).NE.ZERO ) $ ERRI = ERRI/G( I ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ GO TO 50 40 CONTINUE * If the loop completes, all results are at least half accurate. GO TO 70 * * Report fatal error. * 50 FATAL = .TRUE. WRITE( NOUT, FMT = 9999 ) DO 60 I = 1, ML IF( MV )THEN WRITE( NOUT, FMT = 9998 )I, YT( I ), $ YY( 1 + ( I - 1 )*ABS( INCY ) ) ELSE WRITE( NOUT, FMT = 9998 )I, $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT(I) END IF 60 CONTINUE * 70 CONTINUE RETURN * 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', $ 'TED RESULT' ) 9998 FORMAT( 1X, I7, 2G18.6 ) * * End of SMVCH. * END LOGICAL FUNCTION LSE( RI, RJ, LR ) * * Tests if two arrays are identical. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER LR * .. Array Arguments .. REAL RI( * ), RJ( * ) * .. Local Scalars .. INTEGER I * .. Executable Statements .. DO 10 I = 1, LR IF( RI( I ).NE.RJ( I ) ) $ GO TO 20 10 CONTINUE LSE = .TRUE. GO TO 30 20 CONTINUE LSE = .FALSE. 30 RETURN * * End of LSE. * END LOGICAL FUNCTION LSERES( TYPE, UPLO, M, N, AA, AS, LDA ) * * Tests if selected elements in two arrays are equal. * * TYPE is 'ge', 'sy' or 'sp'. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER LDA, M, N CHARACTER*1 UPLO CHARACTER*2 TYPE * .. Array Arguments .. REAL AA( LDA, * ), AS( LDA, * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL UPPER * .. Executable Statements .. UPPER = UPLO.EQ.'U' IF( TYPE.EQ.'ge' )THEN DO 20 J = 1, N DO 10 I = M + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 10 CONTINUE 20 CONTINUE ELSE IF( TYPE.EQ.'sy' )THEN DO 50 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 30 I = 1, IBEG - 1 IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 30 CONTINUE DO 40 I = IEND + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 40 CONTINUE 50 CONTINUE END IF * 60 CONTINUE LSERES = .TRUE. GO TO 80 70 CONTINUE LSERES = .FALSE. 80 RETURN * * End of LSERES. * END REAL FUNCTION SBEG( RESET ) * * Generates random numbers uniformly distributed between -0.5 and 0.5. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. LOGICAL RESET * .. Local Scalars .. INTEGER I, IC, MI * .. 
Save statement .. SAVE I, IC, MI * .. Intrinsic Functions .. INTRINSIC REAL * .. Executable Statements .. IF( RESET )THEN * Initialize local variables. MI = 891 I = 7 IC = 0 RESET = .FALSE. END IF * * The sequence of values of I is bounded between 1 and 999. * If initial I = 1,2,3,6,7 or 9, the period will be 50. * If initial I = 4 or 8, the period will be 25. * If initial I = 5, the period will be 10. * IC is used to break up the period by skipping 1 value of I in 6. * IC = IC + 1 10 I = I*MI I = I - 1000*( I/1000 ) IF( IC.GE.5 )THEN IC = 0 GO TO 10 END IF SBEG = REAL( I - 500 )/1001.0 RETURN * * End of SBEG. * END REAL FUNCTION SDIFF( X, Y ) * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * * .. Scalar Arguments .. REAL X, Y * .. Executable Statements .. SDIFF = X - Y RETURN * * End of SDIFF. * END OpenBLAS-0.2.20/ctest/c_sblat3.f000066400000000000000000002531461313527062700161420ustar00rootroot00000000000000 PROGRAM SBLAT3 * * Test program for the REAL Level 3 Blas. * * The program must be driven by a short data file. The first 13 records * of the file are read using list-directed input, the last 6 records * are read using the format ( A12, L2 ). An annotated example of a data * file can be obtained by deleting the first 3 characters from the * following 19 lines: * 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO * 6 NUMBER OF VALUES OF N * 0 1 2 3 5 9 VALUES OF N * 3 NUMBER OF VALUES OF ALPHA * 0.0 1.0 0.7 VALUES OF ALPHA * 3 NUMBER OF VALUES OF BETA * 0.0 1.0 1.3 VALUES OF BETA * cblas_sgemm T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ssymm T PUT F FOR NO TEST. SAME COLUMNS. * cblas_strmm T PUT F FOR NO TEST. SAME COLUMNS. * cblas_strsm T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ssyrk T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ssyr2k T PUT F FOR NO TEST. SAME COLUMNS. * * See: * * Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. * A Set of Level 3 Basic Linear Algebra Subprograms. * * Technical Memorandum No.88 (Revision 1), Mathematics and * Computer Science Division, Argonne National Laboratory, 9700 * South Cass Avenue, Argonne, Illinois 60439, US. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. INTEGER NIN, NOUT PARAMETER ( NIN = 5, NOUT = 6 ) INTEGER NSUBS PARAMETER ( NSUBS = 6 ) REAL ZERO, HALF, ONE PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. REAL EPS, ERR, THRESH INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA, $ LAYOUT LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, $ TSTERR, CORDER, RORDER CHARACTER*1 TRANSA, TRANSB CHARACTER*12 SNAMET CHARACTER*32 SNAPS * .. Local Arrays .. REAL AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), $ ALF( NALMAX ), AS( NMAX*NMAX ), $ BB( NMAX*NMAX ), BET( NBEMAX ), $ BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ G( NMAX ), W( 2*NMAX ) INTEGER IDIM( NIDMAX ) LOGICAL LTEST( NSUBS ) CHARACTER*12 SNAMES( NSUBS ) * .. 
External Functions .. REAL SDIFF LOGICAL LSE EXTERNAL SDIFF, LSE * .. External Subroutines .. EXTERNAL SCHK1, SCHK2, SCHK3, SCHK4, SCHK5, CS3CHKE, $ SMMCH * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK CHARACTER*12 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK COMMON /SRNAMC/SRNAMT * .. Data statements .. DATA SNAMES/'cblas_sgemm ', 'cblas_ssymm ', $ 'cblas_strmm ', 'cblas_strsm ','cblas_ssyrk ', $ 'cblas_ssyr2k'/ * .. Executable Statements .. * NOUTC = NOUT * Read name and unit number for summary output file and open file. * READ( NIN, FMT = * )SNAPS READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN * OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) OPEN( NTRA, FILE = SNAPS ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI REWI = REWI.AND.TRACE * Read the flag that directs stopping on any failure. READ( NIN, FMT = * )SFATAL * Read the flag that indicates whether error exits are to be tested. READ( NIN, FMT = * )TSTERR * Read the flag that indicates whether row-major data layout to be tested. READ( NIN, FMT = * )LAYOUT * Read the threshold value of the test ratio READ( NIN, FMT = * )THRESH * * Read and check the parameter values for the tests. * * Values of N READ( NIN, FMT = * )NIDIM IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN WRITE( NOUT, FMT = 9997 )'N', NIDMAX GO TO 220 END IF READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) DO 10 I = 1, NIDIM IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN WRITE( NOUT, FMT = 9996 )NMAX GO TO 220 END IF 10 CONTINUE * Values of ALPHA READ( NIN, FMT = * )NALF IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX GO TO 220 END IF READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) * Values of BETA READ( NIN, FMT = * )NBET IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX GO TO 220 END IF READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) * * Report values of parameters. * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) IF( .NOT.TSTERR )THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9984 ) END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )THRESH WRITE( NOUT, FMT = * ) RORDER = .FALSE. CORDER = .FALSE. IF (LAYOUT.EQ.2) THEN RORDER = .TRUE. CORDER = .TRUE. WRITE( *, FMT = 10002 ) ELSE IF (LAYOUT.EQ.1) THEN RORDER = .TRUE. WRITE( *, FMT = 10001 ) ELSE IF (LAYOUT.EQ.0) THEN CORDER = .TRUE. WRITE( *, FMT = 10000 ) END IF WRITE( *, FMT = * ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 20 I = 1, NSUBS LTEST( I ) = .FALSE. 20 CONTINUE 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT DO 40 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET STOP 50 LTEST( I ) = LTESTT GO TO 30 * 60 CONTINUE CLOSE ( NIN ) * * Compute EPS (the machine precision). * EPS = ONE 70 CONTINUE IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) $ GO TO 80 EPS = HALF*EPS GO TO 70 80 CONTINUE EPS = EPS + EPS WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of SMMCH using exact data. * N = MIN( 32, NMAX ) DO 100 J = 1, N DO 90 I = 1, N AB( I, J ) = MAX( I - J + 1, 0 ) 90 CONTINUE AB( J, NMAX + 1 ) = J AB( 1, NMAX + J ) = J C( J, 1 ) = ZERO 100 CONTINUE DO 110 J = 1, N CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 110 CONTINUE * CC holds the exact result. 
On exit from SMMCH CT holds * the result computed by SMMCH. TRANSA = 'N' TRANSB = 'N' CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'T' CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 AB( 1, NMAX + J ) = N - J + 1 120 CONTINUE DO 130 J = 1, N CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - $ ( ( J + 1 )*J*( J - 1 ) )/3 130 CONTINUE TRANSA = 'T' TRANSB = 'N' CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'T' CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF * * Test each subroutine in turn. * DO 200 ISNUM = 1, NSUBS WRITE( NOUT, FMT = * ) IF( .NOT.LTEST( ISNUM ) )THEN * Subprogram is not to be tested. WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) ELSE SRNAMT = SNAMES( ISNUM ) * Test error exits. IF( TSTERR )THEN CALL CS3CHKE( SNAMES( ISNUM ) ) WRITE( NOUT, FMT = * ) END IF * Test computations. INFOT = 0 OK = .TRUE. FATAL = .FALSE. GO TO ( 140, 150, 160, 160, 170, 180 )ISNUM * Test SGEMM, 01. 140 IF (CORDER) THEN CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 0 ) END IF IF (RORDER) THEN CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 1 ) END IF GO TO 190 * Test SSYMM, 02. 150 IF (CORDER) THEN CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 0 ) END IF IF (RORDER) THEN CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 1 ) END IF GO TO 190 * Test STRMM, 03, STRSM, 04. 160 IF (CORDER) THEN CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, $ 0 ) END IF IF (RORDER) THEN CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, $ 1 ) END IF GO TO 190 * Test SSYRK, 05. 
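*        SSYRK forms  C := ALPHA*A*A**T + BETA*C  or
*        C := ALPHA*A**T*A + BETA*C.  As with the other checkers,
*        SCHK4 is invoked once per requested data layout:
*        IORDER = 0 for column-major and IORDER = 1 for row-major
*        order, as selected by the LAYOUT value from the data file.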
170 IF (CORDER) THEN CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 0 ) END IF IF (RORDER) THEN CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 1 ) END IF GO TO 190 * Test SSYR2K, 06. 180 IF (CORDER) THEN CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, $ 0 ) END IF IF (RORDER) THEN CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, $ 1 ) END IF GO TO 190 * 190 IF( FATAL.AND.SFATAL ) $ GO TO 210 END IF 200 CONTINUE WRITE( NOUT, FMT = 9986 ) GO TO 230 * 210 CONTINUE WRITE( NOUT, FMT = 9985 ) GO TO 230 * 220 CONTINUE WRITE( NOUT, FMT = 9991 ) * 230 CONTINUE IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) STOP * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) 10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', $ 'S THAN', F8.2 ) 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', $ 'THAN ', I2 ) 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) 9995 FORMAT( ' TESTS OF THE REAL LEVEL 3 BLAS', //' THE F', $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) 9994 FORMAT( ' FOR N ', 9I6 ) 9993 FORMAT( ' FOR ALPHA ', 7F6.1 ) 9992 FORMAT( ' FOR BETA ', 7F6.1 ) 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', $ /' ******* TESTS ABANDONED *******' ) 9990 FORMAT( ' SUBPROGRAM NAME ', A12,' NOT RECOGNIZED', /' ******* ', $ 'TESTS ABANDONED *******' ) 9989 FORMAT( ' ERROR IN SMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', $ 'ATED WRONGLY.', /' SMMCH WAS CALLED WITH TRANSA = ', A1, $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', $ '*******' ) 9988 FORMAT( A12,L2 ) 9987 FORMAT( 1X, A12,' WAS NOT TESTED' ) 9986 FORMAT( /' END OF TESTS' ) 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) * * End of SBLAT3. * END SUBROUTINE SCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, $ IORDER ) * * Tests SGEMM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. 
REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, $ MA, MB, MS, N, NA, NARGS, NB, NC, NS LOGICAL NULL, RESET, SAME, TRANA, TRANB CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB CHARACTER*3 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL CSGEMM, SMAKE, SMMCH * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICH/'NTC'/ * .. Executable Statements .. * NARGS = 13 NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 110 IM = 1, NIDIM M = IDIM( IM ) * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICA = 1, 3 TRANSA = ICH( ICA: ICA ) TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' * IF( TRANA )THEN MA = K NA = M ELSE MA = M NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. * CALL SMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICB = 1, 3 TRANSB = ICH( ICB: ICB ) TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' * IF( TRANB )THEN MB = N NB = K ELSE MB = K NB = N END IF * Set LDB to 1 more than minimum value if room. LDB = MB IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 70 LBB = LDB*NB * * Generate the matrix B. * CALL SMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, $ LDB, RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL SMAKE( 'GE', ' ', ' ', M, N, C, NMAX, $ CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * TRANAS = TRANSA TRANBS = TRANSB MS = M NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ CALL SPRCN1(NTRA, NC, SNAME, IORDER, $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, $ LDB, BETA, LDC) IF( REWI ) $ REWIND NTRA CALL CSGEMM( IORDER, TRANSA, TRANSB, M, N, $ K, ALPHA, AA, LDA, BB, LDB, $ BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = TRANSA.EQ.TRANAS ISAME( 2 ) = TRANSB.EQ.TRANBS ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = KS.EQ.K ISAME( 6 ) = ALS.EQ.ALPHA ISAME( 7 ) = LSE( AS, AA, LAA ) ISAME( 8 ) = LDAS.EQ.LDA ISAME( 9 ) = LSE( BS, BB, LBB ) ISAME( 10 ) = LDBS.EQ.LDB ISAME( 11 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 12 ) = LSE( CS, CC, LCC ) ELSE ISAME( 12 ) = LSERES( 'GE', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 13 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report * and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I+1 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result. 
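*
*                            The call below recomputes
*                               ALPHA*op( A )*op( B ) + BETA*C
*                            column by column from the original full
*                            arrays A, B and C and compares it with
*                            the result returned by CSGEMM in CC,
*                            flagging the run as suspect or fatally
*                            wrong when the test ratio is too large.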
* CALL SMMCH( TRANSA, TRANSB, M, N, K, $ ALPHA, A, NMAX, B, NMAX, BETA, $ C, NMAX, CT, G, CC, LDC, EPS, $ ERR, FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 120 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME CALL SPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) * 130 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', $ 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', ', $ 'C,', I3, ').' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK1. * END * * * SUBROUTINE SPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N, $ K, ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC REAL ALPHA, BETA CHARACTER*1 TRANSA, TRANSB CHARACTER*12 SNAME CHARACTER*14 CRC, CTA,CTB IF (TRANSA.EQ.'N')THEN CTA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CTA = ' CblasTrans' ELSE CTA = 'CblasConjTrans' END IF IF (TRANSB.EQ.'N')THEN CTB = ' CblasNoTrans' ELSE IF (TRANSB.EQ.'T')THEN CTB = ' CblasTrans' ELSE CTB = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') 9994 FORMAT( 20X, 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', $ F4.1, ', ', 'C,', I3, ').' ) END * SUBROUTINE SCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, $ IORDER ) * * Tests SSYMM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. 
REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, $ NARGS, NC, NS LOGICAL LEFT, NULL, RESET, SAME CHARACTER*1 SIDE, SIDES, UPLO, UPLOS CHARACTER*2 ICHS, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL SMAKE, SMMCH, CSSYMM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICHS/'LR'/, ICHU/'UL'/ * .. Executable Statements .. * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 100 IM = 1, NIDIM M = IDIM( IM ) * DO 90 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 90 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 90 LBB = LDB*N * * Generate the matrix B. * CALL SMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, $ ZERO ) * DO 80 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' * IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * * Generate the symmetric matrix A. * CALL SMAKE( 'SY', UPLO, ' ', NA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL SMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, $ LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO MS = M NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ CALL SPRCN2(NTRA, NC, SNAME, IORDER, $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, $ BETA, LDC) IF( REWI ) $ REWIND NTRA CALL CSSYMM( IORDER, SIDE, UPLO, M, N, ALPHA, $ AA, LDA, BB, LDB, BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 110 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LSE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LSE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB ISAME( 10 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 11 ) = LSE( CS, CC, LCC ) ELSE ISAME( 11 ) = LSERES( 'GE', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I+1 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 110 END IF * IF( .NOT.NULL )THEN * * Check the result. * IF( LEFT )THEN CALL SMMCH( 'N', 'N', M, N, M, ALPHA, A, $ NMAX, B, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL SMMCH( 'N', 'N', M, N, N, ALPHA, B, $ NMAX, A, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. 
) END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 120 * 110 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME CALL SPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, $ LDB, BETA, LDC) * 120 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', $ ' .' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK2. * END * SUBROUTINE SPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, $ ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC REAL ALPHA, BETA CHARACTER*1 SIDE, UPLO CHARACTER*12 SNAME CHARACTER*14 CRC, CS,CU IF (SIDE.EQ.'L')THEN CS = ' CblasLeft' ELSE CS = ' CblasRight' END IF IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') 9994 FORMAT( 20X, 2( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', $ F4.1, ', ', 'C,', I3, ').' ) END * SUBROUTINE SCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, $ B, BB, BS, CT, G, C, IORDER ) * * Tests STRMM and STRSM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. REAL ZERO, ONE PARAMETER ( ZERO = 0.0, ONE = 1.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CT( NMAX ), G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. REAL ALPHA, ALS, ERR, ERRMAX INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, $ NS LOGICAL LEFT, NULL, RESET, SAME CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, $ UPLOS CHARACTER*2 ICHD, ICHS, ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. 
LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL SMAKE, SMMCH, CSTRMM, CSTRSM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ * .. Executable Statements .. * NARGS = 11 NC = 0 RESET = .TRUE. ERRMAX = ZERO * Set up zero matrix for SMMCH. DO 20 J = 1, NMAX DO 10 I = 1, NMAX C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE * DO 140 IM = 1, NIDIM M = IDIM( IM ) * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 130 LBB = LDB*N NULL = M.LE.0.OR.N.LE.0 * DO 120 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 130 LAA = LDA*NA * DO 110 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * DO 100 ICT = 1, 3 TRANSA = ICHT( ICT: ICT ) * DO 90 ICD = 1, 2 DIAG = ICHD( ICD: ICD ) * DO 80 IA = 1, NALF ALPHA = ALF( IA ) * * Generate the matrix A. * CALL SMAKE( 'TR', UPLO, DIAG, NA, NA, A, $ NMAX, AA, LDA, RESET, ZERO ) * * Generate the matrix B. * CALL SMAKE( 'GE', ' ', ' ', M, N, B, NMAX, $ BB, LDB, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO TRANAS = TRANSA DIAGS = DIAG MS = M NS = N ALS = ALPHA DO 30 I = 1, LAA AS( I ) = AA( I ) 30 CONTINUE LDAS = LDA DO 40 I = 1, LBB BS( I ) = BB( I ) 40 CONTINUE LDBS = LDB * * Call the subroutine. * IF( SNAME( 10: 11 ).EQ.'mm' )THEN IF( TRACE ) $ CALL SPRCN3( NTRA, NC, SNAME, IORDER, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB) IF( REWI ) $ REWIND NTRA CALL CSTRMM( IORDER, SIDE, UPLO, TRANSA, $ DIAG, M, N, ALPHA, AA, LDA, $ BB, LDB ) ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN IF( TRACE ) $ CALL SPRCN3( NTRA, NC, SNAME, IORDER, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB) IF( REWI ) $ REWIND NTRA CALL CSTRSM( IORDER, SIDE, UPLO, TRANSA, $ DIAG, M, N, ALPHA, AA, LDA, $ BB, LDB ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = TRANAS.EQ.TRANSA ISAME( 4 ) = DIAGS.EQ.DIAG ISAME( 5 ) = MS.EQ.M ISAME( 6 ) = NS.EQ.N ISAME( 7 ) = ALS.EQ.ALPHA ISAME( 8 ) = LSE( AS, AA, LAA ) ISAME( 9 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 10 ) = LSE( BS, BB, LBB ) ELSE ISAME( 10 ) = LSERES( 'GE', ' ', M, N, BS, $ BB, LDB ) END IF ISAME( 11 ) = LDBS.EQ.LDB * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 50 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I+1 50 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN IF( SNAME( 10: 11 ).EQ.'mm' )THEN * * Check the result. * IF( LEFT )THEN CALL SMMCH( TRANSA, 'N', M, N, M, $ ALPHA, A, NMAX, B, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL SMMCH( 'N', TRANSA, M, N, N, $ ALPHA, B, NMAX, A, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN * * Compute approximation to original * matrix. 
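*
*                               Restating the residual check that
*                               follows: CSTRSM has overwritten BB
*                               with the computed solution X of
*                                  op( A )*X = ALPHA*B   or
*                                  X*op( A ) = ALPHA*B.
*                               X is copied into C, BB is refilled
*                               with ALPHA times the original right-
*                               hand side, and SMMCH then checks that
*                               op( A )*X (or X*op( A )) reproduces
*                               ALPHA*B, rather than comparing X with
*                               a reference solve.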
* DO 70 J = 1, N DO 60 I = 1, M C( I, J ) = BB( I + ( J - 1 )* $ LDB ) BB( I + ( J - 1 )*LDB ) = ALPHA* $ B( I, J ) 60 CONTINUE 70 CONTINUE * IF( LEFT )THEN CALL SMMCH( TRANSA, 'N', M, N, M, $ ONE, A, NMAX, C, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) ELSE CALL SMMCH( 'N', TRANSA, M, N, N, $ ONE, C, NMAX, A, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) END IF END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 150 END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * 140 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 160 * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( TRACE ) $ CALL SPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, $ M, N, ALPHA, LDA, LDB) * 160 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ', B,', I3, ') .' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK3. * END * SUBROUTINE SPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, $ DIAG, M, N, ALPHA, LDA, LDB) INTEGER NOUT, NC, IORDER, M, N, LDA, LDB REAL ALPHA CHARACTER*1 SIDE, UPLO, TRANSA, DIAG CHARACTER*12 SNAME CHARACTER*14 CRC, CS, CU, CA, CD IF (SIDE.EQ.'L')THEN CS = ' CblasLeft' ELSE CS = ' CblasRight' END IF IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (DIAG.EQ.'N')THEN CD = ' CblasNonUnit' ELSE CD = ' CblasUnit' END IF IF (IORDER.EQ.1)THEN CRC = 'CblasRowMajor' ELSE CRC = 'CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') 9994 FORMAT( 22X, 2( A14, ',') , 2( I3, ',' ), $ F4.1, ', A,', I3, ', B,', I3, ').' ) END * SUBROUTINE SCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, $ IORDER ) * * Tests SSYRK. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0 ) * .. Scalar Arguments .. 
REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. REAL ALPHA, ALS, BETA, BETS, ERR, ERRMAX INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, $ NARGS, NC, NS LOGICAL NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS CHARACTER*2 ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL SMAKE, SMMCH, CSSYRK * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICHT/'NTC'/, ICHU/'UL'/ * .. Executable Statements .. * NARGS = 10 NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N NULL = N.LE.0 * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICT = 1, 3 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. * CALL SMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL SMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, $ LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA BETS = BETA DO 20 I = 1, LCC CS( I ) = CC( I ) 20 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ CALL SPRCN4( NTRA, NC, SNAME, IORDER, UPLO, $ TRANS, N, K, ALPHA, LDA, BETA, LDC) IF( REWI ) $ REWIND NTRA CALL CSSYRK( IORDER, UPLO, TRANS, N, K, ALPHA, $ AA, LDA, BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LSE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = BETS.EQ.BETA IF( NULL )THEN ISAME( 9 ) = LSE( CS, CC, LCC ) ELSE ISAME( 9 ) = LSERES( 'SY', UPLO, N, N, CS, $ CC, LDC ) END IF ISAME( 10 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 30 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I+1 30 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. 
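*
*                            Outline of the column-by-column check
*                            below: only the referenced triangle of C
*                            is verified.  For column J, rows JJ to
*                            JJ + LJ - 1 of
*                               ALPHA*A*A**T + BETA*C   (TRANS = 'N')
*                            or ALPHA*A**T*A + BETA*C   otherwise
*                            are recomputed by SMMCH from the full
*                            arrays and compared with the data
*                            returned in CC( JC ), while JC steps
*                            through the stored columns of C.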
* JC = 1 DO 40 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN CALL SMMCH( 'T', 'N', LJ, 1, K, ALPHA, $ A( 1, JJ ), NMAX, $ A( 1, J ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL SMMCH( 'N', 'T', LJ, 1, K, ALPHA, $ A( JJ, 1 ), NMAX, $ A( J, 1 ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 40 CONTINUE END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 130 * 110 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME CALL SPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, $ LDA, BETA, LDC) * 130 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') .' ) 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK4. * END * SUBROUTINE SPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, $ N, K, ALPHA, LDA, BETA, LDC) INTEGER NOUT, NC, IORDER, N, K, LDA, LDC REAL ALPHA, BETA CHARACTER*1 UPLO, TRANSA CHARACTER*12 SNAME CHARACTER*14 CRC, CU, CA IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) 9994 FORMAT( 20X, 2( I3, ',' ), $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' ) END * SUBROUTINE SCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, $ IORDER ) * * Tests SSYR2K. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0 ) * .. Scalar Arguments .. 
REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. REAL AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ G( NMAX ), W( 2*NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. REAL ALPHA, ALS, BETA, BETS, ERR, ERRMAX INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS LOGICAL NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS CHARACTER*2 ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL SMAKE, SMMCH, CSSYR2K * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICHT/'NTC'/, ICHU/'UL'/ * .. Executable Statements .. * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 130 LCC = LDC*N NULL = N.LE.0 * DO 120 IK = 1, NIDIM K = IDIM( IK ) * DO 110 ICT = 1, 3 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 110 LAA = LDA*NA * * Generate the matrix A. * IF( TRAN )THEN CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, $ LDA, RESET, ZERO ) ELSE CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, $ RESET, ZERO ) END IF * * Generate the matrix B. * LDB = LDA LBB = LAA IF( TRAN )THEN CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), $ 2*NMAX, BB, LDB, RESET, ZERO ) ELSE CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), $ NMAX, BB, LDB, RESET, ZERO ) END IF * DO 100 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 90 IA = 1, NALF ALPHA = ALF( IA ) * DO 80 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL SMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, $ LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BETS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ CALL SPRCN5( NTRA, NC, SNAME, IORDER, UPLO, $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC) IF( REWI ) $ REWIND NTRA CALL CSSYR2K( IORDER, UPLO, TRANS, N, K, ALPHA, $ AA, LDA, BB, LDB, BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. 
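*
*                       Summary of the consistency check that
*                       follows: every argument was saved before the
*                       call (UPLOS, TRANSS, NS, KS, ALS, AS, BS, CS,
*                       BETS and the leading dimensions), and ISAME
*                       now records, argument by argument, that the
*                       inputs are bit for bit unchanged (LSE) and
*                       that only the referenced triangle of C has
*                       been modified (LSERES).  Any violation is
*                       reported as a parameter changed incorrectly.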
* ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LSE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LSE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB ISAME( 10 ) = BETS.EQ.BETA IF( NULL )THEN ISAME( 11 ) = LSE( CS, CC, LCC ) ELSE ISAME( 11 ) = LSERES( 'SY', UPLO, N, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I+1 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * JJAB = 1 JC = 1 DO 70 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN DO 50 I = 1, K W( I ) = AB( ( J - 1 )*2*NMAX + K + $ I ) W( K + I ) = AB( ( J - 1 )*2*NMAX + $ I ) 50 CONTINUE CALL SMMCH( 'T', 'N', LJ, 1, 2*K, $ ALPHA, AB( JJAB ), 2*NMAX, $ W, 2*NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE DO 60 I = 1, K W( I ) = AB( ( K + I - 1 )*NMAX + $ J ) W( K + I ) = AB( ( I - 1 )*NMAX + $ J ) 60 CONTINUE CALL SMMCH( 'N', 'N', LJ, 1, 2*K, $ ALPHA, AB( JJ ), NMAX, W, $ 2*NMAX, BETA, C( JJ, J ), $ NMAX, CT, G, CC( JC ), LDC, $ EPS, ERR, FATAL, NOUT, $ .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 IF( TRAN ) $ JJAB = JJAB + 2*NMAX END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 140 70 CONTINUE END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 160 * 140 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME CALL SPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, $ LDA, LDB, BETA, LDC) * 160 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', $ ' .' ) 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK5. 
* END * SUBROUTINE SPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, $ N, K, ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC REAL ALPHA, BETA CHARACTER*1 UPLO, TRANSA CHARACTER*12 SNAME CHARACTER*14 CRC, CU, CA IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) 9994 FORMAT( 20X, 2( I3, ',' ), $ F4.1, ', A,', I3, ', B', I3, ',', F4.1, ', C,', I3, ').' ) END * SUBROUTINE SMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, $ TRANSL ) * * Generates values for an M by N matrix A. * Stores the values in the array AA in the data structure required * by the routine, with unwanted elements set to rogue value. * * TYPE is 'GE', 'SY' or 'TR'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. REAL ZERO, ONE PARAMETER ( ZERO = 0.0, ONE = 1.0 ) REAL ROGUE PARAMETER ( ROGUE = -1.0E10 ) * .. Scalar Arguments .. REAL TRANSL INTEGER LDA, M, N, NMAX LOGICAL RESET CHARACTER*1 DIAG, UPLO CHARACTER*2 TYPE * .. Array Arguments .. REAL A( NMAX, * ), AA( * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER * .. External Functions .. REAL SBEG EXTERNAL SBEG * .. Executable Statements .. GEN = TYPE.EQ.'GE' SYM = TYPE.EQ.'SY' TRI = TYPE.EQ.'TR' UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' UNIT = TRI.AND.DIAG.EQ.'U' * * Generate data in array A. * DO 20 J = 1, N DO 10 I = 1, M IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) $ THEN A( I, J ) = SBEG( RESET ) + TRANSL IF( I.NE.J )THEN * Set some elements to zero IF( N.GT.3.AND.J.EQ.N/2 ) $ A( I, J ) = ZERO IF( SYM )THEN A( J, I ) = A( I, J ) ELSE IF( TRI )THEN A( J, I ) = ZERO END IF END IF END IF 10 CONTINUE IF( TRI ) $ A( J, J ) = A( J, J ) + ONE IF( UNIT ) $ A( J, J ) = ONE 20 CONTINUE * * Store elements in array AS in data structure required by routine. * IF( TYPE.EQ.'GE' )THEN DO 50 J = 1, N DO 30 I = 1, M AA( I + ( J - 1 )*LDA ) = A( I, J ) 30 CONTINUE DO 40 I = M + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 40 CONTINUE 50 CONTINUE ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN DO 90 J = 1, N IF( UPPER )THEN IBEG = 1 IF( UNIT )THEN IEND = J - 1 ELSE IEND = J END IF ELSE IF( UNIT )THEN IBEG = J + 1 ELSE IBEG = J END IF IEND = N END IF DO 60 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 60 CONTINUE DO 70 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I, J ) 70 CONTINUE DO 80 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 80 CONTINUE 90 CONTINUE END IF RETURN * * End of SMAKE. * END SUBROUTINE SMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, $ NOUT, MV ) * * Checks the results of the computational tests. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. 
REAL ZERO, ONE PARAMETER ( ZERO = 0.0, ONE = 1.0 ) * .. Scalar Arguments .. REAL ALPHA, BETA, EPS, ERR INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT LOGICAL FATAL, MV CHARACTER*1 TRANSA, TRANSB * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ), C( LDC, * ), $ CC( LDCC, * ), CT( * ), G( * ) * .. Local Scalars .. REAL ERRI INTEGER I, J, K LOGICAL TRANA, TRANB * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT * .. Executable Statements .. TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' * * Compute expected result, one column at a time, in CT using data * in A, B and C. * Compute gauges in G. * DO 120 J = 1, N * DO 10 I = 1, M CT( I ) = ZERO G( I ) = ZERO 10 CONTINUE IF( .NOT.TRANA.AND..NOT.TRANB )THEN DO 30 K = 1, KK DO 20 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( K, J ) G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( K, J ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRANA.AND..NOT.TRANB )THEN DO 50 K = 1, KK DO 40 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( K, J ) G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( K, J ) ) 40 CONTINUE 50 CONTINUE ELSE IF( .NOT.TRANA.AND.TRANB )THEN DO 70 K = 1, KK DO 60 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( J, K ) G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( J, K ) ) 60 CONTINUE 70 CONTINUE ELSE IF( TRANA.AND.TRANB )THEN DO 90 K = 1, KK DO 80 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( J, K ) G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( J, K ) ) 80 CONTINUE 90 CONTINUE END IF DO 100 I = 1, M CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) G( I ) = ABS( ALPHA )*G( I ) + ABS( BETA )*ABS( C( I, J ) ) 100 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO DO 110 I = 1, M ERRI = ABS( CT( I ) - CC( I, J ) )/EPS IF( G( I ).NE.ZERO ) $ ERRI = ERRI/G( I ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ GO TO 130 110 CONTINUE * 120 CONTINUE * * If the loop completes, all results are at least half accurate. GO TO 150 * * Report fatal error. * 130 FATAL = .TRUE. WRITE( NOUT, FMT = 9999 ) DO 140 I = 1, M IF( MV )THEN WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) ELSE WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) END IF 140 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9997 )J * 150 CONTINUE RETURN * 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', $ 'TED RESULT' ) 9998 FORMAT( 1X, I7, 2G18.6 ) 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) * * End of SMMCH. * END LOGICAL FUNCTION LSE( RI, RJ, LR ) * * Tests if two arrays are identical. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LR * .. Array Arguments .. REAL RI( * ), RJ( * ) * .. Local Scalars .. INTEGER I * .. Executable Statements .. DO 10 I = 1, LR IF( RI( I ).NE.RJ( I ) ) $ GO TO 20 10 CONTINUE LSE = .TRUE. GO TO 30 20 CONTINUE LSE = .FALSE. 30 RETURN * * End of LSE. * END LOGICAL FUNCTION LSERES( TYPE, UPLO, M, N, AA, AS, LDA ) * * Tests if selected elements in two arrays are equal. * * TYPE is 'GE' or 'SY'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LDA, M, N CHARACTER*1 UPLO CHARACTER*2 TYPE * .. 
Array Arguments .. REAL AA( LDA, * ), AS( LDA, * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL UPPER * .. Executable Statements .. UPPER = UPLO.EQ.'U' IF( TYPE.EQ.'GE' )THEN DO 20 J = 1, N DO 10 I = M + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 10 CONTINUE 20 CONTINUE ELSE IF( TYPE.EQ.'SY' )THEN DO 50 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 30 I = 1, IBEG - 1 IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 30 CONTINUE DO 40 I = IEND + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 40 CONTINUE 50 CONTINUE END IF * 60 CONTINUE LSERES = .TRUE. GO TO 80 70 CONTINUE LSERES = .FALSE. 80 RETURN * * End of LSERES. * END REAL FUNCTION SBEG( RESET ) * * Generates random numbers uniformly distributed between -0.5 and 0.5. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. LOGICAL RESET * .. Local Scalars .. INTEGER I, IC, MI * .. Save statement .. SAVE I, IC, MI * .. Executable Statements .. IF( RESET )THEN * Initialize local variables. MI = 891 I = 7 IC = 0 RESET = .FALSE. END IF * * The sequence of values of I is bounded between 1 and 999. * If initial I = 1,2,3,6,7 or 9, the period will be 50. * If initial I = 4 or 8, the period will be 25. * If initial I = 5, the period will be 10. * IC is used to break up the period by skipping 1 value of I in 6. * IC = IC + 1 10 I = I*MI I = I - 1000*( I/1000 ) IF( IC.GE.5 )THEN IC = 0 GO TO 10 END IF SBEG = ( I - 500 )/1001.0 RETURN * * End of SBEG. * END REAL FUNCTION SDIFF( X, Y ) * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. REAL X, Y * .. Executable Statements .. SDIFF = X - Y RETURN * * End of SDIFF. * END OpenBLAS-0.2.20/ctest/c_xerbla.c000066400000000000000000000073721313527062700162220ustar00rootroot00000000000000#include #include #include #include #include "common.h" #include "cblas_test.h" void cblas_xerbla(blasint info, char *rout, char *form, ...) { extern int cblas_lerr, cblas_info, cblas_ok; extern int link_xerbla; extern int RowMajorStrg; extern char *cblas_rout; /* Initially, c__3chke will call this routine with * global variable link_xerbla=1, and F77_xerbla will set link_xerbla=0. * This is done to fool the linker into loading these subroutines first * instead of ones in the CBLAS or the legacy BLAS library. */ if (link_xerbla) return; if (cblas_rout != NULL && strcmp(cblas_rout, rout) != 0){ printf("***** XERBLA WAS CALLED WITH SRNAME = <%s> INSTEAD OF <%s> *******\n", rout, cblas_rout); cblas_ok = FALSE; } if (RowMajorStrg) { /* To properly check leading dimension problems in cblas__gemm, we * need to do the following trick. When cblas__gemm is called with * CblasRowMajor, the arguments A and B switch places in the call to * f77__gemm. Thus when we test for bad leading dimension problems * for A and B, lda is in position 11 instead of 9, and ldb is in * position 9 instead of 11. 
*/ if (strstr(rout,"gemm") != 0) { if (info == 5 ) info = 4; else if (info == 4 ) info = 5; else if (info == 11) info = 9; else if (info == 9 ) info = 11; } else if (strstr(rout,"symm") != 0 || strstr(rout,"hemm") != 0) { if (info == 5 ) info = 4; else if (info == 4 ) info = 5; } else if (strstr(rout,"trmm") != 0 || strstr(rout,"trsm") != 0) { if (info == 7 ) info = 6; else if (info == 6 ) info = 7; } else if (strstr(rout,"gemv") != 0) { if (info == 4) info = 3; else if (info == 3) info = 4; } else if (strstr(rout,"gbmv") != 0) { if (info == 4) info = 3; else if (info == 3) info = 4; else if (info == 6) info = 5; else if (info == 5) info = 6; } else if (strstr(rout,"ger") != 0) { if (info == 3) info = 2; else if (info == 2) info = 3; else if (info == 8) info = 6; else if (info == 6) info = 8; } else if ( ( strstr(rout,"her2") != 0 || strstr(rout,"hpr2") != 0 ) && strstr(rout,"her2k") == 0 ) { if (info == 8) info = 6; else if (info == 6) info = 8; } } if (info != cblas_info){ printf("***** XERBLA WAS CALLED WITH INFO = %d INSTEAD OF %d in %s *******\n",info, cblas_info, rout); cblas_lerr = PASSED; cblas_ok = FALSE; } else cblas_lerr = FAILED; } #ifdef F77_Char void F77_xerbla(F77_Char F77_srname, void *vinfo) #else void F77_xerbla(char *srname, void *vinfo) #endif { #ifdef F77_Char char *srname; #endif char rout[] = {'c','b','l','a','s','_','\0','\0','\0','\0','\0','\0','\0'}; #ifdef F77_Integer F77_Integer *info=vinfo; F77_Integer i; extern F77_Integer link_xerbla; #else int *info=vinfo; int i; extern int link_xerbla; #endif #ifdef F77_Char srname = F2C_STR(F77_srname, XerblaStrLen); #endif /* See the comment in cblas_xerbla() above */ if (link_xerbla) { link_xerbla = 0; return; } for(i=0; i < 6; i++) rout[i+6] = tolower(srname[i]); for(i=11; i >= 9; i--) if (rout[i] == ' ') rout[i] = '\0'; /* We increment *info by 1 since the CBLAS interface adds one more * argument to all level 2 and 3 routines. 
*/ cblas_xerbla(*info+1,rout,""); } #ifdef USE64BITINT #undef int #endif int BLASFUNC(xerbla)(char *name, blasint *info, blasint length) { F77_xerbla(name, info); }; OpenBLAS-0.2.20/ctest/c_z2chke.c000066400000000000000000001014311313527062700161220ustar00rootroot00000000000000#include #include #include "common.h" #include "cblas_test.h" int cblas_ok, cblas_lerr, cblas_info; int link_xerbla=TRUE; char *cblas_rout; #ifdef F77_Char void F77_xerbla(F77_Char F77_srname, void *vinfo); #else void F77_xerbla(char *srname, void *vinfo); #endif void chkxer(void) { extern int cblas_ok, cblas_lerr, cblas_info; extern int link_xerbla; extern char *cblas_rout; if (cblas_lerr == 1 ) { printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); cblas_ok = 0 ; } cblas_lerr = 1 ; } void F77_z2chke(char *rout) { char *sf = ( rout ) ; double A[2] = {0.0,0.0}, X[2] = {0.0,0.0}, Y[2] = {0.0,0.0}, ALPHA[2] = {0.0,0.0}, BETA[2] = {0.0,0.0}, RALPHA = 0.0; extern int cblas_info, cblas_lerr, cblas_ok; extern int RowMajorStrg; extern char *cblas_rout; if (link_xerbla) /* call these first to link */ { cblas_xerbla(cblas_info,cblas_rout,""); F77_xerbla(cblas_rout,&cblas_info); } cblas_ok = TRUE ; cblas_lerr = PASSED ; if (strncmp( sf,"cblas_zgemv",11)==0) { cblas_rout = "cblas_zgemv"; cblas_info = 1; cblas_zgemv(INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zgemv(CblasColMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zgemv(CblasColMajor, CblasNoTrans, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zgemv(CblasColMajor, CblasNoTrans, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_zgemv(CblasColMajor, CblasNoTrans, 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_zgemv(CblasColMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_zgemv(CblasColMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE; cblas_zgemv(CblasRowMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_zgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_zgbmv",11)==0) { cblas_rout = "cblas_zgbmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_zgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 
0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_zgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_zgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_zgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_zgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_zhemv",11)==0) { cblas_rout = "cblas_zhemv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_zhemv(INVALID, CblasUpper, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zhemv(CblasColMajor, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zhemv(CblasColMajor, CblasUpper, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_zhemv(CblasColMajor, CblasUpper, 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zhemv(CblasColMajor, CblasUpper, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zhemv(CblasColMajor, CblasUpper, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_zhemv(CblasRowMajor, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_zhemv(CblasRowMajor, CblasUpper, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_zhemv(CblasRowMajor, CblasUpper, 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zhemv(CblasRowMajor, CblasUpper, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zhemv(CblasRowMajor, CblasUpper, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_zhbmv",11)==0) { cblas_rout = "cblas_zhbmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_zhbmv(INVALID, CblasUpper, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zhbmv(CblasColMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); 
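   /*
    * How these error-exit tests work (explanatory note; the sketch below is
    * compiled out and only mirrors a zgemv case that already appears above):
    * cblas_info holds the 1-based CBLAS argument position that the next call
    * is expected to reject, RowMajorStrg tells cblas_xerbla() in c_xerbla.c
    * whether to translate the position reported for a row-major call, and
    * chkxer() fails the case if no error was reported at all.  F77_xerbla()
    * adds 1 to the Fortran INFO value because, as noted in c_xerbla.c, the
    * CBLAS interface adds one more argument (the order) to every level-2 and
    * level-3 routine.
    */
#if 0
   /* Shape of one such case: lda (argument 7) is 1 although M = 2, so
    * cblas_zgemv is expected to report argument 7 as illegal.            */
   cblas_info = 7; RowMajorStrg = FALSE;
   cblas_zgemv(CblasColMajor, CblasNoTrans, 2, 0, ALPHA, A, 1, X, 1,
               BETA, Y, 1);
   chkxer();
#endif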
chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zhbmv(CblasColMajor, CblasUpper, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zhbmv(CblasColMajor, CblasUpper, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_zhbmv(CblasColMajor, CblasUpper, 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_zhbmv(CblasColMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_zhbmv(CblasColMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_zhbmv(CblasRowMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_zhbmv(CblasRowMajor, CblasUpper, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zhbmv(CblasRowMajor, CblasUpper, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_zhpmv",11)==0) { cblas_rout = "cblas_zhpmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_zhpmv(INVALID, CblasUpper, 0, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zhpmv(CblasColMajor, INVALID, 0, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zhpmv(CblasColMajor, CblasUpper, INVALID, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_zhpmv(CblasColMajor, CblasUpper, 0, ALPHA, A, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zhpmv(CblasColMajor, CblasUpper, 0, ALPHA, A, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_zhpmv(CblasRowMajor, INVALID, 0, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_zhpmv(CblasRowMajor, CblasUpper, INVALID, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_zhpmv(CblasRowMajor, CblasUpper, 0, ALPHA, A, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zhpmv(CblasRowMajor, CblasUpper, 0, ALPHA, A, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ztrmv",11)==0) { cblas_rout = "cblas_ztrmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_ztrmv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ztrmv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ztrmv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); 
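   /*
    * Note on the row-major halves of these blocks: the expected argument
    * numbers are always the CBLAS positions, even though the underlying
    * column-major kernel sees rearranged arguments for CblasRowMajor.
    * cblas_xerbla() in c_xerbla.c translates the reported position back
    * (for example it swaps 3 and 4 for *gemv and 9 and 11 for *gemm)
    * before comparing it with cblas_info, so the same expected values can
    * be used for both orders.  The compiled-out sketch mirrors a row-major
    * case from the zgemv block above; the suggested cause of the 3/4 swap
    * (M and N changing places) is an inference from c_xerbla.c, not
    * something this file states.
    */
#if 0
   cblas_info = 4; RowMajorStrg = TRUE;      /* N (CBLAS argument 4) is invalid */
   cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, ALPHA, A, 1, X, 1,
               BETA, Y, 1);
   chkxer();                                 /* c_xerbla.c maps the reported   */
                                             /* position back to 4             */
#endif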
cblas_info = 2; RowMajorStrg = TRUE; cblas_ztrmv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_ztrmv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ztbmv",11)==0) { cblas_rout = "cblas_ztbmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_ztbmv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ztbmv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ztbmv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_ztbmv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_ztbmv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ztpmv",11)==0) { cblas_rout = "cblas_ztpmv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_ztpmv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ztpmv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ztpmv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 
X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_ztpmv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_ztpmv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ztrsv",11)==0) { cblas_rout = "cblas_ztrsv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_ztrsv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ztrsv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ztrsv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_ztrsv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_ztrsv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ztbsv",11)==0) { cblas_rout = "cblas_ztbsv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_ztbsv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ztbsv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ztbsv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); 
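   /*
    * The banded cases with "0, 1, ..., A, 1" above and below exercise the
    * leading-dimension check that is specific to band storage: zhbmv, ztbmv
    * and ztbsv require lda >= K + 1, and zgbmv requires lda >= KL + KU + 1,
    * so a band width of 1 combined with lda = 1 is expected to be rejected
    * at the lda position (argument 7, 8 or 9 depending on the routine).
    */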
chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_ztbsv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_ztbsv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ztpsv",11)==0) { cblas_rout = "cblas_ztpsv"; cblas_info = 1; RowMajorStrg = FALSE; cblas_ztpsv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ztpsv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ztpsv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_ztpsv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_ztpsv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_zgeru",10)==0) { cblas_rout = "cblas_zgeru"; cblas_info = 1; RowMajorStrg = FALSE; cblas_zgeru(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zgeru(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zgeru(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_zgeru(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zgeru(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zgeru(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_zgeru(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_zgeru(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; 
cblas_zgeru(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zgeru(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zgeru(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); } else if (strncmp( sf,"cblas_zgerc",10)==0) { cblas_rout = "cblas_zgerc"; cblas_info = 1; RowMajorStrg = FALSE; cblas_zgerc(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zgerc(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zgerc(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_zgerc(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zgerc(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zgerc(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_zgerc(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_zgerc(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_zgerc(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zgerc(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zgerc(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); } else if (strncmp( sf,"cblas_zher2",11)==0) { cblas_rout = "cblas_zher2"; cblas_info = 1; RowMajorStrg = FALSE; cblas_zher2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zher2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zher2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_zher2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zher2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zher2(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_zher2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_zher2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_zher2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zher2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zher2(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); chkxer(); } else if (strncmp( sf,"cblas_zhpr2",11)==0) { cblas_rout = "cblas_zhpr2"; cblas_info = 1; RowMajorStrg = FALSE; cblas_zhpr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zhpr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zhpr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_zhpr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zhpr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); 
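   /*
    * The zher and zhpr blocks further down pass RALPHA (a plain double)
    * where the other routines in this file pass ALPHA (a two-element array
    * holding a complex value): the scaling factor of a Hermitian rank-1
    * update is real by definition, so those CBLAS prototypes take a scalar
    * double rather than a pointer to a complex value.  A compiled-out
    * illustration of the two calling conventions, copied from cases that
    * appear elsewhere in this routine:
    */
#if 0
   cblas_zher (CblasColMajor, CblasUpper, 0, RALPHA, X, 1, A, 1);        /* real alpha    */
   cblas_zher2(CblasColMajor, CblasUpper, 0, ALPHA,  X, 1, Y, 1, A, 1);  /* complex alpha */
#endif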
chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_zhpr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_zhpr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_zhpr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zhpr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); chkxer(); } else if (strncmp( sf,"cblas_zher",10)==0) { cblas_rout = "cblas_zher"; cblas_info = 1; RowMajorStrg = FALSE; cblas_zher(INVALID, CblasUpper, 0, RALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zher(CblasColMajor, INVALID, 0, RALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zher(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_zher(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zher(CblasColMajor, CblasUpper, 2, RALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; cblas_zher(CblasRowMajor, INVALID, 0, RALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; cblas_zher(CblasRowMajor, CblasUpper, INVALID, RALPHA, X, 1, A, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_zher(CblasRowMajor, CblasUpper, 0, RALPHA, X, 0, A, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zher(CblasRowMajor, CblasUpper, 2, RALPHA, X, 1, A, 1 ); chkxer(); } else if (strncmp( sf,"cblas_zhpr",10)==0) { cblas_rout = "cblas_zhpr"; cblas_info = 1; RowMajorStrg = FALSE; cblas_zhpr(INVALID, CblasUpper, 0, RALPHA, X, 1, A ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zhpr(CblasColMajor, INVALID, 0, RALPHA, X, 1, A ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zhpr(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_zhpr(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zhpr(CblasColMajor, INVALID, 0, RALPHA, X, 1, A ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zhpr(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_zhpr(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A ); chkxer(); } if (cblas_ok == TRUE) printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); else printf("******* %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); } OpenBLAS-0.2.20/ctest/c_z3chke.c000066400000000000000000002252001313527062700161240ustar00rootroot00000000000000#include #include #include "common.h" #include "cblas_test.h" int cblas_ok, cblas_lerr, cblas_info; int link_xerbla=TRUE; char *cblas_rout; #ifdef F77_Char void F77_xerbla(F77_Char F77_srname, void *vinfo); #else void F77_xerbla(char *srname, void *vinfo); #endif void chkxer(void) { extern int cblas_ok, cblas_lerr, cblas_info; extern int link_xerbla; extern char *cblas_rout; if (cblas_lerr == 1 ) { printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); cblas_ok = 0 ; } cblas_lerr = 1 ; } void F77_z3chke(char * rout) { char *sf = ( rout ) ; double A[4] = {0.0,0.0,0.0,0.0}, B[4] = {0.0,0.0,0.0,0.0}, C[4] = {0.0,0.0,0.0,0.0}, ALPHA[2] = {0.0,0.0}, BETA[2] = {0.0,0.0}, RALPHA = 0.0, RBETA = 0.0; extern int cblas_info, cblas_lerr, cblas_ok; extern int RowMajorStrg; extern char *cblas_rout; cblas_ok = TRUE ; cblas_lerr = PASSED ; if (link_xerbla) /* call 
these first to link */ { cblas_xerbla(cblas_info,cblas_rout,""); F77_xerbla(cblas_rout,&cblas_info); } if (strncmp( sf,"cblas_zgemm" ,11)==0) { cblas_rout = "cblas_zgemm" ; cblas_info = 1; cblas_zgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_zgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_zgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_zgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, 
C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; 
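      /*
       * In the 14-argument cblas_zgemm call the leading dimensions sit at
       * positions 9 (lda), 11 (ldb) and 14 (ldc), which is why those values
       * are loaded into cblas_info for the dimension checks in this block.
       * For the CblasRowMajor cases the wrapper hands A and B to the
       * column-major kernel in exchanged roles, and c_xerbla.c swaps the
       * reported positions 9 and 11 (and 4 and 5) back so the expectations
       * here stay in CBLAS numbering.
       */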
RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_zhemm" ,11)==0) { cblas_rout = "cblas_zhemm" ; cblas_info = 1; cblas_zhemm( INVALID, CblasRight, CblasLower, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, INVALID, CblasUpper, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, ALPHA, A, 1, 
B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; 
RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_zsymm" ,11)==0) { cblas_rout = "cblas_zsymm" ; cblas_info = 1; cblas_zsymm( INVALID, CblasRight, CblasLower, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, INVALID, CblasUpper, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); 
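/* Row-major zsymm cases follow the same pattern as the column-major ones above: cblas_info is preset to the 1-based position of the argument that is deliberately invalid, and chkxer() reports it as NOT DETECTED (and clears cblas_ok) if the error exit was not taken. */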
cblas_info = 4; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_ztrmm" ,11)==0) { cblas_rout = "cblas_ztrmm" ; cblas_info = 1; cblas_ztrmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, 
CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); 
chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, 
CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); } else if (strncmp( sf,"cblas_ztrsm" ,11)==0) { cblas_rout = "cblas_ztrsm" ; cblas_info = 1; cblas_ztrsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, INVALID, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ztrsm( 
CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 
1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, 
CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); } else if (strncmp( sf,"cblas_zherk" ,11)==0) { cblas_rout = "cblas_zherk" ; cblas_info = 1; cblas_zherk(INVALID, CblasUpper, CblasNoTrans, 0, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasUpper, CblasTrans, 0, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasUpper, 
CblasNoTrans, INVALID, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zherk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, RALPHA, A, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zherk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, RALPHA, A, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, RALPHA, A, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, RALPHA, A, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zherk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, RALPHA, A, 2, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zherk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, RALPHA, A, 2, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, RALPHA, A, 2, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, RALPHA, A, 2, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_zsyrk" ,11)==0) { cblas_rout = "cblas_zsyrk" ; cblas_info = 1; cblas_zsyrk(INVALID, CblasUpper, CblasNoTrans, 0, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, BETA, 
C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsyrk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, 0, 2, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, 0, 2, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zsyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zsyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zsyrk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zsyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_zher2k" ,12)==0) { cblas_rout = "cblas_zher2k" ; cblas_info = 1; cblas_zher2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, ALPHA, A, 
1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasTrans, 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 1, B, 2, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, ALPHA, A, 1, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 1, B, 2, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, ALPHA, A, 1, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; 
cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_zsyr2k" ,12)==0) { cblas_rout = "cblas_zsyr2k" ; cblas_info = 1; cblas_zsyr2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, 
ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); } if (cblas_ok == 1 ) printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); else printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); } OpenBLAS-0.2.20/ctest/c_z3chke_3m.c000066400000000000000000002514521313527062700165330ustar00rootroot00000000000000#include <stdio.h>
#include <string.h>
#include "common.h"
#include "cblas_test.h"
int cblas_ok, cblas_lerr, cblas_info; int link_xerbla=TRUE; char *cblas_rout;
#ifdef F77_Char
void F77_xerbla(F77_Char F77_srname, void *vinfo);
#else
void F77_xerbla(char *srname, void *vinfo);
#endif
void chkxer(void) { extern int cblas_ok, cblas_lerr, cblas_info; extern int link_xerbla; extern char *cblas_rout; if (cblas_lerr == 1 ) { printf("***** 
ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); cblas_ok = 0 ; } cblas_lerr = 1 ; } void F77_z3chke(char * rout) { char *sf = ( rout ) ; double A[4] = {0.0,0.0,0.0,0.0}, B[4] = {0.0,0.0,0.0,0.0}, C[4] = {0.0,0.0,0.0,0.0}, ALPHA[2] = {0.0,0.0}, BETA[2] = {0.0,0.0}, RALPHA = 0.0, RBETA = 0.0; extern int cblas_info, cblas_lerr, cblas_ok; extern int RowMajorStrg; extern char *cblas_rout; cblas_ok = TRUE ; cblas_lerr = PASSED ; if (link_xerbla) /* call these first to link */ { cblas_xerbla(cblas_info,cblas_rout,""); F77_xerbla(cblas_rout,&cblas_info); } if (strncmp( sf,"cblas_zgemm3m" ,13)==0) { cblas_rout = "cblas_zgemm3" ; cblas_info = 1; cblas_zgemm3m( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_zgemm3m( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_zgemm3m( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_zgemm3m( INVALID, CblasTrans, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 9; 
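/* Argument 9 of cblas_zgemm3m is lda; for column-major data it must be at least M when A is untransposed (at least K when it is transposed), so the lda = 1 calls below are expected to be rejected with parameter number 9. */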
RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); 
chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_zgemm" ,11)==0) { cblas_rout = "cblas_zgemm" ; cblas_info = 1; cblas_zgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_zgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_zgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; cblas_zgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 
0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, 
A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_zhemm" ,11)==0) { cblas_rout = "cblas_zhemm" ; cblas_info = 1; cblas_zhemm( INVALID, CblasRight, CblasLower, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, INVALID, CblasUpper, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); 
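/* Every error-exit check in this file follows the same pattern: cblas_info is
   set to the 1-based position, in the CBLAS argument list, of the argument the
   call below is expected to reject (INVALID marks the deliberately illegal
   enum or dimension), RowMajorStrg records whether the row-major variant is
   being exercised, and chkxer() -- implemented elsewhere in this test harness
   together with the substituted error handler -- records whether the expected
   error was actually reported. */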
cblas_info = 4; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 
5; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_zsymm" ,11)==0) { cblas_rout = "cblas_zsymm" ; cblas_info = 1; cblas_zsymm( INVALID, CblasRight, CblasLower, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, INVALID, CblasUpper, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, INVALID, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 
8; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zsymm( 
CblasRowMajor, CblasLeft, CblasLower, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_ztrmm" ,11)==0) { cblas_rout = "cblas_ztrmm" ; cblas_info = 1; cblas_ztrmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrmm( 
CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; 
RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, 
A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); } else if (strncmp( sf,"cblas_ztrsm" ,11)==0) { cblas_rout = "cblas_ztrsm" ; cblas_info = 1; cblas_ztrsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, INVALID, CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, INVALID, 0, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; 
cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); 
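/* Row-major variants of the preceding cblas_ztrsm checks.  As in the
   column-major block, the tests expect position 6 (M) or 7 (N) to be flagged
   when an INVALID dimension is passed, and position 10 (lda) or 12 (ldb) to be
   flagged when a leading dimension of 1 is supplied where a value of at least
   2 is required. */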
cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, 
CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); chkxer(); } else if (strncmp( sf,"cblas_zherk" ,11)==0) { cblas_rout = "cblas_zherk" ; cblas_info = 1; cblas_zherk(INVALID, CblasUpper, CblasNoTrans, 0, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasUpper, CblasTrans, 0, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zherk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, RALPHA, A, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zherk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, RALPHA, A, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, RALPHA, A, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; 
RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, RALPHA, A, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zherk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, RALPHA, A, 2, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zherk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, RALPHA, A, 2, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, RALPHA, A, 2, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, RALPHA, A, 2, RBETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, RALPHA, A, 1, RBETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_zsyrk" ,11)==0) { cblas_rout = "cblas_zsyrk" ; cblas_info = 1; cblas_zsyrk(INVALID, CblasUpper, CblasNoTrans, 0, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsyrk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 
1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, 0, 2, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, 0, 2, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zsyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zsyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zsyrk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; cblas_zsyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, BETA, C, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, BETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_zher2k" ,12)==0) { cblas_rout = "cblas_zher2k" ; cblas_info = 1; cblas_zher2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasTrans, 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 1, B, 2, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, ALPHA, A, 1, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 1, B, 
2, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, ALPHA, A, 1, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ); chkxer(); } else if (strncmp( sf,"cblas_zsyr2k" ,12)==0) { cblas_rout = "cblas_zsyr2k" ; cblas_info = 1; cblas_zsyr2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; 
cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; 
cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = TRUE; cblas_zsyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); chkxer(); cblas_info = 13; RowMajorStrg = FALSE; cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); } if (cblas_ok == 1 ) printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); else printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); } OpenBLAS-0.2.20/ctest/c_zblas1.c000066400000000000000000000032411313527062700161300ustar00rootroot00000000000000/* * c_zblas1.c * * The program is a C wrapper for zcblat1. * * Written by Keita Teranishi. 2/11/1998 * */ #include "common.h" #include "cblas_test.h" void F77_zaxpy(const int *N, const void *alpha, void *X, const int *incX, void *Y, const int *incY) { cblas_zaxpy(*N, alpha, X, *incX, Y, *incY); return; } void F77_zcopy(const int *N, void *X, const int *incX, void *Y, const int *incY) { cblas_zcopy(*N, X, *incX, Y, *incY); return; } void F77_zdotc(const int *N, const void *X, const int *incX, const void *Y, const int *incY,void *dotc) { cblas_zdotc_sub(*N, X, *incX, Y, *incY, dotc); return; } void F77_zdotu(const int *N, void *X, const int *incX, void *Y, const int *incY,void *dotu) { cblas_zdotu_sub(*N, X, *incX, Y, *incY, dotu); return; } void F77_zdscal(const int *N, const double *alpha, void *X, const int *incX) { cblas_zdscal(*N, *alpha, X, *incX); return; } void F77_zscal(const int *N, const void * *alpha, void *X, const int *incX) { cblas_zscal(*N, alpha, X, *incX); return; } void F77_zswap( const int *N, void *X, const int *incX, void *Y, const int *incY) { cblas_zswap(*N,X,*incX,Y,*incY); return; } int F77_izamax(const int *N, const void *X, const int *incX) { if (*N < 1 || *incX < 1) return(0); return(cblas_izamax(*N, X, *incX)+1); } double F77_dznrm2(const int *N, const void *X, const int *incX) { return cblas_dznrm2(*N, X, *incX); } double F77_dzasum(const int *N, void *X, const int *incX) { return cblas_dzasum(*N, X, *incX); } OpenBLAS-0.2.20/ctest/c_zblas2.c000066400000000000000000000637011313527062700161400ustar00rootroot00000000000000/* * Written by D.P. Manley, Digital Equipment Corporation. * Prefixed "C_" to BLAS routines and their declarations. * * Modified by T. H. Do, 4/08/98, SGI/CRAY Research. 
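 *
 * These F77_* routines are Fortran-callable C wrappers used by the level-2
 * CBLAS test driver: for the row-major case they allocate a row-major copy of
 * the column-major input matrix, call the corresponding cblas_* routine with
 * CblasRowMajor, and copy the result back when the routine updates a matrix
 * (e.g. zgeru/zgerc); for the column-major case they call cblas_* directly.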
*/ #include #include "common.h" #include "cblas_test.h" void F77_zgemv(int *order, char *transp, int *m, int *n, const void *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, const void *x, int *incx, const void *beta, void *y, int *incy) { CBLAS_TEST_ZOMPLEX *A; int i,j,LDA; enum CBLAS_TRANSPOSE trans; get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *n+1; A = (CBLAS_TEST_ZOMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_ZOMPLEX) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; } cblas_zgemv( CblasRowMajor, trans, *m, *n, alpha, A, LDA, x, *incx, beta, y, *incy ); free(A); } else if (*order == TEST_COL_MJR) cblas_zgemv( CblasColMajor, trans, *m, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); else cblas_zgemv( UNDEFINED, trans, *m, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); } void F77_zgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *y, int *incy) { CBLAS_TEST_ZOMPLEX *A; int i,j,irow,jcol,LDA; enum CBLAS_TRANSPOSE trans; get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *ku+*kl+2; A=( CBLAS_TEST_ZOMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*ku; i++ ){ irow=*ku+*kl-i; jcol=(*ku)-i; for( j=jcol; j<*n; j++ ){ A[ LDA*(j-jcol)+irow ].real=a[ (*lda)*j+i ].real; A[ LDA*(j-jcol)+irow ].imag=a[ (*lda)*j+i ].imag; } } i=*ku; irow=*ku+*kl-i; for( j=0; j<*n; j++ ){ A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; } for( i=*ku+1; i<*ku+*kl+1; i++ ){ irow=*ku+*kl-i; jcol=i-(*ku); for( j=jcol; j<(*n+*kl); j++ ){ A[ LDA*j+irow ].real=a[ (*lda)*(j-jcol)+i ].real; A[ LDA*j+irow ].imag=a[ (*lda)*(j-jcol)+i ].imag; } } cblas_zgbmv( CblasRowMajor, trans, *m, *n, *kl, *ku, alpha, A, LDA, x, *incx, beta, y, *incy ); free(A); } else if (*order == TEST_COL_MJR) cblas_zgbmv( CblasColMajor, trans, *m, *n, *kl, *ku, alpha, a, *lda, x, *incx, beta, y, *incy ); else cblas_zgbmv( UNDEFINED, trans, *m, *n, *kl, *ku, alpha, a, *lda, x, *incx, beta, y, *incy ); } void F77_zgeru(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *y, int *incy, CBLAS_TEST_ZOMPLEX *a, int *lda){ CBLAS_TEST_ZOMPLEX *A; int i,j,LDA; if (*order == TEST_ROW_MJR) { LDA = *n+1; A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; } cblas_zgeru( CblasRowMajor, *m, *n, alpha, x, *incx, y, *incy, A, LDA ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ a[ (*lda)*j+i ].real=A[ LDA*i+j ].real; a[ (*lda)*j+i ].imag=A[ LDA*i+j ].imag; } free(A); } else if (*order == TEST_COL_MJR) cblas_zgeru( CblasColMajor, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); else cblas_zgeru( UNDEFINED, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); } void F77_zgerc(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *y, int *incy, CBLAS_TEST_ZOMPLEX *a, int *lda) { CBLAS_TEST_ZOMPLEX *A; int i,j,LDA; if (*order == TEST_ROW_MJR) { LDA = *n+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; } cblas_zgerc( CblasRowMajor, *m, *n, alpha, x, *incx, y, *incy, A, 
LDA ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ a[ (*lda)*j+i ].real=A[ LDA*i+j ].real; a[ (*lda)*j+i ].imag=A[ LDA*i+j ].imag; } free(A); } else if (*order == TEST_COL_MJR) cblas_zgerc( CblasColMajor, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); else cblas_zgerc( UNDEFINED, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); } void F77_zhemv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *y, int *incy){ CBLAS_TEST_ZOMPLEX *A; int i,j,LDA; enum CBLAS_UPLO uplo; get_uplo_type(uplow,&uplo); if (*order == TEST_ROW_MJR) { LDA = *n+1; A = (CBLAS_TEST_ZOMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; } cblas_zhemv( CblasRowMajor, uplo, *n, alpha, A, LDA, x, *incx, beta, y, *incy ); free(A); } else if (*order == TEST_COL_MJR) cblas_zhemv( CblasColMajor, uplo, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); else cblas_zhemv( UNDEFINED, uplo, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); } void F77_zhbmv(int *order, char *uplow, int *n, int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *y, int *incy){ CBLAS_TEST_ZOMPLEX *A; int i,irow,j,jcol,LDA; enum CBLAS_UPLO uplo; get_uplo_type(uplow,&uplo); if (*order == TEST_ROW_MJR) { if (uplo != CblasUpper && uplo != CblasLower ) cblas_zhbmv(CblasRowMajor, UNDEFINED, *n, *k, alpha, a, *lda, x, *incx, beta, y, *incy ); else { LDA = *k+2; A =(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; jcol=(*k)-i; for( j=jcol; j<*n; j++ ) { A[ LDA*(j-jcol)+irow ].real=a[ (*lda)*j+i ].real; A[ LDA*(j-jcol)+irow ].imag=a[ (*lda)*j+i ].imag; } } i=*k; irow=*k-i; for( j=0; j<*n; j++ ) { A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; } } else { i=0; irow=*k-i; for( j=0; j<*n; j++ ) { A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; } for( i=1; i<*k+1; i++ ){ irow=*k-i; jcol=i; for( j=jcol; j<(*n+*k); j++ ) { A[ LDA*j+irow ].real=a[ (*lda)*(j-jcol)+i ].real; A[ LDA*j+irow ].imag=a[ (*lda)*(j-jcol)+i ].imag; } } } cblas_zhbmv( CblasRowMajor, uplo, *n, *k, alpha, A, LDA, x, *incx, beta, y, *incy ); free(A); } } else if (*order == TEST_COL_MJR) cblas_zhbmv(CblasColMajor, uplo, *n, *k, alpha, a, *lda, x, *incx, beta, y, *incy ); else cblas_zhbmv(UNDEFINED, uplo, *n, *k, alpha, a, *lda, x, *incx, beta, y, *incy ); } void F77_zhpmv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *ap, CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *y, int *incy){ CBLAS_TEST_ZOMPLEX *A, *AP; int i,j,k,LDA; enum CBLAS_UPLO uplo; get_uplo_type(uplow,&uplo); if (*order == TEST_ROW_MJR) { if (uplo != CblasUpper && uplo != CblasLower ) cblas_zhpmv(CblasRowMajor, UNDEFINED, *n, alpha, ap, x, *incx, beta, y, *incy); else { LDA = *n; A = (CBLAS_TEST_ZOMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX )); AP = (CBLAS_TEST_ZOMPLEX* )malloc( (((LDA+1)*LDA)/2)* sizeof( CBLAS_TEST_ZOMPLEX )); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) for( i=0; i #include "common.h" #include "cblas_test.h" #define TEST_COL_MJR 0 #define TEST_ROW_MJR 1 #define UNDEFINED -1 void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n, int *k, 
CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { CBLAS_TEST_ZOMPLEX *A, *B, *C; int i,j,LDA, LDB, LDC; enum CBLAS_TRANSPOSE transa, transb; get_transpose_type(transpa, &transa); get_transpose_type(transpb, &transb); if (*order == TEST_ROW_MJR) { if (transa == CblasNoTrans) { LDA = *k+1; A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else { LDA = *m+1; A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*k; i++ ) for( j=0; j<*m; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } if (transb == CblasNoTrans) { LDB = *n+1; B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_ZOMPLEX) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } else { LDB = *k+1; B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_zgemm( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_zgemm( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else cblas_zgemm( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { CBLAS_TEST_ZOMPLEX *A, *B, *C; int i,j,LDA, LDB, LDC; enum CBLAS_UPLO uplo; enum CBLAS_SIDE side; get_uplo_type(uplow,&uplo); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A= (CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else{ LDA = *n+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } LDB = *n+1; B=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } LDC = *n+1; C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_zhemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_zhemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else cblas_zhemm( 
UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } void F77_zsymm(int *order, char *rtlf, char *uplow, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { CBLAS_TEST_ZOMPLEX *A, *B, *C; int i,j,LDA, LDB, LDC; enum CBLAS_UPLO uplo; enum CBLAS_SIDE side; get_uplo_type(uplow,&uplo); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } else{ LDA = *n+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } LDB = *n+1; B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX )); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) B[i*LDB+j]=b[j*(*ldb)+i]; LDC = *n+1; C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) C[i*LDC+j]=c[j*(*ldc)+i]; cblas_zsymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) c[j*(*ldc)+i]=C[i*LDC+j]; free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_zsymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else cblas_zsymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k, double *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, double *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { int i,j,LDA,LDC; CBLAS_TEST_ZOMPLEX *A, *C; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); if (*order == TEST_ROW_MJR) { if (trans == CblasNoTrans) { LDA = *k+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else{ LDA = *n+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_zherk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(C); } else if (*order == TEST_COL_MJR) cblas_zherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, c, *ldc ); else cblas_zherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, c, *ldc ); } void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { int i,j,LDA,LDC; CBLAS_TEST_ZOMPLEX *A, *C; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); if (*order == TEST_ROW_MJR) { if (trans == CblasNoTrans) { LDA = *k+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; 
A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else{ LDA = *n+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_zsyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(C); } else if (*order == TEST_COL_MJR) cblas_zsyrk(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, beta, c, *ldc ); else cblas_zsyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta, c, *ldc ); } void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb, double *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { int i,j,LDA,LDB,LDC; CBLAS_TEST_ZOMPLEX *A, *B, *C; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); if (*order == TEST_ROW_MJR) { if (trans == CblasNoTrans) { LDA = *k+1; LDB = *k+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX )); B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX )); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } else { LDA = *n+1; LDB = *n+1; A=(CBLAS_TEST_ZOMPLEX* )malloc( LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) ); B=(CBLAS_TEST_ZOMPLEX* )malloc( LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ){ A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_zher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, B, LDB, *beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_zher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, *beta, c, *ldc ); else cblas_zher2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, *beta, c, *ldc ); } void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { int i,j,LDA,LDB,LDC; CBLAS_TEST_ZOMPLEX *A, *B, *C; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); if (*order == TEST_ROW_MJR) { if (trans == CblasNoTrans) { LDA = *k+1; LDB = *k+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } else { LDA = *n+1; LDB = *n+1; 
A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ){ A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_zsyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_zsyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else cblas_zsyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb) { int i,j,LDA,LDB; CBLAS_TEST_ZOMPLEX *A, *B; enum CBLAS_SIDE side; enum CBLAS_DIAG diag; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); get_diag_type(diagn,&diag); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else{ LDA = *n+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } LDB = *n+1; B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } cblas_ztrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, A, LDA, B, LDB ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { b[j*(*ldb)+i].real=B[i*LDB+j].real; b[j*(*ldb)+i].imag=B[i*LDB+j].imag; } free(A); free(B); } else if (*order == TEST_COL_MJR) cblas_ztrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); else cblas_ztrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); } void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb) { int i,j,LDA,LDB; CBLAS_TEST_ZOMPLEX *A, *B; enum CBLAS_SIDE side; enum CBLAS_DIAG diag; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); get_diag_type(diagn,&diag); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else{ LDA = *n+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } LDB = *n+1; B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*m; i++ ) 
for( j=0; j<*n; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } cblas_ztrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, A, LDA, B, LDB ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { b[j*(*ldb)+i].real=B[i*LDB+j].real; b[j*(*ldb)+i].imag=B[i*LDB+j].imag; } free(A); free(B); } else if (*order == TEST_COL_MJR) cblas_ztrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); else cblas_ztrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); } OpenBLAS-0.2.20/ctest/c_zblas3_3m.c000066400000000000000000000514071313527062700165400ustar00rootroot00000000000000/* * Written by D.P. Manley, Digital Equipment Corporation. * Prefixed "C_" to BLAS routines and their declarations. * * Modified by T. H. Do, 4/15/98, SGI/CRAY Research. */ #include #include "common.h" #include "cblas_test.h" #define TEST_COL_MJR 0 #define TEST_ROW_MJR 1 #define UNDEFINED -1 void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n, int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { CBLAS_TEST_ZOMPLEX *A, *B, *C; int i,j,LDA, LDB, LDC; enum CBLAS_TRANSPOSE transa, transb; get_transpose_type(transpa, &transa); get_transpose_type(transpb, &transb); if (*order == TEST_ROW_MJR) { if (transa == CblasNoTrans) { LDA = *k+1; A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else { LDA = *m+1; A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*k; i++ ) for( j=0; j<*m; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } if (transb == CblasNoTrans) { LDB = *n+1; B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_ZOMPLEX) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } else { LDB = *k+1; B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_zgemm( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_zgemm( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else cblas_zgemm( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { CBLAS_TEST_ZOMPLEX *A, *B, *C; int i,j,LDA, LDB, LDC; enum CBLAS_UPLO uplo; enum CBLAS_SIDE side; get_uplo_type(uplow,&uplo); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A= (CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; 
A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else{ LDA = *n+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } LDB = *n+1; B=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } LDC = *n+1; C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_zhemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_zhemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else cblas_zhemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } void F77_zsymm(int *order, char *rtlf, char *uplow, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { CBLAS_TEST_ZOMPLEX *A, *B, *C; int i,j,LDA, LDB, LDC; enum CBLAS_UPLO uplo; enum CBLAS_SIDE side; get_uplo_type(uplow,&uplo); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } else{ LDA = *n+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[i*LDA+j]=a[j*(*lda)+i]; } LDB = *n+1; B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX )); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) B[i*LDB+j]=b[j*(*ldb)+i]; LDC = *n+1; C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) C[i*LDC+j]=c[j*(*ldc)+i]; cblas_zsymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) c[j*(*ldc)+i]=C[i*LDC+j]; free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_zsymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else cblas_zsymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k, double *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, double *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { int i,j,LDA,LDC; CBLAS_TEST_ZOMPLEX *A, *C; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); if (*order == TEST_ROW_MJR) { if (trans == CblasNoTrans) { LDA = *k+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else{ LDA = *n+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; 
C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_zherk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(C); } else if (*order == TEST_COL_MJR) cblas_zherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, c, *ldc ); else cblas_zherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, c, *ldc ); } void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { int i,j,LDA,LDC; CBLAS_TEST_ZOMPLEX *A, *C; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); if (*order == TEST_ROW_MJR) { if (trans == CblasNoTrans) { LDA = *k+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else{ LDA = *n+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_zsyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(C); } else if (*order == TEST_COL_MJR) cblas_zsyrk(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, beta, c, *ldc ); else cblas_zsyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta, c, *ldc ); } void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb, double *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { int i,j,LDA,LDB,LDC; CBLAS_TEST_ZOMPLEX *A, *B, *C; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); if (*order == TEST_ROW_MJR) { if (trans == CblasNoTrans) { LDA = *k+1; LDB = *k+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX )); B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX )); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } else { LDA = *n+1; LDB = *n+1; A=(CBLAS_TEST_ZOMPLEX* )malloc( LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) ); B=(CBLAS_TEST_ZOMPLEX* )malloc( LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ){ A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_zher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, B, LDB, *beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) 
cblas_zher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, *beta, c, *ldc ); else cblas_zher2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, *beta, c, *ldc ); } void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { int i,j,LDA,LDB,LDC; CBLAS_TEST_ZOMPLEX *A, *B, *C; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); if (*order == TEST_ROW_MJR) { if (trans == CblasNoTrans) { LDA = *k+1; LDB = *k+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } else { LDA = *n+1; LDB = *n+1; A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ){ A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_zsyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_zsyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else cblas_zsyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb) { int i,j,LDA,LDB; CBLAS_TEST_ZOMPLEX *A, *B; enum CBLAS_SIDE side; enum CBLAS_DIAG diag; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); get_diag_type(diagn,&diag); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else{ LDA = *n+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } LDB = *n+1; B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } cblas_ztrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, A, LDA, B, LDB ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { b[j*(*ldb)+i].real=B[i*LDB+j].real; b[j*(*ldb)+i].imag=B[i*LDB+j].imag; } free(A); free(B); } else if (*order == TEST_COL_MJR) cblas_ztrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); else cblas_ztrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, 
*ldb); } void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb) { int i,j,LDA,LDB; CBLAS_TEST_ZOMPLEX *A, *B; enum CBLAS_SIDE side; enum CBLAS_DIAG diag; enum CBLAS_UPLO uplo; enum CBLAS_TRANSPOSE trans; get_uplo_type(uplow,&uplo); get_transpose_type(transp,&trans); get_diag_type(diagn,&diag); get_side_type(rtlf,&side); if (*order == TEST_ROW_MJR) { if (side == CblasLeft) { LDA = *m+1; A=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); for( i=0; i<*m; i++ ) for( j=0; j<*m; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else{ LDA = *n+1; A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } LDB = *n+1; B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } cblas_ztrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, A, LDA, B, LDB ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { b[j*(*ldb)+i].real=B[i*LDB+j].real; b[j*(*ldb)+i].imag=B[i*LDB+j].imag; } free(A); free(B); } else if (*order == TEST_COL_MJR) cblas_ztrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); else cblas_ztrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); } void F77_zgemm3m(int *order, char *transpa, char *transpb, int *m, int *n, int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { CBLAS_TEST_ZOMPLEX *A, *B, *C; int i,j,LDA, LDB, LDC; enum CBLAS_TRANSPOSE transa, transb; get_transpose_type(transpa, &transa); get_transpose_type(transpb, &transb); if (*order == TEST_ROW_MJR) { if (transa == CblasNoTrans) { LDA = *k+1; A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*k; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } else { LDA = *m+1; A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*k; i++ ) for( j=0; j<*m; j++ ) { A[i*LDA+j].real=a[j*(*lda)+i].real; A[i*LDA+j].imag=a[j*(*lda)+i].imag; } } if (transb == CblasNoTrans) { LDB = *n+1; B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_ZOMPLEX) ); for( i=0; i<*k; i++ ) for( j=0; j<*n; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } else { LDB = *k+1; B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*k; j++ ) { B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } } LDC = *n+1; C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } cblas_zgemm3m( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { c[j*(*ldc)+i].real=C[i*LDC+j].real; c[j*(*ldc)+i].imag=C[i*LDC+j].imag; } free(A); free(B); free(C); } else if (*order == TEST_COL_MJR) cblas_zgemm3m( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else cblas_zgemm3m( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, b, *ldb, beta, c, 
*ldc ); } OpenBLAS-0.2.20/ctest/c_zblat1.f000066400000000000000000000753141313527062700161460ustar00rootroot00000000000000 PROGRAM ZCBLAT1 * Test program for the COMPLEX*16 Level 1 CBLAS. * Based upon the original CBLAS test routine together with: * F06GAF Example Program Text * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. DOUBLE PRECISION SFAC INTEGER IC * .. External Subroutines .. EXTERNAL CHECK1, CHECK2, HEADER * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA SFAC/9.765625D-4/ * .. Executable Statements .. WRITE (NOUT,99999) DO 20 IC = 1, 10 ICASE = IC CALL HEADER * * Initialize PASS, INCX, INCY, and MODE for a new case. * The value 9999 for INCX, INCY or MODE will appear in the * detailed output, if any, for cases that do not involve * these parameters. * PASS = .TRUE. INCX = 9999 INCY = 9999 MODE = 9999 IF (ICASE.LE.5) THEN CALL CHECK2(SFAC) ELSE IF (ICASE.GE.6) THEN CALL CHECK1(SFAC) END IF * -- Print IF (PASS) WRITE (NOUT,99998) 20 CONTINUE STOP * 99999 FORMAT (' Complex CBLAS Test Program Results',/1X) 99998 FORMAT (' ----- PASS -----') END SUBROUTINE HEADER * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Arrays .. CHARACTER*15 L(10) * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA L(1)/'CBLAS_ZDOTC'/ DATA L(2)/'CBLAS_ZDOTU'/ DATA L(3)/'CBLAS_ZAXPY'/ DATA L(4)/'CBLAS_ZCOPY'/ DATA L(5)/'CBLAS_ZSWAP'/ DATA L(6)/'CBLAS_DZNRM2'/ DATA L(7)/'CBLAS_DZASUM'/ DATA L(8)/'CBLAS_ZSCAL'/ DATA L(9)/'CBLAS_ZDSCAL'/ DATA L(10)/'CBLAS_IZAMAX'/ * .. Executable Statements .. WRITE (NOUT,99999) ICASE, L(ICASE) RETURN * 99999 FORMAT (/' Test of subprogram number',I3,9X,A15) END SUBROUTINE CHECK1(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. DOUBLE PRECISION SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. COMPLEX*16 CA DOUBLE PRECISION SA INTEGER I, J, LEN, NP1 * .. Local Arrays .. COMPLEX*16 CTRUE5(8,5,2), CTRUE6(8,5,2), CV(8,5,2), CX(8), + MWPCS(5), MWPCT(5) DOUBLE PRECISION STRUE2(5), STRUE4(5) INTEGER ITRUE3(5) * .. External Functions .. DOUBLE PRECISION DZASUMTEST, DZNRM2TEST INTEGER IZAMAXTEST EXTERNAL DZASUMTEST, DZNRM2TEST, IZAMAXTEST * .. External Subroutines .. EXTERNAL ZSCALTEST, ZDSCALTEST, CTEST, ITEST1, STEST1 * .. Intrinsic Functions .. INTRINSIC MAX * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. 
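*     CV holds the input vectors for the two increment cases, STRUE2
*     and STRUE4 the expected DZNRM2 and DZASUM results, CTRUE5 and
*     CTRUE6 the expected vectors after ZSCAL and ZDSCAL, and ITRUE3
*     the expected IZAMAX indices.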
DATA SA, CA/0.3D0, (0.4D0,-0.7D0)/ DATA ((CV(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), (0.3D0,-0.4D0), (3.0D0,4.0D0), + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + (0.1D0,-0.3D0), (0.5D0,-0.1D0), (5.0D0,6.0D0), + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + (5.0D0,6.0D0), (5.0D0,6.0D0), (0.1D0,0.1D0), + (-0.6D0,0.1D0), (0.1D0,-0.3D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (0.3D0,0.1D0), (0.1D0,0.4D0), + (0.4D0,0.1D0), (0.1D0,0.2D0), (2.0D0,3.0D0), + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + (4.0D0,5.0D0), (0.3D0,-0.4D0), (6.0D0,7.0D0), + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + (0.1D0,-0.3D0), (8.0D0,9.0D0), (0.5D0,-0.1D0), + (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + (2.0D0,5.0D0), (2.0D0,5.0D0), (0.1D0,0.1D0), + (3.0D0,6.0D0), (-0.6D0,0.1D0), (4.0D0,7.0D0), + (0.1D0,-0.3D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + (7.0D0,2.0D0), (0.3D0,0.1D0), (5.0D0,8.0D0), + (0.1D0,0.4D0), (6.0D0,9.0D0), (0.4D0,0.1D0), + (8.0D0,3.0D0), (0.1D0,0.2D0), (9.0D0,4.0D0)/ DATA STRUE2/0.0D0, 0.5D0, 0.6D0, 0.7D0, 0.7D0/ DATA STRUE4/0.0D0, 0.7D0, 1.0D0, 1.3D0, 1.7D0/ DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), (-0.16D0,-0.37D0), (3.0D0,4.0D0), + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + (-0.17D0,-0.19D0), (0.13D0,-0.39D0), + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + (0.11D0,-0.03D0), (-0.17D0,0.46D0), + (-0.17D0,-0.19D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + (0.19D0,-0.17D0), (0.32D0,0.09D0), + (0.23D0,-0.24D0), (0.18D0,0.01D0), + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0), + (2.0D0,3.0D0)/ DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + (4.0D0,5.0D0), (-0.16D0,-0.37D0), (6.0D0,7.0D0), + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + (-0.17D0,-0.19D0), (8.0D0,9.0D0), + (0.13D0,-0.39D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + (0.11D0,-0.03D0), (3.0D0,6.0D0), + (-0.17D0,0.46D0), (4.0D0,7.0D0), + (-0.17D0,-0.19D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + (7.0D0,2.0D0), (0.19D0,-0.17D0), (5.0D0,8.0D0), + (0.32D0,0.09D0), (6.0D0,9.0D0), + (0.23D0,-0.24D0), (8.0D0,3.0D0), + (0.18D0,0.01D0), (9.0D0,4.0D0)/ DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), (0.09D0,-0.12D0), (3.0D0,4.0D0), + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + (0.03D0,-0.09D0), (0.15D0,-0.03D0), + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + (0.03D0,0.03D0), (-0.18D0,0.03D0), + (0.03D0,-0.09D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + (0.09D0,0.03D0), (0.03D0,0.12D0), + (0.12D0,0.03D0), (0.03D0,0.06D0), (2.0D0,3.0D0), + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ DATA 
((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + (4.0D0,5.0D0), (0.09D0,-0.12D0), (6.0D0,7.0D0), + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + (0.03D0,-0.09D0), (8.0D0,9.0D0), + (0.15D0,-0.03D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + (0.03D0,0.03D0), (3.0D0,6.0D0), + (-0.18D0,0.03D0), (4.0D0,7.0D0), + (0.03D0,-0.09D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + (7.0D0,2.0D0), (0.09D0,0.03D0), (5.0D0,8.0D0), + (0.03D0,0.12D0), (6.0D0,9.0D0), (0.12D0,0.03D0), + (8.0D0,3.0D0), (0.03D0,0.06D0), (9.0D0,4.0D0)/ DATA ITRUE3/0, 1, 2, 2, 2/ * .. Executable Statements .. DO 60 INCX = 1, 2 DO 40 NP1 = 1, 5 N = NP1 - 1 LEN = 2*MAX(N,1) * .. Set vector arguments .. DO 20 I = 1, LEN CX(I) = CV(I,NP1,INCX) 20 CONTINUE IF (ICASE.EQ.6) THEN * .. DZNRM2TEST .. CALL STEST1(DZNRM2TEST(N,CX,INCX),STRUE2(NP1), + STRUE2(NP1),SFAC) ELSE IF (ICASE.EQ.7) THEN * .. DZASUMTEST .. CALL STEST1(DZASUMTEST(N,CX,INCX),STRUE4(NP1), + STRUE4(NP1),SFAC) ELSE IF (ICASE.EQ.8) THEN * .. ZSCALTEST .. CALL ZSCALTEST(N,CA,CX,INCX) CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), + SFAC) ELSE IF (ICASE.EQ.9) THEN * .. ZDSCALTEST .. CALL ZDSCALTEST(N,SA,CX,INCX) CALL CTEST(LEN,CX,CTRUE6(1,NP1,INCX),CTRUE6(1,NP1,INCX), + SFAC) ELSE IF (ICASE.EQ.10) THEN * .. IZAMAXTEST .. CALL ITEST1(IZAMAXTEST(N,CX,INCX),ITRUE3(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' STOP END IF * 40 CONTINUE 60 CONTINUE * INCX = 1 IF (ICASE.EQ.8) THEN * ZSCALTEST * Add a test for alpha equal to zero. CA = (0.0D0,0.0D0) DO 80 I = 1, 5 MWPCT(I) = (0.0D0,0.0D0) MWPCS(I) = (1.0D0,1.0D0) 80 CONTINUE CALL ZSCALTEST(5,CA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) ELSE IF (ICASE.EQ.9) THEN * ZDSCALTEST * Add a test for alpha equal to zero. SA = 0.0D0 DO 100 I = 1, 5 MWPCT(I) = (0.0D0,0.0D0) MWPCS(I) = (1.0D0,1.0D0) 100 CONTINUE CALL ZDSCALTEST(5,SA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) * Add a test for alpha equal to one. SA = 1.0D0 DO 120 I = 1, 5 MWPCT(I) = CX(I) MWPCS(I) = CX(I) 120 CONTINUE CALL ZDSCALTEST(5,SA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) * Add a test for alpha equal to minus one. SA = -1.0D0 DO 140 I = 1, 5 MWPCT(I) = -CX(I) MWPCS(I) = -CX(I) 140 CONTINUE CALL ZDSCALTEST(5,SA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) END IF RETURN END SUBROUTINE CHECK2(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. DOUBLE PRECISION SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. COMPLEX*16 CA,ZTEMP INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY * .. Local Arrays .. COMPLEX*16 CDOT(1), CSIZE1(4), CSIZE2(7,2), CSIZE3(14), + CT10X(7,4,4), CT10Y(7,4,4), CT6(4,4), CT7(4,4), + CT8(7,4,4), CX(7), CX1(7), CY(7), CY1(7) INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) * .. External Functions .. EXTERNAL ZDOTCTEST, ZDOTUTEST * .. External Subroutines .. EXTERNAL ZAXPYTEST, ZCOPYTEST, ZSWAPTEST, CTEST * .. Intrinsic Functions .. INTRINSIC ABS, MIN * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. 
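*     CX1 and CY1 are the input vectors, INCXS, INCYS and NS the
*     increments and vector lengths.  CT6 and CT7 hold the expected
*     ZDOTC and ZDOTU results, CT8 the expected Y after ZAXPY, and
*     CT10X and CT10Y the expected vectors after ZCOPY and ZSWAP;
*     CSIZE1, CSIZE2 and CSIZE3 are the size arrays passed to CTEST.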
DATA CA/(0.4D0,-0.7D0)/ DATA INCXS/1, 2, -2, -1/ DATA INCYS/1, -2, 1, -2/ DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ DATA NS/0, 1, 2, 4/ DATA CX1/(0.7D0,-0.8D0), (-0.4D0,-0.7D0), + (-0.1D0,-0.9D0), (0.2D0,-0.8D0), + (-0.9D0,-0.4D0), (0.1D0,0.4D0), (-0.6D0,0.6D0)/ DATA CY1/(0.6D0,-0.6D0), (-0.9D0,0.5D0), + (0.7D0,-0.6D0), (0.1D0,-0.5D0), (-0.1D0,-0.2D0), + (-0.5D0,-0.3D0), (0.8D0,-0.7D0)/ DATA ((CT8(I,J,1),I=1,7),J=1,4)/(0.6D0,-0.6D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.32D0,-1.41D0), + (-1.55D0,0.5D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.32D0,-1.41D0), (-1.55D0,0.5D0), + (0.03D0,-0.89D0), (-0.38D0,-0.96D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ DATA ((CT8(I,J,2),I=1,7),J=1,4)/(0.6D0,-0.6D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (-0.07D0,-0.89D0), + (-0.9D0,0.5D0), (0.42D0,-1.41D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.78D0,0.06D0), (-0.9D0,0.5D0), + (0.06D0,-0.13D0), (0.1D0,-0.5D0), + (-0.77D0,-0.49D0), (-0.5D0,-0.3D0), + (0.52D0,-1.51D0)/ DATA ((CT8(I,J,3),I=1,7),J=1,4)/(0.6D0,-0.6D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (-0.07D0,-0.89D0), + (-1.18D0,-0.31D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.78D0,0.06D0), (-1.54D0,0.97D0), + (0.03D0,-0.89D0), (-0.18D0,-1.31D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ DATA ((CT8(I,J,4),I=1,7),J=1,4)/(0.6D0,-0.6D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.32D0,-1.41D0), (-0.9D0,0.5D0), + (0.05D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.32D0,-1.41D0), + (-0.9D0,0.5D0), (0.05D0,-0.6D0), (0.1D0,-0.5D0), + (-0.77D0,-0.49D0), (-0.5D0,-0.3D0), + (0.32D0,-1.16D0)/ DATA CT7/(0.0D0,0.0D0), (-0.06D0,-0.90D0), + (0.65D0,-0.47D0), (-0.34D0,-1.22D0), + (0.0D0,0.0D0), (-0.06D0,-0.90D0), + (-0.59D0,-1.46D0), (-1.04D0,-0.04D0), + (0.0D0,0.0D0), (-0.06D0,-0.90D0), + (-0.83D0,0.59D0), (0.07D0,-0.37D0), + (0.0D0,0.0D0), (-0.06D0,-0.90D0), + (-0.76D0,-1.15D0), (-1.33D0,-1.82D0)/ DATA CT6/(0.0D0,0.0D0), (0.90D0,0.06D0), + (0.91D0,-0.77D0), (1.80D0,-0.10D0), + (0.0D0,0.0D0), (0.90D0,0.06D0), (1.45D0,0.74D0), + (0.20D0,0.90D0), (0.0D0,0.0D0), (0.90D0,0.06D0), + (-0.55D0,0.23D0), (0.83D0,-0.39D0), + (0.0D0,0.0D0), (0.90D0,0.06D0), (1.04D0,0.79D0), + (1.95D0,1.22D0)/ DATA ((CT10X(I,J,1),I=1,7),J=1,4)/(0.7D0,-0.8D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.6D0,-0.6D0), (-0.9D0,0.5D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.6D0,-0.6D0), + (-0.9D0,0.5D0), (0.7D0,-0.6D0), (0.1D0,-0.5D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ DATA ((CT10X(I,J,2),I=1,7),J=1,4)/(0.7D0,-0.8D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + 
(0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.7D0,-0.6D0), (-0.4D0,-0.7D0), + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.8D0,-0.7D0), + (-0.4D0,-0.7D0), (-0.1D0,-0.2D0), + (0.2D0,-0.8D0), (0.7D0,-0.6D0), (0.1D0,0.4D0), + (0.6D0,-0.6D0)/ DATA ((CT10X(I,J,3),I=1,7),J=1,4)/(0.7D0,-0.8D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (-0.9D0,0.5D0), (-0.4D0,-0.7D0), + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.1D0,-0.5D0), + (-0.4D0,-0.7D0), (0.7D0,-0.6D0), (0.2D0,-0.8D0), + (-0.9D0,0.5D0), (0.1D0,0.4D0), (0.6D0,-0.6D0)/ DATA ((CT10X(I,J,4),I=1,7),J=1,4)/(0.7D0,-0.8D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.6D0,-0.6D0), (0.7D0,-0.6D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.6D0,-0.6D0), + (0.7D0,-0.6D0), (-0.1D0,-0.2D0), (0.8D0,-0.7D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ DATA ((CT10Y(I,J,1),I=1,7),J=1,4)/(0.6D0,-0.6D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.7D0,-0.8D0), (-0.4D0,-0.7D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.7D0,-0.8D0), + (-0.4D0,-0.7D0), (-0.1D0,-0.9D0), + (0.2D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0)/ DATA ((CT10Y(I,J,2),I=1,7),J=1,4)/(0.6D0,-0.6D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (-0.1D0,-0.9D0), (-0.9D0,0.5D0), + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (-0.6D0,0.6D0), + (-0.9D0,0.5D0), (-0.9D0,-0.4D0), (0.1D0,-0.5D0), + (-0.1D0,-0.9D0), (-0.5D0,-0.3D0), + (0.7D0,-0.8D0)/ DATA ((CT10Y(I,J,3),I=1,7),J=1,4)/(0.6D0,-0.6D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (-0.1D0,-0.9D0), (0.7D0,-0.8D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (-0.6D0,0.6D0), + (-0.9D0,-0.4D0), (-0.1D0,-0.9D0), + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0)/ DATA ((CT10Y(I,J,4),I=1,7),J=1,4)/(0.6D0,-0.6D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.7D0,-0.8D0), (-0.9D0,0.5D0), + (-0.4D0,-0.7D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.7D0,-0.8D0), + (-0.9D0,0.5D0), (-0.4D0,-0.7D0), (0.1D0,-0.5D0), + (-0.1D0,-0.9D0), (-0.5D0,-0.3D0), + (0.2D0,-0.8D0)/ DATA CSIZE1/(0.0D0,0.0D0), (0.9D0,0.9D0), + (1.63D0,1.73D0), (2.90D0,2.78D0)/ DATA CSIZE3/(0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (1.17D0,1.17D0), + (1.17D0,1.17D0), 
(1.17D0,1.17D0), + (1.17D0,1.17D0), (1.17D0,1.17D0), + (1.17D0,1.17D0), (1.17D0,1.17D0)/ DATA CSIZE2/(0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (1.54D0,1.54D0), + (1.54D0,1.54D0), (1.54D0,1.54D0), + (1.54D0,1.54D0), (1.54D0,1.54D0), + (1.54D0,1.54D0), (1.54D0,1.54D0)/ * .. Executable Statements .. DO 60 KI = 1, 4 INCX = INCXS(KI) INCY = INCYS(KI) MX = ABS(INCX) MY = ABS(INCY) * DO 40 KN = 1, 4 N = NS(KN) KSIZE = MIN(2,KN) LENX = LENS(KN,MX) LENY = LENS(KN,MY) * .. initialize all argument arrays .. DO 20 I = 1, 7 CX(I) = CX1(I) CY(I) = CY1(I) 20 CONTINUE IF (ICASE.EQ.1) THEN * .. ZDOTCTEST .. CALL ZDOTCTEST(N,CX,INCX,CY,INCY,ZTEMP) CDOT(1) = ZTEMP CALL CTEST(1,CDOT,CT6(KN,KI),CSIZE1(KN),SFAC) ELSE IF (ICASE.EQ.2) THEN * .. ZDOTUTEST .. CALL ZDOTUTEST(N,CX,INCX,CY,INCY,ZTEMP) CDOT(1) = ZTEMP CALL CTEST(1,CDOT,CT7(KN,KI),CSIZE1(KN),SFAC) ELSE IF (ICASE.EQ.3) THEN * .. ZAXPYTEST .. CALL ZAXPYTEST(N,CA,CX,INCX,CY,INCY) CALL CTEST(LENY,CY,CT8(1,KN,KI),CSIZE2(1,KSIZE),SFAC) ELSE IF (ICASE.EQ.4) THEN * .. ZCOPYTEST .. CALL ZCOPYTEST(N,CX,INCX,CY,INCY) CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0) ELSE IF (ICASE.EQ.5) THEN * .. ZSWAPTEST .. CALL ZSWAPTEST(N,CX,INCX,CY,INCY) CALL CTEST(LENX,CX,CT10X(1,KN,KI),CSIZE3,1.0D0) CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' STOP END IF * 40 CONTINUE 60 CONTINUE RETURN END SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) * ********************************* STEST ************************** * * THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO * SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE * NEGLIGIBLE. * * C. L. LAWSON, JPL, 1974 DEC 10 * * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. DOUBLE PRECISION SFAC INTEGER LEN * .. Array Arguments .. DOUBLE PRECISION SCOMP(LEN), SSIZE(LEN), STRUE(LEN) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. DOUBLE PRECISION SD INTEGER I * .. External Functions .. DOUBLE PRECISION SDIFF EXTERNAL SDIFF * .. Intrinsic Functions .. INTRINSIC ABS * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Executable Statements .. * DO 40 I = 1, LEN SD = SCOMP(I) - STRUE(I) IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0D0) + GO TO 40 * * HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). * IF ( .NOT. PASS) GO TO 20 * PRINT FAIL MESSAGE AND HEADER. PASS = .FALSE. WRITE (NOUT,99999) WRITE (NOUT,99998) 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + STRUE(I), SD, SSIZE(I) 40 CONTINUE RETURN * 99999 FORMAT (' FAIL') 99998 FORMAT (/' CASE N INCX INCY MODE I ', + ' COMP(I) TRUE(I) DIFFERENCE', + ' SIZE(I)',/1X) 99997 FORMAT (1X,I4,I3,3I5,I3,2D36.8,2D12.4) END SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * * THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * * C.L. LAWSON, JPL, 1978 DEC 6 * * .. Scalar Arguments .. DOUBLE PRECISION SCOMP1, SFAC, STRUE1 * .. Array Arguments .. DOUBLE PRECISION SSIZE(*) * .. Local Arrays .. DOUBLE PRECISION SCOMP(1), STRUE(1) * .. External Subroutines .. EXTERNAL STEST * .. Executable Statements .. 
* SCOMP(1) = SCOMP1 STRUE(1) = STRUE1 CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) * RETURN END DOUBLE PRECISION FUNCTION SDIFF(SA,SB) * ********************************* SDIFF ************************** * COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 * * .. Scalar Arguments .. DOUBLE PRECISION SA, SB * .. Executable Statements .. SDIFF = SA - SB RETURN END SUBROUTINE CTEST(LEN,CCOMP,CTRUE,CSIZE,SFAC) * **************************** CTEST ***************************** * * C.L. LAWSON, JPL, 1978 DEC 6 * * .. Scalar Arguments .. DOUBLE PRECISION SFAC INTEGER LEN * .. Array Arguments .. COMPLEX*16 CCOMP(LEN), CSIZE(LEN), CTRUE(LEN) * .. Local Scalars .. INTEGER I * .. Local Arrays .. DOUBLE PRECISION SCOMP(20), SSIZE(20), STRUE(20) * .. External Subroutines .. EXTERNAL STEST * .. Intrinsic Functions .. INTRINSIC DIMAG, DBLE * .. Executable Statements .. DO 20 I = 1, LEN SCOMP(2*I-1) = DBLE(CCOMP(I)) SCOMP(2*I) = DIMAG(CCOMP(I)) STRUE(2*I-1) = DBLE(CTRUE(I)) STRUE(2*I) = DIMAG(CTRUE(I)) SSIZE(2*I-1) = DBLE(CSIZE(I)) SSIZE(2*I) = DIMAG(CSIZE(I)) 20 CONTINUE * CALL STEST(2*LEN,SCOMP,STRUE,SSIZE,SFAC) RETURN END SUBROUTINE ITEST1(ICOMP,ITRUE) * ********************************* ITEST1 ************************* * * THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR * EQUALITY. * C. L. LAWSON, JPL, 1974 DEC 10 * * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. INTEGER ICOMP, ITRUE * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. INTEGER ID * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Executable Statements .. IF (ICOMP.EQ.ITRUE) GO TO 40 * * HERE ICOMP IS NOT EQUAL TO ITRUE. * IF ( .NOT. PASS) GO TO 20 * PRINT FAIL MESSAGE AND HEADER. PASS = .FALSE. WRITE (NOUT,99999) WRITE (NOUT,99998) 20 ID = ICOMP - ITRUE WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID 40 CONTINUE RETURN * 99999 FORMAT (' FAIL') 99998 FORMAT (/' CASE N INCX INCY MODE ', + ' COMP TRUE DIFFERENCE', + /1X) 99997 FORMAT (1X,I4,I3,3I5,2I36,I12) END OpenBLAS-0.2.20/ctest/c_zblat2.f000066400000000000000000003153631313527062700161500ustar00rootroot00000000000000 PROGRAM ZBLAT2 * * Test program for the COMPLEX*16 Level 2 Blas. * * The program must be driven by a short data file. The first 17 records * of the file are read using list-directed input, the last 17 records * are read using the format ( A12, L2 ). An annotated example of a data * file can be obtained by deleting the first 3 characters from the * following 34 lines: * 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO * 6 NUMBER OF VALUES OF N * 0 1 2 3 5 9 VALUES OF N * 4 NUMBER OF VALUES OF K * 0 1 2 4 VALUES OF K * 4 NUMBER OF VALUES OF INCX AND INCY * 1 2 -1 -2 VALUES OF INCX AND INCY * 3 NUMBER OF VALUES OF ALPHA * (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA * 3 NUMBER OF VALUES OF BETA * (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA * cblas_zgemv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_zgbmv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_zhemv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_zhbmv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_zhpmv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ztrmv T PUT F FOR NO TEST. SAME COLUMNS. 
* cblas_ztbmv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ztpmv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ztrsv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ztbsv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_ztpsv T PUT F FOR NO TEST. SAME COLUMNS. * cblas_zgerc T PUT F FOR NO TEST. SAME COLUMNS. * cblas_zgeru T PUT F FOR NO TEST. SAME COLUMNS. * cblas_zher T PUT F FOR NO TEST. SAME COLUMNS. * cblas_zhpr T PUT F FOR NO TEST. SAME COLUMNS. * cblas_zher2 T PUT F FOR NO TEST. SAME COLUMNS. * cblas_zhpr2 T PUT F FOR NO TEST. SAME COLUMNS. * * See: * * Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. * An extended set of Fortran Basic Linear Algebra Subprograms. * * Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics * and Computer Science Division, Argonne National Laboratory, * 9700 South Cass Avenue, Argonne, Illinois 60439, US. * * Or * * NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms * Group Ltd., NAG Central Office, 256 Banbury Road, Oxford * OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st * Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. * * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. INTEGER NIN, NOUT PARAMETER ( NIN = 5, NOUT = 6 ) INTEGER NSUBS PARAMETER ( NSUBS = 17 ) COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO, RHALF, RONE PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, $ NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. DOUBLE PRECISION EPS, ERR, THRESH INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, $ NTRA, LAYOUT LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, $ TSTERR, CORDER, RORDER CHARACTER*1 TRANS CHARACTER*12 SNAMET CHARACTER*32 SNAPS * .. Local Arrays .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), $ X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( 2*NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) LOGICAL LTEST( NSUBS ) CHARACTER*12 SNAMES( NSUBS ) * .. External Functions .. DOUBLE PRECISION DDIFF LOGICAL LZE EXTERNAL DDIFF, LZE * .. External Subroutines .. EXTERNAL ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5, ZCHK6, $ CZ2CHKE, ZMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK CHARACTER*12 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK COMMON /SRNAMC/SRNAMT * .. Data statements .. DATA SNAMES/'cblas_zgemv ', 'cblas_zgbmv ', $ 'cblas_zhemv ','cblas_zhbmv ','cblas_zhpmv ', $ 'cblas_ztrmv ','cblas_ztbmv ','cblas_ztpmv ', $ 'cblas_ztrsv ','cblas_ztbsv ','cblas_ztpsv ', $ 'cblas_zgerc ','cblas_zgeru ','cblas_zher ', $ 'cblas_zhpr ','cblas_zher2 ','cblas_zhpr2 '/ * .. Executable Statements .. * NOUTC = NOUT * * Read name and unit number for summary output file and open file. * READ( NIN, FMT = * )SNAPS READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN OPEN( NTRA, FILE = SNAPS ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI REWI = REWI.AND.TRACE * Read the flag that directs stopping on any failure. READ( NIN, FMT = * )SFATAL * Read the flag that indicates whether error exits are to be tested. 
READ( NIN, FMT = * )TSTERR * Read the flag that indicates whether row-major data layout to be tested. READ( NIN, FMT = * )LAYOUT * Read the threshold value of the test ratio READ( NIN, FMT = * )THRESH * * Read and check the parameter values for the tests. * * Values of N READ( NIN, FMT = * )NIDIM IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN WRITE( NOUT, FMT = 9997 )'N', NIDMAX GO TO 230 END IF READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) DO 10 I = 1, NIDIM IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN WRITE( NOUT, FMT = 9996 )NMAX GO TO 230 END IF 10 CONTINUE * Values of K READ( NIN, FMT = * )NKB IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN WRITE( NOUT, FMT = 9997 )'K', NKBMAX GO TO 230 END IF READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) DO 20 I = 1, NKB IF( KB( I ).LT.0 )THEN WRITE( NOUT, FMT = 9995 ) GO TO 230 END IF 20 CONTINUE * Values of INCX and INCY READ( NIN, FMT = * )NINC IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX GO TO 230 END IF READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) DO 30 I = 1, NINC IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN WRITE( NOUT, FMT = 9994 )INCMAX GO TO 230 END IF 30 CONTINUE * Values of ALPHA READ( NIN, FMT = * )NALF IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX GO TO 230 END IF READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) * Values of BETA READ( NIN, FMT = * )NBET IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX GO TO 230 END IF READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) * * Report values of parameters. * WRITE( NOUT, FMT = 9993 ) WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) IF( .NOT.TSTERR )THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )THRESH WRITE( NOUT, FMT = * ) RORDER = .FALSE. CORDER = .FALSE. IF (LAYOUT.EQ.2) THEN RORDER = .TRUE. CORDER = .TRUE. WRITE( *, FMT = 10002 ) ELSE IF (LAYOUT.EQ.1) THEN RORDER = .TRUE. WRITE( *, FMT = 10001 ) ELSE IF (LAYOUT.EQ.0) THEN CORDER = .TRUE. WRITE( *, FMT = 10000 ) END IF WRITE( *, FMT = * ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 40 I = 1, NSUBS LTEST( I ) = .FALSE. 40 CONTINUE 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT DO 60 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET STOP 70 LTEST( I ) = LTESTT GO TO 50 * 80 CONTINUE CLOSE ( NIN ) * * Compute EPS (the machine precision). * EPS = RONE 90 CONTINUE IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) $ GO TO 100 EPS = RHALF*EPS GO TO 90 100 CONTINUE EPS = EPS + EPS WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of ZMVCH using exact data. * N = MIN( 32, NMAX ) DO 120 J = 1, N DO 110 I = 1, N A( I, J ) = MAX( I - J + 1, 0 ) 110 CONTINUE X( J ) = J Y( J ) = ZERO 120 CONTINUE DO 130 J = 1, N YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 130 CONTINUE * YY holds the exact result. On exit from CMVCH YT holds * the result computed by CMVCH. TRANS = 'N' CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LZE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR STOP END IF TRANS = 'T' CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, $ YY, EPS, ERR, FATAL, NOUT, .TRUE. 
) SAME = LZE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR STOP END IF * * Test each subroutine in turn. * DO 210 ISNUM = 1, NSUBS WRITE( NOUT, FMT = * ) IF( .NOT.LTEST( ISNUM ) )THEN * Subprogram is not to be tested. WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) ELSE SRNAMT = SNAMES( ISNUM ) * Test error exits. IF( TSTERR )THEN CALL CZ2CHKE( SNAMES( ISNUM ) ) WRITE( NOUT, FMT = * ) END IF * Test computations. INFOT = 0 OK = .TRUE. FATAL = .FALSE. GO TO ( 140, 140, 150, 150, 150, 160, 160, $ 160, 160, 160, 160, 170, 170, 180, $ 180, 190, 190 )ISNUM * Test ZGEMV, 01, and ZGBMV, 02. 140 IF (CORDER) THEN CALL ZCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G, 0 ) END IF IF (RORDER) THEN CALL ZCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G, 1 ) END IF GO TO 200 * Test ZHEMV, 03, ZHBMV, 04, and ZHPMV, 05. 150 IF (CORDER) THEN CALL ZCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G, 0 ) END IF IF (RORDER) THEN CALL ZCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G, 1 ) END IF GO TO 200 * Test ZTRMV, 06, ZTBMV, 07, ZTPMV, 08, * ZTRSV, 09, ZTBSV, 10, and ZTPSV, 11. 160 IF (CORDER) THEN CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, $ 0 ) END IF IF (RORDER) THEN CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, $ 1 ) END IF GO TO 200 * Test ZGERC, 12, ZGERU, 13. 170 IF (CORDER) THEN CALL ZCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 0 ) END IF IF (RORDER) THEN CALL ZCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 1 ) END IF GO TO 200 * Test ZHER, 14, and ZHPR, 15. 180 IF (CORDER) THEN CALL ZCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 0 ) END IF IF (RORDER) THEN CALL ZCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 1 ) END IF GO TO 200 * Test ZHER2, 16, and ZHPR2, 17. 
190 IF (CORDER) THEN CALL ZCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 0 ) END IF IF (RORDER) THEN CALL ZCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z, 1 ) END IF * 200 IF( FATAL.AND.SFATAL ) $ GO TO 220 END IF 210 CONTINUE WRITE( NOUT, FMT = 9982 ) GO TO 240 * 220 CONTINUE WRITE( NOUT, FMT = 9981 ) GO TO 240 * 230 CONTINUE WRITE( NOUT, FMT = 9987 ) * 240 CONTINUE IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) STOP * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) 10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', $ 'S THAN', F8.2 ) 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) 9997 FORMAT(' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', $ 'THAN ', I2 ) 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', $ I2 ) 9993 FORMAT(' TESTS OF THE COMPLEX*16 LEVEL 2 BLAS', //' THE F', $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) 9992 FORMAT( ' FOR N ', 9I6 ) 9991 FORMAT( ' FOR K ', 7I6 ) 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) 9989 FORMAT( ' FOR ALPHA ', $ 7('(', F4.1, ',', F4.1, ') ', : ) ) 9988 FORMAT( ' FOR BETA ', $ 7('(', F4.1, ',', F4.1, ') ', : ) ) 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', $ /' ******* TESTS ABANDONED *******' ) 9986 FORMAT(' SUBPROGRAM NAME ',A12, ' NOT RECOGNIZED', /' ******* T', $ 'ESTS ABANDONED *******' ) 9985 FORMAT(' ERROR IN CMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', $ 'ATED WRONGLY.', /' CMVCH WAS CALLED WITH TRANS = ', A1, $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' $ , /' ******* TESTS ABANDONED *******' ) 9984 FORMAT( A12, L2 ) 9983 FORMAT( 1X,A12, ' WAS NOT TESTED' ) 9982 FORMAT( /' END OF TESTS' ) 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) * * End of ZBLAT2. * END SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, $ XS, Y, YY, YS, YT, G, IORDER ) * * Tests CGEMV and CGBMV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX*16 ZERO, HALF PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ HALF = ( 0.5D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, $ NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. 
COMPLEX*16 ALPHA, ALS, BETA, BLS, TRANSL DOUBLE PRECISION ERR, ERRMAX INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, $ NL, NS LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN CHARACTER*1 TRANS, TRANSS CHARACTER*14 CTRANS CHARACTER*3 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL CZGBMV, CZGEMV, ZMAKE, ZMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICH/'NTC'/ * .. Executable Statements .. FULL = SNAME( 9: 9 ).EQ.'e' BANDED = SNAME( 9: 9 ).EQ.'b' * Define the number of arguments. IF( FULL )THEN NARGS = 11 ELSE IF( BANDED )THEN NARGS = 13 END IF * NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 120 IN = 1, NIDIM N = IDIM( IN ) ND = N/2 + 1 * DO 110 IM = 1, 2 IF( IM.EQ.1 ) $ M = MAX( N - ND, 0 ) IF( IM.EQ.2 ) $ M = MIN( N + ND, NMAX ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IKU = 1, NK IF( BANDED )THEN KU = KB( IKU ) KL = MAX( KU - 1, 0 ) ELSE KU = N - 1 KL = M - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = KL + KU + 1 ELSE LDA = M END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 LAA = LDA*N NULL = N.LE.0.OR.M.LE.0 * * Generate the matrix A. * TRANSL = ZERO CALL ZMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, AA, $ LDA, KL, KU, RESET, TRANSL ) * DO 90 IC = 1, 3 TRANS = ICH( IC: IC ) IF (TRANS.EQ.'N')THEN CTRANS = ' CblasNoTrans' ELSE IF (TRANS.EQ.'T')THEN CTRANS = ' CblasTrans' ELSE CTRANS = 'CblasConjTrans' END IF TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' * IF( TRAN )THEN ML = N NL = M ELSE ML = M NL = N END IF * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*NL * * Generate the vector X. * TRANSL = HALF CALL ZMAKE( 'ge', ' ', ' ', 1, NL, X, 1, XX, $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) IF( NL.GT.1 )THEN X( NL/2 ) = ZERO XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO END IF * DO 70 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*ML * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the vector Y. * TRANSL = ZERO CALL ZMAKE( 'ge', ' ', ' ', 1, ML, Y, 1, $ YY, ABS( INCY ), 0, ML - 1, $ RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * TRANSS = TRANS MS = M NS = N KLS = KL KUS = KU ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX BLS = BETA DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ CTRANS, M, N, ALPHA, LDA, INCX, BETA, $ INCY IF( REWI ) $ REWIND NTRA CALL CZGEMV( IORDER, TRANS, M, N, $ ALPHA, AA, LDA, XX, INCX, $ BETA, YY, INCY ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ CTRANS, M, N, KL, KU, ALPHA, LDA, $ INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL CZGBMV( IORDER, TRANS, M, N, KL, $ KU, ALPHA, AA, LDA, XX, $ INCX, BETA, YY, INCY ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 130 END IF * * See what data changed inside subroutines. * * IF(TRANS .NE. 'C' .OR. (INCX .GT. 0 .AND. INCY .GT. 
0)) THEN ISAME( 1 ) = TRANS.EQ.TRANSS ISAME( 2 ) = MS.EQ.M ISAME( 3 ) = NS.EQ.N IF( FULL )THEN ISAME( 4 ) = ALS.EQ.ALPHA ISAME( 5 ) = LZE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA ISAME( 7 ) = LZE( XS, XX, LX ) ISAME( 8 ) = INCXS.EQ.INCX ISAME( 9 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 10 ) = LZE( YS, YY, LY ) ELSE ISAME( 10 ) = LZERES( 'ge', ' ', 1, $ ML, YS, YY, $ ABS( INCY ) ) END IF ISAME( 11 ) = INCYS.EQ.INCY ELSE IF( BANDED )THEN ISAME( 4 ) = KLS.EQ.KL ISAME( 5 ) = KUS.EQ.KU ISAME( 6 ) = ALS.EQ.ALPHA ISAME( 7 ) = LZE( AS, AA, LAA ) ISAME( 8 ) = LDAS.EQ.LDA ISAME( 9 ) = LZE( XS, XX, LX ) ISAME( 10 ) = INCXS.EQ.INCX ISAME( 11 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 12 ) = LZE( YS, YY, LY ) ELSE ISAME( 12 ) = LZERES( 'ge', ' ', 1, $ ML, YS, YY, $ ABS( INCY ) ) END IF ISAME( 13 ) = INCYS.EQ.INCY END IF * * If data was incorrectly changed, report * and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 130 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL ZMVCH( TRANS, M, N, ALPHA, A, $ NMAX, X, INCX, BETA, Y, $ INCY, YT, G, YY, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 130 ELSE * Avoid repeating tests with M.le.0 or * N.le.0. GO TO 110 END IF * END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 140 * 130 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, CTRANS, M, N, ALPHA, LDA, $ INCX, BETA, INCY ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, CTRANS, M, N, KL, KU, $ ALPHA, LDA, INCX, BETA, INCY END IF * 140 CONTINUE RETURN * 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 4( I3, ',' ), '(', $ F4.1, ',', F4.1, '), A,',/ 10x, I3, ', X,', I2, ',(', $ F4.1, ',', F4.1, '), Y,', I2, ') .' ) 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), '(', $ F4.1, ',', F4.1, '), A,',/ 10x, I3, ', X,', I2, ',(', $ F4.1, ',', F4.1, '), Y,', I2, ') .' ) 9993 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK1. * END SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, $ XS, Y, YY, YS, YT, G, IORDER ) * * Tests CHEMV, CHBMV and CHPMV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX*16 ZERO, HALF PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ HALF = ( 0.5D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, $ NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. 
COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS, BETA, BLS, TRANSL DOUBLE PRECISION ERR, ERRMAX INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, $ N, NARGS, NC, NK, NS LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME CHARACTER*1 UPLO, UPLOS CHARACTER*14 CUPLO CHARACTER*2 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL CZHBMV, CZHEMV, CZHPMV, ZMAKE, ZMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 9: 9 ).EQ.'e' BANDED = SNAME( 9: 9 ).EQ.'b' PACKED = SNAME( 9: 9 ).EQ.'p' * Define the number of arguments. IF( FULL )THEN NARGS = 10 ELSE IF( BANDED )THEN NARGS = 11 ELSE IF( PACKED )THEN NARGS = 9 END IF * NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 110 IN = 1, NIDIM N = IDIM( IN ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IK = 1, NK IF( BANDED )THEN K = KB( IK ) ELSE K = N - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = K + 1 ELSE LDA = N END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF NULL = N.LE.0 * DO 90 IC = 1, 2 UPLO = ICH( IC: IC ) IF (UPLO.EQ.'U')THEN CUPLO = ' CblasUpper' ELSE CUPLO = ' CblasLower' END IF * * Generate the matrix A. * TRANSL = ZERO CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, AA, $ LDA, K, K, RESET, TRANSL ) * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL ZMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 70 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the vector Y. * TRANSL = ZERO CALL ZMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, $ TRANSL ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * UPLOS = UPLO NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX BLS = BETA DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ CUPLO, N, ALPHA, LDA, INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL CZHEMV( IORDER, UPLO, N, ALPHA, AA, $ LDA, XX, INCX, BETA, YY, $ INCY ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ CUPLO, N, K, ALPHA, LDA, INCX, BETA, $ INCY IF( REWI ) $ REWIND NTRA CALL CZHBMV( IORDER, UPLO, N, K, ALPHA, $ AA, LDA, XX, INCX, BETA, $ YY, INCY ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ CUPLO, N, ALPHA, INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL CZHPMV( IORDER, UPLO, N, ALPHA, AA, $ XX, INCX, BETA, YY, INCY ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. 
GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N IF( FULL )THEN ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LZE( AS, AA, LAA ) ISAME( 5 ) = LDAS.EQ.LDA ISAME( 6 ) = LZE( XS, XX, LX ) ISAME( 7 ) = INCXS.EQ.INCX ISAME( 8 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 9 ) = LZE( YS, YY, LY ) ELSE ISAME( 9 ) = LZERES( 'ge', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 10 ) = INCYS.EQ.INCY ELSE IF( BANDED )THEN ISAME( 3 ) = KS.EQ.K ISAME( 4 ) = ALS.EQ.ALPHA ISAME( 5 ) = LZE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA ISAME( 7 ) = LZE( XS, XX, LX ) ISAME( 8 ) = INCXS.EQ.INCX ISAME( 9 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 10 ) = LZE( YS, YY, LY ) ELSE ISAME( 10 ) = LZERES( 'ge', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 11 ) = INCYS.EQ.INCY ELSE IF( PACKED )THEN ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LZE( AS, AA, LAA ) ISAME( 5 ) = LZE( XS, XX, LX ) ISAME( 6 ) = INCXS.EQ.INCX ISAME( 7 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 8 ) = LZE( YS, YY, LY ) ELSE ISAME( 8 ) = LZERES( 'ge', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 9 ) = INCYS.EQ.INCY END IF * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL ZMVCH( 'N', N, N, ALPHA, A, NMAX, X, $ INCX, BETA, Y, INCY, YT, G, $ YY, EPS, ERR, FATAL, NOUT, $ .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 120 ELSE * Avoid repeating tests with N.le.0 GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, LDA, INCX, $ BETA, INCY ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, K, ALPHA, LDA, $ INCX, BETA, INCY ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, N, ALPHA, INCX, $ BETA, INCY END IF * 130 CONTINUE RETURN * 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', $ F4.1, '), AP, X,',/ 10x, I2, ',(', F4.1, ',', F4.1, $ '), Y,', I2, ') .' ) 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), '(', $ F4.1, ',', F4.1, '), A,', I3, ', X,',/ 10x, I2, ',(', $ F4.1, ',', F4.1, '), Y,', I2, ') .' ) 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', $ F4.1, '), A,', I3, ', X,',/ 10x, I2, ',(', F4.1, ',', $ F4.1, '), ', 'Y,', I2, ') .' ) 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CZHK2. * END SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z, IORDER ) * * Tests ZTRMV, ZTBMV, ZTPMV, ZTRSV, ZTBSV and ZTPSV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. 
* Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX*16 ZERO, HALF, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ HALF = ( 0.5D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA, $ IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), $ XT( NMAX ), XX( NMAX*INCMAX ), Z( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. COMPLEX*16 TRANSL DOUBLE PRECISION ERR, ERRMAX INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS CHARACTER*14 CUPLO,CTRANS,CDIAG CHARACTER*2 ICHD, ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL ZMAKE, ZMVCH, CZTBMV, CZTBSV, CZTPMV, $ CZTPSV, CZTRMV, CZTRSV * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ * .. Executable Statements .. FULL = SNAME( 9: 9 ).EQ.'r' BANDED = SNAME( 9: 9 ).EQ.'b' PACKED = SNAME( 9: 9 ).EQ.'p' * Define the number of arguments. IF( FULL )THEN NARGS = 8 ELSE IF( BANDED )THEN NARGS = 9 ELSE IF( PACKED )THEN NARGS = 7 END IF * NC = 0 RESET = .TRUE. ERRMAX = RZERO * Set up zero vector for ZMVCH. DO 10 I = 1, NMAX Z( I ) = ZERO 10 CONTINUE * DO 110 IN = 1, NIDIM N = IDIM( IN ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IK = 1, NK IF( BANDED )THEN K = KB( IK ) ELSE K = N - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = K + 1 ELSE LDA = N END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF NULL = N.LE.0 * DO 90 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) IF (UPLO.EQ.'U')THEN CUPLO = ' CblasUpper' ELSE CUPLO = ' CblasLower' END IF * DO 80 ICT = 1, 3 TRANS = ICHT( ICT: ICT ) IF (TRANS.EQ.'N')THEN CTRANS = ' CblasNoTrans' ELSE IF (TRANS.EQ.'T')THEN CTRANS = ' CblasTrans' ELSE CTRANS = 'CblasConjTrans' END IF * DO 70 ICD = 1, 2 DIAG = ICHD( ICD: ICD ) IF (DIAG.EQ.'N')THEN CDIAG = ' CblasNonUnit' ELSE CDIAG = ' CblasUnit' END IF * * Generate the matrix A. * TRANSL = ZERO CALL ZMAKE( SNAME( 8: 9 ), UPLO, DIAG, N, N, A, $ NMAX, AA, LDA, K, K, RESET, TRANSL ) * DO 60 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL ZMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, $ ABS( INCX ), 0, N - 1, RESET, $ TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS DIAGS = DIAG NS = N KS = K DO 20 I = 1, LAA AS( I ) = AA( I ) 20 CONTINUE LDAS = LDA DO 30 I = 1, LX XS( I ) = XX( I ) 30 CONTINUE INCXS = INCX * * Call the subroutine. 
* IF( SNAME( 10: 11 ).EQ.'mv' )THEN IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, LDA, INCX IF( REWI ) $ REWIND NTRA CALL CZTRMV( IORDER, UPLO, TRANS, DIAG, $ N, AA, LDA, XX, INCX ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX IF( REWI ) $ REWIND NTRA CALL CZTBMV( IORDER, UPLO, TRANS, DIAG, $ N, K, AA, LDA, XX, INCX ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, INCX IF( REWI ) $ REWIND NTRA CALL CZTPMV( IORDER, UPLO, TRANS, DIAG, $ N, AA, XX, INCX ) END IF ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, LDA, INCX IF( REWI ) $ REWIND NTRA CALL CZTRSV( IORDER, UPLO, TRANS, DIAG, $ N, AA, LDA, XX, INCX ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX IF( REWI ) $ REWIND NTRA CALL CZTBSV( IORDER, UPLO, TRANS, DIAG, $ N, K, AA, LDA, XX, INCX ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ CUPLO, CTRANS, CDIAG, N, INCX IF( REWI ) $ REWIND NTRA CALL CZTPSV( IORDER, UPLO, TRANS, DIAG, $ N, AA, XX, INCX ) END IF END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = TRANS.EQ.TRANSS ISAME( 3 ) = DIAG.EQ.DIAGS ISAME( 4 ) = NS.EQ.N IF( FULL )THEN ISAME( 5 ) = LZE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 7 ) = LZE( XS, XX, LX ) ELSE ISAME( 7 ) = LZERES( 'ge', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 8 ) = INCXS.EQ.INCX ELSE IF( BANDED )THEN ISAME( 5 ) = KS.EQ.K ISAME( 6 ) = LZE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 8 ) = LZE( XS, XX, LX ) ELSE ISAME( 8 ) = LZERES( 'ge', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 9 ) = INCXS.EQ.INCX ELSE IF( PACKED )THEN ISAME( 5 ) = LZE( AS, AA, LAA ) IF( NULL )THEN ISAME( 6 ) = LZE( XS, XX, LX ) ELSE ISAME( 6 ) = LZERES( 'ge', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 7 ) = INCXS.EQ.INCX END IF * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN IF( SNAME( 10: 11 ).EQ.'mv' )THEN * * Check the result. * CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, $ INCX, ZERO, Z, INCX, XT, G, $ XX, EPS, ERR, FATAL, NOUT, $ .TRUE. ) ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN * * Compute approximation to original vector. * DO 50 I = 1, N Z( I ) = XX( 1 + ( I - 1 )* $ ABS( INCX ) ) XX( 1 + ( I - 1 )*ABS( INCX ) ) $ = X( I ) 50 CONTINUE CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, Z, $ INCX, ZERO, X, INCX, XT, G, $ XX, EPS, ERR, FATAL, NOUT, $ .FALSE. ) END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 120 ELSE * Avoid repeating tests with N.le.0. GO TO 110 END IF * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, $ LDA, INCX ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, K, $ LDA, INCX ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, $ INCX END IF * 130 CONTINUE RETURN * 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT(1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, I3, ', AP, ', $ 'X,', I2, ') .' ) 9994 FORMAT(1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, 2( I3, ',' ), $ ' A,', I3, ', X,', I2, ') .' ) 9993 FORMAT( 1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, I3, ', A,', $ I3, ', X,', I2, ') .' ) 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK3. * END SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z, IORDER ) * * Tests ZGERC and ZGERU. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX*16 ZERO, HALF, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ HALF = ( 0.5D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, $ IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS, TRANSL DOUBLE PRECISION ERR, ERRMAX INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, $ NC, ND, NS LOGICAL CONJ, NULL, RESET, SAME * .. Local Arrays .. COMPLEX*16 W( 1 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL CZGERC, CZGERU, ZMAKE, ZMVCH * .. Intrinsic Functions .. INTRINSIC ABS, DCONJG, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Executable Statements .. CONJ = SNAME( 11: 11 ).EQ.'c' * Define the number of arguments. NARGS = 9 * NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 120 IN = 1, NIDIM N = IDIM( IN ) ND = N/2 + 1 * DO 110 IM = 1, 2 IF( IM.EQ.1 ) $ M = MAX( N - ND, 0 ) IF( IM.EQ.2 ) $ M = MIN( N + ND, NMAX ) * * Set LDA to 1 more than minimum value if room. LDA = M IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 110 LAA = LDA*N NULL = N.LE.0.OR.M.LE.0 * DO 100 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*M * * Generate the vector X. 
* TRANSL = HALF CALL ZMAKE( 'ge', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), $ 0, M - 1, RESET, TRANSL ) IF( M.GT.1 )THEN X( M/2 ) = ZERO XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO END IF * DO 90 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * * Generate the vector Y. * TRANSL = ZERO CALL ZMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN Y( N/2 ) = ZERO YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO END IF * DO 80 IA = 1, NALF ALPHA = ALF( IA ) * * Generate the matrix A. * TRANSL = ZERO CALL ZMAKE(SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * MS = M NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, $ ALPHA, INCX, INCY, LDA IF( CONJ )THEN IF( REWI ) $ REWIND NTRA CALL CZGERC( IORDER, M, N, ALPHA, XX, INCX, $ YY, INCY, AA, LDA ) ELSE IF( REWI ) $ REWIND NTRA CALL CZGERU( IORDER, M, N, ALPHA, XX, INCX, $ YY, INCY, AA, LDA ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 140 END IF * * See what data changed inside subroutine. * ISAME( 1 ) = MS.EQ.M ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LZE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX ISAME( 6 ) = LZE( YS, YY, LY ) ISAME( 7 ) = INCYS.EQ.INCY IF( NULL )THEN ISAME( 8 ) = LZE( AS, AA, LAA ) ELSE ISAME( 8 ) = LZERES( 'ge', ' ', M, N, AS, AA, $ LDA ) END IF ISAME( 9 ) = LDAS.EQ.LDA * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 140 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( INCX.GT.0 )THEN DO 50 I = 1, M Z( I ) = X( I ) 50 CONTINUE ELSE DO 60 I = 1, M Z( I ) = X( M - I + 1 ) 60 CONTINUE END IF DO 70 J = 1, N IF( INCY.GT.0 )THEN W( 1 ) = Y( J ) ELSE W( 1 ) = Y( N - J + 1 ) END IF IF( CONJ ) $ W( 1 ) = DCONJG( W( 1 ) ) CALL ZMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, $ ONE, A( 1, J ), 1, YT, G, $ AA( 1 + ( J - 1 )*LDA ), EPS, $ ERR, FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 130 70 CONTINUE ELSE * Avoid repeating tests with M.le.0 or N.le.0. GO TO 110 END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 150 * 130 CONTINUE WRITE( NOUT, FMT = 9995 )J * 140 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA * 150 CONTINUE RETURN * 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT(1X, I6, ': ',A12, '(', 2( I3, ',' ), '(', F4.1, ',', F4.1, $ '), X,', I2, ', Y,', I2, ', A,', I3, ') .' 
) 9993 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK4. * END SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z, IORDER ) * * Tests ZHER and ZHPR. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX*16 ZERO, HALF, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ HALF = ( 0.5D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, $ IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. COMPLEX*16 ALPHA, TRANSL DOUBLE PRECISION ERR, ERRMAX, RALPHA, RALS INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER CHARACTER*1 UPLO, UPLOS CHARACTER*14 CUPLO CHARACTER*2 ICH * .. Local Arrays .. COMPLEX*16 W( 1 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL CZHER, CZHPR, ZMAKE, ZMVCH * .. Intrinsic Functions .. INTRINSIC ABS, DCMPLX, DCONJG, MAX, DBLE * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 9: 9 ).EQ.'e' PACKED = SNAME( 9: 9 ).EQ.'p' * Define the number of arguments. IF( FULL )THEN NARGS = 7 ELSE IF( PACKED )THEN NARGS = 6 END IF * NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDA to 1 more than minimum value if room. LDA = N IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF * DO 90 IC = 1, 2 UPLO = ICH( IC: IC ) IF (UPLO.EQ.'U')THEN CUPLO = ' CblasUpper' ELSE CUPLO = ' CblasLower' END IF UPPER = UPLO.EQ.'U' * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL ZMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), $ 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 70 IA = 1, NALF RALPHA = DBLE( ALF( IA ) ) ALPHA = DCMPLX( RALPHA, RZERO ) NULL = N.LE.0.OR.RALPHA.EQ.RZERO * * Generate the matrix A. * TRANSL = ZERO CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO NS = N RALS = RALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX * * Call the subroutine. 
* IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, $ RALPHA, INCX, LDA IF( REWI ) $ REWIND NTRA CALL CZHER( IORDER, UPLO, N, RALPHA, XX, $ INCX, AA, LDA ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, $ RALPHA, INCX IF( REWI ) $ REWIND NTRA CALL CZHPR( IORDER, UPLO, N, RALPHA, $ XX, INCX, AA ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = RALS.EQ.RALPHA ISAME( 4 ) = LZE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX IF( NULL )THEN ISAME( 6 ) = LZE( AS, AA, LAA ) ELSE ISAME( 6 ) = LZERES( SNAME( 8: 9 ), UPLO, N, N, AS, $ AA, LDA ) END IF IF( .NOT.PACKED )THEN ISAME( 7 ) = LDAS.EQ.LDA END IF * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 30 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 30 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( INCX.GT.0 )THEN DO 40 I = 1, N Z( I ) = X( I ) 40 CONTINUE ELSE DO 50 I = 1, N Z( I ) = X( N - I + 1 ) 50 CONTINUE END IF JA = 1 DO 60 J = 1, N W( 1 ) = DCONJG( Z( J ) ) IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF CALL ZMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, $ 1, ONE, A( JJ, J ), 1, YT, G, $ AA( JA ), EPS, ERR, FATAL, NOUT, $ .TRUE. ) IF( FULL )THEN IF( UPPER )THEN JA = JA + LDA ELSE JA = JA + LDA + 1 END IF ELSE JA = JA + LJ END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 110 60 CONTINUE ELSE * Avoid repeating tests if N.le.0. IF( N.LE.0 ) $ GO TO 100 END IF * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 110 CONTINUE WRITE( NOUT, FMT = 9995 )J * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, RALPHA, INCX, LDA ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, RALPHA, INCX END IF * 130 CONTINUE RETURN * 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', $ I2, ', AP) .' ) 9993 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', $ I2, ', A,', I3, ') .' ) 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CZHK5. * END SUBROUTINE ZCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z, IORDER ) * * Tests ZHER2 and ZHPR2. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. 
COMPLEX*16 ZERO, HALF, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ HALF = ( 0.5D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, $ IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS, TRANSL DOUBLE PRECISION ERR, ERRMAX INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, $ NARGS, NC, NS LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER CHARACTER*1 UPLO, UPLOS CHARACTER*14 CUPLO CHARACTER*2 ICH * .. Local Arrays .. COMPLEX*16 W( 2 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL CZHER2, CZHPR2, ZMAKE, ZMVCH * .. Intrinsic Functions .. INTRINSIC ABS, DCONJG, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 9: 9 ).EQ.'e' PACKED = SNAME( 9: 9 ).EQ.'p' * Define the number of arguments. IF( FULL )THEN NARGS = 9 ELSE IF( PACKED )THEN NARGS = 8 END IF * NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 140 IN = 1, NIDIM N = IDIM( IN ) * Set LDA to 1 more than minimum value if room. LDA = N IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 140 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF * DO 130 IC = 1, 2 UPLO = ICH( IC: IC ) IF (UPLO.EQ.'U')THEN CUPLO = ' CblasUpper' ELSE CUPLO = ' CblasLower' END IF UPPER = UPLO.EQ.'U' * DO 120 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL ZMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), $ 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 110 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * * Generate the vector Y. * TRANSL = ZERO CALL ZMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN Y( N/2 ) = ZERO YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO END IF * DO 100 IA = 1, NALF ALPHA = ALF( IA ) NULL = N.LE.0.OR.ALPHA.EQ.ZERO * * Generate the matrix A. * TRANSL = ZERO CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, $ NMAX, AA, LDA, N - 1, N - 1, RESET, $ TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, $ ALPHA, INCX, INCY, LDA IF( REWI ) $ REWIND NTRA CALL CZHER2( IORDER, UPLO, N, ALPHA, XX, INCX, $ YY, INCY, AA, LDA ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, $ ALPHA, INCX, INCY IF( REWI ) $ REWIND NTRA CALL CZHPR2( IORDER, UPLO, N, ALPHA, XX, INCX, $ YY, INCY, AA ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 160 END IF * * See what data changed inside subroutines. 
* ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LZE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX ISAME( 6 ) = LZE( YS, YY, LY ) ISAME( 7 ) = INCYS.EQ.INCY IF( NULL )THEN ISAME( 8 ) = LZE( AS, AA, LAA ) ELSE ISAME( 8 ) = LZERES( SNAME( 8: 9 ), UPLO, N, N, $ AS, AA, LDA ) END IF IF( .NOT.PACKED )THEN ISAME( 9 ) = LDAS.EQ.LDA END IF * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 160 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( INCX.GT.0 )THEN DO 50 I = 1, N Z( I, 1 ) = X( I ) 50 CONTINUE ELSE DO 60 I = 1, N Z( I, 1 ) = X( N - I + 1 ) 60 CONTINUE END IF IF( INCY.GT.0 )THEN DO 70 I = 1, N Z( I, 2 ) = Y( I ) 70 CONTINUE ELSE DO 80 I = 1, N Z( I, 2 ) = Y( N - I + 1 ) 80 CONTINUE END IF JA = 1 DO 90 J = 1, N W( 1 ) = ALPHA*DCONJG( Z( J, 2 ) ) W( 2 ) = DCONJG( ALPHA )*DCONJG( Z( J, 1 ) ) IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF CALL ZMVCH( 'N', LJ, 2, ONE, Z( JJ, 1 ), $ NMAX, W, 1, ONE, A( JJ, J ), 1, $ YT, G, AA( JA ), EPS, ERR, FATAL, $ NOUT, .TRUE. ) IF( FULL )THEN IF( UPPER )THEN JA = JA + LDA ELSE JA = JA + LDA + 1 END IF ELSE JA = JA + LJ END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 150 90 CONTINUE ELSE * Avoid repeating tests with N.le.0. IF( N.LE.0 ) $ GO TO 140 END IF * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * 140 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 170 * 150 CONTINUE WRITE( NOUT, FMT = 9995 )J * 160 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, $ INCY, LDA ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX, INCY END IF * 170 CONTINUE RETURN * 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', $ F4.1, '), X,', I2, ', Y,', I2, ', AP) .' ) 9993 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', $ F4.1, '), X,', I2, ', Y,', I2, ', A,', I3, ') .' ) 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK6. * END SUBROUTINE ZMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) * * Checks the results of the computational tests. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO, RONE PARAMETER ( RZERO = 0.0D0, RONE = 1.0D0 ) * .. Scalar Arguments .. COMPLEX*16 ALPHA, BETA DOUBLE PRECISION EPS, ERR INTEGER INCX, INCY, M, N, NMAX, NOUT LOGICAL FATAL, MV CHARACTER*1 TRANS * .. Array Arguments .. COMPLEX*16 A( NMAX, * ), X( * ), Y( * ), YT( * ), YY( * ) DOUBLE PRECISION G( * ) * .. 
Local Scalars .. COMPLEX*16 C DOUBLE PRECISION ERRI INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL LOGICAL CTRAN, TRAN * .. Intrinsic Functions .. INTRINSIC ABS, DIMAG, DCONJG, MAX, DBLE, SQRT * .. Statement Functions .. DOUBLE PRECISION ABS1 * .. Statement Function definitions .. ABS1( C ) = ABS( DBLE( C ) ) + ABS( DIMAG( C ) ) * .. Executable Statements .. TRAN = TRANS.EQ.'T' CTRAN = TRANS.EQ.'C' IF( TRAN.OR.CTRAN )THEN ML = N NL = M ELSE ML = M NL = N END IF IF( INCX.LT.0 )THEN KX = NL INCXL = -1 ELSE KX = 1 INCXL = 1 END IF IF( INCY.LT.0 )THEN KY = ML INCYL = -1 ELSE KY = 1 INCYL = 1 END IF * * Compute expected result in YT using data in A, X and Y. * Compute gauges in G. * IY = KY DO 40 I = 1, ML YT( IY ) = ZERO G( IY ) = RZERO JX = KX IF( TRAN )THEN DO 10 J = 1, NL YT( IY ) = YT( IY ) + A( J, I )*X( JX ) G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) JX = JX + INCXL 10 CONTINUE ELSE IF( CTRAN )THEN DO 20 J = 1, NL YT( IY ) = YT( IY ) + DCONJG( A( J, I ) )*X( JX ) G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) JX = JX + INCXL 20 CONTINUE ELSE DO 30 J = 1, NL YT( IY ) = YT( IY ) + A( I, J )*X( JX ) G( IY ) = G( IY ) + ABS1( A( I, J ) )*ABS1( X( JX ) ) JX = JX + INCXL 30 CONTINUE END IF YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) G( IY ) = ABS1( ALPHA )*G( IY ) + ABS1( BETA )*ABS1( Y( IY ) ) IY = IY + INCYL 40 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO DO 50 I = 1, ML ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS IF( G( I ).NE.RZERO ) $ ERRI = ERRI/G( I ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ GO TO 60 50 CONTINUE * If the loop completes, all results are at least half accurate. GO TO 80 * * Report fatal error. * 60 FATAL = .TRUE. WRITE( NOUT, FMT = 9999 ) DO 70 I = 1, ML IF( MV )THEN WRITE( NOUT, FMT = 9998 )I, YT( I ), $ YY( 1 + ( I - 1 )*ABS( INCY ) ) ELSE WRITE( NOUT, FMT = 9998 )I, $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT( I ) END IF 70 CONTINUE * 80 CONTINUE RETURN * 9999 FORMAT(' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', $ 'F ACCURATE *******', /' EXPECTED RE', $ 'SULT COMPUTED RESULT' ) 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) * * End of ZMVCH. * END LOGICAL FUNCTION LZE( RI, RJ, LR ) * * Tests if two arrays are identical. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER LR * .. Array Arguments .. COMPLEX*16 RI( * ), RJ( * ) * .. Local Scalars .. INTEGER I * .. Executable Statements .. DO 10 I = 1, LR IF( RI( I ).NE.RJ( I ) ) $ GO TO 20 10 CONTINUE LZE = .TRUE. GO TO 30 20 CONTINUE LZE = .FALSE. 30 RETURN * * End of LZE. * END LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA ) * * Tests if selected elements in two arrays are equal. * * TYPE is 'ge', 'he' or 'hp'. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER LDA, M, N CHARACTER*1 UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX*16 AA( LDA, * ), AS( LDA, * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL UPPER * .. Executable Statements .. 
UPPER = UPLO.EQ.'U' IF( TYPE.EQ.'ge' )THEN DO 20 J = 1, N DO 10 I = M + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 10 CONTINUE 20 CONTINUE ELSE IF( TYPE.EQ.'he' )THEN DO 50 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 30 I = 1, IBEG - 1 IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 30 CONTINUE DO 40 I = IEND + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 40 CONTINUE 50 CONTINUE END IF * 60 CONTINUE LZERES = .TRUE. GO TO 80 70 CONTINUE LZERES = .FALSE. 80 RETURN * * End of LZERES. * END COMPLEX*16 FUNCTION ZBEG( RESET ) * * Generates complex numbers as pairs of random numbers uniformly * distributed between -0.5 and 0.5. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. LOGICAL RESET * .. Local Scalars .. INTEGER I, IC, J, MI, MJ * .. Save statement .. SAVE I, IC, J, MI, MJ * .. Intrinsic Functions .. INTRINSIC DCMPLX * .. Executable Statements .. IF( RESET )THEN * Initialize local variables. MI = 891 MJ = 457 I = 7 J = 7 IC = 0 RESET = .FALSE. END IF * * The sequence of values of I or J is bounded between 1 and 999. * If initial I or J = 1,2,3,6,7 or 9, the period will be 50. * If initial I or J = 4 or 8, the period will be 25. * If initial I or J = 5, the period will be 10. * IC is used to break up the period by skipping 1 value of I or J * in 6. * IC = IC + 1 10 I = I*MI J = J*MJ I = I - 1000*( I/1000 ) J = J - 1000*( J/1000 ) IF( IC.GE.5 )THEN IC = 0 GO TO 10 END IF ZBEG = DCMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) RETURN * * End of ZBEG. * END DOUBLE PRECISION FUNCTION DDIFF( X, Y ) * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * * .. Scalar Arguments .. DOUBLE PRECISION X, Y * .. Executable Statements .. DDIFF = X - Y RETURN * * End of DDIFF. * END SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, $ KU, RESET, TRANSL ) * * Generates values for an M by N matrix A within the bandwidth * defined by KL and KU. * Stores the values in the array AA in the data structure required * by the routine, with unwanted elements set to rogue value. * * TYPE is 'ge', 'gb', 'he', 'hb', 'hp', 'tr', 'tb' OR 'tp'. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) COMPLEX*16 ROGUE PARAMETER ( ROGUE = ( -1.0D10, 1.0D10 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) DOUBLE PRECISION RROGUE PARAMETER ( RROGUE = -1.0D10 ) * .. Scalar Arguments .. COMPLEX*16 TRANSL INTEGER KL, KU, LDA, M, N, NMAX LOGICAL RESET CHARACTER*1 DIAG, UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX*16 A( NMAX, * ), AA( * ) * .. Local Scalars .. INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, JJ, KK LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER * .. External Functions .. COMPLEX*16 ZBEG EXTERNAL ZBEG * .. Intrinsic Functions .. INTRINSIC DCMPLX, DCONJG, MAX, MIN, DBLE * .. Executable Statements .. GEN = TYPE( 1: 1 ).EQ.'g' SYM = TYPE( 1: 1 ).EQ.'h' TRI = TYPE( 1: 1 ).EQ.'t' UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' UNIT = TRI.AND.DIAG.EQ.'U' * * Generate data in array A. 
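*
*     Entries within the band of KL sub-diagonals and KU super-diagonals
*     receive random values from ZBEG shifted by TRANSL; entries outside
*     the band are set to zero.  Hermitian types are mirrored with
*     DCONJG and given a real diagonal, triangular types get a zero
*     opposite triangle, and unit triangular types get ones on the
*     diagonal.
*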
* DO 20 J = 1, N DO 10 I = 1, M IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) $ THEN IF( ( I.LE.J.AND.J - I.LE.KU ).OR. $ ( I.GE.J.AND.I - J.LE.KL ) )THEN A( I, J ) = ZBEG( RESET ) + TRANSL ELSE A( I, J ) = ZERO END IF IF( I.NE.J )THEN IF( SYM )THEN A( J, I ) = DCONJG( A( I, J ) ) ELSE IF( TRI )THEN A( J, I ) = ZERO END IF END IF END IF 10 CONTINUE IF( SYM ) $ A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO ) IF( TRI ) $ A( J, J ) = A( J, J ) + ONE IF( UNIT ) $ A( J, J ) = ONE 20 CONTINUE * * Store elements in array AS in data structure required by routine. * IF( TYPE.EQ.'ge' )THEN DO 50 J = 1, N DO 30 I = 1, M AA( I + ( J - 1 )*LDA ) = A( I, J ) 30 CONTINUE DO 40 I = M + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 40 CONTINUE 50 CONTINUE ELSE IF( TYPE.EQ.'gb' )THEN DO 90 J = 1, N DO 60 I1 = 1, KU + 1 - J AA( I1 + ( J - 1 )*LDA ) = ROGUE 60 CONTINUE DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) 70 CONTINUE DO 80 I3 = I2, LDA AA( I3 + ( J - 1 )*LDA ) = ROGUE 80 CONTINUE 90 CONTINUE ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'tr' )THEN DO 130 J = 1, N IF( UPPER )THEN IBEG = 1 IF( UNIT )THEN IEND = J - 1 ELSE IEND = J END IF ELSE IF( UNIT )THEN IBEG = J + 1 ELSE IBEG = J END IF IEND = N END IF DO 100 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 100 CONTINUE DO 110 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I, J ) 110 CONTINUE DO 120 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 120 CONTINUE IF( SYM )THEN JJ = J + ( J - 1 )*LDA AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) END IF 130 CONTINUE ELSE IF( TYPE.EQ.'hb'.OR.TYPE.EQ.'tb' )THEN DO 170 J = 1, N IF( UPPER )THEN KK = KL + 1 IBEG = MAX( 1, KL + 2 - J ) IF( UNIT )THEN IEND = KL ELSE IEND = KL + 1 END IF ELSE KK = 1 IF( UNIT )THEN IBEG = 2 ELSE IBEG = 1 END IF IEND = MIN( KL + 1, 1 + M - J ) END IF DO 140 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 140 CONTINUE DO 150 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) 150 CONTINUE DO 160 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 160 CONTINUE IF( SYM )THEN JJ = KK + ( J - 1 )*LDA AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) END IF 170 CONTINUE ELSE IF( TYPE.EQ.'hp'.OR.TYPE.EQ.'tp' )THEN IOFF = 0 DO 190 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 180 I = IBEG, IEND IOFF = IOFF + 1 AA( IOFF ) = A( I, J ) IF( I.EQ.J )THEN IF( UNIT ) $ AA( IOFF ) = ROGUE IF( SYM ) $ AA( IOFF ) = DCMPLX( DBLE( AA( IOFF ) ), RROGUE ) END IF 180 CONTINUE 190 CONTINUE END IF RETURN * * End of ZMAKE. * END OpenBLAS-0.2.20/ctest/c_zblat3.f000066400000000000000000003042061313527062700161430ustar00rootroot00000000000000 PROGRAM ZBLAT3 * * Test program for the COMPLEX*16 Level 3 Blas. * * The program must be driven by a short data file. The first 13 records * of the file are read using list-directed input, the last 9 records * are read using the format ( A12,L2 ). An annotated example of a data * file can be obtained by deleting the first 3 characters from the * following 22 lines: * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. 
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO * 6 NUMBER OF VALUES OF N * 0 1 2 3 5 9 VALUES OF N * 3 NUMBER OF VALUES OF ALPHA * (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA * 3 NUMBER OF VALUES OF BETA * (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA * ZGEMM T PUT F FOR NO TEST. SAME COLUMNS. * ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. * ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. * ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. * ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. * ZHERK T PUT F FOR NO TEST. SAME COLUMNS. * ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. * ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. * ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. * * See: * * Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. * A Set of Level 3 Basic Linear Algebra Subprograms. * * Technical Memorandum No.88 (Revision 1), Mathematics and * Computer Science Division, Argonne National Laboratory, 9700 * South Cass Avenue, Argonne, Illinois 60439, US. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. INTEGER NIN, NOUT PARAMETER ( NIN = 5, NOUT = 6 ) INTEGER NSUBS PARAMETER ( NSUBS = 9 ) COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO, RHALF, RONE PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. DOUBLE PRECISION EPS, ERR, THRESH INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA, $ LAYOUT LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, $ TSTERR, CORDER, RORDER CHARACTER*1 TRANSA, TRANSB CHARACTER*12 SNAMET CHARACTER*32 SNAPS * .. Local Arrays .. COMPLEX*16 AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), $ ALF( NALMAX ), AS( NMAX*NMAX ), $ BB( NMAX*NMAX ), BET( NBEMAX ), $ BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ W( 2*NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDMAX ) LOGICAL LTEST( NSUBS ) CHARACTER*12 SNAMES( NSUBS ) * .. External Functions .. DOUBLE PRECISION DDIFF LOGICAL LZE EXTERNAL DDIFF, LZE * .. External Subroutines .. EXTERNAL ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5,ZMMCH * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK CHARACTER*12 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR COMMON /SRNAMC/SRNAMT * .. Data statements .. DATA SNAMES/'cblas_zgemm ', 'cblas_zhemm ', $ 'cblas_zsymm ', 'cblas_ztrmm ', 'cblas_ztrsm ', $ 'cblas_zherk ', 'cblas_zsyrk ', 'cblas_zher2k', $ 'cblas_zsyr2k'/ * .. Executable Statements .. * NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. * READ( NIN, FMT = * )SNAPS READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI REWI = REWI.AND.TRACE * Read the flag that directs stopping on any failure. READ( NIN, FMT = * )SFATAL * Read the flag that indicates whether error exits are to be tested. READ( NIN, FMT = * )TSTERR * Read the flag that indicates whether row-major data layout to be tested. READ( NIN, FMT = * )LAYOUT * Read the threshold value of the test ratio READ( NIN, FMT = * )THRESH * * Read and check the parameter values for the tests. 
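*
*     The counts NIDIM, NALF and NBET are validated against the array
*     limits NIDMAX, NALMAX and NBEMAX, and each value of N is checked
*     against 0 and NMAX, before any testing starts.
*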
* * Values of N READ( NIN, FMT = * )NIDIM IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN WRITE( NOUT, FMT = 9997 )'N', NIDMAX GO TO 220 END IF READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) DO 10 I = 1, NIDIM IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN WRITE( NOUT, FMT = 9996 )NMAX GO TO 220 END IF 10 CONTINUE * Values of ALPHA READ( NIN, FMT = * )NALF IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX GO TO 220 END IF READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) * Values of BETA READ( NIN, FMT = * )NBET IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX GO TO 220 END IF READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) * * Report values of parameters. * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) IF( .NOT.TSTERR )THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9984 ) END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )THRESH WRITE( NOUT, FMT = * ) RORDER = .FALSE. CORDER = .FALSE. IF (LAYOUT.EQ.2) THEN RORDER = .TRUE. CORDER = .TRUE. WRITE( *, FMT = 10002 ) ELSE IF (LAYOUT.EQ.1) THEN RORDER = .TRUE. WRITE( *, FMT = 10001 ) ELSE IF (LAYOUT.EQ.0) THEN CORDER = .TRUE. WRITE( *, FMT = 10000 ) END IF WRITE( *, FMT = * ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 20 I = 1, NSUBS LTEST( I ) = .FALSE. 20 CONTINUE 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT DO 40 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET STOP 50 LTEST( I ) = LTESTT GO TO 30 * 60 CONTINUE CLOSE ( NIN ) * * Compute EPS (the machine precision). * EPS = RONE 70 CONTINUE IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) $ GO TO 80 EPS = RHALF*EPS GO TO 70 80 CONTINUE EPS = EPS + EPS WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of ZMMCH using exact data. * N = MIN( 32, NMAX ) DO 100 J = 1, N DO 90 I = 1, N AB( I, J ) = MAX( I - J + 1, 0 ) 90 CONTINUE AB( J, NMAX + 1 ) = J AB( 1, NMAX + J ) = J C( J, 1 ) = ZERO 100 CONTINUE DO 110 J = 1, N CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 110 CONTINUE * CC holds the exact result. On exit from ZMMCH CT holds * the result computed by ZMMCH. TRANSA = 'N' TRANSB = 'N' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 AB( 1, NMAX + J ) = N - J + 1 120 CONTINUE DO 130 J = 1, N CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - $ ( ( J + 1 )*J*( J - 1 ) )/3 130 CONTINUE TRANSA = 'C' TRANSB = 'N' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. 
) SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF * * Test each subroutine in turn. * DO 200 ISNUM = 1, NSUBS WRITE( NOUT, FMT = * ) IF( .NOT.LTEST( ISNUM ) )THEN * Subprogram is not to be tested. WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) ELSE SRNAMT = SNAMES( ISNUM ) * Test error exits. IF( TSTERR )THEN CALL CZ3CHKE( SNAMES( ISNUM ) ) WRITE( NOUT, FMT = * ) END IF * Test computations. INFOT = 0 OK = .TRUE. FATAL = .FALSE. GO TO ( 140, 150, 150, 160, 160, 170, 170, $ 180, 180 )ISNUM * Test ZGEMM, 01. 140 IF (CORDER) THEN CALL ZCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 0 ) END IF IF (RORDER) THEN CALL ZCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 1 ) END IF GO TO 190 * Test ZHEMM, 02, ZSYMM, 03. 150 IF (CORDER) THEN CALL ZCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 0 ) END IF IF (RORDER) THEN CALL ZCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 1 ) END IF GO TO 190 * Test ZTRMM, 04, ZTRSM, 05. 160 IF (CORDER) THEN CALL ZCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, $ 0 ) END IF IF (RORDER) THEN CALL ZCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, $ 1 ) END IF GO TO 190 * Test ZHERK, 06, ZSYRK, 07. 170 IF (CORDER) THEN CALL ZCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 0 ) END IF IF (RORDER) THEN CALL ZCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 1 ) END IF GO TO 190 * Test ZHER2K, 08, ZSYR2K, 09. 
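*
*     As for the other checkers above, ZCHK5 is called with IORDER = 0
*     for the column-major tests (CORDER) and with IORDER = 1 for the
*     row-major tests (RORDER), as selected by the LAYOUT record of the
*     data file.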
180 IF (CORDER) THEN CALL ZCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, $ 0 ) END IF IF (RORDER) THEN CALL ZCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, $ 1 ) END IF GO TO 190 * 190 IF( FATAL.AND.SFATAL ) $ GO TO 210 END IF 200 CONTINUE WRITE( NOUT, FMT = 9986 ) GO TO 230 * 210 CONTINUE WRITE( NOUT, FMT = 9985 ) GO TO 230 * 220 CONTINUE WRITE( NOUT, FMT = 9991 ) * 230 CONTINUE IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) STOP * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) 10000 FORMAT(' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', $ 'S THAN', F8.2 ) 9998 FORMAT(' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) 9997 FORMAT(' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', $ 'THAN ', I2 ) 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) 9995 FORMAT('TESTS OF THE COMPLEX*16 LEVEL 3 BLAS', //' THE F', $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) 9994 FORMAT( ' FOR N ', 9I6 ) 9993 FORMAT( ' FOR ALPHA ', $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) 9992 FORMAT( ' FOR BETA ', $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', $ /' ******* TESTS ABANDONED *******' ) 9990 FORMAT(' SUBPROGRAM NAME ', A12,' NOT RECOGNIZED', /' ******* T', $ 'ESTS ABANDONED *******' ) 9989 FORMAT(' ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', $ 'ATED WRONGLY.', /' ZMMCH WAS CALLED WITH TRANSA = ', A1, $ 'AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', $ ' ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', $ '*******' ) 9988 FORMAT( A12,L2 ) 9987 FORMAT( 1X, A12,' WAS NOT TESTED' ) 9986 FORMAT( /' END OF TESTS' ) 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) * * End of ZBLAT3. * END SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, $ IORDER ) * * Tests ZGEMM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0, 0.0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS, BETA, BLS DOUBLE PRECISION ERR, ERRMAX INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, $ MA, MB, MS, N, NA, NARGS, NB, NC, NS LOGICAL NULL, RESET, SAME, TRANA, TRANB CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB CHARACTER*3 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. 
External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL CZGEMM, ZMAKE, ZMMCH * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'NTC'/ * .. Executable Statements .. * NARGS = 13 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 110 IM = 1, NIDIM M = IDIM( IM ) * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICA = 1, 3 TRANSA = ICH( ICA: ICA ) TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' * IF( TRANA )THEN MA = K NA = M ELSE MA = M NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. * CALL ZMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICB = 1, 3 TRANSB = ICH( ICB: ICB ) TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' * IF( TRANB )THEN MB = N NB = K ELSE MB = K NB = N END IF * Set LDB to 1 more than minimum value if room. LDB = MB IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 70 LBB = LDB*NB * * Generate the matrix B. * CALL ZMAKE( 'ge', ' ', ' ', MB, NB, B, NMAX, BB, $ LDB, RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL ZMAKE( 'ge', ' ', ' ', M, N, C, NMAX, $ CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * TRANAS = TRANSA TRANBS = TRANSB MS = M NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ CALL ZPRCN1(NTRA, NC, SNAME, IORDER, $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, $ LDB, BETA, LDC) IF( REWI ) $ REWIND NTRA CALL CZGEMM( IORDER, TRANSA, TRANSB, M, N, $ K, ALPHA, AA, LDA, BB, LDB, $ BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = TRANSA.EQ.TRANAS ISAME( 2 ) = TRANSB.EQ.TRANBS ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = KS.EQ.K ISAME( 6 ) = ALS.EQ.ALPHA ISAME( 7 ) = LZE( AS, AA, LAA ) ISAME( 8 ) = LDAS.EQ.LDA ISAME( 9 ) = LZE( BS, BB, LBB ) ISAME( 10 ) = LDBS.EQ.LDB ISAME( 11 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 12 ) = LZE( CS, CC, LCC ) ELSE ISAME( 12 ) = LZERES( 'ge', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 13 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report * and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL ZMMCH( TRANSA, TRANSB, M, N, K, $ ALPHA, A, NMAX, B, NMAX, BETA, $ C, NMAX, CT, G, CC, LDC, EPS, $ ERR, FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 120 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. 
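*
*     ERRMAX is the largest test ratio returned by ZMMCH over all NC
*     calls; the routine passes only if it stays below THRESH, otherwise
*     the run is reported as suspect together with the maximum ratio.
*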
* IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME CALL ZPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) * 130 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK1. * END * SUBROUTINE ZPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N, $ K, ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC DOUBLE COMPLEX ALPHA, BETA CHARACTER*1 TRANSA, TRANSB CHARACTER*12 SNAME CHARACTER*14 CRC, CTA,CTB IF (TRANSA.EQ.'N')THEN CTA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CTA = ' CblasTrans' ELSE CTA = 'CblasConjTrans' END IF IF (TRANSB.EQ.'N')THEN CTB = ' CblasNoTrans' ELSE IF (TRANSB.EQ.'T')THEN CTB = ' CblasTrans' ELSE CTB = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') 9994 FORMAT( 10X, 3( I3, ',' ) ,' (', F4.1,',',F4.1,') , A,', $ I3, ', B,', I3, ', (', F4.1,',',F4.1,') , C,', I3, ').' ) END * SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, $ IORDER ) * * Tests ZHEMM and ZSYMM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. 
COMPLEX*16 ALPHA, ALS, BETA, BLS DOUBLE PRECISION ERR, ERRMAX INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, $ NARGS, NC, NS LOGICAL CONJ, LEFT, NULL, RESET, SAME CHARACTER*1 SIDE, SIDES, UPLO, UPLOS CHARACTER*2 ICHS, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL CZHEMM, ZMAKE, ZMMCH, CZSYMM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHS/'LR'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 8: 9 ).EQ.'he' * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 100 IM = 1, NIDIM M = IDIM( IM ) * DO 90 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 90 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 90 LBB = LDB*N * * Generate the matrix B. * CALL ZMAKE( 'ge', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, $ ZERO ) * DO 80 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' * IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * * Generate the hermitian or symmetric matrix A. * CALL ZMAKE(SNAME( 8: 9 ), UPLO, ' ', NA, NA, A, NMAX, $ AA, LDA, RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL ZMAKE( 'ge', ' ', ' ', M, N, C, NMAX, CC, $ LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO MS = M NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ CALL ZPRCN2(NTRA, NC, SNAME, IORDER, $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, $ BETA, LDC) IF( REWI ) $ REWIND NTRA IF( CONJ )THEN CALL CZHEMM( IORDER, SIDE, UPLO, M, N, $ ALPHA, AA, LDA, BB, LDB, BETA, $ CC, LDC ) ELSE CALL CZSYMM( IORDER, SIDE, UPLO, M, N, $ ALPHA, AA, LDA, BB, LDB, BETA, $ CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 110 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LZE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LZE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB ISAME( 10 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 11 ) = LZE( CS, CC, LCC ) ELSE ISAME( 11 ) = LZERES( 'ge', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 110 END IF * IF( .NOT.NULL )THEN * * Check the result. 
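*
*     The HEMM/SYMM result is checked as a general product with ZMMCH:
*     alpha*A*B + beta*C (inner dimension M) when SIDE is 'L', and
*     alpha*B*A + beta*C (inner dimension N) when SIDE is 'R'.
*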
* IF( LEFT )THEN CALL ZMMCH( 'N', 'N', M, N, M, ALPHA, A, $ NMAX, B, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL ZMMCH( 'N', 'N', M, N, N, ALPHA, B, $ NMAX, A, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 120 * 110 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME CALL ZPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, $ LDB, BETA, LDC) * 120 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) 9995 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, $ ',', F4.1, '), C,', I3, ') .' ) 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK2. * END * SUBROUTINE ZPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, $ ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC DOUBLE COMPLEX ALPHA, BETA CHARACTER*1 SIDE, UPLO CHARACTER*12 SNAME CHARACTER*14 CRC, CS,CU IF (SIDE.EQ.'L')THEN CS = ' CblasLeft' ELSE CS = ' CblasRight' END IF IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') 9994 FORMAT( 10X, 2( I3, ',' ),' (',F4.1,',',F4.1, '), A,', I3, $ ', B,', I3, ', (',F4.1,',',F4.1, '), ', 'C,', I3, ').' ) END * SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, $ B, BB, BS, CT, G, C, IORDER ) * * Tests ZTRMM and ZTRSM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. 
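*
*     ZTRMM and ZTRSM overwrite B in place, so there are no BETA or CC
*     arguments here; C is used only as a zero work matrix for ZMMCH,
*     and a ZTRSM result is verified by multiplying the computed
*     solution back by the triangular factor and comparing against
*     ALPHA*B.
*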
COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CT( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS DOUBLE PRECISION ERR, ERRMAX INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, $ NS LOGICAL LEFT, NULL, RESET, SAME CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, $ UPLOS CHARACTER*2 ICHD, ICHS, ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL ZMAKE, ZMMCH, CZTRMM, CZTRSM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ * .. Executable Statements .. * NARGS = 11 NC = 0 RESET = .TRUE. ERRMAX = RZERO * Set up zero matrix for ZMMCH. DO 20 J = 1, NMAX DO 10 I = 1, NMAX C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE * DO 140 IM = 1, NIDIM M = IDIM( IM ) * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 130 LBB = LDB*N NULL = M.LE.0.OR.N.LE.0 * DO 120 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 130 LAA = LDA*NA * DO 110 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * DO 100 ICT = 1, 3 TRANSA = ICHT( ICT: ICT ) * DO 90 ICD = 1, 2 DIAG = ICHD( ICD: ICD ) * DO 80 IA = 1, NALF ALPHA = ALF( IA ) * * Generate the matrix A. * CALL ZMAKE( 'tr', UPLO, DIAG, NA, NA, A, $ NMAX, AA, LDA, RESET, ZERO ) * * Generate the matrix B. * CALL ZMAKE( 'ge', ' ', ' ', M, N, B, NMAX, $ BB, LDB, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO TRANAS = TRANSA DIAGS = DIAG MS = M NS = N ALS = ALPHA DO 30 I = 1, LAA AS( I ) = AA( I ) 30 CONTINUE LDAS = LDA DO 40 I = 1, LBB BS( I ) = BB( I ) 40 CONTINUE LDBS = LDB * * Call the subroutine. * IF( SNAME( 10: 11 ).EQ.'mm' )THEN IF( TRACE ) $ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB) IF( REWI ) $ REWIND NTRA CALL CZTRMM(IORDER, SIDE, UPLO, TRANSA, $ DIAG, M, N, ALPHA, AA, LDA, $ BB, LDB ) ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN IF( TRACE ) $ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB) IF( REWI ) $ REWIND NTRA CALL CZTRSM(IORDER, SIDE, UPLO, TRANSA, $ DIAG, M, N, ALPHA, AA, LDA, $ BB, LDB ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = TRANAS.EQ.TRANSA ISAME( 4 ) = DIAGS.EQ.DIAG ISAME( 5 ) = MS.EQ.M ISAME( 6 ) = NS.EQ.N ISAME( 7 ) = ALS.EQ.ALPHA ISAME( 8 ) = LZE( AS, AA, LAA ) ISAME( 9 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 10 ) = LZE( BS, BB, LBB ) ELSE ISAME( 10 ) = LZERES( 'ge', ' ', M, N, BS, $ BB, LDB ) END IF ISAME( 11 ) = LDBS.EQ.LDB * * If data was incorrectly changed, report and * return. * SAME = .TRUE. 
DO 50 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 50 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN IF( SNAME( 10: 11 ).EQ.'mm' )THEN * * Check the result. * IF( LEFT )THEN CALL ZMMCH( TRANSA, 'N', M, N, M, $ ALPHA, A, NMAX, B, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL ZMMCH( 'N', TRANSA, M, N, N, $ ALPHA, B, NMAX, A, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN * * Compute approximation to original * matrix. * DO 70 J = 1, N DO 60 I = 1, M C( I, J ) = BB( I + ( J - 1 )* $ LDB ) BB( I + ( J - 1 )*LDB ) = ALPHA* $ B( I, J ) 60 CONTINUE 70 CONTINUE * IF( LEFT )THEN CALL ZMMCH( TRANSA, 'N', M, N, M, $ ONE, A, NMAX, C, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) ELSE CALL ZMMCH( 'N', TRANSA, M, N, N, $ ONE, C, NMAX, A, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) END IF END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 150 END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * 140 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 160 * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( TRACE ) $ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, $ M, N, ALPHA, LDA, LDB) * 160 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT(' ******* ', A12,' FAILED ON CALL NUMBER:' ) 9995 FORMAT(1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', $ ' .' ) 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK3. 
* END * SUBROUTINE ZPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, $ DIAG, M, N, ALPHA, LDA, LDB) INTEGER NOUT, NC, IORDER, M, N, LDA, LDB DOUBLE COMPLEX ALPHA CHARACTER*1 SIDE, UPLO, TRANSA, DIAG CHARACTER*12 SNAME CHARACTER*14 CRC, CS, CU, CA, CD IF (SIDE.EQ.'L')THEN CS = ' CblasLeft' ELSE CS = ' CblasRight' END IF IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (DIAG.EQ.'N')THEN CD = ' CblasNonUnit' ELSE CD = ' CblasUnit' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') 9994 FORMAT( 10X, 2( A14, ',') , 2( I3, ',' ), ' (', F4.1, ',', $ F4.1, '), A,', I3, ', B,', I3, ').' ) END * SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, $ IORDER ) * * Tests ZHERK and ZSYRK. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) DOUBLE PRECISION RONE, RZERO PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS, BETA, BETS DOUBLE PRECISION ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, $ NARGS, NC, NS LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS CHARACTER*2 ICHT, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL CZHERK, ZMAKE, ZMMCH, CZSYRK * .. Intrinsic Functions .. INTRINSIC DCMPLX, MAX, DBLE * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHT/'NC'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 8: 9 ).EQ.'he' * NARGS = 10 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICT = 1, 2 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'C' IF( TRAN.AND..NOT.CONJ ) $ TRANS = 'T' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. 
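*
*     A is generated as a general MA by NA matrix: MA = K, NA = N when
*     TRANS is 'C' (or 'T' for ZSYRK), so that the update uses A**H*A
*     or A**T*A, and MA = N, NA = K when TRANS is 'N'.
*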
* CALL ZMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 60 IA = 1, NALF ALPHA = ALF( IA ) IF( CONJ )THEN RALPHA = DBLE( ALPHA ) ALPHA = DCMPLX( RALPHA, RZERO ) END IF * DO 50 IB = 1, NBET BETA = BET( IB ) IF( CONJ )THEN RBETA = DBLE( BETA ) BETA = DCMPLX( RBETA, RZERO ) END IF NULL = N.LE.0 IF( CONJ ) $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. $ RZERO ).AND.RBETA.EQ.RONE ) * * Generate the matrix C. * CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, $ NMAX, CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K IF( CONJ )THEN RALS = RALPHA ELSE ALS = ALPHA END IF DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA IF( CONJ )THEN RBETS = RBETA ELSE BETS = BETA END IF DO 20 I = 1, LCC CS( I ) = CC( I ) 20 CONTINUE LDCS = LDC * * Call the subroutine. * IF( CONJ )THEN IF( TRACE ) $ CALL ZPRCN6( NTRA, NC, SNAME, IORDER, $ UPLO, TRANS, N, K, RALPHA, LDA, RBETA, $ LDC) IF( REWI ) $ REWIND NTRA CALL CZHERK( IORDER, UPLO, TRANS, N, K, $ RALPHA, AA, LDA, RBETA, CC, $ LDC ) ELSE IF( TRACE ) $ CALL ZPRCN4( NTRA, NC, SNAME, IORDER, $ UPLO, TRANS, N, K, ALPHA, LDA, BETA, LDC) IF( REWI ) $ REWIND NTRA CALL CZSYRK( IORDER, UPLO, TRANS, N, K, $ ALPHA, AA, LDA, BETA, CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K IF( CONJ )THEN ISAME( 5 ) = RALS.EQ.RALPHA ELSE ISAME( 5 ) = ALS.EQ.ALPHA END IF ISAME( 6 ) = LZE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA IF( CONJ )THEN ISAME( 8 ) = RBETS.EQ.RBETA ELSE ISAME( 8 ) = BETS.EQ.BETA END IF IF( NULL )THEN ISAME( 9 ) = LZE( CS, CC, LCC ) ELSE ISAME( 9 ) = LZERES( SNAME( 8: 9 ), UPLO, N, $ N, CS, CC, LDC ) END IF ISAME( 10 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 30 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 30 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( CONJ )THEN TRANST = 'C' ELSE TRANST = 'T' END IF JC = 1 DO 40 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN CALL ZMMCH( TRANST, 'N', LJ, 1, K, $ ALPHA, A( 1, JJ ), NMAX, $ A( 1, J ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL ZMMCH( 'N', TRANST, LJ, 1, K, $ ALPHA, A( JJ, 1 ), NMAX, $ A( J, 1 ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 40 CONTINUE END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 130 * 110 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( CONJ )THEN CALL ZPRCN6( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, RALPHA, $ LDA, rBETA, LDC) ELSE CALL ZPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, $ LDA, BETA, LDC) END IF * 130 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', $ ' .' ) 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, $ '), C,', I3, ') .' ) 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK4. * END * SUBROUTINE ZPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, $ N, K, ALPHA, LDA, BETA, LDC) INTEGER NOUT, NC, IORDER, N, K, LDA, LDC DOUBLE COMPLEX ALPHA, BETA CHARACTER*1 UPLO, TRANSA CHARACTER*12 SNAME CHARACTER*14 CRC, CU, CA IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1 ,'), A,', $ I3, ', (', F4.1,',', F4.1, '), C,', I3, ').' ) END * * SUBROUTINE ZPRCN6(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, $ N, K, ALPHA, LDA, BETA, LDC) INTEGER NOUT, NC, IORDER, N, K, LDA, LDC DOUBLE PRECISION ALPHA, BETA CHARACTER*1 UPLO, TRANSA CHARACTER*12 SNAME CHARACTER*14 CRC, CU, CA IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) 9994 FORMAT( 10X, 2( I3, ',' ), $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' ) END * SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, $ IORDER ) * * Tests ZHER2K and ZSYR2K. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. 
* Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RONE, RZERO PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*12 SNAME * .. Array Arguments .. COMPLEX*16 AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ W( 2*NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS, BETA, BETS DOUBLE PRECISION ERR, ERRMAX, RBETA, RBETS INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS CHARACTER*2 ICHT, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL CZHER2K, ZMAKE, ZMMCH, CZSYR2K * .. Intrinsic Functions .. INTRINSIC DCMPLX, DCONJG, MAX, DBLE * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHT/'NC'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 8: 9 ).EQ.'he' * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 130 LCC = LDC*N * DO 120 IK = 1, NIDIM K = IDIM( IK ) * DO 110 ICT = 1, 2 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'C' IF( TRAN.AND..NOT.CONJ ) $ TRANS = 'T' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 110 LAA = LDA*NA * * Generate the matrix A. * IF( TRAN )THEN CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB, 2*NMAX, AA, $ LDA, RESET, ZERO ) ELSE CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, $ RESET, ZERO ) END IF * * Generate the matrix B. * LDB = LDA LBB = LAA IF( TRAN )THEN CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB( K + 1 ), $ 2*NMAX, BB, LDB, RESET, ZERO ) ELSE CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), $ NMAX, BB, LDB, RESET, ZERO ) END IF * DO 100 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 90 IA = 1, NALF ALPHA = ALF( IA ) * DO 80 IB = 1, NBET BETA = BET( IB ) IF( CONJ )THEN RBETA = DBLE( BETA ) BETA = DCMPLX( RBETA, RZERO ) END IF NULL = N.LE.0 IF( CONJ ) $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. $ ZERO ).AND.RBETA.EQ.RONE ) * * Generate the matrix C. * CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, $ NMAX, CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB IF( CONJ )THEN RBETS = RBETA ELSE BETS = BETA END IF DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. 
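*
*     For ZHER2K the scalar BETA must be real, so the real value RBETA
*     extracted above is passed to CZHER2K, while CZSYR2K receives the
*     full complex ALPHA and BETA.
*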
* IF( CONJ )THEN IF( TRACE ) $ CALL ZPRCN7( NTRA, NC, SNAME, IORDER, $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, $ RBETA, LDC) IF( REWI ) $ REWIND NTRA CALL CZHER2K( IORDER, UPLO, TRANS, N, K, $ ALPHA, AA, LDA, BB, LDB, RBETA, $ CC, LDC ) ELSE IF( TRACE ) $ CALL ZPRCN5( NTRA, NC, SNAME, IORDER, $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, $ BETA, LDC) IF( REWI ) $ REWIND NTRA CALL CZSYR2K( IORDER, UPLO, TRANS, N, K, $ ALPHA, AA, LDA, BB, LDB, BETA, $ CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LZE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LZE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB IF( CONJ )THEN ISAME( 10 ) = RBETS.EQ.RBETA ELSE ISAME( 10 ) = BETS.EQ.BETA END IF IF( NULL )THEN ISAME( 11 ) = LZE( CS, CC, LCC ) ELSE ISAME( 11 ) = LZERES( 'he', UPLO, N, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( CONJ )THEN TRANST = 'C' ELSE TRANST = 'T' END IF JJAB = 1 JC = 1 DO 70 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN DO 50 I = 1, K W( I ) = ALPHA*AB( ( J - 1 )*2* $ NMAX + K + I ) IF( CONJ )THEN W( K + I ) = DCONJG( ALPHA )* $ AB( ( J - 1 )*2* $ NMAX + I ) ELSE W( K + I ) = ALPHA* $ AB( ( J - 1 )*2* $ NMAX + I ) END IF 50 CONTINUE CALL ZMMCH( TRANST, 'N', LJ, 1, 2*K, $ ONE, AB( JJAB ), 2*NMAX, W, $ 2*NMAX, BETA, C( JJ, J ), $ NMAX, CT, G, CC( JC ), LDC, $ EPS, ERR, FATAL, NOUT, $ .TRUE. ) ELSE DO 60 I = 1, K IF( CONJ )THEN W( I ) = ALPHA*DCONJG( AB( ( K + $ I - 1 )*NMAX + J ) ) W( K + I ) = DCONJG( ALPHA* $ AB( ( I - 1 )*NMAX + $ J ) ) ELSE W( I ) = ALPHA*AB( ( K + I - 1 )* $ NMAX + J ) W( K + I ) = ALPHA* $ AB( ( I - 1 )*NMAX + $ J ) END IF 60 CONTINUE CALL ZMMCH( 'N', 'N', LJ, 1, 2*K, ONE, $ AB( JJ ), NMAX, W, 2*NMAX, $ BETA, C( JJ, J ), NMAX, CT, $ G, CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 IF( TRAN ) $ JJAB = JJAB + 2*NMAX END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 140 70 CONTINUE END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 160 * 140 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( CONJ )THEN CALL ZPRCN7( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, $ ALPHA, LDA, LDB, RBETA, LDC) ELSE CALL ZPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, $ ALPHA, LDA, LDB, BETA, LDC) END IF * 160 CONTINUE RETURN * 10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, $ ', C,', I3, ') .' ) 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, $ ',', F4.1, '), C,', I3, ') .' ) 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK5. * END * SUBROUTINE ZPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, $ N, K, ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC DOUBLE COMPLEX ALPHA, BETA CHARACTER*1 UPLO, TRANSA CHARACTER*12 SNAME CHARACTER*14 CRC, CU, CA IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', $ I3, ', B', I3, ', (', F4.1, ',', F4.1, '), C,', I3, ').' ) END * * SUBROUTINE ZPRCN7(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, $ N, K, ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC DOUBLE COMPLEX ALPHA DOUBLE PRECISION BETA CHARACTER*1 UPLO, TRANSA CHARACTER*12 SNAME CHARACTER*14 CRC, CU, CA IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', $ I3, ', B', I3, ',', F4.1, ', C,', I3, ').' ) END * SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, $ TRANSL ) * * Generates values for an M by N matrix A. 
* Stores the values in the array AA in the data structure required * by the routine, with unwanted elements set to rogue value. * * TYPE is 'ge', 'he', 'sy' or 'tr'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) COMPLEX*16 ROGUE PARAMETER ( ROGUE = ( -1.0D10, 1.0D10 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) DOUBLE PRECISION RROGUE PARAMETER ( RROGUE = -1.0D10 ) * .. Scalar Arguments .. COMPLEX*16 TRANSL INTEGER LDA, M, N, NMAX LOGICAL RESET CHARACTER*1 DIAG, UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX*16 A( NMAX, * ), AA( * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J, JJ LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER * .. External Functions .. COMPLEX*16 ZBEG EXTERNAL ZBEG * .. Intrinsic Functions .. INTRINSIC DCMPLX, DCONJG, DBLE * .. Executable Statements .. GEN = TYPE.EQ.'ge' HER = TYPE.EQ.'he' SYM = TYPE.EQ.'sy' TRI = TYPE.EQ.'tr' UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' UNIT = TRI.AND.DIAG.EQ.'U' * * Generate data in array A. * DO 20 J = 1, N DO 10 I = 1, M IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) $ THEN A( I, J ) = ZBEG( RESET ) + TRANSL IF( I.NE.J )THEN * Set some elements to zero IF( N.GT.3.AND.J.EQ.N/2 ) $ A( I, J ) = ZERO IF( HER )THEN A( J, I ) = DCONJG( A( I, J ) ) ELSE IF( SYM )THEN A( J, I ) = A( I, J ) ELSE IF( TRI )THEN A( J, I ) = ZERO END IF END IF END IF 10 CONTINUE IF( HER ) $ A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO ) IF( TRI ) $ A( J, J ) = A( J, J ) + ONE IF( UNIT ) $ A( J, J ) = ONE 20 CONTINUE * * Store elements in array AS in data structure required by routine. * IF( TYPE.EQ.'ge' )THEN DO 50 J = 1, N DO 30 I = 1, M AA( I + ( J - 1 )*LDA ) = A( I, J ) 30 CONTINUE DO 40 I = M + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 40 CONTINUE 50 CONTINUE ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN DO 90 J = 1, N IF( UPPER )THEN IBEG = 1 IF( UNIT )THEN IEND = J - 1 ELSE IEND = J END IF ELSE IF( UNIT )THEN IBEG = J + 1 ELSE IBEG = J END IF IEND = N END IF DO 60 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 60 CONTINUE DO 70 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I, J ) 70 CONTINUE DO 80 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 80 CONTINUE IF( HER )THEN JJ = J + ( J - 1 )*LDA AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) END IF 90 CONTINUE END IF RETURN * * End of ZMAKE. * END SUBROUTINE ZMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, $ NOUT, MV ) * * Checks the results of the computational tests. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO, RONE PARAMETER ( RZERO = 0.0D0, RONE = 1.0D0 ) * .. Scalar Arguments .. COMPLEX*16 ALPHA, BETA DOUBLE PRECISION EPS, ERR INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT LOGICAL FATAL, MV CHARACTER*1 TRANSA, TRANSB * .. Array Arguments .. 
COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ), $ CC( LDCC, * ), CT( * ) DOUBLE PRECISION G( * ) * .. Local Scalars .. COMPLEX*16 CL DOUBLE PRECISION ERRI INTEGER I, J, K LOGICAL CTRANA, CTRANB, TRANA, TRANB * .. Intrinsic Functions .. INTRINSIC ABS, DIMAG, DCONJG, MAX, DBLE, SQRT * .. Statement Functions .. DOUBLE PRECISION ABS1 * .. Statement Function definitions .. ABS1( CL ) = ABS( DBLE( CL ) ) + ABS( DIMAG( CL ) ) * .. Executable Statements .. TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' CTRANA = TRANSA.EQ.'C' CTRANB = TRANSB.EQ.'C' * * Compute expected result, one column at a time, in CT using data * in A, B and C. * Compute gauges in G. * DO 220 J = 1, N * DO 10 I = 1, M CT( I ) = ZERO G( I ) = RZERO 10 CONTINUE IF( .NOT.TRANA.AND..NOT.TRANB )THEN DO 30 K = 1, KK DO 20 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( K, J ) G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRANA.AND..NOT.TRANB )THEN IF( CTRANA )THEN DO 50 K = 1, KK DO 40 I = 1, M CT( I ) = CT( I ) + DCONJG( A( K, I ) )*B( K, J ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( K, J ) ) 40 CONTINUE 50 CONTINUE ELSE DO 70 K = 1, KK DO 60 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( K, J ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( K, J ) ) 60 CONTINUE 70 CONTINUE END IF ELSE IF( .NOT.TRANA.AND.TRANB )THEN IF( CTRANB )THEN DO 90 K = 1, KK DO 80 I = 1, M CT( I ) = CT( I ) + A( I, K )*DCONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( I, K ) )* $ ABS1( B( J, K ) ) 80 CONTINUE 90 CONTINUE ELSE DO 110 K = 1, KK DO 100 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( J, K ) G( I ) = G( I ) + ABS1( A( I, K ) )* $ ABS1( B( J, K ) ) 100 CONTINUE 110 CONTINUE END IF ELSE IF( TRANA.AND.TRANB )THEN IF( CTRANA )THEN IF( CTRANB )THEN DO 130 K = 1, KK DO 120 I = 1, M CT( I ) = CT( I ) + DCONJG( A( K, I ) )* $ DCONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 120 CONTINUE 130 CONTINUE ELSE DO 150 K = 1, KK DO 140 I = 1, M CT( I ) = CT( I ) + DCONJG( A( K, I ) )* $ B( J, K ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 140 CONTINUE 150 CONTINUE END IF ELSE IF( CTRANB )THEN DO 170 K = 1, KK DO 160 I = 1, M CT( I ) = CT( I ) + A( K, I )* $ DCONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 160 CONTINUE 170 CONTINUE ELSE DO 190 K = 1, KK DO 180 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( J, K ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 180 CONTINUE 190 CONTINUE END IF END IF END IF DO 200 I = 1, M CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) G( I ) = ABS1( ALPHA )*G( I ) + $ ABS1( BETA )*ABS1( C( I, J ) ) 200 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO DO 210 I = 1, M ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS IF( G( I ).NE.RZERO ) $ ERRI = ERRI/G( I ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ GO TO 230 210 CONTINUE * 220 CONTINUE * * If the loop completes, all results are at least half accurate. GO TO 250 * * Report fatal error. * 230 FATAL = .TRUE. WRITE( NOUT, FMT = 9999 ) DO 240 I = 1, M IF( MV )THEN WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) ELSE WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) END IF 240 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9997 )J * 250 CONTINUE RETURN * 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', $ 'F ACCURATE *******', /' EXPECTED RE', $ 'SULT COMPUTED RESULT' ) 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) * * End of ZMMCH. 
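*     Note on the acceptance criterion used above: for each element the
*     scaled error is
*        ERRI = ABS1( CT( I ) - CC( I, J ) )/( EPS*G( I ) )
*     (when G( I ) is nonzero) and ERR is the maximum of these.  The
*     result is rejected only when ERR*SQRT( EPS ).GE.RONE, i.e. when
*     the componentwise error exceeds roughly SQRT( EPS ) times the
*     gauge G( I ) - hence the phrase "at least half accurate".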
* END LOGICAL FUNCTION LZE( RI, RJ, LR ) * * Tests if two arrays are identical. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LR * .. Array Arguments .. COMPLEX*16 RI( * ), RJ( * ) * .. Local Scalars .. INTEGER I * .. Executable Statements .. DO 10 I = 1, LR IF( RI( I ).NE.RJ( I ) ) $ GO TO 20 10 CONTINUE LZE = .TRUE. GO TO 30 20 CONTINUE LZE = .FALSE. 30 RETURN * * End of LZE. * END LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA ) * * Tests if selected elements in two arrays are equal. * * TYPE is 'ge' or 'he' or 'sy'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LDA, M, N CHARACTER*1 UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX*16 AA( LDA, * ), AS( LDA, * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL UPPER * .. Executable Statements .. UPPER = UPLO.EQ.'U' IF( TYPE.EQ.'ge' )THEN DO 20 J = 1, N DO 10 I = M + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 10 CONTINUE 20 CONTINUE ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy' )THEN DO 50 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 30 I = 1, IBEG - 1 IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 30 CONTINUE DO 40 I = IEND + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 40 CONTINUE 50 CONTINUE END IF * 60 CONTINUE LZERES = .TRUE. GO TO 80 70 CONTINUE LZERES = .FALSE. 80 RETURN * * End of LZERES. * END COMPLEX*16 FUNCTION ZBEG( RESET ) * * Generates complex numbers as pairs of random numbers uniformly * distributed between -0.5 and 0.5. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. LOGICAL RESET * .. Local Scalars .. INTEGER I, IC, J, MI, MJ * .. Save statement .. SAVE I, IC, J, MI, MJ * .. Intrinsic Functions .. INTRINSIC DCMPLX * .. Executable Statements .. IF( RESET )THEN * Initialize local variables. MI = 891 MJ = 457 I = 7 J = 7 IC = 0 RESET = .FALSE. END IF * * The sequence of values of I or J is bounded between 1 and 999. * If initial I or J = 1,2,3,6,7 or 9, the period will be 50. * If initial I or J = 4 or 8, the period will be 25. * If initial I or J = 5, the period will be 10. * IC is used to break up the period by skipping 1 value of I or J * in 6. * IC = IC + 1 10 I = I*MI J = J*MJ I = I - 1000*( I/1000 ) J = J - 1000*( J/1000 ) IF( IC.GE.5 )THEN IC = 0 GO TO 10 END IF ZBEG = DCMPLX( ( I - 500 )/1001.0D0, ( J - 500 )/1001.0D0 ) RETURN * * End of ZBEG. * END DOUBLE PRECISION FUNCTION DDIFF( X, Y ) * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. DOUBLE PRECISION X, Y * .. Executable Statements .. DDIFF = X - Y RETURN * * End of DDIFF. 
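*     Note on the auxiliary functions above: ZBEG is a small
*     multiplicative congruential generator,
*        I = MOD( 891*I, 1000 ),   J = MOD( 457*J, 1000 )
*     returning DCMPLX( ( I - 500 )/1001, ( J - 500 )/1001 ), so every
*     value lies inside ( -0.5, 0.5 ); the counter IC forces an extra
*     step on every fifth call to break up the short period of the
*     underlying sequence, and RESET makes the generated test data
*     reproducible from run to run.  DDIFF simply returns X - Y through
*     a function call; the driver uses it when estimating EPS,
*     presumably so that the comparison cannot be folded away in
*     extended precision.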
* END OpenBLAS-0.2.20/ctest/c_zblat3_3m.f000066400000000000000000003042141313527062700165410ustar00rootroot00000000000000 PROGRAM ZBLAT3 * * Test program for the COMPLEX*16 Level 3 Blas. * * The program must be driven by a short data file. The first 13 records * of the file are read using list-directed input, the last 9 records * are read using the format ( A13,L2 ). An annotated example of a data * file can be obtained by deleting the first 3 characters from the * following 22 lines: * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO * 6 NUMBER OF VALUES OF N * 0 1 2 3 5 9 VALUES OF N * 3 NUMBER OF VALUES OF ALPHA * (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA * 3 NUMBER OF VALUES OF BETA * (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA * ZGEMM T PUT F FOR NO TEST. SAME COLUMNS. * ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. * ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. * ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. * ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. * ZHERK T PUT F FOR NO TEST. SAME COLUMNS. * ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. * ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. * ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. * * See: * * Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. * A Set of Level 3 Basic Linear Algebra Subprograms. * * Technical Memorandum No.88 (Revision 1), Mathematics and * Computer Science Division, Argonne National Laboratory, 9700 * South Cass Avenue, Argonne, Illinois 60439, US. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. INTEGER NIN, NOUT PARAMETER ( NIN = 5, NOUT = 6 ) INTEGER NSUBS PARAMETER ( NSUBS = 9 ) COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO, RHALF, RONE PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. DOUBLE PRECISION EPS, ERR, THRESH INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA, $ LAYOUT LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, $ TSTERR, CORDER, RORDER CHARACTER*1 TRANSA, TRANSB CHARACTER*13 SNAMET CHARACTER*32 SNAPS * .. Local Arrays .. COMPLEX*16 AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), $ ALF( NALMAX ), AS( NMAX*NMAX ), $ BB( NMAX*NMAX ), BET( NBEMAX ), $ BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ W( 2*NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDMAX ) LOGICAL LTEST( NSUBS ) CHARACTER*13 SNAMES( NSUBS ) * .. External Functions .. DOUBLE PRECISION DDIFF LOGICAL LZE EXTERNAL DDIFF, LZE * .. External Subroutines .. EXTERNAL ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5,ZMMCH * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK CHARACTER*13 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR COMMON /SRNAMC/SRNAMT * .. Data statements .. DATA SNAMES/'cblas_zgemm3m ', 'cblas_zhemm ', $ 'cblas_zsymm ', 'cblas_ztrmm ', 'cblas_ztrsm ', $ 'cblas_zherk ', 'cblas_zsyrk ', 'cblas_zher2k', $ 'cblas_zsyr2k'/ * .. Executable Statements .. 
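*     In outline, the driver below reads the configuration from the
*     data file on unit NIN, estimates the machine precision EPS,
*     checks ZMMCH itself against exact integer data, and then runs
*     ZCHK1..ZCHK5 for each routine enabled in the data file.  LAYOUT
*     (0, 1 or 2) sets CORDER/RORDER, which are passed to the checkers
*     as IORDER = 0 (column-major, printed as CblasColMajor) or
*     IORDER = 1 (row-major, CblasRowMajor).  CZGEMM3M, CZHEMM and the
*     other external routines called by the checkers are the
*     C-interface wrapper routines used by this test suite.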
* NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. * READ( NIN, FMT = * )SNAPS READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI REWI = REWI.AND.TRACE * Read the flag that directs stopping on any failure. READ( NIN, FMT = * )SFATAL * Read the flag that indicates whether error exits are to be tested. READ( NIN, FMT = * )TSTERR * Read the flag that indicates whether row-major data layout to be tested. READ( NIN, FMT = * )LAYOUT * Read the threshold value of the test ratio READ( NIN, FMT = * )THRESH * * Read and check the parameter values for the tests. * * Values of N READ( NIN, FMT = * )NIDIM IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN WRITE( NOUT, FMT = 9997 )'N', NIDMAX GO TO 220 END IF READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) DO 10 I = 1, NIDIM IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN WRITE( NOUT, FMT = 9996 )NMAX GO TO 220 END IF 10 CONTINUE * Values of ALPHA READ( NIN, FMT = * )NALF IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX GO TO 220 END IF READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) * Values of BETA READ( NIN, FMT = * )NBET IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX GO TO 220 END IF READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) * * Report values of parameters. * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) IF( .NOT.TSTERR )THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9984 ) END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )THRESH WRITE( NOUT, FMT = * ) RORDER = .FALSE. CORDER = .FALSE. IF (LAYOUT.EQ.2) THEN RORDER = .TRUE. CORDER = .TRUE. WRITE( *, FMT = 10002 ) ELSE IF (LAYOUT.EQ.1) THEN RORDER = .TRUE. WRITE( *, FMT = 10001 ) ELSE IF (LAYOUT.EQ.0) THEN CORDER = .TRUE. WRITE( *, FMT = 10000 ) END IF WRITE( *, FMT = * ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 20 I = 1, NSUBS LTEST( I ) = .FALSE. 20 CONTINUE 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT DO 40 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET STOP 50 LTEST( I ) = LTESTT GO TO 30 * 60 CONTINUE CLOSE ( NIN ) * * Compute EPS (the machine precision). * EPS = RONE 70 CONTINUE IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) $ GO TO 80 EPS = RHALF*EPS GO TO 70 80 CONTINUE EPS = EPS + EPS WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of ZMMCH using exact data. * N = MIN( 32, NMAX ) DO 100 J = 1, N DO 90 I = 1, N AB( I, J ) = MAX( I - J + 1, 0 ) 90 CONTINUE AB( J, NMAX + 1 ) = J AB( 1, NMAX + J ) = J C( J, 1 ) = ZERO 100 CONTINUE DO 110 J = 1, N CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 110 CONTINUE * CC holds the exact result. On exit from ZMMCH CT holds * the result computed by ZMMCH. TRANSA = 'N' TRANSB = 'N' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. 
) SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 AB( 1, NMAX + J ) = N - J + 1 120 CONTINUE DO 130 J = 1, N CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - $ ( ( J + 1 )*J*( J - 1 ) )/3 130 CONTINUE TRANSA = 'C' TRANSB = 'N' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF * * Test each subroutine in turn. * DO 200 ISNUM = 1, NSUBS WRITE( NOUT, FMT = * ) IF( .NOT.LTEST( ISNUM ) )THEN * Subprogram is not to be tested. WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) ELSE SRNAMT = SNAMES( ISNUM ) * Test error exits. IF( TSTERR )THEN CALL CZ3CHKE( SNAMES( ISNUM ) ) WRITE( NOUT, FMT = * ) END IF * Test computations. INFOT = 0 OK = .TRUE. FATAL = .FALSE. GO TO ( 140, 150, 150, 160, 160, 170, 170, $ 180, 180 )ISNUM * Test ZGEMM, 01. 140 IF (CORDER) THEN CALL ZCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 0 ) END IF IF (RORDER) THEN CALL ZCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 1 ) END IF GO TO 190 * Test ZHEMM, 02, ZSYMM, 03. 150 IF (CORDER) THEN CALL ZCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 0 ) END IF IF (RORDER) THEN CALL ZCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 1 ) END IF GO TO 190 * Test ZTRMM, 04, ZTRSM, 05. 160 IF (CORDER) THEN CALL ZCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, $ 0 ) END IF IF (RORDER) THEN CALL ZCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, $ 1 ) END IF GO TO 190 * Test ZHERK, 06, ZSYRK, 07. 170 IF (CORDER) THEN CALL ZCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 0 ) END IF IF (RORDER) THEN CALL ZCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G, 1 ) END IF GO TO 190 * Test ZHER2K, 08, ZSYR2K, 09. 
180 IF (CORDER) THEN CALL ZCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, $ 0 ) END IF IF (RORDER) THEN CALL ZCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, $ 1 ) END IF GO TO 190 * 190 IF( FATAL.AND.SFATAL ) $ GO TO 210 END IF 200 CONTINUE WRITE( NOUT, FMT = 9986 ) GO TO 230 * 210 CONTINUE WRITE( NOUT, FMT = 9985 ) GO TO 230 * 220 CONTINUE WRITE( NOUT, FMT = 9991 ) * 230 CONTINUE IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) STOP * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) 10000 FORMAT(' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', $ 'S THAN', F8.2 ) 9998 FORMAT(' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) 9997 FORMAT(' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', $ 'THAN ', I2 ) 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) 9995 FORMAT('TESTS OF THE COMPLEX*16 LEVEL 3 BLAS', //' THE F', $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) 9994 FORMAT( ' FOR N ', 9I6 ) 9993 FORMAT( ' FOR ALPHA ', $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) 9992 FORMAT( ' FOR BETA ', $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', $ /' ******* TESTS ABANDONED *******' ) 9990 FORMAT(' SUBPROGRAM NAME ', A13,' NOT RECOGNIZED', /' ******* T', $ 'ESTS ABANDONED *******' ) 9989 FORMAT(' ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', $ 'ATED WRONGLY.', /' ZMMCH WAS CALLED WITH TRANSA = ', A1, $ 'AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', $ ' ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', $ '*******' ) 9988 FORMAT( A13,L2 ) 9987 FORMAT( 1X, A13,' WAS NOT TESTED' ) 9986 FORMAT( /' END OF TESTS' ) 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) * * End of ZBLAT3. * END SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, $ IORDER ) * * Tests ZGEMM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0, 0.0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*13 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS, BETA, BLS DOUBLE PRECISION ERR, ERRMAX INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, $ MA, MB, MS, N, NA, NARGS, NB, NC, NS LOGICAL NULL, RESET, SAME, TRANA, TRANB CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB CHARACTER*3 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. 
External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL CZGEMM3M, ZMAKE, ZMMCH * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'NTC'/ * .. Executable Statements .. * NARGS = 13 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 110 IM = 1, NIDIM M = IDIM( IM ) * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICA = 1, 3 TRANSA = ICH( ICA: ICA ) TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' * IF( TRANA )THEN MA = K NA = M ELSE MA = M NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. * CALL ZMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICB = 1, 3 TRANSB = ICH( ICB: ICB ) TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' * IF( TRANB )THEN MB = N NB = K ELSE MB = K NB = N END IF * Set LDB to 1 more than minimum value if room. LDB = MB IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 70 LBB = LDB*NB * * Generate the matrix B. * CALL ZMAKE( 'ge', ' ', ' ', MB, NB, B, NMAX, BB, $ LDB, RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL ZMAKE( 'ge', ' ', ' ', M, N, C, NMAX, $ CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * TRANAS = TRANSA TRANBS = TRANSB MS = M NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ CALL ZPRCN1(NTRA, NC, SNAME, IORDER, $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, $ LDB, BETA, LDC) IF( REWI ) $ REWIND NTRA CALL CZGEMM3M( IORDER, TRANSA, TRANSB, M, N, $ K, ALPHA, AA, LDA, BB, LDB, $ BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = TRANSA.EQ.TRANAS ISAME( 2 ) = TRANSB.EQ.TRANBS ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = KS.EQ.K ISAME( 6 ) = ALS.EQ.ALPHA ISAME( 7 ) = LZE( AS, AA, LAA ) ISAME( 8 ) = LDAS.EQ.LDA ISAME( 9 ) = LZE( BS, BB, LBB ) ISAME( 10 ) = LDBS.EQ.LDB ISAME( 11 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 12 ) = LZE( CS, CC, LCC ) ELSE ISAME( 12 ) = LZERES( 'ge', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 13 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report * and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL ZMMCH( TRANSA, TRANSB, M, N, K, $ ALPHA, A, NMAX, B, NMAX, BETA, $ C, NMAX, CT, G, CC, LDC, EPS, $ ERR, FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 120 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME CALL ZPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) * 130 CONTINUE RETURN * 10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A13,'(''', A1, ''',''', A1, ''',', $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK1. * END * SUBROUTINE ZPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N, $ K, ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC DOUBLE COMPLEX ALPHA, BETA CHARACTER*1 TRANSA, TRANSB CHARACTER*13 SNAME CHARACTER*14 CRC, CTA,CTB IF (TRANSA.EQ.'N')THEN CTA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CTA = ' CblasTrans' ELSE CTA = 'CblasConjTrans' END IF IF (TRANSB.EQ.'N')THEN CTB = ' CblasNoTrans' ELSE IF (TRANSB.EQ.'T')THEN CTB = ' CblasTrans' ELSE CTB = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A13,'(', A15, ',', A14, ',', A14, ',') 9994 FORMAT( 10X, 3( I3, ',' ) ,' (', F4.1,',',F4.1,') , A,', $ I3, ', B,', I3, ', (', F4.1,',',F4.1,') , C,', I3, ').' ) END * SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, $ IORDER ) * * Tests ZHEMM and ZSYMM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*13 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. 
COMPLEX*16 ALPHA, ALS, BETA, BLS DOUBLE PRECISION ERR, ERRMAX INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, $ NARGS, NC, NS LOGICAL CONJ, LEFT, NULL, RESET, SAME CHARACTER*1 SIDE, SIDES, UPLO, UPLOS CHARACTER*2 ICHS, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL CZHEMM, ZMAKE, ZMMCH, CZSYMM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHS/'LR'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 8: 9 ).EQ.'he' * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 100 IM = 1, NIDIM M = IDIM( IM ) * DO 90 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 90 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 90 LBB = LDB*N * * Generate the matrix B. * CALL ZMAKE( 'ge', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, $ ZERO ) * DO 80 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' * IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * * Generate the hermitian or symmetric matrix A. * CALL ZMAKE(SNAME( 8: 9 ), UPLO, ' ', NA, NA, A, NMAX, $ AA, LDA, RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL ZMAKE( 'ge', ' ', ' ', M, N, C, NMAX, CC, $ LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO MS = M NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ CALL ZPRCN2(NTRA, NC, SNAME, IORDER, $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, $ BETA, LDC) IF( REWI ) $ REWIND NTRA IF( CONJ )THEN CALL CZHEMM( IORDER, SIDE, UPLO, M, N, $ ALPHA, AA, LDA, BB, LDB, BETA, $ CC, LDC ) ELSE CALL CZSYMM( IORDER, SIDE, UPLO, M, N, $ ALPHA, AA, LDA, BB, LDB, BETA, $ CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 110 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LZE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LZE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB ISAME( 10 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 11 ) = LZE( CS, CC, LCC ) ELSE ISAME( 11 ) = LZERES( 'ge', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 110 END IF * IF( .NOT.NULL )THEN * * Check the result. 
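*     Note: ZMAKE stores both triangles of the hermitian/symmetric
*     matrix A, so the reference result can be formed as an ordinary
*     matrix product.  For SIDE = 'L' the check below is
*        C := ALPHA*A*B + BETA*C    ( ZMMCH( 'N', 'N', M, N, M, ... ) )
*     and for SIDE = 'R'
*        C := ALPHA*B*A + BETA*C    ( ZMMCH( 'N', 'N', M, N, N, ... ) ).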
* IF( LEFT )THEN CALL ZMMCH( 'N', 'N', M, N, M, ALPHA, A, $ NMAX, B, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL ZMMCH( 'N', 'N', M, N, N, ALPHA, B, $ NMAX, A, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 120 * 110 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME CALL ZPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, $ LDB, BETA, LDC) * 120 CONTINUE RETURN * 10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' ) 9995 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, $ ',', F4.1, '), C,', I3, ') .' ) 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK2. * END * SUBROUTINE ZPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, $ ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC DOUBLE COMPLEX ALPHA, BETA CHARACTER*1 SIDE, UPLO CHARACTER*13 SNAME CHARACTER*14 CRC, CS,CU IF (SIDE.EQ.'L')THEN CS = ' CblasLeft' ELSE CS = ' CblasRight' END IF IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A13,'(', A15, ',', A14, ',', A14, ',') 9994 FORMAT( 10X, 2( I3, ',' ),' (',F4.1,',',F4.1, '), A,', I3, $ ', B,', I3, ', (',F4.1,',',F4.1, '), ', 'C,', I3, ').' ) END * SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, $ B, BB, BS, CT, G, C, IORDER ) * * Tests ZTRMM and ZTRSM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*13 SNAME * .. Array Arguments .. 
COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CT( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS DOUBLE PRECISION ERR, ERRMAX INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, $ NS LOGICAL LEFT, NULL, RESET, SAME CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, $ UPLOS CHARACTER*2 ICHD, ICHS, ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL ZMAKE, ZMMCH, CZTRMM, CZTRSM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ * .. Executable Statements .. * NARGS = 11 NC = 0 RESET = .TRUE. ERRMAX = RZERO * Set up zero matrix for ZMMCH. DO 20 J = 1, NMAX DO 10 I = 1, NMAX C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE * DO 140 IM = 1, NIDIM M = IDIM( IM ) * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 130 LBB = LDB*N NULL = M.LE.0.OR.N.LE.0 * DO 120 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 130 LAA = LDA*NA * DO 110 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * DO 100 ICT = 1, 3 TRANSA = ICHT( ICT: ICT ) * DO 90 ICD = 1, 2 DIAG = ICHD( ICD: ICD ) * DO 80 IA = 1, NALF ALPHA = ALF( IA ) * * Generate the matrix A. * CALL ZMAKE( 'tr', UPLO, DIAG, NA, NA, A, $ NMAX, AA, LDA, RESET, ZERO ) * * Generate the matrix B. * CALL ZMAKE( 'ge', ' ', ' ', M, N, B, NMAX, $ BB, LDB, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO TRANAS = TRANSA DIAGS = DIAG MS = M NS = N ALS = ALPHA DO 30 I = 1, LAA AS( I ) = AA( I ) 30 CONTINUE LDAS = LDA DO 40 I = 1, LBB BS( I ) = BB( I ) 40 CONTINUE LDBS = LDB * * Call the subroutine. * IF( SNAME( 10: 11 ).EQ.'mm' )THEN IF( TRACE ) $ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB) IF( REWI ) $ REWIND NTRA CALL CZTRMM(IORDER, SIDE, UPLO, TRANSA, $ DIAG, M, N, ALPHA, AA, LDA, $ BB, LDB ) ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN IF( TRACE ) $ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB) IF( REWI ) $ REWIND NTRA CALL CZTRSM(IORDER, SIDE, UPLO, TRANSA, $ DIAG, M, N, ALPHA, AA, LDA, $ BB, LDB ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = TRANAS.EQ.TRANSA ISAME( 4 ) = DIAGS.EQ.DIAG ISAME( 5 ) = MS.EQ.M ISAME( 6 ) = NS.EQ.N ISAME( 7 ) = ALS.EQ.ALPHA ISAME( 8 ) = LZE( AS, AA, LAA ) ISAME( 9 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 10 ) = LZE( BS, BB, LBB ) ELSE ISAME( 10 ) = LZERES( 'ge', ' ', M, N, BS, $ BB, LDB ) END IF ISAME( 11 ) = LDBS.EQ.LDB * * If data was incorrectly changed, report and * return. * SAME = .TRUE. 
DO 50 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 50 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN IF( SNAME( 10: 11 ).EQ.'mm' )THEN * * Check the result. * IF( LEFT )THEN CALL ZMMCH( TRANSA, 'N', M, N, M, $ ALPHA, A, NMAX, B, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL ZMMCH( 'N', TRANSA, M, N, N, $ ALPHA, B, NMAX, A, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN * * Compute approximation to original * matrix. * DO 70 J = 1, N DO 60 I = 1, M C( I, J ) = BB( I + ( J - 1 )* $ LDB ) BB( I + ( J - 1 )*LDB ) = ALPHA* $ B( I, J ) 60 CONTINUE 70 CONTINUE * IF( LEFT )THEN CALL ZMMCH( TRANSA, 'N', M, N, M, $ ONE, A, NMAX, C, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) ELSE CALL ZMMCH( 'N', TRANSA, M, N, N, $ ONE, C, NMAX, A, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) END IF END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 150 END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * 140 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 160 * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( TRACE ) $ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, $ M, N, ALPHA, LDA, LDB) * 160 CONTINUE RETURN * 10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT(' ******* ', A13,' FAILED ON CALL NUMBER:' ) 9995 FORMAT(1X, I6, ': ', A13,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', $ ' .' ) 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK3. 
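*     Note on the checks in ZCHK3 above: ZTRMM results are compared
*     directly with op( A )*B (or B*op( A )) via ZMMCH.  For ZTRSM the
*     solution X overwrites B, so instead of forming an explicit
*     inverse the loops copy X into C, put ALPHA*B back into BB, and
*     verify that op( A )*X (or X*op( A )) reproduces ALPHA*B to within
*     the usual half-accuracy tolerance.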
* END * SUBROUTINE ZPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, $ DIAG, M, N, ALPHA, LDA, LDB) INTEGER NOUT, NC, IORDER, M, N, LDA, LDB DOUBLE COMPLEX ALPHA CHARACTER*1 SIDE, UPLO, TRANSA, DIAG CHARACTER*13 SNAME CHARACTER*14 CRC, CS, CU, CA, CD IF (SIDE.EQ.'L')THEN CS = ' CblasLeft' ELSE CS = ' CblasRight' END IF IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (DIAG.EQ.'N')THEN CD = ' CblasNonUnit' ELSE CD = ' CblasUnit' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB 9995 FORMAT( 1X, I6, ': ', A13,'(', A15, ',', A14, ',', A14, ',') 9994 FORMAT( 10X, 2( A15, ',') , 2( I3, ',' ), ' (', F4.1, ',', $ F4.1, '), A,', I3, ', B,', I3, ').' ) END * SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, $ IORDER ) * * Tests ZHERK and ZSYRK. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) DOUBLE PRECISION RONE, RZERO PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*13 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS, BETA, BETS DOUBLE PRECISION ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, $ NARGS, NC, NS LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS CHARACTER*2 ICHT, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL CZHERK, ZMAKE, ZMMCH, CZSYRK * .. Intrinsic Functions .. INTRINSIC DCMPLX, MAX, DBLE * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHT/'NC'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 8: 9 ).EQ.'he' * NARGS = 10 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICT = 1, 2 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'C' IF( TRAN.AND..NOT.CONJ ) $ TRANS = 'T' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. 
* CALL ZMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 60 IA = 1, NALF ALPHA = ALF( IA ) IF( CONJ )THEN RALPHA = DBLE( ALPHA ) ALPHA = DCMPLX( RALPHA, RZERO ) END IF * DO 50 IB = 1, NBET BETA = BET( IB ) IF( CONJ )THEN RBETA = DBLE( BETA ) BETA = DCMPLX( RBETA, RZERO ) END IF NULL = N.LE.0 IF( CONJ ) $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. $ RZERO ).AND.RBETA.EQ.RONE ) * * Generate the matrix C. * CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, $ NMAX, CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K IF( CONJ )THEN RALS = RALPHA ELSE ALS = ALPHA END IF DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA IF( CONJ )THEN RBETS = RBETA ELSE BETS = BETA END IF DO 20 I = 1, LCC CS( I ) = CC( I ) 20 CONTINUE LDCS = LDC * * Call the subroutine. * IF( CONJ )THEN IF( TRACE ) $ CALL ZPRCN6( NTRA, NC, SNAME, IORDER, $ UPLO, TRANS, N, K, RALPHA, LDA, RBETA, $ LDC) IF( REWI ) $ REWIND NTRA CALL CZHERK( IORDER, UPLO, TRANS, N, K, $ RALPHA, AA, LDA, RBETA, CC, $ LDC ) ELSE IF( TRACE ) $ CALL ZPRCN4( NTRA, NC, SNAME, IORDER, $ UPLO, TRANS, N, K, ALPHA, LDA, BETA, LDC) IF( REWI ) $ REWIND NTRA CALL CZSYRK( IORDER, UPLO, TRANS, N, K, $ ALPHA, AA, LDA, BETA, CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K IF( CONJ )THEN ISAME( 5 ) = RALS.EQ.RALPHA ELSE ISAME( 5 ) = ALS.EQ.ALPHA END IF ISAME( 6 ) = LZE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA IF( CONJ )THEN ISAME( 8 ) = RBETS.EQ.RBETA ELSE ISAME( 8 ) = BETS.EQ.BETA END IF IF( NULL )THEN ISAME( 9 ) = LZE( CS, CC, LCC ) ELSE ISAME( 9 ) = LZERES( SNAME( 8: 9 ), UPLO, N, $ N, CS, CC, LDC ) END IF ISAME( 10 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 30 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 30 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( CONJ )THEN TRANST = 'C' ELSE TRANST = 'T' END IF JC = 1 DO 40 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN CALL ZMMCH( TRANST, 'N', LJ, 1, K, $ ALPHA, A( 1, JJ ), NMAX, $ A( 1, J ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL ZMMCH( 'N', TRANST, LJ, 1, K, $ ALPHA, A( JJ, 1 ), NMAX, $ A( J, 1 ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 40 CONTINUE END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. 
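*     Note (on the checks just performed): for the hermitian case the
*     scalars are forced real (RALPHA, RBETA) before CZHERK is called,
*     while CZSYRK keeps complex ALPHA and BETA.  The rank-k update is
*     then verified one column at a time: column J is an LJ-by-1
*     product checked by ZMMCH with TRANST = 'C' for ZHERK and
*     TRANST = 'T' for ZSYRK.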
* IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 130 * 110 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( CONJ )THEN CALL ZPRCN6( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, RALPHA, $ LDA, rBETA, LDC) ELSE CALL ZPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, $ LDA, BETA, LDC) END IF * 130 CONTINUE RETURN * 10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', $ ' .' ) 9993 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, $ '), C,', I3, ') .' ) 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK4. * END * SUBROUTINE ZPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, $ N, K, ALPHA, LDA, BETA, LDC) INTEGER NOUT, NC, IORDER, N, K, LDA, LDC DOUBLE COMPLEX ALPHA, BETA CHARACTER*1 UPLO, TRANSA CHARACTER*13 SNAME CHARACTER*14 CRC, CU, CA IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A15, ',') ) 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1 ,'), A,', $ I3, ', (', F4.1,',', F4.1, '), C,', I3, ').' ) END * * SUBROUTINE ZPRCN6(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, $ N, K, ALPHA, LDA, BETA, LDC) INTEGER NOUT, NC, IORDER, N, K, LDA, LDC DOUBLE PRECISION ALPHA, BETA CHARACTER*1 UPLO, TRANSA CHARACTER*13 SNAME CHARACTER*14 CRC, CU, CA IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A15, ',') ) 9994 FORMAT( 10X, 2( I3, ',' ), $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' ) END * SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, $ IORDER ) * * Tests ZHER2K and ZSYR2K. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. 
* Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RONE, RZERO PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER LOGICAL FATAL, REWI, TRACE CHARACTER*13 SNAME * .. Array Arguments .. COMPLEX*16 AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ W( 2*NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS, BETA, BETS DOUBLE PRECISION ERR, ERRMAX, RBETA, RBETS INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS CHARACTER*2 ICHT, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL CZHER2K, ZMAKE, ZMMCH, CZSYR2K * .. Intrinsic Functions .. INTRINSIC DCMPLX, DCONJG, MAX, DBLE * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHT/'NC'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 8: 9 ).EQ.'he' * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 130 LCC = LDC*N * DO 120 IK = 1, NIDIM K = IDIM( IK ) * DO 110 ICT = 1, 2 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'C' IF( TRAN.AND..NOT.CONJ ) $ TRANS = 'T' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 110 LAA = LDA*NA * * Generate the matrix A. * IF( TRAN )THEN CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB, 2*NMAX, AA, $ LDA, RESET, ZERO ) ELSE CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, $ RESET, ZERO ) END IF * * Generate the matrix B. * LDB = LDA LBB = LAA IF( TRAN )THEN CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB( K + 1 ), $ 2*NMAX, BB, LDB, RESET, ZERO ) ELSE CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), $ NMAX, BB, LDB, RESET, ZERO ) END IF * DO 100 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 90 IA = 1, NALF ALPHA = ALF( IA ) * DO 80 IB = 1, NBET BETA = BET( IB ) IF( CONJ )THEN RBETA = DBLE( BETA ) BETA = DCMPLX( RBETA, RZERO ) END IF NULL = N.LE.0 IF( CONJ ) $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. $ ZERO ).AND.RBETA.EQ.RONE ) * * Generate the matrix C. * CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, $ NMAX, CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB IF( CONJ )THEN RBETS = RBETA ELSE BETS = BETA END IF DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. 
* IF( CONJ )THEN IF( TRACE ) $ CALL ZPRCN7( NTRA, NC, SNAME, IORDER, $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, $ RBETA, LDC) IF( REWI ) $ REWIND NTRA CALL CZHER2K( IORDER, UPLO, TRANS, N, K, $ ALPHA, AA, LDA, BB, LDB, RBETA, $ CC, LDC ) ELSE IF( TRACE ) $ CALL ZPRCN5( NTRA, NC, SNAME, IORDER, $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, $ BETA, LDC) IF( REWI ) $ REWIND NTRA CALL CZSYR2K( IORDER, UPLO, TRANS, N, K, $ ALPHA, AA, LDA, BB, LDB, BETA, $ CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LZE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LZE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB IF( CONJ )THEN ISAME( 10 ) = RBETS.EQ.RBETA ELSE ISAME( 10 ) = BETS.EQ.BETA END IF IF( NULL )THEN ISAME( 11 ) = LZE( CS, CC, LCC ) ELSE ISAME( 11 ) = LZERES( 'he', UPLO, N, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( CONJ )THEN TRANST = 'C' ELSE TRANST = 'T' END IF JJAB = 1 JC = 1 DO 70 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN DO 50 I = 1, K W( I ) = ALPHA*AB( ( J - 1 )*2* $ NMAX + K + I ) IF( CONJ )THEN W( K + I ) = DCONJG( ALPHA )* $ AB( ( J - 1 )*2* $ NMAX + I ) ELSE W( K + I ) = ALPHA* $ AB( ( J - 1 )*2* $ NMAX + I ) END IF 50 CONTINUE CALL ZMMCH( TRANST, 'N', LJ, 1, 2*K, $ ONE, AB( JJAB ), 2*NMAX, W, $ 2*NMAX, BETA, C( JJ, J ), $ NMAX, CT, G, CC( JC ), LDC, $ EPS, ERR, FATAL, NOUT, $ .TRUE. ) ELSE DO 60 I = 1, K IF( CONJ )THEN W( I ) = ALPHA*DCONJG( AB( ( K + $ I - 1 )*NMAX + J ) ) W( K + I ) = DCONJG( ALPHA* $ AB( ( I - 1 )*NMAX + $ J ) ) ELSE W( I ) = ALPHA*AB( ( K + I - 1 )* $ NMAX + J ) W( K + I ) = ALPHA* $ AB( ( I - 1 )*NMAX + $ J ) END IF 60 CONTINUE CALL ZMMCH( 'N', 'N', LJ, 1, 2*K, ONE, $ AB( JJ ), NMAX, W, 2*NMAX, $ BETA, C( JJ, J ), NMAX, CT, $ G, CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 IF( TRAN ) $ JJAB = JJAB + 2*NMAX END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 140 70 CONTINUE END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC ELSE IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX END IF GO TO 160 * 140 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( CONJ )THEN CALL ZPRCN7( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, $ ALPHA, LDA, LDB, RBETA, LDC) ELSE CALL ZPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, $ ALPHA, LDA, LDB, BETA, LDC) END IF * 160 CONTINUE RETURN * 10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', $ 'RATIO ', F8.2, ' - SUSPECT *******' ) 10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', $ ' (', I6, ' CALL', 'S)' ) 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, $ ', C,', I3, ') .' ) 9993 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, $ ',', F4.1, '), C,', I3, ') .' ) 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK5. * END * SUBROUTINE ZPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, $ N, K, ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC DOUBLE COMPLEX ALPHA, BETA CHARACTER*1 UPLO, TRANSA CHARACTER*13 SNAME CHARACTER*14 CRC, CU, CA IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A15, ',') ) 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', $ I3, ', B', I3, ', (', F4.1, ',', F4.1, '), C,', I3, ').' ) END * * SUBROUTINE ZPRCN7(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, $ N, K, ALPHA, LDA, LDB, BETA, LDC) INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC DOUBLE COMPLEX ALPHA DOUBLE PRECISION BETA CHARACTER*1 UPLO, TRANSA CHARACTER*13 SNAME CHARACTER*14 CRC, CU, CA IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A15, ',') ) 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', $ I3, ', B', I3, ',', F4.1, ', C,', I3, ').' ) END * SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, $ TRANSL ) * * Generates values for an M by N matrix A. 
* Stores the values in the array AA in the data structure required * by the routine, with unwanted elements set to rogue value. * * TYPE is 'ge', 'he', 'sy' or 'tr'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) COMPLEX*16 ROGUE PARAMETER ( ROGUE = ( -1.0D10, 1.0D10 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) DOUBLE PRECISION RROGUE PARAMETER ( RROGUE = -1.0D10 ) * .. Scalar Arguments .. COMPLEX*16 TRANSL INTEGER LDA, M, N, NMAX LOGICAL RESET CHARACTER*1 DIAG, UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX*16 A( NMAX, * ), AA( * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J, JJ LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER * .. External Functions .. COMPLEX*16 ZBEG EXTERNAL ZBEG * .. Intrinsic Functions .. INTRINSIC DCMPLX, DCONJG, DBLE * .. Executable Statements .. GEN = TYPE.EQ.'ge' HER = TYPE.EQ.'he' SYM = TYPE.EQ.'sy' TRI = TYPE.EQ.'tr' UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' UNIT = TRI.AND.DIAG.EQ.'U' * * Generate data in array A. * DO 20 J = 1, N DO 10 I = 1, M IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) $ THEN A( I, J ) = ZBEG( RESET ) + TRANSL IF( I.NE.J )THEN * Set some elements to zero IF( N.GT.3.AND.J.EQ.N/2 ) $ A( I, J ) = ZERO IF( HER )THEN A( J, I ) = DCONJG( A( I, J ) ) ELSE IF( SYM )THEN A( J, I ) = A( I, J ) ELSE IF( TRI )THEN A( J, I ) = ZERO END IF END IF END IF 10 CONTINUE IF( HER ) $ A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO ) IF( TRI ) $ A( J, J ) = A( J, J ) + ONE IF( UNIT ) $ A( J, J ) = ONE 20 CONTINUE * * Store elements in array AS in data structure required by routine. * IF( TYPE.EQ.'ge' )THEN DO 50 J = 1, N DO 30 I = 1, M AA( I + ( J - 1 )*LDA ) = A( I, J ) 30 CONTINUE DO 40 I = M + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 40 CONTINUE 50 CONTINUE ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN DO 90 J = 1, N IF( UPPER )THEN IBEG = 1 IF( UNIT )THEN IEND = J - 1 ELSE IEND = J END IF ELSE IF( UNIT )THEN IBEG = J + 1 ELSE IBEG = J END IF IEND = N END IF DO 60 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 60 CONTINUE DO 70 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I, J ) 70 CONTINUE DO 80 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 80 CONTINUE IF( HER )THEN JJ = J + ( J - 1 )*LDA AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) END IF 90 CONTINUE END IF RETURN * * End of ZMAKE. * END SUBROUTINE ZMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, $ NOUT, MV ) * * Checks the results of the computational tests. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO, RONE PARAMETER ( RZERO = 0.0D0, RONE = 1.0D0 ) * .. Scalar Arguments .. COMPLEX*16 ALPHA, BETA DOUBLE PRECISION EPS, ERR INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT LOGICAL FATAL, MV CHARACTER*1 TRANSA, TRANSB * .. Array Arguments .. 
COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ), $ CC( LDCC, * ), CT( * ) DOUBLE PRECISION G( * ) * .. Local Scalars .. COMPLEX*16 CL DOUBLE PRECISION ERRI INTEGER I, J, K LOGICAL CTRANA, CTRANB, TRANA, TRANB * .. Intrinsic Functions .. INTRINSIC ABS, DIMAG, DCONJG, MAX, DBLE, SQRT * .. Statement Functions .. DOUBLE PRECISION ABS1 * .. Statement Function definitions .. ABS1( CL ) = ABS( DBLE( CL ) ) + ABS( DIMAG( CL ) ) * .. Executable Statements .. TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' CTRANA = TRANSA.EQ.'C' CTRANB = TRANSB.EQ.'C' * * Compute expected result, one column at a time, in CT using data * in A, B and C. * Compute gauges in G. * DO 220 J = 1, N * DO 10 I = 1, M CT( I ) = ZERO G( I ) = RZERO 10 CONTINUE IF( .NOT.TRANA.AND..NOT.TRANB )THEN DO 30 K = 1, KK DO 20 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( K, J ) G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRANA.AND..NOT.TRANB )THEN IF( CTRANA )THEN DO 50 K = 1, KK DO 40 I = 1, M CT( I ) = CT( I ) + DCONJG( A( K, I ) )*B( K, J ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( K, J ) ) 40 CONTINUE 50 CONTINUE ELSE DO 70 K = 1, KK DO 60 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( K, J ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( K, J ) ) 60 CONTINUE 70 CONTINUE END IF ELSE IF( .NOT.TRANA.AND.TRANB )THEN IF( CTRANB )THEN DO 90 K = 1, KK DO 80 I = 1, M CT( I ) = CT( I ) + A( I, K )*DCONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( I, K ) )* $ ABS1( B( J, K ) ) 80 CONTINUE 90 CONTINUE ELSE DO 110 K = 1, KK DO 100 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( J, K ) G( I ) = G( I ) + ABS1( A( I, K ) )* $ ABS1( B( J, K ) ) 100 CONTINUE 110 CONTINUE END IF ELSE IF( TRANA.AND.TRANB )THEN IF( CTRANA )THEN IF( CTRANB )THEN DO 130 K = 1, KK DO 120 I = 1, M CT( I ) = CT( I ) + DCONJG( A( K, I ) )* $ DCONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 120 CONTINUE 130 CONTINUE ELSE DO 150 K = 1, KK DO 140 I = 1, M CT( I ) = CT( I ) + DCONJG( A( K, I ) )* $ B( J, K ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 140 CONTINUE 150 CONTINUE END IF ELSE IF( CTRANB )THEN DO 170 K = 1, KK DO 160 I = 1, M CT( I ) = CT( I ) + A( K, I )* $ DCONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 160 CONTINUE 170 CONTINUE ELSE DO 190 K = 1, KK DO 180 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( J, K ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 180 CONTINUE 190 CONTINUE END IF END IF END IF DO 200 I = 1, M CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) G( I ) = ABS1( ALPHA )*G( I ) + $ ABS1( BETA )*ABS1( C( I, J ) ) 200 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO DO 210 I = 1, M ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS IF( G( I ).NE.RZERO ) $ ERRI = ERRI/G( I ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ GO TO 230 210 CONTINUE * 220 CONTINUE * * If the loop completes, all results are at least half accurate. GO TO 250 * * Report fatal error. * 230 FATAL = .TRUE. WRITE( NOUT, FMT = 9999 ) DO 240 I = 1, M IF( MV )THEN WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) ELSE WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) END IF 240 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9997 )J * 250 CONTINUE RETURN * 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', $ 'F ACCURATE *******', /' EXPECTED RE', $ 'SULT COMPUTED RESULT' ) 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) * * End of ZMMCH. 
* END LOGICAL FUNCTION LZE( RI, RJ, LR ) * * Tests if two arrays are identical. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LR * .. Array Arguments .. COMPLEX*16 RI( * ), RJ( * ) * .. Local Scalars .. INTEGER I * .. Executable Statements .. DO 10 I = 1, LR IF( RI( I ).NE.RJ( I ) ) $ GO TO 20 10 CONTINUE LZE = .TRUE. GO TO 30 20 CONTINUE LZE = .FALSE. 30 RETURN * * End of LZE. * END LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA ) * * Tests if selected elements in two arrays are equal. * * TYPE is 'ge' or 'he' or 'sy'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LDA, M, N CHARACTER*1 UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX*16 AA( LDA, * ), AS( LDA, * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL UPPER * .. Executable Statements .. UPPER = UPLO.EQ.'U' IF( TYPE.EQ.'ge' )THEN DO 20 J = 1, N DO 10 I = M + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 10 CONTINUE 20 CONTINUE ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy' )THEN DO 50 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 30 I = 1, IBEG - 1 IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 30 CONTINUE DO 40 I = IEND + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 40 CONTINUE 50 CONTINUE END IF * 60 CONTINUE LZERES = .TRUE. GO TO 80 70 CONTINUE LZERES = .FALSE. 80 RETURN * * End of LZERES. * END COMPLEX*16 FUNCTION ZBEG( RESET ) * * Generates complex numbers as pairs of random numbers uniformly * distributed between -0.5 and 0.5. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. LOGICAL RESET * .. Local Scalars .. INTEGER I, IC, J, MI, MJ * .. Save statement .. SAVE I, IC, J, MI, MJ * .. Intrinsic Functions .. INTRINSIC DCMPLX * .. Executable Statements .. IF( RESET )THEN * Initialize local variables. MI = 891 MJ = 457 I = 7 J = 7 IC = 0 RESET = .FALSE. END IF * * The sequence of values of I or J is bounded between 1 and 999. * If initial I or J = 1,2,3,6,7 or 9, the period will be 50. * If initial I or J = 4 or 8, the period will be 25. * If initial I or J = 5, the period will be 10. * IC is used to break up the period by skipping 1 value of I or J * in 6. * IC = IC + 1 10 I = I*MI J = J*MJ I = I - 1000*( I/1000 ) J = J - 1000*( J/1000 ) IF( IC.GE.5 )THEN IC = 0 GO TO 10 END IF ZBEG = DCMPLX( ( I - 500 )/1001.0D0, ( J - 500 )/1001.0D0 ) RETURN * * End of ZBEG. * END DOUBLE PRECISION FUNCTION DDIFF( X, Y ) * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. DOUBLE PRECISION X, Y * .. Executable Statements .. DDIFF = X - Y RETURN * * End of DDIFF. 
* END OpenBLAS-0.2.20/ctest/cblas_test.h000066400000000000000000000404311313527062700165640ustar00rootroot00000000000000/* * cblas_test.h * Written by Keita Teranishi */ #ifndef CBLAS_TEST_H #define CBLAS_TEST_H #include "cblas.h" #ifdef USE64BITINT #define int long #endif #define TRUE 1 #define PASSED 1 #define TEST_ROW_MJR 1 #define FALSE 0 #define FAILED 0 #define TEST_COL_MJR 0 #define INVALID -1 #define UNDEFINED -1 typedef struct { float real; float imag; } CBLAS_TEST_COMPLEX; typedef struct { double real; double imag; } CBLAS_TEST_ZOMPLEX; #if defined(ADD_) /* * Level 1 BLAS */ #define F77_srotg srotgtest_ #define F77_srotmg srotmgtest_ #define F77_srot srottest_ #define F77_srotm srotmtest_ #define F77_drotg drotgtest_ #define F77_drotmg drotmgtest_ #define F77_drot drottest_ #define F77_drotm drotmtest_ #define F77_sswap sswaptest_ #define F77_scopy scopytest_ #define F77_saxpy saxpytest_ #define F77_isamax isamaxtest_ #define F77_dswap dswaptest_ #define F77_dcopy dcopytest_ #define F77_daxpy daxpytest_ #define F77_idamax idamaxtest_ #define F77_cswap cswaptest_ #define F77_ccopy ccopytest_ #define F77_caxpy caxpytest_ #define F77_icamax icamaxtest_ #define F77_zswap zswaptest_ #define F77_zcopy zcopytest_ #define F77_zaxpy zaxpytest_ #define F77_izamax izamaxtest_ #define F77_sdot sdottest_ #define F77_ddot ddottest_ #define F77_dsdot dsdottest_ #define F77_sscal sscaltest_ #define F77_dscal dscaltest_ #define F77_cscal cscaltest_ #define F77_zscal zscaltest_ #define F77_csscal csscaltest_ #define F77_zdscal zdscaltest_ #define F77_cdotu cdotutest_ #define F77_cdotc cdotctest_ #define F77_zdotu zdotutest_ #define F77_zdotc zdotctest_ #define F77_snrm2 snrm2test_ #define F77_sasum sasumtest_ #define F77_dnrm2 dnrm2test_ #define F77_dasum dasumtest_ #define F77_scnrm2 scnrm2test_ #define F77_scasum scasumtest_ #define F77_dznrm2 dznrm2test_ #define F77_dzasum dzasumtest_ #define F77_sdsdot sdsdottest_ /* * Level 2 BLAS */ #define F77_s2chke cs2chke_ #define F77_d2chke cd2chke_ #define F77_c2chke cc2chke_ #define F77_z2chke cz2chke_ #define F77_ssymv cssymv_ #define F77_ssbmv cssbmv_ #define F77_sspmv csspmv_ #define F77_sger csger_ #define F77_ssyr cssyr_ #define F77_sspr csspr_ #define F77_ssyr2 cssyr2_ #define F77_sspr2 csspr2_ #define F77_dsymv cdsymv_ #define F77_dsbmv cdsbmv_ #define F77_dspmv cdspmv_ #define F77_dger cdger_ #define F77_dsyr cdsyr_ #define F77_dspr cdspr_ #define F77_dsyr2 cdsyr2_ #define F77_dspr2 cdspr2_ #define F77_chemv cchemv_ #define F77_chbmv cchbmv_ #define F77_chpmv cchpmv_ #define F77_cgeru ccgeru_ #define F77_cgerc ccgerc_ #define F77_cher ccher_ #define F77_chpr cchpr_ #define F77_cher2 ccher2_ #define F77_chpr2 cchpr2_ #define F77_zhemv czhemv_ #define F77_zhbmv czhbmv_ #define F77_zhpmv czhpmv_ #define F77_zgeru czgeru_ #define F77_zgerc czgerc_ #define F77_zher czher_ #define F77_zhpr czhpr_ #define F77_zher2 czher2_ #define F77_zhpr2 czhpr2_ #define F77_sgemv csgemv_ #define F77_sgbmv csgbmv_ #define F77_strmv cstrmv_ #define F77_stbmv cstbmv_ #define F77_stpmv cstpmv_ #define F77_strsv cstrsv_ #define F77_stbsv cstbsv_ #define F77_stpsv cstpsv_ #define F77_dgemv cdgemv_ #define F77_dgbmv cdgbmv_ #define F77_dtrmv cdtrmv_ #define F77_dtbmv cdtbmv_ #define F77_dtpmv cdtpmv_ #define F77_dtrsv cdtrsv_ #define F77_dtbsv cdtbsv_ #define F77_dtpsv cdtpsv_ #define F77_cgemv ccgemv_ #define F77_cgbmv ccgbmv_ #define F77_ctrmv cctrmv_ #define F77_ctbmv cctbmv_ #define F77_ctpmv cctpmv_ #define F77_ctrsv cctrsv_ #define F77_ctbsv cctbsv_ #define 
F77_ctpsv cctpsv_ #define F77_zgemv czgemv_ #define F77_zgbmv czgbmv_ #define F77_ztrmv cztrmv_ #define F77_ztbmv cztbmv_ #define F77_ztpmv cztpmv_ #define F77_ztrsv cztrsv_ #define F77_ztbsv cztbsv_ #define F77_ztpsv cztpsv_ /* * Level 3 BLAS */ #define F77_s3chke cs3chke_ #define F77_d3chke cd3chke_ #define F77_c3chke cc3chke_ #define F77_z3chke cz3chke_ #define F77_chemm cchemm_ #define F77_cherk ccherk_ #define F77_cher2k ccher2k_ #define F77_zhemm czhemm_ #define F77_zherk czherk_ #define F77_zher2k czher2k_ #define F77_sgemm csgemm_ #define F77_ssymm cssymm_ #define F77_ssyrk cssyrk_ #define F77_ssyr2k cssyr2k_ #define F77_strmm cstrmm_ #define F77_strsm cstrsm_ #define F77_dgemm cdgemm_ #define F77_dsymm cdsymm_ #define F77_dsyrk cdsyrk_ #define F77_dsyr2k cdsyr2k_ #define F77_dtrmm cdtrmm_ #define F77_dtrsm cdtrsm_ #define F77_cgemm ccgemm_ #define F77_cgemm3m ccgemm3m_ #define F77_csymm ccsymm_ #define F77_csyrk ccsyrk_ #define F77_csyr2k ccsyr2k_ #define F77_ctrmm cctrmm_ #define F77_ctrsm cctrsm_ #define F77_zgemm czgemm_ #define F77_zgemm3m czgemm3m_ #define F77_zsymm czsymm_ #define F77_zsyrk czsyrk_ #define F77_zsyr2k czsyr2k_ #define F77_ztrmm cztrmm_ #define F77_ztrsm cztrsm_ #elif defined(UPCASE) /* * Level 1 BLAS */ #define F77_srotg SROTGTEST #define F77_srotmg SROTMGTEST #define F77_srot SROTCTEST #define F77_srotm SROTMTEST #define F77_drotg DROTGTEST #define F77_drotmg DROTMGTEST #define F77_drot DROTTEST #define F77_drotm DROTMTEST #define F77_sswap SSWAPTEST #define F77_scopy SCOPYTEST #define F77_saxpy SAXPYTEST #define F77_isamax ISAMAXTEST #define F77_dswap DSWAPTEST #define F77_dcopy DCOPYTEST #define F77_daxpy DAXPYTEST #define F77_idamax IDAMAXTEST #define F77_cswap CSWAPTEST #define F77_ccopy CCOPYTEST #define F77_caxpy CAXPYTEST #define F77_icamax ICAMAXTEST #define F77_zswap ZSWAPTEST #define F77_zcopy ZCOPYTEST #define F77_zaxpy ZAXPYTEST #define F77_izamax IZAMAXTEST #define F77_sdot SDOTTEST #define F77_ddot DDOTTEST #define F77_dsdot DSDOTTEST #define F77_sscal SSCALTEST #define F77_dscal DSCALTEST #define F77_cscal CSCALTEST #define F77_zscal ZSCALTEST #define F77_csscal CSSCALTEST #define F77_zdscal ZDSCALTEST #define F77_cdotu CDOTUTEST #define F77_cdotc CDOTCTEST #define F77_zdotu ZDOTUTEST #define F77_zdotc ZDOTCTEST #define F77_snrm2 SNRM2TEST #define F77_sasum SASUMTEST #define F77_dnrm2 DNRM2TEST #define F77_dasum DASUMTEST #define F77_scnrm2 SCNRM2TEST #define F77_scasum SCASUMTEST #define F77_dznrm2 DZNRM2TEST #define F77_dzasum DZASUMTEST #define F77_sdsdot SDSDOTTEST /* * Level 2 BLAS */ #define F77_s2chke CS2CHKE #define F77_d2chke CD2CHKE #define F77_c2chke CC2CHKE #define F77_z2chke CZ2CHKE #define F77_ssymv CSSYMV #define F77_ssbmv CSSBMV #define F77_sspmv CSSPMV #define F77_sger CSGER #define F77_ssyr CSSYR #define F77_sspr CSSPR #define F77_ssyr2 CSSYR2 #define F77_sspr2 CSSPR2 #define F77_dsymv CDSYMV #define F77_dsbmv CDSBMV #define F77_dspmv CDSPMV #define F77_dger CDGER #define F77_dsyr CDSYR #define F77_dspr CDSPR #define F77_dsyr2 CDSYR2 #define F77_dspr2 CDSPR2 #define F77_chemv CCHEMV #define F77_chbmv CCHBMV #define F77_chpmv CCHPMV #define F77_cgeru CCGERU #define F77_cgerc CCGERC #define F77_cher CCHER #define F77_chpr CCHPR #define F77_cher2 CCHER2 #define F77_chpr2 CCHPR2 #define F77_zhemv CZHEMV #define F77_zhbmv CZHBMV #define F77_zhpmv CZHPMV #define F77_zgeru CZGERU #define F77_zgerc CZGERC #define F77_zher CZHER #define F77_zhpr CZHPR #define F77_zher2 CZHER2 #define F77_zhpr2 CZHPR2 #define F77_sgemv CSGEMV #define 
F77_sgbmv CSGBMV #define F77_strmv CSTRMV #define F77_stbmv CSTBMV #define F77_stpmv CSTPMV #define F77_strsv CSTRSV #define F77_stbsv CSTBSV #define F77_stpsv CSTPSV #define F77_dgemv CDGEMV #define F77_dgbmv CDGBMV #define F77_dtrmv CDTRMV #define F77_dtbmv CDTBMV #define F77_dtpmv CDTPMV #define F77_dtrsv CDTRSV #define F77_dtbsv CDTBSV #define F77_dtpsv CDTPSV #define F77_cgemv CCGEMV #define F77_cgbmv CCGBMV #define F77_ctrmv CCTRMV #define F77_ctbmv CCTBMV #define F77_ctpmv CCTPMV #define F77_ctrsv CCTRSV #define F77_ctbsv CCTBSV #define F77_ctpsv CCTPSV #define F77_zgemv CZGEMV #define F77_zgbmv CZGBMV #define F77_ztrmv CZTRMV #define F77_ztbmv CZTBMV #define F77_ztpmv CZTPMV #define F77_ztrsv CZTRSV #define F77_ztbsv CZTBSV #define F77_ztpsv CZTPSV /* * Level 3 BLAS */ #define F77_s3chke CS3CHKE #define F77_d3chke CD3CHKE #define F77_c3chke CC3CHKE #define F77_z3chke CZ3CHKE #define F77_chemm CCHEMM #define F77_cherk CCHERK #define F77_cher2k CCHER2K #define F77_zhemm CZHEMM #define F77_zherk CZHERK #define F77_zher2k CZHER2K #define F77_sgemm CSGEMM #define F77_ssymm CSSYMM #define F77_ssyrk CSSYRK #define F77_ssyr2k CSSYR2K #define F77_strmm CSTRMM #define F77_strsm CSTRSM #define F77_dgemm CDGEMM #define F77_dsymm CDSYMM #define F77_dsyrk CDSYRK #define F77_dsyr2k CDSYR2K #define F77_dtrmm CDTRMM #define F77_dtrsm CDTRSM #define F77_cgemm CCGEMM #define F77_cgemm3m CCGEMM3M #define F77_csymm CCSYMM #define F77_csyrk CCSYRK #define F77_csyr2k CCSYR2K #define F77_ctrmm CCTRMM #define F77_ctrsm CCTRSM #define F77_zgemm CZGEMM #define F77_zgemm3m CZGEMM3M #define F77_zsymm CZSYMM #define F77_zsyrk CZSYRK #define F77_zsyr2k CZSYR2K #define F77_ztrmm CZTRMM #define F77_ztrsm CZTRSM #elif defined(NOCHANGE) /* * Level 1 BLAS */ #define F77_srotg srotgtest #define F77_srotmg srotmgtest #define F77_srot srottest #define F77_srotm srotmtest #define F77_drotg drotgtest #define F77_drotmg drotmgtest #define F77_drot drottest #define F77_drotm drotmtest #define F77_sswap sswaptest #define F77_scopy scopytest #define F77_saxpy saxpytest #define F77_isamax isamaxtest #define F77_dswap dswaptest #define F77_dcopy dcopytest #define F77_daxpy daxpytest #define F77_idamax idamaxtest #define F77_cswap cswaptest #define F77_ccopy ccopytest #define F77_caxpy caxpytest #define F77_icamax icamaxtest #define F77_zswap zswaptest #define F77_zcopy zcopytest #define F77_zaxpy zaxpytest #define F77_izamax izamaxtest #define F77_sdot sdottest #define F77_ddot ddottest #define F77_dsdot dsdottest #define F77_sscal sscaltest #define F77_dscal dscaltest #define F77_cscal cscaltest #define F77_zscal zscaltest #define F77_csscal csscaltest #define F77_zdscal zdscaltest #define F77_cdotu cdotutest #define F77_cdotc cdotctest #define F77_zdotu zdotutest #define F77_zdotc zdotctest #define F77_snrm2 snrm2test #define F77_sasum sasumtest #define F77_dnrm2 dnrm2test #define F77_dasum dasumtest #define F77_scnrm2 scnrm2test #define F77_scasum scasumtest #define F77_dznrm2 dznrm2test #define F77_dzasum dzasumtest #define F77_sdsdot sdsdottest /* * Level 2 BLAS */ #define F77_s2chke cs2chke #define F77_d2chke cd2chke #define F77_c2chke cc2chke #define F77_z2chke cz2chke #define F77_ssymv cssymv #define F77_ssbmv cssbmv #define F77_sspmv csspmv #define F77_sger csger #define F77_ssyr cssyr #define F77_sspr csspr #define F77_ssyr2 cssyr2 #define F77_sspr2 csspr2 #define F77_dsymv cdsymv #define F77_dsbmv cdsbmv #define F77_dspmv cdspmv #define F77_dger cdger #define F77_dsyr cdsyr #define F77_dspr cdspr #define F77_dsyr2 
cdsyr2 #define F77_dspr2 cdspr2 #define F77_chemv cchemv #define F77_chbmv cchbmv #define F77_chpmv cchpmv #define F77_cgeru ccgeru #define F77_cgerc ccgerc #define F77_cher ccher #define F77_chpr cchpr #define F77_cher2 ccher2 #define F77_chpr2 cchpr2 #define F77_zhemv czhemv #define F77_zhbmv czhbmv #define F77_zhpmv czhpmv #define F77_zgeru czgeru #define F77_zgerc czgerc #define F77_zher czher #define F77_zhpr czhpr #define F77_zher2 czher2 #define F77_zhpr2 czhpr2 #define F77_sgemv csgemv #define F77_sgbmv csgbmv #define F77_strmv cstrmv #define F77_stbmv cstbmv #define F77_stpmv cstpmv #define F77_strsv cstrsv #define F77_stbsv cstbsv #define F77_stpsv cstpsv #define F77_dgemv cdgemv #define F77_dgbmv cdgbmv #define F77_dtrmv cdtrmv #define F77_dtbmv cdtbmv #define F77_dtpmv cdtpmv #define F77_dtrsv cdtrsv #define F77_dtbsv cdtbsv #define F77_dtpsv cdtpsv #define F77_cgemv ccgemv #define F77_cgbmv ccgbmv #define F77_ctrmv cctrmv #define F77_ctbmv cctbmv #define F77_ctpmv cctpmv #define F77_ctrsv cctrsv #define F77_ctbsv cctbsv #define F77_ctpsv cctpsv #define F77_zgemv czgemv #define F77_zgbmv czgbmv #define F77_ztrmv cztrmv #define F77_ztbmv cztbmv #define F77_ztpmv cztpmv #define F77_ztrsv cztrsv #define F77_ztbsv cztbsv #define F77_ztpsv cztpsv /* * Level 3 BLAS */ #define F77_s3chke cs3chke #define F77_d3chke cd3chke #define F77_c3chke cc3chke #define F77_z3chke cz3chke #define F77_chemm cchemm #define F77_cherk ccherk #define F77_cher2k ccher2k #define F77_zhemm czhemm #define F77_zherk czherk #define F77_zher2k czher2k #define F77_sgemm csgemm #define F77_ssymm cssymm #define F77_ssyrk cssyrk #define F77_ssyr2k cssyr2k #define F77_strmm cstrmm #define F77_strsm cstrsm #define F77_dgemm cdgemm #define F77_dsymm cdsymm #define F77_dsyrk cdsyrk #define F77_dsyr2k cdsyr2k #define F77_dtrmm cdtrmm #define F77_dtrsm cdtrsm #define F77_cgemm ccgemm #define F77_cgemm3m ccgemm3m #define F77_csymm ccsymm #define F77_csyrk ccsyrk #define F77_csyr2k ccsyr2k #define F77_ctrmm cctrmm #define F77_ctrsm cctrsm #define F77_zgemm czgemm #define F77_zgemm3m czgemm3m #define F77_zsymm czsymm #define F77_zsyrk czsyrk #define F77_zsyr2k czsyr2k #define F77_ztrmm cztrmm #define F77_ztrsm cztrsm #endif void get_transpose_type(char *type, enum CBLAS_TRANSPOSE *trans); void get_uplo_type(char *type, enum CBLAS_UPLO *uplo); void get_diag_type(char *type, enum CBLAS_DIAG *diag); void get_side_type(char *type, enum CBLAS_SIDE *side); #endif /* CBLAS_TEST_H */ OpenBLAS-0.2.20/ctest/cin2000066400000000000000000000031411313527062700150430ustar00rootroot00000000000000'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED) 16.0 THRESHOLD VALUE OF TEST RATIO 7 NUMBER OF VALUES OF N 0 1 2 3 5 9 63 VALUES OF N 4 NUMBER OF VALUES OF K 0 1 2 4 VALUES OF K 4 NUMBER OF VALUES OF INCX AND INCY 1 2 -1 -2 VALUES OF INCX AND INCY 3 NUMBER OF VALUES OF ALPHA (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA 3 NUMBER OF VALUES OF BETA (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA cblas_cgemv T PUT F FOR NO TEST. SAME COLUMNS. cblas_cgbmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_chemv T PUT F FOR NO TEST. SAME COLUMNS. cblas_chbmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_chpmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_ctrmv T PUT F FOR NO TEST. SAME COLUMNS. 
cblas_ctbmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_ctpmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_ctrsv T PUT F FOR NO TEST. SAME COLUMNS. cblas_ctbsv T PUT F FOR NO TEST. SAME COLUMNS. cblas_ctpsv T PUT F FOR NO TEST. SAME COLUMNS. cblas_cgerc T PUT F FOR NO TEST. SAME COLUMNS. cblas_cgeru T PUT F FOR NO TEST. SAME COLUMNS. cblas_cher T PUT F FOR NO TEST. SAME COLUMNS. cblas_chpr T PUT F FOR NO TEST. SAME COLUMNS. cblas_cher2 T PUT F FOR NO TEST. SAME COLUMNS. cblas_chpr2 T PUT F FOR NO TEST. SAME COLUMNS. OpenBLAS-0.2.20/ctest/cin3000066400000000000000000000020651313527062700150500ustar00rootroot00000000000000'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO 6 NUMBER OF VALUES OF N 0 1 2 3 5 9 35 VALUES OF N 3 NUMBER OF VALUES OF ALPHA (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA 3 NUMBER OF VALUES OF BETA (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA cblas_cgemm T PUT F FOR NO TEST. SAME COLUMNS. cblas_chemm T PUT F FOR NO TEST. SAME COLUMNS. cblas_csymm T PUT F FOR NO TEST. SAME COLUMNS. cblas_ctrmm T PUT F FOR NO TEST. SAME COLUMNS. cblas_ctrsm T PUT F FOR NO TEST. SAME COLUMNS. cblas_cherk T PUT F FOR NO TEST. SAME COLUMNS. cblas_csyrk T PUT F FOR NO TEST. SAME COLUMNS. cblas_cher2k T PUT F FOR NO TEST. SAME COLUMNS. cblas_csyr2k T PUT F FOR NO TEST. SAME COLUMNS. OpenBLAS-0.2.20/ctest/cin3_3m000066400000000000000000000020761313527062700154510ustar00rootroot00000000000000'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO 6 NUMBER OF VALUES OF N 0 1 2 3 5 9 35 VALUES OF N 3 NUMBER OF VALUES OF ALPHA (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA 3 NUMBER OF VALUES OF BETA (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA cblas_cgemm3m T PUT F FOR NO TEST. SAME COLUMNS. cblas_chemm F PUT F FOR NO TEST. SAME COLUMNS. cblas_csymm F PUT F FOR NO TEST. SAME COLUMNS. cblas_ctrmm F PUT F FOR NO TEST. SAME COLUMNS. cblas_ctrsm F PUT F FOR NO TEST. SAME COLUMNS. cblas_cherk F PUT F FOR NO TEST. SAME COLUMNS. cblas_csyrk F PUT F FOR NO TEST. SAME COLUMNS. cblas_cher2k F PUT F FOR NO TEST. SAME COLUMNS. cblas_csyr2k F PUT F FOR NO TEST. SAME COLUMNS. OpenBLAS-0.2.20/ctest/constant.c000066400000000000000000000000501313527062700162560ustar00rootroot00000000000000int CBLAS_CallFromC; int RowMajorStrg; OpenBLAS-0.2.20/ctest/din2000066400000000000000000000030031313527062700150410ustar00rootroot00000000000000'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO 7 NUMBER OF VALUES OF N 0 1 2 3 5 9 63 VALUES OF N 4 NUMBER OF VALUES OF K 0 1 2 4 VALUES OF K 4 NUMBER OF VALUES OF INCX AND INCY 1 2 -1 -2 VALUES OF INCX AND INCY 3 NUMBER OF VALUES OF ALPHA 0.0 1.0 0.7 VALUES OF ALPHA 3 NUMBER OF VALUES OF BETA 0.0 1.0 0.9 VALUES OF BETA cblas_dgemv T PUT F FOR NO TEST. SAME COLUMNS. 
cblas_dgbmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_dsymv T PUT F FOR NO TEST. SAME COLUMNS. cblas_dsbmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_dspmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_dtrmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_dtbmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_dtpmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_dtrsv T PUT F FOR NO TEST. SAME COLUMNS. cblas_dtbsv T PUT F FOR NO TEST. SAME COLUMNS. cblas_dtpsv T PUT F FOR NO TEST. SAME COLUMNS. cblas_dger T PUT F FOR NO TEST. SAME COLUMNS. cblas_dsyr T PUT F FOR NO TEST. SAME COLUMNS. cblas_dspr T PUT F FOR NO TEST. SAME COLUMNS. cblas_dsyr2 T PUT F FOR NO TEST. SAME COLUMNS. cblas_dspr2 T PUT F FOR NO TEST. SAME COLUMNS. OpenBLAS-0.2.20/ctest/din3000066400000000000000000000015771313527062700150600ustar00rootroot00000000000000'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO 6 NUMBER OF VALUES OF N 1 2 3 5 7 9 35 VALUES OF N 3 NUMBER OF VALUES OF ALPHA 0.0 1.0 0.7 VALUES OF ALPHA 3 NUMBER OF VALUES OF BETA 0.0 1.0 1.3 VALUES OF BETA cblas_dgemm T PUT F FOR NO TEST. SAME COLUMNS. cblas_dsymm T PUT F FOR NO TEST. SAME COLUMNS. cblas_dtrmm T PUT F FOR NO TEST. SAME COLUMNS. cblas_dtrsm T PUT F FOR NO TEST. SAME COLUMNS. cblas_dsyrk T PUT F FOR NO TEST. SAME COLUMNS. cblas_dsyr2k T PUT F FOR NO TEST. SAME COLUMNS. OpenBLAS-0.2.20/ctest/sin2000066400000000000000000000030131313527062700150610ustar00rootroot00000000000000'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED) 16.0 THRESHOLD VALUE OF TEST RATIO 7 NUMBER OF VALUES OF N 0 1 2 3 5 9 63 VALUES OF N 4 NUMBER OF VALUES OF K 0 1 2 4 VALUES OF K 4 NUMBER OF VALUES OF INCX AND INCY 1 2 -1 -2 VALUES OF INCX AND INCY 3 NUMBER OF VALUES OF ALPHA 0.0 1.0 0.7 VALUES OF ALPHA 3 NUMBER OF VALUES OF BETA 0.0 1.0 0.9 VALUES OF BETA cblas_sgemv T PUT F FOR NO TEST. SAME COLUMNS. cblas_sgbmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_ssymv T PUT F FOR NO TEST. SAME COLUMNS. cblas_ssbmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_sspmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_strmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_stbmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_stpmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_strsv T PUT F FOR NO TEST. SAME COLUMNS. cblas_stbsv T PUT F FOR NO TEST. SAME COLUMNS. cblas_stpsv T PUT F FOR NO TEST. SAME COLUMNS. cblas_sger T PUT F FOR NO TEST. SAME COLUMNS. cblas_ssyr T PUT F FOR NO TEST. SAME COLUMNS. cblas_sspr T PUT F FOR NO TEST. SAME COLUMNS. cblas_ssyr2 T PUT F FOR NO TEST. SAME COLUMNS. cblas_sspr2 T PUT F FOR NO TEST. SAME COLUMNS. OpenBLAS-0.2.20/ctest/sin3000066400000000000000000000015771313527062700150770ustar00rootroot00000000000000'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO 6 NUMBER OF VALUES OF N 0 1 2 3 5 9 35 VALUES OF N 3 NUMBER OF VALUES OF ALPHA 0.0 1.0 0.7 VALUES OF ALPHA 3 NUMBER OF VALUES OF BETA 0.0 1.0 1.3 VALUES OF BETA cblas_sgemm T PUT F FOR NO TEST. SAME COLUMNS. cblas_ssymm T PUT F FOR NO TEST. SAME COLUMNS. cblas_strmm T PUT F FOR NO TEST. SAME COLUMNS. cblas_strsm T PUT F FOR NO TEST. SAME COLUMNS. cblas_ssyrk T PUT F FOR NO TEST. SAME COLUMNS. cblas_ssyr2k T PUT F FOR NO TEST. SAME COLUMNS. OpenBLAS-0.2.20/ctest/zin2000066400000000000000000000031411313527062700150720ustar00rootroot00000000000000'ZBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED) 16.0 THRESHOLD VALUE OF TEST RATIO 7 NUMBER OF VALUES OF N 0 1 2 3 5 9 63 VALUES OF N 4 NUMBER OF VALUES OF K 0 1 2 4 VALUES OF K 4 NUMBER OF VALUES OF INCX AND INCY 1 2 -1 -2 VALUES OF INCX AND INCY 3 NUMBER OF VALUES OF ALPHA (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA 3 NUMBER OF VALUES OF BETA (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA cblas_zgemv T PUT F FOR NO TEST. SAME COLUMNS. cblas_zgbmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_zhemv T PUT F FOR NO TEST. SAME COLUMNS. cblas_zhbmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_zhpmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_ztrmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_ztbmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_ztpmv T PUT F FOR NO TEST. SAME COLUMNS. cblas_ztrsv T PUT F FOR NO TEST. SAME COLUMNS. cblas_ztbsv T PUT F FOR NO TEST. SAME COLUMNS. cblas_ztpsv T PUT F FOR NO TEST. SAME COLUMNS. cblas_zgerc T PUT F FOR NO TEST. SAME COLUMNS. cblas_zgeru T PUT F FOR NO TEST. SAME COLUMNS. cblas_zher T PUT F FOR NO TEST. SAME COLUMNS. cblas_zhpr T PUT F FOR NO TEST. SAME COLUMNS. cblas_zher2 T PUT F FOR NO TEST. SAME COLUMNS. cblas_zhpr2 T PUT F FOR NO TEST. SAME COLUMNS. OpenBLAS-0.2.20/ctest/zin3000066400000000000000000000020651313527062700150770ustar00rootroot00000000000000'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO 7 NUMBER OF VALUES OF N 0 1 2 3 5 9 35 VALUES OF N 3 NUMBER OF VALUES OF ALPHA (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA 3 NUMBER OF VALUES OF BETA (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA cblas_zgemm T PUT F FOR NO TEST. SAME COLUMNS. cblas_zhemm T PUT F FOR NO TEST. SAME COLUMNS. cblas_zsymm T PUT F FOR NO TEST. SAME COLUMNS. cblas_ztrmm T PUT F FOR NO TEST. SAME COLUMNS. cblas_ztrsm T PUT F FOR NO TEST. SAME COLUMNS. cblas_zherk T PUT F FOR NO TEST. SAME COLUMNS. cblas_zsyrk T PUT F FOR NO TEST. SAME COLUMNS. cblas_zher2k T PUT F FOR NO TEST. SAME COLUMNS. cblas_zsyr2k T PUT F FOR NO TEST. SAME COLUMNS. OpenBLAS-0.2.20/ctest/zin3_3m000066400000000000000000000020761313527062700155000ustar00rootroot00000000000000'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO 7 NUMBER OF VALUES OF N 0 1 2 3 5 9 35 VALUES OF N 3 NUMBER OF VALUES OF ALPHA (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA 3 NUMBER OF VALUES OF BETA (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA cblas_zgemm3m T PUT F FOR NO TEST. SAME COLUMNS. cblas_zhemm F PUT F FOR NO TEST. SAME COLUMNS. cblas_zsymm F PUT F FOR NO TEST. SAME COLUMNS. cblas_ztrmm F PUT F FOR NO TEST. SAME COLUMNS. cblas_ztrsm F PUT F FOR NO TEST. SAME COLUMNS. cblas_zherk F PUT F FOR NO TEST. SAME COLUMNS. cblas_zsyrk F PUT F FOR NO TEST. SAME COLUMNS. cblas_zher2k F PUT F FOR NO TEST. SAME COLUMNS. cblas_zsyr2k F PUT F FOR NO TEST. SAME COLUMNS. OpenBLAS-0.2.20/ctest1.c000066400000000000000000000000361313527062700145120ustar00rootroot00000000000000int hogehoge(void){return 0;} OpenBLAS-0.2.20/ctest2.c000066400000000000000000000000321313527062700145070ustar00rootroot00000000000000int main(void){return 0;} OpenBLAS-0.2.20/driver/000077500000000000000000000000001313527062700144375ustar00rootroot00000000000000OpenBLAS-0.2.20/driver/level2/000077500000000000000000000000001313527062700156305ustar00rootroot00000000000000OpenBLAS-0.2.20/driver/level2/CMakeLists.txt000066400000000000000000000235711313527062700204000ustar00rootroot00000000000000 include_directories(${PROJECT_SOURCE_DIR}) # sources that need to be compiled twice, once with no flags and once with LOWER set(UL_SOURCES sbmv_k.c spmv_k.c spr_k.c spr2_k.c syr_k.c syr2_k.c ) # sources that need to be compiled several times, for UNIT, TRANSA set(U_SOURCES trmv_U.c tbmv_U.c tbsv_U.c tpmv_U.c tpsv_U.c trsv_U.c ) set(L_SOURCES trmv_L.c tbmv_L.c tbsv_L.c tpmv_L.c tpsv_L.c trsv_L.c ) set(UL_SMP_SOURCES symv_thread.c syr_thread.c syr2_thread.c spr_thread.c spr2_thread.c spmv_thread.c sbmv_thread.c ) set(NU_SMP_SOURCES trmv_thread.c tpmv_thread.c tbmv_thread.c ) set(ULVM_COMPLEX_SOURCES hbmv_k.c hpmv_k.c hpr_k.c hpr2_k.c her_k.c her2_k.c ) # objects that need LOWER set GenerateCombinationObjects("${UL_SOURCES}" "LOWER" "U" "" 1 "" "" 3) # gbmv uses a lowercase n and t GenerateNamedObjects("gbmv_k.c" "" "gbmv_n" false "" "" "" 3) GenerateNamedObjects("gbmv_k.c" "TRANS" "gbmv_t" false "" "" "" 3) # c/zgbmv GenerateNamedObjects("zgbmv_k.c" "CONJ" "gbmv_r" false "" "" "" 2) GenerateNamedObjects("zgbmv_k.c" "TRANS;CONJ" "gbmv_c" false "" "" "" 2) GenerateNamedObjects("zgbmv_k.c" "XCONJ" "gbmv_o" false "" "" "" 2) GenerateNamedObjects("zgbmv_k.c" "TRANS;XCONJ" "gbmv_u" false "" "" "" 2) GenerateNamedObjects("zgbmv_k.c" "CONJ;XCONJ" "gbmv_s" false "" "" "" 2) GenerateNamedObjects("zgbmv_k.c" "TRANS;CONJ;XCONJ" "gbmv_d" false "" "" "" 2) # special defines for complex foreach (float_type ${FLOAT_TYPES}) if (SMP) GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false ${float_type}) GenerateNamedObjects("gemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false ${float_type}) GenerateNamedObjects("gbmv_thread.c" "" "gbmv_thread_n" false "" "" false ${float_type}) GenerateNamedObjects("gbmv_thread.c" "TRANSA" "gbmv_thread_t" false "" "" false ${float_type}) endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") foreach (u_source ${U_SOURCES}) string(REGEX MATCH "[a-z]+" op_name ${u_source}) GenerateCombinationObjects("z${u_source}" "UNIT" "N" "TRANSA=1" 0 "${op_name}_NU" false ${float_type}) GenerateCombinationObjects("z${u_source}" "UNIT" "N" "TRANSA=2" 0 "${op_name}_TL" false ${float_type}) GenerateCombinationObjects("z${u_source}" "UNIT" "N" 
"TRANSA=3" 0 "${op_name}_RU" false ${float_type}) GenerateCombinationObjects("z${u_source}" "UNIT" "N" "TRANSA=4" 0 "${op_name}_CL" false ${float_type}) endforeach () foreach (l_source ${L_SOURCES}) string(REGEX MATCH "[a-z]+" op_name ${l_source}) GenerateCombinationObjects("z${l_source}" "UNIT" "N" "TRANSA=1" 0 "${op_name}_NL" false ${float_type}) GenerateCombinationObjects("z${l_source}" "UNIT" "N" "TRANSA=2" 0 "${op_name}_TU" false ${float_type}) GenerateCombinationObjects("z${l_source}" "UNIT" "N" "TRANSA=3" 0 "${op_name}_RL" false ${float_type}) GenerateCombinationObjects("z${l_source}" "UNIT" "N" "TRANSA=4" 0 "${op_name}_CU" false ${float_type}) endforeach () foreach (ulvm_source ${ULVM_COMPLEX_SOURCES}) string(REGEX MATCH "[a-z0-9]+" op_name ${ulvm_source}) GenerateNamedObjects("z${ulvm_source}" "" "${op_name}_U" false "" "" false ${float_type}) GenerateNamedObjects("z${ulvm_source}" "LOWER" "${op_name}_L" false "" "" false ${float_type}) GenerateNamedObjects("z${ulvm_source}" "HEMVREV" "${op_name}_V" false "" "" false ${float_type}) GenerateNamedObjects("z${ulvm_source}" "LOWER;HEMVREV" "${op_name}_M" false "" "" false ${float_type}) endforeach() if (SMP) GenerateNamedObjects("gemv_thread.c" "CONJ" "gemv_thread_r" false "" "" false ${float_type}) GenerateNamedObjects("gemv_thread.c" "CONJ;TRANSA" "gemv_thread_c" false "" "" false ${float_type}) GenerateNamedObjects("gemv_thread.c" "XCONJ" "gemv_thread_o" false "" "" false ${float_type}) GenerateNamedObjects("gemv_thread.c" "XCONJ;TRANSA" "gemv_thread_u" false "" "" false ${float_type}) GenerateNamedObjects("gemv_thread.c" "XCONJ;CONJ" "gemv_thread_s" false "" "" false ${float_type}) GenerateNamedObjects("gemv_thread.c" "XCONJ;CONJ;TRANSA" "gemv_thread_d" false "" "" false ${float_type}) GenerateNamedObjects("gbmv_thread.c" "CONJ" "gbmv_thread_r" false "" "" false ${float_type}) GenerateNamedObjects("gbmv_thread.c" "CONJ;TRANSA" "gbmv_thread_c" false "" "" false ${float_type}) GenerateNamedObjects("gbmv_thread.c" "XCONJ" "gbmv_thread_o" false "" "" false ${float_type}) GenerateNamedObjects("gbmv_thread.c" "XCONJ;TRANSA" "gbmv_thread_u" false "" "" false ${float_type}) GenerateNamedObjects("gbmv_thread.c" "XCONJ;CONJ" "gbmv_thread_s" false "" "" false ${float_type}) GenerateNamedObjects("gbmv_thread.c" "XCONJ;CONJ;TRANSA" "gbmv_thread_d" false "" "" false ${float_type}) GenerateNamedObjects("ger_thread.c" "" "ger_thread_U" false "" "" false ${float_type}) GenerateNamedObjects("ger_thread.c" "CONJ" "ger_thread_C" false "" "" false ${float_type}) GenerateNamedObjects("ger_thread.c" "XCONJ" "ger_thread_V" false "" "" false ${float_type}) GenerateNamedObjects("ger_thread.c" "XCONJ;CONJ" "ger_thread_D" false "" "" false ${float_type}) GenerateNamedObjects("sbmv_thread.c" "HEMV" "hbmv_thread_U" false "" "" false ${float_type}) GenerateNamedObjects("sbmv_thread.c" "HEMV;LOWER" "hbmv_thread_L" false "" "" false ${float_type}) GenerateNamedObjects("sbmv_thread.c" "HEMVREV" "hbmv_thread_V" false "" "" false ${float_type}) GenerateNamedObjects("sbmv_thread.c" "LOWER;HEMVREV" "hbmv_thread_M" false "" "" false ${float_type}) GenerateNamedObjects("spmv_thread.c" "HEMV" "hpmv_thread_U" false "" "" false ${float_type}) GenerateNamedObjects("spmv_thread.c" "HEMV;LOWER" "hpmv_thread_L" false "" "" false ${float_type}) GenerateNamedObjects("spmv_thread.c" "HEMVREV" "hpmv_thread_V" false "" "" false ${float_type}) GenerateNamedObjects("spmv_thread.c" "LOWER;HEMVREV" "hpmv_thread_M" false "" "" false ${float_type}) GenerateNamedObjects("spr_thread.c" 
"HEMV" "hpr_thread_U" false "" "" false ${float_type}) GenerateNamedObjects("spr_thread.c" "HEMV;LOWER" "hpr_thread_L" false "" "" false ${float_type}) GenerateNamedObjects("spr_thread.c" "HEMVREV" "hpr_thread_V" false "" "" false ${float_type}) GenerateNamedObjects("spr_thread.c" "LOWER;HEMVREV" "hpr_thread_M" false "" "" false ${float_type}) GenerateNamedObjects("spr2_thread.c" "HEMV" "hpr2_thread_U" false "" "" false ${float_type}) GenerateNamedObjects("spr2_thread.c" "HEMV;LOWER" "hpr2_thread_L" false "" "" false ${float_type}) GenerateNamedObjects("spr2_thread.c" "HEMVREV" "hpr2_thread_V" false "" "" false ${float_type}) GenerateNamedObjects("spr2_thread.c" "LOWER;HEMVREV" "hpr2_thread_M" false "" "" false ${float_type}) GenerateNamedObjects("symv_thread.c" "HEMV" "hemv_thread_U" false "" "" false ${float_type}) GenerateNamedObjects("symv_thread.c" "HEMV;LOWER" "hemv_thread_L" false "" "" false ${float_type}) GenerateNamedObjects("symv_thread.c" "HEMVREV" "hemv_thread_V" false "" "" false ${float_type}) GenerateNamedObjects("symv_thread.c" "LOWER;HEMVREV" "hemv_thread_M" false "" "" false ${float_type}) GenerateNamedObjects("syr_thread.c" "HER" "her_thread_U" false "" "" false ${float_type}) GenerateNamedObjects("syr_thread.c" "HER;LOWER" "her_thread_L" false "" "" false ${float_type}) GenerateNamedObjects("syr_thread.c" "HERREV" "her_thread_V" false "" "" false ${float_type}) GenerateNamedObjects("syr_thread.c" "LOWER;HERREV" "her_thread_M" false "" "" false ${float_type}) GenerateNamedObjects("syr2_thread.c" "HER" "her2_thread_U" false "" "" false ${float_type}) GenerateNamedObjects("syr2_thread.c" "HER;LOWER" "her2_thread_L" false "" "" false ${float_type}) GenerateNamedObjects("syr2_thread.c" "HERREV" "her2_thread_V" false "" "" false ${float_type}) GenerateNamedObjects("syr2_thread.c" "LOWER;HERREV" "her2_thread_M" false "" "" false ${float_type}) foreach (nu_smp_src ${NU_SMP_SOURCES}) string(REGEX MATCH "[a-z]+_[a-z]+" op_name ${nu_smp_src}) GenerateCombinationObjects("${nu_smp_src}" "LOWER;UNIT" "U;N" "TRANSA=1" 0 "${op_name}_N" false ${float_type}) GenerateCombinationObjects("${nu_smp_src}" "LOWER;UNIT" "U;N" "TRANSA=2" 0 "${op_name}_T" false ${float_type}) GenerateCombinationObjects("${nu_smp_src}" "LOWER;UNIT" "U;N" "TRANSA=3" 0 "${op_name}_R" false ${float_type}) GenerateCombinationObjects("${nu_smp_src}" "LOWER;UNIT" "U;N" "TRANSA=4" 0 "${op_name}_C" false ${float_type}) endforeach () endif () else () # For real number functions foreach (u_source ${U_SOURCES}) string(REGEX MATCH "[a-z]+" op_name ${u_source}) GenerateCombinationObjects("${u_source}" "UNIT" "N" "" 0 "${op_name}_NU" false ${float_type}) GenerateCombinationObjects("${u_source}" "UNIT" "N" "TRANSA" 0 "${op_name}_TL" false ${float_type}) endforeach () foreach (l_source ${L_SOURCES}) string(REGEX MATCH "[a-z]+" op_name ${l_source}) GenerateCombinationObjects("${l_source}" "UNIT" "N" "" 0 "${op_name}_NL" false ${float_type}) GenerateCombinationObjects("${l_source}" "UNIT" "N" "TRANSA" 0 "${op_name}_TU" false ${float_type}) endforeach () if (SMP) GenerateNamedObjects("ger_thread.c" "" "" false "" "" false ${float_type}) foreach(nu_smp_source ${NU_SMP_SOURCES}) string(REGEX MATCH "[a-z]+_[a-z]+" op_name ${nu_smp_source}) GenerateCombinationObjects("${nu_smp_source}" "LOWER;UNIT" "U;N" "" 0 "${op_name}_N" false ${float_type}) GenerateCombinationObjects("${nu_smp_source}" "LOWER;UNIT" "U;N" "TRANSA" 0 "${op_name}_T" false ${float_type}) endforeach() endif () endif () endforeach () if (SMP) 
GenerateCombinationObjects("${UL_SMP_SOURCES}" "LOWER" "U" "" 2) endif () add_library(driver_level2 OBJECT ${OPENBLAS_SRC}) OpenBLAS-0.2.20/driver/level2/Makefile000066400000000000000000005334421313527062700173030ustar00rootroot00000000000000TOPDIR = ../.. include ../../Makefile.system SBLASOBJS = \ sgbmv_n.$(SUFFIX) sgbmv_t.$(SUFFIX) \ ssbmv_U.$(SUFFIX) ssbmv_L.$(SUFFIX) sspmv_U.$(SUFFIX) sspmv_L.$(SUFFIX) \ sspr_U.$(SUFFIX) sspr_L.$(SUFFIX) sspr2_U.$(SUFFIX) sspr2_L.$(SUFFIX) \ ssyr_U.$(SUFFIX) ssyr_L.$(SUFFIX) ssyr2_U.$(SUFFIX) ssyr2_L.$(SUFFIX) \ stbmv_NUU.$(SUFFIX) stbmv_NUN.$(SUFFIX) stbmv_NLU.$(SUFFIX) stbmv_NLN.$(SUFFIX) \ stbmv_TUU.$(SUFFIX) stbmv_TUN.$(SUFFIX) stbmv_TLU.$(SUFFIX) stbmv_TLN.$(SUFFIX) \ stbsv_NUU.$(SUFFIX) stbsv_NUN.$(SUFFIX) stbsv_NLU.$(SUFFIX) stbsv_NLN.$(SUFFIX) \ stbsv_TUU.$(SUFFIX) stbsv_TUN.$(SUFFIX) stbsv_TLU.$(SUFFIX) stbsv_TLN.$(SUFFIX) \ stpmv_NUU.$(SUFFIX) stpmv_NUN.$(SUFFIX) stpmv_NLU.$(SUFFIX) stpmv_NLN.$(SUFFIX) \ stpmv_TUU.$(SUFFIX) stpmv_TUN.$(SUFFIX) stpmv_TLU.$(SUFFIX) stpmv_TLN.$(SUFFIX) \ stpsv_NUU.$(SUFFIX) stpsv_NUN.$(SUFFIX) stpsv_NLU.$(SUFFIX) stpsv_NLN.$(SUFFIX) \ stpsv_TUU.$(SUFFIX) stpsv_TUN.$(SUFFIX) stpsv_TLU.$(SUFFIX) stpsv_TLN.$(SUFFIX) \ strmv_NUU.$(SUFFIX) strmv_NUN.$(SUFFIX) strmv_NLU.$(SUFFIX) strmv_NLN.$(SUFFIX) \ strmv_TUU.$(SUFFIX) strmv_TUN.$(SUFFIX) strmv_TLU.$(SUFFIX) strmv_TLN.$(SUFFIX) \ strsv_NUU.$(SUFFIX) strsv_NUN.$(SUFFIX) strsv_NLU.$(SUFFIX) strsv_NLN.$(SUFFIX) \ strsv_TUU.$(SUFFIX) strsv_TUN.$(SUFFIX) strsv_TLU.$(SUFFIX) strsv_TLN.$(SUFFIX) DBLASOBJS = \ dgbmv_n.$(SUFFIX) dgbmv_t.$(SUFFIX) \ dsbmv_U.$(SUFFIX) dsbmv_L.$(SUFFIX) dspmv_U.$(SUFFIX) dspmv_L.$(SUFFIX) \ dspr_U.$(SUFFIX) dspr_L.$(SUFFIX) dspr2_U.$(SUFFIX) dspr2_L.$(SUFFIX) \ dsyr_U.$(SUFFIX) dsyr_L.$(SUFFIX) dsyr2_U.$(SUFFIX) dsyr2_L.$(SUFFIX) \ dtbmv_NUU.$(SUFFIX) dtbmv_NUN.$(SUFFIX) dtbmv_NLU.$(SUFFIX) dtbmv_NLN.$(SUFFIX) \ dtbmv_TUU.$(SUFFIX) dtbmv_TUN.$(SUFFIX) dtbmv_TLU.$(SUFFIX) dtbmv_TLN.$(SUFFIX) \ dtbsv_NUU.$(SUFFIX) dtbsv_NUN.$(SUFFIX) dtbsv_NLU.$(SUFFIX) dtbsv_NLN.$(SUFFIX) \ dtbsv_TUU.$(SUFFIX) dtbsv_TUN.$(SUFFIX) dtbsv_TLU.$(SUFFIX) dtbsv_TLN.$(SUFFIX) \ dtpmv_NUU.$(SUFFIX) dtpmv_NUN.$(SUFFIX) dtpmv_NLU.$(SUFFIX) dtpmv_NLN.$(SUFFIX) \ dtpmv_TUU.$(SUFFIX) dtpmv_TUN.$(SUFFIX) dtpmv_TLU.$(SUFFIX) dtpmv_TLN.$(SUFFIX) \ dtpsv_NUU.$(SUFFIX) dtpsv_NUN.$(SUFFIX) dtpsv_NLU.$(SUFFIX) dtpsv_NLN.$(SUFFIX) \ dtpsv_TUU.$(SUFFIX) dtpsv_TUN.$(SUFFIX) dtpsv_TLU.$(SUFFIX) dtpsv_TLN.$(SUFFIX) \ dtrmv_NUU.$(SUFFIX) dtrmv_NUN.$(SUFFIX) dtrmv_NLU.$(SUFFIX) dtrmv_NLN.$(SUFFIX) \ dtrmv_TUU.$(SUFFIX) dtrmv_TUN.$(SUFFIX) dtrmv_TLU.$(SUFFIX) dtrmv_TLN.$(SUFFIX) \ dtrsv_NUU.$(SUFFIX) dtrsv_NUN.$(SUFFIX) dtrsv_NLU.$(SUFFIX) dtrsv_NLN.$(SUFFIX) \ dtrsv_TUU.$(SUFFIX) dtrsv_TUN.$(SUFFIX) dtrsv_TLU.$(SUFFIX) dtrsv_TLN.$(SUFFIX) QBLASOBJS = \ qgbmv_n.$(SUFFIX) qgbmv_t.$(SUFFIX) \ qsbmv_U.$(SUFFIX) qsbmv_L.$(SUFFIX) qspmv_U.$(SUFFIX) qspmv_L.$(SUFFIX) \ qspr_U.$(SUFFIX) qspr_L.$(SUFFIX) qspr2_U.$(SUFFIX) qspr2_L.$(SUFFIX) \ qsyr_U.$(SUFFIX) qsyr_L.$(SUFFIX) qsyr2_U.$(SUFFIX) qsyr2_L.$(SUFFIX) \ qtbmv_NUU.$(SUFFIX) qtbmv_NUN.$(SUFFIX) qtbmv_NLU.$(SUFFIX) qtbmv_NLN.$(SUFFIX) \ qtbmv_TUU.$(SUFFIX) qtbmv_TUN.$(SUFFIX) qtbmv_TLU.$(SUFFIX) qtbmv_TLN.$(SUFFIX) \ qtbsv_NUU.$(SUFFIX) qtbsv_NUN.$(SUFFIX) qtbsv_NLU.$(SUFFIX) qtbsv_NLN.$(SUFFIX) \ qtbsv_TUU.$(SUFFIX) qtbsv_TUN.$(SUFFIX) qtbsv_TLU.$(SUFFIX) qtbsv_TLN.$(SUFFIX) \ qtpmv_NUU.$(SUFFIX) qtpmv_NUN.$(SUFFIX) qtpmv_NLU.$(SUFFIX) qtpmv_NLN.$(SUFFIX) \ qtpmv_TUU.$(SUFFIX) qtpmv_TUN.$(SUFFIX) qtpmv_TLU.$(SUFFIX) qtpmv_TLN.$(SUFFIX) \ 
qtpsv_NUU.$(SUFFIX) qtpsv_NUN.$(SUFFIX) qtpsv_NLU.$(SUFFIX) qtpsv_NLN.$(SUFFIX) \ qtpsv_TUU.$(SUFFIX) qtpsv_TUN.$(SUFFIX) qtpsv_TLU.$(SUFFIX) qtpsv_TLN.$(SUFFIX) \ qtrmv_NUU.$(SUFFIX) qtrmv_NUN.$(SUFFIX) qtrmv_NLU.$(SUFFIX) qtrmv_NLN.$(SUFFIX) \ qtrmv_TUU.$(SUFFIX) qtrmv_TUN.$(SUFFIX) qtrmv_TLU.$(SUFFIX) qtrmv_TLN.$(SUFFIX) \ qtrsv_NUU.$(SUFFIX) qtrsv_NUN.$(SUFFIX) qtrsv_NLU.$(SUFFIX) qtrsv_NLN.$(SUFFIX) \ qtrsv_TUU.$(SUFFIX) qtrsv_TUN.$(SUFFIX) qtrsv_TLU.$(SUFFIX) qtrsv_TLN.$(SUFFIX) CBLASOBJS += \ cgbmv_n.$(SUFFIX) cgbmv_t.$(SUFFIX) cgbmv_r.$(SUFFIX) cgbmv_c.$(SUFFIX) \ cgbmv_o.$(SUFFIX) cgbmv_u.$(SUFFIX) cgbmv_s.$(SUFFIX) cgbmv_d.$(SUFFIX) \ chbmv_U.$(SUFFIX) chbmv_L.$(SUFFIX) chbmv_V.$(SUFFIX) chbmv_M.$(SUFFIX) \ cher_U.$(SUFFIX) cher_L.$(SUFFIX) cher_V.$(SUFFIX) cher_M.$(SUFFIX) \ cher2_U.$(SUFFIX) cher2_L.$(SUFFIX) cher2_V.$(SUFFIX) cher2_M.$(SUFFIX) \ chpmv_U.$(SUFFIX) chpmv_L.$(SUFFIX) chpmv_V.$(SUFFIX) chpmv_M.$(SUFFIX) \ chpr_U.$(SUFFIX) chpr_L.$(SUFFIX) chpr_V.$(SUFFIX) chpr_M.$(SUFFIX) \ chpr2_U.$(SUFFIX) chpr2_L.$(SUFFIX) chpr2_V.$(SUFFIX) chpr2_M.$(SUFFIX) \ csbmv_U.$(SUFFIX) csbmv_L.$(SUFFIX) cspmv_U.$(SUFFIX) cspmv_L.$(SUFFIX) \ cspr_U.$(SUFFIX) cspr_L.$(SUFFIX) cspr2_U.$(SUFFIX) cspr2_L.$(SUFFIX) \ csyr_U.$(SUFFIX) csyr_L.$(SUFFIX) csyr2_U.$(SUFFIX) csyr2_L.$(SUFFIX) \ ctbmv_NUU.$(SUFFIX) ctbmv_NUN.$(SUFFIX) ctbmv_NLU.$(SUFFIX) ctbmv_NLN.$(SUFFIX) \ ctbmv_TUU.$(SUFFIX) ctbmv_TUN.$(SUFFIX) ctbmv_TLU.$(SUFFIX) ctbmv_TLN.$(SUFFIX) \ ctbmv_RUU.$(SUFFIX) ctbmv_RUN.$(SUFFIX) ctbmv_RLU.$(SUFFIX) ctbmv_RLN.$(SUFFIX) \ ctbmv_CUU.$(SUFFIX) ctbmv_CUN.$(SUFFIX) ctbmv_CLU.$(SUFFIX) ctbmv_CLN.$(SUFFIX) \ ctbsv_NUU.$(SUFFIX) ctbsv_NUN.$(SUFFIX) ctbsv_NLU.$(SUFFIX) ctbsv_NLN.$(SUFFIX) \ ctbsv_TUU.$(SUFFIX) ctbsv_TUN.$(SUFFIX) ctbsv_TLU.$(SUFFIX) ctbsv_TLN.$(SUFFIX) \ ctbsv_RUU.$(SUFFIX) ctbsv_RUN.$(SUFFIX) ctbsv_RLU.$(SUFFIX) ctbsv_RLN.$(SUFFIX) \ ctbsv_CUU.$(SUFFIX) ctbsv_CUN.$(SUFFIX) ctbsv_CLU.$(SUFFIX) ctbsv_CLN.$(SUFFIX) \ ctpmv_NUU.$(SUFFIX) ctpmv_NUN.$(SUFFIX) ctpmv_NLU.$(SUFFIX) ctpmv_NLN.$(SUFFIX) \ ctpmv_TUU.$(SUFFIX) ctpmv_TUN.$(SUFFIX) ctpmv_TLU.$(SUFFIX) ctpmv_TLN.$(SUFFIX) \ ctpmv_RUU.$(SUFFIX) ctpmv_RUN.$(SUFFIX) ctpmv_RLU.$(SUFFIX) ctpmv_RLN.$(SUFFIX) \ ctpmv_CUU.$(SUFFIX) ctpmv_CUN.$(SUFFIX) ctpmv_CLU.$(SUFFIX) ctpmv_CLN.$(SUFFIX) \ ctpsv_NUU.$(SUFFIX) ctpsv_NUN.$(SUFFIX) ctpsv_NLU.$(SUFFIX) ctpsv_NLN.$(SUFFIX) \ ctpsv_TUU.$(SUFFIX) ctpsv_TUN.$(SUFFIX) ctpsv_TLU.$(SUFFIX) ctpsv_TLN.$(SUFFIX) \ ctpsv_RUU.$(SUFFIX) ctpsv_RUN.$(SUFFIX) ctpsv_RLU.$(SUFFIX) ctpsv_RLN.$(SUFFIX) \ ctpsv_CUU.$(SUFFIX) ctpsv_CUN.$(SUFFIX) ctpsv_CLU.$(SUFFIX) ctpsv_CLN.$(SUFFIX) \ ctrmv_NUU.$(SUFFIX) ctrmv_NUN.$(SUFFIX) ctrmv_NLU.$(SUFFIX) ctrmv_NLN.$(SUFFIX) \ ctrmv_TUU.$(SUFFIX) ctrmv_TUN.$(SUFFIX) ctrmv_TLU.$(SUFFIX) ctrmv_TLN.$(SUFFIX) \ ctrmv_RUU.$(SUFFIX) ctrmv_RUN.$(SUFFIX) ctrmv_RLU.$(SUFFIX) ctrmv_RLN.$(SUFFIX) \ ctrmv_CUU.$(SUFFIX) ctrmv_CUN.$(SUFFIX) ctrmv_CLU.$(SUFFIX) ctrmv_CLN.$(SUFFIX) \ ctrsv_NUU.$(SUFFIX) ctrsv_NUN.$(SUFFIX) ctrsv_NLU.$(SUFFIX) ctrsv_NLN.$(SUFFIX) \ ctrsv_TUU.$(SUFFIX) ctrsv_TUN.$(SUFFIX) ctrsv_TLU.$(SUFFIX) ctrsv_TLN.$(SUFFIX) \ ctrsv_RUU.$(SUFFIX) ctrsv_RUN.$(SUFFIX) ctrsv_RLU.$(SUFFIX) ctrsv_RLN.$(SUFFIX) \ ctrsv_CUU.$(SUFFIX) ctrsv_CUN.$(SUFFIX) ctrsv_CLU.$(SUFFIX) ctrsv_CLN.$(SUFFIX) ZBLASOBJS += \ zgbmv_n.$(SUFFIX) zgbmv_t.$(SUFFIX) zgbmv_r.$(SUFFIX) zgbmv_c.$(SUFFIX) \ zgbmv_o.$(SUFFIX) zgbmv_u.$(SUFFIX) zgbmv_s.$(SUFFIX) zgbmv_d.$(SUFFIX) \ zhbmv_U.$(SUFFIX) zhbmv_L.$(SUFFIX) zhbmv_V.$(SUFFIX) zhbmv_M.$(SUFFIX) \ zher_U.$(SUFFIX) zher_L.$(SUFFIX) 
zher_V.$(SUFFIX) zher_M.$(SUFFIX) \ zher2_U.$(SUFFIX) zher2_L.$(SUFFIX) zher2_V.$(SUFFIX) zher2_M.$(SUFFIX) \ zhpmv_U.$(SUFFIX) zhpmv_L.$(SUFFIX) zhpmv_V.$(SUFFIX) zhpmv_M.$(SUFFIX) \ zhpr_U.$(SUFFIX) zhpr_L.$(SUFFIX) zhpr_V.$(SUFFIX) zhpr_M.$(SUFFIX) \ zhpr2_U.$(SUFFIX) zhpr2_L.$(SUFFIX) zhpr2_V.$(SUFFIX) zhpr2_M.$(SUFFIX) \ zsbmv_U.$(SUFFIX) zsbmv_L.$(SUFFIX) zspmv_U.$(SUFFIX) zspmv_L.$(SUFFIX) \ zspr_U.$(SUFFIX) zspr_L.$(SUFFIX) zspr2_U.$(SUFFIX) zspr2_L.$(SUFFIX) \ zsyr_U.$(SUFFIX) zsyr_L.$(SUFFIX) zsyr2_U.$(SUFFIX) zsyr2_L.$(SUFFIX) \ ztbmv_NUU.$(SUFFIX) ztbmv_NUN.$(SUFFIX) ztbmv_NLU.$(SUFFIX) ztbmv_NLN.$(SUFFIX) \ ztbmv_TUU.$(SUFFIX) ztbmv_TUN.$(SUFFIX) ztbmv_TLU.$(SUFFIX) ztbmv_TLN.$(SUFFIX) \ ztbmv_RUU.$(SUFFIX) ztbmv_RUN.$(SUFFIX) ztbmv_RLU.$(SUFFIX) ztbmv_RLN.$(SUFFIX) \ ztbmv_CUU.$(SUFFIX) ztbmv_CUN.$(SUFFIX) ztbmv_CLU.$(SUFFIX) ztbmv_CLN.$(SUFFIX) \ ztbsv_NUU.$(SUFFIX) ztbsv_NUN.$(SUFFIX) ztbsv_NLU.$(SUFFIX) ztbsv_NLN.$(SUFFIX) \ ztbsv_TUU.$(SUFFIX) ztbsv_TUN.$(SUFFIX) ztbsv_TLU.$(SUFFIX) ztbsv_TLN.$(SUFFIX) \ ztbsv_RUU.$(SUFFIX) ztbsv_RUN.$(SUFFIX) ztbsv_RLU.$(SUFFIX) ztbsv_RLN.$(SUFFIX) \ ztbsv_CUU.$(SUFFIX) ztbsv_CUN.$(SUFFIX) ztbsv_CLU.$(SUFFIX) ztbsv_CLN.$(SUFFIX) \ ztpmv_NUU.$(SUFFIX) ztpmv_NUN.$(SUFFIX) ztpmv_NLU.$(SUFFIX) ztpmv_NLN.$(SUFFIX) \ ztpmv_TUU.$(SUFFIX) ztpmv_TUN.$(SUFFIX) ztpmv_TLU.$(SUFFIX) ztpmv_TLN.$(SUFFIX) \ ztpmv_RUU.$(SUFFIX) ztpmv_RUN.$(SUFFIX) ztpmv_RLU.$(SUFFIX) ztpmv_RLN.$(SUFFIX) \ ztpmv_CUU.$(SUFFIX) ztpmv_CUN.$(SUFFIX) ztpmv_CLU.$(SUFFIX) ztpmv_CLN.$(SUFFIX) \ ztpsv_NUU.$(SUFFIX) ztpsv_NUN.$(SUFFIX) ztpsv_NLU.$(SUFFIX) ztpsv_NLN.$(SUFFIX) \ ztpsv_TUU.$(SUFFIX) ztpsv_TUN.$(SUFFIX) ztpsv_TLU.$(SUFFIX) ztpsv_TLN.$(SUFFIX) \ ztpsv_RUU.$(SUFFIX) ztpsv_RUN.$(SUFFIX) ztpsv_RLU.$(SUFFIX) ztpsv_RLN.$(SUFFIX) \ ztpsv_CUU.$(SUFFIX) ztpsv_CUN.$(SUFFIX) ztpsv_CLU.$(SUFFIX) ztpsv_CLN.$(SUFFIX) \ ztrmv_NUU.$(SUFFIX) ztrmv_NUN.$(SUFFIX) ztrmv_NLU.$(SUFFIX) ztrmv_NLN.$(SUFFIX) \ ztrmv_TUU.$(SUFFIX) ztrmv_TUN.$(SUFFIX) ztrmv_TLU.$(SUFFIX) ztrmv_TLN.$(SUFFIX) \ ztrmv_RUU.$(SUFFIX) ztrmv_RUN.$(SUFFIX) ztrmv_RLU.$(SUFFIX) ztrmv_RLN.$(SUFFIX) \ ztrmv_CUU.$(SUFFIX) ztrmv_CUN.$(SUFFIX) ztrmv_CLU.$(SUFFIX) ztrmv_CLN.$(SUFFIX) \ ztrsv_NUU.$(SUFFIX) ztrsv_NUN.$(SUFFIX) ztrsv_NLU.$(SUFFIX) ztrsv_NLN.$(SUFFIX) \ ztrsv_TUU.$(SUFFIX) ztrsv_TUN.$(SUFFIX) ztrsv_TLU.$(SUFFIX) ztrsv_TLN.$(SUFFIX) \ ztrsv_RUU.$(SUFFIX) ztrsv_RUN.$(SUFFIX) ztrsv_RLU.$(SUFFIX) ztrsv_RLN.$(SUFFIX) \ ztrsv_CUU.$(SUFFIX) ztrsv_CUN.$(SUFFIX) ztrsv_CLU.$(SUFFIX) ztrsv_CLN.$(SUFFIX) XBLASOBJS += \ xgbmv_n.$(SUFFIX) xgbmv_t.$(SUFFIX) xgbmv_r.$(SUFFIX) xgbmv_c.$(SUFFIX) \ xgbmv_o.$(SUFFIX) xgbmv_u.$(SUFFIX) xgbmv_s.$(SUFFIX) xgbmv_d.$(SUFFIX) \ xhbmv_U.$(SUFFIX) xhbmv_L.$(SUFFIX) xhbmv_V.$(SUFFIX) xhbmv_M.$(SUFFIX) \ xher_U.$(SUFFIX) xher_L.$(SUFFIX) xher_V.$(SUFFIX) xher_M.$(SUFFIX) \ xher2_U.$(SUFFIX) xher2_L.$(SUFFIX) xher2_V.$(SUFFIX) xher2_M.$(SUFFIX) \ xhpmv_U.$(SUFFIX) xhpmv_L.$(SUFFIX) xhpmv_V.$(SUFFIX) xhpmv_M.$(SUFFIX) \ xhpr_U.$(SUFFIX) xhpr_L.$(SUFFIX) xhpr_V.$(SUFFIX) xhpr_M.$(SUFFIX) \ xhpr2_U.$(SUFFIX) xhpr2_L.$(SUFFIX) xhpr2_V.$(SUFFIX) xhpr2_M.$(SUFFIX) \ xsbmv_U.$(SUFFIX) xsbmv_L.$(SUFFIX) xspmv_U.$(SUFFIX) xspmv_L.$(SUFFIX) \ xspr_U.$(SUFFIX) xspr_L.$(SUFFIX) xspr2_U.$(SUFFIX) xspr2_L.$(SUFFIX) \ xsyr_U.$(SUFFIX) xsyr_L.$(SUFFIX) xsyr2_U.$(SUFFIX) xsyr2_L.$(SUFFIX) \ xtbmv_NUU.$(SUFFIX) xtbmv_NUN.$(SUFFIX) xtbmv_NLU.$(SUFFIX) xtbmv_NLN.$(SUFFIX) \ xtbmv_TUU.$(SUFFIX) xtbmv_TUN.$(SUFFIX) xtbmv_TLU.$(SUFFIX) xtbmv_TLN.$(SUFFIX) \ xtbmv_RUU.$(SUFFIX) xtbmv_RUN.$(SUFFIX) xtbmv_RLU.$(SUFFIX) 
xtbmv_RLN.$(SUFFIX) \ xtbmv_CUU.$(SUFFIX) xtbmv_CUN.$(SUFFIX) xtbmv_CLU.$(SUFFIX) xtbmv_CLN.$(SUFFIX) \ xtbsv_NUU.$(SUFFIX) xtbsv_NUN.$(SUFFIX) xtbsv_NLU.$(SUFFIX) xtbsv_NLN.$(SUFFIX) \ xtbsv_TUU.$(SUFFIX) xtbsv_TUN.$(SUFFIX) xtbsv_TLU.$(SUFFIX) xtbsv_TLN.$(SUFFIX) \ xtbsv_RUU.$(SUFFIX) xtbsv_RUN.$(SUFFIX) xtbsv_RLU.$(SUFFIX) xtbsv_RLN.$(SUFFIX) \ xtbsv_CUU.$(SUFFIX) xtbsv_CUN.$(SUFFIX) xtbsv_CLU.$(SUFFIX) xtbsv_CLN.$(SUFFIX) \ xtpmv_NUU.$(SUFFIX) xtpmv_NUN.$(SUFFIX) xtpmv_NLU.$(SUFFIX) xtpmv_NLN.$(SUFFIX) \ xtpmv_TUU.$(SUFFIX) xtpmv_TUN.$(SUFFIX) xtpmv_TLU.$(SUFFIX) xtpmv_TLN.$(SUFFIX) \ xtpmv_RUU.$(SUFFIX) xtpmv_RUN.$(SUFFIX) xtpmv_RLU.$(SUFFIX) xtpmv_RLN.$(SUFFIX) \ xtpmv_CUU.$(SUFFIX) xtpmv_CUN.$(SUFFIX) xtpmv_CLU.$(SUFFIX) xtpmv_CLN.$(SUFFIX) \ xtpsv_NUU.$(SUFFIX) xtpsv_NUN.$(SUFFIX) xtpsv_NLU.$(SUFFIX) xtpsv_NLN.$(SUFFIX) \ xtpsv_TUU.$(SUFFIX) xtpsv_TUN.$(SUFFIX) xtpsv_TLU.$(SUFFIX) xtpsv_TLN.$(SUFFIX) \ xtpsv_RUU.$(SUFFIX) xtpsv_RUN.$(SUFFIX) xtpsv_RLU.$(SUFFIX) xtpsv_RLN.$(SUFFIX) \ xtpsv_CUU.$(SUFFIX) xtpsv_CUN.$(SUFFIX) xtpsv_CLU.$(SUFFIX) xtpsv_CLN.$(SUFFIX) \ xtrmv_NUU.$(SUFFIX) xtrmv_NUN.$(SUFFIX) xtrmv_NLU.$(SUFFIX) xtrmv_NLN.$(SUFFIX) \ xtrmv_TUU.$(SUFFIX) xtrmv_TUN.$(SUFFIX) xtrmv_TLU.$(SUFFIX) xtrmv_TLN.$(SUFFIX) \ xtrmv_RUU.$(SUFFIX) xtrmv_RUN.$(SUFFIX) xtrmv_RLU.$(SUFFIX) xtrmv_RLN.$(SUFFIX) \ xtrmv_CUU.$(SUFFIX) xtrmv_CUN.$(SUFFIX) xtrmv_CLU.$(SUFFIX) xtrmv_CLN.$(SUFFIX) \ xtrsv_NUU.$(SUFFIX) xtrsv_NUN.$(SUFFIX) xtrsv_NLU.$(SUFFIX) xtrsv_NLN.$(SUFFIX) \ xtrsv_TUU.$(SUFFIX) xtrsv_TUN.$(SUFFIX) xtrsv_TLU.$(SUFFIX) xtrsv_TLN.$(SUFFIX) \ xtrsv_RUU.$(SUFFIX) xtrsv_RUN.$(SUFFIX) xtrsv_RLU.$(SUFFIX) xtrsv_RLN.$(SUFFIX) \ xtrsv_CUU.$(SUFFIX) xtrsv_CUN.$(SUFFIX) xtrsv_CLU.$(SUFFIX) xtrsv_CLN.$(SUFFIX) HPLOBJS = \ dtrsv_NLU.$(SUFFIX) dtrsv_NUN.$(SUFFIX) dtrsv_NUU.$(SUFFIX) dtrsv_NLN.$(SUFFIX) \ dtrsv_TLN.$(SUFFIX) dtrsv_TLU.$(SUFFIX) dtrsv_TUN.$(SUFFIX) dtrsv_TUU.$(SUFFIX) ifdef SMP SBLASOBJS += \ sgemv_thread_n.$(SUFFIX) sgemv_thread_t.$(SUFFIX) \ sger_thread.$(SUFFIX) \ ssymv_thread_U.$(SUFFIX) ssymv_thread_L.$(SUFFIX) \ ssyr_thread_U.$(SUFFIX) ssyr_thread_L.$(SUFFIX) \ ssyr2_thread_U.$(SUFFIX) ssyr2_thread_L.$(SUFFIX) \ sspr_thread_U.$(SUFFIX) sspr_thread_L.$(SUFFIX) \ sspr2_thread_U.$(SUFFIX) sspr2_thread_L.$(SUFFIX) \ strmv_thread_NUU.$(SUFFIX) strmv_thread_NUN.$(SUFFIX) \ strmv_thread_NLU.$(SUFFIX) strmv_thread_NLN.$(SUFFIX) \ strmv_thread_TUU.$(SUFFIX) strmv_thread_TUN.$(SUFFIX) \ strmv_thread_TLU.$(SUFFIX) strmv_thread_TLN.$(SUFFIX) \ sspmv_thread_U.$(SUFFIX) sspmv_thread_L.$(SUFFIX) \ stpmv_thread_NUU.$(SUFFIX) stpmv_thread_NUN.$(SUFFIX) \ stpmv_thread_NLU.$(SUFFIX) stpmv_thread_NLN.$(SUFFIX) \ stpmv_thread_TUU.$(SUFFIX) stpmv_thread_TUN.$(SUFFIX) \ stpmv_thread_TLU.$(SUFFIX) stpmv_thread_TLN.$(SUFFIX) \ sgbmv_thread_n.$(SUFFIX) sgbmv_thread_t.$(SUFFIX) \ ssbmv_thread_U.$(SUFFIX) ssbmv_thread_L.$(SUFFIX) \ stbmv_thread_NUU.$(SUFFIX) stbmv_thread_NUN.$(SUFFIX) \ stbmv_thread_NLU.$(SUFFIX) stbmv_thread_NLN.$(SUFFIX) \ stbmv_thread_TUU.$(SUFFIX) stbmv_thread_TUN.$(SUFFIX) \ stbmv_thread_TLU.$(SUFFIX) stbmv_thread_TLN.$(SUFFIX) \ DBLASOBJS += \ dgemv_thread_n.$(SUFFIX) dgemv_thread_t.$(SUFFIX) \ dger_thread.$(SUFFIX) \ dsymv_thread_U.$(SUFFIX) dsymv_thread_L.$(SUFFIX) \ dsyr_thread_U.$(SUFFIX) dsyr_thread_L.$(SUFFIX) \ dsyr2_thread_U.$(SUFFIX) dsyr2_thread_L.$(SUFFIX) \ dspr_thread_U.$(SUFFIX) dspr_thread_L.$(SUFFIX) \ dspr2_thread_U.$(SUFFIX) dspr2_thread_L.$(SUFFIX) \ dtrmv_thread_NUU.$(SUFFIX) dtrmv_thread_NUN.$(SUFFIX) \ dtrmv_thread_NLU.$(SUFFIX) 
dtrmv_thread_NLN.$(SUFFIX) \ dtrmv_thread_TUU.$(SUFFIX) dtrmv_thread_TUN.$(SUFFIX) \ dtrmv_thread_TLU.$(SUFFIX) dtrmv_thread_TLN.$(SUFFIX) \ dspmv_thread_U.$(SUFFIX) dspmv_thread_L.$(SUFFIX) \ dtpmv_thread_NUU.$(SUFFIX) dtpmv_thread_NUN.$(SUFFIX) \ dtpmv_thread_NLU.$(SUFFIX) dtpmv_thread_NLN.$(SUFFIX) \ dtpmv_thread_TUU.$(SUFFIX) dtpmv_thread_TUN.$(SUFFIX) \ dtpmv_thread_TLU.$(SUFFIX) dtpmv_thread_TLN.$(SUFFIX) \ dgbmv_thread_n.$(SUFFIX) dgbmv_thread_t.$(SUFFIX) \ dsbmv_thread_U.$(SUFFIX) dsbmv_thread_L.$(SUFFIX) \ dtbmv_thread_NUU.$(SUFFIX) dtbmv_thread_NUN.$(SUFFIX) \ dtbmv_thread_NLU.$(SUFFIX) dtbmv_thread_NLN.$(SUFFIX) \ dtbmv_thread_TUU.$(SUFFIX) dtbmv_thread_TUN.$(SUFFIX) \ dtbmv_thread_TLU.$(SUFFIX) dtbmv_thread_TLN.$(SUFFIX) \ QBLASOBJS += \ qgemv_thread_n.$(SUFFIX) qgemv_thread_t.$(SUFFIX) \ qger_thread.$(SUFFIX) \ qsymv_thread_U.$(SUFFIX) qsymv_thread_L.$(SUFFIX) \ qsyr_thread_U.$(SUFFIX) qsyr_thread_L.$(SUFFIX) \ qsyr2_thread_U.$(SUFFIX) qsyr2_thread_L.$(SUFFIX) \ qspr_thread_U.$(SUFFIX) qspr_thread_L.$(SUFFIX) \ qspr2_thread_U.$(SUFFIX) qspr2_thread_L.$(SUFFIX) \ qtrmv_thread_NUU.$(SUFFIX) qtrmv_thread_NUN.$(SUFFIX) \ qtrmv_thread_NLU.$(SUFFIX) qtrmv_thread_NLN.$(SUFFIX) \ qtrmv_thread_TUU.$(SUFFIX) qtrmv_thread_TUN.$(SUFFIX) \ qtrmv_thread_TLU.$(SUFFIX) qtrmv_thread_TLN.$(SUFFIX) \ qspmv_thread_U.$(SUFFIX) qspmv_thread_L.$(SUFFIX) \ qtpmv_thread_NUU.$(SUFFIX) qtpmv_thread_NUN.$(SUFFIX) \ qtpmv_thread_NLU.$(SUFFIX) qtpmv_thread_NLN.$(SUFFIX) \ qtpmv_thread_TUU.$(SUFFIX) qtpmv_thread_TUN.$(SUFFIX) \ qtpmv_thread_TLU.$(SUFFIX) qtpmv_thread_TLN.$(SUFFIX) \ qgbmv_thread_n.$(SUFFIX) qgbmv_thread_t.$(SUFFIX) \ qsbmv_thread_U.$(SUFFIX) qsbmv_thread_L.$(SUFFIX) \ qtbmv_thread_NUU.$(SUFFIX) qtbmv_thread_NUN.$(SUFFIX) \ qtbmv_thread_NLU.$(SUFFIX) qtbmv_thread_NLN.$(SUFFIX) \ qtbmv_thread_TUU.$(SUFFIX) qtbmv_thread_TUN.$(SUFFIX) \ qtbmv_thread_TLU.$(SUFFIX) qtbmv_thread_TLN.$(SUFFIX) \ CBLASOBJS += \ cgemv_thread_n.$(SUFFIX) cgemv_thread_t.$(SUFFIX) \ cgemv_thread_r.$(SUFFIX) cgemv_thread_c.$(SUFFIX) \ cgemv_thread_o.$(SUFFIX) cgemv_thread_u.$(SUFFIX) \ cgemv_thread_s.$(SUFFIX) cgemv_thread_d.$(SUFFIX) \ cger_thread_U.$(SUFFIX) cger_thread_C.$(SUFFIX) \ cger_thread_V.$(SUFFIX) cger_thread_D.$(SUFFIX) \ csymv_thread_U.$(SUFFIX) csymv_thread_L.$(SUFFIX) \ chemv_thread_U.$(SUFFIX) chemv_thread_L.$(SUFFIX) \ chemv_thread_V.$(SUFFIX) chemv_thread_M.$(SUFFIX) \ csyr_thread_U.$(SUFFIX) csyr_thread_L.$(SUFFIX) \ cher_thread_U.$(SUFFIX) cher_thread_L.$(SUFFIX) \ cher_thread_V.$(SUFFIX) cher_thread_M.$(SUFFIX) \ csyr2_thread_U.$(SUFFIX) csyr2_thread_L.$(SUFFIX) \ cher2_thread_U.$(SUFFIX) cher2_thread_L.$(SUFFIX) \ cher2_thread_V.$(SUFFIX) cher2_thread_M.$(SUFFIX) \ cspr_thread_U.$(SUFFIX) cspr_thread_L.$(SUFFIX) \ chpr_thread_U.$(SUFFIX) chpr_thread_L.$(SUFFIX) \ chpr_thread_V.$(SUFFIX) chpr_thread_M.$(SUFFIX) \ cspr2_thread_U.$(SUFFIX) cspr2_thread_L.$(SUFFIX) \ chpr2_thread_U.$(SUFFIX) chpr2_thread_L.$(SUFFIX) \ chpr2_thread_V.$(SUFFIX) chpr2_thread_M.$(SUFFIX) \ ctrmv_thread_NUU.$(SUFFIX) ctrmv_thread_NUN.$(SUFFIX) \ ctrmv_thread_NLU.$(SUFFIX) ctrmv_thread_NLN.$(SUFFIX) \ ctrmv_thread_TUU.$(SUFFIX) ctrmv_thread_TUN.$(SUFFIX) \ ctrmv_thread_TLU.$(SUFFIX) ctrmv_thread_TLN.$(SUFFIX) \ ctrmv_thread_RUU.$(SUFFIX) ctrmv_thread_RUN.$(SUFFIX) \ ctrmv_thread_RLU.$(SUFFIX) ctrmv_thread_RLN.$(SUFFIX) \ ctrmv_thread_CUU.$(SUFFIX) ctrmv_thread_CUN.$(SUFFIX) \ ctrmv_thread_CLU.$(SUFFIX) ctrmv_thread_CLN.$(SUFFIX) \ cspmv_thread_U.$(SUFFIX) cspmv_thread_L.$(SUFFIX) \ chpmv_thread_U.$(SUFFIX) 
chpmv_thread_L.$(SUFFIX) \ chpmv_thread_V.$(SUFFIX) chpmv_thread_M.$(SUFFIX) \ ctpmv_thread_NUU.$(SUFFIX) ctpmv_thread_NUN.$(SUFFIX) \ ctpmv_thread_NLU.$(SUFFIX) ctpmv_thread_NLN.$(SUFFIX) \ ctpmv_thread_TUU.$(SUFFIX) ctpmv_thread_TUN.$(SUFFIX) \ ctpmv_thread_TLU.$(SUFFIX) ctpmv_thread_TLN.$(SUFFIX) \ ctpmv_thread_RUU.$(SUFFIX) ctpmv_thread_RUN.$(SUFFIX) \ ctpmv_thread_RLU.$(SUFFIX) ctpmv_thread_RLN.$(SUFFIX) \ ctpmv_thread_CUU.$(SUFFIX) ctpmv_thread_CUN.$(SUFFIX) \ ctpmv_thread_CLU.$(SUFFIX) ctpmv_thread_CLN.$(SUFFIX) \ cgbmv_thread_n.$(SUFFIX) cgbmv_thread_t.$(SUFFIX) \ cgbmv_thread_r.$(SUFFIX) cgbmv_thread_c.$(SUFFIX) \ cgbmv_thread_o.$(SUFFIX) cgbmv_thread_u.$(SUFFIX) \ cgbmv_thread_s.$(SUFFIX) cgbmv_thread_d.$(SUFFIX) \ csbmv_thread_U.$(SUFFIX) csbmv_thread_L.$(SUFFIX) \ chbmv_thread_U.$(SUFFIX) chbmv_thread_L.$(SUFFIX) \ chbmv_thread_V.$(SUFFIX) chbmv_thread_M.$(SUFFIX) \ ctbmv_thread_NUU.$(SUFFIX) ctbmv_thread_NUN.$(SUFFIX) \ ctbmv_thread_NLU.$(SUFFIX) ctbmv_thread_NLN.$(SUFFIX) \ ctbmv_thread_TUU.$(SUFFIX) ctbmv_thread_TUN.$(SUFFIX) \ ctbmv_thread_TLU.$(SUFFIX) ctbmv_thread_TLN.$(SUFFIX) \ ctbmv_thread_RUU.$(SUFFIX) ctbmv_thread_RUN.$(SUFFIX) \ ctbmv_thread_RLU.$(SUFFIX) ctbmv_thread_RLN.$(SUFFIX) \ ctbmv_thread_CUU.$(SUFFIX) ctbmv_thread_CUN.$(SUFFIX) \ ctbmv_thread_CLU.$(SUFFIX) ctbmv_thread_CLN.$(SUFFIX) \ ZBLASOBJS += \ zgemv_thread_n.$(SUFFIX) zgemv_thread_t.$(SUFFIX) \ zgemv_thread_r.$(SUFFIX) zgemv_thread_c.$(SUFFIX) \ zgemv_thread_o.$(SUFFIX) zgemv_thread_u.$(SUFFIX) \ zgemv_thread_s.$(SUFFIX) zgemv_thread_d.$(SUFFIX) \ zger_thread_U.$(SUFFIX) zger_thread_C.$(SUFFIX) \ zger_thread_V.$(SUFFIX) zger_thread_D.$(SUFFIX) \ zsymv_thread_U.$(SUFFIX) zsymv_thread_L.$(SUFFIX) \ zhemv_thread_U.$(SUFFIX) zhemv_thread_L.$(SUFFIX) \ zhemv_thread_V.$(SUFFIX) zhemv_thread_M.$(SUFFIX) \ zsyr_thread_U.$(SUFFIX) zsyr_thread_L.$(SUFFIX) \ zher_thread_U.$(SUFFIX) zher_thread_L.$(SUFFIX) \ zher_thread_V.$(SUFFIX) zher_thread_M.$(SUFFIX) \ zsyr2_thread_U.$(SUFFIX) zsyr2_thread_L.$(SUFFIX) \ zher2_thread_U.$(SUFFIX) zher2_thread_L.$(SUFFIX) \ zher2_thread_V.$(SUFFIX) zher2_thread_M.$(SUFFIX) \ zspr_thread_U.$(SUFFIX) zspr_thread_L.$(SUFFIX) \ zhpr_thread_U.$(SUFFIX) zhpr_thread_L.$(SUFFIX) \ zhpr_thread_V.$(SUFFIX) zhpr_thread_M.$(SUFFIX) \ zspr2_thread_U.$(SUFFIX) zspr2_thread_L.$(SUFFIX) \ zhpr2_thread_U.$(SUFFIX) zhpr2_thread_L.$(SUFFIX) \ zhpr2_thread_V.$(SUFFIX) zhpr2_thread_M.$(SUFFIX) \ ztrmv_thread_NUU.$(SUFFIX) ztrmv_thread_NUN.$(SUFFIX) \ ztrmv_thread_NLU.$(SUFFIX) ztrmv_thread_NLN.$(SUFFIX) \ ztrmv_thread_TUU.$(SUFFIX) ztrmv_thread_TUN.$(SUFFIX) \ ztrmv_thread_TLU.$(SUFFIX) ztrmv_thread_TLN.$(SUFFIX) \ ztrmv_thread_RUU.$(SUFFIX) ztrmv_thread_RUN.$(SUFFIX) \ ztrmv_thread_RLU.$(SUFFIX) ztrmv_thread_RLN.$(SUFFIX) \ ztrmv_thread_CUU.$(SUFFIX) ztrmv_thread_CUN.$(SUFFIX) \ ztrmv_thread_CLU.$(SUFFIX) ztrmv_thread_CLN.$(SUFFIX) \ zspmv_thread_U.$(SUFFIX) zspmv_thread_L.$(SUFFIX) \ zhpmv_thread_U.$(SUFFIX) zhpmv_thread_L.$(SUFFIX) \ zhpmv_thread_V.$(SUFFIX) zhpmv_thread_M.$(SUFFIX) \ ztpmv_thread_NUU.$(SUFFIX) ztpmv_thread_NUN.$(SUFFIX) \ ztpmv_thread_NLU.$(SUFFIX) ztpmv_thread_NLN.$(SUFFIX) \ ztpmv_thread_TUU.$(SUFFIX) ztpmv_thread_TUN.$(SUFFIX) \ ztpmv_thread_TLU.$(SUFFIX) ztpmv_thread_TLN.$(SUFFIX) \ ztpmv_thread_RUU.$(SUFFIX) ztpmv_thread_RUN.$(SUFFIX) \ ztpmv_thread_RLU.$(SUFFIX) ztpmv_thread_RLN.$(SUFFIX) \ ztpmv_thread_CUU.$(SUFFIX) ztpmv_thread_CUN.$(SUFFIX) \ ztpmv_thread_CLU.$(SUFFIX) ztpmv_thread_CLN.$(SUFFIX) \ zgbmv_thread_n.$(SUFFIX) zgbmv_thread_t.$(SUFFIX) \ 
zgbmv_thread_r.$(SUFFIX) zgbmv_thread_c.$(SUFFIX) \ zgbmv_thread_o.$(SUFFIX) zgbmv_thread_u.$(SUFFIX) \ zgbmv_thread_s.$(SUFFIX) zgbmv_thread_d.$(SUFFIX) \ zsbmv_thread_U.$(SUFFIX) zsbmv_thread_L.$(SUFFIX) \ zhbmv_thread_U.$(SUFFIX) zhbmv_thread_L.$(SUFFIX) \ zhbmv_thread_V.$(SUFFIX) zhbmv_thread_M.$(SUFFIX) \ ztbmv_thread_NUU.$(SUFFIX) ztbmv_thread_NUN.$(SUFFIX) \ ztbmv_thread_NLU.$(SUFFIX) ztbmv_thread_NLN.$(SUFFIX) \ ztbmv_thread_TUU.$(SUFFIX) ztbmv_thread_TUN.$(SUFFIX) \ ztbmv_thread_TLU.$(SUFFIX) ztbmv_thread_TLN.$(SUFFIX) \ ztbmv_thread_RUU.$(SUFFIX) ztbmv_thread_RUN.$(SUFFIX) \ ztbmv_thread_RLU.$(SUFFIX) ztbmv_thread_RLN.$(SUFFIX) \ ztbmv_thread_CUU.$(SUFFIX) ztbmv_thread_CUN.$(SUFFIX) \ ztbmv_thread_CLU.$(SUFFIX) ztbmv_thread_CLN.$(SUFFIX) \ XBLASOBJS += \ xgemv_thread_n.$(SUFFIX) xgemv_thread_t.$(SUFFIX) \ xgemv_thread_r.$(SUFFIX) xgemv_thread_c.$(SUFFIX) \ xgemv_thread_o.$(SUFFIX) xgemv_thread_u.$(SUFFIX) \ xgemv_thread_s.$(SUFFIX) xgemv_thread_d.$(SUFFIX) \ xger_thread_U.$(SUFFIX) xger_thread_C.$(SUFFIX) \ xger_thread_V.$(SUFFIX) xger_thread_D.$(SUFFIX) \ xsymv_thread_U.$(SUFFIX) xsymv_thread_L.$(SUFFIX) \ xhemv_thread_U.$(SUFFIX) xhemv_thread_L.$(SUFFIX) \ xhemv_thread_V.$(SUFFIX) xhemv_thread_M.$(SUFFIX) \ xsyr_thread_U.$(SUFFIX) xsyr_thread_L.$(SUFFIX) \ xher_thread_U.$(SUFFIX) xher_thread_L.$(SUFFIX) \ xher_thread_V.$(SUFFIX) xher_thread_M.$(SUFFIX) \ xsyr2_thread_U.$(SUFFIX) xsyr2_thread_L.$(SUFFIX) \ xher2_thread_U.$(SUFFIX) xher2_thread_L.$(SUFFIX) \ xher2_thread_V.$(SUFFIX) xher2_thread_M.$(SUFFIX) \ xspr_thread_U.$(SUFFIX) xspr_thread_L.$(SUFFIX) \ xhpr_thread_U.$(SUFFIX) xhpr_thread_L.$(SUFFIX) \ xhpr_thread_V.$(SUFFIX) xhpr_thread_M.$(SUFFIX) \ xspr2_thread_U.$(SUFFIX) xspr2_thread_L.$(SUFFIX) \ xhpr2_thread_U.$(SUFFIX) xhpr2_thread_L.$(SUFFIX) \ xhpr2_thread_V.$(SUFFIX) xhpr2_thread_M.$(SUFFIX) \ xtrmv_thread_NUU.$(SUFFIX) xtrmv_thread_NUN.$(SUFFIX) \ xtrmv_thread_NLU.$(SUFFIX) xtrmv_thread_NLN.$(SUFFIX) \ xtrmv_thread_TUU.$(SUFFIX) xtrmv_thread_TUN.$(SUFFIX) \ xtrmv_thread_TLU.$(SUFFIX) xtrmv_thread_TLN.$(SUFFIX) \ xtrmv_thread_RUU.$(SUFFIX) xtrmv_thread_RUN.$(SUFFIX) \ xtrmv_thread_RLU.$(SUFFIX) xtrmv_thread_RLN.$(SUFFIX) \ xtrmv_thread_CUU.$(SUFFIX) xtrmv_thread_CUN.$(SUFFIX) \ xtrmv_thread_CLU.$(SUFFIX) xtrmv_thread_CLN.$(SUFFIX) \ xspmv_thread_U.$(SUFFIX) xspmv_thread_L.$(SUFFIX) \ xhpmv_thread_U.$(SUFFIX) xhpmv_thread_L.$(SUFFIX) \ xhpmv_thread_V.$(SUFFIX) xhpmv_thread_M.$(SUFFIX) \ xtpmv_thread_NUU.$(SUFFIX) xtpmv_thread_NUN.$(SUFFIX) \ xtpmv_thread_NLU.$(SUFFIX) xtpmv_thread_NLN.$(SUFFIX) \ xtpmv_thread_TUU.$(SUFFIX) xtpmv_thread_TUN.$(SUFFIX) \ xtpmv_thread_TLU.$(SUFFIX) xtpmv_thread_TLN.$(SUFFIX) \ xtpmv_thread_RUU.$(SUFFIX) xtpmv_thread_RUN.$(SUFFIX) \ xtpmv_thread_RLU.$(SUFFIX) xtpmv_thread_RLN.$(SUFFIX) \ xtpmv_thread_CUU.$(SUFFIX) xtpmv_thread_CUN.$(SUFFIX) \ xtpmv_thread_CLU.$(SUFFIX) xtpmv_thread_CLN.$(SUFFIX) \ xgbmv_thread_n.$(SUFFIX) xgbmv_thread_t.$(SUFFIX) \ xgbmv_thread_r.$(SUFFIX) xgbmv_thread_c.$(SUFFIX) \ xgbmv_thread_o.$(SUFFIX) xgbmv_thread_u.$(SUFFIX) \ xgbmv_thread_s.$(SUFFIX) xgbmv_thread_d.$(SUFFIX) \ xsbmv_thread_U.$(SUFFIX) xsbmv_thread_L.$(SUFFIX) \ xhbmv_thread_U.$(SUFFIX) xhbmv_thread_L.$(SUFFIX) \ xhbmv_thread_V.$(SUFFIX) xhbmv_thread_M.$(SUFFIX) \ xtbmv_thread_NUU.$(SUFFIX) xtbmv_thread_NUN.$(SUFFIX) \ xtbmv_thread_NLU.$(SUFFIX) xtbmv_thread_NLN.$(SUFFIX) \ xtbmv_thread_TUU.$(SUFFIX) xtbmv_thread_TUN.$(SUFFIX) \ xtbmv_thread_TLU.$(SUFFIX) xtbmv_thread_TLN.$(SUFFIX) \ xtbmv_thread_RUU.$(SUFFIX) xtbmv_thread_RUN.$(SUFFIX) \ 
xtbmv_thread_RLU.$(SUFFIX) xtbmv_thread_RLN.$(SUFFIX) \ xtbmv_thread_CUU.$(SUFFIX) xtbmv_thread_CUN.$(SUFFIX) \ xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLN.$(SUFFIX) \ endif all :: sgbmv_n.$(SUFFIX) sgbmv_n.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -UDOUBLE -UTRANS $(CFLAGS) -o $(@F) $< sgbmv_t.$(SUFFIX) sgbmv_t.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -UDOUBLE -DTRANS $(CFLAGS) -o $(@F) $< dgbmv_n.$(SUFFIX) dgbmv_n.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -DDOUBLE -UTRANS $(CFLAGS) -o $(@F) $< dgbmv_t.$(SUFFIX) dgbmv_t.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -DDOUBLE -DTRANS $(CFLAGS) -o $(@F) $< qgbmv_n.$(SUFFIX) qgbmv_n.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -DXDOUBLE -UTRANS $(CFLAGS) -o $(@F) $< qgbmv_t.$(SUFFIX) qgbmv_t.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -DXDOUBLE -DTRANS $(CFLAGS) -o $(@F) $< cgbmv_n.$(SUFFIX) cgbmv_n.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< cgbmv_t.$(SUFFIX) cgbmv_t.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< cgbmv_r.$(SUFFIX) cgbmv_r.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< cgbmv_c.$(SUFFIX) cgbmv_c.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< cgbmv_o.$(SUFFIX) cgbmv_o.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< cgbmv_u.$(SUFFIX) cgbmv_u.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< cgbmv_s.$(SUFFIX) cgbmv_s.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< cgbmv_d.$(SUFFIX) cgbmv_d.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< zgbmv_n.$(SUFFIX) zgbmv_n.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< zgbmv_t.$(SUFFIX) zgbmv_t.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< zgbmv_r.$(SUFFIX) zgbmv_r.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< zgbmv_c.$(SUFFIX) zgbmv_c.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< zgbmv_o.$(SUFFIX) zgbmv_o.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< zgbmv_u.$(SUFFIX) zgbmv_u.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< zgbmv_s.$(SUFFIX) zgbmv_s.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< zgbmv_d.$(SUFFIX) zgbmv_d.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< xgbmv_n.$(SUFFIX) xgbmv_n.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< xgbmv_t.$(SUFFIX) xgbmv_t.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< xgbmv_r.$(SUFFIX) xgbmv_r.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< xgbmv_c.$(SUFFIX) xgbmv_c.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< xgbmv_o.$(SUFFIX) xgbmv_o.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< xgbmv_u.$(SUFFIX) xgbmv_u.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< xgbmv_s.$(SUFFIX) xgbmv_s.$(PSUFFIX) : 
zgbmv_k.c $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< xgbmv_d.$(SUFFIX) xgbmv_d.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< sgbmv_thread_n.$(SUFFIX) sgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -UDOUBLE -UTRANSA $(CFLAGS) -o $(@F) $< sgbmv_thread_t.$(SUFFIX) sgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -UDOUBLE -DTRANSA $(CFLAGS) -o $(@F) $< dgbmv_thread_n.$(SUFFIX) dgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -DDOUBLE -UTRANSA $(CFLAGS) -o $(@F) $< dgbmv_thread_t.$(SUFFIX) dgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -DDOUBLE -DTRANSA $(CFLAGS) -o $(@F) $< qgbmv_thread_n.$(SUFFIX) qgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -DXDOUBLE -UTRANSA $(CFLAGS) -o $(@F) $< qgbmv_thread_t.$(SUFFIX) qgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -DXDOUBLE -DTRANSA $(CFLAGS) -o $(@F) $< cgbmv_thread_n.$(SUFFIX) cgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< cgbmv_thread_t.$(SUFFIX) cgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< cgbmv_thread_r.$(SUFFIX) cgbmv_thread_r.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< cgbmv_thread_c.$(SUFFIX) cgbmv_thread_c.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< cgbmv_thread_o.$(SUFFIX) cgbmv_thread_o.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< cgbmv_thread_u.$(SUFFIX) cgbmv_thread_u.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< cgbmv_thread_s.$(SUFFIX) cgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< cgbmv_thread_d.$(SUFFIX) cgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< zgbmv_thread_n.$(SUFFIX) zgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< zgbmv_thread_t.$(SUFFIX) zgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< zgbmv_thread_r.$(SUFFIX) zgbmv_thread_r.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< zgbmv_thread_c.$(SUFFIX) zgbmv_thread_c.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< zgbmv_thread_o.$(SUFFIX) zgbmv_thread_o.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< zgbmv_thread_u.$(SUFFIX) zgbmv_thread_u.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< zgbmv_thread_s.$(SUFFIX) zgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< zgbmv_thread_d.$(SUFFIX) zgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< xgbmv_thread_n.$(SUFFIX) xgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< xgbmv_thread_t.$(SUFFIX) xgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< xgbmv_thread_r.$(SUFFIX) 
xgbmv_thread_r.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< xgbmv_thread_c.$(SUFFIX) xgbmv_thread_c.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< xgbmv_thread_o.$(SUFFIX) xgbmv_thread_o.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< xgbmv_thread_u.$(SUFFIX) xgbmv_thread_u.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< xgbmv_thread_s.$(SUFFIX) xgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< xgbmv_thread_d.$(SUFFIX) xgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< sgemv_thread_n.$(SUFFIX) sgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) sgemv_thread_t.$(SUFFIX) sgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) dgemv_thread_n.$(SUFFIX) dgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) dgemv_thread_t.$(SUFFIX) dgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) qgemv_thread_n.$(SUFFIX) qgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) qgemv_thread_t.$(SUFFIX) qgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) cgemv_thread_n.$(SUFFIX) cgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) cgemv_thread_t.$(SUFFIX) cgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) cgemv_thread_r.$(SUFFIX) cgemv_thread_r.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DCONJ -UXCONJ $< -o $(@F) cgemv_thread_c.$(SUFFIX) cgemv_thread_c.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -UXCONJ $< -o $(@F) cgemv_thread_o.$(SUFFIX) cgemv_thread_o.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -DXCONJ $< -o $(@F) cgemv_thread_u.$(SUFFIX) cgemv_thread_u.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UCONJ -DXCONJ $< -o $(@F) cgemv_thread_s.$(SUFFIX) cgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DCONJ -DXCONJ $< -o $(@F) cgemv_thread_d.$(SUFFIX) cgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -DXCONJ $< -o $(@F) zgemv_thread_n.$(SUFFIX) zgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) zgemv_thread_t.$(SUFFIX) zgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) zgemv_thread_r.$(SUFFIX) zgemv_thread_r.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DCONJ -UXCONJ $< -o $(@F) zgemv_thread_c.$(SUFFIX) zgemv_thread_c.$(PSUFFIX) : gemv_thread.c 
../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -UXCONJ $< -o $(@F) zgemv_thread_o.$(SUFFIX) zgemv_thread_o.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -DXCONJ $< -o $(@F) zgemv_thread_u.$(SUFFIX) zgemv_thread_u.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UCONJ -DXCONJ $< -o $(@F) zgemv_thread_s.$(SUFFIX) zgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DCONJ -DXCONJ $< -o $(@F) zgemv_thread_d.$(SUFFIX) zgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -DXCONJ $< -o $(@F) xgemv_thread_n.$(SUFFIX) xgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) xgemv_thread_t.$(SUFFIX) xgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) xgemv_thread_r.$(SUFFIX) xgemv_thread_r.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DCONJ -UXCONJ $< -o $(@F) xgemv_thread_c.$(SUFFIX) xgemv_thread_c.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -UXCONJ $< -o $(@F) xgemv_thread_o.$(SUFFIX) xgemv_thread_o.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -DXCONJ $< -o $(@F) xgemv_thread_u.$(SUFFIX) xgemv_thread_u.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -DXCONJ $< -o $(@F) xgemv_thread_s.$(SUFFIX) xgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DCONJ -DXCONJ $< -o $(@F) xgemv_thread_d.$(SUFFIX) xgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -DXCONJ $< -o $(@F) sger_thread.$(SUFFIX) sger_thread.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UCONJ -UXCONJ $< -o $(@F) dger_thread.$(SUFFIX) dger_thread.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UCONJ -UXCONJ $< -o $(@F) qger_thread.$(SUFFIX) qger_thread.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UCONJ -UXCONJ $< -o $(@F) cger_thread_U.$(SUFFIX) cger_thread_U.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UCONJ -UXCONJ $< -o $(@F) cger_thread_C.$(SUFFIX) cger_thread_C.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCONJ -UXCONJ $< -o $(@F) cger_thread_V.$(SUFFIX) cger_thread_V.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UCONJ -DXCONJ $< -o $(@F) cger_thread_D.$(SUFFIX) cger_thread_D.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCONJ -DXCONJ $< -o $(@F) zger_thread_U.$(SUFFIX) zger_thread_U.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UCONJ -UXCONJ $< -o $(@F) zger_thread_C.$(SUFFIX) zger_thread_C.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCONJ -UXCONJ $< -o $(@F) zger_thread_V.$(SUFFIX) zger_thread_V.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UCONJ -DXCONJ $< -o $(@F) zger_thread_D.$(SUFFIX) zger_thread_D.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCONJ -DXCONJ $< -o $(@F) 
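# ---------------------------------------------------------------------------
# Note on the object naming used throughout this Makefile: each object is
# produced by compiling one generic kernel source several times with a
# different set of -D/-U preprocessor switches.
#   * the leading letter of the object selects the precision:
#       s = -UCOMPLEX -UDOUBLE      d = -UCOMPLEX -DDOUBLE
#       q = -UCOMPLEX -DXDOUBLE     c = -DCOMPLEX -UDOUBLE
#       z = -DCOMPLEX -DDOUBLE      x = -DCOMPLEX -DXDOUBLE
#   * the trailing letters select the variant: for the gbmv/gemv objects,
#     n/t toggle -UTRANS(A)/-DTRANS(A), and r, c, o, u, s, d add the
#     CONJ/XCONJ combinations for the complex cases; for the ger_thread
#     objects, U/C/V/D enumerate the four -UCONJ/-DCONJ x -UXCONJ/-DXCONJ
#     combinations; the three-letter suffixes on the trmv/tbmv/tpmv/trsv
#     objects appear to combine the transpose, uplo and unit-diagonal
#     choices in the same style.
# For example, dgbmv_t above is simply gbmv_k.c built with
# -UCOMPLEX -DDOUBLE -DTRANS.
# ---------------------------------------------------------------------------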
xger_thread_U.$(SUFFIX) xger_thread_U.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UCONJ -UXCONJ $< -o $(@F) xger_thread_C.$(SUFFIX) xger_thread_C.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCONJ -UXCONJ $< -o $(@F) xger_thread_V.$(SUFFIX) xger_thread_V.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UCONJ -DXCONJ $< -o $(@F) xger_thread_D.$(SUFFIX) xger_thread_D.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCONJ -DXCONJ $< -o $(@F) ssymv_thread_U.$(SUFFIX) ssymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) ssymv_thread_L.$(SUFFIX) ssymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) dsymv_thread_U.$(SUFFIX) dsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) dsymv_thread_L.$(SUFFIX) dsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) qsymv_thread_U.$(SUFFIX) qsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) qsymv_thread_L.$(SUFFIX) qsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) csymv_thread_U.$(SUFFIX) csymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) csymv_thread_L.$(SUFFIX) csymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) zsymv_thread_U.$(SUFFIX) zsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) zsymv_thread_L.$(SUFFIX) zsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) xsymv_thread_U.$(SUFFIX) xsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) xsymv_thread_L.$(SUFFIX) xsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) chemv_thread_U.$(SUFFIX) chemv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $(@F) chemv_thread_L.$(SUFFIX) chemv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV $< -o $(@F) chemv_thread_V.$(SUFFIX) chemv_thread_V.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) chemv_thread_M.$(SUFFIX) chemv_thread_M.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) zhemv_thread_U.$(SUFFIX) zhemv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $(@F) zhemv_thread_L.$(SUFFIX) zhemv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV $< -o $(@F) zhemv_thread_V.$(SUFFIX) zhemv_thread_V.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) zhemv_thread_M.$(SUFFIX) zhemv_thread_M.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) xhemv_thread_U.$(SUFFIX) xhemv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE 
-ULOWER -DHEMV $< -o $(@F) xhemv_thread_L.$(SUFFIX) xhemv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV $< -o $(@F) xhemv_thread_V.$(SUFFIX) xhemv_thread_V.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) xhemv_thread_M.$(SUFFIX) xhemv_thread_M.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) ssyr_thread_U.$(SUFFIX) ssyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) ssyr_thread_L.$(SUFFIX) ssyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) dsyr_thread_U.$(SUFFIX) dsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) dsyr_thread_L.$(SUFFIX) dsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) qsyr_thread_U.$(SUFFIX) qsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) qsyr_thread_L.$(SUFFIX) qsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) csyr_thread_U.$(SUFFIX) csyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) csyr_thread_L.$(SUFFIX) csyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) zsyr_thread_U.$(SUFFIX) zsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) zsyr_thread_L.$(SUFFIX) zsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) xsyr_thread_U.$(SUFFIX) xsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) xsyr_thread_L.$(SUFFIX) xsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) cher_thread_U.$(SUFFIX) cher_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHER $< -o $(@F) cher_thread_L.$(SUFFIX) cher_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHER $< -o $(@F) cher_thread_V.$(SUFFIX) cher_thread_V.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHERREV $< -o $(@F) cher_thread_M.$(SUFFIX) cher_thread_M.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHERREV $< -o $(@F) zher_thread_U.$(SUFFIX) zher_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHER $< -o $(@F) zher_thread_L.$(SUFFIX) zher_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHER $< -o $(@F) zher_thread_V.$(SUFFIX) zher_thread_V.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHERREV $< -o $(@F) zher_thread_M.$(SUFFIX) zher_thread_M.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHERREV $< -o $(@F) xher_thread_U.$(SUFFIX) xher_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHER $< -o $(@F) xher_thread_L.$(SUFFIX) xher_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHER $< -o $(@F) 
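# The Hermitian cases reuse the symmetric kernels: -DHEMV and -DHER switch
# symv_thread.c and syr_thread.c into their Hermitian forms, and the _V / _M
# objects are the -DHEMVREV / -DHERREV counterparts of _U / _L (the same
# -ULOWER / -DLOWER split, with the REV define added).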
xher_thread_V.$(SUFFIX) xher_thread_V.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHERREV $< -o $(@F) xher_thread_M.$(SUFFIX) xher_thread_M.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHERREV $< -o $(@F) ssyr2_thread_U.$(SUFFIX) ssyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) ssyr2_thread_L.$(SUFFIX) ssyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) dsyr2_thread_U.$(SUFFIX) dsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) dsyr2_thread_L.$(SUFFIX) dsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) qsyr2_thread_U.$(SUFFIX) qsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) qsyr2_thread_L.$(SUFFIX) qsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) csyr2_thread_U.$(SUFFIX) csyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) csyr2_thread_L.$(SUFFIX) csyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) zsyr2_thread_U.$(SUFFIX) zsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) zsyr2_thread_L.$(SUFFIX) zsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) xsyr2_thread_U.$(SUFFIX) xsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) xsyr2_thread_L.$(SUFFIX) xsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) cher2_thread_U.$(SUFFIX) cher2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHER $< -o $(@F) cher2_thread_L.$(SUFFIX) cher2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHER $< -o $(@F) cher2_thread_V.$(SUFFIX) cher2_thread_V.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHERREV $< -o $(@F) cher2_thread_M.$(SUFFIX) cher2_thread_M.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHERREV $< -o $(@F) zher2_thread_U.$(SUFFIX) zher2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHER $< -o $(@F) zher2_thread_L.$(SUFFIX) zher2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHER $< -o $(@F) zher2_thread_V.$(SUFFIX) zher2_thread_V.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHERREV $< -o $(@F) zher2_thread_M.$(SUFFIX) zher2_thread_M.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHERREV $< -o $(@F) xher2_thread_U.$(SUFFIX) xher2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHER $< -o $(@F) xher2_thread_L.$(SUFFIX) xher2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHER $< -o $(@F) xher2_thread_V.$(SUFFIX) xher2_thread_V.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE 
-ULOWER -DHERREV $< -o $(@F) xher2_thread_M.$(SUFFIX) xher2_thread_M.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHERREV $< -o $(@F) chbmv_U.$(SUFFIX) chbmv_U.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) chbmv_L.$(SUFFIX) chbmv_L.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) chbmv_V.$(SUFFIX) chbmv_V.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) chbmv_M.$(SUFFIX) chbmv_M.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) zhbmv_U.$(SUFFIX) zhbmv_U.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) zhbmv_L.$(SUFFIX) zhbmv_L.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) zhbmv_V.$(SUFFIX) zhbmv_V.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) zhbmv_M.$(SUFFIX) zhbmv_M.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) xhbmv_U.$(SUFFIX) xhbmv_U.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) xhbmv_L.$(SUFFIX) xhbmv_L.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) xhbmv_V.$(SUFFIX) xhbmv_V.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) xhbmv_M.$(SUFFIX) xhbmv_M.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) chbmv_thread_U.$(SUFFIX) chbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $(@F) chbmv_thread_L.$(SUFFIX) chbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV $< -o $(@F) chbmv_thread_V.$(SUFFIX) chbmv_thread_V.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) chbmv_thread_M.$(SUFFIX) chbmv_thread_M.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) zhbmv_thread_U.$(SUFFIX) zhbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $(@F) zhbmv_thread_L.$(SUFFIX) zhbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV $< -o $(@F) zhbmv_thread_V.$(SUFFIX) zhbmv_thread_V.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) zhbmv_thread_M.$(SUFFIX) zhbmv_thread_M.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) xhbmv_thread_U.$(SUFFIX) xhbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $(@F) xhbmv_thread_L.$(SUFFIX) xhbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV $< -o $(@F) xhbmv_thread_V.$(SUFFIX) xhbmv_thread_V.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) xhbmv_thread_M.$(SUFFIX) xhbmv_thread_M.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) cher_U.$(SUFFIX) cher_U.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) 
-UDOUBLE -ULOWER $< -o $(@F) cher_L.$(SUFFIX) cher_L.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER $< -o $(@F) cher_V.$(SUFFIX) cher_V.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) cher_M.$(SUFFIX) cher_M.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) zher_U.$(SUFFIX) zher_U.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER $< -o $(@F) zher_L.$(SUFFIX) zher_L.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER $< -o $(@F) zher_V.$(SUFFIX) zher_V.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) zher_M.$(SUFFIX) zher_M.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) xher_U.$(SUFFIX) xher_U.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER $< -o $(@F) xher_L.$(SUFFIX) xher_L.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER $< -o $(@F) xher_V.$(SUFFIX) xher_V.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) xher_M.$(SUFFIX) xher_M.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) cher2_U.$(SUFFIX) cher2_U.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) cher2_L.$(SUFFIX) cher2_L.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) cher2_V.$(SUFFIX) cher2_V.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) cher2_M.$(SUFFIX) cher2_M.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) zher2_U.$(SUFFIX) zher2_U.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) zher2_L.$(SUFFIX) zher2_L.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) zher2_V.$(SUFFIX) zher2_V.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) zher2_M.$(SUFFIX) zher2_M.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) xher2_U.$(SUFFIX) xher2_U.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) xher2_L.$(SUFFIX) xher2_L.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) xher2_V.$(SUFFIX) xher2_V.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -DHEMVREV -o $(@F) xher2_M.$(SUFFIX) xher2_M.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -DHEMVREV -o $(@F) chpmv_U.$(SUFFIX) chpmv_U.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) chpmv_L.$(SUFFIX) chpmv_L.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) chpmv_V.$(SUFFIX) chpmv_V.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) chpmv_M.$(SUFFIX) chpmv_M.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) zhpmv_U.$(SUFFIX) zhpmv_U.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) zhpmv_L.$(SUFFIX) zhpmv_L.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) 
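# Illustrative sketch only -- this Makefile spells every rule out explicitly,
# but the four chpmv objects built from zhpmv_k.c above differ only in their
# -D/-U switches, so the same effect could be written with a GNU make pattern
# rule and computed variable names (hypothetical names, recipe indented with
# a tab):
#
#   chpmv_FLAGS_U = -ULOWER
#   chpmv_FLAGS_L = -DLOWER
#   chpmv_FLAGS_V = -ULOWER -DHEMVREV
#   chpmv_FLAGS_M = -DLOWER -DHEMVREV
#
#   chpmv_%.$(SUFFIX) : zhpmv_k.c ../../param.h
#           $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $(chpmv_FLAGS_$*) $< -o $(@F)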
zhpmv_V.$(SUFFIX) zhpmv_V.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) zhpmv_M.$(SUFFIX) zhpmv_M.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) xhpmv_U.$(SUFFIX) xhpmv_U.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) xhpmv_L.$(SUFFIX) xhpmv_L.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) xhpmv_V.$(SUFFIX) xhpmv_V.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) xhpmv_M.$(SUFFIX) xhpmv_M.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) chpmv_thread_U.$(SUFFIX) chpmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $(@F) chpmv_thread_L.$(SUFFIX) chpmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV $< -o $(@F) chpmv_thread_V.$(SUFFIX) chpmv_thread_V.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) chpmv_thread_M.$(SUFFIX) chpmv_thread_M.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) zhpmv_thread_U.$(SUFFIX) zhpmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $(@F) zhpmv_thread_L.$(SUFFIX) zhpmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV $< -o $(@F) zhpmv_thread_V.$(SUFFIX) zhpmv_thread_V.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) zhpmv_thread_M.$(SUFFIX) zhpmv_thread_M.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) xhpmv_thread_U.$(SUFFIX) xhpmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $(@F) xhpmv_thread_L.$(SUFFIX) xhpmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV $< -o $(@F) xhpmv_thread_V.$(SUFFIX) xhpmv_thread_V.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) xhpmv_thread_M.$(SUFFIX) xhpmv_thread_M.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) chpr_U.$(SUFFIX) chpr_U.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER $< -o $(@F) chpr_L.$(SUFFIX) chpr_L.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER $< -o $(@F) chpr_V.$(SUFFIX) chpr_V.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) chpr_M.$(SUFFIX) chpr_M.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) zhpr_U.$(SUFFIX) zhpr_U.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER $< -o $(@F) zhpr_L.$(SUFFIX) zhpr_L.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER $< -o $(@F) zhpr_V.$(SUFFIX) zhpr_V.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) zhpr_M.$(SUFFIX) zhpr_M.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) xhpr_U.$(SUFFIX) xhpr_U.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER 
$< -o $(@F) xhpr_L.$(SUFFIX) xhpr_L.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER $< -o $(@F) xhpr_V.$(SUFFIX) xhpr_V.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) xhpr_M.$(SUFFIX) xhpr_M.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) chpr_thread_U.$(SUFFIX) chpr_thread_U.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER -DHEMV $< -o $(@F) chpr_thread_L.$(SUFFIX) chpr_thread_L.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER -DHEMV $< -o $(@F) chpr_thread_V.$(SUFFIX) chpr_thread_V.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) chpr_thread_M.$(SUFFIX) chpr_thread_M.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) zhpr_thread_U.$(SUFFIX) zhpr_thread_U.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER -DHEMV $< -o $(@F) zhpr_thread_L.$(SUFFIX) zhpr_thread_L.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER -DHEMV $< -o $(@F) zhpr_thread_V.$(SUFFIX) zhpr_thread_V.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) zhpr_thread_M.$(SUFFIX) zhpr_thread_M.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) xhpr_thread_U.$(SUFFIX) xhpr_thread_U.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER -DHEMV $< -o $(@F) xhpr_thread_L.$(SUFFIX) xhpr_thread_L.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER -DHEMV $< -o $(@F) xhpr_thread_V.$(SUFFIX) xhpr_thread_V.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) xhpr_thread_M.$(SUFFIX) xhpr_thread_M.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) chpr2_U.$(SUFFIX) chpr2_U.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) chpr2_L.$(SUFFIX) chpr2_L.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) chpr2_V.$(SUFFIX) chpr2_V.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) chpr2_M.$(SUFFIX) chpr2_M.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) zhpr2_U.$(SUFFIX) zhpr2_U.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) zhpr2_L.$(SUFFIX) zhpr2_L.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) zhpr2_V.$(SUFFIX) zhpr2_V.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) zhpr2_M.$(SUFFIX) zhpr2_M.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) xhpr2_U.$(SUFFIX) xhpr2_U.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) xhpr2_L.$(SUFFIX) xhpr2_L.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) xhpr2_V.$(SUFFIX) xhpr2_V.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) xhpr2_M.$(SUFFIX) xhpr2_M.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -DHEMVREV -o $(@F) chpr2_thread_U.$(SUFFIX) 
chpr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DHEMV $< -o $(@F) chpr2_thread_L.$(SUFFIX) chpr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DHEMV $< -o $(@F) chpr2_thread_V.$(SUFFIX) chpr2_thread_V.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) chpr2_thread_M.$(SUFFIX) chpr2_thread_M.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) zhpr2_thread_U.$(SUFFIX) zhpr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DHEMV $< -o $(@F) zhpr2_thread_L.$(SUFFIX) zhpr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DHEMV $< -o $(@F) zhpr2_thread_V.$(SUFFIX) zhpr2_thread_V.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) zhpr2_thread_M.$(SUFFIX) zhpr2_thread_M.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) xhpr2_thread_U.$(SUFFIX) xhpr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DHEMV $< -o $(@F) xhpr2_thread_L.$(SUFFIX) xhpr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DHEMV $< -o $(@F) xhpr2_thread_V.$(SUFFIX) xhpr2_thread_V.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) xhpr2_thread_M.$(SUFFIX) xhpr2_thread_M.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -DHEMVREV -o $(@F) ssbmv_U.$(SUFFIX) ssbmv_U.$(PSUFFIX) : sbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) ssbmv_L.$(SUFFIX) ssbmv_L.$(PSUFFIX) : sbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) dsbmv_U.$(SUFFIX) dsbmv_U.$(PSUFFIX) : sbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) dsbmv_L.$(SUFFIX) dsbmv_L.$(PSUFFIX) : sbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) qsbmv_U.$(SUFFIX) qsbmv_U.$(PSUFFIX) : sbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) qsbmv_L.$(SUFFIX) qsbmv_L.$(PSUFFIX) : sbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) csbmv_U.$(SUFFIX) csbmv_U.$(PSUFFIX) : zsbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) csbmv_L.$(SUFFIX) csbmv_L.$(PSUFFIX) : zsbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) zsbmv_U.$(SUFFIX) zsbmv_U.$(PSUFFIX) : zsbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) zsbmv_L.$(SUFFIX) zsbmv_L.$(PSUFFIX) : zsbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) xsbmv_U.$(SUFFIX) xsbmv_U.$(PSUFFIX) : zsbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) xsbmv_L.$(SUFFIX) xsbmv_L.$(PSUFFIX) : zsbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) ssbmv_thread_U.$(SUFFIX) ssbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) ssbmv_thread_L.$(SUFFIX) ssbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) dsbmv_thread_U.$(SUFFIX) dsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c 
../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) dsbmv_thread_L.$(SUFFIX) dsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) qsbmv_thread_U.$(SUFFIX) qsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) qsbmv_thread_L.$(SUFFIX) qsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) csbmv_thread_U.$(SUFFIX) csbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) csbmv_thread_L.$(SUFFIX) csbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) zsbmv_thread_U.$(SUFFIX) zsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) zsbmv_thread_L.$(SUFFIX) zsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) xsbmv_thread_U.$(SUFFIX) xsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) xsbmv_thread_L.$(SUFFIX) xsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) sspmv_U.$(SUFFIX) sspmv_U.$(PSUFFIX) : spmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) sspmv_L.$(SUFFIX) sspmv_L.$(PSUFFIX) : spmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) dspmv_U.$(SUFFIX) dspmv_U.$(PSUFFIX) : spmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) dspmv_L.$(SUFFIX) dspmv_L.$(PSUFFIX) : spmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) qspmv_U.$(SUFFIX) qspmv_U.$(PSUFFIX) : spmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) qspmv_L.$(SUFFIX) qspmv_L.$(PSUFFIX) : spmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) cspmv_U.$(SUFFIX) cspmv_U.$(PSUFFIX) : zspmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) cspmv_L.$(SUFFIX) cspmv_L.$(PSUFFIX) : zspmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) zspmv_U.$(SUFFIX) zspmv_U.$(PSUFFIX) : zspmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) zspmv_L.$(SUFFIX) zspmv_L.$(PSUFFIX) : zspmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) xspmv_U.$(SUFFIX) xspmv_U.$(PSUFFIX) : zspmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) xspmv_L.$(SUFFIX) xspmv_L.$(PSUFFIX) : zspmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) sspmv_thread_U.$(SUFFIX) sspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) sspmv_thread_L.$(SUFFIX) sspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) dspmv_thread_U.$(SUFFIX) dspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) dspmv_thread_L.$(SUFFIX) dspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) qspmv_thread_U.$(SUFFIX) qspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) qspmv_thread_L.$(SUFFIX) qspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h 
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) cspmv_thread_U.$(SUFFIX) cspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) cspmv_thread_L.$(SUFFIX) cspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) zspmv_thread_U.$(SUFFIX) zspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) zspmv_thread_L.$(SUFFIX) zspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) xspmv_thread_U.$(SUFFIX) xspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) xspmv_thread_L.$(SUFFIX) xspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) sspr_U.$(SUFFIX) sspr_U.$(PSUFFIX) : spr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) sspr_L.$(SUFFIX) sspr_L.$(PSUFFIX) : spr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) dspr_U.$(SUFFIX) dspr_U.$(PSUFFIX) : spr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) dspr_L.$(SUFFIX) dspr_L.$(PSUFFIX) : spr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) qspr_U.$(SUFFIX) qspr_U.$(PSUFFIX) : spr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) qspr_L.$(SUFFIX) qspr_L.$(PSUFFIX) : spr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) cspr_U.$(SUFFIX) cspr_U.$(PSUFFIX) : zspr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) cspr_L.$(SUFFIX) cspr_L.$(PSUFFIX) : zspr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) zspr_U.$(SUFFIX) zspr_U.$(PSUFFIX) : zspr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) zspr_L.$(SUFFIX) zspr_L.$(PSUFFIX) : zspr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) xspr_U.$(SUFFIX) xspr_U.$(PSUFFIX) : zspr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) xspr_L.$(SUFFIX) xspr_L.$(PSUFFIX) : zspr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) sspr_thread_U.$(SUFFIX) sspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) sspr_thread_L.$(SUFFIX) sspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) dspr_thread_U.$(SUFFIX) dspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) dspr_thread_L.$(SUFFIX) dspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) qspr_thread_U.$(SUFFIX) qspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) qspr_thread_L.$(SUFFIX) qspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) cspr_thread_U.$(SUFFIX) cspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) cspr_thread_L.$(SUFFIX) cspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) zspr_thread_U.$(SUFFIX) zspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) 
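# Naming scheme: the leading letter selects the data type (s/d/q = single/double/
# extended-precision real via -UDOUBLE/-DDOUBLE/-DXDOUBLE, c/z/x = the complex
# counterparts), the trailing _U/_L selects the stored triangle (-ULOWER/-DLOWER),
# and the *_thread targets are the threaded drivers built from the shared
# *_thread.c sources.
# Worked example (assuming the usual SUFFIX = o): dspr_thread_L.o is just
#   $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER spr_thread.c -o dspr_thread_L.o
# i.e. every variant is the same generic source compiled with a different flag set.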
zspr_thread_L.$(SUFFIX) zspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) xspr_thread_U.$(SUFFIX) xspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) xspr_thread_L.$(SUFFIX) xspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) sspr2_U.$(SUFFIX) sspr2_U.$(PSUFFIX) : spr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) sspr2_L.$(SUFFIX) sspr2_L.$(PSUFFIX) : spr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) dspr2_U.$(SUFFIX) dspr2_U.$(PSUFFIX) : spr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) dspr2_L.$(SUFFIX) dspr2_L.$(PSUFFIX) : spr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) qspr2_U.$(SUFFIX) qspr2_U.$(PSUFFIX) : spr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) qspr2_L.$(SUFFIX) qspr2_L.$(PSUFFIX) : spr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) cspr2_U.$(SUFFIX) cspr2_U.$(PSUFFIX) : zspr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) cspr2_L.$(SUFFIX) cspr2_L.$(PSUFFIX) : zspr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) zspr2_U.$(SUFFIX) zspr2_U.$(PSUFFIX) : zspr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) zspr2_L.$(SUFFIX) zspr2_L.$(PSUFFIX) : zspr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) xspr2_U.$(SUFFIX) xspr2_U.$(PSUFFIX) : zspr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) xspr2_L.$(SUFFIX) xspr2_L.$(PSUFFIX) : zspr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) sspr2_thread_U.$(SUFFIX) sspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) sspr2_thread_L.$(SUFFIX) sspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) dspr2_thread_U.$(SUFFIX) dspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) dspr2_thread_L.$(SUFFIX) dspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) qspr2_thread_U.$(SUFFIX) qspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) qspr2_thread_L.$(SUFFIX) qspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) cspr2_thread_U.$(SUFFIX) cspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) cspr2_thread_L.$(SUFFIX) cspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) zspr2_thread_U.$(SUFFIX) zspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) zspr2_thread_L.$(SUFFIX) zspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) xspr2_thread_U.$(SUFFIX) xspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) xspr2_thread_L.$(SUFFIX) xspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) ssyr_U.$(SUFFIX) 
ssyr_U.$(PSUFFIX) : syr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) ssyr_L.$(SUFFIX) ssyr_L.$(PSUFFIX) : syr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) dsyr_U.$(SUFFIX) dsyr_U.$(PSUFFIX) : syr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) dsyr_L.$(SUFFIX) dsyr_L.$(PSUFFIX) : syr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) qsyr_U.$(SUFFIX) qsyr_U.$(PSUFFIX) : syr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) qsyr_L.$(SUFFIX) qsyr_L.$(PSUFFIX) : syr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) csyr_U.$(SUFFIX) csyr_U.$(PSUFFIX) : zsyr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) csyr_L.$(SUFFIX) csyr_L.$(PSUFFIX) : zsyr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) zsyr_U.$(SUFFIX) zsyr_U.$(PSUFFIX) : zsyr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) zsyr_L.$(SUFFIX) zsyr_L.$(PSUFFIX) : zsyr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) xsyr_U.$(SUFFIX) xsyr_U.$(PSUFFIX) : zsyr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) xsyr_L.$(SUFFIX) xsyr_L.$(PSUFFIX) : zsyr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) ssyr2_U.$(SUFFIX) ssyr2_U.$(PSUFFIX) : syr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) ssyr2_L.$(SUFFIX) ssyr2_L.$(PSUFFIX) : syr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) dsyr2_U.$(SUFFIX) dsyr2_U.$(PSUFFIX) : syr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) dsyr2_L.$(SUFFIX) dsyr2_L.$(PSUFFIX) : syr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) qsyr2_U.$(SUFFIX) qsyr2_U.$(PSUFFIX) : syr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) qsyr2_L.$(SUFFIX) qsyr2_L.$(PSUFFIX) : syr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) csyr2_U.$(SUFFIX) csyr2_U.$(PSUFFIX) : zsyr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) csyr2_L.$(SUFFIX) csyr2_L.$(PSUFFIX) : zsyr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) zsyr2_U.$(SUFFIX) zsyr2_U.$(PSUFFIX) : zsyr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) zsyr2_L.$(SUFFIX) zsyr2_L.$(PSUFFIX) : zsyr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) xsyr2_U.$(SUFFIX) xsyr2_U.$(PSUFFIX) : zsyr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) xsyr2_L.$(SUFFIX) xsyr2_L.$(PSUFFIX) : zsyr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) stbmv_NUU.$(SUFFIX) stbmv_NUU.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) stbmv_NUN.$(SUFFIX) stbmv_NUN.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) stbmv_TLU.$(SUFFIX) stbmv_TLU.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) stbmv_TLN.$(SUFFIX) stbmv_TLN.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) stbmv_NLU.$(SUFFIX) stbmv_NLU.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) stbmv_NLN.$(SUFFIX) stbmv_NLN.$(PSUFFIX) : 
tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) stbmv_TUU.$(SUFFIX) stbmv_TUU.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) stbmv_TUN.$(SUFFIX) stbmv_TUN.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) dtbmv_NUU.$(SUFFIX) dtbmv_NUU.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) dtbmv_NUN.$(SUFFIX) dtbmv_NUN.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) dtbmv_TLU.$(SUFFIX) dtbmv_TLU.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) dtbmv_TLN.$(SUFFIX) dtbmv_TLN.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) dtbmv_NLU.$(SUFFIX) dtbmv_NLU.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) dtbmv_NLN.$(SUFFIX) dtbmv_NLN.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) dtbmv_TUU.$(SUFFIX) dtbmv_TUU.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) dtbmv_TUN.$(SUFFIX) dtbmv_TUN.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) qtbmv_NUU.$(SUFFIX) qtbmv_NUU.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) qtbmv_NUN.$(SUFFIX) qtbmv_NUN.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) qtbmv_TLU.$(SUFFIX) qtbmv_TLU.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) qtbmv_TLN.$(SUFFIX) qtbmv_TLN.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) qtbmv_NLU.$(SUFFIX) qtbmv_NLU.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) qtbmv_NLN.$(SUFFIX) qtbmv_NLN.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) qtbmv_TUU.$(SUFFIX) qtbmv_TUU.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) qtbmv_TUN.$(SUFFIX) qtbmv_TUN.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) ctbmv_NUU.$(SUFFIX) ctbmv_NUU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) ctbmv_NUN.$(SUFFIX) ctbmv_NUN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) ctbmv_TLU.$(SUFFIX) ctbmv_TLU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) ctbmv_TLN.$(SUFFIX) ctbmv_TLN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) ctbmv_RLU.$(SUFFIX) ctbmv_RLU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) ctbmv_RLN.$(SUFFIX) ctbmv_RLN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) ctbmv_CLU.$(SUFFIX) ctbmv_CLU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) ctbmv_CLN.$(SUFFIX) ctbmv_CLN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE 
-DTRANSA=4 -UUNIT $< -o $(@F) ctbmv_NLU.$(SUFFIX) ctbmv_NLU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) ctbmv_NLN.$(SUFFIX) ctbmv_NLN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) ctbmv_TUU.$(SUFFIX) ctbmv_TUU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) ctbmv_TUN.$(SUFFIX) ctbmv_TUN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) ctbmv_RUU.$(SUFFIX) ctbmv_RUU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) ctbmv_RUN.$(SUFFIX) ctbmv_RUN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) ctbmv_CUU.$(SUFFIX) ctbmv_CUU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) ctbmv_CUN.$(SUFFIX) ctbmv_CUN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) ztbmv_NUU.$(SUFFIX) ztbmv_NUU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) ztbmv_NUN.$(SUFFIX) ztbmv_NUN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) ztbmv_TLU.$(SUFFIX) ztbmv_TLU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) ztbmv_TLN.$(SUFFIX) ztbmv_TLN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) ztbmv_RLU.$(SUFFIX) ztbmv_RLU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) ztbmv_RLN.$(SUFFIX) ztbmv_RLN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) ztbmv_CLU.$(SUFFIX) ztbmv_CLU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) ztbmv_CLN.$(SUFFIX) ztbmv_CLN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) ztbmv_NLU.$(SUFFIX) ztbmv_NLU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) ztbmv_NLN.$(SUFFIX) ztbmv_NLN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) ztbmv_TUU.$(SUFFIX) ztbmv_TUU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) ztbmv_TUN.$(SUFFIX) ztbmv_TUN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) ztbmv_RUU.$(SUFFIX) ztbmv_RUU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) ztbmv_RUN.$(SUFFIX) ztbmv_RUN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) ztbmv_CUU.$(SUFFIX) ztbmv_CUU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) ztbmv_CUN.$(SUFFIX) ztbmv_CUN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) xtbmv_NUU.$(SUFFIX) xtbmv_NUU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) xtbmv_NUN.$(SUFFIX) xtbmv_NUN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 
-UUNIT $< -o $(@F) xtbmv_TLU.$(SUFFIX) xtbmv_TLU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) xtbmv_TLN.$(SUFFIX) xtbmv_TLN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) xtbmv_RLU.$(SUFFIX) xtbmv_RLU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) xtbmv_RLN.$(SUFFIX) xtbmv_RLN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) xtbmv_CLU.$(SUFFIX) xtbmv_CLU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) xtbmv_CLN.$(SUFFIX) xtbmv_CLN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) xtbmv_NLU.$(SUFFIX) xtbmv_NLU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) xtbmv_NLN.$(SUFFIX) xtbmv_NLN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) xtbmv_TUU.$(SUFFIX) xtbmv_TUU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) xtbmv_TUN.$(SUFFIX) xtbmv_TUN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) xtbmv_RUU.$(SUFFIX) xtbmv_RUU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) xtbmv_RUN.$(SUFFIX) xtbmv_RUN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) xtbmv_CUU.$(SUFFIX) xtbmv_CUU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) xtbmv_CUN.$(SUFFIX) xtbmv_CUN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) stbmv_thread_NUU.$(SUFFIX) stbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) stbmv_thread_NUN.$(SUFFIX) stbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) stbmv_thread_TLU.$(SUFFIX) stbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) stbmv_thread_TLN.$(SUFFIX) stbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) stbmv_thread_NLU.$(SUFFIX) stbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) stbmv_thread_NLN.$(SUFFIX) stbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) stbmv_thread_TUU.$(SUFFIX) stbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) stbmv_thread_TUN.$(SUFFIX) stbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) dtbmv_thread_NUU.$(SUFFIX) dtbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) dtbmv_thread_NUN.$(SUFFIX) dtbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) 
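# Real tbmv/tbsv/tpmv targets: the N/T in the name maps to -UTRANSA/-DTRANSA and
# the final U/N to -DUNIT/-UUNIT (unit vs. non-unit diagonal); the threaded
# variants additionally pass -ULOWER/-DLOWER for the stored triangle.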
dtbmv_thread_TLU.$(SUFFIX) dtbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) dtbmv_thread_TLN.$(SUFFIX) dtbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) dtbmv_thread_NLU.$(SUFFIX) dtbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) dtbmv_thread_NLN.$(SUFFIX) dtbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) dtbmv_thread_TUU.$(SUFFIX) dtbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) dtbmv_thread_TUN.$(SUFFIX) dtbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) qtbmv_thread_NUU.$(SUFFIX) qtbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) qtbmv_thread_NUN.$(SUFFIX) qtbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) qtbmv_thread_TLU.$(SUFFIX) qtbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) qtbmv_thread_TLN.$(SUFFIX) qtbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) qtbmv_thread_NLU.$(SUFFIX) qtbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) qtbmv_thread_NLN.$(SUFFIX) qtbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) qtbmv_thread_TUU.$(SUFFIX) qtbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) qtbmv_thread_TUN.$(SUFFIX) qtbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) ctbmv_thread_NUU.$(SUFFIX) ctbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) ctbmv_thread_NUN.$(SUFFIX) ctbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) ctbmv_thread_TLU.$(SUFFIX) ctbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) ctbmv_thread_TLN.$(SUFFIX) ctbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) ctbmv_thread_RLU.$(SUFFIX) ctbmv_thread_RLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) ctbmv_thread_RLN.$(SUFFIX) ctbmv_thread_RLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) ctbmv_thread_CLU.$(SUFFIX) ctbmv_thread_CLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) ctbmv_thread_CLN.$(SUFFIX) ctbmv_thread_CLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) 
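# Complex builds encode the op numerically instead: -DTRANSA=1 is N (no transpose),
# 2 is T (transpose), 3 is R (conjugate, no transpose) and 4 is C (conjugate
# transpose).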
ctbmv_thread_NLU.$(SUFFIX) ctbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) ctbmv_thread_NLN.$(SUFFIX) ctbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) ctbmv_thread_TUU.$(SUFFIX) ctbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) ctbmv_thread_TUN.$(SUFFIX) ctbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) ctbmv_thread_RUU.$(SUFFIX) ctbmv_thread_RUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) ctbmv_thread_RUN.$(SUFFIX) ctbmv_thread_RUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) ctbmv_thread_CUU.$(SUFFIX) ctbmv_thread_CUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) ctbmv_thread_CUN.$(SUFFIX) ctbmv_thread_CUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) ztbmv_thread_NUU.$(SUFFIX) ztbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) ztbmv_thread_NUN.$(SUFFIX) ztbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) ztbmv_thread_TLU.$(SUFFIX) ztbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) ztbmv_thread_TLN.$(SUFFIX) ztbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) ztbmv_thread_RLU.$(SUFFIX) ztbmv_thread_RLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) ztbmv_thread_RLN.$(SUFFIX) ztbmv_thread_RLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) ztbmv_thread_CLU.$(SUFFIX) ztbmv_thread_CLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) ztbmv_thread_CLN.$(SUFFIX) ztbmv_thread_CLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) ztbmv_thread_NLU.$(SUFFIX) ztbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) ztbmv_thread_NLN.$(SUFFIX) ztbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) ztbmv_thread_TUU.$(SUFFIX) ztbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) ztbmv_thread_TUN.$(SUFFIX) ztbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) ztbmv_thread_RUU.$(SUFFIX) ztbmv_thread_RUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) ztbmv_thread_RUN.$(SUFFIX) ztbmv_thread_RUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< 
-o $(@F) ztbmv_thread_CUU.$(SUFFIX) ztbmv_thread_CUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) ztbmv_thread_CUN.$(SUFFIX) ztbmv_thread_CUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) xtbmv_thread_NUU.$(SUFFIX) xtbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) xtbmv_thread_NUN.$(SUFFIX) xtbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) xtbmv_thread_TLU.$(SUFFIX) xtbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) xtbmv_thread_TLN.$(SUFFIX) xtbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) xtbmv_thread_RLU.$(SUFFIX) xtbmv_thread_RLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) xtbmv_thread_RLN.$(SUFFIX) xtbmv_thread_RLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) xtbmv_thread_CLN.$(SUFFIX) xtbmv_thread_CLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) xtbmv_thread_NLU.$(SUFFIX) xtbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) xtbmv_thread_NLN.$(SUFFIX) xtbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) xtbmv_thread_TUU.$(SUFFIX) xtbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) xtbmv_thread_TUN.$(SUFFIX) xtbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) xtbmv_thread_RUU.$(SUFFIX) xtbmv_thread_RUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) xtbmv_thread_RUN.$(SUFFIX) xtbmv_thread_RUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) xtbmv_thread_CUU.$(SUFFIX) xtbmv_thread_CUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) xtbmv_thread_CUN.$(SUFFIX) xtbmv_thread_CUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) stbsv_NUU.$(SUFFIX) stbsv_NUU.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) stbsv_NUN.$(SUFFIX) stbsv_NUN.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) stbsv_TLU.$(SUFFIX) stbsv_TLU.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) stbsv_TLN.$(SUFFIX) stbsv_TLN.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) stbsv_NLU.$(SUFFIX) stbsv_NLU.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c 
$(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) stbsv_NLN.$(SUFFIX) stbsv_NLN.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) stbsv_TUU.$(SUFFIX) stbsv_TUU.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) stbsv_TUN.$(SUFFIX) stbsv_TUN.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) dtbsv_NUU.$(SUFFIX) dtbsv_NUU.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) dtbsv_NUN.$(SUFFIX) dtbsv_NUN.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) dtbsv_TLU.$(SUFFIX) dtbsv_TLU.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) dtbsv_TLN.$(SUFFIX) dtbsv_TLN.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) dtbsv_NLU.$(SUFFIX) dtbsv_NLU.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) dtbsv_NLN.$(SUFFIX) dtbsv_NLN.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) dtbsv_TUU.$(SUFFIX) dtbsv_TUU.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) dtbsv_TUN.$(SUFFIX) dtbsv_TUN.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) qtbsv_NUU.$(SUFFIX) qtbsv_NUU.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) qtbsv_NUN.$(SUFFIX) qtbsv_NUN.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) qtbsv_TLU.$(SUFFIX) qtbsv_TLU.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) qtbsv_TLN.$(SUFFIX) qtbsv_TLN.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) qtbsv_NLU.$(SUFFIX) qtbsv_NLU.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) qtbsv_NLN.$(SUFFIX) qtbsv_NLN.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) qtbsv_TUU.$(SUFFIX) qtbsv_TUU.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) qtbsv_TUN.$(SUFFIX) qtbsv_TUN.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) ctbsv_NUU.$(SUFFIX) ctbsv_NUU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) ctbsv_NUN.$(SUFFIX) ctbsv_NUN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) ctbsv_TLU.$(SUFFIX) ctbsv_TLU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) ctbsv_TLN.$(SUFFIX) ctbsv_TLN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) ctbsv_RLU.$(SUFFIX) ctbsv_RLU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) ctbsv_RLN.$(SUFFIX) ctbsv_RLN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) ctbsv_CLU.$(SUFFIX) ctbsv_CLU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) 
ctbsv_CLN.$(SUFFIX) ctbsv_CLN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) ctbsv_NLU.$(SUFFIX) ctbsv_NLU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) ctbsv_NLN.$(SUFFIX) ctbsv_NLN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) ctbsv_TUU.$(SUFFIX) ctbsv_TUU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) ctbsv_TUN.$(SUFFIX) ctbsv_TUN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) ctbsv_RUU.$(SUFFIX) ctbsv_RUU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) ctbsv_RUN.$(SUFFIX) ctbsv_RUN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) ctbsv_CUU.$(SUFFIX) ctbsv_CUU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) ctbsv_CUN.$(SUFFIX) ctbsv_CUN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) ztbsv_NUU.$(SUFFIX) ztbsv_NUU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) ztbsv_NUN.$(SUFFIX) ztbsv_NUN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) ztbsv_TLU.$(SUFFIX) ztbsv_TLU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) ztbsv_TLN.$(SUFFIX) ztbsv_TLN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) ztbsv_RLU.$(SUFFIX) ztbsv_RLU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) ztbsv_RLN.$(SUFFIX) ztbsv_RLN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) ztbsv_CLU.$(SUFFIX) ztbsv_CLU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) ztbsv_CLN.$(SUFFIX) ztbsv_CLN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) ztbsv_NLU.$(SUFFIX) ztbsv_NLU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) ztbsv_NLN.$(SUFFIX) ztbsv_NLN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) ztbsv_TUU.$(SUFFIX) ztbsv_TUU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) ztbsv_TUN.$(SUFFIX) ztbsv_TUN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) ztbsv_RUU.$(SUFFIX) ztbsv_RUU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) ztbsv_RUN.$(SUFFIX) ztbsv_RUN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) ztbsv_CUU.$(SUFFIX) ztbsv_CUU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) ztbsv_CUN.$(SUFFIX) ztbsv_CUN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) xtbsv_NUU.$(SUFFIX) xtbsv_NUU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) 
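# Note: in the tbmv/tbsv/tpmv families the source file follows the effective sweep
# direction rather than the uplo letter by itself: N/R with upper and T/C with
# lower build from the *_U.c kernel, the opposite combinations from the *_L.c
# kernel.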
xtbsv_NUN.$(SUFFIX) xtbsv_NUN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) xtbsv_TLU.$(SUFFIX) xtbsv_TLU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) xtbsv_TLN.$(SUFFIX) xtbsv_TLN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) xtbsv_RLU.$(SUFFIX) xtbsv_RLU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) xtbsv_RLN.$(SUFFIX) xtbsv_RLN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) xtbsv_CLU.$(SUFFIX) xtbsv_CLU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) xtbsv_CLN.$(SUFFIX) xtbsv_CLN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) xtbsv_NLU.$(SUFFIX) xtbsv_NLU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) xtbsv_NLN.$(SUFFIX) xtbsv_NLN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) xtbsv_TUU.$(SUFFIX) xtbsv_TUU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) xtbsv_TUN.$(SUFFIX) xtbsv_TUN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) xtbsv_RUU.$(SUFFIX) xtbsv_RUU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) xtbsv_RUN.$(SUFFIX) xtbsv_RUN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) xtbsv_CUU.$(SUFFIX) xtbsv_CUU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) xtbsv_CUN.$(SUFFIX) xtbsv_CUN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) stpmv_NUU.$(SUFFIX) stpmv_NUU.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) stpmv_NUN.$(SUFFIX) stpmv_NUN.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) stpmv_TLU.$(SUFFIX) stpmv_TLU.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) stpmv_TLN.$(SUFFIX) stpmv_TLN.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) stpmv_NLU.$(SUFFIX) stpmv_NLU.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) stpmv_NLN.$(SUFFIX) stpmv_NLN.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) stpmv_TUU.$(SUFFIX) stpmv_TUU.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) stpmv_TUN.$(SUFFIX) stpmv_TUN.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) dtpmv_NUU.$(SUFFIX) dtpmv_NUU.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) dtpmv_NUN.$(SUFFIX) dtpmv_NUN.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) dtpmv_TLU.$(SUFFIX) dtpmv_TLU.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) dtpmv_TLN.$(SUFFIX) 
dtpmv_TLN.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) dtpmv_NLU.$(SUFFIX) dtpmv_NLU.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) dtpmv_NLN.$(SUFFIX) dtpmv_NLN.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) dtpmv_TUU.$(SUFFIX) dtpmv_TUU.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) dtpmv_TUN.$(SUFFIX) dtpmv_TUN.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) qtpmv_NUU.$(SUFFIX) qtpmv_NUU.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) qtpmv_NUN.$(SUFFIX) qtpmv_NUN.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) qtpmv_TLU.$(SUFFIX) qtpmv_TLU.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) qtpmv_TLN.$(SUFFIX) qtpmv_TLN.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) qtpmv_NLU.$(SUFFIX) qtpmv_NLU.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) qtpmv_NLN.$(SUFFIX) qtpmv_NLN.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) qtpmv_TUU.$(SUFFIX) qtpmv_TUU.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) qtpmv_TUN.$(SUFFIX) qtpmv_TUN.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) ctpmv_NUU.$(SUFFIX) ctpmv_NUU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) ctpmv_NUN.$(SUFFIX) ctpmv_NUN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) ctpmv_TLU.$(SUFFIX) ctpmv_TLU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) ctpmv_TLN.$(SUFFIX) ctpmv_TLN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) ctpmv_RLU.$(SUFFIX) ctpmv_RLU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) ctpmv_RLN.$(SUFFIX) ctpmv_RLN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) ctpmv_CLU.$(SUFFIX) ctpmv_CLU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) ctpmv_CLN.$(SUFFIX) ctpmv_CLN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) ctpmv_NLU.$(SUFFIX) ctpmv_NLU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) ctpmv_NLN.$(SUFFIX) ctpmv_NLN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) ctpmv_TUU.$(SUFFIX) ctpmv_TUU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) ctpmv_TUN.$(SUFFIX) ctpmv_TUN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) ctpmv_RUU.$(SUFFIX) ctpmv_RUU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) ctpmv_RUN.$(SUFFIX) ctpmv_RUN.$(PSUFFIX) : ztpmv_U.c ../../common.h 
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) ctpmv_CUU.$(SUFFIX) ctpmv_CUU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) ctpmv_CUN.$(SUFFIX) ctpmv_CUN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) ztpmv_NUU.$(SUFFIX) ztpmv_NUU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) ztpmv_NUN.$(SUFFIX) ztpmv_NUN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) ztpmv_TLU.$(SUFFIX) ztpmv_TLU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) ztpmv_TLN.$(SUFFIX) ztpmv_TLN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) ztpmv_RLU.$(SUFFIX) ztpmv_RLU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) ztpmv_RLN.$(SUFFIX) ztpmv_RLN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) ztpmv_CLU.$(SUFFIX) ztpmv_CLU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) ztpmv_CLN.$(SUFFIX) ztpmv_CLN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) ztpmv_NLU.$(SUFFIX) ztpmv_NLU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) ztpmv_NLN.$(SUFFIX) ztpmv_NLN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) ztpmv_TUU.$(SUFFIX) ztpmv_TUU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) ztpmv_TUN.$(SUFFIX) ztpmv_TUN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) ztpmv_RUU.$(SUFFIX) ztpmv_RUU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) ztpmv_RUN.$(SUFFIX) ztpmv_RUN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) ztpmv_CUU.$(SUFFIX) ztpmv_CUU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) ztpmv_CUN.$(SUFFIX) ztpmv_CUN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) xtpmv_NUU.$(SUFFIX) xtpmv_NUU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) xtpmv_NUN.$(SUFFIX) xtpmv_NUN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) xtpmv_TLU.$(SUFFIX) xtpmv_TLU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) xtpmv_TLN.$(SUFFIX) xtpmv_TLN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) xtpmv_RLU.$(SUFFIX) xtpmv_RLU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) xtpmv_RLN.$(SUFFIX) xtpmv_RLN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) xtpmv_CLU.$(SUFFIX) xtpmv_CLU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) xtpmv_CLN.$(SUFFIX) xtpmv_CLN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c 
$(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) xtpmv_NLU.$(SUFFIX) xtpmv_NLU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) xtpmv_NLN.$(SUFFIX) xtpmv_NLN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) xtpmv_TUU.$(SUFFIX) xtpmv_TUU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) xtpmv_TUN.$(SUFFIX) xtpmv_TUN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) xtpmv_RUU.$(SUFFIX) xtpmv_RUU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) xtpmv_RUN.$(SUFFIX) xtpmv_RUN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) xtpmv_CUU.$(SUFFIX) xtpmv_CUU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) xtpmv_CUN.$(SUFFIX) xtpmv_CUN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) stpmv_thread_NUU.$(SUFFIX) stpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) stpmv_thread_NUN.$(SUFFIX) stpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) stpmv_thread_TLU.$(SUFFIX) stpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) stpmv_thread_TLN.$(SUFFIX) stpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) stpmv_thread_NLU.$(SUFFIX) stpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) stpmv_thread_NLN.$(SUFFIX) stpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) stpmv_thread_TUU.$(SUFFIX) stpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) stpmv_thread_TUN.$(SUFFIX) stpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) dtpmv_thread_NUU.$(SUFFIX) dtpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) dtpmv_thread_NUN.$(SUFFIX) dtpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) dtpmv_thread_TLU.$(SUFFIX) dtpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) dtpmv_thread_TLN.$(SUFFIX) dtpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) dtpmv_thread_NLU.$(SUFFIX) dtpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) dtpmv_thread_NLN.$(SUFFIX) dtpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) dtpmv_thread_TUU.$(SUFFIX) dtpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -DUNIT $< -o 
$(@F) dtpmv_thread_TUN.$(SUFFIX) dtpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) qtpmv_thread_NUU.$(SUFFIX) qtpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) qtpmv_thread_NUN.$(SUFFIX) qtpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) qtpmv_thread_TLU.$(SUFFIX) qtpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) qtpmv_thread_TLN.$(SUFFIX) qtpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) qtpmv_thread_NLU.$(SUFFIX) qtpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) qtpmv_thread_NLN.$(SUFFIX) qtpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) qtpmv_thread_TUU.$(SUFFIX) qtpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) qtpmv_thread_TUN.$(SUFFIX) qtpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) ctpmv_thread_NUU.$(SUFFIX) ctpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) ctpmv_thread_NUN.$(SUFFIX) ctpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) ctpmv_thread_TLU.$(SUFFIX) ctpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) ctpmv_thread_TLN.$(SUFFIX) ctpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) ctpmv_thread_RLU.$(SUFFIX) ctpmv_thread_RLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) ctpmv_thread_RLN.$(SUFFIX) ctpmv_thread_RLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) ctpmv_thread_CLU.$(SUFFIX) ctpmv_thread_CLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) ctpmv_thread_CLN.$(SUFFIX) ctpmv_thread_CLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) ctpmv_thread_NLU.$(SUFFIX) ctpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) ctpmv_thread_NLN.$(SUFFIX) ctpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) ctpmv_thread_TUU.$(SUFFIX) ctpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) ctpmv_thread_TUN.$(SUFFIX) ctpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) ctpmv_thread_RUU.$(SUFFIX) ctpmv_thread_RUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o 
$(@F) ctpmv_thread_RUN.$(SUFFIX) ctpmv_thread_RUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) ctpmv_thread_CUU.$(SUFFIX) ctpmv_thread_CUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) ctpmv_thread_CUN.$(SUFFIX) ctpmv_thread_CUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) ztpmv_thread_NUU.$(SUFFIX) ztpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) ztpmv_thread_NUN.$(SUFFIX) ztpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) ztpmv_thread_TLU.$(SUFFIX) ztpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) ztpmv_thread_TLN.$(SUFFIX) ztpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) ztpmv_thread_RLU.$(SUFFIX) ztpmv_thread_RLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) ztpmv_thread_RLN.$(SUFFIX) ztpmv_thread_RLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) ztpmv_thread_CLU.$(SUFFIX) ztpmv_thread_CLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) ztpmv_thread_CLN.$(SUFFIX) ztpmv_thread_CLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) ztpmv_thread_NLU.$(SUFFIX) ztpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) ztpmv_thread_NLN.$(SUFFIX) ztpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) ztpmv_thread_TUU.$(SUFFIX) ztpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) ztpmv_thread_TUN.$(SUFFIX) ztpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) ztpmv_thread_RUU.$(SUFFIX) ztpmv_thread_RUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) ztpmv_thread_RUN.$(SUFFIX) ztpmv_thread_RUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) ztpmv_thread_CUU.$(SUFFIX) ztpmv_thread_CUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) ztpmv_thread_CUN.$(SUFFIX) ztpmv_thread_CUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) xtpmv_thread_NUU.$(SUFFIX) xtpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) xtpmv_thread_NUN.$(SUFFIX) xtpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) xtpmv_thread_TLU.$(SUFFIX) xtpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 
-DUNIT $< -o $(@F) xtpmv_thread_TLN.$(SUFFIX) xtpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) xtpmv_thread_RLU.$(SUFFIX) xtpmv_thread_RLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) xtpmv_thread_RLN.$(SUFFIX) xtpmv_thread_RLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) xtpmv_thread_CLU.$(SUFFIX) xtpmv_thread_CLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) xtpmv_thread_CLN.$(SUFFIX) xtpmv_thread_CLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) xtpmv_thread_NLU.$(SUFFIX) xtpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) xtpmv_thread_NLN.$(SUFFIX) xtpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) xtpmv_thread_TUU.$(SUFFIX) xtpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) xtpmv_thread_TUN.$(SUFFIX) xtpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) xtpmv_thread_RUU.$(SUFFIX) xtpmv_thread_RUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) xtpmv_thread_RUN.$(SUFFIX) xtpmv_thread_RUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) xtpmv_thread_CUU.$(SUFFIX) xtpmv_thread_CUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) xtpmv_thread_CUN.$(SUFFIX) xtpmv_thread_CUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) stpsv_NUU.$(SUFFIX) stpsv_NUU.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) stpsv_NUN.$(SUFFIX) stpsv_NUN.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) stpsv_TLU.$(SUFFIX) stpsv_TLU.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) stpsv_TLN.$(SUFFIX) stpsv_TLN.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) stpsv_NLU.$(SUFFIX) stpsv_NLU.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) stpsv_NLN.$(SUFFIX) stpsv_NLN.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) stpsv_TUU.$(SUFFIX) stpsv_TUU.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) stpsv_TUN.$(SUFFIX) stpsv_TUN.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) dtpsv_NUU.$(SUFFIX) dtpsv_NUU.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) dtpsv_NUN.$(SUFFIX) dtpsv_NUN.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) dtpsv_TLU.$(SUFFIX) dtpsv_TLU.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) dtpsv_TLN.$(SUFFIX) dtpsv_TLN.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c 
$(CFLAGS) -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) dtpsv_NLU.$(SUFFIX) dtpsv_NLU.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) dtpsv_NLN.$(SUFFIX) dtpsv_NLN.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) dtpsv_TUU.$(SUFFIX) dtpsv_TUU.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) dtpsv_TUN.$(SUFFIX) dtpsv_TUN.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) qtpsv_NUU.$(SUFFIX) qtpsv_NUU.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) qtpsv_NUN.$(SUFFIX) qtpsv_NUN.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) qtpsv_TLU.$(SUFFIX) qtpsv_TLU.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) qtpsv_TLN.$(SUFFIX) qtpsv_TLN.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) qtpsv_NLU.$(SUFFIX) qtpsv_NLU.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) qtpsv_NLN.$(SUFFIX) qtpsv_NLN.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) qtpsv_TUU.$(SUFFIX) qtpsv_TUU.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) qtpsv_TUN.$(SUFFIX) qtpsv_TUN.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) ctpsv_NUU.$(SUFFIX) ctpsv_NUU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) ctpsv_NUN.$(SUFFIX) ctpsv_NUN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) ctpsv_TLU.$(SUFFIX) ctpsv_TLU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) ctpsv_TLN.$(SUFFIX) ctpsv_TLN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) ctpsv_RLU.$(SUFFIX) ctpsv_RLU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) ctpsv_RLN.$(SUFFIX) ctpsv_RLN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) ctpsv_CLU.$(SUFFIX) ctpsv_CLU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) ctpsv_CLN.$(SUFFIX) ctpsv_CLN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) ctpsv_NLU.$(SUFFIX) ctpsv_NLU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) ctpsv_NLN.$(SUFFIX) ctpsv_NLN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) ctpsv_TUU.$(SUFFIX) ctpsv_TUU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) ctpsv_TUN.$(SUFFIX) ctpsv_TUN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) ctpsv_RUU.$(SUFFIX) ctpsv_RUU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) ctpsv_RUN.$(SUFFIX) ctpsv_RUN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) ctpsv_CUU.$(SUFFIX) ctpsv_CUU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) 
ctpsv_CUN.$(SUFFIX) ctpsv_CUN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) ztpsv_NUU.$(SUFFIX) ztpsv_NUU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) ztpsv_NUN.$(SUFFIX) ztpsv_NUN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) ztpsv_TLU.$(SUFFIX) ztpsv_TLU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) ztpsv_TLN.$(SUFFIX) ztpsv_TLN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) ztpsv_RLU.$(SUFFIX) ztpsv_RLU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) ztpsv_RLN.$(SUFFIX) ztpsv_RLN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) ztpsv_CLU.$(SUFFIX) ztpsv_CLU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) ztpsv_CLN.$(SUFFIX) ztpsv_CLN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) ztpsv_NLU.$(SUFFIX) ztpsv_NLU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) ztpsv_NLN.$(SUFFIX) ztpsv_NLN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) ztpsv_TUU.$(SUFFIX) ztpsv_TUU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) ztpsv_TUN.$(SUFFIX) ztpsv_TUN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) ztpsv_RUU.$(SUFFIX) ztpsv_RUU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) ztpsv_RUN.$(SUFFIX) ztpsv_RUN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) ztpsv_CUU.$(SUFFIX) ztpsv_CUU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) ztpsv_CUN.$(SUFFIX) ztpsv_CUN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) xtpsv_NUU.$(SUFFIX) xtpsv_NUU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) xtpsv_NUN.$(SUFFIX) xtpsv_NUN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) xtpsv_TLU.$(SUFFIX) xtpsv_TLU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) xtpsv_TLN.$(SUFFIX) xtpsv_TLN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) xtpsv_RLU.$(SUFFIX) xtpsv_RLU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) xtpsv_RLN.$(SUFFIX) xtpsv_RLN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) xtpsv_CLU.$(SUFFIX) xtpsv_CLU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) xtpsv_CLN.$(SUFFIX) xtpsv_CLN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) xtpsv_NLU.$(SUFFIX) xtpsv_NLU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) xtpsv_NLN.$(SUFFIX) 
xtpsv_NLN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) xtpsv_TUU.$(SUFFIX) xtpsv_TUU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) xtpsv_TUN.$(SUFFIX) xtpsv_TUN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) xtpsv_RUU.$(SUFFIX) xtpsv_RUU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) xtpsv_RUN.$(SUFFIX) xtpsv_RUN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) xtpsv_CUU.$(SUFFIX) xtpsv_CUU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) xtpsv_CUN.$(SUFFIX) xtpsv_CUN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) strmv_NUU.$(SUFFIX) strmv_NUU.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) strmv_NUN.$(SUFFIX) strmv_NUN.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) strmv_TLU.$(SUFFIX) strmv_TLU.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) strmv_TLN.$(SUFFIX) strmv_TLN.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) strmv_NLU.$(SUFFIX) strmv_NLU.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) strmv_NLN.$(SUFFIX) strmv_NLN.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) strmv_TUU.$(SUFFIX) strmv_TUU.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) strmv_TUN.$(SUFFIX) strmv_TUN.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) dtrmv_NUU.$(SUFFIX) dtrmv_NUU.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) dtrmv_NUN.$(SUFFIX) dtrmv_NUN.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) dtrmv_TLU.$(SUFFIX) dtrmv_TLU.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) dtrmv_TLN.$(SUFFIX) dtrmv_TLN.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) dtrmv_NLU.$(SUFFIX) dtrmv_NLU.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) dtrmv_NLN.$(SUFFIX) dtrmv_NLN.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) dtrmv_TUU.$(SUFFIX) dtrmv_TUU.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) dtrmv_TUN.$(SUFFIX) dtrmv_TUN.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) qtrmv_NUU.$(SUFFIX) qtrmv_NUU.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) qtrmv_NUN.$(SUFFIX) qtrmv_NUN.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) qtrmv_TLU.$(SUFFIX) qtrmv_TLU.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) qtrmv_TLN.$(SUFFIX) qtrmv_TLN.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) 
-UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) qtrmv_NLU.$(SUFFIX) qtrmv_NLU.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) qtrmv_NLN.$(SUFFIX) qtrmv_NLN.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) qtrmv_TUU.$(SUFFIX) qtrmv_TUU.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) qtrmv_TUN.$(SUFFIX) qtrmv_TUN.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) ctrmv_NUU.$(SUFFIX) ctrmv_NUU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) ctrmv_NUN.$(SUFFIX) ctrmv_NUN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) ctrmv_TLU.$(SUFFIX) ctrmv_TLU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) ctrmv_TLN.$(SUFFIX) ctrmv_TLN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) ctrmv_RLU.$(SUFFIX) ctrmv_RLU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) ctrmv_RLN.$(SUFFIX) ctrmv_RLN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) ctrmv_CLU.$(SUFFIX) ctrmv_CLU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) ctrmv_CLN.$(SUFFIX) ctrmv_CLN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) ctrmv_NLU.$(SUFFIX) ctrmv_NLU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) ctrmv_NLN.$(SUFFIX) ctrmv_NLN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) ctrmv_TUU.$(SUFFIX) ctrmv_TUU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) ctrmv_TUN.$(SUFFIX) ctrmv_TUN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) ctrmv_RUU.$(SUFFIX) ctrmv_RUU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) ctrmv_RUN.$(SUFFIX) ctrmv_RUN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) ctrmv_CUU.$(SUFFIX) ctrmv_CUU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) ctrmv_CUN.$(SUFFIX) ctrmv_CUN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) ztrmv_NUU.$(SUFFIX) ztrmv_NUU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) ztrmv_NUN.$(SUFFIX) ztrmv_NUN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) ztrmv_TLU.$(SUFFIX) ztrmv_TLU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) ztrmv_TLN.$(SUFFIX) ztrmv_TLN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) ztrmv_RLU.$(SUFFIX) ztrmv_RLU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) ztrmv_RLN.$(SUFFIX) ztrmv_RLN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE 
-DTRANSA=3 -UUNIT $< -o $(@F) ztrmv_CLU.$(SUFFIX) ztrmv_CLU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) ztrmv_CLN.$(SUFFIX) ztrmv_CLN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) ztrmv_NLU.$(SUFFIX) ztrmv_NLU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) ztrmv_NLN.$(SUFFIX) ztrmv_NLN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) ztrmv_TUU.$(SUFFIX) ztrmv_TUU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) ztrmv_TUN.$(SUFFIX) ztrmv_TUN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) ztrmv_RUU.$(SUFFIX) ztrmv_RUU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) ztrmv_RUN.$(SUFFIX) ztrmv_RUN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) ztrmv_CUU.$(SUFFIX) ztrmv_CUU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) ztrmv_CUN.$(SUFFIX) ztrmv_CUN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) xtrmv_NUU.$(SUFFIX) xtrmv_NUU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) xtrmv_NUN.$(SUFFIX) xtrmv_NUN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) xtrmv_TLU.$(SUFFIX) xtrmv_TLU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) xtrmv_TLN.$(SUFFIX) xtrmv_TLN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) xtrmv_RLU.$(SUFFIX) xtrmv_RLU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) xtrmv_RLN.$(SUFFIX) xtrmv_RLN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) xtrmv_CLU.$(SUFFIX) xtrmv_CLU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) xtrmv_CLN.$(SUFFIX) xtrmv_CLN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) xtrmv_NLU.$(SUFFIX) xtrmv_NLU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) xtrmv_NLN.$(SUFFIX) xtrmv_NLN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) xtrmv_TUU.$(SUFFIX) xtrmv_TUU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) xtrmv_TUN.$(SUFFIX) xtrmv_TUN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) xtrmv_RUU.$(SUFFIX) xtrmv_RUU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) xtrmv_RUN.$(SUFFIX) xtrmv_RUN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) xtrmv_CUU.$(SUFFIX) xtrmv_CUU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) xtrmv_CUN.$(SUFFIX) xtrmv_CUN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE 
-DTRANSA=4 -UUNIT $< -o $(@F) strmv_thread_NUU.$(SUFFIX) strmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) strmv_thread_NUN.$(SUFFIX) strmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) strmv_thread_TLU.$(SUFFIX) strmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) strmv_thread_TLN.$(SUFFIX) strmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) strmv_thread_NLU.$(SUFFIX) strmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) strmv_thread_NLN.$(SUFFIX) strmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) strmv_thread_TUU.$(SUFFIX) strmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) strmv_thread_TUN.$(SUFFIX) strmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) dtrmv_thread_NUU.$(SUFFIX) dtrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) dtrmv_thread_NUN.$(SUFFIX) dtrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) dtrmv_thread_TLU.$(SUFFIX) dtrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) dtrmv_thread_TLN.$(SUFFIX) dtrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) dtrmv_thread_NLU.$(SUFFIX) dtrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) dtrmv_thread_NLN.$(SUFFIX) dtrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) dtrmv_thread_TUU.$(SUFFIX) dtrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) dtrmv_thread_TUN.$(SUFFIX) dtrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) qtrmv_thread_NUU.$(SUFFIX) qtrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) qtrmv_thread_NUN.$(SUFFIX) qtrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) qtrmv_thread_TLU.$(SUFFIX) qtrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) qtrmv_thread_TLN.$(SUFFIX) qtrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) qtrmv_thread_NLU.$(SUFFIX) qtrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) qtrmv_thread_NLN.$(SUFFIX) qtrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -UUNIT $< -o 
$(@F) qtrmv_thread_TUU.$(SUFFIX) qtrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) qtrmv_thread_TUN.$(SUFFIX) qtrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) ctrmv_thread_NUU.$(SUFFIX) ctrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) ctrmv_thread_NUN.$(SUFFIX) ctrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) ctrmv_thread_TLU.$(SUFFIX) ctrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) ctrmv_thread_TLN.$(SUFFIX) ctrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) ctrmv_thread_RLU.$(SUFFIX) ctrmv_thread_RLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) ctrmv_thread_RLN.$(SUFFIX) ctrmv_thread_RLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) ctrmv_thread_CLU.$(SUFFIX) ctrmv_thread_CLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) ctrmv_thread_CLN.$(SUFFIX) ctrmv_thread_CLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) ctrmv_thread_NLU.$(SUFFIX) ctrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) ctrmv_thread_NLN.$(SUFFIX) ctrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) ctrmv_thread_TUU.$(SUFFIX) ctrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) ctrmv_thread_TUN.$(SUFFIX) ctrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) ctrmv_thread_RUU.$(SUFFIX) ctrmv_thread_RUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) ctrmv_thread_RUN.$(SUFFIX) ctrmv_thread_RUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) ctrmv_thread_CUU.$(SUFFIX) ctrmv_thread_CUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) ctrmv_thread_CUN.$(SUFFIX) ctrmv_thread_CUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) ztrmv_thread_NUU.$(SUFFIX) ztrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) ztrmv_thread_NUN.$(SUFFIX) ztrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) ztrmv_thread_TLU.$(SUFFIX) ztrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) ztrmv_thread_TLN.$(SUFFIX) ztrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 
-UUNIT $< -o $(@F) ztrmv_thread_RLU.$(SUFFIX) ztrmv_thread_RLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) ztrmv_thread_RLN.$(SUFFIX) ztrmv_thread_RLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) ztrmv_thread_CLU.$(SUFFIX) ztrmv_thread_CLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) ztrmv_thread_CLN.$(SUFFIX) ztrmv_thread_CLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) ztrmv_thread_NLU.$(SUFFIX) ztrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) ztrmv_thread_NLN.$(SUFFIX) ztrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) ztrmv_thread_TUU.$(SUFFIX) ztrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) ztrmv_thread_TUN.$(SUFFIX) ztrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) ztrmv_thread_RUU.$(SUFFIX) ztrmv_thread_RUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) ztrmv_thread_RUN.$(SUFFIX) ztrmv_thread_RUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) ztrmv_thread_CUU.$(SUFFIX) ztrmv_thread_CUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) ztrmv_thread_CUN.$(SUFFIX) ztrmv_thread_CUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) xtrmv_thread_NUU.$(SUFFIX) xtrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) xtrmv_thread_NUN.$(SUFFIX) xtrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) xtrmv_thread_TLU.$(SUFFIX) xtrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) xtrmv_thread_TLN.$(SUFFIX) xtrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) xtrmv_thread_RLU.$(SUFFIX) xtrmv_thread_RLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) xtrmv_thread_RLN.$(SUFFIX) xtrmv_thread_RLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) xtrmv_thread_CLU.$(SUFFIX) xtrmv_thread_CLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) xtrmv_thread_CLN.$(SUFFIX) xtrmv_thread_CLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) xtrmv_thread_NLU.$(SUFFIX) xtrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) xtrmv_thread_NLN.$(SUFFIX) xtrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE 
-DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) xtrmv_thread_TUU.$(SUFFIX) xtrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) xtrmv_thread_TUN.$(SUFFIX) xtrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) xtrmv_thread_RUU.$(SUFFIX) xtrmv_thread_RUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) xtrmv_thread_RUN.$(SUFFIX) xtrmv_thread_RUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) xtrmv_thread_CUU.$(SUFFIX) xtrmv_thread_CUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) xtrmv_thread_CUN.$(SUFFIX) xtrmv_thread_CUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) strsv_NUU.$(SUFFIX) strsv_NUU.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) strsv_NUN.$(SUFFIX) strsv_NUN.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) strsv_TLU.$(SUFFIX) strsv_TLU.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) strsv_TLN.$(SUFFIX) strsv_TLN.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) strsv_NLU.$(SUFFIX) strsv_NLU.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) strsv_NLN.$(SUFFIX) strsv_NLN.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) strsv_TUU.$(SUFFIX) strsv_TUU.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) strsv_TUN.$(SUFFIX) strsv_TUN.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) dtrsv_NUU.$(SUFFIX) dtrsv_NUU.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) dtrsv_NUN.$(SUFFIX) dtrsv_NUN.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) dtrsv_TLU.$(SUFFIX) dtrsv_TLU.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) dtrsv_TLN.$(SUFFIX) dtrsv_TLN.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) dtrsv_NLU.$(SUFFIX) dtrsv_NLU.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) dtrsv_NLN.$(SUFFIX) dtrsv_NLN.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) dtrsv_TUU.$(SUFFIX) dtrsv_TUU.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) dtrsv_TUN.$(SUFFIX) dtrsv_TUN.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) qtrsv_NUU.$(SUFFIX) qtrsv_NUU.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) qtrsv_NUN.$(SUFFIX) qtrsv_NUN.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) qtrsv_TLU.$(SUFFIX) qtrsv_TLU.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) qtrsv_TLN.$(SUFFIX) qtrsv_TLN.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) qtrsv_NLU.$(SUFFIX) qtrsv_NLU.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE 
-UTRANSA -DUNIT $< -o $(@F) qtrsv_NLN.$(SUFFIX) qtrsv_NLN.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) qtrsv_TUU.$(SUFFIX) qtrsv_TUU.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) qtrsv_TUN.$(SUFFIX) qtrsv_TUN.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) ctrsv_NUU.$(SUFFIX) ctrsv_NUU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) ctrsv_NUN.$(SUFFIX) ctrsv_NUN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) ctrsv_TLU.$(SUFFIX) ctrsv_TLU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) ctrsv_TLN.$(SUFFIX) ctrsv_TLN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) ctrsv_RLU.$(SUFFIX) ctrsv_RLU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) ctrsv_RLN.$(SUFFIX) ctrsv_RLN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) ctrsv_CLU.$(SUFFIX) ctrsv_CLU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) ctrsv_CLN.$(SUFFIX) ctrsv_CLN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) ctrsv_NLU.$(SUFFIX) ctrsv_NLU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) ctrsv_NLN.$(SUFFIX) ctrsv_NLN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) ctrsv_TUU.$(SUFFIX) ctrsv_TUU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) ctrsv_TUN.$(SUFFIX) ctrsv_TUN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) ctrsv_RUU.$(SUFFIX) ctrsv_RUU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) ctrsv_RUN.$(SUFFIX) ctrsv_RUN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) ctrsv_CUU.$(SUFFIX) ctrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) ctrsv_CUN.$(SUFFIX) ctrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) ztrsv_NUU.$(SUFFIX) ztrsv_NUU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) ztrsv_NUN.$(SUFFIX) ztrsv_NUN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) ztrsv_TLU.$(SUFFIX) ztrsv_TLU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) ztrsv_TLN.$(SUFFIX) ztrsv_TLN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) ztrsv_RLU.$(SUFFIX) ztrsv_RLU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) ztrsv_RLN.$(SUFFIX) ztrsv_RLN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) ztrsv_CLU.$(SUFFIX) ztrsv_CLU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) ztrsv_CLN.$(SUFFIX) ztrsv_CLN.$(PSUFFIX) : ztrsv_U.c 
../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) ztrsv_NLU.$(SUFFIX) ztrsv_NLU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) ztrsv_NLN.$(SUFFIX) ztrsv_NLN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) ztrsv_TUU.$(SUFFIX) ztrsv_TUU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) ztrsv_TUN.$(SUFFIX) ztrsv_TUN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) ztrsv_RUU.$(SUFFIX) ztrsv_RUU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) ztrsv_RUN.$(SUFFIX) ztrsv_RUN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) ztrsv_CUU.$(SUFFIX) ztrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) ztrsv_CUN.$(SUFFIX) ztrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) xtrsv_NUU.$(SUFFIX) xtrsv_NUU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) xtrsv_NUN.$(SUFFIX) xtrsv_NUN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) xtrsv_TLU.$(SUFFIX) xtrsv_TLU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) xtrsv_TLN.$(SUFFIX) xtrsv_TLN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) xtrsv_RLU.$(SUFFIX) xtrsv_RLU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) xtrsv_RLN.$(SUFFIX) xtrsv_RLN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) xtrsv_CLU.$(SUFFIX) xtrsv_CLU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) xtrsv_CLN.$(SUFFIX) xtrsv_CLN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) xtrsv_NLU.$(SUFFIX) xtrsv_NLU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) xtrsv_NLN.$(SUFFIX) xtrsv_NLN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) xtrsv_TUU.$(SUFFIX) xtrsv_TUU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) xtrsv_TUN.$(SUFFIX) xtrsv_TUN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) xtrsv_RUU.$(SUFFIX) xtrsv_RUU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) xtrsv_RUN.$(SUFFIX) xtrsv_RUN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) xtrsv_CUU.$(SUFFIX) xtrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) xtrsv_CUN.$(SUFFIX) xtrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) include ../../Makefile.tail 
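The rules above all follow one pattern: a handful of generic sources (tpmv_thread.c, tpsv_U.c / tpsv_L.c, ztpsv_U.c / ztpsv_L.c, trmv_U.c / trmv_L.c, ztrmv_U.c / ztrmv_L.c, trmv_thread.c, trsv_U.c / trsv_L.c, ztrsv_U.c / ztrsv_L.c) are each compiled many times, once per BLAS option combination, producing both the regular ($(SUFFIX)) and profiled ($(PSUFFIX)) objects. The object name encodes the variant: the leading letter is the precision (s, d, q for real and c, z, x for complex, selected with -UCOMPLEX / -DCOMPLEX together with -UDOUBLE / -DDOUBLE / -DXDOUBLE), and the trailing letters are transpose (N/T for real kernels via -UTRANSA / -DTRANSA, N/T/R/C for complex kernels via -DTRANSA=1..4), upper or lower storage (U/L, chosen by -ULOWER / -DLOWER for the threaded sources or by picking the _U / _L source for the serial ones), and unit or non-unit diagonal (U/N via -DUNIT / -UUNIT). Note also that the transposed variants swap sources, e.g. strsv_TLU is built from trsv_U.c while strsv_TUU is built from trsv_L.c, since transposition turns a lower-triangular walk into an upper-triangular one. The sketch below is not taken from any of these sources; it is a minimal, hypothetical example of how a single generic file can be specialized by these preprocessor flags, shown only to make the naming scheme concrete.

    /* variant_sketch.c -- hypothetical generic source.  Building it with, e.g.,
     *   cc -c -DDOUBLE -UUNIT variant_sketch.c -o dtpmv_NUN_sketch.o
     * yields one specialization per flag combination, mirroring the rules above.
     * It computes x := A*x for an upper-triangular A stored in packed column-major
     * form (column j holds A(0,j)..A(j,j)), i.e. a plain tpmv 'U','N' case;
     * the TRANSA and LOWER variants are omitted for brevity. */
    #ifdef DOUBLE
    typedef double FLOAT_T;
    #else
    typedef float FLOAT_T;
    #endif

    void variant_sketch(int n, const FLOAT_T *ap, FLOAT_T *x)
    {
      int i, j;
      for (j = 0; j < n; j++) {
        for (i = 0; i < j; i++) x[i] += ap[i] * x[j];  /* add x[j] * A(0..j-1, j) */
    #ifndef UNIT
        x[j] *= ap[j];     /* -UUNIT: scale by the stored diagonal A(j,j) */
    #endif
        /* with -DUNIT the diagonal is taken as 1 and x[j] is left unchanged */
        ap += j + 1;       /* skip the j+1 stored entries of column j */
      }
    }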
OpenBLAS-0.2.20/driver/level2/gbmv_k.c000066400000000000000000000076411313527062700172510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #ifndef TRANS #define M m #define N n #else #define N m #define M n #endif void CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ BLASLONG i, offset_u, offset_l, start, end, length; FLOAT *X = x; FLOAT *Y = y; FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *bufferY = gemvbuffer; FLOAT *bufferX = gemvbuffer; if (incy != 1) { Y = bufferY; bufferX = (FLOAT *)(((BLASLONG)bufferY + M * sizeof(FLOAT) + 4095) & ~4095); gemvbuffer = bufferX; COPY_K(M, y, incy, Y, 1); } if (incx != 1) { X = bufferX; gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + N * sizeof(FLOAT) + 4095) & ~4095); COPY_K(N, x, incx, X, 1); } offset_u = ku; offset_l = ku + m; for (i = 0; i < MIN(n, m + ku); i++) { start = MAX(offset_u, 0); end = MIN(offset_l, ku + kl + 1); length = end - start; #ifndef TRANS AXPYU_K(length, 0, 0, alpha * X[i], a + start, 1, Y + start - offset_u, 1, NULL, 0); #else Y[i] += alpha * DOTU_K(length, a + start, 1, X + start - offset_u, 1); #endif offset_u --; offset_l --; a += lda; } if (incy != 1) { COPY_K(M, Y, 1, y, incy); } return; } OpenBLAS-0.2.20/driver/level2/gbmv_thread.c000066400000000000000000000166521313527062700202700ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
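*/
/* Annotation added in editing (not part of the original OpenBLAS source), on the
   gbmv_k.c kernel above: the band matrix is stored column by column with the ku
   superdiagonals on top, so stored row r of column i corresponds to A(i - ku + r, i).
   With offset_u = ku - i and offset_l = ku + m - i, each column is clipped to
   start = MAX(ku - i, 0) and end = MIN(ku + m - i, ku + kl + 1), so
   length = end - start counts exactly the nonzeros of column i, and the vector
   index start - offset_u = MAX(i - ku, 0) is the first affected row.  For example,
   m = 6, ku = 2, kl = 1, i = 1 gives start = 1, end = 4, length = 3, and the
   non-transposed branch AXPYs alpha * X[1] times A(0..2, 1) into Y[0..2], while
   the TRANS build computes the matching dot product into Y[1] instead.            */
/*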
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #if !defined(CONJ) && !defined(XCONJ) #define MYAXPY AXPYU_K #define MYDOT DOTU_K #elif defined(CONJ) && !defined(XCONJ) #define MYAXPY AXPYC_K #define MYDOT DOTC_K #elif !defined(CONJ) && defined(XCONJ) #define MYAXPY AXPYU_K #define MYDOT DOTC_K #else #define MYAXPY AXPYC_K #define MYDOT DOTU_K #endif static int gbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; BLASLONG lda, incx; BLASLONG n_from, n_to; BLASLONG i, offset_l, offset_u, uu, ll, ku, kl; #ifdef TRANSA #ifndef COMPLEX FLOAT result; #else OPENBLAS_COMPLEX_FLOAT result; #endif #endif a = (FLOAT *)args -> a; x = (FLOAT *)args -> b; y = (FLOAT *)args -> c; lda = args -> lda; incx = args -> ldb; ku = args -> ldc; kl = args -> ldd; n_from = 0; n_to = args -> n; if (range_m) y += *range_m * COMPSIZE; if (range_n) { n_from = *(range_n + 0); n_to = *(range_n + 1); a += n_from * lda * COMPSIZE; } n_to = MIN(n_to, args -> m + ku); #ifdef TRANSA if (incx != 1) { COPY_K(args -> m, x, incx, buffer, 1); x = buffer; buffer += ((COMPSIZE * args -> m + 1023) & ~1023); } #endif SCAL_K( #ifndef TRANSA args -> m, #else args -> n, #endif 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y, 1, NULL, 0, NULL, 0); offset_u = ku - n_from; offset_l = ku - n_from + args -> m; #ifndef TRANSA x += n_from * incx * COMPSIZE; y -= offset_u * COMPSIZE; #else x -= offset_u * COMPSIZE; y += n_from * COMPSIZE; #endif for (i = n_from; i < n_to; i++) { uu = MAX(offset_u, 0); ll = MIN(offset_l, ku + kl + 1); #ifndef TRANSA MYAXPY(ll - uu, 0, 0, *(x + 0), #ifdef COMPLEX #ifndef XCONJ *(x + 1), #else -*(x + 1), #endif #endif a + uu * COMPSIZE, 1, y + uu * COMPSIZE, 1, NULL, 0); x += incx * COMPSIZE; #else result = MYDOT(ll - uu, a + uu * COMPSIZE, 1, 
x + uu * COMPSIZE, 1); #ifndef COMPLEX *y = result; #else *(y + 0) += CREAL(result); #ifndef XCONJ *(y + 1) += CIMAG(result); #else *(y + 1) -= CIMAG(result); #endif #endif x += COMPSIZE; #endif y += COMPSIZE; offset_u --; offset_l --; a += lda * COMPSIZE; } return 0; } #ifndef COMPLEX int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ #else int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ #endif blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG range_n[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif args.m = m; args.n = n; args.a = (void *)a; args.b = (void *)x; args.c = (void *)buffer; args.lda = lda; args.ldb = incx; args.ldc = ku; args.ldd = kl; num_cpu = 0; range_n[0] = 0; i = n; while (i > 0){ width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); if (width < 4) width = 4; if (i < width) width = i; range_n[num_cpu + 1] = range_n[num_cpu] + width; #ifndef TRANSA range_m[num_cpu] = num_cpu * ((m + 15) & ~15); #else range_m[num_cpu] = num_cpu * ((n + 15) & ~15); #endif queue[num_cpu].mode = mode; queue[num_cpu].routine = gbmv_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[num_cpu]; queue[num_cpu].range_n = &range_n[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i -= width; } if (num_cpu) { queue[0].sa = NULL; #ifndef TRANSA queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; #else queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE; #endif queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } for (i = 1; i < num_cpu; i ++) { AXPYU_K( #ifndef TRANSA m, #else n, #endif 0, 0, #ifndef COMPLEX ONE, #else ONE, ZERO, #endif buffer + range_m[i] * COMPSIZE, 1, buffer, 1, NULL, 0); } AXPYU_K( #ifndef TRANSA m, #else n, #endif 0, 0, #ifndef COMPLEX alpha, #else alpha[0], alpha[1], #endif buffer, 1, y, incy, NULL, 0); return 0; } OpenBLAS-0.2.20/driver/level2/gemv_thread.c000066400000000000000000000177051313527062700202730ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
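*/
/* Annotation added in editing (not part of the original OpenBLAS source), on the
   threaded gbmv driver above: CNAME splits the n columns into contiguous ranges,
   one per worker, and each worker writes a private partial result vector into the
   shared buffer at offset range_m[cpu], which is cpu * ((m + 15) & ~15) elements
   in the non-transposed case (n-based for the transposed one); the scratch pointer
   stored in queue[0].sb points past all of these partial vectors.  After
   exec_blas() returns, the partials are summed into the first one with AXPYU_K,
   and a final AXPYU_K applies alpha while adding that sum into the caller's y
   using its real increment incy.  The gemv_thread.c file that follows uses the
   same queue setup and, for small non-transposed cases, additionally splits along
   the column (x) direction, storing per-thread partial y vectors in the static
   y_dummy buffer, which are then reduced into y.                                   */
/*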
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #ifndef TRANSA #if !defined(CONJ) && !defined(XCONJ) #define GEMV GEMV_N #elif defined(CONJ) && !defined(XCONJ) #define GEMV GEMV_R #elif !defined(CONJ) && defined(XCONJ) #define GEMV GEMV_O #else #define GEMV GEMV_S #endif #else #if !defined(CONJ) && !defined(XCONJ) #define GEMV GEMV_T #elif defined(CONJ) && !defined(XCONJ) #define GEMV GEMV_C #elif !defined(CONJ) && defined(XCONJ) #define GEMV GEMV_U #else #define GEMV GEMV_D #endif #endif #ifndef TRANSA #define Y_DUMMY_NUM 1024 static FLOAT y_dummy[Y_DUMMY_NUM]; #endif static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; BLASLONG lda, incx, incy; BLASLONG m_from, m_to, n_from, n_to; a = (FLOAT *)args -> a; x = (FLOAT *)args -> b; y = (FLOAT *)args -> c; lda = args -> lda; incx = args -> ldb; incy = args -> ldc; m_from = 0; m_to = args -> m; if (range_m) { m_from = *(range_m + 0); m_to = *(range_m + 1); a += m_from * COMPSIZE; #ifndef TRANSA y += m_from * incy * COMPSIZE; #endif } n_from = 0; n_to = args -> n; if (range_n) { n_from = *(range_n + 0); n_to = *(range_n + 1); a += n_from * lda * COMPSIZE; #ifdef TRANSA y += n_from * incy * COMPSIZE; #else //for split matrix row (n) direction and vector x of gemv_n x += n_from * incx * COMPSIZE; //store partial result for every thread y += (m_to - m_from) * 1 * COMPSIZE * pos; #endif } //fprintf(stderr, "M_From = %d M_To = %d N_From = %d N_To = %d POS=%d\n", m_from, m_to, n_from, n_to, pos); GEMV(m_to - m_from, n_to - n_from, 0, *((FLOAT *)args -> alpha + 0), #ifdef COMPLEX *((FLOAT *)args -> alpha + 1), #endif a, lda, x, incx, y, incy, buffer); return 0; } #ifndef COMPLEX int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ #else int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ #endif blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; #ifndef TRANSA int split_x=0; #endif #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | 
BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif args.m = m; args.n = n; args.a = (void *)a; args.b = (void *)x; args.c = (void *)y; args.lda = lda; args.ldb = incx; args.ldc = incy; #ifndef COMPLEX args.alpha = (void *)α #else args.alpha = (void *) alpha; #endif num_cpu = 0; range[0] = 0; #ifndef TRANSA i = m; #else i = n; #endif while (i > 0){ width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); if (width < 4) width = 4; if (i < width) width = i; range[num_cpu + 1] = range[num_cpu] + width; queue[num_cpu].mode = mode; queue[num_cpu].routine = gemv_kernel; queue[num_cpu].args = &args; #ifndef TRANSA queue[num_cpu].range_m = &range[num_cpu]; queue[num_cpu].range_n = NULL; #else queue[num_cpu].range_m = NULL; queue[num_cpu].range_n = &range[num_cpu]; #endif queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i -= width; } #ifndef TRANSA //try to split matrix on row direction and x. //Then, reduction. if (num_cpu < nthreads) { //too small to split or bigger than the y_dummy buffer. double MN = (double) m * (double) n; if ( MN <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD)) || m*COMPSIZE*nthreads > Y_DUMMY_NUM) goto Outer; num_cpu = 0; range[0] = 0; memset(y_dummy, 0, sizeof(FLOAT) * m * COMPSIZE * nthreads); args.ldc = 1; args.c = (void *)y_dummy; //split on row (n) and x i=n; split_x=1; while (i > 0){ width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); if (width < 4) width = 4; if (i < width) width = i; range[num_cpu + 1] = range[num_cpu] + width; queue[num_cpu].mode = mode; queue[num_cpu].routine = gemv_kernel; queue[num_cpu].args = &args; queue[num_cpu].position = num_cpu; queue[num_cpu].range_m = NULL; queue[num_cpu].range_n = &range[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i -= width; } } Outer: #endif if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer; queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } #ifndef TRANSA if(split_x==1){ //reduction for(i=0; i #include #include "common.h" #include "symcopy.h" #ifndef XCONJ #define AXPY AXPYU_K #else #define AXPY AXPYC_K #endif static int ger_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; FLOAT alpha_r; #ifdef COMPLEX FLOAT alpha_i; #endif BLASLONG lda, incx, incy; BLASLONG m, n_from, n_to; BLASLONG i; x = (FLOAT *)args -> a; y = (FLOAT *)args -> b; a = (FLOAT *)args -> c; incx = args -> lda; incy = args -> ldb; lda = args -> ldc; m = args -> m; alpha_r = *((FLOAT *)args -> alpha + 0); #ifdef COMPLEX alpha_i = *((FLOAT *)args -> alpha + 1); #endif n_from = 0; n_to = args -> n; if (range_n) { n_from = *(range_n + 0); n_to = *(range_n + 1); y += n_from * incy * COMPSIZE; a += n_from * lda * COMPSIZE; } if (incx != 1) { COPY_K(m, x, incx, buffer, 1); x = buffer; } for (i = n_from; i < n_to; i ++) { AXPY(m, 0, 0, #ifndef COMPLEX alpha_r * *y, #else #ifndef CONJ alpha_r * *(y + 0) - alpha_i * *(y + 1), alpha_r * *(y + 1) + alpha_i * *(y + 0), #else alpha_r * *(y + 0) + alpha_i * *(y + 1), - alpha_r * *(y + 1) + alpha_i * *(y + 0), #endif #endif x, 1, a, 1, NULL, 0); y += incy * COMPSIZE; a += lda * COMPSIZE; } return 0; } #ifndef COMPLEX int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, FLOAT *x, 
BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){ #else int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){ #endif blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_n[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif args.m = m; args.n = n; args.a = (void *)x; args.b = (void *)y; args.c = (void *)a; args.lda = incx; args.ldb = incy; args.ldc = lda; #ifndef COMPLEX args.alpha = (void *)α #else args.alpha = (void *) alpha; #endif num_cpu = 0; range_n[0] = 0; i = n; while (i > 0){ width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); if (width < 4) width = 4; if (i < width) width = i; range_n[num_cpu + 1] = range_n[num_cpu] + width; queue[num_cpu].mode = mode; queue[num_cpu].routine = ger_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_n = &range_n[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i -= width; } if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer; queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } return 0; } OpenBLAS-0.2.20/driver/level2/sbmv_k.c000066400000000000000000000075361313527062700172700ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ BLASLONG i, length; FLOAT *X = x; FLOAT *Y = y; FLOAT *sbmvbuffer = (FLOAT *)buffer; FLOAT *bufferY = sbmvbuffer; FLOAT *bufferX = sbmvbuffer; if (incy != 1) { Y = bufferY; bufferX = (FLOAT *)(((BLASLONG)bufferY + n * sizeof(FLOAT) + 4095) & ~4095); sbmvbuffer = bufferX; COPY_K(n, y, incy, Y, 1); } if (incx != 1) { X = bufferX; sbmvbuffer = (FLOAT *)(((BLASLONG)bufferX + n * sizeof(FLOAT) + 4095) & ~4095); COPY_K(n, x, incx, X, 1); } for (i = 0; i < n; i++) { #ifndef LOWER length = i; if (length > k) length = k; AXPYU_K(length + 1, 0, 0, alpha * X[i], a + k - length, 1, Y + i - length, 1, NULL, 0); Y[i] += alpha * DOTU_K(length, a + k - length, 1, X + i - length, 1); #else length = k; if (n - i - 1 < k) length = n - i - 1; AXPYU_K(length + 1, 0, 0, alpha * X[i], a, 1, Y + i, 1, NULL, 0); Y[i] += alpha * DOTU_K(length, a + 1, 1, X + i + 1, 1); #endif a += lda; } if (incy != 1) { COPY_K(n, Y, 1, y, incy); } return 0; } OpenBLAS-0.2.20/driver/level2/sbmv_thread.c000066400000000000000000000230711313527062700202750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #if !defined(HEMV) && !defined(HEMVREV) #define MYAXPY AXPYU_K #define MYDOT DOTU_K #elif defined HEMV #define MYAXPY AXPYU_K #define MYDOT DOTC_K #else #define MYAXPY AXPYC_K #define MYDOT DOTU_K #endif static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; BLASLONG lda, incx; BLASLONG n, k, n_from, n_to; BLASLONG i, length; #ifndef COMPLEX FLOAT result; #else OPENBLAS_COMPLEX_FLOAT result; #endif a = (FLOAT *)args -> a; x = (FLOAT *)args -> b; lda = args -> lda; incx = args -> ldb; n = args -> n; k = args -> k; n_from = 0; n_to = n; //Use y as each thread's n* COMPSIZE elements in sb buffer y = buffer; buffer += ((COMPSIZE * n + 1023) & ~1023); if (range_m) { n_from = *(range_m + 0); n_to = *(range_m + 1); a += n_from * lda * COMPSIZE; } if (incx != 1) { COPY_K(n, x, incx, buffer, 1); x = buffer; buffer += ((COMPSIZE * n + 1023) & ~1023); } SCAL_K(n, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y, 1, NULL, 0, NULL, 0); for (i = n_from; i < n_to; i++) { #ifndef LOWER length = i; if (length > k) length = k; MYAXPY(length, 0, 0, *(x + i * COMPSIZE + 0), #ifdef COMPLEX *(x + i * COMPSIZE + 1), #endif a + (k - length) * COMPSIZE, 1, y + (i - length) * COMPSIZE, 1, NULL, 0); #if !defined(HEMV) && !defined(HEMVREV) result = MYDOT(length + 1, a + (k - length) * COMPSIZE, 1, x + (i - length) * COMPSIZE, 1); #else result = MYDOT(length , a + (k - length) * COMPSIZE, 1, x + (i - length) * COMPSIZE, 1); #endif #ifndef COMPLEX *(y + i * COMPSIZE + 0) += result; #else #if !defined(HEMV) && !defined(HEMVREV) *(y + i * COMPSIZE + 0) += CREAL(result); *(y + i * COMPSIZE + 1) += CIMAG(result); #else *(y + i * COMPSIZE + 0) += CREAL(result) + *(a + k * COMPSIZE) * *(x + i * COMPSIZE + 0); *(y + i * COMPSIZE + 1) += CIMAG(result) + *(a + k * COMPSIZE) * *(x + i * COMPSIZE + 1); #endif #endif #else length = k; if (n - i - 1 < k) length = n - i - 1; MYAXPY(length, 0, 0, *(x + i * COMPSIZE + 0), #ifdef COMPLEX *(x + i * COMPSIZE + 1), #endif a + COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0); #if !defined(HEMV) && !defined(HEMVREV) result = MYDOT(length + 1, a, 1, x + i * COMPSIZE, 1); #else result = MYDOT(length , a + COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1) ; #endif #ifndef COMPLEX *(y + i * COMPSIZE + 0) += result; #else #if !defined(HEMV) && !defined(HEMVREV) *(y + i * COMPSIZE + 0) += CREAL(result); *(y + i * COMPSIZE + 1) += CIMAG(result); #else *(y + i * COMPSIZE + 0) += CREAL(result) + *a * *(x + i * COMPSIZE + 0); *(y + i * COMPSIZE + 1) += CIMAG(result) + *a * *(x + i * COMPSIZE + 1); #endif #endif #endif a += lda * COMPSIZE; } return 0; } #ifndef COMPLEX int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ #else int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ #endif blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER + 1]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG range_n[MAX_CPU_NUMBER]; BLASLONG width, i, num_cpu; double dnum; int mask = 7; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif 
defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif args.n = n; args.k = k; args.a = (void *)a; args.b = (void *)x; args.c = (void *)buffer; args.lda = lda; args.ldb = incx; args.ldc = incy; dnum = (double)n * (double)n / (double)nthreads; num_cpu = 0; if (n < 2 * k) { #ifndef LOWER range_m[MAX_CPU_NUMBER] = n; i = 0; while (i < n){ if (nthreads - num_cpu > 1) { double di = (double)(n - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = n - i; } if (width < 16) width = 16; if (width > n - i) width = n - i; } else { width = n - i; } range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); queue[num_cpu].mode = mode; queue[num_cpu].routine = sbmv_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; queue[num_cpu].range_n = &range_n[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #else range_m[0] = 0; i = 0; while (i < n){ if (nthreads - num_cpu > 1) { double di = (double)(n - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = n - i; } if (width < 16) width = 16; if (width > n - i) width = n - i; } else { width = n - i; } range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); queue[num_cpu].mode = mode; queue[num_cpu].routine = sbmv_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[num_cpu]; queue[num_cpu].range_n = &range_n[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #endif } else { range_m[0] = 0; i = n; while (i > 0){ width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); if (width < 4) width = 4; if (i < width) width = i; range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * ((n + 15) & ~15); queue[num_cpu].mode = mode; queue[num_cpu].routine = sbmv_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[num_cpu]; queue[num_cpu].range_n = &range_n[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i -= width; } } if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer; queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } for (i = 1; i < num_cpu; i ++) { AXPYU_K(n, 0, 0, #ifndef COMPLEX ONE, #else ONE, ZERO, #endif (FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0); } AXPYU_K(n, 0, 0, #ifndef COMPLEX alpha, #else alpha[0], alpha[1], #endif buffer, 1, y, incy, NULL, 0); return 0; } OpenBLAS-0.2.20/driver/level2/spmv_k.c000066400000000000000000000072651313527062700173050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" int CNAME(BLASLONG m, FLOAT alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ BLASLONG i; FLOAT *X = x; FLOAT *Y = y; FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *bufferY = gemvbuffer; FLOAT *bufferX = gemvbuffer; if (incy != 1) { Y = bufferY; bufferX = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) + 4095) & ~4095); gemvbuffer = bufferX; COPY_K(m, y, incy, Y, 1); } if (incx != 1) { X = bufferX; gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + m * sizeof(FLOAT) + 4095) & ~4095); COPY_K(m, x, incx, X, 1); } for (i = 0; i < m; i++) { #ifndef LOWER if (i > 0) Y[i] += alpha * DOTU_K(i, a, 1, X, 1); AXPYU_K(i + 1, 0, 0, alpha * X[i], a, 1, Y, 1, NULL, 0); a += i + 1; #else Y[i] += alpha * DOTU_K(m - i, a + i, 1, X + i, 1); if (m - i > 1) AXPYU_K(m - i - 1, 0, 0, alpha * X[i], a + i + 1, 1, Y + i + 1, 1, NULL, 0); a += m - i - 1; #endif } if (incy != 1) { COPY_K(m, Y, 1, y, incy); } return 0; } OpenBLAS-0.2.20/driver/level2/spmv_thread.c000066400000000000000000000221071313527062700203120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #include "symcopy.h" #if! defined(HEMV) && !defined(HEMVREV) #define MYDOT DOTU_K #define MYAXPY AXPYU_K #elif defined HEMV #define MYDOT DOTC_K #define MYAXPY AXPYU_K #else #define MYDOT DOTU_K #define MYAXPY AXPYC_K #endif static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; BLASLONG incx; BLASLONG m_from, m_to, i; #ifndef COMPLEX FLOAT result; #else OPENBLAS_COMPLEX_FLOAT result; #endif a = (FLOAT *)args -> a; x = (FLOAT *)args -> b; y = (FLOAT *)args -> c; incx = args -> ldb; m_from = 0; m_to = args -> m; if (range_m) { m_from = *(range_m + 0); m_to = *(range_m + 1); } if (range_n) y += *range_n * COMPSIZE; if (incx != 1) { #ifndef LOWER COPY_K(m_to, x, incx, buffer, 1); #else COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); #endif x = buffer; } #ifndef LOWER SCAL_K(m_to, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y, 1, NULL, 0, NULL, 0); #else SCAL_K(args -> m - m_from, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); #endif #ifndef LOWER a += (m_from + 1) * m_from / 2 * COMPSIZE; #else a += (2 * args -> m - m_from - 1) * m_from / 2 * COMPSIZE; #endif for (i = m_from; i < m_to; i++) { #ifndef LOWER #if !defined(HEMV) && !defined(HEMVREV) result = MYDOT(i + 1, a, 1, x, 1); #else result = MYDOT(i , a, 1, x, 1); #endif #ifndef COMPLEX *(y + i * COMPSIZE) += result; #else #if !defined(HEMV) && !defined(HEMVREV) *(y + i * COMPSIZE + 0) += CREAL(result); *(y + i * COMPSIZE + 1) += CIMAG(result); #else *(y + i * COMPSIZE + 0) += CREAL(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 0); *(y + i * COMPSIZE + 1) += CIMAG(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 1); #endif #endif MYAXPY(i, 0, 0, *(x + i * COMPSIZE + 0), #ifdef COMPLEX *(x + i * COMPSIZE + 1), #endif a, 1, y, 1, NULL, 0); a += (i + 1) * COMPSIZE; #else #if !defined(HEMV) && !defined(HEMVREV) result = MYDOT(args -> m - i , a + i * COMPSIZE, 1, x + i * COMPSIZE, 1); #else result = MYDOT(args -> m - i - 1, a + (i + 1) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1); #endif #ifndef COMPLEX *(y + i * COMPSIZE) += result; #else #if !defined(HEMV) && !defined(HEMVREV) *(y + i * COMPSIZE + 0) += CREAL(result); *(y + i * COMPSIZE + 1) += CIMAG(result); #else *(y + i * COMPSIZE + 0) += CREAL(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 0); *(y + i * COMPSIZE + 1) += CIMAG(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 1); #endif #endif MYAXPY(args -> m - i - 1, 0, 0, *(x + i * COMPSIZE + 0), #ifdef COMPLEX *(x + i * 
COMPSIZE + 1), #endif a + (i + 1) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0); a += (args -> m - i - 1) * COMPSIZE; #endif } return 0; } #ifndef COMPLEX int CNAME(BLASLONG m, FLOAT alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ #else int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ #endif blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG range_n[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; double dnum; int mask = 7; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif args.m = m; args.a = (void *)a; args.b = (void *)x; args.c = (void *)buffer; args.ldb = incx; args.ldc = incy; dnum = (double)m * (double)m / (double)nthreads; num_cpu = 0; #ifndef LOWER range_m[MAX_CPU_NUMBER] = m; i = 0; while (i < m){ if (nthreads - num_cpu > 1) { double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = m - i; } if (width < 16) width = 16; if (width > m - i) width = m - i; } else { width = m - i; } range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); queue[num_cpu].mode = mode; queue[num_cpu].routine = spmv_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; queue[num_cpu].range_n = &range_n[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #else range_m[0] = 0; i = 0; while (i < m){ if (nthreads - num_cpu > 1) { double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = m - i; } if (width < 16) width = 16; if (width > m - i) width = m - i; } else { width = m - i; } range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); queue[num_cpu].mode = mode; queue[num_cpu].routine = spmv_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[num_cpu]; queue[num_cpu].range_n = &range_n[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #endif if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } for (i = 1; i < num_cpu; i ++) { #ifndef LOWER AXPYU_K(range_m[MAX_CPU_NUMBER - i], 0, 0, ONE, #ifdef COMPLEX ZERO, #endif buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); #else AXPYU_K(m - range_m[i], 0, 0, ONE, #ifdef COMPLEX ZERO, #endif buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0); #endif } AXPYU_K(m, 0, 0, #ifndef COMPLEX alpha, #else alpha[0], alpha[1], #endif buffer, 1, y, incy, NULL, 0); return 0; } OpenBLAS-0.2.20/driver/level2/spr2_k.c000066400000000000000000000066601313527062700172040ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. 
*/ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer){ BLASLONG i; FLOAT *X, *Y; X = x; Y = y; if (incx != 1) { COPY_K(m, x, incx, buffer, 1); X = buffer; } if (incy != 1) { COPY_K(m, y, incy, (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)), 1); Y = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)); } for (i = 0; i < m; i++){ #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * X[i], Y, 1, a, 1, NULL, 0); AXPYU_K(i + 1, 0, 0, alpha_r * Y[i], X, 1, a, 1, NULL, 0); a += i + 1; #else AXPYU_K(m - i, 0, 0, alpha_r * X[i], Y + i, 1, a, 1, NULL, 0); AXPYU_K(m - i, 0, 0, alpha_r * Y[i], X + i, 1, a, 1, NULL, 0); a += m - i; #endif } return 0; } OpenBLAS-0.2.20/driver/level2/spr2_thread.c000066400000000000000000000250451313527062700202170ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; BLASLONG incx, incy; BLASLONG i, m_from, m_to; FLOAT alpha_r; #ifdef COMPLEX FLOAT alpha_i; #endif x = (FLOAT *)args -> a; y = (FLOAT *)args -> b; a = (FLOAT *)args -> c; incx = args -> lda; incy = args -> ldb; alpha_r = *((FLOAT *)args -> alpha + 0); #ifdef COMPLEX alpha_i = *((FLOAT *)args -> alpha + 1); #endif m_from = 0; m_to = args -> m; if (range_m) { m_from = *(range_m + 0); m_to = *(range_m + 1); } if (incx != 1) { #ifndef LOWER COPY_K(m_to, x, incx, buffer, 1); #else COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); #endif x = buffer; buffer += ((COMPSIZE * args -> m + 1023) & ~1023); } if (incy != 1) { #ifndef LOWER COPY_K(m_to, y, incy, buffer, 1); #else COPY_K(args -> m - m_from, y + m_from * incy * COMPSIZE, incy, buffer + m_from * COMPSIZE, 1); #endif y = buffer; } #ifndef LOWER a += (m_from + 1) * m_from / 2 * COMPSIZE; #else a += (2 * args -> m - m_from + 1) * m_from / 2 * COMPSIZE; #endif for (i = m_from; i < m_to; i++){ #if !defined(HEMV) && !defined(HEMVREV) #ifndef COMPLEX if (x[i] != ZERO) { #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * x[i], y, 1, a, 1, NULL, 0); #else AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i], y + i, 1, a, 1, NULL, 0); #endif } if (y[i] != ZERO) { #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * y[i], x, 1, a, 1, NULL, 0); #else AXPYU_K(args -> m - i, 0, 0, alpha_r * y[i], x + i, 1, a, 1, NULL, 0); #endif } #else if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], y, 1, a, 1, NULL, 0); #else AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], y + i * COMPSIZE, 1, a, 1, NULL, 0); #endif } if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) { #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * y[i * COMPSIZE + 0] - alpha_i * y[i * COMPSIZE + 1], alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); #else AXPYU_K(args -> m - i, 0, 0, alpha_r * y[i * 
COMPSIZE + 0] - alpha_i * y[i * COMPSIZE + 1], alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], x + i * COMPSIZE, 1, a, 1, NULL, 0); #endif } #endif #else if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { #ifndef HEMVREV #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], - alpha_i * x[i * COMPSIZE + 0] - alpha_r * x[i * COMPSIZE + 1], y, 1, a, 1, NULL, 0); #else AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], - alpha_i * x[i * COMPSIZE + 0] - alpha_r * x[i * COMPSIZE + 1], y + i * COMPSIZE, 1, a, 1, NULL, 0); #endif #else #ifndef LOWER AXPYC_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], y, 1, a, 1, NULL, 0); #else AXPYC_K(args -> m - i, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], y + i * COMPSIZE, 1, a, 1, NULL, 0); #endif #endif } if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) { #ifndef HEMVREV #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], alpha_i * y[i * COMPSIZE + 0] - alpha_r * y[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); #else AXPYU_K(args -> m - i, 0, 0, alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], alpha_i * y[i * COMPSIZE + 0] - alpha_r * y[i * COMPSIZE + 1], x + i * COMPSIZE, 1, a, 1, NULL, 0); #endif #else #ifndef LOWER AXPYC_K(i + 1, 0, 0, alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], - alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); #else AXPYC_K(args -> m - i, 0, 0, alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], - alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], x + i * COMPSIZE, 1, a, 1, NULL, 0); #endif #endif } #ifndef LOWER a[i * COMPSIZE + 1] = ZERO; #else a[ 1] = ZERO; #endif #endif #ifndef LOWER a += (i + 1) * COMPSIZE; #else a += (args -> m - i) * COMPSIZE; #endif } return 0; } #ifndef COMPLEX int CNAME(BLASLONG m, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer, int nthreads){ #else int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer, int nthreads){ #endif blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; double dnum; int mask = 7; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif args.m = m; args.a = (void *)x; args.b = (void *)y; args.c = (void *)a; args.lda = incx; args.ldb = incy; #ifndef COMPLEX args.alpha = (void *)α #else args.alpha = (void *)alpha; #endif dnum = (double)m * (double)m / (double)nthreads; num_cpu = 0; #ifndef LOWER range_m[MAX_CPU_NUMBER] = m; i = 0; while (i < m){ if (nthreads - num_cpu > 1) { double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = m - i; } if (width < 16) width = 16; if (width > m - i) width = m - i; } else { width = m - i; } range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - 
num_cpu] - width; queue[num_cpu].mode = mode; queue[num_cpu].routine = syr_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; queue[num_cpu].range_n = NULL; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #else range_m[0] = 0; i = 0; while (i < m){ if (nthreads - num_cpu > 1) { double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = m - i; } if (width < 16) width = 16; if (width > m - i) width = m - i; } else { width = m - i; } range_m[num_cpu + 1] = range_m[num_cpu] + width; queue[num_cpu].mode = mode; queue[num_cpu].routine = syr_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[num_cpu]; queue[num_cpu].range_n = NULL; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #endif if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer; queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } return 0; } OpenBLAS-0.2.20/driver/level2/spr_k.c000066400000000000000000000061731313527062700171210ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include "common.h" int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *buffer){ BLASLONG i; FLOAT *X; X = x; if (incx != 1) { COPY_K(m, x, incx, buffer, 1); X = buffer; } for (i = 0; i < m; i++){ #ifndef LOWER if (X[i] != ZERO) { AXPYU_K(i + 1, 0, 0, alpha_r * X[i], X, 1, a, 1, NULL, 0); } a += i + 1; #else if (X[i] != ZERO) { AXPYU_K(m - i, 0, 0, alpha_r * X[i], X + i, 1, a, 1, NULL, 0); } a += m - i; #endif } return 0; } OpenBLAS-0.2.20/driver/level2/spr_thread.c000066400000000000000000000204161313527062700201320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x; BLASLONG incx; BLASLONG i, m_from, m_to; FLOAT alpha_r; #if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV) FLOAT alpha_i; #endif x = (FLOAT *)args -> a; a = (FLOAT *)args -> b; incx = args -> lda; alpha_r = *((FLOAT *)args -> alpha + 0); #if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV) alpha_i = *((FLOAT *)args -> alpha + 1); #endif m_from = 0; m_to = args -> m; if (range_m) { m_from = *(range_m + 0); m_to = *(range_m + 1); } if (incx != 1) { #ifndef LOWER COPY_K(m_to, x, incx, buffer, 1); #else COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); #endif x = buffer; } #ifndef LOWER a += (m_from + 1) * m_from / 2 * COMPSIZE; #else a += (2 * args -> m - m_from + 1) * m_from / 2 * COMPSIZE; #endif for (i = m_from; i < m_to; i++){ #if !defined(HEMV) && !defined(HEMVREV) #ifndef COMPLEX if (x[i] != ZERO) { #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * x[i], x, 1, a, 1, NULL, 0); #else AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i], x + i, 1, a, 1, NULL, 0); #endif } #else if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); #else AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], x + i * COMPSIZE, 1, a, 1, NULL, 0); #endif } #endif #else if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { #ifndef HEMVREV #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0], - alpha_r * x[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); #else AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i * COMPSIZE + 0], - alpha_r * x[i * COMPSIZE + 1], x + i * COMPSIZE, 1, a, 1, NULL, 0); #endif #else #ifndef LOWER AXPYC_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0], alpha_r * x[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); #else AXPYC_K(args -> m - i, 0, 0, alpha_r * x[i * COMPSIZE + 0], alpha_r * x[i * COMPSIZE + 1], x + i * COMPSIZE, 1, a, 1, NULL, 0); #endif #endif } #ifndef LOWER a[i * COMPSIZE + 1] = ZERO; #else a[ 1] = ZERO; #endif #endif #ifndef LOWER a += (i + 1) * COMPSIZE; #else a += (args -> m - i) * COMPSIZE; #endif } return 0; } #if !defined(COMPLEX) || defined(HEMV) || defined(HEMVREV) int CNAME(BLASLONG m, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *buffer, int nthreads){ #else int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *buffer, int nthreads){ #endif blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; double dnum; int mask = 7; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif args.m = m; args.a = (void *)x; args.b = (void *)a; args.lda = incx; #if !defined(COMPLEX) || defined(HEMV) || defined(HEMVREV) args.alpha = (void *)α #else args.alpha = (void *)alpha; #endif dnum = 
(double)m * (double)m / (double)nthreads; num_cpu = 0; #ifndef LOWER range_m[MAX_CPU_NUMBER] = m; i = 0; while (i < m){ if (nthreads - num_cpu > 1) { double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = m - i; } if (width < 16) width = 16; if (width > m - i) width = m - i; } else { width = m - i; } range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; queue[num_cpu].mode = mode; queue[num_cpu].routine = syr_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; queue[num_cpu].range_n = NULL; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #else range_m[0] = 0; i = 0; while (i < m){ if (nthreads - num_cpu > 1) { double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = m - i; } if (width < 16) width = 16; if (width > m - i) width = m - i; } else { width = m - i; } range_m[num_cpu + 1] = range_m[num_cpu] + width; queue[num_cpu].mode = mode; queue[num_cpu].routine = syr_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[num_cpu]; queue[num_cpu].range_n = NULL; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #endif if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer; queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } return 0; } OpenBLAS-0.2.20/driver/level2/symv_thread.c000066400000000000000000000177511313527062700203340ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #include "symcopy.h" #if! defined(HEMV) && !defined(HEMVREV) #define MYSYMV_U SYMV_U #define MYSYMV_L SYMV_L #elif defined HEMV #define MYSYMV_U HEMV_U #define MYSYMV_L HEMV_L #else #define MYSYMV_U HEMV_V #define MYSYMV_L HEMV_M #endif static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; BLASLONG lda, incx; BLASLONG m_from, m_to; a = (FLOAT *)args -> a; x = (FLOAT *)args -> b; y = (FLOAT *)args -> c; lda = args -> lda; incx = args -> ldb; m_from = 0; m_to = args -> m; if (range_m) { m_from = *(range_m + 0); m_to = *(range_m + 1); } if (range_n) y += *range_n * COMPSIZE; #ifndef LOWER SCAL_K(m_to, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y, 1, NULL, 0, NULL, 0); MYSYMV_U (m_to, m_to - m_from, ONE, #ifdef COMPLEX ZERO, #endif a, lda, x, incx, y, 1, buffer); #else SCAL_K(args -> m - m_from, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); MYSYMV_L (args -> m - m_from, m_to - m_from, ONE, #ifdef COMPLEX ZERO, #endif a + m_from * (lda + 1) * COMPSIZE, lda, x + m_from * incx * COMPSIZE, incx, y + m_from * COMPSIZE, 1, buffer); #endif return 0; } #ifndef COMPLEX int CNAME(BLASLONG m, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ #else int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ #endif blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG range_n[MAX_CPU_NUMBER]; BLASLONG width, i, num_cpu; double dnum; int mask = 3; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif args.m = m; args.a = (void *)a; args.b = (void *)x; args.c = (void *)buffer; args.lda = lda; args.ldb = incx; args.ldc = incy; dnum = (double)m * (double)m / (double)nthreads; num_cpu = 0; #ifndef LOWER range_m[0] = 0; i = 0; while (i < m){ if (nthreads - num_cpu > 1) { double di = (double)i; width = ((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask; if (width < 4) width = 4; if (width > m - i) width = m - i; } else { width = m - i; } range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); queue[MAX_CPU_NUMBER - num_cpu - 1].mode = mode; queue[MAX_CPU_NUMBER - num_cpu - 1].routine = symv_kernel; queue[MAX_CPU_NUMBER - num_cpu - 1].args = &args; queue[MAX_CPU_NUMBER - num_cpu - 1].range_m = &range_m[num_cpu]; queue[MAX_CPU_NUMBER - num_cpu - 1].range_n = &range_n[num_cpu]; queue[MAX_CPU_NUMBER - num_cpu - 1].sa = NULL; queue[MAX_CPU_NUMBER - num_cpu - 1].sb = NULL; queue[MAX_CPU_NUMBER - num_cpu - 1].next = &queue[MAX_CPU_NUMBER - num_cpu]; num_cpu ++; i += width; } if (num_cpu) { queue[MAX_CPU_NUMBER - num_cpu].sa = NULL; queue[MAX_CPU_NUMBER - num_cpu].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; queue[MAX_CPU_NUMBER - 1].next = NULL; exec_blas(num_cpu, &queue[MAX_CPU_NUMBER - num_cpu]); } #else range_m[0] = 0; i = 0; while (i < m){ if (nthreads - num_cpu > 1) { double di = (double)(m - i); if (di * di - 
dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = m - i; } if (width < 4) width = 4; if (width > m - i) width = m - i; } else { width = m - i; } range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); queue[num_cpu].mode = mode; queue[num_cpu].routine = symv_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[num_cpu]; queue[num_cpu].range_n = &range_n[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } #endif #ifndef LOWER for (i = 0; i < num_cpu - 1; i ++) { AXPYU_K(range_m[i + 1], 0, 0, ONE, #ifdef COMPLEX ZERO, #endif buffer + range_n[i] * COMPSIZE, 1, buffer + range_n[num_cpu - 1] * COMPSIZE, 1, NULL, 0); } AXPYU_K(m, 0, 0, #ifndef COMPLEX alpha, #else alpha[0], alpha[1], #endif buffer + range_n[num_cpu - 1] * COMPSIZE, 1, y, incy, NULL, 0); #else for (i = 1; i < num_cpu; i ++) { AXPYU_K(m - range_m[i], 0, 0, ONE, #ifdef COMPLEX ZERO, #endif buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0); } AXPYU_K(m, 0, 0, #ifndef COMPLEX alpha, #else alpha[0], alpha[1], #endif buffer, 1, y, incy, NULL, 0); #endif return 0; } OpenBLAS-0.2.20/driver/level2/syr2_k.c000066400000000000000000000066761313527062700172240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){ BLASLONG i; FLOAT *X, *Y; X = x; Y = y; if (incx != 1) { COPY_K(m, x, incx, buffer, 1); X = buffer; } if (incy != 1) { COPY_K(m, y, incy, (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)), 1); Y = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)); } for (i = 0; i < m; i++){ #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * X[i], Y, 1, a, 1, NULL, 0); AXPYU_K(i + 1, 0, 0, alpha_r * Y[i], X, 1, a, 1, NULL, 0); a += lda; #else AXPYU_K(m - i, 0, 0, alpha_r * X[i], Y + i, 1, a, 1, NULL, 0); AXPYU_K(m - i, 0, 0, alpha_r * Y[i], X + i, 1, a, 1, NULL, 0); a += 1 + lda; #endif } return 0; } OpenBLAS-0.2.20/driver/level2/syr2_thread.c000066400000000000000000000251101313527062700202210ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; BLASLONG lda, incx, incy; BLASLONG i, m_from, m_to; FLOAT alpha_r; #ifdef COMPLEX FLOAT alpha_i; #endif x = (FLOAT *)args -> a; y = (FLOAT *)args -> b; a = (FLOAT *)args -> c; incx = args -> lda; incy = args -> ldb; lda = args -> ldc; alpha_r = *((FLOAT *)args -> alpha + 0); #ifdef COMPLEX alpha_i = *((FLOAT *)args -> alpha + 1); #endif m_from = 0; m_to = args -> m; if (range_m) { m_from = *(range_m + 0); m_to = *(range_m + 1); } if (incx != 1) { #ifndef LOWER COPY_K(m_to, x, incx, buffer, 1); #else COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); #endif x = buffer; buffer += ((COMPSIZE * args -> m + 1023) & ~1023); } if (incy != 1) { #ifndef LOWER COPY_K(m_to, y, incy, buffer, 1); #else COPY_K(args -> m - m_from, y + m_from * incy * COMPSIZE, incy, buffer + m_from * COMPSIZE, 1); #endif y = buffer; } a += m_from * lda * COMPSIZE; for (i = m_from; i < m_to; i++){ #if !defined(HER) && !defined(HERREV) #ifndef COMPLEX if (x[i] != ZERO) { #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * x[i], y, 1, a, 1, NULL, 0); #else AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i], y + i, 1, a + i, 1, NULL, 0); #endif } if (y[i] != ZERO) { #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * y[i], x, 1, a, 1, NULL, 0); #else AXPYU_K(args -> m - i, 0, 0, alpha_r * y[i], x + i, 1, a + i, 1, NULL, 0); #endif } #else if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], y, 1, a, 1, NULL, 0); #else AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], y + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); #endif } if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) { #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * y[i * COMPSIZE + 0] - alpha_i * y[i * COMPSIZE + 1], alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); #else AXPYU_K(args -> m - i, 0, 0, alpha_r * y[i * COMPSIZE + 0] - alpha_i * y[i * COMPSIZE + 1], alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); #endif } #endif #else if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { #ifndef HERREV #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], - alpha_i * x[i * COMPSIZE + 0] - alpha_r * x[i * COMPSIZE + 1], y, 1, a, 1, NULL, 0); #else AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], - alpha_i * x[i * COMPSIZE + 0] - alpha_r * x[i * COMPSIZE + 1], y + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); #endif #else #ifndef LOWER AXPYC_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], y, 1, a, 1, NULL, 0); #else AXPYC_K(args -> m - i, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], y + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); #endif #endif } if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) { #ifndef 
HERREV #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], alpha_i * y[i * COMPSIZE + 0] - alpha_r * y[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); #else AXPYU_K(args -> m - i, 0, 0, alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], alpha_i * y[i * COMPSIZE + 0] - alpha_r * y[i * COMPSIZE + 1], x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); #endif #else #ifndef LOWER AXPYC_K(i + 1, 0, 0, alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], - alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); #else AXPYC_K(args -> m - i, 0, 0, alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], - alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); #endif #endif } a[i * COMPSIZE + 1] = ZERO; #endif a += lda * COMPSIZE; } return 0; } #ifndef COMPLEX int CNAME(BLASLONG m, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){ #else int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){ #endif blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; double dnum; int mask = 7; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif args.m = m; args.a = (void *)x; args.b = (void *)y; args.c = (void *)a; args.lda = incx; args.ldb = incy; args.ldc = lda; #ifndef COMPLEX args.alpha = (void *)α #else args.alpha = (void *)alpha; #endif dnum = (double)m * (double)m / (double)nthreads; num_cpu = 0; #ifndef LOWER range_m[MAX_CPU_NUMBER] = m; i = 0; while (i < m){ if (nthreads - num_cpu > 1) { double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = m - i; } if (width < 16) width = 16; if (width > m - i) width = m - i; } else { width = m - i; } range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; queue[num_cpu].mode = mode; queue[num_cpu].routine = syr_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; queue[num_cpu].range_n = NULL; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #else range_m[0] = 0; i = 0; while (i < m){ if (nthreads - num_cpu > 1) { double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = m - i; } if (width < 16) width = 16; if (width > m - i) width = m - i; } else { width = m - i; } range_m[num_cpu + 1] = range_m[num_cpu] + width; queue[num_cpu].mode = mode; queue[num_cpu].routine = syr_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[num_cpu]; queue[num_cpu].range_n = NULL; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #endif if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer; queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } return 0; } 
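/*
 * [Editor's note -- illustrative sketch, not part of the original archive.]
 * The threaded level-2 drivers in this directory (symv_thread.c and the
 * syr2_thread.c code directly above, as well as syr_thread.c, tbmv_thread.c,
 * tpmv_thread.c and trmv_thread.c below) all split the triangular matrix into
 * row blocks of roughly equal *area* rather than equal row count, so that each
 * thread touches about the same number of matrix elements (roughly
 * m*m/(2*nthreads) entries of the triangle).  A block of width w peeled off a
 * remaining triangle with di rows covers w*di - w*w/2 elements; setting that
 * equal to dnum/2 with dnum = m*m/nthreads gives w = di - sqrt(di*di - dnum),
 * which is exactly the expression used in the loops above.  The stand-alone
 * helper below (hypothetical name, alignment of mask+1 rows, minimum block of
 * 16 rows as in the driver above) re-states that partitioning for inspection;
 * it is a sketch of the scheme, not code taken from the library.
 */
#include <math.h>
#include <stdio.h>

/* Fill range[0..nblocks] with block boundaries and return nblocks. */
static int equal_area_partition(long m, int nthreads, long mask, long *range) {
  double dnum = (double)m * (double)m / (double)nthreads;
  int nblocks = 0;
  long i = 0, width;

  range[0] = 0;
  while (i < m) {
    if (nthreads - nblocks > 1) {
      double di = (double)(m - i);              /* rows still to assign     */
      if (di * di - dnum > 0)
        width = ((long)(di - sqrt(di * di - dnum)) + mask) & ~mask;
      else
        width = m - i;
      if (width < 16) width = 16;               /* minimum block size       */
      if (width > m - i) width = m - i;         /* clamp to what is left    */
    } else {
      width = m - i;                            /* last thread takes rest   */
    }
    range[nblocks + 1] = range[nblocks] + width;
    nblocks++;
    i += width;
  }
  return nblocks;
}

/* Example: partition a 1000x1000 triangle across 4 threads. */
int main(void) {
  long range[64];
  int n = equal_area_partition(1000, 4, 7, range);
  for (int t = 0; t < n; t++)
    printf("block %d: rows [%ld, %ld)\n", t, range[t], range[t + 1]);
  return 0;
}
/*
 * Note that the blocks shrink as the remaining triangle gets thinner, which is
 * why the drivers above assign the widest block to the side of the matrix
 * where the stored rows (or columns) are shortest.
 */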
OpenBLAS-0.2.20/driver/level2/syr_k.c000066400000000000000000000062111313527062700171230ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include "common.h" int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG lda, FLOAT *buffer){ BLASLONG i; FLOAT *X; X = x; if (incx != 1) { COPY_K(m, x, incx, buffer, 1); X = buffer; } for (i = 0; i < m; i++){ #ifndef LOWER if (X[i] != ZERO) { AXPYU_K(i + 1, 0, 0, alpha_r * X[i], X, 1, a, 1, NULL, 0); } a += lda; #else if (X[i] != ZERO) { AXPYU_K(m - i, 0, 0, alpha_r * X[i], X + i, 1, a, 1, NULL, 0); } a += 1 + lda; #endif } return 0; } OpenBLAS-0.2.20/driver/level2/syr_thread.c000066400000000000000000000203551313527062700201450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x; BLASLONG lda, incx; BLASLONG i, m_from, m_to; FLOAT alpha_r; #if defined(COMPLEX) && !defined(HER) && !defined(HERREV) FLOAT alpha_i; #endif x = (FLOAT *)args -> a; a = (FLOAT *)args -> b; incx = args -> lda; lda = args -> ldb; alpha_r = *((FLOAT *)args -> alpha + 0); #if defined(COMPLEX) && !defined(HER) && !defined(HERREV) alpha_i = *((FLOAT *)args -> alpha + 1); #endif m_from = 0; m_to = args -> m; if (range_m) { m_from = *(range_m + 0); m_to = *(range_m + 1); } if (incx != 1) { #ifndef LOWER COPY_K(m_to, x, incx, buffer, 1); #else COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); #endif x = buffer; } a += m_from * lda * COMPSIZE; for (i = m_from; i < m_to; i++){ #if !defined(HER) && !defined(HERREV) #ifndef COMPLEX if (x[i * COMPSIZE] != ZERO) { #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * x[i], x, 1, a, 1, NULL, 0); #else AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i], x + i, 1, a + i, 1, NULL, 0); #endif } #else if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); #else AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); #endif } #endif #else if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { #ifndef HERREV #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0], -alpha_r * x[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); #else AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i * COMPSIZE + 0], -alpha_r * x[i * COMPSIZE + 1], x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); #endif #else #ifndef LOWER AXPYC_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0], alpha_r * x[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); #else AXPYC_K(args -> m - i, 0, 0, alpha_r * x[i * COMPSIZE + 0], alpha_r * x[i * COMPSIZE + 1], x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); #endif #endif } a[i * COMPSIZE + 1] = ZERO; #endif a += lda * COMPSIZE; } return 0; } #if 
!defined(COMPLEX) || defined(HER) || defined(HERREV) int CNAME(BLASLONG m, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){ #else int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){ #endif blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; double dnum; int mask = 7; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif args.m = m; args.a = (void *)x; args.b = (void *)a; args.lda = incx; args.ldb = lda; #if !defined(COMPLEX) || defined(HER) || defined(HERREV) args.alpha = (void *)α #else args.alpha = (void *)alpha; #endif dnum = (double)m * (double)m / (double)nthreads; num_cpu = 0; #ifndef LOWER range_m[MAX_CPU_NUMBER] = m; i = 0; while (i < m){ if (nthreads - num_cpu > 1) { double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = m - i; } if (width < 16) width = 16; if (width > m - i) width = m - i; } else { width = m - i; } range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; queue[num_cpu].mode = mode; queue[num_cpu].routine = syr_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; queue[num_cpu].range_n = NULL; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #else range_m[0] = 0; i = 0; while (i < m){ if (nthreads - num_cpu > 1) { double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = m - i; } if (width < 16) width = 16; if (width > m - i) width = m - i; } else { width = m - i; } range_m[num_cpu + 1] = range_m[num_cpu] + width; queue[num_cpu].mode = mode; queue[num_cpu].routine = syr_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[num_cpu]; queue[num_cpu].range_n = NULL; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #endif if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer; queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } return 0; } OpenBLAS-0.2.20/driver/level2/tbmv_L.c000066400000000000000000000067621313527062700172320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; FLOAT *B = b; BLASLONG length; if (incb != 1) { B = buffer; COPY_K(n, b, incb, buffer, 1); } a += (n - 1) * lda; for (i = n - 1; i >= 0; i--) { #ifndef TRANSA length = n - i - 1; if (length > k) length = k; if (length > 0) { AXPYU_K(length, 0, 0, B[i], a + 1, 1, B + i + 1, 1, NULL, 0); } #endif #ifndef UNIT #ifndef TRANSA B[i] *= a[0]; #else B[i] *= a[k]; #endif #endif #ifdef TRANSA length = i; if (length > k) length = k; if (length > 0) { B[i] += DOTU_K(length, a + k - length, 1, B + i - length, 1); } #endif a -= lda; } if (incb != 1) { COPY_K(n, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/tbmv_U.c000066400000000000000000000067261313527062700172430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; FLOAT *B = b; BLASLONG length; if (incb != 1) { B = buffer; COPY_K(n, b, incb, buffer, 1); } for (i = 0; i < n; i++) { #ifndef TRANSA length = i; if (length > k) length = k; if (length > 0) { AXPYU_K(length, 0, 0, B[i], a + k - length, 1, B + i - length, 1, NULL, 0); } #endif #ifndef UNIT #ifndef TRANSA B[i] *= a[k]; #else B[i] *= a[0]; #endif #endif #ifdef TRANSA length = n - i - 1; if (length > k) length = k; if (length > 0) { B[i] += DOTU_K(length, a + 1, 1, B + i + 1, 1); } #endif a += lda; } if (incb != 1) { COPY_K(n, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/tbmv_thread.c000066400000000000000000000237211313527062700203000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #include "symcopy.h" #ifndef COMPLEX #ifndef TRANSA #undef TRANS #else #define TRANS #endif #define MYDOT DOTU_K #define MYAXPY AXPYU_K #else #if (TRANSA == 1) || (TRANSA == 3) #undef TRANS #else #define TRANS #endif #if (TRANSA == 1) || (TRANSA == 2) #define MYAXPY AXPYU_K #define MYDOT DOTU_K #else #define MYAXPY AXPYC_K #define MYDOT DOTC_K #endif #endif static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; BLASLONG k, lda, incx; BLASLONG n_from, n_to; BLASLONG i, length; #ifdef TRANS #ifndef COMPLEX FLOAT result; #else OPENBLAS_COMPLEX_FLOAT result; #endif #endif #if defined(COMPLEX) && !defined(UNIT) FLOAT ar, ai, xr, xi; #endif a = (FLOAT *)args -> a; x = (FLOAT *)args -> b; y = (FLOAT *)args -> c; k = args -> k; n_from = 0; n_to = args -> n; lda = args -> lda; incx = args -> ldb; if (range_m) { n_from = *(range_m + 0); n_to = *(range_m + 1); a += n_from * lda * COMPSIZE; } if (incx != 1) { COPY_K(args -> n, x, incx, buffer, 1); x = buffer; buffer += ((args -> n * COMPSIZE + 1023) & ~1023); } if (range_n) y += *range_n * COMPSIZE; SCAL_K(args -> n, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y, 1, NULL, 0, NULL, 0); for (i = n_from; i < n_to; i++) { #ifndef LOWER length = i; #else length = args -> n - i - 1; #endif if (length > k) length = k; #ifndef LOWER if (length > 0) { #ifndef TRANS MYAXPY(length, 0, 0, *(x + i * COMPSIZE + 0), #ifdef COMPLEX *(x + i * COMPSIZE + 1), #endif a + (k - length) * COMPSIZE, 1, y + (i - length) * COMPSIZE, 1, NULL, 0); #else result = MYDOT(length, a + (k - length) * COMPSIZE, 1, x + (i - length) * COMPSIZE, 1); #ifndef COMPLEX *(y + i * COMPSIZE + 0) += result; #else *(y + i * COMPSIZE + 0) += CREAL(result); *(y + i * COMPSIZE + 1) += CIMAG(result); #endif #endif } #endif #ifndef COMPLEX #ifdef UNIT *(y + i * COMPSIZE) += *(x + i * COMPSIZE); #else #ifndef LOWER *(y + i * COMPSIZE) += *(a + k * COMPSIZE) * *(x + i * COMPSIZE); #else *(y + i * COMPSIZE) += *(a + 0 * COMPSIZE) * *(x + i * COMPSIZE); #endif #endif #else #ifdef UNIT *(y + i * COMPSIZE + 0) += *(x + i * COMPSIZE + 0); *(y + i * COMPSIZE + 1) += *(x + i * COMPSIZE + 1); #else #ifndef LOWER ar = *(a + k * COMPSIZE + 0); ai = *(a + k * COMPSIZE + 1); #else ar = *(a + 0); ai = *(a + 1); #endif xr = *(x + i * COMPSIZE + 0); xi = *(x + i * COMPSIZE + 1); #if (TRANSA == 1) || (TRANSA == 2) *(y + i * COMPSIZE + 0) += ar * xr - ai * xi; *(y + i * COMPSIZE + 1) += ar * xi + ai * xr; #else *(y + i * COMPSIZE + 0) += ar * xr + ai * xi; *(y + i * COMPSIZE + 1) += ar * xi - ai * xr; #endif #endif #endif #ifdef LOWER if (length > 0) { #ifndef TRANS MYAXPY(length, 0, 0, *(x + i * COMPSIZE + 0), #ifdef COMPLEX *(x + i * COMPSIZE + 1), #endif a + COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0); #else result = MYDOT(length, a + COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1); #ifndef COMPLEX *(y + i * COMPSIZE + 0) += result; #else *(y + i * COMPSIZE + 0) += CREAL(result); *(y + i * COMPSIZE + 1) += CIMAG(result); #endif #endif } #endif a += lda * COMPSIZE; } return 0; } #ifndef COMPLEX int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, 
FLOAT *buffer, int nthreads){ #else int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){ #endif blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG range_n[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; double dnum; int mask = 7; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif args.n = n; args.k = k; args.a = (void *)a; args.b = (void *)x; args.c = (void *)(buffer); args.lda = lda; args.ldb = incx; dnum = (double)n * (double)n / (double)nthreads; num_cpu = 0; if (n < 2 * k) { #ifndef LOWER range_m[MAX_CPU_NUMBER] = n; i = 0; while (i < n){ if (nthreads - num_cpu > 1) { double di = (double)(n - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = n - i; } if (width < 16) width = 16; if (width > n - i) width = n - i; } else { width = n - i; } range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; queue[num_cpu].range_n = &range_n[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #else range_m[0] = 0; i = 0; while (i < n){ if (nthreads - num_cpu > 1) { double di = (double)(n - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = n - i; } if (width < 16) width = 16; if (width > n - i) width = n - i; } else { width = n - i; } range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[num_cpu]; queue[num_cpu].range_n = &range_n[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #endif } else { range_m[0] = 0; i = n; while (i > 0){ width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); if (width < 4) width = 4; if (i < width) width = i; range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[num_cpu]; queue[num_cpu].range_n = &range_n[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i -= width; } } if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE; queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } for (i = 1; i < num_cpu; i ++) { AXPYU_K(n, 0, 0, ONE, #ifdef COMPLEX ZERO, #endif buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); } COPY_K(n, buffer, 1, x, incx); return 0; } OpenBLAS-0.2.20/driver/level2/tbsv_L.c000066400000000000000000000067261313527062700172400ustar00rootroot00000000000000/*********************************************************************/ 
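/*
 * [Editor's note -- illustrative sketch, not part of the original archive.]
 * tbmv_L.c / tbmv_U.c and tbmv_thread.c above (and the tbsv_* solvers that
 * follow) operate on the usual BLAS band storage: column j of an n-by-n
 * triangular band matrix with k non-zero off-diagonals is kept in column j of
 * a (k+1)-by-n array.  For the lower ('L') case the diagonal element sits at
 * ab[0 + j*ldab] and the i-th sub-diagonal at ab[i + j*ldab]; for the upper
 * ('U') case the diagonal sits at ab[k + j*ldab].  The function below is a
 * plain, single-threaded re-statement of the non-transposed, non-unit lower
 * TBMV update b := L*b for that layout (hypothetical name, double precision
 * only), written column by column from the last column backwards so the
 * update can be done in place -- the same order the AXPY-based loop in
 * tbmv_L.c uses.  It is a sketch for reference, not library code.
 */
static void ref_dtbmv_lower_notrans(long n, long k,
                                    const double *ab, long ldab,
                                    double *b) {
  for (long j = n - 1; j >= 0; j--) {
    const double *col = ab + j * ldab;   /* column j: diagonal at col[0]   */
    long len = n - j - 1;                /* sub-diagonal entries below j   */
    if (len > k) len = k;
    for (long i = 1; i <= len; i++)      /* b[j+1..j+len] += b[j]*L(j+i,j) */
      b[j + i] += b[j] * col[i];
    b[j] *= col[0];                      /* finally scale by the diagonal  */
  }
}
/*
 * Walking the columns backwards guarantees that b[j] still holds its original
 * value when column j is applied; the transposed variant in the files above
 * accumulates a dot product (DOTU_K) over the same band column instead of
 * scattering with AXPYU_K.
 */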
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; FLOAT *B = b; BLASLONG length; if (incb != 1) { B = buffer; COPY_K(n, b, incb, buffer, 1); } for (i = 0; i < n; i++) { #ifdef TRANSA length = i; if (length > k) length = k; if (length > 0) { B[i] -= DOTU_K(length, a + k - length, 1, B + i - length, 1); } #endif #ifndef UNIT #ifdef TRANSA B[i] /= a[k]; #else B[i] /= a[0]; #endif #endif #ifndef TRANSA length = n - i - 1; if (length > k) length = k; if (length > 0) { AXPYU_K(length, 0, 0, -B[i], a + 1, 1, B + i + 1, 1, NULL, 0); } #endif a += lda; } if (incb != 1) { COPY_K(n, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/tbsv_U.c000066400000000000000000000067631313527062700172520ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; FLOAT *B = b; BLASLONG length; if (incb != 1) { B = buffer; COPY_K(n, b, incb, buffer, 1); } a += (n - 1) * lda; for (i = n - 1; i >= 0; i--) { #ifdef TRANSA length = n - i - 1; if (length > k) length = k; if (length > 0) { B[i] -= DOTU_K(length, a + 1, 1, B + i + 1, 1); } #endif #ifndef UNIT #ifdef TRANSA B[i] /= a[0]; #else B[i] /= a[k]; #endif #endif #ifndef TRANSA length = i; if (length > k) length = k; if (length > 0) { AXPYU_K(length, 0, 0, - B[i], a + k - length, 1, B + i - length, 1, NULL, 0); } #endif a -= lda; } if (incb != 1) { COPY_K(n, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/tpmv_L.c000066400000000000000000000064711313527062700172450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dp1 = 1.; int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; FLOAT *B = b; if (incb != 1) { B = buffer; COPY_K(m, b, incb, buffer, 1); } a += (m + 1) * m / 2 - 1; for (i = 0; i < m; i++) { #ifndef TRANSA if (i > 0) AXPYU_K(i, 0, 0, B[m - i - 1], a + 1, 1, B + m - i, 1, NULL, 0); #endif #ifndef UNIT B[m - i - 1] *= a[0]; #endif #ifdef TRANSA if (i < m - 1) B[m - i - 1] += DOTU_K(m - i - 1, a - (m - i - 1), 1, B, 1); #endif #ifndef TRANSA a -= (i + 2); #else a -= (m - i); #endif } if (incb != 1) { COPY_K(m, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/tpmv_U.c000066400000000000000000000064451313527062700172570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dp1 = 1.; int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; FLOAT *B = b; if (incb != 1) { B = buffer; COPY_K(m, b, incb, buffer, 1); } for (i = 0; i < m; i++) { #ifndef TRANSA if (i > 0) AXPYU_K(i, 0, 0, B[i], a, 1, B, 1, NULL, 0); #endif #ifndef UNIT #ifndef TRANSA B[i] *= a[i]; #else B[i] *= a[0]; #endif #endif #ifdef TRANSA if (i < m - 1) B[i] += DOTU_K(m - i - 1, a + 1, 1, B + i + 1, 1); #endif #ifndef TRANSA a += (i + 1); #else a += (m - i); #endif } if (incb != 1) { COPY_K(m, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/tpmv_thread.c000066400000000000000000000231341313527062700203140ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #include "symcopy.h" #ifndef COMPLEX #ifndef TRANSA #undef TRANS #else #define TRANS #endif #define MYDOT DOTU_K #define MYAXPY AXPYU_K #else #if TRANSA == 1 #undef TRANS #define MYDOT DOTU_K #define MYAXPY AXPYU_K #elif TRANSA == 2 #define TRANS #define MYDOT DOTU_K #define MYAXPY AXPYU_K #elif TRANSA == 3 #undef TRANS #define MYDOT DOTC_K #define MYAXPY AXPYC_K #else #define TRANS #define MYDOT DOTC_K #define MYAXPY AXPYC_K #endif #endif static int tpmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; BLASLONG incx; BLASLONG m_from, m_to; BLASLONG i; #ifdef TRANS #ifndef COMPLEX FLOAT result; #else OPENBLAS_COMPLEX_FLOAT result; #endif #endif #if defined(COMPLEX) && !defined(UNIT) FLOAT ar, ai, xr, xi; #endif a = (FLOAT *)args -> a; x = (FLOAT *)args -> b; y = (FLOAT *)args -> c; incx = args -> ldb; m_from = 0; m_to = args -> m; if (range_m) { m_from = *(range_m + 0); m_to = *(range_m + 1); } if (incx != 1) { #ifndef LOWER COPY_K(m_to, x, incx, buffer, 1); #else COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); #endif x = buffer; buffer += ((COMPSIZE * args -> m + 1023) & ~1023); } #ifndef TRANS if (range_n) y += *range_n * COMPSIZE; #ifndef LOWER SCAL_K(m_to, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y, 1, NULL, 0, NULL, 0); #else SCAL_K(args -> m - m_from, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); #endif #else SCAL_K(m_to - m_from, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); #endif #ifndef LOWER a += (m_from + 1) * m_from / 2 * COMPSIZE; #else a += (2 * args -> m - m_from - 1) * m_from / 2 * COMPSIZE; #endif for (i = m_from; i < m_to; i++) { #ifndef LOWER if (i > 0) { #ifndef TRANS MYAXPY(i, 0, 0, *(x + i * COMPSIZE + 0), #ifdef COMPLEX *(x + i * COMPSIZE + 1), #endif a, 1, y, 1, NULL, 0); #else result = MYDOT(i, a, 1, x, 1); #ifndef COMPLEX *(y + i * COMPSIZE + 0) += result; #else *(y + i * COMPSIZE + 0) += CREAL(result); *(y + i * COMPSIZE + 1) += CIMAG(result); #endif #endif } #endif #ifndef COMPLEX #ifdef UNIT *(y + i * COMPSIZE) += *(x + i * COMPSIZE); #else *(y + i * COMPSIZE) += *(a + i * COMPSIZE) * *(x + i * COMPSIZE); #endif #else #ifdef UNIT *(y + i * COMPSIZE + 0) += *(x + i * COMPSIZE + 0); *(y + i * COMPSIZE + 1) += *(x + i * COMPSIZE + 1); #else ar = *(a + i * COMPSIZE + 0); ai = *(a + i * COMPSIZE + 1); xr = *(x + i * COMPSIZE + 0); xi = *(x + i * COMPSIZE + 1); #if (TRANSA == 1) || (TRANSA == 2) *(y + i * COMPSIZE + 0) += ar * xr - ai * xi; *(y + i * COMPSIZE + 1) += ar * xi + ai * xr; #else *(y + i * COMPSIZE + 0) += ar * xr + ai * xi; *(y + i * COMPSIZE + 1) += ar * xi - ai * xr; #endif #endif #endif #ifdef LOWER if (args -> m > i + 1) { #ifndef TRANS MYAXPY(args -> m - i - 1, 0, 0, *(x + i * COMPSIZE + 0), #ifdef COMPLEX *(x + i * COMPSIZE + 1), #endif a + (i + 1 ) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0); #else result = MYDOT(args -> m - i - 1, a + (i + 1) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1); #ifndef COMPLEX *(y + i * COMPSIZE + 0) += result; #else *(y + i * COMPSIZE + 0) += CREAL(result); *(y + i * COMPSIZE + 1) += CIMAG(result); #endif #endif } #endif #ifndef LOWER a += (i + 1) * COMPSIZE; #else a += (args -> m - i - 1) * COMPSIZE; #endif } return 0; } #ifndef COMPLEX int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT 
*buffer, int nthreads){ #else int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){ #endif blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG range_n[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; double dnum; int mask = 7; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif args.m = m; args.a = (void *)a; args.b = (void *)x; args.c = (void *)(buffer); args.ldb = incx; args.ldc = incx; dnum = (double)m * (double)m / (double)nthreads; num_cpu = 0; #ifndef LOWER range_m[MAX_CPU_NUMBER] = m; i = 0; while (i < m){ if (nthreads - num_cpu > 1) { double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = m - i; } if (width < 16) width = 16; if (width > m - i) width = m - i; } else { width = m - i; } range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); queue[num_cpu].mode = mode; queue[num_cpu].routine = tpmv_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; queue[num_cpu].range_n = &range_n[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #else range_m[0] = 0; i = 0; while (i < m){ if (nthreads - num_cpu > 1) { double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = m - i; } if (width < 16) width = 16; if (width > m - i) width = m - i; } else { width = m - i; } range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); queue[num_cpu].mode = mode; queue[num_cpu].routine = tpmv_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[num_cpu]; queue[num_cpu].range_n = &range_n[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #endif if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } #ifndef TRANS for (i = 1; i < num_cpu; i ++) { #ifndef LOWER AXPYU_K(range_m[MAX_CPU_NUMBER - i], 0, 0, ONE, #ifdef COMPLEX ZERO, #endif buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); #else AXPYU_K(m - range_m[i], 0, 0, ONE, #ifdef COMPLEX ZERO, #endif buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0); #endif } #endif COPY_K(m, buffer, 1, x, incx); return 0; } OpenBLAS-0.2.20/driver/level2/tpsv_L.c000066400000000000000000000064351313527062700172530ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. 
*/ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; FLOAT *B = b; if (incb != 1) { B = buffer; COPY_K(m, b, incb, buffer, 1); } for (i = 0; i < m; i++) { #ifdef TRANSA if (i > 0) B[i] -= DOTU_K(i, a, 1, B, 1); #endif #ifndef UNIT #ifndef TRANSA B[i] /= a[0]; #else B[i] /= a[i]; #endif #endif #ifndef TRANSA if (i < m - 1) { AXPYU_K(m - i - 1 , 0, 0, - B[i], a + 1, 1, B + i + 1, 1, NULL, 0); } #endif #ifndef TRANSA a += (m - i); #else a += (i + 1); #endif } if (incb != 1) { COPY_K(m, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/tpsv_U.c000066400000000000000000000064341313527062700172630ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; FLOAT *B = b; if (incb != 1) { B = buffer; COPY_K(m, b, incb, buffer, 1); } a += (m + 1) * m / 2 - 1; for (i = 0; i < m; i++) { #ifdef TRANSA if (i > 0) B[m - i - 1] -= DOTU_K(i, a + 1, 1, B + m - i, 1); #endif #ifndef UNIT B[m - i - 1] /= a[0]; #endif #ifndef TRANSA if (i < m - 1) AXPYU_K(m - i - 1, 0, 0, -B[m - i - 1], a - (m - i - 1), 1, B, 1, NULL, 0); #endif #ifndef TRANSA a -= (m - i); #else a -= (i + 2); #endif } if (incb != 1) { COPY_K(m, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/trmv_L.c000066400000000000000000000076551313527062700172540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dp1 = 1.; int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *buffer){ BLASLONG i, is, min_i; FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; if (incb != 1) { B = buffer; gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); COPY_K(m, b, incb, buffer, 1); } for (is = m; is > 0; is -= DTB_ENTRIES){ min_i = MIN(is, DTB_ENTRIES); #ifndef TRANSA if (m - is > 0){ GEMV_N(m - is, min_i, 0, dp1, a + is + (is - min_i) * lda, lda, B + is - min_i, 1, B + is, 1, gemvbuffer); } #endif for (i = 0; i < min_i; i++) { FLOAT *AA = a + (is - i - 1) + (is - i - 1) * lda; FLOAT *BB = B + (is - i - 1); #ifndef TRANSA if (i > 0) AXPYU_K(i, 0, 0, BB[0], AA + 1, 1, BB + 1, 1, NULL, 0); #endif #ifndef UNIT BB[0] *= AA[0]; #endif #ifdef TRANSA if (i < min_i - 1) BB[0] += DOTU_K(min_i - i - 1, AA - (min_i - i - 1), 1, BB - (min_i - i - 1), 1); #endif } #ifdef TRANSA if (is - min_i > 0){ GEMV_T(is - min_i, min_i, 0, dp1, a + (is - min_i) * lda, lda, B, 1, B + is - min_i, 1, gemvbuffer); } #endif } if (incb != 1) { COPY_K(m, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/trmv_U.c000066400000000000000000000075451313527062700172630ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dp1 = 1.; int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *buffer){ BLASLONG i, is, min_i; FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; if (incb != 1) { B = buffer; gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); COPY_K(m, b, incb, buffer, 1); } for (is = 0; is < m; is += DTB_ENTRIES){ min_i = MIN(m - is, DTB_ENTRIES); #ifndef TRANSA if (is > 0){ GEMV_N(is, min_i, 0, dp1, a + is * lda, lda, B + is, 1, B, 1, gemvbuffer); } #endif for (i = 0; i < min_i; i++) { FLOAT *AA = a + is + (i + is) * lda; FLOAT *BB = B + is; #ifndef TRANSA if (i > 0) AXPYU_K(i, 0, 0, BB[i], AA, 1, BB, 1, NULL, 0); #endif #ifndef UNIT BB[i] *= AA[i]; #endif #ifdef TRANSA if (i < min_i - 1) BB[i] += DOTU_K(min_i - i - 1, AA + i + 1, 1, BB + i + 1, 1); #endif } #ifdef TRANSA if (m - is > min_i){ GEMV_T(m - is - min_i, min_i, 0, dp1, a + is + min_i + is * lda, lda, B + is + min_i, 1, B + is, 1, gemvbuffer); } #endif } if (incb != 1) { COPY_K(m, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/trmv_thread.c000066400000000000000000000250361313527062700203210ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #include "symcopy.h" #ifndef COMPLEX #ifndef TRANSA #define MYGEMV GEMV_N #undef TRANS #else #define MYGEMV GEMV_T #define TRANS #endif #define MYDOT DOTU_K #define MYAXPY AXPYU_K #else #if TRANSA == 1 #define MYGEMV GEMV_N #undef TRANS #define MYDOT DOTU_K #define MYAXPY AXPYU_K #elif TRANSA == 2 #define MYGEMV GEMV_T #define TRANS #define MYDOT DOTU_K #define MYAXPY AXPYU_K #elif TRANSA == 3 #define MYGEMV GEMV_R #undef TRANS #define MYDOT DOTC_K #define MYAXPY AXPYC_K #else #define MYGEMV GEMV_C #define TRANS #define MYDOT DOTC_K #define MYAXPY AXPYC_K #endif #endif static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; BLASLONG lda, incx; BLASLONG m_from, m_to; BLASLONG i, is, min_i; #ifdef TRANS #ifndef COMPLEX FLOAT result; #else OPENBLAS_COMPLEX_FLOAT result; #endif #endif #if defined(COMPLEX) && !defined(UNIT) FLOAT ar, ai, xr, xi; #endif a = (FLOAT *)args -> a; x = (FLOAT *)args -> b; y = (FLOAT *)args -> c; lda = args -> lda; incx = args -> ldb; m_from = 0; m_to = args -> m; if (range_m) { m_from = *(range_m + 0); m_to = *(range_m + 1); } if (incx != 1) { #ifndef LOWER COPY_K(m_to, x, incx, buffer, 1); #else COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); #endif x = buffer; buffer += ((COMPSIZE * args -> m + 3) & ~3); } #ifndef TRANS if (range_n) y += *range_n * COMPSIZE; #ifndef LOWER SCAL_K(m_to, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y, 1, NULL, 0, NULL, 0); #else SCAL_K(args -> m - m_from, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); #endif #else SCAL_K(m_to - m_from, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); #endif for (is = m_from; is < m_to; is += DTB_ENTRIES){ min_i = MIN(m_to - is, DTB_ENTRIES); #ifndef LOWER if (is > 0){ MYGEMV(is, min_i, 0, ONE, #ifdef COMPLEX ZERO, #endif a + is * lda * COMPSIZE, lda, #ifndef TRANS x + is * COMPSIZE, 1, y, 1, #else x, 1, y + is * COMPSIZE, 1, #endif buffer); } #endif for (i = is; i < is + min_i; i++) { #ifndef LOWER if (i - is > 0) { #ifndef TRANS MYAXPY(i - is, 0, 0, *(x + i * COMPSIZE + 0), #ifdef COMPLEX *(x + i * COMPSIZE + 1), #endif a + (is + i * lda) * COMPSIZE, 1, y + is * COMPSIZE, 1, NULL, 0); #else result = MYDOT(i - is, a + (is + i * lda) * COMPSIZE, 1, x + is * COMPSIZE, 1); #ifndef COMPLEX *(y + i * COMPSIZE + 0) += result; #else *(y + i * COMPSIZE + 0) += CREAL(result); *(y + i * COMPSIZE + 1) += CIMAG(result); #endif #endif } #endif #ifndef COMPLEX #ifdef UNIT *(y + i * COMPSIZE) += *(x + i * COMPSIZE); #else *(y + i * COMPSIZE) += *(a + (i + i * lda) * COMPSIZE) * *(x + i * COMPSIZE); #endif #else #ifdef UNIT *(y + i * COMPSIZE + 0) += *(x + i * COMPSIZE + 0); *(y + i * COMPSIZE + 1) += *(x + i * COMPSIZE + 1); #else ar = *(a + (i + i * lda) * COMPSIZE + 0); ai = *(a + (i + i * lda) * COMPSIZE + 1); xr = *(x + i * COMPSIZE + 0); xi = *(x + i * COMPSIZE + 1); #if (TRANSA == 1) || (TRANSA == 2) *(y + i * COMPSIZE + 0) += ar * xr - ai * xi; *(y + i * COMPSIZE + 1) += ar * xi + ai * xr; #else *(y + i * COMPSIZE + 0) += ar * xr + ai * xi; *(y + i * COMPSIZE + 1) += ar * xi - ai * xr; #endif #endif #endif #ifdef LOWER if (is + min_i > i + 1) { #ifndef TRANS MYAXPY(is + min_i - i - 1, 0, 0, *(x + i * COMPSIZE + 0), #ifdef COMPLEX *(x + i * COMPSIZE + 1), #endif a + (i + 1 + i * lda) * COMPSIZE, 1, y 
+ (i + 1) * COMPSIZE, 1, NULL, 0); #else result = MYDOT(is + min_i - i - 1, a + (i + 1 + i * lda) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1); #ifndef COMPLEX *(y + i * COMPSIZE + 0) += result; #else *(y + i * COMPSIZE + 0) += CREAL(result); *(y + i * COMPSIZE + 1) += CIMAG(result); #endif #endif } #endif } #ifdef LOWER if (args -> m > is + min_i){ MYGEMV(args -> m - is - min_i, min_i, 0, ONE, #ifdef COMPLEX ZERO, #endif a + (is + min_i + is * lda) * COMPSIZE, lda, #ifndef TRANS x + is * COMPSIZE, 1, y + (is + min_i) * COMPSIZE, 1, #else x + (is + min_i) * COMPSIZE, 1, y + is * COMPSIZE, 1, #endif buffer); } #endif } return 0; } #ifndef COMPLEX int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){ #else int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){ #endif blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG range_n[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; double dnum; int mask = 7; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif args.m = m; args.a = (void *)a; args.b = (void *)x; args.c = (void *)(buffer); args.lda = lda; args.ldb = incx; args.ldc = incx; dnum = (double)m * (double)m / (double)nthreads; num_cpu = 0; #ifndef LOWER range_m[MAX_CPU_NUMBER] = m; i = 0; while (i < m){ if (nthreads - num_cpu > 1) { double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = m - i; } if (width < 16) width = 16; if (width > m - i) width = m - i; } else { width = m - i; } range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; queue[num_cpu].range_n = &range_n[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #else range_m[0] = 0; i = 0; while (i < m){ if (nthreads - num_cpu > 1) { double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = m - i; } if (width < 16) width = 16; if (width > m - i) width = m - i; } else { width = m - i; } range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[num_cpu]; queue[num_cpu].range_n = &range_n[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #endif if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer + num_cpu * (((m + 3) & ~3) + 16) * COMPSIZE; queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } #ifndef TRANS for (i = 1; i < num_cpu; i ++) { #ifndef LOWER AXPYU_K(range_m[MAX_CPU_NUMBER - i], 0, 0, ONE, #ifdef COMPLEX ZERO, #endif buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); #else AXPYU_K(m - range_m[i], 0, 0, ONE, #ifdef COMPLEX ZERO, #endif 
buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0); #endif } #endif COPY_K(m, buffer, 1, x, incx); return 0; } OpenBLAS-0.2.20/driver/level2/trsv_L.c000066400000000000000000000076721313527062700172610ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dm1 = -1.; #undef GEMV_UNROLL #define GEMV_UNROLL DTB_ENTRIES int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i, is, min_i; FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; if (incb != 1) { B = buffer; gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); COPY_K(m, b, incb, buffer, 1); } for (is = 0; is < m; is += GEMV_UNROLL){ min_i = MIN(m - is, GEMV_UNROLL); #ifdef TRANSA if (is > 0){ GEMV_T(is, min_i, 0, dm1, a + is * lda , lda, B, 1, B + is, 1, gemvbuffer); } #endif for (i = 0; i < min_i; i++) { FLOAT *AA = a + is + (i + is) * lda; FLOAT *BB = B + is; #ifdef TRANSA if (i > 0) BB[i] -= DOTU_K(i, AA, 1, BB, 1); #endif #ifndef UNIT BB[i] /= AA[i]; #endif #ifndef TRANSA if (i < min_i - 1) { AXPYU_K(min_i - i - 1 , 0, 0, - BB[i], AA + i + 1, 1, BB + i + 1, 1, NULL, 0); } #endif } #ifndef TRANSA if (m - is > min_i){ GEMV_N(m - is - min_i, min_i, 0, dm1, a + is + min_i + is * lda, lda, B + is, 1, B + (is + min_i), 1, gemvbuffer); } #endif } if (incb != 1) { COPY_K(m, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/trsv_U.c000066400000000000000000000076571313527062700172750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dm1 = -1.; int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i, is, min_i; FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; if (incb != 1) { B = buffer; gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); COPY_K(m, b, incb, buffer, 1); } for (is = m; is > 0; is -= DTB_ENTRIES){ min_i = MIN(is, DTB_ENTRIES); #ifdef TRANSA if (m - is > 0){ GEMV_T(m - is, min_i, 0, dm1, a + is + (is - min_i) * lda, lda, B + is, 1, B + is - min_i, 1, gemvbuffer); } #endif for (i = 0; i < min_i; i++) { FLOAT *AA = a + (is - i - 1) + (is - i - 1) * lda; FLOAT *BB = B + (is - i - 1); #ifdef TRANSA if (i > 0) BB[0] -= DOTU_K(i, AA + 1, 1, BB + 1, 1); #endif #ifndef UNIT BB[0] /= AA[0]; #endif #ifndef TRANSA if (i < min_i - 1) AXPYU_K(min_i - i - 1, 0, 0, -BB[0], AA - (min_i - i - 1), 1, BB - (min_i - i - 1), 1, NULL, 0); #endif } #ifndef TRANSA if (is - min_i > 0){ GEMV_N(is - min_i, min_i, 0, dm1, a + (is - min_i) * lda, lda, B + is - min_i, 1, B, 1, gemvbuffer); } #endif } if (incb != 1) { COPY_K(m, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/zgbmv_k.c000066400000000000000000000115561313527062700174430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #ifndef XCONJ #ifndef CONJ #define ZAXPY AXPYU_K #define ZDOT DOTU_K #else #define ZAXPY AXPYC_K #define ZDOT DOTC_K #endif #else #ifndef CONJ #define ZAXPY AXPYU_K #define ZDOT DOTC_K #else #define ZAXPY AXPYC_K #define ZDOT DOTU_K #endif #endif #ifndef TRANS #define M m #define N n #else #define N m #define M n #endif void CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ BLASLONG i, offset_u, offset_l, start, end, length; FLOAT *X = x; FLOAT *Y = y; FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *bufferY = gemvbuffer; FLOAT *bufferX = gemvbuffer; #ifdef TRANS OPENBLAS_COMPLEX_FLOAT temp; #endif if (incy != 1) { Y = bufferY; bufferX = (FLOAT *)(((BLASLONG)bufferY + M * sizeof(FLOAT) * 2 + 4095) & ~4095); gemvbuffer = bufferX; COPY_K(M, y, incy, Y, 1); } if (incx != 1) { X = bufferX; gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + N * sizeof(FLOAT) * 2 + 4095) & ~4095); COPY_K(N, x, incx, X, 1); } offset_u = ku; offset_l = ku + m; for (i = 0; i < MIN(n, m + ku); i++) { start = MAX(offset_u, 0); end = MIN(offset_l, ku + kl + 1); length = end - start; #ifndef TRANS ZAXPY(length, 0, 0, #ifndef XCONJ alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], #else alpha_r * X[i * 2 + 0] + alpha_i * X[i * 2 + 1], alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1], #endif a + start * 2, 1, Y + (start - offset_u) * 2, 1, NULL, 0); #else #ifndef XCONJ temp = ZDOT(length, a + start * 2, 1, X + (start - offset_u) * 2, 1); #else temp = ZDOT(length, X + (start - offset_u) * 2, 1, a + start * 2, 1); #endif #if !defined(XCONJ) || !defined(CONJ) Y[i * 2 + 0] += alpha_r * CREAL(temp) - alpha_i * CIMAG(temp); Y[i * 2 + 1] += alpha_i * CREAL(temp) + alpha_r * CIMAG(temp); #else Y[i * 2 + 0] += alpha_r * CREAL(temp) + alpha_i * CIMAG(temp); Y[i * 2 + 1] += alpha_i * CREAL(temp) - alpha_r * CIMAG(temp); #endif #endif offset_u --; offset_l --; a += lda * 2; } if (incy != 1) { COPY_K(M, Y, 1, y, incy); } return; } OpenBLAS-0.2.20/driver/level2/zhbmv_k.c000066400000000000000000000151601313527062700174370ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ BLASLONG i, length; #ifndef LOWER BLASLONG offset; #endif FLOAT *X = x; FLOAT *Y = y; FLOAT *sbmvbuffer = (FLOAT *)buffer; FLOAT *bufferY = sbmvbuffer; FLOAT *bufferX = sbmvbuffer; FLOAT temp[2]; OPENBLAS_COMPLEX_FLOAT result; if (incy != 1) { Y = bufferY; bufferX = (FLOAT *)(((BLASLONG)bufferY + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); sbmvbuffer = bufferX; COPY_K(n, y, incy, Y, 1); } if (incx != 1) { X = bufferX; sbmvbuffer = (FLOAT *)(((BLASLONG)bufferX + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); COPY_K(n, x, incx, X, 1); } #ifndef LOWER offset = k; #endif for (i = 0; i < n; i++) { #ifndef HEMVREV #ifndef LOWER length = k - offset; if (length > 0) { AXPYU_K(length, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a + offset * COMPSIZE, 1, Y + (i - length) * COMPSIZE, 1, NULL, 0); } temp[0] = a[k * 2 + 0] * X[i * 2 + 0]; temp[1] = a[k * 2 + 0] * X[i * 2 + 1]; Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; if (length > 0) { result = DOTC_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); } if (offset > 0) offset --; #else length = k; if (n - i - 1 < k) length = n - i - 1; if (length > 0) { AXPYU_K(length, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a + COMPSIZE, 1, Y + (i + 1) * COMPSIZE, 1, NULL, 0); } temp[0] = a[0] * X[i * 2 + 0]; temp[1] = a[0] * X[i * 2 + 1]; Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; if (length > 0) { result = DOTC_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); } #endif #else #ifndef LOWER length = k - offset; if (length > 0) { AXPYC_K(length, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a + offset * COMPSIZE, 1, Y + (i - length) * COMPSIZE, 1, NULL, 0); } temp[0] = a[k * 2 + 0] * X[i * 2 + 0]; temp[1] = a[k * 2 + 0] * X[i * 2 + 1]; Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; if (length > 0) { result = DOTU_K(length, a + 
offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); } if (offset > 0) offset --; #else length = k; if (n - i - 1 < k) length = n - i - 1; if (length > 0) { AXPYC_K(length, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a + COMPSIZE, 1, Y + (i + 1) * COMPSIZE, 1, NULL, 0); } temp[0] = a[0] * X[i * 2 + 0]; temp[1] = a[0] * X[i * 2 + 1]; Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; if (length > 0) { result = DOTU_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); } #endif #endif a += lda * 2; } if (incy != 1) { COPY_K(n, Y, 1, y, incy); } return 0; } OpenBLAS-0.2.20/driver/level2/zher2_k.c000066400000000000000000000112471313527062700173450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){ BLASLONG i; FLOAT *X, *Y; X = x; Y = y; lda *= 2; if (incx != 1) { COPY_K(m, x, incx, buffer, 1); X = buffer; } if (incy != 1) { COPY_K(m, y, incy, (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)), 1); Y = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)); } for (i = 0; i < m; i++){ #ifndef HEMVREV #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], - alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1], Y, 1, a, 1, NULL, 0); AXPYU_K(i + 1, 0, 0, alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], alpha_i * Y[i * 2 + 0] - alpha_r * Y[i * 2 + 1], X, 1, a, 1, NULL, 0); a[i * 2 + 1] = ZERO; a += lda; #else AXPYU_K(m - i, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], - alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1], Y + i * 2, 1, a, 1, NULL, 0); AXPYU_K(m - i, 0, 0, alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], alpha_i * Y[i * 2 + 0] - alpha_r * Y[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0); a[1] = ZERO; a += 2 + lda; #endif #else #ifndef LOWER AXPYC_K(i + 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], Y, 1, a, 1, NULL, 0); AXPYC_K(i + 1, 0, 0, alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], - alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], X, 1, a, 1, NULL, 0); a[i * 2 + 1] = ZERO; a += lda; #else AXPYC_K(m - i, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], Y + i * 2, 1, a, 1, NULL, 0); AXPYC_K(m - i, 0, 0, alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], - alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0); a[1] = ZERO; a += 2 + lda; #endif #endif } return 0; } OpenBLAS-0.2.20/driver/level2/zher_k.c000066400000000000000000000070141313527062700172600ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include "common.h" int CNAME(BLASLONG m, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG lda, FLOAT *buffer){ BLASLONG i; FLOAT *X; X = x; lda *= 2; if (incx != 1) { COPY_K(m, x, incx, buffer, 1); X = buffer; } for (i = 0; i < m; i++){ #ifndef HEMVREV #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha * X[i * 2 + 0], -alpha * X[i * 2 + 1], X, 1, a, 1, NULL, 0); a[i * 2 + 1] = ZERO; a += lda; #else AXPYU_K(m - i, 0, 0, alpha * X[i * 2 + 0], -alpha * X[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0); a[1] = ZERO; a += 2 + lda; #endif #else #ifndef LOWER AXPYC_K(i + 1, 0, 0, alpha * X[i * 2 + 0], alpha * X[i * 2 + 1], X, 1, a, 1, NULL, 0); a[i * 2 + 1] = ZERO; a += lda; #else AXPYC_K(m - i, 0, 0, alpha * X[i * 2 + 0], alpha * X[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0); a[1] = ZERO; a += 2 + lda; #endif #endif } return 0; } OpenBLAS-0.2.20/driver/level2/zhpmv_k.c000066400000000000000000000142001313527062700174470ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ BLASLONG i; FLOAT *X = x; FLOAT *Y = y; FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *bufferY = gemvbuffer; FLOAT *bufferX = gemvbuffer; FLOAT temp[2]; OPENBLAS_COMPLEX_FLOAT result; if (incy != 1) { Y = bufferY; bufferX = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) * 2 + 4095) & ~4095); gemvbuffer = bufferX; COPY_K(m, y, incy, Y, 1); } if (incx != 1) { X = bufferX; gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + m * sizeof(FLOAT) * 2 + 4095) & ~4095); COPY_K(m, x, incx, X, 1); } for (i = 0; i < m; i++) { #ifndef HEMVREV #ifndef LOWER if (i > 0) { result = DOTC_K(i, a, 1, X, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); } temp[0] = a[i * 2 + 0] * X[i * 2 + 0]; temp[1] = a[i * 2 + 0] * X[i * 2 + 1]; Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; if (i > 0) { AXPYU_K(i, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a, 1, Y, 1, NULL, 0); } a += (i + 1) * 2; #else if (m - i > 1) { result = DOTC_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); } temp[0] = a[i * 2 + 0] * X[i * 2 + 0]; temp[1] = a[i * 2 + 0] * X[i * 2 + 1]; Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; if (m - i > 1) { AXPYU_K(m - i - 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a + (i + 1) * 2, 1, Y + (i + 1) * 2, 1, NULL, 0); } a += (m - i - 1) * 2; #endif #else #ifndef LOWER if (i > 0) { result = DOTU_K(i, a, 1, X, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); } temp[0] = a[i * 2 + 0] * X[i * 2 + 0]; temp[1] = a[i * 2 + 0] * X[i * 2 + 1]; Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; if (i > 0) { AXPYC_K(i, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a, 1, Y, 1, NULL, 0); } a += (i + 1) * 2; #else if (m - i > 1) { result = DOTU_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); } temp[0] = a[i * 2 + 0] * X[i * 2 + 0]; temp[1] = a[i * 2 + 0] * X[i * 2 + 1]; Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; if (m - i > 1) { AXPYC_K(m - i - 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a + (i + 1) * 2, 1, Y + (i + 1) * 2, 1, NULL, 0); } a += (m - i - 1) * 2; #endif #endif } if (incy != 1) { COPY_K(m, Y, 1, y, incy); } return 0; } 
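/*
 * Usage sketch: the zhpmv_k.c kernel above performs the packed Hermitian
 * matrix-vector update y := alpha*A*x + y that callers normally reach
 * through the standard CBLAS entry point cblas_zhpmv() declared in
 * cblas.h, which OpenBLAS exports.  The minimal standalone example below
 * assumes only that standard CBLAS interface (cblas_zhpmv, CblasColMajor,
 * CblasUpper); it is an illustration, not part of the library build.
 * Something like `gcc example.c -lopenblas` should link it.
 */
#include <stdio.h>
#include <cblas.h>

int main(void) {
  /* 2x2 Hermitian matrix with its upper triangle packed column by
   * column (column-major):   A = [ 2    1-i ]
   *                              [ 1+i  3   ]
   * Packed order: a11, a12, a22; each entry stored as {real, imag}. */
  double ap[]    = { 2.0, 0.0,   1.0, -1.0,   3.0, 0.0 };
  double x[]     = { 1.0, 0.0,   0.0,  1.0 };   /* x = (1, i) */
  double y[]     = { 0.0, 0.0,   0.0,  0.0 };
  double alpha[] = { 1.0, 0.0 };                /* alpha = 1  */
  double beta[]  = { 0.0, 0.0 };                /* beta  = 0  */

  /* y := alpha*A*x + beta*y; for this data the result is (3+i, 1+4i). */
  cblas_zhpmv(CblasColMajor, CblasUpper, 2, alpha, ap, x, 1, beta, y, 1);

  printf("y = (%g%+gi, %g%+gi)\n", y[0], y[1], y[2], y[3]);
  return 0;
}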
OpenBLAS-0.2.20/driver/level2/zhpr2_k.c000066400000000000000000000112431313527062700173540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer){ BLASLONG i; FLOAT *X, *Y; X = x; Y = y; if (incx != 1) { COPY_K(m, x, incx, buffer, 1); X = buffer; } if (incy != 1) { COPY_K(m, y, incy, (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)), 1); Y = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)); } for (i = 0; i < m; i++){ #ifndef HEMVREV #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], - alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1], Y, 1, a, 1, NULL, 0); AXPYU_K(i + 1, 0, 0, alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], alpha_i * Y[i * 2 + 0] - alpha_r * Y[i * 2 + 1], X, 1, a, 1, NULL, 0); a[i * 2 + 1] = ZERO; a += (i + 1) * 2; #else AXPYU_K(m - i, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], - alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1], Y + i * 2, 1, a, 1, NULL, 0); AXPYU_K(m - i, 0, 0, alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], alpha_i * Y[i * 2 + 0] - alpha_r * Y[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0); a[1] = ZERO; a += (m - i) * 2; #endif #else #ifndef LOWER AXPYC_K(i + 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], Y, 1, a, 1, NULL, 0); AXPYC_K(i + 1, 0, 0, alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], - alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], X, 1, a, 1, NULL, 0); a[i * 2 + 1] = ZERO; a += (i + 1) * 2; #else AXPYC_K(m - i, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], Y + i * 2, 1, a, 1, NULL, 0); AXPYC_K(m - i, 0, 0, alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], - alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0); a[1] = ZERO; a += (m - i) * 2; #endif #endif } return 0; } OpenBLAS-0.2.20/driver/level2/zhpr_k.c000066400000000000000000000070121313527062700172710ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include "common.h" int CNAME(BLASLONG m, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *buffer){ BLASLONG i; FLOAT *X; X = x; if (incx != 1) { COPY_K(m, x, incx, buffer, 1); X = buffer; } for (i = 0; i < m; i++){ #ifndef HEMVREV #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha * X[i * 2 + 0], -alpha * X[i * 2 + 1], X, 1, a, 1, NULL, 0); a[i * 2 + 1] = ZERO; a += (i + 1) * 2; #else AXPYU_K(m - i, 0, 0, alpha * X[i * 2 + 0], -alpha * X[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0); a[1] = ZERO; a += (m - i) * 2; #endif #else #ifndef LOWER AXPYC_K(i + 1, 0, 0, alpha * X[i * 2 + 0], alpha * X[i * 2 + 1], X, 1, a, 1, NULL, 0); a[i * 2 + 1] = ZERO; a += (i + 1) * 2; #else AXPYC_K(m - i, 0, 0, alpha * X[i * 2 + 0], alpha * X[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0); a[1] = ZERO; a += (m - i) * 2; #endif #endif } return 0; } OpenBLAS-0.2.20/driver/level2/zsbmv_k.c000066400000000000000000000111271313527062700174510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ BLASLONG i, length; #ifndef LOWER BLASLONG offset; #endif FLOAT *X = x; FLOAT *Y = y; FLOAT *sbmvbuffer = (FLOAT *)buffer; FLOAT *bufferY = sbmvbuffer; FLOAT *bufferX = sbmvbuffer; OPENBLAS_COMPLEX_FLOAT result; if (incy != 1) { Y = bufferY; bufferX = (FLOAT *)(((BLASLONG)bufferY + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); sbmvbuffer = bufferX; COPY_K(n, y, incy, Y, 1); } if (incx != 1) { X = bufferX; sbmvbuffer = (FLOAT *)(((BLASLONG)bufferX + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); COPY_K(n, x, incx, X, 1); } #ifndef LOWER offset = k; #endif for (i = 0; i < n; i++) { #ifndef LOWER length = k - offset; AXPYU_K(length + 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a + offset * COMPSIZE, 1, Y + (i - length) * COMPSIZE, 1, NULL, 0); if (length > 0) { result = DOTU_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); } if (offset > 0) offset --; #else length = k; if (n - i - 1 < k) length = n - i - 1; AXPYU_K(length + 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a, 1, Y + i * COMPSIZE, 1, NULL, 0); if (length > 0) { result = DOTU_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); } #endif a += lda * 2; } if (incy != 1) { COPY_K(n, Y, 1, y, incy); } return 0; } OpenBLAS-0.2.20/driver/level2/zspmv_k.c000066400000000000000000000104031313527062700174630ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ BLASLONG i; FLOAT *X = x; FLOAT *Y = y; FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *bufferY = gemvbuffer; FLOAT *bufferX = gemvbuffer; OPENBLAS_COMPLEX_FLOAT result; if (incy != 1) { Y = bufferY; bufferX = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) * 2 + 4095) & ~4095); gemvbuffer = bufferX; COPY_K(m, y, incy, Y, 1); } if (incx != 1) { X = bufferX; gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + m * sizeof(FLOAT) * 2 + 4095) & ~4095); COPY_K(m, x, incx, X, 1); } for (i = 0; i < m; i++) { #ifndef LOWER if (i > 0) { result = DOTU_K(i, a, 1, X, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); } AXPYU_K(i + 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a, 1, Y, 1, NULL, 0); a += (i + 1) * 2; #else result = DOTU_K(m - i, a + i * 2, 1, X + i * 2, 1); Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); if (m - i > 1) AXPYU_K(m - i - 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a + (i + 1) * 2, 1, Y + (i + 1) * 2, 1, NULL, 0); a += (m - i - 1) * 2; #endif } if (incy != 1) { COPY_K(m, Y, 1, y, incy); } return 0; } OpenBLAS-0.2.20/driver/level2/zspr2_k.c000066400000000000000000000075131313527062700173740ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer){ BLASLONG i; FLOAT *X, *Y; X = x; Y = y; if (incx != 1) { COPY_K(m, x, incx, buffer, 1); X = buffer; } if (incy != 1) { COPY_K(m, y, incy, (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)), 1); Y = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)); } for (i = 0; i < m; i++){ #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], Y, 1, a, 1, NULL, 0); AXPYU_K(i + 1, 0, 0, alpha_r * Y[i * 2 + 0] - alpha_i * Y[i * 2 + 1], alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], X, 1, a, 1, NULL, 0); a += (i + 1) * 2; #else AXPYU_K(m - i, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], Y + i * 2, 1, a, 1, NULL, 0); AXPYU_K(m - i, 0, 0, alpha_r * Y[i * 2 + 0] - alpha_i * Y[i * 2 + 1], alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0); a += (m - i) * 2; #endif } return 0; } OpenBLAS-0.2.20/driver/level2/zspr_k.c000066400000000000000000000066641313527062700173200ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include "common.h" int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *buffer){ BLASLONG i; FLOAT *X; X = x; if (incx != 1) { COPY_K(m, x, incx, buffer, 1); X = buffer; } for (i = 0; i < m; i++){ #ifndef LOWER if ((X[i * 2 + 0] != ZERO) && (X[i * 2 + 1] != ZERO)) { AXPYU_K(i + 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], X, 1, a, 1, NULL, 0); } a += (i + 1) * 2; #else if ((X[i * 2 + 0] != ZERO) && (X[i * 2 + 1] != ZERO)) { AXPYU_K(m - i, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0); } a += (m - i) * 2; #endif } return 0; } OpenBLAS-0.2.20/driver/level2/zsyr2_k.c000066400000000000000000000075321313527062700174060ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){ BLASLONG i; FLOAT *X, *Y; X = x; Y = y; lda *= 2; if (incx != 1) { COPY_K(m, x, incx, buffer, 1); X = buffer; } if (incy != 1) { COPY_K(m, y, incy, (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)), 1); Y = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)); } for (i = 0; i < m; i++){ #ifndef LOWER AXPYU_K(i + 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], Y, 1, a, 1, NULL, 0); AXPYU_K(i + 1, 0, 0, alpha_r * Y[i * 2 + 0] - alpha_i * Y[i * 2 + 1], alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], X, 1, a, 1, NULL, 0); a += lda; #else AXPYU_K(m - i, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], Y + i * 2, 1, a, 1, NULL, 0); AXPYU_K(m - i, 0, 0, alpha_r * Y[i * 2 + 0] - alpha_i * Y[i * 2 + 1], alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0); a += 2 + lda; #endif } return 0; } OpenBLAS-0.2.20/driver/level2/zsyr_k.c000066400000000000000000000067021313527062700173220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include "common.h" int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG lda, FLOAT *buffer){ BLASLONG i; FLOAT *X; X = x; lda *= 2; if (incx != 1) { COPY_K(m, x, incx, buffer, 1); X = buffer; } for (i = 0; i < m; i++){ #ifndef LOWER if ((X[i * 2 + 0] != ZERO) || (X[i * 2 + 1] != ZERO)) { AXPYU_K(i + 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], X, 1, a, 1, NULL, 0); } a += lda; #else if ((X[i * 2 + 0] != ZERO) || (X[i * 2 + 1] != ZERO)) { AXPYU_K(m - i, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0); } a += 2 + lda; #endif } return 0; } OpenBLAS-0.2.20/driver/level2/ztbmv_L.c000066400000000000000000000107711313527062700174170ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; FLOAT *B = b; BLASLONG length; #if (TRANSA == 2) || (TRANSA == 4) OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT atemp1, atemp2, btemp1, btemp2; #endif if (incb != 1) { B = buffer; COPY_K(n, b, incb, buffer, 1); } a += (n - 1) * lda * COMPSIZE; for (i = n - 1; i >= 0; i--) { #if (TRANSA == 1) || (TRANSA == 3) length = n - i - 1; if (length > k) length = k; if (length > 0) { #if TRANSA == 1 AXPYU_K(length, 0, 0, B[i * 2 + 0], B[i * 2 + 1], a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1, NULL, 0); #else AXPYC_K(length, 0, 0, B[i * 2 + 0], B[i * 2 + 1], a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1, NULL, 0); #endif } #endif #ifndef UNIT #if (TRANSA == 1) || (TRANSA == 3) atemp1 = a[0]; atemp2 = a[1]; #else atemp1 = a[k * 2 + 0]; atemp2 = a[k * 2 + 1]; #endif btemp1 = B[i * 2 + 0]; btemp2 = B[i * 2 + 1]; #if (TRANSA == 1) || (TRANSA == 2) B[i * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2; B[i * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1; #else B[i * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2; B[i * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1; #endif #endif #if (TRANSA == 2) || (TRANSA == 4) length = i; if (length > k) length = k; if (length > 0) { #if TRANSA == 2 temp = DOTU_K(length, a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1); #else temp = DOTC_K(length, a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1); #endif B[i * 2 + 0] += CREAL(temp); B[i * 2 + 1] += CIMAG(temp); } #endif a -= lda * COMPSIZE; } if (incb != 1) { COPY_K(n, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/ztbmv_U.c000066400000000000000000000107231313527062700174250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; FLOAT *B = b; BLASLONG length; #if (TRANSA == 2) || (TRANSA == 4) OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT atemp1, atemp2, btemp1, btemp2; #endif if (incb != 1) { B = buffer; COPY_K(n, b, incb, buffer, 1); } for (i = 0; i < n; i++) { #if (TRANSA == 1) || (TRANSA == 3) length = i; if (length > k) length = k; if (length > 0) { #if TRANSA == 1 AXPYU_K(length, 0, 0, B[i * 2 + 0], B[i * 2 + 1], a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1, NULL, 0); #else AXPYC_K(length, 0, 0, B[i * 2 + 0], B[i * 2 + 1], a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1, NULL, 0); #endif } #endif #ifndef UNIT #if (TRANSA == 1) || (TRANSA == 3) atemp1 = a[k * 2 + 0]; atemp2 = a[k * 2 + 1]; #else atemp1 = a[0]; atemp2 = a[1]; #endif btemp1 = B[i * 2 + 0]; btemp2 = B[i * 2 + 1]; #if (TRANSA == 1) || (TRANSA == 2) B[i * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2; B[i * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1; #else B[i * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2; B[i * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1; #endif #endif #if (TRANSA == 2) || (TRANSA == 4) length = n - i - 1; if (length > k) length = k; if (length > 0) { #if TRANSA == 2 temp = DOTU_K(length, a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1); #else temp = DOTC_K(length, a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1); #endif B[i * 2 + 0] += CREAL(temp); B[i * 2 + 1] += CIMAG(temp); } #endif a += lda * COMPSIZE; } if (incb != 1) { COPY_K(n, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/ztbsv_L.c000066400000000000000000000111221313527062700174140ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; FLOAT *B = b; BLASLONG length; #if (TRANSA == 2) || (TRANSA == 4) OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT ar, ai, br, bi, ratio, den; #endif if (incb != 1) { B = buffer; COPY_K(n, b, incb, buffer, 1); } for (i = 0; i < n; i++) { #if (TRANSA == 2) || (TRANSA == 4) length = i; if (length > k) length = k; if (length > 0) { #if TRANSA == 2 temp = DOTU_K(length, a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1); #else temp = DOTC_K(length, a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1); #endif B[i * 2 + 0] -= CREAL(temp); B[i * 2 + 1] -= CIMAG(temp); } #endif #ifndef UNIT #if (TRANSA == 1) || (TRANSA == 3) ar = a[0]; ai = a[1]; #else ar = a[k * 2 + 0]; ai = a[k * 2 + 1]; #endif if (fabs(ar) >= fabs(ai)){ ratio = ai / ar; den = 1./(ar * ( 1 + ratio * ratio)); ar = den; #if TRANSA < 3 ai = -ratio * den; #else ai = ratio * den; #endif } else { ratio = ar / ai; den = 1./(ai * ( 1 + ratio * ratio)); ar = ratio * den; #if TRANSA < 3 ai = -den; #else ai = den; #endif } br = B[i * 2 + 0]; bi = B[i * 2 + 1]; B[i * 2 + 0] = ar*br - ai*bi; B[i * 2 + 1] = ar*bi + ai*br; #endif #if (TRANSA == 1) || (TRANSA == 3) length = n - i - 1; if (length > k) length = k; if (length > 0) { #if TRANSA == 1 AXPYU_K(length, 0, 0, -B[i * 2 + 0], -B[i * 2 + 1], a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1, NULL, 0); #else AXPYC_K(length, 0, 0, -B[i * 2 + 0], -B[i * 2 + 1], a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1, NULL, 0); #endif } #endif a += lda * COMPSIZE; } if (incb != 1) { COPY_K(n, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/ztbsv_U.c000066400000000000000000000111721313527062700174320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; FLOAT *B = b; BLASLONG length; #if (TRANSA == 2) || (TRANSA == 4) OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT ar, ai, br, bi, ratio, den; #endif if (incb != 1) { B = buffer; COPY_K(n, b, incb, buffer, 1); } a += (n - 1) * lda * COMPSIZE; for (i = n - 1; i >= 0; i--) { #if (TRANSA == 2) || (TRANSA == 4) length = n - i - 1; if (length > k) length = k; if (length > 0) { #if TRANSA == 2 temp = DOTU_K(length, a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1); #else temp = DOTC_K(length, a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1); #endif B[i * 2 + 0] -= CREAL(temp); B[i * 2 + 1] -= CIMAG(temp); } #endif #ifndef UNIT #if (TRANSA == 1) || (TRANSA == 3) ar = a[k * 2 + 0]; ai = a[k * 2 + 1]; #else ar = a[0]; ai = a[1]; #endif if (fabs(ar) >= fabs(ai)){ ratio = ai / ar; den = 1./(ar * ( 1 + ratio * ratio)); ar = den; #if TRANSA < 3 ai = -ratio * den; #else ai = ratio * den; #endif } else { ratio = ar / ai; den = 1./(ai * ( 1 + ratio * ratio)); ar = ratio * den; #if TRANSA < 3 ai = -den; #else ai = den; #endif } br = B[i * 2 + 0]; bi = B[i * 2 + 1]; B[i * 2 + 0] = ar*br - ai*bi; B[i * 2 + 1] = ar*bi + ai*br; #endif #if (TRANSA == 1) || (TRANSA == 3) length = i; if (length > k) length = k; if (length > 0) { #if TRANSA == 1 AXPYU_K(length, 0, 0, -B[i * 2 + 0], -B[i * 2 + 1], a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1, NULL, 0); #else AXPYC_K(length, 0, 0, -B[i * 2 + 0], -B[i * 2 + 1], a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1, NULL, 0); #endif } #endif a -= lda * COMPSIZE; } if (incb != 1) { COPY_K(n, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/ztpmv_L.c000066400000000000000000000104051313527062700174270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; #if (TRANSA == 2) || (TRANSA == 4) OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT atemp1, atemp2, btemp1, btemp2; #endif FLOAT *B = b; if (incb != 1) { B = buffer; COPY_K(m, b, incb, buffer, 1); } a += (m + 1) * m - 2; for (i = 0; i < m; i++) { #if (TRANSA == 1) || (TRANSA == 3) #if TRANSA == 1 if (i > 0) AXPYU_K (i, 0, 0, B[(m - i - 1) * 2 + 0], B[(m - i - 1) * 2 + 1], a + 2, 1, B + (m - i) * 2, 1, NULL, 0); #else if (i > 0) AXPYC_K(i, 0, 0, B[(m - i - 1) * 2 + 0], B[(m - i - 1) * 2 + 1], a + 2, 1, B + (m - i) * 2, 1, NULL, 0); #endif #endif #ifndef UNIT atemp1 = a[0]; atemp2 = a[1]; btemp1 = B[(m - i - 1) * 2 + 0]; btemp2 = B[(m - i - 1) * 2 + 1]; #if (TRANSA == 1) || (TRANSA == 2) B[(m - i - 1) * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2; B[(m - i - 1) * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1; #else B[(m - i - 1) * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2; B[(m - i - 1) * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1; #endif #endif #if (TRANSA == 2) || (TRANSA == 4) if (i < m - 1) { #if TRANSA == 2 temp = DOTU_K(m - i - 1, a - (m - i - 1) * 2, 1, B, 1); #else temp = DOTC_K(m - i - 1, a - (m - i - 1) * 2, 1, B, 1); #endif B[(m - i - 1) * 2 + 0] += CREAL(temp); B[(m - i - 1) * 2 + 1] += CIMAG(temp); } #endif #if (TRANSA == 1) || (TRANSA == 3) a -= (i + 2) * 2; #else a -= (m - i) * 2; #endif } if (incb != 1) { COPY_K(m, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/ztpmv_U.c000066400000000000000000000103261313527062700174420ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; #if (TRANSA == 2) || (TRANSA == 4) OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT atemp1, atemp2, btemp1, btemp2; #endif FLOAT *B = b; if (incb != 1) { B = buffer; COPY_K(m, b, incb, buffer, 1); } for (i = 0; i < m; i++) { #if (TRANSA == 1) || (TRANSA == 3) #if TRANSA == 1 if (i > 0) AXPYU_K (i, 0, 0, B[i * 2 + 0], B[i * 2 + 1], a, 1, B, 1, NULL, 0); #else if (i > 0) AXPYC_K(i, 0, 0, B[i * 2 + 0], B[i * 2 + 1], a, 1, B, 1, NULL, 0); #endif #endif #ifndef UNIT #if (TRANSA == 1) || (TRANSA == 3) atemp1 = a[i * 2 + 0]; atemp2 = a[i * 2 + 1]; #else atemp1 = a[0]; atemp2 = a[1]; #endif btemp1 = B[i * 2 + 0]; btemp2 = B[i * 2 + 1]; #if (TRANSA == 1) || (TRANSA == 2) B[i * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2; B[i * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1; #else B[i * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2; B[i * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1; #endif #endif #if (TRANSA == 2) || (TRANSA == 4) if (i < m - 1) { #if TRANSA == 2 temp = DOTU_K(m - i - 1, a + 2, 1, B + (i + 1) * 2, 1); #else temp = DOTC_K(m - i - 1, a + 2, 1, B + (i + 1) * 2, 1); #endif B[i * 2 + 0] += CREAL(temp); B[i * 2 + 1] += CIMAG(temp); } #endif #if (TRANSA == 1) || (TRANSA == 3) a += (i + 1) * 2; #else a += (m - i) * 2; #endif } if (incb != 1) { COPY_K(m, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/ztpsv_L.c000066400000000000000000000107151313527062700174410ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dm1 = -1.; int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; #if (TRANSA == 2) || (TRANSA == 4) OPENBLAS_COMPLEX_FLOAT result; #endif #ifndef UNIT FLOAT ar, ai, br, bi, ratio, den; #endif FLOAT *B = b; if (incb != 1) { B = buffer; COPY_K(m, b, incb, buffer, 1); } for (i = 0; i < m; i++) { #if (TRANSA == 2) || (TRANSA == 4) if (i > 0) { #if TRANSA == 2 result = DOTU_K(i, a, 1, B, 1); #else result = DOTC_K(i, a, 1, B, 1); #endif B[i * COMPSIZE + 0] -= CREAL(result); B[i * COMPSIZE + 1] -= CIMAG(result); } #endif #ifndef UNIT #if (TRANSA == 1) || (TRANSA == 3) ar = a[0]; ai = a[1]; #else ar = a[i * COMPSIZE + 0]; ai = a[i * COMPSIZE + 1]; #endif if (fabs(ar) >= fabs(ai)){ ratio = ai / ar; den = 1./(ar * ( 1 + ratio * ratio)); ar = den; #if TRANSA < 3 ai = -ratio * den; #else ai = ratio * den; #endif } else { ratio = ar / ai; den = 1./(ai * ( 1 + ratio * ratio)); ar = ratio * den; #if TRANSA < 3 ai = -den; #else ai = den; #endif } br = B[i * COMPSIZE + 0]; bi = B[i * COMPSIZE + 1]; B[i * COMPSIZE + 0] = ar*br - ai*bi; B[i * COMPSIZE + 1] = ar*bi + ai*br; #endif #if (TRANSA == 1) || (TRANSA == 3) if (i < m - 1) { #if TRANSA == 1 AXPYU_K(m - i - 1 , 0, 0, - B[i * COMPSIZE + 0], - B[i * COMPSIZE + 1], a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1, NULL, 0); #else AXPYC_K(m - i - 1 , 0, 0, - B[i * COMPSIZE + 0], - B[i * COMPSIZE + 1], a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1, NULL, 0); #endif } #endif #if (TRANSA == 1) || (TRANSA == 3) a += (m - i) * 2; #else a += (i + 1) * 2; #endif } if (incb != 1) { COPY_K(m, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/ztpsv_U.c000066400000000000000000000107071313527062700174530ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; #if (TRANSA == 2) || (TRANSA == 4) OPENBLAS_COMPLEX_FLOAT result; #endif #ifndef UNIT FLOAT ar, ai, br, bi, ratio, den; #endif FLOAT *B = b; if (incb != 1) { B = buffer; COPY_K(m, b, incb, buffer, 1); } a += (m + 1) * m - 2; for (i = 0; i < m; i++) { #if (TRANSA == 2) || (TRANSA == 4) if (i > 0) { #if TRANSA == 2 result = DOTU_K(i, a + 2, 1, B + (m - i) * 2, 1); #else result = DOTC_K(i, a + 2, 1, B + (m - i) * 2, 1); #endif B[(m - i - 1) * 2 + 0] -= CREAL(result); B[(m - i - 1) * 2 + 1] -= CIMAG(result); } #endif #ifndef UNIT ar = a[0]; ai = a[1]; if (fabs(ar) >= fabs(ai)){ ratio = ai / ar; den = 1./(ar * ( 1 + ratio * ratio)); ar = den; #if (TRANSA == 1) || (TRANSA == 2) ai = -ratio * den; #else ai = ratio * den; #endif } else { ratio = ar / ai; den = 1./(ai * ( 1 + ratio * ratio)); ar = ratio * den; #if (TRANSA == 1) || (TRANSA == 2) ai = -den; #else ai = den; #endif } br = B[(m - i - 1) * 2 + 0]; bi = B[(m - i - 1) * 2 + 1]; B[(m - i - 1) * 2 + 0] = ar*br - ai*bi; B[(m - i - 1) * 2 + 1] = ar*bi + ai*br; #endif #if (TRANSA == 1) || (TRANSA == 3) if (i < m - 1) { #if TRANSA == 1 AXPYU_K (m - i - 1, 0, 0, - B[(m - i - 1) * 2 + 0], -B[(m - i - 1) * 2 + 1], a - (m - i - 1) * COMPSIZE, 1, B, 1, NULL, 0); #else AXPYC_K (m - i - 1, 0, 0, - B[(m - i - 1) * 2 + 0], -B[(m - i - 1) * 2 + 1], a - (m - i - 1) * COMPSIZE, 1, B, 1, NULL, 0); #endif } #endif #if (TRANSA == 1) || (TRANSA == 3) a -= (m - i) * 2; #else a -= (i + 2) * 2; #endif } if (incb != 1) { COPY_K(m, buffer, 1, b, incb); } return 0; } 
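/*
 * Illustrative sketch (not part of the OpenBLAS sources): the complex
 * triangular solve kernels in this directory (ztpsv_*, ztbsv_*, ztrsv_*)
 * all divide the right-hand-side element b by the diagonal element a
 * using the same scaled-reciprocal pattern seen above: the larger of
 * |Re(a)| and |Im(a)| is used as the scaling pivot so that
 * Re(a)^2 + Im(a)^2 is never formed directly, avoiding premature
 * overflow/underflow.  The helper below reproduces that pattern in
 * isolation; the function name is hypothetical and exists only for this
 * example.  The conjugating variants (TRANSA == 3 or 4) merely flip the
 * sign of the imaginary part of the reciprocal, i.e. they divide by
 * conj(a) instead of a.
 */
#include <math.h>

static void zdiv_scaled_sketch(double ar, double ai,    /* divisor a      */
                               double br, double bi,    /* dividend b     */
                               double *cr, double *ci){ /* result  b / a  */
  double ratio, den, rr, ri;

  if (fabs(ar) >= fabs(ai)) {
    ratio = ai / ar;
    den   = 1. / (ar * (1. + ratio * ratio));
    rr    =  den;          /* Re(1/a) */
    ri    = -ratio * den;  /* Im(1/a) */
  } else {
    ratio = ar / ai;
    den   = 1. / (ai * (1. + ratio * ratio));
    rr    =  ratio * den;
    ri    = -den;
  }

  /* b * (1/a), ordinary complex multiplication */
  *cr = rr * br - ri * bi;
  *ci = rr * bi + ri * br;
}

/*
 * Quick sanity check of the sketch: for a = 3 + 4i and b = 1, the second
 * branch is taken (|3| < |4|) and the result is 0.12 - 0.16i, which is
 * exactly (3 - 4i) / 25 = 1 / (3 + 4i).
 */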
OpenBLAS-0.2.20/driver/level2/ztrmv_L.c000066400000000000000000000122301313527062700174270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" static FLOAT dp1 = 1.; int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *buffer){ BLASLONG i, is, min_i; #if (TRANSA == 2) || (TRANSA == 4) OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT atemp1, atemp2, btemp1, btemp2; #endif FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; if (incb != 1) { B = buffer; gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15); COPY_K(m, b, incb, buffer, 1); } for (is = m; is > 0; is -= DTB_ENTRIES){ min_i = MIN(is, DTB_ENTRIES); #if (TRANSA == 1) || (TRANSA == 3) if (m - is > 0){ #if TRANSA == 1 GEMV_N(m - is, min_i, 0, dp1, ZERO, a + (is + (is - min_i) * lda) * 2, lda, B + (is - min_i) * 2, 1, B + is * 2, 1, gemvbuffer); #else GEMV_R(m - is, min_i, 0, dp1, ZERO, a + (is + (is - min_i) * lda) * 2, lda, B + (is - min_i) * 2, 1, B + is * 2, 1, gemvbuffer); #endif } #endif for (i = 0; i < min_i; i++) { FLOAT *AA = a + ((is - i - 1) + (is - i - 1) * lda) * 2; FLOAT *BB = B + (is - i - 1) * 2; #if (TRANSA == 1) || (TRANSA == 3) #if TRANSA == 1 if (i > 0) AXPYU_K (i, 0, 0, BB[0], BB[1], AA + 2, 1, BB + 2, 1, NULL, 0); #else if (i > 0) AXPYC_K(i, 0, 0, BB[0], BB[1], AA + 2, 1, BB + 2, 1, NULL, 0); #endif #endif #ifndef UNIT atemp1 = AA[0]; atemp2 = AA[1]; btemp1 = BB[0]; btemp2 = BB[1]; #if (TRANSA == 1) || (TRANSA == 2) BB[0] = atemp1 * btemp1 - atemp2 * btemp2; BB[1] = atemp1 * btemp2 + atemp2 * btemp1; #else BB[0] = atemp1 * btemp1 + atemp2 * btemp2; BB[1] = atemp1 * btemp2 - atemp2 * btemp1; #endif #endif #if (TRANSA == 2) || (TRANSA == 4) if (i < min_i - 1) { #if TRANSA == 2 temp = DOTU_K(min_i - i - 1, AA - (min_i - i - 1) * 2, 1, BB - (min_i - i - 1) * 2, 1); #else temp = DOTC_K(min_i - i - 1, AA - (min_i - i - 1) * 2, 1, BB - (min_i - i - 1) * 2, 1); #endif BB[0] += CREAL(temp); BB[1] += CIMAG(temp); } #endif } #if (TRANSA == 2) || (TRANSA == 4) if (is - min_i > 0){ #if TRANSA == 2 GEMV_T(is - min_i, min_i, 0, dp1, ZERO, a + (is - min_i) * lda * 2, lda, B, 1, B + (is - min_i) * 2, 1, gemvbuffer); #else GEMV_C(is - min_i, min_i, 0, dp1, ZERO, a + (is - min_i) * lda * 2, lda, B, 1, B + (is - min_i) * 2, 1, gemvbuffer); #endif } #endif } if (incb != 1) { COPY_K(m, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/ztrmv_U.c000066400000000000000000000123271313527062700174470ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" static FLOAT dp1 = 1.; int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *buffer){ BLASLONG i, is, min_i; #if (TRANSA == 2) || (TRANSA == 4) OPENBLAS_COMPLEX_FLOAT temp; #endif #ifndef UNIT FLOAT atemp1, atemp2, btemp1, btemp2; #endif FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; if (incb != 1) { B = buffer; gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15); COPY_K(m, b, incb, buffer, 1); } for (is =0; is < m; is += DTB_ENTRIES){ min_i = MIN(m - is, DTB_ENTRIES); #if (TRANSA) == 1 || (TRANSA == 3) if (is > 0){ #if TRANSA == 1 GEMV_N(is, min_i, 0, dp1, ZERO, a + is * lda * 2, lda, B + is * 2, 1, B, 1, gemvbuffer); #else GEMV_R(is, min_i, 0, dp1, ZERO, a + is * lda * 2, lda, B + is * 2, 1, B, 1, gemvbuffer); #endif } #endif for (i = 0; i < min_i; i++) { FLOAT *AA = a + (is + (i + is) * lda) * 2; FLOAT *BB = B + is * 2; #if (TRANSA == 1) || (TRANSA == 3) #if TRANSA == 1 if (i > 0) AXPYU_K (i, 0, 0, BB[i * 2 + 0], BB[i * 2 + 1], AA, 1, BB, 1, NULL, 0); #else if (i > 0) AXPYC_K(i, 0, 0, BB[i * 2 + 0], BB[i * 2 + 1], AA, 1, BB, 1, NULL, 0); #endif #endif #ifndef UNIT atemp1 = AA[i * 2 + 0]; atemp2 = AA[i * 2 + 1]; btemp1 = BB[i * 2 + 0]; btemp2 = BB[i * 2 + 1]; #if (TRANSA == 1) || (TRANSA == 2) BB[i * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2; BB[i * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1; #else BB[i * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2; BB[i * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1; #endif #endif #if (TRANSA == 2) || (TRANSA == 4) if (i < min_i - 1) { #if TRANSA == 2 temp = DOTU_K(min_i - i - 1, AA + (i + 1) * 2, 1, BB + (i + 1) * 2, 1); #else temp = DOTC_K(min_i - i - 1, AA + (i + 1) * 2, 1, BB + (i + 1) * 2, 1); #endif BB[i * 2 + 0] += CREAL(temp); BB[i * 2 + 1] += CIMAG(temp); } #endif } #if (TRANSA) == 2 || (TRANSA == 4) if (m - is > min_i){ #if TRANSA == 2 GEMV_T(m - is - min_i, min_i, 0, dp1, ZERO, a + (is + min_i + is * lda) * 2, lda, B + (is + min_i) * 2, 1, B + is * 2, 1, gemvbuffer); #else GEMV_C(m - is - min_i, min_i, 0, dp1, ZERO, a + (is + min_i + is * lda) * 2, lda, B + (is + min_i) * 2, 1, B + is * 2, 1, gemvbuffer); #endif } #endif } if (incb != 1) { COPY_K(m, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/ztrsv_L.c000066400000000000000000000130261313527062700174410ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dm1 = -1.; int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i, is, min_i; #if (TRANSA == 2) || (TRANSA == 4) OPENBLAS_COMPLEX_FLOAT result; #endif #ifndef UNIT FLOAT ar, ai, br, bi, ratio, den; #endif FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; if (incb != 1) { B = buffer; gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); COPY_K(m, b, incb, buffer, 1); } for (is =0; is < m; is += DTB_ENTRIES){ min_i = MIN(m - is, DTB_ENTRIES); #if (TRANSA == 2) || (TRANSA == 4) if (is > 0){ #if TRANSA == 2 GEMV_T(is, min_i, 0, dm1, ZERO, a + is * lda * COMPSIZE, lda, B, 1, B + is * COMPSIZE, 1, gemvbuffer); #else GEMV_C(is, min_i, 0, dm1, ZERO, a + is * lda * COMPSIZE, lda, B, 1, B + is * COMPSIZE, 1, gemvbuffer); #endif } #endif for (i = 0; i < min_i; i++) { FLOAT *AA = a + (is + (i + is) * lda) * COMPSIZE; FLOAT *BB = B + is * COMPSIZE; #if (TRANSA == 2) || (TRANSA == 4) if (i > 0) { #if TRANSA == 2 result = DOTU_K(i, AA, 1, BB, 1); #else result = DOTC_K(i, AA, 1, BB, 1); #endif BB[i * COMPSIZE + 0] -= CREAL(result); BB[i * COMPSIZE + 1] -= CIMAG(result); } #endif #ifndef UNIT ar = AA[i * COMPSIZE + 0]; ai = AA[i * COMPSIZE + 1]; if (fabs(ar) >= fabs(ai)){ ratio = ai / ar; den = 1./(ar * ( 1 + ratio * ratio)); ar = den; #if TRANSA < 3 ai = -ratio * den; #else ai = ratio * den; #endif } else { ratio = ar / ai; den = 1./(ai * ( 1 + ratio * ratio)); ar = ratio * den; #if TRANSA < 3 ai = -den; #else ai = den; #endif } br = BB[i * COMPSIZE + 0]; bi = BB[i * COMPSIZE + 1]; BB[i * COMPSIZE + 0] = ar*br - ai*bi; BB[i * COMPSIZE + 1] = ar*bi + ai*br; #endif #if (TRANSA == 1) || (TRANSA == 3) if (i < min_i - 1) { #if TRANSA == 1 AXPYU_K(min_i - i - 1 , 0, 0, - BB[i 
* COMPSIZE + 0], - BB[i * COMPSIZE + 1], AA + (i + 1) * COMPSIZE, 1, BB + (i + 1) * COMPSIZE, 1, NULL, 0); #else AXPYC_K(min_i - i - 1 , 0, 0, - BB[i * COMPSIZE + 0], - BB[i * COMPSIZE + 1], AA + (i + 1) * COMPSIZE, 1, BB + (i + 1) * COMPSIZE, 1, NULL, 0); #endif } #endif } #if (TRANSA == 1) || (TRANSA == 3) if (m - is > min_i){ #if TRANSA == 1 GEMV_N(m - is - min_i, min_i, 0, dm1, ZERO, a + (is + min_i + is * lda) * COMPSIZE, lda, B + is * COMPSIZE, 1, B + (is + min_i) * COMPSIZE, 1, gemvbuffer); #else GEMV_R(m - is - min_i, min_i, 0, dm1, ZERO, a + (is + min_i + is * lda) * COMPSIZE, lda, B + is * COMPSIZE, 1, B + (is + min_i) * COMPSIZE, 1, gemvbuffer); #endif } #endif } if (incb != 1) { COPY_K(m, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level2/ztrsv_U.c000066400000000000000000000127451313527062700174610ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dm1 = -1.; int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i, is, min_i; #if (TRANSA == 2) || (TRANSA == 4) OPENBLAS_COMPLEX_FLOAT result; #endif #ifndef UNIT FLOAT ar, ai, br, bi, ratio, den; #endif FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; if (incb != 1) { B = buffer; gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); COPY_K(m, b, incb, buffer, 1); } for (is = m; is > 0; is -= DTB_ENTRIES){ min_i = MIN(is, DTB_ENTRIES); #if (TRANSA == 2) || (TRANSA == 4) if (m - is > 0){ #if TRANSA == 2 GEMV_T(m - is, min_i, 0, dm1, ZERO, a + (is + (is - min_i) * lda) * COMPSIZE, lda, B + is * COMPSIZE, 1, B + (is - min_i) * COMPSIZE, 1, gemvbuffer); #else GEMV_C(m - is, min_i, 0, dm1, ZERO, a + (is + (is - min_i) * lda) * COMPSIZE, lda, B + is * COMPSIZE, 1, B + (is - min_i) * COMPSIZE, 1, gemvbuffer); #endif } #endif for (i = 0; i < min_i; i++) { FLOAT *AA = a + ((is - i - 1) + (is - i - 1) * lda) * COMPSIZE; FLOAT *BB = B + (is - i - 1) * COMPSIZE; #if (TRANSA == 2) || (TRANSA == 4) if (i > 0) { #if TRANSA == 2 result = DOTU_K(i, AA + 2, 1, BB + 2, 1); #else result = DOTC_K(i, AA + 2, 1, BB + 2, 1); #endif BB[0] -= CREAL(result); BB[1] -= CIMAG(result); } #endif #ifndef UNIT ar = AA[0]; ai = AA[1]; if (fabs(ar) >= fabs(ai)){ ratio = ai / ar; den = 1./(ar * ( 1 + ratio * ratio)); ar = den; #if TRANSA < 3 ai = -ratio * den; #else ai = ratio * den; #endif } else { ratio = ar / ai; den = 1./(ai * ( 1 + ratio * ratio)); ar = ratio * den; #if TRANSA < 3 ai = -den; #else ai = den; #endif } br = BB[0]; bi = BB[1]; BB[0] = ar*br - ai*bi; BB[1] = ar*bi + ai*br; #endif #if (TRANSA == 1) || (TRANSA == 3) if (i < min_i - 1) { #if TRANSA == 1 AXPYU_K (min_i - i - 1, 0, 0, - BB[0], -BB[1], AA - (min_i - i - 1) * COMPSIZE, 1, BB - (min_i - i - 1) * COMPSIZE, 1, NULL, 0); #else AXPYC_K(min_i - i - 1, 0, 0, - BB[0], -BB[1], AA - (min_i - i - 1) * COMPSIZE, 1, BB - (min_i - i - 1) * COMPSIZE, 1, NULL, 0); #endif } #endif } #if (TRANSA == 1) || (TRANSA == 3) if (is - min_i > 0){ #if TRANSA == 1 GEMV_N(is - min_i, min_i, 0, dm1, ZERO, a + (is - min_i) * lda * COMPSIZE, lda, B + (is - min_i) * COMPSIZE, 1, B, 1, gemvbuffer); #else GEMV_R(is - min_i, min_i, 0, dm1, ZERO, a + (is - min_i) * lda * COMPSIZE, lda, B + (is - min_i) * COMPSIZE, 1, B, 1, gemvbuffer); #endif } #endif } if (incb != 1) { COPY_K(m, buffer, 1, b, incb); } return 0; } OpenBLAS-0.2.20/driver/level3/000077500000000000000000000000001313527062700156315ustar00rootroot00000000000000OpenBLAS-0.2.20/driver/level3/CMakeLists.txt000066400000000000000000000147631313527062700204040ustar00rootroot00000000000000include_directories(${PROJECT_SOURCE_DIR}) # N.B. In the original makefile there was a BLOCKS define used in the compilation of these files but I don't see any evidence of it being set anywhere. 
-hpa # loop through gemm.c defines set(GEMM_DEFINES NN NT TN TT) set(GEMM_COMPLEX_DEFINES RN CN RT CT NR TR RR CR NC TC RC CC) foreach (GEMM_DEFINE ${GEMM_DEFINES}) string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC) GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0) if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3) GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0) endif () endforeach () set(TRMM_TRSM_SOURCES trmm_L.c trmm_R.c trsm_L.c trsm_R.c) foreach(trmm_trsm_source ${TRMM_TRSM_SOURCES}) string(REGEX MATCH "[a-z]+_[A-Z]+" op_name ${trmm_trsm_source}) GenerateCombinationObjects("${trmm_trsm_source}" "UPPER;UNIT" "L;N" "" 0 "${op_name}N") GenerateCombinationObjects("${trmm_trsm_source}" "UPPER;UNIT" "L;N" "TRANSA" 0 "${op_name}T") endforeach() GenerateCombinationObjects("symm_k.c" "RSIDE;LOWER" "L;U" "NN" 1) GenerateCombinationObjects("syrk_k.c" "LOWER;TRANS" "U;N" "" 1) GenerateCombinationObjects("syr2k_k.c" "LOWER;TRANS" "U;N" "" 1) GenerateCombinationObjects("syrk_kernel.c" "LOWER" "U" "" 2) GenerateCombinationObjects("syr2k_kernel.c" "LOWER" "U" "" 2) if (SMP) # N.B. these do NOT have a float type (e.g. DOUBLE) defined! GenerateNamedObjects("gemm_thread_m.c;gemm_thread_n.c;gemm_thread_mn.c;gemm_thread_variable.c;syrk_thread.c" "" "" 0 "" "" 1) if (NOT USE_SIMPLE_THREADED_LEVEL3) GenerateCombinationObjects("syrk_k.c" "LOWER;TRANS" "U;N" "THREADED_LEVEL3" 2 "syrk_thread") GenerateCombinationObjects("symm_k.c" "RSIDE;LOWER" "L;U" "THREADED_LEVEL3;NN" 2 "symm_thread") endif () endif () foreach (float_type ${FLOAT_TYPES}) if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateCombinationObjects("zherk_kernel.c" "LOWER;CONJ" "U;N" "HERK" 2 "herk_kernel" false ${float_type}) # TRANS needs to be set/unset when CONJ is set/unset, so can't use it as a combination GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK" 3 "herk_N" false ${float_type}) GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;TRANS;CONJ" 3 "herk_C" false ${float_type}) # Need to set CONJ for trmm and trsm GenerateCombinationObjects("trmm_L.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trmm_LR" false ${float_type}) GenerateCombinationObjects("trmm_L.c" "UPPER;UNIT" "L;N" "TRANSA;CONJ" 0 "trmm_LC" false ${float_type}) GenerateCombinationObjects("trmm_R.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trmm_RR" false ${float_type}) GenerateCombinationObjects("trmm_R.c" "UPPER;UNIT" "L;N" "TRANSA;CONJ" 0 "trmm_RC" false ${float_type}) GenerateCombinationObjects("trsm_L.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trsm_LR" false ${float_type}) GenerateCombinationObjects("trsm_L.c" "UPPER;UNIT" "L;N" "TRANSA;CONJ" 0 "trsm_LC" false ${float_type}) GenerateCombinationObjects("trsm_R.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trsm_RR" false ${float_type}) GenerateCombinationObjects("trsm_R.c" "UPPER;UNIT" "L;N" "TRANSA;CONJ" 0 "trsm_RC" false ${float_type}) #hemm GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NN" 0 "hemm_L" false ${float_type}) GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NC;RSIDE" 0 "hemm_R" false ${float_type}) #her2k GenerateCombinationObjects("zher2k_kernel.c" "LOWER;CONJ" "U;N" "" 2 "her2k_kernel" false ${float_type}) GenerateNamedObjects("zher2k_k.c" "HER2K" "her2k_UN" false "" "" false ${float_type}) GenerateNamedObjects("zher2k_k.c" "HER2K;TRANS;CONJ" "her2k_UC" false "" "" false ${float_type}) GenerateNamedObjects("zher2k_k.c" "HER2K;LOWER" "her2k_LN" false "" "" false ${float_type}) GenerateNamedObjects("zher2k_k.c" "HER2K;LOWER;TRANS;CONJ" "her2k_LC" 
false "" "" false ${float_type}) if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3) #herk GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3" 3 "herk_thread_N" false ${float_type}) GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3;TRANS;CONJ" 3 "herk_thread_C" false ${float_type}) #hemm GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NN;THREADED_LEVEL3" 0 "hemm_thread_L" false ${float_type}) GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NC;RSIDE;THREADED_LEVEL3" 0 "hemm_thread_R" false ${float_type}) #her2k GenerateNamedObjects("zher2k_k.c" "HER2K" "her2k_UN" false "" "" false ${float_type}) GenerateNamedObjects("zher2k_k.c" "HER2K;TRANS;CONJ" "her2k_UC" false "" "" false ${float_type}) GenerateNamedObjects("zher2k_k.c" "HER2K;LOWER" "her2k_LN" false "" "" false ${float_type}) GenerateNamedObjects("zher2k_k.c" "HER2K;LOWER;TRANS;CONJ" "her2k_LC" false "" "" false ${float_type}) endif() # special gemm defines for complex foreach (gemm_define ${GEMM_COMPLEX_DEFINES}) string(TOLOWER ${gemm_define} gemm_define_LC) GenerateNamedObjects("gemm.c" "${gemm_define}" "gemm_${gemm_define_LC}" false "" "" false ${float_type}) if(USE_GEMM3M) GenerateNamedObjects("gemm3m.c" "${gemm_define}" "gemm3m_${gemm_define_LC}" false "" "" false ${float_type}) endif() if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3) GenerateNamedObjects("gemm.c" "${gemm_define};THREADED_LEVEL3" "gemm_thread_${gemm_define_LC}" false "" "" false ${float_type}) if(USE_GEMM3M) GenerateNamedObjects("gemm3m.c" "${gemm_define};THREADED_LEVEL3" "gemm3m_thread_${gemm_define_LC}" false "" "" false ${float_type}) endif() endif () endforeach () # for gemm3m if(USE_GEMM3M) foreach (GEMM_DEFINE ${GEMM_DEFINES}) string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC) GenerateNamedObjects("gemm3m.c" "${GEMM_DEFINE}" "gemm3m_${GEMM_DEFINE_LC}" false "" "" false ${float_type}) if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3) GenerateNamedObjects("gemm3m.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm3m_thread_${GEMM_DEFINE_LC}" false "" "" false ${float_type}) endif () endforeach () endif() endif () endforeach () #HPLOBJS = # dgemm_nn.c dgemm_nt.c dgemm_tn.c dgemm_tt.c # dtrsm_LNUU.c dtrsm_LNUN.c dtrsm_LNLU.c dtrsm_LNLN.c # dtrsm_LTUU.c dtrsm_LTUN.c dtrsm_LTLU.c dtrsm_LTLN.c # dtrsm_RNUU.c dtrsm_RNUN.c dtrsm_RNLU.c dtrsm_RNLN.c # dtrsm_RTUU.c dtrsm_RTUN.c dtrsm_RTLU.c dtrsm_RTLN.c # #if (USE_SIMPLE_THREADED_LEVEL3) # HPLOBJS += dgemm_thread_nn.c dgemm_thread_nt.c # dgemm_thread_tn.c dgemm_thread_tt.c #endif # add_library(driver_level3 OBJECT ${OPENBLAS_SRC}) OpenBLAS-0.2.20/driver/level3/Makefile000066400000000000000000006570721313527062700173120ustar00rootroot00000000000000TOPDIR = ../.. 
include ../../Makefile.system USE_GEMM3M = 0 ifeq ($(ARCH), x86) USE_GEMM3M = 1 endif ifeq ($(ARCH), x86_64) USE_GEMM3M = 1 endif ifeq ($(ARCH), ia64) USE_GEMM3M = 1 endif ifeq ($(ARCH), MIPS) USE_GEMM3M = 1 endif SBLASOBJS += \ sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) \ strmm_LNUU.$(SUFFIX) strmm_LNUN.$(SUFFIX) strmm_LNLU.$(SUFFIX) strmm_LNLN.$(SUFFIX) \ strmm_LTUU.$(SUFFIX) strmm_LTUN.$(SUFFIX) strmm_LTLU.$(SUFFIX) strmm_LTLN.$(SUFFIX) \ strmm_RNUU.$(SUFFIX) strmm_RNUN.$(SUFFIX) strmm_RNLU.$(SUFFIX) strmm_RNLN.$(SUFFIX) \ strmm_RTUU.$(SUFFIX) strmm_RTUN.$(SUFFIX) strmm_RTLU.$(SUFFIX) strmm_RTLN.$(SUFFIX) \ strsm_LNUU.$(SUFFIX) strsm_LNUN.$(SUFFIX) strsm_LNLU.$(SUFFIX) strsm_LNLN.$(SUFFIX) \ strsm_LTUU.$(SUFFIX) strsm_LTUN.$(SUFFIX) strsm_LTLU.$(SUFFIX) strsm_LTLN.$(SUFFIX) \ strsm_RNUU.$(SUFFIX) strsm_RNUN.$(SUFFIX) strsm_RNLU.$(SUFFIX) strsm_RNLN.$(SUFFIX) \ strsm_RTUU.$(SUFFIX) strsm_RTUN.$(SUFFIX) strsm_RTLU.$(SUFFIX) strsm_RTLN.$(SUFFIX) \ ssymm_LU.$(SUFFIX) ssymm_LL.$(SUFFIX) ssymm_RU.$(SUFFIX) ssymm_RL.$(SUFFIX) \ ssyrk_UN.$(SUFFIX) ssyrk_UT.$(SUFFIX) ssyrk_LN.$(SUFFIX) ssyrk_LT.$(SUFFIX) \ ssyr2k_UN.$(SUFFIX) ssyr2k_UT.$(SUFFIX) ssyr2k_LN.$(SUFFIX) ssyr2k_LT.$(SUFFIX) \ ssyrk_kernel_U.$(SUFFIX) ssyrk_kernel_L.$(SUFFIX) \ ssyr2k_kernel_U.$(SUFFIX) ssyr2k_kernel_L.$(SUFFIX) DBLASOBJS += \ dgemm_nn.$(SUFFIX) dgemm_nt.$(SUFFIX) dgemm_tn.$(SUFFIX) dgemm_tt.$(SUFFIX) \ dtrmm_LNUU.$(SUFFIX) dtrmm_LNUN.$(SUFFIX) dtrmm_LNLU.$(SUFFIX) dtrmm_LNLN.$(SUFFIX) \ dtrmm_LTUU.$(SUFFIX) dtrmm_LTUN.$(SUFFIX) dtrmm_LTLU.$(SUFFIX) dtrmm_LTLN.$(SUFFIX) \ dtrmm_RNUU.$(SUFFIX) dtrmm_RNUN.$(SUFFIX) dtrmm_RNLU.$(SUFFIX) dtrmm_RNLN.$(SUFFIX) \ dtrmm_RTUU.$(SUFFIX) dtrmm_RTUN.$(SUFFIX) dtrmm_RTLU.$(SUFFIX) dtrmm_RTLN.$(SUFFIX) \ dtrsm_LNUU.$(SUFFIX) dtrsm_LNUN.$(SUFFIX) dtrsm_LNLU.$(SUFFIX) dtrsm_LNLN.$(SUFFIX) \ dtrsm_LTUU.$(SUFFIX) dtrsm_LTUN.$(SUFFIX) dtrsm_LTLU.$(SUFFIX) dtrsm_LTLN.$(SUFFIX) \ dtrsm_RNUU.$(SUFFIX) dtrsm_RNUN.$(SUFFIX) dtrsm_RNLU.$(SUFFIX) dtrsm_RNLN.$(SUFFIX) \ dtrsm_RTUU.$(SUFFIX) dtrsm_RTUN.$(SUFFIX) dtrsm_RTLU.$(SUFFIX) dtrsm_RTLN.$(SUFFIX) \ dsymm_LU.$(SUFFIX) dsymm_LL.$(SUFFIX) dsymm_RU.$(SUFFIX) dsymm_RL.$(SUFFIX) \ dsyrk_UN.$(SUFFIX) dsyrk_UT.$(SUFFIX) dsyrk_LN.$(SUFFIX) dsyrk_LT.$(SUFFIX) \ dsyr2k_UN.$(SUFFIX) dsyr2k_UT.$(SUFFIX) dsyr2k_LN.$(SUFFIX) dsyr2k_LT.$(SUFFIX) \ dsyrk_kernel_U.$(SUFFIX) dsyrk_kernel_L.$(SUFFIX) \ dsyr2k_kernel_U.$(SUFFIX) dsyr2k_kernel_L.$(SUFFIX) QBLASOBJS += \ qgemm_nn.$(SUFFIX) qgemm_nt.$(SUFFIX) qgemm_tn.$(SUFFIX) qgemm_tt.$(SUFFIX) \ qtrmm_LNUU.$(SUFFIX) qtrmm_LNUN.$(SUFFIX) qtrmm_LNLU.$(SUFFIX) qtrmm_LNLN.$(SUFFIX) \ qtrmm_LTUU.$(SUFFIX) qtrmm_LTUN.$(SUFFIX) qtrmm_LTLU.$(SUFFIX) qtrmm_LTLN.$(SUFFIX) \ qtrmm_RNUU.$(SUFFIX) qtrmm_RNUN.$(SUFFIX) qtrmm_RNLU.$(SUFFIX) qtrmm_RNLN.$(SUFFIX) \ qtrmm_RTUU.$(SUFFIX) qtrmm_RTUN.$(SUFFIX) qtrmm_RTLU.$(SUFFIX) qtrmm_RTLN.$(SUFFIX) \ qtrsm_LNUU.$(SUFFIX) qtrsm_LNUN.$(SUFFIX) qtrsm_LNLU.$(SUFFIX) qtrsm_LNLN.$(SUFFIX) \ qtrsm_LTUU.$(SUFFIX) qtrsm_LTUN.$(SUFFIX) qtrsm_LTLU.$(SUFFIX) qtrsm_LTLN.$(SUFFIX) \ qtrsm_RNUU.$(SUFFIX) qtrsm_RNUN.$(SUFFIX) qtrsm_RNLU.$(SUFFIX) qtrsm_RNLN.$(SUFFIX) \ qtrsm_RTUU.$(SUFFIX) qtrsm_RTUN.$(SUFFIX) qtrsm_RTLU.$(SUFFIX) qtrsm_RTLN.$(SUFFIX) \ qsymm_LU.$(SUFFIX) qsymm_LL.$(SUFFIX) qsymm_RU.$(SUFFIX) qsymm_RL.$(SUFFIX) \ qsyrk_UN.$(SUFFIX) qsyrk_UT.$(SUFFIX) qsyrk_LN.$(SUFFIX) qsyrk_LT.$(SUFFIX) \ qsyr2k_UN.$(SUFFIX) qsyr2k_UT.$(SUFFIX) qsyr2k_LN.$(SUFFIX) qsyr2k_LT.$(SUFFIX) \ qsyrk_kernel_U.$(SUFFIX) qsyrk_kernel_L.$(SUFFIX) \ 
qsyr2k_kernel_U.$(SUFFIX) qsyr2k_kernel_L.$(SUFFIX) CBLASOBJS += \ cgemm_nn.$(SUFFIX) cgemm_cn.$(SUFFIX) cgemm_tn.$(SUFFIX) cgemm_nc.$(SUFFIX) \ cgemm_nt.$(SUFFIX) cgemm_cc.$(SUFFIX) cgemm_ct.$(SUFFIX) cgemm_tc.$(SUFFIX) \ cgemm_tt.$(SUFFIX) cgemm_nr.$(SUFFIX) cgemm_tr.$(SUFFIX) cgemm_cr.$(SUFFIX) \ cgemm_rn.$(SUFFIX) cgemm_rt.$(SUFFIX) cgemm_rc.$(SUFFIX) cgemm_rr.$(SUFFIX) \ ctrmm_LNUU.$(SUFFIX) ctrmm_LNUN.$(SUFFIX) ctrmm_LNLU.$(SUFFIX) ctrmm_LNLN.$(SUFFIX) \ ctrmm_LTUU.$(SUFFIX) ctrmm_LTUN.$(SUFFIX) ctrmm_LTLU.$(SUFFIX) ctrmm_LTLN.$(SUFFIX) \ ctrmm_LRUU.$(SUFFIX) ctrmm_LRUN.$(SUFFIX) ctrmm_LRLU.$(SUFFIX) ctrmm_LRLN.$(SUFFIX) \ ctrmm_LCUU.$(SUFFIX) ctrmm_LCUN.$(SUFFIX) ctrmm_LCLU.$(SUFFIX) ctrmm_LCLN.$(SUFFIX) \ ctrmm_RNUU.$(SUFFIX) ctrmm_RNUN.$(SUFFIX) ctrmm_RNLU.$(SUFFIX) ctrmm_RNLN.$(SUFFIX) \ ctrmm_RTUU.$(SUFFIX) ctrmm_RTUN.$(SUFFIX) ctrmm_RTLU.$(SUFFIX) ctrmm_RTLN.$(SUFFIX) \ ctrmm_RRUU.$(SUFFIX) ctrmm_RRUN.$(SUFFIX) ctrmm_RRLU.$(SUFFIX) ctrmm_RRLN.$(SUFFIX) \ ctrmm_RCUU.$(SUFFIX) ctrmm_RCUN.$(SUFFIX) ctrmm_RCLU.$(SUFFIX) ctrmm_RCLN.$(SUFFIX) \ ctrsm_LNUU.$(SUFFIX) ctrsm_LNUN.$(SUFFIX) ctrsm_LNLU.$(SUFFIX) ctrsm_LNLN.$(SUFFIX) \ ctrsm_LTUU.$(SUFFIX) ctrsm_LTUN.$(SUFFIX) ctrsm_LTLU.$(SUFFIX) ctrsm_LTLN.$(SUFFIX) \ ctrsm_LRUU.$(SUFFIX) ctrsm_LRUN.$(SUFFIX) ctrsm_LRLU.$(SUFFIX) ctrsm_LRLN.$(SUFFIX) \ ctrsm_LCUU.$(SUFFIX) ctrsm_LCUN.$(SUFFIX) ctrsm_LCLU.$(SUFFIX) ctrsm_LCLN.$(SUFFIX) \ ctrsm_RNUU.$(SUFFIX) ctrsm_RNUN.$(SUFFIX) ctrsm_RNLU.$(SUFFIX) ctrsm_RNLN.$(SUFFIX) \ ctrsm_RTUU.$(SUFFIX) ctrsm_RTUN.$(SUFFIX) ctrsm_RTLU.$(SUFFIX) ctrsm_RTLN.$(SUFFIX) \ ctrsm_RRUU.$(SUFFIX) ctrsm_RRUN.$(SUFFIX) ctrsm_RRLU.$(SUFFIX) ctrsm_RRLN.$(SUFFIX) \ ctrsm_RCUU.$(SUFFIX) ctrsm_RCUN.$(SUFFIX) ctrsm_RCLU.$(SUFFIX) ctrsm_RCLN.$(SUFFIX) \ csymm_LU.$(SUFFIX) csymm_LL.$(SUFFIX) csymm_RU.$(SUFFIX) csymm_RL.$(SUFFIX) \ chemm_LU.$(SUFFIX) chemm_LL.$(SUFFIX) chemm_RU.$(SUFFIX) chemm_RL.$(SUFFIX) \ csyrk_UN.$(SUFFIX) csyrk_UT.$(SUFFIX) csyrk_LN.$(SUFFIX) csyrk_LT.$(SUFFIX) \ cherk_UN.$(SUFFIX) cherk_UC.$(SUFFIX) cherk_LN.$(SUFFIX) cherk_LC.$(SUFFIX) \ csyr2k_UN.$(SUFFIX) csyr2k_UT.$(SUFFIX) csyr2k_LN.$(SUFFIX) csyr2k_LT.$(SUFFIX) \ cher2k_UN.$(SUFFIX) cher2k_UC.$(SUFFIX) cher2k_LN.$(SUFFIX) cher2k_LC.$(SUFFIX) \ csyrk_kernel_U.$(SUFFIX) csyrk_kernel_L.$(SUFFIX) \ cherk_kernel_UN.$(SUFFIX) cherk_kernel_UC.$(SUFFIX) \ cherk_kernel_LN.$(SUFFIX) cherk_kernel_LC.$(SUFFIX) \ csyr2k_kernel_U.$(SUFFIX) csyr2k_kernel_L.$(SUFFIX) \ cher2k_kernel_UN.$(SUFFIX) cher2k_kernel_UC.$(SUFFIX) \ cher2k_kernel_LN.$(SUFFIX) cher2k_kernel_LC.$(SUFFIX) ZBLASOBJS += \ zgemm_nn.$(SUFFIX) zgemm_cn.$(SUFFIX) zgemm_tn.$(SUFFIX) zgemm_nc.$(SUFFIX) \ zgemm_nt.$(SUFFIX) zgemm_cc.$(SUFFIX) zgemm_ct.$(SUFFIX) zgemm_tc.$(SUFFIX) \ zgemm_tt.$(SUFFIX) zgemm_nr.$(SUFFIX) zgemm_tr.$(SUFFIX) zgemm_cr.$(SUFFIX) \ zgemm_rn.$(SUFFIX) zgemm_rt.$(SUFFIX) zgemm_rc.$(SUFFIX) zgemm_rr.$(SUFFIX) \ ztrmm_LNUU.$(SUFFIX) ztrmm_LNUN.$(SUFFIX) ztrmm_LNLU.$(SUFFIX) ztrmm_LNLN.$(SUFFIX) \ ztrmm_LTUU.$(SUFFIX) ztrmm_LTUN.$(SUFFIX) ztrmm_LTLU.$(SUFFIX) ztrmm_LTLN.$(SUFFIX) \ ztrmm_LRUU.$(SUFFIX) ztrmm_LRUN.$(SUFFIX) ztrmm_LRLU.$(SUFFIX) ztrmm_LRLN.$(SUFFIX) \ ztrmm_LCUU.$(SUFFIX) ztrmm_LCUN.$(SUFFIX) ztrmm_LCLU.$(SUFFIX) ztrmm_LCLN.$(SUFFIX) \ ztrmm_RNUU.$(SUFFIX) ztrmm_RNUN.$(SUFFIX) ztrmm_RNLU.$(SUFFIX) ztrmm_RNLN.$(SUFFIX) \ ztrmm_RTUU.$(SUFFIX) ztrmm_RTUN.$(SUFFIX) ztrmm_RTLU.$(SUFFIX) ztrmm_RTLN.$(SUFFIX) \ ztrmm_RRUU.$(SUFFIX) ztrmm_RRUN.$(SUFFIX) ztrmm_RRLU.$(SUFFIX) ztrmm_RRLN.$(SUFFIX) \ ztrmm_RCUU.$(SUFFIX) ztrmm_RCUN.$(SUFFIX) ztrmm_RCLU.$(SUFFIX) 
ztrmm_RCLN.$(SUFFIX) \ ztrsm_LNUU.$(SUFFIX) ztrsm_LNUN.$(SUFFIX) ztrsm_LNLU.$(SUFFIX) ztrsm_LNLN.$(SUFFIX) \ ztrsm_LTUU.$(SUFFIX) ztrsm_LTUN.$(SUFFIX) ztrsm_LTLU.$(SUFFIX) ztrsm_LTLN.$(SUFFIX) \ ztrsm_LRUU.$(SUFFIX) ztrsm_LRUN.$(SUFFIX) ztrsm_LRLU.$(SUFFIX) ztrsm_LRLN.$(SUFFIX) \ ztrsm_LCUU.$(SUFFIX) ztrsm_LCUN.$(SUFFIX) ztrsm_LCLU.$(SUFFIX) ztrsm_LCLN.$(SUFFIX) \ ztrsm_RNUU.$(SUFFIX) ztrsm_RNUN.$(SUFFIX) ztrsm_RNLU.$(SUFFIX) ztrsm_RNLN.$(SUFFIX) \ ztrsm_RTUU.$(SUFFIX) ztrsm_RTUN.$(SUFFIX) ztrsm_RTLU.$(SUFFIX) ztrsm_RTLN.$(SUFFIX) \ ztrsm_RRUU.$(SUFFIX) ztrsm_RRUN.$(SUFFIX) ztrsm_RRLU.$(SUFFIX) ztrsm_RRLN.$(SUFFIX) \ ztrsm_RCUU.$(SUFFIX) ztrsm_RCUN.$(SUFFIX) ztrsm_RCLU.$(SUFFIX) ztrsm_RCLN.$(SUFFIX) \ zsymm_LU.$(SUFFIX) zsymm_LL.$(SUFFIX) zsymm_RU.$(SUFFIX) zsymm_RL.$(SUFFIX) \ zhemm_LU.$(SUFFIX) zhemm_LL.$(SUFFIX) zhemm_RU.$(SUFFIX) zhemm_RL.$(SUFFIX) \ zsyrk_UN.$(SUFFIX) zsyrk_UT.$(SUFFIX) zsyrk_LN.$(SUFFIX) zsyrk_LT.$(SUFFIX) \ zherk_UN.$(SUFFIX) zherk_UC.$(SUFFIX) zherk_LN.$(SUFFIX) zherk_LC.$(SUFFIX) \ zsyr2k_UN.$(SUFFIX) zsyr2k_UT.$(SUFFIX) zsyr2k_LN.$(SUFFIX) zsyr2k_LT.$(SUFFIX) \ zher2k_UN.$(SUFFIX) zher2k_UC.$(SUFFIX) zher2k_LN.$(SUFFIX) zher2k_LC.$(SUFFIX) \ zsyrk_kernel_U.$(SUFFIX) zsyrk_kernel_L.$(SUFFIX) \ zherk_kernel_UN.$(SUFFIX) zherk_kernel_UC.$(SUFFIX) \ zherk_kernel_LN.$(SUFFIX) zherk_kernel_LC.$(SUFFIX) \ zsyr2k_kernel_U.$(SUFFIX) zsyr2k_kernel_L.$(SUFFIX) \ zher2k_kernel_UN.$(SUFFIX) zher2k_kernel_UC.$(SUFFIX) \ zher2k_kernel_LN.$(SUFFIX) zher2k_kernel_LC.$(SUFFIX) XBLASOBJS += \ xgemm_nn.$(SUFFIX) xgemm_cn.$(SUFFIX) xgemm_tn.$(SUFFIX) xgemm_nc.$(SUFFIX) \ xgemm_nt.$(SUFFIX) xgemm_cc.$(SUFFIX) xgemm_ct.$(SUFFIX) xgemm_tc.$(SUFFIX) \ xgemm_tt.$(SUFFIX) xgemm_nr.$(SUFFIX) xgemm_tr.$(SUFFIX) xgemm_cr.$(SUFFIX) \ xgemm_rn.$(SUFFIX) xgemm_rt.$(SUFFIX) xgemm_rc.$(SUFFIX) xgemm_rr.$(SUFFIX) \ xtrmm_LNUU.$(SUFFIX) xtrmm_LNUN.$(SUFFIX) xtrmm_LNLU.$(SUFFIX) xtrmm_LNLN.$(SUFFIX) \ xtrmm_LTUU.$(SUFFIX) xtrmm_LTUN.$(SUFFIX) xtrmm_LTLU.$(SUFFIX) xtrmm_LTLN.$(SUFFIX) \ xtrmm_LRUU.$(SUFFIX) xtrmm_LRUN.$(SUFFIX) xtrmm_LRLU.$(SUFFIX) xtrmm_LRLN.$(SUFFIX) \ xtrmm_LCUU.$(SUFFIX) xtrmm_LCUN.$(SUFFIX) xtrmm_LCLU.$(SUFFIX) xtrmm_LCLN.$(SUFFIX) \ xtrmm_RNUU.$(SUFFIX) xtrmm_RNUN.$(SUFFIX) xtrmm_RNLU.$(SUFFIX) xtrmm_RNLN.$(SUFFIX) \ xtrmm_RTUU.$(SUFFIX) xtrmm_RTUN.$(SUFFIX) xtrmm_RTLU.$(SUFFIX) xtrmm_RTLN.$(SUFFIX) \ xtrmm_RRUU.$(SUFFIX) xtrmm_RRUN.$(SUFFIX) xtrmm_RRLU.$(SUFFIX) xtrmm_RRLN.$(SUFFIX) \ xtrmm_RCUU.$(SUFFIX) xtrmm_RCUN.$(SUFFIX) xtrmm_RCLU.$(SUFFIX) xtrmm_RCLN.$(SUFFIX) \ xtrsm_LNUU.$(SUFFIX) xtrsm_LNUN.$(SUFFIX) xtrsm_LNLU.$(SUFFIX) xtrsm_LNLN.$(SUFFIX) \ xtrsm_LTUU.$(SUFFIX) xtrsm_LTUN.$(SUFFIX) xtrsm_LTLU.$(SUFFIX) xtrsm_LTLN.$(SUFFIX) \ xtrsm_LRUU.$(SUFFIX) xtrsm_LRUN.$(SUFFIX) xtrsm_LRLU.$(SUFFIX) xtrsm_LRLN.$(SUFFIX) \ xtrsm_LCUU.$(SUFFIX) xtrsm_LCUN.$(SUFFIX) xtrsm_LCLU.$(SUFFIX) xtrsm_LCLN.$(SUFFIX) \ xtrsm_RNUU.$(SUFFIX) xtrsm_RNUN.$(SUFFIX) xtrsm_RNLU.$(SUFFIX) xtrsm_RNLN.$(SUFFIX) \ xtrsm_RTUU.$(SUFFIX) xtrsm_RTUN.$(SUFFIX) xtrsm_RTLU.$(SUFFIX) xtrsm_RTLN.$(SUFFIX) \ xtrsm_RRUU.$(SUFFIX) xtrsm_RRUN.$(SUFFIX) xtrsm_RRLU.$(SUFFIX) xtrsm_RRLN.$(SUFFIX) \ xtrsm_RCUU.$(SUFFIX) xtrsm_RCUN.$(SUFFIX) xtrsm_RCLU.$(SUFFIX) xtrsm_RCLN.$(SUFFIX) \ xsymm_LU.$(SUFFIX) xsymm_LL.$(SUFFIX) xsymm_RU.$(SUFFIX) xsymm_RL.$(SUFFIX) \ xhemm_LU.$(SUFFIX) xhemm_LL.$(SUFFIX) xhemm_RU.$(SUFFIX) xhemm_RL.$(SUFFIX) \ xsyrk_UN.$(SUFFIX) xsyrk_UT.$(SUFFIX) xsyrk_LN.$(SUFFIX) xsyrk_LT.$(SUFFIX) \ xherk_UN.$(SUFFIX) xherk_UC.$(SUFFIX) xherk_LN.$(SUFFIX) xherk_LC.$(SUFFIX) \ xsyr2k_UN.$(SUFFIX) 
xsyr2k_UT.$(SUFFIX) xsyr2k_LN.$(SUFFIX) xsyr2k_LT.$(SUFFIX) \ xher2k_UN.$(SUFFIX) xher2k_UC.$(SUFFIX) xher2k_LN.$(SUFFIX) xher2k_LC.$(SUFFIX) \ xsyrk_kernel_U.$(SUFFIX) xsyrk_kernel_L.$(SUFFIX) \ xherk_kernel_UN.$(SUFFIX) xherk_kernel_UC.$(SUFFIX) \ xherk_kernel_LN.$(SUFFIX) xherk_kernel_LC.$(SUFFIX) \ xsyr2k_kernel_U.$(SUFFIX) xsyr2k_kernel_L.$(SUFFIX) \ xher2k_kernel_UN.$(SUFFIX) xher2k_kernel_UC.$(SUFFIX) \ xher2k_kernel_LN.$(SUFFIX) xher2k_kernel_LC.$(SUFFIX) ifeq ($(USE_GEMM3M), 1) CBLASOBJS += \ cgemm3m_nn.$(SUFFIX) cgemm3m_cn.$(SUFFIX) cgemm3m_tn.$(SUFFIX) cgemm3m_nc.$(SUFFIX) \ cgemm3m_nt.$(SUFFIX) cgemm3m_cc.$(SUFFIX) cgemm3m_ct.$(SUFFIX) cgemm3m_tc.$(SUFFIX) \ cgemm3m_tt.$(SUFFIX) cgemm3m_nr.$(SUFFIX) cgemm3m_tr.$(SUFFIX) cgemm3m_cr.$(SUFFIX) \ cgemm3m_rn.$(SUFFIX) cgemm3m_rt.$(SUFFIX) cgemm3m_rc.$(SUFFIX) cgemm3m_rr.$(SUFFIX) \ csymm3m_LU.$(SUFFIX) csymm3m_LL.$(SUFFIX) csymm3m_RU.$(SUFFIX) csymm3m_RL.$(SUFFIX) \ chemm3m_LU.$(SUFFIX) chemm3m_LL.$(SUFFIX) chemm3m_RU.$(SUFFIX) chemm3m_RL.$(SUFFIX) ZBLASOBJS += \ zgemm3m_nn.$(SUFFIX) zgemm3m_cn.$(SUFFIX) zgemm3m_tn.$(SUFFIX) zgemm3m_nc.$(SUFFIX) \ zgemm3m_nt.$(SUFFIX) zgemm3m_cc.$(SUFFIX) zgemm3m_ct.$(SUFFIX) zgemm3m_tc.$(SUFFIX) \ zgemm3m_tt.$(SUFFIX) zgemm3m_nr.$(SUFFIX) zgemm3m_tr.$(SUFFIX) zgemm3m_cr.$(SUFFIX) \ zgemm3m_rn.$(SUFFIX) zgemm3m_rt.$(SUFFIX) zgemm3m_rc.$(SUFFIX) zgemm3m_rr.$(SUFFIX) \ zsymm3m_LU.$(SUFFIX) zsymm3m_LL.$(SUFFIX) zsymm3m_RU.$(SUFFIX) zsymm3m_RL.$(SUFFIX) \ zhemm3m_LU.$(SUFFIX) zhemm3m_LL.$(SUFFIX) zhemm3m_RU.$(SUFFIX) zhemm3m_RL.$(SUFFIX) XBLASOBJS += \ xgemm3m_nn.$(SUFFIX) xgemm3m_cn.$(SUFFIX) xgemm3m_tn.$(SUFFIX) xgemm3m_nc.$(SUFFIX) \ xgemm3m_nt.$(SUFFIX) xgemm3m_cc.$(SUFFIX) xgemm3m_ct.$(SUFFIX) xgemm3m_tc.$(SUFFIX) \ xgemm3m_tt.$(SUFFIX) xgemm3m_nr.$(SUFFIX) xgemm3m_tr.$(SUFFIX) xgemm3m_cr.$(SUFFIX) \ xgemm3m_rn.$(SUFFIX) xgemm3m_rt.$(SUFFIX) xgemm3m_rc.$(SUFFIX) xgemm3m_rr.$(SUFFIX) \ xsymm3m_LU.$(SUFFIX) xsymm3m_LL.$(SUFFIX) xsymm3m_RU.$(SUFFIX) xsymm3m_RL.$(SUFFIX) \ xhemm3m_LU.$(SUFFIX) xhemm3m_LL.$(SUFFIX) xhemm3m_RU.$(SUFFIX) xhemm3m_RL.$(SUFFIX) endif ifdef SMP COMMONOBJS += gemm_thread_m.$(SUFFIX) gemm_thread_n.$(SUFFIX) gemm_thread_mn.$(SUFFIX) gemm_thread_variable.$(SUFFIX) COMMONOBJS += syrk_thread.$(SUFFIX) ifndef USE_SIMPLE_THREADED_LEVEL3 SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) QBLASOBJS += qgemm_thread_nn.$(SUFFIX) qgemm_thread_nt.$(SUFFIX) qgemm_thread_tn.$(SUFFIX) qgemm_thread_tt.$(SUFFIX) CBLASOBJS += cgemm_thread_nn.$(SUFFIX) cgemm_thread_nt.$(SUFFIX) cgemm_thread_nr.$(SUFFIX) cgemm_thread_nc.$(SUFFIX) CBLASOBJS += cgemm_thread_tn.$(SUFFIX) cgemm_thread_tt.$(SUFFIX) cgemm_thread_tr.$(SUFFIX) cgemm_thread_tc.$(SUFFIX) CBLASOBJS += cgemm_thread_rn.$(SUFFIX) cgemm_thread_rt.$(SUFFIX) cgemm_thread_rr.$(SUFFIX) cgemm_thread_rc.$(SUFFIX) CBLASOBJS += cgemm_thread_cn.$(SUFFIX) cgemm_thread_ct.$(SUFFIX) cgemm_thread_cr.$(SUFFIX) cgemm_thread_cc.$(SUFFIX) ZBLASOBJS += zgemm_thread_nn.$(SUFFIX) zgemm_thread_nt.$(SUFFIX) zgemm_thread_nr.$(SUFFIX) zgemm_thread_nc.$(SUFFIX) ZBLASOBJS += zgemm_thread_tn.$(SUFFIX) zgemm_thread_tt.$(SUFFIX) zgemm_thread_tr.$(SUFFIX) zgemm_thread_tc.$(SUFFIX) ZBLASOBJS += zgemm_thread_rn.$(SUFFIX) zgemm_thread_rt.$(SUFFIX) zgemm_thread_rr.$(SUFFIX) zgemm_thread_rc.$(SUFFIX) ZBLASOBJS += zgemm_thread_cn.$(SUFFIX) zgemm_thread_ct.$(SUFFIX) zgemm_thread_cr.$(SUFFIX) 
zgemm_thread_cc.$(SUFFIX) XBLASOBJS += xgemm_thread_nn.$(SUFFIX) xgemm_thread_nt.$(SUFFIX) xgemm_thread_nr.$(SUFFIX) xgemm_thread_nc.$(SUFFIX) XBLASOBJS += xgemm_thread_tn.$(SUFFIX) xgemm_thread_tt.$(SUFFIX) xgemm_thread_tr.$(SUFFIX) xgemm_thread_tc.$(SUFFIX) XBLASOBJS += xgemm_thread_rn.$(SUFFIX) xgemm_thread_rt.$(SUFFIX) xgemm_thread_rr.$(SUFFIX) xgemm_thread_rc.$(SUFFIX) XBLASOBJS += xgemm_thread_cn.$(SUFFIX) xgemm_thread_ct.$(SUFFIX) xgemm_thread_cr.$(SUFFIX) xgemm_thread_cc.$(SUFFIX) SBLASOBJS += ssymm_thread_LU.$(SUFFIX) ssymm_thread_LL.$(SUFFIX) ssymm_thread_RU.$(SUFFIX) ssymm_thread_RL.$(SUFFIX) DBLASOBJS += dsymm_thread_LU.$(SUFFIX) dsymm_thread_LL.$(SUFFIX) dsymm_thread_RU.$(SUFFIX) dsymm_thread_RL.$(SUFFIX) QBLASOBJS += qsymm_thread_LU.$(SUFFIX) qsymm_thread_LL.$(SUFFIX) qsymm_thread_RU.$(SUFFIX) qsymm_thread_RL.$(SUFFIX) CBLASOBJS += csymm_thread_LU.$(SUFFIX) csymm_thread_LL.$(SUFFIX) csymm_thread_RU.$(SUFFIX) csymm_thread_RL.$(SUFFIX) ZBLASOBJS += zsymm_thread_LU.$(SUFFIX) zsymm_thread_LL.$(SUFFIX) zsymm_thread_RU.$(SUFFIX) zsymm_thread_RL.$(SUFFIX) XBLASOBJS += xsymm_thread_LU.$(SUFFIX) xsymm_thread_LL.$(SUFFIX) xsymm_thread_RU.$(SUFFIX) xsymm_thread_RL.$(SUFFIX) CBLASOBJS += chemm_thread_LU.$(SUFFIX) chemm_thread_LL.$(SUFFIX) chemm_thread_RU.$(SUFFIX) chemm_thread_RL.$(SUFFIX) ZBLASOBJS += zhemm_thread_LU.$(SUFFIX) zhemm_thread_LL.$(SUFFIX) zhemm_thread_RU.$(SUFFIX) zhemm_thread_RL.$(SUFFIX) XBLASOBJS += xhemm_thread_LU.$(SUFFIX) xhemm_thread_LL.$(SUFFIX) xhemm_thread_RU.$(SUFFIX) xhemm_thread_RL.$(SUFFIX) SBLASOBJS += ssyrk_thread_UN.$(SUFFIX) ssyrk_thread_UT.$(SUFFIX) ssyrk_thread_LN.$(SUFFIX) ssyrk_thread_LT.$(SUFFIX) DBLASOBJS += dsyrk_thread_UN.$(SUFFIX) dsyrk_thread_UT.$(SUFFIX) dsyrk_thread_LN.$(SUFFIX) dsyrk_thread_LT.$(SUFFIX) QBLASOBJS += qsyrk_thread_UN.$(SUFFIX) qsyrk_thread_UT.$(SUFFIX) qsyrk_thread_LN.$(SUFFIX) qsyrk_thread_LT.$(SUFFIX) CBLASOBJS += csyrk_thread_UN.$(SUFFIX) csyrk_thread_UT.$(SUFFIX) csyrk_thread_LN.$(SUFFIX) csyrk_thread_LT.$(SUFFIX) ZBLASOBJS += zsyrk_thread_UN.$(SUFFIX) zsyrk_thread_UT.$(SUFFIX) zsyrk_thread_LN.$(SUFFIX) zsyrk_thread_LT.$(SUFFIX) XBLASOBJS += xsyrk_thread_UN.$(SUFFIX) xsyrk_thread_UT.$(SUFFIX) xsyrk_thread_LN.$(SUFFIX) xsyrk_thread_LT.$(SUFFIX) CBLASOBJS += cherk_thread_UN.$(SUFFIX) cherk_thread_UC.$(SUFFIX) cherk_thread_LN.$(SUFFIX) cherk_thread_LC.$(SUFFIX) ZBLASOBJS += zherk_thread_UN.$(SUFFIX) zherk_thread_UC.$(SUFFIX) zherk_thread_LN.$(SUFFIX) zherk_thread_LC.$(SUFFIX) XBLASOBJS += xherk_thread_UN.$(SUFFIX) xherk_thread_UC.$(SUFFIX) xherk_thread_LN.$(SUFFIX) xherk_thread_LC.$(SUFFIX) ifeq ($(USE_GEMM3M), 1) CBLASOBJS += cgemm3m_thread_nn.$(SUFFIX) cgemm3m_thread_nt.$(SUFFIX) cgemm3m_thread_nr.$(SUFFIX) cgemm3m_thread_nc.$(SUFFIX) CBLASOBJS += cgemm3m_thread_tn.$(SUFFIX) cgemm3m_thread_tt.$(SUFFIX) cgemm3m_thread_tr.$(SUFFIX) cgemm3m_thread_tc.$(SUFFIX) CBLASOBJS += cgemm3m_thread_rn.$(SUFFIX) cgemm3m_thread_rt.$(SUFFIX) cgemm3m_thread_rr.$(SUFFIX) cgemm3m_thread_rc.$(SUFFIX) CBLASOBJS += cgemm3m_thread_cn.$(SUFFIX) cgemm3m_thread_ct.$(SUFFIX) cgemm3m_thread_cr.$(SUFFIX) cgemm3m_thread_cc.$(SUFFIX) ZBLASOBJS += zgemm3m_thread_nn.$(SUFFIX) zgemm3m_thread_nt.$(SUFFIX) zgemm3m_thread_nr.$(SUFFIX) zgemm3m_thread_nc.$(SUFFIX) ZBLASOBJS += zgemm3m_thread_tn.$(SUFFIX) zgemm3m_thread_tt.$(SUFFIX) zgemm3m_thread_tr.$(SUFFIX) zgemm3m_thread_tc.$(SUFFIX) ZBLASOBJS += zgemm3m_thread_rn.$(SUFFIX) zgemm3m_thread_rt.$(SUFFIX) zgemm3m_thread_rr.$(SUFFIX) zgemm3m_thread_rc.$(SUFFIX) ZBLASOBJS += zgemm3m_thread_cn.$(SUFFIX) 
zgemm3m_thread_ct.$(SUFFIX) zgemm3m_thread_cr.$(SUFFIX) zgemm3m_thread_cc.$(SUFFIX) XBLASOBJS += xgemm3m_thread_nn.$(SUFFIX) xgemm3m_thread_nt.$(SUFFIX) xgemm3m_thread_nr.$(SUFFIX) xgemm3m_thread_nc.$(SUFFIX) XBLASOBJS += xgemm3m_thread_tn.$(SUFFIX) xgemm3m_thread_tt.$(SUFFIX) xgemm3m_thread_tr.$(SUFFIX) xgemm3m_thread_tc.$(SUFFIX) XBLASOBJS += xgemm3m_thread_rn.$(SUFFIX) xgemm3m_thread_rt.$(SUFFIX) xgemm3m_thread_rr.$(SUFFIX) xgemm3m_thread_rc.$(SUFFIX) XBLASOBJS += xgemm3m_thread_cn.$(SUFFIX) xgemm3m_thread_ct.$(SUFFIX) xgemm3m_thread_cr.$(SUFFIX) xgemm3m_thread_cc.$(SUFFIX) CBLASOBJS += csymm3m_thread_LU.$(SUFFIX) csymm3m_thread_LL.$(SUFFIX) csymm3m_thread_RU.$(SUFFIX) csymm3m_thread_RL.$(SUFFIX) ZBLASOBJS += zsymm3m_thread_LU.$(SUFFIX) zsymm3m_thread_LL.$(SUFFIX) zsymm3m_thread_RU.$(SUFFIX) zsymm3m_thread_RL.$(SUFFIX) XBLASOBJS += xsymm3m_thread_LU.$(SUFFIX) xsymm3m_thread_LL.$(SUFFIX) xsymm3m_thread_RU.$(SUFFIX) xsymm3m_thread_RL.$(SUFFIX) CBLASOBJS += chemm3m_thread_LU.$(SUFFIX) chemm3m_thread_LL.$(SUFFIX) chemm3m_thread_RU.$(SUFFIX) chemm3m_thread_RL.$(SUFFIX) ZBLASOBJS += zhemm3m_thread_LU.$(SUFFIX) zhemm3m_thread_LL.$(SUFFIX) zhemm3m_thread_RU.$(SUFFIX) zhemm3m_thread_RL.$(SUFFIX) XBLASOBJS += xhemm3m_thread_LU.$(SUFFIX) xhemm3m_thread_LL.$(SUFFIX) xhemm3m_thread_RU.$(SUFFIX) xhemm3m_thread_RL.$(SUFFIX) endif endif endif HPLOBJS = \ dgemm_nn.$(SUFFIX) dgemm_nt.$(SUFFIX) dgemm_tn.$(SUFFIX) dgemm_tt.$(SUFFIX) \ dtrsm_LNUU.$(SUFFIX) dtrsm_LNUN.$(SUFFIX) dtrsm_LNLU.$(SUFFIX) dtrsm_LNLN.$(SUFFIX) \ dtrsm_LTUU.$(SUFFIX) dtrsm_LTUN.$(SUFFIX) dtrsm_LTLU.$(SUFFIX) dtrsm_LTLN.$(SUFFIX) \ dtrsm_RNUU.$(SUFFIX) dtrsm_RNUN.$(SUFFIX) dtrsm_RNLU.$(SUFFIX) dtrsm_RNLN.$(SUFFIX) \ dtrsm_RTUU.$(SUFFIX) dtrsm_RTUN.$(SUFFIX) dtrsm_RTLU.$(SUFFIX) dtrsm_RTLN.$(SUFFIX) ifndef USE_SIMPLE_THREADED_LEVEL3 HPLOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) \ dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) endif all :: sgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) sgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) sgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) sgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) dgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DNN $< -o $(@F) dgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DNT $< -o $(@F) dgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DTN $< -o $(@F) dgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DTT $< -o $(@F) qgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DNN $< -o $(@F) qgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DNT $< -o $(@F) qgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DTN $< -o $(@F) qgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DTT $< -o $(@F) cgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) cgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE 
-DCOMPLEX -DNT $< -o $(@F) cgemm_nr.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) cgemm_nc.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) cgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) cgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) cgemm_tr.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) cgemm_tc.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) cgemm_rn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) cgemm_rt.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) cgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) cgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) cgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) zgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) zgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) zgemm_nr.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) zgemm_nc.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) zgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) zgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) zgemm_tr.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) zgemm_tc.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) zgemm_rn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) zgemm_rt.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) zgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) zgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) zgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) xgemm_nn.$(SUFFIX) : gemm.c level3.c 
../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) xgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) xgemm_nr.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) xgemm_nc.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) xgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) xgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) xgemm_tr.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) xgemm_tc.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) xgemm_rn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) xgemm_rt.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) xgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) xgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) xgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) gemm_thread_m.$(SUFFIX) : gemm_thread_m.c ../../common.h $(CC) -c $(CFLAGS) $< -o $(@F) gemm_thread_n.$(SUFFIX) : gemm_thread_n.c ../../common.h $(CC) -c $(CFLAGS) $< -o $(@F) gemm_thread_mn.$(SUFFIX) : gemm_thread_mn.c ../../common.h $(CC) -c $(CFLAGS) $< -o $(@F) gemm_thread_variable.$(SUFFIX) : gemm_thread_variable.c ../../common.h $(CC) -c $(CFLAGS) $< -o $(@F) beta_thread.$(SUFFIX) : beta_thread.c ../../common.h $(CC) -c $(CFLAGS) $< -o $(@F) sgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) sgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) sgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) sgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) dgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DNN $< -o $(@F) dgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DNT $< -o $(@F) dgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DTN $< -o $(@F) dgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DTT $< -o $(@F) qgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) 
$(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DNN $< -o $(@F) qgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DNT $< -o $(@F) qgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DTN $< -o $(@F) qgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DTT $< -o $(@F) cgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) cgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) cgemm_thread_nr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) cgemm_thread_nc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) cgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) cgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) cgemm_thread_tr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) cgemm_thread_tc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) cgemm_thread_rn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) cgemm_thread_rt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) cgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) cgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) cgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) zgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) zgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) zgemm_thread_nr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) zgemm_thread_nc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) zgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) 
$(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) zgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) zgemm_thread_tr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) zgemm_thread_tc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) zgemm_thread_rn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) zgemm_thread_rt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) zgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) zgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) zgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) xgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) xgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) xgemm_thread_nr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) xgemm_thread_nc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) xgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) xgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) xgemm_thread_tr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) xgemm_thread_tc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) xgemm_thread_rn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) xgemm_thread_rt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) xgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) xgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c 
../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) xgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) strmm_LNUU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) strmm_LNUN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) strmm_LNLU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) strmm_LNLN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) strmm_LTUU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) strmm_LTUN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) strmm_LTLU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) strmm_LTLN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) strmm_RNUU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) strmm_RNUN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) strmm_RNLU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) strmm_RNLN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) strmm_RTUU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) strmm_RTUN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) strmm_RTLU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) strmm_RTLN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) dtrmm_LNUU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) dtrmm_LNUN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) dtrmm_LNLU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) dtrmm_LNLN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) dtrmm_LTUU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) dtrmm_LTUN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) dtrmm_LTLU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) dtrmm_LTLN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) dtrmm_RNUU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) dtrmm_RNUN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) dtrmm_RNLU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) dtrmm_RNLN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) dtrmm_RTUU.$(SUFFIX) : trmm_R.c $(CC) -c 
$(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) dtrmm_RTUN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) dtrmm_RTLU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) dtrmm_RTLN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) qtrmm_LNUU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) qtrmm_LNUN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) qtrmm_LNLU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) qtrmm_LNLN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) qtrmm_LTUU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) qtrmm_LTUN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) qtrmm_LTLU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) qtrmm_LTLN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) qtrmm_RNUU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) qtrmm_RNUN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) qtrmm_RNLU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) qtrmm_RNLN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) qtrmm_RTUU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) qtrmm_RTUN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) qtrmm_RTLU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) qtrmm_RTLN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) ctrmm_LNUU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ctrmm_LNUN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ctrmm_LNLU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ctrmm_LNLN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ctrmm_LTUU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ctrmm_LTUN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ctrmm_LTLU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ctrmm_LTLN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ctrmm_LRUU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ctrmm_LRUN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ctrmm_LRLU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ctrmm_LRLN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ctrmm_LCUU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< 
-o $(@F) ctrmm_LCUN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ctrmm_LCLU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ctrmm_LCLN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ctrmm_RNUU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ctrmm_RNUN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ctrmm_RNLU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ctrmm_RNLN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ctrmm_RTUU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ctrmm_RTUN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ctrmm_RTLU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ctrmm_RTLN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ctrmm_RRUU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ctrmm_RRUN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ctrmm_RRLU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ctrmm_RRLN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ctrmm_RCUU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ctrmm_RCUN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ctrmm_RCLU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ctrmm_RCLN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ztrmm_LNUU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ztrmm_LNUN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ztrmm_LNLU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ztrmm_LNLN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ztrmm_LTUU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ztrmm_LTUN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ztrmm_LTLU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ztrmm_LTLN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ztrmm_LRUU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ztrmm_LRUN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ztrmm_LRLU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ztrmm_LRLN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ztrmm_LCUU.$(SUFFIX) : trmm_L.c $(CC) -c 
$(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ztrmm_LCUN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ztrmm_LCLU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ztrmm_LCLN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ztrmm_RNUU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ztrmm_RNUN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ztrmm_RNLU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ztrmm_RNLN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ztrmm_RTUU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ztrmm_RTUN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ztrmm_RTLU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ztrmm_RTLN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ztrmm_RRUU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ztrmm_RRUN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ztrmm_RRLU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ztrmm_RRLN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ztrmm_RCUU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ztrmm_RCUN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ztrmm_RCLU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ztrmm_RCLN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) xtrmm_LNUU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) xtrmm_LNUN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) xtrmm_LNLU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) xtrmm_LNLN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) xtrmm_LTUU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) xtrmm_LTUN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) xtrmm_LTLU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) xtrmm_LTLN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) xtrmm_LRUU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) xtrmm_LRUN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) xtrmm_LRLU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) xtrmm_LRLN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA 
-UUPPER -UUNIT -DCONJ $< -o $(@F) xtrmm_LCUU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) xtrmm_LCUN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) xtrmm_LCLU.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) xtrmm_LCLN.$(SUFFIX) : trmm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) xtrmm_RNUU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) xtrmm_RNUN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) xtrmm_RNLU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) xtrmm_RNLN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) xtrmm_RTUU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) xtrmm_RTUN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) xtrmm_RTLU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) xtrmm_RTLN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) xtrmm_RRUU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) xtrmm_RRUN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) xtrmm_RRLU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) xtrmm_RRLN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) xtrmm_RCUU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) xtrmm_RCUN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) xtrmm_RCLU.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) xtrmm_RCLN.$(SUFFIX) : trmm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ssymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) ssymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) ssymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) ssymm_RL.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) dsymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) dsymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) dsymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) dsymm_RL.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) qsymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) qsymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o 
$(@F) qsymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) qsymm_RL.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) csymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) csymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) csymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) csymm_RL.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) zsymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) zsymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) zsymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) zsymm_RL.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) xsymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) xsymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) xsymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) xsymm_RL.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) ssymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) ssymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) ssymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) ssymm_thread_RL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) dsymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) dsymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) dsymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) dsymm_thread_RL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) qsymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) qsymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) qsymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) qsymm_thread_RL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c 
-DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) csymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) csymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) csymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) csymm_thread_RL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) zsymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) zsymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) zsymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) zsymm_thread_RL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) xsymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) xsymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) xsymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) xsymm_thread_RL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) ssyrk_UN.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) ssyrk_UT.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) ssyrk_LN.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) ssyrk_LT.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) dsyrk_UN.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) dsyrk_UT.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) dsyrk_LN.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) dsyrk_LT.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) qsyrk_UN.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) qsyrk_UT.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) qsyrk_LN.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) qsyrk_LT.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) csyrk_UN.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) csyrk_UT.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) csyrk_LN.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c 
$(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) csyrk_LT.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) zsyrk_UN.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) zsyrk_UT.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) zsyrk_LN.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) zsyrk_LT.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) xsyrk_UN.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) xsyrk_UT.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) xsyrk_LN.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) xsyrk_LT.$(SUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) ssyrk_thread_UN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) ssyrk_thread_UT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) ssyrk_thread_LN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) ssyrk_thread_LT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) dsyrk_thread_UN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) dsyrk_thread_UT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) dsyrk_thread_LN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) dsyrk_thread_LT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) qsyrk_thread_UN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) qsyrk_thread_UT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) qsyrk_thread_LN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) qsyrk_thread_LT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) csyrk_thread_UN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) csyrk_thread_UT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) csyrk_thread_LN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) csyrk_thread_LT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) zsyrk_thread_UN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) zsyrk_thread_UT.$(SUFFIX) : syrk_k.c 
level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) zsyrk_thread_LN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) zsyrk_thread_LT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) xsyrk_thread_UN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) xsyrk_thread_UT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) xsyrk_thread_LN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) xsyrk_thread_LT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) ssyrk_kernel_U.$(SUFFIX) : syrk_kernel.c $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) ssyrk_kernel_L.$(SUFFIX) : syrk_kernel.c $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) dsyrk_kernel_U.$(SUFFIX) : syrk_kernel.c $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) dsyrk_kernel_L.$(SUFFIX) : syrk_kernel.c $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) qsyrk_kernel_U.$(SUFFIX) : syrk_kernel.c $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) qsyrk_kernel_L.$(SUFFIX) : syrk_kernel.c $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) csyrk_kernel_U.$(SUFFIX) : syrk_kernel.c $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) csyrk_kernel_L.$(SUFFIX) : syrk_kernel.c $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) zsyrk_kernel_U.$(SUFFIX) : syrk_kernel.c $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) zsyrk_kernel_L.$(SUFFIX) : syrk_kernel.c $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) xsyrk_kernel_U.$(SUFFIX) : syrk_kernel.c $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) xsyrk_kernel_L.$(SUFFIX) : syrk_kernel.c $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) syrk_thread.$(SUFFIX) : syrk_thread.c ../../common.h $(CC) -c $(CFLAGS) $< -o $(@F) ssyr2k_UN.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) ssyr2k_UT.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) ssyr2k_LN.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) ssyr2k_LT.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) dsyr2k_UN.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) dsyr2k_UT.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) dsyr2k_LN.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) dsyr2k_LT.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) qsyr2k_UN.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) qsyr2k_UT.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) qsyr2k_LN.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) qsyr2k_LT.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c 
$(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) csyr2k_UN.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) csyr2k_UT.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) csyr2k_LN.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) csyr2k_LT.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) zsyr2k_UN.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) zsyr2k_UT.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) zsyr2k_LN.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) zsyr2k_LT.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) xsyr2k_UN.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) xsyr2k_UT.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) xsyr2k_LN.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) xsyr2k_LT.$(SUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) ssyr2k_kernel_U.$(SUFFIX) : syr2k_kernel.c $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) ssyr2k_kernel_L.$(SUFFIX) : syr2k_kernel.c $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) dsyr2k_kernel_U.$(SUFFIX) : syr2k_kernel.c $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) dsyr2k_kernel_L.$(SUFFIX) : syr2k_kernel.c $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) qsyr2k_kernel_U.$(SUFFIX) : syr2k_kernel.c $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) qsyr2k_kernel_L.$(SUFFIX) : syr2k_kernel.c $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) csyr2k_kernel_U.$(SUFFIX) : syr2k_kernel.c $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) csyr2k_kernel_L.$(SUFFIX) : syr2k_kernel.c $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) zsyr2k_kernel_U.$(SUFFIX) : syr2k_kernel.c $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) zsyr2k_kernel_L.$(SUFFIX) : syr2k_kernel.c $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) xsyr2k_kernel_U.$(SUFFIX) : syr2k_kernel.c $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) xsyr2k_kernel_L.$(SUFFIX) : syr2k_kernel.c $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) chemm_LU.$(SUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) chemm_LL.$(SUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) chemm_RU.$(SUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) chemm_RL.$(SUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) zhemm_LU.$(SUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) zhemm_LL.$(SUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) zhemm_RU.$(SUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) zhemm_RL.$(SUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< 
-o $(@F) xhemm_LU.$(SUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) xhemm_LL.$(SUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) xhemm_RU.$(SUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) xhemm_RL.$(SUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) chemm_thread_LU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) chemm_thread_LL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) chemm_thread_RU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) chemm_thread_RL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) zhemm_thread_LU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) zhemm_thread_LL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) zhemm_thread_RU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) zhemm_thread_RL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) xhemm_thread_LU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) xhemm_thread_LL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) xhemm_thread_RU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) xhemm_thread_RL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) cherk_UN.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) cherk_UC.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) cherk_LN.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) cherk_LC.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) zherk_UN.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) zherk_UC.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) zherk_LN.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) zherk_LC.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) xherk_UN.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c 
$(CFLAGS) -DHERK -DXDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) xherk_UC.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) xherk_LN.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) xherk_LC.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) cherk_kernel_UN.$(SUFFIX) : zherk_kernel.c $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) cherk_kernel_UC.$(SUFFIX) : zherk_kernel.c $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) cherk_kernel_LN.$(SUFFIX) : zherk_kernel.c $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) cherk_kernel_LC.$(SUFFIX) : zherk_kernel.c $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) zherk_kernel_UN.$(SUFFIX) : zherk_kernel.c $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) zherk_kernel_UC.$(SUFFIX) : zherk_kernel.c $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) zherk_kernel_LN.$(SUFFIX) : zherk_kernel.c $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) zherk_kernel_LC.$(SUFFIX) : zherk_kernel.c $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) xherk_kernel_UN.$(SUFFIX) : zherk_kernel.c $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) xherk_kernel_UC.$(SUFFIX) : zherk_kernel.c $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) xherk_kernel_LN.$(SUFFIX) : zherk_kernel.c $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) xherk_kernel_LC.$(SUFFIX) : zherk_kernel.c $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) cherk_thread_UN.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -UTRANS -UCONJ $< -o $(@F) cherk_thread_UC.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -DTRANS -DCONJ $< -o $(@F) cherk_thread_LN.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -UTRANS -UCONJ $< -o $(@F) cherk_thread_LC.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -DTRANS -DCONJ $< -o $(@F) zherk_thread_UN.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -UTRANS -UCONJ $< -o $(@F) zherk_thread_UC.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -DTRANS -DCONJ $< -o $(@F) zherk_thread_LN.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -UTRANS -UCONJ $< -o $(@F) zherk_thread_LC.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -DTRANS -DCONJ $< -o $(@F) xherk_thread_UN.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS -UCONJ $< -o $(@F) xherk_thread_UC.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS -DCONJ $< -o $(@F) xherk_thread_LN.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c 
-DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS -UCONJ $< -o $(@F) xherk_thread_LC.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS -DCONJ $< -o $(@F) cher2k_UN.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(CFLAGS) -DHER2K -UDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) cher2k_UC.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(CFLAGS) -DHER2K -UDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) cher2k_LN.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(CFLAGS) -DHER2K -UDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) cher2k_LC.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(CFLAGS) -DHER2K -UDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) zher2k_UN.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(CFLAGS) -DHER2K -DDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) zher2k_UC.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(CFLAGS) -DHER2K -DDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) zher2k_LN.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(CFLAGS) -DHER2K -DDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) zher2k_LC.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(CFLAGS) -DHER2K -DDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) xher2k_UN.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(CFLAGS) -DHER2K -DXDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) xher2k_UC.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(CFLAGS) -DHER2K -DXDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) xher2k_LN.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(CFLAGS) -DHER2K -DXDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) xher2k_LC.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(CFLAGS) -DHER2K -DXDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) cher2k_kernel_UN.$(SUFFIX) : zher2k_kernel.c $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) cher2k_kernel_UC.$(SUFFIX) : zher2k_kernel.c $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) cher2k_kernel_LN.$(SUFFIX) : zher2k_kernel.c $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) cher2k_kernel_LC.$(SUFFIX) : zher2k_kernel.c $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) zher2k_kernel_UN.$(SUFFIX) : zher2k_kernel.c $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) zher2k_kernel_UC.$(SUFFIX) : zher2k_kernel.c $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) zher2k_kernel_LN.$(SUFFIX) : zher2k_kernel.c $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) zher2k_kernel_LC.$(SUFFIX) : zher2k_kernel.c $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) xher2k_kernel_UN.$(SUFFIX) : zher2k_kernel.c $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) xher2k_kernel_UC.$(SUFFIX) : zher2k_kernel.c $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) xher2k_kernel_LN.$(SUFFIX) : zher2k_kernel.c $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) xher2k_kernel_LC.$(SUFFIX) : zher2k_kernel.c $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) cgemm3m_nn.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) cgemm3m_nt.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) cgemm3m_nr.$(SUFFIX) : 
gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) cgemm3m_nc.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) cgemm3m_tn.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) cgemm3m_tt.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) cgemm3m_tr.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) cgemm3m_tc.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) cgemm3m_rn.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) cgemm3m_rt.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) cgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) cgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) zgemm3m_nn.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) zgemm3m_nt.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) zgemm3m_nr.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) zgemm3m_nc.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) zgemm3m_tn.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) zgemm3m_tt.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) zgemm3m_tr.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) zgemm3m_tc.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) zgemm3m_rn.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) zgemm3m_rt.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) zgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) zgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) zgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) xgemm3m_nn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) xgemm3m_nt.$(SUFFIX) : gemm3m.c gemm3m_level3.c 
../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) xgemm3m_nr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) xgemm3m_nc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) xgemm3m_tn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) xgemm3m_tt.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) xgemm3m_tr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) xgemm3m_tc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) xgemm3m_rn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) xgemm3m_rt.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) xgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) xgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) xgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) cgemmf.$(SUFFIX) : zgemmf.c $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX $< -o $(@F) zgemmf.$(SUFFIX) : zgemmf.c $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX $< -o $(@F) xgemmf.$(SUFFIX) : zgemmf.c $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX $< -o $(@F) cgemm3m_thread_nn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) cgemm3m_thread_nt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) cgemm3m_thread_nr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) cgemm3m_thread_nc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) cgemm3m_thread_tn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) cgemm3m_thread_tt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) cgemm3m_thread_tr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) cgemm3m_thread_tc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) cgemm3m_thread_rn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) 
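# The gemm3m rules in this block follow a fixed naming scheme: the leading letter picks the
# precision flags (c = -UDOUBLE -DCOMPLEX, z = -DDOUBLE -DCOMPLEX, x = -DXDOUBLE -DCOMPLEX),
# "_thread_" adds -DTHREADED_LEVEL3 and lists level3_gemm3m_thread.c instead of
# gemm3m_level3.c as a prerequisite, and the two-letter suffix is passed through as
# -DNN, -DNT, ... to select the transpose/conjugation form of the two operands
# (in the GotoBLAS convention: n = no transpose, t = transpose, r = conjugate without
# transpose, c = conjugate transpose).
#
# Purely illustrative sketch, kept commented out so it does not interfere with the
# hand-written rules: the cgemm3m_thread_* block could equivalently be generated with a
# GNU make template. CGEMM3M_FORMS and CGEMM3M_THREAD_TEMPLATE are hypothetical names,
# not variables used elsewhere in this Makefile.
#
# CGEMM3M_FORMS = NN NT NR NC TN TT TR TC RN RT RR RC CN CT CR CC
# define CGEMM3M_THREAD_TEMPLATE
# cgemm3m_thread_$(shell echo $(1) | tr 'A-Z' 'a-z').$$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
# 	$$(CC) $$(CFLAGS) $$(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -D$(1) $$< -o $$(@F)
# endef
# $(foreach form,$(CGEMM3M_FORMS),$(eval $(call CGEMM3M_THREAD_TEMPLATE,$(form))))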
cgemm3m_thread_rt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) cgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) cgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) cgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) zgemm3m_thread_nn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) zgemm3m_thread_nt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) zgemm3m_thread_nr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) zgemm3m_thread_nc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) zgemm3m_thread_tn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) zgemm3m_thread_tt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) zgemm3m_thread_tr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) zgemm3m_thread_tc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) zgemm3m_thread_rn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) zgemm3m_thread_rt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) zgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) zgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) zgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c 
level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) xgemm3m_thread_nn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) xgemm3m_thread_nt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) xgemm3m_thread_nr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) xgemm3m_thread_nc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) xgemm3m_thread_tn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) xgemm3m_thread_tt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) xgemm3m_thread_tr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) xgemm3m_thread_tc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) xgemm3m_thread_rn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) xgemm3m_thread_rt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) xgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) xgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) xgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) csymm3m_LU.$(SUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) csymm3m_LL.$(SUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) csymm3m_RU.$(SUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) csymm3m_RL.$(SUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) zsymm3m_LU.$(SUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) zsymm3m_LL.$(SUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) zsymm3m_RU.$(SUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) zsymm3m_RL.$(SUFFIX) : symm3m_k.c 
../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) xsymm3m_LU.$(SUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) xsymm3m_LL.$(SUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) xsymm3m_RU.$(SUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) xsymm3m_RL.$(SUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) csymm3m_thread_LU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) csymm3m_thread_LL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) csymm3m_thread_RU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) csymm3m_thread_RL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) zsymm3m_thread_LU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) zsymm3m_thread_LL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) zsymm3m_thread_RU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) zsymm3m_thread_RL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) xsymm3m_thread_LU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) xsymm3m_thread_LL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) xsymm3m_thread_RU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) xsymm3m_thread_RL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) chemm3m_LU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) chemm3m_LL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) chemm3m_RU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) chemm3m_RL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) zhemm3m_LU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) zhemm3m_LL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) zhemm3m_RU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) zhemm3m_RL.$(SUFFIX) : hemm3m_k.c 
gemm3m_level3.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) xhemm3m_LU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) xhemm3m_LL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) xhemm3m_RU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) xhemm3m_RL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) chemm3m_thread_LU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) chemm3m_thread_LL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) chemm3m_thread_RU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) chemm3m_thread_RL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) zhemm3m_thread_LU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) zhemm3m_thread_LL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) zhemm3m_thread_RU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) zhemm3m_thread_RL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) xhemm3m_thread_LU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) xhemm3m_thread_LL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) xhemm3m_thread_RU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) xhemm3m_thread_RL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) strsm_LNUU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) strsm_LNUN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) strsm_LNLU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) strsm_LNLN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) strsm_LTUU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) strsm_LTUN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) strsm_LTLU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) strsm_LTLN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) strsm_RNUU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) 
-UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) strsm_RNUN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) strsm_RNLU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) strsm_RNLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) strsm_RTUU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) strsm_RTUN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) strsm_RTLU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) strsm_RTLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) dtrsm_LNUU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) dtrsm_LNUN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) dtrsm_LNLU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) dtrsm_LNLN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) dtrsm_LTUU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) dtrsm_LTUN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) dtrsm_LTLU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) dtrsm_LTLN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) dtrsm_RNUU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) dtrsm_RNUN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) dtrsm_RNLU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) dtrsm_RNLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) dtrsm_RTUU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) dtrsm_RTUN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) dtrsm_RTLU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) dtrsm_RTLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) qtrsm_LNUU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) qtrsm_LNUN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) qtrsm_LNLU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) qtrsm_LNLN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) qtrsm_LTUU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) qtrsm_LTUN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) qtrsm_LTLU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) qtrsm_LTLN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) qtrsm_RNUU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) qtrsm_RNUN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< 
-o $(@F) qtrsm_RNLU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) qtrsm_RNLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) qtrsm_RTUU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) qtrsm_RTUN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) qtrsm_RTLU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) qtrsm_RTLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) ctrsm_LNUU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ctrsm_LNUN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ctrsm_LNLU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ctrsm_LNLN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ctrsm_LTUU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ctrsm_LTUN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ctrsm_LTLU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ctrsm_LTLN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ctrsm_LRUU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ctrsm_LRUN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ctrsm_LRLU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ctrsm_LRLN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ctrsm_LCUU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ctrsm_LCUN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ctrsm_LCLU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ctrsm_LCLN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ctrsm_RNUU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ctrsm_RNUN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ctrsm_RNLU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ctrsm_RNLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ctrsm_RTUU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ctrsm_RTUN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ctrsm_RTLU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ctrsm_RTLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ctrsm_RRUU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ctrsm_RRUN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA 
-DUPPER -UUNIT -DCONJ $< -o $(@F) ctrsm_RRLU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ctrsm_RRLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ctrsm_RCUU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ctrsm_RCUN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ctrsm_RCLU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ctrsm_RCLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ztrsm_LNUU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ztrsm_LNUN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ztrsm_LNLU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ztrsm_LNLN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ztrsm_LTUU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ztrsm_LTUN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ztrsm_LTLU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ztrsm_LTLN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ztrsm_LRUU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ztrsm_LRUN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ztrsm_LRLU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ztrsm_LRLN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ztrsm_LCUU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ztrsm_LCUN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ztrsm_LCLU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ztrsm_LCLN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ztrsm_RNUU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ztrsm_RNUN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ztrsm_RNLU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ztrsm_RNLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ztrsm_RTUU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ztrsm_RTUN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ztrsm_RTLU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ztrsm_RTLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ztrsm_RRUU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) 
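# The trsm objects use a four-letter suffix: side (L/R, selecting trsm_L.c or trsm_R.c),
# transpose of A (N = -UTRANSA, T = -DTRANSA; for the complex types R and C are the
# conjugated counterparts and add -DCONJ), triangle (U = -DUPPER, L = -UUPPER) and
# diagonal (U = -DUNIT for unit diagonal, N = -UUNIT). The type prefix sets the precision
# flags in the same way as in the gemm rules earlier in this file.
#
# For example, dtrsm_LTUN, i.e. double precision, left side, A transposed, upper
# triangular, non-unit diagonal (shown here as a comment since the live rule already
# appears above):
#
# dtrsm_LTUN.$(SUFFIX) : trsm_L.c
# 	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F)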
ztrsm_RRUN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ztrsm_RRLU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ztrsm_RRLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ztrsm_RCUU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ztrsm_RCUN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ztrsm_RCLU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ztrsm_RCLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) xtrsm_LNUU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) xtrsm_LNUN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) xtrsm_LNLU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) xtrsm_LNLN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) xtrsm_LTUU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) xtrsm_LTUN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) xtrsm_LTLU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) xtrsm_LTLN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) xtrsm_LRUU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) xtrsm_LRUN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) xtrsm_LRLU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) xtrsm_LRLN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) xtrsm_LCUU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) xtrsm_LCUN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) xtrsm_LCLU.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) xtrsm_LCLN.$(SUFFIX) : trsm_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) xtrsm_RNUU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) xtrsm_RNUN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) xtrsm_RNLU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) xtrsm_RNLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) xtrsm_RTUU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) xtrsm_RTUN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) xtrsm_RTLU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) xtrsm_RTLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) xtrsm_RRUU.$(SUFFIX) : 
trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) xtrsm_RRUN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) xtrsm_RRLU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) xtrsm_RRLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) xtrsm_RCUU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) xtrsm_RCUN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) xtrsm_RCLU.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) xtrsm_RCLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) sgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) sgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) sgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) sgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) dgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DNN $< -o $(@F) dgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DNT $< -o $(@F) dgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DTN $< -o $(@F) dgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DTT $< -o $(@F) qgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DNN $< -o $(@F) qgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DNT $< -o $(@F) qgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DTN $< -o $(@F) qgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DTT $< -o $(@F) cgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) cgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) cgemm_nr.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) cgemm_nc.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) cgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) cgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) cgemm_tr.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) cgemm_tc.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) cgemm_rn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) cgemm_rt.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) cgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c 
-UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) cgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) cgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) zgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) zgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) zgemm_nr.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) zgemm_nc.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) zgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) zgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) zgemm_tr.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) zgemm_tc.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) zgemm_rn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) zgemm_rt.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) zgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) zgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) zgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) xgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) xgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) xgemm_nr.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) xgemm_nc.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) xgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) xgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) xgemm_tr.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) xgemm_tc.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) xgemm_rn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) 
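# The objects from sgemm_nn.$(PSUFFIX) onward are built with $(PFLAGS) and the
# $(PSUFFIX) extension; these appear to be the profiling counterparts of the
# regular $(CFLAGS)/$(SUFFIX) objects earlier in this file (an assumption
# based on the naming, the actual values come from the top-level make
# configuration). The gemm target names use a two-letter code, the first
# letter for matrix A and the second for matrix B:
#   N = not transposed, T = transposed,
#   R = conjugate (no transpose), C = conjugate transpose,
# mapped directly to -DNN, -DNT, ..., -DCC. R and C only exist for the
# complex prefixes c/z/x; the real prefixes s/d/q only get N and T.
# Precision prefixes: s = -UDOUBLE -UCOMPLEX, d = -DDOUBLE -UCOMPLEX,
# q = -DXDOUBLE -UCOMPLEX, and c/z/x add -DCOMPLEX to the same three.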
xgemm_rt.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) xgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) xgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) xgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) gemm_thread_m.$(PSUFFIX) : gemm_thread_m.c ../../common.h $(CC) -c $(PFLAGS) $< -o $(@F) gemm_thread_n.$(PSUFFIX) : gemm_thread_n.c ../../common.h $(CC) -c $(PFLAGS) $< -o $(@F) gemm_thread_mn.$(PSUFFIX) : gemm_thread_mn.c ../../common.h $(CC) -c $(PFLAGS) $< -o $(@F) gemm_thread_variable.$(PSUFFIX) : gemm_thread_variable.c ../../common.h $(CC) -c $(PFLAGS) $< -o $(@F) beta_thread.$(PSUFFIX) : beta_thread.c ../../common.h $(CC) -c $(PFLAGS) $< -o $(@F) sgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) sgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) sgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) sgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) dgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DNN $< -o $(@F) dgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DNT $< -o $(@F) dgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DTN $< -o $(@F) dgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DTT $< -o $(@F) qgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DNN $< -o $(@F) qgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DNT $< -o $(@F) qgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DTN $< -o $(@F) qgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DTT $< -o $(@F) cgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) cgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) cgemm_thread_nr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) cgemm_thread_nc.$(PSUFFIX) : 
gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) cgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) cgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) cgemm_thread_tr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) cgemm_thread_tc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) cgemm_thread_rn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) cgemm_thread_rt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) cgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) cgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) cgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) zgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) zgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) zgemm_thread_nr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) zgemm_thread_nc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) zgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) zgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) zgemm_thread_tr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) zgemm_thread_tc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) zgemm_thread_rn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) zgemm_thread_rt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) zgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) 
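# The *gemm_thread_* objects above and below are compiled from the same
# gemm.c as the serial versions, but together with level3_thread.c and with
# -DTHREADED_LEVEL3 added, which selects the multi-threaded level-3 driver.
# The gemm_thread_m / gemm_thread_n / gemm_thread_mn / gemm_thread_variable
# and beta_thread objects near the start of this group appear to be the
# thread partitioning and support routines they rely on, each built from its
# own source against ../../common.h.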
zgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) zgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) zgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) xgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) xgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) xgemm_thread_nr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) xgemm_thread_nc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) xgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) xgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) xgemm_thread_tr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) xgemm_thread_tc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) xgemm_thread_rn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) xgemm_thread_rt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) xgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) xgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) xgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) strmm_LNUU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) strmm_LNUN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) strmm_LNLU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) strmm_LNLN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER 
-UUNIT $< -o $(@F) strmm_LTUU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) strmm_LTUN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) strmm_LTLU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) strmm_LTLN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) strmm_RNUU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) strmm_RNUN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) strmm_RNLU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) strmm_RNLN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) strmm_RTUU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) strmm_RTUN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) strmm_RTLU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) strmm_RTLN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) dtrmm_LNUU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) dtrmm_LNUN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) dtrmm_LNLU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) dtrmm_LNLN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) dtrmm_LTUU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) dtrmm_LTUN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) dtrmm_LTLU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) dtrmm_LTLN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) dtrmm_RNUU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) dtrmm_RNUN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) dtrmm_RNLU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) dtrmm_RNLN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) dtrmm_RTUU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) dtrmm_RTUN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) dtrmm_RTLU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) dtrmm_RTLN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) qtrmm_LNUU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) qtrmm_LNUN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) qtrmm_LNLU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) qtrmm_LNLN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) qtrmm_LTUU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) 
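# The trmm objects use the same four-letter encoding as the trsm objects
# earlier in this file: side (trmm_L.c / trmm_R.c), operation on A
# (N/T, plus R/C with -DCONJ for the complex prefixes only), upper or lower
# storage (-DUPPER / -UUPPER) and unit or non-unit diagonal
# (-DUNIT / -UUNIT). For example, strmm_LNUU above compiles trmm_L.c with
# -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT.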
qtrmm_LTUN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) qtrmm_LTLU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) qtrmm_LTLN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) qtrmm_RNUU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) qtrmm_RNUN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) qtrmm_RNLU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) qtrmm_RNLN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) qtrmm_RTUU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) qtrmm_RTUN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) qtrmm_RTLU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) qtrmm_RTLN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) ctrmm_LNUU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ctrmm_LNUN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ctrmm_LNLU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ctrmm_LNLN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ctrmm_LTUU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ctrmm_LTUN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ctrmm_LTLU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ctrmm_LTLN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ctrmm_LRUU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ctrmm_LRUN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ctrmm_LRLU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ctrmm_LRLN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ctrmm_LCUU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ctrmm_LCUN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ctrmm_LCLU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ctrmm_LCLN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ctrmm_RNUU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ctrmm_RNUN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ctrmm_RNLU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ctrmm_RNLN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ctrmm_RTUU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA 
-DUPPER -DUNIT -UCONJ $< -o $(@F) ctrmm_RTUN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ctrmm_RTLU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ctrmm_RTLN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ctrmm_RRUU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ctrmm_RRUN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ctrmm_RRLU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ctrmm_RRLN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ctrmm_RCUU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ctrmm_RCUN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ctrmm_RCLU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ctrmm_RCLN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ztrmm_LNUU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ztrmm_LNUN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ztrmm_LNLU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ztrmm_LNLN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ztrmm_LTUU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ztrmm_LTUN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ztrmm_LTLU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ztrmm_LTLN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ztrmm_LRUU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ztrmm_LRUN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ztrmm_LRLU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ztrmm_LRLN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ztrmm_LCUU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ztrmm_LCUN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ztrmm_LCLU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ztrmm_LCLN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ztrmm_RNUU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ztrmm_RNUN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ztrmm_RNLU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ztrmm_RNLN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT 
-UCONJ $< -o $(@F) ztrmm_RTUU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ztrmm_RTUN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ztrmm_RTLU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ztrmm_RTLN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ztrmm_RRUU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ztrmm_RRUN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ztrmm_RRLU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ztrmm_RRLN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ztrmm_RCUU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ztrmm_RCUN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ztrmm_RCLU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ztrmm_RCLN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) xtrmm_LNUU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) xtrmm_LNUN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) xtrmm_LNLU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) xtrmm_LNLN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) xtrmm_LTUU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) xtrmm_LTUN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) xtrmm_LTLU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) xtrmm_LTLN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) xtrmm_LRUU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) xtrmm_LRUN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) xtrmm_LRLU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) xtrmm_LRLN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) xtrmm_LCUU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) xtrmm_LCUN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) xtrmm_LCLU.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) xtrmm_LCLN.$(PSUFFIX) : trmm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) xtrmm_RNUU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) xtrmm_RNUN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) xtrmm_RNLU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT 
-UCONJ $< -o $(@F) xtrmm_RNLN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) xtrmm_RTUU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) xtrmm_RTUN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) xtrmm_RTLU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) xtrmm_RTLN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) xtrmm_RRUU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) xtrmm_RRUN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) xtrmm_RRLU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) xtrmm_RRLN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) xtrmm_RCUU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) xtrmm_RCUN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) xtrmm_RCLU.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) xtrmm_RCLN.$(PSUFFIX) : trmm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ssymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) ssymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) ssymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) ssymm_RL.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) dsymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) dsymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) dsymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) dsymm_RL.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) qsymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) qsymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) qsymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) qsymm_RL.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) csymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) csymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) csymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) csymm_RL.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) zsymm_LU.$(PSUFFIX) : symm_k.c 
level3.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) zsymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) zsymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) zsymm_RL.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) xsymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) xsymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) xsymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) xsymm_RL.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) ssymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) ssymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) ssymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) ssymm_thread_RL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) dsymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) dsymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) dsymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) dsymm_thread_RL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) qsymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) qsymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) qsymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) qsymm_thread_RL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) csymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) csymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) csymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) csymm_thread_RL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) zsymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c 
../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) zsymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) zsymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) zsymm_thread_RL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) xsymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) xsymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) xsymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) xsymm_thread_RL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) ssyrk_UN.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) ssyrk_UT.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) ssyrk_LN.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) ssyrk_LT.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) dsyrk_UN.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) dsyrk_UT.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) dsyrk_LN.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) dsyrk_LT.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) qsyrk_UN.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) qsyrk_UT.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) qsyrk_LN.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) qsyrk_LT.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) csyrk_UN.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) csyrk_UT.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) csyrk_LN.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) csyrk_LT.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) zsyrk_UN.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) zsyrk_UT.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) zsyrk_LN.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) zsyrk_LT.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) xsyrk_UN.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) 
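# The symm objects are built from symm_k.c with -DLOWER/-ULOWER selecting the
# stored triangle and -DRSIDE/-URSIDE selecting the side; the *_thread_*
# variants add -DTHREADED_LEVEL3 and level3_thread.c as with gemm. The syrk
# objects around this point come from syrk_k.c with level3_syrk.c and use a
# two-letter suffix: U/L for -ULOWER/-DLOWER and N/T for -UTRANS/-DTRANS.
# For example, dsyrk_LT above compiles syrk_k.c with
# -DDOUBLE -UCOMPLEX -DLOWER -DTRANS.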
xsyrk_UT.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) xsyrk_LN.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) xsyrk_LT.$(PSUFFIX) : syrk_k.c level3_syrk.c $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) ssyrk_thread_UN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) ssyrk_thread_UT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) ssyrk_thread_LN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) ssyrk_thread_LT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) dsyrk_thread_UN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) dsyrk_thread_UT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) dsyrk_thread_LN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) dsyrk_thread_LT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) qsyrk_thread_UN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) qsyrk_thread_UT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) qsyrk_thread_LN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) qsyrk_thread_LT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) csyrk_thread_UN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) csyrk_thread_UT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) csyrk_thread_LN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) csyrk_thread_LT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) zsyrk_thread_UN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) zsyrk_thread_UT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) zsyrk_thread_LN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) zsyrk_thread_LT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) xsyrk_thread_UN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) xsyrk_thread_UT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) 
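# The ssyrk_thread_* .. xsyrk_thread_* objects swap level3_syrk.c for
# level3_syrk_threaded.c and add -DTHREADED_LEVEL3, keeping the same U/L and
# N/T flag mapping. The syrk_kernel and syr2k / syr2k_kernel objects that
# follow reuse the same -DLOWER/-ULOWER (and, for syr2k, -DTRANS/-UTRANS)
# encoding, built from syrk_kernel.c, from syr2k_k.c with level3_syr2k.c,
# and from syr2k_kernel.c respectively.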
xsyrk_thread_LN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) xsyrk_thread_LT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) ssyrk_kernel_U.$(PSUFFIX) : syrk_kernel.c $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) ssyrk_kernel_L.$(PSUFFIX) : syrk_kernel.c $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) dsyrk_kernel_U.$(PSUFFIX) : syrk_kernel.c $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) dsyrk_kernel_L.$(PSUFFIX) : syrk_kernel.c $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) qsyrk_kernel_U.$(PSUFFIX) : syrk_kernel.c $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) qsyrk_kernel_L.$(PSUFFIX) : syrk_kernel.c $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) csyrk_kernel_U.$(PSUFFIX) : syrk_kernel.c $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) csyrk_kernel_L.$(PSUFFIX) : syrk_kernel.c $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) zsyrk_kernel_U.$(PSUFFIX) : syrk_kernel.c $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) zsyrk_kernel_L.$(PSUFFIX) : syrk_kernel.c $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) xsyrk_kernel_U.$(PSUFFIX) : syrk_kernel.c $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) xsyrk_kernel_L.$(PSUFFIX) : syrk_kernel.c $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) syrk_thread.$(PSUFFIX) : syrk_thread.c ../../common.h $(CC) -c $(PFLAGS) $< -o $(@F) ssyr2k_UN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) ssyr2k_UT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) ssyr2k_LN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) ssyr2k_LT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) dsyr2k_UN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) dsyr2k_UT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) dsyr2k_LN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) dsyr2k_LT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) qsyr2k_UN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) qsyr2k_UT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) qsyr2k_LN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) qsyr2k_LT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) csyr2k_UN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) csyr2k_UT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) csyr2k_LN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) csyr2k_LT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) zsyr2k_UN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) zsyr2k_UT.$(PSUFFIX) : 
syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) zsyr2k_LN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) zsyr2k_LT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) xsyr2k_UN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) xsyr2k_UT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) xsyr2k_LN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) xsyr2k_LT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) ssyr2k_kernel_U.$(PSUFFIX) : syr2k_kernel.c $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) ssyr2k_kernel_L.$(PSUFFIX) : syr2k_kernel.c $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) dsyr2k_kernel_U.$(PSUFFIX) : syr2k_kernel.c $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) dsyr2k_kernel_L.$(PSUFFIX) : syr2k_kernel.c $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) qsyr2k_kernel_U.$(PSUFFIX) : syr2k_kernel.c $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) qsyr2k_kernel_L.$(PSUFFIX) : syr2k_kernel.c $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) csyr2k_kernel_U.$(PSUFFIX) : syr2k_kernel.c $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) csyr2k_kernel_L.$(PSUFFIX) : syr2k_kernel.c $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) zsyr2k_kernel_U.$(PSUFFIX) : syr2k_kernel.c $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) zsyr2k_kernel_L.$(PSUFFIX) : syr2k_kernel.c $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) xsyr2k_kernel_U.$(PSUFFIX) : syr2k_kernel.c $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) xsyr2k_kernel_L.$(PSUFFIX) : syr2k_kernel.c $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) chemm_LU.$(PSUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) chemm_LL.$(PSUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) chemm_RU.$(PSUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) chemm_RL.$(PSUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) zhemm_LU.$(PSUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) zhemm_LL.$(PSUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) zhemm_RU.$(PSUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) zhemm_RL.$(PSUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) xhemm_LU.$(PSUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) xhemm_LL.$(PSUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) xhemm_RU.$(PSUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) xhemm_RL.$(PSUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) chemm_thread_LU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER 
-URSIDE -DNN $< -o $(@F) chemm_thread_LL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) chemm_thread_RU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) chemm_thread_RL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) zhemm_thread_LU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) zhemm_thread_LL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) zhemm_thread_RU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) zhemm_thread_RL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) xhemm_thread_LU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) xhemm_thread_LL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) xhemm_thread_RU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) xhemm_thread_RL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) cherk_UN.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) cherk_UC.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) cherk_LN.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) cherk_LC.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) zherk_UN.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) zherk_UC.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) zherk_LN.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) zherk_LC.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) xherk_UN.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) xherk_UC.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) xherk_LN.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) xherk_LC.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) cherk_kernel_UN.$(PSUFFIX) : zherk_kernel.c $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX 
-ULOWER -UCONJ $< -o $(@F) cherk_kernel_UC.$(PSUFFIX) : zherk_kernel.c $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) cherk_kernel_LN.$(PSUFFIX) : zherk_kernel.c $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) cherk_kernel_LC.$(PSUFFIX) : zherk_kernel.c $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) zherk_kernel_UN.$(PSUFFIX) : zherk_kernel.c $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) zherk_kernel_UC.$(PSUFFIX) : zherk_kernel.c $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) zherk_kernel_LN.$(PSUFFIX) : zherk_kernel.c $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) zherk_kernel_LC.$(PSUFFIX) : zherk_kernel.c $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) xherk_kernel_UN.$(PSUFFIX) : zherk_kernel.c $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) xherk_kernel_UC.$(PSUFFIX) : zherk_kernel.c $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) xherk_kernel_LN.$(PSUFFIX) : zherk_kernel.c $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) xherk_kernel_LC.$(PSUFFIX) : zherk_kernel.c $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) cherk_thread_UN.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -UTRANS -UCONJ $< -o $(@F) cherk_thread_UC.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -DTRANS -DCONJ $< -o $(@F) cherk_thread_LN.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -UTRANS -UCONJ $< -o $(@F) cherk_thread_LC.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -DTRANS -DCONJ $< -o $(@F) zherk_thread_UN.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -UTRANS -UCONJ $< -o $(@F) zherk_thread_UC.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -DTRANS -DCONJ $< -o $(@F) zherk_thread_LN.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -UTRANS -UCONJ $< -o $(@F) zherk_thread_LC.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -DTRANS -DCONJ $< -o $(@F) xherk_thread_UN.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS -UCONJ $< -o $(@F) xherk_thread_UC.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS -DCONJ $< -o $(@F) xherk_thread_LN.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS -UCONJ $< -o $(@F) xherk_thread_LC.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS -DCONJ $< -o $(@F) cher2k_UN.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(PFLAGS) -DHER2K -UDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) cher2k_UC.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(PFLAGS) -DHER2K -UDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) cher2k_LN.$(PSUFFIX) : zher2k_k.c 
level3_syr2k.c ../../common.h $(CC) -c $(PFLAGS) -DHER2K -UDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) cher2k_LC.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(PFLAGS) -DHER2K -UDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) zher2k_UN.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(PFLAGS) -DHER2K -DDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) zher2k_UC.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(PFLAGS) -DHER2K -DDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) zher2k_LN.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(PFLAGS) -DHER2K -DDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) zher2k_LC.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(PFLAGS) -DHER2K -DDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) xher2k_UN.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(PFLAGS) -DHER2K -DXDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) xher2k_UC.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(PFLAGS) -DHER2K -DXDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) xher2k_LN.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(PFLAGS) -DHER2K -DXDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) xher2k_LC.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h $(CC) -c $(PFLAGS) -DHER2K -DXDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) cher2k_kernel_UN.$(PSUFFIX) : zher2k_kernel.c $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) cher2k_kernel_UC.$(PSUFFIX) : zher2k_kernel.c $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) cher2k_kernel_LN.$(PSUFFIX) : zher2k_kernel.c $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) cher2k_kernel_LC.$(PSUFFIX) : zher2k_kernel.c $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) zher2k_kernel_UN.$(PSUFFIX) : zher2k_kernel.c $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) zher2k_kernel_UC.$(PSUFFIX) : zher2k_kernel.c $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) zher2k_kernel_LN.$(PSUFFIX) : zher2k_kernel.c $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) zher2k_kernel_LC.$(PSUFFIX) : zher2k_kernel.c $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) xher2k_kernel_UN.$(PSUFFIX) : zher2k_kernel.c $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) xher2k_kernel_UC.$(PSUFFIX) : zher2k_kernel.c $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) xher2k_kernel_LN.$(PSUFFIX) : zher2k_kernel.c $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) xher2k_kernel_LC.$(PSUFFIX) : zher2k_kernel.c $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) cgemm3m_nn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) cgemm3m_nt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) cgemm3m_nr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) cgemm3m_nc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) cgemm3m_tn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) cgemm3m_tt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) cgemm3m_tr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) 
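# Note: the cgemm3m_* rules above and below all compile the same gemm3m.c
# driver; -UDOUBLE -DCOMPLEX selects single-precision complex, and the
# two-letter suffix picks the transpose/conjugate combination of A and B
# (N = no transpose, T = transpose, R = conjugate only, C = conjugate
# transpose) via the matching -DNN ... -DCC define. Assuming the usual
# profiled-object suffix PSUFFIX=po from Makefile.system (an assumption, not
# stated here), the rule above expands roughly to:
#   $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTR gemm3m.c -o cgemm3m_tr.po
# The zgemm3m_* and xgemm3m_* families below repeat the same pattern with
# -DDOUBLE and -DXDOUBLE.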
cgemm3m_tc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) cgemm3m_rn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) cgemm3m_rt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) cgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) cgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) zgemm3m_nn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) zgemm3m_nt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) zgemm3m_nr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) zgemm3m_nc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) zgemm3m_tn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) zgemm3m_tt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) zgemm3m_tr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) zgemm3m_tc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) zgemm3m_rn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) zgemm3m_rt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) zgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) zgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) zgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) xgemm3m_nn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) xgemm3m_nt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) xgemm3m_nr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) xgemm3m_nc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) xgemm3m_tn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) xgemm3m_tt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c 
../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) xgemm3m_tr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) xgemm3m_tc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) xgemm3m_rn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) xgemm3m_rt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) xgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) xgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) xgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) cgemmf.$(PSUFFIX) : zgemmf.c $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX $< -o $(@F) zgemmf.$(PSUFFIX) : zgemmf.c $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX $< -o $(@F) xgemmf.$(PSUFFIX) : zgemmf.c $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX $< -o $(@F) cgemm3m_thread_nn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) cgemm3m_thread_nt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) cgemm3m_thread_nr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) cgemm3m_thread_nc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) cgemm3m_thread_tn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) cgemm3m_thread_tt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) cgemm3m_thread_tr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) cgemm3m_thread_tc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) cgemm3m_thread_rn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) cgemm3m_thread_rt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) cgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) 
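# Note: the *gemm3m_thread_* objects are built from the same sources as the
# serial gemm3m objects; the only difference in these rules is the extra
# -DTHREADED_LEVEL3 define, which makes gemm3m.c include the multithreaded
# driver (level3_gemm3m_thread.c) instead of gemm3m_level3.c.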
cgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) cgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) zgemm3m_thread_nn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) zgemm3m_thread_nt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) zgemm3m_thread_nr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) zgemm3m_thread_nc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) zgemm3m_thread_tn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) zgemm3m_thread_tt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) zgemm3m_thread_tr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) zgemm3m_thread_tc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) zgemm3m_thread_rn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) zgemm3m_thread_rt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) zgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) zgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) zgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) xgemm3m_thread_nn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) xgemm3m_thread_nt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) xgemm3m_thread_nr.$(PSUFFIX) : 
gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) xgemm3m_thread_nc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) xgemm3m_thread_tn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) xgemm3m_thread_tt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) xgemm3m_thread_tr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) xgemm3m_thread_tc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) xgemm3m_thread_rn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) xgemm3m_thread_rt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) xgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) xgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) xgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) csymm3m_LU.$(PSUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) csymm3m_LL.$(PSUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) csymm3m_RU.$(PSUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) csymm3m_RL.$(PSUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) zsymm3m_LU.$(PSUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) zsymm3m_LL.$(PSUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) zsymm3m_RU.$(PSUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) zsymm3m_RL.$(PSUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) xsymm3m_LU.$(PSUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) xsymm3m_LL.$(PSUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) xsymm3m_RU.$(PSUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) 
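# Note: for the symm3m/hemm3m families the suffix encodes side and triangle
# rather than transposition: L*/R* toggles -URSIDE/-DRSIDE (symmetric or
# Hermitian operand on the left or the right) and *U/*L toggles
# -ULOWER/-DLOWER (upper or lower triangle referenced). -DNN is passed
# unconditionally, presumably because this operand is never transposed.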
xsymm3m_RL.$(PSUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) csymm3m_thread_LU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) csymm3m_thread_LL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) csymm3m_thread_RU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) csymm3m_thread_RL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) zsymm3m_thread_LU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) zsymm3m_thread_LL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) zsymm3m_thread_RU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) zsymm3m_thread_RL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) xsymm3m_thread_LU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) xsymm3m_thread_LL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) xsymm3m_thread_RU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) xsymm3m_thread_RL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) chemm3m_LU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) chemm3m_LL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) chemm3m_RU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) chemm3m_RL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) zhemm3m_LU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) zhemm3m_LL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) zhemm3m_RU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) zhemm3m_RL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) xhemm3m_LU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) xhemm3m_LL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) xhemm3m_RU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c 
../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) xhemm3m_RL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) chemm3m_thread_LU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) chemm3m_thread_LL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) chemm3m_thread_RU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) chemm3m_thread_RL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) zhemm3m_thread_LU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) zhemm3m_thread_LL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) zhemm3m_thread_RU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) zhemm3m_thread_RL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) xhemm3m_thread_LU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) xhemm3m_thread_LL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) xhemm3m_thread_RU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) xhemm3m_thread_RL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) strsm_LNUU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) strsm_LNUN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) strsm_LNLU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) strsm_LNLN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) strsm_LTUU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) strsm_LTUN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) strsm_LTLU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) strsm_LTLN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) strsm_RNUU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) strsm_RNUN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) strsm_RNLU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) strsm_RNLN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) strsm_RTUU.$(PSUFFIX) : 
trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) strsm_RTUN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) strsm_RTLU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) strsm_RTLN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) dtrsm_LNUU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) dtrsm_LNUN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) dtrsm_LNLU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) dtrsm_LNLN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) dtrsm_LTUU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) dtrsm_LTUN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) dtrsm_LTLU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) dtrsm_LTLN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) dtrsm_RNUU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) dtrsm_RNUN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) dtrsm_RNLU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) dtrsm_RNLN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) dtrsm_RTUU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) dtrsm_RTUN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) dtrsm_RTLU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) dtrsm_RTLN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) qtrsm_LNUU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) qtrsm_LNUN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) qtrsm_LNLU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) qtrsm_LNLN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) qtrsm_LTUU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) qtrsm_LTUN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) qtrsm_LTLU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) qtrsm_LTLN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) qtrsm_RNUU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) qtrsm_RNUN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) qtrsm_RNLU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) qtrsm_RNLN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) qtrsm_RTUU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) qtrsm_RTUN.$(PSUFFIX) : trsm_R.c 
$(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) qtrsm_RTLU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) qtrsm_RTLN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) ctrsm_LNUU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ctrsm_LNUN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ctrsm_LNLU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ctrsm_LNLN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ctrsm_LTUU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ctrsm_LTUN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ctrsm_LTLU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ctrsm_LTLN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ctrsm_LRUU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ctrsm_LRUN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ctrsm_LRLU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ctrsm_LRLN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ctrsm_LCUU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ctrsm_LCUN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ctrsm_LCLU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ctrsm_LCLN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ctrsm_RNUU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ctrsm_RNUN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ctrsm_RNLU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ctrsm_RNLN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ctrsm_RTUU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ctrsm_RTUN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ctrsm_RTLU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ctrsm_RTLN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ctrsm_RRUU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ctrsm_RRUN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ctrsm_RRLU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ctrsm_RRLN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ctrsm_RCUU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE 
-DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ctrsm_RCUN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ctrsm_RCLU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ctrsm_RCLN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ztrsm_LNUU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ztrsm_LNUN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ztrsm_LNLU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ztrsm_LNLN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ztrsm_LTUU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ztrsm_LTUN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ztrsm_LTLU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ztrsm_LTLN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ztrsm_LRUU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ztrsm_LRUN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ztrsm_LRLU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ztrsm_LRLN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ztrsm_LCUU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ztrsm_LCUN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ztrsm_LCLU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ztrsm_LCLN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) ztrsm_RNUU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ztrsm_RNUN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ztrsm_RNLU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ztrsm_RNLN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ztrsm_RTUU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) ztrsm_RTUN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) ztrsm_RTLU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) ztrsm_RTLN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) ztrsm_RRUU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ztrsm_RRUN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ztrsm_RRLU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ztrsm_RRLN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER 
-UUNIT -DCONJ $< -o $(@F) ztrsm_RCUU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) ztrsm_RCUN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) ztrsm_RCLU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) ztrsm_RCLN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) xtrsm_LNUU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) xtrsm_LNUN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) xtrsm_LNLU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) xtrsm_LNLN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) xtrsm_LTUU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) xtrsm_LTUN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) xtrsm_LTLU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) xtrsm_LTLN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) xtrsm_LRUU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) xtrsm_LRUN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) xtrsm_LRLU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) xtrsm_LRLN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) xtrsm_LCUU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) xtrsm_LCUN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) xtrsm_LCLU.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) xtrsm_LCLN.$(PSUFFIX) : trsm_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) xtrsm_RNUU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) xtrsm_RNUN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) xtrsm_RNLU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) xtrsm_RNLN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) xtrsm_RTUU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) xtrsm_RTUN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) xtrsm_RTLU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) xtrsm_RTLN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) xtrsm_RRUU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) xtrsm_RRUN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) xtrsm_RRLU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA 
-UUPPER -DUNIT -DCONJ $< -o $(@F) xtrsm_RRLN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) xtrsm_RCUU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) xtrsm_RCUN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) xtrsm_RCLU.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) xtrsm_RCLN.$(PSUFFIX) : trsm_R.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) include ../../Makefile.tail OpenBLAS-0.2.20/driver/level3/gemm.c000066400000000000000000000057511313527062700167320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #undef TIMING #ifdef PARAMTEST #undef GEMM_P #undef GEMM_Q #undef GEMM_R #define GEMM_P (args -> gemm_p) #define GEMM_Q (args -> gemm_q) #define GEMM_R (args -> gemm_r) #endif #if 0 #undef GEMM_P #undef GEMM_Q #define GEMM_P 504 #define GEMM_Q 128 #endif #ifdef THREADED_LEVEL3 #include "level3_thread.c" #else #include "level3.c" #endif OpenBLAS-0.2.20/driver/level3/gemm3m.c000066400000000000000000000056451313527062700171740ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #undef TIMING #ifdef PARAMTEST #undef GEMM_P #undef GEMM_Q #undef GEMM_R #define GEMM_P (args -> gemm_p) #define GEMM_Q (args -> gemm_q) #define GEMM_R (args -> gemm_r) #endif #ifdef THREADED_LEVEL3 #include "level3_gemm3m_thread.c" #else #include "gemm3m_level3.c" #endif OpenBLAS-0.2.20/driver/level3/gemm3m_level3.c000066400000000000000000000360521313527062700204420ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef BETA_OPERATION #define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ BETA[0], BETA[1], NULL, 0, NULL, 0, \ (FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC) #endif #ifndef ICOPYB_OPERATION #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ defined(RN) || defined(RT) || defined(RC) || defined(RR) #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ GEMM3M_ITCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER) #else #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ GEMM3M_INCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER) #endif #endif #ifndef ICOPYR_OPERATION #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ defined(RN) || defined(RT) || defined(RC) || defined(RR) #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ GEMM3M_ITCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER) #else #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ GEMM3M_INCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER) #endif #endif #ifndef ICOPYI_OPERATION #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ defined(RN) || defined(RT) || defined(RC) || defined(RR) #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ GEMM3M_ITCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER) #else #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ GEMM3M_INCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER) #endif #endif #ifndef OCOPYB_OPERATION #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ defined(NR) || defined(TR) || defined(CR) || defined(RR) #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ GEMM3M_ONCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER) #else #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ GEMM3M_OTCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER) #endif #endif #ifndef OCOPYR_OPERATION #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ defined(NR) || defined(TR) || defined(CR) || defined(RR) #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ GEMM3M_ONCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER) #else #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ GEMM3M_OTCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER) #endif #endif #ifndef 
OCOPYI_OPERATION #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ defined(NR) || defined(TR) || defined(CR) || defined(RR) #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ GEMM3M_ONCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER) #else #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ GEMM3M_OTCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER) #endif #endif #ifndef KERNEL_FUNC #define KERNEL_FUNC GEMM3M_KERNEL #endif #ifndef KERNEL_OPERATION #define KERNEL_OPERATION(M, N, K, ALPHA_R, ALPHA_I, SA, SB, C, LDC, X, Y) \ KERNEL_FUNC(M, N, K, ALPHA_R, ALPHA_I, SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) #endif #ifndef A #define A args -> a #endif #ifndef LDA #define LDA args -> lda #endif #ifndef B #define B args -> b #endif #ifndef LDB #define LDB args -> ldb #endif #ifndef C #define C args -> c #endif #ifndef LDC #define LDC args -> ldc #endif #ifndef M #define M args -> m #endif #ifndef N #define N args -> n #endif #ifndef K #define K args -> k #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ALPHA1 ONE #define ALPHA2 ONE #define ALPHA5 ZERO #define ALPHA6 ONE #define ALPHA7 ONE #define ALPHA8 ZERO #define ALPHA11 ONE #define ALPHA12 -ONE #define ALPHA13 ZERO #define ALPHA14 ONE #define ALPHA17 -ONE #define ALPHA18 -ONE #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) #define ALPHA1 ONE #define ALPHA2 ONE #define ALPHA5 ONE #define ALPHA6 ZERO #define ALPHA7 ZERO #define ALPHA8 ONE #define ALPHA11 -ONE #define ALPHA12 -ONE #define ALPHA13 ONE #define ALPHA14 ZERO #define ALPHA17 -ONE #define ALPHA18 ONE #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ALPHA1 ONE #define ALPHA2 ONE #define ALPHA5 ONE #define ALPHA6 ZERO #define ALPHA7 ZERO #define ALPHA8 ONE #define ALPHA11 -ONE #define ALPHA12 ONE #define ALPHA13 ONE #define ALPHA14 ZERO #define ALPHA17 -ONE #define ALPHA18 -ONE #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) #define ALPHA1 ONE #define ALPHA2 ONE #define ALPHA5 ZERO #define ALPHA6 -ONE #define ALPHA7 ONE #define ALPHA8 ZERO #define ALPHA11 ONE #define ALPHA12 ONE #define ALPHA13 ZERO #define ALPHA14 ONE #define ALPHA17 -ONE #define ALPHA18 ONE #endif #ifdef TIMING #define START_RPCC() rpcc_counter = rpcc() #define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter #else #define START_RPCC() #define STOP_RPCC(COUNTER) #endif int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy){ BLASLONG k, lda, ldb, ldc; FLOAT *alpha, *beta; FLOAT *a, *b, *c; BLASLONG m_from, m_to, n_from, n_to; BLASLONG ls, is, js, jjs; BLASLONG min_l, min_i, min_j, min_jj; #ifdef TIMING BLASULONG rpcc_counter; BLASULONG BLASLONG innercost = 0; BLASULONG BLASLONG outercost = 0; BLASULONG BLASLONG kernelcost = 0; double total; #endif k = K; a = (FLOAT *)A; b = (FLOAT *)B; c = (FLOAT *)C; lda = LDA; ldb = LDB; ldc = LDC; alpha = (FLOAT *)args -> alpha; beta = (FLOAT *)args -> beta; m_from = 0; m_to = M; if (range_m) { m_from = *(((BLASLONG *)range_m) + 0); m_to = *(((BLASLONG *)range_m) + 1); } n_from = 0; n_to = N; if (range_n) { n_from = *(((BLASLONG *)range_n) + 0); n_to = *(((BLASLONG *)range_n) + 1); } if (beta) { #ifndef COMPLEX if (beta[0] != ONE) #else if ((beta[0] != ONE) || (beta[1] != ZERO)) #endif BETA_OPERATION(m_from, m_to, n_from, n_to, beta, c, ldc); } if ((k == 0) || (alpha == NULL)) return 0; if 
((alpha[0] == ZERO) #ifdef COMPLEX && (alpha[1] == ZERO) #endif ) return 0; #if 0 printf("GEMM: M_from : %ld M_to : %ld N_from : %ld N_to : %ld k : %ld\n", m_from, m_to, n_from, n_to, k); printf("GEMM: P = %4ld Q = %4ld R = %4ld\n", (BLASLONG)GEMM3M_P, (BLASLONG)GEMM3M_Q, (BLASLONG)GEMM3M_R); printf("GEMM: SA .. %p SB .. %p\n", sa, sb); #endif #ifdef TIMING innercost = 0; outercost = 0; kernelcost = 0; #endif for(js = n_from; js < n_to; js += GEMM3M_R){ min_j = n_to - js; if (min_j > GEMM3M_R) min_j = GEMM3M_R; for(ls = 0; ls < k; ls += min_l){ min_l = k - ls; if (min_l >= GEMM3M_Q * 2) { min_l = GEMM3M_Q; } else { if (min_l > GEMM3M_Q) { min_l = (min_l + 1) / 2; #ifdef UNROLL_X min_l = ((min_l + UNROLL_X - 1)/UNROLL_X) * UNROLL_X; #endif } } min_i = m_to - m_from; if (min_i >= GEMM3M_P * 2) { min_i = GEMM3M_P; } else { if (min_i > GEMM3M_P) { min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M; } } START_RPCC(); ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); STOP_RPCC(innercost); for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; START_RPCC(); #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT) OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js)); #else OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js)); #endif STOP_RPCC(outercost); START_RPCC(); KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6, sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs); STOP_RPCC(kernelcost); } for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; if (min_i >= GEMM3M_P * 2) { min_i = GEMM3M_P; } else if (min_i > GEMM3M_P) { min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M; } START_RPCC(); ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa); STOP_RPCC(innercost); START_RPCC(); KERNEL_OPERATION(min_i, min_j, min_l, ALPHA5, ALPHA6, sa, sb, c, ldc, is, js); STOP_RPCC(kernelcost); } min_i = m_to - m_from; if (min_i >= GEMM3M_P * 2) { min_i = GEMM3M_P; } else { if (min_i > GEMM3M_P) { min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M; } } START_RPCC(); ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); STOP_RPCC(innercost); for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; START_RPCC(); #if defined(NN) || defined(NT) || defined(TN) || defined(TT) OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js)); #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js)); #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js)); #else OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js)); #endif STOP_RPCC(outercost); START_RPCC(); KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12, sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs); STOP_RPCC(kernelcost); } for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; if (min_i >= GEMM3M_P * 2) { min_i = GEMM3M_P; } else if (min_i > GEMM3M_P) { min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M; } START_RPCC(); ICOPYR_OPERATION(min_l, min_i, a, 
lda, ls, is, sa); STOP_RPCC(innercost); START_RPCC(); KERNEL_OPERATION(min_i, min_j, min_l, ALPHA11, ALPHA12, sa, sb, c, ldc, is, js); STOP_RPCC(kernelcost); } min_i = m_to - m_from; if (min_i >= GEMM3M_P * 2) { min_i = GEMM3M_P; } else { if (min_i > GEMM3M_P) { min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M; } } START_RPCC(); ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); STOP_RPCC(innercost); for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; START_RPCC(); #if defined(NN) || defined(NT) || defined(TN) || defined(TT) OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js)); #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js)); #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js)); #else OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js)); #endif STOP_RPCC(outercost); START_RPCC(); KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18, sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs); STOP_RPCC(kernelcost); } for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; if (min_i >= GEMM3M_P * 2) { min_i = GEMM3M_P; } else if (min_i > GEMM3M_P) { min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M; } START_RPCC(); ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa); STOP_RPCC(innercost); START_RPCC(); KERNEL_OPERATION(min_i, min_j, min_l, ALPHA17, ALPHA18, sa, sb, c, ldc, is, js); STOP_RPCC(kernelcost); } } /* end of js */ } /* end of ls */ #ifdef TIMING total = (double)outercost + (double)innercost + (double)kernelcost; printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f\n", innercost / total * 100., outercost / total * 100., kernelcost / total * 100.); printf( " Total %10.3f%% %10.3f MFlops\n", ((double)(m_to - m_from) * (double)(n_to - n_from) * (double)k) / (double)kernelcost / 2 * 100, 2400. * (2. * (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k) / (double)kernelcost); #endif return 0; } OpenBLAS-0.2.20/driver/level3/gemm_thread_m.c000066400000000000000000000073431313527062700205740ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) { blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; if (!range_m) { range[0] = 0; i = arg -> m; } else { range[0] = range_m[0]; i = range_m[1] - range_m[0]; } num_cpu = 0; while (i > 0){ width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); i -= width; if (i < 0) width = width + i; range[num_cpu + 1] = range[num_cpu] + width; queue[num_cpu].mode = mode; queue[num_cpu].routine = function; queue[num_cpu].args = arg; queue[num_cpu].range_m = &range[num_cpu]; queue[num_cpu].range_n = range_n; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; } if (num_cpu) { queue[0].sa = sa; queue[0].sb = sb; queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } return 0; } OpenBLAS-0.2.20/driver/level3/gemm_thread_mn.c000066400000000000000000000121401313527062700207410ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" static const int divide_rule[][2] = {{ 0, 0}, { 1, 1}, { 1, 2}, { 1, 3}, { 2, 2}, { 1, 5}, { 2, 3}, { 1, 7}, { 2, 4}, { 3, 3}, { 2, 5}, { 1, 11}, { 2, 6}, { 1, 13}, { 2, 7}, { 3, 5}, { 4, 4}, { 1, 17}, { 3, 6}, { 1, 19}, { 4, 5}, { 3, 7}, { 2, 11}, { 1, 23}, { 4, 6}, { 5, 5}, { 2, 13}, { 3, 9}, { 4, 7}, { 1, 29}, { 5, 6}, { 1, 31}, { 4, 8}, { 3, 11}, { 2, 17}, { 5, 7}, { 6, 6}, { 1, 37}, { 2, 19}, { 3, 13}, { 5, 8}, { 1, 41}, { 6, 7}, { 1, 43}, { 4, 11}, { 5, 9}, { 2, 23}, { 1, 47}, { 6, 8}, { 7, 7}, { 5, 10}, { 3, 17}, { 4, 13}, { 1, 53}, { 6, 9}, { 5, 11}, { 7, 8}, { 3, 19}, { 2, 29}, { 1, 59}, { 6, 10}, { 1, 61}, { 2, 31}, { 7, 9}, { 8, 8}, }; int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) { blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_M[MAX_CPU_NUMBER + 1], range_N[MAX_CPU_NUMBER + 1]; BLASLONG procs, num_cpu_m, num_cpu_n; BLASLONG width, i, j; BLASLONG divM, divN; divM = divide_rule[nthreads][0]; divN = divide_rule[nthreads][1]; if (!range_m) { range_M[0] = 0; i = arg -> m; } else { range_M[0] = range_m[0]; i = range_m[1] - range_m[0]; } num_cpu_m = 0; while (i > 0){ width = blas_quickdivide(i + divM - num_cpu_m - 1, divM - num_cpu_m); i -= width; if (i < 0) width = width + i; range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width; num_cpu_m ++; } if (!range_n) { range_N[0] = 0; i = arg -> n; } else { range_N[0] = range_n[0]; i = range_n[1] - range_n[0]; } num_cpu_n = 0; while (i > 0){ width = blas_quickdivide(i + divN - num_cpu_n - 1, divN - num_cpu_n); i -= width; if (i < 0) width = width + i; range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width; num_cpu_n ++; } procs = 0; for (j = 0; j < num_cpu_n; j++) { for (i = 0; i < num_cpu_m; i++) { queue[procs].mode = mode; queue[procs].routine = function; queue[procs].args = arg; queue[procs].range_m = &range_M[i]; queue[procs].range_n = &range_N[j]; queue[procs].sa = NULL; queue[procs].sb = NULL; queue[procs].next = &queue[procs + 1]; procs ++; } } if (procs) { queue[0].sa = sa; queue[0].sb = sb; queue[procs - 1].next = NULL; exec_blas(procs, queue); } return 0; } OpenBLAS-0.2.20/driver/level3/gemm_thread_n.c000066400000000000000000000077661313527062700206060ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) { blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; if (!range_n) { range[0] = 0; i = arg -> n; } else { range[0] = range_n[0]; i = range_n[1] - range_n[0]; } num_cpu = 0; while (i > 0){ width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); i -= width; if (i < 0) width = width + i; range[num_cpu + 1] = range[num_cpu] + width; queue[num_cpu].mode = mode; queue[num_cpu].routine = function; queue[num_cpu].args = arg; queue[num_cpu].range_m = range_m; queue[num_cpu].range_n = &range[num_cpu]; #if 0 //defined(LOONGSON3A) queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5; #else queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; #endif queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; } if (num_cpu) { #if 0 //defined(LOONGSON3A) queue[0].sa = sa; queue[0].sb = sa + GEMM_OFFSET_A1 * 5; #else queue[0].sa = sa; queue[0].sb = sb; #endif queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } return 0; } OpenBLAS-0.2.20/driver/level3/gemm_thread_variable.c000066400000000000000000000104571313527062700221250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG divM, BLASLONG divN) { blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_M[MAX_CPU_NUMBER + 1], range_N[MAX_CPU_NUMBER + 1]; BLASLONG procs, num_cpu_m, num_cpu_n; BLASLONG width, i, j; if (!range_m) { range_M[0] = 0; i = arg -> m; } else { range_M[0] = range_m[0]; i = range_m[1] - range_m[0]; } num_cpu_m = 0; while (i > 0){ width = blas_quickdivide(i + divM - num_cpu_m - 1, divM - num_cpu_m); i -= width; if (i < 0) width = width + i; range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width; num_cpu_m ++; } if (!range_n) { range_N[0] = 0; i = arg -> n; } else { range_N[0] = range_n[0]; i = range_n[1] - range_n[0]; } num_cpu_n = 0; while (i > 0){ width = blas_quickdivide(i + divN - num_cpu_n - 1, divN - num_cpu_n); i -= width; if (i < 0) width = width + i; range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width; num_cpu_n ++; } procs = 0; for (j = 0; j < num_cpu_n; j++) { for (i = 0; i < num_cpu_m; i++) { queue[procs].mode = mode; queue[procs].routine = function; queue[procs].args = arg; queue[procs].range_m = &range_M[i]; queue[procs].range_n = &range_N[j]; queue[procs].sa = NULL; queue[procs].sb = NULL; queue[procs].next = &queue[procs + 1]; procs ++; } } if (procs) { queue[0].sa = sa; queue[0].sb = sb; queue[procs - 1].next = NULL; exec_blas(procs, queue); } return 0; } OpenBLAS-0.2.20/driver/level3/hemm3m_k.c000066400000000000000000000112701313527062700174760ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #undef TIMING #define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ BETA[0], BETA[1], NULL, 0, NULL, 0, \ (FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC) #ifndef RSIDE #ifndef LOWER #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM3M_IUCOPYB(M, N, A, LDA, Y, X, BUFFER) #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM3M_IUCOPYR(M, N, A, LDA, Y, X, BUFFER) #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM3M_IUCOPYI(M, N, A, LDA, Y, X, BUFFER) #else #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM3M_ILCOPYB(M, N, A, LDA, Y, X, BUFFER) #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM3M_ILCOPYR(M, N, A, LDA, Y, X, BUFFER) #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM3M_ILCOPYI(M, N, A, LDA, Y, X, BUFFER) #endif #endif #ifdef RSIDE #ifndef LOWER #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ HEMM3M_OUCOPYB(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ HEMM3M_OUCOPYR(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ HEMM3M_OUCOPYI(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) #else #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ HEMM3M_OLCOPYB(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ HEMM3M_OLCOPYR(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ HEMM3M_OLCOPYI(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) #endif #endif #ifndef RSIDE #define K args -> m #ifndef LOWER #define GEMM3M_LOCAL HEMM3M_LU #else #define GEMM3M_LOCAL HEMM3M_LL #endif #else #define K args -> n #ifndef LOWER #define GEMM3M_LOCAL HEMM3M_RU #else #define GEMM3M_LOCAL HEMM3M_RL #endif #endif #ifdef THREADED_LEVEL3 #include "level3_gemm3m_thread.c" #else #include "gemm3m_level3.c" #endif 
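/* Illustrative note, added for exposition and not part of the original OpenBLAS sources:
   hemm3m_k.c above is a thin driver.  It only selects macros -- which HEMM3M packing kernels
   to use (the IU/IL/OU/OL x COPYB/COPYR/COPYI variants, chosen by RSIDE and LOWER), which
   operand supplies K, and which routine the shared templates see as GEMM3M_LOCAL
   (HEMM3M_LU/LL/RU/RL) -- and then #includes the template that expands those macros into the
   blocked loops: level3_gemm3m_thread.c when THREADED_LEVEL3 is defined, gemm3m_level3.c
   otherwise.  A minimal sketch of this configure-by-macro pattern, using purely hypothetical
   names and kept behind #if 0 so it cannot affect any build, follows: */
#if 0
/* driver_upper.c: pick the specialization, then pull in the generic template */
#define PACK_A(M, N, A, LDA, BUF)  pack_upper(M, N, A, LDA, BUF)   /* hypothetical packing routine */
#define ROUTINE_NAME               blocked_mult_upper              /* hypothetical routine name    */
#include "template.c"                                              /* hypothetical shared template */

/* template.c: generic code written only in terms of the macros defined by its includer */
int ROUTINE_NAME(int m, int n, double *a, int lda, double *buf) {
  PACK_A(m, n, a, lda, buf);   /* expands to whichever packer the driver selected */
  /* ... the blocked kernel loops of the real templates would follow here ... */
  return 0;
}
#endif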
OpenBLAS-0.2.20/driver/level3/level3.c000066400000000000000000000307211313527062700171720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ /* This file is a template for level 3 operation */ #ifndef BETA_OPERATION #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) #ifndef COMPLEX #define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ BETA[0], NULL, 0, NULL, 0, \ (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC) #else #define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ BETA[0], BETA[1], NULL, 0, NULL, 0, \ (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC) #endif #else #define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ BETA, NULL, 0, NULL, 0, \ (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC) #endif #endif #ifndef ICOPY_OPERATION #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ defined(RN) || defined(RT) || defined(RC) || defined(RR) #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #else #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif #ifndef OCOPY_OPERATION #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ defined(NR) || defined(TR) || defined(CR) || defined(RR) #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #else #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif #ifndef KERNEL_FUNC #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define KERNEL_FUNC GEMM_KERNEL_N #endif #if defined(CN) || defined(CT) || defined(RN) || defined(RT) #define KERNEL_FUNC GEMM_KERNEL_L #endif #if defined(NC) || defined(TC) || defined(NR) || defined(TR) #define KERNEL_FUNC GEMM_KERNEL_R #endif #if defined(CC) || defined(CR) || defined(RC) || defined(RR) #define KERNEL_FUNC GEMM_KERNEL_B #endif #endif #ifndef KERNEL_OPERATION #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) #ifndef COMPLEX #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) #else #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) #endif #else #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ KERNEL_FUNC(M, N, K, ALPHA, SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) #endif #endif #ifndef FUSED_KERNEL_OPERATION #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ defined(NR) || defined(TR) || defined(CR) || defined(RR) #ifndef COMPLEX #define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], SA, SB, \ (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) #else #define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \ (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) #endif #else #ifndef COMPLEX #define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ FUSED_GEMM_KERNEL_T(M, N, 
K, ALPHA[0], SA, SB, \ (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) #else #define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \ (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) #endif #endif #endif #ifndef A #define A args -> a #endif #ifndef LDA #define LDA args -> lda #endif #ifndef B #define B args -> b #endif #ifndef LDB #define LDB args -> ldb #endif #ifndef C #define C args -> c #endif #ifndef LDC #define LDC args -> ldc #endif #ifndef M #define M args -> m #endif #ifndef N #define N args -> n #endif #ifndef K #define K args -> k #endif #ifdef TIMING #define START_RPCC() rpcc_counter = rpcc() #define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter #else #define START_RPCC() #define STOP_RPCC(COUNTER) #endif int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, XFLOAT *sa, XFLOAT *sb, BLASLONG dummy){ BLASLONG k, lda, ldb, ldc; FLOAT *alpha, *beta; FLOAT *a, *b, *c; BLASLONG m_from, m_to, n_from, n_to; BLASLONG ls, is, js; BLASLONG min_l, min_i, min_j; #if !defined(FUSED_GEMM) || defined(TIMING) BLASLONG jjs, min_jj; #endif BLASLONG l1stride, gemm_p, l2size; #if defined(XDOUBLE) && defined(QUAD_PRECISION) xidouble xalpha; #endif #ifdef TIMING unsigned long long rpcc_counter; unsigned long long innercost = 0; unsigned long long outercost = 0; unsigned long long kernelcost = 0; double total; #endif k = K; a = (FLOAT *)A; b = (FLOAT *)B; c = (FLOAT *)C; lda = LDA; ldb = LDB; ldc = LDC; alpha = (FLOAT *)args -> alpha; beta = (FLOAT *)args -> beta; m_from = 0; m_to = M; if (range_m) { m_from = *(((BLASLONG *)range_m) + 0); m_to = *(((BLASLONG *)range_m) + 1); } n_from = 0; n_to = N; if (range_n) { n_from = *(((BLASLONG *)range_n) + 0); n_to = *(((BLASLONG *)range_n) + 1); } if (beta) { #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) #ifndef COMPLEX if (beta[0] != ONE #else if ((beta[0] != ONE) || (beta[1] != ZERO) #endif #else if (((beta[0].x[1] != 0x3fff000000000000UL) || beta[0].x[0] != 0) #ifdef COMPLEX &&(((beta[1].x[0] | beta[1].x[1]) << 1) != 0) #endif #endif ) { #if defined(XDOUBLE) && defined(QUAD_PRECISION) xidouble xbeta; qtox(&xbeta, beta); #endif BETA_OPERATION(m_from, m_to, n_from, n_to, beta, c, ldc); } } if ((k == 0) || (alpha == NULL)) return 0; #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) if ((alpha[0] == ZERO) #ifdef COMPLEX && (alpha[1] == ZERO) #endif ) return 0; #else if (((alpha[0].x[0] | alpha[0].x[1] #ifdef COMPLEX | alpha[1].x[0] | alpha[1].x[1] #endif ) << 1) == 0) return 0; #endif #if defined(XDOUBLE) && defined(QUAD_PRECISION) qtox(&xalpha, alpha); #endif l2size = GEMM_P * GEMM_Q; #if 0 fprintf(stderr, "GEMM(Single): M_from : %ld M_to : %ld N_from : %ld N_to : %ld k : %ld\n", m_from, m_to, n_from, n_to, k); fprintf(stderr, "GEMM(Single):: P = %4ld Q = %4ld R = %4ld\n", (BLASLONG)GEMM_P, (BLASLONG)GEMM_Q, (BLASLONG)GEMM_R); // fprintf(stderr, "GEMM: SA .. %p SB .. 
%p\n", sa, sb); // fprintf(stderr, "A = %p B = %p C = %p\n\tlda = %ld ldb = %ld ldc = %ld\n", a, b, c, lda, ldb, ldc); #endif #ifdef TIMING innercost = 0; outercost = 0; kernelcost = 0; #endif for(js = n_from; js < n_to; js += GEMM_R){ min_j = n_to - js; if (min_j > GEMM_R) min_j = GEMM_R; for(ls = 0; ls < k; ls += min_l){ min_l = k - ls; if (min_l >= GEMM_Q * 2) { gemm_p = GEMM_P; min_l = GEMM_Q; } else { if (min_l > GEMM_Q) { min_l = ((min_l / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; } gemm_p = ((l2size / min_l + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M; } /* First, we have to move data A to L2 cache */ min_i = m_to - m_from; l1stride = 1; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else { if (min_i > GEMM_P) { min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; } else { l1stride = 0; } } START_RPCC(); ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); STOP_RPCC(innercost); #if defined(FUSED_GEMM) && !defined(TIMING) FUSED_KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, b, ldb, c, ldc, m_from, js, ls); #else for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; START_RPCC(); OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE * l1stride); STOP_RPCC(outercost); START_RPCC(); #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); #else KERNEL_OPERATION(min_i, min_jj, min_l, (void *)&xalpha, sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); #endif STOP_RPCC(kernelcost); } #endif for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; } START_RPCC(); ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); STOP_RPCC(innercost); START_RPCC(); #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js); #else KERNEL_OPERATION(min_i, min_j, min_l, (void *)&xalpha, sa, sb, c, ldc, is, js); #endif STOP_RPCC(kernelcost); } /* end of is */ } /* end of js */ } /* end of ls */ #ifdef TIMING total = (double)outercost + (double)innercost + (double)kernelcost; printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n", innercost / total * 100., outercost / total * 100., kernelcost / total * 100., (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / (double)kernelcost * 100. * (double)COMPSIZE / 2., (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. * (double)COMPSIZE / 2.); #endif return 0; } OpenBLAS-0.2.20/driver/level3/level3_gemm3m_thread.c000066400000000000000000000673571313527062700220050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #ifndef CACHE_LINE_SIZE #define CACHE_LINE_SIZE 8 #endif #ifndef DIVIDE_RATE #define DIVIDE_RATE 2 #endif #ifndef SWITCH_RATIO #define SWITCH_RATIO 2 #endif //The array of job_t may overflow the stack. //Instead, use malloc to alloc job_t. 
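/* Illustrative sizing note (added, not from the original source): each job_t carries one
   cache-line-padded progress word per (thread, buffer-slice) pair, so the per-call array
   job[MAX_CPU_NUMBER] used by gemm_driver below occupies
   MAX_CPU_NUMBER * MAX_CPU_NUMBER * CACHE_LINE_SIZE * DIVIDE_RATE * sizeof(BLASLONG) bytes.
   With the defaults above (CACHE_LINE_SIZE = 8, DIVIDE_RATE = 2), an assumed MAX_CPU_NUMBER
   of 64 and an 8-byte BLASLONG, that is 64 * 64 * 8 * 2 * 8 = 524288 bytes (512 KB), hence
   the switch to malloc once MAX_CPU_NUMBER exceeds BLAS3_MEM_ALLOC_THRESHOLD. */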
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD #define USE_ALLOC_HEAP #endif #ifndef GEMM3M_LOCAL #if defined(NN) #define GEMM3M_LOCAL GEMM3M_NN #elif defined(NT) #define GEMM3M_LOCAL GEMM3M_NT #elif defined(NR) #define GEMM3M_LOCAL GEMM3M_NR #elif defined(NC) #define GEMM3M_LOCAL GEMM3M_NC #elif defined(TN) #define GEMM3M_LOCAL GEMM3M_TN #elif defined(TT) #define GEMM3M_LOCAL GEMM3M_TT #elif defined(TR) #define GEMM3M_LOCAL GEMM3M_TR #elif defined(TC) #define GEMM3M_LOCAL GEMM3M_TC #elif defined(RN) #define GEMM3M_LOCAL GEMM3M_RN #elif defined(RT) #define GEMM3M_LOCAL GEMM3M_RT #elif defined(RR) #define GEMM3M_LOCAL GEMM3M_RR #elif defined(RC) #define GEMM3M_LOCAL GEMM3M_RC #elif defined(CN) #define GEMM3M_LOCAL GEMM3M_CN #elif defined(CT) #define GEMM3M_LOCAL GEMM3M_CT #elif defined(CR) #define GEMM3M_LOCAL GEMM3M_CR #elif defined(CC) #define GEMM3M_LOCAL GEMM3M_CC #endif #endif typedef struct { volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; #ifndef BETA_OPERATION #define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ BETA[0], BETA[1], NULL, 0, NULL, 0, \ (FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC) #endif #ifndef ICOPYB_OPERATION #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ defined(RN) || defined(RT) || defined(RC) || defined(RR) #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ GEMM3M_ITCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #else #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ GEMM3M_INCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif #ifndef ICOPYR_OPERATION #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ defined(RN) || defined(RT) || defined(RC) || defined(RR) #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ GEMM3M_ITCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #else #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ GEMM3M_INCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif #ifndef ICOPYI_OPERATION #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ defined(RN) || defined(RT) || defined(RC) || defined(RR) #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ GEMM3M_ITCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #else #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ GEMM3M_INCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif #ifndef OCOPYB_OPERATION #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ defined(NR) || defined(TR) || defined(CR) || defined(RR) #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ GEMM3M_ONCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER); #else #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ GEMM3M_OTCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER); #endif #endif #ifndef OCOPYR_OPERATION #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ defined(NR) || defined(TR) || defined(CR) || defined(RR) #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ GEMM3M_ONCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER); #else #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ GEMM3M_OTCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, 
ALPHA_R, ALPHA_I, BUFFER); #endif #endif #ifndef OCOPYI_OPERATION #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ defined(NR) || defined(TR) || defined(CR) || defined(RR) #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ GEMM3M_ONCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER); #else #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ GEMM3M_OTCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER); #endif #endif #ifndef KERNEL_FUNC #define KERNEL_FUNC GEMM3M_KERNEL #endif #ifndef KERNEL_OPERATION #define KERNEL_OPERATION(M, N, K, ALPHA_R, ALPHA_I, SA, SB, C, LDC, X, Y) \ KERNEL_FUNC(M, N, K, ALPHA_R, ALPHA_I, SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) #endif #ifndef A #define A args -> a #endif #ifndef LDA #define LDA args -> lda #endif #ifndef B #define B args -> b #endif #ifndef LDB #define LDB args -> ldb #endif #ifndef C #define C args -> c #endif #ifndef LDC #define LDC args -> ldc #endif #ifndef M #define M args -> m #endif #ifndef N #define N args -> n #endif #ifndef K #define K args -> k #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ALPHA1 ONE #define ALPHA2 ONE #define ALPHA5 ZERO #define ALPHA6 ONE #define ALPHA7 ONE #define ALPHA8 ZERO #define ALPHA11 ONE #define ALPHA12 -ONE #define ALPHA13 ZERO #define ALPHA14 ONE #define ALPHA17 -ONE #define ALPHA18 -ONE #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) #define ALPHA1 ONE #define ALPHA2 ONE #define ALPHA5 ONE #define ALPHA6 ZERO #define ALPHA7 ZERO #define ALPHA8 ONE #define ALPHA11 -ONE #define ALPHA12 -ONE #define ALPHA13 ONE #define ALPHA14 ZERO #define ALPHA17 -ONE #define ALPHA18 ONE #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ALPHA1 ONE #define ALPHA2 ONE #define ALPHA5 ONE #define ALPHA6 ZERO #define ALPHA7 ZERO #define ALPHA8 ONE #define ALPHA11 -ONE #define ALPHA12 ONE #define ALPHA13 ONE #define ALPHA14 ZERO #define ALPHA17 -ONE #define ALPHA18 -ONE #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) #define ALPHA1 ONE #define ALPHA2 ONE #define ALPHA5 ZERO #define ALPHA6 -ONE #define ALPHA7 ONE #define ALPHA8 ZERO #define ALPHA11 ONE #define ALPHA12 ONE #define ALPHA13 ZERO #define ALPHA14 ONE #define ALPHA17 -ONE #define ALPHA18 ONE #endif #ifdef TIMING #define START_RPCC() rpcc_counter = rpcc() #define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter #else #define START_RPCC() #define STOP_RPCC(COUNTER) #endif static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ BLASLONG k, lda, ldb, ldc; BLASLONG m_from, m_to, n_from, n_to, N_from, N_to; FLOAT *alpha, *beta; FLOAT *a, *b, *c; job_t *job = (job_t *)args -> common; BLASLONG xxx, bufferside; FLOAT *buffer[DIVIDE_RATE]; BLASLONG ls, min_l, jjs, min_jj; BLASLONG is, min_i, div_n; BLASLONG i, current; #ifdef TIMING BLASLONG rpcc_counter; BLASLONG copy_A = 0; BLASLONG copy_B = 0; BLASLONG kernel = 0; BLASLONG waiting1 = 0; BLASLONG waiting2 = 0; BLASLONG waiting3 = 0; BLASLONG waiting6[MAX_CPU_NUMBER]; BLASLONG ops = 0; for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0; #endif k = K; a = (FLOAT *)A; b = (FLOAT *)B; c = (FLOAT *)C; lda = LDA; ldb = LDB; ldc = LDC; alpha = (FLOAT *)args -> alpha; beta = (FLOAT *)args -> beta; m_from = 0; m_to = M; if (range_m) { m_from = range_m[0]; m_to = range_m[1]; } n_from = 0; n_to = N; N_from = 0; N_to = N; if 
(range_n) { n_from = range_n[mypos + 0]; n_to = range_n[mypos + 1]; N_from = range_n[0]; N_to = range_n[args -> nthreads]; } if (beta) { if ((beta[0] != ONE) || (beta[1] != ZERO)) BETA_OPERATION(m_from, m_to, N_from, N_to, beta, c, ldc); } if ((k == 0) || (alpha == NULL)) return 0; if ((alpha[0] == ZERO) && (alpha[1] == ZERO)) return 0; #if 0 fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n", mypos, m_from, m_to, n_from, n_to, N_from, N_to); #endif div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; buffer[0] = sb; for (i = 1; i < DIVIDE_RATE; i++) { buffer[i] = buffer[i - 1] + GEMM3M_Q * (((div_n + GEMM3M_UNROLL_N - 1)/GEMM3M_UNROLL_N) * GEMM3M_UNROLL_N); } for(ls = 0; ls < k; ls += min_l){ min_l = k - ls; if (min_l >= GEMM3M_Q * 2) { min_l = GEMM3M_Q; } else { if (min_l > GEMM3M_Q) { min_l = (min_l + 1) / 2; } } min_i = m_to - m_from; if (min_i >= GEMM3M_P * 2) { min_i = GEMM3M_P; } else { if (min_i > GEMM3M_P) { min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M; } } START_RPCC(); ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); STOP_RPCC(copy_A); div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { START_RPCC(); /* Make sure if no one is using another buffer */ for (i = 0; i < args -> nthreads; i++) while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; STOP_RPCC(waiting1); for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; START_RPCC(); #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT) OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); #else OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); #endif STOP_RPCC(copy_B); START_RPCC(); KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6, sa, buffer[bufferside] + min_l * (jjs - xxx), c, ldc, m_from, jjs); STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * min_jj * min_l; #endif } for (i = 0; i < args -> nthreads; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; } current = mypos; do { current ++; if (current >= args -> nthreads) current = 0; div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { if (current != mypos) { START_RPCC(); /* thread has to wait */ while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; STOP_RPCC(waiting2); START_RPCC(); KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA5, ALPHA6, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, m_from, xxx); STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; #endif } if (m_to - m_from == min_i) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; } } } while (current != mypos); for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; if (min_i >= GEMM3M_P * 2) { min_i = GEMM3M_P; } else if (min_i > GEMM3M_P) { min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M; } START_RPCC(); ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa); 
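/* Remaining row panels of this pass: only the block of A is repacked here; the alpha-scaled
   B panels for this pass were already packed and published in job[*].working by every thread,
   so the do/while loop below simply walks those shared buffers, runs the kernel (ALPHA5/ALPHA6
   scaling) against each slice, and clears the flag when this thread has consumed its last
   row panel. */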
STOP_RPCC(copy_A); current = mypos; do { div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { START_RPCC(); KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA5, ALPHA6, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, xxx); STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l; #endif if (is + min_i >= m_to) { /* Thread doesn't need this buffer any more */ job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; } } current ++; if (current >= args -> nthreads) current = 0; } while (current != mypos); } /* end of is */ START_RPCC(); ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); STOP_RPCC(copy_A); div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { START_RPCC(); /* Make sure if no one is using another buffer */ for (i = 0; i < args -> nthreads; i++) while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; STOP_RPCC(waiting1); for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; START_RPCC(); #if defined(NN) || defined(NT) || defined(TN) || defined(TT) OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); #else OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); #endif STOP_RPCC(copy_B); START_RPCC(); KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12, sa, buffer[bufferside] + min_l * (jjs - xxx), c, ldc, m_from, jjs); STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * min_jj * min_l; #endif } for (i = 0; i < args -> nthreads; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; } current = mypos; do { current ++; if (current >= args -> nthreads) current = 0; div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { if (current != mypos) { START_RPCC(); /* thread has to wait */ while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; STOP_RPCC(waiting2); START_RPCC(); KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, m_from, xxx); STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; #endif } if (m_to - m_from == min_i) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; } } } while (current != mypos); for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; if (min_i >= GEMM3M_P * 2) { min_i = GEMM3M_P; } else if (min_i > GEMM3M_P) { min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M; } START_RPCC(); ICOPYR_OPERATION(min_l, min_i, a, lda, ls, is, sa); 
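/* Second pass over the remaining row panels: same traversal as above, but A is repacked with
   the ICOPYR variant and the kernel calls use the ALPHA11/ALPHA12 scaling. */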
STOP_RPCC(copy_A); current = mypos; do { div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { START_RPCC(); KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, xxx); STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l; #endif if (is + min_i >= m_to) { /* Thread doesn't need this buffer any more */ job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; } } current ++; if (current >= args -> nthreads) current = 0; } while (current != mypos); } /* end of is */ START_RPCC(); ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); STOP_RPCC(copy_A); div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { START_RPCC(); /* Make sure if no one is using another buffer */ for (i = 0; i < args -> nthreads; i++) while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; STOP_RPCC(waiting1); for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; START_RPCC(); #if defined(NN) || defined(NT) || defined(TN) || defined(TT) OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); #else OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); #endif STOP_RPCC(copy_B); START_RPCC(); KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18, sa, buffer[bufferside] + min_l * (jjs - xxx), c, ldc, m_from, jjs); STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * min_jj * min_l; #endif } for (i = 0; i < args -> nthreads; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; } current = mypos; do { current ++; if (current >= args -> nthreads) current = 0; div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { if (current != mypos) { START_RPCC(); /* thread has to wait */ while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; STOP_RPCC(waiting2); START_RPCC(); KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, m_from, xxx); STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; #endif } if (m_to - m_from == min_i) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; } } } while (current != mypos); for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; if (min_i >= GEMM3M_P * 2) { min_i = GEMM3M_P; } else if (min_i > GEMM3M_P) { min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M; } START_RPCC(); ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa); 
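/* Third pass over the remaining row panels: again the same traversal, now with the ICOPYI
   packing of A and the ALPHA17/ALPHA18 scaling in the kernel calls. */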
STOP_RPCC(copy_A); current = mypos; do { div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { START_RPCC(); KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, xxx); STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l; #endif if (is + min_i >= m_to) { /* Thread doesn't need this buffer any more */ job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; } } current ++; if (current >= args -> nthreads) current = 0; } while (current != mypos); } /* end of is */ } START_RPCC(); for (i = 0; i < args -> nthreads; i++) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; } } STOP_RPCC(waiting3); #ifdef TIMING BLASLONG waiting = waiting1 + waiting2 + waiting3; BLASLONG total = copy_A + copy_B + kernel + waiting; fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait : %6.2f Kernel : %6.2f\n", mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100., (double)waiting /(double)total * 100., (double)ops/(double)kernel / 2. * 100.); fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n", mypos, copy_A, copy_B, waiting); #if 0 fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n", mypos, (double)waiting1/(double)waiting * 100., (double)waiting2/(double)waiting * 100., (double)waiting3/(double)waiting * 100.); #endif fprintf(stderr, "\n"); #endif return 0; } static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ blas_arg_t newarg; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_M[MAX_CPU_NUMBER + 1]; BLASLONG range_N[MAX_CPU_NUMBER + 1]; #ifndef USE_ALLOC_HEAP job_t job[MAX_CPU_NUMBER]; #else job_t * job = NULL; #endif BLASLONG num_cpu_m, num_cpu_n; BLASLONG nthreads = args -> nthreads; BLASLONG width, i, j, k, js; BLASLONG m, n, n_from, n_to; int mode; #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL | BLAS_NODE; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL | BLAS_NODE; #else mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE; #endif newarg.m = args -> m; newarg.n = args -> n; newarg.k = args -> k; newarg.a = args -> a; newarg.b = args -> b; newarg.c = args -> c; newarg.lda = args -> lda; newarg.ldb = args -> ldb; newarg.ldc = args -> ldc; newarg.alpha = args -> alpha; newarg.beta = args -> beta; newarg.nthreads = args -> nthreads; #ifdef USE_ALLOC_HEAP job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); if(job==NULL){ fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); exit(1); } #endif newarg.common = (void *)job; if (!range_m) { range_M[0] = 0; m = args -> m; } else { range_M[0] = range_m[0]; m = range_m[1] - range_m[0]; } num_cpu_m = 0; while (m > 0){ width = blas_quickdivide(m + nthreads - num_cpu_m - 1, nthreads - num_cpu_m); m -= width; if (m < 0) width = width + m; range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width; num_cpu_m ++; } for (i = 0; i < num_cpu_m; i++) { queue[i].mode = mode; queue[i].routine = inner_thread; queue[i].args = &newarg; queue[i].range_m = &range_M[i]; queue[i].range_n = &range_N[0]; queue[i].sa = NULL; queue[i].sb = NULL; queue[i].next = &queue[i + 1]; } queue[0].sa = sa; queue[0].sb = sb; if (!range_n) { n_from = 0; n_to = args -> n; } else { n_from = range_n[0]; n_to = 
range_n[1]; } for(js = n_from; js < n_to; js += GEMM_R * nthreads){ n = n_to - js; if (n > GEMM_R * nthreads) n = GEMM_R * nthreads; range_N[0] = js; num_cpu_n = 0; while (n > 0){ width = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n); n -= width; if (n < 0) width = width + n; range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width; num_cpu_n ++; } for (j = 0; j < num_cpu_m; j++) { for (i = 0; i < num_cpu_m; i++) { for (k = 0; k < DIVIDE_RATE; k++) { job[j].working[i][CACHE_LINE_SIZE * k] = 0; } } } queue[num_cpu_m - 1].next = NULL; exec_blas(num_cpu_m, queue); } #ifdef USE_ALLOC_HEAP free(job); #endif return 0; } int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ BLASLONG m = args -> m; BLASLONG n = args -> n; BLASLONG nthreads = args -> nthreads; BLASLONG divN, divT; int mode; if (range_m) { BLASLONG m_from = *(((BLASLONG *)range_m) + 0); BLASLONG m_to = *(((BLASLONG *)range_m) + 1); m = m_to - m_from; } if (range_n) { BLASLONG n_from = *(((BLASLONG *)range_n) + 0); BLASLONG n_to = *(((BLASLONG *)range_n) + 1); n = n_to - n_from; } if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) { GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0); return 0; } divT = nthreads; divN = 1; while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) { do { divT --; divN = 1; while (divT * divN < nthreads) divN ++; } while ((divT * divN != nthreads) && (divT > 1)); } args -> nthreads = divT; if (divN == 1){ gemm_driver(args, range_m, range_n, sa, sb, 0); } else { #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #if defined(TN) || defined(TT) || defined(TR) || defined(TC) || \ defined(CN) || defined(CT) || defined(CR) || defined(CC) mode |= (BLAS_TRANSA_T); #endif #if defined(NT) || defined(TT) || defined(RT) || defined(CT) || \ defined(NC) || defined(TC) || defined(RC) || defined(CC) mode |= (BLAS_TRANSB_T); #endif gemm_thread_n(mode, args, range_m, range_n, gemm_driver, sa, sb, divN); } return 0; } OpenBLAS-0.2.20/driver/level3/level3_syr2k.c000066400000000000000000000264551313527062700203350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #ifndef KERNEL_OPERATION #ifndef COMPLEX #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y, FLAG) \ KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y), FLAG) #else #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y, FLAG) \ KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y), FLAG) #endif #endif #ifndef KERNEL_OPERATION_C #define KERNEL_OPERATION_C KERNEL_OPERATION #endif #ifndef ICOPY_OPERATION #ifndef TRANS #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #else #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif #ifndef OCOPY_OPERATION #ifdef TRANS #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #else #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif #ifndef M #define M args -> n #endif #ifndef N #define N args -> n #endif #ifndef K #define K args -> k #endif #ifndef A #define A args -> a #endif #ifndef B #define B args -> b #endif #ifndef C #define C args -> c #endif #ifndef LDA #define LDA args -> lda #endif #ifndef LDB #define LDB args -> ldb #endif #ifndef LDC #define LDC args -> ldc #endif int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) { BLASLONG m_from, m_to, n_from, n_to, k, lda, ldb, ldc; FLOAT *a, *b, *c, *alpha, *beta; BLASLONG ls, is, js; BLASLONG min_l, min_i, min_j; BLASLONG jjs, min_jj; BLASLONG m_start, m_end; FLOAT *aa; k = K; a = (FLOAT *)A; b = (FLOAT *)B; c = (FLOAT *)C; lda = LDA; ldb = LDB; ldc = LDC; alpha = (FLOAT *)args -> alpha; beta = (FLOAT *)args -> beta; m_from = 0; m_to = M; if (range_m) { m_from = *(((BLASLONG *)range_m) + 0); m_to = *(((BLASLONG *)range_m) + 1); } n_from = 0; n_to = N; if (range_n) { n_from = *(((BLASLONG *)range_n) + 0); n_to = *(((BLASLONG *)range_n) + 1); } if (beta) { #if !defined(COMPLEX) || defined(HER2K) if (beta[0] != ONE) #else if ((beta[0] != ONE) || (beta[1] != ZERO)) #endif syrk_beta(m_from, m_to, n_from, n_to, beta, c, ldc); } if ((k == 0) || (alpha == NULL)) return 0; if ((alpha[0] == ZERO) #ifdef COMPLEX && (alpha[1] == ZERO) #endif ) return 0; for(js = n_from; js < n_to; js += GEMM_R){ min_j = n_to - js; if (min_j > GEMM_R) min_j = GEMM_R; #ifndef LOWER m_start = m_from; m_end = js + min_j; if (m_end 
> m_to) m_end = m_to; #else m_start = m_from; m_end = m_to; if (m_start < js) m_start = js; #endif for(ls = 0; ls < k; ls += min_l){ min_l = k - ls; if (min_l >= GEMM_Q * 2) { min_l = GEMM_Q; } else if (min_l > GEMM_Q) { min_l = (min_l + 1) / 2; } min_i = m_end - m_start; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; } #ifndef LOWER if (m_start >= js) { ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); aa = sb + min_l * (m_start - js) * COMPSIZE; OCOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, aa); KERNEL_OPERATION(min_i, min_i, min_l, alpha, sa, aa, c, ldc, m_start, m_start, 1); jjs = m_start + min_i; } else { ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); jjs = js; } for(; jjs < js + min_j; jjs += GEMM_UNROLL_MN){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs, 1); } for(is = m_start + min_i; is < m_end; is += min_i){ min_i = m_end - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; } ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 1); } min_i = m_end - m_start; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; } if (m_start >= js) { ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa); aa = sb + min_l * (m_start - js) * COMPSIZE; OCOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, aa); KERNEL_OPERATION_C(min_i, min_i, min_l, alpha, sa, aa, c, ldc, m_start, m_start, 0); jjs = m_start + min_i; } else { ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa); jjs = js; } for(; jjs < js + min_j; jjs += GEMM_UNROLL_MN){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); KERNEL_OPERATION_C(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs, 0); } for(is = m_start + min_i; is < m_end; is += min_i){ min_i = m_end - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; } ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa); KERNEL_OPERATION_C(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 0); } #else aa = sb + min_l * (m_start - js) * COMPSIZE; ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); OCOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, aa); KERNEL_OPERATION(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha, sa, aa, c, ldc, m_start, m_start, 1); for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_MN){ min_jj = m_start - jjs; if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs, 1); } for(is = m_start + min_i; is < m_end; is += min_i){ min_i = m_end - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; } aa = sb + min_l * (is - js) * 
COMPSIZE; if (is < js + min_j) { ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); OCOPY_OPERATION(min_l, min_i, b, ldb, ls, is, aa); KERNEL_OPERATION(min_i, MIN(min_i, min_j - is + js), min_l, alpha, sa, aa, c, ldc, is, is, 1); KERNEL_OPERATION(min_i, is - js, min_l, alpha, sa, sb, c, ldc, is, js, 1); } else { ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 1); } } min_i = m_end - m_start; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; } aa = sb + min_l * (m_start - js) * COMPSIZE; ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa); OCOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, aa); KERNEL_OPERATION_C(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha, sa, aa, c, ldc, m_start, m_start, 0); for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_MN){ min_jj = m_start - jjs; if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); KERNEL_OPERATION_C(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs, 0); } for(is = m_start + min_i; is < m_end; is += min_i){ min_i = m_end - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; } aa = sb + min_l * (is - js) * COMPSIZE; if (is < js + min_j) { ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa); OCOPY_OPERATION(min_l, min_i, a, lda, ls, is, aa); KERNEL_OPERATION_C(min_i, MIN(min_i, min_j - is + js), min_l, alpha, sa, aa, c, ldc, is, is, 0); KERNEL_OPERATION_C(min_i, is - js, min_l, alpha, sa, sb, c, ldc, is, js, 0); } else { ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa); KERNEL_OPERATION_C(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 0); } } #endif } } return 0; } OpenBLAS-0.2.20/driver/level3/level3_syrk.c000066400000000000000000000304121313527062700202370ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #ifndef KERNEL_OPERATION #ifndef COMPLEX #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) #else #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) #endif #endif #ifndef ICOPY_OPERATION #ifndef TRANS #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #else #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif #ifndef OCOPY_OPERATION #ifdef TRANS #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #else #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif #ifndef M #define M args -> n #endif #ifndef N #define N args -> n #endif #ifndef K #define K args -> k #endif #ifndef A #define A args -> a #endif #ifndef C #define C args -> c #endif #ifndef LDA #define LDA args -> lda #endif #ifndef LDC #define LDC args -> ldc #endif #ifdef TIMING #define START_RPCC() rpcc_counter = rpcc() #define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter #else #define START_RPCC() #define STOP_RPCC(COUNTER) #endif int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) { BLASLONG m_from, m_to, n_from, n_to, k, lda, ldc; FLOAT *a, *c, *alpha, *beta; BLASLONG ls, is, js; BLASLONG min_l, min_i, min_j; BLASLONG jjs, min_jj; BLASLONG m_start, m_end; int shared = ((GEMM_UNROLL_M == GEMM_UNROLL_N) && !HAVE_EX_L2); FLOAT *aa; #ifdef TIMING unsigned long long rpcc_counter; unsigned long long innercost = 0; unsigned long long outercost = 0; unsigned long long kernelcost = 0; double total; #endif k = K; a = (FLOAT *)A; c = (FLOAT *)C; lda = LDA; ldc = LDC; alpha = (FLOAT *)args -> alpha; beta = (FLOAT *)args -> beta; m_from = 0; m_to = M; if (range_m) { m_from = *(((BLASLONG *)range_m) + 0); m_to = *(((BLASLONG *)range_m) + 1); } n_from = 0; n_to = N; if (range_n) { n_from = *(((BLASLONG *)range_n) + 0); n_to = *(((BLASLONG *)range_n) + 1); } if (beta) { #if !defined(COMPLEX) || defined(HERK) if (beta[0] != ONE) #else if ((beta[0] != ONE) || (beta[1] != ZERO)) #endif syrk_beta(m_from, m_to, n_from, n_to, beta, c, ldc); } if ((k == 0) || (alpha == NULL)) return 0; if ((alpha[0] == ZERO) #if defined(COMPLEX) 
&& !defined(HERK) && (alpha[1] == ZERO) #endif ) return 0; #if 0 fprintf(stderr, "m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n", m_from, m_to, n_from, n_to); #endif for(js = n_from; js < n_to; js += GEMM_R){ min_j = n_to - js; if (min_j > GEMM_R) min_j = GEMM_R; #ifndef LOWER m_start = m_from; m_end = js + min_j; if (m_end > m_to) m_end = m_to; #else m_start = m_from; m_end = m_to; if (m_start < js) m_start = js; #endif for(ls = 0; ls < k; ls += min_l){ min_l = k - ls; if (min_l >= GEMM_Q * 2) { min_l = GEMM_Q; } else if (min_l > GEMM_Q) { min_l = (min_l + 1) / 2; } min_i = m_end - m_start; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; } #ifndef LOWER if (m_end >= js) { aa = sb + min_l * MAX(m_start - js, 0) * COMPSIZE; if (!shared) aa = sa; for(jjs = MAX(m_start, js); jjs < js + min_j; jjs += min_jj){ min_jj = js + min_j - jjs; if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; if (!shared && (jjs - MAX(m_start, js) < min_i)) { START_RPCC(); ICOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sa + min_l * (jjs - js) * COMPSIZE); STOP_RPCC(innercost); } START_RPCC(); OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); STOP_RPCC(outercost); START_RPCC(); KERNEL_OPERATION(min_i, min_jj, min_l, alpha, aa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, MAX(m_start, js), jjs); STOP_RPCC(kernelcost); } for(is = MAX(m_start, js) + min_i; is < m_end; is += min_i){ min_i = m_end - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; } aa = sb + min_l * (is - js) * COMPSIZE; if (!shared) { START_RPCC(); ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); STOP_RPCC(innercost); aa = sa; } START_RPCC(); KERNEL_OPERATION(min_i, min_j, min_l, alpha, aa, sb, c, ldc, is, js); STOP_RPCC(kernelcost); } } if (m_start < js) { if (m_end < js) { START_RPCC(); ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); STOP_RPCC(innercost); for(jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_MN){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; START_RPCC(); OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); STOP_RPCC(outercost); START_RPCC(); KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs); STOP_RPCC(kernelcost); } } else { min_i = 0; } for(is = m_start + min_i; is < MIN(m_end, js); is += min_i){ min_i = MIN(m_end, js)- is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; } START_RPCC(); ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); STOP_RPCC(innercost); START_RPCC(); KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js); STOP_RPCC(kernelcost); } } #else if (m_start < js + min_j) { aa = sb + min_l * (m_start - js) * COMPSIZE; if (!shared) { START_RPCC(); ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); STOP_RPCC(innercost); } START_RPCC(); OCOPY_OPERATION(min_l, (shared? (min_i) : MIN(min_i, min_j + js - m_start)), a, lda, ls, m_start, aa); STOP_RPCC(outercost); START_RPCC(); KERNEL_OPERATION(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha, (shared? 
(aa) : (sa)), aa, c, ldc, m_start, m_start); STOP_RPCC(kernelcost); for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_N){ min_jj = m_start - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; START_RPCC(); OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); STOP_RPCC(outercost); START_RPCC(); KERNEL_OPERATION(min_i, min_jj, min_l, alpha, (shared? (aa) : (sa)), sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs); STOP_RPCC(kernelcost); } for(is = m_start + min_i; is < m_end; is += min_i){ min_i = m_end - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; } if (is < js + min_j) { if (!shared) { START_RPCC(); ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); STOP_RPCC(innercost); } aa = sb + min_l * (is - js) * COMPSIZE; START_RPCC(); OCOPY_OPERATION(min_l, (shared? (min_i) : MIN(min_i, min_j - is + js)), a, lda, ls, is, aa); STOP_RPCC(outercost); START_RPCC(); KERNEL_OPERATION(min_i, MIN(min_i, min_j - is + js), min_l, alpha, (shared? (aa) : (sa)), aa, c, ldc, is, is); STOP_RPCC(kernelcost); START_RPCC(); KERNEL_OPERATION(min_i, is - js, min_l, alpha, (shared? (aa) : (sa)), sb, c, ldc, is, js); STOP_RPCC(kernelcost); } else { START_RPCC(); ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); STOP_RPCC(innercost); START_RPCC(); KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js); STOP_RPCC(kernelcost); } } } else { START_RPCC(); ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); STOP_RPCC(innercost); for(jjs = js; jjs < min_j; jjs += GEMM_UNROLL_N){ min_jj = min_j - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; START_RPCC(); OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); STOP_RPCC(outercost); START_RPCC(); KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs); STOP_RPCC(kernelcost); } for(is = m_start + min_i; is < m_end; is += min_i){ min_i = m_end - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; } START_RPCC(); ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); STOP_RPCC(innercost); START_RPCC(); KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js); STOP_RPCC(kernelcost); } } #endif } } #ifdef TIMING total = (double)outercost + (double)innercost + (double)kernelcost; printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n", innercost / total * 100., outercost / total * 100., kernelcost / total * 100., (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / (double)kernelcost * 100. * (double)COMPSIZE / (double)DNUMOPT, (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. * (double)COMPSIZE / (double)DNUMOPT); #endif return 0; } OpenBLAS-0.2.20/driver/level3/level3_syrk_threaded.c000066400000000000000000000417621313527062700221110ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. 
*/ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #ifndef CACHE_LINE_SIZE #define CACHE_LINE_SIZE 8 #endif #ifndef DIVIDE_RATE #define DIVIDE_RATE 2 #endif #ifndef SWITCH_RATIO #define SWITCH_RATIO 2 #endif //The array of job_t may overflow the stack. //Instead, use malloc to alloc job_t. #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD #define USE_ALLOC_HEAP #endif #ifndef SYRK_LOCAL #if !defined(LOWER) && !defined(TRANS) #define SYRK_LOCAL SYRK_UN #elif !defined(LOWER) && defined(TRANS) #define SYRK_LOCAL SYRK_UT #elif defined(LOWER) && !defined(TRANS) #define SYRK_LOCAL SYRK_LN #else #define SYRK_LOCAL SYRK_LT #endif #endif typedef struct { volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; #ifndef KERNEL_OPERATION #ifndef COMPLEX #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) #else #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) #endif #endif #ifndef ICOPY_OPERATION #ifndef TRANS #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #else #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif #ifndef OCOPY_OPERATION #ifdef TRANS #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #else #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif #ifndef A #define A args -> a #endif #ifndef LDA #define LDA args -> lda #endif #ifndef C #define C args -> c #endif #ifndef LDC #define LDC args -> ldc #endif #ifndef M #define M args -> m #endif #ifndef N #define N args -> n #endif #ifndef K #define K args -> k #endif #undef TIMING #ifdef TIMING #define START_RPCC() rpcc_counter = rpcc() #define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter #else 
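/* TIMING is #undef'ed a few lines above, so this branch is always taken:
 * START_RPCC()/STOP_RPCC() expand to nothing and the instrumentation that
 * brackets the copy and kernel calls below has no runtime cost. */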
#define START_RPCC() #define STOP_RPCC(COUNTER) #endif static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ FLOAT *buffer[DIVIDE_RATE]; BLASLONG k, lda, ldc; BLASLONG m_from, m_to, n_from, n_to; FLOAT *alpha, *beta; FLOAT *a, *c; job_t *job = (job_t *)args -> common; BLASLONG xxx, bufferside; BLASLONG ls, min_l, jjs, min_jj; BLASLONG is, min_i, div_n; BLASLONG i, current; #ifdef LOWER BLASLONG start_i; #endif #ifdef TIMING BLASLONG rpcc_counter; BLASLONG copy_A = 0; BLASLONG copy_B = 0; BLASLONG kernel = 0; BLASLONG waiting1 = 0; BLASLONG waiting2 = 0; BLASLONG waiting3 = 0; BLASLONG waiting6[MAX_CPU_NUMBER]; BLASLONG ops = 0; for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0; #endif k = K; a = (FLOAT *)A; c = (FLOAT *)C; lda = LDA; ldc = LDC; alpha = (FLOAT *)args -> alpha; beta = (FLOAT *)args -> beta; m_from = 0; m_to = N; /* Global Range */ n_from = 0; n_to = N; if (range_n) { m_from = range_n[mypos + 0]; m_to = range_n[mypos + 1]; n_from = range_n[0]; n_to = range_n[args -> nthreads]; } if (beta) { #if !defined(COMPLEX) || defined(HERK) if (beta[0] != ONE) #else if ((beta[0] != ONE) || (beta[1] != ZERO)) #endif syrk_beta(m_from, m_to, n_from, n_to, beta, c, ldc); } if ((k == 0) || (alpha == NULL)) return 0; if ((alpha[0] == ZERO) #if defined(COMPLEX) && !defined(HERK) && (alpha[1] == ZERO) #endif ) return 0; #if 0 fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n", mypos, m_from, m_to, n_from, n_to); #endif div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; buffer[0] = sb; for (i = 1; i < DIVIDE_RATE; i++) { buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE; } for(ls = 0; ls < k; ls += min_l){ min_l = k - ls; if (min_l >= GEMM_Q * 2) { min_l = GEMM_Q; } else { if (min_l > GEMM_Q) min_l = (min_l + 1) / 2; } min_i = m_to - m_from; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else { if (min_i > GEMM_P) { min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; } } #ifdef LOWER xxx = (m_to - m_from - min_i) % GEMM_P; if (xxx) min_i -= GEMM_P - xxx; #endif START_RPCC(); #ifndef LOWER ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); #else ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_to - min_i, sa); #endif STOP_RPCC(copy_A); div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) { START_RPCC(); /* Make sure if no one is using buffer */ #ifndef LOWER for (i = 0; i < mypos; i++) #else for (i = mypos + 1; i < args -> nthreads; i++) #endif while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; STOP_RPCC(waiting1); #ifndef LOWER for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(m_to, xxx + div_n) - jjs; if (xxx == m_from) { if (min_jj > min_i) min_jj = min_i; } else { if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; } START_RPCC(); OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE); STOP_RPCC(copy_B); START_RPCC(); KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE, c, ldc, m_from, jjs); STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * min_jj * min_l; #endif } #else for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(m_to, xxx + div_n) - jjs; if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; 
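/* Note: SYRK/HERK reads only A, so the same matrix is packed twice per
 * k-block: the row panel was ICOPY'ed into the private sa buffer above,
 * and the column slices are OCOPY'ed into the shared buffer[bufferside]
 * below.  KERNEL_OPERATION passes (X) - (Y) as an extra offset so the
 * kernel updates only the stored triangle of C when a block sits on the
 * diagonal. */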
START_RPCC(); OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE); STOP_RPCC(copy_B); START_RPCC(); KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE, c, ldc, m_to - min_i, jjs); STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * min_jj * min_l; #endif } #endif #ifndef LOWER for (i = 0; i <= mypos; i++) #else for (i = mypos; i < args -> nthreads; i++) #endif job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; WMB; } #ifndef LOWER current = mypos + 1; while (current < args -> nthreads) { #else current = mypos - 1; while (current >= 0) { #endif div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { START_RPCC(); /* thread has to wait */ while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; STOP_RPCC(waiting2); START_RPCC(); #ifndef LOWER KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, m_from, xxx); #else KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, m_to - min_i, xxx); #endif STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; #endif if (m_to - m_from == min_i) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; } } #ifndef LOWER current ++; #else current --; #endif } #ifndef LOWER for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; #else start_i = min_i; for(is = m_from; is < m_to - start_i; is += min_i){ min_i = m_to - start_i - is; #endif if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; } START_RPCC(); ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); STOP_RPCC(copy_A); current = mypos; do { div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { START_RPCC(); KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, xxx); STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; #endif #ifndef LOWER if (is + min_i >= m_to) { #else if (is + min_i >= m_to - start_i) { #endif /* Thread doesn't need this buffer any more */ job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; WMB; } } #ifndef LOWER current ++; } while (current != args -> nthreads); #else current --; } while (current >= 0); #endif } } START_RPCC(); for (i = 0; i < args -> nthreads; i++) { if (i != mypos) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; } } } STOP_RPCC(waiting3); #ifdef TIMING BLASLONG waiting = waiting1 + waiting2 + waiting3; BLASLONG total = copy_A + copy_B + kernel + waiting; fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f", mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total 
* 100., (double)waiting1 /(double)total * 100., (double)waiting2 /(double)total * 100., (double)waiting3 /(double)total * 100., (double)ops/(double)kernel / 4. * 100.); #if 0 fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n", mypos, copy_A, copy_B, waiting); fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n", mypos, (double)waiting1/(double)waiting * 100., (double)waiting2/(double)waiting * 100., (double)waiting3/(double)waiting * 100.); #endif fprintf(stderr, "\n"); #endif return 0; } int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ blas_arg_t newarg; #ifndef USE_ALLOC_HEAP job_t job[MAX_CPU_NUMBER]; #else job_t * job = NULL; #endif blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range[MAX_CPU_NUMBER + 100]; BLASLONG num_cpu; BLASLONG nthreads = args -> nthreads; BLASLONG width, i, j, k; BLASLONG n, n_from, n_to; int mode, mask; double dnum; if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) { SYRK_LOCAL(args, range_m, range_n, sa, sb, 0); return 0; } #ifndef COMPLEX #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; mask = DGEMM_UNROLL_MN - 1; #else mode = BLAS_SINGLE | BLAS_REAL; mask = SGEMM_UNROLL_MN - 1; #endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; mask = ZGEMM_UNROLL_MN - 1; #else mode = BLAS_SINGLE | BLAS_COMPLEX; mask = CGEMM_UNROLL_MN - 1; #endif #endif newarg.m = args -> m; newarg.n = args -> n; newarg.k = args -> k; newarg.a = args -> a; newarg.b = args -> b; newarg.c = args -> c; newarg.lda = args -> lda; newarg.ldb = args -> ldb; newarg.ldc = args -> ldc; newarg.alpha = args -> alpha; newarg.beta = args -> beta; #ifdef USE_ALLOC_HEAP job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); if(job==NULL){ fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); exit(1); } #endif newarg.common = (void *)job; if (!range_n) { n_from = 0; n_to = args -> n; } else { n_from = range_n[0]; n_to = range_n[1] - range_n[0]; } #ifndef LOWER range[MAX_CPU_NUMBER] = n_to - n_from; range[0] = 0; num_cpu = 0; i = 0; n = n_to - n_from; dnum = (double)n * (double)n /(double)nthreads; while (i < n){ if (nthreads - num_cpu > 1) { double di = (double)i; width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1) ); if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1) ); if ((width > n - i) || (width < mask)) width = n - i; } else { width = n - i; } range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width; queue[num_cpu].mode = mode; queue[num_cpu].routine = inner_thread; queue[num_cpu].args = &newarg; queue[num_cpu].range_m = range_m; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu]; #else range[0] = 0; num_cpu = 0; i = 0; n = n_to - n_from; dnum = (double)n * (double)n /(double)nthreads; while (i < n){ if (nthreads - num_cpu > 1) { double di = (double)i; width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); if ((width > n - i) || (width < mask)) width = n - i; } else { width = n - i; } range[num_cpu + 1] = range[num_cpu] + width; queue[num_cpu].mode = mode; queue[num_cpu].routine = inner_thread; queue[num_cpu].args = &newarg; queue[num_cpu].range_m = 
range_m; queue[num_cpu].range_n = range; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #endif newarg.nthreads = num_cpu; if (num_cpu) { for (j = 0; j < num_cpu; j++) { for (i = 0; i < num_cpu; i++) { for (k = 0; k < DIVIDE_RATE; k++) { job[j].working[i][CACHE_LINE_SIZE * k] = 0; } } } queue[0].sa = sa; queue[0].sb = sb; queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } #ifdef USE_ALLOC_HEAP free(job); #endif return 0; } OpenBLAS-0.2.20/driver/level3/level3_thread.c000066400000000000000000000500761313527062700205260ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #ifndef CACHE_LINE_SIZE #define CACHE_LINE_SIZE 8 #endif #ifndef DIVIDE_RATE #define DIVIDE_RATE 2 #endif #ifndef SWITCH_RATIO #define SWITCH_RATIO 2 #endif //The array of job_t may overflow the stack. //Instead, use malloc to alloc job_t. 
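//Note: the heap allocation only happens when MAX_CPU_NUMBER exceeds
//BLAS3_MEM_ALLOC_THRESHOLD (USE_ALLOC_HEAP below); otherwise gemm_driver
//keeps the job array on its stack. Each job_t slot is a cache-line-spaced
//flag used to hand packed B panels from the thread that copied them to the
//threads that consume them (see inner_thread).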
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD #define USE_ALLOC_HEAP #endif #ifndef GEMM_LOCAL #if defined(NN) #define GEMM_LOCAL GEMM_NN #elif defined(NT) #define GEMM_LOCAL GEMM_NT #elif defined(NR) #define GEMM_LOCAL GEMM_NR #elif defined(NC) #define GEMM_LOCAL GEMM_NC #elif defined(TN) #define GEMM_LOCAL GEMM_TN #elif defined(TT) #define GEMM_LOCAL GEMM_TT #elif defined(TR) #define GEMM_LOCAL GEMM_TR #elif defined(TC) #define GEMM_LOCAL GEMM_TC #elif defined(RN) #define GEMM_LOCAL GEMM_RN #elif defined(RT) #define GEMM_LOCAL GEMM_RT #elif defined(RR) #define GEMM_LOCAL GEMM_RR #elif defined(RC) #define GEMM_LOCAL GEMM_RC #elif defined(CN) #define GEMM_LOCAL GEMM_CN #elif defined(CT) #define GEMM_LOCAL GEMM_CT #elif defined(CR) #define GEMM_LOCAL GEMM_CR #elif defined(CC) #define GEMM_LOCAL GEMM_CC #endif #endif typedef struct { volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; #ifndef BETA_OPERATION #ifndef COMPLEX #define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ BETA[0], NULL, 0, NULL, 0, \ (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC) #else #define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ BETA[0], BETA[1], NULL, 0, NULL, 0, \ (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC) #endif #endif #ifndef ICOPY_OPERATION #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ defined(RN) || defined(RT) || defined(RC) || defined(RR) #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #else #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif #ifndef OCOPY_OPERATION #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ defined(NR) || defined(TR) || defined(CR) || defined(RR) #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #else #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif #ifndef KERNEL_FUNC #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define KERNEL_FUNC GEMM_KERNEL_N #endif #if defined(CN) || defined(CT) || defined(RN) || defined(RT) #define KERNEL_FUNC GEMM_KERNEL_L #endif #if defined(NC) || defined(TC) || defined(NR) || defined(TR) #define KERNEL_FUNC GEMM_KERNEL_R #endif #if defined(CC) || defined(CR) || defined(RC) || defined(RR) #define KERNEL_FUNC GEMM_KERNEL_B #endif #endif #ifndef KERNEL_OPERATION #ifndef COMPLEX #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) #else #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) #endif #endif #ifndef FUSED_KERNEL_OPERATION #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ defined(NR) || defined(TR) || defined(CR) || defined(RR) #ifndef COMPLEX #define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], SA, SB, \ (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) #else #define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, 
LDB, C, LDC, I, J, L) \ FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \ (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) #endif #else #ifndef COMPLEX #define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], SA, SB, \ (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) #else #define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \ (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) #endif #endif #endif #ifndef A #define A args -> a #endif #ifndef LDA #define LDA args -> lda #endif #ifndef B #define B args -> b #endif #ifndef LDB #define LDB args -> ldb #endif #ifndef C #define C args -> c #endif #ifndef LDC #define LDC args -> ldc #endif #ifndef M #define M args -> m #endif #ifndef N #define N args -> n #endif #ifndef K #define K args -> k #endif #ifdef TIMING #define START_RPCC() rpcc_counter = rpcc() #define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter #else #define START_RPCC() #define STOP_RPCC(COUNTER) #endif static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ FLOAT *buffer[DIVIDE_RATE]; BLASLONG k, lda, ldb, ldc; BLASLONG m_from, m_to, n_from, n_to, N_from, N_to; FLOAT *alpha, *beta; FLOAT *a, *b, *c; job_t *job = (job_t *)args -> common; BLASLONG xxx, bufferside; BLASLONG ls, min_l, jjs, min_jj; BLASLONG is, min_i, div_n; BLASLONG i, current; BLASLONG l1stride; #ifdef TIMING BLASULONG rpcc_counter; BLASULONG copy_A = 0; BLASULONG copy_B = 0; BLASULONG kernel = 0; BLASULONG waiting1 = 0; BLASULONG waiting2 = 0; BLASULONG waiting3 = 0; BLASULONG waiting6[MAX_CPU_NUMBER]; BLASULONG ops = 0; for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0; #endif k = K; a = (FLOAT *)A; b = (FLOAT *)B; c = (FLOAT *)C; lda = LDA; ldb = LDB; ldc = LDC; alpha = (FLOAT *)args -> alpha; beta = (FLOAT *)args -> beta; m_from = 0; m_to = M; if (range_m) { m_from = range_m[0]; m_to = range_m[1]; } n_from = 0; n_to = N; N_from = 0; N_to = N; if (range_n) { n_from = range_n[mypos + 0]; n_to = range_n[mypos + 1]; N_from = range_n[0]; N_to = range_n[args -> nthreads]; } if (beta) { #ifndef COMPLEX if (beta[0] != ONE) #else if ((beta[0] != ONE) || (beta[1] != ZERO)) #endif BETA_OPERATION(m_from, m_to, N_from, N_to, beta, c, ldc); } if ((k == 0) || (alpha == NULL)) return 0; if ((alpha[0] == ZERO) #ifdef COMPLEX && (alpha[1] == ZERO) #endif ) return 0; #if 0 fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n", mypos, m_from, m_to, n_from, n_to, N_from, N_to); fprintf(stderr, "GEMM: P = %4ld Q = %4ld R = %4ld\n", (BLASLONG)GEMM_P, (BLASLONG)GEMM_Q, (BLASLONG)GEMM_R); #endif div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; buffer[0] = sb; for (i = 1; i < DIVIDE_RATE; i++) { buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE; } for(ls = 0; ls < k; ls += min_l){ min_l = k - ls; if (min_l >= GEMM_Q * 2) { min_l = GEMM_Q; } else { if (min_l > GEMM_Q) min_l = (min_l + 1) / 2; } l1stride = 1; min_i = m_to - m_from; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else { if (min_i > GEMM_P) { min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; } else { if (args -> nthreads == 1) l1stride = 0; } } START_RPCC(); 
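/* Per k-block (min_l <= GEMM_Q): pack this thread's first row panel of A
 * (min_i <= GEMM_P) into the private sa buffer, then stream column slices
 * of B through buffer[0..DIVIDE_RATE-1] and publish each finished slice to
 * every thread through job[mypos].working[][].  A rough sketch of that
 * handshake (illustrative, simplified from the code below):
 *
 *   producer:  OCOPY B slice into buffer[side];
 *              job[mypos].working[i][CACHE_LINE_SIZE * side] =
 *                  (BLASLONG)buffer[side];  WMB;
 *   consumer:  while (job[owner].working[mypos][CACHE_LINE_SIZE * side] == 0)
 *                  YIELDING;
 *              KERNEL_OPERATION(... published buffer ...);
 *              job[owner].working[mypos][CACHE_LINE_SIZE * side] &= 0;
 *
 * so each B panel is packed once per k-block and reused by all threads. */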
ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); STOP_RPCC(copy_A); div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { START_RPCC(); /* Make sure if no one is using buffer */ for (i = 0; i < args -> nthreads; i++) while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; STOP_RPCC(waiting1); #if defined(FUSED_GEMM) && !defined(TIMING) FUSED_KERNEL_OPERATION(min_i, MIN(n_to, xxx + div_n) - xxx, min_l, alpha, sa, buffer[bufferside], b, ldb, c, ldc, m_from, xxx, ls); #else for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; START_RPCC(); OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE * l1stride); STOP_RPCC(copy_B); START_RPCC(); KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE * l1stride, c, ldc, m_from, jjs); STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * min_jj * min_l; #endif } #endif for (i = 0; i < args -> nthreads; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; WMB; } current = mypos; do { current ++; if (current >= args -> nthreads) current = 0; div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { if (current != mypos) { START_RPCC(); /* thread has to wait */ while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; STOP_RPCC(waiting2); START_RPCC(); KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, m_from, xxx); STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; #endif } if (m_to - m_from == min_i) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; } } } while (current != mypos); for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = (((min_i + 1) / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; } START_RPCC(); ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); STOP_RPCC(copy_A); current = mypos; do { div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { START_RPCC(); KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, xxx); STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; #endif if (is + min_i >= m_to) { /* Thread doesn't need this buffer any more */ job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; WMB; } } current ++; if (current >= args -> nthreads) current = 0; } while (current != mypos); } } START_RPCC(); for (i = 0; i < args -> nthreads; i++) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; } } STOP_RPCC(waiting3); #ifdef TIMING BLASLONG waiting = waiting1 + waiting2 + waiting3; BLASLONG total = copy_A + copy_B + 
kernel + waiting; fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f", mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100., (double)waiting1 /(double)total * 100., (double)waiting2 /(double)total * 100., (double)waiting3 /(double)total * 100., (double)ops/(double)kernel / 4. * 100.); #if 0 fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n", mypos, copy_A, copy_B, waiting); fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n", mypos, (double)waiting1/(double)waiting * 100., (double)waiting2/(double)waiting * 100., (double)waiting3/(double)waiting * 100.); #endif fprintf(stderr, "\n"); #endif return 0; } static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ blas_arg_t newarg; #ifndef USE_ALLOC_HEAP job_t job[MAX_CPU_NUMBER]; #else job_t * job = NULL; #endif blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_M[MAX_CPU_NUMBER + 1]; BLASLONG range_N[MAX_CPU_NUMBER + 1]; BLASLONG num_cpu_m, num_cpu_n; BLASLONG nthreads = args -> nthreads; BLASLONG width, i, j, k, js; BLASLONG m, n, n_from, n_to; int mode; #ifndef COMPLEX #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL | BLAS_NODE; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL | BLAS_NODE; #else mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE; #endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX | BLAS_NODE; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX | BLAS_NODE; #else mode = BLAS_SINGLE | BLAS_COMPLEX | BLAS_NODE; #endif #endif newarg.m = args -> m; newarg.n = args -> n; newarg.k = args -> k; newarg.a = args -> a; newarg.b = args -> b; newarg.c = args -> c; newarg.lda = args -> lda; newarg.ldb = args -> ldb; newarg.ldc = args -> ldc; newarg.alpha = args -> alpha; newarg.beta = args -> beta; newarg.nthreads = args -> nthreads; #ifdef USE_ALLOC_HEAP job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); if(job==NULL){ fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); exit(1); } #endif newarg.common = (void *)job; #ifdef PARAMTEST newarg.gemm_p = args -> gemm_p; newarg.gemm_q = args -> gemm_q; newarg.gemm_r = args -> gemm_r; #endif if (!range_m) { range_M[0] = 0; m = args -> m; } else { range_M[0] = range_m[0]; m = range_m[1] - range_m[0]; } num_cpu_m = 0; while (m > 0){ width = blas_quickdivide(m + nthreads - num_cpu_m - 1, nthreads - num_cpu_m); m -= width; if (m < 0) width = width + m; range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width; num_cpu_m ++; } for (i = 0; i < num_cpu_m; i++) { queue[i].mode = mode; queue[i].routine = inner_thread; queue[i].args = &newarg; queue[i].range_m = &range_M[i]; queue[i].range_n = &range_N[0]; queue[i].sa = NULL; queue[i].sb = NULL; queue[i].next = &queue[i + 1]; } queue[0].sa = sa; queue[0].sb = sb; if (!range_n) { n_from = 0; n_to = args -> n; } else { n_from = range_n[0]; n_to = range_n[1]; } for(js = n_from; js < n_to; js += GEMM_R * nthreads){ n = n_to - js; if (n > GEMM_R * nthreads) n = GEMM_R * nthreads; range_N[0] = js; num_cpu_n = 0; while (n > 0){ width = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n); n -= width; if (n < 0) width = width + n; range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width; num_cpu_n ++; } for (j = 0; j < num_cpu_m; j++) { for (i = 0; i < num_cpu_m; i++) { for (k = 0; k < DIVIDE_RATE; k++) { job[j].working[i][CACHE_LINE_SIZE * k] = 0; } } } queue[num_cpu_m - 1].next = NULL; exec_blas(num_cpu_m, queue); } #ifdef USE_ALLOC_HEAP free(job); 
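/* job was malloc'ed at the top of gemm_driver when USE_ALLOC_HEAP is
 * defined; it must be released here before returning. */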
#endif return 0; } int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ BLASLONG m = args -> m; BLASLONG n = args -> n; BLASLONG nthreads = args -> nthreads; BLASLONG divN, divT; int mode; if (nthreads == 1) { GEMM_LOCAL(args, range_m, range_n, sa, sb, 0); return 0; } if (range_m) { BLASLONG m_from = *(((BLASLONG *)range_m) + 0); BLASLONG m_to = *(((BLASLONG *)range_m) + 1); m = m_to - m_from; } if (range_n) { BLASLONG n_from = *(((BLASLONG *)range_n) + 0); BLASLONG n_to = *(((BLASLONG *)range_n) + 1); n = n_to - n_from; } if ((m < nthreads * SWITCH_RATIO) || (n < nthreads * SWITCH_RATIO)) { GEMM_LOCAL(args, range_m, range_n, sa, sb, 0); return 0; } divT = nthreads; divN = 1; #if 0 while ((GEMM_P * divT > m * SWITCH_RATIO) && (divT > 1)) { do { divT --; divN = 1; while (divT * divN < nthreads) divN ++; } while ((divT * divN != nthreads) && (divT > 1)); } #endif // fprintf(stderr, "divN = %4ld divT = %4ld\n", divN, divT); args -> nthreads = divT; if (divN == 1){ gemm_driver(args, range_m, range_n, sa, sb, 0); } else { #ifndef COMPLEX #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #if defined(TN) || defined(TT) || defined(TR) || defined(TC) || \ defined(CN) || defined(CT) || defined(CR) || defined(CC) mode |= (BLAS_TRANSA_T); #endif #if defined(NT) || defined(TT) || defined(RT) || defined(CT) || \ defined(NC) || defined(TC) || defined(RC) || defined(CC) mode |= (BLAS_TRANSB_T); #endif #ifdef OS_WINDOWS gemm_thread_n(mode, args, range_m, range_n, GEMM_LOCAL, sa, sb, divN); #else gemm_thread_n(mode, args, range_m, range_n, gemm_driver, sa, sb, divN); #endif } return 0; } OpenBLAS-0.2.20/driver/level3/symm3m_k.c000066400000000000000000000112711313527062700175360ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #undef TIMING #define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ BETA[0], BETA[1], NULL, 0, NULL, 0, \ (FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC) #ifndef RSIDE #ifndef LOWER #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM3M_IUCOPYB(M, N, A, LDA, Y, X, BUFFER) #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM3M_IUCOPYR(M, N, A, LDA, Y, X, BUFFER) #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM3M_IUCOPYI(M, N, A, LDA, Y, X, BUFFER) #else #define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM3M_ILCOPYB(M, N, A, LDA, Y, X, BUFFER) #define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM3M_ILCOPYR(M, N, A, LDA, Y, X, BUFFER) #define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM3M_ILCOPYI(M, N, A, LDA, Y, X, BUFFER) #endif #endif #ifdef RSIDE #ifndef LOWER #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ SYMM3M_OUCOPYB(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ SYMM3M_OUCOPYR(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ SYMM3M_OUCOPYI(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) #else #define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ SYMM3M_OLCOPYB(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) #define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ SYMM3M_OLCOPYR(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) #define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ SYMM3M_OLCOPYI(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) #endif #endif #ifndef RSIDE #define K args -> m #ifndef LOWER #define GEMM3M_LOCAL SYMM3M_LU #else #define GEMM3M_LOCAL SYMM3M_LL #endif #else #define K args -> n #ifndef LOWER #define GEMM3M_LOCAL SYMM3M_RU #else #define GEMM3M_LOCAL SYMM3M_RL #endif #endif #ifdef THREADED_LEVEL3 #include "level3_gemm3m_thread.c" #else #include "gemm3m_level3.c" #endif OpenBLAS-0.2.20/driver/level3/symm_k.c000066400000000000000000000067001313527062700172770ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #undef TIMING #ifndef RSIDE #ifndef LOWER #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM_IUTCOPY(M, N, A, LDA, Y, X, BUFFER); #else #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM_ILTCOPY(M, N, A, LDA, Y, X, BUFFER); #endif #endif #ifdef RSIDE #ifndef LOWER #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM_OUTCOPY(M, N, A, LDA, Y, X, BUFFER); #else #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM_OLTCOPY(M, N, A, LDA, Y, X, BUFFER); #endif #endif #ifndef RSIDE #define K args -> m #ifndef LOWER #define GEMM_LOCAL SYMM_LU #else #define GEMM_LOCAL SYMM_LL #endif #else #define K args -> n #ifndef LOWER #define GEMM_LOCAL SYMM_RU #else #define GEMM_LOCAL SYMM_RL #endif #endif #ifdef THREADED_LEVEL3 #include "level3_thread.c" #else #include "level3.c" #endif OpenBLAS-0.2.20/driver/level3/syr2k_k.c000066400000000000000000000073501313527062700173660ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef KERNEL_FUNC #ifndef LOWER #define KERNEL_FUNC SYR2K_KERNEL_U #else #define KERNEL_FUNC SYR2K_KERNEL_L #endif #endif static __inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLASLONG n_to, FLOAT *alpha, FLOAT *c, BLASLONG ldc) { BLASLONG i; #ifndef LOWER if (m_from > n_from) n_from = m_from; if (m_to > n_to ) m_to = n_to; #else if (m_from < n_from) m_from = n_from; if (m_to < n_to ) n_to = m_to; #endif c += (m_from + n_from * ldc) * COMPSIZE; m_to -= m_from; n_to -= n_from; for (i = 0; i < n_to; i++){ #ifndef LOWER SCAL_K(MIN(i + n_from - m_from + 1, m_to), 0, 0, alpha[0], #ifdef COMPLEX alpha[1], #endif c, 1, NULL, 0, NULL, 0); c += ldc * COMPSIZE; #else SCAL_K(MIN(m_to - i + m_from - n_from, m_to), 0, 0, alpha[0], #ifdef COMPLEX alpha[1], #endif c, 1, NULL, 0, NULL, 0); if (i < m_from - n_from) { c += ldc * COMPSIZE; } else { c += (1 + ldc) * COMPSIZE; } #endif } return 0; } #ifdef THREADED_LEVEL3 #include "level3_syr2k_threaded.c" #else #include "level3_syr2k.c" #endif OpenBLAS-0.2.20/driver/level3/syr2k_kernel.c000066400000000000000000000136661313527062700204230ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
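/* Illustrative sketch only: syrk_beta above scales just the stored triangle of C,
   column by column, via SCAL_K, with extra bookkeeping when the assigned
   row/column range does not start on the diagonal.  A simplified real version of
   the same idea for a full n x n matrix (hypothetical helper, not an OpenBLAS
   routine): */
static void scale_triangle_ref(int n, double scal, double *c, int ldc, int lower)
{
  for (int j = 0; j < n; j++) {
    int i0 = lower ? j : 0;          /* first row touched in column j  */
    int i1 = lower ? n : j + 1;      /* one past the last row touched  */
    for (int i = i0; i < i1; i++)
      c[i + j * ldc] *= scal;
  }
}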
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #ifdef COMPLEX FLOAT alpha_i, #endif FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset, int flag){ BLASLONG i, j; BLASLONG loop; FLOAT subbuffer[GEMM_UNROLL_MN * GEMM_UNROLL_MN * COMPSIZE]; if (m + offset < 0) { #ifndef LOWER GEMM_KERNEL_N(m, n, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a, b, c, ldc); #endif return 0; } if (n < offset) { #ifdef LOWER GEMM_KERNEL_N(m, n, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a, b, c, ldc); #endif return 0; } if (offset > 0) { #ifdef LOWER GEMM_KERNEL_N(m, offset, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a, b, c, ldc); #endif b += offset * k * COMPSIZE; c += offset * ldc * COMPSIZE; n -= offset; offset = 0; if (n <= 0) return 0; } if (n > m + offset) { #ifndef LOWER GEMM_KERNEL_N(m, n - m - offset, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a, b + (m + offset) * k * COMPSIZE, c + (m + offset) * ldc * COMPSIZE, ldc); #endif n = m + offset; if (n <= 0) return 0; } if (offset < 0) { #ifndef LOWER GEMM_KERNEL_N(-offset, n, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a, b, c, ldc); #endif a -= offset * k * COMPSIZE; c -= offset * COMPSIZE; m += offset; offset = 0; if (m <= 0) return 0; } if (m > n - offset) { #ifdef LOWER GEMM_KERNEL_N(m - n + offset, n, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a + (n - offset) * k * COMPSIZE, b, c + (n - offset) * COMPSIZE, ldc); #endif m = n + offset; if (m <= 0) return 0; } for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) { int mm, nn; mm = (loop & ~(GEMM_UNROLL_MN - 1)); nn = MIN(GEMM_UNROLL_MN, n - loop); #ifndef LOWER GEMM_KERNEL_N(mm, nn, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc); #endif if (flag) { GEMM_BETA(nn, nn, 0, ZERO, #ifdef COMPLEX ZERO, #endif NULL, 0, NULL, 0, subbuffer, nn); GEMM_KERNEL_N(nn, nn, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn); #ifndef LOWER for (j = 0; j < nn; j ++) { for (i = 0; i <= j; i ++) { #ifndef COMPLEX c[i + loop + (j + loop) * ldc] += subbuffer[i + j * nn] + subbuffer[j + i * nn]; #else c[(i + loop + (j + loop) * ldc) * 2 + 0] += subbuffer[(i + j * nn) * 2 + 0] + subbuffer[(j + i * nn) * 2 + 0]; c[(i + loop + (j + loop) * ldc) * 2 + 1] += subbuffer[(i + j * nn) * 2 + 1] + subbuffer[(j + i * nn) * 2 + 1]; #endif } } #else for (j = 0; j < nn; j ++) { for (i = j; i < nn; i ++) { #ifndef COMPLEX c[i + loop + (j + loop) * ldc] += subbuffer[i + j * nn] + subbuffer[j + i * nn]; #else c[(i + loop + (j + loop) * ldc) * 2 + 0] += subbuffer[(i + j * nn) * 2 + 0] + 
subbuffer[(j + i * nn) * 2 + 0]; c[(i + loop + (j + loop) * ldc) * 2 + 1] += subbuffer[(i + j * nn) * 2 + 1] + subbuffer[(j + i * nn) * 2 + 1]; #endif } } #endif } #ifdef LOWER GEMM_KERNEL_N(m - mm - nn, nn, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE, c + (mm + nn + loop * ldc) * COMPSIZE, ldc); #endif } return 0; } OpenBLAS-0.2.20/driver/level3/syrk_k.c000066400000000000000000000073641313527062700173110ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
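/* Illustrative sketch only: for each GEMM_UNROLL_MN diagonal block the syr2k
   kernel above forms the full nn x nn product in subbuffer and then folds
   subbuffer + subbuffer^T into the stored triangle of C, which is the SYR2K
   update restricted to that block.  A compact real-valued version of the fold
   for the upper case (hypothetical helper, s is nn x nn column-major with
   leading dimension nn, not OpenBLAS code): */
static void fold_diag_block_ref(int nn, const double *s, double *c, int ldc)
{
  for (int j = 0; j < nn; j++)
    for (int i = 0; i <= j; i++)
      c[i + j * ldc] += s[i + j * nn] + s[j + i * nn];
}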
*/ /*********************************************************************/ #include #include "common.h" #undef TIMING #ifndef KERNEL_FUNC #ifndef LOWER #define KERNEL_FUNC SYRK_KERNEL_U #else #define KERNEL_FUNC SYRK_KERNEL_L #endif #endif static __inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLASLONG n_to, FLOAT *alpha, FLOAT *c, BLASLONG ldc) { BLASLONG i; #ifndef LOWER if (m_from > n_from) n_from = m_from; if (m_to > n_to ) m_to = n_to; #else if (m_from < n_from) m_from = n_from; if (m_to < n_to ) n_to = m_to; #endif c += (m_from + n_from * ldc) * COMPSIZE; m_to -= m_from; n_to -= n_from; for (i = 0; i < n_to; i++){ #ifndef LOWER SCAL_K(MIN(i + n_from - m_from + 1, m_to), 0, 0, alpha[0], #ifdef COMPLEX alpha[1], #endif c, 1, NULL, 0, NULL, 0); c += ldc * COMPSIZE; #else SCAL_K(MIN(m_to - i + m_from - n_from, m_to), 0, 0, alpha[0], #ifdef COMPLEX alpha[1], #endif c, 1, NULL, 0, NULL, 0); if (i < m_from - n_from) { c += ldc * COMPSIZE; } else { c += (1 + ldc) * COMPSIZE; } #endif } return 0; } #ifdef THREADED_LEVEL3 #include "level3_syrk_threaded.c" #else #include "level3_syrk.c" #endif OpenBLAS-0.2.20/driver/level3/syrk_kernel.c000066400000000000000000000135221313527062700203300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
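/* Illustrative sketch only: the syrk_k.c driver above selects SYRK_KERNEL_U or
   SYRK_KERNEL_L and reuses the same triangular beta pass; the operation being
   blocked is C := alpha*A*A^T + beta*C with only one triangle of C updated.
   An unblocked reference for the lower, non-transposed case (hypothetical
   helper, A is n x k column-major, not an OpenBLAS routine): */
static void syrk_ln_ref(int n, int k, double alpha, const double *a, int lda,
                        double beta, double *c, int ldc)
{
  for (int j = 0; j < n; j++)
    for (int i = j; i < n; i++) {          /* lower triangle only */
      double t = 0.0;
      for (int l = 0; l < k; l++)
        t += a[i + l * lda] * a[j + l * lda];
      c[i + j * ldc] = alpha * t + beta * c[i + j * ldc];
    }
}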
*/ /*********************************************************************/ #include #include "common.h" #ifndef CONJA #ifndef CONJB #define GEMM_KERNEL GEMM_KERNEL_N #else #define GEMM_KERNEL GEMM_KERNEL_R #endif #else #ifndef CONJB #define GEMM_KERNEL GEMM_KERNEL_L #else #define GEMM_KERNEL GEMM_KERNEL_B #endif #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #ifdef COMPLEX FLOAT alpha_i, #endif FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ BLASLONG i, j; BLASLONG loop; FLOAT *cc, *ss; FLOAT subbuffer[GEMM_UNROLL_MN * (GEMM_UNROLL_MN + 1) * COMPSIZE]; if (m + offset < 0) { #ifndef LOWER GEMM_KERNEL(m, n, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a, b, c, ldc); #endif return 0; } if (n < offset) { #ifdef LOWER GEMM_KERNEL(m, n, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a, b, c, ldc); #endif return 0; } if (offset > 0) { #ifdef LOWER GEMM_KERNEL(m, offset, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a, b, c, ldc); #endif b += offset * k * COMPSIZE; c += offset * ldc * COMPSIZE; n -= offset; offset = 0; if (n <= 0) return 0; } if (n > m + offset) { #ifndef LOWER GEMM_KERNEL(m, n - m - offset, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a, b + (m + offset) * k * COMPSIZE, c + (m + offset) * ldc * COMPSIZE, ldc); #endif n = m + offset; if (n <= 0) return 0; } if (offset < 0) { #ifndef LOWER GEMM_KERNEL(-offset, n, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a, b, c, ldc); #endif a -= offset * k * COMPSIZE; c -= offset * COMPSIZE; m += offset; offset = 0; if (m <= 0) return 0; } if (m > n - offset) { #ifdef LOWER GEMM_KERNEL(m - n + offset, n, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a + (n - offset) * k * COMPSIZE, b, c + (n - offset) * COMPSIZE, ldc); #endif m = n + offset; if (m <= 0) return 0; } for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) { int mm, nn; mm = (loop/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; nn = MIN(GEMM_UNROLL_MN, n - loop); #ifndef LOWER GEMM_KERNEL(mm, nn, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc); #endif GEMM_BETA(nn, nn, 0, ZERO, #ifdef COMPLEX ZERO, #endif NULL, 0, NULL, 0, subbuffer, nn); GEMM_KERNEL(nn, nn, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn); cc = c + (loop + loop * ldc) * COMPSIZE; ss = subbuffer; #ifndef LOWER for (j = 0; j < nn; j ++) { for (i = 0; i <= j; i ++) { #ifndef COMPLEX cc[i] += ss[i]; #else cc[i * 2 + 0] += ss[i * 2 + 0]; cc[i * 2 + 1] += ss[i * 2 + 1]; #endif } ss += nn * COMPSIZE; cc += ldc * COMPSIZE; } #else for (j = 0; j < nn; j ++) { for (i = j; i < nn; i ++) { #ifndef COMPLEX cc[i] += ss[i]; #else cc[i * 2 + 0] += ss[i * 2 + 0]; cc[i * 2 + 1] += ss[i * 2 + 1]; #endif } ss += nn * COMPSIZE; cc += ldc * COMPSIZE; } #endif #ifdef LOWER GEMM_KERNEL(m - mm - nn, nn, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE, c + (mm + nn + loop * ldc) * COMPSIZE, ldc); #endif } return 0; } OpenBLAS-0.2.20/driver/level3/syrk_thread.c000066400000000000000000000131671313527062700203240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include #include "common.h" int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) { blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range[MAX_CPU_NUMBER + 1]; BLASLONG width, i; BLASLONG n_from, n_to; double dnum, nf, nt, di; int num_cpu; int mask = 0; if (!(mode & BLAS_COMPLEX)) { switch (mode & BLAS_PREC) { case BLAS_SINGLE: mask = SGEMM_UNROLL_MN - 1; break; case BLAS_DOUBLE: mask = DGEMM_UNROLL_MN - 1; break; #ifdef EXPRECISION case BLAS_XDOUBLE: mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; break; #endif } } else { switch (mode & BLAS_PREC) { case BLAS_SINGLE: mask = CGEMM_UNROLL_MN - 1; break; case BLAS_DOUBLE: mask = ZGEMM_UNROLL_MN - 1; break; #ifdef EXPRECISION case BLAS_XDOUBLE: mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; break; #endif } } n_from = 0; n_to = arg -> n; if (range_n) { n_from = *(range_n + 0); n_to = *(range_n + 1); } if (!(mode & BLAS_UPLO)) { nf = (double)(n_from); nt = (double)(n_to); dnum = (nt * nt - nf * nf) / (double)nthreads; num_cpu = 0; range[0] = n_from; i = n_from; while (i < n_to){ if (nthreads - num_cpu > 1) { di = (double)i; width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1); if ((width <= 0) || (width > n_to - i)) width = n_to - i; } else { width = n_to - i; } range[num_cpu + 1] = range[num_cpu] + width; queue[num_cpu].mode = mode; queue[num_cpu].routine = function; queue[num_cpu].args = arg; queue[num_cpu].range_m = range_m; queue[num_cpu].range_n = &range[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } } else { nf = (double)(arg -> n - n_from); nt = (double)(arg -> n - n_to); dnum = (nt * nt - nf * nf) / (double)nthreads; num_cpu = 0; range[0] = n_from; i = n_from; while (i < n_to){ if (nthreads - num_cpu > 1) { di = (double)(arg -> n - i); width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * 
(mask+1); if ((width <= 0) || (width > n_to - i)) width = n_to - i; } else { width = n_to - i; } range[num_cpu + 1] = range[num_cpu] + width; queue[num_cpu].mode = mode; queue[num_cpu].routine = function; queue[num_cpu].args = arg; queue[num_cpu].range_m = range_m; queue[num_cpu].range_n = &range[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } } if (num_cpu) { queue[0].sa = sa; queue[0].sb = sb; queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } return 0; } OpenBLAS-0.2.20/driver/level3/trmm_L.c000066400000000000000000000270771313527062700172440ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
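/* Illustrative sketch only: the syrk_thread.c splitter above hands each thread a
   band of columns whose share of the triangle has roughly equal area.  With
   dnum = (nt*nt - nf*nf)/nthreads it solves (i + w)^2 - i^2 = dnum for the
   width w and rounds up to the kernel unroll (mask + 1).  A standalone version
   of that computation for the "lower" branch (hypothetical helper, not an
   OpenBLAS routine): */
#include <math.h>

static int split_triangle_ref(int n_from, int n_to, int nthreads, int unroll,
                              int *range /* needs nthreads + 1 entries */)
{
  double nf = (double)n_from, nt = (double)n_to;
  double dnum = (nt * nt - nf * nf) / (double)nthreads;  /* target area per thread */
  int num_cpu = 0, i = n_from;

  range[0] = n_from;
  while (i < n_to) {
    int width;
    if (nthreads - num_cpu > 1) {
      double di = (double)i;
      /* solve (i + w)^2 - i^2 = dnum, then round w up to the unroll */
      width = (int)((sqrt(di * di + dnum) - di + unroll - 1) / unroll) * unroll;
      if (width <= 0 || width > n_to - i) width = n_to - i;
    } else {
      width = n_to - i;              /* last thread takes the remainder */
    }
    range[num_cpu + 1] = range[num_cpu] + width;
    num_cpu++;
    i += width;
  }
  return num_cpu;                    /* number of partitions actually used */
}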
*/ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dp1 = 1.; #ifdef CONJ #define GEMM_KERNEL GEMM_KERNEL_L #define TRMM_KERNEL_N TRMM_KERNEL_LR #define TRMM_KERNEL_T TRMM_KERNEL_LC #else #define GEMM_KERNEL GEMM_KERNEL_N #define TRMM_KERNEL_N TRMM_KERNEL_LN #define TRMM_KERNEL_T TRMM_KERNEL_LT #endif #undef TIMING #ifdef TIMING #define START_RPCC() rpcc_counter = rpcc() #define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter #else #define START_RPCC() #define STOP_RPCC(COUNTER) #endif int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) { BLASLONG m, n, lda, ldb; FLOAT *beta, *a, *b; BLASLONG ls, is, js; BLASLONG min_l, min_i, min_j; BLASLONG jjs, min_jj; #ifdef TIMING unsigned long long rpcc_counter; unsigned long long innercost = 0; unsigned long long outercost = 0; unsigned long long gemmcost = 0; unsigned long long trmmcost = 0; double total; #endif m = args -> m; n = args -> n; a = (FLOAT *)args -> a; b = (FLOAT *)args -> b; lda = args -> lda; ldb = args -> ldb; beta = (FLOAT *)args -> beta; if (range_n) { BLASLONG n_from = *(((BLASLONG *)range_n) + 0); BLASLONG n_to = *(((BLASLONG *)range_n) + 1); n = n_to - n_from; b += n_from * ldb * COMPSIZE; } if (beta) { #ifndef COMPLEX if (beta[0] != ONE) GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb); if (beta[0] == ZERO) return 0; #else if ((beta[0] != ONE) || (beta[1] != ZERO)) GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb); if ((beta[0] == ZERO) && (beta[1] == ZERO)) return 0; #endif } for(js = 0; js < n; js += GEMM_R){ min_j = n - js; if (min_j > GEMM_R) min_j = GEMM_R; #if (defined(UPPER) && !defined(TRANSA)) || (!defined(UPPER) && defined(TRANSA)) min_l = m; if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = min_l; if (min_i > GEMM_P) min_i = GEMM_P; START_RPCC(); #ifndef TRANSA TRMM_IUTCOPY(min_l, min_i, a, lda, 0, 0, sa); #else TRMM_ILNCOPY(min_l, min_i, a, lda, 0, 0, sa); #endif STOP_RPCC(innercost); for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; START_RPCC(); GEMM_ONCOPY(min_l, min_jj, b + (jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); STOP_RPCC(outercost); START_RPCC(); TRMM_KERNEL_N(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * (jjs - js) * COMPSIZE, b + (jjs * ldb) * COMPSIZE, ldb, 0); STOP_RPCC(trmmcost); } for(is = min_i; is < min_l; is += GEMM_P){ min_i = min_l - is; if (min_i > GEMM_P) min_i = GEMM_P; START_RPCC(); #ifndef TRANSA TRMM_IUTCOPY(min_l, min_i, a, lda, 0, is, sa); #else TRMM_ILNCOPY(min_l, min_i, a, lda, 0, is, sa); #endif STOP_RPCC(innercost); START_RPCC(); TRMM_KERNEL_N(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is); STOP_RPCC(trmmcost); } for(ls = min_l; ls < m; ls += GEMM_Q){ min_l = m - ls; if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = ls; if (min_i > GEMM_P) min_i = GEMM_P; START_RPCC(); #ifndef TRANSA GEMM_ITCOPY(min_l, min_i, a + (ls * lda) * COMPSIZE, lda, sa); #else GEMM_INCOPY(min_l, min_i, a + (ls ) * COMPSIZE, lda, sa); #endif STOP_RPCC(innercost); for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; START_RPCC(); GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + 
min_l * (jjs - js) * COMPSIZE); STOP_RPCC(gemmcost); START_RPCC(); GEMM_KERNEL(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * (jjs - js) * COMPSIZE, b + (jjs * ldb) * COMPSIZE, ldb); STOP_RPCC(gemmcost); } for(is = min_i; is < ls; is += GEMM_P){ min_i = ls - is; if (min_i > GEMM_P) min_i = GEMM_P; START_RPCC(); #ifndef TRANSA GEMM_ITCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, sa); #else GEMM_INCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, sa); #endif STOP_RPCC(innercost); START_RPCC(); GEMM_KERNEL(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); STOP_RPCC(gemmcost); } for(is = ls; is < ls + min_l; is += GEMM_P){ min_i = ls + min_l - is; if (min_i > GEMM_P) min_i = GEMM_P; START_RPCC(); #ifndef TRANSA TRMM_IUTCOPY(min_l, min_i, a, lda, ls, is, sa); #else TRMM_ILNCOPY(min_l, min_i, a, lda, ls, is, sa); #endif STOP_RPCC(innercost); START_RPCC(); TRMM_KERNEL_N(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls); STOP_RPCC(trmmcost); } } #else min_l = m; if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = min_l; if (min_i > GEMM_P) min_i = GEMM_P; START_RPCC(); #ifndef TRANSA TRMM_ILTCOPY(min_l, min_i, a, lda, m - min_l, m - min_l, sa); #else TRMM_IUNCOPY(min_l, min_i, a, lda, m - min_l, m - min_l, sa); #endif STOP_RPCC(innercost); for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; START_RPCC(); GEMM_ONCOPY(min_l, min_jj, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); STOP_RPCC(outercost); START_RPCC(); TRMM_KERNEL_T(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * (jjs - js) * COMPSIZE, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb, 0); STOP_RPCC(trmmcost); } for(is = m - min_l + min_i; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; START_RPCC(); #ifndef TRANSA TRMM_ILTCOPY(min_l, min_i, a, lda, m - min_l, is, sa); #else TRMM_IUNCOPY(min_l, min_i, a, lda, m - min_l, is, sa); #endif STOP_RPCC(innercost); START_RPCC(); TRMM_KERNEL_T(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - m + min_l); STOP_RPCC(trmmcost); } for(ls = m - min_l; ls > 0; ls -= GEMM_Q){ min_l = ls; if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = min_l; if (min_i > GEMM_P) min_i = GEMM_P; START_RPCC(); #ifndef TRANSA TRMM_ILTCOPY(min_l, min_i, a, lda, ls - min_l, ls - min_l, sa); #else TRMM_IUNCOPY(min_l, min_i, a, lda, ls - min_l, ls - min_l, sa); #endif STOP_RPCC(innercost); for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; START_RPCC(); GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); STOP_RPCC(outercost); START_RPCC(); TRMM_KERNEL_T(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * (jjs - js) * COMPSIZE, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, 0); STOP_RPCC(trmmcost); } for(is = ls - min_l + min_i; is < ls; is += GEMM_P){ min_i = ls - is; if (min_i > GEMM_P) min_i = GEMM_P; START_RPCC(); #ifndef TRANSA TRMM_ILTCOPY(min_l, min_i, a, lda, ls - min_l, is, sa); #else TRMM_IUNCOPY(min_l, min_i, a, lda, ls - min_l, is, sa); #endif STOP_RPCC(innercost); START_RPCC(); TRMM_KERNEL_T(min_i, min_j, 
min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls + min_l); STOP_RPCC(trmmcost); } for(is = ls; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; START_RPCC(); #ifndef TRANSA GEMM_ITCOPY(min_l, min_i, a + (is + (ls - min_l) * lda) * COMPSIZE, lda, sa); #else GEMM_INCOPY(min_l, min_i, a + ((ls - min_l) + is * lda) * COMPSIZE, lda, sa); #endif STOP_RPCC(innercost); START_RPCC(); GEMM_KERNEL(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); STOP_RPCC(gemmcost); } } #endif } #ifdef TIMING total = (double)outercost + (double)innercost + (double)gemmcost + (double)trmmcost; printf( "Copy A : %5.2f Copy B: %5.2f GEMM Kernel : %5.2f TRMM Kerlnel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n", innercost / total * 100., outercost / total * 100., gemmcost / total * 100., trmmcost / total * 100., (double)n * (double)n * (double)n / (double)(trmmcost + gemmcost) * 100. * (double)COMPSIZE / 2., (double)n * (double)n * (double)n / total * 100. * (double)COMPSIZE / 2.); #endif return 0; } OpenBLAS-0.2.20/driver/level3/trmm_R.c000066400000000000000000000252641313527062700172460ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
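/* Illustrative sketch only: trmm_L.c above applies the scalar to B once up front
   (alpha*A*B == A*(alpha*B)) and then walks GEMM_R-wide column panels,
   GEMM_Q-deep panels of A and GEMM_P-tall row blocks, using TRMM kernels for the
   diagonal panel and plain GEMM kernels for the rectangular remainder.  An
   unblocked, in-place reference for the left/upper/no-transpose case it covers
   (hypothetical helper, not an OpenBLAS routine): */
static void trmm_lun_ref(int m, int n, double alpha,
                         const double *a, int lda, double *b, int ldb)
{
  for (int j = 0; j < n; j++)
    for (int i = 0; i < m; i++) {
      /* rows are overwritten top to bottom, so b(k,j) for k >= i is
         still the original value when row i is formed */
      double t = 0.0;
      for (int k = i; k < m; k++)
        t += a[i + k * lda] * b[k + j * ldb];
      b[i + j * ldb] = alpha * t;
    }
}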
*/ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dp1 = 1.; #ifdef CONJ #define GEMM_KERNEL GEMM_KERNEL_R #define TRMM_KERNEL_N TRMM_KERNEL_RR #define TRMM_KERNEL_T TRMM_KERNEL_RC #else #define GEMM_KERNEL GEMM_KERNEL_N #define TRMM_KERNEL_N TRMM_KERNEL_RN #define TRMM_KERNEL_T TRMM_KERNEL_RT #endif #if 0 #undef GEMM_P #undef GEMM_Q #undef GEMM_R #define GEMM_P 8 #define GEMM_Q 12 #define GEMM_R 16 #endif int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) { BLASLONG m, n, lda, ldb; FLOAT *beta, *a, *b; BLASLONG ls, is, js; BLASLONG min_l, min_i, min_j; BLASLONG jjs, min_jj; #if !((!defined(UPPER) && !defined(TRANSA)) || (defined(UPPER) && defined(TRANSA))) BLASLONG start_ls; #endif m = args -> m; n = args -> n; a = (FLOAT *)args -> a; b = (FLOAT *)args -> b; lda = args -> lda; ldb = args -> ldb; beta = (FLOAT *)args -> beta; if (range_m) { BLASLONG m_from = *(((BLASLONG *)range_m) + 0); BLASLONG m_to = *(((BLASLONG *)range_m) + 1); m = m_to - m_from; b += m_from * COMPSIZE; } if (beta) { #ifndef COMPLEX if (beta[0] != ONE) GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb); if (beta[0] == ZERO) return 0; #else if ((beta[0] != ONE) || (beta[1] != ZERO)) GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb); if ((beta[0] == ZERO) && (beta[1] == ZERO)) return 0; #endif } #if (!defined(UPPER) && !defined(TRANSA)) || (defined(UPPER) && defined(TRANSA)) for(js = 0; js < n; js += GEMM_R){ min_j = n - js; if (min_j > GEMM_R) min_j = GEMM_R; for(ls = js; ls < js + min_j; ls += GEMM_Q){ min_l = js + min_j - ls; if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = m; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); for(jjs = 0; jjs < ls - js; jjs += min_jj){ min_jj = ls - js - jjs; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + (js + jjs) * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE); #else GEMM_OTCOPY(min_l, min_jj, a + ((js + jjs) + ls * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE); #endif GEMM_KERNEL(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * jjs * COMPSIZE, b + ((js + jjs) * ldb) * COMPSIZE, ldb); } for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #ifndef TRANSA TRMM_OLNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE); #else TRMM_OUTCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE); #endif TRMM_KERNEL_T(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + (ls - js + jjs) * min_l * COMPSIZE, b + ((ls + jjs) * ldb) * COMPSIZE, ldb, -jjs); } for(is = min_i; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); GEMM_KERNEL(min_i, ls - js, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); TRMM_KERNEL_T(min_i, min_l, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + (ls - js) * min_l * COMPSIZE, b + (is + ls * ldb) * COMPSIZE, ldb, 0); } } for(ls = js + min_j; ls < n; ls += GEMM_Q){ min_l = n - ls; if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = m; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, 
sa); for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); #else GEMM_OTCOPY(min_l, min_jj, a + (jjs + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); #endif GEMM_KERNEL(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * (jjs - js) * COMPSIZE, b + (jjs * ldb) * COMPSIZE, ldb); } for(is = min_i; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); GEMM_KERNEL(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); } } } #else for(js = n; js > 0; js -= GEMM_R){ min_j = js; if (min_j > GEMM_R) min_j = GEMM_R; start_ls = js - min_j; while (start_ls + GEMM_Q < js) start_ls += GEMM_Q; for(ls = start_ls; ls >= js - min_j; ls -= GEMM_Q){ min_l = js - ls; if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = m; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #ifndef TRANSA TRMM_OUNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE); #else TRMM_OLTCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE); #endif TRMM_KERNEL_N(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * jjs * COMPSIZE, b + ((ls + jjs) * ldb) * COMPSIZE, ldb, -jjs); } for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ min_jj = js - ls - min_l - jjs; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda, sb + min_l * (min_l + jjs) * COMPSIZE); #else GEMM_OTCOPY(min_l, min_jj, a + ((ls + min_l + jjs) + ls * lda) * COMPSIZE, lda, sb + min_l * (min_l + jjs) * COMPSIZE); #endif GEMM_KERNEL(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * (min_l + jjs) * COMPSIZE, b + ((ls + min_l + jjs) * ldb) * COMPSIZE, ldb); } for(is = min_i; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); TRMM_KERNEL_N(min_i, min_l, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + ls * ldb) * COMPSIZE, ldb, 0); if (js - ls - min_l > 0) { GEMM_KERNEL(min_i, js - ls - min_l, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * min_l * COMPSIZE, b + (is + (ls + min_l) * ldb) * COMPSIZE, ldb); } } } for(ls = 0; ls < js - min_j; ls += GEMM_Q){ min_l = js - min_j - ls; if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = m; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); #else GEMM_OTCOPY(min_l, min_jj, a + ((jjs - min_j) + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); #endif GEMM_KERNEL(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif 
sa, sb + min_l * (jjs - js) * COMPSIZE, b + ((jjs - min_j) * ldb) * COMPSIZE, ldb); } for(is = min_i; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); GEMM_KERNEL(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + (js - min_j) * ldb) * COMPSIZE, ldb); } } } #endif return 0; } OpenBLAS-0.2.20/driver/level3/trsm_L.c000066400000000000000000000200651313527062700172400ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
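/* Illustrative sketch only: every pack loop in these drivers carves the current
   column panel into chunks of at most 3*GEMM_UNROLL_N columns (falling back to
   exactly GEMM_UNROLL_N while more than one unroll remains), so the packed
   buffer stays aligned with the kernel's N unrolling.  The rule, pulled out as a
   standalone helper (hypothetical name, not an OpenBLAS routine):
     for (jjs = js; jjs < js + min_j; jjs += min_jj) {
       min_jj = next_panel_width_ref(min_j + js - jjs, GEMM_UNROLL_N);
       ... pack and run the kernel on min_jj columns ...
     }                                                                  */
static int next_panel_width_ref(int remaining, int unroll_n)
{
  int w = remaining;
  if (w > unroll_n * 3) w = unroll_n * 3;
  else if (w > unroll_n) w = unroll_n;
  return w;
}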
*/ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dm1 = -1.; #ifdef CONJ #define GEMM_KERNEL GEMM_KERNEL_L #if (!defined(TRANSA) && defined(UPPER)) || (defined(TRANSA) && !defined(UPPER)) #define TRSM_KERNEL TRSM_KERNEL_LR #else #define TRSM_KERNEL TRSM_KERNEL_LC #endif #else #define GEMM_KERNEL GEMM_KERNEL_N #if (!defined(TRANSA) && defined(UPPER)) || (defined(TRANSA) && !defined(UPPER)) #define TRSM_KERNEL TRSM_KERNEL_LN #else #define TRSM_KERNEL TRSM_KERNEL_LT #endif #endif #if 0 #undef GEMM_P #undef GEMM_Q #undef GEMM_R #define GEMM_P 8 #define GEMM_Q 12 #define GEMM_R 1600 #endif int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) { BLASLONG m, n, lda, ldb; FLOAT *beta, *a, *b; BLASLONG ls, is, js; BLASLONG min_l, min_i, min_j; BLASLONG jjs, min_jj; #if !((!defined(UPPER) && !defined(TRANSA)) || (defined(UPPER) && defined(TRANSA))) BLASLONG start_is; #endif m = args -> m; n = args -> n; a = (FLOAT *)args -> a; b = (FLOAT *)args -> b; lda = args -> lda; ldb = args -> ldb; beta = (FLOAT *)args -> beta; if (range_n) { BLASLONG n_from = *(((BLASLONG *)range_n) + 0); BLASLONG n_to = *(((BLASLONG *)range_n) + 1); n = n_to - n_from; b += n_from * ldb * COMPSIZE; } if (beta) { #ifndef COMPLEX if (beta[0] != ONE) GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb); if (beta[0] == ZERO) return 0; #else if ((beta[0] != ONE) || (beta[1] != ZERO)) GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb); if ((beta[0] == ZERO) && (beta[1] == ZERO)) return 0; #endif } for(js = 0; js < n; js += GEMM_R){ min_j = n - js; if (min_j > GEMM_R) min_j = GEMM_R; #if (!defined(UPPER) && !defined(TRANSA)) || (defined(UPPER) && defined(TRANSA)) for(ls = 0; ls < m; ls += GEMM_Q){ min_l = m - ls; if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = min_l; if (min_i > GEMM_P) min_i = GEMM_P; #ifndef TRANSA TRSM_ILTCOPY(min_l, min_i, a + (ls + ls * lda) * COMPSIZE, lda, 0, sa); #else TRSM_IUNCOPY(min_l, min_i, a + (ls + ls * lda) * COMPSIZE, lda, 0, sa); #endif for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); TRSM_KERNEL(min_i, min_jj, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * (jjs - js) * COMPSIZE, b + (ls + jjs * ldb) * COMPSIZE, ldb, 0); } for(is = ls + min_i; is < ls + min_l; is += GEMM_P){ min_i = ls + min_l - is; if (min_i > GEMM_P) min_i = GEMM_P; #ifndef TRANSA TRSM_ILTCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, is - ls, sa); #else TRSM_IUNCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, is - ls, sa); #endif TRSM_KERNEL(min_i, min_j, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls); } for(is = ls + min_l; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; #ifndef TRANSA GEMM_ITCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, sa); #else GEMM_INCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, sa); #endif GEMM_KERNEL(min_i, min_j, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); } } #else for(ls = m; ls > 0; ls -= GEMM_Q){ min_l = ls; if (min_l > GEMM_Q) min_l = GEMM_Q; start_is = ls - min_l; while (start_is + GEMM_P < ls) start_is += GEMM_P; min_i = ls - start_is; if (min_i > GEMM_P) min_i = 
GEMM_P; #ifndef TRANSA TRSM_IUTCOPY(min_l, min_i, a + (start_is + (ls - min_l) * lda) * COMPSIZE, lda, start_is - (ls - min_l), sa); #else TRSM_ILNCOPY(min_l, min_i, a + ((ls - min_l) + start_is * lda) * COMPSIZE, lda, start_is - (ls - min_l), sa); #endif for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); TRSM_KERNEL(min_i, min_jj, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * (jjs - js) * COMPSIZE, b + (start_is + jjs * ldb) * COMPSIZE, ldb, start_is - ls + min_l); } for(is = start_is - GEMM_P; is >= ls - min_l; is -= GEMM_P){ min_i = ls - is; if (min_i > GEMM_P) min_i = GEMM_P; #ifndef TRANSA TRSM_IUTCOPY(min_l, min_i, a + (is + (ls - min_l) * lda) * COMPSIZE, lda, is - (ls - min_l), sa); #else TRSM_ILNCOPY(min_l, min_i, a + ((ls - min_l) + is * lda) * COMPSIZE, lda, is - (ls - min_l), sa); #endif TRSM_KERNEL(min_i, min_j, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, + is - (ls - min_l) ); } for(is = 0; is < ls - min_l; is += GEMM_P){ min_i = ls - min_l - is; if (min_i > GEMM_P) min_i = GEMM_P; #ifndef TRANSA GEMM_ITCOPY(min_l, min_i, a + (is + (ls - min_l) * lda) * COMPSIZE, lda, sa); #else GEMM_INCOPY(min_l, min_i, a + ((ls - min_l) + is * lda) * COMPSIZE, lda, sa); #endif GEMM_KERNEL(min_i, min_j, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); } } #endif } return 0; } OpenBLAS-0.2.20/driver/level3/trsm_R.c000066400000000000000000000250101313527062700172410ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
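/* Illustrative sketch only: the trsm_L.c driver above solves op(A)*X = alpha*B in
   place, panel by panel: TRSM kernels handle the current diagonal block, and the
   GEMM kernels invoked with the constant dm1 = -1 subtract
   A(off-diagonal) * X(already solved) from the rows still to be processed.  An
   unblocked reference for the lower/no-transpose case (forward substitution,
   non-unit diagonal; hypothetical helper, not an OpenBLAS routine): */
static void trsm_lln_ref(int m, int n, double alpha,
                         const double *a, int lda, double *b, int ldb)
{
  for (int j = 0; j < n; j++)
    for (int i = 0; i < m; i++) {            /* forward substitution */
      double t = alpha * b[i + j * ldb];
      for (int k = 0; k < i; k++)
        t -= a[i + k * lda] * b[k + j * ldb];
      b[i + j * ldb] = t / a[i + i * lda];
    }
}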
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" const static FLOAT dm1 = -1.; #ifdef CONJ #define GEMM_KERNEL GEMM_KERNEL_R #if (!defined(TRANSA) && defined(UPPER)) || (defined(TRANSA) && !defined(UPPER)) #define TRSM_KERNEL TRSM_KERNEL_RR #else #define TRSM_KERNEL TRSM_KERNEL_RC #endif #else #define GEMM_KERNEL GEMM_KERNEL_N #if (!defined(TRANSA) && defined(UPPER)) || (defined(TRANSA) && !defined(UPPER)) #define TRSM_KERNEL TRSM_KERNEL_RN #else #define TRSM_KERNEL TRSM_KERNEL_RT #endif #endif #if 0 #undef GEMM_P #undef GEMM_Q #undef GEMM_R #define GEMM_P 16 #define GEMM_Q 20 #define GEMM_R 24 #endif int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) { BLASLONG m, n, lda, ldb; FLOAT *beta, *a, *b; BLASLONG ls, is, js; BLASLONG min_l, min_i, min_j; BLASLONG jjs, min_jj; #if !((defined(UPPER) && !defined(TRANSA)) || (!defined(UPPER) && defined(TRANSA))) BLASLONG start_ls; #endif m = args -> m; n = args -> n; a = (FLOAT *)args -> a; b = (FLOAT *)args -> b; lda = args -> lda; ldb = args -> ldb; beta = (FLOAT *)args -> beta; if (range_m) { BLASLONG m_from = *(((BLASLONG *)range_m) + 0); BLASLONG m_to = *(((BLASLONG *)range_m) + 1); m = m_to - m_from; b += m_from * COMPSIZE; } if (beta) { #ifndef COMPLEX if (beta[0] != ONE) GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb); if (beta[0] == ZERO) return 0; #else if ((beta[0] != ONE) || (beta[1] != ZERO)) GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb); if ((beta[0] == ZERO) && (beta[1] == ZERO)) return 0; #endif } #if (defined(UPPER) && !defined(TRANSA)) || (!defined(UPPER) && defined(TRANSA)) for(js = 0; js < n; js += GEMM_R){ min_j = n - js; if (min_j > GEMM_R) min_j = GEMM_R; for(ls = 0; ls < js; ls += GEMM_Q){ min_l = js - ls; if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = m; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); #else GEMM_OTCOPY(min_l, min_jj, a + (jjs + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); #endif GEMM_KERNEL(min_i, min_jj, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * (jjs - js) * COMPSIZE, b + (jjs * ldb) * COMPSIZE, ldb); } for(is = min_i; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); GEMM_KERNEL(min_i, min_j, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); } } for(ls = js; ls < js + min_j; ls += GEMM_Q){ min_l = js + min_j - ls; if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = m; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); #ifndef TRANSA TRSM_OUNCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, 0, sb); #else TRSM_OLTCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, 0, sb); #endif TRSM_KERNEL(min_i, min_l, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (ls * 
ldb) * COMPSIZE, ldb, 0); for(jjs = 0; jjs < min_j - min_l - ls + js; jjs += min_jj){ min_jj = min_j - min_l - ls + js - jjs; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #ifndef TRANSA GEMM_ONCOPY (min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda, sb + min_l * (min_l + jjs) * COMPSIZE); #else GEMM_OTCOPY (min_l, min_jj, a + ((ls + min_l + jjs) + ls * lda) * COMPSIZE, lda, sb + min_l * (min_l + jjs) * COMPSIZE); #endif GEMM_KERNEL(min_i, min_jj, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * (min_l + jjs) * COMPSIZE, b + (min_l + ls + jjs) * ldb * COMPSIZE, ldb); } for(is = min_i; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); TRSM_KERNEL(min_i, min_l, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + ls * ldb) * COMPSIZE, ldb, 0); GEMM_KERNEL(min_i, min_j - min_l + js - ls, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * min_l * COMPSIZE, b + (is + ( min_l + ls) * ldb) * COMPSIZE, ldb); } } } #else for(js = n; js > 0; js -= GEMM_R){ min_j = js; if (min_j > GEMM_R) min_j = GEMM_R; for (ls = js; ls < n; ls += GEMM_Q) { min_l = n - ls; if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = m; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); #else GEMM_OTCOPY(min_l, min_jj, a + ((jjs - min_j) + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); #endif GEMM_KERNEL(min_i, min_jj, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * (jjs - js) * COMPSIZE, b + (jjs - min_j) * ldb * COMPSIZE, ldb); } for(is = min_i; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); GEMM_KERNEL(min_i, min_j, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + (js - min_j) * ldb) * COMPSIZE, ldb); } } start_ls = js - min_j; while (start_ls + GEMM_Q < js) start_ls += GEMM_Q; for(ls = start_ls; ls >= js - min_j; ls -= GEMM_Q){ min_l = js - ls; if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = m; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); #ifndef TRANSA TRSM_OLNCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, 0, sb + min_l * (min_j - js + ls) * COMPSIZE); #else TRSM_OUTCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, 0, sb + min_l * (min_j - js + ls) * COMPSIZE); #endif TRSM_KERNEL(min_i, min_l, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * (min_j - js + ls) * COMPSIZE, b + (ls * ldb) * COMPSIZE, ldb, 0); for(jjs = 0; jjs < min_j - js + ls; jjs += min_jj){ min_jj = min_j - js + ls - jjs; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #ifndef TRANSA GEMM_ONCOPY (min_l, min_jj, a + (ls + (js - min_j + jjs) * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE); #else GEMM_OTCOPY (min_l, min_jj, a + ((js - min_j + jjs) + ls * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE); #endif GEMM_KERNEL(min_i, min_jj, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * jjs * COMPSIZE, b + (js - min_j + jjs) * ldb * 
COMPSIZE, ldb); } for(is = min_i; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); TRSM_KERNEL(min_i, min_l, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * (min_j - js + ls) * COMPSIZE, b + (is + ls * ldb) * COMPSIZE, ldb, 0); GEMM_KERNEL(min_i, min_j - js + ls, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + (js - min_j) * ldb) * COMPSIZE, ldb); } } } #endif return 0; } OpenBLAS-0.2.20/driver/level3/zhemm_k.c000066400000000000000000000067001313527062700174320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
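/* Illustrative sketch only: the right-side variant above solves X*op(A) = alpha*B
   in place, one column panel at a time; already solved panels are folded into
   the remaining columns through the dm1 = -1 GEMM updates.  An unblocked
   reference for the upper/no-transpose case (columns solved left to right,
   non-unit diagonal; hypothetical helper, not an OpenBLAS routine): */
static void trsm_run_ref(int m, int n, double alpha,
                         const double *a, int lda, double *b, int ldb)
{
  for (int j = 0; j < n; j++)
    for (int i = 0; i < m; i++) {
      double t = alpha * b[i + j * ldb];
      for (int k = 0; k < j; k++)
        t -= b[i + k * ldb] * a[k + j * lda];
      b[i + j * ldb] = t / a[j + j * lda];
    }
}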
*/ /*********************************************************************/ #include #include "common.h" #undef TIMING #ifndef RSIDE #ifndef LOWER #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM_IUTCOPY(M, N, A, LDA, Y, X, BUFFER); #else #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM_ILTCOPY(M, N, A, LDA, Y, X, BUFFER); #endif #endif #ifdef RSIDE #ifndef LOWER #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM_OUTCOPY(M, N, A, LDA, Y, X, BUFFER); #else #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM_OLTCOPY(M, N, A, LDA, Y, X, BUFFER); #endif #endif #ifndef RSIDE #define K args -> m #ifndef LOWER #define GEMM_LOCAL HEMM_LU #else #define GEMM_LOCAL HEMM_LL #endif #else #define K args -> n #ifndef LOWER #define GEMM_LOCAL HEMM_RU #else #define GEMM_LOCAL HEMM_RL #endif #endif #ifdef THREADED_LEVEL3 #include "level3_thread.c" #else #include "level3.c" #endif OpenBLAS-0.2.20/driver/level3/zher2k_k.c000066400000000000000000000122251313527062700175160ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef LOWER #ifndef CONJ #ifdef XDOUBLE #define KERNEL_FUNC xher2k_kernel_UN #elif defined(DOUBLE) #define KERNEL_FUNC zher2k_kernel_UN #else #define KERNEL_FUNC cher2k_kernel_UN #endif #else #ifdef XDOUBLE #define KERNEL_FUNC xher2k_kernel_UC #elif defined(DOUBLE) #define KERNEL_FUNC zher2k_kernel_UC #else #define KERNEL_FUNC cher2k_kernel_UC #endif #endif #else #ifndef CONJ #ifdef XDOUBLE #define KERNEL_FUNC xher2k_kernel_LN #elif defined(DOUBLE) #define KERNEL_FUNC zher2k_kernel_LN #else #define KERNEL_FUNC cher2k_kernel_LN #endif #else #ifdef XDOUBLE #define KERNEL_FUNC xher2k_kernel_LC #elif defined(DOUBLE) #define KERNEL_FUNC zher2k_kernel_LC #else #define KERNEL_FUNC cher2k_kernel_LC #endif #endif #endif #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y, FLAG) \ KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y), FLAG) #define KERNEL_OPERATION_C(M, N, K, ALPHA, SA, SB, C, LDC, X, Y, FLAG) \ KERNEL_FUNC(M, N, K, ALPHA[0], -ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y), FLAG) #if !defined(LOWER) && !defined(TRANS) #define SYRK_LOCAL HER2K_UN #elif !defined(LOWER) && defined(TRANS) #define SYRK_LOCAL HER2K_UC #elif defined(LOWER) && !defined(TRANS) #define SYRK_LOCAL HER2K_LN #else #define SYRK_LOCAL HER2K_LC #endif #undef SCAL_K #ifdef XDOUBLE #define SCAL_K QSCAL_K #elif defined(DOUBLE) #define SCAL_K DSCAL_K #else #define SCAL_K SSCAL_K #endif static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLASLONG n_to, FLOAT *alpha, FLOAT *c, BLASLONG ldc) { BLASLONG i; #ifndef LOWER if (m_from > n_from) n_from = m_from; if (m_to > n_to ) m_to = n_to; #else if (m_from < n_from) m_from = n_from; if (m_to < n_to ) n_to = m_to; #endif c += (m_from + n_from * ldc) * COMPSIZE; m_to -= m_from; n_to -= n_from; for (i = 0; i < n_to; i++){ #ifndef LOWER SCAL_K(MIN(i + n_from - m_from + 1, m_to) * COMPSIZE, 0, 0, alpha[0], c, 1, NULL, 0, NULL, 0); if (i + n_from - m_from + 1 <= m_to) *(c + (i + n_from - m_from) * COMPSIZE + 1) = ZERO; c += ldc * COMPSIZE; #else SCAL_K(MIN(m_to - i + m_from - n_from, m_to) * COMPSIZE, 0, 0, alpha[0], c, 1, NULL, 0, NULL, 0); if (i < m_from - n_from) { c += ldc * COMPSIZE; } else { *(c + 1) = ZERO; c += (1 + ldc) * COMPSIZE; } #endif } return 0; } #ifdef THREADED_LEVEL3 #include "level3_syr2k_threaded.c" #else #include "level3_syr2k.c" #endif OpenBLAS-0.2.20/driver/level3/zher2k_kernel.c000066400000000000000000000140171313527062700205450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef CONJ #define GEMM_KERNEL GEMM_KERNEL_R #define GEMM_KERNEL_B0 GEMM_KERNEL_R_B0 #else #define GEMM_KERNEL GEMM_KERNEL_L #define GEMM_KERNEL_B0 GEMM_KERNEL_L_B0 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset, int flag){ BLASLONG i, j; BLASLONG loop; FLOAT subbuffer[GEMM_UNROLL_MN * GEMM_UNROLL_MN * COMPSIZE]; if (m + offset < 0) { #ifndef LOWER GEMM_KERNEL(m, n, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a, b, c, ldc); #endif return 0; } if (n < offset) { #ifdef LOWER GEMM_KERNEL(m, n, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a, b, c, ldc); #endif return 0; } if (offset > 0) { #ifdef LOWER GEMM_KERNEL(m, offset, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a, b, c, ldc); #endif b += offset * k * COMPSIZE; c += offset * ldc * COMPSIZE; n -= offset; offset = 0; if (n <= 0) return 0; } if (n > m + offset) { #ifndef LOWER GEMM_KERNEL(m, n - m - offset, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a, b + (m + offset) * k * COMPSIZE, c + (m + offset) * ldc * COMPSIZE, ldc); #endif n = m + offset; if (n <= 0) return 0; } if (offset < 0) { #ifndef LOWER GEMM_KERNEL(-offset, n, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a, b, c, ldc); #endif a -= offset * k * COMPSIZE; c -= offset * COMPSIZE; m += offset; offset = 0; if (m <= 0) return 0; } if (m > n - offset) { #ifdef LOWER GEMM_KERNEL(m - n + offset, n, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a + (n - offset) * k * COMPSIZE, b, c + (n - offset) * COMPSIZE, ldc); #endif m = n + offset; if (m <= 0) return 0; } for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) { int mm, nn; mm = (loop/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; nn = MIN(GEMM_UNROLL_MN, n - loop); #ifndef LOWER GEMM_KERNEL(mm, nn, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc); #endif if (flag) { GEMM_BETA(nn, nn, 0, ZERO, #ifdef COMPLEX ZERO, #endif NULL, 0, NULL, 0, subbuffer, nn); GEMM_KERNEL(nn, nn, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn); #ifndef LOWER for (j = 0; j < nn; j ++) { for (i = 0; i <= j; i ++) { c[(i + loop + (j + loop) * ldc) * 2 + 0] += subbuffer[(i + j * nn) * 2 + 0] + subbuffer[(j + i * nn) * 2 + 0]; if (i != j) { c[(i + loop + (j + loop) * ldc) * 2 + 1] += 
subbuffer[(i + j * nn) * 2 + 1] - subbuffer[(j + i * nn) * 2 + 1]; } else { c[(i + loop + (j + loop) * ldc) * 2 + 1] = ZERO; } } } #else for (j = 0; j < nn; j ++) { for (i = j; i < nn; i ++) { c[(i + loop + (j + loop) * ldc) * 2 + 0] += subbuffer[(i + j * nn) * 2 + 0] + subbuffer[(j + i * nn) * 2 + 0]; if (i != j) { c[(i + loop + (j + loop) * ldc) * 2 + 1] += subbuffer[(i + j * nn) * 2 + 1] - subbuffer[(j + i * nn) * 2 + 1]; } else { c[(i + loop + (j + loop) * ldc) * 2 + 1] = ZERO; } } } #endif } #ifdef LOWER GEMM_KERNEL(m - mm - nn, nn, k, alpha_r, #ifdef COMPLEX alpha_i, #endif a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE, c + (mm + nn + loop * ldc) * COMPSIZE, ldc); #endif } return 0; } OpenBLAS-0.2.20/driver/level3/zherk_beta.c000066400000000000000000000071471313527062700201240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" int CNAME(BLASLONG dummy1, BLASLONG n, BLASLONG dummy2, FLOAT alpha_r, FLOAT alpha_i, FLOAT *dummy3, BLASLONG dummy4, FLOAT *dummy5, BLASLONG dummy6, FLOAT *c, BLASLONG ldc, FLOAT *dummy7, FLOAT *dummy8, BLASLONG from, BLASLONG to){ BLASLONG i; #ifndef LOWER for (i = from; i < to; i++){ SCAL_K(i * 2, 0, 0, alpha_r, c + i * ldc * 2, 1, NULL, 0, NULL, 0); if (alpha_r == ZERO ){ c[i * 2 + 0 + i * ldc * 2] = ZERO; c[i * 2 + 1 + i * ldc * 2] = ZERO; } else { c[i * 2 + 0 + i * ldc * 2] *= alpha_r; c[i * 2 + 1 + i * ldc * 2] = ZERO; } } #else for (i = from; i < to; i++){ if (alpha_r == ZERO) { c[i * 2 + 0 + i * ldc * 2] = ZERO; c[i * 2 + 1 + i * ldc * 2] = ZERO; } else { c[i * 2 + 0 + i * ldc * 2] *= alpha_r; c[i * 2 + 1 + i * ldc * 2] = ZERO; } SCAL_K((n - i - 1) * 2, 0, 0, alpha_r, c + 2 + i * (ldc + 1) * 2, 1, NULL, 0, NULL, 0); } #endif return 0; } OpenBLAS-0.2.20/driver/level3/zherk_k.c000066400000000000000000000116561313527062700174430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef LOWER #ifndef CONJ #ifdef XDOUBLE #define KERNEL_FUNC xherk_kernel_UN #elif defined(DOUBLE) #define KERNEL_FUNC zherk_kernel_UN #else #define KERNEL_FUNC cherk_kernel_UN #endif #else #ifdef XDOUBLE #define KERNEL_FUNC xherk_kernel_UC #elif defined(DOUBLE) #define KERNEL_FUNC zherk_kernel_UC #else #define KERNEL_FUNC cherk_kernel_UC #endif #endif #else #ifndef CONJ #ifdef XDOUBLE #define KERNEL_FUNC xherk_kernel_LN #elif defined(DOUBLE) #define KERNEL_FUNC zherk_kernel_LN #else #define KERNEL_FUNC cherk_kernel_LN #endif #else #ifdef XDOUBLE #define KERNEL_FUNC xherk_kernel_LC #elif defined(DOUBLE) #define KERNEL_FUNC zherk_kernel_LC #else #define KERNEL_FUNC cherk_kernel_LC #endif #endif #endif #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) #if !defined(LOWER) && !defined(TRANS) #define SYRK_LOCAL HERK_UN #elif !defined(LOWER) && defined(TRANS) #define SYRK_LOCAL HERK_UC #elif defined(LOWER) && !defined(TRANS) #define SYRK_LOCAL HERK_LN #else #define SYRK_LOCAL HERK_LC #endif #undef SCAL_K #ifdef XDOUBLE #define SCAL_K QSCAL_K #elif defined(DOUBLE) #define SCAL_K DSCAL_K #else #define SCAL_K SSCAL_K #endif static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLASLONG n_to, FLOAT *alpha, FLOAT *c, BLASLONG ldc) { BLASLONG i; #ifndef LOWER if (m_from > n_from) n_from = m_from; if (m_to > n_to ) m_to = n_to; #else if (m_from < n_from) m_from = n_from; if (m_to < n_to ) n_to = m_to; #endif c += (m_from + n_from * ldc) * COMPSIZE; m_to -= m_from; n_to -= n_from; for (i = 0; i < n_to; i++){ #ifndef LOWER SCAL_K(MIN(i + n_from - m_from + 1, m_to) * COMPSIZE, 0, 0, alpha[0], c, 1, NULL, 0, NULL, 0); if (i + n_from - m_from + 1 <= m_to) *(c + (i + n_from - m_from) * COMPSIZE + 1) = ZERO; c += ldc * COMPSIZE; #else SCAL_K(MIN(m_to - i + m_from - n_from, m_to) * COMPSIZE, 0, 0, alpha[0], c, 1, NULL, 0, NULL, 0); if (i < m_from - n_from) { c += ldc * COMPSIZE; } else { *(c + 1) = ZERO; c += (1 + ldc) * COMPSIZE; } #endif } return 0; } #ifdef THREADED_LEVEL3 #include "level3_syrk_threaded.c" #else #include "level3_syrk.c" #endif OpenBLAS-0.2.20/driver/level3/zherk_kernel.c000066400000000000000000000130001313527062700204520ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef CONJ #define GEMM_KERNEL GEMM_KERNEL_R #define GEMM_KERNEL_B0 GEMM_KERNEL_R_B0 #else #define GEMM_KERNEL GEMM_KERNEL_L #define GEMM_KERNEL_B0 GEMM_KERNEL_L_B0 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ BLASLONG i, j; BLASLONG loop; FLOAT *cc, *ss; FLOAT subbuffer[GEMM_UNROLL_MN * (GEMM_UNROLL_MN + 1) * COMPSIZE]; if (m + offset < 0) { #ifndef LOWER GEMM_KERNEL(m, n, k, alpha_r, ZERO, a, b, c, ldc); #endif return 0; } if (n < offset) { #ifdef LOWER GEMM_KERNEL(m, n, k, alpha_r, ZERO, a, b, c, ldc); #endif return 0; } if (offset > 0) { #ifdef LOWER GEMM_KERNEL(m, offset, k, alpha_r, ZERO, a, b, c, ldc); #endif b += offset * k * COMPSIZE; c += offset * ldc * COMPSIZE; n -= offset; offset = 0; if (n <= 0) return 0; } if (n > m + offset) { #ifndef LOWER GEMM_KERNEL(m, n - m - offset, k, alpha_r, ZERO, a, b + (m + offset) * k * COMPSIZE, c + (m + offset) * ldc * COMPSIZE, ldc); #endif n = m + offset; if (n <= 0) return 0; } if (offset < 0) { #ifndef LOWER GEMM_KERNEL(-offset, n, k, alpha_r, ZERO, a, b, c, ldc); #endif a -= offset * k * COMPSIZE; c -= offset * COMPSIZE; m += offset; offset = 0; if (m <= 0) return 0; } if (m > n - offset) { #ifdef LOWER GEMM_KERNEL(m - n + offset, n, k, alpha_r, ZERO, a + (n - offset) * k * COMPSIZE, b, c + (n - offset) * COMPSIZE, ldc); #endif m = n + offset; if (m <= 0) return 0; } for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) { int mm, nn; mm = (loop/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; nn = MIN(GEMM_UNROLL_MN, n - loop); #ifndef LOWER GEMM_KERNEL(mm, nn, k, alpha_r, ZERO, a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc); #endif GEMM_BETA(nn, nn, 0, ZERO, ZERO, NULL, 0, NULL, 0, subbuffer, nn); GEMM_KERNEL(nn, nn, k, alpha_r, ZERO, a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn); cc = c + (loop + loop * ldc) * COMPSIZE; ss = subbuffer; #ifndef LOWER for (j = 0; j < nn; j ++) { for (i = 0; i #include #include "common.h" int CNAME(BLASLONG dummy1, BLASLONG n, BLASLONG dummy2, FLOAT alpha_r, FLOAT alpha_i, FLOAT *dummy3, BLASLONG dummy4, FLOAT *dummy5, BLASLONG dummy6, FLOAT *c, BLASLONG ldc, FLOAT *dummy7, FLOAT *dummy8, BLASLONG from, BLASLONG to){ BLASLONG i; #ifndef LOWER for (i = from; i < to; i++){ ZSCAL_K(i + 1, 0, 0, alpha_r, alpha_i, c + i * ldc * 2, 1, NULL, 0, NULL, 0); } #else for (i = from; i < to; i++){ ZSCAL_K(n - i, 0, 0, alpha_r, alpha_i, c + i * (ldc + 1) * 2, 1, NULL, 0, NULL, 0); } #endif return 0; } 
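The zherk_beta.c routine above scales the stored triangle of C by the real factor alpha_r and forces the imaginary part of every diagonal entry to zero, since a Hermitian matrix must have a real diagonal. Below is a minimal, self-contained sketch of that diagonal handling for the upper-triangular (#ifndef LOWER) case; the function name herk_beta_upper_sketch, the explicit loops and the plain interleaved-double layout are illustrative assumptions, not the SCAL_K-based code OpenBLAS actually uses.

#include <stddef.h>

/* Sketch only: scale columns [from, to) of the upper triangle of an
 * interleaved complex matrix c (leading dimension ldc) by a real beta,
 * keeping the diagonal real.  Hypothetical helper, not OpenBLAS code. */
static void herk_beta_upper_sketch(double beta, double *c, size_t ldc,
                                   size_t from, size_t to) {
  for (size_t i = from; i < to; i++) {        /* column i */
    for (size_t j = 0; j < i; j++) {          /* entries above the diagonal */
      c[(j + i * ldc) * 2 + 0] *= beta;
      c[(j + i * ldc) * 2 + 1] *= beta;
    }
    c[(i + i * ldc) * 2 + 0] *= beta;         /* diagonal: scale real part */
    c[(i + i * ldc) * 2 + 1] = 0.0;           /* and force Im = 0 */
  }
}

This mirrors the diagonal writes visible in zherk_kernel.c and zher2k_kernel.c above, which likewise assign ZERO to the imaginary part whenever i == j.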
OpenBLAS-0.2.20/driver/mapper/000077500000000000000000000000001313527062700157235ustar00rootroot00000000000000OpenBLAS-0.2.20/driver/mapper/Makefile000066400000000000000000000006131313527062700173630ustar00rootroot00000000000000MODULENAME := mapper KDIR := /lib/modules/$(shell uname -r)/build PWD := $(shell pwd) CC := gcc -Wall ifeq ($(KERNELRELEASE),) all :: $(MAKE) -C $(KDIR) SUBDIRS=$(PWD) modules else obj-m := $(MODULENAME).o endif load: insmod ./$(MODULENAME).ko unload: rmmod $(MODULENAME) setup: ./device_setup clean: rm -rf *.o *.ko Module.symvers *.mod.c .tmp_versions .mapper* modules.order OpenBLAS-0.2.20/driver/mapper/device_setup000066400000000000000000000003041313527062700203220ustar00rootroot00000000000000#!/bin/sh drivername=mapper devicename=/dev/$drivername major=`cat /proc/devices | grep $drivername | awk '{print $1;}'` rm -f $devicename mknod $devicename c $major 0 chmod go+rw $devicename OpenBLAS-0.2.20/driver/mapper/mapper.c000066400000000000000000000157371313527062700173700ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include #include #include #include #include #include #include #ifdef CONFIG_BIGPHYS_AREA #include #endif #include #ifdef MODVERSIONS #include #endif #include typedef struct { pid_t pid; #ifndef CONFIG_BIGPHYS_AREA long size; #endif caddr_t address; } buffer_t; #define MAX_BUFF_SIZE 1024 #define MAX_LENGTH (4UL << 20) static spinlock_t lock __attribute__((aligned(64))); static buffer_t buffer[MAX_BUFF_SIZE]; static dev_t mapper_dev; static struct cdev mapper_cdev; static int mapper_open (struct inode *inode, struct file *fp){ return 0;} static int mapper_release(struct inode *inode, struct file *fp){ int pos; #ifndef CONFIG_BIGPHYS_AREA caddr_t addr; #endif // printk("Releasing memory... %d\n", current -> tgid); spin_lock(&lock); for (pos = 0; pos < MAX_BUFF_SIZE; pos ++) { if (buffer[pos].pid == (pid_t) current -> tgid) { #ifdef CONFIG_BIGPHYS_AREA bigphysarea_free_pages(buffer[pos].address); #else for (addr = buffer[pos].address; addr < buffer[pos].address + buffer[pos].size; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); } kfree(buffer[pos].address); buffer[pos].size = 0; #endif buffer[pos].pid = 0; buffer[pos].address = 0; } } spin_unlock(&lock); return 0; } int mapper_mapper(struct file *fp, struct vm_area_struct *vma){ int ret, pos; caddr_t alloc_addr; #ifndef CONFIG_BIGPHYS_AREA caddr_t addr; #endif long all_length, length, current_addr; all_length = vma->vm_end - vma->vm_start; current_addr = vma -> vm_start; spin_lock(&lock); while (all_length > 0) { length = all_length; if (length > MAX_LENGTH) length = MAX_LENGTH; all_length -= MAX_LENGTH; // printk("Allocating memory... %d\n", length); pos = 0; while ((pos < MAX_BUFF_SIZE) && (buffer[pos].address != 0)) pos ++; if (pos >= MAX_BUFF_SIZE) { printk("Memory Allocator : too much memory allocation requested.\n"); spin_unlock(&lock); return -EIO; } #ifdef CONFIG_BIGPHYS_AREA alloc_addr = (caddr_t)bigphysarea_alloc_pages(length >> PAGE_SHIFT, 1, GFP_KERNEL); #else alloc_addr = (caddr_t)kmalloc(length, GFP_KERNEL); #endif if (alloc_addr == (caddr_t)NULL) { spin_unlock(&lock); return -EIO; } #ifndef CONFIG_BIGPHYS_AREA for (addr = alloc_addr; addr < alloc_addr + length; addr += PAGE_SIZE) { clear_page(addr); SetPageReserved(virt_to_page(addr)); } #endif if ((ret = remap_pfn_range(vma, current_addr, virt_to_phys((void *)alloc_addr) >> PAGE_SHIFT, length, PAGE_SHARED)) < 0) { #ifdef CONFIG_BIGPHYS_AREA bigphysarea_free_pages((caddr_t)alloc_addr); #else for (addr = alloc_addr; addr < alloc_addr + length; addr += PAGE_SIZE) ClearPageReserved(virt_to_page(addr)); kfree((caddr_t)alloc_addr); #endif spin_unlock(&lock); return ret; } buffer[pos].pid = current -> tgid; buffer[pos].address = alloc_addr; #ifndef CONFIG_BIGPHYS_AREA buffer[pos].size = length; #endif current_addr += length; } spin_unlock(&lock); return 0; } static struct file_operations mapper_fops = { .open = mapper_open, .release = mapper_release, .mmap = mapper_mapper, .owner = THIS_MODULE, }; static int __init mapper_init(void){ int ret, i; ret = alloc_chrdev_region(&mapper_dev, 0, 1, "mapper"); cdev_init(&mapper_cdev, &mapper_fops); ret = cdev_add(&mapper_cdev, mapper_dev, 1); spin_lock_init(&lock); for (i = 0; i < MAX_BUFF_SIZE; i++) { buffer[i].pid = 0; #ifndef CONFIG_BIGPHYS_AREA buffer[i].size = 0; #endif buffer[i].address = 0; } return ret; } static void __exit mapper_exit(void){ int pos; for (pos = 0; pos < MAX_BUFF_SIZE; pos ++) { if (buffer[pos].address != 0) { 
#ifdef CONFIG_BIGPHYS_AREA bigphysarea_free_pages(buffer[pos].address); #else kfree(buffer[pos].address); #endif } } cdev_del(&mapper_cdev); unregister_chrdev_region(mapper_dev, 1); } module_init(mapper_init); module_exit(mapper_exit); MODULE_DESCRIPTION("BigPhysArea User Mapping Driver"); MODULE_LICENSE("Unknown"); OpenBLAS-0.2.20/driver/others/000077500000000000000000000000001313527062700157435ustar00rootroot00000000000000OpenBLAS-0.2.20/driver/others/CMakeLists.txt000066400000000000000000000034731313527062700205120ustar00rootroot00000000000000include_directories(${PROJECT_SOURCE_DIR}) if (${CORE} STREQUAL "PPC440") set(MEMORY memory_qalloc.c) else () set(MEMORY memory.c) endif () if (SMP) if (USE_OPENMP) set(BLAS_SERVER blas_server_omp.c) elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") set(BLAS_SERVER blas_server_win32.c) elseif (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore") set(BLAS_SERVER blas_server_win32.c) endif () if (NOT DEFINED BLAS_SERVER) set(BLAS_SERVER blas_server.c) endif () set(SMP_SOURCES ${BLAS_SERVER} divtable.c # TODO: Makefile has -UDOUBLE blas_l1_thread.c ) if (NOT NO_AFFINITY) list(APPEND SMP_SOURCES init.c) endif () endif () set(COMMON_SOURCES xerbla.c openblas_set_num_threads.c openblas_error_handle.c openblas_env.c openblas_get_num_procs.c openblas_get_num_threads.c ) # these need to have NAME/CNAME set, so use GenerateNamedObjects, but don't use standard name mangling GenerateNamedObjects("abs.c" "" "c_abs" 0 "" "" 1 ) GenerateNamedObjects("abs.c" "DOUBLE" "z_abs" 0 "" "" 1) GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" "" 1) if (DYNAMIC_ARCH) list(APPEND COMMON_SOURCES dynamic.c) else () list(APPEND COMMON_SOURCES parameter.c) endif () #ifdef EXPRECISION #COMMONOBJS += x_abs.$(SUFFIX) qlamch.$(SUFFIX) qlamc3.$(SUFFIX) #endif # #ifdef QUAD_PRECISION #COMMONOBJS += addx.$(SUFFIX) mulx.$(SUFFIX) #endif # #ifdef USE_CUDA #COMMONOBJS += cuda_init.$(SUFFIX) #endif # #ifdef FUNCTION_PROFILE #COMMONOBJS += profile.$(SUFFIX) #endif #LIBOTHERS = libothers.$(LIBSUFFIX) #ifeq ($(DYNAMIC_ARCH), 1) #HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) #else #HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) #endif add_library(driver_others OBJECT ${OPENBLAS_SRC} ${MEMORY} ${SMP_SOURCES} ${COMMON_SOURCES}) OpenBLAS-0.2.20/driver/others/Makefile000066400000000000000000000135341313527062700174110ustar00rootroot00000000000000TOPDIR = ../.. 
include ../../Makefile.system COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) openblas_env.$(SUFFIX) #COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) ifdef SMP COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) ifndef NO_AFFINITY COMMONOBJS += init.$(SUFFIX) endif endif # COMMONOBJS += info.$(SUFFIX) ifeq ($(DYNAMIC_ARCH), 1) COMMONOBJS += dynamic.$(SUFFIX) else COMMONOBJS += parameter.$(SUFFIX) endif ifdef EXPRECISION COMMONOBJS += x_abs.$(SUFFIX) qlamch.$(SUFFIX) qlamc3.$(SUFFIX) endif ifdef QUAD_PRECISION COMMONOBJS += addx.$(SUFFIX) mulx.$(SUFFIX) endif ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(C_COMPILER), PGI) # COMMONOBJS += __builtin_stinit.$(SUFFIX) endif endif ifdef USE_CUDA COMMONOBJS += cuda_init.$(SUFFIX) endif ifdef FUNCTION_PROFILE COMMONOBJS += profile.$(SUFFIX) endif LIBOTHERS = libothers.$(LIBSUFFIX) ifeq ($(CORE), PPC440) MEMORY = memory_qalloc.c endif ifndef MEMORY MEMORY = memory.c endif ifeq ($(USE_OPENMP), 1) BLAS_SERVER = blas_server_omp.c else ifeq ($(OSNAME), WINNT) BLAS_SERVER = blas_server_win32.c endif ifeq ($(OSNAME), CYGWIN_NT) BLAS_SERVER = blas_server_win32.c endif ifeq ($(OSNAME), Interix) BLAS_SERVER = blas_server_win32.c endif endif ifndef BLAS_SERVER BLAS_SERVER = blas_server.c endif ifeq ($(DYNAMIC_ARCH), 1) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) endif xerbla.$(SUFFIX) : xerbla.c $(CC) $(CFLAGS) -c $< -o $(@F) dynamic.$(SUFFIX) : dynamic.c $(CC) $(CFLAGS) -c $< -o $(@F) dynamic.$(PSUFFIX) : dynamic.c $(CC) $(PFLAGS) -c $< -o $(@F) parameter.$(SUFFIX) : parameter.c ../../param.h $(CC) $(CFLAGS) -c $< -o $(@F) init.$(SUFFIX) : init.c $(CC) $(CFLAGS) -c $< -o $(@F) profile.$(SUFFIX) : profile.c $(CC) $(CFLAGS) -c $< -o $(@F) memory.$(SUFFIX) : $(MEMORY) ../../common.h ../../param.h $(CC) $(CFLAGS) -c $< -o $(@F) blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../../param.h $(CC) $(CFLAGS) -c $< -o $(@F) openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c $(CC) $(CFLAGS) -c $< -o $(@F) openblas_get_num_threads.$(SUFFIX) : openblas_get_num_threads.c $(CC) $(CFLAGS) -c $< -o $(@F) openblas_get_num_procs.$(SUFFIX) : openblas_get_num_procs.c $(CC) $(CFLAGS) -c $< -o $(@F) openblas_get_config.$(SUFFIX) : openblas_get_config.c $(CC) $(CFLAGS) -c $< -o $(@F) openblas_get_parallel.$(SUFFIX) : openblas_get_parallel.c $(CC) $(CFLAGS) -c $< -o $(@F) openblas_error_handle.$(SUFFIX) : openblas_error_handle.c $(CC) $(CFLAGS) -c $< -o $(@F) openblas_env.$(SUFFIX) : openblas_env.c $(CC) $(CFLAGS) -c $< -o $(@F) blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h $(CC) $(CFLAGS) -c $< -o $(@F) cuda_init.$(SUFFIX) : cuda_init.c $(CUCC) $(COMMON_OPT) -I$(TOPDIR) $(CUFLAGS) -DCNAME=$(*F) -c $< -o $(@F) c_abs.$(SUFFIX) : abs.c $(CC) $(CFLAGS) -c -UDOUBLE $< -o $(@F) z_abs.$(SUFFIX) : abs.c $(CC) $(CFLAGS) -c -DDOUBLE $< -o $(@F) x_abs.$(SUFFIX) : abs.c $(CC) $(CFLAGS) -c -DXDOUBLE $< -o $(@F) slamch.$(SUFFIX) : lamch.c $(CC) $(CFLAGS) -c -UDOUBLE $< -o $(@F) dlamch.$(SUFFIX) : lamch.c $(CC) $(CFLAGS) -c -DDOUBLE $< -o $(@F) qlamch.$(SUFFIX) : lamch.c $(CC) $(CFLAGS) -c -DXDOUBLE $< -o $(@F) slamc3.$(SUFFIX) : lamc3.c $(CC) $(CFLAGS) -c 
-UDOUBLE $< -o $(@F) dlamc3.$(SUFFIX) : lamc3.c $(CC) $(CFLAGS) -c -DDOUBLE $< -o $(@F) qlamc3.$(SUFFIX) : lamc3.c $(CC) $(CFLAGS) -c -DXDOUBLE $< -o $(@F) divtable.$(SUFFIX) : divtable.c $(CC) $(CFLAGS) -c -UDOUBLE $< -o $(@F) __builtin_stinit.$(SUFFIX) : $(ARCH)/builtin_stinit.S $(CC) $(CFLAGS) -c -UDOUBLE $< -o $(@F) addx.$(SUFFIX) : $(ARCH)/addx.c $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $(@F) mulx.$(SUFFIX) : $(ARCH)/mulx.c $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $(@F) xerbla.$(PSUFFIX) : xerbla.c $(CC) $(PFLAGS) -c $< -o $(@F) parameter.$(PSUFFIX) : parameter.c ../../param.h $(CC) $(PFLAGS) -c $< -o $(@F) init.$(PSUFFIX) : init.c $(CC) $(PFLAGS) -c $< -o $(@F) profile.$(PSUFFIX) : profile.c $(CC) $(PFLAGS) -c $< -o $(@F) memory.$(PSUFFIX) : $(MEMORY) ../../common.h ../../param.h $(CC) $(PFLAGS) -c $< -o $(@F) blas_server.$(PSUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h $(CC) $(PFLAGS) -c $< -o $(@F) blasL1thread.$(PSUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h $(CC) $(PFLAGS) -c $< -o $(@F) cuda_init.$(PSUFFIX) : cuda_init.c $(CUCC) $(COMMON_OPT) -I$(TOPDIR) $(CUFLAGS) -DCNAME=$(*F) -c $< -o $(@F) c_abs.$(PSUFFIX) : abs.c $(CC) $(PFLAGS) -c -UDOUBLE $< -o $(@F) z_abs.$(PSUFFIX) : abs.c $(CC) $(PFLAGS) -c -DDOUBLE $< -o $(@F) x_abs.$(PSUFFIX) : abs.c $(CC) $(PFLAGS) -c -DXDOUBLE $< -o $(@F) slamch.$(PUFFIX) : lamch.c $(CC) $(PFLAGS) -c -UDOUBLE $< -o $(@F) dlamch.$(PUFFIX) : lamch.c $(CC) $(PFLAGS) -c -DDOUBLE $< -o $(@F) qlamch.$(PUFFIX) : lamch.c $(CC) $(PFLAGS) -c -DXDOUBLE $< -o $(@F) slamc3.$(PUFFIX) : lamc3.c $(CC) $(PFLAGS) -c -UDOUBLE $< -o $(@F) dlamc3.$(PUFFIX) : lamc3.c $(CC) $(PFLAGS) -c -DDOUBLE $< -o $(@F) qlamc3.$(PUFFIX) : lamc3.c $(CC) $(PFLAGS) -c -DXDOUBLE $< -o $(@F) divtable.$(PSUFFIX) : divtable.c $(CC) $(PFLAGS) -c -UDOUBLE $< -o $(@F) __builtin_stinit.$(PPSUFFIX) : $(ARCH)/builtin_stinit.S $(CC) $(PFLAGS) -c -UDOUBLE $< -o $(@F) addx.$(PSUFFIX) : $(ARCH)/addx.c $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $(@F) mulx.$(PSUFFIX) : $(ARCH)/mulx.c $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $(@F) info.$(SUFFIX) : info.c info.h ../../common.h ../../param.h $(CC) $(CFLAGS) -c $< -o $(@F) hpl : override CFLAGS += -DHPL hpl_p : override CFLAGS += -DHPL include $(TOPDIR)/Makefile.tail OpenBLAS-0.2.20/driver/others/abs.c000066400000000000000000000057651313527062700166710ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include "common.h" double fabs(double); double sqrt(double); #ifdef NEED_F2CCONV double #else FLOAT #endif CNAME(FLOAT *z){ FLOAT real = z[0]; FLOAT imag = z[1]; double temp; real = fabs(real); imag = fabs(imag); if(imag > real){ temp = real; real = imag; imag = temp; } if (imag == 0.) return real; temp = imag/real; temp = real * sqrt(1.0 + temp*temp); return temp; } OpenBLAS-0.2.20/driver/others/blas_l1_thread.c000066400000000000000000000135171313527062700207620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads){ blas_queue_t queue[MAX_CPU_NUMBER]; blas_arg_t args [MAX_CPU_NUMBER]; BLASLONG i, width, astride, bstride; int num_cpu, calc_type; calc_type = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0) + 2; mode |= BLAS_LEGACY; for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]); num_cpu = 0; i = m; while (i > 0){ /* Adjust Parameters */ width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); i -= width; if (i < 0) width = width + i; astride = width * lda; if (!(mode & BLAS_TRANSB_T)) { bstride = width * ldb; } else { bstride = width; } astride <<= calc_type; bstride <<= calc_type; args[num_cpu].m = width; args[num_cpu].n = n; args[num_cpu].k = k; args[num_cpu].a = (void *)a; args[num_cpu].b = (void *)b; args[num_cpu].c = (void *)c; args[num_cpu].lda = lda; args[num_cpu].ldb = ldb; args[num_cpu].ldc = ldc; args[num_cpu].alpha = alpha; queue[num_cpu].mode = mode; queue[num_cpu].routine = function; queue[num_cpu].args = &args[num_cpu]; queue[num_cpu].next = &queue[num_cpu + 1]; a = (void *)((BLASULONG)a + astride); b = (void *)((BLASULONG)b + bstride); num_cpu ++; } if (num_cpu) { queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } return 0; } int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads){ blas_queue_t queue[MAX_CPU_NUMBER]; blas_arg_t args [MAX_CPU_NUMBER]; BLASLONG i, width, astride, bstride; int num_cpu, calc_type; calc_type = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0) + 2; mode |= BLAS_LEGACY; for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]); num_cpu = 0; i = m; while (i > 0){ /* Adjust Parameters */ width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); i -= width; if (i < 0) width = width + i; astride = width * lda; if (!(mode & BLAS_TRANSB_T)) { bstride = width * ldb; } else { bstride = width; } astride <<= calc_type; bstride <<= calc_type; args[num_cpu].m = width; args[num_cpu].n = n; args[num_cpu].k = k; args[num_cpu].a = (void *)a; args[num_cpu].b = (void *)b; args[num_cpu].c = (void *)((char *)c + num_cpu * sizeof(double)*2); args[num_cpu].lda = lda; args[num_cpu].ldb = ldb; args[num_cpu].ldc = ldc; args[num_cpu].alpha = alpha; queue[num_cpu].mode = mode; queue[num_cpu].routine = function; queue[num_cpu].args = &args[num_cpu]; queue[num_cpu].next = &queue[num_cpu + 1]; a = (void *)((BLASULONG)a + astride); b = (void *)((BLASULONG)b + bstride); num_cpu ++; } if (num_cpu) { queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } return 0; } OpenBLAS-0.2.20/driver/others/blas_server.c000066400000000000000000000642421313527062700204260ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include "common.h" #if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) #include #include #include #include #endif #ifndef likely #ifdef __GNUC__ #define likely(x) __builtin_expect(!!(x), 1) #else #define likely(x) (x) #endif #endif #ifndef unlikely #ifdef __GNUC__ #define unlikely(x) __builtin_expect(!!(x), 0) #else #define unlikely(x) (x) #endif #endif extern unsigned int openblas_thread_timeout(); #ifdef SMP_SERVER #undef MONITOR #undef TIMING #undef TIMING_DEBUG #undef NEED_STACKATTR #define ATTRIBUTE_SIZE 128 /* This is a thread server model implementation. The threads are */ /* spawned at first access to blas library, and still remains until */ /* destruction routine is called. The number of threads are */ /* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */ /* jobs is queued. */ /* We need this grobal for cheking if initialization is finished. */ int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; /* Local Variables */ #if defined(USE_PTHREAD_LOCK) static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER; #elif defined(USE_PTHREAD_SPINLOCK) static pthread_spinlock_t server_lock = 0; #else static unsigned long server_lock = 0; #endif #define THREAD_STATUS_SLEEP 2 #define THREAD_STATUS_WAKEUP 4 static pthread_t blas_threads [MAX_CPU_NUMBER]; typedef struct { blas_queue_t * volatile queue __attribute__((aligned(ATTRIBUTE_SIZE))); #if defined(OS_LINUX) && !defined(NO_AFFINITY) int node; #endif volatile long status; pthread_mutex_t lock; pthread_cond_t wakeup; } thread_status_t; static thread_status_t thread_status[MAX_CPU_NUMBER] __attribute__((aligned(ATTRIBUTE_SIZE))); #ifndef THREAD_TIMEOUT #define THREAD_TIMEOUT 28 #endif static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT)); #ifdef MONITOR /* Monitor is a function to see thread's status for every seconds. */ /* Usually it turns off and it's for debugging. 
*/ static pthread_t monitor_thread; static int main_status[MAX_CPU_NUMBER]; #define MAIN_ENTER 0x01 #define MAIN_EXIT 0x02 #define MAIN_TRYLOCK 0x03 #define MAIN_LOCKSUCCESS 0x04 #define MAIN_QUEUING 0x05 #define MAIN_RECEIVING 0x06 #define MAIN_RUNNING1 0x07 #define MAIN_RUNNING2 0x08 #define MAIN_RUNNING3 0x09 #define MAIN_WAITING 0x0a #define MAIN_SLEEPING 0x0b #define MAIN_FINISH 0x0c #define MAIN_DONE 0x0d #endif #define BLAS_QUEUE_FINISHED 3 #define BLAS_QUEUE_RUNNING 4 #ifdef TIMING BLASLONG exit_time[MAX_CPU_NUMBER]; #endif static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (!(mode & BLAS_COMPLEX)){ #ifdef EXPRECISION if (mode & BLAS_XDOUBLE){ /* REAL / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, ((xdouble *)args -> alpha)[0], args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); } else #endif if (mode & BLAS_DOUBLE){ /* REAL / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); } else { /* REAL / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); } } else { #ifdef EXPRECISION if (mode & BLAS_XDOUBLE){ /* COMPLEX / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, ((xdouble *)args -> alpha)[0], ((xdouble *)args -> alpha)[1], args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); } else #endif if (mode & BLAS_DOUBLE){ /* COMPLEX / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], ((double *)args -> alpha)[1], args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); } else { /* COMPLEX / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], ((float *)args -> alpha)[1], args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); } } } #if defined(OS_LINUX) && !defined(NO_AFFINITY) int gotoblas_set_affinity(int); int gotoblas_set_affinity2(int); int get_node(void); #endif static int increased_threads = 0; static void* blas_thread_server(void *arg){ /* Thread identifier */ BLASLONG cpu = (BLASLONG)arg; unsigned int last_tick; void *buffer, *sa, *sb; blas_queue_t *queue; blas_queue_t *tscq; #ifdef TIMING_DEBUG unsigned long start, stop; #endif #if defined(OS_LINUX) && !defined(NO_AFFINITY) if (!increased_threads) thread_status[cpu].node = gotoblas_set_affinity(cpu + 1); else thread_status[cpu].node = gotoblas_set_affinity(-1); #endif #ifdef MONITOR main_status[cpu] = MAIN_ENTER; #endif buffer = blas_memory_alloc(2); #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Thread has just been spawned!\n", cpu); #endif while (1){ #ifdef 
MONITOR main_status[cpu] = MAIN_QUEUING; #endif #ifdef TIMING exit_time[cpu] = rpcc(); #endif last_tick = (unsigned int)rpcc(); pthread_mutex_lock (&thread_status[cpu].lock); tscq=thread_status[cpu].queue; pthread_mutex_unlock (&thread_status[cpu].lock); while(!tscq) { YIELDING; if ((unsigned int)rpcc() - last_tick > thread_timeout) { pthread_mutex_lock (&thread_status[cpu].lock); if (!thread_status[cpu].queue) { thread_status[cpu].status = THREAD_STATUS_SLEEP; while (thread_status[cpu].status == THREAD_STATUS_SLEEP) { #ifdef MONITOR main_status[cpu] = MAIN_SLEEPING; #endif pthread_cond_wait(&thread_status[cpu].wakeup, &thread_status[cpu].lock); } } pthread_mutex_unlock(&thread_status[cpu].lock); last_tick = (unsigned int)rpcc(); } pthread_mutex_lock (&thread_status[cpu].lock); tscq=thread_status[cpu].queue; pthread_mutex_unlock (&thread_status[cpu].lock); } queue = thread_status[cpu].queue; if ((long)queue == -1) break; #ifdef MONITOR main_status[cpu] = MAIN_RECEIVING; #endif #ifdef TIMING_DEBUG start = rpcc(); #endif if (queue) { int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; pthread_mutex_lock (&thread_status[cpu].lock); thread_status[cpu].queue = (blas_queue_t *)1; pthread_mutex_unlock (&thread_status[cpu].lock); sa = queue -> sa; sb = queue -> sb; #ifdef SMP_DEBUG if (queue -> args) { fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); } #endif #ifdef CONSISTENT_FPCSR __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); #endif #ifdef MONITOR main_status[cpu] = MAIN_RUNNING1; #endif if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); if (sb == NULL) { if (!(queue -> mode & BLAS_COMPLEX)){ #ifdef EXPRECISION if (queue -> mode & BLAS_XDOUBLE){ sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif if (queue -> mode & BLAS_DOUBLE){ sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else { sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } } else { #ifdef EXPRECISION if (queue -> mode & BLAS_XDOUBLE){ sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif if (queue -> mode & BLAS_DOUBLE){ sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else { sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } } queue->sb=sb; } #ifdef MONITOR main_status[cpu] = MAIN_RUNNING2; #endif if (queue -> mode & BLAS_LEGACY) { legacy_exec(routine, queue -> mode, queue -> args, sb); } else if (queue -> mode & BLAS_PTHREAD) { void (*pthreadcompat)(void *) = queue -> routine; (pthreadcompat)(queue -> args); } else (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Calculation finished!\n", cpu); #endif #ifdef MONITOR main_status[cpu] = MAIN_FINISH; #endif // arm: make sure all results are written out _before_ // thread is marked as done and other threads use them WMB; pthread_mutex_lock (&thread_status[cpu].lock); thread_status[cpu].queue = (blas_queue_t * volatile) 
((long)thread_status[cpu].queue & 0); /* Need a trick */ pthread_mutex_unlock (&thread_status[cpu].lock); WMB; } #ifdef MONITOR main_status[cpu] = MAIN_DONE; #endif #ifdef TIMING_DEBUG stop = rpcc(); fprintf(STDERR, "Thread[%ld] : %16lu %16lu (%8lu cycles)\n", cpu + 1, start, stop, stop - start); #endif } /* Shutdown procedure */ #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu); #endif blas_memory_free(buffer); //pthread_exit(NULL); return NULL; } #ifdef MONITOR static BLASLONG num_suspend = 0; static int blas_monitor(void *arg){ int i; while(1){ for (i = 0; i < blas_num_threads - 1; i++){ switch (main_status[i]) { case MAIN_ENTER : fprintf(STDERR, "THREAD[%2d] : Entering.\n", i); break; case MAIN_EXIT : fprintf(STDERR, "THREAD[%2d] : Exiting.\n", i); break; case MAIN_TRYLOCK : fprintf(STDERR, "THREAD[%2d] : Trying lock operation.\n", i); break; case MAIN_QUEUING : fprintf(STDERR, "THREAD[%2d] : Queuing.\n", i); break; case MAIN_RECEIVING : fprintf(STDERR, "THREAD[%2d] : Receiving.\n", i); break; case MAIN_RUNNING1 : fprintf(STDERR, "THREAD[%2d] : Running1.\n", i); break; case MAIN_RUNNING2 : fprintf(STDERR, "THREAD[%2d] : Running2.\n", i); break; case MAIN_RUNNING3 : fprintf(STDERR, "THREAD[%2d] : Running3.\n", i); break; case MAIN_WAITING : fprintf(STDERR, "THREAD[%2d] : Waiting.\n", i); break; case MAIN_SLEEPING : fprintf(STDERR, "THREAD[%2d] : Sleeping.\n", i); break; case MAIN_FINISH : fprintf(STDERR, "THREAD[%2d] : Finishing.\n", i); break; case MAIN_DONE : fprintf(STDERR, "THREAD[%2d] : Job is done.\n", i); break; } fprintf(stderr, "Total number of suspended ... %ld\n", num_suspend); } sleep(1); } return 0; } #endif /* Initializing routine */ int blas_thread_init(void){ BLASLONG i; int ret; int thread_timeout_env; #ifdef NEED_STACKATTR pthread_attr_t attr; #endif if (blas_server_avail) return 0; #ifdef NEED_STACKATTR pthread_attr_init(&attr); pthread_attr_setguardsize(&attr, 0x1000U); pthread_attr_setstacksize( &attr, 0x1000U); #endif LOCK_COMMAND(&server_lock); if (!blas_server_avail){ thread_timeout_env=openblas_thread_timeout(); if (thread_timeout_env>0) { if (thread_timeout_env < 4) thread_timeout_env = 4; if (thread_timeout_env > 30) thread_timeout_env = 30; thread_timeout = (1 << thread_timeout_env); } for(i = 0; i < blas_num_threads - 1; i++){ thread_status[i].queue = (blas_queue_t *)NULL; thread_status[i].status = THREAD_STATUS_WAKEUP; pthread_mutex_init(&thread_status[i].lock, NULL); pthread_cond_init (&thread_status[i].wakeup, NULL); #ifdef NEED_STACKATTR ret=pthread_create(&blas_threads[i], &attr, &blas_thread_server, (void *)i); #else ret=pthread_create(&blas_threads[i], NULL, &blas_thread_server, (void *)i); #endif if(ret!=0){ struct rlimit rlim; const char *msg = strerror(ret); fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create: %s\n", msg); #ifdef RLIMIT_NPROC if(0 == getrlimit(RLIMIT_NPROC, &rlim)) { fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC " "%ld current, %ld max\n", (long)(rlim.rlim_cur), (long)(rlim.rlim_max)); } #endif if(0 != raise(SIGINT)) { fprintf(STDERR, "OpenBLAS blas_thread_init: calling exit(3)\n"); exit(EXIT_FAILURE); } } } #ifdef MONITOR pthread_create(&monitor_thread, NULL, (void *)&blas_monitor, (void *)NULL); #endif blas_server_avail = 1; } UNLOCK_COMMAND(&server_lock); return 0; } /* User can call one of two routines. exec_blas_async ... immediately returns after jobs are queued. exec_blas ... returns after jobs are finished. 
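   A minimal usage sketch, added for clarity (the values are illustrative
   assumptions, and my_routine / my_args are hypothetical names; the fields
   shown are the ones this file consumes from the caller):

     blas_queue_t q;
     q.mode    = BLAS_DOUBLE;      real double job; BLAS_COMPLEX not set
     q.routine = my_routine;       int my_routine(blas_arg_t *, void *, void *,
                                                  void *, void *, BLASLONG)
     q.args    = &my_args;
     q.range_m = NULL;  q.range_n = NULL;
     q.sa      = NULL;  q.sb = NULL;    worker threads substitute their own
                                        buffers; a job run inline by exec_blas()
                                        receives sa and sb exactly as given
     q.next    = NULL;
     exec_blas(1, &q);             runs q on the calling thread, returns when done

   The non-blocking form queues the same chain with exec_blas_async(pos, &q)
   and later waits on it with exec_blas_async_wait(num, &q).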
*/ static BLASULONG exec_queue_lock = 0; int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ #ifdef SMP_SERVER // Handle lazy re-init of the thread-pool after a POSIX fork if (unlikely(blas_server_avail == 0)) blas_thread_init(); #endif BLASLONG i = 0; blas_queue_t *current = queue; blas_queue_t *tsiq,*tspq; #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST) int node = get_node(); int nodes = get_num_nodes(); #endif #ifdef SMP_DEBUG int exec_count = 0; fprintf(STDERR, "Exec_blas_async is called. Position = %d\n", pos); #endif blas_lock(&exec_queue_lock); while (queue) { queue -> position = pos; #ifdef CONSISTENT_FPCSR __asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode)); __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode)); #endif #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST) /* Node Mapping Mode */ if (queue -> mode & BLAS_NODE) { do { while((thread_status[i].node != node || thread_status[i].queue) && (i < blas_num_threads - 1)) i ++; if (i < blas_num_threads - 1) break; i ++; if (i >= blas_num_threads - 1) { i = 0; node ++; if (node >= nodes) node = 0; } } while (1); } else { while(thread_status[i].queue) { i ++; if (i >= blas_num_threads - 1) i = 0; } } #else pthread_mutex_lock (&thread_status[i].lock); tsiq=thread_status[i].queue ; pthread_mutex_unlock (&thread_status[i].lock); while(tsiq) { i ++; if (i >= blas_num_threads - 1) i = 0; pthread_mutex_lock (&thread_status[i].lock); tsiq=thread_status[i].queue ; pthread_mutex_unlock (&thread_status[i].lock); } #endif queue -> assigned = i; WMB; pthread_mutex_lock (&thread_status[i].lock); thread_status[i].queue = queue; pthread_mutex_unlock (&thread_status[i].lock); WMB; queue = queue -> next; pos ++; #ifdef SMP_DEBUG exec_count ++; #endif } blas_unlock(&exec_queue_lock); #ifdef SMP_DEBUG fprintf(STDERR, "Done(Number of threads = %2ld).\n", exec_count); #endif while (current) { pos = current -> assigned; pthread_mutex_lock (&thread_status[pos].lock); tspq=thread_status[pos].queue; pthread_mutex_unlock (&thread_status[pos].lock); if ((BLASULONG)tspq > 1) { pthread_mutex_lock (&thread_status[pos].lock); if (thread_status[pos].status == THREAD_STATUS_SLEEP) { #ifdef MONITOR num_suspend ++; #endif if (thread_status[pos].status == THREAD_STATUS_SLEEP) { thread_status[pos].status = THREAD_STATUS_WAKEUP; pthread_cond_signal(&thread_status[pos].wakeup); } } pthread_mutex_unlock(&thread_status[pos].lock); } current = current -> next; } return 0; } int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ blas_queue_t * tsqq; while ((num > 0) && queue) { pthread_mutex_lock(&thread_status[queue->assigned].lock); tsqq=thread_status[queue -> assigned].queue; pthread_mutex_unlock(&thread_status[queue->assigned].lock); while(tsqq) { YIELDING; pthread_mutex_lock(&thread_status[queue->assigned].lock); tsqq=thread_status[queue -> assigned].queue; pthread_mutex_unlock(&thread_status[queue->assigned].lock); }; queue = queue -> next; num --; } #ifdef SMP_DEBUG fprintf(STDERR, "Done.\n\n"); #endif return 0; } /* Execute Threads */ int exec_blas(BLASLONG num, blas_queue_t *queue){ #ifdef SMP_SERVER // Handle lazy re-init of the thread-pool after a POSIX fork if (unlikely(blas_server_avail == 0)) blas_thread_init(); #endif int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG); #ifdef TIMING_DEBUG BLASULONG start, stop; #endif if ((num <= 0) || (queue == NULL)) return 0; #ifdef SMP_DEBUG fprintf(STDERR, "Exec_blas is called. 
Number of executing threads : %ld\n", num); #endif #ifdef __ELF__ if (omp_in_parallel && (num > 1)) { if (omp_in_parallel() > 0) { fprintf(stderr, "OpenBLAS Warning : Detect OpenMP Loop and this application may hang. " "Please rebuild the library with USE_OPENMP=1 option.\n"); } } #endif if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next); #ifdef TIMING_DEBUG start = rpcc(); fprintf(STDERR, "\n"); #endif routine = queue -> routine; if (queue -> mode & BLAS_LEGACY) { legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); } else if (queue -> mode & BLAS_PTHREAD) { void (*pthreadcompat)(void *) = queue -> routine; (pthreadcompat)(queue -> args); } else (routine)(queue -> args, queue -> range_m, queue -> range_n, queue -> sa, queue -> sb, 0); #ifdef TIMING_DEBUG stop = rpcc(); #endif if ((num > 1) && queue -> next) { exec_blas_async_wait(num - 1, queue -> next); // arm: make sure results from other threads are visible MB; } #ifdef TIMING_DEBUG fprintf(STDERR, "Thread[0] : %16lu %16lu (%8lu cycles)\n", start, stop, stop - start); #endif return 0; } void goto_set_num_threads(int num_threads) { long i; if (num_threads < 1) num_threads = blas_num_threads; #ifndef NO_AFFINITY if (num_threads == 1) { if (blas_cpu_number == 1){ //OpenBLAS is already single thread. return; }else{ //From multi-threads to single thread //Restore the original affinity mask gotoblas_set_affinity(-1); } } #endif if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; if (num_threads > blas_num_threads) { LOCK_COMMAND(&server_lock); increased_threads = 1; for(i = blas_num_threads - 1; i < num_threads - 1; i++){ thread_status[i].queue = (blas_queue_t *)NULL; thread_status[i].status = THREAD_STATUS_WAKEUP; pthread_mutex_init(&thread_status[i].lock, NULL); pthread_cond_init (&thread_status[i].wakeup, NULL); #ifdef NEED_STACKATTR pthread_create(&blas_threads[i], &attr, &blas_thread_server, (void *)i); #else pthread_create(&blas_threads[i], NULL, &blas_thread_server, (void *)i); #endif } blas_num_threads = num_threads; UNLOCK_COMMAND(&server_lock); } #ifndef NO_AFFINITY if(blas_cpu_number == 1 && num_threads > 1){ //Restore the thread 0 affinity. gotoblas_set_affinity(0); } #endif blas_cpu_number = num_threads; #if defined(ARCH_MIPS64) //set parameters for different number of threads. blas_set_parameter(); #endif } void openblas_set_num_threads(int num_threads) { goto_set_num_threads(num_threads); } /* Compatible function with pthread_create / join */ int gotoblas_pthread(int numthreads, void *function, void *args, int stride) { blas_queue_t queue[MAX_CPU_NUMBER]; int i; if (numthreads <= 0) return 0; #ifdef SMP if (blas_cpu_number == 0) blas_get_cpu_number(); #ifdef SMP_SERVER if (blas_server_avail == 0) blas_thread_init(); #endif #endif for (i = 0; i < numthreads; i ++) { queue[i].mode = BLAS_PTHREAD; queue[i].routine = function; queue[i].args = args; queue[i].range_m = NULL; queue[i].range_n = NULL; queue[i].sa = args; queue[i].sb = args; queue[i].next = &queue[i + 1]; args += stride; } queue[numthreads - 1].next = NULL; exec_blas(numthreads, queue); return 0; } /* Shutdown procedure, but user don't have to call this routine. The */ /* kernel automatically kill threads. 
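   Added note: a queue pointer of (blas_queue_t *)-1 is the exit signal; the
   server loop above breaks out of its while(1) when it sees that value, and
   blas_thread_shutdown() below stores it for every worker before waking and
   joining them.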
*/ int BLASFUNC(blas_thread_shutdown)(void){ int i; if (!blas_server_avail) return 0; LOCK_COMMAND(&server_lock); for (i = 0; i < blas_num_threads - 1; i++) { blas_lock(&exec_queue_lock); thread_status[i].queue = (blas_queue_t *)-1; blas_unlock(&exec_queue_lock); pthread_mutex_lock (&thread_status[i].lock); thread_status[i].status = THREAD_STATUS_WAKEUP; pthread_cond_signal (&thread_status[i].wakeup); pthread_mutex_unlock(&thread_status[i].lock); } for(i = 0; i < blas_num_threads - 1; i++){ pthread_join(blas_threads[i], NULL); } for(i = 0; i < blas_num_threads - 1; i++){ pthread_mutex_destroy(&thread_status[i].lock); pthread_cond_destroy (&thread_status[i].wakeup); } #ifdef NEED_STACKATTR pthread_attr_destory(&attr); #endif blas_server_avail = 0; UNLOCK_COMMAND(&server_lock); return 0; } #endif OpenBLAS-0.2.20/driver/others/blas_server_omp.c000066400000000000000000000225551313527062700213020ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include //#include #include "common.h" #ifndef USE_OPENMP #include "blas_server.c" #else int blas_server_avail = 0; static void * blas_thread_buffer[MAX_CPU_NUMBER]; void goto_set_num_threads(int num_threads) { int i=0; if (num_threads < 1) num_threads = blas_num_threads; if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; if (num_threads > blas_num_threads) { blas_num_threads = num_threads; } blas_cpu_number = num_threads; omp_set_num_threads(blas_cpu_number); //adjust buffer for each thread for(i=0; i m, args -> n, args -> k, ((xdouble *)args -> alpha)[0], args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); } else #endif if (mode & BLAS_DOUBLE){ /* REAL / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); } else { /* REAL / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); } } else { #ifdef EXPRECISION if (mode & BLAS_XDOUBLE){ /* COMPLEX / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, ((xdouble *)args -> alpha)[0], ((xdouble *)args -> alpha)[1], args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); } else #endif if (mode & BLAS_DOUBLE){ /* COMPLEX / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], ((double *)args -> alpha)[1], args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); } else { /* COMPLEX / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], ((float *)args -> alpha)[1], args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); } } } static void exec_threads(blas_queue_t *queue){ void *buffer, *sa, *sb; int pos=0, release_flag=0; buffer = NULL; sa = queue -> sa; sb = queue -> sb; #ifdef CONSISTENT_FPCSR __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); #endif if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { pos = omp_get_thread_num(); buffer = blas_thread_buffer[pos]; //fallback if(buffer==NULL) { buffer = blas_memory_alloc(2); release_flag=1; } if (sa == NULL) { sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); queue->sa=sa; } if (sb == NULL) { if (!(queue -> mode & BLAS_COMPLEX)){ #ifdef EXPRECISION if (queue -> mode & BLAS_XDOUBLE){ sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif if (queue -> mode & BLAS_DOUBLE){ sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else { sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * 
sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } } else { #ifdef EXPRECISION if (queue -> mode & BLAS_XDOUBLE){ sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif if (queue -> mode & BLAS_DOUBLE){ sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else { sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } } queue->sb=sb; } } if (queue -> mode & BLAS_LEGACY) { legacy_exec(queue -> routine, queue -> mode, queue -> args, sb); } else if (queue -> mode & BLAS_PTHREAD) { void (*pthreadcompat)(void *) = queue -> routine; (pthreadcompat)(queue -> args); } else { int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); } if (release_flag) blas_memory_free(buffer); } int exec_blas(BLASLONG num, blas_queue_t *queue){ BLASLONG i; if ((num <= 0) || (queue == NULL)) return 0; #ifdef CONSISTENT_FPCSR for (i = 0; i < num; i ++) { __asm__ __volatile__ ("fnstcw %0" : "=m" (queue[i].x87_mode)); __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue[i].sse_mode)); } #endif #pragma omp parallel for schedule(static) for (i = 0; i < num; i ++) { #ifndef USE_SIMPLE_THREADED_LEVEL3 queue[i].position = i; #endif exec_threads(&queue[i]); } return 0; } #endif OpenBLAS-0.2.20/driver/others/blas_server_win32.c000066400000000000000000000331501313527062700214420ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" /* This is a thread implementation for Win32 lazy implementation */ /* Thread server common infomation */ typedef struct{ CRITICAL_SECTION lock; HANDLE filled; HANDLE killed; blas_queue_t *queue; /* Parameter Pointer */ int shutdown; /* server shutdown flag */ } blas_pool_t; /* We need this grobal for cheking if initialization is finished. */ int blas_server_avail = 0; /* Local Variables */ static BLASULONG server_lock = 0; static blas_pool_t pool; static HANDLE blas_threads [MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER]; static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (!(mode & BLAS_COMPLEX)){ #ifdef EXPRECISION if (mode & BLAS_XDOUBLE){ /* REAL / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, ((xdouble *)args -> alpha)[0], args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); } else #endif if (mode & BLAS_DOUBLE){ /* REAL / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); } else { /* REAL / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); } } else { #ifdef EXPRECISION if (mode & BLAS_XDOUBLE){ /* COMPLEX / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, ((xdouble *)args -> alpha)[0], ((xdouble *)args -> alpha)[1], args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); } else #endif if (mode & BLAS_DOUBLE){ /* COMPLEX / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], ((double *)args -> alpha)[1], args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); } else { /* COMPLEX / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], ((float *)args -> alpha)[1], args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); } } } /* This is a main routine of threads. Each thread waits until job is */ /* queued. 
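   Added note: each worker blocks in WaitForMultipleObjects() on the two events
   created in blas_thread_init(): pool.filled (auto-reset, signalled whenever
   work is queued) and pool.killed (manual-reset, signalled once at shutdown so
   that every worker wakes up). A worker that dequeues a job re-signals
   pool.filled while pool.queue is still non-empty, so remaining jobs keep
   draining to the other workers.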
*/ static DWORD WINAPI blas_thread_server(void *arg){ /* Thread identifier */ #ifdef SMP_DEBUG BLASLONG cpu = (BLASLONG)arg; #endif void *buffer, *sa, *sb; blas_queue_t *queue; DWORD action; HANDLE handles[] = {pool.filled, pool.killed}; /* Each server needs each buffer */ buffer = blas_memory_alloc(2); #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu); #endif while (1){ /* Waiting for Queue */ #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu); #endif do { action = WaitForMultipleObjects(2, handles, FALSE, INFINITE); } while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1)); if (action == WAIT_OBJECT_0 + 1) break; #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Got it.\n", cpu); #endif EnterCriticalSection(&pool.lock); queue = pool.queue; if (queue) pool.queue = queue->next; LeaveCriticalSection(&pool.lock); if (queue) { int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; if (pool.queue) SetEvent(pool.filled); sa = queue -> sa; sb = queue -> sb; #ifdef CONSISTENT_FPCSR __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); #endif #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); #endif // fprintf(stderr, "queue start[%ld]!!!\n", cpu); #ifdef MONITOR main_status[cpu] = MAIN_RUNNING1; #endif if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); if (sb == NULL) { if (!(queue -> mode & BLAS_COMPLEX)){ #ifdef EXPRECISION if (queue -> mode & BLAS_XDOUBLE){ sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif if (queue -> mode & BLAS_DOUBLE){ sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else { sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } } else { #ifdef EXPRECISION if (queue -> mode & BLAS_XDOUBLE){ sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif if (queue -> mode & BLAS_DOUBLE){ sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else { sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } } queue->sb=sb; } #ifdef MONITOR main_status[cpu] = MAIN_RUNNING2; #endif if (!(queue -> mode & BLAS_LEGACY)) { (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); } else { legacy_exec(routine, queue -> mode, queue -> args, sb); } }else{ continue; //if queue == NULL } #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); #endif EnterCriticalSection(&queue->lock); queue -> status = BLAS_STATUS_FINISHED; LeaveCriticalSection(&queue->lock); SetEvent(queue->finish); } /* Shutdown procedure */ #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu); #endif blas_memory_free(buffer); return 0; } /* Initializing routine */ int blas_thread_init(void){ BLASLONG i; if (blas_server_avail || (blas_cpu_number <= 1)) return 0; LOCK_COMMAND(&server_lock); #ifdef SMP_DEBUG fprintf(STDERR, "Initializing Thread(Num. 
threads = %d)\n", blas_cpu_number); #endif if (!blas_server_avail){ InitializeCriticalSection(&pool.lock); pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL); pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL); pool.shutdown = 0; pool.queue = NULL; for(i = 0; i < blas_cpu_number - 1; i++){ blas_threads[i] = CreateThread(NULL, 0, blas_thread_server, (void *)i, 0, &blas_threads_id[i]); } blas_server_avail = 1; } UNLOCK_COMMAND(&server_lock); return 0; } /* User can call one of two routines. exec_blas_async ... immediately returns after jobs are queued. exec_blas ... returns after jobs are finished. */ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ blas_queue_t *current; current = queue; while (current) { InitializeCriticalSection(¤t -> lock); current -> finish = CreateEvent(NULL, FALSE, FALSE, NULL); current -> position = pos; #ifdef CONSISTENT_FPCSR __asm__ __volatile__ ("fnstcw %0" : "=m" (current -> x87_mode)); __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode)); #endif current = current -> next; pos ++; } EnterCriticalSection(&pool.lock); if (pool.queue) { current = pool.queue; while (current -> next) current = current -> next; current -> next = queue; } else { pool.queue = queue; } LeaveCriticalSection(&pool.lock); SetEvent(pool.filled); return 0; } int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ #ifdef SMP_DEBUG fprintf(STDERR, "Synchronization Waiting.\n"); #endif while (num){ #ifdef SMP_DEBUG fprintf(STDERR, "Waiting Queue ..\n"); #endif WaitForSingleObject(queue->finish, INFINITE); CloseHandle(queue->finish); DeleteCriticalSection(&queue -> lock); queue = queue -> next; num --; } #ifdef SMP_DEBUG fprintf(STDERR, "Completely Done.\n\n"); #endif return 0; } /* Execute Threads */ int exec_blas(BLASLONG num, blas_queue_t *queue){ #ifndef ALL_THREADED int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG); #endif if ((num <= 0) || (queue == NULL)) return 0; if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next); routine = queue -> routine; if (!(queue -> mode & BLAS_LEGACY)) { (routine)(queue -> args, queue -> range_m, queue -> range_n, queue -> sa, queue -> sb, 0); } else { legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); } if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); return 0; } /* Shutdown procedure, but user don't have to call this routine. The */ /* kernel automatically kill threads. 
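   Added note: this variant signals the manual-reset pool.killed event, waits
   briefly for each worker, and then terminates any thread that has not exited
   on its own (see TerminateThread below).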
*/ int BLASFUNC(blas_thread_shutdown)(void){ int i; if (!blas_server_avail) return 0; LOCK_COMMAND(&server_lock); if (blas_server_avail){ SetEvent(pool.killed); for(i = 0; i < blas_num_threads - 1; i++){ WaitForSingleObject(blas_threads[i], 5); //INFINITE); #ifndef OS_WINDOWSSTORE // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP TerminateThread(blas_threads[i],0); #endif } blas_server_avail = 0; } UNLOCK_COMMAND(&server_lock); return 0; } void goto_set_num_threads(int num_threads) { long i; if (num_threads < 1) num_threads = blas_cpu_number; if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; if (num_threads > blas_num_threads) { LOCK_COMMAND(&server_lock); //increased_threads = 1; if (!blas_server_avail){ InitializeCriticalSection(&pool.lock); pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL); pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL); pool.shutdown = 0; pool.queue = NULL; blas_server_avail = 1; } for(i = blas_num_threads - 1; i < num_threads - 1; i++){ blas_threads[i] = CreateThread(NULL, 0, blas_thread_server, (void *)i, 0, &blas_threads_id[i]); } blas_num_threads = num_threads; UNLOCK_COMMAND(&server_lock); } blas_cpu_number = num_threads; } void openblas_set_num_threads(int num) { goto_set_num_threads(num); } OpenBLAS-0.2.20/driver/others/divtable.c000066400000000000000000000115471313527062700177110ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include "common.h" #ifdef SMP #if !defined(USE64BITINT) || defined(ARCH_X86) unsigned int blas_quick_divide_table[] = { 0x00000000, 0x00000001, 0x80000001, 0x55555556, 0x40000001, 0x33333334, 0x2aaaaaab, 0x24924925, 0x20000001, 0x1c71c71d, 0x1999999a, 0x1745d175, 0x15555556, 0x13b13b14, 0x12492493, 0x11111112, 0x10000001, 0x0f0f0f10, 0x0e38e38f, 0x0d79435f, 0x0ccccccd, 0x0c30c30d, 0x0ba2e8bb, 0x0b21642d, 0x0aaaaaab, 0x0a3d70a4, 0x09d89d8a, 0x097b425f, 0x0924924a, 0x08d3dcb1, 0x08888889, 0x08421085, 0x08000001, 0x07c1f07d, 0x07878788, 0x07507508, 0x071c71c8, 0x06eb3e46, 0x06bca1b0, 0x06906907, 0x06666667, 0x063e7064, 0x06186187, 0x05f417d1, 0x05d1745e, 0x05b05b06, 0x0590b217, 0x0572620b, 0x05555556, 0x0539782a, 0x051eb852, 0x05050506, 0x04ec4ec5, 0x04d4873f, 0x04bda130, 0x04a7904b, 0x04924925, 0x047dc120, 0x0469ee59, 0x0456c798, 0x04444445, 0x04325c54, 0x04210843, 0x04104105, 0x04000001, }; #else BLASULONG blas_quick_divide_table[] = { 0x0000000000000000, 0x0000000000000001, 0x8000000000000001, 0x5555555555555557, 0x4000000000000001, 0x3333333333333335, 0x2aaaaaaaaaaaaaac, 0x2492492492492494, 0x2000000000000001, 0x1c71c71c71c71c73, 0x199999999999999b, 0x1745d1745d1745d3, 0x1555555555555557, 0x13b13b13b13b13b3, 0x124924924924924b, 0x1111111111111113, 0x1000000000000001, 0x0f0f0f0f0f0f0f11, 0x0e38e38e38e38e3a, 0x0d79435e50d79437, 0x0cccccccccccccce, 0x0c30c30c30c30c32, 0x0ba2e8ba2e8ba2ea, 0x0b21642c8590b218, 0x0aaaaaaaaaaaaaac, 0x0a3d70a3d70a3d72, 0x09d89d89d89d89da, 0x097b425ed097b427, 0x0924924924924926, 0x08d3dcb08d3dcb0a, 0x088888888888888a, 0x0842108421084212, 0x0800000000000001, 0x07c1f07c1f07c1f2, 0x0787878787878789, 0x0750750750750752, 0x071c71c71c71c71e, 0x06eb3e45306eb3e6, 0x06bca1af286bca1c, 0x0690690690690692, 0x0666666666666668, 0x063e7063e7063e72, 0x061861861861861a, 0x05f417d05f417d07, 0x05d1745d1745d176, 0x05b05b05b05b05b2, 0x0590b21642c8590d, 0x0572620ae4c415cb, 0x0555555555555557, 0x05397829cbc14e60, 0x051eb851eb851eba, 0x0505050505050507, 0x04ec4ec4ec4ec4ee, 0x04d4873ecade304f, 0x04bda12f684bda14, 0x04a7904a7904a792, 0x0492492492492494, 0x047dc11f7047dc13, 0x0469ee58469ee586, 0x0456c797dd49c343, 0x0444444444444446, 0x04325c53ef368eb2, 0x042108421084210a, 0x0410410410410412, 0x0400000000000001, }; #endif #endif OpenBLAS-0.2.20/driver/others/dynamic.c000066400000000000000000000424271313527062700175440ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include "common.h" #ifdef ARCH_X86 #define EXTERN extern #else #define EXTERN #endif EXTERN gotoblas_t gotoblas_KATMAI; EXTERN gotoblas_t gotoblas_COPPERMINE; EXTERN gotoblas_t gotoblas_NORTHWOOD; EXTERN gotoblas_t gotoblas_BANIAS; EXTERN gotoblas_t gotoblas_ATHLON; extern gotoblas_t gotoblas_PRESCOTT; extern gotoblas_t gotoblas_ATOM; extern gotoblas_t gotoblas_NANO; extern gotoblas_t gotoblas_CORE2; extern gotoblas_t gotoblas_PENRYN; extern gotoblas_t gotoblas_DUNNINGTON; extern gotoblas_t gotoblas_NEHALEM; extern gotoblas_t gotoblas_OPTERON; extern gotoblas_t gotoblas_OPTERON_SSE3; extern gotoblas_t gotoblas_BARCELONA; extern gotoblas_t gotoblas_BOBCAT; #ifndef NO_AVX extern gotoblas_t gotoblas_SANDYBRIDGE; extern gotoblas_t gotoblas_BULLDOZER; extern gotoblas_t gotoblas_PILEDRIVER; extern gotoblas_t gotoblas_STEAMROLLER; extern gotoblas_t gotoblas_EXCAVATOR; #ifdef NO_AVX2 #define gotoblas_HASWELL gotoblas_SANDYBRIDGE #define gotoblas_ZEN gotoblas_SANDYBRIDGE #else extern gotoblas_t gotoblas_HASWELL; extern gotoblas_t gotoblas_ZEN; #endif #else //Use NEHALEM kernels for sandy bridge #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM #define gotoblas_HASWELL gotoblas_NEHALEM #define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA #define gotoblas_STEAMROLLER gotoblas_BARCELONA #define gotoblas_EXCAVATOR gotoblas_BARCELONA #define gotoblas_ZEN gotoblas_BARCELONA #endif #define VENDOR_INTEL 1 #define VENDOR_AMD 2 #define VENDOR_CENTAUR 3 #define VENDOR_UNKNOWN 99 #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #ifndef NO_AVX static inline void xgetbv(int op, int * eax, int * edx){ //Use binary code for xgetbv __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); } #endif int support_avx(){ #ifndef NO_AVX int eax, ebx, ecx, edx; int ret=0; cpuid(1, &eax, &ebx, &ecx, &edx); if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){ xgetbv(0, &eax, &edx); if((eax & 6) == 6){ ret=1; //OS support AVX } } return ret; #else return 0; #endif } extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" #define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. 
OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" static int get_vendor(void){ int eax, ebx, ecx, edx; union { char vchar[16]; int vint[4]; } vendor; cpuid(0, &eax, &ebx, &ecx, &edx); *(&vendor.vint[0]) = ebx; *(&vendor.vint[1]) = edx; *(&vendor.vint[2]) = ecx; vendor.vchar[12] = '\0'; if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; return VENDOR_UNKNOWN; } static gotoblas_t *get_coretype(void){ int eax, ebx, ecx, edx; int family, exfamily, model, vendor, exmodel; cpuid(1, &eax, &ebx, &ecx, &edx); family = BITMASK(eax, 8, 0x0f); exfamily = BITMASK(eax, 20, 0xff); model = BITMASK(eax, 4, 0x0f); exmodel = BITMASK(eax, 16, 0x0f); vendor = get_vendor(); if (vendor == VENDOR_INTEL){ switch (family) { case 0x6: switch (exmodel) { case 0: if (model <= 0x7) return &gotoblas_KATMAI; if ((model == 0x8) || (model == 0xa) || (model == 0xb)) return &gotoblas_COPPERMINE; if ((model == 0x9) || (model == 0xd)) return &gotoblas_BANIAS; if (model == 14) return &gotoblas_BANIAS; if (model == 15) return &gotoblas_CORE2; return NULL; case 1: if (model == 6) return &gotoblas_CORE2; if (model == 7) return &gotoblas_PENRYN; if (model == 13) return &gotoblas_DUNNINGTON; if ((model == 10) || (model == 11) || (model == 14) || (model == 15)) return &gotoblas_NEHALEM; if (model == 12) return &gotoblas_ATOM; return NULL; case 2: //Intel Core (Clarkdale) / Core (Arrandale) // Pentium (Clarkdale) / Pentium Mobile (Arrandale) // Xeon (Clarkdale), 32nm if (model == 5) return &gotoblas_NEHALEM; //Intel Xeon Processor 5600 (Westmere-EP) //Xeon Processor E7 (Westmere-EX) //Xeon E7540 if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM; //Intel Core i5-2000 /i7-2000 (Sandy Bridge) //Intel Core i7-3000 / Xeon E5 if (model == 10 || model == 13) { if(support_avx()) return &gotoblas_SANDYBRIDGE; else{ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } return NULL; case 3: //Intel Sandy Bridge 22nm (Ivy Bridge?) if (model == 10 || model == 14) { if(support_avx()) return &gotoblas_SANDYBRIDGE; else{ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Haswell if (model == 12 || model == 15) { if(support_avx()) return &gotoblas_HASWELL; else{ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Broadwell if (model == 13) { if(support_avx()) return &gotoblas_HASWELL; else{ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } if (model == 7) return &gotoblas_ATOM; //Bay Trail return NULL; case 4: //Intel Haswell if (model == 5 || model == 6) { if(support_avx()) return &gotoblas_HASWELL; else{ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Broadwell if (model == 7 || model == 15) { if(support_avx()) return &gotoblas_HASWELL; else{ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
} } //Intel Skylake if (model == 14) { if(support_avx()) return &gotoblas_HASWELL; else{ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Braswell / Avoton if (model == 12 || model == 13) { return &gotoblas_NEHALEM; } return NULL; case 5: //Intel Broadwell if (model == 6) { if(support_avx()) return &gotoblas_HASWELL; else{ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Skylake if (model == 14 || model == 5) { if(support_avx()) return &gotoblas_HASWELL; else{ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Intel Phi Knights Landing if (model == 7) { if(support_avx()) return &gotoblas_HASWELL; else{ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } //Apollo Lake if (model == 12) { return &gotoblas_NEHALEM; } return NULL; case 9: case 8: if (model == 14 ) { // Kaby Lake if(support_avx()) return &gotoblas_HASWELL; else{ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } return NULL; } case 0xf: if (model <= 0x2) return &gotoblas_NORTHWOOD; return &gotoblas_PRESCOTT; } } if (vendor == VENDOR_AMD){ if (family <= 0xe) { // Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon cpuid(0x80000000, &eax, &ebx, &ecx, &edx); if ( (eax & 0xffff) >= 0x01) { cpuid(0x80000001, &eax, &ebx, &ecx, &edx); if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0) return NULL; } else return NULL; return &gotoblas_ATHLON; } if (family == 0xf){ if ((exfamily == 0) || (exfamily == 2)) { if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; else return &gotoblas_OPTERON; } else if (exfamily == 5) { return &gotoblas_BOBCAT; } else if (exfamily == 6) { if(model == 1){ //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series if(support_avx()) return &gotoblas_BULLDOZER; else{ openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } }else if(model == 2 || model == 3){ //AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300 if(support_avx()) return &gotoblas_PILEDRIVER; else{ openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } }else if(model == 5){ if(support_avx()) return &gotoblas_EXCAVATOR; else{ openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } }else if(model == 0 || model == 8){ if (exmodel == 1) { //AMD Trinity if(support_avx()) return &gotoblas_PILEDRIVER; else{ openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } }else if (exmodel == 3) { //AMD STEAMROLLER if(support_avx()) return &gotoblas_STEAMROLLER; else{ openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } }else if (exmodel == 6) { if(support_avx()) return &gotoblas_EXCAVATOR; else{ openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. 
} } } } else if (exfamily == 8) { if (model == 1) { if(support_avx()) return &gotoblas_ZEN; else{ openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } } }else { return &gotoblas_BARCELONA; } } } if (vendor == VENDOR_CENTAUR) { switch (family) { case 0x6: return &gotoblas_NANO; } } return NULL; } static char *corename[] = { "Unknown", "Katmai", "Coppermine", "Northwood", "Prescott", "Banias", "Atom", "Core2", "Penryn", "Dunnington", "Nehalem", "Athlon", "Opteron", "Opteron_SSE3", "Barcelona", "Nano", "Sandybridge", "Bobcat", "Bulldozer", "Piledriver", "Haswell", "Steamroller", "Excavator", "Zen" }; char *gotoblas_corename(void) { if (gotoblas == &gotoblas_KATMAI) return corename[ 1]; if (gotoblas == &gotoblas_COPPERMINE) return corename[ 2]; if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3]; if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4]; if (gotoblas == &gotoblas_BANIAS) return corename[ 5]; if (gotoblas == &gotoblas_ATOM) return corename[ 6]; if (gotoblas == &gotoblas_CORE2) return corename[ 7]; if (gotoblas == &gotoblas_PENRYN) return corename[ 8]; if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9]; if (gotoblas == &gotoblas_NEHALEM) return corename[10]; if (gotoblas == &gotoblas_ATHLON) return corename[11]; if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12]; if (gotoblas == &gotoblas_OPTERON) return corename[13]; if (gotoblas == &gotoblas_BARCELONA) return corename[14]; if (gotoblas == &gotoblas_NANO) return corename[15]; if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; if (gotoblas == &gotoblas_BOBCAT) return corename[17]; if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; if (gotoblas == &gotoblas_HASWELL) return corename[20]; if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; if (gotoblas == &gotoblas_EXCAVATOR) return corename[22]; if (gotoblas == &gotoblas_ZEN) return corename[23]; return corename[0]; } static gotoblas_t *force_coretype(char *coretype){ int i ; int found = -1; char message[128]; //char mname[20]; for ( i=1 ; i <= 23; i++) { if (!strncasecmp(coretype,corename[i],20)) { found = i; break; } } if (found < 0) { //strncpy(mname,coretype,20); snprintf(message, 128, "Core not found: %s\n",coretype); openblas_warning(1, message); return(NULL); } switch (found) { case 23: return (&gotoblas_ZEN); case 22: return (&gotoblas_EXCAVATOR); case 21: return (&gotoblas_STEAMROLLER); case 20: return (&gotoblas_HASWELL); case 19: return (&gotoblas_PILEDRIVER); case 18: return (&gotoblas_BULLDOZER); case 17: return (&gotoblas_BOBCAT); case 16: return (&gotoblas_SANDYBRIDGE); case 15: return (&gotoblas_NANO); case 14: return (&gotoblas_BARCELONA); case 13: return (&gotoblas_OPTERON); case 12: return (&gotoblas_OPTERON_SSE3); case 11: return (&gotoblas_ATHLON); case 10: return (&gotoblas_NEHALEM); case 9: return (&gotoblas_DUNNINGTON); case 8: return (&gotoblas_PENRYN); case 7: return (&gotoblas_CORE2); case 6: return (&gotoblas_ATOM); case 5: return (&gotoblas_BANIAS); case 4: return (&gotoblas_PRESCOTT); case 3: return (&gotoblas_NORTHWOOD); case 2: return (&gotoblas_COPPERMINE); case 1: return (&gotoblas_KATMAI); } return(NULL); } void gotoblas_dynamic_init(void) { char coremsg[128]; char coren[22]; char *p; if (gotoblas) return; p = getenv("OPENBLAS_CORETYPE"); if ( p ) { gotoblas = force_coretype(p); } else { gotoblas = get_coretype(); } #ifdef ARCH_X86 if (gotoblas == NULL) gotoblas = 
&gotoblas_KATMAI; #else if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT; /* sanity check, if 64bit pointer we can't have a 32 bit cpu */ if (sizeof(void*) == 8) { if (gotoblas == &gotoblas_KATMAI || gotoblas == &gotoblas_COPPERMINE || gotoblas == &gotoblas_NORTHWOOD || gotoblas == &gotoblas_BANIAS || gotoblas == &gotoblas_ATHLON) gotoblas = &gotoblas_PRESCOTT; } #endif if (gotoblas && gotoblas -> init) { strncpy(coren,gotoblas_corename(),20); sprintf(coremsg, "Core: %s\n",coren); openblas_warning(2, coremsg); gotoblas -> init(); } else { openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); exit(1); } } void gotoblas_dynamic_quit(void) { gotoblas = NULL; } OpenBLAS-0.2.20/driver/others/init.c000066400000000000000000000604101313527062700170530ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include "common.h" #if defined(OS_LINUX) && defined(SMP) #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #if defined(BIGNUMA) // max number of nodes as defined in numa.h // max cpus as defined in sched.h #define MAX_NODES 128 #define MAX_CPUS CPU_SETSIZE #else #define MAX_NODES 16 #define MAX_CPUS 256 #endif #define NCPUBITS (8*sizeof(unsigned long)) #define MAX_BITMASK_LEN (MAX_CPUS/NCPUBITS) #define CPUELT(cpu) ((cpu) / NCPUBITS) #define CPUMASK(cpu) ((unsigned long) 1UL << ((cpu) % NCPUBITS)) #define SH_MAGIC 0x510510 #define CPUMAP_NAME "/sys/devices/system/node/node%d/cpumap" #define SHARE_NAME "/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_map" #define NODE_DIR "/sys/devices/system/node" //#undef DEBUG /* Private variables */ typedef struct { unsigned long lock; unsigned int magic; unsigned int shmid; int num_nodes; int num_procs; int final_num_procs; unsigned long avail [MAX_BITMASK_LEN]; int avail_count; unsigned long cpu_info [MAX_CPUS]; unsigned long node_info [MAX_NODES][MAX_BITMASK_LEN]; int cpu_use[MAX_CPUS]; } shm_t; static cpu_set_t cpu_orig_mask[4]; static int cpu_mapping[MAX_CPUS]; static int node_mapping[MAX_CPUS * 4]; static int cpu_sub_mapping[MAX_CPUS]; static int disable_mapping; /* Number of cores per nodes */ static int node_cpu[MAX_NODES]; static int node_equal = 0; static shm_t *common = (void *)-1; static int shmid, pshmid; static void *paddr; static unsigned long lprocmask[MAX_BITMASK_LEN], lnodemask; static int lprocmask_count = 0; static int numprocs = 1; static int numnodes = 1; #if 1 #define READ_CPU(x) ( (x) & 0xff) #define READ_NODE(x) (((x) >> 8) & 0xff) #define READ_CORE(x) (((x) >> 16) & 0xff) #define WRITE_CPU(x) (x) #define WRITE_NODE(x) ((x) << 8) #define WRITE_CORE(x) ((x) << 16) #else #define READ_CPU(x) ( (x) & 0xff) #define READ_CORE(x) (((x) >> 8) & 0xff) #define READ_NODE(x) (((x) >> 16) & 0xff) #define WRITE_CPU(x) (x) #define WRITE_CORE(x) ((x) << 8) #define WRITE_NODE(x) ((x) << 16) #endif static inline int popcount(unsigned long number) { int count = 0; while (number > 0) { if (number & 1) count ++; number >>= 1; } return count; } static inline int rcount(unsigned long number) { int count = -1; while ((number > 0) && ((number & 0)) == 0) { count ++; number >>= 1; } return count; } /*** Known issue: The number of CPUs/cores should less than sizeof(unsigned long). On 64 bits, the limit is 64. On 32 bits, it is 32. 
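    In this version the avail / node_info bitmasks are arrays of
    MAX_BITMASK_LEN words, and CPUELT() / CPUMASK() map a cpu index onto them:
    CPUELT(cpu) selects the word and CPUMASK(cpu) the bit within it. Worked
    example, assuming a 64-bit unsigned long: cpu 70 lands in word
    CPUELT(70) = 70 / 64 = 1, at bit 70 % 64 = 6, i.e. mask 0x40.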
***/ static inline void get_cpumap(int node, unsigned long * node_info) { int infile; unsigned long affinity[32]; char name[160]; char cpumap[160]; char *dummy; int i=0; int count=0; int k=0; sprintf(name, CPUMAP_NAME, node); infile = open(name, O_RDONLY); for(i=0; i<32; i++){ affinity[i] = 0; } if (infile != -1) { read(infile, cpumap, sizeof(cpumap)); for(i=0; i<160; i++){ if(cpumap[i] == '\n') break; if(cpumap[i] != ','){ name[k++]=cpumap[i]; //Enough data for Hex if(k >= NCPUBITS/4){ affinity[count++] = strtoul(name, &dummy, 16); k=0; } } } if(k!=0){ name[k]='\0'; affinity[count++] = strtoul(name, &dummy, 16); k=0; } // 0-63bit -> node_info[0], 64-128bit -> node_info[1] .... // revert the sequence for(i=0; i= NCPUBITS/4){ affinity[count++] = strtoul(name, &dummy, 16); k=0; } } } if(k!=0){ name[k]='\0'; affinity[count++] = strtoul(name, &dummy, 16); k=0; } // 0-63bit -> node_info[0], 64-128bit -> node_info[1] .... // revert the sequence for(i=0; i num_nodes = 0; dp = opendir(NODE_DIR); if (dp == NULL) { common -> num_nodes = 1; return 0; } for (node = 0; node < MAX_NODES; node ++) { for (j = 0; j node_info[node][j] = 0; } while ((dir = readdir(dp)) != NULL) { if (strncmp(dir->d_name, "node", 4)==0) { node = atoi(&dir -> d_name[4]); if (node > MAX_NODES) { fprintf(stderr, "\nOpenBLAS Warning : MAX_NODES (NUMA) is too small. Terminated.\n"); exit(1); } common -> num_nodes ++; get_cpumap(node, common->node_info[node]); } } closedir(dp); if (common -> num_nodes == 1) return 1; #ifdef DEBUG fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes); for (node = 0; node < common -> num_nodes; node ++) fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node][0]); #endif return common -> num_nodes; } #if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2, 6) int sched_getcpu(void) { int cpu; FILE *fp = NULL; if ( (fp = fopen("/proc/self/stat", "r")) == NULL) return -1; if ( fscanf( fp, "%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%d", &cpu) != 1) { fclose (fp); return -1; } fclose (fp); return(cpu); } #endif #endif static void numa_mapping(void) { int node, cpu, core; int i, j, h; unsigned long work, bit; int count = 0; int bitmask_idx = 0; int current_cpu; int current_node = 0; int cpu_count = 0; for (node = 0; node < common -> num_nodes; node ++) { core = 0; for (cpu = 0; cpu < common -> num_procs; cpu ++) { bitmask_idx = CPUELT(cpu); if (common -> node_info[node][bitmask_idx] & common -> avail[bitmask_idx] & CPUMASK(cpu)) { common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu); count ++; core ++; } } } #ifdef DEBUG fprintf(stderr, "\nFrom /sys ...\n\n"); for (cpu = 0; cpu < count; cpu++) fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]); #endif current_cpu = sched_getcpu(); for (cpu = 0; cpu < count; cpu++) { if (READ_CPU(common -> cpu_info[cpu]) == current_cpu) { current_node = READ_NODE(common -> cpu_info[cpu]); break; } } for (i = 0; i < MAX_BITMASK_LEN; i++) cpu_count += popcount(common -> node_info[current_node][i] & common -> avail[i]); /* * If all the processes can be accommodated in the * in the current node itself, then bind to cores * from the current node only */ if (numprocs <= cpu_count) { /* * First sort all the cores in order from the current node. * Then take remaining nodes one by one in order, * and sort their cores in order. 
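 *
 *    Concretely, the nested loops below are a bubble sort over cpu_info[]
 *    keyed on (node, core), with entries from current_node treated as the
 *    smallest node, so the calling thread's node is filled first.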
*/ for (i = 0; i < count; i++) { for (j = 0; j < count - 1; j++) { int node_1, node_2; int core_1, core_2; int swap = 0; node_1 = READ_NODE(common -> cpu_info[j]); node_2 = READ_NODE(common -> cpu_info[j + 1]); core_1 = READ_CORE(common -> cpu_info[j]); core_2 = READ_CORE(common -> cpu_info[j + 1]); if (node_1 == node_2) { if (core_1 > core_2) swap = 1; } else { if ((node_2 == current_node) || ((node_1 != current_node) && (node_1 > node_2))) swap = 1; } if (swap) { unsigned long temp; temp = common->cpu_info[j]; common->cpu_info[j] = common->cpu_info[j + 1]; common->cpu_info[j + 1] = temp; } } } } else { h = 1; while (h < count) h = 2 * h + 1; while (h > 1) { h /= 2; for (i = h; i < count; i++) { work = common -> cpu_info[i]; bit = CPU_ISSET(i, &cpu_orig_mask[0]); j = i - h; while (work < common -> cpu_info[j]) { common -> cpu_info[j + h] = common -> cpu_info[j]; if (CPU_ISSET(j, &cpu_orig_mask[0])) { CPU_SET(j + h, &cpu_orig_mask[0]); } else { CPU_CLR(j + h, &cpu_orig_mask[0]); } j -= h; if (j < 0) break; } common -> cpu_info[j + h] = work; if (bit) { CPU_SET(j + h, &cpu_orig_mask[0]); } else { CPU_CLR(j + h, &cpu_orig_mask[0]); } } } } #ifdef DEBUG fprintf(stderr, "\nSorting ...\n\n"); for (cpu = 0; cpu < count; cpu++) fprintf(stderr, "CPUINFO (%2d) : %08lx (CPU=%3lu CORE=%3lu NODE=%3lu)\n", cpu, common -> cpu_info[cpu], READ_CPU(common -> cpu_info[cpu]), READ_CORE(common -> cpu_info[cpu]), READ_NODE(common -> cpu_info[cpu])); #endif } static void disable_hyperthread(void) { unsigned long share[MAX_BITMASK_LEN]; int cpu; int bitmask_idx = 0; int i=0, count=0; bitmask_idx = CPUELT(common -> num_procs); for(i=0; i< bitmask_idx; i++){ common -> avail[count++] = 0xFFFFFFFFFFFFFFFFUL; } if(CPUMASK(common -> num_procs) != 1){ common -> avail[count++] = CPUMASK(common -> num_procs) - 1; } common -> avail_count = count; /* if(common->num_procs > 64){ */ /* fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); */ /* exit(1); */ /* }else if(common->num_procs == 64){ */ /* common -> avail = 0xFFFFFFFFFFFFFFFFUL; */ /* }else */ /* common -> avail = (1UL << common -> num_procs) - 1; */ #ifdef DEBUG fprintf(stderr, "\nAvail CPUs : "); for(i=0; i avail[i]); fprintf(stderr, ".\n"); #endif for (cpu = 0; cpu < common -> num_procs; cpu ++) { get_share(cpu, 1, share); //When the shared cpu are in different element of share & avail array, this may be a bug. for (i = 0; i < count ; i++){ share[i] &= common->avail[i]; if (popcount(share[i]) > 1) { #ifdef DEBUG fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", cpu, share[i] & ~(CPUMASK(cpu))); #endif common -> avail[i] &= ~((share[i] & ~ CPUMASK(cpu))); } } } } static void disable_affinity(void) { int i=0; int bitmask_idx=0; int count=0; #ifdef DEBUG fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail[0]); fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]); #endif /* if(common->final_num_procs > 64){ */ /* fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). 
Terminated.\n", common->final_num_procs); */ /* exit(1); */ /* }else if(common->final_num_procs == 64){ */ /* lprocmask = 0xFFFFFFFFFFFFFFFFUL; */ /* }else */ /* lprocmask = (1UL << common -> final_num_procs) - 1; */ bitmask_idx = CPUELT(common -> final_num_procs); for(i=0; i< bitmask_idx; i++){ lprocmask[count++] = 0xFFFFFFFFFFFFFFFFUL; } if(CPUMASK(common -> final_num_procs) != 1){ lprocmask[count++] = CPUMASK(common -> final_num_procs) - 1; } lprocmask_count = count; #ifndef USE_OPENMP for(i=0; i< count; i++){ lprocmask[i] &= common->avail[i]; } #endif #ifdef DEBUG fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask[0]); #endif } static void setup_mempolicy(void) { int cpu, mynode, maxcpu; for (cpu = 0; cpu < MAX_NODES; cpu ++) node_cpu[cpu] = 0; maxcpu = 0; for (cpu = 0; cpu < numprocs; cpu ++) { mynode = READ_NODE(common -> cpu_info[cpu_sub_mapping[cpu]]); lnodemask |= (1UL << mynode); node_cpu[mynode] ++; if (maxcpu < node_cpu[mynode]) maxcpu = node_cpu[mynode]; } node_equal = 1; for (cpu = 0; cpu < MAX_NODES; cpu ++) if ((node_cpu[cpu] != 0) && (node_cpu[cpu] != maxcpu)) node_equal = 0; if (lnodemask) { #ifdef DEBUG fprintf(stderr, "Node mask = %lx\n", lnodemask); #endif my_set_mempolicy(MPOL_INTERLEAVE, &lnodemask, sizeof(lnodemask) * 8); numnodes = popcount(lnodemask); } } static inline int is_dead(int id) { struct shmid_ds ds; return shmctl(id, IPC_STAT, &ds); } static void open_shmem(void) { int try = 0; do { #if defined(BIGNUMA) // raised to 32768, enough for 128 nodes and 1024 cups shmid = shmget(SH_MAGIC, 32768, 0666); #else shmid = shmget(SH_MAGIC, 4096, 0666); #endif if (shmid == -1) { #if defined(BIGNUMA) shmid = shmget(SH_MAGIC, 32768, IPC_CREAT | 0666); #else shmid = shmget(SH_MAGIC, 4096, IPC_CREAT | 0666); #endif } try ++; } while ((try < 10) && (shmid == -1)); if (shmid == -1) { fprintf(stderr, "GotoBLAS : Can't open shared memory. 
Terminated.\n"); exit(1); } if (shmid != -1) common = (shm_t *)shmat(shmid, NULL, 0); #ifdef DEBUG fprintf(stderr, "Shared Memory id = %x Address = %p\n", shmid, common); #endif } static void create_pshmem(void) { pshmid = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666); paddr = shmat(pshmid, NULL, 0); shmctl(pshmid, IPC_RMID, 0); #ifdef DEBUG fprintf(stderr, "Private Shared Memory id = %x Address = %p\n", pshmid, paddr); #endif } static void local_cpu_map(void) { int cpu, id, mapping; int bitmask_idx = 0; cpu = 0; mapping = 0; do { id = common -> cpu_use[cpu]; if (id > 0) { if (is_dead(id)) common -> cpu_use[cpu] = 0; } bitmask_idx = CPUELT(cpu); if ((common -> cpu_use[cpu] == 0) && (lprocmask[bitmask_idx] & CPUMASK(cpu))) { common -> cpu_use[cpu] = pshmid; cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]); cpu_sub_mapping[mapping] = cpu; mapping ++; } cpu ++; } while ((mapping < numprocs) && (cpu < common -> final_num_procs)); disable_mapping = 0; if ((mapping < numprocs) || (numprocs == 1)) { for (cpu = 0; cpu < common -> final_num_procs; cpu ++) { if (common -> cpu_use[cpu] == pshmid) common -> cpu_use[cpu] = 0; } disable_mapping = 1; } #ifdef DEBUG for (cpu = 0; cpu < numprocs; cpu ++) { fprintf(stderr, "Local Mapping : %2d --> %2d (%2d)\n", cpu, cpu_mapping[cpu], cpu_sub_mapping[cpu]); } #endif } /* Public Functions */ int get_num_procs(void) { return numprocs; } int get_num_nodes(void) { return numnodes; } int get_node_equal(void) { return (((blas_cpu_number % numnodes) == 0) && node_equal); } int gotoblas_set_affinity(int pos) { cpu_set_t cpu_mask; int mynode = 1; /* if number of threads is larger than inital condition */ if (pos < 0) { sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); return 0; } if (!disable_mapping) { mynode = READ_NODE(common -> cpu_info[cpu_sub_mapping[pos]]); #ifdef DEBUG fprintf(stderr, "Giving Affinity[%4d %3d] --> %3d My node = %3d\n", getpid(), pos, cpu_mapping[pos], mynode); #endif CPU_ZERO(&cpu_mask); CPU_SET (cpu_mapping[pos], &cpu_mask); sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask); node_mapping[WhereAmI()] = mynode; } return mynode; } int get_node(void) { if (!disable_mapping) return node_mapping[WhereAmI()]; return 1; } static int initialized = 0; void gotoblas_affinity_init(void) { int cpu, num_avail; #ifndef USE_OPENMP cpu_set_t cpu_mask; #endif int i; if (initialized) return; initialized = 1; sched_getaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); #ifdef USE_OPENMP numprocs = 0; #else numprocs = readenv_atoi("OPENBLAS_NUM_THREADS"); if (numprocs == 0) numprocs = readenv_atoi("GOTO_NUM_THREADS"); #endif if (numprocs == 0) numprocs = readenv_atoi("OMP_NUM_THREADS"); numnodes = 1; if (numprocs == 1) { disable_mapping = 1; return; } create_pshmem(); open_shmem(); while ((common -> lock) && (common -> magic != SH_MAGIC)) { if (is_dead(common -> shmid)) { common -> lock = 0; common -> shmid = 0; common -> magic = 0; } else { sched_yield(); } } blas_lock(&common -> lock); if ((common -> shmid) && is_dead(common -> shmid)) common -> magic = 0; common -> shmid = pshmid; if (common -> magic != SH_MAGIC) { cpu_set_t *cpusetp; int nums; int ret; #ifdef DEBUG fprintf(stderr, "Shared Memory Initialization.\n"); #endif //returns the number of processors which are currently online nums = sysconf(_SC_NPROCESSORS_CONF); #if !defined(__GLIBC_PREREQ) || !__GLIBC_PREREQ(2, 3) common->num_procs = nums; #elif __GLIBC_PREREQ(2, 7) cpusetp = CPU_ALLOC(nums); if (cpusetp == NULL) { common->num_procs = nums; } else { size_t size; size = 
CPU_ALLOC_SIZE(nums); ret = sched_getaffinity(0,size,cpusetp); if (ret!=0) common->num_procs = nums; else common->num_procs = CPU_COUNT_S(size,cpusetp); } CPU_FREE(cpusetp); #else ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); if (ret!=0) { common->num_procs = nums; } else { #if !__GLIBC_PREREQ(2, 6) int i; int n = 0; for (i=0;inum_procs = n; } #else common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp); #endif #endif if(common -> num_procs > MAX_CPUS) { fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS); exit(1); } for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu; numa_check(); disable_hyperthread(); if (common -> num_nodes > 1) numa_mapping(); common -> final_num_procs = 0; for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number. for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; common -> magic = SH_MAGIC; } disable_affinity(); num_avail = 0; for(i=0; i num_avail)) numprocs = num_avail; #ifdef DEBUG fprintf(stderr, "Number of threads = %d\n", numprocs); #endif local_cpu_map(); blas_unlock(&common -> lock); #ifndef USE_OPENMP if (!disable_mapping) { #ifdef DEBUG fprintf(stderr, "Giving Affinity[%3d] --> %3d\n", 0, cpu_mapping[0]); #endif CPU_ZERO(&cpu_mask); CPU_SET (cpu_mapping[0], &cpu_mask); sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask); node_mapping[WhereAmI()] = READ_NODE(common -> cpu_info[cpu_sub_mapping[0]]); setup_mempolicy(); if (readenv_atoi("OPENBLAS_MAIN_FREE") || readenv_atoi("GOTOBLAS_MAIN_FREE")) { sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); } } #endif #ifdef DEBUG fprintf(stderr, "Initialization is done.\n"); #endif } void gotoblas_affinity_quit(void) { int i; struct shmid_ds ds; #ifdef DEBUG fprintf(stderr, "Terminating ..\n"); #endif if ((numprocs == 1) || (initialized == 0)) return; if (!disable_mapping) { blas_lock(&common -> lock); for (i = 0; i < numprocs; i ++) common -> cpu_use[cpu_mapping[i]] = -1; blas_unlock(&common -> lock); } shmctl(shmid, IPC_STAT, &ds); if (ds.shm_nattch == 1) shmctl(shmid, IPC_RMID, 0); shmdt(common); shmdt(paddr); initialized = 0; } #else void gotoblas_affinity_init(void) {}; void gotoblas_set_affinity(int threads) {}; void gotoblas_set_affinity2(int threads) {}; void gotoblas_affinity_reschedule(void) {}; int get_num_procs(void) { return sysconf(_SC_NPROCESSORS_CONF); } int get_num_nodes(void) { return 1; } int get_node(void) { return 1;} #endif OpenBLAS-0.2.20/driver/others/lamc3.c000066400000000000000000000053331313527062700171120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include "common.h" #ifdef NEED_F2CCONV double #else FLOAT #endif NAME(FLOAT *a, FLOAT *b){ return *a + *b; } OpenBLAS-0.2.20/driver/others/lamch.c000066400000000000000000000136561313527062700172060ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include "common.h" #if 0 static FLOAT hdata[] __attribute__((aligned(128))) = { #ifdef XDOUBLE +0x1.0000000000000000P-00064L, +0x1.0000000000000000P-16382L, +0x1.0000000000000000P+00001L, +0x1.0000000000000000P-00063L, +0x1.0000000000000000P+00006L, +0x1.0000000000000000P+00000L, -0x1.ffe8000000000000P+00013L, +0x1.0000000000000000P-16382L, +0x1.0004000000000000P+00014L, +0x1.fffffffffffffffeP+16383L, #elif defined DOUBLE +0x1.0000000000000P-0053, +0x1.0000000000000P-1022, +0x1.0000000000000P+0001, +0x1.0000000000000P-0052, +0x1.a800000000000P+0005, +0x1.0000000000000P+0000, -0x1.fe80000000000P+0009, +0x1.0000000000000P-1022, +0x1.0000000000000P+0010, +0x1.fffffffffffffP+1023, #else +0x1.000000P-024f, +0x1.000000P-126f, +0x1.000000P+001f, +0x1.000000P-023f, +0x1.800000P+004f, +0x1.000000P+000f, -0x1.f40000P+006f, +0x1.000000P-126f, +0x1.000000P+007f, +0x1.fffffeP+127f, #endif }; #endif static unsigned int idata[] __attribute__((aligned(128))) = { #if defined XDOUBLE #ifndef __BIG_ENDIAN__ 0x00000000, 0x80000000, 0x00003fbf, 0x00000000, 0x00000000, 0x80000000, 0x00000001, 0x00000000, 0x00000000, 0x80000000, 0x00004000, 0x00000000, 0x00000000, 0x80000000, 0x00003fc0, 0x00000000, 0x00000000, 0x80000000, 0x00004005, 0x00000000, 0x00000000, 0x80000000, 0x00003fff, 0x00000000, 0x00000000, 0xff400000, 0x0000c00c, 0x00000000, 0x00000000, 0x80000000, 0x00000001, 0x00000000, 0x00000000, 0x80200000, 0x0000400d, 0x00000000, 0xffffffff, 0xffffffff, 0x00007ffe, 0x00000000, #else 0x00000000, 0x00003fbf, 0x80000000, 0x00000000, 0x00000000, 0x00000001, 0x80000000, 0x00000000, 0x00000000, 0x00004000, 0x80000000, 0x00000000, 0x00000000, 0x00003fc0, 0x80000000, 0x00000000, 0x00000000, 0x00004005, 0x80000000, 0x00000000, 0x00000000, 0x00003fff, 0x80000000, 0x00000000, 0x00000000, 0x0000c00c, 0xff400000, 0x00000000, 0x00000000, 0x00000001, 0x80000000, 0x00000000, 0x00000000, 0x0000400d, 0x80200000, 0x00000000, 0x00000000, 0x00007ffe, 0xffffffff, 0xffffffff, #endif #elif defined DOUBLE #ifndef __BIG_ENDIAN__ 0x00000000, 0x3ca00000, 0x00000000, 0x00100000, 0x00000000, 0x40000000, 0x00000000, 0x3cb00000, 0x00000000, 0x404a8000, 0x00000000, 0x3ff00000, 0x00000000, 0xc08fe800, 0x00000000, 0x00100000, 0x00000000, 0x40900000, 0xffffffff, 0x7fefffff, #else 0x3ca00000, 0x00000000, 0x00100000, 0x00000000, 0x40000000, 0x00000000, 0x3cb00000, 0x00000000, 0x404a8000, 0x00000000, 0x3ff00000, 0x00000000, 0xc08fe800, 0x00000000, 0x00100000, 0x00000000, 0x40900000, 0x00000000, 0x7fefffff, 0xffffffff, #endif #else 0x33800000, 0x00800000, 0x40000000, 0x34000000, 0x41c00000, 0x3f800000, 0xc2fa0000, 0x00800000, 0x43000000, 0x7f7fffff, #endif }; #ifdef NEED_F2CCONV double #else FLOAT #endif NAME(char *P){ char p = *P; int pos; FLOAT *hdata = (FLOAT *)idata; TOUPPER(p); switch (p) { case 'E': pos = 0; break; case 'S': pos = 1; break; case 'B': pos = 2; break; case 'P': pos = 3; break; case 'N': pos = 4; break; case 'R': pos = 5; break; case 'M': pos = 6; break; case 'U': pos = 7; break; case 'L': pos = 8; break; case 'O': pos = 9; break; default: pos = 0; break; } return hdata[pos]; } OpenBLAS-0.2.20/driver/others/lsame.c000066400000000000000000000053751313527062700172220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include int NAME(char *A, char *B){ char a = *A; char b = *B; if (a > 96) a -= 32; if (b > 96) b -= 32; return (a == b); } OpenBLAS-0.2.20/driver/others/memory.c000066400000000000000000001050441313527062700174230ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ //#undef DEBUG #include "common.h" #include #ifdef OS_WINDOWS #define ALLOC_WINDOWS #ifndef MEM_LARGE_PAGES #define MEM_LARGE_PAGES 0x20000000 #endif #else #define ALLOC_MMAP #define ALLOC_MALLOC #endif #include #include #include #ifndef OS_WINDOWS #include #ifndef NO_SYSV_IPC #include #endif #include #endif #include #ifdef OS_LINUX #include #include #include #include #include #include #include #endif #if defined(OS_FREEBSD) || defined(OS_DARWIN) #include #include #endif #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__)) #include #undef printf #define printf _cprintf #endif #ifdef OS_LINUX #ifndef MPOL_PREFERRED #define MPOL_PREFERRED 1 #endif #endif #if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP) #define NO_WARMUP #endif #ifndef SHM_HUGETLB #define SHM_HUGETLB 04000 #endif #ifndef FIXED_PAGESIZE #define FIXED_PAGESIZE 4096 #endif #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #if defined(_MSC_VER) && !defined(__clang__) #define CONSTRUCTOR __cdecl #define DESTRUCTOR __cdecl #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) #define CONSTRUCTOR __attribute__ ((constructor)) #define DESTRUCTOR __attribute__ ((destructor)) #else #define CONSTRUCTOR __attribute__ ((constructor(101))) #define DESTRUCTOR __attribute__ ((destructor(101))) #endif #ifdef DYNAMIC_ARCH gotoblas_t *gotoblas = NULL; #endif extern void openblas_warning(int verbose, const char * msg); #ifndef SMP #define blas_cpu_number 1 #define blas_num_threads 1 /* Dummy Function */ int goto_get_num_procs (void) { return 1;}; void goto_set_num_threads(int num_threads) {}; #else #if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD) #ifndef NO_AFFINITY int get_num_procs(void); #else int get_num_procs(void) { static int nums = 0; cpu_set_t *cpusetp; size_t size; int ret; int i,n; if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); #if !defined(OS_LINUX) return nums; #endif #if !defined(__GLIBC_PREREQ) return nums; #endif #if !__GLIBC_PREREQ(2, 3) return nums; #endif #if !__GLIBC_PREREQ(2, 7) ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); if (ret!=0) return nums; n=0; #if !__GLIBC_PREREQ(2, 6) for (i=0;i 0) blas_num_threads = blas_goto_num; else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER; #ifdef DEBUG printf( "Adjusted number of threads : %3d\n", blas_num_threads); #endif blas_cpu_number = blas_num_threads; return blas_num_threads; } #endif int openblas_get_num_procs(void) { #ifndef SMP return 1; #else return get_num_procs(); #endif } int openblas_get_num_threads(void) { #ifndef SMP return 1; #else // init blas_cpu_number if needed blas_get_cpu_number(); return blas_cpu_number; #endif } struct release_t { void *address; void (*func)(struct release_t *); long attr; }; int hugetlb_allocated = 0; static struct release_t release_info[NUM_BUFFERS]; static int release_pos = 0; #if defined(OS_LINUX) && !defined(NO_WARMUP) static int hot_alloc = 0; #endif /* Global lock for memory allocation */ #if defined(USE_PTHREAD_LOCK) static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER; #elif defined(USE_PTHREAD_SPINLOCK) static pthread_spinlock_t alloc_lock = 0; #else static BLASULONG 
alloc_lock = 0UL; #endif #ifdef ALLOC_MMAP static void alloc_mmap_free(struct release_t *release){ if (munmap(release -> address, BUFFER_SIZE)) { printf("OpenBLAS : munmap failed\n"); } } #ifdef NO_WARMUP static void *alloc_mmap(void *address){ void *map_address; if (address){ map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); } else { map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0); } if (map_address != (void *)-1) { LOCK_COMMAND(&alloc_lock); release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; UNLOCK_COMMAND(&alloc_lock); } #ifdef OS_LINUX my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); #endif return map_address; } #else #define BENCH_ITERATION 4 #define SCALING 2 static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) { BLASULONG original, *p; BLASULONG start, stop, min; int iter, i, count; min = (BLASULONG)-1; original = *(BLASULONG *)(address + size - PAGESIZE); *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address; for (iter = 0; iter < BENCH_ITERATION; iter ++ ) { p = (BLASULONG *)address; count = size / PAGESIZE; start = rpcc(); for (i = 0; i < count; i ++) { p = (BLASULONG *)(*p); } stop = rpcc(); if (min > stop - start) min = stop - start; } *(BLASULONG *)(address + size - PAGESIZE + 0) = original; *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p; return min; } static void *alloc_mmap(void *address){ void *map_address, *best_address; BLASULONG best, start, current; BLASULONG allocsize; if (address){ /* Just give up use advanced operation */ map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); #ifdef OS_LINUX my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); #endif } else { #if defined(OS_LINUX) && !defined(NO_WARMUP) if (hot_alloc == 0) { map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0); #ifdef OS_LINUX my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); #endif } else { #endif map_address = mmap(NULL, BUFFER_SIZE * SCALING, MMAP_ACCESS, MMAP_POLICY, -1, 0); if (map_address != (void *)-1) { #ifdef OS_LINUX #ifdef DEBUG int ret=0; ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); if(ret==-1){ int errsv=errno; perror("OpenBLAS alloc_mmap:"); printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); } #else my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); #endif #endif allocsize = DGEMM_P * DGEMM_Q * sizeof(double); start = (BLASULONG)map_address; current = (SCALING - 1) * BUFFER_SIZE; while(current > 0) { *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; start += PAGESIZE; current -= PAGESIZE; } *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address; start = (BLASULONG)map_address; best = (BLASULONG)-1; best_address = map_address; while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) { current = run_bench(start, allocsize); if (best > current) { best = current; best_address = (void *)start; } start += PAGESIZE; } if ((BLASULONG)best_address > (BLASULONG)map_address) munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address); munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address); map_address = best_address; #if defined(OS_LINUX) && !defined(NO_WARMUP) hot_alloc = 2; #endif } } #if defined(OS_LINUX) && !defined(NO_WARMUP) } #endif 
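/* Record the mapping (under the allocation lock) so blas_shutdown() can release it later through alloc_mmap_free(). */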
LOCK_COMMAND(&alloc_lock); if (map_address != (void *)-1) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; } UNLOCK_COMMAND(&alloc_lock); return map_address; } #endif #endif #ifdef ALLOC_MALLOC static void alloc_malloc_free(struct release_t *release){ free(release -> address); } static void *alloc_malloc(void *address){ void *map_address; map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE); if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_malloc_free; release_pos ++; } return map_address; } #endif #ifdef ALLOC_QALLOC void *qalloc(int flags, size_t bytes); void *qfree (void *address); #define QNONCACHE 0x1 #define QCOMMS 0x2 #define QFAST 0x4 static void alloc_qalloc_free(struct release_t *release){ qfree(release -> address); } static void *alloc_qalloc(void *address){ void *map_address; map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE); if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_qalloc_free; release_pos ++; } return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1)); } #endif #ifdef ALLOC_WINDOWS static void alloc_windows_free(struct release_t *release){ VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT); } static void *alloc_windows(void *address){ void *map_address; map_address = VirtualAlloc(address, BUFFER_SIZE, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_windows_free; release_pos ++; } return map_address; } #endif #ifdef ALLOC_DEVICEDRIVER #ifndef DEVICEDRIVER_NAME #define DEVICEDRIVER_NAME "/dev/mapper" #endif static void alloc_devicedirver_free(struct release_t *release){ if (munmap(release -> address, BUFFER_SIZE)) { printf("OpenBLAS : Bugphysarea unmap failed.\n"); } if (close(release -> attr)) { printf("OpenBLAS : Bugphysarea close failed.\n"); } } static void *alloc_devicedirver(void *address){ int fd; void *map_address; if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) { return (void *)-1; } map_address = mmap(address, BUFFER_SIZE, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0); if (map_address != (void *)-1) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func = alloc_devicedirver_free; release_pos ++; } return map_address; } #endif #ifdef ALLOC_SHM static void alloc_shm_free(struct release_t *release){ if (shmdt(release -> address)) { printf("OpenBLAS : Shared memory unmap failed.\n"); } } static void *alloc_shm(void *address){ void *map_address; int shmid; shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600); map_address = (void *)shmat(shmid, address, 0); if (map_address != (void *)-1){ #ifdef OS_LINUX my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); #endif shmctl(shmid, IPC_RMID, 0); release_info[release_pos].address = map_address; release_info[release_pos].attr = shmid; release_info[release_pos].func = alloc_shm_free; release_pos ++; } return map_address; } #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS static void alloc_hugetlb_free(struct release_t *release){ #if 
defined(OS_LINUX) || defined(OS_AIX) if (shmdt(release -> address)) { printf("OpenBLAS : Hugepage unmap failed.\n"); } #endif #ifdef __sun__ munmap(release -> address, BUFFER_SIZE); #endif #ifdef OS_WINDOWS VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT); #endif } static void *alloc_hugetlb(void *address){ void *map_address = (void *)-1; #if defined(OS_LINUX) || defined(OS_AIX) int shmid; shmid = shmget(IPC_PRIVATE, BUFFER_SIZE, #ifdef OS_LINUX SHM_HUGETLB | #endif #ifdef OS_AIX SHM_LGPAGE | SHM_PIN | #endif IPC_CREAT | SHM_R | SHM_W); if (shmid != -1) { map_address = (void *)shmat(shmid, address, SHM_RND); #ifdef OS_LINUX my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); #endif if (map_address != (void *)-1){ shmctl(shmid, IPC_RMID, 0); } } #endif #ifdef __sun__ struct memcntl_mha mha; mha.mha_cmd = MHA_MAPSIZE_BSSBRK; mha.mha_flags = 0; mha.mha_pagesize = HUGE_PAGESIZE; memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0); map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE); #endif #ifdef OS_WINDOWS HANDLE hToken; TOKEN_PRIVILEGES tp; if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1; tp.PrivilegeCount = 1; tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) { CloseHandle(hToken); return (void*)-1; } if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) { CloseHandle(hToken); return (void*)-1; } map_address = (void *)VirtualAlloc(address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); tp.Privileges[0].Attributes = 0; AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL); if (map_address == (void *)NULL) map_address = (void *)-1; #endif if (map_address != (void *)-1){ release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_hugetlb_free; release_pos ++; } return map_address; } #endif #endif #ifdef ALLOC_HUGETLBFILE static int hugetlb_pid = 0; static void alloc_hugetlbfile_free(struct release_t *release){ if (munmap(release -> address, BUFFER_SIZE)) { printf("OpenBLAS : HugeTLBfs unmap failed.\n"); } if (close(release -> attr)) { printf("OpenBLAS : HugeTLBfs close failed.\n"); } } static void *alloc_hugetlbfile(void *address){ void *map_address = (void *)-1; int fd; char filename[64]; if (!hugetlb_pid) hugetlb_pid = getpid(); sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid); if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) { return (void *)-1; } unlink(filename); map_address = mmap(address, BUFFER_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (map_address != (void *)-1) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func = alloc_hugetlbfile_free; release_pos ++; } return map_address; } #endif #ifdef SEEK_ADDRESS static BLASULONG base_address = 0UL; #else static BLASULONG base_address = BASE_ADDRESS; #endif static volatile struct { BLASULONG lock; void *addr; #if defined(WHEREAMI) && !defined(USE_OPENMP) int pos; #endif int used; #ifndef __64BIT__ char dummy[48]; #else char dummy[40]; #endif } memory[NUM_BUFFERS]; static int memory_initialized = 0; /* Memory allocation routine */ /* procpos ... 
indicates where it comes from */ /* 0 : Level 3 functions */ /* 1 : Level 2 functions */ /* 2 : Thread */ void *blas_memory_alloc(int procpos){ int position; #if defined(WHEREAMI) && !defined(USE_OPENMP) int mypos; #endif void *map_address; void *(*memoryalloc[])(void *address) = { #ifdef ALLOC_DEVICEDRIVER alloc_devicedirver, #endif /* Hugetlb implicitly assumes ALLOC_SHM */ #ifdef ALLOC_SHM alloc_shm, #endif #if ((defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)) alloc_hugetlb, #endif #ifdef ALLOC_MMAP alloc_mmap, #endif #ifdef ALLOC_QALLOC alloc_qalloc, #endif #ifdef ALLOC_WINDOWS alloc_windows, #endif #ifdef ALLOC_MALLOC alloc_malloc, #endif NULL, }; void *(**func)(void *address); LOCK_COMMAND(&alloc_lock); if (!memory_initialized) { #if defined(WHEREAMI) && !defined(USE_OPENMP) for (position = 0; position < NUM_BUFFERS; position ++){ memory[position].addr = (void *)0; memory[position].pos = -1; memory[position].used = 0; memory[position].lock = 0; } #endif #ifdef DYNAMIC_ARCH gotoblas_dynamic_init(); #endif #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) gotoblas_affinity_init(); #endif #ifdef SMP if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); #endif #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) #ifndef DYNAMIC_ARCH blas_set_parameter(); #endif #endif memory_initialized = 1; } UNLOCK_COMMAND(&alloc_lock); #ifdef DEBUG printf("Alloc Start ...\n"); #endif #if defined(WHEREAMI) && !defined(USE_OPENMP) mypos = WhereAmI(); position = mypos; while (position >= NUM_BUFFERS) position >>= 1; do { if (!memory[position].used && (memory[position].pos == mypos)) { blas_lock(&memory[position].lock); if (!memory[position].used) goto allocation; blas_unlock(&memory[position].lock); } position ++; } while (position < NUM_BUFFERS); #endif position = 0; do { /* if (!memory[position].used) { */ blas_lock(&memory[position].lock); if (!memory[position].used) goto allocation; blas_unlock(&memory[position].lock); /* } */ position ++; } while (position < NUM_BUFFERS); goto error; allocation : #ifdef DEBUG printf(" Position -> %d\n", position); #endif memory[position].used = 1; blas_unlock(&memory[position].lock); if (!memory[position].addr) { do { #ifdef DEBUG printf("Allocation Start : %lx\n", base_address); #endif map_address = (void *)-1; func = &memoryalloc[0]; while ((func != NULL) && (map_address == (void *) -1)) { map_address = (*func)((void *)base_address); #ifdef ALLOC_DEVICEDRIVER if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); } #endif #ifdef ALLOC_HUGETLBFILE if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { #ifndef OS_WINDOWS fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n"); #endif } #endif #if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; #endif func ++; } #ifdef DEBUG printf(" Success -> %08lx\n", map_address); #endif if (((BLASLONG) map_address) == -1) base_address = 0UL; if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE; } while ((BLASLONG)map_address == -1); LOCK_COMMAND(&alloc_lock); memory[position].addr = map_address; UNLOCK_COMMAND(&alloc_lock); #ifdef DEBUG printf(" Mapping Succeeded. 
%p(%d)\n", (void *)memory[position].addr, position); #endif } #if defined(WHEREAMI) && !defined(USE_OPENMP) if (memory[position].pos == -1) memory[position].pos = mypos; #endif #ifdef DYNAMIC_ARCH if (memory_initialized == 1) { LOCK_COMMAND(&alloc_lock); if (memory_initialized == 1) { if (!gotoblas) gotoblas_dynamic_init(); memory_initialized = 2; } UNLOCK_COMMAND(&alloc_lock); } #endif #ifdef DEBUG printf("Mapped : %p %3d\n\n", (void *)memory[position].addr, position); #endif return (void *)memory[position].addr; error: printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); return NULL; } void blas_memory_free(void *free_area){ int position; #ifdef DEBUG printf("Unmapped Start : %p ...\n", free_area); #endif position = 0; LOCK_COMMAND(&alloc_lock); while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) position++; if (memory[position].addr != free_area) goto error; #ifdef DEBUG printf(" Position : %d\n", position); #endif // arm: ensure all writes are finished before other thread takes this memory WMB; memory[position].used = 0; UNLOCK_COMMAND(&alloc_lock); #ifdef DEBUG printf("Unmap Succeeded.\n\n"); #endif return; error: printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area); #ifdef DEBUG for (position = 0; position < NUM_BUFFERS; position++) printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); #endif UNLOCK_COMMAND(&alloc_lock); return; } void *blas_memory_alloc_nolock(int unused) { void *map_address; map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE); return map_address; } void blas_memory_free_nolock(void * map_address) { free(map_address); } void blas_shutdown(void){ int pos; #ifdef SMP BLASFUNC(blas_thread_shutdown)(); #endif LOCK_COMMAND(&alloc_lock); for (pos = 0; pos < release_pos; pos ++) { release_info[pos].func(&release_info[pos]); } #ifdef SEEK_ADDRESS base_address = 0UL; #else base_address = BASE_ADDRESS; #endif for (pos = 0; pos < NUM_BUFFERS; pos ++){ memory[pos].addr = (void *)0; memory[pos].used = 0; #if defined(WHEREAMI) && !defined(USE_OPENMP) memory[pos].pos = -1; #endif memory[pos].lock = 0; } UNLOCK_COMMAND(&alloc_lock); return; } #if defined(OS_LINUX) && !defined(NO_WARMUP) #ifdef SMP #if defined(USE_PTHREAD_LOCK) static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER; #elif defined(USE_PTHREAD_SPINLOCK) static pthread_spinlock_t init_lock = 0; #else static BLASULONG init_lock = 0UL; #endif #endif static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, void *sa, void *sb, BLASLONG pos) { #if !defined(ARCH_POWER) && !defined(ARCH_SPARC) size_t size; BLASULONG buffer; size = BUFFER_SIZE - PAGESIZE; buffer = (BLASULONG)sa + GEMM_OFFSET_A; #if defined(OS_LINUX) && !defined(NO_WARMUP) if (hot_alloc != 2) { #endif #ifdef SMP LOCK_COMMAND(&init_lock); #endif while (size > 0) { *(int *)buffer = size; buffer += PAGESIZE; size -= PAGESIZE; } #ifdef SMP UNLOCK_COMMAND(&init_lock); #endif size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE); buffer = (BLASULONG)sa + GEMM_OFFSET_A; while (size > 0) { *(int *)buffer = size; buffer += 64; size -= 64; } #if defined(OS_LINUX) && !defined(NO_WARMUP) } #endif #endif } #ifdef SMP static void _init_thread_memory(void *buffer) { blas_queue_t queue[MAX_CPU_NUMBER]; int num_cpu; for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) { blas_queue_init(&queue[num_cpu]); queue[num_cpu].mode = BLAS_DOUBLE | BLAS_REAL; queue[num_cpu].routine = &_touch_memory; queue[num_cpu].args = NULL; 
queue[num_cpu].next = &queue[num_cpu + 1]; } queue[num_cpu - 1].next = NULL; queue[0].sa = buffer; exec_blas(num_cpu, queue); } #endif static void gotoblas_memory_init(void) { void *buffer; hot_alloc = 1; buffer = (void *)blas_memory_alloc(0); #ifdef SMP if (blas_cpu_number == 0) blas_get_cpu_number(); #ifdef SMP_SERVER if (blas_server_avail == 0) blas_thread_init(); #endif _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A)); #else _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0); #endif blas_memory_free(buffer); } #endif /* Initialization for all function; this function should be called before main */ static int gotoblas_initialized = 0; extern void openblas_read_env(); void CONSTRUCTOR gotoblas_init(void) { if (gotoblas_initialized) return; #ifdef SMP openblas_fork_handler(); #endif openblas_read_env(); #ifdef PROFILE moncontrol (0); #endif #ifdef DYNAMIC_ARCH gotoblas_dynamic_init(); #endif #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) gotoblas_affinity_init(); #endif #if defined(OS_LINUX) && !defined(NO_WARMUP) gotoblas_memory_init(); #endif //#if defined(OS_LINUX) #if 0 struct rlimit curlimit; if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 ) { if ( curlimit.rlim_cur != curlimit.rlim_max ) { curlimit.rlim_cur = curlimit.rlim_max; setrlimit(RLIMIT_STACK, &curlimit); } } #endif #ifdef SMP if (blas_cpu_number == 0) blas_get_cpu_number(); #ifdef SMP_SERVER if (blas_server_avail == 0) blas_thread_init(); #endif #endif #ifdef FUNCTION_PROFILE gotoblas_profile_init(); #endif gotoblas_initialized = 1; #ifdef PROFILE moncontrol (1); #endif } void DESTRUCTOR gotoblas_quit(void) { if (gotoblas_initialized == 0) return; blas_shutdown(); #ifdef PROFILE moncontrol (0); #endif #ifdef FUNCTION_PROFILE gotoblas_profile_quit(); #endif #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) gotoblas_affinity_quit(); #endif #ifdef DYNAMIC_ARCH gotoblas_dynamic_quit(); #endif gotoblas_initialized = 0; #ifdef PROFILE moncontrol (1); #endif } #if defined(_MSC_VER) && !defined(__clang__) BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved) { switch (ul_reason_for_call) { case DLL_PROCESS_ATTACH: gotoblas_init(); break; case DLL_THREAD_ATTACH: break; case DLL_THREAD_DETACH: break; case DLL_PROCESS_DETACH: gotoblas_quit(); break; default: break; } return TRUE; } /* This is to allow static linking. 
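When OpenBLAS is linked statically there is no DllMain invocation, so the entries placed in the .CRT$XLB (TLS callback) and .CRT$XTU (CRT terminator) sections below arrange for the same initialization and cleanup to run instead.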
Code adapted from Google performance tools: https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc Reference: https://sourceware.org/ml/pthreads-win32/2008/msg00028.html http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp */ static int on_process_term(void) { gotoblas_quit(); return 0; } #ifdef _WIN64 #pragma comment(linker, "/INCLUDE:_tls_used") #else #pragma comment(linker, "/INCLUDE:__tls_used") #endif #ifdef _WIN64 #pragma const_seg(".CRT$XLB") #else #pragma data_seg(".CRT$XLB") #endif static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #ifdef _WIN64 #pragma const_seg() #else #pragma data_seg() #endif #ifdef _WIN64 #pragma const_seg(".CRT$XTU") #else #pragma data_seg(".CRT$XTU") #endif static int(*p_process_term)(void) = on_process_term; #ifdef _WIN64 #pragma const_seg() #else #pragma data_seg() #endif #endif #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) /* Don't call me; this is just work around for PGI / Sun bug */ void gotoblas_dummy_for_PGI(void) { gotoblas_init(); gotoblas_quit(); #if 0 asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); #else asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); #endif } #endif OpenBLAS-0.2.20/driver/others/memory_qalloc.c000066400000000000000000000062431313527062700207570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef SMP #define blas_cpu_number 1 #else int blas_cpu_number = 1; int blas_get_cpu_number(void){ return blas_cpu_number; } #endif #define FIXED_PAGESIZE 4096 void *sa = NULL; void *sb = NULL; static double static_buffer[BUFFER_SIZE/sizeof(double)]; void *blas_memory_alloc(int numproc){ if (sa == NULL){ #if 1 sa = (void *)qalloc(QFAST, BUFFER_SIZE); #else sa = (void *)malloc(BUFFER_SIZE); #endif sb = (void *)&static_buffer[0]; } return sa; } void blas_memory_free(void *free_area){ return; } OpenBLAS-0.2.20/driver/others/openblas_env.c000066400000000000000000000062151313527062700205660ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2011-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" static int openblas_env_verbose=0; static unsigned int openblas_env_thread_timeout=0; static int openblas_env_block_factor=0; static int openblas_env_openblas_num_threads=0; static int openblas_env_goto_num_threads=0; static int openblas_env_omp_num_threads=0; int openblas_verbose() { return openblas_env_verbose;} unsigned int openblas_thread_timeout() { return openblas_env_thread_timeout;} int openblas_block_factor() { return openblas_env_block_factor;} int openblas_num_threads_env() { return openblas_env_openblas_num_threads;} int openblas_goto_num_threads_env() { return openblas_env_goto_num_threads;} int openblas_omp_num_threads_env() { return openblas_env_omp_num_threads;} void openblas_read_env() { int ret=0; env_var_t p; if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p); if(ret<0) ret=0; openblas_env_verbose=ret; ret=0; if (readenv(p,"OPENBLAS_BLOCK_FACTOR")) ret = atoi(p); if(ret<0) ret=0; openblas_env_block_factor=ret; ret=0; if (readenv(p,"OPENBLAS_THREAD_TIMEOUT")) ret = atoi(p); if(ret<0) ret=0; openblas_env_thread_timeout=(unsigned int)ret; ret=0; if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p); if(ret<0) ret=0; openblas_env_openblas_num_threads=ret; ret=0; if (readenv(p,"GOTO_NUM_THREADS")) ret = atoi(p); if(ret<0) ret=0; openblas_env_goto_num_threads=ret; ret=0; if (readenv(p,"OMP_NUM_THREADS")) ret = atoi(p); if(ret<0) ret=0; openblas_env_omp_num_threads=ret; } OpenBLAS-0.2.20/driver/others/openblas_error_handle.c000066400000000000000000000036551313527062700224470ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" extern int openblas_verbose(); void openblas_warning(int verbose, const char * msg) { int current_verbose; current_verbose=openblas_verbose(); if(current_verbose >= verbose){ fprintf(stderr, "%s", msg); } } OpenBLAS-0.2.20/driver/others/openblas_get_config.c000066400000000000000000000047071313527062700221060ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include "common.h" #include static char* openblas_config_str="" #ifdef USE64BITINT "USE64BITINT " #endif #ifdef NO_CBLAS "NO_CBLAS " #endif #ifdef NO_LAPACK "NO_LAPACK " #endif #ifdef NO_LAPACKE "NO_LAPACKE " #endif #ifdef DYNAMIC_ARCH "DYNAMIC_ARCH " #endif #ifdef NO_AFFINITY "NO_AFFINITY " #endif #ifndef DYNAMIC_ARCH CHAR_CORENAME #endif ; #ifdef DYNAMIC_ARCH char *gotoblas_corename(); static char tmp_config_str[256]; #endif char* CNAME() { #ifndef DYNAMIC_ARCH return openblas_config_str; #else strcpy(tmp_config_str, openblas_config_str); strcat(tmp_config_str, gotoblas_corename()); return tmp_config_str; #endif } char* openblas_get_corename() { #ifndef DYNAMIC_ARCH return CHAR_CORENAME; #else return gotoblas_corename(); #endif } OpenBLAS-0.2.20/driver/others/openblas_get_num_procs.c000066400000000000000000000034711313527062700226430ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include "common.h" extern int openblas_get_num_procs(void); int openblas_get_num_procs_(void) { return openblas_get_num_procs(); } OpenBLAS-0.2.20/driver/others/openblas_get_num_threads.c000066400000000000000000000034771313527062700231550ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
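The config and warning helpers above are small enough that a usage sketch fits in a few lines. openblas_get_config() and openblas_get_corename() are the query functions listed under @misc_no_underscore_objs in exports/gensymbol further down, while openblas_warning() is the internal helper that honours OPENBLAS_VERBOSE. The main() below is an illustration under those assumptions, not OpenBLAS code.

#include <stdio.h>

/* Exported configuration queries defined in openblas_get_config.c above. */
extern char *openblas_get_config(void);
extern char *openblas_get_corename(void);
/* Internal helper from openblas_error_handle.c above. */
extern void openblas_warning(int verbose, const char *msg);

int main(void)
{
    /* The config string concatenates build flags such as USE64BITINT,
       DYNAMIC_ARCH or NO_AFFINITY, followed by the core name.          */
    printf("config : %s\n", openblas_get_config());
    printf("core   : %s\n", openblas_get_corename());

    /* Printed only when OPENBLAS_VERBOSE is at least 1. */
    openblas_warning(1, "example message routed through openblas_warning()\n");
    return 0;
}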
**********************************************************************************/ #include "common.h" extern int openblas_get_num_threads(void); int openblas_get_num_threads_(void) { return openblas_get_num_threads(); } OpenBLAS-0.2.20/driver/others/openblas_get_parallel.c000066400000000000000000000040201313527062700224210ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2013 Martin Koehler, grisuthedragon@users.github.com All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include "common.h" #if defined(USE_OPENMP) static int parallel = 2 ; #elif defined(SMP_SERVER) static int parallel = 1; #else static int parallel = 0; #endif #ifdef NEEDBUNDERSCORE int CNAME() { return parallel; } int NAME() { return parallel; } #else //The CNAME and NAME are the same. int NAME() { return parallel; } #endif OpenBLAS-0.2.20/driver/others/openblas_set_num_threads.c000066400000000000000000000037701313527062700231650ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
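A hedged sketch of a caller interpreting the value returned by openblas_get_parallel(); the 0/1/2 mapping follows directly from the preprocessor chain above (plain build, SMP_SERVER pthreads build, USE_OPENMP build).

#include <stdio.h>

/* Defined in openblas_get_parallel.c above:
   0 = sequential build, 1 = OpenBLAS pthreads server, 2 = OpenMP build. */
extern int openblas_get_parallel(void);

int main(void)
{
    switch (openblas_get_parallel()) {
    case 0:  puts("sequential OpenBLAS build");    break;
    case 1:  puts("pthreads (SMP_SERVER) build");  break;
    case 2:  puts("OpenMP build");                 break;
    default: puts("unexpected value");             break;
    }
    return 0;
}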
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include "common.h" #ifdef SMP_SERVER extern void openblas_set_num_threads(int num_threads) ; void openblas_set_num_threads_(int* num_threads){ openblas_set_num_threads(*num_threads); } #else //Single thread void openblas_set_num_threads(int num_threads) { } void openblas_set_num_threads_(int* num_threads){ } #endif OpenBLAS-0.2.20/driver/others/parameter.c000066400000000000000000000447751313527062700201100ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
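A short sketch of the calling convention implied by the wrapper above: the underscore-suffixed entry point is what a Fortran compiler emits a call to and takes its argument by reference, while C code calls the plain-named function by value. The helper name is illustrative only, and the call has an effect only in an SMP build (the single-threaded stubs above are empty).

extern void openblas_set_num_threads(int num_threads);
extern void openblas_set_num_threads_(int *num_threads);

static void set_threads_both_ways(void)   /* illustrative, not in OpenBLAS */
{
    int n = 4;
    openblas_set_num_threads(n);    /* C interface, argument by value       */
    openblas_set_num_threads_(&n);  /* Fortran interface, argument by reference */
}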
*/ /*********************************************************************/ #include #include #include "common.h" extern int openblas_block_factor(); int get_L2_size(void); #define DEFAULT_GEMM_P 128 #define DEFAULT_GEMM_Q 128 #define DEFAULT_GEMM_R 128 #define DEFAULT_GEMM_OFFSET_A 0 #define DEFAULT_GEMM_OFFSET_B 0 /* Global Parameter */ #if GEMM_OFFSET_A == gemm_offset_a BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A; #else BLASLONG gemm_offset_a = GEMM_OFFSET_A; #endif #if GEMM_OFFSET_B == gemm_offset_b BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B; #else BLASLONG gemm_offset_b = GEMM_OFFSET_B; #endif #if SGEMM_P == sgemm_p BLASLONG sgemm_p = DEFAULT_GEMM_P; #else BLASLONG sgemm_p = SGEMM_P; #endif #if DGEMM_P == dgemm_p BLASLONG dgemm_p = DEFAULT_GEMM_P; #else BLASLONG dgemm_p = DGEMM_P; #endif #if CGEMM_P == cgemm_p BLASLONG cgemm_p = DEFAULT_GEMM_P; #else BLASLONG cgemm_p = CGEMM_P; #endif #if ZGEMM_P == zgemm_p BLASLONG zgemm_p = DEFAULT_GEMM_P; #else BLASLONG zgemm_p = ZGEMM_P; #endif #if SGEMM_Q == sgemm_q BLASLONG sgemm_q = DEFAULT_GEMM_Q; #else BLASLONG sgemm_q = SGEMM_Q; #endif #if DGEMM_Q == dgemm_q BLASLONG dgemm_q = DEFAULT_GEMM_Q; #else BLASLONG dgemm_q = DGEMM_Q; #endif #if CGEMM_Q == cgemm_q BLASLONG cgemm_q = DEFAULT_GEMM_Q; #else BLASLONG cgemm_q = CGEMM_Q; #endif #if ZGEMM_Q == zgemm_q BLASLONG zgemm_q = DEFAULT_GEMM_Q; #else BLASLONG zgemm_q = ZGEMM_Q; #endif #if SGEMM_R == sgemm_r BLASLONG sgemm_r = DEFAULT_GEMM_R; #else BLASLONG sgemm_r = SGEMM_R; #endif #if DGEMM_R == dgemm_r BLASLONG dgemm_r = DEFAULT_GEMM_R; #else BLASLONG dgemm_r = DGEMM_R; #endif #if CGEMM_R == cgemm_r BLASLONG cgemm_r = DEFAULT_GEMM_R; #else BLASLONG cgemm_r = CGEMM_R; #endif #if ZGEMM_R == zgemm_r BLASLONG zgemm_r = DEFAULT_GEMM_R; #else BLASLONG zgemm_r = ZGEMM_R; #endif #if defined(EXPRECISION) || defined(QUAD_PRECISION) #if QGEMM_P == qgemm_p BLASLONG qgemm_p = DEFAULT_GEMM_P; #else BLASLONG qgemm_p = QGEMM_P; #endif #if XGEMM_P == xgemm_p BLASLONG xgemm_p = DEFAULT_GEMM_P; #else BLASLONG xgemm_p = XGEMM_P; #endif #if QGEMM_Q == qgemm_q BLASLONG qgemm_q = DEFAULT_GEMM_Q; #else BLASLONG qgemm_q = QGEMM_Q; #endif #if XGEMM_Q == xgemm_q BLASLONG xgemm_q = DEFAULT_GEMM_Q; #else BLASLONG xgemm_q = XGEMM_Q; #endif #if QGEMM_R == qgemm_r BLASLONG qgemm_r = DEFAULT_GEMM_R; #else BLASLONG qgemm_r = QGEMM_R; #endif #if XGEMM_R == xgemm_r BLASLONG xgemm_r = DEFAULT_GEMM_R; #else BLASLONG xgemm_r = XGEMM_R; #endif #endif #if defined(ARCH_X86) || defined(ARCH_X86_64) int get_L2_size(void){ int eax, ebx, ecx, edx; #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); return BITMASK(ecx, 16, 0xffff); #else int info[15]; int i; cpuid(2, &eax, &ebx, &ecx, &edx); info[ 0] = BITMASK(eax, 8, 0xff); info[ 1] = BITMASK(eax, 16, 0xff); info[ 2] = BITMASK(eax, 24, 0xff); info[ 3] = BITMASK(ebx, 0, 0xff); info[ 4] = BITMASK(ebx, 8, 0xff); info[ 5] = BITMASK(ebx, 16, 0xff); info[ 6] = BITMASK(ebx, 24, 0xff); info[ 7] = BITMASK(ecx, 0, 0xff); info[ 8] = BITMASK(ecx, 8, 0xff); info[ 9] = BITMASK(ecx, 16, 0xff); info[10] = BITMASK(ecx, 24, 0xff); info[11] = BITMASK(edx, 0, 0xff); info[12] = BITMASK(edx, 8, 0xff); info[13] = BITMASK(edx, 16, 
0xff); info[14] = BITMASK(edx, 24, 0xff); for (i = 0; i < 15; i++){ switch (info[i]){ case 0x3b : case 0x41 : case 0x79 : return 128; break; case 0x3c : case 0x42 : case 0x7a : case 0x7e : case 0x82 : return 256; break; case 0x43 : case 0x7b : case 0x7f : case 0x83 : case 0x86 : return 512; break; case 0x44 : case 0x78 : case 0x7c : case 0x84 : case 0x87 : return 1024; break; case 0x45 : case 0x7d : case 0x85 : return 2048; case 0x49 : return 4096; break; } } /* Never reached */ return 0; #endif } void blas_set_parameter(void){ int factor; #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) int size = 16; #else int size = get_L2_size(); #endif #if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) size >>= 7; #if defined(CORE_BANIAS) && (HAVE_HIT > 1) sgemm_p = 64 / HAVE_HIT * size; dgemm_p = 32 / HAVE_HIT * size; cgemm_p = 32 / HAVE_HIT * size; zgemm_p = 16 / HAVE_HIT * size; #ifdef EXPRECISION qgemm_p = 16 / HAVE_HIT * size; xgemm_p = 8 / HAVE_HIT * size; #endif #ifdef QUAD_PRECISION qgemm_p = 8 / HAVE_HIT * size; xgemm_p = 4 / HAVE_HIT * size; #endif #else sgemm_p = 64 * size; dgemm_p = 32 * size; cgemm_p = 32 * size; zgemm_p = 16 * size; #ifdef EXPRECISION qgemm_p = 16 * size; xgemm_p = 8 * size; #endif #ifdef QUAD_PRECISION qgemm_p = 8 * size; xgemm_p = 4 * size; #endif #endif #endif #if defined(CORE_NORTHWOOD) size >>= 7; #ifdef ALLOC_HUGETLB sgemm_p = 128 * size; dgemm_p = 64 * size; cgemm_p = 64 * size; zgemm_p = 32 * size; #ifdef EXPRECISION qgemm_p = 32 * size; xgemm_p = 16 * size; #endif #ifdef QUAD_PRECISION qgemm_p = 16 * size; xgemm_p = 8 * size; #endif #else sgemm_p = 96 * size; dgemm_p = 48 * size; cgemm_p = 48 * size; zgemm_p = 24 * size; #ifdef EXPRECISION qgemm_p = 24 * size; xgemm_p = 12 * size; #endif #ifdef QUAD_PRECISION qgemm_p = 12 * size; xgemm_p = 6 * size; #endif #endif #endif #if defined(CORE_CORE2) size >>= 9; sgemm_p = 92 * size; dgemm_p = 46 * size; cgemm_p = 46 * size; zgemm_p = 23 * size; #ifdef EXPRECISION qgemm_p = 23 * size; xgemm_p = 11 * size; #endif #ifdef QUAD_PRECISION qgemm_p = 11 * size; xgemm_p = 5 * size; #endif #endif #if defined(PENRYN) size >>= 9; sgemm_p = 1024; dgemm_p = 512; cgemm_p = 512; zgemm_p = 256; #ifdef EXPRECISION qgemm_p = 256; xgemm_p = 128; #endif #ifdef QUAD_PRECISION qgemm_p = 21 * size + 4; xgemm_p = 10 * size + 2; #endif #endif #if defined(DUNNINGTON) size >>= 9; sgemm_p = 384; dgemm_p = 384; cgemm_p = 384; zgemm_p = 384; #ifdef EXPRECISION qgemm_p = 384; xgemm_p = 384; #endif #ifdef QUAD_PRECISION qgemm_p = 21 * size + 4; xgemm_p = 10 * size + 2; #endif #endif #if defined(NEHALEM) sgemm_p = 1024; dgemm_p = 512; cgemm_p = 512; zgemm_p = 256; #ifdef EXPRECISION qgemm_p = 256; xgemm_p = 128; #endif #endif #if defined(SANDYBRIDGE) sgemm_p = 1024; dgemm_p = 512; cgemm_p = 512; zgemm_p = 256; #ifdef EXPRECISION qgemm_p = 256; xgemm_p = 128; #endif #endif #if defined(CORE_PRESCOTT) || defined(GENERIC) size >>= 6; if (size > 16) size = 16; sgemm_p = 56 * size; dgemm_p = 28 * size; cgemm_p = 28 * size; zgemm_p = 14 * size; #ifdef EXPRECISION qgemm_p = 14 * size; xgemm_p = 7 * size; #endif #ifdef QUAD_PRECISION qgemm_p = 7 * size; xgemm_p = 3 * size; #endif #endif #if defined(CORE_OPTERON) sgemm_p = 224 + 14 * (size >> 5); dgemm_p = 112 + 14 * (size >> 6); cgemm_p = 116 + 14 * (size >> 6); zgemm_p = 58 + 14 * (size >> 7); #ifdef EXPRECISION qgemm_p = 58 + 14 * (size >> 7); xgemm_p = 29 + 14 * 
(size >> 8); #endif #ifdef QUAD_PRECISION qgemm_p = 29 + 14 * (size >> 8); xgemm_p = 15 + 14 * (size >> 9); #endif #endif #if defined(ATOM) size >>= 8; sgemm_p = 256; dgemm_p = 128; cgemm_p = 128; zgemm_p = 64; #ifdef EXPRECISION qgemm_p = 64; xgemm_p = 32; #endif #ifdef QUAD_PRECISION qgemm_p = 32; xgemm_p = 16; #endif #endif #if defined(CORE_BARCELONA) || defined(CORE_BOBCAT) size >>= 8; sgemm_p = 232 * size; dgemm_p = 116 * size; cgemm_p = 116 * size; zgemm_p = 58 * size; #ifdef EXPRECISION qgemm_p = 58 * size; xgemm_p = 26 * size; #endif #ifdef QUAD_PRECISION qgemm_p = 26 * size; xgemm_p = 13 * size; #endif #endif factor=openblas_block_factor(); if (factor>0) { if (factor < 10) factor = 10; if (factor > 200) factor = 200; sgemm_p = ((long)((double)sgemm_p * (double)factor * 1.e-2)) & ~7L; dgemm_p = ((long)((double)dgemm_p * (double)factor * 1.e-2)) & ~7L; cgemm_p = ((long)((double)cgemm_p * (double)factor * 1.e-2)) & ~7L; zgemm_p = ((long)((double)zgemm_p * (double)factor * 1.e-2)) & ~7L; #ifdef EXPRECISION qgemm_p = ((long)((double)qgemm_p * (double)factor * 1.e-2)) & ~7L; xgemm_p = ((long)((double)xgemm_p * (double)factor * 1.e-2)) & ~7L; #endif } if (sgemm_p == 0) sgemm_p = 64; if (dgemm_p == 0) dgemm_p = 64; if (cgemm_p == 0) cgemm_p = 64; if (zgemm_p == 0) zgemm_p = 64; #ifdef EXPRECISION if (qgemm_p == 0) qgemm_p = 64; if (xgemm_p == 0) xgemm_p = 64; #endif #ifdef QUAD_PRECISION if (qgemm_p == 0) qgemm_p = 64; if (xgemm_p == 0) xgemm_p = 64; #endif sgemm_p = ((sgemm_p + SGEMM_UNROLL_M - 1)/SGEMM_UNROLL_M) * SGEMM_UNROLL_M; dgemm_p = ((dgemm_p + DGEMM_UNROLL_M - 1)/DGEMM_UNROLL_M) * DGEMM_UNROLL_M; cgemm_p = ((cgemm_p + CGEMM_UNROLL_M - 1)/CGEMM_UNROLL_M) * CGEMM_UNROLL_M; zgemm_p = ((zgemm_p + ZGEMM_UNROLL_M - 1)/ZGEMM_UNROLL_M) * ZGEMM_UNROLL_M; #ifdef QUAD_PRECISION qgemm_p = ((qgemm_p + QGEMM_UNROLL_M - 1)/QGEMM_UNROLL_M) * QGEMM_UNROLL_M; xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M; #endif sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15; #if defined(EXPRECISION) || defined(QUAD_PRECISION) qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15; xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15; #endif #if 0 fprintf(stderr, "SGEMM ... %3d, %3d, %3d\n", SGEMM_P, SGEMM_Q, SGEMM_R); fprintf(stderr, "DGEMM ... %3d, %3d, %3d\n", DGEMM_P, DGEMM_Q, DGEMM_R); fprintf(stderr, "CGEMM ... %3d, %3d, %3d\n", CGEMM_P, CGEMM_Q, CGEMM_R); fprintf(stderr, "ZGEMM ... 
%3d, %3d, %3d\n", ZGEMM_P, ZGEMM_Q, ZGEMM_R); #endif return; } #if 0 int get_current_cpu_info(void){ int nlprocs, ncores, cmplegacy; int htt = 0; int apicid = 0; #if defined(CORE_PRESCOTT) || defined(CORE_OPTERON) int eax, ebx, ecx, edx; cpuid(1, &eax, &ebx, &ecx, &edx); nlprocs = BITMASK(ebx, 16, 0xff); apicid = BITMASK(ebx, 24, 0xff); htt = BITMASK(edx, 28, 0x01); #endif #if defined(CORE_PRESCOTT) cpuid(4, &eax, &ebx, &ecx, &edx); ncores = BITMASK(eax, 26, 0x3f); if (htt == 0) nlprocs = 0; #endif #if defined(CORE_OPTERON) cpuid(0x80000008, &eax, &ebx, &ecx, &edx); ncores = BITMASK(ecx, 0, 0xff); cpuid(0x80000001, &eax, &ebx, &ecx, &edx); cmplegacy = BITMASK(ecx, 1, 0x01); if (htt == 0) { nlprocs = 0; ncores = 0; cmplegacy = 0; } #endif ncores ++; fprintf(stderr, "APICID = %d Number of core = %d\n", apicid, ncores); return 0; } #endif #endif #if defined(ARCH_IA64) static inline BLASULONG cpuid(BLASULONG regnum){ BLASULONG value; #ifndef __ECC asm ("mov %0=cpuid[%r1]" : "=r"(value) : "rO"(regnum)); #else value = __getIndReg(_IA64_REG_INDR_CPUID, regnum); #endif return value; } #if 1 void blas_set_parameter(void){ BLASULONG cpuid3, size; cpuid3 = cpuid(3); size = BITMASK(cpuid3, 16, 0xff); sgemm_p = 192 * (size + 1); dgemm_p = 96 * (size + 1); cgemm_p = 96 * (size + 1); zgemm_p = 48 * (size + 1); #ifdef EXPRECISION qgemm_p = 64 * (size + 1); xgemm_p = 32 * (size + 1); #endif #ifdef QUAD_PRECISION qgemm_p = 32 * (size + 1); xgemm_p = 16 * (size + 1); #endif sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15; #if defined(EXPRECISION) || defined(QUAD_PRECISION) qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15; xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15; #endif return; } #else #define IA64_SYS_NAME "/sys/devices/system/cpu/cpu0/cache/index3/size" #define IA64_PROC_NAME "/proc/pal/cpu0/cache_info" void blas_set_parameter(void){ BLASULONG cpuid3; int size = 0; #if 1 char buffer[128]; FILE *infile; if ((infile = fopen(IA64_SYS_NAME, "r")) != NULL) { fgets(buffer, sizeof(buffer), infile); fclose(infile); size = atoi(buffer) / 1536; } if (size <= 0) { if ((infile = fopen(IA64_PROC_NAME, "r")) != NULL) { while(fgets(buffer, sizeof(buffer), infile) != NULL) { if ((!strncmp("Data/Instruction Cache level 3", buffer, 30))) break; } fgets(buffer, sizeof(buffer), infile); fclose(infile); *strstr(buffer, "bytes") = (char)NULL; size = atoi(strchr(buffer, ':') + 1) / 1572864; } } #endif /* The last resort */ if (size <= 0) { cpuid3 = cpuid(3); size = BITMASK(cpuid3, 16, 0xff) + 1; } sgemm_p = 320 * size; dgemm_p = 160 * size; cgemm_p = 160 * size; zgemm_p = 80 * size; #ifdef EXPRECISION qgemm_p = 80 * size; xgemm_p = 40 * size; #endif sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = 
(((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15; #ifdef EXPRECISION qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15; xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15; #endif return; } #endif #endif #if defined(ARCH_MIPS64) void blas_set_parameter(void){ #if defined(LOONGSON3A) #ifdef SMP if(blas_num_threads == 1){ #endif //single thread dgemm_r = 1024; #ifdef SMP }else{ //multi thread dgemm_r = 200; } #endif #endif #if defined(LOONGSON3B) #ifdef SMP if(blas_num_threads == 1 || blas_num_threads == 2){ #endif //single thread dgemm_r = 640; #ifdef SMP }else{ //multi thread dgemm_r = 160; } #endif #endif } #endif #if defined(ARCH_ARM64) #if defined(VULCAN) || defined(THUNDERX2T99) unsigned long dgemm_prefetch_size_a; unsigned long dgemm_prefetch_size_b; unsigned long dgemm_prefetch_size_c; #endif void blas_set_parameter(void) { #if defined(VULCAN) || defined(THUNDERX2T99) dgemm_p = 160; dgemm_q = 128; dgemm_r = 4096; sgemm_p = 128; sgemm_q = 352; sgemm_r = 4096; cgemm_p = 128; cgemm_q = 224; cgemm_r = 4096; zgemm_p = 128; zgemm_q = 112; zgemm_r = 4096; dgemm_prefetch_size_a = 3584; dgemm_prefetch_size_b = 512; dgemm_prefetch_size_c = 128; #endif } #endif OpenBLAS-0.2.20/driver/others/profile.c000066400000000000000000000124171313527062700175540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
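The OPENBLAS_BLOCK_FACTOR handling inside blas_set_parameter() above compresses several steps into one expression. The self-contained example below replays that arithmetic with made-up numbers (sgemm_p = 1024, factor = 150, unroll width 8) so the clamping, percentage scaling, masking to a multiple of 8 and rounding up to the unroll width can be checked by hand; the concrete values are assumptions for the example only.

#include <stdio.h>

int main(void)
{
    long sgemm_p = 1024;       /* e.g. the NEHALEM/SANDYBRIDGE default above   */
    int  factor  = 150;        /* OPENBLAS_BLOCK_FACTOR=150, i.e. scale by 1.5 */
    const long UNROLL_M = 8;   /* unroll width assumed for this example        */

    if (factor < 10)  factor = 10;     /* clamp exactly as blas_set_parameter() does */
    if (factor > 200) factor = 200;

    /* Scale by factor percent and clear the low three bits (multiple of 8). */
    sgemm_p = ((long)((double)sgemm_p * (double)factor * 1.e-2)) & ~7L;

    /* Round up to a multiple of the GEMM unroll width. */
    sgemm_p = ((sgemm_p + UNROLL_M - 1) / UNROLL_M) * UNROLL_M;

    printf("scaled sgemm_p = %ld\n", sgemm_p);   /* prints 1536 for these inputs */
    return 0;
}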
*/ /*********************************************************************/ #include "common.h" #include #include #define USE_FUNCTABLE #include "../../interface/functable.h" func_profile_t function_profile_table[MAX_PROF_TABLE]; int gotoblas_profile = 1; static struct sigaction sa, ig; void gotoblas_profile_quit(void) { int i; unsigned long long calls, fops, cycles, tcycles, area; sigaction(SIGPROF, &ig, NULL); calls = 0; fops = 0; cycles = 0; tcycles = 0; area = 0; for (i = 0; i < MAX_PROF_TABLE; i ++) { if (function_profile_table[i].calls) { calls += function_profile_table[i].calls; cycles += function_profile_table[i].cycles; tcycles += function_profile_table[i].tcycles; area += function_profile_table[i].area; fops += function_profile_table[i].fops; } } if (cycles > 0) { fprintf(stderr, "\n\t====== BLAS Profiling Result =======\n\n"); fprintf(stderr, " Function No. of Calls Time Consumption Efficiency Bytes/cycle Wall Time(Cycles)\n"); for (i = 0; i < MAX_PROF_TABLE; i ++) { if (function_profile_table[i].calls) { #ifndef OS_WINDOWS fprintf(stderr, "%-12s : %10Ld %8.2f%% %10.3f%% %8.2f %Ld\n", #else fprintf(stderr, "%-12s : %10lld %8.2f%% %10.3f%% %8.2f %lld\n", #endif func_table[i], function_profile_table[i].calls, (double)function_profile_table[i].cycles / (double)cycles * 100., (double)function_profile_table[i].fops / (double)function_profile_table[i].tcycles * 100., (double)function_profile_table[i].area / (double)function_profile_table[i].cycles, function_profile_table[i].cycles ); } } fprintf(stderr, " --------------------------------------------------------------------\n"); #ifndef OS_WINDOWS fprintf(stderr, "%-12s : %10Ld %10.3f%% %8.2f\n", #else fprintf(stderr, "%-12s : %10lld %10.3f%% %8.2f\n", #endif "Total", calls, (double)fops / (double)tcycles * 100., (double)area / (double)cycles); } sigaction(SIGPROF, &sa, NULL); } void gotoblas_profile_clear(void) { int i; for (i = 0; i < MAX_PROF_TABLE; i ++) { function_profile_table[i].calls = 0; function_profile_table[i].cycles = 0; function_profile_table[i].tcycles = 0; function_profile_table[i].area = 0; function_profile_table[i].fops = 0; } } void gotoblas_profile_init(void) { gotoblas_profile_clear(); bzero(&sa, sizeof(struct sigaction)); sa.sa_handler = (void *)gotoblas_profile_quit; sa.sa_flags = SA_NODEFER | SA_RESETHAND; bzero(&ig, sizeof(struct sigaction)); ig.sa_handler = SIG_IGN; ig.sa_flags |= SA_NODEFER | SA_RESETHAND; sigaction(SIGPROF, &sa, NULL); } OpenBLAS-0.2.20/driver/others/xerbla.c000066400000000000000000000065351313527062700173750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
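A sketch of the profiling lifecycle defined in profile.c above, assuming a build in which the per-function instrumentation that fills function_profile_table is compiled in; the wrapper function is illustrative only.

/* Profiling hooks defined in profile.c above. */
extern void gotoblas_profile_init(void);
extern void gotoblas_profile_clear(void);
extern void gotoblas_profile_quit(void);

static void profile_a_region(void)   /* illustrative, not part of OpenBLAS */
{
    gotoblas_profile_init();   /* zero the table and install the SIGPROF handler */

    /* ... run the BLAS calls to be measured here ... */

    gotoblas_profile_quit();   /* print the per-function summary to stderr;
                                  the same routine runs when SIGPROF is delivered */
}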
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__)) #include #undef printf #define printf _cprintf #endif #ifdef INTERFACE64 #define MSGFMT " ** On entry to %6s parameter number %2ld had an illegal value\n" #else #define MSGFMT " ** On entry to %6s parameter number %2d had an illegal value\n" #endif #ifdef __ELF__ int __xerbla(char *message, blasint *info, blasint length){ printf(MSGFMT, message, *info); return 0; } int BLASFUNC(xerbla)(char *, blasint *, blasint) __attribute__ ((weak, alias ("__xerbla"))); #else int BLASFUNC(xerbla)(char *message, blasint *info, blasint length){ printf(MSGFMT, message, *info); return 0; } #endif OpenBLAS-0.2.20/exports/000077500000000000000000000000001313527062700146505ustar00rootroot00000000000000OpenBLAS-0.2.20/exports/Makefile000066400000000000000000000174321313527062700163170ustar00rootroot00000000000000TOPDIR = .. 
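Because the ELF branch of xerbla.c above defines the BLAS error handler as a weak alias of __xerbla, an application can install its own strong definition to change the error behaviour. The sketch below assumes the common name mangling of one trailing underscore and a 32-bit blasint; both depend on how the library was configured.

#include <stdio.h>
#include <stdlib.h>

/* Application-side override of the BLAS error handler (sketch only). */
int xerbla_(char *name, int *info, int name_len)
{
    fprintf(stderr, "invalid argument %d passed to %.*s, aborting\n",
            *info, name_len, name);
    abort();
    return 0;   /* not reached */
}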
include ../Makefile.system ifndef EXPRECISION EXPRECISION = 0 endif ifndef NO_CBLAS NO_CBLAS = 0 endif ifndef NO_LAPACK NO_LAPACK = 0 endif ifndef NO_LAPACKE NO_LAPACKE = 0 endif ifndef NEED2UNDERSCORES NEED2UNDERSCORES=0 endif ifndef ONLY_CBLAS ONLY_CBLAS = 0 endif ifndef BUILD_LAPACK_DEPRECATED BUILD_LAPACK_DEPRECATED = 0 endif ifeq ($(OSNAME), WINNT) ifeq ($(F_COMPILER), GFORTRAN) ifndef ONLY_CBLAS EXTRALIB += -lgfortran endif endif ifeq ($(USE_OPENMP), 1) ifeq ($(C_COMPILER), GCC) EXTRALIB += -lgomp endif endif endif ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(F_COMPILER), GFORTRAN) ifndef ONLY_CBLAS EXTRALIB += -lgfortran endif endif endif all:: libs:: prof:: hpl:: libgoto_hpl.$(LIBSUFFIX) hpl_p:: libgoto_hpl_p.$(LIBSUFFIX) libgoto_hpl.$(LIBSUFFIX) : ../$(LIBNAME) rm -f $(@F) $(LD) -r $(LDFLAGS) -o goto.$(SUFFIX) --whole-archive $< --no-whole-archive $(AR) cq $(@F) goto.$(SUFFIX) $(RANLIB) libgoto_hpl.$(LIBSUFFIX) libgoto_hpl_p.$(LIBSUFFIX) : ../$(LIBNAME_P) rm -f $(@F) $(LD) -r $(LDFLAGS) -o goto.$(PSUFFIX) --whole-archive $< --no-whole-archive $(AR) cq $(@F) goto.$(PSUFFIX) $(RANLIB) libgoto_hpl_p.$(LIBSUFFIX) libgoto_hpl.dll : libgoto_hpl.$(LIBSUFFIX) dllinit.$(SUFFIX) libgoto_hpl.def $(DLLWRAP) -o $(@F) --def libgoto_hpl.def --entry _dllinit -s dllinit.$(SUFFIX) --dllname libgoto_hpl.dll libgoto_hpl.$(LIBSUFFIX) lib /machine:X64 /def:libgoto_hpl.def dyn : $(LIBDYNNAME) zip : dll zip $(LIBZIPNAME) $(LIBDLLNAME) $(LIBNAME) dll : ../$(LIBDLLNAME) # On Windows, we only generate a DLL without a version suffix. This is because # applications which link against the dynamic library reference a fixed DLL name # in their import table. By instead using a stable name it is possible to # upgrade between library versions, without needing to re-link an application. # For more details see: https://github.com/xianyi/OpenBLAS/issues/127. 
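The comment above explains why the Windows DLL keeps a fixed, unversioned file name; one consequence is that the library can also be loaded late by that stable name. The sketch below is an illustration under assumptions: "libopenblas.dll" is the default LIBDLLNAME and openblas_get_config is expected to be among the symbols emitted by gensymbol, but neither is guaranteed for a customised build.

#include <stdio.h>
#include <windows.h>

typedef char *(*openblas_get_config_t)(void);

int main(void)
{
    HMODULE h = LoadLibraryA("libopenblas.dll");   /* stable, unversioned name */
    if (h == NULL) {
        fprintf(stderr, "libopenblas.dll could not be loaded\n");
        return 1;
    }

    openblas_get_config_t get_config =
        (openblas_get_config_t)GetProcAddress(h, "openblas_get_config");
    if (get_config != NULL)
        printf("%s\n", get_config());

    FreeLibrary(h);
    return 0;
}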
../$(LIBDLLNAME) : ../$(LIBNAME) libopenblas.def dllinit.$(SUFFIX) $(RANLIB) ../$(LIBNAME) $(CC) $(CFLAGS) $(LDFLAGS) libopenblas.def dllinit.$(SUFFIX) \ -shared -o ../$(LIBDLLNAME) -Wl,--out-implib,../$(LIBDLLNAME).a \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB) libopenblas.def : gensymbol perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) libgoto_hpl.def : gensymbol perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) $(LIBDYNNAME) : ../$(LIBNAME) osx.def else ../$(LIBNAME).osx.renamed : ../$(LIBNAME) objconv.def $(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).osx.renamed $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def endif ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2)) #only build without Fortran $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) else $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) endif dllinit.$(SUFFIX) : dllinit.c $(CC) $(CFLAGS) -c -o $(@F) -s $< ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) so : ../$(LIBSONAME) ifeq ($(OSNAME), Android) INTERNALNAME = $(LIBPREFIX).so else INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION) endif ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) ../$(LIBSONAME) : ../$(LIBNAME) linktest.c else ../$(LIBNAME).renamed : ../$(LIBNAME) objcopy.def $(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed ../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c endif ifneq ($(C_COMPILER), LSB) $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive $< -Wl,--no-whole-archive \ -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. else #for LSB env LSBCC_SHAREDLIBS=gfortran $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive $< -Wl,--no-whole-archive \ -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) $(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. endif rm -f linktest endif #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD)) so : ../$(LIBSONAME) ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) ../$(LIBSONAME) : ../$(LIBNAME) linktest.c else ../$(LIBNAME).renamed : ../$(LIBNAME) objcopy.def $(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed ../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c endif $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive $< -Wl,--no-whole-archive \ $(FEXTRALIB) $(EXTRALIB) $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. 
rm -f linktest endif ifeq ($(OSNAME), OSF1) so : ../$(LIBSONAME) ../$(LIBSONAME) : $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) ../$(LIBNAME) endif ifeq ($(OSNAME), SunOS) so : ../$(LIBSONAME) $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(EXTRALIB) $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. rm -f linktest endif ifeq ($(OSNAME), AIX) ifeq ($(COMPILER_F77), xlf) goto32.$(SUFFIX) : ../$(LIBNAME) aix.def ld -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib -lxlf90 -lc -lm -lpthread goto64.$(SUFFIX) : ../$(LIBNAME) aix.def ld -b64 -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib/ppc64 -lxlf90 -lc -lm -lpthread else goto32.$(SUFFIX) : ../$(LIBNAME) aix.def ld -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib -lg2c -lc -lm goto64.$(SUFFIX) : ../$(LIBNAME) aix.def ld -b64 -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib/ppc64 -lg2c -lc -lm endif endif static : ../$(LIBNAME) $(LD) $(LDFLAGS) -r -o goto.$(SUFFIX) \ --whole-archive ../$(LIBNAME) --no-whole-archive rm -f ../$(LIBNAME) $(AR) -cq ../$(LIBNAME) goto.$(SUFFIX) rm -f goto.$(SUFFIX) osx.def : gensymbol ../Makefile.system ../getarch.c perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) aix.def : gensymbol ../Makefile.system ../getarch.c perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) objcopy.def : gensymbol ../Makefile.system ../getarch.c perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) objconv.def : gensymbol ../Makefile.system ../getarch.c perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) test : linktest.c $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. 
rm -f linktest linktest.c : gensymbol ../Makefile.system ../getarch.c perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > linktest.c clean :: @rm -f *.def *.dylib __.SYMDEF* *.renamed include ../Makefile.tail OpenBLAS-0.2.20/exports/check_objs.sh000077500000000000000000000015101313527062700172760ustar00rootroot00000000000000#!/bin/bash while read OBJ; do if echo "$OBJ"|grep "_$" >/dev/null then [ "$OBJ" = "caxpyc_" ] && continue [ "$OBJ" = "zaxpyc_" ] && continue [ "$OBJ" = "blas_thread_shutdown_" ] && continue O1=$(echo "$OBJ"|sed -e 's/_$//' ) if grep -w "$O1" exports/gensymbol >/dev/null then true else echo "$O1" fi continue fi if echo "$OBJ"|grep "^cblas" >/dev/null then if grep -w "$OBJ" exports/gensymbol >/dev/null then true else echo "$OBJ" fi continue fi if echo "$OBJ"|grep "^LAPACKE" >/dev/null then if grep -w "$OBJ" exports/gensymbol >/dev/null then true else echo "$OBJ" fi continue fi if echo "$OBJ"|grep "^lapack" >/dev/null then if grep -w "$OBJ" exports/gensymbol >/dev/null then true else echo "$OBJ" fi fi done OpenBLAS-0.2.20/exports/dllinit.c000066400000000000000000000056151313527062700164620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
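The linktest.c generated by the gensymbol rule above is compiled and linked purely to verify that the expected symbols resolve against the freshly built shared library. The standalone sketch below performs a similar spot check at run time with dlopen/dlsym; the library path and the three symbol names are examples only, and on older glibc the program must be linked with -ldl.

#include <dlfcn.h>
#include <stdio.h>

int main(void)
{
    void *h = dlopen("./libopenblas.so", RTLD_NOW);
    if (h == NULL) {
        fprintf(stderr, "dlopen failed: %s\n", dlerror());
        return 1;
    }

    const char *symbols[] = { "dgemm_", "cblas_dgemm", "openblas_get_config" };
    for (size_t i = 0; i < sizeof(symbols) / sizeof(symbols[0]); i++)
        printf("%-20s %s\n", symbols[i],
               dlsym(h, symbols[i]) ? "exported" : "MISSING");

    dlclose(h);
    return 0;
}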
*/ /*********************************************************************/ #include "common.h" void gotoblas_init(void); void gotoblas_quit(void); BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) { if (reason == DLL_PROCESS_ATTACH) { gotoblas_init(); } if (reason == DLL_PROCESS_DETACH) { gotoblas_quit(); } return TRUE; } OpenBLAS-0.2.20/exports/gensymbol000066400000000000000000002677651313527062700166210ustar00rootroot00000000000000#!/usr/bin/perl # Changelog # 2017/09/03 staticfloat # Added zsymv and csymv into @lapackobjs2 so they are properly renamed # # 2017/07/01 Saar # removed zsymv_ and csymv_ from @blasobs, because these functions # are now in lapack-3.7.0 # added blas_thread_shutdown_ # added Cblas_cgemm3m and Cblas_zgemm3m # added somatcopy_, simatcopy_ ... # added new functions from lapack-3.7.0 # added LAPACKE deprecated objs from lapack-3.7.0 # # 2017/08/01 Saar # removed blas_thread_shutdown_ # @blasobjs = ( caxpy,ccopy,cdotc,cdotu,cgbmv,cgemm,cgemv,cgerc,cgeru, chbmv,chemm,chemv,cher2,cher2k,cher,cherk, chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap, csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm, ctrsv, damax,damin,dasum,daxpy,dcabs1,dcopy,ddot,dgbmv,dgemm, dgemv,dger,dmax,dmin,dnrm2,drot,drotg,drotm,drotmg,dsbmv, dscal,dsdot,dspmv,dspr2, dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv, dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv,dzamax,dzamin,dzasum,dznrm2, icamax,icamin,idamax,idamin,idmax,idmin,isamax,isamin,ismax,ismin, izamax,izamin,lsame,samax,samin,sasum,saxpy,scabs1,scamax, scamin,scasum,scnrm2,scopy,sdot,sdsdot,sgbmv,sgemm,sgemv,sger, smax,smin,snrm2, srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, strmm,strmv,strsm,strsv,zaxpy,zcopy,zdotc,zdotu,zdrot, zdscal,zgbmv,zgemm,zgemv,zgerc,zgeru, zhbmv,zhemm,zhemv,zher2,zher2k,zher,zherk,zhpmv,zhpr2, zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv, ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, xerbla, saxpby,daxpby,caxpby,zaxpby, sgeadd,dgeadd,cgeadd,zgeadd, somatcopy, simatcopy, domatcopy, dimatcopy, comatcopy, cimatcopy, zomatcopy, zimatcopy, ); @cblasobjs = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, cblas_cher, cblas_cherk, cblas_chpmv, cblas_chpr2, cblas_chpr, cblas_cscal, cblas_csscal, cblas_cswap, cblas_csymm, cblas_csyr2k, cblas_csyrk, cblas_ctbmv, cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot, cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2, cblas_drot, cblas_drotg, cblas_drotm, cblas_drotmg, cblas_dsbmv, cblas_dscal, cblas_dsdot, cblas_dspmv, cblas_dspr2, cblas_dspr, cblas_dswap, cblas_dsymm, cblas_dsymv, cblas_dsyr2, cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv, cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_dzasum, cblas_dznrm2, cblas_icamax, cblas_idamax, cblas_isamax, cblas_izamax, cblas_sasum, cblas_saxpy, cblas_scasum, cblas_scnrm2, cblas_scopy, cblas_sdot, cblas_sdsdot, cblas_sgbmv, cblas_sgemm, cblas_sgemv, cblas_sger, cblas_snrm2, cblas_srot, cblas_srotg, cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr, cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk, cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, 
cblas_strmm, cblas_strmv, cblas_strsm, cblas_strsv, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal, cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm, cblas_zhemv, cblas_zher2, cblas_zher2k, cblas_zher, cblas_zherk, cblas_zhpmv, cblas_zhpr2, cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk, cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm, cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub, cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby, cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy, cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy, cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd ); @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, qgemv,qger,qmax,qmin, qnrm2, qsbmv,qscal,qspmv,qspr2, qspr,qswap,qsymm,qsymv,qsyr2,qsyr2k,qsyr,qsyrk,qtbmv,qtbsv, qtpmv,qtpsv,qtrmm,qtrmv,qtrsm,qtrsv, qxamax,qxamin,qxasum,qxnrm2, xaxpy,xcopy,xdotc,xdotu, xqscal,xgbmv,xgemm,xgemv,xgerc,xgeru, xhbmv,xhemm,xhemv,xher2,xher2k,xher,xherk,xhpmv,xhpr2, xhpr,xscal,xswap,xsymm,xsyr2k,xsyrk,xtbmv, xtbsv,xtpmv,xtpsv,xtrmm,xtrmv,xtrsm,xtrsv, # qrot,qrotg,qrotm,qrotmg, # xdrot,xrotg, ); @gemm3mobjs = ( cgemm3m,zgemm3m ); @cblasgemm3mobjs = ( cblas_cgemm3m,cblas_zgemm3m ); #both underscore and no underscore @misc_common_objs = ( openblas_get_parallel, openblas_get_num_procs, openblas_set_num_threads, openblas_get_num_threads, ); @misc_no_underscore_objs = ( goto_set_num_threads, openblas_get_config, openblas_get_corename, ); @misc_underscore_objs = ( ); @lapackobjs = ( # These routines are provided by OpenBLAS. sgesv, dgesv, cgesv, zgesv, sgetf2, dgetf2, cgetf2, zgetf2, sgetrf, dgetrf, cgetrf, zgetrf, slaswp, dlaswp, claswp, zlaswp, sgetrs, dgetrs, cgetrs, zgetrs, slauu2, dlauu2, clauu2, zlauu2, slauum, dlauum, clauum, zlauum, spotf2, dpotf2, cpotf2, zpotf2, spotrf, dpotrf, cpotrf, zpotrf, strti2, dtrti2, ctrti2, ztrti2, strtri, dtrtri, ctrtri, ztrtri, spotri, dpotri, cpotri, zpotri, ); @lapackobjs2 = ( # These routines are provided by LAPACK (reference implementation). # # This list is prepared by copying all routines listed in # `lapack-3.4.1/SRC/Makefile` and replacing the '.o' suffix with a comma. # Thereafter the following routines should be removed: # - those provided by OpenBLAS (see @lapackobjs) # - extra precision routines (see @lapack_extendedprecision_objs) # Each of these have been marked individually with "already provided" or "excluded". # ALLAUX -- Auxiliary routines called from all precisions # already provided by @blasobjs: xerbla, lsame ilaenv, ieeeck, lsamen, iparmq, ilaprec, ilatrans, ilauplo, iladiag, ilaver, slamch, slamc3, # SCLAUX -- Auxiliary routines called from both REAL and COMPLEX. # excluded: second_$(TIMER) sbdsdc, sbdsqr, sdisna, slabad, slacpy, sladiv, slae2, slaebz, slaed0, slaed1, slaed2, slaed3, slaed4, slaed5, slaed6, slaed7, slaed8, slaed9, slaeda, slaev2, slagtf, slagts, slamrg, slanst, slapy2, slapy3, slarnv, slarra, slarrb, slarrc, slarrd, slarre, slarrf, slarrj, slarrk, slarrr, slaneg, slartg, slaruv, slas2, slascl, slasd0, slasd1, slasd2, slasd3, slasd4, slasd5, slasd6, slasd7, slasd8, slasda, slasdq, slasdt, slaset, slasq1, slasq2, slasq3, slasq4, slasq5, slasq6, slasr, slasrt, slassq, slasv2, spttrf, sstebz, sstedc, ssteqr, ssterf, slaisnan, sisnan, slartgp, slartgs, # DZLAUX -- Auxiliary routines called from both DOUBLE and COMPLEX*16. 
# excluded: dsecnd_$(TIMER) dbdsdc, dbdsqr, ddisna, dlabad, dlacpy, dladiv, dlae2, dlaebz, dlaed0, dlaed1, dlaed2, dlaed3, dlaed4, dlaed5, dlaed6, dlaed7, dlaed8, dlaed9, dlaeda, dlaev2, dlagtf, dlagts, dlamrg, dlanst, dlapy2, dlapy3, dlarnv, dlarra, dlarrb, dlarrc, dlarrd, dlarre, dlarrf, dlarrj, dlarrk, dlarrr, dlaneg, dlartg, dlaruv, dlas2, dlascl, dlasd0, dlasd1, dlasd2, dlasd3, dlasd4, dlasd5, dlasd6, dlasd7, dlasd8, dlasda, dlasdq, dlasdt, dlaset, dlasq1, dlasq2, dlasq3, dlasq4, dlasq5, dlasq6, dlasr, dlasrt, dlassq, dlasv2, dpttrf, dstebz, dstedc, dsteqr, dsterf, dlaisnan, disnan, dlartgp, dlartgs, dlamch, dlamc3, # SLASRC -- Single precision real LAPACK routines # already provided by @lapackobjs: # sgesv, sgetf2, slaswp, slauu2, slauum, spotf2, spotri, strti2, strtri sgbbrd, sgbcon, sgbequ, sgbrfs, sgbsv, sgbsvx, sgbtf2, sgbtrf, sgbtrs, sgebak, sgebal, sgebd2, sgebrd, sgecon, sgeequ, sgees, sgeesx, sgeev, sgeevx, sgehd2, sgehrd, sgelq2, sgelqf, sgels, sgelsd, sgelss, sgelsy, sgeql2, sgeqlf, sgeqp3, sgeqr2, sgeqr2p, sgeqrf, sgeqrfp, sgerfs, sgerq2, sgerqf, sgesc2, sgesdd, sgesvd, sgesvx, sgetc2, sgetri, sggbak, sggbal, sgges, sggesx, sggev, sggevx, sggglm, sgghrd, sgglse, sggqrf, sggrqf, sgtcon, sgtrfs, sgtsv, sgtsvx, sgttrf, sgttrs, sgtts2, shgeqz, shsein, shseqr, slabrd, slacon, slacn2, slaein, slaexc, slag2, slags2, slagtm, slagv2, slahqr, slahr2, slaic1, slaln2, slals0, slalsa, slalsd, slangb, slange, slangt, slanhs, slansb, slansp, slansy, slantb, slantp, slantr, slanv2, slapll, slapmt, slaqgb, slaqge, slaqp2, slaqps, slaqsb, slaqsp, slaqsy, slaqr0, slaqr1, slaqr2, slaqr3, slaqr4, slaqr5, slaqtr, slar1v, slar2v, ilaslr, ilaslc, slarf, slarfb, slarfg, slarfgp, slarft, slarfx, slargv, slarrv, slartv, slarz, slarzb, slarzt, slasy2, slasyf, slatbs, slatdf, slatps, slatrd, slatrs, slatrz, sopgtr, sopmtr, sorg2l, sorg2r, sorgbr, sorghr, sorgl2, sorglq, sorgql, sorgqr, sorgr2, sorgrq, sorgtr, sorm2l, sorm2r, sormbr, sormhr, sorml2, sormlq, sormql, sormqr, sormr2, sormr3, sormrq, sormrz, sormtr, spbcon, spbequ, spbrfs, spbstf, spbsv, spbsvx, spbtf2, spbtrf, spbtrs, spocon, spoequ, sporfs, sposv, sposvx, spstrf, spstf2, sppcon, sppequ, spprfs, sppsv, sppsvx, spptrf, spptri, spptrs, sptcon, spteqr, sptrfs, sptsv, sptsvx, spttrs, sptts2, srscl, ssbev, ssbevd, ssbevx, ssbgst, ssbgv, ssbgvd, ssbgvx, ssbtrd, sspcon, sspev, sspevd, sspevx, sspgst, sspgv, sspgvd, sspgvx, ssprfs, sspsv, sspsvx, ssptrd, ssptrf, ssptri, ssptrs, sstegr, sstein, sstev, sstevd, sstevr, sstevx, ssycon, ssyev, ssyevd, ssyevr, ssyevx, ssygs2, ssygst, ssygv, ssygvd, ssygvx, ssyrfs, ssysv, ssysvx, ssytd2, ssytf2, ssytrd, ssytrf, ssytri, ssytri2, ssytri2x, ssyswapr, ssytrs, ssytrs2, ssyconv, stbcon, stbrfs, stbtrs, stgevc, stgex2, stgexc, stgsen, stgsja, stgsna, stgsy2, stgsyl, stpcon, stprfs, stptri, stptrs, strcon, strevc, strexc, strrfs, strsen, strsna, strsyl, strtrs, stzrzf, sstemr, slansf, spftrf, spftri, spftrs, ssfrk, stfsm, stftri, stfttp, stfttr, stpttf, stpttr, strttf, strttp, sgejsv, sgesvj, sgsvj0, sgsvj1, sgeequb, ssyequb, spoequb, sgbequb, sbbcsd, slapmr, sorbdb, sorbdb1, sorbdb2, sorbdb3, sorbdb4, sorbdb5, sorbdb6, sorcsd, sorcsd2by1, sgeqrt, sgeqrt2, sgeqrt3, sgemqrt, stpqrt, stpqrt2, stpmqrt, stprfb, # DSLASRC -- Double-single mixed precision real routines called from # single, single-extra and double precision real LAPACK # routines (i.e. from SLASRC, SXLASRC, DLASRC). 
# # already provided by @lapackobjs: # sgetrs, spotrf, sgetrf spotrs, # CLASRC -- Single precision complex LAPACK routines # already provided by @blasobjs: # already provided by @lapackobjs: # cgesv, cgetf2, claswp, clauu2, clauum, cpotf2, cpotri, ctrti2, ctrtri cbdsqr, cgbbrd, cgbcon, cgbequ, cgbrfs, cgbsv, cgbsvx, cgbtf2, cgbtrf, cgbtrs, cgebak, cgebal, cgebd2, cgebrd, cgecon, cgeequ, cgees, cgeesx, cgeev, cgeevx, cgehd2, cgehrd, cgelq2, cgelqf, cgels, cgelsd, cgelss, cgelsy, cgeql2, cgeqlf, cgeqp3, cgeqr2, cgeqr2p, cgeqrf, cgeqrfp, cgerfs, cgerq2, cgerqf, cgesc2, cgesdd, cgesvd, cgesvx, cgetc2, cgetri, cggbak, cggbal, cgges, cggesx, cggev, cggevx, cggglm, cgghrd, cgglse, cggqrf, cggrqf, cgtcon, cgtrfs, cgtsv, cgtsvx, cgttrf, cgttrs, cgtts2, chbev, chbevd, chbevx, chbgst, chbgv, chbgvd, chbgvx, chbtrd, checon, cheev, cheevd, cheevr, cheevx, chegs2, chegst, chegv, chegvd, chegvx, cherfs, chesv, chesvx, chetd2, chetf2, chetrd, chetrf, chetri, chetri2, chetri2x, cheswapr, chetrs, chetrs2, chgeqz, chpcon, chpev, chpevd, chpevx, chpgst, chpgv, chpgvd, chpgvx, chprfs, chpsv, chpsvx, chptrd, chptrf, chptri, chptrs, chsein, chseqr, clabrd, clacgv, clacon, clacn2, clacp2, clacpy, clacrm, clacrt, cladiv, claed0, claed7, claed8, claein, claesy, claev2, clags2, clagtm, clahef, clahqr, clahr2, claic1, clals0, clalsa, clalsd, clangb, clange, clangt, clanhb, clanhe, clanhp, clanhs, clanht, clansb, clansp, clansy, clantb, clantp, clantr, clapll, clapmt, clarcm, claqgb, claqge, claqhb, claqhe, claqhp, claqp2, claqps, claqsb, claqr0, claqr1, claqr2, claqr3, claqr4, claqr5, claqsp, claqsy, clar1v, clar2v, ilaclr, ilaclc, clarf, clarfb, clarfg, clarft, clarfgp, clarfx, clargv, clarnv, clarrv, clartg, clartv, clarz, clarzb, clarzt, clascl, claset, clasr, classq, clasyf, clatbs, clatdf, clatps, clatrd, clatrs, clatrz, cpbcon, cpbequ, cpbrfs, cpbstf, cpbsv, cpbsvx, cpbtf2, cpbtrf, cpbtrs, cpocon, cpoequ, cporfs, cposv, cposvx, cpstrf, cpstf2, cppcon, cppequ, cpprfs, cppsv, cppsvx, cpptrf, cpptri, cpptrs, cptcon, cpteqr, cptrfs, cptsv, cptsvx, cpttrf, cpttrs, cptts2, crot, cspcon, cspmv, cspr, csprfs, cspsv, cspsvx, csptrf, csptri, csptrs, csrscl, cstedc, cstegr, cstein, csteqr, csycon, csymv, csyr, csyrfs, csysv, csysvx, csytf2, csytrf, csytri, csytri2, csytri2x, csyswapr, csytrs, csytrs2, csyconv, ctbcon, ctbrfs, ctbtrs, ctgevc, ctgex2, ctgexc, ctgsen, ctgsja, ctgsna, ctgsy2, ctgsyl, ctpcon, ctprfs, ctptri, ctptrs, ctrcon, ctrevc, ctrexc, ctrrfs, ctrsen, ctrsna, ctrsyl, ctrtrs, ctzrzf, cung2l, cung2r, cungbr, cunghr, cungl2, cunglq, cungql, cungqr, cungr2, cungrq, cungtr, cunm2l, cunm2r, cunmbr, cunmhr, cunml2, cunmlq, cunmql, cunmqr, cunmr2, cunmr3, cunmrq, cunmrz, cunmtr, cupgtr, cupmtr, icmax1, scsum1, cstemr, chfrk, ctfttp, clanhf, cpftrf, cpftri, cpftrs, ctfsm, ctftri, ctfttr, ctpttf, ctpttr, ctrttf, ctrttp, cgeequb, cgbequb, csyequb, cpoequb, cheequb, cbbcsd, clapmr, cunbdb, cunbdb1, cunbdb2, cunbdb3, cunbdb4, cunbdb5, cunbdb6, cuncsd, cuncsd2by1, cgeqrt, cgeqrt2, cgeqrt3, cgemqrt, ctpqrt, ctpqrt2, ctpmqrt, ctprfb, # ZCLASRC -- Double-single mixed precision complex routines called from # single, single-extra and double precision complex LAPACK # routines (i.e. from CLASRC, CXLASRC, ZLASRC). 
# # already provided by @lapackobjs: # cgetrs, cpotrf, cgetrf cpotrs, # DLASRC -- Double precision real LAPACK routines # already provided by @lapackobjs: # dgesv, dgetf2, dgetrs, dlaswp, dlauu2, dlauum, dpotf2, dpotrf, dpotri, # dtrti2, dtrtri dgbbrd, dgbcon, dgbequ, dgbrfs, dgbsv, dgbsvx, dgbtf2, dgbtrf, dgbtrs, dgebak, dgebal, dgebd2, dgebrd, dgecon, dgeequ, dgees, dgeesx, dgeev, dgeevx, dgehd2, dgehrd, dgelq2, dgelqf, dgels, dgelsd, dgelss, dgelsy, dgeql2, dgeqlf, dgeqp3, dgeqr2, dgeqr2p, dgeqrf, dgeqrfp, dgerfs, dgerq2, dgerqf, dgesc2, dgesdd, dgesvd, dgesvx, dgetc2, dgetri, dggbak, dggbal, dgges, dggesx, dggev, dggevx, dggglm, dgghrd, dgglse, dggqrf, dggrqf, dgtcon, dgtrfs, dgtsv, dgtsvx, dgttrf, dgttrs, dgtts2, dhgeqz, dhsein, dhseqr, dlabrd, dlacon, dlacn2, dlaein, dlaexc, dlag2, dlags2, dlagtm, dlagv2, dlahqr, dlahr2, dlaic1, dlaln2, dlals0, dlalsa, dlalsd, dlangb, dlange, dlangt, dlanhs, dlansb, dlansp, dlansy, dlantb, dlantp, dlantr, dlanv2, dlapll, dlapmt, dlaqgb, dlaqge, dlaqp2, dlaqps, dlaqsb, dlaqsp, dlaqsy, dlaqr0, dlaqr1, dlaqr2, dlaqr3, dlaqr4, dlaqr5, dlaqtr, dlar1v, dlar2v, iladlr, iladlc, dlarf, dlarfb, dlarfg, dlarfgp, dlarft, dlarfx, dlargv, dlarrv, dlartv, dlarz, dlarzb, dlarzt, dlasy2, dlasyf, dlatbs, dlatdf, dlatps, dlatrd, dlatrs, dlatrz, dopgtr, dopmtr, dorg2l, dorg2r, dorgbr, dorghr, dorgl2, dorglq, dorgql, dorgqr, dorgr2, dorgrq, dorgtr, dorm2l, dorm2r, dormbr, dormhr, dorml2, dormlq, dormql, dormqr, dormr2, dormr3, dormrq, dormrz, dormtr, dpbcon, dpbequ, dpbrfs, dpbstf, dpbsv, dpbsvx, dpbtf2, dpbtrf, dpbtrs, dpocon, dpoequ, dporfs, dposv, dposvx, dpotrs, dpstrf, dpstf2, dppcon, dppequ, dpprfs, dppsv, dppsvx, dpptrf, dpptri, dpptrs, dptcon, dpteqr, dptrfs, dptsv, dptsvx, dpttrs, dptts2, drscl, dsbev, dsbevd, dsbevx, dsbgst, dsbgv, dsbgvd, dsbgvx, dsbtrd, dspcon, dspev, dspevd, dspevx, dspgst, dspgv, dspgvd, dspgvx, dsprfs, dspsv, dspsvx, dsptrd, dsptrf, dsptri, dsptrs, dstegr, dstein, dstev, dstevd, dstevr, dstevx, dsycon, dsyev, dsyevd, dsyevr, dsyevx, dsygs2, dsygst, dsygv, dsygvd, dsygvx, dsyrfs, dsysv, dsysvx, dsytd2, dsytf2, dsytrd, dsytrf, dsytri, dsytri2, dsytri2x, dsyswapr, dsytrs, dsytrs2, dsyconv, dtbcon, dtbrfs, dtbtrs, dtgevc, dtgex2, dtgexc, dtgsen, dtgsja, dtgsna, dtgsy2, dtgsyl, dtpcon, dtprfs, dtptri, dtptrs, dtrcon, dtrevc, dtrexc, dtrrfs, dtrsen, dtrsna, dtrsyl, dtrtrs, dtzrzf, dstemr, dsgesv, dsposv, dlag2s, slag2d, dlat2s, dlansf, dpftrf, dpftri, dpftrs, dsfrk, dtfsm, dtftri, dtfttp, dtfttr, dtpttf, dtpttr, dtrttf, dtrttp, dgejsv, dgesvj, dgsvj0, dgsvj1, dgeequb, dsyequb, dpoequb, dgbequb, dbbcsd, dlapmr, dorbdb, dorbdb1, dorbdb2, dorbdb3, dorbdb4, dorbdb5, dorbdb6, dorcsd, dorcsd2by1, dgeqrt, dgeqrt2, dgeqrt3, dgemqrt, dtpqrt, dtpqrt2, dtpmqrt, dtprfb, # ZLASRC -- Double precision complex LAPACK routines # already provided by @blasobjs: # already provided by @lapackobjs: # zgesv, zgetrs, zgetf2, zlaswp, zlauu2, zlauum, zpotf2, zpotrf, zpotri, # ztrti2, ztrtri zbdsqr, zgbbrd, zgbcon, zgbequ, zgbrfs, zgbsv, zgbsvx, zgbtf2, zgbtrf, zgbtrs, zgebak, zgebal, zgebd2, zgebrd, zgecon, zgeequ, zgees, zgeesx, zgeev, zgeevx, zgehd2, zgehrd, zgelq2, zgelqf, zgels, zgelsd, zgelss, zgelsy, zgeql2, zgeqlf, zgeqp3, zgeqr2, zgeqr2p, zgeqrf, zgeqrfp, zgerfs, zgerq2, zgerqf, zgesc2, zgesdd, zgesvd, zgesvx, zgetc2, zgetri, zggbak, zggbal, zgges, zggesx, zggev, zggevx, zggglm, zgghrd, zgglse, zggqrf, zggrqf, zgtcon, zgtrfs, zgtsv, zgtsvx, zgttrf, zgttrs, zgtts2, zhbev, zhbevd, zhbevx, zhbgst, zhbgv, zhbgvd, zhbgvx, zhbtrd, zhecon, zheev, zheevd, zheevr, zheevx, 
zhegs2, zhegst, zhegv, zhegvd, zhegvx, zherfs, zhesv, zhesvx, zhetd2, zhetf2, zhetrd, zhetrf, zhetri, zhetri2, zhetri2x, zheswapr, zhetrs, zhetrs2, zhgeqz, zhpcon, zhpev, zhpevd, zhpevx, zhpgst, zhpgv, zhpgvd, zhpgvx, zhprfs, zhpsv, zhpsvx, zhptrd, zhptrf, zhptri, zhptrs, zhsein, zhseqr, zlabrd, zlacgv, zlacon, zlacn2, zlacp2, zlacpy, zlacrm, zlacrt, zladiv, zlaed0, zlaed7, zlaed8, zlaein, zlaesy, zlaev2, zlags2, zlagtm, zlahef, zlahqr, zlahr2, zlaic1, zlals0, zlalsa, zlalsd, zlangb, zlange, zlangt, zlanhb, zlanhe, zlanhp, zlanhs, zlanht, zlansb, zlansp, zlansy, zlantb, zlantp, zlantr, zlapll, zlapmt, zlaqgb, zlaqge, zlaqhb, zlaqhe, zlaqhp, zlaqp2, zlaqps, zlaqsb, zlaqr0, zlaqr1, zlaqr2, zlaqr3, zlaqr4, zlaqr5, zlaqsp, zlaqsy, zlar1v, zlar2v, ilazlr, ilazlc, zlarcm, zlarf, zlarfb, zlarfg, zlarft, zlarfgp, zlarfx, zlargv, zlarnv, zlarrv, zlartg, zlartv, zlarz, zlarzb, zlarzt, zlascl, zlaset, zlasr, zlassq, zlasyf, zlatbs, zlatdf, zlatps, zlatrd, zlatrs, zlatrz, zpbcon, zpbequ, zpbrfs, zpbstf, zpbsv, zpbsvx, zpbtf2, zpbtrf, zpbtrs, zpocon, zpoequ, zporfs, zposv, zposvx, zpotrs, zpstrf, zpstf2, zppcon, zppequ, zpprfs, zppsv, zppsvx, zpptrf, zpptri, zpptrs, zptcon, zpteqr, zptrfs, zptsv, zptsvx, zpttrf, zpttrs, zptts2, zrot, zspcon, zspmv, zspr, zsprfs, zspsv, zspsvx, zsptrf, zsptri, zsptrs, zdrscl, zstedc, zstegr, zstein, zsteqr, zsycon, zsymv, zsyr, zsyrfs, zsysv, zsysvx, zsytf2, zsytrf, zsytri, zsytri2, zsytri2x, zsyswapr, zsytrs, zsytrs2, zsyconv, ztbcon, ztbrfs, ztbtrs, ztgevc, ztgex2, ztgexc, ztgsen, ztgsja, ztgsna, ztgsy2, ztgsyl, ztpcon, ztprfs, ztptri, ztptrs, ztrcon, ztrevc, ztrexc, ztrrfs, ztrsen, ztrsna, ztrsyl, ztrtrs, ztzrzf, zung2l, zung2r, zungbr, zunghr, zungl2, zunglq, zungql, zungqr, zungr2, zungrq, zungtr, zunm2l, zunm2r, zunmbr, zunmhr, zunml2, zunmlq, zunmql, zunmqr, zunmr2, zunmr3, zunmrq, zunmrz, zunmtr, zupgtr, zupmtr, izmax1, dzsum1, zstemr, zcgesv, zcposv, zlag2c, clag2z, zlat2c, zhfrk, ztfttp, zlanhf, zpftrf, zpftri, zpftrs, ztfsm, ztftri, ztfttr, ztpttf, ztpttr, ztrttf, ztrttp, zgeequb, zgbequb, zsyequb, zpoequb, zheequb, zbbcsd, zlapmr, zunbdb, zunbdb1, zunbdb2, zunbdb3, zunbdb4, zunbdb5, zunbdb6, zuncsd, zuncsd2by1, zgeqrt, zgeqrt2, zgeqrt3, zgemqrt, ztpqrt, ztpqrt2, ztpmqrt, ztprfb, # functions added for lapack-3.6.0 cgejsv, cgesvdx, cgesvj, cgetrf2, cgges3, cggev3, cgghd3, cggsvd3, cggsvp3, cgsvj0, cgsvj1, clagge, claghe, clagsy, clahilb, clakf2, clarge, clarnd, claror, clarot, clatm1, clatm2, clatm3, clatm5, clatm6, clatme, clatmr, clatms, clatmt, cpotrf2, csbmv, cspr2, csyr2, cunm22, dbdsvdx, dgesvdx, dgetrf2, dgges3, dggev3, dgghd3, dggsvd3, dggsvp3, dladiv2, dlagge, dlagsy, dlahilb, dlakf2, dlaran, dlarge, dlarnd, dlaror, dlarot, dlatm1, dlatm2, dlatm3, dlatm5, dlatm6, dlatm7, dlatme, dlatmr, dlatms, dlatmt, dorm22, dpotrf2, dsecnd, sbdsvdx, second, sgesvdx, sgetrf2, sgges3, sggev3, sgghd3, sggsvd3, sggsvp3, sladiv2, slagge, slagsy, slahilb, slakf2, slaran, slarge, slarnd, slaror, slarot, slatm1, slatm2, slatm3, slatm5, slatm6, slatm7, slatme, slatmr, slatms, slatmt, sorm22, spotrf2, zgejsv, zgesvdx, zgesvj, zgetrf2, zgges3, zggev3, zgghd3, zggsvd3, zggsvp3, zgsvj0, zgsvj1, zlagge, zlaghe, zlagsy, zlahilb, zlakf2, zlarge, zlarnd, zlaror, zlarot, zlatm1, zlatm2, zlatm3, zlatm5, zlatm6, zlatme, zlatmr, zlatms, zlatmt, zpotrf2, zsbmv, zspr2, zsyr2, zunm22, # functions added for lapack-3.7.0 slarfy, slasyf_rk, ssyconvf_rook, ssytf2_rk, ssytrf_rk, ssytrs_3, ssytri_3, ssytri_3x, ssycon_3, ssysv_rk, slasyf_aa, ssysv_aa, ssytrf_aa, ssytrs_aa, strevc3, sgelqt, 
sgelqt3, sgemlqt, sgetsls, sgeqr, slatsqr, slamtsqr, sgemqr, sgelq, slaswlq, slamswlq, sgemlq, stplqt, stplqt2, stpmlqt, ssytrd_2stage, ssytrd_sy2sb, ssytrd_sb2st, ssb2st_kernels, ssyevd_2stage, ssyev_2stage, ssyevx_2stage, ssyevr_2stage, ssbev_2stage, ssbevx_2stage, ssbevd_2stage, ssygv_2stage, dlarfy, dlasyf_rk, dsyconvf, dsyconvf_rook, dsytf2_rk, dsytrf_rk, dsytrs_3, dsytri_3, dsytri_3x, dsycon_3, dsysv_rk, dlasyf_aa, dsysv_aa, dsytrf_aa, dsytrs_aa, dtrevc3, dgelqt, dgelqt3, dgemlqt, dgetsls, dgeqr, dlatsqr, dlamtsqr, dgemqr, dgelq, dlaswlq, dlamswlq, dgemlq, dtplqt, dtplqt2, dtpmlqt, dsytrd_2stage, dsytrd_sy2sb, dsytrd_sb2st, dsb2st_kernels, dsyevd_2stage, dsyev_2stage, dsyevx_2stage, dsyevr_2stage, dsbev_2stage, dsbevx_2stage, dsbevd_2stage, dsygv_2stage, chetf2_rk, chetrf_rk, chetri_3, chetri_3x, chetrs_3, checon_3, chesv_rk, chesv_aa, chetrf_aa, chetrs_aa, clahef_aa, clahef_rk, clarfy, clasyf_rk, clasyf_aa, csyconvf, csyconvf_rook, csytf2_rk, csytrf_rk, csytrf_aa, csytrs_3, csytrs_aa, csytri_3, csytri_3x, csycon_3, csysv_rk, csysv_aa, ctrevc3, cgelqt, cgelqt3, cgemlqt, cgetsls, cgeqr, clatsqr, clamtsqr, cgemqr, cgelq, claswlq, clamswlq, cgemlq, ctplqt, ctplqt2, ctpmlqt, chetrd_2stage, chetrd_he2hb, chetrd_hb2st, chb2st_kernels, cheevd_2stage, cheev_2stage, cheevx_2stage, cheevr_2stage, chbev_2stage, chbevx_2stage, chbevd_2stage, chegv_2stage, zhetf2_rk, zhetrf_rk, zhetri_3, zhetri_3x, zhetrs_3, zhecon_3, zhesv_rk, zhesv_aa, zhetrf_aa, zhetrs_aa, zlahef_aa, zlahef_rk, zlarfy, zlasyf_rk, zlasyf_aa, zsyconvf, zsyconvf_rook, zsytrs_aa, zsytf2_rk, zsytrf_rk, zsytrf_aa, zsytrs_3, zsytri_3, zsytri_3x, zsycon_3, zsysv_rk, zsysv_aa, ztrevc3, ztplqt, ztplqt2, ztpmlqt, zgelqt, zgelqt3, zgemlqt, zgetsls, zgeqr, zlatsqr, zlamtsqr, zgemqr, zgelq, zlaswlq, zlamswlq, zgemlq, zhetrd_2stage, zhetrd_he2hb, zhetrd_hb2st, zhb2st_kernels, zheevd_2stage, zheev_2stage, zheevx_2stage, zheevr_2stage, zhbev_2stage, zhbevx_2stage, zhbevd_2stage, zhegv_2stage, sladiv1, dladiv1, iparam2stage, ); @lapack_extendedprecision_objs = ( zposvxx, clagge, clatms, chesvxx, cposvxx, cgesvxx, ssyrfssx, csyrfsx, dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx, ); @lapack_deprecated_objs = ( cgegs, cggsvd, ctzrqf, dgeqpf, dlatzm, sgelsx, slahrd, zgegv, zggsvp, cgegv, cggsvp, dgegs, dggsvd, dtzrqf, sgeqpf, slatzm, zgelsx, zlahrd, cgelsx, clahrd, dgegv, dggsvp, sgegs, sggsvd, stzrqf, zgeqpf, zlatzm, cgeqpf, clatzm, dgelsx, dlahrd, sgegv, sggsvp, zgegs, zggsvd, ztzrqf, ); @lapacke_deprecated_objs = ( LAPACKE_cggsvp, LAPACKE_cggsvp_work, LAPACKE_dggsvp, LAPACKE_dggsvp_work, LAPACKE_sggsvp, LAPACKE_sggsvp_work, LAPACKE_zggsvp, LAPACKE_zggsvp_work, LAPACKE_cggsvd, LAPACKE_cggsvd_work, LAPACKE_dggsvd, LAPACKE_dggsvd_work, LAPACKE_sggsvd, LAPACKE_sggsvd_work, LAPACKE_zggsvd, LAPACKE_zggsvd_work, LAPACKE_cgeqpf, LAPACKE_cgeqpf_work, LAPACKE_dgeqpf, LAPACKE_dgeqpf_work, LAPACKE_sgeqpf, LAPACKE_sgeqpf_work, LAPACKE_zgeqpf, LAPACKE_zgeqpf_work, ); @lapackeobjs = ( # LAPACK C interface routines. # # This list is prepared in a similar manner to @lapackobjs2, however the # functions all begin with an uppercase prefix (with the exception of the # make_complex_* routines). # # The functions corresponding to @(MATGEN_OBJ) and @(SRCX_OBJ) are not # exported since the respective LAPACK routines are not built by default. 
# @(OBJ) from `lapack-3.4.1/lapacke/utils/Makefile` LAPACKE_cgb_nancheck, LAPACKE_cgb_trans, LAPACKE_cge_nancheck, LAPACKE_cge_trans, LAPACKE_cgg_nancheck, LAPACKE_cgg_trans, LAPACKE_cgt_nancheck, LAPACKE_chb_nancheck, LAPACKE_chb_trans, LAPACKE_che_nancheck, LAPACKE_che_trans, LAPACKE_chp_nancheck, LAPACKE_chp_trans, LAPACKE_chs_nancheck, LAPACKE_chs_trans, LAPACKE_c_nancheck, LAPACKE_cpb_nancheck, LAPACKE_cpb_trans, LAPACKE_cpf_nancheck, LAPACKE_cpf_trans, LAPACKE_cpo_nancheck, LAPACKE_cpo_trans, LAPACKE_cpp_nancheck, LAPACKE_cpp_trans, LAPACKE_cpt_nancheck, LAPACKE_csp_nancheck, LAPACKE_csp_trans, LAPACKE_cst_nancheck, LAPACKE_csy_nancheck, LAPACKE_csy_trans, LAPACKE_ctb_nancheck, LAPACKE_ctb_trans, LAPACKE_ctf_nancheck, LAPACKE_ctf_trans, LAPACKE_ctp_nancheck, LAPACKE_ctp_trans, LAPACKE_ctr_nancheck, LAPACKE_ctr_trans, LAPACKE_dgb_nancheck, LAPACKE_dgb_trans, LAPACKE_dge_nancheck, LAPACKE_dge_trans, LAPACKE_dgg_nancheck, LAPACKE_dgg_trans, LAPACKE_dgt_nancheck, LAPACKE_dhs_nancheck, LAPACKE_dhs_trans, LAPACKE_d_nancheck, LAPACKE_dpb_nancheck, LAPACKE_dpb_trans, LAPACKE_dpf_nancheck, LAPACKE_dpf_trans, LAPACKE_dpo_nancheck, LAPACKE_dpo_trans, LAPACKE_dpp_nancheck, LAPACKE_dpp_trans, LAPACKE_dpt_nancheck, LAPACKE_dsb_nancheck, LAPACKE_dsb_trans, LAPACKE_dsp_nancheck, LAPACKE_dsp_trans, LAPACKE_dst_nancheck, LAPACKE_dsy_nancheck, LAPACKE_dsy_trans, LAPACKE_dtb_nancheck, LAPACKE_dtb_trans, LAPACKE_dtf_nancheck, LAPACKE_dtf_trans, LAPACKE_dtp_nancheck, LAPACKE_dtp_trans, LAPACKE_dtr_nancheck, LAPACKE_dtr_trans, LAPACKE_lsame, LAPACKE_sgb_nancheck, LAPACKE_sgb_trans, LAPACKE_sge_nancheck, LAPACKE_sge_trans, LAPACKE_sgg_nancheck, LAPACKE_sgg_trans, LAPACKE_sgt_nancheck, LAPACKE_shs_nancheck, LAPACKE_shs_trans, LAPACKE_s_nancheck, LAPACKE_spb_nancheck, LAPACKE_spb_trans, LAPACKE_spf_nancheck, LAPACKE_spf_trans, LAPACKE_spo_nancheck, LAPACKE_spo_trans, LAPACKE_spp_nancheck, LAPACKE_spp_trans, LAPACKE_spt_nancheck, LAPACKE_ssb_nancheck, LAPACKE_ssb_trans, LAPACKE_ssp_nancheck, LAPACKE_ssp_trans, LAPACKE_sst_nancheck, LAPACKE_ssy_nancheck, LAPACKE_ssy_trans, LAPACKE_stb_nancheck, LAPACKE_stb_trans, LAPACKE_stf_nancheck, LAPACKE_stf_trans, LAPACKE_stp_nancheck, LAPACKE_stp_trans, LAPACKE_str_nancheck, LAPACKE_str_trans, LAPACKE_xerbla, LAPACKE_zgb_nancheck, LAPACKE_zgb_trans, LAPACKE_zge_nancheck, LAPACKE_zge_trans, LAPACKE_zgg_nancheck, LAPACKE_zgg_trans, LAPACKE_zgt_nancheck, LAPACKE_zhb_nancheck, LAPACKE_zhb_trans, LAPACKE_zhe_nancheck, LAPACKE_zhe_trans, LAPACKE_zhp_nancheck, LAPACKE_zhp_trans, LAPACKE_zhs_nancheck, LAPACKE_zhs_trans, LAPACKE_z_nancheck, LAPACKE_zpb_nancheck, LAPACKE_zpb_trans, LAPACKE_zpf_nancheck, LAPACKE_zpf_trans, LAPACKE_zpo_nancheck, LAPACKE_zpo_trans, LAPACKE_zpp_nancheck, LAPACKE_zpp_trans, LAPACKE_zpt_nancheck, LAPACKE_zsp_nancheck, LAPACKE_zsp_trans, LAPACKE_zst_nancheck, LAPACKE_zsy_nancheck, LAPACKE_zsy_trans, LAPACKE_ztb_nancheck, LAPACKE_ztb_trans, LAPACKE_ztf_nancheck, LAPACKE_ztf_trans, LAPACKE_ztp_nancheck, LAPACKE_ztp_trans, LAPACKE_ztr_nancheck, LAPACKE_ztr_trans, lapack_make_complex_float, lapack_make_complex_double, # @(SRC_OBJ) from `lapack-3.5.0/lapacke/src/Makefile` LAPACKE_cbbcsd, LAPACKE_cbbcsd_work, LAPACKE_cbdsqr, LAPACKE_cbdsqr_work, LAPACKE_cgbbrd, LAPACKE_cgbbrd_work, LAPACKE_cgbcon, LAPACKE_cgbcon_work, LAPACKE_cgbequ, LAPACKE_cgbequ_work, LAPACKE_cgbequb, LAPACKE_cgbequb_work, LAPACKE_cgbrfs, LAPACKE_cgbrfs_work, LAPACKE_cgbsv, LAPACKE_cgbsv_work, LAPACKE_cgbsvx, LAPACKE_cgbsvx_work, LAPACKE_cgbtrf, LAPACKE_cgbtrf_work, LAPACKE_cgbtrs, 
LAPACKE_cgbtrs_work, LAPACKE_cgebak, LAPACKE_cgebak_work, LAPACKE_cgebal, LAPACKE_cgebal_work, LAPACKE_cgebrd, LAPACKE_cgebrd_work, LAPACKE_cgecon, LAPACKE_cgecon_work, LAPACKE_cgeequ, LAPACKE_cgeequ_work, LAPACKE_cgeequb, LAPACKE_cgeequb_work, LAPACKE_cgees, LAPACKE_cgees_work, LAPACKE_cgeesx, LAPACKE_cgeesx_work, LAPACKE_cgeev, LAPACKE_cgeev_work, LAPACKE_cgeevx, LAPACKE_cgeevx_work, LAPACKE_cgehrd, LAPACKE_cgehrd_work, LAPACKE_cgelq2, LAPACKE_cgelq2_work, LAPACKE_cgelqf, LAPACKE_cgelqf_work, LAPACKE_cgels, LAPACKE_cgels_work, LAPACKE_cgelsd, LAPACKE_cgelsd_work, LAPACKE_cgelss, LAPACKE_cgelss_work, LAPACKE_cgelsy, LAPACKE_cgelsy_work, LAPACKE_cgemqrt, LAPACKE_cgemqrt_work, LAPACKE_cgeqlf, LAPACKE_cgeqlf_work, LAPACKE_cgeqp3, LAPACKE_cgeqp3_work, LAPACKE_cgeqr2, LAPACKE_cgeqr2_work, LAPACKE_cgeqrf, LAPACKE_cgeqrf_work, LAPACKE_cgeqrfp, LAPACKE_cgeqrfp_work, LAPACKE_cgeqrt, LAPACKE_cgeqrt2, LAPACKE_cgeqrt2_work, LAPACKE_cgeqrt3, LAPACKE_cgeqrt3_work, LAPACKE_cgeqrt_work, LAPACKE_cgerfs, LAPACKE_cgerfs_work, LAPACKE_cgerqf, LAPACKE_cgerqf_work, LAPACKE_cgesdd, LAPACKE_cgesdd_work, LAPACKE_cgesv, LAPACKE_cgesv_work, LAPACKE_cgesvd, LAPACKE_cgesvd_work, LAPACKE_cgesvx, LAPACKE_cgesvx_work, LAPACKE_cgetf2, LAPACKE_cgetf2_work, LAPACKE_cgetrf, LAPACKE_cgetrf_work, LAPACKE_cgetri, LAPACKE_cgetri_work, LAPACKE_cgetrs, LAPACKE_cgetrs_work, LAPACKE_cggbak, LAPACKE_cggbak_work, LAPACKE_cggbal, LAPACKE_cggbal_work, LAPACKE_cgges, LAPACKE_cgges_work, LAPACKE_cggesx, LAPACKE_cggesx_work, LAPACKE_cggev, LAPACKE_cggev_work, LAPACKE_cggevx, LAPACKE_cggevx_work, LAPACKE_cggglm, LAPACKE_cggglm_work, LAPACKE_cgghrd, LAPACKE_cgghrd_work, LAPACKE_cgglse, LAPACKE_cgglse_work, LAPACKE_cggqrf, LAPACKE_cggqrf_work, LAPACKE_cggrqf, LAPACKE_cggrqf_work, LAPACKE_cgtcon, LAPACKE_cgtcon_work, LAPACKE_cgtrfs, LAPACKE_cgtrfs_work, LAPACKE_cgtsv, LAPACKE_cgtsv_work, LAPACKE_cgtsvx, LAPACKE_cgtsvx_work, LAPACKE_cgttrf, LAPACKE_cgttrf_work, LAPACKE_cgttrs, LAPACKE_cgttrs_work, LAPACKE_chbev, LAPACKE_chbev_work, LAPACKE_chbevd, LAPACKE_chbevd_work, LAPACKE_chbevx, LAPACKE_chbevx_work, LAPACKE_chbgst, LAPACKE_chbgst_work, LAPACKE_chbgv, LAPACKE_chbgv_work, LAPACKE_chbgvd, LAPACKE_chbgvd_work, LAPACKE_chbgvx, LAPACKE_chbgvx_work, LAPACKE_chbtrd, LAPACKE_chbtrd_work, LAPACKE_checon, LAPACKE_checon_work, LAPACKE_cheequb, LAPACKE_cheequb_work, LAPACKE_cheev, LAPACKE_cheev_work, LAPACKE_cheevd, LAPACKE_cheevd_work, LAPACKE_cheevr, LAPACKE_cheevr_work, LAPACKE_cheevx, LAPACKE_cheevx_work, LAPACKE_chegst, LAPACKE_chegst_work, LAPACKE_chegv, LAPACKE_chegv_work, LAPACKE_chegvd, LAPACKE_chegvd_work, LAPACKE_chegvx, LAPACKE_chegvx_work, LAPACKE_cherfs, LAPACKE_cherfs_work, LAPACKE_chesv, LAPACKE_chesv_work, LAPACKE_chesvx, LAPACKE_chesvx_work, LAPACKE_cheswapr, LAPACKE_cheswapr_work, LAPACKE_chetrd, LAPACKE_chetrd_work, LAPACKE_chetrf, LAPACKE_chetrf_work, LAPACKE_chetri, LAPACKE_chetri2, LAPACKE_chetri2_work, LAPACKE_chetri2x, LAPACKE_chetri2x_work, LAPACKE_chetri_work, LAPACKE_chetrs, LAPACKE_chetrs2, LAPACKE_chetrs2_work, LAPACKE_chetrs_work, LAPACKE_chfrk, LAPACKE_chfrk_work, LAPACKE_chgeqz, LAPACKE_chgeqz_work, LAPACKE_chpcon, LAPACKE_chpcon_work, LAPACKE_chpev, LAPACKE_chpev_work, LAPACKE_chpevd, LAPACKE_chpevd_work, LAPACKE_chpevx, LAPACKE_chpevx_work, LAPACKE_chpgst, LAPACKE_chpgst_work, LAPACKE_chpgv, LAPACKE_chpgv_work, LAPACKE_chpgvd, LAPACKE_chpgvd_work, LAPACKE_chpgvx, LAPACKE_chpgvx_work, LAPACKE_chprfs, LAPACKE_chprfs_work, LAPACKE_chpsv, LAPACKE_chpsv_work, LAPACKE_chpsvx, LAPACKE_chpsvx_work, LAPACKE_chptrd, 
LAPACKE_chptrd_work, LAPACKE_chptrf, LAPACKE_chptrf_work, LAPACKE_chptri, LAPACKE_chptri_work, LAPACKE_chptrs, LAPACKE_chptrs_work, LAPACKE_chsein, LAPACKE_chsein_work, LAPACKE_chseqr, LAPACKE_chseqr_work, LAPACKE_clacgv, LAPACKE_clacgv_work, LAPACKE_clacn2, LAPACKE_clacn2_work, LAPACKE_clacp2, LAPACKE_clacp2_work, LAPACKE_clacpy, LAPACKE_clacpy_work, LAPACKE_clag2z, LAPACKE_clag2z_work, LAPACKE_clange, LAPACKE_clange_work, LAPACKE_clanhe, LAPACKE_clanhe_work, LAPACKE_clansy, LAPACKE_clansy_work, LAPACKE_clantr, LAPACKE_clantr_work, LAPACKE_clapmr, LAPACKE_clapmr_work, LAPACKE_clarfb, LAPACKE_clarfb_work, LAPACKE_clarfg, LAPACKE_clarfg_work, LAPACKE_clarft, LAPACKE_clarft_work, LAPACKE_clarfx, LAPACKE_clarfx_work, LAPACKE_clarnv, LAPACKE_clarnv_work, LAPACKE_claset, LAPACKE_claset_work, LAPACKE_claswp, LAPACKE_claswp_work, LAPACKE_clauum, LAPACKE_clauum_work, LAPACKE_cpbcon, LAPACKE_cpbcon_work, LAPACKE_cpbequ, LAPACKE_cpbequ_work, LAPACKE_cpbrfs, LAPACKE_cpbrfs_work, LAPACKE_cpbstf, LAPACKE_cpbstf_work, LAPACKE_cpbsv, LAPACKE_cpbsv_work, LAPACKE_cpbsvx, LAPACKE_cpbsvx_work, LAPACKE_cpbtrf, LAPACKE_cpbtrf_work, LAPACKE_cpbtrs, LAPACKE_cpbtrs_work, LAPACKE_cpftrf, LAPACKE_cpftrf_work, LAPACKE_cpftri, LAPACKE_cpftri_work, LAPACKE_cpftrs, LAPACKE_cpftrs_work, LAPACKE_cpocon, LAPACKE_cpocon_work, LAPACKE_cpoequ, LAPACKE_cpoequ_work, LAPACKE_cpoequb, LAPACKE_cpoequb_work, LAPACKE_cporfs, LAPACKE_cporfs_work, LAPACKE_cposv, LAPACKE_cposv_work, LAPACKE_cposvx, LAPACKE_cposvx_work, LAPACKE_cpotrf, LAPACKE_cpotrf_work, LAPACKE_cpotri, LAPACKE_cpotri_work, LAPACKE_cpotrs, LAPACKE_cpotrs_work, LAPACKE_cppcon, LAPACKE_cppcon_work, LAPACKE_cppequ, LAPACKE_cppequ_work, LAPACKE_cpprfs, LAPACKE_cpprfs_work, LAPACKE_cppsv, LAPACKE_cppsv_work, LAPACKE_cppsvx, LAPACKE_cppsvx_work, LAPACKE_cpptrf, LAPACKE_cpptrf_work, LAPACKE_cpptri, LAPACKE_cpptri_work, LAPACKE_cpptrs, LAPACKE_cpptrs_work, LAPACKE_cpstrf, LAPACKE_cpstrf_work, LAPACKE_cptcon, LAPACKE_cptcon_work, LAPACKE_cpteqr, LAPACKE_cpteqr_work, LAPACKE_cptrfs, LAPACKE_cptrfs_work, LAPACKE_cptsv, LAPACKE_cptsv_work, LAPACKE_cptsvx, LAPACKE_cptsvx_work, LAPACKE_cpttrf, LAPACKE_cpttrf_work, LAPACKE_cpttrs, LAPACKE_cpttrs_work, LAPACKE_cspcon, LAPACKE_cspcon_work, LAPACKE_csprfs, LAPACKE_csprfs_work, LAPACKE_cspsv, LAPACKE_cspsv_work, LAPACKE_cspsvx, LAPACKE_cspsvx_work, LAPACKE_csptrf, LAPACKE_csptrf_work, LAPACKE_csptri, LAPACKE_csptri_work, LAPACKE_csptrs, LAPACKE_csptrs_work, LAPACKE_cstedc, LAPACKE_cstedc_work, LAPACKE_cstegr, LAPACKE_cstegr_work, LAPACKE_cstein, LAPACKE_cstein_work, LAPACKE_cstemr, LAPACKE_cstemr_work, LAPACKE_csteqr, LAPACKE_csteqr_work, LAPACKE_csycon, LAPACKE_csycon_work, LAPACKE_csyconv, LAPACKE_csyconv_work, LAPACKE_csyequb, LAPACKE_csyequb_work, LAPACKE_csyrfs, LAPACKE_csyrfs_work, LAPACKE_csysv, LAPACKE_csysv_rook, LAPACKE_csysv_rook_work, LAPACKE_csysv_work, LAPACKE_csysvx, LAPACKE_csysvx_work, LAPACKE_csyswapr, LAPACKE_csyswapr_work, LAPACKE_csytrf, LAPACKE_csytrf_work, LAPACKE_csytri, LAPACKE_csytri2, LAPACKE_csytri2_work, LAPACKE_csytri2x, LAPACKE_csytri2x_work, LAPACKE_csytri_work, LAPACKE_csytrs, LAPACKE_csytrs2, LAPACKE_csytrs2_work, LAPACKE_csytrs_work, LAPACKE_ctbcon, LAPACKE_ctbcon_work, LAPACKE_ctbrfs, LAPACKE_ctbrfs_work, LAPACKE_ctbtrs, LAPACKE_ctbtrs_work, LAPACKE_ctfsm, LAPACKE_ctfsm_work, LAPACKE_ctftri, LAPACKE_ctftri_work, LAPACKE_ctfttp, LAPACKE_ctfttp_work, LAPACKE_ctfttr, LAPACKE_ctfttr_work, LAPACKE_ctgevc, LAPACKE_ctgevc_work, LAPACKE_ctgexc, LAPACKE_ctgexc_work, LAPACKE_ctgsen, LAPACKE_ctgsen_work, 
LAPACKE_ctgsja, LAPACKE_ctgsja_work, LAPACKE_ctgsna, LAPACKE_ctgsna_work, LAPACKE_ctgsyl, LAPACKE_ctgsyl_work, LAPACKE_ctpcon, LAPACKE_ctpcon_work, LAPACKE_ctpmqrt, LAPACKE_ctpmqrt_work, LAPACKE_ctpqrt, LAPACKE_ctpqrt2, LAPACKE_ctpqrt2_work, LAPACKE_ctpqrt_work, LAPACKE_ctprfb, LAPACKE_ctprfb_work, LAPACKE_ctprfs, LAPACKE_ctprfs_work, LAPACKE_ctptri, LAPACKE_ctptri_work, LAPACKE_ctptrs, LAPACKE_ctptrs_work, LAPACKE_ctpttf, LAPACKE_ctpttf_work, LAPACKE_ctpttr, LAPACKE_ctpttr_work, LAPACKE_ctrcon, LAPACKE_ctrcon_work, LAPACKE_ctrevc, LAPACKE_ctrevc_work, LAPACKE_ctrexc, LAPACKE_ctrexc_work, LAPACKE_ctrrfs, LAPACKE_ctrrfs_work, LAPACKE_ctrsen, LAPACKE_ctrsen_work, LAPACKE_ctrsna, LAPACKE_ctrsna_work, LAPACKE_ctrsyl, LAPACKE_ctrsyl_work, LAPACKE_ctrtri, LAPACKE_ctrtri_work, LAPACKE_ctrtrs, LAPACKE_ctrtrs_work, LAPACKE_ctrttf, LAPACKE_ctrttf_work, LAPACKE_ctrttp, LAPACKE_ctrttp_work, LAPACKE_ctzrzf, LAPACKE_ctzrzf_work, LAPACKE_cunbdb, LAPACKE_cunbdb_work, LAPACKE_cuncsd, LAPACKE_cuncsd_work, LAPACKE_cungbr, LAPACKE_cungbr_work, LAPACKE_cunghr, LAPACKE_cunghr_work, LAPACKE_cunglq, LAPACKE_cunglq_work, LAPACKE_cungql, LAPACKE_cungql_work, LAPACKE_cungqr, LAPACKE_cungqr_work, LAPACKE_cungrq, LAPACKE_cungrq_work, LAPACKE_cungtr, LAPACKE_cungtr_work, LAPACKE_cunmbr, LAPACKE_cunmbr_work, LAPACKE_cunmhr, LAPACKE_cunmhr_work, LAPACKE_cunmlq, LAPACKE_cunmlq_work, LAPACKE_cunmql, LAPACKE_cunmql_work, LAPACKE_cunmqr, LAPACKE_cunmqr_work, LAPACKE_cunmrq, LAPACKE_cunmrq_work, LAPACKE_cunmrz, LAPACKE_cunmrz_work, LAPACKE_cunmtr, LAPACKE_cunmtr_work, LAPACKE_cupgtr, LAPACKE_cupgtr_work, LAPACKE_cupmtr, LAPACKE_cupmtr_work, LAPACKE_dbbcsd, LAPACKE_dbbcsd_work, LAPACKE_dbdsdc, LAPACKE_dbdsdc_work, LAPACKE_dbdsqr, LAPACKE_dbdsqr_work, LAPACKE_ddisna, LAPACKE_ddisna_work, LAPACKE_dgbbrd, LAPACKE_dgbbrd_work, LAPACKE_dgbcon, LAPACKE_dgbcon_work, LAPACKE_dgbequ, LAPACKE_dgbequ_work, LAPACKE_dgbequb, LAPACKE_dgbequb_work, LAPACKE_dgbrfs, LAPACKE_dgbrfs_work, LAPACKE_dgbsv, LAPACKE_dgbsv_work, LAPACKE_dgbsvx, LAPACKE_dgbsvx_work, LAPACKE_dgbtrf, LAPACKE_dgbtrf_work, LAPACKE_dgbtrs, LAPACKE_dgbtrs_work, LAPACKE_dgebak, LAPACKE_dgebak_work, LAPACKE_dgebal, LAPACKE_dgebal_work, LAPACKE_dgebrd, LAPACKE_dgebrd_work, LAPACKE_dgecon, LAPACKE_dgecon_work, LAPACKE_dgeequ, LAPACKE_dgeequ_work, LAPACKE_dgeequb, LAPACKE_dgeequb_work, LAPACKE_dgees, LAPACKE_dgees_work, LAPACKE_dgeesx, LAPACKE_dgeesx_work, LAPACKE_dgeev, LAPACKE_dgeev_work, LAPACKE_dgeevx, LAPACKE_dgeevx_work, LAPACKE_dgehrd, LAPACKE_dgehrd_work, LAPACKE_dgejsv, LAPACKE_dgejsv_work, LAPACKE_dgelq2, LAPACKE_dgelq2_work, LAPACKE_dgelqf, LAPACKE_dgelqf_work, LAPACKE_dgels, LAPACKE_dgels_work, LAPACKE_dgelsd, LAPACKE_dgelsd_work, LAPACKE_dgelss, LAPACKE_dgelss_work, LAPACKE_dgelsy, LAPACKE_dgelsy_work, LAPACKE_dgemqrt, LAPACKE_dgemqrt_work, LAPACKE_dgeqlf, LAPACKE_dgeqlf_work, LAPACKE_dgeqp3, LAPACKE_dgeqp3_work, LAPACKE_dgeqr2, LAPACKE_dgeqr2_work, LAPACKE_dgeqrf, LAPACKE_dgeqrf_work, LAPACKE_dgeqrfp, LAPACKE_dgeqrfp_work, LAPACKE_dgeqrt, LAPACKE_dgeqrt2, LAPACKE_dgeqrt2_work, LAPACKE_dgeqrt3, LAPACKE_dgeqrt3_work, LAPACKE_dgeqrt_work, LAPACKE_dgerfs, LAPACKE_dgerfs_work, LAPACKE_dgerqf, LAPACKE_dgerqf_work, LAPACKE_dgesdd, LAPACKE_dgesdd_work, LAPACKE_dgesv, LAPACKE_dgesv_work, LAPACKE_dgesvd, LAPACKE_dgesvd_work, LAPACKE_dgesvj, LAPACKE_dgesvj_work, LAPACKE_dgesvx, LAPACKE_dgesvx_work, LAPACKE_dgetf2, LAPACKE_dgetf2_work, LAPACKE_dgetrf, LAPACKE_dgetrf_work, LAPACKE_dgetri, LAPACKE_dgetri_work, LAPACKE_dgetrs, LAPACKE_dgetrs_work, LAPACKE_dggbak, 
LAPACKE_dggbak_work, LAPACKE_dggbal, LAPACKE_dggbal_work, LAPACKE_dgges, LAPACKE_dgges_work, LAPACKE_dggesx, LAPACKE_dggesx_work, LAPACKE_dggev, LAPACKE_dggev_work, LAPACKE_dggevx, LAPACKE_dggevx_work, LAPACKE_dggglm, LAPACKE_dggglm_work, LAPACKE_dgghrd, LAPACKE_dgghrd_work, LAPACKE_dgglse, LAPACKE_dgglse_work, LAPACKE_dggqrf, LAPACKE_dggqrf_work, LAPACKE_dggrqf, LAPACKE_dggrqf_work, LAPACKE_dgtcon, LAPACKE_dgtcon_work, LAPACKE_dgtrfs, LAPACKE_dgtrfs_work, LAPACKE_dgtsv, LAPACKE_dgtsv_work, LAPACKE_dgtsvx, LAPACKE_dgtsvx_work, LAPACKE_dgttrf, LAPACKE_dgttrf_work, LAPACKE_dgttrs, LAPACKE_dgttrs_work, LAPACKE_dhgeqz, LAPACKE_dhgeqz_work, LAPACKE_dhsein, LAPACKE_dhsein_work, LAPACKE_dhseqr, LAPACKE_dhseqr_work, LAPACKE_dlacn2, LAPACKE_dlacn2_work, LAPACKE_dlacpy, LAPACKE_dlacpy_work, LAPACKE_dlag2s, LAPACKE_dlag2s_work, LAPACKE_dlamch, LAPACKE_dlamch_work, LAPACKE_dlange, LAPACKE_dlange_work, LAPACKE_dlansy, LAPACKE_dlansy_work, LAPACKE_dlantr, LAPACKE_dlantr_work, LAPACKE_dlapmr, LAPACKE_dlapmr_work, LAPACKE_dlapy2, LAPACKE_dlapy2_work, LAPACKE_dlapy3, LAPACKE_dlapy3_work, LAPACKE_dlarfb, LAPACKE_dlarfb_work, LAPACKE_dlarfg, LAPACKE_dlarfg_work, LAPACKE_dlarft, LAPACKE_dlarft_work, LAPACKE_dlarfx, LAPACKE_dlarfx_work, LAPACKE_dlarnv, LAPACKE_dlarnv_work, LAPACKE_dlartgp, LAPACKE_dlartgp_work, LAPACKE_dlartgs, LAPACKE_dlartgs_work, LAPACKE_dlaset, LAPACKE_dlaset_work, LAPACKE_dlasrt, LAPACKE_dlasrt_work, LAPACKE_dlaswp, LAPACKE_dlaswp_work, LAPACKE_dlauum, LAPACKE_dlauum_work, LAPACKE_dopgtr, LAPACKE_dopgtr_work, LAPACKE_dopmtr, LAPACKE_dopmtr_work, LAPACKE_dorbdb, LAPACKE_dorbdb_work, LAPACKE_dorcsd, LAPACKE_dorcsd_work, LAPACKE_dorgbr, LAPACKE_dorgbr_work, LAPACKE_dorghr, LAPACKE_dorghr_work, LAPACKE_dorglq, LAPACKE_dorglq_work, LAPACKE_dorgql, LAPACKE_dorgql_work, LAPACKE_dorgqr, LAPACKE_dorgqr_work, LAPACKE_dorgrq, LAPACKE_dorgrq_work, LAPACKE_dorgtr, LAPACKE_dorgtr_work, LAPACKE_dormbr, LAPACKE_dormbr_work, LAPACKE_dormhr, LAPACKE_dormhr_work, LAPACKE_dormlq, LAPACKE_dormlq_work, LAPACKE_dormql, LAPACKE_dormql_work, LAPACKE_dormqr, LAPACKE_dormqr_work, LAPACKE_dormrq, LAPACKE_dormrq_work, LAPACKE_dormrz, LAPACKE_dormrz_work, LAPACKE_dormtr, LAPACKE_dormtr_work, LAPACKE_dpbcon, LAPACKE_dpbcon_work, LAPACKE_dpbequ, LAPACKE_dpbequ_work, LAPACKE_dpbrfs, LAPACKE_dpbrfs_work, LAPACKE_dpbstf, LAPACKE_dpbstf_work, LAPACKE_dpbsv, LAPACKE_dpbsv_work, LAPACKE_dpbsvx, LAPACKE_dpbsvx_work, LAPACKE_dpbtrf, LAPACKE_dpbtrf_work, LAPACKE_dpbtrs, LAPACKE_dpbtrs_work, LAPACKE_dpftrf, LAPACKE_dpftrf_work, LAPACKE_dpftri, LAPACKE_dpftri_work, LAPACKE_dpftrs, LAPACKE_dpftrs_work, LAPACKE_dpocon, LAPACKE_dpocon_work, LAPACKE_dpoequ, LAPACKE_dpoequ_work, LAPACKE_dpoequb, LAPACKE_dpoequb_work, LAPACKE_dporfs, LAPACKE_dporfs_work, LAPACKE_dposv, LAPACKE_dposv_work, LAPACKE_dposvx, LAPACKE_dposvx_work, LAPACKE_dpotrf, LAPACKE_dpotrf_work, LAPACKE_dpotri, LAPACKE_dpotri_work, LAPACKE_dpotrs, LAPACKE_dpotrs_work, LAPACKE_dppcon, LAPACKE_dppcon_work, LAPACKE_dppequ, LAPACKE_dppequ_work, LAPACKE_dpprfs, LAPACKE_dpprfs_work, LAPACKE_dppsv, LAPACKE_dppsv_work, LAPACKE_dppsvx, LAPACKE_dppsvx_work, LAPACKE_dpptrf, LAPACKE_dpptrf_work, LAPACKE_dpptri, LAPACKE_dpptri_work, LAPACKE_dpptrs, LAPACKE_dpptrs_work, LAPACKE_dpstrf, LAPACKE_dpstrf_work, LAPACKE_dptcon, LAPACKE_dptcon_work, LAPACKE_dpteqr, LAPACKE_dpteqr_work, LAPACKE_dptrfs, LAPACKE_dptrfs_work, LAPACKE_dptsv, LAPACKE_dptsv_work, LAPACKE_dptsvx, LAPACKE_dptsvx_work, LAPACKE_dpttrf, LAPACKE_dpttrf_work, LAPACKE_dpttrs, LAPACKE_dpttrs_work, LAPACKE_dsbev, 
LAPACKE_dsbev_work, LAPACKE_dsbevd, LAPACKE_dsbevd_work, LAPACKE_dsbevx, LAPACKE_dsbevx_work, LAPACKE_dsbgst, LAPACKE_dsbgst_work, LAPACKE_dsbgv, LAPACKE_dsbgv_work, LAPACKE_dsbgvd, LAPACKE_dsbgvd_work, LAPACKE_dsbgvx, LAPACKE_dsbgvx_work, LAPACKE_dsbtrd, LAPACKE_dsbtrd_work, LAPACKE_dsfrk, LAPACKE_dsfrk_work, LAPACKE_dsgesv, LAPACKE_dsgesv_work, LAPACKE_dspcon, LAPACKE_dspcon_work, LAPACKE_dspev, LAPACKE_dspev_work, LAPACKE_dspevd, LAPACKE_dspevd_work, LAPACKE_dspevx, LAPACKE_dspevx_work, LAPACKE_dspgst, LAPACKE_dspgst_work, LAPACKE_dspgv, LAPACKE_dspgv_work, LAPACKE_dspgvd, LAPACKE_dspgvd_work, LAPACKE_dspgvx, LAPACKE_dspgvx_work, LAPACKE_dsposv, LAPACKE_dsposv_work, LAPACKE_dsprfs, LAPACKE_dsprfs_work, LAPACKE_dspsv, LAPACKE_dspsv_work, LAPACKE_dspsvx, LAPACKE_dspsvx_work, LAPACKE_dsptrd, LAPACKE_dsptrd_work, LAPACKE_dsptrf, LAPACKE_dsptrf_work, LAPACKE_dsptri, LAPACKE_dsptri_work, LAPACKE_dsptrs, LAPACKE_dsptrs_work, LAPACKE_dstebz, LAPACKE_dstebz_work, LAPACKE_dstedc, LAPACKE_dstedc_work, LAPACKE_dstegr, LAPACKE_dstegr_work, LAPACKE_dstein, LAPACKE_dstein_work, LAPACKE_dstemr, LAPACKE_dstemr_work, LAPACKE_dsteqr, LAPACKE_dsteqr_work, LAPACKE_dsterf, LAPACKE_dsterf_work, LAPACKE_dstev, LAPACKE_dstev_work, LAPACKE_dstevd, LAPACKE_dstevd_work, LAPACKE_dstevr, LAPACKE_dstevr_work, LAPACKE_dstevx, LAPACKE_dstevx_work, LAPACKE_dsycon, LAPACKE_dsycon_work, LAPACKE_dsyconv, LAPACKE_dsyconv_work, LAPACKE_dsyequb, LAPACKE_dsyequb_work, LAPACKE_dsyev, LAPACKE_dsyev_work, LAPACKE_dsyevd, LAPACKE_dsyevd_work, LAPACKE_dsyevr, LAPACKE_dsyevr_work, LAPACKE_dsyevx, LAPACKE_dsyevx_work, LAPACKE_dsygst, LAPACKE_dsygst_work, LAPACKE_dsygv, LAPACKE_dsygv_work, LAPACKE_dsygvd, LAPACKE_dsygvd_work, LAPACKE_dsygvx, LAPACKE_dsygvx_work, LAPACKE_dsyrfs, LAPACKE_dsyrfs_work, LAPACKE_dsysv, LAPACKE_dsysv_rook, LAPACKE_dsysv_rook_work, LAPACKE_dsysv_work, LAPACKE_dsysvx, LAPACKE_dsysvx_work, LAPACKE_dsyswapr, LAPACKE_dsyswapr_work, LAPACKE_dsytrd, LAPACKE_dsytrd_work, LAPACKE_dsytrf, LAPACKE_dsytrf_work, LAPACKE_dsytri, LAPACKE_dsytri2, LAPACKE_dsytri2_work, LAPACKE_dsytri2x, LAPACKE_dsytri2x_work, LAPACKE_dsytri_work, LAPACKE_dsytrs, LAPACKE_dsytrs2, LAPACKE_dsytrs2_work, LAPACKE_dsytrs_work, LAPACKE_dtbcon, LAPACKE_dtbcon_work, LAPACKE_dtbrfs, LAPACKE_dtbrfs_work, LAPACKE_dtbtrs, LAPACKE_dtbtrs_work, LAPACKE_dtfsm, LAPACKE_dtfsm_work, LAPACKE_dtftri, LAPACKE_dtftri_work, LAPACKE_dtfttp, LAPACKE_dtfttp_work, LAPACKE_dtfttr, LAPACKE_dtfttr_work, LAPACKE_dtgevc, LAPACKE_dtgevc_work, LAPACKE_dtgexc, LAPACKE_dtgexc_work, LAPACKE_dtgsen, LAPACKE_dtgsen_work, LAPACKE_dtgsja, LAPACKE_dtgsja_work, LAPACKE_dtgsna, LAPACKE_dtgsna_work, LAPACKE_dtgsyl, LAPACKE_dtgsyl_work, LAPACKE_dtpcon, LAPACKE_dtpcon_work, LAPACKE_dtpmqrt, LAPACKE_dtpmqrt_work, LAPACKE_dtpqrt, LAPACKE_dtpqrt2, LAPACKE_dtpqrt2_work, LAPACKE_dtpqrt_work, LAPACKE_dtprfb, LAPACKE_dtprfb_work, LAPACKE_dtprfs, LAPACKE_dtprfs_work, LAPACKE_dtptri, LAPACKE_dtptri_work, LAPACKE_dtptrs, LAPACKE_dtptrs_work, LAPACKE_dtpttf, LAPACKE_dtpttf_work, LAPACKE_dtpttr, LAPACKE_dtpttr_work, LAPACKE_dtrcon, LAPACKE_dtrcon_work, LAPACKE_dtrevc, LAPACKE_dtrevc_work, LAPACKE_dtrexc, LAPACKE_dtrexc_work, LAPACKE_dtrrfs, LAPACKE_dtrrfs_work, LAPACKE_dtrsen, LAPACKE_dtrsen_work, LAPACKE_dtrsna, LAPACKE_dtrsna_work, LAPACKE_dtrsyl, LAPACKE_dtrsyl_work, LAPACKE_dtrtri, LAPACKE_dtrtri_work, LAPACKE_dtrtrs, LAPACKE_dtrtrs_work, LAPACKE_dtrttf, LAPACKE_dtrttf_work, LAPACKE_dtrttp, LAPACKE_dtrttp_work, LAPACKE_dtzrzf, LAPACKE_dtzrzf_work, LAPACKE_sbbcsd, LAPACKE_sbbcsd_work, 
LAPACKE_sbdsdc, LAPACKE_sbdsdc_work, LAPACKE_sbdsqr, LAPACKE_sbdsqr_work, LAPACKE_sdisna, LAPACKE_sdisna_work, LAPACKE_sgbbrd, LAPACKE_sgbbrd_work, LAPACKE_sgbcon, LAPACKE_sgbcon_work, LAPACKE_sgbequ, LAPACKE_sgbequ_work, LAPACKE_sgbequb, LAPACKE_sgbequb_work, LAPACKE_sgbrfs, LAPACKE_sgbrfs_work, LAPACKE_sgbsv, LAPACKE_sgbsv_work, LAPACKE_sgbsvx, LAPACKE_sgbsvx_work, LAPACKE_sgbtrf, LAPACKE_sgbtrf_work, LAPACKE_sgbtrs, LAPACKE_sgbtrs_work, LAPACKE_sgebak, LAPACKE_sgebak_work, LAPACKE_sgebal, LAPACKE_sgebal_work, LAPACKE_sgebrd, LAPACKE_sgebrd_work, LAPACKE_sgecon, LAPACKE_sgecon_work, LAPACKE_sgeequ, LAPACKE_sgeequ_work, LAPACKE_sgeequb, LAPACKE_sgeequb_work, LAPACKE_sgees, LAPACKE_sgees_work, LAPACKE_sgeesx, LAPACKE_sgeesx_work, LAPACKE_sgeev, LAPACKE_sgeev_work, LAPACKE_sgeevx, LAPACKE_sgeevx_work, LAPACKE_sgehrd, LAPACKE_sgehrd_work, LAPACKE_sgejsv, LAPACKE_sgejsv_work, LAPACKE_sgelq2, LAPACKE_sgelq2_work, LAPACKE_sgelqf, LAPACKE_sgelqf_work, LAPACKE_sgels, LAPACKE_sgels_work, LAPACKE_sgelsd, LAPACKE_sgelsd_work, LAPACKE_sgelss, LAPACKE_sgelss_work, LAPACKE_sgelsy, LAPACKE_sgelsy_work, LAPACKE_sgemqrt, LAPACKE_sgemqrt_work, LAPACKE_sgeqlf, LAPACKE_sgeqlf_work, LAPACKE_sgeqp3, LAPACKE_sgeqp3_work, LAPACKE_sgeqr2, LAPACKE_sgeqr2_work, LAPACKE_sgeqrf, LAPACKE_sgeqrf_work, LAPACKE_sgeqrfp, LAPACKE_sgeqrfp_work, LAPACKE_sgeqrt, LAPACKE_sgeqrt2, LAPACKE_sgeqrt2_work, LAPACKE_sgeqrt3, LAPACKE_sgeqrt3_work, LAPACKE_sgeqrt_work, LAPACKE_sgerfs, LAPACKE_sgerfs_work, LAPACKE_sgerqf, LAPACKE_sgerqf_work, LAPACKE_sgesdd, LAPACKE_sgesdd_work, LAPACKE_sgesv, LAPACKE_sgesv_work, LAPACKE_sgesvd, LAPACKE_sgesvd_work, LAPACKE_sgesvj, LAPACKE_sgesvj_work, LAPACKE_sgesvx, LAPACKE_sgesvx_work, LAPACKE_sgetf2, LAPACKE_sgetf2_work, LAPACKE_sgetrf, LAPACKE_sgetrf_work, LAPACKE_sgetri, LAPACKE_sgetri_work, LAPACKE_sgetrs, LAPACKE_sgetrs_work, LAPACKE_sggbak, LAPACKE_sggbak_work, LAPACKE_sggbal, LAPACKE_sggbal_work, LAPACKE_sgges, LAPACKE_sgges_work, LAPACKE_sggesx, LAPACKE_sggesx_work, LAPACKE_sggev, LAPACKE_sggev_work, LAPACKE_sggevx, LAPACKE_sggevx_work, LAPACKE_sggglm, LAPACKE_sggglm_work, LAPACKE_sgghrd, LAPACKE_sgghrd_work, LAPACKE_sgglse, LAPACKE_sgglse_work, LAPACKE_sggqrf, LAPACKE_sggqrf_work, LAPACKE_sggrqf, LAPACKE_sggrqf_work, LAPACKE_sgtcon, LAPACKE_sgtcon_work, LAPACKE_sgtrfs, LAPACKE_sgtrfs_work, LAPACKE_sgtsv, LAPACKE_sgtsv_work, LAPACKE_sgtsvx, LAPACKE_sgtsvx_work, LAPACKE_sgttrf, LAPACKE_sgttrf_work, LAPACKE_sgttrs, LAPACKE_sgttrs_work, LAPACKE_shgeqz, LAPACKE_shgeqz_work, LAPACKE_shsein, LAPACKE_shsein_work, LAPACKE_shseqr, LAPACKE_shseqr_work, LAPACKE_slacn2, LAPACKE_slacn2_work, LAPACKE_slacpy, LAPACKE_slacpy_work, LAPACKE_slag2d, LAPACKE_slag2d_work, LAPACKE_slamch, LAPACKE_slamch_work, LAPACKE_slange, LAPACKE_slange_work, LAPACKE_slansy, LAPACKE_slansy_work, LAPACKE_slantr, LAPACKE_slantr_work, LAPACKE_slapmr, LAPACKE_slapmr_work, LAPACKE_slapy2, LAPACKE_slapy2_work, LAPACKE_slapy3, LAPACKE_slapy3_work, LAPACKE_slarfb, LAPACKE_slarfb_work, LAPACKE_slarfg, LAPACKE_slarfg_work, LAPACKE_slarft, LAPACKE_slarft_work, LAPACKE_slarfx, LAPACKE_slarfx_work, LAPACKE_slarnv, LAPACKE_slarnv_work, LAPACKE_slartgp, LAPACKE_slartgp_work, LAPACKE_slartgs, LAPACKE_slartgs_work, LAPACKE_slaset, LAPACKE_slaset_work, LAPACKE_slasrt, LAPACKE_slasrt_work, LAPACKE_slaswp, LAPACKE_slaswp_work, LAPACKE_slauum, LAPACKE_slauum_work, LAPACKE_sopgtr, LAPACKE_sopgtr_work, LAPACKE_sopmtr, LAPACKE_sopmtr_work, LAPACKE_sorbdb, LAPACKE_sorbdb_work, LAPACKE_sorcsd, LAPACKE_sorcsd_work, LAPACKE_sorgbr, LAPACKE_sorgbr_work, 
LAPACKE_sorghr, LAPACKE_sorghr_work, LAPACKE_sorglq, LAPACKE_sorglq_work, LAPACKE_sorgql, LAPACKE_sorgql_work, LAPACKE_sorgqr, LAPACKE_sorgqr_work, LAPACKE_sorgrq, LAPACKE_sorgrq_work, LAPACKE_sorgtr, LAPACKE_sorgtr_work, LAPACKE_sormbr, LAPACKE_sormbr_work, LAPACKE_sormhr, LAPACKE_sormhr_work, LAPACKE_sormlq, LAPACKE_sormlq_work, LAPACKE_sormql, LAPACKE_sormql_work, LAPACKE_sormqr, LAPACKE_sormqr_work, LAPACKE_sormrq, LAPACKE_sormrq_work, LAPACKE_sormrz, LAPACKE_sormrz_work, LAPACKE_sormtr, LAPACKE_sormtr_work, LAPACKE_spbcon, LAPACKE_spbcon_work, LAPACKE_spbequ, LAPACKE_spbequ_work, LAPACKE_spbrfs, LAPACKE_spbrfs_work, LAPACKE_spbstf, LAPACKE_spbstf_work, LAPACKE_spbsv, LAPACKE_spbsv_work, LAPACKE_spbsvx, LAPACKE_spbsvx_work, LAPACKE_spbtrf, LAPACKE_spbtrf_work, LAPACKE_spbtrs, LAPACKE_spbtrs_work, LAPACKE_spftrf, LAPACKE_spftrf_work, LAPACKE_spftri, LAPACKE_spftri_work, LAPACKE_spftrs, LAPACKE_spftrs_work, LAPACKE_spocon, LAPACKE_spocon_work, LAPACKE_spoequ, LAPACKE_spoequ_work, LAPACKE_spoequb, LAPACKE_spoequb_work, LAPACKE_sporfs, LAPACKE_sporfs_work, LAPACKE_sposv, LAPACKE_sposv_work, LAPACKE_sposvx, LAPACKE_sposvx_work, LAPACKE_spotrf, LAPACKE_spotrf_work, LAPACKE_spotri, LAPACKE_spotri_work, LAPACKE_spotrs, LAPACKE_spotrs_work, LAPACKE_sppcon, LAPACKE_sppcon_work, LAPACKE_sppequ, LAPACKE_sppequ_work, LAPACKE_spprfs, LAPACKE_spprfs_work, LAPACKE_sppsv, LAPACKE_sppsv_work, LAPACKE_sppsvx, LAPACKE_sppsvx_work, LAPACKE_spptrf, LAPACKE_spptrf_work, LAPACKE_spptri, LAPACKE_spptri_work, LAPACKE_spptrs, LAPACKE_spptrs_work, LAPACKE_spstrf, LAPACKE_spstrf_work, LAPACKE_sptcon, LAPACKE_sptcon_work, LAPACKE_spteqr, LAPACKE_spteqr_work, LAPACKE_sptrfs, LAPACKE_sptrfs_work, LAPACKE_sptsv, LAPACKE_sptsv_work, LAPACKE_sptsvx, LAPACKE_sptsvx_work, LAPACKE_spttrf, LAPACKE_spttrf_work, LAPACKE_spttrs, LAPACKE_spttrs_work, LAPACKE_ssbev, LAPACKE_ssbev_work, LAPACKE_ssbevd, LAPACKE_ssbevd_work, LAPACKE_ssbevx, LAPACKE_ssbevx_work, LAPACKE_ssbgst, LAPACKE_ssbgst_work, LAPACKE_ssbgv, LAPACKE_ssbgv_work, LAPACKE_ssbgvd, LAPACKE_ssbgvd_work, LAPACKE_ssbgvx, LAPACKE_ssbgvx_work, LAPACKE_ssbtrd, LAPACKE_ssbtrd_work, LAPACKE_ssfrk, LAPACKE_ssfrk_work, LAPACKE_sspcon, LAPACKE_sspcon_work, LAPACKE_sspev, LAPACKE_sspev_work, LAPACKE_sspevd, LAPACKE_sspevd_work, LAPACKE_sspevx, LAPACKE_sspevx_work, LAPACKE_sspgst, LAPACKE_sspgst_work, LAPACKE_sspgv, LAPACKE_sspgv_work, LAPACKE_sspgvd, LAPACKE_sspgvd_work, LAPACKE_sspgvx, LAPACKE_sspgvx_work, LAPACKE_ssprfs, LAPACKE_ssprfs_work, LAPACKE_sspsv, LAPACKE_sspsv_work, LAPACKE_sspsvx, LAPACKE_sspsvx_work, LAPACKE_ssptrd, LAPACKE_ssptrd_work, LAPACKE_ssptrf, LAPACKE_ssptrf_work, LAPACKE_ssptri, LAPACKE_ssptri_work, LAPACKE_ssptrs, LAPACKE_ssptrs_work, LAPACKE_sstebz, LAPACKE_sstebz_work, LAPACKE_sstedc, LAPACKE_sstedc_work, LAPACKE_sstegr, LAPACKE_sstegr_work, LAPACKE_sstein, LAPACKE_sstein_work, LAPACKE_sstemr, LAPACKE_sstemr_work, LAPACKE_ssteqr, LAPACKE_ssteqr_work, LAPACKE_ssterf, LAPACKE_ssterf_work, LAPACKE_sstev, LAPACKE_sstev_work, LAPACKE_sstevd, LAPACKE_sstevd_work, LAPACKE_sstevr, LAPACKE_sstevr_work, LAPACKE_sstevx, LAPACKE_sstevx_work, LAPACKE_ssycon, LAPACKE_ssycon_work, LAPACKE_ssyconv, LAPACKE_ssyconv_work, LAPACKE_ssyequb, LAPACKE_ssyequb_work, LAPACKE_ssyev, LAPACKE_ssyev_work, LAPACKE_ssyevd, LAPACKE_ssyevd_work, LAPACKE_ssyevr, LAPACKE_ssyevr_work, LAPACKE_ssyevx, LAPACKE_ssyevx_work, LAPACKE_ssygst, LAPACKE_ssygst_work, LAPACKE_ssygv, LAPACKE_ssygv_work, LAPACKE_ssygvd, LAPACKE_ssygvd_work, LAPACKE_ssygvx, LAPACKE_ssygvx_work, LAPACKE_ssyrfs, 
LAPACKE_ssyrfs_work, LAPACKE_ssysv, LAPACKE_ssysv_rook, LAPACKE_ssysv_rook_work, LAPACKE_ssysv_work, LAPACKE_ssysvx, LAPACKE_ssysvx_work, LAPACKE_ssyswapr, LAPACKE_ssyswapr_work, LAPACKE_ssytrd, LAPACKE_ssytrd_work, LAPACKE_ssytrf, LAPACKE_ssytrf_work, LAPACKE_ssytri, LAPACKE_ssytri2, LAPACKE_ssytri2_work, LAPACKE_ssytri2x, LAPACKE_ssytri2x_work, LAPACKE_ssytri_work, LAPACKE_ssytrs, LAPACKE_ssytrs2, LAPACKE_ssytrs2_work, LAPACKE_ssytrs_work, LAPACKE_stbcon, LAPACKE_stbcon_work, LAPACKE_stbrfs, LAPACKE_stbrfs_work, LAPACKE_stbtrs, LAPACKE_stbtrs_work, LAPACKE_stfsm, LAPACKE_stfsm_work, LAPACKE_stftri, LAPACKE_stftri_work, LAPACKE_stfttp, LAPACKE_stfttp_work, LAPACKE_stfttr, LAPACKE_stfttr_work, LAPACKE_stgevc, LAPACKE_stgevc_work, LAPACKE_stgexc, LAPACKE_stgexc_work, LAPACKE_stgsen, LAPACKE_stgsen_work, LAPACKE_stgsja, LAPACKE_stgsja_work, LAPACKE_stgsna, LAPACKE_stgsna_work, LAPACKE_stgsyl, LAPACKE_stgsyl_work, LAPACKE_stpcon, LAPACKE_stpcon_work, LAPACKE_stpmqrt, LAPACKE_stpmqrt_work, LAPACKE_stpqrt2, LAPACKE_stpqrt2_work, LAPACKE_stprfb, LAPACKE_stprfb_work, LAPACKE_stprfs, LAPACKE_stprfs_work, LAPACKE_stptri, LAPACKE_stptri_work, LAPACKE_stptrs, LAPACKE_stptrs_work, LAPACKE_stpttf, LAPACKE_stpttf_work, LAPACKE_stpttr, LAPACKE_stpttr_work, LAPACKE_strcon, LAPACKE_strcon_work, LAPACKE_strevc, LAPACKE_strevc_work, LAPACKE_strexc, LAPACKE_strexc_work, LAPACKE_strrfs, LAPACKE_strrfs_work, LAPACKE_strsen, LAPACKE_strsen_work, LAPACKE_strsna, LAPACKE_strsna_work, LAPACKE_strsyl, LAPACKE_strsyl_work, LAPACKE_strtri, LAPACKE_strtri_work, LAPACKE_strtrs, LAPACKE_strtrs_work, LAPACKE_strttf, LAPACKE_strttf_work, LAPACKE_strttp, LAPACKE_strttp_work, LAPACKE_stzrzf, LAPACKE_stzrzf_work, LAPACKE_zbbcsd, LAPACKE_zbbcsd_work, LAPACKE_zbdsqr, LAPACKE_zbdsqr_work, LAPACKE_zcgesv, LAPACKE_zcgesv_work, LAPACKE_zcposv, LAPACKE_zcposv_work, LAPACKE_zgbbrd, LAPACKE_zgbbrd_work, LAPACKE_zgbcon, LAPACKE_zgbcon_work, LAPACKE_zgbequ, LAPACKE_zgbequ_work, LAPACKE_zgbequb, LAPACKE_zgbequb_work, LAPACKE_zgbrfs, LAPACKE_zgbrfs_work, LAPACKE_zgbsv, LAPACKE_zgbsv_work, LAPACKE_zgbsvx, LAPACKE_zgbsvx_work, LAPACKE_zgbtrf, LAPACKE_zgbtrf_work, LAPACKE_zgbtrs, LAPACKE_zgbtrs_work, LAPACKE_zgebak, LAPACKE_zgebak_work, LAPACKE_zgebal, LAPACKE_zgebal_work, LAPACKE_zgebrd, LAPACKE_zgebrd_work, LAPACKE_zgecon, LAPACKE_zgecon_work, LAPACKE_zgeequ, LAPACKE_zgeequ_work, LAPACKE_zgeequb, LAPACKE_zgeequb_work, LAPACKE_zgees, LAPACKE_zgees_work, LAPACKE_zgeesx, LAPACKE_zgeesx_work, LAPACKE_zgeev, LAPACKE_zgeev_work, LAPACKE_zgeevx, LAPACKE_zgeevx_work, LAPACKE_zgehrd, LAPACKE_zgehrd_work, LAPACKE_zgelq2, LAPACKE_zgelq2_work, LAPACKE_zgelqf, LAPACKE_zgelqf_work, LAPACKE_zgels, LAPACKE_zgels_work, LAPACKE_zgelsd, LAPACKE_zgelsd_work, LAPACKE_zgelss, LAPACKE_zgelss_work, LAPACKE_zgelsy, LAPACKE_zgelsy_work, LAPACKE_zgemqrt, LAPACKE_zgemqrt_work, LAPACKE_zgeqlf, LAPACKE_zgeqlf_work, LAPACKE_zgeqp3, LAPACKE_zgeqp3_work, LAPACKE_zgeqr2, LAPACKE_zgeqr2_work, LAPACKE_zgeqrf, LAPACKE_zgeqrf_work, LAPACKE_zgeqrfp, LAPACKE_zgeqrfp_work, LAPACKE_zgeqrt, LAPACKE_zgeqrt2, LAPACKE_zgeqrt2_work, LAPACKE_zgeqrt3, LAPACKE_zgeqrt3_work, LAPACKE_zgeqrt_work, LAPACKE_zgerfs, LAPACKE_zgerfs_work, LAPACKE_zgerqf, LAPACKE_zgerqf_work, LAPACKE_zgesdd, LAPACKE_zgesdd_work, LAPACKE_zgesv, LAPACKE_zgesv_work, LAPACKE_zgesvd, LAPACKE_zgesvd_work, LAPACKE_zgesvx, LAPACKE_zgesvx_work, LAPACKE_zgetf2, LAPACKE_zgetf2_work, LAPACKE_zgetrf, LAPACKE_zgetrf_work, LAPACKE_zgetri, LAPACKE_zgetri_work, LAPACKE_zgetrs, LAPACKE_zgetrs_work, LAPACKE_zggbak, 
LAPACKE_zggbak_work, LAPACKE_zggbal, LAPACKE_zggbal_work, LAPACKE_zgges, LAPACKE_zgges_work, LAPACKE_zggesx, LAPACKE_zggesx_work, LAPACKE_zggev, LAPACKE_zggev_work, LAPACKE_zggevx, LAPACKE_zggevx_work, LAPACKE_zggglm, LAPACKE_zggglm_work, LAPACKE_zgghrd, LAPACKE_zgghrd_work, LAPACKE_zgglse, LAPACKE_zgglse_work, LAPACKE_zggqrf, LAPACKE_zggqrf_work, LAPACKE_zggrqf, LAPACKE_zggrqf_work, LAPACKE_zgtcon, LAPACKE_zgtcon_work, LAPACKE_zgtrfs, LAPACKE_zgtrfs_work, LAPACKE_zgtsv, LAPACKE_zgtsv_work, LAPACKE_zgtsvx, LAPACKE_zgtsvx_work, LAPACKE_zgttrf, LAPACKE_zgttrf_work, LAPACKE_zgttrs, LAPACKE_zgttrs_work, LAPACKE_zhbev, LAPACKE_zhbev_work, LAPACKE_zhbevd, LAPACKE_zhbevd_work, LAPACKE_zhbevx, LAPACKE_zhbevx_work, LAPACKE_zhbgst, LAPACKE_zhbgst_work, LAPACKE_zhbgv, LAPACKE_zhbgv_work, LAPACKE_zhbgvd, LAPACKE_zhbgvd_work, LAPACKE_zhbgvx, LAPACKE_zhbgvx_work, LAPACKE_zhbtrd, LAPACKE_zhbtrd_work, LAPACKE_zhecon, LAPACKE_zhecon_work, LAPACKE_zheequb, LAPACKE_zheequb_work, LAPACKE_zheev, LAPACKE_zheev_work, LAPACKE_zheevd, LAPACKE_zheevd_work, LAPACKE_zheevr, LAPACKE_zheevr_work, LAPACKE_zheevx, LAPACKE_zheevx_work, LAPACKE_zhegst, LAPACKE_zhegst_work, LAPACKE_zhegv, LAPACKE_zhegv_work, LAPACKE_zhegvd, LAPACKE_zhegvd_work, LAPACKE_zhegvx, LAPACKE_zhegvx_work, LAPACKE_zherfs, LAPACKE_zherfs_work, LAPACKE_zhesv, LAPACKE_zhesv_work, LAPACKE_zhesvx, LAPACKE_zhesvx_work, LAPACKE_zheswapr, LAPACKE_zheswapr_work, LAPACKE_zhetrd, LAPACKE_zhetrd_work, LAPACKE_zhetrf, LAPACKE_zhetrf_work, LAPACKE_zhetri, LAPACKE_zhetri2, LAPACKE_zhetri2_work, LAPACKE_zhetri2x, LAPACKE_zhetri2x_work, LAPACKE_zhetri_work, LAPACKE_zhetrs, LAPACKE_zhetrs2, LAPACKE_zhetrs2_work, LAPACKE_zhetrs_work, LAPACKE_zhfrk, LAPACKE_zhfrk_work, LAPACKE_zhgeqz, LAPACKE_zhgeqz_work, LAPACKE_zhpcon, LAPACKE_zhpcon_work, LAPACKE_zhpev, LAPACKE_zhpev_work, LAPACKE_zhpevd, LAPACKE_zhpevd_work, LAPACKE_zhpevx, LAPACKE_zhpevx_work, LAPACKE_zhpgst, LAPACKE_zhpgst_work, LAPACKE_zhpgv, LAPACKE_zhpgv_work, LAPACKE_zhpgvd, LAPACKE_zhpgvd_work, LAPACKE_zhpgvx, LAPACKE_zhpgvx_work, LAPACKE_zhprfs, LAPACKE_zhprfs_work, LAPACKE_zhpsv, LAPACKE_zhpsv_work, LAPACKE_zhpsvx, LAPACKE_zhpsvx_work, LAPACKE_zhptrd, LAPACKE_zhptrd_work, LAPACKE_zhptrf, LAPACKE_zhptrf_work, LAPACKE_zhptri, LAPACKE_zhptri_work, LAPACKE_zhptrs, LAPACKE_zhptrs_work, LAPACKE_zhsein, LAPACKE_zhsein_work, LAPACKE_zhseqr, LAPACKE_zhseqr_work, LAPACKE_zlacgv, LAPACKE_zlacgv_work, LAPACKE_zlacn2, LAPACKE_zlacn2_work, LAPACKE_zlacp2, LAPACKE_zlacp2_work, LAPACKE_zlacpy, LAPACKE_zlacpy_work, LAPACKE_zlag2c, LAPACKE_zlag2c_work, LAPACKE_zlange, LAPACKE_zlange_work, LAPACKE_zlanhe, LAPACKE_zlanhe_work, LAPACKE_zlansy, LAPACKE_zlansy_work, LAPACKE_zlantr, LAPACKE_zlantr_work, LAPACKE_zlapmr, LAPACKE_zlapmr_work, LAPACKE_zlarfb, LAPACKE_zlarfb_work, LAPACKE_zlarfg, LAPACKE_zlarfg_work, LAPACKE_zlarft, LAPACKE_zlarft_work, LAPACKE_zlarfx, LAPACKE_zlarfx_work, LAPACKE_zlarnv, LAPACKE_zlarnv_work, LAPACKE_zlaset, LAPACKE_zlaset_work, LAPACKE_zlaswp, LAPACKE_zlaswp_work, LAPACKE_zlauum, LAPACKE_zlauum_work, LAPACKE_zpbcon, LAPACKE_zpbcon_work, LAPACKE_zpbequ, LAPACKE_zpbequ_work, LAPACKE_zpbrfs, LAPACKE_zpbrfs_work, LAPACKE_zpbstf, LAPACKE_zpbstf_work, LAPACKE_zpbsv, LAPACKE_zpbsv_work, LAPACKE_zpbsvx, LAPACKE_zpbsvx_work, LAPACKE_zpbtrf, LAPACKE_zpbtrf_work, LAPACKE_zpbtrs, LAPACKE_zpbtrs_work, LAPACKE_zpftrf, LAPACKE_zpftrf_work, LAPACKE_zpftri, LAPACKE_zpftri_work, LAPACKE_zpftrs, LAPACKE_zpftrs_work, LAPACKE_zpocon, LAPACKE_zpocon_work, LAPACKE_zpoequ, LAPACKE_zpoequ_work, LAPACKE_zpoequb, 
LAPACKE_zpoequb_work, LAPACKE_zporfs, LAPACKE_zporfs_work, LAPACKE_zposv, LAPACKE_zposv_work, LAPACKE_zposvx, LAPACKE_zposvx_work, LAPACKE_zpotrf, LAPACKE_zpotrf_work, LAPACKE_zpotri, LAPACKE_zpotri_work, LAPACKE_zpotrs, LAPACKE_zpotrs_work, LAPACKE_zppcon, LAPACKE_zppcon_work, LAPACKE_zppequ, LAPACKE_zppequ_work, LAPACKE_zpprfs, LAPACKE_zpprfs_work, LAPACKE_zppsv, LAPACKE_zppsv_work, LAPACKE_zppsvx, LAPACKE_zppsvx_work, LAPACKE_zpptrf, LAPACKE_zpptrf_work, LAPACKE_zpptri, LAPACKE_zpptri_work, LAPACKE_zpptrs, LAPACKE_zpptrs_work, LAPACKE_zpstrf, LAPACKE_zpstrf_work, LAPACKE_zptcon, LAPACKE_zptcon_work, LAPACKE_zpteqr, LAPACKE_zpteqr_work, LAPACKE_zptrfs, LAPACKE_zptrfs_work, LAPACKE_zptsv, LAPACKE_zptsv_work, LAPACKE_zptsvx, LAPACKE_zptsvx_work, LAPACKE_zpttrf, LAPACKE_zpttrf_work, LAPACKE_zpttrs, LAPACKE_zpttrs_work, LAPACKE_zspcon, LAPACKE_zspcon_work, LAPACKE_zsprfs, LAPACKE_zsprfs_work, LAPACKE_zspsv, LAPACKE_zspsv_work, LAPACKE_zspsvx, LAPACKE_zspsvx_work, LAPACKE_zsptrf, LAPACKE_zsptrf_work, LAPACKE_zsptri, LAPACKE_zsptri_work, LAPACKE_zsptrs, LAPACKE_zsptrs_work, LAPACKE_zstedc, LAPACKE_zstedc_work, LAPACKE_zstegr, LAPACKE_zstegr_work, LAPACKE_zstein, LAPACKE_zstein_work, LAPACKE_zstemr, LAPACKE_zstemr_work, LAPACKE_zsteqr, LAPACKE_zsteqr_work, LAPACKE_zsycon, LAPACKE_zsycon_work, LAPACKE_zsyconv, LAPACKE_zsyconv_work, LAPACKE_zsyequb, LAPACKE_zsyequb_work, LAPACKE_zsyrfs, LAPACKE_zsyrfs_work, LAPACKE_zsysv, LAPACKE_zsysv_rook, LAPACKE_zsysv_rook_work, LAPACKE_zsysv_work, LAPACKE_zsysvx, LAPACKE_zsysvx_work, LAPACKE_zsyswapr, LAPACKE_zsyswapr_work, LAPACKE_zsytrf, LAPACKE_zsytrf_work, LAPACKE_zsytri, LAPACKE_zsytri2, LAPACKE_zsytri2_work, LAPACKE_zsytri2x, LAPACKE_zsytri2x_work, LAPACKE_zsytri_work, LAPACKE_zsytrs, LAPACKE_zsytrs2, LAPACKE_zsytrs2_work, LAPACKE_zsytrs_work, LAPACKE_ztbcon, LAPACKE_ztbcon_work, LAPACKE_ztbrfs, LAPACKE_ztbrfs_work, LAPACKE_ztbtrs, LAPACKE_ztbtrs_work, LAPACKE_ztfsm, LAPACKE_ztfsm_work, LAPACKE_ztftri, LAPACKE_ztftri_work, LAPACKE_ztfttp, LAPACKE_ztfttp_work, LAPACKE_ztfttr, LAPACKE_ztfttr_work, LAPACKE_ztgevc, LAPACKE_ztgevc_work, LAPACKE_ztgexc, LAPACKE_ztgexc_work, LAPACKE_ztgsen, LAPACKE_ztgsen_work, LAPACKE_ztgsja, LAPACKE_ztgsja_work, LAPACKE_ztgsna, LAPACKE_ztgsna_work, LAPACKE_ztgsyl, LAPACKE_ztgsyl_work, LAPACKE_ztpcon, LAPACKE_ztpcon_work, LAPACKE_ztpmqrt, LAPACKE_ztpmqrt_work, LAPACKE_ztpqrt, LAPACKE_ztpqrt2, LAPACKE_ztpqrt2_work, LAPACKE_ztpqrt_work, LAPACKE_ztprfb, LAPACKE_ztprfb_work, LAPACKE_ztprfs, LAPACKE_ztprfs_work, LAPACKE_ztptri, LAPACKE_ztptri_work, LAPACKE_ztptrs, LAPACKE_ztptrs_work, LAPACKE_ztpttf, LAPACKE_ztpttf_work, LAPACKE_ztpttr, LAPACKE_ztpttr_work, LAPACKE_ztrcon, LAPACKE_ztrcon_work, LAPACKE_ztrevc, LAPACKE_ztrevc_work, LAPACKE_ztrexc, LAPACKE_ztrexc_work, LAPACKE_ztrrfs, LAPACKE_ztrrfs_work, LAPACKE_ztrsen, LAPACKE_ztrsen_work, LAPACKE_ztrsna, LAPACKE_ztrsna_work, LAPACKE_ztrsyl, LAPACKE_ztrsyl_work, LAPACKE_ztrtri, LAPACKE_ztrtri_work, LAPACKE_ztrtrs, LAPACKE_ztrtrs_work, LAPACKE_ztrttf, LAPACKE_ztrttf_work, LAPACKE_ztrttp, LAPACKE_ztrttp_work, LAPACKE_ztzrzf, LAPACKE_ztzrzf_work, LAPACKE_zunbdb, LAPACKE_zunbdb_work, LAPACKE_zuncsd, LAPACKE_zuncsd_work, LAPACKE_zungbr, LAPACKE_zungbr_work, LAPACKE_zunghr, LAPACKE_zunghr_work, LAPACKE_zunglq, LAPACKE_zunglq_work, LAPACKE_zungql, LAPACKE_zungql_work, LAPACKE_zungqr, LAPACKE_zungqr_work, LAPACKE_zungrq, LAPACKE_zungrq_work, LAPACKE_zungtr, LAPACKE_zungtr_work, LAPACKE_zunmbr, LAPACKE_zunmbr_work, LAPACKE_zunmhr, LAPACKE_zunmhr_work, LAPACKE_zunmlq, LAPACKE_zunmlq_work, 
LAPACKE_zunmql, LAPACKE_zunmql_work, LAPACKE_zunmqr, LAPACKE_zunmqr_work, LAPACKE_zunmrq, LAPACKE_zunmrq_work, LAPACKE_zunmrz, LAPACKE_zunmrz_work, LAPACKE_zunmtr, LAPACKE_zunmtr_work, LAPACKE_zupgtr, LAPACKE_zupgtr_work, LAPACKE_zupmtr, LAPACKE_zupmtr_work, LAPACKE_zsyr, LAPACKE_csyr, LAPACKE_zsyr_work, LAPACKE_csyr_work, LAPACKE_ilaver, ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the ## corresponding LAPACK extended precision routines. #LAPACKE_cgbrfsx, #LAPACKE_cporfsx, #LAPACKE_dgerfsx, #LAPACKE_sgbrfsx, #LAPACKE_ssyrfsx, #LAPACKE_zherfsx, #LAPACKE_cgbrfsx_work, #LAPACKE_cporfsx_work, #LAPACKE_dgerfsx_work, #LAPACKE_sgbrfsx_work, #LAPACKE_ssyrfsx_work, #LAPACKE_zherfsx_work, #LAPACKE_cgerfsx, #LAPACKE_csyrfsx, #LAPACKE_dporfsx, #LAPACKE_sgerfsx, #LAPACKE_zgbrfsx, #LAPACKE_zporfsx, #LAPACKE_cgerfsx_work, #LAPACKE_csyrfsx_work, #LAPACKE_dporfsx_work, #LAPACKE_sgerfsx_work, #LAPACKE_zgbrfsx_work, #LAPACKE_zporfsx_work, #LAPACKE_cherfsx, #LAPACKE_dgbrfsx, #LAPACKE_dsyrfsx, #LAPACKE_sporfsx, #LAPACKE_zgerfsx, #LAPACKE_zsyrfsx, #LAPACKE_cherfsx_work, #LAPACKE_dgbrfsx_work, #LAPACKE_dsyrfsx_work, #LAPACKE_sporfsx_work, #LAPACKE_zgerfsx_work, #LAPACKE_zsyrfsx_work, #LAPACKE_cgbsvxx, #LAPACKE_cposvxx, #LAPACKE_dgesvxx, #LAPACKE_sgbsvxx, #LAPACKE_ssysvxx, #LAPACKE_zhesvxx, #LAPACKE_cgbsvxx_work, #LAPACKE_cposvxx_work, #LAPACKE_dgesvxx_work, #LAPACKE_sgbsvxx_work, #LAPACKE_ssysvxx_work, #LAPACKE_zhesvxx_work, #LAPACKE_cgesvxx, #LAPACKE_csysvxx, #LAPACKE_dposvxx, #LAPACKE_sgesvxx, #LAPACKE_zgbsvxx, #LAPACKE_zposvxx, #LAPACKE_cgesvxx_work, #LAPACKE_csysvxx_work, #LAPACKE_dposvxx_work, #LAPACKE_sgesvxx_work, #LAPACKE_zgbsvxx_work, #LAPACKE_zposvxx_work, #LAPACKE_chesvxx, #LAPACKE_dgbsvxx, #LAPACKE_dsysvxx, #LAPACKE_sposvxx, #LAPACKE_zgesvxx, #LAPACKE_zsysvxx, #LAPACKE_chesvxx_work, #LAPACKE_dgbsvxx_work, #LAPACKE_dsysvxx_work, #LAPACKE_sposvxx_work, #LAPACKE_zgesvxx_work, #LAPACKE_zsysvxx_work, ## @(MATGEN_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` ## Not exported: requires LAPACKE_TESTING to be set and depends on libtmg ## (see `lapack-3.4.1/TESTING/MATGEN`). 
LAPACKE_clatms, LAPACKE_clatms_work, LAPACKE_dlatms, LAPACKE_dlatms_work, LAPACKE_slatms, LAPACKE_slatms_work, LAPACKE_zlatms, LAPACKE_zlatms_work, LAPACKE_clagge, LAPACKE_clagge_work, LAPACKE_dlagge, LAPACKE_dlagge_work, LAPACKE_slagge, LAPACKE_slagge_work, LAPACKE_zlagge, LAPACKE_zlagge_work, LAPACKE_claghe, LAPACKE_claghe_work, LAPACKE_zlaghe, LAPACKE_zlaghe_work, LAPACKE_clagsy, LAPACKE_clagsy_work, LAPACKE_dlagsy, LAPACKE_dlagsy_work, LAPACKE_slagsy, LAPACKE_slagsy_work, LAPACKE_zlagsy, LAPACKE_zlagsy_work, ## new function from lapack-3.6.0 LAPACKE_cgejsv, LAPACKE_cgejsv_work, LAPACKE_cgesvdx, LAPACKE_cgesvdx_work, LAPACKE_cgesvj, LAPACKE_cgesvj_work, LAPACKE_cgetrf2, LAPACKE_cgetrf2_work, LAPACKE_cgges3, LAPACKE_cgges3_work, LAPACKE_cggev3, LAPACKE_cggev3_work, LAPACKE_cgghd3, LAPACKE_cgghd3_work, LAPACKE_cggsvd3, LAPACKE_cggsvd3_work, LAPACKE_cggsvp3, LAPACKE_cggsvp3_work, LAPACKE_chetrf_rook, LAPACKE_chetrf_rook_work, LAPACKE_chetrs_rook, LAPACKE_chetrs_rook_work, LAPACKE_clapmt, LAPACKE_clapmt_work, LAPACKE_clascl, LAPACKE_clascl_work, LAPACKE_cpotrf2, LAPACKE_cpotrf2_work, LAPACKE_csytrf_rook, LAPACKE_csytrf_rook_work, LAPACKE_csytrs_rook, LAPACKE_csytrs_rook_work, LAPACKE_cuncsd2by1, LAPACKE_cuncsd2by1_work, LAPACKE_dbdsvdx, LAPACKE_dbdsvdx_work, LAPACKE_dgesvdx, LAPACKE_dgesvdx_work, LAPACKE_dgetrf2, LAPACKE_dgetrf2_work, LAPACKE_dgges3, LAPACKE_dgges3_work, LAPACKE_dggev3, LAPACKE_dggev3_work, LAPACKE_dgghd3, LAPACKE_dgghd3_work, LAPACKE_dggsvd3, LAPACKE_dggsvd3_work, LAPACKE_dggsvp3, LAPACKE_dggsvp3_work, LAPACKE_dlapmt, LAPACKE_dlapmt_work, LAPACKE_dlascl, LAPACKE_dlascl_work, LAPACKE_dorcsd2by1, LAPACKE_dorcsd2by1_work, LAPACKE_dpotrf2, LAPACKE_dpotrf2_work, LAPACKE_dsytrf_rook, LAPACKE_dsytrf_rook_work, LAPACKE_dsytrs_rook, LAPACKE_dsytrs_rook_work, LAPACKE_sbdsvdx, LAPACKE_sbdsvdx_work, LAPACKE_sgesvdx, LAPACKE_sgesvdx_work, LAPACKE_sgetrf2, LAPACKE_sgetrf2_work, LAPACKE_sgges3, LAPACKE_sgges3_work, LAPACKE_sggev3, LAPACKE_sggev3_work, LAPACKE_sgghd3, LAPACKE_sgghd3_work, LAPACKE_sggsvd3, LAPACKE_sggsvd3_work, LAPACKE_sggsvp3, LAPACKE_sggsvp3_work, LAPACKE_slapmt, LAPACKE_slapmt_work, LAPACKE_slascl, LAPACKE_slascl_work, LAPACKE_sorcsd2by1, LAPACKE_sorcsd2by1_work, LAPACKE_spotrf2, LAPACKE_spotrf2_work, LAPACKE_ssytrf_rook, LAPACKE_ssytrf_rook_work, LAPACKE_ssytrs_rook, LAPACKE_ssytrs_rook_work, LAPACKE_stpqrt, LAPACKE_stpqrt_work, LAPACKE_zgejsv, LAPACKE_zgejsv_work, LAPACKE_zgesvdx, LAPACKE_zgesvdx_work, LAPACKE_zgesvj, LAPACKE_zgesvj_work, LAPACKE_zgetrf2, LAPACKE_zgetrf2_work, LAPACKE_zgges3, LAPACKE_zgges3_work, LAPACKE_zggev3, LAPACKE_zggev3_work, LAPACKE_zgghd3, LAPACKE_zgghd3_work, LAPACKE_zggsvd3, LAPACKE_zggsvd3_work, LAPACKE_zggsvp3, LAPACKE_zggsvp3_work, LAPACKE_zhetrf_rook, LAPACKE_zhetrf_rook_work, LAPACKE_zhetrs_rook, LAPACKE_zhetrs_rook_work, LAPACKE_zlapmt, LAPACKE_zlapmt_work, LAPACKE_zlascl, LAPACKE_zlascl_work, LAPACKE_zpotrf2, LAPACKE_zpotrf2_work, LAPACKE_zsytrf_rook, LAPACKE_zsytrf_rook_work, LAPACKE_zsytrs_rook, LAPACKE_zsytrs_rook_work, LAPACKE_zuncsd2by1, LAPACKE_zuncsd2by1_work, ## new function from lapack-3.7.0 LAPACKE_cgemqr, LAPACKE_cgemqr_work, LAPACKE_cgetsls, LAPACKE_cgetsls_work, LAPACKE_chbev_2stage, LAPACKE_chbev_2stage_work, LAPACKE_chbevd_2stage, LAPACKE_chbevd_2stage_work, LAPACKE_chbevx_2stage, LAPACKE_chbevx_2stage_work, LAPACKE_checon_3, LAPACKE_checon_3_work, LAPACKE_cheev_2stage, LAPACKE_cheev_2stage_work, LAPACKE_cheevd_2stage, LAPACKE_cheevd_2stage_work, LAPACKE_cheevr_2stage, LAPACKE_cheevr_2stage_work, LAPACKE_cheevx_2stage, 
LAPACKE_cheevx_2stage_work, LAPACKE_chegv_2stage, LAPACKE_chegv_2stage_work, LAPACKE_chesv_aa, LAPACKE_chesv_aa_work, LAPACKE_chesv_rk, LAPACKE_chesv_rk_work, LAPACKE_chetrf_aa, LAPACKE_chetrf_aa_work, LAPACKE_chetrf_rk, LAPACKE_chetrf_rk_work, LAPACKE_chetri_3, LAPACKE_chetri_3_work, LAPACKE_chetrs_aa, LAPACKE_chetrs_aa_work, LAPACKE_chetrs_3, LAPACKE_chetrs_3_work, LAPACKE_csycon_3, LAPACKE_csycon_3_work, LAPACKE_csysv_aa, LAPACKE_csysv_aa_work, LAPACKE_csysv_rk, LAPACKE_csysv_rk_work, LAPACKE_csytrf_aa, LAPACKE_csytrf_aa_work, LAPACKE_csytrf_rk, LAPACKE_csytrf_rk_work, LAPACKE_csytri_3, LAPACKE_csytri_3_work, LAPACKE_csytrs_aa, LAPACKE_csytrs_aa_work, LAPACKE_csytrs_3, LAPACKE_csytrs_3_work, LAPACKE_dgemqr, LAPACKE_dgemqr_work, LAPACKE_dgetsls, LAPACKE_dgetsls_work, LAPACKE_dsbev_2stage, LAPACKE_dsbev_2stage_work, LAPACKE_dsbevd_2stage, LAPACKE_dsbevd_2stage_work, LAPACKE_dsbevx_2stage, LAPACKE_dsbevx_2stage_work, LAPACKE_dsycon_3, LAPACKE_dsycon_3_work, LAPACKE_dsyev_2stage, LAPACKE_dsyev_2stage_work, LAPACKE_dsyevd_2stage, LAPACKE_dsyevd_2stage_work, LAPACKE_dsyevr_2stage, LAPACKE_dsyevr_2stage_work, LAPACKE_dsyevx_2stage, LAPACKE_dsyevx_2stage_work, LAPACKE_dsygv_2stage, LAPACKE_dsygv_2stage_work, LAPACKE_dsysv_aa, LAPACKE_dsysv_aa_work, LAPACKE_dsysv_rk, LAPACKE_dsysv_rk_work, LAPACKE_dsytrf_aa, LAPACKE_dsytrf_aa_work, LAPACKE_dsytrf_rk, LAPACKE_dsytrf_rk_work, LAPACKE_dsytri_3, LAPACKE_dsytri_3_work, LAPACKE_dsytrs_aa, LAPACKE_dsytrs_aa_work, LAPACKE_dsytrs_3, LAPACKE_dsytrs_3_work, LAPACKE_sgemqr, LAPACKE_sgemqr_work, LAPACKE_sgetsls, LAPACKE_sgetsls_work, LAPACKE_ssbev_2stage, LAPACKE_ssbev_2stage_work, LAPACKE_ssbevd_2stage, LAPACKE_ssbevd_2stage_work, LAPACKE_ssbevx_2stage, LAPACKE_ssbevx_2stage_work, LAPACKE_ssycon_3, LAPACKE_ssycon_3_work, LAPACKE_ssyev_2stage, LAPACKE_ssyev_2stage_work, LAPACKE_ssyevd_2stage, LAPACKE_ssyevd_2stage_work, LAPACKE_ssyevr_2stage, LAPACKE_ssyevr_2stage_work, LAPACKE_ssyevx_2stage, LAPACKE_ssyevx_2stage_work, LAPACKE_ssygv_2stage, LAPACKE_ssygv_2stage_work, LAPACKE_ssysv_aa, LAPACKE_ssysv_aa_work, LAPACKE_ssysv_rk, LAPACKE_ssysv_rk_work, LAPACKE_ssytrf_aa, LAPACKE_ssytrf_aa_work, LAPACKE_ssytrf_rk, LAPACKE_ssytrf_rk_work, LAPACKE_ssytri_3, LAPACKE_ssytri_3_work, LAPACKE_ssytrs_aa, LAPACKE_ssytrs_aa_work, LAPACKE_ssytrs_3, LAPACKE_ssytrs_3_work, LAPACKE_zgemqr, LAPACKE_zgemqr_work, LAPACKE_zgetsls, LAPACKE_zgetsls_work, LAPACKE_zhbev_2stage, LAPACKE_zhbev_2stage_work, LAPACKE_zhbevd_2stage, LAPACKE_zhbevd_2stage_work, LAPACKE_zhbevx_2stage, LAPACKE_zhbevx_2stage_work, LAPACKE_zhecon_3, LAPACKE_zhecon_3_work, LAPACKE_zheev_2stage, LAPACKE_zheev_2stage_work, LAPACKE_zheevd_2stage, LAPACKE_zheevd_2stage_work, LAPACKE_zheevr_2stage, LAPACKE_zheevr_2stage_work, LAPACKE_zheevx_2stage, LAPACKE_zheevx_2stage_work, LAPACKE_zhegv_2stage, LAPACKE_zhegv_2stage_work, LAPACKE_zhesv_aa, LAPACKE_zhesv_aa_work, LAPACKE_zhesv_rk, LAPACKE_zhesv_rk_work, LAPACKE_zhetrf_aa, LAPACKE_zhetrf_aa_work, LAPACKE_zhetrf_rk, LAPACKE_zhetrf_rk_work, LAPACKE_zhetri_3, LAPACKE_zhetri_3_work, LAPACKE_zhetrs_aa, LAPACKE_zhetrs_aa_work, LAPACKE_zhetrs_3, LAPACKE_zhetrs_3_work, LAPACKE_zsycon_3, LAPACKE_zsycon_3_work, LAPACKE_zsysv_aa, LAPACKE_zsysv_aa_work, LAPACKE_zsysv_rk, LAPACKE_zsysv_rk_work, LAPACKE_zsytrf_aa, LAPACKE_zsytrf_aa_work, LAPACKE_zsytrf_rk, LAPACKE_zsytrf_rk_work, LAPACKE_zsytri_3, LAPACKE_zsytri_3_work, LAPACKE_zsytrs_aa, LAPACKE_zsytrs_aa_work, LAPACKE_zsytrs_3, LAPACKE_zsytrs_3_work, ); #These function may need 2 underscores. 
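# These are the routines whose names already contain an underscore
# (xerbla_array, chla_transtype and the *_rook variants).  Fortran compilers
# following the old g77/f2c convention append a second trailing underscore to
# names like these, which is presumably why they are kept in their own list:
# when $ARGV[7] (NEED2UNDERSCORES, see below) is nonzero they are collected in
# @need_2underscore_objs and printed with "$bu$bu" instead of a single "$bu".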
@lapack_embeded_underscore_objs=( xerbla_array, chla_transtype, slasyf_rook, ssytf2_rook, ssytrf_rook, ssytrs_rook, ssytri_rook, ssycon_rook, ssysv_rook, chetf2_rook, chetrf_rook, chetri_rook, chetrs_rook, checon_rook, chesv_rook, clahef_rook, clasyf_rook, csytf2_rook, csytrf_rook, csytrs_rook, csytri_rook, csycon_rook, csysv_rook, dlasyf_rook, dsytf2_rook, dsytrf_rook, dsytrs_rook, dsytri_rook, dsycon_rook, dsysv_rook, zhetf2_rook, zhetrf_rook, zhetri_rook, zhetrs_rook, zhecon_rook, zhesv_rook, zlahef_rook, zlasyf_rook, zsytf2_rook, zsytrf_rook, zsytrs_rook, zsytri_rook, zsycon_rook, zsysv_rook, ); if ($ARGV[8] == 1) { #ONLY_CBLAS=1 @underscore_objs = (@misc_underscore_objs); } elsif ($ARGV[5] == 1) { #NO_LAPACK=1 @underscore_objs = (@blasobjs, @misc_underscore_objs); } elsif (-d "../lapack-netlib") { if ($ARGV[7] == 0) { # NEED2UNDERSCORES=0 # Don't need 2 underscores @underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2, @misc_underscore_objs, @lapack_embeded_underscore_objs); } else { # Need 2 underscores @underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2, @misc_underscore_objs); @need_2underscore_objs = (@lapack_embeded_underscore_objs); }; if ($ARGV[11] == 1) { #BUILD_LAPACK_DEPRECATED=1 @underscore_objs = (@underscore_objs, @lapack_deprecated_objs); } } else { @underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs); } if ($ARGV[8] == 1) { #ONLY_CBLAS=1 @gemm3mobjs=(); @exblasobjs=(); } if ($ARGV[3] == 1) { @underscore_objs = (@underscore_objs, @exblasobjs); }; if ($ARGV[1] eq "x86_64") { @underscore_objs = (@underscore_objs, @gemm3mobjs); }; if ($ARGV[1] eq "x86") { @underscore_objs = (@underscore_objs, @gemm3mobjs); }; if ($ARGV[1] eq "ia64") { @underscore_objs = (@underscore_objs, @gemm3mobjs); }; if ($ARGV[1] eq "MIPS") { @underscore_objs = (@underscore_objs, @gemm3mobjs); }; if ($ARGV[4] == 0) { @no_underscore_objs = (@cblasobjs, @misc_no_underscore_objs); }else{ #NO_CBLAS=1 @no_underscore_objs = (@misc_no_underscore_objs); } if ($ARGV[6] == 1) { #NO_LAPACKE=1 @no_underscore_objs = (@no_underscore_objs); } else { if ($ARGV[11] == 1) { #BUILD_LAPACK_DEPRECATED=1 @no_underscore_objs = (@no_underscore_objs, @lapackeobjs, @lapacke_deprecated_objs); } else { @no_underscore_objs = (@no_underscore_objs, @lapackeobjs); } } @hplobjs = (daxpy, dcopy, dscal, idamax, dgemv, dtrsv, dger, dgemm, dtrsm); @hplobjs2 = (HPL_dlaswp00N, HPL_dlaswp01N, HPL_dlaswp01T); $bu = $ARGV[2]; $bu = "" if (($bu eq "0") || ($bu eq "1")); $symbolprefix = $ARGV[9]; $symbolsuffix = $ARGV[10]; if ($ARGV[0] eq "osx") { @underscore_objs = (@underscore_objs, @misc_common_objs); @no_underscore_objs = (@no_underscore_objs, @misc_common_objs); foreach $objs (@underscore_objs) { print "_", $symbolprefix, $objs, $bu, $symbolsuffix, "\n"; } foreach $objs (@need_2underscore_objs) { print "_", $symbolprefix, $objs, $bu, $bu, $symbolsuffix, "\n"; } foreach $objs (@no_underscore_objs) { print "_", $symbolprefix, $objs, $symbolsuffix, "\n"; } exit(0); } if ($ARGV[0] eq "aix"){ @underscore_objs = (@underscore_objs, @misc_common_objs); @no_underscore_objs = (@no_underscore_objs, @misc_common_objs); foreach $objs (@underscore_objs) { print $symbolprefix, $objs, $bu, $symbolsuffix, "\n"; } foreach $objs (@need_2underscore_objs) { print $symbolprefix, $objs, $bu, $bu, $symbolsuffix, "\n"; } foreach $objs (@no_underscore_objs) { print $symbolprefix, $objs, $symbolsuffix, "\n"; } exit(0); } if ($ARGV[0] eq "objcopy") { @underscore_objs = (@underscore_objs, @misc_common_objs); @no_underscore_objs = 
(@no_underscore_objs, @misc_common_objs); foreach $objs (@underscore_objs) { print $objs, $bu, " ", $symbolprefix, $objs, $bu, $symbolsuffix, "\n"; } foreach $objs (@need_2underscore_objs) { print $objs, $bu, $bu, " ", $symbolprefix, $objs, $bu, $bu, $symbolsuffix, "\n"; } foreach $objs (@no_underscore_objs) { print $objs, " ", $symbolprefix, $objs, $symbolsuffix, "\n"; } exit(0); } if ($ARGV[0] eq "objconv") { @underscore_objs = (@underscore_objs, @misc_common_objs); @no_underscore_objs = (@no_underscore_objs, @misc_common_objs); foreach $objs (@underscore_objs) { print "-nr:_", $objs, $bu, ":_", $symbolprefix, $objs, $bu, $symbolsuffix, "\n"; } foreach $objs (@need_2underscore_objs) { print "-nr:_", $objs, $bu, $bu, ":_", $symbolprefix, $objs, $bu, $bu, $symbolsuffix, "\n"; } foreach $objs (@no_underscore_objs) { print "-nr:_", $objs, ":_", $symbolprefix, $objs, $symbolsuffix, "\n"; } exit(0); } if ($ARGV[0] eq "win2k"){ print "EXPORTS\n"; $count = 1; @no_underscore_objs = (@no_underscore_objs, @misc_common_objs); foreach $objs (@underscore_objs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; print "\t",$symbolprefix, $objs, $symbolsuffix, "=$objs","_ \@", $count, "\n"; $count ++; print "\t",$symbolprefix, $objs, "_", $symbolsuffix, "=$objs","_ \@", $count, "\n"; $count ++; print "\t",$symbolprefix, $uppercase, $symbolsuffix, "=$objs", "_ \@", $count, "\n"; $count ++; } foreach $objs (@need_2underscore_objs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; print "\t",$symbolprefix, $objs, $symbolsuffix, "=$objs","__ \@", $count, "\n"; $count ++; print "\t",$symbolprefix, $objs, "__", $symbolsuffix, "=$objs","__ \@", $count, "\n"; $count ++; print "\t",$symbolprefix, $uppercase, $symbolsuffix, "=$objs", "__ \@", $count, "\n"; $count ++; } #for misc_common_objs foreach $objs (@misc_common_objs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; print "\t",$symbolprefix, $objs, "_", $symbolsuffix, "=$objs","_ \@", $count, "\n"; $count ++; print "\t",$symbolprefix, $uppercase, $symbolsuffix, "=$objs", "_ \@", $count, "\n"; $count ++; } foreach $objs (@no_underscore_objs) { print "\t",$symbolprefix,$objs,$symbolsuffix,"=$objs"," \@", $count, "\n"; $count ++; } exit(0); } if ($ARGV[0] eq "win2khpl") { print "EXPORTS\n"; $count = 1; foreach $objs (@hplobjs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; print "\t",$symbolprefix, $objs, $symbolsuffix, "=$objs","_ \@", $count, "\n"; $count ++; print "\t",$symbolprefix, $objs, "_", $symbolsuffix, "=$objs","_ \@", $count, "\n"; $count ++; print "\t",$symbolprefix, $uppercase, $symbolsuffix, "=$objs", "_ \@", $count, "\n"; $count ++; } exit(0); } if ($ARGV[0] eq "microsoft"){ @underscore_objs = (@underscore_objs, @misc_common_objs); print "EXPORTS\n"; $count = 1; foreach $objs (@underscore_objs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; print "\t",$symbolprefix, $objs, $symbolsuffix, " = $objs","_\n"; $count ++; print "\t",$symbolprefix, $objs, "\_", $symbolsuffix, " = $objs","_\n"; $count ++; print "\t",$symbolprefix, $uppercase, $symbolsuffix, " = $objs","_\n"; $count ++; print "\t",$symbolprefix, $uppercase, "\_", $symbolsuffix, " = $objs","_\n"; $count ++; } foreach $objs (@need_2underscore_objs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; print "\t",$symbolprefix, $objs, $symbolsuffix, "=$objs","__ \@", $count, "\n"; $count ++; print "\t",$symbolprefix, $objs, "__", $symbolsuffix, "=$objs","__ \@", $count, "\n"; $count ++; print "\t",$symbolprefix, $uppercase, $symbolsuffix, "=$objs", "__ \@", $count, "\n"; 
$count ++; } exit(0); } if ($ARGV[0] eq "linktest") { @underscore_objs = (@underscore_objs, @misc_common_objs); @no_underscore_objs = (@no_underscore_objs, @misc_common_objs); print "int main(void){\n"; foreach $objs (@underscore_objs) { print $symbolprefix, $objs, $bu, $symbolsuffix, "();\n" if $objs ne "xerbla"; } foreach $objs (@need_2underscore_objs) { print $symbolprefix, $objs, $bu, $bu, $symbolsuffix, "();\n"; } foreach $objs (@no_underscore_objs) { print $symbolprefix, $objs, $symbolsuffix, "();\n"; } print "return 0;}\n"; exit(0); } OpenBLAS-0.2.20/f_check000066400000000000000000000170221313527062700144530ustar00rootroot00000000000000#!/usr/bin/perl $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); # # 1. Not specified # 1.1 Automatically detect, then check compiler # 1.2 If no fortran compiler is detected, gfortran is default with NOFORTRAN definition # 2. Specified # 2.1 If path is correct, check compiler # 2.2 If path is not correct, but still valid compiler name, force setting # 2.2.2 Path is not correct, invalid compiler name, then gfortran is default with NOFORTRAN definition # $makefile = shift(@ARGV); $config = shift(@ARGV); $nofortran = 0; $compiler = join(" ", @ARGV); $compiler_bin = shift(@ARGV); # f77 is too ambiguous $compiler = "" if $compiler eq "f77"; @path = split(/:/, $ENV{"PATH"}); if ($compiler eq "") { @lists = ("gfortran", "g95", "frt", "fort", "openf90", "openf95", "sunf77", "sunf90", "sunf95", "xlf95", "xlf90", "xlf", "ppuf77", "ppuf95", "ppuf90", "ppuxlf", "pathf90", "pathf95", "pgf95", "pgf90", "pgf77", "flang", "ifort"); OUTER: foreach $lists (@lists) { foreach $path (@path) { if (-x $path . "/" . $lists) { $compiler = $lists; $compiler_bin = $lists; last OUTER; } } } } if ($compiler eq "") { $nofortran = 1; $compiler = "gfortran"; $vendor = GFORTRAN; $bu = "_"; } else { $data = `which $compiler_bin > /dev/null 2> /dev/null`; $vendor = ""; if (!$?) { $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`; if ($data =~ /zhoge_/) { $bu = "_"; } if ($data =~ /GNU/) { $data =~ /(\d)\.(\d).(\d)/; $major = $1; $minor = $2; if ($major >= 4) { $vendor = GFORTRAN; $openmp = "-fopenmp"; } else { if ($compiler =~ /flang/) { $vendor = FLANG; $openmp = "-fopenmp"; } else { $vendor = G77; $openmp = ""; } } } if ($data =~ /g95/) { $vendor = G95; $openmp = ""; } if ($data =~ /Intel/) { $vendor = INTEL; $openmp = "-openmp"; } if ($data =~ /Sun Fortran/) { $vendor = SUN; $openmp = "-xopenmp=parallel"; } if ($data =~ /PathScale/) { $vendor = PATHSCALE; $openmp = "-openmp"; } if ($data =~ /Open64/) { $vendor = OPEN64; $openmp = "-mp"; } if ($data =~ /PGF/) { $vendor = PGI; $openmp = "-mp"; } if ($data =~ /IBM XL/) { $vendor = IBM; $openmp = "-openmp"; } # for embeded underscore name, e.g. zho_ge, it may append 2 underscores. 
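# ftest3.f (further down in this source tree) defines the function zho_ge;
# if the assembly produced below contains the symbol zho_ge__, the compiler
# doubles the trailing underscore for names that already contain one, and
# NEED2UNDERSCORES is emitted into both the generated Makefile and config
# header near the end of this script.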
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; if ($data =~ /zho_ge__/) { $need2bu = 1; } } if ($vendor eq "") { if ($compiler =~ /g77/) { $vendor = G77; $bu = "_"; $openmp = ""; } if ($compiler =~ /g95/) { $vendor = G95; $bu = "_"; $openmp = ""; } if ($compiler =~ /gfortran/) { $vendor = GFORTRAN; $bu = "_"; $openmp = "-fopenmp"; } if ($compiler =~ /ifort/) { $vendor = INTEL; $bu = "_"; $openmp = "-openmp"; } if ($compiler =~ /pathf/) { $vendor = PATHSCALE; $bu = "_"; $openmp = "-mp"; } if ($compiler =~ /pgf/) { $vendor = PGI; $bu = "_"; $openmp = "-mp"; } if ($compiler =~ /ftn/) { $vendor = PGI; $bu = "_"; $openmp = "-openmp"; } if ($compiler =~ /frt/) { $vendor = FUJITSU; $bu = "_"; $openmp = "-openmp"; } if ($compiler =~ /sunf77|sunf90|sunf95/) { $vendor = SUN; $bu = "_"; $openmp = "-xopenmp=parallel"; } if ($compiler =~ /ppuf/) { $vendor = IBM; $openmp = "-openmp"; } if ($compiler =~ /xlf/) { $vendor = IBM; $openmp = "-openmp"; } if ($compiler =~ /open64/) { $vendor = OPEN64; $openmp = "-mp"; } if ($compiler =~ /flang/) { $vendor = FLANG; $bu = "_"; $openmp = "-fopenmp"; } if ($vendor eq "") { $nofortran = 1; $compiler = "gfortran"; $vendor = GFORTRAN; $bu = "_"; $openmp = ""; } } } $data = `which $compiler_bin > /dev/null 2> /dev/null`; if (!$?) { $binary = $ENV{"BINARY"}; $openmp = "" if $ENV{USE_OPENMP} != 1; if ($binary == 32) { $link = `$compiler $openmp -m32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; if ($?) { $link = `$compiler $openmp -q32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; } #For gfortran MIPS if ($?) { $mips_data = `$compiler_bin -E -dM - < /dev/null`; if ($mips_data =~ /_MIPS_ISA_MIPS64/) { $link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; } else { $link = `$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; } } $binary = "" if ($?); } if ($binary == 64) { $link = `$compiler $openmp -m64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; if ($?) { $link = `$compiler $openmp -q64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; } #For gfortran MIPS if ($?) { $link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; } $binary = "" if ($?); } if ($binary eq "") { $link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`; } } $linker_L = ""; $linker_l = ""; $linker_a = ""; if ($link ne "") { $link =~ s/\-Y\sP\,/\-Y/g; $link =~ s/\-rpath\s+/\-rpath\@/g; $link =~ s/\-rpath-link\s+/\-rpath-link\@/g; @flags = split(/[\s\,\n]/, $link); # remove leading and trailing quotes from each flag. @flags = map {s/^['"]|['"]$//g; $_} @flags; foreach $flags (@flags) { if ( ($flags =~ /^\-L/) && ($flags !~ /^-LIST:/) && ($flags !~ /^-LANG:/) ) { if ($vendor eq "PGI") { $flags =~ s/lib$/libso/; } $linker_L .= $flags . " "; } if ($flags =~ /^\-Y/) { next if ($hostos eq 'SunOS'); $linker_L .= "-Wl,". $flags . " "; } if ($flags =~ /^\--exclude-libs/) { $linker_L .= "-Wl,". $flags . " "; $flags=""; } if ($flags =~ /^\-rpath\@/) { $flags =~ s/\@/\,/g; if ($vendor eq "PGI") { $flags =~ s/lib$/libso/; } $linker_L .= "-Wl,". $flags . " " ; } if ($flags =~ /^\-rpath-link\@/) { $flags =~ s/\@/\,/g; if ($vendor eq "PGI") { $flags =~ s/lib$/libso/; } $linker_L .= "-Wl,". $flags . " " ; } if ( ($flags =~ /^\-l/) && ($flags !~ /gfortranbegin/) && ($flags !~ /frtbegin/) && ($flags !~ /pathfstart/) && ($flags !~ /numa/) && ($flags !~ /crt[0-9]/) && ($flags !~ /gcc/) && ($flags !~ /user32/) && ($flags !~ /kernel32/) && ($flags !~ /advapi32/) && ($flags !~ /shell32/) && ($flags !~ /^\-l$/) ) { $linker_l .= $flags . 
" "; } $linker_a .= $flags . " " if $flags =~ /\.a$/; } } if ($vendor eq "INTEL"){ $linker_a .= "-lgfortran" } if ($vendor eq "FLANG"){ $linker_a .= "-lflang" } open(MAKEFILE, ">> $makefile") || die "Can't append $makefile"; open(CONFFILE, ">> $config" ) || die "Can't append $config"; print MAKEFILE "F_COMPILER=$vendor\n"; print MAKEFILE "FC=$compiler\n"; print MAKEFILE "BU=$bu\n" if $bu ne ""; print MAKEFILE "NOFORTRAN=1\n" if $nofortran == 1; print CONFFILE "#define BUNDERSCORE\t$bu\n" if $bu ne ""; print CONFFILE "#define NEEDBUNDERSCORE\t1\n" if $bu ne ""; print CONFFILE "#define NEED2UNDERSCORES\t1\n" if $need2bu ne ""; print MAKEFILE "NEED2UNDERSCORES=1\n" if $need2bu ne ""; if (($linker_l ne "") || ($linker_a ne "")) { print MAKEFILE "FEXTRALIB=$linker_L $linker_l $linker_a\n"; } close(MAKEFILE); close(CONFFILE); OpenBLAS-0.2.20/ftest.f000066400000000000000000000001331313527062700144350ustar00rootroot00000000000000 double complex function zhoge() zhoge = (0.0d0,0.0d0) return end OpenBLAS-0.2.20/ftest2.f000066400000000000000000000000361313527062700145210ustar00rootroot00000000000000 program main end OpenBLAS-0.2.20/ftest3.f000066400000000000000000000001351313527062700145220ustar00rootroot00000000000000 double complex function zho_ge() zho_ge = (0.0d0,0.0d0) return end OpenBLAS-0.2.20/gen_config_h.c000066400000000000000000000013471313527062700157220ustar00rootroot00000000000000#include #include #include int main(int argc, char**argv) { FILE *fp; char line[100]; char line2[80]; char *s; int i; fprintf(stdout,"#ifndef OPENBLAS_CONFIG_H\n"); fprintf(stdout,"#define OPENBLAS_CONFIG_H\n"); fp=fopen(argv[1],"r"); do{ s=fgets(line,80,fp); if (s== NULL) break; memset(line2,0,80); i=sscanf(line,"#define %70c",line2); if (i!=0) { fprintf(stdout,"#define OPENBLAS_%s",line2); } else { fprintf(stdout,"\n"); } } while (1); fclose(fp); fprintf(stdout,"#define OPENBLAS_VERSION \"OpenBLAS %s\"\n", VERSION); fp=fopen(argv[2],"r"); do{ s=fgets(line,100,fp); if (s== NULL) break; fprintf(stdout,"%s",line); } while(1); fclose(fp); fprintf(stdout,"#endif /* OPENBLAS_CONFIG_H */\n"); exit(0); } OpenBLAS-0.2.20/getarch.c000066400000000000000000001107431313527062700147330ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #if defined(__WIN32__) || defined(__WIN64__) || defined(__CYGWIN32__) || defined(__CYGWIN64__) || defined(_WIN32) || defined(_WIN64) #define OS_WINDOWS #endif #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) #define INTEL_AMD #endif #include #include #ifdef OS_WINDOWS #include #endif #if defined(__FreeBSD__) || defined(__APPLE__) #include #include #endif #if defined(linux) || defined(__sun__) #include #include #endif /* #define FORCE_P2 */ /* #define FORCE_KATMAI */ /* #define FORCE_COPPERMINE */ /* #define FORCE_NORTHWOOD */ /* #define FORCE_PRESCOTT */ /* #define FORCE_BANIAS */ /* #define FORCE_YONAH */ /* #define FORCE_CORE2 */ /* #define FORCE_PENRYN */ /* #define FORCE_DUNNINGTON */ /* #define FORCE_NEHALEM */ /* #define FORCE_SANDYBRIDGE */ /* #define FORCE_ATOM */ /* #define FORCE_ATHLON */ /* #define FORCE_OPTERON */ /* #define FORCE_OPTERON_SSE3 */ /* #define FORCE_BARCELONA */ /* #define FORCE_SHANGHAI */ /* #define FORCE_ISTANBUL */ /* #define FORCE_BOBCAT */ /* #define FORCE_BULLDOZER */ /* #define FORCE_PILEDRIVER */ /* #define FORCE_SSE_GENERIC */ /* #define FORCE_VIAC3 */ /* #define FORCE_NANO */ /* #define FORCE_POWER3 */ /* #define FORCE_POWER4 */ /* #define FORCE_POWER5 */ /* #define FORCE_POWER6 */ /* #define FORCE_POWER7 */ /* #define FORCE_POWER8 */ /* #define FORCE_PPCG4 */ /* #define FORCE_PPC970 */ /* #define FORCE_PPC970MP */ /* #define FORCE_PPC440 */ /* #define FORCE_PPC440FP2 */ /* #define FORCE_CELL */ /* #define FORCE_SICORTEX */ /* #define FORCE_LOONGSON3A */ /* #define FORCE_LOONGSON3B */ /* #define FORCE_I6400 */ /* #define FORCE_P6600 */ /* #define FORCE_P5600 */ /* #define FORCE_ITANIUM2 */ /* #define FORCE_SPARC */ /* #define FORCE_SPARCV7 */ /* #define FORCE_GENERIC */ #ifdef FORCE_P2 #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "PENTIUM2" #define ARCHCONFIG "-DPENTIUM2 " \ "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX" #define LIBNAME "p2" #define CORENAME "P5" #endif #ifdef FORCE_KATMAI #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "PENTIUM3" #define ARCHCONFIG "-DPENTIUM3 " \ "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " #define LIBNAME "katmai" #define CORENAME "KATMAI" #endif #ifdef FORCE_COPPERMINE #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "PENTIUM3" #define ARCHCONFIG "-DPENTIUM3 " \ "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " #define LIBNAME "coppermine" #define CORENAME "COPPERMINE" #endif #ifdef FORCE_NORTHWOOD #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "PENTIUM4" #define ARCHCONFIG "-DPENTIUM4 " \ "-DL1_DATA_SIZE=8192 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 " #define LIBNAME "northwood" #define CORENAME "NORTHWOOD" #endif #ifdef FORCE_PRESCOTT #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "PENTIUM4" #define ARCHCONFIG "-DPENTIUM4 " \ 
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3" #define LIBNAME "prescott" #define CORENAME "PRESCOTT" #endif #ifdef FORCE_BANIAS #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "BANIAS" #define ARCHCONFIG "-DPENTIUMM " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 " #define LIBNAME "banias" #define CORENAME "BANIAS" #endif #ifdef FORCE_YONAH #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "YONAH" #define ARCHCONFIG "-DPENTIUMM " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 " #define LIBNAME "yonah" #define CORENAME "YONAH" #endif #ifdef FORCE_CORE2 #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "CONRORE" #define ARCHCONFIG "-DCORE2 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=256 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3" #define LIBNAME "core2" #define CORENAME "CORE2" #endif #ifdef FORCE_PENRYN #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "PENRYN" #define ARCHCONFIG "-DPENRYN " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=256 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1" #define LIBNAME "penryn" #define CORENAME "PENRYN" #endif #ifdef FORCE_DUNNINGTON #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "DUNNINGTON" #define ARCHCONFIG "-DDUNNINGTON " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ "-DL3_SIZE=16777216 -DL3_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=256 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1" #define LIBNAME "dunnington" #define CORENAME "DUNNINGTON" #endif #ifdef FORCE_NEHALEM #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "NEHALEM" #define ARCHCONFIG "-DNEHALEM " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" #define LIBNAME "nehalem" #define CORENAME "NEHALEM" #endif #ifdef FORCE_SANDYBRIDGE #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "SANDYBRIDGE" #define ARCHCONFIG "-DSANDYBRIDGE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" #define LIBNAME "sandybridge" #define CORENAME "SANDYBRIDGE" #endif #ifdef FORCE_HASWELL #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ 
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ "-DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" #endif #ifdef FORCE_ATOM #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "ATOM" #define ARCHCONFIG "-DATOM " \ "-DL1_DATA_SIZE=24576 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3" #define LIBNAME "atom" #define CORENAME "ATOM" #endif #ifdef FORCE_ATHLON #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "ATHLON" #define ARCHCONFIG "-DATHLON " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 -DHAVE_3DNOW " \ "-DHAVE_3DNOWEX -DHAVE_MMX -DHAVE_SSE " #define LIBNAME "athlon" #define CORENAME "ATHLON" #endif #ifdef FORCE_OPTERON #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "OPTERON" #define ARCHCONFIG "-DOPTERON " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 -DHAVE_3DNOW " \ "-DHAVE_3DNOWEX -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 " #define LIBNAME "opteron" #define CORENAME "OPTERON" #endif #ifdef FORCE_OPTERON_SSE3 #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "OPTERON" #define ARCHCONFIG "-DOPTERON " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 -DHAVE_3DNOW " \ "-DHAVE_3DNOWEX -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3" #define LIBNAME "opteron" #define CORENAME "OPTERON" #endif #if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "BARCELONA" #define ARCHCONFIG "-DBARCELONA " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL3_SIZE=2097152 " \ "-DDTB_DEFAULT_ENTRIES=48 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU" #define LIBNAME "barcelona" #define CORENAME "BARCELONA" #endif #if defined(FORCE_BOBCAT) #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "BOBCAT" #define ARCHCONFIG "-DBOBCAT " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=40 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_CFLUSH -DHAVE_CMOV" #define LIBNAME "bobcat" #define CORENAME "BOBCAT" #endif #if defined (FORCE_BULLDOZER) #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "BULLDOZER" #define ARCHCONFIG "-DBULLDOZER " \ "-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \ "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \ "-DHAVE_AVX -DHAVE_FMA4" #define LIBNAME "bulldozer" #define CORENAME "BULLDOZER" #endif #if defined (FORCE_PILEDRIVER) #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "PILEDRIVER" 
#define ARCHCONFIG "-DPILEDRIVER " \ "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" #define LIBNAME "piledriver" #define CORENAME "PILEDRIVER" #endif #if defined (FORCE_STEAMROLLER) #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "STEAMROLLER" #define ARCHCONFIG "-DSTEAMROLLER " \ "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" #define LIBNAME "steamroller" #define CORENAME "STEAMROLLER" #endif #if defined (FORCE_EXCAVATOR) #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "EXCAVATOR" #define ARCHCONFIG "-DEXCAVATOR " \ "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" #define LIBNAME "excavator" #define CORENAME "EXCAVATOR" #endif #if defined (FORCE_ZEN) #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "ZEN" #define ARCHCONFIG "-DZEN " \ "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL2_CODE_ASSOCIATIVE=8 " \ "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ "-DL3_SIZE=16777216 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=8 " \ "-DITB_DEFAULT_ENTRIES=64 -DITB_SIZE=4096 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ "-DHAVE_AVX -DHAVE_FMA3 -DFMA3" #define LIBNAME "zen" #define CORENAME "ZEN" #endif #ifdef FORCE_SSE_GENERIC #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "GENERIC" #define ARCHCONFIG "-DGENERIC " \ "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2" #define LIBNAME "generic" #define CORENAME "GENERIC" #endif #ifdef FORCE_VIAC3 #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "VIAC3" #define ARCHCONFIG "-DVIAC3 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=65536 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE " #define LIBNAME "viac3" #define CORENAME "VIAC3" #endif #ifdef FORCE_NANO #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "NANO" #define ARCHCONFIG "-DNANO " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3" #define LIBNAME "nano" #define CORENAME 
"NANO" #endif #ifdef FORCE_POWER3 #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "POWER3" #define SUBDIRNAME "power" #define ARCHCONFIG "-DPOWER3 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=128 " \ "-DL2_SIZE=2097152 -DL2_LINESIZE=128 " \ "-DDTB_DEFAULT_ENTRIES=256 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " #define LIBNAME "power3" #define CORENAME "POWER3" #endif #ifdef FORCE_POWER4 #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "POWER4" #define SUBDIRNAME "power" #define ARCHCONFIG "-DPOWER4 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ "-DL2_SIZE=1509949 -DL2_LINESIZE=128 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=6 " #define LIBNAME "power4" #define CORENAME "POWER4" #endif #ifdef FORCE_POWER5 #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "POWER5" #define SUBDIRNAME "power" #define ARCHCONFIG "-DPOWER5 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ "-DL2_SIZE=1509949 -DL2_LINESIZE=128 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=6 " #define LIBNAME "power5" #define CORENAME "POWER5" #endif #if defined(FORCE_POWER6) || defined(FORCE_POWER7) #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "POWER6" #define SUBDIRNAME "power" #define ARCHCONFIG "-DPOWER6 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=128 " \ "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " #define LIBNAME "power6" #define CORENAME "POWER6" #endif #if defined(FORCE_POWER8) #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "POWER8" #define SUBDIRNAME "power" #define ARCHCONFIG "-DPOWER8 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=128 " \ "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " #define LIBNAME "power8" #define CORENAME "POWER8" #endif #ifdef FORCE_PPCG4 #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "PPCG4" #define SUBDIRNAME "power" #define ARCHCONFIG "-DPPCG4 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " #define LIBNAME "ppcg4" #define CORENAME "PPCG4" #endif #ifdef FORCE_PPC970 #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "PPC970" #define SUBDIRNAME "power" #define ARCHCONFIG "-DPPC970 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=128 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " #define LIBNAME "ppc970" #define CORENAME "PPC970" #endif #ifdef FORCE_PPC970MP #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "PPC970" #define SUBDIRNAME "power" #define ARCHCONFIG "-DPPC970 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ "-DL2_SIZE=1024976 -DL2_LINESIZE=128 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " #define LIBNAME "ppc970mp" #define CORENAME "PPC970" #endif #ifdef FORCE_PPC440 #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "PPC440" #define SUBDIRNAME "power" #define ARCHCONFIG "-DPPC440 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=16384 -DL2_LINESIZE=128 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " #define LIBNAME "ppc440" #define CORENAME "PPC440" #endif #ifdef FORCE_PPC440FP2 #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "PPC440FP2" #define SUBDIRNAME "power" #define ARCHCONFIG "-DPPC440FP2 " \ "-DL1_DATA_SIZE=32768 
-DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=16384 -DL2_LINESIZE=128 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " #define LIBNAME "ppc440FP2" #define CORENAME "PPC440FP2" #endif #ifdef FORCE_CELL #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "CELL" #define SUBDIRNAME "power" #define ARCHCONFIG "-DCELL " \ "-DL1_DATA_SIZE=262144 -DL1_DATA_LINESIZE=128 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=128 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " #define LIBNAME "cell" #define CORENAME "CELL" #endif #ifdef FORCE_SICORTEX #define FORCE #define ARCHITECTURE "MIPS" #define SUBARCHITECTURE "SICORTEX" #define SUBDIRNAME "mips" #define ARCHCONFIG "-DSICORTEX " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " #define LIBNAME "mips" #define CORENAME "sicortex" #endif #ifdef FORCE_LOONGSON3A #define FORCE #define ARCHITECTURE "MIPS" #define SUBARCHITECTURE "LOONGSON3A" #define SUBDIRNAME "mips64" #define ARCHCONFIG "-DLOONGSON3A " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " #define LIBNAME "loongson3a" #define CORENAME "LOONGSON3A" #else #endif #ifdef FORCE_LOONGSON3B #define FORCE #define ARCHITECTURE "MIPS" #define SUBARCHITECTURE "LOONGSON3B" #define SUBDIRNAME "mips64" #define ARCHCONFIG "-DLOONGSON3B " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " #define LIBNAME "loongson3b" #define CORENAME "LOONGSON3B" #else #endif #ifdef FORCE_I6400 #define FORCE #define ARCHITECTURE "MIPS" #define SUBARCHITECTURE "I6400" #define SUBDIRNAME "mips64" #define ARCHCONFIG "-DI6400 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " #define LIBNAME "i6400" #define CORENAME "I6400" #else #endif #ifdef FORCE_P6600 #define FORCE #define ARCHITECTURE "MIPS" #define SUBARCHITECTURE "P6600" #define SUBDIRNAME "mips64" #define ARCHCONFIG "-DP6600 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " #define LIBNAME "p6600" #define CORENAME "P6600" #else #endif #ifdef FORCE_P5600 #define FORCE #define ARCHITECTURE "MIPS" #define SUBARCHITECTURE "P5600" #define SUBDIRNAME "mips" #define ARCHCONFIG "-DP5600 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " #define LIBNAME "p5600" #define CORENAME "P5600" #else #endif #ifdef FORCE_ITANIUM2 #define FORCE #define ARCHITECTURE "IA64" #define SUBARCHITECTURE "ITANIUM2" #define SUBDIRNAME "ia64" #define ARCHCONFIG "-DITANIUM2 " \ "-DL1_DATA_SIZE=262144 -DL1_DATA_LINESIZE=128 " \ "-DL2_SIZE=1572864 -DL2_LINESIZE=128 -DDTB_SIZE=16384 -DDTB_DEFAULT_ENTRIES=128 " #define LIBNAME "itanium2" #define CORENAME "itanium2" #endif #ifdef FORCE_SPARC #define FORCE #define ARCHITECTURE "SPARC" #define SUBARCHITECTURE "SPARC" #define SUBDIRNAME "sparc" #define ARCHCONFIG "-DSPARC -DV9 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=1572864 -DL2_LINESIZE=64 -DDTB_SIZE=8192 -DDTB_DEFAULT_ENTRIES=64 " #define LIBNAME "sparc" #define CORENAME "sparc" #endif #ifdef FORCE_SPARCV7 #define FORCE #define ARCHITECTURE "SPARC" 
#define SUBARCHITECTURE "SPARC" #define SUBDIRNAME "sparc" #define ARCHCONFIG "-DSPARC -DV7 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=1572864 -DL2_LINESIZE=64 -DDTB_SIZE=8192 -DDTB_DEFAULT_ENTRIES=64 " #define LIBNAME "sparcv7" #define CORENAME "sparcv7" #endif #ifdef FORCE_GENERIC #define FORCE #define ARCHITECTURE "GENERIC" #define SUBARCHITECTURE "GENERIC" #define SUBDIRNAME "generic" #define ARCHCONFIG "-DGENERIC " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=128 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " #define LIBNAME "generic" #define CORENAME "generic" #endif #ifdef FORCE_ARMV7 #define FORCE #define ARCHITECTURE "ARM" #define SUBARCHITECTURE "ARMV7" #define SUBDIRNAME "arm" #define ARCHCONFIG "-DARMV7 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ "-DHAVE_VFPV3 -DHAVE_VFP" #define LIBNAME "armv7" #define CORENAME "ARMV7" #else #endif #ifdef FORCE_CORTEXA9 #define FORCE #define ARCHITECTURE "ARM" #define SUBARCHITECTURE "CORTEXA9" #define SUBDIRNAME "arm" #define ARCHCONFIG "-DCORTEXA9 -DARMV7 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ "-DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" #define LIBNAME "cortexa9" #define CORENAME "CORTEXA9" #else #endif #ifdef FORCE_CORTEXA15 #define FORCE #define ARCHITECTURE "ARM" #define SUBARCHITECTURE "CORTEXA15" #define SUBDIRNAME "arm" #define ARCHCONFIG "-DCORTEXA15 -DARMV7 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ "-DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" #define LIBNAME "cortexa15" #define CORENAME "CORTEXA15" #else #endif #ifdef FORCE_ARMV6 #define FORCE #define ARCHITECTURE "ARM" #define SUBARCHITECTURE "ARMV6" #define SUBDIRNAME "arm" #define ARCHCONFIG "-DARMV6 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ "-DHAVE_VFP" #define LIBNAME "armv6" #define CORENAME "ARMV6" #else #endif #ifdef FORCE_ARMV5 #define FORCE #define ARCHITECTURE "ARM" #define SUBARCHITECTURE "ARMV5" #define SUBDIRNAME "arm" #define ARCHCONFIG "-DARMV5 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " #define LIBNAME "armv5" #define CORENAME "ARMV5" #else #endif #ifdef FORCE_ARMV8 #define FORCE #define ARCHITECTURE "ARM64" #define SUBARCHITECTURE "ARMV8" #define SUBDIRNAME "arm64" #define ARCHCONFIG "-DARMV8 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " #define LIBNAME "armv8" #define CORENAME "ARMV8" #endif #ifdef FORCE_CORTEXA57 #define FORCE #define ARCHITECTURE "ARM64" #define SUBARCHITECTURE "CORTEXA57" #define SUBDIRNAME "arm64" #define ARCHCONFIG "-DCORTEXA57 " \ "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" #define LIBNAME "cortexa57" #define CORENAME "CORTEXA57" #else #endif #ifdef 
FORCE_VULCAN #define FORCE #define ARCHITECTURE "ARM64" #define SUBARCHITECTURE "VULCAN" #define SUBDIRNAME "arm64" #define ARCHCONFIG "-DVULCAN " \ "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ "-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" #define LIBNAME "vulcan" #define CORENAME "VULCAN" #else #endif #ifdef FORCE_THUNDERX #define FORCE #define ARCHITECTURE "ARM64" #define SUBARCHITECTURE "THUNDERX" #define SUBDIRNAME "arm64" #define ARCHCONFIG "-DTHUNDERX " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ "-DL2_SIZE=16777216 -DL2_LINESIZE=128 -DL2_ASSOCIATIVE=16 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " #define LIBNAME "thunderx" #define CORENAME "THUNDERX" #else #endif #ifdef FORCE_THUNDERX2T99 #define FORCE #define ARCHITECTURE "ARM64" #define SUBARCHITECTURE "THUNDERX2T99" #define SUBDIRNAME "arm64" #define ARCHCONFIG "-DTHUNDERX2T99 " \ "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ "-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" #define LIBNAME "thunderx2t99" #define CORENAME "THUNDERX2T99" #else #endif #ifndef FORCE #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__) #ifndef POWER #define POWER #endif #define OPENBLAS_SUPPORTED #endif #if defined(__zarch__) || defined(__s390x__) #define ZARCH #include "cpuid_zarch.c" #define OPENBLAS_SUPPORTED #endif #ifdef INTEL_AMD #include "cpuid_x86.c" #define OPENBLAS_SUPPORTED #endif #ifdef __ia64__ #include "cpuid_ia64.c" #define OPENBLAS_SUPPORTED #endif #ifdef __alpha #include "cpuid_alpha.c" #define OPENBLAS_SUPPORTED #endif #ifdef POWER #include "cpuid_power.c" #define OPENBLAS_SUPPORTED #endif #ifdef sparc #include "cpuid_sparc.c" #define OPENBLAS_SUPPORTED #endif #ifdef __mips__ #ifdef __mips64 #include "cpuid_mips64.c" #else #include "cpuid_mips.c" #endif #define OPENBLAS_SUPPORTED #endif #ifdef __arm__ #include "cpuid_arm.c" #define OPENBLAS_SUPPORTED #endif #ifdef __aarch64__ #include "cpuid_arm64.c" #define OPENBLAS_SUPPORTED #endif #ifndef OPENBLAS_SUPPORTED #error "This arch/CPU is not supported by OpenBLAS." 
#endif #else #endif static int get_num_cores(void) { #ifdef OS_WINDOWS SYSTEM_INFO sysinfo; #elif defined(__FreeBSD__) || defined(__APPLE__) int m[2], count; size_t len; #endif #if defined(linux) || defined(__sun__) //returns the number of processors which are currently online return sysconf(_SC_NPROCESSORS_CONF); #elif defined(OS_WINDOWS) GetSystemInfo(&sysinfo); return sysinfo.dwNumberOfProcessors; #elif defined(__FreeBSD__) || defined(__APPLE__) m[0] = CTL_HW; m[1] = HW_NCPU; len = sizeof(int); sysctl(m, 2, &count, &len, NULL, 0); return count; #else return 2; #endif } int main(int argc, char *argv[]){ #ifdef FORCE char buffer[8192], *p, *q; int length; #endif if (argc == 1) return 0; switch (argv[1][0]) { case '0' : /* for Makefile */ #ifdef FORCE printf("CORE=%s\n", CORENAME); #else #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) printf("CORE=%s\n", get_corename()); #endif #endif #ifdef FORCE printf("LIBCORE=%s\n", LIBNAME); #else printf("LIBCORE="); get_libname(); printf("\n"); #endif printf("NUM_CORES=%d\n", get_num_cores()); #if defined(__arm__) && !defined(FORCE) get_features(); #endif #ifdef INTEL_AMD #ifndef FORCE get_sse(); #else sprintf(buffer, "%s", ARCHCONFIG); p = &buffer[0]; while (*p) { if ((*p == '-') && (*(p + 1) == 'D')) { p += 2; while ((*p != ' ') && (*p != '\0')) { if (*p == '=') { printf("="); p ++; while ((*p != ' ') && (*p != '\0')) { printf("%c", *p); p ++; } } else { printf("%c", *p); p ++; if ((*p == ' ') || (*p =='\0')) printf("=1"); } } printf("\n"); } else p ++; } #endif #endif #ifdef MAKE_NB_JOBS #if MAKE_NB_JOBS > 0 printf("MAKE += -j %d\n", MAKE_NB_JOBS); #else // Let make use parent -j argument or -j1 if there // is no make parent #endif #elif NO_PARALLEL_MAKE==1 printf("MAKE += -j 1\n"); #else #ifndef OS_WINDOWS printf("MAKE += -j %d\n", get_num_cores()); #endif #endif break; case '1' : /* For config.h */ #ifdef FORCE sprintf(buffer, "%s -DCORE_%s\n", ARCHCONFIG, CORENAME); p = &buffer[0]; while (*p) { if ((*p == '-') && (*(p + 1) == 'D')) { p += 2; printf("#define "); while ((*p != ' ') && (*p != '\0')) { if (*p == '=') { printf(" "); p ++; while ((*p != ' ') && (*p != '\0')) { printf("%c", *p); p ++; } } else { if (*p != '\n') printf("%c", *p); p ++; } } printf("\n"); } else p ++; } #else get_cpuconfig(); #endif #ifdef FORCE printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); #else #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); #endif #endif break; case '2' : /* SMP */ if (get_num_cores() > 1) printf("SMP=1\n"); break; } fflush(stdout); return 0; } OpenBLAS-0.2.20/getarch_2nd.c000066400000000000000000000052061313527062700154730ustar00rootroot00000000000000#include #ifndef BUILD_KERNEL #include "config.h" #else #include "config_kernel.h" #endif #include "param.h" int main(int argc, char **argv) { if ( (argc <= 1) || ((argc >= 2) && (*argv[1] == '0'))) { printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M); printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N); printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); printf("DGEMM_UNROLL_N=%d\n", DGEMM_DEFAULT_UNROLL_N); printf("QGEMM_UNROLL_M=%d\n", QGEMM_DEFAULT_UNROLL_M); printf("QGEMM_UNROLL_N=%d\n", QGEMM_DEFAULT_UNROLL_N); printf("CGEMM_UNROLL_M=%d\n", CGEMM_DEFAULT_UNROLL_M); printf("CGEMM_UNROLL_N=%d\n", CGEMM_DEFAULT_UNROLL_N); printf("ZGEMM_UNROLL_M=%d\n", ZGEMM_DEFAULT_UNROLL_M); 
printf("ZGEMM_UNROLL_N=%d\n", ZGEMM_DEFAULT_UNROLL_N); printf("XGEMM_UNROLL_M=%d\n", XGEMM_DEFAULT_UNROLL_M); printf("XGEMM_UNROLL_N=%d\n", XGEMM_DEFAULT_UNROLL_N); #ifdef CGEMM3M_DEFAULT_UNROLL_M printf("CGEMM3M_UNROLL_M=%d\n", CGEMM3M_DEFAULT_UNROLL_M); #else printf("CGEMM3M_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M); #endif #ifdef CGEMM3M_DEFAULT_UNROLL_N printf("CGEMM3M_UNROLL_N=%d\n", CGEMM3M_DEFAULT_UNROLL_N); #else printf("CGEMM3M_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N); #endif #ifdef ZGEMM3M_DEFAULT_UNROLL_M printf("ZGEMM3M_UNROLL_M=%d\n", ZGEMM3M_DEFAULT_UNROLL_M); #else printf("ZGEMM3M_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); #endif #ifdef ZGEMM3M_DEFAULT_UNROLL_N printf("ZGEMM3M_UNROLL_N=%d\n", ZGEMM3M_DEFAULT_UNROLL_N); #else printf("ZGEMM3M_UNROLL_N=%d\n", DGEMM_DEFAULT_UNROLL_N); #endif #ifdef XGEMM3M_DEFAULT_UNROLL_M printf("XGEMM3M_UNROLL_M=%d\n", ZGEMM3M_DEFAULT_UNROLL_M); #else printf("XGEMM3M_UNROLL_M=%d\n", QGEMM_DEFAULT_UNROLL_M); #endif #ifdef XGEMM3M_DEFAULT_UNROLL_N printf("XGEMM3M_UNROLL_N=%d\n", ZGEMM3M_DEFAULT_UNROLL_N); #else printf("XGEMM3M_UNROLL_N=%d\n", QGEMM_DEFAULT_UNROLL_N); #endif } if ((argc >= 2) && (*argv[1] == '1')) { #if defined(ARCH_X86) || defined(ARCH_X86_64) printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float))); printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double))); printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float))); printf("#define ZLOCAL_BUFFER_SIZE\t%ld\n", (ZGEMM_DEFAULT_Q * ZGEMM_DEFAULT_UNROLL_N * 2 * 2 * sizeof(double))); #endif #ifdef USE64BITINT printf("#define USE64BITINT\n"); #endif printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", (long int)GEMM_MULTITHREAD_THRESHOLD); } return 0; } OpenBLAS-0.2.20/interface/000077500000000000000000000000001313527062700151045ustar00rootroot00000000000000OpenBLAS-0.2.20/interface/CMakeLists.txt000066400000000000000000000133531313527062700176510ustar00rootroot00000000000000 include_directories(${PROJECT_SOURCE_DIR}) set(BLAS1_SOURCES copy.c nrm2.c ) set(BLAS1_REAL_ONLY_SOURCES rotm.c rotmg.c # N.B. 
these do not have complex counterparts rot.c asum.c ) # these will have 'z' prepended for the complex version set(BLAS1_MANGLED_SOURCES axpy.c swap.c scal.c dot.c rotg.c axpby.c ) # TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f # these all have 'z' sources for complex versions set(BLAS2_SOURCES gemv.c ger.c trsv.c trmv.c symv.c syr.c syr2.c gbmv.c sbmv.c spmv.c spr.c spr2.c tbsv.c tbmv.c tpsv.c tpmv.c ) set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES hemv.c hbmv.c her.c her2.c hpmv.c hpr.c hpr2.c ) # these do not have separate 'z' sources set(BLAS3_SOURCES gemm.c symm.c trsm.c syrk.c syr2k.c ) set(BLAS3_MANGLED_SOURCES omatcopy.c imatcopy.c geadd.c ) # generate the BLAS objs once with and once without cblas set (CBLAS_FLAGS "") if (NOT DEFINED NO_FBLAS) list(APPEND CBLAS_FLAGS 0) endif () if (NOT DEFINED NO_CBLAS) list(APPEND CBLAS_FLAGS 1) endif () foreach (CBLAS_FLAG ${CBLAS_FLAGS}) # TODO: don't compile complex sources with cblas for now, the naming schemes are all different and they will have to be handled separately from SINGLE/DOUBLE set(DISABLE_COMPLEX 0) set(MANGLE_COMPLEX 3) if (CBLAS_FLAG EQUAL 1) # set(DISABLE_COMPLEX 1) # set(MANGLE_COMPLEX 1) endif () GenerateNamedObjects("${BLAS1_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) GenerateNamedObjects("${BLAS1_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 1) GenerateNamedObjects("${BLAS1_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) GenerateNamedObjects("${BLAS2_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) GenerateNamedObjects("${BLAS2_COMPLEX_ONLY_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 4) GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) #sdsdot, dsdot GenerateNamedObjects("sdsdot.c" "" "sdsdot" ${CBLAS_FLAG} "" "" true "SINGLE") GenerateNamedObjects("dsdot.c" "" "dsdot" ${CBLAS_FLAG} "" "" true "SINGLE") # trmm is trsm with a compiler flag set GenerateNamedObjects("trsm.c" "TRMM" "trmm" ${CBLAS_FLAG}) # max and imax are compiled 4 times GenerateNamedObjects("max.c" "" "" ${CBLAS_FLAG}) GenerateNamedObjects("max.c" "USE_ABS" "amax" ${CBLAS_FLAG}) GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "amin" ${CBLAS_FLAG}) GenerateNamedObjects("max.c" "USE_MIN" "min" ${CBLAS_FLAG}) GenerateNamedObjects("imax.c" "" "i*max" ${CBLAS_FLAG}) GenerateNamedObjects("imax.c" "USE_ABS" "i*amax" ${CBLAS_FLAG}) GenerateNamedObjects("imax.c" "USE_ABS;USE_MIN" "i*amin" ${CBLAS_FLAG}) GenerateNamedObjects("imax.c" "USE_MIN" "i*min" ${CBLAS_FLAG}) # complex-specific sources foreach (float_type ${FLOAT_TYPES}) if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zdot.c" "" "dotu" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("symm.c" "HEMM" "hemm" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("syrk.c" "HEMM" "herk" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("syr2k.c" "HEMM" "her2k" ${CBLAS_FLAG} "" "" false ${float_type}) if (USE_GEMM3M) GenerateNamedObjects("gemm.c" "GEMM3M" "gemm3m" false "" "" false ${float_type}) endif() endif () if (${float_type} STREQUAL "COMPLEX") GenerateNamedObjects("zscal.c" 
"SSCAL" "sscal" ${CBLAS_FLAG} "" "" false "COMPLEX") GenerateNamedObjects("nrm2.c" "" "scnrm2" ${CBLAS_FLAG} "" "" true "COMPLEX") GenerateNamedObjects("zrot.c" "" "csrot" ${CBLAS_FLAG} "" "" true "COMPLEX") GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX") GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX") GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX") endif () if (${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX") GenerateNamedObjects("nrm2.c" "" "dznrm2" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") GenerateNamedObjects("zrot.c" "" "zdrot" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") endif () endforeach () endforeach () #Special functions for CBLAS if (NOT DEFINED NO_CBLAS) foreach (float_type ${FLOAT_TYPES}) if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") #cblas_dotc_sub cblas_dotu_sub GenerateNamedObjects("zdot.c" "FORCE_USE_STACK" "dotu_sub" 1 "" "" false ${float_type}) GenerateNamedObjects("zdot.c" "FORCE_USE_STACK;CONJ" "dotc_sub" 1 "" "" false ${float_type}) endif() endforeach () endif() if (NOT DEFINED NO_LAPACK) set(LAPACK_SOURCES lapack/gesv.c ) # prepend z for complex versions set(LAPACK_MANGLED_SOURCES lapack/getrf.c lapack/getrs.c lapack/potrf.c lapack/getf2.c lapack/potf2.c lapack/laswp.c lapack/lauu2.c lapack/lauum.c lapack/trti2.c lapack/trtri.c ) GenerateNamedObjects("${LAPACK_SOURCES}") GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" 0 "" "" 0 3) endif () add_library(interface OBJECT ${OPENBLAS_SRC}) OpenBLAS-0.2.20/interface/Makefile000066400000000000000000001745701313527062700165620ustar00rootroot00000000000000TOPDIR = .. 
include $(TOPDIR)/Makefile.system SUPPORT_GEMM3M = 0 ifeq ($(ARCH), x86) SUPPORT_GEMM3M = 1 endif ifeq ($(ARCH), x86_64) SUPPORT_GEMM3M = 1 endif ifeq ($(ARCH), ia64) SUPPORT_GEMM3M = 1 endif ifeq ($(ARCH), MIPS) SUPPORT_GEMM3M = 1 endif ifndef NO_FBLAS SBLAS1OBJS = \ saxpy.$(SUFFIX) sswap.$(SUFFIX) \ scopy.$(SUFFIX) sscal.$(SUFFIX) \ sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \ sasum.$(SUFFIX) snrm2.$(SUFFIX) \ smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \ smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \ srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \ saxpby.$(SUFFIX) SBLAS2OBJS = \ sgemv.$(SUFFIX) sger.$(SUFFIX) \ strsv.$(SUFFIX) strmv.$(SUFFIX) ssymv.$(SUFFIX) \ ssyr.$(SUFFIX) ssyr2.$(SUFFIX) sgbmv.$(SUFFIX) \ ssbmv.$(SUFFIX) sspmv.$(SUFFIX) \ sspr.$(SUFFIX) sspr2.$(SUFFIX) \ stbsv.$(SUFFIX) stbmv.$(SUFFIX) \ stpsv.$(SUFFIX) stpmv.$(SUFFIX) SBLAS3OBJS = \ sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \ strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \ somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ sgeadd.$(SUFFIX) DBLAS1OBJS = \ daxpy.$(SUFFIX) dswap.$(SUFFIX) \ dcopy.$(SUFFIX) dscal.$(SUFFIX) \ ddot.$(SUFFIX) \ dasum.$(SUFFIX) dnrm2.$(SUFFIX) \ dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \ dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \ drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \ daxpby.$(SUFFIX) DBLAS2OBJS = \ dgemv.$(SUFFIX) dger.$(SUFFIX) \ dtrsv.$(SUFFIX) dtrmv.$(SUFFIX) dsymv.$(SUFFIX) \ dsyr.$(SUFFIX) dsyr2.$(SUFFIX) dgbmv.$(SUFFIX) \ dsbmv.$(SUFFIX) dspmv.$(SUFFIX) \ dspr.$(SUFFIX) dspr2.$(SUFFIX) \ dtbsv.$(SUFFIX) dtbmv.$(SUFFIX) \ dtpsv.$(SUFFIX) dtpmv.$(SUFFIX) DBLAS3OBJS = \ dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \ dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \ domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)\ dgeadd.$(SUFFIX) CBLAS1OBJS = \ caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \ cdotc.$(SUFFIX) cdotu.$(SUFFIX) \ scasum.$(SUFFIX) scnrm2.$(SUFFIX) \ scamax.$(SUFFIX) icamax.$(SUFFIX) \ scamin.$(SUFFIX) icamin.$(SUFFIX) \ csrot.$(SUFFIX) crotg.$(SUFFIX) \ caxpby.$(SUFFIX) CBLAS2OBJS = \ cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \ ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) \ csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \ csbmv.$(SUFFIX) \ cspr2.$(SUFFIX) \ ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \ ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \ chemv.$(SUFFIX) chbmv.$(SUFFIX) \ cher.$(SUFFIX) cher2.$(SUFFIX) \ chpmv.$(SUFFIX) chpr.$(SUFFIX) chpr2.$(SUFFIX) CBLAS3OBJS = \ cgemm.$(SUFFIX) csymm.$(SUFFIX) ctrmm.$(SUFFIX) \ ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \ chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \ comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)\ cgeadd.$(SUFFIX) ZBLAS1OBJS = \ zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \ zdotc.$(SUFFIX) zdotu.$(SUFFIX) \ dzasum.$(SUFFIX) dznrm2.$(SUFFIX) \ dzamax.$(SUFFIX) izamax.$(SUFFIX) \ dzamin.$(SUFFIX) izamin.$(SUFFIX) \ zdrot.$(SUFFIX) zrotg.$(SUFFIX) \ zaxpby.$(SUFFIX) ZBLAS2OBJS = \ zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \ ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) \ zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \ zsbmv.$(SUFFIX) \ zspr2.$(SUFFIX) \ ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \ ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \ zhemv.$(SUFFIX) zhbmv.$(SUFFIX) \ zher.$(SUFFIX) zher2.$(SUFFIX) \ zhpmv.$(SUFFIX) zhpr.$(SUFFIX) zhpr2.$(SUFFIX) ZBLAS3OBJS = \ zgemm.$(SUFFIX) zsymm.$(SUFFIX) ztrmm.$(SUFFIX) \ ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) 
\ zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \ zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)\ zgeadd.$(SUFFIX) ifeq ($(SUPPORT_GEMM3M), 1) # CBLAS3OBJS += cgemm3m.$(SUFFIX) csymm3m.$(SUFFIX) chemm3m.$(SUFFIX) CBLAS3OBJS += cgemm3m.$(SUFFIX) # ZBLAS3OBJS += zgemm3m.$(SUFFIX) zsymm3m.$(SUFFIX) zhemm3m.$(SUFFIX) ZBLAS3OBJS += zgemm3m.$(SUFFIX) endif ifdef EXPRECISION QBLAS1OBJS = \ qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ qcopy.$(SUFFIX) qscal.$(SUFFIX) \ qdot.$(SUFFIX) \ qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ QBLAS2OBJS = \ qgemv.$(SUFFIX) qger.$(SUFFIX) \ qtrsv.$(SUFFIX) qtrmv.$(SUFFIX) qsymv.$(SUFFIX) \ qsyr.$(SUFFIX) qsyr2.$(SUFFIX) qgbmv.$(SUFFIX) \ qsbmv.$(SUFFIX) qspmv.$(SUFFIX) \ qspr.$(SUFFIX) qspr2.$(SUFFIX) \ qtbsv.$(SUFFIX) qtbmv.$(SUFFIX) \ qtpsv.$(SUFFIX) qtpmv.$(SUFFIX) QBLAS3OBJS = \ qgemm.$(SUFFIX) qsymm.$(SUFFIX) qtrmm.$(SUFFIX) \ qtrsm.$(SUFFIX) qsyrk.$(SUFFIX) qsyr2k.$(SUFFIX) XBLAS1OBJS = \ xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ xdotc.$(SUFFIX) xdotu.$(SUFFIX) \ qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ XBLAS2OBJS = \ xgemv.$(SUFFIX) xgeru.$(SUFFIX) xgerc.$(SUFFIX) \ xtrsv.$(SUFFIX) xtrmv.$(SUFFIX) xsymv.$(SUFFIX) \ xsyr.$(SUFFIX) xsyr2.$(SUFFIX) xgbmv.$(SUFFIX) \ xsbmv.$(SUFFIX) xspmv.$(SUFFIX) \ xspr.$(SUFFIX) xspr2.$(SUFFIX) \ xtbsv.$(SUFFIX) xtbmv.$(SUFFIX) \ xtpsv.$(SUFFIX) xtpmv.$(SUFFIX) \ xhemv.$(SUFFIX) xhbmv.$(SUFFIX) \ xher.$(SUFFIX) xher2.$(SUFFIX) \ xhpmv.$(SUFFIX) xhpr.$(SUFFIX) xhpr2.$(SUFFIX) XBLAS3OBJS = \ xgemm.$(SUFFIX) xsymm.$(SUFFIX) xtrmm.$(SUFFIX) \ xtrsm.$(SUFFIX) xsyrk.$(SUFFIX) xsyr2k.$(SUFFIX) \ xhemm.$(SUFFIX) xherk.$(SUFFIX) xher2k.$(SUFFIX) ifeq ($(SUPPORT_GEMM3M), 1) XBLAS3OBJS += xgemm3m.$(SUFFIX) xsymm3m.$(SUFFIX) xhemm3m.$(SUFFIX) endif endif ifdef QUAD_PRECISION QBLAS1OBJS = \ qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ qcopy.$(SUFFIX) qscal.$(SUFFIX) \ qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ QBLAS2OBJS = \ qgemv.$(SUFFIX) qger.$(SUFFIX) \ qtrsv.$(SUFFIX) qtrmv.$(SUFFIX) qsymv.$(SUFFIX) \ qsyr.$(SUFFIX) qsyr2.$(SUFFIX) qgbmv.$(SUFFIX) \ qsbmv.$(SUFFIX) qspmv.$(SUFFIX) \ qspr.$(SUFFIX) qspr2.$(SUFFIX) \ qtbsv.$(SUFFIX) qtbmv.$(SUFFIX) \ qtpsv.$(SUFFIX) qtpmv.$(SUFFIX) QBLAS3OBJS = \ qgemm.$(SUFFIX) qsymm.$(SUFFIX) qtrmm.$(SUFFIX) \ qtrsm.$(SUFFIX) qsyrk.$(SUFFIX) qsyr2k.$(SUFFIX) XBLAS1OBJS = \ xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ XBLAS2OBJS = \ xgemv.$(SUFFIX) xgeru.$(SUFFIX) xgerc.$(SUFFIX) \ xtrsv.$(SUFFIX) xtrmv.$(SUFFIX) xsymv.$(SUFFIX) \ xsyr.$(SUFFIX) xsyr2.$(SUFFIX) xgbmv.$(SUFFIX) \ xsbmv.$(SUFFIX) xspmv.$(SUFFIX) \ xspr.$(SUFFIX) xspr2.$(SUFFIX) \ xtbsv.$(SUFFIX) xtbmv.$(SUFFIX) \ xtpsv.$(SUFFIX) xtpmv.$(SUFFIX) \ xhemv.$(SUFFIX) xhbmv.$(SUFFIX) \ xher.$(SUFFIX) xher2.$(SUFFIX) \ xhpmv.$(SUFFIX) xhpr.$(SUFFIX) xhpr2.$(SUFFIX) XBLAS3OBJS = \ xgemm.$(SUFFIX) xsymm.$(SUFFIX) xtrmm.$(SUFFIX) \ xtrsm.$(SUFFIX) 
xsyrk.$(SUFFIX) xsyr2k.$(SUFFIX) \ xhemm.$(SUFFIX) xherk.$(SUFFIX) xher2k.$(SUFFIX) ifeq ($(SUPPORT_GEMM3M), 1) XBLAS3OBJS += xgemm3m.$(SUFFIX) xsymm3m.$(SUFFIX) xhemm3m.$(SUFFIX) endif endif endif HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \ dgemv.$(SUFFIX) dtrsv.$(SUFFIX) dger.$(SUFFIX) \ idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX) CSBLAS1OBJS = \ cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) CSBLAS2OBJS = \ cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ cblas_strsv.$(SUFFIX) cblas_ssyr.$(SUFFIX) cblas_ssyr2.$(SUFFIX) cblas_sgbmv.$(SUFFIX) \ cblas_ssbmv.$(SUFFIX) cblas_sspmv.$(SUFFIX) cblas_sspr.$(SUFFIX) cblas_sspr2.$(SUFFIX) \ cblas_stbmv.$(SUFFIX) cblas_stbsv.$(SUFFIX) cblas_stpmv.$(SUFFIX) cblas_stpsv.$(SUFFIX) CSBLAS3OBJS = \ cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \ cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ cblas_sgeadd.$(SUFFIX) CDBLAS1OBJS = \ cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) CDBLAS2OBJS = \ cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ cblas_dtrsv.$(SUFFIX) cblas_dsyr.$(SUFFIX) cblas_dsyr2.$(SUFFIX) cblas_dgbmv.$(SUFFIX) \ cblas_dsbmv.$(SUFFIX) cblas_dspmv.$(SUFFIX) cblas_dspr.$(SUFFIX) cblas_dspr2.$(SUFFIX) \ cblas_dtbmv.$(SUFFIX) cblas_dtbsv.$(SUFFIX) cblas_dtpmv.$(SUFFIX) cblas_dtpsv.$(SUFFIX) CDBLAS3OBJS += \ cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \ cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \ cblas_dgeadd.$(SUFFIX) CCBLAS1OBJS = \ cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ cblas_ccopy.$(SUFFIX) \ cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \ cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ cblas_caxpby.$(SUFFIX) CCBLAS2OBJS = \ cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ cblas_cgbmv.$(SUFFIX) cblas_chbmv.$(SUFFIX) cblas_chemv.$(SUFFIX) \ cblas_cher.$(SUFFIX) cblas_cher2.$(SUFFIX) cblas_chpmv.$(SUFFIX) \ cblas_chpr.$(SUFFIX) cblas_chpr2.$(SUFFIX) cblas_ctbmv.$(SUFFIX) \ cblas_ctbsv.$(SUFFIX) cblas_ctpmv.$(SUFFIX) cblas_ctpsv.$(SUFFIX) \ cblas_ctrmv.$(SUFFIX) cblas_ctrsv.$(SUFFIX) CCBLAS3OBJS = \ cblas_cgemm.$(SUFFIX) cblas_csymm.$(SUFFIX) cblas_ctrmm.$(SUFFIX) cblas_ctrsm.$(SUFFIX) \ cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\ cblas_cgeadd.$(SUFFIX) cblas_xerbla.$(SUFFIX) CZBLAS1OBJS = \ cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ cblas_zcopy.$(SUFFIX) \ cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ cblas_zswap.$(SUFFIX) 
cblas_dznrm2.$(SUFFIX) \ cblas_zaxpby.$(SUFFIX) CZBLAS2OBJS = \ cblas_zgemv.$(SUFFIX) cblas_zgerc.$(SUFFIX) cblas_zgeru.$(SUFFIX) \ cblas_zgbmv.$(SUFFIX) cblas_zhbmv.$(SUFFIX) cblas_zhemv.$(SUFFIX) \ cblas_zher.$(SUFFIX) cblas_zher2.$(SUFFIX) cblas_zhpmv.$(SUFFIX) \ cblas_zhpr.$(SUFFIX) cblas_zhpr2.$(SUFFIX) cblas_ztbmv.$(SUFFIX) \ cblas_ztbsv.$(SUFFIX) cblas_ztpmv.$(SUFFIX) cblas_ztpsv.$(SUFFIX) \ cblas_ztrmv.$(SUFFIX) cblas_ztrsv.$(SUFFIX) CZBLAS3OBJS = \ cblas_zgemm.$(SUFFIX) cblas_zsymm.$(SUFFIX) cblas_ztrmm.$(SUFFIX) cblas_ztrsm.$(SUFFIX) \ cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \ cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\ cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \ cblas_zgeadd.$(SUFFIX) ifeq ($(SUPPORT_GEMM3M), 1) # CBLAS3OBJS += cgemm3m.$(SUFFIX) csymm3m.$(SUFFIX) chemm3m.$(SUFFIX) CCBLAS3OBJS += cblas_cgemm3m.$(SUFFIX) # ZBLAS3OBJS += zgemm3m.$(SUFFIX) zsymm3m.$(SUFFIX) zhemm3m.$(SUFFIX) CZBLAS3OBJS += cblas_zgemm3m.$(SUFFIX) endif ifndef NO_CBLAS override CFLAGS += -I. SBLAS1OBJS += $(CSBLAS1OBJS) SBLAS2OBJS += $(CSBLAS2OBJS) SBLAS3OBJS += $(CSBLAS3OBJS) DBLAS1OBJS += $(CDBLAS1OBJS) DBLAS2OBJS += $(CDBLAS2OBJS) DBLAS3OBJS += $(CDBLAS3OBJS) CBLAS1OBJS += $(CCBLAS1OBJS) CBLAS2OBJS += $(CCBLAS2OBJS) CBLAS3OBJS += $(CCBLAS3OBJS) ZBLAS1OBJS += $(CZBLAS1OBJS) ZBLAS2OBJS += $(CZBLAS2OBJS) ZBLAS3OBJS += $(CZBLAS3OBJS) endif SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) ZBLASOBJS = $(ZBLAS1OBJS) $(ZBLAS2OBJS) $(ZBLAS3OBJS) XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) #SLAPACKOBJS = \ # sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \ # spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \ # slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) spotri.$(SUFFIX) SLAPACKOBJS = \ sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \ spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \ slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) #DLAPACKOBJS = \ # dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ # dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \ # dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dpotri.$(SUFFIX) DLAPACKOBJS = \ dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \ dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) QLAPACKOBJS = \ qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \ qpotf2.$(SUFFIX) qpotrf.$(SUFFIX) qtrti2.$(SUFFIX) qtrtri.$(SUFFIX) \ qlaswp.$(SUFFIX) qgetrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \ #CLAPACKOBJS = \ # cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ # cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ # clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) cpotri.$(SUFFIX) CLAPACKOBJS = \ cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) #ZLAPACKOBJS = \ # zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ # zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ # zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) zpotri.$(SUFFIX) ZLAPACKOBJS = \ zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) 
zgetf2.$(SUFFIX) \ zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) XLAPACKOBJS = \ xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \ xpotf2.$(SUFFIX) xpotrf.$(SUFFIX) xtrti2.$(SUFFIX) xtrtri.$(SUFFIX) \ xlaswp.$(SUFFIX) xgetrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \ ifneq ($(NO_LAPACK), 1) SBLASOBJS += $(SLAPACKOBJS) DBLASOBJS += $(DLAPACKOBJS) #QBLASOBJS += $(QLAPACKOBJS) CBLASOBJS += $(CLAPACKOBJS) ZBLASOBJS += $(ZLAPACKOBJS) #XBLASOBJS += $(XLAPACKOBJS) endif FUNCOBJS = $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) ifdef EXPRECISION FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) endif ifdef QUAD_PRECISION FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) endif FUNCALLFILES = $(FUNCOBJS:.$(SUFFIX)=) include $(TOPDIR)/Makefile.tail all :: libs ifdef FUNCTION_PROFILE $(BLASOBJS) $(BLASOBJS_P) : functable.h $(BLASOBJS) $(BLASOBJS_P) : override CFLAGS += -DPROFILE_FUNC_NAME=interface_$(*F) functable.h : Makefile ./create $(FUNCALLFILES) > functable.h endif clean :: @rm -f functable.h level1 : $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ level3 : $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ $(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ $(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : override CFLAGS += -DCBLAS srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c $(CC) $(CFLAGS) -c $< -o $(@F) drot.$(SUFFIX) drot.$(PSUFFIX) : rot.c $(CC) $(CFLAGS) -c $< -o $(@F) qrot.$(SUFFIX) qrot.$(PSUFFIX) : rot.c $(CC) $(CFLAGS) -c $< -o $(@F) csrot.$(SUFFIX) csrot.$(PSUFFIX) : zrot.c $(CC) $(CFLAGS) -c $< -o $(@F) zdrot.$(SUFFIX) zdrot.$(PSUFFIX) : zrot.c $(CC) $(CFLAGS) -c $< -o $(@F) xqrot.$(SUFFIX) xqrot.$(PSUFFIX) : zrot.c $(CC) $(CFLAGS) -c $< -o $(@F) srotm.$(SUFFIX) srotm.$(PSUFFIX): rotm.c $(CC) -c $(CFLAGS) $< -o $(@F) drotm.$(SUFFIX) drotm.$(PSUFFIX): rotm.c $(CC) -c $(CFLAGS) $< -o $(@F) qrotm.$(SUFFIX) qrotm.$(PSUFFIX): rotm.c $(CC) -c $(CFLAGS) $< -o $(@F) srotmg.$(SUFFIX) srotmg.$(PSUFFIX): rotmg.c $(CC) -c $(CFLAGS) $< -o $(@F) drotmg.$(SUFFIX) drotmg.$(PSUFFIX): rotmg.c $(CC) -c $(CFLAGS) $< -o $(@F) qrotmg.$(SUFFIX) qrotmg.$(PSUFFIX): rotmg.c $(CC) -c $(CFLAGS) $< -o $(@F) srotg.$(SUFFIX) srotg.$(PSUFFIX): rotg.c $(CC) -c $(CFLAGS) $< -o $(@F) drotg.$(SUFFIX) drotg.$(PSUFFIX): rotg.c $(CC) -c $(CFLAGS) $< -o $(@F) qrotg.$(SUFFIX) qrotg.$(PSUFFIX): rotg.c $(CC) -c $(CFLAGS) $< -o $(@F) crotg.$(SUFFIX) crotg.$(PSUFFIX): zrotg.c $(CC) -c $(CFLAGS) $< -o $(@F) zrotg.$(SUFFIX) zrotg.$(PSUFFIX): zrotg.c $(CC) -c $(CFLAGS) $< -o $(@F) xrotg.$(SUFFIX) xrotg.$(PSUFFIX): zrotg.c $(CC) -c $(CFLAGS) $< -o $(@F) sasum.$(SUFFIX) sasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -c $< -o $(@F) dasum.$(SUFFIX) dasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -c $< -o $(@F) qasum.$(SUFFIX) qasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -c $< -o $(@F) scasum.$(SUFFIX) scasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -c $< -o $(@F) dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -c $< -o $(@F) qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -c $< -o $(@F) snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c $(CC) $(CFLAGS) -c $< -o $(@F) dnrm2.$(SUFFIX) 
dnrm2.$(PSUFFIX) : nrm2.c $(CC) $(CFLAGS) -c $< -o $(@F) qnrm2.$(SUFFIX) qnrm2.$(PSUFFIX) : nrm2.c $(CC) $(CFLAGS) -c $< -o $(@F) scnrm2.$(SUFFIX) scnrm2.$(PSUFFIX) : nrm2.c $(CC) $(CFLAGS) -c $< -o $(@F) dznrm2.$(SUFFIX) dznrm2.$(PSUFFIX) : nrm2.c $(CC) $(CFLAGS) -c $< -o $(@F) qxnrm2.$(SUFFIX) qxnrm2.$(PSUFFIX) : nrm2.c $(CC) $(CFLAGS) -c $< -o $(@F) samax.$(SUFFIX) samax.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) damax.$(SUFFIX) damax.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) qamax.$(SUFFIX) qamax.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) scamax.$(SUFFIX) scamax.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) dzamax.$(SUFFIX) dzamax.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) qxamax.$(SUFFIX) qxamax.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) samin.$(SUFFIX) samin.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) damin.$(SUFFIX) damin.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) qamin.$(SUFFIX) qamin.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) scamin.$(SUFFIX) scamin.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) dzamin.$(SUFFIX) dzamin.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) qxamin.$(SUFFIX) qxamin.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) smax.$(SUFFIX) smax.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) dmax.$(SUFFIX) dmax.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) qmax.$(SUFFIX) qmax.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) smin.$(SUFFIX) smin.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) dmin.$(SUFFIX) dmin.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) qmin.$(SUFFIX) qmin.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) isamax.$(SUFFIX) isamax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) idamax.$(SUFFIX) idamax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) iqamax.$(SUFFIX) iqamax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) icamax.$(SUFFIX) icamax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) izamax.$(SUFFIX) izamax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) ixamax.$(SUFFIX) ixamax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) isamin.$(SUFFIX) isamin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) idamin.$(SUFFIX) idamin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) iqamin.$(SUFFIX) iqamin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) icamin.$(SUFFIX) icamin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) izamin.$(SUFFIX) izamin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) ixamin.$(SUFFIX) ixamin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) ismax.$(SUFFIX) ismax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) idmax.$(SUFFIX) idmax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) iqmax.$(SUFFIX) iqmax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) ismin.$(SUFFIX) ismin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) idmin.$(SUFFIX) 
idmin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) iqmin.$(SUFFIX) iqmin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) sdsdot.$(SUFFIX) sdsdot.$(PSUFFIX) : sdsdot.c $(CC) $(CFLAGS) -c $< -o $(@F) dsdot.$(SUFFIX) dsdot.$(PSUFFIX) : dsdot.c $(CC) $(CFLAGS) -c $< -o $(@F) sdot.$(SUFFIX) sdot.$(PSUFFIX) : dot.c $(CC) $(CFLAGS) -c $< -o $(@F) ddot.$(SUFFIX) ddot.$(PSUFFIX) : dot.c $(CC) $(CFLAGS) -c $< -o $(@F) qdot.$(SUFFIX) qdot.$(PSUFFIX) : dot.c $(CC) $(CFLAGS) -c $< -o $(@F) cdotu.$(SUFFIX) cdotu.$(PSUFFIX) : zdot.c $(CC) $(CFLAGS) -c -UCONJ $< -o $(@F) cdotc.$(SUFFIX) cdotc.$(PSUFFIX) : zdot.c $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) zdotu.$(SUFFIX) zdotu.$(PSUFFIX) : zdot.c $(CC) $(CFLAGS) -c -UCONJ $< -o $(@F) zdotc.$(SUFFIX) zdotc.$(PSUFFIX) : zdot.c $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) xdotu.$(SUFFIX) xdotu.$(PSUFFIX) : zdot.c $(CC) $(CFLAGS) -c -UCONJ $< -o $(@F) xdotc.$(SUFFIX) xdotc.$(PSUFFIX) : zdot.c $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) saxpy.$(SUFFIX) saxpy.$(PSUFFIX) : axpy.c $(CC) $(CFLAGS) -c $< -o $(@F) daxpy.$(SUFFIX) daxpy.$(PSUFFIX) : axpy.c $(CC) $(CFLAGS) -c $< -o $(@F) qaxpy.$(SUFFIX) qaxpy.$(PSUFFIX) : axpy.c $(CC) $(CFLAGS) -c $< -o $(@F) caxpy.$(SUFFIX) caxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -c $< -o $(@F) zaxpy.$(SUFFIX) zaxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -c $< -o $(@F) xaxpy.$(SUFFIX) xaxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -c $< -o $(@F) caxpyc.$(SUFFIX) caxpyc.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) zaxpyc.$(SUFFIX) zaxpyc.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) xaxpyc.$(SUFFIX) xaxpyc.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) sscal.$(SUFFIX) sscal.$(PSUFFIX) : scal.c $(CC) $(CFLAGS) -c $< -o $(@F) dscal.$(SUFFIX) dscal.$(PSUFFIX) : scal.c $(CC) $(CFLAGS) -c $< -o $(@F) qscal.$(SUFFIX) qscal.$(PSUFFIX) : scal.c $(CC) $(CFLAGS) -c $< -o $(@F) cscal.$(SUFFIX) cscal.$(PSUFFIX) : zscal.c $(CC) $(CFLAGS) -c $< -o $(@F) zscal.$(SUFFIX) zscal.$(PSUFFIX) : zscal.c $(CC) $(CFLAGS) -c $< -o $(@F) xscal.$(SUFFIX) xscal.$(PSUFFIX) : zscal.c $(CC) $(CFLAGS) -c $< -o $(@F) csscal.$(SUFFIX) csscal.$(PSUFFIX) : zscal.c $(CC) $(CFLAGS) -c -DSSCAL $< -o $(@F) zdscal.$(SUFFIX) zdscal.$(PSUFFIX) : zscal.c $(CC) $(CFLAGS) -c -DSSCAL $< -o $(@F) xqscal.$(SUFFIX) xqscal.$(PSUFFIX) : zscal.c $(CC) $(CFLAGS) -c -DSSCAL $< -o $(@F) scopy.$(SUFFIX) scopy.$(PSUFFIX) : copy.c $(CC) $(CFLAGS) -c $< -o $(@F) dcopy.$(SUFFIX) dcopy.$(PSUFFIX) : copy.c $(CC) $(CFLAGS) -c $< -o $(@F) qcopy.$(SUFFIX) qcopy.$(PSUFFIX) : copy.c $(CC) $(CFLAGS) -c $< -o $(@F) ccopy.$(SUFFIX) ccopy.$(PSUFFIX) : copy.c $(CC) $(CFLAGS) -c $< -o $(@F) zcopy.$(SUFFIX) zcopy.$(PSUFFIX) : copy.c $(CC) $(CFLAGS) -c $< -o $(@F) xcopy.$(SUFFIX) xcopy.$(PSUFFIX) : copy.c $(CC) $(CFLAGS) -c $< -o $(@F) sswap.$(SUFFIX) sswap.$(PSUFFIX) : swap.c $(CC) $(CFLAGS) -c $< -o $(@F) dswap.$(SUFFIX) dswap.$(PSUFFIX) : swap.c $(CC) $(CFLAGS) -c $< -o $(@F) qswap.$(SUFFIX) qswap.$(PSUFFIX) : swap.c $(CC) $(CFLAGS) -c $< -o $(@F) cswap.$(SUFFIX) cswap.$(PSUFFIX) : zswap.c $(CC) $(CFLAGS) -c $< -o $(@F) zswap.$(SUFFIX) zswap.$(PSUFFIX) : zswap.c $(CC) $(CFLAGS) -c $< -o $(@F) xswap.$(SUFFIX) xswap.$(PSUFFIX) : zswap.c $(CC) $(CFLAGS) -c $< -o $(@F) sger.$(SUFFIX) sger.$(PSUFFIX) : ger.c $(CC) -c $(CFLAGS) $< -o $(@F) dger.$(SUFFIX) dger.$(PSUFFIX) : ger.c $(CC) -c $(CFLAGS) $< -o $(@F) qger.$(SUFFIX) qger.$(PSUFFIX) : ger.c $(CC) -c $(CFLAGS) $< -o $(@F) cgeru.$(SUFFIX) cgeru.$(PSUFFIX) : zger.c $(CC) -c $(CFLAGS) -UCONJ $< 
-o $(@F) cgerc.$(SUFFIX) cgerc.$(PSUFFIX) : zger.c $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) zgeru.$(SUFFIX) zgeru.$(PSUFFIX) : zger.c $(CC) -c $(CFLAGS) -UCONJ $< -o $(@F) zgerc.$(SUFFIX) zgerc.$(PSUFFIX) : zger.c $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) xgeru.$(SUFFIX) xgeru.$(PSUFFIX) : zger.c $(CC) -c $(CFLAGS) -UCONJ $< -o $(@F) xgerc.$(SUFFIX) xgerc.$(PSUFFIX) : zger.c $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) ifndef USE_NETLIB_GEMV sgemv.$(SUFFIX) sgemv.$(PSUFFIX): gemv.c $(CC) -c $(CFLAGS) -o $(@F) $< dgemv.$(SUFFIX) dgemv.$(PSUFFIX): gemv.c $(CC) -c $(CFLAGS) -o $(@F) $< else sgemv.$(SUFFIX) sgemv.$(PSUFFIX): netlib/sgemv.f $(FC) -c $(FFLAGS) -o $(@F) $< dgemv.$(SUFFIX) dgemv.$(PSUFFIX): netlib/dgemv.f $(FC) -c $(FFLAGS) -o $(@F) $< endif qgemv.$(SUFFIX) qgemv.$(PSUFFIX): gemv.c $(CC) -c $(CFLAGS) -o $(@F) $< ifndef USE_NETLIB_GEMV cgemv.$(SUFFIX) cgemv.$(PSUFFIX): zgemv.c $(CC) -c $(CFLAGS) -o $(@F) $< zgemv.$(SUFFIX) zgemv.$(PSUFFIX): zgemv.c $(CC) -c $(CFLAGS) -o $(@F) $< else cgemv.$(SUFFIX) cgemv.$(PSUFFIX): netlib/cgemv.f $(FC) -c $(FFLAGS) -o $(@F) $< zgemv.$(SUFFIX) zgemv.$(PSUFFIX): netlib/zgemv.f $(FC) -c $(FFLAGS) -o $(@F) $< endif xgemv.$(SUFFIX) xgemv.$(PSUFFIX): zgemv.c $(CC) -c $(CFLAGS) -o $(@F) $< strsv.$(SUFFIX) strsv.$(PSUFFIX) : trsv.c $(CC) -c $(CFLAGS) $< -o $(@F) dtrsv.$(SUFFIX) dtrsv.$(PSUFFIX) : trsv.c $(CC) -c $(CFLAGS) $< -o $(@F) qtrsv.$(SUFFIX) qtrsv.$(PSUFFIX) : trsv.c $(CC) -c $(CFLAGS) $< -o $(@F) ctrsv.$(SUFFIX) ctrsv.$(PSUFFIX) : ztrsv.c $(CC) -c $(CFLAGS) $< -o $(@F) ztrsv.$(SUFFIX) ztrsv.$(PSUFFIX) : ztrsv.c $(CC) -c $(CFLAGS) $< -o $(@F) xtrsv.$(SUFFIX) xtrsv.$(PSUFFIX) : ztrsv.c $(CC) -c $(CFLAGS) $< -o $(@F) strmv.$(SUFFIX) strmv.$(PSUFFIX) : trmv.c $(CC) -c $(CFLAGS) $< -o $(@F) dtrmv.$(SUFFIX) dtrmv.$(PSUFFIX) : trmv.c $(CC) -c $(CFLAGS) $< -o $(@F) qtrmv.$(SUFFIX) qtrmv.$(PSUFFIX) : trmv.c $(CC) -c $(CFLAGS) $< -o $(@F) ctrmv.$(SUFFIX) ctrmv.$(PSUFFIX) : ztrmv.c $(CC) -c $(CFLAGS) $< -o $(@F) ztrmv.$(SUFFIX) ztrmv.$(PSUFFIX) : ztrmv.c $(CC) -c $(CFLAGS) $< -o $(@F) xtrmv.$(SUFFIX) xtrmv.$(PSUFFIX) : ztrmv.c $(CC) -c $(CFLAGS) $< -o $(@F) ssymv.$(SUFFIX) ssymv.$(PSUFFIX) : symv.c $(CC) -c $(CFLAGS) $< -o $(@F) dsymv.$(SUFFIX) dsymv.$(PSUFFIX) : symv.c $(CC) -c $(CFLAGS) $< -o $(@F) qsymv.$(SUFFIX) qsymv.$(PSUFFIX) : symv.c $(CC) -c $(CFLAGS) $< -o $(@F) csymv.$(SUFFIX) csymv.$(PSUFFIX) : zsymv.c $(CC) -c $(CFLAGS) $< -o $(@F) zsymv.$(SUFFIX) zsymv.$(PSUFFIX) : zsymv.c $(CC) -c $(CFLAGS) $< -o $(@F) xsymv.$(SUFFIX) xsymv.$(PSUFFIX) : zsymv.c $(CC) -c $(CFLAGS) $< -o $(@F) ssyr.$(SUFFIX) ssyr.$(PSUFFIX) : syr.c $(CC) -c $(CFLAGS) $< -o $(@F) dsyr.$(SUFFIX) dsyr.$(PSUFFIX) : syr.c $(CC) -c $(CFLAGS) $< -o $(@F) qsyr.$(SUFFIX) qsyr.$(PSUFFIX) : syr.c $(CC) -c $(CFLAGS) $< -o $(@F) csyr.$(SUFFIX) csyr.$(PSUFFIX) : zsyr.c $(CC) -c $(CFLAGS) $< -o $(@F) zsyr.$(SUFFIX) zsyr.$(PSUFFIX) : zsyr.c $(CC) -c $(CFLAGS) $< -o $(@F) xsyr.$(SUFFIX) xsyr.$(PSUFFIX) : zsyr.c $(CC) -c $(CFLAGS) $< -o $(@F) ssyr2.$(SUFFIX) ssyr2.$(PSUFFIX) : syr2.c $(CC) -c $(CFLAGS) $< -o $(@F) dsyr2.$(SUFFIX) dsyr2.$(PSUFFIX) : syr2.c $(CC) -c $(CFLAGS) $< -o $(@F) qsyr2.$(SUFFIX) qsyr2.$(PSUFFIX) : syr2.c $(CC) -c $(CFLAGS) $< -o $(@F) csyr2.$(SUFFIX) csyr2.$(PSUFFIX) : zsyr2.c $(CC) -c $(CFLAGS) $< -o $(@F) zsyr2.$(SUFFIX) zsyr2.$(PSUFFIX) : zsyr2.c $(CC) -c $(CFLAGS) $< -o $(@F) xsyr2.$(SUFFIX) xsyr2.$(PSUFFIX) : zsyr2.c $(CC) -c $(CFLAGS) $< -o $(@F) sgbmv.$(SUFFIX) sgbmv.$(PSUFFIX): gbmv.c $(CC) -c $(CFLAGS) -o $(@F) $< dgbmv.$(SUFFIX) dgbmv.$(PSUFFIX): gbmv.c $(CC) -c $(CFLAGS) -o 
$(@F) $< qgbmv.$(SUFFIX) qgbmv.$(PSUFFIX): gbmv.c $(CC) -c $(CFLAGS) -o $(@F) $< cgbmv.$(SUFFIX) cgbmv.$(PSUFFIX): zgbmv.c $(CC) -c $(CFLAGS) -o $(@F) $< zgbmv.$(SUFFIX) zgbmv.$(PSUFFIX): zgbmv.c $(CC) -c $(CFLAGS) -o $(@F) $< xgbmv.$(SUFFIX) xgbmv.$(PSUFFIX): zgbmv.c $(CC) -c $(CFLAGS) -o $(@F) $< ssbmv.$(SUFFIX) ssbmv.$(PSUFFIX) : sbmv.c $(CC) -c $(CFLAGS) $< -o $(@F) dsbmv.$(SUFFIX) dsbmv.$(PSUFFIX) : sbmv.c $(CC) -c $(CFLAGS) $< -o $(@F) qsbmv.$(SUFFIX) qsbmv.$(PSUFFIX) : sbmv.c $(CC) -c $(CFLAGS) $< -o $(@F) csbmv.$(SUFFIX) csbmv.$(PSUFFIX) : zsbmv.c $(CC) -c $(CFLAGS) $< -o $(@F) zsbmv.$(SUFFIX) zsbmv.$(PSUFFIX) : zsbmv.c $(CC) -c $(CFLAGS) $< -o $(@F) xsbmv.$(SUFFIX) xsbmv.$(PSUFFIX) : zsbmv.c $(CC) -c $(CFLAGS) $< -o $(@F) sspmv.$(SUFFIX) sspmv.$(PSUFFIX) : spmv.c $(CC) -c $(CFLAGS) $< -o $(@F) dspmv.$(SUFFIX) dspmv.$(PSUFFIX) : spmv.c $(CC) -c $(CFLAGS) $< -o $(@F) qspmv.$(SUFFIX) qspmv.$(PSUFFIX) : spmv.c $(CC) -c $(CFLAGS) $< -o $(@F) cspmv.$(SUFFIX) cspmv.$(PSUFFIX) : zspmv.c $(CC) -c $(CFLAGS) $< -o $(@F) zspmv.$(SUFFIX) zspmv.$(PSUFFIX) : zspmv.c $(CC) -c $(CFLAGS) $< -o $(@F) xspmv.$(SUFFIX) xspmv.$(PSUFFIX) : zspmv.c $(CC) -c $(CFLAGS) $< -o $(@F) sspr.$(SUFFIX) sspr.$(PSUFFIX) : spr.c $(CC) -c $(CFLAGS) $< -o $(@F) dspr.$(SUFFIX) dspr.$(PSUFFIX) : spr.c $(CC) -c $(CFLAGS) $< -o $(@F) qspr.$(SUFFIX) qspr.$(PSUFFIX) : spr.c $(CC) -c $(CFLAGS) $< -o $(@F) cspr.$(SUFFIX) cspr.$(PSUFFIX) : zspr.c $(CC) -c $(CFLAGS) $< -o $(@F) zspr.$(SUFFIX) zspr.$(PSUFFIX) : zspr.c $(CC) -c $(CFLAGS) $< -o $(@F) xspr.$(SUFFIX) xspr.$(PSUFFIX) : zspr.c $(CC) -c $(CFLAGS) $< -o $(@F) sspr2.$(SUFFIX) sspr2.$(PSUFFIX) : spr2.c $(CC) -c $(CFLAGS) $< -o $(@F) dspr2.$(SUFFIX) dspr2.$(PSUFFIX) : spr2.c $(CC) -c $(CFLAGS) $< -o $(@F) qspr2.$(SUFFIX) qspr2.$(PSUFFIX) : spr2.c $(CC) -c $(CFLAGS) $< -o $(@F) cspr2.$(SUFFIX) cspr2.$(PSUFFIX) : zspr2.c $(CC) -c $(CFLAGS) $< -o $(@F) zspr2.$(SUFFIX) zspr2.$(PSUFFIX) : zspr2.c $(CC) -c $(CFLAGS) $< -o $(@F) xspr2.$(SUFFIX) xspr2.$(PSUFFIX) : zspr2.c $(CC) -c $(CFLAGS) $< -o $(@F) stbmv.$(SUFFIX) stbmv.$(PSUFFIX) : tbmv.c $(CC) -c $(CFLAGS) $< -o $(@F) dtbmv.$(SUFFIX) dtbmv.$(PSUFFIX) : tbmv.c $(CC) -c $(CFLAGS) $< -o $(@F) qtbmv.$(SUFFIX) qtbmv.$(PSUFFIX) : tbmv.c $(CC) -c $(CFLAGS) $< -o $(@F) ctbmv.$(SUFFIX) ctbmv.$(PSUFFIX) : ztbmv.c $(CC) -c $(CFLAGS) $< -o $(@F) ztbmv.$(SUFFIX) ztbmv.$(PSUFFIX) : ztbmv.c $(CC) -c $(CFLAGS) $< -o $(@F) xtbmv.$(SUFFIX) xtbmv.$(PSUFFIX) : ztbmv.c $(CC) -c $(CFLAGS) $< -o $(@F) stbsv.$(SUFFIX) stbsv.$(PSUFFIX) : tbsv.c $(CC) -c $(CFLAGS) $< -o $(@F) dtbsv.$(SUFFIX) dtbsv.$(PSUFFIX) : tbsv.c $(CC) -c $(CFLAGS) $< -o $(@F) qtbsv.$(SUFFIX) qtbsv.$(PSUFFIX) : tbsv.c $(CC) -c $(CFLAGS) $< -o $(@F) ctbsv.$(SUFFIX) ctbsv.$(PSUFFIX) : ztbsv.c $(CC) -c $(CFLAGS) $< -o $(@F) ztbsv.$(SUFFIX) ztbsv.$(PSUFFIX) : ztbsv.c $(CC) -c $(CFLAGS) $< -o $(@F) xtbsv.$(SUFFIX) xtbsv.$(PSUFFIX) : ztbsv.c $(CC) -c $(CFLAGS) $< -o $(@F) stpsv.$(SUFFIX) stpsv.$(PSUFFIX) : tpsv.c $(CC) -c $(CFLAGS) $< -o $(@F) dtpsv.$(SUFFIX) dtpsv.$(PSUFFIX) : tpsv.c $(CC) -c $(CFLAGS) $< -o $(@F) qtpsv.$(SUFFIX) qtpsv.$(PSUFFIX) : tpsv.c $(CC) -c $(CFLAGS) $< -o $(@F) ctpsv.$(SUFFIX) ctpsv.$(PSUFFIX) : ztpsv.c $(CC) -c $(CFLAGS) $< -o $(@F) ztpsv.$(SUFFIX) ztpsv.$(PSUFFIX) : ztpsv.c $(CC) -c $(CFLAGS) $< -o $(@F) xtpsv.$(SUFFIX) xtpsv.$(PSUFFIX) : ztpsv.c $(CC) -c $(CFLAGS) $< -o $(@F) stpmv.$(SUFFIX) stpmv.$(PSUFFIX) : tpmv.c $(CC) -c $(CFLAGS) $< -o $(@F) dtpmv.$(SUFFIX) dtpmv.$(PSUFFIX) : tpmv.c $(CC) -c $(CFLAGS) $< -o $(@F) qtpmv.$(SUFFIX) qtpmv.$(PSUFFIX) : 
tpmv.c $(CC) -c $(CFLAGS) $< -o $(@F) ctpmv.$(SUFFIX) ctpmv.$(PSUFFIX) : ztpmv.c $(CC) -c $(CFLAGS) $< -o $(@F) ztpmv.$(SUFFIX) ztpmv.$(PSUFFIX) : ztpmv.c $(CC) -c $(CFLAGS) $< -o $(@F) xtpmv.$(SUFFIX) xtpmv.$(PSUFFIX) : ztpmv.c $(CC) -c $(CFLAGS) $< -o $(@F) chemv.$(SUFFIX) chemv.$(PSUFFIX) : zhemv.c $(CC) -c $(CFLAGS) $< -o $(@F) zhemv.$(SUFFIX) zhemv.$(PSUFFIX) : zhemv.c $(CC) -c $(CFLAGS) $< -o $(@F) xhemv.$(SUFFIX) xhemv.$(PSUFFIX) : zhemv.c $(CC) -c $(CFLAGS) $< -o $(@F) chbmv.$(SUFFIX) chbmv.$(PSUFFIX) : zhbmv.c $(CC) -c $(CFLAGS) $< -o $(@F) zhbmv.$(SUFFIX) zhbmv.$(PSUFFIX) : zhbmv.c $(CC) -c $(CFLAGS) $< -o $(@F) xhbmv.$(SUFFIX) xhbmv.$(PSUFFIX) : zhbmv.c $(CC) -c $(CFLAGS) $< -o $(@F) cher.$(SUFFIX) cher.$(PSUFFIX) : zher.c $(CC) -c $(CFLAGS) $< -o $(@F) zher.$(SUFFIX) zher.$(PSUFFIX) : zher.c $(CC) -c $(CFLAGS) $< -o $(@F) xher.$(SUFFIX) xher.$(PSUFFIX) : zher.c $(CC) -c $(CFLAGS) $< -o $(@F) cher2.$(SUFFIX) cher2.$(PSUFFIX) : zher2.c $(CC) -c $(CFLAGS) $< -o $(@F) zher2.$(SUFFIX) zher2.$(PSUFFIX) : zher2.c $(CC) -c $(CFLAGS) $< -o $(@F) xher2.$(SUFFIX) xher2.$(PSUFFIX) : zher2.c $(CC) -c $(CFLAGS) $< -o $(@F) chpmv.$(SUFFIX) chpmv.$(PSUFFIX) : zhpmv.c $(CC) -c $(CFLAGS) $< -o $(@F) zhpmv.$(SUFFIX) zhpmv.$(PSUFFIX) : zhpmv.c $(CC) -c $(CFLAGS) $< -o $(@F) xhpmv.$(SUFFIX) xhpmv.$(PSUFFIX) : zhpmv.c $(CC) -c $(CFLAGS) $< -o $(@F) chpr.$(SUFFIX) chpr.$(PSUFFIX) : zhpr.c $(CC) -c $(CFLAGS) $< -o $(@F) zhpr.$(SUFFIX) zhpr.$(PSUFFIX) : zhpr.c $(CC) -c $(CFLAGS) $< -o $(@F) xhpr.$(SUFFIX) xhpr.$(PSUFFIX) : zhpr.c $(CC) -c $(CFLAGS) $< -o $(@F) chpr2.$(SUFFIX) chpr2.$(PSUFFIX) : zhpr2.c $(CC) -c $(CFLAGS) $< -o $(@F) zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c $(CC) -c $(CFLAGS) $< -o $(@F) xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c $(CC) -c $(CFLAGS) $< -o $(@F) sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) dgemm.$(SUFFIX) dgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) qgemm.$(SUFFIX) qgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) cgemm.$(SUFFIX) cgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) zgemm.$(SUFFIX) zgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) xgemm.$(SUFFIX) xgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c $(CC) -c $(CFLAGS) $< -o $(@F) dsymm.$(SUFFIX) dsymm.$(PSUFFIX) : symm.c $(CC) -c $(CFLAGS) $< -o $(@F) qsymm.$(SUFFIX) qsymm.$(PSUFFIX) : symm.c $(CC) -c $(CFLAGS) $< -o $(@F) csymm.$(SUFFIX) csymm.$(PSUFFIX) : symm.c $(CC) -c $(CFLAGS) $< -o $(@F) zsymm.$(SUFFIX) zsymm.$(PSUFFIX) : symm.c $(CC) -c $(CFLAGS) $< -o $(@F) xsymm.$(SUFFIX) xsymm.$(PSUFFIX) : symm.c $(CC) -c $(CFLAGS) $< -o $(@F) strmm.$(SUFFIX) strmm.$(PSUFFIX) : trsm.c $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) dtrmm.$(SUFFIX) dtrmm.$(PSUFFIX) : trsm.c $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) qtrmm.$(SUFFIX) qtrmm.$(PSUFFIX) : trsm.c $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) ctrmm.$(SUFFIX) ctrmm.$(PSUFFIX) : trsm.c $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) ztrmm.$(SUFFIX) ztrmm.$(PSUFFIX) : trsm.c $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) xtrmm.$(SUFFIX) xtrmm.$(PSUFFIX) : trsm.c $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) strsm.$(SUFFIX) strsm.$(PSUFFIX) : trsm.c $(CC) -c $(CFLAGS) $< -o $(@F) dtrsm.$(SUFFIX) dtrsm.$(PSUFFIX) : trsm.c $(CC) -c $(CFLAGS) $< -o $(@F) qtrsm.$(SUFFIX) qtrsm.$(PSUFFIX) : trsm.c $(CC) -c $(CFLAGS) $< -o $(@F) ctrsm.$(SUFFIX) ctrsm.$(PSUFFIX) : trsm.c $(CC) -c $(CFLAGS) $< -o $(@F) ztrsm.$(SUFFIX) ztrsm.$(PSUFFIX) : trsm.c 
$(CC) -c $(CFLAGS) $< -o $(@F) xtrsm.$(SUFFIX) xtrsm.$(PSUFFIX) : trsm.c $(CC) -c $(CFLAGS) $< -o $(@F) ssyrk.$(SUFFIX) ssyrk.$(PSUFFIX) : syrk.c $(CC) -c $(CFLAGS) $< -o $(@F) dsyrk.$(SUFFIX) dsyrk.$(PSUFFIX) : syrk.c $(CC) -c $(CFLAGS) $< -o $(@F) qsyrk.$(SUFFIX) qsyrk.$(PSUFFIX) : syrk.c $(CC) -c $(CFLAGS) $< -o $(@F) csyrk.$(SUFFIX) csyrk.$(PSUFFIX) : syrk.c $(CC) -c $(CFLAGS) $< -o $(@F) zsyrk.$(SUFFIX) zsyrk.$(PSUFFIX) : syrk.c $(CC) -c $(CFLAGS) $< -o $(@F) xsyrk.$(SUFFIX) xsyrk.$(PSUFFIX) : syrk.c $(CC) -c $(CFLAGS) $< -o $(@F) ssyr2k.$(SUFFIX) ssyr2k.$(PSUFFIX) : syr2k.c $(CC) -c $(CFLAGS) $< -o $(@F) dsyr2k.$(SUFFIX) dsyr2k.$(PSUFFIX) : syr2k.c $(CC) -c $(CFLAGS) $< -o $(@F) qsyr2k.$(SUFFIX) qsyr2k.$(PSUFFIX) : syr2k.c $(CC) -c $(CFLAGS) $< -o $(@F) csyr2k.$(SUFFIX) csyr2k.$(PSUFFIX) : syr2k.c $(CC) -c $(CFLAGS) $< -o $(@F) zsyr2k.$(SUFFIX) zsyr2k.$(PSUFFIX) : syr2k.c $(CC) -c $(CFLAGS) $< -o $(@F) xsyr2k.$(SUFFIX) xsyr2k.$(PSUFFIX) : syr2k.c $(CC) -c $(CFLAGS) $< -o $(@F) chemm.$(SUFFIX) chemm.$(PSUFFIX) : symm.c $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) zhemm.$(SUFFIX) zhemm.$(PSUFFIX) : symm.c $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) xhemm.$(SUFFIX) xhemm.$(PSUFFIX) : symm.c $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) cherk.$(SUFFIX) cherk.$(PSUFFIX) : syrk.c $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) zherk.$(SUFFIX) zherk.$(PSUFFIX) : syrk.c $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) xherk.$(SUFFIX) xherk.$(PSUFFIX) : syrk.c $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) cher2k.$(SUFFIX) cher2k.$(PSUFFIX) : syr2k.c $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) zher2k.$(SUFFIX) zher2k.$(PSUFFIX) : syr2k.c $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) xher2k.$(SUFFIX) xher2k.$(PSUFFIX) : syr2k.c $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) cgemm3m.$(SUFFIX) cgemm3m.$(PSUFFIX) : gemm.c $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) zgemm3m.$(SUFFIX) zgemm3m.$(PSUFFIX) : gemm.c $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) xgemm3m.$(SUFFIX) xgemm3m.$(PSUFFIX) : gemm.c $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) csymm3m.$(SUFFIX) csymm3m.$(PSUFFIX) : symm.c $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) zsymm3m.$(SUFFIX) zsymm3m.$(PSUFFIX) : symm.c $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) xsymm3m.$(SUFFIX) xsymm3m.$(PSUFFIX) : symm.c $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) chemm3m.$(SUFFIX) chemm3m.$(PSUFFIX) : symm.c $(CC) -c $(CFLAGS) -DGEMM3M -DHEMM $< -o $(@F) zhemm3m.$(SUFFIX) zhemm3m.$(PSUFFIX) : symm.c $(CC) -c $(CFLAGS) -DGEMM3M -DHEMM $< -o $(@F) xhemm3m.$(SUFFIX) xhemm3m.$(PSUFFIX) : symm.c $(CC) -c $(CFLAGS) -DGEMM3M -DHEMM $< -o $(@F) cblas_isamax.$(SUFFIX) cblas_isamax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) cblas_idamax.$(SUFFIX) cblas_idamax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) cblas_idmax.$(SUFFIX) cblas_idmax.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) cblas_ismin.$(SUFFIX) cblas_ismin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) cblas_idmin.$(SUFFIX) cblas_idmin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_dasum.$(SUFFIX) 
cblas_dasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_cdotu.$(SUFFIX) cblas_cdotu.$(PSUFFIX) : zdot.c $(CC) $(CFLAGS) -DCBLAS -c -UCONJ $< -o $(@F) cblas_cdotc.$(SUFFIX) cblas_cdotc.$(PSUFFIX) : zdot.c $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) cblas_zdotu.$(SUFFIX) cblas_zdotu.$(PSUFFIX) : zdot.c $(CC) $(CFLAGS) -DCBLAS -c -UCONJ $< -o $(@F) cblas_zdotc.$(SUFFIX) cblas_zdotc.$(PSUFFIX) : zdot.c $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) cblas_cdotu_sub.$(SUFFIX) cblas_cdotu_sub.$(PSUFFIX) : zdot.c $(CC) $(CFLAGS) -DCBLAS -DFORCE_USE_STACK -c -UCONJ $< -o $(@F) cblas_cdotc_sub.$(SUFFIX) cblas_cdotc_sub.$(PSUFFIX) : zdot.c $(CC) $(CFLAGS) -DCBLAS -DFORCE_USE_STACK -c -DCONJ $< -o $(@F) cblas_zdotu_sub.$(SUFFIX) cblas_zdotu_sub.$(PSUFFIX) : zdot.c $(CC) $(CFLAGS) -DCBLAS -DFORCE_USE_STACK -c -UCONJ $< -o $(@F) cblas_zdotc_sub.$(SUFFIX) cblas_zdotc_sub.$(PSUFFIX) : zdot.c $(CC) $(CFLAGS) -DCBLAS -DFORCE_USE_STACK -c -DCONJ $< -o $(@F) cblas_snrm2.$(SUFFIX) cblas_snrm2.$(PSUFFIX) : nrm2.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_dnrm2.$(SUFFIX) cblas_dnrm2.$(PSUFFIX) : nrm2.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_scnrm2.$(SUFFIX) cblas_scnrm2.$(PSUFFIX) : nrm2.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_dznrm2.$(SUFFIX) cblas_dznrm2.$(PSUFFIX) : nrm2.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_saxpy.$(SUFFIX) cblas_saxpy.$(PSUFFIX) : axpy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_scopy.$(SUFFIX) cblas_scopy.$(PSUFFIX) : copy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_dcopy.$(SUFFIX) cblas_dcopy.$(PSUFFIX) : copy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_ccopy.$(SUFFIX) cblas_ccopy.$(PSUFFIX) : copy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_zcopy.$(SUFFIX) cblas_zcopy.$(PSUFFIX) : copy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_sswap.$(SUFFIX) cblas_sswap.$(PSUFFIX) : swap.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_dswap.$(SUFFIX) cblas_dswap.$(PSUFFIX) : swap.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_cswap.$(SUFFIX) cblas_cswap.$(PSUFFIX) : zswap.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_zswap.$(SUFFIX) cblas_zswap.$(PSUFFIX) : zswap.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_srot.$(SUFFIX) cblas_srot.$(PSUFFIX) : rot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_drot.$(SUFFIX) cblas_drot.$(PSUFFIX) : rot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_drotm.$(SUFFIX) cblas_drotm.$(PSUFFIX): rotm.c $(CC) 
$(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_srotmg.$(SUFFIX) cblas_srotmg.$(PSUFFIX): rotmg.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_drotmg.$(SUFFIX) cblas_drotmg.$(PSUFFIX): rotmg.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_sscal.$(SUFFIX) cblas_sscal.$(PSUFFIX) : scal.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_dscal.$(SUFFIX) cblas_dscal.$(PSUFFIX) : scal.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_cscal.$(SUFFIX) cblas_cscal.$(PSUFFIX) : zscal.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_zscal.$(SUFFIX) cblas_zscal.$(PSUFFIX) : zscal.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) cblas_sgemv.$(SUFFIX) cblas_sgemv.$(PSUFFIX): gemv.c $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< cblas_dgemv.$(SUFFIX) cblas_dgemv.$(PSUFFIX): gemv.c $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< cblas_cgemv.$(SUFFIX) cblas_cgemv.$(PSUFFIX): zgemv.c $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< cblas_zgemv.$(SUFFIX) cblas_zgemv.$(PSUFFIX): zgemv.c $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< cblas_sger.$(SUFFIX) cblas_sger.$(PSUFFIX) : ger.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_dger.$(SUFFIX) cblas_dger.$(PSUFFIX) : ger.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_cgeru.$(SUFFIX) cblas_cgeru.$(PSUFFIX) : zger.c $(CC) -DCBLAS -c $(CFLAGS) -UCONJ $< -o $(@F) cblas_cgerc.$(SUFFIX) cblas_cgerc.$(PSUFFIX) : zger.c $(CC) -DCBLAS -c $(CFLAGS) -DCONJ $< -o $(@F) cblas_zgeru.$(SUFFIX) cblas_zgeru.$(PSUFFIX) : zger.c $(CC) -DCBLAS -c $(CFLAGS) -UCONJ $< -o $(@F) cblas_zgerc.$(SUFFIX) cblas_zgerc.$(PSUFFIX) : zger.c $(CC) -DCBLAS -c $(CFLAGS) -DCONJ $< -o $(@F) cblas_strsv.$(SUFFIX) cblas_strsv.$(PSUFFIX) : trsv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_dtrsv.$(SUFFIX) cblas_dtrsv.$(PSUFFIX) : trsv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_ctrsv.$(SUFFIX) cblas_ctrsv.$(PSUFFIX) : ztrsv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_ztrsv.$(SUFFIX) cblas_ztrsv.$(PSUFFIX) : ztrsv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_strmv.$(SUFFIX) cblas_strmv.$(PSUFFIX) : trmv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_dtrmv.$(SUFFIX) cblas_dtrmv.$(PSUFFIX) : trmv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_ctrmv.$(SUFFIX) cblas_ctrmv.$(PSUFFIX) : ztrmv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_ztrmv.$(SUFFIX) cblas_ztrmv.$(PSUFFIX) : ztrmv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_ssyr.$(SUFFIX) cblas_ssyr.$(PSUFFIX) : syr.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_dsyr.$(SUFFIX) cblas_dsyr.$(PSUFFIX) : syr.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_cher.$(SUFFIX) cblas_cher.$(PSUFFIX) : zher.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_zher.$(SUFFIX) cblas_zher.$(PSUFFIX) : zher.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_ssyr2.$(SUFFIX) cblas_ssyr2.$(PSUFFIX) : syr2.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_dsyr2.$(SUFFIX) cblas_dsyr2.$(PSUFFIX) : syr2.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_cher2.$(SUFFIX) cblas_cher2.$(PSUFFIX) : zher2.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_zher2.$(SUFFIX) cblas_zher2.$(PSUFFIX) : zher2.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_sgbmv.$(SUFFIX) cblas_sgbmv.$(PSUFFIX): gbmv.c $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< cblas_dgbmv.$(SUFFIX) cblas_dgbmv.$(PSUFFIX): gbmv.c $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< cblas_cgbmv.$(SUFFIX) cblas_cgbmv.$(PSUFFIX): zgbmv.c $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< cblas_zgbmv.$(SUFFIX) 
cblas_zgbmv.$(PSUFFIX): zgbmv.c $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< cblas_ssbmv.$(SUFFIX) cblas_ssbmv.$(PSUFFIX) : sbmv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_dsbmv.$(SUFFIX) cblas_dsbmv.$(PSUFFIX) : sbmv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_chbmv.$(SUFFIX) cblas_chbmv.$(PSUFFIX) : zhbmv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_zhbmv.$(SUFFIX) cblas_zhbmv.$(PSUFFIX) : zhbmv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_sspmv.$(SUFFIX) cblas_sspmv.$(PSUFFIX) : spmv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_dspmv.$(SUFFIX) cblas_dspmv.$(PSUFFIX) : spmv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_sspr.$(SUFFIX) cblas_sspr.$(PSUFFIX) : spr.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_dspr.$(SUFFIX) cblas_dspr.$(PSUFFIX) : spr.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_chpr.$(SUFFIX) cblas_chpr.$(PSUFFIX) : zhpr.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_zhpr.$(SUFFIX) cblas_zhpr.$(PSUFFIX) : zhpr.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_sspr2.$(SUFFIX) cblas_sspr2.$(PSUFFIX) : spr2.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_dspr2.$(SUFFIX) cblas_dspr2.$(PSUFFIX) : spr2.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_chpr2.$(SUFFIX) cblas_chpr2.$(PSUFFIX) : zhpr2.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_zhpr2.$(SUFFIX) cblas_zhpr2.$(PSUFFIX) : zhpr2.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_stbmv.$(SUFFIX) cblas_stbmv.$(PSUFFIX) : tbmv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_dtbmv.$(SUFFIX) cblas_dtbmv.$(PSUFFIX) : tbmv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_ctbmv.$(SUFFIX) cblas_ctbmv.$(PSUFFIX) : ztbmv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_ztbmv.$(SUFFIX) cblas_ztbmv.$(PSUFFIX) : ztbmv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_stbsv.$(SUFFIX) cblas_stbsv.$(PSUFFIX) : tbsv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_dtbsv.$(SUFFIX) cblas_dtbsv.$(PSUFFIX) : tbsv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_ctbsv.$(SUFFIX) cblas_ctbsv.$(PSUFFIX) : ztbsv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_ztbsv.$(SUFFIX) cblas_ztbsv.$(PSUFFIX) : ztbsv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_stpmv.$(SUFFIX) cblas_stpmv.$(PSUFFIX) : tpmv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_dtpmv.$(SUFFIX) cblas_dtpmv.$(PSUFFIX) : tpmv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_ctpmv.$(SUFFIX) cblas_ctpmv.$(PSUFFIX) : ztpmv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_ztpmv.$(SUFFIX) cblas_ztpmv.$(PSUFFIX) : ztpmv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_chpmv.$(SUFFIX) cblas_chpmv.$(PSUFFIX) : zhpmv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_zhpmv.$(SUFFIX) cblas_zhpmv.$(PSUFFIX) : zhpmv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_stpsv.$(SUFFIX) cblas_stpsv.$(PSUFFIX) : tpsv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_dtpsv.$(SUFFIX) cblas_dtpsv.$(PSUFFIX) : tpsv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_ctpsv.$(SUFFIX) cblas_ctpsv.$(PSUFFIX) : ztpsv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_ztpsv.$(SUFFIX) cblas_ztpsv.$(PSUFFIX) : ztpsv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_ssymv.$(SUFFIX) cblas_ssymv.$(PSUFFIX) : symv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_dsymv.$(SUFFIX) cblas_dsymv.$(PSUFFIX) : symv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_chemv.$(SUFFIX) cblas_chemv.$(PSUFFIX) : zhemv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_zhemv.$(SUFFIX) cblas_zhemv.$(PSUFFIX) : zhemv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_dgemm.$(SUFFIX) 
cblas_dgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_cgemm.$(SUFFIX) cblas_cgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_zgemm.$(SUFFIX) cblas_zgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_ssymm.$(SUFFIX) cblas_ssymm.$(PSUFFIX) : symm.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_dsymm.$(SUFFIX) cblas_dsymm.$(PSUFFIX) : symm.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_csymm.$(SUFFIX) cblas_csymm.$(PSUFFIX) : symm.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_zsymm.$(SUFFIX) cblas_zsymm.$(PSUFFIX) : symm.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_ssyrk.$(SUFFIX) cblas_ssyrk.$(PSUFFIX) : syrk.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_dsyrk.$(SUFFIX) cblas_dsyrk.$(PSUFFIX) : syrk.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_csyrk.$(SUFFIX) cblas_csyrk.$(PSUFFIX) : syrk.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_zsyrk.$(SUFFIX) cblas_zsyrk.$(PSUFFIX) : syrk.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_ssyr2k.$(SUFFIX) cblas_ssyr2k.$(PSUFFIX) : syr2k.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_dsyr2k.$(SUFFIX) cblas_dsyr2k.$(PSUFFIX) : syr2k.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_csyr2k.$(SUFFIX) cblas_csyr2k.$(PSUFFIX) : syr2k.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_zsyr2k.$(SUFFIX) cblas_zsyr2k.$(PSUFFIX) : syr2k.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_strmm.$(SUFFIX) cblas_strmm.$(PSUFFIX) : trsm.c $(CC) -DCBLAS -c $(CFLAGS) -DTRMM $< -o $(@F) cblas_dtrmm.$(SUFFIX) cblas_dtrmm.$(PSUFFIX) : trsm.c $(CC) -DCBLAS -c $(CFLAGS) -DTRMM $< -o $(@F) cblas_ctrmm.$(SUFFIX) cblas_ctrmm.$(PSUFFIX) : trsm.c $(CC) -DCBLAS -c $(CFLAGS) -DTRMM $< -o $(@F) cblas_ztrmm.$(SUFFIX) cblas_ztrmm.$(PSUFFIX) : trsm.c $(CC) -DCBLAS -c $(CFLAGS) -DTRMM $< -o $(@F) cblas_strsm.$(SUFFIX) cblas_strsm.$(PSUFFIX) : trsm.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_dtrsm.$(SUFFIX) cblas_dtrsm.$(PSUFFIX) : trsm.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_ctrsm.$(SUFFIX) cblas_ctrsm.$(PSUFFIX) : trsm.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_ztrsm.$(SUFFIX) cblas_ztrsm.$(PSUFFIX) : trsm.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) cblas_chemm.$(SUFFIX) cblas_chemm.$(PSUFFIX) : symm.c $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) cblas_zhemm.$(SUFFIX) cblas_zhemm.$(PSUFFIX) : symm.c $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) cblas_cherk.$(SUFFIX) cblas_cherk.$(PSUFFIX) : syrk.c $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) cblas_zherk.$(SUFFIX) cblas_zherk.$(PSUFFIX) : syrk.c $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) cblas_cher2k.$(SUFFIX) cblas_cher2k.$(PSUFFIX) : syr2k.c $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) cblas_zher2k.$(SUFFIX) cblas_zher2k.$(PSUFFIX) : syr2k.c $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) cblas_cgemm3m.$(SUFFIX) cblas_cgemm3m.$(PSUFFIX) : gemm.c $(CC) -DCBLAS -c $(CFLAGS) -DGEMM3M $< -o $(@F) cblas_zgemm3m.$(SUFFIX) cblas_zgemm3m.$(PSUFFIX) : gemm.c $(CC) -DCBLAS -c $(CFLAGS) -DGEMM3M $< -o $(@F) sgetf2.$(SUFFIX) sgetf2.$(PSUFFIX) : lapack/getf2.c $(CC) -c $(CFLAGS) $< -o $(@F) dgetf2.$(SUFFIX) dgetf2.$(PSUFFIX) : lapack/getf2.c $(CC) -c $(CFLAGS) $< -o $(@F) qgetf2.$(SUFFIX) qgetf2.$(PSUFFIX) : getf2.c $(CC) -c $(CFLAGS) $< -o $(@F) cgetf2.$(SUFFIX) cgetf2.$(PSUFFIX) : lapack/zgetf2.c $(CC) -c $(CFLAGS) $< -o $(@F) zgetf2.$(SUFFIX) zgetf2.$(PSUFFIX) : lapack/zgetf2.c $(CC) -c $(CFLAGS) $< -o $(@F) xgetf2.$(SUFFIX) xgetf2.$(PSUFFIX) : zgetf2.c $(CC) -c $(CFLAGS) $< -o $(@F) sgetrf.$(SUFFIX) sgetrf.$(PSUFFIX) : lapack/getrf.c $(CC) 
-c $(CFLAGS) $< -o $(@F) dgetrf.$(SUFFIX) dgetrf.$(PSUFFIX) : lapack/getrf.c $(CC) -c $(CFLAGS) $< -o $(@F) qgetrf.$(SUFFIX) qgetrf.$(PSUFFIX) : getrf.c $(CC) -c $(CFLAGS) $< -o $(@F) cgetrf.$(SUFFIX) cgetrf.$(PSUFFIX) : lapack/zgetrf.c $(CC) -c $(CFLAGS) $< -o $(@F) zgetrf.$(SUFFIX) zgetrf.$(PSUFFIX) : lapack/zgetrf.c $(CC) -c $(CFLAGS) $< -o $(@F) xgetrf.$(SUFFIX) xgetrf.$(PSUFFIX) : zgetrf.c $(CC) -c $(CFLAGS) $< -o $(@F) slauu2.$(SUFFIX) slauu2.$(PSUFFIX) : lapack/lauu2.c $(CC) -c $(CFLAGS) $< -o $(@F) dlauu2.$(SUFFIX) dlauu2.$(PSUFFIX) : lapack/lauu2.c $(CC) -c $(CFLAGS) $< -o $(@F) qlauu2.$(SUFFIX) qlauu2.$(PSUFFIX) : lauu2.c $(CC) -c $(CFLAGS) $< -o $(@F) clauu2.$(SUFFIX) clauu2.$(PSUFFIX) : lapack/zlauu2.c $(CC) -c $(CFLAGS) $< -o $(@F) zlauu2.$(SUFFIX) zlauu2.$(PSUFFIX) : lapack/zlauu2.c $(CC) -c $(CFLAGS) $< -o $(@F) xlauu2.$(SUFFIX) xlauu2.$(PSUFFIX) : zlauu2.c $(CC) -c $(CFLAGS) $< -o $(@F) slauum.$(SUFFIX) slauum.$(PSUFFIX) : lapack/lauum.c $(CC) -c $(CFLAGS) $< -o $(@F) dlauum.$(SUFFIX) dlauum.$(PSUFFIX) : lapack/lauum.c $(CC) -c $(CFLAGS) $< -o $(@F) qlauum.$(SUFFIX) qlauum.$(PSUFFIX) : lauum.c $(CC) -c $(CFLAGS) $< -o $(@F) clauum.$(SUFFIX) clauum.$(PSUFFIX) : lapack/zlauum.c $(CC) -c $(CFLAGS) $< -o $(@F) zlauum.$(SUFFIX) zlauum.$(PSUFFIX) : lapack/zlauum.c $(CC) -c $(CFLAGS) $< -o $(@F) xlauum.$(SUFFIX) xlauum.$(PSUFFIX) : zlauum.c $(CC) -c $(CFLAGS) $< -o $(@F) spotf2.$(SUFFIX) spotf2.$(PSUFFIX) : lapack/potf2.c $(CC) -c $(CFLAGS) $< -o $(@F) dpotf2.$(SUFFIX) dpotf2.$(PSUFFIX) : lapack/potf2.c $(CC) -c $(CFLAGS) $< -o $(@F) qpotf2.$(SUFFIX) qpotf2.$(PSUFFIX) : potf2.c $(CC) -c $(CFLAGS) $< -o $(@F) cpotf2.$(SUFFIX) cpotf2.$(PSUFFIX) : lapack/zpotf2.c $(CC) -c $(CFLAGS) $< -o $(@F) zpotf2.$(SUFFIX) zpotf2.$(PSUFFIX) : lapack/zpotf2.c $(CC) -c $(CFLAGS) $< -o $(@F) xpotf2.$(SUFFIX) xpotf2.$(PSUFFIX) : zpotf2.c $(CC) -c $(CFLAGS) $< -o $(@F) spotrf.$(SUFFIX) spotrf.$(PSUFFIX) : lapack/potrf.c $(CC) -c $(CFLAGS) $< -o $(@F) dpotrf.$(SUFFIX) dpotrf.$(PSUFFIX) : lapack/potrf.c $(CC) -c $(CFLAGS) $< -o $(@F) qpotrf.$(SUFFIX) qpotrf.$(PSUFFIX) : potrf.c $(CC) -c $(CFLAGS) $< -o $(@F) cpotrf.$(SUFFIX) cpotrf.$(PSUFFIX) : lapack/zpotrf.c $(CC) -c $(CFLAGS) $< -o $(@F) zpotrf.$(SUFFIX) zpotrf.$(PSUFFIX) : lapack/zpotrf.c $(CC) -c $(CFLAGS) $< -o $(@F) xpotrf.$(SUFFIX) xpotrf.$(PSUFFIX) : zpotrf.c $(CC) -c $(CFLAGS) $< -o $(@F) strti2.$(SUFFIX) strti2.$(PSUFFIX) : lapack/trti2.c $(CC) -c $(CFLAGS) $< -o $(@F) dtrti2.$(SUFFIX) dtrti2.$(PSUFFIX) : lapack/trti2.c $(CC) -c $(CFLAGS) $< -o $(@F) qtrti2.$(SUFFIX) qtrti2.$(PSUFFIX) : trti2.c $(CC) -c $(CFLAGS) $< -o $(@F) ctrti2.$(SUFFIX) ctrti2.$(PSUFFIX) : lapack/ztrti2.c $(CC) -c $(CFLAGS) $< -o $(@F) ztrti2.$(SUFFIX) ztrti2.$(PSUFFIX) : lapack/ztrti2.c $(CC) -c $(CFLAGS) $< -o $(@F) xtrti2.$(SUFFIX) xtrti2.$(PSUFFIX) : ztrti2.c $(CC) -c $(CFLAGS) $< -o $(@F) strtri.$(SUFFIX) strtri.$(PSUFFIX) : lapack/trtri.c $(CC) -c $(CFLAGS) $< -o $(@F) dtrtri.$(SUFFIX) dtrtri.$(PSUFFIX) : lapack/trtri.c $(CC) -c $(CFLAGS) $< -o $(@F) qtrtri.$(SUFFIX) qtrtri.$(PSUFFIX) : trtri.c $(CC) -c $(CFLAGS) $< -o $(@F) ctrtri.$(SUFFIX) ctrtri.$(PSUFFIX) : lapack/ztrtri.c $(CC) -c $(CFLAGS) $< -o $(@F) ztrtri.$(SUFFIX) ztrtri.$(PSUFFIX) : lapack/ztrtri.c $(CC) -c $(CFLAGS) $< -o $(@F) xtrtri.$(SUFFIX) xtrtri.$(PSUFFIX) : ztrtri.c $(CC) -c $(CFLAGS) $< -o $(@F) slaswp.$(SUFFIX) slaswp.$(PSUFFIX) : lapack/laswp.c $(CC) -c $(CFLAGS) $< -o $(@F) dlaswp.$(SUFFIX) dlaswp.$(PSUFFIX) : lapack/laswp.c $(CC) -c $(CFLAGS) $< -o $(@F) qlaswp.$(SUFFIX) qlaswp.$(PSUFFIX) : 
laswp.c $(CC) -c $(CFLAGS) $< -o $(@F) claswp.$(SUFFIX) claswp.$(PSUFFIX) : lapack/zlaswp.c $(CC) -c $(CFLAGS) $< -o $(@F) zlaswp.$(SUFFIX) zlaswp.$(PSUFFIX) : lapack/zlaswp.c $(CC) -c $(CFLAGS) $< -o $(@F) xlaswp.$(SUFFIX) xlaswp.$(PSUFFIX) : zlaswp.c $(CC) -c $(CFLAGS) $< -o $(@F) sgetrs.$(SUFFIX) sgetrs.$(PSUFFIX) : lapack/getrs.c $(CC) -c $(CFLAGS) $< -o $(@F) dgetrs.$(SUFFIX) dgetrs.$(PSUFFIX) : lapack/getrs.c $(CC) -c $(CFLAGS) $< -o $(@F) qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : getrs.c $(CC) -c $(CFLAGS) $< -o $(@F) cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : lapack/zgetrs.c $(CC) -c $(CFLAGS) $< -o $(@F) zgetrs.$(SUFFIX) zgetrs.$(PSUFFIX) : lapack/zgetrs.c $(CC) -c $(CFLAGS) $< -o $(@F) xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : zgetrs.c $(CC) -c $(CFLAGS) $< -o $(@F) sgesv.$(SUFFIX) sgesv.$(PSUFFIX) : lapack/gesv.c $(CC) -c $(CFLAGS) $< -o $(@F) dgesv.$(SUFFIX) dgesv.$(PSUFFIX) : lapack/gesv.c $(CC) -c $(CFLAGS) $< -o $(@F) qgesv.$(SUFFIX) qgesv.$(PSUFFIX) : gesv.c $(CC) -c $(CFLAGS) $< -o $(@F) cgesv.$(SUFFIX) cgesv.$(PSUFFIX) : lapack/gesv.c $(CC) -c $(CFLAGS) $< -o $(@F) zgesv.$(SUFFIX) zgesv.$(PSUFFIX) : lapack/gesv.c $(CC) -c $(CFLAGS) $< -o $(@F) xgesv.$(SUFFIX) xgesv.$(PSUFFIX) : gesv.c $(CC) -c $(CFLAGS) $< -o $(@F) spotri.$(SUFFIX) spotri.$(PSUFFIX) : lapack/potri.c $(CC) -c $(CFLAGS) $< -o $(@F) dpotri.$(SUFFIX) dpotri.$(PSUFFIX) : lapack/potri.c $(CC) -c $(CFLAGS) $< -o $(@F) qpotri.$(SUFFIX) qpotri.$(PSUFFIX) : potri.c $(CC) -c $(CFLAGS) $< -o $(@F) cpotri.$(SUFFIX) cpotri.$(PSUFFIX) : lapack/zpotri.c $(CC) -c $(CFLAGS) $< -o $(@F) zpotri.$(SUFFIX) zpotri.$(PSUFFIX) : lapack/zpotri.c $(CC) -c $(CFLAGS) $< -o $(@F) xpotri.$(SUFFIX) xpotri.$(PSUFFIX) : zpotri.c $(CC) -c $(CFLAGS) $< -o $(@F) slarf.$(SUFFIX) slarf.$(PSUFFIX) : larf.c $(CC) -c $(CFLAGS) $< -o $(@F) dlarf.$(SUFFIX) dlarf.$(PSUFFIX) : larf.c $(CC) -c $(CFLAGS) $< -o $(@F) qlarf.$(SUFFIX) qlarf.$(PSUFFIX) : larf.c $(CC) -c $(CFLAGS) $< -o $(@F) clarf.$(SUFFIX) clarf.$(PSUFFIX) : larf.c $(CC) -c $(CFLAGS) $< -o $(@F) zlarf.$(SUFFIX) zlarf.$(PSUFFIX) : larf.c $(CC) -c $(CFLAGS) $< -o $(@F) xlarf.$(SUFFIX) xlarf.$(PSUFFIX) : larf.c $(CC) -c $(CFLAGS) $< -o $(@F) ############# BLAS EXTENSIONS ##################################### daxpby.$(SUFFIX) daxpby.$(PSUFFIX) : axpby.c $(CC) $(CFLAGS) -c $< -o $(@F) cblas_daxpby.$(SUFFIX) cblas_daxpby.$(PSUFFIX) : axpby.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) saxpby.$(SUFFIX) saxpby.$(PSUFFIX) : axpby.c $(CC) $(CFLAGS) -c $< -o $(@F) cblas_saxpby.$(SUFFIX) cblas_saxpby.$(PSUFFIX) : axpby.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) zaxpby.$(SUFFIX) zaxpby.$(PSUFFIX) : zaxpby.c $(CC) $(CFLAGS) -c $< -o $(@F) cblas_zaxpby.$(SUFFIX) cblas_zaxpby.$(PSUFFIX) : zaxpby.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) caxpby.$(SUFFIX) caxpby.$(PSUFFIX) : zaxpby.c $(CC) $(CFLAGS) -c $< -o $(@F) cblas_caxpby.$(SUFFIX) cblas_caxpby.$(PSUFFIX) : zaxpby.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) domatcopy.$(SUFFIX) domatcopy.$(PSUFFIX) : omatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) cblas_domatcopy.$(SUFFIX) cblas_domatcopy.$(PSUFFIX) : omatcopy.c $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) somatcopy.$(SUFFIX) somatcopy.$(PSUFFIX) : omatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) cblas_somatcopy.$(SUFFIX) cblas_somatcopy.$(PSUFFIX) : omatcopy.c $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) comatcopy.$(SUFFIX) comatcopy.$(PSUFFIX) : zomatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) cblas_comatcopy.$(SUFFIX) cblas_comatcopy.$(PSUFFIX) : zomatcopy.c $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) zomatcopy.$(SUFFIX) zomatcopy.$(PSUFFIX) : zomatcopy.c $(CC) 
-c $(CFLAGS) $< -o $(@F) cblas_zomatcopy.$(SUFFIX) cblas_zomatcopy.$(PSUFFIX) : zomatcopy.c $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) dimatcopy.$(SUFFIX) dimatcopy.$(PSUFFIX) : imatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) cblas_dimatcopy.$(SUFFIX) cblas_dimatcopy.$(PSUFFIX) : imatcopy.c $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) simatcopy.$(SUFFIX) simatcopy.$(PSUFFIX) : imatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) cblas_simatcopy.$(SUFFIX) cblas_simatcopy.$(PSUFFIX) : imatcopy.c $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) cimatcopy.$(SUFFIX) cimatcopy.$(PSUFFIX) : zimatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) cblas_cimatcopy.$(SUFFIX) cblas_cimatcopy.$(PSUFFIX) : zimatcopy.c $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) zimatcopy.$(SUFFIX) zimatcopy.$(PSUFFIX) : zimatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) cblas_zimatcopy.$(SUFFIX) cblas_zimatcopy.$(PSUFFIX) : zimatcopy.c $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) sgeadd.$(SUFFIX) sgeadd.$(PSUFFIX) : geadd.c $(CC) -c $(CFLAGS) $< -o $(@F) dgeadd.$(SUFFIX) dgeadd.$(PSUFFIX) : geadd.c $(CC) -c $(CFLAGS) $< -o $(@F) cgeadd.$(SUFFIX) cgeadd.$(PSUFFIX) : zgeadd.c $(CC) -c $(CFLAGS) $< -o $(@F) zgeadd.$(SUFFIX) zgeadd.$(PSUFFIX) : zgeadd.c $(CC) -c $(CFLAGS) $< -o $(@F) cblas_sgeadd.$(SUFFIX) cblas_sgeadd.$(PSUFFIX) : geadd.c $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) cblas_dgeadd.$(SUFFIX) cblas_dgeadd.$(PSUFFIX) : geadd.c $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) cblas_cgeadd.$(SUFFIX) cblas_cgeadd.$(PSUFFIX) : zgeadd.c $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) cblas_zgeadd.$(SUFFIX) cblas_zgeadd.$(PSUFFIX) : zgeadd.c $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) cblas_xerbla.$(SUFFIX) cblas_xerbla.$(PSUFFIX) : xerbla.c $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) OpenBLAS-0.2.20/interface/asum.c000066400000000000000000000064471313527062700162300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef CBLAS FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ BLASLONG n = *N; BLASLONG incx = *INCX; FLOATRET ret; PRINT_DEBUG_NAME; if (n <= 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); ret = (FLOATRET)ASUM_K(n, x, incx); FUNCTION_PROFILE_END(COMPSIZE, n, n); IDEBUG_END; return ret; } #else FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ FLOAT ret; PRINT_DEBUG_CNAME; if (n <= 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); ret = ASUM_K(n, x, incx); FUNCTION_PROFILE_END(COMPSIZE, n, n); IDEBUG_END; return ret; } #endif OpenBLAS-0.2.20/interface/axpby.c000066400000000000000000000046541313527062700164040ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
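/*
 * Editor's note (not part of the original source): asum.c above is compiled once per
 * precision into the Fortran-style NAME entry (e.g. dasum_) and, with -DCBLAS, the
 * CNAME entry (e.g. cblas_dasum). A minimal usage sketch, assuming the standard
 * cblas_dasum prototype from OpenBLAS's cblas.h; the demo program is hypothetical.
 */
#include <stdio.h>
#include <cblas.h>

int main(void) {
    double x[4] = {1.0, -2.0, 3.0, -4.0};
    /* sum of absolute values with unit stride: expected 10.0 */
    double s = cblas_dasum(4, x, 1);
    printf("dasum = %f\n", s);
    return 0;
}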
*****************************************************************************/ /****************************************************************** 2014/06/07 Saar ******************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef CBLAS void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY) { BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; FLOAT alpha = *ALPHA; FLOAT beta = *BETA; #else void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT beta, FLOAT *y, blasint incy) { #endif if (n <= 0) return; FUNCTION_PROFILE_START(); if (incx < 0) x -= (n - 1) * incx; if (incy < 0) y -= (n - 1) * incy; AXPBY_K(n, alpha, x, incx, beta, y, incy); FUNCTION_PROFILE_END(1, 2 * n, 2 * n); return; } OpenBLAS-0.2.20/interface/axpy.c000066400000000000000000000100471313527062700162330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
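/*
 * Editor's note (not part of the original source): axpby.c implements the OpenBLAS
 * extension y := alpha*x + beta*y. A sketch of the CBLAS entry point built from this
 * file with -DCBLAS (cblas_daxpby, matching the CNAME signature above); the demo
 * function and values are illustrative only.
 */
#include <cblas.h>

void axpby_demo(void) {
    double x[3] = {1.0, 2.0, 3.0};
    double y[3] = {10.0, 10.0, 10.0};
    cblas_daxpby(3, 2.0, x, 1, 0.5, y, 1);   /* y = 2*x + 0.5*y -> {7, 9, 11} */
}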
*/ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef CBLAS void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; FLOAT alpha = *ALPHA; #else void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ #endif #ifdef SMP int mode, nthreads; #endif #ifndef CBLAS PRINT_DEBUG_NAME; #else PRINT_DEBUG_CNAME; #endif if (n <= 0) return; if (alpha == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (n - 1) * incx; if (incy < 0) y -= (n - 1) * incy; #ifdef SMP nthreads = num_cpu_avail(1); //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. if (incx == 0 || incy == 0) nthreads = 1; //Temporarily work-around the low performance issue with small imput size & //multithreads. if (n <= 10000) nthreads = 1; if (nthreads == 1) { #endif AXPYU_K(n, 0, 0, alpha, x, incx, y, incy, NULL, 0); #ifdef SMP } else { #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif blas_level1_thread(mode, n, 0, 0, &alpha, x, incx, y, incy, NULL, 0, (void *)AXPYU_K, nthreads); } #endif FUNCTION_PROFILE_END(1, 2 * n, 2 * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/copy.c000066400000000000000000000064011313527062700162230ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
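/*
 * Editor's note (not part of the original source): axpy.c computes y += alpha*x and,
 * as the SMP branch above shows, stays single-threaded for n <= 10000 or when either
 * increment is zero. Minimal sketch of the double-precision CBLAS call; the demo
 * function is hypothetical.
 */
#include <cblas.h>

void axpy_demo(void) {
    double x[4] = {1.0, 1.0, 1.0, 1.0};
    double y[4] = {0.0, 1.0, 2.0, 3.0};
    cblas_daxpy(4, 3.0, x, 1, y, 1);   /* y becomes {3, 4, 5, 6} */
}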
*/ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef CBLAS void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; PRINT_DEBUG_NAME; #else void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ PRINT_DEBUG_CNAME; #endif if (n <= 0) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (n - 1) * incx * COMPSIZE; if (incy < 0) y -= (n - 1) * incy * COMPSIZE; COPY_K(n, x, incx, y, incy); FUNCTION_PROFILE_END(COMPSIZE, COMPSIZE * n, 0); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/create000077500000000000000000000005131313527062700162740ustar00rootroot00000000000000#!/usr/bin/perl $count = 0; foreach (@ARGV) { print "#define\tinterface_", $_, "\t\t", $count, "\n"; $count ++; } print "#ifdef USE_FUNCTABLE\n"; print "#define MAX_PROF_TABLE ", $count, "\n"; print "static char *func_table[] = {\n"; foreach (@ARGV) { print "\"", $_, "\",\n"; } print "};\n"; print "#endif\n"; OpenBLAS-0.2.20/interface/dot.c000066400000000000000000000070361313527062700160440ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
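/*
 * Editor's note (not part of the original source): copy.c forwards directly to COPY_K
 * with no threading, adjusting the start pointers for negative increments as shown
 * above. Illustrative sketch of the CBLAS call, exercising a negative increment:
 */
#include <cblas.h>

void copy_demo(void) {
    double x[3] = {1.0, 2.0, 3.0};
    double y[3];
    cblas_dcopy(3, x, 1, y, -1);   /* y is filled back-to-front: {3, 2, 1} */
}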
*/ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef CBLAS FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; FLOATRET ret; PRINT_DEBUG_NAME; if (n <= 0) return 0.; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (n - 1) * incx; if (incy < 0) y -= (n - 1) * incy; ret = (FLOATRET)DOTU_K(n, x, incx, y, incy); FUNCTION_PROFILE_END(1, 2 * n, 2 * n); IDEBUG_END; return ret; } #else FLOAT CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ FLOAT ret; PRINT_DEBUG_CNAME; if (n <= 0) return 0.; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (n - 1) * incx; if (incy < 0) y -= (n - 1) * incy; ret = DOTU_K(n, x, incx, y, incy); FUNCTION_PROFILE_END(1, 2 * n, 2 * n); IDEBUG_END; return ret; } #endif OpenBLAS-0.2.20/interface/dsdot.c000066400000000000000000000070131313527062700163660ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
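/*
 * Editor's note (not part of the original source): for the real double-precision case
 * dot.c above becomes ddot_/cblas_ddot, forwarding to DOTU_K. Minimal sketch of the
 * CBLAS call; the demo function is hypothetical.
 */
#include <cblas.h>

void dot_demo(void) {
    double x[3] = {1.0, 2.0, 3.0};
    double y[3] = {4.0, 5.0, 6.0};
    double d = cblas_ddot(3, x, 1, y, 1);   /* 1*4 + 2*5 + 3*6 = 32 */
    (void)d;
}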
*/ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef CBLAS double NAME(blasint *N, float *x, blasint *INCX, float *y, blasint *INCY){ BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; double ret = 0.0; PRINT_DEBUG_NAME; if (n <= 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (n - 1) * incx; if (incy < 0) y -= (n - 1) * incy; ret=DSDOT_K(n, x, incx, y, incy); FUNCTION_PROFILE_END(1, n, n); IDEBUG_END; return ret; } #else double CNAME(blasint n, float *x, blasint incx, float *y, blasint incy){ double ret = 0.0; PRINT_DEBUG_CNAME; if (n <= 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (n - 1) * incx; if (incy < 0) y -= (n - 1) * incy; ret=DSDOT_K(n, x, incx, y, incy); FUNCTION_PROFILE_END(1, n, n); IDEBUG_END; return ret; } #endif OpenBLAS-0.2.20/interface/gbmv.c000066400000000000000000000152631313527062700162120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
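/*
 * Editor's note (not part of the original source): dsdot.c takes single-precision
 * inputs but accumulates and returns the dot product in double precision (DSDOT_K).
 * Illustrative sketch of the CBLAS call, assuming the standard cblas_dsdot prototype:
 */
#include <cblas.h>

void dsdot_demo(void) {
    float x[2] = {1.5f, 2.5f};
    float y[2] = {2.0f, 4.0f};
    double d = cblas_dsdot(2, x, 1, y, 1);   /* 1.5*2 + 2.5*4 = 13.0, accumulated in double */
    (void)d;
}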
*/ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QGBMV " #elif defined(DOUBLE) #define ERROR_NAME "DGBMV " #else #define ERROR_NAME "SGBMV " #endif static void (*gbmv[])(BLASLONG, BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { #ifdef XDOUBLE qgbmv_n, qgbmv_t, #elif defined(DOUBLE) dgbmv_n, dgbmv_t, #else sgbmv_n, sgbmv_t, #endif }; #ifdef SMP static int (*gbmv_thread[])(BLASLONG, BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE qgbmv_thread_n, qgbmv_thread_t, #elif defined(DOUBLE) dgbmv_thread_n, dgbmv_thread_t, #else sgbmv_thread_n, sgbmv_thread_t, #endif }; #endif #ifndef CBLAS void NAME(char *TRANS, blasint *M, blasint *N, blasint *KU, blasint *KL, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char trans = *TRANS; blasint m = *M; blasint n = *N; blasint ku = *KU; blasint kl = *KL; blasint lda = *LDA; blasint incx = *INCX; blasint incy = *INCY; FLOAT *buffer; #ifdef SMP int nthreads; #endif FLOAT alpha = *ALPHA; FLOAT beta = *BETA; blasint info; blasint lenx, leny; blasint i; PRINT_DEBUG_NAME; TOUPPER(trans); info = 0; i = -1; if (trans == 'N') i = 0; if (trans == 'T') i = 1; if (trans == 'R') i = 0; if (trans == 'C') i = 1; if (incy == 0) info = 13; if (incx == 0) info = 10; if (lda < kl + ku + 1) info = 8; if (kl < 0) info = 5; if (ku < 0) info = 4; if (n < 0) info = 3; if (m < 0) info = 2; if (i < 0) info = 1; trans = i; if (info != 0){ BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasint n, blasint ku, blasint kl, FLOAT alpha, FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT beta, FLOAT *y, blasint incy){ FLOAT *buffer; blasint lenx, leny, info, t; int trans; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; trans = -1; info = 0; if (order == CblasColMajor) { if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 0; if (TransA == CblasConjTrans) trans = 1; info = -1; if (incy == 0) info = 13; if (incx == 0) info = 10; if (lda < kl + ku + 1) info = 8; if (kl < 0) info = 5; if (ku < 0) info = 4; if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (order == CblasRowMajor) { if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 1; if (TransA == CblasConjTrans) trans = 0; info = -1; t = n; n = m; m = t; t = ku; ku = kl; kl = t; if (incy == 0) info = 13; if (incx == 0) info = 10; if (lda < kl + ku + 1) info = 8; if (kl < 0) info = 5; if (ku < 0) info = 4; if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if ((m==0) || (n==0)) return; lenx = n; leny = m; if (trans) lenx = m; if (trans) leny = n; if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (lenx-1)*incx; if (incy < 0) y -= (leny-1)*incy; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (gbmv[(int)trans])(m, n, kl, ku, alpha, a, lda, x, incx, y, incy, buffer); #ifdef SMP } else { 
(gbmv_thread[(int)trans])(m, n, kl, ku, alpha, a, lda, x, incx, y, incy, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(1, m * n / 2 + n, m * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/geadd.c000066400000000000000000000105601313527062700163160ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #if defined(DOUBLE) #define ERROR_NAME "DGEADD " #else #define ERROR_NAME "SGEADD " #endif #ifndef CBLAS void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *BETA, FLOAT *c, blasint *LDC) { blasint m = *M; blasint n = *N; blasint lda = *LDA; blasint ldc = *LDC; FLOAT alpha = *ALPHA; FLOAT beta = *BETA; blasint info; PRINT_DEBUG_NAME; info = 0; if (lda < MAX(1, m)) info = 6; if (ldc < MAX(1, m)) info = 8; if (n < 0) info = 2; if (m < 0) info = 1; if (info != 0){ BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME( enum CBLAS_ORDER order, blasint m, blasint n, FLOAT alpha, FLOAT *a, blasint lda, FLOAT beta, FLOAT *c, blasint ldc) { /* void CNAME(enum CBLAS_ORDER order, blasint m, blasint n, FLOAT alpha, FLOAT *a, blasint lda, FLOAT beta, FLOAT *c, blasint ldc){ */ blasint info, t; PRINT_DEBUG_CNAME; info = 0; if (order == CblasColMajor) { info = -1; if (ldc < MAX(1, m)) info = 8; if (lda < MAX(1, m)) info = 5; if (n < 0) info = 2; if (m < 0) info = 1; } if (order == CblasRowMajor) { info = -1; t = n; n = m; m = t; if (ldc < MAX(1, m)) info = 8; if (lda < MAX(1, m)) info = 5; if (n < 0) info = 2; if (m < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if ((m==0) || (n==0)) return; IDEBUG_START; FUNCTION_PROFILE_START(); GEADD_K(m,n,alpha, a, lda, beta, c, ldc); FUNCTION_PROFILE_END(1, 2* m * n , 2 * m * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/gemm.c000066400000000000000000000304231313527062700161770ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
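/*
 * Editor's note (not part of the original source): geadd.c implements the OpenBLAS
 * extension C := alpha*A + beta*C on m-by-n matrices. Sketch of the CBLAS entry point
 * whose signature appears in the CNAME above (cblas_dgeadd, built per the Makefile
 * rules earlier in this archive); the demo values are illustrative.
 */
#include <cblas.h>

void geadd_demo(void) {
    /* 2x2 column-major matrices, lda = ldc = 2 */
    double a[4] = {1.0, 2.0, 3.0, 4.0};
    double c[4] = {1.0, 1.0, 1.0, 1.0};
    cblas_dgeadd(CblasColMajor, 2, 2, 2.0, a, 2, 1.0, c, 2);   /* c = 2*a + c -> {3, 5, 7, 9} */
}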
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef COMPLEX #ifdef XDOUBLE #define ERROR_NAME "QGEMM " #elif defined(DOUBLE) #define ERROR_NAME "DGEMM " #else #define ERROR_NAME "SGEMM " #endif #else #ifndef GEMM3M #ifdef XDOUBLE #define ERROR_NAME "XGEMM " #elif defined(DOUBLE) #define ERROR_NAME "ZGEMM " #else #define ERROR_NAME "CGEMM " #endif #else #ifdef XDOUBLE #define ERROR_NAME "XGEMM3M " #elif defined(DOUBLE) #define ERROR_NAME "ZGEMM3M " #else #define ERROR_NAME "CGEMM3M " #endif #endif #endif #ifndef GEMM_MULTITHREAD_THRESHOLD #define GEMM_MULTITHREAD_THRESHOLD 4 #endif static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { #ifndef GEMM3M GEMM_NN, GEMM_TN, GEMM_RN, GEMM_CN, GEMM_NT, GEMM_TT, GEMM_RT, GEMM_CT, GEMM_NR, GEMM_TR, GEMM_RR, GEMM_CR, GEMM_NC, GEMM_TC, GEMM_RC, GEMM_CC, #if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) GEMM_THREAD_NN, GEMM_THREAD_TN, GEMM_THREAD_RN, GEMM_THREAD_CN, GEMM_THREAD_NT, GEMM_THREAD_TT, GEMM_THREAD_RT, GEMM_THREAD_CT, GEMM_THREAD_NR, GEMM_THREAD_TR, GEMM_THREAD_RR, GEMM_THREAD_CR, GEMM_THREAD_NC, GEMM_THREAD_TC, GEMM_THREAD_RC, GEMM_THREAD_CC, #endif #else GEMM3M_NN, GEMM3M_TN, GEMM3M_RN, GEMM3M_CN, GEMM3M_NT, GEMM3M_TT, GEMM3M_RT, GEMM3M_CT, GEMM3M_NR, GEMM3M_TR, GEMM3M_RR, GEMM3M_CR, GEMM3M_NC, GEMM3M_TC, GEMM3M_RC, GEMM3M_CC, #if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) GEMM3M_THREAD_NN, GEMM3M_THREAD_TN, GEMM3M_THREAD_RN, GEMM3M_THREAD_CN, GEMM3M_THREAD_NT, GEMM3M_THREAD_TT, GEMM3M_THREAD_RT, GEMM3M_THREAD_CT, GEMM3M_THREAD_NR, GEMM3M_THREAD_TR, GEMM3M_THREAD_RR, GEMM3M_THREAD_CR, GEMM3M_THREAD_NC, GEMM3M_THREAD_TC, GEMM3M_THREAD_RC, GEMM3M_THREAD_CC, #endif #endif }; #ifndef CBLAS void NAME(char *TRANSA, char *TRANSB, blasint *M, blasint *N, blasint *K, FLOAT *alpha, FLOAT *a, blasint *ldA, FLOAT *b, blasint *ldB, FLOAT *beta, FLOAT *c, blasint *ldC){ blas_arg_t args; int transa, transb, nrowa, nrowb; blasint info; char transA, transB; FLOAT *buffer; FLOAT *sa, *sb; #ifdef SMP int nthreads_max; int nthreads_avail; double MNK; #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif #if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3) int nodes; #endif PRINT_DEBUG_NAME; args.m = *M; args.n = *N; args.k = *K; args.a = (void *)a; args.b = (void *)b; args.c = (void *)c; args.lda = *ldA; args.ldb = *ldB; args.ldc = *ldC; args.alpha = (void *)alpha; args.beta = (void *)beta; transA = *TRANSA; transB = *TRANSB; TOUPPER(transA); TOUPPER(transB); transa = -1; transb = -1; if (transA == 'N') transa = 0; if (transA == 'T') transa = 1; #ifndef COMPLEX if (transA == 'R') transa = 0; if (transA == 'C') transa = 1; #else if (transA == 'R') transa = 2; if (transA == 'C') transa = 3; #endif if (transB == 'N') transb = 0; if (transB == 'T') transb = 1; #ifndef COMPLEX if (transB == 'R') transb = 0; 
if (transB == 'C') transb = 1; #else if (transB == 'R') transb = 2; if (transB == 'C') transb = 3; #endif nrowa = args.m; if (transa & 1) nrowa = args.k; nrowb = args.k; if (transb & 1) nrowb = args.n; info = 0; if (args.ldc < args.m) info = 13; if (args.ldb < nrowb) info = 10; if (args.lda < nrowa) info = 8; if (args.k < 0) info = 5; if (args.n < 0) info = 4; if (args.m < 0) info = 3; if (transb < 0) info = 2; if (transa < 0) info = 1; if (info){ BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m, blasint n, blasint k, #ifndef COMPLEX FLOAT alpha, #else FLOAT *alpha, #endif FLOAT *a, blasint lda, FLOAT *b, blasint ldb, #ifndef COMPLEX FLOAT beta, #else FLOAT *beta, #endif FLOAT *c, blasint ldc) { blas_arg_t args; int transa, transb; blasint nrowa, nrowb, info; XFLOAT *buffer; XFLOAT *sa, *sb; #ifdef SMP int nthreads_max; int nthreads_avail; double MNK; #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif #if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3) int nodes; #endif PRINT_DEBUG_CNAME; #ifndef COMPLEX args.alpha = (void *)α args.beta = (void *)β #else args.alpha = (void *)alpha; args.beta = (void *)beta; #endif transa = -1; transb = -1; info = 0; if (order == CblasColMajor) { args.m = m; args.n = n; args.k = k; args.a = (void *)a; args.b = (void *)b; args.c = (void *)c; args.lda = lda; args.ldb = ldb; args.ldc = ldc; if (TransA == CblasNoTrans) transa = 0; if (TransA == CblasTrans) transa = 1; #ifndef COMPLEX if (TransA == CblasConjNoTrans) transa = 0; if (TransA == CblasConjTrans) transa = 1; #else if (TransA == CblasConjNoTrans) transa = 2; if (TransA == CblasConjTrans) transa = 3; #endif if (TransB == CblasNoTrans) transb = 0; if (TransB == CblasTrans) transb = 1; #ifndef COMPLEX if (TransB == CblasConjNoTrans) transb = 0; if (TransB == CblasConjTrans) transb = 1; #else if (TransB == CblasConjNoTrans) transb = 2; if (TransB == CblasConjTrans) transb = 3; #endif nrowa = args.m; if (transa & 1) nrowa = args.k; nrowb = args.k; if (transb & 1) nrowb = args.n; info = -1; if (args.ldc < args.m) info = 13; if (args.ldb < nrowb) info = 10; if (args.lda < nrowa) info = 8; if (args.k < 0) info = 5; if (args.n < 0) info = 4; if (args.m < 0) info = 3; if (transb < 0) info = 2; if (transa < 0) info = 1; } if (order == CblasRowMajor) { args.m = n; args.n = m; args.k = k; args.a = (void *)b; args.b = (void *)a; args.c = (void *)c; args.lda = ldb; args.ldb = lda; args.ldc = ldc; if (TransB == CblasNoTrans) transa = 0; if (TransB == CblasTrans) transa = 1; #ifndef COMPLEX if (TransB == CblasConjNoTrans) transa = 0; if (TransB == CblasConjTrans) transa = 1; #else if (TransB == CblasConjNoTrans) transa = 2; if (TransB == CblasConjTrans) transa = 3; #endif if (TransA == CblasNoTrans) transb = 0; if (TransA == CblasTrans) transb = 1; #ifndef COMPLEX if (TransA == CblasConjNoTrans) transb = 0; if (TransA == CblasConjTrans) transb = 1; #else if (TransA == CblasConjNoTrans) transb = 2; if (TransA == CblasConjTrans) transb = 3; #endif nrowa = args.m; if (transa & 1) nrowa = args.k; nrowb = args.k; if (transb & 1) nrowb = args.n; info = -1; if 
(args.ldc < args.m) info = 13; if (args.ldb < nrowb) info = 10; if (args.lda < nrowa) info = 8; if (args.k < 0) info = 5; if (args.n < 0) info = 4; if (args.m < 0) info = 3; if (transb < 0) info = 2; if (transa < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if ((args.m == 0) || (args.n == 0)) return; #if 0 fprintf(stderr, "m = %4d n = %d k = %d lda = %4d ldb = %4d ldc = %4d\n", args.m, args.n, args.k, args.lda, args.ldb, args.ldc); #endif IDEBUG_START; FUNCTION_PROFILE_START(); buffer = (XFLOAT *)blas_memory_alloc(0); sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A); sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #ifdef SMP mode |= (transa << BLAS_TRANSA_SHIFT); mode |= (transb << BLAS_TRANSB_SHIFT); nthreads_max = num_cpu_avail(3); nthreads_avail = nthreads_max; #ifndef COMPLEX MNK = (double) args.m * (double) args.n * (double) args.k; if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) nthreads_max = 1; #else MNK = (double) args.m * (double) args.n * (double) args.k; if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) nthreads_max = 1; #endif args.common = NULL; if ( nthreads_max > nthreads_avail ) args.nthreads = nthreads_avail; else args.nthreads = nthreads_max; if (args.nthreads == 1) { #endif (gemm[(transb << 2) | transa])(&args, NULL, NULL, sa, sb, 0); #ifdef SMP } else { #ifndef USE_SIMPLE_THREADED_LEVEL3 #ifndef NO_AFFINITY nodes = get_num_nodes(); if ((nodes > 1) && get_node_equal()) { args.nthreads /= nodes; gemm_thread_mn(mode, &args, NULL, NULL, gemm[16 | (transb << 2) | transa], sa, sb, nodes); } else { #endif (gemm[16 | (transb << 2) | transa])(&args, NULL, NULL, sa, sb, 0); #else GEMM_THREAD(mode, &args, NULL, NULL, gemm[(transb << 2) | transa], sa, sb, args.nthreads); #endif #ifndef USE_SIMPLE_THREADED_LEVEL3 #ifndef NO_AFFINITY } #endif #endif #endif #ifdef SMP } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.k + args.k * args.n + args.m * args.n, 2 * args.m * args.n * args.k); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/gemv.c000066400000000000000000000155001313527062700162070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
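/*
 * Editor's note (not part of the original source): gemm.c above dispatches through the
 * packed (transb << 2) | transa table and only goes multi-threaded once m*n*k clears
 * the GEMM_MULTITHREAD_THRESHOLD test. Minimal sketch of the CBLAS call; the demo
 * function is hypothetical.
 */
#include <cblas.h>

void gemm_demo(void) {
    /* C(2x2) = A(2x3) * B(3x2), row-major storage */
    double A[6] = {1, 2, 3,
                   4, 5, 6};
    double B[6] = {7,  8,
                   9, 10,
                  11, 12};
    double C[4] = {0, 0, 0, 0};
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                2, 2, 3, 1.0, A, 3, B, 2, 0.0, C, 2);
    /* C becomes {58, 64, 139, 154} */
}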
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #include "l1param.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QGEMV " #elif defined(DOUBLE) #define ERROR_NAME "DGEMV " #else #define ERROR_NAME "SGEMV " #endif #ifdef SMP static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE qgemv_thread_n, qgemv_thread_t, #elif defined DOUBLE dgemv_thread_n, dgemv_thread_t, #else sgemv_thread_n, sgemv_thread_t, #endif }; #endif #ifndef CBLAS void NAME(char *TRANS, blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char trans = *TRANS; blasint m = *M; blasint n = *N; blasint lda = *LDA; blasint incx = *INCX; blasint incy = *INCY; FLOAT alpha = *ALPHA; FLOAT beta = *BETA; FLOAT *buffer; int buffer_size; #ifdef SMP int nthreads; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { GEMV_N, GEMV_T, }; blasint info; blasint lenx, leny; blasint i; PRINT_DEBUG_NAME; TOUPPER(trans); info = 0; i = -1; if (trans == 'N') i = 0; if (trans == 'T') i = 1; if (trans == 'R') i = 0; if (trans == 'C') i = 1; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < MAX(1, m)) info = 6; if (n < 0) info = 3; if (m < 0) info = 2; if (i < 0) info = 1; trans = i; if (info != 0){ BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasint n, FLOAT alpha, FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT beta, FLOAT *y, blasint incy){ FLOAT *buffer; blasint lenx, leny; int trans, buffer_size; blasint info, t; #ifdef SMP int nthreads; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { GEMV_N, GEMV_T, }; PRINT_DEBUG_CNAME; trans = -1; info = 0; if (order == CblasColMajor) { if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 0; if (TransA == CblasConjTrans) trans = 1; info = -1; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < MAX(1, m)) info = 6; if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (order == CblasRowMajor) { if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 1; if (TransA == CblasConjTrans) trans = 0; info = -1; t = n; n = m; m = t; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < MAX(1, m)) info = 6; if (n < 
0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif //printf("m=%d, n=%d, trans=%d, incx=%d, incy=%d, alpha=%f, beta=%f\n", m, n, trans, incx, incy, alpha, beta); if ((m==0) || (n==0)) return; lenx = n; leny = m; if (trans) lenx = m; if (trans) leny = n; if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (lenx - 1) * incx; if (incy < 0) y -= (leny - 1) * incy; buffer_size = m + n + 128 / sizeof(FLOAT); #ifdef WINDOWS_ABI buffer_size += 160 / sizeof(FLOAT) ; #endif // for alignment buffer_size = (buffer_size + 3) & ~3; STACK_ALLOC(buffer_size, FLOAT, buffer); #ifdef SMP if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD ) nthreads = 1; else nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (gemv[(int)trans])(m, n, 0, alpha, a, lda, x, incx, y, incy, buffer); #ifdef SMP } else { (gemv_thread[(int)trans])(m, n, alpha, a, lda, x, incx, y, incy, buffer, nthreads); } #endif STACK_FREE(buffer); FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/ger.c000066400000000000000000000126031313527062700160270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
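/*
 * Editor's note (not part of the original source): gemv.c scales y by beta first
 * (SCAL_K), then accumulates alpha*op(A)*x through GEMV_N/GEMV_T using a small
 * stack-allocated buffer. Illustrative sketch of the CBLAS call:
 */
#include <cblas.h>

void gemv_demo(void) {
    /* column-major 2x2 matrix, lda = 2: A = [1 2; 3 4] */
    double A[4] = {1.0, 3.0,    /* column 0 */
                   2.0, 4.0};   /* column 1 */
    double x[2] = {1.0, 1.0};
    double y[2] = {0.0, 0.0};
    cblas_dgemv(CblasColMajor, CblasNoTrans, 2, 2, 1.0, A, 2, x, 1, 0.0, y, 1);
    /* y becomes {3, 7} */
}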
*/ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef SMP #ifdef __64BIT__ #define SMPTEST 1 #endif #endif #ifdef XDOUBLE #define ERROR_NAME "QGER " #elif defined DOUBLE #define ERROR_NAME "DGER " #else #define ERROR_NAME "SGER " #endif #define GER GERU_K #if defined XDOUBLE #define GER_THREAD qger_thread #elif defined DOUBLE #define GER_THREAD dger_thread #else #define GER_THREAD sger_thread #endif #ifndef CBLAS void NAME(blasint *M, blasint *N, FLOAT *Alpha, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a, blasint *LDA){ blasint m = *M; blasint n = *N; FLOAT alpha = *Alpha; blasint incx = *INCX; blasint incy = *INCY; blasint lda = *LDA; FLOAT *buffer; #ifdef SMPTEST int nthreads; #endif blasint info; PRINT_DEBUG_NAME; info = 0; if (lda < MAX(1,m)) info = 9; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (m < 0) info = 1; if (info){ BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, blasint m, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) { FLOAT *buffer; blasint info, t; #ifdef SMPTEST int nthreads; #endif PRINT_DEBUG_CNAME; info = 0; if (order == CblasColMajor) { info = -1; if (lda < MAX(1,m)) info = 9; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (m < 0) info = 1; } if (order == CblasRowMajor) { info = -1; t = n; n = m; m = t; t = incx; incx = incy; incy = t; buffer = x; x = y; y = buffer; if (lda < MAX(1,m)) info = 9; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (m < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif /* Quick return if possible. */ if (m == 0 || n == 0) return; if (alpha == 0.) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incy < 0) y -= (n - 1) * incy; if (incx < 0) x -= (m - 1) * incx; STACK_ALLOC(m, FLOAT, buffer); #ifdef SMPTEST // Threshold chosen so that speed-up is > 1 on a Xeon E5-2630 if(1L * m * n > 2048L * GEMM_MULTITHREAD_THRESHOLD) nthreads = num_cpu_avail(2); else nthreads = 1; if (nthreads == 1) { #endif GER(m, n, 0, alpha, x, incx, y, incy, a, lda, buffer); #ifdef SMPTEST } else { GER_THREAD(m, n, alpha, x, incx, y, incy, a, lda, buffer, nthreads); } #endif STACK_FREE(buffer); FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/imatcopy.c000066400000000000000000000127641313527062700171070ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
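/*
 * Editor's note (not part of the original source): ger.c performs the rank-1 update
 * A += alpha * x * y^T (GERU_K), threading only when m*n exceeds the threshold shown
 * above. Minimal sketch of the CBLAS call; the demo function is hypothetical.
 */
#include <cblas.h>

void ger_demo(void) {
    double x[2] = {1.0, 2.0};
    double y[2] = {3.0, 4.0};
    double A[4] = {0.0, 0.0, 0.0, 0.0};   /* 2x2, column-major, lda = 2 */
    cblas_dger(CblasColMajor, 2, 2, 1.0, x, 1, y, 1, A, 2);
    /* A becomes [3 4; 6 8], stored column-major as {3, 6, 4, 8} */
}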
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /*********************************************************** * 2014-06-10 Saar * 2015-09-07 grisuthedragon ***********************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #if defined(DOUBLE) #define ERROR_NAME "DIMATCOPY" #else #define ERROR_NAME "SIMATCOPY" #endif #define BlasRowMajor 0 #define BlasColMajor 1 #define BlasNoTrans 0 #define BlasTrans 1 #undef malloc #undef free /* Enables the New IMATCOPY code with inplace operation if lda == ldb */ #define NEW_IMATCOPY #ifndef CBLAS void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, blasint *ldb) { char Order, Trans; int order=-1,trans=-1; blasint info = -1; FLOAT *b; size_t msize; Order = *ORDER; Trans = *TRANS; TOUPPER(Order); TOUPPER(Trans); if ( Order == 'C' ) order = BlasColMajor; if ( Order == 'R' ) order = BlasRowMajor; if ( Trans == 'N' ) trans = BlasNoTrans; if ( Trans == 'R' ) trans = BlasNoTrans; if ( Trans == 'T' ) trans = BlasTrans; if ( Trans == 'C' ) trans = BlasTrans; #else void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, FLOAT calpha, FLOAT *a, blasint clda, blasint cldb) { int order=-1,trans=-1; blasint info = -1; FLOAT *b; size_t msize; blasint *lda, *ldb, *rows, *cols; FLOAT *alpha; if ( CORDER == CblasColMajor) order = BlasColMajor; if ( CORDER == CblasRowMajor) order = BlasRowMajor; if ( CTRANS == CblasNoTrans || CTRANS == CblasConjNoTrans) trans = BlasNoTrans; if ( CTRANS == CblasTrans || CTRANS == CblasConjTrans ) trans = BlasTrans; rows = &crows; cols = &ccols; alpha = &calpha; lda = &clda; ldb = &cldb; #endif if ( order == BlasColMajor) { if ( trans == BlasNoTrans && *ldb < *rows ) info = 9; if ( trans == BlasTrans && *ldb < *cols ) info = 9; } if ( order == BlasRowMajor) { if ( trans == BlasNoTrans && *ldb < *cols ) info = 9; if ( trans == BlasTrans && *ldb < *rows ) info = 9; } if ( order == BlasColMajor && *lda < *rows ) info = 7; if ( order == BlasRowMajor && *lda < *cols ) info = 7; if ( *cols <= 0 ) info = 4; if ( *rows <= 0 ) info = 3; if ( trans < 0 ) info = 2; if ( order < 0 ) info = 1; if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #ifdef NEW_IMATCOPY if ( *lda == *ldb ) { if ( order == BlasColMajor ) { if ( trans == BlasNoTrans ) { IMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda ); } else { IMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda ); } } else { if ( trans == BlasNoTrans ) { IMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda ); } else { IMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda ); } } return; } #endif if ( *lda > *ldb ) msize = (*lda) * (*ldb) * sizeof(FLOAT); else 
msize = (*ldb) * (*ldb) * sizeof(FLOAT); b = malloc(msize); if ( b == NULL ) { printf("Memory alloc failed\n"); exit(1); } if ( order == BlasColMajor ) { if ( trans == BlasNoTrans ) { OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *ldb ); OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *ldb, a, *ldb ); } else { OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *ldb ); OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, b, *ldb, a, *ldb ); } } else { if ( trans == BlasNoTrans ) { OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *ldb ); OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *ldb, a, *ldb ); } else { OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *ldb ); OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *ldb, a, *ldb ); } } free(b); return; } OpenBLAS-0.2.20/interface/imax.c000066400000000000000000000104221313527062700162050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
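/*
 * Editor's note (not part of the original source): imatcopy.c scales and optionally
 * transposes a matrix in place, using the NEW_IMATCOPY kernels when lda == ldb and a
 * temporary buffer otherwise. Sketch of the CBLAS entry point matching the CNAME
 * signature above (cblas_dimatcopy); the demo values are illustrative.
 */
#include <cblas.h>

void imatcopy_demo(void) {
    /* transpose and scale a 2x2 row-major matrix in place (lda == ldb == 2) */
    double a[4] = {1.0, 2.0,
                   3.0, 4.0};
    cblas_dimatcopy(CblasRowMajor, CblasTrans, 2, 2, 2.0, a, 2, 2);
    /* a becomes {2, 6, 4, 8}, i.e. 2 * transpose of the original */
}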
*/ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #undef MAX_K #ifdef USE_ABS #ifndef USE_MIN /* ABS & MAX */ #ifndef COMPLEX #ifdef XDOUBLE #define MAX_K IQAMAX_K #elif defined(DOUBLE) #define MAX_K IDAMAX_K #else #define MAX_K ISAMAX_K #endif #else #ifdef XDOUBLE #define MAX_K IXAMAX_K #elif defined(DOUBLE) #define MAX_K IZAMAX_K #else #define MAX_K ICAMAX_K #endif #endif #else /* ABS & MIN */ #ifndef COMPLEX #ifdef XDOUBLE #define MAX_K IQAMIN_K #elif defined(DOUBLE) #define MAX_K IDAMIN_K #else #define MAX_K ISAMIN_K #endif #else #ifdef XDOUBLE #define MAX_K IXAMIN_K #elif defined(DOUBLE) #define MAX_K IZAMIN_K #else #define MAX_K ICAMIN_K #endif #endif #endif #else #ifndef USE_MIN /* MAX */ #ifdef XDOUBLE #define MAX_K IQMAX_K #elif defined(DOUBLE) #define MAX_K IDMAX_K #else #define MAX_K ISMAX_K #endif #else /* MIN */ #ifdef XDOUBLE #define MAX_K IQMIN_K #elif defined(DOUBLE) #define MAX_K IDMIN_K #else #define MAX_K ISMIN_K #endif #endif #endif #ifndef CBLAS blasint NAME(blasint *N, FLOAT *x, blasint *INCX){ BLASLONG n = *N; BLASLONG incx = *INCX; blasint ret; PRINT_DEBUG_NAME; if (n <= 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); ret = (blasint)MAX_K(n, x, incx); if(ret > n) ret=n; FUNCTION_PROFILE_END(COMPSIZE, n, 0); IDEBUG_END; return ret; } #else CBLAS_INDEX CNAME(blasint n, FLOAT *x, blasint incx){ CBLAS_INDEX ret; PRINT_DEBUG_CNAME; if (n <= 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); ret = MAX_K(n, x, incx); if (ret > n) ret=n; if (ret) ret --; FUNCTION_PROFILE_END(COMPSIZE, n, 0); IDEBUG_END; return ret; } #endif OpenBLAS-0.2.20/interface/lapack/000077500000000000000000000000001313527062700163375ustar00rootroot00000000000000OpenBLAS-0.2.20/interface/lapack/fortran/000077500000000000000000000000001313527062700200125ustar00rootroot00000000000000OpenBLAS-0.2.20/interface/lapack/fortran/dlaqr5.f000066400000000000000000001155221313527062700213570ustar00rootroot00000000000000! Copyright (c) 2013-2016, The OpenBLAS Project ! All rights reserved. ! Redistribution and use in source and binary forms, with or without ! modification, are permitted provided that the following conditions are ! met: ! 1. Redistributions of source code must retain the above copyright ! notice, this list of conditions and the following disclaimer. ! 2. Redistributions in binary form must reproduce the above copyright ! notice, this list of conditions and the following disclaimer in ! the documentation and/or other materials provided with the ! distribution. ! 3. Neither the name of the OpenBLAS project nor the names of ! its contributors may be used to endorse or promote products ! derived from this software without specific prior written permission. ! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ! AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ! ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ! LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ! DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ! SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ! OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ! 
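/*
 * Editor's note (not part of the original source): imax.c generates the whole
 * i?amax/i?amin family from one source via the MAX_K macro. As the code above shows,
 * the Fortran entry returns a 1-based index while the CBLAS entry subtracts one, so
 * cblas_idamax is 0-based. Illustrative sketch:
 */
#include <cblas.h>

void imax_demo(void) {
    double x[4] = {1.0, -7.0, 3.0, 5.0};
    CBLAS_INDEX i = cblas_idamax(4, x, 1);   /* largest |x[i]| is |-7.0|, so i == 1 */
    (void)i;
}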
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *> \brief \b DLAQR5 performs a single small-bulge multi-shift QR sweep. * * =========== DOCUMENTATION =========== * * Online html documentation available at * http://www.netlib.org/lapack/explore-html/ * *> \htmlonly *> Download DLAQR5 + dependencies *> *> [TGZ] *> *> [ZIP] *> *> [TXT] *> \endhtmlonly * * Definition: * =========== * * SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, * SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, * LDU, NV, WV, LDWV, NH, WH, LDWH ) * * .. Scalar Arguments .. * INTEGER IHIZ, ILOZ, KACC22, KBOT, KTOP, LDH, LDU, LDV, * $ LDWH, LDWV, LDZ, N, NH, NSHFTS, NV * LOGICAL WANTT, WANTZ * .. * .. Array Arguments .. * DOUBLE PRECISION H( LDH, * ), SI( * ), SR( * ), U( LDU, * ), * $ V( LDV, * ), WH( LDWH, * ), WV( LDWV, * ), * $ Z( LDZ, * ) * .. * * *> \par Purpose: * ============= *> *> \verbatim *> *> DLAQR5, called by DLAQR0, performs a *> single small-bulge multi-shift QR sweep. *> \endverbatim * * Arguments: * ========== * *> \param[in] WANTT *> \verbatim *> WANTT is logical scalar *> WANTT = .true. if the quasi-triangular Schur factor *> is being computed. WANTT is set to .false. otherwise. *> \endverbatim *> *> \param[in] WANTZ *> \verbatim *> WANTZ is logical scalar *> WANTZ = .true. if the orthogonal Schur factor is being *> computed. WANTZ is set to .false. otherwise. *> \endverbatim *> *> \param[in] KACC22 *> \verbatim *> KACC22 is integer with value 0, 1, or 2. *> Specifies the computation mode of far-from-diagonal *> orthogonal updates. *> = 0: DLAQR5 does not accumulate reflections and does not *> use matrix-matrix multiply to update far-from-diagonal *> matrix entries. *> = 1: DLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. *> = 2: DLAQR5 accumulates reflections, uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries, *> and takes advantage of 2-by-2 block structure during *> matrix multiplies. *> \endverbatim *> *> \param[in] N *> \verbatim *> N is integer scalar *> N is the order of the Hessenberg matrix H upon which this *> subroutine operates. *> \endverbatim *> *> \param[in] KTOP *> \verbatim *> KTOP is integer scalar *> \endverbatim *> *> \param[in] KBOT *> \verbatim *> KBOT is integer scalar *> These are the first and last rows and columns of an *> isolated diagonal block upon which the QR sweep is to be *> applied. It is assumed without a check that *> either KTOP = 1 or H(KTOP,KTOP-1) = 0 *> and *> either KBOT = N or H(KBOT+1,KBOT) = 0. *> \endverbatim *> *> \param[in] NSHFTS *> \verbatim *> NSHFTS is integer scalar *> NSHFTS gives the number of simultaneous shifts. NSHFTS *> must be positive and even. *> \endverbatim *> *> \param[in,out] SR *> \verbatim *> SR is DOUBLE PRECISION array of size (NSHFTS) *> \endverbatim *> *> \param[in,out] SI *> \verbatim *> SI is DOUBLE PRECISION array of size (NSHFTS) *> SR contains the real parts and SI contains the imaginary *> parts of the NSHFTS shifts of origin that define the *> multi-shift QR sweep. On output SR and SI may be *> reordered. *> \endverbatim *> *> \param[in,out] H *> \verbatim *> H is DOUBLE PRECISION array of size (LDH,N) *> On input H contains a Hessenberg matrix. On output a *> multi-shift QR sweep with shifts SR(J)+i*SI(J) is applied *> to the isolated diagonal block in rows and columns KTOP *> through KBOT. 
*> \endverbatim *> *> \param[in] LDH *> \verbatim *> LDH is integer scalar *> LDH is the leading dimension of H just as declared in the *> calling procedure. LDH.GE.MAX(1,N). *> \endverbatim *> *> \param[in] ILOZ *> \verbatim *> ILOZ is INTEGER *> \endverbatim *> *> \param[in] IHIZ *> \verbatim *> IHIZ is INTEGER *> Specify the rows of Z to which transformations must be *> applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N *> \endverbatim *> *> \param[in,out] Z *> \verbatim *> Z is DOUBLE PRECISION array of size (LDZ,IHI) *> If WANTZ = .TRUE., then the QR Sweep orthogonal *> similarity transformation is accumulated into *> Z(ILOZ:IHIZ,ILO:IHI) from the right. *> If WANTZ = .FALSE., then Z is unreferenced. *> \endverbatim *> *> \param[in] LDZ *> \verbatim *> LDZ is integer scalar *> LDA is the leading dimension of Z just as declared in *> the calling procedure. LDZ.GE.N. *> \endverbatim *> *> \param[out] V *> \verbatim *> V is DOUBLE PRECISION array of size (LDV,NSHFTS/2) *> \endverbatim *> *> \param[in] LDV *> \verbatim *> LDV is integer scalar *> LDV is the leading dimension of V as declared in the *> calling procedure. LDV.GE.3. *> \endverbatim *> *> \param[out] U *> \verbatim *> U is DOUBLE PRECISION array of size *> (LDU,3*NSHFTS-3) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is integer scalar *> LDU is the leading dimension of U just as declared in the *> in the calling subroutine. LDU.GE.3*NSHFTS-3. *> \endverbatim *> *> \param[in] NH *> \verbatim *> NH is integer scalar *> NH is the number of columns in array WH available for *> workspace. NH.GE.1. *> \endverbatim *> *> \param[out] WH *> \verbatim *> WH is DOUBLE PRECISION array of size (LDWH,NH) *> \endverbatim *> *> \param[in] LDWH *> \verbatim *> LDWH is integer scalar *> Leading dimension of WH just as declared in the *> calling procedure. LDWH.GE.3*NSHFTS-3. *> \endverbatim *> *> \param[in] NV *> \verbatim *> NV is integer scalar *> NV is the number of rows in WV agailable for workspace. *> NV.GE.1. *> \endverbatim *> *> \param[out] WV *> \verbatim *> WV is DOUBLE PRECISION array of size *> (LDWV,3*NSHFTS-3) *> \endverbatim *> *> \param[in] LDWV *> \verbatim *> LDWV is integer scalar *> LDWV is the leading dimension of WV as declared in the *> in the calling subroutine. LDWV.GE.NV. *> \endverbatim * * Authors: * ======== * *> \author Univ. of Tennessee *> \author Univ. of California Berkeley *> \author Univ. of Colorado Denver *> \author NAG Ltd. * *> \date September 2012 * *> \ingroup doubleOTHERauxiliary * *> \par Contributors: * ================== *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA * *> \par References: * ================ *> *> K. Braman, R. Byers and R. Mathias, The Multi-Shift QR *> Algorithm Part I: Maintaining Well Focused Shifts, and Level 3 *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> * ===================================================================== SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, $ SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, $ LDU, NV, WV, LDWV, NH, WH, LDWH ) * * -- LAPACK auxiliary routine (version 3.4.2) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * September 2012 * * .. Scalar Arguments .. INTEGER IHIZ, ILOZ, KACC22, KBOT, KTOP, LDH, LDU, LDV, $ LDWH, LDWV, LDZ, N, NH, NSHFTS, NV LOGICAL WANTT, WANTZ * .. * .. Array Arguments .. 
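*     ( This routine follows the reference LAPACK 3.4.2 DLAQR5, with
*     the reflection-application loops below unrolled over pairs of
*     rows via the local temporaries declared under "temp scalars";
*     the update applied to each individual entry of H, U and Z is
*     the same expression as in the reference routine. )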
DOUBLE PRECISION H( LDH, * ), SI( * ), SR( * ), U( LDU, * ), $ V( LDV, * ), WH( LDWH, * ), WV( LDWV, * ), $ Z( LDZ, * ) * .. * * ================================================================ * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0d0, ONE = 1.0d0 ) * .. * .. Local Scalars .. DOUBLE PRECISION ALPHA, BETA, H11, H12, H21, H22, REFSUM, $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, $ ULP INTEGER I, I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, $ NS, NU LOGICAL ACCUM, BLK22, BMP22 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH EXTERNAL DLAMCH * .. * .. Intrinsic Functions .. * INTRINSIC ABS, DBLE, MAX, MIN, MOD * .. * .. Local Arrays .. DOUBLE PRECISION VT( 3 ) * temp scalars DOUBLE PRECISION tempv1, tempv2, tempv3, $ tempv4, tempv5, tempv6, $ temph1, temph2, temph3, $ temph4, temph5, temph6, $ tempz1, tempz2, tempz3, $ tempz4, tempz5, tempz6, $ tempu1, tempu2, tempu3, $ tempu4, tempu5, tempu6, $ REFSU1 INTEGER JBEGIN, M1 * .. * .. External Subroutines .. EXTERNAL DGEMM, DLABAD, DLACPY, DLAQR1, DLARFG, DLASET, $ DTRMM * .. * .. Executable Statements .. * * ==== If there are no shifts, then there is nothing to do. ==== * IF( NSHFTS.LT.2 ) $ RETURN * * ==== If the active block is empty or 1-by-1, then there * . is nothing to do. ==== * IF( KTOP.GE.KBOT ) $ RETURN * * ==== Shuffle shifts into pairs of real shifts and pairs * . of complex conjugate shifts assuming complex * . conjugate shifts are already adjacent to one * . another. ==== * DO 10 I = 1, NSHFTS - 2, 2 IF( SI( I ).NE.-SI( I+1 ) ) THEN * SWAP = SR( I ) SR( I ) = SR( I+1 ) SR( I+1 ) = SR( I+2 ) SR( I+2 ) = SWAP * SWAP = SI( I ) SI( I ) = SI( I+1 ) SI( I+1 ) = SI( I+2 ) SI( I+2 ) = SWAP END IF 10 CONTINUE * * ==== NSHFTS is supposed to be even, but if it is odd, * . then simply reduce it by one. The shuffle above * . ensures that the dropped shift is real and that * . the remaining shifts are paired. ==== * NS = NSHFTS - MOD( NSHFTS, 2 ) * * ==== Machine constants for deflation ==== * SAFMIN = DLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE / SAFMIN CALL DLABAD( SAFMIN, SAFMAX ) ULP = DLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( DBLE( N ) / ULP ) * * ==== Use accumulated reflections to update far-from-diagonal * . entries ? ==== * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * * ==== If so, exploit the 2-by-2 block structure? ==== * BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) * * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) $ H( KTOP+2, KTOP ) = ZERO * * ==== NBMPS = number of 2-shift bulges in the chain ==== * NBMPS = NS / 2 * * ==== KDU = width of slab ==== * KDU = 6*NBMPS - 3 * * ==== Create and chase chains of NBMPS bulges ==== * DO 220 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 NDCOL = INCOL + KDU IF( ACCUM ) $ CALL DLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge * . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). The * . following loop chases a 3*NBMPS column long chain of * . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) 
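*     .    For example, with NBMPS = 2 bulges the slab is
*     .    KDU = 6*2 - 3 = 9 columns wide, each pass of the outer
*     .    DO 220 loop advances INCOL by 3*2 - 2 = 4 columns, and
*     .    the loop below moves the whole chain of bulges those
*     .    4 columns to the right, one column per value of KRCOL.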
==== * DO 150 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small * . 2-by-2 bulge, if there is room. The inactive bulges * . (if any) must wait until the active bulges have moved * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) M22 = MBOT + 1 BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * DO 20 M = MTOP, MBOT K = KRCOL + 3*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL DLAQR1( 3, H( KTOP, KTOP ), LDH, SR( 2*M-1 ), $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), $ V( 1, M ) ) ALPHA = V( 1, M ) CALL DLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL DLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) * * ==== A Bulge may collapse because of vigilant * . deflation or destructive underflow. In the * . underflow case, try the two-small-subdiagonals * . trick to try to reinflate the bulge. ==== * IF( H( K+3, K ).NE.ZERO .OR. H( K+3, K+1 ).NE. $ ZERO .OR. H( K+3, K+2 ).EQ.ZERO ) THEN * * ==== Typical case: not collapsed (yet). ==== * H( K+1, K ) = BETA H( K+2, K ) = ZERO H( K+3, K ) = ZERO ELSE * * ==== Atypical case: collapsed. Attempt to * . reintroduce ignoring H(K+1,K) and H(K+2,K). * . If the fill resulting from the new * . reflector is too large, then abandon it. * . Otherwise, use the new one. ==== * CALL DLAQR1( 3, H( K+1, K+1 ), LDH, SR( 2*M-1 ), $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), $ VT ) ALPHA = VT( 1 ) CALL DLARFG( 3, ALPHA, VT( 2 ), 1, VT( 1 ) ) REFSUM = VT( 1 )*( H( K+1, K )+VT( 2 )* $ H( K+2, K ) ) * IF( ABS( H( K+2, K )-REFSUM*VT( 2 ) )+ $ ABS( REFSUM*VT( 3 ) ).GT.ULP* $ ( ABS( H( K, K ) )+ABS( H( K+1, $ K+1 ) )+ABS( H( K+2, K+2 ) ) ) ) THEN * * ==== Starting a new bulge here would * . create non-negligible fill. Use * . the old one with trepidation. ==== * H( K+1, K ) = BETA H( K+2, K ) = ZERO H( K+3, K ) = ZERO ELSE * * ==== Stating a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== * H( K+1, K ) = H( K+1, K ) - REFSUM H( K+2, K ) = ZERO H( K+3, K ) = ZERO V( 1, M ) = VT( 1 ) V( 2, M ) = VT( 2 ) V( 3, M ) = VT( 3 ) END IF END IF END IF 20 CONTINUE * * ==== Generate a 2-by-2 reflection, if needed. 
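*     .    ( BMP22 is .TRUE. when the next bulge in the chain has
*     .    reached rows KBOT-1 and KBOT, where a full 3-by-3 bulge
*     .    no longer fits, so a 2-by-2 reflector is generated for
*     .    it instead. )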
==== * K = KRCOL + 3*( M22-1 ) IF( BMP22 ) THEN IF( K.EQ.KTOP-1 ) THEN CALL DLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), $ V( 1, M22 ) ) BETA = V( 1, M22 ) CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) ELSE BETA = H( K+1, K ) V( 2, M22 ) = H( K+2, K ) CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) H( K+1, K ) = BETA H( K+2, K ) = ZERO END IF END IF * * ==== Multiply H by reflections from the left ==== * IF( ACCUM ) THEN JBOT = MIN( NDCOL, KBOT ) ELSE IF( WANTT ) THEN JBOT = N ELSE JBOT = KBOT END IF DO 40 J = MAX( KTOP, KRCOL ), JBOT MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) DO 30 M = MTOP, MEND M1 = M -1 tempv1 = V( 1, M ) K = KRCOL + 2*M1 tempv2 = V( 2, M ) K = K + M1 tempv3 = V( 3, M ) temph1 = H( K+1, J ) temph2 = H( K+2, J ) temph3 = H( K+3, J ) REFSUM = tempv1*( temph1+tempv2* $ temph2+tempv3*temph3 ) H( K+1, J ) = temph1 - REFSUM H( K+2, J ) = temph2 - REFSUM*tempv2 H( K+3, J ) = temph3 - REFSUM*tempv3 30 CONTINUE 40 CONTINUE IF( BMP22 ) THEN K = KRCOL + 3*( M22-1 ) DO 50 J = MAX( K+1, KTOP ), JBOT REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* $ H( K+2, J ) ) H( K+1, J ) = H( K+1, J ) - REFSUM H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) 50 CONTINUE END IF * * ==== Multiply H by reflections from the right. * . Delay filling in the last row until the * . vigilant deflation check is complete. ==== * IF( ACCUM ) THEN JTOP = MAX( KTOP, INCOL ) ELSE IF( WANTT ) THEN JTOP = 1 ELSE JTOP = KTOP END IF DO 90 M = MTOP, MBOT IF( V( 1, M ).NE.ZERO ) THEN tempv1 = V( 1, M ) tempv2 = V( 2, M ) tempv3 = V( 3, M ) K = KRCOL + 3*( M-1 ) JBEGIN = JTOP IF ( MOD( MIN( KBOT, K+3 )-JTOP+1, 2).GT.0 ) THEN J = JBEGIN temph1 = H( J, K+1 ) temph2 = H( J, K+2 ) temph3 = H( J, K+3 ) REFSUM = tempv1* ( temph1+tempv2*temph2+ $ tempv3*temph3 ) H( J, K+1 ) = temph1 - REFSUM H( J, K+2 ) = temph2 - REFSUM*tempv2 H( J, K+3 ) = temph3 - REFSUM*tempv3 JBEGIN = JBEGIN + 1 END IF DO 60 J = JBEGIN, MIN( KBOT, K+3 ), 2 temph1 = H( J, K+1 ) temph4 = H( J+1, K+1 ) temph2 = H( J, K+2 ) temph5 = H( J+1, K+2 ) temph3 = H( J, K+3 ) temph6 = H( J+1, K+3 ) REFSUM = tempv1* ( temph1+tempv2*temph2+ $ tempv3*temph3 ) REFSU1 = tempv1* ( temph4+tempv2*temph5+ $ tempv3*temph6 ) H( J, K+1 ) = temph1 - REFSUM H( J+1, K+1 ) = temph4 - REFSU1 H( J, K+2 ) = temph2 - REFSUM*tempv2 H( J+1, K+2 ) = temph5 - REFSU1*tempv2 H( J, K+3 ) = temph3 - REFSUM*tempv3 H( J+1, K+3 ) = temph6 - REFSU1*tempv3 60 CONTINUE * IF( ACCUM ) THEN * * ==== Accumulate U. (If necessary, update Z later * . with with an efficient matrix-matrix * . multiply.) ==== * KMS = K - INCOL JBEGIN=MAX( 1, KTOP-INCOL ) IF ( MOD(KDU-JBEGIN+1,2).GT.0 ) THEN J = JBEGIN tempu1 = U( J, KMS+1 ) tempu2 = U( J, KMS+2 ) tempu3 = U( J, KMS+3 ) REFSUM = tempv1* ( tempu1+tempv2*tempu2+ $ tempv3*tempu3 ) U( J, KMS+1 ) = tempu1 - REFSUM U( J, KMS+2 ) = tempu2 - REFSUM*tempv2 U( J, KMS+3 ) = tempu3 - REFSUM*tempv3 JBEGIN = JBEGIN + 1 END IF DO 70 J = JBEGIN, KDU , 2 tempu1 = U( J, KMS+1 ) tempu4 = U( J+1, KMS+1 ) tempu2 = U( J, KMS+2 ) tempu5 = U( J+1, KMS+2 ) tempu3 = U( J, KMS+3 ) tempu6 = U( J+1, KMS+3 ) REFSUM = tempv1* ( tempu1+tempv2*tempu2+ $ tempv3*tempu3 ) REFSU1 = tempv1* ( tempu4+tempv2*tempu5+ $ tempv3*tempu6 ) U( J, KMS+1 ) = tempu1 - REFSUM U( J+1, KMS+1 ) = tempu4 - REFSU1 U( J, KMS+2 ) = tempu2 - REFSUM*tempv2 U( J+1, KMS+2 ) = tempu5 - REFSU1*tempv2 U( J, KMS+3 ) = tempu3 - REFSUM*tempv3 U( J+1, KMS+3 ) = tempu6 - REFSU1*tempv3 70 CONTINUE ELSE IF( WANTZ ) THEN * * ==== U is not accumulated, so update Z * . 
now by multiplying by reflections * . from the right. ==== * JBEGIN = ILOZ IF ( MOD(IHIZ-ILOZ+1,2).GT.0 ) THEN J = JBEGIN tempz1 = Z( J, K+1 ) tempz2 = Z( J, K+2 ) tempz3 = Z( J, K+3 ) REFSUM = tempv1* ( tempz1+tempv2*tempz2+ $ tempv3*tempz3 ) Z( J, K+1 ) = tempz1 - REFSUM Z( J, K+2 ) = tempz2 - REFSUM*tempv2 Z( J, K+3 ) = tempz3 - REFSUM*tempv3 JBEGIN = JBEGIN + 1 END IF DO 80 J = JBEGIN, IHIZ, 2 tempz1 = Z( J, K+1 ) tempz4 = Z( J+1, K+1 ) tempz2 = Z( J, K+2 ) tempz5 = Z( J+1, K+2 ) tempz3 = Z( J, K+3 ) tempz6 = Z( J+1, K+3 ) REFSUM = tempv1* ( tempz1+tempv2*tempz2+ $ tempv3*tempz3 ) REFSU1 = tempv1* ( tempz4+tempv2*tempz5+ $ tempv3*tempz6 ) Z( J, K+1 ) = tempz1 - REFSUM Z( J, K+2 ) = tempz2 - REFSUM*tempv2 Z( J, K+3 ) = tempz3 - REFSUM*tempv3 Z( J+1, K+1 ) = tempz4 - REFSU1 Z( J+1, K+2 ) = tempz5 - REFSU1*tempv2 Z( J+1, K+3 ) = tempz6 - REFSU1*tempv3 80 CONTINUE END IF END IF 90 CONTINUE * * ==== Special case: 2-by-2 reflection (if needed) ==== * K = KRCOL + 3*( M22-1 ) IF( BMP22 ) THEN IF ( V( 1, M22 ).NE.ZERO ) THEN DO 100 J = JTOP, MIN( KBOT, K+3 ) REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* $ H( J, K+2 ) ) H( J, K+1 ) = H( J, K+1 ) - REFSUM H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) 100 CONTINUE * IF( ACCUM ) THEN KMS = K - INCOL DO 110 J = MAX( 1, KTOP-INCOL ), KDU REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ $ V( 2, M22 )*U( J, KMS+2 ) ) U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM U( J, KMS+2 ) = U( J, KMS+2 ) - $ REFSUM*V( 2, M22 ) 110 CONTINUE ELSE IF( WANTZ ) THEN DO 120 J = ILOZ, IHIZ REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* $ Z( J, K+2 ) ) Z( J, K+1 ) = Z( J, K+1 ) - REFSUM Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) 120 CONTINUE END IF END IF END IF * * ==== Vigilant deflation check ==== * MSTART = MTOP IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) $ MSTART = MSTART + 1 MEND = MBOT IF( BMP22 ) $ MEND = MEND + 1 IF( KRCOL.EQ.KBOT-2 ) $ MEND = MEND + 1 DO 130 M = MSTART, MEND K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals * . criterion and the Ahues & Tisseur (LAWN 122, 1997) * . criteria both be satisfied. The latter improves * . accuracy in some examples. Falling back on an * . alternate convergence criterion when TST1 or TST2 * . is zero (as done here) is traditional but probably * . unnecessary. ==== * IF( H( K+1, K ).NE.ZERO ) THEN TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) IF( TST1.EQ.ZERO ) THEN IF( K.GE.KTOP+1 ) $ TST1 = TST1 + ABS( H( K, K-1 ) ) IF( K.GE.KTOP+2 ) $ TST1 = TST1 + ABS( H( K, K-2 ) ) IF( K.GE.KTOP+3 ) $ TST1 = TST1 + ABS( H( K, K-3 ) ) IF( K.LE.KBOT-2 ) $ TST1 = TST1 + ABS( H( K+2, K+1 ) ) IF( K.LE.KBOT-3 ) $ TST1 = TST1 + ABS( H( K+3, K+1 ) ) IF( K.LE.KBOT-4 ) $ TST1 = TST1 + ABS( H( K+4, K+1 ) ) END IF IF( ABS( H( K+1, K ) ).LE.MAX( SMLNUM, ULP*TST1 ) ) $ THEN H12 = MAX( ABS( H( K+1, K ) ), ABS( H( K, K+1 ) ) ) H21 = MIN( ABS( H( K+1, K ) ), ABS( H( K, K+1 ) ) ) H11 = MAX( ABS( H( K+1, K+1 ) ), $ ABS( H( K, K )-H( K+1, K+1 ) ) ) H22 = MIN( ABS( H( K+1, K+1 ) ), $ ABS( H( K, K )-H( K+1, K+1 ) ) ) SCL = H11 + H12 TST2 = H22*( H11 / SCL ) * IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO END IF END IF 130 CONTINUE * * ==== Fill in the last row of each bulge. 
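*     .    ( Row K+4 was skipped by the right-multiplication loop
*     .    above; its entries in columns K+1 and K+2 were zero
*     .    before the update, so the general reflection update
*     .    reduces to the three assignments in the loop below. )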
==== * MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) DO 140 M = MTOP, MEND K = KRCOL + 3*( M-1 ) REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) H( K+4, K+1 ) = -REFSUM H( K+4, K+2 ) = -REFSUM*V( 2, M ) H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*V( 3, M ) 140 CONTINUE * * ==== End of near-the-diagonal bulge chase. ==== * 150 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as * . well. ==== * IF( ACCUM ) THEN IF( WANTT ) THEN JTOP = 1 JBOT = N ELSE JTOP = KTOP JBOT = KBOT END IF IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN * * ==== Updates not exploiting the 2-by-2 block * . structure of U. K1 and NU keep track of * . the location and size of U in the special * . cases of introducing bulges and chasing * . bulges off the bottom. In these special * . cases and in case the number of shifts * . is NS = 2, there is no 2-by-2 block * . structure to exploit. ==== * K1 = MAX( 1, KTOP-INCOL ) NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 * * ==== Horizontal Multiply ==== * DO 160 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH JLEN = MIN( NH, JBOT-JCOL+1 ) CALL DGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, $ LDWH ) CALL DLACPY( 'ALL', NU, JLEN, WH, LDWH, $ H( INCOL+K1, JCOL ), LDH ) 160 CONTINUE * * ==== Vertical multiply ==== * DO 170 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, $ H( JROW, INCOL+K1 ), LDH ) 170 CONTINUE * * ==== Z multiply (also vertical) ==== * IF( WANTZ ) THEN DO 180 JROW = ILOZ, IHIZ, NV JLEN = MIN( NV, IHIZ-JROW+1 ) CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, $ Z( JROW, INCOL+K1 ), LDZ ) 180 CONTINUE END IF ELSE * * ==== Updates exploiting U's 2-by-2 block structure. * . (I2, I4, J2, J4 are the last rows and columns * . of the blocks.) ==== * I2 = ( KDU+1 ) / 2 I4 = KDU J2 = I4 - I2 J4 = KDU * * ==== KZS and KNZ deal with the band of zeros * . along the diagonal of one of the triangular * . blocks. ==== * KZS = ( J4-J2 ) - ( NS+1 ) KNZ = NS + 1 * * ==== Horizontal multiply ==== * DO 190 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH JLEN = MIN( NH, JBOT-JCOL+1 ) * * ==== Copy bottom of H to top+KZS of scratch ==== * (The first KZS rows get multiplied by zero.) 
==== * CALL DLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), $ LDH, WH( KZS+1, 1 ), LDWH ) * * ==== Multiply by U21**T ==== * CALL DLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) CALL DTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), $ LDWH ) * * ==== Multiply top of H by U11**T ==== * CALL DGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) * * ==== Copy top of H to bottom of WH ==== * CALL DLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, $ WH( I2+1, 1 ), LDWH ) * * ==== Multiply by U21**T ==== * CALL DTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) * * ==== Multiply by U22 ==== * CALL DGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, $ U( J2+1, I2+1 ), LDU, $ H( INCOL+1+J2, JCOL ), LDH, ONE, $ WH( I2+1, 1 ), LDWH ) * * ==== Copy it back ==== * CALL DLACPY( 'ALL', KDU, JLEN, WH, LDWH, $ H( INCOL+1, JCOL ), LDH ) 190 CONTINUE * * ==== Vertical multiply ==== * DO 200 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) * * ==== Copy right of H to scratch (the first KZS * . columns get multiplied by zero) ==== * CALL DLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), $ LDH, WV( 1, 1+KZS ), LDWV ) * * ==== Multiply by U21 ==== * CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), $ LDWV ) * * ==== Multiply by U11 ==== * CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE, $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, $ LDWV ) * * ==== Copy left of H to right of scratch ==== * CALL DLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, $ WV( 1, 1+I2 ), LDWV ) * * ==== Multiply by U21 ==== * CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) * * ==== Multiply by U22 ==== * CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, $ H( JROW, INCOL+1+J2 ), LDH, $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), $ LDWV ) * * ==== Copy it back ==== * CALL DLACPY( 'ALL', JLEN, KDU, WV, LDWV, $ H( JROW, INCOL+1 ), LDH ) 200 CONTINUE * * ==== Multiply Z (also vertical) ==== * IF( WANTZ ) THEN DO 210 JROW = ILOZ, IHIZ, NV JLEN = MIN( NV, IHIZ-JROW+1 ) * * ==== Copy right of Z to left of scratch (first * . KZS columns get multiplied by zero) ==== * CALL DLACPY( 'ALL', JLEN, KNZ, $ Z( JROW, INCOL+1+J2 ), LDZ, $ WV( 1, 1+KZS ), LDWV ) * * ==== Multiply by U12 ==== * CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, $ LDWV ) CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), $ LDWV ) * * ==== Multiply by U11 ==== * CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE, $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, $ WV, LDWV ) * * ==== Copy left of Z to right of scratch ==== * CALL DLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), $ LDZ, WV( 1, 1+I2 ), LDWV ) * * ==== Multiply by U21 ==== * CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), $ LDWV ) * * ==== Multiply by U22 ==== * CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, $ Z( JROW, INCOL+1+J2 ), LDZ, $ U( J2+1, I2+1 ), LDU, ONE, $ WV( 1, 1+I2 ), LDWV ) * * ==== Copy the result back to Z ==== * CALL DLACPY( 'ALL', JLEN, KDU, WV, LDWV, $ Z( JROW, INCOL+1 ), LDZ ) 210 CONTINUE END IF END IF END IF 220 CONTINUE * * ==== End of DLAQR5 ==== * END OpenBLAS-0.2.20/interface/lapack/gesv.c000066400000000000000000000113461313527062700174540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. 
*/ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef COMPLEX #ifdef XDOUBLE #define ERROR_NAME "QGESV " #elif defined(DOUBLE) #define ERROR_NAME "DGESV " #else #define ERROR_NAME "SGESV " #endif #else #ifdef XDOUBLE #define ERROR_NAME "XGESV " #elif defined(DOUBLE) #define ERROR_NAME "ZGESV " #else #define ERROR_NAME "CGESV " #endif #endif int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv, FLOAT *b, blasint *ldB, blasint *Info){ blas_arg_t args; blasint info; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; PRINT_DEBUG_NAME; args.m = *N; args.n = *NRHS; args.a = (void *)a; args.lda = *ldA; args.b = (void *)b; args.ldb = *ldB; args.c = (void *)ipiv; info = 0; if (args.ldb < MAX(1,args.m)) info = 7; if (args.lda < MAX(1,args.m)) info = 4; if (args.n < 0) info = 2; if (args.m < 0) info = 1; if (info) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); *Info = - info; return 0; } args.alpha = NULL; args.beta = NULL; *Info = 0; if (args.m == 0 || args.n == 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif #ifdef SMP args.common = NULL; args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif args.n = *N; info = GETRF_SINGLE(&args, NULL, NULL, sa, sb, 0); if (info == 0){ args.n = *NRHS; GETRS_N_SINGLE(&args, NULL, NULL, sa, sb, 0); } #ifdef SMP } else { args.n = *N; info = GETRF_PARALLEL(&args, NULL, NULL, sa, sb, 0); if (info == 0){ args.n = *NRHS; GETRS_N_PARALLEL(&args, NULL, NULL, sa, sb, 0); } } #endif #ifndef PPC440 blas_memory_free(buffer); 
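  /* Usage sketch (illustrative only, not part of the original file):
     in a double precision build this translation unit becomes the
     Fortran-callable dgesv_, so an application passes every argument
     by reference, e.g.

         blasint n = 2, nrhs = 1, lda = 2, ldb = 2, ipiv[2], info;
         double a[4] = { 4.0, 1.0, 2.0, 3.0 };    (column-major A)
         double b[2] = { 1.0, 2.0 };
         dgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, &info);

     on return info == 0 means the system A*X = B was solved, b holds
     the solution, a holds the L and U factors of P*A, and ipiv holds
     the pivot indices. */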
#endif *Info = info; FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, *N * *N, 2. / 3. * *N * *N * *N + *N * *N); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/getf2.c000066400000000000000000000077271313527062700175270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QGETF2" #elif defined(DOUBLE) #define ERROR_NAME "DGETF2" #else #define ERROR_NAME "SGETF2" #endif int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint *Info){ blas_arg_t args; blasint info; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; PRINT_DEBUG_NAME; args.m = *M; args.n = *N; args.a = (void *)a; args.lda = *ldA; args.c = (void *)ipiv; info = 0; if (args.lda < MAX(1,args.m)) info = 4; if (args.n < 0) info = 2; if (args.m < 0) info = 1; if (info) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); *Info = - info; return 0; } *Info = 0; if (args.m == 0 || args.n == 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif info = GETF2(&args, NULL, NULL, sa, sb, 0); *Info = info; #ifndef PPC440 blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2. / 3. 
* args.m * args.n * args.n); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/getrf.c000066400000000000000000000102251313527062700176120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QGETRF" #elif defined(DOUBLE) #define ERROR_NAME "DGETRF" #else #define ERROR_NAME "SGETRF" #endif int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint *Info){ blas_arg_t args; blasint info; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; PRINT_DEBUG_NAME; args.m = *M; args.n = *N; args.a = (void *)a; args.lda = *ldA; args.c = (void *)ipiv; info = 0; if (args.lda < MAX(1,args.m)) info = 4; if (args.n < 0) info = 2; if (args.m < 0) info = 1; if (info) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); *Info = - info; return 0; } *Info = 0; if (args.m == 0 || args.n == 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif #ifdef SMP args.common = NULL; args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif *Info = GETRF_SINGLE(&args, NULL, NULL, sa, sb, 0); #ifdef SMP } else { *Info = GETRF_PARALLEL(&args, NULL, NULL, sa, sb, 0); } #endif #ifndef PPC440 blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2. / 3. 
* args.m * args.n * args.n); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/getrs.c000066400000000000000000000115161313527062700176330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QGETRS" #elif defined(DOUBLE) #define ERROR_NAME "DGETRS" #else #define ERROR_NAME "SGETRS" #endif static blasint (*getrs_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { GETRS_N_SINGLE, GETRS_T_SINGLE, }; #ifdef SMP static blasint (*getrs_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { GETRS_N_PARALLEL, GETRS_T_PARALLEL, }; #endif int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv, FLOAT *b, blasint *ldB, blasint *Info){ char trans_arg = *TRANS; blas_arg_t args; blasint info; int trans; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; PRINT_DEBUG_NAME; args.m = *N; args.n = *NRHS; args.a = (void *)a; args.lda = *ldA; args.b = (void *)b; args.ldb = *ldB; args.c = (void *)ipiv; info = 0; TOUPPER(trans_arg); trans = -1; if (trans_arg == 'N') trans = 0; if (trans_arg == 'T') trans = 1; if (trans_arg == 'R') trans = 0; if (trans_arg == 'C') trans = 1; if (args.ldb < MAX(1, args.m)) info = 8; if (args.lda < MAX(1, args.m)) info = 5; if (args.n < 0) info = 3; if (args.m < 0) info = 2; if (trans < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return 0; } args.alpha = NULL; args.beta = NULL; *Info = info; if (args.m == 0 || args.n == 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif #ifdef SMP args.common = NULL; args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif (getrs_single[trans])(&args, NULL, NULL, sa, sb, 0); #ifdef SMP } else { (getrs_parallel[trans])(&args, NULL, NULL, sa, sb, 0); } #endif #ifndef PPC440 blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2 * args.m * args.m * args.n); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/larf.c.obsolete000066400000000000000000000077541313527062700212570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif static int (*larf[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { LARF_L, LARF_R, }; int NAME(char *SIDE, blasint *M, blasint *N, FLOAT *v, blasint *incV, FLOAT *tau, FLOAT *c, blasint *ldC, FLOAT *work){ blas_arg_t args; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; char side_arg = *SIDE; int side; PRINT_DEBUG_NAME; TOUPPER(side_arg); args.m = *M; args.n = *N; args.a = (void *)v; args.lda = *incV; args.c = (void *)c; args.ldc = *ldC; args.alpha = (void *)tau; side = -1; if (side_arg == 'L') side = 0; if (side_arg == 'R') side = 1; if (args.m == 0 || args.n == 0) return 0; #ifndef COMPLEX if (*tau == ZERO) return 0; #else if ((*(tau + 0) == ZERO) && (*(tau + 1) == ZERO)) return 0; #endif IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif larf[side](&args, NULL, NULL, sa, sb, 0); #ifndef PPC440 blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2. / 3. * args.m * args.n * args.n); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/laswp.c000066400000000000000000000076271313527062700176450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif static int (*laswp[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, blasint *, BLASLONG) = { #ifdef XDOUBLE qlaswp_plus, qlaswp_minus, #elif defined(DOUBLE) dlaswp_plus, dlaswp_minus, #else slaswp_plus, slaswp_minus, #endif }; int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *ipiv, blasint *INCX){ blasint n = *N; blasint lda = *LDA; blasint k1 = *K1; blasint k2 = *K2; blasint incx = *INCX; int flag; #ifdef SMP int mode, nthreads; FLOAT dummyalpha[2] = {ZERO, ZERO}; #endif PRINT_DEBUG_NAME; if (incx == 0 || n <= 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); flag = (incx < 0); #ifdef SMP nthreads = num_cpu_avail(1); if (nthreads == 1) { #endif (laswp[flag])(n, k1, k2, ZERO, a, lda, NULL, 0, ipiv, incx); #ifdef SMP } else { #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, laswp[flag], nthreads); } #endif FUNCTION_PROFILE_END(COMPSIZE, n * (k2 - k1), 0); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/lauu2.c000066400000000000000000000104761313527062700175430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QLAUU2" #elif defined(DOUBLE) #define ERROR_NAME "DLAUU2" #else #define ERROR_NAME "SLAUU2" #endif static blasint (*lauu2[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { #ifdef XDOUBLE qlauu2_U, qlauu2_L, #elif defined(DOUBLE) dlauu2_U, dlauu2_L, #else slauu2_U, slauu2_L, #endif }; int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ blas_arg_t args; blasint uplo_arg = *UPLO; blasint uplo; blasint info; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; PRINT_DEBUG_NAME; args.n = *N; args.a = (void *)a; args.lda = *ldA; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (args.lda < MAX(1,args.n)) info = 4; if (args.n < 0) info = 2; if (uplo < 0) info = 1; if (info) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); *Info = - info; return 0; } *Info = 0; if (args.n <= 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif info = (lauu2[uplo])(&args, NULL, NULL, sa, sb, 0); *Info = info; #ifndef PPC440 blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(1, .5 * args.n * args.n, args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + 1./6. * args.n * (args.n * args.n - 1)); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/lauum.c000066400000000000000000000111351313527062700176270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QLAUUM" #elif defined(DOUBLE) #define ERROR_NAME "DLAUUM" #else #define ERROR_NAME "SLAUUM" #endif static blasint (*lauum_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { LAUUM_U_SINGLE, LAUUM_L_SINGLE, }; #ifdef SMP static blasint (*lauum_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { LAUUM_U_PARALLEL, LAUUM_L_PARALLEL, }; #endif int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ blas_arg_t args; blasint uplo_arg = *UPLO; blasint uplo; blasint info; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; PRINT_DEBUG_NAME; args.n = *N; args.a = (void *)a; args.lda = *ldA; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (args.lda < MAX(1,args.n)) info = 4; if (args.n < 0) info = 2; if (uplo < 0) info = 1; if (info) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); *Info = - info; return 0; } *Info = 0; if (args.n == 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif #ifdef SMP args.common = NULL; args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif *Info = (lauum_single[uplo])(&args, NULL, NULL, sa, sb, 0); #ifdef SMP } else { *Info = (lauum_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); } #endif #ifndef PPC440 blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(1, .5 * args.n * args.n, args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + 1./6. * args.n * (args.n * args.n - 1)); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/potf2.c000066400000000000000000000104761313527062700175450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QPOTF2" #elif defined(DOUBLE) #define ERROR_NAME "DPOTF2" #else #define ERROR_NAME "SPOTF2" #endif static blasint (*potf2[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { #ifdef XDOUBLE qpotf2_U, qpotf2_L, #elif defined(DOUBLE) dpotf2_U, dpotf2_L, #else spotf2_U, spotf2_L, #endif }; int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ blas_arg_t args; blasint uplo_arg = *UPLO; blasint uplo; blasint info; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; PRINT_DEBUG_NAME; args.n = *N; args.a = (void *)a; args.lda = *ldA; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (args.lda < MAX(1,args.n)) info = 4; if (args.n < 0) info = 2; if (uplo < 0) info = 1; if (info) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); *Info = - info; return 0; } *Info = 0; if (args.n <= 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif info = (potf2[uplo])(&args, NULL, NULL, sa, sb, 0); *Info = info; #ifndef PPC440 blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(1, .5 * args.n * args.n, args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + 1./6. * args.n * (args.n * args.n - 1)); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/potrf.c000066400000000000000000000111351313527062700176360ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QPOTRF" #elif defined(DOUBLE) #define ERROR_NAME "DPOTRF" #else #define ERROR_NAME "SPOTRF" #endif static blasint (*potrf_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { POTRF_U_SINGLE, POTRF_L_SINGLE, }; #ifdef SMP static blasint (*potrf_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { POTRF_U_PARALLEL, POTRF_L_PARALLEL, }; #endif int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ blas_arg_t args; blasint uplo_arg = *UPLO; blasint uplo; blasint info; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; PRINT_DEBUG_NAME; args.n = *N; args.a = (void *)a; args.lda = *ldA; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (args.lda < MAX(1,args.n)) info = 4; if (args.n < 0) info = 2; if (uplo < 0) info = 1; if (info) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); *Info = - info; return 0; } *Info = 0; if (args.n == 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif #ifdef SMP args.common = NULL; args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif *Info = (potrf_single[uplo])(&args, NULL, NULL, sa, sb, 0); #ifdef SMP } else { *Info = (potrf_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); } #endif #ifndef PPC440 blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(1, .5 * args.n * args.n, args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + 1./6. 
* args.n * (args.n * args.n - 1)); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/potri.c000066400000000000000000000121631313527062700176430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QPOTRI" #elif defined(DOUBLE) #define ERROR_NAME "DPOTRI" #else #define ERROR_NAME "SPOTRI" #endif static blasint (*trtri_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ TRTRI_UN_SINGLE, TRTRI_LN_SINGLE, }; static blasint (*lauum_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ LAUUM_U_SINGLE, LAUUM_L_SINGLE, }; #ifdef SMP static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ TRTRI_UN_PARALLEL, TRTRI_LN_PARALLEL, }; static blasint (*lauum_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ LAUUM_U_PARALLEL, LAUUM_L_PARALLEL, }; #endif int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ blas_arg_t args; blasint uplo_arg = *UPLO; blasint uplo; blasint info; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; PRINT_DEBUG_NAME; args.n = *N; args.a = (void *)a; args.lda = *ldA; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (args.lda < MAX(1,args.n)) info = 4; if (args.n < 0) info = 2; if (uplo < 0) info = 1; if (info) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); *Info = - info; return 0; } *Info = 0; if (args.n == 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif #ifdef SMP args.common = NULL; args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif info = (trtri_single[uplo])(&args, NULL, NULL, sa, sb, 0); if (!info) { info = (lauum_single[uplo])(&args, NULL, NULL, sa, sb, 0); } *Info = info; #ifdef SMP } else { info = (trtri_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); if (!info) { info = (lauum_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); } *Info = info; } #endif #ifndef PPC440 blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, .5 * args.n * args.n, args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + args.n * (1./3. + args.n * (-1./2. + args.n * 1./6.))); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/trti2.c000066400000000000000000000111701313527062700175470ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QTRTI2" #elif defined(DOUBLE) #define ERROR_NAME "DTRTI2" #else #define ERROR_NAME "STRTI2" #endif static blasint (*trti2[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { #ifdef XDOUBLE qtrti2_UU, qtrti2_UN, qtrti2_LU, qtrti2_LN, #elif defined(DOUBLE) dtrti2_UU, dtrti2_UN, dtrti2_LU, dtrti2_LN, #else strti2_UU, strti2_UN, strti2_LU, strti2_LN, #endif }; int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ blas_arg_t args; blasint uplo_arg = *UPLO; blasint diag_arg = *DIAG; blasint uplo, diag; blasint info; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; PRINT_DEBUG_NAME; args.n = *N; args.a = (void *)a; args.lda = *ldA; TOUPPER(uplo_arg); TOUPPER(diag_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; diag = -1; if (diag_arg == 'U') diag = 0; if (diag_arg == 'N') diag = 1; info = 0; if (args.lda < MAX(1,args.n)) info = 5; if (args.n < 0) info = 3; if (diag < 0) info = 2; if (uplo < 0) info = 1; if (info) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); *Info = - info; return 0; } *Info = 0; if (args.n <= 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif info = (trti2[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); *Info = info; #ifndef PPC440 blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, .5 * args.n * args.n, args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + args.n * (1./3. + args.n * (-1./2. + args.n * 1./6.))); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/trtri.c000066400000000000000000000120541313527062700176510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QTRTRI" #elif defined(DOUBLE) #define ERROR_NAME "DTRTRI" #else #define ERROR_NAME "STRTRI" #endif static blasint (*trtri_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ TRTRI_UU_SINGLE, TRTRI_UN_SINGLE, TRTRI_LU_SINGLE, TRTRI_LN_SINGLE, }; #ifdef SMP static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ TRTRI_UU_PARALLEL, TRTRI_UN_PARALLEL, TRTRI_LU_PARALLEL, TRTRI_LN_PARALLEL, }; #endif int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ blas_arg_t args; blasint uplo_arg = *UPLO; blasint diag_arg = *DIAG; blasint uplo, diag; blasint info; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; PRINT_DEBUG_NAME; args.n = *N; args.a = (void *)a; args.lda = *ldA; TOUPPER(uplo_arg); TOUPPER(diag_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; diag = -1; if (diag_arg == 'U') diag = 0; if (diag_arg == 'N') diag = 1; info = 0; if (args.lda < MAX(1,args.n)) info = 5; if (args.n < 0) info = 3; if (diag < 0) info = 2; if (uplo < 0) info = 1; if (info) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); *Info = - info; return 0; } *Info = 0; if (args.n == 0) return 0; if (diag) { if (AMIN_K(args.n, args.a, args.lda + 1) == ZERO) { *Info = IAMIN_K(args.n, args.a, args.lda + 1); return 0; } } IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif #ifdef SMP args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif *Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); #ifdef SMP } else { *Info = (trtri_parallel[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); } #endif #ifndef PPC440 blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, .5 * args.n * args.n, args.n * (1./3. + args.n * ( 1./2. 
+ args.n * 1./6.)) + args.n * (1./3. + args.n * (-1./2. + args.n * 1./6.))); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/zgetf2.c000066400000000000000000000077271313527062700177210ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XGETF2" #elif defined(DOUBLE) #define ERROR_NAME "ZGETF2" #else #define ERROR_NAME "CGETF2" #endif int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint *Info){ blas_arg_t args; blasint info; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; PRINT_DEBUG_NAME; args.m = *M; args.n = *N; args.a = (void *)a; args.lda = *ldA; args.c = (void *)ipiv; info = 0; if (args.lda < MAX(1,args.m)) info = 4; if (args.n < 0) info = 2; if (args.m < 0) info = 1; if (info) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); *Info = - info; return 0; } *Info = 0; if (args.m == 0 || args.n == 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif info = GETF2(&args, NULL, NULL, sa, sb, 0); *Info = info; #ifndef PPC440 blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2. / 3. 
* args.m * args.n * args.n); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/zgetrf.c000066400000000000000000000102301313527062700200000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XGETRF" #elif defined(DOUBLE) #define ERROR_NAME "ZGETRF" #else #define ERROR_NAME "CGETRF" #endif int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint *Info){ blas_arg_t args; blasint info; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; PRINT_DEBUG_NAME; args.m = *M; args.n = *N; args.a = (void *)a; args.lda = *ldA; args.c = (void *)ipiv; info = 0; if (args.lda < MAX(1,args.m)) info = 4; if (args.n < 0) info = 2; if (args.m < 0) info = 1; if (info) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); *Info = - info; return 0; } *Info = 0; if (args.m == 0 || args.n == 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif #ifdef SMP args.common = NULL; args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif *Info = GETRF_SINGLE(&args, NULL, NULL, sa, sb, 0); #ifdef SMP } else { *Info = GETRF_PARALLEL(&args, NULL, NULL, sa, sb, 0); } #endif #ifndef PPC440 blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2. / 3. 
* args.m * args.n * args.n); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/zgetrs.c000066400000000000000000000115771313527062700200340ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XGETRS" #elif defined(DOUBLE) #define ERROR_NAME "ZGETRS" #else #define ERROR_NAME "CGETRS" #endif static blasint (*getrs_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ GETRS_N_SINGLE, GETRS_T_SINGLE, GETRS_R_SINGLE, GETRS_C_SINGLE, }; #ifdef SMP static blasint (*getrs_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ GETRS_N_PARALLEL, GETRS_T_PARALLEL, GETRS_R_PARALLEL, GETRS_C_PARALLEL, }; #endif int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv, FLOAT *b, blasint *ldB, blasint *Info){ char trans_arg = *TRANS; blas_arg_t args; blasint info; int trans; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; PRINT_DEBUG_NAME; args.m = *N; args.n = *NRHS; args.a = (void *)a; args.lda = *ldA; args.b = (void *)b; args.ldb = *ldB; args.c = (void *)ipiv; info = 0; TOUPPER(trans_arg); trans = -1; if (trans_arg == 'N') trans = 0; if (trans_arg == 'T') trans = 1; if (trans_arg == 'R') trans = 2; if (trans_arg == 'C') trans = 3; if (args.ldb < MAX(1, args.m)) info = 8; if (args.lda < MAX(1, args.m)) info = 5; if (args.n < 0) info = 3; if (args.m < 0) info = 2; if (trans < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return 0; } args.alpha = NULL; args.beta = NULL; *Info = info; if (args.m == 0 || args.n == 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif #ifdef SMP args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif (getrs_single[trans])(&args, NULL, NULL, sa, sb, 0); #ifdef SMP } else { (getrs_parallel[trans])(&args, NULL, NULL, sa, sb, 0); } #endif #ifndef PPC440 blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2 * args.m * args.m * args.n); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/zlaswp.c000066400000000000000000000076441313527062700200360ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif static int (*laswp[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, blasint *, BLASLONG) = { #ifdef XDOUBLE xlaswp_plus, xlaswp_minus, #elif defined(DOUBLE) zlaswp_plus, zlaswp_minus, #else claswp_plus, claswp_minus, #endif }; int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *ipiv, blasint *INCX){ blasint n = *N; blasint lda = *LDA; blasint k1 = *K1; blasint k2 = *K2; blasint incx = *INCX; int flag; #ifdef SMP int mode; FLOAT dummyalpha[2] = {ZERO, ZERO}; int nthreads; #endif PRINT_DEBUG_NAME; if (incx == 0 || n <= 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); flag = (incx < 0); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (laswp[flag])(n, k1, k2, ZERO, ZERO, a, lda, NULL, 0, ipiv, incx); #ifdef SMP } else { #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, laswp[flag], nthreads); } #endif FUNCTION_PROFILE_END(COMPSIZE, n * (k2 - k1), 0); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/zlauu2.c000066400000000000000000000105111313527062700177230ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QLAUU2" #elif defined(DOUBLE) #define ERROR_NAME "ZLAUU2" #else #define ERROR_NAME "CLAUU2" #endif static blasint (*lauu2[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { #ifdef XDOUBLE xlauu2_U, xlauu2_L, #elif defined(DOUBLE) zlauu2_U, zlauu2_L, #else clauu2_U, clauu2_L, #endif }; int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ blas_arg_t args; blasint uplo_arg = *UPLO; blasint uplo; blasint info; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; PRINT_DEBUG_NAME; args.n = *N; args.a = (void *)a; args.lda = *ldA; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (args.lda < MAX(1,args.n)) info = 4; if (args.n < 0) info = 2; if (uplo < 0) info = 1; if (info) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); *Info = - info; return 0; } *Info = 0; if (args.n <= 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif info = (lauu2[uplo])(&args, NULL, NULL, sa, sb, 0); *Info = info; #ifndef PPC440 blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(1, .5 * args.n * args.n, 2. * args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + 6. * 1./6. * args.n * (args.n * args.n - 1)); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/zlauum.c000066400000000000000000000111341313527062700200200ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XLAUUM" #elif defined(DOUBLE) #define ERROR_NAME "ZLAUUM" #else #define ERROR_NAME "CLAUUM" #endif static blasint (*lauum_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { LAUUM_U_SINGLE, LAUUM_L_SINGLE, }; #ifdef SMP static blasint (*lauum_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { LAUUM_U_PARALLEL, LAUUM_L_PARALLEL, }; #endif int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ blas_arg_t args; blasint uplo_arg = *UPLO; blasint uplo; blasint info; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; PRINT_DEBUG_NAME; args.n = *N; args.a = (void *)a; args.lda = *ldA; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (args.lda < MAX(1,args.n)) info = 4; if (args.n < 0) info = 2; if (uplo < 0) info = 1; if (info) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); *Info = - info; return 0; } *Info = 0; if (args.n == 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif #ifdef SMP args.common = NULL; args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif *Info = (lauum_single[uplo])(&args, NULL, NULL, sa, sb, 0); #ifdef SMP } else { *Info = (lauum_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); } #endif #ifndef PPC440 blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(1, .5 * args.n * args.n, 2. * args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + args.n * (args.n * args.n - 1)); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/zpotf2.c000066400000000000000000000105111313527062700177250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XPOTF2" #elif defined(DOUBLE) #define ERROR_NAME "ZPOTF2" #else #define ERROR_NAME "CPOTF2" #endif static blasint (*potf2[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { #ifdef XDOUBLE xpotf2_U, xpotf2_L, #elif defined(DOUBLE) zpotf2_U, zpotf2_L, #else cpotf2_U, cpotf2_L, #endif }; int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ blas_arg_t args; blasint uplo_arg = *UPLO; blasint uplo; blasint info; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; PRINT_DEBUG_NAME; args.n = *N; args.a = (void *)a; args.lda = *ldA; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (args.lda < MAX(1,args.n)) info = 4; if (args.n < 0) info = 2; if (uplo < 0) info = 1; if (info) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); *Info = - info; return 0; } *Info = 0; if (args.n <= 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif info = (potf2[uplo])(&args, NULL, NULL, sa, sb, 0); *Info = info; #ifndef PPC440 blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(1, .5 * args.n * args.n, 2. * args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + 6. * 1./6. * args.n * (args.n * args.n - 1)); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/zpotrf.c000066400000000000000000000111471313527062700200330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XPOTRF" #elif defined(DOUBLE) #define ERROR_NAME "ZPOTRF" #else #define ERROR_NAME "CPOTRF" #endif static blasint (*potrf_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ POTRF_U_SINGLE, POTRF_L_SINGLE, }; #ifdef SMP static blasint (*potrf_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ POTRF_U_PARALLEL, POTRF_L_PARALLEL, }; #endif int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ blas_arg_t args; blasint uplo_arg = *UPLO; blasint uplo; blasint info; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; PRINT_DEBUG_NAME; args.n = *N; args.a = (void *)a; args.lda = *ldA; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (args.lda < MAX(1,args.n)) info = 4; if (args.n < 0) info = 2; if (uplo < 0) info = 1; if (info) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); *Info = - info; return 0; } *Info = 0; if (args.n == 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif #ifdef SMP args.common = NULL; args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif *Info = (potrf_single[uplo])(&args, NULL, NULL, sa, sb, 0); #ifdef SMP } else { *Info = (potrf_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); } #endif #ifndef PPC440 blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(1, .5 * args.n * args.n, 2. * args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + 6. * 1./6. 
* args.n * (args.n * args.n - 1)); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/zpotri.c000066400000000000000000000121721313527062700200350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XPOTRI" #elif defined(DOUBLE) #define ERROR_NAME "ZPOTRI" #else #define ERROR_NAME "CPOTRI" #endif static blasint (*trtri_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ TRTRI_UN_SINGLE, TRTRI_LN_SINGLE, }; static blasint (*lauum_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ LAUUM_U_SINGLE, LAUUM_L_SINGLE, }; #ifdef SMP static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ TRTRI_UN_PARALLEL, TRTRI_LN_PARALLEL, }; static blasint (*lauum_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ LAUUM_U_PARALLEL, LAUUM_L_PARALLEL, }; #endif int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ blas_arg_t args; blasint uplo_arg = *UPLO; blasint uplo; blasint info; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; PRINT_DEBUG_NAME; args.n = *N; args.a = (void *)a; args.lda = *ldA; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (args.lda < MAX(1,args.n)) info = 4; if (args.n < 0) info = 2; if (uplo < 0) info = 1; if (info) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); *Info = - info; return 0; } *Info = 0; if (args.n == 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif #ifdef SMP args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif info = (trtri_single[uplo])(&args, NULL, NULL, sa, sb, 0); if (!info) { info = (lauum_single[uplo])(&args, NULL, NULL, sa, sb, 0); } *Info = info; #ifdef SMP } else { info = (trtri_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); if (!info) { info = (lauum_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); } *Info = info; } #endif #ifndef PPC440 blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, .5 * args.n * args.n, args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + args.n * (1./3. + args.n * (-1./2. + args.n * 1./6.))); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/ztrti2.c000066400000000000000000000111601313527062700177400ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XTRTI2" #elif defined(DOUBLE) #define ERROR_NAME "ZTRTI2" #else #define ERROR_NAME "CTRTI2" #endif static blasint (*trti2[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { #ifdef XDOUBLE xtrti2_UU, xtrti2_UN, xtrti2_LU, xtrti2_LN, #elif defined(DOUBLE) ztrti2_UU, ztrti2_UN, ztrti2_LU, ztrti2_LN, #else ctrti2_UU, ctrti2_UN, ctrti2_LU, ctrti2_LN, #endif }; int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ blas_arg_t args; blasint uplo_arg = *UPLO; blasint diag_arg = *DIAG; blasint uplo, diag; blasint info; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; PRINT_DEBUG_NAME; args.n = *N; args.a = (void *)a; args.lda = *ldA; TOUPPER(uplo_arg); TOUPPER(diag_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; diag = -1; if (diag_arg == 'U') diag = 0; if (diag_arg == 'N') diag = 1; info = 0; if (args.lda < MAX(1,args.n)) info = 5; if (args.n < 0) info = 3; if (diag < 0) info = 2; if (uplo < 0) info = 1; if (info) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); *Info = - info; return 0; } *Info = 0; if (args.n <= 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif info = (trti2[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); *Info = info; #ifndef PPC440 blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(1, .5 * args.n * args.n, 2. * args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + 6. * args.n * (1./3. + args.n * (-1./2. + args.n * 1./6.))); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/lapack/ztrtri.c000066400000000000000000000120671313527062700200470ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XTRTRI" #elif defined(DOUBLE) #define ERROR_NAME "ZTRTRI" #else #define ERROR_NAME "CTRTRI" #endif static blasint (*trtri_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ TRTRI_UU_SINGLE, TRTRI_UN_SINGLE, TRTRI_LU_SINGLE, TRTRI_LN_SINGLE, }; #ifdef SMP static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ TRTRI_UU_PARALLEL, TRTRI_UN_PARALLEL, TRTRI_LU_PARALLEL, TRTRI_LN_PARALLEL, }; #endif int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ blas_arg_t args; blasint uplo_arg = *UPLO; blasint diag_arg = *DIAG; blasint uplo, diag; blasint info; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; PRINT_DEBUG_NAME; args.n = *N; args.a = (void *)a; args.lda = *ldA; TOUPPER(uplo_arg); TOUPPER(diag_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; diag = -1; if (diag_arg == 'U') diag = 0; if (diag_arg == 'N') diag = 1; info = 0; if (args.lda < MAX(1,args.n)) info = 5; if (args.n < 0) info = 3; if (diag < 0) info = 2; if (uplo < 0) info = 1; if (info) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); *Info = - info; return 0; } *Info = 0; if (args.n == 0) return 0; if (diag) { if (AMIN_K(args.n, args.a, args.lda + 1) == ZERO) { *Info = IAMIN_K(args.n, args.a, args.lda + 1); return 0; } } IDEBUG_START; FUNCTION_PROFILE_START(); #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif #ifdef SMP args.common = NULL; args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif *Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); #ifdef SMP } else { *Info = (trtri_parallel[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); } #endif #ifndef PPC440 blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(1, .5 * args.n * args.n, 2. * args.n * (1./3. 
+ args.n * ( 1./2. + args.n * 1./6.)) + 6. * args.n * (1./3. + args.n * (-1./2. + args.n * 1./6.))); IDEBUG_END; return 0; } OpenBLAS-0.2.20/interface/max.c000066400000000000000000000102661313527062700160420ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #undef MAX_K #ifdef USE_ABS #ifndef USE_MIN /* ABS & MAX */ #ifndef COMPLEX #ifdef XDOUBLE #define MAX_K QAMAX_K #elif defined(DOUBLE) #define MAX_K DAMAX_K #else #define MAX_K SAMAX_K #endif #else #ifdef XDOUBLE #define MAX_K XAMAX_K #elif defined(DOUBLE) #define MAX_K ZAMAX_K #else #define MAX_K CAMAX_K #endif #endif #else /* ABS & MIN */ #ifndef COMPLEX #ifdef XDOUBLE #define MAX_K QAMIN_K #elif defined(DOUBLE) #define MAX_K DAMIN_K #else #define MAX_K SAMIN_K #endif #else #ifdef XDOUBLE #define MAX_K XAMIN_K #elif defined(DOUBLE) #define MAX_K ZAMIN_K #else #define MAX_K CAMIN_K #endif #endif #endif #else #ifndef USE_MIN /* MAX */ #ifdef XDOUBLE #define MAX_K QMAX_K #elif defined(DOUBLE) #define MAX_K DMAX_K #else #define MAX_K SMAX_K #endif #else /* MIN */ #ifdef XDOUBLE #define MAX_K QMIN_K #elif defined(DOUBLE) #define MAX_K DMIN_K #else #define MAX_K SMIN_K #endif #endif #endif #ifndef CBLAS FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ BLASLONG n = *N; BLASLONG incx = *INCX; FLOATRET ret; PRINT_DEBUG_NAME; if (n <= 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); ret = (FLOATRET)MAX_K(n, x, incx); FUNCTION_PROFILE_END(COMPSIZE, n, 0); IDEBUG_END; return ret; } #else FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ FLOAT ret; PRINT_DEBUG_CNAME; if (n <= 0) return 0; IDEBUG_START; FUNCTION_PROFILE_START(); ret = MAX_K(n, x, incx); FUNCTION_PROFILE_END(COMPSIZE, n, 0); IDEBUG_END; return ret; } #endif OpenBLAS-0.2.20/interface/netlib/000077500000000000000000000000001313527062700163615ustar00rootroot00000000000000OpenBLAS-0.2.20/interface/netlib/cgemv.f000066400000000000000000000177221313527062700176420ustar00rootroot00000000000000 SUBROUTINE CGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) * .. Scalar Arguments .. COMPLEX ALPHA,BETA INTEGER INCX,INCY,LDA,M,N CHARACTER TRANS * .. * .. Array Arguments .. COMPLEX A(LDA,*),X(*),Y(*) * .. * * Purpose * ======= * * CGEMV performs one of the matrix-vector operations * * y := alpha*A*x + beta*y, or y := alpha*A**T*x + beta*y, or * * y := alpha*A**H*x + beta*y, * * where alpha and beta are scalars, x and y are vectors and A is an * m by n matrix. * * Arguments * ========== * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' y := alpha*A*x + beta*y. * * TRANS = 'T' or 't' y := alpha*A**T*x + beta*y. * * TRANS = 'C' or 'c' y := alpha*A**H*x + beta*y. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix A. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, n ). * Before entry, the leading m by n part of the array A must * contain the matrix of coefficients. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, m ). * Unchanged on exit. * * X - COMPLEX array of DIMENSION at least * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. * Before entry, the incremented array X must contain the * vector x. * Unchanged on exit. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - COMPLEX . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y - COMPLEX array of DIMENSION at least * ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. * Before entry with BETA non-zero, the incremented array Y * must contain the vector y. On exit, Y is overwritten by the * updated vector y. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * Further Details * =============== * * Level 2 Blas routine. * The vector and matrix arguments are not referenced when N = 0, or M = 0 * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * ===================================================================== * * .. Parameters .. COMPLEX ONE PARAMETER (ONE= (1.0E+0,0.0E+0)) COMPLEX ZERO PARAMETER (ZERO= (0.0E+0,0.0E+0)) * .. * .. Local Scalars .. COMPLEX TEMP INTEGER I,INFO,IX,IY,J,JX,JY,KX,KY,LENX,LENY LOGICAL NOCONJ * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC CONJG,MAX * .. * * Test the input parameters. * INFO = 0 IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + .NOT.LSAME(TRANS,'C')) THEN INFO = 1 ELSE IF (M.LT.0) THEN INFO = 2 ELSE IF (N.LT.0) THEN INFO = 3 ELSE IF (LDA.LT.MAX(1,M)) THEN INFO = 6 ELSE IF (INCX.EQ.0) THEN INFO = 8 ELSE IF (INCY.EQ.0) THEN INFO = 11 END IF IF (INFO.NE.0) THEN CALL XERBLA('CGEMV ',INFO) RETURN END IF * * Quick return if possible. * IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN * NOCONJ = LSAME(TRANS,'T') * * Set LENX and LENY, the lengths of the vectors x and y, and set * up the start points in X and Y. * IF (LSAME(TRANS,'N')) THEN LENX = N LENY = M ELSE LENX = M LENY = N END IF IF (INCX.GT.0) THEN KX = 1 ELSE KX = 1 - (LENX-1)*INCX END IF IF (INCY.GT.0) THEN KY = 1 ELSE KY = 1 - (LENY-1)*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * * First form y := beta*y. * IF (BETA.NE.ONE) THEN IF (INCY.EQ.1) THEN IF (BETA.EQ.ZERO) THEN DO 10 I = 1,LENY Y(I) = ZERO 10 CONTINUE ELSE DO 20 I = 1,LENY Y(I) = BETA*Y(I) 20 CONTINUE END IF ELSE IY = KY IF (BETA.EQ.ZERO) THEN DO 30 I = 1,LENY Y(IY) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40 I = 1,LENY Y(IY) = BETA*Y(IY) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF (ALPHA.EQ.ZERO) RETURN IF (LSAME(TRANS,'N')) THEN * * Form y := alpha*A*x + y. * JX = KX IF (INCY.EQ.1) THEN DO 60 J = 1,N IF (X(JX).NE.ZERO) THEN TEMP = ALPHA*X(JX) DO 50 I = 1,M Y(I) = Y(I) + TEMP*A(I,J) 50 CONTINUE END IF JX = JX + INCX 60 CONTINUE ELSE DO 80 J = 1,N IF (X(JX).NE.ZERO) THEN TEMP = ALPHA*X(JX) IY = KY DO 70 I = 1,M Y(IY) = Y(IY) + TEMP*A(I,J) IY = IY + INCY 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE END IF ELSE * * Form y := alpha*A**T*x + y or y := alpha*A**H*x + y. 
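*
*        The loops below make a single pass over the columns of A:
*        for each column J, TEMP accumulates the dot product of the
*        column with x (conjugated entries when TRANS = 'C', plain
*        entries when TRANS = 'T'), and ALPHA*TEMP is added into Y(JY).
*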
* JY = KY IF (INCX.EQ.1) THEN DO 110 J = 1,N TEMP = ZERO IF (NOCONJ) THEN DO 90 I = 1,M TEMP = TEMP + A(I,J)*X(I) 90 CONTINUE ELSE DO 100 I = 1,M TEMP = TEMP + CONJG(A(I,J))*X(I) 100 CONTINUE END IF Y(JY) = Y(JY) + ALPHA*TEMP JY = JY + INCY 110 CONTINUE ELSE DO 140 J = 1,N TEMP = ZERO IX = KX IF (NOCONJ) THEN DO 120 I = 1,M TEMP = TEMP + A(I,J)*X(IX) IX = IX + INCX 120 CONTINUE ELSE DO 130 I = 1,M TEMP = TEMP + CONJG(A(I,J))*X(IX) IX = IX + INCX 130 CONTINUE END IF Y(JY) = Y(JY) + ALPHA*TEMP JY = JY + INCY 140 CONTINUE END IF END IF * RETURN * * End of CGEMV . * END OpenBLAS-0.2.20/interface/netlib/dgemv.f000066400000000000000000000165301313527062700176370ustar00rootroot00000000000000 SUBROUTINE DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) * .. Scalar Arguments .. DOUBLE PRECISION ALPHA,BETA INTEGER INCX,INCY,LDA,M,N CHARACTER TRANS * .. * .. Array Arguments .. DOUBLE PRECISION A(LDA,*),X(*),Y(*) * .. * * Purpose * ======= * * DGEMV performs one of the matrix-vector operations * * y := alpha*A*x + beta*y, or y := alpha*A**T*x + beta*y, * * where alpha and beta are scalars, x and y are vectors and A is an * m by n matrix. * * Arguments * ========== * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' y := alpha*A*x + beta*y. * * TRANS = 'T' or 't' y := alpha*A**T*x + beta*y. * * TRANS = 'C' or 'c' y := alpha*A**T*x + beta*y. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix A. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION. * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). * Before entry, the leading m by n part of the array A must * contain the matrix of coefficients. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, m ). * Unchanged on exit. * * X - DOUBLE PRECISION array of DIMENSION at least * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. * Before entry, the incremented array X must contain the * vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - DOUBLE PRECISION. * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y - DOUBLE PRECISION array of DIMENSION at least * ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. * Before entry with BETA non-zero, the incremented array Y * must contain the vector y. On exit, Y is overwritten by the * updated vector y. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * Further Details * =============== * * Level 2 Blas routine. * The vector and matrix arguments are not referenced when N = 0, or M = 0 * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * ===================================================================== * * .. Parameters .. 
DOUBLE PRECISION ONE,ZERO PARAMETER (ONE=1.0D+0,ZERO=0.0D+0) * .. * .. Local Scalars .. DOUBLE PRECISION TEMP INTEGER I,INFO,IX,IY,J,JX,JY,KX,KY,LENX,LENY * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * * Test the input parameters. * INFO = 0 IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + .NOT.LSAME(TRANS,'C')) THEN INFO = 1 ELSE IF (M.LT.0) THEN INFO = 2 ELSE IF (N.LT.0) THEN INFO = 3 ELSE IF (LDA.LT.MAX(1,M)) THEN INFO = 6 ELSE IF (INCX.EQ.0) THEN INFO = 8 ELSE IF (INCY.EQ.0) THEN INFO = 11 END IF IF (INFO.NE.0) THEN CALL XERBLA('DGEMV ',INFO) RETURN END IF * * Quick return if possible. * IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN * * Set LENX and LENY, the lengths of the vectors x and y, and set * up the start points in X and Y. * IF (LSAME(TRANS,'N')) THEN LENX = N LENY = M ELSE LENX = M LENY = N END IF IF (INCX.GT.0) THEN KX = 1 ELSE KX = 1 - (LENX-1)*INCX END IF IF (INCY.GT.0) THEN KY = 1 ELSE KY = 1 - (LENY-1)*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * * First form y := beta*y. * IF (BETA.NE.ONE) THEN IF (INCY.EQ.1) THEN IF (BETA.EQ.ZERO) THEN DO 10 I = 1,LENY Y(I) = ZERO 10 CONTINUE ELSE DO 20 I = 1,LENY Y(I) = BETA*Y(I) 20 CONTINUE END IF ELSE IY = KY IF (BETA.EQ.ZERO) THEN DO 30 I = 1,LENY Y(IY) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40 I = 1,LENY Y(IY) = BETA*Y(IY) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF (ALPHA.EQ.ZERO) RETURN IF (LSAME(TRANS,'N')) THEN * * Form y := alpha*A*x + y. * JX = KX IF (INCY.EQ.1) THEN DO 60 J = 1,N IF (X(JX).NE.ZERO) THEN TEMP = ALPHA*X(JX) DO 50 I = 1,M Y(I) = Y(I) + TEMP*A(I,J) 50 CONTINUE END IF JX = JX + INCX 60 CONTINUE ELSE DO 80 J = 1,N IF (X(JX).NE.ZERO) THEN TEMP = ALPHA*X(JX) IY = KY DO 70 I = 1,M Y(IY) = Y(IY) + TEMP*A(I,J) IY = IY + INCY 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE END IF ELSE * * Form y := alpha*A**T*x + y. * JY = KY IF (INCX.EQ.1) THEN DO 100 J = 1,N TEMP = ZERO DO 90 I = 1,M TEMP = TEMP + A(I,J)*X(I) 90 CONTINUE Y(JY) = Y(JY) + ALPHA*TEMP JY = JY + INCY 100 CONTINUE ELSE DO 120 J = 1,N TEMP = ZERO IX = KX DO 110 I = 1,M TEMP = TEMP + A(I,J)*X(IX) IX = IX + INCX 110 CONTINUE Y(JY) = Y(JY) + ALPHA*TEMP JY = JY + INCY 120 CONTINUE END IF END IF * RETURN * * End of DGEMV . * END OpenBLAS-0.2.20/interface/netlib/sgemv.f000066400000000000000000000164501313527062700176570ustar00rootroot00000000000000 SUBROUTINE SGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) * .. Scalar Arguments .. REAL ALPHA,BETA INTEGER INCX,INCY,LDA,M,N CHARACTER TRANS * .. * .. Array Arguments .. REAL A(LDA,*),X(*),Y(*) * .. * * Purpose * ======= * * SGEMV performs one of the matrix-vector operations * * y := alpha*A*x + beta*y, or y := alpha*A**T*x + beta*y, * * where alpha and beta are scalars, x and y are vectors and A is an * m by n matrix. * * Arguments * ========== * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' y := alpha*A*x + beta*y. * * TRANS = 'T' or 't' y := alpha*A**T*x + beta*y. * * TRANS = 'C' or 'c' y := alpha*A**T*x + beta*y. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix A. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * Unchanged on exit. 
* * ALPHA - REAL . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - REAL array of DIMENSION ( LDA, n ). * Before entry, the leading m by n part of the array A must * contain the matrix of coefficients. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, m ). * Unchanged on exit. * * X - REAL array of DIMENSION at least * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. * Before entry, the incremented array X must contain the * vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - REAL . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y - REAL array of DIMENSION at least * ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. * Before entry with BETA non-zero, the incremented array Y * must contain the vector y. On exit, Y is overwritten by the * updated vector y. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * Further Details * =============== * * Level 2 Blas routine. * The vector and matrix arguments are not referenced when N = 0, or M = 0 * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * ===================================================================== * * .. Parameters .. REAL ONE,ZERO PARAMETER (ONE=1.0E+0,ZERO=0.0E+0) * .. * .. Local Scalars .. REAL TEMP INTEGER I,INFO,IX,IY,J,JX,JY,KX,KY,LENX,LENY * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * * Test the input parameters. * INFO = 0 IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + .NOT.LSAME(TRANS,'C')) THEN INFO = 1 ELSE IF (M.LT.0) THEN INFO = 2 ELSE IF (N.LT.0) THEN INFO = 3 ELSE IF (LDA.LT.MAX(1,M)) THEN INFO = 6 ELSE IF (INCX.EQ.0) THEN INFO = 8 ELSE IF (INCY.EQ.0) THEN INFO = 11 END IF IF (INFO.NE.0) THEN CALL XERBLA('SGEMV ',INFO) RETURN END IF * * Quick return if possible. * IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN * * Set LENX and LENY, the lengths of the vectors x and y, and set * up the start points in X and Y. * IF (LSAME(TRANS,'N')) THEN LENX = N LENY = M ELSE LENX = M LENY = N END IF IF (INCX.GT.0) THEN KX = 1 ELSE KX = 1 - (LENX-1)*INCX END IF IF (INCY.GT.0) THEN KY = 1 ELSE KY = 1 - (LENY-1)*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * * First form y := beta*y. * IF (BETA.NE.ONE) THEN IF (INCY.EQ.1) THEN IF (BETA.EQ.ZERO) THEN DO 10 I = 1,LENY Y(I) = ZERO 10 CONTINUE ELSE DO 20 I = 1,LENY Y(I) = BETA*Y(I) 20 CONTINUE END IF ELSE IY = KY IF (BETA.EQ.ZERO) THEN DO 30 I = 1,LENY Y(IY) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40 I = 1,LENY Y(IY) = BETA*Y(IY) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF (ALPHA.EQ.ZERO) RETURN IF (LSAME(TRANS,'N')) THEN * * Form y := alpha*A*x + y. 
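*
*        The loops below make one pass over the columns of A:
*        whenever X(JX) is nonzero, TEMP = ALPHA*X(JX) is formed once
*        and column J of A, scaled by TEMP, is added into y; zero
*        elements of x are simply skipped.
*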
* JX = KX IF (INCY.EQ.1) THEN DO 60 J = 1,N IF (X(JX).NE.ZERO) THEN TEMP = ALPHA*X(JX) DO 50 I = 1,M Y(I) = Y(I) + TEMP*A(I,J) 50 CONTINUE END IF JX = JX + INCX 60 CONTINUE ELSE DO 80 J = 1,N IF (X(JX).NE.ZERO) THEN TEMP = ALPHA*X(JX) IY = KY DO 70 I = 1,M Y(IY) = Y(IY) + TEMP*A(I,J) IY = IY + INCY 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE END IF ELSE * * Form y := alpha*A**T*x + y. * JY = KY IF (INCX.EQ.1) THEN DO 100 J = 1,N TEMP = ZERO DO 90 I = 1,M TEMP = TEMP + A(I,J)*X(I) 90 CONTINUE Y(JY) = Y(JY) + ALPHA*TEMP JY = JY + INCY 100 CONTINUE ELSE DO 120 J = 1,N TEMP = ZERO IX = KX DO 110 I = 1,M TEMP = TEMP + A(I,J)*X(IX) IX = IX + INCX 110 CONTINUE Y(JY) = Y(JY) + ALPHA*TEMP JY = JY + INCY 120 CONTINUE END IF END IF * RETURN * * End of SGEMV . * END OpenBLAS-0.2.20/interface/netlib/zgemv.f000066400000000000000000000177711313527062700176750ustar00rootroot00000000000000 SUBROUTINE ZGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) * .. Scalar Arguments .. DOUBLE COMPLEX ALPHA,BETA INTEGER INCX,INCY,LDA,M,N CHARACTER TRANS * .. * .. Array Arguments .. DOUBLE COMPLEX A(LDA,*),X(*),Y(*) * .. * * Purpose * ======= * * ZGEMV performs one of the matrix-vector operations * * y := alpha*A*x + beta*y, or y := alpha*A**T*x + beta*y, or * * y := alpha*A**H*x + beta*y, * * where alpha and beta are scalars, x and y are vectors and A is an * m by n matrix. * * Arguments * ========== * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' y := alpha*A*x + beta*y. * * TRANS = 'T' or 't' y := alpha*A**T*x + beta*y. * * TRANS = 'C' or 'c' y := alpha*A**H*x + beta*y. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix A. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX*16 . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, n ). * Before entry, the leading m by n part of the array A must * contain the matrix of coefficients. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, m ). * Unchanged on exit. * * X - COMPLEX*16 array of DIMENSION at least * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. * Before entry, the incremented array X must contain the * vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - COMPLEX*16 . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y - COMPLEX*16 array of DIMENSION at least * ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. * Before entry with BETA non-zero, the incremented array Y * must contain the vector y. On exit, Y is overwritten by the * updated vector y. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * Further Details * =============== * * Level 2 Blas routine. * The vector and matrix arguments are not referenced when N = 0, or M = 0 * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. 
* Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * ===================================================================== * * .. Parameters .. DOUBLE COMPLEX ONE PARAMETER (ONE= (1.0D+0,0.0D+0)) DOUBLE COMPLEX ZERO PARAMETER (ZERO= (0.0D+0,0.0D+0)) * .. * .. Local Scalars .. DOUBLE COMPLEX TEMP INTEGER I,INFO,IX,IY,J,JX,JY,KX,KY,LENX,LENY LOGICAL NOCONJ * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC DCONJG,MAX * .. * * Test the input parameters. * INFO = 0 IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + .NOT.LSAME(TRANS,'C')) THEN INFO = 1 ELSE IF (M.LT.0) THEN INFO = 2 ELSE IF (N.LT.0) THEN INFO = 3 ELSE IF (LDA.LT.MAX(1,M)) THEN INFO = 6 ELSE IF (INCX.EQ.0) THEN INFO = 8 ELSE IF (INCY.EQ.0) THEN INFO = 11 END IF IF (INFO.NE.0) THEN CALL XERBLA('ZGEMV ',INFO) RETURN END IF * * Quick return if possible. * IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN * NOCONJ = LSAME(TRANS,'T') * * Set LENX and LENY, the lengths of the vectors x and y, and set * up the start points in X and Y. * IF (LSAME(TRANS,'N')) THEN LENX = N LENY = M ELSE LENX = M LENY = N END IF IF (INCX.GT.0) THEN KX = 1 ELSE KX = 1 - (LENX-1)*INCX END IF IF (INCY.GT.0) THEN KY = 1 ELSE KY = 1 - (LENY-1)*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * * First form y := beta*y. * IF (BETA.NE.ONE) THEN IF (INCY.EQ.1) THEN IF (BETA.EQ.ZERO) THEN DO 10 I = 1,LENY Y(I) = ZERO 10 CONTINUE ELSE DO 20 I = 1,LENY Y(I) = BETA*Y(I) 20 CONTINUE END IF ELSE IY = KY IF (BETA.EQ.ZERO) THEN DO 30 I = 1,LENY Y(IY) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40 I = 1,LENY Y(IY) = BETA*Y(IY) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF (ALPHA.EQ.ZERO) RETURN IF (LSAME(TRANS,'N')) THEN * * Form y := alpha*A*x + y. * JX = KX IF (INCY.EQ.1) THEN DO 60 J = 1,N IF (X(JX).NE.ZERO) THEN TEMP = ALPHA*X(JX) DO 50 I = 1,M Y(I) = Y(I) + TEMP*A(I,J) 50 CONTINUE END IF JX = JX + INCX 60 CONTINUE ELSE DO 80 J = 1,N IF (X(JX).NE.ZERO) THEN TEMP = ALPHA*X(JX) IY = KY DO 70 I = 1,M Y(IY) = Y(IY) + TEMP*A(I,J) IY = IY + INCY 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE END IF ELSE * * Form y := alpha*A**T*x + y or y := alpha*A**H*x + y. * JY = KY IF (INCX.EQ.1) THEN DO 110 J = 1,N TEMP = ZERO IF (NOCONJ) THEN DO 90 I = 1,M TEMP = TEMP + A(I,J)*X(I) 90 CONTINUE ELSE DO 100 I = 1,M TEMP = TEMP + DCONJG(A(I,J))*X(I) 100 CONTINUE END IF Y(JY) = Y(JY) + ALPHA*TEMP JY = JY + INCY 110 CONTINUE ELSE DO 140 J = 1,N TEMP = ZERO IX = KX IF (NOCONJ) THEN DO 120 I = 1,M TEMP = TEMP + A(I,J)*X(IX) IX = IX + INCX 120 CONTINUE ELSE DO 130 I = 1,M TEMP = TEMP + DCONJG(A(I,J))*X(IX) IX = IX + INCX 130 CONTINUE END IF Y(JY) = Y(JY) + ALPHA*TEMP JY = JY + INCY 140 CONTINUE END IF END IF * RETURN * * End of ZGEMV . * END OpenBLAS-0.2.20/interface/nrm2.c000066400000000000000000000064611313527062700161350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef CBLAS FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ BLASLONG n = *N; BLASLONG incx = *INCX; FLOATRET ret; PRINT_DEBUG_NAME; if (n <= 0) return 0.; IDEBUG_START; FUNCTION_PROFILE_START(); ret = (FLOATRET)NRM2_K(n, x, incx); FUNCTION_PROFILE_END(COMPSIZE, n, 2 * n); IDEBUG_END; return ret; } #else FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ FLOAT ret; PRINT_DEBUG_CNAME; if (n <= 0) return 0.; IDEBUG_START; FUNCTION_PROFILE_START(); ret = NRM2_K(n, x, incx); FUNCTION_PROFILE_END(COMPSIZE, n, 2 * n); IDEBUG_END; return ret; } #endif OpenBLAS-0.2.20/interface/omatcopy.c000066400000000000000000000103421313527062700171030ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /*********************************************************** * 2014/06/09 Saar ***********************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #if defined(DOUBLE) #define ERROR_NAME "DOMATCOPY" #else #define ERROR_NAME "SOMATCOPY" #endif #define BlasRowMajor 0 #define BlasColMajor 1 #define BlasNoTrans 0 #define BlasTrans 1 #ifndef CBLAS void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb) { char Order, Trans; int order=-1,trans=-1; blasint info = -1; Order = *ORDER; Trans = *TRANS; TOUPPER(Order); TOUPPER(Trans); if ( Order == 'C' ) order = BlasColMajor; if ( Order == 'R' ) order = BlasRowMajor; if ( Trans == 'N' ) trans = BlasNoTrans; if ( Trans == 'R' ) trans = BlasNoTrans; if ( Trans == 'T' ) trans = BlasTrans; if ( Trans == 'C' ) trans = BlasTrans; #else void CNAME(enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, FLOAT calpha, FLOAT *a, blasint clda, FLOAT *b, blasint cldb) { blasint *rows, *cols, *lda, *ldb; FLOAT *alpha; int order=-1,trans=-1; blasint info = -1; if ( CORDER == CblasColMajor ) order = BlasColMajor; if ( CORDER == CblasRowMajor ) order = BlasRowMajor; if ( CTRANS == CblasNoTrans || CTRANS == CblasConjNoTrans ) trans = BlasNoTrans; if ( CTRANS == CblasTrans || CTRANS == CblasConjTrans ) trans = BlasTrans; rows = &crows; cols = &ccols; lda = &clda; ldb = &cldb; alpha = &calpha; #endif if ( order == BlasColMajor) { if ( trans == BlasNoTrans && *ldb < *rows ) info = 9; if ( trans == BlasTrans && *ldb < *cols ) info = 9; } if ( order == BlasRowMajor) { if ( trans == BlasNoTrans && *ldb < *cols ) info = 9; if ( trans == BlasTrans && *ldb < *rows ) info = 9; } if ( order == BlasColMajor && *lda < *rows ) info = 7; if ( order == BlasRowMajor && *lda < *cols ) info = 7; if ( *cols <= 0 ) info = 4; if ( *rows <= 0 ) info = 3; if ( trans < 0 ) info = 2; if ( order < 0 ) info = 1; if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } if ( order == BlasColMajor ) { if ( trans == BlasNoTrans ) { OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *ldb ); } else { OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *ldb ); } } else { if ( trans == BlasNoTrans ) { OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *ldb ); } else { OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *ldb ); } } return; } OpenBLAS-0.2.20/interface/rot.c000066400000000000000000000064441313527062700160640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef CBLAS void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){ BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; FLOAT c = *C; FLOAT s = *S; PRINT_DEBUG_NAME; #else void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT c, FLOAT s){ PRINT_DEBUG_CNAME; #endif if (n <= 0) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (n - 1) * incx; if (incy < 0) y -= (n - 1) * incy; ROT_K(n, x, incx, y, incy, c, s); FUNCTION_PROFILE_END(1, n, n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/rotg.c000066400000000000000000000032331313527062700162240ustar00rootroot00000000000000#include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef CBLAS void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ #else void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ #endif #if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86) long double da = *DA; long double db = *DB; long double c; long double s; long double r, roe, z; long double ada = fabs(da); long double adb = fabs(db); long double scale = ada + adb; #ifndef CBLAS PRINT_DEBUG_NAME; #else PRINT_DEBUG_CNAME; #endif roe = db; if (ada > adb) roe = da; if (scale == ZERO) { *C = ONE; *S = ZERO; *DA = ZERO; *DB = ZERO; } else { r = sqrt(da * da + db * db); if (roe < 0) r = -r; c = da / r; s = db / r; z = ONE; if (da != ZERO) { if (ada > adb){ z = s; } else { z = ONE / c; } } *C = c; *S = s; *DA = r; *DB = z; } #else FLOAT da = *DA; FLOAT db = *DB; FLOAT c = *C; FLOAT s = *S; FLOAT r, roe, z; FLOAT ada = fabs(da); FLOAT adb = fabs(db); FLOAT scale = ada + adb; #ifndef CBLAS PRINT_DEBUG_NAME; #else PRINT_DEBUG_CNAME; #endif roe = db; if (ada > adb) roe = da; if (scale == ZERO) { *C = ONE; *S = ZERO; *DA = ZERO; *DB = ZERO; } else { FLOAT aa = da / scale; FLOAT bb = db / scale; r = 
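/* da and db were pre-scaled by |da| + |db| above, so squaring the
   scaled values guards against overflow/underflow under the
   square root */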
scale * sqrt(aa * aa + bb * bb); if (roe < 0) r = -r; c = da / r; s = db / r; z = ONE; if (ada > adb) z = s; if ((ada <= adb) && (c != ZERO)) z = ONE / c; *C = c; *S = s; *DA = r; *DB = z; } #endif return; } OpenBLAS-0.2.20/interface/rotm.c000066400000000000000000000052511313527062700162340ustar00rootroot00000000000000#include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef CBLAS void NAME(blasint *N, FLOAT *dx, blasint *INCX, FLOAT *dy, blasint *INCY, FLOAT *dparam){ blasint n = *N; blasint incx = *INCX; blasint incy = *INCY; #else void CNAME(blasint n, FLOAT *dx, blasint incx, FLOAT *dy, blasint incy, FLOAT *dparam){ #endif blasint i__1, i__2; blasint i__; FLOAT w, z__; blasint kx, ky; FLOAT dh11, dh12, dh22, dh21, dflag; blasint nsteps; #ifndef CBLAS PRINT_DEBUG_CNAME; #else PRINT_DEBUG_CNAME; #endif --dparam; --dy; --dx; dflag = dparam[1]; if (n <= 0 || dflag == - 2.0) goto L140; if (! (incx == incy && incx > 0)) goto L70; nsteps = n * incx; if (dflag < 0.) { goto L50; } else if (dflag == 0) { goto L10; } else { goto L30; } L10: dh12 = dparam[4]; dh21 = dparam[3]; i__1 = nsteps; i__2 = incx; for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { w = dx[i__]; z__ = dy[i__]; dx[i__] = w + z__ * dh12; dy[i__] = w * dh21 + z__; /* L20: */ } goto L140; L30: dh11 = dparam[2]; dh22 = dparam[5]; i__2 = nsteps; i__1 = incx; for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) { w = dx[i__]; z__ = dy[i__]; dx[i__] = w * dh11 + z__; dy[i__] = -w + dh22 * z__; /* L40: */ } goto L140; L50: dh11 = dparam[2]; dh12 = dparam[4]; dh21 = dparam[3]; dh22 = dparam[5]; i__1 = nsteps; i__2 = incx; for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { w = dx[i__]; z__ = dy[i__]; dx[i__] = w * dh11 + z__ * dh12; dy[i__] = w * dh21 + z__ * dh22; /* L60: */ } goto L140; L70: kx = 1; ky = 1; if (incx < 0) { kx = (1 - n) * incx + 1; } if (incy < 0) { ky = (1 - n) * incy + 1; } if (dflag < 0.) { goto L120; } else if (dflag == 0) { goto L80; } else { goto L100; } L80: dh12 = dparam[4]; dh21 = dparam[3]; i__2 = n; for (i__ = 1; i__ <= i__2; ++i__) { w = dx[kx]; z__ = dy[ky]; dx[kx] = w + z__ * dh12; dy[ky] = w * dh21 + z__; kx += incx; ky += incy; /* L90: */ } goto L140; L100: dh11 = dparam[2]; dh22 = dparam[5]; i__2 = n; for (i__ = 1; i__ <= i__2; ++i__) { w = dx[kx]; z__ = dy[ky]; dx[kx] = w * dh11 + z__; dy[ky] = -w + dh22 * z__; kx += incx; ky += incy; /* L110: */ } goto L140; L120: dh11 = dparam[2]; dh12 = dparam[4]; dh21 = dparam[3]; dh22 = dparam[5]; i__2 = n; for (i__ = 1; i__ <= i__2; ++i__) { w = dx[kx]; z__ = dy[ky]; dx[kx] = w * dh11 + z__ * dh12; dy[ky] = w * dh21 + z__ * dh22; kx += incx; ky += incy; /* L130: */ } L140: return; } OpenBLAS-0.2.20/interface/rotmg.c000066400000000000000000000112751313527062700164060ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2014/05/02 Saar * fixed two bugs as reported by Brendan Tracey * Test with lapack-3.5.0 : OK * **************************************************************************************/ #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #define GAM 4096.e0 #define GAMSQ 16777216.e0 #define RGAMSQ 5.9604645e-8 #define TWO 2.e0 #ifdef DOUBLE #define ABS(x) fabs(x) #else #define ABS(x) fabsf(x) #endif #ifndef CBLAS void NAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT *DY1, FLOAT *dparam){ FLOAT dy1 = *DY1; #else void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ #endif FLOAT du, dp1, dp2, dq2, dq1, dh11=ZERO, dh21=ZERO, dh12=ZERO, dh22=ZERO, dflag=-ONE, dtemp; if(*dd1 < ZERO) { dflag = -ONE; dh11 = ZERO; dh12 = ZERO; dh21 = ZERO; dh22 = ZERO; *dd1 = ZERO; *dd2 = ZERO; *dx1 = ZERO; } else { dp2 = *dd2 * dy1; if(dp2 == ZERO) { dflag = -TWO; dparam[0] = dflag; return; } dp1 = *dd1 * *dx1; dq2 = dp2 * dy1; dq1 = dp1 * *dx1; if(ABS(dq1) > ABS(dq2)) { dh21 = - dy1 / *dx1; dh12 = dp2 / dp1; du = ONE - dh12 * dh21; if(du > ZERO) { dflag = ZERO; *dd1 = *dd1 / du; *dd2 = *dd2 / du; *dx1 = *dx1 * du; } } else { if(dq2 < ZERO) { dflag = -ONE; dh11 = ZERO; dh12 = ZERO; dh21 = ZERO; dh22 = ZERO; *dd1 = ZERO; *dd2 = ZERO; *dx1 = ZERO; } else { dflag = ONE; dh11 = dp1 / dp2; dh22 = *dx1 / dy1; du = ONE + dh11 * dh22; dtemp = *dd2 / du; *dd2 = *dd1 / du; *dd1 = dtemp; *dx1 = dy1 * du; } } if(*dd1 != ZERO) { while( (*dd1 <= RGAMSQ) || (*dd1 >= GAMSQ) ) { if(dflag == ZERO) { dh11 = ONE; dh22 = ONE; dflag = -ONE; } else { if(dflag == ONE) { dh21 = -ONE; dh12 = ONE; dflag = -ONE; } } if( *dd1 <= RGAMSQ ) { *dd1 = *dd1 * (GAM * GAM); *dx1 = *dx1 / GAM; dh11 = dh11 / GAM; dh12 = dh12 / GAM; } else { *dd1 = *dd1 / (GAM * GAM); *dx1 = *dx1 * GAM; dh11 = dh11 * GAM; dh12 = dh12 * GAM; } } } if(*dd2 != ZERO) { while( (ABS(*dd2) <= RGAMSQ) || (ABS(*dd2) >= GAMSQ) ) { if(dflag == ZERO) { dh11 = ONE; dh22 = ONE; dflag = -ONE; } else { if(dflag == ONE) { dh21 = -ONE; dh12 = ONE; dflag = -ONE; } } if( ABS(*dd2) <= RGAMSQ ) { *dd2 = *dd2 * (GAM * GAM); dh21 = dh21 / GAM; dh22 = dh22 / GAM; } else { *dd2 = *dd2 / (GAM * GAM); dh21 = dh21 * GAM; dh22 = dh22 * GAM; } } } } if(dflag < ZERO) { dparam[1] = dh11; dparam[2] = dh21; dparam[3] = dh12; dparam[4] = dh22; } else { if(dflag == ZERO) { dparam[2] = dh21; dparam[3] = dh12; } else { dparam[1] = dh11; dparam[4] = dh22; } } dparam[0] = dflag; 
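/* dparam[0] carries the flag; dparam[1], dparam[2], dparam[3] and
   dparam[4] carry dh11, dh21, dh12 and dh22 respectively, and only
   the entries that are meaningful for this flag value were written
   in the branches above. */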
return; } OpenBLAS-0.2.20/interface/sbmv.c000066400000000000000000000142411313527062700162210ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif /* #ifdef SMP #ifdef __64BIT__ #define SMPTEST 1 #endif #endif */ #ifdef XDOUBLE #define ERROR_NAME "QSBMV " #elif defined(DOUBLE) #define ERROR_NAME "DSBMV " #else #define ERROR_NAME "SSBMV " #endif static int (*sbmv[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { #ifdef XDOUBLE qsbmv_U, qsbmv_L, #elif defined(DOUBLE) dsbmv_U, dsbmv_L, #else ssbmv_U, ssbmv_L, #endif }; #ifdef SMPTEST static int (*sbmv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE qsbmv_thread_U, qsbmv_thread_L, #elif defined(DOUBLE) dsbmv_thread_U, dsbmv_thread_L, #else ssbmv_thread_U, ssbmv_thread_L, #endif }; #endif #ifndef CBLAS void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char uplo_arg = *UPLO; blasint n = *N; blasint k = *K; FLOAT alpha = *ALPHA; blasint lda = *LDA; blasint incx = *INCX; FLOAT beta = *BETA; blasint incy = *INCY; blasint info; int uplo; FLOAT *buffer; #ifdef SMPTEST int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < k + 1) info = 6; if (k < 0) info = 3; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, blasint k, FLOAT alpha, FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT beta, FLOAT *y, blasint incy){ FLOAT *buffer; int uplo; blasint info; #ifdef SMPTEST int nthreads; #endif PRINT_DEBUG_CNAME; uplo = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; info = -1; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < k + 1) info = 6; if (k < 0) info = 3; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; info = -1; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < k + 1) info = 6; if (k < 0) info = 3; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; if (incy < 0 ) y -= (n - 1) * incy; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMPTEST nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (sbmv[uplo])(n, k, alpha, a, lda, x, incx, y, incy, buffer); #ifdef SMPTEST } else { (sbmv_thread[uplo])(n, k, alpha, a, lda, x, incx, y, incy, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(1, n * k / 2 + n, n * k); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/scal.c000066400000000000000000000072251313527062700162000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef CBLAS void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX){ blasint n = *N; blasint incx = *INCX; FLOAT alpha = *ALPHA; #else void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ #endif #ifdef SMP int mode, nthreads; #endif #ifndef CBLAS PRINT_DEBUG_NAME; #else PRINT_DEBUG_CNAME; #endif if (incx <= 0 || n <= 0) return; if (alpha == ONE) return; IDEBUG_START; FUNCTION_PROFILE_START(); #ifdef SMP nthreads = num_cpu_avail(1); if (n <= 1048576 ) nthreads = 1; if (nthreads == 1) { #endif SCAL_K(n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0); #ifdef SMP } else { #ifdef DOUBLE mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif blas_level1_thread(mode, n, 0, 0, #ifndef CBLAS ALPHA, #else &alpha, #endif x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); } #endif FUNCTION_PROFILE_END(1, n, n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/sdsdot.c000066400000000000000000000071171313527062700165560ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef CBLAS FLOATRET NAME(blasint *N, FLOAT *a, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; FLOATRET ret; PRINT_DEBUG_NAME; if (n <= 0) return(*a) ; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (n - 1) * incx; if (incy < 0) y -= (n - 1) * incy; ret = (FLOATRET)(SDSDOT_K(n, x, incx, y, incy) + *a); FUNCTION_PROFILE_END(1, 2 * n, 2 * n); IDEBUG_END; return ret; } #else FLOAT CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ FLOAT ret; PRINT_DEBUG_CNAME; if (n <= 0) return (alpha); IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (n - 1) * incx; if (incy < 0) y -= (n - 1) * incy; ret = SDSDOT_K(n, x, incx, y, incy) + alpha; FUNCTION_PROFILE_END(1, 2 * n, 2 * n); IDEBUG_END; return ret; } #endif OpenBLAS-0.2.20/interface/spmv.c000066400000000000000000000133451313527062700162430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QSPMV " #elif defined(DOUBLE) #define ERROR_NAME "DSPMV " #else #define ERROR_NAME "SSPMV " #endif static int (*spmv[])(BLASLONG, FLOAT, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { #ifdef XDOUBLE qspmv_U, qspmv_L, #elif defined(DOUBLE) dspmv_U, dspmv_L, #else sspmv_U, sspmv_L, #endif }; #ifdef SMPTEST static int (*spmv_thread[])(BLASLONG, FLOAT, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE qspmv_thread_U, qspmv_thread_L, #elif defined(DOUBLE) dspmv_thread_U, dspmv_thread_L, #else sspmv_thread_U, sspmv_thread_L, #endif }; #endif #ifndef CBLAS void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha = *ALPHA; blasint incx = *INCX; FLOAT beta = *BETA; blasint incy = *INCY; blasint info; int uplo; FLOAT *buffer; #ifdef SMPTEST int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incy == 0) info = 9; if (incx == 0) info = 6; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *a, FLOAT *x, blasint incx, FLOAT beta, FLOAT *y, blasint incy){ FLOAT *buffer; int uplo; blasint info; #ifdef SMPTEST int nthreads; #endif PRINT_DEBUG_CNAME; uplo = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; info = -1; if (incy == 0) info = 9; if (incx == 0) info = 6; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; info = -1; if (incy == 0) info = 9; if (incx == 0) info = 6; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; if (incy < 0 ) y -= (n - 1) * incy; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMPTEST nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (spmv[uplo])(n, alpha, a, x, incx, y, incy, buffer); #ifdef SMPTEST } else { (spmv_thread[uplo])(n, alpha, a, x, incx, y, incy, buffer, nthreads); } #endif 
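/* release the work buffer obtained from blas_memory_alloc() above
   before recording the profile counters and returning */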
blas_memory_free(buffer); FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/spr.c000066400000000000000000000124321313527062700160560ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QSPR " #elif defined(DOUBLE) #define ERROR_NAME "DSPR " #else #define ERROR_NAME "SSPR " #endif static int (*spr[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, FLOAT *) = { #ifdef XDOUBLE qspr_U, qspr_L, #elif defined(DOUBLE) dspr_U, dspr_L, #else sspr_U, sspr_L, #endif }; #ifdef SMP static int (*spr_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, FLOAT *, int) = { #ifdef XDOUBLE qspr_thread_U, qspr_thread_L, #elif defined(DOUBLE) dspr_thread_U, dspr_thread_L, #else sspr_thread_U, sspr_thread_L, #endif }; #endif #ifndef CBLAS void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *a){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha = *ALPHA; blasint incx = *INCX; blasint info; int uplo; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a) { FLOAT *buffer; int uplo; blasint info; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; uplo = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; info = -1; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; info = -1; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; if (alpha == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (spr[uplo])(n, alpha, x, incx, a, buffer); #ifdef SMP } else { (spr_thread[uplo])(n, alpha, x, incx, a, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/spr2.c000066400000000000000000000131211313527062700161340ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QSPR2 " #elif defined(DOUBLE) #define ERROR_NAME "DSPR2 " #else #define ERROR_NAME "SSPR2 " #endif static int (*spr2[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *) = { #ifdef XDOUBLE qspr2_U, qspr2_L, #elif defined(DOUBLE) dspr2_U, dspr2_L, #else sspr2_U, sspr2_L, #endif }; #ifdef SMP static int (*spr2_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *, int) = { #ifdef XDOUBLE qspr2_thread_U, qspr2_thread_L, #elif defined(DOUBLE) dspr2_thread_U, dspr2_thread_L, #else sspr2_thread_U, sspr2_thread_L, #endif }; #endif #ifndef CBLAS void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha = *ALPHA; blasint incx = *INCX; blasint incy = *INCY; blasint info; int uplo; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a) { FLOAT *buffer; int uplo; blasint info; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; uplo = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; info = -1; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; info = -1; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; if (alpha == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; if (incy < 0 ) y -= (n - 1) * incy; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (spr2[uplo])(n, alpha, x, incx, y, incy, a, buffer); #ifdef SMP } else { (spr2_thread[uplo])(n, alpha, x, incx, y, incy, a, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(1, n * n / 2 + 2 * n, 2 * n * n); IDEBUG_END; return; } 
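/*
 * Usage sketch (illustrative only).  The spr.c/spr2.c wrappers above just
 * validate their arguments, rewind x (and y) when an increment is negative,
 * and dispatch on the uplo flag, switching to the *_thread_* variants when
 * more than one CPU is available.  Assuming the standard CBLAS prototype
 * cblas_sspr() from <cblas.h>, a minimal caller of the single-precision
 * packed rank-1 update A := alpha*x*x**T + A could look like this:
 */
#if 0 /* illustrative example, not library code */
#include <stdio.h>
#include <cblas.h>

int main(void) {
  /* 3x3 symmetric matrix, upper triangle packed column by column:
     ap = { a11, a12, a22, a13, a23, a33 } -- here the identity matrix. */
  float ap[6] = {1, 0, 1, 0, 0, 1};
  float x[3]  = {1, 2, 3};

  /* only the packed upper triangle is updated */
  cblas_sspr(CblasColMajor, CblasUpper, 3, 2.0f, x, 1, ap);

  for (int i = 0; i < 6; i++) printf("%g ", ap[i]);
  printf("\n");
  return 0;
}
#endif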
OpenBLAS-0.2.20/interface/swap.c000066400000000000000000000102721313527062700162240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #if defined(THUNDERX2T99) || defined(VULCAN) // Multithreaded swap gives performance benefits in ThunderX2T99 #else // Disable multi-threading as it does not show any performance // benefits. Keep the multi-threading code for the record. #undef SMP #endif #ifndef CBLAS void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ blasint n = *N; blasint incx = *INCX; blasint incy = *INCY; #else void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ #endif #ifdef SMP int mode, nthreads; FLOAT dummyalpha[2] = {ZERO, ZERO}; #endif #ifndef CBLAS PRINT_DEBUG_NAME; #else PRINT_DEBUG_CNAME; #endif if (n <= 0) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (n - 1) * incx; if (incy < 0) y -= (n - 1) * incy; #ifdef SMP //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. 
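//A size cutoff applies as well: threading is only attempted once the vector
//length reaches 2097152 * GEMM_MULTITHREAD_THRESHOLD bytes worth of elements
//(hence the division by sizeof(FLOAT) below); shorter swaps stay on a single
//thread.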
if (incx == 0 || incy == 0 || n < 2097152 * GEMM_MULTITHREAD_THRESHOLD / sizeof(FLOAT)) nthreads = 1; else nthreads = num_cpu_avail(1); if (nthreads == 1) { #endif SWAP_K(n, 0, 0, ZERO, x, incx, y, incy, NULL, 0); #ifdef SMP } else { #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif blas_level1_thread(mode, n, 0, 0, dummyalpha, x, incx, y, incy, NULL, 0, (void *)SWAP_K, nthreads); } #endif FUNCTION_PROFILE_END(1, 2 * n, 0); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/symm.c000066400000000000000000000240071313527062700162400ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef COMPLEX #ifdef XDOUBLE #define ERROR_NAME "QSYMM " #elif defined(DOUBLE) #define ERROR_NAME "DSYMM " #else #define ERROR_NAME "SSYMM " #endif #else #ifndef GEMM3M #ifndef HEMM #ifdef XDOUBLE #define ERROR_NAME "XSYMM " #elif defined(DOUBLE) #define ERROR_NAME "ZSYMM " #else #define ERROR_NAME "CSYMM " #endif #else #ifdef XDOUBLE #define ERROR_NAME "XHEMM " #elif defined(DOUBLE) #define ERROR_NAME "ZHEMM " #else #define ERROR_NAME "CHEMM " #endif #endif #else #ifndef HEMM #ifdef XDOUBLE #define ERROR_NAME "XSYMM3M " #elif defined(DOUBLE) #define ERROR_NAME "ZSYMM3M " #else #define ERROR_NAME "CSYMM3M " #endif #else #ifdef XDOUBLE #define ERROR_NAME "XHEMM3M " #elif defined(DOUBLE) #define ERROR_NAME "ZHEMM3M " #else #define ERROR_NAME "CHEMM3M " #endif #endif #endif #endif #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE #define MODE (BLAS_XDOUBLE | BLAS_REAL) #elif defined(DOUBLE) #define MODE (BLAS_DOUBLE | BLAS_REAL) #else #define MODE (BLAS_SINGLE | BLAS_REAL) #endif #else #ifdef XDOUBLE #define MODE (BLAS_XDOUBLE | BLAS_COMPLEX) #elif defined(DOUBLE) #define MODE (BLAS_DOUBLE | BLAS_COMPLEX) #else #define MODE (BLAS_SINGLE | BLAS_COMPLEX) #endif #endif #endif static int (*symm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { #ifndef GEMM3M #ifndef HEMM SYMM_LU, SYMM_LL, SYMM_RU, SYMM_RL, #if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) SYMM_THREAD_LU, SYMM_THREAD_LL, SYMM_THREAD_RU, SYMM_THREAD_RL, #endif #else HEMM_LU, HEMM_LL, HEMM_RU, HEMM_RL, #if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) HEMM_THREAD_LU, HEMM_THREAD_LL, HEMM_THREAD_RU, HEMM_THREAD_RL, #endif #endif #else #ifndef HEMM SYMM3M_LU, SYMM3M_LL, SYMM3M_RU, SYMM3M_RL, #if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) SYMM3M_THREAD_LU, SYMM3M_THREAD_LL, SYMM3M_THREAD_RU, SYMM3M_THREAD_RL, #endif #else HEMM3M_LU, HEMM3M_LL, HEMM3M_RU, HEMM3M_RL, #if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) HEMM3M_THREAD_LU, HEMM3M_THREAD_LL, HEMM3M_THREAD_RU, HEMM3M_THREAD_RL, #endif #endif #endif }; #ifndef CBLAS void NAME(char *SIDE, char *UPLO, blasint *M, blasint *N, FLOAT *alpha, FLOAT *a, blasint *ldA, FLOAT *b, blasint *ldB, FLOAT *beta, FLOAT *c, blasint *ldC){ char side_arg = *SIDE; char uplo_arg = *UPLO; blas_arg_t args; FLOAT *buffer; FLOAT *sa, *sb; #if defined(SMP) && !defined(NO_AFFINITY) int nodes; #endif blasint info; int side; int uplo; PRINT_DEBUG_NAME; args.alpha = (void *)alpha; args.beta = (void *)beta; TOUPPER(side_arg); TOUPPER(uplo_arg); side = -1; uplo = -1; if (side_arg == 'L') side = 0; if (side_arg == 'R') side = 1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; args.m = *M; args.n = *N; args.c = (void *)c; args.ldc = *ldC; info = 0; if (args.ldc < MAX(1, args.m)) info = 12; if (!side) { args.a = (void *)a; args.b = (void *)b; args.lda = *ldA; args.ldb = *ldB; if (args.ldb < MAX(1, args.m)) info = 9; if (args.lda < MAX(1, args.m)) info = 7; } else { args.a = (void *)b; args.b = (void *)a; args.lda = *ldB; args.ldb = *ldA; if (args.lda < MAX(1, args.m)) info = 9; if (args.ldb < MAX(1, args.n)) info = 7; } if (args.n < 0) info = 4; if (args.m < 0) info = 3; if (uplo < 0) info = 2; if (side < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, 
blasint m, blasint n, #ifndef COMPLEX FLOAT alpha, #else FLOAT *alpha, #endif FLOAT *a, blasint lda, FLOAT *b, blasint ldb, #ifndef COMPLEX FLOAT beta, #else FLOAT *beta, #endif FLOAT *c, blasint ldc) { blas_arg_t args; int side, uplo; blasint info; FLOAT *buffer; FLOAT *sa, *sb; #if defined(SMP) && !defined(NO_AFFINITY) int nodes; #endif PRINT_DEBUG_CNAME; #ifndef COMPLEX args.alpha = (void *)α args.beta = (void *)β #else args.alpha = (void *)alpha; args.beta = (void *)beta; #endif args.c = (void *)c; args.ldc = ldc; side = -1; uplo = -1; info = 0; if (order == CblasColMajor) { if (Side == CblasLeft) side = 0; if (Side == CblasRight) side = 1; if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; info = -1; args.m = m; args.n = n; if (args.ldc < MAX(1, args.m)) info = 12; if (!side) { args.a = (void *)a; args.b = (void *)b; args.lda = lda; args.ldb = ldb; if (args.ldb < MAX(1, args.m)) info = 9; if (args.lda < MAX(1, args.m)) info = 7; } else { args.a = (void *)b; args.b = (void *)a; args.lda = ldb; args.ldb = lda; if (args.lda < MAX(1, args.m)) info = 9; if (args.ldb < MAX(1, args.n)) info = 7; } if (args.n < 0) info = 4; if (args.m < 0) info = 3; if (uplo < 0) info = 2; if (side < 0) info = 1; } if (order == CblasRowMajor) { if (Side == CblasLeft) side = 1; if (Side == CblasRight) side = 0; if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; info = -1; args.m = n; args.n = m; if (args.ldc < MAX(1, args.m)) info = 12; if (!side) { args.a = (void *)a; args.b = (void *)b; args.lda = lda; args.ldb = ldb; if (args.ldb < MAX(1, args.m)) info = 9; if (args.lda < MAX(1, args.m)) info = 7; } else { args.a = (void *)b; args.b = (void *)a; args.lda = ldb; args.ldb = lda; if (args.lda < MAX(1, args.m)) info = 9; if (args.ldb < MAX(1, args.n)) info = 7; } if (args.n < 0) info = 4; if (args.m < 0) info = 3; if (uplo < 0) info = 2; if (side < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (args.m == 0 || args.n == 0) return; IDEBUG_START; FUNCTION_PROFILE_START(); buffer = (FLOAT *)blas_memory_alloc(0); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #ifdef SMP args.common = NULL; args.nthreads = num_cpu_avail(3); if (args.nthreads == 1) { #endif (symm[(side << 1) | uplo ])(&args, NULL, NULL, sa, sb, 0); #ifdef SMP } else { #ifndef NO_AFFINITY nodes = get_num_nodes(); if (nodes > 1) { args.nthreads /= nodes; gemm_thread_mn(MODE, &args, NULL, NULL, symm[4 | (side << 1) | uplo ], sa, sb, nodes); } else { #endif #ifndef USE_SIMPLE_THREADED_LEVEL3 (symm[4 | (side << 1) | uplo ])(&args, NULL, NULL, sa, sb, 0); #else GEMM_THREAD(MODE, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads); #endif #ifndef NO_AFFINITY } #endif } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, (!side)? args.m * (args.m / 2 + args.n) : args.n * (args.m + args.n / 2), (!side)? 2 * args.m * args.m * args.n : 2 * args.m * args.n * args.n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/symv.c000066400000000000000000000137231313527062700162540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QSYMV " #elif defined(DOUBLE) #define ERROR_NAME "DSYMV " #else #define ERROR_NAME "SSYMV " #endif #ifndef CBLAS void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha = *ALPHA; blasint lda = *LDA; blasint incx = *INCX; FLOAT beta = *BETA; blasint incy = *INCY; int (*symv[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { SYMV_U, SYMV_L, }; #ifdef SMP int (*symv_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { SYMV_THREAD_U, SYMV_THREAD_L, }; #endif blasint info; int uplo; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incy == 0) info = 10; if (incx == 0) info = 7; if (lda < MAX(1, n)) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT beta, FLOAT *y, blasint incy) { FLOAT *buffer; int uplo; blasint info; #ifdef SMP int nthreads; #endif int (*symv[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { SYMV_U, SYMV_L, }; #ifdef SMP int (*symv_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { SYMV_THREAD_U, SYMV_THREAD_L, }; #endif PRINT_DEBUG_CNAME; uplo = -1; info = 0; if (order == 
CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; info = -1; if (incy == 0) info = 10; if (incx == 0) info = 7; if (lda < MAX(1, n)) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; info = -1; if (incy == 0) info = 10; if (incx == 0) info = 7; if (lda < MAX(1, n)) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; if (incy < 0 ) y -= (n - 1) * incy; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (symv[uplo])(n, n, alpha, a, lda, x, incx, y, incy, buffer); #ifdef SMP } else { (symv_thread[uplo])(n, alpha, a, lda, x, incx, y, incy, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(1, n * n / 2 + 2 * n, 2 * n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/syr.c000066400000000000000000000126531313527062700160740ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QSYR " #elif defined(DOUBLE) #define ERROR_NAME "DSYR " #else #define ERROR_NAME "SSYR " #endif static int (*syr[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { #ifdef XDOUBLE qsyr_U, qsyr_L, #elif defined(DOUBLE) dsyr_U, dsyr_L, #else ssyr_U, ssyr_L, #endif }; #ifdef SMP static int (*syr_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE qsyr_thread_U, qsyr_thread_L, #elif defined(DOUBLE) dsyr_thread_U, dsyr_thread_L, #else ssyr_thread_U, ssyr_thread_L, #endif }; #endif #ifndef CBLAS void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *a, blasint *LDA){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha = *ALPHA; blasint lda = *LDA; blasint incx = *INCX; blasint info; int uplo; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (lda < MAX(1, n)) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) { FLOAT *buffer; int uplo; blasint info; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; uplo = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; info = -1; if (lda < MAX(1, n)) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; info = -1; if (lda < MAX(1, n)) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; if (alpha == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (syr[uplo])(n, alpha, x, incx, a, lda, buffer); #ifdef SMP } else { (syr_thread[uplo])(n, alpha, x, incx, a, lda, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/syr2.c000066400000000000000000000133361313527062700161550ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QSYR2 " #elif defined(DOUBLE) #define ERROR_NAME "DSYR2 " #else #define ERROR_NAME "SSYR2 " #endif static int (*syr2[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { #ifdef XDOUBLE qsyr2_U, qsyr2_L, #elif defined(DOUBLE) dsyr2_U, dsyr2_L, #else ssyr2_U, ssyr2_L, #endif }; #ifdef SMP static int (*syr2_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE qsyr2_thread_U, qsyr2_thread_L, #elif defined(DOUBLE) dsyr2_thread_U, dsyr2_thread_L, #else ssyr2_thread_U, ssyr2_thread_L, #endif }; #endif #ifndef CBLAS void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a, blasint *LDA){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha = *ALPHA; blasint lda = *LDA; blasint incx = *INCX; blasint incy = *INCY; blasint info; int uplo; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (lda < MAX(1, n)) info = 9; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) { FLOAT *buffer; int uplo; blasint info; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; uplo = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; info = -1; if (lda < MAX(1, n)) info = 9; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; info = -1; if (lda < MAX(1, n)) info = 9; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; if (alpha == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; if (incy < 0 ) y 
-= (n - 1) * incy; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (syr2[uplo])(n, alpha, x, incx, y, incy, a, lda, buffer); #ifdef SMP } else { (syr2_thread[uplo])(n, alpha, x, incx, y, incy, a, lda, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(1, n * n / 2 + 2 * n, 2 * n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/syr2k.c000066400000000000000000000224111313527062700163220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef COMPLEX #ifdef XDOUBLE #define ERROR_NAME "QSYR2K" #elif defined(DOUBLE) #define ERROR_NAME "DSYR2K" #else #define ERROR_NAME "SSYR2K" #endif #else #ifndef HEMM #ifdef XDOUBLE #define ERROR_NAME "XSYR2K" #elif defined(DOUBLE) #define ERROR_NAME "ZSYR2K" #else #define ERROR_NAME "CSYR2K" #endif #else #ifdef XDOUBLE #define ERROR_NAME "XHER2K" #elif defined(DOUBLE) #define ERROR_NAME "ZHER2K" #else #define ERROR_NAME "CHER2K" #endif #endif #endif static int (*syr2k[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { #ifndef HEMM SYR2K_UN, SYR2K_UC, SYR2K_LN, SYR2K_LC, #else HER2K_UN, HER2K_UC, HER2K_LN, HER2K_LC, #endif }; #ifndef CBLAS void NAME(char *UPLO, char *TRANS, blasint *N, blasint *K, FLOAT *alpha, FLOAT *a, blasint *ldA, FLOAT *b, blasint *ldB, FLOAT *beta, FLOAT *c, blasint *ldC){ char uplo_arg = *UPLO; char trans_arg = *TRANS; blas_arg_t args; FLOAT *buffer; FLOAT *sa, *sb; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif blasint info; int uplo; int trans; int nrowa; PRINT_DEBUG_NAME; args.n = *N; args.k = *K; args.a = (void *)a; args.b = (void *)b; args.c = (void *)c; args.lda = *ldA; args.ldb = *ldB; args.ldc = *ldC; args.alpha = (void *)alpha; args.beta = (void *)beta; TOUPPER(uplo_arg); TOUPPER(trans_arg); uplo = -1; trans = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; #ifndef COMPLEX if (trans_arg == 'N') trans = 0; if (trans_arg == 'T') trans = 1; if (trans_arg == 'C') trans = 1; #else #ifdef HEMM if (trans_arg == 'N') trans = 0; if (trans_arg == 'C') trans = 1; #else if (trans_arg == 'N') trans = 0; if (trans_arg == 'T') trans = 1; #endif #endif nrowa = args.n; if (trans & 1) nrowa = args.k; info = 0; if (args.ldc < MAX(1,args.n)) info = 12; if (args.ldb < MAX(1,nrowa)) info = 9; if (args.lda < MAX(1,nrowa)) info = 7; if (args.k < 0) info = 4; if (args.n < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint n, blasint k, #ifndef COMPLEX FLOAT alpha, #else FLOAT *alpha, #endif FLOAT *a, blasint lda, FLOAT *b, blasint ldb, #if !defined(COMPLEX) || defined(HEMM) FLOAT beta, #else FLOAT *beta, #endif FLOAT *c, blasint ldc) { blas_arg_t args; int uplo, trans; blasint info, nrowa; FLOAT *buffer; FLOAT *sa, *sb; #ifdef HEMM FLOAT CAlpha[2]; #endif #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif PRINT_DEBUG_CNAME; args.n = n; args.k = k; args.a = (void *)a; args.b = (void *)b; args.c = (void *)c; args.lda = lda; args.ldb = ldb; args.ldc = ldc; #ifndef COMPLEX args.alpha = (void *)α #else args.alpha = (void *)alpha; #endif #if 
!defined(COMPLEX) || defined(HEMM) args.beta = (void *)β #else args.beta = (void *)beta; #endif trans = -1; uplo = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; if (Trans == CblasNoTrans) trans = 0; #ifndef COMPLEX if (Trans == CblasTrans) trans = 1; if (Trans == CblasConjNoTrans) trans = 0; if (Trans == CblasConjTrans) trans = 1; #elif !defined(HEMM) if (Trans == CblasTrans) trans = 1; #else if (Trans == CblasConjTrans) trans = 1; #endif info = -1; nrowa = args.n; if (trans & 1) nrowa = args.k; if (args.ldc < MAX(1,args.n)) info = 12; if (args.ldb < MAX(1,nrowa)) info = 9; if (args.lda < MAX(1,nrowa)) info = 7; if (args.k < 0) info = 4; if (args.n < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { #ifdef HEMM CAlpha[0] = alpha[0]; CAlpha[1] = -alpha[1]; args.alpha = (void *)CAlpha; #endif if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; if (Trans == CblasNoTrans) trans = 1; #ifndef COMPLEX if (Trans == CblasTrans) trans = 0; if (Trans == CblasConjNoTrans) trans = 1; if (Trans == CblasConjTrans) trans = 0; #elif !defined(HEMM) if (Trans == CblasTrans) trans = 0; #else if (Trans == CblasConjTrans) trans = 0; #endif info = -1; nrowa = args.n; if (trans & 1) nrowa = args.k; if (args.ldc < MAX(1,args.n)) info = 12; if (args.ldb < MAX(1,nrowa)) info = 9; if (args.lda < MAX(1,nrowa)) info = 7; if (args.k < 0) info = 4; if (args.n < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (args.n == 0) return; IDEBUG_START; FUNCTION_PROFILE_START(); buffer = (FLOAT *)blas_memory_alloc(0); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #ifdef SMP if (!trans){ mode |= (BLAS_TRANSA_N | BLAS_TRANSB_T); } else { mode |= (BLAS_TRANSA_T | BLAS_TRANSB_N); } mode |= (uplo << BLAS_UPLO_SHIFT); args.common = NULL; args.nthreads = num_cpu_avail(3); if (args.nthreads == 1) { #endif (syr2k[(uplo << 1) | trans ])(&args, NULL, NULL, sa, sb, 0); #ifdef SMP } else { syrk_thread(mode, &args, NULL, NULL, syr2k[(uplo << 1) | trans ], sa, sb, args.nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, 2 * args.n * args.k + args.n * args.n, 2 * args.n * args.n * args.k); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/syrk.c000066400000000000000000000223011313527062700162360ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef COMPLEX #ifdef XDOUBLE #define ERROR_NAME "QSYRK " #elif defined(DOUBLE) #define ERROR_NAME "DSYRK " #else #define ERROR_NAME "SSYRK " #endif #else #ifndef HEMM #ifdef XDOUBLE #define ERROR_NAME "XSYRK " #elif defined(DOUBLE) #define ERROR_NAME "ZSYRK " #else #define ERROR_NAME "CSYRK " #endif #else #ifdef XDOUBLE #define ERROR_NAME "XHERK " #elif defined(DOUBLE) #define ERROR_NAME "ZHERK " #else #define ERROR_NAME "CHERK " #endif #endif #endif static int (*syrk[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { #ifndef HEMM SYRK_UN, SYRK_UC, SYRK_LN, SYRK_LC, #if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) SYRK_THREAD_UN, SYRK_THREAD_UC, SYRK_THREAD_LN, SYRK_THREAD_LC, #endif #else HERK_UN, HERK_UC, HERK_LN, HERK_LC, #if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) HERK_THREAD_UN, HERK_THREAD_UC, HERK_THREAD_LN, HERK_THREAD_LC, #endif #endif }; #ifndef CBLAS void NAME(char *UPLO, char *TRANS, blasint *N, blasint *K, FLOAT *alpha, FLOAT *a, blasint *ldA, FLOAT *beta, FLOAT *c, blasint *ldC){ char uplo_arg = *UPLO; char trans_arg = *TRANS; blas_arg_t args; FLOAT *buffer; FLOAT *sa, *sb; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif blasint info; int uplo; int trans; int nrowa; PRINT_DEBUG_NAME; args.n = *N; args.k = *K; args.a = (void *)a; args.c = (void *)c; args.lda = *ldA; args.ldc = *ldC; args.alpha = (void *)alpha; args.beta = (void *)beta; TOUPPER(uplo_arg); TOUPPER(trans_arg); uplo = -1; trans = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; #ifndef COMPLEX if (trans_arg == 'N') trans = 0; if (trans_arg == 'T') trans = 1; if (trans_arg == 'C') trans = 1; #else #ifdef HEMM if (trans_arg == 'N') trans = 0; if (trans_arg == 'C') trans = 1; #else if (trans_arg == 'N') trans = 0; if (trans_arg == 'T') trans = 1; #endif #endif nrowa = args.n; if (trans & 1) nrowa = args.k; info = 0; if (args.ldc < MAX(1,args.n)) info = 10; if (args.lda < 
MAX(1,nrowa)) info = 7; if (args.k < 0) info = 4; if (args.n < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint n, blasint k, #if !defined(COMPLEX) || defined(HEMM) FLOAT alpha, #else FLOAT *alpha, #endif FLOAT *a, blasint lda, #if !defined(COMPLEX) || defined(HEMM) FLOAT beta, #else FLOAT *beta, #endif FLOAT *c, blasint ldc) { blas_arg_t args; int uplo, trans; blasint info, nrowa; FLOAT *buffer; FLOAT *sa, *sb; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif PRINT_DEBUG_CNAME; args.n = n; args.k = k; args.a = (void *)a; args.c = (void *)c; args.lda = lda; args.ldc = ldc; #if !defined(COMPLEX) || defined(HEMM) args.alpha = (void *)α args.beta = (void *)β #else args.alpha = (void *)alpha; args.beta = (void *)beta; #endif trans = -1; uplo = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; if (Trans == CblasNoTrans) trans = 0; #ifndef COMPLEX if (Trans == CblasTrans) trans = 1; if (Trans == CblasConjNoTrans) trans = 0; if (Trans == CblasConjTrans) trans = 1; #elif !defined(HEMM) if (Trans == CblasTrans) trans = 1; #else if (Trans == CblasConjTrans) trans = 1; #endif info = -1; nrowa = args.n; if (trans & 1) nrowa = args.k; if (args.ldc < MAX(1,args.n)) info = 10; if (args.lda < MAX(1,nrowa)) info = 7; if (args.k < 0) info = 4; if (args.n < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; if (Trans == CblasNoTrans) trans = 1; #ifndef COMPLEX if (Trans == CblasTrans) trans = 0; if (Trans == CblasConjNoTrans) trans = 1; if (Trans == CblasConjTrans) trans = 0; #elif !defined(HEMM) if (Trans == CblasTrans) trans = 0; #else if (Trans == CblasConjTrans) trans = 0; #endif info = -1; nrowa = args.n; if (trans & 1) nrowa = args.k; if (args.ldc < MAX(1,args.n)) info = 10; if (args.lda < MAX(1,nrowa)) info = 7; if (args.k < 0) info = 4; if (args.n < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (args.n == 0) return; IDEBUG_START; FUNCTION_PROFILE_START(); buffer = (FLOAT *)blas_memory_alloc(0); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #ifdef SMP if (!trans){ mode |= (BLAS_TRANSA_N | BLAS_TRANSB_T); } else { mode |= (BLAS_TRANSA_T | BLAS_TRANSB_N); } mode |= (uplo << BLAS_UPLO_SHIFT); args.common = NULL; args.nthreads = num_cpu_avail(3); if (args.nthreads == 1) { #endif (syrk[(uplo << 1) | trans ])(&args, NULL, NULL, sa, sb, 0); #ifdef SMP } else { #ifndef USE_SIMPLE_THREADED_LEVEL3 (syrk[4 | (uplo << 1) | trans ])(&args, NULL, NULL, sa, sb, 0); #else syrk_thread(mode, &args, NULL, NULL, syrk[(uplo << 1) | trans ], sa, sb, args.nthreads); #endif } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.n * args.k + args.n * args.n / 2, args.n * args.n * args.k); 
IDEBUG_END; return; } OpenBLAS-0.2.20/interface/tbmv.c000066400000000000000000000166171313527062700162330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QTBMV " #elif defined(DOUBLE) #define ERROR_NAME "DTBMV " #else #define ERROR_NAME "STBMV " #endif static int (*tbmv[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { #ifdef XDOUBLE qtbmv_NUU, qtbmv_NUN, qtbmv_NLU, qtbmv_NLN, qtbmv_TUU, qtbmv_TUN, qtbmv_TLU, qtbmv_TLN, #elif defined(DOUBLE) dtbmv_NUU, dtbmv_NUN, dtbmv_NLU, dtbmv_NLN, dtbmv_TUU, dtbmv_TUN, dtbmv_TLU, dtbmv_TLN, #else stbmv_NUU, stbmv_NUN, stbmv_NLU, stbmv_NLN, stbmv_TUU, stbmv_TUN, stbmv_TLU, stbmv_TLN, #endif }; #ifdef SMP static int (*tbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE qtbmv_thread_NUU, qtbmv_thread_NUN, qtbmv_thread_NLU, qtbmv_thread_NLN, qtbmv_thread_TUU, qtbmv_thread_TUN, qtbmv_thread_TLU, qtbmv_thread_TLN, #elif defined(DOUBLE) dtbmv_thread_NUU, dtbmv_thread_NUN, dtbmv_thread_NLU, dtbmv_thread_NLN, dtbmv_thread_TUU, dtbmv_thread_TUN, dtbmv_thread_TLU, dtbmv_thread_TLN, #else stbmv_thread_NUU, stbmv_thread_NUN, stbmv_thread_NLU, stbmv_thread_NLN, stbmv_thread_TUU, stbmv_thread_TUN, stbmv_thread_TLU, stbmv_thread_TLN, #endif }; #endif #ifndef CBLAS void NAME(char *UPLO, char *TRANS, char *DIAG, blasint *N, blasint *K, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; blasint n = *N; blasint k = *K; blasint lda = *LDA; blasint incx = *INCX; blasint info; int uplo; int unit; int trans; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); TOUPPER(trans_arg); TOUPPER(diag_arg); trans = -1; unit = -1; uplo = -1; if (trans_arg == 'N') trans = 0; if (trans_arg == 'T') trans = 1; if (trans_arg == 'R') trans = 0; if (trans_arg == 'C') trans = 1; if (diag_arg == 'U') unit = 0; if (diag_arg == 'N') unit = 1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incx == 0) info = 9; if (lda < k + 1) info = 7; if (k < 0) info = 5; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint n, blasint k, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { int trans, uplo, unit; blasint info; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; unit = -1; uplo = -1; trans = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 0; if (TransA == CblasConjTrans) trans = 1; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 9; if (lda < k + 1) info = 7; if (k < 0) info = 5; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 1; if (TransA == CblasConjTrans) trans = 0; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 9; if (lda < k + 1) info = 7; if (k < 0) 
info = 5; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (tbmv[(trans<<2) | (uplo<<1) | unit])(n, k, a, lda, x, incx, buffer); #ifdef SMP } else { (tbmv_thread[(trans<<2) | (uplo<<1) | unit])(n, k, a, lda, x, incx, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(1, n * k / 2 + n, n * k); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/tbsv.c000066400000000000000000000150261313527062700162320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QTBSV " #elif defined(DOUBLE) #define ERROR_NAME "DTBSV " #else #define ERROR_NAME "STBSV " #endif static int (*tbsv[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { #ifdef XDOUBLE qtbsv_NUU, qtbsv_NUN, qtbsv_NLU, qtbsv_NLN, qtbsv_TUU, qtbsv_TUN, qtbsv_TLU, qtbsv_TLN, #elif defined(DOUBLE) dtbsv_NUU, dtbsv_NUN, dtbsv_NLU, dtbsv_NLN, dtbsv_TUU, dtbsv_TUN, dtbsv_TLU, dtbsv_TLN, #else stbsv_NUU, stbsv_NUN, stbsv_NLU, stbsv_NLN, stbsv_TUU, stbsv_TUN, stbsv_TLU, stbsv_TLN, #endif }; #ifndef CBLAS void NAME(char *UPLO, char *TRANS, char *DIAG, blasint *N, blasint *K, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; blasint n = *N; blasint k = *K; blasint lda = *LDA; blasint incx = *INCX; blasint info; int uplo; int unit; int trans; FLOAT *buffer; PRINT_DEBUG_NAME; TOUPPER(uplo_arg); TOUPPER(trans_arg); TOUPPER(diag_arg); trans = -1; unit = -1; uplo = -1; if (trans_arg == 'N') trans = 0; if (trans_arg == 'T') trans = 1; if (trans_arg == 'R') trans = 0; if (trans_arg == 'C') trans = 1; if (diag_arg == 'U') unit = 0; if (diag_arg == 'N') unit = 1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incx == 0) info = 9; if (lda < k + 1) info = 7; if (k < 0) info = 5; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint n, blasint k, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { int trans, uplo, unit; blasint info; FLOAT *buffer; PRINT_DEBUG_CNAME; unit = -1; uplo = -1; trans = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 0; if (TransA == CblasConjTrans) trans = 1; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 9; if (lda < k + 1) info = 7; if (k < 0) info = 5; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 1; if (TransA == CblasConjTrans) trans = 0; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 9; if (lda < k + 1) info = 7; if (k < 0) info = 5; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; buffer = (FLOAT *)blas_memory_alloc(1); (tbsv[(trans<<2) | (uplo<<1) | unit])(n, k, a, lda, x, incx, buffer); blas_memory_free(buffer); FUNCTION_PROFILE_END(1, n * k / 2 + n, n * k); IDEBUG_END; return; } 
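/*
 * Usage sketch (illustrative only).  tbmv.c/tbsv.c above select a kernel
 * through the index (trans << 2) | (uplo << 1) | unit; note that the banded
 * solve, unlike the banded multiply, has no threaded path.  Assuming the
 * standard CBLAS prototype cblas_stbsv() from <cblas.h> and the usual
 * column-major band storage (element A(i,j) of an upper band kept at
 * ab[k + i - j + j*lda]), a minimal caller could look like this:
 */
#if 0 /* illustrative example, not library code */
#include <stdio.h>
#include <cblas.h>

int main(void) {
  /* Upper bidiagonal A = [[2,1,0],[0,2,1],[0,0,2]]: k = 1 superdiagonal,
     packed column by column with lda = k + 1 = 2 (first slot unused).   */
  float ab[6] = {0, 2, 1, 2, 1, 2};
  float x[3]  = {4, 5, 2};               /* right-hand side, overwritten */

  /* solve A * x = b by back substitution on the band */
  cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit,
              3, 1, ab, 2, x, 1);

  printf("%g %g %g\n", x[0], x[1], x[2]); /* 1 2 1 */
  return 0;
}
#endif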
OpenBLAS-0.2.20/interface/tpmv.c000066400000000000000000000160231313527062700162400ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QTPMV " #elif defined(DOUBLE) #define ERROR_NAME "DTPMV " #else #define ERROR_NAME "STPMV " #endif static int (*tpmv[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, void *) = { #ifdef XDOUBLE qtpmv_NUU, qtpmv_NUN, qtpmv_NLU, qtpmv_NLN, qtpmv_TUU, qtpmv_TUN, qtpmv_TLU, qtpmv_TLN, #elif defined(DOUBLE) dtpmv_NUU, dtpmv_NUN, dtpmv_NLU, dtpmv_NLN, dtpmv_TUU, dtpmv_TUN, dtpmv_TLU, dtpmv_TLN, #else stpmv_NUU, stpmv_NUN, stpmv_NLU, stpmv_NLN, stpmv_TUU, stpmv_TUN, stpmv_TLU, stpmv_TLN, #endif }; #ifdef SMP static int (*tpmv_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE qtpmv_thread_NUU, qtpmv_thread_NUN, qtpmv_thread_NLU, qtpmv_thread_NLN, qtpmv_thread_TUU, qtpmv_thread_TUN, qtpmv_thread_TLU, qtpmv_thread_TLN, #elif defined(DOUBLE) dtpmv_thread_NUU, dtpmv_thread_NUN, dtpmv_thread_NLU, dtpmv_thread_NLN, dtpmv_thread_TUU, dtpmv_thread_TUN, dtpmv_thread_TLU, dtpmv_thread_TLN, #else stpmv_thread_NUU, stpmv_thread_NUN, stpmv_thread_NLU, stpmv_thread_NLN, stpmv_thread_TUU, stpmv_thread_TUN, stpmv_thread_TLU, stpmv_thread_TLN, #endif }; #endif #ifndef CBLAS void NAME(char *UPLO, char *TRANS, char *DIAG, blasint *N, FLOAT *a, FLOAT *x, blasint *INCX){ char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; blasint n = *N; blasint incx = *INCX; blasint info; int uplo; int unit; int trans; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); TOUPPER(trans_arg); TOUPPER(diag_arg); trans = -1; unit = -1; uplo = -1; if (trans_arg == 'N') trans = 0; if (trans_arg == 'T') trans = 1; if (trans_arg == 'R') trans = 0; if (trans_arg == 'C') trans = 1; if (diag_arg == 'U') unit = 0; if (diag_arg == 'N') unit = 1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incx == 0) info = 7; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint n, FLOAT *a, FLOAT *x, blasint incx) { int trans, uplo, unit; blasint info; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; unit = -1; uplo = -1; trans = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 0; if (TransA == CblasConjTrans) trans = 1; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 7; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 1; if (TransA == CblasConjTrans) trans = 0; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 7; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 
) x -= (n - 1) * incx; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (tpmv[(trans<<2) | (uplo<<1) | unit])(n, a, x, incx, buffer); #ifdef SMP } else { (tpmv_thread[(trans<<2) | (uplo<<1) | unit])(n, a, x, incx, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/tpsv.c000066400000000000000000000142661313527062700162550ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QTPSV " #elif defined(DOUBLE) #define ERROR_NAME "DTPSV " #else #define ERROR_NAME "STPSV " #endif static int (*tpsv[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, void *) = { #ifdef XDOUBLE qtpsv_NUU, qtpsv_NUN, qtpsv_NLU, qtpsv_NLN, qtpsv_TUU, qtpsv_TUN, qtpsv_TLU, qtpsv_TLN, #elif defined(DOUBLE) dtpsv_NUU, dtpsv_NUN, dtpsv_NLU, dtpsv_NLN, dtpsv_TUU, dtpsv_TUN, dtpsv_TLU, dtpsv_TLN, #else stpsv_NUU, stpsv_NUN, stpsv_NLU, stpsv_NLN, stpsv_TUU, stpsv_TUN, stpsv_TLU, stpsv_TLN, #endif }; #ifndef CBLAS void NAME(char *UPLO, char *TRANS, char *DIAG, blasint *N, FLOAT *a, FLOAT *x, blasint *INCX){ char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; blasint n = *N; blasint incx = *INCX; blasint info; int uplo; int unit; int trans; FLOAT *buffer; PRINT_DEBUG_NAME; TOUPPER(uplo_arg); TOUPPER(trans_arg); TOUPPER(diag_arg); trans = -1; unit = -1; uplo = -1; if (trans_arg == 'N') trans = 0; if (trans_arg == 'T') trans = 1; if (trans_arg == 'R') trans = 0; if (trans_arg == 'C') trans = 1; if (diag_arg == 'U') unit = 0; if (diag_arg == 'N') unit = 1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incx == 0) info = 7; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint n, FLOAT *a, FLOAT *x, blasint incx) { int trans, uplo, unit; blasint info; FLOAT *buffer; PRINT_DEBUG_CNAME; unit = -1; uplo = -1; trans = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 0; if (TransA == CblasConjTrans) trans = 1; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 7; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 1; if (TransA == CblasConjTrans) trans = 0; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 7; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; buffer = (FLOAT *)blas_memory_alloc(1); (tpsv[(trans<<2) | (uplo<<1) | unit])(n, a, x, incx, buffer); blas_memory_free(buffer); FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/trmv.c000066400000000000000000000163321313527062700162450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QTRMV " #elif defined(DOUBLE) #define ERROR_NAME "DTRMV " #else #define ERROR_NAME "STRMV " #endif static int (*trmv[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { #ifdef XDOUBLE qtrmv_NUU, qtrmv_NUN, qtrmv_NLU, qtrmv_NLN, qtrmv_TUU, qtrmv_TUN, qtrmv_TLU, qtrmv_TLN, #elif defined(DOUBLE) dtrmv_NUU, dtrmv_NUN, dtrmv_NLU, dtrmv_NLN, dtrmv_TUU, dtrmv_TUN, dtrmv_TLU, dtrmv_TLN, #else strmv_NUU, strmv_NUN, strmv_NLU, strmv_NLN, strmv_TUU, strmv_TUN, strmv_TLU, strmv_TLN, #endif }; #ifdef SMP static int (*trmv_thread[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE qtrmv_thread_NUU, qtrmv_thread_NUN, qtrmv_thread_NLU, qtrmv_thread_NLN, qtrmv_thread_TUU, qtrmv_thread_TUN, qtrmv_thread_TLU, qtrmv_thread_TLN, #elif defined(DOUBLE) dtrmv_thread_NUU, dtrmv_thread_NUN, dtrmv_thread_NLU, dtrmv_thread_NLN, dtrmv_thread_TUU, dtrmv_thread_TUN, dtrmv_thread_TLU, dtrmv_thread_TLN, #else strmv_thread_NUU, strmv_thread_NUN, strmv_thread_NLU, strmv_thread_NLN, strmv_thread_TUU, strmv_thread_TUN, strmv_thread_TLU, strmv_thread_TLN, #endif }; #endif #ifndef CBLAS void NAME(char *UPLO, char *TRANS, char *DIAG, blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; blasint n = *N; blasint lda = *LDA; blasint incx = *INCX; blasint info; int uplo; int unit; int trans; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); TOUPPER(trans_arg); TOUPPER(diag_arg); trans = -1; unit = -1; uplo = -1; if (trans_arg == 'N') trans = 0; if (trans_arg == 'T') trans = 1; if (trans_arg == 'R') trans = 0; if (trans_arg == 'C') trans = 1; if 
(diag_arg == 'U') unit = 0; if (diag_arg == 'N') unit = 1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incx == 0) info = 8; if (lda < MAX(1, n)) info = 6; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { int trans, uplo, unit; blasint info; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; unit = -1; uplo = -1; trans = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 0; if (TransA == CblasConjTrans) trans = 1; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 8; if (lda < MAX(1, n)) info = 6; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 1; if (TransA == CblasConjTrans) trans = 0; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 8; if (lda < MAX(1, n)) info = 6; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (trmv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); #ifdef SMP } else { (trmv_thread[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/trsm.c000066400000000000000000000251431313527062700162420ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef TRMM #ifndef COMPLEX #ifdef XDOUBLE #define ERROR_NAME "QTRSM " #elif defined(DOUBLE) #define ERROR_NAME "DTRSM " #else #define ERROR_NAME "STRSM " #endif #else #ifdef XDOUBLE #define ERROR_NAME "XTRSM " #elif defined(DOUBLE) #define ERROR_NAME "ZTRSM " #else #define ERROR_NAME "CTRSM " #endif #endif #else #ifndef COMPLEX #ifdef XDOUBLE #define ERROR_NAME "QTRMM " #elif defined(DOUBLE) #define ERROR_NAME "DTRMM " #else #define ERROR_NAME "STRMM " #endif #else #ifdef XDOUBLE #define ERROR_NAME "XTRMM " #elif defined(DOUBLE) #define ERROR_NAME "ZTRMM " #else #define ERROR_NAME "CTRMM " #endif #endif #endif static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { #ifndef TRMM TRSM_LNUU, TRSM_LNUN, TRSM_LNLU, TRSM_LNLN, TRSM_LTUU, TRSM_LTUN, TRSM_LTLU, TRSM_LTLN, TRSM_LRUU, TRSM_LRUN, TRSM_LRLU, TRSM_LRLN, TRSM_LCUU, TRSM_LCUN, TRSM_LCLU, TRSM_LCLN, TRSM_RNUU, TRSM_RNUN, TRSM_RNLU, TRSM_RNLN, TRSM_RTUU, TRSM_RTUN, TRSM_RTLU, TRSM_RTLN, TRSM_RRUU, TRSM_RRUN, TRSM_RRLU, TRSM_RRLN, TRSM_RCUU, TRSM_RCUN, TRSM_RCLU, TRSM_RCLN, #else TRMM_LNUU, TRMM_LNUN, TRMM_LNLU, TRMM_LNLN, TRMM_LTUU, TRMM_LTUN, TRMM_LTLU, TRMM_LTLN, TRMM_LRUU, TRMM_LRUN, TRMM_LRLU, TRMM_LRLN, TRMM_LCUU, TRMM_LCUN, TRMM_LCLU, TRMM_LCLN, TRMM_RNUU, TRMM_RNUN, TRMM_RNLU, TRMM_RNLN, TRMM_RTUU, TRMM_RTUN, TRMM_RTLU, TRMM_RTLN, TRMM_RRUU, TRMM_RRUN, TRMM_RRLU, TRMM_RRLN, TRMM_RCUU, TRMM_RCUN, TRMM_RCLU, TRMM_RCLN, #endif }; #ifndef CBLAS void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG, blasint *M, blasint *N, FLOAT *alpha, FLOAT *a, blasint *ldA, FLOAT *b, blasint *ldB){ char side_arg = *SIDE; char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; blas_arg_t args; FLOAT *buffer; FLOAT *sa, *sb; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif blasint info; int side; int uplo; int unit; int trans; int nrowa; PRINT_DEBUG_NAME; args.m = *M; args.n = *N; args.a = (void *)a; args.b = (void *)b; args.lda = *ldA; args.ldb = *ldB; args.beta = (void *)alpha; TOUPPER(side_arg); TOUPPER(uplo_arg); TOUPPER(trans_arg); TOUPPER(diag_arg); side = -1; trans = -1; unit = -1; uplo = -1; if (side_arg == 'L') side = 0; if (side_arg == 'R') side = 1; if (trans_arg == 'N') trans = 0; 
if (trans_arg == 'T') trans = 1; if (trans_arg == 'R') trans = 2; if (trans_arg == 'C') trans = 3; if (diag_arg == 'U') unit = 0; if (diag_arg == 'N') unit = 1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; nrowa = args.m; if (side & 1) nrowa = args.n; info = 0; if (args.ldb < MAX(1,args.m)) info = 11; if (args.lda < MAX(1,nrowa)) info = 9; if (args.n < 0) info = 6; if (args.m < 0) info = 5; if (unit < 0) info = 4; if (trans < 0) info = 3; if (uplo < 0) info = 2; if (side < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, enum CBLAS_DIAG Diag, blasint m, blasint n, #ifndef COMPLEX FLOAT alpha, #else FLOAT *alpha, #endif FLOAT *a, blasint lda, FLOAT *b, blasint ldb) { blas_arg_t args; int side, uplo, trans, unit; blasint info, nrowa; XFLOAT *buffer; XFLOAT *sa, *sb; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif PRINT_DEBUG_CNAME; args.a = (void *)a; args.b = (void *)b; args.lda = lda; args.ldb = ldb; #ifndef COMPLEX args.beta = (void *)α #else args.beta = (void *)alpha; #endif side = -1; uplo = -1; trans = -1; unit = -1; info = 0; if (order == CblasColMajor) { args.m = m; args.n = n; if (Side == CblasLeft) side = 0; if (Side == CblasRight) side = 1; if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; if (Trans == CblasNoTrans) trans = 0; if (Trans == CblasTrans) trans = 1; #ifndef COMPLEX if (Trans == CblasConjNoTrans) trans = 0; if (Trans == CblasConjTrans) trans = 1; #else if (Trans == CblasConjNoTrans) trans = 2; if (Trans == CblasConjTrans) trans = 3; #endif if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; nrowa = args.m; if (side & 1) nrowa = args.n; if (args.ldb < MAX(1,args.m)) info = 11; if (args.lda < MAX(1,nrowa)) info = 9; if (args.n < 0) info = 6; if (args.m < 0) info = 5; if (unit < 0) info = 4; if (trans < 0) info = 3; if (uplo < 0) info = 2; if (side < 0) info = 1; } if (order == CblasRowMajor) { args.m = n; args.n = m; if (Side == CblasLeft) side = 1; if (Side == CblasRight) side = 0; if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; if (Trans == CblasNoTrans) trans = 0; if (Trans == CblasTrans) trans = 1; #ifndef COMPLEX if (Trans == CblasConjNoTrans) trans = 0; if (Trans == CblasConjTrans) trans = 1; #else if (Trans == CblasConjNoTrans) trans = 2; if (Trans == CblasConjTrans) trans = 3; #endif if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; nrowa = args.m; if (side & 1) nrowa = args.n; if (args.ldb < MAX(1,args.m)) info = 11; if (args.lda < MAX(1,nrowa)) info = 9; if (args.n < 0) info = 6; if (args.m < 0) info = 5; if (unit < 0) info = 4; if (trans < 0) info = 3; if (uplo < 0) info = 2; if (side < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if ((args.m == 0) || (args.n == 0)) return; IDEBUG_START; FUNCTION_PROFILE_START(); buffer = (FLOAT *)blas_memory_alloc(0); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + 
GEMM_OFFSET_B); #ifdef SMP mode |= (trans << BLAS_TRANSA_SHIFT); mode |= (side << BLAS_RSIDE_SHIFT); args.nthreads = num_cpu_avail(3); if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; else if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; if (args.nthreads == 1) { #endif (trsm[(side<<4) | (trans<<2) | (uplo<<1) | unit])(&args, NULL, NULL, sa, sb, 0); #ifdef SMP } else { if (!side) { gemm_thread_n(mode, &args, NULL, NULL, trsm[(side<<4) | (trans<<2) | (uplo<<1) | unit], sa, sb, args.nthreads); } else { gemm_thread_m(mode, &args, NULL, NULL, trsm[(side<<4) | (trans<<2) | (uplo<<1) | unit], sa, sb, args.nthreads); } } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, (!side) ? args.m * (args.m + args.n) : args.n * (args.m + args.n), (!side) ? args.m * args.m * args.n : args.m * args.n * args.n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/trsv.c000066400000000000000000000145531313527062700162560ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QTRSV " #elif defined(DOUBLE) #define ERROR_NAME "DTRSV " #else #define ERROR_NAME "STRSV " #endif static int (*trsv[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { #ifdef XDOUBLE qtrsv_NUU, qtrsv_NUN, qtrsv_NLU, qtrsv_NLN, qtrsv_TUU, qtrsv_TUN, qtrsv_TLU, qtrsv_TLN, #elif defined(DOUBLE) dtrsv_NUU, dtrsv_NUN, dtrsv_NLU, dtrsv_NLN, dtrsv_TUU, dtrsv_TUN, dtrsv_TLU, dtrsv_TLN, #else strsv_NUU, strsv_NUN, strsv_NLU, strsv_NLN, strsv_TUU, strsv_TUN, strsv_TLU, strsv_TLN, #endif }; #ifndef CBLAS void NAME(char *UPLO, char *TRANS, char *DIAG, blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; blasint n = *N; blasint lda = *LDA; blasint incx = *INCX; blasint info; int uplo; int unit; int trans; FLOAT *buffer; PRINT_DEBUG_NAME; TOUPPER(uplo_arg); TOUPPER(trans_arg); TOUPPER(diag_arg); trans = -1; unit = -1; uplo = -1; if (trans_arg == 'N') trans = 0; if (trans_arg == 'T') trans = 1; if (trans_arg == 'R') trans = 0; if (trans_arg == 'C') trans = 1; if (diag_arg == 'U') unit = 0; if (diag_arg == 'N') unit = 1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incx == 0) info = 8; if (lda < MAX(1, n)) info = 6; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { int trans, uplo, unit; blasint info; FLOAT *buffer; PRINT_DEBUG_CNAME; unit = -1; uplo = -1; trans = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 0; if (TransA == CblasConjTrans) trans = 1; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 8; if (lda < MAX(1, n)) info = 6; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 1; if (TransA == CblasConjTrans) trans = 0; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 8; if (lda < MAX(1, n)) info = 6; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; buffer = (FLOAT *)blas_memory_alloc(1); (trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); blas_memory_free(buffer); FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/xerbla.c000066400000000000000000000005641313527062700165320ustar00rootroot00000000000000#ifdef CBLAS #include #include #include #include #include "common.h" void CNAME(blasint p, char *rout, char *form, ...) 
{ va_list args; va_start(args, form); if (p) fprintf(stderr, "Parameter %d to routine %s was incorrect\n", p, rout); vfprintf(stderr, form, args); va_end(args); exit(-1); } #endif OpenBLAS-0.2.20/interface/zaxpby.c000066400000000000000000000050361313527062700165710ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /********************************************************************** 2014/06/07 Saar **********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef CBLAS void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY) { blasint n = *N; blasint incx = *INCX; blasint incy = *INCY; #else void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *BETA, FLOAT *y, blasint incy) { #endif FLOAT alpha_r = *(ALPHA + 0); FLOAT alpha_i = *(ALPHA + 1); FLOAT beta_r = *(BETA + 0); FLOAT beta_i = *(BETA + 1); if (n <= 0) return; FUNCTION_PROFILE_START(); if (incx < 0) x -= (n - 1) * incx * 2; if (incy < 0) y -= (n - 1) * incy * 2; AXPBY_K (n, alpha_r, alpha_i, x, incx, beta_r, beta_i, y, incy); FUNCTION_PROFILE_END(4, 2 * n, 2 * n); return; } OpenBLAS-0.2.20/interface/zaxpy.c000066400000000000000000000102331313527062700164220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef CBLAS void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ blasint n = *N; blasint incx = *INCX; blasint incy = *INCY; #else void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ #endif FLOAT alpha_r = *(ALPHA + 0); FLOAT alpha_i = *(ALPHA + 1); #ifdef SMP int mode, nthreads; #endif #ifndef CBLAS PRINT_DEBUG_CNAME; #else PRINT_DEBUG_CNAME; #endif if (n <= 0) return; if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (n - 1) * incx * 2; if (incy < 0) y -= (n - 1) * incy * 2; #ifdef SMP nthreads = num_cpu_avail(1); //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. if (incx == 0 || incy == 0) nthreads = 1; if (nthreads == 1) { #endif #ifndef CONJ AXPYU_K (n, 0, 0, alpha_r, alpha_i, x, incx, y, incy, NULL, 0); #else AXPYC_K(n, 0, 0, alpha_r, alpha_i, x, incx, y, incy, NULL, 0); #endif #ifdef SMP } else { #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif blas_level1_thread(mode, n, 0, 0, ALPHA, x, incx, y, incy, NULL, 0, #ifndef CONJ (void *)AXPYU_K, #else (void *)AXPYC_K, #endif nthreads); } #endif FUNCTION_PROFILE_END(4, 2 * n, 2 * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/zdot.c000066400000000000000000000131271313527062700162340ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef RETURN_BY_STRUCT #ifdef XDOUBLE #define MYTYPE myxcomplex_t #elif defined DOUBLE #define MYTYPE myzcomplex_t #else #define MYTYPE myccomplex_t #endif #endif #ifndef CBLAS #ifdef RETURN_BY_STRUCT MYTYPE NAME( blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY) { #elif defined RETURN_BY_STACK void NAME(OPENBLAS_COMPLEX_FLOAT *result, blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY) { #else OPENBLAS_COMPLEX_FLOAT NAME( blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY) { #endif BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; #ifndef RETURN_BY_STACK OPENBLAS_COMPLEX_FLOAT ret; #endif #ifdef RETURN_BY_STRUCT MYTYPE myret; #endif #ifndef RETURN_BY_STRUCT OPENBLAS_COMPLEX_FLOAT zero=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0); #endif PRINT_DEBUG_NAME; if (n <= 0) { #ifdef RETURN_BY_STRUCT myret.r = 0.; myret.i = 0.; return myret; #elif defined RETURN_BY_STACK *result = zero; return; #else return zero; #endif } IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (n - 1) * incx * 2; if (incy < 0) y -= (n - 1) * incy * 2; #ifdef RETURN_BY_STRUCT #ifndef CONJ ret = DOTU_K(n, x, incx, y, incy); #else ret = DOTC_K(n, x, incx, y, incy); #endif myret.r = CREAL ret; myret.i = CIMAG ret; FUNCTION_PROFILE_END(4, 2 * n, 2 * n); IDEBUG_END; return myret; #elif defined RETURN_BY_STACK #ifndef CONJ *result = DOTU_K(n, x, incx, y, incy); #else *result = DOTC_K(n, x, incx, y, incy); #endif FUNCTION_PROFILE_END(4, 2 * n, 2 * n); IDEBUG_END; #else #ifndef CONJ ret = DOTU_K(n, x, incx, y, incy); #else ret = DOTC_K(n, x, incx, y, incy); #endif FUNCTION_PROFILE_END(4, 2 * n, 2 * n); IDEBUG_END; return ret; #endif } #else #ifdef FORCE_USE_STACK void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy, OPENBLAS_COMPLEX_FLOAT *result){ #else OPENBLAS_COMPLEX_FLOAT CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ OPENBLAS_COMPLEX_FLOAT ret; OPENBLAS_COMPLEX_FLOAT zero=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0); #endif PRINT_DEBUG_CNAME; if (n <= 0) { #ifdef FORCE_USE_STACK 
OPENBLAS_COMPLEX_FLOAT zero=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0); *result = zero; // CREAL(*result) = 0.0; // CIMAG(*result) = 0.0; return; #else return zero; #endif } if (incx < 0) x -= (n - 1) * incx * 2; if (incy < 0) y -= (n - 1) * incy * 2; IDEBUG_START; FUNCTION_PROFILE_START(); #ifdef FORCE_USE_STACK #ifndef CONJ *result = DOTU_K(n, x, incx, y, incy); #else *result = DOTC_K(n, x, incx, y, incy); #endif FUNCTION_PROFILE_END(4, 2 * n, 2 * n); IDEBUG_END; #else #ifndef CONJ ret = DOTU_K(n, x, incx, y, incy); #else ret = DOTC_K(n, x, incx, y, incy); #endif FUNCTION_PROFILE_END(4, 2 * n, 2 * n); IDEBUG_END; return ret; #endif } #endif OpenBLAS-0.2.20/interface/zgbmv.c000066400000000000000000000170471313527062700164060ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XGBMV " #elif defined(DOUBLE) #define ERROR_NAME "ZGBMV " #else #define ERROR_NAME "CGBMV " #endif static void (*gbmv[])(BLASLONG, BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { #ifdef XDOUBLE xgbmv_n, xgbmv_t, xgbmv_r, xgbmv_c, xgbmv_o, xgbmv_u, xgbmv_s, xgbmv_d, #elif defined(DOUBLE) zgbmv_n, zgbmv_t, zgbmv_r, zgbmv_c, zgbmv_o, zgbmv_u, zgbmv_s, zgbmv_d, #else cgbmv_n, cgbmv_t, cgbmv_r, cgbmv_c, cgbmv_o, cgbmv_u, cgbmv_s, cgbmv_d, #endif }; #ifdef SMP static int (*gbmv_thread[])(BLASLONG, BLASLONG, BLASLONG, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE xgbmv_thread_n, xgbmv_thread_t, xgbmv_thread_r, xgbmv_thread_c, xgbmv_thread_o, xgbmv_thread_u, xgbmv_thread_s, xgbmv_thread_d, #elif defined(DOUBLE) zgbmv_thread_n, zgbmv_thread_t, zgbmv_thread_r, zgbmv_thread_c, zgbmv_thread_o, zgbmv_thread_u, zgbmv_thread_s, zgbmv_thread_d, #else cgbmv_thread_n, cgbmv_thread_t, cgbmv_thread_r, cgbmv_thread_c, cgbmv_thread_o, cgbmv_thread_u, cgbmv_thread_s, cgbmv_thread_d, #endif }; #endif #ifndef CBLAS void NAME(char *TRANS, blasint *M, blasint *N, blasint *KU, blasint *KL, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char trans = *TRANS; blasint m = *M; blasint n = *N; blasint ku = *KU; blasint kl = *KL; blasint lda = *LDA; blasint incx = *INCX; blasint incy = *INCY; FLOAT *buffer; #ifdef SMP int nthreads; #endif FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; FLOAT beta_r = BETA[0]; FLOAT beta_i = BETA[1]; blasint info; blasint lenx, leny; blasint i; PRINT_DEBUG_NAME; TOUPPER(trans); info = 0; i = -1; if (trans == 'N') i = 0; if (trans == 'T') i = 1; if (trans == 'R') i = 2; if (trans == 'C') i = 3; if (trans == 'O') i = 4; if (trans == 'U') i = 5; if (trans == 'S') i = 6; if (trans == 'D') i = 7; if (incy == 0) info = 13; if (incx == 0) info = 10; if (lda < kl + ku + 1) info = 8; if (kl < 0) info = 5; if (ku < 0) info = 4; if (n < 0) info = 3; if (m < 0) info = 2; if (i < 0) info = 1; trans = i; if (info != 0){ BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasint n, blasint ku, blasint kl, FLOAT *ALPHA, FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT *BETA, FLOAT *y, blasint incy){ FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; FLOAT beta_r = BETA[0]; FLOAT beta_i = BETA[1]; FLOAT *buffer; blasint lenx, leny; int trans; blasint info, t; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; trans = -1; info = 0; if (order == CblasColMajor) { if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 2; if (TransA == CblasConjTrans) trans = 3; info = -1; if (incy == 0) info = 13; if (incx == 0) info = 10; if (lda < kl + ku + 1) info = 8; if (kl < 0) info = 5; if (ku < 0) info = 4; if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (order == CblasRowMajor) { if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 3; if (TransA == CblasConjTrans) trans = 2; info = -1; t = n; n = m; m = t; t = ku; ku = kl; kl = t; if (incy == 0) info = 13; if (incx == 0) info = 10; if (lda < kl + ku + 
1) info = 8; if (kl < 0) info = 5; if (ku < 0) info = 4; if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if ((m==0) || (n==0)) return; lenx = n; leny = m; if (trans & 1) lenx = m; if (trans & 1) leny = n; if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (lenx - 1) * incx * 2; if (incy < 0) y -= (leny - 1) * incy * 2; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (gbmv[(int)trans])(m, n, kl, ku, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); #ifdef SMP } else { (gbmv_thread[(int)trans])(m, n, kl, ku, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, m * n / 2 + n, m * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/zgeadd.c000066400000000000000000000105331313527062700165100ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #if defined(DOUBLE) #define ERROR_NAME "ZGEADD " #else #define ERROR_NAME "CGEADD " #endif #ifndef CBLAS void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *BETA, FLOAT *c, blasint *LDC) { blasint m = *M; blasint n = *N; blasint lda = *LDA; blasint ldc = *LDC; blasint info; PRINT_DEBUG_NAME; info = 0; if (lda < MAX(1, m)) info = 6; if (ldc < MAX(1, m)) info = 8; if (n < 0) info = 2; if (m < 0) info = 1; if (info != 0){ BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME( enum CBLAS_ORDER order, blasint m, blasint n, FLOAT *ALPHA, FLOAT *a, blasint lda, FLOAT *BETA, FLOAT *c, blasint ldc) { /* void CNAME(enum CBLAS_ORDER order, blasint m, blasint n, FLOAT alpha, FLOAT *a, blasint lda, FLOAT beta, FLOAT *c, blasint ldc){ */ blasint info, t; PRINT_DEBUG_CNAME; info = 0; if (order == CblasColMajor) { info = -1; if (ldc < MAX(1, m)) info = 8; if (lda < MAX(1, m)) info = 5; if (n < 0) info = 2; if (m < 0) info = 1; } if (order == CblasRowMajor) { info = -1; t = n; n = m; m = t; if (ldc < MAX(1, m)) info = 8; if (lda < MAX(1, m)) info = 5; if (n < 0) info = 2; if (m < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if ((m==0) || (n==0)) return; IDEBUG_START; FUNCTION_PROFILE_START(); GEADD_K(m,n,ALPHA[0],ALPHA[1], a, lda, BETA[0], BETA[1], c, ldc); FUNCTION_PROFILE_END(1, 2* m * n , 2 * m * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/zgemv.c000066400000000000000000000175141313527062700164100ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XGEMV " #elif defined(DOUBLE) #define ERROR_NAME "ZGEMV " #else #define ERROR_NAME "CGEMV " #endif #ifdef SMP static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE xgemv_thread_n, xgemv_thread_t, xgemv_thread_r, xgemv_thread_c, xgemv_thread_o, xgemv_thread_u, xgemv_thread_s, xgemv_thread_d, #elif defined DOUBLE zgemv_thread_n, zgemv_thread_t, zgemv_thread_r, zgemv_thread_c, zgemv_thread_o, zgemv_thread_u, zgemv_thread_s, zgemv_thread_d, #else cgemv_thread_n, cgemv_thread_t, cgemv_thread_r, cgemv_thread_c, cgemv_thread_o, cgemv_thread_u, cgemv_thread_s, cgemv_thread_d, #endif }; #endif #ifndef CBLAS void NAME(char *TRANS, blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char trans = *TRANS; blasint m = *M; blasint n = *N; blasint lda = *LDA; blasint incx = *INCX; blasint incy = *INCY; FLOAT *buffer; int buffer_size; #ifdef SMP int nthreads; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { GEMV_N, GEMV_T, GEMV_R, GEMV_C, GEMV_O, GEMV_U, GEMV_S, GEMV_D, }; blasint info; blasint lenx, leny; blasint i; FLOAT alpha_r = *(ALPHA + 0); FLOAT alpha_i = *(ALPHA + 1); FLOAT beta_r = *(BETA + 0); FLOAT beta_i = *(BETA + 1); PRINT_DEBUG_NAME; TOUPPER(trans); info = 0; i = -1; if (trans == 'N') i = 0; if (trans == 'T') i = 1; if (trans == 'R') i = 2; if (trans == 'C') i = 3; if (trans == 'O') i = 4; if (trans == 'U') i = 5; if (trans == 'S') i = 6; if (trans == 'D') i = 7; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < MAX(1,m)) info = 6; if (n < 0) info = 3; if (m < 0) info = 2; if (i < 0) info = 1; trans = i; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasint n, FLOAT *ALPHA, FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT *BETA, FLOAT *y, blasint incy){ FLOAT *buffer; blasint lenx, leny; int trans, buffer_size; blasint info, t; #ifdef SMP int nthreads; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { GEMV_N, GEMV_T, GEMV_R, GEMV_C, GEMV_O, GEMV_U, GEMV_S, GEMV_D, }; FLOAT alpha_r = *(ALPHA + 0); FLOAT alpha_i = *(ALPHA + 1); FLOAT beta_r = *(BETA + 0); FLOAT beta_i = *(BETA + 1); PRINT_DEBUG_CNAME; trans = -1; info = 0; if (order == CblasColMajor) { if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 2; if (TransA == CblasConjTrans) trans = 3; info = -1; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < MAX(1, m)) info = 6; if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (order == CblasRowMajor) { if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 3; if (TransA == CblasConjTrans) trans = 2; info = -1; t = n; n = m; m = t; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < MAX(1, m)) info = 6; if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif /* Quick return 
if possible. */ if (m == 0 || n == 0) return; lenx = n; leny = m; if (trans & 1) lenx = m; if (trans & 1) leny = n; if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (lenx - 1) * incx * 2; if (incy < 0) y -= (leny - 1) * incy * 2; buffer_size = 2 * (m + n) + 128 / sizeof(FLOAT); #ifdef WINDOWS_ABI buffer_size += 160 / sizeof(FLOAT) ; #endif // for alignment buffer_size = (buffer_size + 3) & ~3; STACK_ALLOC(buffer_size, FLOAT, buffer); #if defined(ARCH_X86_64) && defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0 // cgemv_t.S return NaN if there are NaN or Inf in the buffer (see bug #746) if(trans && stack_alloc_size) memset(buffer, 0, MIN(BUFFER_SIZE, sizeof(FLOAT) * buffer_size)); #endif #ifdef SMP if ( 1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD ) nthreads = 1; else nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (gemv[(int)trans])(m, n, 0, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); #ifdef SMP } else { (gemv_thread[(int)trans])(m, n, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); } #endif STACK_FREE(buffer); FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/zger.c000066400000000000000000000152001313527062700162150ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
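The zgemv.c interface that concludes above maps the TRANS character onto eight kernel variants (n/t/r/c/o/u/s/d) and stack-allocates a work buffer before dispatching. A minimal caller through the CBLAS entry point might look like the following sketch; it assumes the standard cblas_zgemv prototype from cblas.h, linking against libopenblas, and purely illustrative 2x2 data.

/* y := alpha*A*x + beta*y for a 2x2 double-complex matrix, column-major. */
/* Complex values are passed as interleaved (re,im) pairs of doubles.     */
#include <cblas.h>
#include <stdio.h>

int main(void) {
    double A[]     = {1,0, 0,1,    /* column 0: 1, i */
                      2,0, 0,0};   /* column 1: 2, 0 */
    double x[]     = {1,0, 1,0};   /* x = (1, 1)     */
    double y[]     = {0,0, 0,0};
    double alpha[] = {1,0}, beta[] = {0,0};

    cblas_zgemv(CblasColMajor, CblasNoTrans, 2, 2,
                alpha, A, 2, x, 1, beta, y, 1);

    printf("y = (%g%+gi, %g%+gi)\n", y[0], y[1], y[2], y[3]);  /* expect (3, i) */
    return 0;
}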
*/ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef SMP #ifdef __64BIT__ #define SMPTEST 1 #endif #endif #ifdef XDOUBLE #ifndef CONJ #define ERROR_NAME "XGERU " #else #define ERROR_NAME "XGERC " #endif #elif defined DOUBLE #ifndef CONJ #define ERROR_NAME "ZGERU " #else #define ERROR_NAME "ZGERC " #endif #else #ifndef CONJ #define ERROR_NAME "CGERU " #else #define ERROR_NAME "CGERC " #endif #endif #if defined XDOUBLE #ifndef CONJ #define GER GERU_K #define GER_THREAD xger_thread_U #else #define GER GERC_K #define GER_THREAD xger_thread_C #define GERV GERV_K #define GERV_THREAD xger_thread_V #endif #elif defined DOUBLE #ifndef CONJ #define GER GERU_K #define GER_THREAD zger_thread_U #else #define GER GERC_K #define GER_THREAD zger_thread_C #define GERV GERV_K #define GERV_THREAD zger_thread_V #endif #else #ifndef CONJ #define GER GERU_K #define GER_THREAD cger_thread_U #else #define GER GERC_K #define GER_THREAD cger_thread_C #define GERV GERV_K #define GERV_THREAD cger_thread_V #endif #endif #ifndef CBLAS void NAME(blasint *M, blasint *N, FLOAT *Alpha, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a, blasint *LDA){ blasint m = *M; blasint n = *N; FLOAT alpha_r = Alpha[0]; FLOAT alpha_i = Alpha[1]; blasint incx = *INCX; blasint incy = *INCY; blasint lda = *LDA; FLOAT *buffer; #ifdef SMPTEST int nthreads; #endif blasint info; PRINT_DEBUG_NAME; info = 0; if (lda < MAX(1,m)) info = 9; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (m < 0) info = 1; if (info){ BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, blasint m, blasint n, FLOAT *Alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) { FLOAT alpha_r = Alpha[0]; FLOAT alpha_i = Alpha[1]; FLOAT *buffer; blasint info, t; #ifdef SMPTEST int nthreads; #endif PRINT_DEBUG_CNAME; info = 0; if (order == CblasColMajor) { info = -1; if (lda < MAX(1,m)) info = 9; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (m < 0) info = 1; } if (order == CblasRowMajor) { info = -1; t = n; n = m; m = t; t = incx; incx = incy; incy = t; buffer = x; x = y; y = buffer; if (lda < MAX(1,m)) info = 9; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (m < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif /* Quick return if possible. */ if (m == 0 || n == 0) return; if ((alpha_r == 0.) 
&& (alpha_i == 0.)) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incy < 0) y -= (n - 1) * incy * 2; if (incx < 0) x -= (m - 1) * incx * 2; STACK_ALLOC(2 * m, FLOAT, buffer); #ifdef SMPTEST // Threshold chosen so that speed-up is > 1 on a Xeon E5-2630 if(1L * m * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) nthreads = num_cpu_avail(2); else nthreads = 1; if (nthreads == 1) { #endif #if !defined(CBLAS) || !defined(CONJ) GER(m, n, 0, alpha_r, alpha_i, x, incx, y, incy, a, lda, buffer); #else if (order == CblasColMajor) { GER(m, n, 0, alpha_r, alpha_i, x, incx, y, incy, a, lda, buffer); } else { GERV(m, n, 0, alpha_r, alpha_i, x, incx, y, incy, a, lda, buffer); } #endif #ifdef SMPTEST } else { #if !defined(CBLAS) || !defined(CONJ) GER_THREAD(m, n, Alpha, x, incx, y, incy, a, lda, buffer, nthreads); #else if (order == CblasColMajor) { GER_THREAD(m, n, Alpha, x, incx, y, incy, a, lda, buffer, nthreads); } else { GERV_THREAD(m, n, Alpha, x, incx, y, incy, a, lda, buffer, nthreads); } #endif } #endif STACK_FREE(buffer); FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/zhbmv.c000066400000000000000000000151111313527062700163750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
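zger.c, whose code ends above, is compiled twice: once without CONJ as ZGERU/CGERU and once with CONJ as ZGERC/CGERC; the row-major CBLAS path additionally swaps x and y and routes the conjugated case through the GERV kernel. A minimal sketch of the conjugated rank-1 update via CBLAS follows (the standard cblas_zgerc prototype is assumed; the data is illustrative only).

/* A := A + alpha * x * conj(y)^T for a 2x2 double-complex matrix. */
#include <cblas.h>
#include <stdio.h>

int main(void) {
    double A[]     = {0,0, 0,0, 0,0, 0,0};  /* 2x2, column-major, zeroed */
    double x[]     = {1,0, 0,1};            /* x = (1, i)                */
    double y[]     = {0,1, 1,0};            /* y = (i, 1)                */
    double alpha[] = {1,0};

    cblas_zgerc(CblasColMajor, 2, 2, alpha, x, 1, y, 1, A, 2);

    /* First column becomes x * conj(y_0) = (-i, 1). */
    printf("A(:,0) = (%g%+gi, %g%+gi)\n", A[0], A[1], A[2], A[3]);
    return 0;
}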
*/ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XHBMV " #elif defined(DOUBLE) #define ERROR_NAME "ZHBMV " #else #define ERROR_NAME "CHBMV " #endif static int (*hbmv[])(BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { #ifdef XDOUBLE xhbmv_U, xhbmv_L, xhbmv_V, xhbmv_M, #elif defined(DOUBLE) zhbmv_U, zhbmv_L, zhbmv_V, zhbmv_M, #else chbmv_U, chbmv_L, chbmv_V, chbmv_M, #endif }; #ifdef SMPBUG static int (*hbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE xhbmv_thread_U, xhbmv_thread_L, xhbmv_thread_V, xhbmv_thread_M, #elif defined(DOUBLE) zhbmv_thread_U, zhbmv_thread_L, zhbmv_thread_V, zhbmv_thread_M, #else chbmv_thread_U, chbmv_thread_L, chbmv_thread_V, chbmv_thread_M, #endif }; #endif #ifndef CBLAS void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char uplo_arg = *UPLO; blasint n = *N; blasint k = *K; FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; blasint lda = *LDA; blasint incx = *INCX; FLOAT beta_r = BETA[0]; FLOAT beta_i = BETA[1]; blasint incy = *INCY; blasint info; int uplo; FLOAT *buffer; #ifdef SMPBUG int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; if (uplo_arg == 'V') uplo = 2; if (uplo_arg == 'M') uplo = 3; info = 0; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < k + 1) info = 6; if (k < 0) info = 3; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, blasint k, FLOAT *ALPHA, FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT *BETA, FLOAT *y, blasint incy){ FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; FLOAT beta_r = BETA[0]; FLOAT beta_i = BETA[1]; FLOAT *buffer; int uplo; blasint info; #ifdef SMPBUG int nthreads; #endif PRINT_DEBUG_CNAME; uplo = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; info = -1; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < k + 1) info = 6; if (k < 0) info = 3; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 3; if (Uplo == CblasLower) uplo = 2; info = -1; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < k + 1) info = 6; if (k < 0) info = 3; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx * COMPSIZE; if (incy < 0 ) y -= (n - 1) * incy * COMPSIZE; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMPBUG nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (hbmv[uplo])(n, k, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); #ifdef SMPBUG } else { (hbmv_thread[uplo])(n, k, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * k / 2 + n, n * k); IDEBUG_END; return; } 
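zhbmv.c above implements the Hermitian band matrix-vector product; note that its threaded dispatch is guarded by the SMPBUG macro, so the kernel normally runs single-threaded. A minimal diagonal-only call (bandwidth k = 0) through CBLAS is sketched below, assuming the usual cblas_zhbmv prototype from cblas.h.

/* y := alpha*A*x + beta*y with A Hermitian banded, bandwidth k = 0. */
#include <cblas.h>
#include <stdio.h>

int main(void) {
    /* With k = 0 and lda = 1 the band storage holds only the (real) */
    /* main diagonal: A = diag(2, 3).                                */
    double A[]     = {2,0, 3,0};
    double x[]     = {1,0, 0,1};   /* x = (1, i) */
    double y[]     = {0,0, 0,0};
    double alpha[] = {1,0}, beta[] = {0,0};

    cblas_zhbmv(CblasColMajor, CblasUpper, 2, 0,
                alpha, A, 1, x, 1, beta, y, 1);

    printf("y = (%g%+gi, %g%+gi)\n", y[0], y[1], y[2], y[3]);  /* expect (2, 3i) */
    return 0;
}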
OpenBLAS-0.2.20/interface/zhemv.c000066400000000000000000000145761313527062700164160ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XHEMV " #elif defined(DOUBLE) #define ERROR_NAME "ZHEMV " #else #define ERROR_NAME "CHEMV " #endif #ifndef CBLAS void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; blasint lda = *LDA; blasint incx = *INCX; FLOAT beta_r = BETA[0]; FLOAT beta_i = BETA[1]; blasint incy = *INCY; #ifdef SMP int nthreads; #endif int (*hemv[])(BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { HEMV_U, HEMV_L, HEMV_V, HEMV_M, }; #ifdef SMP int (*hemv_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { HEMV_THREAD_U, HEMV_THREAD_L, HEMV_THREAD_V, HEMV_THREAD_M, }; #endif blasint info; int uplo; FLOAT *buffer; PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; if (uplo_arg == 'V') uplo = 2; if (uplo_arg == 'M') uplo = 3; info = 0; if (incy == 0) info = 10; if (incx == 0) info = 7; if (lda < MAX(1, n)) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA, FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT *BETA, FLOAT *y, blasint incy) { FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; FLOAT beta_r = BETA[0]; FLOAT beta_i = BETA[1]; FLOAT *buffer; int uplo; blasint info; #ifdef SMP int nthreads; #endif int (*hemv[])(BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { HEMV_U, HEMV_L, HEMV_V, HEMV_M, }; #ifdef SMP int (*hemv_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { HEMV_THREAD_U, HEMV_THREAD_L, HEMV_THREAD_V, HEMV_THREAD_M, }; #endif PRINT_DEBUG_CNAME; uplo = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; info = -1; if (incy == 0) info = 10; if (incx == 0) info = 7; if (lda < MAX(1, n)) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 3; if (Uplo == CblasLower) uplo = 2; info = -1; if (incy == 0) info = 10; if (incx == 0) info = 7; if (lda < MAX(1, n)) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx * 2; if (incy < 0 ) y -= (n - 1) * incy * 2; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (hemv[uplo])(n, n, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); #ifdef SMP } else { (hemv_thread[uplo])(n, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * n / 2 + n, 2 * n * n); IDEBUG_END; return; } 
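zhemv.c above selects one of the HEMV_U/L/V/M kernels from the Uplo argument and the storage order. A minimal column-major, upper-triangle call via cblas_zhemv is sketched below (prototype assumed from cblas.h; values illustrative).

/* y := alpha*A*x + beta*y with A Hermitian; only the upper triangle */
/* is referenced, the strictly lower entries are never read.         */
#include <cblas.h>
#include <stdio.h>

int main(void) {
    /* A = [ 2    1+i ]                                              */
    /*     [ 1-i  3   ]  (lower part implied by Hermitian symmetry)  */
    double A[]     = {2,0, 0,0,    /* column 0 */
                      1,1, 3,0};   /* column 1 */
    double x[]     = {1,0, 1,0};
    double y[]     = {0,0, 0,0};
    double alpha[] = {1,0}, beta[] = {0,0};

    cblas_zhemv(CblasColMajor, CblasUpper, 2, alpha, A, 2, x, 1, beta, y, 1);

    printf("y = (%g%+gi, %g%+gi)\n", y[0], y[1], y[2], y[3]);  /* expect (3+i, 4-i) */
    return 0;
}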
OpenBLAS-0.2.20/interface/zher.c000066400000000000000000000130721313527062700162230ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XHER " #elif defined(DOUBLE) #define ERROR_NAME "ZHER " #else #define ERROR_NAME "CHER " #endif static int (*her[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { #ifdef XDOUBLE xher_U, xher_L, xher_V, xher_M, #elif defined(DOUBLE) zher_U, zher_L, zher_V, zher_M, #else cher_U, cher_L, cher_V, cher_M, #endif }; #ifdef SMP static int (*her_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE xher_thread_U, xher_thread_L, xher_thread_V, xher_thread_M, #elif defined(DOUBLE) zher_thread_U, zher_thread_L, zher_thread_V, zher_thread_M, #else cher_thread_U, cher_thread_L, cher_thread_V, cher_thread_M, #endif }; #endif #ifndef CBLAS void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *a, blasint *LDA){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha = *ALPHA; blasint lda = *LDA; blasint incx = *INCX; blasint info; int uplo; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (lda < MAX(1, n)) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) { FLOAT *buffer; int uplo; blasint info; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; uplo = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; info = -1; if (lda < MAX(1, n)) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 3; if (Uplo == CblasLower) uplo = 2; info = -1; if (lda < MAX(1, n)) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; if (alpha == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx * 2; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (her[uplo])(n, alpha, x, incx, a, lda, buffer); #ifdef SMP } else { (her_thread[uplo])(n, alpha, x, incx, a, lda, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/zher2.c000066400000000000000000000140031313527062700163000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
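zher.c, which ends above, is the Hermitian rank-1 update; unlike most complex Level-2 routines its alpha is real, which is why the Fortran wrapper reads a single FLOAT instead of an (re,im) pair. A minimal sketch through cblas_zher (standard prototype assumed; values illustrative):

/* A := A + alpha * x * conj(x)^T, upper triangle updated. */
#include <cblas.h>
#include <stdio.h>

int main(void) {
    double A[] = {0,0, 0,0, 0,0, 0,0};  /* 2x2 column-major, zeroed */
    double x[] = {1,0, 0,1};            /* x = (1, i)               */

    cblas_zher(CblasColMajor, CblasUpper, 2, 1.0, x, 1, A, 2);

    /* Diagonal becomes |x_i|^2 = 1; A(0,1) = x_0 * conj(x_1) = -i. */
    printf("A(0,0)=%g  A(0,1)=%g%+gi  A(1,1)=%g\n", A[0], A[4], A[5], A[6]);
    return 0;
}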
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XHER2 " #elif defined(DOUBLE) #define ERROR_NAME "ZHER2 " #else #define ERROR_NAME "CHER2 " #endif static int (*her2[])(BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { #ifdef XDOUBLE xher2_U, xher2_L, xher2_V, xher2_M, #elif defined(DOUBLE) zher2_U, zher2_L, zher2_V, zher2_M, #else cher2_U, cher2_L, cher2_V, cher2_M, #endif }; #ifdef SMP static int (*her2_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE xher2_thread_U, xher2_thread_L, xher2_thread_V, xher2_thread_M, #elif defined(DOUBLE) zher2_thread_U, zher2_thread_L, zher2_thread_V, zher2_thread_M, #else cher2_thread_U, cher2_thread_L, cher2_thread_V, cher2_thread_M, #endif }; #endif #ifndef CBLAS void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a, blasint *LDA){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; blasint lda = *LDA; blasint incx = *INCX; blasint incy = *INCY; blasint info; int uplo; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (lda < MAX(1, n)) info = 9; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) { FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; FLOAT *buffer; int uplo; blasint info; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; uplo = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; info = -1; if (lda < MAX(1, n)) info = 9; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 3; if (Uplo == CblasLower) uplo = 2; info = -1; if (lda < MAX(1, n)) info = 9; if (incx == 0) info = 7; if (incy == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) 
info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx * 2; if (incy < 0 ) y -= (n - 1) * incy * 2; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (her2[uplo])(n, alpha_r, alpha_i, x, incx, y, incy, a, lda, buffer); #ifdef SMP } else { (her2_thread[uplo])(n, ALPHA, x, incx, y, incy, a, lda, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * n / 2 + 2 * n, 2 * n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/zhpmv.c000066400000000000000000000141521313527062700164170ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
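zher2.c, which ends above, performs the Hermitian rank-2 update A := A + alpha*x*y^H + conj(alpha)*y*x^H. A minimal sketch via cblas_zher2 follows (standard prototype assumed; the complex alpha is passed as an (re,im) pair and the values are illustrative).

/* Hermitian rank-2 update of a 2x2 matrix, upper triangle only. */
#include <cblas.h>
#include <stdio.h>

int main(void) {
    double A[]     = {0,0, 0,0, 0,0, 0,0};
    double x[]     = {1,0, 0,0};   /* x = (1, 0) */
    double y[]     = {0,0, 1,0};   /* y = (0, 1) */
    double alpha[] = {1,0};

    cblas_zher2(CblasColMajor, CblasUpper, 2, alpha, x, 1, y, 1, A, 2);

    /* Only A(0,1) changes: alpha * x_0 * conj(y_1) = 1. */
    printf("A(0,1) = %g%+gi\n", A[4], A[5]);
    return 0;
}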
*/ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XHPMV " #elif defined(DOUBLE) #define ERROR_NAME "ZHPMV " #else #define ERROR_NAME "CHPMV " #endif static int (*hpmv[])(BLASLONG, FLOAT, FLOAT, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { #ifdef XDOUBLE xhpmv_U, xhpmv_L, xhpmv_V, xhpmv_M, #elif defined(DOUBLE) zhpmv_U, zhpmv_L, zhpmv_V, zhpmv_M, #else chpmv_U, chpmv_L, chpmv_V, chpmv_M, #endif }; #ifdef SMP static int (*hpmv_thread[])(BLASLONG, FLOAT *, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE xhpmv_thread_U, xhpmv_thread_L, xhpmv_thread_V, xhpmv_thread_M, #elif defined(DOUBLE) zhpmv_thread_U, zhpmv_thread_L, zhpmv_thread_V, zhpmv_thread_M, #else chpmv_thread_U, chpmv_thread_L, chpmv_thread_V, chpmv_thread_M, #endif }; #endif #ifndef CBLAS void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; blasint incx = *INCX; FLOAT beta_r = BETA[0]; FLOAT beta_i = BETA[1]; blasint incy = *INCY; blasint info; int uplo; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incy == 0) info = 9; if (incx == 0) info = 6; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA, FLOAT *a, FLOAT *x, blasint incx, FLOAT *BETA, FLOAT *y, blasint incy){ FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; FLOAT beta_r = BETA[0]; FLOAT beta_i = BETA[1]; FLOAT *buffer; int uplo; blasint info; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; uplo = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; info = -1; if (incy == 0) info = 9; if (incx == 0) info = 6; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 3; if (Uplo == CblasLower) uplo = 2; info = -1; if (incy == 0) info = 9; if (incx == 0) info = 6; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx * 2; if (incy < 0 ) y -= (n - 1) * incy * 2; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (hpmv[uplo])(n, alpha_r, alpha_i, a, x, incx, y, incy, buffer); #ifdef SMP } else { (hpmv_thread[uplo])(n, ALPHA, a, x, incx, y, incy, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/zhpr.c000066400000000000000000000126511313527062700162400ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
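zhpmv.c above is the packed-storage Hermitian matrix-vector product, so the matrix argument carries no leading dimension; the n*(n+1)/2 stored elements are laid out column by column. A minimal sketch via cblas_zhpmv (standard prototype assumed; same test matrix as the zhemv example):

/* y := alpha*A*x + beta*y with A Hermitian in packed upper storage. */
#include <cblas.h>
#include <stdio.h>

int main(void) {
    /* Packed upper, column order: A(0,0), A(0,1), A(1,1).           */
    double Ap[]    = {2,0, 1,1, 3,0};   /* A = [2, 1+i; 1-i, 3] */
    double x[]     = {1,0, 1,0};
    double y[]     = {0,0, 0,0};
    double alpha[] = {1,0}, beta[] = {0,0};

    cblas_zhpmv(CblasColMajor, CblasUpper, 2, alpha, Ap, x, 1, beta, y, 1);

    printf("y = (%g%+gi, %g%+gi)\n", y[0], y[1], y[2], y[3]);  /* expect (3+i, 4-i) */
    return 0;
}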
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XHPR " #elif defined(DOUBLE) #define ERROR_NAME "ZHPR " #else #define ERROR_NAME "CHPR " #endif static int (*hpr[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, FLOAT *) = { #ifdef XDOUBLE xhpr_U, xhpr_L, xhpr_V, xhpr_M, #elif defined(DOUBLE) zhpr_U, zhpr_L, zhpr_V, zhpr_M, #else chpr_U, chpr_L, chpr_V, chpr_M, #endif }; #ifdef SMP static int (*hpr_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, FLOAT *, int) = { #ifdef XDOUBLE xhpr_thread_U, xhpr_thread_L, xhpr_thread_V, xhpr_thread_M, #elif defined(DOUBLE) zhpr_thread_U, zhpr_thread_L, zhpr_thread_V, zhpr_thread_M, #else chpr_thread_U, chpr_thread_L, chpr_thread_V, chpr_thread_M, #endif }; #endif #ifndef CBLAS void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *a){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha = *ALPHA; blasint incx = *INCX; blasint info; int uplo; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a) { FLOAT *buffer; int uplo; blasint info; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; uplo = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; info = -1; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 3; if (Uplo == 
CblasLower) uplo = 2; info = -1; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; if (alpha == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx * 2; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (hpr[uplo])(n, alpha, x, incx, a, buffer); #ifdef SMP } else { (hpr_thread[uplo])(n, alpha, x, incx, a, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/zhpr2.c000066400000000000000000000135701313527062700163230ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
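zhpr.c, which ends above, is the packed Hermitian rank-1 update with a real alpha. A minimal sketch via cblas_zhpr (standard prototype assumed; values illustrative):

/* Ap := Ap + alpha * x * conj(x)^T in packed upper storage. */
#include <cblas.h>
#include <stdio.h>

int main(void) {
    double Ap[] = {0,0, 0,0, 0,0};   /* 2x2 Hermitian, packed upper, zeroed */
    double x[]  = {1,0, 0,1};        /* x = (1, i)                          */

    cblas_zhpr(CblasColMajor, CblasUpper, 2, 1.0, x, 1, Ap);

    /* Packed layout A(0,0), A(0,1), A(1,1) becomes 1, -i, 1. */
    printf("Ap = (%g, %g%+gi, %g)\n", Ap[0], Ap[2], Ap[3], Ap[4]);
    return 0;
}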
*/ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XHPR2 " #elif defined(DOUBLE) #define ERROR_NAME "ZHPR2 " #else #define ERROR_NAME "CHPR2 " #endif static int (*hpr2[])(BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *) = { #ifdef XDOUBLE xhpr2_U, xhpr2_L, xhpr2_V, xhpr2_M, #elif defined(DOUBLE) zhpr2_U, zhpr2_L, zhpr2_V, zhpr2_M, #else chpr2_U, chpr2_L, chpr2_V, chpr2_M, #endif }; #ifdef SMP static int (*hpr2_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *, int) = { #ifdef XDOUBLE xhpr2_thread_U, xhpr2_thread_L, xhpr2_thread_V, xhpr2_thread_M, #elif defined(DOUBLE) zhpr2_thread_U, zhpr2_thread_L, zhpr2_thread_V, zhpr2_thread_M, #else chpr2_thread_U, chpr2_thread_L, chpr2_thread_V, chpr2_thread_M, #endif }; #endif #ifndef CBLAS void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; blasint incx = *INCX; blasint incy = *INCY; blasint info; int uplo; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a) { FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; FLOAT *buffer; int uplo; blasint info; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; uplo = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; info = -1; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 3; if (Uplo == CblasLower) uplo = 2; info = -1; if (incx == 0) info = 7; if (incy == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx * 2; if (incy < 0 ) y -= (n - 1) * incy * 2; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (hpr2[uplo])(n, alpha_r, alpha_i, x, incx, y, incy, a, buffer); #ifdef SMP } else { (hpr2_thread[uplo])(n, ALPHA, x, incx, y, incy, a, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * n / 2 + 2 * n, 2 * n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/zimatcopy.c000066400000000000000000000171521313527062700172750ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /*********************************************************** * 2014-06-10 Saar * 2015-09-07 grisuthedragon ***********************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #if defined(DOUBLE) #define ERROR_NAME "ZIMATCOPY" #else #define ERROR_NAME "CIMATCOPY" #endif #define BlasRowMajor 0 #define BlasColMajor 1 #define BlasNoTrans 0 #define BlasTrans 1 #define BlasTransConj 2 #define BlasConj 3 #define NEW_IMATCOPY #ifndef CBLAS void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, blasint *ldb) { char Order, Trans; int order=-1,trans=-1; blasint info = -1; FLOAT *b; size_t msize; Order = *ORDER; Trans = *TRANS; TOUPPER(Order); TOUPPER(Trans); if ( Order == 'C' ) order = BlasColMajor; if ( Order == 'R' ) order = BlasRowMajor; if ( Trans == 'N' ) trans = BlasNoTrans; if ( Trans == 'T' ) trans = BlasTrans; if ( Trans == 'C' ) trans = BlasTransConj; if ( Trans == 'R' ) trans = BlasConj; #else void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, FLOAT *alpha, FLOAT *a, blasint clda, blasint cldb) { blasint *rows, *cols, *lda, *ldb; int order=-1,trans=-1; blasint info = -1; FLOAT *b; size_t msize; if ( CORDER == CblasColMajor ) order = BlasColMajor; if ( CORDER == CblasRowMajor ) order = BlasRowMajor; if ( CTRANS == CblasNoTrans) trans = BlasNoTrans; if ( CTRANS == CblasConjNoTrans ) trans = BlasConj; if ( CTRANS == CblasTrans) trans = BlasTrans; if ( CTRANS == CblasConjTrans) trans = BlasTransConj; rows = &crows; cols = &ccols; lda = &clda; ldb = &cldb; #endif if ( order == BlasColMajor) { if ( trans == BlasNoTrans && *ldb < *rows ) info = 9; if ( trans == BlasConj && *ldb < *rows ) info = 9; if ( trans == BlasTrans && *ldb < *cols ) info = 9; if ( trans == BlasTransConj && *ldb < *cols ) info = 9; } if ( order == BlasRowMajor) { if ( trans == BlasNoTrans && *ldb < *cols ) info = 9; if ( trans == BlasConj && *ldb < *cols ) info = 9; if ( trans == BlasTrans && *ldb < *rows ) info = 9; if ( trans == BlasTransConj && *ldb < *rows ) info = 9; } if ( order == BlasColMajor && *lda < *rows ) info = 7; if ( order == BlasRowMajor && *lda < *cols ) info = 7; if ( *cols <= 0 ) info = 4; if ( *rows <= 0 ) info = 3; if ( 
trans < 0 ) info = 2; if ( order < 0 ) info = 1; if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #ifdef NEW_IMATCOPY if (*lda == *ldb && *cols == *rows) { if ( order == BlasColMajor ) { if ( trans == BlasNoTrans ) { IMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda ); } if ( trans == BlasConj ) { IMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda ); } if ( trans == BlasTrans ) { IMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda ); } if ( trans == BlasTransConj ) { IMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda ); } } else { if ( trans == BlasNoTrans ) { IMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda ); } if ( trans == BlasConj ) { IMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda ); } if ( trans == BlasTrans ) { IMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda ); } if ( trans == BlasTransConj ) { IMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda ); } } return; } #endif if ( *lda > *ldb ) msize = (*lda) * (*ldb) * sizeof(FLOAT) * 2; else msize = (*ldb) * (*ldb) * sizeof(FLOAT) * 2; b = malloc(msize); if ( b == NULL ) { printf("Memory alloc failed in zimatcopy\n"); exit(1); } if ( order == BlasColMajor ) { if ( trans == BlasNoTrans ) { OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); free(b); return; } if ( trans == BlasConj ) { OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); free(b); return; } if ( trans == BlasTrans ) { OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); free(b); return; } if ( trans == BlasTransConj ) { OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); free(b); return; } } else { if ( trans == BlasNoTrans ) { OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); free(b); return; } if ( trans == BlasConj ) { OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); free(b); return; } if ( trans == BlasTrans ) { OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); free(b); return; } if ( trans == BlasTransConj ) { OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); free(b); return; } } free(b); return; } OpenBLAS-0.2.20/interface/zomatcopy.c000066400000000000000000000122711313527062700173000ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /*********************************************************** * 2014/06/09 Saar ***********************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #if defined(DOUBLE) #define ERROR_NAME "ZOMATCOPY" #else #define ERROR_NAME "COMATCOPY" #endif #define BlasRowMajor 0 #define BlasColMajor 1 #define BlasNoTrans 0 #define BlasTrans 1 #define BlasTransConj 2 #define BlasConj 3 #ifndef CBLAS void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb) { char Order, Trans; int order=-1,trans=-1; blasint info = -1; Order = *ORDER; Trans = *TRANS; TOUPPER(Order); TOUPPER(Trans); if ( Order == 'C' ) order = BlasColMajor; if ( Order == 'R' ) order = BlasRowMajor; if ( Trans == 'N' ) trans = BlasNoTrans; if ( Trans == 'T' ) trans = BlasTrans; if ( Trans == 'C' ) trans = BlasTransConj; if ( Trans == 'R' ) trans = BlasConj; #else void CNAME(enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, FLOAT *alpha, FLOAT *a, blasint clda, FLOAT*b, blasint cldb) { blasint *rows, *cols, *lda, *ldb; int order=-1,trans=-1; blasint info = -1; if ( CORDER == CblasColMajor ) order = BlasColMajor; if ( CORDER == CblasRowMajor ) order = BlasRowMajor; if ( CTRANS == CblasNoTrans) trans = BlasNoTrans; if ( CTRANS == CblasConjNoTrans ) trans = BlasConj; if ( CTRANS == CblasTrans) trans = BlasTrans; if ( CTRANS == CblasConjTrans) trans = BlasTransConj; rows = &crows; cols = &ccols; lda = &clda; ldb = &cldb; #endif if ( order == BlasColMajor) { if ( trans == BlasNoTrans && *ldb < *rows ) info = 9; if ( trans == BlasConj && *ldb < *rows ) info = 9; if ( trans == BlasTrans && *ldb < *cols ) info = 9; if ( trans == BlasTransConj && *ldb < *cols ) info = 9; } if ( order == BlasRowMajor) { if ( trans == BlasNoTrans && *ldb < *cols ) info = 9; if ( trans == BlasConj && *ldb < *cols ) info = 9; if ( trans == BlasTrans && *ldb < *rows ) info = 9; if ( trans == BlasTransConj && *ldb < *rows ) info = 9; } if ( order == BlasColMajor && *lda < *rows ) info = 7; if ( order == BlasRowMajor && *lda < *cols ) info = 7; if ( *cols <= 0 ) info = 4; if ( *rows <= 0 ) info = 3; if ( trans < 0 ) info = 2; if ( order < 0 ) info = 1; if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } if ( order == BlasColMajor ) { if ( trans == BlasNoTrans ) { OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); return; } if ( trans == 
BlasConj ) { OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); return; } if ( trans == BlasTrans ) { OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); return; } if ( trans == BlasTransConj ) { OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); return; } } else { if ( trans == BlasNoTrans ) { OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); return; } if ( trans == BlasConj ) { OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); return; } if ( trans == BlasTrans ) { OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); return; } if ( trans == BlasTransConj ) { OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); return; } } return; } OpenBLAS-0.2.20/interface/zrot.c000066400000000000000000000062361313527062700162550ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
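zimatcopy.c and zomatcopy.c above are OpenBLAS extensions that scale and optionally (conjugate-)transpose a complex matrix in place or out of place; as the code shows, the in-place wrapper only uses the dedicated IMATCOPY kernels when lda == ldb and the matrix is square, otherwise it allocates a temporary and performs two out-of-place passes. A minimal out-of-place transpose through the CBLAS wrapper is sketched below, assuming the cblas_zomatcopy prototype shipped in OpenBLAS's cblas.h (this is an OpenBLAS-specific entry point, not standard CBLAS).

/* B := alpha * A^T for a 2x3 double-complex matrix, row-major. */
#include <cblas.h>
#include <stdio.h>

int main(void) {
    double A[]     = {1,0, 2,0, 3,0,    /* row 0: 1 2 3 */
                      4,0, 5,0, 6,0};   /* row 1: 4 5 6 */
    double B[12];                       /* 3x2 result, fully overwritten */
    double alpha[] = {1,0};

    cblas_zomatcopy(CblasRowMajor, CblasTrans, 2, 3, alpha, A, 3, B, 2);

    printf("B(0,1) = %g%+gi\n", B[2], B[3]);   /* = A(1,0) = 4 */
    return 0;
}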
*/ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){ BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; FLOAT c = *C; FLOAT s = *S; PRINT_DEBUG_NAME; if (n <= 0) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (n - 1) * 2 * incx; if (incy < 0) y -= (n - 1) * 2 * incy; ROT_K(n, x, incx, y, incy, c, s); FUNCTION_PROFILE_END(4, n, n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/zrotg.c000066400000000000000000000046131313527062700164210ustar00rootroot00000000000000#include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ #if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86) long double da_r = *(DA + 0); long double da_i = *(DA + 1); long double db_r = *(DB + 0); long double db_i = *(DB + 1); long double r; long double ada = fabs(da_r) + fabs(da_i); PRINT_DEBUG_NAME; IDEBUG_START; FUNCTION_PROFILE_START(); if (ada == ZERO) { *C = ZERO; *(S + 0) = ONE; *(S + 1) = ZERO; *(DA + 0) = db_r; *(DA + 1) = db_i; } else { long double alpha_r, alpha_i; ada = sqrt(da_r * da_r + da_i * da_i); r = sqrt(da_r * da_r + da_i * da_i + db_r * db_r + db_i * db_i); alpha_r = da_r / ada; alpha_i = da_i / ada; *(C + 0) = ada / r; *(S + 0) = (alpha_r * db_r + alpha_i *db_i) / r; *(S + 1) = (alpha_i * db_r - alpha_r *db_i) / r; *(DA + 0) = alpha_r * r; *(DA + 1) = alpha_i * r; } #else FLOAT da_r = *(DA + 0); FLOAT da_i = *(DA + 1); FLOAT db_r = *(DB + 0); FLOAT db_i = *(DB + 1); FLOAT r; FLOAT ada = fabs(da_r) + fabs(da_i); FLOAT adb; PRINT_DEBUG_NAME; IDEBUG_START; FUNCTION_PROFILE_START(); if (ada == ZERO) { *C = ZERO; *(S + 0) = ONE; *(S + 1) = ZERO; *(DA + 0) = db_r; *(DA + 1) = db_i; } else { FLOAT scale; FLOAT aa_r, aa_i, bb_r, bb_i; FLOAT alpha_r, alpha_i; aa_r = fabs(da_r); aa_i = fabs(da_i); if (aa_i > aa_r) { aa_r = fabs(da_i); aa_i = fabs(da_r); } scale = (aa_i / aa_r); ada = aa_r * sqrt(ONE + scale * scale); bb_r = fabs(db_r); bb_i = fabs(db_i); if (bb_i > bb_r) { bb_r = fabs(bb_i); bb_i = fabs(bb_r); } scale = (bb_i / bb_r); adb = bb_r * sqrt(ONE + scale * scale); scale = ada + adb; aa_r = da_r / scale; aa_i = da_i / scale; bb_r = db_r / scale; bb_i = db_i / scale; r = scale * sqrt(aa_r * aa_r + aa_i * aa_i + bb_r * bb_r + bb_i * bb_i); alpha_r = da_r / ada; alpha_i = da_i / ada; *(C + 0) = ada / r; *(S + 0) = (alpha_r * db_r + alpha_i *db_i) / r; *(S + 1) = (alpha_i * db_r - alpha_r *db_i) / r; *(DA + 0) = alpha_r * r; *(DA + 1) = alpha_i * r; } #endif FUNCTION_PROFILE_END(4, 4, 4); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/zsbmv.c000066400000000000000000000121621313527062700164130ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
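Of the two files that end above, zrot.c applies a plane rotation with real c and s to a pair of complex vectors (the ZDROT/CSROT entry points), while zrotg.c constructs such a rotation, returning a real c and a complex s. The sketch below drives the Fortran entry point of zrotg.c and exercises the da == 0 branch visible in the code; the trailing underscore in zrotg_ is an assumption about the usual Fortran name-mangling convention.

/* Build a Givens rotation for (a, b) = (0, 3+4i): on return a holds r, */
/* c is real and s is complex.                                          */
#include <stdio.h>

extern void zrotg_(double *da, double *db, double *c, double *s);

int main(void) {
    double a[2] = {0.0, 0.0};   /* da = 0      */
    double b[2] = {3.0, 4.0};   /* db = 3 + 4i */
    double c, s[2];

    zrotg_(a, b, &c, s);

    /* Expect c = 0, s = 1, r = 3+4i for the da == 0 case. */
    printf("c = %g, s = %g%+gi, r = %g%+gi\n", c, s[0], s[1], a[0], a[1]);
    return 0;
}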
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif /* #ifdef SMP #ifdef __64BIT__ #define SMPTEST 1 #endif #endif */ #ifdef XDOUBLE #define ERROR_NAME "XSBMV " #elif defined(DOUBLE) #define ERROR_NAME "ZSBMV " #else #define ERROR_NAME "CSBMV " #endif static int (*sbmv[])(BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { #ifdef XDOUBLE xsbmv_U, xsbmv_L, #elif defined(DOUBLE) zsbmv_U, zsbmv_L, #else csbmv_U, csbmv_L, #endif }; #ifdef SMPTEST static int (*sbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE xsbmv_thread_U, xsbmv_thread_L, #elif defined(DOUBLE) zsbmv_thread_U, zsbmv_thread_L, #else csbmv_thread_U, csbmv_thread_L, #endif }; #endif void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *b, blasint *INCX, FLOAT *BETA, FLOAT *c, blasint *INCY){ char uplo_arg = *UPLO; blasint n = *N; blasint k = *K; FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; blasint lda = *LDA; blasint incx = *INCX; FLOAT beta_r = BETA[0]; FLOAT beta_i = BETA[1]; blasint incy = *INCY; blasint info; int uplo; FLOAT *buffer; #ifdef SMPTEST int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < k + 1) info = 6; if (k < 0) info = 3; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } if (n == 0) return; if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0); if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) b -= (n - 1) * incx * COMPSIZE; if (incy < 0 ) c -= (n - 1) * incy * COMPSIZE; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMPTEST nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (sbmv[uplo])(n, k, alpha_r, alpha_i, a, lda, b, incx, c, incy, buffer); #ifdef SMPTEST } else { 
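/* Threaded path: unlike the single-threaded kernel call above, which takes
   the scalar split into alpha_r/alpha_i, the threaded variant is handed the
   complex scalar through the original ALPHA pointer.  Note that SMPTEST only
   appears inside the commented-out block near the top of this file, so in a
   default build this branch is compiled out. */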
(sbmv_thread[uplo])(n, k, ALPHA, a, lda, b, incx, c, incy, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * k / 2 + n, n * k); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/zscal.c000066400000000000000000000075371313527062700164000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef CBLAS void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX){ blasint n = *N; blasint incx = *INCX; #ifndef SSCAL FLOAT *alpha=ALPHA; #else FLOAT alpha[2] = {ALPHA[0], ZERO}; #endif #else #ifndef SSCAL void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx){ FLOAT *alpha=ALPHA; #else void CNAME(blasint n, FLOAT alpha_r, FLOAT *x, blasint incx){ FLOAT alpha[2] = {alpha_r, ZERO}; #endif #endif #ifdef SMP int mode; int nthreads; #endif #ifndef CBLAS PRINT_DEBUG_NAME; #else PRINT_DEBUG_CNAME; #endif if (incx <= 0 || n <= 0) return; if ((alpha[0] == ONE) && (alpha[1] == ZERO)) return; IDEBUG_START; FUNCTION_PROFILE_START(); #ifdef SMP nthreads = num_cpu_avail(1); if ( n <= 1048576 ) nthreads = 1; if (nthreads == 1) { #endif SCAL_K(n, 0, 0, alpha[0], alpha[1], x, incx, NULL, 0, NULL, 0); #ifdef SMP } else { #ifdef DOUBLE mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); } #endif FUNCTION_PROFILE_END(4, n, n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/zspmv.c000066400000000000000000000115551313527062700164360ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "ZSPMV " #elif defined(DOUBLE) #define ERROR_NAME "ZSPMV " #else #define ERROR_NAME "CSPMV " #endif static int (*spmv[])(BLASLONG, FLOAT, FLOAT, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { #ifdef XDOUBLE xspmv_U, xspmv_L, #elif defined(DOUBLE) zspmv_U, zspmv_L, #else cspmv_U, cspmv_L, #endif }; #ifdef SMPTEST static int (*spmv_thread[])(BLASLONG, FLOAT *, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE xspmv_thread_U, xspmv_thread_L, #elif defined(DOUBLE) zspmv_thread_U, zspmv_thread_L, #else cspmv_thread_U, cspmv_thread_L, #endif }; #endif void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, FLOAT *b, blasint *INCX, FLOAT *BETA, FLOAT *c, blasint *INCY){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; blasint incx = *INCX; FLOAT beta_r = BETA[0]; FLOAT beta_i = BETA[1]; blasint incy = *INCY; blasint info; int uplo; FLOAT *buffer; #ifdef SMPTEST int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incy == 0) info = 9; if (incx == 0) info = 6; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } if (n == 0) return; if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0); if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) b -= (n - 1) * incx * COMPSIZE; if (incy < 0 ) c -= (n - 1) * incy * COMPSIZE; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMPTEST nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (spmv[uplo])(n, alpha_r, alpha_i, a, b, incx, c, incy, buffer); #ifdef SMPTEST } else { (spmv_thread[uplo])(n, ALPHA, a, b, incx, c, incy, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/zspr.c000066400000000000000000000107131313527062700162500ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XSPR " #elif defined(DOUBLE) #define ERROR_NAME "ZSPR " #else #define ERROR_NAME "CSPR " #endif static int (*spr[])(BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, FLOAT *) = { #ifdef XDOUBLE xspr_U, xspr_L, #elif defined(DOUBLE) zspr_U, zspr_L, #else cspr_U, cspr_L, #endif }; #ifdef SMP static int (*spr_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, FLOAT *, int) = { #ifdef XDOUBLE xspr_thread_U, xspr_thread_L, #elif defined(DOUBLE) zspr_thread_U, zspr_thread_L, #else cspr_thread_U, cspr_thread_L, #endif }; #endif void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *a){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; blasint incx = *INCX; blasint info; int uplo; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } if (n == 0) return; if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (spr[uplo])(n, alpha_r, alpha_i, x, incx, a, buffer); #ifdef SMP } else { (spr_thread[uplo])(n, ALPHA, x, incx, a, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/zspr2.c000066400000000000000000000112271313527062700163330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XSPR2 " #elif defined(DOUBLE) #define ERROR_NAME "ZSPR2 " #else #define ERROR_NAME "CSPR2 " #endif static int (*spr2[])(BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *) = { #ifdef XDOUBLE xspr2_U, xspr2_L, #elif defined(DOUBLE) zspr2_U, zspr2_L, #else cspr2_U, cspr2_L, #endif }; #ifdef SMP static int (*spr2_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *, int) = { #ifdef XDOUBLE xspr2_thread_U, xspr2_thread_L, #elif defined(DOUBLE) zspr2_thread_U, zspr2_thread_L, #else cspr2_thread_U, cspr2_thread_L, #endif }; #endif void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; blasint incx = *INCX; blasint incy = *INCY; blasint info; int uplo; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } if (n == 0) return; if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; if (incy < 0 ) y -= (n - 1) * incy; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (spr2[uplo])(n, alpha_r, alpha_i, x, incx, y, incy, a, buffer); #ifdef SMP } else { (spr2_thread[uplo])(n, ALPHA, x, incx, y, incy, a, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * n / 2 + 2 * n, 2 * n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/zswap.c000066400000000000000000000076171313527062700164270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifndef CBLAS void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ blasint n = *N; blasint incx = *INCX; blasint incy = *INCY; #else void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ #endif #ifdef SMP int mode; FLOAT dummyalpha[2] = {ZERO, ZERO}; int nthreads; #endif #ifndef CBLAS PRINT_DEBUG_NAME; #else PRINT_DEBUG_CNAME; #endif if (n <= 0) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (n - 1) * incx * 2; if (incy < 0) y -= (n - 1) * incy * 2; #ifdef SMP nthreads = num_cpu_avail(1); //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. if (incx == 0 || incy == 0) nthreads = 1; if (nthreads == 1) { #endif SWAP_K(n, 0, 0, ZERO, ZERO, x, incx, y, incy, NULL, 0); #ifdef SMP } else { #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif blas_level1_thread(mode, n, 0, 0, dummyalpha, x, incx, y, incy, NULL, 0, (void *)SWAP_K, nthreads); } #endif FUNCTION_PROFILE_END(2, 2 * n, 0); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/zsymv.c000066400000000000000000000114021313527062700164360ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XSYMV " #elif defined(DOUBLE) #define ERROR_NAME "ZSYMV " #else #define ERROR_NAME "CSYMV " #endif void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *b, blasint *INCX, FLOAT *BETA, FLOAT *c, blasint *INCY){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; blasint lda = *LDA; blasint incx = *INCX; FLOAT beta_r = BETA[0]; FLOAT beta_i = BETA[1]; blasint incy = *INCY; int (*symv[])(BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { SYMV_U, SYMV_L, }; #ifdef SMP int (*symv_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { SYMV_THREAD_U, SYMV_THREAD_L, }; #endif blasint info; int uplo; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incy == 0) info = 10; if (incx == 0) info = 7; if (lda < MAX(1, n)) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } if (n == 0) return; if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0); if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) b -= (n - 1) * incx * COMPSIZE; if (incy < 0 ) c -= (n - 1) * incy * COMPSIZE; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (symv[uplo])(n, n, alpha_r, alpha_i, a, lda, b, incx, c, incy, buffer); #ifdef SMP } else { (symv_thread[uplo])(n, ALPHA, a, lda, b, incx, c, incy, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * n / 2 + 2 * n, 2 * n * n); IDEBUG_END; return; } 
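/*
 * Minimal standalone caller sketch for the ZSYMV interface above (complex
 * *symmetric*, not Hermitian, matrix-vector product y := alpha*A*x + beta*y).
 * Illustrative only, not part of the OpenBLAS sources; it assumes a default
 * build in which blasint is a 32-bit int and the Fortran-style symbol zsymv_
 * is exported, with double-complex values passed as interleaved (re,im)
 * pairs of doubles, as the NAME() wrapper above suggests.
 *
 *   cc example.c -lopenblas && ./a.out
 */
#include <stdio.h>

extern void zsymv_(char *uplo, int *n, double *alpha, double *a, int *lda,
                   double *x, int *incx, double *beta, double *y, int *incy);

int main(void) {
  int n = 2, lda = 2, incx = 1, incy = 1;
  char uplo = 'U';                        /* only the upper triangle is read */
  /* Column-major A = [ 1    2+i ]   (symmetric: A(2,1) == A(1,2))
   *                  [ 2+i  3   ]                                           */
  double a[8]     = { 1,0,  2,1,   2,1,  3,0 };
  double x[4]     = { 1,0,  0,1 };        /* x = (1, i)                      */
  double y[4]     = { 0,0,  0,0 };
  double alpha[2] = { 1,0 }, beta[2] = { 0,0 };

  zsymv_(&uplo, &n, alpha, a, &lda, x, &incx, beta, y, &incy);

  /* Expected result: y = A*x = (2i, 2+4i) */
  printf("y = (%g%+gi, %g%+gi)\n", y[0], y[1], y[2], y[3]);
  return 0;
}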
OpenBLAS-0.2.20/interface/zsyr.c000066400000000000000000000131311313527062700162560ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XSYR " #elif defined(DOUBLE) #define ERROR_NAME "ZSYR " #else #define ERROR_NAME "CSYR " #endif static int (*syr[])(BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { #ifdef XDOUBLE xsyr_U, xsyr_L, #elif defined(DOUBLE) zsyr_U, zsyr_L, #else csyr_U, csyr_L, #endif }; #ifdef SMP static int (*syr_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE xsyr_thread_U, xsyr_thread_L, #elif defined(DOUBLE) zsyr_thread_U, zsyr_thread_L, #else csyr_thread_U, csyr_thread_L, #endif }; #endif #ifndef CBLAS void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *a, blasint *LDA){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; blasint lda = *LDA; blasint incx = *INCX; blasint info; int uplo; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (lda < MAX(1, n)) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) { FLOAT *buffer; int trans, uplo; blasint info; FLOAT * ALPHA = α FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; trans = -1; uplo = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; info = -1; if (lda < MAX(1, n)) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; info = -1; if (lda < MAX(1, n)) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx * 2; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (syr[uplo])(n, alpha_r, alpha_i, x, incx, a, lda, buffer); #ifdef SMP } else { (syr_thread[uplo])(n, ALPHA, x, incx, a, lda, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/zsyr2.c000066400000000000000000000113761313527062700163510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "QSYR2 " #elif defined(DOUBLE) #define ERROR_NAME "ZSYR2 " #else #define ERROR_NAME "CSYR2 " #endif static int (*syr2[])(BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { #ifdef XDOUBLE xsyr2_U, xsyr2_L, #elif defined(DOUBLE) zsyr2_U, zsyr2_L, #else csyr2_U, csyr2_L, #endif }; #ifdef SMP static int (*syr2_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE xsyr2_thread_U, xsyr2_thread_L, #elif defined(DOUBLE) zsyr2_thread_U, zsyr2_thread_L, #else csyr2_thread_U, csyr2_thread_L, #endif }; #endif void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a, blasint *LDA){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; blasint lda = *LDA; blasint incx = *INCX; blasint incy = *INCY; blasint info; int uplo; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (lda < MAX(1, n)) info = 9; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } if (n == 0) return; if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; if (incy < 0 ) y -= (n - 1) * incy; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (syr2[uplo])(n, alpha_r, alpha_i, x, incx, y, incy, a, lda, buffer); #ifdef SMP } else { (syr2_thread[uplo])(n, ALPHA, x, incx, y, incy, a, lda, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * n / 2 + 2 * n, 2 * n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/ztbmv.c000066400000000000000000000201431313527062700164120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XTBMV " #elif defined(DOUBLE) #define ERROR_NAME "ZTBMV " #else #define ERROR_NAME "CTBMV " #endif static int (*tbmv[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { #ifdef XDOUBLE xtbmv_NUU, xtbmv_NUN, xtbmv_NLU, xtbmv_NLN, xtbmv_TUU, xtbmv_TUN, xtbmv_TLU, xtbmv_TLN, xtbmv_RUU, xtbmv_RUN, xtbmv_RLU, xtbmv_RLN, xtbmv_CUU, xtbmv_CUN, xtbmv_CLU, xtbmv_CLN, #elif defined(DOUBLE) ztbmv_NUU, ztbmv_NUN, ztbmv_NLU, ztbmv_NLN, ztbmv_TUU, ztbmv_TUN, ztbmv_TLU, ztbmv_TLN, ztbmv_RUU, ztbmv_RUN, ztbmv_RLU, ztbmv_RLN, ztbmv_CUU, ztbmv_CUN, ztbmv_CLU, ztbmv_CLN, #else ctbmv_NUU, ctbmv_NUN, ctbmv_NLU, ctbmv_NLN, ctbmv_TUU, ctbmv_TUN, ctbmv_TLU, ctbmv_TLN, ctbmv_RUU, ctbmv_RUN, ctbmv_RLU, ctbmv_RLN, ctbmv_CUU, ctbmv_CUN, ctbmv_CLU, ctbmv_CLN, #endif }; #ifdef SMP static int (*tbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE xtbmv_thread_NUU, xtbmv_thread_NUN, xtbmv_thread_NLU, xtbmv_thread_NLN, xtbmv_thread_TUU, xtbmv_thread_TUN, xtbmv_thread_TLU, xtbmv_thread_TLN, xtbmv_thread_RUU, xtbmv_thread_RUN, xtbmv_thread_RLU, xtbmv_thread_RLN, xtbmv_thread_CUU, xtbmv_thread_CUN, xtbmv_thread_CLU, xtbmv_thread_CLN, #elif defined(DOUBLE) ztbmv_thread_NUU, ztbmv_thread_NUN, ztbmv_thread_NLU, ztbmv_thread_NLN, ztbmv_thread_TUU, ztbmv_thread_TUN, ztbmv_thread_TLU, ztbmv_thread_TLN, ztbmv_thread_RUU, ztbmv_thread_RUN, ztbmv_thread_RLU, ztbmv_thread_RLN, ztbmv_thread_CUU, ztbmv_thread_CUN, ztbmv_thread_CLU, ztbmv_thread_CLN, #else ctbmv_thread_NUU, ctbmv_thread_NUN, ctbmv_thread_NLU, ctbmv_thread_NLN, ctbmv_thread_TUU, ctbmv_thread_TUN, ctbmv_thread_TLU, ctbmv_thread_TLN, ctbmv_thread_RUU, ctbmv_thread_RUN, 
ctbmv_thread_RLU, ctbmv_thread_RLN, ctbmv_thread_CUU, ctbmv_thread_CUN, ctbmv_thread_CLU, ctbmv_thread_CLN, #endif }; #endif #ifndef CBLAS void NAME(char *UPLO, char *TRANS, char *DIAG, blasint *N, blasint *K, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; blasint n = *N; blasint k = *K; blasint lda = *LDA; blasint incx = *INCX; blasint info; int uplo; int unit; int trans; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); TOUPPER(trans_arg); TOUPPER(diag_arg); trans = -1; unit = -1; uplo = -1; if (trans_arg == 'N') trans = 0; if (trans_arg == 'T') trans = 1; if (trans_arg == 'R') trans = 2; if (trans_arg == 'C') trans = 3; if (diag_arg == 'U') unit = 0; if (diag_arg == 'N') unit = 1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incx == 0) info = 9; if (lda < k + 1) info = 7; if (k < 0) info = 5; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint n, blasint k, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { int trans, uplo, unit; blasint info; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; unit = -1; uplo = -1; trans = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 2; if (TransA == CblasConjTrans) trans = 3; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 9; if (lda < k + 1) info = 7; if (k < 0) info = 5; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 3; if (TransA == CblasConjTrans) trans = 2; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 9; if (lda < k + 1) info = 7; if (k < 0) info = 5; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx * 2; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (tbmv[(trans<<2) | (uplo<<1) | unit])(n, k, a, lda, x, incx, buffer); #ifdef SMP } else { (tbmv_thread[(trans<<2) | (uplo<<1) | unit])(n, k, a, lda, x, incx, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * k / 2 + n, n * k); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/ztbsv.c000066400000000000000000000154561313527062700164330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XTBSV " #elif defined(DOUBLE) #define ERROR_NAME "ZTBSV " #else #define ERROR_NAME "CTBSV " #endif static int (*tbsv[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { #ifdef XDOUBLE xtbsv_NUU, xtbsv_NUN, xtbsv_NLU, xtbsv_NLN, xtbsv_TUU, xtbsv_TUN, xtbsv_TLU, xtbsv_TLN, xtbsv_RUU, xtbsv_RUN, xtbsv_RLU, xtbsv_RLN, xtbsv_CUU, xtbsv_CUN, xtbsv_CLU, xtbsv_CLN, #elif defined(DOUBLE) ztbsv_NUU, ztbsv_NUN, ztbsv_NLU, ztbsv_NLN, ztbsv_TUU, ztbsv_TUN, ztbsv_TLU, ztbsv_TLN, ztbsv_RUU, ztbsv_RUN, ztbsv_RLU, ztbsv_RLN, ztbsv_CUU, ztbsv_CUN, ztbsv_CLU, ztbsv_CLN, #else ctbsv_NUU, ctbsv_NUN, ctbsv_NLU, ctbsv_NLN, ctbsv_TUU, ctbsv_TUN, ctbsv_TLU, ctbsv_TLN, ctbsv_RUU, ctbsv_RUN, ctbsv_RLU, ctbsv_RLN, ctbsv_CUU, ctbsv_CUN, ctbsv_CLU, ctbsv_CLN, #endif }; #ifndef CBLAS void NAME(char *UPLO, char *TRANS, char *DIAG, blasint *N, blasint *K, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; blasint n = *N; blasint k = *K; blasint lda = *LDA; blasint incx = *INCX; blasint info; int uplo; int unit; int trans; FLOAT *buffer; PRINT_DEBUG_NAME; TOUPPER(uplo_arg); TOUPPER(trans_arg); TOUPPER(diag_arg); trans = -1; unit = -1; uplo = -1; if (trans_arg == 'N') trans = 0; if (trans_arg == 'T') trans = 1; if (trans_arg == 'R') trans = 2; if (trans_arg == 'C') trans = 3; if (diag_arg == 'U') unit = 0; if (diag_arg == 'N') unit = 1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incx == 0) info = 9; if (lda < k + 1) info = 7; if (k < 0) info = 5; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint n, 
blasint k, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { int trans, uplo, unit; blasint info; FLOAT *buffer; PRINT_DEBUG_CNAME; unit = -1; uplo = -1; trans = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 2; if (TransA == CblasConjTrans) trans = 3; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 9; if (lda < k + 1) info = 7; if (k < 0) info = 5; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 3; if (TransA == CblasConjTrans) trans = 2; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 9; if (lda < k + 1) info = 7; if (k < 0) info = 5; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx * 2; buffer = (FLOAT *)blas_memory_alloc(1); (tbsv[(trans<<2) | (uplo<<1) | unit])(n, k, a, lda, x, incx, buffer); blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * k / 2 + n, n * k); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/ztpmv.c000066400000000000000000000173501313527062700164360ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XTPMV " #elif defined(DOUBLE) #define ERROR_NAME "ZTPMV " #else #define ERROR_NAME "CTPMV " #endif static int (*tpmv[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, void *) = { #ifdef XDOUBLE xtpmv_NUU, xtpmv_NUN, xtpmv_NLU, xtpmv_NLN, xtpmv_TUU, xtpmv_TUN, xtpmv_TLU, xtpmv_TLN, xtpmv_RUU, xtpmv_RUN, xtpmv_RLU, xtpmv_RLN, xtpmv_CUU, xtpmv_CUN, xtpmv_CLU, xtpmv_CLN, #elif defined(DOUBLE) ztpmv_NUU, ztpmv_NUN, ztpmv_NLU, ztpmv_NLN, ztpmv_TUU, ztpmv_TUN, ztpmv_TLU, ztpmv_TLN, ztpmv_RUU, ztpmv_RUN, ztpmv_RLU, ztpmv_RLN, ztpmv_CUU, ztpmv_CUN, ztpmv_CLU, ztpmv_CLN, #else ctpmv_NUU, ctpmv_NUN, ctpmv_NLU, ctpmv_NLN, ctpmv_TUU, ctpmv_TUN, ctpmv_TLU, ctpmv_TLN, ctpmv_RUU, ctpmv_RUN, ctpmv_RLU, ctpmv_RLN, ctpmv_CUU, ctpmv_CUN, ctpmv_CLU, ctpmv_CLN, #endif }; #ifdef SMP static int (*tpmv_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE xtpmv_thread_NUU, xtpmv_thread_NUN, xtpmv_thread_NLU, xtpmv_thread_NLN, xtpmv_thread_TUU, xtpmv_thread_TUN, xtpmv_thread_TLU, xtpmv_thread_TLN, xtpmv_thread_RUU, xtpmv_thread_RUN, xtpmv_thread_RLU, xtpmv_thread_RLN, xtpmv_thread_CUU, xtpmv_thread_CUN, xtpmv_thread_CLU, xtpmv_thread_CLN, #elif defined(DOUBLE) ztpmv_thread_NUU, ztpmv_thread_NUN, ztpmv_thread_NLU, ztpmv_thread_NLN, ztpmv_thread_TUU, ztpmv_thread_TUN, ztpmv_thread_TLU, ztpmv_thread_TLN, ztpmv_thread_RUU, ztpmv_thread_RUN, ztpmv_thread_RLU, ztpmv_thread_RLN, ztpmv_thread_CUU, ztpmv_thread_CUN, ztpmv_thread_CLU, ztpmv_thread_CLN, #else ctpmv_thread_NUU, ctpmv_thread_NUN, ctpmv_thread_NLU, ctpmv_thread_NLN, ctpmv_thread_TUU, ctpmv_thread_TUN, ctpmv_thread_TLU, ctpmv_thread_TLN, ctpmv_thread_RUU, ctpmv_thread_RUN, ctpmv_thread_RLU, ctpmv_thread_RLN, ctpmv_thread_CUU, ctpmv_thread_CUN, ctpmv_thread_CLU, ctpmv_thread_CLN, #endif }; #endif #ifndef CBLAS void NAME(char *UPLO, char *TRANS, char *DIAG, blasint *N, FLOAT *a, FLOAT *x, blasint *INCX){ char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; blasint n = *N; blasint incx = *INCX; blasint info; int uplo; int unit; int trans; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); TOUPPER(trans_arg); TOUPPER(diag_arg); trans = -1; unit = -1; uplo = -1; if (trans_arg == 'N') trans = 0; if (trans_arg == 'T') trans = 1; if (trans_arg == 'R') trans = 2; if (trans_arg == 'C') trans = 3; if (diag_arg == 'U') unit = 0; if (diag_arg == 'N') unit = 1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incx == 0) info = 7; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint n, FLOAT *a, FLOAT *x, blasint incx) { int trans, uplo, unit; blasint info; FLOAT *buffer; PRINT_DEBUG_CNAME; unit = -1; uplo = -1; trans = -1; info = 0; #ifdef SMP int nthreads; #endif if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 2; if (TransA == CblasConjTrans) trans = 3; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 7; if (n < 0) info = 4; 
if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 3; if (TransA == CblasConjTrans) trans = 2; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 7; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx * 2; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (tpmv[(trans<<2) | (uplo<<1) | unit])(n, a, x, incx, buffer); #ifdef SMP } else { (tpmv_thread[(trans<<2) | (uplo<<1) | unit])(n, a, x, incx, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/ztpsv.c000066400000000000000000000147161313527062700164470ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XTPSV " #elif defined(DOUBLE) #define ERROR_NAME "ZTPSV " #else #define ERROR_NAME "CTPSV " #endif static int (*tpsv[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, void *) = { #ifdef XDOUBLE xtpsv_NUU, xtpsv_NUN, xtpsv_NLU, xtpsv_NLN, xtpsv_TUU, xtpsv_TUN, xtpsv_TLU, xtpsv_TLN, xtpsv_RUU, xtpsv_RUN, xtpsv_RLU, xtpsv_RLN, xtpsv_CUU, xtpsv_CUN, xtpsv_CLU, xtpsv_CLN, #elif defined(DOUBLE) ztpsv_NUU, ztpsv_NUN, ztpsv_NLU, ztpsv_NLN, ztpsv_TUU, ztpsv_TUN, ztpsv_TLU, ztpsv_TLN, ztpsv_RUU, ztpsv_RUN, ztpsv_RLU, ztpsv_RLN, ztpsv_CUU, ztpsv_CUN, ztpsv_CLU, ztpsv_CLN, #else ctpsv_NUU, ctpsv_NUN, ctpsv_NLU, ctpsv_NLN, ctpsv_TUU, ctpsv_TUN, ctpsv_TLU, ctpsv_TLN, ctpsv_RUU, ctpsv_RUN, ctpsv_RLU, ctpsv_RLN, ctpsv_CUU, ctpsv_CUN, ctpsv_CLU, ctpsv_CLN, #endif }; #ifndef CBLAS void NAME(char *UPLO, char *TRANS, char *DIAG, blasint *N, FLOAT *a, FLOAT *x, blasint *INCX){ char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; blasint n = *N; blasint incx = *INCX; blasint info; int uplo; int unit; int trans; FLOAT *buffer; PRINT_DEBUG_NAME; TOUPPER(uplo_arg); TOUPPER(trans_arg); TOUPPER(diag_arg); trans = -1; unit = -1; uplo = -1; if (trans_arg == 'N') trans = 0; if (trans_arg == 'T') trans = 1; if (trans_arg == 'R') trans = 2; if (trans_arg == 'C') trans = 3; if (diag_arg == 'U') unit = 0; if (diag_arg == 'N') unit = 1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incx == 0) info = 7; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint n, FLOAT *a, FLOAT *x, blasint incx) { int trans, uplo, unit; blasint info; FLOAT *buffer; PRINT_DEBUG_CNAME; unit = -1; uplo = -1; trans = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 2; if (TransA == CblasConjTrans) trans = 3; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 7; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 3; if (TransA == CblasConjTrans) trans = 2; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 7; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx * 2; buffer = (FLOAT *)blas_memory_alloc(1); (tpsv[(trans<<2) | (uplo<<1) | unit])(n, a, x, incx, buffer); blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); IDEBUG_END; return; } 
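/*
 * Minimal standalone caller sketch for the ZTPSV interface above (solution of
 * a packed triangular system A*x = b, overwriting b with x).  Illustrative
 * only, not part of the OpenBLAS sources; it assumes a default build in which
 * blasint is a 32-bit int and the standard Fortran-style BLAS symbol ztpsv_
 * is exported, with complex values passed as interleaved (re,im) doubles.
 */
#include <stdio.h>

extern void ztpsv_(char *uplo, char *trans, char *diag, int *n,
                   double *ap, double *x, int *incx);

int main(void) {
  int n = 2, incx = 1;
  char uplo = 'U', trans = 'N', diag = 'N';
  /* Upper triangle packed column by column: A(1,1)=2, A(1,2)=1+i, A(2,2)=4 */
  double ap[6] = { 2,0,  1,1,  4,0 };
  double x[4]  = { 2,0,  4,0 };     /* right-hand side b, solved in place */

  ztpsv_(&uplo, &trans, &diag, &n, ap, x, &incx);

  /* Back substitution by hand: x2 = 4/4 = 1, x1 = (2 - (1+i)*1)/2 = 0.5-0.5i */
  printf("x = (%g%+gi, %g%+gi)\n", x[0], x[1], x[2], x[3]);
  return 0;
}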
OpenBLAS-0.2.20/interface/ztrmv.c000066400000000000000000000210111313527062700164250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XTRMV " #elif defined(DOUBLE) #define ERROR_NAME "ZTRMV " #else #define ERROR_NAME "CTRMV " #endif static int (*trmv[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { #ifdef XDOUBLE xtrmv_NUU, xtrmv_NUN, xtrmv_NLU, xtrmv_NLN, xtrmv_TUU, xtrmv_TUN, xtrmv_TLU, xtrmv_TLN, xtrmv_RUU, xtrmv_RUN, xtrmv_RLU, xtrmv_RLN, xtrmv_CUU, xtrmv_CUN, xtrmv_CLU, xtrmv_CLN, #elif defined(DOUBLE) ztrmv_NUU, ztrmv_NUN, ztrmv_NLU, ztrmv_NLN, ztrmv_TUU, ztrmv_TUN, ztrmv_TLU, ztrmv_TLN, ztrmv_RUU, ztrmv_RUN, ztrmv_RLU, ztrmv_RLN, ztrmv_CUU, ztrmv_CUN, ztrmv_CLU, ztrmv_CLN, #else ctrmv_NUU, ctrmv_NUN, ctrmv_NLU, ctrmv_NLN, ctrmv_TUU, ctrmv_TUN, ctrmv_TLU, ctrmv_TLN, ctrmv_RUU, ctrmv_RUN, ctrmv_RLU, ctrmv_RLN, ctrmv_CUU, ctrmv_CUN, ctrmv_CLU, ctrmv_CLN, #endif }; #ifdef SMP static int (*trmv_thread[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE xtrmv_thread_NUU, xtrmv_thread_NUN, xtrmv_thread_NLU, xtrmv_thread_NLN, xtrmv_thread_TUU, xtrmv_thread_TUN, xtrmv_thread_TLU, xtrmv_thread_TLN, xtrmv_thread_RUU, xtrmv_thread_RUN, xtrmv_thread_RLU, xtrmv_thread_RLN, xtrmv_thread_CUU, xtrmv_thread_CUN, xtrmv_thread_CLU, xtrmv_thread_CLN, #elif defined(DOUBLE) ztrmv_thread_NUU, ztrmv_thread_NUN, ztrmv_thread_NLU, ztrmv_thread_NLN, ztrmv_thread_TUU, ztrmv_thread_TUN, ztrmv_thread_TLU, ztrmv_thread_TLN, ztrmv_thread_RUU, ztrmv_thread_RUN, ztrmv_thread_RLU, ztrmv_thread_RLN, ztrmv_thread_CUU, ztrmv_thread_CUN, ztrmv_thread_CLU, ztrmv_thread_CLN, #else ctrmv_thread_NUU, ctrmv_thread_NUN, ctrmv_thread_NLU, ctrmv_thread_NLN, ctrmv_thread_TUU, ctrmv_thread_TUN, ctrmv_thread_TLU, ctrmv_thread_TLN, ctrmv_thread_RUU, ctrmv_thread_RUN, ctrmv_thread_RLU, ctrmv_thread_RLN, ctrmv_thread_CUU, ctrmv_thread_CUN, ctrmv_thread_CLU, ctrmv_thread_CLN, #endif }; #endif #ifndef CBLAS void NAME(char *UPLO, char *TRANS, char *DIAG, blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; blasint n = *N; blasint lda = *LDA; blasint incx = *INCX; blasint info; int uplo; int unit; int trans, buffer_size; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); TOUPPER(trans_arg); TOUPPER(diag_arg); trans = -1; unit = -1; uplo = -1; if (trans_arg == 'N') trans = 0; if (trans_arg == 'T') trans = 1; if (trans_arg == 'R') trans = 2; if (trans_arg == 'C') trans = 3; if (diag_arg == 'U') unit = 0; if (diag_arg == 'N') unit = 1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incx == 0) info = 8; if (lda < MAX(1, n)) info = 6; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { int trans, uplo, unit, buffer_size; blasint info; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; unit = -1; uplo = -1; trans = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 2; if (TransA == CblasConjTrans) trans = 3; if 
(Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 8; if (lda < MAX(1, n)) info = 6; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 3; if (TransA == CblasConjTrans) trans = 2; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 8; if (lda < MAX(1, n)) info = 6; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx * 2; #ifdef SMP // Calibrated on a Xeon E5-2630 if(1L * n * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) { nthreads = num_cpu_avail(2); if(nthreads > 2 && 1L * n * n < 64L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) nthreads = 2; } else nthreads = 1; if(nthreads > 1) { buffer_size = n > 16 ? 0 : n * 4 + 40; } else #endif { buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT); // It seems to be required for some K8 or Barcelona CPU buffer_size += 8; if(incx != 1) buffer_size += n * 2; } STACK_ALLOC(buffer_size, FLOAT, buffer); #ifdef SMP if (nthreads == 1) { #endif (trmv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); #ifdef SMP } else { (trmv_thread[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer, nthreads); } #endif STACK_FREE(buffer); FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/interface/ztrsv.c000066400000000000000000000152051313527062700164430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif #ifdef XDOUBLE #define ERROR_NAME "XTRSV " #elif defined(DOUBLE) #define ERROR_NAME "ZTRSV " #else #define ERROR_NAME "CTRSV " #endif static int (*trsv[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { #ifdef XDOUBLE xtrsv_NUU, xtrsv_NUN, xtrsv_NLU, xtrsv_NLN, xtrsv_TUU, xtrsv_TUN, xtrsv_TLU, xtrsv_TLN, xtrsv_RUU, xtrsv_RUN, xtrsv_RLU, xtrsv_RLN, xtrsv_CUU, xtrsv_CUN, xtrsv_CLU, xtrsv_CLN, #elif defined(DOUBLE) ztrsv_NUU, ztrsv_NUN, ztrsv_NLU, ztrsv_NLN, ztrsv_TUU, ztrsv_TUN, ztrsv_TLU, ztrsv_TLN, ztrsv_RUU, ztrsv_RUN, ztrsv_RLU, ztrsv_RLN, ztrsv_CUU, ztrsv_CUN, ztrsv_CLU, ztrsv_CLN, #else ctrsv_NUU, ctrsv_NUN, ctrsv_NLU, ctrsv_NLN, ctrsv_TUU, ctrsv_TUN, ctrsv_TLU, ctrsv_TLN, ctrsv_RUU, ctrsv_RUN, ctrsv_RLU, ctrsv_RLN, ctrsv_CUU, ctrsv_CUN, ctrsv_CLU, ctrsv_CLN, #endif }; #ifndef CBLAS void NAME(char *UPLO, char *TRANS, char *DIAG, blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; blasint n = *N; blasint lda = *LDA; blasint incx = *INCX; blasint info; int uplo; int unit; int trans; FLOAT *buffer; PRINT_DEBUG_NAME; TOUPPER(uplo_arg); TOUPPER(trans_arg); TOUPPER(diag_arg); trans = -1; unit = -1; uplo = -1; if (trans_arg == 'N') trans = 0; if (trans_arg == 'T') trans = 1; if (trans_arg == 'R') trans = 2; if (trans_arg == 'C') trans = 3; if (diag_arg == 'U') unit = 0; if (diag_arg == 'N') unit = 1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incx == 0) info = 8; if (lda < MAX(1, n)) info = 6; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { int trans, uplo, unit; blasint info; FLOAT *buffer; PRINT_DEBUG_CNAME; unit = -1; uplo = -1; trans = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 2; if (TransA == CblasConjTrans) trans = 3; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 8; if (lda < MAX(1, n)) info = 6; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 3; if (TransA == CblasConjTrans) trans = 2; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 8; if (lda < MAX(1, n)) info = 6; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; IDEBUG_START; 
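/* From this point the FORTRAN (NAME) and CBLAS (CNAME) entry points share a
 * single code path: start the profile counters, adjust the x pointer for a
 * negative incx (the factor of 2 accounts for the interleaved real and
 * imaginary parts of each complex element), take one workspace buffer from
 * the BLAS memory pool, and dispatch through the 16-entry trsv table using
 * the same (trans<<2)|(uplo<<1)|unit index as the other level-2 wrappers.
 * Unlike ztrmv above, this wrapper has no SMP branch, so the triangular
 * solve always runs single-threaded. */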
FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx * 2; buffer = (FLOAT *)blas_memory_alloc(1); (trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); IDEBUG_END; return; } OpenBLAS-0.2.20/kernel/000077500000000000000000000000001313527062700144245ustar00rootroot00000000000000OpenBLAS-0.2.20/kernel/CMakeLists.txt000066400000000000000000000731261313527062700171750ustar00rootroot00000000000000 include_directories(${PROJECT_SOURCE_DIR}) include("${PROJECT_SOURCE_DIR}/cmake/kernel.cmake") # Makefile if (DEFINED TARGET_CORE) #override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) set(BUILD_KERNEL 1) set(KDIR "") set(TSUFFIX "_${TARGET_CORE}") else () set(TARGET_CORE ${CORE}) set(KDIR "") set(TSUFFIX "") endif () SetDefaultL1() SetDefaultL2() SetDefaultL3() ParseMakefileVars("${KERNELDIR}/KERNEL") ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}") if (${ARCH} STREQUAL "x86") if (NOT MSVC) GenerateNamedObjects("${KERNELDIR}/cpuid.S" "" "" false "" "" true) else() GenerateNamedObjects("${KERNELDIR}/cpuid_win.c" "" "" false "" "" true) endif() endif () # don't use float type name mangling here GenerateNamedObjects("${KERNELDIR}/${LSAME_KERNEL}" "F_INTERFACE" "lsame" false "" "" true) GenerateNamedObjects("${KERNELDIR}/${SCABS_KERNEL}" "COMPLEX;F_INTERFACE" "scabs1" false "" "" true) GenerateNamedObjects("${KERNELDIR}/${DCABS_KERNEL}" "DOUBLE;COMPLEX;F_INTERFACE" "dcabs1" false "" "" true) # Makefile.L1 foreach (float_type ${FLOAT_TYPES}) # a bit of metaprogramming here to pull out the appropriate KERNEL var string(SUBSTRING ${float_type} 0 1 float_char) GenerateNamedObjects("${KERNELDIR}/${${float_char}AMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false ${float_type}) if (DEFINED ${float_char}MAXKERNEL) GenerateNamedObjects("${KERNELDIR}/${${float_char}MAXKERNEL}" "" "max_k" false "" "" false ${float_type}) endif () if (DEFINED ${float_char}MINKERNEL) GenerateNamedObjects("${KERNELDIR}/${${float_char}MINKERNEL}" "" "min_k" false "" "" false ${float_type}) endif () GenerateNamedObjects("${KERNELDIR}/${I${float_char}AMAXKERNEL}" "USE_ABS" "i*amax_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${I${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k" false "" "" false ${float_type}) if (DEFINED I${float_char}MAXKERNEL) GenerateNamedObjects("${KERNELDIR}/${I${float_char}MAXKERNEL}" "" "i*max_k" false "" "" false ${float_type}) endif () if (DEFINED I${float_char}MINKERNEL) GenerateNamedObjects("${KERNELDIR}/${I${float_char}MINKERNEL}" "" "i*min_k" false "" "" false ${float_type}) endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}ASUMKERNEL}" "" "asum_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "" "axpy_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}COPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}NRM2KERNEL}" "" "nrm2_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "rot_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) 
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}DOTKERNEL}" "" "dotu_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}DOTKERNEL}" "CONJ" "dotc_k" false "" "" false ${float_type}) else () GenerateNamedObjects("${KERNELDIR}/${${float_char}DOTKERNEL}" "" "dot_k" false "" "" false ${float_type}) endif () if (${float_type} STREQUAL "COMPLEX") GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "srot_k" false "" "" false ${float_type}) endif() if (${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "drot_k" false "" "" false ${float_type}) endif() endforeach () #dsdot,sdsdot GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "d*dot_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "dsdot_k" false "" "" false "SINGLE") # Makefile.L2 GenerateCombinationObjects("generic/symv_k.c" "LOWER" "U" "" 1 "" "" 3) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "XCONJ" "gerv_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ;XCONJ" "gerd_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVNKERNEL}" "" "gemv_n" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "TRANSA" "gemv_t" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVNKERNEL}" "CONJ" "gemv_r" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "CONJ;TRANSA" "gemv_c" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVNKERNEL}" "XCONJ" "gemv_o" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "XCONJ;TRANSA" "gemv_u" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVNKERNEL}" "XCONJ;CONJ" "gemv_s" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "XCONJ;CONJ;TRANSA" "gemv_d" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}HEMV_U_KERNEL}" "HEMV" "hemv_U" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}HEMV_L_KERNEL}" "HEMV;LOWER" "hemv_L" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}HEMV_V_KERNEL}" "HEMV;HEMVREV" "hemv_V" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}HEMV_M_KERNEL}" "HEMV;HEMVREV;LOWER" "hemv_M" false "" "" false ${float_type}) else () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVNKERNEL}" "" "gemv_n" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "TRANS" "gemv_t" 
false "" "" false ${float_type}) endif () endforeach () # Makefile.L3 set(USE_TRMM false) if (${ARCH} STREQUAL "arm" OR ${ARCH} STREQUAL "arm64" OR "${TARGET}" STREQUAL "LONGSOON3B" OR "${TARGET}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "{CORE}" STREQUAL "zen") set(USE_TRMM true) endif () foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) if (${float_char}GEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) endif () if (${float_char}GEMMITCOPY) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMITCOPY}" "${float_type}" "${${float_char}GEMMITCOPYOBJ}" false "" "" true ${float_type}) endif () if (${float_char}GEMMONCOPY) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMONCOPY}" "${float_type}" "${${float_char}GEMMONCOPYOBJ}" false "" "" true ${float_type}) endif () if (${float_char}GEMMOTCOPY) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMOTCOPY}" "${float_type}" "${${float_char}GEMMOTCOPYOBJ}" false "" "" true ${float_type}) endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_BETA}" "" "gemm_beta" false "" "" false ${float_type}) if (USE_TRMM) set(TRMM_KERNEL "${${float_char}TRMMKERNEL}") else () set(TRMM_KERNEL "${${float_char}GEMMKERNEL}") endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") # just enumerate all these. there is an extra define for these indicating which side is a conjugate (e.g. CN NC NN) that I don't really want to work into GenerateCombinationObjects GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "NN" "gemm_kernel_n" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "CN" "gemm_kernel_l" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "NC" "gemm_kernel_r" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "CC" "gemm_kernel_b" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;LEFT;NN" "trmm_kernel_LN" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;LEFT;TRANSA;NN" "trmm_kernel_LT" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;LEFT;CONJ;CN" "trmm_kernel_LR" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;LEFT;TRANSA;CONJ;CN" "trmm_kernel_LC" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;NN" "trmm_kernel_RN" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;TRANSA;NN" "trmm_kernel_RT" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;CONJ;NC" "trmm_kernel_RR" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;TRANSA;CONJ;NC" "trmm_kernel_RC" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL;CONJ" "trsm_kernel_LR" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_LT}" "LT;TRSMKERNEL;CONJ" "trsm_kernel_LC" false "" "" false ${float_type}) 
GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL;CONJ" "trsm_kernel_RR" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RT}" "RT;TRSMKERNEL;CONJ" "trsm_kernel_RC" false "" "" false ${float_type}) #hemm GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "hemm_iutcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "hemm_iltcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "hemm_outcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "hemm_oltcopy" false "" "" false ${float_type}) # symm for c and z GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) 
GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) #gemm3m if (USE_GEMM3M) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM3MKERNEL}" "NN" "gemm3m_kernel" false "" "" false ${float_type}) GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA" "gemm3m_oncopyb" false "" "" false ${float_type}) GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;REAL_ONLY" "gemm3m_oncopyr" false "" "" false ${float_type}) GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;IMAGE_ONLY" "gemm3m_oncopyi" false "" "" false ${float_type}) GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA" "gemm3m_otcopyb" false "" "" false ${float_type}) GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;REAL_ONLY" "gemm3m_otcopyr" false "" "" false ${float_type}) GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;IMAGE_ONLY" "gemm3m_otcopyi" false "" "" false ${float_type}) GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY" "gemm3m_incopyb" false "" "" false ${float_type}) GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;REAL_ONLY" "gemm3m_incopyr" false "" "" false ${float_type}) 
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;IMAGE_ONLY" "gemm3m_incopyi" false "" "" false ${float_type}) GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY" "gemm3m_itcopyb" false "" "" false ${float_type}) GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;REAL_ONLY" "gemm3m_itcopyr" false "" "" false ${float_type}) GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;IMAGE_ONLY" "gemm3m_itcopyi" false "" "" false ${float_type}) endif() else () #For real GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type}) # symm for s and d GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) # These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. Also TRANS is not passed in as a define. # Could simplify it a bit by pairing up by -UUNIT/-DUNIT. GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) 
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false ${float_type}) if (NOT DEFINED ${float_char}OMATCOPY_CN) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") set(${float_char}OMATCOPY_CN ../arm/zomatcopy_cn.c) else () set(${float_char}OMATCOPY_CN ../arm/omatcopy_cn.c) endif () endif () if (NOT DEFINED ${float_char}OMATCOPY_RN) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") set(${float_char}OMATCOPY_RN ../arm/zomatcopy_rn.c) else () set(${float_char}OMATCOPY_RN ../arm/omatcopy_rn.c) endif () endif () if (NOT DEFINED ${float_char}OMATCOPY_CT) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") set(${float_char}OMATCOPY_CT ../arm/zomatcopy_ct.c) else () set(${float_char}OMATCOPY_CT ../arm/omatcopy_ct.c) endif () endif () if (NOT 
DEFINED ${float_char}OMATCOPY_RT) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") set(${float_char}OMATCOPY_RT ../arm/zomatcopy_rt.c) else () set(${float_char}OMATCOPY_RT ../arm/omatcopy_rt.c) endif () endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_CN}" "" "omatcopy_k_cn" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_RN}" "ROWM" "omatcopy_k_rn" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_CT}" "" "omatcopy_k_ct" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_RT}" "ROWM" "omatcopy_k_rt" false "" "" false ${float_type}) if (NOT DEFINED ${float_char}OMATCOPY_CNC) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") set(${float_char}OMATCOPY_CNC ../arm/zomatcopy_cnc.c) endif () endif () if (NOT DEFINED ${float_char}OMATCOPY_RNC) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") set(${float_char}OMATCOPY_RNC ../arm/zomatcopy_rnc.c) endif () endif () if (NOT DEFINED ${float_char}OMATCOPY_CTC) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") set(${float_char}OMATCOPY_CTC ../arm/zomatcopy_ctc.c) endif () endif () if (NOT DEFINED ${float_char}OMATCOPY_RTC) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") set(${float_char}OMATCOPY_RTC ../arm/zomatcopy_rtc.c) endif () endif () if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_CNC}" "CONJ" "omatcopy_k_cnc" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_RNC}" "CONJ;ROWM" "omatcopy_k_rnc" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_CTC}" "CONJ" "omatcopy_k_ctc" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}OMATCOPY_RTC}" "CONJ;ROWM" "omatcopy_k_rtc" false "" "" false ${float_type}) endif() #imatcopy if (NOT DEFINED ${float_char}IMATCOPY_CN) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") set(${float_char}IMATCOPY_CN ../generic/zimatcopy_cn.c) else () set(${float_char}IMATCOPY_CN ../generic/imatcopy_cn.c) endif () endif () if (NOT DEFINED ${float_char}IMATCOPY_RN) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") set(${float_char}IMATCOPY_RN ../generic/zimatcopy_rn.c) else () set(${float_char}IMATCOPY_RN ../generic/imatcopy_rn.c) endif () endif () if (NOT DEFINED ${float_char}IMATCOPY_CT) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") set(${float_char}IMATCOPY_CT ../generic/zimatcopy_ct.c) else () set(${float_char}IMATCOPY_CT ../generic/imatcopy_ct.c) endif () endif () if (NOT DEFINED ${float_char}IMATCOPY_RT) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") set(${float_char}IMATCOPY_RT ../generic/zimatcopy_rt.c) else () set(${float_char}IMATCOPY_RT ../generic/imatcopy_rt.c) endif () endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_CN}" "" "imatcopy_k_cn" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_RN}" "ROWM" "imatcopy_k_rn" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_CT}" "" "imatcopy_k_ct" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_RT}" "ROWM" "imatcopy_k_rt" false "" "" false ${float_type}) if (NOT DEFINED ${float_char}IMATCOPY_CNC) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") set(${float_char}IMATCOPY_CNC 
../generic/zimatcopy_cnc.c) endif () endif () if (NOT DEFINED ${float_char}IMATCOPY_RNC) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") set(${float_char}IMATCOPY_RNC ../generic/zimatcopy_rnc.c) endif () endif () if (NOT DEFINED ${float_char}IMATCOPY_CTC) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") set(${float_char}IMATCOPY_CTC ../generic/zimatcopy_ctc.c) endif () endif () if (NOT DEFINED ${float_char}IMATCOPY_RTC) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") set(${float_char}IMATCOPY_RTC ../generic/zimatcopy_rtc.c) endif () endif () if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_CNC}" "CONJ" "imatcopy_k_cnc" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_RNC}" "CONJ;ROWM" "imatcopy_k_rnc" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_CTC}" "CONJ" "imatcopy_k_ctc" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}IMATCOPY_RTC}" "CONJ;ROWM" "imatcopy_k_rtc" false "" "" false ${float_type}) endif() #geadd GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type}) endforeach () # Makefile.LA #DBLASOBJS += dneg_tcopy$(TSUFFIX).$(SUFFIX) dlaswp_ncopy$(TSUFFIX).$(SUFFIX) add_library(kernel OBJECT ${OPENBLAS_SRC}) OpenBLAS-0.2.20/kernel/Makefile000066400000000000000000000065211313527062700160700ustar00rootroot00000000000000ifdef TARGET_CORE TARGET = $(TARGET_CORE) endif TOPDIR = .. include $(TOPDIR)/Makefile.system ifdef TARGET_CORE override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) BUILD_KERNEL = 1 KDIR = TSUFFIX = _$(TARGET_CORE) else TARGET_CORE = $(CORE) KDIR = TSUFFIX = endif -include $(KERNELDIR)/KERNEL.$(TARGET_CORE) include $(KERNELDIR)/KERNEL include Makefile.L1 include Makefile.L2 include Makefile.L3 include Makefile.LA HPLOBJS = \ dgemm_kernel.$(SUFFIX) \ $(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \ $(DGEMMONCOPYOBJ) $(DGEMMOTCOPYOBJ) \ dtrsm_kernel_LN.$(SUFFIX) dtrsm_kernel_LT.$(SUFFIX) \ dtrsm_kernel_RN.$(SUFFIX) dtrsm_kernel_RT.$(SUFFIX) \ daxpy_k.$(SUFFIX) dcopy_k.$(SUFFIX) ddot_k.$(SUFFIX) \ dger_k.$(SUFFIX) dscal_k.$(SUFFIX) idamax_k.$(SUFFIX) \ dgemv_n.$(SUFFIX) dgemv_t.$(SUFFIX) dgemm_beta.$(SUFFIX) \ dtrsm_iunucopy.$(SUFFIX) dtrsm_iunncopy.$(SUFFIX) \ dtrsm_ilnucopy.$(SUFFIX) dtrsm_ilnncopy.$(SUFFIX) \ dtrsm_iutucopy.$(SUFFIX) dtrsm_iutncopy.$(SUFFIX) \ dtrsm_iltucopy.$(SUFFIX) dtrsm_iltncopy.$(SUFFIX) \ dtrsm_ounucopy.$(SUFFIX) dtrsm_ounncopy.$(SUFFIX) \ dtrsm_olnucopy.$(SUFFIX) dtrsm_olnncopy.$(SUFFIX) \ dtrsm_outucopy.$(SUFFIX) dtrsm_outncopy.$(SUFFIX) \ dtrsm_oltucopy.$(SUFFIX) dtrsm_oltncopy.$(SUFFIX) COMMONOBJS += lsame.$(SUFFIX) scabs1.$(SUFFIX) dcabs1.$(SUFFIX) ifeq ($(DYNAMIC_ARCH), 1) SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX) CCOMMON_OPT += -DTS=$(TSUFFIX) endif KERNEL_INTERFACE = ../common_level1.h ../common_level2.h ../common_level3.h ifneq ($(NO_LAPACK), 1) KERNEL_INTERFACE += ../common_lapack.h endif ifeq ($(ARCH), x86) COMMONOBJS += cpuid.$(SUFFIX) endif ifdef EXPRECISION COMMONOBJS += qconjg.$(SUFFIX) qcabs1.$(SUFFIX) endif ifdef QUAD_PRECISION COMMONOBJS += qconjg.$(SUFFIX) qcabs1.$(SUFFIX) endif all : libs scabs1.$(SUFFIX): $(KERNELDIR)/$(SCABS_KERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DF_INTERFACE $< -o $(@F) dcabs1.$(SUFFIX): $(KERNELDIR)/$(DCABS_KERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DF_INTERFACE $< -o $(@F) qcabs1.$(SUFFIX): 
$(KERNELDIR)/$(QCABS_KERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DF_INTERFACE $< -o $(@F) qconjg.$(SUFFIX): $(KERNELDIR)/qconjg.S $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DF_INTERFACE $< -o $(@F) lsame.$(SUFFIX): $(KERNELDIR)/$(LSAME_KERNEL) $(CC) -c $(CFLAGS) -DF_INTERFACE $< -o $(@F) setparam$(TSUFFIX).$(SUFFIX): setparam$(TSUFFIX).c kernel$(TSUFFIX).h $(CC) -c $(CFLAGS) $< -o $@ setparam$(TSUFFIX).c : setparam-ref.c sed 's/TS/$(TSUFFIX)/g' $< > $(@F) kernel$(TSUFFIX).h : $(KERNEL_INTERFACE) sed 's/\ *(/$(TSUFFIX)(/g' $^ > $(@F) cpuid.$(SUFFIX): $(KERNELDIR)/cpuid.S $(CC) -c $(CFLAGS) $< -o $(@F) scabs1.$(PSUFFIX): $(KERNELDIR)/$(SCABS_KERNEL) $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DF_INTERFACE $< -o $(@F) dcabs1.$(PSUFFIX): $(KERNELDIR)/$(DCABS_KERNEL) $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DF_INTERFACE $< -o $(@F) qcabs1.$(PSUFFIX): $(KERNELDIR)/$(QCABS_KERNEL) $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DF_INTERFACE $< -o $(@F) qconjg.$(PSUFFIX): $(KERNELDIR)/qconjg.S $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DF_INTERFACE $< -o $(@F) lsame.$(PSUFFIX): $(KERNELDIR)/$(LSAME_KERNEL) $(CC) -c $(PFLAGS) -DF_INTERFACE $< -o $(@F) cpuid.$(PSUFFIX): $(KERNELDIR)/cpuid.S $(CC) -c $(PFLAGS) $< -o $(@F) #ifdef DYNAMIC_ARCH clean :: @rm -f setparam_*.c kernel_*.h setparam.h kernel.h #endif include $(TOPDIR)/Makefile.tail OpenBLAS-0.2.20/kernel/Makefile.L1000066400000000000000000000574371313527062700163570ustar00rootroot00000000000000### AMAX ### ifndef SAMAXKERNEL SAMAXKERNEL = amax.S endif ifndef DAMAXKERNEL DAMAXKERNEL = amax.S endif ifndef QAMAXKERNEL QAMAXKERNEL = amax.S endif ifndef CAMAXKERNEL CAMAXKERNEL = zamax.S endif ifndef ZAMAXKERNEL ZAMAXKERNEL = zamax.S endif ifndef XAMAXKERNEL XAMAXKERNEL = zamax.S endif ### AMIN ### ifndef SAMINKERNEL SAMINKERNEL = amin.S endif ifndef DAMINKERNEL DAMINKERNEL = amin.S endif ifndef QAMINKERNEL QAMINKERNEL = amin.S endif ifndef CAMINKERNEL CAMINKERNEL = zamin.S endif ifndef ZAMINKERNEL ZAMINKERNEL = zamin.S endif ifndef XAMINKERNEL XAMINKERNEL = zamin.S endif ### MAX ### ifndef SMAXKERNEL SMAXKERNEL = max.S endif ifndef DMAXKERNEL DMAXKERNEL = max.S endif ifndef QMAXKERNEL QMAXKERNEL = max.S endif ### MIN ### ifndef SMINKERNEL SMINKERNEL = min.S endif ifndef DMINKERNEL DMINKERNEL = min.S endif ifndef QMINKERNEL QMINKERNEL = min.S endif ### IAMAX ### ifndef ISAMAXKERNEL ISAMAXKERNEL = iamax.S endif ifndef IDAMAXKERNEL IDAMAXKERNEL = iamax.S endif ifndef IQAMAXKERNEL IQAMAXKERNEL = iamax.S endif ifndef ICAMAXKERNEL ICAMAXKERNEL = izamax.S endif ifndef IZAMAXKERNEL IZAMAXKERNEL = izamax.S endif ifndef IXAMAXKERNEL IXAMAXKERNEL = izamax.S endif ### IAMIN ### ifndef ISAMINKERNEL ISAMINKERNEL = iamin.S endif ifndef IDAMINKERNEL IDAMINKERNEL = iamin.S endif ifndef IQAMINKERNEL IQAMINKERNEL = iamin.S endif ifndef ICAMINKERNEL ICAMINKERNEL = izamin.S endif ifndef IZAMINKERNEL IZAMINKERNEL = izamin.S endif ifndef IXAMINKERNEL IXAMINKERNEL = izamin.S endif ### IMAX ### ifndef ISMAXKERNEL ISMAXKERNEL = iamax.S endif ifndef IDMAXKERNEL IDMAXKERNEL = iamax.S endif ifndef IQMAXKERNEL IQMAXKERNEL = iamax.S endif ### IMIN ### ifndef ISMINKERNEL ISMINKERNEL = iamin.S endif ifndef IDMINKERNEL IDMINKERNEL = iamin.S endif ifndef IQMINKERNEL IQMINKERNEL = iamin.S endif ### ASUM ### ifndef SASUMKERNEL SASUMKERNEL = asum.S endif ifndef DASUMKERNEL DASUMKERNEL = asum.S endif ifndef CASUMKERNEL CASUMKERNEL = zasum.S endif ifndef ZASUMKERNEL ZASUMKERNEL = zasum.S endif ifndef QASUMKERNEL QASUMKERNEL = asum.S endif ifndef XASUMKERNEL XASUMKERNEL = zasum.S endif ### AXPY ### ifndef 
SAXPYKERNEL SAXPYKERNEL = axpy.S endif ifndef DAXPYKERNEL DAXPYKERNEL = axpy.S endif ifndef CAXPYKERNEL CAXPYKERNEL = zaxpy.S endif ifndef ZAXPYKERNEL ZAXPYKERNEL = zaxpy.S endif ifndef QAXPYKERNEL QAXPYKERNEL = axpy.S endif ifndef XAXPYKERNEL XAXPYKERNEL = zaxpy.S endif ### COPY ### ifndef SCOPYKERNEL SCOPYKERNEL = copy.S endif ifndef DCOPYKERNEL DCOPYKERNEL = copy.S endif ifndef CCOPYKERNEL CCOPYKERNEL = zcopy.S endif ifndef ZCOPYKERNEL ZCOPYKERNEL = zcopy.S endif ifndef QCOPYKERNEL QCOPYKERNEL = copy.S endif ifndef XCOPYKERNEL XCOPYKERNEL = zcopy.S endif ### DOT ### ifndef SDOTKERNEL SDOTKERNEL = dot.S endif ifndef DDOTKERNEL DDOTKERNEL = dot.S endif ifndef CDOTKERNEL CDOTKERNEL = zdot.S endif ifndef ZDOTKERNEL ZDOTKERNEL = zdot.S endif ifndef QDOTKERNEL QDOTKERNEL = dot.S endif ifndef XDOTKERNEL XDOTKERNEL = zdot.S endif ### NRM2 ### ifndef SNRM2KERNEL SNRM2KERNEL = nrm2.S endif ifndef DNRM2KERNEL DNRM2KERNEL = nrm2.S endif ifndef QNRM2KERNEL QNRM2KERNEL = nrm2.S endif ifndef CNRM2KERNEL CNRM2KERNEL = znrm2.S endif ifndef ZNRM2KERNEL ZNRM2KERNEL = znrm2.S endif ifndef XNRM2KERNEL XNRM2KERNEL = znrm2.S endif ### ROT ### ifndef SROTKERNEL SROTKERNEL = rot.S endif ifndef DROTKERNEL DROTKERNEL = rot.S endif ifndef QROTKERNEL QROTKERNEL = rot.S endif ifndef CROTKERNEL CROTKERNEL = zrot.S endif ifndef ZROTKERNEL ZROTKERNEL = zrot.S endif ifndef XROTKERNEL XROTKERNEL = zrot.S endif ### SCAL ### ifndef SSCALKERNEL SSCALKERNEL = scal.S endif ifndef DSCALKERNEL DSCALKERNEL = scal.S endif ifndef CSCALKERNEL CSCALKERNEL = zscal.S endif ifndef ZSCALKERNEL ZSCALKERNEL = zscal.S endif ifndef QSCALKERNEL QSCALKERNEL = scal.S endif ifndef XSCALKERNEL XSCALKERNEL = zscal.S endif ### SWAP ### ifndef SSWAPKERNEL SSWAPKERNEL = swap.S endif ifndef DSWAPKERNEL DSWAPKERNEL = swap.S endif ifndef CSWAPKERNEL CSWAPKERNEL = zswap.S endif ifndef ZSWAPKERNEL ZSWAPKERNEL = zswap.S endif ifndef QSWAPKERNEL QSWAPKERNEL = swap.S endif ifndef XSWAPKERNEL XSWAPKERNEL = zswap.S endif ### GEMV ### ifndef SGEMVNKERNEL SGEMVNKERNEL = gemv_n.S endif ifndef SGEMVTKERNEL SGEMVTKERNEL = gemv_t.S endif ifndef DGEMVNKERNEL DGEMVNKERNEL = gemv_n.S endif ifndef DGEMVTKERNEL DGEMVTKERNEL = gemv_t.S endif ifndef CGEMVNKERNEL CGEMVNKERNEL = zgemv_n.S endif ifndef CGEMVTKERNEL CGEMVTKERNEL = zgemv_t.S endif ifndef ZGEMVNKERNEL ZGEMVNKERNEL = zgemv_n.S endif ifndef ZGEMVTKERNEL ZGEMVTKERNEL = zgemv_t.S endif ifndef QGEMVNKERNEL QGEMVNKERNEL = gemv_n.S endif ifndef QGEMVTKERNEL QGEMVTKERNEL = gemv_t.S endif ifndef XGEMVNKERNEL XGEMVNKERNEL = zgemv_n.S endif ifndef XGEMVTKERNEL XGEMVTKERNEL = zgemv_t.S endif ifndef SCABS_KERNEL SCABS_KERNEL = cabs.S endif ifndef DCABS_KERNEL DCABS_KERNEL = cabs.S endif ifndef QCABS_KERNEL QCABS_KERNEL = cabs.S endif ifndef LSAME_KERNEL LSAME_KERNEL = lsame.S endif ### AXPBY ### ifndef SAXPBYKERNEL SAXPBYKERNEL = ../arm/axpby.c endif ifndef DAXPBYKERNEL DAXPBYKERNEL = ../arm/axpby.c endif ifndef CAXPBYKERNEL CAXPBYKERNEL = ../arm/zaxpby.c endif ifndef ZAXPBYKERNEL ZAXPBYKERNEL = ../arm/zaxpby.c endif SBLASOBJS += \ samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \ isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \ sasum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) 
sswap_k$(TSUFFIX).$(SUFFIX) \ saxpby_k$(TSUFFIX).$(SUFFIX) DBLASOBJS += \ damax_k$(TSUFFIX).$(SUFFIX) damin_k$(TSUFFIX).$(SUFFIX) dmax_k$(TSUFFIX).$(SUFFIX) dmin_k$(TSUFFIX).$(SUFFIX) \ idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ daxpby_k$(TSUFFIX).$(SUFFIX) QBLASOBJS += \ qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) CBLASOBJS += \ camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \ cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \ cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) ZBLASOBJS += \ zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \ zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \ zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \ zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) XBLASOBJS += \ xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \ xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \ xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \ xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) ### AMAX ### $(KDIR)samax_k$(TSUFFIX).$(SUFFIX) $(KDIR)samax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ $(KDIR)damax_k$(TSUFFIX).$(SUFFIX) $(KDIR)damax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ $(KDIR)qamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)qamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ $(KDIR)camax_k$(TSUFFIX).$(SUFFIX) $(KDIR)camax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAMAXKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ $(KDIR)zamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)zamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAMAXKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ $(KDIR)xamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)xamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAMAXKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ ### AMIN ### $(KDIR)samin_k$(TSUFFIX).$(SUFFIX) $(KDIR)samin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ 
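# The rules in this file all follow one scheme: the selected kernel source is
# compiled with -UCOMPLEX/-DCOMPLEX plus -UDOUBLE/-DDOUBLE/-DXDOUBLE to pick
# the s/d/q (or c/z/x) precision, while -DUSE_ABS and -DUSE_MIN let a single
# generic comparison kernel provide the max, min, amax and amin variants.
# The i*-prefixed rules further down reuse the identical define scheme for
# the index-returning (iamax/iamin/imax/imin) versions.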
$(KDIR)damin_k$(TSUFFIX).$(SUFFIX) $(KDIR)damin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ $(KDIR)qamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)qamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ $(KDIR)camin_k$(TSUFFIX).$(SUFFIX) $(KDIR)camin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAMINKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ $(KDIR)zamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)zamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAMINKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ $(KDIR)xamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)xamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAMINKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ ### MAX ### $(KDIR)smax_k$(TSUFFIX).$(SUFFIX) $(KDIR)smax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ $(KDIR)dmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)dmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ $(KDIR)qmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)qmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ ### MIN ### $(KDIR)smin_k$(TSUFFIX).$(SUFFIX) $(KDIR)smin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ $(KDIR)dmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)dmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ $(KDIR)qmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)qmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ ### IAMAX ### $(KDIR)isamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)isamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISAMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ $(KDIR)idamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)idamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDAMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ $(KDIR)iqamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQAMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ $(KDIR)icamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)icamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ICAMAXKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ $(KDIR)izamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)izamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IZAMAXKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ $(KDIR)ixamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)ixamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IXAMAXKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ ### IAMIN ### $(KDIR)isamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)isamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISAMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ $(KDIR)idamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDAMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ $(KDIR)iqamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQAMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ $(KDIR)icamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)icamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ICAMINKERNEL) 
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ $(KDIR)izamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)izamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IZAMINKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ $(KDIR)ixamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)ixamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IXAMINKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ ### IMAX ### $(KDIR)ismax_k$(TSUFFIX).$(SUFFIX) $(KDIR)ismax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ $(KDIR)idmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ $(KDIR)iqmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ ### IMIN ### $(KDIR)ismin_k$(TSUFFIX).$(SUFFIX) $(KDIR)ismin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ $(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ $(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ $(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ $(KDIR)dasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DASUMKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ $(KDIR)qasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QASUMKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ $(KDIR)casum_k$(TSUFFIX).$(SUFFIX) $(KDIR)casum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CASUMKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@ $(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZASUMKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@ $(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ $(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ $(KDIR)daxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPYKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ $(KDIR)qaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAXPYKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ $(KDIR)caxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@ $(KDIR)zaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DDOUBLE $< -o $@ $(KDIR)xaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DXDOUBLE $< -o $@ $(KDIR)caxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -UDOUBLE $< -o $@ $(KDIR)zaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) $(CC) -c 
$(CFLAGS) -DCOMPLEX -DCONJ -DDOUBLE $< -o $@ $(KDIR)xaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -DXDOUBLE $< -o $@ $(KDIR)scopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)scopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOPYKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DC_INTERFACE $< -o $@ $(KDIR)dcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)dcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOPYKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DC_INTERFACE $< -o $@ $(KDIR)qcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QCOPYKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DC_INTERFACE $< -o $@ $(KDIR)ccopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)ccopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOPYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DC_INTERFACE $< -o $@ $(KDIR)zcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOPYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DC_INTERFACE $< -o $@ $(KDIR)xcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XCOPYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DC_INTERFACE $< -o $@ $(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ $(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ $(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ ifdef DSDOTKERNEL $(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@ $(KDIR)sdsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@ else $(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@ $(KDIR)sdsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@ endif $(KDIR)zdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZDOTKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UCONJ $< -o $@ $(KDIR)zdotc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdotc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZDOTKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCONJ $< -o $@ $(KDIR)xdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)xdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XDOTKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UCONJ $< -o $@ $(KDIR)xdotc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xdotc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XDOTKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCONJ $< -o $@ $(KDIR)cdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)cdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CDOTKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UCONJ $< -o $@ $(KDIR)cdotc_k$(TSUFFIX).$(SUFFIX) $(KDIR)cdotc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CDOTKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCONJ $< -o $@ $(KDIR)snrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)snrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SNRM2KERNEL) $(CC) $(CFLAGS) -UCOMPLEX -c -UDOUBLE $< -o $@ $(KDIR)dnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)dnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DNRM2KERNEL) $(CC) $(CFLAGS) -UCOMPLEX -c -DDOUBLE $< -o $@ $(KDIR)qnrm2_k$(TSUFFIX).$(SUFFIX) 
$(KDIR)qnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QNRM2KERNEL) $(CC) $(CFLAGS) -UCOMPLEX -c -DXDOUBLE $< -o $@ $(KDIR)cnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)cnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CNRM2KERNEL) $(CC) $(CFLAGS) -DCOMPLEX -c -UDOUBLE $< -o $@ $(KDIR)znrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)znrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZNRM2KERNEL) $(CC) $(CFLAGS) -DCOMPLEX -c -DDOUBLE $< -o $@ $(KDIR)xnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)xnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XNRM2KERNEL) $(CC) $(CFLAGS) -DCOMPLEX -c -DXDOUBLE $< -o $@ $(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ $(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ $(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ $(KDIR)csrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)csrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CROTKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UDOUBLE $< -o $@ $(KDIR)zdrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZROTKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DDOUBLE $< -o $@ $(KDIR)xqrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)xqrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XROTKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DXDOUBLE $< -o $@ $(KDIR)sscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)sscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSCALKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ $(KDIR)dscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)dscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSCALKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ $(KDIR)qscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)qscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSCALKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ $(KDIR)cscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)cscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSCALKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@ $(KDIR)zscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)zscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSCALKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@ $(KDIR)xscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)xscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSCALKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ $(KDIR)sswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)sswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSWAPKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ $(KDIR)dswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)dswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSWAPKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ $(KDIR)qswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)qswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSWAPKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ $(KDIR)cswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)cswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSWAPKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@ $(KDIR)zswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)zswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSWAPKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@ $(KDIR)xswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)xswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSWAPKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ $(KDIR)saxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPBYKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ $(KDIR)daxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpby_k$(TPSUFFIX).$(PSUFFIX) : 
$(KERNELDIR)/$(DAXPBYKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ $(KDIR)caxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPBYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -UDOUBLE $< -o $@ $(KDIR)zaxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPBYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DDOUBLE $< -o $@ OpenBLAS-0.2.20/kernel/Makefile.L2000066400000000000000000000421511313527062700163430ustar00rootroot00000000000000### GEMV ### ifndef SGEMVNKERNEL SGEMVNKERNEL = gemv_n.S endif ifndef SGEMVTKERNEL SGEMVTKERNEL = gemv_t.S endif ifndef DGEMVNKERNEL DGEMVNKERNEL = gemv_n.S endif ifndef DGEMVTKERNEL DGEMVTKERNEL = gemv_t.S endif ifndef CGEMVNKERNEL CGEMVNKERNEL = zgemv_n.S endif ifndef CGEMVTKERNEL CGEMVTKERNEL = zgemv_t.S endif ifndef ZGEMVNKERNEL ZGEMVNKERNEL = zgemv_n.S endif ifndef ZGEMVTKERNEL ZGEMVTKERNEL = zgemv_t.S endif ifndef QGEMVNKERNEL QGEMVNKERNEL = gemv_n.S endif ifndef QGEMVTKERNEL QGEMVTKERNEL = gemv_t.S endif ifndef XGEMVNKERNEL XGEMVNKERNEL = zgemv_n.S endif ifndef XGEMVTKERNEL XGEMVTKERNEL = zgemv_t.S endif ### GER ### ifndef SGERKERNEL SGERKERNEL = ../generic/ger.c endif ifndef DGERKERNEL DGERKERNEL = ../generic/ger.c endif ifndef QGERKERNEL QGERKERNEL = ../generic/ger.c endif ifndef CGERUKERNEL CGERUKERNEL = ../generic/zger.c endif ifndef CGERCKERNEL CGERCKERNEL = ../generic/zger.c endif ifndef ZGERUKERNEL ZGERUKERNEL = ../generic/zger.c endif ifndef ZGERCKERNEL ZGERCKERNEL = ../generic/zger.c endif ifndef XGERUKERNEL XGERUKERNEL = ../generic/zger.c endif ifndef XGERCKERNEL XGERCKERNEL = ../generic/zger.c endif ### SYMV ### ifndef SSYMV_U_KERNEL SSYMV_U_KERNEL = ../generic/symv_k.c endif ifndef SSYMV_L_KERNEL SSYMV_L_KERNEL = ../generic/symv_k.c endif ifndef DSYMV_U_KERNEL DSYMV_U_KERNEL = ../generic/symv_k.c endif ifndef DSYMV_L_KERNEL DSYMV_L_KERNEL = ../generic/symv_k.c endif ifndef QSYMV_U_KERNEL QSYMV_U_KERNEL = ../generic/symv_k.c endif ifndef QSYMV_L_KERNEL QSYMV_L_KERNEL = ../generic/symv_k.c endif ifndef CSYMV_U_KERNEL CSYMV_U_KERNEL = ../generic/zsymv_k.c endif ifndef CSYMV_L_KERNEL CSYMV_L_KERNEL = ../generic/zsymv_k.c endif ifndef ZSYMV_U_KERNEL ZSYMV_U_KERNEL = ../generic/zsymv_k.c endif ifndef ZSYMV_L_KERNEL ZSYMV_L_KERNEL = ../generic/zsymv_k.c endif ifndef XSYMV_U_KERNEL XSYMV_U_KERNEL = ../generic/zsymv_k.c endif ifndef XSYMV_L_KERNEL XSYMV_L_KERNEL = ../generic/zsymv_k.c endif ### HEMV ### ifndef CHEMV_U_KERNEL CHEMV_U_KERNEL = ../generic/zhemv_k.c endif ifndef CHEMV_L_KERNEL CHEMV_L_KERNEL = ../generic/zhemv_k.c endif ifndef CHEMV_V_KERNEL CHEMV_V_KERNEL = ../generic/zhemv_k.c endif ifndef CHEMV_M_KERNEL CHEMV_M_KERNEL = ../generic/zhemv_k.c endif ifndef ZHEMV_U_KERNEL ZHEMV_U_KERNEL = ../generic/zhemv_k.c endif ifndef ZHEMV_L_KERNEL ZHEMV_L_KERNEL = ../generic/zhemv_k.c endif ifndef ZHEMV_V_KERNEL ZHEMV_V_KERNEL = ../generic/zhemv_k.c endif ifndef ZHEMV_M_KERNEL ZHEMV_M_KERNEL = ../generic/zhemv_k.c endif ifndef XHEMV_U_KERNEL XHEMV_U_KERNEL = ../generic/zhemv_k.c endif ifndef XHEMV_L_KERNEL XHEMV_L_KERNEL = ../generic/zhemv_k.c endif ifndef XHEMV_V_KERNEL XHEMV_V_KERNEL = ../generic/zhemv_k.c endif ifndef XHEMV_M_KERNEL XHEMV_M_KERNEL = ../generic/zhemv_k.c endif SBLASOBJS += \ sgemv_n$(TSUFFIX).$(SUFFIX) sgemv_t$(TSUFFIX).$(SUFFIX) ssymv_U$(TSUFFIX).$(SUFFIX) ssymv_L$(TSUFFIX).$(SUFFIX) \ sger_k$(TSUFFIX).$(SUFFIX) DBLASOBJS += \ dgemv_n$(TSUFFIX).$(SUFFIX) dgemv_t$(TSUFFIX).$(SUFFIX) dsymv_U$(TSUFFIX).$(SUFFIX) dsymv_L$(TSUFFIX).$(SUFFIX) \ 
dger_k$(TSUFFIX).$(SUFFIX) QBLASOBJS += \ qgemv_n$(TSUFFIX).$(SUFFIX) qgemv_t$(TSUFFIX).$(SUFFIX) qsymv_U$(TSUFFIX).$(SUFFIX) qsymv_L$(TSUFFIX).$(SUFFIX) \ qger_k$(TSUFFIX).$(SUFFIX) CBLASOBJS += \ cgemv_n$(TSUFFIX).$(SUFFIX) cgemv_t$(TSUFFIX).$(SUFFIX) cgemv_r$(TSUFFIX).$(SUFFIX) cgemv_c$(TSUFFIX).$(SUFFIX) \ cgemv_o$(TSUFFIX).$(SUFFIX) cgemv_u$(TSUFFIX).$(SUFFIX) cgemv_s$(TSUFFIX).$(SUFFIX) cgemv_d$(TSUFFIX).$(SUFFIX) \ csymv_U$(TSUFFIX).$(SUFFIX) csymv_L$(TSUFFIX).$(SUFFIX) \ chemv_U$(TSUFFIX).$(SUFFIX) chemv_L$(TSUFFIX).$(SUFFIX) chemv_V$(TSUFFIX).$(SUFFIX) chemv_M$(TSUFFIX).$(SUFFIX) \ cgeru_k$(TSUFFIX).$(SUFFIX) cgerc_k$(TSUFFIX).$(SUFFIX) cgerv_k$(TSUFFIX).$(SUFFIX) cgerd_k$(TSUFFIX).$(SUFFIX) ZBLASOBJS += \ zgemv_n$(TSUFFIX).$(SUFFIX) zgemv_t$(TSUFFIX).$(SUFFIX) zgemv_r$(TSUFFIX).$(SUFFIX) zgemv_c$(TSUFFIX).$(SUFFIX) \ zgemv_o$(TSUFFIX).$(SUFFIX) zgemv_u$(TSUFFIX).$(SUFFIX) zgemv_s$(TSUFFIX).$(SUFFIX) zgemv_d$(TSUFFIX).$(SUFFIX) \ zsymv_U$(TSUFFIX).$(SUFFIX) zsymv_L$(TSUFFIX).$(SUFFIX) \ zhemv_U$(TSUFFIX).$(SUFFIX) zhemv_L$(TSUFFIX).$(SUFFIX) zhemv_V$(TSUFFIX).$(SUFFIX) zhemv_M$(TSUFFIX).$(SUFFIX) \ zgeru_k$(TSUFFIX).$(SUFFIX) zgerc_k$(TSUFFIX).$(SUFFIX) zgerv_k$(TSUFFIX).$(SUFFIX) zgerd_k$(TSUFFIX).$(SUFFIX) XBLASOBJS += \ xgemv_n$(TSUFFIX).$(SUFFIX) xgemv_t$(TSUFFIX).$(SUFFIX) xgemv_r$(TSUFFIX).$(SUFFIX) xgemv_c$(TSUFFIX).$(SUFFIX) \ xgemv_o$(TSUFFIX).$(SUFFIX) xgemv_u$(TSUFFIX).$(SUFFIX) xgemv_s$(TSUFFIX).$(SUFFIX) xgemv_d$(TSUFFIX).$(SUFFIX) \ xsymv_U$(TSUFFIX).$(SUFFIX) xsymv_L$(TSUFFIX).$(SUFFIX) \ xhemv_U$(TSUFFIX).$(SUFFIX) xhemv_L$(TSUFFIX).$(SUFFIX) xhemv_V$(TSUFFIX).$(SUFFIX) xhemv_M$(TSUFFIX).$(SUFFIX) \ xgeru_k$(TSUFFIX).$(SUFFIX) xgerc_k$(TSUFFIX).$(SUFFIX) xgerv_k$(TSUFFIX).$(SUFFIX) xgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -UTRANS $< -o $@ $(KDIR)sgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DTRANS $< -o $@ $(KDIR)dgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -UTRANS $< -o $@ $(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@ $(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -UTRANS $< -o $@ $(KDIR)qgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVTKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DTRANS $< -o $@ $(KDIR)cgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@ $(KDIR)cgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -UCONJ -UXCONJ $< -o $@ $(KDIR)cgemv_r$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -DCONJ -UXCONJ $< -o $@ $(KDIR)cgemv_c$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_c$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h 
$(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -DCONJ -UXCONJ $< -o $@ $(KDIR)cgemv_o$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_o$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -UCONJ -DXCONJ $< -o $@ $(KDIR)cgemv_u$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_u$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -UCONJ -DXCONJ $< -o $@ $(KDIR)cgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -DCONJ -DXCONJ $< -o $@ $(KDIR)cgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@ $(KDIR)zgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@ $(KDIR)zgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -UCONJ -UXCONJ $< -o $@ $(KDIR)zgemv_r$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -DCONJ -UXCONJ $< -o $@ $(KDIR)zgemv_c$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_c$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -DCONJ -UXCONJ $< -o $@ $(KDIR)zgemv_o$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_o$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -UCONJ -DXCONJ $< -o $@ $(KDIR)zgemv_u$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_u$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -UCONJ -DXCONJ $< -o $@ $(KDIR)zgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -DCONJ -DXCONJ $< -o $@ $(KDIR)zgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@ $(KDIR)xgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@ $(KDIR)xgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -UCONJ -UXCONJ $< -o $@ $(KDIR)xgemv_r$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -DCONJ -UXCONJ $< -o $@ $(KDIR)xgemv_c$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_c$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -DCONJ -UXCONJ $< -o $@ $(KDIR)xgemv_o$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_o$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -UCONJ -DXCONJ $< -o $@ $(KDIR)xgemv_u$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_u$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -UCONJ -DXCONJ $< -o $@ 
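# All eight complex GEMV objects per precision come from just two kernel
# sources: _n/_r/_o/_s are built from the GEMVN kernel (-UTRANS) and
# _t/_c/_u/_d from the GEMVT kernel (-DTRANS), with the conjugation flags
# varying as  _n/_t: -UCONJ -UXCONJ,  _r/_c: -DCONJ -UXCONJ,
#             _o/_u: -UCONJ -DXCONJ,  _s/_d: -DCONJ -DXCONJ.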
$(KDIR)xgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -DCONJ -DXCONJ $< -o $@ $(KDIR)xgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@ $(KDIR)ssymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_U_KERNEL) $(SSYMV_U_PARAM) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $@ $(KDIR)ssymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_L_KERNEL) $(SSYMV_L_PARAM) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $@ $(KDIR)dsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)dsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSYMV_U_KERNEL) $(DSYMV_U_PARAM) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $@ $(KDIR)dsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)dsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSYMV_L_KERNEL) $(DSYMV_L_PARAM) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $@ $(KDIR)qsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSYMV_U_KERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $@ $(KDIR)qsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSYMV_L_KERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $@ $(KDIR)csymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)csymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSYMV_U_KERNEL) $(CSYMV_U_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $@ $(KDIR)csymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)csymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSYMV_L_KERNEL) $(CSYMV_L_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $@ $(KDIR)zsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)zsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSYMV_U_KERNEL) $(ZSYMV_U_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $@ $(KDIR)zsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)zsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSYMV_L_KERNEL) $(ZSYMV_L_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $@ $(KDIR)xsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSYMV_U_KERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $@ $(KDIR)xsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSYMV_L_KERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $@ $(KDIR)sger_k$(TSUFFIX).$(SUFFIX) $(KDIR)sger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGERKERNEL) $(SGERPARAM) $(CC) -c $(CFLAGS) -UDOUBLE $< -o $@ $(KDIR)dger_k$(TSUFFIX).$(SUFFIX) $(KDIR)dger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGERKERNEL) $(DGERPARAM) $(CC) -c $(CFLAGS) -DDOUBLE $< -o $@ $(KDIR)qger_k$(TSUFFIX).$(SUFFIX) $(KDIR)qger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGERKERNEL) $(QGERPARAM) $(CC) -c $(CFLAGS) -DXDOUBLE $< -o $@ $(KDIR)cgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERUKERNEL) $(CGERPARAM) $(CC) -c $(CFLAGS) -UDOUBLE -UCONJ $< -o $@ $(KDIR)cgerc_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerc_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERCKERNEL) $(CGERPARAM) $(CC) -c $(CFLAGS) -UDOUBLE -DCONJ $< -o $@ $(KDIR)cgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerv_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERUKERNEL) $(CGERPARAM) $(CC) -c $(CFLAGS) -UDOUBLE -UCONJ -DXCONJ $< -o $@ $(KDIR)cgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERCKERNEL) $(CGERPARAM) $(CC) -c $(CFLAGS) -UDOUBLE -DCONJ -DXCONJ $< -o $@ $(KDIR)zgeru_k$(TSUFFIX).$(SUFFIX) 
$(KDIR)zgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERUKERNEL) $(ZGERPARAM) $(CC) -c $(CFLAGS) -DDOUBLE -UCONJ $< -o $@ $(KDIR)zgerc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerc_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERCKERNEL) $(ZGERPARAM) $(CC) -c $(CFLAGS) -DDOUBLE -DCONJ $< -o $@ $(KDIR)zgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerv_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERUKERNEL) $(ZGERPARAM) $(CC) -c $(CFLAGS) -DDOUBLE -UCONJ -DXCONJ $< -o $@ $(KDIR)zgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERCKERNEL) $(ZGERPARAM) $(CC) -c $(CFLAGS) -DDOUBLE -DCONJ -DXCONJ $< -o $@ $(KDIR)xgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERUKERNEL) $(XGERPARAM) $(CC) -c $(CFLAGS) -DXDOUBLE -UCONJ $< -o $@ $(KDIR)xgerc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerc_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM) $(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ $< -o $@ $(KDIR)xgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerv_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERUKERNEL) $(XGERPARAM) $(CC) -c $(CFLAGS) -DXDOUBLE -UCONJ -DXCONJ $< -o $@ $(KDIR)xgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM) $(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ -DXCONJ $< -o $@ $(KDIR)chemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_U_KERNEL) $(CHEMV_U_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $@ $(KDIR)chemv_L$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_L_KERNEL) $(CHEMV_L_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV $< -o $@ $(KDIR)chemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_V$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_V_KERNEL) $(CHEMV_U_PARAM) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV -DHEMVREV $< -o $@ $(KDIR)chemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_M_KERNEL) $(CHEMV_L_PARAM) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ $(KDIR)zhemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_U_KERNEL) $(ZHEMV_U_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $@ $(KDIR)zhemv_L$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_L_KERNEL) $(ZHEMV_L_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV $< -o $@ $(KDIR)zhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_V$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_V_KERNEL) $(ZHEMV_U_PARAM) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV -DHEMVREV $< -o $@ $(KDIR)zhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_M_KERNEL) $(ZHEMV_L_PARAM) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ $(KDIR)xhemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_U_KERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $@ $(KDIR)xhemv_L$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_L_KERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV $< -o $@ $(KDIR)xhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_V$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_V_KERNEL) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV -DHEMVREV $< -o $@ $(KDIR)xhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_M_KERNEL) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ 
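# SYMV/HEMV objects follow the same pattern: _U/_L select -ULOWER/-DLOWER, and
# the hermitian _V/_M variants additionally define -DHEMVREV on top of the
# corresponding _U/_L flags.  The ifndef defaults at the top of this file fall
# back to the generic C sources in ../generic/ whenever no kernel-specific
# implementation has been named.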
OpenBLAS-0.2.20/kernel/Makefile.L3000066400000000000000000005700571313527062700163570ustar00rootroot00000000000000USE_GEMM3M = 0 ifeq ($(ARCH), x86) USE_GEMM3M = 1 endif ifeq ($(ARCH), x86_64) USE_GEMM3M = 1 endif ifeq ($(ARCH), ia64) USE_GEMM3M = 1 endif ifeq ($(ARCH), arm) USE_TRMM = 1 endif ifeq ($(ARCH), arm64) USE_TRMM = 1 endif ifeq ($(TARGET), LOONGSON3B) USE_TRMM = 1 endif ifeq ($(TARGET), GENERIC) USE_TRMM = 1 endif ifeq ($(CORE), HASWELL) USE_TRMM = 1 endif ifeq ($(CORE), ZEN) USE_TRMM = 1 endif ifeq ($(CORE), POWER8) USE_TRMM = 1 endif ifeq ($(CORE), Z13) USE_TRMM = 1 endif SKERNELOBJS += \ sgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ $(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) DKERNELOBJS += \ dgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \ $(DGEMMONCOPYOBJ) $(DGEMMOTCOPYOBJ) QKERNELOBJS += \ qgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(QGEMMINCOPYOBJ) $(QGEMMITCOPYOBJ) \ $(QGEMMONCOPYOBJ) $(QGEMMOTCOPYOBJ) CKERNELOBJS += \ cgemm_kernel_n$(TSUFFIX).$(SUFFIX) cgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ cgemm_kernel_l$(TSUFFIX).$(SUFFIX) cgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ $(CGEMMINCOPYOBJ) $(CGEMMITCOPYOBJ) \ $(CGEMMONCOPYOBJ) $(CGEMMOTCOPYOBJ) ZKERNELOBJS += \ zgemm_kernel_n$(TSUFFIX).$(SUFFIX) zgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ zgemm_kernel_l$(TSUFFIX).$(SUFFIX) zgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ $(ZGEMMINCOPYOBJ) $(ZGEMMITCOPYOBJ) \ $(ZGEMMONCOPYOBJ) $(ZGEMMOTCOPYOBJ) XKERNELOBJS += \ xgemm_kernel_n$(TSUFFIX).$(SUFFIX) xgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ xgemm_kernel_l$(TSUFFIX).$(SUFFIX) xgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ $(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \ $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) SBLASOBJS += $(SKERNELOBJS) DBLASOBJS += $(DKERNELOBJS) QBLASOBJS += $(QKERNELOBJS) CBLASOBJS += $(CKERNELOBJS) ZBLASOBJS += $(ZKERNELOBJS) XBLASOBJS += $(XKERNELOBJS) SBLASOBJS += \ sgemm_beta$(TSUFFIX).$(SUFFIX) \ strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ strmm_kernel_RN$(TSUFFIX).$(SUFFIX) strmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ strsm_kernel_LN$(TSUFFIX).$(SUFFIX) strsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ strsm_kernel_RN$(TSUFFIX).$(SUFFIX) strsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ DBLASOBJS += \ dgemm_beta$(TSUFFIX).$(SUFFIX) \ dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ QBLASOBJS += \ qgemm_beta$(TSUFFIX).$(SUFFIX) \ qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ CBLASOBJS += \ cgemm_beta$(TSUFFIX).$(SUFFIX) \ ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \ ctrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ ctrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ ctrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX) \ ZBLASOBJS += \ 
zgemm_beta$(TSUFFIX).$(SUFFIX) \ ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \ ztrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ ztrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ ztrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX) \ XBLASOBJS += \ xgemm_beta$(TSUFFIX).$(SUFFIX) \ xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \ xtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ xtrsm_kernel_LR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ xtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) \ ifeq ($(USE_GEMM3M), 1) CBLASOBJS += cgemm3m_kernel$(TSUFFIX).$(SUFFIX) ZBLASOBJS += zgemm3m_kernel$(TSUFFIX).$(SUFFIX) XBLASOBJS += xgemm3m_kernel$(TSUFFIX).$(SUFFIX) endif SBLASOBJS += \ strmm_iunucopy$(TSUFFIX).$(SUFFIX) strmm_iunncopy$(TSUFFIX).$(SUFFIX) \ strmm_ilnucopy$(TSUFFIX).$(SUFFIX) strmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ strmm_iutucopy$(TSUFFIX).$(SUFFIX) strmm_iutncopy$(TSUFFIX).$(SUFFIX) \ strmm_iltucopy$(TSUFFIX).$(SUFFIX) strmm_iltncopy$(TSUFFIX).$(SUFFIX) \ strmm_ounucopy$(TSUFFIX).$(SUFFIX) strmm_ounncopy$(TSUFFIX).$(SUFFIX) \ strmm_olnucopy$(TSUFFIX).$(SUFFIX) strmm_olnncopy$(TSUFFIX).$(SUFFIX) \ strmm_outucopy$(TSUFFIX).$(SUFFIX) strmm_outncopy$(TSUFFIX).$(SUFFIX) \ strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX) \ strsm_iunucopy$(TSUFFIX).$(SUFFIX) strsm_iunncopy$(TSUFFIX).$(SUFFIX) \ strsm_ilnucopy$(TSUFFIX).$(SUFFIX) strsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ strsm_iutucopy$(TSUFFIX).$(SUFFIX) strsm_iutncopy$(TSUFFIX).$(SUFFIX) \ strsm_iltucopy$(TSUFFIX).$(SUFFIX) strsm_iltncopy$(TSUFFIX).$(SUFFIX) \ strsm_ounucopy$(TSUFFIX).$(SUFFIX) strsm_ounncopy$(TSUFFIX).$(SUFFIX) \ strsm_olnucopy$(TSUFFIX).$(SUFFIX) strsm_olnncopy$(TSUFFIX).$(SUFFIX) \ strsm_outucopy$(TSUFFIX).$(SUFFIX) strsm_outncopy$(TSUFFIX).$(SUFFIX) \ strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX) \ ssymm_iutcopy$(TSUFFIX).$(SUFFIX) ssymm_iltcopy$(TSUFFIX).$(SUFFIX) \ ssymm_outcopy$(TSUFFIX).$(SUFFIX) ssymm_oltcopy$(TSUFFIX).$(SUFFIX) DBLASOBJS += \ dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) dtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ dtrmm_olnucopy$(TSUFFIX).$(SUFFIX) dtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ dtrmm_outucopy$(TSUFFIX).$(SUFFIX) dtrmm_outncopy$(TSUFFIX).$(SUFFIX) \ dtrmm_oltucopy$(TSUFFIX).$(SUFFIX) dtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ 
dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) dtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ dtrsm_olnucopy$(TSUFFIX).$(SUFFIX) dtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ dtrsm_outucopy$(TSUFFIX).$(SUFFIX) dtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ dtrsm_oltucopy$(TSUFFIX).$(SUFFIX) dtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ dsymm_iutcopy$(TSUFFIX).$(SUFFIX) dsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ dsymm_outcopy$(TSUFFIX).$(SUFFIX) dsymm_oltcopy$(TSUFFIX).$(SUFFIX) QBLASOBJS += \ qtrmm_iunucopy$(TSUFFIX).$(SUFFIX) qtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ qtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) qtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ qtrmm_iutucopy$(TSUFFIX).$(SUFFIX) qtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ qtrmm_iltucopy$(TSUFFIX).$(SUFFIX) qtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ qtrmm_ounucopy$(TSUFFIX).$(SUFFIX) qtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ qtrmm_olnucopy$(TSUFFIX).$(SUFFIX) qtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ qtrmm_outucopy$(TSUFFIX).$(SUFFIX) qtrmm_outncopy$(TSUFFIX).$(SUFFIX) \ qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ qtrsm_iunucopy$(TSUFFIX).$(SUFFIX) qtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ qtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) qtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ qtrsm_iutucopy$(TSUFFIX).$(SUFFIX) qtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ qtrsm_iltucopy$(TSUFFIX).$(SUFFIX) qtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ qtrsm_ounucopy$(TSUFFIX).$(SUFFIX) qtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ qtrsm_olnucopy$(TSUFFIX).$(SUFFIX) qtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ qtrsm_outucopy$(TSUFFIX).$(SUFFIX) qtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ qsymm_iutcopy$(TSUFFIX).$(SUFFIX) qsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ qsymm_outcopy$(TSUFFIX).$(SUFFIX) qsymm_oltcopy$(TSUFFIX).$(SUFFIX) \ CBLASOBJS += \ ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) ctrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ ctrmm_olnucopy$(TSUFFIX).$(SUFFIX) ctrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ ctrmm_outucopy$(TSUFFIX).$(SUFFIX) ctrmm_outncopy$(TSUFFIX).$(SUFFIX) \ ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) ctrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ ctrsm_olnucopy$(TSUFFIX).$(SUFFIX) ctrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ ctrsm_outucopy$(TSUFFIX).$(SUFFIX) ctrsm_outncopy$(TSUFFIX).$(SUFFIX) \ ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ csymm_iutcopy$(TSUFFIX).$(SUFFIX) csymm_iltcopy$(TSUFFIX).$(SUFFIX) \ csymm_outcopy$(TSUFFIX).$(SUFFIX) csymm_oltcopy$(TSUFFIX).$(SUFFIX) \ chemm_iutcopy$(TSUFFIX).$(SUFFIX) chemm_iltcopy$(TSUFFIX).$(SUFFIX) \ chemm_outcopy$(TSUFFIX).$(SUFFIX) chemm_oltcopy$(TSUFFIX).$(SUFFIX) ZBLASOBJS += \ ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ 
ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) ztrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ ztrmm_olnucopy$(TSUFFIX).$(SUFFIX) ztrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ ztrmm_outucopy$(TSUFFIX).$(SUFFIX) ztrmm_outncopy$(TSUFFIX).$(SUFFIX) \ ztrmm_oltucopy$(TSUFFIX).$(SUFFIX) ztrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) ztrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ ztrsm_olnucopy$(TSUFFIX).$(SUFFIX) ztrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ ztrsm_outucopy$(TSUFFIX).$(SUFFIX) ztrsm_outncopy$(TSUFFIX).$(SUFFIX) \ ztrsm_oltucopy$(TSUFFIX).$(SUFFIX) ztrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ zsymm_iutcopy$(TSUFFIX).$(SUFFIX) zsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ zsymm_outcopy$(TSUFFIX).$(SUFFIX) zsymm_oltcopy$(TSUFFIX).$(SUFFIX) \ zhemm_iutcopy$(TSUFFIX).$(SUFFIX) zhemm_iltcopy$(TSUFFIX).$(SUFFIX) \ zhemm_outcopy$(TSUFFIX).$(SUFFIX) zhemm_oltcopy$(TSUFFIX).$(SUFFIX) XBLASOBJS += \ xtrmm_iunucopy$(TSUFFIX).$(SUFFIX) xtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ xtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) xtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ xtrmm_iutucopy$(TSUFFIX).$(SUFFIX) xtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ xtrmm_iltucopy$(TSUFFIX).$(SUFFIX) xtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ xtrmm_ounucopy$(TSUFFIX).$(SUFFIX) xtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ xtrmm_olnucopy$(TSUFFIX).$(SUFFIX) xtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ xtrmm_outucopy$(TSUFFIX).$(SUFFIX) xtrmm_outncopy$(TSUFFIX).$(SUFFIX) \ xtrmm_oltucopy$(TSUFFIX).$(SUFFIX) xtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ xtrsm_iunucopy$(TSUFFIX).$(SUFFIX) xtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ xtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) xtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ xtrsm_iutucopy$(TSUFFIX).$(SUFFIX) xtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ xtrsm_iltucopy$(TSUFFIX).$(SUFFIX) xtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ xtrsm_ounucopy$(TSUFFIX).$(SUFFIX) xtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ xtrsm_olnucopy$(TSUFFIX).$(SUFFIX) xtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ xtrsm_outucopy$(TSUFFIX).$(SUFFIX) xtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ xtrsm_oltucopy$(TSUFFIX).$(SUFFIX) xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ xsymm_iutcopy$(TSUFFIX).$(SUFFIX) xsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ xsymm_outcopy$(TSUFFIX).$(SUFFIX) xsymm_oltcopy$(TSUFFIX).$(SUFFIX) \ xhemm_iutcopy$(TSUFFIX).$(SUFFIX) xhemm_iltcopy$(TSUFFIX).$(SUFFIX) \ xhemm_outcopy$(TSUFFIX).$(SUFFIX) xhemm_oltcopy$(TSUFFIX).$(SUFFIX) ifeq ($(USE_GEMM3M), 1) CBLASOBJS += \ cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \ cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \ cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \ cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \ csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \ chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) 
chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) ZBLASOBJS += \ zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \ zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \ zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \ zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \ zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \ zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) XBLASOBJS += \ xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \ xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \ xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \ xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \ xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \ xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) endif ###### BLAS extensions ##### SBLASOBJS += \ somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ simatcopy_k_cn$(TSUFFIX).$(SUFFIX) simatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ simatcopy_k_ct$(TSUFFIX).$(SUFFIX) simatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ sgeadd_k$(TSUFFIX).$(SUFFIX) DBLASOBJS += \ domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ 
domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ dgeadd_k$(TSUFFIX).$(SUFFIX) CBLASOBJS += \ comatcopy_k_cn$(TSUFFIX).$(SUFFIX) comatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ cimatcopy_k_cn$(TSUFFIX).$(SUFFIX) cimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ cimatcopy_k_ct$(TSUFFIX).$(SUFFIX) cimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ cgeadd_k$(TSUFFIX).$(SUFFIX) ZBLASOBJS += \ zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ zimatcopy_k_cn$(TSUFFIX).$(SUFFIX) zimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ zimatcopy_k_ct$(TSUFFIX).$(SUFFIX) zimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ zgeadd_k$(TSUFFIX).$(SUFFIX) SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SGEMMOTCOPYOBJ_P = $(SGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) DGEMMINCOPYOBJ_P = $(DGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) DGEMMITCOPYOBJ_P = $(DGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) DGEMMONCOPYOBJ_P = $(DGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) DGEMMOTCOPYOBJ_P = $(DGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) QGEMMINCOPYOBJ_P = $(QGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) QGEMMITCOPYOBJ_P = $(QGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) QGEMMONCOPYOBJ_P = $(QGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) QGEMMOTCOPYOBJ_P = $(QGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) CGEMMINCOPYOBJ_P = $(CGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) CGEMMITCOPYOBJ_P = $(CGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) CGEMMONCOPYOBJ_P = $(CGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) CGEMMOTCOPYOBJ_P = $(CGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) ZGEMMINCOPYOBJ_P = $(ZGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) ZGEMMITCOPYOBJ_P = $(ZGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) ZGEMMONCOPYOBJ_P = $(ZGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) ZGEMMOTCOPYOBJ_P = $(ZGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMINCOPYOBJ_P = $(XGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) $(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)dgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)qgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMM_BETA) $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ $(KDIR)cgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_BETA) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@ $(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@ $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(XGEMM_BETA) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ endif ifdef EXPRECISION $(KDIR)$(QGEMMONCOPYOBJ) : $(KERNELDIR)/$(QGEMMONCOPY) $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(QGEMMOTCOPYOBJ) : $(KERNELDIR)/$(QGEMMOTCOPY) $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ ifneq ($(QGEMM_UNROLL_M), $(QGEMM_UNROLL_N)) $(KDIR)$(QGEMMINCOPYOBJ) : $(KERNELDIR)/$(QGEMMINCOPY) $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(QGEMMITCOPYOBJ) : $(KERNELDIR)/$(QGEMMITCOPY) $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ endif endif $(KDIR)$(CGEMMONCOPYOBJ) : $(KERNELDIR)/$(CGEMMONCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(CGEMMOTCOPYOBJ) : $(KERNELDIR)/$(CGEMMOTCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif $(KDIR)$(ZGEMMONCOPYOBJ) : $(KERNELDIR)/$(ZGEMMONCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(ZGEMMOTCOPYOBJ) : $(KERNELDIR)/$(ZGEMMOTCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ endif ifdef EXPRECISION $(KDIR)$(XGEMMONCOPYOBJ) : $(KERNELDIR)/$(XGEMMONCOPY) $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(XGEMMOTCOPYOBJ) : $(KERNELDIR)/$(XGEMMOTCOPY) $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ ifneq ($(XGEMM_UNROLL_M), $(XGEMM_UNROLL_N)) $(KDIR)$(XGEMMINCOPYOBJ) : $(KERNELDIR)/$(XGEMMINCOPY) $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(XGEMMITCOPYOBJ) : $(KERNELDIR)/$(XGEMMITCOPY) $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ endif endif $(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< 
-o $@ $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ $(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ $(KDIR)xgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ $(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ $(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ 
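# TRMM kernel suffix convention used above and below: the first letter selects
# the side (L: -DLEFT, R: -ULEFT) and the second the transposition
# (N/R: -UTRANSA, T/C: -DTRANSA).  For the complex kernels the N/T variants
# are compiled with -UCONJ -DNN, while the R/C variants add -DCONJ together
# with -DCN on the left side and -DNC on the right side.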
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ else $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c 
-DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ $(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ $(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ endif $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ $(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ $(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA 
-DCONJ -DCN $< -o $@ $(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ $(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ $(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ $(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ $(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ $(KDIR)cgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ $(KDIR)zgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM3MKERNEL) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ $(KDIR)xgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM3MKERNEL) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ $(KDIR)strsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LN) $(STRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ $(KDIR)strsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LT) $(STRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ $(KDIR)strsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RN) $(STRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ $(KDIR)strsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(STRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ $(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ $(KDIR)dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RT) $(DTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ $(KDIR)qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LN) $(QTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ $(KDIR)qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LT) $(QTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ $(KDIR)qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RN) $(QTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ $(KDIR)qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RT) $(QTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ $(KDIR)ctrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ $(KDIR)ctrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) $(CC) -c $(CFLAGS) 
-DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ $(KDIR)ctrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -DCONJ $< -o $@ $(KDIR)ctrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -DCONJ $< -o $@ $(KDIR)ctrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ $(KDIR)ctrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ $(KDIR)ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -DCONJ $< -o $@ $(KDIR)ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -DCONJ $< -o $@ $(KDIR)ztrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ $(KDIR)ztrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ $(KDIR)ztrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -DCONJ $< -o $@ $(KDIR)ztrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -DCONJ $< -o $@ $(KDIR)ztrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ $(KDIR)ztrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ $(KDIR)ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -DCONJ $< -o $@ $(KDIR)ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -DCONJ $< -o $@ $(KDIR)xtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ $(KDIR)xtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ $(KDIR)xtrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -DCONJ $< -o $@ $(KDIR)xtrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -DCONJ $< -o $@ $(KDIR)xtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ $(KDIR)xtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ $(KDIR)xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -DCONJ $< -o $@ $(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@ $(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)strmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)strmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)strmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ 
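# Note (added comment, not in upstream): naming scheme for the generic trmm copy
# objects in this block: after the precision letter, i/o selects the inner
# (GEMM_UNROLL_M) or outer (GEMM_UNROLL_N) packing width, then u/l upper or
# lower triangle, n/t untransposed or transposed source, and a final u/n unit
# or non-unit diagonal, matching the -DOUTER/-DLOWER/-DUNIT flag combinations.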
$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)dtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)dtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)dtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)qtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)qtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)qtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)qtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)qtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)qtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX 
-UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)qtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)qtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)qtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)qtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)qtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)qtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)qtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)qtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c $(CC) -c 
$(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ctrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ctrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ztrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ztrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrmm_outncopy$(TSUFFIX).$(SUFFIX) : 
generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ztrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)xtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)xtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)xtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)xtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)xtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)xtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)xtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)xtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)xtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)xtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)xtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)xtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)xtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)xtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)xtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)xtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ssymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER 
$< -o $@ $(KDIR)ssymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ $(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ $(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ $(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ $(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ $(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ $(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ $(KDIR)qsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ $(KDIR)qsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ $(KDIR)qsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ $(KDIR)csymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ $(KDIR)csymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ $(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ $(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ $(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ $(KDIR)zsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ $(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ $(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ $(KDIR)xsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ $(KDIR)xsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) 
-DXDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ $(KDIR)xsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ $(KDIR)xsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ $(KDIR)chemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ $(KDIR)chemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ $(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ $(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ $(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ $(KDIR)zhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ $(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ $(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ $(KDIR)xhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ $(KDIR)xhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ $(KDIR)xhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ $(KDIR)xhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ $(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ 
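# Note (added comment, not in upstream): the GEMM3M packing routines come in
# three variants per layout: the trailing b/r/i selects whether both parts,
# only the real part (-DREAL_ONLY) or only the imaginary part (-DIMAGE_ONLY) of
# the complex operand is copied. Outer copies scale by alpha while packing
# (-DUSE_ALPHA); inner copies are built with -DICOPY and -UUSE_ALPHA instead.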
$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ $(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ $(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ $(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ 
$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ $(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ $(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : 
generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX 
-UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : 
generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c 
-DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)strsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)strsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)strsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)strsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c 
$(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)dtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)dtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)dtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)qtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)qtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)qtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)qtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)qtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)qtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)qtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)qtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : 
generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)qtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)qtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)qtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)qtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)qtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)qtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ctrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o 
$@ $(KDIR)ctrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ctrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ztrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ztrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ztrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) 
-DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)xtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)xtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)xtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)xtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)xtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)xtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)xtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)xtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)xtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)xtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)xtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)xtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)xtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)xtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)xtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)qgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMM_BETA) $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ $(KDIR)cgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMM_BETA) $(CC) 
$(PFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@ $(KDIR)zgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@ $(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ $(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(SGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMOTCOPY) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) $(SGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMINCOPY) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif $(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY) $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(DGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMOTCOPY) $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) $(DGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMINCOPY) $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(DGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMITCOPY) $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ endif ifdef EXPRECISION $(QGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMONCOPY) $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ $(QGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMOTCOPY) $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ ifneq ($(QGEMM_UNROLL_M), $(QGEMM_UNROLL_N)) $(QGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMINCOPY) $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ $(QGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMITCOPY) $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ endif endif $(CGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMONCOPY) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(CGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMOTCOPY) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) $(CGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMINCOPY) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(CGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMITCOPY) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif $(ZGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMONCOPY) $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(ZGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMOTCOPY) $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) $(ZGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMINCOPY) $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(ZGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMITCOPY) $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ endif ifdef EXPRECISION $(XGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMONCOPY) $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ $(XGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMOTCOPY) $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ ifneq ($(XGEMM_UNROLL_M), $(XGEMM_UNROLL_N)) $(XGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMINCOPY) $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ $(XGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMITCOPY) $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ endif endif $(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)dgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)qgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND) $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ $(KDIR)cgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : 
$(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ $(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ $(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ $(KDIR)zgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ $(KDIR)zgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ $(KDIR)zgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ $(KDIR)zgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ $(KDIR)xgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ $(KDIR)xgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ $(KDIR)strmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ $(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ $(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(PFLAGS) 
-c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ $(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ $(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ $(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ $(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ $(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ $(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ $(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) 
$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ $(KDIR)cgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL) $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ $(KDIR)zgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM3MKERNEL) $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ $(KDIR)xgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM3MKERNEL) $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ $(KDIR)strsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LN) $(STRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ $(KDIR)strsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LT) $(STRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ $(KDIR)strsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RN) $(STRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ $(KDIR)strsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(STRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ $(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ $(KDIR)dtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RT) $(DTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ $(KDIR)qtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LN) $(QTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ $(KDIR)qtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LT) $(QTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ $(KDIR)qtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RN) $(QTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ $(KDIR)qtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RT) $(QTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ $(KDIR)ctrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ $(KDIR)ctrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ $(KDIR)ctrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -DCONJ $< -o $@ $(KDIR)ctrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -DCONJ $< -o $@ $(KDIR)ctrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ $(KDIR)ctrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT 
-UCONJ $< -o $@ $(KDIR)ctrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -DCONJ $< -o $@ $(KDIR)ctrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -DCONJ $< -o $@ $(KDIR)ztrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ $(KDIR)ztrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ $(KDIR)ztrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -DCONJ $< -o $@ $(KDIR)ztrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -DCONJ $< -o $@ $(KDIR)ztrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ $(KDIR)ztrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ $(KDIR)ztrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -DCONJ $< -o $@ $(KDIR)ztrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -DCONJ $< -o $@ $(KDIR)xtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ $(KDIR)xtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ $(KDIR)xtrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -DCONJ $< -o $@ $(KDIR)xtrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -DCONJ $< -o $@ $(KDIR)xtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ $(KDIR)xtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ $(KDIR)xtrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -DCONJ $< -o $@ $(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@ $(KDIR)strmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)strmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : 
generic/trmm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)strmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)strmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)strmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)strmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)strmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)strmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)dtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)dtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ 
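# The trmm/trsm packing rules in this file all follow one naming scheme; the
# variant is selected purely by the -D/-U defines handed to a single generic
# source:
#   1st letter s/d/q/c/z/x : precision, via -UDOUBLE/-DDOUBLE/-DXDOUBLE and
#                            -UCOMPLEX/-DCOMPLEX
#   i / o                  : inner vs outer packing buffer, i.e. the
#                            GEMM_UNROLL_M vs GEMM_UNROLL_N blocking factor,
#                            matching -UOUTER / -DOUTER
#   u / l                  : upper vs lower triangular source (-ULOWER / -DLOWER)
#   n / t                  : non-transposed vs transposed copy
#                            (generic *_uncopy_*/*_lncopy_* vs *_utcopy_*/*_ltcopy_*)
#   trailing u / n         : unit vs non-unit diagonal (-DUNIT / -UUNIT)
# For instance, dtrmm_iltucopy directly below is generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
# built with -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT.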
$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)dtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)dtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)dtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)dtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)qtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)qtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)qtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)qtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)qtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)qtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)qtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)qtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)qtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)qtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) 
$(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)qtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)qtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)qtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)qtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)qtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)qtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ctrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ctrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ctrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ctrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ctrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ctrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ 
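# Note that the $(TSUFFIX).$(PSUFFIX) rules in this part of the file mirror the
# $(TSUFFIX).$(SUFFIX) rules earlier one-for-one: same generic sources and the
# same -D/-U defines, but compiled with $(PFLAGS) instead of $(CFLAGS), so a
# second set of objects is produced for the profiled build (in Makefile.system
# SUFFIX and PSUFFIX are conventionally o and po).
# The kernel rules just above follow the same flag-driven pattern: the complex
# *gemm_kernel_{n,l,r,b} objects are one kernel source built with
# -DNN/-DCN/-DNC/-DCC (selecting which packed operand, if any, is conjugated),
# the *trmm_kernel_{L,R}{N,T} objects pick side and transposition with
# -DLEFT/-ULEFT and -DTRANSA/-UTRANSA, and the *trsm_kernel_* objects combine
# -DUPPER/-UUPPER with -DLN/-DLT/-DRN/-DRT, adding -DCONJ for the LR/LC/RR/RC
# conjugate variants.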
$(KDIR)ctrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ctrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ztrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ztrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ztrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ztrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ztrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ztrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ztrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)xtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) 
$(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)xtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)xtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)xtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)xtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)xtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)xtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)xtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)xtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)xtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)xtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)xtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)xtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)xtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)xtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)xtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ssymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ $(KDIR)ssymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ $(KDIR)ssymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ $(KDIR)ssymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ $(KDIR)dsymm_outcopy$(TSUFFIX).$(PSUFFIX) : 
generic/symm_ucopy_$(DGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ $(KDIR)dsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ $(KDIR)dsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ $(KDIR)dsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ $(KDIR)qsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ $(KDIR)qsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ $(KDIR)qsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ $(KDIR)qsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ $(KDIR)csymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ $(KDIR)csymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ $(KDIR)csymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ $(KDIR)csymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ $(KDIR)zsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ $(KDIR)zsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ $(KDIR)zsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ $(KDIR)zsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ $(KDIR)xsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ $(KDIR)xsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ $(KDIR)xsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ $(KDIR)xsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ $(KDIR)chemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< 
-ULOWER -o $@ $(KDIR)chemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ $(KDIR)chemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ $(KDIR)chemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ $(KDIR)zhemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ $(KDIR)zhemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ $(KDIR)zhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ $(KDIR)zhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ $(KDIR)xhemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ $(KDIR)xhemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ $(KDIR)xhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ $(KDIR)xhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ $(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)cgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ $(KDIR)cgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)cgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ 
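# The gemm3m packing rules come in three flavours per buffer, distinguished by
# the last letter of the target and by which of -DREAL_ONLY / -DIMAGE_ONLY is
# defined:
#   *copyb : packs both real and imaginary parts
#   *copyr : packs the real parts only      (-DREAL_ONLY)
#   *copyi : packs the imaginary parts only (-DIMAGE_ONLY)
# The on/ot (outer) copies are built with -DUSE_ALPHA and scale by alpha while
# packing, while the in/it (inner) copies use -DICOPY -UUSE_ALPHA and leave the
# data unscaled; n vs t picks the non-transposed vs transposed generic source
# (zgemm3m_ncopy_* vs zgemm3m_tcopy_*). These split copies are what the 3m
# kernels use to form a complex product from three real multiplications.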
$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ $(KDIR)zgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ $(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ $(KDIR)xgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ 
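# As a concrete illustration (values assumed for this comment only:
# XGEMM3M_UNROLL_M=2, KDIR and TSUFFIX empty, PSUFFIX=po, CC=gcc), the
# xgemm3m_incopyr rule just above expands to roughly
#
#   xgemm3m_incopyr.po : generic/zgemm3m_ncopy_2.c
#           gcc $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY \
#                   generic/zgemm3m_ncopy_2.c -o xgemm3m_incopyr.po
#
# ($< and $@ are make's automatic variables for the first prerequisite and the
# target, respectively.)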
$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ $(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)csymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)csymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)csymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)csymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)csymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)csymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)csymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)csymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)csymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX 
-DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ 
$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : 
generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ $(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ $(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ $(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ $(KDIR)strsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) 
$(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)strsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)strsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)strsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)strsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)strsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)strsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)strsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)dtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)dtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : 
generic/trsm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)dtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)dtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)dtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)dtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)dtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)qtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)qtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)qtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)qtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)qtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)qtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)qtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)qtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)qtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT 
$< -o $@ $(KDIR)qtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)qtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)qtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)qtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)qtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)qtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)qtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ctrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ctrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ctrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ctrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ctrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ctrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) 
$(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ctrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ztrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ztrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ztrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ztrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ztrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)ztrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)ztrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ 
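# Note: the trsm copy-object names encode their variant; this summary is inferred
# from the compile flags used in the rules above, not taken from an upstream
# comment. The first letter is the precision (s/d/q/c/z/x), "i"/"o" selects the
# inner (-UOUTER, GEMM_UNROLL_M) or outer (-DOUTER, GEMM_UNROLL_N) packing, the
# next two letters give the triangle and layout ("u"/"l" for -ULOWER/-DLOWER,
# combined with "n"/"t" for the non-transposed uncopy/lncopy or transposed
# utcopy/ltcopy generic sources), and the final "u"/"n" marks a unit (-DUNIT)
# or non-unit (-UUNIT) diagonal.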
$(KDIR)xtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)xtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)xtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)xtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)xtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)xtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)xtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)xtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)xtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)xtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)xtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)xtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ $(KDIR)xtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)xtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ $(KDIR)xtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)xtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ ##### BLAS extensions ###### ifndef DOMATCOPY_CN DOMATCOPY_CN = ../arm/omatcopy_cn.c endif $(KDIR)domatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_CN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ ifndef DOMATCOPY_RN DOMATCOPY_RN = ../arm/omatcopy_rn.c endif $(KDIR)domatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_RN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ ifndef DOMATCOPY_CT DOMATCOPY_CT = ../arm/omatcopy_ct.c endif $(KDIR)domatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_CT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ ifndef DOMATCOPY_RT DOMATCOPY_RT = 
../arm/omatcopy_rt.c endif $(KDIR)domatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_RT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ ifndef DIMATCOPY_CN DIMATCOPY_CN = ../generic/imatcopy_cn.c endif $(KDIR)dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_CN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ ifndef DIMATCOPY_RN DIMATCOPY_RN = ../generic/imatcopy_rn.c endif $(KDIR)dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_RN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ ifndef DIMATCOPY_CT DIMATCOPY_CT = ../generic/imatcopy_ct.c endif $(KDIR)dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_CT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ ifndef DIMATCOPY_RT DIMATCOPY_RT = ../generic/imatcopy_rt.c endif $(KDIR)dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_RT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ ifndef SOMATCOPY_CN SOMATCOPY_CN = ../arm/omatcopy_cn.c endif $(KDIR)somatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_CN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ ifndef SOMATCOPY_RN SOMATCOPY_RN = ../arm/omatcopy_rn.c endif $(KDIR)somatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_RN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ ifndef SOMATCOPY_CT SOMATCOPY_CT = ../arm/omatcopy_ct.c endif $(KDIR)somatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_CT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ ifndef SOMATCOPY_RT SOMATCOPY_RT = ../arm/omatcopy_rt.c endif $(KDIR)somatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_RT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ ifndef SIMATCOPY_CN SIMATCOPY_CN = ../generic/imatcopy_cn.c endif $(KDIR)simatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_CN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ ifndef SIMATCOPY_RN SIMATCOPY_RN = ../generic/imatcopy_rn.c endif $(KDIR)simatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_RN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ ifndef SIMATCOPY_CT SIMATCOPY_CT = ../generic/imatcopy_ct.c endif $(KDIR)simatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_CT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ ifndef SIMATCOPY_RT SIMATCOPY_RT = ../generic/imatcopy_rt.c endif $(KDIR)simatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_RT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ ifndef COMATCOPY_CN COMATCOPY_CN = ../arm/zomatcopy_cn.c endif $(KDIR)comatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CN) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ ifndef COMATCOPY_RN COMATCOPY_RN = ../arm/zomatcopy_rn.c endif $(KDIR)comatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RN) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ ifndef COMATCOPY_CT COMATCOPY_CT = ../arm/zomatcopy_ct.c endif $(KDIR)comatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CT) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ ifndef COMATCOPY_RT COMATCOPY_RT = ../arm/zomatcopy_rt.c endif $(KDIR)comatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RT) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ ifndef COMATCOPY_CNC COMATCOPY_CNC = ../arm/zomatcopy_cnc.c endif $(KDIR)comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CNC) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ ifndef COMATCOPY_RNC COMATCOPY_RNC = ../arm/zomatcopy_rnc.c endif $(KDIR)comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(COMATCOPY_RNC) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ ifndef COMATCOPY_CTC COMATCOPY_CTC = ../arm/zomatcopy_ctc.c endif $(KDIR)comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CTC) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ ifndef COMATCOPY_RTC COMATCOPY_RTC = ../arm/zomatcopy_rtc.c endif $(KDIR)comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RTC) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ ifndef CIMATCOPY_CN CIMATCOPY_CN = ../generic/zimatcopy_cn.c endif $(KDIR)cimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CN) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ ifndef CIMATCOPY_RN CIMATCOPY_RN = ../generic/zimatcopy_rn.c endif $(KDIR)cimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RN) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ ifndef CIMATCOPY_CT CIMATCOPY_CT = ../generic/zimatcopy_ct.c endif $(KDIR)cimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CT) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ ifndef CIMATCOPY_RT CIMATCOPY_RT = ../generic/zimatcopy_rt.c endif $(KDIR)cimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RT) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ ifndef CIMATCOPY_CNC CIMATCOPY_CNC = ../generic/zimatcopy_cnc.c endif $(KDIR)cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CNC) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ ifndef CIMATCOPY_RNC CIMATCOPY_RNC = ../generic/zimatcopy_rnc.c endif $(KDIR)cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RNC) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ ifndef CIMATCOPY_CTC CIMATCOPY_CTC = ../generic/zimatcopy_ctc.c endif $(KDIR)cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CTC) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ ifndef CIMATCOPY_RTC CIMATCOPY_RTC = ../generic/zimatcopy_rtc.c endif $(KDIR)cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RTC) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ ifndef ZOMATCOPY_CN ZOMATCOPY_CN = ../arm/zomatcopy_cn.c endif $(KDIR)zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CN) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ ifndef ZOMATCOPY_RN ZOMATCOPY_RN = ../arm/zomatcopy_rn.c endif $(KDIR)zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RN) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ ifndef ZOMATCOPY_CT ZOMATCOPY_CT = ../arm/zomatcopy_ct.c endif $(KDIR)zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CT) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ ifndef ZOMATCOPY_RT ZOMATCOPY_RT = ../arm/zomatcopy_rt.c endif $(KDIR)zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RT) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ ifndef ZOMATCOPY_CNC ZOMATCOPY_CNC = ../arm/zomatcopy_cnc.c endif $(KDIR)zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CNC) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ ifndef ZOMATCOPY_RNC ZOMATCOPY_RNC = ../arm/zomatcopy_rnc.c endif $(KDIR)zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RNC) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ ifndef ZOMATCOPY_CTC ZOMATCOPY_CTC = ../arm/zomatcopy_ctc.c endif $(KDIR)zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CTC) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ ifndef ZOMATCOPY_RTC ZOMATCOPY_RTC = 
../arm/zomatcopy_rtc.c endif $(KDIR)zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RTC) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ ifndef ZIMATCOPY_CN ZIMATCOPY_CN = ../generic/zimatcopy_cn.c endif $(KDIR)zimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CN) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ ifndef ZIMATCOPY_RN ZIMATCOPY_RN = ../generic/zimatcopy_rn.c endif $(KDIR)zimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RN) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ ifndef ZIMATCOPY_CT ZIMATCOPY_CT = ../generic/zimatcopy_ct.c endif $(KDIR)zimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CT) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ ifndef ZIMATCOPY_RT ZIMATCOPY_RT = ../generic/zimatcopy_rt.c endif $(KDIR)zimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RT) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ ifndef ZIMATCOPY_CNC ZIMATCOPY_CNC = ../generic/zimatcopy_cnc.c endif $(KDIR)zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CNC) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ ifndef ZIMATCOPY_RNC ZIMATCOPY_RNC = ../generic/zimatcopy_rnc.c endif $(KDIR)zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RNC) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ ifndef ZIMATCOPY_CTC ZIMATCOPY_CTC = ../generic/zimatcopy_ctc.c endif $(KDIR)zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CTC) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ ifndef ZIMATCOPY_RTC ZIMATCOPY_RTC = ../generic/zimatcopy_rtc.c endif $(KDIR)zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RTC) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ ifndef SGEADD_K SGEADD_K = ../generic/geadd.c endif $(KDIR)sgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEADD_K) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ ifndef DGEADD_K DGEADD_K = ../generic/geadd.c endif $(KDIR)dgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEADD_K) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ ifndef CGEADD_K CGEADD_K = ../generic/zgeadd.c endif $(KDIR)cgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEADD_K) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM $< -o $@ ifndef ZGEADD_K ZGEADD_K = ../generic/zgeadd.c endif $(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@ OpenBLAS-0.2.20/kernel/Makefile.LA000066400000000000000000000044011313527062700163560ustar00rootroot00000000000000ifneq ($(NO_LAPACK), 1) SBLASOBJS += sneg_tcopy$(TSUFFIX).$(SUFFIX) slaswp_ncopy$(TSUFFIX).$(SUFFIX) DBLASOBJS += dneg_tcopy$(TSUFFIX).$(SUFFIX) dlaswp_ncopy$(TSUFFIX).$(SUFFIX) QBLASOBJS += qneg_tcopy$(TSUFFIX).$(SUFFIX) qlaswp_ncopy$(TSUFFIX).$(SUFFIX) CBLASOBJS += cneg_tcopy$(TSUFFIX).$(SUFFIX) claswp_ncopy$(TSUFFIX).$(SUFFIX) ZBLASOBJS += zneg_tcopy$(TSUFFIX).$(SUFFIX) zlaswp_ncopy$(TSUFFIX).$(SUFFIX) XBLASOBJS += xneg_tcopy$(TSUFFIX).$(SUFFIX) xlaswp_ncopy$(TSUFFIX).$(SUFFIX) endif $(KDIR)sneg_tcopy$(TSUFFIX).$(SUFFIX) $(KDIR)sneg_tcopy$(TSUFFIX).$(PSUFFIX) : generic/neg_tcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $< -o $@ $(KDIR)dneg_tcopy$(TSUFFIX).$(SUFFIX) $(KDIR)dneg_tcopy$(TSUFFIX).$(PSUFFIX) : generic/neg_tcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $< -o $@ $(KDIR)qneg_tcopy$(TSUFFIX).$(SUFFIX) $(KDIR)qneg_tcopy$(TSUFFIX).$(PSUFFIX) : generic/neg_tcopy_$(QGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $< -o $@ $(KDIR)cneg_tcopy$(TSUFFIX).$(SUFFIX) 
$(KDIR)cneg_tcopy$(TSUFFIX).$(PSUFFIX) : generic/zneg_tcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $< -o $@ $(KDIR)zneg_tcopy$(TSUFFIX).$(SUFFIX) $(KDIR)zneg_tcopy$(TSUFFIX).$(PSUFFIX) : generic/zneg_tcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $< -o $@ $(KDIR)xneg_tcopy$(TSUFFIX).$(SUFFIX) $(KDIR)xneg_tcopy$(TSUFFIX).$(PSUFFIX) : generic/zneg_tcopy_$(XGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $< -o $@ $(KDIR)slaswp_ncopy$(TSUFFIX).$(SUFFIX) $(KDIR)slaswp_ncopy$(TSUFFIX).$(PSUFFIX) : generic/laswp_ncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $< -o $@ $(KDIR)dlaswp_ncopy$(TSUFFIX).$(SUFFIX) $(KDIR)dlaswp_ncopy$(TSUFFIX).$(PSUFFIX) : generic/laswp_ncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $< -o $@ $(KDIR)qlaswp_ncopy$(TSUFFIX).$(SUFFIX) $(KDIR)qlaswp_ncopy$(TSUFFIX).$(PSUFFIX) : generic/laswp_ncopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $< -o $@ $(KDIR)claswp_ncopy$(TSUFFIX).$(SUFFIX) $(KDIR)claswp_ncopy$(TSUFFIX).$(PSUFFIX) : generic/zlaswp_ncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $< -o $@ $(KDIR)zlaswp_ncopy$(TSUFFIX).$(SUFFIX) $(KDIR)zlaswp_ncopy$(TSUFFIX).$(PSUFFIX) : generic/zlaswp_ncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $< -o $@ $(KDIR)xlaswp_ncopy$(TSUFFIX).$(SUFFIX) $(KDIR)xlaswp_ncopy$(TSUFFIX).$(PSUFFIX) : generic/zlaswp_ncopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $< -o $@ OpenBLAS-0.2.20/kernel/alpha/000077500000000000000000000000001313527062700155115ustar00rootroot00000000000000OpenBLAS-0.2.20/kernel/alpha/KERNEL000066400000000000000000000047441313527062700164250ustar00rootroot00000000000000ifndef SAMINKERNEL SAMINKERNEL = amax.S endif ifndef DAMINKERNEL DAMINKERNEL = amax.S endif ifndef CAMINKERNEL CAMINKERNEL = zamax.S endif ifndef ZAMINKERNEL ZAMINKERNEL = zamax.S endif ifndef SMINKERNEL SMINKERNEL = max.S endif ifndef DMINKERNEL DMINKERNEL = max.S endif ifndef ISAMINKERNEL ISAMINKERNEL = iamax.S endif ifndef IDAMINKERNEL IDAMINKERNEL = iamax.S endif ifndef ICAMINKERNEL ICAMINKERNEL = izamax.S endif ifndef IZAMINKERNEL IZAMINKERNEL = izamax.S endif ifndef ISMINKERNEL ISMINKERNEL = iamax.S endif ifndef IDMINKERNEL IDMINKERNEL = iamax.S endif ifndef CCOPYKERNEL CCOPYKERNEL = copy.S endif ifndef ZCOPYKERNEL ZCOPYKERNEL = copy.S endif ifndef SNRM2KERNEL SNRM2KERNEL = snrm2.S endif ifndef DNRM2KERNEL DNRM2KERNEL = dnrm2.S endif ifndef CNRM2KERNEL CNRM2KERNEL = cnrm2.S endif ifndef ZNRM2KERNEL ZNRM2KERNEL = znrm2.S endif SGEMMKERNEL = gemm_kernel_4x4.S SGEMM_BETA = gemm_beta.S SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x4.S DGEMM_BETA = gemm_beta.S DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMONCOPYOBJ = dgemm_oncopy.$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy.$(SUFFIX) CGEMMKERNEL = zgemm_kernel_2x2.S CGEMM_BETA = zgemm_beta.S CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMONCOPYOBJ = cgemm_oncopy.$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy.$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_2x2.S ZGEMM_BETA = zgemm_beta.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy.$(SUFFIX) SGEMM_BETA = gemm_beta.S DGEMM_BETA = gemm_beta.S CGEMM_BETA = zgemm_beta.S ZGEMM_BETA = zgemm_beta.S STRSMKERNEL_LN = trsm_kernel_4x4_LN.S STRSMKERNEL_LT = trsm_kernel_4x4_LT.S STRSMKERNEL_RN = trsm_kernel_4x4_LT.S STRSMKERNEL_RT = trsm_kernel_4x4_RT.S DTRSMKERNEL_LN = trsm_kernel_4x4_LN.S 
DTRSMKERNEL_LT = trsm_kernel_4x4_LT.S DTRSMKERNEL_RN = trsm_kernel_4x4_LT.S DTRSMKERNEL_RT = trsm_kernel_4x4_RT.S CTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S CTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S CTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S CTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S ZTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S OpenBLAS-0.2.20/kernel/alpha/Makefile000066400000000000000000000000121313527062700171420ustar00rootroot00000000000000clean :: OpenBLAS-0.2.20/kernel/alpha/amax.S000066400000000000000000000141631313527062700165700ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define N $16 #define X $17 #define INCX $18 #ifndef USE_MIN #define CMPLT(a, b) cmptlt a, b #else #define CMPLT(a, b) cmptlt b, a #endif #define STACKSIZE 6 * 8 PROLOGUE PROFCODE .frame $sp, STACKSIZE, $26, 0 lda $sp, -STACKSIZE($sp) nop .align 4 stt $f2, 0($sp) fclr $f16 cmplt $31, N, $2 unop stt $f3, 8($sp) fclr $f17 cmplt $31, INCX, $3 unop stt $f4, 16($sp) fclr $f18 SXADDQ INCX, $31, INCX unop stt $f5, 24($sp) fclr $f19 and $2, $3, $0 unop stt $f6, 32($sp) fclr $f0 sra N, 3, $1 beq $0, $End # if (n <= 0) or (incx <= 0) return .align 4 LD $f20, 0 * SIZE(X) unop fabs $f20, $f0 ble $1, $L15 .align 4 fabs $f20, $f1 unop addq X, INCX, X unop LD $f21, 0 * SIZE(X) fabs $f20, $f2 addq X, INCX, X unop LD $f22, 0 * SIZE(X) fabs $f20, $f3 addq X, INCX, X unop LD $f23, 0 * SIZE(X) fabs $f20, $f4 addq X, INCX, X unop LD $f24, 0 * SIZE(X) addq X, INCX, X fabs $f20, $f5 unop LD $f25, 0 * SIZE(X) fabs $f20, $f6 addq X, INCX, X unop LD $f26, 0 * SIZE(X) fabs $f20, $f28 addq X, INCX, X lda $1, -1($1) LD $f27, 0 * SIZE(X) unop addq X, INCX, X ble $1, $L13 .align 4 $L12: fcmovne $f16, $f12, $f4 unop fabs $f20, $f29 ldl $31, 56 * SIZE(X) fcmovne $f17, $f13, $f5 LD $f20, 0 * SIZE(X) fabs $f21, $f30 addq X, INCX, X fcmovne $f18, $f14, $f6 LD $f21, 0 * SIZE(X) fabs $f22, $f10 addq X, INCX, X fcmovne $f19, $f15, $f28 LD $f22, 0 * SIZE(X) fabs $f23, $f11 addq X, INCX, X fabs $f24, $f12 LD $f23, 0 * SIZE(X) CMPLT($f0, $f29), $f16 addq X, INCX, X fabs $f25, $f13 LD $f24, 0 * SIZE(X) CMPLT($f1, $f30), $f17 addq X, INCX, X fabs $f26, $f14 LD $f25, 0 * SIZE(X) CMPLT($f2, $f10), $f18 addq X, INCX, X fabs $f27, $f15 LD $f26, 0 * SIZE(X) CMPLT($f3, $f11), $f19 addq X, INCX, X fcmovne $f16, $f29, $f0 LD $f27, 0 * SIZE(X) CMPLT($f4, $f12), $f16 addq X, INCX, X fcmovne $f17, $f30, $f1 unop CMPLT($f5, $f13), $f17 lda $1, -1($1) # i -- fcmovne $f18, $f10, $f2 unop CMPLT($f6, $f14), $f18 unop fcmovne $f19, $f11, $f3 unop CMPLT($f28, $f15), $f19 bgt $1,$L12 .align 4 $L13: fcmovne $f16, $f12, $f4 fabs $f20, $f29 fcmovne $f17, $f13, $f5 fabs $f21, $f30 fcmovne $f18, $f14, $f6 fabs $f22, $f10 fcmovne $f19, $f15, $f28 fabs $f23, $f11 fabs $f24, $f12 CMPLT($f0, $f29), $f16 fabs $f25, $f13 CMPLT($f1, $f30), $f17 fabs $f26, $f14 CMPLT($f2, $f10), $f18 fabs $f27, $f15 CMPLT($f3, $f11), $f19 fcmovne $f16, $f29, $f0 CMPLT($f4, $f12), $f16 fcmovne $f17, $f30, $f1 CMPLT($f5, $f13), $f17 fcmovne $f18, $f10, $f2 CMPLT($f6, $f14), $f18 fcmovne $f19, $f11, $f3 CMPLT($f28, $f15), $f19 fcmovne $f16, $f12, $f4 CMPLT($f0, $f1), $f16 fcmovne $f17, $f13, $f5 CMPLT($f2, $f3), $f17 fcmovne $f18, $f14, $f6 CMPLT($f4, $f5), $f18 fcmovne $f19, $f15, $f28 CMPLT($f6, $f28), $f19 fcmovne $f16, $f1, $f0 fcmovne $f17, $f3, $f2 fcmovne $f18, $f5, $f4 fcmovne $f19, $f28, $f6 CMPLT($f0, $f2), $f16 CMPLT($f4, $f6), $f17 fcmovne $f16, $f2, $f0 fcmovne $f17, $f6, $f4 CMPLT($f0, $f4), $f16 fcmovne $f16, $f4, $f0 .align 4 $L15: and N, 7, $1 unop unop ble $1, $End .align 4 $L16: LD $f20, 0 * SIZE(X) addq X, INCX, X fabs $f20, $f29 CMPLT($f0, $f29), $f16 fcmovne $f16, $f29, $f0 lda $1, -1($1) # i -- bgt $1, $L16 .align 4 $End: ldt $f2, 0($sp) ldt $f3, 8($sp) ldt $f4, 16($sp) ldt $f5, 24($sp) ldt $f6, 32($sp) lda $sp, STACKSIZE($sp) ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/asum.S000066400000000000000000000112451313527062700166050ustar00rootroot00000000000000/*********************************************************************/ /* 
Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define PREFETCHSIZE 88 #define N $16 #define X $17 #define INCX $18 #define I $19 #define s0 $f0 #define s1 $f1 #define s2 $f10 #define s3 $f11 #define a0 $f12 #define a1 $f13 #define a2 $f14 #define a3 $f15 #define a4 $f16 #define a5 $f17 #define a6 $f18 #define a7 $f19 #define t0 $f20 #define t1 $f21 #define t2 $f22 #define t3 $f23 PROLOGUE PROFCODE fclr s0 unop fclr t0 ble N, $L999 sra N, 3, I fclr s1 fclr s2 ble I, $L15 LD a0, 0 * SIZE(X) fclr t1 SXADDQ INCX, X, X fclr t2 LD a1, 0 * SIZE(X) fclr t3 SXADDQ INCX, X, X fclr s3 LD a2, 0 * SIZE(X) SXADDQ INCX, X, X LD a3, 0 * SIZE(X) SXADDQ INCX, X, X LD a4, 0 * SIZE(X) SXADDQ INCX, X, X LD a5, 0 * SIZE(X) SXADDQ INCX, X, X lda I, -1(I) ble I, $L13 .align 4 $L12: ADD s0, t0, s0 ldl $31, PREFETCHSIZE * 2 * SIZE(X) fabs a0, t0 lda I, -1(I) ADD s1, t1, s1 LD a6, 0 * SIZE(X) fabs a1, t1 SXADDQ INCX, X, X ADD s2, t2, s2 LD a7, 0 * SIZE(X) fabs a2, t2 SXADDQ INCX, X, X ADD s3, t3, s3 LD a0, 0 * SIZE(X) fabs a3, t3 SXADDQ INCX, X, X ADD s0, t0, s0 LD a1, 0 * SIZE(X) fabs a4, t0 SXADDQ INCX, X, X ADD s1, t1, s1 LD a2, 0 * SIZE(X) fabs a5, t1 SXADDQ INCX, X, X ADD s2, t2, s2 LD a3, 0 * SIZE(X) fabs a6, t2 SXADDQ INCX, X, X ADD s3, t3, s3 LD a4, 0 * SIZE(X) fabs a7, t3 SXADDQ INCX, X, X LD a5, 0 * SIZE(X) unop SXADDQ INCX, X, X bne I, $L12 .align 4 $L13: ADD s0, t0, s0 LD a6, 0 * SIZE(X) fabs a0, t0 SXADDQ INCX, X, X ADD s1, t1, s1 LD a7, 0 * SIZE(X) fabs a1, t1 SXADDQ INCX, X, X ADD s2, t2, s2 fabs a2, t2 ADD s3, t3, s3 fabs a3, t3 ADD s0, t0, s0 fabs a4, t0 ADD s1, t1, s1 fabs a5, t1 ADD s2, t2, s2 fabs a6, t2 ADD s3, t3, s3 fabs a7, t3 ADD s1, t1, s1 ADD s2, t2, s2 ADD s3, t3, s3 ADD s0, s1, s0 ADD 
s2, s3, s2 .align 4 $L15: and N, 7, I ADD s0, s2, s0 unop ble I, $L999 .align 4 $L17: ADD s0, t0, s0 LD a0, 0 * SIZE(X) SXADDQ INCX, X, X fabs a0, t0 lda I, -1(I) bne I, $L17 .align 4 $L999: ADD s0, t0, s0 ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/axpy.S000066400000000000000000000214261313527062700166230ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define PREFETCHSIZE 40 PROLOGUE PROFCODE .frame $sp, 16, $26, 0 ldq $24, 0($sp) fmov $f19, $f30 ldl $23, 8($sp) lda $sp, -16($sp) #ifndef PROFILE .prologue 0 #else .prologue 1 #endif nop sra $16, 3, $1 stt $f2, 0($sp) cmpeq $21, 1, $3 stt $f3, 8($sp) cmpeq $23, 1, $4 and $16, 7, $2 ble $16, $End and $3, $4, $3 fbeq $f30, $End beq $3, $Sub ble $1, $Remain .align 4 LD $f10, 0*SIZE($20) LD $f11, 1*SIZE($20) LD $f12, 2*SIZE($20) LD $f13, 3*SIZE($20) LD $f18, 0*SIZE($24) LD $f19, 1*SIZE($24) LD $f20, 2*SIZE($24) LD $f21, 3*SIZE($24) LD $f14, 4*SIZE($20) LD $f15, 5*SIZE($20) LD $f16, 6*SIZE($20) LD $f17, 7*SIZE($20) LD $f22, 4*SIZE($24) LD $f23, 5*SIZE($24) LD $f24, 6*SIZE($24) LD $f25, 7*SIZE($24) subq $1, 1, $1 addq $20, 8*SIZE, $20 unop ble $1, $LoopEnd .align 4 $Loop: ldt $f31, PREFETCHSIZE * SIZE($24) ldl $31, PREFETCHSIZE * SIZE($20) MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 LD $f10, 0*SIZE($20) MUL $f30, $f11, $f27 LD $f11, 1*SIZE($20) MUL $f30, $f12, $f28 LD $f12, 2*SIZE($20) MUL $f30, $f13, $f29 LD $f13, 3*SIZE($20) ADD $f18, $f26, $f0 LD $f18, 8*SIZE($24) MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 LD $f14, 4*SIZE($20) ADD $f19, $f27, $f1 LD $f19, 9*SIZE($24) MUL $f30, $f15, $f27 LD $f15, 5*SIZE($20) ADD $f20, $f28, $f2 LD $f20, 10*SIZE($24) MUL $f30, $f16, $f28 LD $f16, 6*SIZE($20) ADD $f21, $f29, $f3 LD $f21, 11*SIZE($24) MUL $f30, $f17, $f29 LD $f17, 7*SIZE($20) ST $f0, 0*SIZE($24) ADD $f22, $f26, $f0 ST $f1, 1*SIZE($24) ADD $f23, $f27, $f1 ST $f2, 2*SIZE($24) ADD $f24, $f28, $f2 ST $f3, 3*SIZE($24) ADD $f25, $f29, $f3 LD $f22, 12*SIZE($24) LD $f23, 13*SIZE($24) LD $f24, 14*SIZE($24) LD $f25, 15*SIZE($24) ST $f0, 4*SIZE($24) ST $f1, 5*SIZE($24) ST $f2, 6*SIZE($24) ST $f3, 7*SIZE($24) subq $1, 1, $1 addq $24, 8*SIZE, $24 addq $20, 8*SIZE, $20 bgt $1, $Loop .align 4 $LoopEnd: MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 MUL $f30, $f11, $f27 MUL $f30, $f12, $f28 MUL $f30, $f13, $f29 ADD $f18, $f26, $f0 MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 ADD $f19, $f27, $f1 MUL $f30, $f15, $f27 ADD $f20, $f28, $f2 MUL $f30, $f16, $f28 ADD $f21, $f29, $f3 MUL $f30, $f17, $f29 ST $f0, 0*SIZE($24) ADD $f22, $f26, $f0 ST $f1, 1*SIZE($24) ADD $f23, $f27, $f1 ST $f2, 2*SIZE($24) ADD $f24, $f28, $f2 ST $f3, 3*SIZE($24) ADD $f25, $f29, $f3 ST $f0, 4*SIZE($24) ST $f1, 5*SIZE($24) ST $f2, 6*SIZE($24) ST $f3, 7*SIZE($24) addq $24, 8*SIZE, $24 .align 4 $Remain: ble $2, $End .align 4 $RemainLoop: LD $f10, 0*SIZE($20) LD $f11, 0*SIZE($24) addq $20, SIZE, $20 addq $24, SIZE, $24 MUL $f30, $f10, $f12 subq $2, 1, $2 ADD $f11, $f12, $f13 ST $f13, -1*SIZE($24) bgt $2, $RemainLoop .align 4 $End: ldt $f2, 0($sp) ldt $f3, 8($sp) lda $sp, 16($sp) ret .align 4 $Sub: SXSUBL $16, SIZE, $22 subq $1, 1, $4 ble $1, $SubRemain .align 4 LD $f10, 0($20) SXADDQ $21, $20, $20 LD $f11, 0($20) SXADDQ $21, $20, $20 LD $f12, 0($20) SXADDQ $21, $20, $20 LD $f13, 0($20) SXADDQ $21, $20, $20 LD $f18, 0($24) SXADDQ $23, $24, $22 LD $f19, 0($22) SXADDQ $23, $22, $22 LD $f20, 0($22) SXADDQ $23, $22, $22 LD $f21, 0($22) SXADDQ $23, $22, $22 LD $f14, 0($20) SXADDQ $21, $20, $20 LD $f15, 0($20) SXADDQ $21, $20, $20 LD $f16, 0($20) SXADDQ $21, $20, $20 LD $f17, 0($20) SXADDQ $21, $20, $20 LD $f22, 0($22) SXADDQ $23, $22, $22 LD $f23, 0($22) SXADDQ $23, $22, $22 LD $f24, 0($22) SXADDQ $23, $22, $22 LD $f25, 0($22) SXADDQ $23, $22, $22 unop ble $4, $SubLoopEnd .align 4 $SubLoop: MUL $f30, $f10, $f26 # ctemp1 = 
da * atemp1 LD $f10, 0($20) unop SXADDQ $21, $20, $20 MUL $f30, $f11, $f27 LD $f11, 0($20) unop SXADDQ $21, $20, $20 MUL $f30, $f12, $f28 LD $f12, 0($20) unop SXADDQ $21, $20, $20 MUL $f30, $f13, $f29 LD $f13, 0($20) unop SXADDQ $21, $20, $20 ADD $f18, $f26, $f0 MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 LD $f14, 0($20) SXADDQ $21, $20, $20 ADD $f19, $f27, $f1 MUL $f30, $f15, $f27 LD $f15, 0($20) SXADDQ $21, $20, $20 ADD $f20, $f28, $f2 MUL $f30, $f16, $f28 LD $f16, 0($20) SXADDQ $21, $20, $20 ADD $f21, $f29, $f3 MUL $f30, $f17, $f29 LD $f17, 0($20) SXADDQ $21, $20, $20 ST $f0, 0($24) SXADDQ $23, $24, $24 ADD $f22, $f26, $f0 unop ST $f1, 0($24) SXADDQ $23, $24, $24 ADD $f23, $f27, $f1 unop ST $f2, 0($24) SXADDQ $23, $24, $24 ADD $f24, $f28, $f2 unop ST $f3, 0($24) SXADDQ $23, $24, $24 ADD $f25, $f29, $f3 unop LD $f18, 0($22) SXADDQ $23, $22, $22 LD $f19, 0($22) SXADDQ $23, $22, $22 LD $f20, 0($22) SXADDQ $23, $22, $22 LD $f21, 0($22) SXADDQ $23, $22, $22 LD $f22, 0($22) SXADDQ $23, $22, $22 LD $f23, 0($22) SXADDQ $23, $22, $22 LD $f24, 0($22) SXADDQ $23, $22, $22 LD $f25, 0($22) SXADDQ $23, $22, $22 ST $f0, 0($24) SXADDQ $23, $24, $24 ST $f1, 0($24) SXADDQ $23, $24, $24 ST $f2, 0($24) SXADDQ $23, $24, $24 ST $f3, 0($24) SXADDQ $23, $24, $24 subq $4, 1, $4 bgt $4, $SubLoop .align 4 $SubLoopEnd: MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 MUL $f30, $f11, $f27 MUL $f30, $f12, $f28 MUL $f30, $f13, $f29 ADD $f18, $f26, $f0 MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 ADD $f19, $f27, $f1 MUL $f30, $f15, $f27 ADD $f20, $f28, $f2 MUL $f30, $f16, $f28 ADD $f21, $f29, $f3 MUL $f30, $f17, $f29 ST $f0, 0($24) SXADDQ $23, $24, $24 ST $f1, 0($24) SXADDQ $23, $24, $24 ST $f2, 0($24) SXADDQ $23, $24, $24 ST $f3, 0($24) SXADDQ $23, $24, $24 ADD $f22, $f26, $f0 ADD $f23, $f27, $f1 ADD $f24, $f28, $f2 ADD $f25, $f29, $f3 ST $f0, 0($24) SXADDQ $23, $24, $24 ST $f1, 0($24) SXADDQ $23, $24, $24 ST $f2, 0($24) SXADDQ $23, $24, $24 ST $f3, 0($24) SXADDQ $23, $24, $24 .align 4 $SubRemain: ble $2, $SubEnd .align 4 $SubRemainLoop: LD $f10, 0($20) LD $f11, 0($24) SXADDQ $21, $20, $20 MUL $f30, $f10, $f12 subq $2, 1, $2 ADD $f11, $f12, $f13 ST $f13, 0($24) SXADDQ $23, $24, $24 bgt $2, $SubRemainLoop .align 4 $SubEnd: ldt $f2, 0($sp) ldt $f3, 8($sp) lda $sp, 16($sp) ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/cabs.S000066400000000000000000000060171313527062700165510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
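For readers who do not follow Alpha assembly, the kernel that ends just above appears to be the classic AXPY update: it scales the source vector (pointer in $20) by the scalar passed in $f19 (moved into $f30) and adds it into the destination loaded from the stack, with an eight-element unrolled fast path when both increments are 1 ($Loop) and an element-by-element strided path otherwise ($SubLoop). A minimal C sketch of the same arithmetic follows; the function name and the use of double are illustrative assumptions, not code shipped with OpenBLAS.

/* Sketch of the AXPY update performed by the assembly above:
 * y[i] += da * x[i] for n elements, honoring both strides.
 * Hypothetical illustration only. */
#include <stddef.h>

static void axpy_sketch(size_t n, double da,
                        const double *x, ptrdiff_t incx,
                        double *y, ptrdiff_t incy)
{
    if (n == 0 || da == 0.0)      /* the assembly also returns early when da == 0 */
        return;
    for (size_t i = 0; i < n; i++) {
        *y += da * *x;            /* ctemp = da * atemp; y += ctemp */
        x += incx;
        y += incy;
    }
}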
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" .set noat .set noreorder .text .align 5 .globl NAME .ent NAME NAME: .frame $sp, 0, $26, 0 #ifdef PROFILE ldgp $gp, 0($27) lda $28, _mcount jsr $28, ($28), _mcount #endif LD $f10, 0($16) LD $f11, SIZE($16) #ifndef PROFILE .prologue 0 #else .prologue 1 #endif fabs $f10, $f12 fabs $f11, $f0 ADD $f12, $f0, $f0 ret .end NAME .ident VERSION OpenBLAS-0.2.20/kernel/alpha/cnrm2.S000066400000000000000000000171701313527062700166640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
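cabs.S above is the smallest kernel in this directory: it loads the real and imaginary parts of its complex argument, takes the absolute value of each, and returns their sum in $f0. That matches the BLAS "cabs1" convention (|Re| + |Im|) rather than the true complex modulus, so no square root is needed. A one-line C sketch, with double assumed for illustration (the actual element width depends on the SIZE/LD macros):

/* What cabs.S computes: the BLAS-style cabs1 value |Re(z)| + |Im(z)|.
 * Illustrative sketch only. */
#include <math.h>

static double cabs1_sketch(const double z[2])   /* z[0] = re, z[1] = im */
{
    return fabs(z[0]) + fabs(z[1]);
}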
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define PREFETCH_SIZE 80 #define N $16 #define X $17 #define INCX $18 #define XX $19 #define I $0 #define a0 $f0 #define a1 $f1 #define a2 $f10 #define a3 $f11 #define t0 $f12 #define t1 $f13 #define t2 $f14 #define t3 $f15 #define x0 $f16 #define x1 $f17 #define x2 $f18 #define x3 $f19 #define x4 $f20 #define x5 $f21 #define x6 $f22 #define x7 $f23 PROLOGUE #if defined(EV4) || defined(EV5) .frame $30,16,$26,0 .mask 0x4000000,-16 ldah $29, 0($27) !gpdisp!1 lda $29, 0($29) !gpdisp!1 lda $sp, -16($sp) ldq $27, sqrt($29) !literal!2 stq $26, 0($sp) PROFCODE .prologue 1 #else PROFCODE #endif fclr a0 sll INCX, ZBASE_SHIFT, INCX fclr a1 ble N, $L999 fclr a2 cmpeq INCX, 2 * SIZE, $0 fclr a3 beq $0, $L20 fclr t0 sra N, 3, I fclr t1 ble I, $L15 fclr t2 LD x0, 0 * SIZE(X) fclr t3 LD x1, 1 * SIZE(X) LD x2, 2 * SIZE(X) LD x3, 3 * SIZE(X) LD x4, 4 * SIZE(X) LD x5, 5 * SIZE(X) LD x6, 6 * SIZE(X) LD x7, 7 * SIZE(X) lda I, -1(I) ble I, $L12 .align 4 $L11: addt a0, t0, a0 ldl $31, (PREFETCH_SIZE) * SIZE(X) mult x0, x0, t0 LD x0, 8 * SIZE(X) addt a1, t1, a1 mov X, XX mult x1, x1, t1 LD x1, 9 * SIZE(X) addt a2, t2, a2 unop mult x2, x2, t2 LD x2, 10 * SIZE(X) addt a3, t3, a3 unop mult x3, x3, t3 LD x3, 11 * SIZE(X) addt a0, t0, a0 unop mult x4, x4, t0 LD x4, 12 * SIZE(X) addt a1, t1, a1 unop mult x5, x5, t1 LD x5, 13 * SIZE(X) addt a2, t2, a2 unop mult x6, x6, t2 LD x6, 14 * SIZE(X) addt a3, t3, a3 unop mult x7, x7, t3 LD x7, 15 * SIZE(X) addt a0, t0, a0 unop mult x0, x0, t0 LD x0, 16 * SIZE(X) addt a1, t1, a1 lda X, 16 * SIZE(X) mult x1, x1, t1 LD x1, 17 * SIZE(XX) addt a2, t2, a2 unop mult x2, x2, t2 LD x2, 18 * SIZE(XX) addt a3, t3, a3 unop mult x3, x3, t3 LD x3, 19 * SIZE(XX) addt a0, t0, a0 unop mult x4, x4, t0 LD x4, 20 * SIZE(XX) addt a1, t1, a1 lda I, -1(I) mult x5, x5, t1 LD x5, 21 * SIZE(XX) addt a2, t2, a2 unop mult x6, x6, t2 LD x6, 22 * SIZE(XX) addt a3, t3, a3 mult x7, x7, t3 LD x7, 23 * SIZE(XX) bgt I, $L11 .align 4 $L12: addt a0, t0, a0 mov X, XX mult x0, x0, t0 LD x0, 8 * SIZE(X) addt a1, t1, a1 unop mult x1, x1, t1 LD x1, 9 * SIZE(X) addt a2, t2, a2 unop mult x2, x2, t2 LD x2, 10 * SIZE(X) addt a3, t3, a3 unop mult x3, x3, t3 LD x3, 11 * SIZE(X) addt a0, t0, a0 unop mult x4, x4, t0 LD x4, 12 * SIZE(XX) addt a1, t1, a1 unop mult x5, x5, t1 LD x5, 13 * SIZE(XX) addt a2, t2, a2 unop mult x6, x6, t2 LD x6, 14 * SIZE(XX) addt a3, t3, a3 lda X, 16 * SIZE(X) mult x7, x7, t3 LD x7, 15 * SIZE(XX) addt a0, t0, a0 mult x0, x0, t0 addt a1, t1, a1 mult x1, x1, t1 addt a2, t2, a2 mult x2, x2, t2 addt a3, t3, a3 mult x3, x3, t3 addt a0, t0, a0 mult x4, x4, t0 addt a1, t1, a1 mult x5, x5, t1 addt a2, t2, a2 mult x6, x6, t2 addt a3, t3, a3 mult x7, x7, t3 addt a2, t2, a2 addt a3, t3, a3 .align 4 $L15: and N, 7, I ble I, $L998 .align 4 $L16: LD x0, 0 * SIZE(X) LD x1, 1 * SIZE(X) lda X, 2 * SIZE(X) addt a0, t0, a0 mult x0, x0, t0 addt a1, t1, a1 mult x1, x1, t1 lda I, -1(I) bgt I, $L16 bsr $31, $L998 .align 4 $L20: fclr t0 sra N, 2, I fclr t1 ble I, $L25 LD x0, 0 * SIZE(X) fclr t2 LD x1, 1 * SIZE(X) addq X, INCX, X LD x2, 0 * SIZE(X) fclr t3 LD x3, 1 * SIZE(X) addq X, INCX, X LD x4, 0 * SIZE(X) lda I, -1(I) LD x5, 1 * SIZE(X) addq X, INCX, X LD x6, 0 * SIZE(X) ble I, $L22 .align 4 $L21: addt a0, t0, a0 LD x7, 1 * SIZE(X) mult x0, x0, t0 addq X, INCX, X addt a1, t1, a1 LD x0, 0 * SIZE(X) mult x1, x1, t1 unop addt a2, t2, a2 LD x1, 1 * SIZE(X) mult x2, x2, t2 addq X, INCX, X addt 
a3, t3, a3 LD x2, 0 * SIZE(X) mult x3, x3, t3 unop addt a0, t0, a0 LD x3, 1 * SIZE(X) mult x4, x4, t0 addq X, INCX, X addt a1, t1, a1 LD x4, 0 * SIZE(X) mult x5, x5, t1 lda I, -1(I) addt a2, t2, a2 LD x5, 1 * SIZE(X) mult x6, x6, t2 addq X, INCX, X addt a3, t3, a3 LD x6, 0 * SIZE(X) mult x7, x7, t3 bgt I, $L21 .align 4 $L22: addt a0, t0, a0 LD x7, 1 * SIZE(X) mult x0, x0, t0 addq X, INCX, X addt a1, t1, a1 mult x1, x1, t1 addt a2, t2, a2 mult x2, x2, t2 addt a3, t3, a3 mult x3, x3, t3 addt a0, t0, a0 mult x4, x4, t0 addt a1, t1, a1 mult x5, x5, t1 addt a2, t2, a2 mult x6, x6, t2 addt a3, t3, a3 mult x7, x7, t3 addt a2, t2, a2 addt a3, t3, a3 .align 4 $L25: and N, 3, I ble I, $L998 .align 4 $L26: LD x0, 0 * SIZE(X) lda I, -1(I) LD x1, 1 * SIZE(X) addq X, INCX, X addt a0, t0, a0 mult x0, x0, t0 addt a1, t1, a1 mult x1, x1, t1 bgt I, $L26 .align 4 $L998: addt a0, t0, a0 addt a1, t1, a1 addt a0, a1, a0 addt a2, a3, a2 #if defined(EV4) || defined(EV5) addt a0, a2, $f16 jsr $26, ($27), sqrt !lituse_jsr!2 ldah $29, 0($26) !gpdisp!3 lda $29, 0($29) !gpdisp!3 #else addt a0, a2, a0 sqrtt a0, a0 #endif .align 4 $L999: #if defined(EV4) || defined(EV5) ldq $26, 0($sp) lda $sp, 16($sp) #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/copy.S000066400000000000000000000167321313527062700166200ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
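cnrm2.S above computes the Euclidean norm of a complex vector as a straight sum of squares of all 2*n real components followed by a single square root; four accumulator/temporary pairs (a0..a3 with t0..t3) are interleaved so multiplies and adds from different iterations can overlap. A compact C sketch of the same computation is given below, with double and the function name as illustrative assumptions; note that, exactly like the assembly, it accumulates raw squares with no intermediate overflow scaling.

/* Sketch of cnrm2.S: sqrt of the plain sum of squares of the 2*n components
 * of a complex vector; two of the four partial sums are shown.
 * Illustrative only. */
#include <math.h>
#include <stddef.h>

static double cnrm2_sketch(size_t n, const double *x, ptrdiff_t incx)
{
    double s0 = 0.0, s1 = 0.0;      /* the assembly keeps four such sums */
    for (size_t i = 0; i < n; i++) {
        s0 += x[0] * x[0];          /* real part squared      */
        s1 += x[1] * x[1];          /* imaginary part squared */
        x  += 2 * incx;             /* stride counted in complex elements */
    }
    return sqrt(s0 + s1);
}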
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define N $16 #define X $17 #define INCX $18 #define Y $19 #define INCY $20 PROLOGUE PROFCODE .frame $sp, 0, $26, 0 #ifndef PROFILE .prologue 0 #else .prologue 1 #endif cmpeq INCX, 1, $0 ble N, $End #ifndef COMPLEX sra N, 4, $4 #else sra N, 3, $4 #endif cmpeq INCY, 1, $1 and $0, $1, $0 beq $0, $Sub #ifndef COMPLEX and N, 15, $5 #else and N, 7, $5 #endif ble $4, $Remain LD $f10, 0*SIZE(X) LD $f11, 1*SIZE(X) LD $f12, 2*SIZE(X) LD $f13, 3*SIZE(X) LD $f14, 4*SIZE(X) LD $f15, 5*SIZE(X) LD $f16, 6*SIZE(X) LD $f17, 7*SIZE(X) LD $f18, 8*SIZE(X) LD $f19, 9*SIZE(X) LD $f20, 10*SIZE(X) LD $f21, 11*SIZE(X) LD $f22, 12*SIZE(X) LD $f23, 13*SIZE(X) LD $f24, 14*SIZE(X) LD $f25, 15*SIZE(X) subq $4, 1, $4 lda X, 16*SIZE(X) ble $4, $MainLoopEnd .align 4 $MainLoop: ST $f10, 0*SIZE(Y) ST $f11, 1*SIZE(Y) ST $f12, 2*SIZE(Y) ST $f13, 3*SIZE(Y) LD $f10, 0*SIZE(X) LD $f11, 1*SIZE(X) LD $f12, 2*SIZE(X) LD $f13, 3*SIZE(X) ST $f14, 4*SIZE(Y) ST $f15, 5*SIZE(Y) ST $f16, 6*SIZE(Y) ST $f17, 7*SIZE(Y) LD $f14, 4*SIZE(X) LD $f15, 5*SIZE(X) LD $f16, 6*SIZE(X) LD $f17, 7*SIZE(X) ST $f18, 8*SIZE(Y) ST $f19, 9*SIZE(Y) ST $f20, 10*SIZE(Y) ST $f21, 11*SIZE(Y) LD $f18, 8*SIZE(X) LD $f19, 9*SIZE(X) LD $f20, 10*SIZE(X) LD $f21, 11*SIZE(X) ST $f22, 12*SIZE(Y) ST $f23, 13*SIZE(Y) ST $f24, 14*SIZE(Y) ST $f25, 15*SIZE(Y) LD $f22, 12*SIZE(X) LD $f23, 13*SIZE(X) LD $f24, 14*SIZE(X) LD $f25, 15*SIZE(X) subq $4, 1, $4 lda Y, 16*SIZE(Y) lda X, 16*SIZE(X) bgt $4, $MainLoop .align 4 $MainLoopEnd: ST $f10, 0*SIZE(Y) ST $f11, 1*SIZE(Y) ST $f12, 2*SIZE(Y) ST $f13, 3*SIZE(Y) ST $f14, 4*SIZE(Y) ST $f15, 5*SIZE(Y) ST $f16, 6*SIZE(Y) ST $f17, 7*SIZE(Y) ST $f18, 8*SIZE(Y) ST $f19, 9*SIZE(Y) ST $f20, 10*SIZE(Y) ST $f21, 11*SIZE(Y) ST $f22, 12*SIZE(Y) ST $f23, 13*SIZE(Y) ST $f24, 14*SIZE(Y) ST $f25, 15*SIZE(Y) lda Y, 16*SIZE(Y) .align 4 $Remain: ble $5, $End .align 4 $RemainLoop: #ifndef COMPLEX LD $f10, 0*SIZE(X) lda X, 1*SIZE(X) ST $f10, 0*SIZE(Y) lda Y, 1*SIZE(Y) #else LD $f10, 0*SIZE(X) LD $f11, 1*SIZE(X) lda X, 2*SIZE(X) ST $f10, 0*SIZE(Y) ST $f11, 1*SIZE(Y) lda Y, 2*SIZE(Y) #endif subq $5, 1, $5 bgt $5, $RemainLoop .align 4 $End: ret .align 4 $Sub: #ifdef COMPLEX addq INCX, INCX, INCX addq INCY, INCY, INCY and N, 7, $5 #else and N, 15, $5 #endif ble $4, $SubRemain .align 4 $SubMainLoop: #ifndef COMPLEX LD $f10, 0(X) SXADDQ INCX, X, X LD $f11, 0(X) SXADDQ INCX, X, X LD $f12, 0(X) SXADDQ INCX, X, X LD $f13, 0(X) SXADDQ INCX, X, X LD $f14, 0(X) SXADDQ INCX, X, X LD $f15, 0(X) SXADDQ INCX, X, X LD $f16, 0(X) SXADDQ INCX, X, X LD $f17, 0(X) SXADDQ INCX, X, X LD $f18, 0(X) SXADDQ INCX, X, X LD $f19, 0(X) SXADDQ INCX, X, X LD $f20, 0(X) SXADDQ INCX, X, X LD $f21, 0(X) SXADDQ INCX, X, X LD $f22, 0(X) SXADDQ INCX, X, X LD $f23, 0(X) SXADDQ INCX, X, X LD $f24, 0(X) SXADDQ INCX, X, X LD $f25, 0(X) SXADDQ INCX, X, X ST $f10, 0(Y) SXADDQ INCY, Y, Y ST $f11, 0(Y) SXADDQ INCY, Y, Y ST $f12, 0(Y) SXADDQ INCY, Y, Y ST $f13, 0(Y) SXADDQ INCY, Y, Y ST $f14, 0(Y) SXADDQ INCY, Y, Y ST $f15, 0(Y) SXADDQ INCY, Y, Y ST $f16, 0(Y) SXADDQ INCY, Y, Y ST $f17, 0(Y) SXADDQ INCY, Y, Y ST $f18, 0(Y) SXADDQ INCY, Y, Y ST $f19, 0(Y) SXADDQ INCY, Y, Y ST $f20, 0(Y) SXADDQ INCY, Y, Y ST $f21, 0(Y) SXADDQ INCY, Y, Y ST $f22, 0(Y) SXADDQ INCY, Y, Y ST $f23, 0(Y) SXADDQ INCY, Y, Y ST $f24, 0(Y) SXADDQ INCY, Y, Y ST $f25, 0(Y) SXADDQ INCY, Y, Y #else LD $f10, 0(X) LD $f11, SIZE(X) SXADDQ INCX, X, X LD $f12, 0(X) LD $f13, SIZE(X) SXADDQ INCX, X, X LD 
$f14, 0(X) LD $f15, SIZE(X) SXADDQ INCX, X, X LD $f16, 0(X) LD $f17, SIZE(X) SXADDQ INCX, X, X LD $f18, 0(X) LD $f19, SIZE(X) SXADDQ INCX, X, X LD $f20, 0(X) LD $f21, SIZE(X) SXADDQ INCX, X, X LD $f22, 0(X) LD $f23, SIZE(X) SXADDQ INCX, X, X LD $f24, 0(X) LD $f25, SIZE(X) SXADDQ INCX, X, X ST $f10, 0(Y) ST $f11, SIZE(Y) SXADDQ INCY, Y, Y ST $f12, 0(Y) ST $f13, SIZE(Y) SXADDQ INCY, Y, Y ST $f14, 0(Y) ST $f15, SIZE(Y) SXADDQ INCY, Y, Y ST $f16, 0(Y) ST $f17, SIZE(Y) SXADDQ INCY, Y, Y ST $f18, 0(Y) ST $f19, SIZE(Y) SXADDQ INCY, Y, Y ST $f20, 0(Y) ST $f21, SIZE(Y) SXADDQ INCY, Y, Y ST $f22, 0(Y) ST $f23, SIZE(Y) SXADDQ INCY, Y, Y ST $f24, 0(Y) ST $f25, SIZE(Y) SXADDQ INCY, Y, Y #endif subq $4, 1, $4 bgt $4, $SubMainLoop .align 4 $SubRemain: ble $5, $SubEnd .align 4 $SubRemainLoop: #ifndef COMPLEX LD $f10, 0(X) SXADDQ INCX, X, X ST $f10, 0(Y) SXADDQ INCY, Y, Y #else LD $f10, 0(X) LD $f11, SIZE(X) SXADDQ INCX, X, X ST $f10, 0(Y) ST $f11, SIZE(Y) SXADDQ INCY, Y, Y #endif subq $5, 1, $5 bgt $5, $SubRemainLoop .align 4 $SubEnd: ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/cscal.S000066400000000000000000000125301313527062700167230ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
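copy.S above is the vector-copy kernel: when both increments are 1 it streams 16 real (or 8 complex) elements per unrolled iteration, otherwise it falls back to one element (one real/imaginary pair in the COMPLEX build) per step with SXADDQ pointer updates. Stripped of the unrolling, the operation reduces to the following sketch; the real double case is an illustrative assumption.

/* Sketch of copy.S: y[i] = x[i] for n entries, honoring both strides.
 * The COMPLEX build copies two values per logical element.
 * Illustrative only. */
#include <stddef.h>

static void copy_sketch(size_t n, const double *x, ptrdiff_t incx,
                        double *y, ptrdiff_t incy)
{
    for (size_t i = 0; i < n; i++) {
        *y = *x;
        x += incx;
        y += incy;
    }
}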
*/ /*********************************************************************/ .set noat .set noreorder #define ASSEMBLER #include "common.h" #include "version.h" .globl NAME .ent NAME NAME: #ifdef PROFILE ldgp $gp, 0($27) lda $28, _mcount jsr $28, ($28), _mcount #endif #ifndef C_INTERFACE ldl $16, 0($16) # n mov $18, $20 # Store Address ldl $19, 0($19) # incx nop LD $f1, 0($17) # alpha #else mov $18, $20 # Store Address fmov $f17, $f1 # alpha #endif #ifndef PROFILE .prologue 0 #else .prologue 1 #endif sra $16, 1, $21 # 4-unrolling ble $16, $End lda $23, -1($19) ble $19, $End bgt $23, $INC_NOT_1 .align 4 ble $21, $Sub lda $21, -1($21) LD $f10, 0*SIZE($18) LD $f11, 1*SIZE($18) LD $f12, 2*SIZE($18) LD $f13, 3*SIZE($18) lda $18, 4*SIZE($18) ble $21, $MainRemain .align 4 $MainLoop: MUL $f10, $f1, $f20 LD $f10, 0*SIZE($18) MUL $f11, $f1, $f21 LD $f11, 1*SIZE($18) MUL $f12, $f1, $f22 LD $f12, 2*SIZE($18) MUL $f13, $f1, $f23 LD $f13, 3*SIZE($18) lda $18, 4*SIZE($18) lda $21, -1($21) ST $f20, 0*SIZE($20) ST $f21, 1*SIZE($20) ST $f22, 2*SIZE($20) ST $f23, 3*SIZE($20) lda $20, 4*SIZE($20) bgt $21, $MainLoop .align 4 $MainRemain: MUL $f10, $f1, $f20 MUL $f11, $f1, $f21 MUL $f12, $f1, $f22 MUL $f13, $f1, $f23 ST $f20, 0*SIZE($20) ST $f21, 1*SIZE($20) ST $f22, 2*SIZE($20) ST $f23, 3*SIZE($20) lda $20, 4*SIZE($20) .align 4 $Sub: blbc $16, $End LD $f10, 0*SIZE($18) LD $f11, 1*SIZE($18) MUL $f10, $f1, $f20 MUL $f11, $f1, $f21 ST $f20, 0*SIZE($20) ST $f21, 1*SIZE($20) .align 4 $End: ret .align 4 $INC_NOT_1: addl $19, $19, $19 ble $21, $INC_Sub lda $21, -1($21) LD $f10, 0*SIZE($18) LD $f11, 1*SIZE($18) SXADDQ $19, $18, $18 LD $f12, 0*SIZE($18) LD $f13, 1*SIZE($18) SXADDQ $19, $18, $18 ble $21, $INC_MainRemain .align 4 $INC_MainLoop: MUL $f10, $f1, $f20 LD $f10, 0*SIZE($18) MUL $f11, $f1, $f21 LD $f11, 1*SIZE($18) SXADDQ $19, $18, $18 MUL $f12, $f1, $f22 LD $f12, 0*SIZE($18) MUL $f13, $f1, $f23 LD $f13, 1*SIZE($18) SXADDQ $19, $18, $18 ST $f20, 0*SIZE($20) lda $21, -1($21) ST $f21, 1*SIZE($20) SXADDQ $19, $20, $20 ST $f22, 0*SIZE($20) ST $f23, 1*SIZE($20) SXADDQ $19, $20, $20 unop bgt $21, $INC_MainLoop .align 4 $INC_MainRemain: MUL $f10, $f1, $f20 MUL $f11, $f1, $f21 MUL $f12, $f1, $f22 MUL $f13, $f1, $f23 ST $f20, 0*SIZE($20) ST $f21, 1*SIZE($20) SXADDQ $19, $20, $20 ST $f22, 0*SIZE($20) ST $f23, 1*SIZE($20) SXADDQ $19, $20, $20 .align 4 $INC_Sub: blbc $16, $INC_End LD $f10, 0*SIZE($18) LD $f11, 1*SIZE($18) MUL $f10, $f1, $f20 MUL $f11, $f1, $f21 ST $f20, 0*SIZE($20) ST $f21, 1*SIZE($20) .align 4 $INC_End: ret .end NAME .ident VERSION OpenBLAS-0.2.20/kernel/alpha/dnrm2.S000066400000000000000000000172331313527062700166650ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
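cscal.S above scales a complex vector by a single real factor: as the visible code shows, one scalar (loaded into $f1) multiplies both the real and the imaginary component of every element, two complex elements per unrolled step on the unit-stride path and per iteration on the strided path. A C sketch of that loop structure follows; float and the function name are illustrative assumptions.

/* Sketch of the scaling loop in cscal.S: both components of each complex
 * element are multiplied by one real alpha.  Illustrative only. */
#include <stddef.h>

static void cscal_real_alpha_sketch(size_t n, float alpha,
                                    float *x, ptrdiff_t incx)
{
    for (size_t i = 0; i < n; i++) {
        x[0] *= alpha;          /* real part      */
        x[1] *= alpha;          /* imaginary part */
        x += 2 * incx;          /* stride counted in complex elements */
    }
}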
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define PREFETCH_SIZE 80 #define N $16 #define X $17 #define INCX $18 #define XX $19 #define I $0 #define a0 $f0 #define a1 $f1 #define a2 $f10 #define a3 $f11 #define t0 $f12 #define t1 $f13 #define t2 $f14 #define t3 $f15 #define x0 $f16 #define x1 $f17 #define x2 $f18 #define x3 $f19 #define x4 $f20 #define x5 $f21 #define x6 $f22 #define x7 $f23 PROLOGUE #if defined(EV4) || defined(EV5) .frame $30,16,$26,0 .mask 0x4000000,-16 ldah $29, 0($27) !gpdisp!1 lda $29, 0($29) !gpdisp!1 lda $sp, -16($sp) ldq $27, sqrt($29) !literal!2 stq $26, 0($sp) PROFCODE .prologue 1 #else PROFCODE #endif fclr a0 SXADDQ INCX, 0, INCX fclr a1 ble N, $L999 fclr a2 cmpeq INCX, SIZE, $0 fclr a3 beq $0, $L20 fclr t0 sra N, 4, I fclr t1 ble I, $L15 fclr t2 LD x0, 0 * SIZE(X) fclr t3 LD x1, 1 * SIZE(X) LD x2, 2 * SIZE(X) LD x3, 3 * SIZE(X) LD x4, 4 * SIZE(X) LD x5, 5 * SIZE(X) LD x6, 6 * SIZE(X) LD x7, 7 * SIZE(X) lda I, -1(I) ble I, $L12 .align 4 $L11: addt a0, t0, a0 ldl $31, (PREFETCH_SIZE) * SIZE(X) mult x0, x0, t0 LD x0, 8 * SIZE(X) addt a1, t1, a1 mov X, XX mult x1, x1, t1 LD x1, 9 * SIZE(X) addt a2, t2, a2 unop mult x2, x2, t2 LD x2, 10 * SIZE(X) addt a3, t3, a3 unop mult x3, x3, t3 LD x3, 11 * SIZE(X) addt a0, t0, a0 unop mult x4, x4, t0 LD x4, 12 * SIZE(X) addt a1, t1, a1 unop mult x5, x5, t1 LD x5, 13 * SIZE(X) addt a2, t2, a2 unop mult x6, x6, t2 LD x6, 14 * SIZE(X) addt a3, t3, a3 unop mult x7, x7, t3 LD x7, 15 * SIZE(X) addt a0, t0, a0 unop mult x0, x0, t0 LD x0, 16 * SIZE(X) addt a1, t1, a1 lda X, 16 * SIZE(X) mult x1, x1, t1 LD x1, 17 * SIZE(XX) addt a2, t2, a2 unop mult x2, x2, t2 LD x2, 18 * SIZE(XX) addt a3, t3, a3 unop mult x3, x3, t3 LD x3, 19 * SIZE(XX) addt a0, t0, a0 unop mult x4, x4, t0 LD x4, 20 * SIZE(XX) addt a1, t1, a1 lda I, -1(I) mult x5, x5, t1 LD x5, 21 * SIZE(XX) addt a2, t2, a2 unop mult x6, x6, t2 LD x6, 22 * SIZE(XX) addt a3, t3, a3 mult x7, x7, t3 LD x7, 23 * SIZE(XX) bgt I, $L11 .align 4 $L12: addt a0, t0, a0 mov X, XX mult x0, x0, t0 LD x0, 8 * SIZE(X) addt a1, t1, a1 unop mult x1, x1, t1 LD x1, 9 * SIZE(X) addt a2, t2, a2 unop mult x2, x2, t2 LD x2, 10 * SIZE(X) addt a3, t3, a3 unop mult x3, x3, t3 LD x3, 11 * SIZE(X) addt a0, t0, a0 unop mult x4, x4, t0 LD x4, 12 * SIZE(XX) addt a1, t1, a1 unop mult x5, x5, t1 LD x5, 13 * SIZE(XX) addt a2, t2, a2 unop 
mult x6, x6, t2 LD x6, 14 * SIZE(XX) addt a3, t3, a3 lda X, 16 * SIZE(X) mult x7, x7, t3 LD x7, 15 * SIZE(XX) addt a0, t0, a0 mult x0, x0, t0 addt a1, t1, a1 mult x1, x1, t1 addt a2, t2, a2 mult x2, x2, t2 addt a3, t3, a3 mult x3, x3, t3 addt a0, t0, a0 mult x4, x4, t0 addt a1, t1, a1 mult x5, x5, t1 addt a2, t2, a2 mult x6, x6, t2 addt a3, t3, a3 mult x7, x7, t3 addt a1, t1, a1 addt a2, t2, a2 addt a3, t3, a3 .align 4 $L15: and N, 15, I ble I, $L998 .align 4 $L16: LD x0, 0 * SIZE(X) lda X, 1 * SIZE(X) addt a0, t0, a0 mult x0, x0, t0 lda I, -1(I) bgt I, $L16 bsr $31, $L998 .align 4 $L20: fclr t0 sra N, 3, I fclr t1 ble I, $L25 fclr t2 fclr t3 LD x0, 0 * SIZE(X) addq X, INCX, X LD x1, 0 * SIZE(X) addq X, INCX, X LD x2, 0 * SIZE(X) addq X, INCX, X LD x3, 0 * SIZE(X) addq X, INCX, X LD x4, 0 * SIZE(X) addq X, INCX, X LD x5, 0 * SIZE(X) addq X, INCX, X LD x6, 0 * SIZE(X) addq X, INCX, X lda I, -1(I) ble I, $L22 .align 4 $L21: addt a0, t0, a0 LD x7, 0 * SIZE(X) mult x0, x0, t0 addq X, INCX, X addt a1, t1, a1 LD x0, 0 * SIZE(X) mult x1, x1, t1 addq X, INCX, X addt a2, t2, a2 LD x1, 0 * SIZE(X) mult x2, x2, t2 addq X, INCX, X addt a3, t3, a3 LD x2, 0 * SIZE(X) mult x3, x3, t3 addq X, INCX, X addt a0, t0, a0 LD x3, 0 * SIZE(X) mult x4, x4, t0 addq X, INCX, X addt a1, t1, a1 LD x4, 0 * SIZE(X) mult x5, x5, t1 addq X, INCX, X addt a2, t2, a2 LD x5, 0 * SIZE(X) mult x6, x6, t2 addq X, INCX, X addt a3, t3, a3 LD x6, 0 * SIZE(X) mult x7, x7, t3 addq X, INCX, X lda I, -1(I) bgt I, $L21 .align 4 $L22: addt a0, t0, a0 LD x7, 0 * SIZE(X) mult x0, x0, t0 addq X, INCX, X addt a1, t1, a1 unop mult x1, x1, t1 unop addt a2, t2, a2 mult x2, x2, t2 addt a3, t3, a3 mult x3, x3, t3 addt a0, t0, a0 mult x4, x4, t0 addt a1, t1, a1 mult x5, x5, t1 addt a2, t2, a2 mult x6, x6, t2 addt a3, t3, a3 mult x7, x7, t3 addt a1, t1, a1 addt a2, t2, a2 addt a3, t3, a3 .align 4 $L25: and N, 7, I ble I, $L998 .align 4 $L26: LD x0, 0 * SIZE(X) addq X, INCX, X addt a0, t0, a0 mult x0, x0, t0 lda I, -1(I) bgt I, $L26 .align 4 $L998: addt a0, t0, a0 addt a0, a1, a0 addt a2, a3, a2 #if defined(EV4) || defined(EV5) addt a0, a2, $f16 jsr $26, ($27), sqrt !lituse_jsr!2 ldah $29, 0($26) !gpdisp!3 lda $29, 0($29) !gpdisp!3 #else addt a0, a2, a0 sqrtt a0, a0 #endif .align 4 $L999: #if defined(EV4) || defined(EV5) ldq $26, 0($sp) lda $sp, 16($sp) #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/dot.S000066400000000000000000000224731313527062700164330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define PREFETCHSIZE 88 #define N $16 #define X $17 #define INCX $18 #define Y $19 #define INCY $20 #define I $5 #define s0 $f0 #define s1 $f30 #define s2 $f1 #define s3 $f2 #define a0 $f10 #define a1 $f11 #define a2 $f12 #define a3 $f13 #define a4 $f14 #define a5 $f15 #define a6 $f16 #define a7 $f17 #define b0 $f18 #define b1 $f19 #define b2 $f20 #define b3 $f21 #define b4 $f22 #define b5 $f23 #define b6 $f24 #define b7 $f25 #define t0 $f26 #define t1 $f27 #define t2 $f28 #define t3 $f29 PROLOGUE PROFCODE .frame $sp, 16, $26, 0 lda $sp, -16($sp) fclr s0 stt $f2, 0($sp) fclr s1 fclr s2 nop fclr s3 ble N, $L999 fclr t0 cmpeq INCX, 1, $21 fclr t1 cmpeq INCY, 1, $22 fclr t2 and $21, $22, $22 fclr t3 beq $22, $L20 #ifndef DOUBLE srl N, 4, I ble I, $L15 LD a0, 0 * SIZE(X) LD a1, 1 * SIZE(X) LD b0, 0 * SIZE(Y) LD b1, 1 * SIZE(Y) LD a2, 2 * SIZE(X) LD a3, 3 * SIZE(X) LD b2, 2 * SIZE(Y) LD b3, 3 * SIZE(Y) LD a4, 4 * SIZE(X) LD a5, 5 * SIZE(X) LD b4, 4 * SIZE(Y) LD b5, 5 * SIZE(Y) LD a6, 6 * SIZE(X) LD a7, 7 * SIZE(X) addq X, 16 * SIZE, X subq I, 1, I addq Y, 16 * SIZE, Y ble I, $L13 .align 4 $L12: ldl $31, PREFETCHSIZE * 2 * SIZE(X) subq I, 1, I ldl $31, PREFETCHSIZE * 2 * SIZE(Y) addq X, 16 * SIZE, X ADD s0, t0, s0 LD b6, -10 * SIZE(Y) MUL a0, b0, t0 LD b7, -9 * SIZE(Y) ADD s1, t1, s1 LD a0, -24 * SIZE(X) MUL a1, b1, t1 LD a1, -23 * SIZE(X) ADD s2, t2, s2 LD b0, -8 * SIZE(Y) MUL a2, b2, t2 LD b1, -7 * SIZE(Y) ADD s3, t3, s3 LD a2, -22 * SIZE(X) MUL a3, b3, t3 LD a3, -21 * SIZE(X) ADD s0, t0, s0 LD b2, -6 * SIZE(Y) MUL a4, b4, t0 LD b3, -5 * SIZE(Y) ADD s1, t1, s1 LD a4, -20 * SIZE(X) MUL a5, b5, t1 LD a5, -19 * SIZE(X) ADD s2, t2, s2 LD b4, -4 * SIZE(Y) MUL a6, b6, t2 LD b5, -3 * SIZE(Y) ADD s3, t3, s3 LD a6, -18 * SIZE(X) MUL a7, b7, t3 LD a7, -17 * SIZE(X) ADD s0, t0, s0 LD b6, -2 * SIZE(Y) MUL a0, b0, t0 LD b7, -1 * SIZE(Y) ADD s1, t1, s1 LD a0, -16 * SIZE(X) MUL a1, b1, t1 LD a1, -15 * SIZE(X) ADD s2, t2, s2 LD b0, 0 * SIZE(Y) MUL a2, b2, t2 LD b1, 1 * SIZE(Y) ADD s3, t3, s3 LD a2, -14 * SIZE(X) MUL a3, b3, t3 LD a3, -13 * SIZE(X) ADD s0, t0, s0 LD b2, 2 * SIZE(Y) MUL a4, b4, t0 LD b3, 3 * SIZE(Y) ADD s1, t1, s1 LD a4, -12 * SIZE(X) MUL a5, b5, t1 LD a5, -11 * SIZE(X) ADD s2, t2, s2 LD b4, 4 * SIZE(Y) MUL a6, b6, t2 LD b5, 5 * SIZE(Y) ADD s3, t3, s3 LD a6, -10 * SIZE(X) MUL a7, b7, t3 LD a7, -9 * SIZE(X) addq Y, 16 * SIZE, Y bgt I, $L12 nop fnop .align 4 $L13: ADD s0, t0, s0 LD b6,-10 * SIZE(Y) MUL a0, b0, t0 LD b7, -9 * SIZE(Y) ADD s1, t1, s1 LD a0, -8 * SIZE(X) MUL a1, b1, t1 LD a1, -7 * SIZE(X) ADD s2, t2, s2 LD b0, -8 * SIZE(Y) MUL 
a2, b2, t2 LD b1, -7 * SIZE(Y) ADD s3, t3, s3 LD a2, -6 * SIZE(X) MUL a3, b3, t3 LD a3, -5 * SIZE(X) ADD s0, t0, s0 LD b2, -6 * SIZE(Y) MUL a4, b4, t0 LD b3, -5 * SIZE(Y) ADD s1, t1, s1 LD a4, -4 * SIZE(X) MUL a5, b5, t1 LD a5, -3 * SIZE(X) ADD s2, t2, s2 LD b4, -4 * SIZE(Y) MUL a6, b6, t2 LD b5, -3 * SIZE(Y) ADD s3, t3, s3 LD a6, -2 * SIZE(X) MUL a7, b7, t3 LD a7, -1 * SIZE(X) ADD s0, t0, s0 LD b6, -2 * SIZE(Y) MUL a0, b0, t0 LD b7, -1 * SIZE(Y) ADD s1, t1, s1 MUL a1, b1, t1 ADD s2, t2, s2 MUL a2, b2, t2 ADD s3, t3, s3 MUL a3, b3, t3 ADD s0, t0, s0 MUL a4, b4, t0 ADD s1, t1, s1 MUL a5, b5, t1 ADD s2, t2, s2 MUL a6, b6, t2 ADD s3, t3, s3 MUL a7, b7, t3 .align 4 $L15: ADD s0, t0, s0 and N, 15, I ADD s1, t1, s1 ble I, $L18 .align 4 #else srl N, 3, I ble I, $L15 LD a0, 0 * SIZE(X) LD a1, 1 * SIZE(X) LD b0, 0 * SIZE(Y) LD b1, 1 * SIZE(Y) LD a2, 2 * SIZE(X) LD a3, 3 * SIZE(X) LD b2, 2 * SIZE(Y) LD b3, 3 * SIZE(Y) LD a4, 4 * SIZE(X) LD a5, 5 * SIZE(X) LD b4, 4 * SIZE(Y) LD b5, 5 * SIZE(Y) LD a6, 6 * SIZE(X) LD a7, 7 * SIZE(X) addq X, 8 * SIZE, X subq I, 1, I addq Y, 8 * SIZE, Y ble I, $L13 .align 4 $L12: ldl $31, PREFETCHSIZE * SIZE(X) subq I, 1, I ldl $31, PREFETCHSIZE * SIZE(Y) addq X, 8 * SIZE, X ADD s0, t0, s0 LD b6, -2 * SIZE(Y) MUL a0, b0, t0 LD b7, -1 * SIZE(Y) ADD s1, t1, s1 LD a0, -8 * SIZE(X) MUL a1, b1, t1 LD a1, -7 * SIZE(X) ADD s2, t2, s2 LD b0, 0 * SIZE(Y) MUL a2, b2, t2 LD b1, 1 * SIZE(Y) ADD s3, t3, s3 LD a2, -6 * SIZE(X) MUL a3, b3, t3 LD a3, -5 * SIZE(X) ADD s0, t0, s0 LD b2, 2 * SIZE(Y) MUL a4, b4, t0 LD b3, 3 * SIZE(Y) ADD s1, t1, s1 LD a4, -4 * SIZE(X) MUL a5, b5, t1 LD a5, -3 * SIZE(X) ADD s2, t2, s2 LD b4, 4 * SIZE(Y) MUL a6, b6, t2 LD b5, 5 * SIZE(Y) ADD s3, t3, s3 LD a6, -2 * SIZE(X) MUL a7, b7, t3 LD a7, -1 * SIZE(X) addq Y, 8 * SIZE, Y bgt I, $L12 nop fnop .align 4 $L13: ADD s0, t0, s0 LD b6, -2 * SIZE(Y) MUL a0, b0, t0 LD b7, -1 * SIZE(Y) ADD s1, t1, s1 MUL a1, b1, t1 ADD s2, t2, s2 MUL a2, b2, t2 ADD s3, t3, s3 MUL a3, b3, t3 ADD s0, t0, s0 MUL a4, b4, t0 ADD s1, t1, s1 MUL a5, b5, t1 ADD s2, t2, s2 MUL a6, b6, t2 ADD s3, t3, s3 MUL a7, b7, t3 .align 4 $L15: ADD s0, t0, s0 and N, 7, I ADD s1, t1, s1 ble I, $L18 .align 4 #endif $L16: LD a0, 0 * SIZE(X) addq X, SIZE, X LD b0, 0 * SIZE(Y) addq Y, SIZE, Y ADD s2, t2, s2 MUL a0, b0, t2 subq I, 1, I bgt I, $L16 .align 4 $L18: ADD s2, t2, s2 ADD s3, t3, s3 br $L999 .align 4 $L20: srl N, 2, I ble I, $L25 LD a0, 0 * SIZE(X) SXADDQ INCX, X, X LD b0, 0 * SIZE(Y) SXADDQ INCY, Y, Y LD a1, 0 * SIZE(X) SXADDQ INCX, X, X LD b1, 0 * SIZE(Y) SXADDQ INCY, Y, Y LD a2, 0 * SIZE(X) SXADDQ INCX, X, X LD b2, 0 * SIZE(Y) SXADDQ INCY, Y, Y LD a3, 0 * SIZE(X) SXADDQ INCX, X, X LD b3, 0 * SIZE(Y) subq I, 1, I SXADDQ INCY, Y, Y ble I, $L23 .align 4 $L22: ADD s0, t0, s0 MUL a0, b0, t0 ADD s1, t1, s1 MUL a1, b1, t1 ADD s2, t2, s2 MUL a2, b2, t2 ADD s3, t3, s3 MUL a3, b3, t3 LD a0, 0 * SIZE(X) SXADDQ INCX, X, X LD b0, 0 * SIZE(Y) SXADDQ INCY, Y, Y LD a1, 0 * SIZE(X) SXADDQ INCX, X, X LD b1, 0 * SIZE(Y) SXADDQ INCY, Y, Y LD a2, 0 * SIZE(X) SXADDQ INCX, X, X LD b2, 0 * SIZE(Y) SXADDQ INCY, Y, Y LD a3, 0 * SIZE(X) SXADDQ INCX, X, X LD b3, 0 * SIZE(Y) SXADDQ INCY, Y, Y subq I, 1, I bgt I, $L22 nop fnop .align 4 $L23: ADD s0, t0, s0 MUL a0, b0, t0 ADD s1, t1, s1 MUL a1, b1, t1 ADD s2, t2, s2 MUL a2, b2, t2 ADD s3, t3, s3 MUL a3, b3, t3 .align 4 $L25: ADD s0, t0, s0 and N, 3, I ADD s1, t1, s1 ble I, $L28 .align 4 $L26: LD a0, 0 * SIZE(X) SXADDQ INCX, X, X LD b0, 0 * SIZE(Y) SXADDQ INCY, Y, Y ADD s2, t2, s2 MUL a0, b0, t2 subq I, 1, I bgt I, $L26 .align 4 
$L28: ADD s2, t2, s2 ADD s3, t3, s3 .align 4 $L999: ADD s2, s3, s2 ldt $f2, 0($sp) ADD s0, s1, s0 lda $sp, 16($sp) ADD s0, s2, s0 ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/gemm_beta.S000066400000000000000000000112221313527062700175530ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
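dot.S above is the real dot product. Its unit-stride path is unrolled eight-fold (sixteen-fold in the single-precision build) and, more importantly for the result, it maintains four independent partial sums s0..s3 that are combined only at the very end, so the reduction order differs from a naive left-to-right loop. The sketch below mirrors that accumulation pattern; the names and the use of double are illustrative assumptions.

/* Sketch of dot.S: sum of x[i]*y[i] with four partial sums, combined at the
 * end as (s0+s1)+(s2+s3), matching the $L999 epilogue.  Illustrative only. */
#include <stddef.h>

static double dot_sketch(size_t n, const double *x, ptrdiff_t incx,
                         const double *y, ptrdiff_t incy)
{
    double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
    size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        s0 += x[0 * incx] * y[0 * incy];
        s1 += x[1 * incx] * y[1 * incy];
        s2 += x[2 * incx] * y[2 * incy];
        s3 += x[3 * incx] * y[3 * incy];
        x += 4 * incx;
        y += 4 * incy;
    }
    for (; i < n; i++, x += incx, y += incy)
        s2 += x[0] * y[0];              /* remainder loop, as in $L16/$L26 */
    return (s0 + s1) + (s2 + s3);
}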
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" .set noat .set noreorder .text .align 5 .globl CNAME .ent CNAME CNAME: .frame $sp, 0, $26, 0 #ifdef PROFILE ldgp $gp, 0($27) lda $28, _mcount jsr $28, ($28), _mcount #endif ldq $18, 16($sp) ble $16, $End ldl $19, 24($sp) ble $17, $End #ifndef PROFILE .prologue 0 #else .prologue 1 #endif fbeq $f19, $BETA_EQ_ZERO # if (beta == ZERO) .align 4 $BETA_NE_ZERO: sra $16, 3, $2 # i = (m >> 3) mov $18, $1 # c_offset = c lda $17, -1($17) # j -- ble $2,$L52 .align 4 $L51: lds $f31, 64($1) lda $2, -1($2) LD $f14, 0*SIZE($1) LD $f15, 1*SIZE($1) LD $f16, 2*SIZE($1) LD $f17, 3*SIZE($1) LD $f18, 4*SIZE($1) LD $f11, 5*SIZE($1) LD $f21, 6*SIZE($1) LD $f22, 7*SIZE($1) MUL $f19, $f14, $f23 MUL $f19, $f15, $f24 MUL $f19, $f16, $f25 MUL $f19, $f17, $f26 MUL $f19, $f18, $f27 MUL $f19, $f11, $f28 MUL $f19, $f21, $f29 MUL $f19, $f22, $f30 ST $f23, 0*SIZE($1) ST $f24, 1*SIZE($1) ST $f25, 2*SIZE($1) ST $f26, 3*SIZE($1) ST $f27, 4*SIZE($1) ST $f28, 5*SIZE($1) ST $f29, 6*SIZE($1) ST $f30, 7*SIZE($1) lda $1,8*SIZE($1) bgt $2,$L51 .align 4 $L52: and $16, 7, $2 ble $2,$L54 .align 4 $L53: LD $f12, 0($1) lda $2, -1($2) MUL $f19, $f12, $f23 ST $f23, 0($1) lda $1, SIZE($1) bgt $2,$L53 .align 4 $L54: SXADDQ $19, $18, $18 # c += ldc bgt $17,$BETA_NE_ZERO clr $0 ret .align 4 $BETA_EQ_ZERO: sra $16, 3, $2 # i = (m >> 3) lda $4, 8*SIZE($18) mov $18, $1 # c_offset = c lda $17, -1($17) # j -- ble $2,$L42 .align 4 $L41: ST $f31, 0*SIZE($1) ST $f31, 1*SIZE($1) ST $f31, 2*SIZE($1) ST $f31, 3*SIZE($1) ST $f31, 4*SIZE($1) ST $f31, 5*SIZE($1) ST $f31, 6*SIZE($1) ST $f31, 7*SIZE($1) lda $2, -1($2) lda $4, 8*SIZE($4) lda $1, 8*SIZE($1) bgt $2,$L41 .align 4 $L42: and $16, 7, $2 ble $2,$L44 .align 4 $L43: lda $2, -1($2) ST $f31, 0($1) lda $1, SIZE($1) bgt $2, $L43 .align 4 $L44: SXADDQ $19, $18, $18 # c += ldc bgt $17,$BETA_EQ_ZERO clr $0 .align 4 $End: ret .ident VERSION .end CNAME OpenBLAS-0.2.20/kernel/alpha/gemm_kernel_4x4.S000066400000000000000000001212151313527062700206230ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
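gemm_beta.S above is the C-scaling pass used by the GEMM driver: it walks the m-by-n column-major matrix C with leading dimension ldc (both C and ldc arrive on the stack in this assembly) and multiplies every entry by beta, eight entries per unrolled step. When beta is exactly zero it branches to a separate path that only stores zeros and never reads C, so uninitialized or non-finite values in C cannot leak into the result. A C sketch of that behaviour follows; the argument list is abbreviated relative to the real kernel interface.

/* Sketch of gemm_beta.S: scale the m-by-n column-major matrix C by beta,
 * with a store-only path when beta == 0 ($BETA_EQ_ZERO in the assembly).
 * Illustrative only; the real kernel's argument list is longer. */
#include <stddef.h>

static void gemm_beta_sketch(size_t m, size_t n, double beta,
                             double *c, size_t ldc)
{
    for (size_t j = 0; j < n; j++) {
        double *col = c + j * ldc;
        if (beta == 0.0) {
            for (size_t i = 0; i < m; i++) col[i] = 0.0;    /* never loads C */
        } else {
            for (size_t i = 0; i < m; i++) col[i] *= beta;
        }
    }
}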
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #if !defined(EV4) && !defined(EV5) && !defined(EV6) #error "Architecture is not specified." #endif #ifdef EV6 #define PREFETCHSIZE 56 #define UNOP unop #endif #ifdef EV5 #define PREFETCHSIZE 56 #define UNOP #endif #ifdef EV4 #define UNOP #endif #define STACKSIZE 80 #define M $16 #define N $17 #define K $18 #define A $20 #define B $21 #define C $22 #define LDC $23 #define C1 $19 #define C2 $24 #define C3 $25 #define C4 $27 #define AO $at #define BO $5 #define I $6 #define J $7 #define L $8 #define a1 $f16 #define a2 $f17 #define a3 $f18 #define a4 $f19 #define b1 $f20 #define b2 $f21 #define b3 $f22 #define b4 $f23 #define t1 $f24 #define t2 $f25 #define t3 $f26 #define t4 $f27 #define a5 $f28 #define a6 $f30 #define b5 $f29 #define alpha $f30 #define c01 $f0 #define c02 $f1 #define c03 $f2 #define c04 $f3 #define c05 $f4 #define c06 $f5 #define c07 $f6 #define c08 $f7 #define c09 $f8 #define c10 $f9 #define c11 $f10 #define c12 $f11 #define c13 $f12 #define c14 $f13 #define c15 $f14 #define c16 $f15 #define TMP1 $0 #define TMP2 $1 #define KK $2 #define BB $3 #define OFFSET $4 #define ALPHA 64($sp) PROLOGUE PROFCODE .frame $sp, STACKSIZE, $26, 0 lda $sp, -STACKSIZE($sp) ldq C, 0 + STACKSIZE($sp) ldq LDC, 8 + STACKSIZE($sp) #ifdef TRMMKERNEL ldq OFFSET, 16 + STACKSIZE($sp) #endif SXADDQ LDC, 0, LDC stt $f2, 0($sp) stt $f3, 8($sp) stt $f4, 16($sp) stt $f5, 24($sp) stt $f6, 32($sp) stt $f7, 40($sp) stt $f8, 48($sp) stt $f9, 56($sp) stt $f19, ALPHA cmple M, 0, $0 cmple N, 0, $1 cmple K, 0, $2 or $0, $1, $0 or $0, $2, $0 bne $0, $L999 #if defined(TRMMKERNEL) && !defined(LEFT) subq $31, OFFSET, KK #endif sra N, 2, J ble J, $L40 .align 4 $L01: mov C, C1 addq C, LDC, C2 mov A, AO s4addq K, 0, BB #if defined(TRMMKERNEL) && defined(LEFT) mov OFFSET, KK #endif addq C2, LDC, C3 s4addq LDC, C, C SXADDQ BB, B, BB fclr t1 addq C3, LDC, C4 fclr t2 sra M, 2, I fclr t3 fclr t4 ble I, $L20 .align 4 $L11: #if defined(EV5) || defined(EV6) ldl $31, 0 * SIZE(BB) ldl $31, 8 * SIZE(BB) unop lda BB, 16 * SIZE(BB) #endif #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #ifdef TRMMKERNEL #ifdef LEFT addq KK, 4, TMP1 #else addq KK, 4, TMP1 #endif #endif LD a1, 0 * SIZE(AO) fclr c11 LD a2, 1 * SIZE(AO) fclr c12 LD a3, 2 * SIZE(AO) fclr c16 LD a4, 3 * SIZE(AO) fclr c15 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c02 LD b3, 2 * SIZE(B) fclr c06 LD b4, 3 * SIZE(B) fclr c05 lds $f31, 4 * SIZE(C1) fclr c03 #ifndef TRMMKERNEL lda L, -2(K) #else lda L, -2(TMP1) 
#endif fclr c04 lds $f31, 7 * SIZE(C2) fclr c08 lda BO, 4 * SIZE(B) fclr c13 lds $f31, 4 * SIZE(C3) fclr c09 lda AO, 4 * SIZE(AO) fclr c10 #else sll KK, BASE_SHIFT + 2, TMP1 addq AO, TMP1, AO addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c11 LD a2, 1 * SIZE(AO) fclr c12 LD a3, 2 * SIZE(AO) fclr c16 LD a4, 3 * SIZE(AO) fclr c15 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c02 LD b3, 2 * SIZE(BO) fclr c06 LD b4, 3 * SIZE(BO) fclr c05 lds $f31, 4 * SIZE(C1) fclr c03 lda L, -2(TMP1) fclr c04 lds $f31, 7 * SIZE(C2) fclr c08 lda BO, 4 * SIZE(BO) fclr c13 lds $f31, 4 * SIZE(C3) fclr c09 lda AO, 4 * SIZE(AO) fclr c10 #endif lds $f31, 7 * SIZE(C4) fclr c14 fclr c07 ble L, $L15 .align 5 $L12: /* 1 */ ADD c11, t1, c11 #ifndef EV4 ldq $31, PREFETCHSIZE * SIZE(AO) #else unop #endif MUL b1, a1, t1 #ifndef EV4 ldl $31, PREFETCHSIZE * SIZE(BO) #else unop #endif ADD c12, t2, c12 unop MUL b1, a2, t2 unop ADD c16, t3, c16 unop MUL b2, a2, t3 LD a5, 0 * SIZE(AO) ADD c15, t4, c15 unop MUL b2, a1, t4 LD b5, 0 * SIZE(BO) /* 2 */ ADD c01, t1, c01 UNOP MUL b1, a3, t1 UNOP ADD c02, t2, c02 UNOP MUL b1, a4, t2 UNOP ADD c06, t3, c06 unop MUL b2, a4, t3 unop ADD c05, t4, c05 unop MUL b4, a1, t4 unop /* 3 */ ADD c03, t1, c03 unop MUL b3, a1, t1 unop ADD c04, t2, c04 unop MUL b3, a2, t2 unop ADD c08, t3, c08 unop MUL b4, a2, t3 LD a2, 1 * SIZE(AO) ADD c13, t4, c13 unop MUL b2, a3, t4 LD b2, 1 * SIZE(BO) /* 4 */ ADD c09, t1, c09 unop MUL b3, a3, t1 LD a6, 2 * SIZE(AO) ADD c10, t2, c10 unop MUL b3, a4, t2 LD b3, 2 * SIZE(BO) ADD c14, t3, c14 unop MUL b4, a4, t3 LD a4, 3 * SIZE(AO) ADD c07, t4, c07 unop MUL b4, a3, t4 LD b4, 3 * SIZE(BO) /* 5 */ ADD c11, t1, c11 unop MUL b5, a5, t1 LD a1, 4 * SIZE(AO) ADD c12, t2, c12 lda L, -2(L) MUL b5, a2, t2 LD b1, 4 * SIZE(BO) ADD c16, t3, c16 unop MUL b2, a2, t3 unop ADD c15, t4, c15 unop MUL b2, a5, t4 unop /* 6 */ ADD c01, t1, c01 unop MUL b5, a6, t1 unop ADD c02, t2, c02 unop MUL b5, a4, t2 unop ADD c06, t3, c06 unop MUL b2, a4, t3 unop ADD c05, t4, c05 unop MUL b4, a5, t4 unop /* 7 */ ADD c03, t1, c03 lda AO, 8 * SIZE(AO) MUL b3, a5, t1 unop ADD c04, t2, c04 lda BO, 8 * SIZE(BO) MUL b3, a2, t2 unop ADD c08, t3, c08 unop MUL b4, a2, t3 LD a2, -3 * SIZE(AO) ADD c13, t4, c13 unop MUL b2, a6, t4 LD b2, -3 * SIZE(BO) /* 8 */ ADD c09, t1, c09 unop MUL b3, a6, t1 LD a3, -2 * SIZE(AO) ADD c10, t2, c10 unop MUL b3, a4, t2 LD b3, -2 * SIZE(BO) ADD c14, t3, c14 unop MUL b4, a4, t3 LD a4, -1 * SIZE(AO) ADD c07, t4, c07 MUL b4, a6, t4 LD b4, -1 * SIZE(BO) bgt L, $L12 .align 4 $L15: ADD c11, t1, c11 ldt alpha, ALPHA MUL b1, a1, t1 #ifndef TRMMKERNEL blbs K, $L18 #else blbs TMP1, $L18 #endif .align 4 ADD c12, t2, c12 MUL b1, a2, t2 ADD c16, t3, c16 MUL b2, a2, t3 ADD c15, t4, c15 MUL b2, a1, t4 ADD c01, t1, c01 MUL b1, a3, t1 ADD c02, t2, c02 unop MUL b1, a4, t2 LD b1, 0 * SIZE(BO) ADD c06, t3, c06 MUL b2, a4, t3 ADD c05, t4, c05 MUL b4, a1, t4 ADD c03, t1, c03 unop MUL b3, a1, t1 LD a1, 0 * SIZE(AO) ADD c04, t2, c04 unop MUL b3, a2, t2 unop ADD c08, t3, c08 unop MUL b4, a2, t3 LD a2, 1 * SIZE(AO) ADD c13, t4, c13 unop MUL b2, a3, t4 LD b2, 1 * SIZE(BO) ADD c09, t1, c09 unop MUL b3, a3, t1 lda AO, 4 * SIZE(AO) ADD c10, t2, c10 unop MUL b3, a4, t2 LD b3, 2 * SIZE(BO) ADD c14, t3, c14 unop MUL b4, a4, t3 LD a4, -1 * SIZE(AO) ADD c07, t4, c07 unop MUL b4, a3, t4 LD a3, -2 * SIZE(AO) ADD c11, t1, c11 LD b4, 3 * SIZE(BO) MUL b1, a1, t1 lda BO, 4 * SIZE(BO) .align 4 $L18: ADD c12, t2, c12 unop MUL b1, a2, t2 #ifndef TRMMKERNEL LD a5, 0 * SIZE(C1) #else unop #endif ADD c16, t3, 
c16 unop MUL b2, a2, t3 unop ADD c15, t4, c15 unop MUL b2, a1, t4 #ifndef TRMMKERNEL LD b5, 1 * SIZE(C1) #else unop #endif ADD c01, t1, c01 unop MUL b1, a3, t1 unop ADD c02, t2, c02 unop MUL b1, a4, t2 #ifndef TRMMKERNEL LD b1, 0 * SIZE(C2) #else unop #endif ADD c06, t3, c06 unop MUL b2, a4, t3 unop ADD c05, t4, c05 unop MUL b4, a1, t4 unop ADD c03, t1, c03 unop MUL b3, a1, t1 unop ADD c04, t2, c04 unop MUL b3, a2, t2 #ifndef TRMMKERNEL LD a1, 0 * SIZE(C3) #else unop #endif ADD c08, t3, c08 unop MUL b4, a2, t3 #ifndef TRMMKERNEL LD a2, 2 * SIZE(C1) #else unop #endif ADD c13, t4, c13 unop MUL b2, a3, t4 #ifndef TRMMKERNEL LD b2, 3 * SIZE(C1) #else unop #endif ADD c09, t1, c09 lda I, -1(I) MUL b3, a3, t1 unop ADD c10, t2, c10 unop MUL b3, a4, t2 #ifndef TRMMKERNEL LD b3, 0 * SIZE(C4) #else unop #endif ADD c14, t3, c14 unop MUL b4, a4, t3 #ifndef TRMMKERNEL LD a4, 1 * SIZE(C2) #else unop #endif ADD c07, t4, c07 unop MUL b4, a3, t4 #ifndef TRMMKERNEL LD a3, 2 * SIZE(C2) #else unop #endif ADD c11, t1, c11 unop MUL alpha, c01, c01 #ifndef TRMMKERNEL LD b4, 3 * SIZE(C2) #else unop #endif ADD c12, t2, c12 unop MUL alpha, c02, c02 #ifndef TRMMKERNEL LD t1, 1 * SIZE(C3) #else unop #endif ADD c16, t3, c16 unop MUL alpha, c03, c03 #ifndef TRMMKERNEL LD t2, 2 * SIZE(C3) #else unop #endif ADD c15, t4, c15 unop MUL alpha, c04, c04 #ifndef TRMMKERNEL LD t3, 3 * SIZE(C3) #else unop #endif MUL alpha, c05, c05 unop #ifndef TRMMKERNEL ADD c01, a5, c01 LD t4, 1 * SIZE(C4) #else unop unop #endif MUL alpha, c06, c06 #ifndef TRMMKERNEL unop ADD c02, b5, c02 LD a5, 2 * SIZE(C4) #endif MUL alpha, c07, c07 #ifndef TRMMKERNEL unop ADD c03, a2, c03 LD b5, 3 * SIZE(C4) #endif MUL alpha, c08, c08 #ifndef TRMMKERNEL unop ADD c04, b2, c04 unop #endif MUL alpha, c09, c09 ST c01, 0 * SIZE(C1) #ifndef TRMMKERNEL ADD c05, b1, c05 unop #endif MUL alpha, c10, c10 ST c02, 1 * SIZE(C1) #ifndef TRMMKERNEL ADD c06, a4, c06 unop #endif MUL alpha, c11, c11 ST c03, 2 * SIZE(C1) #ifndef TRMMKERNEL ADD c07, a3, c07 unop #endif MUL alpha, c12, c12 ST c04, 3 * SIZE(C1) #ifndef TRMMKERNEL ADD c08, b4, c08 #else unop #endif lda C1, 4 * SIZE(C1) MUL alpha, c13, c13 ST c05, 0 * SIZE(C2) #ifndef TRMMKERNEL ADD c09, a1, c09 unop #endif MUL alpha, c14, c14 ST c06, 1 * SIZE(C2) #ifndef TRMMKERNEL ADD c10, t1, c10 unop #endif MUL alpha, c15, c15 ST c07, 2 * SIZE(C2) #ifndef TRMMKERNEL ADD c11, t2, c11 unop #endif MUL alpha, c16, c16 ST c08, 3 * SIZE(C2) #ifndef TRMMKERNEL ADD c12, t3, c12 #else unop #endif lda C2, 4 * SIZE(C2) #ifndef TRMMKERNEL ADD c13, b3, c13 #else unop #endif ST c09, 0 * SIZE(C3) fclr t1 lda C4, 4 * SIZE(C4) #ifndef TRMMKERNEL ADD c14, t4, c14 #else unop #endif ST c10, 1 * SIZE(C3) fclr t2 unop #ifndef TRMMKERNEL ADD c15, a5, c15 #else unop #endif ST c11, 2 * SIZE(C3) fclr t3 unop #ifndef TRMMKERNEL ADD c16, b5, c16 #else unop #endif ST c12, 3 * SIZE(C3) fclr t4 lda C3, 4 * SIZE(C3) ST c13, -4 * SIZE(C4) ST c14, -3 * SIZE(C4) ST c15, -2 * SIZE(C4) ST c16, -1 * SIZE(C4) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) subq K, KK, TMP1 #ifdef LEFT subq TMP1, 4, TMP1 #else subq TMP1, 4, TMP1 #endif sll TMP1, BASE_SHIFT + 2, TMP1 addq AO, TMP1, AO addq BO, TMP1, BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq KK, 4, KK #endif bgt I, $L11 .align 4 $L20: and M, 2, I ble I, $L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #ifdef TRMMKERNEL 
#ifdef LEFT addq KK, 2, TMP1 #else addq KK, 4, TMP1 #endif #endif LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c10 LD a4, 3 * SIZE(AO) fclr c14 LD b1, 0 * SIZE(B) #ifndef TRMMKERNEL lda L, -2(K) #else lda L, -2(TMP1) #endif LD b2, 1 * SIZE(B) lda AO, 2 * SIZE(AO) LD b3, 2 * SIZE(B) fclr c01 LD b4, 3 * SIZE(B) fclr c05 lda BO, 4 * SIZE(B) fclr c02 fclr c06 ble L, $L25 #else sll KK, BASE_SHIFT + 1, TMP1 addq AO, TMP1, AO sll KK, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c10 LD a4, 3 * SIZE(AO) fclr c14 LD b1, 0 * SIZE(BO) lda L, -2(TMP1) LD b2, 1 * SIZE(BO) lda AO, 2 * SIZE(AO) LD b3, 2 * SIZE(BO) fclr c01 LD b4, 3 * SIZE(BO) fclr c05 lda BO, 4 * SIZE(BO) fclr c02 fclr c06 ble L, $L25 #endif .align 4 $L22: ADD c09, t1, c09 unop MUL a1, b1, t1 unop ADD c10, t2, c10 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD c13, t3, c13 unop MUL a1, b2, t3 lda BO, 8 * SIZE(BO) ADD c14, t4, c14 unop MUL a2, b2, t4 LD b2, -7 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b3, t1 unop ADD c02, t2, c02 unop MUL a2, b3, t2 LD b3, -6 * SIZE(BO) ADD c05, t3, c05 unop MUL a1, b4, t3 LD a1, 2 * SIZE(AO) ADD c06, t4, c06 MUL a2, b4, t4 LD b5, -5 * SIZE(BO) ADD c09, t1, c09 unop MUL a3, b1, t1 LD a2, 3 * SIZE(AO) ADD c10, t2, c10 unop MUL a4, b1, t2 LD b1, -4 * SIZE(BO) ADD c13, t3, c13 unop MUL a3, b2, t3 lda AO, 4 * SIZE(AO) ADD c14, t4, c14 MUL a4, b2, t4 LD b2, -3 * SIZE(BO) ADD c01, t1, c01 lda L, -2(L) MUL a3, b3, t1 LD b4, -1 * SIZE(BO) ADD c02, t2, c02 unop MUL a4, b3, t2 LD b3, -2 * SIZE(BO) ADD c05, t3, c05 unop MUL a3, b5, t3 LD a3, 0 * SIZE(AO) ADD c06, t4, c06 MUL a4, b5, t4 LD a4, 1 * SIZE(AO) bgt L, $L22 .align 4 $L25: ADD c09, t1, c09 ldt alpha, ALPHA MUL a1, b1, t1 #ifndef TRMMKERNEL blbs K, $L28 #else blbs TMP1, $L28 #endif ADD c10, t2, c10 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD c13, t3, c13 unop MUL a1, b2, t3 unop ADD c14, t4, c14 unop MUL a2, b2, t4 LD b2, 1 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b3, t1 lda AO, 2 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b3, t2 LD b3, 2 * SIZE(BO) ADD c05, t3, c05 unop MUL a1, b4, t3 LD a1, -2 * SIZE(AO) ADD c06, t4, c06 unop MUL a2, b4, t4 LD a2, -1 * SIZE(AO) ADD c09, t1, c09 LD b4, 3 * SIZE(BO) MUL a1, b1, t1 lda BO, 4 * SIZE(BO) .align 4 $L28: ADD c10, t2, c10 unop MUL a2, b1, t2 #ifndef TRMMKERNEL LD a3, 0 * SIZE(C1) #else unop #endif ADD c13, t3, c13 unop MUL a1, b2, t3 #ifndef TRMMKERNEL LD a4, 1 * SIZE(C1) #else unop #endif ADD c14, t4, c14 unop MUL a2, b2, t4 #ifndef TRMMKERNEL LD a5, 0 * SIZE(C2) #else unop #endif ADD c01, t1, c01 unop MUL a1, b3, t1 #ifndef TRMMKERNEL LD b5, 1 * SIZE(C2) #else unop #endif ADD c02, t2, c02 unop MUL a2, b3, t2 #ifndef TRMMKERNEL LD b1, 0 * SIZE(C3) #else unop #endif ADD c05, t3, c05 unop MUL a1, b4, t3 #ifndef TRMMKERNEL LD b2, 1 * SIZE(C3) #else unop #endif ADD c06, t4, c06 unop MUL a2, b4, t4 #ifndef TRMMKERNEL LD b3, 0 * SIZE(C4) #else unop #endif ADD c09, t1, c09 unop MUL alpha, c01, c01 #ifndef TRMMKERNEL LD b4, 1 * SIZE(C4) #else unop #endif ADD c10, t2, c10 unop MUL alpha, c02, c02 unop ADD c13, t3, c13 MUL alpha, c05, c05 ADD c14, t4, c14 MUL alpha, c06, c06 MUL alpha, c09, c09 #ifndef TRMMKERNEL ADD c01, a3, c01 #endif MUL alpha, c10, c10 #ifndef TRMMKERNEL ADD c02, a4, c02 #endif MUL alpha, c13, c13 #ifndef TRMMKERNEL ADD c05, a5, c05 #endif MUL alpha, c14, c14 #ifndef TRMMKERNEL ADD c06, b5, c06 #endif #ifndef TRMMKERNEL ADD c09, b1, c09 unop #endif ST c01, 0 * SIZE(C1) fclr t1 
#ifndef TRMMKERNEL ADD c10, b2, c10 unop #endif ST c02, 1 * SIZE(C1) fclr t2 #ifndef TRMMKERNEL ADD c13, b3, c13 unop #endif ST c05, 0 * SIZE(C2) fclr t3 #ifndef TRMMKERNEL ADD c14, b4, c14 unop #endif ST c06, 1 * SIZE(C2) fclr t4 ST c09, 0 * SIZE(C3) lda C1, 2 * SIZE(C1) ST c10, 1 * SIZE(C3) lda C2, 2 * SIZE(C2) ST c13, 0 * SIZE(C4) lda C3, 2 * SIZE(C3) ST c14, 1 * SIZE(C4) lda C4, 2 * SIZE(C4) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) subq K, KK, TMP1 #ifdef LEFT subq TMP1, 2, TMP1 #else subq TMP1, 4, TMP1 #endif sll TMP1, BASE_SHIFT + 1, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq BO, TMP2, BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq KK, 2, KK #endif .align 4 $L30: and M, 1, I ble I, $L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #ifdef TRMMKERNEL #ifdef LEFT addq KK, 1, TMP1 #else addq KK, 4, TMP1 #endif #endif LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD b1, 0 * SIZE(B) #ifndef TRMMKERNEL lda L, -2(K) #else lda L, -2(TMP1) #endif LD b2, 1 * SIZE(B) lda AO, 1 * SIZE(AO) LD b3, 2 * SIZE(B) fclr c09 LD b4, 3 * SIZE(B) fclr c13 lda BO, 4 * SIZE(B) ble L, $L35 #else sll KK, BASE_SHIFT + 0, TMP1 addq AO, TMP1, AO sll KK, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD b1, 0 * SIZE(BO) lda L, -2(TMP1) LD b2, 1 * SIZE(BO) lda AO, 1 * SIZE(AO) LD b3, 2 * SIZE(BO) fclr c09 LD b4, 3 * SIZE(BO) fclr c13 lda BO, 4 * SIZE(BO) ble L, $L35 #endif .align 4 $L32: ADD c01, t1, c01 lda L, -2(L) MUL a1, b1, t1 LD b1, 0 * SIZE(BO) ADD c05, t2, c05 lda AO, 2 * SIZE(AO) MUL a1, b2, t2 LD b2, 1 * SIZE(BO) ADD c09, t3, c09 LD b5, 3 * SIZE(BO) MUL a1, b3, t3 LD b3, 2 * SIZE(BO) ADD c13, t4, c13 MUL a1, b4, t4 LD a1, -1 * SIZE(AO) ADD c01, t1, c01 MUL a2, b1, t1 LD b1, 4 * SIZE(BO) lda BO, 8 * SIZE(BO) ADD c05, t2, c05 MUL a2, b2, t2 LD b2, -3 * SIZE(BO) ADD c09, t3, c09 LD b4, -1 * SIZE(BO) MUL a2, b3, t3 LD b3, -2 * SIZE(BO) ADD c13, t4, c13 MUL a2, b5, t4 LD a2, 0 * SIZE(AO) bgt L, $L32 .align 4 $L35: ADD c01, t1, c01 ldt alpha, ALPHA MUL a1, b1, t1 #ifndef TRMMKERNEL blbs K, $L38 #else blbs TMP1, $L38 #endif .align 4 ADD c05, t2, c05 LD b1, 0 * SIZE(BO) MUL a1, b2, t2 LD b2, 1 * SIZE(BO) ADD c09, t3, c09 MUL a1, b3, t3 LD b3, 2 * SIZE(BO) ADD c13, t4, c13 MUL a1, b4, t4 LD a1, 0 * SIZE(AO) lda AO, 1 * SIZE(AO) ADD c01, t1, c01 LD b4, 3 * SIZE(BO) MUL a1, b1, t1 lda BO, 4 * SIZE(BO) .align 4 $L38: ADD c05, t2, c05 unop MUL a1, b2, t2 #ifndef TRMMKERNEL LD a5, 0 * SIZE(C1) #else unop #endif ADD c09, t3, c09 unop MUL a1, b3, t3 #ifndef TRMMKERNEL LD b5, 0 * SIZE(C2) #else unop #endif ADD c13, t4, c13 unop MUL a1, b4, t4 #ifndef TRMMKERNEL LD a2, 0 * SIZE(C3) #else unop #endif ADD c01, t1, c01 unop MUL alpha, c01, c01 #ifndef TRMMKERNEL LD a3, 0 * SIZE(C4) #else unop #endif ADD c05, t2, c05 unop MUL alpha, c05, c05 unop ADD c09, t3, c09 MUL alpha, c09, c09 ADD c13, t4, c13 MUL alpha, c13, c13 #ifndef TRMMKERNEL ADD c01, a5, c01 ADD c05, b5, c05 ADD c09, a2, c09 ADD c13, a3, c13 #endif ST c01, 0 * SIZE(C1) ST c05, 0 * SIZE(C2) ST c09, 0 * SIZE(C3) ST c13, 0 * SIZE(C4) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) subq K, KK, TMP1 #ifdef LEFT subq TMP1, 1, TMP1 #else subq TMP1, 4, TMP1 #endif sll TMP1, BASE_SHIFT + 0, TMP2 addq AO, 
TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq BO, TMP2, BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq KK, 1, KK #endif .align 4 $L39: mov BO, B lda J, -1(J) #if defined(TRMMKERNEL) && !defined(LEFT) addq KK, 4, KK #else unop #endif bgt J, $L01 .align 4 $L40: and N, 2, J ble J, $L80 mov C, C1 addq C, LDC, C2 mov A, AO fclr t1 addq C2, LDC, C fclr t2 #if defined(TRMMKERNEL) && defined(LEFT) mov OFFSET, KK #endif sra M, 2, I fclr t3 fclr t4 ble I, $L60 .align 4 $L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #ifdef TRMMKERNEL #ifdef LEFT addq KK, 4, TMP1 #else addq KK, 2, TMP1 #endif #endif LD a1, 0 * SIZE(AO) fclr c03 LD a2, 1 * SIZE(AO) fclr c07 LD a3, 2 * SIZE(AO) fclr c04 LD a4, 3 * SIZE(AO) fclr c08 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c05 LD b3, 2 * SIZE(B) fclr c02 LD b4, 3 * SIZE(B) fclr c06 #ifndef TRMMKERNEL lda L, -2(K) #else lda L, -2(TMP1) #endif lda BO, 2 * SIZE(B) lda AO, 4 * SIZE(AO) ble L, $L55 #else sll KK, BASE_SHIFT + 2, TMP1 addq AO, TMP1, AO sll KK, BASE_SHIFT + 1, TMP2 addq B, TMP2, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c03 LD a2, 1 * SIZE(AO) fclr c07 LD a3, 2 * SIZE(AO) fclr c04 LD a4, 3 * SIZE(AO) fclr c08 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c05 LD b3, 2 * SIZE(BO) fclr c02 LD b4, 3 * SIZE(BO) fclr c06 lda L, -2(TMP1) lda BO, 2 * SIZE(BO) lda AO, 4 * SIZE(AO) ble L, $L55 #endif .align 4 $L52: ADD c05, t1, c05 unop MUL a1, b1, t1 unop ADD c06, t2, c06 lda L, -2(L) MUL a2, b1, t2 unop ADD c07, t3, c07 unop MUL a3, b1, t3 unop ADD c08, t4, c08 unop MUL a4, b1, t4 LD b1, 2 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b2, t1 LD a1, 0 * SIZE(AO) ADD c02, t2, c02 lda BO, 4 * SIZE(BO) MUL a2, b2, t2 LD a2, 1 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b2, t3 LD a3, 2 * SIZE(AO) ADD c04, t4, c04 unop MUL a4, b2, t4 LD a5, 3 * SIZE(AO) ADD c05, t1, c05 unop MUL a1, b3, t1 LD b2, -1 * SIZE(BO) ADD c06, t2, c06 unop MUL a2, b3, t2 unop ADD c07, t3, c07 unop MUL a3, b3, t3 lda AO, 8 * SIZE(AO) ADD c08, t4, c08 unop MUL a5, b3, t4 LD b3, 0 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b4, t1 LD a1, -4 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b4, t2 LD a2, -3 * SIZE(AO) ADD c03, t3, c03 LD a4, -1 * SIZE(AO) MUL a3, b4, t3 LD a3, -2 * SIZE(AO) ADD c04, t4, c04 MUL a5, b4, t4 LD b4, 1 * SIZE(BO) bgt L, $L52 .align 4 $L55: ADD c05, t1, c05 ldt alpha, ALPHA MUL a1, b1, t1 #ifndef TRMMKERNEL blbs K, $L58 #else blbs TMP1, $L58 #endif .align 4 ADD c06, t2, c06 MUL a2, b1, t2 ADD c07, t3, c07 MUL a3, b1, t3 ADD c08, t4, c08 unop MUL a4, b1, t4 LD b1, 0 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b2, t1 LD a1, 0 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b2, t2 LD a2, 1 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b2, t3 LD a3, 2 * SIZE(AO) ADD c04, t4, c04 MUL a4, b2, t4 LD a4, 3 * SIZE(AO) lda AO, 4 * SIZE(AO) ADD c05, t1, c05 LD b2, 1 * SIZE(BO) MUL a1, b1, t1 lda BO, 2 * SIZE(BO) .align 4 $L58: ADD c06, t2, c06 unop MUL a2, b1, t2 #ifndef TRMMKERNEL LD c09, 0 * SIZE(C1) #else unop #endif ADD c07, t3, c07 unop MUL a3, b1, t3 #ifndef TRMMKERNEL LD c10, 1 * SIZE(C1) #else unop #endif ADD c08, t4, c08 unop MUL a4, b1, t4 #ifndef TRMMKERNEL LD c11, 2 * SIZE(C1) #else unop #endif ADD c01, t1, c01 unop MUL a1, b2, t1 #ifndef TRMMKERNEL LD c12, 3 * SIZE(C1) #else unop #endif ADD c02, t2, c02 unop MUL a2, b2, t2 #ifndef TRMMKERNEL LD c13, 0 * SIZE(C2) unop #endif ADD c03, t3, c03 unop MUL a3, b2, t3 #ifndef TRMMKERNEL LD c14, 1 * SIZE(C2) #else unop #endif 
ADD c04, t4, c04 unop MUL a4, b2, t4 #ifndef TRMMKERNEL LD c15, 2 * SIZE(C2) #else unop #endif ADD c05, t1, c05 unop MUL alpha, c01, c01 #ifndef TRMMKERNEL LD c16, 3 * SIZE(C2) #else unop #endif ADD c06, t2, c06 lda I, -1(I) MUL alpha, c02, c02 unop ADD c07, t3, c07 MUL alpha, c03, c03 ADD c08, t4, c08 MUL alpha, c04, c04 MUL alpha, c05, c05 #ifndef TRMMKERNEL ADD c01, c09, c01 #endif MUL alpha, c06, c06 #ifndef TRMMKERNEL ADD c02, c10, c02 #endif MUL alpha, c07, c07 #ifndef TRMMKERNEL ADD c03, c11, c03 #endif MUL alpha, c08, c08 #ifndef TRMMKERNEL ADD c04, c12, c04 #endif #ifndef TRMMKERNEL ADD c05, c13, c05 #endif ST c01, 0 * SIZE(C1) #ifndef TRMMKERNEL ADD c06, c14, c06 #endif ST c02, 1 * SIZE(C1) #ifndef TRMMKERNEL ADD c07, c15, c07 #endif ST c03, 2 * SIZE(C1) #ifndef TRMMKERNEL ADD c08, c16, c08 #endif ST c04, 3 * SIZE(C1) ST c05, 0 * SIZE(C2) fclr t1 ST c06, 1 * SIZE(C2) fclr t2 ST c07, 2 * SIZE(C2) fclr t3 ST c08, 3 * SIZE(C2) fclr t4 lda C1, 4 * SIZE(C1) lda C2, 4 * SIZE(C2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) subq K, KK, TMP1 #ifdef LEFT subq TMP1, 4, TMP1 #else subq TMP1, 2, TMP1 #endif sll TMP1, BASE_SHIFT + 2, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq BO, TMP2, BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq KK, 4, KK #endif bgt I, $L51 .align 4 $L60: and M, 2, I ble I, $L70 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #ifdef TRMMKERNEL #ifdef LEFT addq KK, 2, TMP1 #else addq KK, 2, TMP1 #endif #endif LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(B) #ifndef TRMMKERNEL lda L, -2(K) #else lda L, -2(TMP1) #endif LD b2, 1 * SIZE(B) lda AO, 2 * SIZE(AO) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) lda BO, 2 * SIZE(B) ble L, $L65 #else sll KK, BASE_SHIFT + 1, TMP1 addq AO, TMP1, AO sll KK, BASE_SHIFT + 1, TMP2 addq B, TMP2, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(BO) lda L, -2(TMP1) LD b2, 1 * SIZE(BO) lda AO, 2 * SIZE(AO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) lda BO, 2 * SIZE(BO) ble L, $L65 #endif .align 4 $L62: ADD c01, t1, c01 unop MUL a1, b1, t1 unop ADD c02, t2, c02 lda AO, 4 * SIZE(AO) MUL a2, b1, t2 LD b1, 2 * SIZE(BO) ADD c05, t3, c05 lda L, -2(L) MUL a1, b2, t3 LD a1, -2 * SIZE(AO) ADD c06, t4, c06 unop MUL a2, b2, t4 LD a2, -1 * SIZE(AO) ADD c01, t1, c01 LD b2, 3 * SIZE(BO) MUL a3, b3, t1 lda BO, 4 * SIZE(BO) ADD c02, t2, c02 unop MUL a4, b3, t2 LD b3, 0 * SIZE(BO) ADD c05, t3, c05 unop MUL a3, b4, t3 LD a3, 0 * SIZE(AO) ADD c06, t4, c06 MUL a4, b4, t4 LD b4, 1 * SIZE(BO) unop LD a4, 1 * SIZE(AO) unop unop bgt L, $L62 .align 4 $L65: ADD c01, t1, c01 ldt alpha, ALPHA MUL a1, b1, t1 #ifndef TRMMKERNEL blbs K, $L68 #else blbs TMP1, $L68 #endif .align 4 ADD c02, t2, c02 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD c05, t3, c05 lda BO, 2 * SIZE(BO) MUL a1, b2, t3 LD a1, 0 * SIZE(AO) ADD c06, t4, c06 unop MUL a2, b2, t4 LD a2, 1 * SIZE(AO) ADD c01, t1, c01 LD b2, -1 * SIZE(BO) MUL a1, b1, t1 lda AO, 2 * SIZE(AO) .align 4 $L68: ADD c02, t2, c02 unop MUL a2, b1, t2 #ifndef TRMMKERNEL LD c09, 0 * SIZE(C1) #else unop #endif ADD c05, t3, c05 unop MUL a1, b2, t3 #ifndef TRMMKERNEL LD c10, 1 * SIZE(C1) #else unop #endif ADD c06, t4, c06 unop MUL a2, b2, t4 #ifndef TRMMKERNEL LD 
c11, 0 * SIZE(C2) #else unop #endif ADD c01, t1, c01 unop MUL alpha, c01, c01 #ifndef TRMMKERNEL LD c12, 1 * SIZE(C2) #else unop #endif ADD c02, t2, c02 lda C1, 2 * SIZE(C1) MUL alpha, c02, c02 lda C2, 2 * SIZE(C2) ADD c05, t3, c05 MUL alpha, c05, c05 ADD c06, t4, c06 MUL alpha, c06, c06 #ifndef TRMMKERNEL ADD c01, c09, c01 ADD c02, c10, c02 ADD c05, c11, c05 ADD c06, c12, c06 #endif ST c01, -2 * SIZE(C1) fclr t1 ST c02, -1 * SIZE(C1) fclr t2 ST c05, -2 * SIZE(C2) fclr t3 ST c06, -1 * SIZE(C2) fclr t4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) subq K, KK, TMP1 #ifdef LEFT subq TMP1, 2, TMP1 #else subq TMP1, 2, TMP1 #endif sll TMP1, BASE_SHIFT + 1, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq BO, TMP2, BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq KK, 2, KK #endif .align 4 $L70: and M, 1, I ble I, $L79 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #ifdef TRMMKERNEL #ifdef LEFT addq KK, 1, TMP1 #else addq KK, 2, TMP1 #endif #endif LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD b1, 0 * SIZE(B) fclr c02 LD b2, 1 * SIZE(B) fclr c06 #ifndef TRMMKERNEL lda L, -2(K) #else lda L, -2(TMP1) #endif LD b3, 2 * SIZE(B) lda AO, 1 * SIZE(AO) LD b4, 3 * SIZE(B) lda BO, 2 * SIZE(B) ble L, $L75 #else sll KK, BASE_SHIFT + 0, TMP1 addq AO, TMP1, AO sll KK, BASE_SHIFT + 1, TMP2 addq B, TMP2, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD b1, 0 * SIZE(BO) fclr c02 LD b2, 1 * SIZE(BO) fclr c06 #ifndef TRMMKERNEL lda L, -2(K) #else lda L, -2(TMP1) #endif LD b3, 2 * SIZE(BO) lda AO, 1 * SIZE(AO) LD b4, 3 * SIZE(BO) lda BO, 2 * SIZE(BO) ble L, $L75 #endif .align 4 $L72: ADD c01, t1, c01 lda L, -2(L) MUL a1, b1, t1 LD b1, 2 * SIZE(BO) ADD c05, t2, c05 MUL a1, b2, t2 LD a1, 1 * SIZE(AO) LD b2, 3 * SIZE(BO) ADD c02, t3, c02 lda AO, 2 * SIZE(AO) MUL a2, b3, t3 LD b3, 4 * SIZE(BO) ADD c06, t4, c06 MUL a2, b4, t4 LD a2, 0 * SIZE(AO) LD b4, 5 * SIZE(BO) lda BO, 4 * SIZE(BO) unop unop bgt L, $L72 .align 4 $L75: ADD c01, t1, c01 ldt alpha, ALPHA MUL a1, b1, t1 #ifndef TRMMKERNEL blbs K, $L78 #else blbs TMP1, $L78 #endif .align 4 ADD c05, t2, c05 MUL a1, b2, t2 LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) ADD c01, t1, c01 LD b2, 1 * SIZE(BO) lda AO, 1 * SIZE(AO) MUL a1, b1, t1 lda BO, 2 * SIZE(BO) .align 4 $L78: ADD c05, t2, c05 MUL a1, b2, t2 #ifndef TRMMKERNEL LD a5, 0 * SIZE(C1) #else unop #endif ADD c02, t3, c02 ADD c06, t4, c06 #ifndef TRMMKERNEL LD b5, 0 * SIZE(C2) #else unop #endif ADD c01, c02, c01 ADD c05, c06, c05 ADD c01, t1, c01 ADD c05, t2, c05 MUL alpha, c01, c01 MUL alpha, c05, c05 #ifndef TRMMKERNEL ADD c01, a5, c01 ADD c05, b5, c05 #endif ST c01, 0 * SIZE(C1) ST c05, 0 * SIZE(C2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) subq K, KK, TMP1 #ifdef LEFT subq TMP1, 1, TMP1 #else subq TMP1, 2, TMP1 #endif sll TMP1, BASE_SHIFT + 0, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq BO, TMP2, BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq KK, 1, KK #endif .align 4 $L79: mov BO, B #if defined(TRMMKERNEL) && !defined(LEFT) addq KK, 2, KK #else unop #endif unop unop .align 4 $L80: and N, 1, J ble J, $L999 mov C, C1 mov A, AO #if defined(TRMMKERNEL) && defined(LEFT) mov OFFSET, KK #endif sra M, 2, I ble I, $L100 .align 4 $L91: #if !defined(TRMMKERNEL) || \ 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #ifdef TRMMKERNEL #ifdef LEFT addq KK, 4, TMP1 #else addq KK, 1, TMP1 #endif #endif LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c02 LD b3, 2 * SIZE(B) fclr c03 LD b4, 3 * SIZE(B) fclr c04 #ifndef TRMMKERNEL sra K, 2, L #else sra TMP1, 2, L #endif mov B, BO unop ble L, $L95 #else sll KK, BASE_SHIFT + 2, TMP1 addq AO, TMP1, AO sll KK, BASE_SHIFT + 0, TMP2 addq B, TMP2, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c02 LD b3, 2 * SIZE(BO) fclr c03 LD b4, 3 * SIZE(BO) fclr c04 #ifndef TRMMKERNEL sra K, 2, L #else sra TMP1, 2, L #endif unop ble L, $L95 #endif .align 5 $L92: ADD c01, t1, c01 unop MUL a1, b1, t1 LD a1, 4 * SIZE(AO) ADD c02, t2, c02 lda L, -1(L) MUL a2, b1, t2 LD a2, 5 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b1, t3 LD a3, 6 * SIZE(AO) ADD c04, t4, c04 MUL a4, b1, t4 LD a4, 7 * SIZE(AO) LD b1, 4 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b2, t1 LD a1, 8 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b2, t2 LD a2, 9 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b2, t3 LD a3, 10 * SIZE(AO) ADD c04, t4, c04 MUL a4, b2, t4 LD a4, 11 * SIZE(AO) LD b2, 5 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b3, t1 LD a1, 12 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b3, t2 LD a2, 13 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b3, t3 LD a3, 14 * SIZE(AO) ADD c04, t4, c04 MUL a4, b3, t4 LD a5, 15 * SIZE(AO) LD b3, 6 * SIZE(BO) ADD c01, t1, c01 MUL a1, b4, t1 LD a1, 16 * SIZE(AO) lda AO, 16 * SIZE(AO) ADD c02, t2, c02 lda BO, 4 * SIZE(BO) MUL a2, b4, t2 LD a2, 1 * SIZE(AO) ADD c03, t3, c03 LD a4, 3 * SIZE(AO) MUL a3, b4, t3 LD a3, 2 * SIZE(AO) ADD c04, t4, c04 MUL a5, b4, t4 LD b4, 3 * SIZE(BO) bgt L, $L92 .align 4 $L95: #ifndef TRMMKERNEL and K, 3, L #else and TMP1, 3, L #endif ldt alpha, ALPHA unop ble L, $L98 .align 4 $L96: ADD c01, t1, c01 lda L, -1(L) MUL a1, b1, t1 LD a1, 4 * SIZE(AO) ADD c02, t2, c02 lda BO, 1 * SIZE(BO) MUL a2, b1, t2 LD a2, 5 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b1, t3 LD a3, 6 * SIZE(AO) ADD c04, t4, c04 MUL a4, b1, t4 LD a4, 7 * SIZE(AO) LD b1, 0 * SIZE(BO) lda AO, 4 * SIZE(AO) bgt L, $L96 .align 4 $L98: #ifndef TRMMKERNEL ADD c01, t1, c01 LD c05, 0 * SIZE(C1) ADD c02, t2, c02 LD c06, 1 * SIZE(C1) ADD c03, t3, c03 LD c07, 2 * SIZE(C1) ADD c04, t4, c04 LD c08, 3 * SIZE(C1) #else ADD c01, t1, c01 ADD c02, t2, c02 ADD c03, t3, c03 ADD c04, t4, c04 #endif MUL alpha, c01, c01 MUL alpha, c02, c02 MUL alpha, c03, c03 MUL alpha, c04, c04 #ifndef TRMMKERNEL ADD c01, c05, c01 ADD c02, c06, c02 ADD c03, c07, c03 ADD c04, c08, c04 #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c03, 2 * SIZE(C1) ST c04, 3 * SIZE(C1) lda C1, 4 * SIZE(C1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) subq K, KK, TMP1 #ifdef LEFT subq TMP1, 4, TMP1 #else subq TMP1, 1, TMP1 #endif sll TMP1, BASE_SHIFT + 2, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 0, TMP2 addq BO, TMP2, BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq KK, 4, KK #endif lda I, -1(I) bgt I, $L91 .align 4 $L100: and M, 2, I unop unop ble I, $L110 .align 4 $L101: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) #ifdef TRMMKERNEL #ifdef LEFT addq KK, 2, TMP1 #else addq KK, 1, TMP1 #endif #endif LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c02 LD b3, 2 * SIZE(B) fclr c03 LD b4, 3 * SIZE(B) fclr c04 #ifndef TRMMKERNEL sra K, 2, L #else sra TMP1, 2, L #endif mov B, BO unop ble L, $L105 #else sll KK, BASE_SHIFT + 1, TMP1 addq AO, TMP1, AO sll KK, BASE_SHIFT + 0, TMP2 addq B, TMP2, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c02 LD b3, 2 * SIZE(BO) fclr c03 LD b4, 3 * SIZE(BO) fclr c04 #ifndef TRMMKERNEL sra K, 2, L #else sra TMP1, 2, L #endif unop ble L, $L105 #endif .align 5 $L102: ADD c01, t1, c01 lda L, -1(L) MUL a1, b1, t1 LD a1, 4 * SIZE(AO) ADD c02, t2, c02 MUL a2, b1, t2 LD a2, 5 * SIZE(AO) LD b1, 4 * SIZE(BO) ADD c03, t3, c03 lda BO, 4 * SIZE(BO) MUL a3, b2, t3 LD a3, 6 * SIZE(AO) ADD c04, t4, c04 MUL a4, b2, t4 LD a5, 7 * SIZE(AO) LD b2, 1 * SIZE(BO) ADD c01, t1, c01 MUL a1, b3, t1 LD a1, 8 * SIZE(AO) lda AO, 8 * SIZE(AO) ADD c02, t2, c02 MUL a2, b3, t2 LD b3, 2 * SIZE(BO) LD a2, 1 * SIZE(AO) ADD c03, t3, c03 LD a4, 3 * SIZE(AO) MUL a3, b4, t3 LD a3, 2 * SIZE(AO) ADD c04, t4, c04 MUL a5, b4, t4 LD b4, 3 * SIZE(BO) bgt L, $L102 .align 4 $L105: #ifndef TRMMKERNEL and K, 3, L #else and TMP1, 3, L #endif ldt alpha, ALPHA #ifndef TRMMKERNEL LD a3, 0 * SIZE(C1) LD a4, 1 * SIZE(C1) #endif ble L, $L108 .align 4 $L106: ADD c01, t1, c01 lda L, -1(L) MUL a1, b1, t1 LD a1, 2 * SIZE(AO) ADD c02, t2, c02 MUL a2, b1, t2 LD a2, 3 * SIZE(AO) LD b1, 1 * SIZE(BO) lda AO, 2 * SIZE(AO) unop lda BO, 1 * SIZE(BO) bgt L, $L106 .align 4 $L108: ADD c01, t1, c01 fclr t1 ADD c02, t2, c02 fclr t2 ADD c03, t3, c03 fclr t3 ADD c04, t4, c04 fclr t4 ADD c01, c03, c01 ADD c02, c04, c02 MUL alpha, c01, c01 MUL alpha, c02, c02 #ifndef TRMMKERNEL ADD c01, a3, c01 ADD c02, a4, c02 #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) lda C1, 2 * SIZE(C1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) subq K, KK, TMP1 #ifdef LEFT subq TMP1, 2, TMP1 #else subq TMP1, 1, TMP1 #endif sll TMP1, BASE_SHIFT + 1, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 0, TMP2 addq BO, TMP2, BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq KK, 2, KK #endif .align 4 $L110: and M, 1, I ble I, $L999 .align 4 $L111: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #ifdef TRMMKERNEL #ifdef LEFT addq KK, 1, TMP1 #else addq KK, 1, TMP1 #endif #endif LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c02 LD b3, 2 * SIZE(B) fclr c03 LD b4, 3 * SIZE(B) fclr c04 #ifndef TRMMKERNEL sra K, 2, L #else sra TMP1, 2, L #endif mov B, BO unop ble L, $L115 #else sll KK, BASE_SHIFT + 0, TMP1 addq AO, TMP1, AO sll KK, BASE_SHIFT + 0, TMP2 addq B, TMP2, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c02 LD b3, 2 * SIZE(BO) fclr c03 LD b4, 3 * SIZE(BO) fclr c04 #ifndef TRMMKERNEL sra K, 2, L #else sra TMP1, 2, L #endif unop ble L, $L115 #endif .align 4 
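/* Editorial annotation (not in the original source): the $L112/$L116/$L118 code
   below handles the final single-row, single-column case, i.e. one remaining
   element of C. A rough C sketch of what it computes, where kc stands for K
   (or the TRMM-adjusted count kept in TMP1) and FLOAT is the build's scalar type:

       FLOAT s = 0.0;
       for (k = 0; k < kc; k++)
           s += ao[k] * bo[k];
       // GEMM build: C1[0] += alpha * s;   TRMM build: C1[0] = alpha * s;

   The assembly unrolls this loop by four (accumulators c01..c04 with
   temporaries t1..t4) and folds the partial sums together at $L118. */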
$L112: ADD c01, t1, c01 MUL a1, b1, t1 LD a1, 4 * SIZE(AO) LD b1, 4 * SIZE(BO) ADD c02, t2, c02 MUL a2, b2, t2 LD a2, 5 * SIZE(AO) LD b2, 5 * SIZE(BO) ADD c03, t3, c03 MUL a3, b3, t3 LD a3, 6 * SIZE(AO) LD b3, 6 * SIZE(BO) ADD c04, t4, c04 MUL a4, b4, t4 LD a4, 7 * SIZE(AO) LD b4, 7 * SIZE(BO) lda L, -1(L) lda AO, 4 * SIZE(AO) lda BO, 4 * SIZE(BO) bgt L, $L112 .align 4 $L115: #ifndef TRMMKERNEL and K, 3, L #else and TMP1, 3, L #endif ldt alpha, ALPHA #ifndef TRMMKERNEL LD a2, 0 * SIZE(C1) #endif ble L, $L118 .align 4 $L116: ADD c01, t1, c01 MUL a1, b1, t1 LD a1, 1 * SIZE(AO) LD b1, 1 * SIZE(BO) lda L, -1(L) lda AO, 1 * SIZE(AO) lda BO, 1 * SIZE(BO) bgt L, $L116 .align 4 $L118: ADD c01, t1, c01 ADD c02, t2, c02 ADD c03, t3, c03 ADD c04, t4, c04 ADD c01, c02, c01 ADD c03, c04, c03 ADD c01, c03, c01 MUL alpha, c01, c01 #ifndef TRMMKERNEL ADD c01, a2, c01 #endif ST c01, 0 * SIZE(C1) .align 4 $L999: ldt $f2, 0($sp) ldt $f3, 8($sp) ldt $f4, 16($sp) ldt $f5, 24($sp) ldt $f6, 32($sp) ldt $f7, 40($sp) ldt $f8, 48($sp) ldt $f9, 56($sp) clr $0 lda $sp, STACKSIZE($sp) ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/gemv_n.S000066400000000000000000000534651313527062700171250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define STACKSIZE 64 #define PREFETCHSIZE 32 #define M $16 #define N $17 #define A $20 #define LDA $21 #define X $18 #define INCX $19 #define Y $22 #define INCY $23 #define BUFFER $24 #define I $25 #define J $27 #define Y1 $4 #define A1 $5 #define A2 $6 #define A3 $7 #define A4 $8 #define alpha $f19 #define alpha1 $f0 #define alpha2 $f1 #define alpha3 $f10 #define alpha4 $f11 #define y0 $f12 #define y1 $f13 #define y2 $f14 #define y3 $f15 #define y4 $f16 #define y5 $f17 #define y6 $f18 #define y7 $f21 #define a0 $f22 #define a1 $f23 #define a2 $f24 #define a3 $f25 #define a4 $f26 #define a5 $f27 #define a6 $f28 #define a7 $f29 #define a8 $f2 #define a9 $f3 #define a10 $f4 #define a11 $f5 #define a12 $f6 #define a13 $f7 #define a14 $f8 #define a15 $f9 PROLOGUE lda $sp, -STACKSIZE($sp) ldq X, 0 + STACKSIZE($sp) ldq INCX, 8 + STACKSIZE($sp) ldq Y, 16 + STACKSIZE($sp) ldq INCY, 24 + STACKSIZE($sp) ldq BUFFER, 32 + STACKSIZE($sp) stt $f2, 0($sp) stt $f3, 8($sp) stt $f4, 16($sp) stt $f5, 24($sp) stt $f6, 32($sp) stt $f7, 40($sp) stt $f8, 48($sp) stt $f9, 56($sp) PROFCODE cmple M, 0, $0 SXADDQ INCX, 0, INCX cmple N, 0, $1 SXADDQ INCY, 0, INCY or $0, $1, $0 bne $0, $L999 SXADDQ LDA, 0, LDA cmpeq INCY, SIZE, $0 bne $0, $L10 mov BUFFER, Y1 mov Y, BUFFER mov Y1, Y sra M, 3, I ble I, $L05 .align 4 $L02: ST $f31, 0 * SIZE(Y1) ST $f31, 1 * SIZE(Y1) ST $f31, 2 * SIZE(Y1) ST $f31, 3 * SIZE(Y1) ST $f31, 4 * SIZE(Y1) ST $f31, 5 * SIZE(Y1) ST $f31, 6 * SIZE(Y1) ST $f31, 7 * SIZE(Y1) lda Y1, 8 * SIZE(Y1) lda I, -1(I) bgt I, $L02 .align 4 $L05: and M, 7, I ble I, $L10 .align 4 $L06: ST $f31, 0 * SIZE(Y1) addq Y1, SIZE, Y1 lda I, -1(I) bgt I, $L06 .align 4 $L10: sra N, 2, J ble J, $L20 .align 4 $L11: LD alpha1, 0 * SIZE(X) addq X, INCX, X LD alpha2, 0 * SIZE(X) addq X, INCX, X LD alpha3, 0 * SIZE(X) addq X, INCX, X LD alpha4, 0 * SIZE(X) addq X, INCX, X MUL alpha, alpha1, alpha1 MUL alpha, alpha2, alpha2 MUL alpha, alpha3, alpha3 MUL alpha, alpha4, alpha4 mov A, A1 addq A, LDA, A2 addq A2, LDA, A3 addq A3, LDA, A4 s4addq LDA, A, A mov Y, Y1 ldl $31, 4 * SIZE(X) sra M, 3, I ble I, $L15 LD a0, 0 * SIZE(A1) LD a1, 1 * SIZE(A1) LD a2, 2 * SIZE(A1) LD a3, 3 * SIZE(A1) LD a4, 0 * SIZE(A2) LD a5, 1 * SIZE(A2) LD a6, 2 * SIZE(A2) LD a7, 3 * SIZE(A2) LD y0, 0 * SIZE(Y1) LD y1, 1 * SIZE(Y1) LD y2, 2 * SIZE(Y1) LD y3, 3 * SIZE(Y1) LD a8, 0 * SIZE(A3) LD a9, 1 * SIZE(A3) LD a10, 2 * SIZE(A3) LD a11, 3 * SIZE(A3) LD y4, 4 * SIZE(Y1) LD y5, 5 * SIZE(Y1) LD y6, 6 * SIZE(Y1) LD y7, 7 * SIZE(Y1) MUL alpha1, a0, a0 LD a12, 0 * SIZE(A4) MUL alpha1, a1, a1 LD a13, 1 * SIZE(A4) MUL alpha1, a2, a2 LD a14, 2 * SIZE(A4) MUL alpha1, a3, a3 LD a15, 3 * SIZE(A4) ADD y0, a0, y0 LD a0, 4 * SIZE(A1) MUL alpha2, a4, a4 unop ADD y1, a1, y1 LD a1, 5 * SIZE(A1) MUL alpha2, a5, a5 unop ADD y2, a2, y2 LD a2, 6 * SIZE(A1) MUL alpha2, a6, a6 unop ADD y3, a3, y3 LD a3, 7 * SIZE(A1) MUL alpha2, a7, a7 unop ADD y0, a4, y0 LD a4, 4 * SIZE(A2) MUL alpha3, a8, a8 unop ADD y1, a5, y1 LD a5, 5 * SIZE(A2) MUL alpha3, a9, a9 lda I, -1(I) ADD y2, a6, y2 LD a6, 6 * SIZE(A2) MUL alpha3, a10, a10 unop ADD y3, a7, y3 LD a7, 7 * SIZE(A2) MUL alpha3, a11, a11 unop ADD y0, a8, y0 LD a8, 4 * SIZE(A3) MUL alpha4, a12, a12 ble I, $L13 .align 4 $L12: ADD y1, a9, y1 LD a9, 5 * SIZE(A3) MUL alpha4, a13, a13 ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) ADD y2, a10, y2 LD a10, 6 * SIZE(A3) MUL alpha4, a14, a14 unop ADD y3, a11, y3 LD a11, 7 * 
SIZE(A3) MUL alpha4, a15, a15 lda I, -1(I) ADD y0, a12, y0 LD a12, 4 * SIZE(A4) MUL alpha1, a0, a0 lds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) ADD y1, a13, y1 LD a13, 5 * SIZE(A4) MUL alpha1, a1, a1 unop ADD y2, a14, y2 LD a14, 6 * SIZE(A4) MUL alpha1, a2, a2 unop ADD y3, a15, y3 LD a15, 7 * SIZE(A4) MUL alpha1, a3, a3 ldl $31, (PREFETCHSIZE + 0) * SIZE(A2) ADD y4, a0, y4 ST y0, 0 * SIZE(Y1) MUL alpha2, a4, a4 LD a0, 8 * SIZE(A1) ADD y5, a1, y5 ST y1, 1 * SIZE(Y1) MUL alpha2, a5, a5 LD a1, 9 * SIZE(A1) ADD y6, a2, y6 ST y2, 2 * SIZE(Y1) MUL alpha2, a6, a6 LD a2, 10 * SIZE(A1) ADD y7, a3, y7 ST y3, 3 * SIZE(Y1) MUL alpha2, a7, a7 LD a3, 11 * SIZE(A1) ADD y4, a4, y4 LD a4, 8 * SIZE(A2) MUL alpha3, a8, a8 LD y0, 8 * SIZE(Y1) ADD y5, a5, y5 LD a5, 9 * SIZE(A2) MUL alpha3, a9, a9 LD y1, 9 * SIZE(Y1) ADD y6, a6, y6 LD a6, 10 * SIZE(A2) MUL alpha3, a10, a10 LD y2, 10 * SIZE(Y1) ADD y7, a7, y7 LD a7, 11 * SIZE(A2) MUL alpha3, a11, a11 LD y3, 11 * SIZE(Y1) ADD y4, a8, y4 LD a8, 8 * SIZE(A3) MUL alpha4, a12, a12 ldl $31, (PREFETCHSIZE + 0) * SIZE(A3) ADD y5, a9, y5 LD a9, 9 * SIZE(A3) MUL alpha4, a13, a13 lda A1, 8 * SIZE(A1) ADD y6, a10, y6 LD a10, 10 * SIZE(A3) MUL alpha4, a14, a14 lda A2, 8 * SIZE(A2) ADD y7, a11, y7 LD a11, 11 * SIZE(A3) MUL alpha4, a15, a15 lda Y1, 8 * SIZE(Y1) ADD y4, a12, y4 LD a12, 8 * SIZE(A4) MUL alpha1, a0, a0 unop ADD y5, a13, y5 LD a13, 9 * SIZE(A4) MUL alpha1, a1, a1 lda A3, 8 * SIZE(A3) ADD y6, a14, y6 LD a14, 10 * SIZE(A4) MUL alpha1, a2, a2 ldl $31, (PREFETCHSIZE + 0) * SIZE(A4) ADD y7, a15, y7 LD a15, 11 * SIZE(A4) MUL alpha1, a3, a3 lda A4, 8 * SIZE(A4) ADD y0, a0, y0 LD a0, 4 * SIZE(A1) MUL alpha2, a4, a4 ST y4, -4 * SIZE(Y1) ADD y1, a1, y1 LD a1, 5 * SIZE(A1) MUL alpha2, a5, a5 ST y5, -3 * SIZE(Y1) ADD y2, a2, y2 LD a2, 6 * SIZE(A1) MUL alpha2, a6, a6 ST y6, -2 * SIZE(Y1) ADD y3, a3, y3 LD a3, 7 * SIZE(A1) MUL alpha2, a7, a7 ST y7, -1 * SIZE(Y1) ADD y0, a4, y0 LD a4, 4 * SIZE(A2) MUL alpha3, a8, a8 LD y4, 4 * SIZE(Y1) ADD y1, a5, y1 LD a5, 5 * SIZE(A2) MUL alpha3, a9, a9 LD y5, 5 * SIZE(Y1) ADD y2, a6, y2 LD a6, 6 * SIZE(A2) MUL alpha3, a10, a10 LD y6, 6 * SIZE(Y1) ADD y3, a7, y3 LD a7, 7 * SIZE(A2) MUL alpha3, a11, a11 LD y7, 7 * SIZE(Y1) ADD y0, a8, y0 LD a8, 4 * SIZE(A3) MUL alpha4, a12, a12 bgt I, $L12 .align 4 $L13: ADD y1, a9, y1 LD a9, 5 * SIZE(A3) MUL alpha4, a13, a13 unop ADD y2, a10, y2 LD a10, 6 * SIZE(A3) MUL alpha4, a14, a14 unop ADD y3, a11, y3 LD a11, 7 * SIZE(A3) MUL alpha4, a15, a15 unop ADD y0, a12, y0 LD a12, 4 * SIZE(A4) MUL alpha1, a0, a0 unop ADD y1, a13, y1 LD a13, 5 * SIZE(A4) MUL alpha1, a1, a1 unop ADD y2, a14, y2 LD a14, 6 * SIZE(A4) MUL alpha1, a2, a2 unop ADD y3, a15, y3 LD a15, 7 * SIZE(A4) MUL alpha1, a3, a3 unop ST y0, 0 * SIZE(Y1) ADD y4, a0, y4 unop MUL alpha2, a4, a4 ST y1, 1 * SIZE(Y1) ADD y5, a1, y5 unop MUL alpha2, a5, a5 ST y2, 2 * SIZE(Y1) ADD y6, a2, y6 unop MUL alpha2, a6, a6 ST y3, 3 * SIZE(Y1) ADD y7, a3, y7 lda Y1, 8 * SIZE(Y1) MUL alpha2, a7, a7 ADD y4, a4, y4 MUL alpha3, a8, a8 ADD y5, a5, y5 MUL alpha3, a9, a9 ADD y6, a6, y6 MUL alpha3, a10, a10 ADD y7, a7, y7 MUL alpha3, a11, a11 ADD y4, a8, y4 MUL alpha4, a12, a12 ADD y5, a9, y5 MUL alpha4, a13, a13 ADD y6, a10, y6 MUL alpha4, a14, a14 ADD y7, a11, y7 MUL alpha4, a15, a15 ADD y4, a12, y4 ADD y5, a13, y5 ADD y6, a14, y6 ADD y7, a15, y7 ST y4, -4 * SIZE(Y1) lda A1, 8 * SIZE(A1) ST y5, -3 * SIZE(Y1) lda A2, 8 * SIZE(A2) ST y6, -2 * SIZE(Y1) lda A3, 8 * SIZE(A3) ST y7, -1 * SIZE(Y1) lda A4, 8 * SIZE(A4) .align 4 $L15: and M, 4, I ble I, $L16 LD y0, 0 * SIZE(Y1) LD y1, 1 
* SIZE(Y1) LD y2, 2 * SIZE(Y1) LD y3, 3 * SIZE(Y1) LD a0, 0 * SIZE(A1) LD a1, 1 * SIZE(A1) LD a2, 2 * SIZE(A1) LD a3, 3 * SIZE(A1) LD a4, 0 * SIZE(A2) LD a5, 1 * SIZE(A2) LD a6, 2 * SIZE(A2) LD a7, 3 * SIZE(A2) LD a8, 0 * SIZE(A3) LD a9, 1 * SIZE(A3) LD a10, 2 * SIZE(A3) LD a11, 3 * SIZE(A3) MUL alpha1, a0, a0 LD a12, 0 * SIZE(A4) MUL alpha1, a1, a1 LD a13, 1 * SIZE(A4) MUL alpha1, a2, a2 LD a14, 2 * SIZE(A4) MUL alpha1, a3, a3 LD a15, 3 * SIZE(A4) ADD y0, a0, y0 MUL alpha2, a4, a4 ADD y1, a1, y1 MUL alpha2, a5, a5 ADD y2, a2, y2 MUL alpha2, a6, a6 ADD y3, a3, y3 MUL alpha2, a7, a7 ADD y0, a4, y0 MUL alpha3, a8, a8 ADD y1, a5, y1 MUL alpha3, a9, a9 ADD y2, a6, y2 MUL alpha3, a10, a10 ADD y3, a7, y3 MUL alpha3, a11, a11 ADD y0, a8, y0 MUL alpha4, a12, a12 ADD y1, a9, y1 MUL alpha4, a13, a13 ADD y2, a10, y2 MUL alpha4, a14, a14 ADD y3, a11, y3 MUL alpha4, a15, a15 ADD y0, a12, y0 lda Y1, 4 * SIZE(Y1) ADD y1, a13, y1 unop ADD y2, a14, y2 unop ADD y3, a15, y3 unop ST y0, -4 * SIZE(Y1) lda A1, 4 * SIZE(A1) ST y1, -3 * SIZE(Y1) lda A2, 4 * SIZE(A2) ST y2, -2 * SIZE(Y1) lda A3, 4 * SIZE(A3) ST y3, -1 * SIZE(Y1) lda A4, 4 * SIZE(A4) .align 4 $L16: and M, 2, I ble I, $L17 LD a0, 0 * SIZE(A1) LD a1, 1 * SIZE(A1) LD a2, 0 * SIZE(A2) LD a3, 1 * SIZE(A2) LD y0, 0 * SIZE(Y1) LD y1, 1 * SIZE(Y1) LD a4, 0 * SIZE(A3) MUL alpha1, a0, a0 LD a5, 1 * SIZE(A3) MUL alpha1, a1, a1 LD a6, 0 * SIZE(A4) MUL alpha2, a2, a2 LD a7, 1 * SIZE(A4) MUL alpha2, a3, a3 ADD y0, a0, y0 MUL alpha3, a4, a4 ADD y1, a1, y1 MUL alpha3, a5, a5 ADD y0, a2, y0 MUL alpha4, a6, a6 ADD y1, a3, y1 MUL alpha4, a7, a7 ADD y0, a4, y0 lda A1, 2 * SIZE(A1) ADD y1, a5, y1 lda A2, 2 * SIZE(A2) ADD y0, a6, y0 lda A3, 2 * SIZE(A3) ADD y1, a7, y1 lda A4, 2 * SIZE(A4) ST y0, 0 * SIZE(Y1) unop ST y1, 1 * SIZE(Y1) lda Y1, 2 * SIZE(Y1) .align 4 $L17: blbc M, $L18 LD y0, 0 * SIZE(Y1) LD a0, 0 * SIZE(A1) LD a1, 0 * SIZE(A2) LD a2, 0 * SIZE(A3) LD a3, 0 * SIZE(A4) MUL alpha1, a0, a0 MUL alpha2, a1, a1 MUL alpha3, a2, a2 MUL alpha4, a3, a3 ADD y0, a0, y0 ADD y0, a1, y0 ADD y0, a2, y0 ADD y0, a3, y0 ST y0, 0 * SIZE(Y1) .align 4 $L18: lda J, -1(J) bgt J, $L11 .align 4 $L20: and N, 2, J ble J, $L30 LD alpha1, 0 * SIZE(X) addq X, INCX, X LD alpha2, 0 * SIZE(X) addq X, INCX, X mov A, A1 MUL alpha, alpha1, alpha1 addq A, LDA, A2 MUL alpha, alpha2, alpha2 addq A2, LDA, A mov Y, Y1 sra M, 3, I ble I, $L25 LD a0, 0 * SIZE(A1) LD a1, 1 * SIZE(A1) LD a2, 2 * SIZE(A1) LD a3, 3 * SIZE(A1) LD a4, 0 * SIZE(A2) LD a5, 1 * SIZE(A2) LD a6, 2 * SIZE(A2) LD a7, 3 * SIZE(A2) LD y0, 0 * SIZE(Y1) LD y1, 1 * SIZE(Y1) LD y2, 2 * SIZE(Y1) LD y3, 3 * SIZE(Y1) MUL alpha1, a0, a0 LD y4, 4 * SIZE(Y1) MUL alpha1, a1, a1 LD y5, 5 * SIZE(Y1) MUL alpha1, a2, a2 LD y6, 6 * SIZE(Y1) MUL alpha1, a3, a3 LD y7, 7 * SIZE(Y1) ADD y0, a0, y0 LD a0, 4 * SIZE(A1) MUL alpha2, a4, a4 ADD y1, a1, y1 LD a1, 5 * SIZE(A1) MUL alpha2, a5, a5 ADD y2, a2, y2 LD a2, 6 * SIZE(A1) MUL alpha2, a6, a6 ADD y3, a3, y3 LD a3, 7 * SIZE(A1) MUL alpha2, a7, a7 ADD y0, a4, y0 LD a4, 4 * SIZE(A2) MUL alpha1, a0, a0 ADD y1, a5, y1 LD a5, 5 * SIZE(A2) MUL alpha1, a1, a1 ADD y2, a6, y2 LD a6, 6 * SIZE(A2) MUL alpha1, a2, a2 ADD y3, a7, y3 LD a7, 7 * SIZE(A2) MUL alpha1, a3, a3 lda I, -1(I) ble I, $L23 .align 4 $L22: ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) lda I, -1(I) ldl $31, (PREFETCHSIZE + 0) * SIZE(A2) lda A2, 8 * SIZE(A2) ADD y4, a0, y4 ST y0, 0 * SIZE(Y1) MUL alpha2, a4, a4 LD a0, 8 * SIZE(A1) ADD y5, a1, y5 ST y1, 1 * SIZE(Y1) MUL alpha2, a5, a5 LD a1, 9 * SIZE(A1) ADD y6, a2, y6 ST y2, 2 * SIZE(Y1) MUL alpha2, a6, a6 
LD a2, 10 * SIZE(A1) ADD y7, a3, y7 ST y3, 3 * SIZE(Y1) MUL alpha2, a7, a7 LD a3, 11 * SIZE(A1) ADD y4, a4, y4 LD a4, 0 * SIZE(A2) MUL alpha1, a0, a0 LD y0, 8 * SIZE(Y1) ADD y5, a5, y5 LD a5, 1 * SIZE(A2) MUL alpha1, a1, a1 LD y1, 9 * SIZE(Y1) ADD y6, a6, y6 LD a6, 2 * SIZE(A2) MUL alpha1, a2, a2 LD y2, 10 * SIZE(Y1) ADD y7, a7, y7 LD a7, 3 * SIZE(A2) MUL alpha1, a3, a3 LD y3, 11 * SIZE(Y1) ADD y0, a0, y0 ST y4, 4 * SIZE(Y1) MUL alpha2, a4, a4 LD a0, 12 * SIZE(A1) ADD y1, a1, y1 ST y5, 5 * SIZE(Y1) MUL alpha2, a5, a5 LD a1, 13 * SIZE(A1) ADD y2, a2, y2 ST y6, 6 * SIZE(Y1) MUL alpha2, a6, a6 LD a2, 14 * SIZE(A1) ADD y3, a3, y3 ST y7, 7 * SIZE(Y1) MUL alpha2, a7, a7 LD a3, 15 * SIZE(A1) ADD y0, a4, y0 LD a4, 4 * SIZE(A2) MUL alpha1, a0, a0 LD y4, 12 * SIZE(Y1) ADD y1, a5, y1 LD a5, 5 * SIZE(A2) MUL alpha1, a1, a1 LD y5, 13 * SIZE(Y1) ADD y2, a6, y2 LD a6, 6 * SIZE(A2) MUL alpha1, a2, a2 LD y6, 14 * SIZE(Y1) ADD y3, a7, y3 LD a7, 7 * SIZE(A2) MUL alpha1, a3, a3 LD y7, 15 * SIZE(Y1) lds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) lda A1, 8 * SIZE(A1) lda Y1, 8 * SIZE(Y1) bgt I, $L22 .align 4 $L23: ADD y4, a0, y4 ST y0, 0 * SIZE(Y1) MUL alpha2, a4, a4 unop ADD y5, a1, y5 ST y1, 1 * SIZE(Y1) MUL alpha2, a5, a5 unop ADD y6, a2, y6 ST y2, 2 * SIZE(Y1) MUL alpha2, a6, a6 unop ADD y7, a3, y7 ST y3, 3 * SIZE(Y1) MUL alpha2, a7, a7 unop ADD y4, a4, y4 ADD y5, a5, y5 ADD y6, a6, y6 ADD y7, a7, y7 ST y4, 4 * SIZE(Y1) lda A1, 8 * SIZE(A1) ST y5, 5 * SIZE(Y1) lda A2, 8 * SIZE(A2) ST y6, 6 * SIZE(Y1) unop ST y7, 7 * SIZE(Y1) lda Y1, 8 * SIZE(Y1) .align 4 $L25: and M, 4, I ble I, $L26 LD y0, 0 * SIZE(Y1) LD y1, 1 * SIZE(Y1) LD y2, 2 * SIZE(Y1) LD y3, 3 * SIZE(Y1) LD a0, 0 * SIZE(A1) LD a1, 1 * SIZE(A1) LD a2, 2 * SIZE(A1) LD a3, 3 * SIZE(A1) MUL alpha1, a0, a0 LD a4, 0 * SIZE(A2) MUL alpha1, a1, a1 LD a5, 1 * SIZE(A2) MUL alpha1, a2, a2 LD a6, 2 * SIZE(A2) MUL alpha1, a3, a3 LD a7, 3 * SIZE(A2) ADD y0, a0, y0 MUL alpha2, a4, a4 ADD y1, a1, y1 MUL alpha2, a5, a5 ADD y2, a2, y2 MUL alpha2, a6, a6 ADD y3, a3, y3 MUL alpha2, a7, a7 ADD y0, a4, y0 lda Y1, 4 * SIZE(Y1) ADD y1, a5, y1 unop ADD y2, a6, y2 unop ADD y3, a7, y3 unop ST y0, -4 * SIZE(Y1) lda A1, 4 * SIZE(A1) ST y1, -3 * SIZE(Y1) lda A2, 4 * SIZE(A2) ST y2, -2 * SIZE(Y1) lda A3, 4 * SIZE(A3) ST y3, -1 * SIZE(Y1) lda A4, 4 * SIZE(A4) .align 4 $L26: and M, 2, I ble I, $L27 LD a0, 0 * SIZE(A1) LD a1, 1 * SIZE(A1) LD a2, 0 * SIZE(A2) LD a3, 1 * SIZE(A2) LD y0, 0 * SIZE(Y1) LD y1, 1 * SIZE(Y1) MUL alpha1, a0, a0 MUL alpha1, a1, a1 MUL alpha2, a2, a2 MUL alpha2, a3, a3 ADD y0, a0, y0 lda A1, 2 * SIZE(A1) ADD y1, a1, y1 lda A2, 2 * SIZE(A2) ADD y0, a2, y0 unop ADD y1, a3, y1 unop ST y0, 0 * SIZE(Y1) unop ST y1, 1 * SIZE(Y1) lda Y1, 2 * SIZE(Y1) .align 4 $L27: blbc M, $L30 LD y0, 0 * SIZE(Y1) LD a0, 0 * SIZE(A1) LD a1, 0 * SIZE(A2) MUL alpha1, a0, a0 MUL alpha2, a1, a1 ADD y0, a0, y0 ADD y0, a1, y0 ST y0, 0 * SIZE(Y1) .align 4 $L30: blbc N, $L990 LD alpha1, 0 * SIZE(X) mov A, A1 MUL alpha, alpha1, alpha1 mov Y, Y1 sra M, 3, I ble I, $L35 LD a0, 0 * SIZE(A1) LD a1, 1 * SIZE(A1) LD a2, 2 * SIZE(A1) LD a3, 3 * SIZE(A1) LD a4, 4 * SIZE(A1) LD a5, 5 * SIZE(A1) LD a6, 6 * SIZE(A1) LD a7, 7 * SIZE(A1) LD y0, 0 * SIZE(Y1) LD y1, 1 * SIZE(Y1) LD y2, 2 * SIZE(Y1) LD y3, 3 * SIZE(Y1) LD y4, 4 * SIZE(Y1) LD y5, 5 * SIZE(Y1) LD y6, 6 * SIZE(Y1) LD y7, 7 * SIZE(Y1) MUL alpha1, a0, a0 MUL alpha1, a1, a1 MUL alpha1, a2, a2 MUL alpha1, a3, a3 lda I, -1(I) ble I, $L33 .align 4 $L32: ADD y0, a0, y0 LD y4, 4 * SIZE(Y1) MUL alpha1, a4, a4 LD a0, 8 * SIZE(A1) ADD y1, a1, y1 LD y5, 5 * 
SIZE(Y1) MUL alpha1, a5, a5 LD a1, 9 * SIZE(A1) ADD y2, a2, y2 LD y6, 6 * SIZE(Y1) MUL alpha1, a6, a6 LD a2, 10 * SIZE(A1) ADD y3, a3, y3 LD y7, 7 * SIZE(Y1) MUL alpha1, a7, a7 LD a3, 11 * SIZE(A1) ST y0, 0 * SIZE(Y1) ST y1, 1 * SIZE(Y1) ST y2, 2 * SIZE(Y1) ST y3, 3 * SIZE(Y1) ADD y4, a4, y4 LD y0, 8 * SIZE(Y1) MUL alpha1, a0, a0 LD a4, 12 * SIZE(A1) ADD y5, a5, y5 LD y1, 9 * SIZE(Y1) MUL alpha1, a1, a1 LD a5, 13 * SIZE(A1) ADD y6, a6, y6 LD y2, 10 * SIZE(Y1) MUL alpha1, a2, a2 LD a6, 14 * SIZE(A1) ADD y7, a7, y7 LD y3, 11 * SIZE(Y1) MUL alpha1, a3, a3 LD a7, 15 * SIZE(A1) ST y4, 4 * SIZE(Y1) lda I, -1(I) ST y5, 5 * SIZE(Y1) lda A1, 8 * SIZE(A1) ST y6, 6 * SIZE(Y1) ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) ST y7, 7 * SIZE(Y1) lds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) lda Y1, 8 * SIZE(Y1) bgt I, $L32 .align 4 $L33: ADD y0, a0, y0 LD y4, 4 * SIZE(Y1) MUL alpha1, a4, a4 unop ADD y1, a1, y1 LD y5, 5 * SIZE(Y1) MUL alpha1, a5, a5 unop ADD y2, a2, y2 LD y6, 6 * SIZE(Y1) MUL alpha1, a6, a6 unop ADD y3, a3, y3 LD y7, 7 * SIZE(Y1) MUL alpha1, a7, a7 unop ADD y4, a4, y4 ST y0, 0 * SIZE(Y1) ADD y5, a5, y5 ST y1, 1 * SIZE(Y1) ADD y6, a6, y6 ST y2, 2 * SIZE(Y1) ADD y7, a7, y7 ST y3, 3 * SIZE(Y1) ST y4, 4 * SIZE(Y1) unop ST y5, 5 * SIZE(Y1) unop ST y6, 6 * SIZE(Y1) lda A1, 8 * SIZE(A1) ST y7, 7 * SIZE(Y1) lda Y1, 8 * SIZE(Y1) .align 4 $L35: and M, 4, I ble I, $L36 LD a0, 0 * SIZE(A1) LD a1, 1 * SIZE(A1) LD a2, 2 * SIZE(A1) LD a3, 3 * SIZE(A1) MUL alpha1, a0, a0 LD y0, 0 * SIZE(Y1) MUL alpha1, a1, a1 LD y1, 1 * SIZE(Y1) MUL alpha1, a2, a2 LD y2, 2 * SIZE(Y1) MUL alpha1, a3, a3 LD y3, 3 * SIZE(Y1) ADD y0, a0, y0 ADD y1, a1, y1 ADD y2, a2, y2 ADD y3, a3, y3 ST y0, 0 * SIZE(Y1) lda A1, 4 * SIZE(A1) ST y1, 1 * SIZE(Y1) lda A2, 4 * SIZE(A2) ST y2, 2 * SIZE(Y1) unop ST y3, 3 * SIZE(Y1) lda Y1, 4 * SIZE(Y1) .align 4 $L36: and M, 2, I ble I, $L37 LD a0, 0 * SIZE(A1) LD a1, 1 * SIZE(A1) LD y0, 0 * SIZE(Y1) MUL alpha1, a0, a0 LD y1, 1 * SIZE(Y1) MUL alpha1, a1, a1 ADD y0, a0, y0 ADD y1, a1, y1 ST y0, 0 * SIZE(Y1) lda A1, 2 * SIZE(A1) ST y1, 1 * SIZE(Y1) lda Y1, 2 * SIZE(Y1) .align 4 $L37: blbc M, $L990 LD y0, 0 * SIZE(Y1) LD a0, 0 * SIZE(A1) MUL alpha1, a0, a0 ADD y0, a0, y0 ST y0, 0 * SIZE(Y1) .align 4 $L990: cmpeq INCY, SIZE, $0 bne $0, $L999 mov BUFFER, Y1 sra M, 3, I ble I, $L995 .align 4 $L992: LD a0, 0 * SIZE(BUFFER) addq BUFFER, INCY, BUFFER LD a1, 0 * SIZE(BUFFER) addq BUFFER, INCY, BUFFER LD a2, 0 * SIZE(BUFFER) addq BUFFER, INCY, BUFFER LD a3, 0 * SIZE(BUFFER) addq BUFFER, INCY, BUFFER LD y0, 0 * SIZE(Y) LD y1, 1 * SIZE(Y) LD y2, 2 * SIZE(Y) LD y3, 3 * SIZE(Y) LD a4, 0 * SIZE(BUFFER) addq BUFFER, INCY, BUFFER LD a5, 0 * SIZE(BUFFER) addq BUFFER, INCY, BUFFER LD a6, 0 * SIZE(BUFFER) addq BUFFER, INCY, BUFFER LD a7, 0 * SIZE(BUFFER) addq BUFFER, INCY, BUFFER LD y4, 4 * SIZE(Y) LD y5, 5 * SIZE(Y) LD y6, 6 * SIZE(Y) LD y7, 7 * SIZE(Y) ADD a0, y0, a0 ADD a1, y1, a1 ADD a2, y2, a2 ADD a3, y3, a3 ADD a4, y4, a4 ADD a5, y5, a5 ADD a6, y6, a6 ADD a7, y7, a7 ST a0, 0 * SIZE(Y1) addq Y1, INCY, Y1 ST a1, 0 * SIZE(Y1) addq Y1, INCY, Y1 ST a2, 0 * SIZE(Y1) addq Y1, INCY, Y1 ST a3, 0 * SIZE(Y1) addq Y1, INCY, Y1 ST a4, 0 * SIZE(Y1) addq Y1, INCY, Y1 ST a5, 0 * SIZE(Y1) addq Y1, INCY, Y1 ST a6, 0 * SIZE(Y1) addq Y1, INCY, Y1 ST a7, 0 * SIZE(Y1) addq Y1, INCY, Y1 lda I, -1(I) lda Y, 8 * SIZE(Y) bgt I, $L992 .align 4 $L995: and M, 7, I ble I, $L999 .align 4 $L996: LD a0, 0 * SIZE(BUFFER) addq BUFFER, INCY, BUFFER LD y0, 0 * SIZE(Y) lda Y, 1 * SIZE(Y) ADD a0, y0, a0 ST a0, 0 * SIZE(Y1) addq Y1, INCY, Y1 lda I, -1(I) bgt I, 
$L996 .align 4 $L999: ldt $f2, 0($sp) ldt $f3, 8($sp) ldt $f4, 16($sp) ldt $f5, 24($sp) ldt $f6, 32($sp) ldt $f7, 40($sp) ldt $f8, 48($sp) ldt $f9, 56($sp) lda $sp, STACKSIZE($sp) ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/gemv_t.S000066400000000000000000000417071313527062700171270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define STACKSIZE 64 #define PREFETCHSIZE 32 #define M $16 #define N $17 #define A $20 #define LDA $21 #define X $18 #define INCX $19 #define Y $22 #define INCY $23 #define BUFFER $24 #define I $25 #define J $27 #define X1 $3 #define Y1 $4 #define A1 $5 #define A2 $6 #define A3 $7 #define A4 $8 #define alpha $f19 #define s0 $f0 #define s1 $f1 #define s2 $f10 #define s3 $f11 #define t0 $f12 #define t1 $f13 #define t2 $f14 #define t3 $f15 #define x0 $f16 #define x1 $f17 #define x2 $f18 #define x3 $f21 #define a0 $f22 #define a1 $f23 #define a2 $f24 #define a3 $f25 #define a4 $f26 #define a5 $f27 #define a6 $f28 #define a7 $f29 #define a8 $f2 #define a9 $f3 #define a10 $f4 #define a11 $f5 #define a12 $f6 #define a13 $f7 #define a14 $f8 #define a15 $f9 PROLOGUE lda $sp, -STACKSIZE($sp) ldq X, 0 + STACKSIZE($sp) ldq INCX, 8 + STACKSIZE($sp) ldq Y, 16 + STACKSIZE($sp) ldq INCY, 24 + STACKSIZE($sp) ldq BUFFER, 32 + STACKSIZE($sp) stt $f2, 0($sp) stt $f3, 8($sp) stt $f4, 16($sp) stt $f5, 24($sp) stt $f6, 32($sp) stt $f7, 40($sp) stt $f8, 48($sp) stt $f9, 56($sp) PROFCODE cmple M, 0, $0 SXADDQ INCX, 0, INCX cmple N, 0, $1 SXADDQ INCY, 0, INCY or $0, $1, $0 bne $0, $L999 cmpeq INCX, SIZE, $0 mov X, X1 SXADDQ LDA, 0, LDA bne $0, $L10 sra M, 3, I mov BUFFER, Y1 mov BUFFER, X ble I, $L05 .align 4 $L02: ldl $31, (PREFETCHSIZE + 0) * SIZE(X1) lda I, -1(I) LD a0, 0 * SIZE(X1) addq X1, INCX, X1 LD a1, 0 * SIZE(X1) addq X1, INCX, X1 LD a2, 0 * SIZE(X1) addq X1, INCX, X1 LD a3, 0 * SIZE(X1) addq X1, INCX, X1 ST a0, 0 * SIZE(Y1) ST a1, 1 * SIZE(Y1) ST a2, 2 * SIZE(Y1) ST a3, 3 * SIZE(Y1) LD a4, 0 * SIZE(X1) addq X1, INCX, X1 LD a5, 0 * SIZE(X1) addq X1, INCX, X1 LD a6, 0 * SIZE(X1) addq X1, INCX, X1 LD a7, 0 * SIZE(X1) addq X1, INCX, X1 ST a4, 4 * SIZE(Y1) ST a5, 5 * SIZE(Y1) ST a6, 6 * SIZE(Y1) ST a7, 7 * SIZE(Y1) lda Y1, 8 * SIZE(Y1) bgt I, $L02 .align 4 $L05: and M, 7, I ble I, $L10 .align 4 $L06: LD a0, 0 * SIZE(X1) addq X1, INCX, X1 ST a0, 0 * SIZE(Y1) addq Y1, SIZE, Y1 lda I, -1(I) bgt I, $L06 .align 4 $L10: mov Y, Y1 fclr t0 unop fclr t1 sra N, 2, J fclr t2 fclr t3 ble J, $L20 .align 4 $L11: mov A, A1 fclr s0 addq A, LDA, A2 fclr s1 addq A2, LDA, A3 fclr s2 addq A3, LDA, A4 fclr s3 s4addq LDA, A, A unop mov X, X1 lds $f31, 3 * SIZE(Y) sra M, 3, I ble I, $L15 LD x0, 0 * SIZE(X1) LD x1, 1 * SIZE(X1) LD x2, 2 * SIZE(X1) LD a0, 0 * SIZE(A1) LD a1, 0 * SIZE(A2) LD a2, 0 * SIZE(A3) LD a3, 0 * SIZE(A4) LD a4, 1 * SIZE(A1) LD a5, 1 * SIZE(A2) LD a6, 1 * SIZE(A3) LD a7, 1 * SIZE(A4) LD a8, 2 * SIZE(A1) LD a9, 2 * SIZE(A2) LD a10, 2 * SIZE(A3) LD a11, 2 * SIZE(A4) LD a12, 3 * SIZE(A1) LD a13, 3 * SIZE(A2) LD a14, 3 * SIZE(A3) LD a15, 3 * SIZE(A4) lda I, -1(I) ble I, $L13 .align 4 $L12: ADD s0, t0, s0 LD x3, 3 * SIZE(X1) MUL x0, a0, t0 LD a0, 4 * SIZE(A1) ADD s1, t1, s1 ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) MUL x0, a1, t1 LD a1, 4 * SIZE(A2) ADD s2, t2, s2 unop MUL x0, a2, t2 LD a2, 4 * SIZE(A3) ADD s3, t3, s3 unop MUL x0, a3, t3 LD a3, 4 * SIZE(A4) ADD s0, t0, s0 LD x0, 4 * SIZE(X1) MUL x1, a4, t0 LD a4, 5 * SIZE(A1) ADD s1, t1, s1 lda A1, 8 * SIZE(A1) MUL x1, a5, t1 LD a5, 5 * SIZE(A2) ADD s2, t2, s2 unop MUL x1, a6, t2 LD a6, 5 * SIZE(A3) ADD s3, t3, s3 unop MUL x1, a7, t3 LD a7, 5 * SIZE(A4) ADD s0, t0, s0 LD x1, 5 * SIZE(X1) MUL x2, a8, t0 LD a8, -2 * SIZE(A1) ADD s1, t1, s1 ldl $31, (PREFETCHSIZE + 0) * SIZE(A2) MUL x2, a9, t1 LD a9, 6 * SIZE(A2) ADD s2, t2, s2 lda A2, 8 * 
SIZE(A2) MUL x2, a10, t2 LD a10, 6 * SIZE(A3) ADD s3, t3, s3 lda A3, 8 * SIZE(A3) MUL x2, a11, t3 LD a11, 6 * SIZE(A4) ADD s0, t0, s0 LD x2, 6 * SIZE(X1) MUL x3, a12, t0 LD a12, -1 * SIZE(A1) ADD s1, t1, s1 lda A4, 8 * SIZE(A4) MUL x3, a13, t1 LD a13, -1 * SIZE(A2) ADD s2, t2, s2 unop MUL x3, a14, t2 LD a14, -1 * SIZE(A3) ADD s3, t3, s3 unop MUL x3, a15, t3 LD a15, -1 * SIZE(A4) ADD s0, t0, s0 LD x3, 7 * SIZE(X1) MUL x0, a0, t0 LD a0, 0 * SIZE(A1) ADD s1, t1, s1 ldl $31, (PREFETCHSIZE - 8) * SIZE(A3) MUL x0, a1, t1 LD a1, 0 * SIZE(A2) ADD s2, t2, s2 unop MUL x0, a2, t2 LD a2, 0 * SIZE(A3) ADD s3, t3, s3 unop MUL x0, a3, t3 LD a3, 0 * SIZE(A4) ADD s0, t0, s0 LD x0, 8 * SIZE(X1) MUL x1, a4, t0 LD a4, 1 * SIZE(A1) ADD s1, t1, s1 unop MUL x1, a5, t1 LD a5, 1 * SIZE(A2) ADD s2, t2, s2 unop MUL x1, a6, t2 LD a6, 1 * SIZE(A3) ADD s3, t3, s3 unop MUL x1, a7, t3 LD a7, 1 * SIZE(A4) ADD s0, t0, s0 LD x1, 9 * SIZE(X1) MUL x2, a8, t0 LD a8, 2 * SIZE(A1) ADD s1, t1, s1 ldl $31, (PREFETCHSIZE - 8) * SIZE(A4) MUL x2, a9, t1 LD a9, 2 * SIZE(A2) ADD s2, t2, s2 lda X1, 8 * SIZE(X1) MUL x2, a10, t2 LD a10, 2 * SIZE(A3) ADD s3, t3, s3 lda I, -1(I) MUL x2, a11, t3 LD a11, 2 * SIZE(A4) ADD s0, t0, s0 LD x2, 2 * SIZE(X1) MUL x3, a12, t0 LD a12, 3 * SIZE(A1) ADD s1, t1, s1 ldl $31, (PREFETCHSIZE - 8) * SIZE(X1) MUL x3, a13, t1 LD a13, 3 * SIZE(A2) ADD s2, t2, s2 unop MUL x3, a14, t2 LD a14, 3 * SIZE(A3) ADD s3, t3, s3 MUL x3, a15, t3 LD a15, 3 * SIZE(A4) bgt I, $L12 .align 4 $L13: ADD s0, t0, s0 LD x3, 3 * SIZE(X1) MUL x0, a0, t0 LD a0, 4 * SIZE(A1) ADD s1, t1, s1 unop MUL x0, a1, t1 LD a1, 4 * SIZE(A2) ADD s2, t2, s2 unop MUL x0, a2, t2 LD a2, 4 * SIZE(A3) ADD s3, t3, s3 unop MUL x0, a3, t3 LD a3, 4 * SIZE(A4) ADD s0, t0, s0 LD x0, 4 * SIZE(X1) MUL x1, a4, t0 LD a4, 5 * SIZE(A1) ADD s1, t1, s1 unop MUL x1, a5, t1 LD a5, 5 * SIZE(A2) ADD s2, t2, s2 unop MUL x1, a6, t2 LD a6, 5 * SIZE(A3) ADD s3, t3, s3 unop MUL x1, a7, t3 LD a7, 5 * SIZE(A4) ADD s0, t0, s0 LD x1, 5 * SIZE(X1) MUL x2, a8, t0 LD a8, 6 * SIZE(A1) ADD s1, t1, s1 unop MUL x2, a9, t1 LD a9, 6 * SIZE(A2) ADD s2, t2, s2 unop MUL x2, a10, t2 LD a10, 6 * SIZE(A3) ADD s3, t3, s3 unop MUL x2, a11, t3 LD a11, 6 * SIZE(A4) ADD s0, t0, s0 LD x2, 6 * SIZE(X1) MUL x3, a12, t0 LD a12, 7 * SIZE(A1) ADD s1, t1, s1 lda A1, 8 * SIZE(A1) MUL x3, a13, t1 LD a13, 7 * SIZE(A2) ADD s2, t2, s2 lda A2, 8 * SIZE(A2) MUL x3, a14, t2 LD a14, 7 * SIZE(A3) ADD s3, t3, s3 lda A3, 8 * SIZE(A3) MUL x3, a15, t3 LD a15, 7 * SIZE(A4) ADD s0, t0, s0 LD x3, 7 * SIZE(X1) MUL x0, a0, t0 unop ADD s1, t1, s1 lda X1, 8 * SIZE(X1) MUL x0, a1, t1 lda A4, 8 * SIZE(A4) ADD s2, t2, s2 MUL x0, a2, t2 ADD s3, t3, s3 MUL x0, a3, t3 ADD s0, t0, s0 MUL x1, a4, t0 ADD s1, t1, s1 MUL x1, a5, t1 ADD s2, t2, s2 MUL x1, a6, t2 ADD s3, t3, s3 MUL x1, a7, t3 ADD s0, t0, s0 MUL x2, a8, t0 ADD s1, t1, s1 MUL x2, a9, t1 ADD s2, t2, s2 MUL x2, a10, t2 ADD s3, t3, s3 MUL x2, a11, t3 ADD s0, t0, s0 MUL x3, a12, t0 ADD s1, t1, s1 MUL x3, a13, t1 ADD s2, t2, s2 MUL x3, a14, t2 ADD s3, t3, s3 MUL x3, a15, t3 .align 4 $L15: and M, 7, I ble I, $L18 LD x0, 0 * SIZE(X1) LD a0, 0 * SIZE(A1) LD a1, 0 * SIZE(A2) LD a2, 0 * SIZE(A3) LD a3, 0 * SIZE(A4) lda I, -1(I) ble I, $L17 .align 4 $L16: ADD s0, t0, s0 lda A4, 1 * SIZE(A4) MUL x0, a0, t0 LD a0, 1 * SIZE(A1) ADD s1, t1, s1 lda A1, 1 * SIZE(A1) MUL x0, a1, t1 LD a1, 1 * SIZE(A2) ADD s2, t2, s2 lda A2, 1 * SIZE(A2) MUL x0, a2, t2 LD a2, 1 * SIZE(A3) ADD s3, t3, s3 lda A3, 1 * SIZE(A3) MUL x0, a3, t3 LD a3, 0 * SIZE(A4) LD x0, 1 * SIZE(X1) lda X1, 1 * SIZE(X1) lda I, -1(I) 
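/* Editorial annotation (not in the original source): this $L16 loop walks the
   remaining M%8 rows one at a time, accumulating the four column dot products
   s0..s3 (s_j += A_j[i] * x[i]); the totals are scaled by alpha and added to
   the corresponding y entries at $L18. */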
bgt I, $L16 .align 4 $L17: ADD s0, t0, s0 MUL x0, a0, t0 ADD s1, t1, s1 MUL x0, a1, t1 ADD s2, t2, s2 MUL x0, a2, t2 ADD s3, t3, s3 MUL x0, a3, t3 .align 4 $L18: LD a0, 0 * SIZE(Y) addq Y, INCY, Y LD a1, 0 * SIZE(Y) addq Y, INCY, Y LD a2, 0 * SIZE(Y) addq Y, INCY, Y LD a3, 0 * SIZE(Y) addq Y, INCY, Y ADD s0, t0, s0 ADD s1, t1, s1 ADD s2, t2, s2 ADD s3, t3, s3 MUL alpha, s0, s0 MUL alpha, s1, s1 MUL alpha, s2, s2 MUL alpha, s3, s3 ADD a0, s0, a0 fclr t0 ADD a1, s1, a1 fclr t1 ADD a2, s2, a2 fclr t2 ADD a3, s3, a3 fclr t3 ST a0, 0 * SIZE(Y1) addq Y1, INCY, Y1 ST a1, 0 * SIZE(Y1) addq Y1, INCY, Y1 ST a2, 0 * SIZE(Y1) addq Y1, INCY, Y1 ST a3, 0 * SIZE(Y1) addq Y1, INCY, Y1 lda J, -1(J) bgt J, $L11 .align 4 $L20: and N, 2, J ble J, $L30 mov A, A1 addq A, LDA, A2 addq A2, LDA, A fclr s0 mov X, X1 fclr s1 sra M, 3, I fclr s2 fclr s3 ble I, $L25 LD a0, 0 * SIZE(A1) LD a1, 0 * SIZE(A2) LD a2, 1 * SIZE(A1) LD a3, 1 * SIZE(A2) LD a4, 2 * SIZE(A1) LD a5, 2 * SIZE(A2) LD a6, 3 * SIZE(A1) LD a7, 3 * SIZE(A2) LD a8, 4 * SIZE(A1) LD a9, 4 * SIZE(A2) LD a10, 5 * SIZE(A1) LD a11, 5 * SIZE(A2) LD a12, 6 * SIZE(A1) LD a13, 6 * SIZE(A2) LD a14, 7 * SIZE(A1) LD a15, 7 * SIZE(A2) LD x0, 0 * SIZE(X1) LD x1, 1 * SIZE(X1) LD x2, 2 * SIZE(X1) lda I, -1(I) ble I, $L23 .align 4 $L22: ADD s0, t0, s0 LD x3, 3 * SIZE(X1) MUL x0, a0, t0 LD a0, 8 * SIZE(A1) ADD s1, t1, s1 ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) MUL x0, a1, t1 LD a1, 8 * SIZE(A2) ADD s0, t2, s0 LD x0, 4 * SIZE(X1) MUL x1, a2, t2 LD a2, 9 * SIZE(A1) ADD s1, t3, s1 unop MUL x1, a3, t3 LD a3, 9 * SIZE(A2) ADD s0, t0, s0 LD x1, 5 * SIZE(X1) MUL x2, a4, t0 LD a4, 10 * SIZE(A1) ADD s1, t1, s1 lda I, -1(I) MUL x2, a5, t1 LD a5, 10 * SIZE(A2) ADD s0, t2, s0 LD x2, 6 * SIZE(X1) MUL x3, a6, t2 LD a6, 11 * SIZE(A1) ADD s1, t3, s1 lda X1, 8 * SIZE(X1) MUL x3, a7, t3 LD a7, 11 * SIZE(A2) ADD s0, t0, s0 LD x3, -1 * SIZE(X1) MUL x0, a8, t0 LD a8, 12 * SIZE(A1) ADD s1, t1, s1 ldl $31, (PREFETCHSIZE + 0) * SIZE(A2) MUL x0, a9, t1 LD a9, 12 * SIZE(A2) ADD s0, t0, s0 LD x0, 0 * SIZE(X1) MUL x1, a10, t0 LD a10, 13 * SIZE(A1) ADD s1, t1, s1 lda A1, 8 * SIZE(A1) MUL x1, a11, t1 LD a11, 13 * SIZE(A2) ADD s0, t0, s0 LD x1, 1 * SIZE(X1) MUL x2, a12, t0 LD a12, 6 * SIZE(A1) ADD s1, t1, s1 MUL x2, a13, t1 LD a13, 14 * SIZE(A2) lda A2, 8 * SIZE(A2) ADD s0, t0, s0 LD x2, 2 * SIZE(X1) MUL x3, a14, t0 LD a14, 7 * SIZE(A1) ADD s1, t1, s1 MUL x3, a15, t1 LD a15, 7 * SIZE(A2) bgt I, $L22 .align 4 $L23: ADD s0, t0, s0 LD x3, 3 * SIZE(X1) MUL x0, a0, t0 lda A1, 8 * SIZE(A1) ADD s1, t1, s1 unop MUL x0, a1, t1 unop ADD s0, t2, s0 LD x0, 4 * SIZE(X1) MUL x1, a2, t2 lda A2, 8 * SIZE(A2) ADD s1, t3, s1 unop MUL x1, a3, t3 unop ADD s0, t0, s0 LD x1, 5 * SIZE(X1) MUL x2, a4, t0 unop ADD s1, t1, s1 unop MUL x2, a5, t1 unop ADD s0, t2, s0 LD x2, 6 * SIZE(X1) MUL x3, a6, t2 unop ADD s1, t3, s1 unop MUL x3, a7, t3 unop ADD s0, t0, s0 LD x3, 7 * SIZE(X1) MUL x0, a8, t0 lda X1, 8 * SIZE(X1) ADD s1, t1, s1 unop MUL x0, a9, t1 unop ADD s0, t0, s0 MUL x1, a10, t0 ADD s1, t1, s1 MUL x1, a11, t1 ADD s0, t0, s0 MUL x2, a12, t0 ADD s1, t1, s1 MUL x2, a13, t1 ADD s0, t0, s0 MUL x3, a14, t0 ADD s1, t1, s1 MUL x3, a15, t1 .align 4 $L25: and M, 7, I ble I, $L28 LD a0, 0 * SIZE(A1) LD a1, 0 * SIZE(A2) LD x0, 0 * SIZE(X1) lda I, -1(I) ble I, $L27 .align 4 $L26: ADD s0, t0, s0 lda A2, 1 * SIZE(A2) MUL x0, a0, t0 LD a0, 1 * SIZE(A1) ADD s1, t1, s1 lda A1, 1 * SIZE(A1) MUL x0, a1, t1 LD a1, 0 * SIZE(A2) LD x0, 1 * SIZE(X1) lda X1, 1 * SIZE(X1) lda I, -1(I) bgt I, $L26 .align 4 $L27: ADD s0, t0, s0 MUL x0, a0, t0 ADD s1, 
t1, s1 MUL x0, a1, t1 .align 4 $L28: LD a0, 0 * SIZE(Y) addq Y, INCY, Y LD a1, 0 * SIZE(Y) addq Y, INCY, Y ADD s0, t0, s0 ADD s1, t1, s1 ADD s2, t2, s2 ADD s3, t3, s3 ADD s0, s2, s0 ADD s1, s3, s1 MUL alpha, s0, s0 MUL alpha, s1, s1 ADD a0, s0, a0 ADD a1, s1, a1 ST a0, 0 * SIZE(Y1) fclr t0 addq Y1, INCY, Y1 fclr t1 ST a1, 0 * SIZE(Y1) fclr t2 addq Y1, INCY, Y1 fclr t3 .align 4 $L30: blbc N, $L999 mov A, A1 fclr s0 mov X, X1 fclr s1 sra M, 3, I fclr s2 fclr s3 ble I, $L35 LD a0, 0 * SIZE(A1) LD a1, 1 * SIZE(A1) LD a8, 0 * SIZE(X1) LD a9, 1 * SIZE(X1) LD a2, 2 * SIZE(A1) LD a3, 3 * SIZE(A1) LD a10, 2 * SIZE(X1) LD a11, 3 * SIZE(X1) LD a4, 4 * SIZE(A1) LD a5, 5 * SIZE(A1) LD a12, 4 * SIZE(X1) LD a13, 5 * SIZE(X1) LD a6, 6 * SIZE(A1) LD a7, 7 * SIZE(A1) LD a14, 6 * SIZE(X1) lda I, -1(I) ble I, $L33 .align 4 $L32: ADD s0, t0, s0 LD a15, 7 * SIZE(X1) MUL a0, a8, t0 LD a0, 8 * SIZE(A1) ADD s1, t1, s1 LD a8, 8 * SIZE(X1) MUL a1, a9, t1 LD a1, 9 * SIZE(A1) ADD s2, t2, s2 LD a9, 9 * SIZE(X1) MUL a2, a10, t2 LD a2, 10 * SIZE(A1) ADD s3, t3, s3 LD a10, 10 * SIZE(X1) MUL a3, a11, t3 LD a3, 11 * SIZE(A1) ADD s0, t0, s0 LD a11, 11 * SIZE(X1) MUL a4, a12, t0 LD a4, 12 * SIZE(A1) ADD s1, t1, s1 LD a12, 12 * SIZE(X1) MUL a5, a13, t1 LD a5, 13 * SIZE(A1) ADD s2, t2, s2 LD a13, 13 * SIZE(X1) MUL a6, a14, t2 LD a6, 14 * SIZE(A1) ADD s3, t3, s3 LD a14, 14 * SIZE(X1) MUL a7, a15, t3 LD a7, 15 * SIZE(A1) lda A1, 8 * SIZE(A1) lda I, -1(I) lda X1, 8 * SIZE(X1) bgt I, $L32 .align 4 $L33: ADD s0, t0, s0 LD a15, 7 * SIZE(X1) MUL a0, a8, t0 lda A1, 8 * SIZE(A1) ADD s1, t1, s1 unop MUL a1, a9, t1 lda X1, 8 * SIZE(X1) ADD s2, t2, s2 MUL a2, a10, t2 ADD s3, t3, s3 MUL a3, a11, t3 ADD s0, t0, s0 MUL a4, a12, t0 ADD s1, t1, s1 MUL a5, a13, t1 ADD s2, t2, s2 MUL a6, a14, t2 ADD s3, t3, s3 MUL a7, a15, t3 .align 4 $L35: and M, 7, I ble I, $L38 LD a0, 0 * SIZE(A1) LD x0, 0 * SIZE(X1) lda I, -1(I) ble I, $L37 .align 4 $L36: ADD s0, t0, s0 MUL x0, a0, t0 LD a0, 1 * SIZE(A1) LD x0, 1 * SIZE(X1) lda A1, 1 * SIZE(A1) lda X1, 1 * SIZE(X1) lda I, -1(I) bgt I, $L36 .align 4 $L37: ADD s0, t0, s0 MUL x0, a0, t0 .align 4 $L38: LD a0, 0 * SIZE(Y) ADD s0, t0, s0 ADD s1, t1, s1 ADD s2, t2, s2 ADD s3, t3, s3 ADD s0, s2, s0 ADD s1, s3, s1 ADD s0, s1, s0 MUL alpha, s0, s0 ADD a0, s0, a0 ST a0, 0 * SIZE(Y1) .align 4 $L999: ldt $f2, 0($sp) ldt $f3, 8($sp) ldt $f4, 16($sp) ldt $f5, 24($sp) ldt $f6, 32($sp) ldt $f7, 40($sp) ldt $f8, 48($sp) ldt $f9, 56($sp) lda $sp, STACKSIZE($sp) ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/iamax.S000066400000000000000000000206511313527062700167400ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define N $16 #define X $17 #define INCX $18 #define XX $19 #ifndef USE_MIN #define CMPLT(a, b) cmptlt a, b #else #define CMPLT(a, b) cmptlt b, a #endif #define STACKSIZE 6 * 8 PROLOGUE PROFCODE .frame $sp, STACKSIZE, $26, 0 #ifdef F_INTERFACE ldl N, 0(N) # n ldl INCX, 0(INCX) # incx #endif lda $sp, -STACKSIZE($sp) mov X, XX .align 4 stt $f2, 0($sp) fclr $f16 cmplt $31, N, $2 unop stt $f3, 8($sp) fclr $f17 cmplt $31, INCX, $3 unop stt $f4, 16($sp) fclr $f18 SXADDQ INCX, $31, INCX unop stt $f5, 24($sp) fclr $f19 and $2, $3, $2 clr $0 stt $f6, 32($sp) fclr $f0 sra N, 3, $1 beq $2, $End # if (n <= 0) or (incx <= 0) return .align 4 LD $f20, 0 * SIZE(X) unop fabs $f20, $f0 ble $1, $L15 .align 4 fabs $f20, $f1 unop addq X, INCX, X unop LD $f21, 0 * SIZE(X) fabs $f20, $f2 addq X, INCX, X unop LD $f22, 0 * SIZE(X) fabs $f20, $f3 addq X, INCX, X unop LD $f23, 0 * SIZE(X) fabs $f20, $f4 addq X, INCX, X unop LD $f24, 0 * SIZE(X) addq X, INCX, X fabs $f20, $f5 unop LD $f25, 0 * SIZE(X) fabs $f20, $f6 addq X, INCX, X unop LD $f26, 0 * SIZE(X) fabs $f20, $f28 addq X, INCX, X lda $1, -1($1) LD $f27, 0 * SIZE(X) unop addq X, INCX, X ble $1, $L13 .align 4 $L12: fcmovne $f16, $f12, $f4 unop fabs $f20, $f29 ldl $31, 56 * SIZE(X) fcmovne $f17, $f13, $f5 LD $f20, 0 * SIZE(X) fabs $f21, $f30 addq X, INCX, X fcmovne $f18, $f14, $f6 LD $f21, 0 * SIZE(X) fabs $f22, $f10 addq X, INCX, X fcmovne $f19, $f15, $f28 LD $f22, 0 * SIZE(X) fabs $f23, $f11 addq X, INCX, X fabs $f24, $f12 LD $f23, 0 * SIZE(X) CMPLT($f0, $f29), $f16 addq X, INCX, X fabs $f25, $f13 LD $f24, 0 * SIZE(X) CMPLT($f1, $f30), $f17 addq X, INCX, X fabs $f26, $f14 LD $f25, 0 * SIZE(X) CMPLT($f2, $f10), $f18 addq X, INCX, X fabs $f27, $f15 LD $f26, 0 * SIZE(X) CMPLT($f3, $f11), $f19 addq X, INCX, X fcmovne $f16, $f29, $f0 LD $f27, 0 * SIZE(X) CMPLT($f4, $f12), $f16 addq X, INCX, X fcmovne $f17, $f30, $f1 unop CMPLT($f5, $f13), $f17 lda $1, -1($1) # i -- fcmovne $f18, $f10, $f2 unop CMPLT($f6, $f14), $f18 unop fcmovne $f19, $f11, $f3 unop CMPLT($f28, $f15), $f19 bgt $1,$L12 .align 4 $L13: fcmovne $f16, $f12, $f4 fabs $f20, $f29 fcmovne $f17, $f13, $f5 fabs $f21, $f30 fcmovne $f18, $f14, $f6 fabs $f22, $f10 fcmovne $f19, $f15, $f28 fabs $f23, $f11 fabs $f24, $f12 CMPLT($f0, $f29), $f16 fabs $f25, $f13 CMPLT($f1, $f30), $f17 fabs 
$f26, $f14 CMPLT($f2, $f10), $f18 fabs $f27, $f15 CMPLT($f3, $f11), $f19 fcmovne $f16, $f29, $f0 CMPLT($f4, $f12), $f16 fcmovne $f17, $f30, $f1 CMPLT($f5, $f13), $f17 fcmovne $f18, $f10, $f2 CMPLT($f6, $f14), $f18 fcmovne $f19, $f11, $f3 CMPLT($f28, $f15), $f19 fcmovne $f16, $f12, $f4 CMPLT($f0, $f1), $f16 fcmovne $f17, $f13, $f5 CMPLT($f2, $f3), $f17 fcmovne $f18, $f14, $f6 CMPLT($f4, $f5), $f18 fcmovne $f19, $f15, $f28 CMPLT($f6, $f28), $f19 fcmovne $f16, $f1, $f0 fcmovne $f17, $f3, $f2 fcmovne $f18, $f5, $f4 fcmovne $f19, $f28, $f6 CMPLT($f0, $f2), $f16 CMPLT($f4, $f6), $f17 fcmovne $f16, $f2, $f0 fcmovne $f17, $f6, $f4 CMPLT($f0, $f4), $f16 fcmovne $f16, $f4, $f0 .align 4 $L15: and N, 7, $1 unop unop ble $1, $L20 .align 4 $L16: LD $f20, 0 * SIZE(X) addq X, INCX, X fabs $f20, $f29 CMPLT($f0, $f29), $f16 fcmovne $f16, $f29, $f0 lda $1, -1($1) # i -- bgt $1, $L16 .align 4 $L20: sra N, 3, $1 ble $1, $L40 .align 4 LD $f10, 0 * SIZE(XX) addq XX, INCX, XX LD $f11, 0 * SIZE(XX) addq XX, INCX, XX LD $f12, 0 * SIZE(XX) addq XX, INCX, XX LD $f13, 0 * SIZE(XX) addq XX, INCX, XX LD $f14, 0 * SIZE(XX) addq XX, INCX, XX LD $f15, 0 * SIZE(XX) addq XX, INCX, XX LD $f16, 0 * SIZE(XX) addq XX, INCX, XX LD $f17, 0 * SIZE(XX) addq XX, INCX, XX fabs $f10, $f18 fabs $f11, $f19 fabs $f12, $f20 fabs $f13, $f21 lda $1, -1($1) ble $1, $L23 .align 4 $L22: LD $f10, 0 * SIZE(XX) fabs $f14, $f22 addq XX, INCX, XX cmpteq $f0, $f18, $f2 LD $f11, 0 * SIZE(XX) fabs $f15, $f23 addq XX, INCX, XX cmpteq $f0, $f19, $f3 LD $f12, 0 * SIZE(XX) fabs $f16, $f24 addq XX, INCX, XX cmpteq $f0, $f20, $f4 LD $f13, 0 * SIZE(XX) fabs $f17, $f25 addq XX, INCX, XX cmpteq $f0, $f21, $f5 LD $f14, 0 * SIZE(XX) lda $1, -1($1) # i -- cmpteq $f0, $f22, $f26 addq XX, INCX, XX lda $0, 1($0) fbne $f2, $End LD $f15, 0 * SIZE(XX) cmpteq $f0, $f23, $f27 lda $0, 1($0) fbne $f3, $End addq XX, INCX, XX cmpteq $f0, $f24, $f28 lda $0, 1($0) fbne $f4, $End LD $f16, 0 * SIZE(XX) cmpteq $f0, $f25, $f29 lda $0, 1($0) fbne $f5, $End addq XX, INCX, XX lda $0, 1($0) fabs $f10, $f18 fbne $f26, $End LD $f17, 0 * SIZE(XX) lda $0, 1($0) fabs $f11, $f19 fbne $f27, $End addq XX, INCX, XX lda $0, 1($0) fabs $f12, $f20 fbne $f28, $End lda $0, 1($0) fabs $f13, $f21 fbne $f29, $End bgt $1, $L22 .align 4 $L23: fabs $f14, $f22 cmpteq $f0, $f18, $f2 fabs $f15, $f23 cmpteq $f0, $f19, $f3 fabs $f16, $f24 cmpteq $f0, $f20, $f4 fabs $f17, $f25 cmpteq $f0, $f21, $f5 cmpteq $f0, $f22, $f26 lda $0, 1($0) unop fbne $f2, $End cmpteq $f0, $f23, $f27 lda $0, 1($0) unop fbne $f3, $End cmpteq $f0, $f24, $f28 lda $0, 1($0) unop fbne $f4, $End cmpteq $f0, $f25, $f29 lda $0, 1($0) unop fbne $f5, $End lda $0, 1($0) fbne $f26, $End lda $0, 1($0) fbne $f27, $End lda $0, 1($0) fbne $f28, $End lda $0, 1($0) fbne $f29, $End .align 4 $L40: LD $f20, 0 * SIZE(XX) addq XX, INCX, XX fabs $f20, $f25 cmpteq $f0, $f25, $f29 lda $0, 1($0) fbne $f29, $End br $31, $L40 .align 4 $End: ldt $f2, 0($sp) ldt $f3, 8($sp) ldt $f4, 16($sp) ldt $f5, 24($sp) ldt $f6, 32($sp) lda $sp, STACKSIZE($sp) ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/imax.S000066400000000000000000000164731313527062700166060ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define N $16 #define X $17 #define INCX $18 #define XX $19 #ifndef USE_MIN #define CMPLT(a, b) cmptlt a, b #else #define CMPLT(a, b) cmptlt b, a #endif #define STACKSIZE 8 * 8 PROLOGUE PROFCODE clr $0 mov X, XX .align 4 cmplt $31, N, $2 cmplt $31, INCX, $3 SXADDQ INCX, $31, INCX and $2, $3, $2 sra N, 3, $1 fclr $f0 unop beq $2, $End # if (n <= 0) or (incx <= 0) return .align 4 LD $f0, 0 * SIZE(X) unop unop ble $1, $L15 .align 4 fmov $f0, $f1 addq X, INCX, X fmov $f0, $f10 lda $1, -1($1) LD $f21, 0 * SIZE(X) fmov $f0, $f11 addq X, INCX, X fmov $f0, $f12 LD $f22, 0 * SIZE(X) fmov $f0, $f13 addq X, INCX, X fmov $f0, $f14 LD $f23, 0 * SIZE(X) fmov $f0, $f15 addq X, INCX, X fmov $f0, $f20 LD $f24, 0 * SIZE(X) addq X, INCX, X LD $f25, 0 * SIZE(X) addq X, INCX, X LD $f26, 0 * SIZE(X) addq X, INCX, X LD $f27, 0 * SIZE(X) addq X, INCX, X CMPLT($f0, $f20), $f16 CMPLT($f1, $f21), $f17 CMPLT($f10, $f22), $f18 CMPLT($f11, $f23), $f19 ble $1, $L13 .align 4 $L12: fcmovne $f16, $f20, $f0 LD $f20, 0 * SIZE(X) CMPLT($f12, $f24), $f16 addq X, INCX, X fcmovne $f17, $f21, $f1 LD $f21, 0 * SIZE(X) CMPLT($f13, $f25), $f17 addq X, INCX, X fcmovne $f18, $f22, $f10 LD $f22, 0 * SIZE(X) CMPLT($f14, $f26), $f18 addq X, INCX, X fcmovne $f19, $f23, $f11 LD $f23, 0 * SIZE(X) CMPLT($f15, $f27), $f19 addq X, INCX, X fcmovne $f16, $f24, $f12 LD $f24, 0 * SIZE(X) CMPLT($f0, $f20), $f16 addq X, INCX, X fcmovne $f17, $f25, $f13 LD $f25, 0 * SIZE(X) CMPLT($f1, $f21), $f17 addq X, INCX, X fcmovne $f18, $f26, $f14 LD $f26, 0 * SIZE(X) CMPLT($f10, $f22), $f18 addq X, INCX, X fcmovne $f19, $f27, $f15 LD $f27, 0 * SIZE(X) CMPLT($f11, $f23), $f19 lda $1, -1($1) # i -- addq X, INCX, X unop unop bgt $1,$L12 .align 4 $L13: fcmovne $f16, $f20, $f0 CMPLT($f12, $f24), $f16 fcmovne $f17, $f21, $f1 CMPLT($f13, $f25), $f17 fcmovne $f18, $f22, $f10 CMPLT($f14, $f26), $f18 fcmovne $f19, $f23, $f11 CMPLT($f15, $f27), $f19 fcmovne $f16, $f24, $f12 
CMPLT($f0, $f1), $f16 fcmovne $f17, $f25, $f13 CMPLT($f10, $f11), $f17 fcmovne $f18, $f26, $f14 CMPLT($f12, $f13), $f18 fcmovne $f19, $f27, $f15 CMPLT($f14, $f15), $f19 fcmovne $f16, $f1, $f0 fcmovne $f17, $f11, $f10 fcmovne $f18, $f13, $f12 fcmovne $f19, $f15, $f14 CMPLT($f0, $f10), $f16 CMPLT($f12, $f14), $f17 fcmovne $f16, $f10, $f0 fcmovne $f17, $f14, $f12 CMPLT($f0, $f12), $f16 fcmovne $f16, $f12, $f0 .align 4 $L15: and N, 7, $1 unop unop ble $1, $L20 .align 4 $L16: LD $f20, 0 * SIZE(X) addq X, INCX, X CMPLT($f0, $f20), $f16 fcmovne $f16, $f20, $f0 lda $1, -1($1) # i -- bgt $1, $L16 .align 4 $L20: sra N, 3, $1 ble $1, $L40 .align 4 LD $f10, 0 * SIZE(XX) addq XX, INCX, XX LD $f11, 0 * SIZE(XX) addq XX, INCX, XX LD $f12, 0 * SIZE(XX) addq XX, INCX, XX LD $f13, 0 * SIZE(XX) addq XX, INCX, XX LD $f14, 0 * SIZE(XX) addq XX, INCX, XX LD $f15, 0 * SIZE(XX) addq XX, INCX, XX LD $f16, 0 * SIZE(XX) addq XX, INCX, XX LD $f17, 0 * SIZE(XX) addq XX, INCX, XX cmpteq $f0, $f10, $f20 cmpteq $f0, $f11, $f21 cmpteq $f0, $f12, $f22 cmpteq $f0, $f13, $f23 lda $1, -1($1) ble $1, $L23 .align 4 $L22: LD $f10, 0 * SIZE(XX) cmpteq $f0, $f14, $f24 lda $0, 1($0) addq XX, INCX, XX fbne $f20, $End LD $f11, 0 * SIZE(XX) cmpteq $f0, $f15, $f25 lda $0, 1($0) addq XX, INCX, XX fbne $f21, $End LD $f12, 0 * SIZE(XX) cmpteq $f0, $f16, $f26 lda $0, 1($0) addq XX, INCX, XX fbne $f22, $End LD $f13, 0 * SIZE(XX) cmpteq $f0, $f17, $f27 lda $0, 1($0) addq XX, INCX, XX fbne $f23, $End LD $f14, 0 * SIZE(XX) cmpteq $f0, $f10, $f20 lda $0, 1($0) addq XX, INCX, XX fbne $f24, $End LD $f15, 0 * SIZE(XX) cmpteq $f0, $f11, $f21 lda $0, 1($0) addq XX, INCX, XX fbne $f25, $End LD $f16, 0 * SIZE(XX) lda $1, -1($1) # i -- cmpteq $f0, $f12, $f22 lda $0, 1($0) addq XX, INCX, XX fbne $f26, $End LD $f17, 0 * SIZE(XX) cmpteq $f0, $f13, $f23 lda $0, 1($0) addq XX, INCX, XX fbne $f27, $End bgt $1, $L22 .align 4 $L23: lda $0, 1($0) cmpteq $f0, $f14, $f24 unop fbne $f20, $End lda $0, 1($0) cmpteq $f0, $f15, $f25 unop fbne $f21, $End lda $0, 1($0) cmpteq $f0, $f16, $f26 unop fbne $f22, $End lda $0, 1($0) cmpteq $f0, $f17, $f27 unop fbne $f23, $End lda $0, 1($0) fbne $f24, $End lda $0, 1($0) fbne $f25, $End lda $0, 1($0) fbne $f26, $End lda $0, 1($0) fbne $f27, $End .align 4 $L40: LD $f20, 0 * SIZE(XX) addq XX, INCX, XX cmpteq $f0, $f20, $f29 lda $0, 1($0) fbne $f29, $End br $31, $L40 .align 4 $End: ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/izamax.S000066400000000000000000000201611313527062700171260ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
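The iamax.S and imax.S kernels above both work in two passes: an unrolled reduction ($L12/$L16) that finds the extreme value, then a rescan ($L22/$L40) that returns its 1-based position; max.S further below is the same reduction without the index pass, and building with USE_MIN flips every CMPLT to select the minimum instead. A minimal C sketch of that behaviour (the ref_* name and signature are illustrative, not part of the OpenBLAS sources):

#include <math.h>

static int ref_iamax(int n, const double *x, int incx)
{
    if (n <= 0 || incx <= 0) return 0;        /* matches the beq $2, $End guard */

    double best = fabs(x[0]);                 /* pass 1: extreme absolute value */
    for (int i = 1; i < n; i++) {             /* (drop fabs() for imax/max)     */
        double v = fabs(x[i * incx]);
        if (best < v) best = v;
    }
    for (int i = 0; i < n; i++)               /* pass 2: first element equal to it */
        if (fabs(x[i * incx]) == best)
            return i + 1;                     /* BLAS indices are 1-based */
    return n;                                 /* not reached */
}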
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define N $16 #define X $17 #define INCX $18 #define XX $19 #ifndef USE_MIN #define CMPLT(a, b) cmptlt a, b #else #define CMPLT(a, b) cmptlt b, a #endif #define STACKSIZE 8 * 8 PROLOGUE PROFCODE lda $sp, -STACKSIZE($sp) stt $f2, 0($sp) fclr $f16 cmplt $31, N, $2 unop stt $f3, 8($sp) fclr $f17 cmplt $31, INCX, $3 unop stt $f4, 16($sp) fclr $f18 SXADDQ INCX, $31, INCX unop stt $f5, 24($sp) fclr $f19 and $2, $3, $2 clr $0 stt $f6, 32($sp) mov X, XX stt $f7, 40($sp) stt $f8, 48($sp) stt $f9, 56($sp) fclr $f0 beq $2, $End # if (n <= 0) or (incx <= 0) return .align 4 LD $f20, 0 * SIZE(X) LD $f21, 1 * SIZE(X) sra N, 2, $1 addq INCX, INCX, INCX fabs $f20, $f20 fabs $f21, $f21 addt $f20, $f21, $f0 ble $1, $L15 .align 4 lda $1, -1($1) unop addq X, INCX, X unop LD $f22, 0 * SIZE(X) fmov $f0, $f1 LD $f23, 1 * SIZE(X) addq X, INCX, X LD $f24, 0 * SIZE(X) fmov $f0, $f2 LD $f25, 1 * SIZE(X) addq X, INCX, X LD $f26, 0 * SIZE(X) fmov $f0, $f3 LD $f27, 1 * SIZE(X) addq X, INCX, X fabs $f20, $f8 fabs $f21, $f9 fabs $f22, $f10 fabs $f23, $f11 fabs $f24, $f12 fabs $f25, $f13 fabs $f26, $f14 fabs $f27, $f15 ble $1, $L14 .align 4 LD $f20, 0 * SIZE(X) LD $f21, 1 * SIZE(X) lda $1, -1($1) addq X, INCX, X LD $f22, 0 * SIZE(X) LD $f23, 1 * SIZE(X) unop addq X, INCX, X LD $f24, 0 * SIZE(X) LD $f25, 1 * SIZE(X) unop addq X, INCX, X LD $f26, 0 * SIZE(X) LD $f27, 1 * SIZE(X) addq X, INCX, X ble $1, $L13 .align 4 $L12: addt $f8, $f9, $f16 unop fabs $f20, $f8 ldl $31, 64 * SIZE(X) addt $f10, $f11, $f17 unop fabs $f21, $f9 LD $f20, 0 * SIZE(X) addt $f12, $f13, $f18 LD $f21, 1 * SIZE(X) fabs $f22, $f10 addq X, INCX, X addt $f14, $f15, $f19 LD $f22, 0 * SIZE(X) fabs $f23, $f11 unop CMPLT($f0, $f16), $f4 LD $f23, 1 * SIZE(X) fabs $f24, $f12 addq X, INCX, X CMPLT($f1, $f17), $f5 LD $f24, 0 * SIZE(X) fabs $f25, $f13 unop CMPLT($f2, $f18), $f6 LD $f25, 1 * SIZE(X) fabs $f26, $f14 addq X, INCX, X CMPLT($f3, $f19), $f7 LD $f26, 0 * SIZE(X) fabs $f27, $f15 unop fcmovne $f4, $f16, $f0 LD $f27, 1 * SIZE(X) addq X, INCX, X lda $1, -1($1) # i -- fcmovne $f5, $f17, $f1 fcmovne $f6, $f18, $f2 fcmovne $f7, $f19, $f3 bgt $1,$L12 .align 4 $L13: addt $f8, $f9, $f16 fabs $f20, $f8 addt $f10, $f11, $f17 fabs $f21, $f9 addt $f12, $f13, $f18 fabs $f22, $f10 addt $f14, $f15, $f19 fabs $f23, $f11 CMPLT($f0, $f16), $f4 fabs $f24, $f12 CMPLT($f1, $f17), $f5 fabs $f25, $f13 CMPLT($f2, $f18), $f6 fabs $f26, $f14 CMPLT($f3, $f19), $f7 fabs $f27, $f15 fcmovne $f4, $f16, $f0 fcmovne $f5, $f17, $f1 fcmovne $f6, $f18, $f2 fcmovne $f7, $f19, $f3 .align 4 $L14: addt $f8, $f9, $f16 addt $f10, $f11, $f17 
addt $f12, $f13, $f18 addt $f14, $f15, $f19 CMPLT($f0, $f16), $f4 CMPLT($f1, $f17), $f5 CMPLT($f2, $f18), $f6 CMPLT($f3, $f19), $f7 fcmovne $f4, $f16, $f0 fcmovne $f5, $f17, $f1 fcmovne $f6, $f18, $f2 fcmovne $f7, $f19, $f3 CMPLT($f0, $f1), $f16 CMPLT($f2, $f3), $f17 fcmovne $f16, $f1, $f0 fcmovne $f17, $f3, $f2 CMPLT($f0, $f2), $f16 fcmovne $f16, $f2, $f0 .align 4 $L15: and N, 3, $1 unop unop ble $1, $L20 .align 4 $L16: LD $f20, 0 * SIZE(X) LD $f21, 1 * SIZE(X) unop addq X, INCX, X fabs $f20, $f29 fabs $f21, $f30 addt $f29, $f30, $f29 CMPLT($f0, $f29), $f16 fcmovne $f16, $f29, $f0 lda $1, -1($1) # i -- bgt $1, $L16 .align 4 $L20: sra N, 2, $1 ble $1, $L40 .align 4 LD $f10, 0 * SIZE(XX) LD $f11, 1 * SIZE(XX) addq XX, INCX, XX LD $f12, 0 * SIZE(XX) LD $f13, 1 * SIZE(XX) addq XX, INCX, XX LD $f14, 0 * SIZE(XX) LD $f15, 1 * SIZE(XX) addq XX, INCX, XX LD $f16, 0 * SIZE(XX) LD $f17, 1 * SIZE(XX) addq XX, INCX, XX fabs $f10, $f18 fabs $f11, $f19 fabs $f12, $f20 fabs $f13, $f21 lda $1, -1($1) ble $1, $L23 .align 4 $L22: LD $f10, 0 * SIZE(XX) fabs $f14, $f22 LD $f11, 1 * SIZE(XX) addq XX, INCX, XX LD $f12, 0 * SIZE(XX) fabs $f15, $f23 LD $f13, 1 * SIZE(XX) addq XX, INCX, XX LD $f14, 0 * SIZE(XX) fabs $f16, $f24 LD $f15, 1 * SIZE(XX) addq XX, INCX, XX LD $f16, 0 * SIZE(XX) fabs $f17, $f25 LD $f17, 1 * SIZE(XX) addq XX, INCX, XX addt $f18, $f19, $f4 addt $f20, $f21, $f5 addt $f22, $f23, $f6 addt $f24, $f25, $f7 cmpteq $f0, $f4, $f26 cmpteq $f0, $f5, $f27 cmpteq $f0, $f6, $f28 cmpteq $f0, $f7, $f29 fabs $f10, $f18 lda $0, 1($0) lda $1, -1($1) # i -- fbne $f26, $End fabs $f11, $f19 lda $0, 1($0) unop fbne $f27, $End fabs $f12, $f20 lda $0, 1($0) unop fbne $f28, $End fabs $f13, $f21 lda $0, 1($0) fbne $f29, $End bgt $1, $L22 .align 4 $L23: fabs $f14, $f22 fabs $f15, $f23 fabs $f16, $f24 fabs $f17, $f25 addt $f18, $f19, $f4 addt $f20, $f21, $f5 addt $f22, $f23, $f6 addt $f24, $f25, $f7 cmpteq $f0, $f4, $f26 cmpteq $f0, $f5, $f27 cmpteq $f0, $f6, $f28 cmpteq $f0, $f7, $f29 lda $0, 1($0) fbne $f26, $End lda $0, 1($0) fbne $f27, $End lda $0, 1($0) fbne $f28, $End lda $0, 1($0) fbne $f29, $End .align 4 $L40: LD $f10, 0 * SIZE(XX) LD $f11, 1 * SIZE(XX) addq XX, INCX, XX fabs $f10, $f18 fabs $f11, $f19 addt $f18, $f19, $f18 cmpteq $f0, $f18, $f2 lda $0, 1($0) fbne $f2, $End br $31, $L40 .align 4 $End: ldt $f2, 0($sp) ldt $f3, 8($sp) ldt $f4, 16($sp) ldt $f5, 24($sp) ldt $f6, 32($sp) ldt $f7, 40($sp) ldt $f8, 48($sp) ldt $f9, 56($sp) lda $sp, STACKSIZE($sp) ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/lsame.S000066400000000000000000000061431313527062700167420ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
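izamax.S above applies the same two-pass scheme to complex data, ranking elements by |Re| + |Im| (the fabs/fabs/addt triples), i.e. the usual BLAS cabs1 shortcut rather than the true modulus. A hedged C sketch with illustrative names:

#include <math.h>

static int ref_izamax(int n, const double *x, int incx)
{
    if (n <= 0 || incx <= 0) return 0;

    double best = fabs(x[0]) + fabs(x[1]);            /* cabs1 of element 0 */
    for (int i = 1; i < n; i++) {
        double v = fabs(x[2 * i * incx]) + fabs(x[2 * i * incx + 1]);
        if (best < v) best = v;
    }
    for (int i = 0; i < n; i++)                       /* rescan for the index */
        if (fabs(x[2 * i * incx]) + fabs(x[2 * i * incx + 1]) == best)
            return i + 1;
    return n;
}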
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include "version.h" .set noat .set noreorder .text .align 5 .globl lsame_ .ent lsame_ lsame_: .frame $sp,0,$26,0 #ifdef PROFILE ldgp $gp, 0($27) lda $28, _mcount jsr $28, ($28), _mcount .prologue 1 #else .prologue 0 #endif ldq_u $5, 0($16) ldq_u $6, 0($17) extbl $5, $16, $5 extbl $6, $17, $6 subl $5, 96, $1 subl $6, 96, $2 subl $5, 32, $3 subl $6, 32, $4 cmovgt $1, $3, $5 cmovgt $2, $4, $6 cmpeq $5, $6, $0 .align 4 $End: ret .end lsame_ .ident VERSION OpenBLAS-0.2.20/kernel/alpha/max.S000066400000000000000000000130001313527062700164140ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
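lsame.S above (symbol lsame_) is the Fortran LSAME helper: a case-insensitive comparison of two single characters passed by reference, with the cmovgt pair folding lowercase ASCII onto uppercase before the compare. Equivalent C, as a sketch only (name illustrative):

static int ref_lsame(const char *ca, const char *cb)
{
    int a = *(const unsigned char *)ca;
    int b = *(const unsigned char *)cb;
    if (a > 96) a -= 32;          /* map 'a'..'z' onto 'A'..'Z', as cmovgt does */
    if (b > 96) b -= 32;
    return a == b;                /* nonzero corresponds to .TRUE. */
}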
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define N $16 #define X $17 #define INCX $18 #ifndef USE_MIN #define CMPLT(a, b) cmptlt a, b #else #define CMPLT(a, b) cmptlt b, a #endif #define STACKSIZE 8 * 8 PROLOGUE PROFCODE .frame $sp, STACKSIZE, $26, 0 #ifdef F_INTERFACE ldl N, 0(N) # n ldl INCX, 0(INCX) # incx #endif lda $sp, -STACKSIZE($sp) nop .align 4 cmplt $31, N, $2 cmplt $31, INCX, $3 SXADDQ INCX, $31, INCX and $2, $3, $0 sra N, 3, $1 fclr $f0 unop beq $0, $End # if (n <= 0) or (incx <= 0) return .align 4 LD $f0, 0 * SIZE(X) unop unop ble $1, $L15 .align 4 fmov $f0, $f1 addq X, INCX, X fmov $f0, $f10 lda $1, -1($1) LD $f21, 0 * SIZE(X) fmov $f0, $f11 addq X, INCX, X fmov $f0, $f12 LD $f22, 0 * SIZE(X) fmov $f0, $f13 addq X, INCX, X fmov $f0, $f14 LD $f23, 0 * SIZE(X) fmov $f0, $f15 addq X, INCX, X fmov $f0, $f20 LD $f24, 0 * SIZE(X) addq X, INCX, X LD $f25, 0 * SIZE(X) addq X, INCX, X LD $f26, 0 * SIZE(X) addq X, INCX, X LD $f27, 0 * SIZE(X) addq X, INCX, X CMPLT($f0, $f20), $f16 CMPLT($f1, $f21), $f17 CMPLT($f10, $f22), $f18 CMPLT($f11, $f23), $f19 ble $1, $L13 .align 4 $L12: fcmovne $f16, $f20, $f0 LD $f20, 0 * SIZE(X) CMPLT($f12, $f24), $f16 addq X, INCX, X fcmovne $f17, $f21, $f1 LD $f21, 0 * SIZE(X) CMPLT($f13, $f25), $f17 addq X, INCX, X fcmovne $f18, $f22, $f10 LD $f22, 0 * SIZE(X) CMPLT($f14, $f26), $f18 addq X, INCX, X fcmovne $f19, $f23, $f11 LD $f23, 0 * SIZE(X) CMPLT($f15, $f27), $f19 addq X, INCX, X fcmovne $f16, $f24, $f12 LD $f24, 0 * SIZE(X) CMPLT($f0, $f20), $f16 addq X, INCX, X fcmovne $f17, $f25, $f13 LD $f25, 0 * SIZE(X) CMPLT($f1, $f21), $f17 addq X, INCX, X fcmovne $f18, $f26, $f14 LD $f26, 0 * SIZE(X) CMPLT($f10, $f22), $f18 addq X, INCX, X fcmovne $f19, $f27, $f15 LD $f27, 0 * SIZE(X) CMPLT($f11, $f23), $f19 lda $1, -1($1) # i -- addq X, INCX, X unop unop bgt $1,$L12 .align 4 $L13: fcmovne $f16, $f20, $f0 CMPLT($f12, $f24), $f16 fcmovne $f17, $f21, $f1 CMPLT($f13, $f25), $f17 fcmovne $f18, $f22, $f10 CMPLT($f14, $f26), $f18 fcmovne $f19, $f23, $f11 CMPLT($f15, $f27), $f19 fcmovne $f16, $f24, $f12 CMPLT($f0, $f1), $f16 fcmovne $f17, $f25, $f13 CMPLT($f10, $f11), $f17 fcmovne $f18, $f26, $f14 CMPLT($f12, $f13), $f18 fcmovne $f19, $f27, $f15 CMPLT($f14, $f15), $f19 fcmovne $f16, $f1, $f0 fcmovne $f17, $f11, $f10 fcmovne $f18, $f13, $f12 fcmovne $f19, $f15, $f14 CMPLT($f0, $f10), $f16 CMPLT($f12, $f14), $f17 fcmovne $f16, $f10, $f0 fcmovne $f17, $f14, $f12 CMPLT($f0, $f12), $f16 fcmovne $f16, $f12, $f0 .align 4 $L15: and N, 7, $1 unop unop ble $1, $End .align 4 $L16: LD $f20, 0 * SIZE(X) addq X, INCX, X CMPLT($f0, $f20), $f16 fcmovne $f16, $f20, $f0 lda $1, -1($1) # i -- bgt $1, $L16 .align 4 $End: lda $sp, STACKSIZE($sp) ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/rot.S000066400000000000000000000244231313527062700164460ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define N $16 #define X $17 #define INCX $18 #define Y $19 #define INCY $20 #define I $21 #define XX $23 #define YY $24 #define C $f10 #define S $f11 #define PREFETCH_SIZE 80 PROLOGUE PROFCODE .frame $sp, 0, $26, 0 #ifndef PROFILE .prologue 0 #else .prologue 1 #endif fmov $f21, C LD S, 0($sp) cmpeq INCX, 1, $23 cmpeq INCY, 1, $24 ble N, $L998 and $23, $24, $23 beq $23, $L50 sra N, 3, I ble I, $L15 LD $f12, 0*SIZE(X) LD $f13, 0*SIZE(Y) LD $f14, 1*SIZE(X) LD $f15, 1*SIZE(Y) LD $f16, 2*SIZE(X) LD $f17, 2*SIZE(Y) LD $f18, 3*SIZE(X) LD $f19, 3*SIZE(Y) MUL C, $f12, $f21 unop MUL S, $f13, $f22 MUL C, $f13, $f23 LD $f13, 4*SIZE(Y) MUL S, $f12, $f24 LD $f12, 4*SIZE(X) MUL C, $f14, $f25 lda I, -1(I) MUL S, $f15, $f26 ADD $f21, $f22, $f22 MUL C, $f15, $f27 LD $f15, 5*SIZE(Y) MUL S, $f14, $f28 SUB $f23, $f24, $f24 ble I, $L13 .align 4 $L12: MUL C, $f16, $f21 lds $f31, (PREFETCH_SIZE) * SIZE(X) unop LD $f14, 5*SIZE(X) ST $f22, 0*SIZE(X) MUL S, $f17, $f22 unop ADD $f25, $f26, $f26 MUL C, $f17, $f23 lds $f31, (PREFETCH_SIZE) * SIZE(Y) unop LD $f17, 6*SIZE(Y) ST $f24, 0*SIZE(Y) MUL S, $f16, $f24 unop SUB $f27, $f28, $f28 MUL C, $f18, $f25 LD $f16, 6*SIZE(X) unop unop ST $f26, 1*SIZE(X) MUL S, $f19, $f26 unop ADD $f21, $f22, $f22 MUL C, $f19, $f27 unop unop LD $f19, 7*SIZE(Y) ST $f28, 1*SIZE(Y) MUL S, $f18, $f28 unop SUB $f23, $f24, $f24 MUL C, $f12, $f21 LD $f18, 7*SIZE(X) unop unop ST $f22, 2*SIZE(X) unop MUL S, $f13, $f22 ADD $f25, $f26, $f26 MUL C, $f13, $f23 LD $f13, 8*SIZE(Y) unop unop ST $f24, 2*SIZE(Y) MUL S, $f12, $f24 unop SUB $f27, $f28, $f28 MUL C, $f14, $f25 LD $f12, 8*SIZE(X) unop unop ST $f26, 3*SIZE(X) MUL S, $f15, $f26 unop ADD $f21, $f22, $f22 MUL C, $f15, $f27 LD $f15, 9*SIZE(Y) unop unop ST $f28, 3*SIZE(Y) MUL S, $f14, $f28 unop SUB $f23, $f24, $f24 MUL C, $f16, $f21 LD $f14, 9*SIZE(X) unop unop ST $f22, 4*SIZE(X) MUL S, $f17, $f22 unop ADD $f25, $f26, $f26 MUL C, $f17, $f23 LD $f17, 10*SIZE(Y) 
unop unop ST $f24, 4*SIZE(Y) MUL S, $f16, $f24 unop SUB $f27, $f28, $f28 MUL C, $f18, $f25 LD $f16, 10*SIZE(X) unop unop ST $f26, 5*SIZE(X) MUL S, $f19, $f26 unop ADD $f21, $f22, $f22 MUL C, $f19, $f27 LD $f19, 11*SIZE(Y) unop unop ST $f28, 5*SIZE(Y) MUL S, $f18, $f28 lda I, -1(I) SUB $f23, $f24, $f24 MUL C, $f12, $f21 LD $f18, 11*SIZE(X) unop unop ST $f22, 6*SIZE(X) MUL S, $f13, $f22 unop ADD $f25, $f26, $f26 MUL C, $f13, $f23 LD $f13, 12*SIZE(Y) lda X, 8*SIZE(X) unop ST $f24, 6*SIZE(Y) MUL S, $f12, $f24 unop SUB $f27, $f28, $f28 MUL C, $f14, $f25 LD $f12, 4*SIZE(X) lda Y, 8*SIZE(Y) unop ST $f26, -1*SIZE(X) MUL S, $f15, $f26 unop ADD $f21, $f22, $f22 MUL C, $f15, $f27 LD $f15, 5*SIZE(Y) unop unop ST $f28, -1*SIZE(Y) MUL S, $f14, $f28 SUB $f23, $f24, $f24 bgt I, $L12 .align 4 $L13: MUL C, $f16, $f21 LD $f14, 5*SIZE(X) unop unop ST $f22, 0*SIZE(X) MUL S, $f17, $f22 unop ADD $f25, $f26, $f26 MUL C, $f17, $f23 unop unop LD $f17, 6*SIZE(Y) ST $f24, 0*SIZE(Y) MUL S, $f16, $f24 LD $f16, 6*SIZE(X) SUB $f27, $f28, $f28 MUL C, $f18, $f25 unop unop unop ST $f26, 1*SIZE(X) MUL S, $f19, $f26 unop ADD $f21, $f22, $f22 MUL C, $f19, $f27 unop unop LD $f19, 7*SIZE(Y) ST $f28, 1*SIZE(Y) MUL S, $f18, $f28 LD $f18, 7*SIZE(X) SUB $f23, $f24, $f24 MUL C, $f12, $f21 unop unop unop ST $f22, 2*SIZE(X) unop MUL S, $f13, $f22 ADD $f25, $f26, $f26 MUL C, $f13, $f23 unop unop unop ST $f24, 2*SIZE(Y) MUL S, $f12, $f24 unop SUB $f27, $f28, $f28 MUL C, $f14, $f25 unop unop unop ST $f26, 3*SIZE(X) MUL S, $f15, $f26 unop ADD $f21, $f22, $f22 MUL C, $f15, $f27 unop unop unop ST $f28, 3*SIZE(Y) MUL S, $f14, $f28 unop SUB $f23, $f24, $f24 MUL C, $f16, $f21 unop unop unop ST $f22, 4*SIZE(X) MUL S, $f17, $f22 unop ADD $f25, $f26, $f26 MUL C, $f17, $f23 unop unop unop ST $f24, 4*SIZE(Y) MUL S, $f16, $f24 unop SUB $f27, $f28, $f28 MUL C, $f18, $f25 unop unop unop ST $f26, 5*SIZE(X) MUL S, $f19, $f26 unop ADD $f21, $f22, $f22 MUL C, $f19, $f27 unop unop unop ST $f28, 5*SIZE(Y) MUL S, $f18, $f28 unop SUB $f23, $f24, $f24 ST $f22, 6*SIZE(X) ADD $f25, $f26, $f26 ST $f24, 6*SIZE(Y) SUB $f27, $f28, $f28 ST $f26, 7*SIZE(X) lda X, 8*SIZE(X) ST $f28, 7*SIZE(Y) lda Y, 8*SIZE(Y) .align 4 $L15: and N, 7, I ble I, $L998 .align 4 $L16: LD $f12, 0*SIZE(X) LD $f13, 0*SIZE(Y) MUL C, $f12, $f21 MUL S, $f13, $f22 MUL C, $f13, $f23 MUL S, $f12, $f24 ADD $f21, $f22, $f25 SUB $f23, $f24, $f26 lda I, -1(I) ST $f25, 0*SIZE(X) lda X, 1 * SIZE(X) ST $f26, 0*SIZE(Y) lda Y, 1 * SIZE(Y) bgt I, $L16 .align 4 $L998: clr $0 ret .align 4 $L50: mov X, XX mov Y, YY sra N, 3, I ble I, $L55 .align 4 $L51: LD $f12, 0*SIZE(X) SXADDQ INCX, X, X LD $f13, 0*SIZE(Y) SXADDQ INCY, Y, Y LD $f14, 0*SIZE(X) SXADDQ INCX, X, X LD $f15, 0*SIZE(Y) SXADDQ INCY, Y, Y LD $f16, 0*SIZE(X) SXADDQ INCX, X, X LD $f17, 0*SIZE(Y) SXADDQ INCY, Y, Y LD $f18, 0*SIZE(X) SXADDQ INCX, X, X LD $f19, 0*SIZE(Y) SXADDQ INCY, Y, Y MUL C, $f12, $f21 MUL S, $f13, $f22 MUL C, $f13, $f23 MUL S, $f12, $f24 ADD $f21, $f22, $f22 SUB $f23, $f24, $f24 ST $f22, 0*SIZE(XX) SXADDQ INCX, XX, XX ST $f24, 0*SIZE(YY) SXADDQ INCY, YY, YY MUL C, $f14, $f25 MUL S, $f15, $f26 MUL C, $f15, $f27 MUL S, $f14, $f28 ADD $f25, $f26, $f26 SUB $f27, $f28, $f28 ST $f26, 0*SIZE(XX) SXADDQ INCX, XX, XX ST $f28, 0*SIZE(YY) SXADDQ INCY, YY, YY MUL C, $f16, $f21 MUL S, $f17, $f22 MUL C, $f17, $f23 MUL S, $f16, $f24 ADD $f21, $f22, $f22 SUB $f23, $f24, $f24 ST $f22, 0*SIZE(XX) SXADDQ INCX, XX, XX ST $f24, 0*SIZE(YY) SXADDQ INCY, YY, YY MUL C, $f18, $f25 MUL S, $f19, $f26 MUL C, $f19, $f27 MUL S, $f18, $f28 ADD $f25, $f26, $f26 SUB 
$f27, $f28, $f28 ST $f26, 0*SIZE(XX) SXADDQ INCX, XX, XX ST $f28, 0*SIZE(YY) SXADDQ INCY, YY, YY LD $f12, 0*SIZE(X) SXADDQ INCX, X, X LD $f13, 0*SIZE(Y) SXADDQ INCY, Y, Y LD $f14, 0*SIZE(X) SXADDQ INCX, X, X LD $f15, 0*SIZE(Y) SXADDQ INCY, Y, Y LD $f16, 0*SIZE(X) SXADDQ INCX, X, X LD $f17, 0*SIZE(Y) SXADDQ INCY, Y, Y LD $f18, 0*SIZE(X) SXADDQ INCX, X, X LD $f19, 0*SIZE(Y) SXADDQ INCY, Y, Y MUL C, $f12, $f21 MUL S, $f13, $f22 MUL C, $f13, $f23 MUL S, $f12, $f24 ADD $f21, $f22, $f22 SUB $f23, $f24, $f24 ST $f22, 0*SIZE(XX) SXADDQ INCX, XX, XX ST $f24, 0*SIZE(YY) SXADDQ INCY, YY, YY MUL C, $f14, $f25 MUL S, $f15, $f26 MUL C, $f15, $f27 MUL S, $f14, $f28 ADD $f25, $f26, $f26 SUB $f27, $f28, $f28 ST $f26, 0*SIZE(XX) SXADDQ INCX, XX, XX ST $f28, 0*SIZE(YY) SXADDQ INCY, YY, YY MUL C, $f16, $f21 MUL S, $f17, $f22 MUL C, $f17, $f23 MUL S, $f16, $f24 ADD $f21, $f22, $f22 SUB $f23, $f24, $f24 ST $f22, 0*SIZE(XX) SXADDQ INCX, XX, XX ST $f24, 0*SIZE(YY) SXADDQ INCY, YY, YY MUL C, $f18, $f25 MUL S, $f19, $f26 MUL C, $f19, $f27 MUL S, $f18, $f28 ADD $f25, $f26, $f26 SUB $f27, $f28, $f28 ST $f26, 0*SIZE(XX) SXADDQ INCX, XX, XX ST $f28, 0*SIZE(YY) SXADDQ INCY, YY, YY lda I, -1(I) bgt I, $L51 .align 4 $L55: and N, 7, I ble I, $L999 .align 4 $L56: LD $f12, 0*SIZE(X) LD $f13, 0*SIZE(Y) MUL C, $f12, $f21 MUL S, $f13, $f22 MUL C, $f13, $f23 MUL S, $f12, $f24 ADD $f21, $f22, $f25 SUB $f23, $f24, $f26 lda I, -1(I) ST $f25, 0*SIZE(X) SXADDQ INCX, X, X ST $f26, 0*SIZE(Y) SXADDQ INCY, Y, Y bgt I, $L56 .align 4 $L999: clr $0 ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/scal.S000066400000000000000000000213241313527062700165610ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
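rot.S above applies the BLAS plane (Givens) rotation to a vector pair; the unit-stride path ($L12) and the strided path ($L51) compute the same update, sketched here with illustrative names:

static void ref_rot(int n, double *x, int incx, double *y, int incy,
                    double c, double s)
{
    for (int i = 0; i < n; i++) {
        double xi = x[i * incx];
        double yi = y[i * incy];
        x[i * incx] = c * xi + s * yi;   /* the MUL/MUL/ADD triples */
        y[i * incy] = c * yi - s * xi;   /* the MUL/MUL/SUB triples */
    }
}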
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define PREFETCHSIZE 88 #define N $16 #define X $20 #define INCX $21 #define XX $18 #define I $19 #define ALPHA $f19 #define s0 $f0 #define s1 $f1 #define s2 $f10 #define s3 $f11 #define a0 $f12 #define a1 $f13 #define a2 $f14 #define a3 $f15 #define a4 $f16 #define a5 $f17 #define a6 $f18 #define a7 $f21 #define t0 $f22 #define t1 $f23 #define t2 $f24 #define t3 $f25 PROLOGUE PROFCODE mov X, XX ble N, $L999 cmpeq INCX, 1, $0 beq $0, $L20 #ifndef DOUBLE sra N, 4, I ble I, $L15 LD a0, 0 * SIZE(X) LD a1, 1 * SIZE(X) LD a2, 2 * SIZE(X) LD a3, 3 * SIZE(X) LD a4, 4 * SIZE(X) MUL a0, ALPHA, t0 LD a5, 5 * SIZE(X) MUL a1, ALPHA, t1 LD a6, 6 * SIZE(X) MUL a2, ALPHA, t2 LD a7, 7 * SIZE(X) MUL a3, ALPHA, t3 ST t0, 0 * SIZE(X) MUL a4, ALPHA, t0 ST t1, 1 * SIZE(X) MUL a5, ALPHA, t1 ST t2, 2 * SIZE(X) MUL a6, ALPHA, t2 ST t3, 3 * SIZE(X) MUL a7, ALPHA, t3 LD a0, 8 * SIZE(X) LD a1, 9 * SIZE(X) LD a2, 10 * SIZE(X) LD a3, 11 * SIZE(X) ST t0, 4 * SIZE(X) MUL a0, ALPHA, t0 ST t1, 5 * SIZE(X) MUL a1, ALPHA, t1 ST t2, 6 * SIZE(X) MUL a2, ALPHA, t2 ST t3, 7 * SIZE(X) MUL a3, ALPHA, t3 LD a4, 12 * SIZE(X) LD a5, 13 * SIZE(X) LD a6, 14 * SIZE(X) LD a7, 15 * SIZE(X) lda I, -1(I) ble I, $L13 .align 4 $L12: ST t0, 8 * SIZE(X) MUL a4, ALPHA, t0 ST t1, 9 * SIZE(X) MUL a5, ALPHA, t1 ST t2, 10 * SIZE(X) MUL a6, ALPHA, t2 ST t3, 11 * SIZE(X) MUL a7, ALPHA, t3 LD a0, 16 * SIZE(X) LD a1, 17 * SIZE(X) LD a2, 18 * SIZE(X) LD a3, 19 * SIZE(X) ST t0, 12 * SIZE(X) MUL a0, ALPHA, t0 ST t1, 13 * SIZE(X) MUL a1, ALPHA, t1 ST t2, 14 * SIZE(X) MUL a2, ALPHA, t2 ST t3, 15 * SIZE(X) MUL a3, ALPHA, t3 LD a4, 20 * SIZE(X) LD a5, 21 * SIZE(X) LD a6, 22 * SIZE(X) LD a7, 23 * SIZE(X) ST t0, 16 * SIZE(X) MUL a4, ALPHA, t0 ST t1, 17 * SIZE(X) MUL a5, ALPHA, t1 ST t2, 18 * SIZE(X) MUL a6, ALPHA, t2 ST t3, 19 * SIZE(X) MUL a7, ALPHA, t3 LD a0, 24 * SIZE(X) LD a1, 25 * SIZE(X) LD a2, 26 * SIZE(X) LD a3, 27 * SIZE(X) ST t0, 20 * SIZE(X) MUL a0, ALPHA, t0 ST t1, 21 * SIZE(X) MUL a1, ALPHA, t1 ST t2, 22 * SIZE(X) MUL a2, ALPHA, t2 ST t3, 23 * SIZE(X) MUL a3, ALPHA, t3 LD a4, 28 * SIZE(X) LD a5, 29 * SIZE(X) LD a6, 30 * SIZE(X) LD a7, 31 * SIZE(X) lds $f31, PREFETCHSIZE * SIZE(X) lda I, -1(I) addq X, 16 * SIZE, X bne I, $L12 .align 4 $L13: ST t0, 8 * SIZE(X) MUL a4, ALPHA, t0 ST t1, 9 * SIZE(X) MUL a5, ALPHA, t1 ST t2, 10 * SIZE(X) MUL a6, ALPHA, t2 ST t3, 11 * SIZE(X) MUL a7, ALPHA, t3 ST t0, 12 * SIZE(X) ST t1, 13 * SIZE(X) ST t2, 14 * SIZE(X) ST t3, 15 * SIZE(X) addq X, 16 * SIZE, X .align 4 $L15: and N, 15, I #else sra N, 3, I ble I, $L15 LD a0, 0 * SIZE(X) LD a1, 1 * SIZE(X) LD a2, 2 * SIZE(X) LD a3, 3 * SIZE(X) LD a4, 4 * SIZE(X) MUL a0, ALPHA, t0 LD a5, 5 * SIZE(X) MUL a1, ALPHA, t1 LD a6, 6 * SIZE(X) MUL a2, ALPHA, t2 LD a7, 7 * SIZE(X) MUL a3, ALPHA, t3 lda I, -1(I) ble I, $L13 .align 4 $L12: ST t0, 0 * SIZE(X) MUL a4, ALPHA, t0 ST t1, 1 * SIZE(X) MUL a5, ALPHA, t1 ST t2, 2 * SIZE(X) MUL a6, ALPHA, t2 ST t3, 3 * SIZE(X) MUL a7, ALPHA, t3 LD a0, 8 * SIZE(X) lda I, -1(I) LD a1, 9 * SIZE(X) addq X, 8 * SIZE, X LD a2, 2 * SIZE(X) LD a3, 3 * SIZE(X) ST t0, -4 * SIZE(X) MUL a0, ALPHA, t0 ST t1, -3 * SIZE(X) MUL a1, ALPHA, t1 ST t2, -2 * SIZE(X) MUL a2, ALPHA, t2 ST t3, -1 * SIZE(X) MUL a3, ALPHA, t3 LD a4, 4 * SIZE(X) LD a5, 5 * SIZE(X) LD a6, 6 * SIZE(X) LD a7, 7 * SIZE(X) lds $f31, PREFETCHSIZE * SIZE(X) bne I, $L12 .align 4 $L13: ST t0, 0 * SIZE(X) MUL a4, ALPHA, t0 ST t1, 1 * SIZE(X) MUL a5, ALPHA, t1 ST t2, 2 
* SIZE(X) MUL a6, ALPHA, t2 ST t3, 3 * SIZE(X) MUL a7, ALPHA, t3 ST t0, 4 * SIZE(X) ST t1, 5 * SIZE(X) ST t2, 6 * SIZE(X) ST t3, 7 * SIZE(X) addq X, 8 * SIZE, X .align 4 $L15: and N, 7, I #endif unop unop ble I, $L999 .align 4 $L17: LD a0, 0 * SIZE(X) MUL a0, ALPHA, t0 ST t0, 0 * SIZE(X) addq X, SIZE, X lda I, -1(I) bne I, $L17 ret .align 4 $L20: sra N, 3, I ble I, $L25 LD a0, 0 * SIZE(X) SXADDQ INCX, X, X LD a1, 0 * SIZE(X) SXADDQ INCX, X, X LD a2, 0 * SIZE(X) SXADDQ INCX, X, X LD a3, 0 * SIZE(X) SXADDQ INCX, X, X LD a4, 0 * SIZE(X) MUL a0, ALPHA, t0 lda I, -1(I) SXADDQ INCX, X, X LD a5, 0 * SIZE(X) MUL a1, ALPHA, t1 SXADDQ INCX, X, X unop LD a6, 0 * SIZE(X) MUL a2, ALPHA, t2 SXADDQ INCX, X, X unop LD a7, 0 * SIZE(X) MUL a3, ALPHA, t3 SXADDQ INCX, X, X ble I, $L23 .align 4 $L22: ST t0, 0 * SIZE(XX) MUL a4, ALPHA, t0 lds $f31, PREFETCHSIZE * SIZE(X) SXADDQ INCX, XX, XX LD a0, 0 * SIZE(X) SXADDQ INCX, X, X lda I, -1(I) unop ST t1, 0 * SIZE(XX) MUL a5, ALPHA, t1 SXADDQ INCX, XX, XX unop LD a1, 0 * SIZE(X) SXADDQ INCX, X, X ST t2, 0 * SIZE(XX) MUL a6, ALPHA, t2 SXADDQ INCX, XX, XX unop LD a2, 0 * SIZE(X) SXADDQ INCX, X, X ST t3, 0 * SIZE(XX) MUL a7, ALPHA, t3 SXADDQ INCX, XX, XX unop LD a3, 0 * SIZE(X) SXADDQ INCX, X, X ST t0, 0 * SIZE(XX) MUL a0, ALPHA, t0 SXADDQ INCX, XX, XX unop LD a4, 0 * SIZE(X) SXADDQ INCX, X, X ST t1, 0 * SIZE(XX) MUL a1, ALPHA, t1 SXADDQ INCX, XX, XX unop LD a5, 0 * SIZE(X) SXADDQ INCX, X, X ST t2, 0 * SIZE(XX) MUL a2, ALPHA, t2 SXADDQ INCX, XX, XX unop LD a6, 0 * SIZE(X) SXADDQ INCX, X, X ST t3, 0 * SIZE(XX) MUL a3, ALPHA, t3 SXADDQ INCX, XX, XX unop LD a7, 0 * SIZE(X) SXADDQ INCX, X, X unop bne I, $L22 .align 4 $L23: ST t0, 0 * SIZE(XX) MUL a4, ALPHA, t0 SXADDQ INCX, XX, XX ST t1, 0 * SIZE(XX) MUL a5, ALPHA, t1 SXADDQ INCX, XX, XX ST t2, 0 * SIZE(XX) MUL a6, ALPHA, t2 SXADDQ INCX, XX, XX ST t3, 0 * SIZE(XX) MUL a7, ALPHA, t3 SXADDQ INCX, XX, XX ST t0, 0 * SIZE(XX) SXADDQ INCX, XX, XX ST t1, 0 * SIZE(XX) SXADDQ INCX, XX, XX ST t2, 0 * SIZE(XX) SXADDQ INCX, XX, XX ST t3, 0 * SIZE(XX) SXADDQ INCX, XX, XX .align 4 $L25: and N, 7, I unop unop ble I, $L999 .align 4 $L27: LD a0, 0 * SIZE(X) MUL a0, ALPHA, t0 ST t0, 0 * SIZE(XX) SXADDQ INCX, X, X SXADDQ INCX, XX, XX lda I, -1(I) bne I, $L27 .align 4 $L999: ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/snrm2.S000066400000000000000000000172331313527062700167040ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
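scal.S above scales a vector in place; the contiguous path (unrolled, with lds $f31 prefetches) and the strided path ($L20) both reduce to the following sketch (name illustrative):

static void ref_scal(int n, double alpha, double *x, int incx)
{
    for (int i = 0; i < n; i++)
        x[i * incx] *= alpha;
}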
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define PREFETCH_SIZE 80 #define N $16 #define X $17 #define INCX $18 #define XX $19 #define I $0 #define a0 $f0 #define a1 $f1 #define a2 $f10 #define a3 $f11 #define t0 $f12 #define t1 $f13 #define t2 $f14 #define t3 $f15 #define x0 $f16 #define x1 $f17 #define x2 $f18 #define x3 $f19 #define x4 $f20 #define x5 $f21 #define x6 $f22 #define x7 $f23 PROLOGUE #if defined(EV4) || defined(EV5) .frame $30,16,$26,0 .mask 0x4000000,-16 ldah $29, 0($27) !gpdisp!1 lda $29, 0($29) !gpdisp!1 lda $sp, -16($sp) ldq $27, sqrt($29) !literal!2 stq $26, 0($sp) PROFCODE .prologue 1 #else PROFCODE #endif fclr a0 SXADDQ INCX, 0, INCX fclr a1 ble N, $L999 fclr a2 cmpeq INCX, SIZE, $0 fclr a3 beq $0, $L20 fclr t0 sra N, 4, I fclr t1 ble I, $L15 fclr t2 LD x0, 0 * SIZE(X) fclr t3 LD x1, 1 * SIZE(X) LD x2, 2 * SIZE(X) LD x3, 3 * SIZE(X) LD x4, 4 * SIZE(X) LD x5, 5 * SIZE(X) LD x6, 6 * SIZE(X) LD x7, 7 * SIZE(X) lda I, -1(I) ble I, $L12 .align 4 $L11: addt a0, t0, a0 ldl $31, (PREFETCH_SIZE) * SIZE(X) mult x0, x0, t0 LD x0, 8 * SIZE(X) addt a1, t1, a1 mov X, XX mult x1, x1, t1 LD x1, 9 * SIZE(X) addt a2, t2, a2 unop mult x2, x2, t2 LD x2, 10 * SIZE(X) addt a3, t3, a3 unop mult x3, x3, t3 LD x3, 11 * SIZE(X) addt a0, t0, a0 unop mult x4, x4, t0 LD x4, 12 * SIZE(X) addt a1, t1, a1 unop mult x5, x5, t1 LD x5, 13 * SIZE(X) addt a2, t2, a2 unop mult x6, x6, t2 LD x6, 14 * SIZE(X) addt a3, t3, a3 unop mult x7, x7, t3 LD x7, 15 * SIZE(X) addt a0, t0, a0 unop mult x0, x0, t0 LD x0, 16 * SIZE(X) addt a1, t1, a1 lda X, 16 * SIZE(X) mult x1, x1, t1 LD x1, 17 * SIZE(XX) addt a2, t2, a2 unop mult x2, x2, t2 LD x2, 18 * SIZE(XX) addt a3, t3, a3 unop mult x3, x3, t3 LD x3, 19 * SIZE(XX) addt a0, t0, a0 unop mult x4, x4, t0 LD x4, 20 * SIZE(XX) addt a1, t1, a1 lda I, -1(I) mult x5, x5, t1 LD x5, 21 * SIZE(XX) addt a2, t2, a2 unop mult x6, x6, t2 LD x6, 22 * SIZE(XX) addt a3, t3, a3 mult x7, x7, t3 LD x7, 23 * SIZE(XX) bgt I, $L11 .align 4 $L12: addt a0, t0, a0 mov X, XX mult x0, x0, t0 LD x0, 8 * SIZE(X) addt a1, t1, a1 unop mult x1, x1, t1 LD x1, 9 * SIZE(X) addt a2, t2, a2 unop mult x2, x2, t2 LD x2, 10 * SIZE(X) addt a3, t3, a3 unop mult x3, x3, t3 LD x3, 11 * SIZE(X) addt a0, t0, a0 unop mult x4, x4, t0 LD x4, 12 * SIZE(XX) addt a1, t1, a1 unop mult x5, x5, t1 LD x5, 13 * SIZE(XX) addt a2, t2, a2 unop mult x6, x6, t2 LD x6, 14 * SIZE(XX) addt a3, t3, a3 lda X, 16 * SIZE(X) mult x7, x7, t3 LD x7, 15 * SIZE(XX) addt a0, t0, a0 mult x0, x0, t0 addt a1, t1, a1 mult x1, x1, t1 addt a2, t2, a2 mult x2, x2, t2 addt a3, t3, a3 mult x3, x3, t3 addt a0, t0, a0 mult x4, x4, t0 addt 
a1, t1, a1 mult x5, x5, t1 addt a2, t2, a2 mult x6, x6, t2 addt a3, t3, a3 mult x7, x7, t3 addt a1, t1, a1 addt a2, t2, a2 addt a3, t3, a3 .align 4 $L15: and N, 15, I ble I, $L998 .align 4 $L16: LD x0, 0 * SIZE(X) lda X, 1 * SIZE(X) addt a0, t0, a0 mult x0, x0, t0 lda I, -1(I) bgt I, $L16 bsr $31, $L998 .align 4 $L20: fclr t0 sra N, 3, I fclr t1 ble I, $L25 fclr t2 fclr t3 LD x0, 0 * SIZE(X) addq X, INCX, X LD x1, 0 * SIZE(X) addq X, INCX, X LD x2, 0 * SIZE(X) addq X, INCX, X LD x3, 0 * SIZE(X) addq X, INCX, X LD x4, 0 * SIZE(X) addq X, INCX, X LD x5, 0 * SIZE(X) addq X, INCX, X LD x6, 0 * SIZE(X) addq X, INCX, X lda I, -1(I) ble I, $L22 .align 4 $L21: addt a0, t0, a0 LD x7, 0 * SIZE(X) mult x0, x0, t0 addq X, INCX, X addt a1, t1, a1 LD x0, 0 * SIZE(X) mult x1, x1, t1 addq X, INCX, X addt a2, t2, a2 LD x1, 0 * SIZE(X) mult x2, x2, t2 addq X, INCX, X addt a3, t3, a3 LD x2, 0 * SIZE(X) mult x3, x3, t3 addq X, INCX, X addt a0, t0, a0 LD x3, 0 * SIZE(X) mult x4, x4, t0 addq X, INCX, X addt a1, t1, a1 LD x4, 0 * SIZE(X) mult x5, x5, t1 addq X, INCX, X addt a2, t2, a2 LD x5, 0 * SIZE(X) mult x6, x6, t2 addq X, INCX, X addt a3, t3, a3 LD x6, 0 * SIZE(X) mult x7, x7, t3 addq X, INCX, X lda I, -1(I) bgt I, $L21 .align 4 $L22: addt a0, t0, a0 LD x7, 0 * SIZE(X) mult x0, x0, t0 addq X, INCX, X addt a1, t1, a1 unop mult x1, x1, t1 unop addt a2, t2, a2 mult x2, x2, t2 addt a3, t3, a3 mult x3, x3, t3 addt a0, t0, a0 mult x4, x4, t0 addt a1, t1, a1 mult x5, x5, t1 addt a2, t2, a2 mult x6, x6, t2 addt a3, t3, a3 mult x7, x7, t3 addt a1, t1, a1 addt a2, t2, a2 addt a3, t3, a3 .align 4 $L25: and N, 7, I ble I, $L998 .align 4 $L26: LD x0, 0 * SIZE(X) addq X, INCX, X addt a0, t0, a0 mult x0, x0, t0 lda I, -1(I) bgt I, $L26 .align 4 $L998: addt a0, t0, a0 addt a0, a1, a0 addt a2, a3, a2 #if defined(EV4) || defined(EV5) addt a0, a2, $f16 jsr $26, ($27), sqrt !lituse_jsr!2 ldah $29, 0($26) !gpdisp!3 lda $29, 0($29) !gpdisp!3 #else addt a0, a2, a0 sqrtt a0, a0 #endif .align 4 $L999: #if defined(EV4) || defined(EV5) ldq $26, 0($sp) lda $sp, 16($sp) #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/staticbuffer.S000066400000000000000000000053531313527062700203240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
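snrm2.S above computes the Euclidean norm by accumulating the sum of squares into four partial accumulators (a0..a3, using the double-width mult/addt/sqrtt forms) and finishing with one square root. A plain single-accumulator sketch of the same computation, names illustrative:

#include <math.h>

static float ref_snrm2(int n, const float *x, int incx)
{
    double acc = 0.0;                 /* no overflow/underflow scaling, as above */
    for (int i = 0; i < n; i++) {
        double v = x[i * incx];
        acc += v * v;
    }
    return (float)sqrt(acc);
}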
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef ALLOC_STATIC .align 8 .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 16384 #endif OpenBLAS-0.2.20/kernel/alpha/swap.S000066400000000000000000000140631313527062700166130ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" PROLOGUE PROFCODE .frame $sp, 0, $26, 0 mov $20, $17 mov $21, $18 ldq $19, 0($sp) ldl $20, 8($sp) #ifndef PROFILE .prologue 0 #else .prologue 1 #endif subl $18, 1, $1 subl $20, 1, $2 ble $16, $SubEnd # if n <= 0 goto $End or $1, $2, $1 sra $16, 3, $21 and $16, 7, $22 bne $1, $Sub ble $21, $MainRemain .align 4 $MainLoop: LD $f10, 0*SIZE($19) LD $f11, 1*SIZE($19) LD $f12, 2*SIZE($19) LD $f13, 3*SIZE($19) LD $f14, 4*SIZE($19) LD $f15, 5*SIZE($19) LD $f16, 6*SIZE($19) LD $f17, 7*SIZE($19) LD $f20, 0*SIZE($17) LD $f21, 1*SIZE($17) LD $f22, 2*SIZE($17) LD $f23, 3*SIZE($17) LD $f24, 4*SIZE($17) LD $f25, 5*SIZE($17) LD $f26, 6*SIZE($17) LD $f27, 7*SIZE($17) lds $f31, 32*SIZE($17) unop lds $f31, 32*SIZE($19) subl $21, 1, $21 ST $f10, 0*SIZE($17) ST $f11, 1*SIZE($17) ST $f12, 2*SIZE($17) ST $f13, 3*SIZE($17) ST $f14, 4*SIZE($17) ST $f15, 5*SIZE($17) ST $f16, 6*SIZE($17) ST $f17, 7*SIZE($17) ST $f20, 0*SIZE($19) ST $f21, 1*SIZE($19) ST $f22, 2*SIZE($19) ST $f23, 3*SIZE($19) ST $f24, 4*SIZE($19) ST $f25, 5*SIZE($19) ST $f26, 6*SIZE($19) ST $f27, 7*SIZE($19) lda $17, 8*SIZE($17) lda $19, 8*SIZE($19) bgt $21, $MainLoop .align 4 $MainRemain: ble $22, $MainEnd .align 4 $MainRemainLoop: LD $f10, 0*SIZE($19) LD $f20, 0*SIZE($17) lda $17, 1*SIZE($17) lda $19, 1*SIZE($19) subl $22, 1, $22 ST $f10, -1*SIZE($17) ST $f20, -1*SIZE($19) bgt $22, $MainRemainLoop .align 4 $MainEnd: clr $0 ret .align 4 $Sub: mov $17, $23 mov $19, $24 ble $21, $SubRemain .align 4 $SubLoop: LD $f10, 0*SIZE($19) SXADDQ $20, $19, $19 LD $f11, 0*SIZE($19) SXADDQ $20, $19, $19 LD $f12, 0*SIZE($19) SXADDQ $20, $19, $19 LD $f13, 0*SIZE($19) SXADDQ $20, $19, $19 LD $f14, 0*SIZE($19) SXADDQ $20, $19, $19 LD $f15, 0*SIZE($19) SXADDQ $20, $19, $19 LD $f16, 0*SIZE($19) SXADDQ $20, $19, $19 LD $f17, 0*SIZE($19) SXADDQ $20, $19, $19 LD $f20, 0*SIZE($17) SXADDQ $18, $17, $17 LD $f21, 0*SIZE($17) SXADDQ $18, $17, $17 LD $f22, 0*SIZE($17) SXADDQ $18, $17, $17 LD $f23, 0*SIZE($17) SXADDQ $18, $17, $17 LD $f24, 0*SIZE($17) SXADDQ $18, $17, $17 LD $f25, 0*SIZE($17) SXADDQ $18, $17, $17 LD $f26, 0*SIZE($17) SXADDQ $18, $17, $17 LD $f27, 0*SIZE($17) SXADDQ $18, $17, $17 ST $f10, 0*SIZE($23) SXADDQ $18, $23, $23 ST $f11, 0*SIZE($23) SXADDQ $18, $23, $23 ST $f12, 0*SIZE($23) SXADDQ $18, $23, $23 ST $f13, 0*SIZE($23) SXADDQ $18, $23, $23 ST $f14, 0*SIZE($23) SXADDQ $18, $23, $23 ST $f15, 0*SIZE($23) SXADDQ $18, $23, $23 ST $f16, 0*SIZE($23) SXADDQ $18, $23, $23 ST $f17, 0*SIZE($23) SXADDQ $18, $23, $23 ST $f20, 0*SIZE($24) SXADDQ $20, $24, $24 ST $f21, 0*SIZE($24) SXADDQ $20, $24, $24 ST $f22, 0*SIZE($24) SXADDQ $20, $24, $24 ST $f23, 0*SIZE($24) SXADDQ $20, $24, $24 ST $f24, 0*SIZE($24) SXADDQ $20, $24, $24 ST $f25, 0*SIZE($24) SXADDQ $20, $24, $24 ST $f26, 0*SIZE($24) SXADDQ $20, $24, $24 ST $f27, 0*SIZE($24) SXADDQ $20, $24, $24 subl $21, 1, $21 bgt $21, $SubLoop .align 4 $SubRemain: ble $22, $SubEnd .align 4 $SubRemainLoop: LD $f10, 0*SIZE($19) LD $f20, 0*SIZE($17) subl $22, 1, $22 ST $f10, 0*SIZE($17) ST $f20, 0*SIZE($19) SXADDQ $18, $17, $17 SXADDQ $20, $19, $19 bgt $22, $SubRemainLoop .align 4 $SubEnd: clr $0 ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/trsm_kernel_4x4_LN.S000066400000000000000000001631071313527062700212620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
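swap.S above exchanges two vectors: the unit-stride path ($MainLoop) is 8-way unrolled with lds $f31 prefetch hints, and the strided path ($SubLoop) performs the same exchange element by element, as in this sketch (name illustrative):

static void ref_swap(int n, double *x, int incx, double *y, int incy)
{
    for (int i = 0; i < n; i++) {
        double t = x[i * incx];
        x[i * incx] = y[i * incy];
        y[i * incy] = t;
    }
}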
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #if !defined(EV4) && !defined(EV5) && !defined(EV6) #error "Architecture is not specified." 
#endif #ifdef EV6 #define PREFETCHSIZE 56 #define UNOP unop #endif #ifdef EV5 #define PREFETCHSIZE 56 #define UNOP #endif #ifdef EV4 #define UNOP #endif #define STACKSIZE 80 #define M $16 #define N $17 #define K $18 #define A $20 #define B $21 #define C $22 #define LDC $23 #define C1 $19 #define C2 $24 #define C3 $25 #define C4 $27 #define AO $at #define BO $5 #define I $6 #define J $7 #define L $8 #define a1 $f16 #define a2 $f17 #define a3 $f18 #define a4 $f19 #define b1 $f20 #define b2 $f21 #define b3 $f22 #define b4 $f23 #define t1 $f24 #define t2 $f25 #define t3 $f26 #define t4 $f27 #define a5 $f28 #define a6 $f30 #define b5 $f29 #define alpha $f30 #define c01 $f0 #define c02 $f1 #define c03 $f2 #define c04 $f3 #define c05 $f4 #define c06 $f5 #define c07 $f6 #define c08 $f7 #define c09 $f8 #define c10 $f9 #define c11 $f10 #define c12 $f11 #define c13 $f12 #define c14 $f13 #define c15 $f14 #define c16 $f15 #define TMP1 $0 #define TMP2 $1 #define KK $2 #define AORIG $3 #define OFFSET $4 PROLOGUE PROFCODE .frame $sp, STACKSIZE, $26, 0 lda $sp, -STACKSIZE($sp) ldq C, 0 + STACKSIZE($sp) ldq LDC, 8 + STACKSIZE($sp) ldq OFFSET, 16 + STACKSIZE($sp) SXADDQ LDC, 0, LDC stt $f2, 0($sp) stt $f3, 8($sp) stt $f4, 16($sp) stt $f5, 24($sp) stt $f6, 32($sp) stt $f7, 40($sp) stt $f8, 48($sp) stt $f9, 56($sp) cmple M, 0, $0 cmple N, 0, $1 cmple K, 0, $2 or $0, $1, $0 or $0, $2, $0 bne $0, $L999 #ifdef LN mulq M, K, TMP1 SXADDQ TMP1, A, A SXADDQ M, C, C #endif #ifdef RN negq OFFSET, KK #endif #ifdef RT mulq N, K, TMP1 SXADDQ TMP1, B, B mulq N, LDC, TMP1 addq TMP1, C, C subq N, OFFSET, KK #endif sra N, 2, J ble J, $L40 .align 4 $L01: #ifdef RT sll K, 2 + BASE_SHIFT, TMP1 subq B, TMP1, B s4addq LDC, 0, TMP1 subq C, TMP1, C #endif mov C, C1 addq C, LDC, C2 addq C2, LDC, C3 #ifndef RT s4addq LDC, C, C #endif fclr t1 addq C3, LDC, C4 fclr t2 #ifdef LN addq M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif fclr t3 fclr t4 and M, 1, I ble I, $L20 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD b1, 0 * SIZE(B) lda L, -2(KK) LD b2, 1 * SIZE(B) lda AO, 1 * SIZE(AO) LD b3, 2 * SIZE(B) fclr c09 LD b4, 3 * SIZE(B) fclr c13 lda BO, 4 * SIZE(B) ble KK, $L38 ble L, $L35 #else #ifdef LN sll K, BASE_SHIFT + 0, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD b1, 0 * SIZE(BO) lda L, -2(TMP1) LD b2, 1 * SIZE(BO) lda AO, 1 * SIZE(AO) LD b3, 2 * SIZE(BO) fclr c09 LD b4, 3 * SIZE(BO) fclr c13 lda BO, 4 * SIZE(BO) ble TMP1, $L38 ble L, $L35 #endif .align 4 $L32: ADD c01, t1, c01 lda L, -2(L) MUL a1, b1, t1 LD b1, 0 * SIZE(BO) ADD c05, t2, c05 lda AO, 2 * SIZE(AO) MUL a1, b2, t2 LD b2, 1 * SIZE(BO) ADD c09, t3, c09 LD b5, 3 * SIZE(BO) MUL a1, b3, t3 LD b3, 2 * SIZE(BO) ADD c13, t4, c13 MUL a1, b4, t4 LD a1, -1 * SIZE(AO) ADD c01, t1, c01 MUL a2, b1, t1 LD b1, 4 * SIZE(BO) lda BO, 8 * SIZE(BO) ADD c05, t2, c05 MUL a2, b2, t2 LD b2, -3 * SIZE(BO) ADD c09, t3, c09 LD b4, -1 * SIZE(BO) MUL a2, b3, t3 LD b3, -2 * SIZE(BO) ADD c13, t4, c13 MUL a2, b5, t4 LD a2, 0 * SIZE(AO) bgt L, $L32 .align 4 $L35: ADD c01, t1, c01 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L37 #else blbs TMP1, $L37 #endif .align 4 ADD c05, t2, c05 LD b1, 0 * SIZE(BO) MUL a1, b2, t2 LD b2, 1 * SIZE(BO) ADD c09, t3, c09 MUL a1, b3, t3 LD b3, 2 * SIZE(BO) ADD c13, t4, c13 MUL a1, b4, t4 
LD a1, 0 * SIZE(AO) lda AO, 1 * SIZE(AO) ADD c01, t1, c01 LD b4, 3 * SIZE(BO) MUL a1, b1, t1 lda BO, 4 * SIZE(BO) .align 4 $L37: ADD c05, t2, c05 MUL a1, b2, t2 ADD c09, t3, c09 MUL a1, b3, t3 ADD c13, t4, c13 lda AO, 1 * SIZE(AO) MUL a1, b4, t4 lda BO, 4 * SIZE(BO) ADD c01, t1, c01 ADD c05, t2, c05 ADD c09, t3, c09 ADD c13, t4, c13 $L38: #if defined(LN) || defined(RT) #ifdef LN subq KK, 1, TMP1 #else subq KK, 4, TMP1 #endif sll TMP1, BASE_SHIFT + 0, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO #else lda AO, -1 * SIZE(AO) lda BO, -4 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 SUB a4, c13, c13 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 SUB a4, c13, c13 #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 MUL a1, c09, c09 MUL a1, c13, c13 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) MUL a1, c01, c01 MUL a2, c01, t1 SUB c05, t1, c05 MUL a3, c01, t1 SUB c09, t1, c09 MUL a4, c01, t1 SUB c13, t1, c13 LD b1, 5 * SIZE(BO) LD b2, 6 * SIZE(BO) LD b3, 7 * SIZE(BO) MUL b1, c05, c05 MUL b2, c05, t1 SUB c09, t1, c09 MUL b3, c05, t1 SUB c13, t1, c13 LD a1, 10 * SIZE(BO) LD a2, 11 * SIZE(BO) LD a3, 15 * SIZE(BO) MUL a1, c09, c09 MUL a2, c09, t1 SUB c13, t1, c13 MUL a3, c13, c13 #endif #ifdef RT LD a1, 15 * SIZE(BO) LD a2, 14 * SIZE(BO) LD a3, 13 * SIZE(BO) LD a4, 12 * SIZE(BO) MUL a1, c13, c13 MUL a2, c13, t1 SUB c09, t1, c09 MUL a3, c13, t1 SUB c05, t1, c05 MUL a4, c13, t1 SUB c01, t1, c01 LD b1, 10 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 8 * SIZE(BO) MUL b1, c09, c09 MUL b2, c09, t1 SUB c05, t1, c05 MUL b3, c09, t1 SUB c01, t1, c01 LD a1, 5 * SIZE(BO) LD a2, 4 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a2, c05, t1 SUB c01, t1, c01 MUL a3, c01, c01 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) ST c09, 2 * SIZE(BO) ST c13, 3 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c05, 1 * SIZE(AO) ST c09, 2 * SIZE(AO) ST c13, 3 * SIZE(AO) #endif #ifdef LN lda C1, -1 * SIZE(C1) lda C2, -1 * SIZE(C2) lda C3, -1 * SIZE(C3) lda C4, -1 * SIZE(C4) #endif ST c01, 0 * SIZE(C1) ST c05, 0 * SIZE(C2) ST c09, 0 * SIZE(C3) ST c13, 0 * SIZE(C4) #ifdef RT sll K, 0 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 0, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 1, KK #endif #ifdef LN subq KK, 1, KK #endif .align 4 $L20: and M, 2, I ble I, $L30 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c10 LD a4, 3 * SIZE(AO) fclr c14 LD b1, 0 * SIZE(B) lda L, -2(KK) LD b2, 1 * SIZE(B) lda AO, 2 * SIZE(AO) LD b3, 2 * SIZE(B) fclr c01 LD b4, 3 * SIZE(B) fclr c05 lda BO, 4 * SIZE(B) fclr c02 fclr c06 ble KK, $L28 ble L, $L25 #else #ifdef LN sll K, BASE_SHIFT + 1, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c10 LD a4, 3 * SIZE(AO) fclr c14 LD b1, 0 * SIZE(BO) lda L, -2(TMP1) LD b2, 1 * SIZE(BO) lda AO, 2 * SIZE(AO) LD b3, 2 * SIZE(BO) fclr c01 LD b4, 3 * SIZE(BO) fclr c05 lda BO, 4 * SIZE(BO) 
fclr c02 fclr c06 ble TMP1, $L28 ble L, $L25 #endif .align 4 $L22: ADD c09, t1, c09 unop MUL a1, b1, t1 unop ADD c10, t2, c10 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD c13, t3, c13 unop MUL a1, b2, t3 lda BO, 8 * SIZE(BO) ADD c14, t4, c14 unop MUL a2, b2, t4 LD b2, -7 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b3, t1 unop ADD c02, t2, c02 unop MUL a2, b3, t2 LD b3, -6 * SIZE(BO) ADD c05, t3, c05 unop MUL a1, b4, t3 LD a1, 2 * SIZE(AO) ADD c06, t4, c06 MUL a2, b4, t4 LD b5, -5 * SIZE(BO) ADD c09, t1, c09 unop MUL a3, b1, t1 LD a2, 3 * SIZE(AO) ADD c10, t2, c10 unop MUL a4, b1, t2 LD b1, -4 * SIZE(BO) ADD c13, t3, c13 unop MUL a3, b2, t3 lda AO, 4 * SIZE(AO) ADD c14, t4, c14 MUL a4, b2, t4 LD b2, -3 * SIZE(BO) ADD c01, t1, c01 lda L, -2(L) MUL a3, b3, t1 LD b4, -1 * SIZE(BO) ADD c02, t2, c02 unop MUL a4, b3, t2 LD b3, -2 * SIZE(BO) ADD c05, t3, c05 unop MUL a3, b5, t3 LD a3, 0 * SIZE(AO) ADD c06, t4, c06 MUL a4, b5, t4 LD a4, 1 * SIZE(AO) bgt L, $L22 .align 4 $L25: ADD c09, t1, c09 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L27 #else blbs TMP1, $L27 #endif ADD c10, t2, c10 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD c13, t3, c13 unop MUL a1, b2, t3 unop ADD c14, t4, c14 unop MUL a2, b2, t4 LD b2, 1 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b3, t1 lda AO, 2 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b3, t2 LD b3, 2 * SIZE(BO) ADD c05, t3, c05 unop MUL a1, b4, t3 LD a1, -2 * SIZE(AO) ADD c06, t4, c06 unop MUL a2, b4, t4 LD a2, -1 * SIZE(AO) ADD c09, t1, c09 LD b4, 3 * SIZE(BO) MUL a1, b1, t1 lda BO, 4 * SIZE(BO) .align 4 $L27: ADD c10, t2, c10 MUL a2, b1, t2 ADD c13, t3, c13 MUL a1, b2, t3 ADD c14, t4, c14 MUL a2, b2, t4 ADD c01, t1, c01 MUL a1, b3, t1 ADD c02, t2, c02 MUL a2, b3, t2 ADD c05, t3, c05 MUL a1, b4, t3 ADD c06, t4, c06 lda AO, 2 * SIZE(AO) MUL a2, b4, t4 lda BO, 4 * SIZE(BO) ADD c09, t1, c09 ADD c10, t2, c10 ADD c13, t3, c13 ADD c14, t4, c14 .align 4 $L28: #if defined(LN) || defined(RT) #ifdef LN subq KK, 2, TMP1 #else subq KK, 4, TMP1 #endif sll TMP1, BASE_SHIFT + 1, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO #else lda AO, -2 * SIZE(AO) lda BO, -4 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 SUB a4, c13, c13 SUB b1, c02, c02 SUB b2, c06, c06 SUB b3, c10, c10 SUB b4, c14, c14 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) LD b4, 7 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c05, c05 SUB a4, c06, c06 SUB b1, c09, c09 SUB b2, c10, c10 SUB b3, c13, c13 SUB b4, c14, c14 #endif #ifdef LN LD a1, 3 * SIZE(AO) LD a2, 2 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a1, c06, c06 MUL a1, c10, c10 MUL a1, c14, c14 MUL a2, c02, t1 MUL a2, c06, t2 MUL a2, c10, t3 MUL a2, c14, t4 SUB c01, t1, c01 SUB c05, t2, c05 SUB c09, t3, c09 SUB c13, t4, c13 MUL a3, c01, c01 MUL a3, c05, c05 MUL a3, c09, c09 MUL a3, c13, c13 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 3 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 MUL a1, c09, c09 MUL a1, c13, c13 MUL a2, c01, t1 MUL a2, c05, t2 MUL a2, c09, t3 MUL a2, c13, t4 SUB c02, t1, c02 SUB c06, t2, c06 SUB c10, t3, c10 SUB c14, t4, c14 MUL a3, c02, c02 MUL a3, c06, c06 MUL a3, c10, c10 MUL a3, c14, c14 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD 
a4, 3 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 MUL a2, c01, t1 MUL a2, c02, t2 SUB c05, t1, c05 SUB c06, t2, c06 MUL a3, c01, t1 MUL a3, c02, t2 SUB c09, t1, c09 SUB c10, t2, c10 MUL a4, c01, t1 MUL a4, c02, t2 SUB c13, t1, c13 SUB c14, t2, c14 LD b1, 5 * SIZE(BO) LD b2, 6 * SIZE(BO) LD b3, 7 * SIZE(BO) MUL b1, c05, c05 MUL b1, c06, c06 MUL b2, c05, t1 MUL b2, c06, t2 SUB c09, t1, c09 SUB c10, t2, c10 MUL b3, c05, t1 MUL b3, c06, t2 SUB c13, t1, c13 SUB c14, t2, c14 LD a1, 10 * SIZE(BO) LD a2, 11 * SIZE(BO) LD a3, 15 * SIZE(BO) MUL a1, c09, c09 MUL a1, c10, c10 MUL a2, c09, t1 MUL a2, c10, t2 SUB c13, t1, c13 SUB c14, t2, c14 MUL a3, c13, c13 MUL a3, c14, c14 #endif #ifdef RT LD a1, 15 * SIZE(BO) LD a2, 14 * SIZE(BO) LD a3, 13 * SIZE(BO) LD a4, 12 * SIZE(BO) MUL a1, c13, c13 MUL a1, c14, c14 MUL a2, c13, t1 MUL a2, c14, t2 SUB c09, t1, c09 SUB c10, t2, c10 MUL a3, c13, t1 MUL a3, c14, t2 SUB c05, t1, c05 SUB c06, t2, c06 MUL a4, c13, t1 MUL a4, c14, t2 SUB c01, t1, c01 SUB c02, t2, c02 LD b1, 10 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 8 * SIZE(BO) MUL b1, c09, c09 MUL b1, c10, c10 MUL b2, c09, t1 MUL b2, c10, t2 SUB c05, t1, c05 SUB c06, t2, c06 MUL b3, c09, t1 MUL b3, c10, t2 SUB c01, t1, c01 SUB c02, t2, c02 LD a1, 5 * SIZE(BO) LD a2, 4 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a1, c06, c06 MUL a2, c05, t1 MUL a2, c06, t2 SUB c01, t1, c01 SUB c02, t2, c02 MUL a3, c01, c01 MUL a3, c02, c02 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) ST c09, 2 * SIZE(BO) ST c13, 3 * SIZE(BO) ST c02, 4 * SIZE(BO) ST c06, 5 * SIZE(BO) ST c10, 6 * SIZE(BO) ST c14, 7 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c05, 2 * SIZE(AO) ST c06, 3 * SIZE(AO) ST c09, 4 * SIZE(AO) ST c10, 5 * SIZE(AO) ST c13, 6 * SIZE(AO) ST c14, 7 * SIZE(AO) #endif #ifdef LN lda C1, -2 * SIZE(C1) lda C2, -2 * SIZE(C2) lda C3, -2 * SIZE(C3) lda C4, -2 * SIZE(C4) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c05, 0 * SIZE(C2) ST c06, 1 * SIZE(C2) ST c09, 0 * SIZE(C3) ST c10, 1 * SIZE(C3) ST c13, 0 * SIZE(C4) ST c14, 1 * SIZE(C4) #ifndef LN lda C1, 2 * SIZE(C1) lda C2, 2 * SIZE(C2) lda C3, 2 * SIZE(C3) lda C4, 2 * SIZE(C4) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 1 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 1, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 2, KK #endif #ifdef LN subq KK, 2, KK #endif .align 4 $L30: sra M, 2, I ble I, $L39 .align 4 $L11: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c11 LD a2, 1 * SIZE(AO) fclr c12 LD a3, 2 * SIZE(AO) fclr c16 LD a4, 3 * SIZE(AO) fclr c15 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c02 LD b3, 2 * SIZE(B) fclr c06 LD b4, 3 * SIZE(B) fclr c05 lds $f31, 4 * SIZE(C1) fclr c03 lda L, -2(KK) fclr c04 lds $f31, 7 * SIZE(C2) fclr c08 lda BO, 4 * SIZE(B) fclr c13 lds $f31, 4 * SIZE(C3) fclr c09 lda AO, 4 * SIZE(AO) fclr c10 lds $f31, 7 * SIZE(C4) fclr c14 fclr c07 ble KK, $L18 #else #ifdef LN sll K, BASE_SHIFT + 2, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 2, TMP1 addq AORIG, TMP1, AO addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c11 LD a2, 1 * SIZE(AO) fclr c12 LD a3, 2 * SIZE(AO) fclr c16 LD a4, 3 * SIZE(AO) fclr c15 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c02 LD b3, 2 * SIZE(BO) fclr c06 LD b4, 3 * SIZE(BO) fclr c05 lds $f31, 4 * SIZE(C1) fclr c03 lda L, -2(TMP1) fclr c04 lds $f31, 7 * SIZE(C2) fclr c08 lda BO, 4 * SIZE(BO) 
fclr c13 lds $f31, 4 * SIZE(C3) fclr c09 lda AO, 4 * SIZE(AO) fclr c10 lds $f31, 7 * SIZE(C4) fclr c14 fclr c07 ble TMP1, $L18 #endif ble L, $L15 .align 5 $L12: /* 1 */ ADD c11, t1, c11 #ifndef EV4 ldq $31, PREFETCHSIZE * SIZE(AO) #else unop #endif MUL b1, a1, t1 #ifndef EV4 ldl $31, PREFETCHSIZE * SIZE(BO) #else unop #endif ADD c12, t2, c12 unop MUL b1, a2, t2 unop ADD c16, t3, c16 unop MUL b2, a2, t3 LD a5, 0 * SIZE(AO) ADD c15, t4, c15 unop MUL b2, a1, t4 LD b5, 0 * SIZE(BO) /* 2 */ ADD c01, t1, c01 UNOP MUL b1, a3, t1 UNOP ADD c02, t2, c02 UNOP MUL b1, a4, t2 UNOP ADD c06, t3, c06 unop MUL b2, a4, t3 unop ADD c05, t4, c05 unop MUL b4, a1, t4 unop /* 3 */ ADD c03, t1, c03 unop MUL b3, a1, t1 unop ADD c04, t2, c04 unop MUL b3, a2, t2 unop ADD c08, t3, c08 unop MUL b4, a2, t3 LD a2, 1 * SIZE(AO) ADD c13, t4, c13 unop MUL b2, a3, t4 LD b2, 1 * SIZE(BO) /* 4 */ ADD c09, t1, c09 unop MUL b3, a3, t1 LD a6, 2 * SIZE(AO) ADD c10, t2, c10 unop MUL b3, a4, t2 LD b3, 2 * SIZE(BO) ADD c14, t3, c14 unop MUL b4, a4, t3 LD a4, 3 * SIZE(AO) ADD c07, t4, c07 unop MUL b4, a3, t4 LD b4, 3 * SIZE(BO) /* 5 */ ADD c11, t1, c11 unop MUL b5, a5, t1 LD a1, 4 * SIZE(AO) ADD c12, t2, c12 lda L, -2(L) MUL b5, a2, t2 LD b1, 4 * SIZE(BO) ADD c16, t3, c16 unop MUL b2, a2, t3 unop ADD c15, t4, c15 unop MUL b2, a5, t4 unop /* 6 */ ADD c01, t1, c01 unop MUL b5, a6, t1 unop ADD c02, t2, c02 unop MUL b5, a4, t2 unop ADD c06, t3, c06 unop MUL b2, a4, t3 unop ADD c05, t4, c05 unop MUL b4, a5, t4 unop /* 7 */ ADD c03, t1, c03 lda AO, 8 * SIZE(AO) MUL b3, a5, t1 unop ADD c04, t2, c04 lda BO, 8 * SIZE(BO) MUL b3, a2, t2 unop ADD c08, t3, c08 unop MUL b4, a2, t3 LD a2, -3 * SIZE(AO) ADD c13, t4, c13 unop MUL b2, a6, t4 LD b2, -3 * SIZE(BO) /* 8 */ ADD c09, t1, c09 unop MUL b3, a6, t1 LD a3, -2 * SIZE(AO) ADD c10, t2, c10 unop MUL b3, a4, t2 LD b3, -2 * SIZE(BO) ADD c14, t3, c14 unop MUL b4, a4, t3 LD a4, -1 * SIZE(AO) ADD c07, t4, c07 MUL b4, a6, t4 LD b4, -1 * SIZE(BO) bgt L, $L12 .align 4 $L15: ADD c11, t1, c11 MUL b1, a1, t1 #if defined(LT) || defined(RN) blbs KK, $L17 #else blbs TMP1, $L17 #endif .align 4 ADD c12, t2, c12 MUL b1, a2, t2 ADD c16, t3, c16 MUL b2, a2, t3 ADD c15, t4, c15 MUL b2, a1, t4 ADD c01, t1, c01 MUL b1, a3, t1 ADD c02, t2, c02 unop MUL b1, a4, t2 LD b1, 0 * SIZE(BO) ADD c06, t3, c06 MUL b2, a4, t3 ADD c05, t4, c05 MUL b4, a1, t4 ADD c03, t1, c03 unop MUL b3, a1, t1 LD a1, 0 * SIZE(AO) ADD c04, t2, c04 unop MUL b3, a2, t2 unop ADD c08, t3, c08 unop MUL b4, a2, t3 LD a2, 1 * SIZE(AO) ADD c13, t4, c13 unop MUL b2, a3, t4 LD b2, 1 * SIZE(BO) ADD c09, t1, c09 unop MUL b3, a3, t1 lda AO, 4 * SIZE(AO) ADD c10, t2, c10 unop MUL b3, a4, t2 LD b3, 2 * SIZE(BO) ADD c14, t3, c14 unop MUL b4, a4, t3 LD a4, -1 * SIZE(AO) ADD c07, t4, c07 unop MUL b4, a3, t4 LD a3, -2 * SIZE(AO) ADD c11, t1, c11 LD b4, 3 * SIZE(BO) MUL b1, a1, t1 lda BO, 4 * SIZE(BO) .align 4 $L17: ADD c12, t2, c12 MUL b1, a2, t2 ADD c16, t3, c16 MUL b2, a2, t3 ADD c15, t4, c15 MUL b2, a1, t4 ADD c01, t1, c01 MUL b1, a3, t1 ADD c02, t2, c02 MUL b1, a4, t2 ADD c06, t3, c06 MUL b2, a4, t3 ADD c05, t4, c05 MUL b4, a1, t4 ADD c03, t1, c03 MUL b3, a1, t1 ADD c04, t2, c04 MUL b3, a2, t2 ADD c08, t3, c08 MUL b4, a2, t3 ADD c13, t4, c13 MUL b2, a3, t4 ADD c09, t1, c09 MUL b3, a3, t1 ADD c10, t2, c10 MUL b3, a4, t2 ADD c14, t3, c14 MUL b4, a4, t3 ADD c07, t4, c07 lda AO, 4 * SIZE(AO) MUL b4, a3, t4 lda BO, 4 * SIZE(BO) ADD c11, t1, c11 ADD c12, t2, c12 ADD c16, t3, c16 ADD c15, t4, c15 .align 4 $L18: #if defined(LN) || defined(RT) #ifdef LN subq KK, 4, TMP1 
#else subq KK, 4, TMP1 #endif sll TMP1, BASE_SHIFT + 2, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO #else lda AO, -4 * SIZE(AO) lda BO, -4 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 SUB a4, c13, c13 SUB b1, c02, c02 SUB b2, c06, c06 SUB b3, c10, c10 SUB b4, c14, c14 LD a1, 8 * SIZE(BO) LD a2, 9 * SIZE(BO) LD a3, 10 * SIZE(BO) LD a4, 11 * SIZE(BO) LD b1, 12 * SIZE(BO) LD b2, 13 * SIZE(BO) LD b3, 14 * SIZE(BO) LD b4, 15 * SIZE(BO) SUB a1, c03, c03 SUB a2, c07, c07 SUB a3, c11, c11 SUB a4, c15, c15 SUB b1, c04, c04 SUB b2, c08, c08 SUB b3, c12, c12 SUB b4, c16, c16 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) LD b4, 7 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 SUB b1, c05, c05 SUB b2, c06, c06 SUB b3, c07, c07 SUB b4, c08, c08 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) LD b1, 12 * SIZE(AO) LD b2, 13 * SIZE(AO) LD b3, 14 * SIZE(AO) LD b4, 15 * SIZE(AO) SUB a1, c09, c09 SUB a2, c10, c10 SUB a3, c11, c11 SUB a4, c12, c12 SUB b1, c13, c13 SUB b2, c14, c14 SUB b3, c15, c15 SUB b4, c16, c16 #endif #ifdef LN LD a1, 15 * SIZE(AO) LD a2, 14 * SIZE(AO) LD a3, 13 * SIZE(AO) LD a4, 12 * SIZE(AO) MUL a1, c04, c04 MUL a1, c08, c08 MUL a1, c12, c12 MUL a1, c16, c16 MUL a2, c04, t1 MUL a2, c08, t2 MUL a2, c12, t3 MUL a2, c16, t4 SUB c03, t1, c03 SUB c07, t2, c07 SUB c11, t3, c11 SUB c15, t4, c15 MUL a3, c04, t1 MUL a3, c08, t2 MUL a3, c12, t3 MUL a3, c16, t4 SUB c02, t1, c02 SUB c06, t2, c06 SUB c10, t3, c10 SUB c14, t4, c14 MUL a4, c04, t1 MUL a4, c08, t2 MUL a4, c12, t3 MUL a4, c16, t4 SUB c01, t1, c01 SUB c05, t2, c05 SUB c09, t3, c09 SUB c13, t4, c13 LD b1, 10 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 8 * SIZE(AO) MUL b1, c03, c03 MUL b1, c07, c07 MUL b1, c11, c11 MUL b1, c15, c15 MUL b2, c03, t1 MUL b2, c07, t2 MUL b2, c11, t3 MUL b2, c15, t4 SUB c02, t1, c02 SUB c06, t2, c06 SUB c10, t3, c10 SUB c14, t4, c14 MUL b3, c03, t1 MUL b3, c07, t2 MUL b3, c11, t3 MUL b3, c15, t4 SUB c01, t1, c01 SUB c05, t2, c05 SUB c09, t3, c09 SUB c13, t4, c13 LD a1, 5 * SIZE(AO) LD a2, 4 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a1, c06, c06 MUL a1, c10, c10 MUL a1, c14, c14 MUL a2, c02, t1 MUL a2, c06, t2 MUL a2, c10, t3 MUL a2, c14, t4 SUB c01, t1, c01 SUB c05, t2, c05 SUB c09, t3, c09 SUB c13, t4, c13 MUL a3, c01, c01 MUL a3, c05, c05 MUL a3, c09, c09 MUL a3, c13, c13 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 MUL a1, c09, c09 MUL a1, c13, c13 MUL a2, c01, t1 MUL a2, c05, t2 MUL a2, c09, t3 MUL a2, c13, t4 SUB c02, t1, c02 SUB c06, t2, c06 SUB c10, t3, c10 SUB c14, t4, c14 MUL a3, c01, t1 MUL a3, c05, t2 MUL a3, c09, t3 MUL a3, c13, t4 SUB c03, t1, c03 SUB c07, t2, c07 SUB c11, t3, c11 SUB c15, t4, c15 MUL a4, c01, t1 MUL a4, c05, t2 MUL a4, c09, t3 MUL a4, c13, t4 SUB c04, t1, c04 SUB c08, t2, c08 SUB c12, t3, c12 SUB c16, t4, c16 LD b1, 5 * SIZE(AO) LD b2, 6 * SIZE(AO) LD b3, 7 * SIZE(AO) MUL b1, c02, c02 MUL b1, c06, c06 MUL b1, c10, c10 MUL b1, c14, c14 MUL b2, c02, t1 MUL b2, c06, t2 MUL b2, c10, t3 MUL b2, c14, t4 SUB c03, t1, c03 SUB c07, t2, c07 SUB c11, t3, c11 SUB c15, t4, c15 MUL b3, c02, t1 MUL b3, 
c06, t2 MUL b3, c10, t3 MUL b3, c14, t4 SUB c04, t1, c04 SUB c08, t2, c08 SUB c12, t3, c12 SUB c16, t4, c16 LD a1, 10 * SIZE(AO) LD a2, 11 * SIZE(AO) LD a3, 15 * SIZE(AO) MUL a1, c03, c03 MUL a1, c07, c07 MUL a1, c11, c11 MUL a1, c15, c15 MUL a2, c03, t1 MUL a2, c07, t2 MUL a2, c11, t3 MUL a2, c15, t4 SUB c04, t1, c04 SUB c08, t2, c08 SUB c12, t3, c12 SUB c16, t4, c16 MUL a3, c04, c04 MUL a3, c08, c08 MUL a3, c12, c12 MUL a3, c16, c16 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 MUL a1, c04, c04 MUL a2, c01, t1 MUL a2, c02, t2 MUL a2, c03, t3 MUL a2, c04, t4 SUB c05, t1, c05 SUB c06, t2, c06 SUB c07, t3, c07 SUB c08, t4, c08 MUL a3, c01, t1 MUL a3, c02, t2 MUL a3, c03, t3 MUL a3, c04, t4 SUB c09, t1, c09 SUB c10, t2, c10 SUB c11, t3, c11 SUB c12, t4, c12 MUL a4, c01, t1 MUL a4, c02, t2 MUL a4, c03, t3 MUL a4, c04, t4 SUB c13, t1, c13 SUB c14, t2, c14 SUB c15, t3, c15 SUB c16, t4, c16 LD b1, 5 * SIZE(BO) LD b2, 6 * SIZE(BO) LD b3, 7 * SIZE(BO) MUL b1, c05, c05 MUL b1, c06, c06 MUL b1, c07, c07 MUL b1, c08, c08 MUL b2, c05, t1 MUL b2, c06, t2 MUL b2, c07, t3 MUL b2, c08, t4 SUB c09, t1, c09 SUB c10, t2, c10 SUB c11, t3, c11 SUB c12, t4, c12 MUL b3, c05, t1 MUL b3, c06, t2 MUL b3, c07, t3 MUL b3, c08, t4 SUB c13, t1, c13 SUB c14, t2, c14 SUB c15, t3, c15 SUB c16, t4, c16 LD a1, 10 * SIZE(BO) LD a2, 11 * SIZE(BO) LD a3, 15 * SIZE(BO) MUL a1, c09, c09 MUL a1, c10, c10 MUL a1, c11, c11 MUL a1, c12, c12 MUL a2, c09, t1 MUL a2, c10, t2 MUL a2, c11, t3 MUL a2, c12, t4 SUB c13, t1, c13 SUB c14, t2, c14 SUB c15, t3, c15 SUB c16, t4, c16 MUL a3, c13, c13 MUL a3, c14, c14 MUL a3, c15, c15 MUL a3, c16, c16 #endif #ifdef RT LD a1, 15 * SIZE(BO) LD a2, 14 * SIZE(BO) LD a3, 13 * SIZE(BO) LD a4, 12 * SIZE(BO) MUL a1, c13, c13 MUL a1, c14, c14 MUL a1, c15, c15 MUL a1, c16, c16 MUL a2, c13, t1 MUL a2, c14, t2 MUL a2, c15, t3 MUL a2, c16, t4 SUB c09, t1, c09 SUB c10, t2, c10 SUB c11, t3, c11 SUB c12, t4, c12 MUL a3, c13, t1 MUL a3, c14, t2 MUL a3, c15, t3 MUL a3, c16, t4 SUB c05, t1, c05 SUB c06, t2, c06 SUB c07, t3, c07 SUB c08, t4, c08 MUL a4, c13, t1 MUL a4, c14, t2 MUL a4, c15, t3 MUL a4, c16, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c03, t3, c03 SUB c04, t4, c04 LD b1, 10 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 8 * SIZE(BO) MUL b1, c09, c09 MUL b1, c10, c10 MUL b1, c11, c11 MUL b1, c12, c12 MUL b2, c09, t1 MUL b2, c10, t2 MUL b2, c11, t3 MUL b2, c12, t4 SUB c05, t1, c05 SUB c06, t2, c06 SUB c07, t3, c07 SUB c08, t4, c08 MUL b3, c09, t1 MUL b3, c10, t2 MUL b3, c11, t3 MUL b3, c12, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c03, t3, c03 SUB c04, t4, c04 LD a1, 5 * SIZE(BO) LD a2, 4 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a1, c06, c06 MUL a1, c07, c07 MUL a1, c08, c08 MUL a2, c05, t1 MUL a2, c06, t2 MUL a2, c07, t3 MUL a2, c08, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c03, t3, c03 SUB c04, t4, c04 MUL a3, c01, c01 MUL a3, c02, c02 MUL a3, c03, c03 MUL a3, c04, c04 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) ST c09, 2 * SIZE(BO) ST c13, 3 * SIZE(BO) ST c02, 4 * SIZE(BO) ST c06, 5 * SIZE(BO) ST c10, 6 * SIZE(BO) ST c14, 7 * SIZE(BO) ST c03, 8 * SIZE(BO) ST c07, 9 * SIZE(BO) ST c11, 10 * SIZE(BO) ST c15, 11 * SIZE(BO) ST c04, 12 * SIZE(BO) ST c08, 13 * SIZE(BO) ST c12, 14 * SIZE(BO) ST c16, 15 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c03, 2 * SIZE(AO) ST c04, 3 * SIZE(AO) ST c05, 4 * SIZE(AO) ST c06, 5 * SIZE(AO) ST c07, 6 * SIZE(AO) ST c08, 7 * 
SIZE(AO) ST c09, 8 * SIZE(AO) ST c10, 9 * SIZE(AO) ST c11, 10 * SIZE(AO) ST c12, 11 * SIZE(AO) ST c13, 12 * SIZE(AO) ST c14, 13 * SIZE(AO) ST c15, 14 * SIZE(AO) ST c16, 15 * SIZE(AO) #endif #ifdef LN lda C1, -4 * SIZE(C1) lda C2, -4 * SIZE(C2) lda C3, -4 * SIZE(C3) lda C4, -4 * SIZE(C4) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c03, 2 * SIZE(C1) ST c04, 3 * SIZE(C1) ST c05, 0 * SIZE(C2) ST c06, 1 * SIZE(C2) ST c07, 2 * SIZE(C2) ST c08, 3 * SIZE(C2) ST c09, 0 * SIZE(C3) ST c10, 1 * SIZE(C3) ST c11, 2 * SIZE(C3) ST c12, 3 * SIZE(C3) ST c13, 0 * SIZE(C4) ST c14, 1 * SIZE(C4) ST c15, 2 * SIZE(C4) ST c16, 3 * SIZE(C4) #ifndef LN lda C1, 4 * SIZE(C1) lda C2, 4 * SIZE(C2) lda C3, 4 * SIZE(C3) lda C4, 4 * SIZE(C4) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 2 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 2, TMP1 addq AO, TMP1, AO addq BO, TMP1, BO #endif #ifdef LT addq KK, 4, KK #endif #ifdef LN subq KK, 4, KK #endif lda I, -1(I) bgt I, $L11 .align 4 $L39: #ifdef LN sll K, 2 + BASE_SHIFT, TMP1 addq B, TMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN addq KK, 4, KK #endif #ifdef RT subq KK, 4, KK #endif lda J, -1(J) bgt J, $L01 .align 4 $L40: and N, 2, J ble J, $L80 #ifdef RT sll K, 1 + BASE_SHIFT, TMP1 subq B, TMP1, B addq LDC, LDC, TMP1 subq C, TMP1, C #endif mov C, C1 addq C, LDC, C2 fclr t1 #ifndef RT addq C2, LDC, C #endif fclr t2 #ifdef LN addq M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif fclr t3 fclr t4 and M, 1, I ble I, $L60 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD b1, 0 * SIZE(B) fclr c02 LD b2, 1 * SIZE(B) fclr c06 lda L, -2(KK) LD b3, 2 * SIZE(B) lda AO, 1 * SIZE(AO) LD b4, 3 * SIZE(B) lda BO, 2 * SIZE(B) ble KK, $L78 ble L, $L75 #else #ifdef LN sll K, BASE_SHIFT + 0, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 1, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD b1, 0 * SIZE(BO) fclr c02 LD b2, 1 * SIZE(BO) fclr c06 lda L, -2(TMP1) LD b3, 2 * SIZE(BO) lda AO, 1 * SIZE(AO) LD b4, 3 * SIZE(BO) lda BO, 2 * SIZE(BO) ble TMP1, $L78 ble L, $L75 #endif .align 4 $L72: ADD c01, t1, c01 lda L, -2(L) MUL a1, b1, t1 LD b1, 2 * SIZE(BO) ADD c05, t2, c05 MUL a1, b2, t2 LD a1, 1 * SIZE(AO) LD b2, 3 * SIZE(BO) ADD c02, t3, c02 lda AO, 2 * SIZE(AO) MUL a2, b3, t3 LD b3, 4 * SIZE(BO) ADD c06, t4, c06 MUL a2, b4, t4 LD a2, 0 * SIZE(AO) LD b4, 5 * SIZE(BO) lda BO, 4 * SIZE(BO) unop unop bgt L, $L72 .align 4 $L75: ADD c01, t1, c01 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L77 #else blbs TMP1, $L77 #endif .align 4 ADD c05, t2, c05 MUL a1, b2, t2 LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) ADD c01, t1, c01 LD b2, 1 * SIZE(BO) lda AO, 1 * SIZE(AO) MUL a1, b1, t1 lda BO, 2 * SIZE(BO) .align 4 $L77: ADD c05, t2, c05 MUL a1, b2, t2 ADD c02, t3, c02 ADD c06, t4, c06 ADD c01, c02, c01 lda AO, 1 * SIZE(AO) ADD c05, c06, c05 lda BO, 2 * SIZE(BO) ADD c01, t1, c01 ADD c05, t2, c05 .align 4 $L78: #if defined(LN) || defined(RT) #ifdef LN subq KK, 1, TMP1 #else subq KK, 2, TMP1 #endif sll TMP1, BASE_SHIFT + 0, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq B, TMP2, BO #else lda AO, -1 * SIZE(AO) lda BO, -2 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 #else LD a1, 0 * 
SIZE(AO) LD a2, 1 * SIZE(AO) SUB a1, c01, c01 SUB a2, c05, c05 #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 3 * SIZE(BO) MUL a1, c01, c01 MUL a2, c01, t1 SUB c05, t1, c05 MUL a3, c05, c05 #endif #ifdef RT LD a1, 3 * SIZE(BO) LD a2, 2 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a2, c05, t1 SUB c01, t1, c01 MUL a3, c01, c01 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c05, 1 * SIZE(AO) #endif #ifdef LN lda C1, -1 * SIZE(C1) lda C2, -1 * SIZE(C2) #endif ST c01, 0 * SIZE(C1) ST c05, 0 * SIZE(C2) fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 0 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 0, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 1, KK #endif #ifdef LN subq KK, 1, KK #endif .align 4 $L60: and M, 2, I ble I, $L70 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(B) lda L, -2(KK) LD b2, 1 * SIZE(B) lda AO, 2 * SIZE(AO) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) lda BO, 2 * SIZE(B) ble KK, $L68 ble L, $L65 #else #ifdef LN sll K, BASE_SHIFT + 1, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 1, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(BO) lda L, -2(TMP1) LD b2, 1 * SIZE(BO) lda AO, 2 * SIZE(AO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) lda BO, 2 * SIZE(BO) ble TMP1, $L68 ble L, $L65 #endif .align 4 $L62: ADD c01, t1, c01 unop MUL a1, b1, t1 unop ADD c02, t2, c02 lda AO, 4 * SIZE(AO) MUL a2, b1, t2 LD b1, 2 * SIZE(BO) ADD c05, t3, c05 lda L, -2(L) MUL a1, b2, t3 LD a1, -2 * SIZE(AO) ADD c06, t4, c06 unop MUL a2, b2, t4 LD a2, -1 * SIZE(AO) ADD c01, t1, c01 LD b2, 3 * SIZE(BO) MUL a3, b3, t1 lda BO, 4 * SIZE(BO) ADD c02, t2, c02 unop MUL a4, b3, t2 LD b3, 0 * SIZE(BO) ADD c05, t3, c05 unop MUL a3, b4, t3 LD a3, 0 * SIZE(AO) ADD c06, t4, c06 MUL a4, b4, t4 LD b4, 1 * SIZE(BO) unop LD a4, 1 * SIZE(AO) unop unop bgt L, $L62 .align 4 $L65: ADD c01, t1, c01 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L67 #else blbs TMP1, $L67 #endif .align 4 ADD c02, t2, c02 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD c05, t3, c05 lda BO, 2 * SIZE(BO) MUL a1, b2, t3 LD a1, 0 * SIZE(AO) ADD c06, t4, c06 unop MUL a2, b2, t4 LD a2, 1 * SIZE(AO) ADD c01, t1, c01 LD b2, -1 * SIZE(BO) MUL a1, b1, t1 lda AO, 2 * SIZE(AO) .align 4 $L67: ADD c02, t2, c02 MUL a2, b1, t2 ADD c05, t3, c05 MUL a1, b2, t3 ADD c06, t4, c06 lda AO, 2 * SIZE(AO) MUL a2, b2, t4 lda BO, 2 * SIZE(BO) ADD c01, t1, c01 ADD c02, t2, c02 ADD c05, t3, c05 ADD c06, t4, c06 .align 4 $L68: #if defined(LN) || defined(RT) #ifdef LN subq KK, 2, TMP1 #else subq KK, 2, TMP1 #endif sll TMP1, BASE_SHIFT + 1, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq B, TMP2, BO #else lda AO, -2 * SIZE(AO) lda BO, -2 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c02, c02 SUB a4, c06, c06 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c05, c05 SUB a4, 
c06, c06 #endif #ifdef LN LD a1, 3 * SIZE(AO) LD a2, 2 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a1, c06, c06 MUL a2, c02, t1 MUL a2, c06, t2 SUB c01, t1, c01 SUB c05, t2, c05 MUL a3, c01, c01 MUL a3, c05, c05 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 3 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 MUL a2, c01, t1 MUL a2, c05, t2 SUB c02, t1, c02 SUB c06, t2, c06 MUL a3, c02, c02 MUL a3, c06, c06 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 3 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 MUL a2, c01, t1 MUL a2, c02, t2 SUB c05, t1, c05 SUB c06, t2, c06 MUL a3, c05, c05 MUL a3, c06, c06 #endif #ifdef RT LD a1, 3 * SIZE(BO) LD a2, 2 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a1, c06, c06 MUL a2, c05, t1 MUL a2, c06, t2 SUB c01, t1, c01 SUB c02, t2, c02 MUL a3, c01, c01 MUL a3, c02, c02 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) ST c02, 2 * SIZE(BO) ST c06, 3 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c05, 2 * SIZE(AO) ST c06, 3 * SIZE(AO) #endif #ifdef LN lda C1, -2 * SIZE(C1) lda C2, -2 * SIZE(C2) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c05, 0 * SIZE(C2) ST c06, 1 * SIZE(C2) #ifndef LN lda C1, 2 * SIZE(C1) lda C2, 2 * SIZE(C2) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 1 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 1, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 2, KK #endif #ifdef LN subq KK, 2, KK #endif .align 4 $L70: sra M, 2, I ble I, $L79 .align 4 $L51: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c03 LD a2, 1 * SIZE(AO) fclr c07 LD a3, 2 * SIZE(AO) fclr c04 LD a4, 3 * SIZE(AO) fclr c08 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c05 LD b3, 2 * SIZE(B) fclr c02 LD b4, 3 * SIZE(B) fclr c06 lda L, -2(KK) lda BO, 2 * SIZE(B) lda AO, 4 * SIZE(AO) ble KK, $L58 ble L, $L55 #else #ifdef LN sll K, BASE_SHIFT + 2, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 2, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 1, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c03 LD a2, 1 * SIZE(AO) fclr c07 LD a3, 2 * SIZE(AO) fclr c04 LD a4, 3 * SIZE(AO) fclr c08 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c05 LD b3, 2 * SIZE(BO) fclr c02 LD b4, 3 * SIZE(BO) fclr c06 lda L, -2(TMP1) lda BO, 2 * SIZE(BO) lda AO, 4 * SIZE(AO) ble TMP1, $L58 ble L, $L55 #endif .align 4 $L52: ADD c05, t1, c05 unop MUL a1, b1, t1 unop ADD c06, t2, c06 lda L, -2(L) MUL a2, b1, t2 unop ADD c07, t3, c07 unop MUL a3, b1, t3 unop ADD c08, t4, c08 unop MUL a4, b1, t4 LD b1, 2 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b2, t1 LD a1, 0 * SIZE(AO) ADD c02, t2, c02 lda BO, 4 * SIZE(BO) MUL a2, b2, t2 LD a2, 1 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b2, t3 LD a3, 2 * SIZE(AO) ADD c04, t4, c04 unop MUL a4, b2, t4 LD a5, 3 * SIZE(AO) ADD c05, t1, c05 unop MUL a1, b3, t1 LD b2, -1 * SIZE(BO) ADD c06, t2, c06 unop MUL a2, b3, t2 unop ADD c07, t3, c07 unop MUL a3, b3, t3 lda AO, 8 * SIZE(AO) ADD c08, t4, c08 unop MUL a5, b3, t4 LD b3, 0 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b4, t1 LD a1, -4 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b4, t2 LD a2, -3 * SIZE(AO) ADD c03, t3, c03 LD a4, -1 * SIZE(AO) MUL a3, b4, t3 LD a3, -2 * SIZE(AO) ADD c04, t4, c04 MUL a5, b4, t4 LD b4, 1 * SIZE(BO) bgt L, $L52 .align 4 $L55: ADD c05, t1, c05 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L57 #else blbs TMP1, $L57 #endif .align 4 ADD 
c06, t2, c06 MUL a2, b1, t2 ADD c07, t3, c07 MUL a3, b1, t3 ADD c08, t4, c08 unop MUL a4, b1, t4 LD b1, 0 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b2, t1 LD a1, 0 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b2, t2 LD a2, 1 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b2, t3 LD a3, 2 * SIZE(AO) ADD c04, t4, c04 MUL a4, b2, t4 LD a4, 3 * SIZE(AO) lda AO, 4 * SIZE(AO) ADD c05, t1, c05 LD b2, 1 * SIZE(BO) MUL a1, b1, t1 lda BO, 2 * SIZE(BO) .align 4 $L57: ADD c06, t2, c06 MUL a2, b1, t2 ADD c07, t3, c07 MUL a3, b1, t3 ADD c08, t4, c08 MUL a4, b1, t4 ADD c01, t1, c01 MUL a1, b2, t1 ADD c02, t2, c02 MUL a2, b2, t2 ADD c03, t3, c03 MUL a3, b2, t3 ADD c04, t4, c04 lda AO, 4 * SIZE(AO) MUL a4, b2, t4 lda BO, 2 * SIZE(BO) ADD c05, t1, c05 ADD c06, t2, c06 ADD c07, t3, c07 ADD c08, t4, c08 .align 4 $L58: #if defined(LN) || defined(RT) #ifdef LN subq KK, 4, TMP1 #else subq KK, 2, TMP1 #endif sll TMP1, BASE_SHIFT + 2, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq B, TMP2, BO #else lda AO, -4 * SIZE(AO) lda BO, -2 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c02, c02 SUB a4, c06, c06 SUB b1, c03, c03 SUB b2, c07, c07 SUB b3, c04, c04 SUB b4, c08, c08 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) LD b4, 7 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 SUB b1, c05, c05 SUB b2, c06, c06 SUB b3, c07, c07 SUB b4, c08, c08 #endif #ifdef LN LD a1, 15 * SIZE(AO) LD a2, 14 * SIZE(AO) LD a3, 13 * SIZE(AO) LD a4, 12 * SIZE(AO) MUL a1, c04, c04 MUL a1, c08, c08 MUL a2, c04, t1 MUL a2, c08, t2 SUB c03, t1, c03 SUB c07, t2, c07 MUL a3, c04, t1 MUL a3, c08, t2 SUB c02, t1, c02 SUB c06, t2, c06 MUL a4, c04, t1 MUL a4, c08, t2 SUB c01, t1, c01 SUB c05, t2, c05 LD b1, 10 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 8 * SIZE(AO) MUL b1, c03, c03 MUL b1, c07, c07 MUL b2, c03, t1 MUL b2, c07, t2 SUB c02, t1, c02 SUB c06, t2, c06 MUL b3, c03, t1 MUL b3, c07, t2 SUB c01, t1, c01 SUB c05, t2, c05 LD a1, 5 * SIZE(AO) LD a2, 4 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a1, c06, c06 MUL a2, c02, t1 MUL a2, c06, t2 SUB c01, t1, c01 SUB c05, t2, c05 MUL a3, c01, c01 MUL a3, c05, c05 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 MUL a2, c01, t1 MUL a2, c05, t2 SUB c02, t1, c02 SUB c06, t2, c06 MUL a3, c01, t1 MUL a3, c05, t2 SUB c03, t1, c03 SUB c07, t2, c07 MUL a4, c01, t1 MUL a4, c05, t2 SUB c04, t1, c04 SUB c08, t2, c08 LD b1, 5 * SIZE(AO) LD b2, 6 * SIZE(AO) LD b3, 7 * SIZE(AO) MUL b1, c02, c02 MUL b1, c06, c06 MUL b2, c02, t1 MUL b2, c06, t2 SUB c03, t1, c03 SUB c07, t2, c07 MUL b3, c02, t1 MUL b3, c06, t2 SUB c04, t1, c04 SUB c08, t2, c08 LD a1, 10 * SIZE(AO) LD a2, 11 * SIZE(AO) LD a3, 15 * SIZE(AO) MUL a1, c03, c03 MUL a1, c07, c07 MUL a2, c03, t1 MUL a2, c07, t2 SUB c04, t1, c04 SUB c08, t2, c08 MUL a3, c04, c04 MUL a3, c08, c08 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 3 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 MUL a1, c04, c04 MUL a2, c01, t1 MUL a2, c02, t2 MUL a2, c03, t3 MUL a2, c04, t4 SUB c05, t1, c05 SUB c06, t2, c06 SUB c07, t3, c07 SUB c08, t4, c08 MUL a3, c05, c05 MUL a3, c06, c06 MUL a3, c07, c07 MUL a3, c08, c08 #endif #ifdef RT LD a1, 3 * 
SIZE(BO) LD a2, 2 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a1, c06, c06 MUL a1, c07, c07 MUL a1, c08, c08 MUL a2, c05, t1 MUL a2, c06, t2 MUL a2, c07, t3 MUL a2, c08, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c03, t3, c03 SUB c04, t4, c04 MUL a3, c01, c01 MUL a3, c02, c02 MUL a3, c03, c03 MUL a3, c04, c04 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) ST c02, 2 * SIZE(BO) ST c06, 3 * SIZE(BO) ST c03, 4 * SIZE(BO) ST c07, 5 * SIZE(BO) ST c04, 6 * SIZE(BO) ST c08, 7 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c03, 2 * SIZE(AO) ST c04, 3 * SIZE(AO) ST c05, 4 * SIZE(AO) ST c06, 5 * SIZE(AO) ST c07, 6 * SIZE(AO) ST c08, 7 * SIZE(AO) #endif #ifdef LN lda C1, -4 * SIZE(C1) lda C2, -4 * SIZE(C2) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c03, 2 * SIZE(C1) ST c04, 3 * SIZE(C1) ST c05, 0 * SIZE(C2) ST c06, 1 * SIZE(C2) ST c07, 2 * SIZE(C2) ST c08, 3 * SIZE(C2) #ifndef LN lda C1, 4 * SIZE(C1) lda C2, 4 * SIZE(C2) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 2 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 2, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 4, KK #endif #ifdef LN subq KK, 4, KK #endif lda I, -1(I) bgt I, $L51 .align 4 $L79: #ifdef LN sll K, 1 + BASE_SHIFT, TMP1 addq B, TMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN addq KK, 2, KK #endif #ifdef RT subq KK, 2, KK #endif .align 4 $L80: and N, 1, J ble J, $L999 #ifdef RT sll K, BASE_SHIFT, TMP1 subq B, TMP1, B subq C, LDC, C #endif mov C, C1 #ifndef RT addq C, LDC, C #endif #ifdef LN addq M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif and M, 1, I ble I, $L100 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c02 LD b3, 2 * SIZE(B) fclr c03 LD b4, 3 * SIZE(B) fclr c04 sra KK, 2, L mov B, BO unop ble L, $L115 #else #ifdef LN sll K, BASE_SHIFT + 0, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 0, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c02 LD b3, 2 * SIZE(BO) fclr c03 LD b4, 3 * SIZE(BO) fclr c04 sra TMP1, 2, L unop ble L, $L115 #endif .align 4 $L112: ADD c01, t1, c01 MUL a1, b1, t1 LD a1, 4 * SIZE(AO) LD b1, 4 * SIZE(BO) ADD c02, t2, c02 MUL a2, b2, t2 LD a2, 5 * SIZE(AO) LD b2, 5 * SIZE(BO) ADD c03, t3, c03 MUL a3, b3, t3 LD a3, 6 * SIZE(AO) LD b3, 6 * SIZE(BO) ADD c04, t4, c04 MUL a4, b4, t4 LD a4, 7 * SIZE(AO) LD b4, 7 * SIZE(BO) lda L, -1(L) lda AO, 4 * SIZE(AO) lda BO, 4 * SIZE(BO) bgt L, $L112 .align 4 $L115: #if defined(LT) || defined(RN) and KK, 3, L #else and TMP1, 3, L #endif ble L, $L118 .align 4 $L116: ADD c01, t1, c01 MUL a1, b1, t1 LD a1, 1 * SIZE(AO) LD b1, 1 * SIZE(BO) lda L, -1(L) lda AO, 1 * SIZE(AO) lda BO, 1 * SIZE(BO) bgt L, $L116 .align 4 $L118: ADD c01, t1, c01 ADD c02, t2, c02 ADD c03, t3, c03 ADD c04, t4, c04 ADD c01, c02, c01 ADD c03, c04, c03 ADD c01, c03, c01 #if defined(LN) || defined(RT) subq KK, 1, TMP1 sll TMP1, BASE_SHIFT + 0, TMP2 addq AORIG, TMP2, AO addq B, TMP2, BO #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) SUB a1, c01, c01 
#else LD a1, 0 * SIZE(AO) SUB a1, c01, c01 #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(AO) MUL a1, c01, c01 #endif #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) MUL a1, c01, c01 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) #else ST c01, 0 * SIZE(AO) #endif #ifdef LN lda C1, -1 * SIZE(C1) #endif ST c01, 0 * SIZE(C1) #ifndef LN lda C1, 1 * SIZE(C1) #endif #ifdef RT SXADDQ K, AORIG, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 0, TMP2 addq AO, TMP2, AO addq BO, TMP2, BO #endif #ifdef LT addq KK, 1, KK #endif #ifdef LN subq KK, 1, KK #endif .align 4 $L100: and M, 2, I ble I, $L110 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c02 LD b3, 2 * SIZE(B) fclr c03 LD b4, 3 * SIZE(B) fclr c04 sra KK, 2, L mov B, BO ble L, $L105 #else #ifdef LN sll K, BASE_SHIFT + 1, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 0, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c02 LD b3, 2 * SIZE(BO) fclr c03 LD b4, 3 * SIZE(BO) fclr c04 sra TMP1, 2, L ble L, $L105 #endif .align 5 $L102: ADD c01, t1, c01 lda L, -1(L) MUL a1, b1, t1 LD a1, 4 * SIZE(AO) ADD c02, t2, c02 MUL a2, b1, t2 LD a2, 5 * SIZE(AO) LD b1, 4 * SIZE(BO) ADD c03, t3, c03 lda BO, 4 * SIZE(BO) MUL a3, b2, t3 LD a3, 6 * SIZE(AO) ADD c04, t4, c04 MUL a4, b2, t4 LD a5, 7 * SIZE(AO) LD b2, 1 * SIZE(BO) ADD c01, t1, c01 MUL a1, b3, t1 LD a1, 8 * SIZE(AO) lda AO, 8 * SIZE(AO) ADD c02, t2, c02 MUL a2, b3, t2 LD b3, 2 * SIZE(BO) LD a2, 1 * SIZE(AO) ADD c03, t3, c03 LD a4, 3 * SIZE(AO) MUL a3, b4, t3 LD a3, 2 * SIZE(AO) ADD c04, t4, c04 MUL a5, b4, t4 LD b4, 3 * SIZE(BO) bgt L, $L102 .align 4 $L105: #if defined(LT) || defined(RN) and KK, 3, L #else and TMP1, 3, L #endif ble L, $L108 .align 4 $L106: ADD c01, t1, c01 lda L, -1(L) MUL a1, b1, t1 LD a1, 2 * SIZE(AO) ADD c02, t2, c02 MUL a2, b1, t2 LD a2, 3 * SIZE(AO) LD b1, 1 * SIZE(BO) lda AO, 2 * SIZE(AO) unop lda BO, 1 * SIZE(BO) bgt L, $L106 .align 4 $L108: ADD c01, t1, c01 ADD c02, t2, c02 ADD c03, t3, c03 ADD c04, t4, c04 ADD c01, c03, c01 ADD c02, c04, c02 #if defined(LN) || defined(RT) #ifdef LN subq KK, 2, TMP1 #else subq KK, 1, TMP1 #endif sll TMP1, BASE_SHIFT + 1, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 0, TMP2 addq B, TMP2, BO #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) SUB a1, c01, c01 SUB a2, c02, c02 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 #endif #ifdef LN LD a1, 3 * SIZE(AO) LD a2, 2 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a2, c02, t1 SUB c01, t1, c01 MUL a3, c01, c01 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 3 * SIZE(AO) MUL a1, c01, c01 MUL a2, c01, t1 SUB c02, t1, c02 MUL a3, c02, c02 #endif #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c02, 1 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) #endif #ifdef LN lda C1, -2 * SIZE(C1) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) #ifndef LN lda C1, 2 * SIZE(C1) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 1 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) 
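/* Descriptive comment (added): under LT/RN the inner loop ran only KK of the K iterations, so step AO past the remaining (K - KK) elements of the 2-row A panel and BO past the remaining (K - KK) elements of the 1-column B panel before moving to the next block. */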
subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 1, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 0, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 2, KK #endif #ifdef LN subq KK, 2, KK #endif .align 4 $L110: sra M, 2, I ble I, $L119 .align 4 $L91: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c02 LD b3, 2 * SIZE(B) fclr c03 LD b4, 3 * SIZE(B) fclr c04 sra KK, 2, L mov B, BO ble L, $L95 #else #ifdef LN sll K, BASE_SHIFT + 2, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 2, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 0, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c02 LD b3, 2 * SIZE(BO) fclr c03 LD b4, 3 * SIZE(BO) fclr c04 sra TMP1, 2, L unop ble L, $L95 #endif .align 5 $L92: ADD c01, t1, c01 unop MUL a1, b1, t1 LD a1, 4 * SIZE(AO) ADD c02, t2, c02 lda L, -1(L) MUL a2, b1, t2 LD a2, 5 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b1, t3 LD a3, 6 * SIZE(AO) ADD c04, t4, c04 MUL a4, b1, t4 LD a4, 7 * SIZE(AO) LD b1, 4 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b2, t1 LD a1, 8 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b2, t2 LD a2, 9 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b2, t3 LD a3, 10 * SIZE(AO) ADD c04, t4, c04 MUL a4, b2, t4 LD a4, 11 * SIZE(AO) LD b2, 5 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b3, t1 LD a1, 12 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b3, t2 LD a2, 13 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b3, t3 LD a3, 14 * SIZE(AO) ADD c04, t4, c04 MUL a4, b3, t4 LD a5, 15 * SIZE(AO) LD b3, 6 * SIZE(BO) ADD c01, t1, c01 MUL a1, b4, t1 LD a1, 16 * SIZE(AO) lda AO, 16 * SIZE(AO) ADD c02, t2, c02 lda BO, 4 * SIZE(BO) MUL a2, b4, t2 LD a2, 1 * SIZE(AO) ADD c03, t3, c03 LD a4, 3 * SIZE(AO) MUL a3, b4, t3 LD a3, 2 * SIZE(AO) ADD c04, t4, c04 MUL a5, b4, t4 LD b4, 3 * SIZE(BO) bgt L, $L92 .align 4 $L95: #if defined(LT) || defined(RN) and KK, 3, L #else and TMP1, 3, L #endif unop ble L, $L98 .align 4 $L96: ADD c01, t1, c01 lda L, -1(L) MUL a1, b1, t1 LD a1, 4 * SIZE(AO) ADD c02, t2, c02 lda BO, 1 * SIZE(BO) MUL a2, b1, t2 LD a2, 5 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b1, t3 LD a3, 6 * SIZE(AO) ADD c04, t4, c04 MUL a4, b1, t4 LD a4, 7 * SIZE(AO) LD b1, 0 * SIZE(BO) lda AO, 4 * SIZE(AO) bgt L, $L96 .align 4 $L98: ADD c01, t1, c01 ADD c02, t2, c02 ADD c03, t3, c03 ADD c04, t4, c04 #if defined(LN) || defined(RT) #ifdef LN subq KK, 4, TMP1 #else subq KK, 1, TMP1 #endif sll TMP1, BASE_SHIFT + 2, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 0, TMP2 addq B, TMP2, BO #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 #endif #ifdef LN LD a1, 15 * SIZE(AO) LD a2, 14 * SIZE(AO) LD a3, 13 * SIZE(AO) LD a4, 12 * SIZE(AO) MUL a1, c04, c04 MUL a2, c04, t1 SUB c03, t1, c03 MUL a3, c04, t1 SUB c02, t1, c02 MUL a4, c04, t1 SUB c01, t1, c01 LD b1, 10 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 8 * SIZE(AO) MUL b1, c03, c03 MUL b2, c03, t1 SUB c02, t1, c02 MUL b3, c03, t1 SUB c01, t1, c01 LD a1, 5 * SIZE(AO) LD a2, 4 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a2, c02, t1 SUB c01, t1, c01 MUL a3, c01, c01 #endif 
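/* Descriptive comment (added): the LT branch below solves the 4-element result column c01..c04 against the packed 4x4 diagonal block of A; diagonal entries are applied with MUL, i.e. they are assumed to have been stored as reciprocals by the packing routine. */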
#ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MUL a1, c01, c01 MUL a2, c01, t1 SUB c02, t1, c02 MUL a3, c01, t1 SUB c03, t1, c03 MUL a4, c01, t1 SUB c04, t1, c04 LD b1, 5 * SIZE(AO) LD b2, 6 * SIZE(AO) LD b3, 7 * SIZE(AO) MUL b1, c02, c02 MUL b2, c02, t1 SUB c03, t1, c03 MUL b3, c02, t1 SUB c04, t1, c04 LD a1, 10 * SIZE(AO) LD a2, 11 * SIZE(AO) LD a3, 15 * SIZE(AO) MUL a1, c03, c03 MUL a2, c03, t1 SUB c04, t1, c04 MUL a3, c04, c04 #endif #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 MUL a1, c04, c04 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c02, 1 * SIZE(BO) ST c03, 2 * SIZE(BO) ST c04, 3 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c03, 2 * SIZE(AO) ST c04, 3 * SIZE(AO) #endif #ifdef LN lda C1, -4 * SIZE(C1) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c03, 2 * SIZE(C1) ST c04, 3 * SIZE(C1) #ifndef LN lda C1, 4 * SIZE(C1) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 2 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 2, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 0, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 4, KK #endif #ifdef LN subq KK, 4, KK #endif lda I, -1(I) bgt I, $L91 .align 4 $L119: #ifdef LN SXADDQ K, B, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN addq KK, 1, KK #endif #ifdef RT subq KK, 1, KK #endif .align 4 $L999: ldt $f2, 0($sp) ldt $f3, 8($sp) ldt $f4, 16($sp) ldt $f5, 24($sp) ldt $f6, 32($sp) ldt $f7, 40($sp) ldt $f8, 48($sp) ldt $f9, 56($sp) clr $0 lda $sp, STACKSIZE($sp) ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/trsm_kernel_4x4_LT.S000066400000000000000000001631051313527062700212660ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #if !defined(EV4) && !defined(EV5) && !defined(EV6) #error "Architecture is not specified." #endif #ifdef EV6 #define PREFETCHSIZE 56 #define UNOP unop #endif #ifdef EV5 #define PREFETCHSIZE 56 #define UNOP #endif #ifdef EV4 #define UNOP #endif #define STACKSIZE 80 #define M $16 #define N $17 #define K $18 #define A $20 #define B $21 #define C $22 #define LDC $23 #define C1 $19 #define C2 $24 #define C3 $25 #define C4 $27 #define AO $at #define BO $5 #define I $6 #define J $7 #define L $8 #define a1 $f16 #define a2 $f17 #define a3 $f18 #define a4 $f19 #define b1 $f20 #define b2 $f21 #define b3 $f22 #define b4 $f23 #define t1 $f24 #define t2 $f25 #define t3 $f26 #define t4 $f27 #define a5 $f28 #define a6 $f30 #define b5 $f29 #define alpha $f30 #define c01 $f0 #define c02 $f1 #define c03 $f2 #define c04 $f3 #define c05 $f4 #define c06 $f5 #define c07 $f6 #define c08 $f7 #define c09 $f8 #define c10 $f9 #define c11 $f10 #define c12 $f11 #define c13 $f12 #define c14 $f13 #define c15 $f14 #define c16 $f15 #define TMP1 $0 #define TMP2 $1 #define KK $2 #define AORIG $3 #define OFFSET $4 PROLOGUE PROFCODE .frame $sp, STACKSIZE, $26, 0 lda $sp, -STACKSIZE($sp) ldq C, 0 + STACKSIZE($sp) ldq LDC, 8 + STACKSIZE($sp) ldq OFFSET, 16 + STACKSIZE($sp) SXADDQ LDC, 0, LDC stt $f2, 0($sp) stt $f3, 8($sp) stt $f4, 16($sp) stt $f5, 24($sp) stt $f6, 32($sp) stt $f7, 40($sp) stt $f8, 48($sp) stt $f9, 56($sp) cmple M, 0, $0 cmple N, 0, $1 cmple K, 0, $2 or $0, $1, $0 or $0, $2, $0 bne $0, $L999 #ifdef LN mulq M, K, TMP1 SXADDQ TMP1, A, A SXADDQ M, C, C #endif #ifdef RN negq OFFSET, KK #endif #ifdef RT mulq N, K, TMP1 SXADDQ TMP1, B, B mulq N, LDC, TMP1 addq TMP1, C, C subq N, OFFSET, KK #endif sra N, 2, J ble J, $L40 .align 4 $L01: #ifdef RT sll K, 2 + BASE_SHIFT, TMP1 subq B, TMP1, B s4addq LDC, 0, TMP1 subq C, TMP1, C #endif mov C, C1 addq C, LDC, C2 addq C2, LDC, C3 #ifndef RT s4addq LDC, C, C #endif fclr t1 addq C3, LDC, C4 fclr t2 #ifdef LN addq M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif sra M, 2, I fclr t3 fclr t4 ble I, $L20 .align 4 $L11: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c11 LD a2, 1 * SIZE(AO) fclr c12 LD a3, 2 * SIZE(AO) fclr c16 LD a4, 3 * SIZE(AO) fclr c15 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c02 LD b3, 2 * SIZE(B) fclr c06 LD b4, 3 * SIZE(B) fclr c05 lds $f31, 4 * SIZE(C1) fclr c03 lda L, -2(KK) fclr c04 lds $f31, 7 * SIZE(C2) fclr c08 lda BO, 4 * SIZE(B) fclr c13 lds $f31, 4 * SIZE(C3) fclr c09 lda AO, 4 * SIZE(AO) fclr c10 lds $f31, 7 * SIZE(C4) fclr c14 fclr c07 ble KK, $L18 #else #ifdef LN sll K, BASE_SHIFT + 2, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 2, TMP1 addq AORIG, TMP1, AO addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c11 LD a2, 1 * SIZE(AO) fclr c12 LD a3, 2 * SIZE(AO) fclr c16 LD a4, 3 * SIZE(AO) fclr c15 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c02 LD b3, 2 * SIZE(BO) fclr c06 LD b4, 3 * SIZE(BO) fclr c05 lds $f31, 4 * SIZE(C1) fclr c03 lda L, -2(TMP1) fclr c04 lds $f31, 7 * SIZE(C2) fclr c08 lda BO, 4 * SIZE(BO) fclr c13 lds $f31, 4 * SIZE(C3) fclr 
c09 lda AO, 4 * SIZE(AO) fclr c10 lds $f31, 7 * SIZE(C4) fclr c14 fclr c07 ble TMP1, $L18 #endif ble L, $L15 .align 5 $L12: /* 1 */ ADD c11, t1, c11 #ifndef EV4 ldq $31, PREFETCHSIZE * SIZE(AO) #else unop #endif MUL b1, a1, t1 #ifndef EV4 ldl $31, PREFETCHSIZE * SIZE(BO) #else unop #endif ADD c12, t2, c12 unop MUL b1, a2, t2 unop ADD c16, t3, c16 unop MUL b2, a2, t3 LD a5, 0 * SIZE(AO) ADD c15, t4, c15 unop MUL b2, a1, t4 LD b5, 0 * SIZE(BO) /* 2 */ ADD c01, t1, c01 UNOP MUL b1, a3, t1 UNOP ADD c02, t2, c02 UNOP MUL b1, a4, t2 UNOP ADD c06, t3, c06 unop MUL b2, a4, t3 unop ADD c05, t4, c05 unop MUL b4, a1, t4 unop /* 3 */ ADD c03, t1, c03 unop MUL b3, a1, t1 unop ADD c04, t2, c04 unop MUL b3, a2, t2 unop ADD c08, t3, c08 unop MUL b4, a2, t3 LD a2, 1 * SIZE(AO) ADD c13, t4, c13 unop MUL b2, a3, t4 LD b2, 1 * SIZE(BO) /* 4 */ ADD c09, t1, c09 unop MUL b3, a3, t1 LD a6, 2 * SIZE(AO) ADD c10, t2, c10 unop MUL b3, a4, t2 LD b3, 2 * SIZE(BO) ADD c14, t3, c14 unop MUL b4, a4, t3 LD a4, 3 * SIZE(AO) ADD c07, t4, c07 unop MUL b4, a3, t4 LD b4, 3 * SIZE(BO) /* 5 */ ADD c11, t1, c11 unop MUL b5, a5, t1 LD a1, 4 * SIZE(AO) ADD c12, t2, c12 lda L, -2(L) MUL b5, a2, t2 LD b1, 4 * SIZE(BO) ADD c16, t3, c16 unop MUL b2, a2, t3 unop ADD c15, t4, c15 unop MUL b2, a5, t4 unop /* 6 */ ADD c01, t1, c01 unop MUL b5, a6, t1 unop ADD c02, t2, c02 unop MUL b5, a4, t2 unop ADD c06, t3, c06 unop MUL b2, a4, t3 unop ADD c05, t4, c05 unop MUL b4, a5, t4 unop /* 7 */ ADD c03, t1, c03 lda AO, 8 * SIZE(AO) MUL b3, a5, t1 unop ADD c04, t2, c04 lda BO, 8 * SIZE(BO) MUL b3, a2, t2 unop ADD c08, t3, c08 unop MUL b4, a2, t3 LD a2, -3 * SIZE(AO) ADD c13, t4, c13 unop MUL b2, a6, t4 LD b2, -3 * SIZE(BO) /* 8 */ ADD c09, t1, c09 unop MUL b3, a6, t1 LD a3, -2 * SIZE(AO) ADD c10, t2, c10 unop MUL b3, a4, t2 LD b3, -2 * SIZE(BO) ADD c14, t3, c14 unop MUL b4, a4, t3 LD a4, -1 * SIZE(AO) ADD c07, t4, c07 MUL b4, a6, t4 LD b4, -1 * SIZE(BO) bgt L, $L12 .align 4 $L15: ADD c11, t1, c11 MUL b1, a1, t1 #if defined(LT) || defined(RN) blbs KK, $L17 #else blbs TMP1, $L17 #endif .align 4 ADD c12, t2, c12 MUL b1, a2, t2 ADD c16, t3, c16 MUL b2, a2, t3 ADD c15, t4, c15 MUL b2, a1, t4 ADD c01, t1, c01 MUL b1, a3, t1 ADD c02, t2, c02 unop MUL b1, a4, t2 LD b1, 0 * SIZE(BO) ADD c06, t3, c06 MUL b2, a4, t3 ADD c05, t4, c05 MUL b4, a1, t4 ADD c03, t1, c03 unop MUL b3, a1, t1 LD a1, 0 * SIZE(AO) ADD c04, t2, c04 unop MUL b3, a2, t2 unop ADD c08, t3, c08 unop MUL b4, a2, t3 LD a2, 1 * SIZE(AO) ADD c13, t4, c13 unop MUL b2, a3, t4 LD b2, 1 * SIZE(BO) ADD c09, t1, c09 unop MUL b3, a3, t1 lda AO, 4 * SIZE(AO) ADD c10, t2, c10 unop MUL b3, a4, t2 LD b3, 2 * SIZE(BO) ADD c14, t3, c14 unop MUL b4, a4, t3 LD a4, -1 * SIZE(AO) ADD c07, t4, c07 unop MUL b4, a3, t4 LD a3, -2 * SIZE(AO) ADD c11, t1, c11 LD b4, 3 * SIZE(BO) MUL b1, a1, t1 lda BO, 4 * SIZE(BO) .align 4 $L17: ADD c12, t2, c12 MUL b1, a2, t2 ADD c16, t3, c16 MUL b2, a2, t3 ADD c15, t4, c15 MUL b2, a1, t4 ADD c01, t1, c01 MUL b1, a3, t1 ADD c02, t2, c02 MUL b1, a4, t2 ADD c06, t3, c06 MUL b2, a4, t3 ADD c05, t4, c05 MUL b4, a1, t4 ADD c03, t1, c03 MUL b3, a1, t1 ADD c04, t2, c04 MUL b3, a2, t2 ADD c08, t3, c08 MUL b4, a2, t3 ADD c13, t4, c13 MUL b2, a3, t4 ADD c09, t1, c09 MUL b3, a3, t1 ADD c10, t2, c10 MUL b3, a4, t2 ADD c14, t3, c14 MUL b4, a4, t3 ADD c07, t4, c07 lda AO, 4 * SIZE(AO) MUL b4, a3, t4 lda BO, 4 * SIZE(BO) ADD c11, t1, c11 ADD c12, t2, c12 ADD c16, t3, c16 ADD c15, t4, c15 .align 4 $L18: #if defined(LN) || defined(RT) #ifdef LN subq KK, 4, TMP1 #else subq KK, 4, TMP1 #endif sll TMP1, 
BASE_SHIFT + 2, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO #else lda AO, -4 * SIZE(AO) lda BO, -4 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 SUB a4, c13, c13 SUB b1, c02, c02 SUB b2, c06, c06 SUB b3, c10, c10 SUB b4, c14, c14 LD a1, 8 * SIZE(BO) LD a2, 9 * SIZE(BO) LD a3, 10 * SIZE(BO) LD a4, 11 * SIZE(BO) LD b1, 12 * SIZE(BO) LD b2, 13 * SIZE(BO) LD b3, 14 * SIZE(BO) LD b4, 15 * SIZE(BO) SUB a1, c03, c03 SUB a2, c07, c07 SUB a3, c11, c11 SUB a4, c15, c15 SUB b1, c04, c04 SUB b2, c08, c08 SUB b3, c12, c12 SUB b4, c16, c16 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) LD b4, 7 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 SUB b1, c05, c05 SUB b2, c06, c06 SUB b3, c07, c07 SUB b4, c08, c08 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) LD b1, 12 * SIZE(AO) LD b2, 13 * SIZE(AO) LD b3, 14 * SIZE(AO) LD b4, 15 * SIZE(AO) SUB a1, c09, c09 SUB a2, c10, c10 SUB a3, c11, c11 SUB a4, c12, c12 SUB b1, c13, c13 SUB b2, c14, c14 SUB b3, c15, c15 SUB b4, c16, c16 #endif #ifdef LN LD a1, 15 * SIZE(AO) LD a2, 14 * SIZE(AO) LD a3, 13 * SIZE(AO) LD a4, 12 * SIZE(AO) MUL a1, c04, c04 MUL a1, c08, c08 MUL a1, c12, c12 MUL a1, c16, c16 MUL a2, c04, t1 MUL a2, c08, t2 MUL a2, c12, t3 MUL a2, c16, t4 SUB c03, t1, c03 SUB c07, t2, c07 SUB c11, t3, c11 SUB c15, t4, c15 MUL a3, c04, t1 MUL a3, c08, t2 MUL a3, c12, t3 MUL a3, c16, t4 SUB c02, t1, c02 SUB c06, t2, c06 SUB c10, t3, c10 SUB c14, t4, c14 MUL a4, c04, t1 MUL a4, c08, t2 MUL a4, c12, t3 MUL a4, c16, t4 SUB c01, t1, c01 SUB c05, t2, c05 SUB c09, t3, c09 SUB c13, t4, c13 LD b1, 10 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 8 * SIZE(AO) MUL b1, c03, c03 MUL b1, c07, c07 MUL b1, c11, c11 MUL b1, c15, c15 MUL b2, c03, t1 MUL b2, c07, t2 MUL b2, c11, t3 MUL b2, c15, t4 SUB c02, t1, c02 SUB c06, t2, c06 SUB c10, t3, c10 SUB c14, t4, c14 MUL b3, c03, t1 MUL b3, c07, t2 MUL b3, c11, t3 MUL b3, c15, t4 SUB c01, t1, c01 SUB c05, t2, c05 SUB c09, t3, c09 SUB c13, t4, c13 LD a1, 5 * SIZE(AO) LD a2, 4 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a1, c06, c06 MUL a1, c10, c10 MUL a1, c14, c14 MUL a2, c02, t1 MUL a2, c06, t2 MUL a2, c10, t3 MUL a2, c14, t4 SUB c01, t1, c01 SUB c05, t2, c05 SUB c09, t3, c09 SUB c13, t4, c13 MUL a3, c01, c01 MUL a3, c05, c05 MUL a3, c09, c09 MUL a3, c13, c13 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 MUL a1, c09, c09 MUL a1, c13, c13 MUL a2, c01, t1 MUL a2, c05, t2 MUL a2, c09, t3 MUL a2, c13, t4 SUB c02, t1, c02 SUB c06, t2, c06 SUB c10, t3, c10 SUB c14, t4, c14 MUL a3, c01, t1 MUL a3, c05, t2 MUL a3, c09, t3 MUL a3, c13, t4 SUB c03, t1, c03 SUB c07, t2, c07 SUB c11, t3, c11 SUB c15, t4, c15 MUL a4, c01, t1 MUL a4, c05, t2 MUL a4, c09, t3 MUL a4, c13, t4 SUB c04, t1, c04 SUB c08, t2, c08 SUB c12, t3, c12 SUB c16, t4, c16 LD b1, 5 * SIZE(AO) LD b2, 6 * SIZE(AO) LD b3, 7 * SIZE(AO) MUL b1, c02, c02 MUL b1, c06, c06 MUL b1, c10, c10 MUL b1, c14, c14 MUL b2, c02, t1 MUL b2, c06, t2 MUL b2, c10, t3 MUL b2, c14, t4 SUB c03, t1, c03 SUB c07, t2, c07 SUB c11, t3, c11 SUB c15, t4, c15 MUL b3, c02, t1 MUL b3, c06, t2 MUL b3, c10, t3 MUL b3, c14, t4 SUB 
c04, t1, c04 SUB c08, t2, c08 SUB c12, t3, c12 SUB c16, t4, c16 LD a1, 10 * SIZE(AO) LD a2, 11 * SIZE(AO) LD a3, 15 * SIZE(AO) MUL a1, c03, c03 MUL a1, c07, c07 MUL a1, c11, c11 MUL a1, c15, c15 MUL a2, c03, t1 MUL a2, c07, t2 MUL a2, c11, t3 MUL a2, c15, t4 SUB c04, t1, c04 SUB c08, t2, c08 SUB c12, t3, c12 SUB c16, t4, c16 MUL a3, c04, c04 MUL a3, c08, c08 MUL a3, c12, c12 MUL a3, c16, c16 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 MUL a1, c04, c04 MUL a2, c01, t1 MUL a2, c02, t2 MUL a2, c03, t3 MUL a2, c04, t4 SUB c05, t1, c05 SUB c06, t2, c06 SUB c07, t3, c07 SUB c08, t4, c08 MUL a3, c01, t1 MUL a3, c02, t2 MUL a3, c03, t3 MUL a3, c04, t4 SUB c09, t1, c09 SUB c10, t2, c10 SUB c11, t3, c11 SUB c12, t4, c12 MUL a4, c01, t1 MUL a4, c02, t2 MUL a4, c03, t3 MUL a4, c04, t4 SUB c13, t1, c13 SUB c14, t2, c14 SUB c15, t3, c15 SUB c16, t4, c16 LD b1, 5 * SIZE(BO) LD b2, 6 * SIZE(BO) LD b3, 7 * SIZE(BO) MUL b1, c05, c05 MUL b1, c06, c06 MUL b1, c07, c07 MUL b1, c08, c08 MUL b2, c05, t1 MUL b2, c06, t2 MUL b2, c07, t3 MUL b2, c08, t4 SUB c09, t1, c09 SUB c10, t2, c10 SUB c11, t3, c11 SUB c12, t4, c12 MUL b3, c05, t1 MUL b3, c06, t2 MUL b3, c07, t3 MUL b3, c08, t4 SUB c13, t1, c13 SUB c14, t2, c14 SUB c15, t3, c15 SUB c16, t4, c16 LD a1, 10 * SIZE(BO) LD a2, 11 * SIZE(BO) LD a3, 15 * SIZE(BO) MUL a1, c09, c09 MUL a1, c10, c10 MUL a1, c11, c11 MUL a1, c12, c12 MUL a2, c09, t1 MUL a2, c10, t2 MUL a2, c11, t3 MUL a2, c12, t4 SUB c13, t1, c13 SUB c14, t2, c14 SUB c15, t3, c15 SUB c16, t4, c16 MUL a3, c13, c13 MUL a3, c14, c14 MUL a3, c15, c15 MUL a3, c16, c16 #endif #ifdef RT LD a1, 15 * SIZE(BO) LD a2, 14 * SIZE(BO) LD a3, 13 * SIZE(BO) LD a4, 12 * SIZE(BO) MUL a1, c13, c13 MUL a1, c14, c14 MUL a1, c15, c15 MUL a1, c16, c16 MUL a2, c13, t1 MUL a2, c14, t2 MUL a2, c15, t3 MUL a2, c16, t4 SUB c09, t1, c09 SUB c10, t2, c10 SUB c11, t3, c11 SUB c12, t4, c12 MUL a3, c13, t1 MUL a3, c14, t2 MUL a3, c15, t3 MUL a3, c16, t4 SUB c05, t1, c05 SUB c06, t2, c06 SUB c07, t3, c07 SUB c08, t4, c08 MUL a4, c13, t1 MUL a4, c14, t2 MUL a4, c15, t3 MUL a4, c16, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c03, t3, c03 SUB c04, t4, c04 LD b1, 10 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 8 * SIZE(BO) MUL b1, c09, c09 MUL b1, c10, c10 MUL b1, c11, c11 MUL b1, c12, c12 MUL b2, c09, t1 MUL b2, c10, t2 MUL b2, c11, t3 MUL b2, c12, t4 SUB c05, t1, c05 SUB c06, t2, c06 SUB c07, t3, c07 SUB c08, t4, c08 MUL b3, c09, t1 MUL b3, c10, t2 MUL b3, c11, t3 MUL b3, c12, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c03, t3, c03 SUB c04, t4, c04 LD a1, 5 * SIZE(BO) LD a2, 4 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a1, c06, c06 MUL a1, c07, c07 MUL a1, c08, c08 MUL a2, c05, t1 MUL a2, c06, t2 MUL a2, c07, t3 MUL a2, c08, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c03, t3, c03 SUB c04, t4, c04 MUL a3, c01, c01 MUL a3, c02, c02 MUL a3, c03, c03 MUL a3, c04, c04 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) ST c09, 2 * SIZE(BO) ST c13, 3 * SIZE(BO) ST c02, 4 * SIZE(BO) ST c06, 5 * SIZE(BO) ST c10, 6 * SIZE(BO) ST c14, 7 * SIZE(BO) ST c03, 8 * SIZE(BO) ST c07, 9 * SIZE(BO) ST c11, 10 * SIZE(BO) ST c15, 11 * SIZE(BO) ST c04, 12 * SIZE(BO) ST c08, 13 * SIZE(BO) ST c12, 14 * SIZE(BO) ST c16, 15 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c03, 2 * SIZE(AO) ST c04, 3 * SIZE(AO) ST c05, 4 * SIZE(AO) ST c06, 5 * SIZE(AO) ST c07, 6 * SIZE(AO) ST c08, 7 * SIZE(AO) ST c09, 8 * SIZE(AO) ST c10, 9 * 
SIZE(AO) ST c11, 10 * SIZE(AO) ST c12, 11 * SIZE(AO) ST c13, 12 * SIZE(AO) ST c14, 13 * SIZE(AO) ST c15, 14 * SIZE(AO) ST c16, 15 * SIZE(AO) #endif #ifdef LN lda C1, -4 * SIZE(C1) lda C2, -4 * SIZE(C2) lda C3, -4 * SIZE(C3) lda C4, -4 * SIZE(C4) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c03, 2 * SIZE(C1) ST c04, 3 * SIZE(C1) ST c05, 0 * SIZE(C2) ST c06, 1 * SIZE(C2) ST c07, 2 * SIZE(C2) ST c08, 3 * SIZE(C2) ST c09, 0 * SIZE(C3) ST c10, 1 * SIZE(C3) ST c11, 2 * SIZE(C3) ST c12, 3 * SIZE(C3) ST c13, 0 * SIZE(C4) ST c14, 1 * SIZE(C4) ST c15, 2 * SIZE(C4) ST c16, 3 * SIZE(C4) #ifndef LN lda C1, 4 * SIZE(C1) lda C2, 4 * SIZE(C2) lda C3, 4 * SIZE(C3) lda C4, 4 * SIZE(C4) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 2 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 2, TMP1 addq AO, TMP1, AO addq BO, TMP1, BO #endif #ifdef LT addq KK, 4, KK #endif #ifdef LN subq KK, 4, KK #endif lda I, -1(I) bgt I, $L11 .align 4 $L20: and M, 2, I ble I, $L30 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c10 LD a4, 3 * SIZE(AO) fclr c14 LD b1, 0 * SIZE(B) lda L, -2(KK) LD b2, 1 * SIZE(B) lda AO, 2 * SIZE(AO) LD b3, 2 * SIZE(B) fclr c01 LD b4, 3 * SIZE(B) fclr c05 lda BO, 4 * SIZE(B) fclr c02 fclr c06 ble KK, $L28 ble L, $L25 #else #ifdef LN sll K, BASE_SHIFT + 1, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c10 LD a4, 3 * SIZE(AO) fclr c14 LD b1, 0 * SIZE(BO) lda L, -2(TMP1) LD b2, 1 * SIZE(BO) lda AO, 2 * SIZE(AO) LD b3, 2 * SIZE(BO) fclr c01 LD b4, 3 * SIZE(BO) fclr c05 lda BO, 4 * SIZE(BO) fclr c02 fclr c06 ble TMP1, $L28 ble L, $L25 #endif .align 4 $L22: ADD c09, t1, c09 unop MUL a1, b1, t1 unop ADD c10, t2, c10 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD c13, t3, c13 unop MUL a1, b2, t3 lda BO, 8 * SIZE(BO) ADD c14, t4, c14 unop MUL a2, b2, t4 LD b2, -7 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b3, t1 unop ADD c02, t2, c02 unop MUL a2, b3, t2 LD b3, -6 * SIZE(BO) ADD c05, t3, c05 unop MUL a1, b4, t3 LD a1, 2 * SIZE(AO) ADD c06, t4, c06 MUL a2, b4, t4 LD b5, -5 * SIZE(BO) ADD c09, t1, c09 unop MUL a3, b1, t1 LD a2, 3 * SIZE(AO) ADD c10, t2, c10 unop MUL a4, b1, t2 LD b1, -4 * SIZE(BO) ADD c13, t3, c13 unop MUL a3, b2, t3 lda AO, 4 * SIZE(AO) ADD c14, t4, c14 MUL a4, b2, t4 LD b2, -3 * SIZE(BO) ADD c01, t1, c01 lda L, -2(L) MUL a3, b3, t1 LD b4, -1 * SIZE(BO) ADD c02, t2, c02 unop MUL a4, b3, t2 LD b3, -2 * SIZE(BO) ADD c05, t3, c05 unop MUL a3, b5, t3 LD a3, 0 * SIZE(AO) ADD c06, t4, c06 MUL a4, b5, t4 LD a4, 1 * SIZE(AO) bgt L, $L22 .align 4 $L25: ADD c09, t1, c09 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L27 #else blbs TMP1, $L27 #endif ADD c10, t2, c10 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD c13, t3, c13 unop MUL a1, b2, t3 unop ADD c14, t4, c14 unop MUL a2, b2, t4 LD b2, 1 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b3, t1 lda AO, 2 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b3, t2 LD b3, 2 * SIZE(BO) ADD c05, t3, c05 unop MUL a1, b4, t3 LD a1, -2 * SIZE(AO) ADD c06, t4, c06 unop MUL a2, b4, t4 LD a2, -1 * SIZE(AO) ADD c09, t1, c09 LD b4, 3 * SIZE(BO) MUL a1, b1, t1 lda BO, 4 * SIZE(BO) .align 4 $L27: ADD c10, t2, c10 MUL a2, b1, t2 ADD c13, t3, c13 MUL a1, b2, t3 ADD c14, t4, c14 MUL a2, b2, t4 ADD c01, t1, c01 MUL a1, b3, t1 ADD c02, t2, c02 MUL a2, 
b3, t2 ADD c05, t3, c05 MUL a1, b4, t3 ADD c06, t4, c06 lda AO, 2 * SIZE(AO) MUL a2, b4, t4 lda BO, 4 * SIZE(BO) ADD c09, t1, c09 ADD c10, t2, c10 ADD c13, t3, c13 ADD c14, t4, c14 .align 4 $L28: #if defined(LN) || defined(RT) #ifdef LN subq KK, 2, TMP1 #else subq KK, 4, TMP1 #endif sll TMP1, BASE_SHIFT + 1, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO #else lda AO, -2 * SIZE(AO) lda BO, -4 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 SUB a4, c13, c13 SUB b1, c02, c02 SUB b2, c06, c06 SUB b3, c10, c10 SUB b4, c14, c14 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) LD b4, 7 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c05, c05 SUB a4, c06, c06 SUB b1, c09, c09 SUB b2, c10, c10 SUB b3, c13, c13 SUB b4, c14, c14 #endif #ifdef LN LD a1, 3 * SIZE(AO) LD a2, 2 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a1, c06, c06 MUL a1, c10, c10 MUL a1, c14, c14 MUL a2, c02, t1 MUL a2, c06, t2 MUL a2, c10, t3 MUL a2, c14, t4 SUB c01, t1, c01 SUB c05, t2, c05 SUB c09, t3, c09 SUB c13, t4, c13 MUL a3, c01, c01 MUL a3, c05, c05 MUL a3, c09, c09 MUL a3, c13, c13 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 3 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 MUL a1, c09, c09 MUL a1, c13, c13 MUL a2, c01, t1 MUL a2, c05, t2 MUL a2, c09, t3 MUL a2, c13, t4 SUB c02, t1, c02 SUB c06, t2, c06 SUB c10, t3, c10 SUB c14, t4, c14 MUL a3, c02, c02 MUL a3, c06, c06 MUL a3, c10, c10 MUL a3, c14, c14 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 MUL a2, c01, t1 MUL a2, c02, t2 SUB c05, t1, c05 SUB c06, t2, c06 MUL a3, c01, t1 MUL a3, c02, t2 SUB c09, t1, c09 SUB c10, t2, c10 MUL a4, c01, t1 MUL a4, c02, t2 SUB c13, t1, c13 SUB c14, t2, c14 LD b1, 5 * SIZE(BO) LD b2, 6 * SIZE(BO) LD b3, 7 * SIZE(BO) MUL b1, c05, c05 MUL b1, c06, c06 MUL b2, c05, t1 MUL b2, c06, t2 SUB c09, t1, c09 SUB c10, t2, c10 MUL b3, c05, t1 MUL b3, c06, t2 SUB c13, t1, c13 SUB c14, t2, c14 LD a1, 10 * SIZE(BO) LD a2, 11 * SIZE(BO) LD a3, 15 * SIZE(BO) MUL a1, c09, c09 MUL a1, c10, c10 MUL a2, c09, t1 MUL a2, c10, t2 SUB c13, t1, c13 SUB c14, t2, c14 MUL a3, c13, c13 MUL a3, c14, c14 #endif #ifdef RT LD a1, 15 * SIZE(BO) LD a2, 14 * SIZE(BO) LD a3, 13 * SIZE(BO) LD a4, 12 * SIZE(BO) MUL a1, c13, c13 MUL a1, c14, c14 MUL a2, c13, t1 MUL a2, c14, t2 SUB c09, t1, c09 SUB c10, t2, c10 MUL a3, c13, t1 MUL a3, c14, t2 SUB c05, t1, c05 SUB c06, t2, c06 MUL a4, c13, t1 MUL a4, c14, t2 SUB c01, t1, c01 SUB c02, t2, c02 LD b1, 10 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 8 * SIZE(BO) MUL b1, c09, c09 MUL b1, c10, c10 MUL b2, c09, t1 MUL b2, c10, t2 SUB c05, t1, c05 SUB c06, t2, c06 MUL b3, c09, t1 MUL b3, c10, t2 SUB c01, t1, c01 SUB c02, t2, c02 LD a1, 5 * SIZE(BO) LD a2, 4 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a1, c06, c06 MUL a2, c05, t1 MUL a2, c06, t2 SUB c01, t1, c01 SUB c02, t2, c02 MUL a3, c01, c01 MUL a3, c02, c02 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) ST c09, 2 * SIZE(BO) ST c13, 3 * SIZE(BO) ST c02, 4 * SIZE(BO) ST c06, 5 * SIZE(BO) ST c10, 6 * SIZE(BO) ST c14, 7 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c05, 2 * SIZE(AO) ST c06, 3 * 
SIZE(AO) ST c09, 4 * SIZE(AO) ST c10, 5 * SIZE(AO) ST c13, 6 * SIZE(AO) ST c14, 7 * SIZE(AO) #endif #ifdef LN lda C1, -2 * SIZE(C1) lda C2, -2 * SIZE(C2) lda C3, -2 * SIZE(C3) lda C4, -2 * SIZE(C4) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c05, 0 * SIZE(C2) ST c06, 1 * SIZE(C2) ST c09, 0 * SIZE(C3) ST c10, 1 * SIZE(C3) ST c13, 0 * SIZE(C4) ST c14, 1 * SIZE(C4) #ifndef LN lda C1, 2 * SIZE(C1) lda C2, 2 * SIZE(C2) lda C3, 2 * SIZE(C3) lda C4, 2 * SIZE(C4) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 1 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 1, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 2, KK #endif #ifdef LN subq KK, 2, KK #endif .align 4 $L30: and M, 1, I ble I, $L39 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD b1, 0 * SIZE(B) lda L, -2(KK) LD b2, 1 * SIZE(B) lda AO, 1 * SIZE(AO) LD b3, 2 * SIZE(B) fclr c09 LD b4, 3 * SIZE(B) fclr c13 lda BO, 4 * SIZE(B) ble KK, $L38 ble L, $L35 #else #ifdef LN sll K, BASE_SHIFT + 0, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD b1, 0 * SIZE(BO) lda L, -2(TMP1) LD b2, 1 * SIZE(BO) lda AO, 1 * SIZE(AO) LD b3, 2 * SIZE(BO) fclr c09 LD b4, 3 * SIZE(BO) fclr c13 lda BO, 4 * SIZE(BO) ble TMP1, $L38 ble L, $L35 #endif .align 4 $L32: ADD c01, t1, c01 lda L, -2(L) MUL a1, b1, t1 LD b1, 0 * SIZE(BO) ADD c05, t2, c05 lda AO, 2 * SIZE(AO) MUL a1, b2, t2 LD b2, 1 * SIZE(BO) ADD c09, t3, c09 LD b5, 3 * SIZE(BO) MUL a1, b3, t3 LD b3, 2 * SIZE(BO) ADD c13, t4, c13 MUL a1, b4, t4 LD a1, -1 * SIZE(AO) ADD c01, t1, c01 MUL a2, b1, t1 LD b1, 4 * SIZE(BO) lda BO, 8 * SIZE(BO) ADD c05, t2, c05 MUL a2, b2, t2 LD b2, -3 * SIZE(BO) ADD c09, t3, c09 LD b4, -1 * SIZE(BO) MUL a2, b3, t3 LD b3, -2 * SIZE(BO) ADD c13, t4, c13 MUL a2, b5, t4 LD a2, 0 * SIZE(AO) bgt L, $L32 .align 4 $L35: ADD c01, t1, c01 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L37 #else blbs TMP1, $L37 #endif .align 4 ADD c05, t2, c05 LD b1, 0 * SIZE(BO) MUL a1, b2, t2 LD b2, 1 * SIZE(BO) ADD c09, t3, c09 MUL a1, b3, t3 LD b3, 2 * SIZE(BO) ADD c13, t4, c13 MUL a1, b4, t4 LD a1, 0 * SIZE(AO) lda AO, 1 * SIZE(AO) ADD c01, t1, c01 LD b4, 3 * SIZE(BO) MUL a1, b1, t1 lda BO, 4 * SIZE(BO) .align 4 $L37: ADD c05, t2, c05 MUL a1, b2, t2 ADD c09, t3, c09 MUL a1, b3, t3 ADD c13, t4, c13 lda AO, 1 * SIZE(AO) MUL a1, b4, t4 lda BO, 4 * SIZE(BO) ADD c01, t1, c01 ADD c05, t2, c05 ADD c09, t3, c09 ADD c13, t4, c13 $L38: #if defined(LN) || defined(RT) #ifdef LN subq KK, 1, TMP1 #else subq KK, 4, TMP1 #endif sll TMP1, BASE_SHIFT + 0, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO #else lda AO, -1 * SIZE(AO) lda BO, -4 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 SUB a4, c13, c13 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 SUB a4, c13, c13 #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 MUL a1, c09, c09 MUL a1, c13, c13 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) MUL a1, c01, c01 MUL a2, c01, t1 SUB c05, t1, c05 MUL 
a3, c01, t1 SUB c09, t1, c09 MUL a4, c01, t1 SUB c13, t1, c13 LD b1, 5 * SIZE(BO) LD b2, 6 * SIZE(BO) LD b3, 7 * SIZE(BO) MUL b1, c05, c05 MUL b2, c05, t1 SUB c09, t1, c09 MUL b3, c05, t1 SUB c13, t1, c13 LD a1, 10 * SIZE(BO) LD a2, 11 * SIZE(BO) LD a3, 15 * SIZE(BO) MUL a1, c09, c09 MUL a2, c09, t1 SUB c13, t1, c13 MUL a3, c13, c13 #endif #ifdef RT LD a1, 15 * SIZE(BO) LD a2, 14 * SIZE(BO) LD a3, 13 * SIZE(BO) LD a4, 12 * SIZE(BO) MUL a1, c13, c13 MUL a2, c13, t1 SUB c09, t1, c09 MUL a3, c13, t1 SUB c05, t1, c05 MUL a4, c13, t1 SUB c01, t1, c01 LD b1, 10 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 8 * SIZE(BO) MUL b1, c09, c09 MUL b2, c09, t1 SUB c05, t1, c05 MUL b3, c09, t1 SUB c01, t1, c01 LD a1, 5 * SIZE(BO) LD a2, 4 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a2, c05, t1 SUB c01, t1, c01 MUL a3, c01, c01 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) ST c09, 2 * SIZE(BO) ST c13, 3 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c05, 1 * SIZE(AO) ST c09, 2 * SIZE(AO) ST c13, 3 * SIZE(AO) #endif #ifdef LN lda C1, -1 * SIZE(C1) lda C2, -1 * SIZE(C2) lda C3, -1 * SIZE(C3) lda C4, -1 * SIZE(C4) #endif ST c01, 0 * SIZE(C1) ST c05, 0 * SIZE(C2) ST c09, 0 * SIZE(C3) ST c13, 0 * SIZE(C4) #ifdef RT sll K, 0 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 0, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 1, KK #endif #ifdef LN subq KK, 1, KK #endif .align 4 $L39: #ifdef LN sll K, 2 + BASE_SHIFT, TMP1 addq B, TMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN addq KK, 4, KK #endif #ifdef RT subq KK, 4, KK #endif lda J, -1(J) bgt J, $L01 .align 4 $L40: and N, 2, J ble J, $L80 #ifdef RT sll K, 1 + BASE_SHIFT, TMP1 subq B, TMP1, B addq LDC, LDC, TMP1 subq C, TMP1, C #endif mov C, C1 addq C, LDC, C2 fclr t1 #ifndef RT addq C2, LDC, C #endif fclr t2 #ifdef LN addq M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif sra M, 2, I fclr t3 fclr t4 ble I, $L60 .align 4 $L51: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c03 LD a2, 1 * SIZE(AO) fclr c07 LD a3, 2 * SIZE(AO) fclr c04 LD a4, 3 * SIZE(AO) fclr c08 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c05 LD b3, 2 * SIZE(B) fclr c02 LD b4, 3 * SIZE(B) fclr c06 lda L, -2(KK) lda BO, 2 * SIZE(B) lda AO, 4 * SIZE(AO) ble KK, $L58 ble L, $L55 #else #ifdef LN sll K, BASE_SHIFT + 2, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 2, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 1, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c03 LD a2, 1 * SIZE(AO) fclr c07 LD a3, 2 * SIZE(AO) fclr c04 LD a4, 3 * SIZE(AO) fclr c08 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c05 LD b3, 2 * SIZE(BO) fclr c02 LD b4, 3 * SIZE(BO) fclr c06 lda L, -2(TMP1) lda BO, 2 * SIZE(BO) lda AO, 4 * SIZE(AO) ble TMP1, $L58 ble L, $L55 #endif .align 4 $L52: ADD c05, t1, c05 unop MUL a1, b1, t1 unop ADD c06, t2, c06 lda L, -2(L) MUL a2, b1, t2 unop ADD c07, t3, c07 unop MUL a3, b1, t3 unop ADD c08, t4, c08 unop MUL a4, b1, t4 LD b1, 2 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b2, t1 LD a1, 0 * SIZE(AO) ADD c02, t2, c02 lda BO, 4 * SIZE(BO) MUL a2, b2, t2 LD a2, 1 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b2, t3 LD a3, 2 * SIZE(AO) ADD c04, t4, c04 unop MUL a4, b2, t4 LD a5, 3 * SIZE(AO) ADD c05, t1, c05 unop MUL a1, b3, t1 LD b2, -1 * SIZE(BO) ADD c06, t2, c06 unop MUL a2, b3, t2 unop ADD c07, t3, c07 unop 
MUL a3, b3, t3 lda AO, 8 * SIZE(AO) ADD c08, t4, c08 unop MUL a5, b3, t4 LD b3, 0 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b4, t1 LD a1, -4 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b4, t2 LD a2, -3 * SIZE(AO) ADD c03, t3, c03 LD a4, -1 * SIZE(AO) MUL a3, b4, t3 LD a3, -2 * SIZE(AO) ADD c04, t4, c04 MUL a5, b4, t4 LD b4, 1 * SIZE(BO) bgt L, $L52 .align 4 $L55: ADD c05, t1, c05 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L57 #else blbs TMP1, $L57 #endif .align 4 ADD c06, t2, c06 MUL a2, b1, t2 ADD c07, t3, c07 MUL a3, b1, t3 ADD c08, t4, c08 unop MUL a4, b1, t4 LD b1, 0 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b2, t1 LD a1, 0 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b2, t2 LD a2, 1 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b2, t3 LD a3, 2 * SIZE(AO) ADD c04, t4, c04 MUL a4, b2, t4 LD a4, 3 * SIZE(AO) lda AO, 4 * SIZE(AO) ADD c05, t1, c05 LD b2, 1 * SIZE(BO) MUL a1, b1, t1 lda BO, 2 * SIZE(BO) .align 4 $L57: ADD c06, t2, c06 MUL a2, b1, t2 ADD c07, t3, c07 MUL a3, b1, t3 ADD c08, t4, c08 MUL a4, b1, t4 ADD c01, t1, c01 MUL a1, b2, t1 ADD c02, t2, c02 MUL a2, b2, t2 ADD c03, t3, c03 MUL a3, b2, t3 ADD c04, t4, c04 lda AO, 4 * SIZE(AO) MUL a4, b2, t4 lda BO, 2 * SIZE(BO) ADD c05, t1, c05 ADD c06, t2, c06 ADD c07, t3, c07 ADD c08, t4, c08 .align 4 $L58: #if defined(LN) || defined(RT) #ifdef LN subq KK, 4, TMP1 #else subq KK, 2, TMP1 #endif sll TMP1, BASE_SHIFT + 2, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq B, TMP2, BO #else lda AO, -4 * SIZE(AO) lda BO, -2 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c02, c02 SUB a4, c06, c06 SUB b1, c03, c03 SUB b2, c07, c07 SUB b3, c04, c04 SUB b4, c08, c08 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) LD b4, 7 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 SUB b1, c05, c05 SUB b2, c06, c06 SUB b3, c07, c07 SUB b4, c08, c08 #endif #ifdef LN LD a1, 15 * SIZE(AO) LD a2, 14 * SIZE(AO) LD a3, 13 * SIZE(AO) LD a4, 12 * SIZE(AO) MUL a1, c04, c04 MUL a1, c08, c08 MUL a2, c04, t1 MUL a2, c08, t2 SUB c03, t1, c03 SUB c07, t2, c07 MUL a3, c04, t1 MUL a3, c08, t2 SUB c02, t1, c02 SUB c06, t2, c06 MUL a4, c04, t1 MUL a4, c08, t2 SUB c01, t1, c01 SUB c05, t2, c05 LD b1, 10 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 8 * SIZE(AO) MUL b1, c03, c03 MUL b1, c07, c07 MUL b2, c03, t1 MUL b2, c07, t2 SUB c02, t1, c02 SUB c06, t2, c06 MUL b3, c03, t1 MUL b3, c07, t2 SUB c01, t1, c01 SUB c05, t2, c05 LD a1, 5 * SIZE(AO) LD a2, 4 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a1, c06, c06 MUL a2, c02, t1 MUL a2, c06, t2 SUB c01, t1, c01 SUB c05, t2, c05 MUL a3, c01, c01 MUL a3, c05, c05 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 MUL a2, c01, t1 MUL a2, c05, t2 SUB c02, t1, c02 SUB c06, t2, c06 MUL a3, c01, t1 MUL a3, c05, t2 SUB c03, t1, c03 SUB c07, t2, c07 MUL a4, c01, t1 MUL a4, c05, t2 SUB c04, t1, c04 SUB c08, t2, c08 LD b1, 5 * SIZE(AO) LD b2, 6 * SIZE(AO) LD b3, 7 * SIZE(AO) MUL b1, c02, c02 MUL b1, c06, c06 MUL b2, c02, t1 MUL b2, c06, t2 SUB c03, t1, c03 SUB c07, t2, c07 MUL b3, c02, t1 MUL b3, c06, t2 SUB c04, t1, c04 SUB c08, t2, c08 LD a1, 10 * SIZE(AO) LD a2, 11 * SIZE(AO) LD a3, 15 * SIZE(AO) MUL a1, c03, c03 MUL a1, c07, 
c07 MUL a2, c03, t1 MUL a2, c07, t2 SUB c04, t1, c04 SUB c08, t2, c08 MUL a3, c04, c04 MUL a3, c08, c08 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 3 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 MUL a1, c04, c04 MUL a2, c01, t1 MUL a2, c02, t2 MUL a2, c03, t3 MUL a2, c04, t4 SUB c05, t1, c05 SUB c06, t2, c06 SUB c07, t3, c07 SUB c08, t4, c08 MUL a3, c05, c05 MUL a3, c06, c06 MUL a3, c07, c07 MUL a3, c08, c08 #endif #ifdef RT LD a1, 3 * SIZE(BO) LD a2, 2 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a1, c06, c06 MUL a1, c07, c07 MUL a1, c08, c08 MUL a2, c05, t1 MUL a2, c06, t2 MUL a2, c07, t3 MUL a2, c08, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c03, t3, c03 SUB c04, t4, c04 MUL a3, c01, c01 MUL a3, c02, c02 MUL a3, c03, c03 MUL a3, c04, c04 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) ST c02, 2 * SIZE(BO) ST c06, 3 * SIZE(BO) ST c03, 4 * SIZE(BO) ST c07, 5 * SIZE(BO) ST c04, 6 * SIZE(BO) ST c08, 7 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c03, 2 * SIZE(AO) ST c04, 3 * SIZE(AO) ST c05, 4 * SIZE(AO) ST c06, 5 * SIZE(AO) ST c07, 6 * SIZE(AO) ST c08, 7 * SIZE(AO) #endif #ifdef LN lda C1, -4 * SIZE(C1) lda C2, -4 * SIZE(C2) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c03, 2 * SIZE(C1) ST c04, 3 * SIZE(C1) ST c05, 0 * SIZE(C2) ST c06, 1 * SIZE(C2) ST c07, 2 * SIZE(C2) ST c08, 3 * SIZE(C2) #ifndef LN lda C1, 4 * SIZE(C1) lda C2, 4 * SIZE(C2) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 2 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 2, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 4, KK #endif #ifdef LN subq KK, 4, KK #endif lda I, -1(I) bgt I, $L51 .align 4 $L60: and M, 2, I ble I, $L70 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(B) lda L, -2(KK) LD b2, 1 * SIZE(B) lda AO, 2 * SIZE(AO) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) lda BO, 2 * SIZE(B) ble KK, $L68 ble L, $L65 #else #ifdef LN sll K, BASE_SHIFT + 1, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 1, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(BO) lda L, -2(TMP1) LD b2, 1 * SIZE(BO) lda AO, 2 * SIZE(AO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) lda BO, 2 * SIZE(BO) ble TMP1, $L68 ble L, $L65 #endif .align 4 $L62: ADD c01, t1, c01 unop MUL a1, b1, t1 unop ADD c02, t2, c02 lda AO, 4 * SIZE(AO) MUL a2, b1, t2 LD b1, 2 * SIZE(BO) ADD c05, t3, c05 lda L, -2(L) MUL a1, b2, t3 LD a1, -2 * SIZE(AO) ADD c06, t4, c06 unop MUL a2, b2, t4 LD a2, -1 * SIZE(AO) ADD c01, t1, c01 LD b2, 3 * SIZE(BO) MUL a3, b3, t1 lda BO, 4 * SIZE(BO) ADD c02, t2, c02 unop MUL a4, b3, t2 LD b3, 0 * SIZE(BO) ADD c05, t3, c05 unop MUL a3, b4, t3 LD a3, 0 * SIZE(AO) ADD c06, t4, c06 MUL a4, b4, t4 LD b4, 1 * SIZE(BO) unop LD a4, 1 * SIZE(AO) unop unop bgt L, $L62 .align 4 $L65: ADD c01, t1, c01 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L67 #else blbs TMP1, $L67 #endif .align 4 ADD c02, t2, c02 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD c05, t3, c05 lda BO, 2 * SIZE(BO) MUL a1, b2, t3 LD a1, 0 * SIZE(AO) ADD c06, t4, c06 unop MUL a2, b2, t4 LD a2, 1 * SIZE(AO) ADD c01, t1, c01 LD b2, -1 * SIZE(BO) MUL a1, b1, t1 lda AO, 2 * 
SIZE(AO) .align 4 $L67: ADD c02, t2, c02 MUL a2, b1, t2 ADD c05, t3, c05 MUL a1, b2, t3 ADD c06, t4, c06 lda AO, 2 * SIZE(AO) MUL a2, b2, t4 lda BO, 2 * SIZE(BO) ADD c01, t1, c01 ADD c02, t2, c02 ADD c05, t3, c05 ADD c06, t4, c06 .align 4 $L68: #if defined(LN) || defined(RT) #ifdef LN subq KK, 2, TMP1 #else subq KK, 2, TMP1 #endif sll TMP1, BASE_SHIFT + 1, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq B, TMP2, BO #else lda AO, -2 * SIZE(AO) lda BO, -2 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c02, c02 SUB a4, c06, c06 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c05, c05 SUB a4, c06, c06 #endif #ifdef LN LD a1, 3 * SIZE(AO) LD a2, 2 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a1, c06, c06 MUL a2, c02, t1 MUL a2, c06, t2 SUB c01, t1, c01 SUB c05, t2, c05 MUL a3, c01, c01 MUL a3, c05, c05 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 3 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 MUL a2, c01, t1 MUL a2, c05, t2 SUB c02, t1, c02 SUB c06, t2, c06 MUL a3, c02, c02 MUL a3, c06, c06 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 3 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 MUL a2, c01, t1 MUL a2, c02, t2 SUB c05, t1, c05 SUB c06, t2, c06 MUL a3, c05, c05 MUL a3, c06, c06 #endif #ifdef RT LD a1, 3 * SIZE(BO) LD a2, 2 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a1, c06, c06 MUL a2, c05, t1 MUL a2, c06, t2 SUB c01, t1, c01 SUB c02, t2, c02 MUL a3, c01, c01 MUL a3, c02, c02 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) ST c02, 2 * SIZE(BO) ST c06, 3 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c05, 2 * SIZE(AO) ST c06, 3 * SIZE(AO) #endif #ifdef LN lda C1, -2 * SIZE(C1) lda C2, -2 * SIZE(C2) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c05, 0 * SIZE(C2) ST c06, 1 * SIZE(C2) #ifndef LN lda C1, 2 * SIZE(C1) lda C2, 2 * SIZE(C2) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 1 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 1, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 2, KK #endif #ifdef LN subq KK, 2, KK #endif .align 4 $L70: and M, 1, I ble I, $L79 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD b1, 0 * SIZE(B) fclr c02 LD b2, 1 * SIZE(B) fclr c06 lda L, -2(KK) LD b3, 2 * SIZE(B) lda AO, 1 * SIZE(AO) LD b4, 3 * SIZE(B) lda BO, 2 * SIZE(B) ble KK, $L78 ble L, $L75 #else #ifdef LN sll K, BASE_SHIFT + 0, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 1, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD b1, 0 * SIZE(BO) fclr c02 LD b2, 1 * SIZE(BO) fclr c06 lda L, -2(TMP1) LD b3, 2 * SIZE(BO) lda AO, 1 * SIZE(AO) LD b4, 3 * SIZE(BO) lda BO, 2 * SIZE(BO) ble TMP1, $L78 ble L, $L75 #endif .align 4 $L72: ADD c01, t1, c01 lda L, -2(L) MUL a1, b1, t1 LD b1, 2 * SIZE(BO) ADD c05, t2, c05 MUL a1, b2, t2 LD a1, 1 * SIZE(AO) LD b2, 3 * SIZE(BO) ADD c02, t3, c02 lda AO, 2 * SIZE(AO) MUL a2, b3, t3 LD b3, 4 * SIZE(BO) ADD c06, t4, c06 MUL a2, b4, t4 LD a2, 0 * SIZE(AO) LD b4, 5 * SIZE(BO) lda BO, 4 * SIZE(BO) unop unop bgt L, $L72 .align 4 $L75: ADD c01, t1, c01 MUL a1, b1, t1 #if defined(LT) || 
defined(RN) blbs KK, $L77 #else blbs TMP1, $L77 #endif .align 4 ADD c05, t2, c05 MUL a1, b2, t2 LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) ADD c01, t1, c01 LD b2, 1 * SIZE(BO) lda AO, 1 * SIZE(AO) MUL a1, b1, t1 lda BO, 2 * SIZE(BO) .align 4 $L77: ADD c05, t2, c05 MUL a1, b2, t2 ADD c02, t3, c02 ADD c06, t4, c06 ADD c01, c02, c01 lda AO, 1 * SIZE(AO) ADD c05, c06, c05 lda BO, 2 * SIZE(BO) ADD c01, t1, c01 ADD c05, t2, c05 .align 4 $L78: #if defined(LN) || defined(RT) #ifdef LN subq KK, 1, TMP1 #else subq KK, 2, TMP1 #endif sll TMP1, BASE_SHIFT + 0, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq B, TMP2, BO #else lda AO, -1 * SIZE(AO) lda BO, -2 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) SUB a1, c01, c01 SUB a2, c05, c05 #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 3 * SIZE(BO) MUL a1, c01, c01 MUL a2, c01, t1 SUB c05, t1, c05 MUL a3, c05, c05 #endif #ifdef RT LD a1, 3 * SIZE(BO) LD a2, 2 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a2, c05, t1 SUB c01, t1, c01 MUL a3, c01, c01 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c05, 1 * SIZE(AO) #endif #ifdef LN lda C1, -1 * SIZE(C1) lda C2, -1 * SIZE(C2) #endif ST c01, 0 * SIZE(C1) ST c05, 0 * SIZE(C2) fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 0 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 0, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 1, KK #endif #ifdef LN subq KK, 1, KK #endif .align 4 $L79: #ifdef LN sll K, 1 + BASE_SHIFT, TMP1 addq B, TMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN addq KK, 2, KK #endif #ifdef RT subq KK, 2, KK #endif .align 4 $L80: and N, 1, J ble J, $L999 #ifdef RT sll K, BASE_SHIFT, TMP1 subq B, TMP1, B subq C, LDC, C #endif mov C, C1 #ifndef RT addq C, LDC, C #endif #ifdef LN addq M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif sra M, 2, I ble I, $L100 .align 4 $L91: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c02 LD b3, 2 * SIZE(B) fclr c03 LD b4, 3 * SIZE(B) fclr c04 sra KK, 2, L mov B, BO ble L, $L95 #else #ifdef LN sll K, BASE_SHIFT + 2, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 2, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 0, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c02 LD b3, 2 * SIZE(BO) fclr c03 LD b4, 3 * SIZE(BO) fclr c04 sra TMP1, 2, L unop ble L, $L95 #endif .align 5 $L92: ADD c01, t1, c01 unop MUL a1, b1, t1 LD a1, 4 * SIZE(AO) ADD c02, t2, c02 lda L, -1(L) MUL a2, b1, t2 LD a2, 5 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b1, t3 LD a3, 6 * SIZE(AO) ADD c04, t4, c04 MUL a4, b1, t4 LD a4, 7 * SIZE(AO) LD b1, 4 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b2, t1 LD a1, 8 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b2, t2 LD a2, 9 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b2, t3 LD a3, 10 * SIZE(AO) ADD c04, t4, c04 MUL a4, b2, t4 LD a4, 11 * 
SIZE(AO) LD b2, 5 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b3, t1 LD a1, 12 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b3, t2 LD a2, 13 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b3, t3 LD a3, 14 * SIZE(AO) ADD c04, t4, c04 MUL a4, b3, t4 LD a5, 15 * SIZE(AO) LD b3, 6 * SIZE(BO) ADD c01, t1, c01 MUL a1, b4, t1 LD a1, 16 * SIZE(AO) lda AO, 16 * SIZE(AO) ADD c02, t2, c02 lda BO, 4 * SIZE(BO) MUL a2, b4, t2 LD a2, 1 * SIZE(AO) ADD c03, t3, c03 LD a4, 3 * SIZE(AO) MUL a3, b4, t3 LD a3, 2 * SIZE(AO) ADD c04, t4, c04 MUL a5, b4, t4 LD b4, 3 * SIZE(BO) bgt L, $L92 .align 4 $L95: #if defined(LT) || defined(RN) and KK, 3, L #else and TMP1, 3, L #endif unop ble L, $L98 .align 4 $L96: ADD c01, t1, c01 lda L, -1(L) MUL a1, b1, t1 LD a1, 4 * SIZE(AO) ADD c02, t2, c02 lda BO, 1 * SIZE(BO) MUL a2, b1, t2 LD a2, 5 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b1, t3 LD a3, 6 * SIZE(AO) ADD c04, t4, c04 MUL a4, b1, t4 LD a4, 7 * SIZE(AO) LD b1, 0 * SIZE(BO) lda AO, 4 * SIZE(AO) bgt L, $L96 .align 4 $L98: ADD c01, t1, c01 ADD c02, t2, c02 ADD c03, t3, c03 ADD c04, t4, c04 #if defined(LN) || defined(RT) #ifdef LN subq KK, 4, TMP1 #else subq KK, 1, TMP1 #endif sll TMP1, BASE_SHIFT + 2, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 0, TMP2 addq B, TMP2, BO #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 #endif #ifdef LN LD a1, 15 * SIZE(AO) LD a2, 14 * SIZE(AO) LD a3, 13 * SIZE(AO) LD a4, 12 * SIZE(AO) MUL a1, c04, c04 MUL a2, c04, t1 SUB c03, t1, c03 MUL a3, c04, t1 SUB c02, t1, c02 MUL a4, c04, t1 SUB c01, t1, c01 LD b1, 10 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 8 * SIZE(AO) MUL b1, c03, c03 MUL b2, c03, t1 SUB c02, t1, c02 MUL b3, c03, t1 SUB c01, t1, c01 LD a1, 5 * SIZE(AO) LD a2, 4 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a2, c02, t1 SUB c01, t1, c01 MUL a3, c01, c01 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MUL a1, c01, c01 MUL a2, c01, t1 SUB c02, t1, c02 MUL a3, c01, t1 SUB c03, t1, c03 MUL a4, c01, t1 SUB c04, t1, c04 LD b1, 5 * SIZE(AO) LD b2, 6 * SIZE(AO) LD b3, 7 * SIZE(AO) MUL b1, c02, c02 MUL b2, c02, t1 SUB c03, t1, c03 MUL b3, c02, t1 SUB c04, t1, c04 LD a1, 10 * SIZE(AO) LD a2, 11 * SIZE(AO) LD a3, 15 * SIZE(AO) MUL a1, c03, c03 MUL a2, c03, t1 SUB c04, t1, c04 MUL a3, c04, c04 #endif #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 MUL a1, c04, c04 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c02, 1 * SIZE(BO) ST c03, 2 * SIZE(BO) ST c04, 3 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c03, 2 * SIZE(AO) ST c04, 3 * SIZE(AO) #endif #ifdef LN lda C1, -4 * SIZE(C1) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c03, 2 * SIZE(C1) ST c04, 3 * SIZE(C1) #ifndef LN lda C1, 4 * SIZE(C1) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 2 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 2, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 0, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 4, KK #endif #ifdef LN subq KK, 4, KK #endif lda I, -1(I) bgt I, $L91 .align 4 $L100: and M, 2, I ble I, $L110 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * 
SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c02 LD b3, 2 * SIZE(B) fclr c03 LD b4, 3 * SIZE(B) fclr c04 sra KK, 2, L mov B, BO ble L, $L105 #else #ifdef LN sll K, BASE_SHIFT + 1, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 0, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c02 LD b3, 2 * SIZE(BO) fclr c03 LD b4, 3 * SIZE(BO) fclr c04 sra TMP1, 2, L ble L, $L105 #endif .align 5 $L102: ADD c01, t1, c01 lda L, -1(L) MUL a1, b1, t1 LD a1, 4 * SIZE(AO) ADD c02, t2, c02 MUL a2, b1, t2 LD a2, 5 * SIZE(AO) LD b1, 4 * SIZE(BO) ADD c03, t3, c03 lda BO, 4 * SIZE(BO) MUL a3, b2, t3 LD a3, 6 * SIZE(AO) ADD c04, t4, c04 MUL a4, b2, t4 LD a5, 7 * SIZE(AO) LD b2, 1 * SIZE(BO) ADD c01, t1, c01 MUL a1, b3, t1 LD a1, 8 * SIZE(AO) lda AO, 8 * SIZE(AO) ADD c02, t2, c02 MUL a2, b3, t2 LD b3, 2 * SIZE(BO) LD a2, 1 * SIZE(AO) ADD c03, t3, c03 LD a4, 3 * SIZE(AO) MUL a3, b4, t3 LD a3, 2 * SIZE(AO) ADD c04, t4, c04 MUL a5, b4, t4 LD b4, 3 * SIZE(BO) bgt L, $L102 .align 4 $L105: #if defined(LT) || defined(RN) and KK, 3, L #else and TMP1, 3, L #endif ble L, $L108 .align 4 $L106: ADD c01, t1, c01 lda L, -1(L) MUL a1, b1, t1 LD a1, 2 * SIZE(AO) ADD c02, t2, c02 MUL a2, b1, t2 LD a2, 3 * SIZE(AO) LD b1, 1 * SIZE(BO) lda AO, 2 * SIZE(AO) unop lda BO, 1 * SIZE(BO) bgt L, $L106 .align 4 $L108: ADD c01, t1, c01 ADD c02, t2, c02 ADD c03, t3, c03 ADD c04, t4, c04 ADD c01, c03, c01 ADD c02, c04, c02 #if defined(LN) || defined(RT) #ifdef LN subq KK, 2, TMP1 #else subq KK, 1, TMP1 #endif sll TMP1, BASE_SHIFT + 1, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 0, TMP2 addq B, TMP2, BO #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) SUB a1, c01, c01 SUB a2, c02, c02 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 #endif #ifdef LN LD a1, 3 * SIZE(AO) LD a2, 2 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a2, c02, t1 SUB c01, t1, c01 MUL a3, c01, c01 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 3 * SIZE(AO) MUL a1, c01, c01 MUL a2, c01, t1 SUB c02, t1, c02 MUL a3, c02, c02 #endif #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c02, 1 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) #endif #ifdef LN lda C1, -2 * SIZE(C1) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) #ifndef LN lda C1, 2 * SIZE(C1) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 1 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 1, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 0, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 2, KK #endif #ifdef LN subq KK, 2, KK #endif .align 4 $L110: and M, 1, I ble I, $L119 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c02 LD b3, 2 * SIZE(B) fclr c03 LD b4, 3 * SIZE(B) fclr c04 sra KK, 2, L mov B, BO unop ble L, $L115 #else #ifdef LN sll K, BASE_SHIFT + 0, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 0, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * 
SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c02 LD b3, 2 * SIZE(BO) fclr c03 LD b4, 3 * SIZE(BO) fclr c04 sra TMP1, 2, L unop ble L, $L115 #endif .align 4 $L112: ADD c01, t1, c01 MUL a1, b1, t1 LD a1, 4 * SIZE(AO) LD b1, 4 * SIZE(BO) ADD c02, t2, c02 MUL a2, b2, t2 LD a2, 5 * SIZE(AO) LD b2, 5 * SIZE(BO) ADD c03, t3, c03 MUL a3, b3, t3 LD a3, 6 * SIZE(AO) LD b3, 6 * SIZE(BO) ADD c04, t4, c04 MUL a4, b4, t4 LD a4, 7 * SIZE(AO) LD b4, 7 * SIZE(BO) lda L, -1(L) lda AO, 4 * SIZE(AO) lda BO, 4 * SIZE(BO) bgt L, $L112 .align 4 $L115: #if defined(LT) || defined(RN) and KK, 3, L #else and TMP1, 3, L #endif ble L, $L118 .align 4 $L116: ADD c01, t1, c01 MUL a1, b1, t1 LD a1, 1 * SIZE(AO) LD b1, 1 * SIZE(BO) lda L, -1(L) lda AO, 1 * SIZE(AO) lda BO, 1 * SIZE(BO) bgt L, $L116 .align 4 $L118: ADD c01, t1, c01 ADD c02, t2, c02 ADD c03, t3, c03 ADD c04, t4, c04 ADD c01, c02, c01 ADD c03, c04, c03 ADD c01, c03, c01 #if defined(LN) || defined(RT) subq KK, 1, TMP1 sll TMP1, BASE_SHIFT + 0, TMP2 addq AORIG, TMP2, AO addq B, TMP2, BO #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) SUB a1, c01, c01 #else LD a1, 0 * SIZE(AO) SUB a1, c01, c01 #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(AO) MUL a1, c01, c01 #endif #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) MUL a1, c01, c01 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) #else ST c01, 0 * SIZE(AO) #endif #ifdef LN lda C1, -1 * SIZE(C1) #endif ST c01, 0 * SIZE(C1) #ifndef LN lda C1, 1 * SIZE(C1) #endif #ifdef RT SXADDQ K, AORIG, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 0, TMP2 addq AO, TMP2, AO addq BO, TMP2, BO #endif #ifdef LT addq KK, 1, KK #endif #ifdef LN subq KK, 1, KK #endif .align 4 $L119: #ifdef LN SXADDQ K, B, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN addq KK, 1, KK #endif #ifdef RT subq KK, 1, KK #endif .align 4 $L999: ldt $f2, 0($sp) ldt $f3, 8($sp) ldt $f4, 16($sp) ldt $f5, 24($sp) ldt $f6, 32($sp) ldt $f7, 40($sp) ldt $f8, 48($sp) ldt $f9, 56($sp) clr $0 lda $sp, STACKSIZE($sp) ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/trsm_kernel_4x4_RT.S000066400000000000000000001631051313527062700212740ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #if !defined(EV4) && !defined(EV5) && !defined(EV6) #error "Architecture is not specified." #endif #ifdef EV6 #define PREFETCHSIZE 56 #define UNOP unop #endif #ifdef EV5 #define PREFETCHSIZE 56 #define UNOP #endif #ifdef EV4 #define UNOP #endif #define STACKSIZE 80 #define M $16 #define N $17 #define K $18 #define A $20 #define B $21 #define C $22 #define LDC $23 #define C1 $19 #define C2 $24 #define C3 $25 #define C4 $27 #define AO $at #define BO $5 #define I $6 #define J $7 #define L $8 #define a1 $f16 #define a2 $f17 #define a3 $f18 #define a4 $f19 #define b1 $f20 #define b2 $f21 #define b3 $f22 #define b4 $f23 #define t1 $f24 #define t2 $f25 #define t3 $f26 #define t4 $f27 #define a5 $f28 #define a6 $f30 #define b5 $f29 #define alpha $f30 #define c01 $f0 #define c02 $f1 #define c03 $f2 #define c04 $f3 #define c05 $f4 #define c06 $f5 #define c07 $f6 #define c08 $f7 #define c09 $f8 #define c10 $f9 #define c11 $f10 #define c12 $f11 #define c13 $f12 #define c14 $f13 #define c15 $f14 #define c16 $f15 #define TMP1 $0 #define TMP2 $1 #define KK $2 #define AORIG $3 #define OFFSET $4 PROLOGUE PROFCODE .frame $sp, STACKSIZE, $26, 0 lda $sp, -STACKSIZE($sp) ldq C, 0 + STACKSIZE($sp) ldq LDC, 8 + STACKSIZE($sp) ldq OFFSET, 16 + STACKSIZE($sp) SXADDQ LDC, 0, LDC stt $f2, 0($sp) stt $f3, 8($sp) stt $f4, 16($sp) stt $f5, 24($sp) stt $f6, 32($sp) stt $f7, 40($sp) stt $f8, 48($sp) stt $f9, 56($sp) cmple M, 0, $0 cmple N, 0, $1 cmple K, 0, $2 or $0, $1, $0 or $0, $2, $0 bne $0, $L999 #ifdef LN mulq M, K, TMP1 SXADDQ TMP1, A, A SXADDQ M, C, C #endif #ifdef RN negq OFFSET, KK #endif #ifdef RT mulq N, K, TMP1 SXADDQ TMP1, B, B mulq N, LDC, TMP1 addq TMP1, C, C subq N, OFFSET, KK #endif and N, 1, J ble J, $L40 #ifdef RT sll K, BASE_SHIFT, TMP1 subq B, TMP1, B subq C, LDC, C #endif mov C, C1 #ifndef RT addq C, LDC, C #endif #ifdef LN addq M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif sra M, 2, I ble I, $L100 .align 4 $L91: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c02 LD b3, 2 * SIZE(B) fclr c03 LD b4, 3 * SIZE(B) fclr c04 sra KK, 2, L mov B, BO ble L, $L95 #else #ifdef LN sll K, BASE_SHIFT + 2, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 2, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 0, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 
3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c02 LD b3, 2 * SIZE(BO) fclr c03 LD b4, 3 * SIZE(BO) fclr c04 sra TMP1, 2, L unop ble L, $L95 #endif .align 5 $L92: ADD c01, t1, c01 unop MUL a1, b1, t1 LD a1, 4 * SIZE(AO) ADD c02, t2, c02 lda L, -1(L) MUL a2, b1, t2 LD a2, 5 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b1, t3 LD a3, 6 * SIZE(AO) ADD c04, t4, c04 MUL a4, b1, t4 LD a4, 7 * SIZE(AO) LD b1, 4 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b2, t1 LD a1, 8 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b2, t2 LD a2, 9 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b2, t3 LD a3, 10 * SIZE(AO) ADD c04, t4, c04 MUL a4, b2, t4 LD a4, 11 * SIZE(AO) LD b2, 5 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b3, t1 LD a1, 12 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b3, t2 LD a2, 13 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b3, t3 LD a3, 14 * SIZE(AO) ADD c04, t4, c04 MUL a4, b3, t4 LD a5, 15 * SIZE(AO) LD b3, 6 * SIZE(BO) ADD c01, t1, c01 MUL a1, b4, t1 LD a1, 16 * SIZE(AO) lda AO, 16 * SIZE(AO) ADD c02, t2, c02 lda BO, 4 * SIZE(BO) MUL a2, b4, t2 LD a2, 1 * SIZE(AO) ADD c03, t3, c03 LD a4, 3 * SIZE(AO) MUL a3, b4, t3 LD a3, 2 * SIZE(AO) ADD c04, t4, c04 MUL a5, b4, t4 LD b4, 3 * SIZE(BO) bgt L, $L92 .align 4 $L95: #if defined(LT) || defined(RN) and KK, 3, L #else and TMP1, 3, L #endif unop ble L, $L98 .align 4 $L96: ADD c01, t1, c01 lda L, -1(L) MUL a1, b1, t1 LD a1, 4 * SIZE(AO) ADD c02, t2, c02 lda BO, 1 * SIZE(BO) MUL a2, b1, t2 LD a2, 5 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b1, t3 LD a3, 6 * SIZE(AO) ADD c04, t4, c04 MUL a4, b1, t4 LD a4, 7 * SIZE(AO) LD b1, 0 * SIZE(BO) lda AO, 4 * SIZE(AO) bgt L, $L96 .align 4 $L98: ADD c01, t1, c01 ADD c02, t2, c02 ADD c03, t3, c03 ADD c04, t4, c04 #if defined(LN) || defined(RT) #ifdef LN subq KK, 4, TMP1 #else subq KK, 1, TMP1 #endif sll TMP1, BASE_SHIFT + 2, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 0, TMP2 addq B, TMP2, BO #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 #endif #ifdef LN LD a1, 15 * SIZE(AO) LD a2, 14 * SIZE(AO) LD a3, 13 * SIZE(AO) LD a4, 12 * SIZE(AO) MUL a1, c04, c04 MUL a2, c04, t1 SUB c03, t1, c03 MUL a3, c04, t1 SUB c02, t1, c02 MUL a4, c04, t1 SUB c01, t1, c01 LD b1, 10 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 8 * SIZE(AO) MUL b1, c03, c03 MUL b2, c03, t1 SUB c02, t1, c02 MUL b3, c03, t1 SUB c01, t1, c01 LD a1, 5 * SIZE(AO) LD a2, 4 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a2, c02, t1 SUB c01, t1, c01 MUL a3, c01, c01 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MUL a1, c01, c01 MUL a2, c01, t1 SUB c02, t1, c02 MUL a3, c01, t1 SUB c03, t1, c03 MUL a4, c01, t1 SUB c04, t1, c04 LD b1, 5 * SIZE(AO) LD b2, 6 * SIZE(AO) LD b3, 7 * SIZE(AO) MUL b1, c02, c02 MUL b2, c02, t1 SUB c03, t1, c03 MUL b3, c02, t1 SUB c04, t1, c04 LD a1, 10 * SIZE(AO) LD a2, 11 * SIZE(AO) LD a3, 15 * SIZE(AO) MUL a1, c03, c03 MUL a2, c03, t1 SUB c04, t1, c04 MUL a3, c04, c04 #endif #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 MUL a1, c04, c04 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c02, 1 * SIZE(BO) ST c03, 2 * SIZE(BO) ST c04, 3 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c03, 2 * SIZE(AO) ST c04, 3 * 
SIZE(AO) #endif #ifdef LN lda C1, -4 * SIZE(C1) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c03, 2 * SIZE(C1) ST c04, 3 * SIZE(C1) #ifndef LN lda C1, 4 * SIZE(C1) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 2 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 2, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 0, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 4, KK #endif #ifdef LN subq KK, 4, KK #endif lda I, -1(I) bgt I, $L91 .align 4 $L100: and M, 2, I ble I, $L110 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c02 LD b3, 2 * SIZE(B) fclr c03 LD b4, 3 * SIZE(B) fclr c04 sra KK, 2, L mov B, BO ble L, $L105 #else #ifdef LN sll K, BASE_SHIFT + 1, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 0, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c02 LD b3, 2 * SIZE(BO) fclr c03 LD b4, 3 * SIZE(BO) fclr c04 sra TMP1, 2, L ble L, $L105 #endif .align 5 $L102: ADD c01, t1, c01 lda L, -1(L) MUL a1, b1, t1 LD a1, 4 * SIZE(AO) ADD c02, t2, c02 MUL a2, b1, t2 LD a2, 5 * SIZE(AO) LD b1, 4 * SIZE(BO) ADD c03, t3, c03 lda BO, 4 * SIZE(BO) MUL a3, b2, t3 LD a3, 6 * SIZE(AO) ADD c04, t4, c04 MUL a4, b2, t4 LD a5, 7 * SIZE(AO) LD b2, 1 * SIZE(BO) ADD c01, t1, c01 MUL a1, b3, t1 LD a1, 8 * SIZE(AO) lda AO, 8 * SIZE(AO) ADD c02, t2, c02 MUL a2, b3, t2 LD b3, 2 * SIZE(BO) LD a2, 1 * SIZE(AO) ADD c03, t3, c03 LD a4, 3 * SIZE(AO) MUL a3, b4, t3 LD a3, 2 * SIZE(AO) ADD c04, t4, c04 MUL a5, b4, t4 LD b4, 3 * SIZE(BO) bgt L, $L102 .align 4 $L105: #if defined(LT) || defined(RN) and KK, 3, L #else and TMP1, 3, L #endif ble L, $L108 .align 4 $L106: ADD c01, t1, c01 lda L, -1(L) MUL a1, b1, t1 LD a1, 2 * SIZE(AO) ADD c02, t2, c02 MUL a2, b1, t2 LD a2, 3 * SIZE(AO) LD b1, 1 * SIZE(BO) lda AO, 2 * SIZE(AO) unop lda BO, 1 * SIZE(BO) bgt L, $L106 .align 4 $L108: ADD c01, t1, c01 ADD c02, t2, c02 ADD c03, t3, c03 ADD c04, t4, c04 ADD c01, c03, c01 ADD c02, c04, c02 #if defined(LN) || defined(RT) #ifdef LN subq KK, 2, TMP1 #else subq KK, 1, TMP1 #endif sll TMP1, BASE_SHIFT + 1, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 0, TMP2 addq B, TMP2, BO #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) SUB a1, c01, c01 SUB a2, c02, c02 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 #endif #ifdef LN LD a1, 3 * SIZE(AO) LD a2, 2 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a2, c02, t1 SUB c01, t1, c01 MUL a3, c01, c01 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 3 * SIZE(AO) MUL a1, c01, c01 MUL a2, c01, t1 SUB c02, t1, c02 MUL a3, c02, c02 #endif #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c02, 1 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) #endif #ifdef LN lda C1, -2 * SIZE(C1) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) #ifndef LN lda C1, 2 * SIZE(C1) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 1 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 1, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 
0, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 2, KK #endif #ifdef LN subq KK, 2, KK #endif .align 4 $L110: and M, 1, I ble I, $L119 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c02 LD b3, 2 * SIZE(B) fclr c03 LD b4, 3 * SIZE(B) fclr c04 sra KK, 2, L mov B, BO unop ble L, $L115 #else #ifdef LN sll K, BASE_SHIFT + 0, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 0, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c02 LD b3, 2 * SIZE(BO) fclr c03 LD b4, 3 * SIZE(BO) fclr c04 sra TMP1, 2, L unop ble L, $L115 #endif .align 4 $L112: ADD c01, t1, c01 MUL a1, b1, t1 LD a1, 4 * SIZE(AO) LD b1, 4 * SIZE(BO) ADD c02, t2, c02 MUL a2, b2, t2 LD a2, 5 * SIZE(AO) LD b2, 5 * SIZE(BO) ADD c03, t3, c03 MUL a3, b3, t3 LD a3, 6 * SIZE(AO) LD b3, 6 * SIZE(BO) ADD c04, t4, c04 MUL a4, b4, t4 LD a4, 7 * SIZE(AO) LD b4, 7 * SIZE(BO) lda L, -1(L) lda AO, 4 * SIZE(AO) lda BO, 4 * SIZE(BO) bgt L, $L112 .align 4 $L115: #if defined(LT) || defined(RN) and KK, 3, L #else and TMP1, 3, L #endif ble L, $L118 .align 4 $L116: ADD c01, t1, c01 MUL a1, b1, t1 LD a1, 1 * SIZE(AO) LD b1, 1 * SIZE(BO) lda L, -1(L) lda AO, 1 * SIZE(AO) lda BO, 1 * SIZE(BO) bgt L, $L116 .align 4 $L118: ADD c01, t1, c01 ADD c02, t2, c02 ADD c03, t3, c03 ADD c04, t4, c04 ADD c01, c02, c01 ADD c03, c04, c03 ADD c01, c03, c01 #if defined(LN) || defined(RT) subq KK, 1, TMP1 sll TMP1, BASE_SHIFT + 0, TMP2 addq AORIG, TMP2, AO addq B, TMP2, BO #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) SUB a1, c01, c01 #else LD a1, 0 * SIZE(AO) SUB a1, c01, c01 #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(AO) MUL a1, c01, c01 #endif #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) MUL a1, c01, c01 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) #else ST c01, 0 * SIZE(AO) #endif #ifdef LN lda C1, -1 * SIZE(C1) #endif ST c01, 0 * SIZE(C1) #ifndef LN lda C1, 1 * SIZE(C1) #endif #ifdef RT SXADDQ K, AORIG, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 0, TMP2 addq AO, TMP2, AO addq BO, TMP2, BO #endif #ifdef LT addq KK, 1, KK #endif #ifdef LN subq KK, 1, KK #endif .align 4 $L119: #ifdef LN SXADDQ K, B, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN addq KK, 1, KK #endif #ifdef RT subq KK, 1, KK #endif .align 4 $L40: and N, 2, J ble J, $L80 #ifdef RT sll K, 1 + BASE_SHIFT, TMP1 subq B, TMP1, B addq LDC, LDC, TMP1 subq C, TMP1, C #endif mov C, C1 addq C, LDC, C2 fclr t1 #ifndef RT addq C2, LDC, C #endif fclr t2 #ifdef LN addq M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif sra M, 2, I fclr t3 fclr t4 ble I, $L60 .align 4 $L51: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c03 LD a2, 1 * SIZE(AO) fclr c07 LD a3, 2 * SIZE(AO) fclr c04 LD a4, 3 * SIZE(AO) fclr c08 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c05 LD b3, 2 * SIZE(B) fclr c02 LD b4, 3 * SIZE(B) fclr c06 lda L, -2(KK) lda BO, 2 * SIZE(B) lda AO, 4 * SIZE(AO) ble KK, $L58 ble L, $L55 #else #ifdef LN sll K, BASE_SHIFT + 2, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 2, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 1, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD 
a1, 0 * SIZE(AO) fclr c03 LD a2, 1 * SIZE(AO) fclr c07 LD a3, 2 * SIZE(AO) fclr c04 LD a4, 3 * SIZE(AO) fclr c08 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c05 LD b3, 2 * SIZE(BO) fclr c02 LD b4, 3 * SIZE(BO) fclr c06 lda L, -2(TMP1) lda BO, 2 * SIZE(BO) lda AO, 4 * SIZE(AO) ble TMP1, $L58 ble L, $L55 #endif .align 4 $L52: ADD c05, t1, c05 unop MUL a1, b1, t1 unop ADD c06, t2, c06 lda L, -2(L) MUL a2, b1, t2 unop ADD c07, t3, c07 unop MUL a3, b1, t3 unop ADD c08, t4, c08 unop MUL a4, b1, t4 LD b1, 2 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b2, t1 LD a1, 0 * SIZE(AO) ADD c02, t2, c02 lda BO, 4 * SIZE(BO) MUL a2, b2, t2 LD a2, 1 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b2, t3 LD a3, 2 * SIZE(AO) ADD c04, t4, c04 unop MUL a4, b2, t4 LD a5, 3 * SIZE(AO) ADD c05, t1, c05 unop MUL a1, b3, t1 LD b2, -1 * SIZE(BO) ADD c06, t2, c06 unop MUL a2, b3, t2 unop ADD c07, t3, c07 unop MUL a3, b3, t3 lda AO, 8 * SIZE(AO) ADD c08, t4, c08 unop MUL a5, b3, t4 LD b3, 0 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b4, t1 LD a1, -4 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b4, t2 LD a2, -3 * SIZE(AO) ADD c03, t3, c03 LD a4, -1 * SIZE(AO) MUL a3, b4, t3 LD a3, -2 * SIZE(AO) ADD c04, t4, c04 MUL a5, b4, t4 LD b4, 1 * SIZE(BO) bgt L, $L52 .align 4 $L55: ADD c05, t1, c05 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L57 #else blbs TMP1, $L57 #endif .align 4 ADD c06, t2, c06 MUL a2, b1, t2 ADD c07, t3, c07 MUL a3, b1, t3 ADD c08, t4, c08 unop MUL a4, b1, t4 LD b1, 0 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b2, t1 LD a1, 0 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b2, t2 LD a2, 1 * SIZE(AO) ADD c03, t3, c03 unop MUL a3, b2, t3 LD a3, 2 * SIZE(AO) ADD c04, t4, c04 MUL a4, b2, t4 LD a4, 3 * SIZE(AO) lda AO, 4 * SIZE(AO) ADD c05, t1, c05 LD b2, 1 * SIZE(BO) MUL a1, b1, t1 lda BO, 2 * SIZE(BO) .align 4 $L57: ADD c06, t2, c06 MUL a2, b1, t2 ADD c07, t3, c07 MUL a3, b1, t3 ADD c08, t4, c08 MUL a4, b1, t4 ADD c01, t1, c01 MUL a1, b2, t1 ADD c02, t2, c02 MUL a2, b2, t2 ADD c03, t3, c03 MUL a3, b2, t3 ADD c04, t4, c04 lda AO, 4 * SIZE(AO) MUL a4, b2, t4 lda BO, 2 * SIZE(BO) ADD c05, t1, c05 ADD c06, t2, c06 ADD c07, t3, c07 ADD c08, t4, c08 .align 4 $L58: #if defined(LN) || defined(RT) #ifdef LN subq KK, 4, TMP1 #else subq KK, 2, TMP1 #endif sll TMP1, BASE_SHIFT + 2, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq B, TMP2, BO #else lda AO, -4 * SIZE(AO) lda BO, -2 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c02, c02 SUB a4, c06, c06 SUB b1, c03, c03 SUB b2, c07, c07 SUB b3, c04, c04 SUB b4, c08, c08 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) LD b4, 7 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 SUB b1, c05, c05 SUB b2, c06, c06 SUB b3, c07, c07 SUB b4, c08, c08 #endif #ifdef LN LD a1, 15 * SIZE(AO) LD a2, 14 * SIZE(AO) LD a3, 13 * SIZE(AO) LD a4, 12 * SIZE(AO) MUL a1, c04, c04 MUL a1, c08, c08 MUL a2, c04, t1 MUL a2, c08, t2 SUB c03, t1, c03 SUB c07, t2, c07 MUL a3, c04, t1 MUL a3, c08, t2 SUB c02, t1, c02 SUB c06, t2, c06 MUL a4, c04, t1 MUL a4, c08, t2 SUB c01, t1, c01 SUB c05, t2, c05 LD b1, 10 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 8 * SIZE(AO) MUL b1, c03, c03 MUL b1, c07, c07 MUL b2, c03, t1 MUL b2, c07, t2 SUB c02, t1, c02 SUB c06, t2, c06 MUL b3, c03, 
t1 MUL b3, c07, t2 SUB c01, t1, c01 SUB c05, t2, c05 LD a1, 5 * SIZE(AO) LD a2, 4 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a1, c06, c06 MUL a2, c02, t1 MUL a2, c06, t2 SUB c01, t1, c01 SUB c05, t2, c05 MUL a3, c01, c01 MUL a3, c05, c05 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 MUL a2, c01, t1 MUL a2, c05, t2 SUB c02, t1, c02 SUB c06, t2, c06 MUL a3, c01, t1 MUL a3, c05, t2 SUB c03, t1, c03 SUB c07, t2, c07 MUL a4, c01, t1 MUL a4, c05, t2 SUB c04, t1, c04 SUB c08, t2, c08 LD b1, 5 * SIZE(AO) LD b2, 6 * SIZE(AO) LD b3, 7 * SIZE(AO) MUL b1, c02, c02 MUL b1, c06, c06 MUL b2, c02, t1 MUL b2, c06, t2 SUB c03, t1, c03 SUB c07, t2, c07 MUL b3, c02, t1 MUL b3, c06, t2 SUB c04, t1, c04 SUB c08, t2, c08 LD a1, 10 * SIZE(AO) LD a2, 11 * SIZE(AO) LD a3, 15 * SIZE(AO) MUL a1, c03, c03 MUL a1, c07, c07 MUL a2, c03, t1 MUL a2, c07, t2 SUB c04, t1, c04 SUB c08, t2, c08 MUL a3, c04, c04 MUL a3, c08, c08 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 3 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 MUL a1, c04, c04 MUL a2, c01, t1 MUL a2, c02, t2 MUL a2, c03, t3 MUL a2, c04, t4 SUB c05, t1, c05 SUB c06, t2, c06 SUB c07, t3, c07 SUB c08, t4, c08 MUL a3, c05, c05 MUL a3, c06, c06 MUL a3, c07, c07 MUL a3, c08, c08 #endif #ifdef RT LD a1, 3 * SIZE(BO) LD a2, 2 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a1, c06, c06 MUL a1, c07, c07 MUL a1, c08, c08 MUL a2, c05, t1 MUL a2, c06, t2 MUL a2, c07, t3 MUL a2, c08, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c03, t3, c03 SUB c04, t4, c04 MUL a3, c01, c01 MUL a3, c02, c02 MUL a3, c03, c03 MUL a3, c04, c04 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) ST c02, 2 * SIZE(BO) ST c06, 3 * SIZE(BO) ST c03, 4 * SIZE(BO) ST c07, 5 * SIZE(BO) ST c04, 6 * SIZE(BO) ST c08, 7 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c03, 2 * SIZE(AO) ST c04, 3 * SIZE(AO) ST c05, 4 * SIZE(AO) ST c06, 5 * SIZE(AO) ST c07, 6 * SIZE(AO) ST c08, 7 * SIZE(AO) #endif #ifdef LN lda C1, -4 * SIZE(C1) lda C2, -4 * SIZE(C2) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c03, 2 * SIZE(C1) ST c04, 3 * SIZE(C1) ST c05, 0 * SIZE(C2) ST c06, 1 * SIZE(C2) ST c07, 2 * SIZE(C2) ST c08, 3 * SIZE(C2) #ifndef LN lda C1, 4 * SIZE(C1) lda C2, 4 * SIZE(C2) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 2 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 2, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 4, KK #endif #ifdef LN subq KK, 4, KK #endif lda I, -1(I) bgt I, $L51 .align 4 $L60: and M, 2, I ble I, $L70 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(B) lda L, -2(KK) LD b2, 1 * SIZE(B) lda AO, 2 * SIZE(AO) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) lda BO, 2 * SIZE(B) ble KK, $L68 ble L, $L65 #else #ifdef LN sll K, BASE_SHIFT + 1, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 1, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(BO) lda L, -2(TMP1) LD b2, 1 * SIZE(BO) lda AO, 2 * SIZE(AO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) lda BO, 2 * SIZE(BO) ble TMP1, $L68 ble L, $L65 #endif .align 4 $L62: ADD c01, t1, 
c01 unop MUL a1, b1, t1 unop ADD c02, t2, c02 lda AO, 4 * SIZE(AO) MUL a2, b1, t2 LD b1, 2 * SIZE(BO) ADD c05, t3, c05 lda L, -2(L) MUL a1, b2, t3 LD a1, -2 * SIZE(AO) ADD c06, t4, c06 unop MUL a2, b2, t4 LD a2, -1 * SIZE(AO) ADD c01, t1, c01 LD b2, 3 * SIZE(BO) MUL a3, b3, t1 lda BO, 4 * SIZE(BO) ADD c02, t2, c02 unop MUL a4, b3, t2 LD b3, 0 * SIZE(BO) ADD c05, t3, c05 unop MUL a3, b4, t3 LD a3, 0 * SIZE(AO) ADD c06, t4, c06 MUL a4, b4, t4 LD b4, 1 * SIZE(BO) unop LD a4, 1 * SIZE(AO) unop unop bgt L, $L62 .align 4 $L65: ADD c01, t1, c01 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L67 #else blbs TMP1, $L67 #endif .align 4 ADD c02, t2, c02 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD c05, t3, c05 lda BO, 2 * SIZE(BO) MUL a1, b2, t3 LD a1, 0 * SIZE(AO) ADD c06, t4, c06 unop MUL a2, b2, t4 LD a2, 1 * SIZE(AO) ADD c01, t1, c01 LD b2, -1 * SIZE(BO) MUL a1, b1, t1 lda AO, 2 * SIZE(AO) .align 4 $L67: ADD c02, t2, c02 MUL a2, b1, t2 ADD c05, t3, c05 MUL a1, b2, t3 ADD c06, t4, c06 lda AO, 2 * SIZE(AO) MUL a2, b2, t4 lda BO, 2 * SIZE(BO) ADD c01, t1, c01 ADD c02, t2, c02 ADD c05, t3, c05 ADD c06, t4, c06 .align 4 $L68: #if defined(LN) || defined(RT) #ifdef LN subq KK, 2, TMP1 #else subq KK, 2, TMP1 #endif sll TMP1, BASE_SHIFT + 1, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq B, TMP2, BO #else lda AO, -2 * SIZE(AO) lda BO, -2 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c02, c02 SUB a4, c06, c06 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c05, c05 SUB a4, c06, c06 #endif #ifdef LN LD a1, 3 * SIZE(AO) LD a2, 2 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a1, c06, c06 MUL a2, c02, t1 MUL a2, c06, t2 SUB c01, t1, c01 SUB c05, t2, c05 MUL a3, c01, c01 MUL a3, c05, c05 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 3 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 MUL a2, c01, t1 MUL a2, c05, t2 SUB c02, t1, c02 SUB c06, t2, c06 MUL a3, c02, c02 MUL a3, c06, c06 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 3 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 MUL a2, c01, t1 MUL a2, c02, t2 SUB c05, t1, c05 SUB c06, t2, c06 MUL a3, c05, c05 MUL a3, c06, c06 #endif #ifdef RT LD a1, 3 * SIZE(BO) LD a2, 2 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a1, c06, c06 MUL a2, c05, t1 MUL a2, c06, t2 SUB c01, t1, c01 SUB c02, t2, c02 MUL a3, c01, c01 MUL a3, c02, c02 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) ST c02, 2 * SIZE(BO) ST c06, 3 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c05, 2 * SIZE(AO) ST c06, 3 * SIZE(AO) #endif #ifdef LN lda C1, -2 * SIZE(C1) lda C2, -2 * SIZE(C2) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c05, 0 * SIZE(C2) ST c06, 1 * SIZE(C2) #ifndef LN lda C1, 2 * SIZE(C1) lda C2, 2 * SIZE(C2) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 1 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 1, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 2, KK #endif #ifdef LN subq KK, 2, KK #endif .align 4 $L70: and M, 1, I ble I, $L79 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD b1, 0 * SIZE(B) fclr c02 LD b2, 1 * SIZE(B) fclr c06 lda L, -2(KK) LD b3, 2 * SIZE(B) lda AO, 1 * SIZE(AO) LD b4, 3 * 
SIZE(B) lda BO, 2 * SIZE(B) ble KK, $L78 ble L, $L75 #else #ifdef LN sll K, BASE_SHIFT + 0, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 1, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD b1, 0 * SIZE(BO) fclr c02 LD b2, 1 * SIZE(BO) fclr c06 lda L, -2(TMP1) LD b3, 2 * SIZE(BO) lda AO, 1 * SIZE(AO) LD b4, 3 * SIZE(BO) lda BO, 2 * SIZE(BO) ble TMP1, $L78 ble L, $L75 #endif .align 4 $L72: ADD c01, t1, c01 lda L, -2(L) MUL a1, b1, t1 LD b1, 2 * SIZE(BO) ADD c05, t2, c05 MUL a1, b2, t2 LD a1, 1 * SIZE(AO) LD b2, 3 * SIZE(BO) ADD c02, t3, c02 lda AO, 2 * SIZE(AO) MUL a2, b3, t3 LD b3, 4 * SIZE(BO) ADD c06, t4, c06 MUL a2, b4, t4 LD a2, 0 * SIZE(AO) LD b4, 5 * SIZE(BO) lda BO, 4 * SIZE(BO) unop unop bgt L, $L72 .align 4 $L75: ADD c01, t1, c01 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L77 #else blbs TMP1, $L77 #endif .align 4 ADD c05, t2, c05 MUL a1, b2, t2 LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) ADD c01, t1, c01 LD b2, 1 * SIZE(BO) lda AO, 1 * SIZE(AO) MUL a1, b1, t1 lda BO, 2 * SIZE(BO) .align 4 $L77: ADD c05, t2, c05 MUL a1, b2, t2 ADD c02, t3, c02 ADD c06, t4, c06 ADD c01, c02, c01 lda AO, 1 * SIZE(AO) ADD c05, c06, c05 lda BO, 2 * SIZE(BO) ADD c01, t1, c01 ADD c05, t2, c05 .align 4 $L78: #if defined(LN) || defined(RT) #ifdef LN subq KK, 1, TMP1 #else subq KK, 2, TMP1 #endif sll TMP1, BASE_SHIFT + 0, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq B, TMP2, BO #else lda AO, -1 * SIZE(AO) lda BO, -2 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) SUB a1, c01, c01 SUB a2, c05, c05 #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 3 * SIZE(BO) MUL a1, c01, c01 MUL a2, c01, t1 SUB c05, t1, c05 MUL a3, c05, c05 #endif #ifdef RT LD a1, 3 * SIZE(BO) LD a2, 2 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a2, c05, t1 SUB c01, t1, c01 MUL a3, c01, c01 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c05, 1 * SIZE(AO) #endif #ifdef LN lda C1, -1 * SIZE(C1) lda C2, -1 * SIZE(C2) #endif ST c01, 0 * SIZE(C1) ST c05, 0 * SIZE(C2) fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 0 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 0, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 1, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 1, KK #endif #ifdef LN subq KK, 1, KK #endif .align 4 $L79: #ifdef LN sll K, 1 + BASE_SHIFT, TMP1 addq B, TMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN addq KK, 2, KK #endif #ifdef RT subq KK, 2, KK #endif .align 4 $L80: sra N, 2, J ble J, $L999 .align 4 $L01: #ifdef RT sll K, 2 + BASE_SHIFT, TMP1 subq B, TMP1, B s4addq LDC, 0, TMP1 subq C, TMP1, C #endif mov C, C1 addq C, LDC, C2 addq C2, LDC, C3 #ifndef RT s4addq LDC, C, C #endif fclr t1 addq C3, LDC, C4 fclr t2 #ifdef LN addq M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif sra M, 2, I fclr t3 fclr t4 ble I, $L20 .align 4 $L11: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c11 LD a2, 1 * SIZE(AO) fclr c12 LD a3, 2 * SIZE(AO) fclr c16 LD a4, 3 * SIZE(AO) fclr c15 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c02 LD b3, 2 * 
SIZE(B) fclr c06 LD b4, 3 * SIZE(B) fclr c05 lds $f31, 4 * SIZE(C1) fclr c03 lda L, -2(KK) fclr c04 lds $f31, 7 * SIZE(C2) fclr c08 lda BO, 4 * SIZE(B) fclr c13 lds $f31, 4 * SIZE(C3) fclr c09 lda AO, 4 * SIZE(AO) fclr c10 lds $f31, 7 * SIZE(C4) fclr c14 fclr c07 ble KK, $L18 #else #ifdef LN sll K, BASE_SHIFT + 2, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 2, TMP1 addq AORIG, TMP1, AO addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c11 LD a2, 1 * SIZE(AO) fclr c12 LD a3, 2 * SIZE(AO) fclr c16 LD a4, 3 * SIZE(AO) fclr c15 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c02 LD b3, 2 * SIZE(BO) fclr c06 LD b4, 3 * SIZE(BO) fclr c05 lds $f31, 4 * SIZE(C1) fclr c03 lda L, -2(TMP1) fclr c04 lds $f31, 7 * SIZE(C2) fclr c08 lda BO, 4 * SIZE(BO) fclr c13 lds $f31, 4 * SIZE(C3) fclr c09 lda AO, 4 * SIZE(AO) fclr c10 lds $f31, 7 * SIZE(C4) fclr c14 fclr c07 ble TMP1, $L18 #endif ble L, $L15 .align 5 $L12: /* 1 */ ADD c11, t1, c11 #ifndef EV4 ldq $31, PREFETCHSIZE * SIZE(AO) #else unop #endif MUL b1, a1, t1 #ifndef EV4 ldl $31, PREFETCHSIZE * SIZE(BO) #else unop #endif ADD c12, t2, c12 unop MUL b1, a2, t2 unop ADD c16, t3, c16 unop MUL b2, a2, t3 LD a5, 0 * SIZE(AO) ADD c15, t4, c15 unop MUL b2, a1, t4 LD b5, 0 * SIZE(BO) /* 2 */ ADD c01, t1, c01 UNOP MUL b1, a3, t1 UNOP ADD c02, t2, c02 UNOP MUL b1, a4, t2 UNOP ADD c06, t3, c06 unop MUL b2, a4, t3 unop ADD c05, t4, c05 unop MUL b4, a1, t4 unop /* 3 */ ADD c03, t1, c03 unop MUL b3, a1, t1 unop ADD c04, t2, c04 unop MUL b3, a2, t2 unop ADD c08, t3, c08 unop MUL b4, a2, t3 LD a2, 1 * SIZE(AO) ADD c13, t4, c13 unop MUL b2, a3, t4 LD b2, 1 * SIZE(BO) /* 4 */ ADD c09, t1, c09 unop MUL b3, a3, t1 LD a6, 2 * SIZE(AO) ADD c10, t2, c10 unop MUL b3, a4, t2 LD b3, 2 * SIZE(BO) ADD c14, t3, c14 unop MUL b4, a4, t3 LD a4, 3 * SIZE(AO) ADD c07, t4, c07 unop MUL b4, a3, t4 LD b4, 3 * SIZE(BO) /* 5 */ ADD c11, t1, c11 unop MUL b5, a5, t1 LD a1, 4 * SIZE(AO) ADD c12, t2, c12 lda L, -2(L) MUL b5, a2, t2 LD b1, 4 * SIZE(BO) ADD c16, t3, c16 unop MUL b2, a2, t3 unop ADD c15, t4, c15 unop MUL b2, a5, t4 unop /* 6 */ ADD c01, t1, c01 unop MUL b5, a6, t1 unop ADD c02, t2, c02 unop MUL b5, a4, t2 unop ADD c06, t3, c06 unop MUL b2, a4, t3 unop ADD c05, t4, c05 unop MUL b4, a5, t4 unop /* 7 */ ADD c03, t1, c03 lda AO, 8 * SIZE(AO) MUL b3, a5, t1 unop ADD c04, t2, c04 lda BO, 8 * SIZE(BO) MUL b3, a2, t2 unop ADD c08, t3, c08 unop MUL b4, a2, t3 LD a2, -3 * SIZE(AO) ADD c13, t4, c13 unop MUL b2, a6, t4 LD b2, -3 * SIZE(BO) /* 8 */ ADD c09, t1, c09 unop MUL b3, a6, t1 LD a3, -2 * SIZE(AO) ADD c10, t2, c10 unop MUL b3, a4, t2 LD b3, -2 * SIZE(BO) ADD c14, t3, c14 unop MUL b4, a4, t3 LD a4, -1 * SIZE(AO) ADD c07, t4, c07 MUL b4, a6, t4 LD b4, -1 * SIZE(BO) bgt L, $L12 .align 4 $L15: ADD c11, t1, c11 MUL b1, a1, t1 #if defined(LT) || defined(RN) blbs KK, $L17 #else blbs TMP1, $L17 #endif .align 4 ADD c12, t2, c12 MUL b1, a2, t2 ADD c16, t3, c16 MUL b2, a2, t3 ADD c15, t4, c15 MUL b2, a1, t4 ADD c01, t1, c01 MUL b1, a3, t1 ADD c02, t2, c02 unop MUL b1, a4, t2 LD b1, 0 * SIZE(BO) ADD c06, t3, c06 MUL b2, a4, t3 ADD c05, t4, c05 MUL b4, a1, t4 ADD c03, t1, c03 unop MUL b3, a1, t1 LD a1, 0 * SIZE(AO) ADD c04, t2, c04 unop MUL b3, a2, t2 unop ADD c08, t3, c08 unop MUL b4, a2, t3 LD a2, 1 * SIZE(AO) ADD c13, t4, c13 unop MUL b2, a3, t4 LD b2, 1 * SIZE(BO) ADD c09, t1, c09 unop MUL b3, a3, t1 lda AO, 4 * SIZE(AO) ADD c10, t2, c10 unop MUL b3, a4, t2 LD b3, 2 * SIZE(BO) ADD c14, t3, c14 unop MUL b4, a4, t3 LD a4, -1 * SIZE(AO) ADD c07, t4, c07 unop MUL b4, a3, 
t4 LD a3, -2 * SIZE(AO) ADD c11, t1, c11 LD b4, 3 * SIZE(BO) MUL b1, a1, t1 lda BO, 4 * SIZE(BO) .align 4 $L17: ADD c12, t2, c12 MUL b1, a2, t2 ADD c16, t3, c16 MUL b2, a2, t3 ADD c15, t4, c15 MUL b2, a1, t4 ADD c01, t1, c01 MUL b1, a3, t1 ADD c02, t2, c02 MUL b1, a4, t2 ADD c06, t3, c06 MUL b2, a4, t3 ADD c05, t4, c05 MUL b4, a1, t4 ADD c03, t1, c03 MUL b3, a1, t1 ADD c04, t2, c04 MUL b3, a2, t2 ADD c08, t3, c08 MUL b4, a2, t3 ADD c13, t4, c13 MUL b2, a3, t4 ADD c09, t1, c09 MUL b3, a3, t1 ADD c10, t2, c10 MUL b3, a4, t2 ADD c14, t3, c14 MUL b4, a4, t3 ADD c07, t4, c07 lda AO, 4 * SIZE(AO) MUL b4, a3, t4 lda BO, 4 * SIZE(BO) ADD c11, t1, c11 ADD c12, t2, c12 ADD c16, t3, c16 ADD c15, t4, c15 .align 4 $L18: #if defined(LN) || defined(RT) #ifdef LN subq KK, 4, TMP1 #else subq KK, 4, TMP1 #endif sll TMP1, BASE_SHIFT + 2, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO #else lda AO, -4 * SIZE(AO) lda BO, -4 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 SUB a4, c13, c13 SUB b1, c02, c02 SUB b2, c06, c06 SUB b3, c10, c10 SUB b4, c14, c14 LD a1, 8 * SIZE(BO) LD a2, 9 * SIZE(BO) LD a3, 10 * SIZE(BO) LD a4, 11 * SIZE(BO) LD b1, 12 * SIZE(BO) LD b2, 13 * SIZE(BO) LD b3, 14 * SIZE(BO) LD b4, 15 * SIZE(BO) SUB a1, c03, c03 SUB a2, c07, c07 SUB a3, c11, c11 SUB a4, c15, c15 SUB b1, c04, c04 SUB b2, c08, c08 SUB b3, c12, c12 SUB b4, c16, c16 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) LD b4, 7 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 SUB b1, c05, c05 SUB b2, c06, c06 SUB b3, c07, c07 SUB b4, c08, c08 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) LD b1, 12 * SIZE(AO) LD b2, 13 * SIZE(AO) LD b3, 14 * SIZE(AO) LD b4, 15 * SIZE(AO) SUB a1, c09, c09 SUB a2, c10, c10 SUB a3, c11, c11 SUB a4, c12, c12 SUB b1, c13, c13 SUB b2, c14, c14 SUB b3, c15, c15 SUB b4, c16, c16 #endif #ifdef LN LD a1, 15 * SIZE(AO) LD a2, 14 * SIZE(AO) LD a3, 13 * SIZE(AO) LD a4, 12 * SIZE(AO) MUL a1, c04, c04 MUL a1, c08, c08 MUL a1, c12, c12 MUL a1, c16, c16 MUL a2, c04, t1 MUL a2, c08, t2 MUL a2, c12, t3 MUL a2, c16, t4 SUB c03, t1, c03 SUB c07, t2, c07 SUB c11, t3, c11 SUB c15, t4, c15 MUL a3, c04, t1 MUL a3, c08, t2 MUL a3, c12, t3 MUL a3, c16, t4 SUB c02, t1, c02 SUB c06, t2, c06 SUB c10, t3, c10 SUB c14, t4, c14 MUL a4, c04, t1 MUL a4, c08, t2 MUL a4, c12, t3 MUL a4, c16, t4 SUB c01, t1, c01 SUB c05, t2, c05 SUB c09, t3, c09 SUB c13, t4, c13 LD b1, 10 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 8 * SIZE(AO) MUL b1, c03, c03 MUL b1, c07, c07 MUL b1, c11, c11 MUL b1, c15, c15 MUL b2, c03, t1 MUL b2, c07, t2 MUL b2, c11, t3 MUL b2, c15, t4 SUB c02, t1, c02 SUB c06, t2, c06 SUB c10, t3, c10 SUB c14, t4, c14 MUL b3, c03, t1 MUL b3, c07, t2 MUL b3, c11, t3 MUL b3, c15, t4 SUB c01, t1, c01 SUB c05, t2, c05 SUB c09, t3, c09 SUB c13, t4, c13 LD a1, 5 * SIZE(AO) LD a2, 4 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a1, c06, c06 MUL a1, c10, c10 MUL a1, c14, c14 MUL a2, c02, t1 MUL a2, c06, t2 MUL a2, c10, t3 MUL a2, c14, t4 SUB c01, t1, c01 SUB c05, t2, c05 SUB c09, t3, c09 SUB c13, t4, c13 MUL a3, c01, c01 MUL a3, c05, c05 MUL a3, c09, c09 MUL a3, c13, c13 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * 
SIZE(AO) LD a4, 3 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 MUL a1, c09, c09 MUL a1, c13, c13 MUL a2, c01, t1 MUL a2, c05, t2 MUL a2, c09, t3 MUL a2, c13, t4 SUB c02, t1, c02 SUB c06, t2, c06 SUB c10, t3, c10 SUB c14, t4, c14 MUL a3, c01, t1 MUL a3, c05, t2 MUL a3, c09, t3 MUL a3, c13, t4 SUB c03, t1, c03 SUB c07, t2, c07 SUB c11, t3, c11 SUB c15, t4, c15 MUL a4, c01, t1 MUL a4, c05, t2 MUL a4, c09, t3 MUL a4, c13, t4 SUB c04, t1, c04 SUB c08, t2, c08 SUB c12, t3, c12 SUB c16, t4, c16 LD b1, 5 * SIZE(AO) LD b2, 6 * SIZE(AO) LD b3, 7 * SIZE(AO) MUL b1, c02, c02 MUL b1, c06, c06 MUL b1, c10, c10 MUL b1, c14, c14 MUL b2, c02, t1 MUL b2, c06, t2 MUL b2, c10, t3 MUL b2, c14, t4 SUB c03, t1, c03 SUB c07, t2, c07 SUB c11, t3, c11 SUB c15, t4, c15 MUL b3, c02, t1 MUL b3, c06, t2 MUL b3, c10, t3 MUL b3, c14, t4 SUB c04, t1, c04 SUB c08, t2, c08 SUB c12, t3, c12 SUB c16, t4, c16 LD a1, 10 * SIZE(AO) LD a2, 11 * SIZE(AO) LD a3, 15 * SIZE(AO) MUL a1, c03, c03 MUL a1, c07, c07 MUL a1, c11, c11 MUL a1, c15, c15 MUL a2, c03, t1 MUL a2, c07, t2 MUL a2, c11, t3 MUL a2, c15, t4 SUB c04, t1, c04 SUB c08, t2, c08 SUB c12, t3, c12 SUB c16, t4, c16 MUL a3, c04, c04 MUL a3, c08, c08 MUL a3, c12, c12 MUL a3, c16, c16 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 MUL a1, c04, c04 MUL a2, c01, t1 MUL a2, c02, t2 MUL a2, c03, t3 MUL a2, c04, t4 SUB c05, t1, c05 SUB c06, t2, c06 SUB c07, t3, c07 SUB c08, t4, c08 MUL a3, c01, t1 MUL a3, c02, t2 MUL a3, c03, t3 MUL a3, c04, t4 SUB c09, t1, c09 SUB c10, t2, c10 SUB c11, t3, c11 SUB c12, t4, c12 MUL a4, c01, t1 MUL a4, c02, t2 MUL a4, c03, t3 MUL a4, c04, t4 SUB c13, t1, c13 SUB c14, t2, c14 SUB c15, t3, c15 SUB c16, t4, c16 LD b1, 5 * SIZE(BO) LD b2, 6 * SIZE(BO) LD b3, 7 * SIZE(BO) MUL b1, c05, c05 MUL b1, c06, c06 MUL b1, c07, c07 MUL b1, c08, c08 MUL b2, c05, t1 MUL b2, c06, t2 MUL b2, c07, t3 MUL b2, c08, t4 SUB c09, t1, c09 SUB c10, t2, c10 SUB c11, t3, c11 SUB c12, t4, c12 MUL b3, c05, t1 MUL b3, c06, t2 MUL b3, c07, t3 MUL b3, c08, t4 SUB c13, t1, c13 SUB c14, t2, c14 SUB c15, t3, c15 SUB c16, t4, c16 LD a1, 10 * SIZE(BO) LD a2, 11 * SIZE(BO) LD a3, 15 * SIZE(BO) MUL a1, c09, c09 MUL a1, c10, c10 MUL a1, c11, c11 MUL a1, c12, c12 MUL a2, c09, t1 MUL a2, c10, t2 MUL a2, c11, t3 MUL a2, c12, t4 SUB c13, t1, c13 SUB c14, t2, c14 SUB c15, t3, c15 SUB c16, t4, c16 MUL a3, c13, c13 MUL a3, c14, c14 MUL a3, c15, c15 MUL a3, c16, c16 #endif #ifdef RT LD a1, 15 * SIZE(BO) LD a2, 14 * SIZE(BO) LD a3, 13 * SIZE(BO) LD a4, 12 * SIZE(BO) MUL a1, c13, c13 MUL a1, c14, c14 MUL a1, c15, c15 MUL a1, c16, c16 MUL a2, c13, t1 MUL a2, c14, t2 MUL a2, c15, t3 MUL a2, c16, t4 SUB c09, t1, c09 SUB c10, t2, c10 SUB c11, t3, c11 SUB c12, t4, c12 MUL a3, c13, t1 MUL a3, c14, t2 MUL a3, c15, t3 MUL a3, c16, t4 SUB c05, t1, c05 SUB c06, t2, c06 SUB c07, t3, c07 SUB c08, t4, c08 MUL a4, c13, t1 MUL a4, c14, t2 MUL a4, c15, t3 MUL a4, c16, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c03, t3, c03 SUB c04, t4, c04 LD b1, 10 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 8 * SIZE(BO) MUL b1, c09, c09 MUL b1, c10, c10 MUL b1, c11, c11 MUL b1, c12, c12 MUL b2, c09, t1 MUL b2, c10, t2 MUL b2, c11, t3 MUL b2, c12, t4 SUB c05, t1, c05 SUB c06, t2, c06 SUB c07, t3, c07 SUB c08, t4, c08 MUL b3, c09, t1 MUL b3, c10, t2 MUL b3, c11, t3 MUL b3, c12, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c03, t3, c03 SUB c04, t4, c04 LD a1, 5 * SIZE(BO) LD a2, 4 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a1, c06, c06 
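/* Editor's note (not in the original source): the #ifdef LN/LT/RN/RT blocks
   around this point perform the per-block triangular solve on the 4x4 tile of
   C held in registers c01..c16.  The MUL on the diagonal entry suggests the
   packing routines store the reciprocal of each diagonal element, so the
   kernel multiplies rather than divides.  A rough C sketch of the RN-style
   forward substitution, with b[][] and binv[] as hypothetical names for the
   packed triangular block and its (assumed pre-inverted) diagonal:

       for (j = 0; j < 4; j++) {           // columns of the triangular block
           for (i = 0; i < 4; i++)         // the four M rows kept in c0x
               c[j][i] *= binv[j];         // diagonal assumed pre-inverted
           for (k = j + 1; k < 4; k++)     // eliminate from later columns
               for (i = 0; i < 4; i++)
                   c[k][i] -= b[j][k] * c[j][i];
       }
*/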
MUL a1, c07, c07 MUL a1, c08, c08 MUL a2, c05, t1 MUL a2, c06, t2 MUL a2, c07, t3 MUL a2, c08, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c03, t3, c03 SUB c04, t4, c04 MUL a3, c01, c01 MUL a3, c02, c02 MUL a3, c03, c03 MUL a3, c04, c04 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) ST c09, 2 * SIZE(BO) ST c13, 3 * SIZE(BO) ST c02, 4 * SIZE(BO) ST c06, 5 * SIZE(BO) ST c10, 6 * SIZE(BO) ST c14, 7 * SIZE(BO) ST c03, 8 * SIZE(BO) ST c07, 9 * SIZE(BO) ST c11, 10 * SIZE(BO) ST c15, 11 * SIZE(BO) ST c04, 12 * SIZE(BO) ST c08, 13 * SIZE(BO) ST c12, 14 * SIZE(BO) ST c16, 15 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c03, 2 * SIZE(AO) ST c04, 3 * SIZE(AO) ST c05, 4 * SIZE(AO) ST c06, 5 * SIZE(AO) ST c07, 6 * SIZE(AO) ST c08, 7 * SIZE(AO) ST c09, 8 * SIZE(AO) ST c10, 9 * SIZE(AO) ST c11, 10 * SIZE(AO) ST c12, 11 * SIZE(AO) ST c13, 12 * SIZE(AO) ST c14, 13 * SIZE(AO) ST c15, 14 * SIZE(AO) ST c16, 15 * SIZE(AO) #endif #ifdef LN lda C1, -4 * SIZE(C1) lda C2, -4 * SIZE(C2) lda C3, -4 * SIZE(C3) lda C4, -4 * SIZE(C4) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c03, 2 * SIZE(C1) ST c04, 3 * SIZE(C1) ST c05, 0 * SIZE(C2) ST c06, 1 * SIZE(C2) ST c07, 2 * SIZE(C2) ST c08, 3 * SIZE(C2) ST c09, 0 * SIZE(C3) ST c10, 1 * SIZE(C3) ST c11, 2 * SIZE(C3) ST c12, 3 * SIZE(C3) ST c13, 0 * SIZE(C4) ST c14, 1 * SIZE(C4) ST c15, 2 * SIZE(C4) ST c16, 3 * SIZE(C4) #ifndef LN lda C1, 4 * SIZE(C1) lda C2, 4 * SIZE(C2) lda C3, 4 * SIZE(C3) lda C4, 4 * SIZE(C4) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 2 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 2, TMP1 addq AO, TMP1, AO addq BO, TMP1, BO #endif #ifdef LT addq KK, 4, KK #endif #ifdef LN subq KK, 4, KK #endif lda I, -1(I) bgt I, $L11 .align 4 $L20: and M, 2, I ble I, $L30 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c10 LD a4, 3 * SIZE(AO) fclr c14 LD b1, 0 * SIZE(B) lda L, -2(KK) LD b2, 1 * SIZE(B) lda AO, 2 * SIZE(AO) LD b3, 2 * SIZE(B) fclr c01 LD b4, 3 * SIZE(B) fclr c05 lda BO, 4 * SIZE(B) fclr c02 fclr c06 ble KK, $L28 ble L, $L25 #else #ifdef LN sll K, BASE_SHIFT + 1, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c10 LD a4, 3 * SIZE(AO) fclr c14 LD b1, 0 * SIZE(BO) lda L, -2(TMP1) LD b2, 1 * SIZE(BO) lda AO, 2 * SIZE(AO) LD b3, 2 * SIZE(BO) fclr c01 LD b4, 3 * SIZE(BO) fclr c05 lda BO, 4 * SIZE(BO) fclr c02 fclr c06 ble TMP1, $L28 ble L, $L25 #endif .align 4 $L22: ADD c09, t1, c09 unop MUL a1, b1, t1 unop ADD c10, t2, c10 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD c13, t3, c13 unop MUL a1, b2, t3 lda BO, 8 * SIZE(BO) ADD c14, t4, c14 unop MUL a2, b2, t4 LD b2, -7 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b3, t1 unop ADD c02, t2, c02 unop MUL a2, b3, t2 LD b3, -6 * SIZE(BO) ADD c05, t3, c05 unop MUL a1, b4, t3 LD a1, 2 * SIZE(AO) ADD c06, t4, c06 MUL a2, b4, t4 LD b5, -5 * SIZE(BO) ADD c09, t1, c09 unop MUL a3, b1, t1 LD a2, 3 * SIZE(AO) ADD c10, t2, c10 unop MUL a4, b1, t2 LD b1, -4 * SIZE(BO) ADD c13, t3, c13 unop MUL a3, b2, t3 lda AO, 4 * SIZE(AO) ADD c14, t4, c14 MUL a4, b2, t4 LD b2, -3 * SIZE(BO) ADD c01, t1, c01 lda L, -2(L) MUL a3, b3, t1 LD b4, -1 * SIZE(BO) ADD c02, t2, c02 unop MUL a4, b3, t2 LD b3, -2 * SIZE(BO) ADD c05, t3, c05 unop MUL a3, b5, t3 LD a3, 0 * 
SIZE(AO) ADD c06, t4, c06 MUL a4, b5, t4 LD a4, 1 * SIZE(AO) bgt L, $L22 .align 4 $L25: ADD c09, t1, c09 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L27 #else blbs TMP1, $L27 #endif ADD c10, t2, c10 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD c13, t3, c13 unop MUL a1, b2, t3 unop ADD c14, t4, c14 unop MUL a2, b2, t4 LD b2, 1 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b3, t1 lda AO, 2 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b3, t2 LD b3, 2 * SIZE(BO) ADD c05, t3, c05 unop MUL a1, b4, t3 LD a1, -2 * SIZE(AO) ADD c06, t4, c06 unop MUL a2, b4, t4 LD a2, -1 * SIZE(AO) ADD c09, t1, c09 LD b4, 3 * SIZE(BO) MUL a1, b1, t1 lda BO, 4 * SIZE(BO) .align 4 $L27: ADD c10, t2, c10 MUL a2, b1, t2 ADD c13, t3, c13 MUL a1, b2, t3 ADD c14, t4, c14 MUL a2, b2, t4 ADD c01, t1, c01 MUL a1, b3, t1 ADD c02, t2, c02 MUL a2, b3, t2 ADD c05, t3, c05 MUL a1, b4, t3 ADD c06, t4, c06 lda AO, 2 * SIZE(AO) MUL a2, b4, t4 lda BO, 4 * SIZE(BO) ADD c09, t1, c09 ADD c10, t2, c10 ADD c13, t3, c13 ADD c14, t4, c14 .align 4 $L28: #if defined(LN) || defined(RT) #ifdef LN subq KK, 2, TMP1 #else subq KK, 4, TMP1 #endif sll TMP1, BASE_SHIFT + 1, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO #else lda AO, -2 * SIZE(AO) lda BO, -4 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 SUB a4, c13, c13 SUB b1, c02, c02 SUB b2, c06, c06 SUB b3, c10, c10 SUB b4, c14, c14 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) LD b4, 7 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c05, c05 SUB a4, c06, c06 SUB b1, c09, c09 SUB b2, c10, c10 SUB b3, c13, c13 SUB b4, c14, c14 #endif #ifdef LN LD a1, 3 * SIZE(AO) LD a2, 2 * SIZE(AO) LD a3, 0 * SIZE(AO) MUL a1, c02, c02 MUL a1, c06, c06 MUL a1, c10, c10 MUL a1, c14, c14 MUL a2, c02, t1 MUL a2, c06, t2 MUL a2, c10, t3 MUL a2, c14, t4 SUB c01, t1, c01 SUB c05, t2, c05 SUB c09, t3, c09 SUB c13, t4, c13 MUL a3, c01, c01 MUL a3, c05, c05 MUL a3, c09, c09 MUL a3, c13, c13 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 3 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 MUL a1, c09, c09 MUL a1, c13, c13 MUL a2, c01, t1 MUL a2, c05, t2 MUL a2, c09, t3 MUL a2, c13, t4 SUB c02, t1, c02 SUB c06, t2, c06 SUB c10, t3, c10 SUB c14, t4, c14 MUL a3, c02, c02 MUL a3, c06, c06 MUL a3, c10, c10 MUL a3, c14, c14 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) MUL a1, c01, c01 MUL a1, c02, c02 MUL a2, c01, t1 MUL a2, c02, t2 SUB c05, t1, c05 SUB c06, t2, c06 MUL a3, c01, t1 MUL a3, c02, t2 SUB c09, t1, c09 SUB c10, t2, c10 MUL a4, c01, t1 MUL a4, c02, t2 SUB c13, t1, c13 SUB c14, t2, c14 LD b1, 5 * SIZE(BO) LD b2, 6 * SIZE(BO) LD b3, 7 * SIZE(BO) MUL b1, c05, c05 MUL b1, c06, c06 MUL b2, c05, t1 MUL b2, c06, t2 SUB c09, t1, c09 SUB c10, t2, c10 MUL b3, c05, t1 MUL b3, c06, t2 SUB c13, t1, c13 SUB c14, t2, c14 LD a1, 10 * SIZE(BO) LD a2, 11 * SIZE(BO) LD a3, 15 * SIZE(BO) MUL a1, c09, c09 MUL a1, c10, c10 MUL a2, c09, t1 MUL a2, c10, t2 SUB c13, t1, c13 SUB c14, t2, c14 MUL a3, c13, c13 MUL a3, c14, c14 #endif #ifdef RT LD a1, 15 * SIZE(BO) LD a2, 14 * SIZE(BO) LD a3, 13 * SIZE(BO) LD a4, 12 * SIZE(BO) MUL a1, c13, c13 MUL a1, c14, c14 MUL a2, c13, t1 MUL a2, c14, t2 SUB c09, t1, c09 SUB c10, t2, c10 MUL a3, c13, t1 MUL 
a3, c14, t2 SUB c05, t1, c05 SUB c06, t2, c06 MUL a4, c13, t1 MUL a4, c14, t2 SUB c01, t1, c01 SUB c02, t2, c02 LD b1, 10 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 8 * SIZE(BO) MUL b1, c09, c09 MUL b1, c10, c10 MUL b2, c09, t1 MUL b2, c10, t2 SUB c05, t1, c05 SUB c06, t2, c06 MUL b3, c09, t1 MUL b3, c10, t2 SUB c01, t1, c01 SUB c02, t2, c02 LD a1, 5 * SIZE(BO) LD a2, 4 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a1, c06, c06 MUL a2, c05, t1 MUL a2, c06, t2 SUB c01, t1, c01 SUB c02, t2, c02 MUL a3, c01, c01 MUL a3, c02, c02 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) ST c09, 2 * SIZE(BO) ST c13, 3 * SIZE(BO) ST c02, 4 * SIZE(BO) ST c06, 5 * SIZE(BO) ST c10, 6 * SIZE(BO) ST c14, 7 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c05, 2 * SIZE(AO) ST c06, 3 * SIZE(AO) ST c09, 4 * SIZE(AO) ST c10, 5 * SIZE(AO) ST c13, 6 * SIZE(AO) ST c14, 7 * SIZE(AO) #endif #ifdef LN lda C1, -2 * SIZE(C1) lda C2, -2 * SIZE(C2) lda C3, -2 * SIZE(C3) lda C4, -2 * SIZE(C4) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c05, 0 * SIZE(C2) ST c06, 1 * SIZE(C2) ST c09, 0 * SIZE(C3) ST c10, 1 * SIZE(C3) ST c13, 0 * SIZE(C4) ST c14, 1 * SIZE(C4) #ifndef LN lda C1, 2 * SIZE(C1) lda C2, 2 * SIZE(C2) lda C3, 2 * SIZE(C3) lda C4, 2 * SIZE(C4) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, 1 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 1, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 2, KK #endif #ifdef LN subq KK, 2, KK #endif .align 4 $L30: and M, 1, I ble I, $L39 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD b1, 0 * SIZE(B) lda L, -2(KK) LD b2, 1 * SIZE(B) lda AO, 1 * SIZE(AO) LD b3, 2 * SIZE(B) fclr c09 LD b4, 3 * SIZE(B) fclr c13 lda BO, 4 * SIZE(B) ble KK, $L38 ble L, $L35 #else #ifdef LN sll K, BASE_SHIFT + 0, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TMP1 addq AORIG, TMP1, AO sll KK, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c01 LD a2, 1 * SIZE(AO) fclr c05 LD b1, 0 * SIZE(BO) lda L, -2(TMP1) LD b2, 1 * SIZE(BO) lda AO, 1 * SIZE(AO) LD b3, 2 * SIZE(BO) fclr c09 LD b4, 3 * SIZE(BO) fclr c13 lda BO, 4 * SIZE(BO) ble TMP1, $L38 ble L, $L35 #endif .align 4 $L32: ADD c01, t1, c01 lda L, -2(L) MUL a1, b1, t1 LD b1, 0 * SIZE(BO) ADD c05, t2, c05 lda AO, 2 * SIZE(AO) MUL a1, b2, t2 LD b2, 1 * SIZE(BO) ADD c09, t3, c09 LD b5, 3 * SIZE(BO) MUL a1, b3, t3 LD b3, 2 * SIZE(BO) ADD c13, t4, c13 MUL a1, b4, t4 LD a1, -1 * SIZE(AO) ADD c01, t1, c01 MUL a2, b1, t1 LD b1, 4 * SIZE(BO) lda BO, 8 * SIZE(BO) ADD c05, t2, c05 MUL a2, b2, t2 LD b2, -3 * SIZE(BO) ADD c09, t3, c09 LD b4, -1 * SIZE(BO) MUL a2, b3, t3 LD b3, -2 * SIZE(BO) ADD c13, t4, c13 MUL a2, b5, t4 LD a2, 0 * SIZE(AO) bgt L, $L32 .align 4 $L35: ADD c01, t1, c01 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L37 #else blbs TMP1, $L37 #endif .align 4 ADD c05, t2, c05 LD b1, 0 * SIZE(BO) MUL a1, b2, t2 LD b2, 1 * SIZE(BO) ADD c09, t3, c09 MUL a1, b3, t3 LD b3, 2 * SIZE(BO) ADD c13, t4, c13 MUL a1, b4, t4 LD a1, 0 * SIZE(AO) lda AO, 1 * SIZE(AO) ADD c01, t1, c01 LD b4, 3 * SIZE(BO) MUL a1, b1, t1 lda BO, 4 * SIZE(BO) .align 4 $L37: ADD c05, t2, c05 MUL a1, b2, t2 ADD c09, t3, c09 MUL a1, b3, t3 ADD c13, t4, c13 lda AO, 1 * SIZE(AO) MUL a1, b4, t4 lda BO, 4 * SIZE(BO) ADD c01, t1, c01 ADD c05, t2, c05 ADD c09, t3, c09 ADD c13, t4, c13 $L38: #if defined(LN) || defined(RT) 
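/* Editor's note (not in the original source): $L38 handles the single
   leftover M row (M & 1) of the N=4 panel.  It reloads the partially
   accumulated results from the packed buffer, applies the same triangular
   solve against the packed diagonal block (the RN/RT paths below walk the
   4x4 block of B row by row), stores the solved values back into the packed
   buffer and into C1..C4, and then updates AO, BO and KK for the next
   block. */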
#ifdef LN subq KK, 1, TMP1 #else subq KK, 4, TMP1 #endif sll TMP1, BASE_SHIFT + 0, TMP2 addq AORIG, TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq B, TMP2, BO #else lda AO, -1 * SIZE(AO) lda BO, -4 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 SUB a4, c13, c13 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 SUB a4, c13, c13 #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(AO) MUL a1, c01, c01 MUL a1, c05, c05 MUL a1, c09, c09 MUL a1, c13, c13 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) MUL a1, c01, c01 MUL a2, c01, t1 SUB c05, t1, c05 MUL a3, c01, t1 SUB c09, t1, c09 MUL a4, c01, t1 SUB c13, t1, c13 LD b1, 5 * SIZE(BO) LD b2, 6 * SIZE(BO) LD b3, 7 * SIZE(BO) MUL b1, c05, c05 MUL b2, c05, t1 SUB c09, t1, c09 MUL b3, c05, t1 SUB c13, t1, c13 LD a1, 10 * SIZE(BO) LD a2, 11 * SIZE(BO) LD a3, 15 * SIZE(BO) MUL a1, c09, c09 MUL a2, c09, t1 SUB c13, t1, c13 MUL a3, c13, c13 #endif #ifdef RT LD a1, 15 * SIZE(BO) LD a2, 14 * SIZE(BO) LD a3, 13 * SIZE(BO) LD a4, 12 * SIZE(BO) MUL a1, c13, c13 MUL a2, c13, t1 SUB c09, t1, c09 MUL a3, c13, t1 SUB c05, t1, c05 MUL a4, c13, t1 SUB c01, t1, c01 LD b1, 10 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 8 * SIZE(BO) MUL b1, c09, c09 MUL b2, c09, t1 SUB c05, t1, c05 MUL b3, c09, t1 SUB c01, t1, c01 LD a1, 5 * SIZE(BO) LD a2, 4 * SIZE(BO) LD a3, 0 * SIZE(BO) MUL a1, c05, c05 MUL a2, c05, t1 SUB c01, t1, c01 MUL a3, c01, c01 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c05, 1 * SIZE(BO) ST c09, 2 * SIZE(BO) ST c13, 3 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c05, 1 * SIZE(AO) ST c09, 2 * SIZE(AO) ST c13, 3 * SIZE(AO) #endif #ifdef LN lda C1, -1 * SIZE(C1) lda C2, -1 * SIZE(C2) lda C3, -1 * SIZE(C3) lda C4, -1 * SIZE(C4) #endif ST c01, 0 * SIZE(C1) ST c05, 0 * SIZE(C2) ST c09, 0 * SIZE(C3) ST c13, 0 * SIZE(C4) #ifdef RT sll K, 0 + BASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, BASE_SHIFT + 0, TMP2 addq AO, TMP2, AO sll TMP1, BASE_SHIFT + 2, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 1, KK #endif #ifdef LN subq KK, 1, KK #endif .align 4 $L39: #ifdef LN sll K, 2 + BASE_SHIFT, TMP1 addq B, TMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN addq KK, 4, KK #endif #ifdef RT subq KK, 4, KK #endif lda J, -1(J) bgt J, $L01 .align 4 $L999: ldt $f2, 0($sp) ldt $f3, 8($sp) ldt $f4, 16($sp) ldt $f5, 24($sp) ldt $f6, 32($sp) ldt $f7, 40($sp) ldt $f8, 48($sp) ldt $f9, 56($sp) clr $0 lda $sp, STACKSIZE($sp) ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/zamax.S000066400000000000000000000144621313527062700167640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define N $16 #define X $17 #define INCX $18 #ifndef USE_MIN #define CMPLT(a, b) cmptlt a, b #else #define CMPLT(a, b) cmptlt b, a #endif #define STACKSIZE 8 * 8 PROLOGUE PROFCODE .frame $sp, STACKSIZE, $26, 0 lda $sp, -STACKSIZE($sp) stt $f2, 0($sp) fclr $f16 cmplt $31, N, $2 stt $f3, 8($sp) fclr $f17 cmplt $31, INCX, $3 unop stt $f4, 16($sp) fclr $f18 SXADDQ INCX, $31, INCX unop stt $f5, 24($sp) fclr $f19 and $2, $3, $0 unop stt $f6, 32($sp) unop stt $f7, 40($sp) stt $f8, 48($sp) stt $f9, 56($sp) fclr $f0 beq $0, $End # if (n <= 0) or (incx <= 0) return .align 4 LD $f20, 0 * SIZE(X) LD $f21, 1 * SIZE(X) sra N, 2, $1 addq INCX, INCX, INCX fabs $f20, $f20 fabs $f21, $f21 addt $f20, $f21, $f0 ble $1, $L15 .align 4 lda $1, -1($1) unop addq X, INCX, X unop LD $f22, 0 * SIZE(X) fmov $f0, $f1 LD $f23, 1 * SIZE(X) addq X, INCX, X LD $f24, 0 * SIZE(X) fmov $f0, $f2 LD $f25, 1 * SIZE(X) addq X, INCX, X LD $f26, 0 * SIZE(X) fmov $f0, $f3 LD $f27, 1 * SIZE(X) addq X, INCX, X fabs $f20, $f8 fabs $f21, $f9 fabs $f22, $f10 fabs $f23, $f11 fabs $f24, $f12 fabs $f25, $f13 fabs $f26, $f14 fabs $f27, $f15 ble $1, $L14 .align 4 LD $f20, 0 * SIZE(X) LD $f21, 1 * SIZE(X) lda $1, -1($1) addq X, INCX, X LD $f22, 0 * SIZE(X) LD $f23, 1 * SIZE(X) unop addq X, INCX, X LD $f24, 0 * SIZE(X) LD $f25, 1 * SIZE(X) unop addq X, INCX, X LD $f26, 0 * SIZE(X) LD $f27, 1 * SIZE(X) addq X, INCX, X ble $1, $L13 .align 4 $L12: addt $f8, $f9, $f16 unop fabs $f20, $f8 ldl $31, 64 * SIZE(X) addt $f10, $f11, $f17 unop fabs $f21, $f9 LD $f20, 0 * SIZE(X) addt $f12, $f13, $f18 LD $f21, 1 * SIZE(X) fabs $f22, $f10 addq X, INCX, X addt $f14, $f15, $f19 LD $f22, 0 * SIZE(X) fabs $f23, $f11 unop CMPLT($f0, $f16), $f4 LD $f23, 1 * SIZE(X) fabs $f24, $f12 addq X, INCX, X CMPLT($f1, $f17), $f5 LD $f24, 0 * SIZE(X) fabs $f25, $f13 unop CMPLT($f2, $f18), $f6 LD $f25, 1 * SIZE(X) fabs $f26, $f14 addq X, INCX, X CMPLT($f3, $f19), $f7 LD $f26, 0 * SIZE(X) fabs $f27, $f15 unop fcmovne $f4, $f16, $f0 LD $f27, 1 * SIZE(X) addq X, INCX, X lda $1, -1($1) # i -- fcmovne $f5, $f17, $f1 fcmovne $f6, $f18, $f2 
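/* Editor's note (not in the original source): the $L12 loop processes four
   complex elements per iteration, forming fabs(real) + fabs(imag) for each
   and keeping four independent running extrema in $f0..$f3; the cmptlt /
   fcmovne pairs act as a branch-free max (min when USE_MIN is defined), and
   $L13/$L14/$L16 drain the software pipeline, reduce the four partials and
   handle the N % 4 tail.  A hedged C sketch of the overall result, using a
   flat double array for illustration (real/imag interleaved, stride incx in
   complex elements):

       double m = fabs(x[0]) + fabs(x[1]);                  // first element
       for (i = 1; i < n; i++) {
           double v = fabs(x[2*i*incx]) + fabs(x[2*i*incx + 1]);
           if (m < v) m = v;                                // reversed for USE_MIN
       }
       return m;
*/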
fcmovne $f7, $f19, $f3 bgt $1,$L12 .align 4 $L13: addt $f8, $f9, $f16 fabs $f20, $f8 addt $f10, $f11, $f17 fabs $f21, $f9 addt $f12, $f13, $f18 fabs $f22, $f10 addt $f14, $f15, $f19 fabs $f23, $f11 CMPLT($f0, $f16), $f4 fabs $f24, $f12 CMPLT($f1, $f17), $f5 fabs $f25, $f13 CMPLT($f2, $f18), $f6 fabs $f26, $f14 CMPLT($f3, $f19), $f7 fabs $f27, $f15 fcmovne $f4, $f16, $f0 fcmovne $f5, $f17, $f1 fcmovne $f6, $f18, $f2 fcmovne $f7, $f19, $f3 .align 4 $L14: addt $f8, $f9, $f16 addt $f10, $f11, $f17 addt $f12, $f13, $f18 addt $f14, $f15, $f19 CMPLT($f0, $f16), $f4 CMPLT($f1, $f17), $f5 CMPLT($f2, $f18), $f6 CMPLT($f3, $f19), $f7 fcmovne $f4, $f16, $f0 fcmovne $f5, $f17, $f1 fcmovne $f6, $f18, $f2 fcmovne $f7, $f19, $f3 CMPLT($f0, $f1), $f16 CMPLT($f2, $f3), $f17 fcmovne $f16, $f1, $f0 fcmovne $f17, $f3, $f2 CMPLT($f0, $f2), $f16 fcmovne $f16, $f2, $f0 .align 4 $L15: and N, 3, $1 unop unop ble $1, $End .align 4 $L16: LD $f20, 0 * SIZE(X) LD $f21, 1 * SIZE(X) unop addq X, INCX, X fabs $f20, $f29 fabs $f21, $f30 addt $f29, $f30, $f29 CMPLT($f0, $f29), $f16 fcmovne $f16, $f29, $f0 lda $1, -1($1) # i -- bgt $1, $L16 .align 4 $End: ldt $f2, 0($sp) ldt $f3, 8($sp) ldt $f4, 16($sp) ldt $f5, 24($sp) ldt $f6, 32($sp) ldt $f7, 40($sp) ldt $f8, 48($sp) ldt $f9, 56($sp) lda $sp, STACKSIZE($sp) ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/zasum.S000066400000000000000000000111541313527062700167760ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define PREFETCHSIZE 88 #define N $16 #define X $17 #define INCX $18 #define I $19 #define s0 $f0 #define s1 $f1 #define s2 $f10 #define s3 $f11 #define a0 $f12 #define a1 $f13 #define a2 $f14 #define a3 $f15 #define a4 $f16 #define a5 $f17 #define a6 $f18 #define a7 $f19 #define t0 $f20 #define t1 $f21 #define t2 $f22 #define t3 $f23 PROLOGUE PROFCODE fclr s0 unop fclr t0 addq INCX, INCX, INCX fclr s1 unop fclr t1 ble N, $L999 fclr s2 sra N, 2, I fclr s3 ble I, $L15 LD a0, 0 * SIZE(X) fclr t2 LD a1, 1 * SIZE(X) SXADDQ INCX, X, X LD a2, 0 * SIZE(X) fclr t3 LD a3, 1 * SIZE(X) SXADDQ INCX, X, X LD a4, 0 * SIZE(X) LD a5, 1 * SIZE(X) SXADDQ INCX, X, X lda I, -1(I) ble I, $L13 .align 4 $L12: ADD s0, t0, s0 ldl $31, PREFETCHSIZE * SIZE(X) fabs a0, t0 lda I, -1(I) ADD s1, t1, s1 LD a6, 0 * SIZE(X) fabs a1, t1 unop ADD s2, t2, s2 LD a7, 1 * SIZE(X) fabs a2, t2 SXADDQ INCX, X, X ADD s3, t3, s3 LD a0, 0 * SIZE(X) fabs a3, t3 unop ADD s0, t0, s0 LD a1, 1 * SIZE(X) fabs a4, t0 SXADDQ INCX, X, X ADD s1, t1, s1 LD a2, 0 * SIZE(X) fabs a5, t1 unop ADD s2, t2, s2 LD a3, 1 * SIZE(X) fabs a6, t2 SXADDQ INCX, X, X ADD s3, t3, s3 LD a4, 0 * SIZE(X) fabs a7, t3 unop LD a5, 1 * SIZE(X) unop SXADDQ INCX, X, X bne I, $L12 .align 4 $L13: ADD s0, t0, s0 LD a6, 0 * SIZE(X) fabs a0, t0 ADD s1, t1, s1 LD a7, 1 * SIZE(X) fabs a1, t1 SXADDQ INCX, X, X ADD s2, t2, s2 fabs a2, t2 ADD s3, t3, s3 fabs a3, t3 ADD s0, t0, s0 fabs a4, t0 ADD s1, t1, s1 fabs a5, t1 ADD s2, t2, s2 fabs a6, t2 ADD s3, t3, s3 fabs a7, t3 ADD s2, t2, s2 ADD s3, t3, s3 .align 4 $L15: ADD s0, s2, s0 and N, 3, I ADD s1, s3, s1 ble I, $L999 .align 4 $L17: ADD s0, t0, s0 LD a0, 0 * SIZE(X) fabs a0, t0 lda I, -1(I) ADD s1, t1, s1 LD a1, 1 * SIZE(X) fabs a1, t1 SXADDQ INCX, X, X bne I, $L17 .align 4 $L999: ADD s0, t0, s0 ADD s1, t1, s1 ADD s0, s1, s0 ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/zaxpy.S000066400000000000000000000271421313527062700170160ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define PREFETCHSIZE 40 #ifndef CONJ #define ADD1 SUB #define ADD2 ADD #else #define ADD1 ADD #define ADD2 SUB #endif PROLOGUE PROFCODE .frame $sp, 16, $26, 0 ldl $19, 0($sp) fmov $f19, $f29 ldq $20, 8($sp) fmov $f20, $f30 mov $21, $18 ldl $21, 16($sp) lda $sp, -64($sp) nop stt $f2, 0($sp) cmpeq $19, 1, $1 stt $f3, 8($sp) cmpeq $21, 1, $2 stt $f4, 16($sp) and $16, 3, $5 stt $f5, 24($sp) stt $f6, 32($sp) stt $f7, 40($sp) stt $f8, 48($sp) #ifndef PROFILE .prologue 0 #else .prologue 1 #endif and $1, $2, $1 ble $16, $End sra $16, 2, $4 beq $1, $Sub ble $4, $Remain subq $4, 1, $4 LD $f0, 0*SIZE($18) LD $f1, 1*SIZE($18) LD $f2, 2*SIZE($18) LD $f3, 3*SIZE($18) LD $f4, 4*SIZE($18) LD $f5, 5*SIZE($18) LD $f6, 6*SIZE($18) LD $f7, 7*SIZE($18) LD $f8, 0*SIZE($20) LD $f28, 1*SIZE($20) LD $f10, 2*SIZE($20) LD $f11, 3*SIZE($20) LD $f12, 4*SIZE($20) LD $f13, 5*SIZE($20) LD $f14, 6*SIZE($20) LD $f15, 7*SIZE($20) addq $18, 8*SIZE, $18 ble $4, $MainLoopEnd .align 4 $MainLoop: ldt $f31, PREFETCHSIZE * SIZE($20) ldl $31, PREFETCHSIZE * SIZE($18) MUL $f29, $f0, $f20 LD $f31, 9*SIZE($18) MUL $f30, $f1, $f21 unop MUL $f30, $f0, $f22 LD $f0, 0*SIZE($18) MUL $f29, $f1, $f23 LD $f1, 1*SIZE($18) MUL $f29, $f2, $f24 unop MUL $f30, $f3, $f25 nop MUL $f30, $f2, $f26 LD $f2, 2*SIZE($18) MUL $f29, $f3, $f27 LD $f3, 3*SIZE($18) ADD1 $f20, $f21, $f16 MUL $f29, $f4, $f20 ADD2 $f22, $f23, $f17 MUL $f30, $f5, $f21 ADD1 $f24, $f25, $f18 unop MUL $f30, $f4, $f22 LD $f4, 4*SIZE($18) ADD2 $f26, $f27, $f19 addq $20, 8*SIZE, $20 MUL $f29, $f5, $f23 LD $f5, 5*SIZE($18) ADD $f16, $f8, $f16 LD $f8, 0*SIZE($20) MUL $f29, $f6, $f24 unop ADD $f17, $f28, $f17 LD $f28, 1*SIZE($20) MUL $f30, $f7, $f25 unop ADD $f18, $f10, $f18 LD $f10, 2*SIZE($20) MUL $f30, $f6, $f26 LD $f6, 6*SIZE($18) ADD $f19, $f11, $f19 LD $f11, 3*SIZE($20) MUL $f29, $f7, $f27 LD $f7, 7*SIZE($18) ST $f16,-8*SIZE($20) ADD1 $f20, $f21, $f16 ST $f17,-7*SIZE($20) ADD2 $f22, $f23, $f17 ST $f18,-6*SIZE($20) ADD1 $f24, $f25, $f18 ST $f19,-5*SIZE($20) ADD2 $f26, $f27, $f19 ADD $f16, $f12, $f16 LD $f12, 4*SIZE($20) ADD $f17, $f13, $f17 LD $f13, 5*SIZE($20) ADD $f18, $f14, $f18 LD $f14, 6*SIZE($20) ADD $f19, $f15, $f19 LD $f15, 7*SIZE($20) ST $f16,-4*SIZE($20) addq $18, 8*SIZE, $18 ST $f17,-3*SIZE($20) subq $4, 1, $4 ST $f18,-2*SIZE($20) nop ST $f19,-1*SIZE($20) bgt $4, $MainLoop .align 4 $MainLoopEnd: MUL $f29, $f0, $f20 MUL $f30, $f1, $f21 MUL $f30, $f0, $f22 MUL $f29, $f1, $f23 MUL $f29, $f2, $f24 MUL $f30, $f3, $f25 MUL $f30, $f2, $f26 MUL $f29, $f3, $f27 ADD1 $f20, $f21, $f16 MUL $f29, $f4, $f20 ADD2 $f22, $f23, $f17 MUL 
$f30, $f5, $f21 ADD1 $f24, $f25, $f18 MUL $f30, $f4, $f22 ADD2 $f26, $f27, $f19 MUL $f29, $f5, $f23 ADD $f16, $f8, $f16 MUL $f29, $f6, $f24 ADD $f17, $f28, $f17 MUL $f30, $f7, $f25 ADD $f18, $f10, $f18 MUL $f30, $f6, $f26 ADD $f19, $f11, $f19 MUL $f29, $f7, $f27 ST $f16, 0*SIZE($20) ADD1 $f20, $f21, $f16 ST $f17, 1*SIZE($20) ADD2 $f22, $f23, $f17 ST $f18, 2*SIZE($20) ADD1 $f24, $f25, $f18 ST $f19, 3*SIZE($20) ADD2 $f26, $f27, $f19 ADD $f16, $f12, $f16 ADD $f17, $f13, $f17 ADD $f18, $f14, $f18 ADD $f19, $f15, $f19 ST $f16, 4*SIZE($20) ST $f17, 5*SIZE($20) ST $f18, 6*SIZE($20) ST $f19, 7*SIZE($20) unop addq $20, 8*SIZE, $20 unop ble $5, $End .align 4 $Remain: subq $5, 1, $6 ble $5, $End LD $f0, 0*SIZE($18) LD $f1, 1*SIZE($18) LD $f8, 0*SIZE($20) LD $f28, 1*SIZE($20) addq $18, 2*SIZE, $18 ble $6, $RemainLoopEnd .align 4 $RemainLoop: MUL $f29, $f0, $f20 subq $6, 1, $6 MUL $f30, $f1, $f21 addq $20, 2*SIZE, $20 MUL $f30, $f0, $f22 LD $f0, 0*SIZE($18) MUL $f29, $f1, $f23 LD $f1, 1*SIZE($18) ADD1 $f20, $f21, $f16 ADD2 $f22, $f23, $f17 ADD $f16, $f8, $f16 LD $f8, 0*SIZE($20) ADD $f17, $f28, $f17 LD $f28, 1*SIZE($20) ST $f16,-2*SIZE($20) addq $18, 2*SIZE, $18 ST $f17,-1*SIZE($20) bgt $6, $RemainLoop .align 4 $RemainLoopEnd: MUL $f29, $f0, $f20 MUL $f30, $f1, $f21 MUL $f30, $f0, $f22 MUL $f29, $f1, $f23 ADD1 $f20, $f21, $f16 ADD2 $f22, $f23, $f17 ADD $f16, $f8, $f16 ADD $f17, $f28, $f17 ST $f16, 0*SIZE($20) nop ST $f17, 1*SIZE($20) nop .align 4 $End: ldt $f2, 0($sp) ldt $f3, 8($sp) ldt $f4, 16($sp) ldt $f5, 24($sp) ldt $f6, 32($sp) ldt $f7, 40($sp) ldt $f8, 48($sp) lda $sp, 64($sp) ret .align 4 $Sub: SXSUBL $16, SIZE, $22 addq $22, $22, $22 # Complex .align 4 addq $19, $19, $19 # Complex addq $21, $21, $21 # Complex ble $4, $SubRemain LD $f0, 0*SIZE($18) LD $f1, 1*SIZE($18) SXADDQ $19, $18, $18 LD $f2, 0*SIZE($18) LD $f3, 1*SIZE($18) SXADDQ $19, $18, $18 LD $f4, 0*SIZE($18) LD $f5, 1*SIZE($18) SXADDQ $19, $18, $18 LD $f6, 0*SIZE($18) LD $f7, 1*SIZE($18) SXADDQ $19, $18, $18 LD $f8, 0*SIZE($20) LD $f28, 1*SIZE($20) SXADDQ $21, $20, $24 LD $f10, 0*SIZE($24) LD $f11, 1*SIZE($24) SXADDQ $21, $24, $24 LD $f12, 0*SIZE($24) LD $f13, 1*SIZE($24) SXADDQ $21, $24, $24 LD $f14, 0*SIZE($24) LD $f15, 1*SIZE($24) SXADDQ $21, $24, $24 subq $4, 1, $4 ble $4, $SubMainLoopEnd .align 4 $SubMainLoop: MUL $f29, $f0, $f20 unop MUL $f30, $f1, $f21 unop MUL $f30, $f0, $f22 LD $f0, 0*SIZE($18) MUL $f29, $f1, $f23 LD $f1, 1*SIZE($18) MUL $f29, $f2, $f24 SXADDQ $19, $18, $18 MUL $f30, $f3, $f25 unop MUL $f30, $f2, $f26 LD $f2, 0*SIZE($18) MUL $f29, $f3, $f27 LD $f3, 1*SIZE($18) ADD1 $f20, $f21, $f16 SXADDQ $19, $18, $18 MUL $f29, $f4, $f20 unop ADD2 $f22, $f23, $f17 unop MUL $f30, $f5, $f21 unop ADD1 $f24, $f25, $f18 unop MUL $f30, $f4, $f22 LD $f4, 0*SIZE($18) ADD2 $f26, $f27, $f19 unop MUL $f29, $f5, $f23 LD $f5, 1*SIZE($18) ADD $f16, $f8, $f16 LD $f8, 0*SIZE($24) MUL $f29, $f6, $f24 SXADDQ $19, $18, $18 ADD $f17, $f28, $f17 LD $f28, 1*SIZE($24) MUL $f30, $f7, $f25 SXADDQ $21, $24, $24 ADD $f18, $f10, $f18 LD $f10, 0*SIZE($24) MUL $f30, $f6, $f26 LD $f6, 0*SIZE($18) ADD $f19, $f11, $f19 LD $f11, 1*SIZE($24) MUL $f29, $f7, $f27 LD $f7, 1*SIZE($18) ST $f16, 0*SIZE($20) SXADDQ $19, $18, $18 ADD1 $f20, $f21, $f16 unop ST $f17, 1*SIZE($20) SXADDQ $21, $20, $20 ADD2 $f22, $f23, $f17 unop ST $f18, 0*SIZE($20) SXADDQ $21, $24, $24 ADD1 $f24, $f25, $f18 unop ST $f19, 1*SIZE($20) unop ADD2 $f26, $f27, $f19 SXADDQ $21, $20, $20 ADD $f16, $f12, $f16 unop LD $f12, 0*SIZE($24) unop ADD $f17, $f13, $f17 unop LD $f13, 1*SIZE($24) SXADDQ 
$21, $24, $24 ADD $f18, $f14, $f18 subq $4, 1, $4 LD $f14, 0*SIZE($24) unop ADD $f19, $f15, $f19 unop LD $f15, 1*SIZE($24) SXADDQ $21, $24, $24 ST $f16, 0*SIZE($20) ST $f17, 1*SIZE($20) SXADDQ $21, $20, $20 unop ST $f18, 0*SIZE($20) ST $f19, 1*SIZE($20) SXADDQ $21, $20, $20 bgt $4, $SubMainLoop .align 4 $SubMainLoopEnd: MUL $f29, $f0, $f20 MUL $f30, $f1, $f21 MUL $f30, $f0, $f22 MUL $f29, $f1, $f23 MUL $f29, $f2, $f24 MUL $f30, $f3, $f25 MUL $f30, $f2, $f26 MUL $f29, $f3, $f27 ADD1 $f20, $f21, $f16 MUL $f29, $f4, $f20 ADD2 $f22, $f23, $f17 MUL $f30, $f5, $f21 ADD1 $f24, $f25, $f18 MUL $f30, $f4, $f22 ADD2 $f26, $f27, $f19 MUL $f29, $f5, $f23 ADD $f16, $f8, $f16 MUL $f29, $f6, $f24 ADD $f17, $f28, $f17 MUL $f30, $f7, $f25 ADD $f18, $f10, $f18 MUL $f30, $f6, $f26 ADD $f19, $f11, $f19 MUL $f29, $f7, $f27 ST $f16, 0*SIZE($20) ADD1 $f20, $f21, $f16 ST $f17, 1*SIZE($20) ADD2 $f22, $f23, $f17 SXADDQ $21, $20, $20 nop ST $f18, 0*SIZE($20) ADD1 $f24, $f25, $f18 ST $f19, 1*SIZE($20) ADD2 $f26, $f27, $f19 SXADDQ $21, $20, $20 ADD $f16, $f12, $f16 ADD $f17, $f13, $f17 ADD $f18, $f14, $f18 ADD $f19, $f15, $f19 ST $f16, 0*SIZE($20) ST $f17, 1*SIZE($20) SXADDQ $21, $20, $20 ST $f18, 0*SIZE($20) ST $f19, 1*SIZE($20) SXADDQ $21, $20, $20 ble $5, $SubEnd .align 4 $SubRemain: subq $5, 1, $6 ble $5, $SubEnd LD $f0, 0*SIZE($18) LD $f1, 1*SIZE($18) LD $f8, 0*SIZE($20) LD $f28, 1*SIZE($20) SXADDQ $19, $18, $18 SXADDQ $21, $20, $24 ble $6, $SubRemainLoopEnd .align 4 $SubRemainLoop: MUL $f29, $f0, $f20 MUL $f30, $f1, $f21 MUL $f30, $f0, $f22 LD $f0, 0*SIZE($18) MUL $f29, $f1, $f23 LD $f1, 1*SIZE($18) ADD1 $f20, $f21, $f16 SXADDQ $19, $18, $18 ADD2 $f22, $f23, $f17 nop ADD $f16, $f8, $f16 LD $f8, 0*SIZE($24) ADD $f17, $f28, $f17 LD $f28, 1*SIZE($24) SXADDQ $21, $24, $24 subq $6, 1, $6 ST $f16, 0*SIZE($20) ST $f17, 1*SIZE($20) SXADDQ $21, $20, $20 bgt $6, $SubRemainLoop .align 4 $SubRemainLoopEnd: MUL $f29, $f0, $f20 MUL $f30, $f1, $f21 MUL $f30, $f0, $f22 MUL $f29, $f1, $f23 ADD1 $f20, $f21, $f16 ADD2 $f22, $f23, $f17 ADD $f16, $f8, $f16 ADD $f17, $f28, $f17 ST $f16, 0*SIZE($20) nop ST $f17, 1*SIZE($20) nop .align 4 $SubEnd: ldt $f2, 0($sp) ldt $f3, 8($sp) ldt $f4, 16($sp) ldt $f5, 24($sp) ldt $f6, 32($sp) ldt $f7, 40($sp) ldt $f8, 48($sp) lda $sp, 64($sp) ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/zdot.S000066400000000000000000000207621313527062700166240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define PREFETCHSIZE 88 #define N $16 #define X $17 #define INCX $18 #define Y $19 #define INCY $20 #define XX $21 #define YY $23 #define I $5 #define s0 $f0 #define s1 $f1 #define s2 $f2 #define s3 $f30 #define a0 $f10 #define a1 $f11 #define a2 $f12 #define a3 $f13 #define a4 $f14 #define a5 $f15 #define a6 $f16 #define a7 $f17 #define b0 $f18 #define b1 $f19 #define b2 $f20 #define b3 $f21 #define b4 $f22 #define b5 $f23 #define b6 $f24 #define b7 $f25 #define t0 $f26 #define t1 $f27 #define t2 $f28 #define t3 $f29 PROLOGUE PROFCODE .frame $sp, 16, $26, 0 lda $sp, -16($sp) fclr s0 stt $f2, 0($sp) fclr s1 fclr s2 addq INCX, INCX, INCX fclr s3 ble N, $L999 addq INCY, INCY, INCY fclr t0 fclr t1 fclr t2 fclr t3 srl N, 3, I ble I, $L25 LD a0, 0 * SIZE(X) LD a1, 1 * SIZE(X) LD b0, 0 * SIZE(Y) LD b1, 1 * SIZE(Y) SXADDQ INCX, X, X SXADDQ INCY, Y, Y LD a2, 0 * SIZE(X) LD a3, 1 * SIZE(X) LD b2, 0 * SIZE(Y) LD b3, 1 * SIZE(Y) SXADDQ INCX, X, X SXADDQ INCY, Y, Y LD a4, 0 * SIZE(X) LD a5, 1 * SIZE(X) LD b4, 0 * SIZE(Y) LD b5, 1 * SIZE(Y) SXADDQ INCX, X, X SXADDQ INCY, Y, Y LD a6, 0 * SIZE(X) LD b6, 0 * SIZE(Y) subq I, 1, I ble I, $L23 .align 4 $L22: ADD s0, t0, s0 LD a7, 1 * SIZE(X) MUL a0, b0, t0 LD b7, 1 * SIZE(Y) ADD s1, t1, s1 ldl $31, PREFETCHSIZE * SIZE(X) MUL a0, b1, t1 SXADDQ INCX, X, X ADD s2, t2, s2 ldl $31, PREFETCHSIZE * SIZE(Y) MUL a1, b0, t2 SXADDQ INCY, Y, Y ADD s3, t3, s3 LD a0, 0 * SIZE(X) MUL a1, b1, t3 LD a1, 1 * SIZE(X) ADD s0, t0, s0 LD b0, 0 * SIZE(Y) MUL a2, b2, t0 LD b1, 1 * SIZE(Y) ADD s1, t1, s1 SXADDQ INCX, X, X MUL a2, b3, t1 SXADDQ INCY, Y, Y ADD s2, t2, s2 unop MUL a3, b2, t2 unop ADD s3, t3, s3 LD a2, 0 * SIZE(X) MUL a3, b3, t3 LD a3, 1 * SIZE(X) ADD s0, t0, s0 LD b2, 0 * SIZE(Y) MUL a4, b4, t0 LD b3, 1 * SIZE(Y) ADD s1, t1, s1 SXADDQ INCX, X, X MUL a4, b5, t1 SXADDQ INCY, Y, Y ADD s2, t2, s2 unop MUL a5, b4, t2 unop ADD s3, t3, s3 LD a4, 0 * SIZE(X) MUL a5, b5, t3 LD a5, 1 * SIZE(X) ADD s0, t0, s0 LD b4, 0 * SIZE(Y) MUL a6, b6, t0 LD b5, 1 * SIZE(Y) ADD s1, t1, s1 SXADDQ INCX, X, X MUL a6, b7, t1 SXADDQ INCY, Y, Y ADD s2, t2, s2 unop MUL a7, b6, t2 unop ADD s3, t3, s3 LD a6, 0 * SIZE(X) MUL a7, b7, t3 LD a7, 1 * SIZE(X) ADD s0, t0, s0 LD b6, 0 * SIZE(Y) MUL a0, b0, t0 LD b7, 1 * SIZE(Y) ADD s1, t1, s1 SXADDQ INCX, X, X MUL a0, b1, t1 SXADDQ INCY, Y, Y ADD s2, t2, s2 unop MUL a1, b0, t2 unop ADD s3, t3, s3 LD a0, 0 * SIZE(X) MUL a1, b1, t3 LD a1, 1 * SIZE(X) ADD s0, t0, s0 LD b0, 0 * SIZE(Y) MUL a2, b2, t0 LD b1, 1 * SIZE(Y) ADD s1, t1, s1 SXADDQ INCX, X, X MUL a2, b3, t1 SXADDQ INCY, Y, Y ADD s2, t2, s2 unop MUL 
a3, b2, t2 unop ADD s3, t3, s3 LD a2, 0 * SIZE(X) MUL a3, b3, t3 LD a3, 1 * SIZE(X) ADD s0, t0, s0 LD b2, 0 * SIZE(Y) MUL a4, b4, t0 LD b3, 1 * SIZE(Y) ADD s1, t1, s1 SXADDQ INCX, X, X MUL a4, b5, t1 SXADDQ INCY, Y, Y ADD s2, t2, s2 unop MUL a5, b4, t2 subq I, 1, I ADD s3, t3, s3 LD a4, 0 * SIZE(X) MUL a5, b5, t3 LD a5, 1 * SIZE(X) ADD s0, t0, s0 LD b4, 0 * SIZE(Y) MUL a6, b6, t0 LD b5, 1 * SIZE(Y) ADD s1, t1, s1 SXADDQ INCX, X, X MUL a6, b7, t1 SXADDQ INCY, Y, Y ADD s2, t2, s2 LD a6, 0 * SIZE(X) MUL a7, b6, t2 unop ADD s3, t3, s3 LD b6, 0 * SIZE(Y) MUL a7, b7, t3 bgt I, $L22 .align 4 $L23: ADD s0, t0, s0 LD a7, 1 * SIZE(X) MUL a0, b0, t0 LD b7, 1 * SIZE(Y) ADD s1, t1, s1 SXADDQ INCX, X, X MUL a0, b1, t1 SXADDQ INCY, Y, Y ADD s2, t2, s2 unop MUL a1, b0, t2 unop ADD s3, t3, s3 LD a0, 0 * SIZE(X) MUL a1, b1, t3 LD a1, 1 * SIZE(X) ADD s0, t0, s0 LD b0, 0 * SIZE(Y) MUL a2, b2, t0 LD b1, 1 * SIZE(Y) ADD s1, t1, s1 SXADDQ INCX, X, X MUL a2, b3, t1 SXADDQ INCY, Y, Y ADD s2, t2, s2 unop MUL a3, b2, t2 unop ADD s3, t3, s3 LD a2, 0 * SIZE(X) MUL a3, b3, t3 LD a3, 1 * SIZE(X) ADD s0, t0, s0 LD b2, 0 * SIZE(Y) MUL a4, b4, t0 LD b3, 1 * SIZE(Y) ADD s1, t1, s1 SXADDQ INCX, X, X MUL a4, b5, t1 SXADDQ INCY, Y, Y ADD s2, t2, s2 unop MUL a5, b4, t2 unop ADD s3, t3, s3 LD a4, 0 * SIZE(X) MUL a5, b5, t3 LD a5, 1 * SIZE(X) ADD s0, t0, s0 LD b4, 0 * SIZE(Y) MUL a6, b6, t0 LD b5, 1 * SIZE(Y) ADD s1, t1, s1 SXADDQ INCX, X, X MUL a6, b7, t1 SXADDQ INCY, Y, Y ADD s2, t2, s2 unop MUL a7, b6, t2 unop ADD s3, t3, s3 LD a6, 0 * SIZE(X) MUL a7, b7, t3 LD a7, 1 * SIZE(X) ADD s0, t0, s0 LD b6, 0 * SIZE(Y) MUL a0, b0, t0 LD b7, 1 * SIZE(Y) ADD s1, t1, s1 SXADDQ INCX, X, X MUL a0, b1, t1 SXADDQ INCY, Y, Y ADD s2, t2, s2 MUL a1, b0, t2 ADD s3, t3, s3 MUL a1, b1, t3 ADD s0, t0, s0 MUL a2, b2, t0 ADD s1, t1, s1 MUL a2, b3, t1 ADD s2, t2, s2 MUL a3, b2, t2 ADD s3, t3, s3 MUL a3, b3, t3 ADD s0, t0, s0 MUL a4, b4, t0 ADD s1, t1, s1 MUL a4, b5, t1 ADD s2, t2, s2 MUL a5, b4, t2 ADD s3, t3, s3 MUL a5, b5, t3 ADD s0, t0, s0 MUL a6, b6, t0 ADD s1, t1, s1 MUL a6, b7, t1 ADD s2, t2, s2 MUL a7, b6, t2 ADD s3, t3, s3 MUL a7, b7, t3 .align 4 $L25: and N, 7, I unop unop ble I, $L998 LD a0, 0 * SIZE(X) LD a1, 1 * SIZE(X) LD b0, 0 * SIZE(Y) LD b1, 1 * SIZE(Y) SXADDQ INCX, X, X subq I, 1, I SXADDQ INCY, Y, Y ble I, $L28 .align 4 $L26: ADD s0, t0, s0 mov X, XX MUL a0, b0, t0 mov Y, YY ADD s1, t1, s1 SXADDQ INCX, X, X MUL a0, b1, t1 SXADDQ INCY, Y, Y ADD s2, t2, s2 LD a0, 0 * SIZE(XX) MUL a1, b0, t2 LD b0, 0 * SIZE(YY) ADD s3, t3, s3 subq I, 1, I MUL a1, b1, t3 LD a1, 1 * SIZE(XX) LD b1, 1 * SIZE(YY) bgt I, $L26 .align 4 $L28: ADD s0, t0, s0 MUL a0, b0, t0 ADD s1, t1, s1 MUL a0, b1, t1 ADD s2, t2, s2 MUL a1, b0, t2 ADD s3, t3, s3 MUL a1, b1, t3 .align 4 $L998: ADD s0, t0, s0 ADD s1, t1, s1 ADD s2, t2, s2 ADD s3, t3, s3 #ifndef CONJ SUB s0, s3, s0 ADD s1, s2, s1 #else ADD s0, s3, s0 SUB s1, s2, s1 #endif .align 4 $L999: ldt $f2, 0($sp) lda $sp, 16($sp) ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/zgemm_beta.S000066400000000000000000000114671313527062700177600ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" .set noat .set noreorder .text .align 5 .globl CNAME .ent CNAME CNAME: .frame $sp, 0, $26, 0 #ifdef PROFILE ldgp $gp, 0($27) lda $28, _mcount jsr $28, ($28), _mcount .prologue 1 #else .prologue 0 #endif ldq $18, 24($sp) ble $16, $End ldl $19, 32($sp) ble $17, $End addq $19, $19, $19 fbne $f19,$Main fbne $f20,$Main .align 4 $L13: mov $18, $1 lda $17, -1($17) SXADDQ $19, $18, $18 mov $16, $2 .align 4 $L12: ST $f31, 0*SIZE($1) ST $f31, 1*SIZE($1) lda $2, -1($2) lda $1, 2*SIZE($1) bgt $2, $L12 bgt $17,$L13 clr $0 ret .align 4 /* Main Routine */ $Main: sra $16, 1, $2 # $2 = (m >> 1) mov $18, $1 # c_offset = c lda $17, -1($17) # n -- SXADDQ $19, $18, $18 # c += ldc beq $2, $L18 LD $f14, 0*SIZE($1) LD $f15, 1*SIZE($1) LD $f24, 2*SIZE($1) LD $f25, 3*SIZE($1) lda $2, -1($2) # $2 -- ble $2, $L19 .align 4 $L23: MUL $f19, $f14, $f10 lds $f31, 9*SIZE($1) MUL $f20, $f15, $f11 lda $2, -1($2) MUL $f19, $f15, $f12 LD $f15, 5*SIZE($1) MUL $f20, $f14, $f13 LD $f14, 4*SIZE($1) MUL $f19, $f24, $f16 unop MUL $f20, $f25, $f17 unop MUL $f19, $f25, $f18 LD $f25, 7*SIZE($1) SUB $f10, $f11, $f22 unop MUL $f20, $f24, $f21 LD $f24, 6*SIZE($1) ADD $f12, $f13, $f23 lda $1, 4*SIZE($1) SUB $f16, $f17, $f26 ADD $f18, $f21, $f27 ST $f22,-4*SIZE($1) ST $f23,-3*SIZE($1) ST $f26,-2*SIZE($1) ST $f27,-1*SIZE($1) unop bgt $2,$L23 .align 4 $L19: MUL $f19, $f14, $f10 MUL $f20, $f15, $f11 MUL $f19, $f15, $f12 MUL $f20, $f14, $f13 MUL $f19, $f24, $f16 MUL $f20, $f25, $f17 MUL $f19, $f25, $f18 MUL $f20, $f24, $f21 SUB $f10, $f11, $f22 ADD $f12, $f13, $f23 SUB $f16, $f17, $f26 ADD $f18, $f21, $f27 lda $1, 4*SIZE($1) ST $f22, -4*SIZE($1) ST $f23, -3*SIZE($1) ST $f26, -2*SIZE($1) ST $f27, -1*SIZE($1) blbs $16, $L18 bgt $17, $Main clr $0 ret .align 4 $L18: LD $f14, 0*SIZE($1) LD $f15, 1*SIZE($1) MUL $f19, $f15, $f13 MUL $f20, $f14, $f10 MUL $f19, $f14, $f12 MUL $f20, $f15, $f11 ADD $f13, $f10, $f26 SUB $f12, $f11, $f27 ST $f26, 1*SIZE($1) ST $f27, 
0*SIZE($1) lda $1, 2*SIZE($1) bgt $17, $Main .align 4 $End: clr $0 ret .ident VERSION .end CNAME OpenBLAS-0.2.20/kernel/alpha/zgemm_kernel_2x2.S000066400000000000000000000625751313527062700210260ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #if !defined(EV4) && !defined(EV5) && !defined(EV6) #error "Architecture is not specified." 
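/* Reference sketch (illustrative only, not part of the tuned kernel that
   follows): this file produces 2x2 blocks of C <- alpha*op(A)*op(B) + C for
   double-complex operands on the plain GEMM path (the TRMMKERNEL variant
   skips the accumulation into C).  Each complex multiply is expanded into
   four scalar multiplies whose signs are selected by the ADD1..ADD4 macros
   defined below, so one instruction stream serves the different conjugation
   variants.  A plain-C equivalent of the non-conjugated case, assuming
   column-major storage with interleaved re/im pairs (hypothetical helper,
   not referenced anywhere in the build):

       static void zgemm_ref_nn(long m, long n, long k,
                                double alpha_r, double alpha_i,
                                const double *a, long lda,
                                const double *b, long ldb,
                                double *c, long ldc)
       {
           for (long j = 0; j < n; j++)
               for (long i = 0; i < m; i++) {
                   double sr = 0.0, si = 0.0;
                   for (long l = 0; l < k; l++) {
                       double ar = a[2*(i + l*lda)], ai = a[2*(i + l*lda) + 1];
                       double br = b[2*(l + j*ldb)], bi = b[2*(l + j*ldb) + 1];
                       sr += ar*br - ai*bi;
                       si += ar*bi + ai*br;
                   }
                   c[2*(i + j*ldc)]     += alpha_r*sr - alpha_i*si;
                   c[2*(i + j*ldc) + 1] += alpha_r*si + alpha_i*sr;
               }
       }
*/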
#endif #ifdef EV6 #define PREFETCHSIZE 56 #define UNOP unop #endif #ifdef EV5 #define PREFETCHSIZE 48 #define UNOP #endif #ifdef EV4 #define UNOP #endif .set noat .set noreorder .arch ev6 .text .align 5 .globl CNAME .ent CNAME #define STACKSIZE 80 #define M $16 #define N $17 #define K $18 #define A $21 #define B $22 #define C $20 #define LDC $23 #define C1 $19 #define C2 $24 #define AO $at #define BO $5 #define I $6 #define J $7 #define L $8 #define a1 $f16 #define a2 $f17 #define a3 $f18 #define a4 $f19 #define b1 $f20 #define b2 $f21 #define b3 $f22 #define b4 $f23 #define t1 $f24 #define t2 $f25 #define t3 $f26 #define t4 $f27 #define a5 $f28 #define a6 $f30 #define b5 $f29 #define alpha_i $f29 #define alpha_r $f30 #define c01 $f0 #define c02 $f1 #define c03 $f2 #define c04 $f3 #define c05 $f4 #define c06 $f5 #define c07 $f6 #define c08 $f7 #define c09 $f8 #define c10 $f9 #define c11 $f10 #define c12 $f11 #define c13 $f12 #define c14 $f13 #define c15 $f14 #define c16 $f15 #define TMP1 $0 #define TMP2 $1 #define KK $2 #define BB $3 #define OFFSET $4 #define ALPHA_R 64($sp) #define ALPHA_I 72($sp) #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADD1 ADD #define ADD2 SUB #define ADD3 ADD #define ADD4 ADD #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADD1 ADD #define ADD2 ADD #define ADD3 SUB #define ADD4 ADD #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define ADD1 ADD #define ADD2 ADD #define ADD3 ADD #define ADD4 SUB #else #define ADD1 ADD #define ADD2 SUB #define ADD3 SUB #define ADD4 SUB #endif CNAME: .frame $sp, STACKSIZE, $26, 0 #ifdef PROFILE ldgp $gp, 0($27) lda $at, _mcount jsr $at, ($at), _mcount #endif #ifndef PROFILE .prologue 0 #else .prologue 1 #endif lda $sp, -STACKSIZE($sp) ldq B, 0 + STACKSIZE($sp) ldq C, 8 + STACKSIZE($sp) ldq LDC, 16 + STACKSIZE($sp) #ifdef TRMMKERNEL ldq OFFSET, 24 + STACKSIZE($sp) #endif sll LDC, ZBASE_SHIFT, LDC stt $f2, 0($sp) stt $f3, 8($sp) stt $f4, 16($sp) stt $f5, 24($sp) stt $f6, 32($sp) stt $f7, 40($sp) stt $f8, 48($sp) stt $f9, 56($sp) stt $f19, ALPHA_R stt $f20, ALPHA_I cmple M, 0, $0 cmple N, 0, $1 cmple K, 0, $2 or $0, $1, $0 or $0, $2, $0 bne $0, $L999 #if defined(TRMMKERNEL) && !defined(LEFT) subq $31, OFFSET, KK #endif sra N, 1, J ble J, $L30 .align 4 $L01: mov C, C1 addq C, LDC, C2 mov A, AO s4addq K, 0, BB #if defined(TRMMKERNEL) && defined(LEFT) mov OFFSET, KK #endif SXADDQ BB, B, BB addq C2, LDC, C unop sra M, 1, I fclr t1 fclr t2 fclr t3 fclr t4 fclr c01 fclr c05 ble I, $L20 .align 4 $L11: #ifndef EV4 ldl $31, 0 * SIZE(BB) ldl $31, 8 * SIZE(BB) unop lda BB, 16 * SIZE(BB) #endif #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #ifdef TRMMKERNEL #ifdef LEFT addq KK, 2, TMP1 #else addq KK, 2, TMP1 #endif #endif LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(B) fclr c10 LD b2, 1 * SIZE(B) fclr c14 LD b3, 2 * SIZE(B) fclr c03 LD b4, 3 * SIZE(B) fclr c07 lda BO, 4 * SIZE(B) fclr c11 lda AO, 4 * SIZE(AO) fclr c15 lds $f31, 4 * SIZE(C1) fclr c04 #ifndef TRMMKERNEL lda L, -2(K) #else lda L, -2(TMP1) #endif fclr c08 lds $f31, 4 * SIZE(C2) fclr c12 fclr c16 ble L, $L15 #else sll KK, ZBASE_SHIFT + 1, TMP1 addq AO, TMP1, AO addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(BO) fclr 
c10 LD b2, 1 * SIZE(BO) fclr c14 LD b3, 2 * SIZE(BO) fclr c03 LD b4, 3 * SIZE(BO) fclr c07 lda BO, 4 * SIZE(BO) fclr c11 lda AO, 4 * SIZE(AO) fclr c15 lds $f31, 4 * SIZE(C1) fclr c04 lda L, -2(TMP1) fclr c08 lds $f31, 4 * SIZE(C2) fclr c12 fclr c16 ble L, $L15 #endif .align 5 $L12: /* 1 */ ADD1 c11, t1, c11 #ifndef EV4 ldq $31, PREFETCHSIZE * SIZE(AO) #else unop #endif MUL b1, a1, t1 #ifndef EV4 ldl $31, PREFETCHSIZE * SIZE(BO) #else unop #endif ADD3 c12, t2, c12 unop MUL b1, a2, t2 unop ADD2 c16, t3, c16 unop MUL b2, a2, t3 LD a5, 0 * SIZE(AO) ADD4 c15, t4, c15 unop MUL b2, a1, t4 LD b5, 0 * SIZE(BO) /* 2 */ ADD1 c01, t1, c01 UNOP MUL b1, a3, t1 UNOP ADD3 c02, t2, c02 UNOP MUL b1, a4, t2 UNOP ADD2 c06, t3, c06 unop MUL b2, a4, t3 unop ADD4 c05, t4, c05 unop MUL b4, a1, t4 unop /* 3 */ ADD1 c03, t1, c03 unop MUL b3, a1, t1 unop ADD3 c04, t2, c04 unop MUL b3, a2, t2 unop ADD2 c08, t3, c08 unop MUL b4, a2, t3 LD a2, 1 * SIZE(AO) ADD4 c13, t4, c13 unop MUL b2, a3, t4 LD b2, 1 * SIZE(BO) /* 4 */ ADD1 c09, t1, c09 unop MUL b3, a3, t1 LD a6, 2 * SIZE(AO) ADD3 c10, t2, c10 unop MUL b3, a4, t2 LD b3, 2 * SIZE(BO) ADD2 c14, t3, c14 unop MUL b4, a4, t3 LD a4, 3 * SIZE(AO) ADD4 c07, t4, c07 unop MUL b4, a3, t4 LD b4, 3 * SIZE(BO) /* 5 */ ADD1 c11, t1, c11 unop MUL b5, a5, t1 LD a1, 4 * SIZE(AO) ADD3 c12, t2, c12 lda L, -2(L) MUL b5, a2, t2 LD b1, 4 * SIZE(BO) ADD2 c16, t3, c16 unop MUL b2, a2, t3 unop ADD4 c15, t4, c15 unop MUL b2, a5, t4 unop /* 6 */ ADD1 c01, t1, c01 unop MUL b5, a6, t1 unop ADD3 c02, t2, c02 unop MUL b5, a4, t2 unop ADD2 c06, t3, c06 unop MUL b2, a4, t3 unop ADD4 c05, t4, c05 unop MUL b4, a5, t4 unop /* 7 */ ADD1 c03, t1, c03 lda AO, 8 * SIZE(AO) MUL b3, a5, t1 unop ADD3 c04, t2, c04 lda BO, 8 * SIZE(BO) MUL b3, a2, t2 unop ADD2 c08, t3, c08 unop MUL b4, a2, t3 LD a2, -3 * SIZE(AO) ADD4 c13, t4, c13 unop MUL b2, a6, t4 LD b2, -3 * SIZE(BO) /* 8 */ ADD1 c09, t1, c09 unop MUL b3, a6, t1 LD a3, -2 * SIZE(AO) ADD3 c10, t2, c10 unop MUL b3, a4, t2 LD b3, -2 * SIZE(BO) ADD2 c14, t3, c14 unop MUL b4, a4, t3 LD a4, -1 * SIZE(AO) ADD4 c07, t4, c07 MUL b4, a6, t4 LD b4, -1 * SIZE(BO) bgt L, $L12 .align 4 $L15: ADD1 c11, t1, c11 ldt alpha_r, ALPHA_R MUL b1, a1, t1 #ifndef TRMMKERNEL blbs K, $L18 #else blbs TMP1, $L18 #endif .align 4 ADD3 c12, t2, c12 MUL b1, a2, t2 ADD2 c16, t3, c16 MUL b2, a2, t3 ADD4 c15, t4, c15 MUL b2, a1, t4 ADD1 c01, t1, c01 MUL b1, a3, t1 ADD3 c02, t2, c02 unop MUL b1, a4, t2 LD b1, 0 * SIZE(BO) ADD2 c06, t3, c06 MUL b2, a4, t3 ADD4 c05, t4, c05 MUL b4, a1, t4 ADD1 c03, t1, c03 unop MUL b3, a1, t1 LD a1, 0 * SIZE(AO) ADD3 c04, t2, c04 unop MUL b3, a2, t2 unop ADD2 c08, t3, c08 unop MUL b4, a2, t3 LD a2, 1 * SIZE(AO) ADD4 c13, t4, c13 unop MUL b2, a3, t4 LD b2, 1 * SIZE(BO) ADD1 c09, t1, c09 unop MUL b3, a3, t1 lda AO, 4 * SIZE(AO) ADD3 c10, t2, c10 unop MUL b3, a4, t2 LD b3, 2 * SIZE(BO) ADD2 c14, t3, c14 unop MUL b4, a4, t3 LD a4, -1 * SIZE(AO) ADD4 c07, t4, c07 unop MUL b4, a3, t4 LD a3, -2 * SIZE(AO) ADD1 c11, t1, c11 LD b4, 3 * SIZE(BO) MUL b1, a1, t1 lda BO, 4 * SIZE(BO) .align 4 $L18: ADD3 c12, t2, c12 unop MUL b1, a2, t2 ldt alpha_i, ALPHA_I ADD2 c16, t3, c16 unop MUL b2, a2, t3 #ifndef TRMMKERNEL LD a5, 0 * SIZE(C1) #else unop #endif ADD4 c15, t4, c15 MUL b2, a1, t4 ADD1 c01, t1, c01 MUL b1, a3, t1 ADD3 c02, t2, c02 unop MUL b1, a4, t2 #ifndef TRMMKERNEL LD b1, 1 * SIZE(C1) #else unop #endif ADD2 c06, t3, c06 MUL b2, a4, t3 ADD4 c05, t4, c05 MUL b4, a1, t4 ADD1 c03, t1, c03 unop MUL b3, a1, t1 #ifndef TRMMKERNEL LD a1, 2 * SIZE(C1) #else unop #endif ADD3 c04, 
t2, c04 unop MUL b3, a2, t2 unop ADD2 c08, t3, c08 unop MUL b4, a2, t3 #ifndef TRMMKERNEL LD a2, 3 * SIZE(C1) #else unop #endif ADD4 c13, t4, c13 unop MUL b2, a3, t4 #ifndef TRMMKERNEL LD b2, 0 * SIZE(C2) #else unop #endif ADD1 c09, t1, c09 lda I, -1(I) MUL b3, a3, t1 unop ADD3 c10, t2, c10 unop MUL b3, a4, t2 #ifndef TRMMKERNEL LD b3, 1 * SIZE(C2) #else unop #endif ADD2 c14, t3, c14 unop MUL b4, a4, t3 #ifndef TRMMKERNEL LD a4, 2 * SIZE(C2) #else unop #endif ADD4 c07, t4, c07 unop MUL b4, a3, t4 #ifndef TRMMKERNEL LD a3, 3 * SIZE(C2) #else unop #endif ADD1 c11, t1, c11 ADD3 c12, t2, c12 ADD2 c16, t3, c16 ADD4 c15, t4, c15 ADD c01, c06, c01 ADD c02, c05, c02 ADD c03, c08, c03 ADD c04, c07, c04 ADD c09, c14, c09 MUL alpha_r, c01, t1 ADD c10, c13, c10 MUL alpha_r, c02, t2 ADD c11, c16, c11 MUL alpha_r, c03, t3 ADD c12, c15, c12 MUL alpha_r, c04, t4 #ifndef TRMMKERNEL ADD a5, t1, a5 MUL alpha_i, c02, t1 ADD b1, t2, b1 MUL alpha_i, c01, t2 ADD a1, t3, a1 MUL alpha_i, c04, t3 ADD a2, t4, a2 MUL alpha_i, c03, t4 #else ADD $f31, t1, a5 MUL alpha_i, c02, t1 ADD $f31, t2, b1 MUL alpha_i, c01, t2 ADD $f31, t3, a1 MUL alpha_i, c04, t3 ADD $f31, t4, a2 MUL alpha_i, c03, t4 #endif SUB a5, t1, a5 MUL alpha_r, c09, t1 ADD b1, t2, b1 MUL alpha_r, c10, t2 SUB a1, t3, a1 MUL alpha_r, c11, t3 ADD a2, t4, a2 MUL alpha_r, c12, t4 #ifndef TRMMKERNEL ADD b2, t1, b2 MUL alpha_i, c10, t1 ADD b3, t2, b3 MUL alpha_i, c09, t2 ADD a4, t3, a4 MUL alpha_i, c12, t3 ADD a3, t4, a3 MUL alpha_i, c11, t4 #else ADD $f31, t1, b2 MUL alpha_i, c10, t1 ADD $f31, t2, b3 MUL alpha_i, c09, t2 ADD $f31, t3, a4 MUL alpha_i, c12, t3 ADD $f31, t4, a3 MUL alpha_i, c11, t4 #endif SUB b2, t1, b2 ST a5, 0 * SIZE(C1) fclr t1 unop ADD b3, t2, b3 ST b1, 1 * SIZE(C1) fclr t2 unop SUB a4, t3, a4 ST a1, 2 * SIZE(C1) fclr t3 unop ADD a3, t4, a3 ST a2, 3 * SIZE(C1) fclr t4 unop ST b2, 0 * SIZE(C2) fclr c01 ST b3, 1 * SIZE(C2) fclr c05 ST a4, 2 * SIZE(C2) lda C1, 4 * SIZE(C1) ST a3, 3 * SIZE(C2) lda C2, 4 * SIZE(C2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) subq K, KK, TMP1 #ifdef LEFT subq TMP1, 2, TMP1 #else subq TMP1, 2, TMP1 #endif sll TMP1, ZBASE_SHIFT + 1, TMP1 addq AO, TMP1, AO addq BO, TMP1, BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq KK, 2, KK #endif bgt I, $L11 .align 4 $L20: and M, 1, I ble I, $L29 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #ifdef TRMMKERNEL #ifdef LEFT addq KK, 1, TMP1 #else addq KK, 2, TMP1 #endif #endif LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(B) fclr c10 LD b2, 1 * SIZE(B) fclr c14 LD b3, 2 * SIZE(B) lda AO, 2 * SIZE(AO) LD b4, 3 * SIZE(B) lda BO, 4 * SIZE(B) #ifndef TRMMKERNEL lda L, -2(K) #else lda L, -2(TMP1) #endif ble L, $L25 #else sll KK, ZBASE_SHIFT + 0, TMP1 addq AO, TMP1, AO sll KK, ZBASE_SHIFT + 1, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(BO) fclr c10 LD b2, 1 * SIZE(BO) fclr c14 LD b3, 2 * SIZE(BO) lda AO, 2 * SIZE(AO) LD b4, 3 * SIZE(BO) lda BO, 4 * SIZE(BO) lda L, -2(TMP1) ble L, $L25 #endif .align 5 $L22: ADD1 c09, t1, c09 unop MUL a1, b1, t1 unop ADD3 c10, t2, c10 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD4 c13, t3, c13 unop MUL a1, b2, t3 lda BO, 8 * SIZE(BO) ADD2 c14, t4, c14 unop 
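/* Note on the accumulation pattern used in this loop: every complex
   multiply-accumulate is kept as four scalar partial sums (c01/c02/c05/c06
   for one output element, c09/c10/c13/c14 for the other), and the products
   formed into t1..t4 here are only folded into those accumulators in the
   following group, which hides the floating-point latency.  Per k step this
   amounts to, in scalar terms (plain case; signs vary with the ADDn macros):
       re += a_re*b_re   and   re -= a_im*b_im
       im += a_re*b_im   and   im += a_im*b_re
   with the two halves of each sum recombined once at the loop exit ($L28).
*/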
MUL a2, b2, t4 LD b2, -7 * SIZE(BO) ADD1 c01, t1, c01 unop MUL a1, b3, t1 unop ADD3 c02, t2, c02 unop MUL a2, b3, t2 LD b3, -6 * SIZE(BO) ADD4 c05, t3, c05 unop MUL a1, b4, t3 LD a1, 2 * SIZE(AO) ADD2 c06, t4, c06 MUL a2, b4, t4 LD b5, -5 * SIZE(BO) ADD1 c09, t1, c09 unop MUL a3, b1, t1 LD a2, 3 * SIZE(AO) ADD3 c10, t2, c10 unop MUL a4, b1, t2 LD b1, -4 * SIZE(BO) ADD4 c13, t3, c13 unop MUL a3, b2, t3 lda AO, 4 * SIZE(AO) ADD2 c14, t4, c14 MUL a4, b2, t4 LD b2, -3 * SIZE(BO) ADD1 c01, t1, c01 lda L, -2(L) MUL a3, b3, t1 LD b4, -1 * SIZE(BO) ADD3 c02, t2, c02 unop MUL a4, b3, t2 LD b3, -2 * SIZE(BO) ADD4 c05, t3, c05 unop MUL a3, b5, t3 LD a3, 0 * SIZE(AO) ADD2 c06, t4, c06 MUL a4, b5, t4 LD a4, 1 * SIZE(AO) bgt L, $L22 .align 4 $L25: ADD1 c09, t1, c09 ldt alpha_r, ALPHA_R MUL a1, b1, t1 #ifndef TRMMKERNEL blbs K, $L28 #else blbs TMP1, $L28 #endif .align 4 ADD3 c10, t2, c10 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD4 c13, t3, c13 unop MUL a1, b2, t3 unop ADD2 c14, t4, c14 unop MUL a2, b2, t4 LD b2, 1 * SIZE(BO) ADD1 c01, t1, c01 unop MUL a1, b3, t1 lda AO, 2 * SIZE(AO) ADD3 c02, t2, c02 unop MUL a2, b3, t2 LD b3, 2 * SIZE(BO) ADD4 c05, t3, c05 unop MUL a1, b4, t3 LD a1, -2 * SIZE(AO) ADD2 c06, t4, c06 unop MUL a2, b4, t4 LD a2, -1 * SIZE(AO) ADD1 c09, t1, c09 LD b4, 3 * SIZE(BO) MUL a1, b1, t1 lda BO, 4 * SIZE(BO) .align 4 $L28: ADD3 c10, t2, c10 unop MUL a2, b1, t2 ldt alpha_i, ALPHA_I ADD4 c13, t3, c13 unop MUL a1, b2, t3 #ifndef TRMMKERNEL LD c03, 0 * SIZE(C1) #else unop #endif ADD2 c14, t4, c14 unop MUL a2, b2, t4 #ifndef TRMMKERNEL LD c04, 1 * SIZE(C1) #else unop #endif ADD1 c01, t1, c01 unop MUL a1, b3, t1 #ifndef TRMMKERNEL LD c11, 0 * SIZE(C2) #else unop #endif ADD3 c02, t2, c02 unop MUL a2, b3, t2 #ifndef TRMMKERNEL LD c12, 1 * SIZE(C2) #else unop #endif ADD4 c05, t3, c05 MUL a1, b4, t3 ADD2 c06, t4, c06 MUL a2, b4, t4 ADD1 c09, t1, c09 ADD3 c10, t2, c10 ADD4 c13, t3, c13 ADD2 c14, t4, c14 ADD c01, c06, c01 ADD c02, c05, c02 ADD c09, c14, c09 ADD c10, c13, c10 MUL alpha_r, c01, t1 MUL alpha_r, c02, t2 MUL alpha_r, c09, t3 MUL alpha_r, c10, t4 #ifndef TRMMKERNEL ADD c03, t1, c03 MUL alpha_i, c02, t1 ADD c04, t2, c04 MUL alpha_i, c01, t2 ADD c11, t3, c11 MUL alpha_i, c10, t3 ADD c12, t4, c12 MUL alpha_i, c09, t4 #else ADD $f31, t1, c03 MUL alpha_i, c02, t1 ADD $f31, t2, c04 MUL alpha_i, c01, t2 ADD $f31, t3, c11 MUL alpha_i, c10, t3 ADD $f31, t4, c12 MUL alpha_i, c09, t4 #endif SUB c03, t1, c03 ADD c04, t2, c04 SUB c11, t3, c11 ADD c12, t4, c12 ST c03, 0 * SIZE(C1) ST c04, 1 * SIZE(C1) ST c11, 0 * SIZE(C2) ST c12, 1 * SIZE(C2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) subq K, KK, TMP1 #ifdef LEFT subq TMP1, 1, TMP1 #else subq TMP1, 2, TMP1 #endif sll TMP1, ZBASE_SHIFT + 0, TMP2 addq AO, TMP2, AO sll TMP1, ZBASE_SHIFT + 1, TMP2 addq BO, TMP2, BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq KK, 1, KK #endif .align 4 $L29: mov BO, B lda J, -1(J) #if defined(TRMMKERNEL) && !defined(LEFT) addq KK, 2, KK #else unop #endif bgt J, $L01 .align 4 $L30: and N, 1, J ble J, $L999 mov C, C1 mov A, AO #if defined(TRMMKERNEL) && defined(LEFT) mov OFFSET, KK #endif sra M, 1, I ble I, $L50 .align 4 $L41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #ifdef TRMMKERNEL #ifdef LEFT addq KK, 2, TMP1 #else addq KK, 1, TMP1 #endif #endif LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * 
SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c05 LD b3, 2 * SIZE(B) fclr c02 LD b4, 3 * SIZE(B) fclr c06 lda BO, 2 * SIZE(B) fclr c03 lda AO, 4 * SIZE(AO) fclr c07 #ifndef TRMMKERNEL lda L, -2(K) #else lda L, -2(TMP1) #endif fclr c04 fclr c08 ble L, $L45 #else sll KK, ZBASE_SHIFT + 1, TMP1 addq AO, TMP1, AO sll KK, ZBASE_SHIFT + 0, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c05 LD b3, 2 * SIZE(BO) fclr c02 LD b4, 3 * SIZE(BO) fclr c06 lda BO, 2 * SIZE(BO) fclr c03 lda AO, 4 * SIZE(AO) fclr c07 lda L, -2(TMP1) fclr c04 fclr c08 ble L, $L45 #endif .align 5 $L42: ADD4 c05, t1, c05 unop MUL a1, b1, t1 unop ADD2 c06, t2, c06 lda L, -2(L) MUL a2, b1, t2 unop ADD4 c07, t3, c07 unop MUL a3, b1, t3 unop ADD2 c08, t4, c08 unop MUL a4, b1, t4 LD b1, 2 * SIZE(BO) ADD1 c01, t1, c01 unop MUL a1, b2, t1 LD a1, 0 * SIZE(AO) ADD3 c02, t2, c02 lda BO, 4 * SIZE(BO) MUL a2, b2, t2 LD a2, 1 * SIZE(AO) ADD1 c03, t3, c03 unop MUL a3, b2, t3 LD a3, 2 * SIZE(AO) ADD3 c04, t4, c04 unop MUL a4, b2, t4 LD a5, 3 * SIZE(AO) ADD4 c05, t1, c05 unop MUL a1, b3, t1 LD b2, -1 * SIZE(BO) ADD2 c06, t2, c06 unop MUL a2, b3, t2 unop ADD4 c07, t3, c07 unop MUL a3, b3, t3 lda AO, 8 * SIZE(AO) ADD2 c08, t4, c08 unop MUL a5, b3, t4 LD b3, 0 * SIZE(BO) ADD1 c01, t1, c01 unop MUL a1, b4, t1 LD a1, -4 * SIZE(AO) ADD3 c02, t2, c02 unop MUL a2, b4, t2 LD a2, -3 * SIZE(AO) ADD1 c03, t3, c03 LD a4, -1 * SIZE(AO) MUL a3, b4, t3 LD a3, -2 * SIZE(AO) ADD3 c04, t4, c04 MUL a5, b4, t4 LD b4, 1 * SIZE(BO) bgt L, $L42 .align 4 $L45: ADD4 c05, t1, c05 ldt alpha_r, ALPHA_R MUL b1, a1, t1 #ifndef TRMMKERNEL blbs K, $L48 #else blbs TMP1, $L48 #endif .align 4 ADD2 c06, t2, c06 MUL a2, b1, t2 ADD4 c07, t3, c07 MUL a3, b1, t3 ADD2 c08, t4, c08 unop MUL a4, b1, t4 LD b1, 0 * SIZE(BO) ADD1 c01, t1, c01 unop MUL a1, b2, t1 LD a1, 0 * SIZE(AO) ADD3 c02, t2, c02 unop MUL a2, b2, t2 LD a2, 1 * SIZE(AO) ADD1 c03, t3, c03 unop MUL a3, b2, t3 LD a3, 2 * SIZE(AO) ADD3 c04, t4, c04 MUL a4, b2, t4 LD a4, 3 * SIZE(AO) lda AO, 4 * SIZE(AO) ADD4 c05, t1, c05 LD b2, 1 * SIZE(BO) MUL a1, b1, t1 lda BO, 2 * SIZE(BO) .align 4 $L48: ADD2 c06, t2, c06 unop MUL a2, b1, t2 ldt alpha_i, ALPHA_I ADD4 c07, t3, c07 lda I, -1(I) MUL a3, b1, t3 #ifndef TRMMKERNEL LD c09, 0 * SIZE(C1) #else unop #endif ADD2 c08, t4, c08 unop MUL a4, b1, t4 #ifndef TRMMKERNEL LD c10, 1 * SIZE(C1) #else unop #endif ADD1 c01, t1, c01 unop MUL a1, b2, t1 #ifndef TRMMKERNEL LD c11, 2 * SIZE(C1) #else unop #endif ADD3 c02, t2, c02 unop MUL a2, b2, t2 #ifndef TRMMKERNEL LD c12, 3 * SIZE(C1) #else unop #endif ADD1 c03, t3, c03 MUL a3, b2, t3 ADD3 c04, t4, c04 MUL a4, b2, t4 ADD4 c05, t1, c05 ADD2 c06, t2, c06 ADD4 c07, t3, c07 ADD2 c08, t4, c08 ADD c01, c06, c01 ADD c02, c05, c02 ADD c03, c08, c03 ADD c04, c07, c04 MUL alpha_r, c01, t1 MUL alpha_r, c02, t2 MUL alpha_r, c03, t3 MUL alpha_r, c04, t4 #ifndef TRMMKERNEL ADD c09, t1, c09 MUL alpha_i, c02, t1 ADD c10, t2, c10 MUL alpha_i, c01, t2 ADD c11, t3, c11 MUL alpha_i, c04, t3 ADD c12, t4, c12 MUL alpha_i, c03, t4 #else ADD $f31, t1, c09 MUL alpha_i, c02, t1 ADD $f31, t2, c10 MUL alpha_i, c01, t2 ADD $f31, t3, c11 MUL alpha_i, c04, t3 ADD $f31, t4, c12 MUL alpha_i, c03, t4 #endif SUB c09, t1, c09 ADD c10, t2, c10 SUB c11, t3, c11 ADD c12, t4, c12 ST c09, 0 * SIZE(C1) ST c10, 1 * SIZE(C1) ST c11, 2 * SIZE(C1) ST c12, 3 * SIZE(C1) lda C1, 4 * 
SIZE(C1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) subq K, KK, TMP1 #ifdef LEFT subq TMP1, 2, TMP1 #else subq TMP1, 1, TMP1 #endif sll TMP1, ZBASE_SHIFT + 1, TMP2 addq AO, TMP2, AO sll TMP1, ZBASE_SHIFT + 0, TMP2 addq BO, TMP2, BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq KK, 2, KK #endif bgt I, $L41 .align 4 $L50: and M, 1, I ble I, $L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #ifdef TRMMKERNEL #ifdef LEFT addq KK, 1, TMP1 #else addq KK, 1, TMP1 #endif #endif LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c05 LD b3, 2 * SIZE(B) fclr c02 LD b4, 3 * SIZE(B) fclr c06 lda AO, 2 * SIZE(AO) lda BO, 2 * SIZE(B) #ifndef TRMMKERNEL lda L, -2(K) #else lda L, -2(TMP1) #endif ble L, $L55 #else sll KK, ZBASE_SHIFT + 0, TMP1 addq AO, TMP1, AO addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c05 LD b3, 2 * SIZE(BO) fclr c02 LD b4, 3 * SIZE(BO) fclr c06 lda AO, 2 * SIZE(AO) lda BO, 2 * SIZE(BO) lda L, -2(TMP1) ble L, $L55 #endif .align 5 $L52: ADD1 c01, t1, c01 unop MUL a1, b1, t1 unop ADD3 c02, t2, c02 lda AO, 4 * SIZE(AO) MUL a2, b1, t2 LD b1, 2 * SIZE(BO) ADD4 c05, t3, c05 lda L, -2(L) MUL a1, b2, t3 LD a1, -2 * SIZE(AO) ADD2 c06, t4, c06 unop MUL a2, b2, t4 LD a2, -1 * SIZE(AO) ADD1 c01, t1, c01 LD b2, 3 * SIZE(BO) MUL a3, b3, t1 lda BO, 4 * SIZE(BO) ADD3 c02, t2, c02 unop MUL a4, b3, t2 LD b3, 0 * SIZE(BO) ADD4 c05, t3, c05 unop MUL a3, b4, t3 LD a3, 0 * SIZE(AO) ADD2 c06, t4, c06 MUL a4, b4, t4 LD b4, 1 * SIZE(BO) unop LD a4, 1 * SIZE(AO) unop unop bgt L, $L52 .align 4 $L55: ADD1 c01, t1, c01 ldt alpha_r, ALPHA_R MUL a1, b1, t1 #ifndef TRMMKERNEL blbs K, $L58 #else blbs TMP1, $L58 #endif .align 4 ADD3 c02, t2, c02 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD4 c05, t3, c05 lda BO, 2 * SIZE(BO) MUL a1, b2, t3 LD a1, 0 * SIZE(AO) ADD2 c06, t4, c06 unop MUL a2, b2, t4 LD a2, 1 * SIZE(AO) ADD1 c01, t1, c01 LD b2, -1 * SIZE(BO) MUL a1, b1, t1 lda AO, 2 * SIZE(AO) .align 4 $L58: ADD3 c02, t2, c02 unop MUL a2, b1, t2 ldt alpha_i, ALPHA_I ADD4 c05, t3, c05 unop MUL a1, b2, t3 #ifndef TRMMKERNEL LD c03, 0 * SIZE(C1) #else unop #endif ADD2 c06, t4, c06 unop MUL a2, b2, t4 #ifndef TRMMKERNEL LD c04, 1 * SIZE(C1) #else unop #endif ADD1 c01, t1, c01 ADD3 c02, t2, c02 ADD4 c05, t3, c05 ADD2 c06, t4, c06 ADD c01, c06, c01 ADD c02, c05, c02 MUL alpha_r, c01, t1 MUL alpha_r, c02, t2 MUL alpha_i, c02, t3 MUL alpha_i, c01, t4 #ifndef TRMMKERNEL ADD c03, t1, c03 ADD c04, t2, c04 #else ADD $f31, t1, c03 ADD $f31, t2, c04 #endif SUB c03, t3, c03 ADD c04, t4, c04 ST c03, 0 * SIZE(C1) ST c04, 1 * SIZE(C1) .align 4 $L999: ldt $f2, 0($sp) ldt $f3, 8($sp) ldt $f4, 16($sp) ldt $f5, 24($sp) ldt $f6, 32($sp) ldt $f7, 40($sp) ldt $f8, 48($sp) ldt $f9, 56($sp) clr $0 lda $sp, STACKSIZE($sp) ret .ident VERSION .end CNAME OpenBLAS-0.2.20/kernel/alpha/zgemv_n.S000066400000000000000000000421231313527062700173040ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define STACKSIZE 64 #define PREFETCHSIZE 32 #define M $16 #define N $17 #define A $21 #define LDA $18 #define X $19 #define INCX $20 #define Y $22 #define INCY $23 #define BUFFER $24 #define I $25 #define J $27 #define Y1 $4 #define A1 $5 #define A2 $6 #define alpha_r $f19 #define alpha_i $f20 #define alpha1 $f0 #define alpha2 $f1 #define alpha3 $f10 #define alpha4 $f11 #define y0 $f12 #define y1 $f13 #define y2 $f14 #define y3 $f15 #define y4 $f16 #define y5 $f17 #define y6 $f18 #define y7 $f21 #define a0 $f22 #define a1 $f23 #define a2 $f24 #define a3 $f25 #define a4 $f26 #define a5 $f27 #define a6 $f28 #define a7 $f29 #define t0 $f2 #define t1 $f3 #define t2 $f4 #define t3 $f5 #if !defined(CONJ) && !defined(XCONJ) #define ADD1 ADD #define ADD2 ADD #define ADD3 SUB #define ADD4 ADD #elif defined(CONJ) && !defined(XCONJ) #define ADD1 ADD #define ADD2 SUB #define ADD3 ADD #define ADD4 ADD #elif !defined(CONJ) && defined(XCONJ) #define ADD1 ADD #define ADD2 ADD #define ADD3 ADD #define ADD4 SUB #else #define ADD1 ADD #define ADD2 SUB #define ADD3 SUB #define ADD4 SUB #endif PROLOGUE lda $sp, -STACKSIZE($sp) ldq LDA, 0 + STACKSIZE($sp) ldq X, 8 + STACKSIZE($sp) ldq INCX, 16 + STACKSIZE($sp) ldq Y, 24 + STACKSIZE($sp) ldq INCY, 32 + STACKSIZE($sp) ldq BUFFER, 40 + STACKSIZE($sp) stt $f2, 0($sp) stt $f3, 8($sp) stt $f4, 16($sp) stt $f5, 24($sp) stt $f6, 32($sp) stt $f7, 40($sp) stt $f8, 48($sp) stt $f9, 56($sp) PROFCODE cmple M, 0, $0 sll INCX, ZBASE_SHIFT, INCX cmple N, 0, $1 sll INCY, ZBASE_SHIFT, INCY or $0, $1, $0 bne $0, $L999 cmpeq INCY, 2 * SIZE, $0 sll LDA, ZBASE_SHIFT,LDA bne $0, $L10 mov BUFFER, Y1 mov Y, BUFFER mov Y1, Y sra M, 2, I ble I, $L05 .align 4 $L02: ST $f31, 0 * SIZE(Y1) ST $f31, 1 * SIZE(Y1) ST $f31, 2 * 
SIZE(Y1) ST $f31, 3 * SIZE(Y1) ST $f31, 4 * SIZE(Y1) ST $f31, 5 * SIZE(Y1) ST $f31, 6 * SIZE(Y1) ST $f31, 7 * SIZE(Y1) lda Y1, 8 * SIZE(Y1) lda I, -1(I) bgt I, $L02 .align 4 $L05: and M, 3, I ble I, $L10 .align 4 $L06: ST $f31, 0 * SIZE(Y1) ST $f31, 1 * SIZE(Y1) addq Y1, 2 * SIZE, Y1 lda I, -1(I) bgt I, $L06 .align 4 $L10: sra N, 1, J ble J, $L20 .align 4 $L11: LD alpha1, 0 * SIZE(X) LD alpha2, 1 * SIZE(X) addq X, INCX, X LD alpha3, 0 * SIZE(X) LD alpha4, 1 * SIZE(X) addq X, INCX, X MUL alpha_r, alpha1, y0 MUL alpha_r, alpha2, y1 MUL alpha_r, alpha3, y2 MUL alpha_r, alpha4, y3 MUL alpha_i, alpha2, t0 mov A, A1 MUL alpha_i, alpha1, t1 addq A, LDA, A2 MUL alpha_i, alpha4, t2 addq A2, LDA, A MUL alpha_i, alpha3, t3 mov Y, Y1 #ifndef XCONJ SUB y0, t0, alpha1 ADD y1, t1, alpha2 SUB y2, t2, alpha3 ADD y3, t3, alpha4 #else ADD y0, t0, alpha1 SUB y1, t1, alpha2 ADD y2, t2, alpha3 SUB y3, t3, alpha4 #endif ldl $31, 4 * SIZE(X) sra M, 2, I ble I, $L15 LD a0, 0 * SIZE(A1) LD a1, 1 * SIZE(A1) LD a2, 2 * SIZE(A1) LD a3, 3 * SIZE(A1) LD a4, 0 * SIZE(A2) LD a5, 1 * SIZE(A2) LD a6, 2 * SIZE(A2) LD a7, 3 * SIZE(A2) MUL alpha1, a0, t0 LD y0, 0 * SIZE(Y1) MUL alpha1, a1, t1 LD y1, 1 * SIZE(Y1) MUL alpha1, a2, t2 LD y2, 2 * SIZE(Y1) MUL alpha1, a3, t3 LD y3, 3 * SIZE(Y1) ADD1 y0, t0, y0 unop MUL alpha3, a4, t0 LD y4, 4 * SIZE(Y1) ADD2 y1, t1, y1 unop MUL alpha3, a5, t1 LD y5, 5 * SIZE(Y1) ADD1 y2, t2, y2 unop MUL alpha3, a6, t2 LD y6, 6 * SIZE(Y1) ADD2 y3, t3, y3 unop MUL alpha3, a7, t3 LD y7, 7 * SIZE(Y1) ADD1 y0, t0, y0 unop MUL alpha2, a1, t0 LD a1, 5 * SIZE(A1) ADD2 y1, t1, y1 unop MUL alpha2, a0, t1 LD a0, 4 * SIZE(A1) ADD1 y2, t2, y2 unop MUL alpha2, a3, t2 LD a3, 7 * SIZE(A1) ADD2 y3, t3, y3 unop MUL alpha2, a2, t3 LD a2, 6 * SIZE(A1) ADD3 y0, t0, y0 unop MUL alpha4, a5, t0 LD a5, 5 * SIZE(A2) ADD4 y1, t1, y1 unop MUL alpha4, a4, t1 LD a4, 4 * SIZE(A2) ADD3 y2, t2, y2 unop MUL alpha4, a7, t2 LD a7, 7 * SIZE(A2) ADD4 y3, t3, y3 unop MUL alpha4, a6, t3 LD a6, 6 * SIZE(A2) ADD3 y0, t0, y0 MUL alpha1, a0, t0 ADD4 y1, t1, y1 MUL alpha1, a1, t1 ADD3 y2, t2, y2 unop MUL alpha1, a2, t2 unop ADD4 y3, t3, y3 lda I, -1(I) MUL alpha1, a3, t3 ble I, $L13 .align 4 $L12: ADD1 y4, t0, y4 ST y0, 0 * SIZE(Y1) MUL alpha3, a4, t0 ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) ADD2 y5, t1, y5 ST y1, 1 * SIZE(Y1) MUL alpha3, a5, t1 lda I, -1(I) ADD1 y6, t2, y6 ST y2, 2 * SIZE(Y1) MUL alpha3, a6, t2 unop ADD2 y7, t3, y7 ST y3, 3 * SIZE(Y1) MUL alpha3, a7, t3 unop ADD1 y4, t0, y4 unop MUL alpha2, a1, t0 LD a1, 9 * SIZE(A1) ADD2 y5, t1, y5 unop MUL alpha2, a0, t1 LD a0, 8 * SIZE(A1) ADD1 y6, t2, y6 unop MUL alpha2, a3, t2 LD a3, 11 * SIZE(A1) ADD2 y7, t3, y7 unop MUL alpha2, a2, t3 LD a2, 10 * SIZE(A1) ADD3 y4, t0, y4 lds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) MUL alpha4, a5, t0 LD a5, 9 * SIZE(A2) ADD4 y5, t1, y5 unop MUL alpha4, a4, t1 LD a4, 8 * SIZE(A2) ADD3 y6, t2, y6 unop MUL alpha4, a7, t2 LD a7, 11 * SIZE(A2) ADD4 y7, t3, y7 unop MUL alpha4, a6, t3 LD a6, 10 * SIZE(A2) ADD3 y4, t0, y4 unop MUL alpha1, a0, t0 LD y0, 8 * SIZE(Y1) ADD4 y5, t1, y5 unop MUL alpha1, a1, t1 LD y1, 9 * SIZE(Y1) ADD3 y6, t2, y6 unop MUL alpha1, a2, t2 LD y2, 10 * SIZE(Y1) ADD4 y7, t3, y7 unop MUL alpha1, a3, t3 LD y3, 11 * SIZE(Y1) ADD1 y0, t0, y0 ST y4, 4 * SIZE(Y1) MUL alpha3, a4, t0 ldl $31, (PREFETCHSIZE + 0) * SIZE(A2) ADD2 y1, t1, y1 ST y5, 5 * SIZE(Y1) MUL alpha3, a5, t1 unop ADD1 y2, t2, y2 ST y6, 6 * SIZE(Y1) MUL alpha3, a6, t2 unop ADD2 y3, t3, y3 ST y7, 7 * SIZE(Y1) MUL alpha3, a7, t3 lda Y1, 8 * SIZE(Y1) ADD1 y0, t0, y0 unop MUL alpha2, a1, t0 
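/* In this loop (alpha1,alpha2) and (alpha3,alpha4) already hold alpha*x[j]
   and alpha*x[j+1] as real/imaginary pairs, so each pass updates four
   complex elements of y with contributions from both columns.  Scalar
   sketch of one update (plain case; the ADDn macros flip signs for the
   CONJ/XCONJ builds):
       y_re += alpha1*a_re - alpha2*a_im + alpha3*b_re - alpha4*b_im
       y_im += alpha1*a_im + alpha2*a_re + alpha3*b_im + alpha4*b_re
   where a is the current element of column j (pointer A1) and b the
   corresponding element of column j+1 (pointer A2).
*/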
LD a1, 13 * SIZE(A1) ADD2 y1, t1, y1 unop MUL alpha2, a0, t1 LD a0, 12 * SIZE(A1) ADD1 y2, t2, y2 unop MUL alpha2, a3, t2 LD a3, 15 * SIZE(A1) ADD2 y3, t3, y3 unop MUL alpha2, a2, t3 LD a2, 14 * SIZE(A1) ADD3 y0, t0, y0 unop MUL alpha4, a5, t0 LD a5, 13 * SIZE(A2) ADD4 y1, t1, y1 unop MUL alpha4, a4, t1 LD a4, 12 * SIZE(A2) ADD3 y2, t2, y2 unop MUL alpha4, a7, t2 LD a7, 15 * SIZE(A2) ADD4 y3, t3, y3 unop MUL alpha4, a6, t3 LD a6, 14 * SIZE(A2) ADD3 y0, t0, y0 unop MUL alpha1, a0, t0 LD y4, 4 * SIZE(Y1) ADD4 y1, t1, y1 lda A2, 8 * SIZE(A2) MUL alpha1, a1, t1 LD y5, 5 * SIZE(Y1) ADD3 y2, t2, y2 lda A1, 8 * SIZE(A1) MUL alpha1, a2, t2 LD y6, 6 * SIZE(Y1) ADD4 y3, t3, y3 MUL alpha1, a3, t3 LD y7, 7 * SIZE(Y1) bgt I, $L12 .align 4 $L13: ADD1 y4, t0, y4 ST y0, 0 * SIZE(Y1) MUL alpha3, a4, t0 unop ADD2 y5, t1, y5 ST y1, 1 * SIZE(Y1) MUL alpha3, a5, t1 unop ADD1 y6, t2, y6 ST y2, 2 * SIZE(Y1) MUL alpha3, a6, t2 unop ADD2 y7, t3, y7 ST y3, 3 * SIZE(Y1) MUL alpha3, a7, t3 unop ADD1 y4, t0, y4 MUL alpha2, a1, t0 ADD2 y5, t1, y5 MUL alpha2, a0, t1 ADD1 y6, t2, y6 MUL alpha2, a3, t2 ADD2 y7, t3, y7 MUL alpha2, a2, t3 ADD3 y4, t0, y4 MUL alpha4, a5, t0 ADD4 y5, t1, y5 MUL alpha4, a4, t1 ADD3 y6, t2, y6 MUL alpha4, a7, t2 ADD4 y7, t3, y7 MUL alpha4, a6, t3 ADD3 y4, t0, y4 ADD4 y5, t1, y5 ADD3 y6, t2, y6 ADD4 y7, t3, y7 ST y4, 4 * SIZE(Y1) lda A1, 8 * SIZE(A1) ST y5, 5 * SIZE(Y1) lda A2, 8 * SIZE(A2) ST y6, 6 * SIZE(Y1) unop ST y7, 7 * SIZE(Y1) lda Y1, 8 * SIZE(Y1) .align 4 $L15: and M, 2, I ble I, $L17 LD a0, 0 * SIZE(A1) LD a1, 1 * SIZE(A1) LD a2, 2 * SIZE(A1) LD a3, 3 * SIZE(A1) LD a4, 0 * SIZE(A2) LD a5, 1 * SIZE(A2) LD a6, 2 * SIZE(A2) LD a7, 3 * SIZE(A2) MUL alpha1, a0, t0 LD y0, 0 * SIZE(Y1) MUL alpha1, a1, t1 LD y1, 1 * SIZE(Y1) MUL alpha1, a2, t2 LD y2, 2 * SIZE(Y1) MUL alpha1, a3, t3 LD y3, 3 * SIZE(Y1) ADD1 y0, t0, y0 MUL alpha3, a4, t0 ADD2 y1, t1, y1 MUL alpha3, a5, t1 ADD1 y2, t2, y2 MUL alpha3, a6, t2 ADD2 y3, t3, y3 MUL alpha3, a7, t3 ADD1 y0, t0, y0 MUL alpha2, a1, t0 ADD2 y1, t1, y1 MUL alpha2, a0, t1 ADD1 y2, t2, y2 MUL alpha2, a3, t2 ADD2 y3, t3, y3 MUL alpha2, a2, t3 ADD3 y0, t0, y0 MUL alpha4, a5, t0 ADD4 y1, t1, y1 MUL alpha4, a4, t1 ADD3 y2, t2, y2 MUL alpha4, a7, t2 ADD4 y3, t3, y3 MUL alpha4, a6, t3 ADD3 y0, t0, y0 ADD4 y1, t1, y1 ADD3 y2, t2, y2 ADD4 y3, t3, y3 ST y0, 0 * SIZE(Y1) lda A1, 4 * SIZE(A1) ST y1, 1 * SIZE(Y1) lda A2, 4 * SIZE(A2) ST y2, 2 * SIZE(Y1) unop ST y3, 3 * SIZE(Y1) lda Y1, 4 * SIZE(Y1) .align 4 $L17: blbc M, $L18 LD a0, 0 * SIZE(A1) LD a1, 1 * SIZE(A1) LD a2, 0 * SIZE(A2) LD a3, 1 * SIZE(A2) LD y0, 0 * SIZE(Y1) LD y1, 1 * SIZE(Y1) MUL alpha1, a0, t0 MUL alpha1, a1, t1 ADD1 y0, t0, y0 MUL alpha3, a2, t0 ADD2 y1, t1, y1 MUL alpha3, a3, t1 ADD1 y0, t0, y0 MUL alpha2, a1, t0 ADD2 y1, t1, y1 MUL alpha2, a0, t1 ADD3 y0, t0, y0 MUL alpha4, a3, t0 ADD4 y1, t1, y1 MUL alpha4, a2, t1 ADD3 y0, t0, y0 ADD4 y1, t1, y1 ST y0, 0 * SIZE(Y1) ST y1, 1 * SIZE(Y1) .align 4 $L18: lda J, -1(J) bgt J, $L11 .align 4 $L20: blbc N, $L990 LD alpha1, 0 * SIZE(X) LD alpha2, 1 * SIZE(X) MUL alpha_r, alpha1, y0 MUL alpha_r, alpha2, y1 MUL alpha_i, alpha2, t0 mov A, A1 MUL alpha_i, alpha1, t1 mov Y, Y1 #ifndef XCONJ SUB y0, t0, alpha1 ADD y1, t1, alpha2 #else ADD y0, t0, alpha1 SUB y1, t1, alpha2 #endif sra M, 2, I ble I, $L25 LD a0, 0 * SIZE(A1) LD a1, 1 * SIZE(A1) LD a2, 2 * SIZE(A1) LD a3, 3 * SIZE(A1) LD y0, 0 * SIZE(Y1) LD y1, 1 * SIZE(Y1) LD y2, 2 * SIZE(Y1) LD y3, 3 * SIZE(Y1) MUL alpha1, a0, t0 LD a4, 4 * SIZE(A1) MUL alpha1, a1, t1 LD a5, 5 * SIZE(A1) MUL alpha1, a2, t2 LD a6, 6 * 
SIZE(A1) MUL alpha1, a3, t3 LD a7, 7 * SIZE(A1) ADD1 y0, t0, y0 unop MUL alpha2, a1, t0 LD a1, 9 * SIZE(A1) ADD2 y1, t1, y1 unop MUL alpha2, a0, t1 LD a0, 8 * SIZE(A1) ADD1 y2, t2, y2 unop MUL alpha2, a3, t2 LD a3, 11 * SIZE(A1) ADD2 y3, t3, y3 unop MUL alpha2, a2, t3 LD a2, 10 * SIZE(A1) ADD3 y0, t0, y0 unop LD y4, 4 * SIZE(Y1) MUL alpha1, a4, t0 ADD4 y1, t1, y1 unop LD y5, 5 * SIZE(Y1) MUL alpha1, a5, t1 ADD3 y2, t2, y2 LD y6, 6 * SIZE(Y1) MUL alpha1, a6, t2 lda I, -1(I) ADD4 y3, t3, y3 LD y7, 7 * SIZE(Y1) MUL alpha1, a7, t3 ble I, $L23 .align 4 $L22: ADD1 y4, t0, y4 ST y0, 0 * SIZE(Y1) MUL alpha2, a5, t0 LD a5, 13 * SIZE(A1) ADD2 y5, t1, y5 ST y1, 1 * SIZE(Y1) MUL alpha2, a4, t1 LD a4, 12 * SIZE(A1) ADD1 y6, t2, y6 ST y2, 2 * SIZE(Y1) MUL alpha2, a7, t2 LD a7, 15 * SIZE(A1) ADD2 y7, t3, y7 ST y3, 3 * SIZE(Y1) MUL alpha2, a6, t3 LD a6, 14 * SIZE(A1) ADD3 y4, t0, y4 LD y0, 8 * SIZE(Y1) MUL alpha1, a0, t0 ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) ADD4 y5, t1, y5 LD y1, 9 * SIZE(Y1) MUL alpha1, a1, t1 lda I, -1(I) ADD3 y6, t2, y6 LD y2, 10 * SIZE(Y1) MUL alpha1, a2, t2 unop ADD4 y7, t3, y7 LD y3, 11 * SIZE(Y1) MUL alpha1, a3, t3 unop ADD1 y0, t0, y0 ST y4, 4 * SIZE(Y1) MUL alpha2, a1, t0 LD a1, 17 * SIZE(A1) ADD2 y1, t1, y1 ST y5, 5 * SIZE(Y1) MUL alpha2, a0, t1 LD a0, 16 * SIZE(A1) ADD1 y2, t2, y2 ST y6, 6 * SIZE(Y1) MUL alpha2, a3, t2 LD a3, 19 * SIZE(A1) ADD2 y3, t3, y3 ST y7, 7 * SIZE(Y1) MUL alpha2, a2, t3 LD a2, 18 * SIZE(A1) ADD3 y0, t0, y0 LD y4, 12 * SIZE(Y1) MUL alpha1, a4, t0 ldl $31, (PREFETCHSIZE + 0) * SIZE(Y1) ADD4 y1, t1, y1 LD y5, 13 * SIZE(Y1) MUL alpha1, a5, t1 lda A1, 8 * SIZE(A1) ADD3 y2, t2, y2 LD y6, 14 * SIZE(Y1) MUL alpha1, a6, t2 lda Y1, 8 * SIZE(Y1) ADD4 y3, t3, y3 LD y7, 7 * SIZE(Y1) MUL alpha1, a7, t3 bgt I, $L22 .align 4 $L23: ADD1 y4, t0, y4 ST y0, 0 * SIZE(Y1) MUL alpha2, a5, t0 unop ADD2 y5, t1, y5 ST y1, 1 * SIZE(Y1) MUL alpha2, a4, t1 unop ADD1 y6, t2, y6 ST y2, 2 * SIZE(Y1) MUL alpha2, a7, t2 unop ADD2 y7, t3, y7 ST y3, 3 * SIZE(Y1) MUL alpha2, a6, t3 unop ADD3 y4, t0, y4 ADD4 y5, t1, y5 ADD3 y6, t2, y6 ADD4 y7, t3, y7 ST y4, 4 * SIZE(Y1) unop ST y5, 5 * SIZE(Y1) unop ST y6, 6 * SIZE(Y1) lda A1, 8 * SIZE(A1) ST y7, 7 * SIZE(Y1) lda Y1, 8 * SIZE(Y1) .align 4 $L25: and M, 2, I ble I, $L27 LD a0, 0 * SIZE(A1) LD a1, 1 * SIZE(A1) LD a2, 2 * SIZE(A1) LD a3, 3 * SIZE(A1) MUL alpha1, a0, t0 LD y0, 0 * SIZE(Y1) MUL alpha1, a1, t1 LD y1, 1 * SIZE(Y1) MUL alpha1, a2, t2 LD y2, 2 * SIZE(Y1) MUL alpha1, a3, t3 LD y3, 3 * SIZE(Y1) ADD1 y0, t0, y0 MUL alpha2, a1, t0 ADD2 y1, t1, y1 MUL alpha2, a0, t1 ADD1 y2, t2, y2 MUL alpha2, a3, t2 ADD2 y3, t3, y3 MUL alpha2, a2, t3 ADD3 y0, t0, y0 ADD4 y1, t1, y1 ADD3 y2, t2, y2 ADD4 y3, t3, y3 ST y0, 0 * SIZE(Y1) ST y1, 1 * SIZE(Y1) ST y2, 2 * SIZE(Y1) lda A1, 4 * SIZE(A1) ST y3, 3 * SIZE(Y1) lda Y1, 4 * SIZE(Y1) .align 4 $L27: blbc M, $L990 LD a0, 0 * SIZE(A1) LD a1, 1 * SIZE(A1) MUL alpha1, a0, t0 LD y0, 0 * SIZE(Y1) MUL alpha1, a1, t1 LD y1, 1 * SIZE(Y1) ADD1 y0, t0, y0 MUL alpha2, a1, t0 ADD2 y1, t1, y1 MUL alpha2, a0, t1 ADD3 y0, t0, y0 ADD4 y1, t1, y1 ST y0, 0 * SIZE(Y1) ST y1, 1 * SIZE(Y1) .align 4 $L990: cmpeq INCY, 2 * SIZE, $0 bne $0, $L999 mov BUFFER, Y1 sra M, 2, I ble I, $L995 .align 4 $L992: LD a0, 0 * SIZE(BUFFER) LD a1, 1 * SIZE(BUFFER) addq BUFFER, INCY, BUFFER LD a2, 0 * SIZE(BUFFER) LD a3, 1 * SIZE(BUFFER) addq BUFFER, INCY, BUFFER LD y0, 0 * SIZE(Y) LD y1, 1 * SIZE(Y) LD y2, 2 * SIZE(Y) LD y3, 3 * SIZE(Y) LD a4, 0 * SIZE(BUFFER) LD a5, 1 * SIZE(BUFFER) addq BUFFER, INCY, BUFFER LD a6, 0 * SIZE(BUFFER) LD a7, 1 * 
SIZE(BUFFER) addq BUFFER, INCY, BUFFER LD y4, 4 * SIZE(Y) LD y5, 5 * SIZE(Y) LD y6, 6 * SIZE(Y) LD y7, 7 * SIZE(Y) ADD a0, y0, a0 ADD a1, y1, a1 ADD a2, y2, a2 ADD a3, y3, a3 ST a0, 0 * SIZE(Y1) ADD a4, y4, a4 ST a1, 1 * SIZE(Y1) ADD a5, y5, a5 addq Y1, INCY, Y1 ST a2, 0 * SIZE(Y1) ADD a6, y6, a6 ST a3, 1 * SIZE(Y1) ADD a7, y7, a7 addq Y1, INCY, Y1 ST a4, 0 * SIZE(Y1) ST a5, 1 * SIZE(Y1) addq Y1, INCY, Y1 ST a6, 0 * SIZE(Y1) ST a7, 1 * SIZE(Y1) addq Y1, INCY, Y1 lda I, -1(I) lda Y, 8 * SIZE(Y) bgt I, $L992 .align 4 $L995: and M, 3, I ble I, $L999 .align 4 $L996: LD a0, 0 * SIZE(BUFFER) LD a1, 1 * SIZE(BUFFER) addq BUFFER, INCY, BUFFER LD y0, 0 * SIZE(Y) LD y1, 1 * SIZE(Y) lda Y, 2 * SIZE(Y) ADD a0, y0, a0 ADD a1, y1, a1 ST a0, 0 * SIZE(Y1) ST a1, 1 * SIZE(Y1) addq Y1, INCY, Y1 lda I, -1(I) bgt I, $L996 .align 4 $L999: ldt $f2, 0($sp) ldt $f3, 8($sp) ldt $f4, 16($sp) ldt $f5, 24($sp) ldt $f6, 32($sp) ldt $f7, 40($sp) ldt $f8, 48($sp) ldt $f9, 56($sp) lda $sp, STACKSIZE($sp) ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/zgemv_t.S000066400000000000000000000351271313527062700173200ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define STACKSIZE 64 #define PREFETCHSIZE 32 #define M $16 #define N $17 #define A $21 #define LDA $18 #define X $19 #define INCX $20 #define Y $22 #define INCY $23 #define BUFFER $24 #define I $25 #define J $27 #define X1 $3 #define Y1 $4 #define A1 $5 #define A2 $6 #define alpha_r $f19 #define alpha_i $f20 #define s0 $f0 #define s1 $f1 #define s2 $f10 #define s3 $f11 #define t0 $f12 #define t1 $f13 #define t2 $f14 #define t3 $f15 #define x0 $f16 #define x1 $f17 #define x2 $f18 #define x3 $f21 #define a0 $f22 #define a1 $f23 #define a2 $f24 #define a3 $f25 #define a4 $f26 #define a5 $f27 #define a6 $f28 #define a7 $f29 #define a8 $f2 #define a9 $f3 #define a10 $f4 #define a11 $f5 #define a12 $f6 #define a13 $f7 #define a14 $f8 #define a15 $f9 #if !defined(CONJ) && !defined(XCONJ) #define ADD1 ADD #define ADD2 ADD #define ADD3 SUB #define ADD4 ADD #elif !defined(CONJ) && defined(XCONJ) #define ADD1 ADD #define ADD2 ADD #define ADD3 ADD #define ADD4 SUB #elif defined(CONJ) && !defined(XCONJ) #define ADD1 ADD #define ADD2 SUB #define ADD3 ADD #define ADD4 ADD #else #define ADD1 ADD #define ADD2 SUB #define ADD3 SUB #define ADD4 SUB #endif PROLOGUE lda $sp, -STACKSIZE($sp) ldq LDA, 0 + STACKSIZE($sp) ldq X, 8 + STACKSIZE($sp) ldq INCX, 16 + STACKSIZE($sp) ldq Y, 24 + STACKSIZE($sp) ldq INCY, 32 + STACKSIZE($sp) ldq BUFFER, 40 + STACKSIZE($sp) stt $f2, 0($sp) stt $f3, 8($sp) stt $f4, 16($sp) stt $f5, 24($sp) stt $f6, 32($sp) stt $f7, 40($sp) stt $f8, 48($sp) stt $f9, 56($sp) PROFCODE cmple M, 0, $0 sll INCX, ZBASE_SHIFT, INCX cmple N, 0, $1 sll INCY, ZBASE_SHIFT, INCY or $0, $1, $0 bne $0, $L999 cmpeq INCX, 2 * SIZE, $0 mov X, X1 sll LDA, ZBASE_SHIFT,LDA bne $0, $L10 sra M, 2, I mov BUFFER, Y1 mov BUFFER, X ble I, $L05 .align 4 $L02: ldl $31, (PREFETCHSIZE + 0) * SIZE(X1) lda I, -1(I) LD a0, 0 * SIZE(X1) LD a1, 1 * SIZE(X1) addq X1, INCX, X1 LD a2, 0 * SIZE(X1) LD a3, 1 * SIZE(X1) addq X1, INCX, X1 ST a0, 0 * SIZE(Y1) ST a1, 1 * SIZE(Y1) ST a2, 2 * SIZE(Y1) ST a3, 3 * SIZE(Y1) LD a4, 0 * SIZE(X1) LD a5, 1 * SIZE(X1) addq X1, INCX, X1 LD a6, 0 * SIZE(X1) LD a7, 1 * SIZE(X1) addq X1, INCX, X1 ST a4, 4 * SIZE(Y1) ST a5, 5 * SIZE(Y1) ST a6, 6 * SIZE(Y1) ST a7, 7 * SIZE(Y1) lda Y1, 8 * SIZE(Y1) bgt I, $L02 .align 4 $L05: and M, 3, I ble I, $L10 .align 4 $L06: LD a0, 0 * SIZE(X1) LD a1, 1 * SIZE(X1) addq X1, INCX, X1 ST a0, 0 * SIZE(Y1) ST a1, 1 * SIZE(Y1) lda Y1, 2 * SIZE(Y1) lda I, -1(I) bgt I, $L06 .align 4 $L10: mov Y, Y1 fclr t0 unop fclr t1 sra N, 1, J fclr t2 fclr t3 ble J, $L20 .align 4 $L11: mov A, A1 fclr s0 addq A, LDA, A2 fclr s1 addq A2, LDA, A unop mov X, X1 lds $f31, 3 * SIZE(Y) sra M, 2, I fclr s2 fclr s3 ble I, $L15 LD a0, 0 * SIZE(A1) LD a1, 1 * SIZE(A1) LD a2, 0 * SIZE(A2) LD a3, 1 * SIZE(A2) LD a4, 2 * SIZE(A1) LD a5, 3 * SIZE(A1) LD a6, 2 * SIZE(A2) LD a7, 3 * SIZE(A2) LD a8, 4 * SIZE(A1) LD a9, 5 * SIZE(A1) LD a10, 4 * SIZE(A2) LD a11, 5 * SIZE(A2) LD a12, 6 * SIZE(A1) LD a13, 7 * SIZE(A1) LD a14, 6 * SIZE(A2) LD a15, 7 * SIZE(A2) LD x0, 0 * SIZE(X1) LD x1, 1 * SIZE(X1) LD x2, 2 * SIZE(X1) lda I, -1(I) ble I, $L13 .align 4 $L12: ADD3 s0, t0, s0 unop MUL x0, a0, t0 LD x3, 3 * SIZE(X1) ADD4 s1, t1, s1 ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) MUL x0, a1, t1 unop ADD3 s2, t2, s2 unop MUL x0, a2, t2 unop ADD4 s3, t3, s3 unop MUL x0, a3, t3 LD x0, 4 * SIZE(X1) ADD1 s0, t0, s0 unop MUL x1, a1, t0 LD a1, 9 * SIZE(A1) ADD2 s1, t1, s1 unop MUL x1, a0, t1 LD 
a0, 8 * SIZE(A1) ADD1 s2, t2, s2 unop MUL x1, a3, t2 LD a3, 9 * SIZE(A2) ADD2 s3, t3, s3 unop MUL x1, a2, t3 LD a2, 8 * SIZE(A2) ADD3 s0, t0, s0 unop MUL x2, a4, t0 LD x1, 5 * SIZE(X1) ADD4 s1, t1, s1 MUL x2, a5, t1 ADD3 s2, t2, s2 MUL x2, a6, t2 ADD4 s3, t3, s3 unop MUL x2, a7, t3 LD x2, 6 * SIZE(X1) ADD1 s0, t0, s0 unop MUL x3, a5, t0 LD a5, 11 * SIZE(A1) ADD2 s1, t1, s1 unop MUL x3, a4, t1 LD a4, 10 * SIZE(A1) ADD1 s2, t2, s2 unop MUL x3, a7, t2 LD a7, 11 * SIZE(A2) ADD2 s3, t3, s3 unop MUL x3, a6, t3 LD a6, 10 * SIZE(A2) ADD3 s0, t0, s0 unop MUL x0, a8, t0 LD x3, 7 * SIZE(X1) ADD4 s1, t1, s1 ldl $31, (PREFETCHSIZE + 0) * SIZE(A2) MUL x0, a9, t1 unop ADD3 s2, t2, s2 lda I, -1(I) MUL x0, a10, t2 unop ADD4 s3, t3, s3 unop MUL x0, a11, t3 LD x0, 8 * SIZE(X1) ADD1 s0, t0, s0 unop MUL x1, a9, t0 LD a9, 13 * SIZE(A1) ADD2 s1, t1, s1 unop MUL x1, a8, t1 LD a8, 12 * SIZE(A1) ADD1 s2, t2, s2 lda A1, 8 * SIZE(A1) MUL x1, a11, t2 LD a11, 13 * SIZE(A2) ADD2 s3, t3, s3 unop MUL x1, a10, t3 LD a10, 12 * SIZE(A2) ADD3 s0, t0, s0 unop MUL x2, a12, t0 LD x1, 9 * SIZE(X1) ADD4 s1, t1, s1 ldl $31, (PREFETCHSIZE + 0) * SIZE(X1) MUL x2, a13, t1 lda A2, 8 * SIZE(A2) ADD3 s2, t2, s2 unop MUL x2, a14, t2 unop ADD4 s3, t3, s3 unop MUL x2, a15, t3 LD x2, 10 * SIZE(X1) ADD1 s0, t0, s0 unop MUL x3, a13, t0 LD a13, 7 * SIZE(A1) ADD2 s1, t1, s1 lda X1, 8 * SIZE(X1) MUL x3, a12, t1 LD a12, 6 * SIZE(A1) ADD1 s2, t2, s2 unop MUL x3, a15, t2 LD a15, 7 * SIZE(A2) ADD2 s3, t3, s3 MUL x3, a14, t3 LD a14, 6 * SIZE(A2) bgt I, $L12 .align 4 $L13: ADD3 s0, t0, s0 unop MUL x0, a0, t0 LD x3, 3 * SIZE(X1) ADD4 s1, t1, s1 MUL x0, a1, t1 ADD3 s2, t2, s2 MUL x0, a2, t2 ADD4 s3, t3, s3 unop MUL x0, a3, t3 LD x0, 4 * SIZE(X1) ADD1 s0, t0, s0 MUL x1, a1, t0 ADD2 s1, t1, s1 MUL x1, a0, t1 ADD1 s2, t2, s2 unop MUL x1, a3, t2 unop ADD2 s3, t3, s3 lda A1, 8 * SIZE(A1) MUL x1, a2, t3 LD x1, 5 * SIZE(X1) ADD3 s0, t0, s0 MUL x2, a4, t0 ADD4 s1, t1, s1 MUL x2, a5, t1 ADD3 s2, t2, s2 unop MUL x2, a6, t2 unop ADD4 s3, t3, s3 lda A2, 8 * SIZE(A2) MUL x2, a7, t3 LD x2, 6 * SIZE(X1) ADD1 s0, t0, s0 MUL x3, a5, t0 ADD2 s1, t1, s1 MUL x3, a4, t1 ADD1 s2, t2, s2 unop MUL x3, a7, t2 lda X1, 8 * SIZE(X1) ADD2 s3, t3, s3 unop MUL x3, a6, t3 LD x3, -1 * SIZE(X1) ADD3 s0, t0, s0 MUL x0, a8, t0 ADD4 s1, t1, s1 MUL x0, a9, t1 ADD3 s2, t2, s2 MUL x0, a10, t2 ADD4 s3, t3, s3 MUL x0, a11, t3 ADD1 s0, t0, s0 MUL x1, a9, t0 ADD2 s1, t1, s1 MUL x1, a8, t1 ADD1 s2, t2, s2 MUL x1, a11, t2 ADD2 s3, t3, s3 MUL x1, a10, t3 ADD3 s0, t0, s0 MUL x2, a12, t0 ADD4 s1, t1, s1 MUL x2, a13, t1 ADD3 s2, t2, s2 MUL x2, a14, t2 ADD4 s3, t3, s3 MUL x2, a15, t3 ADD1 s0, t0, s0 MUL x3, a13, t0 ADD2 s1, t1, s1 MUL x3, a12, t1 ADD1 s2, t2, s2 MUL x3, a15, t2 ADD2 s3, t3, s3 MUL x3, a14, t3 .align 4 $L15: and M, 3, I ble I, $L18 LD a0, 0 * SIZE(A1) LD a1, 1 * SIZE(A1) LD a2, 0 * SIZE(A2) LD a3, 1 * SIZE(A2) LD x0, 0 * SIZE(X1) lda I, -1(I) ble I, $L17 .align 4 $L16: ADD3 s0, t0, s0 lda I, -1(I) MUL x0, a0, t0 LD x1, 1 * SIZE(X1) ADD4 s1, t1, s1 MUL x0, a1, t1 ADD3 s2, t2, s2 MUL x0, a2, t2 ADD4 s3, t3, s3 unop MUL x0, a3, t3 LD x0, 2 * SIZE(X1) ADD1 s0, t0, s0 lda A2, 2 * SIZE(A2) MUL x1, a1, t0 LD a1, 3 * SIZE(A1) ADD2 s1, t1, s1 lda X1, 2 * SIZE(X1) MUL x1, a0, t1 LD a0, 2 * SIZE(A1) ADD1 s2, t2, s2 lda A1, 2 * SIZE(A1) MUL x1, a3, t2 LD a3, 1 * SIZE(A2) ADD2 s3, t3, s3 MUL x1, a2, t3 LD a2, 0 * SIZE(A2) bgt I, $L16 .align 4 $L17: ADD3 s0, t0, s0 unop MUL x0, a0, t0 LD x1, 1 * SIZE(X1) ADD4 s1, t1, s1 unop MUL x0, a1, t1 unop ADD3 s2, t2, s2 MUL x0, a2, t2 ADD4 s3, t3, s3 MUL x0, a3, 
t3 ADD1 s0, t0, s0 MUL x1, a1, t0 ADD2 s1, t1, s1 MUL x1, a0, t1 ADD1 s2, t2, s2 MUL x1, a3, t2 ADD2 s3, t3, s3 MUL x1, a2, t3 .align 4 $L18: LD a0, 0 * SIZE(Y) unop LD a1, 1 * SIZE(Y) addq Y, INCY, Y LD a2, 0 * SIZE(Y) unop LD a3, 1 * SIZE(Y) addq Y, INCY, Y ADD3 s0, t0, s0 ADD4 s1, t1, s1 ADD3 s2, t2, s2 ADD4 s3, t3, s3 MUL alpha_r, s0, t0 MUL alpha_r, s1, t1 MUL alpha_r, s2, t2 MUL alpha_r, s3, t3 ADD a0, t0, a0 MUL alpha_i, s1, t0 ADD a1, t1, a1 MUL alpha_i, s0, t1 ADD a2, t2, a2 MUL alpha_i, s3, t2 ADD a3, t3, a3 MUL alpha_i, s2, t3 SUB a0, t0, a0 ADD a1, t1, a1 SUB a2, t2, a2 ADD a3, t3, a3 ST a0, 0 * SIZE(Y1) fclr t0 ST a1, 1 * SIZE(Y1) addq Y1, INCY, Y1 ST a2, 0 * SIZE(Y1) fclr t1 ST a3, 1 * SIZE(Y1) addq Y1, INCY, Y1 fclr t2 lda J, -1(J) fclr t3 bgt J, $L11 .align 4 $L20: blbc N, $L999 mov A, A1 fclr s0 fclr s1 mov X, X1 sra M, 2, I fclr s2 fclr s3 ble I, $L25 LD a0, 0 * SIZE(A1) LD a1, 1 * SIZE(A1) LD a4, 2 * SIZE(A1) LD a5, 3 * SIZE(A1) LD a8, 4 * SIZE(A1) LD a9, 5 * SIZE(A1) LD a12, 6 * SIZE(A1) LD a13, 7 * SIZE(A1) LD x0, 0 * SIZE(X1) LD x1, 1 * SIZE(X1) LD x2, 2 * SIZE(X1) lda I, -1(I) ble I, $L23 .align 4 $L22: ADD3 s0, t0, s0 ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) MUL x0, a0, t0 LD x3, 3 * SIZE(X1) ADD4 s1, t1, s1 unop MUL x0, a1, t1 LD x0, 4 * SIZE(X1) ADD1 s2, t0, s2 lda I, -1(I) MUL x1, a1, t0 LD a1, 9 * SIZE(A1) ADD2 s3, t1, s3 unop MUL x1, a0, t1 LD a0, 8 * SIZE(A1) ADD3 s0, t0, s0 unop MUL x2, a4, t0 LD x1, 5 * SIZE(X1) ADD4 s1, t1, s1 unop MUL x2, a5, t1 LD x2, 6 * SIZE(X1) ADD1 s2, t0, s2 unop MUL x3, a5, t0 LD a5, 11 * SIZE(A1) ADD2 s3, t1, s3 unop MUL x3, a4, t1 LD a4, 10 * SIZE(A1) ADD3 s0, t0, s0 unop MUL x0, a8, t0 LD x3, 7 * SIZE(X1) ADD4 s1, t1, s1 unop MUL x0, a9, t1 LD x0, 8 * SIZE(X1) ADD1 s2, t0, s2 unop MUL x1, a9, t0 LD a9, 13 * SIZE(A1) ADD2 s3, t1, s3 unop MUL x1, a8, t1 LD a8, 12 * SIZE(A1) ADD3 s0, t0, s0 unop MUL x2, a12, t0 LD x1, 9 * SIZE(X1) ADD4 s1, t1, s1 lda A1, 8 * SIZE(A1) MUL x2, a13, t1 LD x2, 10 * SIZE(X1) ADD1 s2, t0, s2 lda X1, 8 * SIZE(X1) MUL x3, a13, t0 LD a13, 7 * SIZE(A1) ADD2 s3, t1, s3 MUL x3, a12, t1 LD a12, 6 * SIZE(A1) bgt I, $L22 .align 4 $L23: ADD3 s0, t0, s0 unop MUL x0, a0, t0 LD x3, 3 * SIZE(X1) ADD4 s1, t1, s1 unop MUL x0, a1, t1 LD x0, 4 * SIZE(X1) ADD1 s2, t0, s2 unop MUL x1, a1, t0 lda A1, 8 * SIZE(A1) ADD2 s3, t1, s3 unop MUL x1, a0, t1 LD x1, 5 * SIZE(X1) ADD3 s0, t0, s0 unop MUL x2, a4, t0 unop ADD4 s1, t1, s1 unop MUL x2, a5, t1 LD x2, 6 * SIZE(X1) ADD1 s2, t0, s2 unop MUL x3, a5, t0 lda X1, 8 * SIZE(X1) ADD2 s3, t1, s3 unop MUL x3, a4, t1 LD x3, -1 * SIZE(X1) ADD3 s0, t0, s0 MUL x0, a8, t0 ADD4 s1, t1, s1 MUL x0, a9, t1 ADD1 s2, t0, s2 MUL x1, a9, t0 ADD2 s3, t1, s3 MUL x1, a8, t1 ADD3 s0, t0, s0 MUL x2, a12, t0 ADD4 s1, t1, s1 MUL x2, a13, t1 ADD1 s2, t0, s2 MUL x3, a13, t0 ADD2 s3, t1, s3 MUL x3, a12, t1 .align 4 $L25: and M, 3, I ble I, $L28 LD a0, 0 * SIZE(A1) LD a1, 1 * SIZE(A1) LD x0, 0 * SIZE(X1) lda I, -1(I) ble I, $L27 .align 4 $L26: ADD3 s0, t0, s0 lda A1, 2 * SIZE(A1) MUL x0, a0, t0 LD x1, 1 * SIZE(X1) ADD4 s1, t1, s1 lda I, -1(I) MUL x0, a1, t1 LD x0, 2 * SIZE(X1) ADD1 s0, t0, s0 lda X1, 2 * SIZE(X1) MUL x1, a1, t0 LD a1, 1 * SIZE(A1) ADD2 s1, t1, s1 MUL x1, a0, t1 LD a0, 0 * SIZE(A1) bgt I, $L26 .align 4 $L27: ADD3 s0, t0, s0 unop MUL x0, a0, t0 LD x1, 1 * SIZE(X1) ADD4 s1, t1, s1 unop MUL x0, a1, t1 unop ADD1 s0, t0, s0 MUL x1, a1, t0 ADD2 s1, t1, s1 MUL x1, a0, t1 .align 4 $L28: LD a0, 0 * SIZE(Y) LD a1, 1 * SIZE(Y) ADD3 s0, t0, s0 ADD4 s1, t1, s1 ADD3 s2, t2, s2 ADD4 s3, t3, s3 ADD s0, s2, s0 ADD 
s1, s3, s1 MUL alpha_r, s0, t0 MUL alpha_r, s1, t1 ADD a0, t0, a0 MUL alpha_i, s1, t0 ADD a1, t1, a1 MUL alpha_i, s0, t1 SUB a0, t0, a0 ADD a1, t1, a1 ST a0, 0 * SIZE(Y1) ST a1, 1 * SIZE(Y1) .align 4 $L999: ldt $f2, 0($sp) ldt $f3, 8($sp) ldt $f4, 16($sp) ldt $f5, 24($sp) ldt $f6, 32($sp) ldt $f7, 40($sp) ldt $f8, 48($sp) ldt $f9, 56($sp) lda $sp, STACKSIZE($sp) ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/znrm2.S000066400000000000000000000171701313527062700167130ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
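The file that begins here, znrm2.S, computes the Euclidean norm of a complex vector. As a reading aid, the following is a minimal C sketch of the computation the unrolled assembly performs; the function name znrm2_ref, the use of double, and the plain loop form are illustrative assumptions, not OpenBLAS code. Like the assembly, it sums the squares of the real and imaginary parts directly and takes a single square root at the end; there is no scaling pass for overflow protection in this kernel.

#include <math.h>

// Hypothetical reference sketch, not part of OpenBLAS.  n is the number of
// complex elements, incx the stride in complex elements (incx > 0 assumed),
// and x holds interleaved (re, im) pairs.
static double znrm2_ref(long n, const double *x, long incx)
{
    double sum = 0.0;
    for (long i = 0; i < n; i++) {
        double re = x[2 * i * incx];
        double im = x[2 * i * incx + 1];
        sum += re * re + im * im;   // accumulate |x_i|^2, as the addt/mult chains do
    }
    return sqrt(sum);               // single square root at the end, as at $L998
}

The speed of the real kernel comes from the eight-way unrolling, the parallel accumulators a0..a3 and t0..t3, and the ldl prefetches, not from a different formula.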
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define PREFETCH_SIZE 80 #define N $16 #define X $17 #define INCX $18 #define XX $19 #define I $0 #define a0 $f0 #define a1 $f1 #define a2 $f10 #define a3 $f11 #define t0 $f12 #define t1 $f13 #define t2 $f14 #define t3 $f15 #define x0 $f16 #define x1 $f17 #define x2 $f18 #define x3 $f19 #define x4 $f20 #define x5 $f21 #define x6 $f22 #define x7 $f23 PROLOGUE #if defined(EV4) || defined(EV5) .frame $30,16,$26,0 .mask 0x4000000,-16 ldah $29, 0($27) !gpdisp!1 lda $29, 0($29) !gpdisp!1 lda $sp, -16($sp) ldq $27, sqrt($29) !literal!2 stq $26, 0($sp) PROFCODE .prologue 1 #else PROFCODE #endif fclr a0 sll INCX, ZBASE_SHIFT, INCX fclr a1 ble N, $L999 fclr a2 cmpeq INCX, 2 * SIZE, $0 fclr a3 beq $0, $L20 fclr t0 sra N, 3, I fclr t1 ble I, $L15 fclr t2 LD x0, 0 * SIZE(X) fclr t3 LD x1, 1 * SIZE(X) LD x2, 2 * SIZE(X) LD x3, 3 * SIZE(X) LD x4, 4 * SIZE(X) LD x5, 5 * SIZE(X) LD x6, 6 * SIZE(X) LD x7, 7 * SIZE(X) lda I, -1(I) ble I, $L12 .align 4 $L11: addt a0, t0, a0 ldl $31, (PREFETCH_SIZE) * SIZE(X) mult x0, x0, t0 LD x0, 8 * SIZE(X) addt a1, t1, a1 mov X, XX mult x1, x1, t1 LD x1, 9 * SIZE(X) addt a2, t2, a2 unop mult x2, x2, t2 LD x2, 10 * SIZE(X) addt a3, t3, a3 unop mult x3, x3, t3 LD x3, 11 * SIZE(X) addt a0, t0, a0 unop mult x4, x4, t0 LD x4, 12 * SIZE(X) addt a1, t1, a1 unop mult x5, x5, t1 LD x5, 13 * SIZE(X) addt a2, t2, a2 unop mult x6, x6, t2 LD x6, 14 * SIZE(X) addt a3, t3, a3 unop mult x7, x7, t3 LD x7, 15 * SIZE(X) addt a0, t0, a0 unop mult x0, x0, t0 LD x0, 16 * SIZE(X) addt a1, t1, a1 lda X, 16 * SIZE(X) mult x1, x1, t1 LD x1, 17 * SIZE(XX) addt a2, t2, a2 unop mult x2, x2, t2 LD x2, 18 * SIZE(XX) addt a3, t3, a3 unop mult x3, x3, t3 LD x3, 19 * SIZE(XX) addt a0, t0, a0 unop mult x4, x4, t0 LD x4, 20 * SIZE(XX) addt a1, t1, a1 lda I, -1(I) mult x5, x5, t1 LD x5, 21 * SIZE(XX) addt a2, t2, a2 unop mult x6, x6, t2 LD x6, 22 * SIZE(XX) addt a3, t3, a3 mult x7, x7, t3 LD x7, 23 * SIZE(XX) bgt I, $L11 .align 4 $L12: addt a0, t0, a0 mov X, XX mult x0, x0, t0 LD x0, 8 * SIZE(X) addt a1, t1, a1 unop mult x1, x1, t1 LD x1, 9 * SIZE(X) addt a2, t2, a2 unop mult x2, x2, t2 LD x2, 10 * SIZE(X) addt a3, t3, a3 unop mult x3, x3, t3 LD x3, 11 * SIZE(X) addt a0, t0, a0 unop mult x4, x4, t0 LD x4, 12 * SIZE(XX) addt a1, t1, a1 unop mult x5, x5, t1 LD x5, 13 * SIZE(XX) addt a2, t2, a2 unop mult x6, x6, t2 LD x6, 14 * SIZE(XX) addt a3, t3, a3 lda X, 16 * SIZE(X) mult x7, x7, t3 LD x7, 15 * SIZE(XX) addt a0, t0, a0 mult x0, x0, t0 addt a1, t1, a1 mult x1, x1, t1 addt a2, t2, a2 mult x2, x2, t2 addt a3, t3, a3 mult x3, x3, t3 addt a0, t0, a0 mult x4, x4, t0 addt a1, t1, a1 mult x5, x5, t1 addt a2, t2, a2 mult x6, x6, t2 addt a3, t3, a3 mult x7, x7, t3 addt a2, t2, a2 addt a3, t3, a3 .align 4 $L15: and N, 7, I ble I, $L998 .align 4 $L16: LD x0, 0 * SIZE(X) LD x1, 1 * SIZE(X) lda X, 2 * SIZE(X) addt a0, t0, a0 mult x0, x0, t0 addt a1, t1, a1 mult x1, x1, t1 lda I, -1(I) bgt I, $L16 bsr $31, $L998 .align 4 $L20: fclr t0 sra N, 2, I fclr t1 ble I, $L25 LD x0, 0 * SIZE(X) fclr t2 LD x1, 1 * SIZE(X) addq X, INCX, X LD x2, 0 * SIZE(X) fclr t3 LD x3, 1 * SIZE(X) addq X, INCX, X LD x4, 0 * SIZE(X) lda I, -1(I) LD x5, 1 * SIZE(X) addq X, INCX, X LD x6, 0 * SIZE(X) ble I, $L22 .align 4 $L21: addt a0, t0, a0 LD x7, 1 * SIZE(X) mult x0, x0, t0 addq X, INCX, X addt a1, t1, a1 LD x0, 0 * SIZE(X) mult x1, x1, t1 unop addt a2, t2, a2 LD x1, 1 * SIZE(X) mult x2, x2, t2 addq X, INCX, X addt 
a3, t3, a3 LD x2, 0 * SIZE(X) mult x3, x3, t3 unop addt a0, t0, a0 LD x3, 1 * SIZE(X) mult x4, x4, t0 addq X, INCX, X addt a1, t1, a1 LD x4, 0 * SIZE(X) mult x5, x5, t1 lda I, -1(I) addt a2, t2, a2 LD x5, 1 * SIZE(X) mult x6, x6, t2 addq X, INCX, X addt a3, t3, a3 LD x6, 0 * SIZE(X) mult x7, x7, t3 bgt I, $L21 .align 4 $L22: addt a0, t0, a0 LD x7, 1 * SIZE(X) mult x0, x0, t0 addq X, INCX, X addt a1, t1, a1 mult x1, x1, t1 addt a2, t2, a2 mult x2, x2, t2 addt a3, t3, a3 mult x3, x3, t3 addt a0, t0, a0 mult x4, x4, t0 addt a1, t1, a1 mult x5, x5, t1 addt a2, t2, a2 mult x6, x6, t2 addt a3, t3, a3 mult x7, x7, t3 addt a2, t2, a2 addt a3, t3, a3 .align 4 $L25: and N, 3, I ble I, $L998 .align 4 $L26: LD x0, 0 * SIZE(X) lda I, -1(I) LD x1, 1 * SIZE(X) addq X, INCX, X addt a0, t0, a0 mult x0, x0, t0 addt a1, t1, a1 mult x1, x1, t1 bgt I, $L26 .align 4 $L998: addt a0, t0, a0 addt a1, t1, a1 addt a0, a1, a0 addt a2, a3, a2 #if defined(EV4) || defined(EV5) addt a0, a2, $f16 jsr $26, ($27), sqrt !lituse_jsr!2 ldah $29, 0($26) !gpdisp!3 lda $29, 0($29) !gpdisp!3 #else addt a0, a2, a0 sqrtt a0, a0 #endif .align 4 $L999: #if defined(EV4) || defined(EV5) ldq $26, 0($sp) lda $sp, 16($sp) #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/zrot.S000066400000000000000000000246421313527062700166430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
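The next file, zrot.S, applies a plane rotation with a real cosine C and a real sine S to a pair of complex vectors, with a fast path for unit stride and a strided path at $L50. A minimal C sketch of the per-element update follows; the name zrot_ref and the stride convention (strides given in complex elements, matching the doubled increments set up in the assembly prologue) are illustrative assumptions.

// Hypothetical reference sketch, not part of OpenBLAS.  Each complex element
// is treated as two independent reals, so the same c/s update runs once on the
// real part and once on the imaginary part, exactly as the MUL/ADD/SUB
// pattern in the assembly does.
static void zrot_ref(long n, double *x, long incx, double *y, long incy,
                     double c, double s)
{
    for (long i = 0; i < n; i++) {
        for (int k = 0; k < 2; k++) {          // k = 0: real part, k = 1: imaginary part
            double xv = x[2 * i * incx + k];
            double yv = y[2 * i * incy + k];
            x[2 * i * incx + k] = c * xv + s * yv;   // x <- c*x + s*y
            y[2 * i * incy + k] = c * yv - s * xv;   // y <- c*y - s*x (old x)
        }
    }
}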
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define N $16 #define X $17 #define INCX $18 #define Y $19 #define INCY $20 #define I $21 #define XX $23 #define YY $24 #define C $f10 #define S $f11 #define PREFETCH_SIZE 80 PROLOGUE PROFCODE .frame $sp, 0, $26, 0 #ifndef PROFILE .prologue 0 #else .prologue 1 #endif fmov $f21, C LD S, 0($sp) addq INCX, INCX, INCX addq INCY, INCY, INCY cmpeq INCX, 2, $23 cmpeq INCY, 2, $24 ble N, $L998 and $23, $24, $23 beq $23, $L50 sra N, 2, I ble I, $L15 LD $f12, 0*SIZE(X) LD $f13, 0*SIZE(Y) LD $f14, 1*SIZE(X) LD $f15, 1*SIZE(Y) LD $f16, 2*SIZE(X) LD $f17, 2*SIZE(Y) LD $f18, 3*SIZE(X) LD $f19, 3*SIZE(Y) MUL C, $f12, $f21 unop MUL S, $f13, $f22 MUL C, $f13, $f23 LD $f13, 4*SIZE(Y) MUL S, $f12, $f24 LD $f12, 4*SIZE(X) MUL C, $f14, $f25 lda I, -1(I) MUL S, $f15, $f26 ADD $f21, $f22, $f22 MUL C, $f15, $f27 LD $f15, 5*SIZE(Y) MUL S, $f14, $f28 SUB $f23, $f24, $f24 ble I, $L13 .align 4 $L12: MUL C, $f16, $f21 lds $f31, (PREFETCH_SIZE) * SIZE(X) unop LD $f14, 5*SIZE(X) ST $f22, 0*SIZE(X) MUL S, $f17, $f22 unop ADD $f25, $f26, $f26 MUL C, $f17, $f23 lds $f31, (PREFETCH_SIZE) * SIZE(Y) unop LD $f17, 6*SIZE(Y) ST $f24, 0*SIZE(Y) MUL S, $f16, $f24 unop SUB $f27, $f28, $f28 MUL C, $f18, $f25 LD $f16, 6*SIZE(X) unop unop ST $f26, 1*SIZE(X) MUL S, $f19, $f26 unop ADD $f21, $f22, $f22 MUL C, $f19, $f27 unop unop LD $f19, 7*SIZE(Y) ST $f28, 1*SIZE(Y) MUL S, $f18, $f28 unop SUB $f23, $f24, $f24 MUL C, $f12, $f21 LD $f18, 7*SIZE(X) unop unop ST $f22, 2*SIZE(X) unop MUL S, $f13, $f22 ADD $f25, $f26, $f26 MUL C, $f13, $f23 LD $f13, 8*SIZE(Y) unop unop ST $f24, 2*SIZE(Y) MUL S, $f12, $f24 unop SUB $f27, $f28, $f28 MUL C, $f14, $f25 LD $f12, 8*SIZE(X) unop unop ST $f26, 3*SIZE(X) MUL S, $f15, $f26 unop ADD $f21, $f22, $f22 MUL C, $f15, $f27 LD $f15, 9*SIZE(Y) unop unop ST $f28, 3*SIZE(Y) MUL S, $f14, $f28 unop SUB $f23, $f24, $f24 MUL C, $f16, $f21 LD $f14, 9*SIZE(X) unop unop ST $f22, 4*SIZE(X) MUL S, $f17, $f22 unop ADD $f25, $f26, $f26 MUL C, $f17, $f23 LD $f17, 10*SIZE(Y) unop unop ST $f24, 4*SIZE(Y) MUL S, $f16, $f24 unop SUB $f27, $f28, $f28 MUL C, $f18, $f25 LD $f16, 10*SIZE(X) unop unop ST $f26, 5*SIZE(X) MUL S, $f19, $f26 unop ADD $f21, $f22, $f22 MUL C, $f19, $f27 LD $f19, 11*SIZE(Y) unop unop ST $f28, 5*SIZE(Y) MUL S, $f18, $f28 lda I, -1(I) SUB $f23, $f24, $f24 MUL C, $f12, $f21 LD $f18, 11*SIZE(X) unop unop ST $f22, 6*SIZE(X) MUL S, $f13, $f22 unop ADD $f25, $f26, $f26 MUL C, $f13, $f23 LD $f13, 12*SIZE(Y) lda X, 8*SIZE(X) unop ST $f24, 6*SIZE(Y) MUL S, $f12, $f24 unop SUB $f27, $f28, $f28 MUL C, $f14, $f25 LD $f12, 4*SIZE(X) lda Y, 8*SIZE(Y) unop ST $f26, -1*SIZE(X) MUL S, $f15, $f26 unop ADD $f21, $f22, $f22 MUL C, $f15, $f27 LD $f15, 5*SIZE(Y) unop unop ST $f28, -1*SIZE(Y) MUL S, $f14, $f28 SUB $f23, $f24, $f24 bgt I, $L12 .align 4 $L13: MUL C, $f16, $f21 LD $f14, 5*SIZE(X) unop unop ST $f22, 0*SIZE(X) MUL S, $f17, $f22 unop ADD $f25, $f26, $f26 MUL C, $f17, $f23 unop unop LD $f17, 6*SIZE(Y) ST $f24, 0*SIZE(Y) MUL S, $f16, $f24 LD $f16, 6*SIZE(X) SUB $f27, $f28, $f28 MUL C, $f18, $f25 unop unop unop ST $f26, 1*SIZE(X) MUL S, $f19, $f26 unop ADD $f21, $f22, $f22 MUL C, $f19, $f27 unop unop LD $f19, 7*SIZE(Y) ST $f28, 1*SIZE(Y) MUL S, $f18, $f28 LD $f18, 7*SIZE(X) SUB $f23, $f24, $f24 MUL C, $f12, $f21 unop unop unop ST $f22, 2*SIZE(X) unop MUL S, $f13, $f22 ADD $f25, $f26, $f26 MUL C, $f13, $f23 unop unop unop ST $f24, 2*SIZE(Y) MUL S, $f12, $f24 unop SUB $f27, $f28, $f28 MUL C, 
$f14, $f25 unop unop unop ST $f26, 3*SIZE(X) MUL S, $f15, $f26 unop ADD $f21, $f22, $f22 MUL C, $f15, $f27 unop unop unop ST $f28, 3*SIZE(Y) MUL S, $f14, $f28 unop SUB $f23, $f24, $f24 MUL C, $f16, $f21 unop unop unop ST $f22, 4*SIZE(X) MUL S, $f17, $f22 unop ADD $f25, $f26, $f26 MUL C, $f17, $f23 unop unop unop ST $f24, 4*SIZE(Y) MUL S, $f16, $f24 unop SUB $f27, $f28, $f28 MUL C, $f18, $f25 unop unop unop ST $f26, 5*SIZE(X) MUL S, $f19, $f26 unop ADD $f21, $f22, $f22 MUL C, $f19, $f27 unop unop unop ST $f28, 5*SIZE(Y) MUL S, $f18, $f28 unop SUB $f23, $f24, $f24 ST $f22, 6*SIZE(X) ADD $f25, $f26, $f26 ST $f24, 6*SIZE(Y) SUB $f27, $f28, $f28 ST $f26, 7*SIZE(X) lda X, 8*SIZE(X) ST $f28, 7*SIZE(Y) lda Y, 8*SIZE(Y) .align 4 $L15: and N, 3, I ble I, $L998 .align 4 $L16: LD $f12, 0*SIZE(X) LD $f13, 0*SIZE(Y) LD $f14, 1*SIZE(X) LD $f15, 1*SIZE(Y) MUL C, $f12, $f21 MUL S, $f13, $f22 MUL C, $f13, $f23 MUL S, $f12, $f24 ADD $f21, $f22, $f22 SUB $f23, $f24, $f24 MUL C, $f14, $f25 MUL S, $f15, $f26 MUL C, $f15, $f27 MUL S, $f14, $f28 ADD $f25, $f26, $f26 SUB $f27, $f28, $f28 ST $f22, 0*SIZE(X) ST $f24, 0*SIZE(Y) lda I, -1(I) ST $f26, 1*SIZE(X) lda X, 2 * SIZE(X) ST $f28, 1*SIZE(Y) lda Y, 2 * SIZE(Y) bgt I, $L16 .align 4 $L998: clr $0 ret .align 4 $L50: mov X, XX mov Y, YY sra N, 2, I ble I, $L55 .align 4 $L51: LD $f12, 0*SIZE(X) LD $f13, 0*SIZE(Y) LD $f14, 1*SIZE(X) SXADDQ INCX, X, X LD $f15, 1*SIZE(Y) SXADDQ INCY, Y, Y MUL C, $f12, $f21 MUL S, $f13, $f22 MUL C, $f13, $f23 MUL S, $f12, $f24 ADD $f21, $f22, $f22 SUB $f23, $f24, $f24 MUL C, $f14, $f25 MUL S, $f15, $f26 MUL C, $f15, $f27 MUL S, $f14, $f28 ADD $f25, $f26, $f26 SUB $f27, $f28, $f28 ST $f22, 0*SIZE(XX) ST $f24, 0*SIZE(YY) ST $f26, 1*SIZE(XX) SXADDQ INCX, XX, XX ST $f28, 1*SIZE(YY) SXADDQ INCY, YY, YY LD $f12, 0*SIZE(X) LD $f13, 0*SIZE(Y) LD $f14, 1*SIZE(X) SXADDQ INCX, X, X LD $f15, 1*SIZE(Y) SXADDQ INCY, Y, Y MUL C, $f12, $f21 MUL S, $f13, $f22 MUL C, $f13, $f23 MUL S, $f12, $f24 ADD $f21, $f22, $f22 SUB $f23, $f24, $f24 MUL C, $f14, $f25 MUL S, $f15, $f26 MUL C, $f15, $f27 MUL S, $f14, $f28 ADD $f25, $f26, $f26 SUB $f27, $f28, $f28 ST $f22, 0*SIZE(XX) ST $f24, 0*SIZE(YY) ST $f26, 1*SIZE(XX) SXADDQ INCX, XX, XX ST $f28, 1*SIZE(YY) SXADDQ INCY, YY, YY LD $f12, 0*SIZE(X) LD $f13, 0*SIZE(Y) LD $f14, 1*SIZE(X) SXADDQ INCX, X, X LD $f15, 1*SIZE(Y) SXADDQ INCY, Y, Y MUL C, $f12, $f21 MUL S, $f13, $f22 MUL C, $f13, $f23 MUL S, $f12, $f24 ADD $f21, $f22, $f22 SUB $f23, $f24, $f24 MUL C, $f14, $f25 MUL S, $f15, $f26 MUL C, $f15, $f27 MUL S, $f14, $f28 ADD $f25, $f26, $f26 SUB $f27, $f28, $f28 ST $f22, 0*SIZE(XX) ST $f24, 0*SIZE(YY) ST $f26, 1*SIZE(XX) SXADDQ INCX, XX, XX ST $f28, 1*SIZE(YY) SXADDQ INCY, YY, YY LD $f12, 0*SIZE(X) LD $f13, 0*SIZE(Y) LD $f14, 1*SIZE(X) SXADDQ INCX, X, X LD $f15, 1*SIZE(Y) SXADDQ INCY, Y, Y MUL C, $f12, $f21 MUL S, $f13, $f22 MUL C, $f13, $f23 MUL S, $f12, $f24 ADD $f21, $f22, $f22 SUB $f23, $f24, $f24 MUL C, $f14, $f25 MUL S, $f15, $f26 MUL C, $f15, $f27 MUL S, $f14, $f28 ADD $f25, $f26, $f26 SUB $f27, $f28, $f28 ST $f22, 0*SIZE(XX) ST $f24, 0*SIZE(YY) ST $f26, 1*SIZE(XX) SXADDQ INCX, XX, XX ST $f28, 1*SIZE(YY) SXADDQ INCY, YY, YY lda I, -1(I) bgt I, $L51 .align 4 $L55: and N, 3, I ble I, $L999 .align 4 $L56: LD $f12, 0*SIZE(X) LD $f13, 0*SIZE(Y) LD $f14, 1*SIZE(X) LD $f15, 1*SIZE(Y) MUL C, $f12, $f21 MUL S, $f13, $f22 MUL C, $f13, $f23 MUL S, $f12, $f24 ADD $f21, $f22, $f22 SUB $f23, $f24, $f24 MUL C, $f14, $f25 MUL S, $f15, $f26 MUL C, $f15, $f27 MUL S, $f14, $f28 ADD $f25, $f26, $f26 SUB $f27, $f28, $f28 ST $f22, 
0*SIZE(X) ST $f24, 0*SIZE(Y) lda I, -1(I) ST $f26, 1*SIZE(X) ST $f28, 1*SIZE(Y) SXADDQ INCX, X, X SXADDQ INCY, Y, Y bgt I, $L56 .align 4 $L999: clr $0 ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/zscal.S000066400000000000000000000132351313527062700167550ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
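The next file, zscal.S, scales a complex vector in place by a complex scalar ALPHA_R + i*ALPHA_I. The four multiplies followed by a SUB and an ADD in the unrolled loop are the standard complex product, as in this minimal C sketch; the name zscal_ref and the double type are illustrative assumptions, not OpenBLAS code.

// Hypothetical reference sketch, not part of OpenBLAS.  n is the number of
// complex elements, incx the stride in complex elements, x the interleaved
// (re, im) data.  Each element is replaced by alpha * element:
//   re' = alpha_r*re - alpha_i*im
//   im' = alpha_i*re + alpha_r*im
static void zscal_ref(long n, double alpha_r, double alpha_i,
                      double *x, long incx)
{
    for (long i = 0; i < n; i++) {
        double re = x[2 * i * incx];
        double im = x[2 * i * incx + 1];
        x[2 * i * incx]     = alpha_r * re - alpha_i * im;
        x[2 * i * incx + 1] = alpha_i * re + alpha_r * im;
    }
}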
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #define PREFETCHSIZE 88 #define N $16 #define X $21 #define INCX $17 #define XX $18 #define I $19 #define ALPHA_R $f19 #define ALPHA_I $f20 #define s0 $f0 #define s1 $f1 #define s2 $f10 #define s3 $f11 #define a0 $f12 #define a1 $f13 #define a2 $f14 #define a3 $f15 #define a4 $f16 #define a5 $f17 #define a6 $f18 #define a7 $f21 #define t0 $f22 #define t1 $f23 #define t2 $f24 #define t3 $f25 #define t4 $f26 #define t5 $f27 #define t6 $f28 #define t7 $f29 PROLOGUE PROFCODE ldq INCX, 0($sp) mov X, XX ble N, $L999 addq INCX, INCX, INCX sra N, 2, I ble I, $L15 LD a0, 0 * SIZE(X) LD a1, 1 * SIZE(X) SXADDQ INCX, X, X LD a2, 0 * SIZE(X) LD a3, 1 * SIZE(X) SXADDQ INCX, X, X LD a4, 0 * SIZE(X) LD a5, 1 * SIZE(X) SXADDQ INCX, X, X LD a6, 0 * SIZE(X) LD a7, 1 * SIZE(X) SXADDQ INCX, X, X MUL a0, ALPHA_R, t0 MUL a1, ALPHA_I, t1 MUL a0, ALPHA_I, t2 MUL a1, ALPHA_R, t3 SUB t0, t1, t4 ADD t2, t3, t5 lda I, -1(I) ble I, $L13 .align 4 $L12: ST t4, 0 * SIZE(XX) MUL a2, ALPHA_R, t0 ST t5, 1 * SIZE(XX) MUL a3, ALPHA_I, t1 MUL a2, ALPHA_I, t2 LD a0, 0 * SIZE(X) MUL a3, ALPHA_R, t3 LD a1, 1 * SIZE(X) SUB t0, t1, t6 SXADDQ INCX, XX, XX ADD t2, t3, t7 SXADDQ INCX, X, X MUL a4, ALPHA_R, t0 ST t6, 0 * SIZE(XX) MUL a5, ALPHA_I, t1 ST t7, 1 * SIZE(XX) MUL a4, ALPHA_I, t2 LD a2, 0 * SIZE(X) MUL a5, ALPHA_R, t3 LD a3, 1 * SIZE(X) SUB t0, t1, t4 SXADDQ INCX, XX, XX ADD t2, t3, t5 SXADDQ INCX, X, X MUL a6, ALPHA_R, t0 ST t4, 0 * SIZE(XX) MUL a7, ALPHA_I, t1 ST t5, 1 * SIZE(XX) MUL a6, ALPHA_I, t2 LD a4, 0 * SIZE(X) MUL a7, ALPHA_R, t3 LD a5, 1 * SIZE(X) SUB t0, t1, t6 SXADDQ INCX, XX, XX ADD t2, t3, t7 SXADDQ INCX, X, X MUL a0, ALPHA_R, t0 ST t6, 0 * SIZE(XX) MUL a1, ALPHA_I, t1 ST t7, 1 * SIZE(XX) MUL a0, ALPHA_I, t2 LD a6, 0 * SIZE(X) MUL a1, ALPHA_R, t3 LD a7, 1 * SIZE(X) SUB t0, t1, t4 lda I, -1(I) ADD t2, t3, t5 SXADDQ INCX, XX, XX lds $f31, PREFETCHSIZE * SIZE(X) unop SXADDQ INCX, X, X bne I, $L12 .align 4 $L13: MUL a2, ALPHA_R, t0 MUL a3, ALPHA_I, t1 ST t4, 0 * SIZE(XX) MUL a2, ALPHA_I, t2 ST t5, 1 * SIZE(XX) MUL a3, ALPHA_R, t3 SUB t0, t1, t6 SXADDQ INCX, XX, XX ADD t2, t3, t7 unop ST t6, 0 * SIZE(XX) MUL a4, ALPHA_R, t0 ST t7, 1 * SIZE(XX) MUL a5, ALPHA_I, t1 MUL a4, ALPHA_I, t2 MUL a5, ALPHA_R, t3 SUB t0, t1, t4 SXADDQ INCX, XX, XX ADD t2, t3, t5 unop MUL a6, ALPHA_R, t0 ST t4, 0 * SIZE(XX) MUL a7, ALPHA_I, t1 ST t5, 1 * SIZE(XX) MUL a6, ALPHA_I, t2 MUL a7, ALPHA_R, t3 SUB t0, t1, t6 SXADDQ INCX, XX, XX ADD t2, t3, t7 ST t6, 0 * SIZE(XX) ST t7, 1 * SIZE(XX) SXADDQ INCX, XX, XX .align 4 $L15: and N, 3, I unop unop ble I, $L999 .align 4 $L17: LD a0, 0 * SIZE(X) LD a1, 1 * SIZE(X) SXADDQ INCX, X, X MUL a0, ALPHA_R, t0 MUL a1, ALPHA_I, t1 MUL a0, ALPHA_I, t2 MUL a1, ALPHA_R, t3 SUB t0, t1, t4 ADD t2, t3, t5 ST t4, 0 * SIZE(XX) ST t5, 1 * SIZE(XX) SXADDQ INCX, XX, XX lda I, -1(I) bne I, $L17 .align 4 $L999: ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/zswap.S000066400000000000000000000137071313527062700170110ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" PROLOGUE PROFCODE .frame $sp, 0, $26, 0 mov $21, $17 ldl $18, 0($sp) ldq $19, 8($sp) ldl $20, 16($sp) #ifndef PROFILE .prologue 0 #else .prologue 1 #endif ble $16, $SubEnd # if n <= 0 goto $End cmpeq $18, 1, $1 addq $18, $18, $18 cmpeq $20, 1, $2 addq $20, $20, $20 sra $16, 2, $21 and $1, $2, $1 and $16, 3, $22 beq $1, $Sub ble $21, $MainRemain .align 4 $MainLoop: LD $f10, 0*SIZE($19) LD $f11, 1*SIZE($19) LD $f12, 2*SIZE($19) LD $f13, 3*SIZE($19) LD $f14, 4*SIZE($19) LD $f15, 5*SIZE($19) LD $f16, 6*SIZE($19) LD $f17, 7*SIZE($19) LD $f20, 0*SIZE($17) LD $f21, 1*SIZE($17) LD $f22, 2*SIZE($17) LD $f23, 3*SIZE($17) LD $f24, 4*SIZE($17) LD $f25, 5*SIZE($17) LD $f26, 6*SIZE($17) LD $f27, 7*SIZE($17) lds $f31, 16*SIZE($17) unop lds $f31, 16*SIZE($19) subl $21, 1, $21 ST $f10, 0*SIZE($17) ST $f11, 1*SIZE($17) ST $f12, 2*SIZE($17) ST $f13, 3*SIZE($17) ST $f14, 4*SIZE($17) ST $f15, 5*SIZE($17) ST $f16, 6*SIZE($17) ST $f17, 7*SIZE($17) ST $f20, 0*SIZE($19) ST $f21, 1*SIZE($19) ST $f22, 2*SIZE($19) ST $f23, 3*SIZE($19) ST $f24, 4*SIZE($19) ST $f25, 5*SIZE($19) ST $f26, 6*SIZE($19) ST $f27, 7*SIZE($19) lda $17, 8*SIZE($17) lda $19, 8*SIZE($19) bgt $21, $MainLoop .align 4 $MainRemain: ble $22, $MainEnd .align 4 $MainRemainLoop: LD $f10, 0*SIZE($19) LD $f11, 1*SIZE($19) LD $f20, 0*SIZE($17) LD $f21, 1*SIZE($17) lda $17, 2*SIZE($17) lda $19, 2*SIZE($19) subl $22, 1, $22 ST $f10, -2*SIZE($17) ST $f11, -1*SIZE($17) ST $f20, -2*SIZE($19) ST $f21, -1*SIZE($19) bgt $22, $MainRemainLoop .align 4 $MainEnd: clr $0 ret .align 4 $Sub: mov $17, $23 mov $19, $24 ble $21, $SubRemain .align 4 $SubLoop: LD $f10, 0*SIZE($19) LD $f11, 1*SIZE($19) SXADDQ $20, $19, $19 LD $f12, 0*SIZE($19) LD $f13, 1*SIZE($19) SXADDQ $20, $19, $19 LD $f14, 0*SIZE($19) LD $f15, 1*SIZE($19) SXADDQ $20, $19, $19 LD $f16, 0*SIZE($19) LD $f17, 1*SIZE($19) SXADDQ $20, $19, $19 LD $f20, 0*SIZE($17) LD $f21, 1*SIZE($17) SXADDQ $18, $17, $17 LD $f22, 
0*SIZE($17) LD $f23, 1*SIZE($17) SXADDQ $18, $17, $17 LD $f24, 0*SIZE($17) LD $f25, 1*SIZE($17) SXADDQ $18, $17, $17 LD $f26, 0*SIZE($17) LD $f27, 1*SIZE($17) SXADDQ $18, $17, $17 ST $f10, 0*SIZE($23) ST $f11, 1*SIZE($23) SXADDQ $18, $23, $23 ST $f12, 0*SIZE($23) ST $f13, 1*SIZE($23) SXADDQ $18, $23, $23 ST $f14, 0*SIZE($23) ST $f15, 1*SIZE($23) SXADDQ $18, $23, $23 ST $f16, 0*SIZE($23) ST $f17, 1*SIZE($23) SXADDQ $18, $23, $23 ST $f20, 0*SIZE($24) ST $f21, 1*SIZE($24) SXADDQ $20, $24, $24 ST $f22, 0*SIZE($24) ST $f23, 1*SIZE($24) SXADDQ $20, $24, $24 ST $f24, 0*SIZE($24) ST $f25, 1*SIZE($24) SXADDQ $20, $24, $24 ST $f26, 0*SIZE($24) ST $f27, 1*SIZE($24) SXADDQ $20, $24, $24 subl $21, 1, $21 bgt $21, $SubLoop .align 4 $SubRemain: ble $22, $SubEnd .align 4 $SubRemainLoop: LD $f10, 0*SIZE($19) LD $f11, 1*SIZE($19) LD $f20, 0*SIZE($17) LD $f21, 1*SIZE($17) subl $22, 1, $22 ST $f10, 0*SIZE($17) ST $f11, 1*SIZE($17) ST $f20, 0*SIZE($19) ST $f21, 1*SIZE($19) SXADDQ $18, $17, $17 SXADDQ $20, $19, $19 bgt $22, $SubRemainLoop .align 4 $SubEnd: clr $0 ret EPILOGUE OpenBLAS-0.2.20/kernel/alpha/ztrsm_kernel_2x2_LN.S000066400000000000000000001002501313527062700214360ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #if !defined(EV4) && !defined(EV5) && !defined(EV6) #error "Architecture is not specified." 
#endif #ifdef EV6 #define PREFETCHSIZE 56 #define UNOP unop #endif #ifdef EV5 #define PREFETCHSIZE 48 #define UNOP #endif #ifdef EV4 #define UNOP #endif .set noat .set noreorder .arch ev6 .text .align 5 .globl CNAME .ent CNAME #define STACKSIZE 80 #define M $16 #define N $17 #define K $18 #define A $21 #define B $22 #define C $20 #define LDC $23 #define C1 $19 #define C2 $24 #define AO $at #define BO $5 #define I $6 #define J $7 #define L $8 #define a1 $f16 #define a2 $f17 #define a3 $f18 #define a4 $f19 #define b1 $f20 #define b2 $f21 #define b3 $f22 #define b4 $f23 #define t1 $f24 #define t2 $f25 #define t3 $f26 #define t4 $f27 #define a5 $f28 #define a6 $f30 #define b5 $f29 #define alpha_i $f29 #define alpha_r $f30 #define c01 $f0 #define c02 $f1 #define c03 $f2 #define c04 $f3 #define c05 $f4 #define c06 $f5 #define c07 $f6 #define c08 $f7 #define c09 $f8 #define c10 $f9 #define c11 $f10 #define c12 $f11 #define c13 $f12 #define c14 $f13 #define c15 $f14 #define c16 $f15 #define TMP1 $0 #define TMP2 $1 #define KK $2 #define AORIG $3 #define OFFSET $4 #if defined(LN) || defined(LT) #ifndef CONJ #define ADD1 ADD #define ADD2 SUB #define ADD3 ADD #define ADD4 ADD #define ADD5 SUB #define ADD6 ADD #else #define ADD1 ADD #define ADD2 ADD #define ADD3 SUB #define ADD4 ADD #define ADD5 ADD #define ADD6 SUB #endif #else #ifndef CONJ #define ADD1 ADD #define ADD2 SUB #define ADD3 ADD #define ADD4 ADD #define ADD5 SUB #define ADD6 ADD #else #define ADD1 ADD #define ADD2 ADD #define ADD3 ADD #define ADD4 SUB #define ADD5 ADD #define ADD6 SUB #endif #endif CNAME: .frame $sp, STACKSIZE, $26, 0 #ifdef PROFILE ldgp $gp, 0($27) lda $at, _mcount jsr $at, ($at), _mcount #endif #ifndef PROFILE .prologue 0 #else .prologue 1 #endif lda $sp, -STACKSIZE($sp) ldq B, 0 + STACKSIZE($sp) ldq C, 8 + STACKSIZE($sp) ldq LDC, 16 + STACKSIZE($sp) ldq OFFSET, 24 + STACKSIZE($sp) sll LDC, ZBASE_SHIFT, LDC stt $f2, 0($sp) stt $f3, 8($sp) stt $f4, 16($sp) stt $f5, 24($sp) stt $f6, 32($sp) stt $f7, 40($sp) stt $f8, 48($sp) stt $f9, 56($sp) cmple M, 0, $0 cmple N, 0, $1 cmple K, 0, $2 or $0, $1, $0 or $0, $2, $0 bne $0, $L999 #ifdef LN addq M, M, TMP2 mulq TMP2, K, TMP1 SXADDQ TMP1, A, A SXADDQ TMP2, C, C #endif #ifdef RN negq OFFSET, KK #endif #ifdef RT mulq N, K, TMP1 addq TMP1, TMP1, TMP1 SXADDQ TMP1, B, B mulq N, LDC, TMP1 addq TMP1, C, C subq N, OFFSET, KK #endif sra N, 1, J ble J, $L30 .align 4 $L01: #ifdef RT sll K, ZBASE_SHIFT + 1, TMP1 subq B, TMP1, B subq C, LDC, C2 subq C2, LDC, C1 subq C2, LDC, C #else mov C, C1 addq C, LDC, C2 addq C2, LDC, C #endif #ifdef LN addq M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif and M, 1, I fclr t1 fclr t2 fclr t3 fclr t4 fclr c01 fclr c05 ble I, $L20 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(B) fclr c10 LD b2, 1 * SIZE(B) fclr c14 LD b3, 2 * SIZE(B) lda AO, 2 * SIZE(AO) LD b4, 3 * SIZE(B) lda BO, 4 * SIZE(B) lda L, -2(KK) ble KK, $L28 ble L, $L25 #else #ifdef LN sll K, ZBASE_SHIFT + 0, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, ZBASE_SHIFT + 0, TMP1 addq AORIG, TMP1, AO sll KK, ZBASE_SHIFT + 1, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(BO) fclr c10 LD b2, 1 * SIZE(BO) fclr c14 LD b3, 2 * SIZE(BO) lda AO, 2 * SIZE(AO) LD b4, 3 * SIZE(BO) lda BO, 4 * SIZE(BO) 
lda L, -2(TMP1) ble TMP1, $L28 ble L, $L25 #endif .align 5 $L22: ADD1 c09, t1, c09 unop MUL a1, b1, t1 unop ADD3 c10, t2, c10 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD4 c13, t3, c13 unop MUL a1, b2, t3 lda BO, 8 * SIZE(BO) ADD2 c14, t4, c14 unop MUL a2, b2, t4 LD b2, -7 * SIZE(BO) ADD1 c01, t1, c01 unop MUL a1, b3, t1 unop ADD3 c02, t2, c02 unop MUL a2, b3, t2 LD b3, -6 * SIZE(BO) ADD4 c05, t3, c05 unop MUL a1, b4, t3 LD a1, 2 * SIZE(AO) ADD2 c06, t4, c06 MUL a2, b4, t4 LD b5, -5 * SIZE(BO) ADD1 c09, t1, c09 unop MUL a3, b1, t1 LD a2, 3 * SIZE(AO) ADD3 c10, t2, c10 unop MUL a4, b1, t2 LD b1, -4 * SIZE(BO) ADD4 c13, t3, c13 unop MUL a3, b2, t3 lda AO, 4 * SIZE(AO) ADD2 c14, t4, c14 MUL a4, b2, t4 LD b2, -3 * SIZE(BO) ADD1 c01, t1, c01 lda L, -2(L) MUL a3, b3, t1 LD b4, -1 * SIZE(BO) ADD3 c02, t2, c02 unop MUL a4, b3, t2 LD b3, -2 * SIZE(BO) ADD4 c05, t3, c05 unop MUL a3, b5, t3 LD a3, 0 * SIZE(AO) ADD2 c06, t4, c06 MUL a4, b5, t4 LD a4, 1 * SIZE(AO) bgt L, $L22 .align 4 $L25: ADD1 c09, t1, c09 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L27 #else blbs TMP1, $L27 #endif .align 4 ADD3 c10, t2, c10 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD4 c13, t3, c13 unop MUL a1, b2, t3 unop ADD2 c14, t4, c14 unop MUL a2, b2, t4 LD b2, 1 * SIZE(BO) ADD1 c01, t1, c01 unop MUL a1, b3, t1 lda AO, 2 * SIZE(AO) ADD3 c02, t2, c02 unop MUL a2, b3, t2 LD b3, 2 * SIZE(BO) ADD4 c05, t3, c05 unop MUL a1, b4, t3 LD a1, -2 * SIZE(AO) ADD2 c06, t4, c06 unop MUL a2, b4, t4 LD a2, -1 * SIZE(AO) ADD1 c09, t1, c09 LD b4, 3 * SIZE(BO) MUL a1, b1, t1 lda BO, 4 * SIZE(BO) .align 4 $L27: ADD3 c10, t2, c10 MUL a2, b1, t2 ADD4 c13, t3, c13 MUL a1, b2, t3 ADD2 c14, t4, c14 MUL a2, b2, t4 ADD1 c01, t1, c01 MUL a1, b3, t1 ADD3 c02, t2, c02 MUL a2, b3, t2 ADD4 c05, t3, c05 MUL a1, b4, t3 ADD2 c06, t4, c06 lda AO, 2 * SIZE(AO) MUL a2, b4, t4 lda BO, 4 * SIZE(BO) ADD1 c09, t1, c09 ADD3 c10, t2, c10 ADD4 c13, t3, c13 ADD2 c14, t4, c14 ADD c01, c06, c01 ADD c02, c05, c02 ADD c09, c14, c09 ADD c10, c13, c10 .align 4 $L28: #if defined(LN) || defined(RT) #ifdef LN subq KK, 1, TMP1 #else subq KK, 2, TMP1 #endif sll TMP1, ZBASE_SHIFT + 0, TMP2 addq AORIG, TMP2, AO sll TMP1, ZBASE_SHIFT + 1, TMP2 addq B, TMP2, BO #else lda AO, -2 * SIZE(AO) lda BO, -4 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c09, c09 SUB a4, c10, c10 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c09, c09 SUB a4, c10, c10 #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a2, c10, t3 MUL a2, c09, t4 MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c09, c09 MUL a1, c10, c10 ADD5 c01, t1, c01 ADD6 c02, t2, c02 ADD5 c09, t3, c09 ADD6 c10, t4, c10 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a1, c01, c01 MUL a1, c02, c02 ADD5 c01, t1, c01 ADD6 c02, t2, c02 MUL a3, c01, t1 MUL a3, c02, t2 SUB c09, t1, c09 SUB c10, t2, c10 MUL a4, c02, t1 MUL a4, c01, t2 ADD6 c09, t1, c09 ADD5 c10, t2, c10 LD a1, 6 * SIZE(BO) LD a2, 7 * SIZE(BO) MUL a2, c10, t1 MUL a2, c09, t2 MUL a1, c09, c09 MUL a1, c10, c10 ADD5 c09, t1, c09 ADD6 c10, t2, c10 #endif #ifdef RT LD a1, 6 * SIZE(BO) LD a2, 7 * SIZE(BO) LD a3, 4 * SIZE(BO) LD a4, 5 * SIZE(BO) MUL a2, c10, t1 MUL a2, c09, t2 MUL a1, c09, c09 MUL a1, c10, c10 ADD5 c09, t1, c09 ADD6 c10, t2, 
c10 MUL a3, c09, t1 MUL a3, c10, t2 SUB c01, t1, c01 SUB c02, t2, c02 MUL a4, c10, t1 MUL a4, c09, t2 ADD6 c01, t1, c01 ADD5 c02, t2, c02 LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a1, c01, c01 MUL a1, c02, c02 ADD5 c01, t1, c01 ADD6 c02, t2, c02 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c02, 1 * SIZE(BO) ST c09, 2 * SIZE(BO) ST c10, 3 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c09, 2 * SIZE(AO) ST c10, 3 * SIZE(AO) #endif #ifdef LN lda C1, -2 * SIZE(C1) lda C2, -2 * SIZE(C2) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c09, 0 * SIZE(C2) ST c10, 1 * SIZE(C2) #ifndef LN lda C1, 2 * SIZE(C1) lda C2, 2 * SIZE(C2) #endif #ifdef RT sll K, ZBASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, ZBASE_SHIFT + 0, TMP2 addq AO, TMP2, AO sll TMP1, ZBASE_SHIFT + 1, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 1, KK #endif #ifdef LN subq KK, 1, KK #endif .align 4 $L20: sra M, 1, I fclr t1 fclr t2 fclr t3 fclr t4 fclr c01 fclr c05 ble I, $L29 .align 4 $L11: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(B) fclr c10 LD b2, 1 * SIZE(B) fclr c14 LD b3, 2 * SIZE(B) fclr c03 LD b4, 3 * SIZE(B) fclr c07 lda BO, 4 * SIZE(B) fclr c11 lda AO, 4 * SIZE(AO) fclr c15 lds $f31, 4 * SIZE(C1) fclr c04 lda L, -2(KK) fclr c08 lds $f31, 4 * SIZE(C2) fclr c12 fclr c16 ble KK, $L18 ble L, $L15 #else #ifdef LN sll K, ZBASE_SHIFT + 1, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, ZBASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AO addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(BO) fclr c10 LD b2, 1 * SIZE(BO) fclr c14 LD b3, 2 * SIZE(BO) fclr c03 LD b4, 3 * SIZE(BO) fclr c07 lda BO, 4 * SIZE(BO) fclr c11 lda AO, 4 * SIZE(AO) fclr c15 lds $f31, 4 * SIZE(C1) fclr c04 lda L, -2(TMP1) fclr c08 lds $f31, 4 * SIZE(C2) fclr c12 fclr c16 ble TMP1, $L18 ble L, $L15 #endif .align 5 $L12: /* 1 */ ADD1 c11, t1, c11 #ifndef EV4 ldq $31, PREFETCHSIZE * SIZE(AO) #else unop #endif MUL b1, a1, t1 #ifndef EV4 ldl $31, PREFETCHSIZE * SIZE(BO) #else unop #endif ADD3 c12, t2, c12 unop MUL b1, a2, t2 unop ADD2 c16, t3, c16 unop MUL b2, a2, t3 LD a5, 0 * SIZE(AO) ADD4 c15, t4, c15 unop MUL b2, a1, t4 LD b5, 0 * SIZE(BO) /* 2 */ ADD1 c01, t1, c01 UNOP MUL b1, a3, t1 UNOP ADD3 c02, t2, c02 UNOP MUL b1, a4, t2 UNOP ADD2 c06, t3, c06 unop MUL b2, a4, t3 unop ADD4 c05, t4, c05 unop MUL b4, a1, t4 unop /* 3 */ ADD1 c03, t1, c03 unop MUL b3, a1, t1 unop ADD3 c04, t2, c04 unop MUL b3, a2, t2 unop ADD2 c08, t3, c08 unop MUL b4, a2, t3 LD a2, 1 * SIZE(AO) ADD4 c13, t4, c13 unop MUL b2, a3, t4 LD b2, 1 * SIZE(BO) /* 4 */ ADD1 c09, t1, c09 unop MUL b3, a3, t1 LD a6, 2 * SIZE(AO) ADD3 c10, t2, c10 unop MUL b3, a4, t2 LD b3, 2 * SIZE(BO) ADD2 c14, t3, c14 unop MUL b4, a4, t3 LD a4, 3 * SIZE(AO) ADD4 c07, t4, c07 unop MUL b4, a3, t4 LD b4, 3 * SIZE(BO) /* 5 */ ADD1 c11, t1, c11 unop MUL b5, a5, t1 LD a1, 4 * SIZE(AO) ADD3 c12, t2, c12 lda L, -2(L) MUL b5, a2, t2 LD b1, 4 * SIZE(BO) ADD2 c16, t3, c16 unop MUL b2, a2, t3 unop ADD4 c15, t4, c15 unop MUL b2, a5, t4 unop /* 6 */ ADD1 c01, t1, c01 unop MUL b5, a6, t1 unop ADD3 c02, t2, c02 unop MUL b5, a4, t2 unop ADD2 c06, t3, c06 unop MUL b2, a4, t3 unop ADD4 c05, t4, c05 unop MUL b4, a5, t4 unop /* 7 */ ADD1 c03, t1, c03 lda AO, 8 * SIZE(AO) MUL b3, a5, t1 unop 
ADD3 c04, t2, c04 lda BO, 8 * SIZE(BO) MUL b3, a2, t2 unop ADD2 c08, t3, c08 unop MUL b4, a2, t3 LD a2, -3 * SIZE(AO) ADD4 c13, t4, c13 unop MUL b2, a6, t4 LD b2, -3 * SIZE(BO) /* 8 */ ADD1 c09, t1, c09 unop MUL b3, a6, t1 LD a3, -2 * SIZE(AO) ADD3 c10, t2, c10 unop MUL b3, a4, t2 LD b3, -2 * SIZE(BO) ADD2 c14, t3, c14 unop MUL b4, a4, t3 LD a4, -1 * SIZE(AO) ADD4 c07, t4, c07 MUL b4, a6, t4 LD b4, -1 * SIZE(BO) bgt L, $L12 .align 4 $L15: ADD1 c11, t1, c11 unop MUL b1, a1, t1 #if defined(LT) || defined(RN) blbs KK, $L17 #else blbs TMP1, $L17 #endif .align 4 ADD3 c12, t2, c12 MUL b1, a2, t2 ADD2 c16, t3, c16 MUL b2, a2, t3 ADD4 c15, t4, c15 MUL b2, a1, t4 ADD1 c01, t1, c01 MUL b1, a3, t1 ADD3 c02, t2, c02 unop MUL b1, a4, t2 LD b1, 0 * SIZE(BO) ADD2 c06, t3, c06 MUL b2, a4, t3 ADD4 c05, t4, c05 MUL b4, a1, t4 ADD1 c03, t1, c03 unop MUL b3, a1, t1 LD a1, 0 * SIZE(AO) ADD3 c04, t2, c04 unop MUL b3, a2, t2 unop ADD2 c08, t3, c08 unop MUL b4, a2, t3 LD a2, 1 * SIZE(AO) ADD4 c13, t4, c13 unop MUL b2, a3, t4 LD b2, 1 * SIZE(BO) ADD1 c09, t1, c09 unop MUL b3, a3, t1 lda AO, 4 * SIZE(AO) ADD3 c10, t2, c10 unop MUL b3, a4, t2 LD b3, 2 * SIZE(BO) ADD2 c14, t3, c14 unop MUL b4, a4, t3 LD a4, -1 * SIZE(AO) ADD4 c07, t4, c07 unop MUL b4, a3, t4 LD a3, -2 * SIZE(AO) ADD1 c11, t1, c11 LD b4, 3 * SIZE(BO) MUL b1, a1, t1 lda BO, 4 * SIZE(BO) .align 4 $L17: ADD3 c12, t2, c12 MUL b1, a2, t2 ADD2 c16, t3, c16 MUL b2, a2, t3 ADD4 c15, t4, c15 MUL b2, a1, t4 ADD1 c01, t1, c01 MUL b1, a3, t1 ADD3 c02, t2, c02 MUL b1, a4, t2 ADD2 c06, t3, c06 MUL b2, a4, t3 ADD4 c05, t4, c05 MUL b4, a1, t4 ADD1 c03, t1, c03 MUL b3, a1, t1 ADD3 c04, t2, c04 MUL b3, a2, t2 ADD2 c08, t3, c08 MUL b4, a2, t3 ADD4 c13, t4, c13 MUL b2, a3, t4 ADD1 c09, t1, c09 MUL b3, a3, t1 ADD3 c10, t2, c10 MUL b3, a4, t2 ADD2 c14, t3, c14 MUL b4, a4, t3 ADD4 c07, t4, c07 lda AO, 4 * SIZE(AO) MUL b4, a3, t4 lda BO, 4 * SIZE(BO) ADD1 c11, t1, c11 ADD3 c12, t2, c12 ADD2 c16, t3, c16 ADD4 c15, t4, c15 ADD c01, c06, c01 ADD c02, c05, c02 ADD c03, c08, c03 ADD c04, c07, c04 ADD c09, c14, c09 ADD c10, c13, c10 ADD c11, c16, c11 ADD c12, c15, c12 .align 4 $L18: #if defined(LN) || defined(RT) #ifdef LN subq KK, 2, TMP1 #else subq KK, 2, TMP1 #endif sll TMP1, ZBASE_SHIFT + 1, TMP2 addq AORIG, TMP2, AO sll TMP1, ZBASE_SHIFT + 1, TMP2 addq B, TMP2, BO #else lda AO, -4 * SIZE(AO) lda BO, -4 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c09, c09 SUB a4, c10, c10 SUB b1, c03, c03 SUB b2, c04, c04 SUB b3, c11, c11 SUB b4, c12, c12 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) LD b4, 7 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 SUB b1, c09, c09 SUB b2, c10, c10 SUB b3, c11, c11 SUB b4, c12, c12 #endif #ifdef LN LD a1, 6 * SIZE(AO) LD a2, 7 * SIZE(AO) LD a3, 4 * SIZE(AO) LD a4, 5 * SIZE(AO) MUL a2, c04, t1 MUL a2, c03, t2 MUL a2, c12, t3 MUL a2, c11, t4 MUL a1, c03, c03 MUL a1, c04, c04 MUL a1, c11, c11 MUL a1, c12, c12 ADD5 c03, t1, c03 ADD6 c04, t2, c04 ADD5 c11, t3, c11 ADD6 c12, t4, c12 MUL a3, c03, t1 MUL a3, c04, t2 MUL a3, c11, t3 MUL a3, c12, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c09, t3, c09 SUB c10, t4, c10 MUL a4, c04, t1 MUL a4, c03, t2 MUL a4, c12, t3 MUL a4, c11, t4 ADD6 c01, t1, c01 ADD5 c02, t2, c02 ADD6 c09, t3, 
c09 ADD5 c10, t4, c10 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a2, c10, t3 MUL a2, c09, t4 MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c09, c09 MUL a1, c10, c10 ADD5 c01, t1, c01 ADD6 c02, t2, c02 ADD5 c09, t3, c09 ADD6 c10, t4, c10 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a2, c10, t3 MUL a2, c09, t4 MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c09, c09 MUL a1, c10, c10 ADD5 c01, t1, c01 ADD6 c02, t2, c02 ADD5 c09, t3, c09 ADD6 c10, t4, c10 MUL a3, c01, t1 MUL a3, c02, t2 MUL a3, c09, t3 MUL a3, c10, t4 SUB c03, t1, c03 SUB c04, t2, c04 SUB c11, t3, c11 SUB c12, t4, c12 MUL a4, c02, t1 MUL a4, c01, t2 MUL a4, c10, t3 MUL a4, c09, t4 ADD6 c03, t1, c03 ADD5 c04, t2, c04 ADD6 c11, t3, c11 ADD5 c12, t4, c12 LD a1, 6 * SIZE(AO) LD a2, 7 * SIZE(AO) MUL a2, c04, t1 MUL a2, c03, t2 MUL a2, c12, t3 MUL a2, c11, t4 MUL a1, c03, c03 MUL a1, c04, c04 MUL a1, c11, c11 MUL a1, c12, c12 ADD5 c03, t1, c03 ADD6 c04, t2, c04 ADD5 c11, t3, c11 ADD6 c12, t4, c12 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a2, c04, t3 MUL a2, c03, t4 MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 MUL a1, c04, c04 ADD5 c01, t1, c01 ADD6 c02, t2, c02 ADD5 c03, t3, c03 ADD6 c04, t4, c04 MUL a3, c01, t1 MUL a3, c02, t2 MUL a3, c03, t3 MUL a3, c04, t4 SUB c09, t1, c09 SUB c10, t2, c10 SUB c11, t3, c11 SUB c12, t4, c12 MUL a4, c02, t1 MUL a4, c01, t2 MUL a4, c04, t3 MUL a4, c03, t4 ADD6 c09, t1, c09 ADD5 c10, t2, c10 ADD6 c11, t3, c11 ADD5 c12, t4, c12 LD a1, 6 * SIZE(BO) LD a2, 7 * SIZE(BO) MUL a2, c10, t1 MUL a2, c09, t2 MUL a2, c12, t3 MUL a2, c11, t4 MUL a1, c09, c09 MUL a1, c10, c10 MUL a1, c11, c11 MUL a1, c12, c12 ADD5 c09, t1, c09 ADD6 c10, t2, c10 ADD5 c11, t3, c11 ADD6 c12, t4, c12 #endif #ifdef RT LD a1, 6 * SIZE(BO) LD a2, 7 * SIZE(BO) LD a3, 4 * SIZE(BO) LD a4, 5 * SIZE(BO) MUL a2, c10, t1 MUL a2, c09, t2 MUL a2, c12, t3 MUL a2, c11, t4 MUL a1, c09, c09 MUL a1, c10, c10 MUL a1, c11, c11 MUL a1, c12, c12 ADD5 c09, t1, c09 ADD6 c10, t2, c10 ADD5 c11, t3, c11 ADD6 c12, t4, c12 MUL a3, c09, t1 MUL a3, c10, t2 MUL a3, c11, t3 MUL a3, c12, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c03, t3, c03 SUB c04, t4, c04 MUL a4, c10, t1 MUL a4, c09, t2 MUL a4, c12, t3 MUL a4, c11, t4 ADD6 c01, t1, c01 ADD5 c02, t2, c02 ADD6 c03, t3, c03 ADD5 c04, t4, c04 LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a2, c04, t3 MUL a2, c03, t4 MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 MUL a1, c04, c04 ADD5 c01, t1, c01 ADD6 c02, t2, c02 ADD5 c03, t3, c03 ADD6 c04, t4, c04 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c02, 1 * SIZE(BO) ST c09, 2 * SIZE(BO) ST c10, 3 * SIZE(BO) ST c03, 4 * SIZE(BO) ST c04, 5 * SIZE(BO) ST c11, 6 * SIZE(BO) ST c12, 7 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c03, 2 * SIZE(AO) ST c04, 3 * SIZE(AO) ST c09, 4 * SIZE(AO) ST c10, 5 * SIZE(AO) ST c11, 6 * SIZE(AO) ST c12, 7 * SIZE(AO) #endif #ifdef LN lda C1, -4 * SIZE(C1) lda C2, -4 * SIZE(C2) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c03, 2 * SIZE(C1) ST c04, 3 * SIZE(C1) ST c09, 0 * SIZE(C2) ST c10, 1 * SIZE(C2) ST c11, 2 * SIZE(C2) ST c12, 3 * SIZE(C2) #ifndef LN lda C1, 4 * SIZE(C1) lda C2, 4 * SIZE(C2) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, ZBASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, ZBASE_SHIFT + 
1, TMP1 addq AO, TMP1, AO addq BO, TMP1, BO #endif #ifdef LT addq KK, 2, KK #endif #ifdef LN subq KK, 2, KK #endif fclr c01 fclr c05 lda I, -1(I) bgt I, $L11 .align 4 $L29: #ifdef LN sll K, ZBASE_SHIFT + 1, TMP1 addq B, TMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN addq KK, 2, KK #endif #ifdef RT subq KK, 2, KK #endif lda J, -1(J) bgt J, $L01 .align 4 $L30: and N, 1, J ble J, $L999 #ifdef RT sll K, ZBASE_SHIFT, TMP1 subq B, TMP1, B subq C, LDC, C1 subq C, LDC, C #else mov C, C1 addq C, LDC, C #endif #ifdef LN addq M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif and M, 1, I ble I, $L50 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c05 LD b3, 2 * SIZE(B) fclr c02 LD b4, 3 * SIZE(B) fclr c06 lda AO, 2 * SIZE(AO) lda BO, 2 * SIZE(B) lda L, -2(KK) ble KK, $L58 ble L, $L55 #else #ifdef LN sll K, ZBASE_SHIFT, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, ZBASE_SHIFT, TMP1 addq AORIG, TMP1, AO sll KK, ZBASE_SHIFT, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c05 LD b3, 2 * SIZE(BO) fclr c02 LD b4, 3 * SIZE(BO) fclr c06 lda AO, 2 * SIZE(AO) lda BO, 2 * SIZE(BO) lda L, -2(TMP1) ble TMP1, $L58 ble L, $L55 #endif .align 5 $L52: ADD1 c01, t1, c01 unop MUL a1, b1, t1 unop ADD3 c02, t2, c02 lda AO, 4 * SIZE(AO) MUL a2, b1, t2 LD b1, 2 * SIZE(BO) ADD4 c05, t3, c05 lda L, -2(L) MUL a1, b2, t3 LD a1, -2 * SIZE(AO) ADD2 c06, t4, c06 unop MUL a2, b2, t4 LD a2, -1 * SIZE(AO) ADD1 c01, t1, c01 LD b2, 3 * SIZE(BO) MUL a3, b3, t1 lda BO, 4 * SIZE(BO) ADD3 c02, t2, c02 unop MUL a4, b3, t2 LD b3, 0 * SIZE(BO) ADD4 c05, t3, c05 unop MUL a3, b4, t3 LD a3, 0 * SIZE(AO) ADD2 c06, t4, c06 MUL a4, b4, t4 LD b4, 1 * SIZE(BO) unop LD a4, 1 * SIZE(AO) unop unop bgt L, $L52 .align 4 $L55: ADD1 c01, t1, c01 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L57 #else blbs TMP1, $L57 #endif .align 4 ADD3 c02, t2, c02 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD4 c05, t3, c05 lda BO, 2 * SIZE(BO) MUL a1, b2, t3 LD a1, 0 * SIZE(AO) ADD2 c06, t4, c06 unop MUL a2, b2, t4 LD a2, 1 * SIZE(AO) ADD1 c01, t1, c01 LD b2, -1 * SIZE(BO) MUL a1, b1, t1 lda AO, 2 * SIZE(AO) .align 4 $L57: ADD3 c02, t2, c02 MUL a2, b1, t2 ADD4 c05, t3, c05 MUL a1, b2, t3 ADD2 c06, t4, c06 lda AO, 2 * SIZE(AO) MUL a2, b2, t4 lda BO, 2 * SIZE(BO) ADD1 c01, t1, c01 ADD3 c02, t2, c02 ADD4 c05, t3, c05 ADD2 c06, t4, c06 ADD c01, c06, c01 ADD c02, c05, c02 $L58: #if defined(LN) || defined(RT) subq KK, 1, TMP1 sll TMP1, ZBASE_SHIFT, TMP2 addq AORIG, TMP2, AO sll TMP1, ZBASE_SHIFT, TMP2 addq B, TMP2, BO #else lda AO, -2 * SIZE(AO) lda BO, -2 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) SUB a1, c01, c01 SUB a2, c02, c02 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a1, c01, c01 MUL a1, c02, c02 ADD5 c01, t1, c01 ADD6 c02, t2, c02 #endif #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a1, c01, c01 MUL a1, c02, c02 ADD5 c01, t1, c01 ADD6 c02, t2, c02 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c02, 1 * 
SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) #endif #ifdef LN lda C1, -2 * SIZE(C1) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) #ifndef LN lda C1, 2 * SIZE(C1) #endif #ifdef RT sll K, ZBASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, ZBASE_SHIFT, TMP2 addq AO, TMP2, AO sll TMP1, ZBASE_SHIFT, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 1, KK #endif #ifdef LN subq KK, 1, KK #endif .align 4 $L50: sra M, 1, I ble I, $L59 .align 4 $L41: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c05 LD b3, 2 * SIZE(B) fclr c02 LD b4, 3 * SIZE(B) fclr c06 lda BO, 2 * SIZE(B) fclr c03 lda AO, 4 * SIZE(AO) fclr c07 lda L, -2(KK) fclr c04 fclr c08 ble KK, $L48 ble L, $L45 #else #ifdef LN sll K, ZBASE_SHIFT + 1, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, ZBASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AO sll KK, ZBASE_SHIFT, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c05 LD b3, 2 * SIZE(BO) fclr c02 LD b4, 3 * SIZE(BO) fclr c06 lda BO, 2 * SIZE(BO) fclr c03 lda AO, 4 * SIZE(AO) fclr c07 lda L, -2(TMP1) fclr c04 fclr c08 ble TMP1, $L48 ble L, $L45 #endif .align 5 $L42: ADD4 c05, t1, c05 unop MUL a1, b1, t1 unop ADD2 c06, t2, c06 lda L, -2(L) MUL a2, b1, t2 unop ADD4 c07, t3, c07 unop MUL a3, b1, t3 unop ADD2 c08, t4, c08 unop MUL a4, b1, t4 LD b1, 2 * SIZE(BO) ADD1 c01, t1, c01 unop MUL a1, b2, t1 LD a1, 0 * SIZE(AO) ADD3 c02, t2, c02 lda BO, 4 * SIZE(BO) MUL a2, b2, t2 LD a2, 1 * SIZE(AO) ADD1 c03, t3, c03 unop MUL a3, b2, t3 LD a3, 2 * SIZE(AO) ADD3 c04, t4, c04 unop MUL a4, b2, t4 LD a5, 3 * SIZE(AO) ADD4 c05, t1, c05 unop MUL a1, b3, t1 LD b2, -1 * SIZE(BO) ADD2 c06, t2, c06 unop MUL a2, b3, t2 unop ADD4 c07, t3, c07 unop MUL a3, b3, t3 lda AO, 8 * SIZE(AO) ADD2 c08, t4, c08 unop MUL a5, b3, t4 LD b3, 0 * SIZE(BO) ADD1 c01, t1, c01 unop MUL a1, b4, t1 LD a1, -4 * SIZE(AO) ADD3 c02, t2, c02 unop MUL a2, b4, t2 LD a2, -3 * SIZE(AO) ADD1 c03, t3, c03 LD a4, -1 * SIZE(AO) MUL a3, b4, t3 LD a3, -2 * SIZE(AO) ADD3 c04, t4, c04 MUL a5, b4, t4 LD b4, 1 * SIZE(BO) bgt L, $L42 .align 4 $L45: ADD4 c05, t1, c05 MUL b1, a1, t1 #if defined(LT) || defined(RN) blbs KK, $L47 #else blbs TMP1, $L47 #endif .align 4 ADD2 c06, t2, c06 MUL a2, b1, t2 ADD4 c07, t3, c07 MUL a3, b1, t3 ADD2 c08, t4, c08 unop MUL a4, b1, t4 LD b1, 0 * SIZE(BO) ADD1 c01, t1, c01 unop MUL a1, b2, t1 LD a1, 0 * SIZE(AO) ADD3 c02, t2, c02 unop MUL a2, b2, t2 LD a2, 1 * SIZE(AO) ADD1 c03, t3, c03 unop MUL a3, b2, t3 LD a3, 2 * SIZE(AO) ADD3 c04, t4, c04 MUL a4, b2, t4 LD a4, 3 * SIZE(AO) lda AO, 4 * SIZE(AO) ADD4 c05, t1, c05 LD b2, 1 * SIZE(BO) MUL a1, b1, t1 lda BO, 2 * SIZE(BO) .align 4 $L47: ADD2 c06, t2, c06 MUL a2, b1, t2 ADD4 c07, t3, c07 MUL a3, b1, t3 ADD2 c08, t4, c08 MUL a4, b1, t4 ADD1 c01, t1, c01 MUL a1, b2, t1 ADD3 c02, t2, c02 MUL a2, b2, t2 ADD1 c03, t3, c03 MUL a3, b2, t3 ADD3 c04, t4, c04 lda AO, 4 * SIZE(AO) MUL a4, b2, t4 lda BO, 2 * SIZE(BO) ADD4 c05, t1, c05 ADD2 c06, t2, c06 ADD4 c07, t3, c07 ADD2 c08, t4, c08 ADD c01, c06, c01 ADD c02, c05, c02 ADD c03, c08, c03 ADD c04, c07, c04 $L48: #if defined(LN) || defined(RT) #ifdef LN subq KK, 2, TMP1 #else subq KK, 1, TMP1 #endif sll TMP1, ZBASE_SHIFT + 1, TMP2 addq AORIG, TMP2, AO sll TMP1, ZBASE_SHIFT, TMP2 addq 
B, TMP2, BO #else lda AO, -4 * SIZE(AO) lda BO, -2 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 #endif #ifdef LN LD a1, 6 * SIZE(AO) LD a2, 7 * SIZE(AO) LD a3, 4 * SIZE(AO) LD a4, 5 * SIZE(AO) MUL a2, c04, t1 MUL a2, c03, t2 MUL a1, c03, c03 MUL a1, c04, c04 ADD5 c03, t1, c03 ADD6 c04, t2, c04 MUL a3, c03, t1 MUL a3, c04, t2 SUB c01, t1, c01 SUB c02, t2, c02 MUL a4, c04, t1 MUL a4, c03, t2 ADD6 c01, t1, c01 ADD5 c02, t2, c02 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a1, c01, c01 MUL a1, c02, c02 ADD5 c01, t1, c01 ADD6 c02, t2, c02 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a1, c01, c01 MUL a1, c02, c02 ADD5 c01, t1, c01 ADD6 c02, t2, c02 MUL a3, c01, t1 MUL a3, c02, t2 SUB c03, t1, c03 SUB c04, t2, c04 MUL a4, c02, t1 MUL a4, c01, t2 ADD6 c03, t1, c03 ADD5 c04, t2, c04 LD a1, 6 * SIZE(AO) LD a2, 7 * SIZE(AO) MUL a2, c04, t1 MUL a2, c03, t2 MUL a1, c03, c03 MUL a1, c04, c04 ADD5 c03, t1, c03 ADD6 c04, t2, c04 #endif #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a2, c04, t3 MUL a2, c03, t4 MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 MUL a1, c04, c04 ADD5 c01, t1, c01 ADD6 c02, t2, c02 ADD5 c03, t3, c03 ADD6 c04, t4, c04 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c02, 1 * SIZE(BO) ST c03, 2 * SIZE(BO) ST c04, 3 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c03, 2 * SIZE(AO) ST c04, 3 * SIZE(AO) #endif #ifdef LN lda C1, -4 * SIZE(C1) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c03, 2 * SIZE(C1) ST c04, 3 * SIZE(C1) #ifndef LN lda C1, 4 * SIZE(C1) #endif #ifdef RT sll K, ZBASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, ZBASE_SHIFT + 1, TMP2 addq AO, TMP2, AO sll TMP1, ZBASE_SHIFT, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 2, KK #endif #ifdef LN subq KK, 2, KK #endif lda I, -1(I) bgt I, $L41 .align 4 $L59: #ifdef LN sll K, ZBASE_SHIFT, TMP1 addq B, TMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN addq KK, 1, KK #endif #ifdef RT subq KK, 1, KK #endif .align 4 $L999: ldt $f2, 0($sp) ldt $f3, 8($sp) ldt $f4, 16($sp) ldt $f5, 24($sp) ldt $f6, 32($sp) ldt $f7, 40($sp) ldt $f8, 48($sp) ldt $f9, 56($sp) clr $0 lda $sp, STACKSIZE($sp) ret .ident VERSION .end CNAME OpenBLAS-0.2.20/kernel/alpha/ztrsm_kernel_2x2_LT.S000066400000000000000000001001571313527062700214520ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #if !defined(EV4) && !defined(EV5) && !defined(EV6) #error "Architecture is not specified." #endif #ifdef EV6 #define PREFETCHSIZE 56 #define UNOP unop #endif #ifdef EV5 #define PREFETCHSIZE 48 #define UNOP #endif #ifdef EV4 #define UNOP #endif .set noat .set noreorder .arch ev6 .text .align 5 .globl CNAME .ent CNAME #define STACKSIZE 80 #define M $16 #define N $17 #define K $18 #define A $21 #define B $22 #define C $20 #define LDC $23 #define C1 $19 #define C2 $24 #define AO $at #define BO $5 #define I $6 #define J $7 #define L $8 #define a1 $f16 #define a2 $f17 #define a3 $f18 #define a4 $f19 #define b1 $f20 #define b2 $f21 #define b3 $f22 #define b4 $f23 #define t1 $f24 #define t2 $f25 #define t3 $f26 #define t4 $f27 #define a5 $f28 #define a6 $f30 #define b5 $f29 #define alpha_i $f29 #define alpha_r $f30 #define c01 $f0 #define c02 $f1 #define c03 $f2 #define c04 $f3 #define c05 $f4 #define c06 $f5 #define c07 $f6 #define c08 $f7 #define c09 $f8 #define c10 $f9 #define c11 $f10 #define c12 $f11 #define c13 $f12 #define c14 $f13 #define c15 $f14 #define c16 $f15 #define TMP1 $0 #define TMP2 $1 #define KK $2 #define AORIG $3 #define OFFSET $4 #if defined(LN) || defined(LT) #ifndef CONJ #define ADD1 ADD #define ADD2 SUB #define ADD3 ADD #define ADD4 ADD #define ADD5 SUB #define ADD6 ADD #else #define ADD1 ADD #define ADD2 ADD #define ADD3 SUB #define ADD4 ADD #define ADD5 ADD #define ADD6 SUB #endif #else #ifndef CONJ #define ADD1 ADD #define ADD2 SUB #define ADD3 ADD #define ADD4 ADD #define ADD5 SUB #define ADD6 ADD #else #define ADD1 ADD #define ADD2 ADD #define ADD3 ADD #define ADD4 SUB #define ADD5 ADD #define ADD6 SUB #endif #endif CNAME: .frame $sp, STACKSIZE, $26, 0 #ifdef PROFILE ldgp $gp, 0($27) lda $at, _mcount jsr $at, ($at), _mcount #endif #ifndef PROFILE .prologue 0 #else .prologue 1 #endif lda $sp, -STACKSIZE($sp) ldq B, 0 + STACKSIZE($sp) ldq C, 8 + STACKSIZE($sp) ldq LDC, 16 + STACKSIZE($sp) ldq OFFSET, 24 + STACKSIZE($sp) sll LDC, ZBASE_SHIFT, LDC stt $f2, 0($sp) stt $f3, 8($sp) stt $f4, 16($sp) stt $f5, 24($sp) stt $f6, 32($sp) 
stt $f7, 40($sp) stt $f8, 48($sp) stt $f9, 56($sp) cmple M, 0, $0 cmple N, 0, $1 cmple K, 0, $2 or $0, $1, $0 or $0, $2, $0 bne $0, $L999 #ifdef LN addq M, M, TMP2 mulq TMP2, K, TMP1 SXADDQ TMP1, A, A SXADDQ TMP2, C, C #endif #ifdef RN negq OFFSET, KK #endif #ifdef RT mulq N, K, TMP1 addq TMP1, TMP1, TMP1 SXADDQ TMP1, B, B mulq N, LDC, TMP1 addq TMP1, C, C subq N, OFFSET, KK #endif sra N, 1, J ble J, $L30 .align 4 $L01: #ifdef RT sll K, ZBASE_SHIFT + 1, TMP1 subq B, TMP1, B subq C, LDC, C2 subq C2, LDC, C1 subq C2, LDC, C #else mov C, C1 addq C, LDC, C2 addq C2, LDC, C #endif #ifdef LN addq M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif sra M, 1, I fclr t1 fclr t2 fclr t3 fclr t4 fclr c01 fclr c05 ble I, $L20 .align 4 $L11: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(B) fclr c10 LD b2, 1 * SIZE(B) fclr c14 LD b3, 2 * SIZE(B) fclr c03 LD b4, 3 * SIZE(B) fclr c07 lda BO, 4 * SIZE(B) fclr c11 lda AO, 4 * SIZE(AO) fclr c15 lds $f31, 4 * SIZE(C1) fclr c04 lda L, -2(KK) fclr c08 lds $f31, 4 * SIZE(C2) fclr c12 fclr c16 ble KK, $L18 ble L, $L15 #else #ifdef LN sll K, ZBASE_SHIFT + 1, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, ZBASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AO addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(BO) fclr c10 LD b2, 1 * SIZE(BO) fclr c14 LD b3, 2 * SIZE(BO) fclr c03 LD b4, 3 * SIZE(BO) fclr c07 lda BO, 4 * SIZE(BO) fclr c11 lda AO, 4 * SIZE(AO) fclr c15 lds $f31, 4 * SIZE(C1) fclr c04 lda L, -2(TMP1) fclr c08 lds $f31, 4 * SIZE(C2) fclr c12 fclr c16 ble TMP1, $L18 ble L, $L15 #endif .align 5 $L12: /* 1 */ ADD1 c11, t1, c11 #ifndef EV4 ldq $31, PREFETCHSIZE * SIZE(AO) #else unop #endif MUL b1, a1, t1 #ifndef EV4 ldl $31, PREFETCHSIZE * SIZE(BO) #else unop #endif ADD3 c12, t2, c12 unop MUL b1, a2, t2 unop ADD2 c16, t3, c16 unop MUL b2, a2, t3 LD a5, 0 * SIZE(AO) ADD4 c15, t4, c15 unop MUL b2, a1, t4 LD b5, 0 * SIZE(BO) /* 2 */ ADD1 c01, t1, c01 UNOP MUL b1, a3, t1 UNOP ADD3 c02, t2, c02 UNOP MUL b1, a4, t2 UNOP ADD2 c06, t3, c06 unop MUL b2, a4, t3 unop ADD4 c05, t4, c05 unop MUL b4, a1, t4 unop /* 3 */ ADD1 c03, t1, c03 unop MUL b3, a1, t1 unop ADD3 c04, t2, c04 unop MUL b3, a2, t2 unop ADD2 c08, t3, c08 unop MUL b4, a2, t3 LD a2, 1 * SIZE(AO) ADD4 c13, t4, c13 unop MUL b2, a3, t4 LD b2, 1 * SIZE(BO) /* 4 */ ADD1 c09, t1, c09 unop MUL b3, a3, t1 LD a6, 2 * SIZE(AO) ADD3 c10, t2, c10 unop MUL b3, a4, t2 LD b3, 2 * SIZE(BO) ADD2 c14, t3, c14 unop MUL b4, a4, t3 LD a4, 3 * SIZE(AO) ADD4 c07, t4, c07 unop MUL b4, a3, t4 LD b4, 3 * SIZE(BO) /* 5 */ ADD1 c11, t1, c11 unop MUL b5, a5, t1 LD a1, 4 * SIZE(AO) ADD3 c12, t2, c12 lda L, -2(L) MUL b5, a2, t2 LD b1, 4 * SIZE(BO) ADD2 c16, t3, c16 unop MUL b2, a2, t3 unop ADD4 c15, t4, c15 unop MUL b2, a5, t4 unop /* 6 */ ADD1 c01, t1, c01 unop MUL b5, a6, t1 unop ADD3 c02, t2, c02 unop MUL b5, a4, t2 unop ADD2 c06, t3, c06 unop MUL b2, a4, t3 unop ADD4 c05, t4, c05 unop MUL b4, a5, t4 unop /* 7 */ ADD1 c03, t1, c03 lda AO, 8 * SIZE(AO) MUL b3, a5, t1 unop ADD3 c04, t2, c04 lda BO, 8 * SIZE(BO) MUL b3, a2, t2 unop ADD2 c08, t3, c08 unop MUL b4, a2, t3 LD a2, -3 * SIZE(AO) ADD4 c13, t4, c13 unop MUL b2, a6, t4 LD b2, -3 * SIZE(BO) /* 8 */ ADD1 c09, t1, c09 unop MUL b3, a6, t1 LD a3, -2 * SIZE(AO) ADD3 c10, t2, c10 unop MUL b3, a4, t2 LD 
b3, -2 * SIZE(BO) ADD2 c14, t3, c14 unop MUL b4, a4, t3 LD a4, -1 * SIZE(AO) ADD4 c07, t4, c07 MUL b4, a6, t4 LD b4, -1 * SIZE(BO) bgt L, $L12 .align 4 $L15: ADD1 c11, t1, c11 unop MUL b1, a1, t1 #if defined(LT) || defined(RN) blbs KK, $L17 #else blbs TMP1, $L17 #endif .align 4 ADD3 c12, t2, c12 MUL b1, a2, t2 ADD2 c16, t3, c16 MUL b2, a2, t3 ADD4 c15, t4, c15 MUL b2, a1, t4 ADD1 c01, t1, c01 MUL b1, a3, t1 ADD3 c02, t2, c02 unop MUL b1, a4, t2 LD b1, 0 * SIZE(BO) ADD2 c06, t3, c06 MUL b2, a4, t3 ADD4 c05, t4, c05 MUL b4, a1, t4 ADD1 c03, t1, c03 unop MUL b3, a1, t1 LD a1, 0 * SIZE(AO) ADD3 c04, t2, c04 unop MUL b3, a2, t2 unop ADD2 c08, t3, c08 unop MUL b4, a2, t3 LD a2, 1 * SIZE(AO) ADD4 c13, t4, c13 unop MUL b2, a3, t4 LD b2, 1 * SIZE(BO) ADD1 c09, t1, c09 unop MUL b3, a3, t1 lda AO, 4 * SIZE(AO) ADD3 c10, t2, c10 unop MUL b3, a4, t2 LD b3, 2 * SIZE(BO) ADD2 c14, t3, c14 unop MUL b4, a4, t3 LD a4, -1 * SIZE(AO) ADD4 c07, t4, c07 unop MUL b4, a3, t4 LD a3, -2 * SIZE(AO) ADD1 c11, t1, c11 LD b4, 3 * SIZE(BO) MUL b1, a1, t1 lda BO, 4 * SIZE(BO) .align 4 $L17: ADD3 c12, t2, c12 MUL b1, a2, t2 ADD2 c16, t3, c16 MUL b2, a2, t3 ADD4 c15, t4, c15 MUL b2, a1, t4 ADD1 c01, t1, c01 MUL b1, a3, t1 ADD3 c02, t2, c02 MUL b1, a4, t2 ADD2 c06, t3, c06 MUL b2, a4, t3 ADD4 c05, t4, c05 MUL b4, a1, t4 ADD1 c03, t1, c03 MUL b3, a1, t1 ADD3 c04, t2, c04 MUL b3, a2, t2 ADD2 c08, t3, c08 MUL b4, a2, t3 ADD4 c13, t4, c13 MUL b2, a3, t4 ADD1 c09, t1, c09 MUL b3, a3, t1 ADD3 c10, t2, c10 MUL b3, a4, t2 ADD2 c14, t3, c14 MUL b4, a4, t3 ADD4 c07, t4, c07 lda AO, 4 * SIZE(AO) MUL b4, a3, t4 lda BO, 4 * SIZE(BO) ADD1 c11, t1, c11 ADD3 c12, t2, c12 ADD2 c16, t3, c16 ADD4 c15, t4, c15 ADD c01, c06, c01 ADD c02, c05, c02 ADD c03, c08, c03 ADD c04, c07, c04 ADD c09, c14, c09 ADD c10, c13, c10 ADD c11, c16, c11 ADD c12, c15, c12 .align 4 $L18: #if defined(LN) || defined(RT) #ifdef LN subq KK, 2, TMP1 #else subq KK, 2, TMP1 #endif sll TMP1, ZBASE_SHIFT + 1, TMP2 addq AORIG, TMP2, AO sll TMP1, ZBASE_SHIFT + 1, TMP2 addq B, TMP2, BO #else lda AO, -4 * SIZE(AO) lda BO, -4 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c09, c09 SUB a4, c10, c10 SUB b1, c03, c03 SUB b2, c04, c04 SUB b3, c11, c11 SUB b4, c12, c12 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) LD b4, 7 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 SUB b1, c09, c09 SUB b2, c10, c10 SUB b3, c11, c11 SUB b4, c12, c12 #endif #ifdef LN LD a1, 6 * SIZE(AO) LD a2, 7 * SIZE(AO) LD a3, 4 * SIZE(AO) LD a4, 5 * SIZE(AO) MUL a2, c04, t1 MUL a2, c03, t2 MUL a2, c12, t3 MUL a2, c11, t4 MUL a1, c03, c03 MUL a1, c04, c04 MUL a1, c11, c11 MUL a1, c12, c12 ADD5 c03, t1, c03 ADD6 c04, t2, c04 ADD5 c11, t3, c11 ADD6 c12, t4, c12 MUL a3, c03, t1 MUL a3, c04, t2 MUL a3, c11, t3 MUL a3, c12, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c09, t3, c09 SUB c10, t4, c10 MUL a4, c04, t1 MUL a4, c03, t2 MUL a4, c12, t3 MUL a4, c11, t4 ADD6 c01, t1, c01 ADD5 c02, t2, c02 ADD6 c09, t3, c09 ADD5 c10, t4, c10 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a2, c10, t3 MUL a2, c09, t4 MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c09, c09 MUL a1, c10, c10 ADD5 c01, t1, c01 ADD6 c02, t2, c02 ADD5 c09, t3, c09 ADD6 c10, t4, c10 #endif #ifdef LT LD 
a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a2, c10, t3 MUL a2, c09, t4 MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c09, c09 MUL a1, c10, c10 ADD5 c01, t1, c01 ADD6 c02, t2, c02 ADD5 c09, t3, c09 ADD6 c10, t4, c10 MUL a3, c01, t1 MUL a3, c02, t2 MUL a3, c09, t3 MUL a3, c10, t4 SUB c03, t1, c03 SUB c04, t2, c04 SUB c11, t3, c11 SUB c12, t4, c12 MUL a4, c02, t1 MUL a4, c01, t2 MUL a4, c10, t3 MUL a4, c09, t4 ADD6 c03, t1, c03 ADD5 c04, t2, c04 ADD6 c11, t3, c11 ADD5 c12, t4, c12 LD a1, 6 * SIZE(AO) LD a2, 7 * SIZE(AO) MUL a2, c04, t1 MUL a2, c03, t2 MUL a2, c12, t3 MUL a2, c11, t4 MUL a1, c03, c03 MUL a1, c04, c04 MUL a1, c11, c11 MUL a1, c12, c12 ADD5 c03, t1, c03 ADD6 c04, t2, c04 ADD5 c11, t3, c11 ADD6 c12, t4, c12 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a2, c04, t3 MUL a2, c03, t4 MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 MUL a1, c04, c04 ADD5 c01, t1, c01 ADD6 c02, t2, c02 ADD5 c03, t3, c03 ADD6 c04, t4, c04 MUL a3, c01, t1 MUL a3, c02, t2 MUL a3, c03, t3 MUL a3, c04, t4 SUB c09, t1, c09 SUB c10, t2, c10 SUB c11, t3, c11 SUB c12, t4, c12 MUL a4, c02, t1 MUL a4, c01, t2 MUL a4, c04, t3 MUL a4, c03, t4 ADD6 c09, t1, c09 ADD5 c10, t2, c10 ADD6 c11, t3, c11 ADD5 c12, t4, c12 LD a1, 6 * SIZE(BO) LD a2, 7 * SIZE(BO) MUL a2, c10, t1 MUL a2, c09, t2 MUL a2, c12, t3 MUL a2, c11, t4 MUL a1, c09, c09 MUL a1, c10, c10 MUL a1, c11, c11 MUL a1, c12, c12 ADD5 c09, t1, c09 ADD6 c10, t2, c10 ADD5 c11, t3, c11 ADD6 c12, t4, c12 #endif #ifdef RT LD a1, 6 * SIZE(BO) LD a2, 7 * SIZE(BO) LD a3, 4 * SIZE(BO) LD a4, 5 * SIZE(BO) MUL a2, c10, t1 MUL a2, c09, t2 MUL a2, c12, t3 MUL a2, c11, t4 MUL a1, c09, c09 MUL a1, c10, c10 MUL a1, c11, c11 MUL a1, c12, c12 ADD5 c09, t1, c09 ADD6 c10, t2, c10 ADD5 c11, t3, c11 ADD6 c12, t4, c12 MUL a3, c09, t1 MUL a3, c10, t2 MUL a3, c11, t3 MUL a3, c12, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c03, t3, c03 SUB c04, t4, c04 MUL a4, c10, t1 MUL a4, c09, t2 MUL a4, c12, t3 MUL a4, c11, t4 ADD6 c01, t1, c01 ADD5 c02, t2, c02 ADD6 c03, t3, c03 ADD5 c04, t4, c04 LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a2, c04, t3 MUL a2, c03, t4 MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 MUL a1, c04, c04 ADD5 c01, t1, c01 ADD6 c02, t2, c02 ADD5 c03, t3, c03 ADD6 c04, t4, c04 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c02, 1 * SIZE(BO) ST c09, 2 * SIZE(BO) ST c10, 3 * SIZE(BO) ST c03, 4 * SIZE(BO) ST c04, 5 * SIZE(BO) ST c11, 6 * SIZE(BO) ST c12, 7 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c03, 2 * SIZE(AO) ST c04, 3 * SIZE(AO) ST c09, 4 * SIZE(AO) ST c10, 5 * SIZE(AO) ST c11, 6 * SIZE(AO) ST c12, 7 * SIZE(AO) #endif #ifdef LN lda C1, -4 * SIZE(C1) lda C2, -4 * SIZE(C2) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c03, 2 * SIZE(C1) ST c04, 3 * SIZE(C1) ST c09, 0 * SIZE(C2) ST c10, 1 * SIZE(C2) ST c11, 2 * SIZE(C2) ST c12, 3 * SIZE(C2) #ifndef LN lda C1, 4 * SIZE(C1) lda C2, 4 * SIZE(C2) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, ZBASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, ZBASE_SHIFT + 1, TMP1 addq AO, TMP1, AO addq BO, TMP1, BO #endif #ifdef LT addq KK, 2, KK #endif #ifdef LN subq KK, 2, KK #endif fclr c01 fclr c05 lda I, -1(I) bgt I, $L11 .align 4 $L20: and M, 1, I ble I, $L29 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 
LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(B) fclr c10 LD b2, 1 * SIZE(B) fclr c14 LD b3, 2 * SIZE(B) lda AO, 2 * SIZE(AO) LD b4, 3 * SIZE(B) lda BO, 4 * SIZE(B) lda L, -2(KK) ble KK, $L28 ble L, $L25 #else #ifdef LN sll K, ZBASE_SHIFT + 0, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, ZBASE_SHIFT + 0, TMP1 addq AORIG, TMP1, AO sll KK, ZBASE_SHIFT + 1, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(BO) fclr c10 LD b2, 1 * SIZE(BO) fclr c14 LD b3, 2 * SIZE(BO) lda AO, 2 * SIZE(AO) LD b4, 3 * SIZE(BO) lda BO, 4 * SIZE(BO) lda L, -2(TMP1) ble TMP1, $L28 ble L, $L25 #endif .align 5 $L22: ADD1 c09, t1, c09 unop MUL a1, b1, t1 unop ADD3 c10, t2, c10 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD4 c13, t3, c13 unop MUL a1, b2, t3 lda BO, 8 * SIZE(BO) ADD2 c14, t4, c14 unop MUL a2, b2, t4 LD b2, -7 * SIZE(BO) ADD1 c01, t1, c01 unop MUL a1, b3, t1 unop ADD3 c02, t2, c02 unop MUL a2, b3, t2 LD b3, -6 * SIZE(BO) ADD4 c05, t3, c05 unop MUL a1, b4, t3 LD a1, 2 * SIZE(AO) ADD2 c06, t4, c06 MUL a2, b4, t4 LD b5, -5 * SIZE(BO) ADD1 c09, t1, c09 unop MUL a3, b1, t1 LD a2, 3 * SIZE(AO) ADD3 c10, t2, c10 unop MUL a4, b1, t2 LD b1, -4 * SIZE(BO) ADD4 c13, t3, c13 unop MUL a3, b2, t3 lda AO, 4 * SIZE(AO) ADD2 c14, t4, c14 MUL a4, b2, t4 LD b2, -3 * SIZE(BO) ADD1 c01, t1, c01 lda L, -2(L) MUL a3, b3, t1 LD b4, -1 * SIZE(BO) ADD3 c02, t2, c02 unop MUL a4, b3, t2 LD b3, -2 * SIZE(BO) ADD4 c05, t3, c05 unop MUL a3, b5, t3 LD a3, 0 * SIZE(AO) ADD2 c06, t4, c06 MUL a4, b5, t4 LD a4, 1 * SIZE(AO) bgt L, $L22 .align 4 $L25: ADD1 c09, t1, c09 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L27 #else blbs TMP1, $L27 #endif .align 4 ADD3 c10, t2, c10 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD4 c13, t3, c13 unop MUL a1, b2, t3 unop ADD2 c14, t4, c14 unop MUL a2, b2, t4 LD b2, 1 * SIZE(BO) ADD1 c01, t1, c01 unop MUL a1, b3, t1 lda AO, 2 * SIZE(AO) ADD3 c02, t2, c02 unop MUL a2, b3, t2 LD b3, 2 * SIZE(BO) ADD4 c05, t3, c05 unop MUL a1, b4, t3 LD a1, -2 * SIZE(AO) ADD2 c06, t4, c06 unop MUL a2, b4, t4 LD a2, -1 * SIZE(AO) ADD1 c09, t1, c09 LD b4, 3 * SIZE(BO) MUL a1, b1, t1 lda BO, 4 * SIZE(BO) .align 4 $L27: ADD3 c10, t2, c10 MUL a2, b1, t2 ADD4 c13, t3, c13 MUL a1, b2, t3 ADD2 c14, t4, c14 MUL a2, b2, t4 ADD1 c01, t1, c01 MUL a1, b3, t1 ADD3 c02, t2, c02 MUL a2, b3, t2 ADD4 c05, t3, c05 MUL a1, b4, t3 ADD2 c06, t4, c06 lda AO, 2 * SIZE(AO) MUL a2, b4, t4 lda BO, 4 * SIZE(BO) ADD1 c09, t1, c09 ADD3 c10, t2, c10 ADD4 c13, t3, c13 ADD2 c14, t4, c14 ADD c01, c06, c01 ADD c02, c05, c02 ADD c09, c14, c09 ADD c10, c13, c10 .align 4 $L28: #if defined(LN) || defined(RT) #ifdef LN subq KK, 1, TMP1 #else subq KK, 2, TMP1 #endif sll TMP1, ZBASE_SHIFT + 0, TMP2 addq AORIG, TMP2, AO sll TMP1, ZBASE_SHIFT + 1, TMP2 addq B, TMP2, BO #else lda AO, -2 * SIZE(AO) lda BO, -4 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c09, c09 SUB a4, c10, c10 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c09, c09 SUB a4, c10, c10 #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a2, c10, t3 MUL a2, c09, t4 MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c09, c09 MUL a1, c10, c10 ADD5 c01, t1, c01 ADD6 c02, t2, c02 ADD5 c09, t3, c09 ADD6 
c10, t4, c10 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a1, c01, c01 MUL a1, c02, c02 ADD5 c01, t1, c01 ADD6 c02, t2, c02 MUL a3, c01, t1 MUL a3, c02, t2 SUB c09, t1, c09 SUB c10, t2, c10 MUL a4, c02, t1 MUL a4, c01, t2 ADD6 c09, t1, c09 ADD5 c10, t2, c10 LD a1, 6 * SIZE(BO) LD a2, 7 * SIZE(BO) MUL a2, c10, t1 MUL a2, c09, t2 MUL a1, c09, c09 MUL a1, c10, c10 ADD5 c09, t1, c09 ADD6 c10, t2, c10 #endif #ifdef RT LD a1, 6 * SIZE(BO) LD a2, 7 * SIZE(BO) LD a3, 4 * SIZE(BO) LD a4, 5 * SIZE(BO) MUL a2, c10, t1 MUL a2, c09, t2 MUL a1, c09, c09 MUL a1, c10, c10 ADD5 c09, t1, c09 ADD6 c10, t2, c10 MUL a3, c09, t1 MUL a3, c10, t2 SUB c01, t1, c01 SUB c02, t2, c02 MUL a4, c10, t1 MUL a4, c09, t2 ADD6 c01, t1, c01 ADD5 c02, t2, c02 LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a1, c01, c01 MUL a1, c02, c02 ADD5 c01, t1, c01 ADD6 c02, t2, c02 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c02, 1 * SIZE(BO) ST c09, 2 * SIZE(BO) ST c10, 3 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c09, 2 * SIZE(AO) ST c10, 3 * SIZE(AO) #endif #ifdef LN lda C1, -2 * SIZE(C1) lda C2, -2 * SIZE(C2) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c09, 0 * SIZE(C2) ST c10, 1 * SIZE(C2) #ifndef LN lda C1, 2 * SIZE(C1) lda C2, 2 * SIZE(C2) #endif #ifdef RT sll K, ZBASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, ZBASE_SHIFT + 0, TMP2 addq AO, TMP2, AO sll TMP1, ZBASE_SHIFT + 1, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 1, KK #endif #ifdef LN subq KK, 1, KK #endif .align 4 $L29: #ifdef LN sll K, ZBASE_SHIFT + 1, TMP1 addq B, TMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN addq KK, 2, KK #endif #ifdef RT subq KK, 2, KK #endif lda J, -1(J) bgt J, $L01 .align 4 $L30: and N, 1, J ble J, $L999 #ifdef RT sll K, ZBASE_SHIFT, TMP1 subq B, TMP1, B subq C, LDC, C1 subq C, LDC, C #else mov C, C1 addq C, LDC, C #endif #ifdef LN addq M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif sra M, 1, I ble I, $L50 .align 4 $L41: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c05 LD b3, 2 * SIZE(B) fclr c02 LD b4, 3 * SIZE(B) fclr c06 lda BO, 2 * SIZE(B) fclr c03 lda AO, 4 * SIZE(AO) fclr c07 lda L, -2(KK) fclr c04 fclr c08 ble KK, $L48 ble L, $L45 #else #ifdef LN sll K, ZBASE_SHIFT + 1, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, ZBASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AO sll KK, ZBASE_SHIFT, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c05 LD b3, 2 * SIZE(BO) fclr c02 LD b4, 3 * SIZE(BO) fclr c06 lda BO, 2 * SIZE(BO) fclr c03 lda AO, 4 * SIZE(AO) fclr c07 lda L, -2(TMP1) fclr c04 fclr c08 ble TMP1, $L48 ble L, $L45 #endif .align 5 $L42: ADD4 c05, t1, c05 unop MUL a1, b1, t1 unop ADD2 c06, t2, c06 lda L, -2(L) MUL a2, b1, t2 unop ADD4 c07, t3, c07 unop MUL a3, b1, t3 unop ADD2 c08, t4, c08 unop MUL a4, b1, t4 LD b1, 2 * SIZE(BO) ADD1 c01, t1, c01 unop MUL a1, b2, t1 LD a1, 0 * SIZE(AO) ADD3 c02, t2, c02 lda BO, 4 * SIZE(BO) MUL a2, b2, t2 LD a2, 1 * SIZE(AO) ADD1 c03, t3, c03 unop MUL a3, b2, t3 LD a3, 2 * SIZE(AO) ADD3 c04, t4, c04 
unop MUL a4, b2, t4 LD a5, 3 * SIZE(AO) ADD4 c05, t1, c05 unop MUL a1, b3, t1 LD b2, -1 * SIZE(BO) ADD2 c06, t2, c06 unop MUL a2, b3, t2 unop ADD4 c07, t3, c07 unop MUL a3, b3, t3 lda AO, 8 * SIZE(AO) ADD2 c08, t4, c08 unop MUL a5, b3, t4 LD b3, 0 * SIZE(BO) ADD1 c01, t1, c01 unop MUL a1, b4, t1 LD a1, -4 * SIZE(AO) ADD3 c02, t2, c02 unop MUL a2, b4, t2 LD a2, -3 * SIZE(AO) ADD1 c03, t3, c03 LD a4, -1 * SIZE(AO) MUL a3, b4, t3 LD a3, -2 * SIZE(AO) ADD3 c04, t4, c04 MUL a5, b4, t4 LD b4, 1 * SIZE(BO) bgt L, $L42 .align 4 $L45: ADD4 c05, t1, c05 MUL b1, a1, t1 #if defined(LT) || defined(RN) blbs KK, $L47 #else blbs TMP1, $L47 #endif .align 4 ADD2 c06, t2, c06 MUL a2, b1, t2 ADD4 c07, t3, c07 MUL a3, b1, t3 ADD2 c08, t4, c08 unop MUL a4, b1, t4 LD b1, 0 * SIZE(BO) ADD1 c01, t1, c01 unop MUL a1, b2, t1 LD a1, 0 * SIZE(AO) ADD3 c02, t2, c02 unop MUL a2, b2, t2 LD a2, 1 * SIZE(AO) ADD1 c03, t3, c03 unop MUL a3, b2, t3 LD a3, 2 * SIZE(AO) ADD3 c04, t4, c04 MUL a4, b2, t4 LD a4, 3 * SIZE(AO) lda AO, 4 * SIZE(AO) ADD4 c05, t1, c05 LD b2, 1 * SIZE(BO) MUL a1, b1, t1 lda BO, 2 * SIZE(BO) .align 4 $L47: ADD2 c06, t2, c06 MUL a2, b1, t2 ADD4 c07, t3, c07 MUL a3, b1, t3 ADD2 c08, t4, c08 MUL a4, b1, t4 ADD1 c01, t1, c01 MUL a1, b2, t1 ADD3 c02, t2, c02 MUL a2, b2, t2 ADD1 c03, t3, c03 MUL a3, b2, t3 ADD3 c04, t4, c04 lda AO, 4 * SIZE(AO) MUL a4, b2, t4 lda BO, 2 * SIZE(BO) ADD4 c05, t1, c05 ADD2 c06, t2, c06 ADD4 c07, t3, c07 ADD2 c08, t4, c08 ADD c01, c06, c01 ADD c02, c05, c02 ADD c03, c08, c03 ADD c04, c07, c04 $L48: #if defined(LN) || defined(RT) #ifdef LN subq KK, 2, TMP1 #else subq KK, 1, TMP1 #endif sll TMP1, ZBASE_SHIFT + 1, TMP2 addq AORIG, TMP2, AO sll TMP1, ZBASE_SHIFT, TMP2 addq B, TMP2, BO #else lda AO, -4 * SIZE(AO) lda BO, -2 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 #endif #ifdef LN LD a1, 6 * SIZE(AO) LD a2, 7 * SIZE(AO) LD a3, 4 * SIZE(AO) LD a4, 5 * SIZE(AO) MUL a2, c04, t1 MUL a2, c03, t2 MUL a1, c03, c03 MUL a1, c04, c04 ADD5 c03, t1, c03 ADD6 c04, t2, c04 MUL a3, c03, t1 MUL a3, c04, t2 SUB c01, t1, c01 SUB c02, t2, c02 MUL a4, c04, t1 MUL a4, c03, t2 ADD6 c01, t1, c01 ADD5 c02, t2, c02 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a1, c01, c01 MUL a1, c02, c02 ADD5 c01, t1, c01 ADD6 c02, t2, c02 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a1, c01, c01 MUL a1, c02, c02 ADD5 c01, t1, c01 ADD6 c02, t2, c02 MUL a3, c01, t1 MUL a3, c02, t2 SUB c03, t1, c03 SUB c04, t2, c04 MUL a4, c02, t1 MUL a4, c01, t2 ADD6 c03, t1, c03 ADD5 c04, t2, c04 LD a1, 6 * SIZE(AO) LD a2, 7 * SIZE(AO) MUL a2, c04, t1 MUL a2, c03, t2 MUL a1, c03, c03 MUL a1, c04, c04 ADD5 c03, t1, c03 ADD6 c04, t2, c04 #endif #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a2, c04, t3 MUL a2, c03, t4 MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 MUL a1, c04, c04 ADD5 c01, t1, c01 ADD6 c02, t2, c02 ADD5 c03, t3, c03 ADD6 c04, t4, c04 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c02, 1 * SIZE(BO) ST c03, 2 * SIZE(BO) ST c04, 3 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c03, 2 * SIZE(AO) ST c04, 3 * SIZE(AO) 
#endif #ifdef LN lda C1, -4 * SIZE(C1) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c03, 2 * SIZE(C1) ST c04, 3 * SIZE(C1) #ifndef LN lda C1, 4 * SIZE(C1) #endif #ifdef RT sll K, ZBASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, ZBASE_SHIFT + 1, TMP2 addq AO, TMP2, AO sll TMP1, ZBASE_SHIFT, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 2, KK #endif #ifdef LN subq KK, 2, KK #endif lda I, -1(I) bgt I, $L41 .align 4 $L50: and M, 1, I ble I, $L59 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c05 LD b3, 2 * SIZE(B) fclr c02 LD b4, 3 * SIZE(B) fclr c06 lda AO, 2 * SIZE(AO) lda BO, 2 * SIZE(B) lda L, -2(KK) ble KK, $L58 ble L, $L55 #else #ifdef LN sll K, ZBASE_SHIFT, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, ZBASE_SHIFT, TMP1 addq AORIG, TMP1, AO sll KK, ZBASE_SHIFT, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c05 LD b3, 2 * SIZE(BO) fclr c02 LD b4, 3 * SIZE(BO) fclr c06 lda AO, 2 * SIZE(AO) lda BO, 2 * SIZE(BO) lda L, -2(TMP1) ble TMP1, $L58 ble L, $L55 #endif .align 5 $L52: ADD1 c01, t1, c01 unop MUL a1, b1, t1 unop ADD3 c02, t2, c02 lda AO, 4 * SIZE(AO) MUL a2, b1, t2 LD b1, 2 * SIZE(BO) ADD4 c05, t3, c05 lda L, -2(L) MUL a1, b2, t3 LD a1, -2 * SIZE(AO) ADD2 c06, t4, c06 unop MUL a2, b2, t4 LD a2, -1 * SIZE(AO) ADD1 c01, t1, c01 LD b2, 3 * SIZE(BO) MUL a3, b3, t1 lda BO, 4 * SIZE(BO) ADD3 c02, t2, c02 unop MUL a4, b3, t2 LD b3, 0 * SIZE(BO) ADD4 c05, t3, c05 unop MUL a3, b4, t3 LD a3, 0 * SIZE(AO) ADD2 c06, t4, c06 MUL a4, b4, t4 LD b4, 1 * SIZE(BO) unop LD a4, 1 * SIZE(AO) unop unop bgt L, $L52 .align 4 $L55: ADD1 c01, t1, c01 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L57 #else blbs TMP1, $L57 #endif .align 4 ADD3 c02, t2, c02 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD4 c05, t3, c05 lda BO, 2 * SIZE(BO) MUL a1, b2, t3 LD a1, 0 * SIZE(AO) ADD2 c06, t4, c06 unop MUL a2, b2, t4 LD a2, 1 * SIZE(AO) ADD1 c01, t1, c01 LD b2, -1 * SIZE(BO) MUL a1, b1, t1 lda AO, 2 * SIZE(AO) .align 4 $L57: ADD3 c02, t2, c02 MUL a2, b1, t2 ADD4 c05, t3, c05 MUL a1, b2, t3 ADD2 c06, t4, c06 lda AO, 2 * SIZE(AO) MUL a2, b2, t4 lda BO, 2 * SIZE(BO) ADD1 c01, t1, c01 ADD3 c02, t2, c02 ADD4 c05, t3, c05 ADD2 c06, t4, c06 ADD c01, c06, c01 ADD c02, c05, c02 $L58: #if defined(LN) || defined(RT) subq KK, 1, TMP1 sll TMP1, ZBASE_SHIFT, TMP2 addq AORIG, TMP2, AO sll TMP1, ZBASE_SHIFT, TMP2 addq B, TMP2, BO #else lda AO, -2 * SIZE(AO) lda BO, -2 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) SUB a1, c01, c01 SUB a2, c02, c02 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a1, c01, c01 MUL a1, c02, c02 ADD5 c01, t1, c01 ADD6 c02, t2, c02 #endif #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a1, c01, c01 MUL a1, c02, c02 ADD5 c01, t1, c01 ADD6 c02, t2, c02 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c02, 1 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) #endif #ifdef LN lda C1, -2 * SIZE(C1) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) #ifndef LN lda C1, 2 * 
SIZE(C1) #endif #ifdef RT sll K, ZBASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, ZBASE_SHIFT, TMP2 addq AO, TMP2, AO sll TMP1, ZBASE_SHIFT, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 1, KK #endif #ifdef LN subq KK, 1, KK #endif .align 4 $L59: #ifdef LN sll K, ZBASE_SHIFT, TMP1 addq B, TMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN addq KK, 1, KK #endif #ifdef RT subq KK, 1, KK #endif .align 4 $L999: ldt $f2, 0($sp) ldt $f3, 8($sp) ldt $f4, 16($sp) ldt $f5, 24($sp) ldt $f6, 32($sp) ldt $f7, 40($sp) ldt $f8, 48($sp) ldt $f9, 56($sp) clr $0 lda $sp, STACKSIZE($sp) ret .ident VERSION .end CNAME OpenBLAS-0.2.20/kernel/alpha/ztrsm_kernel_2x2_RT.S000066400000000000000000001001571313527062700214600ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "version.h" #if !defined(EV4) && !defined(EV5) && !defined(EV6) #error "Architecture is not specified." 
#endif #ifdef EV6 #define PREFETCHSIZE 56 #define UNOP unop #endif #ifdef EV5 #define PREFETCHSIZE 48 #define UNOP #endif #ifdef EV4 #define UNOP #endif .set noat .set noreorder .arch ev6 .text .align 5 .globl CNAME .ent CNAME #define STACKSIZE 80 #define M $16 #define N $17 #define K $18 #define A $21 #define B $22 #define C $20 #define LDC $23 #define C1 $19 #define C2 $24 #define AO $at #define BO $5 #define I $6 #define J $7 #define L $8 #define a1 $f16 #define a2 $f17 #define a3 $f18 #define a4 $f19 #define b1 $f20 #define b2 $f21 #define b3 $f22 #define b4 $f23 #define t1 $f24 #define t2 $f25 #define t3 $f26 #define t4 $f27 #define a5 $f28 #define a6 $f30 #define b5 $f29 #define alpha_i $f29 #define alpha_r $f30 #define c01 $f0 #define c02 $f1 #define c03 $f2 #define c04 $f3 #define c05 $f4 #define c06 $f5 #define c07 $f6 #define c08 $f7 #define c09 $f8 #define c10 $f9 #define c11 $f10 #define c12 $f11 #define c13 $f12 #define c14 $f13 #define c15 $f14 #define c16 $f15 #define TMP1 $0 #define TMP2 $1 #define KK $2 #define AORIG $3 #define OFFSET $4 #if defined(LN) || defined(LT) #ifndef CONJ #define ADD1 ADD #define ADD2 SUB #define ADD3 ADD #define ADD4 ADD #define ADD5 SUB #define ADD6 ADD #else #define ADD1 ADD #define ADD2 ADD #define ADD3 SUB #define ADD4 ADD #define ADD5 ADD #define ADD6 SUB #endif #else #ifndef CONJ #define ADD1 ADD #define ADD2 SUB #define ADD3 ADD #define ADD4 ADD #define ADD5 SUB #define ADD6 ADD #else #define ADD1 ADD #define ADD2 ADD #define ADD3 ADD #define ADD4 SUB #define ADD5 ADD #define ADD6 SUB #endif #endif CNAME: .frame $sp, STACKSIZE, $26, 0 #ifdef PROFILE ldgp $gp, 0($27) lda $at, _mcount jsr $at, ($at), _mcount #endif #ifndef PROFILE .prologue 0 #else .prologue 1 #endif lda $sp, -STACKSIZE($sp) ldq B, 0 + STACKSIZE($sp) ldq C, 8 + STACKSIZE($sp) ldq LDC, 16 + STACKSIZE($sp) ldq OFFSET, 24 + STACKSIZE($sp) sll LDC, ZBASE_SHIFT, LDC stt $f2, 0($sp) stt $f3, 8($sp) stt $f4, 16($sp) stt $f5, 24($sp) stt $f6, 32($sp) stt $f7, 40($sp) stt $f8, 48($sp) stt $f9, 56($sp) cmple M, 0, $0 cmple N, 0, $1 cmple K, 0, $2 or $0, $1, $0 or $0, $2, $0 bne $0, $L999 #ifdef LN addq M, M, TMP2 mulq TMP2, K, TMP1 SXADDQ TMP1, A, A SXADDQ TMP2, C, C #endif #ifdef RN negq OFFSET, KK #endif #ifdef RT mulq N, K, TMP1 addq TMP1, TMP1, TMP1 SXADDQ TMP1, B, B mulq N, LDC, TMP1 addq TMP1, C, C subq N, OFFSET, KK #endif and N, 1, J ble J, $L30 #ifdef RT sll K, ZBASE_SHIFT, TMP1 subq B, TMP1, B subq C, LDC, C1 subq C, LDC, C #else mov C, C1 addq C, LDC, C #endif #ifdef LN addq M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif sra M, 1, I ble I, $L50 .align 4 $L41: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c05 LD b3, 2 * SIZE(B) fclr c02 LD b4, 3 * SIZE(B) fclr c06 lda BO, 2 * SIZE(B) fclr c03 lda AO, 4 * SIZE(AO) fclr c07 lda L, -2(KK) fclr c04 fclr c08 ble KK, $L48 ble L, $L45 #else #ifdef LN sll K, ZBASE_SHIFT + 1, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, ZBASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AO sll KK, ZBASE_SHIFT, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c05 LD b3, 2 * SIZE(BO) fclr c02 LD b4, 3 * SIZE(BO) fclr c06 lda BO, 2 * SIZE(BO) fclr c03 lda AO, 4 * SIZE(AO) fclr c07 lda L, 
-2(TMP1) fclr c04 fclr c08 ble TMP1, $L48 ble L, $L45 #endif .align 5 $L42: ADD4 c05, t1, c05 unop MUL a1, b1, t1 unop ADD2 c06, t2, c06 lda L, -2(L) MUL a2, b1, t2 unop ADD4 c07, t3, c07 unop MUL a3, b1, t3 unop ADD2 c08, t4, c08 unop MUL a4, b1, t4 LD b1, 2 * SIZE(BO) ADD1 c01, t1, c01 unop MUL a1, b2, t1 LD a1, 0 * SIZE(AO) ADD3 c02, t2, c02 lda BO, 4 * SIZE(BO) MUL a2, b2, t2 LD a2, 1 * SIZE(AO) ADD1 c03, t3, c03 unop MUL a3, b2, t3 LD a3, 2 * SIZE(AO) ADD3 c04, t4, c04 unop MUL a4, b2, t4 LD a5, 3 * SIZE(AO) ADD4 c05, t1, c05 unop MUL a1, b3, t1 LD b2, -1 * SIZE(BO) ADD2 c06, t2, c06 unop MUL a2, b3, t2 unop ADD4 c07, t3, c07 unop MUL a3, b3, t3 lda AO, 8 * SIZE(AO) ADD2 c08, t4, c08 unop MUL a5, b3, t4 LD b3, 0 * SIZE(BO) ADD1 c01, t1, c01 unop MUL a1, b4, t1 LD a1, -4 * SIZE(AO) ADD3 c02, t2, c02 unop MUL a2, b4, t2 LD a2, -3 * SIZE(AO) ADD1 c03, t3, c03 LD a4, -1 * SIZE(AO) MUL a3, b4, t3 LD a3, -2 * SIZE(AO) ADD3 c04, t4, c04 MUL a5, b4, t4 LD b4, 1 * SIZE(BO) bgt L, $L42 .align 4 $L45: ADD4 c05, t1, c05 MUL b1, a1, t1 #if defined(LT) || defined(RN) blbs KK, $L47 #else blbs TMP1, $L47 #endif .align 4 ADD2 c06, t2, c06 MUL a2, b1, t2 ADD4 c07, t3, c07 MUL a3, b1, t3 ADD2 c08, t4, c08 unop MUL a4, b1, t4 LD b1, 0 * SIZE(BO) ADD1 c01, t1, c01 unop MUL a1, b2, t1 LD a1, 0 * SIZE(AO) ADD3 c02, t2, c02 unop MUL a2, b2, t2 LD a2, 1 * SIZE(AO) ADD1 c03, t3, c03 unop MUL a3, b2, t3 LD a3, 2 * SIZE(AO) ADD3 c04, t4, c04 MUL a4, b2, t4 LD a4, 3 * SIZE(AO) lda AO, 4 * SIZE(AO) ADD4 c05, t1, c05 LD b2, 1 * SIZE(BO) MUL a1, b1, t1 lda BO, 2 * SIZE(BO) .align 4 $L47: ADD2 c06, t2, c06 MUL a2, b1, t2 ADD4 c07, t3, c07 MUL a3, b1, t3 ADD2 c08, t4, c08 MUL a4, b1, t4 ADD1 c01, t1, c01 MUL a1, b2, t1 ADD3 c02, t2, c02 MUL a2, b2, t2 ADD1 c03, t3, c03 MUL a3, b2, t3 ADD3 c04, t4, c04 lda AO, 4 * SIZE(AO) MUL a4, b2, t4 lda BO, 2 * SIZE(BO) ADD4 c05, t1, c05 ADD2 c06, t2, c06 ADD4 c07, t3, c07 ADD2 c08, t4, c08 ADD c01, c06, c01 ADD c02, c05, c02 ADD c03, c08, c03 ADD c04, c07, c04 $L48: #if defined(LN) || defined(RT) #ifdef LN subq KK, 2, TMP1 #else subq KK, 1, TMP1 #endif sll TMP1, ZBASE_SHIFT + 1, TMP2 addq AORIG, TMP2, AO sll TMP1, ZBASE_SHIFT, TMP2 addq B, TMP2, BO #else lda AO, -4 * SIZE(AO) lda BO, -2 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 #endif #ifdef LN LD a1, 6 * SIZE(AO) LD a2, 7 * SIZE(AO) LD a3, 4 * SIZE(AO) LD a4, 5 * SIZE(AO) MUL a2, c04, t1 MUL a2, c03, t2 MUL a1, c03, c03 MUL a1, c04, c04 ADD5 c03, t1, c03 ADD6 c04, t2, c04 MUL a3, c03, t1 MUL a3, c04, t2 SUB c01, t1, c01 SUB c02, t2, c02 MUL a4, c04, t1 MUL a4, c03, t2 ADD6 c01, t1, c01 ADD5 c02, t2, c02 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a1, c01, c01 MUL a1, c02, c02 ADD5 c01, t1, c01 ADD6 c02, t2, c02 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a1, c01, c01 MUL a1, c02, c02 ADD5 c01, t1, c01 ADD6 c02, t2, c02 MUL a3, c01, t1 MUL a3, c02, t2 SUB c03, t1, c03 SUB c04, t2, c04 MUL a4, c02, t1 MUL a4, c01, t2 ADD6 c03, t1, c03 ADD5 c04, t2, c04 LD a1, 6 * SIZE(AO) LD a2, 7 * SIZE(AO) MUL a2, c04, t1 MUL a2, c03, t2 MUL a1, c03, c03 MUL a1, c04, c04 ADD5 c03, t1, c03 ADD6 c04, t2, c04 #endif #if 
defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a2, c04, t3 MUL a2, c03, t4 MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 MUL a1, c04, c04 ADD5 c01, t1, c01 ADD6 c02, t2, c02 ADD5 c03, t3, c03 ADD6 c04, t4, c04 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c02, 1 * SIZE(BO) ST c03, 2 * SIZE(BO) ST c04, 3 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c03, 2 * SIZE(AO) ST c04, 3 * SIZE(AO) #endif #ifdef LN lda C1, -4 * SIZE(C1) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c03, 2 * SIZE(C1) ST c04, 3 * SIZE(C1) #ifndef LN lda C1, 4 * SIZE(C1) #endif #ifdef RT sll K, ZBASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, ZBASE_SHIFT + 1, TMP2 addq AO, TMP2, AO sll TMP1, ZBASE_SHIFT, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 2, KK #endif #ifdef LN subq KK, 2, KK #endif lda I, -1(I) bgt I, $L41 .align 4 $L50: and M, 1, I ble I, $L59 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(B) fclr c01 LD b2, 1 * SIZE(B) fclr c05 LD b3, 2 * SIZE(B) fclr c02 LD b4, 3 * SIZE(B) fclr c06 lda AO, 2 * SIZE(AO) lda BO, 2 * SIZE(B) lda L, -2(KK) ble KK, $L58 ble L, $L55 #else #ifdef LN sll K, ZBASE_SHIFT, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, ZBASE_SHIFT, TMP1 addq AORIG, TMP1, AO sll KK, ZBASE_SHIFT, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr t1 LD a2, 1 * SIZE(AO) fclr t2 LD a3, 2 * SIZE(AO) fclr t3 LD a4, 3 * SIZE(AO) fclr t4 LD b1, 0 * SIZE(BO) fclr c01 LD b2, 1 * SIZE(BO) fclr c05 LD b3, 2 * SIZE(BO) fclr c02 LD b4, 3 * SIZE(BO) fclr c06 lda AO, 2 * SIZE(AO) lda BO, 2 * SIZE(BO) lda L, -2(TMP1) ble TMP1, $L58 ble L, $L55 #endif .align 5 $L52: ADD1 c01, t1, c01 unop MUL a1, b1, t1 unop ADD3 c02, t2, c02 lda AO, 4 * SIZE(AO) MUL a2, b1, t2 LD b1, 2 * SIZE(BO) ADD4 c05, t3, c05 lda L, -2(L) MUL a1, b2, t3 LD a1, -2 * SIZE(AO) ADD2 c06, t4, c06 unop MUL a2, b2, t4 LD a2, -1 * SIZE(AO) ADD1 c01, t1, c01 LD b2, 3 * SIZE(BO) MUL a3, b3, t1 lda BO, 4 * SIZE(BO) ADD3 c02, t2, c02 unop MUL a4, b3, t2 LD b3, 0 * SIZE(BO) ADD4 c05, t3, c05 unop MUL a3, b4, t3 LD a3, 0 * SIZE(AO) ADD2 c06, t4, c06 MUL a4, b4, t4 LD b4, 1 * SIZE(BO) unop LD a4, 1 * SIZE(AO) unop unop bgt L, $L52 .align 4 $L55: ADD1 c01, t1, c01 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L57 #else blbs TMP1, $L57 #endif .align 4 ADD3 c02, t2, c02 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD4 c05, t3, c05 lda BO, 2 * SIZE(BO) MUL a1, b2, t3 LD a1, 0 * SIZE(AO) ADD2 c06, t4, c06 unop MUL a2, b2, t4 LD a2, 1 * SIZE(AO) ADD1 c01, t1, c01 LD b2, -1 * SIZE(BO) MUL a1, b1, t1 lda AO, 2 * SIZE(AO) .align 4 $L57: ADD3 c02, t2, c02 MUL a2, b1, t2 ADD4 c05, t3, c05 MUL a1, b2, t3 ADD2 c06, t4, c06 lda AO, 2 * SIZE(AO) MUL a2, b2, t4 lda BO, 2 * SIZE(BO) ADD1 c01, t1, c01 ADD3 c02, t2, c02 ADD4 c05, t3, c05 ADD2 c06, t4, c06 ADD c01, c06, c01 ADD c02, c05, c02 $L58: #if defined(LN) || defined(RT) subq KK, 1, TMP1 sll TMP1, ZBASE_SHIFT, TMP2 addq AORIG, TMP2, AO sll TMP1, ZBASE_SHIFT, TMP2 addq B, TMP2, BO #else lda AO, -2 * SIZE(AO) lda BO, -2 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) SUB a1, c01, c01 SUB a2, c02, c02 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a1, 
c01, c01 MUL a1, c02, c02 ADD5 c01, t1, c01 ADD6 c02, t2, c02 #endif #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a1, c01, c01 MUL a1, c02, c02 ADD5 c01, t1, c01 ADD6 c02, t2, c02 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c02, 1 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) #endif #ifdef LN lda C1, -2 * SIZE(C1) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) #ifndef LN lda C1, 2 * SIZE(C1) #endif #ifdef RT sll K, ZBASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, ZBASE_SHIFT, TMP2 addq AO, TMP2, AO sll TMP1, ZBASE_SHIFT, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 1, KK #endif #ifdef LN subq KK, 1, KK #endif .align 4 $L59: #ifdef LN sll K, ZBASE_SHIFT, TMP1 addq B, TMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN addq KK, 1, KK #endif #ifdef RT subq KK, 1, KK #endif .align 4 $L30: sra N, 1, J ble J, $L999 .align 4 $L01: #ifdef RT sll K, ZBASE_SHIFT + 1, TMP1 subq B, TMP1, B subq C, LDC, C2 subq C2, LDC, C1 subq C2, LDC, C #else mov C, C1 addq C, LDC, C2 addq C2, LDC, C #endif #ifdef LN addq M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif sra M, 1, I fclr t1 fclr t2 fclr t3 fclr t4 fclr c01 fclr c05 ble I, $L20 .align 4 $L11: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(B) fclr c10 LD b2, 1 * SIZE(B) fclr c14 LD b3, 2 * SIZE(B) fclr c03 LD b4, 3 * SIZE(B) fclr c07 lda BO, 4 * SIZE(B) fclr c11 lda AO, 4 * SIZE(AO) fclr c15 lds $f31, 4 * SIZE(C1) fclr c04 lda L, -2(KK) fclr c08 lds $f31, 4 * SIZE(C2) fclr c12 fclr c16 ble KK, $L18 ble L, $L15 #else #ifdef LN sll K, ZBASE_SHIFT + 1, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, ZBASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AO addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(BO) fclr c10 LD b2, 1 * SIZE(BO) fclr c14 LD b3, 2 * SIZE(BO) fclr c03 LD b4, 3 * SIZE(BO) fclr c07 lda BO, 4 * SIZE(BO) fclr c11 lda AO, 4 * SIZE(AO) fclr c15 lds $f31, 4 * SIZE(C1) fclr c04 lda L, -2(TMP1) fclr c08 lds $f31, 4 * SIZE(C2) fclr c12 fclr c16 ble TMP1, $L18 ble L, $L15 #endif .align 5 $L12: /* 1 */ ADD1 c11, t1, c11 #ifndef EV4 ldq $31, PREFETCHSIZE * SIZE(AO) #else unop #endif MUL b1, a1, t1 #ifndef EV4 ldl $31, PREFETCHSIZE * SIZE(BO) #else unop #endif ADD3 c12, t2, c12 unop MUL b1, a2, t2 unop ADD2 c16, t3, c16 unop MUL b2, a2, t3 LD a5, 0 * SIZE(AO) ADD4 c15, t4, c15 unop MUL b2, a1, t4 LD b5, 0 * SIZE(BO) /* 2 */ ADD1 c01, t1, c01 UNOP MUL b1, a3, t1 UNOP ADD3 c02, t2, c02 UNOP MUL b1, a4, t2 UNOP ADD2 c06, t3, c06 unop MUL b2, a4, t3 unop ADD4 c05, t4, c05 unop MUL b4, a1, t4 unop /* 3 */ ADD1 c03, t1, c03 unop MUL b3, a1, t1 unop ADD3 c04, t2, c04 unop MUL b3, a2, t2 unop ADD2 c08, t3, c08 unop MUL b4, a2, t3 LD a2, 1 * SIZE(AO) ADD4 c13, t4, c13 unop MUL b2, a3, t4 LD b2, 1 * SIZE(BO) /* 4 */ ADD1 c09, t1, c09 unop MUL b3, a3, t1 LD a6, 2 * SIZE(AO) ADD3 c10, t2, c10 unop MUL b3, a4, t2 LD b3, 2 * SIZE(BO) ADD2 c14, t3, c14 unop MUL b4, a4, t3 LD a4, 3 * SIZE(AO) ADD4 c07, t4, c07 unop MUL b4, a3, t4 LD b4, 3 * SIZE(BO) /* 5 */ ADD1 c11, t1, c11 unop MUL b5, a5, t1 LD a1, 4 * SIZE(AO) ADD3 c12, t2, c12 lda L, -2(L) MUL b5, a2, t2 LD b1, 4 * SIZE(BO) ADD2 c16, t3, c16 unop MUL b2, 
a2, t3 unop ADD4 c15, t4, c15 unop MUL b2, a5, t4 unop /* 6 */ ADD1 c01, t1, c01 unop MUL b5, a6, t1 unop ADD3 c02, t2, c02 unop MUL b5, a4, t2 unop ADD2 c06, t3, c06 unop MUL b2, a4, t3 unop ADD4 c05, t4, c05 unop MUL b4, a5, t4 unop /* 7 */ ADD1 c03, t1, c03 lda AO, 8 * SIZE(AO) MUL b3, a5, t1 unop ADD3 c04, t2, c04 lda BO, 8 * SIZE(BO) MUL b3, a2, t2 unop ADD2 c08, t3, c08 unop MUL b4, a2, t3 LD a2, -3 * SIZE(AO) ADD4 c13, t4, c13 unop MUL b2, a6, t4 LD b2, -3 * SIZE(BO) /* 8 */ ADD1 c09, t1, c09 unop MUL b3, a6, t1 LD a3, -2 * SIZE(AO) ADD3 c10, t2, c10 unop MUL b3, a4, t2 LD b3, -2 * SIZE(BO) ADD2 c14, t3, c14 unop MUL b4, a4, t3 LD a4, -1 * SIZE(AO) ADD4 c07, t4, c07 MUL b4, a6, t4 LD b4, -1 * SIZE(BO) bgt L, $L12 .align 4 $L15: ADD1 c11, t1, c11 unop MUL b1, a1, t1 #if defined(LT) || defined(RN) blbs KK, $L17 #else blbs TMP1, $L17 #endif .align 4 ADD3 c12, t2, c12 MUL b1, a2, t2 ADD2 c16, t3, c16 MUL b2, a2, t3 ADD4 c15, t4, c15 MUL b2, a1, t4 ADD1 c01, t1, c01 MUL b1, a3, t1 ADD3 c02, t2, c02 unop MUL b1, a4, t2 LD b1, 0 * SIZE(BO) ADD2 c06, t3, c06 MUL b2, a4, t3 ADD4 c05, t4, c05 MUL b4, a1, t4 ADD1 c03, t1, c03 unop MUL b3, a1, t1 LD a1, 0 * SIZE(AO) ADD3 c04, t2, c04 unop MUL b3, a2, t2 unop ADD2 c08, t3, c08 unop MUL b4, a2, t3 LD a2, 1 * SIZE(AO) ADD4 c13, t4, c13 unop MUL b2, a3, t4 LD b2, 1 * SIZE(BO) ADD1 c09, t1, c09 unop MUL b3, a3, t1 lda AO, 4 * SIZE(AO) ADD3 c10, t2, c10 unop MUL b3, a4, t2 LD b3, 2 * SIZE(BO) ADD2 c14, t3, c14 unop MUL b4, a4, t3 LD a4, -1 * SIZE(AO) ADD4 c07, t4, c07 unop MUL b4, a3, t4 LD a3, -2 * SIZE(AO) ADD1 c11, t1, c11 LD b4, 3 * SIZE(BO) MUL b1, a1, t1 lda BO, 4 * SIZE(BO) .align 4 $L17: ADD3 c12, t2, c12 MUL b1, a2, t2 ADD2 c16, t3, c16 MUL b2, a2, t3 ADD4 c15, t4, c15 MUL b2, a1, t4 ADD1 c01, t1, c01 MUL b1, a3, t1 ADD3 c02, t2, c02 MUL b1, a4, t2 ADD2 c06, t3, c06 MUL b2, a4, t3 ADD4 c05, t4, c05 MUL b4, a1, t4 ADD1 c03, t1, c03 MUL b3, a1, t1 ADD3 c04, t2, c04 MUL b3, a2, t2 ADD2 c08, t3, c08 MUL b4, a2, t3 ADD4 c13, t4, c13 MUL b2, a3, t4 ADD1 c09, t1, c09 MUL b3, a3, t1 ADD3 c10, t2, c10 MUL b3, a4, t2 ADD2 c14, t3, c14 MUL b4, a4, t3 ADD4 c07, t4, c07 lda AO, 4 * SIZE(AO) MUL b4, a3, t4 lda BO, 4 * SIZE(BO) ADD1 c11, t1, c11 ADD3 c12, t2, c12 ADD2 c16, t3, c16 ADD4 c15, t4, c15 ADD c01, c06, c01 ADD c02, c05, c02 ADD c03, c08, c03 ADD c04, c07, c04 ADD c09, c14, c09 ADD c10, c13, c10 ADD c11, c16, c11 ADD c12, c15, c12 .align 4 $L18: #if defined(LN) || defined(RT) #ifdef LN subq KK, 2, TMP1 #else subq KK, 2, TMP1 #endif sll TMP1, ZBASE_SHIFT + 1, TMP2 addq AORIG, TMP2, AO sll TMP1, ZBASE_SHIFT + 1, TMP2 addq B, TMP2, BO #else lda AO, -4 * SIZE(AO) lda BO, -4 * SIZE(BO) #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c09, c09 SUB a4, c10, c10 SUB b1, c03, c03 SUB b2, c04, c04 SUB b3, c11, c11 SUB b4, c12, c12 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) LD b4, 7 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 SUB a4, c04, c04 SUB b1, c09, c09 SUB b2, c10, c10 SUB b3, c11, c11 SUB b4, c12, c12 #endif #ifdef LN LD a1, 6 * SIZE(AO) LD a2, 7 * SIZE(AO) LD a3, 4 * SIZE(AO) LD a4, 5 * SIZE(AO) MUL a2, c04, t1 MUL a2, c03, t2 MUL a2, c12, t3 MUL a2, c11, t4 MUL a1, c03, c03 MUL a1, c04, c04 MUL a1, c11, c11 MUL a1, c12, c12 ADD5 c03, t1, c03 
ADD6 c04, t2, c04 ADD5 c11, t3, c11 ADD6 c12, t4, c12 MUL a3, c03, t1 MUL a3, c04, t2 MUL a3, c11, t3 MUL a3, c12, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c09, t3, c09 SUB c10, t4, c10 MUL a4, c04, t1 MUL a4, c03, t2 MUL a4, c12, t3 MUL a4, c11, t4 ADD6 c01, t1, c01 ADD5 c02, t2, c02 ADD6 c09, t3, c09 ADD5 c10, t4, c10 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a2, c10, t3 MUL a2, c09, t4 MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c09, c09 MUL a1, c10, c10 ADD5 c01, t1, c01 ADD6 c02, t2, c02 ADD5 c09, t3, c09 ADD6 c10, t4, c10 #endif #ifdef LT LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a2, c10, t3 MUL a2, c09, t4 MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c09, c09 MUL a1, c10, c10 ADD5 c01, t1, c01 ADD6 c02, t2, c02 ADD5 c09, t3, c09 ADD6 c10, t4, c10 MUL a3, c01, t1 MUL a3, c02, t2 MUL a3, c09, t3 MUL a3, c10, t4 SUB c03, t1, c03 SUB c04, t2, c04 SUB c11, t3, c11 SUB c12, t4, c12 MUL a4, c02, t1 MUL a4, c01, t2 MUL a4, c10, t3 MUL a4, c09, t4 ADD6 c03, t1, c03 ADD5 c04, t2, c04 ADD6 c11, t3, c11 ADD5 c12, t4, c12 LD a1, 6 * SIZE(AO) LD a2, 7 * SIZE(AO) MUL a2, c04, t1 MUL a2, c03, t2 MUL a2, c12, t3 MUL a2, c11, t4 MUL a1, c03, c03 MUL a1, c04, c04 MUL a1, c11, c11 MUL a1, c12, c12 ADD5 c03, t1, c03 ADD6 c04, t2, c04 ADD5 c11, t3, c11 ADD6 c12, t4, c12 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a2, c04, t3 MUL a2, c03, t4 MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 MUL a1, c04, c04 ADD5 c01, t1, c01 ADD6 c02, t2, c02 ADD5 c03, t3, c03 ADD6 c04, t4, c04 MUL a3, c01, t1 MUL a3, c02, t2 MUL a3, c03, t3 MUL a3, c04, t4 SUB c09, t1, c09 SUB c10, t2, c10 SUB c11, t3, c11 SUB c12, t4, c12 MUL a4, c02, t1 MUL a4, c01, t2 MUL a4, c04, t3 MUL a4, c03, t4 ADD6 c09, t1, c09 ADD5 c10, t2, c10 ADD6 c11, t3, c11 ADD5 c12, t4, c12 LD a1, 6 * SIZE(BO) LD a2, 7 * SIZE(BO) MUL a2, c10, t1 MUL a2, c09, t2 MUL a2, c12, t3 MUL a2, c11, t4 MUL a1, c09, c09 MUL a1, c10, c10 MUL a1, c11, c11 MUL a1, c12, c12 ADD5 c09, t1, c09 ADD6 c10, t2, c10 ADD5 c11, t3, c11 ADD6 c12, t4, c12 #endif #ifdef RT LD a1, 6 * SIZE(BO) LD a2, 7 * SIZE(BO) LD a3, 4 * SIZE(BO) LD a4, 5 * SIZE(BO) MUL a2, c10, t1 MUL a2, c09, t2 MUL a2, c12, t3 MUL a2, c11, t4 MUL a1, c09, c09 MUL a1, c10, c10 MUL a1, c11, c11 MUL a1, c12, c12 ADD5 c09, t1, c09 ADD6 c10, t2, c10 ADD5 c11, t3, c11 ADD6 c12, t4, c12 MUL a3, c09, t1 MUL a3, c10, t2 MUL a3, c11, t3 MUL a3, c12, t4 SUB c01, t1, c01 SUB c02, t2, c02 SUB c03, t3, c03 SUB c04, t4, c04 MUL a4, c10, t1 MUL a4, c09, t2 MUL a4, c12, t3 MUL a4, c11, t4 ADD6 c01, t1, c01 ADD5 c02, t2, c02 ADD6 c03, t3, c03 ADD5 c04, t4, c04 LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a2, c04, t3 MUL a2, c03, t4 MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 MUL a1, c04, c04 ADD5 c01, t1, c01 ADD6 c02, t2, c02 ADD5 c03, t3, c03 ADD6 c04, t4, c04 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c02, 1 * SIZE(BO) ST c09, 2 * SIZE(BO) ST c10, 3 * SIZE(BO) ST c03, 4 * SIZE(BO) ST c04, 5 * SIZE(BO) ST c11, 6 * SIZE(BO) ST c12, 7 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c03, 2 * SIZE(AO) ST c04, 3 * SIZE(AO) ST c09, 4 * SIZE(AO) ST c10, 5 * SIZE(AO) ST c11, 6 * SIZE(AO) ST c12, 7 * SIZE(AO) #endif #ifdef LN lda C1, -4 * SIZE(C1) lda C2, -4 * SIZE(C2) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c03, 2 * SIZE(C1) ST c04, 3 * SIZE(C1) ST c09, 0 * 
SIZE(C2) ST c10, 1 * SIZE(C2) ST c11, 2 * SIZE(C2) ST c12, 3 * SIZE(C2) #ifndef LN lda C1, 4 * SIZE(C1) lda C2, 4 * SIZE(C2) #endif fclr t1 fclr t2 fclr t3 fclr t4 #ifdef RT sll K, ZBASE_SHIFT + 1, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, ZBASE_SHIFT + 1, TMP1 addq AO, TMP1, AO addq BO, TMP1, BO #endif #ifdef LT addq KK, 2, KK #endif #ifdef LN subq KK, 2, KK #endif fclr c01 fclr c05 lda I, -1(I) bgt I, $L11 .align 4 $L20: and M, 1, I ble I, $L29 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(B) fclr c10 LD b2, 1 * SIZE(B) fclr c14 LD b3, 2 * SIZE(B) lda AO, 2 * SIZE(AO) LD b4, 3 * SIZE(B) lda BO, 4 * SIZE(B) lda L, -2(KK) ble KK, $L28 ble L, $L25 #else #ifdef LN sll K, ZBASE_SHIFT + 0, TMP1 subq AORIG, TMP1, AORIG #endif sll KK, ZBASE_SHIFT + 0, TMP1 addq AORIG, TMP1, AO sll KK, ZBASE_SHIFT + 1, TMP1 addq B, TMP1, BO subq K, KK, TMP1 LD a1, 0 * SIZE(AO) fclr c09 LD a2, 1 * SIZE(AO) fclr c13 LD a3, 2 * SIZE(AO) fclr c02 LD a4, 3 * SIZE(AO) fclr c06 LD b1, 0 * SIZE(BO) fclr c10 LD b2, 1 * SIZE(BO) fclr c14 LD b3, 2 * SIZE(BO) lda AO, 2 * SIZE(AO) LD b4, 3 * SIZE(BO) lda BO, 4 * SIZE(BO) lda L, -2(TMP1) ble TMP1, $L28 ble L, $L25 #endif .align 5 $L22: ADD1 c09, t1, c09 unop MUL a1, b1, t1 unop ADD3 c10, t2, c10 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD4 c13, t3, c13 unop MUL a1, b2, t3 lda BO, 8 * SIZE(BO) ADD2 c14, t4, c14 unop MUL a2, b2, t4 LD b2, -7 * SIZE(BO) ADD1 c01, t1, c01 unop MUL a1, b3, t1 unop ADD3 c02, t2, c02 unop MUL a2, b3, t2 LD b3, -6 * SIZE(BO) ADD4 c05, t3, c05 unop MUL a1, b4, t3 LD a1, 2 * SIZE(AO) ADD2 c06, t4, c06 MUL a2, b4, t4 LD b5, -5 * SIZE(BO) ADD1 c09, t1, c09 unop MUL a3, b1, t1 LD a2, 3 * SIZE(AO) ADD3 c10, t2, c10 unop MUL a4, b1, t2 LD b1, -4 * SIZE(BO) ADD4 c13, t3, c13 unop MUL a3, b2, t3 lda AO, 4 * SIZE(AO) ADD2 c14, t4, c14 MUL a4, b2, t4 LD b2, -3 * SIZE(BO) ADD1 c01, t1, c01 lda L, -2(L) MUL a3, b3, t1 LD b4, -1 * SIZE(BO) ADD3 c02, t2, c02 unop MUL a4, b3, t2 LD b3, -2 * SIZE(BO) ADD4 c05, t3, c05 unop MUL a3, b5, t3 LD a3, 0 * SIZE(AO) ADD2 c06, t4, c06 MUL a4, b5, t4 LD a4, 1 * SIZE(AO) bgt L, $L22 .align 4 $L25: ADD1 c09, t1, c09 MUL a1, b1, t1 #if defined(LT) || defined(RN) blbs KK, $L27 #else blbs TMP1, $L27 #endif .align 4 ADD3 c10, t2, c10 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD4 c13, t3, c13 unop MUL a1, b2, t3 unop ADD2 c14, t4, c14 unop MUL a2, b2, t4 LD b2, 1 * SIZE(BO) ADD1 c01, t1, c01 unop MUL a1, b3, t1 lda AO, 2 * SIZE(AO) ADD3 c02, t2, c02 unop MUL a2, b3, t2 LD b3, 2 * SIZE(BO) ADD4 c05, t3, c05 unop MUL a1, b4, t3 LD a1, -2 * SIZE(AO) ADD2 c06, t4, c06 unop MUL a2, b4, t4 LD a2, -1 * SIZE(AO) ADD1 c09, t1, c09 LD b4, 3 * SIZE(BO) MUL a1, b1, t1 lda BO, 4 * SIZE(BO) .align 4 $L27: ADD3 c10, t2, c10 MUL a2, b1, t2 ADD4 c13, t3, c13 MUL a1, b2, t3 ADD2 c14, t4, c14 MUL a2, b2, t4 ADD1 c01, t1, c01 MUL a1, b3, t1 ADD3 c02, t2, c02 MUL a2, b3, t2 ADD4 c05, t3, c05 MUL a1, b4, t3 ADD2 c06, t4, c06 lda AO, 2 * SIZE(AO) MUL a2, b4, t4 lda BO, 4 * SIZE(BO) ADD1 c09, t1, c09 ADD3 c10, t2, c10 ADD4 c13, t3, c13 ADD2 c14, t4, c14 ADD c01, c06, c01 ADD c02, c05, c02 ADD c09, c14, c09 ADD c10, c13, c10 .align 4 $L28: #if defined(LN) || defined(RT) #ifdef LN subq KK, 1, TMP1 #else subq KK, 2, TMP1 #endif sll TMP1, ZBASE_SHIFT + 0, TMP2 addq AORIG, TMP2, AO sll TMP1, ZBASE_SHIFT + 1, TMP2 addq B, TMP2, BO #else lda AO, -2 * SIZE(AO) lda BO, -4 * SIZE(BO) #endif #if 
defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c09, c09 SUB a4, c10, c10 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c09, c09 SUB a4, c10, c10 #endif #if defined(LN) || defined(LT) LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a2, c10, t3 MUL a2, c09, t4 MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c09, c09 MUL a1, c10, c10 ADD5 c01, t1, c01 ADD6 c02, t2, c02 ADD5 c09, t3, c09 ADD6 c10, t4, c10 #endif #ifdef RN LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a1, c01, c01 MUL a1, c02, c02 ADD5 c01, t1, c01 ADD6 c02, t2, c02 MUL a3, c01, t1 MUL a3, c02, t2 SUB c09, t1, c09 SUB c10, t2, c10 MUL a4, c02, t1 MUL a4, c01, t2 ADD6 c09, t1, c09 ADD5 c10, t2, c10 LD a1, 6 * SIZE(BO) LD a2, 7 * SIZE(BO) MUL a2, c10, t1 MUL a2, c09, t2 MUL a1, c09, c09 MUL a1, c10, c10 ADD5 c09, t1, c09 ADD6 c10, t2, c10 #endif #ifdef RT LD a1, 6 * SIZE(BO) LD a2, 7 * SIZE(BO) LD a3, 4 * SIZE(BO) LD a4, 5 * SIZE(BO) MUL a2, c10, t1 MUL a2, c09, t2 MUL a1, c09, c09 MUL a1, c10, c10 ADD5 c09, t1, c09 ADD6 c10, t2, c10 MUL a3, c09, t1 MUL a3, c10, t2 SUB c01, t1, c01 SUB c02, t2, c02 MUL a4, c10, t1 MUL a4, c09, t2 ADD6 c01, t1, c01 ADD5 c02, t2, c02 LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) MUL a2, c02, t1 MUL a2, c01, t2 MUL a1, c01, c01 MUL a1, c02, c02 ADD5 c01, t1, c01 ADD6 c02, t2, c02 #endif #if defined(LN) || defined(LT) ST c01, 0 * SIZE(BO) ST c02, 1 * SIZE(BO) ST c09, 2 * SIZE(BO) ST c10, 3 * SIZE(BO) #else ST c01, 0 * SIZE(AO) ST c02, 1 * SIZE(AO) ST c09, 2 * SIZE(AO) ST c10, 3 * SIZE(AO) #endif #ifdef LN lda C1, -2 * SIZE(C1) lda C2, -2 * SIZE(C2) #endif ST c01, 0 * SIZE(C1) ST c02, 1 * SIZE(C1) ST c09, 0 * SIZE(C2) ST c10, 1 * SIZE(C2) #ifndef LN lda C1, 2 * SIZE(C1) lda C2, 2 * SIZE(C2) #endif #ifdef RT sll K, ZBASE_SHIFT, TMP1 addq AORIG, TMP1, AORIG #endif #if defined(LT) || defined(RN) subq K, KK, TMP1 sll TMP1, ZBASE_SHIFT + 0, TMP2 addq AO, TMP2, AO sll TMP1, ZBASE_SHIFT + 1, TMP2 addq BO, TMP2, BO #endif #ifdef LT addq KK, 1, KK #endif #ifdef LN subq KK, 1, KK #endif .align 4 $L29: #ifdef LN sll K, ZBASE_SHIFT + 1, TMP1 addq B, TMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN addq KK, 2, KK #endif #ifdef RT subq KK, 2, KK #endif lda J, -1(J) bgt J, $L01 .align 4 $L999: ldt $f2, 0($sp) ldt $f3, 8($sp) ldt $f4, 16($sp) ldt $f5, 24($sp) ldt $f6, 32($sp) ldt $f7, 40($sp) ldt $f8, 48($sp) ldt $f9, 56($sp) clr $0 lda $sp, STACKSIZE($sp) ret .ident VERSION .end CNAME OpenBLAS-0.2.20/kernel/arm/000077500000000000000000000000001313527062700152035ustar00rootroot00000000000000OpenBLAS-0.2.20/kernel/arm/KERNEL000066400000000000000000000012431313527062700161060ustar00rootroot00000000000000ifndef SNRM2KERNEL SNRM2KERNEL = nrm2.c endif ifndef DNRM2KERNEL DNRM2KERNEL = nrm2.c endif ifndef CNRM2KERNEL CNRM2KERNEL = znrm2.c endif ifndef ZNRM2KERNEL ZNRM2KERNEL = znrm2.c endif ifndef SCABS_KERNEL SCABS_KERNEL = ../generic/cabs.c endif ifndef DCABS_KERNEL DCABS_KERNEL = ../generic/cabs.c endif ifndef QCABS_KERNEL QCABS_KERNEL = ../generic/cabs.c endif ifndef LSAME_KERNEL LSAME_KERNEL = ../generic/lsame.c endif ifndef SGEMM_BETA SGEMM_BETA = ../generic/gemm_beta.c endif ifndef DGEMM_BETA DGEMM_BETA = ../generic/gemm_beta.c endif ifndef CGEMM_BETA CGEMM_BETA = ../generic/zgemm_beta.c endif ifndef ZGEMM_BETA ZGEMM_BETA = 
../generic/zgemm_beta.c endif
OpenBLAS-0.2.20/kernel/arm/KERNEL.ARMV5000066400000000000000000000070401313527062700167400ustar00rootroot00000000000000
SAMAXKERNEL = ../arm/amax.c
DAMAXKERNEL = ../arm/amax.c
CAMAXKERNEL = ../arm/zamax.c
ZAMAXKERNEL = ../arm/zamax.c
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMAXKERNEL = ../arm/iamax.c
IDAMAXKERNEL = ../arm/iamax.c
ICAMAXKERNEL = ../arm/izamax.c
IZAMAXKERNEL = ../arm/izamax.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
SASUMKERNEL = ../arm/asum.c
DASUMKERNEL = ../arm/asum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = ../arm/zasum.c
SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = ../arm/axpy.c
CAXPYKERNEL = ../arm/zaxpy.c
ZAXPYKERNEL = ../arm/zaxpy.c
SCOPYKERNEL = ../arm/copy.c
DCOPYKERNEL = ../arm/copy.c
CCOPYKERNEL = ../arm/zcopy.c
ZCOPYKERNEL = ../arm/zcopy.c
SDOTKERNEL = ../arm/dot.c
DDOTKERNEL = ../arm/dot.c
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
SNRM2KERNEL = ../arm/nrm2.c
DNRM2KERNEL = ../arm/nrm2.c
CNRM2KERNEL = ../arm/znrm2.c
ZNRM2KERNEL = ../arm/znrm2.c
SROTKERNEL = ../arm/rot.c
DROTKERNEL = ../arm/rot.c
CROTKERNEL = ../arm/zrot.c
ZROTKERNEL = ../arm/zrot.c
SSCALKERNEL = ../arm/scal.c
DSCALKERNEL = ../arm/scal.c
CSCALKERNEL = ../arm/zscal.c
ZSCALKERNEL = ../arm/zscal.c
SSWAPKERNEL = ../arm/swap.c
DSWAPKERNEL = ../arm/swap.c
CSWAPKERNEL = ../arm/zswap.c
ZSWAPKERNEL = ../arm/zswap.c
SGEMVNKERNEL = ../arm/gemv_n.c
DGEMVNKERNEL = ../arm/gemv_n.c
CGEMVNKERNEL = ../arm/zgemv_n.c
ZGEMVNKERNEL = ../arm/zgemv_n.c
SGEMVTKERNEL = ../arm/gemv_t.c
DGEMVTKERNEL = ../arm/gemv_t.c
CGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = ../arm/zgemv_t.c
STRMMKERNEL = ../generic/trmmkernel_2x2.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = 
../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c OpenBLAS-0.2.20/kernel/arm/KERNEL.ARMV6000066400000000000000000000053761313527062700167530ustar00rootroot00000000000000include $(KERNELDIR)/KERNEL.ARMV5 SAMAXKERNEL = iamax_vfp.S DAMAXKERNEL = iamax_vfp.S CAMAXKERNEL = iamax_vfp.S ZAMAXKERNEL = iamax_vfp.S SAMINKERNEL = iamax_vfp.S DAMINKERNEL = iamax_vfp.S CAMINKERNEL = iamax_vfp.S ZAMINKERNEL = iamax_vfp.S SMAXKERNEL = iamax_vfp.S DMAXKERNEL = iamax_vfp.S SMINKERNEL = iamax_vfp.S DMINKERNEL = iamax_vfp.S ISAMAXKERNEL = iamax_vfp.S IDAMAXKERNEL = iamax_vfp.S ICAMAXKERNEL = iamax_vfp.S IZAMAXKERNEL = iamax_vfp.S ISAMINKERNEL = iamax_vfp.S IDAMINKERNEL = iamax_vfp.S ICAMINKERNEL = iamax_vfp.S IZAMINKERNEL = iamax_vfp.S ISMAXKERNEL = iamax_vfp.S IDMAXKERNEL = iamax_vfp.S ISMINKERNEL = iamax_vfp.S IDMINKERNEL = iamax_vfp.S SASUMKERNEL = asum_vfp.S DASUMKERNEL = asum_vfp.S CASUMKERNEL = asum_vfp.S ZASUMKERNEL = asum_vfp.S SAXPYKERNEL = axpy_vfp.S DAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S ZAXPYKERNEL = axpy_vfp.S SROTKERNEL = rot_vfp.S DROTKERNEL = rot_vfp.S CROTKERNEL = rot_vfp.S ZROTKERNEL = rot_vfp.S SDOTKERNEL = sdot_vfp.S DDOTKERNEL = ddot_vfp.S CDOTKERNEL = cdot_vfp.S ZDOTKERNEL = zdot_vfp.S SNRM2KERNEL = nrm2_vfp.S DNRM2KERNEL = nrm2_vfp.S CNRM2KERNEL = nrm2_vfp.S ZNRM2KERNEL = nrm2_vfp.S SSWAPKERNEL = swap_vfp.S DSWAPKERNEL = swap_vfp.S CSWAPKERNEL = swap_vfp.S ZSWAPKERNEL = swap_vfp.S SGEMVNKERNEL = gemv_n_vfp.S DGEMVNKERNEL = gemv_n_vfp.S CGEMVNKERNEL = cgemv_n_vfp.S ZGEMVNKERNEL = zgemv_n_vfp.S SGEMVTKERNEL = gemv_t_vfp.S DGEMVTKERNEL = gemv_t_vfp.S CGEMVTKERNEL = cgemv_t_vfp.S ZGEMVTKERNEL = zgemv_t_vfp.S SGEMMKERNEL = sgemm_kernel_4x2_vfp.S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S SGEMMITCOPY = sgemm_tcopy_4_vfp.S SGEMMINCOPYOBJ = sgemm_incopy.o SGEMMITCOPYOBJ = sgemm_itcopy.o endif SGEMMONCOPY = sgemm_ncopy_2_vfp.S SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = dgemm_kernel_4x2_vfp.S ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) DGEMMINCOPY = dgemm_ncopy_4_vfp.S DGEMMITCOPY = dgemm_tcopy_4_vfp.S DGEMMINCOPYOBJ = dgemm_incopy.o DGEMMITCOPYOBJ = dgemm_itcopy.o endif DGEMMONCOPY = dgemm_ncopy_2_vfp.S DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_2x2_vfp.S CGEMMONCOPY = cgemm_ncopy_2_vfp.S CGEMMOTCOPY = cgemm_tcopy_2_vfp.S CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S ZGEMMONCOPY = zgemm_ncopy_2_vfp.S ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S OpenBLAS-0.2.20/kernel/arm/KERNEL.ARMV7000066400000000000000000000014751313527062700167500ustar00rootroot00000000000000include $(KERNELDIR)/KERNEL.ARMV6 SNRM2KERNEL = nrm2_vfpv3.S DNRM2KERNEL = nrm2_vfpv3.S CNRM2KERNEL = nrm2_vfpv3.S ZNRM2KERNEL = nrm2_vfpv3.S SGEMVNKERNEL = gemv_n_vfpv3.S DGEMVNKERNEL = gemv_n_vfpv3.S SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMONCOPY = sgemm_ncopy_4_vfp.S SGEMMOTCOPY = sgemm_tcopy_4_vfp.S SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S DGEMMONCOPY = dgemm_ncopy_4_vfp.S DGEMMOTCOPY = dgemm_tcopy_4_vfp.S DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = 
cgemm_kernel_2x2_vfpv3.S ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S STRMMKERNEL = strmm_kernel_4x4_vfpv3.S DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S OpenBLAS-0.2.20/kernel/arm/KERNEL.CORTEXA15000066400000000000000000000000411313527062700173530ustar00rootroot00000000000000include $(KERNELDIR)/KERNEL.ARMV7OpenBLAS-0.2.20/kernel/arm/KERNEL.CORTEXA9000066400000000000000000000000411313527062700172760ustar00rootroot00000000000000include $(KERNELDIR)/KERNEL.ARMV7OpenBLAS-0.2.20/kernel/arm/Makefile000066400000000000000000000000121313527062700166340ustar00rootroot00000000000000clean :: OpenBLAS-0.2.20/kernel/arm/amax.c000066400000000000000000000044451313527062700163040ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/09/14 Saar * BLASTEST float : OK * BLASTEST double : OK * CTEST : NoTest * TEST : NoTest * **************************************************************************************/ #include "common.h" #include #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT maxf=0.0; if (n <= 0 || inc_x <= 0) return(maxf); maxf=ABS(x[0]); ix += inc_x; i++; while(i < n) { if( ABS(x[ix]) > maxf ) { maxf = ABS(x[ix]); } ix += inc_x; i++; } return(maxf); } OpenBLAS-0.2.20/kernel/arm/amin.c000066400000000000000000000044451313527062700163020ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/09/14 Saar * BLASTEST float : OK * BLASTEST double : OK * CTEST : NoTest * TEST : NoTest * **************************************************************************************/ #include "common.h" #include #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT minf=0.0; if (n <= 0 || inc_x <= 0) return(minf); minf=ABS(x[0]); ix += inc_x; i++; while(i < n) { if( ABS(x[ix]) < minf ) { minf = ABS(x[ix]); } ix += inc_x; i++; } return(minf); } OpenBLAS-0.2.20/kernel/arm/asum.c000066400000000000000000000043131313527062700163150ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/09/14 Saar * BLASTEST float : OK * BLASTEST double : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #include "common.h" #include #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; FLOAT sumf = 0.0; if (n <= 0 || inc_x <= 0) return(sumf); n *= inc_x; while(i < n) { sumf += ABS(x[i]); i += inc_x; } return(sumf); } OpenBLAS-0.2.20/kernel/arm/asum_vfp.S000066400000000000000000000206761313527062700171620ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
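For reference, the VFP kernel below computes the same quantity as the generic asum.c above: the sum of absolute values of a strided vector, where the COMPLEX build adds the absolute values of both the real and imaginary parts of each element. A minimal scalar sketch (the helper name asum_ref is illustrative only and not part of the library):

    static float asum_ref(long n, const float *x, long inc_x) {
        float sum = 0.0f;                    // s0/s1 accumulate this value in the assembly
        for (long i = 0; i < n; i++)
            sum += fabsf(x[i * inc_x]);      // COMPLEX build also adds fabsf(x[i * inc_x + 1])
        return sum;
    }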
*****************************************************************************/ /************************************************************************************** * 2013/11/11 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define N r0 #define X r1 #define INC_X r2 #define I r12 #define X_PRE 512 /************************************************************************************** * Macro definitions **************************************************************************************/ #if !defined(COMPLEX) #if defined(DOUBLE) .macro KERNEL_F4 pld [ X, #X_PRE ] fldmiad X!, { d4 - d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 fldmiad X!, { d6 - d7 } vabs.f64 d6, d6 vadd.f64 d1 , d1, d5 vabs.f64 d7, d7 vadd.f64 d0 , d0, d6 vadd.f64 d1 , d1, d7 .endm .macro KERNEL_F1 fldmiad X!, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 .endm .macro KERNEL_S4 fldmiad X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X fldmiad X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X fldmiad X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X fldmiad X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X .endm .macro KERNEL_S1 fldmiad X, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 add X, X, INC_X .endm #else .macro KERNEL_F4 fldmias X!, { s4 - s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 fldmias X!, { s6 - s7 } vabs.f32 s6, s6 vadd.f32 s1 , s1, s5 vabs.f32 s7, s7 vadd.f32 s0 , s0, s6 vadd.f32 s1 , s1, s7 .endm .macro KERNEL_F1 fldmias X!, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 .endm .macro KERNEL_S4 fldmias X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X fldmias X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X fldmias X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X fldmias X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X .endm .macro KERNEL_S1 fldmias X, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 add X, X, INC_X .endm #endif #else #if defined(DOUBLE) .macro KERNEL_F4 pld [ X, #X_PRE ] fldmiad X!, { d4 - d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 fldmiad X!, { d6 - d7 } vabs.f64 d6, d6 vadd.f64 d1 , d1, d5 vabs.f64 d7, d7 vadd.f64 d0 , d0, d6 vadd.f64 d1 , d1, d7 pld [ X, #X_PRE ] fldmiad X!, { d4 - d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 fldmiad X!, { d6 - d7 } vabs.f64 d6, d6 vadd.f64 d1 , d1, d5 vabs.f64 d7, d7 vadd.f64 d0 , d0, d6 vadd.f64 d1 , d1, d7 .endm .macro KERNEL_F1 fldmiad X!, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 fldmiad X!, { d4 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 .endm .macro KERNEL_S4 fldmiad X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 vadd.f64 d0 , d0, d5 add X, X, INC_X fldmiad X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 vadd.f64 d0 , d0, d5 add X, X, INC_X fldmiad X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 vadd.f64 d0 , d0, d5 add X, X, INC_X fldmiad X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 vadd.f64 d0 , d0, d5 add X, X, INC_X .endm .macro KERNEL_S1 fldmiad X, { d4 -d5 } vabs.f64 d4, d4 vadd.f64 d0 , d0, d4 vabs.f64 d5, d5 vadd.f64 d0 , d0, d5 add X, X, INC_X .endm #else .macro KERNEL_F4 pld [ X, #X_PRE ] fldmias X!, { s4 - s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 fldmias X!, { s6 - s7 } vabs.f32 s6, s6 vadd.f32 s1 , s1, s5 vabs.f32 s7, s7 vadd.f32 s0 , s0, s6 
vadd.f32 s1 , s1, s7 fldmias X!, { s4 - s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 fldmias X!, { s6 - s7 } vabs.f32 s6, s6 vadd.f32 s1 , s1, s5 vabs.f32 s7, s7 vadd.f32 s0 , s0, s6 vadd.f32 s1 , s1, s7 .endm .macro KERNEL_F1 fldmias X!, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 fldmias X!, { s4 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 .endm .macro KERNEL_S4 fldmias X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 vadd.f32 s0 , s0, s5 add X, X, INC_X fldmias X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 vadd.f32 s0 , s0, s5 add X, X, INC_X fldmias X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 vadd.f32 s0 , s0, s5 add X, X, INC_X fldmias X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 vadd.f32 s0 , s0, s5 add X, X, INC_X .endm .macro KERNEL_S1 fldmias X, { s4 -s5 } vabs.f32 s4, s4 vadd.f32 s0 , s0, s4 vabs.f32 s5, s5 vadd.f32 s0 , s0, s5 add X, X, INC_X .endm #endif #endif /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 movs r12, #0 // clear floating point register vmov s0, r12 vmov s1, r12 #if defined(DOUBLE) vcvt.f64.f32 d0, s0 vcvt.f64.f32 d1, s1 #endif cmp N, #0 ble asum_kernel_L999 cmp INC_X, #0 beq asum_kernel_L999 cmp INC_X, #1 bne asum_kernel_S_BEGIN asum_kernel_F_BEGIN: asrs I, N, #2 // I = N / 4 ble asum_kernel_F1 .align 5 asum_kernel_F4: #if !defined(DOUBLE) && !defined(COMPLEX) pld [ X, #X_PRE ] #endif KERNEL_F4 subs I, I, #1 ble asum_kernel_F1 KERNEL_F4 subs I, I, #1 bne asum_kernel_F4 asum_kernel_F1: ands I, N, #3 ble asum_kernel_L999 asum_kernel_F10: KERNEL_F1 subs I, I, #1 bne asum_kernel_F10 b asum_kernel_L999 asum_kernel_S_BEGIN: #if defined(COMPLEX) #if defined(DOUBLE) lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 #else lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 #endif #else #if defined(DOUBLE) lsl INC_X, INC_X, #3 // INC_X * SIZE #else lsl INC_X, INC_X, #2 // INC_X * SIZE #endif #endif asrs I, N, #2 // I = N / 4 ble asum_kernel_S1 .align 5 asum_kernel_S4: KERNEL_S4 subs I, I, #1 bne asum_kernel_S4 asum_kernel_S1: ands I, N, #3 ble asum_kernel_L999 asum_kernel_S10: KERNEL_S1 subs I, I, #1 bne asum_kernel_S10 asum_kernel_L999: #if defined(DOUBLE) vadd.f64 d0 , d0, d1 // set return value #else vadd.f32 s0 , s0, s1 // set return value #endif #if !defined(__ARM_PCS_VFP) #if !defined(DOUBLE) vmov r0, s0 #else vmov r0, r1, d0 #endif #endif bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/axpby.c000066400000000000000000000045021313527062700164730ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix,iy; if ( n < 0 ) return(0); ix = 0; iy = 0; if ( beta == 0.0 ) { if ( alpha == 0.0 ) { while(i < n) { y[iy] = 0.0 ; iy += inc_y ; i++ ; } } else { while(i < n) { y[iy] = alpha * x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } } } else { if ( alpha == 0.0 ) { while(i < n) { y[iy] = beta * y[iy] ; iy += inc_y ; i++ ; } } else { while(i < n) { y[iy] = alpha * x[ix] + beta * y[iy] ; ix += inc_x ; iy += inc_y ; i++ ; } } } return(0); } OpenBLAS-0.2.20/kernel/arm/axpy.c000066400000000000000000000044061313527062700163340ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
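The kernel below performs the update y := y + da * x over strided vectors; the arguments named dummy in the signature are not used by this implementation. A minimal call sketch (illustrative only, assuming a single-precision build):

    float x[4] = {1, 2, 3, 4};
    float y[4] = {0, 0, 0, 0};
    CNAME(4, 0, 0, 2.0f, x, 1, y, 1, NULL, 0);   // y becomes {2, 4, 6, 8}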
*****************************************************************************/ /************************************************************************************** * 2013/09/14 Saar * BLASTEST float : OK * BLASTEST double : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #include "common.h" int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; BLASLONG ix,iy; if ( n < 0 ) return(0); if ( da == 0.0 ) return(0); ix = 0; iy = 0; while(i < n) { y[iy] += da * x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } return(0); } OpenBLAS-0.2.20/kernel/arm/axpy_vfp.S000066400000000000000000000235121313527062700171660ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
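In the COMPLEX build, the FMAC_R1, FMAC_R2, FMAC_I1 and FMAC_I2 macros below select between fused multiply-accumulate (fmacs/fmacd) and multiply-subtract (vmls) so that each element update is, in scalar form (alpha = ar + i*ai, illustrative variable names):

    y_r += ar * x_r - ai * x_i;
    y_i += ar * x_i + ai * x_r;
    // with CONJ defined the signs of the x_i terms flip, i.e. x enters conjugated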
*****************************************************************************/ /************************************************************************************** * 2013/11/14 Saar * BLASTEST : xOK * CTEST : xOK * TEST : xOK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #if !defined(__ARM_PCS_VFP) #if !defined(COMPLEX) #if !defined(DOUBLE) #define OLD_ALPHA r3 #define OLD_X [fp, #0 ] #define OLD_INC_X [fp, #4 ] #define OLD_Y [fp, #8 ] #define OLD_INC_Y [fp, #12 ] #else #define OLD_ALPHA [fp, #0] #define OLD_X [fp, #8 ] #define OLD_INC_X [fp, #12 ] #define OLD_Y [fp, #16 ] #define OLD_INC_Y [fp, #20 ] #endif #else //COMPLEX #if !defined(DOUBLE) #define OLD_ALPHAR r3 #define OLD_ALPHAI [fp, #0 ] #define OLD_X [fp, #4 ] #define OLD_INC_X [fp, #8 ] #define OLD_Y [fp, #12 ] #define OLD_INC_Y [fp, #16 ] #else #define OLD_ALPHAR [fp, #0] #define OLD_ALPHAI [fp, #8] #define OLD_X [fp, #16 ] #define OLD_INC_X [fp, #20 ] #define OLD_Y [fp, #24 ] #define OLD_INC_Y [fp, #28 ] #endif #endif //!defined(COMPLEX) #else //__ARM_PCS_VFP #define OLD_INC_X [fp, #0 ] #define OLD_Y [fp, #4 ] #define OLD_INC_Y [fp, #8 ] #endif //!defined(__ARM_PCS_VFP) #define N r0 #define Y r1 #define INC_X r2 #define X r3 #define INC_Y r4 #define I r12 #define X_PRE 512 /************************************************************************************** * Macro definitions **************************************************************************************/ /*****************************************************************************************/ #if !defined(CONJ) #if defined(DOUBLE) #define FMAC_R1 fmacd #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #else #define FMAC_R1 fmacs #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #endif #else // CONJ #if defined(DOUBLE) #define FMAC_R1 fmacd #define FMAC_R2 fmacd #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else #define FMAC_R1 fmacs #define FMAC_R2 fmacs #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif #endif #if !defined(COMPLEX) #if defined(DOUBLE) .macro KERNEL_F4 pld [ X, #X_PRE ] fldmiad X!, { d4 - d7 } pld [ Y, #X_PRE ] fldmiad Y , { d8 - d11 } fmacd d8 , d0, d4 fstmiad Y!, { d8 } fmacd d9 , d0, d5 fstmiad Y!, { d9 } fmacd d10, d0, d6 fstmiad Y!, { d10 } fmacd d11, d0, d7 fstmiad Y!, { d11 } .endm .macro KERNEL_F1 fldmiad X!, { d4 } fldmiad Y , { d8 } fmacd d8 , d0, d4 fstmiad Y!, { d8 } .endm .macro KERNEL_S1 fldmiad X , { d4 } fldmiad Y , { d8 } fmacd d8 , d0, d4 fstmiad Y , { d8 } add X, X, INC_X add Y, Y, INC_Y .endm #else .macro KERNEL_F4 fldmias X!, { s4 - s7 } fldmias Y , { s8 - s11 } fmacs s8 , s0, s4 fstmias Y!, { s8 } fmacs s9 , s0, s5 fstmias Y!, { s9 } fmacs s10, s0, s6 fstmias Y!, { s10 } fmacs s11, s0, s7 fstmias Y!, { s11 } .endm .macro KERNEL_F1 fldmias X!, { s4 } fldmias Y , { s8 } fmacs s8 , s0, s4 fstmias Y!, { s8 } .endm .macro KERNEL_S1 fldmias X , { s4 } fldmias Y , { s8 } fmacs s8 , s0, s4 fstmias Y , { s8 } add X, X, INC_X add Y, Y, INC_Y .endm #endif #else #if defined(DOUBLE) .macro KERNEL_F4 pld [ X, #X_PRE ] fldmiad X!, { d4 - d7 } pld [ Y, #X_PRE ] fldmiad Y , { d8 - d11 } FMAC_R1 d8 , d0, d4 FMAC_R2 d8 , d1, d5 FMAC_I1 d9 , d0, d5 FMAC_I2 d9 , d1, d4 fstmiad Y!, { d8 } fstmiad Y!, { d9 } FMAC_R1 d10, d0, d6 FMAC_R2 d10, d1, d7 FMAC_I1 d11, d0, d7 FMAC_I2 d11, d1, d6 fstmiad Y!, { d10 } fstmiad Y!, { d11 } pld [ X, #X_PRE ] fldmiad X!, { d4 - d7 } pld [ Y, #X_PRE ] fldmiad Y , { d8 - 
d11 } FMAC_R1 d8 , d0, d4 FMAC_R2 d8 , d1, d5 FMAC_I1 d9 , d0, d5 FMAC_I2 d9 , d1, d4 fstmiad Y!, { d8 } fstmiad Y!, { d9 } FMAC_R1 d10, d0, d6 FMAC_R2 d10, d1, d7 FMAC_I1 d11, d0, d7 FMAC_I2 d11, d1, d6 fstmiad Y!, { d10 } fstmiad Y!, { d11 } .endm .macro KERNEL_F1 fldmiad X!, { d4 - d5 } fldmiad Y , { d8 - d9 } FMAC_R1 d8 , d0, d4 FMAC_R2 d8 , d1, d5 FMAC_I1 d9 , d0, d5 FMAC_I2 d9 , d1, d4 fstmiad Y!, { d8 } fstmiad Y!, { d9 } .endm .macro KERNEL_S1 fldmiad X , { d4 - d5 } fldmiad Y , { d8 - d9 } FMAC_R1 d8 , d0, d4 FMAC_R2 d8 , d1, d5 FMAC_I1 d9 , d0, d5 FMAC_I2 d9 , d1, d4 fstmiad Y , { d8 - d9 } add X, X, INC_X add Y, Y, INC_Y .endm #else .macro KERNEL_F4 pld [ X, #X_PRE ] fldmias X!, { s4 - s7 } pld [ Y, #X_PRE ] fldmias Y , { s8 - s11 } FMAC_R1 s8 , s0, s4 FMAC_R2 s8 , s1, s5 FMAC_I1 s9 , s0, s5 FMAC_I2 s9 , s1, s4 fstmias Y!, { s8 } fstmias Y!, { s9 } FMAC_R1 s10, s0, s6 FMAC_R2 s10, s1, s7 FMAC_I1 s11, s0, s7 FMAC_I2 s11, s1, s6 fstmias Y!, { s10 } fstmias Y!, { s11 } fldmias X!, { s4 - s7 } fldmias Y , { s8 - s11 } FMAC_R1 s8 , s0, s4 FMAC_R2 s8 , s1, s5 FMAC_I1 s9 , s0, s5 FMAC_I2 s9 , s1, s4 fstmias Y!, { s8 } fstmias Y!, { s9 } FMAC_R1 s10, s0, s6 FMAC_R2 s10, s1, s7 FMAC_I1 s11, s0, s7 FMAC_I2 s11, s1, s6 fstmias Y!, { s10 } fstmias Y!, { s11 } .endm .macro KERNEL_F1 fldmias X!, { s4 - s5 } fldmias Y , { s8 - s9 } FMAC_R1 s8 , s0, s4 FMAC_R2 s8 , s1, s5 FMAC_I1 s9 , s0, s5 FMAC_I2 s9 , s1, s4 fstmias Y!, { s8 } fstmias Y!, { s9 } .endm .macro KERNEL_S1 fldmias X , { s4 - s5 } fldmias Y , { s8 - s9 } FMAC_R1 s8 , s0, s4 FMAC_R2 s8 , s1, s5 FMAC_I1 s9 , s0, s5 FMAC_I2 s9 , s1, s4 fstmias Y , { s8 - s9 } add X, X, INC_X add Y, Y, INC_Y .endm #endif #endif /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 , fp} add fp, sp, #8 sub sp, sp, #STACKSIZE // reserve stack #if !defined(__ARM_PCS_VFP) #if !defined(COMPLEX) #if !defined(DOUBLE) vmov s0, OLD_ALPHA ldr X, OLD_X #else vldr d0, OLD_ALPHA ldr X, OLD_X #endif #else //COMPLEX #if !defined(DOUBLE) vmov s0, OLD_ALPHAR vldr s1, OLD_ALPHAI ldr X, OLD_X #else vldr d0, OLD_ALPHAR vldr d1, OLD_ALPHAI ldr X, OLD_X #endif #endif #endif ldr INC_X , OLD_INC_X ldr Y, OLD_Y ldr INC_Y , OLD_INC_Y sub r12, fp, #128 #if defined(DOUBLE) vstm r12, { d8 - d15} // store floating point registers #else vstm r12, { s8 - s15} // store floating point registers #endif cmp N, #0 ble axpy_kernel_L999 cmp INC_X, #0 beq axpy_kernel_L999 cmp INC_Y, #0 beq axpy_kernel_L999 cmp INC_X, #1 bne axpy_kernel_S_BEGIN cmp INC_Y, #1 bne axpy_kernel_S_BEGIN axpy_kernel_F_BEGIN: asrs I, N, #2 // I = N / 4 ble axpy_kernel_F1 .align 5 axpy_kernel_F4: #if !defined(COMPLEX) && !defined(DOUBLE) pld [ X, #X_PRE ] pld [ Y, #X_PRE ] #endif KERNEL_F4 subs I, I, #1 ble axpy_kernel_F1 KERNEL_F4 subs I, I, #1 bne axpy_kernel_F4 axpy_kernel_F1: ands I, N, #3 ble axpy_kernel_L999 axpy_kernel_F10: KERNEL_F1 subs I, I, #1 bne axpy_kernel_F10 b axpy_kernel_L999 axpy_kernel_S_BEGIN: #if defined(COMPLEX) #if defined(DOUBLE) lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 #else lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 #endif #else #if defined(DOUBLE) lsl INC_X, INC_X, #3 // INC_X * SIZE lsl INC_Y, INC_Y, #3 // INC_Y * SIZE #else lsl INC_X, INC_X, #2 // INC_X * SIZE lsl INC_Y, INC_Y, #2 // INC_Y * SIZE #endif #endif asrs I, N, #2 // I = N / 4 
ble axpy_kernel_S1 .align 5 axpy_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne axpy_kernel_S4 axpy_kernel_S1: ands I, N, #3 ble axpy_kernel_L999 axpy_kernel_S10: KERNEL_S1 subs I, I, #1 bne axpy_kernel_S10 axpy_kernel_L999: sub r3, fp, #128 #if defined(DOUBLE) vldm r3, { d8 - d15 } // restore floating point registers #else vldm r3, { s8 - s15 } // restore floating point registers #endif mov r0, #0 // set return value sub sp, fp, #8 pop {r4,fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/ccopy_vfp.S000066400000000000000000000112401313527062700173150ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2013/11/07 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define N r0 #define X r1 #define INC_X r2 #define OLD_Y r3 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define OLD_INC_Y [fp, #4 ] #define I r5 #define Y r6 #define INC_Y r7 #define X_PRE 256 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro COPY_F4 pld [ X, #X_PRE ] fldmias X!, { s0 - s7 } fstmias Y!, { s0 - s7 } .endm .macro COPY_F1 fldmias X!, { s0 - s1 } fstmias Y!, { s0 - s1 } .endm /*************************************************************************************************************************/ .macro COPY_S4 nop fldmias X, { s0 - s1 } fstmias Y, { s0 - s1 } add X, X, INC_X add Y, Y, INC_Y fldmias X, { s2 - s3 } fstmias Y, { s2 - s3 } add X, X, INC_X add Y, Y, INC_Y fldmias X, { s0 - s1 } fstmias Y, { s0 - s1 } add X, X, INC_X add Y, Y, INC_Y fldmias X, { s2 - s3 } fstmias Y, { s2 - s3 } add X, X, INC_X add Y, Y, INC_Y .endm .macro COPY_S1 fldmias X, { s0 - s1 } fstmias Y, { s0 - s1 } add X, X, INC_X add Y, Y, INC_Y .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack sub r4, fp, #128 vstm r4, { s8 - s15} // store floating point registers mov Y, OLD_Y ldr INC_Y, OLD_INC_Y cmp N, #0 ble ccopy_kernel_L999 cmp INC_X, #0 beq ccopy_kernel_L999 cmp INC_Y, #0 beq ccopy_kernel_L999 cmp INC_X, #1 bne ccopy_kernel_S_BEGIN cmp INC_Y, #1 bne ccopy_kernel_S_BEGIN ccopy_kernel_F_BEGIN: asrs I, N, #2 // I = N / 4 ble ccopy_kernel_F1 ccopy_kernel_F4: COPY_F4 subs I, I, #1 bne ccopy_kernel_F4 ccopy_kernel_F1: ands I, N, #3 ble ccopy_kernel_L999 ccopy_kernel_F10: COPY_F1 subs I, I, #1 bne ccopy_kernel_F10 b ccopy_kernel_L999 ccopy_kernel_S_BEGIN: lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 asrs I, N, #2 // I = N / 4 ble ccopy_kernel_S1 ccopy_kernel_S4: COPY_S4 subs I, I, #1 bne ccopy_kernel_S4 ccopy_kernel_S1: ands I, N, #3 ble ccopy_kernel_L999 ccopy_kernel_S10: COPY_S1 subs I, I, #1 bne ccopy_kernel_S10 ccopy_kernel_L999: sub r3, fp, #128 vldm r3, { s8 - s15} // restore floating point registers mov r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/cdot_vfp.S000066400000000000000000000146071313527062700171430ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
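The kernel below copies a single-precision complex vector, two floats per element, which is why the strides are shifted left by 3 (scaled by 2 * sizeof(float) = 8 bytes) before the strided loop. An equivalent scalar sketch (illustrative only):

    for (long i = 0; i < n; i++) {
        y[2 * i * inc_y]     = x[2 * i * inc_x];      // real part
        y[2 * i * inc_y + 1] = x[2 * i * inc_x + 1];  // imaginary part
    }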
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/11/11 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define N r0 #define X r1 #define INC_X r2 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #if !defined(__ARM_PCS_VFP) #define OLD_RETURN_ADDR r0 #define OLD_N r1 #define OLD_X r2 #define OLD_INC_X r3 #define OLD_Y [fp, #0 ] #define OLD_INC_Y [fp, #4 ] #define RETURN_ADDR r8 #else #define OLD_Y r3 #define OLD_INC_Y [fp, #0 ] #endif #define I r5 #define Y r6 #define INC_Y r7 #define X_PRE 512 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro KERNEL_F4 pld [ X, #X_PRE ] pld [ Y, #X_PRE ] fldmias X!, { s4 - s5 } fldmias Y!, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fldmias X!, { s6 - s7 } fmacs s2 , s5, s9 fmacs s3 , s5, s8 fldmias Y!, { s10 - s11 } fmacs s0 , s6, s10 fmacs s1 , s6, s11 fmacs s2 , s7, s11 fmacs s3 , s7, s10 fldmias X!, { s4 - s5 } fldmias Y!, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fldmias X!, { s6 - s7 } fmacs s2 , s5, s9 fmacs s3 , s5, s8 fldmias Y!, { s10 - s11 } fmacs s0 , s6, s10 fmacs s1 , s6, s11 fmacs s2 , s7, s11 fmacs s3 , s7, s10 .endm .macro KERNEL_F1 fldmias X!, { s4 - s5 } fldmias Y!, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 fmacs s3 , s5, s8 .endm /*************************************************************************************************************************/ .macro KERNEL_S4 nop fldmias X, { s4 - s5 } fldmias Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 fmacs s3 , s5, s8 add X, X, INC_X add Y, Y, INC_Y fldmias X, { s4 - s5 } fldmias Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 fmacs s3 , s5, s8 add X, X, INC_X add Y, Y, INC_Y fldmias X, { s4 - s5 } fldmias Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 fmacs s3 , s5, s8 add X, X, INC_X add Y, 
Y, INC_Y fldmias X, { s4 - s5 } fldmias Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 fmacs s3 , s5, s8 add X, X, INC_X add Y, Y, INC_Y .endm .macro KERNEL_S1 fldmias X, { s4 - s5 } fldmias Y, { s8 - s9 } fmacs s0 , s4, s8 fmacs s1 , s4, s9 fmacs s2 , s5, s9 fmacs s3 , s5, s8 add X, X, INC_X add Y, Y, INC_Y .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #28 sub sp, sp, #STACKSIZE // reserve stack sub r4, fp, #128 vstm r4, { s8 - s15} // store floating point registers movs r4, #0 // clear floating point register vmov s0, r4 vmov s1, s0 vmov s2, s0 vmov s3, s0 #if !defined(__ARM_PCS_VFP) mov RETURN_ADDR, OLD_RETURN_ADDR mov N, OLD_N mov X, OLD_X mov INC_X, OLD_INC_X ldr Y, OLD_Y ldr INC_Y, OLD_INC_Y #else mov Y, OLD_Y ldr INC_Y, OLD_INC_Y #endif cmp N, #0 ble cdot_kernel_L999 cmp INC_X, #0 beq cdot_kernel_L999 cmp INC_Y, #0 beq cdot_kernel_L999 cmp INC_X, #1 bne cdot_kernel_S_BEGIN cmp INC_Y, #1 bne cdot_kernel_S_BEGIN cdot_kernel_F_BEGIN: asrs I, N, #2 // I = N / 4 ble cdot_kernel_F1 cdot_kernel_F4: KERNEL_F4 subs I, I, #1 bne cdot_kernel_F4 cdot_kernel_F1: ands I, N, #3 ble cdot_kernel_L999 cdot_kernel_F10: KERNEL_F1 subs I, I, #1 bne cdot_kernel_F10 b cdot_kernel_L999 cdot_kernel_S_BEGIN: lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 asrs I, N, #2 // I = N / 4 ble cdot_kernel_S1 cdot_kernel_S4: KERNEL_S4 subs I, I, #1 bne cdot_kernel_S4 cdot_kernel_S1: ands I, N, #3 ble cdot_kernel_L999 cdot_kernel_S10: KERNEL_S1 subs I, I, #1 bne cdot_kernel_S10 cdot_kernel_L999: sub r3, fp, #128 vldm r3, { s8 - s15} // restore floating point registers #if !defined(CONJ) vsub.f32 s0 , s0, s2 vadd.f32 s1 , s1, s3 #else vadd.f32 s0 , s0, s2 vsub.f32 s1 , s1, s3 #endif #if !defined(__ARM_PCS_VFP) vstm RETURN_ADDR, {s0 - s1} #endif sub sp, fp, #28 pop {r4 - r9, fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/cgemm_kernel_2x2_vfp.S000066400000000000000000000460521313527062700213340ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/11/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define OLD_M r0 #define OLD_N r1 #define OLD_K r2 #define OLD_A r3 #define OLD_ALPHA_R s0 #define OLD_ALPHA_I s1 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define A [fp, #-248 ] #define LDC [fp, #-252 ] #define M [fp, #-256 ] #define N [fp, #-260 ] #define K [fp, #-264 ] #define FP_ZERO [fp, #-240] #define FP_ZERO_0 [fp, # -240] #define FP_ZERO_1 [fp, # -236] #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] #if !defined(__ARM_PCS_VFP) #define OLD_ALPHAR_SOFTFP r3 #define OLD_ALPHAI_SOFTFP [fp, #4] #define OLD_A_SOFTFP [fp, #8 ] #define B [fp, #12 ] #define C [fp, #16 ] #define OLD_LDC [fp, #20 ] #else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #endif #define I r0 #define J r1 #define L r2 #define AO r5 #define BO r6 #define CO1 r8 #define CO2 r9 #define K1 r7 #define BC r12 #define A_PRE 96 #define B_PRE 96 #define C_PRE 64 /************************************************************************************** * Macro definitions **************************************************************************************/ #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(CN) || defined(CT) #define KMAC_R fmacs #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(NC) || defined(TC) #define KMAC_R fmacs #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 fmacs #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #else #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 fmacs #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif /************************************************************************************** * Macro definitions **************************************************************************************/ .macro INIT2x2 flds s8 , FP_ZERO vmov.f32 s9 , s8 vmov.f32 s10, s8 vmov.f32 s11, s8 vmov.f32 s12, s8 vmov.f32 s13, s8 vmov.f32 s14, s8 vmov.f32 s15, s8 .endm .macro KERNEL2x2_I pld [ AO, #A_PRE ] fldmias AO!, { s0 - s3 } pld [ BO, #B_PRE ] fldmias BO!, { s4 - s7 } fmuls s8 , s0, s4 fmuls s9 , s0, s5 fmuls s10 , s2, s4 fmuls s11 , s2, s5 KMAC_R s8 , s1, s5 KMAC_I s9 , s1, s4 KMAC_R s10 , s3, s5 KMAC_I s11 , s3, s4 fmuls s12 , s0, s6 fmuls s13 , s0, s7 fmuls s14 , s2, s6 fmuls s15 , s2, s7 KMAC_R s12 , s1, s7 KMAC_I s13 , s1, s6 KMAC_R s14 , s3, s7 
KMAC_I s15 , s3, s6 .endm .macro KERNEL2x2_M1 pld [ AO, #A_PRE ] fldmias AO!, { s0 - s3 } pld [ BO, #B_PRE ] fldmias BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 fmacs s10 , s2, s4 fmacs s11 , s2, s5 KMAC_R s8 , s1, s5 KMAC_I s9 , s1, s4 KMAC_R s10 , s3, s5 KMAC_I s11 , s3, s4 fmacs s12 , s0, s6 fmacs s13 , s0, s7 fmacs s14 , s2, s6 fmacs s15 , s2, s7 KMAC_R s12 , s1, s7 KMAC_I s13 , s1, s6 KMAC_R s14 , s3, s7 KMAC_I s15 , s3, s6 .endm .macro KERNEL2x2_M2 fldmias AO!, { s0 - s3 } fldmias BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 fmacs s10 , s2, s4 fmacs s11 , s2, s5 KMAC_R s8 , s1, s5 KMAC_I s9 , s1, s4 KMAC_R s10 , s3, s5 KMAC_I s11 , s3, s4 fmacs s12 , s0, s6 fmacs s13 , s0, s7 fmacs s14 , s2, s6 fmacs s15 , s2, s7 KMAC_R s12 , s1, s7 KMAC_I s13 , s1, s6 KMAC_R s14 , s3, s7 KMAC_I s15 , s3, s6 .endm .macro KERNEL2x2_E fldmias AO!, { s0 - s3 } fldmias BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 fmacs s10 , s2, s4 fmacs s11 , s2, s5 KMAC_R s8 , s1, s5 KMAC_I s9 , s1, s4 KMAC_R s10 , s3, s5 KMAC_I s11 , s3, s4 fmacs s12 , s0, s6 fmacs s13 , s0, s7 fmacs s14 , s2, s6 fmacs s15 , s2, s7 KMAC_R s12 , s1, s7 KMAC_I s13 , s1, s6 KMAC_R s14 , s3, s7 KMAC_I s15 , s3, s6 .endm .macro KERNEL2x2_SUB fldmias AO!, { s0 - s3 } fldmias BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 fmacs s10 , s2, s4 fmacs s11 , s2, s5 KMAC_R s8 , s1, s5 KMAC_I s9 , s1, s4 KMAC_R s10 , s3, s5 KMAC_I s11 , s3, s4 fmacs s12 , s0, s6 fmacs s13 , s0, s7 fmacs s14 , s2, s6 fmacs s15 , s2, s7 KMAC_R s12 , s1, s7 KMAC_I s13 , s1, s6 KMAC_R s14 , s3, s7 KMAC_I s15 , s3, s6 .endm .macro SAVE2x2 ldr r3 , LDC add CO2 , CO1, r3 flds s0, ALPHA_R flds s1, ALPHA_I fldmias CO1, { s4 - s7 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 FMAC_R1 s6 , s0 , s10 FMAC_I1 s7 , s0 , s11 FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 fstmias CO1, { s4 - s7 } fldmias CO2, { s4 - s7 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 FMAC_R1 s6 , s0 , s14 FMAC_I1 s7 , s0 , s15 FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 fstmias CO2, { s4 - s7 } add CO1, CO1, #16 .endm /******************************************************************************/ .macro INIT1x2 flds s8 , FP_ZERO vmov.f32 s9 , s8 vmov.f32 s12, s8 vmov.f32 s13, s8 .endm .macro KERNEL1x2_I flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] flds s6 , [ BO, #8 ] flds s7 , [ BO, #12 ] fmuls s8 , s0, s4 KMAC_R s8 , s1, s5 fmuls s9 , s0, s5 KMAC_I s9 , s1, s4 fmuls s12 , s0, s6 KMAC_R s12 , s1, s7 fmuls s13 , s0, s7 KMAC_I s13 , s1, s6 add BO , BO, #16 add AO , AO, #8 .endm .macro KERNEL1x2_M1 flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] flds s6 , [ BO, #8 ] flds s7 , [ BO, #12 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 fmacs s12 , s0, s6 KMAC_R s12 , s1, s7 fmacs s13 , s0, s7 KMAC_I s13 , s1, s6 add BO , BO, #16 add AO , AO, #8 .endm .macro KERNEL1x2_M2 flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] flds s6 , [ BO, #8 ] flds s7 , [ BO, #12 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 fmacs s12 , s0, s6 KMAC_R s12 , s1, s7 fmacs s13 , s0, s7 KMAC_I s13 , s1, s6 add BO , BO, #16 add AO , AO, #8 .endm .macro KERNEL1x2_E flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] flds s6 , [ BO, #8 ] flds s7 , [ BO, #12 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 fmacs s12 , s0, s6 KMAC_R s12 
, s1, s7 fmacs s13 , s0, s7 KMAC_I s13 , s1, s6 add BO , BO, #16 add AO , AO, #8 .endm .macro KERNEL1x2_SUB flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] flds s6 , [ BO, #8 ] flds s7 , [ BO, #12 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 fmacs s12 , s0, s6 KMAC_R s12 , s1, s7 fmacs s13 , s0, s7 KMAC_I s13 , s1, s6 add BO , BO, #16 add AO , AO, #8 .endm .macro SAVE1x2 ldr r3 , LDC add CO2 , CO1, r3 flds s0, ALPHA_R flds s1, ALPHA_I fldmias CO1, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 fstmias CO1, { s4 - s5 } fldmias CO2, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 fstmias CO2, { s4 - s5 } add CO1, CO1, #8 .endm /******************************************************************************/ .macro INIT2x1 flds s8 , FP_ZERO vmov.f32 s9 , s8 vmov.f32 s10, s8 vmov.f32 s11, s8 .endm .macro KERNEL2x1_I flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s2 , [ AO, #8 ] flds s3 , [ AO, #12 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] fmuls s8 , s0, s4 KMAC_R s8 , s1, s5 fmuls s9 , s0, s5 KMAC_I s9 , s1, s4 fmuls s10 , s2, s4 KMAC_R s10 , s3, s5 fmuls s11 , s2, s5 KMAC_I s11 , s3, s4 add BO , BO, #8 add AO , AO, #16 .endm .macro KERNEL2x1_M1 flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s2 , [ AO, #8 ] flds s3 , [ AO, #12 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 fmacs s10 , s2, s4 KMAC_R s10 , s3, s5 fmacs s11 , s2, s5 KMAC_I s11 , s3, s4 add BO , BO, #8 add AO , AO, #16 .endm .macro KERNEL2x1_M2 flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s2 , [ AO, #8 ] flds s3 , [ AO, #12 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 fmacs s10 , s2, s4 KMAC_R s10 , s3, s5 fmacs s11 , s2, s5 KMAC_I s11 , s3, s4 add BO , BO, #8 add AO , AO, #16 .endm .macro KERNEL2x1_E flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s2 , [ AO, #8 ] flds s3 , [ AO, #12 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 fmacs s10 , s2, s4 KMAC_R s10 , s3, s5 fmacs s11 , s2, s5 KMAC_I s11 , s3, s4 add BO , BO, #8 add AO , AO, #16 .endm .macro KERNEL2x1_SUB flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s2 , [ AO, #8 ] flds s3 , [ AO, #12 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 fmacs s10 , s2, s4 KMAC_R s10 , s3, s5 fmacs s11 , s2, s5 KMAC_I s11 , s3, s4 add BO , BO, #8 add AO , AO, #16 .endm .macro SAVE2x1 flds s0, ALPHA_R flds s1, ALPHA_I fldmias CO1, { s4 - s7 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 FMAC_R1 s6 , s0 , s10 FMAC_I1 s7 , s0 , s11 FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 fstmias CO1, { s4 - s7 } add CO1, CO1, #16 .endm /******************************************************************************/ .macro INIT1x1 flds s8 , FP_ZERO vmov.f32 s9 , s8 .endm .macro KERNEL1x1_I flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] fmuls s8 , s0, s4 KMAC_R s8 , s1, s5 fmuls s9 , s0, s5 KMAC_I s9 , s1, s4 add BO , BO, #8 add AO , AO, #8 .endm .macro KERNEL1x1_M1 flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 add BO , BO, #8 add AO , AO, #8 .endm .macro KERNEL1x1_M2 flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s4 , [ BO ] flds s5 , [ BO, #4 
] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 add BO , BO, #8 add AO , AO, #8 .endm .macro KERNEL1x1_E flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 add BO , BO, #8 add AO , AO, #8 .endm .macro KERNEL1x1_SUB flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 add BO , BO, #8 add AO , AO, #8 .endm .macro SAVE1x1 flds s0, ALPHA_R flds s1, ALPHA_I fldmias CO1, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 fstmias CO1, { s4 - s5 } add CO1, CO1, #8 .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack #if !defined(__ARM_PCS_VFP) vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP ldr OLD_A, OLD_A_SOFTFP #endif str OLD_M, M str OLD_N, N str OLD_K, K str OLD_A, A vstr OLD_ALPHA_R, ALPHA_R vstr OLD_ALPHA_I, ALPHA_I sub r3, fp, #128 vstm r3, { s8 - s15} // store floating point registers movs r4, #0 str r4, FP_ZERO str r4, FP_ZERO_1 ldr r3, OLD_LDC lsl r3, r3, #3 // ldc = ldc * 4 * 2 str r3, LDC ldr K1, K ldr BC, B ldr J, N asrs J, J, #1 // J = J / 2 ble cgemm_kernel_L1_BEGIN cgemm_kernel_L2_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #1 // LDC * 2 add r3 , r4, CO1 str r3 , C // store C ldr AO, A // AO = A pld [AO , #A_PRE-64] pld [AO , #A_PRE-32] cgemm_kernel_L2_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 ble cgemm_kernel_L2_M1_BEGIN cgemm_kernel_L2_M2_20: mov BO, BC asrs L , K1, #3 // L = L / 8 cmp L , #3 blt cgemm_kernel_L2_M2_30 .align 5 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 sub L, L, #2 cgemm_kernel_L2_M2_22: KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 subs L, L, #1 bgt cgemm_kernel_L2_M2_22 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b cgemm_kernel_L2_M2_44 cgemm_kernel_L2_M2_30: tst L, #3 ble cgemm_kernel_L2_M2_40 tst L, #2 ble cgemm_kernel_L2_M2_32 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b cgemm_kernel_L2_M2_44 cgemm_kernel_L2_M2_32: tst L, #1 ble cgemm_kernel_L2_M2_40 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b cgemm_kernel_L2_M2_44 cgemm_kernel_L2_M2_40: INIT2x2 cgemm_kernel_L2_M2_44: ands L , K1, #7 // L = L % 8 ble cgemm_kernel_L2_M2_100 cgemm_kernel_L2_M2_46: KERNEL2x2_SUB subs L, L, #1 bne cgemm_kernel_L2_M2_46 cgemm_kernel_L2_M2_100: SAVE2x2 cgemm_kernel_L2_M2_END: subs I, I, #1 bne cgemm_kernel_L2_M2_20 cgemm_kernel_L2_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 ble cgemm_kernel_L2_END cgemm_kernel_L2_M1_20: INIT1x2 mov BO, BC asrs L , K1, #3 // L = L / 8 ble cgemm_kernel_L2_M1_40 cgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs L, L, #1 bgt cgemm_kernel_L2_M1_22 cgemm_kernel_L2_M1_40: ands L , K1, #7 // L = L % 8 ble cgemm_kernel_L2_M1_100 
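// K % 8 tail of the 1x2 block: one KERNEL1x2_SUB per leftover k iteration, then SAVE1x2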
cgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs L, L, #1 bgt cgemm_kernel_L2_M1_42 cgemm_kernel_L2_M1_100: SAVE1x2 cgemm_kernel_L2_END: mov r3, BC mov r4, K1 lsl r4, r4, #4 // k * 2 * 4 * 2 add r3, r3, r4 // B = B + K * 2 * 8 mov BC, r3 subs J , #1 // j-- bgt cgemm_kernel_L2_BEGIN /*********************************************************************************************/ cgemm_kernel_L1_BEGIN: ldr J , N tst J , #1 ble cgemm_kernel_L999 ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 str r3 , C // store C ldr AO, A // AO = A cgemm_kernel_L1_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 ble cgemm_kernel_L1_M1_BEGIN cgemm_kernel_L1_M2_20: mov BO, BC asrs L , K1, #3 // L = L / 8 cmp L , #3 blt cgemm_kernel_L1_M2_30 .align 5 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 sub L, L, #2 cgemm_kernel_L1_M2_22: KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 subs L, L, #1 bgt cgemm_kernel_L1_M2_22 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b cgemm_kernel_L1_M2_44 cgemm_kernel_L1_M2_30: tst L, #3 ble cgemm_kernel_L1_M2_40 tst L, #2 ble cgemm_kernel_L1_M2_32 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b cgemm_kernel_L1_M2_44 cgemm_kernel_L1_M2_32: tst L, #1 ble cgemm_kernel_L1_M2_40 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b cgemm_kernel_L1_M2_44 cgemm_kernel_L1_M2_40: INIT2x1 cgemm_kernel_L1_M2_44: ands L , K1, #7 // L = L % 8 ble cgemm_kernel_L1_M2_100 cgemm_kernel_L1_M2_46: KERNEL2x1_SUB subs L, L, #1 bne cgemm_kernel_L1_M2_46 cgemm_kernel_L1_M2_100: SAVE2x1 cgemm_kernel_L1_M2_END: subs I, I, #1 bne cgemm_kernel_L1_M2_20 cgemm_kernel_L1_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 ble cgemm_kernel_L1_END cgemm_kernel_L1_M1_20: INIT1x1 mov BO, BC asrs L , K1, #3 // L = L / 8 ble cgemm_kernel_L1_M1_40 cgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs L, L, #1 bgt cgemm_kernel_L1_M1_22 cgemm_kernel_L1_M1_40: ands L , K1, #7 // L = L % 8 ble cgemm_kernel_L1_M1_100 cgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs L, L, #1 bgt cgemm_kernel_L1_M1_42 cgemm_kernel_L1_M1_100: SAVE1x1 cgemm_kernel_L1_END: cgemm_kernel_L999: sub r3, fp, #128 vldm r3, { s8 - s15} // restore floating point registers movs r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/cgemm_kernel_2x2_vfpv3.S000066400000000000000000000525361313527062700216110ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/11/05 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * 2013/11/01 Saar * UNROLL_N 2 * UNROLL_M 2 * CGEMM_P 96 * CGEMM_Q 120 * CGEMM_R 4096 * A_PRE 96 * B_PRE 96 * C_PRE 64 * * Performance on Odroid U2: * * 1 Core: 2.59 GFLOPS ATLAS: 2.37 GFLOPS * 2 Cores: 5.17 GFLOPS ATLAS: 4.46 GFLOPS * 3 Cores: 7.69 GFLOPS ATLAS: 6.50 GFLOPS * 4 Cores: 10.22 GFLOPS ATLAS: 8.18 GFLOPS **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define OLD_M r0 #define OLD_N r1 #define OLD_K r2 #define OLD_A r3 #define OLD_ALPHA_R s0 #define OLD_ALPHA_I s1 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define A [fp, #-248 ] #define LDC [fp, #-252 ] #define M [fp, #-256 ] #define N [fp, #-260 ] #define K [fp, #-264 ] #define FP_ZERO [fp, #-240] #define FP_ZERO_0 [fp, # -240] #define FP_ZERO_1 [fp, # -236] #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] #if !defined(__ARM_PCS_VFP) #define OLD_ALPHAR_SOFTFP r3 #define OLD_ALPHAI_SOFTFP [fp, #4] #define OLD_A_SOFTFP [fp, #8 ] #define B [fp, #12 ] #define C [fp, #16 ] #define OLD_LDC [fp, #20 ] #else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #endif #define I r0 #define J r1 #define L r2 #define AO r5 #define BO r6 #define CO1 r8 #define CO2 r9 #define K1 r7 #define BC r12 #define A_PRE 96 #define B_PRE 96 #define C_PRE 64 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define FADD_R fsubs #define FADD_I fadds #define FMAC_R1 vmls.f32 #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 vmls.f32 #elif defined(CN) || defined(CT) #define FADD_R fadds #define FADD_I fsubs #define FMAC_R1 fmacs #define FMAC_R2 fmacs #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #elif defined(NC) || defined(TC) #define FADD_R fadds #define FADD_I fsubs #define FMAC_R1 fmacs #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #else #define FADD_R fsubs #define FADD_I fadds #define FMAC_R1 vmls.f32 #define FMAC_R2 fmacs #define FMAC_I1 vmls.f32 #define FMAC_I2 vmls.f32 #endif /************************************************************************************** * Macro definitions **************************************************************************************/ 
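For orientation, each element of the 2x2 tile accumulates an ordinary complex multiply-accumulate: the kernel keeps the four partial products in separate VFP registers and folds the conjugation signs in through the FADD_* / FMAC_* macros selected above. A minimal scalar C sketch of the non-conjugated (NN-style) case follows; the type and function names are illustrative only and are not part of this kernel.

typedef struct { float r, i; } cfloat_ref;

/* one k-step of the accumulation: acc += a * b (non-conjugated case) */
static void cmadd_ref(cfloat_ref *acc, cfloat_ref a, cfloat_ref b)
{
    acc->r += a.r * b.r - a.i * b.i;
    acc->i += a.r * b.i + a.i * b.r;
}

/* the save step applied to each tile element: C += alpha * acc */
static void csave_ref(cfloat_ref *c, cfloat_ref alpha, cfloat_ref acc)
{
    c->r += alpha.r * acc.r - alpha.i * acc.i;
    c->i += alpha.r * acc.i + alpha.i * acc.r;
}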
.macro INIT2x2 flds s16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s18, s16 vmov.f32 s19, s16 vmov.f32 s20, s16 vmov.f32 s21, s16 vmov.f32 s22, s16 vmov.f32 s23, s16 vmov.f32 s24, s16 vmov.f32 s25, s16 vmov.f32 s26, s16 vmov.f32 s27, s16 vmov.f32 s28, s16 vmov.f32 s29, s16 vmov.f32 s30, s16 vmov.f32 s31, s16 .endm .macro KERNEL2x2_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fldmias AO!, { s0 - s1 } fldmias BO!, { s8 - s9 } fmuls s16 , s0, s8 fmuls s24 , s1, s9 fldmias AO!, { s2 - s3 } fmuls s17 , s0, s9 fmuls s25 , s1, s8 fldmias BO!, { s10 - s11 } fmuls s18 , s2, s8 fmuls s26 , s3, s9 fldmias AO!, { s4 - s5 } fmuls s19 , s2, s9 fmuls s27 , s3, s8 fldmias BO!, { s12 - s13 } fmuls s20 , s0, s10 fmuls s28 , s1, s11 fldmias AO!, { s6 - s7 } fmuls s21 , s0, s11 fmuls s29 , s1, s10 fldmias BO!, { s14 - s15 } fmuls s22 , s2, s10 fmuls s30 , s3, s11 fmuls s23 , s2, s11 fmuls s31 , s3, s10 .endm .macro KERNEL2x2_M1 fmacs s16 , s0, s8 fldmias AO!, { s4 - s5 } fmacs s24 , s1, s9 fmacs s17 , s0, s9 fldmias BO!, { s12 - s13 } fmacs s25 , s1, s8 fmacs s18 , s2, s8 fldmias AO!, { s6 - s7 } fmacs s26 , s3, s9 fmacs s19 , s2, s9 fldmias BO!, { s14 - s15 } fmacs s27 , s3, s8 fmacs s20 , s0, s10 fmacs s28 , s1, s11 fmacs s21 , s0, s11 fmacs s29 , s1, s10 fmacs s22 , s2, s10 fmacs s30 , s3, s11 fmacs s23 , s2, s11 fmacs s31 , s3, s10 .endm .macro KERNEL2x2_M2 pld [ AO , #A_PRE ] fmacs s16 , s4, s12 pld [ BO , #B_PRE ] fmacs s24 , s5, s13 fmacs s17 , s4, s13 fldmias AO!, { s0 - s1 } fmacs s25 , s5, s12 fmacs s18 , s6, s12 fmacs s26 , s7, s13 fldmias BO!, { s8 - s9 } fmacs s19 , s6, s13 fmacs s27 , s7, s12 fldmias AO!, { s2 - s3 } fmacs s20 , s4, s14 fmacs s28 , s5, s15 fldmias BO!, { s10 - s11 } fmacs s21 , s4, s15 fmacs s29 , s5, s14 fmacs s22 , s6, s14 fmacs s30 , s7, s15 fmacs s23 , s6, s15 fmacs s31 , s7, s14 .endm .macro KERNEL2x2_E fmacs s16 , s4, s12 fmacs s24 , s5, s13 fmacs s17 , s4, s13 fmacs s25 , s5, s12 fmacs s18 , s6, s12 fmacs s26 , s7, s13 fmacs s19 , s6, s13 fmacs s27 , s7, s12 fmacs s20 , s4, s14 fmacs s28 , s5, s15 fmacs s21 , s4, s15 fmacs s29 , s5, s14 fmacs s22 , s6, s14 fmacs s30 , s7, s15 fmacs s23 , s6, s15 fmacs s31 , s7, s14 .endm .macro KERNEL2x2_SUB fldmias AO!, { s0 - s1 } fldmias BO!, { s8 - s9 } fmacs s16 , s0, s8 fmacs s24 , s1, s9 fldmias AO!, { s2 - s3 } fmacs s17 , s0, s9 fmacs s25 , s1, s8 fldmias BO!, { s10 - s11 } fmacs s18 , s2, s8 fmacs s26 , s3, s9 fmacs s19 , s2, s9 fmacs s27 , s3, s8 fmacs s20 , s0, s10 fmacs s28 , s1, s11 fmacs s21 , s0, s11 fmacs s29 , s1, s10 fmacs s22 , s2, s10 fmacs s30 , s3, s11 fmacs s23 , s2, s11 fmacs s31 , s3, s10 .endm .macro SAVE2x2 pld [ CO1 , #C_PRE ] ldr r3 , LDC add CO2 , CO1, r3 flds s0, ALPHA_R flds s1, ALPHA_I fldmias CO1, { s4 - s7 } fldmias CO2, { s8 - s11 } FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 FADD_R s18, s26 , s18 FADD_I s19, s27 , s19 FADD_R s20, s28 , s20 FADD_I s21, s29 , s21 FADD_R s22, s30 , s22 FADD_I s23, s31 , s23 FMAC_R1 s4 , s0 , s16 FMAC_I1 s5 , s0 , s17 FMAC_R2 s4 , s1 , s17 FMAC_I2 s5 , s1 , s16 FMAC_R1 s6 , s0 , s18 FMAC_I1 s7 , s0 , s19 FMAC_R2 s6 , s1 , s19 FMAC_I2 s7 , s1 , s18 FMAC_R1 s8 , s0 , s20 FMAC_I1 s9 , s0 , s21 FMAC_R2 s8 , s1 , s21 FMAC_I2 s9 , s1 , s20 FMAC_R1 s10, s0 , s22 FMAC_I1 s11, s0 , s23 FMAC_R2 s10, s1 , s23 FMAC_I2 s11, s1 , s22 fstmias CO1, { s4 - s7 } fstmias CO2, { s8 - s11 } add CO1, CO1, #16 .endm /******************************************************************************/ .macro INIT1x2 flds s16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s20, s16 vmov.f32 s21, s16 vmov.f32 s24, s16 
vmov.f32 s25, s16 vmov.f32 s28, s16 vmov.f32 s29, s16 .endm .macro KERNEL1x2_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s8 , [ BO ] flds s9 , [ BO, #4 ] flds s10, [ BO, #8 ] flds s11, [ BO, #12 ] fmuls s16 , s0, s8 fmuls s24 , s1, s9 fmuls s17 , s0, s9 fmuls s25 , s1, s8 fmuls s20 , s0, s10 fmuls s28 , s1, s11 fmuls s21 , s0, s11 fmuls s29 , s1, s10 add BO , BO, #16 add AO , AO, #8 pld [ BO , #B_PRE ] flds s4 , [ AO, #0 ] flds s5 , [ AO, #4 ] flds s12, [ BO ] flds s13, [ BO, #4 ] flds s14, [ BO, #8 ] flds s15, [ BO, #12 ] add BO , BO, #16 add AO , AO, #8 .endm .macro KERNEL1x2_M1 pld [ BO , #B_PRE ] fmacs s16 , s0, s8 fmacs s24 , s1, s9 fmacs s17 , s0, s9 fmacs s25 , s1, s8 fmacs s20 , s0, s10 fmacs s28 , s1, s11 fmacs s21 , s0, s11 fmacs s29 , s1, s10 flds s4 , [ AO, #0 ] flds s5 , [ AO, #4 ] flds s12, [ BO ] flds s13, [ BO, #4 ] flds s14, [ BO, #8 ] flds s15, [ BO, #12 ] add BO , BO, #16 add AO , AO, #8 .endm .macro KERNEL1x2_M2 pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fmacs s16 , s4, s12 fmacs s24 , s5, s13 fmacs s17 , s4, s13 fmacs s25 , s5, s12 fmacs s20 , s4, s14 fmacs s28 , s5, s15 fmacs s21 , s4, s15 fmacs s29 , s5, s14 flds s0 , [ AO, #0 ] flds s1 , [ AO, #4 ] flds s8 , [ BO ] flds s9 , [ BO, #4 ] flds s10, [ BO, #8 ] flds s11, [ BO, #12 ] add BO , BO, #16 add AO , AO, #8 .endm .macro KERNEL1x2_E fmacs s16 , s4, s12 fmacs s24 , s5, s13 fmacs s17 , s4, s13 fmacs s25 , s5, s12 fmacs s20 , s4, s14 fmacs s28 , s5, s15 fmacs s21 , s4, s15 fmacs s29 , s5, s14 .endm .macro KERNEL1x2_SUB pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s8 , [ BO ] flds s9 , [ BO, #4 ] flds s10, [ BO, #8 ] flds s11, [ BO, #12 ] fmacs s16 , s0, s8 fmacs s24 , s1, s9 fmacs s17 , s0, s9 fmacs s25 , s1, s8 fmacs s20 , s0, s10 fmacs s28 , s1, s11 fmacs s21 , s0, s11 fmacs s29 , s1, s10 add BO , BO, #16 add AO , AO, #8 .endm .macro SAVE1x2 pld [ CO1 , #C_PRE ] ldr r3 , LDC add CO2 , CO1, r3 flds s0, ALPHA_R flds s1, ALPHA_I fldmias CO1, { s4 - s5 } fldmias CO2, { s8 - s9 } FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 FADD_R s20, s28 , s20 FADD_I s21, s29 , s21 FMAC_R1 s4 , s0 , s16 FMAC_I1 s5 , s0 , s17 FMAC_R2 s4 , s1 , s17 FMAC_I2 s5 , s1 , s16 FMAC_R1 s8 , s0 , s20 FMAC_I1 s9 , s0 , s21 FMAC_R2 s8 , s1 , s21 FMAC_I2 s9 , s1 , s20 fstmias CO1, { s4 - s5 } fstmias CO2, { s8 - s9 } add CO1, CO1, #8 .endm /******************************************************************************/ .macro INIT2x1 flds s16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s18, s16 vmov.f32 s19, s16 vmov.f32 s24, s16 vmov.f32 s25, s16 vmov.f32 s26, s16 vmov.f32 s27, s16 .endm .macro KERNEL2x1_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s2 , [ AO, #8 ] flds s3 , [ AO, #12 ] flds s8 , [ BO ] flds s9 , [ BO, #4 ] fmuls s16 , s0, s8 fmuls s24 , s1, s9 fmuls s17 , s0, s9 fmuls s25 , s1, s8 fmuls s18 , s2, s8 fmuls s26 , s3, s9 fmuls s19 , s2, s9 fmuls s27 , s3, s8 add BO , BO, #8 add AO , AO, #16 pld [ BO , #B_PRE ] pld [ AO , #A_PRE ] flds s4 , [ AO, #0 ] flds s5 , [ AO, #4 ] flds s6 , [ AO, #8 ] flds s7 , [ AO, #12 ] flds s12, [ BO ] flds s13, [ BO, #4 ] add BO , BO, #8 add AO , AO, #16 .endm .macro KERNEL2x1_M1 pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fmacs s16 , s0, s8 fmacs s24 , s1, s9 fmacs s17 , s0, s9 fmacs s25 , s1, s8 fmacs s18 , s2, s8 fmacs s26 , s3, s9 fmacs s19 , s2, s9 fmacs s27 , s3, s8 flds s4 , [ AO, #0 ] flds s5 , [ AO, #4 ] flds s6 , [ AO, #8 ] flds s7 , [ AO, #12 ] flds s12, [ BO ] flds s13, [ BO, #4 ] add BO , 
BO, #8 add AO , AO, #16 .endm .macro KERNEL2x1_M2 pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fmacs s16 , s4, s12 fmacs s24 , s5, s13 fmacs s17 , s4, s13 fmacs s25 , s5, s12 fmacs s18 , s6, s12 fmacs s26 , s7, s13 fmacs s19 , s6, s13 fmacs s27 , s7, s12 flds s0 , [ AO, #0 ] flds s1 , [ AO, #4 ] flds s2 , [ AO, #8 ] flds s3 , [ AO, #12 ] flds s8 , [ BO ] flds s9 , [ BO, #4 ] add BO , BO, #8 add AO , AO, #16 .endm .macro KERNEL2x1_E fmacs s16 , s4, s12 fmacs s24 , s5, s13 fmacs s17 , s4, s13 fmacs s25 , s5, s12 fmacs s18 , s6, s12 fmacs s26 , s7, s13 fmacs s19 , s6, s13 fmacs s27 , s7, s12 .endm .macro KERNEL2x1_SUB pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s2 , [ AO, #8 ] flds s3 , [ AO, #12 ] flds s8 , [ BO ] flds s9 , [ BO, #4 ] fmacs s16 , s0, s8 fmacs s24 , s1, s9 fmacs s17 , s0, s9 fmacs s25 , s1, s8 fmacs s18 , s2, s8 fmacs s26 , s3, s9 fmacs s19 , s2, s9 fmacs s27 , s3, s8 add BO , BO, #8 add AO , AO, #16 .endm .macro SAVE2x1 pld [ CO1 , #C_PRE ] flds s0, ALPHA_R flds s1, ALPHA_I fldmias CO1, { s4 - s7 } FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 FADD_R s18, s26 , s18 FADD_I s19, s27 , s19 FMAC_R1 s4 , s0 , s16 FMAC_I1 s5 , s0 , s17 FMAC_R2 s4 , s1 , s17 FMAC_I2 s5 , s1 , s16 FMAC_R1 s6 , s0 , s18 FMAC_I1 s7 , s0 , s19 FMAC_R2 s6 , s1 , s19 FMAC_I2 s7 , s1 , s18 fstmias CO1, { s4 - s7 } add CO1, CO1, #16 .endm /******************************************************************************/ .macro INIT1x1 flds s16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s24, s16 vmov.f32 s25, s16 .endm .macro KERNEL1x1_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s8 , [ BO ] flds s9 , [ BO, #4 ] fmuls s16 , s0, s8 fmuls s24 , s1, s9 fmuls s17 , s0, s9 fmuls s25 , s1, s8 add BO , BO, #8 add AO , AO, #8 pld [ BO , #B_PRE ] pld [ AO , #A_PRE ] flds s4 , [ AO, #0 ] flds s5 , [ AO, #4 ] flds s12, [ BO ] flds s13, [ BO, #4 ] add BO , BO, #8 add AO , AO, #8 .endm .macro KERNEL1x1_M1 fmacs s16 , s0, s8 fmacs s24 , s1, s9 fmacs s17 , s0, s9 fmacs s25 , s1, s8 flds s4 , [ AO, #0 ] flds s5 , [ AO, #4 ] flds s12, [ BO ] flds s13, [ BO, #4 ] add BO , BO, #8 add AO , AO, #8 .endm .macro KERNEL1x1_M2 fmacs s16 , s4, s12 fmacs s24 , s5, s13 fmacs s17 , s4, s13 fmacs s25 , s5, s12 flds s0 , [ AO, #0 ] flds s1 , [ AO, #4 ] flds s8 , [ BO ] flds s9 , [ BO, #4 ] add BO , BO, #8 add AO , AO, #8 .endm .macro KERNEL1x1_E fmacs s16 , s4, s12 fmacs s24 , s5, s13 fmacs s17 , s4, s13 fmacs s25 , s5, s12 .endm .macro KERNEL1x1_SUB flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s8 , [ BO ] flds s9 , [ BO, #4 ] fmacs s16 , s0, s8 fmacs s24 , s1, s9 fmacs s17 , s0, s9 fmacs s25 , s1, s8 add BO , BO, #8 add AO , AO, #8 .endm .macro SAVE1x1 pld [ CO1 , #C_PRE ] flds s0, ALPHA_R flds s1, ALPHA_I fldmias CO1, { s4 - s5 } FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 FMAC_R1 s4 , s0 , s16 FMAC_I1 s5 , s0 , s17 FMAC_R2 s4 , s1 , s17 FMAC_I2 s5 , s1 , s16 fstmias CO1, { s4 - s5 } add CO1, CO1, #8 .endm /******************************************************************************/ /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack #if !defined(__ARM_PCS_VFP) vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP ldr OLD_A, OLD_A_SOFTFP #endif str OLD_M, M str OLD_N, N str OLD_K, K str OLD_A, A vstr OLD_ALPHA_R, 
ALPHA_R vstr OLD_ALPHA_I, ALPHA_I sub r3, fp, #128 vstm r3, { s8 - s31} // store floating point registers movs r4, #0 str r4, FP_ZERO str r4, FP_ZERO_1 ldr r3, OLD_LDC lsl r3, r3, #3 // ldc = ldc * 4 * 2 str r3, LDC ldr K1, K ldr BC, B ldr J, N asrs J, J, #1 // J = J / 2 ble cgemm_kernel_L1_BEGIN cgemm_kernel_L2_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #1 // LDC * 2 add r3 , r4, CO1 str r3 , C // store C ldr AO, A // AO = A pld [AO , #A_PRE-64] pld [AO , #A_PRE-32] cgemm_kernel_L2_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 ble cgemm_kernel_L2_M1_BEGIN cgemm_kernel_L2_M2_20: mov BO, BC asrs L , K1, #3 // L = L / 8 cmp L , #3 blt cgemm_kernel_L2_M2_30 .align 5 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 sub L, L, #2 cgemm_kernel_L2_M2_22: KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 subs L, L, #1 bgt cgemm_kernel_L2_M2_22 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b cgemm_kernel_L2_M2_44 cgemm_kernel_L2_M2_30: tst L, #3 ble cgemm_kernel_L2_M2_40 tst L, #2 ble cgemm_kernel_L2_M2_32 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b cgemm_kernel_L2_M2_44 cgemm_kernel_L2_M2_32: tst L, #1 ble cgemm_kernel_L2_M2_40 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b cgemm_kernel_L2_M2_44 cgemm_kernel_L2_M2_40: INIT2x2 cgemm_kernel_L2_M2_44: ands L , K1, #7 // L = L % 8 ble cgemm_kernel_L2_M2_100 cgemm_kernel_L2_M2_46: KERNEL2x2_SUB subs L, L, #1 bne cgemm_kernel_L2_M2_46 cgemm_kernel_L2_M2_100: SAVE2x2 cgemm_kernel_L2_M2_END: subs I, I, #1 bne cgemm_kernel_L2_M2_20 cgemm_kernel_L2_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 ble cgemm_kernel_L2_END cgemm_kernel_L2_M1_20: INIT1x2 mov BO, BC asrs L , K1, #3 // L = L / 8 ble cgemm_kernel_L2_M1_40 cgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs L, L, #1 bgt cgemm_kernel_L2_M1_22 cgemm_kernel_L2_M1_40: ands L , K1, #7 // L = L % 8 ble cgemm_kernel_L2_M1_100 cgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs L, L, #1 bgt cgemm_kernel_L2_M1_42 cgemm_kernel_L2_M1_100: SAVE1x2 cgemm_kernel_L2_END: mov r3, BC mov r4, K1 lsl r4, r4, #4 // k * 2 * 4 * 2 add r3, r3, r4 // B = B + K * 2 * 8 mov BC, r3 subs J , #1 // j-- bgt cgemm_kernel_L2_BEGIN /*********************************************************************************************/ cgemm_kernel_L1_BEGIN: ldr J , N tst J , #1 ble cgemm_kernel_L999 ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 str r3 , C // store C ldr AO, A // AO = A cgemm_kernel_L1_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 ble cgemm_kernel_L1_M1_BEGIN cgemm_kernel_L1_M2_20: mov BO, BC asrs L , K1, #3 // L = L / 8 cmp L , #3 blt cgemm_kernel_L1_M2_30 .align 5 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 sub L, L, #2 cgemm_kernel_L1_M2_22: KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 subs L, L, #1 bgt cgemm_kernel_L1_M2_22 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b cgemm_kernel_L1_M2_44 cgemm_kernel_L1_M2_30: tst L, #3 ble cgemm_kernel_L1_M2_40 tst L, #2 ble 
cgemm_kernel_L1_M2_32 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b cgemm_kernel_L1_M2_44 cgemm_kernel_L1_M2_32: tst L, #1 ble cgemm_kernel_L1_M2_40 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b cgemm_kernel_L1_M2_44 cgemm_kernel_L1_M2_40: INIT2x1 cgemm_kernel_L1_M2_44: ands L , K1, #7 // L = L % 8 ble cgemm_kernel_L1_M2_100 cgemm_kernel_L1_M2_46: KERNEL2x1_SUB subs L, L, #1 bne cgemm_kernel_L1_M2_46 cgemm_kernel_L1_M2_100: SAVE2x1 cgemm_kernel_L1_M2_END: subs I, I, #1 bne cgemm_kernel_L1_M2_20 cgemm_kernel_L1_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 ble cgemm_kernel_L1_END cgemm_kernel_L1_M1_20: INIT1x1 mov BO, BC asrs L , K1, #3 // L = L / 8 ble cgemm_kernel_L1_M1_40 cgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs L, L, #1 bgt cgemm_kernel_L1_M1_22 cgemm_kernel_L1_M1_40: ands L , K1, #7 // L = L % 8 ble cgemm_kernel_L1_M1_100 cgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs L, L, #1 bgt cgemm_kernel_L1_M1_42 cgemm_kernel_L1_M1_100: SAVE1x1 cgemm_kernel_L1_END: cgemm_kernel_L999: sub r3, fp, #128 vldm r3, { s8 - s31} // restore floating point registers movs r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/cgemm_ncopy_2_vfp.S000066400000000000000000000124371313527062700207320ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2013/11/05 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define OLD_M r0 #define OLD_N r1 #define OLD_A r2 #define OLD_LDA r3 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define LDA [fp, #-260 ] #define B [fp, #4 ] #define M r0 #define N r1 #define A r2 #define BO r5 #define AO1 r6 #define AO2 r7 #define I r3 #define J r12 #define A_PRE 256 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro COPY2x2 flds s0 , [ AO1, #0 ] flds s1 , [ AO1, #4 ] flds s4 , [ AO1, #8 ] flds s5 , [ AO1, #12 ] flds s2 , [ AO2, #0 ] flds s3 , [ AO2, #4 ] add AO1, AO1, #16 flds s6 , [ AO2, #8 ] flds s7 , [ AO2, #12 ] fstmias BO!, { s0 - s7 } add AO2, AO2, #16 .endm .macro COPY1x2 flds s0 , [ AO1, #0 ] flds s1 , [ AO1, #4 ] flds s2 , [ AO2, #0 ] flds s3 , [ AO2, #4 ] add AO1, AO1, #8 fstmias BO!, { s0 - s3 } add AO2, AO2, #8 .endm .macro COPY2x1 flds s0 , [ AO1, #0 ] flds s1 , [ AO1, #4 ] flds s2 , [ AO1, #8 ] flds s3 , [ AO1, #12 ] fstmias BO!, { s0 - s3 } add AO1, AO1, #16 .endm .macro COPY1x1 flds s0 , [ AO1, #0 ] flds s1 , [ AO1, #4 ] fstmias BO!, { s0 - s1 } add AO1, AO1, #8 .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack lsl r3, r3, #3 // lda = lda * 4 * 2 str r3, LDA sub r4, fp, #128 vstm r4, { s8 - s15} // store floating point registers ldr BO, B /*********************************************************************************************/ cgemm_ncopy_L2_BEGIN: asrs J, N, #1 // J = N / 2 ble cgemm_ncopy_L1_BEGIN cgemm_ncopy_L2_M2_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA add AO2, AO1, r4 add A , AO2, r4 // A = A + 2 * LDA asrs I, M, #1 // I = M / 2 ble cgemm_ncopy_L2_M2_40 cgemm_ncopy_L2_M2_20: pld [ AO1, #A_PRE ] pld [ AO2, #A_PRE ] COPY2x2 subs I , I , #1 ble cgemm_ncopy_L2_M2_40 COPY2x2 subs I , I , #1 bne cgemm_ncopy_L2_M2_20 cgemm_ncopy_L2_M2_40: ands I, M , #1 ble cgemm_ncopy_L2_M2_END cgemm_ncopy_L2_M2_60: COPY1x2 subs I , I , #1 bne cgemm_ncopy_L2_M2_60 cgemm_ncopy_L2_M2_END: subs J , J, #1 // j-- bne cgemm_ncopy_L2_M2_BEGIN /*********************************************************************************************/ cgemm_ncopy_L1_BEGIN: tst N, #1 ble cgemm_ncopy_L999 cgemm_ncopy_L1_M2_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA add A , AO1, r4 // A = A + 1 * LDA asrs I, M, #1 // I = M / 2 ble cgemm_ncopy_L1_M2_40 cgemm_ncopy_L1_M2_20: COPY2x1 subs I , I , #1 bne cgemm_ncopy_L1_M2_20 cgemm_ncopy_L1_M2_40: ands I, M , #1 ble cgemm_ncopy_L1_M2_END cgemm_ncopy_L1_M2_60: COPY1x1 subs I , I , #1 bne cgemm_ncopy_L1_M2_60 cgemm_ncopy_L1_M2_END: cgemm_ncopy_L999: sub r3, fp, #128 vldm r3, { s8 - s15} // restore floating point registers movs r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE 
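The ncopy routine above packs the complex input two columns at a time, interleaving the two columns row by row into a contiguous buffer, with a scalar pass for an odd trailing column. A rough C equivalent is sketched below under the same assumptions the assembly makes: column-major storage and single-precision complex elements laid out as (re, im) pairs. Plain long/float stand in for BLASLONG/FLOAT, and the function name is illustrative.

/* illustrative C equivalent of the 2-column complex packing above */
static int cgemm_ncopy_2_ref(long m, long n, const float *a, long lda, float *b)
{
    long i, j;
    for (j = 0; j + 1 < n; j += 2) {            /* two columns per pass            */
        const float *a0 = a + 2 * j * lda;      /* column j                        */
        const float *a1 = a0 + 2 * lda;         /* column j + 1                    */
        for (i = 0; i < m; i++) {               /* interleave the columns row-wise */
            *b++ = a0[2 * i];  *b++ = a0[2 * i + 1];
            *b++ = a1[2 * i];  *b++ = a1[2 * i + 1];
        }
    }
    if (n & 1) {                                /* odd trailing column             */
        const float *a0 = a + 2 * (n - 1) * lda;
        for (i = 0; i < m; i++) {
            *b++ = a0[2 * i];  *b++ = a0[2 * i + 1];
        }
    }
    return 0;
}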
OpenBLAS-0.2.20/kernel/arm/cgemm_tcopy_2_vfp.S000066400000000000000000000122461313527062700207360ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/11/07 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define OLD_M r0 #define OLD_N r1 #define OLD_A r2 #define OLD_LDA r3 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define B [fp, #4 ] #define A [fp, #-248 ] #define M r0 #define N r1 #define M4 r2 #define LDA r5 #define AO1 r6 #define BO1 r7 #define BO2 r8 #define I r4 #define J r12 #define A_PRE 256 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro COPY2x2 fldmias AO1, { s0 - s3 } add r3, AO1, LDA fldmias r3, { s4 - s7 } fstmias BO1, { s0 - s7 } add AO1, AO1, #16 add BO1, BO1, M4 .endm .macro COPY1x2 fldmias AO1, { s0 -s1 } add r3, AO1, LDA fldmias r3, { s2 - s3 } fstmias BO2, { s0 - s3 } add AO1, AO1, #8 add BO2, BO2, #16 .endm /*************************************************************************************************************************/ .macro COPY2x1 fldmias AO1, { s0 - s3 } fstmias BO1, { s0 - s3 } add AO1, AO1, #16 add BO1, BO1, M4 .endm .macro COPY1x1 fldmias AO1, { s0 - s1 } fstmias BO2, { s0 - s1 } add AO1, AO1, #8 add BO2, BO2, #8 .endm /************************************************************************************** * End of macro definitions 
**************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack str OLD_A, A // store A lsl LDA, OLD_LDA, #3 // lda = lda * SIZE * 2 sub r4, fp, #128 vstm r4, { s8 - s15} // store floating point registers lsl r4 , M, #3 // M * SIZE * 2 ldr r3, B and BO2 , N , #-2 mul BO2, BO2, r4 add BO2 , BO2, r3 lsl M4, M, #4 // M4 = M * 2 * SIZE * 2 cgemm_tcopy_L2_BEGIN: asrs J, M, #1 // J = N / 2 ble cgemm_tcopy_L1_BEGIN cgemm_tcopy_L2_M2_BEGIN: ldr AO1, A // AO1 = A lsl r3, LDA, #1 // r3 = 2 * LDA add r3, r3 , AO1 // A = A + 2 * LDA str r3, A // store A ldr BO1, B add r3, BO1, #32 // B = B + 4 * SIZE *2 str r3, B asrs I, N, #1 // I = M / 2 ble cgemm_tcopy_L2_M2_60 cgemm_tcopy_L2_M2_40: COPY2x2 subs I, I, #1 bne cgemm_tcopy_L2_M2_40 cgemm_tcopy_L2_M2_60: tst N , #1 ble cgemm_tcopy_L2_M2_END COPY1x2 cgemm_tcopy_L2_M2_END: subs J , J, #1 // j-- bne cgemm_tcopy_L2_M2_BEGIN /*********************************************************************************************/ cgemm_tcopy_L1_BEGIN: tst M, #1 ble cgemm_tcopy_L999 cgemm_tcopy_L1_M2_BEGIN: ldr AO1, A // AO1 = A add r3, LDA , AO1 // A = A + 1 * LDA str r3, A // store A ldr BO1, B add r3, BO1, #16 // B = B + 2 * SIZE *2 str r3, B asrs I, N, #1 // I = M / 2 ble cgemm_tcopy_L1_M2_60 cgemm_tcopy_L1_M2_40: COPY2x1 subs I, I, #1 bne cgemm_tcopy_L1_M2_40 cgemm_tcopy_L1_M2_60: tst N , #1 ble cgemm_tcopy_L1_M2_END COPY1x1 cgemm_tcopy_L1_M2_END: cgemm_tcopy_L999: sub r3, fp, #128 vldm r3, { s8 - s15} // restore floating point registers mov r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/cgemv_n_vfp.S000066400000000000000000000325701313527062700176270ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2013/11/29 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #if !defined(__ARM_PCS_VFP) #define OLD_ALPHAR r3 #define OLD_ALPHAI [fp, #0 ] #define OLD_A_SOFTFP [fp, #4 ] #define OLD_LDA [fp, #8 ] #define X [fp, #12 ] #define OLD_INC_X [fp, #16 ] #define Y [fp, #20 ] #define OLD_INC_Y [fp, #24 ] #else #define OLD_LDA [fp, #0 ] #define X [fp, #4 ] #define OLD_INC_X [fp, #8 ] #define Y [fp, #12 ] #define OLD_INC_Y [fp, #16 ] #endif #define OLD_A r3 #define OLD_M r0 #define AO1 r0 #define N r1 #define J r2 #define AO2 r4 #define XO r5 #define YO r6 #define LDA r7 #define INC_X r8 #define INC_Y r9 #define I r12 #define FP_ZERO [fp, #-228] #define FP_ZERO_0 [fp, #-228] #define FP_ZERO_1 [fp, #-224] #define ALPHA_I [fp, #-236] #define ALPHA_R [fp, #-244] #define M [fp, #-252 ] #define A [fp, #-256 ] #define X_PRE 64 #define Y_PRE 0 #define A_PRE 0 /**************************************************************************************/ #if !defined(CONJ) && !defined(XCONJ) #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(CONJ) && !defined(XCONJ) #define KMAC_R fmacs #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif !defined(CONJ) && defined(XCONJ) #define KMAC_R fmacs #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 fmacs #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #else #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 fmacs #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif .macro INIT_F4 pld [ YO, #Y_PRE ] flds s8 , FP_ZERO vmov.f32 s9 , s8 vmov.f32 s10, s8 vmov.f32 s11, s8 vmov.f32 s12, s8 vmov.f32 s13, s8 vmov.f32 s14, s8 vmov.f32 s15, s8 .endm .macro KERNEL_F4X4 pld [ XO, #X_PRE ] KERNEL_F4X1 KERNEL_F4X1 KERNEL_F4X1 KERNEL_F4X1 .endm .macro KERNEL_F4X1 pld [ AO2, #A_PRE ] flds s0 , [ AO1 ] flds s1 , [ AO1, #4 ] flds s2 , [ AO1, #8 ] flds s3 , [ AO1, #12 ] flds s4 , [ XO ] flds s5 , [ XO, #4 ] fmacs s8 , s0, s4 fmacs s9 , s0, s5 fmacs s10 , s2, s4 fmacs s11 , s2, s5 KMAC_R s8 , s1, s5 KMAC_I s9 , s1, s4 KMAC_R s10 , s3, s5 KMAC_I s11 , s3, s4 flds s0 , [ AO1, #16 ] flds s1 , [ AO1, #20 ] flds s2 , [ AO1, #24 ] flds s3 , [ AO1, #28 ] fmacs s12 , s0, s4 fmacs s13 , s0, s5 fmacs s14 , s2, s4 fmacs s15 , s2, s5 KMAC_R s12 , s1, s5 KMAC_I s13 , s1, s4 KMAC_R s14 , s3, s5 KMAC_I s15 , s3, s4 add XO , XO, #8 add AO1 , AO1, LDA add AO2 , AO2, LDA .endm .macro SAVE_F4 flds s0, ALPHA_R flds s1, ALPHA_I fldmias YO, { s4 - s7 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 FMAC_R1 s6 , s0 , s10 FMAC_I1 s7 , s0 , s11 FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 fstmias YO!, { s4 - s7 } fldmias YO, { s4 - s7 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 FMAC_R1 s6 , s0 , s14 FMAC_I1 s7 , s0 , s15 FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 fstmias YO!, { s4 - s7 } .endm .macro INIT_F1 flds s8 , FP_ZERO vmov.f32 s9 , s8 .endm .macro KERNEL_F1X1 flds s0 , [ AO1 ] flds s1 , [ AO1, #4 ] flds s4 , [ XO ] flds s5 , [ XO, #4 ] fmacs s8 , s0, s4 fmacs s9 , s0, s5 KMAC_R s8 , s1, s5 KMAC_I s9 , s1, s4 add 
XO , XO, #8 add AO1 , AO1, LDA .endm .macro SAVE_F1 flds s0, ALPHA_R flds s1, ALPHA_I fldmias YO, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 fstmias YO, { s4 - s5 } add YO, YO, #8 .endm /****************************************************************************************/ .macro INIT_S4 flds s8 , FP_ZERO vmov.f32 s9 , s8 vmov.f32 s10, s8 vmov.f32 s11, s8 vmov.f32 s12, s8 vmov.f32 s13, s8 vmov.f32 s14, s8 vmov.f32 s15, s8 .endm .macro KERNEL_S4X4 KERNEL_S4X1 KERNEL_S4X1 KERNEL_S4X1 KERNEL_S4X1 .endm .macro KERNEL_S4X1 flds s0 , [ AO1 ] flds s1 , [ AO1, #4 ] flds s2 , [ AO1, #8 ] flds s3 , [ AO1, #12 ] flds s4 , [ XO ] flds s5 , [ XO, #4 ] fmacs s8 , s0, s4 fmacs s9 , s0, s5 fmacs s10 , s2, s4 fmacs s11 , s2, s5 KMAC_R s8 , s1, s5 KMAC_I s9 , s1, s4 KMAC_R s10 , s3, s5 KMAC_I s11 , s3, s4 flds s0 , [ AO1, #16 ] flds s1 , [ AO1, #20 ] flds s2 , [ AO1, #24 ] flds s3 , [ AO1, #28 ] fmacs s12 , s0, s4 fmacs s13 , s0, s5 fmacs s14 , s2, s4 fmacs s15 , s2, s5 KMAC_R s12 , s1, s5 KMAC_I s13 , s1, s4 KMAC_R s14 , s3, s5 KMAC_I s15 , s3, s4 add XO , XO, INC_X add AO1 , AO1, LDA add AO2 , AO2, LDA .endm .macro SAVE_S4 flds s0, ALPHA_R flds s1, ALPHA_I fldmias YO, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 fstmias YO, { s4 - s5 } add YO, YO, INC_Y fldmias YO, { s6 - s7 } FMAC_R1 s6 , s0 , s10 FMAC_I1 s7 , s0 , s11 FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 fstmias YO, { s6 - s7 } add YO, YO, INC_Y fldmias YO, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 fstmias YO, { s4 - s5 } add YO, YO, INC_Y fldmias YO, { s6 - s7 } FMAC_R1 s6 , s0 , s14 FMAC_I1 s7 , s0 , s15 FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 fstmias YO, { s6 - s7 } add YO, YO, INC_Y .endm .macro INIT_S1 flds s8 , FP_ZERO vmov.f32 s9 , s8 .endm .macro KERNEL_S1X1 flds s0 , [ AO1 ] flds s1 , [ AO1, #4 ] flds s4 , [ XO ] flds s5 , [ XO, #4 ] fmacs s8 , s0, s4 fmacs s9 , s0, s5 KMAC_R s8 , s1, s5 KMAC_I s9 , s1, s4 add XO , XO, INC_X add AO1 , AO1, LDA .endm .macro SAVE_S1 flds s0, ALPHA_R flds s1, ALPHA_I fldmias YO, { s4 - s5 } FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 fstmias YO, { s4 - s5 } add YO, YO, INC_Y .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9 , fp} add fp, sp, #28 sub sp, sp, #STACKSIZE // reserve stack sub r12, fp, #192 #if defined(DOUBLE) vstm r12, { d8 - d15 } // store floating point registers #else vstm r12, { s8 - s15 } // store floating point registers #endif movs r12, #0 str r12, FP_ZERO str r12, FP_ZERO_1 cmp OLD_M, #0 ble cgemvn_kernel_L999 cmp N, #0 ble cgemvn_kernel_L999 #if !defined(__ARM_PCS_VFP) vmov s0, OLD_ALPHAR vldr s1, OLD_ALPHAI ldr OLD_A, OLD_A_SOFTFP #endif str OLD_A, A str OLD_M, M vstr s0 , ALPHA_R vstr s1 , ALPHA_I ldr INC_X , OLD_INC_X ldr INC_Y , OLD_INC_Y cmp INC_X, #0 beq cgemvn_kernel_L999 cmp INC_Y, #0 beq cgemvn_kernel_L999 ldr LDA, OLD_LDA #if defined(DOUBLE) lsl LDA, LDA, #4 // LDA * SIZE * 2 #else lsl LDA, LDA, #3 // LDA * SIZE * 2 #endif cmp INC_X, #1 bne cgemvn_kernel_S4_BEGIN cmp INC_Y, #1 bne cgemvn_kernel_S4_BEGIN cgemvn_kernel_F4_BEGIN: ldr YO , Y ldr I, M asrs I, I, #2 // I = M / 4 ble cgemvn_kernel_F1_BEGIN cgemvn_kernel_F4X4: ldr AO1, A add AO2, AO1, LDA add r3 , AO1, #32 str r3 , A add 
AO2, AO2, LDA add AO2, AO2, LDA ldr XO , X INIT_F4 asrs J, N, #2 // J = N / 4 ble cgemvn_kernel_F4X1 cgemvn_kernel_F4X4_10: KERNEL_F4X4 subs J, J, #1 bne cgemvn_kernel_F4X4_10 cgemvn_kernel_F4X1: ands J, N , #3 ble cgemvn_kernel_F4_END cgemvn_kernel_F4X1_10: KERNEL_F4X1 subs J, J, #1 bne cgemvn_kernel_F4X1_10 cgemvn_kernel_F4_END: SAVE_F4 subs I , I , #1 bne cgemvn_kernel_F4X4 cgemvn_kernel_F1_BEGIN: ldr I, M ands I, I , #3 ble cgemvn_kernel_L999 cgemvn_kernel_F1X1: ldr AO1, A add r3, AO1, #8 str r3, A ldr XO , X INIT_F1 mov J, N cgemvn_kernel_F1X1_10: KERNEL_F1X1 subs J, J, #1 bne cgemvn_kernel_F1X1_10 cgemvn_kernel_F1_END: SAVE_F1 subs I , I , #1 bne cgemvn_kernel_F1X1 b cgemvn_kernel_L999 /*************************************************************************************************************/ cgemvn_kernel_S4_BEGIN: #if defined(DOUBLE) lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 #else lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 #endif ldr YO , Y ldr I, M asrs I, I, #2 // I = M / 4 ble cgemvn_kernel_S1_BEGIN cgemvn_kernel_S4X4: ldr AO1, A add AO2, AO1, LDA add r3 , AO1, #32 str r3 , A ldr XO , X INIT_S4 asrs J, N, #2 // J = N / 4 ble cgemvn_kernel_S4X1 cgemvn_kernel_S4X4_10: KERNEL_S4X4 subs J, J, #1 bne cgemvn_kernel_S4X4_10 cgemvn_kernel_S4X1: ands J, N , #3 ble cgemvn_kernel_S4_END cgemvn_kernel_S4X1_10: KERNEL_S4X1 subs J, J, #1 bne cgemvn_kernel_S4X1_10 cgemvn_kernel_S4_END: SAVE_S4 subs I , I , #1 bne cgemvn_kernel_S4X4 cgemvn_kernel_S1_BEGIN: ldr I, M ands I, I , #3 ble cgemvn_kernel_L999 cgemvn_kernel_S1X1: ldr AO1, A add r3, AO1, #8 str r3, A ldr XO , X INIT_S1 mov J, N cgemvn_kernel_S1X1_10: KERNEL_S1X1 subs J, J, #1 bne cgemvn_kernel_S1X1_10 cgemvn_kernel_S1_END: SAVE_S1 subs I , I , #1 bne cgemvn_kernel_S1X1 /*************************************************************************************************************/ cgemvn_kernel_L999: sub r3, fp, #192 #if defined(DOUBLE) vldm r3, { d8 - d15 } // restore floating point registers #else vldm r3, { s8 - s15 } // restore floating point registers #endif mov r0, #0 // set return value sub sp, fp, #28 pop {r4 -r9 ,fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/cgemv_t_vfp.S000066400000000000000000000254211313527062700176320ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/11/29 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #if !defined(__ARM_PCS_VFP) #define OLD_ALPHAR r3 #define OLD_ALPHAI [fp, #0 ] #define OLD_A_SOFTFP [fp, #4 ] #define OLD_LDA [fp, #8 ] #define X [fp, #12 ] #define OLD_INC_X [fp, #16 ] #define Y [fp, #20 ] #define OLD_INC_Y [fp, #24 ] #else #define OLD_LDA [fp, #0 ] #define X [fp, #4 ] #define OLD_INC_X [fp, #8 ] #define Y [fp, #12 ] #define OLD_INC_Y [fp, #16 ] #endif #define OLD_A r3 #define OLD_N r1 #define M r0 #define AO1 r1 #define J r2 #define AO2 r4 #define XO r5 #define YO r6 #define LDA r7 #define INC_X r8 #define INC_Y r9 #define I r12 #define FP_ZERO [fp, #-228] #define FP_ZERO_0 [fp, #-228] #define FP_ZERO_1 [fp, #-224] #define N [fp, #-252 ] #define A [fp, #-256 ] #define X_PRE 512 #define A_PRE 512 /************************************************************************************** * Macro definitions **************************************************************************************/ #if !defined(CONJ) && !defined(XCONJ) #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(CONJ) && !defined(XCONJ) #define KMAC_R fmacs #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif !defined(CONJ) && defined(XCONJ) #define KMAC_R fmacs #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 fmacs #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #else #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 fmacs #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif .macro INIT_F2 flds s12, FP_ZERO vmov.f32 s13, s12 vmov.f32 s14, s12 vmov.f32 s15, s12 .endm .macro KERNEL_F2X4 KERNEL_F2X1 KERNEL_F2X1 KERNEL_F2X1 KERNEL_F2X1 .endm .macro KERNEL_F2X1 fldmias XO! , { s2 - s3 } fldmias AO1!, { s4 - s5 } fldmias AO2!, { s8 - s9 } fmacs s12 , s4 , s2 fmacs s13 , s4 , s3 KMAC_R s12 , s5 , s3 KMAC_I s13 , s5 , s2 fmacs s14 , s8 , s2 fmacs s15 , s8 , s3 KMAC_R s14 , s9 , s3 KMAC_I s15 , s9 , s2 .endm .macro SAVE_F2 fldmias YO, { s4 - s7 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 FMAC_R1 s6 , s0 , s14 FMAC_I1 s7 , s0 , s15 FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 fstmias YO!, { s4 - s7 } .endm /************************************************************************************************/ .macro INIT_F1 flds s12, FP_ZERO vmov.f32 s13, s12 .endm .macro KERNEL_F1X4 KERNEL_F1X1 KERNEL_F1X1 KERNEL_F1X1 KERNEL_F1X1 .endm .macro KERNEL_F1X1 fldmias XO! 
, { s2 - s3 } fldmias AO1!, { s4 - s5 } fmacs s12 , s4 , s2 fmacs s13 , s4 , s3 KMAC_R s12 , s5 , s3 KMAC_I s13 , s5 , s2 .endm .macro SAVE_F1 fldmias YO, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 fstmias YO!, { s4 - s5 } .endm /************************************************************************************************/ .macro INIT_S2 flds s12, FP_ZERO vmov.f32 s13, s12 vmov.f32 s14, s12 vmov.f32 s15, s12 .endm .macro KERNEL_S2X4 KERNEL_S2X1 KERNEL_S2X1 KERNEL_S2X1 KERNEL_S2X1 .endm .macro KERNEL_S2X1 fldmias XO , { s2 - s3 } fldmias AO1!, { s4 - s5 } fldmias AO2!, { s8 - s9 } fmacs s12 , s4 , s2 fmacs s13 , s4 , s3 KMAC_R s12 , s5 , s3 KMAC_I s13 , s5 , s2 fmacs s14 , s8 , s2 fmacs s15 , s8 , s3 KMAC_R s14 , s9 , s3 KMAC_I s15 , s9 , s2 add XO, XO, INC_X .endm .macro SAVE_S2 fldmias YO, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 fstmias YO, { s4 - s5 } add YO, YO, INC_Y fldmias YO, { s6 - s7 } FMAC_R1 s6 , s0 , s14 FMAC_I1 s7 , s0 , s15 FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 fstmias YO, { s6 - s7 } add YO, YO, INC_Y .endm /************************************************************************************************/ .macro INIT_S1 flds s12, FP_ZERO vmov.f32 s13, s12 .endm .macro KERNEL_S1X4 KERNEL_S1X1 KERNEL_S1X1 KERNEL_S1X1 KERNEL_S1X1 .endm .macro KERNEL_S1X1 fldmias XO , { s2 - s3 } fldmias AO1!, { s4 - s5 } fmacs s12 , s4 , s2 fmacs s13 , s4 , s3 KMAC_R s12 , s5 , s3 KMAC_I s13 , s5 , s2 add XO, XO, INC_X .endm .macro SAVE_S1 fldmias YO, { s4 - s5 } FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 fstmias YO, { s4 - s5 } add YO, YO, INC_Y .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9 , fp} add fp, sp, #28 sub sp, sp, #STACKSIZE // reserve stack sub r12, fp, #192 #if defined(DOUBLE) vstm r12, { d8 - d15 } // store floating point registers #else vstm r12, { s8 - s15 } // store floating point registers #endif movs r12, #0 str r12, FP_ZERO str r12, FP_ZERO_1 cmp M, #0 ble cgemvt_kernel_L999 cmp OLD_N, #0 ble cgemvt_kernel_L999 #if !defined(__ARM_PCS_VFP) vmov s0, OLD_ALPHAR vldr s1, OLD_ALPHAI ldr OLD_A, OLD_A_SOFTFP #endif str OLD_A, A str OLD_N, N ldr INC_X , OLD_INC_X ldr INC_Y , OLD_INC_Y cmp INC_X, #0 beq cgemvt_kernel_L999 cmp INC_Y, #0 beq cgemvt_kernel_L999 ldr LDA, OLD_LDA #if defined(DOUBLE) lsl LDA, LDA, #4 // LDA * SIZE #else lsl LDA, LDA, #3 // LDA * SIZE #endif cmp INC_X, #1 bne cgemvt_kernel_S2_BEGIN cmp INC_Y, #1 bne cgemvt_kernel_S2_BEGIN cgemvt_kernel_F2_BEGIN: ldr YO , Y ldr J, N asrs J, J, #1 // J = N / 2 ble cgemvt_kernel_F1_BEGIN cgemvt_kernel_F2X4: ldr AO1, A add AO2, AO1, LDA add r3 , AO2, LDA str r3 , A ldr XO , X INIT_F2 asrs I, M, #2 // I = M / 4 ble cgemvt_kernel_F2X1 cgemvt_kernel_F2X4_10: KERNEL_F2X4 subs I, I, #1 bne cgemvt_kernel_F2X4_10 cgemvt_kernel_F2X1: ands I, M , #3 ble cgemvt_kernel_F2_END cgemvt_kernel_F2X1_10: KERNEL_F2X1 subs I, I, #1 bne cgemvt_kernel_F2X1_10 cgemvt_kernel_F2_END: SAVE_F2 subs J , J , #1 bne cgemvt_kernel_F2X4 cgemvt_kernel_F1_BEGIN: ldr J, N ands J, J, #1 ble cgemvt_kernel_L999 cgemvt_kernel_F1X4: ldr AO1, A ldr XO , X INIT_F1 asrs I, M, #2 // I = M / 4 ble cgemvt_kernel_F1X1 cgemvt_kernel_F1X4_10: KERNEL_F1X4 subs I, I, #1 bne cgemvt_kernel_F1X4_10 cgemvt_kernel_F1X1: 
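// last-column tail: accumulate the remaining M % 4 elements of the dot product one KERNEL_F1X1 at a time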
ands I, M , #3 ble cgemvt_kernel_F1_END cgemvt_kernel_F1X1_10: KERNEL_F1X1 subs I, I, #1 bne cgemvt_kernel_F1X1_10 cgemvt_kernel_F1_END: SAVE_F1 b cgemvt_kernel_L999 /*************************************************************************************************************/ cgemvt_kernel_S2_BEGIN: #if defined(DOUBLE) lsl INC_X, INC_X, #4 // INC_X * SIZE lsl INC_Y, INC_Y, #4 // INC_Y * SIZE #else lsl INC_X, INC_X, #3 // INC_X * SIZE lsl INC_Y, INC_Y, #3 // INC_Y * SIZE #endif ldr YO , Y ldr J, N asrs J, J, #1 // J = N / 2 ble cgemvt_kernel_S1_BEGIN cgemvt_kernel_S2X4: ldr AO1, A add AO2, AO1, LDA add r3 , AO2, LDA str r3 , A ldr XO , X INIT_S2 asrs I, M, #2 // I = M / 4 ble cgemvt_kernel_S2X1 cgemvt_kernel_S2X4_10: KERNEL_S2X4 subs I, I, #1 bne cgemvt_kernel_S2X4_10 cgemvt_kernel_S2X1: ands I, M , #3 ble cgemvt_kernel_S2_END cgemvt_kernel_S2X1_10: KERNEL_S2X1 subs I, I, #1 bne cgemvt_kernel_S2X1_10 cgemvt_kernel_S2_END: SAVE_S2 subs J , J , #1 bne cgemvt_kernel_S2X4 cgemvt_kernel_S1_BEGIN: ldr J, N ands J, J, #1 ble cgemvt_kernel_L999 cgemvt_kernel_S1X4: ldr AO1, A ldr XO , X INIT_S1 asrs I, M, #2 // I = M / 4 ble cgemvt_kernel_S1X1 cgemvt_kernel_S1X4_10: KERNEL_S1X4 subs I, I, #1 bne cgemvt_kernel_S1X4_10 cgemvt_kernel_S1X1: ands I, M , #3 ble cgemvt_kernel_S1_END cgemvt_kernel_S1X1_10: KERNEL_S1X1 subs I, I, #1 bne cgemvt_kernel_S1X1_10 cgemvt_kernel_S1_END: SAVE_S1 /*************************************************************************************************************/ cgemvt_kernel_L999: sub r3, fp, #192 #if defined(DOUBLE) vldm r3, { d8 - d15 } // restore floating point registers #else vldm r3, { s8 - s15 } // restore floating point registers #endif mov r0, #0 // set return value sub sp, fp, #28 pop {r4 -r9 ,fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/copy.c000066400000000000000000000042101313527062700163160ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2013/09/14 Saar * BLASTEST float : OK * BLASTEST double : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #include "common.h" int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix=0,iy=0; if ( n < 0 ) return(0); while(i < n) { y[iy] = x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } return(0); } OpenBLAS-0.2.20/kernel/arm/ctrmm_kernel_2x2_vfp.S000066400000000000000000000566301313527062700213710ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2013/10/16 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define OLD_M r0 #define OLD_N r1 #define OLD_K r2 #define OLD_A r3 #define OLD_ALPHA_R s0 #define OLD_ALPHA_I s1 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define KKK [fp, #-240] #define KK [fp, #-244 ] #define A [fp, #-248 ] #define LDC [fp, #-252 ] #define M [fp, #-256 ] #define N [fp, #-260 ] #define K [fp, #-264 ] #define FP_ZERO [fp, #-232] #define FP_ZERO_0 [fp, #-232] #define FP_ZERO_1 [fp, #-228] #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] #if !defined(__ARM_PCS_VFP) #define OLD_ALPHAR_SOFTFP r3 #define OLD_ALPHAI_SOFTFP [fp, #4] #define OLD_A_SOFTFP [fp, #8 ] #define B [fp, #12 ] #define C [fp, #16 ] #define OLD_LDC [fp, #20 ] #define OFFSET [fp, #24 ] #else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] #endif #define I r0 #define J r1 #define L r2 #define AO r5 #define BO r6 #define CO1 r8 #define CO2 r9 #define K1 r7 #define BC r12 #define A_PRE 96 #define B_PRE 96 #define C_PRE 64 /************************************************************************************** * Macro definitions **************************************************************************************/ #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(CN) || defined(CT) #define KMAC_R fmacs #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 vmls.f32 #define FMAC_I1 fmacs #define FMAC_I2 fmacs #elif defined(NC) || defined(TC) #define KMAC_R fmacs #define KMAC_I vmls.f32 #define FMAC_R1 fmacs #define FMAC_R2 fmacs #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #else #define KMAC_R vmls.f32 #define KMAC_I fmacs #define FMAC_R1 fmacs #define FMAC_R2 fmacs #define FMAC_I1 vmls.f32 #define FMAC_I2 fmacs #endif .macro INIT2x2 flds s8 , FP_ZERO vmov.f32 s9 , s8 vmov.f32 s10, s8 vmov.f32 s11, s8 vmov.f32 s12, s8 vmov.f32 s13, s8 vmov.f32 s14, s8 vmov.f32 s15, s8 .endm .macro KERNEL2x2_I pld [ AO, #A_PRE ] fldmias AO!, { s0 - s3 } pld [ BO, #B_PRE ] fldmias BO!, { s4 - s7 } fmuls s8 , s0, s4 fmuls s9 , s0, s5 fmuls s10 , s2, s4 fmuls s11 , s2, s5 KMAC_R s8 , s1, s5 KMAC_I s9 , s1, s4 KMAC_R s10 , s3, s5 KMAC_I s11 , s3, s4 fmuls s12 , s0, s6 fmuls s13 , s0, s7 fmuls s14 , s2, s6 fmuls s15 , s2, s7 KMAC_R s12 , s1, s7 KMAC_I s13 , s1, s6 KMAC_R s14 , s3, s7 KMAC_I s15 , s3, s6 .endm .macro KERNEL2x2_M1 pld [ AO, #A_PRE ] fldmias AO!, { s0 - s3 } pld [ BO, #B_PRE ] fldmias BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 fmacs s10 , s2, s4 fmacs s11 , s2, s5 KMAC_R s8 , s1, s5 KMAC_I s9 , s1, s4 KMAC_R s10 , s3, s5 KMAC_I s11 , s3, s4 fmacs s12 , s0, s6 fmacs s13 , s0, s7 fmacs s14 , s2, s6 fmacs s15 , s2, s7 KMAC_R s12 , s1, s7 KMAC_I s13 , s1, s6 KMAC_R s14 , s3, s7 KMAC_I s15 , s3, s6 .endm .macro KERNEL2x2_M2 fldmias AO!, { s0 - s3 } fldmias BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 fmacs s10 , s2, s4 fmacs s11 , s2, s5 KMAC_R s8 , s1, s5 
KMAC_I s9 , s1, s4 KMAC_R s10 , s3, s5 KMAC_I s11 , s3, s4 fmacs s12 , s0, s6 fmacs s13 , s0, s7 fmacs s14 , s2, s6 fmacs s15 , s2, s7 KMAC_R s12 , s1, s7 KMAC_I s13 , s1, s6 KMAC_R s14 , s3, s7 KMAC_I s15 , s3, s6 .endm .macro KERNEL2x2_E fldmias AO!, { s0 - s3 } fldmias BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 fmacs s10 , s2, s4 fmacs s11 , s2, s5 KMAC_R s8 , s1, s5 KMAC_I s9 , s1, s4 KMAC_R s10 , s3, s5 KMAC_I s11 , s3, s4 fmacs s12 , s0, s6 fmacs s13 , s0, s7 fmacs s14 , s2, s6 fmacs s15 , s2, s7 KMAC_R s12 , s1, s7 KMAC_I s13 , s1, s6 KMAC_R s14 , s3, s7 KMAC_I s15 , s3, s6 .endm .macro KERNEL2x2_SUB fldmias AO!, { s0 - s3 } fldmias BO!, { s4 - s7 } fmacs s8 , s0, s4 fmacs s9 , s0, s5 fmacs s10 , s2, s4 fmacs s11 , s2, s5 KMAC_R s8 , s1, s5 KMAC_I s9 , s1, s4 KMAC_R s10 , s3, s5 KMAC_I s11 , s3, s4 fmacs s12 , s0, s6 fmacs s13 , s0, s7 fmacs s14 , s2, s6 fmacs s15 , s2, s7 KMAC_R s12 , s1, s7 KMAC_I s13 , s1, s6 KMAC_R s14 , s3, s7 KMAC_I s15 , s3, s6 .endm .macro SAVE2x2 ldr r3 , LDC add CO2 , CO1, r3 flds s0, ALPHA_R flds s1, ALPHA_I flds s4, FP_ZERO vmov.f32 s5, s4 vmov.f32 s6, s4 vmov.f32 s7, s4 FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 FMAC_R1 s6 , s0 , s10 FMAC_I1 s7 , s0 , s11 FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 fstmias CO1, { s4 - s7 } flds s4, FP_ZERO vmov.f32 s5, s4 vmov.f32 s6, s4 vmov.f32 s7, s4 FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 FMAC_R1 s6 , s0 , s14 FMAC_I1 s7 , s0 , s15 FMAC_R2 s6 , s1 , s15 FMAC_I2 s7 , s1 , s14 fstmias CO2, { s4 - s7 } add CO1, CO1, #16 .endm /******************************************************************************/ .macro INIT1x2 flds s8 , FP_ZERO vmov.f32 s9 , s8 vmov.f32 s12, s8 vmov.f32 s13, s8 .endm .macro KERNEL1x2_I flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] flds s6 , [ BO, #8 ] flds s7 , [ BO, #12 ] fmuls s8 , s0, s4 KMAC_R s8 , s1, s5 fmuls s9 , s0, s5 KMAC_I s9 , s1, s4 fmuls s12 , s0, s6 KMAC_R s12 , s1, s7 fmuls s13 , s0, s7 KMAC_I s13 , s1, s6 add BO , BO, #16 add AO , AO, #8 .endm .macro KERNEL1x2_M1 flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] flds s6 , [ BO, #8 ] flds s7 , [ BO, #12 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 fmacs s12 , s0, s6 KMAC_R s12 , s1, s7 fmacs s13 , s0, s7 KMAC_I s13 , s1, s6 add BO , BO, #16 add AO , AO, #8 .endm .macro KERNEL1x2_M2 flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] flds s6 , [ BO, #8 ] flds s7 , [ BO, #12 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 fmacs s12 , s0, s6 KMAC_R s12 , s1, s7 fmacs s13 , s0, s7 KMAC_I s13 , s1, s6 add BO , BO, #16 add AO , AO, #8 .endm .macro KERNEL1x2_E flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] flds s6 , [ BO, #8 ] flds s7 , [ BO, #12 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 fmacs s12 , s0, s6 KMAC_R s12 , s1, s7 fmacs s13 , s0, s7 KMAC_I s13 , s1, s6 add BO , BO, #16 add AO , AO, #8 .endm .macro KERNEL1x2_SUB flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] flds s6 , [ BO, #8 ] flds s7 , [ BO, #12 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 fmacs s12 , s0, s6 KMAC_R s12 , s1, s7 fmacs s13 , s0, s7 KMAC_I s13 , s1, s6 add BO , BO, #16 add AO , AO, #8 .endm .macro SAVE1x2 ldr r3 , LDC add CO2 , CO1, r3 flds s0, ALPHA_R flds s1, ALPHA_I flds s4, FP_ZERO vmov.f32 s5, s4 
FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 fstmias CO1, { s4 - s5 } flds s4, FP_ZERO vmov.f32 s5, s4 FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 FMAC_R2 s4 , s1 , s13 FMAC_I2 s5 , s1 , s12 fstmias CO2, { s4 - s5 } add CO1, CO1, #8 .endm /******************************************************************************/ .macro INIT2x1 flds s8 , FP_ZERO vmov.f32 s9 , s8 vmov.f32 s10, s8 vmov.f32 s11, s8 .endm .macro KERNEL2x1_I flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s2 , [ AO, #8 ] flds s3 , [ AO, #12 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] fmuls s8 , s0, s4 KMAC_R s8 , s1, s5 fmuls s9 , s0, s5 KMAC_I s9 , s1, s4 fmuls s10 , s2, s4 KMAC_R s10 , s3, s5 fmuls s11 , s2, s5 KMAC_I s11 , s3, s4 add BO , BO, #8 add AO , AO, #16 .endm .macro KERNEL2x1_M1 flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s2 , [ AO, #8 ] flds s3 , [ AO, #12 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 fmacs s10 , s2, s4 KMAC_R s10 , s3, s5 fmacs s11 , s2, s5 KMAC_I s11 , s3, s4 add BO , BO, #8 add AO , AO, #16 .endm .macro KERNEL2x1_M2 flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s2 , [ AO, #8 ] flds s3 , [ AO, #12 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 fmacs s10 , s2, s4 KMAC_R s10 , s3, s5 fmacs s11 , s2, s5 KMAC_I s11 , s3, s4 add BO , BO, #8 add AO , AO, #16 .endm .macro KERNEL2x1_E flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s2 , [ AO, #8 ] flds s3 , [ AO, #12 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 fmacs s10 , s2, s4 KMAC_R s10 , s3, s5 fmacs s11 , s2, s5 KMAC_I s11 , s3, s4 add BO , BO, #8 add AO , AO, #16 .endm .macro KERNEL2x1_SUB flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s2 , [ AO, #8 ] flds s3 , [ AO, #12 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 fmacs s10 , s2, s4 KMAC_R s10 , s3, s5 fmacs s11 , s2, s5 KMAC_I s11 , s3, s4 add BO , BO, #8 add AO , AO, #16 .endm .macro SAVE2x1 flds s0, ALPHA_R flds s1, ALPHA_I flds s4, FP_ZERO vmov.f32 s5, s4 vmov.f32 s6, s4 vmov.f32 s7, s4 FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 FMAC_R1 s6 , s0 , s10 FMAC_I1 s7 , s0 , s11 FMAC_R2 s6 , s1 , s11 FMAC_I2 s7 , s1 , s10 fstmias CO1, { s4 - s7 } add CO1, CO1, #16 .endm /******************************************************************************/ .macro INIT1x1 flds s8 , FP_ZERO vmov.f32 s9 , s8 .endm .macro KERNEL1x1_I flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] fmuls s8 , s0, s4 KMAC_R s8 , s1, s5 fmuls s9 , s0, s5 KMAC_I s9 , s1, s4 add BO , BO, #8 add AO , AO, #8 .endm .macro KERNEL1x1_M1 flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 add BO , BO, #8 add AO , AO, #8 .endm .macro KERNEL1x1_M2 flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 add BO , BO, #8 add AO , AO, #8 .endm .macro KERNEL1x1_E flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 KMAC_I s9 , s1, s4 add BO , BO, #8 add AO , AO, #8 .endm .macro KERNEL1x1_SUB flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s4 , [ BO ] flds s5 , [ BO, #4 ] fmacs s8 , s0, s4 KMAC_R s8 , s1, s5 fmacs s9 , s0, s5 
KMAC_I s9 , s1, s4 add BO , BO, #8 add AO , AO, #8 .endm .macro SAVE1x1 flds s0, ALPHA_R flds s1, ALPHA_I flds s4, FP_ZERO vmov.f32 s5, s4 FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 FMAC_R2 s4 , s1 , s9 FMAC_I2 s5 , s1 , s8 fstmias CO1, { s4 - s5 } add CO1, CO1, #8 .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack #if !defined(__ARM_PCS_VFP) vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP ldr OLD_A, OLD_A_SOFTFP #endif str OLD_M, M str OLD_N, N str OLD_K, K str OLD_A, A vstr OLD_ALPHA_R, ALPHA_R vstr OLD_ALPHA_I, ALPHA_I sub r3, fp, #128 vstm r3, { s8 - s15} // store floating point registers movs r4, #0 str r4, FP_ZERO str r4, FP_ZERO_1 ldr r3, OLD_LDC lsl r3, r3, #3 // ldc = ldc * 4 * 2 str r3, LDC ldr r3, OFFSET #ifndef LEFT neg r3 , r3 #endif str r3 , KK ldr BC, B ldr J, N asrs J, J, #1 // J = J / 2 ble _L1_BEGIN _L2_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #1 // LDC * 2 add r3 , r4, CO1 str r3 , C // store C #if defined(LEFT) ldr r3 , OFFSET str r3 , KK #endif ldr AO, A // AO = A pld [AO , #A_PRE-64] pld [AO , #A_PRE-32] _L2_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 ble _L2_M1_BEGIN _L2_M2_20: #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #4 // 2 * 4 * 2 float values add BO , BO , r4 lsls r4 , r3 , #4 // 2 * 4 * 2 float values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr K1, K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr K1, K ldr r3, KK sub K1, K1, r3 str K1, KKK #else ldr K1, KK #ifdef LEFT add K1, K1, #2 // number of values in AO #else add K1, K1, #2 // number of values in BO #endif str K1, KKK #endif asrs L , K1, #3 // L = L / 8 cmp L , #3 blt _L2_M2_30 .align 5 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 sub L, L, #2 _L2_M2_22: KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 subs L, L, #1 bgt _L2_M2_22 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b _L2_M2_44 _L2_M2_30: tst L, #3 ble _L2_M2_40 tst L, #2 ble _L2_M2_32 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b _L2_M2_44 _L2_M2_32: tst L, #1 ble _L2_M2_40 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b _L2_M2_44 _L2_M2_40: INIT2x2 _L2_M2_44: ands L , K1, #7 // L = L % 8 ble _L2_M2_100 _L2_M2_46: KERNEL2x2_SUB subs L, L, #1 bne _L2_M2_46 _L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #4 // 2 * 4 * 2 float values add BO , BO , r4 lsls r4 , r3 , #4 // 2 * 4 * 2 float values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #2 // number of values in AO str r3 , KK #endif _L2_M2_END: subs I, I, #1 bne _L2_M2_20 _L2_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 ble _L2_END _L2_M1_20: INIT1x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK 
lsls r4 , r3 , #4 // 2 * 4 * 2 float values add BO , BO , r4 lsls r4 , r3 , #3 // 1 * 4 * 2 float values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr K1, K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr K1, K ldr r3, KK sub K1, K1, r3 str K1, KKK #else ldr K1, KK #ifdef LEFT add K1, K1, #1 // number of values in AO #else add K1, K1, #2 // number of values in BO #endif str K1, KKK #endif asrs L , K1, #3 // L = L / 8 ble _L2_M1_40 _L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs L, L, #1 bgt _L2_M1_22 _L2_M1_40: ands L , K1, #7 // L = L % 8 ble _L2_M1_100 _L2_M1_42: KERNEL1x2_SUB subs L, L, #1 bgt _L2_M1_42 _L2_M1_100: SAVE1x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #4 // 2 * 4 * 2 float values add BO , BO , r4 lsls r4 , r3 , #3 // 1 * 4 * 2 float values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #1 // number of values in AO str r3 , KK #endif _L2_END: mov r3, BC ldr r4, K lsl r4, r4, #4 // k * 2 * 4 * 2 add r3, r3, r4 // B = B + K * 2 * 8 mov BC, r3 #if !defined(LEFT) ldr r3 , KK add r3 , r3 , #2 // number of values in BO str r3 , KK #endif subs J , #1 // j-- bgt _L2_BEGIN /*********************************************************************************************/ _L1_BEGIN: ldr J , N tst J , #1 ble _L999 ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 str r3 , C // store C #if defined(LEFT) ldr r3 , OFFSET str r3 , KK #endif ldr AO, A // AO = A _L1_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 ble _L1_M1_BEGIN _L1_M2_20: #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #3 // 1 * 4 * 2 float values add BO , BO , r4 lsls r4 , r3 , #4 // 2 * 4 * 2 float values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr K1, K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr K1, K ldr r3, KK sub K1, K1, r3 str K1, KKK #else ldr K1, KK #ifdef LEFT add K1, K1, #2 // number of values in AO #else add K1, K1, #1 // number of values in BO #endif str K1, KKK #endif asrs L , K1, #3 // L = L / 8 cmp L , #3 blt _L1_M2_30 .align 5 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 sub L, L, #2 _L1_M2_22: KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 subs L, L, #1 bgt _L1_M2_22 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b _L1_M2_44 _L1_M2_30: tst L, #3 ble _L1_M2_40 tst L, #2 ble _L1_M2_32 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b _L1_M2_44 _L1_M2_32: tst L, #1 ble _L1_M2_40 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b _L1_M2_44 _L1_M2_40: INIT2x1 _L1_M2_44: ands L , K1, #7 // L = L % 8 ble _L1_M2_100 _L1_M2_46: KERNEL2x1_SUB subs L, L, #1 bne _L1_M2_46 _L1_M2_100: SAVE2x1 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #3 // 1 * 4 * 2 float values add BO , BO , r4 lsls r4 , r3 , #4 // 2 * 4 * 2 float values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #2 // 
number of values in AO str r3 , KK #endif _L1_M2_END: subs I, I, #1 bne _L1_M2_20 _L1_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 ble _L1_END _L1_M1_20: INIT1x1 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #3 // 1 * 4 * 2 float values add BO , BO , r4 lsls r4 , r3 , #3 // 1 * 4 * 2 float values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr K1, K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr K1, K ldr r3, KK sub K1, K1, r3 str K1, KKK #else ldr K1, KK #ifdef LEFT add K1, K1, #1 // number of values in AO #else add K1, K1, #1 // number of values in BO #endif str K1, KKK #endif asrs L , K1, #3 // L = L / 8 ble _L1_M1_40 _L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs L, L, #1 bgt _L1_M1_22 _L1_M1_40: ands L , K1, #7 // L = L % 8 ble _L1_M1_100 _L1_M1_42: KERNEL1x1_SUB subs L, L, #1 bgt _L1_M1_42 _L1_M1_100: SAVE1x1 _L1_END: _L999: sub r3, fp, #128 vldm r3, { s8 - s15} // restore floating point registers movs r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/ctrmm_kernel_2x2_vfpv3.S000066400000000000000000000616211313527062700216360ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2013/10/16 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define OLD_M r0 #define OLD_N r1 #define OLD_K r2 #define OLD_A r3 #define OLD_ALPHA_R s0 #define OLD_ALPHA_I s1 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define KKK [fp, #-240] #define KK [fp, #-244 ] #define A [fp, #-248 ] #define LDC [fp, #-252 ] #define M [fp, #-256 ] #define N [fp, #-260 ] #define K [fp, #-264 ] #define FP_ZERO [fp, #-236] #define FP_ZERO_0 [fp, #-236] #define FP_ZERO_1 [fp, #-232] #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] #if !defined(__ARM_PCS_VFP) #define OLD_ALPHAR_SOFTFP r3 #define OLD_ALPHAI_SOFTFP [fp, #4] #define OLD_A_SOFTFP [fp, #8 ] #define B [fp, #12 ] #define C [fp, #16 ] #define OLD_LDC [fp, #20 ] #define OFFSET [fp, #24 ] #else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] #endif #define I r0 #define J r1 #define L r2 #define AO r5 #define BO r6 #define CO1 r8 #define CO2 r9 #define K1 r7 #define BC r12 #define A_PRE 96 #define B_PRE 96 #define C_PRE 64 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define FADD_R fsubs #define FADD_I fadds #define FMAC_R1 vnmul.f32 #define FMAC_R2 vmls.f32 #define FMAC_I1 fmuls #define FMAC_I2 vmls.f32 #elif defined(CN) || defined(CT) #define FADD_R fadds #define FADD_I fsubs #define FMAC_R1 fmuls #define FMAC_R2 fmacs #define FMAC_I1 vnmul.f32 #define FMAC_I2 fmacs #elif defined(NC) || defined(TC) #define FADD_R fadds #define FADD_I fsubs #define FMAC_R1 fmuls #define FMAC_R2 vmls.f32 #define FMAC_I1 fmuls #define FMAC_I2 fmacs #else #define FADD_R fsubs #define FADD_I fadds #define FMAC_R1 vnmul.f32 #define FMAC_R2 fmacs #define FMAC_I1 vnmul.f32 #define FMAC_I2 vmls.f32 #endif /************************************************************************************** * Macro definitions **************************************************************************************/ .macro INIT2x2 flds s16 , FP_ZERO vmov.f32 s17, s16 vmov.f32 s18, s16 vmov.f32 s19, s16 vmov.f32 s20, s16 vmov.f32 s21, s16 vmov.f32 s22, s16 vmov.f32 s23, s16 vmov.f32 s24, s16 vmov.f32 s25, s16 vmov.f32 s26, s16 vmov.f32 s27, s16 vmov.f32 s28, s16 vmov.f32 s29, s16 vmov.f32 s30, s16 vmov.f32 s31, s16 .endm .macro KERNEL2x2_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fldmias AO!, { s0 - s1 } fldmias BO!, { s8 - s9 } fmuls s16 , s0, s8 fmuls s24 , s1, s9 fldmias AO!, { s2 - s3 } fmuls s17 , s0, s9 fmuls s25 , s1, s8 fldmias BO!, { s10 - s11 } fmuls s18 , s2, s8 fmuls s26 , s3, s9 fldmias AO!, { s4 - s5 } fmuls s19 , s2, s9 fmuls s27 , s3, s8 fldmias BO!, { s12 - s13 } fmuls s20 , s0, s10 fmuls s28 , s1, s11 fldmias AO!, { s6 - s7 } fmuls s21 , s0, s11 fmuls s29 , s1, s10 fldmias BO!, { s14 - s15 } fmuls s22 , s2, s10 fmuls s30 , s3, s11 fmuls s23 , s2, s11 fmuls s31 , s3, s10 .endm .macro KERNEL2x2_M1 fmacs s16 , s0, s8 fldmias AO!, { s4 - s5 } fmacs s24 , s1, s9 fmacs s17 , s0, s9 fldmias BO!, { s12 - s13 } fmacs s25 , s1, s8 fmacs s18 , s2, s8 fldmias AO!, { s6 - s7 } fmacs s26 , s3, s9 fmacs s19 , s2, s9 fldmias BO!, { s14 - s15 } fmacs s27 , 
s3, s8 fmacs s20 , s0, s10 fmacs s28 , s1, s11 fmacs s21 , s0, s11 fmacs s29 , s1, s10 fmacs s22 , s2, s10 fmacs s30 , s3, s11 fmacs s23 , s2, s11 fmacs s31 , s3, s10 .endm .macro KERNEL2x2_M2 pld [ AO , #A_PRE ] fmacs s16 , s4, s12 pld [ BO , #B_PRE ] fmacs s24 , s5, s13 fmacs s17 , s4, s13 fldmias AO!, { s0 - s1 } fmacs s25 , s5, s12 fmacs s18 , s6, s12 fmacs s26 , s7, s13 fldmias BO!, { s8 - s9 } fmacs s19 , s6, s13 fmacs s27 , s7, s12 fldmias AO!, { s2 - s3 } fmacs s20 , s4, s14 fmacs s28 , s5, s15 fldmias BO!, { s10 - s11 } fmacs s21 , s4, s15 fmacs s29 , s5, s14 fmacs s22 , s6, s14 fmacs s30 , s7, s15 fmacs s23 , s6, s15 fmacs s31 , s7, s14 .endm .macro KERNEL2x2_E fmacs s16 , s4, s12 fmacs s24 , s5, s13 fmacs s17 , s4, s13 fmacs s25 , s5, s12 fmacs s18 , s6, s12 fmacs s26 , s7, s13 fmacs s19 , s6, s13 fmacs s27 , s7, s12 fmacs s20 , s4, s14 fmacs s28 , s5, s15 fmacs s21 , s4, s15 fmacs s29 , s5, s14 fmacs s22 , s6, s14 fmacs s30 , s7, s15 fmacs s23 , s6, s15 fmacs s31 , s7, s14 .endm .macro KERNEL2x2_SUB fldmias AO!, { s0 - s1 } fldmias BO!, { s8 - s9 } fmacs s16 , s0, s8 fmacs s24 , s1, s9 fldmias AO!, { s2 - s3 } fmacs s17 , s0, s9 fmacs s25 , s1, s8 fldmias BO!, { s10 - s11 } fmacs s18 , s2, s8 fmacs s26 , s3, s9 fmacs s19 , s2, s9 fmacs s27 , s3, s8 fmacs s20 , s0, s10 fmacs s28 , s1, s11 fmacs s21 , s0, s11 fmacs s29 , s1, s10 fmacs s22 , s2, s10 fmacs s30 , s3, s11 fmacs s23 , s2, s11 fmacs s31 , s3, s10 .endm .macro SAVE2x2 ldr r3 , LDC add CO2 , CO1, r3 flds s0, ALPHA_R flds s1, ALPHA_I FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 FADD_R s18, s26 , s18 FADD_I s19, s27 , s19 FADD_R s20, s28 , s20 FADD_I s21, s29 , s21 FADD_R s22, s30 , s22 FADD_I s23, s31 , s23 FMAC_R1 s4 , s0 , s16 FMAC_I1 s5 , s0 , s17 FMAC_R2 s4 , s1 , s17 FMAC_I2 s5 , s1 , s16 FMAC_R1 s6 , s0 , s18 FMAC_I1 s7 , s0 , s19 FMAC_R2 s6 , s1 , s19 FMAC_I2 s7 , s1 , s18 FMAC_R1 s8 , s0 , s20 FMAC_I1 s9 , s0 , s21 FMAC_R2 s8 , s1 , s21 FMAC_I2 s9 , s1 , s20 FMAC_R1 s10, s0 , s22 FMAC_I1 s11, s0 , s23 FMAC_R2 s10, s1 , s23 FMAC_I2 s11, s1 , s22 fstmias CO1, { s4 - s7 } fstmias CO2, { s8 - s11 } add CO1, CO1, #16 .endm /******************************************************************************/ .macro INIT1x2 flds s16 , FP_ZERO vmov.f32 s17, s16 vmov.f32 s20, s16 vmov.f32 s21, s16 vmov.f32 s24, s16 vmov.f32 s25, s16 vmov.f32 s28, s16 vmov.f32 s29, s16 .endm .macro KERNEL1x2_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s8 , [ BO ] flds s9 , [ BO, #4 ] flds s10, [ BO, #8 ] flds s11, [ BO, #12 ] fmuls s16 , s0, s8 fmuls s24 , s1, s9 fmuls s17 , s0, s9 fmuls s25 , s1, s8 fmuls s20 , s0, s10 fmuls s28 , s1, s11 fmuls s21 , s0, s11 fmuls s29 , s1, s10 add BO , BO, #16 add AO , AO, #8 pld [ BO , #B_PRE ] flds s4 , [ AO, #0 ] flds s5 , [ AO, #4 ] flds s12, [ BO ] flds s13, [ BO, #4 ] flds s14, [ BO, #8 ] flds s15, [ BO, #12 ] add BO , BO, #16 add AO , AO, #8 .endm .macro KERNEL1x2_M1 pld [ BO , #B_PRE ] fmacs s16 , s0, s8 fmacs s24 , s1, s9 fmacs s17 , s0, s9 fmacs s25 , s1, s8 fmacs s20 , s0, s10 fmacs s28 , s1, s11 fmacs s21 , s0, s11 fmacs s29 , s1, s10 flds s4 , [ AO, #0 ] flds s5 , [ AO, #4 ] flds s12, [ BO ] flds s13, [ BO, #4 ] flds s14, [ BO, #8 ] flds s15, [ BO, #12 ] add BO , BO, #16 add AO , AO, #8 .endm .macro KERNEL1x2_M2 pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fmacs s16 , s4, s12 fmacs s24 , s5, s13 fmacs s17 , s4, s13 fmacs s25 , s5, s12 fmacs s20 , s4, s14 fmacs s28 , s5, s15 fmacs s21 , s4, s15 fmacs s29 , s5, s14 flds s0 , [ AO, #0 ] flds s1 , [ AO, #4 ] flds s8 
, [ BO ] flds s9 , [ BO, #4 ] flds s10, [ BO, #8 ] flds s11, [ BO, #12 ] add BO , BO, #16 add AO , AO, #8 .endm .macro KERNEL1x2_E fmacs s16 , s4, s12 fmacs s24 , s5, s13 fmacs s17 , s4, s13 fmacs s25 , s5, s12 fmacs s20 , s4, s14 fmacs s28 , s5, s15 fmacs s21 , s4, s15 fmacs s29 , s5, s14 .endm .macro KERNEL1x2_SUB pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s8 , [ BO ] flds s9 , [ BO, #4 ] flds s10, [ BO, #8 ] flds s11, [ BO, #12 ] fmacs s16 , s0, s8 fmacs s24 , s1, s9 fmacs s17 , s0, s9 fmacs s25 , s1, s8 fmacs s20 , s0, s10 fmacs s28 , s1, s11 fmacs s21 , s0, s11 fmacs s29 , s1, s10 add BO , BO, #16 add AO , AO, #8 .endm .macro SAVE1x2 ldr r3 , LDC add CO2 , CO1, r3 flds s0, ALPHA_R flds s1, ALPHA_I FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 FADD_R s20, s28 , s20 FADD_I s21, s29 , s21 FMAC_R1 s4 , s0 , s16 FMAC_I1 s5 , s0 , s17 FMAC_R2 s4 , s1 , s17 FMAC_I2 s5 , s1 , s16 FMAC_R1 s8 , s0 , s20 FMAC_I1 s9 , s0 , s21 FMAC_R2 s8 , s1 , s21 FMAC_I2 s9 , s1 , s20 fstmias CO1, { s4 - s5 } fstmias CO2, { s8 - s9 } add CO1, CO1, #8 .endm /******************************************************************************/ .macro INIT2x1 flds s16 , FP_ZERO vmov.f32 s17, s16 vmov.f32 s18, s16 vmov.f32 s19, s16 vmov.f32 s24, s16 vmov.f32 s25, s16 vmov.f32 s26, s16 vmov.f32 s27, s16 .endm .macro KERNEL2x1_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s2 , [ AO, #8 ] flds s3 , [ AO, #12 ] flds s8 , [ BO ] flds s9 , [ BO, #4 ] fmuls s16 , s0, s8 fmuls s24 , s1, s9 fmuls s17 , s0, s9 fmuls s25 , s1, s8 fmuls s18 , s2, s8 fmuls s26 , s3, s9 fmuls s19 , s2, s9 fmuls s27 , s3, s8 add BO , BO, #8 add AO , AO, #16 pld [ BO , #B_PRE ] pld [ AO , #A_PRE ] flds s4 , [ AO, #0 ] flds s5 , [ AO, #4 ] flds s6 , [ AO, #8 ] flds s7 , [ AO, #12 ] flds s12, [ BO ] flds s13, [ BO, #4 ] add BO , BO, #8 add AO , AO, #16 .endm .macro KERNEL2x1_M1 pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fmacs s16 , s0, s8 fmacs s24 , s1, s9 fmacs s17 , s0, s9 fmacs s25 , s1, s8 fmacs s18 , s2, s8 fmacs s26 , s3, s9 fmacs s19 , s2, s9 fmacs s27 , s3, s8 flds s4 , [ AO, #0 ] flds s5 , [ AO, #4 ] flds s6 , [ AO, #8 ] flds s7 , [ AO, #12 ] flds s12, [ BO ] flds s13, [ BO, #4 ] add BO , BO, #8 add AO , AO, #16 .endm .macro KERNEL2x1_M2 pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fmacs s16 , s4, s12 fmacs s24 , s5, s13 fmacs s17 , s4, s13 fmacs s25 , s5, s12 fmacs s18 , s6, s12 fmacs s26 , s7, s13 fmacs s19 , s6, s13 fmacs s27 , s7, s12 flds s0 , [ AO, #0 ] flds s1 , [ AO, #4 ] flds s2 , [ AO, #8 ] flds s3 , [ AO, #12 ] flds s8 , [ BO ] flds s9 , [ BO, #4 ] add BO , BO, #8 add AO , AO, #16 .endm .macro KERNEL2x1_E fmacs s16 , s4, s12 fmacs s24 , s5, s13 fmacs s17 , s4, s13 fmacs s25 , s5, s12 fmacs s18 , s6, s12 fmacs s26 , s7, s13 fmacs s19 , s6, s13 fmacs s27 , s7, s12 .endm .macro KERNEL2x1_SUB pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s2 , [ AO, #8 ] flds s3 , [ AO, #12 ] flds s8 , [ BO ] flds s9 , [ BO, #4 ] fmacs s16 , s0, s8 fmacs s24 , s1, s9 fmacs s17 , s0, s9 fmacs s25 , s1, s8 fmacs s18 , s2, s8 fmacs s26 , s3, s9 fmacs s19 , s2, s9 fmacs s27 , s3, s8 add BO , BO, #8 add AO , AO, #16 .endm .macro SAVE2x1 flds s0, ALPHA_R flds s1, ALPHA_I FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 FADD_R s18, s26 , s18 FADD_I s19, s27 , s19 FMAC_R1 s4 , s0 , s16 FMAC_I1 s5 , s0 , s17 FMAC_R2 s4 , s1 , s17 FMAC_I2 s5 , s1 , s16 FMAC_R1 s6 , s0 , s18 FMAC_I1 s7 , s0 , s19 FMAC_R2 s6 , s1 , s19 FMAC_I2 s7 , s1 , s18 fstmias CO1, { s4 - s7 } add 
CO1, CO1, #16 .endm /******************************************************************************/ .macro INIT1x1 flds s16 , FP_ZERO vmov.f32 s17, s16 vmov.f32 s24, s16 vmov.f32 s25, s16 .endm .macro KERNEL1x1_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s8 , [ BO ] flds s9 , [ BO, #4 ] fmuls s16 , s0, s8 fmuls s24 , s1, s9 fmuls s17 , s0, s9 fmuls s25 , s1, s8 add BO , BO, #8 add AO , AO, #8 pld [ BO , #B_PRE ] pld [ AO , #A_PRE ] flds s4 , [ AO, #0 ] flds s5 , [ AO, #4 ] flds s12, [ BO ] flds s13, [ BO, #4 ] add BO , BO, #8 add AO , AO, #8 .endm .macro KERNEL1x1_M1 fmacs s16 , s0, s8 fmacs s24 , s1, s9 fmacs s17 , s0, s9 fmacs s25 , s1, s8 flds s4 , [ AO, #0 ] flds s5 , [ AO, #4 ] flds s12, [ BO ] flds s13, [ BO, #4 ] add BO , BO, #8 add AO , AO, #8 .endm .macro KERNEL1x1_M2 fmacs s16 , s4, s12 fmacs s24 , s5, s13 fmacs s17 , s4, s13 fmacs s25 , s5, s12 flds s0 , [ AO, #0 ] flds s1 , [ AO, #4 ] flds s8 , [ BO ] flds s9 , [ BO, #4 ] add BO , BO, #8 add AO , AO, #8 .endm .macro KERNEL1x1_E fmacs s16 , s4, s12 fmacs s24 , s5, s13 fmacs s17 , s4, s13 fmacs s25 , s5, s12 .endm .macro KERNEL1x1_SUB flds s0 , [ AO ] flds s1 , [ AO, #4 ] flds s8 , [ BO ] flds s9 , [ BO, #4 ] fmacs s16 , s0, s8 fmacs s24 , s1, s9 fmacs s17 , s0, s9 fmacs s25 , s1, s8 add BO , BO, #8 add AO , AO, #8 .endm .macro SAVE1x1 flds s0, ALPHA_R flds s1, ALPHA_I FADD_R s16, s24 , s16 FADD_I s17, s25 , s17 FMAC_R1 s4 , s0 , s16 FMAC_I1 s5 , s0 , s17 FMAC_R2 s4 , s1 , s17 FMAC_I2 s5 , s1 , s16 fstmias CO1, { s4 - s5 } add CO1, CO1, #8 .endm /******************************************************************************/ /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack #if !defined(__ARM_PCS_VFP) vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP ldr OLD_A, OLD_A_SOFTFP #endif str OLD_M, M str OLD_N, N str OLD_K, K str OLD_A, A vstr OLD_ALPHA_R, ALPHA_R vstr OLD_ALPHA_I, ALPHA_I sub r3, fp, #128 vstm r3, { s8 - s31} // store floating point registers movs r4, #0 str r4, FP_ZERO str r4, FP_ZERO_1 ldr r3, OLD_LDC lsl r3, r3, #3 // ldc = ldc * 4 * 2 str r3, LDC ldr r3, OFFSET #ifndef LEFT neg r3 , r3 #endif str r3 , KK ldr BC, B ldr J, N asrs J, J, #1 // J = J / 2 ble _L1_BEGIN _L2_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #1 // LDC * 2 add r3 , r4, CO1 str r3 , C // store C #if defined(LEFT) ldr r3 , OFFSET str r3 , KK #endif ldr AO, A // AO = A pld [AO , #A_PRE-64] pld [AO , #A_PRE-32] _L2_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 ble _L2_M1_BEGIN _L2_M2_20: #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #4 // 2 * 4 * 2 float values add BO , BO , r4 lsls r4 , r3 , #4 // 2 * 4 * 2 float values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr K1, K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr K1, K ldr r3, KK sub K1, K1, r3 str K1, KKK #else ldr K1, KK #ifdef LEFT add K1, K1, #2 // number of values in AO #else add K1, K1, #2 // number of values in BO #endif str K1, KKK #endif asrs L , K1, #3 // L = L / 8 cmp L , #3 blt _L2_M2_30 .align 5 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 sub L, L, #2 _L2_M2_22: KERNEL2x2_M1 
KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 subs L, L, #1 bgt _L2_M2_22 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b _L2_M2_44 _L2_M2_30: tst L, #3 ble _L2_M2_40 tst L, #2 ble _L2_M2_32 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b _L2_M2_44 _L2_M2_32: tst L, #1 ble _L2_M2_40 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b _L2_M2_44 _L2_M2_40: INIT2x2 _L2_M2_44: ands L , K1, #7 // L = L % 8 ble _L2_M2_100 _L2_M2_46: KERNEL2x2_SUB subs L, L, #1 bne _L2_M2_46 _L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #4 // 2 * 4 * 2 float values add BO , BO , r4 lsls r4 , r3 , #4 // 2 * 4 * 2 float values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #2 // number of values in AO str r3 , KK #endif _L2_M2_END: subs I, I, #1 bne _L2_M2_20 _L2_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 ble _L2_END _L2_M1_20: INIT1x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #4 // 2 * 4 * 2 float values add BO , BO , r4 lsls r4 , r3 , #3 // 1 * 4 * 2 float values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr K1, K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr K1, K ldr r3, KK sub K1, K1, r3 str K1, KKK #else ldr K1, KK #ifdef LEFT add K1, K1, #1 // number of values in AO #else add K1, K1, #2 // number of values in BO #endif str K1, KKK #endif asrs L , K1, #3 // L = L / 8 ble _L2_M1_40 _L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs L, L, #1 bgt _L2_M1_22 _L2_M1_40: ands L , K1, #7 // L = L % 8 ble _L2_M1_100 _L2_M1_42: KERNEL1x2_SUB subs L, L, #1 bgt _L2_M1_42 _L2_M1_100: SAVE1x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #4 // 2 * 4 * 2 float values add BO , BO , r4 lsls r4 , r3 , #3 // 1 * 4 * 2 float values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #1 // number of values in AO str r3 , KK #endif _L2_END: mov r3, BC ldr r4, K lsl r4, r4, #4 // k * 2 * 4 * 2 add r3, r3, r4 // B = B + K * 2 * 8 mov BC, r3 #if !defined(LEFT) ldr r3 , KK add r3 , r3 , #2 // number of values in BO str r3 , KK #endif subs J , #1 // j-- bgt _L2_BEGIN /*********************************************************************************************/ _L1_BEGIN: ldr J , N tst J , #1 ble _L999 ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 str r3 , C // store C #if defined(LEFT) ldr r3 , OFFSET str r3 , KK #endif ldr AO, A // AO = A _L1_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 ble _L1_M1_BEGIN _L1_M2_20: #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #3 // 1 * 4 * 2 float values add BO , BO , r4 lsls r4 , r3 , #4 // 2 * 4 * 2 float values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr K1, K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr K1, K ldr r3, KK sub K1, K1, r3 str K1, KKK #else ldr K1, KK #ifdef LEFT add K1, K1, #2 // number of values 
in AO #else add K1, K1, #1 // number of values in BO #endif str K1, KKK #endif asrs L , K1, #3 // L = L / 8 cmp L , #3 blt _L1_M2_30 .align 5 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 sub L, L, #2 _L1_M2_22: KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 subs L, L, #1 bgt _L1_M2_22 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b _L1_M2_44 _L1_M2_30: tst L, #3 ble _L1_M2_40 tst L, #2 ble _L1_M2_32 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b _L1_M2_44 _L1_M2_32: tst L, #1 ble _L1_M2_40 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b _L1_M2_44 _L1_M2_40: INIT2x1 _L1_M2_44: ands L , K1, #7 // L = L % 8 ble _L1_M2_100 _L1_M2_46: KERNEL2x1_SUB subs L, L, #1 bne _L1_M2_46 _L1_M2_100: SAVE2x1 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #3 // 1 * 4 * 2 float values add BO , BO , r4 lsls r4 , r3 , #4 // 2 * 4 * 2 float values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #2 // number of values in AO str r3 , KK #endif _L1_M2_END: subs I, I, #1 bne _L1_M2_20 _L1_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 ble _L1_END _L1_M1_20: INIT1x1 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #3 // 1 * 4 * 2 float values add BO , BO , r4 lsls r4 , r3 , #3 // 1 * 4 * 2 float values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr K1, K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr K1, K ldr r3, KK sub K1, K1, r3 str K1, KKK #else ldr K1, KK #ifdef LEFT add K1, K1, #1 // number of values in AO #else add K1, K1, #1 // number of values in BO #endif str K1, KKK #endif asrs L , K1, #3 // L = L / 8 ble _L1_M1_40 _L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs L, L, #1 bgt _L1_M1_22 _L1_M1_40: ands L , K1, #7 // L = L % 8 ble _L1_M1_100 _L1_M1_42: KERNEL1x1_SUB subs L, L, #1 bgt _L1_M1_42 _L1_M1_100: SAVE1x1 _L1_END: _L999: sub r3, fp, #128 vldm r3, { s8 - s31} // restore floating point registers movs r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/dcopy_vfp.S000066400000000000000000000111341313527062700173200ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/11/07 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define N r0 #define X r1 #define INC_X r2 #define OLD_Y r3 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define OLD_INC_Y [fp, #4 ] #define I r5 #define Y r6 #define INC_Y r7 #define X_PRE 256 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro COPY_F4 pld [ X, #X_PRE ] fldmiad X!, { d0 - d3 } fstmiad Y!, { d0 - d3 } .endm .macro COPY_F1 fldmiad X!, { d0 } fstmiad Y!, { d0 } .endm /*************************************************************************************************************************/ .macro COPY_S4 nop fldmiad X, { d0 } fstmiad Y, { d0 } add X, X, INC_X add Y, Y, INC_Y fldmiad X, { d1 } fstmiad Y, { d1 } add X, X, INC_X add Y, Y, INC_Y fldmiad X, { d0 } fstmiad Y, { d0 } add X, X, INC_X add Y, Y, INC_Y fldmiad X, { d1 } fstmiad Y, { d1 } add X, X, INC_X add Y, Y, INC_Y .endm .macro COPY_S1 fldmiad X, { d0 } fstmiad Y, { d0 } add X, X, INC_X add Y, Y, INC_Y .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack sub r4, fp, #128 vstm r4, { d8 - d15} // store floating point registers mov Y, OLD_Y ldr INC_Y, OLD_INC_Y cmp N, #0 ble dcopy_kernel_L999 cmp INC_X, #0 beq dcopy_kernel_L999 cmp INC_Y, #0 beq dcopy_kernel_L999 cmp INC_X, #1 bne dcopy_kernel_S_BEGIN cmp INC_Y, #1 bne dcopy_kernel_S_BEGIN dcopy_kernel_F_BEGIN: asrs I, N, #2 // I = N / 4 ble dcopy_kernel_F1 dcopy_kernel_F4: COPY_F4 subs I, I, #1 bne dcopy_kernel_F4 dcopy_kernel_F1: ands I, N, #3 ble dcopy_kernel_L999 dcopy_kernel_F10: COPY_F1 subs I, I, #1 bne dcopy_kernel_F10 b dcopy_kernel_L999 dcopy_kernel_S_BEGIN: lsl INC_X, INC_X, #3 // INC_X * SIZE lsl INC_Y, INC_Y, #3 // INC_Y * SIZE asrs I, N, #2 // I = N / 4 ble dcopy_kernel_S1 dcopy_kernel_S4: COPY_S4 subs I, I, #1 bne dcopy_kernel_S4 dcopy_kernel_S1: ands I, N, #3 ble dcopy_kernel_L999 dcopy_kernel_S10: COPY_S1 subs I, I, #1 bne dcopy_kernel_S10 dcopy_kernel_L999: sub r3, fp, #128 vldm r3, { d8 - 
d15} // restore floating point registers mov r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/ddot_vfp.S000066400000000000000000000124721313527062700171420ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2013/11/11 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * 2016/01/23 Saar * Bugfix for Refs #750 and #740 **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define N r0 #define X r1 #define INC_X r2 #define OLD_Y r3 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define OLD_INC_Y [fp, #4 ] #define I r5 #define Y r6 #define INC_Y r7 #define X_PRE 512 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro KERNEL_F4 pld [ X, #X_PRE ] fldmiad X!, { d8 } pld [ Y, #X_PRE ] fldmiad Y!, { d4 } fldmiad Y!, { d5 } fmacd d0 , d4, d8 fldmiad X!, { d9 } fldmiad Y!, { d6 } fmacd d1 , d5, d9 fldmiad X!, { d10 } fldmiad X!, { d11 } fmacd d0 , d6, d10 fldmiad Y!, { d7 } fmacd d1 , d7, d11 .endm .macro KERNEL_F1 fldmiad X!, { d4 } fldmiad Y!, { d8 } fmacd d0 , d4, d8 .endm /*************************************************************************************************************************/ .macro KERNEL_S4 nop fldmiad X, { d4 } fldmiad Y, { d8 } add X, X, INC_X add Y, Y, INC_Y fmacd d0 , d4, d8 fldmiad X, { d5 } fldmiad Y, { d9 } add X, X, INC_X add Y, Y, INC_Y fmacd d1 , d5, d9 fldmiad X, { d6 } fldmiad Y, { d10 } add X, X, INC_X add Y, Y, INC_Y fmacd d0 , d6, d10 fldmiad X, { d7 } fldmiad Y, { d11 } add X, X, INC_X add Y, Y, INC_Y fmacd d1 , d7, d11 .endm .macro KERNEL_S1 fldmiad X, { d4 } fldmiad Y, { d8 } add X, X, INC_X fmacd d0 , d4, d8 add Y, Y, INC_Y .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack sub r4, fp, #128 vstm r4, { d8 - d15} // store floating point registers mov Y, OLD_Y ldr INC_Y, OLD_INC_Y movs r4, #0 // clear floating point register vmov s0, r4 vmov s1, r4 vcvt.f64.f32 d0, s0 vcvt.f64.f32 d1, s1 cmp N, #0 ble ddot_kernel_L999 cmp INC_X, #0 beq ddot_kernel_L999 cmp INC_Y, #0 beq ddot_kernel_L999 cmp INC_X, #1 bne ddot_kernel_S_BEGIN cmp INC_Y, #1 bne ddot_kernel_S_BEGIN ddot_kernel_F_BEGIN: asrs I, N, #2 // I = N / 4 ble ddot_kernel_F1 ddot_kernel_F4: KERNEL_F4 subs I, I, #1 ble ddot_kernel_F1 KERNEL_F4 subs I, I, #1 bne ddot_kernel_F4 ddot_kernel_F1: ands I, N, #3 ble ddot_kernel_L999 ddot_kernel_F10: KERNEL_F1 subs I, I, #1 bne ddot_kernel_F10 b ddot_kernel_L999 ddot_kernel_S_BEGIN: lsl INC_X, INC_X, #3 // INC_X * SIZE lsl INC_Y, INC_Y, #3 // INC_Y * SIZE asrs I, N, #2 // I = N / 4 ble ddot_kernel_S1 ddot_kernel_S4: KERNEL_S4 subs I, I, #1 bne ddot_kernel_S4 ddot_kernel_S1: ands I, N, #3 ble ddot_kernel_L999 ddot_kernel_S10: KERNEL_S1 subs I, I, #1 bne ddot_kernel_S10 ddot_kernel_L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers vadd.f64 d0 , d0, d1 // set return value #if !defined(__ARM_PCS_VFP) vmov r0, r1, d0 #endif sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE 
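/**************************************************************************************
 * For orientation: the ddot_vfp.S kernel above keeps two partial sums in d0/d1,
 * walks the vectors either contiguously (F4/F1 paths) or with scaled increments
 * (S4/S1 paths), and returns d0 + d1. The C sketch below is only a rough reference
 * for what that assembly computes with general increments; it is not part of the
 * OpenBLAS source tree, and the name ddot_ref plus the use of plain double/long in
 * place of the FLOAT/BLASLONG macros are illustrative assumptions.
 **************************************************************************************/
/* Reference sketch of a strided double-precision dot product. */
double ddot_ref(long n, const double *x, long inc_x, const double *y, long inc_y)
{
    double sum0 = 0.0, sum1 = 0.0;   /* two accumulators, like d0/d1 in the kernel */
    long i = 0, ix = 0, iy = 0;

    /* the assembly kernel also returns 0 for n <= 0 or a zero increment */
    if (n <= 0 || inc_x == 0 || inc_y == 0) return 0.0;

    for (; i + 1 < n; i += 2) {      /* pairwise accumulation */
        sum0 += x[ix] * y[iy];
        sum1 += x[ix + inc_x] * y[iy + inc_y];
        ix += 2 * inc_x;
        iy += 2 * inc_y;
    }
    if (i < n)                       /* tail element */
        sum0 += x[ix] * y[iy];

    return sum0 + sum1;              /* corresponds to the final vadd.f64 d0, d0, d1 */
}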
OpenBLAS-0.2.20/kernel/arm/dgemm_kernel_4x2_vfp.S000066400000000000000000000312131313527062700213300ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/11/27 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define OLD_M r0 #define OLD_N r1 #define OLD_K r2 #define OLD_A r3 #define OLD_ALPHA d0 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define LDC [fp, #-252 ] #define M [fp, #-256 ] #define N [fp, #-260 ] #define K [fp, #-264 ] #define A [fp, #-268 ] #define FP_ZERO [fp, #-240] #define FP_ZERO_0 [fp, # -240] #define FP_ZERO_1 [fp, # -236] #define ALPHA [fp, #-280] #if !defined(__ARM_PCS_VFP) #define OLD_ALPHA_SOFTFP [fp, #4] #define OLD_A_SOFTFP [fp, #12 ] #define B [fp, #16 ] #define C [fp, #20 ] #define OLD_LDC [fp, #24 ] #else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #endif #define I r0 #define J r1 #define L r2 #define AO r5 #define BO r6 #define CO1 r8 #define CO2 r9 #define K1 r7 #define BC r12 #define A_PRE 96 #define B_PRE 96 #define C_PRE 32 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro INIT4x2 fldd d8, FP_ZERO vmov.f64 d9, d8 vmov.f64 d10, d8 vmov.f64 d11, d8 vmov.f64 d12, d8 vmov.f64 d13, d8 vmov.f64 d14, d8 vmov.f64 d15, d8 .endm .macro KERNEL4x2_SUB pld [ AO, #A_PRE ] fldd d4 , [ BO ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fmacd d8 , d0, d4 fldd d2 , [ AO, #16 ] fmacd d9 , d1, d4 fldd d3 , [ AO, #24 ] fmacd d10 , 
d2, d4 fldd d5 , [ BO, #8 ] fmacd d11 , d3, d4 fmacd d12 , d0, d5 fmacd d13 , d1, d5 add AO , AO, #32 fmacd d14 , d2, d5 add BO , BO, #16 fmacd d15 , d3, d5 .endm .macro SAVE4x2 ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA fldd d4 , [CO1] fldd d5 , [CO1, #8 ] pld [ CO1, #C_PRE ] fmacd d4 , d0 , d8 fldd d6 , [CO1, #16 ] fmacd d5 , d0 , d9 fldd d7 , [CO1, #24 ] fmacd d6 , d0 , d10 fstd d4 , [CO1] fmacd d7 , d0 , d11 fstd d5 , [CO1, #8 ] fstd d6 , [CO1, #16 ] fstd d7 , [CO1, #24 ] fldd d4 , [CO2] fldd d5 , [CO2, #8 ] pld [ CO2, #C_PRE ] fmacd d4 , d0 , d12 fldd d6 , [CO2, #16 ] fmacd d5 , d0 , d13 fldd d7 , [CO2, #24 ] fmacd d6 , d0 , d14 fstd d4 , [CO2] fmacd d7 , d0 , d15 add CO1, CO1, #32 fstd d5 , [CO2, #8 ] fstd d6 , [CO2, #16 ] fstd d7 , [CO2, #24 ] .endm /******************************************************************************/ .macro INIT2x2 fldd d8, FP_ZERO vmov.f64 d9, d8 vmov.f64 d12, d8 vmov.f64 d13, d8 .endm .macro KERNEL2x2_SUB fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fmacd d8 , d0, d4 fmacd d9 , d1, d4 fmacd d12 , d0, d5 fmacd d13 , d1, d5 add AO , AO, #16 add BO , BO, #16 .endm .macro SAVE2x2 ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA fldd d4 , [CO1] fldd d5 , [CO1, #8 ] fmacd d4 , d0 , d8 fmacd d5 , d0 , d9 fstd d4 , [CO1] fstd d5 , [CO1, #8 ] fldd d4 , [CO2] fldd d5 , [CO2, #8 ] fmacd d4 , d0 , d12 fmacd d5 , d0 , d13 fstd d4 , [CO2] fstd d5 , [CO2, #8 ] add CO1, CO1, #16 .endm /******************************************************************************/ .macro INIT1x2 fldd d8, FP_ZERO vmov.f64 d12, d8 .endm .macro KERNEL1x2_SUB fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fldd d0 , [ AO ] fmacd d8 , d0, d4 fmacd d12 , d0, d5 add AO , AO, #8 add BO , BO, #16 .endm .macro SAVE1x2 ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA fldd d4 , [CO1] fmacd d4 , d0 , d8 fstd d4 , [CO1] fldd d4 , [CO2] fmacd d4 , d0 , d12 fstd d4 , [CO2] add CO1, CO1, #8 .endm /******************************************************************************/ .macro INIT4x1 fldd d8, FP_ZERO vmov.f64 d9, d8 vmov.f64 d10, d8 vmov.f64 d11, d8 .endm .macro KERNEL4x1_SUB fldd d4 , [ BO ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fmacd d8 , d0, d4 fmacd d9 , d1, d4 fmacd d10 , d2, d4 fmacd d11 , d3, d4 add AO , AO, #32 add BO , BO, #8 .endm .macro SAVE4x1 fldd d0, ALPHA fldd d4 , [CO1] fldd d5 , [CO1, #8 ] fldd d6 , [CO1, #16 ] fldd d7 , [CO1, #24 ] fmacd d4 , d0 , d8 fmacd d5 , d0 , d9 fmacd d6 , d0 , d10 fmacd d7 , d0 , d11 fstd d4 , [CO1] fstd d5 , [CO1, #8 ] fstd d6 , [CO1, #16 ] fstd d7 , [CO1, #24 ] add CO1, CO1, #32 .endm /******************************************************************************/ .macro INIT2x1 fldd d8, FP_ZERO vmov.f64 d9 , d8 .endm .macro KERNEL2x1_SUB fldd d4 , [ BO ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fmacd d8 , d0, d4 fmacd d9 , d1, d4 add AO , AO, #16 add BO , BO, #8 .endm .macro SAVE2x1 fldd d0, ALPHA fldd d4 , [CO1] fldd d5 , [CO1, #8 ] fmacd d4 , d0 , d8 fmacd d5 , d0 , d9 fstd d4 , [CO1] fstd d5 , [CO1, #8 ] add CO1, CO1, #16 .endm /******************************************************************************/ .macro INIT1x1 fldd d8, FP_ZERO .endm .macro KERNEL1x1_SUB fldd d4 , [ BO ] fldd d0 , [ AO ] fmacd d8 , d0, d4 add AO , AO, #8 add BO , BO, #8 .endm .macro SAVE1x1 fldd d0, ALPHA fldd d4 , [CO1] fmacd d4 , d0 , d8 fstd d4 , [CO1] add CO1, CO1, #8 .endm /************************************************************************************** * End of macro definitions 
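*
* Summary of the macros above: INITmxn clears the accumulator registers for
* an m-row by n-column tile of C, KERNELmxn_SUB folds one k-step of the
* packed A and B panels into those accumulators, and SAVEmxn multiplies the
* accumulators by ALPHA, adds them to the C tile addressed through CO1/CO2
* and advances CO1 to the next tile.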
**************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack #if !defined(__ARM_PCS_VFP) vldr OLD_ALPHA, OLD_ALPHA_SOFTFP ldr OLD_A, OLD_A_SOFTFP #endif str OLD_M, M str OLD_N, N str OLD_K, K str OLD_A, A vstr OLD_ALPHA, ALPHA sub r3, fp, #128 vstm r3, { d8 - d15} // store floating point registers movs r4, #0 str r4, FP_ZERO str r4, FP_ZERO_1 ldr r3, OLD_LDC lsl r3, r3, #3 // ldc = ldc * 8 str r3, LDC ldr K1, K ldr BC, B ldr J, N asrs J, J, #1 // J = J / 2 ble dgemm_kernel_L1_BEGIN /*********************************************************************************************/ dgemm_kernel_L2_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #1 // LDC * 2 add r3 , r4, CO1 str r3 , C // store C ldr AO, A // AO = A dgemm_kernel_L2_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 ble dgemm_kernel_L2_M2_BEGIN dgemm_kernel_L2_M4_20: INIT4x2 mov BO, BC asrs L , K1, #3 // L = L / 8 ble dgemm_kernel_L2_M4_40 .align 5 dgemm_kernel_L2_M4_22: pld [ BO, #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB pld [ BO, #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB pld [ BO, #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB pld [ BO, #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB subs L, L, #1 bgt dgemm_kernel_L2_M4_22 dgemm_kernel_L2_M4_40: ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L2_M4_100 dgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs L, L, #1 bgt dgemm_kernel_L2_M4_42 dgemm_kernel_L2_M4_100: SAVE4x2 dgemm_kernel_L2_M4_END: subs I, I, #1 bgt dgemm_kernel_L2_M4_20 dgemm_kernel_L2_M2_BEGIN: ldr I, M tst I , #3 ble dgemm_kernel_L2_END tst I, #2 // I = I / 2 ble dgemm_kernel_L2_M1_BEGIN dgemm_kernel_L2_M2_20: INIT2x2 mov BO, BC asrs L , K1, #3 // L = L / 8 ble dgemm_kernel_L2_M2_40 dgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs L, L, #1 bgt dgemm_kernel_L2_M2_22 dgemm_kernel_L2_M2_40: ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L2_M2_100 dgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs L, L, #1 bgt dgemm_kernel_L2_M2_42 dgemm_kernel_L2_M2_100: SAVE2x2 dgemm_kernel_L2_M2_END: dgemm_kernel_L2_M1_BEGIN: tst I, #1 // I = I % 2 ble dgemm_kernel_L2_END dgemm_kernel_L2_M1_20: INIT1x2 mov BO, BC asrs L , K1, #3 // L = L / 8 ble dgemm_kernel_L2_M1_40 dgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs L, L, #1 bgt dgemm_kernel_L2_M1_22 dgemm_kernel_L2_M1_40: ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L2_M1_100 dgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs L, L, #1 bgt dgemm_kernel_L2_M1_42 dgemm_kernel_L2_M1_100: SAVE1x2 dgemm_kernel_L2_END: mov r3, BC mov r4, K1 lsl r4, r4, #4 // k * 2 * 8 add r3, r3, r4 // B = B + K * 2 * 8 mov BC, r3 subs J , #1 // j-- bgt dgemm_kernel_L2_BEGIN /*********************************************************************************************/ dgemm_kernel_L1_BEGIN: ldr J , N tst J , #1 ble dgemm_kernel_L999 ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 str r3 , C // store C ldr AO, A // AO = A dgemm_kernel_L1_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 ble dgemm_kernel_L1_M2_BEGIN dgemm_kernel_L1_M4_20: INIT4x1 mov BO, BC asrs L , K1, #3 // L = L / 8 ble dgemm_kernel_L1_M4_40 .align 5 dgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs L, L, #1 bgt dgemm_kernel_L1_M4_22 dgemm_kernel_L1_M4_40: ands L , K1, #7 // L = L % 8 ble 
dgemm_kernel_L1_M4_100 dgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs L, L, #1 bgt dgemm_kernel_L1_M4_42 dgemm_kernel_L1_M4_100: SAVE4x1 dgemm_kernel_L1_M4_END: subs I, I, #1 bgt dgemm_kernel_L1_M4_20 dgemm_kernel_L1_M2_BEGIN: ldr I, M tst I , #3 ble dgemm_kernel_L1_END tst I, #2 // I = I / 2 ble dgemm_kernel_L1_M1_BEGIN dgemm_kernel_L1_M2_20: INIT2x1 mov BO, BC asrs L , K1, #3 // L = L / 8 ble dgemm_kernel_L1_M2_40 dgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs L, L, #1 bgt dgemm_kernel_L1_M2_22 dgemm_kernel_L1_M2_40: ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L1_M2_100 dgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs L, L, #1 bgt dgemm_kernel_L1_M2_42 dgemm_kernel_L1_M2_100: SAVE2x1 dgemm_kernel_L1_M2_END: dgemm_kernel_L1_M1_BEGIN: tst I, #1 // I = I % 2 ble dgemm_kernel_L1_END dgemm_kernel_L1_M1_20: INIT1x1 mov BO, BC asrs L , K1, #3 // L = L / 8 ble dgemm_kernel_L1_M1_40 dgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs L, L, #1 bgt dgemm_kernel_L1_M1_22 dgemm_kernel_L1_M1_40: ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L1_M1_100 dgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs L, L, #1 bgt dgemm_kernel_L1_M1_42 dgemm_kernel_L1_M1_100: SAVE1x1 dgemm_kernel_L1_END: dgemm_kernel_L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers movs r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/dgemm_kernel_4x4_vfpv3.S000066400000000000000000000547411313527062700216160ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2013/11/23 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * * 2013/10/11 Saar * UNROLL_N 4 * UNROLL_M 4 * DGEMM_P 128 * DGEMM_Q 96 * DGEMM_R 512 * A_PRE 96 * B_PRE 96 * C_PRE 64 * * Performance on Odroid U2: * * 1 Core: 1.57 GFLOPS ATLAS: 1.59 GFLOPS * 2 Cores: 3.14 GFLOPS ATLAS: 3.16 GFLOPS * 3 Cores: 4.56 GFLOPS ATLAS: 4.60 GFLOPS * 4 Cores: 5.82 GFLOPS ATLAS: 5.41 GFLOPS **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define OLD_M r0 #define OLD_N r1 #define OLD_K r2 #define OLD_A r3 #define OLD_ALPHA d0 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define LDC [fp, #-252 ] #define M [fp, #-256 ] #define N [fp, #-260 ] #define K [fp, #-264 ] #define A [fp, #-268 ] #define FP_ZERO [fp, #-240] #define FP_ZERO_0 [fp, # -240] #define FP_ZERO_1 [fp, # -236] #define ALPHA [fp, #-280] #if !defined(__ARM_PCS_VFP) #define OLD_ALPHA_SOFTFP [fp, #4] #define OLD_A_SOFTFP [fp, #12 ] #define B [fp, #16 ] #define C [fp, #20 ] #define OLD_LDC [fp, #24 ] #else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #endif #define I r0 #define J r1 #define L r2 #define AO r5 #define BO r6 #define CO1 r8 #define CO2 r9 #define K1 r7 #define BC r12 #define A_PRE 96 #define B_PRE 96 #define C_PRE 64 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro INIT4x4 fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d18, d16 vmov.f64 d19, d16 vmov.f64 d20, d16 vmov.f64 d21, d16 vmov.f64 d22, d16 vmov.f64 d23, d16 vmov.f64 d24, d16 vmov.f64 d25, d16 vmov.f64 d26, d16 vmov.f64 d27, d16 vmov.f64 d28, d16 vmov.f64 d29, d16 vmov.f64 d30, d16 vmov.f64 d31, d16 .endm .macro KERNEL4x4_I pld [ BO , #B_PRE ] fldd d8 , [ BO ] fldd d0 , [ AO ] pld [ AO , #A_PRE ] fldd d1 , [ AO, #8 ] fmuld d16 , d0, d8 fldd d2 , [ AO, #16 ] fmuld d17 , d1, d8 fldd d3 , [ AO, #24 ] fmuld d18 , d2, d8 fldd d9 , [ BO, #8 ] fmuld d19 , d3, d8 fldd d10, [ BO, #16 ] fmuld d20 , d0, d9 fldd d11, [ BO, #24 ] fmuld d21 , d1, d9 add BO , BO, #32 add AO , AO, #32 fmuld d22 , d2, d9 pld [ BO , #B_PRE ] fldd d12, [ BO ] fmuld d23 , d3, d9 pld [ AO , #A_PRE ] fldd d4 , [ AO, #0 ] fmuld d24 , d0, d10 fldd d5 , [ AO, #8 ] fmuld d25 , d1, d10 fldd d6 , [ AO, #16 ] fmuld d26 , d2, d10 fldd d7 , [ AO, #24 ] fmuld d27 , d3, d10 fldd d13, [ BO, #8 ] fmuld d28 , d0, d11 fldd d14, [ BO, #16 ] fmuld d29 , d1, d11 fldd d15, [ BO, #24 ] fmuld d30 , d2, d11 fmuld d31 , d3, d11 .endm .macro KERNEL4x4_M2 fmacd d16 , d4, d12 pld [ AO , #A_PRE+32 ] fmacd d17 , d5, d12 fldd d0 , [ AO , #32 ] fmacd d18 , d6, d12 pld [ BO , #B_PRE+32 ] fmacd d19 , d7, d12 fldd d8 , [ BO , #32 ] fmacd d20 , d4, d13 fldd d1 , [ AO, #40 ] fmacd d21 , d5, d13 fldd d2 , [ AO, #48 ] fmacd d22 , d6, d13 fldd d3 , [ AO, #56 ] fmacd d23 , d7, d13 fmacd d24 , d4, d14 fmacd d25 , d5, d14 fldd d9 , [ BO, #40 ] fmacd d26 , d6, d14 fldd d10, [ BO, #48 ] fmacd d27 , d7, d14 fldd d11, [ BO, #56 ] fmacd d28 , d4, d15 fmacd d29 , d5, d15 add AO , AO, #64 fmacd d30 , d6, d15 add BO , BO, #64 fmacd d31 , d7, d15 .endm .macro KERNEL4x4_M1 
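// KERNEL4x4_M1 and KERNEL4x4_M2 form a two-stage software pipeline: M1 (below)
// issues fmacd on the A/B values already held in d0-d3 / d8-d11 while it
// preloads the next 4x4 slice into d4-d7 / d12-d15; M2 is the mirror image
// (compute on d4-d7 / d12-d15, reload d0-d3 / d8-d11) and advances AO and BO
// past both slices. KERNEL4x4_I primes the pipeline with fmuld instead of
// fmacd, and KERNEL4x4_E drains the second register set without further loads.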
fmacd d16 , d0, d8 pld [ AO , #A_PRE ] fmacd d17 , d1, d8 fldd d4 , [ AO ] fmacd d18 , d2, d8 pld [ BO , #B_PRE ] fmacd d19 , d3, d8 fldd d12, [ BO ] fmacd d20 , d0, d9 fldd d5 , [ AO, #8 ] fmacd d21 , d1, d9 fldd d6 , [ AO, #16 ] fmacd d22 , d2, d9 fldd d7 , [ AO, #24 ] fmacd d23 , d3, d9 fmacd d24 , d0, d10 fmacd d25 , d1, d10 fldd d13, [ BO, #8 ] fmacd d26 , d2, d10 fldd d14, [ BO, #16 ] fmacd d27 , d3, d10 fldd d15, [ BO, #24 ] fmacd d28 , d0, d11 fmacd d29 , d1, d11 fmacd d30 , d2, d11 fmacd d31 , d3, d11 .endm .macro KERNEL4x4_E fmacd d16 , d4, d12 fmacd d17 , d5, d12 add BO , BO, #32 add AO , AO, #32 fmacd d18 , d6, d12 fmacd d19 , d7, d12 fmacd d20 , d4, d13 fmacd d21 , d5, d13 fmacd d22 , d6, d13 fmacd d23 , d7, d13 fmacd d24 , d4, d14 fmacd d25 , d5, d14 fmacd d26 , d6, d14 fmacd d27 , d7, d14 fmacd d28 , d4, d15 fmacd d29 , d5, d15 fmacd d30 , d6, d15 fmacd d31 , d7, d15 .endm .macro KERNEL4x4_SUB fldd d8 , [ BO ] pld [ BO , #B_PRE ] fldd d0 , [ AO ] pld [ AO , #A_PRE ] fldd d1 , [ AO, #8 ] fmacd d16 , d0, d8 fldd d2 , [ AO, #16 ] fmacd d17 , d1, d8 fldd d3 , [ AO, #24 ] fmacd d18 , d2, d8 fldd d9 , [ BO, #8 ] fmacd d19 , d3, d8 fldd d10, [ BO, #16 ] fmacd d20 , d0, d9 fldd d11, [ BO, #24 ] fmacd d21 , d1, d9 fmacd d22 , d2, d9 fmacd d23 , d3, d9 fmacd d24 , d0, d10 fmacd d25 , d1, d10 fmacd d26 , d2, d10 fmacd d27 , d3, d10 fmacd d28 , d0, d11 fmacd d29 , d1, d11 add AO , AO, #32 fmacd d30 , d2, d11 add BO , BO, #32 fmacd d31 , d3, d11 .endm .macro SAVE4x4 pld [ CO1 , #C_PRE ] ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA add r4 , CO2, r3 pld [ CO2 , #C_PRE ] fldmiad CO1, { d8 - d11 } pld [ r4 , #C_PRE ] fmacd d8 , d0 , d16 fldd d12, [CO2] fmacd d9 , d0 , d17 fldd d13, [CO2, #8 ] fmacd d10, d0 , d18 fldd d14, [CO2, #16 ] fmacd d11, d0 , d19 fldd d15, [CO2, #24 ] fmacd d12, d0 , d20 fstd d8 , [CO1] fmacd d13, d0 , d21 fstd d9 , [CO1, #8 ] fmacd d14, d0 , d22 fstd d10, [CO1, #16 ] fmacd d15, d0 , d23 fstd d11, [CO1, #24 ] fldmiad r4, { d8 - d11 } fmacd d8 , d0 , d24 fstd d12, [CO2] fmacd d9 , d0 , d25 fstd d13, [CO2, #8 ] fmacd d10, d0 , d26 fstd d14, [CO2, #16 ] fmacd d11, d0 , d27 fstd d15, [CO2, #24 ] add CO2, r4 , r3 pld [ CO2 , #C_PRE ] fldmiad CO2, { d12 - d15 } fstd d8 , [r4 ] fmacd d12, d0 , d28 fstd d9 , [r4 , #8 ] fmacd d13, d0 , d29 fstd d10, [r4 , #16 ] fmacd d14, d0 , d30 fstd d11, [r4 , #24 ] fmacd d15, d0 , d31 fstmiad CO2, { d12 - d15 } add CO1, CO1, #32 .endm /******************************************************************************/ .macro INIT2x4 fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d20, d16 vmov.f64 d21, d16 vmov.f64 d24, d16 vmov.f64 d25, d16 vmov.f64 d28, d16 vmov.f64 d29, d16 .endm .macro KERNEL2x4_SUB fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fldd d10, [ BO, #16 ] fldd d11, [ BO, #24 ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fmacd d16 , d0, d8 fmacd d17 , d1, d8 fmacd d20 , d0, d9 fmacd d21 , d1, d9 fmacd d24 , d0, d10 fmacd d25 , d1, d10 fmacd d28 , d0, d11 fmacd d29 , d1, d11 add AO , AO, #16 add BO , BO, #32 .endm .macro SAVE2x4 ldr r3 , LDC add CO2 , CO1, r3 add r4 , CO2, r3 fldd d0, ALPHA fldd d8 , [CO1] fldd d9 , [CO1, #8 ] fmacd d8 , d0 , d16 fmacd d9 , d0 , d17 fstd d8 , [CO1] fstd d9 , [CO1, #8 ] fldd d12, [CO2] fldd d13, [CO2, #8 ] fmacd d12, d0 , d20 fmacd d13, d0 , d21 fstd d12, [CO2] fstd d13, [CO2, #8 ] fldd d8 , [r4 ] fldd d9 , [r4 , #8 ] fmacd d8 , d0 , d24 fmacd d9 , d0 , d25 fstd d8 , [r4 ] fstd d9 , [r4 , #8 ] add CO2, r4 , r3 fldd d12, [CO2] fldd d13, [CO2, #8 ] fmacd d12, d0 , d28 fmacd d13, d0 , d29 fstd d12, [CO2] fstd d13, 
[CO2, #8 ] add CO1, CO1, #16 .endm /******************************************************************************/ .macro INIT1x4 fldd d16, FP_ZERO vmov.f64 d20, d16 vmov.f64 d24, d16 vmov.f64 d28, d16 .endm .macro KERNEL1x4_SUB fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fldd d10, [ BO, #16 ] fldd d11, [ BO, #24 ] fldd d0 , [ AO ] fmacd d16 , d0, d8 fmacd d20 , d0, d9 fmacd d24 , d0, d10 fmacd d28 , d0, d11 add AO , AO, #8 add BO , BO, #32 .endm .macro SAVE1x4 ldr r3 , LDC add CO2 , CO1, r3 add r4 , CO2, r3 fldd d0, ALPHA fldd d8 , [CO1] fmacd d8 , d0 , d16 fstd d8 , [CO1] fldd d12, [CO2] fmacd d12, d0 , d20 fstd d12, [CO2] fldd d8 , [r4 ] fmacd d8 , d0 , d24 fstd d8 , [r4 ] add CO2, r4 , r3 fldd d12, [CO2] fmacd d12, d0 , d28 fstd d12, [CO2] add CO1, CO1, #8 .endm /******************************************************************************/ /******************************************************************************/ .macro INIT4x2 fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d18, d16 vmov.f64 d19, d16 vmov.f64 d20, d16 vmov.f64 d21, d16 vmov.f64 d22, d16 vmov.f64 d23, d16 .endm .macro KERNEL4x2_SUB fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fmacd d16 , d0, d8 fmacd d17 , d1, d8 fmacd d18 , d2, d8 fmacd d19 , d3, d8 fmacd d20 , d0, d9 fmacd d21 , d1, d9 fmacd d22 , d2, d9 fmacd d23 , d3, d9 add AO , AO, #32 add BO , BO, #16 .endm .macro SAVE4x2 ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA fldd d8 , [CO1] fldd d9 , [CO1, #8 ] fldd d10, [CO1, #16 ] fldd d11, [CO1, #24 ] fmacd d8 , d0 , d16 fmacd d9 , d0 , d17 fmacd d10, d0 , d18 fmacd d11, d0 , d19 fstd d8 , [CO1] fstd d9 , [CO1, #8 ] fstd d10, [CO1, #16 ] fstd d11, [CO1, #24 ] fldd d12, [CO2] fldd d13, [CO2, #8 ] fldd d14, [CO2, #16 ] fldd d15, [CO2, #24 ] fmacd d12, d0 , d20 fmacd d13, d0 , d21 fmacd d14, d0 , d22 fmacd d15, d0 , d23 fstd d12, [CO2] fstd d13, [CO2, #8 ] fstd d14, [CO2, #16 ] fstd d15, [CO2, #24 ] add CO1, CO1, #32 .endm /******************************************************************************/ .macro INIT2x2 fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d20, d16 vmov.f64 d21, d16 .endm .macro KERNEL2x2_SUB fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fmacd d16 , d0, d8 fmacd d17 , d1, d8 fmacd d20 , d0, d9 fmacd d21 , d1, d9 add AO , AO, #16 add BO , BO, #16 .endm .macro SAVE2x2 ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA fldd d8 , [CO1] fldd d9 , [CO1, #8 ] fmacd d8 , d0 , d16 fmacd d9 , d0 , d17 fstd d8 , [CO1] fstd d9 , [CO1, #8 ] fldd d12, [CO2] fldd d13, [CO2, #8 ] fmacd d12, d0 , d20 fmacd d13, d0 , d21 fstd d12, [CO2] fstd d13, [CO2, #8 ] add CO1, CO1, #16 .endm /******************************************************************************/ .macro INIT1x2 fldd d16, FP_ZERO vmov.f64 d20, d16 .endm .macro KERNEL1x2_SUB fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fldd d0 , [ AO ] fmacd d16 , d0, d8 fmacd d20 , d0, d9 add AO , AO, #8 add BO , BO, #16 .endm .macro SAVE1x2 ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA fldd d8 , [CO1] fmacd d8 , d0 , d16 fstd d8 , [CO1] fldd d12, [CO2] fmacd d12, d0 , d20 fstd d12, [CO2] add CO1, CO1, #8 .endm /******************************************************************************/ /******************************************************************************/ .macro INIT4x1 fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d18, d16 vmov.f64 d19, d16 .endm .macro KERNEL4x1_SUB fldd d8 , [ BO ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] 
fmacd d16 , d0, d8 fmacd d17 , d1, d8 fmacd d18 , d2, d8 fmacd d19 , d3, d8 add AO , AO, #32 add BO , BO, #8 .endm .macro SAVE4x1 fldd d0, ALPHA fldd d8 , [CO1] fldd d9 , [CO1, #8 ] fldd d10, [CO1, #16 ] fldd d11, [CO1, #24 ] fmacd d8 , d0 , d16 fmacd d9 , d0 , d17 fmacd d10, d0 , d18 fmacd d11, d0 , d19 fstd d8 , [CO1] fstd d9 , [CO1, #8 ] fstd d10, [CO1, #16 ] fstd d11, [CO1, #24 ] add CO1, CO1, #32 .endm /******************************************************************************/ .macro INIT2x1 fldd d16, FP_ZERO vmov.f64 d17, d16 .endm .macro KERNEL2x1_SUB fldd d8 , [ BO ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fmacd d16 , d0, d8 fmacd d17 , d1, d8 add AO , AO, #16 add BO , BO, #8 .endm .macro SAVE2x1 fldd d0, ALPHA fldd d8 , [CO1] fldd d9 , [CO1, #8 ] fmacd d8 , d0 , d16 fmacd d9 , d0 , d17 fstd d8 , [CO1] fstd d9 , [CO1, #8 ] add CO1, CO1, #16 .endm /******************************************************************************/ .macro INIT1x1 fldd d16, FP_ZERO .endm .macro KERNEL1x1_SUB fldd d8 , [ BO ] fldd d0 , [ AO ] fmacd d16 , d0, d8 add AO , AO, #8 add BO , BO, #8 .endm .macro SAVE1x1 fldd d0, ALPHA fldd d8 , [CO1] fmacd d8 , d0 , d16 fstd d8 , [CO1] add CO1, CO1, #8 .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack #if !defined(__ARM_PCS_VFP) vldr OLD_ALPHA, OLD_ALPHA_SOFTFP ldr OLD_A, OLD_A_SOFTFP #endif str OLD_M, M str OLD_N, N str OLD_K, K str OLD_A, A vstr OLD_ALPHA, ALPHA movs r4, #0 str r4, FP_ZERO str r4, FP_ZERO_1 sub r3, fp, #128 vstm r3, { d8 - d15} // store floating point registers ldr r3, OLD_LDC lsl r3, r3, #3 // ldc = ldc * 8 str r3, LDC ldr K1, K ldr BC, B ldr J, N asrs J, J, #2 // J = J / 4 ble dgemm_kernel_L2_BEGIN dgemm_kernel_L4_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #2 // LDC * 4 add r3 , r4, CO1 str r3 , C // store C ldr AO, A // AO = A pld [AO , #A_PRE-64] pld [AO , #A_PRE-32] dgemm_kernel_L4_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 ble dgemm_kernel_L4_M2_BEGIN dgemm_kernel_L4_M4_20: mov BO, BC asrs L , K1, #3 // L = L / 8 cmp L , #2 blt dgemm_kernel_L4_M4_32 KERNEL4x4_I KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 subs L, L, #2 ble dgemm_kernel_L4_M4_22a .align 5 dgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 subs L, L, #1 bgt dgemm_kernel_L4_M4_22 dgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E b dgemm_kernel_L4_M4_44 dgemm_kernel_L4_M4_32: tst L, #1 ble dgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E b dgemm_kernel_L4_M4_44 dgemm_kernel_L4_M4_40: INIT4x4 dgemm_kernel_L4_M4_44: ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L4_M4_100 dgemm_kernel_L4_M4_46: KERNEL4x4_SUB subs L, L, #1 bne dgemm_kernel_L4_M4_46 dgemm_kernel_L4_M4_100: SAVE4x4 dgemm_kernel_L4_M4_END: subs I, I, #1 bne dgemm_kernel_L4_M4_20 dgemm_kernel_L4_M2_BEGIN: ldr I, M tst I , #3 ble dgemm_kernel_L4_END tst I, #2 // I = I / 2 ble dgemm_kernel_L4_M1_BEGIN dgemm_kernel_L4_M2_20: INIT2x4 mov BO, BC asrs L , K1, #3 // L = L / 8 ble dgemm_kernel_L4_M2_40 dgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB 
KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB subs L, L, #1 bgt dgemm_kernel_L4_M2_22 dgemm_kernel_L4_M2_40: ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L4_M2_100 dgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs L, L, #1 bgt dgemm_kernel_L4_M2_42 dgemm_kernel_L4_M2_100: SAVE2x4 dgemm_kernel_L4_M2_END: dgemm_kernel_L4_M1_BEGIN: tst I, #1 // I = I % 2 ble dgemm_kernel_L4_END dgemm_kernel_L4_M1_20: INIT1x4 mov BO, BC asrs L , K1, #3 // L = L / 8 ble dgemm_kernel_L4_M1_40 dgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs L, L, #1 bgt dgemm_kernel_L4_M1_22 dgemm_kernel_L4_M1_40: ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L4_M1_100 dgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs L, L, #1 bgt dgemm_kernel_L4_M1_42 dgemm_kernel_L4_M1_100: SAVE1x4 dgemm_kernel_L4_END: mov r3, BC mov r4, K1 lsl r4, r4, #5 // k * 4 * 8 add r3, r3, r4 // B = B + K * 4 * 8 mov BC, r3 subs J , #1 // j-- bgt dgemm_kernel_L4_BEGIN /*********************************************************************************************/ dgemm_kernel_L2_BEGIN: ldr J , N tst J , #3 ble dgemm_kernel_L999 tst J , #2 ble dgemm_kernel_L1_BEGIN ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #1 // LDC * 2 add r3 , r4, CO1 str r3 , C // store C ldr AO, A // AO = A dgemm_kernel_L2_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 ble dgemm_kernel_L2_M2_BEGIN dgemm_kernel_L2_M4_20: INIT4x2 mov BO, BC asrs L , K1, #3 // L = L / 8 ble dgemm_kernel_L2_M4_40 .align 5 dgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs L, L, #1 bgt dgemm_kernel_L2_M4_22 dgemm_kernel_L2_M4_40: ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L2_M4_100 dgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs L, L, #1 bgt dgemm_kernel_L2_M4_42 dgemm_kernel_L2_M4_100: SAVE4x2 dgemm_kernel_L2_M4_END: subs I, I, #1 bgt dgemm_kernel_L2_M4_20 dgemm_kernel_L2_M2_BEGIN: ldr I, M tst I , #3 ble dgemm_kernel_L2_END tst I, #2 // I = I / 2 ble dgemm_kernel_L2_M1_BEGIN dgemm_kernel_L2_M2_20: INIT2x2 mov BO, BC asrs L , K1, #3 // L = L / 8 ble dgemm_kernel_L2_M2_40 dgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs L, L, #1 bgt dgemm_kernel_L2_M2_22 dgemm_kernel_L2_M2_40: ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L2_M2_100 dgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs L, L, #1 bgt dgemm_kernel_L2_M2_42 dgemm_kernel_L2_M2_100: SAVE2x2 dgemm_kernel_L2_M2_END: dgemm_kernel_L2_M1_BEGIN: tst I, #1 // I = I % 2 ble dgemm_kernel_L2_END dgemm_kernel_L2_M1_20: INIT1x2 mov BO, BC asrs L , K1, #3 // L = L / 8 ble dgemm_kernel_L2_M1_40 dgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs L, L, #1 bgt dgemm_kernel_L2_M1_22 dgemm_kernel_L2_M1_40: ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L2_M1_100 dgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs L, L, #1 bgt dgemm_kernel_L2_M1_42 dgemm_kernel_L2_M1_100: SAVE1x2 dgemm_kernel_L2_END: mov r3, BC mov r4, K1 lsl r4, r4, #4 // k * 2 * 8 add r3, r3, r4 // B = B + K * 2 * 8 mov BC, r3 /*********************************************************************************************/ dgemm_kernel_L1_BEGIN: ldr J , N tst J , #1 ble dgemm_kernel_L999 ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 str r3 , C // store C ldr AO, A // AO = A dgemm_kernel_L1_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 ble dgemm_kernel_L1_M2_BEGIN 
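// Naming used throughout this kernel: the dgemm_kernel_Lc_Mr_* labels handle an
// r-row by c-column tile of C with the INITrxc / KERNELrxc_SUB / SAVErxc
// macros; the *_22 loops run the K iterations unrolled eight at a time, the
// *_40/_42 (or *_44/_46) labels process the K % 8 remainder one kernel step
// per pass, and *_100 stores the tile via SAVErxc.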
dgemm_kernel_L1_M4_20: INIT4x1 mov BO, BC asrs L , K1, #3 // L = L / 8 ble dgemm_kernel_L1_M4_40 .align 5 dgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs L, L, #1 bgt dgemm_kernel_L1_M4_22 dgemm_kernel_L1_M4_40: ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L1_M4_100 dgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs L, L, #1 bgt dgemm_kernel_L1_M4_42 dgemm_kernel_L1_M4_100: SAVE4x1 dgemm_kernel_L1_M4_END: subs I, I, #1 bgt dgemm_kernel_L1_M4_20 dgemm_kernel_L1_M2_BEGIN: ldr I, M tst I , #3 ble dgemm_kernel_L1_END tst I, #2 // I = I / 2 ble dgemm_kernel_L1_M1_BEGIN dgemm_kernel_L1_M2_20: INIT2x1 mov BO, BC asrs L , K1, #3 // L = L / 8 ble dgemm_kernel_L1_M2_40 dgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs L, L, #1 bgt dgemm_kernel_L1_M2_22 dgemm_kernel_L1_M2_40: ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L1_M2_100 dgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs L, L, #1 bgt dgemm_kernel_L1_M2_42 dgemm_kernel_L1_M2_100: SAVE2x1 dgemm_kernel_L1_M2_END: dgemm_kernel_L1_M1_BEGIN: tst I, #1 // I = I % 2 ble dgemm_kernel_L1_END dgemm_kernel_L1_M1_20: INIT1x1 mov BO, BC asrs L , K1, #3 // L = L / 8 ble dgemm_kernel_L1_M1_40 dgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs L, L, #1 bgt dgemm_kernel_L1_M1_22 dgemm_kernel_L1_M1_40: ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L1_M1_100 dgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs L, L, #1 bgt dgemm_kernel_L1_M1_42 dgemm_kernel_L1_M1_100: SAVE1x1 dgemm_kernel_L1_END: dgemm_kernel_L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers movs r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/dgemm_ncopy_2_vfp.S000066400000000000000000000107441313527062700207320ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/11/24 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define OLD_M r0 #define OLD_N r1 #define OLD_A r2 #define OLD_LDA r3 #define B [fp, #4 ] #define M r0 #define N r1 #define A r2 #define BO r5 #define AO1 r6 #define AO2 r7 #define LDA r8 #define I r3 #define J r12 #define A_PRE 256 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro COPY2x2 fldd d0 , [ AO1, #0 ] fldd d2 , [ AO1, #8 ] fldd d1 , [ AO2, #0 ] fldd d3 , [ AO2, #8 ] add AO1, AO1, #16 fstmiad BO!, { d0 - d3 } add AO2, AO2, #16 .endm .macro COPY1x2 fldd d0 , [ AO1, #0 ] fldd d1 , [ AO2, #0 ] add AO1, AO1, #8 fstmiad BO!, { d0 - d1 } add AO2, AO2, #8 .endm .macro COPY2x1 fldd d0 , [ AO1, #0 ] fldd d1 , [ AO1, #8 ] fstmiad BO!, { d0 - d1 } add AO1, AO1, #16 .endm .macro COPY1x1 fldd d0 , [ AO1, #0 ] fstmiad BO!, { d0 } add AO1, AO1, #8 .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 lsl LDA, OLD_LDA, #3 // lda = lda * 8 ldr BO, B /*********************************************************************************************/ dgemm_ncopy_L2_BEGIN: asrs J, N, #1 // J = N / 2 ble dgemm_ncopy_L1_BEGIN dgemm_ncopy_L2_M2_BEGIN: mov AO1, A // AO1 = A add AO2, AO1, LDA add A , AO2, LDA // A = A + 2 * LDA asrs I, M, #1 // I = M / 2 ble dgemm_ncopy_L2_M2_40 dgemm_ncopy_L2_M2_20: COPY2x2 subs I , I , #1 bne dgemm_ncopy_L2_M2_20 dgemm_ncopy_L2_M2_40: ands I, M , #1 ble dgemm_ncopy_L2_M2_END dgemm_ncopy_L2_M2_60: COPY1x2 subs I , I , #1 bne dgemm_ncopy_L2_M2_60 dgemm_ncopy_L2_M2_END: subs J , J, #1 // j-- bne dgemm_ncopy_L2_M2_BEGIN /*********************************************************************************************/ dgemm_ncopy_L1_BEGIN: tst N, #1 ble dgemm_ncopy_L999 dgemm_ncopy_L1_M2_BEGIN: mov AO1, A // AO1 = A add A , AO1, LDA // A = A + 1 * LDA asrs I, M, #1 // I = M / 2 ble dgemm_ncopy_L1_M2_40 dgemm_ncopy_L1_M2_20: COPY2x1 subs I , I , #1 bne dgemm_ncopy_L1_M2_20 dgemm_ncopy_L1_M2_40: ands I, M , #1 ble dgemm_ncopy_L1_M2_END dgemm_ncopy_L1_M2_60: COPY1x1 subs I , I , #1 bne dgemm_ncopy_L1_M2_60 dgemm_ncopy_L1_M2_END: dgemm_ncopy_L999: movs r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/dgemm_ncopy_4_vfp.S000066400000000000000000000147741313527062700207430ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS 
Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/11/05 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define OLD_M r0 #define OLD_N r1 #define OLD_A r2 #define OLD_LDA r3 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define LDA [fp, #-260 ] #define B [fp, #4 ] #define M r0 #define N r1 #define A r2 #define BO r5 #define AO1 r6 #define AO2 r7 #define AO3 r8 #define AO4 r9 #define I r3 #define J r12 #define A_PRE 256 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro COPY4x4 pld [ AO1, #A_PRE ] pld [ AO2, #A_PRE ] pld [ AO3, #A_PRE ] pld [ AO4, #A_PRE ] fldd d0 , [ AO1, #0 ] fldd d1 , [ AO2, #0 ] fldd d2 , [ AO3, #0 ] fldd d3 , [ AO4, #0 ] fldd d4 , [ AO1, #8 ] fldd d8 , [ AO1, #16 ] fldd d12, [ AO1, #24 ] fldd d5 , [ AO2, #8 ] add AO1, AO1, #32 fldd d9 , [ AO2, #16 ] fldd d13, [ AO2, #24 ] fldd d6 , [ AO3, #8 ] add AO2, AO2, #32 fldd d10, [ AO3, #16 ] fldd d14, [ AO3, #24 ] fldd d7 , [ AO4, #8 ] add AO3, AO3, #32 fldd d11, [ AO4, #16 ] fldd d15, [ AO4, #24 ] fstmiad BO!, { d0 - d3 } add AO4, AO4, #32 fstmiad BO!, { d4 - d7 } fstmiad BO!, { d8 - d15 } .endm .macro COPY1x4 fldd d0 , [ AO1, #0 ] fldd d1 , [ AO2, #0 ] add AO1, AO1, #8 fldd d2 , [ AO3, #0 ] add AO2, AO2, #8 fldd d3 , [ AO4, #0 ] add AO3, AO3, #8 fstmiad BO!, { d0 - d3 } add AO4, AO4, #8 .endm .macro COPY4x2 fldd d0 , [ AO1, #0 ] fldd d2 , [ AO1, #8 ] fldd d4 , [ AO1, #16 ] fldd d6 , [ AO1, #24 ] fldd d1 , [ AO2, #0 ] fldd d3 , [ AO2, #8 ] add AO1, AO1, #32 fldd d5 , [ AO2, #16 ] fldd d7 , [ AO2, #24 ] fstmiad BO!, { d0 - d7 } 
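// The register numbering above (first column in d0,d2,d4,d6 and second
// column in d1,d3,d5,d7) lets the single fstmiad write the two source
// columns into the packed buffer interleaved element by element:
// a(i,0), a(i,1) for i = 0..3.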
add AO2, AO2, #32 .endm .macro COPY1x2 fldd d0 , [ AO1, #0 ] fldd d1 , [ AO2, #0 ] add AO1, AO1, #8 fstmiad BO!, { d0 - d1 } add AO2, AO2, #8 .endm .macro COPY4x1 fldd d0 , [ AO1, #0 ] fldd d1 , [ AO1, #8 ] fldd d2 , [ AO1, #16 ] fldd d3 , [ AO1, #24 ] fstmiad BO!, { d0 - d3 } add AO1, AO1, #32 .endm .macro COPY1x1 fldd d0 , [ AO1, #0 ] fstmiad BO!, { d0 } add AO1, AO1, #8 .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack lsl r3, r3, #3 // lda = lda * 8 str r3, LDA sub r4, fp, #128 vstm r4, { d8 - d15} // store floating point registers ldr BO, B dgemm_ncopy_L4_BEGIN: asrs J, N, #2 // J = N / 4 ble dgemm_ncopy_L2_BEGIN dgemm_ncopy_L4_M4_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA add AO2, AO1, r4 add AO3, AO2, r4 add AO4, AO3, r4 add A , AO4, r4 // A = A + 4 * LDA asrs I, M, #2 // I = M / 4 ble dgemm_ncopy_L4_M4_40 dgemm_ncopy_L4_M4_20: COPY4x4 subs I , I , #1 bne dgemm_ncopy_L4_M4_20 dgemm_ncopy_L4_M4_40: ands I, M , #3 ble dgemm_ncopy_L4_M4_END dgemm_ncopy_L4_M4_60: COPY1x4 subs I , I , #1 bne dgemm_ncopy_L4_M4_60 dgemm_ncopy_L4_M4_END: subs J , J, #1 // j-- bne dgemm_ncopy_L4_M4_BEGIN /*********************************************************************************************/ dgemm_ncopy_L2_BEGIN: tst N, #3 ble dgemm_ncopy_L999 tst N, #2 ble dgemm_ncopy_L1_BEGIN dgemm_ncopy_L2_M4_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA add AO2, AO1, r4 add A , AO2, r4 // A = A + 2 * LDA asrs I, M, #2 // I = M / 4 ble dgemm_ncopy_L2_M4_40 dgemm_ncopy_L2_M4_20: COPY4x2 subs I , I , #1 bne dgemm_ncopy_L2_M4_20 dgemm_ncopy_L2_M4_40: ands I, M , #3 ble dgemm_ncopy_L2_M4_END dgemm_ncopy_L2_M4_60: COPY1x2 subs I , I , #1 bne dgemm_ncopy_L2_M4_60 dgemm_ncopy_L2_M4_END: /*********************************************************************************************/ dgemm_ncopy_L1_BEGIN: tst N, #1 ble dgemm_ncopy_L999 dgemm_ncopy_L1_M4_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA add A , AO1, r4 // A = A + 1 * LDA asrs I, M, #2 // I = M / 4 ble dgemm_ncopy_L1_M4_40 dgemm_ncopy_L1_M4_20: COPY4x1 subs I , I , #1 bne dgemm_ncopy_L1_M4_20 dgemm_ncopy_L1_M4_40: ands I, M , #3 ble dgemm_ncopy_L1_M4_END dgemm_ncopy_L1_M4_60: COPY1x1 subs I , I , #1 bne dgemm_ncopy_L1_M4_60 dgemm_ncopy_L1_M4_END: dgemm_ncopy_L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers movs r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/dgemm_tcopy_4_vfp.S000066400000000000000000000164551313527062700207470ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/11/06 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define OLD_M r0 #define OLD_N r1 #define OLD_A r2 #define OLD_LDA r3 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define B [fp, #4 ] #define A [fp, #-248 ] #define M r0 #define N r1 #define M4 r2 #define LDA r5 #define AO1 r6 #define BO1 r7 #define BO2 r8 #define BO3 r9 #define I r4 #define J r12 #define A_PRE 256 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro COPY4x4 pld [ AO1, #A_PRE ] fldmiad AO1, { d0 - d3 } add r3, AO1, LDA pld [ r3, #A_PRE ] fldmiad r3, { d4 - d7 } add r3, r3, LDA pld [ r3, #A_PRE ] fldmiad r3, { d8 - d11 } add r3, r3, LDA pld [ r3, #A_PRE ] fldmiad r3, { d12 - d15 } fstmiad BO1, { d0 - d15 } add AO1, AO1, #32 add BO1, BO1, M4 .endm .macro COPY2x4 fldmiad AO1, { d0 - d1 } add r3, AO1, LDA fldmiad r3, { d2 - d3 } add r3, r3, LDA fldmiad r3, { d4 - d5 } add r3, r3, LDA fldmiad r3, { d6 - d7 } fstmiad BO2, { d0 - d7 } add AO1, AO1, #16 add BO2, BO2, #64 .endm .macro COPY1x4 fldmiad AO1, { d0 } add r3, AO1, LDA fldmiad r3, { d1 } add r3, r3, LDA fldmiad r3, { d2 } add r3, r3, LDA fldmiad r3, { d3 } fstmiad BO3, { d0 - d3 } add AO1, AO1, #8 add BO3, BO3, #32 .endm /*************************************************************************************************************************/ .macro COPY4x2 pld [ AO1, #A_PRE ] fldmiad AO1, { d0 - d3 } add r3, AO1, LDA pld [ r3, #A_PRE ] fldmiad r3, { d4 - d7 } fstmiad BO1, { d0 - d7 } add AO1, AO1, #32 add BO1, BO1, M4 .endm .macro COPY2x2 fldmiad AO1, { d0 - d1 } add r3, AO1, LDA fldmiad r3, { d2 - d3 } fstmiad BO2, { d0 - d3 } add AO1, AO1, #16 add BO2, BO2, #32 .endm .macro COPY1x2 fldmiad AO1, { d0 } add r3, AO1, LDA fldmiad r3, { d1 } fstmiad BO3, { d0 - d1 } add AO1, AO1, #8 add BO3, BO3, #16 .endm /*************************************************************************************************************************/ .macro COPY4x1 pld [ AO1, #A_PRE ] fldmiad AO1, { d0 - d3 } fstmiad BO1, { d0 - d3 } add AO1, 
AO1, #32 add BO1, BO1, M4 .endm .macro COPY2x1 fldmiad AO1, { d0 - d1 } fstmiad BO2, { d0 - d1 } add AO1, AO1, #16 add BO2, BO2, #16 .endm .macro COPY1x1 fldmiad AO1, { d0 } fstmiad BO3, { d0 } add AO1, AO1, #8 add BO3, BO3, #8 .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack str OLD_A, A // store A lsl LDA, OLD_LDA, #3 // lda = lda * SIZE sub r4, fp, #128 vstm r4, { d8 - d15} // store floating point registers lsl r4 , M, #3 // M * SIZE ldr r3, B and BO2 , N , #-4 and BO3 , N , #-2 mul BO2, BO2, r4 mul BO3, BO3, r4 add BO2 , BO2, r3 add BO3 , BO3, r3 lsl M4, M, #5 // M4 = M * 4 * SIZE dgemm_tcopy_L4_BEGIN: asrs J, M, #2 // J = N / 4 ble dgemm_tcopy_L2_BEGIN dgemm_tcopy_L4_M4_BEGIN: ldr AO1, A // AO1 = A lsl r3, LDA, #2 // r3 = 4 * LDA add r3, r3 , AO1 // A = A + 4 * LDA str r3, A // store A ldr BO1, B add r3, BO1, #128 // B = B + 16 * SIZE str r3, B asrs I, N, #2 // I = M / 4 ble dgemm_tcopy_L4_M4_40 dgemm_tcopy_L4_M4_20: COPY4x4 subs I , I , #1 bne dgemm_tcopy_L4_M4_20 dgemm_tcopy_L4_M4_40: tst N , #2 ble dgemm_tcopy_L4_M4_60 COPY2x4 dgemm_tcopy_L4_M4_60: tst N, #1 ble dgemm_tcopy_L4_M4_END COPY1x4 dgemm_tcopy_L4_M4_END: subs J , J, #1 // j-- bne dgemm_tcopy_L4_M4_BEGIN /*********************************************************************************************/ dgemm_tcopy_L2_BEGIN: tst M, #3 ble dgemm_tcopy_L999 tst M, #2 ble dgemm_tcopy_L1_BEGIN dgemm_tcopy_L2_M4_BEGIN: ldr AO1, A // AO1 = A lsl r3, LDA, #1 // r3 = 2 * LDA add r3, r3 , AO1 // A = A + 2 * LDA str r3, A // store A ldr BO1, B add r3, BO1, #64 // B = B + 8 * SIZE str r3, B asrs I, N, #2 // I = M / 4 ble dgemm_tcopy_L2_M4_40 dgemm_tcopy_L2_M4_20: COPY4x2 subs I , I , #1 bne dgemm_tcopy_L2_M4_20 dgemm_tcopy_L2_M4_40: tst N , #2 ble dgemm_tcopy_L2_M4_60 COPY2x2 dgemm_tcopy_L2_M4_60: tst N , #1 ble dgemm_tcopy_L2_M4_END COPY1x2 dgemm_tcopy_L2_M4_END: /*********************************************************************************************/ dgemm_tcopy_L1_BEGIN: tst M, #1 ble dgemm_tcopy_L999 dgemm_tcopy_L1_M4_BEGIN: ldr AO1, A // AO1 = A add r3, LDA , AO1 // A = A + 1 * LDA str r3, A // store A ldr BO1, B add r3, BO1, #32 // B = B + 4 * SIZE str r3, B asrs I, N, #2 // I = M / 4 ble dgemm_tcopy_L1_M4_40 dgemm_tcopy_L1_M4_20: COPY4x1 subs I , I , #1 bne dgemm_tcopy_L1_M4_20 dgemm_tcopy_L1_M4_40: tst N , #2 ble dgemm_tcopy_L1_M4_60 COPY2x1 dgemm_tcopy_L1_M4_60: tst N , #1 ble dgemm_tcopy_L1_M4_END COPY1x1 dgemm_tcopy_L1_M4_END: dgemm_tcopy_L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers mov r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/dot.c000066400000000000000000000044241313527062700161410ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/09/14 Saar * BLASTEST float : OK * BLASTEST double : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #include "common.h" #if defined(DSDOT) double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #else FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #endif { BLASLONG i=0; BLASLONG ix=0,iy=0; double dot = 0.0 ; if ( n < 0 ) return(dot); while(i < n) { dot += y[iy] * x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } return(dot); } OpenBLAS-0.2.20/kernel/arm/dtrmm_kernel_4x2_vfp.S000066400000000000000000000461421313527062700213710ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/11/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 252 #define OLD_M r0 #define OLD_N r1 #define OLD_K r2 #define OLD_A r3 #define OLD_ALPHA d0 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define KK [fp, #-240 ] #define KKK [fp, #-244] #define C [fp, #-248 ] #define LDC [fp, #-252 ] #define M [fp, #-256 ] #define N [fp, #-260 ] #define K [fp, #-264 ] #define A [fp, #-268 ] #define FP_ZERO [fp, #-232] #define FP_ZERO_0 [fp, #-232] #define FP_ZERO_1 [fp, #-228] #define ALPHA [fp, #-276 ] #if !defined(__ARM_PCS_VFP) #define OLD_ALPHA_SOFTFP [fp, #4] #define OLD_A_SOFTFP [fp, #12 ] #define B [fp, #16 ] #define OLD_C [fp, #20 ] #define OLD_LDC [fp, #24 ] #define OFFSET [fp, #28 ] #else #define B [fp, #4 ] #define OLD_C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] #endif #define I r0 #define J r1 #define L r2 #define AO r5 #define BO r6 #define CO1 r8 #define CO2 r9 #define K1 r7 #define BC r12 #define A_PRE 64 #define B_PRE 64 #define C_PRE 64 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro INIT4x2 fldd d8 , FP_ZERO vmov.f64 d9, d8 vmov.f64 d10, d8 vmov.f64 d11, d8 vmov.f64 d12, d8 vmov.f64 d13, d8 vmov.f64 d14, d8 vmov.f64 d15, d8 .endm .macro KERNEL4x2_SUB fldd d4 , [ BO ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] pld [ AO , #A_PRE ] fmacd d8 , d0, d4 fldd d2 , [ AO, #16 ] fmacd d9 , d1, d4 fldd d3 , [ AO, #24 ] fmacd d10 , d2, d4 fldd d5 , [ BO, #8 ] fmacd d11 , d3, d4 fmacd d12 , d0, d5 fmacd d13 , d1, d5 add AO , AO, #32 fmacd d14 , d2, d5 add BO , BO, #16 fmacd d15 , d3, d5 .endm .macro SAVE4x2 ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA fmuld d4 , d0 , d8 fmuld d5 , d0 , d9 fmuld d6 , d0 , d10 fmuld d7 , d0 , d11 fstd d4 , [CO1] fstd d5 , [CO1, #8 ] fstd d6 , [CO1, #16 ] fstd d7 , [CO1, #24 ] fmuld d4 , d0 , d12 fmuld d5 , d0 , d13 fmuld d6 , d0 , d14 fmuld d7 , d0 , d15 fstd d4 , [CO2] fstd d5 , [CO2, #8 ] fstd d6 , [CO2, #16 ] fstd d7 , [CO2, #24 ] add CO1, CO1, #32 .endm /******************************************************************************/ .macro INIT2x2 fldd d8 , FP_ZERO vmov.f64 d9, d8 vmov.f64 d12, d8 vmov.f64 d13, d8 .endm .macro KERNEL2x2_SUB fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fmacd d8 , d0, d4 fmacd d9 , d1, d4 fmacd d12 , d0, d5 fmacd d13 , d1, d5 add AO , AO, #16 add BO , BO, #16 .endm .macro SAVE2x2 ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA fmuld d4 , d0 , d8 fmuld d5 , d0 , 
d9 fstd d4 , [CO1] fstd d5 , [CO1, #8 ] fmuld d4 , d0 , d12 fmuld d5 , d0 , d13 fstd d4 , [CO2] fstd d5 , [CO2, #8 ] add CO1, CO1, #16 .endm /******************************************************************************/ .macro INIT1x2 fldd d8 , FP_ZERO vmov.f64 d12, d8 .endm .macro KERNEL1x2_SUB fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fldd d0 , [ AO ] fmacd d8 , d0, d4 fmacd d12 , d0, d5 add AO , AO, #8 add BO , BO, #16 .endm .macro SAVE1x2 ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA fmuld d4 , d0 , d8 fstd d4 , [CO1] fmuld d4 , d0 , d12 fstd d4 , [CO2] add CO1, CO1, #8 .endm /******************************************************************************/ .macro INIT4x1 fldd d8 , FP_ZERO vmov.f64 d9, d8 vmov.f64 d10, d8 vmov.f64 d11, d8 .endm .macro KERNEL4x1_SUB fldd d4 , [ BO ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fmacd d8 , d0, d4 fmacd d9 , d1, d4 fmacd d10 , d2, d4 fmacd d11 , d3, d4 add AO , AO, #32 add BO , BO, #8 .endm .macro SAVE4x1 fldd d0, ALPHA fmuld d4 , d0 , d8 fmuld d5 , d0 , d9 fmuld d6 , d0 , d10 fmuld d7 , d0 , d11 fstd d4 , [CO1] fstd d5 , [CO1, #8 ] fstd d6 , [CO1, #16 ] fstd d7 , [CO1, #24 ] add CO1, CO1, #32 .endm /******************************************************************************/ .macro INIT2x1 fldd d8 , FP_ZERO vmov.f64 d9 , d8 .endm .macro KERNEL2x1_SUB fldd d4 , [ BO ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fmacd d8 , d0, d4 fmacd d9 , d1, d4 add AO , AO, #16 add BO , BO, #8 .endm .macro SAVE2x1 fldd d0, ALPHA fmuld d4 , d0 , d8 fmuld d5 , d0 , d9 fstd d4 , [CO1] fstd d5 , [CO1, #8 ] add CO1, CO1, #16 .endm /******************************************************************************/ .macro INIT1x1 fldd d8 , FP_ZERO .endm .macro KERNEL1x1_SUB fldd d4 , [ BO ] fldd d0 , [ AO ] fmacd d8 , d0, d4 add AO , AO, #8 add BO , BO, #8 .endm .macro SAVE1x1 fldd d0, ALPHA fmuld d4 , d0 , d8 fstd d4 , [CO1] add CO1, CO1, #8 .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack #if !defined(__ARM_PCS_VFP) vldr OLD_ALPHA, OLD_ALPHA_SOFTFP ldr OLD_A, OLD_A_SOFTFP #endif str OLD_M, M str OLD_N, N str OLD_K, K str OLD_A, A vstr OLD_ALPHA, ALPHA sub r3, fp, #128 vstm r3, { d8 - d15} // store floating point registers movs r4, #0 str r4, FP_ZERO str r4, FP_ZERO_1 ldr r3, OLD_LDC lsl r3, r3, #3 // ldc = ldc * 8 str r3, LDC ldr r3, OLD_C str r3, C ldr BC, B ldr r3, OFFSET #ifndef LEFT neg r3 , r3 #endif str r3 , KK ldr J, N asrs J, J, #1 // J = J / 2 ble _L1_BEGIN _L2_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #1 // LDC * 2 add r3 , r4, CO1 str r3 , C // store C #if defined(LEFT) ldr r3 , OFFSET str r3 , KK #endif ldr AO, A // AO = A _L2_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 ble _L2_M2_BEGIN _L2_M4_20: INIT4x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #4 // 2 double values add BO , BO , r4 lsls r4 , r3 , #5 // 4 double values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr L , K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr L , K ldr r3, KK sub L , L, r3 str L , KKK #else ldr L , KK #ifdef LEFT add L , L , #4 // number of values in AO #else add L , L , #2 // number of values in BO #endif str L , KKK #endif mov K1, L asrs L , K1, #3 
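// TRMM bookkeeping for this 4x2 tile: the #if blocks above either start BO at
// the beginning of the packed B panel or skip KK values in both packed panels
// (2 doubles per value in B, 4 in A), and KKK/K1 hold the number of k-steps
// this tile actually needs, so the multiply below only covers the part of the
// triangular operand that contributes.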
// L = L / 8 ble _L2_M4_40 .align 5 _L2_M4_22: pld [ BO , #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB pld [ BO , #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB pld [ BO , #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB pld [ BO , #B_PRE ] KERNEL4x2_SUB KERNEL4x2_SUB subs L, L, #1 bgt _L2_M4_22 _L2_M4_40: ands L , K1, #7 // L = L % 8 ble _L2_M4_100 _L2_M4_42: KERNEL4x2_SUB subs L, L, #1 bgt _L2_M4_42 _L2_M4_100: SAVE4x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #4 // 2 double values add BO , BO , r4 lsls r4 , r3 , #5 // 4 double values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #4 // number of values in AO str r3 , KK #endif _L2_M4_END: subs I, I, #1 bgt _L2_M4_20 _L2_M2_BEGIN: ldr I, M tst I , #3 ble _L2_END tst I, #2 // I = I / 2 ble _L2_M1_BEGIN _L2_M2_20: INIT2x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #4 // 2 double values add BO , BO , r4 lsls r4 , r3 , #4 // 2 double values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr L , K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr L , K ldr r3, KK sub L , L, r3 str L , KKK #else ldr L , KK #ifdef LEFT add L , L , #2 // number of values in AO #else add L , L , #2 // number of values in BO #endif str L , KKK #endif mov K1, L asrs L , K1, #3 // L = L / 8 ble _L2_M2_40 _L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs L, L, #1 bgt _L2_M2_22 _L2_M2_40: ands L , K1, #7 // L = L % 8 ble _L2_M2_100 _L2_M2_42: KERNEL2x2_SUB subs L, L, #1 bgt _L2_M2_42 _L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #4 // 2 double values add BO , BO , r4 lsls r4 , r3 , #4 // 2 double values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #2 // number of values in AO str r3 , KK #endif _L2_M2_END: _L2_M1_BEGIN: tst I, #1 // I = I % 2 ble _L2_END _L2_M1_20: INIT1x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #4 // 2 double values add BO , BO , r4 lsls r4 , r3 , #3 // 1 double value add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr L , K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr L , K ldr r3, KK sub L , L, r3 str L , KKK #else ldr L , KK #ifdef LEFT add L , L , #1 // number of values in AO #else add L , L , #2 // number of values in BO #endif str L , KKK #endif mov K1, L asrs L , K1, #3 // L = L / 8 ble _L2_M1_40 _L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs L, L, #1 bgt _L2_M1_22 _L2_M1_40: ands L , K1, #7 // L = L % 8 ble _L2_M1_100 _L2_M1_42: KERNEL1x2_SUB subs L, L, #1 bgt _L2_M1_42 _L2_M1_100: SAVE1x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #4 // 2 double values add BO , BO , r4 lsls r4 , r3 , #3 // 1 double value add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #1 // number of values in AO str r3 , KK #endif _L2_END: mov r3, BC ldr r4, K lsl r4, r4, #4 // k * 2 * 8 add r3, r3, r4 // B = B + K * 2 * 8 mov BC, r3 #if !defined(LEFT) ldr r3 , KK add r3 , r3 , #2 // number of values in BO str r3 , KK #endif subs J , #1 // 
j-- bgt _L2_BEGIN /*********************************************************************************************/ _L1_BEGIN: ldr J , N tst J , #1 ble _L999 ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 str r3 , C // store C #if defined(LEFT) ldr r3 , OFFSET str r3 , KK #endif ldr AO, A // AO = A //pld [AO , #A_PRE-96] //pld [AO , #A_PRE-64] //pld [AO , #A_PRE-32] _L1_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 ble _L1_M2_BEGIN _L1_M4_20: INIT4x1 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #3 // 1 double value add BO , BO , r4 lsls r4 , r3 , #5 // 4 double values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr L , K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr L , K ldr r3, KK sub L , L, r3 str L , KKK #else ldr L , KK #ifdef LEFT add L , L , #4 // number of values in AO #else add L , L , #1 // number of values in BO #endif str L , KKK #endif mov K1, L asrs L , K1, #3 // L = L / 8 ble _L1_M4_40 .align 5 _L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs L, L, #1 bgt _L1_M4_22 _L1_M4_40: ands L , K1, #7 // L = L % 8 ble _L1_M4_100 _L1_M4_42: KERNEL4x1_SUB subs L, L, #1 bgt _L1_M4_42 _L1_M4_100: SAVE4x1 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #3 // 1 double value add BO , BO , r4 lsls r4 , r3 , #5 // 4 double values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #4 // number of values in AO str r3 , KK #endif _L1_M4_END: subs I, I, #1 bgt _L1_M4_20 _L1_M2_BEGIN: ldr I, M tst I , #3 ble _L1_END tst I, #2 // I = I / 2 ble _L1_M1_BEGIN _L1_M2_20: INIT2x1 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #3 // 1 double value add BO , BO , r4 lsls r4 , r3 , #4 // 2 double values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr L , K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr L , K ldr r3, KK sub L , L, r3 str L , KKK #else ldr L , KK #ifdef LEFT add L , L , #2 // number of values in AO #else add L , L , #1 // number of values in BO #endif str L , KKK #endif mov K1, L asrs L , K1, #3 // L = L / 8 ble _L1_M2_40 _L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs L, L, #1 bgt _L1_M2_22 _L1_M2_40: ands L , K1, #7 // L = L % 8 ble _L1_M2_100 _L1_M2_42: KERNEL2x1_SUB subs L, L, #1 bgt _L1_M2_42 _L1_M2_100: SAVE2x1 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #3 // 1 double value add BO , BO , r4 lsls r4 , r3 , #4 // 2 double values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #2 // number of values in AO str r3 , KK #endif _L1_M2_END: _L1_M1_BEGIN: tst I, #1 // I = I % 2 ble _L1_END _L1_M1_20: INIT1x1 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #3 // 1 double value add BO , BO , r4 lsls r4 , r3 , #3 // 1 double value add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr L , K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr L , K ldr r3, KK sub L , L, r3 str L , KKK #else ldr L , KK #ifdef LEFT add L , L , #1 // number of values in AO #else add L 
, L , #1 // number of values in BO #endif str L , KKK #endif mov K1, L asrs L , K1, #3 // L = L / 8 ble _L1_M1_40 _L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs L, L, #1 bgt _L1_M1_22 _L1_M1_40: ands L , K1, #7 // L = L % 8 ble _L1_M1_100 _L1_M1_42: KERNEL1x1_SUB subs L, L, #1 bgt _L1_M1_42 _L1_M1_100: SAVE1x1 _L1_END: _L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers movs r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/dtrmm_kernel_4x4_vfpv3.S000066400000000000000000001015711313527062700216420ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2013/11/23 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 252 #define OLD_M r0 #define OLD_N r1 #define OLD_K r2 #define OLD_A r3 #define OLD_ALPHA d0 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define KK [fp, #-240 ] #define KKK [fp, #-244] #define C [fp, #-248 ] #define LDC [fp, #-252 ] #define M [fp, #-256 ] #define N [fp, #-260 ] #define K [fp, #-264 ] #define A [fp, #-268 ] #define FP_ZERO [fp, #-236] #define FP_ZERO_0 [fp, #-236] #define FP_ZERO_1 [fp, #-232] #define ALPHA [fp, #-276 ] #if !defined(__ARM_PCS_VFP) #define OLD_ALPHA_SOFTFP [fp, #4] #define OLD_A_SOFTFP [fp, #12 ] #define B [fp, #16 ] #define OLD_C [fp, #20 ] #define OLD_LDC [fp, #24 ] #define OFFSET [fp, #28 ] #else #define B [fp, #4 ] #define OLD_C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] #endif #define I r0 #define J r1 #define L r2 #define AO r5 #define BO r6 #define CO1 r8 #define CO2 r9 #define K1 r7 #define BC r12 #define A_PRE 64 #define B_PRE 64 #define C_PRE 64 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro INIT4x4 fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d18, d16 vmov.f64 d19, d16 vmov.f64 d20, d16 vmov.f64 d21, d16 vmov.f64 d22, d16 vmov.f64 d23, d16 vmov.f64 d24, d16 vmov.f64 d25, d16 vmov.f64 d26, d16 vmov.f64 d27, d16 vmov.f64 d28, d16 vmov.f64 d29, d16 vmov.f64 d30, d16 vmov.f64 d31, d16 .endm .macro KERNEL4x4_I pld [ BO , #B_PRE ] fldd d8 , [ BO ] pld [ AO , #A_PRE ] fldmiad AO!, { d0 - d1} fmuld d16 , d0, d8 fldmiad AO!, { d2 - d3} fmuld d17 , d1, d8 fldd d9 , [ BO, #8 ] fmuld d18 , d2, d8 fldd d10, [ BO, #16 ] fmuld d19 , d3, d8 fldd d11, [ BO, #24 ] fmuld d20 , d0, d9 fmuld d21 , d1, d9 add BO , BO, #32 fmuld d22 , d2, d9 fldd d12, [ BO ] fmuld d23 , d3, d9 fmuld d24 , d0, d10 fldmiad AO!, { d4 - d5 } fmuld d25 , d1, d10 fmuld d26 , d2, d10 fldmiad AO!, { d6 - d7 } fmuld d27 , d3, d10 fldd d13, [ BO, #8 ] fmuld d28 , d0, d11 fldd d14, [ BO, #16 ] fmuld d29 , d1, d11 fldd d15, [ BO, #24 ] fmuld d30 , d2, d11 fmuld d31 , d3, d11 add BO , BO, #32 .endm .macro KERNEL4x4_S pld [ BO , #B_PRE ] fldd d8 , [ BO ] pld [ AO , #A_PRE ] fldmiad AO!, { d0 - d1} fmacd d16 , d0, d8 fldmiad AO!, { d2 - d3} fmacd d17 , d1, d8 fldd d9 , [ BO, #8 ] fmacd d18 , d2, d8 fldd d10, [ BO, #16 ] fmacd d19 , d3, d8 fldd d11, [ BO, #24 ] fmacd d20 , d0, d9 fmacd d21 , d1, d9 add BO , BO, #32 fmacd d22 , d2, d9 fldd d12, [ BO ] fmacd d23 , d3, d9 fmacd d24 , d0, d10 fldmiad AO!, { d4 - d5 } fmacd d25 , d1, d10 fmacd d26 , d2, d10 fldmiad AO!, { d6 - d7 } fmacd d27 , d3, d10 fldd d13, [ BO, #8 ] fmacd d28 , d0, d11 fldd d14, [ BO, #16 ] fmacd d29 , d1, d11 fldd d15, [ BO, #24 ] fmacd d30 , d2, d11 fmacd d31 , d3, d11 add BO , BO, #32 .endm .macro KERNEL4x4_M1 fmacd d16 , d4, d12 pld [ AO , #A_PRE ] fmacd d17 , d5, d12 fmacd d18 , d6, d12 pld [ BO , #B_PRE ] fmacd d19 , d7, d12 fmacd d20 , d4, d13 fldd d8 , [ BO ] fmacd d21 , d5, d13 fmacd d22 , d6, d13 fldmiad AO!, { d0 - d1 } fmacd 
d23 , d7, d13 fmacd d24 , d4, d14 fldmiad AO!, { d2 - d3 } fmacd d25 , d5, d14 fldd d9 , [ BO, #8 ] fmacd d26 , d6, d14 fldd d10, [ BO, #16 ] fmacd d27 , d7, d14 fldd d11, [ BO, #24 ] fmacd d28 , d4, d15 fmacd d29 , d5, d15 fmacd d30 , d6, d15 add BO , BO, #32 fmacd d31 , d7, d15 .endm .macro KERNEL4x4_M2 fmacd d16 , d0, d8 pld [ AO , #A_PRE ] fmacd d17 , d1, d8 pld [ BO , #B_PRE ] fmacd d18 , d2, d8 fldd d12, [ BO ] fmacd d19 , d3, d8 fmacd d20 , d0, d9 fldmiad AO!, { d4 - d5 } fmacd d21 , d1, d9 fmacd d22 , d2, d9 fldmiad AO!, { d6 - d7 } fmacd d23 , d3, d9 fmacd d24 , d0, d10 fmacd d25 , d1, d10 fmacd d26 , d2, d10 fmacd d27 , d3, d10 fldd d13, [ BO, #8 ] fmacd d28 , d0, d11 fldd d14, [ BO, #16 ] fmacd d29 , d1, d11 fldd d15, [ BO, #24 ] fmacd d30 , d2, d11 fmacd d31 , d3, d11 add BO , BO, #32 .endm .macro KERNEL4x4_E fmacd d16 , d4, d12 pld [ AO , #A_PRE ] fmacd d17 , d5, d12 fmacd d18 , d6, d12 pld [ BO , #B_PRE ] fmacd d19 , d7, d12 fmacd d20 , d4, d13 fmacd d21 , d5, d13 fmacd d22 , d6, d13 fmacd d23 , d7, d13 fmacd d24 , d4, d14 fmacd d25 , d5, d14 fmacd d26 , d6, d14 fmacd d27 , d7, d14 fmacd d28 , d4, d15 fmacd d29 , d5, d15 fmacd d30 , d6, d15 fmacd d31 , d7, d15 .endm .macro KERNEL4x4_SUB pld [ BO , #B_PRE ] pld [ AO , #A_PRE ] fldd d8 , [ BO ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fmacd d16 , d0, d8 fldd d9 , [ BO, #8 ] fmacd d17 , d1, d8 fldd d10, [ BO, #16 ] fmacd d18 , d2, d8 fldd d11, [ BO, #24 ] fmacd d19 , d3, d8 fmacd d20 , d0, d9 fmacd d21 , d1, d9 fmacd d22 , d2, d9 fmacd d23 , d3, d9 fmacd d24 , d0, d10 fmacd d25 , d1, d10 fmacd d26 , d2, d10 fmacd d27 , d3, d10 fmacd d28 , d0, d11 fmacd d29 , d1, d11 add AO , AO, #32 fmacd d30 , d2, d11 add BO , BO, #32 fmacd d31 , d3, d11 .endm .macro SAVE4x4 ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA add r4 , CO2, r3 fmuld d8 , d0 , d16 fmuld d9 , d0 , d17 fmuld d10, d0 , d18 fmuld d11, d0 , d19 fmuld d12, d0 , d20 fstd d8 , [CO1] fmuld d13, d0 , d21 fstd d9 , [CO1, #8 ] fmuld d14, d0 , d22 fstd d10, [CO1, #16 ] fmuld d15, d0 , d23 fstd d11, [CO1, #24 ] fmuld d8 , d0 , d24 fstd d12, [CO2] fmuld d9 , d0 , d25 fstd d13, [CO2, #8 ] fmuld d10, d0 , d26 fstd d14, [CO2, #16 ] fmuld d11, d0 , d27 fstd d15, [CO2, #24 ] add CO2, r4 , r3 fstd d8 , [r4 ] fmuld d12, d0 , d28 fstd d9 , [r4 , #8 ] fmuld d13, d0 , d29 fstd d10, [r4 , #16 ] fmuld d14, d0 , d30 fstd d11, [r4 , #24 ] fmuld d15, d0 , d31 fstmiad CO2, { d12 - d15 } add CO1, CO1, #32 .endm /******************************************************************************/ .macro INIT2x4 fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d20, d16 vmov.f64 d21, d16 vmov.f64 d24, d16 vmov.f64 d25, d16 vmov.f64 d28, d16 vmov.f64 d29, d16 .endm .macro KERNEL2x4_SUB fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fldd d10, [ BO, #16 ] fldd d11, [ BO, #24 ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fmacd d16 , d0, d8 fmacd d17 , d1, d8 fmacd d20 , d0, d9 fmacd d21 , d1, d9 fmacd d24 , d0, d10 fmacd d25 , d1, d10 fmacd d28 , d0, d11 fmacd d29 , d1, d11 add AO , AO, #16 add BO , BO, #32 .endm .macro SAVE2x4 ldr r3 , LDC add CO2 , CO1, r3 add r4 , CO2, r3 fldd d0, ALPHA fmuld d8 , d0 , d16 fmuld d9 , d0 , d17 fstd d8 , [CO1] fstd d9 , [CO1, #8 ] fmuld d12, d0 , d20 fmuld d13, d0 , d21 fstd d12, [CO2] fstd d13, [CO2, #8 ] fmuld d8 , d0 , d24 fmuld d9 , d0 , d25 fstd d8 , [r4 ] fstd d9 , [r4 , #8 ] add CO2, r4 , r3 fmuld d12, d0 , d28 fmuld d13, d0 , d29 fstd d12, [CO2] fstd d13, [CO2, #8 ] add CO1, CO1, #16 .endm 
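/**************************************************************************************
* The 2x4 macros above follow the same pattern as the 4x4 ones: KERNEL2x4_SUB
* accumulates one rank-1 update of a 2x4 tile of C into d16/d17, d20/d21,
* d24/d25 and d28/d29, and SAVE2x4 applies alpha and stores the tile.
* A rough C sketch of what one tile computes is given below for readability
* only; the function name, the index names and the element-counted ldc are
* assumptions of the sketch, not part of this file (here LDC is held in bytes
* and K is the trip count the surrounding loop code selects, K or KKK).
*
*   static void dtrmm_tile_2x4(long K, double alpha, const double *ao,
*                              const double *bo, double *c, long ldc)
*   {
*       double acc[2][4] = {{0.0}};            // d16/d17, d20/d21, d24/d25, d28/d29
*       for (long k = 0; k < K; k++)           // KERNEL2x4_SUB, once per k
*           for (int n = 0; n < 4; n++)
*               for (int m = 0; m < 2; m++)
*                   acc[m][n] += ao[2*k + m] * bo[4*k + n];
*       for (int n = 0; n < 4; n++)            // SAVE2x4: C is written,
*           for (int m = 0; m < 2; m++)        // not accumulated (TRMM)
*               c[n*ldc + m] = alpha * acc[m][n];
*   }
**************************************************************************************/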
/******************************************************************************/ .macro INIT1x4 fldd d16, FP_ZERO vmov.f64 d20, d16 vmov.f64 d24, d16 vmov.f64 d28, d16 .endm .macro KERNEL1x4_SUB fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fldd d10, [ BO, #16 ] fldd d11, [ BO, #24 ] fldd d0 , [ AO ] fmacd d16 , d0, d8 fmacd d20 , d0, d9 fmacd d24 , d0, d10 fmacd d28 , d0, d11 add AO , AO, #8 add BO , BO, #32 .endm .macro SAVE1x4 ldr r3 , LDC add CO2 , CO1, r3 add r4 , CO2, r3 fldd d0, ALPHA fmuld d8 , d0 , d16 fstd d8 , [CO1] fmuld d12, d0 , d20 fstd d12, [CO2] fmuld d8 , d0 , d24 fstd d8 , [r4 ] add CO2, r4 , r3 fmuld d12, d0 , d28 fstd d12, [CO2] add CO1, CO1, #8 .endm /******************************************************************************/ /******************************************************************************/ .macro INIT4x2 fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d18, d16 vmov.f64 d19, d16 vmov.f64 d20, d16 vmov.f64 d21, d16 vmov.f64 d22, d16 vmov.f64 d23, d16 .endm .macro KERNEL4x2_SUB fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fmacd d16 , d0, d8 fmacd d17 , d1, d8 fmacd d18 , d2, d8 fmacd d19 , d3, d8 fmacd d20 , d0, d9 fmacd d21 , d1, d9 fmacd d22 , d2, d9 fmacd d23 , d3, d9 add AO , AO, #32 add BO , BO, #16 .endm .macro SAVE4x2 ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA fmuld d8 , d0 , d16 fmuld d9 , d0 , d17 fmuld d10, d0 , d18 fmuld d11, d0 , d19 fstd d8 , [CO1] fstd d9 , [CO1, #8 ] fstd d10, [CO1, #16 ] fstd d11, [CO1, #24 ] fmuld d12, d0 , d20 fmuld d13, d0 , d21 fmuld d14, d0 , d22 fmuld d15, d0 , d23 fstd d12, [CO2] fstd d13, [CO2, #8 ] fstd d14, [CO2, #16 ] fstd d15, [CO2, #24 ] add CO1, CO1, #32 .endm /******************************************************************************/ .macro INIT2x2 fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d20, d16 vmov.f64 d21, d16 .endm .macro KERNEL2x2_SUB fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fmacd d16 , d0, d8 fmacd d17 , d1, d8 fmacd d20 , d0, d9 fmacd d21 , d1, d9 add AO , AO, #16 add BO , BO, #16 .endm .macro SAVE2x2 ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA fmuld d8 , d0 , d16 fmuld d9 , d0 , d17 fstd d8 , [CO1] fstd d9 , [CO1, #8 ] fmuld d12, d0 , d20 fmuld d13, d0 , d21 fstd d12, [CO2] fstd d13, [CO2, #8 ] add CO1, CO1, #16 .endm /******************************************************************************/ .macro INIT1x2 fldd d16, FP_ZERO vmov.f64 d20, d16 .endm .macro KERNEL1x2_SUB fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fldd d0 , [ AO ] fmacd d16 , d0, d8 fmacd d20 , d0, d9 add AO , AO, #8 add BO , BO, #16 .endm .macro SAVE1x2 ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA fmuld d8 , d0 , d16 fstd d8 , [CO1] fmuld d12, d0 , d20 fstd d12, [CO2] add CO1, CO1, #8 .endm /******************************************************************************/ /******************************************************************************/ .macro INIT4x1 fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d18, d16 vmov.f64 d19, d16 .endm .macro KERNEL4x1_SUB fldd d8 , [ BO ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fmacd d16 , d0, d8 fmacd d17 , d1, d8 fmacd d18 , d2, d8 fmacd d19 , d3, d8 add AO , AO, #32 add BO , BO, #8 .endm .macro SAVE4x1 fldd d0, ALPHA fmuld d8 , d0 , d16 fmuld d9 , d0 , d17 fmuld d10, d0 , d18 fmuld d11, d0 , d19 fstd d8 , [CO1] fstd d9 , [CO1, #8 ] fstd d10, [CO1, #16 ] fstd d11, [CO1, #24 ] add CO1, CO1, #32 .endm 
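/**************************************************************************************
* Reference sketch (plain C, readability only) of the 4x1 macros above: one
* column of B against a 4-row panel of A, with alpha applied at store time.
* As everywhere in this TRMM kernel, C is only written, never read back, and
* the surrounding loop code offsets AO/BO by KK values and limits the k loop
* to KKK iterations so that only the relevant (triangular) part of the packed
* A/B contributes. The function and variable names below are illustrative
* assumptions, not identifiers used elsewhere in this file.
*
*   static void dtrmm_tile_4x1(long K, double alpha, const double *ao,
*                              const double *bo, double *c)
*   {
*       double acc[4] = {0.0};                 // d16 - d19
*       for (long k = 0; k < K; k++)           // KERNEL4x1_SUB
*           for (int m = 0; m < 4; m++)
*               acc[m] += ao[4*k + m] * bo[k];
*       for (int m = 0; m < 4; m++)            // SAVE4x1
*           c[m] = alpha * acc[m];
*   }
**************************************************************************************/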
/******************************************************************************/ .macro INIT2x1 fldd d16, FP_ZERO vmov.f64 d17, d16 .endm .macro KERNEL2x1_SUB fldd d8 , [ BO ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fmacd d16 , d0, d8 fmacd d17 , d1, d8 add AO , AO, #16 add BO , BO, #8 .endm .macro SAVE2x1 fldd d0, ALPHA fmuld d8 , d0 , d16 fmuld d9 , d0 , d17 fstd d8 , [CO1] fstd d9 , [CO1, #8 ] add CO1, CO1, #16 .endm /******************************************************************************/ .macro INIT1x1 fldd d16, FP_ZERO .endm .macro KERNEL1x1_SUB fldd d8 , [ BO ] fldd d0 , [ AO ] fmacd d16 , d0, d8 add AO , AO, #8 add BO , BO, #8 .endm .macro SAVE1x1 fldd d0, ALPHA fmuld d8 , d0 , d16 fstd d8 , [CO1] add CO1, CO1, #8 .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack #if !defined(__ARM_PCS_VFP) vldr OLD_ALPHA, OLD_ALPHA_SOFTFP ldr OLD_A, OLD_A_SOFTFP #endif str OLD_M, M str OLD_N, N str OLD_K, K str OLD_A, A vstr OLD_ALPHA, ALPHA sub r3, fp, #128 vstm r3, { d8 - d15} // store floating point registers movs r4, #0 str r4, FP_ZERO str r4, FP_ZERO_1 ldr r3, OLD_LDC lsl r3, r3, #3 // ldc = ldc * 8 str r3, LDC ldr r3, OLD_C str r3, C ldr BC, B ldr r3, OFFSET #ifndef LEFT neg r3 , r3 #endif str r3 , KK ldr J, N asrs J, J, #2 // J = J / 4 ble _L2_BEGIN _L4_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #2 // LDC * 4 add r3 , r4, CO1 str r3 , C // store C #if defined(LEFT) ldr r3 , OFFSET str r3 , KK #endif ldr AO, A // AO = A pld [AO , #A_PRE-64] pld [AO , #A_PRE-32] _L4_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 ble _L4_M2_BEGIN _L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #5 // 4 double values add BO , BO , r4 lsls r4 , r3 , #5 // 4 double values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr L , K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr L , K ldr r3, KK sub L , L, r3 str L , KKK #else ldr L , KK #ifdef LEFT add L , L , #4 // number of values in AO #else add L , L , #4 // number of values in BO #endif str L , KKK #endif mov K1, L asrs L , K1, #5 // L = L / 8 ble _L4_M4_40 .align 5 KERNEL4x4_I KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_E subs L, L, #1 ble _L4_M4_41 _L4_M4_22: KERNEL4x4_S KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_E subs L, L, #1 ble _L4_M4_41 b _L4_M4_22 _L4_M4_40: INIT4x4 _L4_M4_41: ands L , K1, #31 // L = L % 8 ble _L4_M4_100 _L4_M4_42: KERNEL4x4_SUB subs L, L, #1 bgt _L4_M4_42 _L4_M4_100: SAVE4x4 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && 
!defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #5 // 4 double values add BO , BO , r4 lsls r4 , r3 , #5 // 4 double values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #4 // number of values in AO str r3 , KK #endif _L4_M4_END: subs I, I, #1 bgt _L4_M4_20 _L4_M2_BEGIN: ldr I, M tst I , #3 ble _L4_END tst I, #2 // I = I / 2 ble _L4_M1_BEGIN _L4_M2_20: INIT2x4 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #5 // 4 double values add BO , BO , r4 lsls r4 , r3 , #4 // 2 double values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr L , K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr L , K ldr r3, KK sub L , L, r3 str L , KKK #else ldr L , KK #ifdef LEFT add L , L , #2 // number of values in AO #else add L , L , #4 // number of values in BO #endif str L , KKK #endif mov K1, L asrs L , K1, #3 // L = L / 8 ble _L4_M2_40 _L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB subs L, L, #1 bgt _L4_M2_22 _L4_M2_40: ands L , K1, #7 // L = L % 8 ble _L4_M2_100 _L4_M2_42: KERNEL2x4_SUB subs L, L, #1 bgt _L4_M2_42 _L4_M2_100: SAVE2x4 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #5 // 4 double values add BO , BO , r4 lsls r4 , r3 , #4 // 2 double values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #2 // number of values in AO str r3 , KK #endif _L4_M2_END: _L4_M1_BEGIN: tst I, #1 // I = I % 2 ble _L4_END _L4_M1_20: INIT1x4 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #5 // 4 double values add BO , BO , r4 lsls r4 , r3 , #3 // 1 double value add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr L , K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr L , K ldr r3, KK sub L , L, r3 str L , KKK #else ldr L , KK #ifdef LEFT add L , L , #1 // number of values in AO #else add L , L , #4 // number of values in BO #endif str L , KKK #endif mov K1, L asrs L , K1, #3 // L = L / 8 ble _L4_M1_40 _L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs L, L, #1 bgt _L4_M1_22 _L4_M1_40: ands L , K1, #7 // L = L % 8 ble _L4_M1_100 _L4_M1_42: KERNEL1x4_SUB subs L, L, #1 bgt _L4_M1_42 _L4_M1_100: SAVE1x4 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #5 // 4 double values add BO , BO , r4 lsls r4 , r3 , #3 // 1 double value add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #1 // number of values in AO str r3 , KK #endif _L4_END: mov r3, BC ldr r4, K lsl r4, r4, #5 // k * 4 * 8 add r3, r3, r4 // B = B + K * 4 * 8 mov BC, r3 #if !defined(LEFT) ldr r3 , KK add r3 , r3 , #4 // number of values in BO str r3 , KK #endif subs J , #1 // j-- bgt _L4_BEGIN /*********************************************************************************************/ _L2_BEGIN: ldr J , N tst J , #3 ble _L999 tst J , #2 ble _L1_BEGIN ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #1 // LDC * 2 add r3 , r4, CO1 str r3 , C // store C #if defined(LEFT) ldr r3 , OFFSET str r3 , KK #endif ldr AO, A // AO = A //pld [AO , #A_PRE-96] //pld [AO , #A_PRE-64] //pld [AO , #A_PRE-32] _L2_M4_BEGIN: ldr I, M asrs I, I, 
#2 // I = I / 4 ble _L2_M2_BEGIN _L2_M4_20: INIT4x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #4 // 2 double values add BO , BO , r4 lsls r4 , r3 , #5 // 4 double values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr L , K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr L , K ldr r3, KK sub L , L, r3 str L , KKK #else ldr L , KK #ifdef LEFT add L , L , #4 // number of values in AO #else add L , L , #2 // number of values in BO #endif str L , KKK #endif mov K1, L asrs L , K1, #3 // L = L / 8 ble _L2_M4_40 .align 5 _L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs L, L, #1 bgt _L2_M4_22 _L2_M4_40: ands L , K1, #7 // L = L % 8 ble _L2_M4_100 _L2_M4_42: KERNEL4x2_SUB subs L, L, #1 bgt _L2_M4_42 _L2_M4_100: SAVE4x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #4 // 2 double values add BO , BO , r4 lsls r4 , r3 , #5 // 4 double values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #4 // number of values in AO str r3 , KK #endif _L2_M4_END: subs I, I, #1 bgt _L2_M4_20 _L2_M2_BEGIN: ldr I, M tst I , #3 ble _L2_END tst I, #2 // I = I / 2 ble _L2_M1_BEGIN _L2_M2_20: INIT2x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #4 // 2 double values add BO , BO , r4 lsls r4 , r3 , #4 // 2 double values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr L , K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr L , K ldr r3, KK sub L , L, r3 str L , KKK #else ldr L , KK #ifdef LEFT add L , L , #2 // number of values in AO #else add L , L , #2 // number of values in BO #endif str L , KKK #endif mov K1, L asrs L , K1, #3 // L = L / 8 ble _L2_M2_40 _L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs L, L, #1 bgt _L2_M2_22 _L2_M2_40: ands L , K1, #7 // L = L % 8 ble _L2_M2_100 _L2_M2_42: KERNEL2x2_SUB subs L, L, #1 bgt _L2_M2_42 _L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #4 // 2 double values add BO , BO , r4 lsls r4 , r3 , #4 // 2 double values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #2 // number of values in AO str r3 , KK #endif _L2_M2_END: _L2_M1_BEGIN: tst I, #1 // I = I % 2 ble _L2_END _L2_M1_20: INIT1x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #4 // 2 double values add BO , BO , r4 lsls r4 , r3 , #3 // 1 double value add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr L , K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr L , K ldr r3, KK sub L , L, r3 str L , KKK #else ldr L , KK #ifdef LEFT add L , L , #1 // number of values in AO #else add L , L , #2 // number of values in BO #endif str L , KKK #endif mov K1, L asrs L , K1, #3 // L = L / 8 ble _L2_M1_40 _L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs L, L, #1 bgt _L2_M1_22 _L2_M1_40: ands L , K1, #7 // L = L % 8 ble _L2_M1_100 _L2_M1_42: KERNEL1x2_SUB subs L, L, #1 bgt _L2_M1_42 _L2_M1_100: SAVE1x2 #if 
(defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #4 // 2 double values add BO , BO , r4 lsls r4 , r3 , #3 // 1 double value add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #1 // number of values in AO str r3 , KK #endif _L2_END: mov r3, BC ldr r4, K lsl r4, r4, #4 // k * 2 * 8 add r3, r3, r4 // B = B + K * 2 * 8 mov BC, r3 #if !defined(LEFT) ldr r3 , KK add r3 , r3 , #2 // number of values in BO str r3 , KK #endif /*********************************************************************************************/ _L1_BEGIN: ldr J , N tst J , #1 ble _L999 ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 str r3 , C // store C #if defined(LEFT) ldr r3 , OFFSET str r3 , KK #endif ldr AO, A // AO = A //pld [AO , #A_PRE-96] //pld [AO , #A_PRE-64] //pld [AO , #A_PRE-32] _L1_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 ble _L1_M2_BEGIN _L1_M4_20: INIT4x1 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #3 // 1 double value add BO , BO , r4 lsls r4 , r3 , #5 // 4 double values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr L , K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr L , K ldr r3, KK sub L , L, r3 str L , KKK #else ldr L , KK #ifdef LEFT add L , L , #4 // number of values in AO #else add L , L , #1 // number of values in BO #endif str L , KKK #endif mov K1, L asrs L , K1, #3 // L = L / 8 ble _L1_M4_40 .align 5 _L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs L, L, #1 bgt _L1_M4_22 _L1_M4_40: ands L , K1, #7 // L = L % 8 ble _L1_M4_100 _L1_M4_42: KERNEL4x1_SUB subs L, L, #1 bgt _L1_M4_42 _L1_M4_100: SAVE4x1 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #3 // 1 double value add BO , BO , r4 lsls r4 , r3 , #5 // 4 double values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #4 // number of values in AO str r3 , KK #endif _L1_M4_END: subs I, I, #1 bgt _L1_M4_20 _L1_M2_BEGIN: ldr I, M tst I , #3 ble _L1_END tst I, #2 // I = I / 2 ble _L1_M1_BEGIN _L1_M2_20: INIT2x1 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #3 // 1 double value add BO , BO , r4 lsls r4 , r3 , #4 // 2 double values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr L , K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr L , K ldr r3, KK sub L , L, r3 str L , KKK #else ldr L , KK #ifdef LEFT add L , L , #2 // number of values in AO #else add L , L , #1 // number of values in BO #endif str L , KKK #endif mov K1, L asrs L , K1, #3 // L = L / 8 ble _L1_M2_40 _L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs L, L, #1 bgt _L1_M2_22 _L1_M2_40: ands L , K1, #7 // L = L % 8 ble _L1_M2_100 _L1_M2_42: KERNEL2x1_SUB subs L, L, #1 bgt _L1_M2_42 _L1_M2_100: SAVE2x1 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #3 // 1 double value add BO , BO , r4 lsls r4 , r3 , #4 // 2 double values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #2 // number of values in AO str r3 , KK #endif _L1_M2_END: _L1_M1_BEGIN: tst I, #1 // I = I % 2 
ble _L1_END _L1_M1_20: INIT1x1 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #3 // 1 double value add BO , BO , r4 lsls r4 , r3 , #3 // 1 double value add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr L , K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr L , K ldr r3, KK sub L , L, r3 str L , KKK #else ldr L , KK #ifdef LEFT add L , L , #1 // number of values in AO #else add L , L , #1 // number of values in BO #endif str L , KKK #endif mov K1, L asrs L , K1, #3 // L = L / 8 ble _L1_M1_40 _L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs L, L, #1 bgt _L1_M1_22 _L1_M1_40: ands L , K1, #7 // L = L % 8 ble _L1_M1_100 _L1_M1_42: KERNEL1x1_SUB subs L, L, #1 bgt _L1_M1_42 _L1_M1_100: SAVE1x1 _L1_END: _L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers movs r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/gemv_n.c000066400000000000000000000045371313527062700166330ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * * 2013/09/14 Saar * * BLASTEST float : OK * * BLASTEST double : OK * CTEST : OK * TEST : OK * * * **************************************************************************************/ #include "common.h" int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { BLASLONG i; BLASLONG ix,iy; BLASLONG j; FLOAT *a_ptr; FLOAT temp; ix = 0; a_ptr = a; for (j=0; j #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT maxf=0.0; BLASLONG max=0; if (n <= 0 || inc_x <= 0) return(max); maxf=ABS(x[0]); ix += inc_x; i++; while(i < n) { if( ABS(x[ix]) > maxf ) { max = i; maxf = ABS(x[ix]); } ix += inc_x; i++; } return(max+1); } OpenBLAS-0.2.20/kernel/arm/iamax_vfp.S000066400000000000000000000165721313527062700173140ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2013/11/14 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define N r0 #define X r1 #define INC_X r2 #define INDEX r3 #define Z r4 #define I r12 #define X_PRE 512 /************************************************************************************** * Macro definitions **************************************************************************************/ #if defined(USE_ABS) #if defined(DOUBLE) #define VABS(x0,x1) vabs.f64 x0, x1 #else #define VABS(x0,x1) vabs.f32 x0, x1 #endif #else #define VABS(x0,x1) nop #endif /*****************************************************************************************/ #if defined(USE_MIN) #define MOVCOND movlt #if defined(DOUBLE) #define VMOVCOND vmovlt.f64 #else #define VMOVCOND vmovlt.f32 #endif #else #define MOVCOND movgt #if defined(DOUBLE) #define VMOVCOND vmovgt.f64 #else #define VMOVCOND vmovgt.f32 #endif #endif /*****************************************************************************************/ #if !defined(COMPLEX) #if defined(DOUBLE) .macro INIT_F fldmiad X!, { d0 } VABS( d0, d0 ) mov Z, #1 mov INDEX, Z .endm .macro KERNEL_F1 fldmiad X!, { d4 } add Z, Z, #1 VABS( d4, d4 ) vcmpe.f64 d4, d0 vmrs APSR_nzcv, fpscr VMOVCOND d0, d4 MOVCOND INDEX, Z .endm .macro INIT_S fldmiad X, { d0 } VABS( d0, d0 ) mov Z, #1 mov INDEX, Z add X, X, INC_X .endm .macro KERNEL_S1 fldmiad X, { d4 } add Z, Z, #1 VABS( d4, d4 ) vcmpe.f64 d4, d0 vmrs APSR_nzcv, fpscr VMOVCOND d0, d4 MOVCOND INDEX, Z add X, X, INC_X .endm #else .macro INIT_F fldmias X!, { s0 } VABS( s0, s0 ) mov Z, #1 mov INDEX, Z .endm .macro KERNEL_F1 fldmias X!, { s4 } add Z, Z, #1 VABS( s4, s4 ) vcmpe.f32 s4, s0 vmrs APSR_nzcv, fpscr VMOVCOND s0, s4 MOVCOND INDEX, Z .endm .macro INIT_S fldmias X, { s0 } VABS( s0, s0 ) mov Z, #1 mov INDEX, Z add X, X, INC_X .endm .macro KERNEL_S1 fldmias X, { s4 } add Z, Z, #1 VABS( s4, s4 ) vcmpe.f32 s4, s0 vmrs APSR_nzcv, fpscr VMOVCOND s0, s4 MOVCOND INDEX, Z add X, X, INC_X .endm #endif #else #if defined(DOUBLE) .macro INIT_F fldmiad X!, { d0 -d1 } vabs.f64 d0, d0 vabs.f64 d1, d1 vadd.f64 d0 , d0, d1 mov Z, #1 mov INDEX, Z .endm .macro KERNEL_F1 fldmiad X!, { d4 - d5 } add Z, Z, #1 vabs.f64 d4, d4 vabs.f64 d5, d5 vadd.f64 d4 , d4, d5 vcmpe.f64 d4, d0 vmrs APSR_nzcv, fpscr VMOVCOND d0, d4 MOVCOND INDEX, Z .endm .macro INIT_S fldmiad X, { d0 -d1 } vabs.f64 d0, d0 vabs.f64 d1, d1 vadd.f64 d0 , d0, d1 mov Z, #1 mov INDEX, Z add X, X, INC_X .endm .macro KERNEL_S1 fldmiad X, { d4 - d5 } add Z, Z, #1 vabs.f64 d4, d4 vabs.f64 d5, d5 vadd.f64 d4 , d4, d5 vcmpe.f64 d4, d0 vmrs APSR_nzcv, fpscr VMOVCOND d0, d4 MOVCOND INDEX, Z add X, X, INC_X .endm #else .macro INIT_F fldmias X!, { s0 -s1 } vabs.f32 s0, s0 vabs.f32 s1, s1 vadd.f32 s0 , s0, s1 mov Z, #1 mov INDEX, Z .endm .macro KERNEL_F1 fldmias X!, { s4 - s5 } add Z, Z, #1 vabs.f32 s4, s4 vabs.f32 s5, s5 vadd.f32 s4 , s4, s5 vcmpe.f32 s4, s0 vmrs APSR_nzcv, fpscr VMOVCOND s0, s4 MOVCOND INDEX, Z .endm .macro INIT_S fldmias X, { s0 -s1 } vabs.f32 s0, s0 vabs.f32 s1, s1 vadd.f32 s0 , s0, s1 mov Z, #1 mov INDEX, Z add X, X, INC_X .endm .macro KERNEL_S1 fldmias X, { s4 - s5 } add Z, Z, #1 vabs.f32 s4, s4 vabs.f32 s5, s5 vadd.f32 s4 , s4, s5 vcmpe.f32 s4, s0 vmrs APSR_nzcv, fpscr VMOVCOND s0, s4 MOVCOND INDEX, 
Z add X, X, INC_X .endm #endif #endif /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4} movs r12, #0 // clear floating point register vmov s0, r12 #if defined(DOUBLE) vcvt.f64.f32 d0, s0 #endif mov INDEX, #0 cmp N, #0 ble iamax_kernel_L999 cmp INC_X, #0 beq iamax_kernel_L999 cmp INC_X, #1 bne iamax_kernel_S_BEGIN iamax_kernel_F_BEGIN: INIT_F subs N, N , #1 ble iamax_kernel_L999 asrs I, N, #2 // I = N / 4 ble iamax_kernel_F1 .align 5 iamax_kernel_F4: pld [ X, #X_PRE ] KERNEL_F1 KERNEL_F1 #if defined(COMPLEX) && defined(DOUBLE) pld [ X, #X_PRE ] #endif KERNEL_F1 KERNEL_F1 subs I, I, #1 ble iamax_kernel_F1 #if defined(COMPLEX) || defined(DOUBLE) pld [ X, #X_PRE ] #endif KERNEL_F1 KERNEL_F1 #if defined(COMPLEX) && defined(DOUBLE) pld [ X, #X_PRE ] #endif KERNEL_F1 KERNEL_F1 subs I, I, #1 bne iamax_kernel_F4 iamax_kernel_F1: ands I, N, #3 ble iamax_kernel_L999 iamax_kernel_F10: KERNEL_F1 subs I, I, #1 bne iamax_kernel_F10 b iamax_kernel_L999 iamax_kernel_S_BEGIN: #if defined(COMPLEX) #if defined(DOUBLE) lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 #else lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 #endif #else #if defined(DOUBLE) lsl INC_X, INC_X, #3 // INC_X * SIZE #else lsl INC_X, INC_X, #2 // INC_X * SIZE #endif #endif INIT_S subs N, N , #1 ble iamax_kernel_L999 asrs I, N, #2 // I = N / 4 ble iamax_kernel_S1 .align 5 iamax_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne iamax_kernel_S4 iamax_kernel_S1: ands I, N, #3 ble iamax_kernel_L999 iamax_kernel_S10: KERNEL_S1 subs I, I, #1 bne iamax_kernel_S10 iamax_kernel_L999: mov r0, INDEX // set return value pop {r4} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/iamin.c000066400000000000000000000045221313527062700164470ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2013/09/14 Saar * BLASTEST float : NoTest * BLASTEST double : NoTest * CTEST : NoTest * TEST : NoTest * **************************************************************************************/ #include "common.h" #include #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT minf=0.0; BLASLONG min=0; if (n <= 0 || inc_x <= 0) return(min); minf=ABS(x[0]); ix += inc_x; i++; while(i < n) { if( ABS(x[ix]) < ABS(minf) ) { min = i; minf = ABS(x[ix]); } ix += inc_x; i++; } return(min+1); } OpenBLAS-0.2.20/kernel/arm/imax.c000066400000000000000000000043671313527062700163170ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/09/14 Saar * BLASTEST float : NoTest * BLASTEST double : NoTest * CTEST : NoTest * TEST : NoTest * **************************************************************************************/ #include "common.h" #include BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT maxf=0.0; BLASLONG max=0; if (n <= 0 || inc_x <= 0) return(max); maxf=x[0]; ix += inc_x; i++; while(i < n) { if( x[ix] > maxf ) { max = i; maxf = x[ix]; } ix += inc_x; i++; } return(max+1); } OpenBLAS-0.2.20/kernel/arm/imin.c000066400000000000000000000042731313527062700163110ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/08/19 Saar * BLASTEST float * BLASTEST double * **************************************************************************************/ #include "common.h" #include <math.h> BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT minf=0.0; BLASLONG min=0; if (n <= 0 || inc_x <= 0) return(min); minf=x[0]; ix += inc_x; i++; while(i < n) { if( x[ix] < minf ) { min = i; minf = x[ix]; } ix += inc_x; i++; } return(min+1); } OpenBLAS-0.2.20/kernel/arm/izamax.c000066400000000000000000000046311313527062700166440ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/09/14 Saar * BLASTEST float : NoTest * BLASTEST double : NoTest * CTEST : OK * TEST : OK * **************************************************************************************/ #include "common.h" #include #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT maxf; BLASLONG max=0; BLASLONG inc_x2; if (n <= 0 || inc_x <= 0) return(max); inc_x2 = 2 * inc_x; maxf = CABS1(x,0); ix += inc_x2; i++; while(i < n) { if( CABS1(x,ix) > maxf ) { max = i; maxf = CABS1(x,ix); } ix += inc_x2; i++; } return(max+1); } OpenBLAS-0.2.20/kernel/arm/izamin.c000066400000000000000000000046411313527062700166430ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2013/09/14 Saar * BLASTEST float : NoTest * BLASTEST double : NoTest * CTEST : NoTest * TEST : NoTest * **************************************************************************************/ #include "common.h" #include #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT minf; BLASLONG min=0; BLASLONG inc_x2; if (n <= 0 || inc_x <= 0) return(min); inc_x2 = 2 * inc_x; minf = CABS1(x,0); ix += inc_x2; i++; while(i < n) { if( CABS1(x,ix) < minf ) { min = i; minf = CABS1(x,ix); } ix += inc_x2; i++; } return(min+1); } OpenBLAS-0.2.20/kernel/arm/max.c000066400000000000000000000043251313527062700161400ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/09/14 Saar * BLASTEST float : NoTest * BLASTEST double : NoTest * CTEST : NoTest * TEST : NoTest * **************************************************************************************/ #include "common.h" #include FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT maxf=0.0; if (n <= 0 || inc_x <= 0) return(maxf); maxf=x[0]; ix += inc_x; i++; while(i < n) { if( x[ix] > maxf ) { maxf = x[ix]; } ix += inc_x; i++; } return(maxf); } OpenBLAS-0.2.20/kernel/arm/min.c000066400000000000000000000043251313527062700161360ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/09/14 Saar * BLASTEST float : NoTest * BLASTEST double : NoTest * CTEST : NoTest * TEST : NoTest * **************************************************************************************/ #include "common.h" #include FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT minf=0.0; if (n <= 0 || inc_x <= 0) return(minf); minf=x[0]; ix += inc_x; i++; while(i < n) { if( x[ix] < minf ) { minf = x[ix]; } ix += inc_x; i++; } return(minf); } OpenBLAS-0.2.20/kernel/arm/nrm2.c000066400000000000000000000050141313527062700162250ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/09/13 Saar * BLASTEST float : OK * BLASTEST double : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #include "common.h" #include #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; FLOAT scale = 0.0; FLOAT ssq = 1.0; FLOAT absxi = 0.0; if (n <= 0 || inc_x <= 0) return(0.0); if ( n == 1 ) return( ABS(x[0]) ); n *= inc_x; while(i < n) { if ( x[i] != 0.0 ) { absxi = ABS( x[i] ); if ( scale < absxi ) { ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi ); scale = absxi ; } else { ssq += ( absxi/scale ) * ( absxi/scale ); } } i += inc_x; } scale = scale * sqrt( ssq ); return(scale); } OpenBLAS-0.2.20/kernel/arm/nrm2_vfp.S000066400000000000000000000314441313527062700170660ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2013/11/22 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define N r0 #define X r1 #define INC_X r2 #define I r12 #define X_PRE 512 /************************************************************************************** * Macro definitions **************************************************************************************/ #if !defined(COMPLEX) #if defined(DOUBLE) .macro KERNEL_F1 fldmiad X!, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ vabs.f64 d4, d4 vcmpe.f64 d0, d4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_F1_NEXT_\@ vdiv.f64 d2 , d0, d4 // scale / x vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f64 d0 , d4 // scale = x KERNEL_F1_NEXT_\@: .endm .macro KERNEL_F8 pld [ X, #X_PRE ] KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 pld [ X, #X_PRE ] KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 .endm .macro KERNEL_S1 fldmiad X, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT vabs.f64 d4, d4 vcmpe.f64 d0, d4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_S1_NEXT vdiv.f64 d2 , d0, d4 // scale / x vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f64 d0 , d4 // scale = x KERNEL_S1_NEXT: add X, X, INC_X .endm #else .macro KERNEL_F1 fldmias X!, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ vabs.f32 s4, s4 vcmpe.f32 s0, s4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_F1_NEXT_\@ vdiv.f32 s2 , s0, s4 // scale / x vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f32 s0 , s4 // scale = x KERNEL_F1_NEXT_\@: .endm .macro KERNEL_F8 pld [ X, #X_PRE ] KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 .endm .macro KERNEL_S1 fldmias X, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT vabs.f32 s4, s4 vcmpe.f32 s0, s4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s4, s0 // scale >= x ? 
x / scale vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_S1_NEXT vdiv.f32 s2 , s0, s4 // scale / x vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f32 s0 , s4 // scale = x KERNEL_S1_NEXT: add X, X, INC_X .endm #endif #else #if defined(DOUBLE) .macro KERNEL_F1 fldmiad X!, { d4 - d5 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ vabs.f64 d4, d4 vcmpe.f64 d0, d4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_F1_NEXT_\@ vdiv.f64 d2 , d0, d4 // scale / x vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f64 d0 , d4 // scale = x KERNEL_F1_NEXT_\@: vcmpe.f64 d5, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_END_\@ vabs.f64 d5, d5 vcmpe.f64 d0, d5 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_F1_END_\@ vdiv.f64 d2 , d0, d5 // scale / x vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f64 d0 , d5 // scale = x KERNEL_F1_END_\@: .endm .macro KERNEL_F8 pld [ X, #X_PRE ] KERNEL_F1 KERNEL_F1 pld [ X, #X_PRE ] KERNEL_F1 KERNEL_F1 pld [ X, #X_PRE ] KERNEL_F1 KERNEL_F1 pld [ X, #X_PRE ] KERNEL_F1 KERNEL_F1 .endm .macro KERNEL_S1 fldmiad X, { d4 - d5 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT_\@ vabs.f64 d4, d4 vcmpe.f64 d0, d4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_S1_NEXT_\@ vdiv.f64 d2 , d0, d4 // scale / x vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f64 d0 , d4 // scale = x KERNEL_S1_NEXT_\@: vcmpe.f64 d5, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_END_\@ vabs.f64 d5, d5 vcmpe.f64 d0, d5 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_S1_END_\@ vdiv.f64 d2 , d0, d5 // scale / x vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f64 d0 , d5 // scale = x KERNEL_S1_END_\@: add X, X, INC_X .endm #else .macro KERNEL_F1 fldmias X!, { s4 - s5 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ vabs.f32 s4, s4 vcmpe.f32 s0, s4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s4, s0 // scale >= x ? 
x / scale vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_F1_NEXT_\@ vdiv.f32 s2 , s0, s4 // scale / x vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f32 s0 , s4 // scale = x KERNEL_F1_NEXT_\@: vcmpe.f32 s5, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_END_\@ vabs.f32 s5, s5 vcmpe.f32 s0, s5 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_F1_END_\@ vdiv.f32 s2 , s0, s5 // scale / x vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f32 s0 , s5 // scale = x KERNEL_F1_END_\@: .endm .macro KERNEL_F8 pld [ X, #X_PRE ] KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 pld [ X, #X_PRE ] KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 .endm .macro KERNEL_S1 fldmias X, { s4 - s5 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT_\@ vabs.f32 s4, s4 vcmpe.f32 s0, s4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_S1_NEXT_\@ vdiv.f32 s2 , s0, s4 // scale / x vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f32 s0 , s4 // scale = x KERNEL_S1_NEXT_\@: vcmpe.f32 s5, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_END_\@ vabs.f32 s5, s5 vcmpe.f32 s0, s5 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s5, s0 // scale >= x ? 
x / scale vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_S1_END_\@ vdiv.f32 s2 , s0, s5 // scale / x vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f32 s0 , s5 // scale = x KERNEL_S1_END_\@: add X, X, INC_X .endm #endif #endif /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE b nrm2_begin #if defined(COMPLEX) #if defined(DOUBLE) znrm2_zero: .word 0x00000000 .word 0x00000000 znrm2_one: .word 0x00000000 .word 0x3ff00000 #else cnrm2_zero: .word 0x00000000 cnrm2_one: .word 0x3f800000 #endif #else #if defined(DOUBLE) dnrm2_zero: .word 0x00000000 .word 0x00000000 dnrm2_one: .word 0x00000000 .word 0x3ff00000 #else snrm2_zero: .word 0x00000000 snrm2_one: .word 0x3f800000 #endif #endif .align 5 nrm2_begin: #if defined(COMPLEX) #if defined(DOUBLE) vldr.64 d0 , znrm2_zero vldr.64 d1 , znrm2_one // ssq=1.0 vmov.f64 d7 , d1 // value 1.0 vmov.f64 d6 , d0 // value 0.0 #else vldr.32 s0 , cnrm2_zero vldr.32 s1 , cnrm2_one // ssq=1.0 vmov.f32 s7 , s1 // value 1.0 vmov.f32 s6 , s0 // value 0.0 #endif #else #if defined(DOUBLE) vldr.64 d0 , dnrm2_zero vldr.64 d1 , dnrm2_one // ssq=1.0 vmov.f64 d7 , d1 // value 1.0 vmov.f64 d6 , d0 // value 0.0 #else vldr.32 s0 , snrm2_zero vldr.32 s1 , snrm2_one // ssq=1.0 vmov.f32 s7 , s1 // value 1.0 vmov.f32 s6 , s0 // value 0.0 #endif #endif cmp N, #0 ble nrm2_kernel_L999 cmp INC_X, #0 beq nrm2_kernel_L999 cmp INC_X, #1 bne nrm2_kernel_S_BEGIN nrm2_kernel_F_BEGIN: asrs I, N, #3 // I = N / 8 ble nrm2_kernel_F1 nrm2_kernel_F8: KERNEL_F8 subs I, I, #1 bne nrm2_kernel_F8 nrm2_kernel_F1: ands I, N, #7 ble nrm2_kernel_L999 nrm2_kernel_F10: KERNEL_F1 subs I, I, #1 bne nrm2_kernel_F10 b nrm2_kernel_L999 nrm2_kernel_S_BEGIN: #if defined(COMPLEX) #if defined(DOUBLE) lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 #else lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 #endif #else #if defined(DOUBLE) lsl INC_X, INC_X, #3 // INC_X * SIZE #else lsl INC_X, INC_X, #2 // INC_X * SIZE #endif #endif nrm2_kernel_S1: mov I, N .align 5 nrm2_kernel_S10: KERNEL_S1 subs I, I, #1 bne nrm2_kernel_S10 nrm2_kernel_L999: #if defined(DOUBLE) vsqrt.f64 d1, d1 vmul.f64 d0, d0, d1 #else vsqrt.f32 s1, s1 vmul.f32 s0, s0, s1 #endif #if !defined(__ARM_PCS_VFP) #if !defined(DOUBLE) vmov r0, s0 #else vmov r0, r1, d0 #endif #endif bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/nrm2_vfpv3.S000066400000000000000000000301051313527062700173300ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/11/16 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define N r0 #define X r1 #define INC_X r2 #define I r12 #define X_PRE 512 /************************************************************************************** * Macro definitions **************************************************************************************/ #if !defined(COMPLEX) #if defined(DOUBLE) .macro KERNEL_F1 fldmiad X!, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ vabs.f64 d4, d4 vcmpe.f64 d0, d4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_F1_NEXT_\@ vdiv.f64 d2 , d0, d4 // scale / x vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f64 d0 , d4 // scale = x KERNEL_F1_NEXT_\@: .endm .macro KERNEL_F8 pld [ X, #X_PRE ] KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 pld [ X, #X_PRE ] KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 .endm .macro KERNEL_S1 fldmiad X, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT vabs.f64 d4, d4 vcmpe.f64 d0, d4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_S1_NEXT vdiv.f64 d2 , d0, d4 // scale / x vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f64 d0 , d4 // scale = x KERNEL_S1_NEXT: add X, X, INC_X .endm #else .macro KERNEL_F1 fldmias X!, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ vabs.f32 s4, s4 vcmpe.f32 s0, s4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s4, s0 // scale >= x ? 
x / scale vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_F1_NEXT_\@ vdiv.f32 s2 , s0, s4 // scale / x vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f32 s0 , s4 // scale = x KERNEL_F1_NEXT_\@: .endm .macro KERNEL_F8 pld [ X, #X_PRE ] KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 .endm .macro KERNEL_S1 fldmias X, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT vabs.f32 s4, s4 vcmpe.f32 s0, s4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_S1_NEXT vdiv.f32 s2 , s0, s4 // scale / x vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f32 s0 , s4 // scale = x KERNEL_S1_NEXT: add X, X, INC_X .endm #endif #else #if defined(DOUBLE) .macro KERNEL_F1 fldmiad X!, { d4 - d5 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ vabs.f64 d4, d4 vcmpe.f64 d0, d4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_F1_NEXT_\@ vdiv.f64 d2 , d0, d4 // scale / x vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f64 d0 , d4 // scale = x KERNEL_F1_NEXT_\@: vcmpe.f64 d5, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_END_\@ vabs.f64 d5, d5 vcmpe.f64 d0, d5 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_F1_END_\@ vdiv.f64 d2 , d0, d5 // scale / x vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f64 d0 , d5 // scale = x KERNEL_F1_END_\@: .endm .macro KERNEL_F8 pld [ X, #X_PRE ] KERNEL_F1 KERNEL_F1 pld [ X, #X_PRE ] KERNEL_F1 KERNEL_F1 pld [ X, #X_PRE ] KERNEL_F1 KERNEL_F1 pld [ X, #X_PRE ] KERNEL_F1 KERNEL_F1 .endm .macro KERNEL_S1 fldmiad X, { d4 - d5 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT_\@ vabs.f64 d4, d4 vcmpe.f64 d0, d4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_S1_NEXT_\@ vdiv.f64 d2 , d0, d4 // scale / x vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f64 d0 , d4 // scale = x KERNEL_S1_NEXT_\@: vcmpe.f64 d5, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_END_\@ vabs.f64 d5, d5 vcmpe.f64 d0, d5 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d5, d0 // scale >= x ? 
x / scale vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_S1_END_\@ vdiv.f64 d2 , d0, d5 // scale / x vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f64 d0 , d5 // scale = x KERNEL_S1_END_\@: add X, X, INC_X .endm #else .macro KERNEL_F1 fldmias X!, { s4 - s5 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_NEXT_\@ vabs.f32 s4, s4 vcmpe.f32 s0, s4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_F1_NEXT_\@ vdiv.f32 s2 , s0, s4 // scale / x vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f32 s0 , s4 // scale = x KERNEL_F1_NEXT_\@: vcmpe.f32 s5, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_F1_END_\@ vabs.f32 s5, s5 vcmpe.f32 s0, s5 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_F1_END_\@ vdiv.f32 s2 , s0, s5 // scale / x vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f32 s0 , s5 // scale = x KERNEL_F1_END_\@: .endm .macro KERNEL_F8 pld [ X, #X_PRE ] KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 pld [ X, #X_PRE ] KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 .endm .macro KERNEL_S1 fldmias X, { s4 - s5 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_NEXT_\@ vabs.f32 s4, s4 vcmpe.f32 s0, s4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_S1_NEXT_\@ vdiv.f32 s2 , s0, s4 // scale / x vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f32 s0 , s4 // scale = x KERNEL_S1_NEXT_\@: vcmpe.f32 s5, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr beq KERNEL_S1_END_\@ vabs.f32 s5, s5 vcmpe.f32 s0, s5 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s5, s0 // scale >= x ? 
x / scale vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) bge KERNEL_S1_END_\@ vdiv.f32 s2 , s0, s5 // scale / x vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f32 s0 , s5 // scale = x KERNEL_S1_END_\@: add X, X, INC_X .endm #endif #endif /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 #if defined(DOUBLE) movs r12 , #0 vmov.f32 s0 , r12 // scale=0.0 vcvt.f64.f32 d0, s0 vmov.f64 d1 , #1.0 // ssq=1.0 vmov.f64 d7 , d1 // value 1.0 vmov.f64 d6 , d0 // value 0.0 #else movs r12 , #0 vmov.f32 s0 , r12 // scale=0.0 vmov.f32 s1 , #1.0 // ssq=1.0 vmov.f32 s7 , s1 // value 1.0 vmov.f32 s6 , s0 // value 0.0 #endif cmp N, #0 ble nrm2_kernel_L999 cmp INC_X, #0 beq nrm2_kernel_L999 cmp INC_X, #1 bne nrm2_kernel_S_BEGIN nrm2_kernel_F_BEGIN: asrs I, N, #3 // I = N / 8 ble nrm2_kernel_F1 nrm2_kernel_F8: KERNEL_F8 subs I, I, #1 bne nrm2_kernel_F8 nrm2_kernel_F1: ands I, N, #7 ble nrm2_kernel_L999 nrm2_kernel_F10: KERNEL_F1 subs I, I, #1 bne nrm2_kernel_F10 b nrm2_kernel_L999 nrm2_kernel_S_BEGIN: #if defined(COMPLEX) #if defined(DOUBLE) lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 #else lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 #endif #else #if defined(DOUBLE) lsl INC_X, INC_X, #3 // INC_X * SIZE #else lsl INC_X, INC_X, #2 // INC_X * SIZE #endif #endif nrm2_kernel_S1: mov I, N .align 5 nrm2_kernel_S10: KERNEL_S1 subs I, I, #1 bne nrm2_kernel_S10 nrm2_kernel_L999: #if defined(DOUBLE) vsqrt.f64 d1, d1 vmul.f64 d0, d0, d1 #else vsqrt.f32 s1, s1 vmul.f32 s0, s0, s1 #endif #if !defined(__ARM_PCS_VFP) #if defined(DOUBLE) vmov r0, r1, d0 #else vmov r0, s0 #endif #endif bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/omatcopy_cn.c000066400000000000000000000047271313527062700176740ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" /***************************************************** * 2014/06/09 Saar * * Order ColMajor * No Trans * ******************************************************/ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) { BLASLONG i,j; FLOAT *aptr,*bptr; if ( rows <= 0 ) return(0); if ( cols <= 0 ) return(0); aptr = a; bptr = b; if ( alpha == 0.0 ) { for ( i=0; i int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; BLASLONG ix=0,iy=0; FLOAT temp; if ( n < 0 ) return(0); while(i < n) { temp = x[ix] ; x[ix] = y[iy] ; y[iy] = temp ; ix += inc_x ; iy += inc_y ; i++ ; } return(0); } OpenBLAS-0.2.20/kernel/arm/swap_vfp.S000066400000000000000000000155171313527062700171650ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2013/11/14 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #if !defined(__ARM_PCS_VFP) #if !defined(COMPLEX) #if !defined(DOUBLE) #define OLD_X [fp, #0 ] #define OLD_INC_X [fp, #4 ] #define OLD_Y [fp, #8 ] #define OLD_INC_Y [fp, #12 ] #else #define OLD_X [fp, #8 ] #define OLD_INC_X [fp, #12] #define OLD_Y [fp, #16] #define OLD_INC_Y [fp, #20] #endif #else //COMPLEX #if !defined(DOUBLE) #define OLD_X [fp, #4 ] #define OLD_INC_X [fp, #8 ] #define OLD_Y [fp, #12 ] #define OLD_INC_Y [fp, #16 ] #else #define OLD_X [fp, #16] #define OLD_INC_X [fp, #20] #define OLD_Y [fp, #24] #define OLD_INC_Y [fp, #28] #endif #endif // !defined(__ARM_PCS_VFP) #else #define OLD_INC_X [fp, #0 ] #define OLD_Y [fp, #4 ] #define OLD_INC_Y [fp, #8 ] #endif #define N r0 #define Y r1 #define INC_X r2 #define X r3 #define INC_Y r4 #define I r12 #define X_PRE 512 /************************************************************************************** * Macro definitions **************************************************************************************/ /*****************************************************************************************/ #if !defined(COMPLEX) #if defined(DOUBLE) .macro KERNEL_F4 pld [ X, #X_PRE ] pld [ Y, #X_PRE ] fldmiad X, { d0 - d3 } fldmiad Y, { d4 - d7 } fstmiad Y!, { d0 - d3 } fstmiad X!, { d4 - d7} .endm .macro KERNEL_F1 fldmiad X, { d0 } fldmiad Y, { d4 } fstmiad Y!, { d0 } fstmiad X!, { d4 } .endm .macro KERNEL_S1 fldmiad X, { d0 } fldmiad Y, { d4 } fstmiad Y, { d0 } fstmiad X, { d4 } add X, X, INC_X add Y, Y, INC_Y .endm #else .macro KERNEL_F4 fldmias X, { s0 - s3 } fldmias Y, { s4 - s7 } fstmias Y!, { s0 - s3 } fstmias X!, { s4 - s7} .endm .macro KERNEL_F1 fldmias X, { s0 } fldmias Y, { s4 } fstmias Y!, { s0 } fstmias X!, { s4 } .endm .macro KERNEL_S1 fldmias X, { s0 } fldmias Y, { s4 } fstmias Y, { s0 } fstmias X, { s4 } add X, X, INC_X add Y, Y, INC_Y .endm #endif #else #if defined(DOUBLE) .macro KERNEL_F4 pld [ X, #X_PRE ] pld [ Y, #X_PRE ] fldmiad X, { d0 - d3 } fldmiad Y, { d4 - d7 } fstmiad Y!, { d0 - d3 } fstmiad X!, { d4 - d7} pld [ X, #X_PRE ] pld [ Y, #X_PRE ] fldmiad X, { d0 - d3 } fldmiad Y, { d4 - d7 } fstmiad Y!, { d0 - d3 } fstmiad X!, { d4 - d7} .endm .macro KERNEL_F1 fldmiad X, { d0 - d1 } fldmiad Y, { d4 - d5 } fstmiad Y!, { d0 - d1 } fstmiad X!, { d4 - d5 } .endm .macro KERNEL_S1 fldmiad X, { d0 - d1 } fldmiad Y, { d4 - d5 } fstmiad Y, { d0 - d1 } fstmiad X, { d4 - d5 } add X, X, INC_X add Y, Y, INC_Y .endm #else .macro KERNEL_F4 pld [ X, #X_PRE ] pld [ Y, #X_PRE ] fldmias X, { s0 - s3 } fldmias Y, { s4 - s7 } fstmias Y!, { s0 - s3 } fstmias X!, { s4 - s7} fldmias X, { s0 - s3 } fldmias Y, { s4 - s7 } fstmias Y!, { s0 - s3 } fstmias X!, { s4 - s7} .endm .macro KERNEL_F1 fldmias X, { s0 - s1 } fldmias Y, { s4 - s5 } fstmias Y!, { s0 - s1 } fstmias X!, { s4 - s5 } .endm .macro KERNEL_S1 fldmias X, { s0 - s1 } fldmias Y, { s4 - s5 } fstmias Y, { s0 - s1 } fstmias X, { s4 - s5 } add X, X, INC_X add Y, Y, INC_Y .endm #endif #endif /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 , 
fp} add fp, sp, #8 #if !defined(__ARM_PCS_VFP) ldr X, OLD_X #endif ldr INC_X , OLD_INC_X ldr Y, OLD_Y ldr INC_Y , OLD_INC_Y cmp N, #0 ble swap_kernel_L999 cmp INC_X, #0 beq swap_kernel_L999 cmp INC_Y, #0 beq swap_kernel_L999 cmp INC_X, #1 bne swap_kernel_S_BEGIN cmp INC_Y, #1 bne swap_kernel_S_BEGIN swap_kernel_F_BEGIN: asrs I, N, #2 // I = N / 4 ble swap_kernel_F1 .align 5 swap_kernel_F4: #if !defined(COMPLEX) && !defined(DOUBLE) pld [ X, #X_PRE ] pld [ Y, #X_PRE ] #endif KERNEL_F4 subs I, I, #1 ble swap_kernel_F1 KERNEL_F4 subs I, I, #1 bne swap_kernel_F4 swap_kernel_F1: ands I, N, #3 ble swap_kernel_L999 swap_kernel_F10: KERNEL_F1 subs I, I, #1 bne swap_kernel_F10 b swap_kernel_L999 swap_kernel_S_BEGIN: #if defined(COMPLEX) #if defined(DOUBLE) lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 #else lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 #endif #else #if defined(DOUBLE) lsl INC_X, INC_X, #3 // INC_X * SIZE lsl INC_Y, INC_Y, #3 // INC_Y * SIZE #else lsl INC_X, INC_X, #2 // INC_X * SIZE lsl INC_Y, INC_Y, #2 // INC_Y * SIZE #endif #endif asrs I, N, #2 // I = N / 4 ble swap_kernel_S1 .align 5 swap_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne swap_kernel_S4 swap_kernel_S1: ands I, N, #3 ble swap_kernel_L999 swap_kernel_S10: KERNEL_S1 subs I, I, #1 bne swap_kernel_S10 swap_kernel_L999: mov r0, #0 // set return value sub sp, fp, #8 pop {r4,fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/symv_L.c000066400000000000000000000044361313527062700166270ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { BLASLONG i; BLASLONG ix,iy; BLASLONG jx,jy; BLASLONG j; FLOAT temp1; FLOAT temp2; #if 0 if ( m != offset ) printf("Symv_L: m=%d offset=%d\n",m,offset); #endif jx = 0; jy = 0; for (j=0; j #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT maxf; BLASLONG inc_x2; if (n <= 0 || inc_x <= 0) return(0.0); inc_x2 = 2 * inc_x; maxf = CABS1(x,0); ix += inc_x2; i++; while(i < n) { if( CABS1(x,ix) > maxf ) { maxf = CABS1(x,ix); } ix += inc_x2; i++; } return(maxf); } OpenBLAS-0.2.20/kernel/arm/zamin.c000066400000000000000000000045701313527062700164730ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2013/09/14 Saar * BLASTEST float : OK * BLASTEST double : OK * CTEST : NoTest * TEST : NoTest * **************************************************************************************/ #include "common.h" #include #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT minf; BLASLONG inc_x2; if (n <= 0 || inc_x <= 0) return(0.0); inc_x2 = 2 * inc_x; minf = CABS1(x,0); ix += inc_x2; i++; while(i < n) { if( CABS1(x,ix) < minf ) { minf = CABS1(x,ix); } ix += inc_x2; i++; } return(minf); } OpenBLAS-0.2.20/kernel/arm/zasum.c000066400000000000000000000044401313527062700165100ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/09/14 Saar * BLASTEST float : OK * BLASTEST double : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #include "common.h" #include #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; FLOAT sumf = 0.0; BLASLONG inc_x2; if (n <= 0 || inc_x <= 0) return(sumf); inc_x2 = 2 * inc_x; n *= inc_x2; while(i < n) { sumf += CABS1(x,i); i += inc_x2; } return(sumf); } OpenBLAS-0.2.20/kernel/arm/zaxpby.c000066400000000000000000000060611313527062700166670ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /*************************************************************************** * 2014/06/07 Saar * ***************************************************************************/ #include "common.h" int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix,iy; FLOAT temp; BLASLONG inc_x2, inc_y2; if ( n <= 0 ) return(0); ix = 0; iy = 0; inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; if ( beta_r == 0.0 && beta_i == 0.0) { if ( alpha_r == 0.0 && alpha_i == 0.0 ) { while(i < n) { y[iy] = 0.0 ; y[iy+1] = 0.0 ; iy += inc_y2 ; i++ ; } } else { while(i < n) { y[iy] = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) ; y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) ; ix += inc_x2 ; iy += inc_y2 ; i++ ; } } } else { if ( alpha_r == 0.0 && alpha_i == 0.0 ) { while(i < n) { temp = ( beta_r * y[iy] - beta_i * y[iy+1] ) ; y[iy+1] = ( beta_r * y[iy+1] + beta_i * y[iy] ) ; y[iy] = temp; iy += inc_y2 ; i++ ; } } else { while(i < n) { temp = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) + ( beta_r * y[iy] - beta_i * y[iy+1] ) ; y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) + ( beta_r * y[iy+1] + beta_i * y[iy] ) ; y[iy] = temp; ix += inc_x2 ; iy += inc_y2 ; i++ ; } } } return(0); } OpenBLAS-0.2.20/kernel/arm/zaxpy.c000066400000000000000000000051111313527062700165200ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/09/15 Saar * BLASTEST float : OK * BLASTEST double : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #include "common.h" int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; BLASLONG ix,iy; BLASLONG inc_x2; BLASLONG inc_y2; if ( n < 0 ) return(0); if ( da_r == 0.0 && da_i == 0.0 ) return(0); ix = 0; iy = 0; inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; while(i < n) { #if !defined(CONJ) y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; #else y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; #endif ix += inc_x2 ; iy += inc_y2 ; i++ ; } return(0); } OpenBLAS-0.2.20/kernel/arm/zcopy.c000066400000000000000000000043571313527062700165240ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/09/14 Saar * BLASTEST float : OK * BLASTEST double : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #include "common.h" int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix=0,iy=0; BLASLONG inc_x2; BLASLONG inc_y2; if ( n < 0 ) return(0); inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; while(i < n) { y[iy] = x[ix] ; y[iy+1] = x[ix+1] ; ix += inc_x2; iy += inc_y2; i++ ; } return(0); } OpenBLAS-0.2.20/kernel/arm/zcopy_vfp.S000066400000000000000000000112671313527062700173550ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2013/11/07 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define N r0 #define X r1 #define INC_X r2 #define OLD_Y r3 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define OLD_INC_Y [fp, #4 ] #define I r5 #define Y r6 #define INC_Y r7 #define X_PRE 256 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro COPY_F4 pld [ X, #X_PRE ] pld [ X, #X_PRE+32 ] fldmiad X!, { d0 - d7 } fstmiad Y!, { d0 - d7 } .endm .macro COPY_F1 fldmiad X!, { d0 - d1 } fstmiad Y!, { d0 - d1 } .endm /*************************************************************************************************************************/ .macro COPY_S4 nop fldmiad X, { d0 - d1 } fstmiad Y, { d0 - d1 } add X, X, INC_X add Y, Y, INC_Y fldmiad X, { d2 - d3 } fstmiad Y, { d2 - d3 } add X, X, INC_X add Y, Y, INC_Y fldmiad X, { d0 - d1 } fstmiad Y, { d0 - d1 } add X, X, INC_X add Y, Y, INC_Y fldmiad X, { d2 - d3 } fstmiad Y, { d2 - d3 } add X, X, INC_X add Y, Y, INC_Y .endm .macro COPY_S1 fldmiad X, { d0 - d1 } fstmiad Y, { d0 - d1 } add X, X, INC_X add Y, Y, INC_Y .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack sub r4, fp, #128 vstm r4, { d8 - d15} // store floating point registers mov Y, OLD_Y ldr INC_Y, OLD_INC_Y cmp N, #0 ble zcopy_kernel_L999 cmp INC_X, #0 beq zcopy_kernel_L999 cmp INC_Y, #0 beq zcopy_kernel_L999 cmp INC_X, #1 bne zcopy_kernel_S_BEGIN cmp INC_Y, #1 bne zcopy_kernel_S_BEGIN zcopy_kernel_F_BEGIN: asrs I, N, #2 // I = N / 4 ble zcopy_kernel_F1 zcopy_kernel_F4: COPY_F4 subs I, I, #1 bne zcopy_kernel_F4 zcopy_kernel_F1: ands I, N, #3 ble zcopy_kernel_L999 zcopy_kernel_F10: COPY_F1 subs I, I, #1 bne zcopy_kernel_F10 b zcopy_kernel_L999 zcopy_kernel_S_BEGIN: lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 asrs I, N, #2 // I = N / 4 ble zcopy_kernel_S1 zcopy_kernel_S4: COPY_S4 subs I, I, #1 bne zcopy_kernel_S4 zcopy_kernel_S1: ands I, N, #3 ble zcopy_kernel_L999 zcopy_kernel_S10: COPY_S1 subs I, I, #1 bne zcopy_kernel_S10 zcopy_kernel_L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers mov r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/zdot.c000066400000000000000000000052131313527062700163300ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/09/14 Saar * BLASTEST float : FAIL * BLASTEST double : FAIL * CTEST : OK * TEST : OK * **************************************************************************************/ #include "common.h" OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix=0,iy=0; FLOAT dot[2]; OPENBLAS_COMPLEX_FLOAT result; BLASLONG inc_x2; BLASLONG inc_y2; dot[0]=0.0; dot[1]=0.0; CREAL(result) = 0.0 ; CIMAG(result) = 0.0 ; if ( n < 1 ) return(result); inc_x2 = 2 * inc_x ; inc_y2 = 2 * inc_y ; while(i < n) { #if !defined(CONJ) dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ; dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ; #else dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ; dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ; #endif ix += inc_x2 ; iy += inc_y2 ; i++ ; } CREAL(result) = dot[0]; CIMAG(result) = dot[1]; return(result); } OpenBLAS-0.2.20/kernel/arm/zdot_vfp.S000066400000000000000000000150051313527062700171630ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/11/11 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define N r0 #define X r1 #define INC_X r2 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #if !defined(__ARM_PCS_VFP) #define OLD_RETURN_ADDR r0 #define OLD_N r1 #define OLD_X r2 #define OLD_INC_X r3 #define OLD_Y [fp, #0 ] #define OLD_INC_Y [fp, #4 ] #define RETURN_ADDR r8 #else #define OLD_Y r3 #define OLD_INC_Y [fp, #0 ] #endif #define I r5 #define Y r6 #define INC_Y r7 #define X_PRE 512 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro KERNEL_F4 pld [ X, #X_PRE ] pld [ Y, #X_PRE ] fldmiad X!, { d4 - d5 } fldmiad Y!, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fldmiad X!, { d6 - d7 } fmacd d2 , d5, d9 fmacd d3 , d5, d8 fldmiad Y!, { d10 - d11 } fmacd d0 , d6, d10 fmacd d1 , d6, d11 pld [ X, #X_PRE ] fmacd d2 , d7, d11 fmacd d3 , d7, d10 pld [ Y, #X_PRE ] fldmiad X!, { d4 - d5 } fldmiad Y!, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fldmiad X!, { d6 - d7 } fmacd d2 , d5, d9 fmacd d3 , d5, d8 fldmiad Y!, { d10 - d11 } fmacd d0 , d6, d10 fmacd d1 , d6, d11 fmacd d2 , d7, d11 fmacd d3 , d7, d10 .endm .macro KERNEL_F1 fldmiad X!, { d4 - d5 } fldmiad Y!, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 fmacd d3 , d5, d8 .endm /*************************************************************************************************************************/ .macro KERNEL_S4 nop fldmiad X, { d4 - d5 } fldmiad Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 fmacd d3 , d5, d8 add X, X, INC_X add Y, Y, INC_Y fldmiad X, { d4 - d5 } fldmiad Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 fmacd d3 , d5, d8 add X, X, INC_X add Y, Y, INC_Y fldmiad X, { d4 - d5 } fldmiad Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 fmacd d3 , d5, d8 add X, X, INC_X add Y, Y, INC_Y fldmiad X, { d4 - d5 } fldmiad Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 fmacd d3 , d5, d8 add X, X, INC_X add Y, Y, INC_Y .endm .macro KERNEL_S1 fldmiad X, { d4 - d5 } fldmiad Y, { d8 - d9 } fmacd d0 , d4, d8 fmacd d1 , d4, d9 fmacd d2 , d5, d9 fmacd d3 , d5, d8 add X, X, INC_X add Y, Y, INC_Y .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #28 sub 
sp, sp, #STACKSIZE // reserve stack sub r4, fp, #128 vstm r4, { d8 - d15} // store floating point registers movs r4, #0 // clear floating point register vmov s0, r4 vcvt.f64.f32 d0, s0 vcvt.f64.f32 d1, s0 vcvt.f64.f32 d2, s0 vcvt.f64.f32 d3, s0 #if !defined(__ARM_PCS_VFP) mov RETURN_ADDR, OLD_RETURN_ADDR mov N, OLD_N mov X, OLD_X mov INC_X, OLD_INC_X ldr Y, OLD_Y ldr INC_Y, OLD_INC_Y #else mov Y, OLD_Y ldr INC_Y, OLD_INC_Y #endif cmp N, #0 ble zdot_kernel_L999 cmp INC_X, #0 beq zdot_kernel_L999 cmp INC_Y, #0 beq zdot_kernel_L999 cmp INC_X, #1 bne zdot_kernel_S_BEGIN cmp INC_Y, #1 bne zdot_kernel_S_BEGIN zdot_kernel_F_BEGIN: asrs I, N, #2 // I = N / 4 ble zdot_kernel_F1 zdot_kernel_F4: KERNEL_F4 subs I, I, #1 bne zdot_kernel_F4 zdot_kernel_F1: ands I, N, #3 ble zdot_kernel_L999 zdot_kernel_F10: KERNEL_F1 subs I, I, #1 bne zdot_kernel_F10 b zdot_kernel_L999 zdot_kernel_S_BEGIN: lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 asrs I, N, #2 // I = N / 4 ble zdot_kernel_S1 zdot_kernel_S4: KERNEL_S4 subs I, I, #1 bne zdot_kernel_S4 zdot_kernel_S1: ands I, N, #3 ble zdot_kernel_L999 zdot_kernel_S10: KERNEL_S1 subs I, I, #1 bne zdot_kernel_S10 zdot_kernel_L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers #if !defined(CONJ) vsub.f64 d0 , d0, d2 vadd.f64 d1 , d1, d3 #else vadd.f64 d0 , d0, d2 vsub.f64 d1 , d1, d3 #endif #if !defined(__ARM_PCS_VFP) vstm RETURN_ADDR, {d0 - d1} #endif sub sp, fp, #28 pop {r4 - r9, fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/zgemm_kernel_2x2_vfp.S000066400000000000000000000475011313527062700213630ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2013/11/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * ***************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define OLD_M r0 #define OLD_N r1 #define OLD_K r2 #define OLD_A r3 #define OLD_ALPHA_R d0 #define OLD_ALPHA_I d1 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define A [fp, #-248 ] #define LDC [fp, #-252 ] #define M [fp, #-256 ] #define N [fp, #-260 ] #define K [fp, #-264 ] #define FP_ZERO [fp, #-240] #define FP_ZERO_0 [fp, # -240] #define FP_ZERO_1 [fp, # -236] #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] #if !defined(__ARM_PCS_VFP) #define OLD_ALPHAR_SOFTFP [fp, #4] #define OLD_ALPHAI_SOFTFP [fp, #12] #define OLD_A_SOFTFP [fp, #20 ] #define B [fp, #24 ] #define C [fp, #28 ] #define OLD_LDC [fp, #32 ] #else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #endif #define I r0 #define J r1 #define L r2 #define AO r5 #define BO r6 #define CO1 r8 #define CO2 r9 #define K1 r7 #define BC r12 #define A_PRE 96 #define B_PRE 96 #define C_PRE 64 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CN) || defined(CT) #define KMAC_R fmacd #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) #define KMAC_R fmacd #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif /************************************************************************************** * Macro definitions **************************************************************************************/ .macro INIT2x2 fldd d8 , FP_ZERO vmov.f64 d9 , d8 vmov.f64 d10, d8 vmov.f64 d11, d8 vmov.f64 d12, d8 vmov.f64 d13, d8 vmov.f64 d14, d8 vmov.f64 d15, d8 .endm .macro KERNEL2x2_I pld [ AO, #A_PRE ] pld [ BO, #B_PRE ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fldd d6 , [ BO, #16 ] fldd d7 , [ BO, #24 ] fmuld d8 , d0, d4 KMAC_R d8 , d1, d5 fmuld d9 , d0, d5 KMAC_I d9 , d1, d4 fmuld d10 , d2, d4 KMAC_R d10 , d3, d5 fmuld d11 , d2, d5 KMAC_I d11 , d3, d4 fmuld d12 , d0, d6 KMAC_R d12 , d1, d7 fmuld d13 , d0, d7 KMAC_I d13 , d1, d6 fmuld d14 , d2, d6 KMAC_R d14 , d3, d7 fmuld d15 , d2, d7 KMAC_I d15 , d3, d6 add BO , BO, #32 add AO , AO, #32 .endm .macro KERNEL2x2_M1 fldd d0 , [ AO ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 fldd d1 , [ AO, #8 ] fmacd d9 , d0, d5 fldd d2 , [ AO, #16 ] KMAC_R d8 , d1, d5 fldd d3 , [ AO, #24 ] KMAC_I d9 , d1, d4 fldd d6 , [ BO, #16 ] fmacd d10 , d2, d4 fldd d7 , [ BO, #24 ] fmacd d11 , d2, d5 KMAC_R d10 , d3, d5 pld [ AO, #A_PRE ] KMAC_I d11 , d3, d4 pld [ BO, #B_PRE ] fmacd d12 , d0, d6 fmacd d13 , d0, d7 KMAC_R d12 , d1, d7 KMAC_I d13 , d1, d6 fmacd d14 , d2, d6 fmacd d15 , d2, d7 add BO , BO, #32 KMAC_R d14 , d3, d7 add 
AO , AO, #32 KMAC_I d15 , d3, d6 .endm .macro KERNEL2x2_M2 fldd d0 , [ AO ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 fldd d1 , [ AO, #8 ] fmacd d9 , d0, d5 fldd d2 , [ AO, #16 ] KMAC_R d8 , d1, d5 fldd d3 , [ AO, #24 ] KMAC_I d9 , d1, d4 fldd d6 , [ BO, #16 ] fmacd d10 , d2, d4 fldd d7 , [ BO, #24 ] fmacd d11 , d2, d5 KMAC_R d10 , d3, d5 pld [ AO, #A_PRE ] KMAC_I d11 , d3, d4 pld [ BO, #B_PRE ] fmacd d12 , d0, d6 fmacd d13 , d0, d7 KMAC_R d12 , d1, d7 KMAC_I d13 , d1, d6 fmacd d14 , d2, d6 fmacd d15 , d2, d7 add BO , BO, #32 KMAC_R d14 , d3, d7 add AO , AO, #32 KMAC_I d15 , d3, d6 .endm .macro KERNEL2x2_E fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fldd d6 , [ BO, #16 ] fldd d7 , [ BO, #24 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 fmacd d10 , d2, d4 KMAC_R d10 , d3, d5 fmacd d11 , d2, d5 KMAC_I d11 , d3, d4 fmacd d12 , d0, d6 KMAC_R d12 , d1, d7 fmacd d13 , d0, d7 KMAC_I d13 , d1, d6 fmacd d14 , d2, d6 KMAC_R d14 , d3, d7 fmacd d15 , d2, d7 KMAC_I d15 , d3, d6 add BO , BO, #32 add AO , AO, #32 .endm .macro KERNEL2x2_SUB fldd d0 , [ AO ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 fldd d1 , [ AO, #8 ] fmacd d9 , d0, d5 fldd d2 , [ AO, #16 ] KMAC_R d8 , d1, d5 fldd d3 , [ AO, #24 ] KMAC_I d9 , d1, d4 fldd d6 , [ BO, #16 ] fmacd d10 , d2, d4 fldd d7 , [ BO, #24 ] fmacd d11 , d2, d5 KMAC_R d10 , d3, d5 pld [ AO, #A_PRE ] KMAC_I d11 , d3, d4 pld [ BO, #B_PRE ] fmacd d12 , d0, d6 fmacd d13 , d0, d7 KMAC_R d12 , d1, d7 KMAC_I d13 , d1, d6 fmacd d14 , d2, d6 fmacd d15 , d2, d7 add BO , BO, #32 KMAC_R d14 , d3, d7 add AO , AO, #32 KMAC_I d15 , d3, d6 .endm .macro SAVE2x2 ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA_R fldd d1, ALPHA_I fldmiad CO1, { d4 - d7 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 FMAC_R1 d6 , d0 , d10 FMAC_I1 d7 , d0 , d11 FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 fstmiad CO1, { d4 - d7 } fldmiad CO2, { d4 - d7 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 FMAC_R1 d6 , d0 , d14 FMAC_I1 d7 , d0 , d15 FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 fstmiad CO2, { d4 - d7 } add CO1, CO1, #32 .endm /******************************************************************************/ .macro INIT1x2 fldd d8 , FP_ZERO vmov.f64 d9 , d8 vmov.f64 d12, d8 vmov.f64 d13, d8 .endm .macro KERNEL1x2_I fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fldd d6 , [ BO, #16 ] fldd d7 , [ BO, #24 ] fmuld d8 , d0, d4 KMAC_R d8 , d1, d5 fmuld d9 , d0, d5 KMAC_I d9 , d1, d4 fmuld d12 , d0, d6 KMAC_R d12 , d1, d7 fmuld d13 , d0, d7 KMAC_I d13 , d1, d6 add BO , BO, #32 add AO , AO, #16 .endm .macro KERNEL1x2_M1 fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fldd d6 , [ BO, #16 ] fldd d7 , [ BO, #24 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 fmacd d12 , d0, d6 KMAC_R d12 , d1, d7 fmacd d13 , d0, d7 KMAC_I d13 , d1, d6 add BO , BO, #32 add AO , AO, #16 .endm .macro KERNEL1x2_M2 fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fldd d6 , [ BO, #16 ] fldd d7 , [ BO, #24 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 fmacd d12 , d0, d6 KMAC_R d12 , d1, d7 fmacd d13 , d0, d7 KMAC_I d13 , d1, d6 add BO , BO, #32 add AO , AO, #16 .endm .macro KERNEL1x2_E fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fldd d6 , [ BO, #16 ] fldd d7 
, [ BO, #24 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 fmacd d12 , d0, d6 KMAC_R d12 , d1, d7 fmacd d13 , d0, d7 KMAC_I d13 , d1, d6 add BO , BO, #32 add AO , AO, #16 .endm .macro KERNEL1x2_SUB fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fldd d6 , [ BO, #16 ] fldd d7 , [ BO, #24 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 fmacd d12 , d0, d6 KMAC_R d12 , d1, d7 fmacd d13 , d0, d7 KMAC_I d13 , d1, d6 add BO , BO, #32 add AO , AO, #16 .endm .macro SAVE1x2 ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA_R fldd d1, ALPHA_I fldmiad CO1, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 fstmiad CO1, { d4 - d5 } fldmiad CO2, { d4 - d5 } FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 fstmiad CO2, { d4 - d5 } add CO1, CO1, #16 .endm /******************************************************************************/ .macro INIT2x1 fldd d8 , FP_ZERO vmov.f64 d9 , d8 vmov.f64 d10, d8 vmov.f64 d11, d8 .endm .macro KERNEL2x1_I fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmuld d8 , d0, d4 KMAC_R d8 , d1, d5 fmuld d9 , d0, d5 KMAC_I d9 , d1, d4 fmuld d10 , d2, d4 KMAC_R d10 , d3, d5 fmuld d11 , d2, d5 KMAC_I d11 , d3, d4 add BO , BO, #16 add AO , AO, #32 .endm .macro KERNEL2x1_M1 fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 fmacd d10 , d2, d4 KMAC_R d10 , d3, d5 fmacd d11 , d2, d5 KMAC_I d11 , d3, d4 add BO , BO, #16 add AO , AO, #32 .endm .macro KERNEL2x1_M2 fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 fmacd d10 , d2, d4 KMAC_R d10 , d3, d5 fmacd d11 , d2, d5 KMAC_I d11 , d3, d4 add BO , BO, #16 add AO , AO, #32 .endm .macro KERNEL2x1_E fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 fmacd d10 , d2, d4 KMAC_R d10 , d3, d5 fmacd d11 , d2, d5 KMAC_I d11 , d3, d4 add BO , BO, #16 add AO , AO, #32 .endm .macro KERNEL2x1_SUB fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 fmacd d10 , d2, d4 KMAC_R d10 , d3, d5 fmacd d11 , d2, d5 KMAC_I d11 , d3, d4 add BO , BO, #16 add AO , AO, #32 .endm .macro SAVE2x1 fldd d0, ALPHA_R fldd d1, ALPHA_I fldmiad CO1, { d4 - d7 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 FMAC_R1 d6 , d0 , d10 FMAC_I1 d7 , d0 , d11 FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 fstmiad CO1, { d4 - d7 } add CO1, CO1, #32 .endm /******************************************************************************/ .macro INIT1x1 fldd d8 , FP_ZERO vmov.f64 d9 , d8 .endm .macro KERNEL1x1_I fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmuld d8 , d0, d4 KMAC_R d8 , d1, d5 fmuld d9 , d0, d5 KMAC_I d9 , d1, d4 add BO , BO, #16 add AO , AO, #16 .endm .macro KERNEL1x1_M1 fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, 
d4 add BO , BO, #16 add AO , AO, #16 .endm .macro KERNEL1x1_M2 fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 add BO , BO, #16 add AO , AO, #16 .endm .macro KERNEL1x1_E fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 add BO , BO, #16 add AO , AO, #16 .endm .macro KERNEL1x1_SUB fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 add BO , BO, #16 add AO , AO, #16 .endm .macro SAVE1x1 fldd d0, ALPHA_R fldd d1, ALPHA_I fldmiad CO1, { d4 - d5 } FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 fstmiad CO1, { d4 - d5 } add CO1, CO1, #16 .endm /******************************************************************************/ /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack #if !defined(__ARM_PCS_VFP) vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP ldr OLD_A, OLD_A_SOFTFP #endif str OLD_M, M str OLD_N, N str OLD_K, K str OLD_A, A vstr OLD_ALPHA_R, ALPHA_R vstr OLD_ALPHA_I, ALPHA_I sub r3, fp, #128 vstm r3, { d8 - d15} // store floating point registers movs r4, #0 str r4, FP_ZERO str r4, FP_ZERO_1 ldr r3, OLD_LDC lsl r3, r3, #4 // ldc = ldc * 8 * 2 str r3, LDC ldr K1, K ldr BC, B ldr J, N asrs J, J, #1 // J = J / 2 ble zgemm_kernel_L1_BEGIN zgemm_kernel_L2_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #1 // LDC * 2 add r3 , r4, CO1 str r3 , C // store C ldr AO, A // AO = A pld [AO , #A_PRE-64] pld [AO , #A_PRE-32] zgemm_kernel_L2_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 ble zgemm_kernel_L2_M1_BEGIN zgemm_kernel_L2_M2_20: mov BO, BC asrs L , K1, #3 // L = L / 8 cmp L , #3 blt zgemm_kernel_L2_M2_30 .align 5 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 sub L, L, #2 zgemm_kernel_L2_M2_22: KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 subs L, L, #1 bgt zgemm_kernel_L2_M2_22 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b zgemm_kernel_L2_M2_44 zgemm_kernel_L2_M2_30: tst L, #3 ble zgemm_kernel_L2_M2_40 tst L, #2 ble zgemm_kernel_L2_M2_32 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b zgemm_kernel_L2_M2_44 zgemm_kernel_L2_M2_32: tst L, #1 ble zgemm_kernel_L2_M2_40 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b zgemm_kernel_L2_M2_44 zgemm_kernel_L2_M2_40: INIT2x2 zgemm_kernel_L2_M2_44: ands L , K1, #7 // L = L % 8 ble zgemm_kernel_L2_M2_100 zgemm_kernel_L2_M2_46: KERNEL2x2_SUB subs L, L, #1 bne zgemm_kernel_L2_M2_46 zgemm_kernel_L2_M2_100: SAVE2x2 zgemm_kernel_L2_M2_END: subs I, I, #1 bne zgemm_kernel_L2_M2_20 zgemm_kernel_L2_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 ble zgemm_kernel_L2_END zgemm_kernel_L2_M1_20: INIT1x2 mov BO, BC asrs L , K1, #3 // L = L / 8 ble zgemm_kernel_L2_M1_40 zgemm_kernel_L2_M1_22: 
KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs L, L, #1 bgt zgemm_kernel_L2_M1_22 zgemm_kernel_L2_M1_40: ands L , K1, #7 // L = L % 8 ble zgemm_kernel_L2_M1_100 zgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs L, L, #1 bgt zgemm_kernel_L2_M1_42 zgemm_kernel_L2_M1_100: SAVE1x2 zgemm_kernel_L2_END: mov r3, BC mov r4, K1 lsl r4, r4, #5 // k * 2 * 8 * 2 add r3, r3, r4 // B = B + K * 4 * 8 mov BC, r3 subs J , #1 // j-- bgt zgemm_kernel_L2_BEGIN /*********************************************************************************************/ zgemm_kernel_L1_BEGIN: ldr J , N tst J , #1 ble zgemm_kernel_L999 ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 str r3 , C // store C ldr AO, A // AO = A zgemm_kernel_L1_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 ble zgemm_kernel_L1_M1_BEGIN zgemm_kernel_L1_M2_20: mov BO, BC asrs L , K1, #3 // L = L / 8 cmp L , #3 blt zgemm_kernel_L1_M2_30 .align 5 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 sub L, L, #2 zgemm_kernel_L1_M2_22: KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 subs L, L, #1 bgt zgemm_kernel_L1_M2_22 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b zgemm_kernel_L1_M2_44 zgemm_kernel_L1_M2_30: tst L, #3 ble zgemm_kernel_L1_M2_40 tst L, #2 ble zgemm_kernel_L1_M2_32 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b zgemm_kernel_L1_M2_44 zgemm_kernel_L1_M2_32: tst L, #1 ble zgemm_kernel_L1_M2_40 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b zgemm_kernel_L1_M2_44 zgemm_kernel_L1_M2_40: INIT2x1 zgemm_kernel_L1_M2_44: ands L , K1, #7 // L = L % 8 ble zgemm_kernel_L1_M2_100 zgemm_kernel_L1_M2_46: KERNEL2x1_SUB subs L, L, #1 bne zgemm_kernel_L1_M2_46 zgemm_kernel_L1_M2_100: SAVE2x1 zgemm_kernel_L1_M2_END: subs I, I, #1 bne zgemm_kernel_L1_M2_20 zgemm_kernel_L1_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 ble zgemm_kernel_L1_END zgemm_kernel_L1_M1_20: INIT1x1 mov BO, BC asrs L , K1, #3 // L = L / 8 ble zgemm_kernel_L1_M1_40 zgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs L, L, #1 bgt zgemm_kernel_L1_M1_22 zgemm_kernel_L1_M1_40: ands L , K1, #7 // L = L % 8 ble zgemm_kernel_L1_M1_100 zgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs L, L, #1 bgt zgemm_kernel_L1_M1_42 zgemm_kernel_L1_M1_100: SAVE1x1 zgemm_kernel_L1_END: zgemm_kernel_L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers movs r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/zgemm_kernel_2x2_vfpv3.S000066400000000000000000000540121313527062700216270ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/11/05 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * 2013/11/02 Saar * UNROLL_N 2 * UNROLL_M 2 * ZGEMM_P 64 * ZGEMM_Q 120 * ZGEMM_R 4096 * A_PRE 96 * B_PRE 96 * C_PRE 64 * * Performance on Odroid U2: * * 1 Core: 1.62 GFLOPS ATLAS: 1.39 GFLOPS * 2 Cores: 3.20 GFLOPS ATLAS: 2.54 GFLOPS * 3 Cores: 4.72 GFLOPS ATLAS: 3.76 GFLOPS * 4 Cores: 5.93 GFLOPS ATLAS: 4.88 GFLOPS **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define OLD_M r0 #define OLD_N r1 #define OLD_K r2 #define OLD_A r3 #define OLD_ALPHA_R d0 #define OLD_ALPHA_I d1 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define A [fp, #-248 ] #define LDC [fp, #-252 ] #define M [fp, #-256 ] #define N [fp, #-260 ] #define K [fp, #-264 ] #define FP_ZERO [fp, #-240] #define FP_ZERO_0 [fp, # -240] #define FP_ZERO_1 [fp, # -236] #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] #if !defined(__ARM_PCS_VFP) #define OLD_ALPHAR_SOFTFP [fp, #4] #define OLD_ALPHAI_SOFTFP [fp, #12] #define OLD_A_SOFTFP [fp, #20 ] #define B [fp, #24 ] #define C [fp, #28 ] #define OLD_LDC [fp, #32 ] #else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #endif #define I r0 #define J r1 #define L r2 #define AO r5 #define BO r6 #define CO1 r8 #define CO2 r9 #define K1 r7 #define BC r12 #define A_PRE 96 #define B_PRE 96 #define C_PRE 64 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define FADD_R fsubd #define FADD_I faddd #define FMAC_R1 vmls.f64 #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 vmls.f64 #elif defined(CN) || defined(CT) #define FADD_R faddd #define FADD_I fsubd #define FMAC_R1 fmacd #define FMAC_R2 fmacd #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) #define FADD_R faddd #define FADD_I fsubd #define FMAC_R1 fmacd #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #else #define FADD_R fsubd #define FADD_I faddd #define FMAC_R1 vmls.f64 #define FMAC_R2 fmacd #define FMAC_I1 vmls.f64 #define FMAC_I2 
vmls.f64 #endif /************************************************************************************** * Macro definitions **************************************************************************************/ .macro INIT2x2 fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d18, d16 vmov.f64 d19, d16 vmov.f64 d20, d16 vmov.f64 d21, d16 vmov.f64 d22, d16 vmov.f64 d23, d16 vmov.f64 d24, d16 vmov.f64 d25, d16 vmov.f64 d26, d16 vmov.f64 d27, d16 vmov.f64 d28, d16 vmov.f64 d29, d16 vmov.f64 d30, d16 vmov.f64 d31, d16 .endm .macro KERNEL2x2_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fmuld d16 , d0, d8 fldd d2 , [ AO, #16 ] fmuld d24 , d1, d9 fldd d3 , [ AO, #24 ] fmuld d17 , d0, d9 fldd d10, [ BO, #16 ] fmuld d25 , d1, d8 fldd d11, [ BO, #24 ] fmuld d18 , d2, d8 add BO , BO, #32 fmuld d26 , d3, d9 add AO , AO, #32 fmuld d19 , d2, d9 pld [ BO , #B_PRE ] fmuld d27 , d3, d8 pld [ AO , #A_PRE ] fmuld d20 , d0, d10 fldd d4 , [ AO, #0 ] fmuld d28 , d1, d11 fldd d5 , [ AO, #8 ] fmuld d21 , d0, d11 fldd d12, [ BO ] fmuld d29 , d1, d10 fldd d13, [ BO, #8 ] fmuld d22 , d2, d10 fldd d6 , [ AO, #16 ] fmuld d30 , d3, d11 fldd d7 , [ AO, #24 ] fmuld d23 , d2, d11 fldd d14, [ BO, #16 ] fmuld d31 , d3, d10 fldd d15, [ BO, #24 ] add BO , BO, #32 add AO , AO, #32 .endm .macro KERNEL2x2_M1 pld [ AO , #A_PRE ] fmacd d16 , d0, d8 pld [ BO , #B_PRE ] fmacd d24 , d1, d9 fldd d4 , [ AO, #0 ] fmacd d17 , d0, d9 fldd d5 , [ AO, #8 ] fmacd d25 , d1, d8 fldd d12, [ BO ] fmacd d18 , d2, d8 fldd d13, [ BO, #8 ] fmacd d26 , d3, d9 fldd d6 , [ AO, #16 ] fmacd d19 , d2, d9 fldd d7 , [ AO, #24 ] fmacd d27 , d3, d8 fmacd d20 , d0, d10 fldd d14, [ BO, #16 ] fmacd d28 , d1, d11 fmacd d21 , d0, d11 fldd d15, [ BO, #24 ] fmacd d29 , d1, d10 fmacd d22 , d2, d10 add BO , BO, #32 fmacd d30 , d3, d11 fmacd d23 , d2, d11 add AO , AO, #32 fmacd d31 , d3, d10 .endm .macro KERNEL2x2_M2 pld [ AO , #A_PRE ] fmacd d16 , d4, d12 pld [ BO , #B_PRE ] fmacd d24 , d5, d13 fldd d0 , [ AO, #0 ] fmacd d17 , d4, d13 fldd d1 , [ AO, #8 ] fmacd d25 , d5, d12 fmacd d18 , d6, d12 fldd d8 , [ BO ] fmacd d26 , d7, d13 fldd d9 , [ BO, #8 ] fmacd d19 , d6, d13 fmacd d27 , d7, d12 fldd d2 , [ AO, #16 ] fmacd d20 , d4, d14 fldd d3 , [ AO, #24 ] fmacd d28 , d5, d15 fmacd d21 , d4, d15 fldd d10, [ BO, #16 ] fmacd d29 , d5, d14 fldd d11, [ BO, #24 ] fmacd d22 , d6, d14 fmacd d30 , d7, d15 add BO , BO, #32 fmacd d23 , d6, d15 add AO , AO, #32 fmacd d31 , d7, d14 .endm .macro KERNEL2x2_E fmacd d16 , d4, d12 fmacd d24 , d5, d13 fmacd d17 , d4, d13 fmacd d25 , d5, d12 fmacd d18 , d6, d12 fmacd d26 , d7, d13 fmacd d19 , d6, d13 fmacd d27 , d7, d12 fmacd d20 , d4, d14 fmacd d28 , d5, d15 fmacd d21 , d4, d15 fmacd d29 , d5, d14 fmacd d22 , d6, d14 fmacd d30 , d7, d15 fmacd d23 , d6, d15 fmacd d31 , d7, d14 .endm .macro KERNEL2x2_SUB pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fmacd d16 , d0, d8 fldd d2 , [ AO, #16 ] fmacd d24 , d1, d9 fldd d3 , [ AO, #24 ] fmacd d17 , d0, d9 fldd d10, [ BO, #16 ] fmacd d25 , d1, d8 fldd d11, [ BO, #24 ] fmacd d18 , d2, d8 fmacd d26 , d3, d9 fmacd d19 , d2, d9 fmacd d27 , d3, d8 fmacd d20 , d0, d10 fmacd d28 , d1, d11 fmacd d21 , d0, d11 fmacd d29 , d1, d10 fmacd d22 , d2, d10 add BO , BO, #32 fmacd d30 , d3, d11 fmacd d23 , d2, d11 add AO , AO, #32 fmacd d31 , d3, d10 .endm .macro SAVE2x2 pld [ CO1 , #C_PRE ] ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA_R fldd d1, ALPHA_I fldmiad CO1, { d4 - d7 } 
fldmiad CO2, { d8 - d11 } FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 FADD_R d18, d26 , d18 FADD_I d19, d27 , d19 FADD_R d20, d28 , d20 FADD_I d21, d29 , d21 FADD_R d22, d30 , d22 FADD_I d23, d31 , d23 FMAC_R1 d4 , d0 , d16 FMAC_I1 d5 , d0 , d17 FMAC_R2 d4 , d1 , d17 FMAC_I2 d5 , d1 , d16 FMAC_R1 d6 , d0 , d18 FMAC_I1 d7 , d0 , d19 FMAC_R2 d6 , d1 , d19 FMAC_I2 d7 , d1 , d18 FMAC_R1 d8 , d0 , d20 FMAC_I1 d9 , d0 , d21 FMAC_R2 d8 , d1 , d21 FMAC_I2 d9 , d1 , d20 FMAC_R1 d10, d0 , d22 FMAC_I1 d11, d0 , d23 FMAC_R2 d10, d1 , d23 FMAC_I2 d11, d1 , d22 fstmiad CO1, { d4 - d7 } fstmiad CO2, { d8 - d11 } add CO1, CO1, #32 .endm /******************************************************************************/ .macro INIT1x2 fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d20, d16 vmov.f64 d21, d16 vmov.f64 d24, d16 vmov.f64 d25, d16 vmov.f64 d28, d16 vmov.f64 d29, d16 .endm .macro KERNEL1x2_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fldd d10, [ BO, #16 ] fldd d11, [ BO, #24 ] fmuld d16 , d0, d8 fmuld d24 , d1, d9 fmuld d17 , d0, d9 fmuld d25 , d1, d8 fmuld d20 , d0, d10 fmuld d28 , d1, d11 fmuld d21 , d0, d11 fmuld d29 , d1, d10 add BO , BO, #32 add AO , AO, #16 pld [ BO , #B_PRE ] fldd d4 , [ AO, #0 ] fldd d5 , [ AO, #8 ] fldd d12, [ BO ] fldd d13, [ BO, #8 ] fldd d14, [ BO, #16 ] fldd d15, [ BO, #24 ] add BO , BO, #32 add AO , AO, #16 .endm .macro KERNEL1x2_M1 pld [ BO , #B_PRE ] fmacd d16 , d0, d8 fmacd d24 , d1, d9 fmacd d17 , d0, d9 fmacd d25 , d1, d8 fmacd d20 , d0, d10 fmacd d28 , d1, d11 fmacd d21 , d0, d11 fmacd d29 , d1, d10 fldd d4 , [ AO, #0 ] fldd d5 , [ AO, #8 ] fldd d12, [ BO ] fldd d13, [ BO, #8 ] fldd d14, [ BO, #16 ] fldd d15, [ BO, #24 ] add BO , BO, #32 add AO , AO, #16 .endm .macro KERNEL1x2_M2 pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fmacd d16 , d4, d12 fmacd d24 , d5, d13 fmacd d17 , d4, d13 fmacd d25 , d5, d12 fmacd d20 , d4, d14 fmacd d28 , d5, d15 fmacd d21 , d4, d15 fmacd d29 , d5, d14 fldd d0 , [ AO, #0 ] fldd d1 , [ AO, #8 ] fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fldd d10, [ BO, #16 ] fldd d11, [ BO, #24 ] add BO , BO, #32 add AO , AO, #16 .endm .macro KERNEL1x2_E fmacd d16 , d4, d12 fmacd d24 , d5, d13 fmacd d17 , d4, d13 fmacd d25 , d5, d12 fmacd d20 , d4, d14 fmacd d28 , d5, d15 fmacd d21 , d4, d15 fmacd d29 , d5, d14 .endm .macro KERNEL1x2_SUB pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fldd d10, [ BO, #16 ] fldd d11, [ BO, #24 ] fmacd d16 , d0, d8 fmacd d24 , d1, d9 fmacd d17 , d0, d9 fmacd d25 , d1, d8 fmacd d20 , d0, d10 fmacd d28 , d1, d11 fmacd d21 , d0, d11 fmacd d29 , d1, d10 add BO , BO, #32 add AO , AO, #16 .endm .macro SAVE1x2 pld [ CO1 , #C_PRE ] ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA_R fldd d1, ALPHA_I fldmiad CO1, { d4 - d5 } fldmiad CO2, { d8 - d9 } FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 FADD_R d20, d28 , d20 FADD_I d21, d29 , d21 FMAC_R1 d4 , d0 , d16 FMAC_I1 d5 , d0 , d17 FMAC_R2 d4 , d1 , d17 FMAC_I2 d5 , d1 , d16 FMAC_R1 d8 , d0 , d20 FMAC_I1 d9 , d0 , d21 FMAC_R2 d8 , d1 , d21 FMAC_I2 d9 , d1 , d20 fstmiad CO1, { d4 - d5 } fstmiad CO2, { d8 - d9 } add CO1, CO1, #16 .endm /******************************************************************************/ .macro INIT2x1 fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d18, d16 vmov.f64 d19, d16 vmov.f64 d24, d16 vmov.f64 d25, d16 vmov.f64 d26, d16 vmov.f64 d27, d16 .endm .macro KERNEL2x1_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fldd d0 , [ AO ] fldd d1 , 
[ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fmuld d16 , d0, d8 fmuld d24 , d1, d9 fmuld d17 , d0, d9 fmuld d25 , d1, d8 fmuld d18 , d2, d8 fmuld d26 , d3, d9 fmuld d19 , d2, d9 fmuld d27 , d3, d8 add BO , BO, #16 add AO , AO, #32 pld [ BO , #B_PRE ] pld [ AO , #A_PRE ] fldd d4 , [ AO, #0 ] fldd d5 , [ AO, #8 ] fldd d6 , [ AO, #16 ] fldd d7 , [ AO, #24 ] fldd d12, [ BO ] fldd d13, [ BO, #8 ] add BO , BO, #16 add AO , AO, #32 .endm .macro KERNEL2x1_M1 pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fmacd d16 , d0, d8 fmacd d24 , d1, d9 fmacd d17 , d0, d9 fmacd d25 , d1, d8 fmacd d18 , d2, d8 fmacd d26 , d3, d9 fmacd d19 , d2, d9 fmacd d27 , d3, d8 fldd d4 , [ AO, #0 ] fldd d5 , [ AO, #8 ] fldd d6 , [ AO, #16 ] fldd d7 , [ AO, #24 ] fldd d12, [ BO ] fldd d13, [ BO, #8 ] add BO , BO, #16 add AO , AO, #32 .endm .macro KERNEL2x1_M2 pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fmacd d16 , d4, d12 fmacd d24 , d5, d13 fmacd d17 , d4, d13 fmacd d25 , d5, d12 fmacd d18 , d6, d12 fmacd d26 , d7, d13 fmacd d19 , d6, d13 fmacd d27 , d7, d12 fldd d0 , [ AO, #0 ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] add BO , BO, #16 add AO , AO, #32 .endm .macro KERNEL2x1_E fmacd d16 , d4, d12 fmacd d24 , d5, d13 fmacd d17 , d4, d13 fmacd d25 , d5, d12 fmacd d18 , d6, d12 fmacd d26 , d7, d13 fmacd d19 , d6, d13 fmacd d27 , d7, d12 .endm .macro KERNEL2x1_SUB pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fmacd d16 , d0, d8 fmacd d24 , d1, d9 fmacd d17 , d0, d9 fmacd d25 , d1, d8 fmacd d18 , d2, d8 fmacd d26 , d3, d9 fmacd d19 , d2, d9 fmacd d27 , d3, d8 add BO , BO, #16 add AO , AO, #32 .endm .macro SAVE2x1 pld [ CO1 , #C_PRE ] fldd d0, ALPHA_R fldd d1, ALPHA_I fldmiad CO1, { d4 - d7 } FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 FADD_R d18, d26 , d18 FADD_I d19, d27 , d19 FMAC_R1 d4 , d0 , d16 FMAC_I1 d5 , d0 , d17 FMAC_R2 d4 , d1 , d17 FMAC_I2 d5 , d1 , d16 FMAC_R1 d6 , d0 , d18 FMAC_I1 d7 , d0 , d19 FMAC_R2 d6 , d1 , d19 FMAC_I2 d7 , d1 , d18 fstmiad CO1, { d4 - d7 } add CO1, CO1, #32 .endm /******************************************************************************/ .macro INIT1x1 fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d24, d16 vmov.f64 d25, d16 .endm .macro KERNEL1x1_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fmuld d16 , d0, d8 fmuld d24 , d1, d9 fmuld d17 , d0, d9 fmuld d25 , d1, d8 add BO , BO, #16 add AO , AO, #16 pld [ BO , #B_PRE ] pld [ AO , #A_PRE ] fldd d4 , [ AO, #0 ] fldd d5 , [ AO, #8 ] fldd d12, [ BO ] fldd d13, [ BO, #8 ] add BO , BO, #16 add AO , AO, #16 .endm .macro KERNEL1x1_M1 fmacd d16 , d0, d8 fmacd d24 , d1, d9 fmacd d17 , d0, d9 fmacd d25 , d1, d8 fldd d4 , [ AO, #0 ] fldd d5 , [ AO, #8 ] fldd d12, [ BO ] fldd d13, [ BO, #8 ] add BO , BO, #16 add AO , AO, #16 .endm .macro KERNEL1x1_M2 fmacd d16 , d4, d12 fmacd d24 , d5, d13 fmacd d17 , d4, d13 fmacd d25 , d5, d12 fldd d0 , [ AO, #0 ] fldd d1 , [ AO, #8 ] fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] add BO , BO, #16 add AO , AO, #16 .endm .macro KERNEL1x1_E fmacd d16 , d4, d12 fmacd d24 , d5, d13 fmacd d17 , d4, d13 fmacd d25 , d5, d12 .endm .macro KERNEL1x1_SUB fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fmacd d16 , d0, d8 fmacd d24 , d1, d9 fmacd d17 , d0, d9 fmacd d25 , d1, d8 add BO , BO, #16 add AO , AO, #16 .endm .macro 
SAVE1x1 pld [ CO1 , #C_PRE ] fldd d0, ALPHA_R fldd d1, ALPHA_I fldmiad CO1, { d4 - d5 } FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 FMAC_R1 d4 , d0 , d16 FMAC_I1 d5 , d0 , d17 FMAC_R2 d4 , d1 , d17 FMAC_I2 d5 , d1 , d16 fstmiad CO1, { d4 - d5 } add CO1, CO1, #16 .endm /******************************************************************************/ /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack #if !defined(__ARM_PCS_VFP) vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP ldr OLD_A, OLD_A_SOFTFP #endif str OLD_M, M str OLD_N, N str OLD_K, K str OLD_A, A vstr OLD_ALPHA_R, ALPHA_R vstr OLD_ALPHA_I, ALPHA_I sub r3, fp, #128 vstm r3, { d8 - d15} // store floating point registers movs r4, #0 str r4, FP_ZERO str r4, FP_ZERO_1 ldr r3, OLD_LDC lsl r3, r3, #4 // ldc = ldc * 8 * 2 str r3, LDC ldr K1, K ldr BC, B ldr J, N asrs J, J, #1 // J = J / 2 ble zgemm_kernel_L1_BEGIN zgemm_kernel_L2_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #1 // LDC * 2 add r3 , r4, CO1 str r3 , C // store C ldr AO, A // AO = A pld [AO , #A_PRE-64] pld [AO , #A_PRE-32] zgemm_kernel_L2_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 ble zgemm_kernel_L2_M1_BEGIN zgemm_kernel_L2_M2_20: mov BO, BC asrs L , K1, #3 // L = L / 8 cmp L , #3 blt zgemm_kernel_L2_M2_30 .align 5 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 sub L, L, #2 zgemm_kernel_L2_M2_22: KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 subs L, L, #1 bgt zgemm_kernel_L2_M2_22 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b zgemm_kernel_L2_M2_44 zgemm_kernel_L2_M2_30: tst L, #3 ble zgemm_kernel_L2_M2_40 tst L, #2 ble zgemm_kernel_L2_M2_32 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b zgemm_kernel_L2_M2_44 zgemm_kernel_L2_M2_32: tst L, #1 ble zgemm_kernel_L2_M2_40 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b zgemm_kernel_L2_M2_44 zgemm_kernel_L2_M2_40: INIT2x2 zgemm_kernel_L2_M2_44: ands L , K1, #7 // L = L % 8 ble zgemm_kernel_L2_M2_100 zgemm_kernel_L2_M2_46: KERNEL2x2_SUB subs L, L, #1 bne zgemm_kernel_L2_M2_46 zgemm_kernel_L2_M2_100: SAVE2x2 zgemm_kernel_L2_M2_END: subs I, I, #1 bne zgemm_kernel_L2_M2_20 zgemm_kernel_L2_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 ble zgemm_kernel_L2_END zgemm_kernel_L2_M1_20: INIT1x2 mov BO, BC asrs L , K1, #3 // L = L / 8 ble zgemm_kernel_L2_M1_40 zgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs L, L, #1 bgt zgemm_kernel_L2_M1_22 zgemm_kernel_L2_M1_40: ands L , K1, #7 // L = L % 8 ble zgemm_kernel_L2_M1_100 zgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs L, L, #1 bgt zgemm_kernel_L2_M1_42 zgemm_kernel_L2_M1_100: SAVE1x2 zgemm_kernel_L2_END: mov r3, BC mov r4, K1 lsl r4, r4, #5 // k * 2 * 8 * 2 add r3, r3, r4 // B = B + K * 4 * 8 mov BC, r3 subs J , #1 // j-- bgt zgemm_kernel_L2_BEGIN /*********************************************************************************************/ 
zgemm_kernel_L1_BEGIN: ldr J , N tst J , #1 ble zgemm_kernel_L999 ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 str r3 , C // store C ldr AO, A // AO = A zgemm_kernel_L1_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 ble zgemm_kernel_L1_M1_BEGIN zgemm_kernel_L1_M2_20: mov BO, BC asrs L , K1, #3 // L = L / 8 cmp L , #3 blt zgemm_kernel_L1_M2_30 .align 5 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 sub L, L, #2 zgemm_kernel_L1_M2_22: KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 subs L, L, #1 bgt zgemm_kernel_L1_M2_22 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b zgemm_kernel_L1_M2_44 zgemm_kernel_L1_M2_30: tst L, #3 ble zgemm_kernel_L1_M2_40 tst L, #2 ble zgemm_kernel_L1_M2_32 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b zgemm_kernel_L1_M2_44 zgemm_kernel_L1_M2_32: tst L, #1 ble zgemm_kernel_L1_M2_40 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b zgemm_kernel_L1_M2_44 zgemm_kernel_L1_M2_40: INIT2x1 zgemm_kernel_L1_M2_44: ands L , K1, #7 // L = L % 8 ble zgemm_kernel_L1_M2_100 zgemm_kernel_L1_M2_46: KERNEL2x1_SUB subs L, L, #1 bne zgemm_kernel_L1_M2_46 zgemm_kernel_L1_M2_100: SAVE2x1 zgemm_kernel_L1_M2_END: subs I, I, #1 bne zgemm_kernel_L1_M2_20 zgemm_kernel_L1_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 ble zgemm_kernel_L1_END zgemm_kernel_L1_M1_20: INIT1x1 mov BO, BC asrs L , K1, #3 // L = L / 8 ble zgemm_kernel_L1_M1_40 zgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs L, L, #1 bgt zgemm_kernel_L1_M1_22 zgemm_kernel_L1_M1_40: ands L , K1, #7 // L = L % 8 ble zgemm_kernel_L1_M1_100 zgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs L, L, #1 bgt zgemm_kernel_L1_M1_42 zgemm_kernel_L1_M1_100: SAVE1x1 zgemm_kernel_L1_END: zgemm_kernel_L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers movs r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/zgemm_ncopy_2_vfp.S000066400000000000000000000123621313527062700207560ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/11/05 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define OLD_M r0 #define OLD_N r1 #define OLD_A r2 #define OLD_LDA r3 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define LDA [fp, #-260 ] #define B [fp, #4 ] #define M r0 #define N r1 #define A r2 #define BO r5 #define AO1 r6 #define AO2 r7 #define I r3 #define J r12 #define A_PRE 256 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro COPY2x2 pld [ AO1, #A_PRE ] pld [ AO2, #A_PRE ] fldd d0 , [ AO1, #0 ] fldd d1 , [ AO1, #8 ] fldd d4 , [ AO1, #16 ] fldd d5 , [ AO1, #24 ] fldd d2 , [ AO2, #0 ] fldd d3 , [ AO2, #8 ] add AO1, AO1, #32 fldd d6 , [ AO2, #16 ] fldd d7 , [ AO2, #24 ] fstmiad BO!, { d0 - d7 } add AO2, AO2, #32 .endm .macro COPY1x2 fldd d0 , [ AO1, #0 ] fldd d1 , [ AO1, #8 ] fldd d2 , [ AO2, #0 ] fldd d3 , [ AO2, #8 ] add AO1, AO1, #16 fstmiad BO!, { d0 - d3 } add AO2, AO2, #16 .endm .macro COPY2x1 fldd d0 , [ AO1, #0 ] fldd d1 , [ AO1, #8 ] fldd d2 , [ AO1, #16 ] fldd d3 , [ AO1, #24 ] fstmiad BO!, { d0 - d3 } add AO1, AO1, #32 .endm .macro COPY1x1 fldd d0 , [ AO1, #0 ] fldd d1 , [ AO1, #8 ] fstmiad BO!, { d0 - d1 } add AO1, AO1, #16 .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack lsl r3, r3, #4 // lda = lda * 8 * 2 str r3, LDA sub r4, fp, #128 vstm r4, { d8 - d15} // store floating point registers ldr BO, B /*********************************************************************************************/ zgemm_ncopy_L2_BEGIN: asrs J, N, #1 // J = N / 2 ble zgemm_ncopy_L1_BEGIN zgemm_ncopy_L2_M2_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA add AO2, AO1, r4 add A , AO2, r4 // A = A + 2 * LDA asrs I, M, #1 // I = M / 2 ble zgemm_ncopy_L2_M2_40 zgemm_ncopy_L2_M2_20: COPY2x2 subs I , I , #1 bne zgemm_ncopy_L2_M2_20 zgemm_ncopy_L2_M2_40: ands I, M , #1 ble zgemm_ncopy_L2_M2_END zgemm_ncopy_L2_M2_60: COPY1x2 subs I , I , #1 bne zgemm_ncopy_L2_M2_60 zgemm_ncopy_L2_M2_END: subs J , J, #1 // j-- bne zgemm_ncopy_L2_M2_BEGIN /*********************************************************************************************/ zgemm_ncopy_L1_BEGIN: tst N, #1 ble zgemm_ncopy_L999 zgemm_ncopy_L1_M2_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA add A , AO1, r4 // A = A + 1 * LDA 
asrs I, M, #1 // I = M / 2 ble zgemm_ncopy_L1_M2_40 zgemm_ncopy_L1_M2_20: COPY2x1 subs I , I , #1 bne zgemm_ncopy_L1_M2_20 zgemm_ncopy_L1_M2_40: ands I, M , #1 ble zgemm_ncopy_L1_M2_END zgemm_ncopy_L1_M2_60: COPY1x1 subs I , I , #1 bne zgemm_ncopy_L1_M2_60 zgemm_ncopy_L1_M2_END: zgemm_ncopy_L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers movs r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm/zgemm_tcopy_2_vfp.S000066400000000000000000000123221313527062700207600ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2013/11/07 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define OLD_M r0 #define OLD_N r1 #define OLD_A r2 #define OLD_LDA r3 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define B [fp, #4 ] #define A [fp, #-248 ] #define M r0 #define N r1 #define M4 r2 #define LDA r5 #define AO1 r6 #define BO1 r7 #define BO2 r8 #define I r4 #define J r12 #define A_PRE 256 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro COPY2x2 pld [ AO1, #A_PRE ] fldmiad AO1, { d0 - d3 } add r3, AO1, LDA pld [ r3, #A_PRE ] fldmiad r3, { d4 - d7 } fstmiad BO1, { d0 - d7 } add AO1, AO1, #32 add BO1, BO1, M4 .endm .macro COPY1x2 fldmiad AO1, { d0 -d1 } add r3, AO1, LDA fldmiad r3, { d2 - d3 } fstmiad BO2, { d0 - d3 } add AO1, AO1, #16 add BO2, BO2, #32 .endm /*************************************************************************************************************************/ .macro COPY2x1 fldmiad AO1, { d0 - d3 } fstmiad BO1, { d0 - d3 } add AO1, AO1, #32 add BO1, BO1, M4 .endm .macro COPY1x1 fldmiad AO1, { d0 - d1 } fstmiad BO2, { d0 - d1 } add AO1, AO1, #16 add BO2, BO2, #16 .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack str OLD_A, A // store A lsl LDA, OLD_LDA, #4 // lda = lda * SIZE * 2 sub r4, fp, #128 vstm r4, { d8 - d15} // store floating point registers lsl r4 , M, #4 // M * SIZE * 2 ldr r3, B and BO2 , N , #-2 mul BO2, BO2, r4 add BO2 , BO2, r3 lsl M4, M, #5 // M4 = M * 2 * SIZE * 2 zgemm_tcopy_L2_BEGIN: asrs J, M, #1 // J = N / 2 ble zgemm_tcopy_L1_BEGIN zgemm_tcopy_L2_M2_BEGIN: ldr AO1, A // AO1 = A lsl r3, LDA, #1 // r3 = 2 * LDA add r3, r3 , AO1 // A = A + 2 * LDA str r3, A // store A ldr BO1, B add r3, BO1, #64 // B = B + 4 * SIZE *2 str r3, B asrs I, N, #1 // I = M / 2 ble zgemm_tcopy_L2_M2_60 zgemm_tcopy_L2_M2_40: COPY2x2 subs I, I, #1 bne zgemm_tcopy_L2_M2_40 zgemm_tcopy_L2_M2_60: tst N , #1 ble zgemm_tcopy_L2_M2_END COPY1x2 zgemm_tcopy_L2_M2_END: subs J , J, #1 // j-- bne zgemm_tcopy_L2_M2_BEGIN /*********************************************************************************************/ zgemm_tcopy_L1_BEGIN: tst M, #1 ble zgemm_tcopy_L999 zgemm_tcopy_L1_M2_BEGIN: ldr AO1, A // AO1 = A add r3, LDA , AO1 // A = A + 1 * LDA str r3, A // store A ldr BO1, B add r3, BO1, #32 // B = B + 2 * SIZE *2 str r3, B asrs I, N, #1 // I = M / 2 ble zgemm_tcopy_L1_M2_60 zgemm_tcopy_L1_M2_40: COPY2x1 subs I, I, #1 bne zgemm_tcopy_L1_M2_40 zgemm_tcopy_L1_M2_60: tst N , #1 ble zgemm_tcopy_L1_M2_END COPY1x1 zgemm_tcopy_L1_M2_END: zgemm_tcopy_L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers mov r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE 
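Illustrative sketch (not part of the OpenBLAS sources): the 2x2 VFP/VFPv3 ZGEMM kernels above keep a 2x2 block of C in d-registers and pick the sign of each real/imaginary multiply-accumulate through the KMAC_*/FMAC_* (or FADD_*/FMAC_*) macro aliases according to the conjugation case (NN/NT/TN/TT, CN/CT, NC/TC, CC). A plain-C reference of the same K-loop and alpha update for the non-conjugated case is shown below; the function name, the packed-panel layout (interleaved real/imag pairs, two rows of A and two columns of B per k step) and the ldc convention are assumptions made for the sketch, not the layout guaranteed by the packing routines.

/* Hypothetical reference sketch, not an OpenBLAS kernel.
 * Computes C(2x2) += alpha * A(2xK) * B(Kx2) for the plain (NN) complex case.
 * A and B are assumed packed as consecutive (real, imag) pairs,
 * 4 doubles per k step in each panel; ldc is in complex elements. */
static void zgemm_2x2_ref(long K,
                          const double *A, const double *B,
                          double alpha_r, double alpha_i,
                          double *C, long ldc)
{
    double acc[2][2][2] = {{{0.0}}};         /* acc[i][j][0] = real, [1] = imag */

    for (long k = 0; k < K; k++) {
        for (int i = 0; i < 2; i++) {
            double ar = A[4*k + 2*i], ai = A[4*k + 2*i + 1];
            for (int j = 0; j < 2; j++) {
                double br = B[4*k + 2*j], bi = B[4*k + 2*j + 1];
                acc[i][j][0] += ar * br - ai * bi;   /* real part, NN sign choice */
                acc[i][j][1] += ar * bi + ai * br;   /* imaginary part            */
            }
        }
    }

    /* apply complex alpha and accumulate into C, column major */
    for (int j = 0; j < 2; j++) {
        for (int i = 0; i < 2; i++) {
            double *c = &C[2 * (i + j * ldc)];
            c[0] += alpha_r * acc[i][j][0] - alpha_i * acc[i][j][1];
            c[1] += alpha_r * acc[i][j][1] + alpha_i * acc[i][j][0];
        }
    }
}

The assembly versions perform the same per-k arithmetic; they differ mainly in software pipelining (the _I/_M1/_M2/_E macro variants), prefetch placement (A_PRE/B_PRE/C_PRE) and in loading C once per SAVE macro rather than per iteration.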
OpenBLAS-0.2.20/kernel/arm/zgemv_n.c000066400000000000000000000103161313527062700170150ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * * 2013/11/23 Saar * * BLASTEST float : OK * * BLASTEST double : OK * CTEST : OK * TEST : OK * * * **************************************************************************************/ #include "common.h" int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { BLASLONG i; BLASLONG ix,iy; BLASLONG j; FLOAT *a_ptr; FLOAT temp_r,temp_i; BLASLONG inc_x2,inc_y2; BLASLONG lda2; BLASLONG i2; lda2 = 2*lda; ix = 0; a_ptr = a; if ( inc_x == 1 && inc_y == 1 ) { for (j=0; j #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; FLOAT scale = 0.0; FLOAT ssq = 1.0; BLASLONG inc_x2; FLOAT temp; if (n <= 0 || inc_x <= 0) return(0.0); inc_x2 = 2 * inc_x; n *= inc_x2; while(i < n) { if ( x[i] != 0.0 ) { temp = ABS( x[i] ); if ( scale < temp ) { ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); scale = temp ; } else { ssq += ( temp / scale ) * ( temp / scale ); } } if ( x[i+1] != 0.0 ) { temp = ABS( x[i+1] ); if ( scale < temp ) { ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); scale = temp ; } else { ssq += ( temp / scale ) * ( temp / scale ); } } i += inc_x2; } scale = scale * sqrt( ssq ); return(scale); } OpenBLAS-0.2.20/kernel/arm/zomatcopy_cn.c000066400000000000000000000044671313527062700200670ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" /***************************************************** * 2014/06/09 Saar * * Order ColMajor * No Trans * ******************************************************/ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) { BLASLONG i,j,ia; FLOAT *aptr,*bptr; if ( rows <= 0 ) return(0); if ( cols <= 0 ) return(0); aptr = a; bptr = b; lda *= 2; ldb *= 2; for ( i=0; i int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; BLASLONG ix=0,iy=0; FLOAT temp[2]; BLASLONG inc_x2; BLASLONG inc_y2; if ( n < 0 ) return(0); inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; while(i < n) { temp[0] = x[ix] ; temp[1] = x[ix+1] ; x[ix] = y[iy] ; x[ix+1] = y[iy+1] ; y[iy] = temp[0] ; y[iy+1] = temp[1] ; ix += inc_x2 ; iy += inc_y2 ; i++ ; } return(0); } OpenBLAS-0.2.20/kernel/arm/ztrmm_kernel_2x2_vfp.S000066400000000000000000000620411313527062700214110ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/11/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define OLD_M r0 #define OLD_N r1 #define OLD_K r2 #define OLD_A r3 #define OLD_ALPHA_R d0 #define OLD_ALPHA_I d1 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define KKK [fp, #-240] #define KK [fp, #-244 ] #define A [fp, #-248 ] #define LDC [fp, #-252 ] #define M [fp, #-256 ] #define N [fp, #-260 ] #define K [fp, #-264 ] #define FP_ZERO [fp, #-232] #define FP_ZERO_0 [fp, #-232] #define FP_ZERO_1 [fp, #-228] #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] #if !defined(__ARM_PCS_VFP) #define OLD_ALPHAR_SOFTFP [fp, #4] #define OLD_ALPHAI_SOFTFP [fp, #12] #define OLD_A_SOFTFP [fp, #20 ] #define B [fp, #24 ] #define C [fp, #28 ] #define OLD_LDC [fp, #32 ] #define OFFSET [fp, #36 ] #else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] #endif #define I r0 #define J r1 #define L r2 #define AO r5 #define BO r6 #define CO1 r8 #define CO2 r9 #define K1 r7 #define BC r12 #define A_PRE 96 #define B_PRE 96 #define C_PRE 64 /************************************************************************************** * Macro definitions **************************************************************************************/ #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CN) || defined(CT) #define KMAC_R fmacd #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) #define KMAC_R fmacd #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif /************************************************************************************** * Macro definitions **************************************************************************************/ .macro INIT2x2 fldd d8 , FP_ZERO vmov.f64 d9 , d8 vmov.f64 d10, d8 vmov.f64 d11, d8 vmov.f64 d12, d8 vmov.f64 d13, d8 vmov.f64 d14, d8 vmov.f64 d15, d8 .endm .macro KERNEL2x2_I pld [ AO, #A_PRE ] pld [ BO, 
#B_PRE ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fldd d6 , [ BO, #16 ] fldd d7 , [ BO, #24 ] fmuld d8 , d0, d4 KMAC_R d8 , d1, d5 fmuld d9 , d0, d5 KMAC_I d9 , d1, d4 fmuld d10 , d2, d4 KMAC_R d10 , d3, d5 fmuld d11 , d2, d5 KMAC_I d11 , d3, d4 fmuld d12 , d0, d6 KMAC_R d12 , d1, d7 fmuld d13 , d0, d7 KMAC_I d13 , d1, d6 fmuld d14 , d2, d6 KMAC_R d14 , d3, d7 fmuld d15 , d2, d7 KMAC_I d15 , d3, d6 add BO , BO, #32 add AO , AO, #32 .endm .macro KERNEL2x2_M1 fldd d0 , [ AO ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 fldd d1 , [ AO, #8 ] fmacd d9 , d0, d5 fldd d2 , [ AO, #16 ] KMAC_R d8 , d1, d5 fldd d3 , [ AO, #24 ] KMAC_I d9 , d1, d4 fldd d6 , [ BO, #16 ] fmacd d10 , d2, d4 fldd d7 , [ BO, #24 ] fmacd d11 , d2, d5 KMAC_R d10 , d3, d5 pld [ AO, #A_PRE ] KMAC_I d11 , d3, d4 pld [ BO, #B_PRE ] fmacd d12 , d0, d6 fmacd d13 , d0, d7 KMAC_R d12 , d1, d7 KMAC_I d13 , d1, d6 fmacd d14 , d2, d6 fmacd d15 , d2, d7 add BO , BO, #32 KMAC_R d14 , d3, d7 add AO , AO, #32 KMAC_I d15 , d3, d6 .endm .macro KERNEL2x2_M2 fldd d0 , [ AO ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 fldd d1 , [ AO, #8 ] fmacd d9 , d0, d5 fldd d2 , [ AO, #16 ] KMAC_R d8 , d1, d5 fldd d3 , [ AO, #24 ] KMAC_I d9 , d1, d4 fldd d6 , [ BO, #16 ] fmacd d10 , d2, d4 fldd d7 , [ BO, #24 ] fmacd d11 , d2, d5 KMAC_R d10 , d3, d5 pld [ AO, #A_PRE ] KMAC_I d11 , d3, d4 pld [ BO, #B_PRE ] fmacd d12 , d0, d6 fmacd d13 , d0, d7 KMAC_R d12 , d1, d7 KMAC_I d13 , d1, d6 fmacd d14 , d2, d6 fmacd d15 , d2, d7 add BO , BO, #32 KMAC_R d14 , d3, d7 add AO , AO, #32 KMAC_I d15 , d3, d6 .endm .macro KERNEL2x2_E fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fldd d6 , [ BO, #16 ] fldd d7 , [ BO, #24 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 fmacd d10 , d2, d4 KMAC_R d10 , d3, d5 fmacd d11 , d2, d5 KMAC_I d11 , d3, d4 fmacd d12 , d0, d6 KMAC_R d12 , d1, d7 fmacd d13 , d0, d7 KMAC_I d13 , d1, d6 fmacd d14 , d2, d6 KMAC_R d14 , d3, d7 fmacd d15 , d2, d7 KMAC_I d15 , d3, d6 add BO , BO, #32 add AO , AO, #32 .endm .macro KERNEL2x2_SUB fldd d0 , [ AO ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 fldd d1 , [ AO, #8 ] fmacd d9 , d0, d5 fldd d2 , [ AO, #16 ] KMAC_R d8 , d1, d5 fldd d3 , [ AO, #24 ] KMAC_I d9 , d1, d4 fldd d6 , [ BO, #16 ] fmacd d10 , d2, d4 fldd d7 , [ BO, #24 ] fmacd d11 , d2, d5 KMAC_R d10 , d3, d5 pld [ AO, #A_PRE ] KMAC_I d11 , d3, d4 pld [ BO, #B_PRE ] fmacd d12 , d0, d6 fmacd d13 , d0, d7 KMAC_R d12 , d1, d7 KMAC_I d13 , d1, d6 fmacd d14 , d2, d6 fmacd d15 , d2, d7 add BO , BO, #32 KMAC_R d14 , d3, d7 add AO , AO, #32 KMAC_I d15 , d3, d6 .endm .macro SAVE2x2 ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA_R fldd d1, ALPHA_I fldd d4 , FP_ZERO vmov.f64 d5 , d4 vmov.f64 d6 , d4 vmov.f64 d7 , d4 FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 FMAC_R1 d6 , d0 , d10 FMAC_I1 d7 , d0 , d11 FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 fstmiad CO1, { d4 - d7 } fldd d4 , FP_ZERO vmov.f64 d5 , d4 vmov.f64 d6 , d4 vmov.f64 d7 , d4 FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 FMAC_R1 d6 , d0 , d14 FMAC_I1 d7 , d0 , d15 FMAC_R2 d6 , d1 , d15 FMAC_I2 d7 , d1 , d14 fstmiad CO2, { d4 - d7 } add CO1, CO1, #32 .endm /******************************************************************************/ .macro INIT1x2 fldd d8 , FP_ZERO vmov.f64 d9 , d8 vmov.f64 d12, d8 
vmov.f64 d13, d8 .endm .macro KERNEL1x2_I fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fldd d6 , [ BO, #16 ] fldd d7 , [ BO, #24 ] fmuld d8 , d0, d4 KMAC_R d8 , d1, d5 fmuld d9 , d0, d5 KMAC_I d9 , d1, d4 fmuld d12 , d0, d6 KMAC_R d12 , d1, d7 fmuld d13 , d0, d7 KMAC_I d13 , d1, d6 add BO , BO, #32 add AO , AO, #16 .endm .macro KERNEL1x2_M1 fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fldd d6 , [ BO, #16 ] fldd d7 , [ BO, #24 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 fmacd d12 , d0, d6 KMAC_R d12 , d1, d7 fmacd d13 , d0, d7 KMAC_I d13 , d1, d6 add BO , BO, #32 add AO , AO, #16 .endm .macro KERNEL1x2_M2 fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fldd d6 , [ BO, #16 ] fldd d7 , [ BO, #24 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 fmacd d12 , d0, d6 KMAC_R d12 , d1, d7 fmacd d13 , d0, d7 KMAC_I d13 , d1, d6 add BO , BO, #32 add AO , AO, #16 .endm .macro KERNEL1x2_E fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fldd d6 , [ BO, #16 ] fldd d7 , [ BO, #24 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 fmacd d12 , d0, d6 KMAC_R d12 , d1, d7 fmacd d13 , d0, d7 KMAC_I d13 , d1, d6 add BO , BO, #32 add AO , AO, #16 .endm .macro KERNEL1x2_SUB fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fldd d6 , [ BO, #16 ] fldd d7 , [ BO, #24 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 fmacd d12 , d0, d6 KMAC_R d12 , d1, d7 fmacd d13 , d0, d7 KMAC_I d13 , d1, d6 add BO , BO, #32 add AO , AO, #16 .endm .macro SAVE1x2 ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA_R fldd d1, ALPHA_I fldd d4 , FP_ZERO vmov.f64 d5 , d4 FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 fstmiad CO1, { d4 - d5 } fldd d4 , FP_ZERO vmov.f64 d5 , d4 FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 FMAC_R2 d4 , d1 , d13 FMAC_I2 d5 , d1 , d12 fstmiad CO2, { d4 - d5 } add CO1, CO1, #16 .endm /******************************************************************************/ .macro INIT2x1 fldd d8 , FP_ZERO vmov.f64 d9 , d8 vmov.f64 d10, d8 vmov.f64 d11, d8 .endm .macro KERNEL2x1_I fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmuld d8 , d0, d4 KMAC_R d8 , d1, d5 fmuld d9 , d0, d5 KMAC_I d9 , d1, d4 fmuld d10 , d2, d4 KMAC_R d10 , d3, d5 fmuld d11 , d2, d5 KMAC_I d11 , d3, d4 add BO , BO, #16 add AO , AO, #32 .endm .macro KERNEL2x1_M1 fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 fmacd d10 , d2, d4 KMAC_R d10 , d3, d5 fmacd d11 , d2, d5 KMAC_I d11 , d3, d4 add BO , BO, #16 add AO , AO, #32 .endm .macro KERNEL2x1_M2 fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 fmacd d10 , d2, d4 KMAC_R d10 , d3, d5 fmacd d11 , d2, d5 KMAC_I d11 , d3, d4 add BO , BO, #16 add AO , AO, #32 .endm .macro KERNEL2x1_E fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 fmacd d10 , d2, d4 KMAC_R d10 , d3, d5 fmacd d11 , d2, d5 KMAC_I d11 , d3, d4 add BO , BO, #16 
add AO , AO, #32 .endm .macro KERNEL2x1_SUB fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 fmacd d10 , d2, d4 KMAC_R d10 , d3, d5 fmacd d11 , d2, d5 KMAC_I d11 , d3, d4 add BO , BO, #16 add AO , AO, #32 .endm .macro SAVE2x1 fldd d0, ALPHA_R fldd d1, ALPHA_I fldd d4 , FP_ZERO vmov.f64 d5 , d4 vmov.f64 d6 , d4 vmov.f64 d7 , d4 FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 FMAC_R1 d6 , d0 , d10 FMAC_I1 d7 , d0 , d11 FMAC_R2 d6 , d1 , d11 FMAC_I2 d7 , d1 , d10 fstmiad CO1, { d4 - d7 } add CO1, CO1, #32 .endm /******************************************************************************/ .macro INIT1x1 fldd d8 , FP_ZERO vmov.f64 d9 , d8 .endm .macro KERNEL1x1_I fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmuld d8 , d0, d4 KMAC_R d8 , d1, d5 fmuld d9 , d0, d5 KMAC_I d9 , d1, d4 add BO , BO, #16 add AO , AO, #16 .endm .macro KERNEL1x1_M1 fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 add BO , BO, #16 add AO , AO, #16 .endm .macro KERNEL1x1_M2 fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 add BO , BO, #16 add AO , AO, #16 .endm .macro KERNEL1x1_E fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 add BO , BO, #16 add AO , AO, #16 .endm .macro KERNEL1x1_SUB fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d4 , [ BO ] fldd d5 , [ BO, #8 ] fmacd d8 , d0, d4 KMAC_R d8 , d1, d5 fmacd d9 , d0, d5 KMAC_I d9 , d1, d4 add BO , BO, #16 add AO , AO, #16 .endm .macro SAVE1x1 fldd d0, ALPHA_R fldd d1, ALPHA_I fldd d4 , FP_ZERO vmov.f64 d5 , d4 FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 FMAC_R2 d4 , d1 , d9 FMAC_I2 d5 , d1 , d8 fstmiad CO1, { d4 - d5 } add CO1, CO1, #16 .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack #if !defined(__ARM_PCS_VFP) vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP ldr OLD_A, OLD_A_SOFTFP #endif str OLD_M, M str OLD_N, N str OLD_K, K str OLD_A, A vstr OLD_ALPHA_R, ALPHA_R vstr OLD_ALPHA_I, ALPHA_I sub r3, fp, #128 vstm r3, { d8 - d15} // store floating point registers movs r4, #0 str r4, FP_ZERO str r4, FP_ZERO_1 ldr r3, OLD_LDC lsl r3, r3, #4 // ldc = ldc * 8 * 2 str r3, LDC ldr r3, OFFSET #ifndef LEFT neg r3 , r3 #endif str r3 , KK ldr BC, B ldr J, N asrs J, J, #1 // J = J / 2 ble _L1_BEGIN _L2_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #1 // LDC * 2 add r3 , r4, CO1 str r3 , C // store C #if defined(LEFT) ldr r3 , OFFSET str r3 , KK #endif ldr AO, A // AO = A pld [AO , #A_PRE-64] pld [AO , #A_PRE-32] _L2_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 ble _L2_M1_BEGIN _L2_M2_20: #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #5 // 2 * 8 * 2 double values add BO , BO , r4 lsls r4 , r3 , #5 // 2 * 8 * 2 double values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr K1, K #elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr K1, K ldr r3, KK sub K1, K1, r3 str K1, KKK #else ldr K1, KK #ifdef LEFT add K1, K1, #2 // number of values in AO #else add K1, K1, #2 // number of values in BO #endif str K1, KKK #endif asrs L , K1, #3 // L = L / 8 cmp L , #3 blt _L2_M2_30 .align 5 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 sub L, L, #2 _L2_M2_22: KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 subs L, L, #1 bgt _L2_M2_22 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b _L2_M2_44 _L2_M2_30: tst L, #3 ble _L2_M2_40 tst L, #2 ble _L2_M2_32 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b _L2_M2_44 _L2_M2_32: tst L, #1 ble _L2_M2_40 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b _L2_M2_44 _L2_M2_40: INIT2x2 _L2_M2_44: ands L , K1, #7 // L = L % 8 ble _L2_M2_100 _L2_M2_46: KERNEL2x2_SUB subs L, L, #1 bne _L2_M2_46 _L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #5 // 2 * 8 * 2 double values add BO , BO , r4 lsls r4 , r3 , #5 // 2 * 8 * 2 double values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #2 // number of values in AO str r3 , KK #endif _L2_M2_END: subs I, I, #1 bne _L2_M2_20 _L2_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 ble _L2_END _L2_M1_20: INIT1x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #5 // 2 * 8 * 2 double values add BO , BO , r4 lsls r4 , r3 , #4 // 1 * 8 * 2 double values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr K1, K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr K1, K ldr r3, KK sub K1, K1, r3 str K1, KKK #else ldr K1, KK #ifdef LEFT add K1, K1, #1 // number of values in AO #else add K1, K1, #2 // number of values in BO #endif str K1, KKK #endif asrs L , K1, #3 // L = L / 8 ble _L2_M1_40 _L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs L, L, #1 bgt _L2_M1_22 _L2_M1_40: ands L , K1, #7 // L = L % 8 ble _L2_M1_100 _L2_M1_42: KERNEL1x2_SUB subs L, L, #1 bgt _L2_M1_42 _L2_M1_100: SAVE1x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #5 // 2 * 8 * 2 double values add BO , BO , r4 lsls r4 , r3 , #4 // 1 * 8 * 2 double values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #1 // number of values in AO str r3 , KK #endif _L2_END: mov r3, BC ldr r4, K lsl r4, r4, #5 // k * 2 * 8 * 2 add r3, r3, r4 // B = B + K * 4 * 8 mov BC, r3 #if !defined(LEFT) ldr r3 , KK add r3 , r3 , #2 // number of values in BO str r3 , KK #endif subs J , #1 // j-- bgt _L2_BEGIN /*********************************************************************************************/ _L1_BEGIN: ldr J , N tst J , #1 ble _L999 ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 str r3 , C // store C #if defined(LEFT) ldr r3 , OFFSET str r3 , KK #endif ldr AO, A // AO = A _L1_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 ble _L1_M1_BEGIN _L1_M2_20: #if 
(defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #4 // 1 * 8 * 2 double values add BO , BO , r4 lsls r4 , r3 , #5 // 2 * 8 * 2 double values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr K1, K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr K1, K ldr r3, KK sub K1, K1, r3 str K1, KKK #else ldr K1, KK #ifdef LEFT add K1, K1, #2 // number of values in AO #else add K1, K1, #1 // number of values in BO #endif str K1, KKK #endif asrs L , K1, #3 // L = L / 8 cmp L , #3 blt _L1_M2_30 .align 5 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 sub L, L, #2 _L1_M2_22: KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 subs L, L, #1 bgt _L1_M2_22 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b _L1_M2_44 _L1_M2_30: tst L, #3 ble _L1_M2_40 tst L, #2 ble _L1_M2_32 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b _L1_M2_44 _L1_M2_32: tst L, #1 ble _L1_M2_40 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b _L1_M2_44 _L1_M2_40: INIT2x1 _L1_M2_44: ands L , K1, #7 // L = L % 8 ble _L1_M2_100 _L1_M2_46: KERNEL2x1_SUB subs L, L, #1 bne _L1_M2_46 _L1_M2_100: SAVE2x1 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #4 // 1 * 8 * 2 double values add BO , BO , r4 lsls r4 , r3 , #5 // 2 * 8 * 2 double values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #2 // number of values in AO str r3 , KK #endif _L1_M2_END: subs I, I, #1 bne _L1_M2_20 _L1_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 ble _L1_END _L1_M1_20: INIT1x1 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #4 // 1 * 8 * 2 double values add BO , BO , r4 lsls r4 , r3 , #4 // 1 * 8 * 2 double values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr K1, K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr K1, K ldr r3, KK sub K1, K1, r3 str K1, KKK #else ldr K1, KK #ifdef LEFT add K1, K1, #1 // number of values in AO #else add K1, K1, #1 // number of values in BO #endif str K1, KKK #endif asrs L , K1, #3 // L = L / 8 ble _L1_M1_40 _L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs L, L, #1 bgt _L1_M1_22 _L1_M1_40: ands L , K1, #7 // L = L % 8 ble _L1_M1_100 _L1_M1_42: KERNEL1x1_SUB subs L, L, #1 bgt _L1_M1_42 _L1_M1_100: SAVE1x1 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #4 // 1 * 8 * 2 double values add BO , BO , r4 lsls r4 , r3 , #4 // 1 * 8 * 2 double values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #1 // number of values in AO str r3 , KK #endif _L1_END: _L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers movs r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE 
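(Editorial sketch, not part of the OpenBLAS sources.) In the ztrmm kernels above and below, the SAVE* macros scale the accumulated block by the complex alpha and overwrite C; TRMM stores rather than accumulates. For the non-conjugated (NN/NT/TN/TT) macro selection this is a plain complex multiply, and the other FMAC_* selections flip the signs of individual terms to account for a conjugated A and/or B. A minimal scalar sketch with illustrative names (store_alpha_times_acc, acc_r, acc_i):

static void store_alpha_times_acc(double alpha_r, double alpha_i,
                                  double acc_r, double acc_i, double *c)
{
    /* c = alpha * acc (complex multiply), as SAVE1x1 computes it in the   */
    /* NN case; the conjugated variants change the signs of single terms.  */
    c[0] = alpha_r * acc_r - alpha_i * acc_i;   /* real part      */
    c[1] = alpha_r * acc_i + alpha_i * acc_r;   /* imaginary part */
}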
OpenBLAS-0.2.20/kernel/arm/ztrmm_kernel_2x2_vfpv3.S000066400000000000000000000641441313527062700216700ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/10/16 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #define ASSEMBLER #include "common.h" #define STACKSIZE 256 #define OLD_M r0 #define OLD_N r1 #define OLD_K r2 #define OLD_A r3 #define OLD_ALPHA_R d0 #define OLD_ALPHA_I d1 /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * registers *******************************************************/ #define KKK [fp, #-240] #define KK [fp, #-244 ] #define A [fp, #-248 ] #define LDC [fp, #-252 ] #define M [fp, #-256 ] #define N [fp, #-260 ] #define K [fp, #-264 ] #define FP_ZERO [fp, #-236] #define FP_ZERO_0 [fp, #-236] #define FP_ZERO_1 [fp, #-232] #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] #if !defined(__ARM_PCS_VFP) #define OLD_ALPHAR_SOFTFP [fp, #4] #define OLD_ALPHAI_SOFTFP [fp, #12] #define OLD_A_SOFTFP [fp, #20 ] #define B [fp, #24 ] #define C [fp, #28 ] #define OLD_LDC [fp, #32 ] #define OFFSET [fp, #36 ] #else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] #endif #define I r0 #define J r1 #define L r2 #define AO r5 #define BO r6 #define CO1 r8 #define CO2 r9 #define K1 r7 #define BC r12 #define A_PRE 96 #define B_PRE 96 #define C_PRE 64 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define FADD_R fsubd #define FADD_I faddd #define FMAC_R1 vnmul.f64 #define FMAC_R2 vmls.f64 #define FMAC_I1 fmuld #define FMAC_I2 vmls.f64 #elif defined(CN) || defined(CT) #define FADD_R faddd #define FADD_I fsubd #define FMAC_R1 fmuld #define FMAC_R2 fmacd #define FMAC_I1 vnmul.f64 
#define FMAC_I2 fmacd #elif defined(NC) || defined(TC) #define FADD_R faddd #define FADD_I fsubd #define FMAC_R1 fmuld #define FMAC_R2 vmls.f64 #define FMAC_I1 fmuld #define FMAC_I2 fmacd #else #define FADD_R fsubd #define FADD_I faddd #define FMAC_R1 vnmul.f64 #define FMAC_R2 fmacd #define FMAC_I1 vnmul.f64 #define FMAC_I2 vmls.f64 #endif /************************************************************************************** * Macro definitions **************************************************************************************/ .macro INIT2x2 fldd d16 , FP_ZERO vmov.f64 d17, d16 vmov.f64 d18, d16 vmov.f64 d19, d16 vmov.f64 d20, d16 vmov.f64 d21, d16 vmov.f64 d22, d16 vmov.f64 d23, d16 vmov.f64 d24, d16 vmov.f64 d25, d16 vmov.f64 d26, d16 vmov.f64 d27, d16 vmov.f64 d28, d16 vmov.f64 d29, d16 vmov.f64 d30, d16 vmov.f64 d31, d16 .endm .macro KERNEL2x2_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fmuld d16 , d0, d8 fldd d2 , [ AO, #16 ] fmuld d24 , d1, d9 fldd d3 , [ AO, #24 ] fmuld d17 , d0, d9 fldd d10, [ BO, #16 ] fmuld d25 , d1, d8 fldd d11, [ BO, #24 ] fmuld d18 , d2, d8 add BO , BO, #32 fmuld d26 , d3, d9 add AO , AO, #32 fmuld d19 , d2, d9 pld [ BO , #B_PRE ] fmuld d27 , d3, d8 pld [ AO , #A_PRE ] fmuld d20 , d0, d10 fldd d4 , [ AO, #0 ] fmuld d28 , d1, d11 fldd d5 , [ AO, #8 ] fmuld d21 , d0, d11 fldd d12, [ BO ] fmuld d29 , d1, d10 fldd d13, [ BO, #8 ] fmuld d22 , d2, d10 fldd d6 , [ AO, #16 ] fmuld d30 , d3, d11 fldd d7 , [ AO, #24 ] fmuld d23 , d2, d11 fldd d14, [ BO, #16 ] fmuld d31 , d3, d10 fldd d15, [ BO, #24 ] add BO , BO, #32 add AO , AO, #32 .endm .macro KERNEL2x2_M1 pld [ AO , #A_PRE ] fmacd d16 , d0, d8 pld [ BO , #B_PRE ] fmacd d24 , d1, d9 fldd d4 , [ AO, #0 ] fmacd d17 , d0, d9 fldd d5 , [ AO, #8 ] fmacd d25 , d1, d8 fldd d12, [ BO ] fmacd d18 , d2, d8 fldd d13, [ BO, #8 ] fmacd d26 , d3, d9 fldd d6 , [ AO, #16 ] fmacd d19 , d2, d9 fldd d7 , [ AO, #24 ] fmacd d27 , d3, d8 fmacd d20 , d0, d10 fldd d14, [ BO, #16 ] fmacd d28 , d1, d11 fmacd d21 , d0, d11 fldd d15, [ BO, #24 ] fmacd d29 , d1, d10 fmacd d22 , d2, d10 add BO , BO, #32 fmacd d30 , d3, d11 fmacd d23 , d2, d11 add AO , AO, #32 fmacd d31 , d3, d10 .endm .macro KERNEL2x2_M2 pld [ AO , #A_PRE ] fmacd d16 , d4, d12 pld [ BO , #B_PRE ] fmacd d24 , d5, d13 fldd d0 , [ AO, #0 ] fmacd d17 , d4, d13 fldd d1 , [ AO, #8 ] fmacd d25 , d5, d12 fmacd d18 , d6, d12 fldd d8 , [ BO ] fmacd d26 , d7, d13 fldd d9 , [ BO, #8 ] fmacd d19 , d6, d13 fmacd d27 , d7, d12 fldd d2 , [ AO, #16 ] fmacd d20 , d4, d14 fldd d3 , [ AO, #24 ] fmacd d28 , d5, d15 fmacd d21 , d4, d15 fldd d10, [ BO, #16 ] fmacd d29 , d5, d14 fldd d11, [ BO, #24 ] fmacd d22 , d6, d14 fmacd d30 , d7, d15 add BO , BO, #32 fmacd d23 , d6, d15 add AO , AO, #32 fmacd d31 , d7, d14 .endm .macro KERNEL2x2_E fmacd d16 , d4, d12 fmacd d24 , d5, d13 fmacd d17 , d4, d13 fmacd d25 , d5, d12 fmacd d18 , d6, d12 fmacd d26 , d7, d13 fmacd d19 , d6, d13 fmacd d27 , d7, d12 fmacd d20 , d4, d14 fmacd d28 , d5, d15 fmacd d21 , d4, d15 fmacd d29 , d5, d14 fmacd d22 , d6, d14 fmacd d30 , d7, d15 fmacd d23 , d6, d15 fmacd d31 , d7, d14 .endm .macro KERNEL2x2_SUB pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fmacd d16 , d0, d8 fldd d2 , [ AO, #16 ] fmacd d24 , d1, d9 fldd d3 , [ AO, #24 ] fmacd d17 , d0, d9 fldd d10, [ BO, #16 ] fmacd d25 , d1, d8 fldd d11, [ BO, #24 ] fmacd d18 , d2, d8 fmacd d26 , d3, d9 fmacd d19 , d2, d9 fmacd d27 , d3, d8 
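	// at this point the first B entry for this k step (d8/d9) has been folded
	// into d16-d19 and d24-d27; the fmacd's below accumulate the second B entry
	// (d10/d11) into d20-d23 and d28-d31. SAVE2x2 later combines each pair of
	// partial products with the sign selected by the FADD_R/FADD_I macros for
	// the conjugation variant being built.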
fmacd d20 , d0, d10 fmacd d28 , d1, d11 fmacd d21 , d0, d11 fmacd d29 , d1, d10 fmacd d22 , d2, d10 add BO , BO, #32 fmacd d30 , d3, d11 fmacd d23 , d2, d11 add AO , AO, #32 fmacd d31 , d3, d10 .endm .macro SAVE2x2 ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA_R fldd d1, ALPHA_I FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 FADD_R d18, d26 , d18 FADD_I d19, d27 , d19 FADD_R d20, d28 , d20 FADD_I d21, d29 , d21 FADD_R d22, d30 , d22 FADD_I d23, d31 , d23 FMAC_R1 d4 , d0 , d16 FMAC_I1 d5 , d0 , d17 FMAC_R2 d4 , d1 , d17 FMAC_I2 d5 , d1 , d16 FMAC_R1 d6 , d0 , d18 FMAC_I1 d7 , d0 , d19 FMAC_R2 d6 , d1 , d19 FMAC_I2 d7 , d1 , d18 FMAC_R1 d8 , d0 , d20 FMAC_I1 d9 , d0 , d21 FMAC_R2 d8 , d1 , d21 FMAC_I2 d9 , d1 , d20 FMAC_R1 d10, d0 , d22 FMAC_I1 d11, d0 , d23 FMAC_R2 d10, d1 , d23 FMAC_I2 d11, d1 , d22 fstmiad CO1, { d4 - d7 } fstmiad CO2, { d8 - d11 } add CO1, CO1, #32 .endm /******************************************************************************/ .macro INIT1x2 fldd d16 , FP_ZERO vmov.f64 d17, d16 vmov.f64 d20, d16 vmov.f64 d21, d16 vmov.f64 d24, d16 vmov.f64 d25, d16 vmov.f64 d28, d16 vmov.f64 d29, d16 .endm .macro KERNEL1x2_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fldd d10, [ BO, #16 ] fldd d11, [ BO, #24 ] fmuld d16 , d0, d8 fmuld d24 , d1, d9 fmuld d17 , d0, d9 fmuld d25 , d1, d8 fmuld d20 , d0, d10 fmuld d28 , d1, d11 fmuld d21 , d0, d11 fmuld d29 , d1, d10 add BO , BO, #32 add AO , AO, #16 pld [ BO , #B_PRE ] fldd d4 , [ AO, #0 ] fldd d5 , [ AO, #8 ] fldd d12, [ BO ] fldd d13, [ BO, #8 ] fldd d14, [ BO, #16 ] fldd d15, [ BO, #24 ] add BO , BO, #32 add AO , AO, #16 .endm .macro KERNEL1x2_M1 pld [ BO , #B_PRE ] fmacd d16 , d0, d8 fmacd d24 , d1, d9 fmacd d17 , d0, d9 fmacd d25 , d1, d8 fmacd d20 , d0, d10 fmacd d28 , d1, d11 fmacd d21 , d0, d11 fmacd d29 , d1, d10 fldd d4 , [ AO, #0 ] fldd d5 , [ AO, #8 ] fldd d12, [ BO ] fldd d13, [ BO, #8 ] fldd d14, [ BO, #16 ] fldd d15, [ BO, #24 ] add BO , BO, #32 add AO , AO, #16 .endm .macro KERNEL1x2_M2 pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fmacd d16 , d4, d12 fmacd d24 , d5, d13 fmacd d17 , d4, d13 fmacd d25 , d5, d12 fmacd d20 , d4, d14 fmacd d28 , d5, d15 fmacd d21 , d4, d15 fmacd d29 , d5, d14 fldd d0 , [ AO, #0 ] fldd d1 , [ AO, #8 ] fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fldd d10, [ BO, #16 ] fldd d11, [ BO, #24 ] add BO , BO, #32 add AO , AO, #16 .endm .macro KERNEL1x2_E fmacd d16 , d4, d12 fmacd d24 , d5, d13 fmacd d17 , d4, d13 fmacd d25 , d5, d12 fmacd d20 , d4, d14 fmacd d28 , d5, d15 fmacd d21 , d4, d15 fmacd d29 , d5, d14 .endm .macro KERNEL1x2_SUB pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fldd d10, [ BO, #16 ] fldd d11, [ BO, #24 ] fmacd d16 , d0, d8 fmacd d24 , d1, d9 fmacd d17 , d0, d9 fmacd d25 , d1, d8 fmacd d20 , d0, d10 fmacd d28 , d1, d11 fmacd d21 , d0, d11 fmacd d29 , d1, d10 add BO , BO, #32 add AO , AO, #16 .endm .macro SAVE1x2 ldr r3 , LDC add CO2 , CO1, r3 fldd d0, ALPHA_R fldd d1, ALPHA_I FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 FADD_R d20, d28 , d20 FADD_I d21, d29 , d21 FMAC_R1 d4 , d0 , d16 FMAC_I1 d5 , d0 , d17 FMAC_R2 d4 , d1 , d17 FMAC_I2 d5 , d1 , d16 FMAC_R1 d8 , d0 , d20 FMAC_I1 d9 , d0 , d21 FMAC_R2 d8 , d1 , d21 FMAC_I2 d9 , d1 , d20 fstmiad CO1, { d4 - d5 } fstmiad CO2, { d8 - d9 } add CO1, CO1, #16 .endm /******************************************************************************/ .macro INIT2x1 fldd d16 , FP_ZERO vmov.f64 d17, d16 vmov.f64 d18, 
d16 vmov.f64 d19, d16 vmov.f64 d24, d16 vmov.f64 d25, d16 vmov.f64 d26, d16 vmov.f64 d27, d16 .endm .macro KERNEL2x1_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fmuld d16 , d0, d8 fmuld d24 , d1, d9 fmuld d17 , d0, d9 fmuld d25 , d1, d8 fmuld d18 , d2, d8 fmuld d26 , d3, d9 fmuld d19 , d2, d9 fmuld d27 , d3, d8 add BO , BO, #16 add AO , AO, #32 pld [ BO , #B_PRE ] pld [ AO , #A_PRE ] fldd d4 , [ AO, #0 ] fldd d5 , [ AO, #8 ] fldd d6 , [ AO, #16 ] fldd d7 , [ AO, #24 ] fldd d12, [ BO ] fldd d13, [ BO, #8 ] add BO , BO, #16 add AO , AO, #32 .endm .macro KERNEL2x1_M1 pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fmacd d16 , d0, d8 fmacd d24 , d1, d9 fmacd d17 , d0, d9 fmacd d25 , d1, d8 fmacd d18 , d2, d8 fmacd d26 , d3, d9 fmacd d19 , d2, d9 fmacd d27 , d3, d8 fldd d4 , [ AO, #0 ] fldd d5 , [ AO, #8 ] fldd d6 , [ AO, #16 ] fldd d7 , [ AO, #24 ] fldd d12, [ BO ] fldd d13, [ BO, #8 ] add BO , BO, #16 add AO , AO, #32 .endm .macro KERNEL2x1_M2 pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fmacd d16 , d4, d12 fmacd d24 , d5, d13 fmacd d17 , d4, d13 fmacd d25 , d5, d12 fmacd d18 , d6, d12 fmacd d26 , d7, d13 fmacd d19 , d6, d13 fmacd d27 , d7, d12 fldd d0 , [ AO, #0 ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] add BO , BO, #16 add AO , AO, #32 .endm .macro KERNEL2x1_E fmacd d16 , d4, d12 fmacd d24 , d5, d13 fmacd d17 , d4, d13 fmacd d25 , d5, d12 fmacd d18 , d6, d12 fmacd d26 , d7, d13 fmacd d19 , d6, d13 fmacd d27 , d7, d12 .endm .macro KERNEL2x1_SUB pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d2 , [ AO, #16 ] fldd d3 , [ AO, #24 ] fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fmacd d16 , d0, d8 fmacd d24 , d1, d9 fmacd d17 , d0, d9 fmacd d25 , d1, d8 fmacd d18 , d2, d8 fmacd d26 , d3, d9 fmacd d19 , d2, d9 fmacd d27 , d3, d8 add BO , BO, #16 add AO , AO, #32 .endm .macro SAVE2x1 fldd d0, ALPHA_R fldd d1, ALPHA_I FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 FADD_R d18, d26 , d18 FADD_I d19, d27 , d19 FMAC_R1 d4 , d0 , d16 FMAC_I1 d5 , d0 , d17 FMAC_R2 d4 , d1 , d17 FMAC_I2 d5 , d1 , d16 FMAC_R1 d6 , d0 , d18 FMAC_I1 d7 , d0 , d19 FMAC_R2 d6 , d1 , d19 FMAC_I2 d7 , d1 , d18 fstmiad CO1, { d4 - d7 } add CO1, CO1, #32 .endm /******************************************************************************/ .macro INIT1x1 fldd d16 , FP_ZERO vmov.f64 d17, d16 vmov.f64 d24, d16 vmov.f64 d25, d16 .endm .macro KERNEL1x1_I pld [ AO , #A_PRE ] pld [ BO , #B_PRE ] fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] fmuld d16 , d0, d8 fmuld d24 , d1, d9 fmuld d17 , d0, d9 fmuld d25 , d1, d8 add BO , BO, #16 add AO , AO, #16 pld [ BO , #B_PRE ] pld [ AO , #A_PRE ] fldd d4 , [ AO, #0 ] fldd d5 , [ AO, #8 ] fldd d12, [ BO ] fldd d13, [ BO, #8 ] add BO , BO, #16 add AO , AO, #16 .endm .macro KERNEL1x1_M1 fmacd d16 , d0, d8 fmacd d24 , d1, d9 fmacd d17 , d0, d9 fmacd d25 , d1, d8 fldd d4 , [ AO, #0 ] fldd d5 , [ AO, #8 ] fldd d12, [ BO ] fldd d13, [ BO, #8 ] add BO , BO, #16 add AO , AO, #16 .endm .macro KERNEL1x1_M2 fmacd d16 , d4, d12 fmacd d24 , d5, d13 fmacd d17 , d4, d13 fmacd d25 , d5, d12 fldd d0 , [ AO, #0 ] fldd d1 , [ AO, #8 ] fldd d8 , [ BO ] fldd d9 , [ BO, #8 ] add BO , BO, #16 add AO , AO, #16 .endm .macro KERNEL1x1_E fmacd d16 , d4, d12 fmacd d24 , d5, d13 fmacd d17 , d4, d13 fmacd d25 , d5, d12 .endm .macro KERNEL1x1_SUB fldd d0 , [ AO ] fldd d1 , [ AO, #8 ] fldd d8 , [ BO ] fldd d9 
, [ BO, #8 ] fmacd d16 , d0, d8 fmacd d24 , d1, d9 fmacd d17 , d0, d9 fmacd d25 , d1, d8 add BO , BO, #16 add AO , AO, #16 .endm .macro SAVE1x1 fldd d0, ALPHA_R fldd d1, ALPHA_I FADD_R d16, d24 , d16 FADD_I d17, d25 , d17 FMAC_R1 d4 , d0 , d16 FMAC_I1 d5 , d0 , d17 FMAC_R2 d4 , d1 , d17 FMAC_I2 d5 , d1 , d16 fstmiad CO1, { d4 - d5 } add CO1, CO1, #16 .endm /******************************************************************************/ /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 push {r4 - r9, fp} add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack #if !defined(__ARM_PCS_VFP) vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP ldr OLD_A, OLD_A_SOFTFP #endif str OLD_M, M str OLD_N, N str OLD_K, K str OLD_A, A vstr OLD_ALPHA_R, ALPHA_R vstr OLD_ALPHA_I, ALPHA_I sub r3, fp, #128 vstm r3, { d8 - d15} // store floating point registers movs r4, #0 str r4, FP_ZERO str r4, FP_ZERO_1 ldr r3, OLD_LDC lsl r3, r3, #4 // ldc = ldc * 8 * 2 str r3, LDC ldr r3, OFFSET #ifndef LEFT neg r3 , r3 #endif str r3 , KK ldr BC, B ldr J, N asrs J, J, #1 // J = J / 2 ble _L1_BEGIN _L2_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #1 // LDC * 2 add r3 , r4, CO1 str r3 , C // store C #if defined(LEFT) ldr r3 , OFFSET str r3 , KK #endif ldr AO, A // AO = A pld [AO , #A_PRE-64] pld [AO , #A_PRE-32] _L2_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 ble _L2_M1_BEGIN _L2_M2_20: #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #5 // 2 * 8 * 2 double values add BO , BO , r4 lsls r4 , r3 , #5 // 2 * 8 * 2 double values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr K1, K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr K1, K ldr r3, KK sub K1, K1, r3 str K1, KKK #else ldr K1, KK #ifdef LEFT add K1, K1, #2 // number of values in AO #else add K1, K1, #2 // number of values in BO #endif str K1, KKK #endif asrs L , K1, #3 // L = L / 8 cmp L , #3 blt _L2_M2_30 .align 5 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 sub L, L, #2 _L2_M2_22: KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 subs L, L, #1 bgt _L2_M2_22 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b _L2_M2_44 _L2_M2_30: tst L, #3 ble _L2_M2_40 tst L, #2 ble _L2_M2_32 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b _L2_M2_44 _L2_M2_32: tst L, #1 ble _L2_M2_40 KERNEL2x2_I KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_M2 KERNEL2x2_M1 KERNEL2x2_E b _L2_M2_44 _L2_M2_40: INIT2x2 _L2_M2_44: ands L , K1, #7 // L = L % 8 ble _L2_M2_100 _L2_M2_46: KERNEL2x2_SUB subs L, L, #1 bne _L2_M2_46 _L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #5 // 2 * 8 * 2 double values add BO , BO , r4 lsls r4 , r3 , #5 // 2 * 8 * 2 double values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #2 // number of values in AO str r3 , KK #endif _L2_M2_END: subs I, I, #1 bne _L2_M2_20 _L2_M1_BEGIN: ldr I, M tst 
I, #1 // I = I % 2 ble _L2_END _L2_M1_20: INIT1x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #5 // 2 * 8 * 2 double values add BO , BO , r4 lsls r4 , r3 , #4 // 1 * 8 * 2 double values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr K1, K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr K1, K ldr r3, KK sub K1, K1, r3 str K1, KKK #else ldr K1, KK #ifdef LEFT add K1, K1, #1 // number of values in AO #else add K1, K1, #2 // number of values in BO #endif str K1, KKK #endif asrs L , K1, #3 // L = L / 8 ble _L2_M1_40 _L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs L, L, #1 bgt _L2_M1_22 _L2_M1_40: ands L , K1, #7 // L = L % 8 ble _L2_M1_100 _L2_M1_42: KERNEL1x2_SUB subs L, L, #1 bgt _L2_M1_42 _L2_M1_100: SAVE1x2 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #5 // 2 * 8 * 2 double values add BO , BO , r4 lsls r4 , r3 , #4 // 1 * 8 * 2 double values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #1 // number of values in AO str r3 , KK #endif _L2_END: mov r3, BC ldr r4, K lsl r4, r4, #5 // k * 2 * 8 * 2 add r3, r3, r4 // B = B + K * 4 * 8 mov BC, r3 #if !defined(LEFT) ldr r3 , KK add r3 , r3 , #2 // number of values in BO str r3 , KK #endif subs J , #1 // j-- bgt _L2_BEGIN /*********************************************************************************************/ _L1_BEGIN: ldr J , N tst J , #1 ble _L999 ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 str r3 , C // store C #if defined(LEFT) ldr r3 , OFFSET str r3 , KK #endif ldr AO, A // AO = A _L1_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 ble _L1_M1_BEGIN _L1_M2_20: #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #4 // 1 * 8 * 2 double values add BO , BO , r4 lsls r4 , r3 , #5 // 2 * 8 * 2 double values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr K1, K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr K1, K ldr r3, KK sub K1, K1, r3 str K1, KKK #else ldr K1, KK #ifdef LEFT add K1, K1, #2 // number of values in AO #else add K1, K1, #1 // number of values in BO #endif str K1, KKK #endif asrs L , K1, #3 // L = L / 8 cmp L , #3 blt _L1_M2_30 .align 5 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 sub L, L, #2 _L1_M2_22: KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 subs L, L, #1 bgt _L1_M2_22 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b _L1_M2_44 _L1_M2_30: tst L, #3 ble _L1_M2_40 tst L, #2 ble _L1_M2_32 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b _L1_M2_44 _L1_M2_32: tst L, #1 ble _L1_M2_40 KERNEL2x1_I KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_M2 KERNEL2x1_M1 KERNEL2x1_E b _L1_M2_44 _L1_M2_40: INIT2x1 _L1_M2_44: ands L , K1, #7 // L = L % 8 ble _L1_M2_100 _L1_M2_46: KERNEL2x1_SUB subs L, L, #1 bne _L1_M2_46 _L1_M2_100: SAVE2x1 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 
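	// r3 = K - KKK: the k-iterations of this panel excluded by the TRMM offset;
	// the shifts below advance BO by r3 * 16 bytes (1 double-complex per k) and
	// AO by r3 * 32 bytes (2 double-complex per k) to reach the next micro-panel.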
lsls r4 , r3 , #4 // 1 * 8 * 2 double values add BO , BO , r4 lsls r4 , r3 , #5 // 2 * 8 * 2 double values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #2 // number of values in AO str r3 , KK #endif _L1_M2_END: subs I, I, #1 bne _L1_M2_20 _L1_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 ble _L1_END _L1_M1_20: INIT1x1 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) mov BO, BC #else mov BO, BC ldr r3 , KK lsls r4 , r3 , #4 // 1 * 8 * 2 double values add BO , BO , r4 lsls r4 , r3 , #4 // 1 * 8 * 2 double values add AO , AO , r4 #endif #ifndef TRMMKERNEL ldr K1, K #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) ldr K1, K ldr r3, KK sub K1, K1, r3 str K1, KKK #else ldr K1, KK #ifdef LEFT add K1, K1, #1 // number of values in AO #else add K1, K1, #1 // number of values in BO #endif str K1, KKK #endif asrs L , K1, #3 // L = L / 8 ble _L1_M1_40 _L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs L, L, #1 bgt _L1_M1_22 _L1_M1_40: ands L , K1, #7 // L = L % 8 ble _L1_M1_100 _L1_M1_42: KERNEL1x1_SUB subs L, L, #1 bgt _L1_M1_42 _L1_M1_100: SAVE1x1 #if (defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) ldr r3 , K ldr r4 , KKK sub r3 , r3 , r4 lsls r4 , r3 , #4 // 1 * 8 * 2 double values add BO , BO , r4 lsls r4 , r3 , #4 // 1 * 8 * 2 double values add AO , AO , r4 #endif #if defined(LEFT) ldr r3 , KK add r3 , r3 , #1 // number of values in AO str r3 , KK #endif _L1_END: _L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers movs r0, #0 // set return value sub sp, fp, #24 pop {r4 - r9, fp} bx lr EPILOGUE OpenBLAS-0.2.20/kernel/arm64/000077500000000000000000000000001313527062700153555ustar00rootroot00000000000000OpenBLAS-0.2.20/kernel/arm64/KERNEL000066400000000000000000000012431313527062700162600ustar00rootroot00000000000000ifndef SNRM2KERNEL SNRM2KERNEL = nrm2.c endif ifndef DNRM2KERNEL DNRM2KERNEL = nrm2.c endif ifndef CNRM2KERNEL CNRM2KERNEL = znrm2.c endif ifndef ZNRM2KERNEL ZNRM2KERNEL = znrm2.c endif ifndef SCABS_KERNEL SCABS_KERNEL = ../generic/cabs.c endif ifndef DCABS_KERNEL DCABS_KERNEL = ../generic/cabs.c endif ifndef QCABS_KERNEL QCABS_KERNEL = ../generic/cabs.c endif ifndef LSAME_KERNEL LSAME_KERNEL = ../generic/lsame.c endif ifndef SGEMM_BETA SGEMM_BETA = ../generic/gemm_beta.c endif ifndef DGEMM_BETA DGEMM_BETA = ../generic/gemm_beta.c endif ifndef CGEMM_BETA CGEMM_BETA = ../generic/zgemm_beta.c endif ifndef ZGEMM_BETA ZGEMM_BETA = ../generic/zgemm_beta.c endif OpenBLAS-0.2.20/kernel/arm64/KERNEL.ARMV8000066400000000000000000000070271313527062700171220ustar00rootroot00000000000000SAMAXKERNEL = ../arm/amax.c DAMAXKERNEL = ../arm/amax.c CAMAXKERNEL = ../arm/zamax.c ZAMAXKERNEL = ../arm/zamax.c SAMINKERNEL = ../arm/amin.c DAMINKERNEL = ../arm/amin.c CAMINKERNEL = ../arm/zamin.c ZAMINKERNEL = ../arm/zamin.c SMAXKERNEL = ../arm/max.c DMAXKERNEL = ../arm/max.c SMINKERNEL = ../arm/min.c DMINKERNEL = ../arm/min.c ISAMAXKERNEL = ../arm/iamax.c IDAMAXKERNEL = ../arm/iamax.c ICAMAXKERNEL = ../arm/izamax.c IZAMAXKERNEL = ../arm/izamax.c ISAMINKERNEL = ../arm/iamin.c IDAMINKERNEL = ../arm/iamin.c ICAMINKERNEL = ../arm/izamin.c IZAMINKERNEL = ../arm/izamin.c ISMAXKERNEL = ../arm/imax.c IDMAXKERNEL = ../arm/imax.c ISMINKERNEL = ../arm/imin.c IDMINKERNEL = ../arm/imin.c SASUMKERNEL = ../arm/asum.c DASUMKERNEL = ../arm/asum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = ../arm/zasum.c SAXPYKERNEL 
= ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c ZAXPYKERNEL = ../arm/zaxpy.c SCOPYKERNEL = ../arm/copy.c DCOPYKERNEL = ../arm/copy.c CCOPYKERNEL = ../arm/zcopy.c ZCOPYKERNEL = ../arm/zcopy.c SDOTKERNEL = ../arm/dot.c DDOTKERNEL = ../arm/dot.c CDOTKERNEL = ../arm/zdot.c ZDOTKERNEL = ../arm/zdot.c SNRM2KERNEL = ../arm/nrm2.c DNRM2KERNEL = ../arm/nrm2.c CNRM2KERNEL = ../arm/znrm2.c ZNRM2KERNEL = ../arm/znrm2.c SROTKERNEL = ../arm/rot.c DROTKERNEL = ../arm/rot.c CROTKERNEL = ../arm/zrot.c ZROTKERNEL = ../arm/zrot.c SSCALKERNEL = ../arm/scal.c DSCALKERNEL = ../arm/scal.c CSCALKERNEL = ../arm/zscal.c ZSCALKERNEL = ../arm/zscal.c SSWAPKERNEL = ../arm/swap.c DSWAPKERNEL = ../arm/swap.c CSWAPKERNEL = ../arm/zswap.c ZSWAPKERNEL = ../arm/zswap.c SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = ../arm/gemv_n.c CGEMVNKERNEL = ../arm/zgemv_n.c ZGEMVNKERNEL = ../arm/zgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c DGEMVTKERNEL = ../arm/gemv_t.c CGEMVTKERNEL = ../arm/zgemv_t.c ZGEMVTKERNEL = ../arm/zgemv_t.c STRMMKERNEL = ../generic/trmmkernel_4x4.c DTRMMKERNEL = ../generic/trmmkernel_2x2.c CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c SGEMMKERNEL = sgemm_kernel_4x4.S SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = ../generic/gemmkernel_2x2.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = ../generic/zgemmkernel_2x2.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c OpenBLAS-0.2.20/kernel/arm64/KERNEL.CORTEXA57000066400000000000000000000071321313527062700175430ustar00rootroot00000000000000include $(KERNELDIR)/KERNEL.ARMV8 SAMAXKERNEL = amax.S DAMAXKERNEL = amax.S CAMAXKERNEL = zamax.S ZAMAXKERNEL = zamax.S ISAMAXKERNEL = iamax.S IDAMAXKERNEL = iamax.S ICAMAXKERNEL = izamax.S IZAMAXKERNEL = izamax.S SASUMKERNEL = asum.S DASUMKERNEL = asum.S CASUMKERNEL = casum.S ZASUMKERNEL = zasum.S SAXPYKERNEL = axpy.S DAXPYKERNEL = axpy.S CAXPYKERNEL = zaxpy.S ZAXPYKERNEL = zaxpy.S SCOPYKERNEL = copy.S DCOPYKERNEL = copy.S CCOPYKERNEL = copy.S ZCOPYKERNEL = copy.S SDOTKERNEL = dot.S DDOTKERNEL = dot.S CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S SNRM2KERNEL = nrm2.S DNRM2KERNEL = nrm2.S CNRM2KERNEL = znrm2.S ZNRM2KERNEL = znrm2.S SROTKERNEL = rot.S DROTKERNEL = rot.S CROTKERNEL = zrot.S ZROTKERNEL = zrot.S SSCALKERNEL = scal.S DSCALKERNEL = scal.S CSCALKERNEL = zscal.S 
ZSCALKERNEL = zscal.S SSWAPKERNEL = swap.S DSWAPKERNEL = swap.S CSWAPKERNEL = swap.S ZSWAPKERNEL = swap.S SGEMVNKERNEL = gemv_n.S DGEMVNKERNEL = gemv_n.S CGEMVNKERNEL = zgemv_n.S ZGEMVNKERNEL = zgemv_n.S SGEMVTKERNEL = gemv_t.S DGEMVTKERNEL = gemv_t.S CGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c SGEMMINCOPYOBJ = sgemm_incopy.o SGEMMITCOPYOBJ = sgemm_itcopy.o endif SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) ifeq ($(DGEMM_UNROLL_M), 8) DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S else DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c endif DGEMMINCOPYOBJ = dgemm_incopy.o DGEMMITCOPYOBJ = dgemm_itcopy.o endif ifeq ($(DGEMM_UNROLL_N), 4) DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S else DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c endif DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c CGEMMINCOPYOBJ = cgemm_incopy.o CGEMMITCOPYOBJ = cgemm_itcopy.o endif CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c ZGEMMINCOPYOBJ = zgemm_incopy.o ZGEMMITCOPYOBJ = zgemm_itcopy.o endif ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o OpenBLAS-0.2.20/kernel/arm64/KERNEL.THUNDERX000066400000000000000000000001661313527062700175230ustar00rootroot00000000000000include $(KERNELDIR)/KERNEL.ARMV8 SDOTKERNEL=dot_thunderx.c DDOTKERNEL=ddot_thunderx.c DAXPYKERNEL=daxpy_thunderx.c OpenBLAS-0.2.20/kernel/arm64/KERNEL.THUNDERX2T99000066400000000000000000000027321313527062700201140ustar00rootroot00000000000000include $(KERNELDIR)/KERNEL.CORTEXA57 SASUMKERNEL = sasum_thunderx2t99.c DASUMKERNEL = dasum_thunderx2t99.c CASUMKERNEL = casum_thunderx2t99.c ZASUMKERNEL = zasum_thunderx2t99.c SCOPYKERNEL = copy_thunderx2t99.c DCOPYKERNEL = copy_thunderx2t99.c CCOPYKERNEL = copy_thunderx2t99.c ZCOPYKERNEL = copy_thunderx2t99.c SSWAPKERNEL = swap_thunderx2t99.S DSWAPKERNEL = swap_thunderx2t99.S CSWAPKERNEL = swap_thunderx2t99.S ZSWAPKERNEL = swap_thunderx2t99.S ISAMAXKERNEL = 
iamax_thunderx2t99.c IDAMAXKERNEL = iamax_thunderx2t99.c ICAMAXKERNEL = izamax_thunderx2t99.c IZAMAXKERNEL = izamax_thunderx2t99.c SNRM2KERNEL = scnrm2_thunderx2t99.c CNRM2KERNEL = scnrm2_thunderx2t99.c #DNRM2KERNEL = dznrm2_thunderx2t99_fast.c #ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c DNRM2KERNEL = dznrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c DAXPYKERNEL = daxpy_thunderx2t99.S DDOTKERNEL = dot_thunderx2t99.c SDOTKERNEL = dot_thunderx2t99.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S endif ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4) SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S endif ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4) CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S endif ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4) ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S endif OpenBLAS-0.2.20/kernel/arm64/KERNEL.VULCAN000066400000000000000000000000531313527062700172450ustar00rootroot00000000000000include $(KERNELDIR)/KERNEL.THUNDERX2T99 OpenBLAS-0.2.20/kernel/arm64/KERNEL.XGENE1000066400000000000000000000000411313527062700172010ustar00rootroot00000000000000include $(KERNELDIR)/KERNEL.ARMV8OpenBLAS-0.2.20/kernel/arm64/Makefile000066400000000000000000000000121313527062700170060ustar00rootroot00000000000000clean :: OpenBLAS-0.2.20/kernel/arm64/amax.S000066400000000000000000000115361313527062700164350ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #define ASSEMBLER #include "common.h" #define N x0 /* vector length */ #define X x1 /* X vector address */ #define INC_X x2 /* X stride */ #define I x5 /* loop variable */ /******************************************************************************* * Macro definitions *******************************************************************************/ #if defined(USE_MIN) #define COND le #else #define COND ge #endif #if !defined(DOUBLE) #define REG0 wzr #define MAXF s0 #define TMPF s1 #define TMPVF {v1.s}[0] #define SZ 4 #else #define REG0 xzr #define MAXF d0 #define TMPF d1 #define TMPVF {v1.d}[0] #define SZ 8 #endif /******************************************************************************/ .macro INIT_F1 ldr MAXF, [X], #SZ #if defined(USE_ABS) fabs MAXF, MAXF #endif .endm .macro KERNEL_F1 ldr TMPF, [X], #SZ #if defined(USE_ABS) fabs TMPF, TMPF #endif fcmp MAXF, TMPF fcsel MAXF, MAXF, TMPF, COND .endm .macro INIT_F4 #if !defined(DOUBLE) ld1 {v0.4s}, [X], #16 #if defined(USE_ABS) fabs v0.4s, v0.4s #endif #if defined(USE_MIN) fminv MAXF, v0.4s #else fmaxv MAXF, v0.4s #endif #else // DOUBLE ld2 {v0.2d,v1.2d}, [X], #32 #if defined(USE_ABS) fabs v0.2d, v0.2d fabs v1.2d, v1.2d #endif #if defined(USE_MIN) fmin v0.2d, v0.2d, v1.2d fminp MAXF, v0.2d #else fmax v0.2d, v0.2d, v1.2d fmaxp MAXF, v0.2d #endif #endif .endm .macro KERNEL_F4 #if !defined(DOUBLE) ld1 {v1.4s}, [X], #16 #if defined(USE_ABS) fabs v1.4s, v1.4s #endif #if defined(USE_MIN) fminv TMPF, v1.4s #else fmaxv TMPF, v1.4s #endif #else // DOUBLE ld2 {v1.2d,v2.2d}, [X], #32 #if defined(USE_ABS) fabs v1.2d, v1.2d fabs v2.2d, v2.2d #endif #if defined(USE_MIN) fmin v1.2d, v1.2d, v2.2d fminp TMPF, v1.2d #else fmax v1.2d, v1.2d, v2.2d fmaxp TMPF, v1.2d #endif #endif fcmp MAXF, TMPF fcsel MAXF, MAXF, TMPF, COND .endm .macro INIT_S #if !defined(DOUBLE) lsl INC_X, INC_X, #2 ld1 {v0.s}[0], [X], INC_X #else lsl INC_X, INC_X, #3 ld1 {v0.d}[0], [X], INC_X #endif #if defined(USE_ABS) fabs MAXF, MAXF #endif .endm .macro KERNEL_S1 ld1 TMPVF, [X], INC_X #if defined(USE_ABS) fabs TMPF, TMPF #endif fcmp MAXF, TMPF fcsel MAXF, MAXF, TMPF, COND .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE cmp N, xzr ble amax_kernel_zero cmp INC_X, xzr ble amax_kernel_zero cmp INC_X, #1 bne amax_kernel_S_BEGIN amax_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr beq amax_kernel_F1_INIT INIT_F4 subs I, I, #1 beq amax_kernel_F1 amax_kernel_F4: KERNEL_F4 subs I, I, #1 bne amax_kernel_F4 amax_kernel_F1: ands I, N, #3 ble amax_kernel_L999 amax_kernel_F10: KERNEL_F1 subs I, I, #1 bne amax_kernel_F10 ret amax_kernel_F1_INIT: INIT_F1 subs N, N, #1 b amax_kernel_F1 amax_kernel_S_BEGIN: INIT_S subs N, N, #1 ble amax_kernel_L999 asr I, N, #2 cmp I, xzr ble amax_kernel_S1 amax_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne amax_kernel_S4 amax_kernel_S1: ands I, N, #3 ble amax_kernel_L999 amax_kernel_S10: KERNEL_S1 subs I, I, #1 bne amax_kernel_S10 amax_kernel_L999: ret amax_kernel_zero: fmov MAXF, REG0 ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/asum.S000066400000000000000000000104031313527062700164440ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" #define N x0 /* vector length */ #define X x1 /* X vector address */ #define INC_X x2 /* X stride */ #define I x5 /* loop variable */ /******************************************************************************* * Macro definitions *******************************************************************************/ #if !defined(DOUBLE) #define REG0 wzr #define SUMF s0 #define TMPF s1 #define TMPVF {v1.s}[0] #define SZ 4 #else #define REG0 xzr #define SUMF d0 #define TMPF d1 #define TMPVF {v1.d}[0] #define SZ 8 #endif /******************************************************************************/ .macro KERNEL_F1 ldr TMPF, [X], #SZ fabs TMPF, TMPF fadd SUMF, SUMF, TMPF .endm .macro KERNEL_F8 #if !defined(DOUBLE) ld1 {v1.4s, v2.4s}, [X], #32 // Load [X3, X2, X1, X0] fabs v1.4s, v1.4s // ABS() each value fabs v2.4s, v2.4s // ABS() each value fadd v1.4s, v1.4s, v2.4s // [X3+X1, X2+X0] fadd v0.4s, v0.4s, v1.4s // [X3+X1, X2+X0] PRFM PLDL1KEEP, [X, #1024] #else // DOUBLE ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X] add X, X, #64 fabs v2.2d, v2.2d fabs v3.2d, v3.2d fabs v4.2d, v4.2d fabs v5.2d, v5.2d PRFM PLDL1KEEP, [X, #1024] fadd v2.2d, v2.2d, v3.2d fadd v4.2d, v4.2d, v5.2d fadd v0.2d, v0.2d, v2.2d fadd v0.2d, v0.2d, v4.2d #endif .endm .macro KERNEL_F8_FINALIZE #if !defined(DOUBLE) ext v1.16b, v0.16b, v0.16b, #8 fadd v0.2s, v0.2s, v1.2s faddp SUMF, v0.2s #else faddp SUMF, v0.2d #endif .endm .macro INIT_S #if !defined(DOUBLE) lsl INC_X, INC_X, #2 #else lsl INC_X, INC_X, #3 #endif .endm .macro KERNEL_S1 ld1 TMPVF, [X], INC_X fabs TMPF, TMPF fadd SUMF, SUMF, TMPF .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE fmov SUMF, REG0 #if !defined(DOUBLE) fmov s1, SUMF #else fmov d1, SUMF #endif cmp N, xzr ble asum_kernel_L999 cmp INC_X, xzr ble asum_kernel_L999 cmp INC_X, #1 bne asum_kernel_S_BEGIN asum_kernel_F_BEGIN: asr I, N, #3 cmp I, xzr beq 
asum_kernel_F1 asum_kernel_F8: KERNEL_F8 subs I, I, #1 bne asum_kernel_F8 KERNEL_F8_FINALIZE asum_kernel_F1: ands I, N, #7 ble asum_kernel_L999 asum_kernel_F10: KERNEL_F1 subs I, I, #1 bne asum_kernel_F10 asum_kernel_L999: ret asum_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr ble asum_kernel_S1 asum_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne asum_kernel_S4 asum_kernel_S1: ands I, N, #3 ble asum_kernel_L999 asum_kernel_S10: KERNEL_S1 subs I, I, #1 bne asum_kernel_S10 ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/axpy.S000066400000000000000000000110341313527062700164610ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #define ASSEMBLER #include "common.h" #define N x0 /* vector length */ #define X x3 /* X vector address */ #define INC_X x4 /* X stride */ #define Y x5 /* Y vector address */ #define INC_Y x6 /* Y stride */ #define I x1 /* loop variable */ /******************************************************************************* * Macro definitions *******************************************************************************/ #if !defined(DOUBLE) #define DA s0 /* scale input value */ #define TMPX s1 #define TMPVX {v1.s}[0] #define TMPY s2 #define TMPVY {v2.s}[0] #define SZ 4 #else #define DA d0 /* scale input value */ #define TMPX d1 #define TMPVX {v1.d}[0] #define TMPY d2 #define TMPVY {v2.d}[0] #define SZ 8 #endif /******************************************************************************/ .macro KERNEL_F1 ldr TMPX, [X], #SZ ldr TMPY, [Y] fmadd TMPY, TMPX, DA, TMPY str TMPY, [Y], #SZ .endm .macro KERNEL_F4 #if !defined(DOUBLE) ld1 {v1.4s}, [X], #16 ld1 {v2.4s}, [Y] fmla v2.4s, v1.4s, v0.s[0] st1 {v2.4s}, [Y], #16 #else // DOUBLE ld1 {v1.2d, v2.2d}, [X], #32 ld1 {v3.2d, v4.2d}, [Y] fmla v3.2d, v1.2d, v0.d[0] fmla v4.2d, v2.2d, v0.d[0] st1 {v3.2d, v4.2d}, [Y], #32 #endif .endm .macro KERNEL_F8 #if !defined(DOUBLE) ld1 {v1.4s, v2.4s}, [X], #32 ld1 {v3.4s, v4.4s}, [Y] fmla v3.4s, v1.4s, v0.s[0] fmla v4.4s, v2.4s, v0.s[0] st1 {v3.4s, v4.4s}, [Y], #32 #else // DOUBLE ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [Y] fmla v16.2d, v1.2d, v0.d[0] fmla v17.2d, v2.2d, v0.d[0] fmla v18.2d, v3.2d, v0.d[0] fmla v19.2d, v4.2d, v0.d[0] st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [Y], #64 #endif PRFM PLDL1KEEP, [X, #512] PRFM PLDL1KEEP, [Y, #512] .endm .macro INIT_S #if !defined(DOUBLE) lsl INC_X, INC_X, #2 lsl INC_Y, INC_Y, #2 #else lsl INC_X, INC_X, #3 lsl INC_Y, INC_Y, #3 #endif .endm .macro KERNEL_S1 ld1 TMPVX, [X], INC_X ldr TMPY, [Y] fmadd TMPY, TMPX, DA, TMPY st1 TMPVY, [Y], INC_Y .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE cmp N, xzr ble axpy_kernel_L999 fcmp DA, #0.0 beq axpy_kernel_L999 cmp INC_X, #1 bne axpy_kernel_S_BEGIN cmp INC_Y, #1 bne axpy_kernel_S_BEGIN axpy_kernel_F_BEGIN: asr I, N, #3 cmp I, xzr beq axpy_kernel_F1 axpy_kernel_F8: KERNEL_F8 subs I, I, #1 bne axpy_kernel_F8 axpy_kernel_F1: ands I, N, #7 ble axpy_kernel_L999 axpy_kernel_F10: KERNEL_F1 subs I, I, #1 bne axpy_kernel_F10 mov w0, wzr ret axpy_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr ble axpy_kernel_S1 axpy_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne axpy_kernel_S4 axpy_kernel_S1: ands I, N, #3 ble axpy_kernel_L999 axpy_kernel_S10: KERNEL_S1 subs I, I, #1 bne axpy_kernel_S10 axpy_kernel_L999: mov w0, wzr ret OpenBLAS-0.2.20/kernel/arm64/casum.S000066400000000000000000000074571313527062700166260ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" #define N x0 /* vector length */ #define X x1 /* X vector address */ #define INC_X x2 /* X stride */ #define I x5 /* loop variable */ /******************************************************************************* * Macro definitions *******************************************************************************/ #define REG0 wzr #define SUMF s0 #define TMPF s1 #define TMPVF {v1.s}[0] #define SZ 4 /******************************************************************************/ .macro KERNEL_F1 ld1 {v1.2s}, [X], #8 fabs v1.2s, v1.2s ext v2.8b, v1.8b, v1.8b, #4 fadd TMPF, TMPF, s2 fadd SUMF, SUMF, TMPF .endm .macro KERNEL_F8 ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X] add X, X, #64 fabs v1.4s, v1.4s fabs v2.4s, v2.4s fabs v3.4s, v3.4s fabs v4.4s, v4.4s PRFM PLDL1KEEP, [X, #1024] fadd v1.4s, v1.4s, v2.4s fadd v3.4s, v3.4s, v4.4s fadd v0.4s, v0.4s, v1.4s fadd v0.4s, v0.4s, v3.4s .endm .macro KERNEL_F8_FINALIZE ext v1.16b, v0.16b, v0.16b, #8 fadd v0.2s, v0.2s, v1.2s faddp SUMF, v0.2s .endm .macro INIT_S lsl INC_X, INC_X, #3 .endm .macro KERNEL_S1 ld1 {v1.2s}, [X], INC_X fabs v1.2s, v1.2s ext v2.8b, v1.8b, v1.8b, #4 fadd TMPF, TMPF, s2 fadd SUMF, SUMF, TMPF .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE fmov SUMF, REG0 fmov s1, SUMF cmp N, xzr ble asum_kernel_L999 cmp INC_X, xzr ble asum_kernel_L999 cmp INC_X, #1 bne asum_kernel_S_BEGIN asum_kernel_F_BEGIN: asr I, N, #3 cmp I, xzr beq asum_kernel_F1 asum_kernel_F8: KERNEL_F8 subs I, I, #1 bne asum_kernel_F8 KERNEL_F8_FINALIZE asum_kernel_F1: ands I, N, #7 ble asum_kernel_L999 asum_kernel_F10: KERNEL_F1 subs I, I, #1 bne asum_kernel_F10 asum_kernel_L999: ret asum_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr ble asum_kernel_S1 asum_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne asum_kernel_S4 asum_kernel_S1: ands I, N, #3 ble asum_kernel_L999 asum_kernel_S10: KERNEL_S1 subs I, I, #1 bne asum_kernel_S10 ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/casum_thunderx2t99.c000066400000000000000000000166241313527062700212130ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 
2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include #define N "x0" /* vector length */ #define X "x1" /* "X" vector address */ #define INC_X "x2" /* "X" stride */ #define J "x5" /* loop variable */ #define REG0 "wzr" #define SUMF "s0" #define SUMFD "d0" /******************************************************************************/ #define KERNEL_F1 \ "ldr d1, ["X"] \n" \ "add "X", "X", #8 \n" \ "fabs v1.2s, v1.2s \n" \ "ext v2.8b, v1.8b, v1.8b, #4 \n" \ "fadd s1, s1, s2 \n" \ "fadd "SUMF", "SUMF", s1 \n" #define KERNEL_F32 \ "ldr q16, ["X"] \n" \ "ldr q17, ["X", #16] \n" \ "ldr q18, ["X", #32] \n" \ "ldr q19, ["X", #48] \n" \ "ldp q20, q21, ["X", #64] \n" \ "ldp q22, q23, ["X", #96] \n" \ "fabs v16.4s, v16.4s \n" \ "fabs v17.4s, v17.4s \n" \ "fabs v18.4s, v18.4s \n" \ "fabs v19.4s, v19.4s \n" \ "ldp q24, q25, ["X", #128] \n" \ "ldp q26, q27, ["X", #160] \n" \ "fabs v20.4s, v20.4s \n" \ "fabs v21.4s, v21.4s \n" \ "fabs v22.4s, v22.4s \n" \ "fabs v23.4s, v23.4s \n" \ "fadd v16.4s, v16.4s, v17.4s \n" \ "fadd v18.4s, v18.4s, v19.4s \n" \ "ldp q28, q29, ["X", #192] \n" \ "ldp q30, q31, ["X", #224] \n" \ "fabs v24.4s, v24.4s \n" \ "fabs v25.4s, v25.4s \n" \ "fabs v26.4s, v26.4s \n" \ "fabs v27.4s, v27.4s \n" \ "add "X", "X", #256 \n" \ "fadd v20.4s, v20.4s, v21.4s \n" \ "fadd v22.4s, v22.4s, v23.4s \n" \ "fabs v28.4s, v28.4s \n" \ "fabs v29.4s, v29.4s \n" \ "fabs v30.4s, v30.4s \n" \ "fabs v31.4s, v31.4s \n" \ "PRFM PLDL1KEEP, ["X", #1024] \n" \ "PRFM PLDL1KEEP, ["X", #1024+64] \n" \ "fadd v24.4s, v24.4s, v25.4s \n" \ "fadd v26.4s, v26.4s, v27.4s \n" \ "fadd v0.4s, v0.4s, v16.4s \n" \ "fadd v1.4s, v1.4s, v18.4s \n" \ "fadd v2.4s, v2.4s, v20.4s \n" \ "fadd v3.4s, v3.4s, v22.4s \n" \ "PRFM PLDL1KEEP, ["X", #1024+128] \n" \ "PRFM PLDL1KEEP, ["X", #1024+192] \n" \ "fadd v28.4s, v28.4s, v29.4s \n" \ "fadd v30.4s, v30.4s, v31.4s \n" \ "fadd v4.4s, v4.4s, v24.4s \n" \ "fadd v5.4s, v5.4s, v26.4s \n" \ "fadd v6.4s, v6.4s, v28.4s \n" \ "fadd v7.4s, v7.4s, v30.4s \n" #define KERNEL_F32_FINALIZE \ "fadd 
v0.4s, v0.4s, v1.4s \n" \ "fadd v2.4s, v2.4s, v3.4s \n" \ "fadd v4.4s, v4.4s, v5.4s \n" \ "fadd v6.4s, v6.4s, v7.4s \n" \ "fadd v0.4s, v0.4s, v2.4s \n" \ "fadd v4.4s, v4.4s, v6.4s \n" \ "fadd v0.4s, v0.4s, v4.4s \n" \ "ext v1.16b, v0.16b, v0.16b, #8 \n" \ "fadd v0.2s, v0.2s, v1.2s \n" \ "faddp "SUMF", v0.2s \n" #define INIT_S \ "lsl "INC_X", "INC_X", #3 \n" #define KERNEL_S1 \ "ldr d1, ["X"] \n" \ "add "X", "X", "INC_X" \n" \ "fabs v1.2s, v1.2s \n" \ "ext v2.8b, v1.8b, v1.8b, #4 \n" \ "fadd s1, s1, s2 \n" \ "fadd "SUMF", "SUMF", s1 \n" #if defined(SMP) extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); #endif static FLOAT casum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT asum = 0.0 ; if ( n < 0 ) return(asum); __asm__ __volatile__ ( " mov "N", %[N_] \n" " mov "X", %[X_] \n" " mov "INC_X", %[INCX_] \n" " fmov "SUMF", "REG0" \n" " fmov s1, "REG0" \n" " fmov s2, "REG0" \n" " fmov s3, "REG0" \n" " fmov s4, "REG0" \n" " fmov s5, "REG0" \n" " fmov s6, "REG0" \n" " fmov s7, "REG0" \n" " cmp "N", xzr \n" " ble .Lasum_kernel_L999 \n" " cmp "INC_X", xzr \n" " ble .Lasum_kernel_L999 \n" " cmp "INC_X", #1 \n" " bne .Lasum_kernel_S_BEGIN \n" ".Lasum_kernel_F_BEGIN: \n" " asr "J", "N", #5 \n" " cmp "J", xzr \n" " beq .Lasum_kernel_F1 \n" ".Lasum_kernel_F32: \n" " "KERNEL_F32" \n" " subs "J", "J", #1 \n" " bne .Lasum_kernel_F32 \n" " "KERNEL_F32_FINALIZE" \n" ".Lasum_kernel_F1: \n" " ands "J", "N", #31 \n" " ble .Lasum_kernel_L999 \n" ".Lasum_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" " bne .Lasum_kernel_F10 \n" " b .Lasum_kernel_L999 \n" ".Lasum_kernel_S_BEGIN: \n" " "INIT_S" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" " ble .Lasum_kernel_S1 \n" ".Lasum_kernel_S4: \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" " bne .Lasum_kernel_S4 \n" ".Lasum_kernel_S1: \n" " ands "J", "N", #3 \n" " ble .Lasum_kernel_L999 \n" ".Lasum_kernel_S10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" " bne .Lasum_kernel_S10 \n" ".Lasum_kernel_L999: \n" " fmov %[ASUM_], "SUMFD" \n" : [ASUM_] "=r" (asum) //%0 : [N_] "r" (n), //%1 [X_] "r" (x), //%2 [INCX_] "r" (inc_x) //%3 : "cc", "memory", "x0", "x1", "x2", "x3", "x4", "x5", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" ); return asum; } #if defined(SMP) static int casum_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) { *result = casum_compute(n, x, inc_x); return 0; } #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { #if defined(SMP) int nthreads; FLOAT dummy_alpha; #endif FLOAT asum = 0.0; #if defined(SMP) nthreads = num_cpu_avail(1); if (inc_x == 0) nthreads = 1; if (n <= 10000) nthreads = 1; if (nthreads == 1) { asum = casum_compute(n, x, inc_x); } else { int mode, i; char result[MAX_CPU_NUMBER * sizeof(double) * 2]; FLOAT *ptr; mode = BLAS_SINGLE | BLAS_COMPLEX; blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, ( void *)casum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { asum = asum + (*ptr); ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2); } } #else asum = casum_compute(n, x, inc_x); #endif return asum; } 
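/*
 * Note on the SMP path of CNAME() above: blas_level1_thread_with_return_value()
 * runs casum_thread_function() on nthreads sub-ranges of x and stores each
 * thread's partial sum in the caller-provided "result" buffer.  The buffer is
 * laid out as one complex-double sized slot per thread (sizeof(double) * 2
 * bytes), which is why the reduction loop advances ptr by that stride even
 * though only a single FLOAT is read from the start of each slot:
 *
 *     ptr = (FLOAT *)result;
 *     for (i = 0; i < nthreads; i++) {
 *         asum = asum + (*ptr);                                 // partial sum of thread i
 *         ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2);  // next thread's slot
 *     }
 *
 * The single-threaded cutoffs (inc_x == 0 or n <= 10000) fall back to a direct
 * casum_compute() call, so the result buffer is only used when work is split.
 */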
OpenBLAS-0.2.20/kernel/arm64/cgemm_kernel_4x4.S000066400000000000000000001035451313527062700206400ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 X3 x4 x5 x6 */ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define temp x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pA x15 #define ppC x16 #define ppA x17 #define alpha0_R s10 #define alphaV0_R v10.s[0] #define alpha0_I s11 #define alphaV0_I v11.s[0] #define alpha1_R s14 #define alphaV1_R v14.s[0] #define alpha1_I s15 #define alphaV1_I v15.s[0] #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla #define OP_ii fmls #define OP_ri fmla #define OP_ir fmla #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define OP_rr fmla #define OP_ii fmla #define OP_ri fmls #define OP_ir fmla #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define OP_rr fmla #define OP_ii fmla #define OP_ri fmla #define OP_ir fmls #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) #define OP_rr fmla #define OP_ii fmls #define OP_ri fmls #define OP_ir fmls #endif // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 offset -> temp // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pA // 16 ppC // 17 ppA // 18 must save // 19 must save // 20 must save // 21 must save // 22 must save // 23 must save // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp //v00 ALPHA_R -> pA00_R, pA01_R, pA02_R, pA03_R //v01 ALPHA_I -> pA00_I, pA01_I, pA02_I, pA03_I 
//v02 ppA00_R, ppA01_R, ppA02_R, ppA03_R //v03 ppA00_I, ppA01_I, ppA02_I, ppA03_I //v04 pA10_R, pA11_R, pA12_R, pA13_R //v05 pA10_I, pA11_I, pA12_I, pA13_I //v06 ppA10_R, ppA11_R, ppA12_R, ppA13_R //v07 ppA10_I, ppA11_I, ppA12_I, ppA13_I //v08 must save pB00_R, pB01_R, pB02_R, pB03_R //v09 must save pB00_I, pB01_I, pB02_I, pB03_I //v10 must save ALPHA0_R //v11 must save ALPHA0_I //v12 must save pB10_R, pB11_R, pB12_R, pB13_R //v13 must save pB10_I, pB11_I, pB12_I, pB13_I //v14 must save ALPHA1_R //v15 must save ALPHA1_I //v16 must save pC00_R, pC01_R, pC02_R, pC03_R //v17 must save pC00_I, pC01_I, pC02_I, pC03_I //v18 ppC00_R, ppC01_R, ppC02_R, ppC03_R //v19 ppC00_I, ppC01_I, ppC02_I, ppC03_I //v20 pC10_R, pC11_R, pC12_R, pC13_R //v21 pC10_I, pC11_I, pC12_I, pC13_I //v22 ppC10_R, ppC11_R, ppC12_R, ppC13_R //v23 ppC10_I, ppC11_I, ppC12_I, ppC13_I //v24 pC20_R, pC21_R, pC22_R, pC23_R //v25 pC20_I, pC21_I, pC22_I, pC23_I //v26 ppC20_R, ppC21_R, ppC22_R, ppC23_R //v27 ppC20_I, ppC21_I, ppC22_I, ppC23_I //v28 pC30_R, pC31_R, pC32_R, pC33_R //v29 pC30_I, pC31_I, pC32_I, pC33_I //v30 ppC30_R, ppC31_R, ppC32_R, ppC33_R //v31 ppC30_I, ppC31_I, ppC32_I, ppC33_I /******************************************************************************* * Macro definitions *******************************************************************************/ .macro INIT8x4 fmov s16, wzr fmov s17, s16 fmov s18, s17 fmov s19, s16 fmov s20, s17 fmov s21, s16 fmov s22, s17 fmov s23, s16 fmov s24, s17 fmov s25, s16 fmov s26, s17 fmov s27, s16 fmov s28, s17 fmov s29, s16 fmov s30, s17 fmov s31, s16 .endm .macro KERNEL8x4_I ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 ld2 {v2.4s, v3.4s}, [ppA] add ppA, ppA, #32 fmul v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b fmls v17.4s, v0.4s, v9.s[0] #else fmul v17.4s, v0.4s, v9.s[0] #endif OP_ir v17.4s, v1.4s, v8.s[0] fmul v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b fmls v21.4s, v0.4s, v9.s[1] #else fmul v21.4s, v0.4s, v9.s[1] #endif OP_ir v21.4s, v1.4s, v8.s[1] fmul v24.4s, v0.4s, v8.s[2] OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b fmls v25.4s, v0.4s, v9.s[2] #else fmul v25.4s, v0.4s, v9.s[2] #endif OP_ir v25.4s, v1.4s, v8.s[2] fmul v28.4s, v0.4s, v8.s[3] OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b fmls v29.4s, v0.4s, v9.s[3] #else fmul v29.4s, v0.4s, v9.s[3] #endif OP_ir v29.4s, v1.4s, v8.s[3] fmul v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b fmls v19.4s, v2.4s, v9.s[0] #else fmul v19.4s, v2.4s, v9.s[0] #endif OP_ir v19.4s, v3.4s, v8.s[0] fmul v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b fmls v23.4s, v2.4s, v9.s[1] #else fmul v23.4s, v2.4s, v9.s[1] 
#endif OP_ir v23.4s, v3.4s, v8.s[1] fmul v26.4s, v2.4s, v8.s[2] OP_ii v26.4s, v3.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b fmls v27.4s, v2.4s, v9.s[2] #else fmul v27.4s, v2.4s, v9.s[2] #endif OP_ir v27.4s, v3.4s, v8.s[2] fmul v30.4s, v2.4s, v8.s[3] OP_ii v30.4s, v3.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b fmls v31.4s, v2.4s, v9.s[3] #else fmul v31.4s, v2.4s, v9.s[3] #endif OP_ir v31.4s, v3.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 ld2 {v4.4s, v5.4s} , [pA] add pA, pA, #32 ld2 {v6.4s, v7.4s} , [ppA] add ppA, ppA, #32 .endm .macro KERNEL8x4_M1 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.4s, v13.4s}, [pB] // for next round add pB, pB, #32 OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] prfm PLDL1KEEP, [pB, #512] OP_rr v24.4s, v0.4s, v8.s[2] OP_ii v24.4s, v1.4s, v9.s[2] OP_ri v25.4s, v0.4s, v9.s[2] OP_ir v25.4s, v1.4s, v8.s[2] ld2 {v4.4s, v5.4s} , [pA] // for next round add pA, pA, #32 OP_rr v28.4s, v0.4s, v8.s[3] OP_ii v28.4s, v1.4s, v9.s[3] OP_ri v29.4s, v0.4s, v9.s[3] OP_ir v29.4s, v1.4s, v8.s[3] prfm PLDL1KEEP, [pA, #512] OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] OP_ri v19.4s, v2.4s, v9.s[0] OP_ir v19.4s, v3.4s, v8.s[0] ld2 {v6.4s, v7.4s} , [ppA] // for next round add ppA, ppA, #32 OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] prfm PLDL1KEEP, [ppA, #512] OP_rr v26.4s, v2.4s, v8.s[2] OP_ii v26.4s, v3.4s, v9.s[2] OP_ri v27.4s, v2.4s, v9.s[2] OP_ir v27.4s, v3.4s, v8.s[2] OP_rr v30.4s, v2.4s, v8.s[3] OP_ii v30.4s, v3.4s, v9.s[3] OP_ri v31.4s, v2.4s, v9.s[3] OP_ir v31.4s, v3.4s, v8.s[3] .endm .macro KERNEL8x4_M2 OP_rr v16.4s, v4.4s, v12.s[0] OP_ii v16.4s, v5.4s, v13.s[0] OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.4s, v9.4s}, [pB] // for next round add pB, pB, #32 OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] prfm PLDL1KEEP, [pA, #512] OP_rr v24.4s, v4.4s, v12.s[2] OP_ii v24.4s, v5.4s, v13.s[2] OP_ri v25.4s, v4.4s, v13.s[2] OP_ir v25.4s, v5.4s, v12.s[2] ld2 {v0.4s, v1.4s}, [pA] // for next round add pA, pA, #32 OP_rr v28.4s, v4.4s, v12.s[3] OP_ii v28.4s, v5.4s, v13.s[3] OP_ri v29.4s, v4.4s, v13.s[3] OP_ir v29.4s, v5.4s, v12.s[3] prfm PLDL1KEEP, [ppA, #512] OP_rr v18.4s, v6.4s, v12.s[0] OP_ii v18.4s, v7.4s, v13.s[0] OP_ri v19.4s, v6.4s, v13.s[0] OP_ir v19.4s, v7.4s, v12.s[0] ld2 {v2.4s, v3.4s}, [ppA] // for next round add ppA, ppA, #32 OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] prfm PLDL1KEEP, [pB, #512] OP_rr v26.4s, v6.4s, v12.s[2] OP_ii v26.4s, v7.4s, v13.s[2] OP_ri v27.4s, v6.4s, v13.s[2] OP_ir v27.4s, v7.4s, v12.s[2] OP_rr v30.4s, v6.4s, v12.s[3] OP_ii v30.4s, v7.4s, v13.s[3] OP_ri v31.4s, v6.4s, v13.s[3] OP_ir v31.4s, v7.4s, v12.s[3] .endm .macro KERNEL8x4_E OP_rr v16.4s, v4.4s, v12.s[0] OP_ii v16.4s, v5.4s, v13.s[0] OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] OP_rr v24.4s, v4.4s, v12.s[2] OP_ii 
v24.4s, v5.4s, v13.s[2] OP_ri v25.4s, v4.4s, v13.s[2] OP_ir v25.4s, v5.4s, v12.s[2] OP_rr v28.4s, v4.4s, v12.s[3] OP_ii v28.4s, v5.4s, v13.s[3] OP_ri v29.4s, v4.4s, v13.s[3] OP_ir v29.4s, v5.4s, v12.s[3] OP_rr v18.4s, v6.4s, v12.s[0] OP_ii v18.4s, v7.4s, v13.s[0] OP_ri v19.4s, v6.4s, v13.s[0] OP_ir v19.4s, v7.4s, v12.s[0] OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] OP_rr v26.4s, v6.4s, v12.s[2] OP_ii v26.4s, v7.4s, v13.s[2] OP_ri v27.4s, v6.4s, v13.s[2] OP_ir v27.4s, v7.4s, v12.s[2] OP_rr v30.4s, v6.4s, v12.s[3] OP_ii v30.4s, v7.4s, v13.s[3] OP_ri v31.4s, v6.4s, v13.s[3] OP_ir v31.4s, v7.4s, v12.s[3] .endm .macro KERNEL8x4_SUB ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v2.4s, v3.4s}, [ppA] add ppA, ppA, #32 OP_rr v24.4s, v0.4s, v8.s[2] OP_ii v24.4s, v1.4s, v9.s[2] OP_ri v25.4s, v0.4s, v9.s[2] OP_ir v25.4s, v1.4s, v8.s[2] OP_rr v28.4s, v0.4s, v8.s[3] OP_ii v28.4s, v1.4s, v9.s[3] OP_ri v29.4s, v0.4s, v9.s[3] OP_ir v29.4s, v1.4s, v8.s[3] OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] OP_ri v19.4s, v2.4s, v9.s[0] OP_ir v19.4s, v3.4s, v8.s[0] OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] OP_rr v26.4s, v2.4s, v8.s[2] OP_ii v26.4s, v3.4s, v9.s[2] OP_ri v27.4s, v2.4s, v9.s[2] OP_ir v27.4s, v3.4s, v8.s[2] OP_rr v30.4s, v2.4s, v8.s[3] OP_ii v30.4s, v3.4s, v9.s[3] OP_ri v31.4s, v2.4s, v9.s[3] OP_ir v31.4s, v3.4s, v8.s[3] .endm .macro SAVE8x4 mov pCRow1, pCRow0 add pCRow2, pCRow1, #32 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmla v1.4s, v16.4s, alphaV1_I fmla v1.4s, v17.4s, alphaV1_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v2.4s, v3.4s}, [pCRow2] fmla v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I fmla v3.4s, v18.4s, alphaV1_I fmla v3.4s, v19.4s, alphaV1_R st2 {v2.4s, v3.4s}, [pCRow2] add pCRow2, pCRow1, #32 ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I fmla v5.4s, v20.4s, alphaV1_I fmla v5.4s, v21.4s, alphaV1_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v6.4s, v7.4s}, [pCRow2] fmla v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I fmla v7.4s, v22.4s, alphaV1_I fmla v7.4s, v23.4s, alphaV1_R st2 {v6.4s, v7.4s}, [pCRow2] add pCRow2, pCRow1, #32 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I fmla v1.4s, v24.4s, alphaV1_I fmla v1.4s, v25.4s, alphaV1_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v2.4s, v3.4s}, [pCRow2] fmla v2.4s, v26.4s, alphaV0_R fmls v2.4s, v27.4s, alphaV0_I fmla v3.4s, v26.4s, alphaV1_I fmla v3.4s, v27.4s, alphaV1_R st2 {v2.4s, v3.4s}, [pCRow2] add pCRow2, pCRow1, #32 ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I fmla v5.4s, v28.4s, alphaV1_I fmla v5.4s, v29.4s, alphaV1_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v6.4s, v7.4s}, [pCRow2] fmla v6.4s, v30.4s, alphaV0_R fmls v6.4s, v31.4s, alphaV0_I fmla v7.4s, v30.4s, alphaV1_I fmla v7.4s, v31.4s, alphaV1_R st2 {v6.4s, v7.4s}, [pCRow2] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT4x4 fmov 
s16, wzr fmov s17, s16 fmov s20, s17 fmov s21, s16 fmov s24, s17 fmov s25, s16 fmov s28, s17 fmov s29, s16 .endm .macro KERNEL4x4_SUB ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] OP_rr v24.4s, v0.4s, v8.s[2] OP_ii v24.4s, v1.4s, v9.s[2] OP_ri v25.4s, v0.4s, v9.s[2] OP_ir v25.4s, v1.4s, v8.s[2] OP_rr v28.4s, v0.4s, v8.s[3] OP_ii v28.4s, v1.4s, v9.s[3] OP_ri v29.4s, v0.4s, v9.s[3] OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro SAVE4x4 mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmla v1.4s, v16.4s, alphaV1_I fmla v1.4s, v17.4s, alphaV1_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I fmla v5.4s, v20.4s, alphaV1_I fmla v5.4s, v21.4s, alphaV1_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I fmla v1.4s, v24.4s, alphaV1_I fmla v1.4s, v25.4s, alphaV1_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I fmla v5.4s, v28.4s, alphaV1_I fmla v5.4s, v29.4s, alphaV1_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x4 fmov s16, wzr fmov s17, wzr fmov s20, s16 fmov s21, s17 fmov s24, s16 fmov s25, s17 fmov s28, s16 fmov s29, s17 .endm .macro KERNEL2x4_SUB ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 OP_rr v16.2s, v0.2s, v8.s[0] OP_ii v16.2s, v1.2s, v9.s[0] OP_ri v17.2s, v0.2s, v9.s[0] OP_ir v17.2s, v1.2s, v8.s[0] OP_rr v20.2s, v0.2s, v8.s[1] OP_ii v20.2s, v1.2s, v9.s[1] OP_ri v21.2s, v0.2s, v9.s[1] OP_ir v21.2s, v1.2s, v8.s[1] OP_rr v24.2s, v0.2s, v8.s[2] OP_ii v24.2s, v1.2s, v9.s[2] OP_ri v25.2s, v0.2s, v9.s[2] OP_ir v25.2s, v1.2s, v8.s[2] OP_rr v28.2s, v0.2s, v8.s[3] OP_ii v28.2s, v1.2s, v9.s[3] OP_ri v29.2s, v0.2s, v9.s[3] OP_ir v29.2s, v1.2s, v8.s[3] .endm .macro SAVE2x4 mov pCRow1, pCRow0 ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I fmla v1.2s, v16.2s, alphaV1_I fmla v1.2s, v17.2s, alphaV1_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.2s, v5.2s}, [pCRow1] fmla v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I fmla v5.2s, v20.2s, alphaV1_I fmla v5.2s, v21.2s, alphaV1_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v24.2s, alphaV0_R fmls v0.2s, v25.2s, alphaV0_I fmla v1.2s, v24.2s, alphaV1_I fmla v1.2s, v25.2s, alphaV1_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.2s, v5.2s}, [pCRow1] fmla v4.2s, v28.2s, alphaV0_R fmls v4.2s, v29.2s, alphaV0_I fmla v5.2s, v28.2s, alphaV1_I fmla v5.2s, v29.2s, alphaV1_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x4 fmov s16, wzr fmov s17, wzr fmov s20, s16 fmov s21, s17 fmov s24, s16 fmov s25, s17 fmov s28, s16 fmov s29, s17 .endm .macro KERNEL1x4_SUB ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 OP_rr s16, s0, v8.s[0] OP_ii s16, s1, v9.s[0] OP_ri s17, s0, v9.s[0] 
OP_ir s17, s1, v8.s[0] OP_rr s20, s0, v8.s[1] OP_ii s20, s1, v9.s[1] OP_ri s21, s0, v9.s[1] OP_ir s21, s1, v8.s[1] OP_rr s24, s0, v8.s[2] OP_ii s24, s1, v9.s[2] OP_ri s25, s0, v9.s[2] OP_ir s25, s1, v8.s[2] OP_rr s28, s0, v8.s[3] OP_ii s28, s1, v9.s[3] OP_ri s29, s0, v9.s[3] OP_ir s29, s1, v8.s[3] .endm .macro SAVE1x4 mov pCRow1, pCRow0 ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s16, alphaV0_R fmls s0, s17, alphaV0_I fmla s1, s16, alphaV1_I fmla s1, s17, alphaV1_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.s, v5.s}[0], [pCRow1] fmla s4, s20, alphaV0_R fmls s4, s21, alphaV0_I fmla s5, s20, alphaV1_I fmla s5, s21, alphaV1_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow1, pCRow1, LDC ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s24, alphaV0_R fmls s0, s25, alphaV0_I fmla s1, s24, alphaV1_I fmla s1, s25, alphaV1_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.s, v5.s}[0], [pCRow1] fmla s4, s28, alphaV0_R fmls s4, s29, alphaV0_I fmla s5, s28, alphaV1_I fmla s5, s29, alphaV1_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT4x2 fmov s16, wzr fmov s17, wzr fmov s20, s16 fmov s21, s17 .endm .macro KERNEL4x2_SUB ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] .endm .macro SAVE4x2 mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmla v1.4s, v16.4s, alphaV1_I fmla v1.4s, v17.4s, alphaV1_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I fmla v5.4s, v20.4s, alphaV1_I fmla v5.4s, v21.4s, alphaV1_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x2 fmov s16, wzr fmov s17, wzr fmov s20, s16 fmov s21, s17 .endm .macro KERNEL2x2_SUB ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 OP_rr v16.2s, v0.2s, v8.s[0] OP_ii v16.2s, v1.2s, v9.s[0] OP_ri v17.2s, v0.2s, v9.s[0] OP_ir v17.2s, v1.2s, v8.s[0] OP_rr v20.2s, v0.2s, v8.s[1] OP_ii v20.2s, v1.2s, v9.s[1] OP_ri v21.2s, v0.2s, v9.s[1] OP_ir v21.2s, v1.2s, v8.s[1] .endm .macro SAVE2x2 mov pCRow1, pCRow0 ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I fmla v1.2s, v16.2s, alphaV1_I fmla v1.2s, v17.2s, alphaV1_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.2s, v5.2s}, [pCRow1] fmla v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I fmla v5.2s, v20.2s, alphaV1_I fmla v5.2s, v21.2s, alphaV1_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x2 fmov s16, wzr fmov s17, wzr fmov s20, wzr fmov s21, wzr .endm .macro KERNEL1x2_SUB ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 OP_rr s16, s0, v8.s[0] OP_ii s16, s1, v9.s[0] OP_ri s17, s0, v9.s[0] OP_ir s17, s1, v8.s[0] OP_rr s20, s0, v8.s[1] OP_ii s20, s1, v9.s[1] OP_ri s21, s0, v9.s[1] OP_ir s21, s1, v8.s[1] .endm .macro SAVE1x2 mov pCRow1, pCRow0 ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s16, alphaV0_R fmls s0, s17, alphaV0_I fmla s1, s16, alphaV1_I fmla s1, s17, 
alphaV1_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.s, v5.s}[0], [pCRow1] fmla s4, s20, alphaV0_R fmls s4, s21, alphaV0_I fmla s5, s20, alphaV1_I fmla s5, s21, alphaV1_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT4x1 fmov s16, wzr fmov s17, s16 .endm .macro KERNEL4x1_SUB ld2 {v8.s, v9.s}[0], [pB] add pB, pB, #8 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] .endm .macro SAVE4x1 mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmla v1.4s, v16.4s, alphaV1_I fmla v1.4s, v17.4s, alphaV1_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x1 fmov s16, wzr fmov s17, wzr .endm .macro KERNEL2x1_SUB ld2 {v8.s, v9.s}[0], [pB] add pB, pB, #8 ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 OP_rr v16.2s, v0.2s, v8.s[0] OP_ii v16.2s, v1.2s, v9.s[0] OP_ri v17.2s, v0.2s, v9.s[0] OP_ir v17.2s, v1.2s, v8.s[0] .endm .macro SAVE2x1 mov pCRow1, pCRow0 ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I fmla v1.2s, v16.2s, alphaV1_I fmla v1.2s, v17.2s, alphaV1_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x1 fmov s16, wzr fmov s17, wzr .endm .macro KERNEL1x1_SUB ld2 {v8.s, v9.s}[0], [pB] add pB, pB, #8 ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 OP_rr s16, s0, v8.s[0] OP_ii s16, s1, v9.s[0] OP_ri s17, s0, v9.s[0] OP_ir s17, s1, v8.s[0] .endm .macro SAVE1x1 mov pCRow1, pCRow0 ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s16, alphaV0_R fmls s0, s17, alphaV0_I fmla s1, s16, alphaV1_I fmla s1, s17, alphaV1_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] fmov alpha0_R, s0 fmov alpha0_I, s1 fmov alpha1_R, s0 fmov alpha1_I, s1 lsl LDC, LDC, #3 // ldc = ldc * 8 mov pB, origPB mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble cgemm_kernel_L2_BEGIN /******************************************************************************/ cgemm_kernel_L4_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 lsl temp, origK, #5 // k * 4 * 8 mov pA, origPA // pA = start of A array add ppA, temp, pA cgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble cgemm_kernel_L4_M4_BEGIN cgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
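// Note: the main 8x4 block below is software pipelined.  KERNEL8x4_I issues
// the first set of loads and multiplies, KERNEL8x4_M1/_M2 then alternate
// between the two register sets (v0-v3/v8-v9 and v4-v7/v12-v13) so that the
// loads for the next K step overlap the FMAs of the current one, and
// KERNEL8x4_E drains the pipeline without loading past the end of A and B.
// With counterL = K/2, at least two such pairs are needed to enter the
// pipelined path; shorter loops fall through to the _32/_40 tail handling.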
blt cgemm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K subs counterL, counterL, #2 // subtract 2 ble cgemm_kernel_L4_M8_22a .align 5 cgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 bgt cgemm_kernel_L4_M8_22 cgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E b cgemm_kernel_L4_M8_44 cgemm_kernel_L4_M8_32: tst counterL, #1 ble cgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_E b cgemm_kernel_L4_M8_44 cgemm_kernel_L4_M8_40: INIT8x4 cgemm_kernel_L4_M8_44: ands counterL , origK, #1 ble cgemm_kernel_L4_M8_100 cgemm_kernel_L4_M8_46: KERNEL8x4_SUB cgemm_kernel_L4_M8_100: SAVE8x4 cgemm_kernel_L4_M8_END: lsl temp, origK, #5 // k * 4 * 8 add pA, pA, temp add ppA, ppA, temp subs counterI, counterI, #1 bne cgemm_kernel_L4_M8_20 cgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 ble cgemm_kernel_L4_END tst counterI, #4 ble cgemm_kernel_L4_M2_BEGIN cgemm_kernel_L4_M4_20: INIT4x4 mov pB, origPB asr counterL, origK, #3 // counterL = counterL / 8 cmp counterL, #0 ble cgemm_kernel_L4_M4_40 cgemm_kernel_L4_M4_22: KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L4_M4_22 cgemm_kernel_L4_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L4_M4_100 cgemm_kernel_L4_M4_42: KERNEL4x4_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L4_M4_42 cgemm_kernel_L4_M4_100: SAVE4x4 cgemm_kernel_L4_M4_END: cgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble cgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble cgemm_kernel_L4_M1_BEGIN cgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble cgemm_kernel_L4_M2_40 cgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L4_M2_22 cgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L4_M2_100 cgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L4_M2_42 cgemm_kernel_L4_M2_100: SAVE2x4 cgemm_kernel_L4_M2_END: cgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble cgemm_kernel_L4_END cgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble cgemm_kernel_L4_M1_40 cgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L4_M1_22 cgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L4_M1_100 cgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L4_M1_42 cgemm_kernel_L4_M1_100: SAVE1x4 cgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- bgt cgemm_kernel_L4_BEGIN /******************************************************************************/ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble cgemm_kernel_L999 // error, N was less than 4? 
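// Note: the remaining N % 4 columns are handled in two stages below: bit 1 of
// counterJ selects an optional 2-column block (cgemm_kernel_L2_*) and bit 0
// selects a final single column (cgemm_kernel_L1_*), each with its own
// M = 4 / 2 / 1 sub-loops over the rows.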
tst counterJ , #2 ble cgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC add pC,pC,LDC, lsl #1 mov pA, origPA // pA = A cgemm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 ble cgemm_kernel_L2_M2_BEGIN cgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble cgemm_kernel_L2_M4_40 .align 5 cgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M4_22 cgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L2_M4_100 cgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M4_42 cgemm_kernel_L2_M4_100: SAVE4x2 cgemm_kernel_L2_M4_END: subs counterI, counterI, #1 bgt cgemm_kernel_L2_M4_20 cgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble cgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble cgemm_kernel_L2_M1_BEGIN cgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble cgemm_kernel_L2_M2_40 cgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M2_22 cgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L2_M2_100 cgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M2_42 cgemm_kernel_L2_M2_100: SAVE2x2 cgemm_kernel_L2_M2_END: cgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble cgemm_kernel_L2_END cgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 ble cgemm_kernel_L2_M1_40 cgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M1_22 cgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L2_M1_100 cgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M1_42 cgemm_kernel_L2_M1_100: SAVE1x2 cgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ cgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble cgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next mov pA, origPA // pA = A cgemm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble cgemm_kernel_L1_M2_BEGIN cgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble cgemm_kernel_L1_M4_40 .align 5 cgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M4_22 cgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L1_M4_100 cgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M4_42 cgemm_kernel_L1_M4_100: SAVE4x1 cgemm_kernel_L1_M4_END: subs counterI, counterI, #1 bgt cgemm_kernel_L1_M4_20 cgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 ble cgemm_kernel_L1_END tst counterI, #2 // 
counterI = counterI / 2 ble cgemm_kernel_L1_M1_BEGIN cgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble cgemm_kernel_L1_M2_40 cgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M2_22 cgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L1_M2_100 cgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M2_42 cgemm_kernel_L1_M2_100: SAVE2x1 cgemm_kernel_L1_M2_END: cgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble cgemm_kernel_L1_END cgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble cgemm_kernel_L1_M1_40 cgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M1_22 cgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L1_M1_100 cgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M1_42 cgemm_kernel_L1_M1_100: SAVE1x1 cgemm_kernel_L1_END: cgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/cgemm_kernel_8x4.S000066400000000000000000001260061313527062700206410ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 X3 x4 x5 x6 */ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define temp x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pCRow3 x15 #define pA x16 #define alphaR w17 #define alphaI w18 #define alpha0_R s10 #define alphaV0_R v10.s[0] #define alpha0_I s11 #define alphaV0_I v11.s[0] #define A_PRE_SIZE 2560 #define B_PRE_SIZE 448 #define C_PRE_SIZE 128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla #define OP_ii fmls #define OP_ri fmla #define OP_ir fmla #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define OP_rr fmla #define OP_ii fmla #define OP_ri fmls #define OP_ir fmla #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define OP_rr fmla #define OP_ii fmla #define OP_ri fmla #define OP_ir fmls #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) #define OP_rr fmla #define OP_ii fmls #define OP_ri fmls #define OP_ir fmls #endif // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 offset -> temp // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pCRow3 // 16 pA // 17 // 18 must save // 19 must save // 20 must save // 21 must save // 22 must save // 23 must save // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp //v00 ALPHA_R -> pA0_00_R, pA0_01_R, pA0_02_R, pA0_03_R //v01 ALPHA_I -> pA0_00_I, pA0_01_I, pA0_02_I, pA0_03_I //v02 pA0_04_R, pA0_05_R, pA0_06_R, pA0_07_R //v03 pA0_04_I, pA0_05_I, pA0_06_I, pA0_07_I //v04 pA1_00_R, pA1_01_R, pA1_02_R, pA1_03_R //v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I //v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R //v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I //v08 must save pB0_00_R, pB0_01_R //v09 must save pB0_00_I, pB0_01_I //v10 must save pB0_02_R, pB0_03_R --> ALPHA0_R //v11 must save pB0_02_I, pB0_03_I --> ALPHA0_I //v12 must save pB1_00_R, pB1_01_R //v13 must save pB1_00_I, pB1_01_I //v14 must save pB1_02_R, pB1_03_R //v15 must save pB1_02_I, pB1_03_I //v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R //v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I //v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R //v19 pC_04_I, pC_05_I, pC_06_I, pC_07_I //v20 pC_08_R, pC_09_R, pC_10_R, pC_11_R //v21 pC_08_I, pC_09_I, pC_10_I, pC_11_I //v22 pC_12_R, pC_13_R, pC_14_R, pC_15_R //v23 pC_12_I, pC_13_I, pC_14_I, pC_15_I //v24 pC_16_R, pC_17_R, pC_18_R, pC_19_R //v25 pC_16_I, pC_17_I, pC_18_I, pC_19_I //v26 pC_20_R, pC_21_R, pC_22_R, pC_23_R //v27 pC_20_I, pC_21_I, pC_22_I, pC_23_I //v28 pC_24_R, pC_25_R, pC_26_R, pC_27_R //v29 pC_24_I, pC_25_I, pC_26_I, pC_27_I //v30 pC_28_R, pC_29_R, pC_30_R, pC_31_R //v31 pC_28_I, pC_29_I, pC_30_I, pC_31_I /******************************************************************************* * Macro definitions *******************************************************************************/ .macro INIT8x4 fmov s16, wzr fmov s17, wzr fmov s18, wzr fmov s19, s16 fmov s20, wzr fmov s21, s16 fmov s22, s17 fmov s23, s18 fmov s24, wzr fmov s25, s16 fmov s26, s17 fmov s27, s18 fmov s28, wzr fmov s29, s16 fmov s30, s17 fmov s31, s18 .endm 
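// Note on the OP_* selection defined above (a reading of the existing #if ladder, not new
// functionality): each complex multiply-accumulate c += op(a) * op(b), with a = ar + i*ai
// and b = br + i*bi, is split into four vector FMA/FMS steps. For the plain NN/NT/TN/TT
// case this works out to
//   c_r += ar*br - ai*bi      (OP_rr, then OP_ii as fmls)
//   c_i += ar*bi + ai*br      (OP_ri, then OP_ir)
// The conjugate variants only flip the signs of the ii/ri/ir terms: the NR/NC family
// computes a*conj(b), the RN/CN family conj(a)*b, and the RR/CC family conj(a)*conj(b),
// which is why the defines switch individual terms between fmla and fmls.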
.macro KERNEL8x4_I ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 fmul v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b fmls v17.4s, v0.4s, v9.s[0] #else fmul v17.4s, v0.4s, v9.s[0] #endif OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v10.2s, v11.2s}, [pB] add pB, pB, #16 fmul v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b fmls v19.4s, v2.4s, v9.s[0] #else fmul v19.4s, v2.4s, v9.s[0] #endif OP_ir v19.4s, v3.4s, v8.s[0] ld2 {v12.2s, v13.2s}, [pB] add pB, pB, #16 fmul v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b fmls v21.4s, v0.4s, v9.s[1] #else fmul v21.4s, v0.4s, v9.s[1] #endif OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v14.2s, v15.2s}, [pB] add pB, pB, #16 fmul v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b fmls v23.4s, v2.4s, v9.s[1] #else fmul v23.4s, v2.4s, v9.s[1] #endif OP_ir v23.4s, v3.4s, v8.s[1] ld2 {v4.4s, v5.4s}, [pA] add pA, pA, #32 fmul v24.4s, v0.4s, v10.s[0] OP_ii v24.4s, v1.4s, v11.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b fmls v25.4s, v0.4s, v11.s[0] #else fmul v25.4s, v0.4s, v11.s[0] #endif OP_ir v25.4s, v1.4s, v10.s[0] ld2 {v6.4s, v7.4s}, [pA] add pA, pA, #32 fmul v26.4s, v2.4s, v10.s[0] OP_ii v26.4s, v3.4s, v11.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b fmls v27.4s, v2.4s, v11.s[0] #else fmul v27.4s, v2.4s, v11.s[0] #endif OP_ir v27.4s, v3.4s, v10.s[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmul v28.4s, v0.4s, v10.s[1] OP_ii v28.4s, v1.4s, v11.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b fmls v29.4s, v0.4s, v11.s[1] #else fmul v29.4s, v0.4s, v11.s[1] #endif OP_ir v29.4s, v1.4s, v10.s[1] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] fmul v30.4s, v2.4s, v10.s[1] OP_ii v30.4s, v3.4s, v11.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b fmls v31.4s, v2.4s, v11.s[1] #else fmul v31.4s, v2.4s, v11.s[1] #endif OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro KERNEL8x4_M1 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.2s, v13.2s}, [pB] add pB, pB, #16 OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] OP_ri v19.4s, v2.4s, v9.s[0] OP_ir v19.4s, v3.4s, v8.s[0] ld2 {v4.4s, v5.4s}, [pA] add pA, pA, #32 OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v6.4s, v7.4s}, [pA] add pA, pA, #32 OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] ld2 {v14.2s, v15.2s}, [pB] add 
pB, pB, #16 OP_rr v24.4s, v0.4s, v10.s[0] OP_ii v24.4s, v1.4s, v11.s[0] OP_ri v25.4s, v0.4s, v11.s[0] OP_ir v25.4s, v1.4s, v10.s[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] OP_rr v26.4s, v2.4s, v10.s[0] OP_ii v26.4s, v3.4s, v11.s[0] OP_ri v27.4s, v2.4s, v11.s[0] OP_ir v27.4s, v3.4s, v10.s[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] OP_rr v28.4s, v0.4s, v10.s[1] OP_ii v28.4s, v1.4s, v11.s[1] OP_ri v29.4s, v0.4s, v11.s[1] OP_ir v29.4s, v1.4s, v10.s[1] OP_rr v30.4s, v2.4s, v10.s[1] OP_ii v30.4s, v3.4s, v11.s[1] OP_ri v31.4s, v2.4s, v11.s[1] OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro KERNEL8x4_M2 OP_rr v16.4s, v4.4s, v12.s[0] OP_ii v16.4s, v5.4s, v13.s[0] OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 OP_rr v18.4s, v6.4s, v12.s[0] OP_ii v18.4s, v7.4s, v13.s[0] OP_ri v19.4s, v6.4s, v13.s[0] OP_ir v19.4s, v7.4s, v12.s[0] ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] ld2 {v10.2s, v11.2s}, [pB] add pB, pB, #16 OP_rr v24.4s, v4.4s, v14.s[0] OP_ii v24.4s, v5.4s, v15.s[0] OP_ri v25.4s, v4.4s, v15.s[0] OP_ir v25.4s, v5.4s, v14.s[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v26.4s, v6.4s, v14.s[0] OP_ii v26.4s, v7.4s, v15.s[0] OP_ri v27.4s, v6.4s, v15.s[0] OP_ir v27.4s, v7.4s, v14.s[0] OP_rr v28.4s, v4.4s, v14.s[1] OP_ii v28.4s, v5.4s, v15.s[1] OP_ri v29.4s, v4.4s, v15.s[1] OP_ir v29.4s, v5.4s, v14.s[1] OP_rr v30.4s, v6.4s, v14.s[1] OP_ii v30.4s, v7.4s, v15.s[1] OP_ri v31.4s, v6.4s, v15.s[1] OP_ir v31.4s, v7.4s, v14.s[1] .endm .macro KERNEL8x4_E OP_rr v16.4s, v4.4s, v12.s[0] OP_ii v16.4s, v5.4s, v13.s[0] OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] OP_rr v18.4s, v6.4s, v12.s[0] OP_ii v18.4s, v7.4s, v13.s[0] OP_ri v19.4s, v6.4s, v13.s[0] OP_ir v19.4s, v7.4s, v12.s[0] OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] OP_rr v24.4s, v4.4s, v14.s[0] OP_ii v24.4s, v5.4s, v15.s[0] OP_ri v25.4s, v4.4s, v15.s[0] OP_ir v25.4s, v5.4s, v14.s[0] OP_rr v26.4s, v6.4s, v14.s[0] OP_ii v26.4s, v7.4s, v15.s[0] OP_ri v27.4s, v6.4s, v15.s[0] OP_ir v27.4s, v7.4s, v14.s[0] OP_rr v28.4s, v4.4s, v14.s[1] OP_ii v28.4s, v5.4s, v15.s[1] OP_ri v29.4s, v4.4s, v15.s[1] OP_ir v29.4s, v5.4s, v14.s[1] OP_rr v30.4s, v6.4s, v14.s[1] OP_ii v30.4s, v7.4s, v15.s[1] OP_ri v31.4s, v6.4s, v15.s[1] OP_ir v31.4s, v7.4s, v14.s[1] .endm .macro KERNEL8x4_SUB ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v10.2s, v11.2s}, [pB] add pB, pB, #16 OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] OP_ri v19.4s, v2.4s, v9.s[0] OP_ir v19.4s, v3.4s, v8.s[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] OP_rr v24.4s, v0.4s, v10.s[0] OP_ii v24.4s, v1.4s, 
v11.s[0] OP_ri v25.4s, v0.4s, v11.s[0] OP_ir v25.4s, v1.4s, v10.s[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v26.4s, v2.4s, v10.s[0] OP_ii v26.4s, v3.4s, v11.s[0] OP_ri v27.4s, v2.4s, v11.s[0] OP_ir v27.4s, v3.4s, v10.s[0] OP_rr v28.4s, v0.4s, v10.s[1] OP_ii v28.4s, v1.4s, v11.s[1] OP_ri v29.4s, v0.4s, v11.s[1] OP_ir v29.4s, v1.4s, v10.s[1] OP_rr v30.4s, v2.4s, v10.s[1] OP_ii v30.4s, v3.4s, v11.s[1] OP_ri v31.4s, v2.4s, v11.s[1] OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro SAVE8x4 fmov alpha0_R, alphaR fmov alpha0_I, alphaI prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] ld2 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmla v1.4s, v16.4s, alphaV0_I fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow0] add pCRow0, pCRow0, #32 ld2 {v2.4s, v3.4s}, [pCRow0] fmla v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I fmla v3.4s, v18.4s, alphaV0_I fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow0] add pCRow0, pCRow0, #32 prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I fmla v5.4s, v20.4s, alphaV0_I fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow1, pCRow1, #32 ld2 {v6.4s, v7.4s}, [pCRow1] fmla v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I fmla v7.4s, v22.4s, alphaV0_I fmla v7.4s, v23.4s, alphaV0_R st2 {v6.4s, v7.4s}, [pCRow1] add pCRow1, pCRow1, #32 prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] ld2 {v0.4s, v1.4s}, [pCRow2] fmla v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I fmla v1.4s, v24.4s, alphaV0_I fmla v1.4s, v25.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow2] add pCRow2, pCRow2, #32 ld2 {v2.4s, v3.4s}, [pCRow2] fmla v2.4s, v26.4s, alphaV0_R fmls v2.4s, v27.4s, alphaV0_I fmla v3.4s, v26.4s, alphaV0_I fmla v3.4s, v27.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] add pCRow2, pCRow2, #32 prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] ld2 {v4.4s, v5.4s}, [pCRow3] fmla v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I fmla v5.4s, v28.4s, alphaV0_I fmla v5.4s, v29.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow3] add pCRow3, pCRow3, #32 ld2 {v6.4s, v7.4s}, [pCRow3] fmla v6.4s, v30.4s, alphaV0_R fmls v6.4s, v31.4s, alphaV0_I fmla v7.4s, v30.4s, alphaV0_I fmla v7.4s, v31.4s, alphaV0_R st2 {v6.4s, v7.4s}, [pCRow3] add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT4x4 fmov s16, wzr fmov s17, s16 fmov s20, s17 fmov s21, s16 fmov s24, s17 fmov s25, s16 fmov s28, s17 fmov s29, s16 .endm .macro KERNEL4x4_I ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 fmul v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b fmls v17.4s, v0.4s, v9.s[0] #else fmul v17.4s, v0.4s, v9.s[0] #endif OP_ir v17.4s, v1.4s, v8.s[0] fmul v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b fmls v21.4s, v0.4s, v9.s[1] #else fmul v21.4s, v0.4s, v9.s[1] #endif OP_ir v21.4s, v1.4s, v8.s[1] fmul v24.4s, v0.4s, v8.s[2] OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b fmls v25.4s, v0.4s, v9.s[2] #else fmul v25.4s, v0.4s, v9.s[2] #endif OP_ir v25.4s, v1.4s, v8.s[2] fmul v28.4s, 
v0.4s, v8.s[3] OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b fmls v29.4s, v0.4s, v9.s[3] #else fmul v29.4s, v0.4s, v9.s[3] #endif OP_ir v29.4s, v1.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 ld2 {v4.4s, v5.4s}, [pA] add pA, pA, #32 .endm .macro KERNEL4x4_M1 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v4.4s, v5.4s}, [pA] // For next round add pA, pA, #32 OP_rr v24.4s, v0.4s, v8.s[2] OP_ii v24.4s, v1.4s, v9.s[2] OP_ri v25.4s, v0.4s, v9.s[2] OP_ir v25.4s, v1.4s, v8.s[2] prfm PLDL1KEEP, [pA, #512] OP_rr v28.4s, v0.4s, v8.s[3] OP_ii v28.4s, v1.4s, v9.s[3] OP_ri v29.4s, v0.4s, v9.s[3] OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro KERNEL4x4_M2 OP_rr v16.4s, v4.4s, v12.s[0] OP_ii v16.4s, v5.4s, v13.s[0] OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.4s, v9.4s}, [pB] // For next round add pB, pB, #32 OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] ld2 {v0.4s, v1.4s}, [pA] // For next round add pA, pA, #32 OP_rr v24.4s, v4.4s, v12.s[2] OP_ii v24.4s, v5.4s, v13.s[2] OP_ri v25.4s, v4.4s, v13.s[2] OP_ir v25.4s, v5.4s, v12.s[2] prfm PLDL1KEEP, [pB, #512] OP_rr v28.4s, v4.4s, v12.s[3] OP_ii v28.4s, v5.4s, v13.s[3] OP_ri v29.4s, v4.4s, v13.s[3] OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_E OP_rr v16.4s, v4.4s, v12.s[0] OP_ii v16.4s, v5.4s, v13.s[0] OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] OP_rr v24.4s, v4.4s, v12.s[2] OP_ii v24.4s, v5.4s, v13.s[2] OP_ri v25.4s, v4.4s, v13.s[2] OP_ir v25.4s, v5.4s, v12.s[2] OP_rr v28.4s, v4.4s, v12.s[3] OP_ii v28.4s, v5.4s, v13.s[3] OP_ri v29.4s, v4.4s, v13.s[3] OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_SUB ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] OP_rr v24.4s, v0.4s, v8.s[2] OP_ii v24.4s, v1.4s, v9.s[2] OP_ri v25.4s, v0.4s, v9.s[2] OP_ir v25.4s, v1.4s, v8.s[2] OP_rr v28.4s, v0.4s, v8.s[3] OP_ii v28.4s, v1.4s, v9.s[3] OP_ri v29.4s, v0.4s, v9.s[3] OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro SAVE4x4 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmla v1.4s, v16.4s, alphaV0_I fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I fmla v5.4s, v20.4s, alphaV0_I fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I fmla v1.4s, v24.4s, alphaV0_I fmla v1.4s, v25.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I fmla v5.4s, v28.4s, alphaV0_I 
fmla v5.4s, v29.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x4 fmov s16, wzr fmov s17, wzr fmov s20, s16 fmov s21, s17 fmov s24, s16 fmov s25, s17 fmov s28, s16 fmov s29, s17 .endm .macro KERNEL2x4_SUB ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 OP_rr v16.2s, v0.2s, v8.s[0] OP_ii v16.2s, v1.2s, v9.s[0] OP_ri v17.2s, v0.2s, v9.s[0] OP_ir v17.2s, v1.2s, v8.s[0] OP_rr v20.2s, v0.2s, v8.s[1] OP_ii v20.2s, v1.2s, v9.s[1] OP_ri v21.2s, v0.2s, v9.s[1] OP_ir v21.2s, v1.2s, v8.s[1] OP_rr v24.2s, v0.2s, v8.s[2] OP_ii v24.2s, v1.2s, v9.s[2] OP_ri v25.2s, v0.2s, v9.s[2] OP_ir v25.2s, v1.2s, v8.s[2] OP_rr v28.2s, v0.2s, v8.s[3] OP_ii v28.2s, v1.2s, v9.s[3] OP_ri v29.2s, v0.2s, v9.s[3] OP_ir v29.2s, v1.2s, v8.s[3] .endm .macro SAVE2x4 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I fmla v1.2s, v16.2s, alphaV0_I fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.2s, v5.2s}, [pCRow1] fmla v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I fmla v5.2s, v20.2s, alphaV0_I fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v24.2s, alphaV0_R fmls v0.2s, v25.2s, alphaV0_I fmla v1.2s, v24.2s, alphaV0_I fmla v1.2s, v25.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.2s, v5.2s}, [pCRow1] fmla v4.2s, v28.2s, alphaV0_R fmls v4.2s, v29.2s, alphaV0_I fmla v5.2s, v28.2s, alphaV0_I fmla v5.2s, v29.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x4 fmov s16, wzr fmov s17, wzr fmov s20, s16 fmov s21, s17 fmov s24, s16 fmov s25, s17 fmov s28, s16 fmov s29, s17 .endm .macro KERNEL1x4_SUB ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 OP_rr s16, s0, v8.s[0] OP_ii s16, s1, v9.s[0] OP_ri s17, s0, v9.s[0] OP_ir s17, s1, v8.s[0] OP_rr s20, s0, v8.s[1] OP_ii s20, s1, v9.s[1] OP_ri s21, s0, v9.s[1] OP_ir s21, s1, v8.s[1] OP_rr s24, s0, v8.s[2] OP_ii s24, s1, v9.s[2] OP_ri s25, s0, v9.s[2] OP_ir s25, s1, v8.s[2] OP_rr s28, s0, v8.s[3] OP_ii s28, s1, v9.s[3] OP_ri s29, s0, v9.s[3] OP_ir s29, s1, v8.s[3] .endm .macro SAVE1x4 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s16, alphaV0_R fmls s0, s17, alphaV0_I fmla s1, s16, alphaV0_I fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.s, v5.s}[0], [pCRow1] fmla s4, s20, alphaV0_R fmls s4, s21, alphaV0_I fmla s5, s20, alphaV0_I fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow1, pCRow1, LDC ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s24, alphaV0_R fmls s0, s25, alphaV0_I fmla s1, s24, alphaV0_I fmla s1, s25, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.s, v5.s}[0], [pCRow1] fmla s4, s28, alphaV0_R fmls s4, s29, alphaV0_I fmla s5, s28, alphaV0_I fmla s5, s29, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT8x2 fmov s16, wzr fmov s17, wzr fmov s18, wzr fmov s19, s16 fmov s20, wzr fmov s21, s16 fmov s22, s17 fmov s23, s18 .endm .macro KERNEL8x2_SUB ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.4s, 
v1.4s}, [pA] add pA, pA, #32 ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] OP_ri v19.4s, v2.4s, v9.s[0] OP_ir v19.4s, v3.4s, v8.s[0] OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] .endm .macro SAVE8x2 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmla v1.4s, v16.4s, alphaV0_I fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow2, pCRow1, #32 ld2 {v2.4s, v3.4s}, [pCRow2] fmla v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I fmla v3.4s, v18.4s, alphaV0_I fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] add pCRow1, pCRow1, LDC ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I fmla v5.4s, v20.4s, alphaV0_I fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow2, pCRow1, #32 ld2 {v6.4s, v7.4s}, [pCRow2] fmla v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I fmla v7.4s, v22.4s, alphaV0_I fmla v7.4s, v23.4s, alphaV0_R st2 {v6.4s, v7.4s}, [pCRow2] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT4x2 fmov s16, wzr fmov s17, wzr fmov s20, s16 fmov s21, s17 .endm .macro KERNEL4x2_SUB ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] .endm .macro SAVE4x2 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmla v1.4s, v16.4s, alphaV0_I fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I fmla v5.4s, v20.4s, alphaV0_I fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x2 fmov s16, wzr fmov s17, wzr fmov s20, s16 fmov s21, s17 .endm .macro KERNEL2x2_SUB ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 OP_rr v16.2s, v0.2s, v8.s[0] OP_ii v16.2s, v1.2s, v9.s[0] OP_ri v17.2s, v0.2s, v9.s[0] OP_ir v17.2s, v1.2s, v8.s[0] OP_rr v20.2s, v0.2s, v8.s[1] OP_ii v20.2s, v1.2s, v9.s[1] OP_ri v21.2s, v0.2s, v9.s[1] OP_ir v21.2s, v1.2s, v8.s[1] .endm .macro SAVE2x2 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I fmla v1.2s, v16.2s, alphaV0_I fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.2s, v5.2s}, [pCRow1] fmla v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I fmla v5.2s, v20.2s, alphaV0_I fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x2 fmov s16, wzr fmov s17, wzr fmov s20, wzr fmov s21, 
wzr .endm .macro KERNEL1x2_SUB ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 OP_rr s16, s0, v8.s[0] OP_ii s16, s1, v9.s[0] OP_ri s17, s0, v9.s[0] OP_ir s17, s1, v8.s[0] OP_rr s20, s0, v8.s[1] OP_ii s20, s1, v9.s[1] OP_ri s21, s0, v9.s[1] OP_ir s21, s1, v8.s[1] .endm .macro SAVE1x2 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s16, alphaV0_R fmls s0, s17, alphaV0_I fmla s1, s16, alphaV0_I fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.s, v5.s}[0], [pCRow1] fmla s4, s20, alphaV0_R fmls s4, s21, alphaV0_I fmla s5, s20, alphaV0_I fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT8x1 fmov s16, wzr fmov s17, wzr fmov s18, wzr fmov s19, s16 .endm .macro KERNEL8x1_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v8.s[1] OP_ri v17.4s, v0.4s, v8.s[1] OP_ir v17.4s, v1.4s, v8.s[0] OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v8.s[1] OP_ri v19.4s, v2.4s, v8.s[1] OP_ir v19.4s, v3.4s, v8.s[0] .endm .macro SAVE8x1 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmla v1.4s, v16.4s, alphaV0_I fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, #32 ld2 {v2.4s, v3.4s}, [pCRow1] fmla v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I fmla v3.4s, v18.4s, alphaV0_I fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow1] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT4x1 fmov s16, wzr fmov s17, s16 .endm .macro KERNEL4x1_SUB ld2 {v8.s, v9.s}[0], [pB] add pB, pB, #8 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] .endm .macro SAVE4x1 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmla v1.4s, v16.4s, alphaV0_I fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x1 fmov s16, wzr fmov s17, wzr .endm .macro KERNEL2x1_SUB ld2 {v8.s, v9.s}[0], [pB] add pB, pB, #8 ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 OP_rr v16.2s, v0.2s, v8.s[0] OP_ii v16.2s, v1.2s, v9.s[0] OP_ri v17.2s, v0.2s, v9.s[0] OP_ir v17.2s, v1.2s, v8.s[0] .endm .macro SAVE2x1 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I fmla v1.2s, v16.2s, alphaV0_I fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x1 fmov s16, wzr fmov s17, wzr .endm .macro KERNEL1x1_SUB ld2 {v8.s, v9.s}[0], [pB] add pB, pB, #8 ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 OP_rr s16, s0, v8.s[0] OP_ii s16, s1, v9.s[0] OP_ri s17, s0, v9.s[0] OP_ir s17, s1, v8.s[0] .endm .macro SAVE1x1 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s16, alphaV0_R fmls s0, s17, alphaV0_I fmla s1, s16, 
alphaV0_I fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPA] fmov alphaR, s0 fmov alphaI, s1 lsl LDC, LDC, #3 // ldc = ldc * 8 mov pB, origPB mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble cgemm_kernel_L2_BEGIN /******************************************************************************/ cgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC add pCRow3, pCRow2, LDC add pC, pCRow3, LDC mov pA, origPA // pA = start of A array cgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble cgemm_kernel_L4_M4_BEGIN .align 5 cgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #3 cmp counterL , #2 blt cgemm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 ble cgemm_kernel_L4_M8_22a .align 5 cgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 bgt cgemm_kernel_L4_M8_22 .align 5 cgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b cgemm_kernel_L4_M8_44 .align 5 cgemm_kernel_L4_M8_32: tst counterL, #1 ble cgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b cgemm_kernel_L4_M8_44 cgemm_kernel_L4_M8_40: INIT8x4 cgemm_kernel_L4_M8_44: ands counterL , origK, #7 ble cgemm_kernel_L4_M8_100 .align 5 cgemm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 bne cgemm_kernel_L4_M8_46 cgemm_kernel_L4_M8_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE8x4 cgemm_kernel_L4_M8_END: subs counterI, counterI, #1 bne cgemm_kernel_L4_M8_20 cgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 ble cgemm_kernel_L4_END tst counterI, #4 ble cgemm_kernel_L4_M2_BEGIN cgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
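// The 4x4 block entered below is software pipelined: KERNEL4x4_I performs the first loads
// and multiplies, M1/M2 then alternate between the v0/v8 and v4/v12 register sets (each M
// step preloads the operands for the other), and KERNEL4x4_E drains the final set without
// loading. The cmp against #2 above ensures at least one full I ... E sequence fits before
// this path is taken.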
blt cgemm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 ble cgemm_kernel_L4_M4_22a .align 5 cgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 bgt cgemm_kernel_L4_M4_22 cgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E b cgemm_kernel_L4_M4_44 cgemm_kernel_L4_M4_32: tst counterL, #1 ble cgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E b cgemm_kernel_L4_M4_44 cgemm_kernel_L4_M4_40: INIT4x4 cgemm_kernel_L4_M4_44: ands counterL , origK, #1 ble cgemm_kernel_L4_M4_100 cgemm_kernel_L4_M4_46: KERNEL4x4_SUB cgemm_kernel_L4_M4_100: SAVE4x4 cgemm_kernel_L4_M4_END: cgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble cgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble cgemm_kernel_L4_M1_BEGIN cgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble cgemm_kernel_L4_M2_40 cgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L4_M2_22 cgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L4_M2_100 cgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L4_M2_42 cgemm_kernel_L4_M2_100: SAVE2x4 cgemm_kernel_L4_M2_END: cgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble cgemm_kernel_L4_END cgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble cgemm_kernel_L4_M1_40 cgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L4_M1_22 cgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L4_M1_100 cgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L4_M1_42 cgemm_kernel_L4_M1_100: SAVE1x4 cgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- bgt cgemm_kernel_L4_BEGIN /******************************************************************************/ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble cgemm_kernel_L999 // error, N was less than 4? 
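// (The branch above is not an error exit: origN & 3 == 0 simply means no columns are left
//  after the N/4 loop, so control goes straight to the restore/return code at
//  cgemm_kernel_L999. The tst #2 check below and the tst #1 check at cgemm_kernel_L1_BEGIN
//  then dispatch the remaining two-column and one-column tails.)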
tst counterJ , #2 ble cgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC add pC,pC,LDC, lsl #1 mov pA, origPA // pA = A cgemm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble cgemm_kernel_L2_M4_BEGIN cgemm_kernel_L2_M8_20: INIT8x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble cgemm_kernel_L2_M8_40 .align 5 cgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M8_22 cgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L2_M8_100 cgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M8_42 cgemm_kernel_L2_M8_100: SAVE8x2 cgemm_kernel_L2_M8_END: subs counterI, counterI, #1 bgt cgemm_kernel_L2_M8_20 cgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 ble cgemm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 ble cgemm_kernel_L2_M2_BEGIN cgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble cgemm_kernel_L2_M4_40 .align 5 cgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M4_22 cgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L2_M4_100 cgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M4_42 cgemm_kernel_L2_M4_100: SAVE4x2 cgemm_kernel_L2_M4_END: cgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble cgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble cgemm_kernel_L2_M1_BEGIN cgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble cgemm_kernel_L2_M2_40 cgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M2_22 cgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L2_M2_100 cgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M2_42 cgemm_kernel_L2_M2_100: SAVE2x2 cgemm_kernel_L2_M2_END: cgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble cgemm_kernel_L2_END cgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 ble cgemm_kernel_L2_M1_40 cgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M1_22 cgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L2_M1_100 cgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M1_42 cgemm_kernel_L2_M1_100: SAVE1x2 cgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ cgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble cgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next mov pA, origPA // pA = A cgemm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble cgemm_kernel_L1_M4_BEGIN 
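// The remaining tail loops below all share the same K handling as the block that follows:
// an 8x-unrolled inner loop over K >> 3, then a scalar drain over K & 7. A minimal C-style
// sketch (illustrative only; kernel_sub stands for the KERNEL*_SUB macro of the particular
// tile size):
//
//   for (long l = K >> 3; l > 0; l--)     // cgemm_kernel_*_22
//       for (int u = 0; u < 8; u++) kernel_sub();
//   for (long l = K & 7; l > 0; l--)      // cgemm_kernel_*_42
//       kernel_sub();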
cgemm_kernel_L1_M8_20: INIT8x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble cgemm_kernel_L1_M8_40 .align 5 cgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M8_22 cgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L1_M8_100 cgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M8_42 cgemm_kernel_L1_M8_100: SAVE8x1 cgemm_kernel_L1_M8_END: subs counterI, counterI, #1 bgt cgemm_kernel_L1_M8_20 cgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 ble cgemm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 ble cgemm_kernel_L1_M2_BEGIN cgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble cgemm_kernel_L1_M4_40 .align 5 cgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M4_22 cgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L1_M4_100 cgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M4_42 cgemm_kernel_L1_M4_100: SAVE4x1 cgemm_kernel_L1_M4_END: cgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 ble cgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 ble cgemm_kernel_L1_M1_BEGIN cgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble cgemm_kernel_L1_M2_40 cgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M2_22 cgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L1_M2_100 cgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M2_42 cgemm_kernel_L1_M2_100: SAVE2x1 cgemm_kernel_L1_M2_END: cgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble cgemm_kernel_L1_END cgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble cgemm_kernel_L1_M1_40 cgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M1_22 cgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L1_M1_100 cgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M1_42 cgemm_kernel_L1_M1_100: SAVE1x1 cgemm_kernel_L1_END: cgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S000066400000000000000000001262321313527062700232130ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2017, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 X3 x4 x5 x6 */ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define temp x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pCRow3 x15 #define pA x16 #define alphaR w17 #define alphaI w18 #define alpha0_R s10 #define alphaV0_R v10.s[0] #define alpha0_I s11 #define alphaV0_I v11.s[0] #define A_PRE_SIZE 2560 #define B_PRE_SIZE 448 #define C_PRE_SIZE 128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla #define OP_ii fmls #define OP_ri fmla #define OP_ir fmla #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define OP_rr fmla #define OP_ii fmla #define OP_ri fmls #define OP_ir fmla #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define OP_rr fmla #define OP_ii fmla #define OP_ri fmla #define OP_ir fmls #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) #define OP_rr fmla #define OP_ii fmls #define OP_ri fmls #define OP_ir fmls #endif // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 offset -> temp // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pCRow3 // 16 pA // 17 // 18 must save // 19 must save // 20 must save // 21 must save // 22 must save // 23 must save // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp //v00 ALPHA_R -> pA0_00_R, pA0_01_R, pA0_02_R, pA0_03_R //v01 ALPHA_I -> pA0_00_I, pA0_01_I, pA0_02_I, pA0_03_I //v02 pA0_04_R, pA0_05_R, pA0_06_R, pA0_07_R //v03 pA0_04_I, pA0_05_I, pA0_06_I, pA0_07_I //v04 pA1_00_R, pA1_01_R, pA1_02_R, pA1_03_R //v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I //v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R //v07 pA1_04_I, pA1_05_I, 
pA1_06_I, pA1_07_I //v08 must save pB0_00_R, pB0_01_R //v09 must save pB0_00_I, pB0_01_I //v10 must save pB0_02_R, pB0_03_R --> ALPHA0_R //v11 must save pB0_02_I, pB0_03_I --> ALPHA0_I //v12 must save pB1_00_R, pB1_01_R //v13 must save pB1_00_I, pB1_01_I //v14 must save pB1_02_R, pB1_03_R //v15 must save pB1_02_I, pB1_03_I //v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R //v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I //v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R //v19 pC_04_I, pC_05_I, pC_06_I, pC_07_I //v20 pC_08_R, pC_09_R, pC_10_R, pC_11_R //v21 pC_08_I, pC_09_I, pC_10_I, pC_11_I //v22 pC_12_R, pC_13_R, pC_14_R, pC_15_R //v23 pC_12_I, pC_13_I, pC_14_I, pC_15_I //v24 pC_16_R, pC_17_R, pC_18_R, pC_19_R //v25 pC_16_I, pC_17_I, pC_18_I, pC_19_I //v26 pC_20_R, pC_21_R, pC_22_R, pC_23_R //v27 pC_20_I, pC_21_I, pC_22_I, pC_23_I //v28 pC_24_R, pC_25_R, pC_26_R, pC_27_R //v29 pC_24_I, pC_25_I, pC_26_I, pC_27_I //v30 pC_28_R, pC_29_R, pC_30_R, pC_31_R //v31 pC_28_I, pC_29_I, pC_30_I, pC_31_I /******************************************************************************* * Macro definitions *******************************************************************************/ .macro INIT8x4 fmov s16, wzr fmov s17, wzr fmov s18, wzr fmov s19, s16 fmov s20, wzr fmov s21, s16 fmov s22, s17 fmov s23, s18 fmov s24, wzr fmov s25, s16 fmov s26, s17 fmov s27, s18 fmov s28, wzr fmov s29, s16 fmov s30, s17 fmov s31, s18 .endm .macro KERNEL8x4_I ldr q8, [pB] add pB, pB, #16 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 fmul v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v8.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b fmls v17.4s, v0.4s, v8.s[1] #else fmul v17.4s, v0.4s, v8.s[1] #endif OP_ir v17.4s, v1.4s, v8.s[0] ldr q10, [pB] add pB, pB, #16 fmul v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v8.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b fmls v19.4s, v2.4s, v8.s[1] #else fmul v19.4s, v2.4s, v8.s[1] #endif OP_ir v19.4s, v3.4s, v8.s[0] ldr q12, [pB] add pB, pB, #16 fmul v20.4s, v0.4s, v8.s[2] OP_ii v20.4s, v1.4s, v8.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b fmls v21.4s, v0.4s, v8.s[3] #else fmul v21.4s, v0.4s, v8.s[3] #endif OP_ir v21.4s, v1.4s, v8.s[2] ldr q14, [pB] add pB, pB, #16 fmul v22.4s, v2.4s, v8.s[2] OP_ii v22.4s, v3.4s, v8.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b fmls v23.4s, v2.4s, v8.s[3] #else fmul v23.4s, v2.4s, v8.s[3] #endif OP_ir v23.4s, v3.4s, v8.s[2] ld2 {v4.4s, v5.4s}, [pA] add pA, pA, #32 fmul v24.4s, v0.4s, v10.s[0] OP_ii v24.4s, v1.4s, v10.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b fmls v25.4s, v0.4s, v10.s[1] #else fmul v25.4s, v0.4s, v10.s[1] #endif OP_ir v25.4s, v1.4s, v10.s[0] ld2 {v6.4s, v7.4s}, [pA] add pA, pA, #32 fmul v26.4s, v2.4s, v10.s[0] OP_ii v26.4s, v3.4s, v10.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b fmls v27.4s, v2.4s, v10.s[1] #else fmul v27.4s, v2.4s, v10.s[1] 
#endif OP_ir v27.4s, v3.4s, v10.s[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmul v28.4s, v0.4s, v10.s[2] OP_ii v28.4s, v1.4s, v10.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b fmls v29.4s, v0.4s, v10.s[3] #else fmul v29.4s, v0.4s, v10.s[3] #endif OP_ir v29.4s, v1.4s, v10.s[2] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] fmul v30.4s, v2.4s, v10.s[2] OP_ii v30.4s, v3.4s, v10.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b fmls v31.4s, v2.4s, v10.s[3] #else fmul v31.4s, v2.4s, v10.s[3] #endif OP_ir v31.4s, v3.4s, v10.s[2] .endm .macro KERNEL8x4_M1 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v8.s[1] OP_ri v17.4s, v0.4s, v8.s[1] OP_ir v17.4s, v1.4s, v8.s[0] ldr q12, [pB] add pB, pB, #16 OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v8.s[1] OP_ri v19.4s, v2.4s, v8.s[1] OP_ir v19.4s, v3.4s, v8.s[0] ld2 {v4.4s, v5.4s}, [pA] add pA, pA, #32 OP_rr v20.4s, v0.4s, v8.s[2] OP_ii v20.4s, v1.4s, v8.s[3] OP_ri v21.4s, v0.4s, v8.s[3] OP_ir v21.4s, v1.4s, v8.s[2] ld2 {v6.4s, v7.4s}, [pA] add pA, pA, #32 OP_rr v22.4s, v2.4s, v8.s[2] OP_ii v22.4s, v3.4s, v8.s[3] OP_ri v23.4s, v2.4s, v8.s[3] OP_ir v23.4s, v3.4s, v8.s[2] ldr q14, [pB] add pB, pB, #16 OP_rr v24.4s, v0.4s, v10.s[0] OP_ii v24.4s, v1.4s, v10.s[1] OP_ri v25.4s, v0.4s, v10.s[1] OP_ir v25.4s, v1.4s, v10.s[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] OP_rr v26.4s, v2.4s, v10.s[0] OP_ii v26.4s, v3.4s, v10.s[1] OP_ri v27.4s, v2.4s, v10.s[1] OP_ir v27.4s, v3.4s, v10.s[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] OP_rr v28.4s, v0.4s, v10.s[2] OP_ii v28.4s, v1.4s, v10.s[3] OP_ri v29.4s, v0.4s, v10.s[3] OP_ir v29.4s, v1.4s, v10.s[2] OP_rr v30.4s, v2.4s, v10.s[2] OP_ii v30.4s, v3.4s, v10.s[3] OP_ri v31.4s, v2.4s, v10.s[3] OP_ir v31.4s, v3.4s, v10.s[2] .endm .macro KERNEL8x4_M2 OP_rr v16.4s, v4.4s, v12.s[0] OP_ii v16.4s, v5.4s, v12.s[1] OP_ri v17.4s, v4.4s, v12.s[1] OP_ir v17.4s, v5.4s, v12.s[0] ldr q8, [pB] add pB, pB, #16 OP_rr v18.4s, v6.4s, v12.s[0] OP_ii v18.4s, v7.4s, v12.s[1] OP_ri v19.4s, v6.4s, v12.s[1] OP_ir v19.4s, v7.4s, v12.s[0] ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v20.4s, v4.4s, v12.s[2] OP_ii v20.4s, v5.4s, v12.s[3] OP_ri v21.4s, v4.4s, v12.s[3] OP_ir v21.4s, v5.4s, v12.s[2] ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 OP_rr v22.4s, v6.4s, v12.s[2] OP_ii v22.4s, v7.4s, v12.s[3] OP_ri v23.4s, v6.4s, v12.s[3] OP_ir v23.4s, v7.4s, v12.s[2] ldr q10, [pB] add pB, pB, #16 OP_rr v24.4s, v4.4s, v14.s[0] OP_ii v24.4s, v5.4s, v14.s[1] OP_ri v25.4s, v4.4s, v14.s[1] OP_ir v25.4s, v5.4s, v14.s[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v26.4s, v6.4s, v14.s[0] OP_ii v26.4s, v7.4s, v14.s[1] OP_ri v27.4s, v6.4s, v14.s[1] OP_ir v27.4s, v7.4s, v14.s[0] OP_rr v28.4s, v4.4s, v14.s[2] OP_ii v28.4s, v5.4s, v14.s[3] OP_ri v29.4s, v4.4s, v14.s[3] OP_ir v29.4s, v5.4s, v14.s[2] OP_rr v30.4s, v6.4s, v14.s[2] OP_ii v30.4s, v7.4s, v14.s[3] OP_ri v31.4s, v6.4s, v14.s[3] OP_ir v31.4s, v7.4s, v14.s[2] .endm .macro KERNEL8x4_E OP_rr v16.4s, v4.4s, v12.s[0] OP_ii v16.4s, v5.4s, v12.s[1] OP_ri v17.4s, v4.4s, v12.s[1] OP_ir v17.4s, v5.4s, v12.s[0] OP_rr v18.4s, v6.4s, v12.s[0] OP_ii v18.4s, v7.4s, v12.s[1] OP_ri v19.4s, v6.4s, v12.s[1] OP_ir v19.4s, v7.4s, v12.s[0] OP_rr v20.4s, v4.4s, v12.s[2] OP_ii v20.4s, v5.4s, v12.s[3] OP_ri v21.4s, v4.4s, v12.s[3] OP_ir v21.4s, v5.4s, v12.s[2] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v22.4s, v6.4s, v12.s[2] OP_ii v22.4s, v7.4s, 
v12.s[3] OP_ri v23.4s, v6.4s, v12.s[3] OP_ir v23.4s, v7.4s, v12.s[2] OP_rr v24.4s, v4.4s, v14.s[0] OP_ii v24.4s, v5.4s, v14.s[1] OP_ri v25.4s, v4.4s, v14.s[1] OP_ir v25.4s, v5.4s, v14.s[0] OP_rr v26.4s, v6.4s, v14.s[0] OP_ii v26.4s, v7.4s, v14.s[1] OP_ri v27.4s, v6.4s, v14.s[1] OP_ir v27.4s, v7.4s, v14.s[0] OP_rr v28.4s, v4.4s, v14.s[2] OP_ii v28.4s, v5.4s, v14.s[3] OP_ri v29.4s, v4.4s, v14.s[3] OP_ir v29.4s, v5.4s, v14.s[2] OP_rr v30.4s, v6.4s, v14.s[2] OP_ii v30.4s, v7.4s, v14.s[3] OP_ri v31.4s, v6.4s, v14.s[3] OP_ir v31.4s, v7.4s, v14.s[2] .endm .macro KERNEL8x4_SUB ldr q8, [pB] add pB, pB, #16 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v8.s[1] OP_ri v17.4s, v0.4s, v8.s[1] OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 OP_rr v20.4s, v0.4s, v8.s[2] OP_ii v20.4s, v1.4s, v8.s[3] OP_ri v21.4s, v0.4s, v8.s[3] OP_ir v21.4s, v1.4s, v8.s[2] ldr q10, [pB] add pB, pB, #16 OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v8.s[1] OP_ri v19.4s, v2.4s, v8.s[1] OP_ir v19.4s, v3.4s, v8.s[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] OP_rr v22.4s, v2.4s, v8.s[2] OP_ii v22.4s, v3.4s, v8.s[3] OP_ri v23.4s, v2.4s, v8.s[3] OP_ir v23.4s, v3.4s, v8.s[2] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] OP_rr v24.4s, v0.4s, v10.s[0] OP_ii v24.4s, v1.4s, v10.s[1] OP_ri v25.4s, v0.4s, v10.s[1] OP_ir v25.4s, v1.4s, v10.s[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v26.4s, v2.4s, v10.s[0] OP_ii v26.4s, v3.4s, v10.s[1] OP_ri v27.4s, v2.4s, v10.s[1] OP_ir v27.4s, v3.4s, v10.s[0] OP_rr v28.4s, v0.4s, v10.s[2] OP_ii v28.4s, v1.4s, v10.s[3] OP_ri v29.4s, v0.4s, v10.s[3] OP_ir v29.4s, v1.4s, v10.s[2] OP_rr v30.4s, v2.4s, v10.s[2] OP_ii v30.4s, v3.4s, v10.s[3] OP_ri v31.4s, v2.4s, v10.s[3] OP_ir v31.4s, v3.4s, v10.s[2] .endm .macro SAVE8x4 fmov alpha0_R, alphaR fmov alpha0_I, alphaI prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] ld2 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmla v1.4s, v16.4s, alphaV0_I fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow0] add pCRow0, pCRow0, #32 ld2 {v2.4s, v3.4s}, [pCRow0] fmla v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I fmla v3.4s, v18.4s, alphaV0_I fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow0] add pCRow0, pCRow0, #32 prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I fmla v5.4s, v20.4s, alphaV0_I fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow1, pCRow1, #32 ld2 {v6.4s, v7.4s}, [pCRow1] fmla v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I fmla v7.4s, v22.4s, alphaV0_I fmla v7.4s, v23.4s, alphaV0_R st2 {v6.4s, v7.4s}, [pCRow1] add pCRow1, pCRow1, #32 prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] ld2 {v0.4s, v1.4s}, [pCRow2] fmla v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I fmla v1.4s, v24.4s, alphaV0_I fmla v1.4s, v25.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow2] add pCRow2, pCRow2, #32 ld2 {v2.4s, v3.4s}, [pCRow2] fmla v2.4s, v26.4s, alphaV0_R fmls v2.4s, v27.4s, alphaV0_I fmla v3.4s, v26.4s, alphaV0_I fmla v3.4s, v27.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] add pCRow2, pCRow2, #32 prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] ld2 {v4.4s, v5.4s}, [pCRow3] fmla v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I fmla v5.4s, v28.4s, alphaV0_I fmla v5.4s, v29.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow3] add pCRow3, pCRow3, #32 ld2 {v6.4s, v7.4s}, [pCRow3] fmla v6.4s, v30.4s, alphaV0_R fmls v6.4s, v31.4s, alphaV0_I fmla v7.4s, v30.4s, alphaV0_I fmla v7.4s, v31.4s, alphaV0_R st2 
{v6.4s, v7.4s}, [pCRow3] add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT4x4 fmov s16, wzr fmov s17, s16 fmov s20, s17 fmov s21, s16 fmov s24, s17 fmov s25, s16 fmov s28, s17 fmov s29, s16 .endm .macro KERNEL4x4_I ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 fmul v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b fmls v17.4s, v0.4s, v9.s[0] #else fmul v17.4s, v0.4s, v9.s[0] #endif OP_ir v17.4s, v1.4s, v8.s[0] fmul v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b fmls v21.4s, v0.4s, v9.s[1] #else fmul v21.4s, v0.4s, v9.s[1] #endif OP_ir v21.4s, v1.4s, v8.s[1] fmul v24.4s, v0.4s, v8.s[2] OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b fmls v25.4s, v0.4s, v9.s[2] #else fmul v25.4s, v0.4s, v9.s[2] #endif OP_ir v25.4s, v1.4s, v8.s[2] fmul v28.4s, v0.4s, v8.s[3] OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b fmls v29.4s, v0.4s, v9.s[3] #else fmul v29.4s, v0.4s, v9.s[3] #endif OP_ir v29.4s, v1.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 ld2 {v4.4s, v5.4s}, [pA] add pA, pA, #32 .endm .macro KERNEL4x4_M1 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v4.4s, v5.4s}, [pA] // For next round add pA, pA, #32 OP_rr v24.4s, v0.4s, v8.s[2] OP_ii v24.4s, v1.4s, v9.s[2] OP_ri v25.4s, v0.4s, v9.s[2] OP_ir v25.4s, v1.4s, v8.s[2] prfm PLDL1KEEP, [pA, #512] OP_rr v28.4s, v0.4s, v8.s[3] OP_ii v28.4s, v1.4s, v9.s[3] OP_ri v29.4s, v0.4s, v9.s[3] OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro KERNEL4x4_M2 OP_rr v16.4s, v4.4s, v12.s[0] OP_ii v16.4s, v5.4s, v13.s[0] OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.4s, v9.4s}, [pB] // For next round add pB, pB, #32 OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] ld2 {v0.4s, v1.4s}, [pA] // For next round add pA, pA, #32 OP_rr v24.4s, v4.4s, v12.s[2] OP_ii v24.4s, v5.4s, v13.s[2] OP_ri v25.4s, v4.4s, v13.s[2] OP_ir v25.4s, v5.4s, v12.s[2] prfm PLDL1KEEP, [pB, #512] OP_rr v28.4s, v4.4s, v12.s[3] OP_ii v28.4s, v5.4s, v13.s[3] OP_ri v29.4s, v4.4s, v13.s[3] OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_E OP_rr v16.4s, v4.4s, v12.s[0] OP_ii v16.4s, v5.4s, v13.s[0] OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] OP_rr v24.4s, v4.4s, v12.s[2] OP_ii v24.4s, v5.4s, v13.s[2] OP_ri v25.4s, v4.4s, v13.s[2] OP_ir v25.4s, v5.4s, v12.s[2] OP_rr v28.4s, v4.4s, v12.s[3] OP_ii v28.4s, v5.4s, v13.s[3] OP_ri v29.4s, v4.4s, v13.s[3] OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_SUB ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 ld2 
{v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] OP_rr v24.4s, v0.4s, v8.s[2] OP_ii v24.4s, v1.4s, v9.s[2] OP_ri v25.4s, v0.4s, v9.s[2] OP_ir v25.4s, v1.4s, v8.s[2] OP_rr v28.4s, v0.4s, v8.s[3] OP_ii v28.4s, v1.4s, v9.s[3] OP_ri v29.4s, v0.4s, v9.s[3] OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro SAVE4x4 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmla v1.4s, v16.4s, alphaV0_I fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I fmla v5.4s, v20.4s, alphaV0_I fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I fmla v1.4s, v24.4s, alphaV0_I fmla v1.4s, v25.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I fmla v5.4s, v28.4s, alphaV0_I fmla v5.4s, v29.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x4 fmov s16, wzr fmov s17, wzr fmov s20, s16 fmov s21, s17 fmov s24, s16 fmov s25, s17 fmov s28, s16 fmov s29, s17 .endm .macro KERNEL2x4_SUB ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 OP_rr v16.2s, v0.2s, v8.s[0] OP_ii v16.2s, v1.2s, v9.s[0] OP_ri v17.2s, v0.2s, v9.s[0] OP_ir v17.2s, v1.2s, v8.s[0] OP_rr v20.2s, v0.2s, v8.s[1] OP_ii v20.2s, v1.2s, v9.s[1] OP_ri v21.2s, v0.2s, v9.s[1] OP_ir v21.2s, v1.2s, v8.s[1] OP_rr v24.2s, v0.2s, v8.s[2] OP_ii v24.2s, v1.2s, v9.s[2] OP_ri v25.2s, v0.2s, v9.s[2] OP_ir v25.2s, v1.2s, v8.s[2] OP_rr v28.2s, v0.2s, v8.s[3] OP_ii v28.2s, v1.2s, v9.s[3] OP_ri v29.2s, v0.2s, v9.s[3] OP_ir v29.2s, v1.2s, v8.s[3] .endm .macro SAVE2x4 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I fmla v1.2s, v16.2s, alphaV0_I fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.2s, v5.2s}, [pCRow1] fmla v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I fmla v5.2s, v20.2s, alphaV0_I fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v24.2s, alphaV0_R fmls v0.2s, v25.2s, alphaV0_I fmla v1.2s, v24.2s, alphaV0_I fmla v1.2s, v25.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.2s, v5.2s}, [pCRow1] fmla v4.2s, v28.2s, alphaV0_R fmls v4.2s, v29.2s, alphaV0_I fmla v5.2s, v28.2s, alphaV0_I fmla v5.2s, v29.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x4 fmov s16, wzr fmov s17, wzr fmov s20, s16 fmov s21, s17 fmov s24, s16 fmov s25, s17 fmov s28, s16 fmov s29, s17 .endm .macro KERNEL1x4_SUB ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 OP_rr s16, s0, v8.s[0] OP_ii s16, s1, v9.s[0] OP_ri s17, s0, v9.s[0] OP_ir s17, s1, v8.s[0] OP_rr s20, s0, v8.s[1] OP_ii s20, s1, v9.s[1] OP_ri s21, s0, 
v9.s[1] OP_ir s21, s1, v8.s[1] OP_rr s24, s0, v8.s[2] OP_ii s24, s1, v9.s[2] OP_ri s25, s0, v9.s[2] OP_ir s25, s1, v8.s[2] OP_rr s28, s0, v8.s[3] OP_ii s28, s1, v9.s[3] OP_ri s29, s0, v9.s[3] OP_ir s29, s1, v8.s[3] .endm .macro SAVE1x4 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s16, alphaV0_R fmls s0, s17, alphaV0_I fmla s1, s16, alphaV0_I fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.s, v5.s}[0], [pCRow1] fmla s4, s20, alphaV0_R fmls s4, s21, alphaV0_I fmla s5, s20, alphaV0_I fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow1, pCRow1, LDC ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s24, alphaV0_R fmls s0, s25, alphaV0_I fmla s1, s24, alphaV0_I fmla s1, s25, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.s, v5.s}[0], [pCRow1] fmla s4, s28, alphaV0_R fmls s4, s29, alphaV0_I fmla s5, s28, alphaV0_I fmla s5, s29, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT8x2 fmov s16, wzr fmov s17, wzr fmov s18, wzr fmov s19, s16 fmov s20, wzr fmov s21, s16 fmov s22, s17 fmov s23, s18 .endm .macro KERNEL8x2_SUB ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] OP_ri v19.4s, v2.4s, v9.s[0] OP_ir v19.4s, v3.4s, v8.s[0] OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] .endm .macro SAVE8x2 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmla v1.4s, v16.4s, alphaV0_I fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow2, pCRow1, #32 ld2 {v2.4s, v3.4s}, [pCRow2] fmla v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I fmla v3.4s, v18.4s, alphaV0_I fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] add pCRow1, pCRow1, LDC ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I fmla v5.4s, v20.4s, alphaV0_I fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow2, pCRow1, #32 ld2 {v6.4s, v7.4s}, [pCRow2] fmla v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I fmla v7.4s, v22.4s, alphaV0_I fmla v7.4s, v23.4s, alphaV0_R st2 {v6.4s, v7.4s}, [pCRow2] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT4x2 fmov s16, wzr fmov s17, wzr fmov s20, s16 fmov s21, s17 .endm .macro KERNEL4x2_SUB ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] .endm .macro SAVE4x2 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmla v1.4s, v16.4s, alphaV0_I fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.4s, v5.4s}, [pCRow1] 
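// Note on the SAVE macros (sketch): the ld2/fmla/fmls/st2 groups above and below
// apply the complex scaling C += alpha * AB on de-interleaved real/imaginary
// planes. With the accumulated product AB held as (real, imag) register pairs
// such as v16/v17 or v20/v21, each group computes roughly
//   C_re += AB_re*alpha_re - AB_im*alpha_im
//   C_im += AB_re*alpha_im + AB_im*alpha_re
// i.e. an ordinary complex multiply of alpha with the accumulator before the
// result is stored back to the C row.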
fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I fmla v5.4s, v20.4s, alphaV0_I fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x2 fmov s16, wzr fmov s17, wzr fmov s20, s16 fmov s21, s17 .endm .macro KERNEL2x2_SUB ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 OP_rr v16.2s, v0.2s, v8.s[0] OP_ii v16.2s, v1.2s, v9.s[0] OP_ri v17.2s, v0.2s, v9.s[0] OP_ir v17.2s, v1.2s, v8.s[0] OP_rr v20.2s, v0.2s, v8.s[1] OP_ii v20.2s, v1.2s, v9.s[1] OP_ri v21.2s, v0.2s, v9.s[1] OP_ir v21.2s, v1.2s, v8.s[1] .endm .macro SAVE2x2 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I fmla v1.2s, v16.2s, alphaV0_I fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.2s, v5.2s}, [pCRow1] fmla v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I fmla v5.2s, v20.2s, alphaV0_I fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x2 fmov s16, wzr fmov s17, wzr fmov s20, wzr fmov s21, wzr .endm .macro KERNEL1x2_SUB ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 OP_rr s16, s0, v8.s[0] OP_ii s16, s1, v9.s[0] OP_ri s17, s0, v9.s[0] OP_ir s17, s1, v8.s[0] OP_rr s20, s0, v8.s[1] OP_ii s20, s1, v9.s[1] OP_ri s21, s0, v9.s[1] OP_ir s21, s1, v8.s[1] .endm .macro SAVE1x2 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s16, alphaV0_R fmls s0, s17, alphaV0_I fmla s1, s16, alphaV0_I fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.s, v5.s}[0], [pCRow1] fmla s4, s20, alphaV0_R fmls s4, s21, alphaV0_I fmla s5, s20, alphaV0_I fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT8x1 fmov s16, wzr fmov s17, wzr fmov s18, wzr fmov s19, s16 .endm .macro KERNEL8x1_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v8.s[1] OP_ri v17.4s, v0.4s, v8.s[1] OP_ir v17.4s, v1.4s, v8.s[0] OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v8.s[1] OP_ri v19.4s, v2.4s, v8.s[1] OP_ir v19.4s, v3.4s, v8.s[0] .endm .macro SAVE8x1 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmla v1.4s, v16.4s, alphaV0_I fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, #32 ld2 {v2.4s, v3.4s}, [pCRow1] fmla v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I fmla v3.4s, v18.4s, alphaV0_I fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow1] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT4x1 fmov s16, wzr fmov s17, s16 .endm .macro KERNEL4x1_SUB ld2 {v8.s, v9.s}[0], [pB] add pB, pB, #8 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] .endm .macro SAVE4x1 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, 
v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmla v1.4s, v16.4s, alphaV0_I fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x1 fmov s16, wzr fmov s17, wzr .endm .macro KERNEL2x1_SUB ld2 {v8.s, v9.s}[0], [pB] add pB, pB, #8 ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 OP_rr v16.2s, v0.2s, v8.s[0] OP_ii v16.2s, v1.2s, v9.s[0] OP_ri v17.2s, v0.2s, v9.s[0] OP_ir v17.2s, v1.2s, v8.s[0] .endm .macro SAVE2x1 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I fmla v1.2s, v16.2s, alphaV0_I fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x1 fmov s16, wzr fmov s17, wzr .endm .macro KERNEL1x1_SUB ld2 {v8.s, v9.s}[0], [pB] add pB, pB, #8 ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 OP_rr s16, s0, v8.s[0] OP_ii s16, s1, v9.s[0] OP_ri s17, s0, v9.s[0] OP_ir s17, s1, v8.s[0] .endm .macro SAVE1x1 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s16, alphaV0_R fmls s0, s17, alphaV0_I fmla s1, s16, alphaV0_I fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow0, pCRow0, #8 .endm .macro KERNEL8x4_M1_M2_x1 KERNEL8x4_M1 KERNEL8x4_M2 .endm .macro KERNEL8x4_M1_M2_x2 KERNEL8x4_M1_M2_x1 KERNEL8x4_M1_M2_x1 .endm .macro KERNEL8x4_M1_M2_x4 KERNEL8x4_M1_M2_x2 KERNEL8x4_M1_M2_x2 .endm .macro KERNEL8x4_M1_M2_x8 KERNEL8x4_M1_M2_x4 KERNEL8x4_M1_M2_x4 .endm .macro KERNEL8x4_M1_M2_x16 KERNEL8x4_M1_M2_x8 KERNEL8x4_M1_M2_x8 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPA] fmov alphaR, s0 fmov alphaI, s1 lsl LDC, LDC, #3 // ldc = ldc * 8 mov pB, origPB mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble cgemm_kernel_L2_BEGIN /******************************************************************************/ cgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC add pCRow3, pCRow2, LDC add pC, pCRow3, LDC mov pA, origPA // pA = start of A array cgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble cgemm_kernel_L4_M4_BEGIN .align 5 cgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #5 // origK / 32 cmp counterL , #2 blt cgemm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 KERNEL8x4_M1_M2_x1 KERNEL8x4_M1_M2_x2 KERNEL8x4_M1_M2_x4 KERNEL8x4_M1_M2_x8 subs counterL, counterL, #2 // subtract 2 ble cgemm_kernel_L4_M8_22a .align 5 cgemm_kernel_L4_M8_22: KERNEL8x4_M1_M2_x16 subs counterL, counterL, #1 bgt cgemm_kernel_L4_M8_22 .align 5 cgemm_kernel_L4_M8_22a: KERNEL8x4_M1_M2_x8 KERNEL8x4_M1_M2_x4 KERNEL8x4_M1_M2_x2 KERNEL8x4_M1_M2_x1 KERNEL8x4_M1 KERNEL8x4_E b cgemm_kernel_L4_M8_44 .align 5 cgemm_kernel_L4_M8_32: tst counterL, #1 ble 
cgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 KERNEL8x4_M1_M2_x8 KERNEL8x4_M1_M2_x4 KERNEL8x4_M1_M2_x2 KERNEL8x4_M1 KERNEL8x4_E b cgemm_kernel_L4_M8_44 cgemm_kernel_L4_M8_40: INIT8x4 cgemm_kernel_L4_M8_44: ands counterL , origK, #31 ble cgemm_kernel_L4_M8_100 .align 5 cgemm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 bne cgemm_kernel_L4_M8_46 cgemm_kernel_L4_M8_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE8x4 cgemm_kernel_L4_M8_END: subs counterI, counterI, #1 bne cgemm_kernel_L4_M8_20 cgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 ble cgemm_kernel_L4_END tst counterI, #4 ble cgemm_kernel_L4_M2_BEGIN cgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? blt cgemm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 ble cgemm_kernel_L4_M4_22a .align 5 cgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 bgt cgemm_kernel_L4_M4_22 cgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E b cgemm_kernel_L4_M4_44 cgemm_kernel_L4_M4_32: tst counterL, #1 ble cgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E b cgemm_kernel_L4_M4_44 cgemm_kernel_L4_M4_40: INIT4x4 cgemm_kernel_L4_M4_44: ands counterL , origK, #1 ble cgemm_kernel_L4_M4_100 cgemm_kernel_L4_M4_46: KERNEL4x4_SUB cgemm_kernel_L4_M4_100: SAVE4x4 cgemm_kernel_L4_M4_END: cgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble cgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble cgemm_kernel_L4_M1_BEGIN cgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble cgemm_kernel_L4_M2_40 cgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L4_M2_22 cgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L4_M2_100 cgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L4_M2_42 cgemm_kernel_L4_M2_100: SAVE2x4 cgemm_kernel_L4_M2_END: cgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble cgemm_kernel_L4_END cgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble cgemm_kernel_L4_M1_40 cgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L4_M1_22 cgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L4_M1_100 cgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L4_M1_42 cgemm_kernel_L4_M1_100: SAVE1x4 cgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- bgt cgemm_kernel_L4_BEGIN /******************************************************************************/ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble cgemm_kernel_L999 // error, N was less than 4? 
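// Tail handling in the N direction (sketch): the main loop above consumed the
// columns in groups of 4 (counterJ = N / 4). What remains here is N & 3
// columns, processed as one optional block of 2 (this L2 section) followed by
// one optional single column (the L1 section). In C-style pseudocode the
// column loop is roughly:
//   for (j = 0; j < N/4; j++)  { /* 4-column kernels */ }
//   if (N & 2)                 { /* 2-column kernels */ }
//   if (N & 1)                 { /* 1-column kernels */ }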
tst counterJ , #2 ble cgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC add pC,pC,LDC, lsl #1 mov pA, origPA // pA = A cgemm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble cgemm_kernel_L2_M4_BEGIN cgemm_kernel_L2_M8_20: INIT8x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble cgemm_kernel_L2_M8_40 .align 5 cgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M8_22 cgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L2_M8_100 cgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M8_42 cgemm_kernel_L2_M8_100: SAVE8x2 cgemm_kernel_L2_M8_END: subs counterI, counterI, #1 bgt cgemm_kernel_L2_M8_20 cgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 ble cgemm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 ble cgemm_kernel_L2_M2_BEGIN cgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble cgemm_kernel_L2_M4_40 .align 5 cgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M4_22 cgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L2_M4_100 cgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M4_42 cgemm_kernel_L2_M4_100: SAVE4x2 cgemm_kernel_L2_M4_END: cgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble cgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble cgemm_kernel_L2_M1_BEGIN cgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble cgemm_kernel_L2_M2_40 cgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M2_22 cgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L2_M2_100 cgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M2_42 cgemm_kernel_L2_M2_100: SAVE2x2 cgemm_kernel_L2_M2_END: cgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble cgemm_kernel_L2_END cgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 ble cgemm_kernel_L2_M1_40 cgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M1_22 cgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L2_M1_100 cgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L2_M1_42 cgemm_kernel_L2_M1_100: SAVE1x2 cgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ cgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble cgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next mov pA, origPA // pA = A cgemm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble cgemm_kernel_L1_M4_BEGIN 
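// Sketch of the K loop for the single-column 8x1 tile below: each
// KERNEL8x1_SUB consumes one k iteration, the main loop is unrolled by 8
// (counterL = K >> 3), and a scalar remainder loop handles K & 7. Roughly:
//   for (l = 0; l < (K >> 3); l++) { /* eight KERNEL8x1_SUB in a row */ }
//   for (l = 0; l < (K & 7);  l++) { /* one KERNEL8x1_SUB */ }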
cgemm_kernel_L1_M8_20: INIT8x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble cgemm_kernel_L1_M8_40 .align 5 cgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M8_22 cgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L1_M8_100 cgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M8_42 cgemm_kernel_L1_M8_100: SAVE8x1 cgemm_kernel_L1_M8_END: subs counterI, counterI, #1 bgt cgemm_kernel_L1_M8_20 cgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 ble cgemm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 ble cgemm_kernel_L1_M2_BEGIN cgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble cgemm_kernel_L1_M4_40 .align 5 cgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M4_22 cgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L1_M4_100 cgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M4_42 cgemm_kernel_L1_M4_100: SAVE4x1 cgemm_kernel_L1_M4_END: cgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 ble cgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 ble cgemm_kernel_L1_M1_BEGIN cgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble cgemm_kernel_L1_M2_40 cgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M2_22 cgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L1_M2_100 cgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M2_42 cgemm_kernel_L1_M2_100: SAVE2x1 cgemm_kernel_L1_M2_END: cgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble cgemm_kernel_L1_END cgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble cgemm_kernel_L1_M1_40 cgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M1_22 cgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble cgemm_kernel_L1_M1_100 cgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt cgemm_kernel_L1_M1_42 cgemm_kernel_L1_M1_100: SAVE1x1 cgemm_kernel_L1_END: cgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/copy.S000066400000000000000000000110471313527062700164560ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" #define N x0 /* vector length */ #define X x1 /* X vector address */ #define INC_X x2 /* X stride */ #define Y x3 /* Y vector address */ #define INC_Y x4 /* Y stride */ #define I x5 /* loop variable */ /******************************************************************************* * Macro definitions *******************************************************************************/ #if !defined(DOUBLE) #define TMPF s0 #define TMPVF {v0.s}[0] #define SZ 4 #else #define TMPF d0 #define TMPVF {v0.d}[0] #define SZ 8 #endif /******************************************************************************/ .macro KERNEL_F1 #if !defined(COMPLEX) ldr TMPF, [X], #SZ str TMPF, [Y], #SZ #else #if !defined(DOUBLE) ldr d0, [X], #8 str d0, [Y], #8 #else ldr q0, [X], #16 str q0, [Y], #16 #endif #endif .endm .macro KERNEL_F4 #if !defined(COMPLEX) #if !defined(DOUBLE) ldr q0, [X], #16 str q0, [Y], #16 #else // DOUBLE ldr q0, [X], #16 str q0, [Y], #16 ldr q1, [X], #16 str q1, [Y], #16 #endif #else // COMPLEX #if !defined(DOUBLE) ldr q0, [X], #16 str q0, [Y], #16 ldr q1, [X], #16 str q1, [Y], #16 #else // DOUBLE ldr q0, [X], #16 str q0, [Y], #16 ldr q1, [X], #16 str q1, [Y], #16 ldr q2, [X], #16 str q2, [Y], #16 ldr q3, [X], #16 str q3, [Y], #16 #endif #endif .endm .macro INIT_S #if !defined(COMPLEX) #if !defined(DOUBLE) lsl INC_X, INC_X, #2 lsl INC_Y, INC_Y, #2 #else lsl INC_X, INC_X, #3 lsl INC_Y, INC_Y, #3 #endif #else #if !defined(DOUBLE) lsl INC_X, INC_X, #3 lsl INC_Y, INC_Y, #3 #else lsl INC_X, INC_X, #4 lsl INC_Y, INC_Y, #4 #endif #endif .endm .macro KERNEL_S1 #if !defined(COMPLEX) #if !defined(DOUBLE) ldr w10, [X] add X, X, INC_X str w10, [Y] add Y, Y, INC_Y #else ldr x10, [X] add X, X, INC_X str x10, [Y] add Y, Y, INC_Y #endif #else #if !defined(DOUBLE) ld1 {v0.2s}, [X] add X, X, INC_X st1 {v0.2s}, [Y] add Y, Y, INC_Y #else ld1 {v0.2d}, [X] add X, X, INC_X st1 {v0.2d}, [Y] add Y, Y, INC_Y #endif #endif .endm /******************************************************************************* * 
End of macro definitions *******************************************************************************/ PROLOGUE cmp N, xzr ble copy_kernel_L999 cmp INC_X, #1 bne copy_kernel_S_BEGIN cmp INC_Y, #1 bne copy_kernel_S_BEGIN copy_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr beq copy_kernel_F1 copy_kernel_F4: KERNEL_F4 subs I, I, #1 bne copy_kernel_F4 copy_kernel_F1: ands I, N, #3 ble copy_kernel_L999 copy_kernel_F10: KERNEL_F1 subs I, I, #1 bne copy_kernel_F10 mov w0, wzr ret copy_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr ble copy_kernel_S1 copy_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne copy_kernel_S4 copy_kernel_S1: ands I, N, #3 ble copy_kernel_L999 copy_kernel_S10: KERNEL_S1 subs I, I, #1 bne copy_kernel_S10 copy_kernel_L999: mov w0, wzr ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/copy_thunderx2t99.c000066400000000000000000000127461313527062700210560ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" #include #define N "x0" /* vector length */ #define X "x1" /* X vector address */ #define INC_X "x2" /* X stride */ #define Y "x3" /* Y vector address */ #define INC_Y "x4" /* Y stride */ #define J "x5" /* loop variable */ /******************************************************************************* * Macro definitions *******************************************************************************/ #if !defined(COMPLEX) #if !defined(DOUBLE) #define TMPF "s0" #define INC_SHIFT "2" #define N_DIV_SHIFT "2" #define N_REM_MASK "3" #else #define TMPF "d0" #define INC_SHIFT "3" #define N_DIV_SHIFT "1" #define N_REM_MASK "1" #endif #else #if !defined(DOUBLE) #define TMPF "d0" #define INC_SHIFT "3" #define N_DIV_SHIFT "1" #define N_REM_MASK "1" #else #define TMPF "q0" #define INC_SHIFT "4" #define N_DIV_SHIFT "0" #define N_REM_MASK "0" #endif #endif #define KERNEL_F1 \ "ldr "TMPF", ["X"] \n" \ "add "X", "X", "INC_X" \n" \ "str "TMPF", ["Y"] \n" \ "add "Y", "Y", "INC_Y" \n" #define KERNEL_F \ "ldr q0, ["X"], #16 \n" \ "str q0, ["Y"], #16 \n" #define INIT \ "lsl "INC_X", "INC_X", #"INC_SHIFT" \n" \ "lsl "INC_Y", "INC_Y", #"INC_SHIFT" \n" static int do_copy(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { if ( n < 0 ) return 0; __asm__ __volatile__ ( " mov "N", %[N_] \n" " mov "X", %[X_] \n" " mov "INC_X", %[INCX_] \n" " mov "Y", %[Y_] \n" " mov "INC_Y", %[INCY_] \n" " cmp "N", xzr \n" " ble .Lcopy_kernel_L999 \n" " cmp "INC_X", #1 \n" " bne .Lcopy_kernel_S_BEGIN \n" " cmp "INC_Y", #1 \n" " bne .Lcopy_kernel_S_BEGIN \n" ".Lcopy_kernel_F_BEGIN: \n" " "INIT" \n" " asr "J", "N", #"N_DIV_SHIFT" \n" " cmp "J", xzr \n" " beq .Lcopy_kernel_F1 \n" " .align 5 \n" ".Lcopy_kernel_F: \n" " "KERNEL_F" \n" " subs "J", "J", #1 \n" " bne .Lcopy_kernel_F \n" ".Lcopy_kernel_F1: \n" #if defined(COMPLEX) && defined(DOUBLE) " b .Lcopy_kernel_L999 \n" #else " ands "J", "N", #"N_REM_MASK" \n" " ble .Lcopy_kernel_L999 \n" #endif ".Lcopy_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" " bne .Lcopy_kernel_F10 \n" " b .Lcopy_kernel_L999 \n" ".Lcopy_kernel_S_BEGIN: \n" " "INIT" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" " ble .Lcopy_kernel_S1 \n" ".Lcopy_kernel_S4: \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" " bne .Lcopy_kernel_S4 \n" ".Lcopy_kernel_S1: \n" " ands "J", "N", #3 \n" " ble .Lcopy_kernel_L999 \n" ".Lcopy_kernel_S10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" " bne .Lcopy_kernel_S10 \n" ".Lcopy_kernel_L999: \n" : : [N_] "r" (n), //%1 [X_] "r" (x), //%2 [INCX_] "r" (inc_x), //%3 [Y_] "r" (y), //%4 [INCY_] "r" (inc_y) //%5 : "cc", "memory", "x0", "x1", "x2", "x3", "x4", "x5", "d0" ); return 0; } #if defined(SMP) static int copy_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy3, BLASLONG dummy4) { do_copy(n, x, inc_x, y, inc_y); return 0; } #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { #if defined(SMP) int nthreads; FLOAT dummy_alpha; #endif if (n <= 0) return 0; #if defined(SMP) nthreads = num_cpu_avail(1); if (inc_x == 0) nthreads = 1; if (n <= 10000) nthreads = 1; if (nthreads == 1) { do_copy(n, x, inc_x, y, inc_y); } else { int mode = 0; #if !defined(COMPLEX) mode = BLAS_REAL; #else mode = BLAS_COMPLEX; #endif #if !defined(DOUBLE) mode |= BLAS_SINGLE; #else mode |= BLAS_DOUBLE; #endif 
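  /* When several threads are usable, the copy is partitioned across them by
   * blas_level1_thread(), using the BLAS_REAL/BLAS_COMPLEX and
   * BLAS_SINGLE/BLAS_DOUBLE flags assembled in "mode" above. Small vectors
   * (n <= 10000) and the inc_x == 0 case were already forced onto the
   * single-threaded do_copy() path a few lines earlier. */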
blas_level1_thread(mode, n, 0, 0, &dummy_alpha, x, inc_x, y, inc_y, NULL, 0, ( void *)copy_thread_function, nthreads); } #else do_copy(n, x, inc_x, y, inc_y); #endif return 0; } OpenBLAS-0.2.20/kernel/arm64/ctrmm_kernel_4x4.S000066400000000000000000001001521313527062700206610ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 s1 X3 x4 x5 x6 x7*/ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0, FLOAT alpha1,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset */ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define offset x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pA x15 #define temp x16 #define tempOffset x17 #define tempK x18 #define alpha0_R s10 #define alphaV0_R v10.s[0] #define alpha0_I s11 #define alphaV0_I v11.s[0] #define alpha1_R s14 #define alphaV1_R v14.s[0] #define alpha1_I s15 #define alphaV1_I v15.s[0] #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla #define OP_ii fmls #define OP_ri fmla #define OP_ir fmla #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define OP_rr fmla #define OP_ii fmla #define OP_ri fmls #define OP_ir fmla #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define OP_rr fmla #define OP_ii fmla #define OP_ri fmla #define OP_ir fmls #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) #define OP_rr fmla #define OP_ii fmls #define OP_ri fmls #define OP_ir fmls #endif // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 offset // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pA // 16 temp // 17 tempOffset // 18 must save tempK // 19 must save // 20 must save // 21 must save // 22 must save // 23 must save // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp //v00 ALPHA_R -> pA00_R, pA01_R, pA02_R, pA03_R //v01 ALPHA_I -> pA00_I, pA01_I, pA02_I, pA03_I //v02 //v03 //v04 pA10_R, pA11_R, pA12_R, pA13_R //v05 pA10_I, pA11_I, pA12_I, pA13_I //v06 //v07 //v08 must save pB00_R, pB01_R, pB02_R, pB03_R //v09 must save pB00_I, pB01_I, pB02_I, pB03_I //v10 must save ALPHA0_R //v11 must save ALPHA0_I //v12 must save pB10_R, pB11_R, pB12_R, pB13_R //v13 must save pB10_I, pB11_I, pB12_I, pB13_I //v14 must save ALPHA1_R //v15 must save ALPHA1_I //v16 must save pC00_R, pC01_R, pC02_R, pC03_R //v17 must save pC00_I, pC01_I, pC02_I, pC03_I //v18 //v19 //v20 pC10_R, pC11_R, pC12_R, pC13_R //v21 pC10_I, pC11_I, pC12_I, pC13_I //v22 //v23 //v24 pC20_R, pC21_R, pC22_R, pC23_R //v25 pC20_I, pC21_I, pC22_I, pC23_I //v26 //v27 //v28 pC30_R, pC31_R, pC32_R, pC33_R //v29 pC30_I, pC31_I, pC32_I, pC33_I //v30 //v31 /******************************************************************************* * Macro definitions *******************************************************************************/ .macro INIT4x4 fmov s16, wzr fmov s17, s16 fmov s20, s17 fmov s21, s16 fmov s24, s17 fmov s25, s16 fmov s28, s17 fmov s29, s16 .endm .macro KERNEL4x4_I ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 fmul v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b fmls v17.4s, v0.4s, v9.s[0] #else fmul v17.4s, v0.4s, v9.s[0] #endif OP_ir v17.4s, v1.4s, v8.s[0] fmul v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) 
|| defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b fmls v21.4s, v0.4s, v9.s[1] #else fmul v21.4s, v0.4s, v9.s[1] #endif OP_ir v21.4s, v1.4s, v8.s[1] fmul v24.4s, v0.4s, v8.s[2] OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b fmls v25.4s, v0.4s, v9.s[2] #else fmul v25.4s, v0.4s, v9.s[2] #endif OP_ir v25.4s, v1.4s, v8.s[2] fmul v28.4s, v0.4s, v8.s[3] OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b fmls v29.4s, v0.4s, v9.s[3] #else fmul v29.4s, v0.4s, v9.s[3] #endif OP_ir v29.4s, v1.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 ld2 {v4.4s, v5.4s}, [pA] add pA, pA, #32 .endm .macro KERNEL4x4_M1 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v4.4s, v5.4s}, [pA] // For next round add pA, pA, #32 OP_rr v24.4s, v0.4s, v8.s[2] OP_ii v24.4s, v1.4s, v9.s[2] OP_ri v25.4s, v0.4s, v9.s[2] OP_ir v25.4s, v1.4s, v8.s[2] prfm PLDL1KEEP, [pA, #512] OP_rr v28.4s, v0.4s, v8.s[3] OP_ii v28.4s, v1.4s, v9.s[3] OP_ri v29.4s, v0.4s, v9.s[3] OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro KERNEL4x4_M2 OP_rr v16.4s, v4.4s, v12.s[0] OP_ii v16.4s, v5.4s, v13.s[0] OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.4s, v9.4s}, [pB] // For next round add pB, pB, #32 OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] ld2 {v0.4s, v1.4s}, [pA] // For next round add pA, pA, #32 OP_rr v24.4s, v4.4s, v12.s[2] OP_ii v24.4s, v5.4s, v13.s[2] OP_ri v25.4s, v4.4s, v13.s[2] OP_ir v25.4s, v5.4s, v12.s[2] prfm PLDL1KEEP, [pB, #512] OP_rr v28.4s, v4.4s, v12.s[3] OP_ii v28.4s, v5.4s, v13.s[3] OP_ri v29.4s, v4.4s, v13.s[3] OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_E OP_rr v16.4s, v4.4s, v12.s[0] OP_ii v16.4s, v5.4s, v13.s[0] OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] OP_rr v24.4s, v4.4s, v12.s[2] OP_ii v24.4s, v5.4s, v13.s[2] OP_ri v25.4s, v4.4s, v13.s[2] OP_ir v25.4s, v5.4s, v12.s[2] OP_rr v28.4s, v4.4s, v12.s[3] OP_ii v28.4s, v5.4s, v13.s[3] OP_ri v29.4s, v4.4s, v13.s[3] OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_SUB ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] OP_rr v24.4s, v0.4s, v8.s[2] OP_ii v24.4s, v1.4s, v9.s[2] OP_ri v25.4s, v0.4s, v9.s[2] OP_ir v25.4s, v1.4s, v8.s[2] OP_rr v28.4s, v0.4s, v8.s[3] OP_ii v28.4s, v1.4s, v9.s[3] OP_ri v29.4s, v0.4s, v9.s[3] OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro SAVE4x4 mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmul v1.4s, v16.4s, alphaV1_I fmla v1.4s, v17.4s, alphaV1_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I fmul v5.4s, v20.4s, alphaV1_I fmla v5.4s, v21.4s, 
alphaV1_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow1, pCRow1, LDC fmul v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I fmul v1.4s, v24.4s, alphaV1_I fmla v1.4s, v25.4s, alphaV1_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I fmul v5.4s, v28.4s, alphaV1_I fmla v5.4s, v29.4s, alphaV1_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x4 fmov s16, wzr fmov s17, wzr fmov s20, s16 fmov s21, s17 fmov s24, s16 fmov s25, s17 fmov s28, s16 fmov s29, s17 .endm .macro KERNEL2x4_SUB ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 OP_rr v16.2s, v0.2s, v8.s[0] OP_ii v16.2s, v1.2s, v9.s[0] OP_ri v17.2s, v0.2s, v9.s[0] OP_ir v17.2s, v1.2s, v8.s[0] OP_rr v20.2s, v0.2s, v8.s[1] OP_ii v20.2s, v1.2s, v9.s[1] OP_ri v21.2s, v0.2s, v9.s[1] OP_ir v21.2s, v1.2s, v8.s[1] OP_rr v24.2s, v0.2s, v8.s[2] OP_ii v24.2s, v1.2s, v9.s[2] OP_ri v25.2s, v0.2s, v9.s[2] OP_ir v25.2s, v1.2s, v8.s[2] OP_rr v28.2s, v0.2s, v8.s[3] OP_ii v28.2s, v1.2s, v9.s[3] OP_ri v29.2s, v0.2s, v9.s[3] OP_ir v29.2s, v1.2s, v8.s[3] .endm .macro SAVE2x4 mov pCRow1, pCRow0 fmul v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I fmul v1.2s, v16.2s, alphaV1_I fmla v1.2s, v17.2s, alphaV1_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I fmul v5.2s, v20.2s, alphaV1_I fmla v5.2s, v21.2s, alphaV1_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow1, pCRow1, LDC fmul v0.2s, v24.2s, alphaV0_R fmls v0.2s, v25.2s, alphaV0_I fmul v1.2s, v24.2s, alphaV1_I fmla v1.2s, v25.2s, alphaV1_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.2s, v28.2s, alphaV0_R fmls v4.2s, v29.2s, alphaV0_I fmul v5.2s, v28.2s, alphaV1_I fmla v5.2s, v29.2s, alphaV1_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x4 fmov s16, wzr fmov s17, wzr fmov s20, s16 fmov s21, s17 fmov s24, s16 fmov s25, s17 fmov s28, s16 fmov s29, s17 .endm .macro KERNEL1x4_SUB ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 OP_rr s16, s0, v8.s[0] OP_ii s16, s1, v9.s[0] OP_ri s17, s0, v9.s[0] OP_ir s17, s1, v8.s[0] OP_rr s20, s0, v8.s[1] OP_ii s20, s1, v9.s[1] OP_ri s21, s0, v9.s[1] OP_ir s21, s1, v8.s[1] OP_rr s24, s0, v8.s[2] OP_ii s24, s1, v9.s[2] OP_ri s25, s0, v9.s[2] OP_ir s25, s1, v8.s[2] OP_rr s28, s0, v8.s[3] OP_ii s28, s1, v9.s[3] OP_ri s29, s0, v9.s[3] OP_ir s29, s1, v8.s[3] .endm .macro SAVE1x4 mov pCRow1, pCRow0 fmul s0, s16, alphaV0_R fmls s0, s17, alphaV0_I fmul s1, s16, alphaV1_I fmla s1, s17, alphaV1_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul s4, s20, alphaV0_R fmls s4, s21, alphaV0_I fmul s5, s20, alphaV1_I fmla s5, s21, alphaV1_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul s0, s24, alphaV0_R fmls s0, s25, alphaV0_I fmul s1, s24, alphaV1_I fmla s1, s25, alphaV1_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul s4, s28, alphaV0_R fmls s4, s29, alphaV0_I fmul s5, s28, alphaV1_I fmla s5, s29, alphaV1_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT4x2 fmov s16, wzr fmov s17, wzr fmov s20, s16 fmov s21, s17 .endm .macro KERNEL4x2_SUB ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] 
OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] .endm .macro SAVE4x2 mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmul v1.4s, v16.4s, alphaV1_I fmla v1.4s, v17.4s, alphaV1_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I fmul v5.4s, v20.4s, alphaV1_I fmla v5.4s, v21.4s, alphaV1_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x2 fmov s16, wzr fmov s17, wzr fmov s20, s16 fmov s21, s17 .endm .macro KERNEL2x2_SUB ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 OP_rr v16.2s, v0.2s, v8.s[0] OP_ii v16.2s, v1.2s, v9.s[0] OP_ri v17.2s, v0.2s, v9.s[0] OP_ir v17.2s, v1.2s, v8.s[0] OP_rr v20.2s, v0.2s, v8.s[1] OP_ii v20.2s, v1.2s, v9.s[1] OP_ri v21.2s, v0.2s, v9.s[1] OP_ir v21.2s, v1.2s, v8.s[1] .endm .macro SAVE2x2 mov pCRow1, pCRow0 fmul v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I fmul v1.2s, v16.2s, alphaV1_I fmla v1.2s, v17.2s, alphaV1_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I fmul v5.2s, v20.2s, alphaV1_I fmla v5.2s, v21.2s, alphaV1_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x2 fmov s16, wzr fmov s17, wzr fmov s20, wzr fmov s21, wzr .endm .macro KERNEL1x2_SUB ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 OP_rr s16, s0, v8.s[0] OP_ii s16, s1, v9.s[0] OP_ri s17, s0, v9.s[0] OP_ir s17, s1, v8.s[0] OP_rr s20, s0, v8.s[1] OP_ii s20, s1, v9.s[1] OP_ri s21, s0, v9.s[1] OP_ir s21, s1, v8.s[1] .endm .macro SAVE1x2 mov pCRow1, pCRow0 fmul s0, s16, alphaV0_R fmls s0, s17, alphaV0_I fmul s1, s16, alphaV1_I fmla s1, s17, alphaV1_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul s4, s20, alphaV0_R fmls s4, s21, alphaV0_I fmul s5, s20, alphaV1_I fmla s5, s21, alphaV1_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT4x1 fmov s16, wzr fmov s17, s16 .endm .macro KERNEL4x1_SUB ld2 {v8.s, v9.s}[0], [pB] add pB, pB, #8 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] .endm .macro SAVE4x1 mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmul v1.4s, v16.4s, alphaV1_I fmla v1.4s, v17.4s, alphaV1_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x1 fmov s16, wzr fmov s17, wzr .endm .macro KERNEL2x1_SUB ld2 {v8.s, v9.s}[0], [pB] add pB, pB, #8 ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 OP_rr v16.2s, v0.2s, v8.s[0] OP_ii v16.2s, v1.2s, v9.s[0] OP_ri v17.2s, v0.2s, v9.s[0] OP_ir v17.2s, v1.2s, v8.s[0] .endm .macro SAVE2x1 mov pCRow1, pCRow0 fmul v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I fmul v1.2s, v16.2s, alphaV1_I fmla v1.2s, v17.2s, alphaV1_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x1 fmov s16, wzr fmov s17, wzr .endm .macro 
KERNEL1x1_SUB ld2 {v8.s, v9.s}[0], [pB] add pB, pB, #8 ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 OP_rr s16, s0, v8.s[0] OP_ii s16, s1, v9.s[0] OP_ri s17, s0, v9.s[0] OP_ir s17, s1, v8.s[0] .endm .macro SAVE1x1 mov pCRow1, pCRow0 fmul s0, s16, alphaV0_R fmls s0, s17, alphaV0_I fmul s1, s16, alphaV1_I fmla s1, s17, alphaV1_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] fmov alpha0_R, s0 fmov alpha0_I, s1 fmov alpha1_R, s0 fmov alpha1_I, s1 lsl LDC, LDC, #3 // ldc = ldc * 8 #if !defined(LEFT) neg tempOffset, offset #endif mov pB, origPB mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble ctrmm_kernel_L2_BEGIN /******************************************************************************/ ctrmm_kernel_L4_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = start of A array ctrmm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble ctrmm_kernel_L4_M2_BEGIN ctrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pB, pB, temp add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
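// The KERNEL4x4_I / _M1 / _M2 / _E macros below form a software pipeline over
// K: _I issues the first multiplies and then loads the next A/B elements into
// a second register set (v4/v5, v12/v13); _M1 and _M2 alternate between the
// two register sets so that loads for iteration k+1 overlap the FMAs of
// iteration k; _E drains the pipeline without loading past the end of the
// panel. The branch below falls back to a shorter I/E sequence when fewer
// than two double-iterations are available.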
blt ctrmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 ble ctrmm_kernel_L4_M4_22a .align 5 ctrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 bgt ctrmm_kernel_L4_M4_22 ctrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E b ctrmm_kernel_L4_M4_44 ctrmm_kernel_L4_M4_32: tst counterL, #1 ble ctrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E b ctrmm_kernel_L4_M4_44 ctrmm_kernel_L4_M4_40: INIT4x4 ctrmm_kernel_L4_M4_44: ands counterL , tempK, #1 ble ctrmm_kernel_L4_M4_100 ctrmm_kernel_L4_M4_46: KERNEL4x4_SUB ctrmm_kernel_L4_M4_100: SAVE4x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #5 add pA, pA, temp add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif ctrmm_kernel_L4_M4_END: subs counterI, counterI, #1 bne ctrmm_kernel_L4_M4_20 ctrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble ctrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble ctrmm_kernel_L4_M1_BEGIN ctrmm_kernel_L4_M2_20: INIT2x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pA, pA, temp lsl temp, tempOffset, #5 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble ctrmm_kernel_L4_M2_40 ctrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L4_M2_22 ctrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ctrmm_kernel_L4_M2_100 ctrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L4_M2_42 ctrmm_kernel_L4_M2_100: SAVE2x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #5 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif ctrmm_kernel_L4_M2_END: ctrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble ctrmm_kernel_L4_END ctrmm_kernel_L4_M1_20: INIT1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pB, pB, temp lsl temp, tempOffset, #3 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble ctrmm_kernel_L4_M1_40 ctrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L4_M1_22 ctrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ctrmm_kernel_L4_M1_100 ctrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L4_M1_42 ctrmm_kernel_L4_M1_100: SAVE1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #1 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #3 add pA, pA, temp lsl temp, tempK, #5 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #1 #endif ctrmm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 #if !defined(LEFT) add tempOffset, tempOffset, #4 #endif subs counterJ, counterJ , #1 // j-- bgt ctrmm_kernel_L4_BEGIN /******************************************************************************/ ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble ctrmm_kernel_L999 // error, N was less than 4? tst counterJ , #2 ble ctrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC add pC,pC,LDC, lsl #1 #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = A ctrmm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 ble ctrmm_kernel_L2_M2_BEGIN ctrmm_kernel_L2_M4_20: INIT4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp lsl temp, tempOffset, #5 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble ctrmm_kernel_L2_M4_40 .align 5 ctrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L2_M4_22 ctrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ctrmm_kernel_L2_M4_100 ctrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L2_M4_42 ctrmm_kernel_L2_M4_100: SAVE4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #5 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif ctrmm_kernel_L2_M4_END: subs counterI, counterI, #1 bgt ctrmm_kernel_L2_M4_20 ctrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble ctrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble ctrmm_kernel_L2_M1_BEGIN ctrmm_kernel_L2_M2_20: INIT2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp lsl temp, tempOffset, #4 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble ctrmm_kernel_L2_M2_40 ctrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L2_M2_22 ctrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ctrmm_kernel_L2_M2_100 ctrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L2_M2_42 ctrmm_kernel_L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, 
tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif ctrmm_kernel_L2_M2_END: ctrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble ctrmm_kernel_L2_END ctrmm_kernel_L2_M1_20: INIT1x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp lsl temp, tempOffset, #3 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 ble ctrmm_kernel_L2_M1_40 ctrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L2_M1_22 ctrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ctrmm_kernel_L2_M1_100 ctrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L2_M1_42 ctrmm_kernel_L2_M1_100: SAVE1x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #1 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #3 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #1 #endif ctrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ ctrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble ctrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = A ctrmm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble ctrmm_kernel_L1_M2_BEGIN ctrmm_kernel_L1_M4_20: INIT4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #5 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble ctrmm_kernel_L1_M4_40 .align 5 ctrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L1_M4_22 ctrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ctrmm_kernel_L1_M4_100 ctrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L1_M4_42 ctrmm_kernel_L1_M4_100: SAVE4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #5 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif ctrmm_kernel_L1_M4_END: subs counterI, counterI, #1 bgt ctrmm_kernel_L1_M4_20 
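// The 4-wide M loop for this single remaining column is done. The leftover
// M mod 4 rows are handled below by the narrower 2x1 and 1x1 micro-kernels
// (tst counterI, #2 and tst counterI, #1 select them). Each block repeats the
// same TRMM offset bookkeeping, only with the shift amounts scaled to its
// element count: a complex single is 8 bytes, so a 2-row A panel step is
// 16 bytes (lsl #4), a 1-row step is 8 bytes (lsl #3), and the 1-column
// B panel step stays at 8 bytes (lsl #3).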
ctrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 ble ctrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 ble ctrmm_kernel_L1_M1_BEGIN ctrmm_kernel_L1_M2_20: INIT2x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #4 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble ctrmm_kernel_L1_M2_40 ctrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L1_M2_22 ctrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ctrmm_kernel_L1_M2_100 ctrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L1_M2_42 ctrmm_kernel_L1_M2_100: SAVE2x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif ctrmm_kernel_L1_M2_END: ctrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble ctrmm_kernel_L1_END ctrmm_kernel_L1_M1_20: INIT1x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #3 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble ctrmm_kernel_L1_M1_40 ctrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L1_M1_22 ctrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ctrmm_kernel_L1_M1_100 ctrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L1_M1_42 ctrmm_kernel_L1_M1_100: SAVE1x1 ctrmm_kernel_L1_END: ctrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/ctrmm_kernel_8x4.S000066400000000000000000001450211313527062700206710ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 s1 X3 x4 x5 x6 x7*/ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0, FLOAT alpha1,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset */ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define offset x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pCRow3 x15 #define pA x16 #define alphaR w17 #define alphaI w18 #define temp x19 #define tempOffset x20 #define tempK x21 #define alpha0_R s10 #define alphaV0_R v10.s[0] #define alpha0_I s11 #define alphaV0_I v11.s[0] #define A_PRE_SIZE 2560 #define B_PRE_SIZE 448 #define C_PRE_SIZE 128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla #define OP_ii fmls #define OP_ri fmla #define OP_ir fmla #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define OP_rr fmla #define OP_ii fmla #define OP_ri fmls #define OP_ir fmla #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define OP_rr fmla #define OP_ii fmla #define OP_ri fmla #define OP_ir fmls #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) #define OP_rr fmla #define OP_ii fmls #define OP_ri fmls #define OP_ir fmls #endif // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 offset // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pA // 16 temp // 17 tempOffset // 18 must save tempK // 19 must save // 20 must save // 21 must save // 22 must save // 23 must save // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp //v00 ALPHA_R -> pA0_00_R, pA0_01_R, pA0_02_R, pA0_03_R //v01 ALPHA_I -> pA0_00_I, pA0_01_I, pA0_02_I, pA0_03_I //v02 pA0_04_R, pA0_05_R, pA0_06_R, pA0_07_R //v03 pA0_04_I, pA0_05_I, pA0_06_I, pA0_07_I //v04 pA1_00_R, pA1_01_R, pA1_02_R, pA1_03_R //v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I //v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R //v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I //v08 must save pB0_00_R, pB0_01_R //v09 must save pB0_00_I, pB0_01_I //v10 must save pB0_02_R, pB0_03_R --> ALPHA0_R //v11 must save pB0_02_I, pB0_03_I --> 
ALPHA0_I //v12 must save pB1_00_R, pB1_01_R //v13 must save pB1_00_I, pB1_01_I //v14 must save pB1_02_R, pB1_03_R //v15 must save pB1_02_I, pB1_03_I //v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R //v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I //v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R //v19 pC_04_I, pC_05_I, pC_06_I, pC_07_I //v20 pC_08_R, pC_09_R, pC_10_R, pC_11_R //v21 pC_08_I, pC_09_I, pC_10_I, pC_11_I //v22 pC_12_R, pC_13_R, pC_14_R, pC_15_R //v23 pC_12_I, pC_13_I, pC_14_I, pC_15_I //v24 pC_16_R, pC_17_R, pC_18_R, pC_19_R //v25 pC_16_I, pC_17_I, pC_18_I, pC_19_I //v26 pC_20_R, pC_21_R, pC_22_R, pC_23_R //v27 pC_20_I, pC_21_I, pC_22_I, pC_23_I //v28 pC_24_R, pC_25_R, pC_26_R, pC_27_R //v29 pC_24_I, pC_25_I, pC_26_I, pC_27_I //v30 pC_28_R, pC_29_R, pC_30_R, pC_31_R //v31 pC_28_I, pC_29_I, pC_30_I, pC_31_I /******************************************************************************* * Macro definitions *******************************************************************************/ .macro INIT8x4 fmov s16, wzr fmov s17, wzr fmov s18, wzr fmov s19, s16 fmov s20, wzr fmov s21, s16 fmov s22, s17 fmov s23, s18 fmov s24, wzr fmov s25, s16 fmov s26, s17 fmov s27, s18 fmov s28, wzr fmov s29, s16 fmov s30, s17 fmov s31, s18 .endm .macro KERNEL8x4_I ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 fmul v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b fmls v17.4s, v0.4s, v9.s[0] #else fmul v17.4s, v0.4s, v9.s[0] #endif OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v10.2s, v11.2s}, [pB] add pB, pB, #16 fmul v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b fmls v19.4s, v2.4s, v9.s[0] #else fmul v19.4s, v2.4s, v9.s[0] #endif OP_ir v19.4s, v3.4s, v8.s[0] ld2 {v12.2s, v13.2s}, [pB] add pB, pB, #16 fmul v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b fmls v21.4s, v0.4s, v9.s[1] #else fmul v21.4s, v0.4s, v9.s[1] #endif OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v14.2s, v15.2s}, [pB] add pB, pB, #16 fmul v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b fmls v23.4s, v2.4s, v9.s[1] #else fmul v23.4s, v2.4s, v9.s[1] #endif OP_ir v23.4s, v3.4s, v8.s[1] ld2 {v4.4s, v5.4s}, [pA] add pA, pA, #32 fmul v24.4s, v0.4s, v10.s[0] OP_ii v24.4s, v1.4s, v11.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b fmls v25.4s, v0.4s, v11.s[0] #else fmul v25.4s, v0.4s, v11.s[0] #endif OP_ir v25.4s, v1.4s, v10.s[0] ld2 {v6.4s, v7.4s}, [pA] add pA, pA, #32 fmul v26.4s, v2.4s, v10.s[0] OP_ii v26.4s, v3.4s, v11.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b fmls v27.4s, v2.4s, v11.s[0] #else fmul v27.4s, v2.4s, v11.s[0] #endif OP_ir v27.4s, v3.4s, v10.s[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmul v28.4s, v0.4s, v10.s[1] OP_ii v28.4s, v1.4s, 
v11.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b fmls v29.4s, v0.4s, v11.s[1] #else fmul v29.4s, v0.4s, v11.s[1] #endif OP_ir v29.4s, v1.4s, v10.s[1] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] fmul v30.4s, v2.4s, v10.s[1] OP_ii v30.4s, v3.4s, v11.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b fmls v31.4s, v2.4s, v11.s[1] #else fmul v31.4s, v2.4s, v11.s[1] #endif OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro KERNEL8x4_M1 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.2s, v13.2s}, [pB] add pB, pB, #16 OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] OP_ri v19.4s, v2.4s, v9.s[0] OP_ir v19.4s, v3.4s, v8.s[0] ld2 {v4.4s, v5.4s}, [pA] add pA, pA, #32 OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v6.4s, v7.4s}, [pA] add pA, pA, #32 OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] ld2 {v14.2s, v15.2s}, [pB] add pB, pB, #16 OP_rr v24.4s, v0.4s, v10.s[0] OP_ii v24.4s, v1.4s, v11.s[0] OP_ri v25.4s, v0.4s, v11.s[0] OP_ir v25.4s, v1.4s, v10.s[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] OP_rr v26.4s, v2.4s, v10.s[0] OP_ii v26.4s, v3.4s, v11.s[0] OP_ri v27.4s, v2.4s, v11.s[0] OP_ir v27.4s, v3.4s, v10.s[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] OP_rr v28.4s, v0.4s, v10.s[1] OP_ii v28.4s, v1.4s, v11.s[1] OP_ri v29.4s, v0.4s, v11.s[1] OP_ir v29.4s, v1.4s, v10.s[1] OP_rr v30.4s, v2.4s, v10.s[1] OP_ii v30.4s, v3.4s, v11.s[1] OP_ri v31.4s, v2.4s, v11.s[1] OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro KERNEL8x4_M2 OP_rr v16.4s, v4.4s, v12.s[0] OP_ii v16.4s, v5.4s, v13.s[0] OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 OP_rr v18.4s, v6.4s, v12.s[0] OP_ii v18.4s, v7.4s, v13.s[0] OP_ri v19.4s, v6.4s, v13.s[0] OP_ir v19.4s, v7.4s, v12.s[0] ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] ld2 {v10.2s, v11.2s}, [pB] add pB, pB, #16 OP_rr v24.4s, v4.4s, v14.s[0] OP_ii v24.4s, v5.4s, v15.s[0] OP_ri v25.4s, v4.4s, v15.s[0] OP_ir v25.4s, v5.4s, v14.s[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v26.4s, v6.4s, v14.s[0] OP_ii v26.4s, v7.4s, v15.s[0] OP_ri v27.4s, v6.4s, v15.s[0] OP_ir v27.4s, v7.4s, v14.s[0] OP_rr v28.4s, v4.4s, v14.s[1] OP_ii v28.4s, v5.4s, v15.s[1] OP_ri v29.4s, v4.4s, v15.s[1] OP_ir v29.4s, v5.4s, v14.s[1] OP_rr v30.4s, v6.4s, v14.s[1] OP_ii v30.4s, v7.4s, v15.s[1] OP_ri v31.4s, v6.4s, v15.s[1] OP_ir v31.4s, v7.4s, v14.s[1] .endm .macro KERNEL8x4_E OP_rr v16.4s, v4.4s, v12.s[0] OP_ii v16.4s, v5.4s, v13.s[0] OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] OP_rr v18.4s, v6.4s, v12.s[0] OP_ii v18.4s, v7.4s, v13.s[0] OP_ri v19.4s, v6.4s, v13.s[0] OP_ir v19.4s, v7.4s, v12.s[0] OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] OP_rr 
v24.4s, v4.4s, v14.s[0] OP_ii v24.4s, v5.4s, v15.s[0] OP_ri v25.4s, v4.4s, v15.s[0] OP_ir v25.4s, v5.4s, v14.s[0] OP_rr v26.4s, v6.4s, v14.s[0] OP_ii v26.4s, v7.4s, v15.s[0] OP_ri v27.4s, v6.4s, v15.s[0] OP_ir v27.4s, v7.4s, v14.s[0] OP_rr v28.4s, v4.4s, v14.s[1] OP_ii v28.4s, v5.4s, v15.s[1] OP_ri v29.4s, v4.4s, v15.s[1] OP_ir v29.4s, v5.4s, v14.s[1] OP_rr v30.4s, v6.4s, v14.s[1] OP_ii v30.4s, v7.4s, v15.s[1] OP_ri v31.4s, v6.4s, v15.s[1] OP_ir v31.4s, v7.4s, v14.s[1] .endm .macro KERNEL8x4_SUB ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v10.2s, v11.2s}, [pB] add pB, pB, #16 OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] OP_ri v19.4s, v2.4s, v9.s[0] OP_ir v19.4s, v3.4s, v8.s[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] OP_rr v24.4s, v0.4s, v10.s[0] OP_ii v24.4s, v1.4s, v11.s[0] OP_ri v25.4s, v0.4s, v11.s[0] OP_ir v25.4s, v1.4s, v10.s[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v26.4s, v2.4s, v10.s[0] OP_ii v26.4s, v3.4s, v11.s[0] OP_ri v27.4s, v2.4s, v11.s[0] OP_ir v27.4s, v3.4s, v10.s[0] OP_rr v28.4s, v0.4s, v10.s[1] OP_ii v28.4s, v1.4s, v11.s[1] OP_ri v29.4s, v0.4s, v11.s[1] OP_ir v29.4s, v1.4s, v10.s[1] OP_rr v30.4s, v2.4s, v10.s[1] OP_ii v30.4s, v3.4s, v11.s[1] OP_ri v31.4s, v2.4s, v11.s[1] OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro SAVE8x4 fmov alpha0_R, alphaR fmov alpha0_I, alphaI prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmul v1.4s, v16.4s, alphaV0_I fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow0] add pCRow0, pCRow0, #32 fmul v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I fmul v3.4s, v18.4s, alphaV0_I fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow0] add pCRow0, pCRow0, #32 prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I fmul v5.4s, v20.4s, alphaV0_I fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow1, pCRow1, #32 fmul v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I fmul v7.4s, v22.4s, alphaV0_I fmla v7.4s, v23.4s, alphaV0_R st2 {v6.4s, v7.4s}, [pCRow1] add pCRow1, pCRow1, #32 prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] fmul v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I fmul v1.4s, v24.4s, alphaV0_I fmla v1.4s, v25.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow2] add pCRow2, pCRow2, #32 fmul v2.4s, v26.4s, alphaV0_R fmls v2.4s, v27.4s, alphaV0_I fmul v3.4s, v26.4s, alphaV0_I fmla v3.4s, v27.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] add pCRow2, pCRow2, #32 prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] fmul v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I fmul v5.4s, v28.4s, alphaV0_I fmla v5.4s, v29.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow3] add pCRow3, pCRow3, #32 fmul v6.4s, v30.4s, alphaV0_R fmls v6.4s, v31.4s, alphaV0_I fmul v7.4s, v30.4s, alphaV0_I fmla v7.4s, v31.4s, alphaV0_R st2 {v6.4s, v7.4s}, [pCRow3] add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT4x4 fmov s16, wzr fmov s17, s16 fmov s20, s17 fmov s21, s16 fmov s24, s17 fmov s25, s16 fmov s28, s17 fmov s29, s16 .endm .macro 
KERNEL4x4_I ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 fmul v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b fmls v17.4s, v0.4s, v9.s[0] #else fmul v17.4s, v0.4s, v9.s[0] #endif OP_ir v17.4s, v1.4s, v8.s[0] fmul v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b fmls v21.4s, v0.4s, v9.s[1] #else fmul v21.4s, v0.4s, v9.s[1] #endif OP_ir v21.4s, v1.4s, v8.s[1] fmul v24.4s, v0.4s, v8.s[2] OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b fmls v25.4s, v0.4s, v9.s[2] #else fmul v25.4s, v0.4s, v9.s[2] #endif OP_ir v25.4s, v1.4s, v8.s[2] fmul v28.4s, v0.4s, v8.s[3] OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b fmls v29.4s, v0.4s, v9.s[3] #else fmul v29.4s, v0.4s, v9.s[3] #endif OP_ir v29.4s, v1.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 ld2 {v4.4s, v5.4s}, [pA] add pA, pA, #32 .endm .macro KERNEL4x4_M1 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v4.4s, v5.4s}, [pA] // For next round add pA, pA, #32 OP_rr v24.4s, v0.4s, v8.s[2] OP_ii v24.4s, v1.4s, v9.s[2] OP_ri v25.4s, v0.4s, v9.s[2] OP_ir v25.4s, v1.4s, v8.s[2] prfm PLDL1KEEP, [pA, #512] OP_rr v28.4s, v0.4s, v8.s[3] OP_ii v28.4s, v1.4s, v9.s[3] OP_ri v29.4s, v0.4s, v9.s[3] OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro KERNEL4x4_M2 OP_rr v16.4s, v4.4s, v12.s[0] OP_ii v16.4s, v5.4s, v13.s[0] OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.4s, v9.4s}, [pB] // For next round add pB, pB, #32 OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] ld2 {v0.4s, v1.4s}, [pA] // For next round add pA, pA, #32 OP_rr v24.4s, v4.4s, v12.s[2] OP_ii v24.4s, v5.4s, v13.s[2] OP_ri v25.4s, v4.4s, v13.s[2] OP_ir v25.4s, v5.4s, v12.s[2] prfm PLDL1KEEP, [pB, #512] OP_rr v28.4s, v4.4s, v12.s[3] OP_ii v28.4s, v5.4s, v13.s[3] OP_ri v29.4s, v4.4s, v13.s[3] OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_E OP_rr v16.4s, v4.4s, v12.s[0] OP_ii v16.4s, v5.4s, v13.s[0] OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] OP_rr v24.4s, v4.4s, v12.s[2] OP_ii v24.4s, v5.4s, v13.s[2] OP_ri v25.4s, v4.4s, v13.s[2] OP_ir v25.4s, v5.4s, v12.s[2] OP_rr v28.4s, v4.4s, v12.s[3] OP_ii v28.4s, v5.4s, v13.s[3] OP_ri v29.4s, v4.4s, v13.s[3] OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_SUB ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] OP_rr 
v24.4s, v0.4s, v8.s[2] OP_ii v24.4s, v1.4s, v9.s[2] OP_ri v25.4s, v0.4s, v9.s[2] OP_ir v25.4s, v1.4s, v8.s[2] OP_rr v28.4s, v0.4s, v8.s[3] OP_ii v28.4s, v1.4s, v9.s[3] OP_ri v29.4s, v0.4s, v9.s[3] OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro SAVE4x4 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmul v1.4s, v16.4s, alphaV0_I fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I fmul v5.4s, v20.4s, alphaV0_I fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow1, pCRow1, LDC fmul v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I fmul v1.4s, v24.4s, alphaV0_I fmla v1.4s, v25.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I fmul v5.4s, v28.4s, alphaV0_I fmla v5.4s, v29.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x4 fmov s16, wzr fmov s17, wzr fmov s20, s16 fmov s21, s17 fmov s24, s16 fmov s25, s17 fmov s28, s16 fmov s29, s17 .endm .macro KERNEL2x4_SUB ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 OP_rr v16.2s, v0.2s, v8.s[0] OP_ii v16.2s, v1.2s, v9.s[0] OP_ri v17.2s, v0.2s, v9.s[0] OP_ir v17.2s, v1.2s, v8.s[0] OP_rr v20.2s, v0.2s, v8.s[1] OP_ii v20.2s, v1.2s, v9.s[1] OP_ri v21.2s, v0.2s, v9.s[1] OP_ir v21.2s, v1.2s, v8.s[1] OP_rr v24.2s, v0.2s, v8.s[2] OP_ii v24.2s, v1.2s, v9.s[2] OP_ri v25.2s, v0.2s, v9.s[2] OP_ir v25.2s, v1.2s, v8.s[2] OP_rr v28.2s, v0.2s, v8.s[3] OP_ii v28.2s, v1.2s, v9.s[3] OP_ri v29.2s, v0.2s, v9.s[3] OP_ir v29.2s, v1.2s, v8.s[3] .endm .macro SAVE2x4 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I fmul v1.2s, v16.2s, alphaV0_I fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I fmul v5.2s, v20.2s, alphaV0_I fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow1, pCRow1, LDC fmul v0.2s, v24.2s, alphaV0_R fmls v0.2s, v25.2s, alphaV0_I fmul v1.2s, v24.2s, alphaV0_I fmla v1.2s, v25.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.2s, v28.2s, alphaV0_R fmls v4.2s, v29.2s, alphaV0_I fmul v5.2s, v28.2s, alphaV0_I fmla v5.2s, v29.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x4 fmov s16, wzr fmov s17, wzr fmov s20, s16 fmov s21, s17 fmov s24, s16 fmov s25, s17 fmov s28, s16 fmov s29, s17 .endm .macro KERNEL1x4_SUB ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 OP_rr s16, s0, v8.s[0] OP_ii s16, s1, v9.s[0] OP_ri s17, s0, v9.s[0] OP_ir s17, s1, v8.s[0] OP_rr s20, s0, v8.s[1] OP_ii s20, s1, v9.s[1] OP_ri s21, s0, v9.s[1] OP_ir s21, s1, v8.s[1] OP_rr s24, s0, v8.s[2] OP_ii s24, s1, v9.s[2] OP_ri s25, s0, v9.s[2] OP_ir s25, s1, v8.s[2] OP_rr s28, s0, v8.s[3] OP_ii s28, s1, v9.s[3] OP_ri s29, s0, v9.s[3] OP_ir s29, s1, v8.s[3] .endm .macro SAVE1x4 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul s0, s16, alphaV0_R fmls s0, s17, alphaV0_I fmul s1, s16, alphaV0_I fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul s4, s20, alphaV0_R fmls s4, s21, alphaV0_I fmul s5, s20, 
alphaV0_I fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul s0, s24, alphaV0_R fmls s0, s25, alphaV0_I fmul s1, s24, alphaV0_I fmla s1, s25, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul s4, s28, alphaV0_R fmls s4, s29, alphaV0_I fmul s5, s28, alphaV0_I fmla s5, s29, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT8x2 fmov s16, wzr fmov s17, wzr fmov s18, wzr fmov s19, s16 fmov s20, wzr fmov s21, s16 fmov s22, s17 fmov s23, s18 .endm .macro KERNEL8x2_SUB ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] OP_ri v19.4s, v2.4s, v9.s[0] OP_ir v19.4s, v3.4s, v8.s[0] OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] .endm .macro SAVE8x2 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmul v1.4s, v16.4s, alphaV0_I fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow2, pCRow1, #32 fmul v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I fmul v3.4s, v18.4s, alphaV0_I fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] add pCRow1, pCRow1, LDC fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I fmul v5.4s, v20.4s, alphaV0_I fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow2, pCRow1, #32 fmul v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I fmul v7.4s, v22.4s, alphaV0_I fmla v7.4s, v23.4s, alphaV0_R st2 {v6.4s, v7.4s}, [pCRow2] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT4x2 fmov s16, wzr fmov s17, wzr fmov s20, s16 fmov s21, s17 .endm .macro KERNEL4x2_SUB ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] .endm .macro SAVE4x2 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmul v1.4s, v16.4s, alphaV0_I fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I fmul v5.4s, v20.4s, alphaV0_I fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x2 fmov s16, wzr fmov s17, wzr fmov s20, s16 fmov s21, s17 .endm .macro KERNEL2x2_SUB ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 OP_rr v16.2s, v0.2s, v8.s[0] OP_ii v16.2s, v1.2s, v9.s[0] OP_ri v17.2s, v0.2s, v9.s[0] OP_ir v17.2s, v1.2s, v8.s[0] OP_rr v20.2s, v0.2s, v8.s[1] OP_ii v20.2s, v1.2s, v9.s[1] OP_ri v21.2s, v0.2s, v9.s[1] OP_ir v21.2s, v1.2s, v8.s[1] .endm .macro SAVE2x2 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, 
alphaV0_I fmul v1.2s, v16.2s, alphaV0_I fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I fmul v5.2s, v20.2s, alphaV0_I fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x2 fmov s16, wzr fmov s17, wzr fmov s20, wzr fmov s21, wzr .endm .macro KERNEL1x2_SUB ld2 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 OP_rr s16, s0, v8.s[0] OP_ii s16, s1, v9.s[0] OP_ri s17, s0, v9.s[0] OP_ir s17, s1, v8.s[0] OP_rr s20, s0, v8.s[1] OP_ii s20, s1, v9.s[1] OP_ri s21, s0, v9.s[1] OP_ir s21, s1, v8.s[1] .endm .macro SAVE1x2 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul s0, s16, alphaV0_R fmls s0, s17, alphaV0_I fmul s1, s16, alphaV0_I fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul s4, s20, alphaV0_R fmls s4, s21, alphaV0_I fmul s5, s20, alphaV0_I fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT8x1 fmov s16, wzr fmov s17, wzr fmov s18, wzr fmov s19, s16 .endm .macro KERNEL8x1_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v8.s[1] OP_ri v17.4s, v0.4s, v8.s[1] OP_ir v17.4s, v1.4s, v8.s[0] OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v8.s[1] OP_ri v19.4s, v2.4s, v8.s[1] OP_ir v19.4s, v3.4s, v8.s[0] .endm .macro SAVE8x1 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmul v1.4s, v16.4s, alphaV0_I fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, #32 fmul v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I fmul v3.4s, v18.4s, alphaV0_I fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow1] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT4x1 fmov s16, wzr fmov s17, s16 .endm .macro KERNEL4x1_SUB ld2 {v8.s, v9.s}[0], [pB] add pB, pB, #8 ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] .endm .macro SAVE4x1 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I fmul v1.4s, v16.4s, alphaV0_I fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x1 fmov s16, wzr fmov s17, wzr .endm .macro KERNEL2x1_SUB ld2 {v8.s, v9.s}[0], [pB] add pB, pB, #8 ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 OP_rr v16.2s, v0.2s, v8.s[0] OP_ii v16.2s, v1.2s, v9.s[0] OP_ri v17.2s, v0.2s, v9.s[0] OP_ir v17.2s, v1.2s, v8.s[0] .endm .macro SAVE2x1 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I fmul v1.2s, v16.2s, alphaV0_I fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x1 fmov s16, wzr fmov s17, wzr .endm .macro KERNEL1x1_SUB ld2 {v8.s, v9.s}[0], [pB] add pB, pB, #8 ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 
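// One complex multiply-accumulate: with a = s0 + i*s1 (from A) and
// b = v8.s[0] + i*v9.s[0] (from B), the four OP_* lines below keep the real
// part of the running product in s16 and the imaginary part in s17:
//   real: s16 += a_R*b_R (+/-) a_I*b_I
//   imag: s17 (+/-)= a_R*b_I, s17 (+/-)= a_I*b_R
// The signs are fixed at build time by the OP_rr/OP_ii/OP_ri/OP_ir defines,
// which encode whether the A and/or B operand is conjugated in the
// NN/NT/.., NR/.., RN/.. and RR/.. variants of this kernel.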
OP_rr s16, s0, v8.s[0] OP_ii s16, s1, v9.s[0] OP_ri s17, s0, v9.s[0] OP_ir s17, s1, v8.s[0] .endm .macro SAVE1x1 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul s0, s16, alphaV0_R fmls s0, s17, alphaV0_I fmul s1, s16, alphaV0_I fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPA] fmov alphaR, s0 fmov alphaI, s1 lsl LDC, LDC, #3 // ldc = ldc * 8 #if !defined(LEFT) neg tempOffset, offset #endif mov pB, origPB mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble ctrmm_kernel_L2_BEGIN /******************************************************************************/ ctrmm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC add pCRow3, pCRow2, LDC add pC, pCRow3, LDC #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = start of A array ctrmm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble ctrmm_kernel_L4_M4_BEGIN ctrmm_kernel_L4_M8_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #6 add pA, pA, temp lsl temp, tempOffset, #5 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #8 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 cmp counterL , #2 blt ctrmm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 ble ctrmm_kernel_L4_M8_22a .align 5 ctrmm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 bgt ctrmm_kernel_L4_M8_22 .align 5 ctrmm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b ctrmm_kernel_L4_M8_44 .align 5 ctrmm_kernel_L4_M8_32: tst counterL, #1 ble ctrmm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b ctrmm_kernel_L4_M8_44 ctrmm_kernel_L4_M8_40: INIT8x4 ctrmm_kernel_L4_M8_44: ands counterL , tempK, #7 ble ctrmm_kernel_L4_M8_100 .align 5 ctrmm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 bne ctrmm_kernel_L4_M8_46 ctrmm_kernel_L4_M8_100: SAVE8x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #8 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #6 add pA, pA, temp lsl temp, tempK, #5 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #8 #endif prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] ctrmm_kernel_L4_M8_END: subs counterI, counterI, #1 bne ctrmm_kernel_L4_M8_20 
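// End of the 8-wide M loop for this block of 4 columns. The remaining
// M mod 8 rows fall through to the 4x4, 2x4 and 1x4 micro-kernels below.
// Every block starts with the same TRMM offset bookkeeping: tempOffset tracks
// the current position relative to the diagonal, pA/pB are advanced so that
// only the part of the packed panels that can contribute is multiplied, and
// tempK is the number of k iterations actually performed. The shift amounts
// encode rows-or-columns times 8 bytes per complex single, e.g. lsl #6 is an
// 8-row A panel step and lsl #5 is a 4-column B panel step.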
ctrmm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 ble ctrmm_kernel_L4_END tst counterI, #4 ble ctrmm_kernel_L4_M2_BEGIN ctrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pB, pB, temp add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? blt ctrmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 ble ctrmm_kernel_L4_M4_22a .align 5 ctrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 bgt ctrmm_kernel_L4_M4_22 ctrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E b ctrmm_kernel_L4_M4_44 ctrmm_kernel_L4_M4_32: tst counterL, #1 ble ctrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E b ctrmm_kernel_L4_M4_44 ctrmm_kernel_L4_M4_40: INIT4x4 ctrmm_kernel_L4_M4_44: ands counterL , tempK, #1 ble ctrmm_kernel_L4_M4_100 ctrmm_kernel_L4_M4_46: KERNEL4x4_SUB ctrmm_kernel_L4_M4_100: SAVE4x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #5 add pA, pA, temp add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif ctrmm_kernel_L4_M4_END: ctrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble ctrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble ctrmm_kernel_L4_M1_BEGIN ctrmm_kernel_L4_M2_20: INIT2x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pA, pA, temp lsl temp, tempOffset, #5 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble ctrmm_kernel_L4_M2_40 ctrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L4_M2_22 ctrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ctrmm_kernel_L4_M2_100 ctrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L4_M2_42 ctrmm_kernel_L4_M2_100: SAVE2x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #5 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif ctrmm_kernel_L4_M2_END: ctrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble ctrmm_kernel_L4_END ctrmm_kernel_L4_M1_20: INIT1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pB, pB, temp lsl temp, tempOffset, #3 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 
// counterL = counterL / 8 cmp counterL , #0 ble ctrmm_kernel_L4_M1_40 ctrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L4_M1_22 ctrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ctrmm_kernel_L4_M1_100 ctrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L4_M1_42 ctrmm_kernel_L4_M1_100: SAVE1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #1 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #3 add pA, pA, temp lsl temp, tempK, #5 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #1 #endif ctrmm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 #if !defined(LEFT) add tempOffset, tempOffset, #4 #endif subs counterJ, counterJ , #1 // j-- bgt ctrmm_kernel_L4_BEGIN /******************************************************************************/ ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble ctrmm_kernel_L999 // error, N was less than 4? tst counterJ , #2 ble ctrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC add pC,pC,LDC, lsl #1 #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = A ctrmm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble ctrmm_kernel_L2_M4_BEGIN ctrmm_kernel_L2_M8_20: INIT8x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #6 add pA, pA, temp lsl temp, tempOffset, #4 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #8 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble ctrmm_kernel_L2_M8_40 .align 5 ctrmm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L2_M8_22 ctrmm_kernel_L2_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ctrmm_kernel_L2_M8_100 ctrmm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L2_M8_42 ctrmm_kernel_L2_M8_100: SAVE8x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #8 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #6 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #8 #endif ctrmm_kernel_L2_M8_END: subs counterI, counterI, #1 bgt ctrmm_kernel_L2_M8_20 ctrmm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 ble ctrmm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 ble ctrmm_kernel_L2_M2_BEGIN ctrmm_kernel_L2_M4_20: INIT4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp lsl temp, tempOffset, #5 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp 
counterL,#0 ble ctrmm_kernel_L2_M4_40 .align 5 ctrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L2_M4_22 ctrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ctrmm_kernel_L2_M4_100 ctrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L2_M4_42 ctrmm_kernel_L2_M4_100: SAVE4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #5 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif ctrmm_kernel_L2_M4_END: ctrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble ctrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble ctrmm_kernel_L2_M1_BEGIN ctrmm_kernel_L2_M2_20: INIT2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp lsl temp, tempOffset, #4 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble ctrmm_kernel_L2_M2_40 ctrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L2_M2_22 ctrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ctrmm_kernel_L2_M2_100 ctrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L2_M2_42 ctrmm_kernel_L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif ctrmm_kernel_L2_M2_END: ctrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble ctrmm_kernel_L2_END ctrmm_kernel_L2_M1_20: INIT1x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp lsl temp, tempOffset, #3 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 ble ctrmm_kernel_L2_M1_40 ctrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L2_M1_22 ctrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ctrmm_kernel_L2_M1_100 ctrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L2_M1_42 ctrmm_kernel_L2_M1_100: SAVE1x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #1 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #3 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if 
defined(LEFT) add tempOffset, tempOffset, #1 #endif ctrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ ctrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble ctrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = A ctrmm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble ctrmm_kernel_L1_M4_BEGIN ctrmm_kernel_L1_M8_20: INIT8x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #6 add pA, pA, temp lsl temp, tempOffset, #3 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #8 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble ctrmm_kernel_L1_M8_40 .align 5 ctrmm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L1_M8_22 ctrmm_kernel_L1_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ctrmm_kernel_L1_M8_100 ctrmm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L1_M8_42 ctrmm_kernel_L1_M8_100: SAVE8x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #8 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #6 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #8 #endif ctrmm_kernel_L1_M8_END: subs counterI, counterI, #1 bgt ctrmm_kernel_L1_M8_20 ctrmm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 ble ctrmm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 ble ctrmm_kernel_L1_M2_BEGIN ctrmm_kernel_L1_M4_20: INIT4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #5 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble ctrmm_kernel_L1_M4_40 .align 5 ctrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L1_M4_22 ctrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ctrmm_kernel_L1_M4_100 ctrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L1_M4_42 ctrmm_kernel_L1_M4_100: SAVE4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #5 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif ctrmm_kernel_L1_M4_END: ctrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 ble ctrmm_kernel_L1_END tst counterI, 
#2 // counterI = counterI / 2 ble ctrmm_kernel_L1_M1_BEGIN ctrmm_kernel_L1_M2_20: INIT2x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #4 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble ctrmm_kernel_L1_M2_40 ctrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L1_M2_22 ctrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ctrmm_kernel_L1_M2_100 ctrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L1_M2_42 ctrmm_kernel_L1_M2_100: SAVE2x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif ctrmm_kernel_L1_M2_END: ctrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble ctrmm_kernel_L1_END ctrmm_kernel_L1_M1_20: INIT1x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #3 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble ctrmm_kernel_L1_M1_40 ctrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L1_M1_22 ctrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ctrmm_kernel_L1_M1_100 ctrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt ctrmm_kernel_L1_M1_42 ctrmm_kernel_L1_M1_100: SAVE1x1 ctrmm_kernel_L1_END: ctrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/dasum_thunderx2t99.c000066400000000000000000000163421313527062700212110ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include #define N "x0" /* vector length */ #define X "x1" /* "X" vector address */ #define INC_X "x2" /* "X" stride */ #define J "x5" /* loop variable */ #define REG0 "xzr" #define SUMF "d0" #define TMPF "d1" /******************************************************************************/ #define KERNEL_F1 \ "ldr "TMPF", ["X"] \n" \ "add "X", "X", #8 \n" \ "fabs "TMPF", "TMPF" \n" \ "fadd "SUMF", "SUMF", "TMPF" \n" #define KERNEL_F32 \ "ldr q16, ["X"] \n" \ "ldr q17, ["X", #16] \n" \ "ldr q18, ["X", #32] \n" \ "ldr q19, ["X", #48] \n" \ "ldp q20, q21, ["X", #64] \n" \ "ldp q22, q23, ["X", #96] \n" \ "fabs v16.2d, v16.2d \n" \ "fabs v17.2d, v17.2d \n" \ "fabs v18.2d, v18.2d \n" \ "fabs v19.2d, v19.2d \n" \ "ldp q24, q25, ["X", #128] \n" \ "ldp q26, q27, ["X", #160] \n" \ "fabs v20.2d, v20.2d \n" \ "fabs v21.2d, v21.2d \n" \ "fabs v22.2d, v22.2d \n" \ "fabs v23.2d, v23.2d \n" \ "fadd v16.2d, v16.2d, v17.2d \n" \ "fadd v18.2d, v18.2d, v19.2d \n" \ "ldp q28, q29, ["X", #192] \n" \ "ldp q30, q31, ["X", #224] \n" \ "fabs v24.2d, v24.2d \n" \ "fabs v25.2d, v25.2d \n" \ "fabs v26.2d, v26.2d \n" \ "fabs v27.2d, v27.2d \n" \ "add "X", "X", #256 \n" \ "fadd v20.2d, v20.2d, v21.2d \n" \ "fadd v22.2d, v22.2d, v23.2d \n" \ "fabs v28.2d, v28.2d \n" \ "fabs v29.2d, v29.2d \n" \ "fabs v30.2d, v30.2d \n" \ "fabs v31.2d, v31.2d \n" \ "PRFM PLDL1KEEP, ["X", #1024] \n" \ "PRFM PLDL1KEEP, ["X", #1024+64] \n" \ "fadd v24.2d, v24.2d, v25.2d \n" \ "fadd v26.2d, v26.2d, v27.2d \n" \ "fadd v28.2d, v28.2d, v29.2d \n" \ "fadd v30.2d, v30.2d, v31.2d \n" \ "fadd v0.2d, v0.2d, v16.2d \n" \ "fadd v1.2d, v1.2d, v18.2d \n" \ "fadd v2.2d, v2.2d, v20.2d \n" \ "fadd v3.2d, v3.2d, v22.2d \n" \ "PRFM PLDL1KEEP, ["X", #1024+128] \n" \ "PRFM PLDL1KEEP, ["X", #1024+192] \n" \ "fadd v4.2d, v4.2d, v24.2d \n" \ "fadd v5.2d, v5.2d, v26.2d \n" \ "fadd v6.2d, v6.2d, v28.2d \n" \ "fadd v7.2d, v7.2d, v30.2d \n" #define KERNEL_F32_FINALIZE \ "fadd v0.2d, v0.2d, v1.2d \n" \ "fadd v2.2d, v2.2d, v3.2d \n" \ "fadd v4.2d, v4.2d, v5.2d \n" \ "fadd v6.2d, v6.2d, v7.2d \n" \ "fadd v0.2d, v0.2d, v2.2d \n" \ "fadd v4.2d, v4.2d, v6.2d \n" \ "fadd v0.2d, v0.2d, v4.2d \n" \ "faddp "SUMF", v0.2d \n" #define INIT_S \ "lsl "INC_X", "INC_X", #3 \n" #define KERNEL_S1 \ "ldr "TMPF", ["X"] \n" \ "add "X", "X", "INC_X" \n" \ "fabs "TMPF", "TMPF" \n" \ "fadd "SUMF", "SUMF", "TMPF" \n" #if defined(SMP) extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, 
BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); #endif static FLOAT dasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT asum = 0.0 ; if ( n < 0 ) return(asum); __asm__ __volatile__ ( " mov "N", %[N_] \n" " mov "X", %[X_] \n" " mov "INC_X", %[INCX_] \n" " fmov "SUMF", "REG0" \n" " fmov d1, "REG0" \n" " fmov d2, "REG0" \n" " fmov d3, "REG0" \n" " fmov d4, "REG0" \n" " fmov d5, "REG0" \n" " fmov d6, "REG0" \n" " fmov d7, "REG0" \n" " cmp "N", xzr \n" " ble .Lasum_kernel_L999 \n" " cmp "INC_X", xzr \n" " ble .Lasum_kernel_L999 \n" " cmp "INC_X", #1 \n" " bne .Lasum_kernel_S_BEGIN \n" ".Lasum_kernel_F_BEGIN: \n" " asr "J", "N", #5 \n" " cmp "J", xzr \n" " beq .Lasum_kernel_F1 \n" ".align 5 \n" ".Lasum_kernel_F32: \n" " "KERNEL_F32" \n" " subs "J", "J", #1 \n" " bne .Lasum_kernel_F32 \n" " "KERNEL_F32_FINALIZE" \n" ".Lasum_kernel_F1: \n" " ands "J", "N", #31 \n" " ble .Lasum_kernel_L999 \n" ".Lasum_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" " bne .Lasum_kernel_F10 \n" " b .Lasum_kernel_L999 \n" ".Lasum_kernel_S_BEGIN: \n" " "INIT_S" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" " ble .Lasum_kernel_S1 \n" ".Lasum_kernel_S4: \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" " bne .Lasum_kernel_S4 \n" ".Lasum_kernel_S1: \n" " ands "J", "N", #3 \n" " ble .Lasum_kernel_L999 \n" ".Lasum_kernel_S10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" " bne .Lasum_kernel_S10 \n" ".Lasum_kernel_L999: \n" " fmov %[ASUM_], "SUMF" \n" : [ASUM_] "=r" (asum) //%0 : [N_] "r" (n), //%1 [X_] "r" (x), //%2 [INCX_] "r" (inc_x) //%3 : "cc", "memory", "x0", "x1", "x2", "x3", "x4", "x5", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" ); return asum; } #if defined(SMP) static int dasum_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) { *result = dasum_compute(n, x, inc_x); return 0; } #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { #if defined(SMP) int nthreads; FLOAT dummy_alpha; #endif FLOAT asum = 0.0; #if defined(SMP) nthreads = num_cpu_avail(1); if (inc_x == 0) nthreads = 1; if (n <= 10000) nthreads = 1; if (nthreads == 1) { asum = dasum_compute(n, x, inc_x); } else { int mode, i; char result[MAX_CPU_NUMBER * sizeof(double) * 2]; FLOAT *ptr; mode = BLAS_DOUBLE; blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, ( void *)dasum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { asum = asum + (*ptr); ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2); } } #else asum = dasum_compute(n, x, inc_x); #endif return asum; } OpenBLAS-0.2.20/kernel/arm64/daxpy_thunderx.c000066400000000000000000000075721313527062700206020ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include #define prefetch(a) __asm__("prfm PLDL1STRM, [%0]"::"r"(a):"memory"); //#define prefetch(a) static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; double a = *alpha; #if 0 prefetch(x + 128/sizeof(*x)); prefetch(y + 128/sizeof(*y)); #endif prefetch(x + 2*128/sizeof(*x)); prefetch(y + 2*128/sizeof(*y)); prefetch(x + 3*128/sizeof(*x)); prefetch(y + 3*128/sizeof(*y)); prefetch(x + 4*128/sizeof(*x)); prefetch(y + 4*128/sizeof(*y)); while(i < n) { double y0, y1, y2, y3; double y4, y5, y6, y7; double *xx; double *yy; y0 = a * x[0] + y[0]; y1 = a * x[1] + y[1]; y2 = a * x[2] + y[2]; y3 = a * x[3] + y[3]; y4 = a * x[4] + y[4]; y5 = a * x[5] + y[5]; y6 = a * x[6] + y[6]; y7 = a * x[7] + y[7]; asm("":"+w"(y0),"+w"(y1),"+w"(y2),"+w"(y3),"+w"(y4),"+w"(y5),"+w"(y6),"+w"(y7)); y[0] = y0; y[1] = y1; y[2] = y2; y[3] = y3; y[4] = y4; y[5] = y5; y[6] = y6; y[7] = y7; xx = (x + 4*128/sizeof(*x)); yy = (y + 4*128/sizeof(*y)); asm("":"+r"(yy)::"memory"); prefetch(xx); prefetch(yy); y += 8; x += 8; i += 8 ; } } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; BLASLONG ix=0,iy=0; if ( n <= 0 ) return(0); if ( (inc_x == 1) && (inc_y == 1) ) { BLASLONG n1 = n & -32; if ( n1 ) daxpy_kernel_8(n1, x, y , &da ); i = n1; while(i < n) { y[i] += da * x[i] ; i++ ; } return(0); } BLASLONG n1 = n & -4; while(i < n1) { FLOAT m1 = da * x[ix] ; FLOAT m2 = da * x[ix+inc_x] ; FLOAT m3 = da * x[ix+2*inc_x] ; FLOAT m4 = da * x[ix+3*inc_x] ; y[iy] += m1 ; y[iy+inc_y] += m2 ; y[iy+2*inc_y] += m3 ; y[iy+3*inc_y] += m4 ; ix += inc_x*4 ; iy += inc_y*4 ; i+=4 ; } while(i < n) { y[iy] += da * x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } return(0); } OpenBLAS-0.2.20/kernel/arm64/daxpy_thunderx2t99.S000066400000000000000000000103431313527062700212000ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" #define N x0 /* vector length */ #define X x3 /* X vector address */ #define INC_X x4 /* X stride */ #define Y x5 /* Y vector address */ #define INC_Y x6 /* Y stride */ #define I x1 /* loop variable */ /******************************************************************************* * Macro definitions *******************************************************************************/ #define DA d0 /* scale input value */ #define TMPX d1 #define TMPVX {v1.d}[0] #define TMPY d2 #define TMPVY {v2.d}[0] #define SZ 8 /******************************************************************************/ .macro KERNEL_F1 ldr TMPX, [X], #SZ ldr TMPY, [Y] fmadd TMPY, TMPX, DA, TMPY str TMPY, [Y], #SZ .endm .macro KERNEL_F16 ldp q4, q5, [X] ldp q16, q17, [Y] ldp q6, q7, [X, #32] ldp q18, q19, [Y, #32] fmla v16.2d, v4.2d, v0.d[0] fmla v17.2d, v5.2d, v0.d[0] PRFM PLDL1KEEP, [X, #896] PRFM PLDL1KEEP, [Y, #896] stp q16, q17, [Y] ldp q20, q21, [X, #64] ldp q24, q25, [Y, #64] fmla v18.2d, v6.2d, v0.d[0] fmla v19.2d, v7.2d, v0.d[0] PRFM PLDL1KEEP, [X, #896+64] PRFM PLDL1KEEP, [Y, #896+64] stp q18, q19, [Y, #32] ldp q22, q23, [X, #96] ldp q26, q27, [Y, #96] fmla v24.2d, v20.2d, v0.d[0] fmla v25.2d, v21.2d, v0.d[0] stp q24, q25, [Y, #64] fmla v26.2d, v22.2d, v0.d[0] fmla v27.2d, v23.2d, v0.d[0] stp q26, q27, [Y, #96] add Y, Y, #128 add X, X, #128 .endm .macro KERNEL_F32 KERNEL_F16 KERNEL_F16 .endm .macro INIT_S lsl INC_X, INC_X, #3 lsl INC_Y, INC_Y, #3 .endm .macro KERNEL_S1 ld1 TMPVX, [X], INC_X ldr TMPY, [Y] fmadd TMPY, TMPX, DA, TMPY st1 TMPVY, [Y], INC_Y .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE cmp N, xzr ble axpy_kernel_L999 fcmp DA, #0.0 beq axpy_kernel_L999 cmp INC_X, #1 bne axpy_kernel_S_BEGIN cmp INC_Y, #1 bne axpy_kernel_S_BEGIN axpy_kernel_F_BEGIN: asr I, N, #5 cmp I, xzr beq axpy_kernel_F1 .align 5 axpy_kernel_F32: KERNEL_F32 subs I, I, #1 bne axpy_kernel_F32 axpy_kernel_F1: ands I, N, #31 ble axpy_kernel_L999 axpy_kernel_F10: KERNEL_F1 subs I, I, #1 bne axpy_kernel_F10 b axpy_kernel_L999 axpy_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr ble axpy_kernel_S1 axpy_kernel_S4: KERNEL_S1 KERNEL_S1 
KERNEL_S1 KERNEL_S1 subs I, I, #1 bne axpy_kernel_S4 axpy_kernel_S1: ands I, N, #3 ble axpy_kernel_L999 axpy_kernel_S10: KERNEL_S1 subs I, I, #1 bne axpy_kernel_S10 axpy_kernel_L999: mov w0, wzr ret OpenBLAS-0.2.20/kernel/arm64/ddot_thunderx.c000066400000000000000000000063271313527062700204040ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" #include #define prefetch(a) __asm__("prfm PLDL1STRM, [%0]"::"r"(a):"memory"); FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix=0,iy=0; FLOAT dot = 0.0 ; if ( n < 0 ) return(dot); if ( (inc_x == 1) && (inc_y == 1) ) { float64x2_t vdot0 = {0.0, 0.0}; float64x2_t vdot1 = {0.0, 0.0}; float64x2_t vdot2 = {0.0, 0.0}; float64x2_t vdot3 = {0.0, 0.0}; float64x2_t *vx = (float64x2_t*)x; float64x2_t *vy = (float64x2_t*)y; #if 0 prefetch(x + 128/sizeof(*x)); prefetch(y + 128/sizeof(*y)); #endif prefetch(x + 2*128/sizeof(*x)); prefetch(y + 2*128/sizeof(*y)); prefetch(x + 3*128/sizeof(*x)); prefetch(y + 3*128/sizeof(*y)); int n1 = n&-8; while(i < n1) { #if 0 vdot0 = vfmaq_f64 (vdot0, vy[0], vx[0]); vdot1 = vfmaq_f64 (vdot1, vy[1], vx[1]); vdot2 = vfmaq_f64 (vdot2, vy[2], vx[2]); vdot3 = vfmaq_f64 (vdot3, vy[3], vx[3]); #else vdot0 = vy[0] * vx[0] + vdot0; vdot1 = vy[1] * vx[1] + vdot1; vdot2 = vy[2] * vx[2] + vdot2; vdot3 = vy[3] * vx[3] + vdot3; #endif vy += 4; vx += 4; i += 8; prefetch(vx + 3*128/sizeof(*x)); prefetch(vy + 3*128/sizeof(*y)); } dot = vaddvq_f64 (vdot0 + vdot1); dot += vaddvq_f64 (vdot2 + vdot3); i = n1; while(i < n) { dot += y[i] * x[i] ; i++ ; } return(dot); } while(i < n) { dot += y[iy] * x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } return(dot); } OpenBLAS-0.2.20/kernel/arm64/dgemm_kernel_4x4.S000066400000000000000000000614121313527062700206350ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 X3 x4 x5 x6 */ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define temp x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pCRow3 x15 #define pA x16 #define ppC x17 #define ppCRow0 x18 #define ppCRow1 x19 #define ppCRow2 x20 #define ppCRow3 x21 #define ppA x22 #define alpha x23 #define alpha0 d10 #define alphaV0 v10.d[0] #define A_PRE_SIZE 1024 #define B_PRE_SIZE 1024 #define C_PRE_SIZE 128 // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 offset -> temp // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pCRow3 // 16 pA // 17 ppC // 18 must save ppCRow0 // 19 must save ppCRow1 // 20 must save ppCRow2 // 21 must save ppCRow3 // 22 must save ppA // 23 must save alpha // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp //v00 ALPHA -> pA00, pA01 //v01 pA02, pA03 //v02 ppA00, ppA01 //v03 ppA02, ppA03 //v04 pA10, pA11 //v05 pA12, pA13 //v06 ppA10, ppA11 //v07 ppA12, ppA13 //v08 must save pB00, pB01 //v09 must save pB02, pB03 //v10 must save ALPHA0 //v11 must save //v12 must save pB10, pB11 //v13 must save pB12, pB13 //v14 must save //v15 must save //v16 must save C00, C01 //v17 must save C02, C03 //v18 ppC00, ppC01 //v19 ppC02, ppC03 //v20 C10, C11 //v21 C12, C13 //v22 ppC10, ppC11 //v23 ppC12, ppC13 //v24 C20, C21 //v25 C22, C23 //v26 ppC20, ppC21 //v27 ppC22, ppC23 //v28 C30, C31 //v29 C32, C33 //v30 ppC30, ppC31 //v31 ppC32, ppC33 /******************************************************************************* * Macro definitions *******************************************************************************/ .macro INIT8x4 fmov d16, xzr fmov d17, d16 fmov d18, d17 fmov d19, d16 fmov d20, d17 fmov d21, d16 fmov d22, d17 fmov d23, d16 fmov d24, d17 fmov d25, d16 fmov d26, d17 fmov d27, d16 fmov d28, d17 fmov d29, d16 fmov d30, d17 fmov d31, d16 .endm .macro KERNEL8x4_I ldp d8, d9, [pB] add pB, pB, #16 ldp d10, d11, [pB] add pB, pB, #16 ldp q0, q1, [pA] add pA, pA, #32 fmul v16.2d, v0.2d, v8.d[0] fmul v29.2d, v1.2d, v11.d[0] ldp q2, q3, [ppA] add ppA, ppA, #32 fmul v20.2d, v0.2d, v9.d[0] fmul v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmul v18.2d, v2.2d, v8.d[0] fmul v31.2d, v3.2d, v11.d[0] prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] fmul v22.2d, v2.2d, v9.d[0] fmul v27.2d, v3.2d, v10.d[0] ldp d12, d13, [pB] add pB, pB, #16 fmul v24.2d, v0.2d, v10.d[0] fmul v21.2d, v1.2d, v9.d[0] ldp q4, q5, [pA] // for next round add pA, pA, #32 fmul v26.2d, v2.2d, v10.d[0] fmul v23.2d, v3.2d, v9.d[0] ldp q6, q7, [ppA] // for next round add ppA, ppA, #32 fmul v28.2d, v0.2d, v11.d[0] fmul v17.2d, v1.2d, v8.d[0] ldp d14, d15, [pB] add pB, pB, #16 fmul v30.2d, v2.2d, v11.d[0] fmul v19.2d, v3.2d, v8.d[0] .endm .macro KERNEL8x4_M2 fmla v16.2d, v4.2d, v12.d[0] fmla v29.2d, v5.2d, v15.d[0] ldp d8, d9, [pB] add pB, pB, #16 fmla v18.2d, v6.2d, v12.d[0] fmla v31.2d, v7.2d, v15.d[0] ldp d10, d11, [pB] add pB, pB, #16 fmla v20.2d, v4.2d, v13.d[0] fmla v25.2d, v5.2d, v14.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v22.2d, v6.2d, v13.d[0] fmla v27.2d, v7.2d, v14.d[0] 
fmla v24.2d, v4.2d, v14.d[0] fmla v21.2d, v5.2d, v13.d[0] ldp q0, q1, [pA] add pA, pA, #32 fmla v26.2d, v6.2d, v14.d[0] fmla v23.2d, v7.2d, v13.d[0] fmla v28.2d, v4.2d, v15.d[0] fmla v17.2d, v5.2d, v12.d[0] ldp q2, q3, [ppA] add ppA, ppA, #32 fmla v30.2d, v6.2d, v15.d[0] fmla v19.2d, v7.2d, v12.d[0] .endm .macro KERNEL8x4_M1 fmla v16.2d, v0.2d, v8.d[0] fmla v29.2d, v1.2d, v11.d[0] ldp d12, d13, [pB] add pB, pB, #16 fmla v18.2d, v2.2d, v8.d[0] fmla v31.2d, v3.2d, v11.d[0] ldp d14, d15, [pB] add pB, pB, #16 fmla v20.2d, v0.2d, v9.d[0] fmla v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v22.2d, v2.2d, v9.d[0] fmla v27.2d, v3.2d, v10.d[0] prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] fmla v24.2d, v0.2d, v10.d[0] fmla v21.2d, v1.2d, v9.d[0] ldp q4, q5, [pA] add pA, pA, #32 fmla v26.2d, v2.2d, v10.d[0] fmla v23.2d, v3.2d, v9.d[0] fmla v28.2d, v0.2d, v11.d[0] fmla v17.2d, v1.2d, v8.d[0] ldp q6, q7, [ppA] add ppA, ppA, #32 fmla v30.2d, v2.2d, v11.d[0] fmla v19.2d, v3.2d, v8.d[0] .endm .macro KERNEL8x4_E fmla v16.2d, v4.2d, v12.d[0] fmla v25.2d, v5.2d, v14.d[0] fmla v18.2d, v6.2d, v12.d[0] fmla v27.2d, v7.2d, v14.d[0] fmla v20.2d, v4.2d, v13.d[0] fmla v29.2d, v5.2d, v15.d[0] fmla v22.2d, v6.2d, v13.d[0] fmla v31.2d, v7.2d, v15.d[0] fmla v24.2d, v4.2d, v14.d[0] fmla v17.2d, v5.2d, v12.d[0] fmla v26.2d, v6.2d, v14.d[0] fmla v19.2d, v7.2d, v12.d[0] fmla v28.2d, v4.2d, v15.d[0] fmla v21.2d, v5.2d, v13.d[0] fmla v30.2d, v6.2d, v15.d[0] fmla v23.2d, v7.2d, v13.d[0] .endm .macro KERNEL8x4_SUB ldp d8, d9, [pB] add pB, pB, #16 ldp d10, d11, [pB] add pB, pB, #16 ldp q0, q1, [pA] add pA, pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v29.2d, v1.2d, v11.d[0] fmla v20.2d, v0.2d, v9.d[0] fmla v25.2d, v1.2d, v10.d[0] ldp q2, q3, [ppA] add ppA, ppA, #32 fmla v24.2d, v0.2d, v10.d[0] fmla v21.2d, v1.2d, v9.d[0] fmla v28.2d, v0.2d, v11.d[0] fmla v17.2d, v1.2d, v8.d[0] fmla v18.2d, v2.2d, v8.d[0] fmla v31.2d, v3.2d, v11.d[0] fmla v22.2d, v2.2d, v9.d[0] fmla v27.2d, v3.2d, v10.d[0] fmla v26.2d, v2.2d, v10.d[0] fmla v23.2d, v3.2d, v9.d[0] fmla v30.2d, v2.2d, v11.d[0] fmla v19.2d, v3.2d, v8.d[0] .endm .macro SAVE8x4 fmov alpha0, alpha prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add ppCRow0, pCRow0, #32 ldp q0, q1, [pCRow0] fmla v0.2d, v16.2d, alphaV0 fmla v1.2d, v17.2d, alphaV0 stp q0, q1, [pCRow0] add pCRow0, pCRow0, #64 ldp q2, q3, [ppCRow0] fmla v2.2d, v18.2d, alphaV0 fmla v3.2d, v19.2d, alphaV0 stp q2, q3, [ppCRow0] prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add ppCRow1, pCRow1, #32 ldp q4, q5, [pCRow1] fmla v4.2d, v20.2d, alphaV0 fmla v5.2d, v21.2d, alphaV0 stp q4, q5, [pCRow1] add pCRow1, pCRow1, #64 ldp q6, q7, [ppCRow1] fmla v6.2d, v22.2d, alphaV0 fmla v7.2d, v23.2d, alphaV0 stp q6, q7, [ppCRow1] prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add ppCRow2, pCRow2, #32 ldp q0, q1, [pCRow2] fmla v0.2d, v24.2d, alphaV0 fmla v1.2d, v25.2d, alphaV0 stp q0, q1, [pCRow2] add pCRow2, pCRow2, #64 ldp q2, q3, [ppCRow2] fmla v2.2d, v26.2d, alphaV0 fmla v3.2d, v27.2d, alphaV0 stp q2, q3, [ppCRow2] prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] add ppCRow3, pCRow3, #32 ldp q4, q5, [pCRow3] fmla v4.2d, v28.2d, alphaV0 fmla v5.2d, v29.2d, alphaV0 stp q4, q5, [pCRow3] add pCRow3, pCRow3, #64 ldp q6, q7, [ppCRow3] fmla v6.2d, v30.2d, alphaV0 fmla v7.2d, v31.2d, alphaV0 stp q6, q7, [ppCRow3] .endm /******************************************************************************/ .macro INIT4x4 fmov d16, xzr fmov d17, d16 fmov d20, d17 fmov d21, d16 fmov d24, d17 fmov d25, d16 fmov d28, d17 fmov d29, d16 .endm .macro KERNEL4x4_SUB ld1 {v8.2d, v9.2d}, [pB] add pB, 
pB, #32 ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v29.2d, v1.2d, v9.d[1] fmla v20.2d, v0.2d, v8.d[1] fmla v25.2d, v1.2d, v9.d[0] fmla v24.2d, v0.2d, v9.d[0] fmla v21.2d, v1.2d, v8.d[1] fmla v28.2d, v0.2d, v9.d[1] fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 fmov alpha0, alpha ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2d, v13.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV0 fmla v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow2, pCRow1, LDC ld1 {v8.2d, v9.2d}, [pCRow2] fmla v8.2d, v24.2d, alphaV0 fmla v9.2d, v25.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow2] add pCRow1, pCRow2, LDC ld1 {v12.2d, v13.2d}, [pCRow1] fmla v12.2d, v28.2d, alphaV0 fmla v13.2d, v29.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x4 fmov d16, xzr fmov d20, d16 fmov d24, d20 fmov d28, d16 .endm .macro KERNEL2x4_SUB ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld1 {v0.2d}, [pA] add pA, pA, #16 fmla v16.2d, v0.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] fmla v24.2d, v0.2d, v9.d[0] fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 fmov alpha0, alpha ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow2, pCRow1, LDC ld1 {v8.2d}, [pCRow2] fmla v8.2d, v24.2d, alphaV0 st1 {v8.2d}, [pCRow2] add pCRow1, pCRow2, LDC ld1 {v12.2d}, [pCRow1] fmla v12.2d, v28.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x4 fmov d16, xzr fmov d20, d16 .endm .macro KERNEL1x4_SUB ldr d0, [pA] add pA, pA, #8 ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 fmla v16.2d, v8.2d, v0.d[0] fmla v20.2d, v9.2d, v0.d[0] .endm .macro SAVE1x4 fmov alpha0, alpha add pCRow1, pCRow0, LDC ld1 {v8.d}[0], [pCRow0] ld1 {v8.d}[1], [pCRow1] fmla v8.2d, v16.2d, alphaV0 st1 {v8.d}[0], [pCRow0] st1 {v8.d}[1], [pCRow1] add pCRow2, pCRow1, LDC add pCRow1, pCRow2, LDC ld1 {v12.d}[0], [pCRow2] ld1 {v12.d}[1], [pCRow1] fmla v12.2d, v20.2d, alphaV0 st1 {v12.d}[0], [pCRow2] st1 {v12.d}[1], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT4x2 fmov d16, xzr fmov d17, d16 fmov d20, d17 fmov d21, d16 .endm .macro KERNEL4x2_SUB ld1 {v8.2d}, [pB] add pB, pB, #16 ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 fmov alpha0, alpha ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2d, v13.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV0 fmla v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x2 fmov d16, xzr fmov d20, d16 .endm .macro KERNEL2x2_SUB ld1 {v8.2d}, [pB] add pB, pB, #16 ld1 {v0.2d}, [pA] add pA, pA, #16 fmla v16.2d, v0.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 fmov alpha0, alpha ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow1 , pCRow0, LDC ld1 {v12.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow0, 
pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x2 fmov d16, xzr .endm .macro KERNEL1x2_SUB ld1 {v8.2d} , [pB] add pB , pB, #16 ldr d0 , [pA] add pA, pA, #8 fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 fmov alpha0, alpha add pCRow1 , pCRow0, LDC ld1 {v8.d}[0], [pCRow0] ld1 {v8.d}[1], [pCRow1] fmla v8.2d, v16.2d, alphaV0 st1 {v8.d}[0], [pCRow0] st1 {v8.d}[1], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT4x1 fmov d16, xzr fmov d17, d16 .endm .macro KERNEL4x1_SUB ldr d8, [pB] add pB , pB, #8 ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 fmov alpha0, alpha ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x1 fmov d16, xzr .endm .macro KERNEL2x1_SUB ldr d8, [pB] add pB , pB, #8 ld1 {v0.2d}, [pA] add pA , pA, #16 fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 fmov alpha0, alpha ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x1 fmov d16, xzr .endm .macro KERNEL1x1_SUB ldr d8, [pB] add pB , pB, #8 ldr d0, [pA] add pA , pA, #8 fmadd d16, d0, d8, d16 .endm .macro SAVE1x1 fmov alpha0, alpha ldr d8, [pCRow0] fmadd d8, d16, alpha0, d8 str d8, [pCRow0] add pCRow0, pCRow0, #8 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] fmov alpha, d0 prfm PLDL1KEEP, [origPA] prfm PLDL1KEEP, [origPB] lsl LDC, LDC, #3 // ldc = ldc * 8 mov pB, origPB mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble dgemm_kernel_L2_BEGIN dgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC add pCRow3, pCRow2, LDC add pC, pCRow3, LDC lsl temp, origK, #5 // k * 4 * 8 mov pA, origPA // pA = start of A array add ppA, temp, pA prfm PLDL1KEEP, [ppA] //------------------------------------------------------------------------------ dgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble dgemm_kernel_L4_M4_BEGIN .align 5 dgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #2 // L = K / 4 cmp counterL , #2 blt dgemm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 ble dgemm_kernel_L4_M8_22a .align 5 dgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 bgt dgemm_kernel_L4_M8_22 .align 5 dgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b dgemm_kernel_L4_M8_44 .align 5 dgemm_kernel_L4_M8_32: tst counterL, #1 ble dgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b dgemm_kernel_L4_M8_44 dgemm_kernel_L4_M8_40: 
INIT8x4 dgemm_kernel_L4_M8_44: ands counterL , origK, #3 ble dgemm_kernel_L4_M8_100 .align 5 dgemm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 bne dgemm_kernel_L4_M8_46 dgemm_kernel_L4_M8_100: lsl temp, origK, #5 prfm PLDL1KEEP, [pA, temp] prfm PLDL1KEEP, [ppA, temp] prfm PLDL1KEEP, [origPB] SAVE8x4 dgemm_kernel_L4_M8_END: lsl temp, origK, #5 // k * 4 * 8 add pA, pA, temp add ppA, ppA, temp subs counterI, counterI, #1 bne dgemm_kernel_L4_M8_20 dgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 ble dgemm_kernel_L4_END tst counterI, #4 ble dgemm_kernel_L4_M2_BEGIN dgemm_kernel_L4_M4_20: INIT4x4 mov pB, origPB asr counterL, origK, #3 // counterL = counterL / 8 cmp counterL, #0 ble dgemm_kernel_L4_M4_40 dgemm_kernel_L4_M4_22: KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L4_M4_22 dgemm_kernel_L4_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L4_M4_100 dgemm_kernel_L4_M4_42: KERNEL4x4_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L4_M4_42 dgemm_kernel_L4_M4_100: SAVE4x4 dgemm_kernel_L4_M4_END: dgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble dgemm_kernel_L4_M1_BEGIN dgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L4_M2_40 dgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L4_M2_22 dgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L4_M2_100 dgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L4_M2_42 dgemm_kernel_L4_M2_100: SAVE2x4 dgemm_kernel_L4_M2_END: dgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dgemm_kernel_L4_END dgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L4_M1_40 dgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L4_M1_22 dgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L4_M1_100 dgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L4_M1_42 dgemm_kernel_L4_M1_100: SAVE1x4 dgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- bgt dgemm_kernel_L4_BEGIN /******************************************************************************/ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble dgemm_kernel_L999 // error, N was less than 4? 
tst counterJ , #2 ble dgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC add pC,pC,LDC, lsl #1 mov pA, origPA // pA = A dgemm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 ble dgemm_kernel_L2_M2_BEGIN dgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble dgemm_kernel_L2_M4_40 .align 5 dgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M4_22 dgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L2_M4_100 dgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M4_42 dgemm_kernel_L2_M4_100: SAVE4x2 dgemm_kernel_L2_M4_END: subs counterI, counterI, #1 bgt dgemm_kernel_L2_M4_20 dgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble dgemm_kernel_L2_M1_BEGIN dgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble dgemm_kernel_L2_M2_40 dgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M2_22 dgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L2_M2_100 dgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M2_42 dgemm_kernel_L2_M2_100: SAVE2x2 dgemm_kernel_L2_M2_END: dgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dgemm_kernel_L2_END dgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 ble dgemm_kernel_L2_M1_40 dgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M1_22 dgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L2_M1_100 dgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M1_42 dgemm_kernel_L2_M1_100: SAVE1x2 dgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ dgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble dgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // update pC to point to next mov pA, origPA // pA = A dgemm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble dgemm_kernel_L1_M2_BEGIN dgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L1_M4_40 .align 5 dgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M4_22 dgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L1_M4_100 dgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M4_42 dgemm_kernel_L1_M4_100: SAVE4x1 dgemm_kernel_L1_M4_END: subs counterI, counterI, #1 bgt dgemm_kernel_L1_M4_20 dgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dgemm_kernel_L1_END tst counterI, #2 // 
counterI = counterI / 2 ble dgemm_kernel_L1_M1_BEGIN dgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L1_M2_40 dgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M2_22 dgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L1_M2_100 dgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M2_42 dgemm_kernel_L1_M2_100: SAVE2x1 dgemm_kernel_L1_M2_END: dgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dgemm_kernel_L1_END dgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L1_M1_40 dgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M1_22 dgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L1_M1_100 dgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M1_42 dgemm_kernel_L1_M1_100: SAVE1x1 dgemm_kernel_L1_END: dgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/dgemm_kernel_4x8.S000066400000000000000000000741431313527062700206460ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 X3 x4 x5 x6 */ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define temp x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pA x15 #define alpha0 d2 #define alphaV0 v2.d[0] #define alpha1 d3 #define alphaV1 v3.d[0] #define alpha2 d6 #define alphaV2 v6.d[0] #define alpha3 d7 #define alphaV3 v7.d[0] // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 temp // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pA // 16 // 17 // 18 must save // 19 must save // 20 must save // 21 must save // 22 must save // 23 must save // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp //v00 ALPHA -> pA00, pA01 //v01 pA02, pA03 //v02 ALPHA0 //v03 ALPHA1 //v04 pA10, pA11 //v05 pA12, pA13 //v06 ALPHA2 //v07 ALPHA3 //v08 must save pB0_0, pB0_1 //v09 must save pB0_2, pB0_3 //v10 must save pB0_4, pB0_5 //v11 must save pB0_6, pB0_7 //v12 must save pB1_0, pB1_1 //v13 must save pB1_2, pB1_3 //v14 must save pB1_4, pB1_5 //v15 must save pB1_6, pB1_7 //v16 must save C00, C01 //v17 must save C02, C03 //v18 C04, C05 //v19 C06, C07 //v20 C10, C11 //v21 C12, C13 //v22 C14, C15 //v23 C16, C17 //v24 C20, C21 //v25 C22, C23 //v26 C24, C25 //v27 C26, C27 //v28 C30, C31 //v29 C32, C33 //v30 C34, C35 //v31 C36, C37 /******************************************************************************* * Macro definitions *******************************************************************************/ .macro INIT4x8 fmov d16, xzr fmov d17, xzr fmov d18, xzr fmov d19, d16 fmov d20, xzr fmov d21, d16 fmov d22, d17 fmov d23, d18 fmov d24, xzr fmov d25, d16 fmov d26, d17 fmov d27, d18 fmov d28, xzr fmov d29, d16 fmov d30, d17 fmov d31, d18 .endm .macro KERNEL4x8_I ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 fmul v16.2d, v0.2d, v8.d[0] fmul v17.2d, v1.2d, v8.d[0] fmul v18.2d, v0.2d, v8.d[1] fmul v19.2d, v1.2d, v8.d[1] fmul v20.2d, v0.2d, v9.d[0] fmul v21.2d, v1.2d, v9.d[0] fmul v22.2d, v0.2d, v9.d[1] fmul v23.2d, v1.2d, v9.d[1] fmul v24.2d, v0.2d, v10.d[0] fmul v25.2d, v1.2d, v10.d[0] fmul v26.2d, v0.2d, v10.d[1] fmul v27.2d, v1.2d, v10.d[1] fmul v28.2d, v0.2d, v11.d[0] fmul v29.2d, v1.2d, v11.d[0] fmul v30.2d, v0.2d, v11.d[1] fmul v31.2d, v1.2d, v11.d[1] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 ld1 {v4.2d, v5.2d}, [pA] add pA, pA, #32 ld1 {v14.2d, v15.2d}, [pB] add pB, pB, #32 .endm .macro KERNEL4x8_M1 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] fmla v18.2d, v0.2d, v8.d[1] fmla v19.2d, v1.2d, v8.d[1] fmla v20.2d, v0.2d, v9.d[0] fmla v21.2d, v1.2d, v9.d[0] fmla v22.2d, v0.2d, v9.d[1] fmla v23.2d, v1.2d, v9.d[1] fmla v24.2d, v0.2d, v10.d[0] fmla v25.2d, v1.2d, v10.d[0] fmla v26.2d, v0.2d, v10.d[1] fmla v27.2d, v1.2d, v10.d[1] fmla v28.2d, v0.2d, v11.d[0] fmla v29.2d, v1.2d, v11.d[0] fmla v30.2d, v0.2d, v11.d[1] fmla v31.2d, v1.2d, v11.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 ld1 {v4.2d, v5.2d}, [pA] // For next round add pA, pA, #32 ld1 {v14.2d, v15.2d}, [pB] add pB, pB, #32 prfm 
PLDL1KEEP, [pA, #512] .endm .macro KERNEL4x8_M2 fmla v16.2d, v4.2d, v12.d[0] fmla v17.2d, v5.2d, v12.d[0] fmla v18.2d, v4.2d, v12.d[1] fmla v19.2d, v5.2d, v12.d[1] fmla v20.2d, v4.2d, v13.d[0] fmla v21.2d, v5.2d, v13.d[0] fmla v22.2d, v4.2d, v13.d[1] fmla v23.2d, v5.2d, v13.d[1] fmla v24.2d, v4.2d, v14.d[0] fmla v25.2d, v5.2d, v14.d[0] fmla v26.2d, v4.2d, v14.d[1] fmla v27.2d, v5.2d, v14.d[1] fmla v28.2d, v4.2d, v15.d[0] fmla v29.2d, v5.2d, v15.d[0] fmla v30.2d, v4.2d, v15.d[1] fmla v31.2d, v5.2d, v15.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 ld1 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 prfm PLDL1KEEP, [pB, #512] .endm .macro KERNEL4x8_E fmla v16.2d, v4.2d, v12.d[0] fmla v17.2d, v5.2d, v12.d[0] fmla v18.2d, v4.2d, v12.d[1] fmla v19.2d, v5.2d, v12.d[1] fmla v20.2d, v4.2d, v13.d[0] fmla v21.2d, v5.2d, v13.d[0] fmla v22.2d, v4.2d, v13.d[1] fmla v23.2d, v5.2d, v13.d[1] fmla v24.2d, v4.2d, v14.d[0] fmla v25.2d, v5.2d, v14.d[0] fmla v26.2d, v4.2d, v14.d[1] fmla v27.2d, v5.2d, v14.d[1] fmla v28.2d, v4.2d, v15.d[0] fmla v29.2d, v5.2d, v15.d[0] fmla v30.2d, v4.2d, v15.d[1] fmla v31.2d, v5.2d, v15.d[1] .endm .macro KERNEL4x8_SUB ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 ld1 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] fmla v18.2d, v0.2d, v8.d[1] fmla v19.2d, v1.2d, v8.d[1] fmla v20.2d, v0.2d, v9.d[0] fmla v21.2d, v1.2d, v9.d[0] fmla v22.2d, v0.2d, v9.d[1] fmla v23.2d, v1.2d, v9.d[1] fmla v24.2d, v0.2d, v10.d[0] fmla v25.2d, v1.2d, v10.d[0] fmla v26.2d, v0.2d, v10.d[1] fmla v27.2d, v1.2d, v10.d[1] fmla v28.2d, v0.2d, v11.d[0] fmla v29.2d, v1.2d, v11.d[0] fmla v30.2d, v0.2d, v11.d[1] fmla v31.2d, v1.2d, v11.d[1] .endm .macro SAVE4x8 add pCRow1, pCRow0, LDC ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 fmla v9.2d, v17.2d, alphaV1 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow2, pCRow1, LDC ld1 {v10.2d, v11.2d}, [pCRow1] fmla v10.2d, v18.2d, alphaV2 fmla v11.2d, v19.2d, alphaV3 st1 {v10.2d, v11.2d}, [pCRow1] add pCRow1, pCRow2, LDC ld1 {v12.2d, v13.2d}, [pCRow2] fmla v12.2d, v20.2d, alphaV0 fmla v13.2d, v21.2d, alphaV1 st1 {v12.2d, v13.2d}, [pCRow2] add pCRow2, pCRow1, LDC ld1 {v14.2d, v15.2d}, [pCRow1] fmla v14.2d, v22.2d, alphaV2 fmla v15.2d, v23.2d, alphaV3 st1 {v14.2d, v15.2d}, [pCRow1] add pCRow1, pCRow2, LDC ld1 {v8.2d, v9.2d}, [pCRow2] fmla v8.2d, v24.2d, alphaV0 fmla v9.2d, v25.2d, alphaV1 st1 {v8.2d, v9.2d}, [pCRow2] add pCRow2, pCRow1, LDC ld1 {v10.2d, v11.2d}, [pCRow1] fmla v10.2d, v26.2d, alphaV2 fmla v11.2d, v27.2d, alphaV3 st1 {v10.2d, v11.2d}, [pCRow1] add pCRow1, pCRow2, LDC ld1 {v12.2d, v13.2d}, [pCRow2] fmla v12.2d, v28.2d, alphaV0 fmla v13.2d, v29.2d, alphaV1 st1 {v12.2d, v13.2d}, [pCRow2] ld1 {v14.2d, v15.2d}, [pCRow1] fmla v14.2d, v30.2d, alphaV2 fmla v15.2d, v31.2d, alphaV3 st1 {v14.2d, v15.2d}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x8 fmov d16, xzr fmov d18, xzr fmov d20, xzr fmov d22, d16 fmov d24, xzr fmov d26, d16 fmov d28, xzr fmov d30, d16 .endm .macro KERNEL2x8_SUB ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld1 {v0.2d}, [pA] add pA, pA, #16 ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v18.2d, v0.2d, v8.d[1] fmla v20.2d, v0.2d, v9.d[0] fmla v22.2d, v0.2d, v9.d[1] fmla v24.2d, v0.2d, v10.d[0] fmla v26.2d, v0.2d, v10.d[1] fmla v28.2d, v0.2d, v11.d[0] 
fmla v30.2d, v0.2d, v11.d[1] .endm .macro SAVE2x8 add pCRow1, pCRow0, LDC ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow2, pCRow1, LDC ld1 {v10.2d}, [pCRow1] fmla v10.2d, v18.2d, alphaV2 st1 {v10.2d}, [pCRow1] add pCRow1, pCRow2, LDC ld1 {v12.2d}, [pCRow2] fmla v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow2] add pCRow2, pCRow1, LDC ld1 {v14.2d}, [pCRow1] fmla v14.2d, v22.2d, alphaV2 st1 {v14.2d}, [pCRow1] add pCRow1, pCRow2, LDC ld1 {v8.2d}, [pCRow2] fmla v8.2d, v24.2d, alphaV0 st1 {v8.2d}, [pCRow2] add pCRow2, pCRow1, LDC ld1 {v10.2d}, [pCRow1] fmla v10.2d, v26.2d, alphaV2 st1 {v10.2d}, [pCRow1] add pCRow1, pCRow2, LDC ld1 {v12.2d}, [pCRow2] fmla v12.2d, v28.2d, alphaV0 st1 {v12.2d}, [pCRow2] add pCRow2, pCRow1, LDC ld1 {v14.2d}, [pCRow1] fmla v14.2d, v30.2d, alphaV2 st1 {v14.2d}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x8 fmov d16, xzr fmov d20, xzr fmov d24, xzr fmov d28, xzr .endm .macro KERNEL1x8_SUB ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ldr d0, [pA] add pA, pA, #8 ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 fmla v16.2d, v8.2d, v0.d[0] fmla v20.2d, v9.2d, v0.d[0] fmla v24.2d, v10.2d, v0.d[0] fmla v28.2d, v11.2d, v0.d[0] .endm .macro SAVE1x8 add pCRow1, pCRow0, LDC ld1 {v8.d}[0], [pCRow0] ld1 {v8.d}[1], [pCRow1] fmla v8.2d, v16.2d, alphaV0 st1 {v8.d}[0], [pCRow0] st1 {v8.d}[1], [pCRow1] add pCRow2, pCRow1, LDC add pCRow1, pCRow2, LDC ld1 {v10.d}[0], [pCRow2] ld1 {v10.d}[1], [pCRow1] fmla v10.2d, v20.2d, alphaV1 st1 {v10.d}[0], [pCRow2] st1 {v10.d}[1], [pCRow1] add pCRow2, pCRow1, LDC add pCRow1, pCRow2, LDC ld1 {v12.d}[0], [pCRow2] ld1 {v12.d}[1], [pCRow1] fmla v12.2d, v24.2d, alphaV2 st1 {v12.d}[0], [pCRow2] st1 {v12.d}[1], [pCRow1] add pCRow2, pCRow1, LDC add pCRow1, pCRow2, LDC ld1 {v14.d}[0], [pCRow2] ld1 {v14.d}[1], [pCRow1] fmla v14.2d, v28.2d, alphaV3 st1 {v14.d}[0], [pCRow2] st1 {v14.d}[1], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT4x4 fmov d16, xzr fmov d17, d16 fmov d20, d17 fmov d21, d16 fmov d24, d17 fmov d25, d16 fmov d28, d17 fmov d29, d16 .endm .macro KERNEL4x4_I ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 fmul v16.2d, v0.2d, v8.d[0] fmul v29.2d, v1.2d, v9.d[1] fmul v20.2d, v0.2d, v8.d[1] fmul v25.2d, v1.2d, v9.d[0] fmul v24.2d, v0.2d, v9.d[0] fmul v21.2d, v1.2d, v8.d[1] fmul v28.2d, v0.2d, v9.d[1] fmul v17.2d, v1.2d, v8.d[0] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 ld1 {v4.2d, v5.2d}, [pA] add pA, pA, #32 .endm .macro KERNEL4x4_M1 fmla v16.2d, v0.2d, v8.d[0] fmla v29.2d, v1.2d, v9.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 fmla v20.2d, v0.2d, v8.d[1] fmla v25.2d, v1.2d, v9.d[0] ld1 {v4.2d, v5.2d}, [pA] // For next round add pA, pA, #32 fmla v24.2d, v0.2d, v9.d[0] fmla v21.2d, v1.2d, v8.d[1] prfm PLDL1KEEP, [pA, #512] fmla v28.2d, v0.2d, v9.d[1] fmla v17.2d, v1.2d, v8.d[0] .endm .macro KERNEL4x4_M2 fmla v16.2d, v4.2d, v12.d[0] fmla v29.2d, v5.2d, v13.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 fmla v20.2d, v4.2d, v12.d[1] fmla v25.2d, v5.2d, v13.d[0] ld1 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 fmla v24.2d, v4.2d, v13.d[0] fmla v21.2d, v5.2d, v12.d[1] prfm PLDL1KEEP, [pB, #512] fmla v28.2d, v4.2d, v13.d[1] fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_E fmla v16.2d, v4.2d, v12.d[0] fmla v29.2d, v5.2d, v13.d[1] fmla v20.2d, v4.2d, v12.d[1] fmla v25.2d, v5.2d, 
v13.d[0] fmla v24.2d, v4.2d, v13.d[0] fmla v21.2d, v5.2d, v12.d[1] fmla v28.2d, v4.2d, v13.d[1] fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_SUB ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v29.2d, v1.2d, v9.d[1] fmla v20.2d, v0.2d, v8.d[1] fmla v25.2d, v1.2d, v9.d[0] fmla v24.2d, v0.2d, v9.d[0] fmla v21.2d, v1.2d, v8.d[1] fmla v28.2d, v0.2d, v9.d[1] fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 fmla v9.2d, v17.2d, alphaV1 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2d, v13.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV2 fmla v13.2d, v21.2d, alphaV3 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow2, pCRow1, LDC ld1 {v8.2d, v9.2d}, [pCRow2] fmla v8.2d, v24.2d, alphaV0 fmla v9.2d, v25.2d, alphaV1 st1 {v8.2d, v9.2d}, [pCRow2] add pCRow1, pCRow2, LDC ld1 {v12.2d, v13.2d}, [pCRow1] fmla v12.2d, v28.2d, alphaV2 fmla v13.2d, v29.2d, alphaV3 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x4 fmov d16, xzr fmov d20, d16 fmov d24, d20 fmov d28, d16 .endm .macro KERNEL2x4_SUB ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld1 {v0.2d}, [pA] add pA, pA, #16 fmla v16.2d, v0.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] fmla v24.2d, v0.2d, v9.d[0] fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV1 st1 {v12.2d}, [pCRow1] add pCRow2, pCRow1, LDC ld1 {v8.2d}, [pCRow2] fmla v8.2d, v24.2d, alphaV2 st1 {v8.2d}, [pCRow2] add pCRow1, pCRow2, LDC ld1 {v12.2d}, [pCRow1] fmla v12.2d, v28.2d, alphaV3 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x4 fmov d16, xzr fmov d20, d16 .endm .macro KERNEL1x4_SUB ldr d0, [pA] add pA, pA, #8 ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 fmla v16.2d, v8.2d, v0.d[0] fmla v20.2d, v9.2d, v0.d[0] .endm .macro SAVE1x4 add pCRow1, pCRow0, LDC ld1 {v8.d}[0], [pCRow0] ld1 {v8.d}[1], [pCRow1] fmla v8.2d, v16.2d, alphaV0 st1 {v8.d}[0], [pCRow0] st1 {v8.d}[1], [pCRow1] add pCRow2, pCRow1, LDC add pCRow1, pCRow2, LDC ld1 {v12.d}[0], [pCRow2] ld1 {v12.d}[1], [pCRow1] fmla v12.2d, v20.2d, alphaV1 st1 {v12.d}[0], [pCRow2] st1 {v12.d}[1], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT4x2 fmov d16, xzr fmov d17, d16 fmov d20, d17 fmov d21, d16 .endm .macro KERNEL4x2_SUB ld1 {v8.2d}, [pB] add pB, pB, #16 ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 fmla v9.2d, v17.2d, alphaV1 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2d, v13.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV2 fmla v13.2d, v21.2d, alphaV3 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x2 fmov d16, xzr fmov d20, d16 .endm .macro KERNEL2x2_SUB ld1 {v8.2d}, [pB] add pB, pB, #16 ld1 {v0.2d}, [pA] add pA, pA, #16 fmla v16.2d, v0.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow1 , pCRow0, LDC ld1 
{v12.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV1 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x2 fmov d16, xzr .endm .macro KERNEL1x2_SUB ld1 {v8.2d} , [pB] add pB , pB, #16 ldr d0 , [pA] add pA, pA, #8 fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 add pCRow1 , pCRow0, LDC ld1 {v8.d}[0], [pCRow0] ld1 {v8.d}[1], [pCRow1] fmla v8.2d, v16.2d, alphaV0 st1 {v8.d}[0], [pCRow0] st1 {v8.d}[1], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT4x1 fmov d16, xzr fmov d17, d16 .endm .macro KERNEL4x1_SUB ldr d8, [pB] add pB , pB, #8 ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 fmla v9.2d, v17.2d, alphaV1 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x1 fmov d16, xzr .endm .macro KERNEL2x1_SUB ldr d8, [pB] add pB , pB, #8 ld1 {v0.2d}, [pA] add pA , pA, #16 fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x1 fmov d16, xzr .endm .macro KERNEL1x1_SUB ldr d8, [pB] add pB , pB, #8 ldr d0, [pA] add pA , pA, #8 fmadd d16, d0, d8, d16 .endm .macro SAVE1x1 ldr d8, [pCRow0] fmadd d8, d16, alpha0, d8 str d8, [pCRow0] add pCRow0, pCRow0, #8 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] fmov alpha0, d0 fmov alpha1, d0 fmov alpha2, d0 fmov alpha3, d0 lsl LDC, LDC, #3 // ldc = ldc * 8 mov pB, origPB mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 ble dgemm_kernel_L4_BEGIN /******************************************************************************/ dgemm_kernel_L8_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #3 mov pA, origPA // pA = start of A array dgemm_kernel_L8_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble dgemm_kernel_L8_M2_BEGIN dgemm_kernel_L8_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
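// Rough sketch of the pipelined K loop that follows (names refer to the macros defined above): counterL counts pairs of K iterations. The first pair is started with KERNEL4x8_I, the steady-state loop alternates KERNEL4x8_M1 / KERNEL4x8_M2, and the last pair ends with KERNEL4x8_E, presumably so the drain step does not load operands past the packed buffers. An odd leftover K iteration is handled by KERNEL4x8_SUB. The micro-kernels only accumulate an A*B tile in registers; the SAVE* macros then apply C_tile += alpha * accumulator.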
blt dgemm_kernel_L8_M4_32 KERNEL4x8_I // do one in the K KERNEL4x8_M2 // do another in the K subs counterL, counterL, #2 ble dgemm_kernel_L8_M4_22a .align 5 dgemm_kernel_L8_M4_22: KERNEL4x8_M1 KERNEL4x8_M2 subs counterL, counterL, #1 bgt dgemm_kernel_L8_M4_22 dgemm_kernel_L8_M4_22a: KERNEL4x8_M1 KERNEL4x8_E b dgemm_kernel_L8_M4_44 dgemm_kernel_L8_M4_32: tst counterL, #1 ble dgemm_kernel_L8_M4_40 KERNEL4x8_I KERNEL4x8_E b dgemm_kernel_L8_M4_44 dgemm_kernel_L8_M4_40: INIT4x8 dgemm_kernel_L8_M4_44: ands counterL , origK, #1 ble dgemm_kernel_L8_M4_100 dgemm_kernel_L8_M4_46: KERNEL4x8_SUB dgemm_kernel_L8_M4_100: SAVE4x8 dgemm_kernel_L8_M4_END: subs counterI, counterI, #1 bne dgemm_kernel_L8_M4_20 dgemm_kernel_L8_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dgemm_kernel_L8_END tst counterI, #2 // counterI = counterI / 2 ble dgemm_kernel_L8_M1_BEGIN dgemm_kernel_L8_M2_20: INIT2x8 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L8_M2_40 dgemm_kernel_L8_M2_22: KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L8_M2_22 dgemm_kernel_L8_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L8_M2_100 dgemm_kernel_L8_M2_42: KERNEL2x8_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L8_M2_42 dgemm_kernel_L8_M2_100: SAVE2x8 dgemm_kernel_L8_M2_END: dgemm_kernel_L8_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dgemm_kernel_L8_END dgemm_kernel_L8_M1_20: INIT1x8 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L8_M1_40 dgemm_kernel_L8_M1_22: KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L8_M1_22 dgemm_kernel_L8_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L8_M1_100 dgemm_kernel_L8_M1_42: KERNEL1x8_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L8_M1_42 dgemm_kernel_L8_M1_100: SAVE1x8 dgemm_kernel_L8_END: lsl temp, origK, #6 add origPB, origPB, temp // B = B + K * 8 * 8 subs counterJ, counterJ , #1 // j-- bgt dgemm_kernel_L8_BEGIN /******************************************************************************/ dgemm_kernel_L4_BEGIN: mov counterJ , origN tst counterJ , #7 ble dgemm_kernel_L999 tst counterJ , #4 ble dgemm_kernel_L2_BEGIN mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 mov pA, origPA // pA = start of A array dgemm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble dgemm_kernel_L4_M2_BEGIN dgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
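// The 4-column block repeats the same I / M1-M2 / E pipelining with the 4x4 macros. Note the packed-panel pointer arithmetic at the *_END labels: each packed B panel holds K rows of ncols doubles, so the byte advance is K * ncols * 8 (K<<6 for the 8-column block above, K<<5 for this 4-column block).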
blt dgemm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 ble dgemm_kernel_L4_M4_22a .align 5 dgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 bgt dgemm_kernel_L4_M4_22 dgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E b dgemm_kernel_L4_M4_44 dgemm_kernel_L4_M4_32: tst counterL, #1 ble dgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E b dgemm_kernel_L4_M4_44 dgemm_kernel_L4_M4_40: INIT4x4 dgemm_kernel_L4_M4_44: ands counterL , origK, #1 ble dgemm_kernel_L4_M4_100 dgemm_kernel_L4_M4_46: KERNEL4x4_SUB dgemm_kernel_L4_M4_100: SAVE4x4 dgemm_kernel_L4_M4_END: subs counterI, counterI, #1 bne dgemm_kernel_L4_M4_20 dgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble dgemm_kernel_L4_M1_BEGIN dgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L4_M2_40 dgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L4_M2_22 dgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L4_M2_100 dgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L4_M2_42 dgemm_kernel_L4_M2_100: SAVE2x4 dgemm_kernel_L4_M2_END: dgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dgemm_kernel_L4_END dgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L4_M1_40 dgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L4_M1_22 dgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L4_M1_100 dgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L4_M1_42 dgemm_kernel_L4_M1_100: SAVE1x4 dgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 /******************************************************************************/ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble dgemm_kernel_L999 // error, N was less than 4? 
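// N tail: the 8- and 4-column blocks are done, so at most 3 columns remain. The test above leaves early when N is a multiple of 4; the tests below select the 2-column pass (bit 1 of N) and then the final single column (bit 0 of N).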
tst counterJ , #2 ble dgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC add pC,pC,LDC, lsl #1 mov pA, origPA // pA = A dgemm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 ble dgemm_kernel_L2_M2_BEGIN dgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble dgemm_kernel_L2_M4_40 .align 5 dgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M4_22 dgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L2_M4_100 dgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M4_42 dgemm_kernel_L2_M4_100: SAVE4x2 dgemm_kernel_L2_M4_END: subs counterI, counterI, #1 bgt dgemm_kernel_L2_M4_20 dgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble dgemm_kernel_L2_M1_BEGIN dgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble dgemm_kernel_L2_M2_40 dgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M2_22 dgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L2_M2_100 dgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M2_42 dgemm_kernel_L2_M2_100: SAVE2x2 dgemm_kernel_L2_M2_END: dgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dgemm_kernel_L2_END dgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 ble dgemm_kernel_L2_M1_40 dgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M1_22 dgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L2_M1_100 dgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M1_42 dgemm_kernel_L2_M1_100: SAVE1x2 dgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ dgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble dgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next mov pA, origPA // pA = A dgemm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble dgemm_kernel_L1_M2_BEGIN dgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L1_M4_40 .align 5 dgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M4_22 dgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L1_M4_100 dgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M4_42 dgemm_kernel_L1_M4_100: SAVE4x1 dgemm_kernel_L1_M4_END: subs counterI, counterI, #1 bgt dgemm_kernel_L1_M4_20 dgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dgemm_kernel_L1_END tst counterI, #2 // 
counterI = counterI / 2 ble dgemm_kernel_L1_M1_BEGIN dgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L1_M2_40 dgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M2_22 dgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L1_M2_100 dgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M2_42 dgemm_kernel_L1_M2_100: SAVE2x1 dgemm_kernel_L1_M2_END: dgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dgemm_kernel_L1_END dgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L1_M1_40 dgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M1_22 dgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L1_M1_100 dgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M1_42 dgemm_kernel_L1_M1_100: SAVE1x1 dgemm_kernel_L1_END: dgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/dgemm_kernel_8x4.S000066400000000000000000000765741313527062700206600ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 X3 x4 x5 x6 */ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define temp x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pCRow3 x15 #define pA x16 #define alpha x17 #define alpha0 d10 #define alphaV0 v10.d[0] #define A_PRE_SIZE 2560 #define B_PRE_SIZE 448 #define C_PRE_SIZE 128 // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 temp // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pCRow3 // 16 pA // 17 // 18 must save // 19 must save // 20 must save // 21 must save // 22 must save // 23 must save // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp //v00 ALPHA -> pA0_0, pA0_1 //v01 pA0_2, pA0_3 //v02 pA0_4, pA0_5 //v03 pA0_6, pA0_7 //v04 pA1_0, pA1_1 //v05 pA1_2, pA1_3 //v06 pA1_4, pA1_5 //v07 pA1_6, pA1_7 //v08 must save pB0_0 //v09 must save pB0_1 //v10 must save pB0_2 --> ALPHA0 //v11 must save pB0_3 //v12 must save pB1_0 //v13 must save pB1_1 //v14 must save pB1_2 //v15 must save pB1_3 //v16 must save C00, C01 //v17 must save C02, C03 //v18 C04, C05 //v19 C06, C07 //v20 C10, C11 //v21 C12, C13 //v22 C14, C15 //v23 C16, C17 //v24 C20, C21 //v25 C22, C23 //v26 C24, C25 //v27 C26, C27 //v28 C30, C31 //v29 C32, C33 //v30 C34, C35 //v31 C36, C37 /******************************************************************************* * Macro definitions *******************************************************************************/ .macro INIT8x4 fmov d16, xzr fmov d17, xzr fmov d18, d16 fmov d19, xzr fmov d20, xzr fmov d21, d16 fmov d22, d17 fmov d23, d18 fmov d24, xzr fmov d25, d16 fmov d26, d17 fmov d27, d18 fmov d28, xzr fmov d29, d16 fmov d30, d17 fmov d31, d18 .endm .macro KERNEL8x4_I ldp q0, q1, [pA], #32 ldp d8, d9, [pB], #16 fmul v16.2d, v0.2d, v8.d[0] fmul v20.2d, v0.2d, v9.d[0] ldp d10, d11, [pB], #16 fmul v17.2d, v1.2d, v8.d[0] fmul v21.2d, v1.2d, v9.d[0] ldp q2, q3, [pA], #32 fmul v24.2d, v0.2d, v10.d[0] fmul v28.2d, v0.2d, v11.d[0] ldp q4, q5, [pA], #32 fmul v25.2d, v1.2d, v10.d[0] fmul v29.2d, v1.2d, v11.d[0] ldp d12, d13, [pB], #16 fmul v18.2d, v2.2d, v8.d[0] fmul v22.2d, v2.2d, v9.d[0] ldp d14, d15, [pB], #16 fmul v26.2d, v2.2d, v10.d[0] fmul v30.2d, v2.2d, v11.d[0] ldp q6, q7, [pA], #32 fmul v19.2d, v3.2d, v8.d[0] fmul v27.2d, v3.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmul v31.2d, v3.2d, v11.d[0] fmul v23.2d, v3.2d, v9.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL8x4_M1 fmla v16.2d, v0.2d, v8.d[0] fmla v20.2d, v0.2d, v9.d[0] ldp q4, q5, [pA], #32 fmla v24.2d, v0.2d, v10.d[0] fmla v28.2d, v0.2d, v11.d[0] ldp d12, d13, [pB], #16 fmla v17.2d, v1.2d, v8.d[0] fmla v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] fmla v21.2d, v1.2d, v9.d[0] fmla v29.2d, v1.2d, v11.d[0] ldp d14, d15, [pB], #16 fmla v18.2d, v2.2d, v8.d[0] fmla v22.2d, v2.2d, v9.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v26.2d, v2.2d, v10.d[0] fmla v30.2d, v2.2d, v11.d[0] fmla v19.2d, v3.2d, v8.d[0] fmla v23.2d, v3.2d, v9.d[0] ldp q6, q7, [pA], #32 fmla v27.2d, v3.2d, v10.d[0] fmla v31.2d, v3.2d, v11.d[0] .endm .macro KERNEL8x4_M2 
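// Second half of the pipelined pair: consume the A/B values that KERNEL8x4_M1 preloaded into v4-v7 / v12-v15 while fetching v0-v3 / v8-v11 for the next M1. KERNEL8x4_E below performs the same multiplies without the loads, draining the pipeline.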
fmla v16.2d, v4.2d, v12.d[0] fmla v20.2d, v4.2d, v13.d[0] fmla v24.2d, v4.2d, v14.d[0] fmla v28.2d, v4.2d, v15.d[0] ldp q0, q1, [pA], #32 fmla v17.2d, v5.2d, v12.d[0] fmla v25.2d, v5.2d, v14.d[0] ldp d8, d9, [pB], #16 fmla v21.2d, v5.2d, v13.d[0] fmla v29.2d, v5.2d, v15.d[0] ldp d10, d11, [pB], #16 fmla v18.2d, v6.2d, v12.d[0] fmla v22.2d, v6.2d, v13.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v26.2d, v6.2d, v14.d[0] fmla v30.2d, v6.2d, v15.d[0] fmla v19.2d, v7.2d, v12.d[0] fmla v23.2d, v7.2d, v13.d[0] ldp q2, q3, [pA], #32 fmla v27.2d, v7.2d, v14.d[0] fmla v31.2d, v7.2d, v15.d[0] .endm .macro KERNEL8x4_E fmla v16.2d, v4.2d, v12.d[0] fmla v20.2d, v4.2d, v13.d[0] fmla v24.2d, v4.2d, v14.d[0] fmla v28.2d, v4.2d, v15.d[0] fmla v17.2d, v5.2d, v12.d[0] fmla v25.2d, v5.2d, v14.d[0] fmla v21.2d, v5.2d, v13.d[0] fmla v29.2d, v5.2d, v15.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v18.2d, v6.2d, v12.d[0] fmla v22.2d, v6.2d, v13.d[0] fmla v26.2d, v6.2d, v14.d[0] fmla v30.2d, v6.2d, v15.d[0] fmla v19.2d, v7.2d, v12.d[0] fmla v23.2d, v7.2d, v13.d[0] fmla v27.2d, v7.2d, v14.d[0] fmla v31.2d, v7.2d, v15.d[0] .endm .macro KERNEL8x4_SUB ldp q0, q1, [pA], #32 ldp d8, d9, [pB], #16 fmla v16.2d, v0.2d, v8.d[0] fmla v20.2d, v0.2d, v9.d[0] ldp d10, d11, [pB], #16 fmla v17.2d, v1.2d, v8.d[0] fmla v21.2d, v1.2d, v9.d[0] ldp q2, q3, [pA], #32 fmla v24.2d, v0.2d, v10.d[0] fmla v28.2d, v0.2d, v11.d[0] fmla v25.2d, v1.2d, v10.d[0] fmla v29.2d, v1.2d, v11.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v18.2d, v2.2d, v8.d[0] fmla v22.2d, v2.2d, v9.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] fmla v26.2d, v2.2d, v10.d[0] fmla v30.2d, v2.2d, v11.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v19.2d, v3.2d, v8.d[0] fmla v27.2d, v3.2d, v10.d[0] fmla v31.2d, v3.2d, v11.d[0] fmla v23.2d, v3.2d, v9.d[0] .endm .macro SAVE8x4 fmov alpha0, alpha prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] ldp q0, q1, [pCRow0] fmla v0.2d, v16.2d, alphaV0 fmla v1.2d, v17.2d, alphaV0 stp q0, q1, [pCRow0] add pCRow0, pCRow0, #32 ldp q2, q3, [pCRow0] fmla v2.2d, v18.2d, alphaV0 fmla v3.2d, v19.2d, alphaV0 stp q2, q3, [pCRow0] add pCRow0, pCRow0, #32 prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ldp q4, q5, [pCRow1] fmla v4.2d, v20.2d, alphaV0 fmla v5.2d, v21.2d, alphaV0 stp q4, q5, [pCRow1] add pCRow1, pCRow1, #32 ldp q6, q7, [pCRow1] fmla v6.2d, v22.2d, alphaV0 fmla v7.2d, v23.2d, alphaV0 stp q6, q7, [pCRow1] add pCRow1, pCRow1, #32 prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] ldp q0, q1, [pCRow2] fmla v0.2d, v24.2d, alphaV0 fmla v1.2d, v25.2d, alphaV0 stp q0, q1, [pCRow2] add pCRow2, pCRow2, #32 ldp q2, q3, [pCRow2] fmla v2.2d, v26.2d, alphaV0 fmla v3.2d, v27.2d, alphaV0 stp q2, q3, [pCRow2] add pCRow2, pCRow2, #32 prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] ldp q4, q5, [pCRow3] fmla v4.2d, v28.2d, alphaV0 fmla v5.2d, v29.2d, alphaV0 stp q4, q5, [pCRow3] add pCRow3, pCRow3, #32 ldp q6, q7, [pCRow3] fmla v6.2d, v30.2d, alphaV0 fmla v7.2d, v31.2d, alphaV0 stp q6, q7, [pCRow3] add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT4x4 fmov d16, xzr fmov d17, d16 fmov d20, d17 fmov d21, d16 fmov d24, d17 fmov d25, d16 fmov d28, d17 fmov d29, d16 .endm .macro KERNEL4x4_SUB ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v29.2d, v1.2d, v9.d[1] fmla v20.2d, v0.2d, v8.d[1] fmla v25.2d, v1.2d, v9.d[0] fmla v24.2d, v0.2d, v9.d[0] fmla v21.2d, v1.2d, v8.d[1] fmla v28.2d, v0.2d, v9.d[1] fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 fmov alpha0, alpha ld1 
{v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow0, pCRow0, #32 ld1 {v12.2d, v13.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV0 fmla v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow1, pCRow1, #32 ld1 {v8.2d, v9.2d}, [pCRow2] fmla v8.2d, v24.2d, alphaV0 fmla v9.2d, v25.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow2] prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow2, pCRow2, #32 ld1 {v12.2d, v13.2d}, [pCRow3] fmla v12.2d, v28.2d, alphaV0 fmla v13.2d, v29.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow3] prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT2x4 fmov d16, xzr fmov d20, d16 fmov d24, d20 fmov d28, d16 .endm .macro KERNEL2x4_SUB ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld1 {v0.2d}, [pA] add pA, pA, #16 fmla v16.2d, v0.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] fmla v24.2d, v0.2d, v9.d[0] fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 fmov alpha0, alpha ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow0, pCRow0, #16 ld1 {v12.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow1, pCRow1, #16 ld1 {v8.2d}, [pCRow2] fmla v8.2d, v24.2d, alphaV0 st1 {v8.2d}, [pCRow2] prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow2, pCRow2, #16 ld1 {v12.2d}, [pCRow3] fmla v12.2d, v28.2d, alphaV0 st1 {v12.2d}, [pCRow3] prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] add pCRow3, pCRow3, #16 .endm /******************************************************************************/ .macro INIT1x4 fmov d16, xzr fmov d20, d16 .endm .macro KERNEL1x4_SUB ldr d0, [pA] add pA, pA, #8 ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 fmla v16.2d, v8.2d, v0.d[0] fmla v20.2d, v9.2d, v0.d[0] .endm .macro SAVE1x4 fmov alpha0, alpha ld1 {v8.d}[0], [pCRow0] ld1 {v8.d}[1], [pCRow1] fmla v8.2d, v16.2d, alphaV0 st1 {v8.d}[0], [pCRow0] st1 {v8.d}[1], [pCRow1] prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow0, pCRow0, #8 prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow1, pCRow1, #8 ld1 {v12.d}[0], [pCRow2] ld1 {v12.d}[1], [pCRow3] fmla v12.2d, v20.2d, alphaV0 st1 {v12.d}[0], [pCRow2] st1 {v12.d}[1], [pCRow3] prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow2, pCRow2, #8 prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] add pCRow3, pCRow3, #8 .endm /******************************************************************************/ .macro INIT8x2 fmov d16, xzr fmov d17, xzr fmov d18, d16 fmov d19, d17 fmov d20, xzr fmov d21, d16 fmov d22, d17 fmov d23, d18 .endm .macro KERNEL8x2_SUB ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 ld1 {v8.2d}, [pB] add pB, pB, #16 ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] fmla v18.2d, v2.2d, v8.d[0] fmla v19.2d, v3.2d, v8.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v20.2d, v0.2d, v8.d[1] fmla v21.2d, v1.2d, v8.d[1] fmla v22.2d, v2.2d, v8.d[1] fmla v23.2d, v3.2d, v8.d[1] .endm .macro SAVE8x2 fmov alpha0, alpha ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0 fmla v1.2d, v17.2d, alphaV0 fmla v2.2d, v18.2d, alphaV0 fmla v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow0, pCRow0, #64 ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0 fmla v5.2d, v21.2d, alphaV0 fmla v6.2d, v22.2d, alphaV0 fmla v7.2d, v23.2d, alphaV0 
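// second column of the 8x2 tile: v20-v23 scaled by alpha have been merged into v4-v7, which are stored back to the C column at pCRow1 below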
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow1, pCRow1, #64 .endm /******************************************************************************/ .macro INIT4x2 fmov d16, xzr fmov d17, d16 fmov d20, d17 fmov d21, d16 .endm .macro KERNEL4x2_SUB ld1 {v8.2d}, [pB] add pB, pB, #16 ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 fmov alpha0, alpha ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow0, pCRow0, #32 ld1 {v12.2d, v13.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV0 fmla v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow1, pCRow1, #32 .endm /******************************************************************************/ .macro INIT2x2 fmov d16, xzr fmov d20, d16 .endm .macro KERNEL2x2_SUB ld1 {v8.2d}, [pB] add pB, pB, #16 ld1 {v0.2d}, [pA] add pA, pA, #16 fmla v16.2d, v0.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 fmov alpha0, alpha ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow0, pCRow0, #16 ld1 {v12.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow1, pCRow1, #16 .endm /******************************************************************************/ .macro INIT1x2 fmov d16, xzr .endm .macro KERNEL1x2_SUB ld1 {v8.2d} , [pB] add pB , pB, #16 ldr d0 , [pA] add pA, pA, #8 fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 fmov alpha0, alpha ld1 {v8.d}[0], [pCRow0] ld1 {v8.d}[1], [pCRow1] fmla v8.2d, v16.2d, alphaV0 st1 {v8.d}[0], [pCRow0] st1 {v8.d}[1], [pCRow1] prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow0, pCRow0, #8 prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow1, pCRow1, #8 .endm /******************************************************************************/ .macro INIT8x1 fmov d16, xzr fmov d17, xzr fmov d18, d16 fmov d19, d17 .endm .macro KERNEL8x1_SUB ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 ldr d8, [pB] add pB , pB, #8 ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v18.2d, v2.2d, v8.d[0] fmla v19.2d, v3.2d, v8.d[0] .endm .macro SAVE8x1 fmov alpha0, alpha ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0 fmla v1.2d, v17.2d, alphaV0 fmla v2.2d, v18.2d, alphaV0 fmla v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT4x1 fmov d16, xzr fmov d17, d16 .endm .macro KERNEL4x1_SUB ldr d8, [pB] add pB , pB, #8 ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 fmov alpha0, alpha ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x1 fmov d16, xzr .endm .macro KERNEL2x1_SUB ldr d8, [pB] add pB , pB, #8 ld1 {v0.2d}, [pA] add pA , pA, #16 fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 fmov alpha0, alpha ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, 
alphaV0 st1 {v8.2d}, [pCRow0] prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x1 fmov d16, xzr .endm .macro KERNEL1x1_SUB ldr d8, [pB] add pB , pB, #8 ldr d0, [pA] add pA , pA, #8 fmadd d16, d0, d8, d16 .endm .macro SAVE1x1 fmov alpha0, alpha ldr d8, [pCRow0] fmadd d8, d16, alpha0, d8 str d8, [pCRow0] prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow0, pCRow0, #8 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPA] fmov alpha, d0 lsl LDC, LDC, #3 // ldc = ldc * 8 mov pB, origPB mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble dgemm_kernel_L2_BEGIN /******************************************************************************/ .align 5 dgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC add pCRow3, pCRow2, LDC add pC, pCRow3, LDC mov pA, origPA // pA = start of A array dgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble dgemm_kernel_L4_M4_BEGIN .align 5 dgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #3 // L = K / 8 cmp counterL , #2 // is there at least 4 to do? 
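// Here counterL = K/8 blocks of eight pipelined iterations; the check above requires at least two such blocks (K >= 16) before entering the fully unrolled path, otherwise it branches to the shorter *_32 / *_40 start-up paths below.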
blt dgemm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 ble dgemm_kernel_L4_M8_22a .align 5 dgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 bgt dgemm_kernel_L4_M8_22 .align 5 dgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b dgemm_kernel_L4_M8_44 .align 5 dgemm_kernel_L4_M8_32: tst counterL, #1 ble dgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b dgemm_kernel_L4_M8_44 dgemm_kernel_L4_M8_40: INIT8x4 dgemm_kernel_L4_M8_44: ands counterL , origK, #7 ble dgemm_kernel_L4_M8_100 .align 5 dgemm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 bne dgemm_kernel_L4_M8_46 dgemm_kernel_L4_M8_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE8x4 dgemm_kernel_L4_M8_END: subs counterI, counterI, #1 bne dgemm_kernel_L4_M8_20 dgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 ble dgemm_kernel_L4_END tst counterI, #4 ble dgemm_kernel_L4_M2_BEGIN dgemm_kernel_L4_M4_20: INIT4x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L4_M4_40 .align 5 dgemm_kernel_L4_M4_22: KERNEL4x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL4x4_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL4x4_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL4x4_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL4x4_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L4_M4_22 dgemm_kernel_L4_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L4_M4_100 dgemm_kernel_L4_M4_42: KERNEL4x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L4_M4_42 dgemm_kernel_L4_M4_100: SAVE4x4 dgemm_kernel_L4_M4_END: dgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble dgemm_kernel_L4_M1_BEGIN dgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L4_M2_40 .align 5 dgemm_kernel_L4_M2_22: KERNEL2x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL2x4_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL2x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL2x4_SUB KERNEL2x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL2x4_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL2x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL2x4_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L4_M2_22 dgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L4_M2_100 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] dgemm_kernel_L4_M2_42: KERNEL2x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L4_M2_42 dgemm_kernel_L4_M2_100: SAVE2x4 dgemm_kernel_L4_M2_END: dgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dgemm_kernel_L4_END dgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L4_M1_40 .align 5 dgemm_kernel_L4_M1_22: KERNEL1x4_SUB prfm 
PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x4_SUB KERNEL1x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x4_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL1x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x4_SUB KERNEL1x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x4_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L4_M1_22 dgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L4_M1_100 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] dgemm_kernel_L4_M1_42: KERNEL1x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L4_M1_42 dgemm_kernel_L4_M1_100: SAVE1x4 dgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- bgt dgemm_kernel_L4_BEGIN /******************************************************************************/ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble dgemm_kernel_L999 // error, N was less than 4? tst counterJ , #2 ble dgemm_kernel_L1_BEGIN mov pCRow0, pC add pCRow1, pCRow0, LDC add pC, pCRow1, LDC mov pA, origPA // pA = A dgemm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble dgemm_kernel_L2_M4_BEGIN .align 5 dgemm_kernel_L2_M8_20: INIT8x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble dgemm_kernel_L2_M8_40 .align 5 dgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL8x2_SUB KERNEL8x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M8_22 dgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L2_M8_100 prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] dgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M8_42 dgemm_kernel_L2_M8_100: SAVE8x2 dgemm_kernel_L2_M8_END: subs counterI, counterI, #1 bgt dgemm_kernel_L2_M8_20 dgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 ble dgemm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 ble dgemm_kernel_L2_M2_BEGIN dgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble dgemm_kernel_L2_M4_40 .align 5 dgemm_kernel_L2_M4_22: KERNEL4x2_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x2_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL4x2_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x2_SUB KERNEL4x2_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x2_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL4x2_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M4_22 dgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L2_M4_100 prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] dgemm_kernel_L2_M4_42: KERNEL4x2_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L2_M4_42 dgemm_kernel_L2_M4_100: SAVE4x2 dgemm_kernel_L2_M4_END: dgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble dgemm_kernel_L2_M1_BEGIN dgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble dgemm_kernel_L2_M2_40 dgemm_kernel_L2_M2_22: KERNEL2x2_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL2x2_SUB KERNEL2x2_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] 
KERNEL2x2_SUB KERNEL2x2_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL2x2_SUB KERNEL2x2_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL2x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M2_22 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] dgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L2_M2_100 dgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M2_42 dgemm_kernel_L2_M2_100: SAVE2x2 dgemm_kernel_L2_M2_END: dgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dgemm_kernel_L2_END dgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 ble dgemm_kernel_L2_M1_40 dgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x2_SUB KERNEL1x2_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL1x2_SUB KERNEL1x2_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M1_22 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] dgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L2_M1_100 dgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M1_42 dgemm_kernel_L2_M1_100: SAVE1x2 dgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ dgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble dgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next mov pA, origPA // pA = A dgemm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble dgemm_kernel_L1_M4_BEGIN .align 5 dgemm_kernel_L1_M8_20: INIT8x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L1_M8_40 .align 5 dgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M8_22 dgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L1_M8_100 prfm PLDL1KEEP, [pB, #B_PRE_SIZE] dgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M8_42 dgemm_kernel_L1_M8_100: SAVE8x1 dgemm_kernel_L1_M8_END: subs counterI, counterI, #1 bgt dgemm_kernel_L1_M8_20 dgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 ble dgemm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 ble dgemm_kernel_L1_M2_BEGIN dgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L1_M4_40 .align 5 dgemm_kernel_L1_M4_22: KERNEL4x1_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x1_SUB KERNEL4x1_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x1_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL4x1_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x1_SUB KERNEL4x1_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M4_22 dgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L1_M4_100 prfm PLDL1KEEP, [pB, #B_PRE_SIZE] dgemm_kernel_L1_M4_42: KERNEL4x1_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, 
counterL, #1 bgt dgemm_kernel_L1_M4_42 dgemm_kernel_L1_M4_100: SAVE4x1 dgemm_kernel_L1_M4_END: dgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 ble dgemm_kernel_L1_M1_BEGIN dgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L1_M2_40 dgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL2x1_SUB KERNEL2x1_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL2x1_SUB KERNEL2x1_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M2_22 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] dgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L1_M2_100 dgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M2_42 dgemm_kernel_L1_M2_100: SAVE2x1 dgemm_kernel_L1_M2_END: dgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dgemm_kernel_L1_END dgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L1_M1_40 dgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M1_22 dgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L1_M1_100 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] dgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M1_42 dgemm_kernel_L1_M1_100: SAVE1x1 dgemm_kernel_L1_END: dgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S000066400000000000000000001026211313527062700232100ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 X3 x4 x5 x6 */ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define temp x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pCRow3 x15 #define pA x16 #define alpha x17 #define alpha0 d10 #define alphaV0 v10.d[0] #define A_PRE_SIZE x20 #define B_PRE_SIZE x21 #define C_PRE_SIZE x22 #define A_PRE_SIZE_64 x23 #define B_PRE_SIZE_64 x24 // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 temp // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pCRow3 // 16 pA // 17 // 18 must save // 19 must save // 20 must save // 21 must save // 22 must save // 23 must save // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp //v00 ALPHA -> pA0_0, pA0_1 //v01 pA0_2, pA0_3 //v02 pA0_4, pA0_5 //v03 pA0_6, pA0_7 //v04 pA1_0, pA1_1 //v05 pA1_2, pA1_3 //v06 pA1_4, pA1_5 //v07 pA1_6, pA1_7 //v08 must save pB0_0 //v09 must save pB0_1 //v10 must save pB0_2 --> ALPHA0 //v11 must save pB0_3 //v12 must save pB1_0 //v13 must save pB1_1 //v14 must save pB1_2 //v15 must save pB1_3 //v16 must save C00, C01 //v17 must save C02, C03 //v18 C04, C05 //v19 C06, C07 //v20 C10, C11 //v21 C12, C13 //v22 C14, C15 //v23 C16, C17 //v24 C20, C21 //v25 C22, C23 //v26 C24, C25 //v27 C26, C27 //v28 C30, C31 //v29 C32, C33 //v30 C34, C35 //v31 C36, C37 /******************************************************************************* * Macro definitions *******************************************************************************/ .macro INIT8x4 fmov d16, xzr fmov d17, xzr fmov d18, d16 fmov d19, xzr fmov d20, xzr fmov d21, d16 fmov d22, d17 fmov d23, d18 fmov d24, xzr fmov d25, d16 fmov d26, d17 fmov d27, d18 fmov d28, xzr fmov d29, d16 fmov d30, d17 fmov d31, d18 .endm .macro KERNEL8x4_I ldp q0, q1, [pA] ldp q8, q9, [pB] ldp q2, q3, [pA, #32] ldp q4, q5, [pA, #64] ldp q12, q13, [pB, #32] ldp q6, q7, [pA, #96] fmul v16.2d, v0.2d, v8.d[0] fmul v20.2d, v0.2d, v8.d[1] fmul v17.2d, v1.2d, v8.d[0] fmul v21.2d, v1.2d, v8.d[1] add pA, pA, #128 add pB, pB, #64 fmul v24.2d, v0.2d, v9.d[0] fmul v28.2d, v0.2d, v9.d[1] fmul v25.2d, v1.2d, v9.d[0] fmul v29.2d, v1.2d, v9.d[1] prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] fmul v18.2d, v2.2d, v8.d[0] fmul v22.2d, v2.2d, v8.d[1] fmul v26.2d, v2.2d, v9.d[0] fmul v30.2d, v2.2d, v9.d[1] fmul v19.2d, v3.2d, v8.d[0] fmul v27.2d, v3.2d, v9.d[0] fmul v31.2d, v3.2d, v9.d[1] fmul v23.2d, v3.2d, v8.d[1] .endm .macro KERNEL8x4_M1_M2 ldp q12, q13, [pB] ldp q4, q5, [pA] ldp q6, q7, [pA, #32] fmla v16.2d, v0.2d, v8.d[0] fmla v20.2d, v0.2d, 
v8.d[1] fmla v24.2d, v0.2d, v9.d[0] fmla v28.2d, v0.2d, v9.d[1] prfm PLDL1KEEP, [pA, A_PRE_SIZE] fmla v17.2d, v1.2d, v8.d[0] fmla v25.2d, v1.2d, v9.d[0] fmla v21.2d, v1.2d, v8.d[1] fmla v29.2d, v1.2d, v9.d[1] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] fmla v18.2d, v2.2d, v8.d[0] fmla v22.2d, v2.2d, v8.d[1] fmla v26.2d, v2.2d, v9.d[0] fmla v30.2d, v2.2d, v9.d[1] prfm PLDL1KEEP, [pA, #3840] fmla v19.2d, v3.2d, v8.d[0] fmla v23.2d, v3.2d, v8.d[1] fmla v27.2d, v3.2d, v9.d[0] fmla v31.2d, v3.2d, v9.d[1] ldp q8, q9, [pB, #32] ldp q0, q1, [pA, #64] ldp q2, q3, [pA, #96] fmla v16.2d, v4.2d, v12.d[0] fmla v20.2d, v4.2d, v12.d[1] fmla v24.2d, v4.2d, v13.d[0] fmla v28.2d, v4.2d, v13.d[1] prfm PLDL1KEEP, [pB, B_PRE_SIZE] fmla v17.2d, v5.2d, v12.d[0] fmla v25.2d, v5.2d, v13.d[0] fmla v21.2d, v5.2d, v12.d[1] fmla v29.2d, v5.2d, v13.d[1] fmla v18.2d, v6.2d, v12.d[0] fmla v22.2d, v6.2d, v12.d[1] fmla v26.2d, v6.2d, v13.d[0] fmla v30.2d, v6.2d, v13.d[1] add pB, pB, #64 add pA, pA, #128 fmla v19.2d, v7.2d, v12.d[0] fmla v23.2d, v7.2d, v12.d[1] fmla v27.2d, v7.2d, v13.d[0] fmla v31.2d, v7.2d, v13.d[1] .endm .macro KERNEL8x4_M1 ldp q12, q13, [pB] ldp q4, q5, [pA] ldp q6, q7, [pA, #32] fmla v16.2d, v0.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] fmla v24.2d, v0.2d, v9.d[0] fmla v28.2d, v0.2d, v9.d[1] prfm PLDL1KEEP, [pA, A_PRE_SIZE] fmla v17.2d, v1.2d, v8.d[0] fmla v25.2d, v1.2d, v9.d[0] fmla v21.2d, v1.2d, v8.d[1] fmla v29.2d, v1.2d, v9.d[1] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] fmla v18.2d, v2.2d, v8.d[0] fmla v22.2d, v2.2d, v8.d[1] fmla v26.2d, v2.2d, v9.d[0] fmla v30.2d, v2.2d, v9.d[1] add pB, pB, #32 add pA, pA, #64 fmla v19.2d, v3.2d, v8.d[0] fmla v23.2d, v3.2d, v8.d[1] fmla v27.2d, v3.2d, v9.d[0] fmla v31.2d, v3.2d, v9.d[1] .endm .macro KERNEL8x4_M2 ldp q8, q9, [pB] ldp q0, q1, [pA] ldp q2, q3, [pA, #32] fmla v16.2d, v4.2d, v12.d[0] fmla v20.2d, v4.2d, v12.d[1] fmla v24.2d, v4.2d, v13.d[0] fmla v28.2d, v4.2d, v13.d[1] prfm PLDL1KEEP, [pB, B_PRE_SIZE] fmla v17.2d, v5.2d, v12.d[0] fmla v25.2d, v5.2d, v13.d[0] fmla v21.2d, v5.2d, v12.d[1] fmla v29.2d, v5.2d, v13.d[1] fmla v18.2d, v6.2d, v12.d[0] fmla v22.2d, v6.2d, v12.d[1] fmla v26.2d, v6.2d, v13.d[0] fmla v30.2d, v6.2d, v13.d[1] add pB, pB, #32 add pA, pA, #64 fmla v19.2d, v7.2d, v12.d[0] fmla v23.2d, v7.2d, v12.d[1] fmla v27.2d, v7.2d, v13.d[0] fmla v31.2d, v7.2d, v13.d[1] .endm .macro KERNEL8x4_E fmla v16.2d, v4.2d, v12.d[0] fmla v20.2d, v4.2d, v12.d[1] fmla v24.2d, v4.2d, v13.d[0] fmla v28.2d, v4.2d, v13.d[1] prfm PLDL1KEEP, [pB, B_PRE_SIZE] fmla v17.2d, v5.2d, v12.d[0] fmla v25.2d, v5.2d, v13.d[0] fmla v21.2d, v5.2d, v12.d[1] fmla v29.2d, v5.2d, v13.d[1] fmla v18.2d, v6.2d, v12.d[0] fmla v22.2d, v6.2d, v12.d[1] fmla v26.2d, v6.2d, v13.d[0] fmla v30.2d, v6.2d, v13.d[1] fmla v19.2d, v7.2d, v12.d[0] fmla v23.2d, v7.2d, v12.d[1] fmla v27.2d, v7.2d, v13.d[0] fmla v31.2d, v7.2d, v13.d[1] .endm .macro KERNEL8x4_SUB ldp q0, q1, [pA] ldp q8, q9, [pB] ldp q2, q3, [pA, #32] prfm PLDL1KEEP, [pA, A_PRE_SIZE] fmla v16.2d, v0.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] fmla v17.2d, v1.2d, v8.d[0] fmla v21.2d, v1.2d, v8.d[1] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] fmla v24.2d, v0.2d, v9.d[0] fmla v28.2d, v0.2d, v9.d[1] fmla v25.2d, v1.2d, v9.d[0] fmla v29.2d, v1.2d, v9.d[1] prfm PLDL1KEEP, [pB, B_PRE_SIZE] fmla v18.2d, v2.2d, v8.d[0] fmla v22.2d, v2.2d, v8.d[1] fmla v26.2d, v2.2d, v9.d[0] fmla v30.2d, v2.2d, v9.d[1] add pB, pB, #32 add pA, pA, #64 fmla v19.2d, v3.2d, v8.d[0] fmla v27.2d, v3.2d, v9.d[0] fmla v31.2d, v3.2d, v9.d[1] fmla v23.2d, v3.2d, v8.d[1] .endm .macro SAVE8x4 fmov 
alpha0, alpha ldr q0, [pCRow0] ldr q1, [pCRow0, #16] ldr q2, [pCRow0, #32] ldr q3, [pCRow0, #48] ldr q4, [pCRow1] ldr q5, [pCRow1, #16] ldr q6, [pCRow1, #32] ldr q7, [pCRow1, #48] fmla v0.2d, v16.2d, alphaV0 fmla v1.2d, v17.2d, alphaV0 stp q0, q1, [pCRow0] fmla v2.2d, v18.2d, alphaV0 fmla v3.2d, v19.2d, alphaV0 stp q2, q3, [pCRow0, #32] ldr q0, [pCRow2] ldr q1, [pCRow2, #16] fmla v4.2d, v20.2d, alphaV0 fmla v5.2d, v21.2d, alphaV0 stp q4, q5, [pCRow1] ldr q2, [pCRow2, #32] ldr q3, [pCRow2, #48] fmla v6.2d, v22.2d, alphaV0 fmla v7.2d, v23.2d, alphaV0 stp q6, q7, [pCRow1, #32] ldr q4, [pCRow3] ldr q5, [pCRow3, #16] fmla v0.2d, v24.2d, alphaV0 fmla v1.2d, v25.2d, alphaV0 stp q0, q1, [pCRow2] ldr q6, [pCRow3, #32] ldr q7, [pCRow3, #48] fmla v2.2d, v26.2d, alphaV0 fmla v3.2d, v27.2d, alphaV0 stp q2, q3, [pCRow2, #32] fmla v4.2d, v28.2d, alphaV0 fmla v5.2d, v29.2d, alphaV0 stp q4, q5, [pCRow3] fmla v6.2d, v30.2d, alphaV0 fmla v7.2d, v31.2d, alphaV0 stp q6, q7, [pCRow3, #32] add pCRow0, pCRow0, #64 add pCRow1, pCRow1, #64 add pCRow2, pCRow2, #64 add pCRow3, pCRow3, #64 .endm /******************************************************************************/ .macro INIT4x4 fmov d16, xzr fmov d17, d16 fmov d20, d17 fmov d21, d16 fmov d24, d17 fmov d25, d16 fmov d28, d17 fmov d29, d16 .endm .macro KERNEL4x4_SUB ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v29.2d, v1.2d, v9.d[1] fmla v20.2d, v0.2d, v8.d[1] fmla v25.2d, v1.2d, v9.d[0] fmla v24.2d, v0.2d, v9.d[0] fmla v21.2d, v1.2d, v8.d[1] fmla v28.2d, v0.2d, v9.d[1] fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 fmov alpha0, alpha ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE] add pCRow0, pCRow0, #32 ld1 {v12.2d, v13.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV0 fmla v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE] add pCRow1, pCRow1, #32 ld1 {v8.2d, v9.2d}, [pCRow2] fmla v8.2d, v24.2d, alphaV0 fmla v9.2d, v25.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow2] prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE] add pCRow2, pCRow2, #32 ld1 {v12.2d, v13.2d}, [pCRow3] fmla v12.2d, v28.2d, alphaV0 fmla v13.2d, v29.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow3] prfm PLDL2KEEP, [pCRow3, C_PRE_SIZE] add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT2x4 fmov d16, xzr fmov d20, d16 fmov d24, d20 fmov d28, d16 .endm .macro KERNEL2x4_SUB ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld1 {v0.2d}, [pA] add pA, pA, #16 fmla v16.2d, v0.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] fmla v24.2d, v0.2d, v9.d[0] fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 fmov alpha0, alpha ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE] add pCRow0, pCRow0, #16 ld1 {v12.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE] add pCRow1, pCRow1, #16 ld1 {v8.2d}, [pCRow2] fmla v8.2d, v24.2d, alphaV0 st1 {v8.2d}, [pCRow2] prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE] add pCRow2, pCRow2, #16 ld1 {v12.2d}, [pCRow3] fmla v12.2d, v28.2d, alphaV0 st1 {v12.2d}, [pCRow3] prfm PLDL2KEEP, [pCRow3, C_PRE_SIZE] add pCRow3, pCRow3, #16 .endm /******************************************************************************/ .macro INIT1x4 fmov d16, xzr fmov d20, d16 .endm .macro KERNEL1x4_SUB ldr d0, [pA] add pA, pA, #8 ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 
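// one A element (d0) is broadcast against four packed B values (v8, v9); v16 and v20 accumulate the 1x4 strip of C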
fmla v16.2d, v8.2d, v0.d[0] fmla v20.2d, v9.2d, v0.d[0] .endm .macro SAVE1x4 fmov alpha0, alpha ld1 {v8.d}[0], [pCRow0] ld1 {v8.d}[1], [pCRow1] fmla v8.2d, v16.2d, alphaV0 st1 {v8.d}[0], [pCRow0] st1 {v8.d}[1], [pCRow1] prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE] add pCRow0, pCRow0, #8 prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE] add pCRow1, pCRow1, #8 ld1 {v12.d}[0], [pCRow2] ld1 {v12.d}[1], [pCRow3] fmla v12.2d, v20.2d, alphaV0 st1 {v12.d}[0], [pCRow2] st1 {v12.d}[1], [pCRow3] prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE] add pCRow2, pCRow2, #8 prfm PLDL2KEEP, [pCRow3, C_PRE_SIZE] add pCRow3, pCRow3, #8 .endm /******************************************************************************/ .macro INIT8x2 fmov d16, xzr fmov d17, xzr fmov d18, d16 fmov d19, d17 fmov d20, xzr fmov d21, d16 fmov d22, d17 fmov d23, d18 .endm .macro KERNEL8x2_SUB ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 ld1 {v8.2d}, [pB] add pB, pB, #16 ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] fmla v18.2d, v2.2d, v8.d[0] fmla v19.2d, v3.2d, v8.d[0] prfm PLDL1KEEP, [pA, A_PRE_SIZE] fmla v20.2d, v0.2d, v8.d[1] fmla v21.2d, v1.2d, v8.d[1] fmla v22.2d, v2.2d, v8.d[1] fmla v23.2d, v3.2d, v8.d[1] .endm .macro SAVE8x2 fmov alpha0, alpha ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0 fmla v1.2d, v17.2d, alphaV0 fmla v2.2d, v18.2d, alphaV0 fmla v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE] add pCRow0, pCRow0, #64 ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0 fmla v5.2d, v21.2d, alphaV0 fmla v6.2d, v22.2d, alphaV0 fmla v7.2d, v23.2d, alphaV0 st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE] add pCRow1, pCRow1, #64 .endm /******************************************************************************/ .macro INIT4x2 fmov d16, xzr fmov d17, d16 fmov d20, d17 fmov d21, d16 .endm .macro KERNEL4x2_SUB ld1 {v8.2d}, [pB] add pB, pB, #16 ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 fmov alpha0, alpha ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE] add pCRow0, pCRow0, #32 ld1 {v12.2d, v13.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV0 fmla v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE] add pCRow1, pCRow1, #32 .endm /******************************************************************************/ .macro INIT2x2 fmov d16, xzr fmov d20, d16 .endm .macro KERNEL2x2_SUB ld1 {v8.2d}, [pB] add pB, pB, #16 ld1 {v0.2d}, [pA] add pA, pA, #16 fmla v16.2d, v0.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 fmov alpha0, alpha ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE] add pCRow0, pCRow0, #16 ld1 {v12.2d}, [pCRow1] fmla v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE] add pCRow1, pCRow1, #16 .endm /******************************************************************************/ .macro INIT1x2 fmov d16, xzr .endm .macro KERNEL1x2_SUB ld1 {v8.2d} , [pB] add pB , pB, #16 ldr d0 , [pA] add pA, pA, #8 fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 fmov alpha0, alpha ld1 {v8.d}[0], [pCRow0] ld1 {v8.d}[1], [pCRow1] fmla v8.2d, v16.2d, alphaV0 st1 {v8.d}[0], [pCRow0] st1 {v8.d}[1], [pCRow1] prfm PLDL2KEEP, [pCRow0, 
C_PRE_SIZE] add pCRow0, pCRow0, #8 prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE] add pCRow1, pCRow1, #8 .endm /******************************************************************************/ .macro INIT8x1 fmov d16, xzr fmov d17, xzr fmov d18, d16 fmov d19, d17 .endm .macro KERNEL8x1_SUB ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 ldr d8, [pB] add pB , pB, #8 ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] prfm PLDL1KEEP, [pA, A_PRE_SIZE] fmla v18.2d, v2.2d, v8.d[0] fmla v19.2d, v3.2d, v8.d[0] .endm .macro SAVE8x1 fmov alpha0, alpha ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0 fmla v1.2d, v17.2d, alphaV0 fmla v2.2d, v18.2d, alphaV0 fmla v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT4x1 fmov d16, xzr fmov d17, d16 .endm .macro KERNEL4x1_SUB ldr d8, [pB] add pB , pB, #8 ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 fmov alpha0, alpha ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x1 fmov d16, xzr .endm .macro KERNEL2x1_SUB ldr d8, [pB] add pB , pB, #8 ld1 {v0.2d}, [pA] add pA , pA, #16 fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 fmov alpha0, alpha ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x1 fmov d16, xzr .endm .macro KERNEL1x1_SUB ldr d8, [pB] add pB , pB, #8 ldr d0, [pA] add pA , pA, #8 fmadd d16, d0, d8, d16 .endm .macro SAVE1x1 fmov alpha0, alpha ldr d8, [pCRow0] fmadd d8, d16, alpha0, d8 str d8, [pCRow0] prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE] add pCRow0, pCRow0, #8 .endm .macro KERNEL8x4_M1_M2_x1 KERNEL8x4_M1_M2 .endm .macro KERNEL8x4_M1_M2_x2 KERNEL8x4_M1_M2_x1 KERNEL8x4_M1_M2_x1 .endm .macro KERNEL8x4_M1_M2_x4 KERNEL8x4_M1_M2_x2 KERNEL8x4_M1_M2_x2 .endm .macro KERNEL8x4_M1_M2_x8 KERNEL8x4_M1_M2_x4 KERNEL8x4_M1_M2_x4 .endm .macro KERNEL8x4_M1_M2_x16 KERNEL8x4_M1_M2_x8 KERNEL8x4_M1_M2_x8 .endm .macro KERNEL8x4_M1_M2_x32 KERNEL8x4_M1_M2_x16 KERNEL8x4_M1_M2_x16 .endm .macro KERNEL8x4_M1_M2_x64 KERNEL8x4_M1_M2_x32 KERNEL8x4_M1_M2_x32 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPA] ldr A_PRE_SIZE, =dgemm_prefetch_size_a ldr A_PRE_SIZE, [A_PRE_SIZE] ldr B_PRE_SIZE, =dgemm_prefetch_size_b ldr B_PRE_SIZE, [B_PRE_SIZE] ldr C_PRE_SIZE, =dgemm_prefetch_size_c ldr C_PRE_SIZE, [C_PRE_SIZE] add A_PRE_SIZE_64, A_PRE_SIZE, #64 add B_PRE_SIZE_64, B_PRE_SIZE, #64 fmov alpha, d0 lsl LDC, LDC, #3 // ldc = ldc * 8 mov pB, origPB mov 
counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble dgemm_kernel_L2_BEGIN /******************************************************************************/ .align 5 dgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC add pCRow3, pCRow2, LDC add pC, pCRow3, LDC mov pA, origPA // pA = start of A array dgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble dgemm_kernel_L4_M4_BEGIN .align 5 dgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #7 // L = K / 128 cmp counterL , #2 // is there at least 4 to do? blt dgemm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 KERNEL8x4_M1_M2_x32 KERNEL8x4_M1_M2_x16 KERNEL8x4_M1_M2_x8 KERNEL8x4_M1_M2_x4 KERNEL8x4_M1_M2_x2 KERNEL8x4_M1_M2_x1 subs counterL, counterL, #2 // subtract 2 ble dgemm_kernel_L4_M8_22a .align 5 dgemm_kernel_L4_M8_22: KERNEL8x4_M1_M2_x64 subs counterL, counterL, #1 bgt dgemm_kernel_L4_M8_22 .align 5 dgemm_kernel_L4_M8_22a: KERNEL8x4_M1_M2_x32 KERNEL8x4_M1_M2_x16 KERNEL8x4_M1_M2_x8 KERNEL8x4_M1_M2_x4 KERNEL8x4_M1_M2_x2 KERNEL8x4_M1_M2_x1 KERNEL8x4_M1 KERNEL8x4_E b dgemm_kernel_L4_M8_44 .align 5 dgemm_kernel_L4_M8_32: tst counterL, #1 ble dgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 KERNEL8x4_M1_M2_x32 KERNEL8x4_M1_M2_x16 KERNEL8x4_M1_M2_x8 KERNEL8x4_M1_M2_x4 KERNEL8x4_M1_M2_x2 KERNEL8x4_M1 KERNEL8x4_E b dgemm_kernel_L4_M8_44 dgemm_kernel_L4_M8_40: INIT8x4 dgemm_kernel_L4_M8_44: ands counterL , origK, #127 ble dgemm_kernel_L4_M8_100 .align 5 dgemm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 bne dgemm_kernel_L4_M8_46 dgemm_kernel_L4_M8_100: prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE] prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE] prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE] prfm PLDL2KEEP, [pCRow3, C_PRE_SIZE] prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE8x4 dgemm_kernel_L4_M8_END: subs counterI, counterI, #1 bne dgemm_kernel_L4_M8_20 dgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 ble dgemm_kernel_L4_END tst counterI, #4 ble dgemm_kernel_L4_M2_BEGIN dgemm_kernel_L4_M4_20: INIT4x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L4_M4_40 .align 5 dgemm_kernel_L4_M4_22: KERNEL4x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL4x4_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL4x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL4x4_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL4x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL4x4_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL4x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL4x4_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L4_M4_22 dgemm_kernel_L4_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L4_M4_100 dgemm_kernel_L4_M4_42: KERNEL4x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L4_M4_42 dgemm_kernel_L4_M4_100: SAVE4x4 dgemm_kernel_L4_M4_END: dgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble dgemm_kernel_L4_M1_BEGIN dgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L4_M2_40 .align 5 dgemm_kernel_L4_M2_22: KERNEL2x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL2x4_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL2x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL2x4_SUB KERNEL2x4_SUB prfm PLDL1KEEP, [pB, 
B_PRE_SIZE] KERNEL2x4_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL2x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL2x4_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L4_M2_22 dgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L4_M2_100 prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] dgemm_kernel_L4_M2_42: KERNEL2x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L4_M2_42 dgemm_kernel_L4_M2_100: SAVE2x4 dgemm_kernel_L4_M2_END: dgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dgemm_kernel_L4_END dgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L4_M1_40 .align 5 dgemm_kernel_L4_M1_22: KERNEL1x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL1x4_SUB KERNEL1x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL1x4_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL1x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL1x4_SUB KERNEL1x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL1x4_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L4_M1_22 dgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L4_M1_100 prfm PLDL1KEEP, [pA, A_PRE_SIZE] dgemm_kernel_L4_M1_42: KERNEL1x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L4_M1_42 dgemm_kernel_L4_M1_100: SAVE1x4 dgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- bgt dgemm_kernel_L4_BEGIN /******************************************************************************/ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble dgemm_kernel_L999 // error, N was less than 4? 
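// Note: reaching dgemm_kernel_L999 from the test above is the normal exit
// for N divisible by 4 (the dgemm_kernel_L4_* loop has already covered every
// column), not an error.  The dgemm_kernel_L2_* and dgemm_kernel_L1_* code
// below handles the N mod 4 tail: two remaining columns first, then one.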
tst counterJ , #2 ble dgemm_kernel_L1_BEGIN mov pCRow0, pC add pCRow1, pCRow0, LDC add pC, pCRow1, LDC mov pA, origPA // pA = A dgemm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble dgemm_kernel_L2_M4_BEGIN .align 5 dgemm_kernel_L2_M8_20: INIT8x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble dgemm_kernel_L2_M8_40 .align 5 dgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL8x2_SUB KERNEL8x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M8_22 dgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L2_M8_100 prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] dgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M8_42 dgemm_kernel_L2_M8_100: SAVE8x2 dgemm_kernel_L2_M8_END: subs counterI, counterI, #1 bgt dgemm_kernel_L2_M8_20 dgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 ble dgemm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 ble dgemm_kernel_L2_M2_BEGIN dgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble dgemm_kernel_L2_M4_40 .align 5 dgemm_kernel_L2_M4_22: KERNEL4x2_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL4x2_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL4x2_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL4x2_SUB KERNEL4x2_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL4x2_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL4x2_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL4x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M4_22 dgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L2_M4_100 prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] dgemm_kernel_L2_M4_42: KERNEL4x2_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L2_M4_42 dgemm_kernel_L2_M4_100: SAVE4x2 dgemm_kernel_L2_M4_END: dgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble dgemm_kernel_L2_M1_BEGIN dgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble dgemm_kernel_L2_M2_40 dgemm_kernel_L2_M2_22: KERNEL2x2_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL2x2_SUB KERNEL2x2_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL2x2_SUB KERNEL2x2_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL2x2_SUB KERNEL2x2_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL2x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M2_22 prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] dgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L2_M2_100 dgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M2_42 dgemm_kernel_L2_M2_100: SAVE2x2 dgemm_kernel_L2_M2_END: dgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dgemm_kernel_L2_END dgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 ble dgemm_kernel_L2_M1_40 dgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL1x2_SUB KERNEL1x2_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL1x2_SUB KERNEL1x2_SUB prfm PLDL1KEEP, [pB, 
B_PRE_SIZE] KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M1_22 prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] dgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L2_M1_100 dgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L2_M1_42 dgemm_kernel_L2_M1_100: SAVE1x2 dgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ dgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble dgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next mov pA, origPA // pA = A dgemm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble dgemm_kernel_L1_M4_BEGIN .align 5 dgemm_kernel_L1_M8_20: INIT8x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L1_M8_40 .align 5 dgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M8_22 dgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L1_M8_100 prfm PLDL1KEEP, [pB, B_PRE_SIZE] dgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M8_42 dgemm_kernel_L1_M8_100: SAVE8x1 dgemm_kernel_L1_M8_END: subs counterI, counterI, #1 bgt dgemm_kernel_L1_M8_20 dgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 ble dgemm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 ble dgemm_kernel_L1_M2_BEGIN dgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L1_M4_40 .align 5 dgemm_kernel_L1_M4_22: KERNEL4x1_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL4x1_SUB KERNEL4x1_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL4x1_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL4x1_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL4x1_SUB KERNEL4x1_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL4x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M4_22 dgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L1_M4_100 prfm PLDL1KEEP, [pB, B_PRE_SIZE] dgemm_kernel_L1_M4_42: KERNEL4x1_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] subs counterL, counterL, #1 bgt dgemm_kernel_L1_M4_42 dgemm_kernel_L1_M4_100: SAVE4x1 dgemm_kernel_L1_M4_END: dgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 ble dgemm_kernel_L1_M1_BEGIN dgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L1_M2_40 dgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL2x1_SUB KERNEL2x1_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL2x1_SUB KERNEL2x1_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M2_22 prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] prfm PLDL1KEEP, [pB, B_PRE_SIZE] dgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L1_M2_100 dgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M2_42 dgemm_kernel_L1_M2_100: SAVE2x1 
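// SAVE2x1 above, like every SAVEmxn macro in this file, folds the accumulator
// registers back into the C tile as C := C + alpha * acc; only the tile shape
// and the set of pCRow pointers differ between the variants.
/* Illustrative C model of that update (a simplified sketch with generic
   names, not a drop-in replacement for the macros; C is taken as column-major
   with leading dimension ldc):

   static void save_tile(double *c, long ldc, double alpha,
                         const double *acc, int mr, int nr)
   {
       for (int j = 0; j < nr; j++)          // one pCRow pointer per column
           for (int i = 0; i < mr; i++)      // lanes within that column
               c[i + j * ldc] += alpha * acc[i + j * mr];
   }
*/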
dgemm_kernel_L1_M2_END: dgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dgemm_kernel_L1_END dgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dgemm_kernel_L1_M1_40 dgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M1_22 dgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble dgemm_kernel_L1_M1_100 prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE] dgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt dgemm_kernel_L1_M1_42 dgemm_kernel_L1_M1_100: SAVE1x1 dgemm_kernel_L1_END: dgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/dgemm_ncopy_4.S000066400000000000000000000153101313527062700202250ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define ASSEMBLER #include "common.h" #define M x0 #define N x1 #define A00 x2 #define LDA x3 #define B00 x4 #define A01 x5 #define A02 x6 #define A03 x7 #define A04 x8 #define I x9 #define J x10 #define TEMP1 x11 #define TEMP2 x12 #define A_PREFETCH 2560 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro SAVE_REGS add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] .endm .macro RESTORE_REGS ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) .endm .macro COPY4x4 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] //prfm PLDL1KEEP, [A03, #A_PREFETCH] //prfm PLDL1KEEP, [A04, #A_PREFETCH] ldp q0, q1, [A01], #32 ins v8.d[0], v0.d[0] ins v10.d[0], v0.d[1] ins v12.d[0], v1.d[0] ins v14.d[0], v1.d[1] ldp q2, q3, [A02], #32 ins v8.d[1], v2.d[0] ins v10.d[1], v2.d[1] ins v12.d[1], v3.d[0] ins v14.d[1], v3.d[1] ldp q4, q5, [A03], #32 ins v9.d[0], v4.d[0] ins v11.d[0], v4.d[1] ins v13.d[0], v5.d[0] ins v15.d[0], v5.d[1] ldp q6, q7, [A04], #32 ins v9.d[1], v6.d[0] ins v11.d[1], v6.d[1] ins v13.d[1], v7.d[0] ins v15.d[1], v7.d[1] st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B00] add B00, B00, #64 st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [B00] add B00, B00, #64 .endm .macro COPY1x4 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] //prfm PLDL1KEEP, [A03, #A_PREFETCH] //prfm PLDL1KEEP, [A04, #A_PREFETCH] ldr d0, [A01], #8 ldr d1, [A02], #8 ldr d2, [A03], #8 ldr d3, [A04], #8 st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B00] add B00, B00, #32 .endm .macro COPY4x2 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] ldp q0, q1, [A01], #32 ins v8.d[0], v0.d[0] ins v9.d[0], v0.d[1] ins v10.d[0], v1.d[0] ins v11.d[0], v1.d[1] ldp q2, q3, [A02], #32 ins v8.d[1], v2.d[0] ins v9.d[1], v2.d[1] ins v10.d[1], v3.d[0] ins v11.d[1], v3.d[1] st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B00] add B00, B00, #64 .endm .macro COPY1x2 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] ldr d0, [A01], #8 ldr d1, [A02], #8 stp d0, d1, [B00] add B00, B00, #16 .endm .macro COPY4x1 //prfm PLDL1KEEP, [A01, #A_PREFETCH] ldp q0, q1, [A01], #32 stp q0, q1, [B00], #32 .endm .macro COPY1x1 //prfm PLDL1KEEP, [A01, #A_PREFETCH] ldr d0, [A01], #8 str d0, [B00], #8 .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 SAVE_REGS lsl LDA, LDA, #3 // LDA = LDA * SIZE dgemm_ncopy_L4_BEGIN: asr J, N, #2 // J = N / 4 cmp J, #0 ble dgemm_ncopy_L2_BEGIN .align 5 dgemm_ncopy_L4_M4_BEGIN: mov A01, A00 add A02, A01, LDA add A03, A02, LDA add A04, A03, LDA add A00, A04, LDA asr I, M, #2 // I = M / 4 cmp I, #0 ble 
dgemm_ncopy_L4_M4_40 .align 5 dgemm_ncopy_L4_M4_20: COPY4x4 subs I , I , #1 bne dgemm_ncopy_L4_M4_20 dgemm_ncopy_L4_M4_40: and I, M , #3 cmp I, #0 ble dgemm_ncopy_L4_M4_END .align 5 dgemm_ncopy_L4_M4_60: COPY1x4 subs I , I , #1 bne dgemm_ncopy_L4_M4_60 dgemm_ncopy_L4_M4_END: subs J , J, #1 // j-- bne dgemm_ncopy_L4_M4_BEGIN /*********************************************************************************************/ dgemm_ncopy_L2_BEGIN: tst N, #3 ble dgemm_ncopy_L999 tst N, #2 ble dgemm_ncopy_L1_BEGIN dgemm_ncopy_L2_M4_BEGIN: mov A01, A00 add A02, A01, LDA add A00, A02, LDA asr I, M, #2 // I = M / 4 cmp I, #0 ble dgemm_ncopy_L2_M4_40 .align 5 dgemm_ncopy_L2_M4_20: COPY4x2 subs I , I , #1 bne dgemm_ncopy_L2_M4_20 dgemm_ncopy_L2_M4_40: and I, M , #3 cmp I, #0 ble dgemm_ncopy_L2_M4_END .align 5 dgemm_ncopy_L2_M4_60: COPY1x2 subs I , I , #1 bne dgemm_ncopy_L2_M4_60 dgemm_ncopy_L2_M4_END: /*********************************************************************************************/ dgemm_ncopy_L1_BEGIN: tst N, #1 ble dgemm_ncopy_L999 dgemm_ncopy_L1_M4_BEGIN: mov A01, A00 asr I, M, #2 // I = M / 4 cmp I, #0 ble dgemm_ncopy_L1_M4_40 .align 5 dgemm_ncopy_L1_M4_20: COPY4x1 subs I , I , #1 bne dgemm_ncopy_L1_M4_20 dgemm_ncopy_L1_M4_40: and I, M , #3 cmp I, #0 ble dgemm_ncopy_L1_M4_END .align 5 dgemm_ncopy_L1_M4_60: COPY1x1 subs I , I , #1 bne dgemm_ncopy_L1_M4_60 dgemm_ncopy_L1_M4_END: dgemm_ncopy_L999: mov x0, #0 RESTORE_REGS ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/dgemm_ncopy_8.S000066400000000000000000000254731313527062700202440ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define ASSEMBLER #include "common.h" #define M x0 #define N x1 #define A00 x2 #define LDA x3 #define B00 x4 #define A01 x5 #define A02 x6 #define A03 x7 #define A04 x8 #define A05 x9 #define A06 x10 #define A07 x11 #define A08 x12 #define I x13 #define J x14 #define TEMP1 x15 #define TEMP2 x16 #define A_PREFETCH 2560 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro SAVE_REGS add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] .endm .macro RESTORE_REGS ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) .endm /*************************************************************************************/ .macro COPY8x8 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] //prfm PLDL1KEEP, [A03, #A_PREFETCH] //prfm PLDL1KEEP, [A04, #A_PREFETCH] //prfm PLDL1KEEP, [A05, #A_PREFETCH] //prfm PLDL1KEEP, [A06, #A_PREFETCH] //prfm PLDL1KEEP, [A07, #A_PREFETCH] //prfm PLDL1KEEP, [A08, #A_PREFETCH] COPY4x8 COPY4x8 .endm .macro COPY4x8 ldp q0, q1, [A01], #32 ins v16.d[0], v0.d[0] ins v20.d[0], v0.d[1] ins v24.d[0], v1.d[0] ins v28.d[0], v1.d[1] ldp q2, q3, [A02], #32 ins v16.d[1], v2.d[0] ins v20.d[1], v2.d[1] ins v24.d[1], v3.d[0] ins v28.d[1], v3.d[1] ldp q4, q5, [A03], #32 ins v17.d[0], v4.d[0] ins v21.d[0], v4.d[1] ins v25.d[0], v5.d[0] ins v29.d[0], v5.d[1] ldp q6, q7, [A04], #32 ins v17.d[1], v6.d[0] ins v21.d[1], v6.d[1] ins v25.d[1], v7.d[0] ins v29.d[1], v7.d[1] ldp q8, q9, [A05], #32 ins v18.d[0], v8.d[0] ins v22.d[0], v8.d[1] ins v26.d[0], v9.d[0] ins v30.d[0], v9.d[1] ldp q10, q11, [A06], #32 ins v18.d[1], v10.d[0] ins v22.d[1], v10.d[1] ins v26.d[1], v11.d[0] ins v30.d[1], v11.d[1] ldp q12, q13, [A07], #32 ins v19.d[0], v12.d[0] ins v23.d[0], v12.d[1] ins v27.d[0], v13.d[0] ins v31.d[0], v13.d[1] ldp q14, q15, [A08], #32 ins v19.d[1], v14.d[0] ins v23.d[1], v14.d[1] ins v27.d[1], v15.d[0] ins v31.d[1], v15.d[1] st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [B00] add B00, B00, #64 st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [B00] add B00, B00, #64 st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [B00] add B00, B00, #64 st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [B00] add B00, B00, #64 .endm .macro COPY1x8 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] //prfm PLDL1KEEP, [A03, #A_PREFETCH] //prfm PLDL1KEEP, [A04, #A_PREFETCH] //prfm PLDL1KEEP, [A05, #A_PREFETCH] //prfm PLDL1KEEP, [A06, #A_PREFETCH] //prfm PLDL1KEEP, [A07, #A_PREFETCH] //prfm PLDL1KEEP, [A08, #A_PREFETCH] ldr d0, [A01], #8 ldr d1, [A02], #8 ldr d2, [A03], #8 ldr d3, [A04], #8 ldr d4, [A05], #8 ldr d5, [A06], #8 ldr d6, [A07], #8 ldr d7, [A08], #8 st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B00] add B00, B00, #32 st1 {v4.1d, v5.1d, v6.1d, v7.1d}, [B00] add B00, B00, #32 .endm 
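// COPY8x8, COPY4x8 and COPY1x8 above are the 8-row, 4-row and 1-row steps of
// the same packing pattern: for an 8-wide panel of A (the eight LDA-strided
// pointers A01..A08), the eight panel elements of each contiguous position
// are written out back to back, position after position.
/* Illustrative C model of one such panel copy (a simplified sketch with
   generic names; a is taken as column-major with leading dimension lda and
   b is the packed destination):

   static void ncopy_panel8(long m, const double *a, long lda, double *b)
   {
       for (long i = 0; i < m; i++)        // contiguous index (the routine's M)
           for (int jj = 0; jj < 8; jj++)  // the 8 lda-strided panel positions
               *b++ = a[i + jj * lda];
   }
*/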
/*************************************************************************************/ .macro COPY8x4 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] //prfm PLDL1KEEP, [A03, #A_PREFETCH] //prfm PLDL1KEEP, [A04, #A_PREFETCH] ldp q0, q1, [A01], #32 ins v8.d[0], v0.d[0] ins v10.d[0], v0.d[1] ins v12.d[0], v1.d[0] ins v14.d[0], v1.d[1] ldp q2, q3, [A02], #32 ins v8.d[1], v2.d[0] ins v10.d[1], v2.d[1] ins v12.d[1], v3.d[0] ins v14.d[1], v3.d[1] ldp q4, q5, [A03], #32 ins v9.d[0], v4.d[0] ins v11.d[0], v4.d[1] ins v13.d[0], v5.d[0] ins v15.d[0], v5.d[1] ldp q6, q7, [A04], #32 ins v9.d[1], v6.d[0] ins v11.d[1], v6.d[1] ins v13.d[1], v7.d[0] ins v15.d[1], v7.d[1] st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B00] add B00, B00, #64 st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [B00] add B00, B00, #64 ldp q16, q17, [A01], #32 ins v24.d[0], v16.d[0] ins v26.d[0], v16.d[1] ins v28.d[0], v17.d[0] ins v30.d[0], v17.d[1] ldp q18, q19, [A02], #32 ins v24.d[1], v18.d[0] ins v26.d[1], v18.d[1] ins v28.d[1], v19.d[0] ins v30.d[1], v19.d[1] ldp q20, q21, [A03], #32 ins v25.d[0], v20.d[0] ins v27.d[0], v20.d[1] ins v29.d[0], v21.d[0] ins v31.d[0], v21.d[1] ldp q22, q23, [A04], #32 ins v25.d[1], v22.d[0] ins v27.d[1], v22.d[1] ins v29.d[1], v23.d[0] ins v31.d[1], v23.d[1] st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [B00] add B00, B00, #64 st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [B00] add B00, B00, #64 .endm .macro COPY1x4 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] //prfm PLDL1KEEP, [A03, #A_PREFETCH] //prfm PLDL1KEEP, [A04, #A_PREFETCH] ldr d0, [A01], #8 ldr d1, [A02], #8 ldr d2, [A03], #8 ldr d3, [A04], #8 st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B00] add B00, B00, #32 .endm /*************************************************************************************/ .macro COPY8x2 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] ldp q0, q1, [A01], #32 ldp q2, q3, [A01], #32 ins v8.d[0], v0.d[0] ins v9.d[0], v0.d[1] ins v10.d[0], v1.d[0] ins v11.d[0], v1.d[1] ins v12.d[0], v2.d[0] ins v13.d[0], v2.d[1] ins v14.d[0], v3.d[0] ins v15.d[0], v3.d[1] ldp q4, q5, [A02], #32 ldp q6, q7, [A02], #32 ins v8.d[1], v4.d[0] ins v9.d[1], v4.d[1] ins v10.d[1], v5.d[0] ins v11.d[1], v5.d[1] ins v12.d[1], v6.d[0] ins v13.d[1], v6.d[1] ins v14.d[1], v7.d[0] ins v15.d[1], v7.d[1] st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B00] add B00, B00, #64 st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [B00] add B00, B00, #64 .endm .macro COPY1x2 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] ldr d0, [A01], #8 ldr d1, [A02], #8 stp d0, d1, [B00] add B00, B00, #16 .endm /*************************************************************************************/ .macro COPY8x1 //prfm PLDL1KEEP, [A01, #A_PREFETCH] ldp q0, q1, [A01], #32 ldp q2, q3, [A01], #32 stp q0, q1, [B00], #32 stp q2, q3, [B00], #32 .endm .macro COPY1x1 //prfm PLDL1KEEP, [A01, #A_PREFETCH] ldr d0, [A01], #8 str d0, [B00], #8 .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 SAVE_REGS lsl LDA, LDA, #3 // LDA = LDA * SIZE dgemm_ncopy_L8_BEGIN: asr J, N, #3 // J = N / 8 cmp J, #0 ble dgemm_ncopy_L4_BEGIN dgemm_ncopy_L8_M8_BEGIN: mov A01, A00 add A02, A01, LDA add A03, A02, LDA add A04, A03, LDA add A05, A04, LDA add A06, A05, LDA add A07, A06, LDA add A08, A07, LDA add A00, A08, LDA asr I, M, #3 // I = M / 8 cmp I, #0 ble dgemm_ncopy_L8_M8_40 
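// The loop below packs eight contiguous positions per COPY8x8 iteration
// (I = M / 8 passes); the M mod 8 leftovers are then copied one position at
// a time with COPY1x8 starting at dgemm_ncopy_L8_M8_60.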
dgemm_ncopy_L8_M8_20: COPY8x8 subs I , I , #1 bne dgemm_ncopy_L8_M8_20 dgemm_ncopy_L8_M8_40: and I, M , #7 cmp I, #0 ble dgemm_ncopy_L8_M8_END dgemm_ncopy_L8_M8_60: COPY1x8 subs I , I , #1 bne dgemm_ncopy_L8_M8_60 dgemm_ncopy_L8_M8_END: subs J , J, #1 // j-- bne dgemm_ncopy_L8_M8_BEGIN /*********************************************************************************************/ dgemm_ncopy_L4_BEGIN: tst N, #7 ble dgemm_ncopy_L999 tst N, #4 ble dgemm_ncopy_L2_BEGIN dgemm_ncopy_L4_M8_BEGIN: mov A01, A00 add A02, A01, LDA add A03, A02, LDA add A04, A03, LDA add A00, A04, LDA asr I, M, #3 // I = M / 8 cmp I, #0 ble dgemm_ncopy_L4_M8_40 dgemm_ncopy_L4_M8_20: COPY8x4 subs I , I , #1 bne dgemm_ncopy_L4_M8_20 dgemm_ncopy_L4_M8_40: and I, M , #7 cmp I, #0 ble dgemm_ncopy_L4_M8_END dgemm_ncopy_L4_M8_60: COPY1x4 subs I , I , #1 bne dgemm_ncopy_L4_M8_60 dgemm_ncopy_L4_M8_END: /*********************************************************************************************/ dgemm_ncopy_L2_BEGIN: tst N, #3 ble dgemm_ncopy_L999 tst N, #2 ble dgemm_ncopy_L1_BEGIN dgemm_ncopy_L2_M8_BEGIN: mov A01, A00 add A02, A01, LDA add A00, A02, LDA asr I, M, #3 // I = M / 8 cmp I, #0 ble dgemm_ncopy_L2_M8_40 dgemm_ncopy_L2_M8_20: COPY8x2 subs I , I , #1 bne dgemm_ncopy_L2_M8_20 dgemm_ncopy_L2_M8_40: and I, M , #7 cmp I, #0 ble dgemm_ncopy_L2_M8_END dgemm_ncopy_L2_M8_60: COPY1x2 subs I , I , #1 bne dgemm_ncopy_L2_M8_60 dgemm_ncopy_L2_M8_END: /*********************************************************************************************/ dgemm_ncopy_L1_BEGIN: tst N, #1 ble dgemm_ncopy_L999 dgemm_ncopy_L1_M8_BEGIN: mov A01, A00 asr I, M, #3 // I = M / 8 cmp I, #0 ble dgemm_ncopy_L1_M8_40 dgemm_ncopy_L1_M8_20: COPY8x1 subs I , I , #1 bne dgemm_ncopy_L1_M8_20 dgemm_ncopy_L1_M8_40: and I, M , #7 cmp I, #0 ble dgemm_ncopy_L1_M8_END dgemm_ncopy_L1_M8_60: COPY1x1 subs I , I , #1 bne dgemm_ncopy_L1_M8_60 dgemm_ncopy_L1_M8_END: dgemm_ncopy_L999: mov x0, #0 RESTORE_REGS ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/dgemm_tcopy_4.S000066400000000000000000000175171313527062700202460ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define ASSEMBLER #include "common.h" #define M x0 #define N x1 #define A x2 #define LDA x3 #define B x4 #define M4 x5 #define A01 x6 #define A02 x7 #define A03 x8 #define A04 x9 #define B01 x10 #define B02 x11 #define B03 x12 #define B04 x13 #define I x14 #define J x15 #define TEMP1 x16 #define TEMP2 x17 #define A_PREFETCH 2560 #define B_PREFETCH 256 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro SAVE_REGS add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] .endm .macro RESTORE_REGS ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) .endm .macro COPY4x4 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] //prfm PLDL1KEEP, [A03, #A_PREFETCH] //prfm PLDL1KEEP, [A04, #A_PREFETCH] ldp q0, q1, [A01], #32 ldp q2, q3, [A02], #32 ////prfm PLDL1KEEP, [B01, #B_PREFETCH] st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01] add TEMP1, B01, #64 ldp q4, q5, [A03], #32 ldp q6, q7, [A04], #32 ////prfm PLDL1KEEP, [B01, #B_PREFETCH] st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1] add B01, B01, M4 .endm .macro COPY2x4 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] //prfm PLDL1KEEP, [A03, #A_PREFETCH] //prfm PLDL1KEEP, [A04, #A_PREFETCH] ldr q0, [A01], #16 ldr q1, [A02], #16 ldr q2, [A03], #16 ldr q3, [A04], #16 ////prfm PLDL1KEEP, [B02, #B_PREFETCH] st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B02] add B02, B02, #64 .endm .macro COPY1x4 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] //prfm PLDL1KEEP, [A03, #A_PREFETCH] //prfm PLDL1KEEP, [A04, #A_PREFETCH] ldr d0, [A01], #8 ldr d1, [A02], #8 ldr d2, [A03], #8 ldr d3, [A04], #8 ////prfm PLDL1KEEP, [B03, #B_PREFETCH] st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B03] add B03, B03, #32 .endm /*************************************************************************************************************************/ .macro COPY4x2 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] ldp q0, q1, [A01], #32 ldp q2, q3, [A02], #32 ////prfm PLDL1KEEP, [B01, #B_PREFETCH] st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01] add B01, B01, M4 .endm .macro COPY2x2 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] ldr q0, [A01], #16 ldr q1, [A02], #16 ////prfm PLDL1KEEP, [B02, #B_PREFETCH] stp 
q0, q1, [B02] add B02, B02, #32 .endm .macro COPY1x2 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] ldr d0, [A01], #8 ldr d1, [A02], #8 ////prfm PLDL1KEEP, [B03, #B_PREFETCH] stp d0, d1, [B03] add B03, B03, #16 .endm /*************************************************************************************************************************/ .macro COPY4x1 //prfm PLDL1KEEP, [A01, #A_PREFETCH] ldp q0, q1, [A01], #32 ////prfm PLDL1KEEP, [B01, #B_PREFETCH] stp q0, q1, [B01] add B01, B01, M4 .endm .macro COPY2x1 //prfm PLDL1KEEP, [A01, #A_PREFETCH] ldr q0, [A01], #16 ////prfm PLDL1KEEP, [B02, #B_PREFETCH] str q0, [B02] add B02, B02, #16 .endm .macro COPY1x1 //prfm PLDL1KEEP, [A01, #A_PREFETCH] ldr d0, [A01], #8 ////prfm PLDL1KEEP, [B03, #B_PREFETCH] str d0, [B03] add B03, B03, #8 .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 SAVE_REGS lsl LDA, LDA, #3 // LDA = LDA * SIZE lsl TEMP1, M, #3 // x12 = M * SIZE and B02 , N , #-4 and B03 , N , #-2 mul B02, B02, TEMP1 mul B03, B03, TEMP1 add B02 , B02, B add B03 , B03, B lsl M4, M, #5 // M4 = M * 4 * SIZE dgemm_tcopy_L4_BEGIN: asr J, M, #2 // J = M / 4 cmp J, #0 ble dgemm_tcopy_L2_BEGIN .align 5 dgemm_tcopy_L4_M4_BEGIN: mov A01, A add A02, A01, LDA add A03, A02, LDA add A04, A03, LDA add A, A04, LDA mov B01, B add B, B01, #128 // B = B + 16 * SIZE asr I, N, #2 // I = N / 4 cmp I, #0 ble dgemm_tcopy_L4_M4_40 .align 5 dgemm_tcopy_L4_M4_20: COPY4x4 subs I , I , #1 bne dgemm_tcopy_L4_M4_20 dgemm_tcopy_L4_M4_40: tst N , #2 ble dgemm_tcopy_L4_M4_60 COPY2x4 dgemm_tcopy_L4_M4_60: tst N, #1 ble dgemm_tcopy_L4_M4_END COPY1x4 dgemm_tcopy_L4_M4_END: subs J , J, #1 // j-- bne dgemm_tcopy_L4_M4_BEGIN /*********************************************************************************************/ dgemm_tcopy_L2_BEGIN: tst M, #3 ble dgemm_tcopy_L999 tst M, #2 ble dgemm_tcopy_L1_BEGIN dgemm_tcopy_L2_M4_BEGIN: mov A01, A add A02, A01, LDA add A, A02, LDA mov B01, B add B, B01, #64 // B = B + 8 * SIZE asr I, N, #2 // I = N / 4 cmp I, #0 ble dgemm_tcopy_L2_M4_40 .align 5 dgemm_tcopy_L2_M4_20: COPY4x2 subs I , I , #1 bne dgemm_tcopy_L2_M4_20 dgemm_tcopy_L2_M4_40: tst N , #2 ble dgemm_tcopy_L2_M4_60 COPY2x2 dgemm_tcopy_L2_M4_60: tst N , #1 ble dgemm_tcopy_L2_M4_END COPY1x2 dgemm_tcopy_L2_M4_END: /*********************************************************************************************/ dgemm_tcopy_L1_BEGIN: tst M, #1 ble dgemm_tcopy_L999 dgemm_tcopy_L1_M4_BEGIN: mov A01, A // A01 = A mov B01, B asr I, N, #2 // I = M / 4 cmp I, #0 ble dgemm_tcopy_L1_M4_40 .align 5 dgemm_tcopy_L1_M4_20: COPY4x1 subs I , I , #1 bne dgemm_tcopy_L1_M4_20 dgemm_tcopy_L1_M4_40: tst N , #2 ble dgemm_tcopy_L1_M4_60 COPY2x1 dgemm_tcopy_L1_M4_60: tst N , #1 ble dgemm_tcopy_L1_M4_END COPY1x1 dgemm_tcopy_L1_M4_END: dgemm_tcopy_L999: mov x0, #0 // set return value RESTORE_REGS ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/dgemm_tcopy_8.S000066400000000000000000000323041313527062700202410ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define ASSEMBLER #include "common.h" #define M x0 #define N x1 #define A x2 #define LDA x3 #define B x4 #define M8 x5 #define A01 x6 #define A02 x7 #define A03 x8 #define A04 x9 #define A05 x10 #define A06 x11 #define A07 x12 #define A08 x13 #define B01 x14 #define B02 x15 #define B03 x16 #define B04 x17 #define I x18 #define J x19 #define TEMP1 x20 #define TEMP2 x21 #define A_PREFETCH 2560 #define B_PREFETCH 256 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro SAVE_REGS add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] .endm .macro RESTORE_REGS ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) .endm /*************************************************************************************************************************/ .macro COPY8x8 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] //prfm PLDL1KEEP, [A03, #A_PREFETCH] //prfm PLDL1KEEP, [A04, #A_PREFETCH] //prfm PLDL1KEEP, [A05, #A_PREFETCH] //prfm PLDL1KEEP, [A06, #A_PREFETCH] //prfm PLDL1KEEP, [A07, #A_PREFETCH] //prfm PLDL1KEEP, [A08, #A_PREFETCH] ldp q0, q1, [A01], #32 ldp q2, q3, [A01], #32 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01] add TEMP1, B01, #64 ldp q4, q5, [A02], #32 ldp q6, q7, [A02], #32 st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1] add TEMP1, TEMP1, #64 ldp q8, q9, [A03], #32 ldp q10, q11, [A03], #32 st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [TEMP1] add TEMP1, TEMP1, #64 ldp q12, q13, [A04], #32 ldp q14, q15, [A04], #32 st1 {v12.2d, v13.2d, 
v14.2d, v15.2d}, [TEMP1] add TEMP1, TEMP1, #64 ldp q16, q17, [A05], #32 ldp q18, q19, [A05], #32 st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [TEMP1] add TEMP1, TEMP1, #64 ldp q20, q21, [A06], #32 ldp q22, q23, [A06], #32 st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [TEMP1] add TEMP1, TEMP1, #64 ldp q24, q25, [A07], #32 ldp q26, q27, [A07], #32 st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [TEMP1] add TEMP1, TEMP1, #64 ldp q28, q29, [A08], #32 ldp q30, q31, [A08], #32 st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [TEMP1] add TEMP1, TEMP1, #64 add B01, B01, M8 .endm .macro COPY4x8 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] //prfm PLDL1KEEP, [A03, #A_PREFETCH] //prfm PLDL1KEEP, [A04, #A_PREFETCH] //prfm PLDL1KEEP, [A05, #A_PREFETCH] //prfm PLDL1KEEP, [A06, #A_PREFETCH] //prfm PLDL1KEEP, [A07, #A_PREFETCH] //prfm PLDL1KEEP, [A08, #A_PREFETCH] ldp q0, q1, [A01], #32 ldp q2, q3, [A02], #32 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B02] add B02, B02, #64 ldp q4, q5, [A03], #32 ldp q6, q7, [A04], #32 st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [B02] add B02, B02, #64 ldp q8, q9, [A05], #32 ldp q10, q11, [A06], #32 st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B02] add B02, B02, #64 ldp q12, q13, [A07], #32 ldp q14, q15, [A08], #32 st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [B02] add B02, B02, #64 .endm .macro COPY2x8 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] //prfm PLDL1KEEP, [A03, #A_PREFETCH] //prfm PLDL1KEEP, [A04, #A_PREFETCH] //prfm PLDL1KEEP, [A05, #A_PREFETCH] //prfm PLDL1KEEP, [A06, #A_PREFETCH] //prfm PLDL1KEEP, [A07, #A_PREFETCH] //prfm PLDL1KEEP, [A08, #A_PREFETCH] ldr q0, [A01], #16 ldr q1, [A02], #16 ldr q2, [A03], #16 ldr q3, [A04], #16 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B03] add B03, B03, #64 ldr q4, [A05], #16 ldr q5, [A06], #16 ldr q6, [A07], #16 ldr q7, [A08], #16 st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [B03] add B03, B03, #64 .endm .macro COPY1x8 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] //prfm PLDL1KEEP, [A03, #A_PREFETCH] //prfm PLDL1KEEP, [A04, #A_PREFETCH] //prfm PLDL1KEEP, [A05, #A_PREFETCH] //prfm PLDL1KEEP, [A06, #A_PREFETCH] //prfm PLDL1KEEP, [A07, #A_PREFETCH] //prfm PLDL1KEEP, [A08, #A_PREFETCH] ldr d0, [A01], #8 ldr d1, [A02], #8 ldr d2, [A03], #8 ldr d3, [A04], #8 st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B04] add B04, B04, #32 ldr d4, [A05], #8 ldr d5, [A06], #8 ldr d6, [A07], #8 ldr d7, [A08], #8 st1 {v4.1d, v5.1d, v6.1d, v7.1d}, [B04] add B04, B04, #32 .endm /*************************************************************************************************************************/ .macro COPY8x4 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] //prfm PLDL1KEEP, [A03, #A_PREFETCH] //prfm PLDL1KEEP, [A04, #A_PREFETCH] ldp q0, q1, [A01], #32 ldp q2, q3, [A01], #32 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01] add TEMP1, B01, #64 ldp q4, q5, [A02], #32 ldp q6, q7, [A02], #32 st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1] add TEMP1, TEMP1, #64 ldp q8, q9, [A03], #32 ldp q10, q11, [A03], #32 st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [TEMP1] add TEMP1, TEMP1, #64 ldp q12, q13, [A04], #32 ldp q14, q15, [A04], #32 st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [TEMP1] add TEMP1, TEMP1, #64 add B01, B01, M8 .endm .macro COPY4x4 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] //prfm PLDL1KEEP, [A03, #A_PREFETCH] //prfm PLDL1KEEP, [A04, #A_PREFETCH] ldp q0, q1, [A01], #32 ldp q2, q3, [A02], #32 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B02] add B02, B02, #64 ldp q4, q5, [A03], #32 ldp q6, q7, [A04], #32 st1 {v4.2d, v5.2d, v6.2d, v7.2d}, 
[B02] add B02, B02, #64 .endm .macro COPY2x4 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] //prfm PLDL1KEEP, [A03, #A_PREFETCH] //prfm PLDL1KEEP, [A04, #A_PREFETCH] ldr q0, [A01], #16 ldr q1, [A02], #16 ldr q2, [A03], #16 ldr q3, [A04], #16 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B03] add B03, B03, #64 .endm .macro COPY1x4 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] //prfm PLDL1KEEP, [A03, #A_PREFETCH] //prfm PLDL1KEEP, [A04, #A_PREFETCH] ldr d0, [A01], #8 ldr d1, [A02], #8 ldr d2, [A03], #8 ldr d3, [A04], #8 st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B04] add B04, B04, #32 .endm /*************************************************************************************************************************/ .macro COPY8x2 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] ldp q0, q1, [A01], #32 ldp q2, q3, [A01], #32 ldp q4, q5, [A02], #32 ldp q6, q7, [A02], #32 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01] add TEMP1, B01, #64 st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1] add B01, B01, M8 .endm .macro COPY4x2 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] ldp q0, q1, [A01], #32 ldp q2, q3, [A02], #32 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B02] add B02, B02, #64 .endm .macro COPY2x2 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] ldr q0, [A01], #16 ldr q1, [A02], #16 stp q0, q1, [B03] add B03, B03, #32 .endm .macro COPY1x2 //prfm PLDL1KEEP, [A01, #A_PREFETCH] //prfm PLDL1KEEP, [A02, #A_PREFETCH] ldr d0, [A01], #8 ldr d1, [A02], #8 stp d0, d1, [B04] add B04, B04, #16 .endm /*************************************************************************************************************************/ .macro COPY8x1 //prfm PLDL1KEEP, [A01, #A_PREFETCH] ldp q0, q1, [A01], #32 ldp q2, q3, [A01], #32 stp q0, q1, [B01] add TEMP1, B01, #32 stp q2, q3, [TEMP1] add B01, B01, M8 .endm .macro COPY4x1 //prfm PLDL1KEEP, [A01, #A_PREFETCH] ldp q0, q1, [A01], #32 stp q0, q1, [B02] add B02, B02, #32 .endm .macro COPY2x1 //prfm PLDL1KEEP, [A01, #A_PREFETCH] ldr q0, [A01], #16 str q0, [B03] add B03, B03, #16 .endm .macro COPY1x1 //prfm PLDL1KEEP, [A01, #A_PREFETCH] ldr d0, [A01], #8 str d0, [B04] add B04, B04, #8 .endm /************************************************************************************** * End of macro definitions **************************************************************************************/ PROLOGUE .align 5 SAVE_REGS lsl LDA, LDA, #3 // LDA = LDA * SIZE lsl TEMP1, M, #3 // TEMP1 = M * SIZE and B02 , N , #-8 and B03 , N , #-4 and B04 , N , #-2 mul B02, B02, TEMP1 mul B03, B03, TEMP1 mul B04, B04, TEMP1 add B02 , B02, B add B03 , B03, B add B04 , B04, B lsl M8, M, #6 // M8 = M * 8 * SIZE dgemm_tcopy_L8_BEGIN: asr J, M, #3 // J = M / 4 cmp J, #0 ble dgemm_tcopy_L4_BEGIN .align 5 dgemm_tcopy_L8_M8_BEGIN: mov A01, A add A02, A01, LDA add A03, A02, LDA add A04, A03, LDA add A05, A04, LDA add A06, A05, LDA add A07, A06, LDA add A08, A07, LDA add A, A08, LDA mov B01, B add B, B01, #512 // B = B + 64 * SIZE asr I, N, #3 // I = N / 8 cmp I, #0 ble dgemm_tcopy_L8_M8_40 .align 5 dgemm_tcopy_L8_M8_20: COPY8x8 subs I , I , #1 bne dgemm_tcopy_L8_M8_20 dgemm_tcopy_L8_M8_40: tst N , #4 ble dgemm_tcopy_L8_M8_60 COPY4x8 dgemm_tcopy_L8_M8_60: tst N , #2 ble dgemm_tcopy_L8_M8_80 COPY2x8 dgemm_tcopy_L8_M8_80: tst N, #1 ble dgemm_tcopy_L8_M8_END COPY1x8 dgemm_tcopy_L8_M8_END: subs J , J, #1 // j-- bne dgemm_tcopy_L8_M8_BEGIN 
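// The dgemm_tcopy_L8_* loop ending here runs J = M / 8 times, stepping eight
// LDA-strided positions of A per pass; within a pass, COPY8x8 packs N / 8
// full 8x8 blocks and COPY4x8 / COPY2x8 / COPY1x8 pick up the N mod 8 tail
// into the B02 / B03 / B04 regions set up in the prologue.
/* Illustrative C model of the full-block part of this copy (a simplified
   sketch with generic names; i walks the LDA-strided dimension, i.e. the
   routine's M, and j the contiguous dimension, i.e. the routine's N):

   static void tcopy_blocks8x8(long m, long n, const double *a, long lda,
                               double *b)
   {
       for (long j = 0; j + 8 <= n; j += 8)   // full 8-element blocks only
           for (long i = 0; i < m; i++)
               for (int jj = 0; jj < 8; jj++)
                   b[j * m + i * 8 + jj] = a[i * lda + j + jj];
   }
*/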
/*********************************************************************************************/ dgemm_tcopy_L4_BEGIN: tst M, #7 ble dgemm_tcopy_L999 tst M, #4 ble dgemm_tcopy_L2_BEGIN dgemm_tcopy_L4_M8_BEGIN: mov A01, A add A02, A01, LDA add A03, A02, LDA add A04, A03, LDA add A, A04, LDA mov B01, B add B, B01, #256 // B = B + 32 * SIZE asr I, N, #3 // I = N / 8 cmp I, #0 ble dgemm_tcopy_L4_M8_40 .align 5 dgemm_tcopy_L4_M8_20: COPY8x4 subs I , I , #1 bne dgemm_tcopy_L4_M8_20 dgemm_tcopy_L4_M8_40: tst N , #4 ble dgemm_tcopy_L4_M8_60 COPY4x4 dgemm_tcopy_L4_M8_60: tst N , #2 ble dgemm_tcopy_L4_M8_80 COPY2x4 dgemm_tcopy_L4_M8_80: tst N, #1 ble dgemm_tcopy_L4_M8_END COPY1x4 dgemm_tcopy_L4_M8_END: /*********************************************************************************************/ dgemm_tcopy_L2_BEGIN: tst M, #3 ble dgemm_tcopy_L999 tst M, #2 ble dgemm_tcopy_L1_BEGIN dgemm_tcopy_L2_M8_BEGIN: mov A01, A add A02, A01, LDA add A, A02, LDA mov B01, B add B, B01, #128 // B = B + 16 * SIZE asr I, N, #3 // I = N / 8 cmp I, #0 ble dgemm_tcopy_L2_M8_40 .align 5 dgemm_tcopy_L2_M8_20: COPY8x2 subs I , I , #1 bne dgemm_tcopy_L2_M8_20 dgemm_tcopy_L2_M8_40: tst N , #4 ble dgemm_tcopy_L2_M8_60 COPY4x2 dgemm_tcopy_L2_M8_60: tst N , #2 ble dgemm_tcopy_L2_M8_80 COPY2x2 dgemm_tcopy_L2_M8_80: tst N , #1 ble dgemm_tcopy_L2_M8_END COPY1x2 dgemm_tcopy_L2_M8_END: /*********************************************************************************************/ dgemm_tcopy_L1_BEGIN: tst M, #1 ble dgemm_tcopy_L999 dgemm_tcopy_L1_M8_BEGIN: mov A01, A // A01 = A mov B01, B asr I, N, #3 // I = M / 8 cmp I, #0 ble dgemm_tcopy_L1_M8_40 .align 5 dgemm_tcopy_L1_M8_20: COPY8x1 subs I , I , #1 bne dgemm_tcopy_L1_M8_20 dgemm_tcopy_L1_M8_40: tst N , #4 ble dgemm_tcopy_L1_M8_60 COPY4x1 dgemm_tcopy_L1_M8_60: tst N , #2 ble dgemm_tcopy_L1_M8_80 COPY2x1 dgemm_tcopy_L1_M8_80: tst N , #1 ble dgemm_tcopy_L1_M8_END COPY1x1 dgemm_tcopy_L1_M8_END: dgemm_tcopy_L999: mov x0, #0 // set return value RESTORE_REGS ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/dot.S000066400000000000000000000114441313527062700162730ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" #define N x0 /* vector length */ #define X x1 /* X vector address */ #define INC_X x2 /* X stride */ #define Y x3 /* Y vector address */ #define INC_Y x4 /* Y stride */ #define I x5 /* loop variable */ /******************************************************************************* * Macro definitions *******************************************************************************/ #if !defined(DOUBLE) #if !defined(DSDOT) #define REG0 wzr #define DOTF s0 #else // DSDOT #define REG0 xzr #define DOTF d0 #endif #define DOTI s1 #define TMPX s2 #define LD1VX {v2.s}[0] #define TMPY s3 #define LD1VY {v3.s}[0] #define TMPVY v3.s[0] #define SZ 4 #else #define REG0 xzr #define DOTF d0 #define DOTI d1 #define TMPX d2 #define LD1VX {v2.d}[0] #define TMPY d3 #define LD1VY {v3.d}[0] #define TMPVY v3.d[0] #define SZ 8 #endif /******************************************************************************/ .macro KERNEL_F1 ldr TMPX, [X], #SZ ldr TMPY, [Y], #SZ #if !defined(DSDOT) fmadd DOTF, TMPX, TMPY, DOTF #else // DSDOT fmul TMPX, TMPX, TMPY fcvt d2, TMPX fadd DOTF, DOTF, d2 #endif .endm .macro KERNEL_F4 #if !defined(DOUBLE) ld1 {v2.4s}, [X], #16 ld1 {v3.4s}, [Y], #16 #if !defined(DSDOT) fmla v0.4s, v2.4s, v3.4s #else fmul v2.4s, v2.4s, v3.4s ext v3.16b, v2.16b, v2.16b, #8 fcvtl v2.2d, v2.2s fcvtl v3.2d, v3.2s fadd v0.2d, v0.2d, v2.2d fadd v0.2d, v0.2d, v3.2d #endif #else //DOUBLE ld1 {v2.2d, v3.2d}, [X], #32 ld1 {v4.2d, v5.2d}, [Y], #32 fmul v2.2d, v2.2d, v4.2d fmul v3.2d, v3.2d, v5.2d fadd v0.2d, v0.2d, v2.2d fadd v0.2d, v0.2d, v3.2d #endif PRFM PLDL1KEEP, [X, #1024] PRFM PLDL1KEEP, [Y, #1024] .endm .macro KERNEL_F4_FINALIZE #if !defined(DOUBLE) #if !defined(DSDOT) ext v1.16b, v0.16b, v0.16b, #8 fadd v0.2s, v0.2s, v1.2s faddp DOTF, v0.2s #else faddp DOTF, v0.2d #endif #else //DOUBLE faddp DOTF, v0.2d #endif .endm .macro INIT_S #if !defined(DOUBLE) lsl INC_X, INC_X, #2 lsl INC_Y, INC_Y, #2 #else lsl INC_X, INC_X, #3 lsl INC_Y, INC_Y, #3 #endif .endm .macro KERNEL_S1 ld1 LD1VX, [X], INC_X ld1 LD1VY, [Y], INC_Y #if !defined(DSDOT) fmadd DOTF, TMPX, TMPY, DOTF #else // DSDOT fmul TMPX, TMPX, TMPY fcvt d2, TMPX fadd DOTF, DOTF, d2 #endif .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE fmov DOTF, REG0 #if defined(DOUBLE) fmov d6, DOTF #endif cmp N, xzr ble dot_kernel_L999 cmp INC_X, #1 bne dot_kernel_S_BEGIN cmp INC_Y, #1 bne dot_kernel_S_BEGIN dot_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr beq dot_kernel_F1 dot_kernel_F4: KERNEL_F4 subs I, I, #1 bne dot_kernel_F4 KERNEL_F4_FINALIZE dot_kernel_F1: ands I, N, #3 ble dot_kernel_L999 dot_kernel_F10: KERNEL_F1 subs I, I, #1 bne dot_kernel_F10 ret dot_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr ble dot_kernel_S1 dot_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne 
dot_kernel_S4 dot_kernel_S1: ands I, N, #3 ble dot_kernel_L999 dot_kernel_S10: KERNEL_S1 subs I, I, #1 bne dot_kernel_S10 dot_kernel_L999: ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/dot_thunderx.c000066400000000000000000000052471313527062700202400ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #if defined(DSDOT) double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #else FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #endif { BLASLONG i=0; BLASLONG ix=0,iy=0; #if defined(DSDOT) double dot = 0.0 ; #else FLOAT dot = 0.0 ; #endif if ( n < 0 ) return(dot); if ( (inc_x == 1) && (inc_y == 1) ) { int n1 = n & -4; while(i < n1) { #if defined(DSDOT) dot += (double) y[i] * (double) x[i] + (double) y[i+1] * (double) x[i+1] + (double) y[i+2] * (double) x[i+2] + (double) y[i+3] * (double) x[i+3] ; #else dot += y[i] * x[i] + y[i+1] * x[i+1] + y[i+2] * x[i+2] + y[i+3] * x[i+3] ; #endif i+=4 ; } while(i < n) { #if defined(DSDOT) dot += (double) y[i] * (double) x[i] ; #else dot += y[i] * x[i] ; #endif i++ ; } return(dot); } while(i < n) { #if defined(DSDOT) dot += (double) y[iy] * (double) x[ix] ; #else dot += y[iy] * x[ix] ; #endif ix += inc_x ; iy += inc_y ; i++ ; } return(dot); } OpenBLAS-0.2.20/kernel/arm64/dot_thunderx2t99.c000066400000000000000000000301031313527062700206550ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include #if !defined(DSDOT) #define RETURN_TYPE FLOAT #else #define RETURN_TYPE double #endif #define N "x0" /* vector length */ #define X "x1" /* "X" vector address */ #define INC_X "x2" /* "X" stride */ #define Y "x3" /* "Y" vector address */ #define INC_Y "x4" /* "Y" stride */ #define J "x5" /* loop variable */ #if !defined(DOUBLE) #if !defined(DSDOT) #define REG0 "wzr" #define DOTF "s0" #define TMPX "s16" #define TMPY "s24" #define INC_SHIFT "2" #define N_DIV_SHIFT "6" #define N_REM_MASK "63" #else #define REG0 "xzr" #define DOTF "d0" #define TMPX "s16" #define TMPX1 "d2" #define TMPY "s24" #define TMPY1 "d3" #define INC_SHIFT "2" #define N_DIV_SHIFT "4" #define N_REM_MASK "15" #endif #else #define REG0 "xzr" #define DOTF "d0" #define TMPX "d16" #define TMPY "d24" #define INC_SHIFT "3" #define N_DIV_SHIFT "5" #define N_REM_MASK "31" #endif #if !defined(DOUBLE) #if !defined(DSDOT) #define KERNEL_F1 \ " ldr "TMPX", ["X"] \n" \ " ldr "TMPY", ["Y"] \n" \ " add "X", "X", "INC_X" \n" \ " add "Y", "Y", "INC_Y" \n" \ " fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n" #define KERNEL_F \ " ldp q16, q17, ["X"] \n" \ " ldp q24, q25, ["Y"] \n" \ " ldp q18, q19, ["X", #32] \n" \ " ldp q26, q27, ["Y", #32] \n" \ " fmla v0.4s, v16.4s, v24.4s \n" \ " fmla v1.4s, v17.4s, v25.4s \n" \ " ldp q20, q21, ["X", #64] \n" \ " ldp q28, q29, ["Y", #64] \n" \ " fmla v2.4s, v18.4s, v26.4s \n" \ " fmla v3.4s, v19.4s, v27.4s \n" \ " ldp q22, q23, ["X", #96] \n" \ " ldp q30, q31, ["Y", #96] \n" \ " add "Y", "Y", #128 \n" \ " add "X", "X", #128 \n" \ " fmla v4.4s, v20.4s, v28.4s \n" \ " fmla v5.4s, v21.4s, v29.4s \n" \ " PRFM PLDL1KEEP, ["X", #896] \n" \ " PRFM PLDL1KEEP, ["Y", #896] \n" \ " PRFM PLDL1KEEP, ["X", #896+64] \n" \ " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ " fmla v6.4s, v22.4s, v30.4s \n" \ " fmla v7.4s, v23.4s, v31.4s \n" \ " ldp q16, q17, ["X"] \n" \ " ldp q24, q25, ["Y"] \n" \ " ldp q18, q19, ["X", #32] \n" \ " ldp q26, q27, ["Y", #32] \n" \ " fmla v0.4s, v16.4s, v24.4s \n" \ " fmla v1.4s, v17.4s, v25.4s \n" \ " ldp q20, q21, ["X", #64] \n" \ " ldp q28, q29, ["Y", #64] \n" \ " fmla v2.4s, v18.4s, v26.4s \n" \ " fmla v3.4s, v19.4s, v27.4s \n" \ " ldp q22, q23, ["X", #96] \n" \ " ldp q30, q31, ["Y", #96] \n" \ " add "Y", "Y", #128 \n" \ " add "X", "X", #128 \n" \ " fmla v4.4s, v20.4s, 
v28.4s \n" \ " fmla v5.4s, v21.4s, v29.4s \n" \ " PRFM PLDL1KEEP, ["X", #896] \n" \ " PRFM PLDL1KEEP, ["Y", #896] \n" \ " PRFM PLDL1KEEP, ["X", #896+64] \n" \ " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ " fmla v6.4s, v22.4s, v30.4s \n" \ " fmla v7.4s, v23.4s, v31.4s \n" #define KERNEL_F_FINALIZE \ " fadd v0.4s, v0.4s, v1.4s \n" \ " fadd v2.4s, v2.4s, v3.4s \n" \ " fadd v4.4s, v4.4s, v5.4s \n" \ " fadd v6.4s, v6.4s, v7.4s \n" \ " fadd v0.4s, v0.4s, v2.4s \n" \ " fadd v4.4s, v4.4s, v6.4s \n" \ " fadd v0.4s, v0.4s, v4.4s \n" \ " faddp v0.4s, v0.4s, v0.4s \n" \ " faddp v0.4s, v0.4s, v0.4s \n" #else /* !defined(DSDOT) */ #define KERNEL_F1 \ " ldr "TMPX", ["X"] \n" \ " ldr "TMPY", ["Y"] \n" \ " add "X", "X", "INC_X" \n" \ " add "Y", "Y", "INC_Y" \n" \ " fcvt "TMPX1", "TMPX" \n" \ " fcvt "TMPY1", "TMPY" \n" \ " fmul "TMPX1", "TMPX1", "TMPY1" \n" \ " fadd "DOTF", "DOTF", "TMPX1" \n" #define KERNEL_F \ " ldp q18, q19, ["X"] \n" \ " ldp q26, q27, ["Y"] \n" \ " fcvtl v16.2d, v18.2s \n" \ " fcvtl2 v17.2d, v18.4s \n" \ " fcvtl v18.2d, v19.2s \n" \ " fcvtl2 v19.2d, v19.4s \n" \ " fcvtl v24.2d, v26.2s \n" \ " fcvtl2 v25.2d, v26.4s \n" \ " fcvtl v26.2d, v27.2s \n" \ " fcvtl2 v27.2d, v27.4s \n" \ " ldp q22, q23, ["X", #32] \n" \ " ldp q30, q31, ["Y", #32] \n" \ " fcvtl v20.2d, v22.2s \n" \ " fcvtl2 v21.2d, v22.4s \n" \ " fcvtl v22.2d, v23.2s \n" \ " fcvtl2 v23.2d, v23.4s \n" \ " fcvtl v28.2d, v30.2s \n" \ " fcvtl2 v29.2d, v30.4s \n" \ " fcvtl v30.2d, v31.2s \n" \ " fcvtl2 v31.2d, v31.4s \n" \ " PRFM PLDL1KEEP, ["X", #896] \n" \ " PRFM PLDL1KEEP, ["Y", #896] \n" \ " PRFM PLDL1KEEP, ["X", #896+64] \n" \ " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ " fmla v0.2d, v16.2d, v24.2d \n" \ " fmla v1.2d, v17.2d, v25.2d \n" \ " fmla v2.2d, v18.2d, v26.2d \n" \ " fmla v3.2d, v19.2d, v27.2d \n" \ " add "Y", "Y", #64 \n" \ " add "X", "X", #64 \n" \ " fmla v4.2d, v20.2d, v28.2d \n" \ " fmla v5.2d, v21.2d, v29.2d \n" \ " fmla v6.2d, v22.2d, v30.2d \n" \ " fmla v7.2d, v23.2d, v31.2d \n" #define KERNEL_F_FINALIZE \ " fadd v0.2d, v0.2d, v1.2d \n" \ " fadd v2.2d, v2.2d, v3.2d \n" \ " fadd v4.2d, v4.2d, v5.2d \n" \ " fadd v6.2d, v6.2d, v7.2d \n" \ " fadd v0.2d, v0.2d, v2.2d \n" \ " fadd v4.2d, v4.2d, v6.2d \n" \ " fadd v0.2d, v0.2d, v4.2d \n" \ " faddp "DOTF", v0.2d \n" #endif /* !defined(DSDOT) */ #else /* !defined(DOUBLE) */ #define KERNEL_F1 \ " ldr "TMPX", ["X"] \n" \ " ldr "TMPY", ["Y"] \n" \ " add "X", "X", "INC_X" \n" \ " add "Y", "Y", "INC_Y" \n" \ " fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n" #define KERNEL_F \ " ldp q16, q17, ["X"] \n" \ " ldp q24, q25, ["Y"] \n" \ " ldp q18, q19, ["X", #32] \n" \ " ldp q26, q27, ["Y", #32] \n" \ " fmla v0.2d, v16.2d, v24.2d \n" \ " fmla v1.2d, v17.2d, v25.2d \n" \ " ldp q20, q21, ["X", #64] \n" \ " ldp q28, q29, ["Y", #64] \n" \ " fmla v2.2d, v18.2d, v26.2d \n" \ " fmla v3.2d, v19.2d, v27.2d \n" \ " ldp q22, q23, ["X", #96] \n" \ " ldp q30, q31, ["Y", #96] \n" \ " add "Y", "Y", #128 \n" \ " add "X", "X", #128 \n" \ " fmla v4.2d, v20.2d, v28.2d \n" \ " fmla v5.2d, v21.2d, v29.2d \n" \ " PRFM PLDL1KEEP, ["X", #896] \n" \ " PRFM PLDL1KEEP, ["Y", #896] \n" \ " PRFM PLDL1KEEP, ["X", #896+64] \n" \ " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ " fmla v6.2d, v22.2d, v30.2d \n" \ " fmla v7.2d, v23.2d, v31.2d \n" \ " ldp q16, q17, ["X"] \n" \ " ldp q24, q25, ["Y"] \n" \ " ldp q18, q19, ["X", #32] \n" \ " ldp q26, q27, ["Y", #32] \n" \ " fmla v0.2d, v16.2d, v24.2d \n" \ " fmla v1.2d, v17.2d, v25.2d \n" \ " ldp q20, q21, ["X", #64] \n" \ " ldp q28, q29, ["Y", #64] \n" \ " fmla v2.2d, v18.2d, v26.2d \n" \ " fmla 
v3.2d, v19.2d, v27.2d \n" \ " ldp q22, q23, ["X", #96] \n" \ " ldp q30, q31, ["Y", #96] \n" \ " add "Y", "Y", #128 \n" \ " add "X", "X", #128 \n" \ " fmla v4.2d, v20.2d, v28.2d \n" \ " fmla v5.2d, v21.2d, v29.2d \n" \ " PRFM PLDL1KEEP, ["X", #896] \n" \ " PRFM PLDL1KEEP, ["Y", #896] \n" \ " PRFM PLDL1KEEP, ["X", #896+64] \n" \ " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ " fmla v6.2d, v22.2d, v30.2d \n" \ " fmla v7.2d, v23.2d, v31.2d \n" #define KERNEL_F_FINALIZE \ " fadd v0.2d, v0.2d, v1.2d \n" \ " fadd v2.2d, v2.2d, v3.2d \n" \ " fadd v4.2d, v4.2d, v5.2d \n" \ " fadd v6.2d, v6.2d, v7.2d \n" \ " fadd v0.2d, v0.2d, v2.2d \n" \ " fadd v4.2d, v4.2d, v6.2d \n" \ " fadd v0.2d, v0.2d, v4.2d \n" \ " faddp "DOTF", v0.2d \n" #endif /* !defined(DOUBLE) */ #if defined(SMP) extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); #endif static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { RETURN_TYPE dot = 0.0 ; if ( n < 0 ) return dot; __asm__ __volatile__ ( " mov "N", %[N_] \n" " mov "X", %[X_] \n" " mov "INC_X", %[INCX_] \n" " mov "Y", %[Y_] \n" " mov "INC_Y", %[INCY_] \n" " fmov "DOTF", "REG0" \n" " fmov d1, xzr \n" " fmov d2, xzr \n" " fmov d3, xzr \n" " fmov d4, xzr \n" " fmov d5, xzr \n" " fmov d6, xzr \n" " fmov d7, xzr \n" " cmp "N", xzr \n" " ble .Ldot_kernel_L999 \n" " cmp "INC_X", #1 \n" " bne .Ldot_kernel_S_BEGIN \n" " cmp "INC_Y", #1 \n" " bne .Ldot_kernel_S_BEGIN \n" ".Ldot_kernel_F_BEGIN: \n" " lsl "INC_X", "INC_X", "INC_SHIFT" \n" " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" " asr "J", "N", #"N_DIV_SHIFT" \n" " cmp "J", xzr \n" " beq .Ldot_kernel_F1 \n" " .align 5 \n" ".Ldot_kernel_F: \n" " "KERNEL_F" \n" " subs "J", "J", #1 \n" " bne .Ldot_kernel_F \n" " "KERNEL_F_FINALIZE" \n" ".Ldot_kernel_F1: \n" " ands "J", "N", #"N_REM_MASK" \n" " ble .Ldot_kernel_L999 \n" ".Ldot_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" " bne .Ldot_kernel_F10 \n" " b .Ldot_kernel_L999 \n" ".Ldot_kernel_S_BEGIN: \n" " lsl "INC_X", "INC_X", "INC_SHIFT" \n" " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" " ble .Ldot_kernel_S1 \n" ".Ldot_kernel_S4: \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" " bne .Ldot_kernel_S4 \n" ".Ldot_kernel_S1: \n" " ands "J", "N", #3 \n" " ble .Ldot_kernel_L999 \n" ".Ldot_kernel_S10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" " bne .Ldot_kernel_S10 \n" ".Ldot_kernel_L999: \n" " str "DOTF", [%[DOT_]] \n" : : [DOT_] "r" (&dot), //%0 [N_] "r" (n), //%1 [X_] "r" (x), //%2 [INCX_] "r" (inc_x), //%3 [Y_] "r" (y), //%4 [INCY_] "r" (inc_y) //%5 : "cc", "memory", "x0", "x1", "x2", "x3", "x4", "x5", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" ); return dot; } #if defined(SMP) static int dot_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) { *(RETURN_TYPE *)result = dot_compute(n, x, inc_x, y, inc_y); return 0; } #endif RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { #if defined(SMP) int nthreads; FLOAT dummy_alpha; #endif RETURN_TYPE dot = 0.0; #if defined(SMP) nthreads = num_cpu_avail(1); if (inc_x == 0 || inc_y == 0) nthreads = 1; if (n <= 10000) nthreads = 1; if (nthreads == 1) { dot = dot_compute(n, x, inc_x, y, inc_y); } else { int mode, i; char 
result[MAX_CPU_NUMBER * sizeof(double) * 2]; RETURN_TYPE *ptr; #if !defined(DOUBLE) mode = BLAS_SINGLE | BLAS_REAL; #else mode = BLAS_DOUBLE | BLAS_REAL; #endif blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, y, inc_y, result, 0, ( void *)dot_thread_function, nthreads); ptr = (RETURN_TYPE *)result; for (i = 0; i < nthreads; i++) { dot = dot + (*ptr); ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2); } } #else dot = dot_compute(n, x, inc_x, y, inc_y); #endif return dot; } OpenBLAS-0.2.20/kernel/arm64/dtrmm_kernel_4x4.S000066400000000000000000000627761313527062700207050ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 X3 x4 x5 x6 x7*/ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define offset x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pA x15 #define temp x16 #define tempOffset x17 #define tempK x18 #define alpha0 d10 #define alphaV0 v10.d[0] #define alpha1 d11 #define alphaV1 v11.d[0] #define alpha2 d14 #define alphaV2 v14.d[0] #define alpha3 d15 #define alphaV3 v15.d[0] // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 offset // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pA // 16 temp // 17 tempOffset // 18 must save tempK // 19 must save // 20 must save // 21 must save // 22 must save // 23 must save // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp //v00 ALPHA -> pA00, pA01 //v01 pA02, pA03 //v02 //v03 //v04 pA10, pA11 //v05 pA12, pA13 //v06 //v07 //v08 must save pB00, pB01 //v09 must save pB02, pB03 //v10 must save ALPHA0 //v11 must save ALPHA1 //v12 must save pB10, pB11 //v13 must save pB12, pB13 //v14 must save ALPHA2 //v15 must save ALPHA3 //v16 must save C00, C01 //v17 must save C02, C03 //v18 //v19 //v20 C10, C11 //v21 C12, C13 //v22 //v23 //v24 C20, C21 //v25 C22, C23 //v26 //v27 //v28 C30, C31 //v29 C32, C33 //v30 //v31 /******************************************************************************* * Macro definitions *******************************************************************************/ .macro INIT4x4 fmov d16, xzr fmov d17, d16 fmov d20, d17 fmov d21, d16 fmov d24, d17 fmov d25, d16 fmov d28, d17 fmov d29, d16 .endm .macro KERNEL4x4_I ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 fmul v16.2d, v0.2d, v8.d[0] fmul v29.2d, v1.2d, v9.d[1] fmul v20.2d, v0.2d, v8.d[1] fmul v25.2d, v1.2d, v9.d[0] fmul v24.2d, v0.2d, v9.d[0] fmul v21.2d, v1.2d, v8.d[1] fmul v28.2d, v0.2d, v9.d[1] fmul v17.2d, v1.2d, v8.d[0] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 ld1 {v4.2d, v5.2d}, [pA] add pA, pA, #32 .endm .macro KERNEL4x4_M1 fmla v16.2d, v0.2d, v8.d[0] fmla v29.2d, v1.2d, v9.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 fmla v20.2d, v0.2d, v8.d[1] fmla v25.2d, v1.2d, v9.d[0] ld1 {v4.2d, v5.2d}, [pA] // For next round add pA, pA, #32 fmla v24.2d, v0.2d, v9.d[0] fmla v21.2d, v1.2d, v8.d[1] prfm PLDL1KEEP, [pA, #512] fmla v28.2d, v0.2d, v9.d[1] fmla v17.2d, v1.2d, v8.d[0] .endm .macro KERNEL4x4_M2 fmla v16.2d, v4.2d, v12.d[0] fmla v29.2d, v5.2d, v13.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 fmla v20.2d, v4.2d, v12.d[1] fmla v25.2d, v5.2d, v13.d[0] ld1 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 fmla v24.2d, v4.2d, v13.d[0] fmla v21.2d, v5.2d, v12.d[1] prfm PLDL1KEEP, [pB, #512] fmla v28.2d, v4.2d, v13.d[1] fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_E fmla v16.2d, v4.2d, v12.d[0] fmla v29.2d, v5.2d, v13.d[1] fmla v20.2d, v4.2d, v12.d[1] fmla v25.2d, v5.2d, v13.d[0] fmla v24.2d, v4.2d, v13.d[0] fmla v21.2d, v5.2d, v12.d[1] fmla v28.2d, v4.2d, v13.d[1] fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_SUB ld1 {v8.2d, 
v9.2d}, [pB] add pB, pB, #32 ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v29.2d, v1.2d, v9.d[1] fmla v20.2d, v0.2d, v8.d[1] fmla v25.2d, v1.2d, v9.d[0] fmla v24.2d, v0.2d, v9.d[0] fmla v21.2d, v1.2d, v8.d[1] fmla v28.2d, v0.2d, v9.d[1] fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 fmul v8.2d, v16.2d, alphaV0 fmul v9.2d, v17.2d, alphaV1 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC fmul v12.2d, v20.2d, alphaV2 fmul v13.2d, v21.2d, alphaV3 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow2, pCRow1, LDC fmul v8.2d, v24.2d, alphaV0 fmul v9.2d, v25.2d, alphaV1 st1 {v8.2d, v9.2d}, [pCRow2] add pCRow1, pCRow2, LDC fmul v12.2d, v28.2d, alphaV2 fmul v13.2d, v29.2d, alphaV3 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x4 fmov d16, xzr fmov d20, d16 fmov d24, d20 fmov d28, d16 .endm .macro KERNEL2x4_SUB ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld1 {v0.2d}, [pA] add pA, pA, #16 fmla v16.2d, v0.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] fmla v24.2d, v0.2d, v9.d[0] fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow1, pCRow0, LDC fmul v12.2d, v20.2d, alphaV1 st1 {v12.2d}, [pCRow1] add pCRow2, pCRow1, LDC fmul v8.2d, v24.2d, alphaV2 st1 {v8.2d}, [pCRow2] add pCRow1, pCRow2, LDC fmul v12.2d, v28.2d, alphaV3 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x4 fmov d16, xzr fmov d20, d16 .endm .macro KERNEL1x4_SUB ldr d0, [pA] add pA, pA, #8 ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 fmla v16.2d, v8.2d, v0.d[0] fmla v20.2d, v9.2d, v0.d[0] .endm .macro SAVE1x4 add pCRow1, pCRow0, LDC fmul v8.2d, v16.2d, alphaV0 st1 {v8.d}[0], [pCRow0] st1 {v8.d}[1], [pCRow1] add pCRow2, pCRow1, LDC add pCRow1, pCRow2, LDC fmul v12.2d, v20.2d, alphaV1 st1 {v12.d}[0], [pCRow2] st1 {v12.d}[1], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT4x2 fmov d16, xzr fmov d17, d16 fmov d20, d17 fmov d21, d16 .endm .macro KERNEL4x2_SUB ld1 {v8.2d}, [pB] add pB, pB, #16 ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 fmul v8.2d, v16.2d, alphaV0 fmul v9.2d, v17.2d, alphaV1 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC fmul v12.2d, v20.2d, alphaV2 fmul v13.2d, v21.2d, alphaV3 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x2 fmov d16, xzr fmov d20, d16 .endm .macro KERNEL2x2_SUB ld1 {v8.2d}, [pB] add pB, pB, #16 ld1 {v0.2d}, [pA] add pA, pA, #16 fmla v16.2d, v0.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow1 , pCRow0, LDC fmul v12.2d, v20.2d, alphaV1 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x2 fmov d16, xzr .endm .macro KERNEL1x2_SUB ld1 {v8.2d} , [pB] add pB , pB, #16 ldr d0 , [pA] add pA, pA, #8 fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 add pCRow1 , pCRow0, LDC fmul v8.2d, v16.2d, alphaV0 st1 {v8.d}[0], [pCRow0] st1 {v8.d}[1], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro 
INIT4x1 fmov d16, xzr fmov d17, d16 .endm .macro KERNEL4x1_SUB ldr d8, [pB] add pB , pB, #8 ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 fmul v8.2d, v16.2d, alphaV0 fmul v9.2d, v17.2d, alphaV1 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x1 fmov d16, xzr .endm .macro KERNEL2x1_SUB ldr d8, [pB] add pB , pB, #8 ld1 {v0.2d}, [pA] add pA , pA, #16 fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x1 fmov d16, xzr .endm .macro KERNEL1x1_SUB ldr d8, [pB] add pB , pB, #8 ldr d0, [pA] add pA , pA, #8 fmadd d16, d0, d8, d16 .endm .macro SAVE1x1 fmul d8, d16, alpha0 str d8, [pCRow0] add pCRow0, pCRow0, #8 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] fmov alpha0, d0 fmov alpha1, d0 fmov alpha2, d0 fmov alpha3, d0 lsl LDC, LDC, #3 // ldc = ldc * 8 #if !defined(LEFT) neg tempOffset, offset #endif mov pB, origPB mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble dtrmm_kernel_L2_BEGIN /******************************************************************************/ dtrmm_kernel_L4_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = start of A array dtrmm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble dtrmm_kernel_L4_M2_BEGIN dtrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pB, pB, temp add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
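// The pipelined loop below keeps two K iterations in flight: KERNEL4x4_I
// issues the first multiplies while preloading the next operands, the M1/M2
// pair alternates between the two operand sets, and KERNEL4x4_E drains the
// last preloaded set without issuing further loads.
// For reference, a minimal C sketch of what one 4x4 tile computes
// (hypothetical helper name; packed panels with 4 consecutive A rows and
// 4 consecutive B columns per K step assumed):
/*
   static void dtrmm_tile_4x4_sketch(long K, const double *pA, const double *pB,
                                     double *C, long ldc, double alpha)
   {
       double acc[4][4] = {{0.0}};            // v16/v17, v20/v21, v24/v25, v28/v29
       for (long k = 0; k < K; k++)           // one K step (KERNEL4x4_SUB/_M1/_M2 each do one)
           for (int j = 0; j < 4; j++)        // column taken from packed B
               for (int i = 0; i < 4; i++)    // row taken from packed A
                   acc[j][i] += pA[4*k + i] * pB[4*k + j];
       for (int j = 0; j < 4; j++)            // SAVE4x4: C is overwritten,
           for (int i = 0; i < 4; i++)        // scaled by alpha only (TRMM)
               C[j*ldc + i] = alpha * acc[j][i];
   }
*/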
blt dtrmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 ble dtrmm_kernel_L4_M4_22a .align 5 dtrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 bgt dtrmm_kernel_L4_M4_22 dtrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E b dtrmm_kernel_L4_M4_44 dtrmm_kernel_L4_M4_32: tst counterL, #1 ble dtrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E b dtrmm_kernel_L4_M4_44 dtrmm_kernel_L4_M4_40: INIT4x4 dtrmm_kernel_L4_M4_44: ands counterL , tempK, #1 ble dtrmm_kernel_L4_M4_100 dtrmm_kernel_L4_M4_46: KERNEL4x4_SUB dtrmm_kernel_L4_M4_100: SAVE4x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #5 add pA, pA, temp add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif dtrmm_kernel_L4_M4_END: subs counterI, counterI, #1 bne dtrmm_kernel_L4_M4_20 dtrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dtrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble dtrmm_kernel_L4_M1_BEGIN dtrmm_kernel_L4_M2_20: INIT2x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pA, pA, temp lsl temp, tempOffset, #5 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dtrmm_kernel_L4_M2_40 dtrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L4_M2_22 dtrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L4_M2_100 dtrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L4_M2_42 dtrmm_kernel_L4_M2_100: SAVE2x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #5 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif dtrmm_kernel_L4_M2_END: dtrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dtrmm_kernel_L4_END dtrmm_kernel_L4_M1_20: INIT1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pB, pB, temp lsl temp, tempOffset, #3 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dtrmm_kernel_L4_M1_40 dtrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L4_M1_22 dtrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L4_M1_100 dtrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L4_M1_42 dtrmm_kernel_L4_M1_100: SAVE1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #1 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #3 add pA, pA, temp lsl temp, tempK, #5 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #1 #endif dtrmm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 #if !defined(LEFT) add tempOffset, tempOffset, #4 #endif subs counterJ, counterJ , #1 // j-- bgt dtrmm_kernel_L4_BEGIN /******************************************************************************/ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble dtrmm_kernel_L999 // error, N was less than 4? tst counterJ , #2 ble dtrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC add pC,pC,LDC, lsl #1 #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = A dtrmm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 ble dtrmm_kernel_L2_M2_BEGIN dtrmm_kernel_L2_M4_20: INIT4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp lsl temp, tempOffset, #5 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble dtrmm_kernel_L2_M4_40 .align 5 dtrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L2_M4_22 dtrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L2_M4_100 dtrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L2_M4_42 dtrmm_kernel_L2_M4_100: SAVE4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #5 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif dtrmm_kernel_L2_M4_END: subs counterI, counterI, #1 bgt dtrmm_kernel_L2_M4_20 dtrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dtrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble dtrmm_kernel_L2_M1_BEGIN dtrmm_kernel_L2_M2_20: INIT2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp lsl temp, tempOffset, #4 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble dtrmm_kernel_L2_M2_40 dtrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L2_M2_22 dtrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L2_M2_100 dtrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L2_M2_42 dtrmm_kernel_L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, 
tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif dtrmm_kernel_L2_M2_END: dtrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dtrmm_kernel_L2_END dtrmm_kernel_L2_M1_20: INIT1x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp lsl temp, tempOffset, #3 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 ble dtrmm_kernel_L2_M1_40 dtrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L2_M1_22 dtrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L2_M1_100 dtrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L2_M1_42 dtrmm_kernel_L2_M1_100: SAVE1x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #1 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #3 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #1 #endif dtrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ dtrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble dtrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = A dtrmm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble dtrmm_kernel_L1_M2_BEGIN dtrmm_kernel_L1_M4_20: INIT4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #5 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dtrmm_kernel_L1_M4_40 .align 5 dtrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L1_M4_22 dtrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L1_M4_100 dtrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L1_M4_42 dtrmm_kernel_L1_M4_100: SAVE4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #5 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif dtrmm_kernel_L1_M4_END: subs counterI, counterI, #1 bgt dtrmm_kernel_L1_M4_20 
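// The tempOffset/tempK arithmetic repeated around every tile above clips the
// inner-product length to the triangular part of the factor. A C sketch of the
// rule encoded by those #if blocks (hypothetical names; tile_m/tile_n are the
// M and N dimensions of the current micro-tile):
/*
   static long trmm_effective_k(long K, long off, int left, int transa,
                                int tile_m, int tile_n)
   {
       if ((left && !transa) || (!left && transa))
           return K - off;                       // tile uses the tail of the K range
       return off + (left ? tile_m : tile_n);    // tile uses the head of the K range
   }
*/
// After each SAVE, in the (LEFT && TRANSA) / (!LEFT && !TRANSA) cases pA and pB
// are advanced past the part of K this tile did not consume, and tempOffset
// advances by the M tile size under LEFT (per M tile) or by the N tile size
// otherwise (per N panel), so the next tile starts at the right point of the
// triangle.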
dtrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dtrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 ble dtrmm_kernel_L1_M1_BEGIN dtrmm_kernel_L1_M2_20: INIT2x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #4 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dtrmm_kernel_L1_M2_40 dtrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L1_M2_22 dtrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L1_M2_100 dtrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L1_M2_42 dtrmm_kernel_L1_M2_100: SAVE2x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif dtrmm_kernel_L1_M2_END: dtrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dtrmm_kernel_L1_END dtrmm_kernel_L1_M1_20: INIT1x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #3 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dtrmm_kernel_L1_M1_40 dtrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L1_M1_22 dtrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L1_M1_100 dtrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L1_M1_42 dtrmm_kernel_L1_M1_100: SAVE1x1 dtrmm_kernel_L1_END: dtrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/dtrmm_kernel_4x8.S000066400000000000000000001130261313527062700206720ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 X3 x4 x5 x6 x7*/ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define offset x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pA x15 #define temp x16 #define tempOffset x17 #define tempK x18 #define alpha0 d2 #define alphaV0 v2.d[0] #define alpha1 d3 #define alphaV1 v3.d[0] #define alpha2 d6 #define alphaV2 v6.d[0] #define alpha3 d7 #define alphaV3 v7.d[0] // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 offset // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pA // 16 temp // 17 tempOffset // 18 must save tempK // 19 must save // 20 must save // 21 must save // 22 must save // 23 must save // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp //v00 ALPHA -> pA00, pA01 //v01 pA02, pA03 //v02 ALPHA0 //v03 ALPHA1 //v04 pA10, pA11 //v05 pA12, pA13 //v06 ALPHA2 //v07 ALPHA3 //v08 must save pB0_0, pB0_1 //v09 must save pB0_2, pB0_3 //v10 must save pB0_4, pB0_5 //v11 must save pB0_6, pB0_7 //v12 must save pB1_0, pB1_1 //v13 must save pB1_2, pB1_3 //v14 must save pB1_4, pB1_5 //v15 must save pB1_6, pB1_7 //v16 must save C00, C01 //v17 must save C02, C03 //v18 C04, C05 //v19 C06, C07 //v20 C10, C11 //v21 C12, C13 //v22 C14, C15 //v23 C16, C17 //v24 C20, C21 //v25 C22, C23 //v26 C24, C25 //v27 C26, C27 //v28 C30, C31 //v29 C32, C33 //v30 C34, C35 //v31 C36, C37 /******************************************************************************* * Macro definitions *******************************************************************************/ .macro INIT4x8 fmov d16, xzr fmov d17, xzr fmov d18, xzr fmov d19, d16 fmov d20, xzr fmov d21, d16 fmov d22, d17 fmov d23, d18 fmov d24, xzr fmov d25, d16 fmov d26, d17 fmov d27, d18 fmov d28, xzr fmov d29, d16 fmov d30, d17 fmov d31, d18 .endm .macro KERNEL4x8_I ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 ld1 {v10.2d, v11.2d}, 
[pB] add pB, pB, #32 fmul v16.2d, v0.2d, v8.d[0] fmul v17.2d, v1.2d, v8.d[0] fmul v18.2d, v0.2d, v8.d[1] fmul v19.2d, v1.2d, v8.d[1] fmul v20.2d, v0.2d, v9.d[0] fmul v21.2d, v1.2d, v9.d[0] fmul v22.2d, v0.2d, v9.d[1] fmul v23.2d, v1.2d, v9.d[1] fmul v24.2d, v0.2d, v10.d[0] fmul v25.2d, v1.2d, v10.d[0] fmul v26.2d, v0.2d, v10.d[1] fmul v27.2d, v1.2d, v10.d[1] fmul v28.2d, v0.2d, v11.d[0] fmul v29.2d, v1.2d, v11.d[0] fmul v30.2d, v0.2d, v11.d[1] fmul v31.2d, v1.2d, v11.d[1] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 ld1 {v4.2d, v5.2d}, [pA] add pA, pA, #32 ld1 {v14.2d, v15.2d}, [pB] add pB, pB, #32 .endm .macro KERNEL4x8_M1 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] fmla v18.2d, v0.2d, v8.d[1] fmla v19.2d, v1.2d, v8.d[1] fmla v20.2d, v0.2d, v9.d[0] fmla v21.2d, v1.2d, v9.d[0] fmla v22.2d, v0.2d, v9.d[1] fmla v23.2d, v1.2d, v9.d[1] fmla v24.2d, v0.2d, v10.d[0] fmla v25.2d, v1.2d, v10.d[0] fmla v26.2d, v0.2d, v10.d[1] fmla v27.2d, v1.2d, v10.d[1] fmla v28.2d, v0.2d, v11.d[0] fmla v29.2d, v1.2d, v11.d[0] fmla v30.2d, v0.2d, v11.d[1] fmla v31.2d, v1.2d, v11.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 ld1 {v4.2d, v5.2d}, [pA] // For next round add pA, pA, #32 ld1 {v14.2d, v15.2d}, [pB] add pB, pB, #32 prfm PLDL1KEEP, [pA, #512] .endm .macro KERNEL4x8_M2 fmla v16.2d, v4.2d, v12.d[0] fmla v17.2d, v5.2d, v12.d[0] fmla v18.2d, v4.2d, v12.d[1] fmla v19.2d, v5.2d, v12.d[1] fmla v20.2d, v4.2d, v13.d[0] fmla v21.2d, v5.2d, v13.d[0] fmla v22.2d, v4.2d, v13.d[1] fmla v23.2d, v5.2d, v13.d[1] fmla v24.2d, v4.2d, v14.d[0] fmla v25.2d, v5.2d, v14.d[0] fmla v26.2d, v4.2d, v14.d[1] fmla v27.2d, v5.2d, v14.d[1] fmla v28.2d, v4.2d, v15.d[0] fmla v29.2d, v5.2d, v15.d[0] fmla v30.2d, v4.2d, v15.d[1] fmla v31.2d, v5.2d, v15.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 ld1 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 prfm PLDL1KEEP, [pB, #512] .endm .macro KERNEL4x8_E fmla v16.2d, v4.2d, v12.d[0] fmla v17.2d, v5.2d, v12.d[0] fmla v18.2d, v4.2d, v12.d[1] fmla v19.2d, v5.2d, v12.d[1] fmla v20.2d, v4.2d, v13.d[0] fmla v21.2d, v5.2d, v13.d[0] fmla v22.2d, v4.2d, v13.d[1] fmla v23.2d, v5.2d, v13.d[1] fmla v24.2d, v4.2d, v14.d[0] fmla v25.2d, v5.2d, v14.d[0] fmla v26.2d, v4.2d, v14.d[1] fmla v27.2d, v5.2d, v14.d[1] fmla v28.2d, v4.2d, v15.d[0] fmla v29.2d, v5.2d, v15.d[0] fmla v30.2d, v4.2d, v15.d[1] fmla v31.2d, v5.2d, v15.d[1] .endm .macro KERNEL4x8_SUB ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 ld1 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] fmla v18.2d, v0.2d, v8.d[1] fmla v19.2d, v1.2d, v8.d[1] fmla v20.2d, v0.2d, v9.d[0] fmla v21.2d, v1.2d, v9.d[0] fmla v22.2d, v0.2d, v9.d[1] fmla v23.2d, v1.2d, v9.d[1] fmla v24.2d, v0.2d, v10.d[0] fmla v25.2d, v1.2d, v10.d[0] fmla v26.2d, v0.2d, v10.d[1] fmla v27.2d, v1.2d, v10.d[1] fmla v28.2d, v0.2d, v11.d[0] fmla v29.2d, v1.2d, v11.d[0] fmla v30.2d, v0.2d, v11.d[1] fmla v31.2d, v1.2d, v11.d[1] .endm .macro SAVE4x8 add pCRow1, pCRow0, LDC fmul v8.2d, v16.2d, alphaV0 fmul v9.2d, v17.2d, alphaV1 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow2, pCRow1, LDC fmul v10.2d, v18.2d, alphaV2 fmul v11.2d, v19.2d, alphaV3 st1 {v10.2d, v11.2d}, [pCRow1] add pCRow1, pCRow2, LDC fmul v12.2d, v20.2d, alphaV0 fmul v13.2d, v21.2d, alphaV1 st1 {v12.2d, v13.2d}, [pCRow2] add pCRow2, pCRow1, LDC fmul v14.2d, v22.2d, alphaV2 fmul v15.2d, v23.2d, alphaV3 st1 {v14.2d, v15.2d}, 
[pCRow1] add pCRow1, pCRow2, LDC fmul v8.2d, v24.2d, alphaV0 fmul v9.2d, v25.2d, alphaV1 st1 {v8.2d, v9.2d}, [pCRow2] add pCRow2, pCRow1, LDC fmul v10.2d, v26.2d, alphaV2 fmul v11.2d, v27.2d, alphaV3 st1 {v10.2d, v11.2d}, [pCRow1] add pCRow1, pCRow2, LDC fmul v12.2d, v28.2d, alphaV0 fmul v13.2d, v29.2d, alphaV1 st1 {v12.2d, v13.2d}, [pCRow2] fmul v14.2d, v30.2d, alphaV2 fmul v15.2d, v31.2d, alphaV3 st1 {v14.2d, v15.2d}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x8 fmov d16, xzr fmov d18, xzr fmov d20, xzr fmov d22, d16 fmov d24, xzr fmov d26, d16 fmov d28, xzr fmov d30, d16 .endm .macro KERNEL2x8_SUB ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld1 {v0.2d}, [pA] add pA, pA, #16 ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v18.2d, v0.2d, v8.d[1] fmla v20.2d, v0.2d, v9.d[0] fmla v22.2d, v0.2d, v9.d[1] fmla v24.2d, v0.2d, v10.d[0] fmla v26.2d, v0.2d, v10.d[1] fmla v28.2d, v0.2d, v11.d[0] fmla v30.2d, v0.2d, v11.d[1] .endm .macro SAVE2x8 add pCRow1, pCRow0, LDC fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow2, pCRow1, LDC fmul v10.2d, v18.2d, alphaV2 st1 {v10.2d}, [pCRow1] add pCRow1, pCRow2, LDC fmul v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow2] add pCRow2, pCRow1, LDC fmul v14.2d, v22.2d, alphaV2 st1 {v14.2d}, [pCRow1] add pCRow1, pCRow2, LDC fmul v8.2d, v24.2d, alphaV0 st1 {v8.2d}, [pCRow2] add pCRow2, pCRow1, LDC fmul v10.2d, v26.2d, alphaV2 st1 {v10.2d}, [pCRow1] add pCRow1, pCRow2, LDC fmul v12.2d, v28.2d, alphaV0 st1 {v12.2d}, [pCRow2] add pCRow2, pCRow1, LDC fmul v14.2d, v30.2d, alphaV2 st1 {v14.2d}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x8 fmov d16, xzr fmov d20, xzr fmov d24, xzr fmov d28, xzr .endm .macro KERNEL1x8_SUB ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ldr d0, [pA] add pA, pA, #8 ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 fmla v16.2d, v8.2d, v0.d[0] fmla v20.2d, v9.2d, v0.d[0] fmla v24.2d, v10.2d, v0.d[0] fmla v28.2d, v11.2d, v0.d[0] .endm .macro SAVE1x8 add pCRow1, pCRow0, LDC fmul v8.2d, v16.2d, alphaV0 st1 {v8.d}[0], [pCRow0] st1 {v8.d}[1], [pCRow1] add pCRow2, pCRow1, LDC add pCRow1, pCRow2, LDC fmul v10.2d, v20.2d, alphaV1 st1 {v10.d}[0], [pCRow2] st1 {v10.d}[1], [pCRow1] add pCRow2, pCRow1, LDC add pCRow1, pCRow2, LDC fmul v12.2d, v24.2d, alphaV2 st1 {v12.d}[0], [pCRow2] st1 {v12.d}[1], [pCRow1] add pCRow2, pCRow1, LDC add pCRow1, pCRow2, LDC fmul v14.2d, v28.2d, alphaV3 st1 {v14.d}[0], [pCRow2] st1 {v14.d}[1], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT4x4 fmov d16, xzr fmov d17, d16 fmov d20, d17 fmov d21, d16 fmov d24, d17 fmov d25, d16 fmov d28, d17 fmov d29, d16 .endm .macro KERNEL4x4_I ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 fmul v16.2d, v0.2d, v8.d[0] fmul v29.2d, v1.2d, v9.d[1] fmul v20.2d, v0.2d, v8.d[1] fmul v25.2d, v1.2d, v9.d[0] fmul v24.2d, v0.2d, v9.d[0] fmul v21.2d, v1.2d, v8.d[1] fmul v28.2d, v0.2d, v9.d[1] fmul v17.2d, v1.2d, v8.d[0] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 ld1 {v4.2d, v5.2d}, [pA] add pA, pA, #32 .endm .macro KERNEL4x4_M1 fmla v16.2d, v0.2d, v8.d[0] fmla v29.2d, v1.2d, v9.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 fmla v20.2d, v0.2d, v8.d[1] fmla v25.2d, v1.2d, v9.d[0] ld1 {v4.2d, v5.2d}, [pA] // For next round add pA, pA, #32 fmla v24.2d, v0.2d, v9.d[0] fmla 
v21.2d, v1.2d, v8.d[1] prfm PLDL1KEEP, [pA, #512] fmla v28.2d, v0.2d, v9.d[1] fmla v17.2d, v1.2d, v8.d[0] .endm .macro KERNEL4x4_M2 fmla v16.2d, v4.2d, v12.d[0] fmla v29.2d, v5.2d, v13.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 fmla v20.2d, v4.2d, v12.d[1] fmla v25.2d, v5.2d, v13.d[0] ld1 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 fmla v24.2d, v4.2d, v13.d[0] fmla v21.2d, v5.2d, v12.d[1] prfm PLDL1KEEP, [pB, #512] fmla v28.2d, v4.2d, v13.d[1] fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_E fmla v16.2d, v4.2d, v12.d[0] fmla v29.2d, v5.2d, v13.d[1] fmla v20.2d, v4.2d, v12.d[1] fmla v25.2d, v5.2d, v13.d[0] fmla v24.2d, v4.2d, v13.d[0] fmla v21.2d, v5.2d, v12.d[1] fmla v28.2d, v4.2d, v13.d[1] fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_SUB ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v29.2d, v1.2d, v9.d[1] fmla v20.2d, v0.2d, v8.d[1] fmla v25.2d, v1.2d, v9.d[0] fmla v24.2d, v0.2d, v9.d[0] fmla v21.2d, v1.2d, v8.d[1] fmla v28.2d, v0.2d, v9.d[1] fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 fmul v8.2d, v16.2d, alphaV0 fmul v9.2d, v17.2d, alphaV1 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC fmul v12.2d, v20.2d, alphaV2 fmul v13.2d, v21.2d, alphaV3 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow2, pCRow1, LDC fmul v8.2d, v24.2d, alphaV0 fmul v9.2d, v25.2d, alphaV1 st1 {v8.2d, v9.2d}, [pCRow2] add pCRow1, pCRow2, LDC fmul v12.2d, v28.2d, alphaV2 fmul v13.2d, v29.2d, alphaV3 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x4 fmov d16, xzr fmov d20, d16 fmov d24, d20 fmov d28, d16 .endm .macro KERNEL2x4_SUB ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld1 {v0.2d}, [pA] add pA, pA, #16 fmla v16.2d, v0.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] fmla v24.2d, v0.2d, v9.d[0] fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow1, pCRow0, LDC fmul v12.2d, v20.2d, alphaV1 st1 {v12.2d}, [pCRow1] add pCRow2, pCRow1, LDC fmul v8.2d, v24.2d, alphaV2 st1 {v8.2d}, [pCRow2] add pCRow1, pCRow2, LDC fmul v12.2d, v28.2d, alphaV3 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x4 fmov d16, xzr fmov d20, d16 .endm .macro KERNEL1x4_SUB ldr d0, [pA] add pA, pA, #8 ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 fmla v16.2d, v8.2d, v0.d[0] fmla v20.2d, v9.2d, v0.d[0] .endm .macro SAVE1x4 add pCRow1, pCRow0, LDC fmul v8.2d, v16.2d, alphaV0 st1 {v8.d}[0], [pCRow0] st1 {v8.d}[1], [pCRow1] add pCRow2, pCRow1, LDC add pCRow1, pCRow2, LDC fmul v12.2d, v20.2d, alphaV1 st1 {v12.d}[0], [pCRow2] st1 {v12.d}[1], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT4x2 fmov d16, xzr fmov d17, d16 fmov d20, d17 fmov d21, d16 .endm .macro KERNEL4x2_SUB ld1 {v8.2d}, [pB] add pB, pB, #16 ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 fmul v8.2d, v16.2d, alphaV0 fmul v9.2d, v17.2d, alphaV1 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC fmul v12.2d, v20.2d, alphaV2 fmul v13.2d, v21.2d, alphaV3 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x2 fmov d16, xzr fmov d20, d16 
.endm .macro KERNEL2x2_SUB ld1 {v8.2d}, [pB] add pB, pB, #16 ld1 {v0.2d}, [pA] add pA, pA, #16 fmla v16.2d, v0.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow1 , pCRow0, LDC fmul v12.2d, v20.2d, alphaV1 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x2 fmov d16, xzr .endm .macro KERNEL1x2_SUB ld1 {v8.2d} , [pB] add pB , pB, #16 ldr d0 , [pA] add pA, pA, #8 fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 add pCRow1 , pCRow0, LDC fmul v8.2d, v16.2d, alphaV0 st1 {v8.d}[0], [pCRow0] st1 {v8.d}[1], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT4x1 fmov d16, xzr fmov d17, d16 .endm .macro KERNEL4x1_SUB ldr d8, [pB] add pB , pB, #8 ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 fmul v8.2d, v16.2d, alphaV0 fmul v9.2d, v17.2d, alphaV1 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x1 fmov d16, xzr .endm .macro KERNEL2x1_SUB ldr d8, [pB] add pB , pB, #8 ld1 {v0.2d}, [pA] add pA , pA, #16 fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x1 fmov d16, xzr .endm .macro KERNEL1x1_SUB ldr d8, [pB] add pB , pB, #8 ldr d0, [pA] add pA , pA, #8 fmadd d16, d0, d8, d16 .endm .macro SAVE1x1 fmul d8, d16, alpha0 str d8, [pCRow0] add pCRow0, pCRow0, #8 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] fmov alpha0, d0 fmov alpha1, d0 fmov alpha2, d0 fmov alpha3, d0 lsl LDC, LDC, #3 // ldc = ldc * 8 #if !defined(LEFT) neg tempOffset, offset #endif mov pB, origPB mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 ble dtrmm_kernel_L4_BEGIN /******************************************************************************/ dtrmm_kernel_L8_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #3 #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = start of A array dtrmm_kernel_L8_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble dtrmm_kernel_L8_M2_BEGIN dtrmm_kernel_L8_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pA, pA, temp lsl temp, tempOffset, #6 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #8 #endif asr counterL, tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
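// Schedule of the unrolled loop below (operands are double-buffered between
// v0/v1, v8-v11 and v4/v5, v12-v15; a sketch of the intended overlap, inferred
// from the "For next round" loads in the KERNEL4x8_* macros):
//   KERNEL4x8_I  : multiplies the first K step and preloads the second
//   KERNEL4x8_M2 : multiplies the preloaded step and refills the first buffer
//   M1/M2 loop   : two K steps per iteration, always loading one step ahead
//   KERNEL4x8_M1 + KERNEL4x8_E : final two K steps; _E issues no further loads
// The PRFM PLDL1KEEP hints in M1/M2 prefetch the packed A and B panels 512
// bytes ahead of the current position so the ld1 loads above stay in L1.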
blt dtrmm_kernel_L8_M4_32 KERNEL4x8_I // do one in the K KERNEL4x8_M2 // do another in the K subs counterL, counterL, #2 ble dtrmm_kernel_L8_M4_22a .align 5 dtrmm_kernel_L8_M4_22: KERNEL4x8_M1 KERNEL4x8_M2 subs counterL, counterL, #1 bgt dtrmm_kernel_L8_M4_22 dtrmm_kernel_L8_M4_22a: KERNEL4x8_M1 KERNEL4x8_E b dtrmm_kernel_L8_M4_44 dtrmm_kernel_L8_M4_32: tst counterL, #1 ble dtrmm_kernel_L8_M4_40 KERNEL4x8_I KERNEL4x8_E b dtrmm_kernel_L8_M4_44 dtrmm_kernel_L8_M4_40: INIT4x8 dtrmm_kernel_L8_M4_44: ands counterL, tempK, #1 ble dtrmm_kernel_L8_M4_100 dtrmm_kernel_L8_M4_46: KERNEL4x8_SUB dtrmm_kernel_L8_M4_100: SAVE4x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #8 #endif lsl temp, tempK, #5 add pA, pA, temp lsl temp, tempK, #6 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif dtrmm_kernel_L8_M4_END: subs counterI, counterI, #1 bne dtrmm_kernel_L8_M4_20 dtrmm_kernel_L8_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dtrmm_kernel_L8_END tst counterI, #2 // counterI = counterI / 2 ble dtrmm_kernel_L8_M1_BEGIN dtrmm_kernel_L8_M2_20: INIT2x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pA, pA, temp lsl temp, tempOffset, #6 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #8 #endif asr counterL, tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dtrmm_kernel_L8_M2_40 dtrmm_kernel_L8_M2_22: KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L8_M2_22 dtrmm_kernel_L8_M2_40: ands counterL, tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L8_M2_100 dtrmm_kernel_L8_M2_42: KERNEL2x8_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L8_M2_42 dtrmm_kernel_L8_M2_100: SAVE2x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #8 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #6 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif dtrmm_kernel_L8_M2_END: dtrmm_kernel_L8_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dtrmm_kernel_L8_END dtrmm_kernel_L8_M1_20: INIT1x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pA, pA, temp lsl temp, tempOffset, #6 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #8 #endif asr counterL, tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dtrmm_kernel_L8_M1_40 dtrmm_kernel_L8_M1_22: KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L8_M1_22 dtrmm_kernel_L8_M1_40: ands counterL, tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L8_M1_100 dtrmm_kernel_L8_M1_42: KERNEL1x8_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L8_M1_42 dtrmm_kernel_L8_M1_100: SAVE1x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #1 #else sub tempK, tempK, #8 #endif lsl temp, tempK, #3 add pA, pA, temp lsl temp, tempK, #6 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #1 #endif dtrmm_kernel_L8_END: lsl temp, origK, #6 add origPB, origPB, temp // B = B + K * 8 * 8 #if !defined(LEFT) add tempOffset, tempOffset, #8 #endif subs counterJ, counterJ , #1 // j-- bgt dtrmm_kernel_L8_BEGIN /******************************************************************************/ dtrmm_kernel_L4_BEGIN: mov counterJ , origN tst counterJ , #7 ble dtrmm_kernel_L999 tst counterJ , #4 ble dtrmm_kernel_L2_BEGIN mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = start of A array dtrmm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble dtrmm_kernel_L4_M2_BEGIN dtrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pB, pB, temp add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #4 #endif asr counterL, tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? blt dtrmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 ble dtrmm_kernel_L4_M4_22a .align 5 dtrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 bgt dtrmm_kernel_L4_M4_22 dtrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E b dtrmm_kernel_L4_M4_44 dtrmm_kernel_L4_M4_32: tst counterL, #1 ble dtrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E b dtrmm_kernel_L4_M4_44 dtrmm_kernel_L4_M4_40: INIT4x4 dtrmm_kernel_L4_M4_44: ands counterL , tempK, #1 ble dtrmm_kernel_L4_M4_100 dtrmm_kernel_L4_M4_46: KERNEL4x4_SUB dtrmm_kernel_L4_M4_100: SAVE4x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #5 add pA, pA, temp add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif dtrmm_kernel_L4_M4_END: subs counterI, counterI, #1 bne dtrmm_kernel_L4_M4_20 dtrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dtrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble dtrmm_kernel_L4_M1_BEGIN dtrmm_kernel_L4_M2_20: INIT2x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pA, pA, temp lsl temp, tempOffset, #5 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dtrmm_kernel_L4_M2_40 dtrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L4_M2_22 dtrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L4_M2_100 dtrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L4_M2_42 dtrmm_kernel_L4_M2_100: SAVE2x4 #if 
(defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #5 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif dtrmm_kernel_L4_M2_END: dtrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dtrmm_kernel_L4_END dtrmm_kernel_L4_M1_20: INIT1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pB, pB, temp lsl temp, tempOffset, #3 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dtrmm_kernel_L4_M1_40 dtrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L4_M1_22 dtrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L4_M1_100 dtrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L4_M1_42 dtrmm_kernel_L4_M1_100: SAVE1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #1 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #3 add pA, pA, temp lsl temp, tempK, #5 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #1 #endif dtrmm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 #if !defined(LEFT) add tempOffset, tempOffset, #4 #endif /******************************************************************************/ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble dtrmm_kernel_L999 // error, N was less than 4? 
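// Handle the N remainder (origN & 3): two columns at a time here, then a
// single trailing column at dtrmm_kernel_L1_BEGIN.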
tst counterJ , #2 ble dtrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC add pC,pC,LDC, lsl #1 #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = A dtrmm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 ble dtrmm_kernel_L2_M2_BEGIN dtrmm_kernel_L2_M4_20: INIT4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp lsl temp, tempOffset, #5 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble dtrmm_kernel_L2_M4_40 .align 5 dtrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L2_M4_22 dtrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L2_M4_100 dtrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L2_M4_42 dtrmm_kernel_L2_M4_100: SAVE4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #5 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif dtrmm_kernel_L2_M4_END: subs counterI, counterI, #1 bgt dtrmm_kernel_L2_M4_20 dtrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dtrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble dtrmm_kernel_L2_M1_BEGIN dtrmm_kernel_L2_M2_20: INIT2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp lsl temp, tempOffset, #4 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble dtrmm_kernel_L2_M2_40 dtrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L2_M2_22 dtrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L2_M2_100 dtrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L2_M2_42 dtrmm_kernel_L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif dtrmm_kernel_L2_M2_END: dtrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dtrmm_kernel_L2_END dtrmm_kernel_L2_M1_20: INIT1x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp lsl temp, tempOffset, #3 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif 
defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 ble dtrmm_kernel_L2_M1_40 dtrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L2_M1_22 dtrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L2_M1_100 dtrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L2_M1_42 dtrmm_kernel_L2_M1_100: SAVE1x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #1 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #3 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #1 #endif dtrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ dtrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble dtrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = A dtrmm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble dtrmm_kernel_L1_M2_BEGIN dtrmm_kernel_L1_M4_20: INIT4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #5 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dtrmm_kernel_L1_M4_40 .align 5 dtrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L1_M4_22 dtrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L1_M4_100 dtrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L1_M4_42 dtrmm_kernel_L1_M4_100: SAVE4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #5 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif dtrmm_kernel_L1_M4_END: subs counterI, counterI, #1 bgt dtrmm_kernel_L1_M4_20 dtrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dtrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 ble dtrmm_kernel_L1_M1_BEGIN dtrmm_kernel_L1_M2_20: INIT2x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #4 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dtrmm_kernel_L1_M2_40 
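// K loop for the 2x1 tile, unrolled into eight KERNEL2x1_SUB calls per
// pass; the remaining tempK % 8 iterations are processed one at a time
// in the tail loop that follows.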
dtrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L1_M2_22 dtrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L1_M2_100 dtrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L1_M2_42 dtrmm_kernel_L1_M2_100: SAVE2x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif dtrmm_kernel_L1_M2_END: dtrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dtrmm_kernel_L1_END dtrmm_kernel_L1_M1_20: INIT1x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #3 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dtrmm_kernel_L1_M1_40 dtrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L1_M1_22 dtrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L1_M1_100 dtrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L1_M1_42 dtrmm_kernel_L1_M1_100: SAVE1x1 dtrmm_kernel_L1_END: dtrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/dtrmm_kernel_8x4.S000066400000000000000000001073221313527062700206740ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 X3 x4 x5 x6 x7*/ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define offset x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pCRow3 x15 #define pA x16 #define alpha x17 #define temp x18 #define tempOffset x19 #define tempK x20 #define alpha0 d10 #define alphaV0 v10.d[0] #define A_PRE_SIZE 2560 #define B_PRE_SIZE 448 #define C_PRE_SIZE 128 // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 offset // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pA // 16 temp // 17 tempOffset // 18 must save tempK // 19 must save // 20 must save // 21 must save // 22 must save // 23 must save // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp //v00 ALPHA -> pA0_0, pA0_1 //v01 pA0_2, pA0_3 //v02 pA0_4, pA0_5 //v03 pA0_6, pA0_7 //v04 pA1_0, pA1_1 //v05 pA1_2, pA1_3 //v06 pA1_4, pA1_5 //v07 pA1_6, pA1_7 //v08 must save pB0_0 //v09 must save pB0_1 //v10 must save pB0_2 --> ALPHA0 //v11 must save pB0_3 //v12 must save pB1_0 //v13 must save pB1_1 //v14 must save pB1_2 //v15 must save pB1_3 //v16 must save C00, C01 //v17 must save C02, C03 //v18 C04, C05 //v19 C06, C07 //v20 C10, C11 //v21 C12, C13 //v22 C14, C15 //v23 C16, C17 //v24 C20, C21 //v25 C22, C23 //v26 C24, C25 //v27 C26, C27 //v28 C30, C31 //v29 C32, C33 //v30 C34, C35 //v31 C36, C37 /******************************************************************************* * Macro definitions *******************************************************************************/ .macro INIT8x4 fmov d16, xzr fmov d17, xzr fmov d18, d16 fmov d19, xzr fmov d20, xzr fmov d21, d16 fmov d22, d17 fmov d23, d18 fmov d24, xzr fmov d25, d16 fmov d26, d17 fmov d27, d18 fmov d28, xzr fmov d29, d16 fmov d30, d17 fmov d31, d18 .endm .macro KERNEL8x4_I ldp q0, q1, [pA], #32 ldp d8, d9, [pB], #16 fmul v16.2d, v0.2d, v8.d[0] fmul v20.2d, v0.2d, v9.d[0] ldp d10, d11, [pB], #16 fmul v17.2d, v1.2d, v8.d[0] fmul v21.2d, v1.2d, v9.d[0] ldp q2, q3, [pA], #32 fmul v24.2d, v0.2d, v10.d[0] fmul v28.2d, v0.2d, v11.d[0] ldp q4, q5, [pA], #32 fmul v25.2d, v1.2d, v10.d[0] fmul v29.2d, v1.2d, v11.d[0] ldp d12, d13, [pB], #16 fmul v18.2d, v2.2d, v8.d[0] fmul v22.2d, v2.2d, v9.d[0] ldp d14, d15, [pB], #16 fmul v26.2d, v2.2d, v10.d[0] fmul v30.2d, v2.2d, v11.d[0] ldp q6, q7, [pA], #32 fmul v19.2d, v3.2d, v8.d[0] fmul v27.2d, v3.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmul v31.2d, v3.2d, v11.d[0] fmul v23.2d, v3.2d, v9.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL8x4_M1 fmla v16.2d, v0.2d, v8.d[0] fmla 
v20.2d, v0.2d, v9.d[0] ldp q4, q5, [pA], #32 fmla v24.2d, v0.2d, v10.d[0] fmla v28.2d, v0.2d, v11.d[0] ldp d12, d13, [pB], #16 fmla v17.2d, v1.2d, v8.d[0] fmla v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] fmla v21.2d, v1.2d, v9.d[0] fmla v29.2d, v1.2d, v11.d[0] ldp d14, d15, [pB], #16 fmla v18.2d, v2.2d, v8.d[0] fmla v22.2d, v2.2d, v9.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v26.2d, v2.2d, v10.d[0] fmla v30.2d, v2.2d, v11.d[0] fmla v19.2d, v3.2d, v8.d[0] fmla v23.2d, v3.2d, v9.d[0] ldp q6, q7, [pA], #32 fmla v27.2d, v3.2d, v10.d[0] fmla v31.2d, v3.2d, v11.d[0] .endm .macro KERNEL8x4_M2 fmla v16.2d, v4.2d, v12.d[0] fmla v20.2d, v4.2d, v13.d[0] fmla v24.2d, v4.2d, v14.d[0] fmla v28.2d, v4.2d, v15.d[0] ldp q0, q1, [pA], #32 fmla v17.2d, v5.2d, v12.d[0] fmla v25.2d, v5.2d, v14.d[0] ldp d8, d9, [pB], #16 fmla v21.2d, v5.2d, v13.d[0] fmla v29.2d, v5.2d, v15.d[0] ldp d10, d11, [pB], #16 fmla v18.2d, v6.2d, v12.d[0] fmla v22.2d, v6.2d, v13.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v26.2d, v6.2d, v14.d[0] fmla v30.2d, v6.2d, v15.d[0] fmla v19.2d, v7.2d, v12.d[0] fmla v23.2d, v7.2d, v13.d[0] ldp q2, q3, [pA], #32 fmla v27.2d, v7.2d, v14.d[0] fmla v31.2d, v7.2d, v15.d[0] .endm .macro KERNEL8x4_E fmla v16.2d, v4.2d, v12.d[0] fmla v20.2d, v4.2d, v13.d[0] fmla v24.2d, v4.2d, v14.d[0] fmla v28.2d, v4.2d, v15.d[0] fmla v17.2d, v5.2d, v12.d[0] fmla v25.2d, v5.2d, v14.d[0] fmla v21.2d, v5.2d, v13.d[0] fmla v29.2d, v5.2d, v15.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v18.2d, v6.2d, v12.d[0] fmla v22.2d, v6.2d, v13.d[0] fmla v26.2d, v6.2d, v14.d[0] fmla v30.2d, v6.2d, v15.d[0] fmla v19.2d, v7.2d, v12.d[0] fmla v23.2d, v7.2d, v13.d[0] fmla v27.2d, v7.2d, v14.d[0] fmla v31.2d, v7.2d, v15.d[0] .endm .macro KERNEL8x4_SUB ldp q0, q1, [pA], #32 ldp d8, d9, [pB], #16 fmla v16.2d, v0.2d, v8.d[0] fmla v20.2d, v0.2d, v9.d[0] ldp d10, d11, [pB], #16 fmla v17.2d, v1.2d, v8.d[0] fmla v21.2d, v1.2d, v9.d[0] ldp q2, q3, [pA], #32 fmla v24.2d, v0.2d, v10.d[0] fmla v28.2d, v0.2d, v11.d[0] fmla v25.2d, v1.2d, v10.d[0] fmla v29.2d, v1.2d, v11.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v18.2d, v2.2d, v8.d[0] fmla v22.2d, v2.2d, v9.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] fmla v26.2d, v2.2d, v10.d[0] fmla v30.2d, v2.2d, v11.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v19.2d, v3.2d, v8.d[0] fmla v27.2d, v3.2d, v10.d[0] fmla v31.2d, v3.2d, v11.d[0] fmla v23.2d, v3.2d, v9.d[0] .endm .macro SAVE8x4 fmov alpha0, alpha prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] fmul v0.2d, v16.2d, alphaV0 fmul v1.2d, v17.2d, alphaV0 stp q0, q1, [pCRow0] add pCRow0, pCRow0, #32 prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] fmul v2.2d, v18.2d, alphaV0 fmul v3.2d, v19.2d, alphaV0 stp q2, q3, [pCRow0] add pCRow0, pCRow0, #32 prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] fmul v4.2d, v20.2d, alphaV0 fmul v5.2d, v21.2d, alphaV0 stp q4, q5, [pCRow1] add pCRow1, pCRow1, #32 prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] fmul v6.2d, v22.2d, alphaV0 fmul v7.2d, v23.2d, alphaV0 stp q6, q7, [pCRow1] add pCRow1, pCRow1, #32 prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] fmul v0.2d, v24.2d, alphaV0 fmul v1.2d, v25.2d, alphaV0 stp q0, q1, [pCRow2] add pCRow2, pCRow2, #32 prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] fmul v2.2d, v26.2d, alphaV0 fmul v3.2d, v27.2d, alphaV0 stp q2, q3, [pCRow2] add pCRow2, pCRow2, #32 prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] fmul v4.2d, v28.2d, alphaV0 fmul v5.2d, v29.2d, alphaV0 stp q4, q5, [pCRow3] add pCRow3, pCRow3, #32 prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] fmul v6.2d, v30.2d, alphaV0 fmul v7.2d, v31.2d, alphaV0 stp q6, q7, [pCRow3] add pCRow3, pCRow3, #32 
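// The 8x4 result is written as alpha * (A*B) with fmul + store; the
// previous contents of C are never loaded or accumulated here. Each
// pCRow pointer has advanced by 64 bytes, past the eight doubles just
// written for its column.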
.endm /******************************************************************************/ .macro INIT4x4 fmov d16, xzr fmov d17, d16 fmov d20, d17 fmov d21, d16 fmov d24, d17 fmov d25, d16 fmov d28, d17 fmov d29, d16 .endm .macro KERNEL4x4_SUB ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v29.2d, v1.2d, v9.d[1] fmla v20.2d, v0.2d, v8.d[1] fmla v25.2d, v1.2d, v9.d[0] fmla v24.2d, v0.2d, v9.d[0] fmla v21.2d, v1.2d, v8.d[1] fmla v28.2d, v0.2d, v9.d[1] fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 fmul v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC fmul v12.2d, v20.2d, alphaV0 fmul v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow2, pCRow1, LDC fmul v8.2d, v24.2d, alphaV0 fmul v9.2d, v25.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow2] add pCRow1, pCRow2, LDC fmul v12.2d, v28.2d, alphaV0 fmul v13.2d, v29.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x4 fmov d16, xzr fmov d20, d16 fmov d24, d20 fmov d28, d16 .endm .macro KERNEL2x4_SUB ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld1 {v0.2d}, [pA] add pA, pA, #16 fmla v16.2d, v0.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] fmla v24.2d, v0.2d, v9.d[0] fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow1, pCRow0, LDC fmul v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow2, pCRow1, LDC fmul v8.2d, v24.2d, alphaV0 st1 {v8.2d}, [pCRow2] add pCRow1, pCRow2, LDC fmul v12.2d, v28.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x4 fmov d16, xzr fmov d20, d16 .endm .macro KERNEL1x4_SUB ldr d0, [pA] add pA, pA, #8 ld1 {v8.2d, v9.2d}, [pB] add pB, pB, #32 fmla v16.2d, v8.2d, v0.d[0] fmla v20.2d, v9.2d, v0.d[0] .endm .macro SAVE1x4 fmov alpha0, alpha add pCRow1, pCRow0, LDC fmul v8.2d, v16.2d, alphaV0 st1 {v8.d}[0], [pCRow0] st1 {v8.d}[1], [pCRow1] add pCRow2, pCRow1, LDC add pCRow1, pCRow2, LDC fmul v12.2d, v20.2d, alphaV0 st1 {v12.d}[0], [pCRow2] st1 {v12.d}[1], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT8x2 fmov d16, xzr fmov d17, xzr fmov d18, d16 fmov d19, d17 fmov d20, xzr fmov d21, d16 fmov d22, d17 fmov d23, d18 .endm .macro KERNEL8x2_SUB ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 ld1 {v8.2d}, [pB] add pB, pB, #16 ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] fmla v18.2d, v2.2d, v8.d[0] fmla v19.2d, v3.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] fmla v21.2d, v1.2d, v8.d[1] fmla v22.2d, v2.2d, v8.d[1] fmla v23.2d, v3.2d, v8.d[1] .endm .macro SAVE8x2 fmov alpha0, alpha add pCRow1, pCRow0, LDC fmul v0.2d, v16.2d, alphaV0 fmul v1.2d, v17.2d, alphaV0 fmul v2.2d, v18.2d, alphaV0 fmul v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] fmul v4.2d, v20.2d, alphaV0 fmul v5.2d, v21.2d, alphaV0 fmul v6.2d, v22.2d, alphaV0 fmul v7.2d, v23.2d, alphaV0 st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT4x2 fmov d16, xzr fmov d17, d16 fmov d20, d17 fmov d21, d16 .endm .macro KERNEL4x2_SUB ld1 {v8.2d}, [pB] add pB, pB, #16 ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 
fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 fmul v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC fmul v12.2d, v20.2d, alphaV0 fmul v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x2 fmov d16, xzr fmov d20, d16 .endm .macro KERNEL2x2_SUB ld1 {v8.2d}, [pB] add pB, pB, #16 ld1 {v0.2d}, [pA] add pA, pA, #16 fmla v16.2d, v0.2d, v8.d[0] fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow1 , pCRow0, LDC fmul v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x2 fmov d16, xzr .endm .macro KERNEL1x2_SUB ld1 {v8.2d} , [pB] add pB , pB, #16 ldr d0 , [pA] add pA, pA, #8 fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 fmov alpha0, alpha add pCRow1 , pCRow0, LDC fmul v8.2d, v16.2d, alphaV0 st1 {v8.d}[0], [pCRow0] st1 {v8.d}[1], [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT8x1 fmov d16, xzr fmov d17, xzr fmov d18, d16 fmov d19, d17 .endm .macro KERNEL8x1_SUB ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 ldr d8, [pB] add pB , pB, #8 ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] fmla v18.2d, v2.2d, v8.d[0] fmla v19.2d, v3.2d, v8.d[0] .endm .macro SAVE8x1 fmov alpha0, alpha fmul v0.2d, v16.2d, alphaV0 fmul v1.2d, v17.2d, alphaV0 fmul v2.2d, v18.2d, alphaV0 fmul v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT4x1 fmov d16, xzr fmov d17, d16 .endm .macro KERNEL4x1_SUB ldr d8, [pB] add pB , pB, #8 ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 fmla v16.2d, v0.2d, v8.d[0] fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 fmul v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT2x1 fmov d16, xzr .endm .macro KERNEL2x1_SUB ldr d8, [pB] add pB , pB, #8 ld1 {v0.2d}, [pA] add pA , pA, #16 fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT1x1 fmov d16, xzr .endm .macro KERNEL1x1_SUB ldr d8, [pB] add pB , pB, #8 ldr d0, [pA] add pA , pA, #8 fmadd d16, d0, d8, d16 .endm .macro SAVE1x1 fmov alpha0, alpha fmul d8, d16, alpha0 str d8, [pCRow0] add pCRow0, pCRow0, #8 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] prfm 
PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPA] fmov alpha, d0 lsl LDC, LDC, #3 // ldc = ldc * 8 #if !defined(LEFT) neg tempOffset, offset #endif mov pB, origPB mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble dtrmm_kernel_L2_BEGIN /******************************************************************************/ dtrmm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC add pCRow3, pCRow2, LDC add pC, pCRow3, LDC #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = start of A array dtrmm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble dtrmm_kernel_L4_M4_BEGIN .align 5 dtrmm_kernel_L4_M8_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #6 add pA, pA, temp lsl temp, tempOffset, #5 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #8 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 // L = K / 8 cmp counterL , #2 // is there at least 4 to do? blt dtrmm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 ble dtrmm_kernel_L4_M8_22a .align 5 dtrmm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 bgt dtrmm_kernel_L4_M8_22 .align 5 dtrmm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b dtrmm_kernel_L4_M8_44 .align 5 dtrmm_kernel_L4_M8_32: tst counterL, #1 ble dtrmm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b dtrmm_kernel_L4_M8_44 dtrmm_kernel_L4_M8_40: INIT8x4 dtrmm_kernel_L4_M8_44: ands counterL , tempK, #7 ble dtrmm_kernel_L4_M8_100 .align 5 dtrmm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 bne dtrmm_kernel_L4_M8_46 dtrmm_kernel_L4_M8_100: SAVE8x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #8 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #6 add pA, pA, temp lsl temp, tempK, #5 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #8 #endif prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] dtrmm_kernel_L4_M8_END: subs counterI, counterI, #1 bne dtrmm_kernel_L4_M8_20 dtrmm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 ble dtrmm_kernel_L4_END tst counterI, #4 ble dtrmm_kernel_L4_M2_BEGIN dtrmm_kernel_L4_M4_20: INIT4x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pB, pB, temp add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dtrmm_kernel_L4_M4_40 dtrmm_kernel_L4_M4_22: KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L4_M4_22 
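// Leftover K iterations (tempK % 8) for the 4x4 tile are handled below,
// one KERNEL4x4_SUB per pass.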
dtrmm_kernel_L4_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L4_M4_100 dtrmm_kernel_L4_M4_42: KERNEL4x4_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L4_M4_42 dtrmm_kernel_L4_M4_100: SAVE4x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #5 add pA, pA, temp add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif dtrmm_kernel_L4_M4_END: dtrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dtrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble dtrmm_kernel_L4_M1_BEGIN dtrmm_kernel_L4_M2_20: INIT2x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pA, pA, temp lsl temp, tempOffset, #5 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dtrmm_kernel_L4_M2_40 dtrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L4_M2_22 dtrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L4_M2_100 dtrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L4_M2_42 dtrmm_kernel_L4_M2_100: SAVE2x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #5 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif dtrmm_kernel_L4_M2_END: dtrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dtrmm_kernel_L4_END dtrmm_kernel_L4_M1_20: INIT1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pB, pB, temp lsl temp, tempOffset, #3 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dtrmm_kernel_L4_M1_40 dtrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L4_M1_22 dtrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L4_M1_100 dtrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L4_M1_42 dtrmm_kernel_L4_M1_100: SAVE1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #1 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #3 add pA, pA, temp lsl temp, tempK, #5 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #1 #endif dtrmm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 #if !defined(LEFT) add tempOffset, tempOffset, #4 #endif subs counterJ, counterJ , #1 // j-- bgt dtrmm_kernel_L4_BEGIN 
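// The N/4 column loop is finished; columns left over in origN & 3 are
// handled below, two at a time and then one.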
/******************************************************************************/ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble dtrmm_kernel_L999 // error, N was less than 4? tst counterJ , #2 ble dtrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC add pC,pC,LDC, lsl #1 #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = A dtrmm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble dtrmm_kernel_L2_M4_BEGIN dtrmm_kernel_L2_M8_20: INIT8x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #6 add pA, pA, temp lsl temp, tempOffset, #4 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #8 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble dtrmm_kernel_L2_M8_40 .align 5 dtrmm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L2_M8_22 dtrmm_kernel_L2_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L2_M8_100 dtrmm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L2_M8_42 dtrmm_kernel_L2_M8_100: SAVE8x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #8 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #6 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #8 #endif dtrmm_kernel_L2_M8_END: subs counterI, counterI, #1 bgt dtrmm_kernel_L2_M8_20 dtrmm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 ble dtrmm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 ble dtrmm_kernel_L2_M2_BEGIN dtrmm_kernel_L2_M4_20: INIT4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp lsl temp, tempOffset, #5 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble dtrmm_kernel_L2_M4_40 .align 5 dtrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L2_M4_22 dtrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L2_M4_100 dtrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L2_M4_42 dtrmm_kernel_L2_M4_100: SAVE4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #5 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif dtrmm_kernel_L2_M4_END: dtrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dtrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble dtrmm_kernel_L2_M1_BEGIN dtrmm_kernel_L2_M2_20: INIT2x2 #if (defined(LEFT) && 
defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp lsl temp, tempOffset, #4 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble dtrmm_kernel_L2_M2_40 dtrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L2_M2_22 dtrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L2_M2_100 dtrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L2_M2_42 dtrmm_kernel_L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif dtrmm_kernel_L2_M2_END: dtrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dtrmm_kernel_L2_END dtrmm_kernel_L2_M1_20: INIT1x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp lsl temp, tempOffset, #3 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 ble dtrmm_kernel_L2_M1_40 dtrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L2_M1_22 dtrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L2_M1_100 dtrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L2_M1_42 dtrmm_kernel_L2_M1_100: SAVE1x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #1 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #3 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #1 #endif dtrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ dtrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble dtrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = A dtrmm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble dtrmm_kernel_L1_M4_BEGIN dtrmm_kernel_L1_M8_20: INIT8x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #6 add pA, pA, temp lsl temp, tempOffset, #3 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add 
tempK, tempOffset, #8 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dtrmm_kernel_L1_M8_40 .align 5 dtrmm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L1_M8_22 dtrmm_kernel_L1_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L1_M8_100 dtrmm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L1_M8_42 dtrmm_kernel_L1_M8_100: SAVE8x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #8 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #6 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #8 #endif dtrmm_kernel_L1_M8_END: subs counterI, counterI, #1 bgt dtrmm_kernel_L1_M8_20 dtrmm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 ble dtrmm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 ble dtrmm_kernel_L1_M2_BEGIN dtrmm_kernel_L1_M4_20: INIT4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #5 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dtrmm_kernel_L1_M4_40 .align 5 dtrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L1_M4_22 dtrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L1_M4_100 dtrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L1_M4_42 dtrmm_kernel_L1_M4_100: SAVE4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #5 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif dtrmm_kernel_L1_M4_END: dtrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 ble dtrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 ble dtrmm_kernel_L1_M1_BEGIN dtrmm_kernel_L1_M2_20: INIT2x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #4 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dtrmm_kernel_L1_M2_40 dtrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L1_M2_22 dtrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L1_M2_100 dtrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L1_M2_42 dtrmm_kernel_L1_M2_100: SAVE2x1 #if (defined(LEFT) && 
defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif dtrmm_kernel_L1_M2_END: dtrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble dtrmm_kernel_L1_END dtrmm_kernel_L1_M1_20: INIT1x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #3 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble dtrmm_kernel_L1_M1_40 dtrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L1_M1_22 dtrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble dtrmm_kernel_L1_M1_100 dtrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt dtrmm_kernel_L1_M1_42 dtrmm_kernel_L1_M1_100: SAVE1x1 dtrmm_kernel_L1_END: dtrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/dznrm2_thunderx2t99.c000066400000000000000000000267331313527062700213210ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" #include #if defined(SMP) extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); #endif #define N "x0" /* vector length */ #define X "x1" /* X vector address */ #define INC_X "x2" /* X stride */ #define J "x3" /* loop variable */ #define K "x4" /* loop variable */ #if !defined(COMPLEX) #define INC_SHIFT "3" #define SZ "8" #else #define INC_SHIFT "4" #define SZ "16" #endif #define SSQ "d0" #define SCALE "d1" #define REGZERO "d5" #define REGONE "d6" #define CUR_MAX "d7" #define CUR_MAXINV "d8" #define CUR_MAXINV_V "v8.2d" #define CUR_MAX_V "v8.2d" static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, double *ssq, double *scale) { *ssq = 0.0; *scale = 0.0; if (n <= 0) return; __asm__ __volatile__ ( " mov "N", %[N_] \n" " mov "X", %[X_] \n" " mov "INC_X", %[INCX_] \n" " fmov "SCALE", xzr \n" " fmov "SSQ", #1.0 \n" " cmp "N", xzr \n" " ble .Lnrm2_kernel_L999 \n" " cmp "INC_X", xzr \n" " ble .Lnrm2_kernel_L999 \n" ".Lnrm2_kernel_F_BEGIN: \n" " fmov "REGZERO", xzr \n" " fmov "REGONE", #1.0 \n" " lsl "INC_X", "INC_X", #"INC_SHIFT" \n" " mov "J", "N" \n" " cmp "J", xzr \n" " beq .Lnrm2_kernel_L999 \n" ".Lnrm2_kernel_F_ZERO_SKIP: \n" " ldr d4, ["X"] \n" " fcmp d4, "REGZERO" \n" " bne .Lnrm2_kernel_F_INIT \n" #if defined(COMPLEX) " ldr d4, ["X", #8] \n" " fcmp d4, "REGZERO" \n" " bne .Lnrm2_kernel_F_INIT_I \n" #endif " add "X", "X", "INC_X" \n" " subs "J", "J", #1 \n" " beq .Lnrm2_kernel_L999 \n" " b .Lnrm2_kernel_F_ZERO_SKIP \n" ".Lnrm2_kernel_F_INIT: \n" " ldr d4, ["X"] \n" " fabs d4, d4 \n" " fmax "CUR_MAX", "SCALE", d4 \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" " fdiv d4, d4, "CUR_MAX" \n" " fmul d4, d4, d4 \n" " fadd "SSQ", "SSQ", d4 \n" " fmov "SCALE", "CUR_MAX" \n" #if defined(COMPLEX) ".Lnrm2_kernel_F_INIT_I: \n" " ldr d3, ["X", #8] \n" " fabs d3, d3 \n" " fmax "CUR_MAX", "SCALE", d3 \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" " fdiv d3, d3, "CUR_MAX" \n" " fmul d3, d3, d3 \n" " fadd "SSQ", "SSQ", d3 \n" " fmov "SCALE", "CUR_MAX" \n" #endif " add "X", "X", "INC_X" \n" " subs "J", "J", #1 \n" " beq .Lnrm2_kernel_L999 \n" ".Lnrm2_kernel_F_START: \n" " cmp "INC_X", #"SZ" \n" " bne .Lnrm2_kernel_F1 \n" " asr "K", "J", #4 \n" " cmp "K", xzr \n" " beq .Lnrm2_kernel_F1 \n" ".Lnrm2_kernel_F: \n" " ldp q16, q17, ["X"] \n" " ldp q18, q19, ["X", #32] \n" " ldp q20, q21, ["X", #64] \n" " ldp q22, q23, ["X", #96] \n" " add "X", "X", #128 \n" " fabs v16.2d, v16.2d \n" " fabs v17.2d, v17.2d \n" " fabs v18.2d, v18.2d \n" " fabs v19.2d, v19.2d \n" " fabs v20.2d, v20.2d \n" " fabs v21.2d, v21.2d \n" " fabs v22.2d, v22.2d \n" " fabs v23.2d, v23.2d \n" " fmaxp v24.2d, v16.2d, v17.2d \n" " fmaxp v25.2d, v18.2d, v19.2d \n" " fmaxp v26.2d, v20.2d, v21.2d \n" " fmaxp v27.2d, v22.2d, v23.2d \n" " fmaxp v24.2d, v24.2d, v25.2d \n" " fmaxp v26.2d, v26.2d, v27.2d \n" " fmaxp v24.2d, v24.2d, v26.2d \n" " fmaxp v24.2d, v24.2d, v24.2d \n" " fmax "CUR_MAX", "SCALE", d24 \n" " fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n" " //dup "CUR_MAX_V", v7.d[0] \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" " dup "CUR_MAXINV_V", v8.d[0] \n" " fmul v16.2d, v16.2d, "CUR_MAXINV_V" 
\n" " fmul v17.2d, v17.2d, "CUR_MAXINV_V" \n" " fmul v18.2d, v18.2d, "CUR_MAXINV_V" \n" " fmul v19.2d, v19.2d, "CUR_MAXINV_V" \n" " fmul v20.2d, v20.2d, "CUR_MAXINV_V" \n" " fmul v21.2d, v21.2d, "CUR_MAXINV_V" \n" " fmul v22.2d, v22.2d, "CUR_MAXINV_V" \n" " fmul v23.2d, v23.2d, "CUR_MAXINV_V" \n" " //fdiv v16.2d, v16.2d, "CUR_MAX_V" \n" " //fdiv v17.2d, v17.2d, "CUR_MAX_V" \n" " //fdiv v18.2d, v18.2d, "CUR_MAX_V" \n" " //fdiv v19.2d, v19.2d, "CUR_MAX_V" \n" " //fdiv v20.2d, v20.2d, "CUR_MAX_V" \n" " //fdiv v21.2d, v21.2d, "CUR_MAX_V" \n" " //fdiv v22.2d, v22.2d, "CUR_MAX_V" \n" " //fdiv v23.2d, v23.2d, "CUR_MAX_V" \n" " fmul v24.2d, v16.2d, v16.2d \n" " fmul v25.2d, v17.2d, v17.2d \n" " fmul v26.2d, v18.2d, v18.2d \n" " fmul v27.2d, v19.2d, v19.2d \n" " fmla v24.2d, v20.2d, v20.2d \n" " fmla v25.2d, v21.2d, v21.2d \n" " fmla v26.2d, v22.2d, v22.2d \n" " fmla v27.2d, v23.2d, v23.2d \n" " fadd v24.2d, v24.2d, v25.2d \n" " fadd v26.2d, v26.2d, v27.2d \n" " fadd v24.2d, v24.2d, v26.2d \n" " faddp d24, v24.2d \n" " fadd "SSQ", "SSQ", d24 \n" " fmov "SCALE", "CUR_MAX" \n" #if defined(COMPLEX) " ldp q16, q17, ["X"] \n" " ldp q18, q19, ["X", #32] \n" " ldp q20, q21, ["X", #64] \n" " ldp q22, q23, ["X", #96] \n" " add "X", "X", #128 \n" " fabs v16.2d, v16.2d \n" " fabs v17.2d, v17.2d \n" " fabs v18.2d, v18.2d \n" " fabs v19.2d, v19.2d \n" " fabs v20.2d, v20.2d \n" " fabs v21.2d, v21.2d \n" " fabs v22.2d, v22.2d \n" " fabs v23.2d, v23.2d \n" " fmaxp v24.2d, v16.2d, v17.2d \n" " fmaxp v25.2d, v18.2d, v19.2d \n" " fmaxp v26.2d, v20.2d, v21.2d \n" " fmaxp v27.2d, v22.2d, v23.2d \n" " fmaxp v24.2d, v24.2d, v25.2d \n" " fmaxp v26.2d, v26.2d, v27.2d \n" " fmaxp v24.2d, v24.2d, v26.2d \n" " fmaxp v24.2d, v24.2d, v24.2d \n" " fmax "CUR_MAX", "SCALE", d24 \n" " fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n" " //dup "CUR_MAX_V", v7.d[0] \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" " dup "CUR_MAXINV_V", v8.d[0] \n" " fmul v16.2d, v16.2d, "CUR_MAXINV_V" \n" " fmul v17.2d, v17.2d, "CUR_MAXINV_V" \n" " fmul v18.2d, v18.2d, "CUR_MAXINV_V" \n" " fmul v19.2d, v19.2d, "CUR_MAXINV_V" \n" " fmul v20.2d, v20.2d, "CUR_MAXINV_V" \n" " fmul v21.2d, v21.2d, "CUR_MAXINV_V" \n" " fmul v22.2d, v22.2d, "CUR_MAXINV_V" \n" " fmul v23.2d, v23.2d, "CUR_MAXINV_V" \n" " //fdiv v16.2d, v16.2d, "CUR_MAX_V" \n" " //fdiv v17.2d, v17.2d, "CUR_MAX_V" \n" " //fdiv v18.2d, v18.2d, "CUR_MAX_V" \n" " //fdiv v19.2d, v19.2d, "CUR_MAX_V" \n" " //fdiv v20.2d, v20.2d, "CUR_MAX_V" \n" " //fdiv v21.2d, v21.2d, "CUR_MAX_V" \n" " //fdiv v22.2d, v22.2d, "CUR_MAX_V" \n" " //fdiv v23.2d, v23.2d, "CUR_MAX_V" \n" " fmul v24.2d, v16.2d, v16.2d \n" " fmul v25.2d, v17.2d, v17.2d \n" " fmul v26.2d, v18.2d, v18.2d \n" " fmul v27.2d, v19.2d, v19.2d \n" " fmla v24.2d, v20.2d, v20.2d \n" " fmla v25.2d, v21.2d, v21.2d \n" " fmla v26.2d, v22.2d, v22.2d \n" " fmla v27.2d, v23.2d, v23.2d \n" " fadd v24.2d, v24.2d, v25.2d \n" " fadd v26.2d, v26.2d, v27.2d \n" " fadd v24.2d, v24.2d, v26.2d \n" " faddp d24, v24.2d \n" " fadd "SSQ", "SSQ", d24 \n" " fmov "SCALE", "CUR_MAX" \n" #endif " subs "K", "K", #1 \n" " bne .Lnrm2_kernel_F \n" ".Lnrm2_kernel_F_DONE: \n" " ands "J", "J", #15 \n" " beq .Lnrm2_kernel_L999 \n" ".Lnrm2_kernel_F1: \n" " ldr d4, ["X"] \n" " fabs d4, d4 \n" " fmax "CUR_MAX", "SCALE", d4 \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" " fdiv d4, d4, "CUR_MAX" \n" " fmul d4, d4, d4 \n" " fadd "SSQ", "SSQ", d4 \n" " fmov "SCALE", 
"CUR_MAX" \n" #if defined(COMPLEX) " ldr d3, ["X", #8] \n" " fabs d3, d3 \n" " fmax "CUR_MAX", "SCALE", d3 \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" " fdiv d3, d3, "CUR_MAX" \n" " fmul d3, d3, d3 \n" " fadd "SSQ", "SSQ", d3 \n" " fmov "SCALE", "CUR_MAX" \n" #endif " add "X", "X", "INC_X" \n" " subs "J", "J", #1 \n" " bne .Lnrm2_kernel_F1 \n" ".Lnrm2_kernel_L999: \n" " str "SSQ", [%[SSQ_]] \n" " str "SCALE", [%[SCALE_]] \n" : : [SSQ_] "r" (ssq), //%0 [SCALE_] "r" (scale), //%1 [N_] "r" (n), //%2 [X_] "r" (x), //%3 [INCX_] "r" (inc_x) //%4 : "cc", "memory", "x0", "x1", "x2", "x3", "x4", "x5", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8" ); } #if defined(SMP) static int nrm2_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3, BLASLONG dummy4, FLOAT *result, BLASLONG dummy5) { nrm2_compute(n, x, inc_x, result, result + 1); return 0; } #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { #if defined(SMP) int nthreads; FLOAT dummy_alpha[2]; #endif FLOAT ssq, scale; if (n <= 0 || inc_x <= 0) return 0.0; #if defined(SMP) nthreads = num_cpu_avail(1); if (n <= 10000) nthreads = 1; if (nthreads == 1) { nrm2_compute(n, x, inc_x, &ssq, &scale); } else { int mode, i; char result[MAX_CPU_NUMBER * sizeof(double) * 2]; double *ptr; #if !defined(COMPLEX) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_DOUBLE | BLAS_COMPLEX; #endif blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, ( void *)nrm2_thread_function, nthreads); scale = 0.0; ssq = 1.0; ptr = (double *)result; for (i = 0; i < nthreads; i++) { FLOAT cur_scale, cur_ssq; cur_ssq = *ptr; cur_scale = *(ptr + 1); if (cur_scale != 0) { if (cur_scale > scale) { scale = (scale / cur_scale); ssq = ssq * scale * scale; ssq += cur_ssq; scale = cur_scale; } else { cur_scale = (cur_scale / scale); cur_ssq = cur_ssq * cur_scale * cur_scale; ssq += cur_ssq; } } ptr = (double *)(((char *)ptr) + sizeof(double) * 2); } } #else nrm2_compute(n, x, inc_x, &ssq, &scale); #endif ssq = sqrt(ssq) * scale; return ssq; } OpenBLAS-0.2.20/kernel/arm64/dznrm2_thunderx2t99_fast.c000066400000000000000000000164111313527062700223260ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include #if defined(SMP) extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); #endif #define N "x0" /* vector length */ #define X "x1" /* X vector address */ #define INC_X "x2" /* X stride */ #define J "x5" /* loop variable */ #define TMPF "d16" #define SSQ "d0" #if !defined(COMPLEX) #define N_DIV_SHIFT "5" #define N_REM_MASK "31" #define INC_SHIFT "3" #else #define N_DIV_SHIFT "4" #define N_REM_MASK "15" #define INC_SHIFT "4" #endif #define KERNEL_F \ "ldp q16, q17, ["X"] \n" \ "ldp q18, q19, ["X", #32] \n" \ "ldp q20, q21, ["X", #64] \n" \ "ldp q22, q23, ["X", #96] \n" \ "ldp q24, q25, ["X", #128] \n" \ "ldp q26, q27, ["X", #160] \n" \ "ldp q28, q29, ["X", #192] \n" \ "ldp q30, q31, ["X", #224] \n" \ "add "X", "X", #256 \n" \ "fmla v0.2d, v16.2d, v16.2d \n" \ "fmla v1.2d, v17.2d, v17.2d \n" \ "fmla v2.2d, v18.2d, v18.2d \n" \ "fmla v3.2d, v19.2d, v19.2d \n" \ "prfm PLDL1KEEP, ["X", #1024] \n" \ "prfm PLDL1KEEP, ["X", #1024+64] \n" \ "fmla v4.2d, v20.2d, v20.2d \n" \ "fmla v5.2d, v21.2d, v21.2d \n" \ "fmla v6.2d, v22.2d, v22.2d \n" \ "fmla v7.2d, v23.2d, v23.2d \n" \ "prfm PLDL1KEEP, ["X", #1024+128] \n" \ "prfm PLDL1KEEP, ["X", #1024+192] \n" \ "fmla v0.2d, v24.2d, v24.2d \n" \ "fmla v1.2d, v25.2d, v25.2d \n" \ "fmla v2.2d, v26.2d, v26.2d \n" \ "fmla v3.2d, v27.2d, v27.2d \n" \ "fmla v4.2d, v28.2d, v28.2d \n" \ "fmla v5.2d, v29.2d, v29.2d \n" \ "fmla v6.2d, v30.2d, v30.2d \n" \ "fmla v7.2d, v31.2d, v31.2d \n" #if !defined(COMPLEX) #define KERNEL_F1 \ "ldr "TMPF", ["X"] \n" \ "add "X", "X", "INC_X" \n" \ "fmadd "SSQ", "TMPF", "TMPF", "SSQ" \n" #define KERNEL_F_FINALIZE \ "fadd v0.2d, v0.2d, v1.2d \n" \ "fadd v2.2d, v2.2d, v3.2d \n" \ "fadd v4.2d, v4.2d, v5.2d \n" \ "fadd v6.2d, v6.2d, v7.2d \n" \ "fadd v0.2d, v0.2d, v2.2d \n" \ "fadd v4.2d, v4.2d, v6.2d \n" \ "fadd v0.2d, v0.2d, v4.2d \n" \ "faddp "SSQ", v0.2d \n" #define KERNEL_FINALIZE \ "" #else #define KERNEL_F1 \ "ldr q16, ["X"] \n" \ "add "X", "X", "INC_X" \n" \ "fmla v0.2d, v16.2d, v16.2d \n" #define KERNEL_F_FINALIZE \ "fadd v0.2d, v0.2d, v1.2d \n" \ "fadd v2.2d, v2.2d, v3.2d \n" \ "fadd v4.2d, v4.2d, v5.2d \n" \ "fadd v6.2d, v6.2d, v7.2d \n" \ "fadd v0.2d, v0.2d, v2.2d \n" \ "fadd v4.2d, v4.2d, v6.2d \n" \ "fadd v0.2d, v0.2d, v4.2d \n" #define KERNEL_FINALIZE \ "faddp "SSQ", v0.2d \n" #endif static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) { double ret = 0.0 ; if (n <= 0) return ret; __asm__ __volatile__ ( " mov "N", %[N_] \n" " mov "X", %[X_] \n" " mov "INC_X", %[INCX_] \n" " fmov "SSQ", xzr \n" " fmov d1, xzr \n" " fmov d2, xzr \n" " fmov d3, xzr \n" " fmov d4, xzr \n" " fmov d5, xzr \n" " fmov d6, xzr \n" " fmov d7, xzr \n" " cmp "N", xzr \n" " ble .Lnrm2_kernel_L999 \n" " cmp "INC_X", xzr \n" " ble .Lnrm2_kernel_L999 \n" " cmp "INC_X", #1 \n" " bne 
.Lnrm2_kernel_S_BEGIN \n" ".Lnrm2_kernel_F_BEGIN: \n" " lsl "INC_X", "INC_X", #"INC_SHIFT" \n" " asr "J", "N", #"N_DIV_SHIFT" \n" " cmp "J", xzr \n" " beq .Lnrm2_kernel_F1 \n" " .align 5 \n" ".Lnrm2_kernel_F: \n" " "KERNEL_F" \n" " subs "J", "J", #1 \n" " bne .Lnrm2_kernel_F \n" " "KERNEL_F_FINALIZE" \n" ".Lnrm2_kernel_F1: \n" " ands "J", "N", #"N_REM_MASK" \n" " ble .Lnrm2_kernel_L999 \n" ".Lnrm2_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" " bne .Lnrm2_kernel_F10 \n" " b .Lnrm2_kernel_L999 \n" ".Lnrm2_kernel_S_BEGIN: \n" " lsl "INC_X", "INC_X", #"INC_SHIFT" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" " ble .Lnrm2_kernel_S1 \n" ".Lnrm2_kernel_S4: \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" " bne .Lnrm2_kernel_S4 \n" ".Lnrm2_kernel_S1: \n" " ands "J", "N", #3 \n" " ble .Lnrm2_kernel_L999 \n" ".Lnrm2_kernel_S10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" " bne .Lnrm2_kernel_S10 \n" ".Lnrm2_kernel_L999: \n" " "KERNEL_FINALIZE" \n" " str "SSQ", [%[RET_]] \n" : : [RET_] "r" (&ret), //%0 [N_] "r" (n), //%1 [X_] "r" (x), //%2 [INCX_] "r" (inc_x) //%3 : "cc", "memory", "x0", "x1", "x2", "x3", "x4", "x5", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" ); return ret; } #if defined(SMP) static int nrm2_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3, BLASLONG dummy4, FLOAT *result, BLASLONG dummy5) { *(double *)result = nrm2_compute(n, x, inc_x); return 0; } #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { #if defined(SMP) int nthreads; FLOAT dummy_alpha[2]; #endif FLOAT nrm2 = 0.0; if (n <= 0 || inc_x <= 0) return 0.0; #if defined(SMP) nthreads = num_cpu_avail(1); if (n <= 10000) nthreads = 1; if (nthreads == 1) { nrm2 = nrm2_compute(n, x, inc_x); } else { int mode, i; char result[MAX_CPU_NUMBER * sizeof(double) * 2]; double *ptr; #if !defined(COMPLEX) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_DOUBLE | BLAS_COMPLEX; #endif blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, ( void *)nrm2_thread_function, nthreads); ptr = (double *)result; for (i = 0; i < nthreads; i++) { nrm2 = nrm2 + (*ptr); ptr = (double *)(((char *)ptr) + sizeof(double) * 2); } } #else nrm2 = nrm2_compute(n, x, inc_x); #endif nrm2 = sqrt(nrm2); return nrm2; } OpenBLAS-0.2.20/kernel/arm64/gemv_n.S000066400000000000000000000162051313527062700167600ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
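Added reference sketch (not part of this file): the kernel below computes y := y + alpha * A * x for a column-major A by sweeping one column at a time, so each column contributes a simple AXPY into y; KERNEL_F16 and KERNEL_F4 are the vectorized forms of the inner loop and the S path handles a non-unit Y stride. In C, with illustrative names:

    static void gemv_n_sketch(long m, long n, double alpha,
                              const double *a, long lda,
                              const double *x, long inc_x,
                              double *y, long inc_y)
    {
        long i, j;
        for (j = 0; j < n; j++) {
            double temp = alpha * x[j * inc_x];   // TEMP = ALPHA * X[j]
            const double *col = a + j * lda;      // A_PTR walks this column
            for (i = 0; i < m; i++)
                y[i * inc_y] += temp * col[i];    // vectorized by KERNEL_F16 / KERNEL_F4
        }
    }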
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" #define M x0 /* Y vector length */ #define N x1 /* X vector length */ #define A x3 /* A vector address */ #define LDA x4 /* A stride */ #define X x5 /* X vector address */ #define INC_X x6 /* X stride */ #define Y x7 /* Y vector address */ #define INC_Y x2 /* Y stride */ #define A_PTR x9 /* loop A vector address */ #define Y_IPTR x10 /* loop Y vector address */ #define J x11 /* loop variable */ #define I x12 /* loop variable */ #define Y_OPTR x13 /* loop Y vector address */ /******************************************************************************* * Macro definitions *******************************************************************************/ #if !defined(DOUBLE) #define ALPHA s0 #define TEMP s1 #define TEMPV {v1.s}[0] #define TMP1 s2 #define TMPV1 {v2.s}[0] #define TMP2 s3 #define TMPV2 {v3.s}[0] #define SZ 4 #define SHZ 2 #else #define ALPHA d0 #define TEMP d1 #define TEMPV {v1.d}[0] #define TMP1 d2 #define TMPV1 {v2.d}[0] #define TMP2 d3 #define TMPV2 {v3.d}[0] #define SZ 8 #define SHZ 3 #endif #define A_PRE_SIZE 768 #define Y_PRE_SIZE 768 /******************************************************************************/ .macro SAVE_REGS add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] .endm .macro RESTORE_REGS ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) .endm .macro KERNEL_F16 #if !defined(DOUBLE) ld1 {v2.4s, v3.4s}, [A_PTR], #32 ld1 {v4.4s, v5.4s}, [Y_IPTR], #32 fmla v4.4s, v1.4s, v2.4s prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] fmla v5.4s, v1.4s, v3.4s st1 {v4.4s, v5.4s}, [Y_OPTR], #32 ld1 {v6.4s, v7.4s}, [A_PTR], #32 ld1 {v8.4s, v9.4s}, [Y_IPTR], #32 fmla v8.4s, v1.4s, v6.4s prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] fmla v9.4s, v1.4s, v7.4s st1 {v8.4s, v9.4s}, [Y_OPTR], #32 #else //DOUBLE ld1 {v2.2d, v3.2d}, [A_PTR], #32 ld1 {v4.2d, v5.2d}, [Y_IPTR], #32 fmla v4.2d, v1.2d, v2.2d prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] fmla v5.2d, v1.2d, v3.2d st1 {v4.2d, v5.2d}, [Y_OPTR], #32 ld1 {v6.2d, v7.2d}, [A_PTR], #32 ld1 {v8.2d, v9.2d}, [Y_IPTR], #32 fmla v8.2d, v1.2d, v6.2d prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] fmla v9.2d, v1.2d, v7.2d st1 {v8.2d, v9.2d}, [Y_OPTR], #32 ld1 {v10.2d, v11.2d}, [A_PTR], #32 ld1 {v12.2d, v13.2d}, [Y_IPTR], #32 fmla v12.2d, v1.2d, v10.2d prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] fmla v13.2d, v1.2d, v11.2d st1 {v12.2d, v13.2d}, [Y_OPTR], #32 ld1 {v14.2d, v15.2d}, 
[A_PTR], #32 ld1 {v16.2d, v17.2d}, [Y_IPTR], #32 fmla v16.2d, v1.2d, v14.2d prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] fmla v17.2d, v1.2d, v15.2d st1 {v16.2d, v17.2d}, [Y_OPTR], #32 #endif .endm .macro KERNEL_F4 #if !defined(DOUBLE) ld1 {v2.4s}, [A_PTR], #16 ld1 {v3.4s}, [Y_IPTR], #16 fmla v3.4s, v1.4s, v2.4s st1 {v3.4s}, [Y_OPTR], #16 #else ld1 {v2.2d}, [A_PTR], #16 ld1 {v3.2d}, [Y_IPTR], #16 fmla v3.2d, v1.2d, v2.2d st1 {v3.2d}, [Y_OPTR], #16 ld1 {v4.2d}, [A_PTR], #16 ld1 {v5.2d}, [Y_IPTR], #16 fmla v5.2d, v1.2d, v4.2d st1 {v5.2d}, [Y_OPTR], #16 #endif .endm .macro KERNEL_F1 ld1 TMPV1, [A_PTR], #SZ ld1 TMPV2, [Y_IPTR] fmadd TMP2, TEMP, TMP1, TMP2 st1 TMPV2, [Y_IPTR], #SZ .endm .macro INIT_S lsl INC_Y, INC_Y, #SHZ .endm .macro KERNEL_S1 ld1 TMPV1, [A_PTR], #SZ ld1 TMPV2, [Y_IPTR] fmadd TMP2, TEMP, TMP1, TMP2 st1 TMPV2, [Y_IPTR], INC_Y .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE ldr INC_Y, [sp] SAVE_REGS cmp N, xzr ble gemv_n_kernel_L999 cmp M, xzr ble gemv_n_kernel_L999 lsl LDA, LDA, #SHZ lsl INC_X, INC_X, #SHZ mov J, N cmp INC_Y, #1 bne gemv_n_kernel_S_BEGIN gemv_n_kernel_F_LOOP: ld1 TEMPV, [X], INC_X fmul TEMP, ALPHA, TEMP #if !defined(DOUBLE) ins v1.s[1], v1.s[0] ins v1.s[2], v1.s[0] ins v1.s[3], v1.s[0] #else ins v1.d[1], v1.d[0] #endif mov A_PTR, A mov Y_IPTR, Y mov Y_OPTR, Y gemv_n_kernel_F32: asr I, M, #5 cmp I, xzr beq gemv_n_kernel_F4 gemv_n_kernel_F320: KERNEL_F16 KERNEL_F16 subs I, I, #1 bne gemv_n_kernel_F320 gemv_n_kernel_F4: ands I, M, #31 asr I, I, #2 cmp I, xzr beq gemv_n_kernel_F1 gemv_n_kernel_F40: KERNEL_F4 subs I, I, #1 bne gemv_n_kernel_F40 gemv_n_kernel_F1: ands I, M, #3 ble gemv_n_kernel_F_END gemv_n_kernel_F10: KERNEL_F1 subs I, I, #1 bne gemv_n_kernel_F10 gemv_n_kernel_F_END: add A, A, LDA subs J, J, #1 bne gemv_n_kernel_F_LOOP b gemv_n_kernel_L999 gemv_n_kernel_S_BEGIN: INIT_S gemv_n_kernel_S_LOOP: ld1 TEMPV, [X], INC_X fmul TEMP, ALPHA, TEMP mov A_PTR, A mov Y_IPTR, Y asr I, M, #2 cmp I, xzr ble gemv_n_kernel_S1 gemv_n_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne gemv_n_kernel_S4 gemv_n_kernel_S1: ands I, M, #3 ble gemv_n_kernel_S_END gemv_n_kernel_S10: KERNEL_S1 subs I, I, #1 bne gemv_n_kernel_S10 gemv_n_kernel_S_END: add A, A, LDA subs J, J, #1 bne gemv_n_kernel_S_LOOP gemv_n_kernel_L999: mov w0, wzr RESTORE_REGS ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/gemv_t.S000066400000000000000000000201241313527062700167610ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
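Added reference sketch (not part of this file): the transposed kernel computes y := y + alpha * A^T * x, one dot product per column of A; KERNEL_F32 keeps four vector accumulators that are reduced by KERNEL_F32_FINALIZE / KERNEL_F4_FINALIZE, and the S path covers a non-unit X stride. A plain C rendering, with illustrative names:

    static void gemv_t_sketch(long m, long n, double alpha,
                              const double *a, long lda,
                              const double *x, long inc_x,
                              double *y, long inc_y)
    {
        long i, j;
        for (j = 0; j < n; j++) {
            double temp = 0.0;                    // the TEMP accumulator
            const double *col = a + j * lda;
            for (i = 0; i < m; i++)
                temp += col[i] * x[i * inc_x];    // KERNEL_F32 / KERNEL_F4 dot product
            y[j * inc_y] += alpha * temp;         // fmadd TMP1, ALPHA, TEMP, TMP1
        }
    }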
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" #define M x0 /* Y vector length */ #define N x1 /* X vector length */ #define A x3 /* A vector address */ #define LDA x4 /* A stride */ #define X x5 /* X vector address */ #define INC_X x6 /* X stride */ #define Y x7 /* Y vector address */ #define INC_Y x2 /* Y stride */ #define A_PTR x9 /* loop A vector address */ #define X_PTR x10 /* loop X vector address */ #define J x11 /* loop variable */ #define I x12 /* loop variable */ #define X_PREFETCH_SIZE 768 #define A_PREFETCH_SIZE 768 /******************************************************************************* * Macro definitions *******************************************************************************/ #if !defined(DOUBLE) #define REG0 wzr #define ALPHA s0 #define TEMP s1 #define TEMP1 s2 #define TEMP2 s3 #define TEMP3 s4 #define TEMPV {v1.s}[0] #define TMP1 s2 #define TMPV1 {v2.s}[0] #define TMP2 s3 #define TMPV2 {v3.s}[0] #define SZ 4 #define SHZ 2 #else #define REG0 xzr #define ALPHA d0 #define TEMP d1 #define TEMP1 d2 #define TEMP2 d3 #define TEMP3 d4 #define TEMPV {v1.d}[0] #define TMP1 d2 #define TMPV1 {v2.d}[0] #define TMP2 d3 #define TMPV2 {v3.d}[0] #define SZ 8 #define SHZ 3 #endif /******************************************************************************/ .macro SAVE_REGS add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] .endm .macro RESTORE_REGS ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) .endm .macro KERNEL_F32 #if !defined(DOUBLE) ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64 ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64 fmla v1.4s, v5.4s, v9.4s prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.4s, v6.4s, v10.4s prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.4s, v7.4s, v11.4s ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64 fmla v4.4s, v8.4s, v12.4s ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64 fmla v1.4s, v13.4s, v17.4s prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.4s, v14.4s, v18.4s prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.4s, v15.4s, v19.4s fmla v4.4s, v16.4s, v20.4s #else ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 ld1 {v9.2d, v10.2d, 
v11.2d, v12.2d}, [X_PTR], #64 fmla v1.2d, v5.2d, v9.2d prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.2d, v6.2d, v10.2d prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.2d, v7.2d, v11.2d fmla v4.2d, v8.2d, v12.2d ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 fmla v1.2d, v13.2d, v17.2d prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.2d, v14.2d, v18.2d prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.2d, v15.2d, v19.2d fmla v4.2d, v16.2d, v20.2d ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 fmla v1.2d, v5.2d, v9.2d prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.2d, v6.2d, v10.2d prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.2d, v7.2d, v11.2d fmla v4.2d, v8.2d, v12.2d ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 fmla v1.2d, v13.2d, v17.2d prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.2d, v14.2d, v18.2d prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.2d, v15.2d, v19.2d fmla v4.2d, v16.2d, v20.2d #endif .endm .macro KERNEL_F32_FINALIZE #if !defined(DOUBLE) fadd v1.4s, v1.4s, v2.4s fadd v1.4s, v1.4s, v3.4s fadd v1.4s, v1.4s, v4.4s #else fadd v1.2d, v1.2d, v2.2d fadd v1.2d, v1.2d, v3.2d fadd v1.2d, v1.2d, v4.2d #endif .endm .macro KERNEL_F4 #if !defined(DOUBLE) ld1 {v2.4s}, [A_PTR], #16 ld1 {v3.4s}, [X_PTR], #16 fmla v1.4s, v2.4s, v3.4s #else ld1 {v2.2d}, [A_PTR], #16 ld1 {v3.2d}, [X_PTR], #16 fmla v1.2d, v2.2d, v3.2d ld1 {v4.2d}, [A_PTR], #16 ld1 {v5.2d}, [X_PTR], #16 fmla v1.2d, v4.2d, v5.2d #endif .endm .macro KERNEL_F4_FINALIZE #if !defined(DOUBLE) ext v2.16b, v1.16b, v1.16b, #8 fadd v1.2s, v1.2s, v2.2s faddp TEMP, v1.2s #else faddp TEMP, v1.2d #endif .endm .macro KERNEL_F1 ld1 TMPV1, [A_PTR], #SZ ld1 TMPV2, [X_PTR], #SZ fmadd TEMP, TMP1, TMP2, TEMP .endm .macro INIT_S lsl INC_X, INC_X, #SHZ .endm .macro KERNEL_S1 ld1 TMPV1, [A_PTR], #SZ ld1 TMPV2, [X_PTR], INC_X fmadd TEMP, TMP1, TMP2, TEMP .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE ldr INC_Y, [sp] SAVE_REGS cmp N, xzr ble gemv_t_kernel_L999 cmp M, xzr ble gemv_t_kernel_L999 lsl LDA, LDA, #SHZ lsl INC_Y, INC_Y, #SHZ mov J, N cmp INC_X, #1 bne gemv_t_kernel_S_BEGIN gemv_t_kernel_F_LOOP: fmov TEMP, REG0 fmov TEMP1, REG0 fmov TEMP2, REG0 fmov TEMP3, REG0 mov A_PTR, A mov X_PTR, X gemv_t_kernel_F32: asr I, M, #5 cmp I, xzr beq gemv_t_kernel_F4 gemv_t_kernel_F320: KERNEL_F32 subs I, I, #1 bne gemv_t_kernel_F320 KERNEL_F32_FINALIZE gemv_t_kernel_F4: ands I, M, #31 asr I, I, #2 cmp I, xzr beq gemv_t_kernel_F1 gemv_t_kernel_F40: KERNEL_F4 subs I, I, #1 bne gemv_t_kernel_F40 gemv_t_kernel_F1: KERNEL_F4_FINALIZE ands I, M, #3 ble gemv_t_kernel_F_END gemv_t_kernel_F10: KERNEL_F1 subs I, I, #1 bne gemv_t_kernel_F10 gemv_t_kernel_F_END: ld1 TMPV1, [Y] add A, A, LDA subs J, J, #1 fmadd TMP1, ALPHA, TEMP, TMP1 st1 TMPV1, [Y], INC_Y bne gemv_t_kernel_F_LOOP b gemv_t_kernel_L999 gemv_t_kernel_S_BEGIN: INIT_S gemv_t_kernel_S_LOOP: fmov TEMP, REG0 mov A_PTR, A mov X_PTR, X asr I, M, #2 cmp I, xzr ble gemv_t_kernel_S1 gemv_t_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne gemv_t_kernel_S4 gemv_t_kernel_S1: ands I, M, #3 ble gemv_t_kernel_S_END gemv_t_kernel_S10: KERNEL_S1 subs I, I, #1 bne gemv_t_kernel_S10 gemv_t_kernel_S_END: ld1 TMPV1, [Y] add A, A, LDA subs J, J, #1 fmadd TMP1, ALPHA, TEMP, TMP1 
st1 TMPV1, [Y], INC_Y bne gemv_t_kernel_S_LOOP gemv_t_kernel_L999: RESTORE_REGS mov w0, wzr ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/iamax.S000066400000000000000000000140241313527062700166010ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
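Added reference sketch (not part of this file): i?amax returns the 1-based position of the first element with the largest absolute value (smallest when USE_MIN is defined). The vector path finds the maximum of an 8-element block with fmax/fmaxv, and KERNEL_F8_FINALIZE then rescans that block lane by lane to recover the exact index. The scalar semantics, with illustrative names and fabs from math.h:

    static long iamax_sketch(long n, const double *x, long inc_x)
    {
        long i, best = 1;
        double maxf;
        if (n <= 0 || inc_x <= 0) return 0;             // matches iamax_kernel_zero
        maxf = fabs(x[0]);
        for (i = 1; i < n; i++) {
            double v = fabs(x[i * inc_x]);
            if (v > maxf) { maxf = v; best = i + 1; }   // ties keep the earlier index
        }
        return best;
    }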
*******************************************************************************/ #define ASSEMBLER #include "common.h" #define N x0 /* vector length */ #define X x1 /* X vector address */ #define INC_X x2 /* X stride */ #define INDEX x3 /* index of max/min value */ #define Z x4 /* vector index */ #define I x5 /* loop variable */ /******************************************************************************* * Macro definitions *******************************************************************************/ #if defined(USE_MIN) #define COND le #else #define COND ge #endif #if !defined(DOUBLE) #define MAXF s0 #define TMPF s1 #define TMPVF {v1.s}[0] #define SZ 4 #else #define MAXF d0 #define TMPF d1 #define TMPVF {v1.d}[0] #define SZ 8 #endif /******************************************************************************/ .macro INIT_S #if !defined(DOUBLE) lsl INC_X, INC_X, #2 ld1 {v0.s}[0], [X], INC_X #else lsl INC_X, INC_X, #3 ld1 {v0.d}[0], [X], INC_X #endif mov Z, #1 mov INDEX, Z fabs MAXF, MAXF .endm .macro KERNEL_F8 #if !defined(DOUBLE) ldp q2, q3, [X], #32 fabs v2.4s, v2.4s fabs v3.4s, v3.4s fmax v2.4s, v2.4s, v3.4s fmaxv TMPF, v2.4s fcmp MAXF, TMPF fcsel MAXF, MAXF, TMPF, COND csel INDEX, INDEX, Z, COND add Z, Z, #8 #else ldp q2, q3, [X], #32 ldp q4, q5, [X], #32 fabs v2.2d, v2.2d fabs v3.2d, v3.2d fabs v4.2d, v4.2d fabs v5.2d, v5.2d fmax v2.2d, v2.2d, v3.2d fmax v4.2d, v4.2d, v5.2d fmax v2.2d, v2.2d, v4.2d fmaxp TMPF, v2.2d fcmp MAXF, TMPF fcsel MAXF, MAXF, TMPF, COND csel INDEX, INDEX, Z, COND add Z, Z, #8 #endif PRFM PLDL1KEEP, [X, #1024] .endm .macro KERNEL_F8_FINALIZE sub x6, INDEX, #1 #if !defined(DOUBLE) lsl x6, x6, #2 add x7, x7, x6 ldp q2, q3, [x7] fabs v2.4s, v2.4s fabs v3.4s, v3.4s ins v4.s[0], v3.s[0] ins v5.s[0], v3.s[1] ins v6.s[0], v3.s[2] ins v7.s[0], v3.s[3] add x6, INDEX, #7 fcmp MAXF, s7 csel INDEX, x6, INDEX, eq sub x6, x6, #1 fcmp MAXF, s6 csel INDEX, x6, INDEX, eq sub x6, x6, #1 fcmp MAXF, s5 csel INDEX, x6, INDEX, eq sub x6, x6, #1 fcmp MAXF, s4 csel INDEX, x6, INDEX, eq ins v4.s[0], v2.s[0] ins v5.s[0], v2.s[1] ins v6.s[0], v2.s[2] ins v7.s[0], v2.s[3] sub x6, x6, #1 fcmp MAXF, s7 csel INDEX, x6, INDEX, eq sub x6, x6, #1 fcmp MAXF, s6 csel INDEX, x6, INDEX, eq sub x6, x6, #1 fcmp MAXF, s5 csel INDEX, x6, INDEX, eq sub x6, x6, #1 fcmp MAXF, s4 csel INDEX, x6, INDEX, eq #else add x6, x6, #4 lsl x6, x6, #3 add x7, x7, x6 ldp q2, q3, [x7] fabs v2.2d, v2.2d fabs v3.2d, v3.2d ins v4.d[0], v2.d[0] ins v5.d[0], v2.d[1] ins v6.d[0], v3.d[0] ins v7.d[0], v3.d[1] add x6, INDEX, #7 fcmp MAXF, d7 csel INDEX, x6, INDEX, eq sub x6, x6, #1 fcmp MAXF, d6 csel INDEX, x6, INDEX, eq sub x6, x6, #1 fcmp MAXF, d5 csel INDEX, x6, INDEX, eq sub x6, x6, #1 fcmp MAXF, d4 csel INDEX, x6, INDEX, eq sub x7, x7, #32 ldp q2, q3, [x7] fabs v2.2d, v2.2d fabs v3.2d, v3.2d ins v4.d[0], v2.d[0] ins v5.d[0], v2.d[1] ins v6.d[0], v3.d[0] ins v7.d[0], v3.d[1] sub x6, x6, #1 fcmp MAXF, d7 csel INDEX, x6, INDEX, eq sub x6, x6, #1 fcmp MAXF, d6 csel INDEX, x6, INDEX, eq sub x6, x6, #1 fcmp MAXF, d5 csel INDEX, x6, INDEX, eq sub x6, x6, #1 fcmp MAXF, d4 csel INDEX, x6, INDEX, eq #endif .endm .macro KERNEL_S1 ld1 TMPVF, [X], INC_X add Z, Z, #1 fabs TMPF, TMPF fcmp MAXF, TMPF fcsel MAXF, MAXF, TMPF, COND csel INDEX, INDEX, Z, COND .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE cmp N, xzr ble iamax_kernel_zero cmp INC_X, xzr ble iamax_kernel_zero cmp 
INC_X, #1 bne iamax_kernel_S_BEGIN mov x7, X iamax_kernel_F_BEGIN: INIT_S subs N, N, #1 ble iamax_kernel_L999 asr I, N, #3 cmp I, xzr beq iamax_kernel_F1 add Z, Z, #1 iamax_kernel_F8: KERNEL_F8 subs I, I, #1 bne iamax_kernel_F8 KERNEL_F8_FINALIZE sub Z, Z, #1 iamax_kernel_F1: ands I, N, #7 ble iamax_kernel_L999 iamax_kernel_F10: KERNEL_S1 subs I, I, #1 bne iamax_kernel_F10 b iamax_kernel_L999 iamax_kernel_S_BEGIN: INIT_S subs N, N, #1 ble iamax_kernel_L999 asr I, N, #2 cmp I, xzr ble iamax_kernel_S1 iamax_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne iamax_kernel_S4 iamax_kernel_S1: ands I, N, #3 ble iamax_kernel_L999 iamax_kernel_S10: KERNEL_S1 subs I, I, #1 bne iamax_kernel_S10 iamax_kernel_L999: mov x0, INDEX ret iamax_kernel_zero: mov x0, xzr ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/iamax_thunderx2t99.c000066400000000000000000000246621313527062700212030ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
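Added note (sketch, not part of the original source): the unrolled loop below only records which block of N_KERNEL_SIZE elements held the running maximum; KERNEL_F_FINALIZE then walks that block element by element to find the exact position, and the SMP wrapper in CNAME rebases each thread's local index into the full vector. Ignoring the scalar tail and the strided path, and assuming n is a positive multiple of the block size, the idea is roughly:

    static long iamax_blocked_sketch(long n, const double *x, long block)
    {
        double maxf = 0.0;
        long i, j, best_block = 0;
        for (i = 0; i + block <= n; i += block) {         // the vectorized KERNEL_F loop
            double block_max = 0.0;
            for (j = 0; j < block; j++)
                if (fabs(x[i + j]) > block_max) block_max = fabs(x[i + j]);
            if (block_max > maxf) { maxf = block_max; best_block = i; }
        }
        for (j = 0; j < block; j++)                       // KERNEL_F_FINALIZE rescan
            if (fabs(x[best_block + j]) == maxf)
                return best_block + j + 1;                // 1-based result
        return 1;                                         // not reached for valid input
    }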
*****************************************************************************/ #include "common.h" #define N "x0" /* vector length */ #define X "x1" /* "X" vector address */ #define INC_X "x2" /* "X" stride */ #define INDEX "x3" /* index of max/min value */ #define Z "x4" /* vector index */ #define J "x5" /* loop variable */ #if !defined(DOUBLE) #define MAXF "s0" #define TMPF0 "s1" #define TMPF1 "s4" #define N_KERNEL_SIZE "64" #define SZ "4" #define N_DIV_SHIFT "6" #define N_REM_MASK "63" #define INC_SHIFT "2" #else #define MAXF "d0" #define TMPF0 "d1" #define TMPF1 "d4" #define N_KERNEL_SIZE "32" #define SZ "8" #define N_DIV_SHIFT "5" #define N_REM_MASK "31" #define INC_SHIFT "3" #endif /******************************************************************************/ #if !defined(DOUBLE) #define KERNEL_F \ "ldp q2, q3, ["X"] \n" \ "ldp q4, q5, ["X", #32] \n" \ "ldp q6, q7, ["X", #64] \n" \ "ldp q16, q17, ["X", #96] \n" \ "ldp q18, q19, ["X", #128] \n" \ "ldp q20, q21, ["X", #160] \n" \ "ldp q22, q23, ["X", #192] \n" \ "ldp q24, q25, ["X", #224] \n" \ "add "X", "X", #256 \n" \ "fabs v2.4s, v2.4s \n" \ "fabs v3.4s, v3.4s \n" \ "fabs v4.4s, v4.4s \n" \ "fabs v5.4s, v5.4s \n" \ "fabs v6.4s, v6.4s \n" \ "fabs v7.4s, v7.4s \n" \ "fabs v16.4s, v16.4s \n" \ "fabs v17.4s, v17.4s \n" \ "fabs v18.4s, v18.4s \n" \ "fabs v19.4s, v19.4s \n" \ "fabs v20.4s, v20.4s \n" \ "fabs v21.4s, v21.4s \n" \ "fabs v22.4s, v22.4s \n" \ "fabs v23.4s, v23.4s \n" \ "fabs v24.4s, v24.4s \n" \ "fabs v25.4s, v25.4s \n" \ "fmax v2.4s, v2.4s, v3.4s \n" \ "fmax v4.4s, v4.4s, v5.4s \n" \ "fmax v6.4s, v6.4s, v7.4s \n" \ "fmax v16.4s, v16.4s, v17.4s \n" \ "fmax v18.4s, v18.4s, v19.4s \n" \ "fmax v20.4s, v20.4s, v21.4s \n" \ "fmax v22.4s, v22.4s, v23.4s \n" \ "fmax v24.4s, v24.4s, v25.4s \n" \ "PRFM PLDL1KEEP, ["X", #1024] \n" \ "PRFM PLDL1KEEP, ["X", #1024+64] \n" \ "PRFM PLDL1KEEP, ["X", #1024+128] \n" \ "PRFM PLDL1KEEP, ["X", #1024+192] \n" \ "fmax v2.4s, v2.4s, v4.4s \n" \ "fmax v6.4s, v6.4s, v16.4s \n" \ "fmax v18.4s, v18.4s, v20.4s \n" \ "fmax v22.4s, v22.4s, v24.4s \n" \ "fmax v2.4s, v2.4s, v6.4s \n" \ "fmax v18.4s, v18.4s, v22.4s \n" \ "fmax v2.4s, v2.4s, v18.4s \n" \ "fmaxv "TMPF0", v2.4s \n" \ "fcmp "MAXF", "TMPF0" \n" \ "fcsel "MAXF", "MAXF", "TMPF0", ge \n" \ "csel "INDEX", "INDEX", "Z", ge \n" \ "add "Z", "Z", #"N_KERNEL_SIZE" \n" #else #define KERNEL_F \ "ldp q2, q3, ["X"] \n" \ "ldp q4, q5, ["X", #32] \n" \ "ldp q6, q7, ["X", #64] \n" \ "ldp q16, q17, ["X", #96] \n" \ "ldp q18, q19, ["X", #128] \n" \ "ldp q20, q21, ["X", #160] \n" \ "ldp q22, q23, ["X", #192] \n" \ "ldp q24, q25, ["X", #224] \n" \ "add "X", "X", #256 \n" \ "fabs v2.2d, v2.2d \n" \ "fabs v3.2d, v3.2d \n" \ "fabs v4.2d, v4.2d \n" \ "fabs v5.2d, v5.2d \n" \ "fabs v6.2d, v6.2d \n" \ "fabs v7.2d, v7.2d \n" \ "fabs v16.2d, v16.2d \n" \ "fabs v17.2d, v17.2d \n" \ "fabs v18.2d, v18.2d \n" \ "fabs v19.2d, v19.2d \n" \ "fabs v20.2d, v20.2d \n" \ "fabs v21.2d, v21.2d \n" \ "fabs v22.2d, v22.2d \n" \ "fabs v23.2d, v23.2d \n" \ "fabs v24.2d, v24.2d \n" \ "fabs v25.2d, v25.2d \n" \ "fmax v2.2d, v2.2d, v3.2d \n" \ "fmax v4.2d, v4.2d, v5.2d \n" \ "fmax v6.2d, v6.2d, v7.2d \n" \ "fmax v16.2d, v16.2d, v17.2d \n" \ "fmax v18.2d, v18.2d, v19.2d \n" \ "fmax v20.2d, v20.2d, v21.2d \n" \ "fmax v22.2d, v22.2d, v23.2d \n" \ "fmax v24.2d, v24.2d, v25.2d \n" \ "PRFM PLDL1KEEP, ["X", #1024] \n" \ "PRFM PLDL1KEEP, ["X", #1024+64] \n" \ "PRFM PLDL1KEEP, ["X", #1024+128] \n" \ "PRFM PLDL1KEEP, ["X", #1024+192] \n" \ "fmax v2.2d, v2.2d, v4.2d \n" \ "fmax v6.2d, v6.2d, v16.2d \n" 
\ "fmax v18.2d, v18.2d, v20.2d \n" \ "fmax v22.2d, v22.2d, v24.2d \n" \ "fmax v2.2d, v2.2d, v6.2d \n" \ "fmax v18.2d, v18.2d, v22.2d \n" \ "fmax v2.2d, v2.2d, v18.2d \n" \ "ins v3.d[0], v2.d[1] \n" \ "fmax "TMPF0", d3, d2 \n" \ "fcmp "MAXF", "TMPF0" \n" \ "fcsel "MAXF", "MAXF", "TMPF0", ge \n" \ "csel "INDEX", "INDEX", "Z", ge \n" \ "add "Z", "Z", #"N_KERNEL_SIZE" \n" #endif #define KERNEL_F_FINALIZE \ "sub x6, "INDEX", #1 \n" \ "lsl x6, x6, #"INC_SHIFT" \n" \ "add x7, x7, x6 \n" \ "mov x6, #0 \n" \ "1: \n" \ "add x6, x6, #1 \n" \ "cmp x6, #"N_KERNEL_SIZE" \n" \ "bge 2f \n" \ "ldr "TMPF1", [x7] \n" \ "fabs "TMPF1", "TMPF1" \n" \ "fcmp "MAXF", "TMPF1" \n" \ "add x7, x7, #"SZ" \n" \ "bne 1b \n" \ "2: \n" \ "sub x6, x6, #1 \n" \ "add "INDEX", "INDEX", x6 \n" #define INIT \ "lsl "INC_X", "INC_X", #"INC_SHIFT" \n" \ "ldr "MAXF", ["X"] \n" \ "add "X", "X", "INC_X" \n" \ "mov "Z", #1 \n" \ "mov "INDEX", "Z" \n" \ "fabs "MAXF", "MAXF" \n" #define KERNEL_S1 \ "ldr "TMPF0", ["X"] \n" \ "add "X", "X", "INC_X" \n" \ "add "Z", "Z", #1 \n" \ "fabs "TMPF0", "TMPF0" \n" \ "fcmp "MAXF", "TMPF0" \n" \ "fcsel "MAXF", "MAXF", "TMPF0", ge \n" \ "csel "INDEX", "INDEX", "Z", ge \n" #if defined(SMP) extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); #endif static BLASLONG iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG index = 0; if ( n < 0 ) return index; __asm__ __volatile__ ( " mov "N", %[N_] \n" " mov "X", %[X_] \n" " mov "INC_X", %[INCX_] \n" " cmp "N", xzr \n" " ble .Liamax_kernel_zero \n" " cmp "INC_X", xzr \n" " ble .Liamax_kernel_zero \n" " cmp "INC_X", #1 \n" " bne .Liamax_kernel_S_BEGIN \n" " mov x7, "X" \n" ".Liamax_kernel_F_BEGIN: \n" " "INIT" \n" " subs "N", "N", #1 \n" " ble .Liamax_kernel_L999 \n" " asr "J", "N", #"N_DIV_SHIFT" \n" " cmp "J", xzr \n" " beq .Liamax_kernel_F1 \n" " add "Z", "Z", #1 \n" ".Liamax_kernel_F: \n" " "KERNEL_F" \n" " subs "J", "J", #1 \n" " bne .Liamax_kernel_F \n" " "KERNEL_F_FINALIZE" \n" " sub "Z", "Z", #1 \n" ".Liamax_kernel_F1: \n" " ands "J", "N", #"N_REM_MASK" \n" " ble .Liamax_kernel_L999 \n" ".Liamax_kernel_F10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" " bne .Liamax_kernel_F10 \n" " b .Liamax_kernel_L999 \n" ".Liamax_kernel_S_BEGIN: \n" " "INIT" \n" " subs "N", "N", #1 \n" " ble .Liamax_kernel_L999 \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" " ble .Liamax_kernel_S1 \n" ".Liamax_kernel_S4: \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" " bne .Liamax_kernel_S4 \n" ".Liamax_kernel_S1: \n" " ands "J", "N", #3 \n" " ble .Liamax_kernel_L999 \n" ".Liamax_kernel_S10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" " bne .Liamax_kernel_S10 \n" ".Liamax_kernel_L999: \n" " mov x0, "INDEX" \n" " b .Liamax_kernel_DONE \n" ".Liamax_kernel_zero: \n" " mov x0, xzr \n" ".Liamax_kernel_DONE: \n" " mov %[INDEX_], "INDEX" \n" : [INDEX_] "=r" (index) //%0 : [N_] "r" (n), //%1 [X_] "r" (x), //%2 [INCX_] "r" (inc_x) //%3 : "cc", "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" ); return index; } #if defined(SMP) static int iamax_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) { *(BLASLONG *)result = iamax_compute(n, x, inc_x); return 0; } #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG 
inc_x) { #if defined(SMP) int nthreads; FLOAT dummy_alpha; #endif BLASLONG max_index = 0; #if defined(SMP) nthreads = num_cpu_avail(1); if (inc_x == 0) nthreads = 1; if (n <= 10000) nthreads = 1; if (nthreads == 1) { max_index = iamax_compute(n, x, inc_x); } else { BLASLONG i, width, cur_index; int num_cpu; int mode; char result[MAX_CPU_NUMBER * sizeof(double) * 2]; FLOAT max = -1.0; #if !defined(DOUBLE) mode = BLAS_SINGLE; #else mode = BLAS_DOUBLE; #endif blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, ( void *)iamax_thread_function, nthreads); num_cpu = 0; i = n; cur_index = 0; while (i > 0) { FLOAT elem; BLASLONG cur_max_index; cur_max_index = *(BLASLONG *)&result[num_cpu * sizeof(double) * 2]; elem = x[((cur_index + cur_max_index - 1) * inc_x)]; elem = fabs(elem); if (elem >= max) { max = elem; max_index = cur_index + cur_max_index; } width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); i -= width; cur_index += width; num_cpu ++; } } #else max_index = iamax_compute(n, x, inc_x); #endif return max_index; } OpenBLAS-0.2.20/kernel/arm64/izamax.S000066400000000000000000000157521313527062700170040ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
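Added reference sketch (not part of this file): for complex vectors, BLAS i?amax ranks elements by |Re| + |Im| rather than by the complex modulus, which is what the fabs and fadd/faddp pairs below compute before each comparison. Scalar semantics, with illustrative names and fabs from math.h (x holds interleaved real/imaginary parts):

    static long izamax_sketch(long n, const double *x, long inc_x)
    {
        long i, best = 1;
        double maxf;
        if (n <= 0 || inc_x <= 0) return 0;
        maxf = fabs(x[0]) + fabs(x[1]);
        for (i = 1; i < n; i++) {
            double v = fabs(x[2 * i * inc_x]) + fabs(x[2 * i * inc_x + 1]);
            if (v > maxf) { maxf = v; best = i + 1; }
        }
        return best;
    }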
*******************************************************************************/ #define ASSEMBLER #include "common.h" #define N x0 /* vector length */ #define X x1 /* X vector address */ #define INC_X x2 /* X stride */ #define INDEX x3 /* index of max/min value */ #define Z x4 /* vector index */ #define I x5 /* loop variable */ /******************************************************************************* * Macro definitions *******************************************************************************/ #if defined(USE_MIN) #define COND le #else #define COND ge #endif #if !defined(DOUBLE) #define MAXF s0 #define TMPF s1 #define TMPVF {v1.s}[0] #define SZ 4 #else #define MAXF d0 #define TMPF d1 #define TMPVF {v1.d}[0] #define SZ 8 #endif /******************************************************************************/ .macro INIT_S #if !defined(DOUBLE) lsl INC_X, INC_X, #3 ld1 {v0.2s}, [X], INC_X mov Z, #1 mov INDEX, Z fabs v0.2s, v0.2s ext v1.8b, v0.8b, v0.8b, #4 fadd MAXF, s0, s1 #else lsl INC_X, INC_X, #4 ld1 {v0.2d}, [X], INC_X mov Z, #1 mov INDEX, Z fabs v0.2d, v0.2d faddp MAXF, v0.2d #endif .endm .macro KERNEL_F8 #if !defined(DOUBLE) ldp q2, q3, [X], #32 ldp q4, q5, [X], #32 fabs v2.4s, v2.4s fabs v3.4s, v3.4s fabs v4.4s, v4.4s fabs v5.4s, v5.4s faddp v2.4s, v2.4s, v3.4s faddp v3.4s, v4.4s, v5.4s fmax v2.4s, v2.4s, v3.4s fmaxv TMPF, v2.4s fcmp MAXF, TMPF fcsel MAXF, MAXF, TMPF, COND csel INDEX, INDEX, Z, COND add Z, Z, #8 #else ldp q2, q3, [X], #32 ldp q4, q5, [X], #32 ldp q16, q17, [X], #32 ldp q18, q19, [X], #32 fabs v2.2d, v2.2d fabs v3.2d, v3.2d fabs v4.2d, v4.2d fabs v5.2d, v5.2d fabs v16.2d, v16.2d fabs v17.2d, v17.2d fabs v18.2d, v18.2d fabs v19.2d, v19.2d faddp v2.2d, v2.2d, v3.2d faddp v3.2d, v4.2d, v5.2d faddp v4.2d, v16.2d, v17.2d faddp v5.2d, v18.2d, v19.2d fmax v2.2d, v2.2d, v3.2d fmax v4.2d, v4.2d, v5.2d fmax v2.2d, v2.2d, v4.2d fmaxp TMPF, v2.2d fcmp MAXF, TMPF fcsel MAXF, MAXF, TMPF, COND csel INDEX, INDEX, Z, COND add Z, Z, #8 #endif PRFM PLDL1KEEP, [X, #1024] .endm .macro KERNEL_F8_FINALIZE sub x6, INDEX, #1 #if !defined(DOUBLE) lsl x6, x6, #3 add x7, x7, x6 ldp q2, q3, [x7] ldp q4, q5, [x7, #32] fabs v2.4s, v2.4s fabs v3.4s, v3.4s fabs v4.4s, v4.4s fabs v5.4s, v5.4s faddp v2.4s, v2.4s, v3.4s faddp v3.4s, v4.4s, v5.4s ins v4.s[0], v3.s[3] add x6, INDEX, #7 fcmp MAXF, s4 csel INDEX, x6, INDEX, eq ins v4.s[0], v3.s[2] sub x6, x6, #1 fcmp MAXF, s4 csel INDEX, x6, INDEX, eq ins v4.s[0], v3.s[1] sub x6, x6, #1 fcmp MAXF, s4 csel INDEX, x6, INDEX, eq ins v4.s[0], v3.s[0] sub x6, x6, #1 fcmp MAXF, s4 csel INDEX, x6, INDEX, eq ins v4.s[0], v2.s[3] sub x6, x6, #1 fcmp MAXF, s4 csel INDEX, x6, INDEX, eq ins v4.s[0], v2.s[2] sub x6, x6, #1 fcmp MAXF, s4 csel INDEX, x6, INDEX, eq ins v4.s[0], v2.s[1] sub x6, x6, #1 fcmp MAXF, s4 csel INDEX, x6, INDEX, eq ins v4.s[0], v2.s[0] sub x6, x6, #1 fcmp MAXF, s4 csel INDEX, x6, INDEX, eq #else lsl x6, x6, #4 add x7, x7, x6 ldp q2, q3, [x7] ldp q4, q5, [x7, #32] ldp q16, q17, [x7, #64] ldp q18, q19, [x7, #96] fabs v2.2d, v2.2d fabs v3.2d, v3.2d fabs v4.2d, v4.2d fabs v5.2d, v5.2d fabs v16.2d, v16.2d fabs v17.2d, v17.2d fabs v18.2d, v18.2d fabs v19.2d, v19.2d faddp v2.2d, v2.2d, v3.2d faddp v3.2d, v4.2d, v5.2d faddp v4.2d, v16.2d, v17.2d faddp v5.2d, v18.2d, v19.2d ins v7.d[0], v5.d[1] add x6, INDEX, #7 fcmp MAXF, d7 csel INDEX, x6, INDEX, eq ins v7.d[0], v5.d[0] sub x6, x6, #1 fcmp MAXF, d7 csel INDEX, x6, INDEX, eq ins v7.d[0], v4.d[1] sub x6, x6, #1 fcmp MAXF, d7 csel INDEX, x6, INDEX, eq ins v7.d[0], v4.d[0] sub x6, x6, #1 fcmp 
MAXF, d7 csel INDEX, x6, INDEX, eq ins v7.d[0], v3.d[1] sub x6, x6, #1 fcmp MAXF, d7 csel INDEX, x6, INDEX, eq ins v7.d[0], v3.d[0] sub x6, x6, #1 fcmp MAXF, d7 csel INDEX, x6, INDEX, eq ins v7.d[0], v2.d[1] sub x6, x6, #1 fcmp MAXF, d7 csel INDEX, x6, INDEX, eq ins v7.d[0], v2.d[0] sub x6, x6, #1 fcmp MAXF, d7 csel INDEX, x6, INDEX, eq #endif .endm .macro KERNEL_S1 #if !defined(DOUBLE) ld1 {v1.2s}, [X], INC_X add Z, Z, #1 fabs v1.2s, v1.2s ext v2.8b, v1.8b, v1.8b, #4 fadd TMPF, s1, s2 #else ld1 {v1.2d}, [X], INC_X add Z, Z, #1 fabs v1.2d, v1.2d faddp TMPF, v1.2d #endif fcmp MAXF, TMPF fcsel MAXF, MAXF, TMPF, COND csel INDEX, INDEX, Z, COND .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE cmp N, xzr ble iamax_kernel_zero cmp INC_X, xzr ble iamax_kernel_zero cmp INC_X, #1 bne iamax_kernel_S_BEGIN mov x7, X iamax_kernel_F_BEGIN: INIT_S subs N, N, #1 ble iamax_kernel_L999 asr I, N, #3 cmp I, xzr ble iamax_kernel_F1 add Z, Z, #1 iamax_kernel_F8: KERNEL_F8 subs I, I, #1 bne iamax_kernel_F8 KERNEL_F8_FINALIZE sub Z, Z, #1 iamax_kernel_F1: ands I, N, #7 ble iamax_kernel_L999 iamax_kernel_F10: KERNEL_S1 subs I, I, #1 bne iamax_kernel_F10 b iamax_kernel_L999 iamax_kernel_S_BEGIN: INIT_S subs N, N, #1 ble iamax_kernel_L999 asr I, N, #2 cmp I, xzr ble iamax_kernel_S1 iamax_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne iamax_kernel_S4 iamax_kernel_S1: ands I, N, #3 ble iamax_kernel_L999 iamax_kernel_S10: KERNEL_S1 subs I, I, #1 bne iamax_kernel_S10 iamax_kernel_L999: mov x0, INDEX ret iamax_kernel_zero: mov x0, xzr ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/izamax_thunderx2t99.c000066400000000000000000000256351313527062700213760ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
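Added note (sketch, not part of the original source): when run with threads, each worker computes a local 1-based index over its chunk, and CNAME below turns it into a global index by recomputing |Re| + |Im| at each candidate and adding the chunk's starting offset. A distilled view of that merge, where the arrays are illustrative stand-ins for the per-thread result buffer:

    static long izamax_merge_sketch(int nthreads, const long *local_idx,
                                    const double *local_val,
                                    const long *chunk_start)
    {
        long best_index = 0;
        double best = -1.0;
        int t;
        for (t = 0; t < nthreads; t++) {
            if (local_val[t] >= best) {               // as in CNAME, later chunks win ties
                best = local_val[t];
                best_index = chunk_start[t] + local_idx[t];
            }
        }
        return best_index;
    }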
*****************************************************************************/ #include "common.h" #define N "x0" /* vector length */ #define X "x1" /* "X" vector address */ #define INC_X "x2" /* "X" stride */ #define INDEX "x3" /* index of max/min value */ #define Z "x4" /* vector index */ #define J "x5" /* loop variable */ #if !defined(DOUBLE) #define MAXF "s0" #define TMPF0 "s1" #define TMPF0V "v1.2s" #define TMPF1 "d4" #define TMPF1V "v4.2s" #define N_KERNEL_SIZE "32" #define SZ "8" #define N_DIV_SHIFT "5" #define N_REM_MASK "31" #define INC_SHIFT "3" #else #define MAXF "d0" #define TMPF0 "d1" #define TMPF0V "v1.2d" #define TMPF1 "q4" #define TMPF1V "v4.2d" #define N_KERNEL_SIZE "16" #define SZ "16" #define N_DIV_SHIFT "4" #define N_REM_MASK "15" #define INC_SHIFT "4" #endif /******************************************************************************/ #if !defined(DOUBLE) #define KERNEL_F \ "ldp q2, q3, ["X"] \n" \ "ldp q4, q5, ["X", #32] \n" \ "ldp q6, q7, ["X", #64] \n" \ "ldp q16, q17, ["X", #96] \n" \ "ldp q18, q19, ["X", #128] \n" \ "ldp q20, q21, ["X", #160] \n" \ "ldp q22, q23, ["X", #192] \n" \ "ldp q24, q25, ["X", #224] \n" \ "add "X", "X", #256 \n" \ "fabs v2.4s, v2.4s \n" \ "fabs v3.4s, v3.4s \n" \ "fabs v4.4s, v4.4s \n" \ "fabs v5.4s, v5.4s \n" \ "fabs v6.4s, v6.4s \n" \ "fabs v7.4s, v7.4s \n" \ "fabs v16.4s, v16.4s \n" \ "fabs v17.4s, v17.4s \n" \ "fabs v18.4s, v18.4s \n" \ "fabs v19.4s, v19.4s \n" \ "fabs v20.4s, v20.4s \n" \ "fabs v21.4s, v21.4s \n" \ "fabs v22.4s, v22.4s \n" \ "fabs v23.4s, v23.4s \n" \ "fabs v24.4s, v24.4s \n" \ "fabs v25.4s, v25.4s \n" \ "faddp v2.4s, v2.4s, v3.4s \n" \ "faddp v4.4s, v4.4s, v5.4s \n" \ "faddp v6.4s, v6.4s, v7.4s \n" \ "faddp v16.4s, v16.4s, v17.4s \n" \ "faddp v18.4s, v18.4s, v19.4s \n" \ "faddp v20.4s, v20.4s, v21.4s \n" \ "faddp v22.4s, v22.4s, v23.4s \n" \ "faddp v24.4s, v24.4s, v25.4s \n" \ "fmax v2.4s, v2.4s, v4.4s \n" \ "fmax v6.4s, v6.4s, v16.4s \n" \ "fmax v18.4s, v18.4s, v20.4s \n" \ "fmax v22.4s, v22.4s, v24.4s \n" \ "PRFM PLDL1KEEP, ["X", #1024] \n" \ "PRFM PLDL1KEEP, ["X", #1024+64] \n" \ "PRFM PLDL1KEEP, ["X", #1024+128] \n" \ "PRFM PLDL1KEEP, ["X", #1024+192] \n" \ "fmax v2.4s, v2.4s, v6.4s \n" \ "fmax v18.4s, v18.4s, v22.4s \n" \ "fmax v2.4s, v2.4s, v18.4s \n" \ "fmaxv "TMPF0", v2.4s \n" \ "fcmp "MAXF", "TMPF0" \n" \ "fcsel "MAXF", "MAXF", "TMPF0", ge \n" \ "csel "INDEX", "INDEX", "Z", ge \n" \ "add "Z", "Z", #"N_KERNEL_SIZE" \n" #else #define KERNEL_F \ "ldp q2, q3, ["X"] \n" \ "ldp q4, q5, ["X", #32] \n" \ "ldp q6, q7, ["X", #64] \n" \ "ldp q16, q17, ["X", #96] \n" \ "ldp q18, q19, ["X", #128] \n" \ "ldp q20, q21, ["X", #160] \n" \ "ldp q22, q23, ["X", #192] \n" \ "ldp q24, q25, ["X", #224] \n" \ "add "X", "X", #256 \n" \ "fabs v2.2d, v2.2d \n" \ "fabs v3.2d, v3.2d \n" \ "fabs v4.2d, v4.2d \n" \ "fabs v5.2d, v5.2d \n" \ "fabs v6.2d, v6.2d \n" \ "fabs v7.2d, v7.2d \n" \ "fabs v16.2d, v16.2d \n" \ "fabs v17.2d, v17.2d \n" \ "fabs v18.2d, v18.2d \n" \ "fabs v19.2d, v19.2d \n" \ "fabs v20.2d, v20.2d \n" \ "fabs v21.2d, v21.2d \n" \ "fabs v22.2d, v22.2d \n" \ "fabs v23.2d, v23.2d \n" \ "fabs v24.2d, v24.2d \n" \ "fabs v25.2d, v25.2d \n" \ "faddp v2.2d, v2.2d, v3.2d \n" \ "faddp v4.2d, v4.2d, v5.2d \n" \ "faddp v6.2d, v6.2d, v7.2d \n" \ "faddp v16.2d, v16.2d, v17.2d \n" \ "faddp v18.2d, v18.2d, v19.2d \n" \ "faddp v20.2d, v20.2d, v21.2d \n" \ "faddp v22.2d, v22.2d, v23.2d \n" \ "faddp v24.2d, v24.2d, v25.2d \n" \ "fmax v2.2d, v2.2d, v4.2d \n" \ "fmax v6.2d, v6.2d, v16.2d \n" \ "fmax v18.2d, v18.2d, v20.2d \n" \ "fmax 
v22.2d, v22.2d, v24.2d \n" \ "PRFM PLDL1KEEP, ["X", #1024] \n" \ "PRFM PLDL1KEEP, ["X", #1024+64] \n" \ "PRFM PLDL1KEEP, ["X", #1024+128] \n" \ "PRFM PLDL1KEEP, ["X", #1024+192] \n" \ "fmax v2.2d, v2.2d, v6.2d \n" \ "fmax v18.2d, v18.2d, v22.2d \n" \ "fmax v2.2d, v2.2d, v18.2d \n" \ "ins v3.d[0], v2.d[1] \n" \ "fmax "TMPF0", d3, d2 \n" \ "fcmp "MAXF", "TMPF0" \n" \ "fcsel "MAXF", "MAXF", "TMPF0", ge \n" \ "csel "INDEX", "INDEX", "Z", ge \n" \ "add "Z", "Z", #"N_KERNEL_SIZE" \n" #endif #define KERNEL_F_FINALIZE \ "sub x6, "INDEX", #1 \n" \ "lsl x6, x6, #"INC_SHIFT" \n" \ "add x7, x7, x6 \n" \ "mov x6, #0 \n" \ "1: \n" \ "add x6, x6, #1 \n" \ "cmp x6, #"N_KERNEL_SIZE" \n" \ "bge 2f \n" \ "ldr "TMPF1", [x7] \n" \ "fabs "TMPF1V", "TMPF1V" \n" \ "faddp "TMPF0V", "TMPF1V", "TMPF1V" \n" \ "fcmp "MAXF", "TMPF0" \n" \ "add x7, x7, #"SZ" \n" \ "bne 1b \n" \ "2: \n" \ "sub x6, x6, #1 \n" \ "add "INDEX", "INDEX", x6 \n" #define INIT \ "lsl "INC_X", "INC_X", #"INC_SHIFT" \n" \ "ldr "TMPF1", ["X"] \n" \ "fabs "TMPF1V", "TMPF1V" \n" \ "faddp "TMPF0V", "TMPF1V", "TMPF1V" \n" \ "fmov "MAXF" , "TMPF0" \n" \ "add "X", "X", "INC_X" \n" \ "mov "Z", #1 \n" \ "mov "INDEX", "Z" \n" \ "fabs "MAXF", "MAXF" \n" #define KERNEL_S1 \ "ldr "TMPF1", ["X"] \n" \ "add "X", "X", "INC_X" \n" \ "add "Z", "Z", #1 \n" \ "fabs "TMPF1V", "TMPF1V" \n" \ "faddp "TMPF0V", "TMPF1V", "TMPF1V" \n" \ "fcmp "MAXF", "TMPF0" \n" \ "fcsel "MAXF", "MAXF", "TMPF0", ge \n" \ "csel "INDEX", "INDEX", "Z", ge \n" #if defined(SMP) extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); #endif static BLASLONG izamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG index = 0; if ( n < 0 ) return index; __asm__ __volatile__ ( " mov "N", %[N_] \n" " mov "X", %[X_] \n" " mov "INC_X", %[INCX_] \n" " cmp "N", xzr \n" " ble .Lizamax_kernel_zero \n" " cmp "INC_X", xzr \n" " ble .Lizamax_kernel_zero \n" " cmp "INC_X", #1 \n" " bne .Lizamax_kernel_S_BEGIN \n" " mov x7, "X" \n" ".Lizamax_kernel_F_BEGIN: \n" " "INIT" \n" " subs "N", "N", #1 \n" " ble .Lizamax_kernel_L999 \n" " asr "J", "N", #"N_DIV_SHIFT" \n" " cmp "J", xzr \n" " beq .Lizamax_kernel_F1 \n" " add "Z", "Z", #1 \n" ".Lizamax_kernel_F: \n" " "KERNEL_F" \n" " subs "J", "J", #1 \n" " bne .Lizamax_kernel_F \n" " "KERNEL_F_FINALIZE" \n" " sub "Z", "Z", #1 \n" ".Lizamax_kernel_F1: \n" " ands "J", "N", #"N_REM_MASK" \n" " ble .Lizamax_kernel_L999 \n" ".Lizamax_kernel_F10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" " bne .Lizamax_kernel_F10 \n" " b .Lizamax_kernel_L999 \n" ".Lizamax_kernel_S_BEGIN: \n" " "INIT" \n" " subs "N", "N", #1 \n" " ble .Lizamax_kernel_L999 \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" " ble .Lizamax_kernel_S1 \n" ".Lizamax_kernel_S4: \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" " bne .Lizamax_kernel_S4 \n" ".Lizamax_kernel_S1: \n" " ands "J", "N", #3 \n" " ble .Lizamax_kernel_L999 \n" ".Lizamax_kernel_S10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" " bne .Lizamax_kernel_S10 \n" ".Lizamax_kernel_L999: \n" " mov x0, "INDEX" \n" " b .Lizamax_kernel_DONE \n" ".Lizamax_kernel_zero: \n" " mov x0, xzr \n" ".Lizamax_kernel_DONE: \n" " mov %[INDEX_], "INDEX" \n" : [INDEX_] "=r" (index) //%0 : [N_] "r" (n), //%1 [X_] "r" (x), //%2 [INCX_] "r" (inc_x) //%3 : "cc", "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" ); 
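    /* Added note: "index" is the 1-based position of the element with the
       largest |Re| + |Im|, or 0 when n or inc_x is not positive.  When this
       routine runs as a thread worker (see CNAME below), the value is local
       to the thread's sub-vector and is rebased by the combining loop. */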
return index; } #if defined(SMP) static int izamax_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) { *(BLASLONG *)result = izamax_compute(n, x, inc_x); return 0; } #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { #if defined(SMP) int nthreads; FLOAT dummy_alpha[2]; #endif BLASLONG max_index = 0; #if defined(SMP) nthreads = num_cpu_avail(1); if (inc_x == 0) nthreads = 1; if (n <= 10000) nthreads = 1; if (nthreads == 1) { max_index = izamax_compute(n, x, inc_x); } else { BLASLONG i, width, cur_index; int num_cpu; int mode; char result[MAX_CPU_NUMBER * sizeof(double) * 2]; FLOAT max = -1.0; #if !defined(DOUBLE) mode = BLAS_SINGLE | BLAS_COMPLEX; #else mode = BLAS_DOUBLE | BLAS_COMPLEX; #endif blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, ( void *)izamax_thread_function, nthreads); num_cpu = 0; i = n; cur_index = 0; while (i > 0) { FLOAT elem_r, elem_i; BLASLONG cur_max_index; cur_max_index = *(BLASLONG *)&result[num_cpu * sizeof(double) * 2]; elem_r = x[((cur_index + cur_max_index - 1) * inc_x * 2) + 0]; elem_i = x[((cur_index + cur_max_index - 1) * inc_x * 2) + 1]; elem_r = fabs(elem_r) + fabs(elem_i); if (elem_r >= max) { max = elem_r; max_index = cur_index + cur_max_index; } width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); i -= width; cur_index += width; num_cpu ++; } } #else max_index = izamax_compute(n, x, inc_x); #endif return max_index; } OpenBLAS-0.2.20/kernel/arm64/nrm2.S000066400000000000000000000110211313527062700163520ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #define ASSEMBLER #include "common.h" #define N x0 #define X x1 #define INC_X x2 #define I x3 #if !defined(DOUBLE) #define SSQ s0 #define SCALE s1 #define REGZERO s5 #define REGONE s6 #else #define SSQ d0 #define SCALE d1 #define REGZERO d5 #define REGONE d6 #endif /******************************************************************************* * Macro definitions *******************************************************************************/ .macro KERNEL_F1 #if !defined(DOUBLE) ldr s4, [X], #4 fcmp s4, REGZERO beq KERNEL_F1_NEXT_\@ fabs s4, s4 fcmp SCALE, s4 bge KERNEL_F1_SCALE_GE_X_\@ fdiv s2, SCALE, s4 fmul s2, s2, s2 fmul s3, SSQ, s2 fadd SSQ, REGONE, s3 fmov SCALE, s4 b KERNEL_F1_NEXT_\@ KERNEL_F1_SCALE_GE_X_\@: fdiv s2, s4, SCALE fmla SSQ, s2, v2.s[0] #else ldr d4, [X], #8 fcmp d4, REGZERO beq KERNEL_F1_NEXT_\@ fabs d4, d4 fcmp SCALE, d4 bge KERNEL_F1_SCALE_GE_X_\@ fdiv d2, SCALE, d4 fmul d2, d2, d2 fmul d3, SSQ, d2 fadd SSQ, REGONE, d3 fmov SCALE, d4 b KERNEL_F1_NEXT_\@ KERNEL_F1_SCALE_GE_X_\@: fdiv d2, d4, SCALE fmla SSQ, d2, v2.d[0] #endif KERNEL_F1_NEXT_\@: .endm .macro KERNEL_S1 #if !defined(DOUBLE) ldr s4, [X] fcmp s4, REGZERO beq KERNEL_S1_NEXT fabs s4, s4 fcmp SCALE, s4 bge KERNEL_S1_SCALE_GE_X fdiv s2, SCALE, s4 fmul s2, s2, s2 fmul s3, SSQ, s2 fadd SSQ, REGONE, s3 fmov SCALE, s4 b KERNEL_S1_NEXT KERNEL_S1_SCALE_GE_X: fdiv s2, s4, SCALE fmla SSQ, s2, v2.s[0] #else ldr d4, [X] fcmp d4, REGZERO beq KERNEL_S1_NEXT fabs d4, d4 fcmp SCALE, d4 bge KERNEL_S1_SCALE_GE_X fdiv d2, SCALE, d4 fmul d2, d2, d2 fmul d3, SSQ, d2 fadd SSQ, REGONE, d3 fmov SCALE, d4 b KERNEL_S1_NEXT KERNEL_S1_SCALE_GE_X: fdiv d2, d4, SCALE fmla SSQ, d2, v2.d[0] #endif KERNEL_S1_NEXT: add X, X, INC_X .endm .macro KERNEL_F8 KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 .endm .macro INIT_S #if !defined(DOUBLE) lsl INC_X, INC_X, #2 // INC_X * SIZE #else lsl INC_X, INC_X, #3 // INC_X * SIZE #endif .endm .macro INIT eor v1.16b, v1.16b, v1.16b // scale=0.0 fmov SSQ, #1.0 fmov REGONE, SSQ fmov REGZERO, SCALE .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE .align 5 INIT cmp N, #0 ble nrm2_kernel_L999 cmp INC_X, #0 beq nrm2_kernel_L999 cmp INC_X, #1 bne nrm2_kernel_S_BEGIN nrm2_kernel_F_BEGIN: asr I, N, #3 // I = N / 8 cmp I, xzr ble nrm2_kernel_F1 nrm2_kernel_F8: KERNEL_F8 subs I, I, #1 bne nrm2_kernel_F8 nrm2_kernel_F1: ands I, N, #7 ble nrm2_kernel_L999 nrm2_kernel_F10: KERNEL_F1 subs I, I, #1 bne nrm2_kernel_F10 b nrm2_kernel_L999 nrm2_kernel_S_BEGIN: INIT_S mov I, N .align 5 nrm2_kernel_S10: KERNEL_S1 subs I, I, #1 bne nrm2_kernel_S10 nrm2_kernel_L999: fsqrt SSQ, SSQ fmul SSQ, SCALE, SSQ ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/rot.S000066400000000000000000000135141313527062700163110ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" #define N x0 /* vector length */ #define X x1 /* X vector address */ #define INC_X x2 /* X stride */ #define Y x3 /* Y vector address */ #define INC_Y x4 /* Y stride */ #define I x5 /* loop variable */ /******************************************************************************* * Macro definitions *******************************************************************************/ #if !defined(DOUBLE) #define C s0 /* scale input value */ #define S s1 /* scale input value */ #else #define C d0 /* scale input value */ #define S d1 /* scale input value */ #endif /******************************************************************************/ .macro INIT #if !defined(DOUBLE) ins v0.s[1], v0.s[0] // [C, C] #else ins v0.d[1], v0.d[0] // [C, C] #endif .endm .macro INIT_F1 #if !defined(DOUBLE) eor v2.16b, v2.16b, v2.16b fsub s2, s2, S ins v1.s[1], v2.s[0] // [-S, S] #else eor v2.16b, v2.16b, v2.16b fsub d2, d2, S ins v1.d[1], v2.d[0] // [-S, S] #endif .endm .macro KERNEL_F1 #if !defined(DOUBLE) ld1 {v2.s}[0], [X] ld1 {v2.s}[1], [Y] // [Y, X] ext v3.8b, v2.8b, v2.8b, #4 // [X, Y] fmul v4.2s, v2.2s, v0.2s // [C*Y, C*X] fmla v4.2s, v3.2s, v1.2s // [C*Y - S*X, C*X + S*Y] st1 {v4.s}[0], [X], #4 st1 {v4.s}[1], [Y], #4 #else ld1 {v2.d}[0], [X] ld1 {v2.d}[1], [Y] // [Y, X] ext v3.16b, v2.16b, v2.16b, #8 // [X, Y] fmul v4.2d, v2.2d, v0.2d // [C*Y, C*X] fmla v4.2d, v3.2d, v1.2d // [C*Y - S*X, C*X + S*Y] st1 {v4.d}[0], [X], #8 st1 {v4.d}[1], [Y], #8 #endif .endm .macro KERNEL_INIT_F4 #if !defined(DOUBLE) ins v0.d[1], v0.d[0] // [C, C, C, C] ins v1.s[1], v1.s[0] ins v1.d[1], v1.d[0] // [S, S, S, S] #else ins v1.d[1], v1.d[0] // [S, S] #endif .endm .macro KERNEL_F4 #if !defined(DOUBLE) ld1 {v2.4s}, [X] fmul v4.4s, v0.4s, v2.4s // C*X3, C*X2, C*X1, C*X0 ld1 {v3.4s}, [Y] fmla v4.4s, v1.4s, v3.4s // C*X3+S*Y3, ..., C*X0+S*Y0 st1 {v4.4s}, [X], #16 fmul v5.4s, v0.4s, v3.4s // C*Y3, C*Y2, C*Y1, C*Y0 fmls v5.4s, v1.4s, v2.4s // C*Y3-S*X3, ..., C*Y0-S*X0 st1 {v5.4s}, [Y], #16 #else // DOUBLE ld1 {v2.2d, v3.2d}, [X] fmul v6.2d, v0.2d, v2.2d // C*X1, C*X0 fmul v7.2d, v0.2d, v3.2d // C*X3, C*X2 ld1 {v4.2d, v5.2d}, [Y] fmla v6.2d, v1.2d, v4.2d // C*X1+S*Y1, C*X0+S*Y0 fmla v7.2d, v1.2d, v5.2d // C*X3+S*Y3, C*X2+S*Y2 st1 {v6.2d, v7.2d}, [X], #32 fmul v16.2d, v0.2d, v4.2d // C*Y1, 
C*Y0 fmul v17.2d, v0.2d, v5.2d // C*Y3, C*Y2 fmls v16.2d, v1.2d, v2.2d // C*Y1-S*X1, C*Y0-S*X0 fmls v17.2d, v1.2d, v3.2d // C*Y3-S*X3, C*Y2-S*X2 st1 {v16.2d, v17.2d}, [Y], #32 PRFM PLDL1KEEP, [X, #512] PRFM PLDL1KEEP, [Y, #512] #endif .endm .macro INIT_S #if !defined(DOUBLE) lsl INC_X, INC_X, #2 lsl INC_Y, INC_Y, #2 #else lsl INC_X, INC_X, #3 lsl INC_Y, INC_Y, #3 #endif .endm .macro KERNEL_S1 #if !defined(DOUBLE) ld1 {v2.s}[0], [X] ld1 {v2.s}[1], [Y] // [Y, X] ext v3.8b, v2.8b, v2.8b, #4 // [X, Y] fmul v4.2s, v2.2s, v0.2s // [C*Y, C*X] fmla v4.2s, v3.2s, v1.2s // [C*Y - S*X, C*X + S*Y] st1 {v4.s}[0], [X], INC_X st1 {v4.s}[1], [Y], INC_Y #else ld1 {v2.d}[0], [X] ld1 {v2.d}[1], [Y] // [Y, X] ext v3.16b, v2.16b, v2.16b, #8 // [X, Y] fmul v4.2d, v2.2d, v0.2d // [C*Y, C*X] fmla v4.2d, v3.2d, v1.2d // [C*Y - S*X, C*X + S*Y] st1 {v4.d}[0], [X], INC_X st1 {v4.d}[1], [Y], INC_Y #endif .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE cmp N, xzr ble rot_kernel_L999 INIT cmp INC_X, #1 bne rot_kernel_S_BEGIN cmp INC_Y, #1 bne rot_kernel_S_BEGIN rot_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr beq rot_kernel_F1 KERNEL_INIT_F4 rot_kernel_F4: KERNEL_F4 subs I, I, #1 bne rot_kernel_F4 rot_kernel_F1: ands I, N, #3 ble rot_kernel_L999 INIT_F1 rot_kernel_F10: KERNEL_F1 subs I, I, #1 bne rot_kernel_F10 mov w0, wzr ret rot_kernel_S_BEGIN: INIT_S INIT_F1 asr I, N, #2 cmp I, xzr ble rot_kernel_S1 rot_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne rot_kernel_S4 rot_kernel_S1: ands I, N, #3 ble rot_kernel_L999 rot_kernel_S10: KERNEL_S1 subs I, I, #1 bne rot_kernel_S10 rot_kernel_L999: mov w0, wzr ret OpenBLAS-0.2.20/kernel/arm64/sasum_thunderx2t99.c000066400000000000000000000164171313527062700212330ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" #include #define N "x0" /* vector length */ #define X "x1" /* "X" vector address */ #define INC_X "x2" /* "X" stride */ #define J "x5" /* loop variable */ #define REG0 "wzr" #define SUMF "s0" #define SUMFD "d0" /******************************************************************************/ #define KERNEL_F1 \ "ldr s1, ["X"] \n" \ "add "X", "X", #4 \n" \ "fabs s1, s1 \n" \ "fadd "SUMF", "SUMF", s1 \n" #define KERNEL_F64 \ "ldr q16, ["X"] \n" \ "ldr q17, ["X", #16] \n" \ "ldr q18, ["X", #32] \n" \ "ldr q19, ["X", #48] \n" \ "ldp q20, q21, ["X", #64] \n" \ "ldp q22, q23, ["X", #96] \n" \ "fabs v16.4s, v16.4s \n" \ "fabs v17.4s, v17.4s \n" \ "fabs v18.4s, v18.4s \n" \ "fabs v19.4s, v19.4s \n" \ "ldp q24, q25, ["X", #128] \n" \ "ldp q26, q27, ["X", #160] \n" \ "fabs v20.4s, v20.4s \n" \ "fabs v21.4s, v21.4s \n" \ "fabs v22.4s, v22.4s \n" \ "fabs v23.4s, v23.4s \n" \ "fadd v16.4s, v16.4s, v17.4s \n" \ "fadd v18.4s, v18.4s, v19.4s \n" \ "ldp q28, q29, ["X", #192] \n" \ "ldp q30, q31, ["X", #224] \n" \ "fabs v24.4s, v24.4s \n" \ "fabs v25.4s, v25.4s \n" \ "fabs v26.4s, v26.4s \n" \ "fabs v27.4s, v27.4s \n" \ "add "X", "X", #256 \n" \ "fadd v20.4s, v20.4s, v21.4s \n" \ "fadd v22.4s, v22.4s, v23.4s \n" \ "fabs v28.4s, v28.4s \n" \ "fabs v29.4s, v29.4s \n" \ "fabs v30.4s, v30.4s \n" \ "fabs v31.4s, v31.4s \n" \ "PRFM PLDL1KEEP, ["X", #1024] \n" \ "PRFM PLDL1KEEP, ["X", #1024+64] \n" \ "fadd v24.4s, v24.4s, v25.4s \n" \ "fadd v26.4s, v26.4s, v27.4s \n" \ "fadd v0.4s, v0.4s, v16.4s \n" \ "fadd v1.4s, v1.4s, v18.4s \n" \ "fadd v2.4s, v2.4s, v20.4s \n" \ "fadd v3.4s, v3.4s, v22.4s \n" \ "PRFM PLDL1KEEP, ["X", #1024+128] \n" \ "PRFM PLDL1KEEP, ["X", #1024+192] \n" \ "fadd v28.4s, v28.4s, v29.4s \n" \ "fadd v30.4s, v30.4s, v31.4s \n" \ "fadd v4.4s, v4.4s, v24.4s \n" \ "fadd v5.4s, v5.4s, v26.4s \n" \ "fadd v6.4s, v6.4s, v28.4s \n" \ "fadd v7.4s, v7.4s, v30.4s \n" #define KERNEL_F64_FINALIZE \ "fadd v0.4s, v0.4s, v1.4s \n" \ "fadd v2.4s, v2.4s, v3.4s \n" \ "fadd v4.4s, v4.4s, v5.4s \n" \ "fadd v6.4s, v6.4s, v7.4s \n" \ "fadd v0.4s, v0.4s, v2.4s \n" \ "fadd v4.4s, v4.4s, v6.4s \n" \ "fadd v0.4s, v0.4s, v4.4s \n" \ "ext v1.16b, v0.16b, v0.16b, #8 \n" \ "fadd v0.2s, v0.2s, v1.2s \n" \ "faddp "SUMF", v0.2s \n" #define INIT_S \ "lsl "INC_X", "INC_X", #2 \n" #define KERNEL_S1 \ "ldr s1, ["X"] \n" \ "add "X", "X", "INC_X" \n" \ "fabs s1, s1 \n" \ "fadd "SUMF", "SUMF", s1 \n" #if defined(SMP) extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); #endif static FLOAT sasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT asum = 0.0 ; if ( n < 0 ) return(asum); __asm__ __volatile__ ( " mov "N", %[N_] \n" " mov "X", %[X_] \n" " mov "INC_X", %[INCX_] \n" " fmov "SUMF", "REG0" \n" " fmov s1, "REG0" \n" " fmov s2, "REG0" \n" " fmov s3, "REG0" \n" " fmov s4, "REG0" \n" " fmov s5, "REG0" \n" " fmov s6, "REG0" \n" " fmov s7, "REG0" \n" " cmp "N", xzr \n" " ble .Lasum_kernel_L999 \n" " cmp "INC_X", xzr \n" " ble .Lasum_kernel_L999 \n" " cmp "INC_X", #1 \n" " bne .Lasum_kernel_S_BEGIN \n" ".Lasum_kernel_F_BEGIN: \n" " asr "J", "N", #6 \n" " cmp "J", xzr \n" " beq .Lasum_kernel_F1 \n" ".align 5 \n" ".Lasum_kernel_F64: \n" " "KERNEL_F64" \n" " subs "J", "J", #1 \n" " bne .Lasum_kernel_F64 \n" " "KERNEL_F64_FINALIZE" \n" ".Lasum_kernel_F1: \n" " ands "J", "N", #63 \n" " ble 
.Lasum_kernel_L999 \n" ".Lasum_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" " bne .Lasum_kernel_F10 \n" " b .Lasum_kernel_L999 \n" ".Lasum_kernel_S_BEGIN: \n" " "INIT_S" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" " ble .Lasum_kernel_S1 \n" ".Lasum_kernel_S4: \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" " bne .Lasum_kernel_S4 \n" ".Lasum_kernel_S1: \n" " ands "J", "N", #3 \n" " ble .Lasum_kernel_L999 \n" ".Lasum_kernel_S10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" " bne .Lasum_kernel_S10 \n" ".Lasum_kernel_L999: \n" " fmov %[ASUM_], "SUMFD" \n" : [ASUM_] "=r" (asum) //%0 : [N_] "r" (n), //%1 [X_] "r" (x), //%2 [INCX_] "r" (inc_x) //%3 : "cc", "memory", "x0", "x1", "x2", "x3", "x4", "x5", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" ); return asum; } #if defined(SMP) static int sasum_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) { *result = sasum_compute(n, x, inc_x); return 0; } #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { #if defined(SMP) int nthreads; FLOAT dummy_alpha; #endif FLOAT asum = 0.0; #if defined(SMP) nthreads = num_cpu_avail(1); if (inc_x == 0) nthreads = 1; if (n <= 10000) nthreads = 1; if (nthreads == 1) { asum = sasum_compute(n, x, inc_x); } else { int mode, i; char result[MAX_CPU_NUMBER * sizeof(double) * 2]; FLOAT *ptr; mode = BLAS_SINGLE; blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, ( void *)sasum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { asum = asum + (*ptr); ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2); } } #else asum = sasum_compute(n, x, inc_x); #endif return asum; } OpenBLAS-0.2.20/kernel/arm64/scal.S000066400000000000000000000117511313527062700164300ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #define ASSEMBLER #include "common.h" #define N x0 /* vector length */ #define X x3 /* X vector address */ #define X_COPY x5 /* X vector address */ #define INC_X x4 /* X stride */ #define I x1 /* loop variable */ /******************************************************************************* * Macro definitions *******************************************************************************/ #if !defined(DOUBLE) #define DA s0 /* scale input value */ #define DAV {v0.s}[0] #define TMPF s1 #define TMPVF {v1.s}[0] #define SZ 4 #else #define DA d0 /* scale input value */ #define DAV {v0.d}[0] #define TMPF d1 #define TMPVF {v1.d}[0] #define SZ 8 #endif /******************************************************************************/ .macro KERNEL_F1 ldr TMPF, [X] fmul TMPF, TMPF, DA str TMPF, [X], #SZ .endm .macro KERNEL_INIT_F8 #if !defined(DOUBLE) ins v0.s[1], v0.s[0] ins v0.s[2], v0.s[0] ins v0.s[3], v0.s[0] #else ins v0.d[1], v0.d[0] #endif .endm .macro KERNEL_F8 #if !defined(DOUBLE) ld1 {v1.4s, v2.4s}, [X] fmul v1.4s, v1.4s, v0.4s fmul v2.4s, v2.4s, v0.4s st1 {v1.4s, v2.4s}, [X], #32 #else // DOUBLE ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X] fmul v1.2d, v1.2d, v0.2d fmul v2.2d, v2.2d, v0.2d fmul v3.2d, v3.2d, v0.2d fmul v4.2d, v4.2d, v0.2d st1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 #endif PRFM PLDL1KEEP, [X, #1024] .endm .macro INIT_S #if !defined(DOUBLE) lsl INC_X, INC_X, #2 #else lsl INC_X, INC_X, #3 #endif .endm .macro KERNEL_S1 ldr TMPF, [X] fmul TMPF, TMPF, DA st1 TMPVF, [X], INC_X .endm .macro KERNEL_S4 #if !defined(DOUBLE) ldr s1, [X] add X, X, INC_X fmul s1, s1, s0 str s1, [X_COPY] add X_COPY, X_COPY, INC_X ldr s2, [X] add X, X, INC_X fmul s2, s2, s0 str s2, [X_COPY] add X_COPY, X_COPY, INC_X ldr s3, [X] add X, X, INC_X fmul s3, s3, s0 str s3, [X_COPY] add X_COPY, X_COPY, INC_X ldr s4, [X] add X, X, INC_X fmul s4, s4, s0 str s4, [X_COPY] add X_COPY, X_COPY, INC_X #else ldr d1, [X] add X, X, INC_X fmul d1, d1, d0 str d1, [X_COPY] add X_COPY, X_COPY, INC_X ldr d2, [X] add X, X, INC_X fmul d2, d2, d0 str d2, [X_COPY] add X_COPY, X_COPY, INC_X ldr d3, [X] add X, X, INC_X fmul d3, d3, d0 str d3, [X_COPY] add X_COPY, X_COPY, INC_X ldr d4, [X] add X, X, INC_X fmul d4, d4, d0 str d4, [X_COPY] add X_COPY, X_COPY, INC_X #endif .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE cmp N, xzr ble scal_kernel_L999 fcmp DA, #0.0 beq scal_kernel_zero cmp INC_X, #1 bne scal_kernel_S_BEGIN scal_kernel_F_BEGIN: asr I, N, #3 cmp I, xzr beq scal_kernel_F1 KERNEL_INIT_F8 scal_kernel_F8: KERNEL_F8 subs I, I, #1 bne scal_kernel_F8 scal_kernel_F1: ands I, N, #7 ble scal_kernel_L999 scal_kernel_F10: KERNEL_F1 subs I, I, #1 bne scal_kernel_F10 mov w0, wzr ret scal_kernel_S_BEGIN: INIT_S mov X_COPY, X asr I, N, #2 cmp I, xzr ble scal_kernel_S1 scal_kernel_S4: KERNEL_S4 subs I, I, #1 bne scal_kernel_S4 scal_kernel_S1: ands I, N, #3 ble scal_kernel_L999 scal_kernel_S10: KERNEL_S1 subs I, I, #1 bne scal_kernel_S10 scal_kernel_L999: mov w0, wzr ret scal_kernel_zero: INIT_S scal_kernel_Z1: st1 DAV, [X], INC_X subs N, N, #1 bne scal_kernel_Z1 mov w0, wzr ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/scnrm2_thunderx2t99.c000066400000000000000000000232261313527062700213030ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2017, The 
OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include #if defined(SMP) extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); #endif #if !defined(COMPLEX) #define N "x0" /* vector length */ #define X "x1" /* X vector address */ #define INC_X "x2" /* X stride */ #define J "x5" /* loop variable */ #define TMPF "s16" #define TMPFD "d17" #define SSQD "d0" #define N_DIV_SHIFT "6" #define N_REM_MASK "63" #define INC_SHIFT "2" #define KERNEL_F1 \ "ldr "TMPF", ["X"], #4 \n" \ "fcvt "TMPFD", "TMPF" \n" \ "fmadd "SSQD", "TMPFD", "TMPFD", "SSQD"\n" #define KERNEL_F \ KERNEL_F32 \ KERNEL_F32 #define KERNEL_F32 \ "ldur q16, ["X"] \n" \ "ldur q18, ["X", #16] \n" \ "ldur q20, ["X", #32] \n" \ "ldur q22, ["X", #48] \n" \ "ldur q24, ["X", #64] \n" \ "ldur q26, ["X", #80] \n" \ "ldur q28, ["X", #96] \n" \ "ldur q30, ["X", #112] \n" \ "add "X", "X", #128 \n" \ "fcvtl2 v17.2d, v16.4s \n" \ "fcvtl v16.2d, v16.2s \n" \ "fcvtl2 v19.2d, v18.4s \n" \ "fcvtl v18.2d, v18.2s \n" \ "fcvtl2 v21.2d, v20.4s \n" \ "fcvtl v20.2d, v20.2s \n" \ "fcvtl2 v23.2d, v22.4s \n" \ "fcvtl v22.2d, v22.2s \n" \ "fcvtl2 v25.2d, v24.4s \n" \ "fcvtl v24.2d, v24.2s \n" \ "fcvtl2 v27.2d, v26.4s \n" \ "fcvtl v26.2d, v26.2s \n" \ "fcvtl2 v29.2d, v28.4s \n" \ "fcvtl v28.2d, v28.2s \n" \ "fcvtl2 v31.2d, v30.4s \n" \ "fcvtl v30.2d, v30.2s \n" \ "fmla v0.2d, v16.2d, v16.2d \n" \ "fmla v1.2d, v17.2d, v17.2d \n" \ "fmla v2.2d, v18.2d, v18.2d \n" \ "fmla v3.2d, v19.2d, v19.2d \n" \ "fmla v4.2d, v20.2d, v20.2d \n" \ "fmla v5.2d, v21.2d, v21.2d \n" \ "fmla v6.2d, v22.2d, v22.2d \n" \ "fmla v7.2d, v23.2d, v23.2d \n" \ "fmla v0.2d, v24.2d, v24.2d \n" \ "fmla v1.2d, v25.2d, v25.2d \n" \ "fmla v2.2d, v26.2d, v26.2d \n" \ "fmla v3.2d, v27.2d, v27.2d \n" \ "fmla v4.2d, v28.2d, v28.2d \n" \ "fmla v5.2d, v29.2d, v29.2d \n" \ "fmla v6.2d, v30.2d, v30.2d \n" \ "fmla v7.2d, v31.2d, v31.2d \n" \ "prfm PLDL1KEEP, ["X", #1024] 
\n" \ "prfm PLDL1KEEP, ["X", #1024+64] \n" #define KERNEL_F_FINALIZE \ "fadd v0.2d, v0.2d, v1.2d \n" \ "fadd v2.2d, v2.2d, v3.2d \n" \ "fadd v4.2d, v4.2d, v5.2d \n" \ "fadd v6.2d, v6.2d, v7.2d \n" \ "fadd v0.2d, v0.2d, v2.2d \n" \ "fadd v4.2d, v4.2d, v6.2d \n" \ "fadd v0.2d, v0.2d, v4.2d \n" \ "faddp "SSQD", v0.2d \n" #define KERNEL_S1 \ "ldr "TMPF", ["X"] \n" \ "add "X", "X", "INC_X" \n" \ "fcvt "TMPFD", "TMPF" \n" \ "fmadd "SSQD", "TMPFD", "TMPFD", "SSQD"\n" #define KERNEL_FINALIZE \ "" #else #define N "x0" /* vector length */ #define X "x1" /* X vector address */ #define INC_X "x2" /* X stride */ #define J "x5" /* loop variable */ #define TMPF "d16" #define SSQD "d0" #define N_DIV_SHIFT "4" #define N_REM_MASK "15" #define INC_SHIFT "3" #define KERNEL_F1 \ "ldr "TMPF", ["X"] \n" \ "add "X", "X", #8 \n" \ "fcvtl v16.2d, v16.2s \n" \ "fmla v0.2d, v16.2d, v16.2d \n" #define KERNEL_F \ "ldur q16, ["X"] \n" \ "ldur q18, ["X", #16] \n" \ "ldur q20, ["X", #32] \n" \ "ldur q22, ["X", #48] \n" \ "ldur q24, ["X", #64] \n" \ "ldur q26, ["X", #80] \n" \ "ldur q28, ["X", #96] \n" \ "ldur q30, ["X", #112] \n" \ "add "X", "X", #128 \n" \ "fcvtl2 v17.2d, v16.4s \n" \ "fcvtl v16.2d, v16.2s \n" \ "fcvtl2 v19.2d, v18.4s \n" \ "fcvtl v18.2d, v18.2s \n" \ "fcvtl2 v21.2d, v20.4s \n" \ "fcvtl v20.2d, v20.2s \n" \ "fcvtl2 v23.2d, v22.4s \n" \ "fcvtl v22.2d, v22.2s \n" \ "fcvtl2 v25.2d, v24.4s \n" \ "fcvtl v24.2d, v24.2s \n" \ "fcvtl2 v27.2d, v26.4s \n" \ "fcvtl v26.2d, v26.2s \n" \ "fcvtl2 v29.2d, v28.4s \n" \ "fcvtl v28.2d, v28.2s \n" \ "fcvtl2 v31.2d, v30.4s \n" \ "fcvtl v30.2d, v30.2s \n" \ "fmla v0.2d, v16.2d, v16.2d \n" \ "fmla v1.2d, v17.2d, v17.2d \n" \ "fmla v2.2d, v18.2d, v18.2d \n" \ "fmla v3.2d, v19.2d, v19.2d \n" \ "fmla v4.2d, v20.2d, v20.2d \n" \ "fmla v5.2d, v21.2d, v21.2d \n" \ "fmla v6.2d, v22.2d, v22.2d \n" \ "fmla v7.2d, v23.2d, v23.2d \n" \ "fmla v0.2d, v24.2d, v24.2d \n" \ "fmla v1.2d, v25.2d, v25.2d \n" \ "fmla v2.2d, v26.2d, v26.2d \n" \ "fmla v3.2d, v27.2d, v27.2d \n" \ "fmla v4.2d, v28.2d, v28.2d \n" \ "fmla v5.2d, v29.2d, v29.2d \n" \ "fmla v6.2d, v30.2d, v30.2d \n" \ "fmla v7.2d, v31.2d, v31.2d \n" \ "prfm PLDL1KEEP, ["X", #1024] \n" \ "prfm PLDL1KEEP, ["X", #1024+64] \n" #define KERNEL_F_FINALIZE \ "fadd v0.2d, v0.2d, v1.2d \n" \ "fadd v2.2d, v2.2d, v3.2d \n" \ "fadd v4.2d, v4.2d, v5.2d \n" \ "fadd v6.2d, v6.2d, v7.2d \n" \ "fadd v0.2d, v0.2d, v2.2d \n" \ "fadd v4.2d, v4.2d, v6.2d \n" \ "fadd v0.2d, v0.2d, v4.2d \n" #define KERNEL_FINALIZE \ "faddp "SSQD", v0.2d \n" #define KERNEL_S1 \ "ldr "TMPF", ["X"] \n" \ "add "X", "X", "INC_X" \n" \ "fcvtl v16.2d, v16.2s \n" \ "fmla v0.2d, v16.2d, v16.2d \n" #endif static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) { double ret = 0.0 ; if (n <= 0) return ret; __asm__ __volatile__ ( " mov "N", %[N_] \n" " mov "X", %[X_] \n" " mov "INC_X", %[INCX_] \n" " fmov "SSQD", xzr \n" " fmov d1, xzr \n" " fmov d2, xzr \n" " fmov d3, xzr \n" " fmov d4, xzr \n" " fmov d5, xzr \n" " fmov d6, xzr \n" " fmov d7, xzr \n" " cmp "N", xzr \n" " ble .Lnrm2_kernel_L999 \n" " cmp "INC_X", xzr \n" " ble .Lnrm2_kernel_L999 \n" " cmp "INC_X", #1 \n" " bne .Lnrm2_kernel_S_BEGIN \n" ".Lnrm2_kernel_F_BEGIN: \n" " asr "J", "N", #"N_DIV_SHIFT" \n" " cmp "J", xzr \n" " beq .Lnrm2_kernel_S_BEGIN \n" " .align 5 \n" ".Lnrm2_kernel_F: \n" " "KERNEL_F" \n" " subs "J", "J", #1 \n" " bne .Lnrm2_kernel_F \n" " "KERNEL_F_FINALIZE" \n" ".Lnrm2_kernel_F1: \n" " ands "J", "N", #"N_REM_MASK" \n" " ble .Lnrm2_kernel_L999 \n" ".Lnrm2_kernel_F10: \n" " "KERNEL_F1" \n" " subs 
"J", "J", #1 \n" " bne .Lnrm2_kernel_F10 \n" " b .Lnrm2_kernel_L999 \n" ".Lnrm2_kernel_S_BEGIN: \n" " lsl "INC_X", "INC_X", #"INC_SHIFT" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" " ble .Lnrm2_kernel_S1 \n" ".Lnrm2_kernel_S4: \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" " bne .Lnrm2_kernel_S4 \n" ".Lnrm2_kernel_S1: \n" " ands "J", "N", #3 \n" " ble .Lnrm2_kernel_L999 \n" ".Lnrm2_kernel_S10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" " bne .Lnrm2_kernel_S10 \n" ".Lnrm2_kernel_L999: \n" " "KERNEL_FINALIZE" \n" " fmov %[RET_], "SSQD" \n" : [RET_] "=r" (ret) //%0 : [N_] "r" (n), //%1 [X_] "r" (x), //%2 [INCX_] "r" (inc_x) //%3 : "cc", "memory", "x0", "x1", "x2", "x3", "x4", "x5", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" ); return ret; } #if defined(SMP) static int nrm2_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3, BLASLONG dummy4, FLOAT *result, BLASLONG dummy5) { *(double *)result = nrm2_compute(n, x, inc_x); return 0; } #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { #if defined(SMP) int nthreads; FLOAT dummy_alpha[2]; #endif FLOAT nrm2 = 0.0; double nrm2_double = 0.0; if (n <= 0 || inc_x <= 0) return 0.0; #if defined(SMP) nthreads = num_cpu_avail(1); if (n <= 10000) nthreads = 1; if (nthreads == 1) { nrm2_double = nrm2_compute(n, x, inc_x); } else { int mode, i; char result[MAX_CPU_NUMBER * sizeof(double) * 2]; double *ptr; #if !defined(COMPLEX) mode = BLAS_SINGLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, ( void *)nrm2_thread_function, nthreads); ptr = (double *)result; for (i = 0; i < nthreads; i++) { nrm2_double = nrm2_double + (*ptr); ptr = (double *)(((char *)ptr) + sizeof(double) * 2); } } #else nrm2_double = nrm2_compute(n, x, inc_x); #endif nrm2 = sqrt(nrm2_double); return nrm2; } OpenBLAS-0.2.20/kernel/arm64/sgemm_kernel_16x4.S000066400000000000000000001072141313527062700207400ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 X3 x4 x5 x6 */ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define temp x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pCRow3 x15 #define pA x16 #define alpha w17 #define alpha0 s10 #define alphaV0 v10.s[0] #define A_PRE_SIZE 2560 #define B_PRE_SIZE 224 #define C_PRE_SIZE 160 // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 offset // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pA // 16 temp // 17 // 18 must save // 19 must save // 20 must save // 21 must save // 22 must save // 23 must save // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp //v00 ALPHA -> pA0_00, pA0_01, pA0_02, pA0_03 //v01 pA0_04, pA0_05, pA0_06, pA0_07 //v02 pA0_08, pA0_09, pA0_10, pA0_11 //v03 pA0_12, pA0_13, pA0_14, pA0_15 //v04 pA1_00, pA1_01, pA1_02, pA1_03 //v05 pA1_04, pA1_05, pA1_06, pA1_07 //v06 pA1_08, pA1_09, pA1_10, pA1_11 //v07 pA1_12, pA1_13, pA1_14, pA1_15 //v08 must save pB00 //v09 must save pB01 //v10 must save pB02 //v11 must save pB03 //v12 must save pB10 //v13 must save pB11 //v14 must save pB12 //v15 must save pB13 //v16 must save C00, C01, C02, C03 //v17 must save C04, C05, C06, C07 //v18 C08, C09, C10, C11 //v19 C12, C13, C14, C15 //v20 C16, C17, C18, C19 //v21 C20, C21, C22, C23 //v22 C24, C25, C26, C27 //v23 C28, C29, C30, C31 //v24 C32, C33, C34, C35 //v25 C36, C37, C38, C39 //v26 C40, C41, C42, C43 //v27 C44, C45, C46, C47 //v28 C48, C49, C50, C51 //v29 C52, C53, C54, C55 //v30 C56, C57, C58, C59 //v31 C60, C61, C62, C63 /******************************************************************************* * Macro definitions *******************************************************************************/ .macro INIT16x4 fmov s16, wzr fmov s17, wzr fmov s18, s16 fmov s19, s17 fmov s20, wzr fmov s21, s16 fmov s22, s17 fmov s23, s18 fmov s24, wzr fmov s25, s16 fmov s26, s17 fmov s27, s18 fmov s28, wzr fmov s29, s16 fmov s30, s17 fmov s31, s18 .endm .macro KERNEL16x4_I ldp q0, q1, [pA], #32 ldp s8, s9, [pB], #8 fmul v16.4s, v0.4s, v8.s[0] fmul v20.4s, v0.4s, v9.s[0] ldp s10, s11, [pB], #8 fmul v24.4s, v0.4s, v10.s[0] fmul v28.4s, v0.4s, v11.s[0] ldp q2, q3, [pA], #32 fmul v17.4s, v1.4s, v8.s[0] fmul v21.4s, v1.4s, v9.s[0] ldp q4, q5, [pA], #32 fmul v25.4s, v1.4s, v10.s[0] fmul v29.4s, v1.4s, v11.s[0] ldp s12, s13, [pB], #8 fmul v18.4s, v2.4s, v8.s[0] fmul v22.4s, v2.4s, v9.s[0] ldp s14, s15, [pB], #8 fmul v19.4s, v3.4s, v8.s[0] fmul v23.4s, v3.4s, v9.s[0] ldp q6, q7, [pA], #32 fmul v26.4s, v2.4s, v10.s[0] fmul v30.4s, v2.4s, v11.s[0] prfm 
PLDL1KEEP, [pA, #A_PRE_SIZE] fmul v27.4s, v3.4s, v10.s[0] fmul v31.4s, v3.4s, v11.s[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL16x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] ldp q4, q5, [pA], #32 fmla v18.4s, v2.4s, v8.s[0] fmla v19.4s, v3.4s, v8.s[0] fmla v20.4s, v0.4s, v9.s[0] fmla v21.4s, v1.4s, v9.s[0] ldp s12, s13, [pB], #8 fmla v22.4s, v2.4s, v9.s[0] fmla v23.4s, v3.4s, v9.s[0] ldp s14, s15, [pB], #8 fmla v24.4s, v0.4s, v10.s[0] fmla v25.4s, v1.4s, v10.s[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] fmla v26.4s, v2.4s, v10.s[0] fmla v27.4s, v3.4s, v10.s[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v28.4s, v0.4s, v11.s[0] fmla v29.4s, v1.4s, v11.s[0] ldp q6, q7, [pA], #32 fmla v30.4s, v2.4s, v11.s[0] fmla v31.4s, v3.4s, v11.s[0] .endm .macro KERNEL16x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] ldp q0, q1, [pA], #32 fmla v18.4s, v6.4s, v12.s[0] fmla v19.4s, v7.4s, v12.s[0] fmla v20.4s, v4.4s, v13.s[0] fmla v21.4s, v5.4s, v13.s[0] ldp s8, s9, [pB], #8 fmla v22.4s, v6.4s, v13.s[0] fmla v23.4s, v7.4s, v13.s[0] ldp s10, s11, [pB], #8 fmla v24.4s, v4.4s, v14.s[0] fmla v25.4s, v5.4s, v14.s[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v26.4s, v6.4s, v14.s[0] fmla v27.4s, v7.4s, v14.s[0] ldp q2, q3, [pA], #32 fmla v28.4s, v4.4s, v15.s[0] fmla v29.4s, v5.4s, v15.s[0] fmla v30.4s, v6.4s, v15.s[0] fmla v31.4s, v7.4s, v15.s[0] .endm .macro KERNEL16x4_E fmla v16.4s, v4.4s, v12.s[0] fmla v20.4s, v4.4s, v13.s[0] fmla v24.4s, v4.4s, v14.s[0] fmla v28.4s, v4.4s, v15.s[0] fmla v17.4s, v5.4s, v12.s[0] fmla v21.4s, v5.4s, v13.s[0] fmla v25.4s, v5.4s, v14.s[0] fmla v29.4s, v5.4s, v15.s[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v18.4s, v6.4s, v12.s[0] fmla v22.4s, v6.4s, v13.s[0] fmla v26.4s, v6.4s, v14.s[0] fmla v30.4s, v6.4s, v15.s[0] fmla v19.4s, v7.4s, v12.s[0] fmla v23.4s, v7.4s, v13.s[0] fmla v27.4s, v7.4s, v14.s[0] fmla v31.4s, v7.4s, v15.s[0] .endm .macro KERNEL16x4_SUB ldp q0, q1, [pA], #32 ldp s8, s9, [pB], #8 fmla v16.4s, v0.4s, v8.s[0] fmla v20.4s, v0.4s, v9.s[0] ldp s10, s11, [pB], #8 fmla v24.4s, v0.4s, v10.s[0] fmla v28.4s, v0.4s, v11.s[0] ldp q2, q3, [pA], #32 fmla v17.4s, v1.4s, v8.s[0] fmla v21.4s, v1.4s, v9.s[0] fmla v25.4s, v1.4s, v10.s[0] fmla v29.4s, v1.4s, v11.s[0] fmla v18.4s, v2.4s, v8.s[0] fmla v22.4s, v2.4s, v9.s[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v19.4s, v3.4s, v8.s[0] fmla v23.4s, v3.4s, v9.s[0] fmla v26.4s, v2.4s, v10.s[0] fmla v30.4s, v2.4s, v11.s[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v27.4s, v3.4s, v10.s[0] fmla v31.4s, v3.4s, v11.s[0] .endm .macro SAVE16x4 fmov alpha0, alpha prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] ldp q0, q1, [pCRow0] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV0 stp q0, q1, [pCRow0] add pCRow0, pCRow0, #32 ldp q2, q3, [pCRow0] fmla v2.4s, v18.4s, alphaV0 fmla v3.4s, v19.4s, alphaV0 stp q2, q3, [pCRow0] add pCRow0, pCRow0, #32 prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ldp q4, q5, [pCRow1] fmla v4.4s, v20.4s, alphaV0 fmla v5.4s, v21.4s, alphaV0 stp q4, q5, [pCRow1] add pCRow1, pCRow1, #32 ldp q6, q7, [pCRow1] fmla v6.4s, v22.4s, alphaV0 fmla v7.4s, v23.4s, alphaV0 stp q6, q7, [pCRow1] add pCRow1, pCRow1, #32 prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] ldp q0, q1, [pCRow2] fmla v0.4s, v24.4s, alphaV0 fmla v1.4s, v25.4s, alphaV0 stp q0, q1, [pCRow2] add pCRow2, pCRow2, #32 ldp q2, q3, [pCRow2] fmla v2.4s, v26.4s, alphaV0 fmla v3.4s, v27.4s, alphaV0 stp q2, q3, [pCRow2] add pCRow2, pCRow2, #32 prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] ldp q4, q5, [pCRow3] fmla v4.4s, v28.4s, alphaV0 fmla v5.4s, v29.4s, alphaV0 stp q4, 
q5, [pCRow3] add pCRow3, pCRow3, #32 ldp q6, q7, [pCRow3] fmla v6.4s, v30.4s, alphaV0 fmla v7.4s, v31.4s, alphaV0 stp q6, q7, [pCRow3] add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT8x4 fmov s16, wzr fmov s17, wzr fmov s20, wzr fmov s21, s16 fmov s24, wzr fmov s25, s16 fmov s28, wzr fmov s29, s16 .endm .macro KERNEL8x4_I ldp s8, s9, [pB], #8 ldp s10, s11, [pB], #8 ldr q0, [pA], #16 ldr q1, [pA], #16 fmul v16.4s, v0.4s, v8.s[0] fmul v17.4s, v1.4s, v8.s[0] fmul v20.4s, v0.4s, v9.s[0] fmul v21.4s, v1.4s, v9.s[0] fmul v24.4s, v0.4s, v10.s[0] fmul v25.4s, v1.4s, v10.s[0] fmul v28.4s, v0.4s, v11.s[0] fmul v29.4s, v1.4s, v11.s[0] ldp s12, s13, [pB], #8 ldp s14, s15, [pB], #8 ldr q4, [pA], #16 ldr q5, [pA], #16 .endm .macro KERNEL8x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v20.4s, v0.4s, v9.s[0] fmla v21.4s, v1.4s, v9.s[0] fmla v24.4s, v0.4s, v10.s[0] fmla v25.4s, v1.4s, v10.s[0] fmla v28.4s, v0.4s, v11.s[0] fmla v29.4s, v1.4s, v11.s[0] ldp s12, s13, [pB], #8 ldp s14, s15, [pB], #8 ldr q4, [pA], #16 ldr q5, [pA], #16 .endm .macro KERNEL8x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] fmla v20.4s, v4.4s, v13.s[0] fmla v21.4s, v5.4s, v13.s[0] fmla v24.4s, v4.4s, v14.s[0] fmla v25.4s, v5.4s, v14.s[0] fmla v28.4s, v4.4s, v15.s[0] fmla v29.4s, v5.4s, v15.s[0] ldp s8, s9, [pB], #8 ldp s10, s11, [pB], #8 ldr q0, [pA], #16 ldr q1, [pA], #16 .endm .macro KERNEL8x4_E fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] fmla v20.4s, v4.4s, v13.s[0] fmla v21.4s, v5.4s, v13.s[0] fmla v24.4s, v4.4s, v14.s[0] fmla v25.4s, v5.4s, v14.s[0] fmla v28.4s, v4.4s, v15.s[0] fmla v29.4s, v5.4s, v15.s[0] .endm .macro KERNEL8x4_SUB ldp s8, s9, [pB], #8 ldp s10, s11, [pB], #8 ldr q0, [pA], #16 ldr q1, [pA], #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v20.4s, v0.4s, v9.s[0] fmla v21.4s, v1.4s, v9.s[0] fmla v24.4s, v0.4s, v10.s[0] fmla v25.4s, v1.4s, v10.s[0] fmla v28.4s, v0.4s, v11.s[0] fmla v29.4s, v1.4s, v11.s[0] .endm .macro SAVE8x4 fmov alpha0, alpha ldp q0, q1, [pCRow0] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV0 stp q0, q1, [pCRow0] add pCRow0, pCRow0, #32 ldp q2, q3, [pCRow1] fmla v2.4s, v20.4s, alphaV0 fmla v3.4s, v21.4s, alphaV0 stp q2, q3, [pCRow1] add pCRow1, pCRow1, #32 ldp q4, q5, [pCRow2] fmla v4.4s, v24.4s, alphaV0 fmla v5.4s, v25.4s, alphaV0 stp q4, q5, [pCRow2] add pCRow2, pCRow2, #32 ldp q6, q7, [pCRow3] fmla v6.4s, v28.4s, alphaV0 fmla v7.4s, v29.4s, alphaV0 stp q6, q7, [pCRow3] add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT4x4 fmov s16, wzr fmov s20, wzr fmov s24, wzr fmov s28, wzr .endm .macro KERNEL4x4_I ldp s8, s9, [pB], #8 ldp s10, s11, [pB], #8 ldr q0, [pA], #16 fmul v16.4s, v0.4s, v8.s[0] fmul v20.4s, v0.4s, v9.s[0] fmul v24.4s, v0.4s, v10.s[0] fmul v28.4s, v0.4s, v11.s[0] ldp s12, s13, [pB], #8 ldp s14, s15, [pB], #8 ldr q1, [pA], #16 .endm .macro KERNEL4x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v20.4s, v0.4s, v9.s[0] fmla v24.4s, v0.4s, v10.s[0] fmla v28.4s, v0.4s, v11.s[0] ldp s12, s13, [pB], #8 ldp s14, s15, [pB], #8 ldr q1, [pA], #16 .endm .macro KERNEL4x4_M2 fmla v16.4s, v1.4s, v12.s[0] fmla v20.4s, v1.4s, v13.s[0] fmla v24.4s, v1.4s, v14.s[0] fmla v28.4s, v1.4s, v15.s[0] ldp s8, s9, [pB], #8 ldp s10, s11, [pB], #8 ldr q0, [pA], #16 .endm .macro KERNEL4x4_E fmla v16.4s, v1.4s, v12.s[0] fmla v20.4s, v1.4s, v13.s[0] fmla v24.4s, v1.4s, v14.s[0] fmla v28.4s, v1.4s, v15.s[0] 
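	// KERNEL4x4_E drains the software pipeline: like the _I/_M1/_M2 variants
	// above, it multiplies one packed 4x1 column of A (v1) against one packed
	// 1x4 row of B (v12..v15), but issues no further loads.  The pattern used
	// by the main code below is: KERNEL4x4_I, KERNEL4x4_M2, then alternating
	// KERNEL4x4_M1/KERNEL4x4_M2 pairs, finishing with KERNEL4x4_M1 and
	// KERNEL4x4_E, so each step consumes the operand set loaded by the
	// previous one.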
.endm .macro KERNEL4x4_SUB ldp s8, s9, [pB], #8 ldp s10, s11, [pB], #8 ldr q0, [pA], #16 fmla v16.4s, v0.4s, v8.s[0] fmla v20.4s, v0.4s, v9.s[0] fmla v24.4s, v0.4s, v10.s[0] fmla v28.4s, v0.4s, v11.s[0] .endm .macro SAVE4x4 fmov alpha0, alpha ldr q0, [pCRow0] fmla v0.4s, v16.4s, alphaV0 str q0, [pCRow0] add pCRow0, pCRow0, #16 ldr q1, [pCRow1] fmla v1.4s, v20.4s, alphaV0 str q1, [pCRow1] add pCRow1, pCRow1, #16 ldr q2, [pCRow2] fmla v2.4s, v24.4s, alphaV0 str q2, [pCRow2] add pCRow2, pCRow2, #16 ldr q3, [pCRow3] fmla v3.4s, v28.4s, alphaV0 str q3, [pCRow3] add pCRow3, pCRow3, #16 .endm /******************************************************************************/ .macro INIT2x4 fmov s16, wzr fmov s20, s16 fmov s24, s20 fmov s28, s16 .endm .macro KERNEL2x4_SUB ldp s8, s9, [pB], #8 ldp s10, s11, [pB], #8 ldr d0, [pA], #8 fmla v16.2s, v0.2s, v8.s[0] fmla v20.2s, v0.2s, v9.s[0] fmla v24.2s, v0.2s, v10.s[0] fmla v28.2s, v0.2s, v11.s[0] .endm .macro SAVE2x4 fmov alpha0, alpha ldr d0, [pCRow0] fmla v0.2s, v16.2s, alphaV0 str d0, [pCRow0] add pCRow0, pCRow0, #8 ldr d1, [pCRow1] fmla v1.2s, v20.2s, alphaV0 str d1, [pCRow1] add pCRow1, pCRow1, #8 ldr d0, [pCRow2] fmla v0.2s, v24.2s, alphaV0 str d0, [pCRow2] add pCRow2, pCRow2, #8 ldr d1, [pCRow3] fmla v1.2s, v28.2s, alphaV0 str d1, [pCRow3] add pCRow3, pCRow3, #8 .endm /******************************************************************************/ .macro INIT1x4 fmov s16, wzr fmov s20, s16 .endm .macro KERNEL1x4_SUB ldr s0, [pA] add pA, pA, #4 ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 fmla v16.2s, v8.2s, v0.s[0] fmla v20.2s, v9.2s, v0.s[0] .endm .macro SAVE1x4 fmov alpha0, alpha ld1 {v8.s}[0], [pCRow0] ld1 {v8.s}[1], [pCRow1] fmla v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], [pCRow0] st1 {v8.s}[1], [pCRow1] add pCRow0, pCRow0, #4 add pCRow1, pCRow1, #4 ld1 {v12.s}[0], [pCRow2] ld1 {v12.s}[1], [pCRow3] fmla v12.2s, v20.2s, alphaV0 st1 {v12.s}[0], [pCRow2] st1 {v12.s}[1], [pCRow3] add pCRow2, pCRow2, #4 add pCRow3, pCRow3, #4 .endm /******************************************************************************/ .macro INIT16x2 fmov s16, wzr fmov s17, wzr fmov s18, wzr fmov s19, s16 fmov s20, wzr fmov s21, s16 fmov s22, wzr fmov s23, s16 .endm .macro KERNEL16x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 ld1 {v2.4s}, [pA] add pA, pA, #16 ld1 {v3.4s}, [pA] add pA, pA, #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v18.4s, v2.4s, v8.s[0] fmla v19.4s, v3.4s, v8.s[0] fmla v20.4s, v0.4s, v8.s[1] fmla v21.4s, v1.4s, v8.s[1] fmla v22.4s, v2.4s, v8.s[1] fmla v23.4s, v3.4s, v8.s[1] .endm .macro SAVE16x2 fmov alpha0, alpha add pCRow1, pCRow0, LDC ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV0 fmla v2.4s, v18.4s, alphaV0 fmla v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0 fmla v5.4s, v21.4s, alphaV0 fmla v6.4s, v22.4s, alphaV0 fmla v7.4s, v23.4s, alphaV0 st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT8x2 fmov s16, wzr fmov s17, s16 fmov s20, s17 fmov s21, s16 .endm .macro KERNEL8x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v20.4s, v0.4s, v8.s[1] fmla v21.4s, v1.4s, v8.s[1] .endm .macro SAVE8x2 fmov alpha0, alpha 
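	// alpha is carried across the kernel in the general-purpose register w17
	// ("alpha", stashed there by "fmov alpha, s0" in the prologue) so that
	// s0-s7 stay free for the micro-kernels; it is moved back into s10
	// ("alpha0") here before the accumulators are folded into C as
	// C += alpha * acc.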
add pCRow1, pCRow0, LDC ld1 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow2, pCRow1, LDC ld1 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0 fmla v5.4s, v21.4s, alphaV0 st1 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT4x2 fmov s16, wzr fmov s17, s16 fmov s20, s17 fmov s21, s16 .endm .macro KERNEL4x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 fmla v16.2s, v0.2s, v8.s[0] fmla v17.2s, v1.2s, v8.s[0] fmla v20.2s, v0.2s, v8.s[1] fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 fmov alpha0, alpha ld1 {v8.2s, v9.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 fmla v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2s, v13.2s}, [pCRow1] fmla v12.2s, v20.2s, alphaV0 fmla v13.2s, v21.2s, alphaV0 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT2x2 fmov s16, wzr fmov s20, s16 .endm .macro KERNEL2x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.2s}, [pA] add pA, pA, #8 fmla v16.2s, v0.2s, v8.s[0] fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 fmov alpha0, alpha ld1 {v8.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow1 , pCRow0, LDC ld1 {v12.2s}, [pCRow1] fmla v12.2s, v20.2s, alphaV0 st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT1x2 fmov s16, wzr .endm .macro KERNEL1x2_SUB ld1 {v8.2s} , [pB] add pB , pB, #8 ldr s0 , [pA] add pA, pA, #4 fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 fmov alpha0, alpha add pCRow1 , pCRow0, LDC ld1 {v8.s}[0], [pCRow0] ld1 {v8.s}[1], [pCRow1] fmla v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], [pCRow0] st1 {v8.s}[1], [pCRow1] add pCRow0, pCRow0, #4 .endm /******************************************************************************/ .macro INIT16x1 fmov s16, wzr fmov s17, wzr fmov s18, wzr fmov s19, s16 .endm .macro KERNEL16x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 ld1 {v2.4s}, [pA] add pA, pA, #16 ld1 {v3.4s}, [pA] add pA, pA, #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v18.4s, v2.4s, v8.s[0] fmla v19.4s, v3.4s, v8.s[0] .endm .macro SAVE16x1 fmov alpha0, alpha ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV0 fmla v2.4s, v18.4s, alphaV0 fmla v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT8x1 fmov s16, wzr fmov s17, wzr .endm .macro KERNEL8x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] .endm .macro SAVE8x1 fmov alpha0, alpha ld1 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT4x1 fmov s16, wzr fmov s17, s16 .endm .macro KERNEL4x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 fmla v16.2s, v0.2s, v8.s[0] fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 fmov alpha0, alpha ld1 {v8.2s, v9.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 fmla 
v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT2x1 fmov s16, wzr .endm .macro KERNEL2x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.2s}, [pA] add pA , pA, #8 fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 fmov alpha0, alpha ld1 {v8.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT1x1 fmov s16, wzr .endm .macro KERNEL1x1_SUB ldr s8, [pB] add pB , pB, #4 ldr s0, [pA] add pA , pA, #4 fmadd s16, s0, s8, s16 .endm .macro SAVE1x1 fmov alpha0, alpha ldr s8, [pCRow0] fmla s8, s16, alphaV0 str s8, [pCRow0] add pCRow0, pCRow0, #4 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE sgemm_kernel_begin: .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPA] fmov alpha, s0 lsl LDC, LDC, #2 // ldc = ldc * 4 mov pB, origPB mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble sgemm_kernel_L2_BEGIN /******************************************************************************/ sgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC add pCRow3, pCRow2, LDC add pC, pCRow3, LDC mov pA, origPA // pA = start of A array sgemm_kernel_L4_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 ble sgemm_kernel_L4_M8_BEGIN .align 5 sgemm_kernel_L4_M16_20: mov pB, origPB asr counterL , origK, #3 cmp counterL , #2 blt sgemm_kernel_L4_M16_32 KERNEL16x4_I KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_M2 subs counterL, counterL, #2 ble sgemm_kernel_L4_M16_22a .align 5 sgemm_kernel_L4_M16_22: KERNEL16x4_M1 KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_M2 subs counterL, counterL, #1 bgt sgemm_kernel_L4_M16_22 .align 5 sgemm_kernel_L4_M16_22a: KERNEL16x4_M1 KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_E b sgemm_kernel_L4_M16_44 .align 5 sgemm_kernel_L4_M16_32: tst counterL, #1 ble sgemm_kernel_L4_M16_40 KERNEL16x4_I KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_E b sgemm_kernel_L4_M16_44 sgemm_kernel_L4_M16_40: INIT16x4 sgemm_kernel_L4_M16_44: ands counterL , origK, #7 ble sgemm_kernel_L4_M16_100 .align 5 sgemm_kernel_L4_M16_46: KERNEL16x4_SUB subs counterL, counterL, #1 bne sgemm_kernel_L4_M16_46 sgemm_kernel_L4_M16_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE16x4 sgemm_kernel_L4_M16_END: subs counterI, counterI, #1 bne sgemm_kernel_L4_M16_20 //------------------------------------------------------------------------------ sgemm_kernel_L4_M8_BEGIN: mov counterI, origM tst counterI , #15 ble sgemm_kernel_L4_END tst counterI, #8 ble sgemm_kernel_L4_M4_BEGIN sgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp 
counterL , #2 // is there at least 4 to do? blt sgemm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K subs counterL, counterL, #2 ble sgemm_kernel_L4_M8_22a .align 5 sgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 bgt sgemm_kernel_L4_M8_22 sgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E b sgemm_kernel_L4_M8_44 sgemm_kernel_L4_M8_32: tst counterL, #1 ble sgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_E b sgemm_kernel_L4_M8_44 sgemm_kernel_L4_M8_40: INIT8x4 sgemm_kernel_L4_M8_44: ands counterL , origK, #1 ble sgemm_kernel_L4_M8_100 sgemm_kernel_L4_M8_46: KERNEL8x4_SUB sgemm_kernel_L4_M8_100: SAVE8x4 sgemm_kernel_L4_M8_END: //------------------------------------------------------------------------------ sgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 ble sgemm_kernel_L4_END tst counterI, #4 ble sgemm_kernel_L4_M2_BEGIN sgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? blt sgemm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 ble sgemm_kernel_L4_M4_22a .align 5 sgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 bgt sgemm_kernel_L4_M4_22 sgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E b sgemm_kernel_L4_M4_44 sgemm_kernel_L4_M4_32: tst counterL, #1 ble sgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E b sgemm_kernel_L4_M4_44 sgemm_kernel_L4_M4_40: INIT4x4 sgemm_kernel_L4_M4_44: ands counterL , origK, #1 ble sgemm_kernel_L4_M4_100 sgemm_kernel_L4_M4_46: KERNEL4x4_SUB sgemm_kernel_L4_M4_100: SAVE4x4 sgemm_kernel_L4_M4_END: //------------------------------------------------------------------------------ sgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble sgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble sgemm_kernel_L4_M1_BEGIN sgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L4_M2_40 sgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L4_M2_22 sgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L4_M2_100 sgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L4_M2_42 sgemm_kernel_L4_M2_100: SAVE2x4 sgemm_kernel_L4_M2_END: sgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble sgemm_kernel_L4_END sgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L4_M1_40 sgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L4_M1_22 sgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L4_M1_100 sgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L4_M1_42 sgemm_kernel_L4_M1_100: SAVE1x4 sgemm_kernel_L4_END: add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 subs counterJ, counterJ , #1 // j-- bgt sgemm_kernel_L4_BEGIN /******************************************************************************/ sgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble sgemm_kernel_L999 tst counterJ , #2 ble sgemm_kernel_L1_BEGIN mov pCRow0, pC // 
pCRow0 = pC add pC,pC,LDC, lsl #1 mov pA, origPA // pA = A sgemm_kernel_L2_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI,#0 ble sgemm_kernel_L2_M8_BEGIN sgemm_kernel_L2_M16_20: INIT16x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble sgemm_kernel_L2_M16_40 .align 5 sgemm_kernel_L2_M16_22: KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M16_22 sgemm_kernel_L2_M16_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M16_100 sgemm_kernel_L2_M16_42: KERNEL16x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M16_42 sgemm_kernel_L2_M16_100: SAVE16x2 sgemm_kernel_L2_M16_END: subs counterI, counterI, #1 bgt sgemm_kernel_L2_M16_20 //------------------------------------------------------------------------------ sgemm_kernel_L2_M8_BEGIN: mov counterI, origM tst counterI , #15 ble sgemm_kernel_L2_END tst counterI, #8 ble sgemm_kernel_L2_M4_BEGIN sgemm_kernel_L2_M8_20: INIT8x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble sgemm_kernel_L2_M8_40 .align 5 sgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M8_22 sgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M8_100 sgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M8_42 sgemm_kernel_L2_M8_100: SAVE8x2 sgemm_kernel_L2_M8_END: //------------------------------------------------------------------------------ sgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 ble sgemm_kernel_L2_END tst counterI, #4 ble sgemm_kernel_L2_M2_BEGIN sgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble sgemm_kernel_L2_M4_40 .align 5 sgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M4_22 sgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M4_100 sgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M4_42 sgemm_kernel_L2_M4_100: SAVE4x2 sgemm_kernel_L2_M4_END: //------------------------------------------------------------------------------ sgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble sgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble sgemm_kernel_L2_M1_BEGIN sgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble sgemm_kernel_L2_M2_40 sgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M2_22 sgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M2_100 sgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M2_42 sgemm_kernel_L2_M2_100: SAVE2x2 sgemm_kernel_L2_M2_END: sgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble sgemm_kernel_L2_END sgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 ble sgemm_kernel_L2_M1_40 
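// Unrolled K loop for the 1x2 edge tile: counterL = K / 8 (asr #3 above)
// passes of eight KERNEL1x2_SUB each; the K % 8 remainder is then handled
// one step at a time in the _M1_40 / _M1_42 block that follows the loop.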
sgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M1_22 sgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M1_100 sgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M1_42 sgemm_kernel_L2_M1_100: SAVE1x2 sgemm_kernel_L2_END: add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 /******************************************************************************/ sgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble sgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next mov pA, origPA // pA = A sgemm_kernel_L1_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 ble sgemm_kernel_L1_M8_BEGIN sgemm_kernel_L1_M16_20: INIT16x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M16_40 .align 5 sgemm_kernel_L1_M16_22: KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M16_22 sgemm_kernel_L1_M16_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M16_100 sgemm_kernel_L1_M16_42: KERNEL16x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M16_42 sgemm_kernel_L1_M16_100: SAVE16x1 sgemm_kernel_L1_M16_END: subs counterI, counterI, #1 bgt sgemm_kernel_L1_M16_20 //------------------------------------------------------------------------------ sgemm_kernel_L1_M8_BEGIN: mov counterI, origM tst counterI , #15 ble sgemm_kernel_L1_END tst counterI, #8 ble sgemm_kernel_L1_M4_BEGIN sgemm_kernel_L1_M8_20: INIT8x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M8_40 .align 5 sgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M8_22 sgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M8_100 sgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M8_42 sgemm_kernel_L1_M8_100: SAVE8x1 sgemm_kernel_L1_M8_END: //------------------------------------------------------------------------------ sgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 ble sgemm_kernel_L1_END tst counterI, #4 ble sgemm_kernel_L1_M2_BEGIN sgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M4_40 .align 5 sgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M4_22 sgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M4_100 sgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M4_42 sgemm_kernel_L1_M4_100: SAVE4x1 sgemm_kernel_L1_M4_END: //------------------------------------------------------------------------------ sgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 ble sgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 ble sgemm_kernel_L1_M1_BEGIN sgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble 
sgemm_kernel_L1_M2_40 sgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M2_22 sgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M2_100 sgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M2_42 sgemm_kernel_L1_M2_100: SAVE2x1 sgemm_kernel_L1_M2_END: sgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble sgemm_kernel_L1_END sgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M1_40 sgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M1_22 sgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M1_100 sgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M1_42 sgemm_kernel_L1_M1_100: SAVE1x1 sgemm_kernel_L1_END: sgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/sgemm_kernel_16x4_thunderx2t99.S000066400000000000000000001103351313527062700233070ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 X3 x4 x5 x6 */ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define temp x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pCRow3 x15 #define pA x16 #define alpha w17 #define alpha0 s10 #define alphaV0 v10.s[0] #define A_PRE_SIZE 2560 #define B_PRE_SIZE 224 #define C_PRE_SIZE 160 // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 offset // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pA // 16 temp // 17 // 18 must save // 19 must save // 20 must save // 21 must save // 22 must save // 23 must save // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp //v00 ALPHA -> pA0_00, pA0_01, pA0_02, pA0_03 //v01 pA0_04, pA0_05, pA0_06, pA0_07 //v02 pA0_08, pA0_09, pA0_10, pA0_11 //v03 pA0_12, pA0_13, pA0_14, pA0_15 //v04 pA1_00, pA1_01, pA1_02, pA1_03 //v05 pA1_04, pA1_05, pA1_06, pA1_07 //v06 pA1_08, pA1_09, pA1_10, pA1_11 //v07 pA1_12, pA1_13, pA1_14, pA1_15 //v08 must save pB00 //v09 must save pB01 //v10 must save pB02 //v11 must save pB03 //v12 must save pB10 //v13 must save pB11 //v14 must save pB12 //v15 must save pB13 //v16 must save C00, C01, C02, C03 //v17 must save C04, C05, C06, C07 //v18 C08, C09, C10, C11 //v19 C12, C13, C14, C15 //v20 C16, C17, C18, C19 //v21 C20, C21, C22, C23 //v22 C24, C25, C26, C27 //v23 C28, C29, C30, C31 //v24 C32, C33, C34, C35 //v25 C36, C37, C38, C39 //v26 C40, C41, C42, C43 //v27 C44, C45, C46, C47 //v28 C48, C49, C50, C51 //v29 C52, C53, C54, C55 //v30 C56, C57, C58, C59 //v31 C60, C61, C62, C63 /******************************************************************************* * Macro definitions *******************************************************************************/ .macro INIT16x4 fmov s16, wzr fmov s17, wzr fmov s18, s16 fmov s19, s17 fmov s20, wzr fmov s21, s16 fmov s22, s17 fmov s23, s18 fmov s24, wzr fmov s25, s16 fmov s26, s17 fmov s27, s18 fmov s28, wzr fmov s29, s16 fmov s30, s17 fmov s31, s18 .endm .macro KERNEL16x4_I ldur q0, [pA] ldur q1, [pA, #16] ldur q8, [pB] fmul v16.4s, v0.4s, v8.s[0] fmul v20.4s, v0.4s, v8.s[1] fmul v24.4s, v0.4s, v8.s[2] fmul v28.4s, v0.4s, v8.s[3] ldur q2, [pA, #32] ldur q3, [pA, #48] fmul v17.4s, v1.4s, v8.s[0] fmul v21.4s, v1.4s, v8.s[1] ldur q4, [pA, #64] ldur q5, [pA, #80] fmul v25.4s, v1.4s, v8.s[2] fmul v29.4s, v1.4s, v8.s[3] ldur q12, [pB, #16] fmul v18.4s, v2.4s, v8.s[0] fmul v22.4s, v2.4s, v8.s[1] fmul v19.4s, v3.4s, v8.s[0] fmul v23.4s, v3.4s, v8.s[1] ldur q6, [pA, #96] ldur q7, [pA, #112] add pB, pB, #32 add pA, pA, #128 fmul v26.4s, v2.4s, v8.s[2] fmul v30.4s, v2.4s, v8.s[3] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmul v27.4s, v3.4s, v8.s[2] fmul v31.4s, v3.4s, v8.s[3] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL16x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] ldur q4, [pA] ldur q5, [pA, #16] fmla v18.4s, v2.4s, v8.s[0] fmla v19.4s, v3.4s, v8.s[0] fmla v20.4s, v0.4s, v8.s[1] fmla v21.4s, v1.4s, v8.s[1] ldur q12, [pB] fmla v22.4s, v2.4s, v8.s[1] fmla v23.4s, v3.4s, v8.s[1] add pB, pB, #16 fmla v24.4s, v0.4s, v8.s[2] fmla v25.4s, 
v1.4s, v8.s[2] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] fmla v26.4s, v2.4s, v8.s[2] fmla v27.4s, v3.4s, v8.s[2] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v28.4s, v0.4s, v8.s[3] fmla v29.4s, v1.4s, v8.s[3] ldur q6, [pA, #32] ldur q7, [pA, #48] add pA, pA, #64 fmla v30.4s, v2.4s, v8.s[3] fmla v31.4s, v3.4s, v8.s[3] .endm .macro KERNEL16x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] ldur q0, [pA] ldur q1, [pA, #16] fmla v18.4s, v6.4s, v12.s[0] fmla v19.4s, v7.4s, v12.s[0] fmla v20.4s, v4.4s, v12.s[1] fmla v21.4s, v5.4s, v12.s[1] ldur q8, [pB] fmla v22.4s, v6.4s, v12.s[1] fmla v23.4s, v7.4s, v12.s[1] add pB, pB, #16 fmla v24.4s, v4.4s, v12.s[2] fmla v25.4s, v5.4s, v12.s[2] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v26.4s, v6.4s, v12.s[2] fmla v27.4s, v7.4s, v12.s[2] ldur q2, [pA, #32] ldur q3, [pA, #48] add pA, pA, #64 fmla v28.4s, v4.4s, v12.s[3] fmla v29.4s, v5.4s, v12.s[3] fmla v30.4s, v6.4s, v12.s[3] fmla v31.4s, v7.4s, v12.s[3] .endm .macro KERNEL16x4_E fmla v16.4s, v4.4s, v12.s[0] fmla v20.4s, v4.4s, v12.s[1] fmla v24.4s, v4.4s, v12.s[2] fmla v28.4s, v4.4s, v12.s[3] fmla v17.4s, v5.4s, v12.s[0] fmla v21.4s, v5.4s, v12.s[1] fmla v25.4s, v5.4s, v12.s[2] fmla v29.4s, v5.4s, v12.s[3] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v18.4s, v6.4s, v12.s[0] fmla v22.4s, v6.4s, v12.s[1] fmla v26.4s, v6.4s, v12.s[2] fmla v30.4s, v6.4s, v12.s[3] fmla v19.4s, v7.4s, v12.s[0] fmla v23.4s, v7.4s, v12.s[1] fmla v27.4s, v7.4s, v12.s[2] fmla v31.4s, v7.4s, v12.s[3] .endm .macro KERNEL16x4_SUB ldur q0, [pA] ldur q1, [pA, #16] ldur q8, [pB] fmla v16.4s, v0.4s, v8.s[0] fmla v20.4s, v0.4s, v8.s[1] add pB, pB, #16 fmla v24.4s, v0.4s, v8.s[2] fmla v28.4s, v0.4s, v8.s[3] ldur q2, [pA, #32] ldur q3, [pA, #48] add pA, pA, #64 fmla v17.4s, v1.4s, v8.s[0] fmla v21.4s, v1.4s, v8.s[1] fmla v25.4s, v1.4s, v8.s[2] fmla v29.4s, v1.4s, v8.s[3] fmla v18.4s, v2.4s, v8.s[0] fmla v22.4s, v2.4s, v8.s[1] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v19.4s, v3.4s, v8.s[0] fmla v23.4s, v3.4s, v8.s[1] fmla v26.4s, v2.4s, v8.s[2] fmla v30.4s, v2.4s, v8.s[3] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v27.4s, v3.4s, v8.s[2] fmla v31.4s, v3.4s, v8.s[3] .endm .macro SAVE16x4 fmov alpha0, alpha prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] ldur q0, [pCRow0] ldur q1, [pCRow0, #16] ldur q2, [pCRow0, #32] ldur q3, [pCRow0, #48] ldur q4, [pCRow1] ldur q5, [pCRow1, #16] ldur q6, [pCRow1, #32] ldur q7, [pCRow1, #48] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV0 stp q0, q1, [pCRow0] fmla v2.4s, v18.4s, alphaV0 fmla v3.4s, v19.4s, alphaV0 stp q2, q3, [pCRow0, #32] ldur q0, [pCRow2] ldur q1, [pCRow2, #16] fmla v4.4s, v20.4s, alphaV0 fmla v5.4s, v21.4s, alphaV0 stp q4, q5, [pCRow1] ldur q2, [pCRow2, #32] ldur q3, [pCRow2, #48] fmla v6.4s, v22.4s, alphaV0 fmla v7.4s, v23.4s, alphaV0 stp q6, q7, [pCRow1, #32] ldur q4, [pCRow3] ldur q5, [pCRow3, #16] fmla v0.4s, v24.4s, alphaV0 fmla v1.4s, v25.4s, alphaV0 stp q0, q1, [pCRow2] ldur q6, [pCRow3, #32] ldur q7, [pCRow3, #48] fmla v2.4s, v26.4s, alphaV0 fmla v3.4s, v27.4s, alphaV0 stp q2, q3, [pCRow2, #32] fmla v4.4s, v28.4s, alphaV0 fmla v5.4s, v29.4s, alphaV0 stp q4, q5, [pCRow3] fmla v6.4s, v30.4s, alphaV0 fmla v7.4s, v31.4s, alphaV0 stp q6, q7, [pCRow3, #32] add pCRow0, pCRow0, #64 add pCRow1, pCRow1, #64 add pCRow2, pCRow2, #64 add pCRow3, pCRow3, #64 .endm /******************************************************************************/ .macro INIT8x4 fmov s16, wzr fmov s17, wzr fmov s20, wzr 
fmov s21, s16 fmov s24, wzr fmov s25, s16 fmov s28, wzr fmov s29, s16 .endm .macro KERNEL8x4_I ldp s8, s9, [pB], #8 ldp s10, s11, [pB], #8 ldr q0, [pA], #16 ldr q1, [pA], #16 fmul v16.4s, v0.4s, v8.s[0] fmul v17.4s, v1.4s, v8.s[0] fmul v20.4s, v0.4s, v9.s[0] fmul v21.4s, v1.4s, v9.s[0] fmul v24.4s, v0.4s, v10.s[0] fmul v25.4s, v1.4s, v10.s[0] fmul v28.4s, v0.4s, v11.s[0] fmul v29.4s, v1.4s, v11.s[0] ldp s12, s13, [pB], #8 ldp s14, s15, [pB], #8 ldr q4, [pA], #16 ldr q5, [pA], #16 .endm .macro KERNEL8x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v20.4s, v0.4s, v9.s[0] fmla v21.4s, v1.4s, v9.s[0] fmla v24.4s, v0.4s, v10.s[0] fmla v25.4s, v1.4s, v10.s[0] fmla v28.4s, v0.4s, v11.s[0] fmla v29.4s, v1.4s, v11.s[0] ldp s12, s13, [pB], #8 ldp s14, s15, [pB], #8 ldr q4, [pA], #16 ldr q5, [pA], #16 .endm .macro KERNEL8x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] fmla v20.4s, v4.4s, v13.s[0] fmla v21.4s, v5.4s, v13.s[0] fmla v24.4s, v4.4s, v14.s[0] fmla v25.4s, v5.4s, v14.s[0] fmla v28.4s, v4.4s, v15.s[0] fmla v29.4s, v5.4s, v15.s[0] ldp s8, s9, [pB], #8 ldp s10, s11, [pB], #8 ldr q0, [pA], #16 ldr q1, [pA], #16 .endm .macro KERNEL8x4_E fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] fmla v20.4s, v4.4s, v13.s[0] fmla v21.4s, v5.4s, v13.s[0] fmla v24.4s, v4.4s, v14.s[0] fmla v25.4s, v5.4s, v14.s[0] fmla v28.4s, v4.4s, v15.s[0] fmla v29.4s, v5.4s, v15.s[0] .endm .macro KERNEL8x4_SUB ldp s8, s9, [pB], #8 ldp s10, s11, [pB], #8 ldr q0, [pA], #16 ldr q1, [pA], #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v20.4s, v0.4s, v9.s[0] fmla v21.4s, v1.4s, v9.s[0] fmla v24.4s, v0.4s, v10.s[0] fmla v25.4s, v1.4s, v10.s[0] fmla v28.4s, v0.4s, v11.s[0] fmla v29.4s, v1.4s, v11.s[0] .endm .macro SAVE8x4 fmov alpha0, alpha ldp q0, q1, [pCRow0] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV0 stp q0, q1, [pCRow0] add pCRow0, pCRow0, #32 ldp q2, q3, [pCRow1] fmla v2.4s, v20.4s, alphaV0 fmla v3.4s, v21.4s, alphaV0 stp q2, q3, [pCRow1] add pCRow1, pCRow1, #32 ldp q4, q5, [pCRow2] fmla v4.4s, v24.4s, alphaV0 fmla v5.4s, v25.4s, alphaV0 stp q4, q5, [pCRow2] add pCRow2, pCRow2, #32 ldp q6, q7, [pCRow3] fmla v6.4s, v28.4s, alphaV0 fmla v7.4s, v29.4s, alphaV0 stp q6, q7, [pCRow3] add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT4x4 fmov s16, wzr fmov s20, wzr fmov s24, wzr fmov s28, wzr .endm .macro KERNEL4x4_I ldp s8, s9, [pB], #8 ldp s10, s11, [pB], #8 ldr q0, [pA], #16 fmul v16.4s, v0.4s, v8.s[0] fmul v20.4s, v0.4s, v9.s[0] fmul v24.4s, v0.4s, v10.s[0] fmul v28.4s, v0.4s, v11.s[0] ldp s12, s13, [pB], #8 ldp s14, s15, [pB], #8 ldr q1, [pA], #16 .endm .macro KERNEL4x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v20.4s, v0.4s, v9.s[0] fmla v24.4s, v0.4s, v10.s[0] fmla v28.4s, v0.4s, v11.s[0] ldp s12, s13, [pB], #8 ldp s14, s15, [pB], #8 ldr q1, [pA], #16 .endm .macro KERNEL4x4_M2 fmla v16.4s, v1.4s, v12.s[0] fmla v20.4s, v1.4s, v13.s[0] fmla v24.4s, v1.4s, v14.s[0] fmla v28.4s, v1.4s, v15.s[0] ldp s8, s9, [pB], #8 ldp s10, s11, [pB], #8 ldr q0, [pA], #16 .endm .macro KERNEL4x4_E fmla v16.4s, v1.4s, v12.s[0] fmla v20.4s, v1.4s, v13.s[0] fmla v24.4s, v1.4s, v14.s[0] fmla v28.4s, v1.4s, v15.s[0] .endm .macro KERNEL4x4_SUB ldp s8, s9, [pB], #8 ldp s10, s11, [pB], #8 ldr q0, [pA], #16 fmla v16.4s, v0.4s, v8.s[0] fmla v20.4s, v0.4s, v9.s[0] fmla v24.4s, v0.4s, v10.s[0] fmla v28.4s, v0.4s, v11.s[0] .endm .macro SAVE4x4 fmov alpha0, alpha ldr q0, [pCRow0] fmla v0.4s, v16.4s, alphaV0 str q0, 
[pCRow0] add pCRow0, pCRow0, #16 ldr q1, [pCRow1] fmla v1.4s, v20.4s, alphaV0 str q1, [pCRow1] add pCRow1, pCRow1, #16 ldr q2, [pCRow2] fmla v2.4s, v24.4s, alphaV0 str q2, [pCRow2] add pCRow2, pCRow2, #16 ldr q3, [pCRow3] fmla v3.4s, v28.4s, alphaV0 str q3, [pCRow3] add pCRow3, pCRow3, #16 .endm /******************************************************************************/ .macro INIT2x4 fmov s16, wzr fmov s20, s16 fmov s24, s20 fmov s28, s16 .endm .macro KERNEL2x4_SUB ldp s8, s9, [pB], #8 ldp s10, s11, [pB], #8 ldr d0, [pA], #8 fmla v16.2s, v0.2s, v8.s[0] fmla v20.2s, v0.2s, v9.s[0] fmla v24.2s, v0.2s, v10.s[0] fmla v28.2s, v0.2s, v11.s[0] .endm .macro SAVE2x4 fmov alpha0, alpha ldr d0, [pCRow0] fmla v0.2s, v16.2s, alphaV0 str d0, [pCRow0] add pCRow0, pCRow0, #8 ldr d1, [pCRow1] fmla v1.2s, v20.2s, alphaV0 str d1, [pCRow1] add pCRow1, pCRow1, #8 ldr d0, [pCRow2] fmla v0.2s, v24.2s, alphaV0 str d0, [pCRow2] add pCRow2, pCRow2, #8 ldr d1, [pCRow3] fmla v1.2s, v28.2s, alphaV0 str d1, [pCRow3] add pCRow3, pCRow3, #8 .endm /******************************************************************************/ .macro INIT1x4 fmov s16, wzr fmov s20, s16 .endm .macro KERNEL1x4_SUB ldr s0, [pA] add pA, pA, #4 ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 fmla v16.2s, v8.2s, v0.s[0] fmla v20.2s, v9.2s, v0.s[0] .endm .macro SAVE1x4 fmov alpha0, alpha ld1 {v8.s}[0], [pCRow0] ld1 {v8.s}[1], [pCRow1] fmla v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], [pCRow0] st1 {v8.s}[1], [pCRow1] add pCRow0, pCRow0, #4 add pCRow1, pCRow1, #4 ld1 {v12.s}[0], [pCRow2] ld1 {v12.s}[1], [pCRow3] fmla v12.2s, v20.2s, alphaV0 st1 {v12.s}[0], [pCRow2] st1 {v12.s}[1], [pCRow3] add pCRow2, pCRow2, #4 add pCRow3, pCRow3, #4 .endm /******************************************************************************/ .macro INIT16x2 fmov s16, wzr fmov s17, wzr fmov s18, wzr fmov s19, s16 fmov s20, wzr fmov s21, s16 fmov s22, wzr fmov s23, s16 .endm .macro KERNEL16x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 ld1 {v2.4s}, [pA] add pA, pA, #16 ld1 {v3.4s}, [pA] add pA, pA, #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v18.4s, v2.4s, v8.s[0] fmla v19.4s, v3.4s, v8.s[0] fmla v20.4s, v0.4s, v8.s[1] fmla v21.4s, v1.4s, v8.s[1] fmla v22.4s, v2.4s, v8.s[1] fmla v23.4s, v3.4s, v8.s[1] .endm .macro SAVE16x2 fmov alpha0, alpha add pCRow1, pCRow0, LDC ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV0 fmla v2.4s, v18.4s, alphaV0 fmla v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0 fmla v5.4s, v21.4s, alphaV0 fmla v6.4s, v22.4s, alphaV0 fmla v7.4s, v23.4s, alphaV0 st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT8x2 fmov s16, wzr fmov s17, s16 fmov s20, s17 fmov s21, s16 .endm .macro KERNEL8x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v20.4s, v0.4s, v8.s[1] fmla v21.4s, v1.4s, v8.s[1] .endm .macro SAVE8x2 fmov alpha0, alpha add pCRow1, pCRow0, LDC ld1 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow2, pCRow1, LDC ld1 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0 fmla v5.4s, v21.4s, alphaV0 st1 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, 
#32 .endm /******************************************************************************/ .macro INIT4x2 fmov s16, wzr fmov s17, s16 fmov s20, s17 fmov s21, s16 .endm .macro KERNEL4x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 fmla v16.2s, v0.2s, v8.s[0] fmla v17.2s, v1.2s, v8.s[0] fmla v20.2s, v0.2s, v8.s[1] fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 fmov alpha0, alpha ld1 {v8.2s, v9.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 fmla v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2s, v13.2s}, [pCRow1] fmla v12.2s, v20.2s, alphaV0 fmla v13.2s, v21.2s, alphaV0 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT2x2 fmov s16, wzr fmov s20, s16 .endm .macro KERNEL2x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.2s}, [pA] add pA, pA, #8 fmla v16.2s, v0.2s, v8.s[0] fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 fmov alpha0, alpha ld1 {v8.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow1 , pCRow0, LDC ld1 {v12.2s}, [pCRow1] fmla v12.2s, v20.2s, alphaV0 st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT1x2 fmov s16, wzr .endm .macro KERNEL1x2_SUB ld1 {v8.2s} , [pB] add pB , pB, #8 ldr s0 , [pA] add pA, pA, #4 fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 fmov alpha0, alpha add pCRow1 , pCRow0, LDC ld1 {v8.s}[0], [pCRow0] ld1 {v8.s}[1], [pCRow1] fmla v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], [pCRow0] st1 {v8.s}[1], [pCRow1] add pCRow0, pCRow0, #4 .endm /******************************************************************************/ .macro INIT16x1 fmov s16, wzr fmov s17, wzr fmov s18, wzr fmov s19, s16 .endm .macro KERNEL16x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 ld1 {v2.4s}, [pA] add pA, pA, #16 ld1 {v3.4s}, [pA] add pA, pA, #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v18.4s, v2.4s, v8.s[0] fmla v19.4s, v3.4s, v8.s[0] .endm .macro SAVE16x1 fmov alpha0, alpha ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV0 fmla v2.4s, v18.4s, alphaV0 fmla v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT8x1 fmov s16, wzr fmov s17, wzr .endm .macro KERNEL8x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] .endm .macro SAVE8x1 fmov alpha0, alpha ld1 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT4x1 fmov s16, wzr fmov s17, s16 .endm .macro KERNEL4x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 fmla v16.2s, v0.2s, v8.s[0] fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 fmov alpha0, alpha ld1 {v8.2s, v9.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 fmla v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT2x1 fmov s16, wzr .endm .macro KERNEL2x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.2s}, [pA] add pA , pA, #8 fmla v16.2s, 
v0.2s, v8.s[0] .endm .macro SAVE2x1 fmov alpha0, alpha ld1 {v8.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT1x1 fmov s16, wzr .endm .macro KERNEL1x1_SUB ldr s8, [pB] add pB , pB, #4 ldr s0, [pA] add pA , pA, #4 fmadd s16, s0, s8, s16 .endm .macro SAVE1x1 fmov alpha0, alpha ldr s8, [pCRow0] fmla s8, s16, alphaV0 str s8, [pCRow0] add pCRow0, pCRow0, #4 .endm .macro KERNEL16x4_M1_M2_x1 KERNEL16x4_M1 KERNEL16x4_M2 .endm .macro KERNEL16x4_M1_M2_x2 KERNEL16x4_M1_M2_x1 KERNEL16x4_M1_M2_x1 .endm .macro KERNEL16x4_M1_M2_x4 KERNEL16x4_M1_M2_x2 KERNEL16x4_M1_M2_x2 .endm .macro KERNEL16x4_M1_M2_x8 KERNEL16x4_M1_M2_x4 KERNEL16x4_M1_M2_x4 .endm .macro KERNEL16x4_M1_M2_x16 KERNEL16x4_M1_M2_x8 KERNEL16x4_M1_M2_x8 .endm .macro KERNEL16x4_M1_M2_x32 KERNEL16x4_M1_M2_x16 KERNEL16x4_M1_M2_x16 .endm .macro KERNEL16x4_M1_M2_x64 KERNEL16x4_M1_M2_x32 KERNEL16x4_M1_M2_x32 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE sgemm_kernel_begin: .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPA] fmov alpha, s0 lsl LDC, LDC, #2 // ldc = ldc * 4 mov pB, origPB mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble sgemm_kernel_L2_BEGIN /******************************************************************************/ sgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC add pCRow3, pCRow2, LDC add pC, pCRow3, LDC mov pA, origPA // pA = start of A array sgemm_kernel_L4_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 ble sgemm_kernel_L4_M8_BEGIN .align 5 sgemm_kernel_L4_M16_20: mov pB, origPB asr counterL , origK, #4 // L = K / 16 cmp counterL , #2 blt sgemm_kernel_L4_M16_32 KERNEL16x4_I KERNEL16x4_M2 KERNEL16x4_M1_M2_x4 KERNEL16x4_M1_M2_x2 KERNEL16x4_M1_M2_x1 subs counterL, counterL, #2 ble sgemm_kernel_L4_M16_22a .align 5 sgemm_kernel_L4_M16_22: KERNEL16x4_M1_M2_x8 subs counterL, counterL, #1 bgt sgemm_kernel_L4_M16_22 .align 5 sgemm_kernel_L4_M16_22a: KERNEL16x4_M1_M2_x4 KERNEL16x4_M1_M2_x2 KERNEL16x4_M1_M2_x1 KERNEL16x4_M1 KERNEL16x4_E b sgemm_kernel_L4_M16_44 .align 5 sgemm_kernel_L4_M16_32: tst counterL, #1 ble sgemm_kernel_L4_M16_40 KERNEL16x4_I KERNEL16x4_M2 KERNEL16x4_M1_M2_x4 KERNEL16x4_M1_M2_x2 KERNEL16x4_M1 KERNEL16x4_E b sgemm_kernel_L4_M16_44 sgemm_kernel_L4_M16_40: INIT16x4 sgemm_kernel_L4_M16_44: ands counterL , origK, #15 ble sgemm_kernel_L4_M16_100 .align 5 sgemm_kernel_L4_M16_46: KERNEL16x4_SUB subs counterL, counterL, #1 bne sgemm_kernel_L4_M16_46 sgemm_kernel_L4_M16_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE16x4 sgemm_kernel_L4_M16_END: subs counterI, counterI, #1 bne sgemm_kernel_L4_M16_20 //------------------------------------------------------------------------------ sgemm_kernel_L4_M8_BEGIN: mov counterI, origM tst counterI , #15 ble sgemm_kernel_L4_END tst counterI, #8 ble sgemm_kernel_L4_M4_BEGIN 
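/*
 * Reference sketch in C (illustrative comment only, never assembled or built):
 * the net effect of one 16x4 micro-tile computed by the macros above, assuming
 * packed A and B supply 16 resp. 4 consecutive floats per k step and alpha is
 * applied only at write-back, as SAVE16x4 does. The names acc, packedA and
 * packedB are placeholders, not symbols used by this kernel; the 8/4/2/1-row
 * tiles handled below follow the same pattern with a narrower row count.
 *
 *   float acc[4][16] = {{0}};                 // INIT16x4
 *   for (long k = 0; k < K; k++)              // KERNEL16x4_SUB / _I/_M1/_M2/_E
 *       for (int j = 0; j < 4; j++)
 *           for (int i = 0; i < 16; i++)
 *               acc[j][i] += packedA[16*k + i] * packedB[4*k + j];
 *   for (int j = 0; j < 4; j++)               // SAVE16x4: column j is pCRow_j
 *       for (int i = 0; i < 16; i++)
 *           C[j*ldc + i] += alpha * acc[j][i];
 */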
sgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? blt sgemm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K subs counterL, counterL, #2 ble sgemm_kernel_L4_M8_22a .align 5 sgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 bgt sgemm_kernel_L4_M8_22 sgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E b sgemm_kernel_L4_M8_44 sgemm_kernel_L4_M8_32: tst counterL, #1 ble sgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_E b sgemm_kernel_L4_M8_44 sgemm_kernel_L4_M8_40: INIT8x4 sgemm_kernel_L4_M8_44: ands counterL , origK, #1 ble sgemm_kernel_L4_M8_100 sgemm_kernel_L4_M8_46: KERNEL8x4_SUB sgemm_kernel_L4_M8_100: SAVE8x4 sgemm_kernel_L4_M8_END: //------------------------------------------------------------------------------ sgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 ble sgemm_kernel_L4_END tst counterI, #4 ble sgemm_kernel_L4_M2_BEGIN sgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? blt sgemm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 ble sgemm_kernel_L4_M4_22a .align 5 sgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 bgt sgemm_kernel_L4_M4_22 sgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E b sgemm_kernel_L4_M4_44 sgemm_kernel_L4_M4_32: tst counterL, #1 ble sgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E b sgemm_kernel_L4_M4_44 sgemm_kernel_L4_M4_40: INIT4x4 sgemm_kernel_L4_M4_44: ands counterL , origK, #1 ble sgemm_kernel_L4_M4_100 sgemm_kernel_L4_M4_46: KERNEL4x4_SUB sgemm_kernel_L4_M4_100: SAVE4x4 sgemm_kernel_L4_M4_END: //------------------------------------------------------------------------------ sgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble sgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble sgemm_kernel_L4_M1_BEGIN sgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L4_M2_40 sgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L4_M2_22 sgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L4_M2_100 sgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L4_M2_42 sgemm_kernel_L4_M2_100: SAVE2x4 sgemm_kernel_L4_M2_END: sgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble sgemm_kernel_L4_END sgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L4_M1_40 sgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L4_M1_22 sgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L4_M1_100 sgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L4_M1_42 sgemm_kernel_L4_M1_100: SAVE1x4 sgemm_kernel_L4_END: add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 subs counterJ, counterJ , #1 // j-- bgt sgemm_kernel_L4_BEGIN /******************************************************************************/ sgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble 
sgemm_kernel_L999 tst counterJ , #2 ble sgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC add pC,pC,LDC, lsl #1 mov pA, origPA // pA = A sgemm_kernel_L2_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI,#0 ble sgemm_kernel_L2_M8_BEGIN sgemm_kernel_L2_M16_20: INIT16x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble sgemm_kernel_L2_M16_40 .align 5 sgemm_kernel_L2_M16_22: KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M16_22 sgemm_kernel_L2_M16_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M16_100 sgemm_kernel_L2_M16_42: KERNEL16x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M16_42 sgemm_kernel_L2_M16_100: SAVE16x2 sgemm_kernel_L2_M16_END: subs counterI, counterI, #1 bgt sgemm_kernel_L2_M16_20 //------------------------------------------------------------------------------ sgemm_kernel_L2_M8_BEGIN: mov counterI, origM tst counterI , #15 ble sgemm_kernel_L2_END tst counterI, #8 ble sgemm_kernel_L2_M4_BEGIN sgemm_kernel_L2_M8_20: INIT8x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble sgemm_kernel_L2_M8_40 .align 5 sgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M8_22 sgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M8_100 sgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M8_42 sgemm_kernel_L2_M8_100: SAVE8x2 sgemm_kernel_L2_M8_END: //------------------------------------------------------------------------------ sgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 ble sgemm_kernel_L2_END tst counterI, #4 ble sgemm_kernel_L2_M2_BEGIN sgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble sgemm_kernel_L2_M4_40 .align 5 sgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M4_22 sgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M4_100 sgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M4_42 sgemm_kernel_L2_M4_100: SAVE4x2 sgemm_kernel_L2_M4_END: //------------------------------------------------------------------------------ sgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble sgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble sgemm_kernel_L2_M1_BEGIN sgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble sgemm_kernel_L2_M2_40 sgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M2_22 sgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M2_100 sgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M2_42 sgemm_kernel_L2_M2_100: SAVE2x2 sgemm_kernel_L2_M2_END: sgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble sgemm_kernel_L2_END sgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL 
= counterL / 8 cmp counterL, #0 ble sgemm_kernel_L2_M1_40 sgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M1_22 sgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M1_100 sgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M1_42 sgemm_kernel_L2_M1_100: SAVE1x2 sgemm_kernel_L2_END: add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 /******************************************************************************/ sgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble sgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next mov pA, origPA // pA = A sgemm_kernel_L1_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 ble sgemm_kernel_L1_M8_BEGIN sgemm_kernel_L1_M16_20: INIT16x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M16_40 .align 5 sgemm_kernel_L1_M16_22: KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M16_22 sgemm_kernel_L1_M16_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M16_100 sgemm_kernel_L1_M16_42: KERNEL16x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M16_42 sgemm_kernel_L1_M16_100: SAVE16x1 sgemm_kernel_L1_M16_END: subs counterI, counterI, #1 bgt sgemm_kernel_L1_M16_20 //------------------------------------------------------------------------------ sgemm_kernel_L1_M8_BEGIN: mov counterI, origM tst counterI , #15 ble sgemm_kernel_L1_END tst counterI, #8 ble sgemm_kernel_L1_M4_BEGIN sgemm_kernel_L1_M8_20: INIT8x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M8_40 .align 5 sgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M8_22 sgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M8_100 sgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M8_42 sgemm_kernel_L1_M8_100: SAVE8x1 sgemm_kernel_L1_M8_END: //------------------------------------------------------------------------------ sgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 ble sgemm_kernel_L1_END tst counterI, #4 ble sgemm_kernel_L1_M2_BEGIN sgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M4_40 .align 5 sgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M4_22 sgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M4_100 sgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M4_42 sgemm_kernel_L1_M4_100: SAVE4x1 sgemm_kernel_L1_M4_END: //------------------------------------------------------------------------------ sgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 ble sgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 ble sgemm_kernel_L1_M1_BEGIN sgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , 
origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M2_40 sgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M2_22 sgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M2_100 sgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M2_42 sgemm_kernel_L1_M2_100: SAVE2x1 sgemm_kernel_L1_M2_END: sgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble sgemm_kernel_L1_END sgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M1_40 sgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M1_22 sgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M1_100 sgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M1_42 sgemm_kernel_L1_M1_100: SAVE1x1 sgemm_kernel_L1_END: sgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/sgemm_kernel_4x4.S000066400000000000000000000714351313527062700206620ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 X3 x4 x5 x6 */ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define temp x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pA_0 x15 #define pA_1 x16 #define pA_2 x17 #define pA_3 x18 #define alpha0 s10 #define alphaV0 v10.s[0] #define alpha1 s11 #define alphaV1 v11.s[0] #define alpha2 s14 #define alphaV2 v14.s[0] #define alpha3 s15 #define alphaV3 v15.s[0] // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 offset -> temp // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pA_0 // 16 pA_1 // 17 pA_2 // 18 must save pA_3 // 19 must save // 20 must save // 21 must save // 22 must save // 23 must save // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp /***************************** FOR 16x4 ***************************************/ //v00 ALPHA -> pA00_0, pA01_0, pA02_0, pA03_0 //v01 pA10_0, pA11_0, pA12_0, pA13_0 //v02 pA00_1, pA01_1, pA02_1, pA03_1 //v03 pA10_1, pA11_1, pA12_1, pA13_1 //v04 pA00_2, pA01_2, pA02_2, pA03_2 //v05 pA10_2, pA11_2, pA12_2, pA13_2 //v06 pA00_3, pA01_3, pA02_3, pA03_3 //v07 pA10_3, pA11_3, pA12_3, pA13_3 //v08 must save pB00, pB01, pB02, pB03 //v09 must save //v10 must save ALPHA0 //v11 must save ALPHA1 //v12 must save pB10, pB11, pB12, pB13 //v13 must save //v14 must save ALPHA2 //v15 must save ALPHA3 //v16 must save C00_0, C01_0, C02_0, C03_0 //v17 must save C10_0, C11_0, C12_0, C13_0 //v18 C20_0, C21_0, C22_0, C23_0 //v19 C30_0, C31_0, C32_0, C33_0 //v20 C00_1, C01_1, C02_1, C03_1 //v21 C10_1, C11_1, C12_1, C13_1 //v22 C20_1, C21_1, C22_1, C23_1 //v23 C30_1, C31_1, C32_1, C33_1 //v24 C00_2, C01_2, C02_2, C03_2 //v25 C10_2, C11_2, C12_2, C13_2 //v26 C20_2, C21_2, C22_2, C23_2 //v27 C30_2, C31_2, C32_2, C33_2 //v28 C00_3, C01_3, C02_3, C03_3 //v29 C10_3, C11_3, C12_3, C13_3 //v30 C20_3, C21_3, C22_3, C23_3 //v31 C30_3, C31_3, C32_3, C33_3 /***************************** EXCEPT FOR 16x4 ********************************/ //v00 ALPHA -> pA00, pA01 //v01 pA02, pA03 //v02 ppA00, ppA01 //v03 ppA02, ppA03 //v04 pA10, pA11 //v05 pA12, pA13 //v06 ppA10, ppA11 //v07 ppA12, ppA13 //v08 must save pB00, pB01 //v09 must save pB02, pB03 //v10 must save ALPHA0 //v11 must save ALPHA1 //v12 must save pB10, pB11 //v13 must save pB12, pB13 //v14 must save ALPHA2 //v15 must save ALPHA3 //v16 must save C00, C01 //v17 must save C02, C03 //v18 ppC00, ppC01 //v19 ppC02, ppC03 //v20 C10, C11 //v21 C12, C13 //v22 ppC10, ppC11 //v23 ppC12, ppC13 //v24 C20, C21 //v25 C22, C23 //v26 ppC20, ppC21 //v27 ppC22, ppC23 //v28 C30, C31 //v29 C32, C33 //v30 ppC30, ppC31 //v31 ppC32, ppC33 /******************************************************************************* * Macro definitions *******************************************************************************/ .macro INIT16x4 fmov s16, wzr fmov s17, s16 fmov s18, s17 fmov s19, s16 fmov s20, s17 fmov s21, s16 fmov s22, s17 fmov s23, s16 fmov s24, s17 fmov s25, s16 fmov s26, s17 fmov s27, s16 fmov s28, s17 fmov s29, s16 fmov s30, s17 fmov s31, s16 .endm .macro KERNEL16x4_I ld1 {v8.4s}, [pB] add pB, 
pB, #16 ld1 {v0.4s}, [pA_0] add pA_0, pA_0, #16 fmul v16.4s, v0.4s, v8.s[0] fmul v20.4s, v0.4s, v8.s[1] ld1 {v2.4s}, [pA_1] add pA_1, pA_1, #16 fmul v24.4s, v0.4s, v8.s[2] fmul v28.4s, v0.4s, v8.s[3] ld1 {v4.4s}, [pA_2] add pA_2, pA_2, #16 fmul v17.4s, v2.4s, v8.s[0] fmul v21.4s, v2.4s, v8.s[1] ld1 {v6.4s}, [pA_3] add pA_3, pA_3, #16 fmul v25.4s, v2.4s, v8.s[2] fmul v29.4s, v2.4s, v8.s[3] ld1 {v12.4s}, [pB] // for next round add pB, pB, #16 fmul v18.4s, v4.4s, v8.s[0] fmul v19.4s, v6.4s, v8.s[0] ld1 {v1.4s}, [pA_0] // for next round add pA_0, pA_0, #16 fmul v22.4s, v4.4s, v8.s[1] fmul v23.4s, v6.4s, v8.s[1] ld1 {v3.4s}, [pA_1] // for next round add pA_1, pA_1, #16 fmul v26.4s, v4.4s, v8.s[2] fmul v27.4s, v6.4s, v8.s[2] ld1 {v5.4s}, [pA_2] // for next round add pA_2, pA_2, #16 fmul v30.4s, v4.4s, v8.s[3] fmul v31.4s, v6.4s, v8.s[3] ld1 {v7.4s}, [pA_3] // for next round add pA_3, pA_3, #16 .endm .macro KERNEL16x4_M2 fmla v16.4s, v1.4s, v12.s[0] fmla v17.4s, v3.4s, v12.s[0] ld1 {v8.4s}, [pB] // for next round add pB, pB, #16 fmla v18.4s, v5.4s, v12.s[0] fmla v19.4s, v7.4s, v12.s[0] ld1 {v0.4s}, [pA_0] // for next round add pA_0, pA_0, #16 fmla v20.4s, v1.4s, v12.s[1] fmla v21.4s, v3.4s, v12.s[1] ld1 {v2.4s}, [pA_1] // for next round add pA_1, pA_1, #16 fmla v22.4s, v5.4s, v12.s[1] fmla v23.4s, v7.4s, v12.s[1] ld1 {v4.4s}, [pA_2] // for next round add pA_2, pA_2, #16 fmla v24.4s, v1.4s, v12.s[2] fmla v25.4s, v3.4s, v12.s[2] ld1 {v6.4s}, [pA_3] // for next round add pA_3, pA_3, #16 fmla v26.4s, v5.4s, v12.s[2] fmla v27.4s, v7.4s, v12.s[2] prfm PLDL1KEEP, [pA_2, #512] fmla v28.4s, v1.4s, v12.s[3] fmla v29.4s, v3.4s, v12.s[3] prfm PLDL1KEEP, [pA_3, #512] fmla v30.4s, v5.4s, v12.s[3] fmla v31.4s, v7.4s, v12.s[3] prfm PLDL1KEEP, [pB, #512] .endm .macro KERNEL16x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v2.4s, v8.s[0] ld1 {v12.4s}, [pB] // for next round add pB, pB, #16 fmla v18.4s, v4.4s, v8.s[0] fmla v19.4s, v6.4s, v8.s[0] ld1 {v1.4s}, [pA_0] // for next round add pA_0, pA_0, #16 fmla v20.4s, v0.4s, v8.s[1] fmla v21.4s, v2.4s, v8.s[1] ld1 {v3.4s}, [pA_1] // for next round add pA_1, pA_1, #16 fmla v22.4s, v4.4s, v8.s[1] fmla v23.4s, v6.4s, v8.s[1] ld1 {v5.4s}, [pA_2] // for next round add pA_2, pA_2, #16 fmla v24.4s, v0.4s, v8.s[2] fmla v25.4s, v2.4s, v8.s[2] ld1 {v7.4s}, [pA_3] // for next round add pA_3, pA_3, #16 fmla v26.4s, v4.4s, v8.s[2] fmla v27.4s, v6.4s, v8.s[2] prfm PLDL1KEEP, [pA_0, #512] fmla v28.4s, v0.4s, v8.s[3] fmla v29.4s, v2.4s, v8.s[3] prfm PLDL1KEEP, [pA_1, #512] fmla v30.4s, v4.4s, v8.s[3] fmla v31.4s, v6.4s, v8.s[3] .endm .macro KERNEL16x4_E fmla v16.4s, v1.4s, v12.s[0] fmla v17.4s, v3.4s, v12.s[0] fmla v18.4s, v5.4s, v12.s[0] fmla v19.4s, v7.4s, v12.s[0] fmla v20.4s, v1.4s, v12.s[1] fmla v21.4s, v3.4s, v12.s[1] fmla v22.4s, v5.4s, v12.s[1] fmla v23.4s, v7.4s, v12.s[1] fmla v24.4s, v1.4s, v12.s[2] fmla v25.4s, v3.4s, v12.s[2] fmla v26.4s, v5.4s, v12.s[2] fmla v27.4s, v7.4s, v12.s[2] fmla v28.4s, v1.4s, v12.s[3] fmla v29.4s, v3.4s, v12.s[3] fmla v30.4s, v5.4s, v12.s[3] fmla v31.4s, v7.4s, v12.s[3] .endm .macro KERNEL16x4_SUB ld1 {v8.4s}, [pB] add pB, pB, #16 ld1 {v0.4s}, [pA_0] add pA_0, pA_0, #16 fmla v16.4s, v0.4s, v8.s[0] fmla v20.4s, v0.4s, v8.s[1] fmla v24.4s, v0.4s, v8.s[2] fmla v28.4s, v0.4s, v8.s[3] ld1 {v2.4s}, [pA_1] add pA_1, pA_1, #16 fmla v17.4s, v2.4s, v8.s[0] fmla v21.4s, v2.4s, v8.s[1] fmla v25.4s, v2.4s, v8.s[2] fmla v29.4s, v2.4s, v8.s[3] ld1 {v4.4s}, [pA_2] add pA_2, pA_2, #16 fmla v18.4s, v4.4s, v8.s[0] fmla v22.4s, v4.4s, v8.s[1] fmla v26.4s, v4.4s, 
v8.s[2] fmla v30.4s, v4.4s, v8.s[3] ld1 {v6.4s}, [pA_3] add pA_3, pA_3, #16 fmla v19.4s, v6.4s, v8.s[0] fmla v23.4s, v6.4s, v8.s[1] fmla v27.4s, v6.4s, v8.s[2] fmla v31.4s, v6.4s, v8.s[3] .endm .macro SAVE16x4 mov pCRow1, pCRow0 ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV1 fmla v2.4s, v18.4s, alphaV2 fmla v3.4s, v19.4s, alphaV3 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1] add pCRow1, pCRow1, LDC ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0 fmla v5.4s, v21.4s, alphaV1 fmla v6.4s, v22.4s, alphaV2 fmla v7.4s, v23.4s, alphaV3 st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] add pCRow1, pCRow1, LDC ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1] fmla v0.4s, v24.4s, alphaV0 fmla v1.4s, v25.4s, alphaV1 fmla v2.4s, v26.4s, alphaV2 fmla v3.4s, v27.4s, alphaV3 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1] add pCRow1, pCRow1, LDC ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] fmla v4.4s, v28.4s, alphaV0 fmla v5.4s, v29.4s, alphaV1 fmla v6.4s, v30.4s, alphaV2 fmla v7.4s, v31.4s, alphaV3 st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT8x4 fmov s16, wzr fmov s17, s16 fmov s18, s17 fmov s19, s16 fmov s20, s17 fmov s21, s16 fmov s22, s17 fmov s23, s16 fmov s24, s17 fmov s25, s16 fmov s26, s17 fmov s27, s16 fmov s28, s17 fmov s29, s16 fmov s30, s17 fmov s31, s16 .endm .macro KERNEL8x4_SUB ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld1 {v0.2s, v1.2s}, [pA_0] add pA_0, pA_0, #16 fmla v16.2s, v0.2s, v8.s[0] fmla v29.2s, v1.2s, v9.s[1] fmla v20.2s, v0.2s, v8.s[1] fmla v25.2s, v1.2s, v9.s[0] ld1 {v2.2s, v3.2s}, [pA_1] add pA_1, pA_1, #16 fmla v24.2s, v0.2s, v9.s[0] fmla v21.2s, v1.2s, v8.s[1] fmla v28.2s, v0.2s, v9.s[1] fmla v17.2s, v1.2s, v8.s[0] fmla v18.2s, v2.2s, v8.s[0] fmla v31.2s, v3.2s, v9.s[1] fmla v22.2s, v2.2s, v8.s[1] fmla v27.2s, v3.2s, v9.s[0] fmla v26.2s, v2.2s, v9.s[0] fmla v23.2s, v3.2s, v8.s[1] fmla v30.2s, v2.2s, v9.s[1] fmla v19.2s, v3.2s, v8.s[0] .endm .macro SAVE8x4 mov pCRow1, pCRow0 ld1 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0 fmla v1.2s, v17.2s, alphaV1 st1 {v0.2s, v1.2s}, [pCRow1] add pCRow2, pCRow1, LDC add pCRow1, pCRow1, #16 ld1 {v2.2s, v3.2s}, [pCRow1] fmla v2.2s, v18.2s, alphaV2 fmla v3.2s, v19.2s, alphaV3 st1 {v2.2s, v3.2s}, [pCRow1] ld1 {v4.2s, v5.2s}, [pCRow2] fmla v4.2s, v20.2s, alphaV0 fmla v5.2s, v21.2s, alphaV1 st1 {v4.2s, v5.2s}, [pCRow2] add pCRow1, pCRow2, LDC add pCRow2, pCRow2, #16 ld1 {v6.2s, v7.2s}, [pCRow2] fmla v6.2s, v22.2s, alphaV2 fmla v7.2s, v23.2s, alphaV3 st1 {v6.2s, v7.2s}, [pCRow2] ld1 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v24.2s, alphaV0 fmla v1.2s, v25.2s, alphaV1 st1 {v0.2s, v1.2s}, [pCRow1] add pCRow2, pCRow1, LDC add pCRow1, pCRow1, #16 ld1 {v2.2s, v3.2s}, [pCRow1] fmla v2.2s, v26.2s, alphaV2 fmla v3.2s, v27.2s, alphaV3 st1 {v2.2s, v3.2s}, [pCRow1] ld1 {v4.2s, v5.2s}, [pCRow2] fmla v4.2s, v28.2s, alphaV0 fmla v5.2s, v29.2s, alphaV1 st1 {v4.2s, v5.2s}, [pCRow2] add pCRow2, pCRow2, #16 ld1 {v6.2s, v7.2s}, [pCRow2] fmla v6.2s, v30.2s, alphaV2 fmla v7.2s, v31.2s, alphaV3 st1 {v6.2s, v7.2s}, [pCRow2] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT4x4 fmov s16, wzr fmov s17, s16 fmov s20, s17 fmov s21, s16 fmov s24, s17 fmov s25, s16 fmov s28, s17 fmov s29, s16 .endm .macro KERNEL4x4_SUB ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld1 {v0.2s, v1.2s}, [pA_0] add pA_0, pA_0, #16 fmla v16.2s, v0.2s, v8.s[0] 
fmla v29.2s, v1.2s, v9.s[1] fmla v20.2s, v0.2s, v8.s[1] fmla v25.2s, v1.2s, v9.s[0] fmla v24.2s, v0.2s, v9.s[0] fmla v21.2s, v1.2s, v8.s[1] fmla v28.2s, v0.2s, v9.s[1] fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 ld1 {v8.2s, v9.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 fmla v9.2s, v17.2s, alphaV1 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2s, v13.2s}, [pCRow1] fmla v12.2s, v20.2s, alphaV2 fmla v13.2s, v21.2s, alphaV3 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow2, pCRow1, LDC ld1 {v8.2s, v9.2s}, [pCRow2] fmla v8.2s, v24.2s, alphaV0 fmla v9.2s, v25.2s, alphaV1 st1 {v8.2s, v9.2s}, [pCRow2] add pCRow1, pCRow2, LDC ld1 {v12.2s, v13.2s}, [pCRow1] fmla v12.2s, v28.2s, alphaV2 fmla v13.2s, v29.2s, alphaV3 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT2x4 fmov s16, wzr fmov s20, s16 fmov s24, s20 fmov s28, s16 .endm .macro KERNEL2x4_SUB ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld1 {v0.2s}, [pA_0] add pA_0, pA_0, #8 fmla v16.2s, v0.2s, v8.s[0] fmla v20.2s, v0.2s, v8.s[1] fmla v24.2s, v0.2s, v9.s[0] fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 ld1 {v8.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2s}, [pCRow1] fmla v12.2s, v20.2s, alphaV1 st1 {v12.2s}, [pCRow1] add pCRow2, pCRow1, LDC ld1 {v8.2s}, [pCRow2] fmla v8.2s, v24.2s, alphaV2 st1 {v8.2s}, [pCRow2] add pCRow1, pCRow2, LDC ld1 {v12.2s}, [pCRow1] fmla v12.2s, v28.2s, alphaV3 st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT1x4 fmov s16, wzr fmov s20, s16 .endm .macro KERNEL1x4_SUB ldr s0, [pA_0] add pA_0, pA_0, #4 ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 fmla v16.2s, v8.2s, v0.s[0] fmla v20.2s, v9.2s, v0.s[0] .endm .macro SAVE1x4 add pCRow1, pCRow0, LDC ld1 {v8.s}[0], [pCRow0] ld1 {v8.s}[1], [pCRow1] fmla v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], [pCRow0] st1 {v8.s}[1], [pCRow1] add pCRow2, pCRow1, LDC add pCRow1, pCRow2, LDC ld1 {v12.s}[0], [pCRow2] ld1 {v12.s}[1], [pCRow1] fmla v12.2s, v20.2s, alphaV1 st1 {v12.s}[0], [pCRow2] st1 {v12.s}[1], [pCRow1] add pCRow0, pCRow0, #4 .endm /******************************************************************************/ .macro INIT4x2 fmov s16, wzr fmov s17, s16 fmov s20, s17 fmov s21, s16 .endm .macro KERNEL4x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.2s, v1.2s}, [pA_0] add pA_0, pA_0, #16 fmla v16.2s, v0.2s, v8.s[0] fmla v17.2s, v1.2s, v8.s[0] fmla v20.2s, v0.2s, v8.s[1] fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 ld1 {v8.2s, v9.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 fmla v9.2s, v17.2s, alphaV1 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2s, v13.2s}, [pCRow1] fmla v12.2s, v20.2s, alphaV2 fmla v13.2s, v21.2s, alphaV3 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT2x2 fmov s16, wzr fmov s20, s16 .endm .macro KERNEL2x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.2s}, [pA_0] add pA_0, pA_0, #8 fmla v16.2s, v0.2s, v8.s[0] fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 ld1 {v8.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow1 , pCRow0, LDC ld1 {v12.2s}, [pCRow1] fmla v12.2s, v20.2s, alphaV1 st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT1x2 fmov s16, wzr .endm .macro KERNEL1x2_SUB ld1 
{v8.2s} , [pB] add pB , pB, #8 ldr s0 , [pA_0] add pA_0, pA_0, #4 fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 add pCRow1 , pCRow0, LDC ld1 {v8.s}[0], [pCRow0] ld1 {v8.s}[1], [pCRow1] fmla v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], [pCRow0] st1 {v8.s}[1], [pCRow1] add pCRow0, pCRow0, #4 .endm /******************************************************************************/ .macro INIT4x1 fmov s16, wzr fmov s17, s16 .endm .macro KERNEL4x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.2s, v1.2s}, [pA_0] add pA_0 , pA_0, #16 fmla v16.2s, v0.2s, v8.s[0] fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 ld1 {v8.2s, v9.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 fmla v9.2s, v17.2s, alphaV1 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT2x1 fmov s16, wzr .endm .macro KERNEL2x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.2s}, [pA_0] add pA_0 , pA_0, #8 fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 ld1 {v8.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT1x1 fmov s16, wzr .endm .macro KERNEL1x1_SUB ldr s8, [pB] add pB , pB, #4 ldr s0, [pA_0] add pA_0 , pA_0, #4 fmadd s16, s0, s8, s16 .endm .macro SAVE1x1 ldr s8, [pCRow0] fmadd s8, s16, alpha0, s8 str s8, [pCRow0] add pCRow0, pCRow0, #4 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] fmov alpha0, s0 fmov alpha1, s0 fmov alpha2, s0 fmov alpha3, s0 lsl LDC, LDC, #2 // ldc = ldc * 4 mov pB, origPB mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble sgemm_kernel_L2_BEGIN /******************************************************************************/ sgemm_kernel_L4_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 lsl temp, origK, #4 // k * 4 * 4 mov pA_0, origPA // pA_0 = start of A array add pA_1, temp, pA_0 add pA_2, temp, pA_1 add pA_3, temp, pA_2 sgemm_kernel_L4_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 ble sgemm_kernel_L4_M8_BEGIN sgemm_kernel_L4_M16_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
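// Software-pipelined K loop for the 16x4 block (intent inferred from the macro
// set in this file): KERNEL16x4_I primes the accumulators itself, which is why
// INIT16x4 is only reached on the path below where no _I/_E pair runs.
// KERNEL16x4_M1 and KERNEL16x4_M2 then alternate between two operand register
// groups so the loads for the next K step can overlap the fmla work of the
// current one, and KERNEL16x4_E drains the pipeline without issuing new loads.
// A leftover odd K iteration is handled by a single KERNEL16x4_SUB.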
blt sgemm_kernel_L4_M16_32 KERNEL16x4_I // do one in the K KERNEL16x4_M2 // do another in the K subs counterL, counterL, #2 ble sgemm_kernel_L4_M16_22a .align 5 sgemm_kernel_L4_M16_22: KERNEL16x4_M1 KERNEL16x4_M2 subs counterL, counterL, #1 bgt sgemm_kernel_L4_M16_22 sgemm_kernel_L4_M16_22a: KERNEL16x4_M1 KERNEL16x4_E b sgemm_kernel_L4_M16_44 sgemm_kernel_L4_M16_32: tst counterL, #1 ble sgemm_kernel_L4_M16_40 KERNEL16x4_I KERNEL16x4_E b sgemm_kernel_L4_M16_44 sgemm_kernel_L4_M16_40: INIT16x4 sgemm_kernel_L4_M16_44: ands counterL , origK, #1 ble sgemm_kernel_L4_M16_100 sgemm_kernel_L4_M16_46: KERNEL16x4_SUB sgemm_kernel_L4_M16_100: SAVE16x4 sgemm_kernel_L4_M16_END: lsl temp, origK, #4 // k * 4 * 4 = Four rows of A add pA_0, pA_0, temp add pA_0, pA_0, temp add pA_0, pA_0, temp add pA_1, pA_0, temp add pA_2, pA_1, temp add pA_3, pA_2, temp subs counterI, counterI, #1 bne sgemm_kernel_L4_M16_20 sgemm_kernel_L4_M8_BEGIN: mov counterI, origM tst counterI , #15 ble sgemm_kernel_L4_END tst counterI, #8 ble sgemm_kernel_L4_M4_BEGIN sgemm_kernel_L4_M8_20: INIT8x4 mov pB, origPB asr counterL, origK, #3 // counterL = counterL / 8 cmp counterL, #0 ble sgemm_kernel_L4_M8_40 sgemm_kernel_L4_M8_22: KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L4_M8_22 sgemm_kernel_L4_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L4_M8_100 sgemm_kernel_L4_M8_42: KERNEL8x4_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L4_M8_42 sgemm_kernel_L4_M8_100: SAVE8x4 sgemm_kernel_L4_M8_END: lsl temp, origK, #4 // k * 4 * 4 add pA_0, pA_0, temp sgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 ble sgemm_kernel_L4_END tst counterI, #4 ble sgemm_kernel_L4_M2_BEGIN sgemm_kernel_L4_M4_20: INIT4x4 mov pB, origPB asr counterL, origK, #3 // counterL = counterL / 8 cmp counterL, #0 ble sgemm_kernel_L4_M4_40 sgemm_kernel_L4_M4_22: KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L4_M4_22 sgemm_kernel_L4_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L4_M4_100 sgemm_kernel_L4_M4_42: KERNEL4x4_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L4_M4_42 sgemm_kernel_L4_M4_100: SAVE4x4 sgemm_kernel_L4_M4_END: sgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble sgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble sgemm_kernel_L4_M1_BEGIN sgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L4_M2_40 sgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L4_M2_22 sgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L4_M2_100 sgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L4_M2_42 sgemm_kernel_L4_M2_100: SAVE2x4 sgemm_kernel_L4_M2_END: sgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble sgemm_kernel_L4_END sgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L4_M1_40 sgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L4_M1_22 
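// K % 8 remainder for the 1x4 tail follows: the loop above consumed K in
// chunks of eight KERNEL1x4_SUB calls, and whatever is left is done one k at
// a time. The same unroll-by-8 plus remainder pattern is used by all of the
// smaller M tails (8x4, 4x4, 2x4, 1x4), which are selected by the bit tests
// on origM earlier in this section.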
sgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L4_M1_100 sgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L4_M1_42 sgemm_kernel_L4_M1_100: SAVE1x4 sgemm_kernel_L4_END: lsl temp, origK, #4 add origPB, origPB, temp // B = B + K * 4 * 4 subs counterJ, counterJ , #1 // j-- bgt sgemm_kernel_L4_BEGIN /******************************************************************************/ sgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble sgemm_kernel_L999 tst counterJ , #2 ble sgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC add pC,pC,LDC, lsl #1 mov pA_0, origPA // pA_0 = A sgemm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 ble sgemm_kernel_L2_M2_BEGIN sgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble sgemm_kernel_L2_M4_40 .align 5 sgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M4_22 sgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M4_100 sgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M4_42 sgemm_kernel_L2_M4_100: SAVE4x2 sgemm_kernel_L2_M4_END: subs counterI, counterI, #1 bgt sgemm_kernel_L2_M4_20 sgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble sgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble sgemm_kernel_L2_M1_BEGIN sgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble sgemm_kernel_L2_M2_40 sgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M2_22 sgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M2_100 sgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M2_42 sgemm_kernel_L2_M2_100: SAVE2x2 sgemm_kernel_L2_M2_END: sgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble sgemm_kernel_L2_END sgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 ble sgemm_kernel_L2_M1_40 sgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M1_22 sgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M1_100 sgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M1_42 sgemm_kernel_L2_M1_100: SAVE1x2 sgemm_kernel_L2_END: add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 /******************************************************************************/ sgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble sgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next mov pA_0, origPA // pA_0 = A sgemm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble sgemm_kernel_L1_M2_BEGIN sgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M4_40 .align 5 sgemm_kernel_L1_M4_22: 
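// N = 1 column tail, M = 4 block: each KERNEL4x1_SUB loads one scalar of B
// (ldr s8) and four elements of A, accumulating into v16/v17 only; the loop
// below is the usual unroll-by-8 over K, with the K % 8 remainder handled at
// sgemm_kernel_L1_M4_40.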
KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M4_22 sgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M4_100 sgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M4_42 sgemm_kernel_L1_M4_100: SAVE4x1 sgemm_kernel_L1_M4_END: subs counterI, counterI, #1 bgt sgemm_kernel_L1_M4_20 sgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 ble sgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 ble sgemm_kernel_L1_M1_BEGIN sgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M2_40 sgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M2_22 sgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M2_100 sgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M2_42 sgemm_kernel_L1_M2_100: SAVE2x1 sgemm_kernel_L1_M2_END: sgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble sgemm_kernel_L1_END sgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M1_40 sgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M1_22 sgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M1_100 sgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M1_42 sgemm_kernel_L1_M1_100: SAVE1x1 sgemm_kernel_L1_END: sgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/sgemm_kernel_8x8.S000066400000000000000000001235741313527062700206740ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 X3 x4 x5 x6 */ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define offset x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pA x15 #define temp x16 #define alpha0 s10 #define alphaV0 v10.s[0] #define alpha1 s11 #define alphaV1 v11.s[0] #define alpha2 s14 #define alphaV2 v14.s[0] #define alpha3 s15 #define alphaV3 v15.s[0] // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 offset // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pA // 16 temp // 17 // 18 must save // 19 must save // 20 must save // 21 must save // 22 must save // 23 must save // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp //v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3 //v01 pA0_4, pA0_5, pA0_6, pA0_7 //v02 pA1_0, pA1_1, pA1_2, pA1_3 //v03 pA1_4, pA1_5, pA1_6, pA1_7 //v04 pB0_0, pB0_1, pB0_2, pB0_3 //v05 pB0_4, pB0_5, pB0_6, pB0_7 //v06 pB1_0, pB1_1, pB1_2, pB1_3 //v07 pB1_4, pB1_5, pB1_6, pB1_7 //v08 must save //v09 must save //v10 must save ALPHA0 //v11 must save ALPHA1 //v12 must save //v13 must save //v14 must save ALPHA2 //v15 must save ALPHA3 //v16 must save C00, C01, C02, C03 //v17 must save C04, C05, C06, C07 //v18 C08, C09, C10, C11 //v19 C12, C13, C14, C15 //v20 C16, C17, C18, C19 //v21 C20, C21, C22, C23 //v22 C24, C25, C26, C27 //v23 C28, C29, C30, C31 //v24 C32, C33, C34, C35 //v25 C36, C37, C38, C39 //v26 C40, C41, C42, C43 //v27 C44, C45, C46, C47 //v28 C48, C49, C50, C51 //v29 C52, C53, C54, C55 //v30 C56, C57, C58, C59 //v31 C60, C61, C62, C63 /******************************************************************************* * Macro definitions *******************************************************************************/ .macro INIT8x8 fmov s16, wzr fmov s17, wzr fmov s18, s16 fmov s19, s17 fmov s20, wzr fmov s21, s16 fmov s22, s17 fmov s23, s18 fmov s24, wzr fmov s25, s16 fmov s26, s17 fmov s27, s18 fmov s28, wzr fmov s29, s16 fmov s30, s17 fmov s31, s18 .endm .macro KERNEL8x8_I ld1 {v4.4s}, [pB] add pB, pB, #16 ld1 {v5.4s}, [pB] add pB, pB, #16 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 fmul v16.4s, v0.4s, v4.s[0] fmul v17.4s, v1.4s, v4.s[0] fmul v18.4s, v0.4s, v4.s[1] fmul v19.4s, v1.4s, v4.s[1] fmul v20.4s, v0.4s, v4.s[2] fmul v21.4s, v1.4s, v4.s[2] fmul v22.4s, v0.4s, v4.s[3] fmul v23.4s, v1.4s, v4.s[3] fmul v24.4s, v0.4s, v5.s[0] fmul v25.4s, v1.4s, v5.s[0] fmul v26.4s, v0.4s, v5.s[1] fmul v27.4s, v1.4s, v5.s[1] fmul v28.4s, v0.4s, v5.s[2] fmul v29.4s, v1.4s, v5.s[2] fmul v30.4s, v0.4s, v5.s[3] fmul v31.4s, v1.4s, 
v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 ld1 {v7.4s}, [pB] add pB, pB, #16 ld1 {v2.4s}, [pA] add pA, pA, #16 ld1 {v3.4s}, [pA] add pA, pA, #16 .endm .macro KERNEL8x8_M1 fmla v16.4s, v0.4s, v4.s[0] fmla v17.4s, v1.4s, v4.s[0] fmla v18.4s, v0.4s, v4.s[1] fmla v19.4s, v1.4s, v4.s[1] fmla v20.4s, v0.4s, v4.s[2] fmla v21.4s, v1.4s, v4.s[2] fmla v22.4s, v0.4s, v4.s[3] fmla v23.4s, v1.4s, v4.s[3] fmla v24.4s, v0.4s, v5.s[0] fmla v25.4s, v1.4s, v5.s[0] fmla v26.4s, v0.4s, v5.s[1] fmla v27.4s, v1.4s, v5.s[1] fmla v28.4s, v0.4s, v5.s[2] fmla v29.4s, v1.4s, v5.s[2] fmla v30.4s, v0.4s, v5.s[3] fmla v31.4s, v1.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 ld1 {v7.4s}, [pB] add pB, pB, #16 ld1 {v2.4s}, [pA] add pA, pA, #16 ld1 {v3.4s}, [pA] add pA, pA, #16 .endm .macro KERNEL8x8_M2 fmla v16.4s, v2.4s, v6.s[0] fmla v17.4s, v3.4s, v6.s[0] fmla v18.4s, v2.4s, v6.s[1] fmla v19.4s, v3.4s, v6.s[1] fmla v20.4s, v2.4s, v6.s[2] fmla v21.4s, v3.4s, v6.s[2] fmla v22.4s, v2.4s, v6.s[3] fmla v23.4s, v3.4s, v6.s[3] fmla v24.4s, v2.4s, v7.s[0] fmla v25.4s, v3.4s, v7.s[0] fmla v26.4s, v2.4s, v7.s[1] fmla v27.4s, v3.4s, v7.s[1] fmla v28.4s, v2.4s, v7.s[2] fmla v29.4s, v3.4s, v7.s[2] fmla v30.4s, v2.4s, v7.s[3] fmla v31.4s, v3.4s, v7.s[3] ld1 {v4.4s}, [pB] add pB, pB, #16 ld1 {v5.4s}, [pB] add pB, pB, #16 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 .endm .macro KERNEL8x8_E fmla v16.4s, v2.4s, v6.s[0] fmla v17.4s, v3.4s, v6.s[0] fmla v18.4s, v2.4s, v6.s[1] fmla v19.4s, v3.4s, v6.s[1] fmla v20.4s, v2.4s, v6.s[2] fmla v21.4s, v3.4s, v6.s[2] fmla v22.4s, v2.4s, v6.s[3] fmla v23.4s, v3.4s, v6.s[3] fmla v24.4s, v2.4s, v7.s[0] fmla v25.4s, v3.4s, v7.s[0] fmla v26.4s, v2.4s, v7.s[1] fmla v27.4s, v3.4s, v7.s[1] fmla v28.4s, v2.4s, v7.s[2] fmla v29.4s, v3.4s, v7.s[2] fmla v30.4s, v2.4s, v7.s[3] fmla v31.4s, v3.4s, v7.s[3] .endm .macro KERNEL8x8_SUB ld1 {v4.4s}, [pB] add pB, pB, #16 ld1 {v5.4s}, [pB] add pB, pB, #16 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 fmla v16.4s, v0.4s, v4.s[0] fmla v17.4s, v1.4s, v4.s[0] fmla v18.4s, v0.4s, v4.s[1] fmla v19.4s, v1.4s, v4.s[1] fmla v20.4s, v0.4s, v4.s[2] fmla v21.4s, v1.4s, v4.s[2] fmla v22.4s, v0.4s, v4.s[3] fmla v23.4s, v1.4s, v4.s[3] fmla v24.4s, v0.4s, v5.s[0] fmla v25.4s, v1.4s, v5.s[0] fmla v26.4s, v0.4s, v5.s[1] fmla v27.4s, v1.4s, v5.s[1] fmla v28.4s, v0.4s, v5.s[2] fmla v29.4s, v1.4s, v5.s[2] fmla v30.4s, v0.4s, v5.s[3] fmla v31.4s, v1.4s, v5.s[3] .endm .macro SAVE8x8 add pCRow1, pCRow0, LDC ld1 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV1 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow2, pCRow1, LDC ld1 {v2.4s, v3.4s}, [pCRow1] fmla v2.4s, v18.4s, alphaV2 fmla v3.4s, v19.4s, alphaV3 st1 {v2.4s, v3.4s}, [pCRow1] add pCRow1, pCRow2, LDC ld1 {v4.4s, v5.4s}, [pCRow2] fmla v4.4s, v20.4s, alphaV0 fmla v5.4s, v21.4s, alphaV1 st1 {v4.4s, v5.4s}, [pCRow2] add pCRow2, pCRow1, LDC ld1 {v6.4s, v7.4s}, [pCRow1] fmla v6.4s, v22.4s, alphaV2 fmla v7.4s, v23.4s, alphaV3 st1 {v6.4s, v7.4s}, [pCRow1] add pCRow1, pCRow2, LDC ld1 {v0.4s, v1.4s}, [pCRow2] fmla v0.4s, v24.4s, alphaV0 fmla v1.4s, v25.4s, alphaV1 st1 {v0.4s, v1.4s}, [pCRow2] add pCRow2, pCRow1, LDC ld1 {v2.4s, v3.4s}, [pCRow1] fmla v2.4s, v26.4s, alphaV2 fmla v3.4s, v27.4s, alphaV3 st1 {v2.4s, v3.4s}, [pCRow1] add pCRow1, pCRow2, LDC ld1 {v4.4s, v5.4s}, [pCRow2] fmla v4.4s, v28.4s, alphaV0 fmla v5.4s, v29.4s, alphaV1 st1 {v4.4s, v5.4s}, [pCRow2] ld1 {v6.4s, v7.4s}, [pCRow1] fmla v6.4s, v30.4s, alphaV2 fmla v7.4s, v31.4s, alphaV3 st1 {v6.4s, v7.4s}, [pCRow1] add 
pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT4x8 fmov s16, wzr fmov s18, wzr fmov s20, wzr fmov s22, s16 fmov s24, wzr fmov s26, s16 fmov s28, s18 fmov s30, s20 .endm .macro KERNEL4x8_I ld1 {v4.4s}, [pB] add pB, pB, #16 ld1 {v5.4s}, [pB] add pB, pB, #16 ld1 {v0.4s}, [pA] add pA, pA, #16 fmul v16.4s, v0.4s, v4.s[0] fmul v18.4s, v0.4s, v4.s[1] fmul v20.4s, v0.4s, v4.s[2] fmul v22.4s, v0.4s, v4.s[3] fmul v24.4s, v0.4s, v5.s[0] fmul v26.4s, v0.4s, v5.s[1] fmul v28.4s, v0.4s, v5.s[2] fmul v30.4s, v0.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 ld1 {v7.4s}, [pB] add pB, pB, #16 ld1 {v2.4s}, [pA] add pA, pA, #16 .endm .macro KERNEL4x8_M1 fmla v16.4s, v0.4s, v4.s[0] fmla v18.4s, v0.4s, v4.s[1] fmla v20.4s, v0.4s, v4.s[2] fmla v22.4s, v0.4s, v4.s[3] fmla v24.4s, v0.4s, v5.s[0] fmla v26.4s, v0.4s, v5.s[1] fmla v28.4s, v0.4s, v5.s[2] fmla v30.4s, v0.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 ld1 {v7.4s}, [pB] add pB, pB, #16 ld1 {v2.4s}, [pA] add pA, pA, #16 .endm .macro KERNEL4x8_M2 fmla v16.4s, v2.4s, v6.s[0] fmla v18.4s, v2.4s, v6.s[1] fmla v20.4s, v2.4s, v6.s[2] fmla v22.4s, v2.4s, v6.s[3] fmla v24.4s, v2.4s, v7.s[0] fmla v26.4s, v2.4s, v7.s[1] fmla v28.4s, v2.4s, v7.s[2] fmla v30.4s, v2.4s, v7.s[3] ld1 {v4.4s}, [pB] add pB, pB, #16 ld1 {v5.4s}, [pB] add pB, pB, #16 ld1 {v0.4s}, [pA] add pA, pA, #16 .endm .macro KERNEL4x8_E fmla v16.4s, v2.4s, v6.s[0] fmla v18.4s, v2.4s, v6.s[1] fmla v20.4s, v2.4s, v6.s[2] fmla v22.4s, v2.4s, v6.s[3] fmla v24.4s, v2.4s, v7.s[0] fmla v26.4s, v2.4s, v7.s[1] fmla v28.4s, v2.4s, v7.s[2] fmla v30.4s, v2.4s, v7.s[3] .endm .macro KERNEL4x8_SUB ld1 {v4.4s}, [pB] add pB, pB, #16 ld1 {v5.4s}, [pB] add pB, pB, #16 ld1 {v0.4s}, [pA] add pA, pA, #16 fmla v16.4s, v0.4s, v4.s[0] fmla v18.4s, v0.4s, v4.s[1] fmla v20.4s, v0.4s, v4.s[2] fmla v22.4s, v0.4s, v4.s[3] fmla v24.4s, v0.4s, v5.s[0] fmla v26.4s, v0.4s, v5.s[1] fmla v28.4s, v0.4s, v5.s[2] fmla v30.4s, v0.4s, v5.s[3] .endm .macro SAVE4x8 add pCRow1, pCRow0, LDC ld1 {v0.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 st1 {v0.4s}, [pCRow0] add pCRow2, pCRow1, LDC ld1 {v2.4s}, [pCRow1] fmla v2.4s, v18.4s, alphaV2 st1 {v2.4s}, [pCRow1] add pCRow1, pCRow2, LDC ld1 {v4.4s}, [pCRow2] fmla v4.4s, v20.4s, alphaV0 st1 {v4.4s}, [pCRow2] add pCRow2, pCRow1, LDC ld1 {v6.4s}, [pCRow1] fmla v6.4s, v22.4s, alphaV2 st1 {v6.4s}, [pCRow1] add pCRow1, pCRow2, LDC ld1 {v0.4s}, [pCRow2] fmla v0.4s, v24.4s, alphaV0 st1 {v0.4s}, [pCRow2] add pCRow2, pCRow1, LDC ld1 {v2.4s}, [pCRow1] fmla v2.4s, v26.4s, alphaV2 st1 {v2.4s}, [pCRow1] add pCRow1, pCRow2, LDC ld1 {v4.4s}, [pCRow2] fmla v4.4s, v28.4s, alphaV0 st1 {v4.4s}, [pCRow2] ld1 {v6.4s}, [pCRow1] fmla v6.4s, v30.4s, alphaV2 st1 {v6.4s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT2x8 fmov s16, wzr fmov s18, wzr fmov s20, wzr fmov s22, s16 fmov s24, wzr fmov s26, s16 fmov s28, s18 fmov s30, s20 .endm .macro KERNEL2x8_SUB ld1 {v4.4s}, [pB] add pB, pB, #16 ld1 {v5.4s}, [pB] add pB, pB, #16 ld1 {v0.2s}, [pA] add pA, pA, #8 fmla v16.2s, v0.2s, v4.s[0] fmla v18.2s, v0.2s, v4.s[1] fmla v20.2s, v0.2s, v4.s[2] fmla v22.2s, v0.2s, v4.s[3] fmla v24.2s, v0.2s, v5.s[0] fmla v26.2s, v0.2s, v5.s[1] fmla v28.2s, v0.2s, v5.s[2] fmla v30.2s, v0.2s, v5.s[3] .endm .macro SAVE2x8 add pCRow1, pCRow0, LDC ld1 {v0.2s}, [pCRow0] fmla v0.2s, v16.2s, alphaV0 st1 {v0.2s}, [pCRow0] add pCRow2, pCRow1, LDC ld1 {v2.2s}, [pCRow1] fmla v2.2s, v18.2s, alphaV2 st1 {v2.2s}, 
[pCRow1] add pCRow1, pCRow2, LDC ld1 {v4.2s}, [pCRow2] fmla v4.2s, v20.2s, alphaV0 st1 {v4.2s}, [pCRow2] add pCRow2, pCRow1, LDC ld1 {v6.2s}, [pCRow1] fmla v6.2s, v22.2s, alphaV2 st1 {v6.2s}, [pCRow1] add pCRow1, pCRow2, LDC ld1 {v0.2s}, [pCRow2] fmla v0.2s, v24.2s, alphaV0 st1 {v0.2s}, [pCRow2] add pCRow2, pCRow1, LDC ld1 {v2.2s}, [pCRow1] fmla v2.2s, v26.2s, alphaV2 st1 {v2.2s}, [pCRow1] add pCRow1, pCRow2, LDC ld1 {v4.2s}, [pCRow2] fmla v4.2s, v28.2s, alphaV0 st1 {v4.2s}, [pCRow2] ld1 {v6.2s}, [pCRow1] fmla v6.2s, v30.2s, alphaV2 st1 {v6.2s}, [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT1x8 fmov s16, wzr fmov s18, wzr fmov s20, wzr fmov s22, s16 fmov s24, wzr fmov s26, s16 fmov s28, s18 fmov s30, s20 .endm .macro KERNEL1x8_SUB ld1 {v4.4s}, [pB] add pB, pB, #16 ld1 {v5.4s}, [pB] add pB, pB, #16 ldr s0, [pA] add pA, pA, #4 fmla s16, s0, v4.s[0] fmla s18, s0, v4.s[1] fmla s20, s0, v4.s[2] fmla s22, s0, v4.s[3] fmla s24, s0, v5.s[0] fmla s26, s0, v5.s[1] fmla s28, s0, v5.s[2] fmla s30, s0, v5.s[3] .endm .macro SAVE1x8 add pCRow1, pCRow0, LDC ldr s0, [pCRow0] fmla s0, s16, alphaV0 str s0, [pCRow0] add pCRow2, pCRow1, LDC ldr s2, [pCRow1] fmla s2, s18, alphaV2 str s2, [pCRow1] add pCRow1, pCRow2, LDC ldr s4, [pCRow2] fmla s4, s20, alphaV0 str s4, [pCRow2] add pCRow2, pCRow1, LDC ldr s6, [pCRow1] fmla s6, s22, alphaV2 str s6, [pCRow1] add pCRow1, pCRow2, LDC ldr s0, [pCRow2] fmla s0, s24, alphaV0 str s0, [pCRow2] add pCRow2, pCRow1, LDC ldr s2, [pCRow1] fmla s2, s26, alphaV2 str s2, [pCRow1] add pCRow1, pCRow2, LDC ldr s4, [pCRow2] fmla s4, s28, alphaV0 str s4, [pCRow2] ldr s6, [pCRow1] fmla s6, s30, alphaV2 str s6, [pCRow1] add pCRow0, pCRow0, #4 .endm /******************************************************************************/ .macro INIT8x4 fmov s16, wzr fmov s17, wzr fmov s20, wzr fmov s21, s16 fmov s24, wzr fmov s25, s16 fmov s28, wzr fmov s29, s16 .endm .macro KERNEL8x4_I ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 fmul v16.4s, v0.4s, v8.s[0] fmul v17.4s, v1.4s, v8.s[0] fmul v20.4s, v0.4s, v8.s[1] fmul v21.4s, v1.4s, v8.s[1] fmul v24.4s, v0.4s, v9.s[0] fmul v25.4s, v1.4s, v9.s[0] fmul v28.4s, v0.4s, v9.s[1] fmul v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 ld1 {v4.4s}, [pA] add pA, pA, #16 ld1 {v5.4s}, [pA] add pA, pA, #16 .endm .macro KERNEL8x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v20.4s, v0.4s, v8.s[1] fmla v21.4s, v1.4s, v8.s[1] fmla v24.4s, v0.4s, v9.s[0] fmla v25.4s, v1.4s, v9.s[0] fmla v28.4s, v0.4s, v9.s[1] fmla v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 ld1 {v4.4s}, [pA] add pA, pA, #16 ld1 {v5.4s}, [pA] add pA, pA, #16 .endm .macro KERNEL8x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] fmla v20.4s, v4.4s, v12.s[1] fmla v21.4s, v5.4s, v12.s[1] fmla v24.4s, v4.4s, v13.s[0] fmla v25.4s, v5.4s, v13.s[0] fmla v28.4s, v4.4s, v13.s[1] fmla v29.4s, v5.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 .endm .macro KERNEL8x4_E fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] fmla v20.4s, v4.4s, v12.s[1] fmla v21.4s, v5.4s, v12.s[1] fmla v24.4s, v4.4s, v13.s[0] fmla v25.4s, v5.4s, v13.s[0] fmla v28.4s, v4.4s, v13.s[1] fmla v29.4s, v5.4s, v13.s[1] .endm .macro KERNEL8x4_SUB ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, 
pA, #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v20.4s, v0.4s, v8.s[1] fmla v21.4s, v1.4s, v8.s[1] fmla v24.4s, v0.4s, v9.s[0] fmla v25.4s, v1.4s, v9.s[0] fmla v28.4s, v0.4s, v9.s[1] fmla v29.4s, v1.4s, v9.s[1] .endm .macro SAVE8x4 add pCRow1, pCRow0, LDC ld1 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV1 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow2, pCRow1, LDC ld1 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0 fmla v5.4s, v21.4s, alphaV1 st1 {v4.4s, v5.4s}, [pCRow1] add pCRow1, pCRow2, LDC ld1 {v0.4s, v1.4s}, [pCRow2] fmla v0.4s, v24.4s, alphaV0 fmla v1.4s, v25.4s, alphaV1 st1 {v0.4s, v1.4s}, [pCRow2] ld1 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v28.4s, alphaV0 fmla v5.4s, v29.4s, alphaV1 st1 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT4x4 fmov s16, wzr fmov s17, s16 fmov s20, s17 fmov s21, s16 fmov s24, s17 fmov s25, s16 fmov s28, s17 fmov s29, s16 .endm .macro KERNEL4x4_I ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 fmul v16.2s, v0.2s, v8.s[0] fmul v29.2s, v1.2s, v9.s[1] fmul v20.2s, v0.2s, v8.s[1] fmul v25.2s, v1.2s, v9.s[0] fmul v24.2s, v0.2s, v9.s[0] fmul v21.2s, v1.2s, v8.s[1] fmul v28.2s, v0.2s, v9.s[1] fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 ld1 {v4.2s, v5.2s}, [pA] add pA, pA, #16 .endm .macro KERNEL4x4_M1 fmla v16.2s, v0.2s, v8.s[0] fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 fmla v20.2s, v0.2s, v8.s[1] fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 fmla v24.2s, v0.2s, v9.s[0] fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] fmla v28.2s, v0.2s, v9.s[1] fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 fmla v16.2s, v4.2s, v12.s[0] fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 fmla v20.2s, v4.2s, v12.s[1] fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 fmla v24.2s, v4.2s, v13.s[0] fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] fmla v28.2s, v4.2s, v13.s[1] fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E fmla v16.2s, v4.2s, v12.s[0] fmla v29.2s, v5.2s, v13.s[1] fmla v20.2s, v4.2s, v12.s[1] fmla v25.2s, v5.2s, v13.s[0] fmla v24.2s, v4.2s, v13.s[0] fmla v21.2s, v5.2s, v12.s[1] fmla v28.2s, v4.2s, v13.s[1] fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 fmla v16.2s, v0.2s, v8.s[0] fmla v29.2s, v1.2s, v9.s[1] fmla v20.2s, v0.2s, v8.s[1] fmla v25.2s, v1.2s, v9.s[0] fmla v24.2s, v0.2s, v9.s[0] fmla v21.2s, v1.2s, v8.s[1] fmla v28.2s, v0.2s, v9.s[1] fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 ld1 {v8.2s, v9.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 fmla v9.2s, v17.2s, alphaV1 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2s, v13.2s}, [pCRow1] fmla v12.2s, v20.2s, alphaV2 fmla v13.2s, v21.2s, alphaV3 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow2, pCRow1, LDC ld1 {v8.2s, v9.2s}, [pCRow2] fmla v8.2s, v24.2s, alphaV0 fmla v9.2s, v25.2s, alphaV1 st1 {v8.2s, v9.2s}, [pCRow2] add pCRow1, pCRow2, LDC ld1 {v12.2s, v13.2s}, [pCRow1] fmla v12.2s, v28.2s, alphaV2 fmla v13.2s, v29.2s, alphaV3 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT2x4 fmov s16, wzr fmov s20, s16 fmov s24, s20 
fmov s28, s16 .endm .macro KERNEL2x4_SUB ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld1 {v0.2s}, [pA] add pA, pA, #8 fmla v16.2s, v0.2s, v8.s[0] fmla v20.2s, v0.2s, v8.s[1] fmla v24.2s, v0.2s, v9.s[0] fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 ld1 {v8.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2s}, [pCRow1] fmla v12.2s, v20.2s, alphaV1 st1 {v12.2s}, [pCRow1] add pCRow2, pCRow1, LDC ld1 {v8.2s}, [pCRow2] fmla v8.2s, v24.2s, alphaV2 st1 {v8.2s}, [pCRow2] add pCRow1, pCRow2, LDC ld1 {v12.2s}, [pCRow1] fmla v12.2s, v28.2s, alphaV3 st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT1x4 fmov s16, wzr fmov s20, s16 .endm .macro KERNEL1x4_SUB ldr s0, [pA] add pA, pA, #4 ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 fmla v16.2s, v8.2s, v0.s[0] fmla v20.2s, v9.2s, v0.s[0] .endm .macro SAVE1x4 add pCRow1, pCRow0, LDC ld1 {v8.s}[0], [pCRow0] ld1 {v8.s}[1], [pCRow1] fmla v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], [pCRow0] st1 {v8.s}[1], [pCRow1] add pCRow2, pCRow1, LDC add pCRow1, pCRow2, LDC ld1 {v12.s}[0], [pCRow2] ld1 {v12.s}[1], [pCRow1] fmla v12.2s, v20.2s, alphaV1 st1 {v12.s}[0], [pCRow2] st1 {v12.s}[1], [pCRow1] add pCRow0, pCRow0, #4 .endm /******************************************************************************/ .macro INIT8x2 fmov s16, wzr fmov s17, s16 fmov s20, s17 fmov s21, s16 .endm .macro KERNEL8x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v20.4s, v0.4s, v8.s[1] fmla v21.4s, v1.4s, v8.s[1] .endm .macro SAVE8x2 add pCRow1, pCRow0, LDC ld1 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV1 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow2, pCRow1, LDC ld1 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0 fmla v5.4s, v21.4s, alphaV1 st1 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT4x2 fmov s16, wzr fmov s17, s16 fmov s20, s17 fmov s21, s16 .endm .macro KERNEL4x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 fmla v16.2s, v0.2s, v8.s[0] fmla v17.2s, v1.2s, v8.s[0] fmla v20.2s, v0.2s, v8.s[1] fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 ld1 {v8.2s, v9.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 fmla v9.2s, v17.2s, alphaV1 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2s, v13.2s}, [pCRow1] fmla v12.2s, v20.2s, alphaV2 fmla v13.2s, v21.2s, alphaV3 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT2x2 fmov s16, wzr fmov s20, s16 .endm .macro KERNEL2x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.2s}, [pA] add pA, pA, #8 fmla v16.2s, v0.2s, v8.s[0] fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 ld1 {v8.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow1 , pCRow0, LDC ld1 {v12.2s}, [pCRow1] fmla v12.2s, v20.2s, alphaV1 st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT1x2 fmov s16, wzr .endm .macro KERNEL1x2_SUB ld1 {v8.2s} , [pB] add pB , pB, #8 ldr s0 , [pA] add pA, pA, #4 fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 add pCRow1 , pCRow0, LDC ld1 {v8.s}[0], [pCRow0] ld1 {v8.s}[1], [pCRow1] fmla v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], 
[pCRow0] st1 {v8.s}[1], [pCRow1] add pCRow0, pCRow0, #4 .endm /******************************************************************************/ .macro INIT8x1 fmov s16, wzr fmov s17, wzr .endm .macro KERNEL8x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] .endm .macro SAVE8x1 ld1 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV1 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT4x1 fmov s16, wzr fmov s17, s16 .endm .macro KERNEL4x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 fmla v16.2s, v0.2s, v8.s[0] fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 ld1 {v8.2s, v9.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 fmla v9.2s, v17.2s, alphaV1 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT2x1 fmov s16, wzr .endm .macro KERNEL2x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.2s}, [pA] add pA , pA, #8 fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 ld1 {v8.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT1x1 fmov s16, wzr .endm .macro KERNEL1x1_SUB ldr s8, [pB] add pB , pB, #4 ldr s0, [pA] add pA , pA, #4 fmadd s16, s0, s8, s16 .endm .macro SAVE1x1 ldr s8, [pCRow0] fmla s8, s16, alphaV0 str s8, [pCRow0] add pCRow0, pCRow0, #4 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE sgemm_kernel_begin: .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] fmov alpha0, s0 fmov alpha1, s0 fmov alpha2, s0 fmov alpha3, s0 lsl LDC, LDC, #2 // ldc = ldc * 4 mov pB, origPB mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 ble sgemm_kernel_L4_BEGIN /******************************************************************************/ /******************************************************************************/ sgemm_kernel_L8_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #3 mov pA, origPA // pA = start of A array /******************************************************************************/ sgemm_kernel_L8_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble sgemm_kernel_L8_M4_BEGIN sgemm_kernel_L8_M8_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
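// Main 8x8 path: the J loop walks N in steps of 8 (pC is advanced by
// 8 * LDC via "add pC, pC, LDC, lsl #3"), the I loop walks M in steps of 8,
// and the 8x8 tile of C accumulates in v16-v31 (one pair of 4-lane vectors
// per column of the tile, as listed in the register map above). The K loop
// below uses the same pipelined KERNEL8x8_I / _M1 / _M2 / _E scheme as the
// other kernels in this directory, with KERNEL8x8_SUB covering an odd K.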
blt sgemm_kernel_L8_M8_32 KERNEL8x8_I // do one in the K KERNEL8x8_M2 // do another in the K subs counterL, counterL, #2 ble sgemm_kernel_L8_M8_22a .align 5 sgemm_kernel_L8_M8_22: KERNEL8x8_M1 KERNEL8x8_M2 subs counterL, counterL, #1 bgt sgemm_kernel_L8_M8_22 sgemm_kernel_L8_M8_22a: KERNEL8x8_M1 KERNEL8x8_E b sgemm_kernel_L8_M8_44 sgemm_kernel_L8_M8_32: tst counterL, #1 ble sgemm_kernel_L8_M8_40 KERNEL8x8_I KERNEL8x8_E b sgemm_kernel_L8_M8_44 sgemm_kernel_L8_M8_40: INIT8x8 sgemm_kernel_L8_M8_44: ands counterL , origK, #1 ble sgemm_kernel_L8_M8_100 sgemm_kernel_L8_M8_46: KERNEL8x8_SUB sgemm_kernel_L8_M8_100: SAVE8x8 sgemm_kernel_L8_M8_END: subs counterI, counterI, #1 bne sgemm_kernel_L8_M8_20 /******************************************************************************/ sgemm_kernel_L8_M4_BEGIN: mov counterI, origM tst counterI , #7 ble sgemm_kernel_L8_END tst counterI, #4 ble sgemm_kernel_L8_M2_BEGIN sgemm_kernel_L8_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? blt sgemm_kernel_L8_M4_32 KERNEL4x8_I // do one in the K KERNEL4x8_M2 // do another in the K subs counterL, counterL, #2 ble sgemm_kernel_L8_M4_22a .align 5 sgemm_kernel_L8_M4_22: KERNEL4x8_M1 KERNEL4x8_M2 subs counterL, counterL, #1 bgt sgemm_kernel_L8_M4_22 sgemm_kernel_L8_M4_22a: KERNEL4x8_M1 KERNEL4x8_E b sgemm_kernel_L8_M4_44 sgemm_kernel_L8_M4_32: tst counterL, #1 ble sgemm_kernel_L8_M4_40 KERNEL4x8_I KERNEL4x8_E b sgemm_kernel_L8_M4_44 sgemm_kernel_L8_M4_40: INIT4x8 sgemm_kernel_L8_M4_44: ands counterL , origK, #1 ble sgemm_kernel_L8_M4_100 sgemm_kernel_L8_M4_46: KERNEL4x8_SUB sgemm_kernel_L8_M4_100: SAVE4x8 sgemm_kernel_L8_M4_END: /******************************************************************************/ sgemm_kernel_L8_M2_BEGIN: mov counterI, origM tst counterI , #3 ble sgemm_kernel_L8_END tst counterI, #2 // counterI = counterI / 2 ble sgemm_kernel_L8_M1_BEGIN sgemm_kernel_L8_M2_20: INIT2x8 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L8_M2_40 sgemm_kernel_L8_M2_22: KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L8_M2_22 sgemm_kernel_L8_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L8_M2_100 sgemm_kernel_L8_M2_42: KERNEL2x8_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L8_M2_42 sgemm_kernel_L8_M2_100: SAVE2x8 sgemm_kernel_L8_M2_END: /******************************************************************************/ sgemm_kernel_L8_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble sgemm_kernel_L8_END sgemm_kernel_L8_M1_20: INIT1x8 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L8_M1_40 sgemm_kernel_L8_M1_22: KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L8_M1_22 sgemm_kernel_L8_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L8_M1_100 sgemm_kernel_L8_M1_42: KERNEL1x8_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L8_M1_42 sgemm_kernel_L8_M1_100: SAVE1x8 sgemm_kernel_L8_END: lsl temp, origK, #5 // B = B + K * 4 * 8 add origPB, origPB, temp subs counterJ, counterJ , #1 // j-- bgt sgemm_kernel_L8_BEGIN /******************************************************************************/ /******************************************************************************/ 
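/* N-direction tail: once the N/8 loop above is exhausted, the sections below
 * re-test origN and handle a remaining block of 4, then 2, then 1 column(s)
 * with narrower variants of the same kernels. Each section finishes by
 * advancing origPB past the B panel it consumed (K * 4 bytes * columns),
 * e.g. "lsl temp, origK, #5" above for the 8-column panel.
 *
 * Rough C-level sketch of one 8x8 tile update (illustrative only; the real
 * operands are the packed buffers produced by the copy/packing routines,
 * not plain row- or column-major arrays):
 *
 *   for (k = 0; k < K; k++)
 *     for (j = 0; j < 8; j++)
 *       for (i = 0; i < 8; i++)
 *         acc[j][i] += packedA[k*8 + i] * packedB[k*8 + j];
 *   for (j = 0; j < 8; j++)
 *     for (i = 0; i < 8; i++)
 *       C[j*ldc + i] += alpha * acc[j][i];   // SAVE8x8: load C, fmla, store
 */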
sgemm_kernel_L4_BEGIN: mov counterJ , origN tst counterJ , #7 ble sgemm_kernel_L999 tst counterJ , #4 ble sgemm_kernel_L2_BEGIN mov pCRow0, pC // pCRow0 = pC add pC,pC,LDC, lsl #2 mov pA, origPA // pA = A /******************************************************************************/ sgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble sgemm_kernel_L4_M4_BEGIN sgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? blt sgemm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K subs counterL, counterL, #2 ble sgemm_kernel_L4_M8_22a .align 5 sgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 bgt sgemm_kernel_L4_M8_22 sgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E b sgemm_kernel_L4_M8_44 sgemm_kernel_L4_M8_32: tst counterL, #1 ble sgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_E b sgemm_kernel_L4_M8_44 sgemm_kernel_L4_M8_40: INIT8x4 sgemm_kernel_L4_M8_44: ands counterL , origK, #1 ble sgemm_kernel_L4_M8_100 sgemm_kernel_L4_M8_46: KERNEL8x4_SUB sgemm_kernel_L4_M8_100: SAVE8x4 sgemm_kernel_L4_M8_END: subs counterI, counterI, #1 bne sgemm_kernel_L4_M8_20 /******************************************************************************/ sgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 ble sgemm_kernel_L4_END tst counterI, #4 ble sgemm_kernel_L4_M2_BEGIN sgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? blt sgemm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 ble sgemm_kernel_L4_M4_22a .align 5 sgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 bgt sgemm_kernel_L4_M4_22 sgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E b sgemm_kernel_L4_M4_44 sgemm_kernel_L4_M4_32: tst counterL, #1 ble sgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E b sgemm_kernel_L4_M4_44 sgemm_kernel_L4_M4_40: INIT4x4 sgemm_kernel_L4_M4_44: ands counterL , origK, #1 ble sgemm_kernel_L4_M4_100 sgemm_kernel_L4_M4_46: KERNEL4x4_SUB sgemm_kernel_L4_M4_100: SAVE4x4 sgemm_kernel_L4_M4_END: /******************************************************************************/ sgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble sgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble sgemm_kernel_L4_M1_BEGIN sgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L4_M2_40 sgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L4_M2_22 sgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L4_M2_100 sgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L4_M2_42 sgemm_kernel_L4_M2_100: SAVE2x4 sgemm_kernel_L4_M2_END: /******************************************************************************/ sgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble sgemm_kernel_L4_END sgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L4_M1_40 sgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs counterL, counterL, #1 bgt 
sgemm_kernel_L4_M1_22 sgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L4_M1_100 sgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L4_M1_42 sgemm_kernel_L4_M1_100: SAVE1x4 sgemm_kernel_L4_END: add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 /******************************************************************************/ /******************************************************************************/ sgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble sgemm_kernel_L999 tst counterJ , #2 ble sgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC add pC,pC,LDC, lsl #1 mov pA, origPA // pA = A /******************************************************************************/ sgemm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI,#0 ble sgemm_kernel_L2_M4_BEGIN sgemm_kernel_L2_M8_20: INIT8x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble sgemm_kernel_L2_M8_40 .align 5 sgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M8_22 sgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M8_100 sgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M8_42 sgemm_kernel_L2_M8_100: SAVE8x2 sgemm_kernel_L2_M8_END: subs counterI, counterI, #1 bgt sgemm_kernel_L2_M8_20 /******************************************************************************/ sgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 ble sgemm_kernel_L2_END tst counterI, #4 ble sgemm_kernel_L2_M2_BEGIN sgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble sgemm_kernel_L2_M4_40 .align 5 sgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M4_22 sgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M4_100 sgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M4_42 sgemm_kernel_L2_M4_100: SAVE4x2 sgemm_kernel_L2_M4_END: /******************************************************************************/ sgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble sgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble sgemm_kernel_L2_M1_BEGIN sgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble sgemm_kernel_L2_M2_40 sgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M2_22 sgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M2_100 sgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M2_42 sgemm_kernel_L2_M2_100: SAVE2x2 sgemm_kernel_L2_M2_END: /******************************************************************************/ sgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble sgemm_kernel_L2_END sgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 ble sgemm_kernel_L2_M1_40 
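// 1x2 tail: a single row of A against two columns of B. KERNEL1x2_SUB keeps
// both partial sums in the two lanes of v16, and SAVE1x2 scatters those lanes
// back to their respective C columns (pCRow0 and pCRow0 + LDC).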
sgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M1_22 sgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M1_100 sgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L2_M1_42 sgemm_kernel_L2_M1_100: SAVE1x2 sgemm_kernel_L2_END: add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 /******************************************************************************/ /******************************************************************************/ sgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble sgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next mov pA, origPA // pA = A /******************************************************************************/ sgemm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 cmp counterI, #0 ble sgemm_kernel_L1_M4_BEGIN sgemm_kernel_L1_M8_20: INIT8x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M8_40 .align 5 sgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M8_22 sgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M8_100 sgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M8_42 sgemm_kernel_L1_M8_100: SAVE8x1 sgemm_kernel_L1_M8_END: subs counterI, counterI, #1 bgt sgemm_kernel_L1_M8_20 /******************************************************************************/ sgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 ble sgemm_kernel_L1_END tst counterI, #4 ble sgemm_kernel_L1_M2_BEGIN sgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M4_40 .align 5 sgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M4_22 sgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M4_100 sgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M4_42 sgemm_kernel_L1_M4_100: SAVE4x1 sgemm_kernel_L1_M4_END: /******************************************************************************/ sgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 ble sgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 ble sgemm_kernel_L1_M1_BEGIN sgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M2_40 sgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M2_22 sgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M2_100 sgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M2_42 sgemm_kernel_L1_M2_100: SAVE2x1 sgemm_kernel_L1_M2_END: /******************************************************************************/ sgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble sgemm_kernel_L1_END sgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , 
origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M1_40 sgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M1_22 sgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M1_100 sgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L1_M1_42 sgemm_kernel_L1_M1_100: SAVE1x1 sgemm_kernel_L1_END: /******************************************************************************/ sgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/strmm_kernel_16x4.S000066400000000000000000001324751313527062700210010ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 X3 x4 x5 x6 x7 */ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define offset x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pCRow3 x15 #define pA x16 #define alpha w17 #define temp x18 #define tempOffset x19 #define tempK x20 #define alpha0 s10 #define alphaV0 v10.s[0] #define A_PRE_SIZE 2560 #define B_PRE_SIZE 224 #define C_PRE_SIZE 160 // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 offset // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pA // 16 temp // 17 tempOffset // 18 must save tempK // 19 must save // 20 must save // 21 must save // 22 must save // 23 must save // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp //v00 ALPHA -> pA0_00, pA0_01, pA0_02, pA0_03 //v01 pA0_04, pA0_05, pA0_06, pA0_07 //v02 pA0_08, pA0_09, pA0_10, pA0_11 //v03 pA0_12, pA0_13, pA0_14, pA0_15 //v04 pA1_00, pA1_01, pA1_02, pA1_03 //v05 pA1_04, pA1_05, pA1_06, pA1_07 //v06 pA1_08, pA1_09, pA1_10, pA1_11 //v07 pA1_12, pA1_13, pA1_14, pA1_15 //v08 must save pB00 //v09 must save pB01 //v10 must save pB02 //v11 must save pB03 //v12 must save pB10 //v13 must save pB11 //v14 must save pB12 //v15 must save pB13 //v16 must save C00, C01, C02, C03 //v17 must save C04, C05, C06, C07 //v18 C08, C09, C10, C11 //v19 C12, C13, C14, C15 //v20 C16, C17, C18, C19 //v21 C20, C21, C22, C23 //v22 C24, C25, C26, C27 //v23 C28, C29, C30, C31 //v24 C32, C33, C34, C35 //v25 C36, C37, C38, C39 //v26 C40, C41, C42, C43 //v27 C44, C45, C46, C47 //v28 C48, C49, C50, C51 //v29 C52, C53, C54, C55 //v30 C56, C57, C58, C59 //v31 C60, C61, C62, C63 /******************************************************************************* * Macro definitions *******************************************************************************/ .macro INIT16x4 fmov s16, wzr fmov s17, wzr fmov s18, s16 fmov s19, s17 fmov s20, wzr fmov s21, s16 fmov s22, s17 fmov s23, s18 fmov s24, wzr fmov s25, s16 fmov s26, s17 fmov s27, s18 fmov s28, wzr fmov s29, s16 fmov s30, s17 fmov s31, s18 .endm .macro KERNEL16x4_I ldp q0, q1, [pA], #32 ldp s8, s9, [pB], #8 fmul v16.4s, v0.4s, v8.s[0] fmul v20.4s, v0.4s, v9.s[0] ldp s10, s11, [pB], #8 fmul v24.4s, v0.4s, v10.s[0] fmul v28.4s, v0.4s, v11.s[0] ldp q2, q3, [pA], #32 fmul v17.4s, v1.4s, v8.s[0] fmul v21.4s, v1.4s, v9.s[0] ldp q4, q5, [pA], #32 fmul v25.4s, v1.4s, v10.s[0] fmul v29.4s, v1.4s, v11.s[0] ldp s12, s13, [pB], #8 fmul v18.4s, v2.4s, v8.s[0] fmul v22.4s, v2.4s, v9.s[0] ldp s14, s15, [pB], #8 fmul v19.4s, v3.4s, v8.s[0] fmul v23.4s, v3.4s, v9.s[0] ldp q6, q7, [pA], #32 fmul v26.4s, v2.4s, v10.s[0] fmul v30.4s, v2.4s, v11.s[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmul v27.4s, v3.4s, v10.s[0] fmul v31.4s, v3.4s, v11.s[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL16x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] ldp q4, q5, [pA], #32 fmla v18.4s, v2.4s, v8.s[0] fmla v19.4s, v3.4s, v8.s[0] fmla v20.4s, v0.4s, v9.s[0] fmla v21.4s, v1.4s, v9.s[0] ldp s12, s13, [pB], #8 fmla v22.4s, v2.4s, v9.s[0] fmla v23.4s, v3.4s, 
v9.s[0] ldp s14, s15, [pB], #8 fmla v24.4s, v0.4s, v10.s[0] fmla v25.4s, v1.4s, v10.s[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] fmla v26.4s, v2.4s, v10.s[0] fmla v27.4s, v3.4s, v10.s[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v28.4s, v0.4s, v11.s[0] fmla v29.4s, v1.4s, v11.s[0] ldp q6, q7, [pA], #32 fmla v30.4s, v2.4s, v11.s[0] fmla v31.4s, v3.4s, v11.s[0] .endm .macro KERNEL16x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] ldp q0, q1, [pA], #32 fmla v18.4s, v6.4s, v12.s[0] fmla v19.4s, v7.4s, v12.s[0] fmla v20.4s, v4.4s, v13.s[0] fmla v21.4s, v5.4s, v13.s[0] ldp s8, s9, [pB], #8 fmla v22.4s, v6.4s, v13.s[0] fmla v23.4s, v7.4s, v13.s[0] ldp s10, s11, [pB], #8 fmla v24.4s, v4.4s, v14.s[0] fmla v25.4s, v5.4s, v14.s[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v26.4s, v6.4s, v14.s[0] fmla v27.4s, v7.4s, v14.s[0] ldp q2, q3, [pA], #32 fmla v28.4s, v4.4s, v15.s[0] fmla v29.4s, v5.4s, v15.s[0] fmla v30.4s, v6.4s, v15.s[0] fmla v31.4s, v7.4s, v15.s[0] .endm .macro KERNEL16x4_E fmla v16.4s, v4.4s, v12.s[0] fmla v20.4s, v4.4s, v13.s[0] fmla v24.4s, v4.4s, v14.s[0] fmla v28.4s, v4.4s, v15.s[0] fmla v17.4s, v5.4s, v12.s[0] fmla v21.4s, v5.4s, v13.s[0] fmla v25.4s, v5.4s, v14.s[0] fmla v29.4s, v5.4s, v15.s[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v18.4s, v6.4s, v12.s[0] fmla v22.4s, v6.4s, v13.s[0] fmla v26.4s, v6.4s, v14.s[0] fmla v30.4s, v6.4s, v15.s[0] fmla v19.4s, v7.4s, v12.s[0] fmla v23.4s, v7.4s, v13.s[0] fmla v27.4s, v7.4s, v14.s[0] fmla v31.4s, v7.4s, v15.s[0] .endm .macro KERNEL16x4_SUB ldp q0, q1, [pA], #32 ldp s8, s9, [pB], #8 fmla v16.4s, v0.4s, v8.s[0] fmla v20.4s, v0.4s, v9.s[0] ldp s10, s11, [pB], #8 fmla v24.4s, v0.4s, v10.s[0] fmla v28.4s, v0.4s, v11.s[0] ldp q2, q3, [pA], #32 fmla v17.4s, v1.4s, v8.s[0] fmla v21.4s, v1.4s, v9.s[0] fmla v25.4s, v1.4s, v10.s[0] fmla v29.4s, v1.4s, v11.s[0] fmla v18.4s, v2.4s, v8.s[0] fmla v22.4s, v2.4s, v9.s[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v19.4s, v3.4s, v8.s[0] fmla v23.4s, v3.4s, v9.s[0] fmla v26.4s, v2.4s, v10.s[0] fmla v30.4s, v2.4s, v11.s[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v27.4s, v3.4s, v10.s[0] fmla v31.4s, v3.4s, v11.s[0] .endm .macro SAVE16x4 fmov alpha0, alpha prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] fmul v0.4s, v16.4s, alphaV0 fmul v1.4s, v17.4s, alphaV0 stp q0, q1, [pCRow0] add pCRow0, pCRow0, #32 fmul v2.4s, v18.4s, alphaV0 fmul v3.4s, v19.4s, alphaV0 stp q2, q3, [pCRow0] add pCRow0, pCRow0, #32 prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] fmul v4.4s, v20.4s, alphaV0 fmul v5.4s, v21.4s, alphaV0 stp q4, q5, [pCRow1] add pCRow1, pCRow1, #32 fmul v6.4s, v22.4s, alphaV0 fmul v7.4s, v23.4s, alphaV0 stp q6, q7, [pCRow1] add pCRow1, pCRow1, #32 prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] fmul v0.4s, v24.4s, alphaV0 fmul v1.4s, v25.4s, alphaV0 stp q0, q1, [pCRow2] add pCRow2, pCRow2, #32 fmul v2.4s, v26.4s, alphaV0 fmul v3.4s, v27.4s, alphaV0 stp q2, q3, [pCRow2] add pCRow2, pCRow2, #32 prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] fmul v4.4s, v28.4s, alphaV0 fmul v5.4s, v29.4s, alphaV0 stp q4, q5, [pCRow3] add pCRow3, pCRow3, #32 fmul v6.4s, v30.4s, alphaV0 fmul v7.4s, v31.4s, alphaV0 stp q6, q7, [pCRow3] add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT8x4 fmov s16, wzr fmov s17, wzr fmov s20, wzr fmov s21, s16 fmov s24, wzr fmov s25, s16 fmov s28, wzr fmov s29, s16 .endm .macro KERNEL8x4_I ldp s8, s9, [pB], #8 ldp s10, s11, [pB], #8 ldr q0, [pA], #16 ldr q1, [pA], #16 fmul v16.4s, v0.4s, v8.s[0] fmul v17.4s, v1.4s, v8.s[0] fmul v20.4s, v0.4s, v9.s[0] fmul v21.4s, 
v1.4s, v9.s[0] fmul v24.4s, v0.4s, v10.s[0] fmul v25.4s, v1.4s, v10.s[0] fmul v28.4s, v0.4s, v11.s[0] fmul v29.4s, v1.4s, v11.s[0] ldp s12, s13, [pB], #8 ldp s14, s15, [pB], #8 ldr q4, [pA], #16 ldr q5, [pA], #16 .endm .macro KERNEL8x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v20.4s, v0.4s, v9.s[0] fmla v21.4s, v1.4s, v9.s[0] fmla v24.4s, v0.4s, v10.s[0] fmla v25.4s, v1.4s, v10.s[0] fmla v28.4s, v0.4s, v11.s[0] fmla v29.4s, v1.4s, v11.s[0] ldp s12, s13, [pB], #8 ldp s14, s15, [pB], #8 ldr q4, [pA], #16 ldr q5, [pA], #16 .endm .macro KERNEL8x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] fmla v20.4s, v4.4s, v13.s[0] fmla v21.4s, v5.4s, v13.s[0] fmla v24.4s, v4.4s, v14.s[0] fmla v25.4s, v5.4s, v14.s[0] fmla v28.4s, v4.4s, v15.s[0] fmla v29.4s, v5.4s, v15.s[0] ldp s8, s9, [pB], #8 ldp s10, s11, [pB], #8 ldr q0, [pA], #16 ldr q1, [pA], #16 .endm .macro KERNEL8x4_E fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] fmla v20.4s, v4.4s, v13.s[0] fmla v21.4s, v5.4s, v13.s[0] fmla v24.4s, v4.4s, v14.s[0] fmla v25.4s, v5.4s, v14.s[0] fmla v28.4s, v4.4s, v15.s[0] fmla v29.4s, v5.4s, v15.s[0] .endm .macro KERNEL8x4_SUB ldp s8, s9, [pB], #8 ldp s10, s11, [pB], #8 ldr q0, [pA], #16 ldr q1, [pA], #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v20.4s, v0.4s, v9.s[0] fmla v21.4s, v1.4s, v9.s[0] fmla v24.4s, v0.4s, v10.s[0] fmla v25.4s, v1.4s, v10.s[0] fmla v28.4s, v0.4s, v11.s[0] fmla v29.4s, v1.4s, v11.s[0] .endm .macro SAVE8x4 fmov alpha0, alpha fmul v0.4s, v16.4s, alphaV0 fmul v1.4s, v17.4s, alphaV0 stp q0, q1, [pCRow0] add pCRow0, pCRow0, #32 fmul v2.4s, v20.4s, alphaV0 fmul v3.4s, v21.4s, alphaV0 stp q2, q3, [pCRow1] add pCRow1, pCRow1, #32 fmul v4.4s, v24.4s, alphaV0 fmul v5.4s, v25.4s, alphaV0 stp q4, q5, [pCRow2] add pCRow2, pCRow2, #32 fmul v6.4s, v28.4s, alphaV0 fmul v7.4s, v29.4s, alphaV0 stp q6, q7, [pCRow3] add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT4x4 fmov s16, wzr fmov s20, wzr fmov s24, wzr fmov s28, wzr .endm .macro KERNEL4x4_I ldp s8, s9, [pB], #8 ldp s10, s11, [pB], #8 ldr q0, [pA], #16 fmul v16.4s, v0.4s, v8.s[0] fmul v20.4s, v0.4s, v9.s[0] fmul v24.4s, v0.4s, v10.s[0] fmul v28.4s, v0.4s, v11.s[0] ldp s12, s13, [pB], #8 ldp s14, s15, [pB], #8 ldr q1, [pA], #16 .endm .macro KERNEL4x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v20.4s, v0.4s, v9.s[0] fmla v24.4s, v0.4s, v10.s[0] fmla v28.4s, v0.4s, v11.s[0] ldp s12, s13, [pB], #8 ldp s14, s15, [pB], #8 ldr q1, [pA], #16 .endm .macro KERNEL4x4_M2 fmla v16.4s, v1.4s, v12.s[0] fmla v20.4s, v1.4s, v13.s[0] fmla v24.4s, v1.4s, v14.s[0] fmla v28.4s, v1.4s, v15.s[0] ldp s8, s9, [pB], #8 ldp s10, s11, [pB], #8 ldr q0, [pA], #16 .endm .macro KERNEL4x4_E fmla v16.4s, v1.4s, v12.s[0] fmla v20.4s, v1.4s, v13.s[0] fmla v24.4s, v1.4s, v14.s[0] fmla v28.4s, v1.4s, v15.s[0] .endm .macro KERNEL4x4_SUB ldp s8, s9, [pB], #8 ldp s10, s11, [pB], #8 ldr q0, [pA], #16 fmla v16.4s, v0.4s, v8.s[0] fmla v20.4s, v0.4s, v9.s[0] fmla v24.4s, v0.4s, v10.s[0] fmla v28.4s, v0.4s, v11.s[0] .endm .macro SAVE4x4 fmov alpha0, alpha fmul v0.4s, v16.4s, alphaV0 str q0, [pCRow0] add pCRow0, pCRow0, #16 fmul v1.4s, v20.4s, alphaV0 str q1, [pCRow1] add pCRow1, pCRow1, #16 fmul v2.4s, v24.4s, alphaV0 str q2, [pCRow2] add pCRow2, pCRow2, #16 fmul v3.4s, v28.4s, alphaV0 str q3, [pCRow3] add pCRow3, pCRow3, #16 .endm /******************************************************************************/ .macro INIT2x4 fmov s16, wzr fmov s20, s16 fmov s24, 
s20 fmov s28, s16 .endm .macro KERNEL2x4_SUB ldp s8, s9, [pB], #8 ldp s10, s11, [pB], #8 ldr d0, [pA], #8 fmla v16.2s, v0.2s, v8.s[0] fmla v20.2s, v0.2s, v9.s[0] fmla v24.2s, v0.2s, v10.s[0] fmla v28.2s, v0.2s, v11.s[0] .endm .macro SAVE2x4 fmov alpha0, alpha fmul v0.2s, v16.2s, alphaV0 str d0, [pCRow0] add pCRow0, pCRow0, #8 fmul v1.2s, v20.2s, alphaV0 str d1, [pCRow1] add pCRow1, pCRow1, #8 fmul v0.2s, v24.2s, alphaV0 str d0, [pCRow2] add pCRow2, pCRow2, #8 fmul v1.2s, v28.2s, alphaV0 str d1, [pCRow3] add pCRow3, pCRow3, #8 .endm /******************************************************************************/ .macro INIT1x4 fmov s16, wzr fmov s20, s16 .endm .macro KERNEL1x4_SUB ldr s0, [pA] add pA, pA, #4 ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 fmla v16.2s, v8.2s, v0.s[0] fmla v20.2s, v9.2s, v0.s[0] .endm .macro SAVE1x4 fmov alpha0, alpha fmul v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], [pCRow0] st1 {v8.s}[1], [pCRow1] add pCRow0, pCRow0, #4 add pCRow1, pCRow1, #4 fmul v12.2s, v20.2s, alphaV0 st1 {v12.s}[0], [pCRow2] st1 {v12.s}[1], [pCRow3] add pCRow2, pCRow2, #4 add pCRow3, pCRow3, #4 .endm /******************************************************************************/ .macro INIT16x2 fmov s16, wzr fmov s17, wzr fmov s18, wzr fmov s19, s16 fmov s20, wzr fmov s21, s16 fmov s22, wzr fmov s23, s16 .endm .macro KERNEL16x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 ld1 {v2.4s}, [pA] add pA, pA, #16 ld1 {v3.4s}, [pA] add pA, pA, #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v18.4s, v2.4s, v8.s[0] fmla v19.4s, v3.4s, v8.s[0] fmla v20.4s, v0.4s, v8.s[1] fmla v21.4s, v1.4s, v8.s[1] fmla v22.4s, v2.4s, v8.s[1] fmla v23.4s, v3.4s, v8.s[1] .endm .macro SAVE16x2 fmov alpha0, alpha add pCRow1, pCRow0, LDC fmul v0.4s, v16.4s, alphaV0 fmul v1.4s, v17.4s, alphaV0 fmul v2.4s, v18.4s, alphaV0 fmul v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] fmul v4.4s, v20.4s, alphaV0 fmul v5.4s, v21.4s, alphaV0 fmul v6.4s, v22.4s, alphaV0 fmul v7.4s, v23.4s, alphaV0 st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT8x2 fmov s16, wzr fmov s17, s16 fmov s20, s17 fmov s21, s16 .endm .macro KERNEL8x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v20.4s, v0.4s, v8.s[1] fmla v21.4s, v1.4s, v8.s[1] .endm .macro SAVE8x2 fmov alpha0, alpha add pCRow1, pCRow0, LDC fmul v0.4s, v16.4s, alphaV0 fmul v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow2, pCRow1, LDC fmul v4.4s, v20.4s, alphaV0 fmul v5.4s, v21.4s, alphaV0 st1 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT4x2 fmov s16, wzr fmov s17, s16 fmov s20, s17 fmov s21, s16 .endm .macro KERNEL4x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 fmla v16.2s, v0.2s, v8.s[0] fmla v17.2s, v1.2s, v8.s[0] fmla v20.2s, v0.2s, v8.s[1] fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 fmov alpha0, alpha fmul v8.2s, v16.2s, alphaV0 fmul v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow1, pCRow0, LDC fmul v12.2s, v20.2s, alphaV0 fmul v13.2s, v21.2s, alphaV0 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT2x2 
fmov s16, wzr fmov s20, s16 .endm .macro KERNEL2x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.2s}, [pA] add pA, pA, #8 fmla v16.2s, v0.2s, v8.s[0] fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 fmov alpha0, alpha fmul v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow1 , pCRow0, LDC fmul v12.2s, v20.2s, alphaV0 st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT1x2 fmov s16, wzr .endm .macro KERNEL1x2_SUB ld1 {v8.2s} , [pB] add pB , pB, #8 ldr s0 , [pA] add pA, pA, #4 fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 fmov alpha0, alpha add pCRow1 , pCRow0, LDC fmul v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], [pCRow0] st1 {v8.s}[1], [pCRow1] add pCRow0, pCRow0, #4 .endm /******************************************************************************/ .macro INIT16x1 fmov s16, wzr fmov s17, wzr fmov s18, wzr fmov s19, s16 .endm .macro KERNEL16x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 ld1 {v2.4s}, [pA] add pA, pA, #16 ld1 {v3.4s}, [pA] add pA, pA, #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v18.4s, v2.4s, v8.s[0] fmla v19.4s, v3.4s, v8.s[0] .endm .macro SAVE16x1 fmov alpha0, alpha fmul v0.4s, v16.4s, alphaV0 fmul v1.4s, v17.4s, alphaV0 fmul v2.4s, v18.4s, alphaV0 fmul v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT8x1 fmov s16, wzr fmov s17, wzr .endm .macro KERNEL8x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] .endm .macro SAVE8x1 fmov alpha0, alpha fmul v0.4s, v16.4s, alphaV0 fmul v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT4x1 fmov s16, wzr fmov s17, s16 .endm .macro KERNEL4x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 fmla v16.2s, v0.2s, v8.s[0] fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 fmov alpha0, alpha fmul v8.2s, v16.2s, alphaV0 fmul v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT2x1 fmov s16, wzr .endm .macro KERNEL2x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.2s}, [pA] add pA , pA, #8 fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 fmov alpha0, alpha fmul v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT1x1 fmov s16, wzr .endm .macro KERNEL1x1_SUB ldr s8, [pB] add pB , pB, #4 ldr s0, [pA] add pA , pA, #4 fmadd s16, s0, s8, s16 .endm .macro SAVE1x1 fmov alpha0, alpha fmul s8, s16, alpha0 str s8, [pCRow0] add pCRow0, pCRow0, #4 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE strmm_kernel_begin: .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, 
[sp, #(9 * 16)] str x28, [sp, #(10 * 16)] prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPA] fmov alpha, s0 lsl LDC, LDC, #2 // ldc = ldc * 4 #if !defined(LEFT) neg tempOffset, offset #endif mov pB, origPB mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble strmm_kernel_L2_BEGIN /******************************************************************************/ strmm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC add pCRow3, pCRow2, LDC add pC, pCRow3, LDC #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = start of A array strmm_kernel_L4_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 ble strmm_kernel_L4_M8_BEGIN .align 5 strmm_kernel_L4_M16_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #6 add pA, pA, temp lsl temp, tempOffset, #4 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #16 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 cmp counterL , #2 blt strmm_kernel_L4_M16_32 KERNEL16x4_I KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_M2 subs counterL, counterL, #2 ble strmm_kernel_L4_M16_22a .align 5 strmm_kernel_L4_M16_22: KERNEL16x4_M1 KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_M2 subs counterL, counterL, #1 bgt strmm_kernel_L4_M16_22 .align 5 strmm_kernel_L4_M16_22a: KERNEL16x4_M1 KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_E b strmm_kernel_L4_M16_44 .align 5 strmm_kernel_L4_M16_32: tst counterL, #1 ble strmm_kernel_L4_M16_40 KERNEL16x4_I KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_E b strmm_kernel_L4_M16_44 strmm_kernel_L4_M16_40: INIT16x4 strmm_kernel_L4_M16_44: ands counterL , tempK, #7 ble strmm_kernel_L4_M16_100 .align 5 strmm_kernel_L4_M16_46: KERNEL16x4_SUB subs counterL, counterL, #1 bne strmm_kernel_L4_M16_46 strmm_kernel_L4_M16_100: SAVE16x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #16 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #6 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #16 #endif prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] strmm_kernel_L4_M16_END: subs counterI, counterI, #1 bne strmm_kernel_L4_M16_20 //------------------------------------------------------------------------------ strmm_kernel_L4_M8_BEGIN: mov counterI, origM tst counterI , #15 ble strmm_kernel_L4_END tst counterI, #8 ble strmm_kernel_L4_M4_BEGIN strmm_kernel_L4_M8_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pA, pA, temp lsl temp, tempOffset, #4 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #8 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
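// The 8x4 block below uses the same I/M1/M2/E software-pipelining scheme as the
// 16x4 block above, here with counterL counting K/2 pairs: KERNEL8x4_I starts the
// accumulators with fmul (so no separate INIT is needed on this path) and preloads
// the v4/v5 + v12-v15 register set, KERNEL8x4_M1/_M2 alternate between the two
// register sets while loading the next one, KERNEL8x4_E drains the pipeline without
// loading, and an odd K iteration is finished by KERNEL8x4_SUB before SAVE8x4.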
blt strmm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K subs counterL, counterL, #2 ble strmm_kernel_L4_M8_22a .align 5 strmm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 bgt strmm_kernel_L4_M8_22 strmm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E b strmm_kernel_L4_M8_44 strmm_kernel_L4_M8_32: tst counterL, #1 ble strmm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_E b strmm_kernel_L4_M8_44 strmm_kernel_L4_M8_40: INIT8x4 strmm_kernel_L4_M8_44: ands counterL , tempK, #1 ble strmm_kernel_L4_M8_100 strmm_kernel_L4_M8_46: KERNEL8x4_SUB strmm_kernel_L4_M8_100: SAVE8x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #8 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #5 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #8 #endif strmm_kernel_L4_M8_END: //------------------------------------------------------------------------------ strmm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 ble strmm_kernel_L4_END tst counterI, #4 ble strmm_kernel_L4_M2_BEGIN strmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? blt strmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 ble strmm_kernel_L4_M4_22a .align 5 strmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 bgt strmm_kernel_L4_M4_22 strmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E b strmm_kernel_L4_M4_44 strmm_kernel_L4_M4_32: tst counterL, #1 ble strmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E b strmm_kernel_L4_M4_44 strmm_kernel_L4_M4_40: INIT4x4 strmm_kernel_L4_M4_44: ands counterL , tempK, #1 ble strmm_kernel_L4_M4_100 strmm_kernel_L4_M4_46: KERNEL4x4_SUB strmm_kernel_L4_M4_100: SAVE4x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #4 add pA, pA, temp add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif strmm_kernel_L4_M4_END: //------------------------------------------------------------------------------ strmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble strmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble strmm_kernel_L4_M1_BEGIN strmm_kernel_L4_M2_20: INIT2x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pA, pA, temp lsl temp, tempOffset, #4 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble strmm_kernel_L4_M2_40 strmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB subs counterL, counterL, #1 bgt 
strmm_kernel_L4_M2_22 strmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L4_M2_100 strmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 bgt strmm_kernel_L4_M2_42 strmm_kernel_L4_M2_100: SAVE2x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #3 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif strmm_kernel_L4_M2_END: strmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble strmm_kernel_L4_END strmm_kernel_L4_M1_20: INIT1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp lsl temp, tempOffset, #2 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble strmm_kernel_L4_M1_40 strmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs counterL, counterL, #1 bgt strmm_kernel_L4_M1_22 strmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L4_M1_100 strmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 bgt strmm_kernel_L4_M1_42 strmm_kernel_L4_M1_100: SAVE1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #1 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #2 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #1 #endif strmm_kernel_L4_END: add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 #if !defined(LEFT) add tempOffset, tempOffset, #4 #endif subs counterJ, counterJ , #1 // j-- bgt strmm_kernel_L4_BEGIN /******************************************************************************/ strmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble strmm_kernel_L999 tst counterJ , #2 ble strmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC add pC,pC,LDC, lsl #1 #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = A strmm_kernel_L2_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI,#0 ble strmm_kernel_L2_M8_BEGIN strmm_kernel_L2_M16_20: INIT16x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #6 add pA, pA, temp lsl temp, tempOffset, #3 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #16 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble strmm_kernel_L2_M16_40 .align 5 strmm_kernel_L2_M16_22: KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M16_22 strmm_kernel_L2_M16_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L2_M16_100 strmm_kernel_L2_M16_42: KERNEL16x2_SUB subs counterL, 
counterL, #1 bgt strmm_kernel_L2_M16_42 strmm_kernel_L2_M16_100: SAVE16x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #16 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #6 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #16 #endif strmm_kernel_L2_M16_END: subs counterI, counterI, #1 bgt strmm_kernel_L2_M16_20 //------------------------------------------------------------------------------ strmm_kernel_L2_M8_BEGIN: mov counterI, origM tst counterI , #15 ble strmm_kernel_L2_END tst counterI, #8 ble strmm_kernel_L2_M4_BEGIN strmm_kernel_L2_M8_20: INIT8x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pA, pA, temp lsl temp, tempOffset, #3 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #8 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble strmm_kernel_L2_M8_40 .align 5 strmm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M8_22 strmm_kernel_L2_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L2_M8_100 strmm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M8_42 strmm_kernel_L2_M8_100: SAVE8x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #8 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #5 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #8 #endif strmm_kernel_L2_M8_END: //------------------------------------------------------------------------------ strmm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 ble strmm_kernel_L2_END tst counterI, #4 ble strmm_kernel_L2_M2_BEGIN strmm_kernel_L2_M4_20: INIT4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #4 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble strmm_kernel_L2_M4_40 .align 5 strmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M4_22 strmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L2_M4_100 strmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M4_42 strmm_kernel_L2_M4_100: SAVE4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif strmm_kernel_L2_M4_END: 
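// TRMM offset bookkeeping used by each of these M blocks: tempK restricts the K loop
// to the part of the triangular operand that actually contributes (origK - tempOffset,
// or tempOffset plus the tile's M or N size, depending on LEFT/TRANSA); the lsl shift
// amounts convert K counts into byte offsets within the packed panels (elements per
// K step times 4 bytes, e.g. #4 for a 4-wide A tile, #3 for the 2-wide B panel of this
// section); and tempOffset advances by the M tile size after each block when LEFT is
// defined.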
//------------------------------------------------------------------------------ strmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble strmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble strmm_kernel_L2_M1_BEGIN strmm_kernel_L2_M2_20: INIT2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #3 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble strmm_kernel_L2_M2_40 strmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M2_22 strmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L2_M2_100 strmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M2_42 strmm_kernel_L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #3 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif strmm_kernel_L2_M2_END: strmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble strmm_kernel_L2_END strmm_kernel_L2_M1_20: INIT1x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #2 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 ble strmm_kernel_L2_M1_40 strmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M1_22 strmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L2_M1_100 strmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M1_42 strmm_kernel_L2_M1_100: SAVE1x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #1 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #2 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #1 #endif strmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 /******************************************************************************/ strmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble strmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = A strmm_kernel_L1_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 ble strmm_kernel_L1_M8_BEGIN strmm_kernel_L1_M16_20: INIT16x1 #if (defined(LEFT) && defined(TRANSA)) || 
(!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #6 add pA, pA, temp lsl temp, tempOffset, #2 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #16 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble strmm_kernel_L1_M16_40 .align 5 strmm_kernel_L1_M16_22: KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M16_22 strmm_kernel_L1_M16_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L1_M16_100 strmm_kernel_L1_M16_42: KERNEL16x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M16_42 strmm_kernel_L1_M16_100: SAVE16x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #16 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #6 add pA, pA, temp lsl temp, tempK, #2 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #16 #endif strmm_kernel_L1_M16_END: subs counterI, counterI, #1 bgt strmm_kernel_L1_M16_20 //------------------------------------------------------------------------------ strmm_kernel_L1_M8_BEGIN: mov counterI, origM tst counterI , #15 ble strmm_kernel_L1_END tst counterI, #8 ble strmm_kernel_L1_M4_BEGIN strmm_kernel_L1_M8_20: INIT8x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pA, pA, temp lsl temp, tempOffset, #2 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #8 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble strmm_kernel_L1_M8_40 .align 5 strmm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M8_22 strmm_kernel_L1_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L1_M8_100 strmm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M8_42 strmm_kernel_L1_M8_100: SAVE8x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #8 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #5 add pA, pA, temp lsl temp, tempK, #2 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #8 #endif strmm_kernel_L1_M8_END: //------------------------------------------------------------------------------ strmm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 ble strmm_kernel_L1_END tst counterI, #4 ble strmm_kernel_L1_M2_BEGIN strmm_kernel_L1_M4_20: INIT4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #2 add pB, pB, temp lsl temp, tempOffset, #4 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble strmm_kernel_L1_M4_40 .align 5 
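// The single-column (N=1) blocks are not software-pipelined: each inner loop unrolls
// eight KERNEL*x1_SUB steps per iteration (counterL = K/8), the *_40/*_42 tails handle
// the remaining K % 8 iterations one at a time, and SAVE*x1 scales the accumulators by
// alpha as they are stored to the single column of C.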
strmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M4_22 strmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L1_M4_100 strmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M4_42 strmm_kernel_L1_M4_100: SAVE4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #2 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif strmm_kernel_L1_M4_END: //------------------------------------------------------------------------------ strmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 ble strmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 ble strmm_kernel_L1_M1_BEGIN strmm_kernel_L1_M2_20: INIT2x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #2 add pB, pB, temp lsl temp, tempOffset, #3 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble strmm_kernel_L1_M2_40 strmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M2_22 strmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L1_M2_100 strmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M2_42 strmm_kernel_L1_M2_100: SAVE2x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #3 add pA, pA, temp lsl temp, tempK, #2 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif strmm_kernel_L1_M2_END: strmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble strmm_kernel_L1_END strmm_kernel_L1_M1_20: INIT1x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #2 add pB, pB, temp lsl temp, tempOffset, #2 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble strmm_kernel_L1_M1_40 strmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M1_22 strmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L1_M1_100 strmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M1_42 strmm_kernel_L1_M1_100: SAVE1x1 strmm_kernel_L1_END: strmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, 
#(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/strmm_kernel_4x4.S000066400000000000000000000635531313527062700207160ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 X3 x4 x5 x6 x7 */ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define offset x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pA x15 #define temp x16 #define tempOffset x17 #define tempK x18 #define alpha0 s10 #define alphaV0 v10.s[0] #define alpha1 s11 #define alphaV1 v11.s[0] #define alpha2 s14 #define alphaV2 v14.s[0] #define alpha3 s15 #define alphaV3 v15.s[0] // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 offset // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pA // 16 temp // 17 tempOffset // 18 must save tempK // 19 must save // 20 must save // 21 must save // 22 must save // 23 must save // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp //v00 ALPHA -> pA00, pA01 //v01 pA02, pA03 //v02 //v03 //v04 pA10, pA11 //v05 pA12, pA13 //v06 //v07 //v08 must save pB00, pB01 //v09 must save pB02, pB03 //v10 must save ALPHA0 //v11 must save ALPHA1 //v12 must save pB10, pB11 //v13 must save pB12, pB13 //v14 must save ALPHA2 //v15 must save ALPHA3 //v16 must save C00, C01 //v17 must save C02, C03 //v18 //v19 //v20 C10, C11 //v21 C12, C13 //v22 //v23 //v24 C20, C21 //v25 C22, C23 //v26 //v27 //v28 C30, C31 //v29 C32, C33 //v30 //v31 /******************************************************************************* * Macro definitions *******************************************************************************/ .macro INIT4x4 fmov s16, wzr fmov s17, s16 fmov s20, s17 fmov s21, s16 fmov s24, s17 fmov s25, s16 fmov s28, s17 fmov s29, s16 .endm .macro KERNEL4x4_I ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 fmul v16.2s, v0.2s, v8.s[0] fmul v29.2s, v1.2s, v9.s[1] fmul v20.2s, v0.2s, v8.s[1] fmul v25.2s, v1.2s, v9.s[0] fmul v24.2s, v0.2s, v9.s[0] fmul v21.2s, v1.2s, v8.s[1] fmul v28.2s, v0.2s, v9.s[1] fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 ld1 {v4.2s, v5.2s}, [pA] add pA, pA, #16 .endm .macro KERNEL4x4_M1 fmla v16.2s, v0.2s, v8.s[0] fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 fmla v20.2s, v0.2s, v8.s[1] fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 fmla v24.2s, v0.2s, v9.s[0] fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] fmla v28.2s, v0.2s, v9.s[1] fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 fmla v16.2s, v4.2s, v12.s[0] fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 fmla v20.2s, v4.2s, v12.s[1] fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 fmla v24.2s, v4.2s, v13.s[0] fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] fmla v28.2s, v4.2s, v13.s[1] fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E fmla v16.2s, v4.2s, v12.s[0] fmla v29.2s, v5.2s, v13.s[1] fmla v20.2s, v4.2s, v12.s[1] fmla v25.2s, v5.2s, v13.s[0] fmla v24.2s, v4.2s, v13.s[0] fmla v21.2s, v5.2s, v12.s[1] fmla v28.2s, v4.2s, v13.s[1] fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB ld1 {v8.2s, 
v9.2s}, [pB] add pB, pB, #16 ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 fmla v16.2s, v0.2s, v8.s[0] fmla v29.2s, v1.2s, v9.s[1] fmla v20.2s, v0.2s, v8.s[1] fmla v25.2s, v1.2s, v9.s[0] fmla v24.2s, v0.2s, v9.s[0] fmla v21.2s, v1.2s, v8.s[1] fmla v28.2s, v0.2s, v9.s[1] fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 fmul v8.2s, v16.2s, alphaV0 fmul v9.2s, v17.2s, alphaV1 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow1, pCRow0, LDC fmul v12.2s, v20.2s, alphaV2 fmul v13.2s, v21.2s, alphaV3 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow2, pCRow1, LDC fmul v8.2s, v24.2s, alphaV0 fmul v9.2s, v25.2s, alphaV1 st1 {v8.2s, v9.2s}, [pCRow2] add pCRow1, pCRow2, LDC fmul v12.2s, v28.2s, alphaV2 fmul v13.2s, v29.2s, alphaV3 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT2x4 fmov s16, wzr fmov s20, s16 fmov s24, s20 fmov s28, s16 .endm .macro KERNEL2x4_SUB ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld1 {v0.2s}, [pA] add pA, pA, #8 fmla v16.2s, v0.2s, v8.s[0] fmla v20.2s, v0.2s, v8.s[1] fmla v24.2s, v0.2s, v9.s[0] fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 fmul v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow1, pCRow0, LDC fmul v12.2s, v20.2s, alphaV1 st1 {v12.2s}, [pCRow1] add pCRow2, pCRow1, LDC fmul v8.2s, v24.2s, alphaV2 st1 {v8.2s}, [pCRow2] add pCRow1, pCRow2, LDC fmul v12.2s, v28.2s, alphaV3 st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT1x4 fmov s16, wzr fmov s20, s16 .endm .macro KERNEL1x4_SUB ldr s0, [pA] add pA, pA, #4 ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 fmla v16.2s, v8.2s, v0.s[0] fmla v20.2s, v9.2s, v0.s[0] .endm .macro SAVE1x4 add pCRow1, pCRow0, LDC fmul v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], [pCRow0] st1 {v8.s}[1], [pCRow1] add pCRow2, pCRow1, LDC add pCRow1, pCRow2, LDC fmul v12.2s, v20.2s, alphaV1 st1 {v12.s}[0], [pCRow2] st1 {v12.s}[1], [pCRow1] add pCRow0, pCRow0, #4 .endm /******************************************************************************/ .macro INIT4x2 fmov s16, wzr fmov s17, s16 fmov s20, s17 fmov s21, s16 .endm .macro KERNEL4x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 fmla v16.2s, v0.2s, v8.s[0] fmla v17.2s, v1.2s, v8.s[0] fmla v20.2s, v0.2s, v8.s[1] fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 fmul v8.2s, v16.2s, alphaV0 fmul v9.2s, v17.2s, alphaV1 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow1, pCRow0, LDC fmul v12.2s, v20.2s, alphaV2 fmul v13.2s, v21.2s, alphaV3 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT2x2 fmov s16, wzr fmov s20, s16 .endm .macro KERNEL2x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.2s}, [pA] add pA, pA, #8 fmla v16.2s, v0.2s, v8.s[0] fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 fmul v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow1 , pCRow0, LDC fmul v12.2s, v20.2s, alphaV1 st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT1x2 fmov s16, wzr .endm .macro KERNEL1x2_SUB ld1 {v8.2s} , [pB] add pB , pB, #8 ldr s0 , [pA] add pA, pA, #4 fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 add pCRow1 , pCRow0, LDC fmul v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], [pCRow0] st1 {v8.s}[1], [pCRow1] add pCRow0, pCRow0, #4 .endm /******************************************************************************/ .macro INIT4x1 fmov 
s16, wzr fmov s17, s16 .endm .macro KERNEL4x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 fmla v16.2s, v0.2s, v8.s[0] fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 fmul v8.2s, v16.2s, alphaV0 fmul v9.2s, v17.2s, alphaV1 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT2x1 fmov s16, wzr .endm .macro KERNEL2x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.2s}, [pA] add pA , pA, #8 fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 fmul v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT1x1 fmov s16, wzr .endm .macro KERNEL1x1_SUB ldr s8, [pB] add pB , pB, #4 ldr s0, [pA] add pA , pA, #4 fmadd s16, s0, s8, s16 .endm .macro SAVE1x1 fmul s8, s16, alpha0 str s8, [pCRow0] add pCRow0, pCRow0, #4 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE strmm_kernel_begin: .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] fmov alpha0, s0 fmov alpha1, s0 fmov alpha2, s0 fmov alpha3, s0 lsl LDC, LDC, #2 // ldc = ldc * 4 #if !defined(LEFT) neg tempOffset, offset #endif mov pB, origPB mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble strmm_kernel_L2_BEGIN /******************************************************************************/ strmm_kernel_L4_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = start of A array strmm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble strmm_kernel_L4_M2_BEGIN strmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
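// K is processed in pairs in the 4x4 block below (counterL = tempK / 2): KERNEL4x4_I
// primes the accumulators and preloads the second register set, KERNEL4x4_M1/_M2
// ping-pong between the v0/v1 + v8/v9 and v4/v5 + v12/v13 sets, and KERNEL4x4_E drains
// the pipeline without loading; a single remaining pair takes the _32 path (I then E),
// no full pair falls through to INIT4x4, and an odd K iteration is finished by
// KERNEL4x4_SUB before SAVE4x4 applies alpha.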
blt strmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 ble strmm_kernel_L4_M4_22a .align 5 strmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 bgt strmm_kernel_L4_M4_22 strmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E b strmm_kernel_L4_M4_44 strmm_kernel_L4_M4_32: tst counterL, #1 ble strmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E b strmm_kernel_L4_M4_44 strmm_kernel_L4_M4_40: INIT4x4 strmm_kernel_L4_M4_44: ands counterL , tempK, #1 ble strmm_kernel_L4_M4_100 strmm_kernel_L4_M4_46: KERNEL4x4_SUB strmm_kernel_L4_M4_100: SAVE4x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #4 add pA, pA, temp add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif strmm_kernel_L4_M4_END: subs counterI, counterI, #1 bne strmm_kernel_L4_M4_20 strmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble strmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble strmm_kernel_L4_M1_BEGIN strmm_kernel_L4_M2_20: INIT2x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pA, pA, temp lsl temp, tempOffset, #4 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble strmm_kernel_L4_M2_40 strmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB subs counterL, counterL, #1 bgt strmm_kernel_L4_M2_22 strmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L4_M2_100 strmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 bgt strmm_kernel_L4_M2_42 strmm_kernel_L4_M2_100: SAVE2x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #3 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif strmm_kernel_L4_M2_END: strmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble strmm_kernel_L4_END strmm_kernel_L4_M1_20: INIT1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp lsl temp, tempOffset, #2 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble strmm_kernel_L4_M1_40 strmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs counterL, counterL, #1 bgt strmm_kernel_L4_M1_22 strmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L4_M1_100 strmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 bgt strmm_kernel_L4_M1_42 strmm_kernel_L4_M1_100: SAVE1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #1 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #2 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #1 #endif strmm_kernel_L4_END: add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 #if !defined(LEFT) add tempOffset, tempOffset, #4 #endif subs counterJ, counterJ , #1 // j-- bgt strmm_kernel_L4_BEGIN /******************************************************************************/ strmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble strmm_kernel_L999 tst counterJ , #2 ble strmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC add pC,pC,LDC, lsl #1 #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = A strmm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 ble strmm_kernel_L2_M2_BEGIN strmm_kernel_L2_M4_20: INIT4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #4 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble strmm_kernel_L2_M4_40 .align 5 strmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M4_22 strmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L2_M4_100 strmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M4_42 strmm_kernel_L2_M4_100: SAVE4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif strmm_kernel_L2_M4_END: subs counterI, counterI, #1 bgt strmm_kernel_L2_M4_20 strmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble strmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble strmm_kernel_L2_M1_BEGIN strmm_kernel_L2_M2_20: INIT2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #3 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble strmm_kernel_L2_M2_40 strmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M2_22 strmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L2_M2_100 strmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M2_42 strmm_kernel_L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, 
#2 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #3 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif strmm_kernel_L2_M2_END: strmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble strmm_kernel_L2_END strmm_kernel_L2_M1_20: INIT1x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #2 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 ble strmm_kernel_L2_M1_40 strmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M1_22 strmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L2_M1_100 strmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M1_42 strmm_kernel_L2_M1_100: SAVE1x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #1 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #2 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #1 #endif strmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 /******************************************************************************/ strmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble strmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = A strmm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble strmm_kernel_L1_M2_BEGIN strmm_kernel_L1_M4_20: INIT4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #2 add pB, pB, temp lsl temp, tempOffset, #4 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble strmm_kernel_L1_M4_40 .align 5 strmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M4_22 strmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L1_M4_100 strmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M4_42 strmm_kernel_L1_M4_100: SAVE4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #2 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif strmm_kernel_L1_M4_END: subs counterI, counterI, #1 bgt strmm_kernel_L1_M4_20 strmm_kernel_L1_M2_BEGIN: mov counterI, origM tst 
counterI , #3 ble strmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 ble strmm_kernel_L1_M1_BEGIN strmm_kernel_L1_M2_20: INIT2x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #2 add pB, pB, temp lsl temp, tempOffset, #3 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble strmm_kernel_L1_M2_40 strmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M2_22 strmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L1_M2_100 strmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M2_42 strmm_kernel_L1_M2_100: SAVE2x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #3 add pA, pA, temp lsl temp, tempK, #2 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif strmm_kernel_L1_M2_END: strmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble strmm_kernel_L1_END strmm_kernel_L1_M1_20: INIT1x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #2 add pB, pB, temp lsl temp, tempOffset, #2 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble strmm_kernel_L1_M1_40 strmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M1_22 strmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L1_M1_100 strmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M1_42 strmm_kernel_L1_M1_100: SAVE1x1 #if 0 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #1 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #2 add pA, pA, temp lsl temp, tempK, #2 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #1 #endif #endif strmm_kernel_L1_END: #if 0 #if !defined(LEFT) add tempOffset, tempOffset, #1 #endif #endif strmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/strmm_kernel_8x8.S000066400000000000000000001474501313527062700207250ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 X3 x4 x5 x6 x7 */ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define offset x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pA x15 #define temp x16 #define tempOffset x17 #define tempK x18 #define alpha0 s10 #define alphaV0 v10.s[0] #define alpha1 s11 #define alphaV1 v11.s[0] #define alpha2 s14 #define alphaV2 v14.s[0] #define alpha3 s15 #define alphaV3 v15.s[0] // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 offset // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pA // 16 temp // 17 tempOffset // 18 must save tempK // 19 must save // 20 must save // 21 must save // 22 must save // 23 must save // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp //v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3 //v01 pA0_4, pA0_5, pA0_6, pA0_7 //v02 pA1_0, pA1_1, pA1_2, pA1_3 //v03 pA1_4, pA1_5, pA1_6, pA1_7 //v04 pB0_0, pB0_1, pB0_2, pB0_3 //v05 pB0_4, pB0_5, pB0_6, pB0_7 //v06 pB1_0, pB1_1, pB1_2, pB1_3 //v07 pB1_4, pB1_5, pB1_6, pB1_7 //v08 must save //v09 must save //v10 must save ALPHA0 //v11 must save ALPHA1 //v12 must save //v13 must save //v14 must save ALPHA2 //v15 must save ALPHA3 //v16 must save C00, C01, C02, C03 //v17 must save C04, C05, C06, C07 //v18 C08, C09, C10, C11 //v19 C12, C13, C14, C15 //v20 C16, C17, C18, C19 //v21 C20, C21, C22, C23 //v22 C24, C25, C26, C27 //v23 C28, C29, C30, C31 //v24 C32, C33, C34, C35 //v25 C36, C37, C38, C39 //v26 C40, C41, C42, C43 //v27 C44, C45, C46, C47 //v28 C48, C49, C50, C51 //v29 C52, C53, C54, C55 //v30 C56, C57, C58, C59 //v31 C60, C61, C62, C63 
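// Note on the structure below: the 8x8 micro-kernel keeps the whole 8x8
// tile of C in v16-v31 (two 4-lane float registers per column of the tile).
// The K loop is software pipelined: KERNEL8x8_I primes the accumulators
// with fmul and preloads the next A/B panels, KERNEL8x8_M1 and KERNEL8x8_M2
// alternate between the {v0,v1}x{v4,v5} and {v2,v3}x{v6,v7} operand sets
// while issuing fmla, and KERNEL8x8_E drains the pipeline without further
// loads.  KERNEL8x8_SUB is the simple, non-pipelined step used for the K
// remainder.  The SAVE macros scale the accumulators by alpha and store
// them to C; the TRMM kernel overwrites C rather than accumulating into it.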
/******************************************************************************* * Macro definitions *******************************************************************************/ .macro INIT8x8 fmov s16, wzr fmov s17, wzr fmov s18, s16 fmov s19, s17 fmov s20, wzr fmov s21, s16 fmov s22, s17 fmov s23, s18 fmov s24, wzr fmov s25, s16 fmov s26, s17 fmov s27, s18 fmov s28, wzr fmov s29, s16 fmov s30, s17 fmov s31, s18 .endm .macro KERNEL8x8_I ld1 {v4.4s}, [pB] add pB, pB, #16 ld1 {v5.4s}, [pB] add pB, pB, #16 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 fmul v16.4s, v0.4s, v4.s[0] fmul v17.4s, v1.4s, v4.s[0] fmul v18.4s, v0.4s, v4.s[1] fmul v19.4s, v1.4s, v4.s[1] fmul v20.4s, v0.4s, v4.s[2] fmul v21.4s, v1.4s, v4.s[2] fmul v22.4s, v0.4s, v4.s[3] fmul v23.4s, v1.4s, v4.s[3] fmul v24.4s, v0.4s, v5.s[0] fmul v25.4s, v1.4s, v5.s[0] fmul v26.4s, v0.4s, v5.s[1] fmul v27.4s, v1.4s, v5.s[1] fmul v28.4s, v0.4s, v5.s[2] fmul v29.4s, v1.4s, v5.s[2] fmul v30.4s, v0.4s, v5.s[3] fmul v31.4s, v1.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 ld1 {v7.4s}, [pB] add pB, pB, #16 ld1 {v2.4s}, [pA] add pA, pA, #16 ld1 {v3.4s}, [pA] add pA, pA, #16 .endm .macro KERNEL8x8_M1 fmla v16.4s, v0.4s, v4.s[0] fmla v17.4s, v1.4s, v4.s[0] fmla v18.4s, v0.4s, v4.s[1] fmla v19.4s, v1.4s, v4.s[1] fmla v20.4s, v0.4s, v4.s[2] fmla v21.4s, v1.4s, v4.s[2] fmla v22.4s, v0.4s, v4.s[3] fmla v23.4s, v1.4s, v4.s[3] fmla v24.4s, v0.4s, v5.s[0] fmla v25.4s, v1.4s, v5.s[0] fmla v26.4s, v0.4s, v5.s[1] fmla v27.4s, v1.4s, v5.s[1] fmla v28.4s, v0.4s, v5.s[2] fmla v29.4s, v1.4s, v5.s[2] fmla v30.4s, v0.4s, v5.s[3] fmla v31.4s, v1.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 ld1 {v7.4s}, [pB] add pB, pB, #16 ld1 {v2.4s}, [pA] add pA, pA, #16 ld1 {v3.4s}, [pA] add pA, pA, #16 .endm .macro KERNEL8x8_M2 fmla v16.4s, v2.4s, v6.s[0] fmla v17.4s, v3.4s, v6.s[0] fmla v18.4s, v2.4s, v6.s[1] fmla v19.4s, v3.4s, v6.s[1] fmla v20.4s, v2.4s, v6.s[2] fmla v21.4s, v3.4s, v6.s[2] fmla v22.4s, v2.4s, v6.s[3] fmla v23.4s, v3.4s, v6.s[3] fmla v24.4s, v2.4s, v7.s[0] fmla v25.4s, v3.4s, v7.s[0] fmla v26.4s, v2.4s, v7.s[1] fmla v27.4s, v3.4s, v7.s[1] fmla v28.4s, v2.4s, v7.s[2] fmla v29.4s, v3.4s, v7.s[2] fmla v30.4s, v2.4s, v7.s[3] fmla v31.4s, v3.4s, v7.s[3] ld1 {v4.4s}, [pB] add pB, pB, #16 ld1 {v5.4s}, [pB] add pB, pB, #16 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 .endm .macro KERNEL8x8_E fmla v16.4s, v2.4s, v6.s[0] fmla v17.4s, v3.4s, v6.s[0] fmla v18.4s, v2.4s, v6.s[1] fmla v19.4s, v3.4s, v6.s[1] fmla v20.4s, v2.4s, v6.s[2] fmla v21.4s, v3.4s, v6.s[2] fmla v22.4s, v2.4s, v6.s[3] fmla v23.4s, v3.4s, v6.s[3] fmla v24.4s, v2.4s, v7.s[0] fmla v25.4s, v3.4s, v7.s[0] fmla v26.4s, v2.4s, v7.s[1] fmla v27.4s, v3.4s, v7.s[1] fmla v28.4s, v2.4s, v7.s[2] fmla v29.4s, v3.4s, v7.s[2] fmla v30.4s, v2.4s, v7.s[3] fmla v31.4s, v3.4s, v7.s[3] .endm .macro KERNEL8x8_SUB ld1 {v4.4s}, [pB] add pB, pB, #16 ld1 {v5.4s}, [pB] add pB, pB, #16 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 fmla v16.4s, v0.4s, v4.s[0] fmla v17.4s, v1.4s, v4.s[0] fmla v18.4s, v0.4s, v4.s[1] fmla v19.4s, v1.4s, v4.s[1] fmla v20.4s, v0.4s, v4.s[2] fmla v21.4s, v1.4s, v4.s[2] fmla v22.4s, v0.4s, v4.s[3] fmla v23.4s, v1.4s, v4.s[3] fmla v24.4s, v0.4s, v5.s[0] fmla v25.4s, v1.4s, v5.s[0] fmla v26.4s, v0.4s, v5.s[1] fmla v27.4s, v1.4s, v5.s[1] fmla v28.4s, v0.4s, v5.s[2] fmla v29.4s, v1.4s, v5.s[2] fmla v30.4s, v0.4s, v5.s[3] fmla v31.4s, v1.4s, v5.s[3] .endm .macro SAVE8x8 add pCRow1, pCRow0, LDC fmul v0.4s, v16.4s, alphaV0 fmul v1.4s, 
v17.4s, alphaV1 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow2, pCRow1, LDC fmul v2.4s, v18.4s, alphaV2 fmul v3.4s, v19.4s, alphaV3 st1 {v2.4s, v3.4s}, [pCRow1] add pCRow1, pCRow2, LDC fmul v4.4s, v20.4s, alphaV0 fmul v5.4s, v21.4s, alphaV1 st1 {v4.4s, v5.4s}, [pCRow2] add pCRow2, pCRow1, LDC fmul v6.4s, v22.4s, alphaV2 fmul v7.4s, v23.4s, alphaV3 st1 {v6.4s, v7.4s}, [pCRow1] add pCRow1, pCRow2, LDC fmul v0.4s, v24.4s, alphaV0 fmul v1.4s, v25.4s, alphaV1 st1 {v0.4s, v1.4s}, [pCRow2] add pCRow2, pCRow1, LDC fmul v2.4s, v26.4s, alphaV2 fmul v3.4s, v27.4s, alphaV3 st1 {v2.4s, v3.4s}, [pCRow1] add pCRow1, pCRow2, LDC fmul v4.4s, v28.4s, alphaV0 fmul v5.4s, v29.4s, alphaV1 st1 {v4.4s, v5.4s}, [pCRow2] fmul v6.4s, v30.4s, alphaV2 fmul v7.4s, v31.4s, alphaV3 st1 {v6.4s, v7.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT4x8 fmov s16, wzr fmov s18, wzr fmov s20, wzr fmov s22, s16 fmov s24, wzr fmov s26, s16 fmov s28, s18 fmov s30, s20 .endm .macro KERNEL4x8_I ld1 {v4.4s}, [pB] add pB, pB, #16 ld1 {v5.4s}, [pB] add pB, pB, #16 ld1 {v0.4s}, [pA] add pA, pA, #16 fmul v16.4s, v0.4s, v4.s[0] fmul v18.4s, v0.4s, v4.s[1] fmul v20.4s, v0.4s, v4.s[2] fmul v22.4s, v0.4s, v4.s[3] fmul v24.4s, v0.4s, v5.s[0] fmul v26.4s, v0.4s, v5.s[1] fmul v28.4s, v0.4s, v5.s[2] fmul v30.4s, v0.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 ld1 {v7.4s}, [pB] add pB, pB, #16 ld1 {v2.4s}, [pA] add pA, pA, #16 .endm .macro KERNEL4x8_M1 fmla v16.4s, v0.4s, v4.s[0] fmla v18.4s, v0.4s, v4.s[1] fmla v20.4s, v0.4s, v4.s[2] fmla v22.4s, v0.4s, v4.s[3] fmla v24.4s, v0.4s, v5.s[0] fmla v26.4s, v0.4s, v5.s[1] fmla v28.4s, v0.4s, v5.s[2] fmla v30.4s, v0.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 ld1 {v7.4s}, [pB] add pB, pB, #16 ld1 {v2.4s}, [pA] add pA, pA, #16 .endm .macro KERNEL4x8_M2 fmla v16.4s, v2.4s, v6.s[0] fmla v18.4s, v2.4s, v6.s[1] fmla v20.4s, v2.4s, v6.s[2] fmla v22.4s, v2.4s, v6.s[3] fmla v24.4s, v2.4s, v7.s[0] fmla v26.4s, v2.4s, v7.s[1] fmla v28.4s, v2.4s, v7.s[2] fmla v30.4s, v2.4s, v7.s[3] ld1 {v4.4s}, [pB] add pB, pB, #16 ld1 {v5.4s}, [pB] add pB, pB, #16 ld1 {v0.4s}, [pA] add pA, pA, #16 .endm .macro KERNEL4x8_E fmla v16.4s, v2.4s, v6.s[0] fmla v18.4s, v2.4s, v6.s[1] fmla v20.4s, v2.4s, v6.s[2] fmla v22.4s, v2.4s, v6.s[3] fmla v24.4s, v2.4s, v7.s[0] fmla v26.4s, v2.4s, v7.s[1] fmla v28.4s, v2.4s, v7.s[2] fmla v30.4s, v2.4s, v7.s[3] .endm .macro KERNEL4x8_SUB ld1 {v4.4s}, [pB] add pB, pB, #16 ld1 {v5.4s}, [pB] add pB, pB, #16 ld1 {v0.4s}, [pA] add pA, pA, #16 fmla v16.4s, v0.4s, v4.s[0] fmla v18.4s, v0.4s, v4.s[1] fmla v20.4s, v0.4s, v4.s[2] fmla v22.4s, v0.4s, v4.s[3] fmla v24.4s, v0.4s, v5.s[0] fmla v26.4s, v0.4s, v5.s[1] fmla v28.4s, v0.4s, v5.s[2] fmla v30.4s, v0.4s, v5.s[3] .endm .macro SAVE4x8 add pCRow1, pCRow0, LDC fmul v0.4s, v16.4s, alphaV0 st1 {v0.4s}, [pCRow0] add pCRow2, pCRow1, LDC fmul v2.4s, v18.4s, alphaV2 st1 {v2.4s}, [pCRow1] add pCRow1, pCRow2, LDC fmul v4.4s, v20.4s, alphaV0 st1 {v4.4s}, [pCRow2] add pCRow2, pCRow1, LDC fmul v6.4s, v22.4s, alphaV2 st1 {v6.4s}, [pCRow1] add pCRow1, pCRow2, LDC fmul v0.4s, v24.4s, alphaV0 st1 {v0.4s}, [pCRow2] add pCRow2, pCRow1, LDC fmul v2.4s, v26.4s, alphaV2 st1 {v2.4s}, [pCRow1] add pCRow1, pCRow2, LDC fmul v4.4s, v28.4s, alphaV0 st1 {v4.4s}, [pCRow2] fmul v6.4s, v30.4s, alphaV2 st1 {v6.4s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT2x8 fmov s16, wzr fmov s18, wzr fmov s20, wzr fmov s22, s16 
fmov s24, wzr fmov s26, s16 fmov s28, s18 fmov s30, s20 .endm .macro KERNEL2x8_SUB ld1 {v4.4s}, [pB] add pB, pB, #16 ld1 {v5.4s}, [pB] add pB, pB, #16 ld1 {v0.2s}, [pA] add pA, pA, #8 fmla v16.2s, v0.2s, v4.s[0] fmla v18.2s, v0.2s, v4.s[1] fmla v20.2s, v0.2s, v4.s[2] fmla v22.2s, v0.2s, v4.s[3] fmla v24.2s, v0.2s, v5.s[0] fmla v26.2s, v0.2s, v5.s[1] fmla v28.2s, v0.2s, v5.s[2] fmla v30.2s, v0.2s, v5.s[3] .endm .macro SAVE2x8 add pCRow1, pCRow0, LDC fmul v0.2s, v16.2s, alphaV0 st1 {v0.2s}, [pCRow0] add pCRow2, pCRow1, LDC fmul v2.2s, v18.2s, alphaV2 st1 {v2.2s}, [pCRow1] add pCRow1, pCRow2, LDC fmul v4.2s, v20.2s, alphaV0 st1 {v4.2s}, [pCRow2] add pCRow2, pCRow1, LDC fmul v6.2s, v22.2s, alphaV2 st1 {v6.2s}, [pCRow1] add pCRow1, pCRow2, LDC fmul v0.2s, v24.2s, alphaV0 st1 {v0.2s}, [pCRow2] add pCRow2, pCRow1, LDC fmul v2.2s, v26.2s, alphaV2 st1 {v2.2s}, [pCRow1] add pCRow1, pCRow2, LDC fmul v4.2s, v28.2s, alphaV0 st1 {v4.2s}, [pCRow2] fmul v6.2s, v30.2s, alphaV2 st1 {v6.2s}, [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT1x8 fmov s16, wzr fmov s18, wzr fmov s20, wzr fmov s22, s16 fmov s24, wzr fmov s26, s16 fmov s28, s18 fmov s30, s20 .endm .macro KERNEL1x8_SUB ld1 {v4.4s}, [pB] add pB, pB, #16 ld1 {v5.4s}, [pB] add pB, pB, #16 ldr s0, [pA] add pA, pA, #4 fmla s16, s0, v4.s[0] fmla s18, s0, v4.s[1] fmla s20, s0, v4.s[2] fmla s22, s0, v4.s[3] fmla s24, s0, v5.s[0] fmla s26, s0, v5.s[1] fmla s28, s0, v5.s[2] fmla s30, s0, v5.s[3] .endm .macro SAVE1x8 add pCRow1, pCRow0, LDC fmul s0, s16, alphaV0 str s0, [pCRow0] add pCRow2, pCRow1, LDC fmul s2, s18, alphaV2 str s2, [pCRow1] add pCRow1, pCRow2, LDC fmul s4, s20, alphaV0 str s4, [pCRow2] add pCRow2, pCRow1, LDC fmul s6, s22, alphaV2 str s6, [pCRow1] add pCRow1, pCRow2, LDC fmul s0, s24, alphaV0 str s0, [pCRow2] add pCRow2, pCRow1, LDC fmul s2, s26, alphaV2 str s2, [pCRow1] add pCRow1, pCRow2, LDC fmul s4, s28, alphaV0 str s4, [pCRow2] fmul s6, s30, alphaV2 str s6, [pCRow1] add pCRow0, pCRow0, #4 .endm /******************************************************************************/ .macro INIT8x4 fmov s16, wzr fmov s17, wzr fmov s20, wzr fmov s21, s16 fmov s24, wzr fmov s25, s16 fmov s28, wzr fmov s29, s16 .endm .macro KERNEL8x4_I ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 fmul v16.4s, v0.4s, v8.s[0] fmul v17.4s, v1.4s, v8.s[0] fmul v20.4s, v0.4s, v8.s[1] fmul v21.4s, v1.4s, v8.s[1] fmul v24.4s, v0.4s, v9.s[0] fmul v25.4s, v1.4s, v9.s[0] fmul v28.4s, v0.4s, v9.s[1] fmul v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 ld1 {v4.4s}, [pA] add pA, pA, #16 ld1 {v5.4s}, [pA] add pA, pA, #16 .endm .macro KERNEL8x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v20.4s, v0.4s, v8.s[1] fmla v21.4s, v1.4s, v8.s[1] fmla v24.4s, v0.4s, v9.s[0] fmla v25.4s, v1.4s, v9.s[0] fmla v28.4s, v0.4s, v9.s[1] fmla v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 ld1 {v4.4s}, [pA] add pA, pA, #16 ld1 {v5.4s}, [pA] add pA, pA, #16 .endm .macro KERNEL8x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] fmla v20.4s, v4.4s, v12.s[1] fmla v21.4s, v5.4s, v12.s[1] fmla v24.4s, v4.4s, v13.s[0] fmla v25.4s, v5.4s, v13.s[0] fmla v28.4s, v4.4s, v13.s[1] fmla v29.4s, v5.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 .endm .macro KERNEL8x4_E fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] 
fmla v20.4s, v4.4s, v12.s[1] fmla v21.4s, v5.4s, v12.s[1] fmla v24.4s, v4.4s, v13.s[0] fmla v25.4s, v5.4s, v13.s[0] fmla v28.4s, v4.4s, v13.s[1] fmla v29.4s, v5.4s, v13.s[1] .endm .macro KERNEL8x4_SUB ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v20.4s, v0.4s, v8.s[1] fmla v21.4s, v1.4s, v8.s[1] fmla v24.4s, v0.4s, v9.s[0] fmla v25.4s, v1.4s, v9.s[0] fmla v28.4s, v0.4s, v9.s[1] fmla v29.4s, v1.4s, v9.s[1] .endm .macro SAVE8x4 add pCRow1, pCRow0, LDC fmul v0.4s, v16.4s, alphaV0 fmul v1.4s, v17.4s, alphaV1 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow2, pCRow1, LDC fmul v4.4s, v20.4s, alphaV0 fmul v5.4s, v21.4s, alphaV1 st1 {v4.4s, v5.4s}, [pCRow1] add pCRow1, pCRow2, LDC fmul v0.4s, v24.4s, alphaV0 fmul v1.4s, v25.4s, alphaV1 st1 {v0.4s, v1.4s}, [pCRow2] fmul v4.4s, v28.4s, alphaV0 fmul v5.4s, v29.4s, alphaV1 st1 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT4x4 fmov s16, wzr fmov s17, s16 fmov s20, s17 fmov s21, s16 fmov s24, s17 fmov s25, s16 fmov s28, s17 fmov s29, s16 .endm .macro KERNEL4x4_I ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 fmul v16.2s, v0.2s, v8.s[0] fmul v29.2s, v1.2s, v9.s[1] fmul v20.2s, v0.2s, v8.s[1] fmul v25.2s, v1.2s, v9.s[0] fmul v24.2s, v0.2s, v9.s[0] fmul v21.2s, v1.2s, v8.s[1] fmul v28.2s, v0.2s, v9.s[1] fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 ld1 {v4.2s, v5.2s}, [pA] add pA, pA, #16 .endm .macro KERNEL4x4_M1 fmla v16.2s, v0.2s, v8.s[0] fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 fmla v20.2s, v0.2s, v8.s[1] fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 fmla v24.2s, v0.2s, v9.s[0] fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] fmla v28.2s, v0.2s, v9.s[1] fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 fmla v16.2s, v4.2s, v12.s[0] fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 fmla v20.2s, v4.2s, v12.s[1] fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 fmla v24.2s, v4.2s, v13.s[0] fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] fmla v28.2s, v4.2s, v13.s[1] fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E fmla v16.2s, v4.2s, v12.s[0] fmla v29.2s, v5.2s, v13.s[1] fmla v20.2s, v4.2s, v12.s[1] fmla v25.2s, v5.2s, v13.s[0] fmla v24.2s, v4.2s, v13.s[0] fmla v21.2s, v5.2s, v12.s[1] fmla v28.2s, v4.2s, v13.s[1] fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 fmla v16.2s, v0.2s, v8.s[0] fmla v29.2s, v1.2s, v9.s[1] fmla v20.2s, v0.2s, v8.s[1] fmla v25.2s, v1.2s, v9.s[0] fmla v24.2s, v0.2s, v9.s[0] fmla v21.2s, v1.2s, v8.s[1] fmla v28.2s, v0.2s, v9.s[1] fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 fmul v8.2s, v16.2s, alphaV0 fmul v9.2s, v17.2s, alphaV1 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow1, pCRow0, LDC fmul v12.2s, v20.2s, alphaV2 fmul v13.2s, v21.2s, alphaV3 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow2, pCRow1, LDC fmul v8.2s, v24.2s, alphaV0 fmul v9.2s, v25.2s, alphaV1 st1 {v8.2s, v9.2s}, [pCRow2] add pCRow1, pCRow2, LDC fmul v12.2s, v28.2s, alphaV2 fmul v13.2s, v29.2s, alphaV3 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm 
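// The remaining macros cover the edge tiles left over when M or N is not a
// multiple of 8: 2x4, 1x4, 8x2, 4x2, 2x2, 1x2, 8x1, 4x1, 2x1 and 1x1.
// Each follows the same INIT / KERNEL*_SUB / SAVE pattern as above, with a
// single non-pipelined multiply-accumulate step per K iteration.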
/******************************************************************************/ .macro INIT2x4 fmov s16, wzr fmov s20, s16 fmov s24, s20 fmov s28, s16 .endm .macro KERNEL2x4_SUB ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 ld1 {v0.2s}, [pA] add pA, pA, #8 fmla v16.2s, v0.2s, v8.s[0] fmla v20.2s, v0.2s, v8.s[1] fmla v24.2s, v0.2s, v9.s[0] fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 fmul v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow1, pCRow0, LDC fmul v12.2s, v20.2s, alphaV1 st1 {v12.2s}, [pCRow1] add pCRow2, pCRow1, LDC fmul v8.2s, v24.2s, alphaV2 st1 {v8.2s}, [pCRow2] add pCRow1, pCRow2, LDC fmul v12.2s, v28.2s, alphaV3 st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT1x4 fmov s16, wzr fmov s20, s16 .endm .macro KERNEL1x4_SUB ldr s0, [pA] add pA, pA, #4 ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 fmla v16.2s, v8.2s, v0.s[0] fmla v20.2s, v9.2s, v0.s[0] .endm .macro SAVE1x4 add pCRow1, pCRow0, LDC fmul v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], [pCRow0] st1 {v8.s}[1], [pCRow1] add pCRow2, pCRow1, LDC add pCRow1, pCRow2, LDC fmul v12.2s, v20.2s, alphaV1 st1 {v12.s}[0], [pCRow2] st1 {v12.s}[1], [pCRow1] add pCRow0, pCRow0, #4 .endm /******************************************************************************/ .macro INIT8x2 fmov s16, wzr fmov s17, s16 fmov s20, s17 fmov s21, s16 .endm .macro KERNEL8x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.4s}, [pA] add pA, pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v20.4s, v0.4s, v8.s[1] fmla v21.4s, v1.4s, v8.s[1] .endm .macro SAVE8x2 add pCRow1, pCRow0, LDC fmul v0.4s, v16.4s, alphaV0 fmul v1.4s, v17.4s, alphaV1 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow2, pCRow1, LDC fmul v4.4s, v20.4s, alphaV0 fmul v5.4s, v21.4s, alphaV1 st1 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT4x2 fmov s16, wzr fmov s17, s16 fmov s20, s17 fmov s21, s16 .endm .macro KERNEL4x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 fmla v16.2s, v0.2s, v8.s[0] fmla v17.2s, v1.2s, v8.s[0] fmla v20.2s, v0.2s, v8.s[1] fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 fmul v8.2s, v16.2s, alphaV0 fmul v9.2s, v17.2s, alphaV1 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow1, pCRow0, LDC fmul v12.2s, v20.2s, alphaV2 fmul v13.2s, v21.2s, alphaV3 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT2x2 fmov s16, wzr fmov s20, s16 .endm .macro KERNEL2x2_SUB ld1 {v8.2s}, [pB] add pB, pB, #8 ld1 {v0.2s}, [pA] add pA, pA, #8 fmla v16.2s, v0.2s, v8.s[0] fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 fmul v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow1 , pCRow0, LDC fmul v12.2s, v20.2s, alphaV1 st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT1x2 fmov s16, wzr .endm .macro KERNEL1x2_SUB ld1 {v8.2s} , [pB] add pB , pB, #8 ldr s0 , [pA] add pA, pA, #4 fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 add pCRow1 , pCRow0, LDC fmul v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], [pCRow0] st1 {v8.s}[1], [pCRow1] add pCRow0, pCRow0, #4 .endm /******************************************************************************/ .macro INIT8x1 fmov s16, wzr fmov s17, wzr .endm .macro KERNEL8x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.4s}, [pA] add pA, 
pA, #16 ld1 {v1.4s}, [pA] add pA, pA, #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] .endm .macro SAVE8x1 fmul v0.4s, v16.4s, alphaV0 fmul v1.4s, v17.4s, alphaV1 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT4x1 fmov s16, wzr fmov s17, s16 .endm .macro KERNEL4x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 fmla v16.2s, v0.2s, v8.s[0] fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 fmul v8.2s, v16.2s, alphaV0 fmul v9.2s, v17.2s, alphaV1 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT2x1 fmov s16, wzr .endm .macro KERNEL2x1_SUB ldr s8, [pB] add pB , pB, #4 ld1 {v0.2s}, [pA] add pA , pA, #8 fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 fmul v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow0, pCRow0, #8 .endm /******************************************************************************/ .macro INIT1x1 fmov s16, wzr .endm .macro KERNEL1x1_SUB ldr s8, [pB] add pB , pB, #4 ldr s0, [pA] add pA , pA, #4 fmadd s16, s0, s8, s16 .endm .macro SAVE1x1 fmul s8, s16, alpha0 str s8, [pCRow0] add pCRow0, pCRow0, #4 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE strmm_kernel_begin: .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] fmov alpha0, s0 fmov alpha1, s0 fmov alpha2, s0 fmov alpha3, s0 lsl LDC, LDC, #2 // ldc = ldc * 4 #if !defined(LEFT) neg tempOffset, offset #endif mov pB, origPB mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 ble strmm_kernel_L4_BEGIN /******************************************************************************/ /******************************************************************************/ strmm_kernel_L8_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #3 #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = start of A array /******************************************************************************/ strmm_kernel_L8_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble strmm_kernel_L8_M4_BEGIN strmm_kernel_L8_M8_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pA, pA, temp add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #8 #else add tempK, tempOffset, #8 #endif asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
blt strmm_kernel_L8_M8_32 KERNEL8x8_I // do one in the K KERNEL8x8_M2 // do another in the K subs counterL, counterL, #2 ble strmm_kernel_L8_M8_22a .align 5 strmm_kernel_L8_M8_22: KERNEL8x8_M1 KERNEL8x8_M2 subs counterL, counterL, #1 bgt strmm_kernel_L8_M8_22 strmm_kernel_L8_M8_22a: KERNEL8x8_M1 KERNEL8x8_E b strmm_kernel_L8_M8_44 strmm_kernel_L8_M8_32: tst counterL, #1 ble strmm_kernel_L8_M8_40 KERNEL8x8_I KERNEL8x8_E b strmm_kernel_L8_M8_44 strmm_kernel_L8_M8_40: INIT8x8 strmm_kernel_L8_M8_44: ands counterL , tempK, #1 ble strmm_kernel_L8_M8_100 strmm_kernel_L8_M8_46: KERNEL8x8_SUB strmm_kernel_L8_M8_100: SAVE8x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #8 #else sub tempK, tempK, #8 #endif lsl temp, tempK, #5 add pA, pA, temp add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #8 #endif strmm_kernel_L8_M8_END: subs counterI, counterI, #1 bne strmm_kernel_L8_M8_20 /******************************************************************************/ strmm_kernel_L8_M4_BEGIN: mov counterI, origM tst counterI , #7 ble strmm_kernel_L8_END tst counterI, #4 ble strmm_kernel_L8_M2_BEGIN strmm_kernel_L8_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pA, pA, temp lsl temp, tempOffset, #5 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #8 #endif asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? blt strmm_kernel_L8_M4_32 KERNEL4x8_I // do one in the K KERNEL4x8_M2 // do another in the K subs counterL, counterL, #2 ble strmm_kernel_L8_M4_22a .align 5 strmm_kernel_L8_M4_22: KERNEL4x8_M1 KERNEL4x8_M2 subs counterL, counterL, #1 bgt strmm_kernel_L8_M4_22 strmm_kernel_L8_M4_22a: KERNEL4x8_M1 KERNEL4x8_E b strmm_kernel_L8_M4_44 strmm_kernel_L8_M4_32: tst counterL, #1 ble strmm_kernel_L8_M4_40 KERNEL4x8_I KERNEL4x8_E b strmm_kernel_L8_M4_44 strmm_kernel_L8_M4_40: INIT4x8 strmm_kernel_L8_M4_44: ands counterL , tempK, #1 ble strmm_kernel_L8_M4_100 strmm_kernel_L8_M4_46: KERNEL4x8_SUB strmm_kernel_L8_M4_100: SAVE4x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #8 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #5 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif strmm_kernel_L8_M4_END: /******************************************************************************/ strmm_kernel_L8_M2_BEGIN: mov counterI, origM tst counterI , #3 ble strmm_kernel_L8_END tst counterI, #2 // counterI = counterI / 2 ble strmm_kernel_L8_M1_BEGIN strmm_kernel_L8_M2_20: INIT2x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pA, pA, temp lsl temp, tempOffset, #5 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #8 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble strmm_kernel_L8_M2_40 strmm_kernel_L8_M2_22: KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB 
KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB subs counterL, counterL, #1 bgt strmm_kernel_L8_M2_22 strmm_kernel_L8_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L8_M2_100 strmm_kernel_L8_M2_42: KERNEL2x8_SUB subs counterL, counterL, #1 bgt strmm_kernel_L8_M2_42 strmm_kernel_L8_M2_100: SAVE2x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #8 #endif lsl temp, tempK, #3 add pA, pA, temp lsl temp, tempK, #5 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif strmm_kernel_L8_M2_END: /******************************************************************************/ strmm_kernel_L8_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble strmm_kernel_L8_END strmm_kernel_L8_M1_20: INIT1x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #2 add pA, pA, temp lsl temp, tempOffset, #5 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #8 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble strmm_kernel_L8_M1_40 strmm_kernel_L8_M1_22: KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB subs counterL, counterL, #1 bgt strmm_kernel_L8_M1_22 strmm_kernel_L8_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L8_M1_100 strmm_kernel_L8_M1_42: KERNEL1x8_SUB subs counterL, counterL, #1 bgt strmm_kernel_L8_M1_42 strmm_kernel_L8_M1_100: SAVE1x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #1 #else sub tempK, tempK, #8 #endif lsl temp, tempK, #2 add pA, pA, temp lsl temp, tempK, #5 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #1 #endif strmm_kernel_L8_END: lsl temp, origK, #5 // B = B + K * 4 * 8 add origPB, origPB, temp #if !defined(LEFT) add tempOffset, tempOffset, #8 #endif subs counterJ, counterJ , #1 // j-- bgt strmm_kernel_L8_BEGIN /******************************************************************************/ /******************************************************************************/ strmm_kernel_L4_BEGIN: mov counterJ , origN tst counterJ , #7 ble strmm_kernel_L999 tst counterJ , #4 ble strmm_kernel_L2_BEGIN mov pCRow0, pC // pCRow0 = pC add pC,pC,LDC, lsl #2 #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = A /******************************************************************************/ strmm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 ble strmm_kernel_L4_M4_BEGIN strmm_kernel_L4_M8_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pA, pA, temp lsl temp, tempOffset, #4 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #8 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
blt strmm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K subs counterL, counterL, #2 ble strmm_kernel_L4_M8_22a .align 5 strmm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 bgt strmm_kernel_L4_M8_22 strmm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E b strmm_kernel_L4_M8_44 strmm_kernel_L4_M8_32: tst counterL, #1 ble strmm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_E b strmm_kernel_L4_M8_44 strmm_kernel_L4_M8_40: INIT8x4 strmm_kernel_L4_M8_44: ands counterL , tempK, #1 ble strmm_kernel_L4_M8_100 strmm_kernel_L4_M8_46: KERNEL8x4_SUB strmm_kernel_L4_M8_100: SAVE8x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #8 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #5 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #8 #endif strmm_kernel_L4_M8_END: subs counterI, counterI, #1 bne strmm_kernel_L4_M8_20 /******************************************************************************/ strmm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 ble strmm_kernel_L4_END tst counterI, #4 ble strmm_kernel_L4_M2_BEGIN strmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? blt strmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 ble strmm_kernel_L4_M4_22a .align 5 strmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 bgt strmm_kernel_L4_M4_22 strmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E b strmm_kernel_L4_M4_44 strmm_kernel_L4_M4_32: tst counterL, #1 ble strmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E b strmm_kernel_L4_M4_44 strmm_kernel_L4_M4_40: INIT4x4 strmm_kernel_L4_M4_44: ands counterL , tempK, #1 ble strmm_kernel_L4_M4_100 strmm_kernel_L4_M4_46: KERNEL4x4_SUB strmm_kernel_L4_M4_100: SAVE4x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #4 add pA, pA, temp add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif strmm_kernel_L4_M4_END: /******************************************************************************/ strmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble strmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble strmm_kernel_L4_M1_BEGIN strmm_kernel_L4_M2_20: INIT2x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pA, pA, temp lsl temp, tempOffset, #4 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble strmm_kernel_L4_M2_40 strmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB 
KERNEL2x4_SUB subs counterL, counterL, #1 bgt strmm_kernel_L4_M2_22 strmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L4_M2_100 strmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 bgt strmm_kernel_L4_M2_42 strmm_kernel_L4_M2_100: SAVE2x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #3 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif strmm_kernel_L4_M2_END: /******************************************************************************/ strmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble strmm_kernel_L4_END strmm_kernel_L4_M1_20: INIT1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp lsl temp, tempOffset, #2 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble strmm_kernel_L4_M1_40 strmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs counterL, counterL, #1 bgt strmm_kernel_L4_M1_22 strmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L4_M1_100 strmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 bgt strmm_kernel_L4_M1_42 strmm_kernel_L4_M1_100: SAVE1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #1 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #2 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #1 #endif strmm_kernel_L4_END: add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 #if !defined(LEFT) add tempOffset, tempOffset, #4 #endif /******************************************************************************/ /******************************************************************************/ strmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble strmm_kernel_L999 tst counterJ , #2 ble strmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC add pC,pC,LDC, lsl #1 #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = A /******************************************************************************/ strmm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI,#0 ble strmm_kernel_L2_M4_BEGIN strmm_kernel_L2_M8_20: INIT8x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pA, pA, temp lsl temp, tempOffset, #3 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #8 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble strmm_kernel_L2_M8_40 .align 5 strmm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB subs 
counterL, counterL, #1 bgt strmm_kernel_L2_M8_22 strmm_kernel_L2_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L2_M8_100 strmm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M8_42 strmm_kernel_L2_M8_100: SAVE8x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #8 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #5 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #8 #endif strmm_kernel_L2_M8_END: subs counterI, counterI, #1 bgt strmm_kernel_L2_M8_20 /******************************************************************************/ strmm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 ble strmm_kernel_L2_END tst counterI, #4 ble strmm_kernel_L2_M2_BEGIN strmm_kernel_L2_M4_20: INIT4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #4 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble strmm_kernel_L2_M4_40 .align 5 strmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M4_22 strmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L2_M4_100 strmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M4_42 strmm_kernel_L2_M4_100: SAVE4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif strmm_kernel_L2_M4_END: /******************************************************************************/ strmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble strmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble strmm_kernel_L2_M1_BEGIN strmm_kernel_L2_M2_20: INIT2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #3 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble strmm_kernel_L2_M2_40 strmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M2_22 strmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L2_M2_100 strmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M2_42 strmm_kernel_L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #2 #endif lsl temp, tempK, 
#3 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif strmm_kernel_L2_M2_END: /******************************************************************************/ strmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble strmm_kernel_L2_END strmm_kernel_L2_M1_20: INIT1x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #3 add pB, pB, temp lsl temp, tempOffset, #2 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 ble strmm_kernel_L2_M1_40 strmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M1_22 strmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L2_M1_100 strmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt strmm_kernel_L2_M1_42 strmm_kernel_L2_M1_100: SAVE1x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #1 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #2 add pA, pA, temp lsl temp, tempK, #3 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #1 #endif strmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 /******************************************************************************/ /******************************************************************************/ strmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble strmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = A /******************************************************************************/ strmm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 cmp counterI, #0 ble strmm_kernel_L1_M4_BEGIN strmm_kernel_L1_M8_20: INIT8x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pA, pA, temp lsl temp, tempOffset, #2 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #8 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble strmm_kernel_L1_M8_40 .align 5 strmm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M8_22 strmm_kernel_L1_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L1_M8_100 strmm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M8_42 strmm_kernel_L1_M8_100: SAVE8x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #8 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #5 add pA, pA, temp lsl temp, tempK, #2 add pB, pB, temp #endif #if defined(LEFT) add 
tempOffset, tempOffset, #8 #endif strmm_kernel_L1_M8_END: subs counterI, counterI, #1 bgt strmm_kernel_L1_M8_20 /******************************************************************************/ strmm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 ble strmm_kernel_L1_END tst counterI, #4 ble strmm_kernel_L1_M2_BEGIN strmm_kernel_L1_M4_20: INIT4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #2 add pB, pB, temp lsl temp, tempOffset, #4 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble strmm_kernel_L1_M4_40 .align 5 strmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M4_22 strmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L1_M4_100 strmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M4_42 strmm_kernel_L1_M4_100: SAVE4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #2 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif strmm_kernel_L1_M4_END: /******************************************************************************/ strmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 ble strmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 ble strmm_kernel_L1_M1_BEGIN strmm_kernel_L1_M2_20: INIT2x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #2 add pB, pB, temp lsl temp, tempOffset, #3 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble strmm_kernel_L1_M2_40 strmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M2_22 strmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L1_M2_100 strmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M2_42 strmm_kernel_L1_M2_100: SAVE2x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #3 add pA, pA, temp lsl temp, tempK, #2 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif strmm_kernel_L1_M2_END: /******************************************************************************/ strmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble strmm_kernel_L1_END strmm_kernel_L1_M1_20: INIT1x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #2 add pB, pB, temp lsl temp, tempOffset, #2 add pA, pA, temp 
#endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble strmm_kernel_L1_M1_40 strmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M1_22 strmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble strmm_kernel_L1_M1_100 strmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt strmm_kernel_L1_M1_42 strmm_kernel_L1_M1_100: SAVE1x1 strmm_kernel_L1_END: /******************************************************************************/ strmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/swap.S000066400000000000000000000127111313527062700164550ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #define ASSEMBLER #include "common.h" #define N x0 /* vector length */ #define X x3 /* X vector address */ #define INC_X x4 /* X stride */ #define Y x5 /* Y vector address */ #define INC_Y x6 /* Y stride */ #define I x1 /* loop variable */ /******************************************************************************* * Macro definitions *******************************************************************************/ #if !defined(DOUBLE) #define TMP0 s0 #define TMPV0 {v0.s}[0] #define TMP1 s1 #define TMPV1 {v1.s}[0] #define SZ 4 #else #define TMP0 d0 #define TMPV0 {v0.d}[0] #define TMP1 d1 #define TMPV1 {v1.d}[0] #define SZ 8 #endif /******************************************************************************/ .macro KERNEL_F1 #if !defined(COMPLEX) ldr TMP0, [X] ldr TMP1, [Y] str TMP0, [Y], #SZ str TMP1, [X], #SZ #else #if !defined(DOUBLE) ld1 {v0.2s}, [X] ld1 {v1.2s}, [Y] st1 {v0.2s}, [Y], #8 st1 {v1.2s}, [X], #8 #else ld1 {v0.2d}, [X] ld1 {v1.2d}, [Y] st1 {v0.2d}, [Y], #16 st1 {v1.2d}, [X], #16 #endif #endif .endm .macro KERNEL_F8 #if !defined(COMPLEX) #if !defined(DOUBLE) ld1 {v0.4s, v1.4s}, [X] ld1 {v2.4s, v3.4s}, [Y] st1 {v0.4s, v1.4s}, [Y], #32 st1 {v2.4s, v3.4s}, [X], #32 #else // DOUBLE ld1 {v0.4s, v1.4s}, [X] ld1 {v2.4s, v3.4s}, [Y] st1 {v0.4s, v1.4s}, [Y], #32 st1 {v2.4s, v3.4s}, [X], #32 ld1 {v0.4s, v1.4s}, [X] ld1 {v2.4s, v3.4s}, [Y] st1 {v0.4s, v1.4s}, [Y], #32 st1 {v2.4s, v3.4s}, [X], #32 #endif #else // COMPLEX #if !defined(DOUBLE) ld1 {v0.4s, v1.4s}, [X] ld1 {v2.4s, v3.4s}, [Y] st1 {v0.4s, v1.4s}, [Y], #32 st1 {v2.4s, v3.4s}, [X], #32 ld1 {v0.4s, v1.4s}, [X] ld1 {v2.4s, v3.4s}, [Y] st1 {v0.4s, v1.4s}, [Y], #32 st1 {v2.4s, v3.4s}, [X], #32 #else // DOUBLE ld1 {v0.4s, v1.4s}, [X] ld1 {v2.4s, v3.4s}, [Y] st1 {v0.4s, v1.4s}, [Y], #32 st1 {v2.4s, v3.4s}, [X], #32 ld1 {v0.4s, v1.4s}, [X] ld1 {v2.4s, v3.4s}, [Y] st1 {v0.4s, v1.4s}, [Y], #32 st1 {v2.4s, v3.4s}, [X], #32 ld1 {v0.4s, v1.4s}, [X] ld1 {v2.4s, v3.4s}, [Y] st1 {v0.4s, v1.4s}, [Y], #32 st1 {v2.4s, v3.4s}, [X], #32 ld1 {v0.4s, v1.4s}, [X] ld1 {v2.4s, v3.4s}, [Y] st1 {v0.4s, v1.4s}, [Y], #32 st1 {v2.4s, v3.4s}, [X], #32 #endif #endif .endm .macro INIT_S #if !defined(COMPLEX) #if !defined(DOUBLE) lsl INC_X, INC_X, #2 lsl INC_Y, INC_Y, #2 #else lsl INC_X, INC_X, #3 lsl INC_Y, INC_Y, #3 #endif #else #if !defined(DOUBLE) lsl INC_X, INC_X, #3 lsl INC_Y, INC_Y, #3 #else lsl INC_X, INC_X, #4 lsl INC_Y, INC_Y, #4 #endif #endif .endm .macro KERNEL_S1 #if !defined(COMPLEX) #if !defined(DOUBLE) ldr w10, [X] ldr w11, [Y] str w10, [Y] str w11, [X] #else ldr x10, [X] ldr x11, [Y] str x10, [Y] str x11, [X] #endif #else #if !defined(DOUBLE) ldr x10, [X] ldr x11, [Y] str x10, [Y] str x11, [X] #else ldr x10, [X] ldr x11, [Y] str x10, [Y] str x11, [X] ldr x12, [X, #8] ldr x13, [Y, #8] str x12, [Y, #8] str x13, [X, #8] #endif #endif add Y, Y, INC_Y add X, X, INC_X .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE cmp N, xzr ble swap_kernel_L999 cmp INC_X, #1 bne swap_kernel_S_BEGIN cmp INC_Y, #1 bne swap_kernel_S_BEGIN swap_kernel_F_BEGIN: asr I, N, #3 cmp I, xzr beq swap_kernel_F1 swap_kernel_F8: KERNEL_F8 subs I, I, #1 bne swap_kernel_F8 swap_kernel_F1: ands I, N, #7 ble swap_kernel_L999 swap_kernel_F10: KERNEL_F1 subs I, I, #1 bne swap_kernel_F10 b swap_kernel_L999 swap_kernel_S_BEGIN: INIT_S asr I, N, #2 
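// Strided path: INIT_S scaled INC_X/INC_Y from element counts to byte
// strides, and I = N >> 2 selects how many 4-element KERNEL_S1 groups run
// before the per-element remainder loop below.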
cmp I, xzr ble swap_kernel_S1 swap_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne swap_kernel_S4 swap_kernel_S1: ands I, N, #3 ble swap_kernel_L999 swap_kernel_S10: KERNEL_S1 subs I, I, #1 bne swap_kernel_S10 swap_kernel_L999: mov w0, wzr ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/swap_thunderx2t99.S000066400000000000000000000077161313527062700210370ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #define ASSEMBLER #include "common.h" #define N x0 /* vector length */ #define X x3 /* X vector address */ #define INC_X x4 /* X stride */ #define Y x5 /* Y vector address */ #define INC_Y x6 /* Y stride */ #define I x1 /* loop variable */ /******************************************************************************* * Macro definitions *******************************************************************************/ #if !defined(COMPLEX) #if !defined(DOUBLE) #define TMPF0 s0 #define TMPF1 s1 #define INC_SHIFT 2 #define N_DIV_SHIFT 2 #define N_REM_MASK 3 #else #define TMPF0 d0 #define TMPF1 d1 #define INC_SHIFT 3 #define N_DIV_SHIFT 1 #define N_REM_MASK 1 #endif #else #if !defined(DOUBLE) #define TMPF0 d0 #define TMPF1 d1 #define INC_SHIFT 3 #define N_DIV_SHIFT 1 #define N_REM_MASK 1 #else #define TMPF0 q0 #define TMPF1 q1 #define INC_SHIFT 4 #define N_DIV_SHIFT 0 #define N_REM_MASK 0 #endif #endif .macro KERNEL_F1 ldr TMPF0, [X] ldr TMPF1, [Y] str TMPF0, [Y] str TMPF1, [X] add X, X, INC_X add Y, Y, INC_Y .endm .macro KERNEL_F ldr q0, [X] ldr q1, [Y] add X, X, #16 add Y, Y, #16 prfm PLDL1STRM, [X, #1024] prfm PLDL1STRM, [Y, #1024] str q0, [Y, #-16] str q1, [X, #-16] .endm .macro INIT lsl INC_X, INC_X, #INC_SHIFT lsl INC_Y, INC_Y, #INC_SHIFT .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE cmp N, xzr ble .Lswap_kernel_L999 cmp INC_X, #1 bne .Lswap_kernel_S_BEGIN cmp INC_Y, #1 bne .Lswap_kernel_S_BEGIN .Lswap_kernel_F_BEGIN: INIT asr I, N, #N_DIV_SHIFT cmp I, xzr beq .Lswap_kernel_F1 .align 5 .Lswap_kernel_F: KERNEL_F subs I, I, #1 bne .Lswap_kernel_F .Lswap_kernel_F1: #if defined(DOUBLE) && defined(COMPLEX) b .Lswap_kernel_L999 #else ands I, N, #N_REM_MASK ble .Lswap_kernel_L999 #endif .Lswap_kernel_F10: KERNEL_F1 subs I, I, #1 bne .Lswap_kernel_F10 b .Lswap_kernel_L999 .Lswap_kernel_S_BEGIN: INIT asr I, N, #2 cmp I, xzr ble .Lswap_kernel_S1 .Lswap_kernel_S4: KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 subs I, I, #1 bne .Lswap_kernel_S4 .Lswap_kernel_S1: ands I, N, #3 ble .Lswap_kernel_L999 .Lswap_kernel_S10: KERNEL_F1 subs I, I, #1 bne .Lswap_kernel_S10 .Lswap_kernel_L999: mov w0, wzr ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/zamax.S000066400000000000000000000131311313527062700166200ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" #define N x0 /* vector length */ #define X x1 /* X vector address */ #define INC_X x2 /* X stride */ #define I x5 /* loop variable */ /******************************************************************************* * Macro definitions *******************************************************************************/ #if defined(USE_MIN) #define COND le #else #define COND ge #endif #if !defined(DOUBLE) #define REG0 wzr #define MAXF s0 #define TMPF s1 #define TMPVF {v1.s}[0] #define SZ 4 #else #define REG0 xzr #define MAXF d0 #define TMPF d1 #define TMPVF {v1.d}[0] #define SZ 8 #endif /******************************************************************************/ .macro INIT_F1 #if !defined(DOUBLE) ld1 {v0.2s}, [X], #8 fabs v0.2s, v0.2s ext v1.8b, v0.8b, v0.8b, #4 fadd MAXF, s0, s1 #else ld1 {v0.2d}, [X], #16 fabs v0.2d, v0.2d faddp MAXF, v0.2d #endif .endm .macro KERNEL_F1 #if !defined(DOUBLE) ld1 {v1.2s}, [X], #8 fabs v1.2s, v1.2s ext v2.8b, v1.8b, v1.8b, #4 fadd TMPF, s1, s2 #else ld1 {v1.2d}, [X], #16 fabs v1.2d, v1.2d faddp TMPF, v1.2d #endif fcmp MAXF, TMPF fcsel MAXF, MAXF, TMPF, COND .endm .macro INIT_F4 #if !defined(DOUBLE) ld2 {v0.4s,v1.4s}, [X], #32 fabs v0.4s, v0.4s // [X6, X4, X2, X0] fabs v1.4s, v1.4s // [X7, X5, X3, X1] fadd v0.4s, v0.4s, v1.4s // [X7+X6, X5+X4, X3+X2, X1+X0] #if defined(USE_MIN) fminv MAXF, v0.4s #else fmaxv MAXF, v0.4s #endif #else // DOUBLE ld4 {v0.2d,v1.2d,v2.2d,v3.2d}, [X], #64 fabs v0.2d, v0.2d fabs v1.2d, v1.2d fabs v2.2d, v2.2d fabs v3.2d, v3.2d fadd v0.2d, v0.2d, v1.2d fadd v2.2d, v2.2d, v3.2d #if defined(USE_MIN) fmin v0.2d, v0.2d, v2.2d fminp MAXF, v0.2d #else fmax v0.2d, v0.2d, v2.2d fmaxp MAXF, v0.2d #endif #endif .endm .macro KERNEL_F4 #if !defined(DOUBLE) ld2 {v1.4s,v2.4s}, [X], #32 fabs v1.4s, v1.4s // [X6, X4, X2, X0] fabs v2.4s, v2.4s // [X7, X5, X3, X1] fadd v1.4s, v1.4s, v2.4s // [X7+X6, X5+X4, X3+X2, X1+X0] #if defined(USE_MIN) fminv TMPF, v1.4s #else fmaxv TMPF, v1.4s #endif #else // DOUBLE ld4 {v1.2d,v2.2d,v3.2d,v4.2d}, [X], #64 fabs v1.2d, v1.2d fabs v2.2d, v2.2d fabs v3.2d, v3.2d fabs v4.2d, v4.2d fadd v1.2d, v1.2d, v2.2d fadd v3.2d, v3.2d, v4.2d #if defined(USE_MIN) fmin v1.2d, v1.2d, v3.2d fminp MAXF, v1.2d #else fmax v1.2d, v1.2d, v3.2d fmaxp MAXF, v1.2d #endif #endif fcmp MAXF, TMPF fcsel MAXF, MAXF, TMPF, COND .endm .macro INIT_S #if !defined(DOUBLE) lsl INC_X, INC_X, #3 ld1 {v0.2s}, [X], INC_X fabs v0.2s, v0.2s ext v1.8b, v0.8b, v0.8b, #4 fadd MAXF, s0, s1 #else lsl INC_X, INC_X, #4 ld1 {v0.2d}, [X], INC_X fabs v0.2d, v0.2d faddp MAXF, v0.2d #endif .endm .macro KERNEL_S1 #if !defined(DOUBLE) ld1 {v1.2s}, [X], INC_X fabs v1.2s, v1.2s ext v2.8b, v1.8b, v1.8b, #4 fadd TMPF, s1, s2 #else ld1 {v1.2d}, [X], INC_X fabs v1.2d, v1.2d faddp TMPF, v1.2d #endif fcmp MAXF, TMPF fcsel MAXF, MAXF, TMPF, COND .endm /******************************************************************************* * End of macro definitions 
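*
* Note: for complex amax/amin each element's magnitude is taken as
* |real| + |imag|; MAXF holds the running extreme, the per-element kernels
* compute the candidate into TMPF, and fcsel with COND (ge for max, le for
* min) keeps the winner.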
*******************************************************************************/ PROLOGUE cmp N, xzr ble amax_kernel_zero cmp INC_X, xzr ble amax_kernel_zero cmp INC_X, #1 bne amax_kernel_S_BEGIN amax_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr beq amax_kernel_F1_INIT INIT_F4 subs I, I, #1 beq amax_kernel_F1 amax_kernel_F4: KERNEL_F4 subs I, I, #1 bne amax_kernel_F4 amax_kernel_F1: ands I, N, #3 ble amax_kernel_L999 amax_kernel_F10: KERNEL_F1 subs I, I, #1 bne amax_kernel_F10 ret amax_kernel_F1_INIT: INIT_F1 subs N, N, #1 b amax_kernel_F1 amax_kernel_S_BEGIN: INIT_S subs N, N, #1 ble amax_kernel_L999 asr I, N, #2 cmp I, xzr ble amax_kernel_S1 amax_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne amax_kernel_S4 amax_kernel_S1: ands I, N, #3 ble amax_kernel_L999 amax_kernel_S10: KERNEL_S1 subs I, I, #1 bne amax_kernel_S10 amax_kernel_L999: ret amax_kernel_zero: fmov MAXF, REG0 ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/zasum.S000066400000000000000000000072371313527062700166510ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #define ASSEMBLER #include "common.h" #define N x0 /* vector length */ #define X x1 /* X vector address */ #define INC_X x2 /* X stride */ #define I x5 /* loop variable */ /******************************************************************************* * Macro definitions *******************************************************************************/ #define REG0 xzr #define SUMF d0 #define TMPF d1 #define TMPVF {v1.d}[0] #define SZ 8 /******************************************************************************/ .macro KERNEL_F1 ld1 {v1.2d}, [X], #16 fabs v1.2d, v1.2d faddp TMPF, v1.2d fadd SUMF, SUMF, TMPF .endm .macro KERNEL_F4 ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 fabs v1.2d, v1.2d fabs v2.2d, v2.2d fabs v3.2d, v3.2d fabs v4.2d, v4.2d fadd v1.2d, v1.2d, v2.2d fadd v3.2d, v3.2d, v4.2d fadd v0.2d, v0.2d, v1.2d fadd v0.2d, v0.2d, v3.2d PRFM PLDL1KEEP, [X, #1024] .endm .macro KERNEL_F4_FINALIZE faddp SUMF, v0.2d .endm .macro INIT_S lsl INC_X, INC_X, #4 .endm .macro KERNEL_S1 ld1 {v1.2d}, [X], INC_X fabs v1.2d, v1.2d faddp TMPF, v1.2d fadd SUMF, SUMF, TMPF .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE fmov SUMF, REG0 cmp N, xzr ble asum_kernel_L999 cmp INC_X, xzr ble asum_kernel_L999 cmp INC_X, #1 bne asum_kernel_S_BEGIN asum_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr beq asum_kernel_F1 asum_kernel_F4: KERNEL_F4 subs I, I, #1 bne asum_kernel_F4 KERNEL_F4_FINALIZE asum_kernel_F1: ands I, N, #3 ble asum_kernel_L999 asum_kernel_F10: KERNEL_F1 subs I, I, #1 bne asum_kernel_F10 asum_kernel_L999: ret asum_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr ble asum_kernel_S1 asum_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne asum_kernel_S4 asum_kernel_S1: ands I, N, #3 ble asum_kernel_L999 asum_kernel_S10: KERNEL_S1 subs I, I, #1 bne asum_kernel_S10 ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/zasum_thunderx2t99.c000066400000000000000000000164221313527062700212360ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include #define N "x0" /* vector length */ #define X "x1" /* "X" vector address */ #define INC_X "x2" /* "X" stride */ #define J "x5" /* loop variable */ #define REG0 "xzr" #define SUMF "d0" #define TMPF "d1" /******************************************************************************/ #define KERNEL_F1 \ "ldr q1, ["X"] \n" \ "add "X", "X", #16 \n" \ "fabs v1.2d, v1.2d \n" \ "faddp d1, v1.2d \n" \ "fadd "SUMF", "SUMF", d1 \n" #define KERNEL_F16 \ "ldr q16, ["X"] \n" \ "ldr q17, ["X", #16] \n" \ "ldr q18, ["X", #32] \n" \ "ldr q19, ["X", #48] \n" \ "ldp q20, q21, ["X", #64] \n" \ "ldp q22, q23, ["X", #96] \n" \ "fabs v16.2d, v16.2d \n" \ "fabs v17.2d, v17.2d \n" \ "fabs v18.2d, v18.2d \n" \ "fabs v19.2d, v19.2d \n" \ "ldp q24, q25, ["X", #128] \n" \ "ldp q26, q27, ["X", #160] \n" \ "fabs v20.2d, v20.2d \n" \ "fabs v21.2d, v21.2d \n" \ "fabs v22.2d, v22.2d \n" \ "fabs v23.2d, v23.2d \n" \ "fadd v16.2d, v16.2d, v17.2d \n" \ "fadd v18.2d, v18.2d, v19.2d \n" \ "ldp q28, q29, ["X", #192] \n" \ "ldp q30, q31, ["X", #224] \n" \ "fabs v24.2d, v24.2d \n" \ "fabs v25.2d, v25.2d \n" \ "fabs v26.2d, v26.2d \n" \ "fabs v27.2d, v27.2d \n" \ "add "X", "X", #256 \n" \ "fadd v20.2d, v20.2d, v21.2d \n" \ "fadd v22.2d, v22.2d, v23.2d \n" \ "fabs v28.2d, v28.2d \n" \ "fabs v29.2d, v29.2d \n" \ "fabs v30.2d, v30.2d \n" \ "fabs v31.2d, v31.2d \n" \ "PRFM PLDL1KEEP, ["X", #1024] \n" \ "PRFM PLDL1KEEP, ["X", #1024+64] \n" \ "fadd v24.2d, v24.2d, v25.2d \n" \ "fadd v26.2d, v26.2d, v27.2d \n" \ "fadd v28.2d, v28.2d, v29.2d \n" \ "fadd v30.2d, v30.2d, v31.2d \n" \ "fadd v0.2d, v0.2d, v16.2d \n" \ "fadd v1.2d, v1.2d, v18.2d \n" \ "fadd v2.2d, v2.2d, v20.2d \n" \ "fadd v3.2d, v3.2d, v22.2d \n" \ "PRFM PLDL1KEEP, ["X", #1024+128] \n" \ "PRFM PLDL1KEEP, ["X", #1024+192] \n" \ "fadd v4.2d, v4.2d, v24.2d \n" \ "fadd v5.2d, v5.2d, v26.2d \n" \ "fadd v6.2d, v6.2d, v28.2d \n" \ "fadd v7.2d, v7.2d, v30.2d \n" #define KERNEL_F16_FINALIZE \ "fadd v0.2d, v0.2d, v1.2d \n" \ "fadd v2.2d, v2.2d, v3.2d \n" \ "fadd v4.2d, v4.2d, v5.2d \n" \ "fadd v6.2d, v6.2d, v7.2d \n" \ "fadd v0.2d, v0.2d, v2.2d \n" \ "fadd v4.2d, v4.2d, v6.2d \n" \ "fadd v0.2d, v0.2d, v4.2d \n" \ "faddp "SUMF", v0.2d \n" #define INIT_S \ "lsl "INC_X", "INC_X", #4 \n" #define KERNEL_S1 \ "ldr q1, ["X"] \n" \ "add "X", "X", "INC_X" \n" \ "fabs v1.2d, v1.2d \n" \ "faddp d1, v1.2d \n" \ "fadd "SUMF", "SUMF", d1 \n" #if defined(SMP) extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); #endif static FLOAT zasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT asum = 0.0 ; if ( n < 0 ) return(asum); __asm__ __volatile__ ( " mov "N", %[N_] \n" " mov "X", %[X_] \n" " mov "INC_X", %[INCX_] \n" " fmov "SUMF", "REG0" \n" " fmov d1, "REG0" \n" " fmov d2, "REG0" \n" " fmov d3, "REG0" \n" " fmov d4, "REG0" \n" " 
fmov d5, "REG0" \n" " fmov d6, "REG0" \n" " fmov d7, "REG0" \n" " cmp "N", xzr \n" " ble .Lasum_kernel_L999 \n" " cmp "INC_X", xzr \n" " ble .Lasum_kernel_L999 \n" " cmp "INC_X", #1 \n" " bne .Lasum_kernel_S_BEGIN \n" ".Lasum_kernel_F_BEGIN: \n" " asr "J", "N", #4 \n" " cmp "J", xzr \n" " beq .Lasum_kernel_F1 \n" ".align 5 \n" ".Lasum_kernel_F16: \n" " "KERNEL_F16" \n" " subs "J", "J", #1 \n" " bne .Lasum_kernel_F16 \n" " "KERNEL_F16_FINALIZE" \n" ".Lasum_kernel_F1: \n" " ands "J", "N", #15 \n" " ble .Lasum_kernel_L999 \n" ".Lasum_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" " bne .Lasum_kernel_F10 \n" " b .Lasum_kernel_L999 \n" ".Lasum_kernel_S_BEGIN: \n" " "INIT_S" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" " ble .Lasum_kernel_S1 \n" ".Lasum_kernel_S4: \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" " bne .Lasum_kernel_S4 \n" ".Lasum_kernel_S1: \n" " ands "J", "N", #3 \n" " ble .Lasum_kernel_L999 \n" ".Lasum_kernel_S10: \n" " "KERNEL_S1" \n" " subs "J", "J", #1 \n" " bne .Lasum_kernel_S10 \n" ".Lasum_kernel_L999: \n" " fmov %[ASUM_], "SUMF" \n" : [ASUM_] "=r" (asum) //%0 : [N_] "r" (n), //%1 [X_] "r" (x), //%2 [INCX_] "r" (inc_x) //%3 : "cc", "memory", "x0", "x1", "x2", "x3", "x4", "x5", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" ); return asum; } #if defined(SMP) static int zasum_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) { *result = zasum_compute(n, x, inc_x); return 0; } #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { #if defined(SMP) int nthreads; FLOAT dummy_alpha; #endif FLOAT asum = 0.0; #if defined(SMP) nthreads = num_cpu_avail(1); if (inc_x == 0) nthreads = 1; if (n <= 10000) nthreads = 1; if (nthreads == 1) { asum = zasum_compute(n, x, inc_x); } else { int mode, i; char result[MAX_CPU_NUMBER * sizeof(double) * 2]; FLOAT *ptr; mode = BLAS_DOUBLE | BLAS_COMPLEX; blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, ( void *)zasum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { asum = asum + (*ptr); ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2); } } #else asum = zasum_compute(n, x, inc_x); #endif return asum; } OpenBLAS-0.2.20/kernel/arm64/zaxpy.S000066400000000000000000000165061313527062700166640ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" #define N x0 /* vector length */ #define X x3 /* X vector address */ #define INC_X x4 /* X stride */ #define Y x5 /* Y vector address */ #define INC_Y x6 /* Y stride */ #define I x1 /* loop variable */ #define Y_COPY x7 /* loop variable */ /******************************************************************************* * Macro definitions *******************************************************************************/ #if !defined(DOUBLE) #define DA_R s0 /* scale input value */ #define DA_I s1 /* scale input value */ #define SZ 4 #else #define DA_R d0 /* scale input value */ #define DA_I d1 /* scale input value */ #define SZ 8 #endif /******************************************************************************/ .macro INIT #if !defined(CONJ) #if !defined(DOUBLE) ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R eor v2.16b, v2.16b, v2.16b fsub s2, s2, DA_I ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I #else ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R eor v2.16b, v2.16b, v2.16b fsub d2, d2, DA_I ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I #endif #else #if !defined(DOUBLE) eor v2.16b, v2.16b, v2.16b fsub s2, s2, DA_R ins v0.s[1], v2.s[0] // v0 = -DA_R, DA_R ins v1.s[1], v1.s[0] // v1 = DA_I, DA_I #else eor v2.16b, v2.16b, v2.16b fsub d2, d2, DA_R ins v0.d[1], v2.d[0] // v0 = -DA_R, DA_R ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I #endif #endif .endm .macro KERNEL_F1 #if !defined(DOUBLE) ld1 {v2.2s}, [X], #8 // V2 = X[ix+1], X[ix]; X += 2 ld1 {v3.2s}, [Y] // V3 = Y[iy+1], Y[iy] ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1] fmla v3.2s, v0.2s, v2.2s // Y[iy] += DA_R * X[ix] // Y[iy+1] += +-DA_R * X[ix+1] fmla v3.2s, v1.2s, v4.2s // Y[iy] += +-DA_I * X[ix+1] // Y[iy+1] += DA_I * X[ix] st1 {v3.2s}, [Y], #8 #else ld1 {v2.2d}, [X], #16 // V2 = X[ix+1], X[ix]; X += 2 ld1 {v3.2d}, [Y] // V3 = Y[iy+1], Y[iy] ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1] fmla v3.2d, v0.2d, v2.2d // Y[iy] += DA_R * X[ix] // Y[iy+1] += +-DA_R * X[ix+1] fmla v3.2d, v1.2d, v4.2d // Y[iy] += +-DA_I * X[ix+1] // Y[iy+1] += DA_I * X[ix] st1 {v3.2d}, [Y], #16 #endif .endm .macro KERNEL_INIT_F4 #if !defined(DOUBLE) ins v16.s[0], v0.s[0] ins v16.s[1], v16.s[0] ins v16.d[1], v16.d[0] #if !defined(CONJ) ins v17.s[0], v1.s[1] #else ins v17.s[0], v1.s[0] #endif ins v17.s[1], v17.s[0] ins v17.d[1], v17.d[0] #else //DOUBLE ins v16.d[0], v0.d[0] ins v16.d[1], v16.d[0] #if !defined(CONJ) ins v17.d[0], v1.d[1] #else ins v17.d[0], v1.d[0] #endif ins v17.d[1], v17.d[0] #endif .endm .macro KERNEL_F4 #if !defined(DOUBLE) ld2 {v2.4s, v3.4s}, [X], #32 ld2 {v4.4s, v5.4s}, [Y_COPY], #32 fmla v4.4s, v2.4s, v16.4s #if !defined(CONJ) fmls v4.4s, v3.4s, v17.4s #else fmla v4.4s, v3.4s, v17.4s #endif fmla v5.4s, v2.4s, v17.4s #if !defined(CONJ) fmla v5.4s, v3.4s, v16.4s #else fmls v5.4s, v3.4s, v16.4s #endif st2 {v4.4s, v5.4s}, [Y], #32 #else 
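// Double-precision branch: the 4 complex elements are processed as two
// ld2/st2 groups of 2; ld2 de-interleaves real parts (v2, v18) from
// imaginary parts (v3, v19) so the same fmla/fmls alpha pattern applies.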
// DOUBLE ld2 {v2.2d, v3.2d}, [X], #32 ld2 {v4.2d, v5.2d}, [Y_COPY], #32 fmla v4.2d, v2.2d, v16.2d #if !defined(CONJ) fmls v4.2d, v3.2d, v17.2d #else fmla v4.2d, v3.2d, v17.2d #endif fmla v5.2d, v2.2d, v17.2d #if !defined(CONJ) fmla v5.2d, v3.2d, v16.2d #else fmls v5.2d, v3.2d, v16.2d #endif st2 {v4.2d, v5.2d}, [Y], #32 ld2 {v18.2d, v19.2d}, [X], #32 ld2 {v20.2d, v21.2d}, [Y_COPY], #32 fmla v20.2d, v18.2d, v16.2d #if !defined(CONJ) fmls v20.2d, v19.2d, v17.2d #else fmla v20.2d, v19.2d, v17.2d #endif fmla v21.2d, v18.2d, v17.2d #if !defined(CONJ) fmla v21.2d, v19.2d, v16.2d #else fmls v21.2d, v19.2d, v16.2d #endif st2 {v20.2d, v21.2d}, [Y], #32 #endif PRFM PLDL1KEEP, [X, #512] PRFM PLDL1KEEP, [Y, #512] .endm .macro INIT_S #if !defined(DOUBLE) lsl INC_X, INC_X, #3 lsl INC_Y, INC_Y, #3 #else lsl INC_X, INC_X, #4 lsl INC_Y, INC_Y, #4 #endif .endm .macro KERNEL_S1 #if !defined(DOUBLE) ld1 {v2.2s}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2 ld1 {v3.2s}, [Y] // V3 = Y[iy+1], Y[iy] ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1] fmla v3.2s, v0.2s, v2.2s // Y[iy] += DA_R * X[ix] // Y[iy+1] += +-DA_R * X[ix+1] fmla v3.2s, v1.2s, v4.2s // Y[iy] += +-DA_I * X[ix+1] // Y[iy+1] += DA_I * X[ix] st1 {v3.2s}, [Y], INC_Y #else ld1 {v2.2d}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2 ld1 {v3.2d}, [Y] // V3 = Y[iy+1], Y[iy] ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1] fmla v3.2d, v0.2d, v2.2d // Y[iy] += DA_R * X[ix] // Y[iy+1] += +-DA_R * X[ix+1] fmla v3.2d, v1.2d, v4.2d // Y[iy] += +-DA_I * X[ix+1] // Y[iy+1] += DA_I * X[ix] st1 {v3.2d}, [Y], INC_Y #endif .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE cmp N, xzr ble zaxpy_kernel_L999 mov Y_COPY, Y fcmp DA_R, #0.0 bne .L1 fcmp DA_I, #0.0 beq zaxpy_kernel_L999 .L1: INIT cmp INC_X, #1 bne zaxpy_kernel_S_BEGIN cmp INC_Y, #1 bne zaxpy_kernel_S_BEGIN zaxpy_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr beq zaxpy_kernel_F1 KERNEL_INIT_F4 zaxpy_kernel_F4: KERNEL_F4 subs I, I, #1 bne zaxpy_kernel_F4 zaxpy_kernel_F1: ands I, N, #3 ble zaxpy_kernel_L999 zaxpy_kernel_F10: KERNEL_F1 subs I, I, #1 bne zaxpy_kernel_F10 mov w0, wzr ret zaxpy_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr ble zaxpy_kernel_S1 zaxpy_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne zaxpy_kernel_S4 zaxpy_kernel_S1: ands I, N, #3 ble zaxpy_kernel_L999 zaxpy_kernel_S10: KERNEL_S1 subs I, I, #1 bne zaxpy_kernel_S10 zaxpy_kernel_L999: mov w0, wzr ret OpenBLAS-0.2.20/kernel/arm64/zdot.S000066400000000000000000000201671313527062700164670ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" #define N x0 /* vector length */ #define X x1 /* X vector address */ #define INC_X x2 /* X stride */ #define Y x3 /* Y vector address */ #define INC_Y x4 /* Y stride */ #define I x5 /* loop variable */ /******************************************************************************* * Macro definitions *******************************************************************************/ #if !defined(DOUBLE) #if !defined(DSDOT) #define REG0 wzr #define DOTF s0 #else // DSDOT #define REG0 xzr #define DOTF d0 #endif #define DOTI s1 #define TMPX s2 #define LD1VX {v2.s}[0] #define TMPY s3 #define LD1VY {v3.s}[0] #define TMPVY v3.s[0] #define SZ 4 #else #define REG0 xzr #define DOTF d0 #define DOTI d1 #define TMPX d2 #define LD1VX {v2.d}[0] #define TMPY d3 #define LD1VY {v3.d}[0] #define TMPVY v3.d[0] #define SZ 8 #endif /******************************************************************************/ .macro KERNEL_F1 #if !defined(DOUBLE) ld1 {v2.2s}, [X], #8 // V2 = X[ix+1], X[ix]; X += 2 ld1 {v3.2s}, [Y], #8 // V3 = Y[iy+1], Y[iy]; Y += 2 ins v4.s[0], v2.s[1] // V4 = X[ix+1] #if !defined(CONJ) fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy] fmls DOTF, s4, v3.s[1] // dot[0] -= X[ix+1] * Y[iy+1] fmla DOTI, s4, v3.s[0] // dot[1] += X[ix+1] * Y[iy] fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1] #else fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy] fmla DOTF, s4, v3.s[1] // dot[0] += X[ix+1] * Y[iy+1] fmls DOTI, s4, v3.s[0] // dot[1] -= X[ix+1] * Y[iy] fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1] #endif #else // DOUBLE ld1 {v2.2d}, [X], #16 // V2 = X[ix+1], X[ix]; X += 2 ld1 {v3.2d}, [Y], #16 // V3 = Y[iy+1], Y[iy]; Y += 2 ins v4.d[0], v2.d[1] // V4 = X[ix+1] #if !defined(CONJ) fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy] fmls DOTF, d4, v3.d[1] // dot[0] -= X[ix+1] * Y[iy+1] fmla DOTI, d4, v3.d[0] // dot[1] += X[ix+1] * Y[iy] fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1] #else fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy] fmla DOTF, d4, v3.d[1] // dot[0] += X[ix+1] * Y[iy+1] fmls DOTI, d4, v3.d[0] // dot[1] -= X[ix+1] * Y[iy] fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1] #endif #endif .endm .macro KERNEL_F4 #if !defined(DOUBLE) ld2 {v2.4s, v3.4s}, [X], #32 // V2 = X[ix+1], X[ix]; X += 2 ld2 {v4.4s, v5.4s}, [Y], #32 // V2 = X[ix+1], X[ix]; X += 2 fmla v0.4s, v2.4s, v4.4s // dot[0] += X[ix] * Y[iy] fmla v1.4s, v2.4s, v5.4s // dot[1] += X[ix] * Y[iy+1] PRFM PLDL1KEEP, [X, #1024] PRFM PLDL1KEEP, [Y, #1024] #if !defined(CONJ) fmls v0.4s, v3.4s, v5.4s // dot[0] -= X[ix+1] * Y[iy+1] fmla v1.4s, v3.4s, v4.4s // dot[1] += X[ix+1] * Y[iy] #else fmla v0.4s, v3.4s, v5.4s // dot[0] += X[ix+1] * Y[iy+1] fmls v1.4s, v3.4s, 
v4.4s // dot[1] -= X[ix+1] * Y[iy] #endif #else // DOUBLE ld2 {v2.2d, v3.2d}, [X], #32 // V2 = X[ix+1], X[ix]; X += 2 ld2 {v16.2d, v17.2d}, [Y], #32 fmla v0.2d, v2.2d, v16.2d // dot[0] += X[ix] * Y[iy] fmla v1.2d, v2.2d, v17.2d // dot[1] += X[ix] * Y[iy+1] ld2 {v4.2d, v5.2d}, [X], #32 ld2 {v18.2d, v19.2d}, [Y], #32 fmla v0.2d, v4.2d, v18.2d // dot[1] += X[ix] * Y[iy+1] fmla v1.2d, v4.2d, v19.2d // dot[1] += X[ix] * Y[iy+1] PRFM PLDL1KEEP, [X, #1024] PRFM PLDL1KEEP, [Y, #1024] #if !defined(CONJ) fmls v0.2d, v3.2d, v17.2d // dot[0] -= X[ix+1] * Y[iy+1] fmls v20.2d, v5.2d, v19.2d // dot[0] -= X[ix+1] * Y[iy+1] fmla v1.2d, v3.2d, v16.2d // dot[1] += X[ix+1] * Y[iy] fmla v21.2d, v5.2d, v18.2d // dot[1] += X[ix+1] * Y[iy] #else fmla v0.2d, v3.2d, v17.2d // dot[0] += X[ix+1] * Y[iy+1] fmla v20.2d, v5.2d, v19.2d // dot[0] += X[ix+1] * Y[iy+1] fmls v1.2d, v3.2d, v16.2d // dot[1] -= X[ix+1] * Y[iy] fmls v21.2d, v5.2d, v18.2d // dot[1] -= X[ix+1] * Y[iy] #endif #endif .endm .macro KERNEL_F4_FINALIZE #if !defined(DOUBLE) ext v2.16b, v0.16b, v0.16b, #8 fadd v0.2s, v0.2s, v2.2s faddp DOTF, v0.2s ext v3.16b, v1.16b, v1.16b, #8 fadd v1.2s, v1.2s, v3.2s faddp DOTI, v1.2s #else fadd v0.2d, v0.2d, v20.2d faddp DOTF, v0.2d fadd v1.2d, v1.2d, v21.2d faddp DOTI, v1.2d #endif .endm .macro INIT_S #if !defined(DOUBLE) lsl INC_X, INC_X, #3 lsl INC_Y, INC_Y, #3 #else lsl INC_X, INC_X, #4 lsl INC_Y, INC_Y, #4 #endif .endm .macro KERNEL_S1 #if !defined(DOUBLE) ld1 {v2.2s}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2 ld1 {v3.2s}, [Y], INC_Y // V3 = Y[iy+1], Y[iy]; Y += 2 ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1] #if !defined(CONJ) fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy] fmls DOTF, s4, v3.s[1] // dot[0] -= X[ix+1] * Y[iy+1] fmla DOTI, s4, v3.s[0] // dot[1] += X[ix+1] * Y[iy] fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1] #else fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy] fmla DOTF, s4, v3.s[1] // dot[0] += X[ix+1] * Y[iy+1] fmls DOTI, s4, v3.s[0] // dot[1] -= X[ix+1] * Y[iy] fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1] #endif #else // DOUBLE ld1 {v2.2d}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2 ld1 {v3.2d}, [Y], INC_Y // V3 = Y[iy+1], Y[iy]; Y += 2 ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1] #if !defined(CONJ) fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy] fmls DOTF, d4, v3.d[1] // dot[0] -= X[ix+1] * Y[iy+1] fmla DOTI, d4, v3.d[0] // dot[1] += X[ix+1] * Y[iy] fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1] #else fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy] fmla DOTF, d4, v3.d[1] // dot[0] += X[ix+1] * Y[iy+1] fmls DOTI, d4, v3.d[0] // dot[1] -= X[ix+1] * Y[iy] fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1] #endif #endif .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE fmov DOTF, REG0 fmov DOTI, DOTF #if !defined(DOUBLE) fmov s20, DOTF fmov s21, DOTI #else fmov d20, DOTF fmov d21, DOTI #endif cmp N, xzr ble dot_kernel_L999 cmp INC_X, #1 bne dot_kernel_S_BEGIN cmp INC_Y, #1 bne dot_kernel_S_BEGIN dot_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr beq dot_kernel_F1 dot_kernel_F4: KERNEL_F4 subs I, I, #1 bne dot_kernel_F4 KERNEL_F4_FINALIZE dot_kernel_F1: ands I, N, #3 ble dot_kernel_L999 dot_kernel_F10: KERNEL_F1 subs I, I, #1 bne dot_kernel_F10 ret dot_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr ble dot_kernel_S1 dot_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne dot_kernel_S4 
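// Strided remainder: handle the last (N & 3) complex elements one at a time.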
dot_kernel_S1: ands I, N, #3 ble dot_kernel_L999 dot_kernel_S10: KERNEL_S1 subs I, I, #1 bne dot_kernel_S10 dot_kernel_L999: ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/zdot_thunderx2t99.c000066400000000000000000000242241313527062700210560ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" #include #define N "x0" /* vector length */ #define X "x1" /* "X" vector address */ #define INC_X "x2" /* "X" stride */ #define Y "x3" /* "Y" vector address */ #define INC_Y "x4" /* "Y" stride */ #define J "x5" /* loop variable */ #if !defined(DOUBLE) #define REG0 "wzr" #define DOTF "s0" #define DOTI "s1" #define INC_SHIFT "3" #define N_DIV_SHIFT "4" #define N_REM_MASK "15" #else #define REG0 "xzr" #define DOTF "d0" #define DOTI "d1" #define INC_SHIFT "4" #define N_DIV_SHIFT "3" #define N_REM_MASK "7" #endif #if !defined(CONJ) #define f_ii "fmls" #define f_ir "fmla" #define a_ii "fsub" #define a_ir "fadd" #else #define f_ii "fmla" #define f_ir "fmls" #define a_ii "fadd" #define a_ir "fsub" #endif #if !defined(DOUBLE) #define KERNEL_F1 \ " ldr d16, ["X"] \n" \ " ldr d24, ["Y"] \n" \ " add "X", "X", "INC_X" \n" \ " add "Y", "Y", "INC_Y" \n" \ " ins v17.s[0], v16.s[1] \n" \ " fmla "DOTF", s16, v24.s[0] \n" \ " "f_ii" "DOTF", s17, v24.s[1] \n" \ " "f_ir" "DOTI", s17, v24.s[0] \n" \ " fmla "DOTI", s16, v24.s[1] \n" #define KERNEL_F \ " ld2 {v16.4s, v17.4s}, ["X"] \n" \ " ld2 {v24.4s, v25.4s}, ["Y"] \n" \ " add "X", "X", #32 \n" \ " add "Y", "Y", #32 \n" \ " ld2 {v18.4s, v19.4s}, ["X"] \n" \ " ld2 {v26.4s, v27.4s}, ["Y"] \n" \ " add "X", "X", #32 \n" \ " add "Y", "Y", #32 \n" \ " fmla v0.4s, v16.4s, v24.4s \n" \ " fmla v1.4s, v17.4s, v25.4s \n" \ " fmla v2.4s, v16.4s, v25.4s \n" \ " fmla v3.4s, v17.4s, v24.4s \n" \ " ld2 {v20.4s, v21.4s}, ["X"] \n" \ " ld2 {v28.4s, v29.4s}, ["Y"] \n" \ " add "X", "X", #32 \n" \ " add "Y", "Y", #32 \n" \ " fmla v4.4s, v18.4s, v26.4s \n" \ " fmla v5.4s, v19.4s, v27.4s \n" \ " fmla v6.4s, v18.4s, v27.4s \n" \ " fmla v7.4s, v19.4s, v26.4s \n" \ " ld2 {v22.4s, v23.4s}, ["X"] \n" \ " ld2 {v30.4s, v31.4s}, ["Y"] \n" \ " fmla v0.4s, v20.4s, v28.4s \n" \ " fmla v1.4s, v21.4s, v29.4s \n" \ " fmla v2.4s, v20.4s, v29.4s \n" \ " fmla v3.4s, v21.4s, v28.4s \n" \ " add "X", "X", #32 \n" \ " add "Y", "Y", #32 \n" \ " PRFM PLDL1KEEP, ["X", #1024] \n" \ " PRFM PLDL1KEEP, ["Y", #1024] \n" \ " PRFM PLDL1KEEP, ["X", #1024+64] \n" \ " PRFM PLDL1KEEP, ["Y", #1024+64] \n" \ " fmla v4.4s, v22.4s, v30.4s \n" \ " fmla v5.4s, v23.4s, v31.4s \n" \ " fmla v6.4s, v22.4s, v31.4s \n" \ " fmla v7.4s, v23.4s, v30.4s \n" #define KERNEL_F_FINALIZE \ " fadd v0.4s, v0.4s, v4.4s \n" \ " fadd v1.4s, v1.4s, v5.4s \n" \ " fadd v2.4s, v2.4s, v6.4s \n" \ " fadd v3.4s, v3.4s, v7.4s \n" \ " "a_ii" v0.4s, v0.4s, v1.4s \n" \ " "a_ir" v1.4s, v2.4s, v3.4s \n" \ " faddp v0.4s, v0.4s, v0.4s \n" \ " faddp v0.4s, v0.4s, v0.4s \n" \ " faddp v1.4s, v1.4s, v1.4s \n" \ " faddp v1.4s, v1.4s, v1.4s \n" #else #define KERNEL_F1 \ " ldr q16, ["X"] \n" \ " ldr q24, ["Y"] \n" \ " add "X", "X", "INC_X" \n" \ " add "Y", "Y", "INC_Y" \n" \ " ins v17.d[0], v16.d[1] \n" \ " fmla "DOTF", d16, v24.d[0] \n" \ " "f_ii" "DOTF", d17, v24.d[1] \n" \ " "f_ir" "DOTI", d17, v24.d[0] \n" \ " fmla "DOTI", d16, v24.d[1] \n" #define KERNEL_F \ " ld2 {v16.2d, v17.2d}, ["X"] \n" \ " ld2 {v24.2d, v25.2d}, ["Y"] \n" \ " add "X", "X", #32 \n" \ " add "Y", "Y", #32 \n" \ " ld2 {v18.2d, v19.2d}, ["X"] \n" \ " ld2 {v26.2d, v27.2d}, ["Y"] \n" \ " add "X", "X", #32 \n" \ " add "Y", "Y", #32 \n" \ " fmla v0.2d, v16.2d, v24.2d \n" \ " fmla v1.2d, v17.2d, v25.2d \n" \ " fmla v2.2d, v16.2d, v25.2d \n" \ " fmla v3.2d, v17.2d, v24.2d \n" \ " ld2 {v20.2d, v21.2d}, ["X"] \n" \ " ld2 {v28.2d, v29.2d}, ["Y"] \n" \ " add "X", "X", #32 \n" \ " add "Y", "Y", #32 \n" \ " 
fmla v4.2d, v18.2d, v26.2d \n" \ " fmla v5.2d, v19.2d, v27.2d \n" \ " fmla v6.2d, v18.2d, v27.2d \n" \ " fmla v7.2d, v19.2d, v26.2d \n" \ " ld2 {v22.2d, v23.2d}, ["X"] \n" \ " ld2 {v30.2d, v31.2d}, ["Y"] \n" \ " fmla v0.2d, v20.2d, v28.2d \n" \ " fmla v1.2d, v21.2d, v29.2d \n" \ " fmla v2.2d, v20.2d, v29.2d \n" \ " fmla v3.2d, v21.2d, v28.2d \n" \ " add "X", "X", #32 \n" \ " add "Y", "Y", #32 \n" \ " PRFM PLDL1KEEP, ["X", #1024] \n" \ " PRFM PLDL1KEEP, ["Y", #1024] \n" \ " PRFM PLDL1KEEP, ["X", #1024+64] \n" \ " PRFM PLDL1KEEP, ["Y", #1024+64] \n" \ " fmla v4.2d, v22.2d, v30.2d \n" \ " fmla v5.2d, v23.2d, v31.2d \n" \ " fmla v6.2d, v22.2d, v31.2d \n" \ " fmla v7.2d, v23.2d, v30.2d \n" #define KERNEL_F_FINALIZE \ " fadd v0.2d, v0.2d, v4.2d \n" \ " fadd v1.2d, v1.2d, v5.2d \n" \ " fadd v2.2d, v2.2d, v6.2d \n" \ " fadd v3.2d, v3.2d, v7.2d \n" \ " "a_ii" v0.2d, v0.2d, v1.2d \n" \ " "a_ir" v1.2d, v2.2d, v3.2d \n" \ " faddp "DOTF", v0.2d \n" \ " faddp "DOTI", v1.2d \n" #endif #if defined(SMP) extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); #endif static void zdot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, OPENBLAS_COMPLEX_FLOAT *result) { FLOAT dotr = 0.0, doti = 0.0; CREAL(*result) = 0.0; CIMAG(*result) = 0.0; if ( n < 0 ) return; __asm__ __volatile__ ( " mov "N", %[N_] \n" " mov "X", %[X_] \n" " mov "INC_X", %[INCX_] \n" " mov "Y", %[Y_] \n" " mov "INC_Y", %[INCY_] \n" " fmov "DOTF", "REG0" \n" " fmov "DOTI", "REG0" \n" " fmov d2, xzr \n" " fmov d3, xzr \n" " fmov d4, xzr \n" " fmov d5, xzr \n" " fmov d6, xzr \n" " fmov d7, xzr \n" " cmp "N", xzr \n" " ble .Ldot_kernel_L999 \n" " cmp "INC_X", #1 \n" " bne .Ldot_kernel_S_BEGIN \n" " cmp "INC_Y", #1 \n" " bne .Ldot_kernel_S_BEGIN \n" ".Ldot_kernel_F_BEGIN: \n" " lsl "INC_X", "INC_X", "INC_SHIFT" \n" " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" " asr "J", "N", #"N_DIV_SHIFT" \n" " cmp "J", xzr \n" " beq .Ldot_kernel_F1 \n" " .align 5 \n" ".Ldot_kernel_F: \n" " "KERNEL_F" \n" " subs "J", "J", #1 \n" " bne .Ldot_kernel_F \n" " "KERNEL_F_FINALIZE" \n" ".Ldot_kernel_F1: \n" " ands "J", "N", #"N_REM_MASK" \n" " ble .Ldot_kernel_L999 \n" ".Ldot_kernel_F10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" " bne .Ldot_kernel_F10 \n" " b .Ldot_kernel_L999 \n" ".Ldot_kernel_S_BEGIN: \n" " lsl "INC_X", "INC_X", "INC_SHIFT" \n" " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" " asr "J", "N", #2 \n" " cmp "J", xzr \n" " ble .Ldot_kernel_S1 \n" ".Ldot_kernel_S4: \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" " bne .Ldot_kernel_S4 \n" ".Ldot_kernel_S1: \n" " ands "J", "N", #3 \n" " ble .Ldot_kernel_L999 \n" ".Ldot_kernel_S10: \n" " "KERNEL_F1" \n" " subs "J", "J", #1 \n" " bne .Ldot_kernel_S10 \n" ".Ldot_kernel_L999: \n" " str "DOTF", [%[DOTR_]] \n" " str "DOTI", [%[DOTI_]] \n" : : [DOTR_] "r" (&dotr), //%0 [DOTI_] "r" (&doti), //%1 [N_] "r" (n), //%2 [X_] "r" (x), //%3 [INCX_] "r" (inc_x), //%4 [Y_] "r" (y), //%5 [INCY_] "r" (inc_y) //%6 : "cc", "memory", "x0", "x1", "x2", "x3", "x4", "x5", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" ); CREAL(*result) = dotr; CIMAG(*result) = doti; return; } #if defined(SMP) static int zdot_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) { zdot_compute(n, x, inc_x, y, inc_y, (void *)result); 
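    /* Each worker thread writes its partial complex dot product into its own
       slot of the shared result buffer; CNAME below sums the per-thread
       partial results. */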
return 0; } #endif OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { #if defined(SMP) int nthreads; FLOAT dummy_alpha; #endif OPENBLAS_COMPLEX_FLOAT zdot; CREAL(zdot) = 0.0; CIMAG(zdot) = 0.0; #if defined(SMP) nthreads = num_cpu_avail(1); if (inc_x == 0 || inc_y == 0) nthreads = 1; if (n <= 10000) nthreads = 1; if (nthreads == 1) { zdot_compute(n, x, inc_x, y, inc_y, &zdot); } else { int mode, i; char result[MAX_CPU_NUMBER * sizeof(double) * 2]; OPENBLAS_COMPLEX_FLOAT *ptr; #if !defined(DOUBLE) mode = BLAS_SINGLE | BLAS_COMPLEX; #else mode = BLAS_DOUBLE | BLAS_COMPLEX; #endif blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, y, inc_y, result, 0, ( void *)zdot_thread_function, nthreads); ptr = (OPENBLAS_COMPLEX_FLOAT *)result; for (i = 0; i < nthreads; i++) { CREAL(zdot) = CREAL(zdot) + CREAL(*ptr); CIMAG(zdot) = CIMAG(zdot) + CIMAG(*ptr); ptr = (void *)(((char *)ptr) + sizeof(double) * 2); } } #else zdot_compute(n, x, inc_x, y, inc_y, &zdot); #endif return zdot; } OpenBLAS-0.2.20/kernel/arm64/zgemm_kernel_4x4.S000066400000000000000000001026441313527062700206660ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 X3 x4 x5 x6 */ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define temp x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pCRow3 x15 #define pA x16 #define alphaR x17 #define alphaI x18 #define alpha0_R d10 #define alphaV0_R v10.d[0] #define alpha0_I d11 #define alphaV0_I v11.d[0] #define A_PRE_SIZE 2560 #define B_PRE_SIZE 448 #define C_PRE_SIZE 128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla #define OP_ii fmls #define OP_ri fmla #define OP_ir fmla #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define OP_rr fmla #define OP_ii fmla #define OP_ri fmls #define OP_ir fmla #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define OP_rr fmla #define OP_ii fmla #define OP_ri fmla #define OP_ir fmls #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) #define OP_rr fmla #define OP_ii fmls #define OP_ri fmls #define OP_ir fmls #endif // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 offset -> temp // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pCRow3 // 16 pA // 17 alpha_save_R // 18 must save alpha_save_I // 19 must save // 20 must save // 21 must save // 22 must save // 23 must save // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp //v00 ALPHA_R -> pA00_R, pA01_R //v01 ALPHA_I -> pA00_I, pA01_I //v02 pA02_R, pA03_R //v03 pA02_I, pA03_I //v04 pA10_R, pA11_R //v05 pA10_I, pA11_I //v06 pA12_R, pA13_R //v07 pA12_I, pA13_I //v08 must save pB00_R, pB01_R //v09 must save pB00_I, pB01_I //v10 must save pB02_R, pB03_R OR ALPHA0_R //v11 must save pB02_I, pB03_I OR ALPHA0_I //v12 must save pB10_R, pB11_R //v13 must save pB10_I, pB11_I //v14 must save pB12_R, pB13_R OR ALPHA1_R //v15 must save pB12_I, pB13_I OR ALPHA1_R //v16 must save pC00_R, pC01_R //v17 must save pC00_I, pC01_I //v18 pC02_R, pC03_R //v19 pC02_I, pC03_I //v20 pC10_R, pC11_R //v21 pC10_I, pC11_I //v22 pC12_R, pC13_R //v23 pC12_I, pC13_I //v24 pC20_R, pC21_R //v25 pC20_I, pC21_I //v26 pC22_R, pC23_R //v27 pC22_I, pC23_I //v28 pC30_R, pC31_R //v29 pC30_I, pC31_I //v30 pC32_R, pC33_R //v31 pC32_I, pC33_I /******************************************************************************* * Macro definitions *******************************************************************************/ .macro INIT4x4 fmov d16, xzr fmov d17, d16 fmov d18, d17 fmov d19, d16 fmov d20, d17 fmov d21, d16 fmov d22, d17 fmov d23, d16 fmov d24, d17 fmov d25, d16 fmov d26, d17 fmov d27, d16 fmov d28, d17 fmov d29, d16 fmov d30, d17 fmov d31, d16 .endm .macro KERNEL4x4_I ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 fmul v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b fmls v17.2d, v0.2d, v9.d[0] #else fmul v17.2d, v0.2d, v9.d[0] #endif OP_ir v17.2d, v1.2d, v8.d[0] ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 fmul v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, 
v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b fmls v21.2d, v0.2d, v9.d[1] #else fmul v21.2d, v0.2d, v9.d[1] #endif OP_ir v21.2d, v1.2d, v8.d[1] ld2 {v10.2d, v11.2d}, [pB] add pB, pB, #32 fmul v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b fmls v23.2d, v2.2d, v9.d[1] #else fmul v23.2d, v2.2d, v9.d[1] #endif OP_ir v23.2d, v3.2d, v8.d[1] ld2 {v12.2d, v13.2d}, [pB] add pB, pB, #32 fmul v18.2d, v2.2d, v8.d[0] OP_ii v18.2d, v3.2d, v9.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b fmls v19.2d, v2.2d, v9.d[0] #else fmul v19.2d, v2.2d, v9.d[0] #endif OP_ir v19.2d, v3.2d, v8.d[0] ld2 {v4.2d, v5.2d} , [pA] add pA, pA, #32 fmul v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b fmls v25.2d, v0.2d, v11.d[0] #else fmul v25.2d, v0.2d, v11.d[0] #endif OP_ir v25.2d, v1.2d, v10.d[0] ld2 {v6.2d, v7.2d} , [pA] add pA, pA, #32 fmul v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b fmls v27.2d, v2.2d, v11.d[0] #else fmul v27.2d, v2.2d, v11.d[0] #endif OP_ir v27.2d, v3.2d, v10.d[0] ld2 {v14.2d, v15.2d}, [pB] add pB, pB, #32 fmul v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b fmls v29.2d, v0.2d, v11.d[1] #else fmul v29.2d, v0.2d, v11.d[1] #endif OP_ir v29.2d, v1.2d, v10.d[1] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmul v30.2d, v2.2d, v10.d[1] OP_ii v30.2d, v3.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b fmls v31.2d, v2.2d, v11.d[1] #else fmul v31.2d, v2.2d, v11.d[1] #endif OP_ir v31.2d, v3.2d, v10.d[1] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL4x4_M1 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] ld2 {v12.2d, v13.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v2.2d, v8.d[0] OP_ii v18.2d, v3.2d, v9.d[0] OP_ri v19.2d, v2.2d, v9.d[0] OP_ir v19.2d, v3.2d, v8.d[0] ld2 {v4.2d, v5.2d} , [pA] add pA, pA, #32 OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] ld2 {v6.2d, v7.2d} , [pA] add pA, pA, #32 OP_rr v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] ld2 {v14.2d, v15.2d}, [pB] add pB, pB, #32 OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] OP_ir v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] OP_rr v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] OP_ri v27.2d, v2.2d, v11.d[0] OP_ir v27.2d, v3.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] OP_rr v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] OP_ri v29.2d, v0.2d, v11.d[1] OP_ir v29.2d, v1.2d, v10.d[1] OP_rr v30.2d, v2.2d, v10.d[1] OP_ii v30.2d, v3.2d, 
v11.d[1] OP_ri v31.2d, v2.2d, v11.d[1] OP_ir v31.2d, v3.2d, v10.d[1] .endm .macro KERNEL4x4_M2 OP_rr v16.2d, v4.2d, v12.d[0] OP_ii v16.2d, v5.2d, v13.d[0] OP_ri v17.2d, v4.2d, v13.d[0] OP_ir v17.2d, v5.2d, v12.d[0] ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v6.2d, v12.d[0] OP_ii v18.2d, v7.2d, v13.d[0] OP_ri v19.2d, v6.2d, v13.d[0] OP_ir v19.2d, v7.2d, v12.d[0] ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 OP_rr v20.2d, v4.2d, v12.d[1] OP_ii v20.2d, v5.2d, v13.d[1] OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v12.d[1] ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 OP_rr v22.2d, v6.2d, v12.d[1] OP_ii v22.2d, v7.2d, v13.d[1] OP_ri v23.2d, v6.2d, v13.d[1] OP_ir v23.2d, v7.2d, v12.d[1] ld2 {v10.2d, v11.2d}, [pB] add pB, pB, #32 OP_rr v24.2d, v4.2d, v14.d[0] OP_ii v24.2d, v5.2d, v15.d[0] OP_ri v25.2d, v4.2d, v15.d[0] OP_ir v25.2d, v5.2d, v14.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v26.2d, v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v15.d[0] OP_ri v27.2d, v6.2d, v15.d[0] OP_ir v27.2d, v7.2d, v14.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] OP_rr v28.2d, v4.2d, v14.d[1] OP_ii v28.2d, v5.2d, v15.d[1] OP_ri v29.2d, v4.2d, v15.d[1] OP_ir v29.2d, v5.2d, v14.d[1] OP_rr v30.2d, v6.2d, v14.d[1] OP_ii v30.2d, v7.2d, v15.d[1] OP_ri v31.2d, v6.2d, v15.d[1] OP_ir v31.2d, v7.2d, v14.d[1] .endm .macro KERNEL4x4_E OP_rr v16.2d, v4.2d, v12.d[0] OP_ii v16.2d, v5.2d, v13.d[0] OP_ri v17.2d, v4.2d, v13.d[0] OP_ir v17.2d, v5.2d, v12.d[0] OP_rr v18.2d, v6.2d, v12.d[0] OP_ii v18.2d, v7.2d, v13.d[0] OP_ri v19.2d, v6.2d, v13.d[0] OP_ir v19.2d, v7.2d, v12.d[0] OP_rr v20.2d, v4.2d, v12.d[1] OP_ii v20.2d, v5.2d, v13.d[1] OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v12.d[1] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v22.2d, v6.2d, v12.d[1] OP_ii v22.2d, v7.2d, v13.d[1] OP_ri v23.2d, v6.2d, v13.d[1] OP_ir v23.2d, v7.2d, v12.d[1] OP_rr v24.2d, v4.2d, v14.d[0] OP_ii v24.2d, v5.2d, v15.d[0] OP_ri v25.2d, v4.2d, v15.d[0] OP_ir v25.2d, v5.2d, v14.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] OP_rr v26.2d, v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v15.d[0] OP_ri v27.2d, v6.2d, v15.d[0] OP_ir v27.2d, v7.2d, v14.d[0] OP_rr v28.2d, v4.2d, v14.d[1] OP_ii v28.2d, v5.2d, v15.d[1] OP_ri v29.2d, v4.2d, v15.d[1] OP_ir v29.2d, v5.2d, v14.d[1] OP_rr v30.2d, v6.2d, v14.d[1] OP_ii v30.2d, v7.2d, v15.d[1] OP_ri v31.2d, v6.2d, v15.d[1] OP_ir v31.2d, v7.2d, v14.d[1] .endm .macro KERNEL4x4_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] ld2 {v10.2d, v11.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v2.2d, v8.d[0] OP_ii v18.2d, v3.2d, v9.d[0] OP_ri v19.2d, v2.2d, v9.d[0] OP_ir v19.2d, v3.2d, v8.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] OP_ir v25.2d, v1.2d, v10.d[0] OP_rr v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] OP_ri v27.2d, v2.2d, v11.d[0] OP_ir v27.2d, v3.2d, v10.d[0] OP_rr v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] OP_ri v29.2d, v0.2d, v11.d[1] OP_ir v29.2d, v1.2d, v10.d[1] OP_rr v30.2d, v2.2d, v10.d[1] OP_ii v30.2d, v3.2d, v11.d[1] OP_ri v31.2d, v2.2d, v11.d[1] OP_ir v31.2d, v3.2d, v10.d[1] 
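	/* One KERNEL4x4_SUB pass has now applied a single rank-1 complex update to
	 * the 4x4 tile: v16-v31 hold the real/imaginary halves of the 16 C entries,
	 * with A de-interleaved into v0-v3 (reals/imags) and B into v8-v11 by ld2.
	 * OP_rr/OP_ii/OP_ri/OP_ir resolve to fmla or fmls per conjugation case;
	 * for NN/NT/TN/TT they compute
	 *     c_r += a_r*b_r - a_i*b_i,   c_i += a_r*b_i + a_i*b_r               */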
.endm .macro SAVE4x4 fmov alpha0_R, alphaR fmov alpha0_I, alphaI prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] ld2 {v0.2d, v1.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I fmla v1.2d, v16.2d, alphaV0_I fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow0] add pCRow0, pCRow0, #32 ld2 {v2.2d, v3.2d}, [pCRow0] fmla v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I fmla v3.2d, v18.2d, alphaV0_I fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow0] add pCRow0, pCRow0, #32 prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I fmla v5.2d, v20.2d, alphaV0_I fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow1, pCRow1, #32 ld2 {v6.2d, v7.2d}, [pCRow1] fmla v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I fmla v7.2d, v22.2d, alphaV0_I fmla v7.2d, v23.2d, alphaV0_R st2 {v6.2d, v7.2d}, [pCRow1] add pCRow1, pCRow1, #32 prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] ld2 {v0.2d, v1.2d}, [pCRow2] fmla v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I fmla v1.2d, v24.2d, alphaV0_I fmla v1.2d, v25.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow2] add pCRow2, pCRow2, #32 ld2 {v2.2d, v3.2d}, [pCRow2] fmla v2.2d, v26.2d, alphaV0_R fmls v2.2d, v27.2d, alphaV0_I fmla v3.2d, v26.2d, alphaV0_I fmla v3.2d, v27.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow2, pCRow2, #32 prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] ld2 {v4.2d, v5.2d}, [pCRow3] fmla v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I fmla v5.2d, v28.2d, alphaV0_I fmla v5.2d, v29.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow3] add pCRow3, pCRow3, #32 ld2 {v6.2d, v7.2d}, [pCRow3] fmla v6.2d, v30.2d, alphaV0_R fmls v6.2d, v31.2d, alphaV0_I fmla v7.2d, v30.2d, alphaV0_I fmla v7.2d, v31.2d, alphaV0_R st2 {v6.2d, v7.2d}, [pCRow3] add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT2x4 fmov d16, xzr fmov d17, xzr fmov d20, d16 fmov d21, d17 fmov d24, d16 fmov d25, d17 fmov d28, d16 fmov d29, d17 .endm .macro KERNEL2x4_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld2 {v10.2d, v11.2d}, [pB] add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] OP_ir v25.2d, v1.2d, v10.d[0] OP_rr v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] OP_ri v29.2d, v0.2d, v11.d[1] OP_ir v29.2d, v1.2d, v10.d[1] .endm .macro SAVE2x4 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I fmla v1.2d, v16.2d, alphaV0_I fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I fmla v5.2d, v20.2d, alphaV0_I fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I fmla v1.2d, v24.2d, alphaV0_I fmla v1.2d, v25.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I fmla v5.2d, v28.2d, alphaV0_I fmla v5.2d, v29.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, 
pCRow0, #32 .endm /******************************************************************************/ .macro INIT1x4 fmov d16, xzr fmov d17, xzr fmov d20, d16 fmov d21, d17 fmov d24, d16 fmov d25, d17 fmov d28, d16 fmov d29, d17 .endm .macro KERNEL1x4_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld2 {v10.2d, v11.2d}, [pB] add pB, pB, #32 ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 OP_rr d16, d0, v8.d[0] OP_ii d16, d1, v9.d[0] OP_ri d17, d0, v9.d[0] OP_ir d17, d1, v8.d[0] OP_rr d20, d0, v8.d[1] OP_ii d20, d1, v9.d[1] OP_ri d21, d0, v9.d[1] OP_ir d21, d1, v8.d[1] OP_rr d24, d0, v10.d[0] OP_ii d24, d1, v11.d[0] OP_ri d25, d0, v11.d[0] OP_ir d25, d1, v10.d[0] OP_rr d28, d0, v10.d[1] OP_ii d28, d1, v11.d[1] OP_ri d29, d0, v11.d[1] OP_ir d29, d1, v10.d[1] .endm .macro SAVE1x4 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d16, alphaV0_R fmls d0, d17, alphaV0_I fmla d1, d16, alphaV0_I fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.d, v5.d}[0], [pCRow1] fmla d4, d20, alphaV0_R fmls d4, d21, alphaV0_I fmla d5, d20, alphaV0_I fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow1, pCRow1, LDC ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d24, alphaV0_R fmls d0, d25, alphaV0_I fmla d1, d24, alphaV0_I fmla d1, d25, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.d, v5.d}[0], [pCRow1] fmla d4, d28, alphaV0_R fmls d4, d29, alphaV0_I fmla d5, d28, alphaV0_I fmla d5, d29, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT4x2 fmov d16, xzr fmov d17, xzr fmov d18, d16 fmov d19, d17 fmov d20, d16 fmov d21, d17 fmov d22, d16 fmov d23, d17 .endm .macro KERNEL4x2_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] OP_rr v18.2d, v2.2d, v8.d[0] OP_ii v18.2d, v3.2d, v9.d[0] OP_ri v19.2d, v2.2d, v9.d[0] OP_ir v19.2d, v3.2d, v8.d[0] OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] OP_rr v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] .endm .macro SAVE4x2 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I fmla v1.2d, v16.2d, alphaV0_I fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, pCRow1, #32 ld2 {v2.2d, v3.2d}, [pCRow2] fmla v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I fmla v3.2d, v18.2d, alphaV0_I fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow1, pCRow1, LDC ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I fmla v5.2d, v20.2d, alphaV0_I fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow2, pCRow1, #32 ld2 {v6.2d, v7.2d}, [pCRow2] fmla v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I fmla v7.2d, v22.2d, alphaV0_I fmla v7.2d, v23.2d, alphaV0_R st2 {v6.2d, v7.2d}, [pCRow2] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT2x2 fmov d16, xzr fmov d17, xzr fmov d20, d16 fmov d21, d17 .endm .macro KERNEL2x2_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 OP_rr 
v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] .endm .macro SAVE2x2 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I fmla v1.2d, v16.2d, alphaV0_I fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I fmla v5.2d, v20.2d, alphaV0_I fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT1x2 fmov d16, xzr fmov d17, xzr fmov d20, xzr fmov d21, xzr .endm .macro KERNEL1x2_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 OP_rr d16, d0, v8.d[0] OP_ii d16, d1, v9.d[0] OP_ri d17, d0, v9.d[0] OP_ir d17, d1, v8.d[0] OP_rr d20, d0, v8.d[1] OP_ii d20, d1, v9.d[1] OP_ri d21, d0, v9.d[1] OP_ir d21, d1, v8.d[1] .endm .macro SAVE1x2 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d16, alphaV0_R fmls d0, d17, alphaV0_I fmla d1, d16, alphaV0_I fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.d, v5.d}[0], [pCRow1] fmla d4, d20, alphaV0_R fmls d4, d21, alphaV0_I fmla d5, d20, alphaV0_I fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT4x1 fmov d16, xzr fmov d17, d16 fmov d18, d16 fmov d19, d17 .endm .macro KERNEL4x1_SUB ld2 {v8.d, v9.d}[0], [pB] add pB, pB, #16 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] OP_rr v18.2d, v2.2d, v8.d[0] OP_ii v18.2d, v3.2d, v9.d[0] OP_ri v19.2d, v2.2d, v9.d[0] OP_ir v19.2d, v3.2d, v8.d[0] .endm .macro SAVE4x1 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I fmla v1.2d, v16.2d, alphaV0_I fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, pCRow1, #32 ld2 {v2.2d, v3.2d}, [pCRow2] fmla v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I fmla v3.2d, v18.2d, alphaV0_I fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT2x1 fmov d16, xzr fmov d17, xzr .endm .macro KERNEL2x1_SUB ld2 {v8.d, v9.d}[0], [pB] add pB, pB, #16 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] .endm .macro SAVE2x1 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I fmla v1.2d, v16.2d, alphaV0_I fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT1x1 fmov d16, xzr fmov d17, xzr .endm .macro KERNEL1x1_SUB ld2 {v8.d, v9.d}[0], [pB] add pB, pB, #16 ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 OP_rr d16, d0, v8.d[0] OP_ii 
d16, d1, v9.d[0] OP_ri d17, d0, v9.d[0] OP_ir d17, d1, v8.d[0] .endm .macro SAVE1x1 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d16, alphaV0_R fmls d0, d17, alphaV0_I fmla d1, d16, alphaV0_I fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPA] fmov alphaR, d0 fmov alphaI, d1 lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 mov pB, origPB mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble zgemm_kernel_L2_BEGIN zgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC add pCRow3, pCRow2, LDC add pC, pCRow3, LDC mov pA, origPA // pA = start of A array zgemm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble zgemm_kernel_L4_M2_BEGIN .align 5 zgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #3 cmp counterL , #2 blt zgemm_kernel_L4_M4_32 KERNEL4x4_I KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #2 // subtract 2 ble zgemm_kernel_L4_M4_22a .align 5 zgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 bgt zgemm_kernel_L4_M4_22 .align 5 zgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E b zgemm_kernel_L4_M4_44 .align 5 zgemm_kernel_L4_M4_32: tst counterL, #1 ble zgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E b zgemm_kernel_L4_M4_44 zgemm_kernel_L4_M4_40: INIT4x4 zgemm_kernel_L4_M4_44: ands counterL , origK, #7 ble zgemm_kernel_L4_M4_100 .align 5 zgemm_kernel_L4_M4_46: KERNEL4x4_SUB subs counterL, counterL, #1 bne zgemm_kernel_L4_M4_46 zgemm_kernel_L4_M4_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE4x4 zgemm_kernel_L4_M4_END: subs counterI, counterI, #1 bne zgemm_kernel_L4_M4_20 zgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble zgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble zgemm_kernel_L4_M1_BEGIN zgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble zgemm_kernel_L4_M2_40 zgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L4_M2_22 zgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble zgemm_kernel_L4_M2_100 zgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L4_M2_42 zgemm_kernel_L4_M2_100: SAVE2x4 zgemm_kernel_L4_M2_END: zgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble zgemm_kernel_L4_END zgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB 
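	/* Leftover single row of the M loop (M & 1): the 1x4 micro-kernel walks K
	 * in blocks of 8 (counterL = K >> 3), and the K & 7 remainder is handled
	 * one KERNEL1x4_SUB at a time below.                                      */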
asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble zgemm_kernel_L4_M1_40 zgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L4_M1_22 zgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble zgemm_kernel_L4_M1_100 zgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L4_M1_42 zgemm_kernel_L4_M1_100: SAVE1x4 zgemm_kernel_L4_END: lsl temp, origK, #6 add origPB, origPB, temp // B = B + K * 4 * 8 * 2 subs counterJ, counterJ , #1 // j-- bgt zgemm_kernel_L4_BEGIN /******************************************************************************/ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble zgemm_kernel_L999 tst counterJ , #2 ble zgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC add pC,pC,LDC, lsl #1 mov pA, origPA // pA = A zgemm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 ble zgemm_kernel_L2_M2_BEGIN zgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble zgemm_kernel_L2_M4_40 .align 5 zgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L2_M4_22 zgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble zgemm_kernel_L2_M4_100 zgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L2_M4_42 zgemm_kernel_L2_M4_100: SAVE4x2 zgemm_kernel_L2_M4_END: subs counterI, counterI, #1 bgt zgemm_kernel_L2_M4_20 zgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble zgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble zgemm_kernel_L2_M1_BEGIN zgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble zgemm_kernel_L2_M2_40 zgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L2_M2_22 zgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble zgemm_kernel_L2_M2_100 zgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L2_M2_42 zgemm_kernel_L2_M2_100: SAVE2x2 zgemm_kernel_L2_M2_END: zgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble zgemm_kernel_L2_END zgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 ble zgemm_kernel_L2_M1_40 zgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L2_M1_22 zgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble zgemm_kernel_L2_M1_100 zgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L2_M1_42 zgemm_kernel_L2_M1_100: SAVE1x2 zgemm_kernel_L2_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 2 * 8 * 2 /******************************************************************************/ zgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble zgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next mov pA, origPA // pA = A zgemm_kernel_L1_M4_BEGIN: 
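	/* Final pass for odd N: a single column of C remains.  pCRow0 now points at
	 * that column (pC was advanced by one LDC above), and M is tiled again as
	 * 4/2/1 rows using the KERNEL4x1 / KERNEL2x1 / KERNEL1x1 micro-kernels.   */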
mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble zgemm_kernel_L1_M2_BEGIN zgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble zgemm_kernel_L1_M4_40 .align 5 zgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L1_M4_22 zgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble zgemm_kernel_L1_M4_100 zgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L1_M4_42 zgemm_kernel_L1_M4_100: SAVE4x1 zgemm_kernel_L1_M4_END: subs counterI, counterI, #1 bgt zgemm_kernel_L1_M4_20 zgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 ble zgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 ble zgemm_kernel_L1_M1_BEGIN zgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble zgemm_kernel_L1_M2_40 zgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L1_M2_22 zgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble zgemm_kernel_L1_M2_100 zgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L1_M2_42 zgemm_kernel_L1_M2_100: SAVE2x1 zgemm_kernel_L1_M2_END: zgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble zgemm_kernel_L1_END zgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble zgemm_kernel_L1_M1_40 zgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L1_M1_22 zgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble zgemm_kernel_L1_M1_100 zgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L1_M1_42 zgemm_kernel_L1_M1_100: SAVE1x1 zgemm_kernel_L1_END: zgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S000066400000000000000000001027521313527062700232370ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 X3 x4 x5 x6 */ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define temp x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pCRow3 x15 #define pA x16 #define alphaR x17 #define alphaI x18 #define alpha0_R d10 #define alphaV0_R v10.d[0] #define alpha0_I d11 #define alphaV0_I v11.d[0] #define A_PRE_SIZE 3584 #define B_PRE_SIZE 512 #define C_PRE_SIZE 128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla #define OP_ii fmls #define OP_ri fmla #define OP_ir fmla #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define OP_rr fmla #define OP_ii fmla #define OP_ri fmls #define OP_ir fmla #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define OP_rr fmla #define OP_ii fmla #define OP_ri fmla #define OP_ir fmls #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) #define OP_rr fmla #define OP_ii fmls #define OP_ri fmls #define OP_ir fmls #endif // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 offset -> temp // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pCRow3 // 16 pA // 17 alpha_save_R // 18 must save alpha_save_I // 19 must save // 20 must save // 21 must save // 22 must save // 23 must save // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp //v00 ALPHA_R -> pA00_R, pA01_R //v01 ALPHA_I -> pA00_I, pA01_I //v02 pA02_R, pA03_R //v03 pA02_I, pA03_I //v04 pA10_R, pA11_R //v05 pA10_I, pA11_I //v06 pA12_R, pA13_R //v07 pA12_I, pA13_I //v08 must save pB00_R, pB01_R //v09 must save pB00_I, pB01_I //v10 must save pB02_R, pB03_R OR ALPHA0_R //v11 must save pB02_I, pB03_I OR ALPHA0_I //v12 must save pB10_R, pB11_R //v13 must save pB10_I, pB11_I //v14 must save pB12_R, pB13_R OR ALPHA1_R //v15 must save pB12_I, pB13_I OR ALPHA1_R //v16 must save pC00_R, pC01_R //v17 must save pC00_I, pC01_I //v18 pC02_R, pC03_R //v19 pC02_I, pC03_I //v20 pC10_R, pC11_R //v21 pC10_I, pC11_I //v22 pC12_R, pC13_R //v23 pC12_I, pC13_I //v24 pC20_R, pC21_R //v25 pC20_I, pC21_I //v26 pC22_R, pC23_R //v27 pC22_I, pC23_I //v28 pC30_R, pC31_R //v29 pC30_I, pC31_I //v30 pC32_R, pC33_R //v31 pC32_I, pC33_I /******************************************************************************* * Macro definitions 
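 *
 * ThunderX2 T99 variant: unlike the ld2-based zgemm kernel above, the packed
 * B panel is fetched with plain ldr q8..q15 loads, so each B element keeps its
 * (real, imag) pair inside one q register and is addressed as v8.d[0]/v8.d[1];
 * only the A panel is de-interleaved with ld2.  The prefetch distances are
 * also retuned for this core (A_PRE_SIZE 3584, B_PRE_SIZE 512 versus 2560/448
 * in the kernel above).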
*******************************************************************************/ .macro INIT4x4 fmov d16, xzr fmov d17, d16 fmov d18, d17 fmov d19, d16 fmov d20, d17 fmov d21, d16 fmov d22, d17 fmov d23, d16 fmov d24, d17 fmov d25, d16 fmov d26, d17 fmov d27, d16 fmov d28, d17 fmov d29, d16 fmov d30, d17 fmov d31, d16 .endm .macro KERNEL4x4_I ldr q8, [pB] ldr q9, [pB, #16] add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 fmul v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v8.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b fmls v17.2d, v0.2d, v8.d[1] #else fmul v17.2d, v0.2d, v8.d[1] #endif OP_ir v17.2d, v1.2d, v8.d[0] ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 fmul v20.2d, v0.2d, v9.d[0] OP_ii v20.2d, v1.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b fmls v21.2d, v0.2d, v9.d[1] #else fmul v21.2d, v0.2d, v9.d[1] #endif OP_ir v21.2d, v1.2d, v9.d[0] ldr q10, [pB] ldr q11, [pB, #16] add pB, pB, #32 fmul v22.2d, v2.2d, v9.d[0] OP_ii v22.2d, v3.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b fmls v23.2d, v2.2d, v9.d[1] #else fmul v23.2d, v2.2d, v9.d[1] #endif OP_ir v23.2d, v3.2d, v9.d[0] ldr q12, [pB] ldr q13, [pB, #16] add pB, pB, #32 fmul v18.2d, v2.2d, v8.d[0] OP_ii v18.2d, v3.2d, v8.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b fmls v19.2d, v2.2d, v8.d[1] #else fmul v19.2d, v2.2d, v8.d[1] #endif OP_ir v19.2d, v3.2d, v8.d[0] ld2 {v4.2d, v5.2d} , [pA] add pA, pA, #32 fmul v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v10.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b fmls v25.2d, v0.2d, v10.d[1] #else fmul v25.2d, v0.2d, v10.d[1] #endif OP_ir v25.2d, v1.2d, v10.d[0] ld2 {v6.2d, v7.2d} , [pA] add pA, pA, #32 fmul v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v10.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b fmls v27.2d, v2.2d, v10.d[1] #else fmul v27.2d, v2.2d, v10.d[1] #endif OP_ir v27.2d, v3.2d, v10.d[0] ldr q14, [pB] ldr q15, [pB, #16] add pB, pB, #32 fmul v28.2d, v0.2d, v11.d[0] OP_ii v28.2d, v1.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b fmls v29.2d, v0.2d, v11.d[1] #else fmul v29.2d, v0.2d, v11.d[1] #endif OP_ir v29.2d, v1.2d, v11.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmul v30.2d, v2.2d, v11.d[0] OP_ii v30.2d, v3.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b fmls v31.2d, v2.2d, v11.d[1] #else fmul v31.2d, v2.2d, v11.d[1] #endif OP_ir v31.2d, v3.2d, v11.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL4x4_M1 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v8.d[1] OP_ri v17.2d, v0.2d, v8.d[1] OP_ir v17.2d, v1.2d, v8.d[0] ldr q12, [pB] ldr q13, [pB, #16] add pB, pB, #32 OP_rr v18.2d, v2.2d, v8.d[0] OP_ii v18.2d, v3.2d, v8.d[1] OP_ri v19.2d, v2.2d, v8.d[1] OP_ir v19.2d, 
v3.2d, v8.d[0] ld2 {v4.2d, v5.2d} , [pA] add pA, pA, #32 OP_rr v20.2d, v0.2d, v9.d[0] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v9.d[0] ld2 {v6.2d, v7.2d} , [pA] add pA, pA, #32 OP_rr v22.2d, v2.2d, v9.d[0] OP_ii v22.2d, v3.2d, v9.d[1] OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v9.d[0] ldr q14, [pB] ldr q15, [pB, #16] add pB, pB, #32 OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v10.d[1] OP_ri v25.2d, v0.2d, v10.d[1] OP_ir v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] OP_rr v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v10.d[1] OP_ri v27.2d, v2.2d, v10.d[1] OP_ir v27.2d, v3.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] OP_rr v28.2d, v0.2d, v11.d[0] OP_ii v28.2d, v1.2d, v11.d[1] OP_ri v29.2d, v0.2d, v11.d[1] OP_ir v29.2d, v1.2d, v11.d[0] OP_rr v30.2d, v2.2d, v11.d[0] OP_ii v30.2d, v3.2d, v11.d[1] OP_ri v31.2d, v2.2d, v11.d[1] OP_ir v31.2d, v3.2d, v11.d[0] .endm .macro KERNEL4x4_M2 OP_rr v16.2d, v4.2d, v12.d[0] OP_ii v16.2d, v5.2d, v12.d[1] OP_ri v17.2d, v4.2d, v12.d[1] OP_ir v17.2d, v5.2d, v12.d[0] ldr q8, [pB] ldr q9, [pB, #16] add pB, pB, #32 OP_rr v18.2d, v6.2d, v12.d[0] OP_ii v18.2d, v7.2d, v12.d[1] OP_ri v19.2d, v6.2d, v12.d[1] OP_ir v19.2d, v7.2d, v12.d[0] ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 OP_rr v20.2d, v4.2d, v13.d[0] OP_ii v20.2d, v5.2d, v13.d[1] OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v13.d[0] ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 OP_rr v22.2d, v6.2d, v13.d[0] OP_ii v22.2d, v7.2d, v13.d[1] OP_ri v23.2d, v6.2d, v13.d[1] OP_ir v23.2d, v7.2d, v13.d[0] ldr q10, [pB] ldr q11, [pB, #16] add pB, pB, #32 OP_rr v24.2d, v4.2d, v14.d[0] OP_ii v24.2d, v5.2d, v14.d[1] OP_ri v25.2d, v4.2d, v14.d[1] OP_ir v25.2d, v5.2d, v14.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v26.2d, v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v14.d[1] OP_ri v27.2d, v6.2d, v14.d[1] OP_ir v27.2d, v7.2d, v14.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] OP_rr v28.2d, v4.2d, v15.d[0] OP_ii v28.2d, v5.2d, v15.d[1] OP_ri v29.2d, v4.2d, v15.d[1] OP_ir v29.2d, v5.2d, v15.d[0] OP_rr v30.2d, v6.2d, v15.d[0] OP_ii v30.2d, v7.2d, v15.d[1] OP_ri v31.2d, v6.2d, v15.d[1] OP_ir v31.2d, v7.2d, v15.d[0] .endm .macro KERNEL4x4_E OP_rr v16.2d, v4.2d, v12.d[0] OP_ii v16.2d, v5.2d, v12.d[1] OP_ri v17.2d, v4.2d, v12.d[1] OP_ir v17.2d, v5.2d, v12.d[0] OP_rr v18.2d, v6.2d, v12.d[0] OP_ii v18.2d, v7.2d, v12.d[1] OP_ri v19.2d, v6.2d, v12.d[1] OP_ir v19.2d, v7.2d, v12.d[0] OP_rr v20.2d, v4.2d, v13.d[0] OP_ii v20.2d, v5.2d, v13.d[1] OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v13.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v22.2d, v6.2d, v13.d[0] OP_ii v22.2d, v7.2d, v13.d[1] OP_ri v23.2d, v6.2d, v13.d[1] OP_ir v23.2d, v7.2d, v13.d[0] OP_rr v24.2d, v4.2d, v14.d[0] OP_ii v24.2d, v5.2d, v14.d[1] OP_ri v25.2d, v4.2d, v14.d[1] OP_ir v25.2d, v5.2d, v14.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] OP_rr v26.2d, v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v14.d[1] OP_ri v27.2d, v6.2d, v14.d[1] OP_ir v27.2d, v7.2d, v14.d[0] OP_rr v28.2d, v4.2d, v15.d[0] OP_ii v28.2d, v5.2d, v15.d[1] OP_ri v29.2d, v4.2d, v15.d[1] OP_ir v29.2d, v5.2d, v15.d[0] OP_rr v30.2d, v6.2d, v15.d[0] OP_ii v30.2d, v7.2d, v15.d[1] OP_ri v31.2d, v6.2d, v15.d[1] OP_ir v31.2d, v7.2d, v15.d[0] .endm .macro KERNEL4x4_SUB ldr q8, [pB] ldr q9, [pB, #16] add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v8.d[1] OP_ri v17.2d, v0.2d, v8.d[1] OP_ir v17.2d, v1.2d, v8.d[0] ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 OP_rr v20.2d, v0.2d, v9.d[0] OP_ii v20.2d, v1.2d, v9.d[1] 
OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v9.d[0] ldr q10, [pB] ldr q11, [pB, #16] add pB, pB, #32 OP_rr v18.2d, v2.2d, v8.d[0] OP_ii v18.2d, v3.2d, v8.d[1] OP_ri v19.2d, v2.2d, v8.d[1] OP_ir v19.2d, v3.2d, v8.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v22.2d, v2.2d, v9.d[0] OP_ii v22.2d, v3.2d, v9.d[1] OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v9.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v10.d[1] OP_ri v25.2d, v0.2d, v10.d[1] OP_ir v25.2d, v1.2d, v10.d[0] OP_rr v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v10.d[1] OP_ri v27.2d, v2.2d, v10.d[1] OP_ir v27.2d, v3.2d, v10.d[0] OP_rr v28.2d, v0.2d, v11.d[0] OP_ii v28.2d, v1.2d, v11.d[1] OP_ri v29.2d, v0.2d, v11.d[1] OP_ir v29.2d, v1.2d, v11.d[0] OP_rr v30.2d, v2.2d, v11.d[0] OP_ii v30.2d, v3.2d, v11.d[1] OP_ri v31.2d, v2.2d, v11.d[1] OP_ir v31.2d, v3.2d, v11.d[0] .endm .macro SAVE4x4 fmov alpha0_R, alphaR fmov alpha0_I, alphaI prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] ld2 {v0.2d, v1.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I fmla v1.2d, v16.2d, alphaV0_I fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow0] add pCRow0, pCRow0, #32 ld2 {v2.2d, v3.2d}, [pCRow0] fmla v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I fmla v3.2d, v18.2d, alphaV0_I fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow0] add pCRow0, pCRow0, #32 prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I fmla v5.2d, v20.2d, alphaV0_I fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow1, pCRow1, #32 ld2 {v6.2d, v7.2d}, [pCRow1] fmla v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I fmla v7.2d, v22.2d, alphaV0_I fmla v7.2d, v23.2d, alphaV0_R st2 {v6.2d, v7.2d}, [pCRow1] add pCRow1, pCRow1, #32 prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] ld2 {v0.2d, v1.2d}, [pCRow2] fmla v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I fmla v1.2d, v24.2d, alphaV0_I fmla v1.2d, v25.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow2] add pCRow2, pCRow2, #32 ld2 {v2.2d, v3.2d}, [pCRow2] fmla v2.2d, v26.2d, alphaV0_R fmls v2.2d, v27.2d, alphaV0_I fmla v3.2d, v26.2d, alphaV0_I fmla v3.2d, v27.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow2, pCRow2, #32 prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] ld2 {v4.2d, v5.2d}, [pCRow3] fmla v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I fmla v5.2d, v28.2d, alphaV0_I fmla v5.2d, v29.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow3] add pCRow3, pCRow3, #32 ld2 {v6.2d, v7.2d}, [pCRow3] fmla v6.2d, v30.2d, alphaV0_R fmls v6.2d, v31.2d, alphaV0_I fmla v7.2d, v30.2d, alphaV0_I fmla v7.2d, v31.2d, alphaV0_R st2 {v6.2d, v7.2d}, [pCRow3] add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT2x4 fmov d16, xzr fmov d17, xzr fmov d20, d16 fmov d21, d17 fmov d24, d16 fmov d25, d17 fmov d28, d16 fmov d29, d17 .endm .macro KERNEL2x4_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld2 {v10.2d, v11.2d}, [pB] add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] OP_ir v25.2d, v1.2d, v10.d[0] OP_rr v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] OP_ri v29.2d, v0.2d, v11.d[1] OP_ir v29.2d, v1.2d, 
v10.d[1] .endm .macro SAVE2x4 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I fmla v1.2d, v16.2d, alphaV0_I fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I fmla v5.2d, v20.2d, alphaV0_I fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I fmla v1.2d, v24.2d, alphaV0_I fmla v1.2d, v25.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I fmla v5.2d, v28.2d, alphaV0_I fmla v5.2d, v29.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT1x4 fmov d16, xzr fmov d17, xzr fmov d20, d16 fmov d21, d17 fmov d24, d16 fmov d25, d17 fmov d28, d16 fmov d29, d17 .endm .macro KERNEL1x4_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld2 {v10.2d, v11.2d}, [pB] add pB, pB, #32 ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 OP_rr d16, d0, v8.d[0] OP_ii d16, d1, v9.d[0] OP_ri d17, d0, v9.d[0] OP_ir d17, d1, v8.d[0] OP_rr d20, d0, v8.d[1] OP_ii d20, d1, v9.d[1] OP_ri d21, d0, v9.d[1] OP_ir d21, d1, v8.d[1] OP_rr d24, d0, v10.d[0] OP_ii d24, d1, v11.d[0] OP_ri d25, d0, v11.d[0] OP_ir d25, d1, v10.d[0] OP_rr d28, d0, v10.d[1] OP_ii d28, d1, v11.d[1] OP_ri d29, d0, v11.d[1] OP_ir d29, d1, v10.d[1] .endm .macro SAVE1x4 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d16, alphaV0_R fmls d0, d17, alphaV0_I fmla d1, d16, alphaV0_I fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.d, v5.d}[0], [pCRow1] fmla d4, d20, alphaV0_R fmls d4, d21, alphaV0_I fmla d5, d20, alphaV0_I fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow1, pCRow1, LDC ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d24, alphaV0_R fmls d0, d25, alphaV0_I fmla d1, d24, alphaV0_I fmla d1, d25, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.d, v5.d}[0], [pCRow1] fmla d4, d28, alphaV0_R fmls d4, d29, alphaV0_I fmla d5, d28, alphaV0_I fmla d5, d29, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT4x2 fmov d16, xzr fmov d17, xzr fmov d18, d16 fmov d19, d17 fmov d20, d16 fmov d21, d17 fmov d22, d16 fmov d23, d17 .endm .macro KERNEL4x2_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] OP_rr v18.2d, v2.2d, v8.d[0] OP_ii v18.2d, v3.2d, v9.d[0] OP_ri v19.2d, v2.2d, v9.d[0] OP_ir v19.2d, v3.2d, v8.d[0] OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] OP_rr v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] .endm .macro SAVE4x2 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I fmla v1.2d, v16.2d, alphaV0_I fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, 
pCRow1, #32 ld2 {v2.2d, v3.2d}, [pCRow2] fmla v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I fmla v3.2d, v18.2d, alphaV0_I fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow1, pCRow1, LDC ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I fmla v5.2d, v20.2d, alphaV0_I fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow2, pCRow1, #32 ld2 {v6.2d, v7.2d}, [pCRow2] fmla v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I fmla v7.2d, v22.2d, alphaV0_I fmla v7.2d, v23.2d, alphaV0_R st2 {v6.2d, v7.2d}, [pCRow2] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT2x2 fmov d16, xzr fmov d17, xzr fmov d20, d16 fmov d21, d17 .endm .macro KERNEL2x2_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] .endm .macro SAVE2x2 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I fmla v1.2d, v16.2d, alphaV0_I fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I fmla v5.2d, v20.2d, alphaV0_I fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT1x2 fmov d16, xzr fmov d17, xzr fmov d20, xzr fmov d21, xzr .endm .macro KERNEL1x2_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 OP_rr d16, d0, v8.d[0] OP_ii d16, d1, v9.d[0] OP_ri d17, d0, v9.d[0] OP_ir d17, d1, v8.d[0] OP_rr d20, d0, v8.d[1] OP_ii d20, d1, v9.d[1] OP_ri d21, d0, v9.d[1] OP_ir d21, d1, v8.d[1] .endm .macro SAVE1x2 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d16, alphaV0_R fmls d0, d17, alphaV0_I fmla d1, d16, alphaV0_I fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC ld2 {v4.d, v5.d}[0], [pCRow1] fmla d4, d20, alphaV0_R fmls d4, d21, alphaV0_I fmla d5, d20, alphaV0_I fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT4x1 fmov d16, xzr fmov d17, d16 fmov d18, d16 fmov d19, d17 .endm .macro KERNEL4x1_SUB ld2 {v8.d, v9.d}[0], [pB] add pB, pB, #16 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] OP_rr v18.2d, v2.2d, v8.d[0] OP_ii v18.2d, v3.2d, v9.d[0] OP_ri v19.2d, v2.2d, v9.d[0] OP_ir v19.2d, v3.2d, v8.d[0] .endm .macro SAVE4x1 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I fmla v1.2d, v16.2d, alphaV0_I fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, pCRow1, #32 ld2 {v2.2d, v3.2d}, [pCRow2] fmla v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I fmla v3.2d, v18.2d, alphaV0_I fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow0, pCRow0, #64 .endm 
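/*
 * All SAVE* macros above follow the same pattern: ld2 splits a piece of the C
 * tile into a register of real parts and a register of imaginary parts, the
 * accumulated product (ab_r, ab_i) is folded in as
 *     c_r += alpha_r*ab_r - alpha_i*ab_i
 *     c_i += alpha_r*ab_i + alpha_i*ab_r
 * (i.e. C += alpha*(A*B) in complex arithmetic), and st2 re-interleaves the
 * result back into memory.
 */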
/******************************************************************************/ .macro INIT2x1 fmov d16, xzr fmov d17, xzr .endm .macro KERNEL2x1_SUB ld2 {v8.d, v9.d}[0], [pB] add pB, pB, #16 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] .endm .macro SAVE2x1 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I fmla v1.2d, v16.2d, alphaV0_I fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT1x1 fmov d16, xzr fmov d17, xzr .endm .macro KERNEL1x1_SUB ld2 {v8.d, v9.d}[0], [pB] add pB, pB, #16 ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 OP_rr d16, d0, v8.d[0] OP_ii d16, d1, v9.d[0] OP_ri d17, d0, v9.d[0] OP_ir d17, d1, v8.d[0] .endm .macro SAVE1x1 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d16, alphaV0_R fmls d0, d17, alphaV0_I fmla d1, d16, alphaV0_I fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPA] fmov alphaR, d0 fmov alphaI, d1 lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 mov pB, origPB mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble zgemm_kernel_L2_BEGIN zgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC add pCRow3, pCRow2, LDC add pC, pCRow3, LDC mov pA, origPA // pA = start of A array zgemm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble zgemm_kernel_L4_M2_BEGIN .align 5 zgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #3 cmp counterL , #2 blt zgemm_kernel_L4_M4_32 KERNEL4x4_I KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #2 // subtract 2 ble zgemm_kernel_L4_M4_22a .align 5 zgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 bgt zgemm_kernel_L4_M4_22 .align 5 zgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E b zgemm_kernel_L4_M4_44 .align 5 zgemm_kernel_L4_M4_32: tst counterL, #1 ble zgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E b zgemm_kernel_L4_M4_44 zgemm_kernel_L4_M4_40: INIT4x4 zgemm_kernel_L4_M4_44: ands counterL , origK, #7 ble zgemm_kernel_L4_M4_100 .align 5 zgemm_kernel_L4_M4_46: KERNEL4x4_SUB subs counterL, counterL, #1 bne zgemm_kernel_L4_M4_46 zgemm_kernel_L4_M4_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE4x4 zgemm_kernel_L4_M4_END: subs counterI, counterI, #1 bne 
zgemm_kernel_L4_M4_20 zgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble zgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble zgemm_kernel_L4_M1_BEGIN zgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble zgemm_kernel_L4_M2_40 zgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L4_M2_22 zgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble zgemm_kernel_L4_M2_100 zgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L4_M2_42 zgemm_kernel_L4_M2_100: SAVE2x4 zgemm_kernel_L4_M2_END: zgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble zgemm_kernel_L4_END zgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble zgemm_kernel_L4_M1_40 zgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L4_M1_22 zgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble zgemm_kernel_L4_M1_100 zgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L4_M1_42 zgemm_kernel_L4_M1_100: SAVE1x4 zgemm_kernel_L4_END: lsl temp, origK, #6 add origPB, origPB, temp // B = B + K * 4 * 8 * 2 subs counterJ, counterJ , #1 // j-- bgt zgemm_kernel_L4_BEGIN /******************************************************************************/ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble zgemm_kernel_L999 tst counterJ , #2 ble zgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC add pC,pC,LDC, lsl #1 mov pA, origPA // pA = A zgemm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 ble zgemm_kernel_L2_M2_BEGIN zgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble zgemm_kernel_L2_M4_40 .align 5 zgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L2_M4_22 zgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble zgemm_kernel_L2_M4_100 zgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L2_M4_42 zgemm_kernel_L2_M4_100: SAVE4x2 zgemm_kernel_L2_M4_END: subs counterI, counterI, #1 bgt zgemm_kernel_L2_M4_20 zgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble zgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble zgemm_kernel_L2_M1_BEGIN zgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble zgemm_kernel_L2_M2_40 zgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L2_M2_22 zgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble zgemm_kernel_L2_M2_100 zgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L2_M2_42 zgemm_kernel_L2_M2_100: SAVE2x2 zgemm_kernel_L2_M2_END: zgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble zgemm_kernel_L2_END zgemm_kernel_L2_M1_20: INIT1x2 mov pB, 
origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 ble zgemm_kernel_L2_M1_40 zgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L2_M1_22 zgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble zgemm_kernel_L2_M1_100 zgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L2_M1_42 zgemm_kernel_L2_M1_100: SAVE1x2 zgemm_kernel_L2_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 2 * 8 * 2 /******************************************************************************/ zgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble zgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next mov pA, origPA // pA = A zgemm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble zgemm_kernel_L1_M2_BEGIN zgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble zgemm_kernel_L1_M4_40 .align 5 zgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L1_M4_22 zgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 ble zgemm_kernel_L1_M4_100 zgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L1_M4_42 zgemm_kernel_L1_M4_100: SAVE4x1 zgemm_kernel_L1_M4_END: subs counterI, counterI, #1 bgt zgemm_kernel_L1_M4_20 zgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 ble zgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 ble zgemm_kernel_L1_M1_BEGIN zgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble zgemm_kernel_L1_M2_40 zgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L1_M2_22 zgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 ble zgemm_kernel_L1_M2_100 zgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L1_M2_42 zgemm_kernel_L1_M2_100: SAVE2x1 zgemm_kernel_L1_M2_END: zgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble zgemm_kernel_L1_END zgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble zgemm_kernel_L1_M1_40 zgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L1_M1_22 zgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 ble zgemm_kernel_L1_M1_100 zgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt zgemm_kernel_L1_M1_42 zgemm_kernel_L1_M1_100: SAVE1x1 zgemm_kernel_L1_END: zgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE 
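For reference, the contract that both zgemm kernels above implement can be written down as a scalar C model. The sketch below is illustrative only and is not part of the OpenBLAS build: it assumes plain column-major complex A (m x k) and B (k x n) stored as interleaved (real, imag) doubles, whereas the assembly kernels read the packed panels prepared by the level-3 driver; only the non-conjugated NN case is shown, and the names zgemm_nn_model and blas_long are invented for the example.

/* Illustrative scalar model (not part of the build): C += alpha * A * B for
 * double-complex data stored as interleaved (real, imag) pairs. */

typedef long blas_long;                  /* stand-in for BLASLONG */

static void zgemm_nn_model(blas_long m, blas_long n, blas_long k,
                           double alpha_r, double alpha_i,
                           const double *a,  /* m x k, column-major          */
                           const double *b,  /* k x n, column-major          */
                           double *c,        /* m x n, leading dimension ldc */
                           blas_long ldc)
{
    for (blas_long j = 0; j < n; j++) {
        for (blas_long i = 0; i < m; i++) {
            double ab_r = 0.0, ab_i = 0.0;
            for (blas_long l = 0; l < k; l++) {
                double ar = a[2 * (i + l * m)],     ai = a[2 * (i + l * m) + 1];
                double br = b[2 * (l + j * k)],     bi = b[2 * (l + j * k) + 1];
                ab_r += ar * br - ai * bi;   /* what OP_rr / OP_ii accumulate (NN case) */
                ab_i += ar * bi + ai * br;   /* what OP_ri / OP_ir accumulate (NN case) */
            }
            double *cij = c + 2 * (i + j * ldc);
            cij[0] += alpha_r * ab_r - alpha_i * ab_i;   /* the SAVE* update */
            cij[1] += alpha_r * ab_i + alpha_i * ab_r;
        }
    }
}

The 4x4/2x4/.../1x1 micro-kernels are this triple loop with i and j unrolled by the tile size and the l loop vectorized two complex elements at a time, which is why the driver code above dispatches on the remainders of M and N.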
OpenBLAS-0.2.20/kernel/arm64/zgemv_n.S000066400000000000000000000256011313527062700171520ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" #define M x0 /* Y vector length */ #define N x1 /* X vector length */ #define A x3 /* A vector address */ #define LDA x4 /* A stride */ #define X x5 /* X vector address */ #define INC_X x6 /* X stride */ #define Y x7 /* Y vector address */ #define INC_Y x2 /* Y stride */ #define A_PTR x9 /* loop A vector address */ #define Y_IPTR x10 /* loop Y vector address */ #define J x11 /* loop variable */ #define I x12 /* loop variable */ #define Y_OPTR x13 /* loop Y vector address */ #define X_PTR x14 /* loop X vector address */ #define A_PRE_SIZE 768 #define Y_PRE_SIZE 768 /******************************************************************************* * Macro definitions *******************************************************************************/ #if !defined(DOUBLE) #define ALPHA_R s0 #define ALPHA_I s1 #define SHZ 3 #else #define ALPHA_R d0 #define ALPHA_I d1 #define SHZ 4 #endif /******************************************************************************/ .macro SAVE_REGS add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] .endm .macro RESTORE_REGS ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) .endm .macro INIT #if !defined(DOUBLE) ins 
v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA) eor v2.16b, v2.16b, v2.16b fsub s2, s2, ALPHA_I ins v1.s[1], v2.s[0] // -I(ALPHA), I(ALPHA) #if !defined(XCONJ) ext v1.8b, v1.8b, v1.8b, #4 // I(ALPHA), -I(ALPHA) #endif #else ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA) eor v2.16b, v2.16b, v2.16b fsub d2, d2, ALPHA_I ins v1.d[1], v2.d[0] // -I(ALPHA), I(ALPHA) #if !defined(XCONJ) ext v1.16b, v1.16b, v1.16b, #8 // I(ALPHA), -I(ALPHA) #endif #endif .endm .macro INIT_LOOP #if !defined(DOUBLE) ld1 {v2.2s}, [X_PTR] // [I(X), R(X)] ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)] fmul v2.2s, v0.2s, v2.2s fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)] ins v3.s[0], v2.s[1] /********** INIT_LOOP FOR F4 LOOP **********/ #if !defined(CONJ) #if !defined(XCONJ) dup v21.4s, v2.s[0] // R[TEMP] dup v22.4s, v2.s[0] // R[TEMP] eor v25.16b, v25.16b, v25.16b fsub s25, s25, s3 dup v23.4s, v25.s[0] // -I[TEMP] dup v24.4s, v3.s[0] // I[TEMP] #else dup v21.4s, v2.s[0] // R[TEMP] dup v22.4s, v2.s[0] // R[TEMP] dup v23.4s, v3.s[0] // I[TEMP] eor v25.16b, v25.16b, v25.16b fsub s25, s25, s3 dup v24.4s, v25.s[0] // -I[TEMP] #endif #else // CONJ #if !defined(XCONJ) dup v21.4s, v2.s[0] // R[TEMP] eor v25.16b, v25.16b, v25.16b fsub s25, s25, s2 dup v22.4s, v25.s[0] // R[TEMP] dup v23.4s, v3.s[0] // I[TEMP] dup v24.4s, v3.s[0] // I[TEMP] #else dup v21.4s, v2.s[0] // R[TEMP] eor v25.16b, v25.16b, v25.16b fsub s25, s25, s2 dup v22.4s, v25.s[0] // R[TEMP] eor v25.16b, v25.16b, v25.16b fsub s25, s25, s3 dup v23.4s, v25.s[0] // I[TEMP] dup v24.4s, v25.s[0] // I[TEMP] #endif #endif // CONJ /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ #if !defined(CONJ) #if !defined(XCONJ) eor v4.16b, v4.16b, v4.16b fsub s4, s4, s3 ins v3.s[1], v4.s[0] ext v3.8b, v3.8b, v3.8b, #4 // [I(TEMP), -I(TEMP)] ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] #else eor v4.16b, v4.16b, v4.16b fsub s4, s4, s3 ins v3.s[1], v4.s[0] // [-I(TEMP), I(TEMP)] ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] #endif #else // CONJ #if !defined(XCONJ) ins v3.s[1], v3.s[0] // [I(TEMP), I(TEMP)] eor v4.16b, v4.16b, v4.16b fsub s4, s4, s2 ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] #else eor v4.16b, v4.16b, v4.16b fsub s3, s4, s3 ins v3.s[1], v3.s[0] // [-I(TEMP), -I(TEMP)] eor v4.16b, v4.16b, v4.16b fsub s4, s4, s2 ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] #endif #endif // CONJ #else // DOUBLE ld1 {v2.2d}, [X_PTR] // [I(X), R(X)] ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)] fmul v2.2d, v0.2d, v2.2d fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)] ins v3.d[0], v2.d[1] // I(TEMP) /****** INIT_LOOP FOR F4 LOOP ******/ #if !defined(CONJ) #if !defined(XCONJ) dup v21.2d, v2.d[0] // R[TEMP] dup v22.2d, v2.d[0] // R[TEMP] eor v25.16b, v25.16b, v25.16b fsub d25, d25, d3 dup v23.2d, v25.d[0] // -I[TEMP] dup v24.2d, v3.d[0] // I[TEMP] #else dup v21.2d, v2.d[0] // R[TEMP] dup v22.2d, v2.d[0] // R[TEMP] dup v23.2d, v3.d[0] // I[TEMP] eor v25.16b, v25.16b, v25.16b fsub d25, d25, d3 dup v24.2d, v25.d[0] // -I[TEMP] #endif #else // CONJ #if !defined(XCONJ) dup v21.2d, v2.d[0] // R[TEMP] eor v25.16b, v25.16b, v25.16b fsub d25, d25, d2 dup v22.2d, v25.d[0] // R[TEMP] dup v23.2d, v3.d[0] // I[TEMP] dup v24.2d, v3.d[0] // I[TEMP] #else dup v21.2d, v2.d[0] // R[TEMP] eor v25.16b, v25.16b, v25.16b fsub d25, d25, d2 dup v22.2d, v25.d[0] // R[TEMP] eor v25.16b, v25.16b, v25.16b fsub d25, d25, d3 dup v23.2d, v25.d[0] // I[TEMP] dup v24.2d, v25.d[0] // I[TEMP] #endif #endif // CONJ /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ #if !defined(CONJ) #if !defined(XCONJ) eor v4.16b, v4.16b, v4.16b fsub d4, d4, d3 ins v3.d[1], 
v4.d[0] ext v3.16b, v3.16b, v3.16b, #8 // [I(TEMP), -I(TEMP)] ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] #else eor v4.16b, v4.16b, v4.16b fsub d4, d4, d3 ins v3.d[1], v4.d[0] // [-I(TEMP), I(TEMP)] ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] #endif #else // CONJ #if !defined(XCONJ) ins v3.d[1], v3.d[0] // [I(TEMP), I(TEMP)] eor v4.16b, v4.16b, v4.16b fsub d4, d4, d2 ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] #else eor v4.16b, v4.16b, v4.16b fsub d3, d4, d3 ins v3.d[1], v3.d[0] // [-I(TEMP), -I(TEMP)] eor v4.16b, v4.16b, v4.16b fsub d4, d4, d2 ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] #endif #endif // CONJ #endif // DOUBLE .endm .macro KERNEL_F4 #if !defined(DOUBLE) ld2 {v13.4s, v14.4s}, [A_PTR], #32 ld2 {v15.4s, v16.4s}, [Y_IPTR], #32 prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] fmla v15.4s, v21.4s, v13.4s fmla v15.4s, v23.4s, v14.4s fmla v16.4s, v22.4s, v14.4s fmla v16.4s, v24.4s, v13.4s st2 {v15.4s, v16.4s}, [Y_OPTR], #32 #else // DOUBLE ld2 {v13.2d, v14.2d}, [A_PTR], #32 ld2 {v15.2d, v16.2d}, [Y_IPTR], #32 prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] fmla v15.2d, v21.2d, v13.2d fmla v15.2d, v23.2d, v14.2d fmla v16.2d, v22.2d, v14.2d fmla v16.2d, v24.2d, v13.2d st2 {v15.2d, v16.2d}, [Y_OPTR], #32 ld2 {v17.2d, v18.2d}, [A_PTR], #32 ld2 {v19.2d, v20.2d}, [Y_IPTR], #32 prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] fmla v19.2d, v21.2d, v17.2d fmla v19.2d, v23.2d, v18.2d fmla v20.2d, v22.2d, v18.2d fmla v20.2d, v24.2d, v17.2d st2 {v19.2d, v20.2d}, [Y_OPTR], #32 #endif .endm .macro KERNEL_F1 #if !defined(DOUBLE) ld1 {v4.2s}, [A_PTR], #8 ld1 {v5.2s}, [Y_IPTR], #8 ext v6.8b, v4.8b, v4.8b, #4 fmla v5.2s, v2.2s, v4.2s fmla v5.2s, v3.2s, v6.2s st1 {v5.2s}, [Y_OPTR], #8 #else // DOUBLE ld1 {v4.2d}, [A_PTR], #16 ld1 {v5.2d}, [Y_IPTR], #16 ext v6.16b, v4.16b, v4.16b, #8 fmla v5.2d, v2.2d, v4.2d fmla v5.2d, v3.2d, v6.2d st1 {v5.2d}, [Y_OPTR], #16 #endif .endm .macro INIT_S lsl INC_Y, INC_Y, #SHZ .endm .macro KERNEL_S1 #if !defined(DOUBLE) ld1 {v4.2s}, [A_PTR], #8 ld1 {v5.2s}, [Y_IPTR], INC_Y ext v6.8b, v4.8b, v4.8b, #4 fmla v5.2s, v2.2s, v4.2s fmla v5.2s, v3.2s, v6.2s st1 {v5.2s}, [Y_OPTR], INC_Y #else // DOUBLE ld1 {v4.2d}, [A_PTR], #16 ld1 {v5.2d}, [Y_IPTR], INC_Y ext v6.16b, v4.16b, v4.16b, #8 fmla v5.2d, v2.2d, v4.2d fmla v5.2d, v3.2d, v6.2d st1 {v5.2d}, [Y_OPTR], INC_Y #endif .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE ldr INC_Y, [sp] SAVE_REGS cmp N, xzr ble zgemv_n_kernel_L999 cmp M, xzr ble zgemv_n_kernel_L999 lsl LDA, LDA, #SHZ lsl INC_X, INC_X, #SHZ mov J, N INIT cmp INC_Y, #1 bne zgemv_n_kernel_S_BEGIN zgemv_n_kernel_F_LOOP: mov A_PTR, A mov Y_IPTR, Y mov Y_OPTR, Y mov X_PTR, X add X, X, INC_X INIT_LOOP asr I, M, #2 cmp I, xzr beq zgemv_n_kernel_F1 zgemv_n_kernel_F4: KERNEL_F4 subs I, I, #1 bne zgemv_n_kernel_F4 zgemv_n_kernel_F1: ands I, M, #3 ble zgemv_n_kernel_F_END zgemv_n_kernel_F10: KERNEL_F1 subs I, I, #1 bne zgemv_n_kernel_F10 zgemv_n_kernel_F_END: add A, A, LDA subs J, J, #1 bne zgemv_n_kernel_F_LOOP b zgemv_n_kernel_L999 zgemv_n_kernel_S_BEGIN: INIT_S zgemv_n_kernel_S_LOOP: mov A_PTR, A mov Y_IPTR, Y mov Y_OPTR, Y mov X_PTR, X add X, X, INC_X INIT_LOOP asr I, M, #2 cmp I, xzr ble zgemv_n_kernel_S1 zgemv_n_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne zgemv_n_kernel_S4 zgemv_n_kernel_S1: ands I, M, #3 ble zgemv_n_kernel_S_END zgemv_n_kernel_S10: KERNEL_S1 subs I, I, #1 bne 
zgemv_n_kernel_S10 zgemv_n_kernel_S_END: add A, A, LDA subs J, J, #1 bne zgemv_n_kernel_S_LOOP zgemv_n_kernel_L999: RESTORE_REGS mov w0, wzr ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/zgemv_t.S000066400000000000000000000250401313527062700171550ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
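*******************************************************************************
* Kernel overview:
*
* This file implements the transposed complex GEMV update
* y := y + alpha * A^T * x (the CONJ/XCONJ build variants only flip the
* signs used in the multiply-accumulate steps). Each outer (J) iteration
* walks one column of A: KERNEL_F4/KERNEL_F1 accumulate the complex dot
* product of that column with X into TEMP, and the *_END blocks multiply
* the partial result by alpha before storing it back to Y. The companion
* zgemv_n.S kernel instead forms temp = alpha * x[j] once per column and
* performs a complex AXPY of that column into Y.
*
* A minimal C sketch of the plain (non-conjugated, unit-stride) case is
* given below. It only illustrates the arithmetic; the function name
* zgemv_t_ref and its argument layout are invented for this note and are
* not the OpenBLAS kernel interface.
*
*   #include <complex.h>
*
*   // m = length of each column of A (and of x); n = number of columns
*   // of A (and length of y); A is column-major with leading dimension lda.
*   static void zgemv_t_ref(int m, int n, double complex alpha,
*                           const double complex *a, int lda,
*                           const double complex *x, double complex *y)
*   {
*       for (int j = 0; j < n; j++) {
*           double complex temp = 0.0;              // TEMP accumulator (v2)
*           for (int i = 0; i < m; i++)             // KERNEL_F4 / KERNEL_F1
*               temp += x[i] * a[(long)j * lda + i];
*           y[j] += alpha * temp;                   // zgemv_t_kernel_F_END
*       }
*   }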
*******************************************************************************/ #define ASSEMBLER #include "common.h" #define M x0 /* Y vector length */ #define N x1 /* X vector length */ #define A x3 /* A vector address */ #define LDA x4 /* A stride */ #define X x5 /* X vector address */ #define INC_X x6 /* X stride */ #define Y x7 /* Y vector address */ #define INC_Y x2 /* Y stride */ #define A_PTR x9 /* loop A vector address */ #define X_PTR x10 /* loop Y vector address */ #define J x11 /* loop variable */ #define I x12 /* loop variable */ #define A_PRE_SIZE 768 #define X_PRE_SIZE 768 /******************************************************************************* * Macro definitions *******************************************************************************/ #if !defined(DOUBLE) #define ALPHA_R s0 #define ALPHA_I s1 #define ALPHA_R_COPY s7 #define ALPHA_I_COPY s8 #define SHZ 3 #else #define ALPHA_R d0 #define ALPHA_I d1 #define ALPHA_R_COPY d7 #define ALPHA_I_COPY d8 #define SHZ 4 #endif /******************************************************************************/ .macro SAVE_REGS add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] .endm .macro RESTORE_REGS ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) .endm .macro INIT #if !defined(XCONJ) #if !defined(DOUBLE) ins v0.s[1], v0.s[0] // v0 = ALPHA_R, ALPHA_R eor v2.16b, v2.16b, v2.16b fsub s2, s2, ALPHA_I ins v1.s[1], v2.s[0] ext v1.8b, v1.8b, v1.8b, #4 // v1 = ALPHA_I, -ALPHA_I #else ins v0.d[1], v0.d[0] // v0 = ALPHA_R, ALPHA_R eor v2.16b, v2.16b, v2.16b fsub d2, d2, ALPHA_I ins v1.d[1], v2.d[0] ext v1.16b, v1.16b, v1.16b, #8 // v1 = ALPHA_I, -ALPHA_I #endif #else // XCONJ #if !defined(DOUBLE) eor v2.16b, v2.16b, v2.16b fsub s2, s2, ALPHA_R ins v0.s[1], v2.s[0] // v0 = -ALPHA_R, ALPHA_R ins v1.s[1], v1.s[0] // v1 = ALPHA_I, ALPHA_I #else eor v2.16b, v2.16b, v2.16b fsub d2, d2, ALPHA_R ins v0.d[1], v2.d[0] // v0 = -ALPHA_R, ALPHA_R ins v1.d[1], v1.d[0] // v1 = ALPHA_I, ALPHA_I #endif #endif .endm .macro INIT_LOOP fmov d9, xzr // TEMP_R = [0, 0] fmov d10, xzr // TEMP_I = [0, 0] #if !defined(DOUBLE) #else fmov d15, xzr // TEMP_R = [0, 0] fmov d16, xzr // TEMP_I = [0, 0] #endif fmov d2, xzr // TEMP = [0, 0] .endm .macro KERNEL_F4 #if !defined(DOUBLE) ld2 {v11.4s, v12.4s}, [X_PTR], #32 ld2 {v13.4s, v14.4s}, [A_PTR], #32 prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE] prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE] #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I] fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I] fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] #else fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I] fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I] fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] #endif #else // DOUBLE ld2 {v11.2d, v12.2d}, [X_PTR], #32 ld2 {v13.2d, v14.2d}, [A_PTR], #32 prfm PLDL1STRM, [X_PTR, 
#X_PRE_SIZE] #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I] fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I] fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] #else fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I] fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I] fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] #endif ld2 {v17.2d, v18.2d}, [X_PTR], #32 ld2 {v19.2d, v20.2d}, [A_PTR], #32 prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE] #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I] fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] #else fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] fmla v15.2d, v18.2d, v20.2d // [- I(X) * A_I] fmls v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] #endif #endif //DOUBLE .endm .macro KERNEL_F4_FINALIZE #if !defined(DOUBLE) ext v21.16b, v9.16b, v9.16b, #8 fadd v9.2s, v9.2s, v21.2s faddp s9, v9.2s ext v21.16b, v10.16b, v10.16b, #8 fadd v10.2s, v10.2s, v21.2s faddp s10, v10.2s ins v2.s[0], v9.s[0] ins v2.s[1], v10.s[0] #else fadd v9.2d, v9.2d, v15.2d fadd v10.2d, v10.2d, v16.2d faddp d9, v9.2d faddp d10, v10.2d ins v2.d[0], v9.d[0] ins v2.d[1], v10.d[0] #endif .endm .macro KERNEL_F1 #if !defined(DOUBLE) ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] ld1 {v5.s}[0], [A_PTR], #4 // A1 ld1 {v6.2s}, [X_PTR], #8 // [X1, X0] eor v16.16b, v16.16b, v16.16b fsub s16, s16, s5 ins v5.s[1], v16.s[0] // [-A1, A1] #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] #endif ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1] fmla v2.2s, v4.2s, v6.2s fmla v2.2s, v5.2s, v7.2s #else // DOUBLE ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] ld1 {v5.d}[0], [A_PTR], #8 // A1 ld1 {v6.2d}, [X_PTR], #16 // [X1, X0] eor v16.16b, v16.16b, v16.16b fsub d16, d16, d5 ins v5.d[1], v16.d[0] // [-A1, A1] #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] #endif ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1] fmla v2.2d, v4.2d, v6.2d fmla v2.2d, v5.2d, v7.2d #endif .endm .macro INIT_S lsl INC_X, INC_X, #SHZ .endm .macro KERNEL_S1 #if !defined(DOUBLE) ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] ld1 {v5.s}[0], [A_PTR], #4 // A1 ld1 {v6.2s}, [X_PTR], INC_X // [X1, X0] eor v16.16b, v16.16b, v16.16b fsub s16, s16, s5 ins v5.s[1], v16.s[0] // [-A1, A1] #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] #endif ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1] fmla v2.2s, v4.2s, v6.2s fmla v2.2s, v5.2s, v7.2s #else // DOUBLE ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] ld1 {v5.d}[0], [A_PTR], #8 // A1 ld1 {v6.2d}, [X_PTR], INC_X // [X1, X0] eor v16.16b, v16.16b, v16.16b fsub d16, d16, d5 ins v5.d[1], v16.d[0] // [-A1, A1] #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] #endif ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1] fmla v2.2d, v4.2d, v6.2d fmla v2.2d, v5.2d, v7.2d #endif .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE ldr INC_Y, [sp] SAVE_REGS cmp N, xzr ble zgemv_t_kernel_L999 cmp M, xzr 
ble zgemv_t_kernel_L999 lsl LDA, LDA, #SHZ lsl INC_Y, INC_Y, #SHZ mov J, N INIT cmp INC_X, #1 bne zgemv_t_kernel_S_BEGIN zgemv_t_kernel_F_LOOP: mov A_PTR, A mov X_PTR, X INIT_LOOP asr I, M, #2 cmp I, xzr beq zgemv_t_kernel_F1 zgemv_t_kernel_F4: KERNEL_F4 subs I, I, #1 bne zgemv_t_kernel_F4 KERNEL_F4_FINALIZE zgemv_t_kernel_F1: ands I, M, #3 ble zgemv_t_kernel_F_END zgemv_t_kernel_F10: KERNEL_F1 subs I, I, #1 bne zgemv_t_kernel_F10 zgemv_t_kernel_F_END: #if !defined(DOUBLE) ld1 {v4.2s}, [Y] ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I] fmla v4.2s, v0.2s, v2.2s fmla v4.2s, v1.2s, v3.2s st1 {v4.2s}, [Y], INC_Y #else // DOUBLE ld1 {v4.2d}, [Y] ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I] fmla v4.2d, v0.2d, v2.2d fmla v4.2d, v1.2d, v3.2d st1 {v4.2d}, [Y], INC_Y #endif add A, A, LDA subs J, J, #1 bne zgemv_t_kernel_F_LOOP b zgemv_t_kernel_L999 zgemv_t_kernel_S_BEGIN: INIT_S zgemv_t_kernel_S_LOOP: mov A_PTR, A mov X_PTR, X INIT_LOOP asr I, M, #2 cmp I, xzr ble zgemv_t_kernel_S1 zgemv_t_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne zgemv_t_kernel_S4 zgemv_t_kernel_S1: ands I, M, #3 ble zgemv_t_kernel_S_END zgemv_t_kernel_S10: KERNEL_S1 subs I, I, #1 bne zgemv_t_kernel_S10 zgemv_t_kernel_S_END: #if !defined(DOUBLE) ld1 {v4.2s}, [Y] ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I] fmla v4.2s, v0.2s, v2.2s fmla v4.2s, v1.2s, v3.2s st1 {v4.2s}, [Y], INC_Y #else // DOUBLE ld1 {v4.2d}, [Y] ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I] fmla v4.2d, v0.2d, v2.2d fmla v4.2d, v1.2d, v3.2d st1 {v4.2d}, [Y], INC_Y #endif add A, A, LDA subs J, J, #1 bne zgemv_t_kernel_S_LOOP zgemv_t_kernel_L999: RESTORE_REGS mov w0, wzr ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/znrm2.S000066400000000000000000000135171313527062700165600ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
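*******************************************************************************
* Kernel overview:
*
* This file computes the Euclidean norm of a complex vector with the usual
* overflow/underflow-safe scaling scheme (the same idea as LAPACK's *LASSQ):
* a running pair (scale, ssq) is maintained so that the accumulated value is
* scale^2 * ssq, each real and imaginary component updates that pair, and the
* epilogue returns scale * sqrt(ssq). Zero components are skipped.
*
* A minimal C sketch of the per-component update is given below; the name
* znrm2_ref and the flat re/im array layout are invented for this note.
*
*   #include <math.h>
*
*   // x points at 2*n doubles laid out re, im, re, im, ...
*   static double znrm2_ref(int n, const double *x)
*   {
*       double scale = 0.0, ssq = 1.0;               // see the INIT macro
*       for (int k = 0; k < 2 * n; k++) {
*           double a = fabs(x[k]);
*           if (a == 0.0)
*               continue;                            // zero component: skip
*           if (scale < a) {                         // new largest magnitude
*               double r = scale / a;
*               ssq = 1.0 + ssq * r * r;
*               scale = a;
*           } else {                                 // KERNEL_*_SCALE_GE_* path
*               double r = a / scale;
*               ssq += r * r;
*           }
*       }
*       return scale * sqrt(ssq);                    // nrm2_kernel_L999
*   }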
*******************************************************************************/ #define ASSEMBLER #include "common.h" #define N x0 #define X x1 #define INC_X x2 #define I x3 #if !defined(DOUBLE) #define SSQ s0 #define SCALE s1 #define REGZERO s6 #define REGONE s7 #else #define SSQ d0 #define SCALE d1 #define REGZERO d6 #define REGONE d7 #endif /************************************************************************************** * Macro definitions **************************************************************************************/ .macro KERNEL_F1 #if !defined(DOUBLE) ldr s4, [X], #4 fcmp s4, REGZERO beq KERNEL_F1_NEXT_\@ fabs s4, s4 fcmp SCALE, s4 bge KERNEL_F1_SCALE_GE_XR_\@ fdiv s2, SCALE, s4 fmul s2, s2, s2 fmul s3, SSQ, s2 fadd SSQ, REGONE, s3 fmov SCALE, s4 b KERNEL_F1_NEXT_\@ KERNEL_F1_SCALE_GE_XR_\@: fdiv s2, s4, SCALE fmla SSQ, s2, v2.s[0] KERNEL_F1_NEXT_\@: ldr s5, [X], #4 fcmp s5, REGZERO beq KERNEL_F1_END_\@ fabs s5, s5 fcmp SCALE, s5 bge KERNEL_F1_SCALE_GE_XI_\@ fdiv s2, SCALE, s5 fmul s2, s2, s2 fmul s3, SSQ, s2 fadd SSQ, REGONE, s3 fmov SCALE, s5 b KERNEL_F1_END_\@ KERNEL_F1_SCALE_GE_XI_\@: fdiv s2, s5, SCALE fmla SSQ, s2, v2.s[0] #else ldr d4, [X], #8 fcmp d4, REGZERO beq KERNEL_F1_NEXT_\@ fabs d4, d4 fcmp SCALE, d4 bge KERNEL_F1_SCALE_GE_XR_\@ fdiv d2, SCALE, d4 fmul d2, d2, d2 fmul d3, SSQ, d2 fadd SSQ, REGONE, d3 fmov SCALE, d4 b KERNEL_F1_NEXT_\@ KERNEL_F1_SCALE_GE_XR_\@: fdiv d2, d4, SCALE fmla SSQ, d2, v2.d[0] KERNEL_F1_NEXT_\@: ldr d5, [X], #8 fcmp d5, REGZERO beq KERNEL_F1_END_\@ fabs d5, d5 fcmp SCALE, d5 bge KERNEL_F1_SCALE_GE_XI_\@ fdiv d2, SCALE, d5 fmul d2, d2, d2 fmul d3, SSQ, d2 fadd SSQ, REGONE, d3 fmov SCALE, d5 b KERNEL_F1_END_\@ KERNEL_F1_SCALE_GE_XI_\@: fdiv d2, d5, SCALE fmla SSQ, d2, v2.d[0] #endif KERNEL_F1_END_\@: .endm .macro KERNEL_S1 #if !defined(DOUBLE) ldr s4, [X] fcmp s4, REGZERO beq KERNEL_S1_NEXT_\@ fabs s4, s4 fcmp SCALE, s4 bge KERNEL_S1_SCALE_GE_XR_\@ fdiv s2, SCALE, s4 fmul s2, s2, s2 fmul s3, SSQ, s2 fadd SSQ, REGONE, s3 fmov SCALE, s4 b KERNEL_S1_NEXT_\@ KERNEL_S1_SCALE_GE_XR_\@: fdiv s2, s4, SCALE fmla SSQ, s2, v2.s[0] KERNEL_S1_NEXT_\@: ldr s5, [X, #4] fcmp s5, REGZERO beq KERNEL_S1_END_\@ fabs s5, s5 fcmp SCALE, s5 bge KERNEL_S1_SCALE_GE_XI_\@ fdiv s2, SCALE, s5 fmul s2, s2, s2 fmul s3, SSQ, s2 fadd SSQ, REGONE, s3 fmov SCALE, s5 b KERNEL_S1_END_\@ KERNEL_S1_SCALE_GE_XI_\@: fdiv s2, s5, SCALE fmla SSQ, s2, v2.s[0] #else ldr d4, [X] fcmp d4, REGZERO beq KERNEL_S1_NEXT_\@ fabs d4, d4 fcmp SCALE, d4 bge KERNEL_S1_SCALE_GE_XR_\@ fdiv d2, SCALE, d4 fmul d2, d2, d2 fmul d3, SSQ, d2 fadd SSQ, REGONE, d3 fmov SCALE, d4 b KERNEL_S1_NEXT_\@ KERNEL_S1_SCALE_GE_XR_\@: fdiv d2, d4, SCALE fmla SSQ, d2, v2.d[0] KERNEL_S1_NEXT_\@: ldr d5, [X, #8] fcmp d5, REGZERO beq KERNEL_S1_END_\@ fabs d5, d5 fcmp SCALE, d5 bge KERNEL_S1_SCALE_GE_XI_\@ fdiv d2, SCALE, d5 fmul d2, d2, d2 fmul d3, SSQ, d2 fadd SSQ, REGONE, d3 fmov SCALE, d5 b KERNEL_S1_END_\@ KERNEL_S1_SCALE_GE_XI_\@: fdiv d2, d5, SCALE fmla SSQ, d2, v2.d[0] #endif KERNEL_S1_END_\@: add X, X, INC_X .endm .macro KERNEL_F8 KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 KERNEL_F1 .endm .macro INIT_S #if !defined(DOUBLE) lsl INC_X, INC_X, #3 // INC_X * SIZE #else lsl INC_X, INC_X, #4 // INC_X * SIZE #endif .endm .macro INIT eor v1.16b, v1.16b, v1.16b // scale=0.0 fmov SSQ, #1.0 fmov REGONE, SSQ fmov REGZERO, SCALE .endm /************************************************************************************** * End of macro definitions 
**************************************************************************************/ PROLOGUE .align 5 INIT cmp N, #0 ble nrm2_kernel_L999 cmp INC_X, #0 beq nrm2_kernel_L999 cmp INC_X, #1 bne nrm2_kernel_S_BEGIN nrm2_kernel_F_BEGIN: asr I, N, #3 // I = N / 8 cmp I, xzr ble nrm2_kernel_F1 nrm2_kernel_F8: KERNEL_F8 subs I, I, #1 bne nrm2_kernel_F8 nrm2_kernel_F1: ands I, N, #7 ble nrm2_kernel_L999 nrm2_kernel_F10: KERNEL_F1 subs I, I, #1 bne nrm2_kernel_F10 b nrm2_kernel_L999 nrm2_kernel_S_BEGIN: INIT_S mov I, N .align 5 nrm2_kernel_S10: KERNEL_S1 subs I, I, #1 bne nrm2_kernel_S10 nrm2_kernel_L999: fsqrt SSQ, SSQ fmul SSQ, SCALE, SSQ ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/zrot.S000066400000000000000000000151251313527062700165030ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
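*******************************************************************************
* Kernel overview:
*
* This file applies a plane rotation with real coefficients c and s to pairs
* of complex vector elements: x[i] becomes c*x[i] + s*y[i] and y[i] becomes
* c*y[i] - s*x[i]. Because c and s are real, the same scalar factors apply to
* the real and the imaginary part, which is why the kernel can multiply whole
* [re, im] lanes without any shuffling of components.
*
* A minimal C sketch of the per-element update is given below; zrot_ref is a
* name invented for this note, not the kernel's entry point, and strides are
* assumed to be 1.
*
*   #include <complex.h>
*
*   static void zrot_ref(int n, double complex *x, double complex *y,
*                        double c, double s)
*   {
*       for (int i = 0; i < n; i++) {
*           double complex xi = x[i];
*           double complex yi = y[i];
*           x[i] = c * xi + s * yi;    // KERNEL_*: fmul/fmla into v4, v6, v7
*           y[i] = c * yi - s * xi;    // KERNEL_*: fmul/fmls into v5, v16, v17
*       }
*   }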
*******************************************************************************/ #define ASSEMBLER #include "common.h" #define N x0 /* vector length */ #define X x1 /* X vector address */ #define INC_X x2 /* X stride */ #define Y x3 /* Y vector address */ #define INC_Y x4 /* Y stride */ #define I x5 /* loop variable */ /******************************************************************************* * Macro definitions *******************************************************************************/ #if !defined(DOUBLE) #define C s0 /* scale input value */ #define S s1 /* scale input value */ #else #define C d0 /* scale input value */ #define S d1 /* scale input value */ #endif /******************************************************************************/ .macro INIT #if !defined(DOUBLE) ins v0.s[1], v0.s[0] // [C, C] ins v1.s[1], v1.s[0] // [S, S] #else ins v0.d[1], v0.d[0] // [C, C] ins v1.d[1], v1.d[0] // [S, S] #endif .endm .macro KERNEL_F1 #if !defined(DOUBLE) ld1 {v2.2s}, [X] ld1 {v3.2s}, [Y] fmul v4.2s, v0.2s, v2.2s // [C*X1, C*X0] fmla v4.2s, v1.2s, v3.2s // [C*X1 + S*Y1, C*X0 + S*Y0] fmul v5.2s, v0.2s, v3.2s // [C*Y1, C*Y0] fmls v5.2s, v1.2s, v2.2s // [C*Y1 - S*X1, C*Y0 - S*X0] st1 {v4.2s}, [X], #8 st1 {v5.2s}, [Y], #8 #else ld1 {v2.2d}, [X] ld1 {v3.2d}, [Y] fmul v4.2d, v0.2d, v2.2d // [C*X1, C*X0] fmla v4.2d, v1.2d, v3.2d // [C*X1 + S*Y1, C*X0 + S*Y0] fmul v5.2d, v0.2d, v3.2d // [C*Y1, C*Y0] fmls v5.2d, v1.2d, v2.2d // [C*Y1 - S*X1, C*Y0 - S*X0] st1 {v4.2d}, [X], #16 st1 {v5.2d}, [Y], #16 #endif .endm .macro KERNEL_INIT_F4 #if !defined(DOUBLE) ins v0.d[1], v0.d[0] // [C, C, C, C] ins v1.d[1], v1.d[0] // [S, S, S, S] #endif .endm .macro KERNEL_F4 #if !defined(DOUBLE) ld1 {v2.4s, v3.4s}, [X] ld1 {v4.4s, v5.4s}, [Y] fmul v6.4s, v0.4s, v2.4s // C*X3, C*X2, C*X1, C*X0 fmul v7.4s, v0.4s, v3.4s // C*X7, C*X6, C*X5, C*X4 fmla v6.4s, v1.4s, v4.4s // C*X3+S*Y3, ..., C*X0+S*Y0 fmla v7.4s, v1.4s, v5.4s // C*X7+S*Y7, ..., C*X4+S*Y4 fmul v16.4s, v0.4s, v4.4s // C*Y3, C*Y2, C*Y1, C*Y0 fmul v17.4s, v0.4s, v5.4s // C*Y7, C*Y6, C*Y5, C*Y4 fmls v16.4s, v1.4s, v2.4s // C*Y3-S*X3, ..., C*Y0-S*X0 fmls v17.4s, v1.4s, v3.4s // C*Y7-S*X7, ..., C*Y4-S*X4 st1 {v6.4s,v7.4s}, [X], #32 st1 {v16.4s,v17.4s}, [Y], #32 #else // DOUBLE ld1 {v2.2d, v3.2d}, [X] ld1 {v4.2d, v5.2d}, [Y] fmul v6.2d, v0.2d, v2.2d // C*X3, C*X2, C*X1, C*X0 fmul v7.2d, v0.2d, v3.2d // C*X7, C*X6, C*X5, C*X4 fmla v6.2d, v1.2d, v4.2d // C*X3+S*Y3, ..., C*X0+S*Y0 fmla v7.2d, v1.2d, v5.2d // C*X7+S*Y7, ..., C*X4+S*Y4 fmul v16.2d, v0.2d, v4.2d // C*Y3, C*Y2, C*Y1, C*Y0 fmul v17.2d, v0.2d, v5.2d // C*Y7, C*Y6, C*Y5, C*Y4 fmls v16.2d, v1.2d, v2.2d // C*Y3-S*X3, ..., C*Y0-S*X0 fmls v17.2d, v1.2d, v3.2d // C*Y7-S*X7, ..., C*Y4-S*X4 st1 {v6.2d,v7.2d}, [X], #32 st1 {v16.2d,v17.2d}, [Y], #32 ld1 {v2.2d, v3.2d}, [X] ld1 {v4.2d, v5.2d}, [Y] fmul v6.2d, v0.2d, v2.2d // C*X3, C*X2, C*X1, C*X0 fmul v7.2d, v0.2d, v3.2d // C*X7, C*X6, C*X5, C*X4 fmla v6.2d, v1.2d, v4.2d // C*X3+S*Y3, ..., C*X0+S*Y0 fmla v7.2d, v1.2d, v5.2d // C*X7+S*Y7, ..., C*X4+S*Y4 fmul v16.2d, v0.2d, v4.2d // C*Y3, C*Y2, C*Y1, C*Y0 fmul v17.2d, v0.2d, v5.2d // C*Y7, C*Y6, C*Y5, C*Y4 fmls v16.2d, v1.2d, v2.2d // C*Y3-S*X3, ..., C*Y0-S*X0 fmls v17.2d, v1.2d, v3.2d // C*Y7-S*X7, ..., C*Y4-S*X4 st1 {v6.2d,v7.2d}, [X], #32 st1 {v16.2d,v17.2d}, [Y], #32 #endif .endm .macro INIT_S #if !defined(DOUBLE) lsl INC_X, INC_X, #3 lsl INC_Y, INC_Y, #3 #else lsl INC_X, INC_X, #4 lsl INC_Y, INC_Y, #4 #endif .endm .macro KERNEL_S1 #if !defined(DOUBLE) ld1 {v2.2s}, [X] ld1 {v3.2s}, [Y] fmul v4.2s, v0.2s, 
v2.2s // [C*X1, C*X0] fmla v4.2s, v1.2s, v3.2s // [C*X1 + S*Y1, C*X0 + S*Y0] fmul v5.2s, v0.2s, v3.2s // [C*Y1, C*Y0] fmls v5.2s, v1.2s, v2.2s // [C*Y1 - S*X1, C*Y0 - S*X0] st1 {v4.2s}, [X], INC_X st1 {v5.2s}, [Y], INC_Y #else ld1 {v2.2d}, [X] ld1 {v3.2d}, [Y] fmul v4.2d, v0.2d, v2.2d // [C*X1, C*X0] fmla v4.2d, v1.2d, v3.2d // [C*X1 + S*Y1, C*X0 + S*Y0] fmul v5.2d, v0.2d, v3.2d // [C*Y1, C*Y0] fmls v5.2d, v1.2d, v2.2d // [C*Y1 - S*X1, C*Y0 - S*X0] st1 {v4.2d}, [X], INC_X st1 {v5.2d}, [Y], INC_Y #endif .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE cmp N, xzr ble rot_kernel_L999 INIT cmp INC_X, #1 bne rot_kernel_S_BEGIN cmp INC_Y, #1 bne rot_kernel_S_BEGIN rot_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr beq rot_kernel_F1 KERNEL_INIT_F4 rot_kernel_F4: KERNEL_F4 subs I, I, #1 bne rot_kernel_F4 rot_kernel_F1: ands I, N, #3 ble rot_kernel_L999 rot_kernel_F10: KERNEL_F1 subs I, I, #1 bne rot_kernel_F10 mov w0, wzr ret rot_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr ble rot_kernel_S1 rot_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne rot_kernel_S4 rot_kernel_S1: ands I, N, #3 ble rot_kernel_L999 rot_kernel_S10: KERNEL_S1 subs I, I, #1 bne rot_kernel_S10 rot_kernel_L999: mov w0, wzr ret OpenBLAS-0.2.20/kernel/arm64/zscal.S000066400000000000000000000210161313527062700166150ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
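*******************************************************************************
* Kernel overview:
*
* This file scales a complex vector in place by the complex factor
* (DA_R + i*DA_I): for every element, X[i] becomes (DA_R + i*DA_I) * X[i].
* The prologue branches into separate loops for the special cases DA_R == 0,
* DA_I == 0, and both zero, so the full four-multiply complex product is only
* used when both parts of the scale factor are nonzero.
*
* A minimal C sketch of the general case is given below; zscal_ref is a name
* invented for this note and does not model the special-cased branches.
*
*   #include <complex.h>
*
*   static void zscal_ref(int n, double da_r, double da_i,
*                         double complex *x, int inc_x)
*   {
*       double complex alpha = da_r + da_i * I;     // DA_R + i*DA_I
*       for (long i = 0; i < n; i++)
*           x[i * inc_x] *= alpha;                  // KERNEL_F1 / KERNEL_S1
*       // the assembly additionally special-cases da_r == 0, da_i == 0
*       // and da_r == da_i == 0 (zscal_kernel_R_zero, _I_zero, _RI_zero)
*   }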
*******************************************************************************/ #define ASSEMBLER #include "common.h" #define N x0 /* vector length */ #define X x3 /* X vector address */ #define INC_X x4 /* X stride */ #define I x5 /* loop variable */ #define X_COPY x6 /* Copy of X */ /******************************************************************************* * Macro definitions *******************************************************************************/ #if !defined(DOUBLE) #define DA_R s0 /* real scale input value */ #define DA_I s1 /* imaginary scale input value */ #else #define DA_R d0 /* real scale input value */ #define DA_I d1 /* imaginary scale input value */ #endif /******************************************************************************/ .macro INIT #if !defined(DOUBLE) ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R #else ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R #endif .endm .macro KERNEL_F1 #if !defined(DOUBLE) ld1 {v2.2s}, [X] // X1, X0 fmul s3, DA_R, v2.s[0] // DA_R*X0 fmul s5, DA_I, v2.s[1] // DA_I*X1 fsub s3, s3, s5 // DA_R*X0-DA_I*X1 fmul s4, DA_I, v2.s[0] // DA_I*X0 fmul s5, DA_R, v2.s[1] // DA_R*X1 fadd s4, s4, s5 // DA_I*X0+DA_R*X1 ins v3.s[1], v4.s[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 st1 {v3.2s}, [X], #8 #else ld1 {v2.2d}, [X] // X1, X0 fmul d3, DA_R, v2.d[0] // DA_R*X0 fmul d5, DA_I, v2.d[1] // DA_I*X1 fsub d3, d3, d5 // DA_R*X0-DA_I*X1 fmul d4, DA_I, v2.d[0] // DA_I*X0 fmul d5, DA_R, v2.d[1] // DA_R*X1 fadd d4, d4, d5 // DA_I*X0+DA_R*X1 ins v3.d[1], v4.d[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 st1 {v3.2d}, [X], #16 #endif .endm .macro KERNEL_INIT_F4 #if !defined(DOUBLE) ins v16.s[0], v0.s[0] ins v16.s[1], v16.s[0] ins v16.d[1], v16.d[0] ins v17.s[0], v1.s[0] ins v17.s[1], v17.s[0] ins v17.d[1], v17.d[0] #else //DOUBLE ins v16.d[0], v0.d[0] ins v16.d[1], v16.d[0] ins v17.d[0], v1.d[0] ins v17.d[1], v17.d[0] #endif .endm .macro KERNEL_F4 #if !defined(DOUBLE) ld2 {v2.4s, v3.4s}, [X], #32 fmul v4.4s, v2.4s, v16.4s fmul v6.4s, v3.4s, v17.4s fsub v4.4s, v4.4s, v6.4s fmul v5.4s, v2.4s, v17.4s fmul v6.4s, v3.4s, v16.4s fadd v5.4s, v5.4s, v6.4s st2 {v4.4s, v5.4s}, [X_COPY], #32 #else // DOUBLE ld2 {v2.2d, v3.2d}, [X], #32 fmul v4.2d, v2.2d, v16.2d fmul v6.2d, v3.2d, v17.2d fsub v4.2d, v4.2d, v6.2d fmul v5.2d, v2.2d, v17.2d fmul v6.2d, v3.2d, v16.2d fadd v5.2d, v5.2d, v6.2d st2 {v4.2d, v5.2d}, [X_COPY], #32 ld2 {v18.2d, v19.2d}, [X], #32 fmul v20.2d, v18.2d, v16.2d fmul v6.2d, v19.2d, v17.2d fsub v20.2d, v20.2d, v6.2d fmul v21.2d, v18.2d, v17.2d fmul v6.2d, v19.2d, v16.2d fadd v21.2d, v21.2d, v6.2d st2 {v20.2d, v21.2d}, [X_COPY], #32 #endif PRFM PLDL1KEEP, [X, #1024] .endm .macro INIT_S #if !defined(DOUBLE) lsl INC_X, INC_X, #3 #else lsl INC_X, INC_X, #4 #endif .endm .macro KERNEL_S1 #if !defined(DOUBLE) ld1 {v2.2s}, [X] // X1, X0 fmul s3, DA_R, v2.s[0] // DA_R*X0 fmul s5, DA_I, v2.s[1] // DA_I*X1 fsub s3, s3, s5 // DA_R*X0-DA_I*X1 fmul s4, DA_I, v2.s[0] // DA_I*X0 fmul s5, DA_R, v2.s[1] // DA_R*X1 fadd s4, s4, s5 // DA_I*X0+DA_R*X1 ins v3.s[1], v4.s[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 st1 {v3.2s}, [X], INC_X #else ld1 {v2.2d}, [X] // X1, X0 fmul d3, DA_R, v2.d[0] // DA_R*X0 fmul d5, DA_I, v2.d[1] // DA_I*X1 fsub d3, d3, d5 // DA_R*X0-DA_I*X1 fmul d4, DA_I, v2.d[0] // DA_I*X0 fmul d5, DA_R, v2.d[1] // DA_R*X1 fadd d4, d4, d5 // DA_I*X0+DA_R*X1 ins v3.d[1], v4.d[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 st1 {v3.2d}, [X], INC_X #endif .endm /******************************************************************************* * End of macro definitions 
*******************************************************************************/ PROLOGUE b zscal_begin data_ar: .word 0x3e44fae6 data_ai: .word 0x3d320fa2 data_xr: .word 0x3f4baff1 data_xi: .word 0xbe8ef0bd zscal_begin: ldr s20, data_ar ldr s21, data_ai ldr s22, data_xr ldr s23, data_xi fmul s24, s22, s21 fmla s24, s23, v20.s[0] fmul s25, s22, s21 fmul s26, s23, s20 fadd s25, s25, s26 mov X_COPY, X cmp N, xzr ble zscal_kernel_L999 fcmp DA_R, #0.0 bne zscal_kernel_R_non_zero fcmp DA_I, #0.0 beq zscal_kernel_RI_zero b zscal_kernel_R_zero zscal_kernel_R_non_zero: fcmp DA_I, #0.0 beq zscal_kernel_I_zero /******************************************************************************* * A_R != 0 && A_I != 0 *******************************************************************************/ zscal_kernel_RI_non_zero: INIT cmp INC_X, #1 bne zscal_kernel_S_BEGIN zscal_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr beq zscal_kernel_F1 KERNEL_INIT_F4 zscal_kernel_F4: KERNEL_F4 subs I, I, #1 bne zscal_kernel_F4 zscal_kernel_F1: ands I, N, #3 ble zscal_kernel_L999 zscal_kernel_F10: KERNEL_F1 subs I, I, #1 bne zscal_kernel_F10 mov w0, wzr ret zscal_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr ble zscal_kernel_S1 zscal_kernel_S4: KERNEL_S1 KERNEL_S1 KERNEL_S1 KERNEL_S1 subs I, I, #1 bne zscal_kernel_S4 zscal_kernel_S1: ands I, N, #3 ble zscal_kernel_L999 zscal_kernel_S10: KERNEL_S1 subs I, I, #1 bne zscal_kernel_S10 zscal_kernel_L999: mov w0, wzr ret /******************************************************************************* * A_R == 0 && A_I != 0 *******************************************************************************/ zscal_kernel_R_zero: INIT_S #if !defined(DOUBLE) eor v2.16b, v2.16b, v2.16b fsub s2, s2, DA_I ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I #else eor v2.16b, v2.16b, v2.16b fsub d2, d2, DA_I ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I #endif zscal_kernel_R_zero_1: #if !defined(DOUBLE) ld1 {v2.2s}, [X] // X1, X0 fmul v2.2s, v2.2s, v1.2s // -DA_I*X1, DA_I*X0 ext v2.8b, v2.8b, v2.8b, #4 // DA_I*X0, -DA_I*X1 st1 {v2.2s}, [X] #else ld1 {v2.2d}, [X] // X1, X0 fmul v2.2d, v2.2d, v1.2d // -DA_I*X1, DA_I*X0 ext v2.16b, v2.16b, v2.16b, #8 // DA_I*X0, -DA_I*X1 st1 {v2.2d}, [X] #endif add X, X, INC_X subs N, N, #1 bne zscal_kernel_R_zero_1 mov w0, wzr ret /******************************************************************************* * A_R != 0 && A_I == 0 *******************************************************************************/ zscal_kernel_I_zero: INIT_S #if !defined(DOUBLE) ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R #else ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R #endif zscal_kernel_I_zero_1: #if !defined(DOUBLE) ld1 {v2.2s}, [X] // X1, X0 fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 st1 {v2.2s}, [X] #else ld1 {v2.2d}, [X] // X1, X0 fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0 st1 {v2.2d}, [X] #endif add X, X, INC_X subs N, N, #1 bne zscal_kernel_I_zero_1 mov w0, wzr ret /******************************************************************************* * A_R == 0 && A_I == 0 *******************************************************************************/ zscal_kernel_RI_zero: INIT_S zscal_kernel_RI_zero_1: stp DA_R, DA_I, [X] add X, X, INC_X subs N, N, #1 bne zscal_kernel_RI_zero_1 mov w0, wzr ret EPILOGUE OpenBLAS-0.2.20/kernel/arm64/ztrmm_kernel_4x4.S000066400000000000000000001160501313527062700207140ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #define ASSEMBLER #include "common.h" /* X0 X1 X2 s0 s1 X3 x4 x5 x6 x7 */ /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT alpha1,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset */ #define origM x0 #define origN x1 #define origK x2 #define origPA x3 #define origPB x4 #define pC x5 #define LDC x6 #define offset x7 #define counterL x8 #define counterI x9 #define counterJ x10 #define pB x11 #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 #define pCRow3 x15 #define pA x16 #define alphaR x17 #define alphaI x18 #define temp x19 #define tempOffset x20 #define tempK x21 #define alpha0_R d10 #define alphaV0_R v10.d[0] #define alpha0_I d11 #define alphaV0_I v11.d[0] #define A_PRE_SIZE 2560 #define B_PRE_SIZE 448 #define C_PRE_SIZE 128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla #define OP_ii fmls #define OP_ri fmla #define OP_ir fmla #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define OP_rr fmla #define OP_ii fmla #define OP_ri fmls #define OP_ir fmla #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define OP_rr fmla #define OP_ii fmla #define OP_ri fmla #define OP_ir fmls #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) #define OP_rr fmla #define OP_ii fmls #define OP_ri fmls #define OP_ir fmls #endif // 00 origM // 01 origN // 02 origK // 03 origPA // 04 origPB // 05 pC // 06 origLDC -> LDC // 07 offset -> temp // 08 counterL // 09 counterI // 10 counterJ // 11 pB // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 // 15 pCRow3 // 16 pA // 17 alpha_save_R // 18 must save alpha_save_I // 19 must save temp // 20 must save tempOffset // 21 must save tempK // 22 must save // 23 must save // 24 must save // 25 must save // 26 must save // 27 must save // 28 must save // 29 frame // 30 link // 31 sp //v00 ALPHA_R -> pA00_R, pA01_R //v01 ALPHA_I -> pA00_I, pA01_I //v02 pA02_R, pA03_R //v03 pA02_I, pA03_I //v04 pA10_R, pA11_R //v05 pA10_I, pA11_I //v06 pA12_R, pA13_R //v07 pA12_I, pA13_I //v08 must save pB00_R, pB01_R 
//v09 must save pB00_I, pB01_I //v10 must save pB02_R, pB03_R OR ALPHA0_R //v11 must save pB02_I, pB03_I OR ALPHA0_I //v12 must save pB10_R, pB11_R //v13 must save pB10_I, pB11_I //v14 must save pB12_R, pB13_R OR ALPHA1_R //v15 must save pB12_I, pB13_I OR ALPHA1_R //v16 must save pC00_R, pC01_R //v17 must save pC00_I, pC01_I //v18 pC02_R, pC03_R //v19 pC02_I, pC03_I //v20 pC10_R, pC11_R //v21 pC10_I, pC11_I //v22 pC12_R, pC13_R //v23 pC12_I, pC13_I //v24 pC20_R, pC21_R //v25 pC20_I, pC21_I //v26 pC22_R, pC23_R //v27 pC22_I, pC23_I //v28 pC30_R, pC31_R //v29 pC30_I, pC31_I //v30 pC32_R, pC33_R //v31 pC32_I, pC33_I /******************************************************************************* * Macro definitions *******************************************************************************/ .macro INIT4x4 fmov d16, xzr fmov d17, d16 fmov d18, d17 fmov d19, d16 fmov d20, d17 fmov d21, d16 fmov d22, d17 fmov d23, d16 fmov d24, d17 fmov d25, d16 fmov d26, d17 fmov d27, d16 fmov d28, d17 fmov d29, d16 fmov d30, d17 fmov d31, d16 .endm .macro KERNEL4x4_I ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 fmul v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b fmls v17.2d, v0.2d, v9.d[0] #else fmul v17.2d, v0.2d, v9.d[0] #endif OP_ir v17.2d, v1.2d, v8.d[0] ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 fmul v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b fmls v21.2d, v0.2d, v9.d[1] #else fmul v21.2d, v0.2d, v9.d[1] #endif OP_ir v21.2d, v1.2d, v8.d[1] ld2 {v10.2d, v11.2d}, [pB] add pB, pB, #32 fmul v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b fmls v23.2d, v2.2d, v9.d[1] #else fmul v23.2d, v2.2d, v9.d[1] #endif OP_ir v23.2d, v3.2d, v8.d[1] ld2 {v12.2d, v13.2d}, [pB] add pB, pB, #32 fmul v18.2d, v2.2d, v8.d[0] OP_ii v18.2d, v3.2d, v9.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b fmls v19.2d, v2.2d, v9.d[0] #else fmul v19.2d, v2.2d, v9.d[0] #endif OP_ir v19.2d, v3.2d, v8.d[0] ld2 {v4.2d, v5.2d} , [pA] add pA, pA, #32 fmul v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b fmls v25.2d, v0.2d, v11.d[0] #else fmul v25.2d, v0.2d, v11.d[0] #endif OP_ir v25.2d, v1.2d, v10.d[0] ld2 {v6.2d, v7.2d} , [pA] add pA, pA, #32 fmul v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b fmls v27.2d, v2.2d, v11.d[0] #else fmul v27.2d, v2.2d, v11.d[0] #endif OP_ir v27.2d, v3.2d, v10.d[0] ld2 {v14.2d, v15.2d}, [pB] add pB, pB, #32 fmul v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b fmls v29.2d, v0.2d, v11.d[1] #else fmul v29.2d, v0.2d, v11.d[1] #endif OP_ir 
v29.2d, v1.2d, v10.d[1] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmul v30.2d, v2.2d, v10.d[1] OP_ii v30.2d, v3.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b fmls v31.2d, v2.2d, v11.d[1] #else fmul v31.2d, v2.2d, v11.d[1] #endif OP_ir v31.2d, v3.2d, v10.d[1] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL4x4_M1 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] ld2 {v12.2d, v13.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v2.2d, v8.d[0] OP_ii v18.2d, v3.2d, v9.d[0] OP_ri v19.2d, v2.2d, v9.d[0] OP_ir v19.2d, v3.2d, v8.d[0] ld2 {v4.2d, v5.2d} , [pA] add pA, pA, #32 OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] ld2 {v6.2d, v7.2d} , [pA] add pA, pA, #32 OP_rr v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] ld2 {v14.2d, v15.2d}, [pB] add pB, pB, #32 OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] OP_ir v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] OP_rr v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] OP_ri v27.2d, v2.2d, v11.d[0] OP_ir v27.2d, v3.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] OP_rr v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] OP_ri v29.2d, v0.2d, v11.d[1] OP_ir v29.2d, v1.2d, v10.d[1] OP_rr v30.2d, v2.2d, v10.d[1] OP_ii v30.2d, v3.2d, v11.d[1] OP_ri v31.2d, v2.2d, v11.d[1] OP_ir v31.2d, v3.2d, v10.d[1] .endm .macro KERNEL4x4_M2 OP_rr v16.2d, v4.2d, v12.d[0] OP_ii v16.2d, v5.2d, v13.d[0] OP_ri v17.2d, v4.2d, v13.d[0] OP_ir v17.2d, v5.2d, v12.d[0] ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v6.2d, v12.d[0] OP_ii v18.2d, v7.2d, v13.d[0] OP_ri v19.2d, v6.2d, v13.d[0] OP_ir v19.2d, v7.2d, v12.d[0] ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 OP_rr v20.2d, v4.2d, v12.d[1] OP_ii v20.2d, v5.2d, v13.d[1] OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v12.d[1] ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 OP_rr v22.2d, v6.2d, v12.d[1] OP_ii v22.2d, v7.2d, v13.d[1] OP_ri v23.2d, v6.2d, v13.d[1] OP_ir v23.2d, v7.2d, v12.d[1] ld2 {v10.2d, v11.2d}, [pB] add pB, pB, #32 OP_rr v24.2d, v4.2d, v14.d[0] OP_ii v24.2d, v5.2d, v15.d[0] OP_ri v25.2d, v4.2d, v15.d[0] OP_ir v25.2d, v5.2d, v14.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v26.2d, v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v15.d[0] OP_ri v27.2d, v6.2d, v15.d[0] OP_ir v27.2d, v7.2d, v14.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] OP_rr v28.2d, v4.2d, v14.d[1] OP_ii v28.2d, v5.2d, v15.d[1] OP_ri v29.2d, v4.2d, v15.d[1] OP_ir v29.2d, v5.2d, v14.d[1] OP_rr v30.2d, v6.2d, v14.d[1] OP_ii v30.2d, v7.2d, v15.d[1] OP_ri v31.2d, v6.2d, v15.d[1] OP_ir v31.2d, v7.2d, v14.d[1] .endm .macro KERNEL4x4_E OP_rr v16.2d, v4.2d, v12.d[0] OP_ii v16.2d, v5.2d, v13.d[0] OP_ri v17.2d, v4.2d, v13.d[0] OP_ir v17.2d, v5.2d, v12.d[0] OP_rr v18.2d, v6.2d, v12.d[0] OP_ii v18.2d, v7.2d, v13.d[0] OP_ri v19.2d, v6.2d, v13.d[0] OP_ir v19.2d, v7.2d, v12.d[0] OP_rr v20.2d, v4.2d, v12.d[1] OP_ii v20.2d, v5.2d, v13.d[1] OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v12.d[1] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v22.2d, v6.2d, v12.d[1] OP_ii v22.2d, v7.2d, v13.d[1] OP_ri v23.2d, v6.2d, v13.d[1] OP_ir v23.2d, v7.2d, v12.d[1] OP_rr v24.2d, v4.2d, v14.d[0] OP_ii v24.2d, v5.2d, v15.d[0] OP_ri v25.2d, v4.2d, v15.d[0] OP_ir v25.2d, v5.2d, v14.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] OP_rr v26.2d, 
v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v15.d[0] OP_ri v27.2d, v6.2d, v15.d[0] OP_ir v27.2d, v7.2d, v14.d[0] OP_rr v28.2d, v4.2d, v14.d[1] OP_ii v28.2d, v5.2d, v15.d[1] OP_ri v29.2d, v4.2d, v15.d[1] OP_ir v29.2d, v5.2d, v14.d[1] OP_rr v30.2d, v6.2d, v14.d[1] OP_ii v30.2d, v7.2d, v15.d[1] OP_ri v31.2d, v6.2d, v15.d[1] OP_ir v31.2d, v7.2d, v14.d[1] .endm .macro KERNEL4x4_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] ld2 {v10.2d, v11.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v2.2d, v8.d[0] OP_ii v18.2d, v3.2d, v9.d[0] OP_ri v19.2d, v2.2d, v9.d[0] OP_ir v19.2d, v3.2d, v8.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] OP_ir v25.2d, v1.2d, v10.d[0] OP_rr v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] OP_ri v27.2d, v2.2d, v11.d[0] OP_ir v27.2d, v3.2d, v10.d[0] OP_rr v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] OP_ri v29.2d, v0.2d, v11.d[1] OP_ir v29.2d, v1.2d, v10.d[1] OP_rr v30.2d, v2.2d, v10.d[1] OP_ii v30.2d, v3.2d, v11.d[1] OP_ri v31.2d, v2.2d, v11.d[1] OP_ir v31.2d, v3.2d, v10.d[1] .endm .macro SAVE4x4 fmov alpha0_R, alphaR fmov alpha0_I, alphaI prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I fmul v1.2d, v16.2d, alphaV0_I fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow0] add pCRow0, pCRow0, #32 fmul v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I fmul v3.2d, v18.2d, alphaV0_I fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow0] add pCRow0, pCRow0, #32 prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] fmul v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I fmul v5.2d, v20.2d, alphaV0_I fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow1, pCRow1, #32 fmul v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I fmul v7.2d, v22.2d, alphaV0_I fmla v7.2d, v23.2d, alphaV0_R st2 {v6.2d, v7.2d}, [pCRow1] add pCRow1, pCRow1, #32 prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] fmul v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I fmul v1.2d, v24.2d, alphaV0_I fmla v1.2d, v25.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow2] add pCRow2, pCRow2, #32 fmul v2.2d, v26.2d, alphaV0_R fmls v2.2d, v27.2d, alphaV0_I fmul v3.2d, v26.2d, alphaV0_I fmla v3.2d, v27.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow2, pCRow2, #32 prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] fmul v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I fmul v5.2d, v28.2d, alphaV0_I fmla v5.2d, v29.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow3] add pCRow3, pCRow3, #32 fmul v6.2d, v30.2d, alphaV0_R fmls v6.2d, v31.2d, alphaV0_I fmul v7.2d, v30.2d, alphaV0_I fmla v7.2d, v31.2d, alphaV0_R st2 {v6.2d, v7.2d}, [pCRow3] add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT2x4 fmov d16, xzr fmov d17, xzr fmov d20, d16 fmov d21, d17 fmov d24, d16 fmov d25, d17 fmov d28, d16 fmov d29, d17 .endm .macro KERNEL2x4_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld2 {v10.2d, v11.2d}, [pB] add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii 
v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] OP_ir v25.2d, v1.2d, v10.d[0] OP_rr v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] OP_ri v29.2d, v0.2d, v11.d[1] OP_ir v29.2d, v1.2d, v10.d[1] .endm .macro SAVE2x4 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I fmul v1.2d, v16.2d, alphaV0_I fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I fmul v5.2d, v20.2d, alphaV0_I fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow1, pCRow1, LDC fmul v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I fmul v1.2d, v24.2d, alphaV0_I fmla v1.2d, v25.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I fmul v5.2d, v28.2d, alphaV0_I fmla v5.2d, v29.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT1x4 fmov d16, xzr fmov d17, xzr fmov d20, d16 fmov d21, d17 fmov d24, d16 fmov d25, d17 fmov d28, d16 fmov d29, d17 .endm .macro KERNEL1x4_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld2 {v10.2d, v11.2d}, [pB] add pB, pB, #32 ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 OP_rr d16, d0, v8.d[0] OP_ii d16, d1, v9.d[0] OP_ri d17, d0, v9.d[0] OP_ir d17, d1, v8.d[0] OP_rr d20, d0, v8.d[1] OP_ii d20, d1, v9.d[1] OP_ri d21, d0, v9.d[1] OP_ir d21, d1, v8.d[1] OP_rr d24, d0, v10.d[0] OP_ii d24, d1, v11.d[0] OP_ri d25, d0, v11.d[0] OP_ir d25, d1, v10.d[0] OP_rr d28, d0, v10.d[1] OP_ii d28, d1, v11.d[1] OP_ri d29, d0, v11.d[1] OP_ir d29, d1, v10.d[1] .endm .macro SAVE1x4 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul d0, d16, alphaV0_R fmls d0, d17, alphaV0_I fmul d1, d16, alphaV0_I fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul d4, d20, alphaV0_R fmls d4, d21, alphaV0_I fmul d5, d20, alphaV0_I fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul d0, d24, alphaV0_R fmls d0, d25, alphaV0_I fmul d1, d24, alphaV0_I fmla d1, d25, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul d4, d28, alphaV0_R fmls d4, d29, alphaV0_I fmul d5, d28, alphaV0_I fmla d5, d29, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT4x2 fmov d16, xzr fmov d17, xzr fmov d18, d16 fmov d19, d17 fmov d20, d16 fmov d21, d17 fmov d22, d16 fmov d23, d17 .endm .macro KERNEL4x2_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] OP_rr v18.2d, v2.2d, v8.d[0] OP_ii v18.2d, v3.2d, v9.d[0] OP_ri v19.2d, v2.2d, v9.d[0] OP_ir v19.2d, v3.2d, v8.d[0] OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] OP_rr v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] .endm .macro SAVE4x2 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 
fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I fmul v1.2d, v16.2d, alphaV0_I fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, pCRow1, #32 fmul v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I fmul v3.2d, v18.2d, alphaV0_I fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow1, pCRow1, LDC fmul v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I fmul v5.2d, v20.2d, alphaV0_I fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow2, pCRow1, #32 fmul v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I fmul v7.2d, v22.2d, alphaV0_I fmla v7.2d, v23.2d, alphaV0_R st2 {v6.2d, v7.2d}, [pCRow2] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT2x2 fmov d16, xzr fmov d17, xzr fmov d20, d16 fmov d21, d17 .endm .macro KERNEL2x2_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] .endm .macro SAVE2x2 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I fmul v1.2d, v16.2d, alphaV0_I fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I fmul v5.2d, v20.2d, alphaV0_I fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT1x2 fmov d16, xzr fmov d17, xzr fmov d20, xzr fmov d21, xzr .endm .macro KERNEL1x2_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 OP_rr d16, d0, v8.d[0] OP_ii d16, d1, v9.d[0] OP_ri d17, d0, v9.d[0] OP_ir d17, d1, v8.d[0] OP_rr d20, d0, v8.d[1] OP_ii d20, d1, v9.d[1] OP_ri d21, d0, v9.d[1] OP_ir d21, d1, v8.d[1] .endm .macro SAVE1x2 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul d0, d16, alphaV0_R fmls d0, d17, alphaV0_I fmul d1, d16, alphaV0_I fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul d4, d20, alphaV0_R fmls d4, d21, alphaV0_I fmul d5, d20, alphaV0_I fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT4x1 fmov d16, xzr fmov d17, d16 fmov d18, d16 fmov d19, d17 .endm .macro KERNEL4x1_SUB ld2 {v8.d, v9.d}[0], [pB] add pB, pB, #16 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] OP_rr v18.2d, v2.2d, v8.d[0] OP_ii v18.2d, v3.2d, v9.d[0] OP_ri v19.2d, v2.2d, v9.d[0] OP_ir v19.2d, v3.2d, v8.d[0] .endm .macro SAVE4x1 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I fmul v1.2d, v16.2d, alphaV0_I fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, pCRow1, #32 fmul v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I fmul v3.2d, v18.2d, alphaV0_I fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow0, pCRow0, #64 .endm /******************************************************************************/ .macro INIT2x1 fmov d16, xzr fmov d17, 
xzr .endm .macro KERNEL2x1_SUB ld2 {v8.d, v9.d}[0], [pB] add pB, pB, #16 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] .endm .macro SAVE2x1 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I fmul v1.2d, v16.2d, alphaV0_I fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ .macro INIT1x1 fmov d16, xzr fmov d17, xzr .endm .macro KERNEL1x1_SUB ld2 {v8.d, v9.d}[0], [pB] add pB, pB, #16 ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 OP_rr d16, d0, v8.d[0] OP_ii d16, d1, v9.d[0] OP_ri d17, d0, v9.d[0] OP_ir d17, d1, v8.d[0] .endm .macro SAVE1x1 fmov alpha0_R, alphaR fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul d0, d16, alphaV0_R fmls d0, d17, alphaV0_I fmul d1, d16, alphaV0_I fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow0, pCRow0, #16 .endm /******************************************************************************* * End of macro definitions *******************************************************************************/ PROLOGUE .align 5 add sp, sp, #-(11 * 16) stp d8, d9, [sp, #(0 * 16)] stp d10, d11, [sp, #(1 * 16)] stp d12, d13, [sp, #(2 * 16)] stp d14, d15, [sp, #(3 * 16)] stp d16, d17, [sp, #(4 * 16)] stp x18, x19, [sp, #(5 * 16)] stp x20, x21, [sp, #(6 * 16)] stp x22, x23, [sp, #(7 * 16)] stp x24, x25, [sp, #(8 * 16)] stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPA] fmov alphaR, d0 fmov alphaI, d1 lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 #if !defined(LEFT) neg tempOffset, offset #endif mov pB, origPB mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble ztrmm_kernel_L2_BEGIN ztrmm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC add pCRow3, pCRow2, LDC add pC, pCRow3, LDC #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = start of A array ztrmm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble ztrmm_kernel_L4_M2_BEGIN .align 5 ztrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #6 add pB, pB, temp add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 cmp counterL , #2 blt ztrmm_kernel_L4_M4_32 KERNEL4x4_I KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #2 ble ztrmm_kernel_L4_M4_22a .align 5 ztrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 bgt ztrmm_kernel_L4_M4_22 .align 5 ztrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E b ztrmm_kernel_L4_M4_44 .align 5 ztrmm_kernel_L4_M4_32: tst counterL, #1 ble ztrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E b ztrmm_kernel_L4_M4_44 ztrmm_kernel_L4_M4_40: INIT4x4 ztrmm_kernel_L4_M4_44: ands counterL , tempK, #7 ble 
ztrmm_kernel_L4_M4_100 .align 5 ztrmm_kernel_L4_M4_46: KERNEL4x4_SUB subs counterL, counterL, #1 bne ztrmm_kernel_L4_M4_46 ztrmm_kernel_L4_M4_100: SAVE4x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #6 add pA, pA, temp add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] ztrmm_kernel_L4_M4_END: subs counterI, counterI, #1 bne ztrmm_kernel_L4_M4_20 ztrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 ble ztrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 ble ztrmm_kernel_L4_M1_BEGIN ztrmm_kernel_L4_M2_20: INIT2x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pA, pA, temp lsl temp, tempOffset, #6 add pB, pB, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble ztrmm_kernel_L4_M2_40 ztrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB subs counterL, counterL, #1 bgt ztrmm_kernel_L4_M2_22 ztrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ztrmm_kernel_L4_M2_100 ztrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 bgt ztrmm_kernel_L4_M2_42 ztrmm_kernel_L4_M2_100: SAVE2x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #5 add pA, pA, temp lsl temp, tempK, #6 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif ztrmm_kernel_L4_M2_END: ztrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble ztrmm_kernel_L4_END ztrmm_kernel_L4_M1_20: INIT1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #6 add pB, pB, temp lsl temp, tempOffset, #4 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #4 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble ztrmm_kernel_L4_M1_40 ztrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB subs counterL, counterL, #1 bgt ztrmm_kernel_L4_M1_22 ztrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ztrmm_kernel_L4_M1_100 ztrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 bgt ztrmm_kernel_L4_M1_42 ztrmm_kernel_L4_M1_100: SAVE1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #1 #else sub tempK, tempK, #4 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #6 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #1 #endif ztrmm_kernel_L4_END: lsl temp, origK, #6 add origPB, origPB, temp // B = B + K * 4 * 8 * 2 #if !defined(LEFT) add tempOffset, tempOffset, #4 #endif subs 
counterJ, counterJ , #1 // j-- bgt ztrmm_kernel_L4_BEGIN /******************************************************************************/ ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 ble ztrmm_kernel_L999 // error, N was less than 4? tst counterJ , #2 ble ztrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC add pC,pC,LDC, lsl #1 #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = A ztrmm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 ble ztrmm_kernel_L2_M2_BEGIN ztrmm_kernel_L2_M4_20: INIT4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pB, pB, temp lsl temp, tempOffset, #6 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble ztrmm_kernel_L2_M4_40 .align 5 ztrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB subs counterL, counterL, #1 bgt ztrmm_kernel_L2_M4_22 ztrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ztrmm_kernel_L2_M4_100 ztrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 bgt ztrmm_kernel_L2_M4_42 ztrmm_kernel_L2_M4_100: SAVE4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #6 add pA, pA, temp lsl temp, tempK, #5 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif ztrmm_kernel_L2_M4_END: subs counterI, counterI, #1 bgt ztrmm_kernel_L2_M4_20 ztrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 ble ztrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 ble ztrmm_kernel_L2_M1_BEGIN ztrmm_kernel_L2_M2_20: INIT2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pB, pB, temp lsl temp, tempOffset, #5 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 ble ztrmm_kernel_L2_M2_40 ztrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB subs counterL, counterL, #1 bgt ztrmm_kernel_L2_M2_22 ztrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ztrmm_kernel_L2_M2_100 ztrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 bgt ztrmm_kernel_L2_M2_42 ztrmm_kernel_L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #5 add pA, pA, temp lsl temp, tempK, #5 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif ztrmm_kernel_L2_M2_END: ztrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble ztrmm_kernel_L2_END ztrmm_kernel_L2_M1_20: INIT1x2 #if (defined(LEFT) && defined(TRANSA)) || 
(!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #5 add pB, pB, temp lsl temp, tempOffset, #4 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #2 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 ble ztrmm_kernel_L2_M1_40 ztrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB subs counterL, counterL, #1 bgt ztrmm_kernel_L2_M1_22 ztrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ztrmm_kernel_L2_M1_100 ztrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 bgt ztrmm_kernel_L2_M1_42 ztrmm_kernel_L2_M1_100: SAVE1x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #1 #else sub tempK, tempK, #2 #endif lsl temp, tempK, #4 add pA, pA, temp lsl temp, tempK, #5 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #1 #endif ztrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 2 * 8 * 2 /******************************************************************************/ ztrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 ble ztrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next #if defined(LEFT) mov tempOffset, offset #endif mov pA, origPA // pA = A ztrmm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble ztrmm_kernel_L1_M2_BEGIN ztrmm_kernel_L1_M4_20: INIT4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp lsl temp, tempOffset, #6 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #4 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble ztrmm_kernel_L1_M4_40 .align 5 ztrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB subs counterL, counterL, #1 bgt ztrmm_kernel_L1_M4_22 ztrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ztrmm_kernel_L1_M4_100 ztrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 bgt ztrmm_kernel_L1_M4_42 ztrmm_kernel_L1_M4_100: SAVE4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #4 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #6 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #4 #endif ztrmm_kernel_L1_M4_END: subs counterI, counterI, #1 bgt ztrmm_kernel_L1_M4_20 ztrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 ble ztrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 ble ztrmm_kernel_L1_M1_BEGIN ztrmm_kernel_L1_M2_20: INIT2x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp lsl temp, tempOffset, #5 add pA, pA, temp #endif #if 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #2 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble ztrmm_kernel_L1_M2_40 ztrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB subs counterL, counterL, #1 bgt ztrmm_kernel_L1_M2_22 ztrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ztrmm_kernel_L1_M2_100 ztrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 bgt ztrmm_kernel_L1_M2_42 ztrmm_kernel_L1_M2_100: SAVE2x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub tempK, origK, tempOffset #if defined(LEFT) sub tempK, tempK, #2 #else sub tempK, tempK, #1 #endif lsl temp, tempK, #5 add pA, pA, temp lsl temp, tempK, #4 add pB, pB, temp #endif #if defined(LEFT) add tempOffset, tempOffset, #2 #endif ztrmm_kernel_L1_M2_END: ztrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 ble ztrmm_kernel_L1_END ztrmm_kernel_L1_M1_20: INIT1x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB #else mov pB, origPB lsl temp, tempOffset, #4 add pB, pB, temp lsl temp, tempOffset, #4 add pA, pA, temp #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub tempK, origK, tempOffset #elif defined(LEFT) add tempK, tempOffset, #1 #else add tempK, tempOffset, #1 #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble ztrmm_kernel_L1_M1_40 ztrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB subs counterL, counterL, #1 bgt ztrmm_kernel_L1_M1_22 ztrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 ble ztrmm_kernel_L1_M1_100 ztrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 bgt ztrmm_kernel_L1_M1_42 ztrmm_kernel_L1_M1_100: SAVE1x1 ztrmm_kernel_L1_END: ztrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] ldp d12, d13, [sp, #(2 * 16)] ldp d14, d15, [sp, #(3 * 16)] ldp d16, d17, [sp, #(4 * 16)] ldp x18, x19, [sp, #(5 * 16)] ldp x20, x21, [sp, #(6 * 16)] ldp x22, x23, [sp, #(7 * 16)] ldp x24, x25, [sp, #(8 * 16)] ldp x26, x27, [sp, #(9 * 16)] ldr x28, [sp, #(10 * 16)] add sp, sp, #(11*16) ret EPILOGUE OpenBLAS-0.2.20/kernel/generic/000077500000000000000000000000001313527062700160405ustar00rootroot00000000000000OpenBLAS-0.2.20/kernel/generic/cabs.c000066400000000000000000000053121313527062700171150ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
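   Note on the generic cabs kernel that follows: it returns fabs(a[0]) + fabs(a[1]), i.e. the CABS1-style absolute value |Re(z)| + |Im(z)| of its complex argument, not the Euclidean modulus.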
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" FLOAT NAME(FLOAT *a){ return fabs(a[0]) + fabs(a[1]); } OpenBLAS-0.2.20/kernel/generic/dot.c000066400000000000000000000052471313527062700170020ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
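   The generic dot kernel that follows unrolls the unit-stride case (inc_x == 1 && inc_y == 1) four-way and falls back to a plain strided loop otherwise; when DSDOT is defined, each product is accumulated in double precision and the result is returned as a double.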
*****************************************************************************/ #include "common.h" #if defined(DSDOT) double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #else FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #endif { BLASLONG i=0; BLASLONG ix=0,iy=0; #if defined(DSDOT) double dot = 0.0 ; #else FLOAT dot = 0.0 ; #endif if ( n < 0 ) return(dot); if ( (inc_x == 1) && (inc_y == 1) ) { int n1 = n & -4; while(i < n1) { #if defined(DSDOT) dot += (double) y[i] * (double) x[i] + (double) y[i+1] * (double) x[i+1] + (double) y[i+2] * (double) x[i+2] + (double) y[i+3] * (double) x[i+3] ; #else dot += y[i] * x[i] + y[i+1] * x[i+1] + y[i+2] * x[i+2] + y[i+3] * x[i+3] ; #endif i+=4 ; } while(i < n) { #if defined(DSDOT) dot += (double) y[i] * (double) x[i] ; #else dot += y[i] * x[i] ; #endif i++ ; } return(dot); } while(i < n) { #if defined(DSDOT) dot += (double) y[iy] * (double) x[ix] ; #else dot += y[iy] * x[ix] ; #endif ix += inc_x ; iy += inc_y ; i++ ; } return(dot); } OpenBLAS-0.2.20/kernel/generic/geadd.c000066400000000000000000000042141313527062700172510ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
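   The generic geadd kernel that follows implements an element-wise update of the form b := alpha*a + beta*b over a rows-by-cols matrix (leading dimensions lda and ldb), with the alpha == 0.0 case handled by a separate branch.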
*****************************************************************************/ #include "common.h" int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT beta, FLOAT *b, BLASLONG ldb) { BLASLONG i; FLOAT *aptr,*bptr; if ( rows <= 0 ) return(0); if ( cols <= 0 ) return(0); aptr = a; bptr = b; if ( alpha == 0.0 ) { for ( i=0; i> 3); if (i > 0){ do { *(c_offset1 + 0) = ZERO; *(c_offset1 + 1) = ZERO; *(c_offset1 + 2) = ZERO; *(c_offset1 + 3) = ZERO; *(c_offset1 + 4) = ZERO; *(c_offset1 + 5) = ZERO; *(c_offset1 + 6) = ZERO; *(c_offset1 + 7) = ZERO; c_offset1 += 8; i --; } while (i > 0); } i = (m & 7); if (i > 0){ do { *c_offset1 = ZERO; c_offset1 ++; i --; } while (i > 0); } j --; } while (j > 0); } else { j = n; do { c_offset1 = c_offset; c_offset += ldc; i = (m >> 3); if (i > 0){ do { ctemp1 = *(c_offset1 + 0); ctemp2 = *(c_offset1 + 1); ctemp3 = *(c_offset1 + 2); ctemp4 = *(c_offset1 + 3); ctemp5 = *(c_offset1 + 4); ctemp6 = *(c_offset1 + 5); ctemp7 = *(c_offset1 + 6); ctemp8 = *(c_offset1 + 7); ctemp1 *= beta; ctemp2 *= beta; ctemp3 *= beta; ctemp4 *= beta; ctemp5 *= beta; ctemp6 *= beta; ctemp7 *= beta; ctemp8 *= beta; *(c_offset1 + 0) = ctemp1; *(c_offset1 + 1) = ctemp2; *(c_offset1 + 2) = ctemp3; *(c_offset1 + 3) = ctemp4; *(c_offset1 + 4) = ctemp5; *(c_offset1 + 5) = ctemp6; *(c_offset1 + 6) = ctemp7; *(c_offset1 + 7) = ctemp8; c_offset1 += 8; i --; } while (i > 0); } i = (m & 7); if (i > 0){ do { ctemp1 = *c_offset1; ctemp1 *= beta; *c_offset1 = ctemp1; c_offset1 ++; i --; } while (i > 0); } j --; } while (j > 0); } return 0; }; OpenBLAS-0.2.20/kernel/generic/gemm_ncopy_1.c000066400000000000000000000070131313527062700205620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
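   The gemm_ncopy_N kernels that follow pack an N-column panel of the column-major matrix A (leading dimension lda) into the contiguous buffer B, interleaving the N columns row by row into the layout expected by the corresponding GEMM microkernels.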
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *a_offset, *a_offset1; FLOAT *b_offset; a_offset = a; b_offset = b; j = n; if (j > 0){ do { a_offset1 = a_offset; a_offset += lda; i = (m >> 3); if (i > 0){ do { *(b_offset + 0) = *(a_offset1 + 0); *(b_offset + 1) = *(a_offset1 + 1); *(b_offset + 2) = *(a_offset1 + 2); *(b_offset + 3) = *(a_offset1 + 3); *(b_offset + 4) = *(a_offset1 + 4); *(b_offset + 5) = *(a_offset1 + 5); *(b_offset + 6) = *(a_offset1 + 6); *(b_offset + 7) = *(a_offset1 + 7); a_offset1 += 8; b_offset += 8; i --; } while (i > 0); } i = (m & 7); if (i > 0){ do { *(b_offset + 0) = *(a_offset1 + 0); a_offset1 ++; b_offset ++; i --; } while (i > 0); } j --; } while (j > 0); } return 0; } OpenBLAS-0.2.20/kernel/generic/gemm_ncopy_16.c000066400000000000000000000265731313527062700206640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
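   gemm_ncopy_16 below walks sixteen columns at a time and two rows per inner iteration, then handles the n & 8, n & 4, n & 2 and n & 1 column remainders with progressively narrower copies and the m & 1 row remainder with a scalar tail.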
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *aoffset; FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; FLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; FLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; FLOAT *boffset; FLOAT ctemp01, ctemp02, ctemp03, ctemp04; FLOAT ctemp05, ctemp06, ctemp07, ctemp08; FLOAT ctemp09, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; FLOAT ctemp17, ctemp18, ctemp19, ctemp20; FLOAT ctemp21, ctemp22, ctemp23, ctemp24; FLOAT ctemp25, ctemp26, ctemp27, ctemp28; FLOAT ctemp29, ctemp30, ctemp31, ctemp32; aoffset = a; boffset = b; j = (n >> 4); if (j > 0){ do{ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset3 = aoffset2 + lda; aoffset4 = aoffset3 + lda; aoffset5 = aoffset4 + lda; aoffset6 = aoffset5 + lda; aoffset7 = aoffset6 + lda; aoffset8 = aoffset7 + lda; aoffset9 = aoffset8 + lda; aoffset10 = aoffset9 + lda; aoffset11 = aoffset10 + lda; aoffset12 = aoffset11 + lda; aoffset13 = aoffset12 + lda; aoffset14 = aoffset13 + lda; aoffset15 = aoffset14 + lda; aoffset16 = aoffset15 + lda; aoffset += 16 * lda; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); ctemp09 = *(aoffset5 + 0); ctemp10 = *(aoffset5 + 1); ctemp11 = *(aoffset6 + 0); ctemp12 = *(aoffset6 + 1); ctemp13 = *(aoffset7 + 0); ctemp14 = *(aoffset7 + 1); ctemp15 = *(aoffset8 + 0); ctemp16 = *(aoffset8 + 1); ctemp17 = *(aoffset9 + 0); ctemp18 = *(aoffset9 + 1); ctemp19 = *(aoffset10 + 0); ctemp20 = *(aoffset10 + 1); ctemp21 = *(aoffset11 + 0); ctemp22 = *(aoffset11 + 1); ctemp23 = *(aoffset12 + 0); ctemp24 = *(aoffset12 + 1); ctemp25 = *(aoffset13 + 0); ctemp26 = *(aoffset13 + 1); ctemp27 = *(aoffset14 + 0); ctemp28 = *(aoffset14 + 1); ctemp29 = *(aoffset15 + 0); ctemp30 = *(aoffset15 + 1); ctemp31 = *(aoffset16 + 0); ctemp32 = *(aoffset16 + 1); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp03; *(boffset + 2) = ctemp05; *(boffset + 3) = ctemp07; *(boffset + 4) = ctemp09; *(boffset + 5) = ctemp11; *(boffset + 6) = ctemp13; *(boffset + 7) = ctemp15; *(boffset + 8) = ctemp17; *(boffset + 9) = ctemp19; *(boffset + 10) = ctemp21; *(boffset + 11) = ctemp23; *(boffset + 12) = ctemp25; *(boffset + 13) = ctemp27; *(boffset + 14) = ctemp29; *(boffset + 15) = ctemp31; *(boffset + 16) = ctemp02; *(boffset + 17) = ctemp04; *(boffset + 18) = ctemp06; *(boffset + 19) = ctemp08; *(boffset + 20) = ctemp10; *(boffset + 21) = ctemp12; *(boffset + 22) = ctemp14; *(boffset + 23) = ctemp16; *(boffset + 24) = ctemp18; *(boffset + 25) = ctemp20; *(boffset + 26) = ctemp22; *(boffset + 27) = ctemp24; *(boffset + 28) = ctemp26; *(boffset + 29) = ctemp28; *(boffset + 30) = ctemp30; *(boffset + 31) = ctemp32; aoffset1 += 2; aoffset2 += 2; aoffset3 += 2; aoffset4 += 2; aoffset5 += 2; aoffset6 += 2; aoffset7 += 2; aoffset8 += 2; aoffset9 += 2; aoffset10 += 2; aoffset11 += 2; aoffset12 += 2; aoffset13 += 2; aoffset14 += 2; aoffset15 += 2; aoffset16 += 2; boffset += 32; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp03 = *(aoffset2 + 0); ctemp05 = *(aoffset3 + 0); ctemp07 = *(aoffset4 + 0); ctemp09 = *(aoffset5 + 0); ctemp11 = *(aoffset6 + 0); ctemp13 = *(aoffset7 + 0); ctemp15 = *(aoffset8 + 0); 
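/* m is odd here: the tail gathers one element from each of the sixteen column pointers and stores them contiguously into boffset */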
ctemp17 = *(aoffset9 + 0); ctemp19 = *(aoffset10 + 0); ctemp21 = *(aoffset11 + 0); ctemp23 = *(aoffset12 + 0); ctemp25 = *(aoffset13 + 0); ctemp27 = *(aoffset14 + 0); ctemp29 = *(aoffset15 + 0); ctemp31 = *(aoffset16 + 0); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp03; *(boffset + 2) = ctemp05; *(boffset + 3) = ctemp07; *(boffset + 4) = ctemp09; *(boffset + 5) = ctemp11; *(boffset + 6) = ctemp13; *(boffset + 7) = ctemp15; *(boffset + 8) = ctemp17; *(boffset + 9) = ctemp19; *(boffset + 10) = ctemp21; *(boffset + 11) = ctemp23; *(boffset + 12) = ctemp25; *(boffset + 13) = ctemp27; *(boffset + 14) = ctemp29; *(boffset + 15) = ctemp31; boffset += 16; } j--; }while(j > 0); } /* end of if(j > 0) */ if (n & 8){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset3 = aoffset2 + lda; aoffset4 = aoffset3 + lda; aoffset5 = aoffset4 + lda; aoffset6 = aoffset5 + lda; aoffset7 = aoffset6 + lda; aoffset8 = aoffset7 + lda; aoffset += 8 * lda; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); ctemp09 = *(aoffset5 + 0); ctemp10 = *(aoffset5 + 1); ctemp11 = *(aoffset6 + 0); ctemp12 = *(aoffset6 + 1); ctemp13 = *(aoffset7 + 0); ctemp14 = *(aoffset7 + 1); ctemp15 = *(aoffset8 + 0); ctemp16 = *(aoffset8 + 1); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp03; *(boffset + 2) = ctemp05; *(boffset + 3) = ctemp07; *(boffset + 4) = ctemp09; *(boffset + 5) = ctemp11; *(boffset + 6) = ctemp13; *(boffset + 7) = ctemp15; *(boffset + 8) = ctemp02; *(boffset + 9) = ctemp04; *(boffset + 10) = ctemp06; *(boffset + 11) = ctemp08; *(boffset + 12) = ctemp10; *(boffset + 13) = ctemp12; *(boffset + 14) = ctemp14; *(boffset + 15) = ctemp16; aoffset1 += 2; aoffset2 += 2; aoffset3 += 2; aoffset4 += 2; aoffset5 += 2; aoffset6 += 2; aoffset7 += 2; aoffset8 += 2; boffset += 16; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp03 = *(aoffset2 + 0); ctemp05 = *(aoffset3 + 0); ctemp07 = *(aoffset4 + 0); ctemp09 = *(aoffset5 + 0); ctemp11 = *(aoffset6 + 0); ctemp13 = *(aoffset7 + 0); ctemp15 = *(aoffset8 + 0); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp03; *(boffset + 2) = ctemp05; *(boffset + 3) = ctemp07; *(boffset + 4) = ctemp09; *(boffset + 5) = ctemp11; *(boffset + 6) = ctemp13; *(boffset + 7) = ctemp15; boffset += 8; } } if (n & 4){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset3 = aoffset2 + lda; aoffset4 = aoffset3 + lda; aoffset += 4 * lda; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp03; *(boffset + 2) = ctemp05; *(boffset + 3) = ctemp07; *(boffset + 4) = ctemp02; *(boffset + 5) = ctemp04; *(boffset + 6) = ctemp06; *(boffset + 7) = ctemp08; aoffset1 += 2; aoffset2 += 2; aoffset3 += 2; aoffset4 += 2; boffset += 8; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp03 = *(aoffset2 + 0); ctemp05 = *(aoffset3 + 0); ctemp07 = *(aoffset4 + 0); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp03; *(boffset + 2) = ctemp05; *(boffset + 3) = ctemp07; boffset += 4; } } if (n & 2){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset += 2 * lda; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 
+ 0); ctemp04 = *(aoffset2 + 1); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp03; *(boffset + 2) = ctemp02; *(boffset + 3) = ctemp04; aoffset1 += 2; aoffset2 += 2; boffset += 4; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp03 = *(aoffset2 + 0); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp03; boffset += 2; } } if (n & 1){ aoffset1 = aoffset; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; aoffset1 += 2; boffset += 2; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); *(boffset + 0) = ctemp01; boffset += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/gemm_ncopy_2.c000066400000000000000000000103071313527062700205630ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *a_offset, *a_offset1, *a_offset2; FLOAT *b_offset; a_offset = a; b_offset = b; j = (n >> 1); if (j > 0){ do { a_offset1 = a_offset; a_offset2 = a_offset + lda; a_offset += 2 * lda; i = (m >> 2); if (i > 0){ do { *(b_offset + 0) = *(a_offset1 + 0); *(b_offset + 1) = *(a_offset2 + 0); *(b_offset + 2) = *(a_offset1 + 1); *(b_offset + 3) = *(a_offset2 + 1); *(b_offset + 4) = *(a_offset1 + 2); *(b_offset + 5) = *(a_offset2 + 2); *(b_offset + 6) = *(a_offset1 + 3); *(b_offset + 7) = *(a_offset2 + 3); a_offset1 += 4; a_offset2 += 4; b_offset += 8; i --; } while (i > 0); } i = (m & 3); if (i > 0){ do { *(b_offset + 0) = *(a_offset1 + 0); *(b_offset + 1) = *(a_offset2 + 0); a_offset1 ++; a_offset2 ++; b_offset += 2; i --; } while (i > 0); } j --; } while (j > 0); } if (n & 1){ i = (m >> 3); if (i > 0){ do { *(b_offset + 0) = *(a_offset + 0); *(b_offset + 1) = *(a_offset + 1); *(b_offset + 2) = *(a_offset + 2); *(b_offset + 3) = *(a_offset + 3); *(b_offset + 4) = *(a_offset + 4); *(b_offset + 5) = *(a_offset + 5); *(b_offset + 6) = *(a_offset + 6); *(b_offset + 7) = *(a_offset + 7); a_offset += 8; b_offset += 8; i --; } while (i > 0); } i = (m & 7); if (i > 0){ do { *(b_offset + 0) = *(a_offset + 0); a_offset ++; b_offset ++; i --; } while (i > 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/gemm_ncopy_4.c000066400000000000000000000145101313527062700205650ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; FLOAT *b_offset; FLOAT ctemp1, ctemp2, ctemp3, ctemp4; FLOAT ctemp5, ctemp6, ctemp7, ctemp8; FLOAT ctemp9, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; a_offset = a; b_offset = b; j = (n >> 2); if (j > 0){ do{ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset3 = a_offset2 + lda; a_offset4 = a_offset3 + lda; a_offset += 4 * lda; i = (m >> 2); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); ctemp5 = *(a_offset2 + 0); ctemp6 = *(a_offset2 + 1); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); ctemp9 = *(a_offset3 + 0); ctemp10 = *(a_offset3 + 1); ctemp11 = *(a_offset3 + 2); ctemp12 = *(a_offset3 + 3); ctemp13 = *(a_offset4 + 0); ctemp14 = *(a_offset4 + 1); ctemp15 = *(a_offset4 + 2); ctemp16 = *(a_offset4 + 3); *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp5; *(b_offset + 2) = ctemp9; *(b_offset + 3) = ctemp13; *(b_offset + 4) = ctemp2; *(b_offset + 5) = ctemp6; *(b_offset + 6) = ctemp10; *(b_offset + 7) = ctemp14; *(b_offset + 8) = ctemp3; *(b_offset + 9) = ctemp7; *(b_offset + 10) = ctemp11; *(b_offset + 11) = ctemp15; *(b_offset + 12) = ctemp4; *(b_offset + 13) = ctemp8; *(b_offset + 14) = ctemp12; *(b_offset + 15) = ctemp16; a_offset1 += 4; a_offset2 += 4; a_offset3 += 4; a_offset4 += 4; b_offset += 16; i --; }while(i > 0); } i = (m & 3); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp5 = *(a_offset2 + 0); ctemp9 = *(a_offset3 + 0); ctemp13 = *(a_offset4 + 0); *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp5; *(b_offset + 2) = ctemp9; *(b_offset + 3) = ctemp13; a_offset1 ++; a_offset2 ++; a_offset3 ++; a_offset4 ++; b_offset += 4; i --; }while(i > 0); } j--; }while(j > 0); } /* end of if(j > 0) */ if (n & 2){ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; i = (m >> 2); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); ctemp5 = *(a_offset2 + 0); ctemp6 = *(a_offset2 + 1); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp5; *(b_offset + 2) = ctemp2; *(b_offset + 3) = ctemp6; *(b_offset + 4) = ctemp3; *(b_offset + 5) = ctemp7; *(b_offset + 6) = ctemp4; *(b_offset + 7) = ctemp8; a_offset1 += 4; a_offset2 += 4; b_offset += 8; i --; }while(i > 0); } i = (m & 3); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp5 = *(a_offset2 + 0); *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp5; a_offset1 ++; a_offset2 ++; b_offset += 2; i --; }while(i > 0); } } /* end of if(j > 0) */ if (n & 1){ a_offset1 = a_offset; i = (m >> 2); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp2; *(b_offset + 2) = ctemp3; *(b_offset + 3) = ctemp4; a_offset1 += 4; b_offset += 4; i --; }while(i > 0); } i = (m & 3); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); *(b_offset + 0) = ctemp1; a_offset1 ++; b_offset += 1; i --; }while(i > 0); } } /* end of if(j > 0) */ return 0; } OpenBLAS-0.2.20/kernel/generic/gemm_ncopy_6.c000066400000000000000000000145101313527062700205670ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 
2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; FLOAT *b_offset; FLOAT ctemp1, ctemp2, ctemp3, ctemp4; FLOAT ctemp5, ctemp6, ctemp7, ctemp8; FLOAT ctemp9, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; a_offset = a; b_offset = b; j = (n >> 2); if (j > 0){ do{ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset3 = a_offset2 + lda; a_offset4 = a_offset3 + lda; a_offset += 4 * lda; i = (m >> 2); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); ctemp5 = *(a_offset2 + 0); ctemp6 = *(a_offset2 + 1); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); ctemp9 = *(a_offset3 + 0); ctemp10 = *(a_offset3 + 1); ctemp11 = *(a_offset3 + 2); ctemp12 = *(a_offset3 + 3); ctemp13 = *(a_offset4 + 0); ctemp14 = *(a_offset4 + 1); ctemp15 = *(a_offset4 + 2); ctemp16 = *(a_offset4 + 3); *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp5; *(b_offset + 2) = ctemp9; *(b_offset + 3) = ctemp13; *(b_offset + 4) = ctemp2; *(b_offset + 5) = ctemp6; *(b_offset + 6) = ctemp10; *(b_offset + 7) = ctemp14; *(b_offset + 8) = ctemp3; *(b_offset + 9) = ctemp7; *(b_offset + 10) = ctemp11; *(b_offset + 11) = ctemp15; *(b_offset + 12) = ctemp4; *(b_offset + 13) = ctemp8; *(b_offset + 14) = ctemp12; *(b_offset + 15) = ctemp16; a_offset1 += 4; a_offset2 += 4; a_offset3 += 4; a_offset4 += 4; b_offset += 16; i --; }while(i > 0); } i = (m & 3); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp5 = *(a_offset2 + 0); ctemp9 = *(a_offset3 + 0); ctemp13 = *(a_offset4 + 0); *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp5; *(b_offset 
+ 2) = ctemp9; *(b_offset + 3) = ctemp13; a_offset1 ++; a_offset2 ++; a_offset3 ++; a_offset4 ++; b_offset += 4; i --; }while(i > 0); } j--; }while(j > 0); } /* end of if(j > 0) */ if (n & 2){ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; i = (m >> 2); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); ctemp5 = *(a_offset2 + 0); ctemp6 = *(a_offset2 + 1); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp5; *(b_offset + 2) = ctemp2; *(b_offset + 3) = ctemp6; *(b_offset + 4) = ctemp3; *(b_offset + 5) = ctemp7; *(b_offset + 6) = ctemp4; *(b_offset + 7) = ctemp8; a_offset1 += 4; a_offset2 += 4; b_offset += 8; i --; }while(i > 0); } i = (m & 3); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp5 = *(a_offset2 + 0); *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp5; a_offset1 ++; a_offset2 ++; b_offset += 2; i --; }while(i > 0); } } /* end of if(j > 0) */ if (n & 1){ a_offset1 = a_offset; i = (m >> 2); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp2; *(b_offset + 2) = ctemp3; *(b_offset + 3) = ctemp4; a_offset1 += 4; b_offset += 4; i --; }while(i > 0); } i = (m & 3); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); *(b_offset + 0) = ctemp1; a_offset1 ++; b_offset += 1; i --; }while(i > 0); } } /* end of if(j > 0) */ return 0; } OpenBLAS-0.2.20/kernel/generic/gemm_ncopy_8.c000066400000000000000000000264121313527062700205750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *aoffset; FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; FLOAT *boffset; FLOAT ctemp01, ctemp02, ctemp03, ctemp04; FLOAT ctemp05, ctemp06, ctemp07, ctemp08; FLOAT ctemp09, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; FLOAT ctemp17, ctemp18, ctemp19, ctemp20; FLOAT ctemp21, ctemp22, ctemp23, ctemp24; FLOAT ctemp25, ctemp26, ctemp27, ctemp28; FLOAT ctemp29, ctemp30, ctemp31, ctemp32; FLOAT ctemp33, ctemp34, ctemp35, ctemp36; FLOAT ctemp37, ctemp38, ctemp39, ctemp40; FLOAT ctemp41, ctemp42, ctemp43, ctemp44; FLOAT ctemp45, ctemp46, ctemp47, ctemp48; FLOAT ctemp49, ctemp50, ctemp51, ctemp52; FLOAT ctemp53, ctemp54, ctemp55, ctemp56; FLOAT ctemp57, ctemp58, ctemp59, ctemp60; FLOAT ctemp61, ctemp62, ctemp63, ctemp64; aoffset = a; boffset = b; j = (n >> 3); if (j > 0){ do{ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset3 = aoffset2 + lda; aoffset4 = aoffset3 + lda; aoffset5 = aoffset4 + lda; aoffset6 = aoffset5 + lda; aoffset7 = aoffset6 + lda; aoffset8 = aoffset7 + lda; aoffset += 8 * lda; i = (m >> 3); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); ctemp12 = *(aoffset2 + 3); ctemp13 = *(aoffset2 + 4); ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); ctemp17 = *(aoffset3 + 0); ctemp18 = *(aoffset3 + 1); ctemp19 = *(aoffset3 + 2); ctemp20 = *(aoffset3 + 3); ctemp21 = *(aoffset3 + 4); ctemp22 = *(aoffset3 + 5); ctemp23 = *(aoffset3 + 6); ctemp24 = *(aoffset3 + 7); ctemp25 = *(aoffset4 + 0); ctemp26 = *(aoffset4 + 1); ctemp27 = *(aoffset4 + 2); ctemp28 = *(aoffset4 + 3); ctemp29 = *(aoffset4 + 4); ctemp30 = *(aoffset4 + 5); ctemp31 = *(aoffset4 + 6); ctemp32 = *(aoffset4 + 7); ctemp33 = *(aoffset5 + 0); ctemp34 = *(aoffset5 + 1); ctemp35 = *(aoffset5 + 2); ctemp36 = *(aoffset5 + 3); ctemp37 = *(aoffset5 + 4); ctemp38 = *(aoffset5 + 5); ctemp39 = *(aoffset5 + 6); ctemp40 = *(aoffset5 + 7); ctemp41 = *(aoffset6 + 0); ctemp42 = *(aoffset6 + 1); ctemp43 = *(aoffset6 + 2); ctemp44 = *(aoffset6 + 3); ctemp45 = *(aoffset6 + 4); ctemp46 = *(aoffset6 + 5); ctemp47 = *(aoffset6 + 6); ctemp48 = *(aoffset6 + 7); ctemp49 = *(aoffset7 + 0); ctemp50 = *(aoffset7 + 1); ctemp51 = *(aoffset7 + 2); ctemp52 = *(aoffset7 + 3); ctemp53 = *(aoffset7 + 4); ctemp54 = *(aoffset7 + 5); ctemp55 = *(aoffset7 + 6); ctemp56 = *(aoffset7 + 7); ctemp57 = *(aoffset8 + 0); ctemp58 = *(aoffset8 + 1); ctemp59 = *(aoffset8 + 2); ctemp60 = *(aoffset8 + 3); ctemp61 = *(aoffset8 + 4); ctemp62 = *(aoffset8 + 5); ctemp63 = *(aoffset8 + 6); ctemp64 = *(aoffset8 + 7); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp09; *(boffset + 2) = ctemp17; *(boffset + 3) = ctemp25; *(boffset + 4) = ctemp33; *(boffset + 5) = ctemp41; *(boffset + 6) = ctemp49; *(boffset + 7) = ctemp57; *(boffset + 8) = ctemp02; *(boffset + 9) = ctemp10; *(boffset + 10) = ctemp18; *(boffset + 11) = ctemp26; *(boffset + 12) = ctemp34; *(boffset + 13) = ctemp42; *(boffset + 14) = ctemp50; *(boffset + 15) = ctemp58; *(boffset + 16) = ctemp03; *(boffset + 17) = ctemp11; *(boffset + 18) = ctemp19; *(boffset + 19) 
= ctemp27; *(boffset + 20) = ctemp35; *(boffset + 21) = ctemp43; *(boffset + 22) = ctemp51; *(boffset + 23) = ctemp59; *(boffset + 24) = ctemp04; *(boffset + 25) = ctemp12; *(boffset + 26) = ctemp20; *(boffset + 27) = ctemp28; *(boffset + 28) = ctemp36; *(boffset + 29) = ctemp44; *(boffset + 30) = ctemp52; *(boffset + 31) = ctemp60; *(boffset + 32) = ctemp05; *(boffset + 33) = ctemp13; *(boffset + 34) = ctemp21; *(boffset + 35) = ctemp29; *(boffset + 36) = ctemp37; *(boffset + 37) = ctemp45; *(boffset + 38) = ctemp53; *(boffset + 39) = ctemp61; *(boffset + 40) = ctemp06; *(boffset + 41) = ctemp14; *(boffset + 42) = ctemp22; *(boffset + 43) = ctemp30; *(boffset + 44) = ctemp38; *(boffset + 45) = ctemp46; *(boffset + 46) = ctemp54; *(boffset + 47) = ctemp62; *(boffset + 48) = ctemp07; *(boffset + 49) = ctemp15; *(boffset + 50) = ctemp23; *(boffset + 51) = ctemp31; *(boffset + 52) = ctemp39; *(boffset + 53) = ctemp47; *(boffset + 54) = ctemp55; *(boffset + 55) = ctemp63; *(boffset + 56) = ctemp08; *(boffset + 57) = ctemp16; *(boffset + 58) = ctemp24; *(boffset + 59) = ctemp32; *(boffset + 60) = ctemp40; *(boffset + 61) = ctemp48; *(boffset + 62) = ctemp56; *(boffset + 63) = ctemp64; aoffset1 += 8; aoffset2 += 8; aoffset3 += 8; aoffset4 += 8; aoffset5 += 8; aoffset6 += 8; aoffset7 += 8; aoffset8 += 8; boffset += 64; i --; }while(i > 0); } i = (m & 7); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp09 = *(aoffset2 + 0); ctemp17 = *(aoffset3 + 0); ctemp25 = *(aoffset4 + 0); ctemp33 = *(aoffset5 + 0); ctemp41 = *(aoffset6 + 0); ctemp49 = *(aoffset7 + 0); ctemp57 = *(aoffset8 + 0); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp09; *(boffset + 2) = ctemp17; *(boffset + 3) = ctemp25; *(boffset + 4) = ctemp33; *(boffset + 5) = ctemp41; *(boffset + 6) = ctemp49; *(boffset + 7) = ctemp57; aoffset1 ++; aoffset2 ++; aoffset3 ++; aoffset4 ++; aoffset5 ++; aoffset6 ++; aoffset7 ++; aoffset8 ++; boffset += 8; i --; }while(i > 0); } j--; }while(j > 0); } /* end of if(j > 0) */ if (n & 4){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset3 = aoffset2 + lda; aoffset4 = aoffset3 + lda; aoffset += 4 * lda; i = (m >> 2); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); ctemp09 = *(aoffset3 + 0); ctemp10 = *(aoffset3 + 1); ctemp11 = *(aoffset3 + 2); ctemp12 = *(aoffset3 + 3); ctemp13 = *(aoffset4 + 0); ctemp14 = *(aoffset4 + 1); ctemp15 = *(aoffset4 + 2); ctemp16 = *(aoffset4 + 3); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp05; *(boffset + 2) = ctemp09; *(boffset + 3) = ctemp13; *(boffset + 4) = ctemp02; *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp10; *(boffset + 7) = ctemp14; *(boffset + 8) = ctemp03; *(boffset + 9) = ctemp07; *(boffset + 10) = ctemp11; *(boffset + 11) = ctemp15; *(boffset + 12) = ctemp04; *(boffset + 13) = ctemp08; *(boffset + 14) = ctemp12; *(boffset + 15) = ctemp16; aoffset1 += 4; aoffset2 += 4; aoffset3 += 4; aoffset4 += 4; boffset += 16; i --; }while(i > 0); } i = (m & 3); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset2 + 0); ctemp03 = *(aoffset3 + 0); ctemp04 = *(aoffset4 + 0); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; aoffset1 ++; aoffset2 ++; aoffset3 ++; aoffset4 ++; boffset += 4; i --; }while(i > 0); } } /* end of if(j > 0) */ if (n & 2){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset += 2 * lda; i = (m >> 
1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp03; *(boffset + 2) = ctemp02; *(boffset + 3) = ctemp04; aoffset1 += 2; aoffset2 += 2; boffset += 4; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset2 + 0); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; aoffset1 ++; aoffset2 ++; boffset += 2; } } /* end of if(j > 0) */ if (n & 1){ aoffset1 = aoffset; i = m; if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); *(boffset + 0) = ctemp01; aoffset1 ++; boffset ++; i --; }while(i > 0); } } /* end of if(j > 0) */ return 0; } OpenBLAS-0.2.20/kernel/generic/gemm_tcopy_1.c000066400000000000000000000062271313527062700205760ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *a_offset, *a_offset1; FLOAT *b_offset, *b_offset1; a_offset = a; b_offset = b; i = m; if (i > 0) { do { a_offset1 = a_offset; a_offset += lda; b_offset1 = b_offset; b_offset ++; j = n; if (j > 0) { do { *(b_offset1 + 0) = *(a_offset1 + 0); a_offset1 ++; b_offset1 += m; j --; } while (j > 0); } i --; } while (i > 0); } return 0; } OpenBLAS-0.2.20/kernel/generic/gemm_tcopy_16.c000066400000000000000000000242421313527062700206610ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *aoffset; FLOAT *aoffset1, *aoffset2; FLOAT *boffset; FLOAT ctemp01, ctemp02, ctemp03, ctemp04; FLOAT ctemp05, ctemp06, ctemp07, ctemp08; FLOAT ctemp09, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; FLOAT ctemp17, ctemp18, ctemp19, ctemp20; FLOAT ctemp21, ctemp22, ctemp23, ctemp24; FLOAT ctemp25, ctemp26, ctemp27, ctemp28; FLOAT ctemp29, ctemp30, ctemp31, ctemp32; aoffset = a; boffset = b; #if 0 fprintf(stderr, "m = %d n = %d\n", m, n); #endif j = (n >> 4); if (j > 0){ do{ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 16; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); ctemp09 = *(aoffset1 + 8); ctemp10 = *(aoffset1 + 9); ctemp11 = *(aoffset1 + 10); ctemp12 = *(aoffset1 + 11); ctemp13 = *(aoffset1 + 12); ctemp14 = *(aoffset1 + 13); ctemp15 = *(aoffset1 + 14); ctemp16 = *(aoffset1 + 15); ctemp17 = *(aoffset2 + 0); ctemp18 = *(aoffset2 + 1); ctemp19 = *(aoffset2 + 2); ctemp20 = *(aoffset2 + 3); ctemp21 = *(aoffset2 + 4); ctemp22 = *(aoffset2 + 5); ctemp23 = *(aoffset2 + 6); ctemp24 = *(aoffset2 + 7); ctemp25 = *(aoffset2 + 8); ctemp26 = *(aoffset2 + 9); ctemp27 = *(aoffset2 + 10); ctemp28 = *(aoffset2 + 11); ctemp29 = *(aoffset2 + 12); ctemp30 = *(aoffset2 + 13); ctemp31 = *(aoffset2 + 14); ctemp32 = *(aoffset2 + 15); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; *(boffset + 4) = ctemp05; *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; *(boffset + 8) = 
ctemp09; *(boffset + 9) = ctemp10; *(boffset + 10) = ctemp11; *(boffset + 11) = ctemp12; *(boffset + 12) = ctemp13; *(boffset + 13) = ctemp14; *(boffset + 14) = ctemp15; *(boffset + 15) = ctemp16; *(boffset + 16) = ctemp17; *(boffset + 17) = ctemp18; *(boffset + 18) = ctemp19; *(boffset + 19) = ctemp20; *(boffset + 20) = ctemp21; *(boffset + 21) = ctemp22; *(boffset + 22) = ctemp23; *(boffset + 23) = ctemp24; *(boffset + 24) = ctemp25; *(boffset + 25) = ctemp26; *(boffset + 26) = ctemp27; *(boffset + 27) = ctemp28; *(boffset + 28) = ctemp29; *(boffset + 29) = ctemp30; *(boffset + 30) = ctemp31; *(boffset + 31) = ctemp32; aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 32; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); ctemp09 = *(aoffset1 + 8); ctemp10 = *(aoffset1 + 9); ctemp11 = *(aoffset1 + 10); ctemp12 = *(aoffset1 + 11); ctemp13 = *(aoffset1 + 12); ctemp14 = *(aoffset1 + 13); ctemp15 = *(aoffset1 + 14); ctemp16 = *(aoffset1 + 15); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; *(boffset + 4) = ctemp05; *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; *(boffset + 8) = ctemp09; *(boffset + 9) = ctemp10; *(boffset + 10) = ctemp11; *(boffset + 11) = ctemp12; *(boffset + 12) = ctemp13; *(boffset + 13) = ctemp14; *(boffset + 14) = ctemp15; *(boffset + 15) = ctemp16; boffset += 16; } j--; }while(j > 0); } /* end of if(j > 0) */ if (n & 8){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 8; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); ctemp12 = *(aoffset2 + 3); ctemp13 = *(aoffset2 + 4); ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; *(boffset + 4) = ctemp05; *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; *(boffset + 8) = ctemp09; *(boffset + 9) = ctemp10; *(boffset + 10) = ctemp11; *(boffset + 11) = ctemp12; *(boffset + 12) = ctemp13; *(boffset + 13) = ctemp14; *(boffset + 14) = ctemp15; *(boffset + 15) = ctemp16; aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 16; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; *(boffset + 4) = ctemp05; *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; boffset += 8; } } if (n & 4){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 4; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; 
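/* (n & 4) tail of the 16-wide transposed copy: aoffset1 and aoffset2 walk
   two consecutive lda-strided slices of a, and each pass reads four
   contiguous elements from each slice (ctemp01..04 and ctemp05..08) and
   appends them to b as a single 8-element group, advancing boffset by 8.
   Note that this kernel fills b strictly sequentially, so the 16-, 8-,
   4-, 2- and 1-wide sections simply follow one another in the packed
   buffer. */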
*(boffset + 4) = ctemp05; *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 8; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; boffset += 4; } } if (n & 2){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 2; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 4; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; boffset += 2; } } if (n & 1){ aoffset1 = aoffset; aoffset2 = aoffset + lda; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset2 + 0); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 2; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); *(boffset + 0) = ctemp01; boffset += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/gemm_tcopy_2.c000066400000000000000000000074451313527062700206020ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *a_offset, *a_offset1, *a_offset2; FLOAT *b_offset, *b_offset1, *b_offset2; a_offset = a; b_offset = b; b_offset2 = b + m * (n & ~1); i = (m >> 1); if (i > 0) { do { a_offset1 = a_offset; a_offset2 = a_offset + lda; a_offset += 2 * lda; b_offset1 = b_offset; b_offset += 4; j = (n >> 1); if (j > 0){ do { *(b_offset1 + 0) = *(a_offset1 + 0); *(b_offset1 + 1) = *(a_offset1 + 1); *(b_offset1 + 2) = *(a_offset2 + 0); *(b_offset1 + 3) = *(a_offset2 + 1); a_offset1 += 2; a_offset2 += 2; b_offset1 += m * 2; j--; } while (j > 0); } if (n & 1){ *(b_offset2 + 0) = *(a_offset1 + 0); *(b_offset2 + 1) = *(a_offset2 + 0); b_offset2 += 2; } i --; } while (i > 0); } if (m & 1) { j = (n >> 1); if (j > 0){ do { *(b_offset + 0) = *(a_offset + 0); *(b_offset + 1) = *(a_offset + 1); a_offset += 2; b_offset += m * 2; j--; } while (j > 0); } if (n & 1){ *(b_offset2 + 0) = *(a_offset + 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/gemm_tcopy_4.c000066400000000000000000000166061313527062700206030ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; FLOAT ctemp1, ctemp2, ctemp3, ctemp4; FLOAT ctemp5, ctemp6, ctemp7, ctemp8; FLOAT ctemp9, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; a_offset = a; b_offset = b; b_offset2 = b + m * (n & ~3); b_offset3 = b + m * (n & ~1); j = (m >> 2); if (j > 0){ do{ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset3 = a_offset2 + lda; a_offset4 = a_offset3 + lda; a_offset += 4 * lda; b_offset1 = b_offset; b_offset += 16; i = (n >> 2); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); ctemp5 = *(a_offset2 + 0); ctemp6 = *(a_offset2 + 1); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); ctemp9 = *(a_offset3 + 0); ctemp10 = *(a_offset3 + 1); ctemp11 = *(a_offset3 + 2); ctemp12 = *(a_offset3 + 3); ctemp13 = *(a_offset4 + 0); ctemp14 = *(a_offset4 + 1); ctemp15 = *(a_offset4 + 2); ctemp16 = *(a_offset4 + 3); a_offset1 += 4; a_offset2 += 4; a_offset3 += 4; a_offset4 += 4; *(b_offset1 + 0) = ctemp1; *(b_offset1 + 1) = ctemp2; *(b_offset1 + 2) = ctemp3; *(b_offset1 + 3) = ctemp4; *(b_offset1 + 4) = ctemp5; *(b_offset1 + 5) = ctemp6; *(b_offset1 + 6) = ctemp7; *(b_offset1 + 7) = ctemp8; *(b_offset1 + 8) = ctemp9; *(b_offset1 + 9) = ctemp10; *(b_offset1 + 10) = ctemp11; *(b_offset1 + 11) = ctemp12; *(b_offset1 + 12) = ctemp13; *(b_offset1 + 13) = ctemp14; *(b_offset1 + 14) = ctemp15; *(b_offset1 + 15) = ctemp16; b_offset1 += m * 4; i --; }while(i > 0); } if (n & 2) { ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset2 + 0); ctemp4 = *(a_offset2 + 1); ctemp5 = *(a_offset3 + 0); ctemp6 = *(a_offset3 + 1); ctemp7 = *(a_offset4 + 0); ctemp8 = *(a_offset4 + 1); a_offset1 += 2; a_offset2 += 2; a_offset3 += 2; a_offset4 += 2; *(b_offset2 + 0) = ctemp1; *(b_offset2 + 1) = ctemp2; *(b_offset2 + 2) = ctemp3; *(b_offset2 + 3) = ctemp4; *(b_offset2 + 4) = ctemp5; *(b_offset2 + 5) = ctemp6; *(b_offset2 + 6) = ctemp7; *(b_offset2 + 7) = ctemp8; b_offset2 += 8; } if (n & 1) { ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset2 + 0); ctemp3 = *(a_offset3 + 0); ctemp4 = *(a_offset4 + 0); *(b_offset3 + 0) = ctemp1; *(b_offset3 + 1) = ctemp2; *(b_offset3 + 2) = ctemp3; *(b_offset3 + 3) = ctemp4; b_offset3 += 4; } j--; }while(j > 0); } if (m & 2){ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; b_offset1 = b_offset; b_offset += 8; i = (n >> 2); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); ctemp5 = *(a_offset2 + 0); ctemp6 = *(a_offset2 + 1); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); a_offset1 += 4; a_offset2 += 4; *(b_offset1 + 0) = ctemp1; *(b_offset1 + 1) = ctemp2; *(b_offset1 + 2) = ctemp3; *(b_offset1 + 3) = ctemp4; *(b_offset1 + 4) = ctemp5; *(b_offset1 + 5) = ctemp6; *(b_offset1 + 6) = ctemp7; *(b_offset1 + 7) = ctemp8; b_offset1 += m * 4; i --; }while(i > 0); } if (n & 2) { ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset2 + 0); ctemp4 = *(a_offset2 + 1); a_offset1 += 2; a_offset2 += 2; *(b_offset2 + 0) = ctemp1; *(b_offset2 + 1) = ctemp2; *(b_offset2 + 2) = ctemp3; *(b_offset2 + 3) = ctemp4; b_offset2 += 4; } if (n & 1) { ctemp1 = *(a_offset1 + 0); ctemp2 = 
*(a_offset2 + 0); *(b_offset3 + 0) = ctemp1; *(b_offset3 + 1) = ctemp2; b_offset3 += 2; } } if (m & 1){ a_offset1 = a_offset; b_offset1 = b_offset; i = (n >> 2); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); a_offset1 += 4; *(b_offset1 + 0) = ctemp1; *(b_offset1 + 1) = ctemp2; *(b_offset1 + 2) = ctemp3; *(b_offset1 + 3) = ctemp4; b_offset1 += 4 * m; i --; }while(i > 0); } if (n & 2) { ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); a_offset1 += 2; *(b_offset2 + 0) = ctemp1; *(b_offset2 + 1) = ctemp2; } if (n & 1) { ctemp1 = *(a_offset1 + 0); *(b_offset3 + 0) = ctemp1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/gemm_tcopy_6.c000066400000000000000000000166061313527062700206050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; FLOAT ctemp1, ctemp2, ctemp3, ctemp4; FLOAT ctemp5, ctemp6, ctemp7, ctemp8; FLOAT ctemp9, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; a_offset = a; b_offset = b; b_offset2 = b + m * (n & ~3); b_offset3 = b + m * (n & ~1); j = (m >> 2); if (j > 0){ do{ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset3 = a_offset2 + lda; a_offset4 = a_offset3 + lda; a_offset += 4 * lda; b_offset1 = b_offset; b_offset += 16; i = (n >> 2); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); ctemp5 = *(a_offset2 + 0); ctemp6 = *(a_offset2 + 1); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); ctemp9 = *(a_offset3 + 0); ctemp10 = *(a_offset3 + 1); ctemp11 = *(a_offset3 + 2); ctemp12 = *(a_offset3 + 3); ctemp13 = *(a_offset4 + 0); ctemp14 = *(a_offset4 + 1); ctemp15 = *(a_offset4 + 2); ctemp16 = *(a_offset4 + 3); a_offset1 += 4; a_offset2 += 4; a_offset3 += 4; a_offset4 += 4; *(b_offset1 + 0) = ctemp1; *(b_offset1 + 1) = ctemp2; *(b_offset1 + 2) = ctemp3; *(b_offset1 + 3) = ctemp4; *(b_offset1 + 4) = ctemp5; *(b_offset1 + 5) = ctemp6; *(b_offset1 + 6) = ctemp7; *(b_offset1 + 7) = ctemp8; *(b_offset1 + 8) = ctemp9; *(b_offset1 + 9) = ctemp10; *(b_offset1 + 10) = ctemp11; *(b_offset1 + 11) = ctemp12; *(b_offset1 + 12) = ctemp13; *(b_offset1 + 13) = ctemp14; *(b_offset1 + 14) = ctemp15; *(b_offset1 + 15) = ctemp16; b_offset1 += m * 4; i --; }while(i > 0); } if (n & 2) { ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset2 + 0); ctemp4 = *(a_offset2 + 1); ctemp5 = *(a_offset3 + 0); ctemp6 = *(a_offset3 + 1); ctemp7 = *(a_offset4 + 0); ctemp8 = *(a_offset4 + 1); a_offset1 += 2; a_offset2 += 2; a_offset3 += 2; a_offset4 += 2; *(b_offset2 + 0) = ctemp1; *(b_offset2 + 1) = ctemp2; *(b_offset2 + 2) = ctemp3; *(b_offset2 + 3) = ctemp4; *(b_offset2 + 4) = ctemp5; *(b_offset2 + 5) = ctemp6; *(b_offset2 + 6) = ctemp7; *(b_offset2 + 7) = ctemp8; b_offset2 += 8; } if (n & 1) { ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset2 + 0); ctemp3 = *(a_offset3 + 0); ctemp4 = *(a_offset4 + 0); *(b_offset3 + 0) = ctemp1; *(b_offset3 + 1) = ctemp2; *(b_offset3 + 2) = ctemp3; *(b_offset3 + 3) = ctemp4; b_offset3 += 4; } j--; }while(j > 0); } if (m & 2){ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; b_offset1 = b_offset; b_offset += 8; i = (n >> 2); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); ctemp5 = *(a_offset2 + 0); ctemp6 = *(a_offset2 + 1); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); a_offset1 += 4; a_offset2 += 4; *(b_offset1 + 0) = ctemp1; *(b_offset1 + 1) = ctemp2; *(b_offset1 + 2) = ctemp3; *(b_offset1 + 3) = ctemp4; *(b_offset1 + 4) = ctemp5; *(b_offset1 + 5) = ctemp6; *(b_offset1 + 6) = ctemp7; *(b_offset1 + 7) = ctemp8; b_offset1 += m * 4; i --; }while(i > 0); } if (n & 2) { ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset2 + 0); ctemp4 = *(a_offset2 + 1); a_offset1 += 2; a_offset2 += 2; *(b_offset2 + 0) = ctemp1; *(b_offset2 + 1) = ctemp2; *(b_offset2 + 2) = ctemp3; *(b_offset2 + 3) = ctemp4; b_offset2 += 4; } if (n & 1) { ctemp1 = *(a_offset1 + 0); ctemp2 = 
*(a_offset2 + 0); *(b_offset3 + 0) = ctemp1; *(b_offset3 + 1) = ctemp2; b_offset3 += 2; } } if (m & 1){ a_offset1 = a_offset; b_offset1 = b_offset; i = (n >> 2); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); a_offset1 += 4; *(b_offset1 + 0) = ctemp1; *(b_offset1 + 1) = ctemp2; *(b_offset1 + 2) = ctemp3; *(b_offset1 + 3) = ctemp4; b_offset1 += 4 * m; i --; }while(i > 0); } if (n & 2) { ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); a_offset1 += 2; *(b_offset2 + 0) = ctemp1; *(b_offset2 + 1) = ctemp2; } if (n & 1) { ctemp1 = *(a_offset1 + 0); *(b_offset3 + 0) = ctemp1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/gemm_tcopy_8.c000066400000000000000000000476731313527062700206170ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *aoffset; FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; FLOAT ctemp01, ctemp02, ctemp03, ctemp04; FLOAT ctemp05, ctemp06, ctemp07, ctemp08; FLOAT ctemp09, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; FLOAT ctemp17, ctemp18, ctemp19, ctemp20; FLOAT ctemp21, ctemp22, ctemp23, ctemp24; FLOAT ctemp25, ctemp26, ctemp27, ctemp28; FLOAT ctemp29, ctemp30, ctemp31, ctemp32; FLOAT ctemp33, ctemp34, ctemp35, ctemp36; FLOAT ctemp37, ctemp38, ctemp39, ctemp40; FLOAT ctemp41, ctemp42, ctemp43, ctemp44; FLOAT ctemp45, ctemp46, ctemp47, ctemp48; FLOAT ctemp49, ctemp50, ctemp51, ctemp52; FLOAT ctemp53, ctemp54, ctemp55, ctemp56; FLOAT ctemp57, ctemp58, ctemp59, ctemp60; FLOAT ctemp61, ctemp62, ctemp63, ctemp64; aoffset = a; boffset = b; #if 0 fprintf(stderr, "M = %d N = %d\n", m, n); #endif boffset2 = b + m * (n & ~7); boffset3 = b + m * (n & ~3); boffset4 = b + m * (n & ~1); j = (m >> 3); if (j > 0){ do{ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset3 = aoffset2 + lda; aoffset4 = aoffset3 + lda; aoffset5 = aoffset4 + lda; aoffset6 = aoffset5 + lda; aoffset7 = aoffset6 + lda; aoffset8 = aoffset7 + lda; aoffset += 8 * lda; boffset1 = boffset; boffset += 64; i = (n >> 3); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); aoffset1 += 8; ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); ctemp12 = *(aoffset2 + 3); ctemp13 = *(aoffset2 + 4); ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); aoffset2 += 8; ctemp17 = *(aoffset3 + 0); ctemp18 = *(aoffset3 + 1); ctemp19 = *(aoffset3 + 2); ctemp20 = *(aoffset3 + 3); ctemp21 = *(aoffset3 + 4); ctemp22 = *(aoffset3 + 5); ctemp23 = *(aoffset3 + 6); ctemp24 = *(aoffset3 + 7); aoffset3 += 8; ctemp25 = *(aoffset4 + 0); ctemp26 = *(aoffset4 + 1); ctemp27 = *(aoffset4 + 2); ctemp28 = *(aoffset4 + 3); ctemp29 = *(aoffset4 + 4); ctemp30 = *(aoffset4 + 5); ctemp31 = *(aoffset4 + 6); ctemp32 = *(aoffset4 + 7); aoffset4 += 8; ctemp33 = *(aoffset5 + 0); ctemp34 = *(aoffset5 + 1); ctemp35 = *(aoffset5 + 2); ctemp36 = *(aoffset5 + 3); ctemp37 = *(aoffset5 + 4); ctemp38 = *(aoffset5 + 5); ctemp39 = *(aoffset5 + 6); ctemp40 = *(aoffset5 + 7); aoffset5 += 8; ctemp41 = *(aoffset6 + 0); ctemp42 = *(aoffset6 + 1); ctemp43 = *(aoffset6 + 2); ctemp44 = *(aoffset6 + 3); ctemp45 = *(aoffset6 + 4); ctemp46 = *(aoffset6 + 5); ctemp47 = *(aoffset6 + 6); ctemp48 = *(aoffset6 + 7); aoffset6 += 8; ctemp49 = *(aoffset7 + 0); ctemp50 = *(aoffset7 + 1); ctemp51 = *(aoffset7 + 2); ctemp52 = *(aoffset7 + 3); ctemp53 = *(aoffset7 + 4); ctemp54 = *(aoffset7 + 5); ctemp55 = *(aoffset7 + 6); ctemp56 = *(aoffset7 + 7); aoffset7 += 8; ctemp57 = *(aoffset8 + 0); ctemp58 = *(aoffset8 + 1); ctemp59 = *(aoffset8 + 2); ctemp60 = *(aoffset8 + 3); ctemp61 = *(aoffset8 + 4); ctemp62 = *(aoffset8 + 5); ctemp63 = *(aoffset8 + 6); ctemp64 = *(aoffset8 + 7); aoffset8 += 8; *(boffset1 + 0) = ctemp01; *(boffset1 + 1) = ctemp02; *(boffset1 + 2) = ctemp03; *(boffset1 + 3) = ctemp04; *(boffset1 + 4) = ctemp05; *(boffset1 + 5) = ctemp06; *(boffset1 + 6) 
= ctemp07; *(boffset1 + 7) = ctemp08; *(boffset1 + 8) = ctemp09; *(boffset1 + 9) = ctemp10; *(boffset1 + 10) = ctemp11; *(boffset1 + 11) = ctemp12; *(boffset1 + 12) = ctemp13; *(boffset1 + 13) = ctemp14; *(boffset1 + 14) = ctemp15; *(boffset1 + 15) = ctemp16; *(boffset1 + 16) = ctemp17; *(boffset1 + 17) = ctemp18; *(boffset1 + 18) = ctemp19; *(boffset1 + 19) = ctemp20; *(boffset1 + 20) = ctemp21; *(boffset1 + 21) = ctemp22; *(boffset1 + 22) = ctemp23; *(boffset1 + 23) = ctemp24; *(boffset1 + 24) = ctemp25; *(boffset1 + 25) = ctemp26; *(boffset1 + 26) = ctemp27; *(boffset1 + 27) = ctemp28; *(boffset1 + 28) = ctemp29; *(boffset1 + 29) = ctemp30; *(boffset1 + 30) = ctemp31; *(boffset1 + 31) = ctemp32; *(boffset1 + 32) = ctemp33; *(boffset1 + 33) = ctemp34; *(boffset1 + 34) = ctemp35; *(boffset1 + 35) = ctemp36; *(boffset1 + 36) = ctemp37; *(boffset1 + 37) = ctemp38; *(boffset1 + 38) = ctemp39; *(boffset1 + 39) = ctemp40; *(boffset1 + 40) = ctemp41; *(boffset1 + 41) = ctemp42; *(boffset1 + 42) = ctemp43; *(boffset1 + 43) = ctemp44; *(boffset1 + 44) = ctemp45; *(boffset1 + 45) = ctemp46; *(boffset1 + 46) = ctemp47; *(boffset1 + 47) = ctemp48; *(boffset1 + 48) = ctemp49; *(boffset1 + 49) = ctemp50; *(boffset1 + 50) = ctemp51; *(boffset1 + 51) = ctemp52; *(boffset1 + 52) = ctemp53; *(boffset1 + 53) = ctemp54; *(boffset1 + 54) = ctemp55; *(boffset1 + 55) = ctemp56; *(boffset1 + 56) = ctemp57; *(boffset1 + 57) = ctemp58; *(boffset1 + 58) = ctemp59; *(boffset1 + 59) = ctemp60; *(boffset1 + 60) = ctemp61; *(boffset1 + 61) = ctemp62; *(boffset1 + 62) = ctemp63; *(boffset1 + 63) = ctemp64; boffset1 += m * 8; i --; }while(i > 0); } if (n & 4){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); aoffset1 += 4; ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); aoffset2 += 4; ctemp09 = *(aoffset3 + 0); ctemp10 = *(aoffset3 + 1); ctemp11 = *(aoffset3 + 2); ctemp12 = *(aoffset3 + 3); aoffset3 += 4; ctemp13 = *(aoffset4 + 0); ctemp14 = *(aoffset4 + 1); ctemp15 = *(aoffset4 + 2); ctemp16 = *(aoffset4 + 3); aoffset4 += 4; ctemp17 = *(aoffset5 + 0); ctemp18 = *(aoffset5 + 1); ctemp19 = *(aoffset5 + 2); ctemp20 = *(aoffset5 + 3); aoffset5 += 4; ctemp21 = *(aoffset6 + 0); ctemp22 = *(aoffset6 + 1); ctemp23 = *(aoffset6 + 2); ctemp24 = *(aoffset6 + 3); aoffset6 += 4; ctemp25 = *(aoffset7 + 0); ctemp26 = *(aoffset7 + 1); ctemp27 = *(aoffset7 + 2); ctemp28 = *(aoffset7 + 3); aoffset7 += 4; ctemp29 = *(aoffset8 + 0); ctemp30 = *(aoffset8 + 1); ctemp31 = *(aoffset8 + 2); ctemp32 = *(aoffset8 + 3); aoffset8 += 4; *(boffset2 + 0) = ctemp01; *(boffset2 + 1) = ctemp02; *(boffset2 + 2) = ctemp03; *(boffset2 + 3) = ctemp04; *(boffset2 + 4) = ctemp05; *(boffset2 + 5) = ctemp06; *(boffset2 + 6) = ctemp07; *(boffset2 + 7) = ctemp08; *(boffset2 + 8) = ctemp09; *(boffset2 + 9) = ctemp10; *(boffset2 + 10) = ctemp11; *(boffset2 + 11) = ctemp12; *(boffset2 + 12) = ctemp13; *(boffset2 + 13) = ctemp14; *(boffset2 + 14) = ctemp15; *(boffset2 + 15) = ctemp16; *(boffset2 + 16) = ctemp17; *(boffset2 + 17) = ctemp18; *(boffset2 + 18) = ctemp19; *(boffset2 + 19) = ctemp20; *(boffset2 + 20) = ctemp21; *(boffset2 + 21) = ctemp22; *(boffset2 + 22) = ctemp23; *(boffset2 + 23) = ctemp24; *(boffset2 + 24) = ctemp25; *(boffset2 + 25) = ctemp26; *(boffset2 + 26) = ctemp27; *(boffset2 + 27) = ctemp28; *(boffset2 + 28) = ctemp29; *(boffset2 + 29) = ctemp30; *(boffset2 + 30) = ctemp31; *(boffset2 + 31) = ctemp32; boffset2 += 32; } if (n & 
2){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); aoffset1 += 2; ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); aoffset2 += 2; ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); aoffset3 += 2; ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); aoffset4 += 2; ctemp09 = *(aoffset5 + 0); ctemp10 = *(aoffset5 + 1); aoffset5 += 2; ctemp11 = *(aoffset6 + 0); ctemp12 = *(aoffset6 + 1); aoffset6 += 2; ctemp13 = *(aoffset7 + 0); ctemp14 = *(aoffset7 + 1); aoffset7 += 2; ctemp15 = *(aoffset8 + 0); ctemp16 = *(aoffset8 + 1); aoffset8 += 2; *(boffset3 + 0) = ctemp01; *(boffset3 + 1) = ctemp02; *(boffset3 + 2) = ctemp03; *(boffset3 + 3) = ctemp04; *(boffset3 + 4) = ctemp05; *(boffset3 + 5) = ctemp06; *(boffset3 + 6) = ctemp07; *(boffset3 + 7) = ctemp08; *(boffset3 + 8) = ctemp09; *(boffset3 + 9) = ctemp10; *(boffset3 + 10) = ctemp11; *(boffset3 + 11) = ctemp12; *(boffset3 + 12) = ctemp13; *(boffset3 + 13) = ctemp14; *(boffset3 + 14) = ctemp15; *(boffset3 + 15) = ctemp16; boffset3 += 16; } if (n & 1){ ctemp01 = *(aoffset1 + 0); aoffset1 ++; ctemp02 = *(aoffset2 + 0); aoffset2 ++; ctemp03 = *(aoffset3 + 0); aoffset3 ++; ctemp04 = *(aoffset4 + 0); aoffset4 ++; ctemp05 = *(aoffset5 + 0); aoffset5 ++; ctemp06 = *(aoffset6 + 0); aoffset6 ++; ctemp07 = *(aoffset7 + 0); aoffset7 ++; ctemp08 = *(aoffset8 + 0); aoffset8 ++; *(boffset4 + 0) = ctemp01; *(boffset4 + 1) = ctemp02; *(boffset4 + 2) = ctemp03; *(boffset4 + 3) = ctemp04; *(boffset4 + 4) = ctemp05; *(boffset4 + 5) = ctemp06; *(boffset4 + 6) = ctemp07; *(boffset4 + 7) = ctemp08; boffset4 += 8; } j--; }while(j > 0); } if (m & 4){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset3 = aoffset2 + lda; aoffset4 = aoffset3 + lda; aoffset += 4 * lda; boffset1 = boffset; boffset += 32; i = (n >> 3); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); aoffset1 += 8; ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); ctemp12 = *(aoffset2 + 3); ctemp13 = *(aoffset2 + 4); ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); aoffset2 += 8; ctemp17 = *(aoffset3 + 0); ctemp18 = *(aoffset3 + 1); ctemp19 = *(aoffset3 + 2); ctemp20 = *(aoffset3 + 3); ctemp21 = *(aoffset3 + 4); ctemp22 = *(aoffset3 + 5); ctemp23 = *(aoffset3 + 6); ctemp24 = *(aoffset3 + 7); aoffset3 += 8; ctemp25 = *(aoffset4 + 0); ctemp26 = *(aoffset4 + 1); ctemp27 = *(aoffset4 + 2); ctemp28 = *(aoffset4 + 3); ctemp29 = *(aoffset4 + 4); ctemp30 = *(aoffset4 + 5); ctemp31 = *(aoffset4 + 6); ctemp32 = *(aoffset4 + 7); aoffset4 += 8; *(boffset1 + 0) = ctemp01; *(boffset1 + 1) = ctemp02; *(boffset1 + 2) = ctemp03; *(boffset1 + 3) = ctemp04; *(boffset1 + 4) = ctemp05; *(boffset1 + 5) = ctemp06; *(boffset1 + 6) = ctemp07; *(boffset1 + 7) = ctemp08; *(boffset1 + 8) = ctemp09; *(boffset1 + 9) = ctemp10; *(boffset1 + 10) = ctemp11; *(boffset1 + 11) = ctemp12; *(boffset1 + 12) = ctemp13; *(boffset1 + 13) = ctemp14; *(boffset1 + 14) = ctemp15; *(boffset1 + 15) = ctemp16; *(boffset1 + 16) = ctemp17; *(boffset1 + 17) = ctemp18; *(boffset1 + 18) = ctemp19; *(boffset1 + 19) = ctemp20; *(boffset1 + 20) = ctemp21; *(boffset1 + 21) = ctemp22; *(boffset1 + 22) = ctemp23; *(boffset1 + 23) = ctemp24; *(boffset1 + 24) = ctemp25; *(boffset1 + 25) = ctemp26; *(boffset1 + 26) = ctemp27; *(boffset1 + 27) = ctemp28; *(boffset1 + 28) = ctemp29; *(boffset1 + 29) = ctemp30; 
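/* (m & 4) tail of gemm_tcopy_8: the remaining four lda-strided slices of a
   are packed eight elements at a time; each pass gathers a 4x8 block
   (ctemp01..32) into boffset1 and then advances boffset1 by 8*m, so that
   successive 8-wide panels stay 8*m elements apart, matching the layout
   written by the main 8-slice loop above.  The 4-, 2- and 1-wide leftovers
   go to boffset2, boffset3 and boffset4, which were pre-positioned at
   b + m*(n & ~7), b + m*(n & ~3) and b + m*(n & ~1).  As a rough sketch
   (the index names below are illustrative, not part of this file), the
   full-panel part of the kernel is equivalent to:

       for (p = 0; p + 8 <= n; p += 8)
         for (i = 0; i < m; i++)
           for (k = 0; k < 8; k++)
             b[p*m + i*8 + k] = a[i*lda + p + k];
*/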
*(boffset1 + 30) = ctemp31; *(boffset1 + 31) = ctemp32; boffset1 += 8 * m; i --; }while(i > 0); } if (n & 4) { ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); aoffset1 += 4; ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); aoffset2 += 4; ctemp09 = *(aoffset3 + 0); ctemp10 = *(aoffset3 + 1); ctemp11 = *(aoffset3 + 2); ctemp12 = *(aoffset3 + 3); aoffset3 += 4; ctemp13 = *(aoffset4 + 0); ctemp14 = *(aoffset4 + 1); ctemp15 = *(aoffset4 + 2); ctemp16 = *(aoffset4 + 3); aoffset4 += 4; *(boffset2 + 0) = ctemp01; *(boffset2 + 1) = ctemp02; *(boffset2 + 2) = ctemp03; *(boffset2 + 3) = ctemp04; *(boffset2 + 4) = ctemp05; *(boffset2 + 5) = ctemp06; *(boffset2 + 6) = ctemp07; *(boffset2 + 7) = ctemp08; *(boffset2 + 8) = ctemp09; *(boffset2 + 9) = ctemp10; *(boffset2 + 10) = ctemp11; *(boffset2 + 11) = ctemp12; *(boffset2 + 12) = ctemp13; *(boffset2 + 13) = ctemp14; *(boffset2 + 14) = ctemp15; *(boffset2 + 15) = ctemp16; boffset2 += 16; } if (n & 2){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); aoffset1 += 2; ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); aoffset2 += 2; ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); aoffset3 += 2; ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); aoffset4 += 2; *(boffset3 + 0) = ctemp01; *(boffset3 + 1) = ctemp02; *(boffset3 + 2) = ctemp03; *(boffset3 + 3) = ctemp04; *(boffset3 + 4) = ctemp05; *(boffset3 + 5) = ctemp06; *(boffset3 + 6) = ctemp07; *(boffset3 + 7) = ctemp08; boffset3 += 8; } if (n & 1){ ctemp01 = *(aoffset1 + 0); aoffset1 ++; ctemp02 = *(aoffset2 + 0); aoffset2 ++; ctemp03 = *(aoffset3 + 0); aoffset3 ++; ctemp04 = *(aoffset4 + 0); aoffset4 ++; *(boffset4 + 0) = ctemp01; *(boffset4 + 1) = ctemp02; *(boffset4 + 2) = ctemp03; *(boffset4 + 3) = ctemp04; boffset4 += 4; } } if (m & 2){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset += 2 * lda; boffset1 = boffset; boffset += 16; i = (n >> 3); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); aoffset1 += 8; ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); ctemp12 = *(aoffset2 + 3); ctemp13 = *(aoffset2 + 4); ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); aoffset2 += 8; *(boffset1 + 0) = ctemp01; *(boffset1 + 1) = ctemp02; *(boffset1 + 2) = ctemp03; *(boffset1 + 3) = ctemp04; *(boffset1 + 4) = ctemp05; *(boffset1 + 5) = ctemp06; *(boffset1 + 6) = ctemp07; *(boffset1 + 7) = ctemp08; *(boffset1 + 8) = ctemp09; *(boffset1 + 9) = ctemp10; *(boffset1 + 10) = ctemp11; *(boffset1 + 11) = ctemp12; *(boffset1 + 12) = ctemp13; *(boffset1 + 13) = ctemp14; *(boffset1 + 14) = ctemp15; *(boffset1 + 15) = ctemp16; boffset1 += 8 * m; i --; }while(i > 0); } if (n & 4){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); aoffset1 += 4; ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); aoffset2 += 4; *(boffset2 + 0) = ctemp01; *(boffset2 + 1) = ctemp02; *(boffset2 + 2) = ctemp03; *(boffset2 + 3) = ctemp04; *(boffset2 + 4) = ctemp05; *(boffset2 + 5) = ctemp06; *(boffset2 + 6) = ctemp07; *(boffset2 + 7) = ctemp08; boffset2 += 8; } if (n & 2){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); aoffset1 += 2; ctemp03 = 
*(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); aoffset2 += 2; *(boffset3 + 0) = ctemp01; *(boffset3 + 1) = ctemp02; *(boffset3 + 2) = ctemp03; *(boffset3 + 3) = ctemp04; boffset3 += 4; } if (n & 1){ ctemp01 = *(aoffset1 + 0); aoffset1 ++; ctemp02 = *(aoffset2 + 0); aoffset2 ++; *(boffset4 + 0) = ctemp01; *(boffset4 + 1) = ctemp02; boffset4 += 2; } } if (m & 1){ aoffset1 = aoffset; aoffset += lda; boffset1 = boffset; boffset += 8; i = (n >> 3); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); aoffset1 += 8; *(boffset1 + 0) = ctemp01; *(boffset1 + 1) = ctemp02; *(boffset1 + 2) = ctemp03; *(boffset1 + 3) = ctemp04; *(boffset1 + 4) = ctemp05; *(boffset1 + 5) = ctemp06; *(boffset1 + 6) = ctemp07; *(boffset1 + 7) = ctemp08; boffset1 += 8 * m; i --; }while(i > 0); } if (n & 4){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); aoffset1 += 4; *(boffset2 + 0) = ctemp01; *(boffset2 + 1) = ctemp02; *(boffset2 + 2) = ctemp03; *(boffset2 + 3) = ctemp04; boffset2 += 4; } if (n & 2){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); aoffset1 += 2; *(boffset3 + 0) = ctemp01; *(boffset3 + 1) = ctemp02; boffset3 += 2; } if (n & 1){ ctemp01 = *(aoffset1 + 0); aoffset1 ++; *(boffset4 + 0) = ctemp01; boffset4 ++; } } return 0; } OpenBLAS-0.2.20/kernel/generic/gemmkernel_2x2.c000066400000000000000000000112551313527062700210310ustar00rootroot00000000000000#include "common.h" int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc #ifdef TRMMKERNEL ,BLASLONG offset #endif ) { BLASLONG i,j,k; FLOAT *C0,*C1,*ptrba,*ptrbb; FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7; for (j=0; j #include "common.h" int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){ FLOAT *X = x; if (incx != 1) { X = buffer; COPY_K(m, x, incx, X, 1); } while (n > 0) { AXPYU_K(m, 0, 0, alpha * *y, X, 1, a, 1, NULL, 0); a += lda; y += incy; n --; } return 0; } OpenBLAS-0.2.20/kernel/generic/imatcopy_cn.c000066400000000000000000000044021313527062700205110ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" /***************************************************** * 2015-09-07 grisuthedragon ******************************************************/ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda) { BLASLONG i,j; FLOAT *aptr; if ( rows <= 0 ) return(0); if ( cols <= 0 ) return(0); if ( alpha == 1.0 ) return(0); aptr = a; if ( alpha == 0.0 ) { for ( i=0; i #include "common.h" #define a2 (a1 + 1) int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ BLASLONG i, j, ip1, ip2; blasint *piv; FLOAT *a1; FLOAT *b1, *b2; FLOAT A1, A2, B1, B2; a--; k1 --; ipiv += k1; if (n <= 0) return 0; j = n; do { piv = ipiv; a1 = a + k1 + 1; ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; b1 = a + ip1; b2 = a + ip2; i = ((k2 - k1) >> 1); if (i > 0) { do { A1 = *a1; A2 = *a2; B1 = *b1; B2 = *b2; ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; *(buffer + 1) = A2; } else { *(buffer + 0) = A1; *(buffer + 1) = B2; *b2 = A2; } } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A2; *(buffer + 1) = A1; } else { *(buffer + 0) = A2; *(buffer + 1) = B2; *b2 = A1; } } else { if (b2 == a2) { *(buffer + 0) = B1; *(buffer + 1) = A2; *b1 = A1; } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = A1; *b1 = A2; } else { *(buffer + 0) = B1; *(buffer + 1) = B2; *b1 = A1; *b2 = A2; } } buffer += 2; b1 = a + ip1; b2 = a + ip2; a1 += 2; i --; } while (i > 0); } i = ((k2 - k1) & 1); if (i > 0) { A1 = *a1; B1 = *b1; if (a1 == b1) { *(buffer + 0) = A1; } else { *(buffer + 0) = B1; *b1 = A1; } } a += lda; j --; } while (j > 0); return 0; } OpenBLAS-0.2.20/kernel/generic/laswp_ncopy_2.c000066400000000000000000000145431313527062700207720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #define PREFETCHSIZE 12 #define a2 (a1 + 1) #define a4 (a3 + 1) int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ BLASLONG i, j, ip1, ip2; blasint *piv; FLOAT *a1, *a3; FLOAT *b1, *b2, *b3, *b4; FLOAT A1, A2, B1, B2, A3, A4, B3, B4; a--; k1 --; ipiv += k1; if (n <= 0) return 0; j = (n >> 1); if (j > 0) { do { piv = ipiv; a1 = a + k1 + 1; a3 = a1 + 1 * lda; ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; b1 = a + ip1; b2 = a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; i = ((k2 - k1) >> 1); if (i > 0) { do { #ifdef __GNUC__ __builtin_prefetch(a1 + PREFETCHSIZE, 0, 0); __builtin_prefetch(a3 + PREFETCHSIZE, 0, 0); #endif A1 = *a1; A2 = *a2; A3 = *a3; A4 = *a4; B1 = *b1; B2 = *b2; B3 = *b3; B4 = *b4; ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; *(buffer + 1) = A3; *(buffer + 2) = A2; *(buffer + 3) = A4; } else { *(buffer + 0) = A1; *(buffer + 1) = A3; *(buffer + 2) = B2; *(buffer + 3) = B4; *b2 = A2; *b4 = A4; } } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A2; *(buffer + 1) = A4; *(buffer + 2) = A1; *(buffer + 3) = A3; } else { *(buffer + 0) = A2; *(buffer + 1) = A4; *(buffer + 2) = B2; *(buffer + 3) = B4; *b2 = A1; *b4 = A3; } } else { if (b2 == a2) { *(buffer + 0) = B1; *(buffer + 1) = B3; *(buffer + 2) = A2; *(buffer + 3) = A4; *b1 = A1; *b3 = A3; } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = B3; *(buffer + 2) = A1; *(buffer + 3) = A3; *b1 = A2; *b3 = A4; } else { *(buffer + 0) = B1; *(buffer + 1) = B3; *(buffer + 2) = B2; *(buffer + 3) = B4; *b1 = A1; *b2 = A2; *b3 = A3; *b4 = A4; } } buffer += 4; b1 = a + ip1; b2 = a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; a1 += 2; a3 += 2; i --; } while (i > 0); } i = ((k2 - k1) & 1); if (i > 0) { A1 = *a1; B1 = *b1; A3 = *a3; B3 = *b3; if (a1 == b1) { *(buffer + 0) = A1; *(buffer + 1) = A3; } else { *(buffer + 0) = B1; *(buffer + 1) = B3; *b1 = A1; *b3 = A3; } buffer += 2; } a += 2 * lda; j --; } while (j > 0); } if (n & 1) { piv = ipiv; a1 = a + k1 + 1; ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; b1 = a + ip1; b2 = a + ip2; i = ((k2 - k1) >> 1); if (i > 0) { do { A1 = *a1; A2 = *a2; B1 = *b1; B2 = *b2; ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; *(buffer + 1) = A2; } else { *(buffer + 0) = A1; *(buffer + 1) = B2; *b2 = A2; } } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A2; *(buffer + 1) = A1; } else { *(buffer + 0) = A2; *(buffer + 1) = B2; *b2 = A1; } } else { if (b2 == a2) { *(buffer + 0) = B1; *(buffer + 1) = A2; *b1 = A1; } else 
if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = A1; *b1 = A2; } else { *(buffer + 0) = B1; *(buffer + 1) = B2; *b1 = A1; *b2 = A2; } } buffer += 2; b1 = a + ip1; b2 = a + ip2; a1 += 2; i --; } while (i > 0); } i = ((k2 - k1) & 1); if (i > 0) { A1 = *a1; B1 = *b1; if (a1 == b1) { *(buffer + 0) = A1; } else { *(buffer + 0) = B1; *b1 = A1; } } } return 0; } OpenBLAS-0.2.20/kernel/generic/laswp_ncopy_4.c000066400000000000000000000233361313527062700207740ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #define PREFETCHSIZE 8 #define a2 (a1 + 1) #define a4 (a3 + 1) #define a6 (a5 + 1) #define a8 (a7 + 1) int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ BLASLONG i, j, ip1, ip2; blasint *piv; FLOAT *a1, *a3, *a5, *a7; FLOAT *b1, *b2, *b3, *b4; FLOAT *b5, *b6, *b7, *b8; FLOAT A1, A2, B1, B2, A3, A4, B3, B4; FLOAT A5, A6, B5, B6, A7, A8, B7, B8; a--; k1 --; ipiv += k1; if (n <= 0) return 0; j = (n >> 2); if (j > 0) { do { piv = ipiv; a1 = a + k1 + 1; a3 = a1 + 1 * lda; a5 = a1 + 2 * lda; a7 = a1 + 3 * lda; ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; b1 = a + ip1; b2 = a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; b5 = b1 + 2 * lda; b6 = b2 + 2 * lda; b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; i = ((k2 - k1) >> 1); if (i > 0) { do { #ifdef __GNUC__ __builtin_prefetch(a1 + PREFETCHSIZE, 0, 0); __builtin_prefetch(a3 + PREFETCHSIZE, 0, 0); __builtin_prefetch(a5 + PREFETCHSIZE, 0, 0); __builtin_prefetch(a7 + PREFETCHSIZE, 0, 0); #endif A1 = *a1; A2 = *a2; A3 = *a3; A4 = *a4; A5 = *a5; A6 = *a6; A7 = *a7; A8 = *a8; B1 = *b1; B2 = *b2; B3 = *b3; B4 = *b4; B5 = *b5; B6 = *b6; B7 = *b7; B8 = *b8; ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; *(buffer + 1) = A3; *(buffer + 2) = A5; *(buffer + 3) = A7; *(buffer + 4) = A2; *(buffer + 5) = A4; *(buffer + 6) = A6; *(buffer + 7) = A8; } else { *(buffer + 0) = A1; *(buffer + 1) = A3; *(buffer + 2) = A5; *(buffer + 3) = A7; *(buffer + 4) = B2; *(buffer + 5) = B4; *(buffer + 6) = B6; *(buffer + 7) = B8; *b2 = A2; *b4 = A4; *b6 = A6; *b8 = A8; } } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A2; *(buffer + 1) = A4; *(buffer + 2) = A6; *(buffer + 3) = A8; *(buffer + 4) = A1; *(buffer + 5) = A3; *(buffer + 6) = A5; *(buffer + 7) = A7; } else { *(buffer + 0) = A2; *(buffer + 1) = A4; *(buffer + 2) = A6; *(buffer + 3) = A8; *(buffer + 4) = B2; *(buffer + 5) = B4; *(buffer + 6) = B6; *(buffer + 7) = B8; *b2 = A1; *b4 = A3; *b6 = A5; *b8 = A7; } } else { if (b2 == a2) { *(buffer + 0) = B1; *(buffer + 1) = B3; *(buffer + 2) = B5; *(buffer + 3) = B7; *(buffer + 4) = A2; *(buffer + 5) = A4; *(buffer + 6) = A6; *(buffer + 7) = A8; *b1 = A1; *b3 = A3; *b5 = A5; *b7 = A7; } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = B3; *(buffer + 2) = B5; *(buffer + 3) = B7; *(buffer + 4) = A1; *(buffer + 5) = A3; *(buffer + 6) = A5; *(buffer + 7) = A7; *b1 = A2; *b3 = A4; *b5 = A6; *b7 = A8; } else { *(buffer + 0) = B1; *(buffer + 1) = B3; *(buffer + 2) = B5; *(buffer + 3) = B7; *(buffer + 4) = B2; *(buffer + 5) = B4; *(buffer + 6) = B6; *(buffer + 7) = B8; *b1 = A1; *b2 = A2; *b3 = A3; *b4 = A4; *b5 = A5; *b6 = A6; *b7 = A7; *b8 = A8; } } buffer += 8; b1 = a + ip1; b2 = a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; b5 = b1 + 2 * lda; b6 = b2 + 2 * lda; b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; a1 += 2; a3 += 2; a5 += 2; a7 += 2; i --; } while (i > 0); } i = ((k2 - k1) & 1); if (i > 0) { A1 = *a1; B1 = *b1; A3 = *a3; B3 = *b3; A5 = *a5; B5 = *b5; A7 = *a7; B7 = *b7; if (a1 == b1) { *(buffer + 0) = A1; *(buffer + 1) = A3; *(buffer + 2) = A5; *(buffer + 3) = A7; } else { *(buffer + 0) = B1; *(buffer + 1) = B3; *(buffer + 2) = B5; *(buffer + 3) = B7; *b1 = A1; *b3 = A3; *b5 = A5; *b7 = A7; } buffer += 4; } a += 4 * lda; j --; } while (j > 0); } if (n & 2) { piv = ipiv; a1 = a + k1 + 1; a3 = a1 + 1 * lda; ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; b1 = a + ip1; b2 = 
a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; i = ((k2 - k1) >> 1); if (i > 0) { do { A1 = *a1; A2 = *a2; A3 = *a3; A4 = *a4; B1 = *b1; B2 = *b2; B3 = *b3; B4 = *b4; ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; *(buffer + 1) = A3; *(buffer + 2) = A2; *(buffer + 3) = A4; } else { *(buffer + 0) = A1; *(buffer + 1) = A3; *(buffer + 2) = B2; *(buffer + 3) = B4; *b2 = A2; *b4 = A4; } } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A2; *(buffer + 1) = A4; *(buffer + 2) = A1; *(buffer + 3) = A3; } else { *(buffer + 0) = A2; *(buffer + 1) = A4; *(buffer + 2) = B2; *(buffer + 3) = B4; *b2 = A1; *b4 = A3; } } else { if (b2 == a2) { *(buffer + 0) = B1; *(buffer + 1) = B3; *(buffer + 2) = A2; *(buffer + 3) = A4; *b1 = A1; *b3 = A3; } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = B3; *(buffer + 2) = A1; *(buffer + 3) = A3; *b1 = A2; *b3 = A4; } else { *(buffer + 0) = B1; *(buffer + 1) = B3; *(buffer + 2) = B2; *(buffer + 3) = B4; *b1 = A1; *b2 = A2; *b3 = A3; *b4 = A4; } } buffer += 4; b1 = a + ip1; b2 = a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; a1 += 2; a3 += 2; i --; } while (i > 0); } i = ((k2 - k1) & 1); if (i > 0) { A1 = *a1; B1 = *b1; A3 = *a3; B3 = *b3; if (a1 == b1) { *(buffer + 0) = A1; *(buffer + 1) = A3; } else { *(buffer + 0) = B1; *(buffer + 1) = B3; *b1 = A1; *b3 = A3; } buffer += 2; } a += 2 * lda; } if (n & 1) { piv = ipiv; a1 = a + k1 + 1; ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; b1 = a + ip1; b2 = a + ip2; i = ((k2 - k1) >> 1); if (i > 0) { do { A1 = *a1; A2 = *a2; B1 = *b1; B2 = *b2; ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; *(buffer + 1) = A2; } else { *(buffer + 0) = A1; *(buffer + 1) = B2; *b2 = A2; } } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A2; *(buffer + 1) = A1; } else { *(buffer + 0) = A2; *(buffer + 1) = B2; *b2 = A1; } } else { if (b2 == a2) { *(buffer + 0) = B1; *(buffer + 1) = A2; *b1 = A1; } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = A1; *b1 = A2; } else { *(buffer + 0) = B1; *(buffer + 1) = B2; *b1 = A1; *b2 = A2; } } buffer += 2; b1 = a + ip1; b2 = a + ip2; a1 += 2; i --; } while (i > 0); } i = ((k2 - k1) & 1); if (i > 0) { A1 = *a1; B1 = *b1; if (a1 == b1) { *(buffer + 0) = A1; } else { *(buffer + 0) = B1; *b1 = A1; } } } return 0; } OpenBLAS-0.2.20/kernel/generic/laswp_ncopy_8.c000066400000000000000000000162371313527062700210020ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #define PREFETCHSIZE 4 int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ BLASLONG i, j, ip; blasint *piv; FLOAT *dx1, *dy1; FLOAT *dx2, *dy2; FLOAT *dx3, *dy3; FLOAT *dx4, *dy4; FLOAT *dx5, *dy5; FLOAT *dx6, *dy6; FLOAT *dx7, *dy7; FLOAT *dx8, *dy8; FLOAT atemp1, btemp1; FLOAT atemp2, btemp2; FLOAT atemp3, btemp3; FLOAT atemp4, btemp4; FLOAT atemp5, btemp5; FLOAT atemp6, btemp6; FLOAT atemp7, btemp7; FLOAT atemp8, btemp8; a--; ipiv += k1 - 1; if (n <= 0) return 0; if (k1 > k2) return 0; j = (n >> 3); if (j > 0) { do { piv = ipiv; i = k1; do { ip = *piv; piv ++; dx1 = a + i; dy1 = a + ip; dx2 = a + i + lda * 1; dy2 = a + ip + lda * 1; dx3 = a + i + lda * 2; dy3 = a + ip + lda * 2; dx4 = a + i + lda * 3; dy4 = a + ip + lda * 3; dx5 = a + i + lda * 4; dy5 = a + ip + lda * 4; dx6 = a + i + lda * 5; dy6 = a + ip + lda * 5; dx7 = a + i + lda * 6; dy7 = a + ip + lda * 6; dx8 = a + i + lda * 7; dy8 = a + ip + lda * 7; #ifdef __GNUC__ __builtin_prefetch(dx1 + PREFETCHSIZE, 0, 1); __builtin_prefetch(dx2 + PREFETCHSIZE, 0, 1); __builtin_prefetch(dx3 + PREFETCHSIZE, 0, 1); __builtin_prefetch(dx4 + PREFETCHSIZE, 0, 1); __builtin_prefetch(dx5 + PREFETCHSIZE, 0, 1); __builtin_prefetch(dx6 + PREFETCHSIZE, 0, 1); __builtin_prefetch(dx7 + PREFETCHSIZE, 0, 1); __builtin_prefetch(dx8 + PREFETCHSIZE, 0, 1); #endif atemp1 = *dx1; btemp1 = *dy1; atemp2 = *dx2; btemp2 = *dy2; atemp3 = *dx3; btemp3 = *dy3; atemp4 = *dx4; btemp4 = *dy4; atemp5 = *dx5; btemp5 = *dy5; atemp6 = *dx6; btemp6 = *dy6; atemp7 = *dx7; btemp7 = *dy7; atemp8 = *dx8; btemp8 = *dy8; if (ip != i) { *dy1 = atemp1; *dy2 = atemp2; *dy3 = atemp3; *dy4 = atemp4; *dy5 = atemp5; *dy6 = atemp6; *dy7 = atemp7; *dy8 = atemp8; *(buffer + 0) = btemp1; *(buffer + 1) = btemp2; *(buffer + 2) = btemp3; *(buffer + 3) = btemp4; *(buffer + 4) = btemp5; *(buffer + 5) = btemp6; *(buffer + 6) = btemp7; *(buffer + 7) = btemp8; } else { *(buffer + 0) = atemp1; *(buffer + 1) = atemp2; *(buffer + 2) = atemp3; *(buffer + 3) = atemp4; *(buffer + 4) = atemp5; *(buffer + 5) = atemp6; *(buffer + 6) = atemp7; *(buffer + 7) = atemp8; } buffer += 8; i++; } while (i <= k2); a += 8 * lda; j --; } while (j > 0); } if (n & 4) { piv = ipiv; ip = *piv; piv ++; dx1 = a + k1; dy1 = a + ip; dx2 = a + k1 + lda * 1; dy2 = a + ip + lda * 1; dx3 = a + k1 + lda * 2; dy3 = a + ip + lda * 2; dx4 = a + k1 + lda * 3; dy4 = a + ip + lda * 3; i = k1; do { atemp1 = *dx1; atemp2 = *dx2; atemp3 = *dx3; atemp4 = *dx4; btemp1 = *dy1; btemp2 = *dy2; btemp3 = *dy3; btemp4 = *dy4; if (ip != i) { *dy1 = atemp1; *dy2 = atemp2; 
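/* descriptive note: pivot row ip differs from row i, so row i's values are written into row ip across all four columns, and the displaced row-ip values are packed into the buffer below */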
*dy3 = atemp3; *dy4 = atemp4; *(buffer + 0) = btemp1; *(buffer + 1) = btemp2; *(buffer + 2) = btemp3; *(buffer + 3) = btemp4; } else { *(buffer + 0) = atemp1; *(buffer + 1) = atemp2; *(buffer + 2) = atemp3; *(buffer + 3) = atemp4; } ip = *piv; piv ++; i++; dx1 = a + i; dy1 = a + ip; dx2 = a + i + lda * 1; dy2 = a + ip + lda * 1; dx3 = a + i + lda * 2; dy3 = a + ip + lda * 2; dx4 = a + i + lda * 3; dy4 = a + ip + lda * 3; buffer += 4; } while (i <= k2); a += 4 * lda; } if (n & 2) { piv = ipiv; i = k1; do { ip = *piv; piv ++; dx1 = a + i; dy1 = a + ip; dx2 = a + i + lda; dy2 = a + ip + lda; atemp1 = *dx1; btemp1 = *dy1; atemp2 = *dx2; btemp2 = *dy2; if (ip != i) { *dy1 = atemp1; *dy2 = atemp2; *(buffer + 0) = btemp1; *(buffer + 1) = btemp2; } else { *(buffer + 0) = atemp1; *(buffer + 1) = atemp2; } buffer += 2; i++; } while (i <= k2); a += 2 * lda; } if (n & 1) { piv = ipiv; i = k1; do { ip = *piv; piv ++; dx1 = a + i; dy1 = a + ip; atemp1 = *dx1; btemp1 = *dy1; if (ip != i) { *dy1 = atemp1; *buffer = btemp1; } else { *buffer = atemp1; } buffer ++; i++; } while (i <= k2); a += lda; } return 0; } OpenBLAS-0.2.20/kernel/generic/lsame.c000066400000000000000000000053751313527062700173170ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include int NAME(char *A, char *B){ char a = *A; char b = *B; if (a > 96) a -= 32; if (b > 96) b -= 32; return (a == b); } OpenBLAS-0.2.20/kernel/generic/neg_tcopy_1.c000066400000000000000000000062301313527062700204140ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *a_offset, *a_offset1; FLOAT *b_offset, *b_offset1; a_offset = a; b_offset = b; i = m; if (i > 0) { do { a_offset1 = a_offset; a_offset += lda; b_offset1 = b_offset; b_offset ++; j = n; if (j > 0) { do { *(b_offset1 + 0) = -*(a_offset1 + 0); a_offset1 ++; b_offset1 += m; j --; } while (j > 0); } i --; } while (i > 0); } return 0; } OpenBLAS-0.2.20/kernel/generic/neg_tcopy_16.c000066400000000000000000000243771313527062700205160ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *aoffset; FLOAT *aoffset1, *aoffset2; FLOAT *boffset; FLOAT ctemp01, ctemp02, ctemp03, ctemp04; FLOAT ctemp05, ctemp06, ctemp07, ctemp08; FLOAT ctemp09, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; FLOAT ctemp17, ctemp18, ctemp19, ctemp20; FLOAT ctemp21, ctemp22, ctemp23, ctemp24; FLOAT ctemp25, ctemp26, ctemp27, ctemp28; FLOAT ctemp29, ctemp30, ctemp31, ctemp32; aoffset = a; boffset = b; #if 0 fprintf(stderr, "m = %d n = %d\n", m, n); #endif j = (n >> 4); if (j > 0){ do{ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 16; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); ctemp09 = *(aoffset1 + 8); ctemp10 = *(aoffset1 + 9); ctemp11 = *(aoffset1 + 10); ctemp12 = *(aoffset1 + 11); ctemp13 = *(aoffset1 + 12); ctemp14 = *(aoffset1 + 13); ctemp15 = *(aoffset1 + 14); ctemp16 = *(aoffset1 + 15); ctemp17 = *(aoffset2 + 0); ctemp18 = *(aoffset2 + 1); ctemp19 = *(aoffset2 + 2); ctemp20 = *(aoffset2 + 3); ctemp21 = *(aoffset2 + 4); ctemp22 = *(aoffset2 + 5); ctemp23 = *(aoffset2 + 6); ctemp24 = *(aoffset2 + 7); ctemp25 = *(aoffset2 + 8); ctemp26 = *(aoffset2 + 9); ctemp27 = *(aoffset2 + 10); ctemp28 = *(aoffset2 + 11); ctemp29 = *(aoffset2 + 12); ctemp30 = *(aoffset2 + 13); ctemp31 = *(aoffset2 + 14); ctemp32 = *(aoffset2 + 15); *(boffset + 0) = -ctemp01; *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; *(boffset + 3) = -ctemp04; *(boffset + 4) = -ctemp05; *(boffset + 5) = -ctemp06; *(boffset + 6) = -ctemp07; *(boffset + 7) = -ctemp08; *(boffset + 8) = -ctemp09; *(boffset + 9) = -ctemp10; *(boffset + 10) = -ctemp11; *(boffset + 11) = -ctemp12; *(boffset + 12) = -ctemp13; *(boffset + 13) = -ctemp14; *(boffset + 14) = -ctemp15; *(boffset + 15) = -ctemp16; *(boffset + 16) = -ctemp17; *(boffset + 17) = -ctemp18; *(boffset + 18) = -ctemp19; *(boffset + 19) = -ctemp20; *(boffset + 20) = -ctemp21; *(boffset + 21) = -ctemp22; *(boffset + 22) = -ctemp23; *(boffset + 23) = -ctemp24; *(boffset + 24) = -ctemp25; *(boffset + 25) = -ctemp26; *(boffset + 26) = -ctemp27; *(boffset + 27) = -ctemp28; *(boffset + 28) = -ctemp29; *(boffset + 29) = -ctemp30; *(boffset + 30) = -ctemp31; *(boffset + 31) = -ctemp32; aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 32; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 
+ 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); ctemp09 = *(aoffset1 + 8); ctemp10 = *(aoffset1 + 9); ctemp11 = *(aoffset1 + 10); ctemp12 = *(aoffset1 + 11); ctemp13 = *(aoffset1 + 12); ctemp14 = *(aoffset1 + 13); ctemp15 = *(aoffset1 + 14); ctemp16 = *(aoffset1 + 15); *(boffset + 0) = -ctemp01; *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; *(boffset + 3) = -ctemp04; *(boffset + 4) = -ctemp05; *(boffset + 5) = -ctemp06; *(boffset + 6) = -ctemp07; *(boffset + 7) = -ctemp08; *(boffset + 8) = -ctemp09; *(boffset + 9) = -ctemp10; *(boffset + 10) = -ctemp11; *(boffset + 11) = -ctemp12; *(boffset + 12) = -ctemp13; *(boffset + 13) = -ctemp14; *(boffset + 14) = -ctemp15; *(boffset + 15) = -ctemp16; boffset += 16; } j--; }while(j > 0); } /* end of if(j > 0) */ if (n & 8){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 8; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); ctemp12 = *(aoffset2 + 3); ctemp13 = *(aoffset2 + 4); ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); *(boffset + 0) = -ctemp01; *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; *(boffset + 3) = -ctemp04; *(boffset + 4) = -ctemp05; *(boffset + 5) = -ctemp06; *(boffset + 6) = -ctemp07; *(boffset + 7) = -ctemp08; *(boffset + 8) = -ctemp09; *(boffset + 9) = -ctemp10; *(boffset + 10) = -ctemp11; *(boffset + 11) = -ctemp12; *(boffset + 12) = -ctemp13; *(boffset + 13) = -ctemp14; *(boffset + 14) = -ctemp15; *(boffset + 15) = -ctemp16; aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 16; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); *(boffset + 0) = -ctemp01; *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; *(boffset + 3) = -ctemp04; *(boffset + 4) = -ctemp05; *(boffset + 5) = -ctemp06; *(boffset + 6) = -ctemp07; *(boffset + 7) = -ctemp08; boffset += 8; } } if (n & 4){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 4; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); *(boffset + 0) = -ctemp01; *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; *(boffset + 3) = -ctemp04; *(boffset + 4) = -ctemp05; *(boffset + 5) = -ctemp06; *(boffset + 6) = -ctemp07; *(boffset + 7) = -ctemp08; aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 8; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); *(boffset + 0) = -ctemp01; *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; *(boffset + 3) = -ctemp04; boffset += 4; } } if (n & 2){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 2; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); *(boffset + 0) = -ctemp01; *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; *(boffset + 3) = -ctemp04; 
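/* descriptive note: a 2x2 tile has been negated and packed; both source row pointers now step down two rows (2 * lda) and the destination advances past the four stored elements */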
aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 4; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); *(boffset + 0) = -ctemp01; *(boffset + 1) = -ctemp02; boffset += 2; } } if (n & 1){ aoffset1 = aoffset; aoffset2 = aoffset + lda; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset2 + 0); *(boffset + 0) = -ctemp01; *(boffset + 1) = -ctemp02; aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 2; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); *(boffset + 0) = -ctemp01; boffset += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/neg_tcopy_2.c000066400000000000000000000074571313527062700204310ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *a_offset, *a_offset1, *a_offset2; FLOAT *b_offset, *b_offset1, *b_offset2; a_offset = a; b_offset = b; b_offset2 = b + m * (n & ~1); i = (m >> 1); if (i > 0) { do { a_offset1 = a_offset; a_offset2 = a_offset + lda; a_offset += 2 * lda; b_offset1 = b_offset; b_offset += 4; j = (n >> 1); if (j > 0){ do { *(b_offset1 + 0) = -*(a_offset1 + 0); *(b_offset1 + 1) = -*(a_offset1 + 1); *(b_offset1 + 2) = -*(a_offset2 + 0); *(b_offset1 + 3) = -*(a_offset2 + 1); a_offset1 += 2; a_offset2 += 2; b_offset1 += m * 2; j--; } while (j > 0); } if (n & 1){ *(b_offset2 + 0) = -*(a_offset1 + 0); *(b_offset2 + 1) = -*(a_offset2 + 0); b_offset2 += 2; } i --; } while (i > 0); } if (m & 1) { j = (n >> 1); if (j > 0){ do { *(b_offset + 0) = -*(a_offset + 0); *(b_offset + 1) = -*(a_offset + 1); a_offset += 2; b_offset += m * 2; j--; } while (j > 0); } if (n & 1){ *(b_offset2 + 0) = -*(a_offset + 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/neg_tcopy_4.c000066400000000000000000000166671313527062700204360ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; FLOAT ctemp1, ctemp2, ctemp3, ctemp4; FLOAT ctemp5, ctemp6, ctemp7, ctemp8; FLOAT ctemp9, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; a_offset = a; b_offset = b; b_offset2 = b + m * (n & ~3); b_offset3 = b + m * (n & ~1); j = (m >> 2); if (j > 0){ do{ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset3 = a_offset2 + lda; a_offset4 = a_offset3 + lda; a_offset += 4 * lda; b_offset1 = b_offset; b_offset += 16; i = (n >> 2); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); ctemp5 = *(a_offset2 + 0); ctemp6 = *(a_offset2 + 1); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); ctemp9 = *(a_offset3 + 0); ctemp10 = *(a_offset3 + 1); ctemp11 = *(a_offset3 + 2); ctemp12 = *(a_offset3 + 3); ctemp13 = *(a_offset4 + 0); ctemp14 = *(a_offset4 + 1); ctemp15 = *(a_offset4 + 2); ctemp16 = *(a_offset4 + 3); a_offset1 += 4; a_offset2 += 4; a_offset3 += 4; a_offset4 += 4; *(b_offset1 + 0) = -ctemp1; *(b_offset1 + 1) = -ctemp2; *(b_offset1 + 2) = -ctemp3; *(b_offset1 + 3) = -ctemp4; *(b_offset1 + 4) = -ctemp5; *(b_offset1 + 5) = -ctemp6; *(b_offset1 + 6) = -ctemp7; *(b_offset1 + 7) = -ctemp8; *(b_offset1 + 8) = -ctemp9; *(b_offset1 + 9) = -ctemp10; *(b_offset1 + 10) = -ctemp11; *(b_offset1 + 11) = -ctemp12; *(b_offset1 + 12) = -ctemp13; *(b_offset1 + 13) = -ctemp14; *(b_offset1 + 14) = -ctemp15; *(b_offset1 + 15) = -ctemp16; b_offset1 += m * 4; i --; }while(i > 0); } if (n & 2) { ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset2 + 0); ctemp4 = *(a_offset2 + 1); ctemp5 = *(a_offset3 + 0); ctemp6 = *(a_offset3 + 1); ctemp7 = *(a_offset4 + 0); ctemp8 = *(a_offset4 + 1); a_offset1 += 2; a_offset2 += 2; a_offset3 += 2; a_offset4 += 2; *(b_offset2 + 0) = -ctemp1; *(b_offset2 + 1) = -ctemp2; *(b_offset2 + 2) = -ctemp3; *(b_offset2 + 3) = -ctemp4; *(b_offset2 + 4) = -ctemp5; *(b_offset2 + 5) = -ctemp6; *(b_offset2 + 6) = -ctemp7; *(b_offset2 + 7) = -ctemp8; b_offset2 += 8; } if (n & 1) { ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset2 + 0); ctemp3 = *(a_offset3 + 0); ctemp4 = *(a_offset4 + 0); *(b_offset3 + 0) = -ctemp1; *(b_offset3 + 1) = -ctemp2; *(b_offset3 + 2) = -ctemp3; *(b_offset3 + 3) = -ctemp4; b_offset3 += 4; } j--; }while(j > 0); } if (m & 2){ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; b_offset1 = b_offset; b_offset += 8; i = (n >> 2); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); ctemp5 = *(a_offset2 + 0); ctemp6 = *(a_offset2 + 1); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); a_offset1 += 4; a_offset2 += 4; *(b_offset1 + 0) = -ctemp1; *(b_offset1 + 1) = -ctemp2; *(b_offset1 + 2) = -ctemp3; *(b_offset1 + 3) = -ctemp4; *(b_offset1 + 4) = -ctemp5; *(b_offset1 + 5) = -ctemp6; *(b_offset1 + 6) = -ctemp7; *(b_offset1 + 7) = -ctemp8; b_offset1 += m * 4; i --; }while(i > 0); } if (n & 2) { ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset2 + 0); ctemp4 = *(a_offset2 + 1); a_offset1 += 2; a_offset2 += 2; *(b_offset2 + 0) = -ctemp1; *(b_offset2 + 1) = -ctemp2; *(b_offset2 + 2) = -ctemp3; *(b_offset2 + 3) = -ctemp4; b_offset2 += 4; } if (n & 
1) { ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset2 + 0); *(b_offset3 + 0) = -ctemp1; *(b_offset3 + 1) = -ctemp2; b_offset3 += 2; } } if (m & 1){ a_offset1 = a_offset; b_offset1 = b_offset; i = (n >> 2); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); a_offset1 += 4; *(b_offset1 + 0) = -ctemp1; *(b_offset1 + 1) = -ctemp2; *(b_offset1 + 2) = -ctemp3; *(b_offset1 + 3) = -ctemp4; b_offset1 += 4 * m; i --; }while(i > 0); } if (n & 2) { ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); a_offset1 += 2; *(b_offset2 + 0) = -ctemp1; *(b_offset2 + 1) = -ctemp2; } if (n & 1) { ctemp1 = *(a_offset1 + 0); *(b_offset3 + 0) = -ctemp1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/neg_tcopy_8.c000066400000000000000000000502341313527062700204260ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *aoffset; FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; FLOAT ctemp01, ctemp02, ctemp03, ctemp04; FLOAT ctemp05, ctemp06, ctemp07, ctemp08; FLOAT ctemp09, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; FLOAT ctemp17, ctemp18, ctemp19, ctemp20; FLOAT ctemp21, ctemp22, ctemp23, ctemp24; FLOAT ctemp25, ctemp26, ctemp27, ctemp28; FLOAT ctemp29, ctemp30, ctemp31, ctemp32; FLOAT ctemp33, ctemp34, ctemp35, ctemp36; FLOAT ctemp37, ctemp38, ctemp39, ctemp40; FLOAT ctemp41, ctemp42, ctemp43, ctemp44; FLOAT ctemp45, ctemp46, ctemp47, ctemp48; FLOAT ctemp49, ctemp50, ctemp51, ctemp52; FLOAT ctemp53, ctemp54, ctemp55, ctemp56; FLOAT ctemp57, ctemp58, ctemp59, ctemp60; FLOAT ctemp61, ctemp62, ctemp63, ctemp64; aoffset = a; boffset = b; #if 0 fprintf(stderr, "M = %d N = %d\n", m, n); #endif boffset2 = b + m * (n & ~7); boffset3 = b + m * (n & ~3); boffset4 = b + m * (n & ~1); j = (m >> 3); if (j > 0){ do{ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset3 = aoffset2 + lda; aoffset4 = aoffset3 + lda; aoffset5 = aoffset4 + lda; aoffset6 = aoffset5 + lda; aoffset7 = aoffset6 + lda; aoffset8 = aoffset7 + lda; aoffset += 8 * lda; boffset1 = boffset; boffset += 64; i = (n >> 3); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); aoffset1 += 8; ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); ctemp12 = *(aoffset2 + 3); ctemp13 = *(aoffset2 + 4); ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); aoffset2 += 8; ctemp17 = *(aoffset3 + 0); ctemp18 = *(aoffset3 + 1); ctemp19 = *(aoffset3 + 2); ctemp20 = *(aoffset3 + 3); ctemp21 = *(aoffset3 + 4); ctemp22 = *(aoffset3 + 5); ctemp23 = *(aoffset3 + 6); ctemp24 = *(aoffset3 + 7); aoffset3 += 8; ctemp25 = *(aoffset4 + 0); ctemp26 = *(aoffset4 + 1); ctemp27 = *(aoffset4 + 2); ctemp28 = *(aoffset4 + 3); ctemp29 = *(aoffset4 + 4); ctemp30 = *(aoffset4 + 5); ctemp31 = *(aoffset4 + 6); ctemp32 = *(aoffset4 + 7); aoffset4 += 8; ctemp33 = *(aoffset5 + 0); ctemp34 = *(aoffset5 + 1); ctemp35 = *(aoffset5 + 2); ctemp36 = *(aoffset5 + 3); ctemp37 = *(aoffset5 + 4); ctemp38 = *(aoffset5 + 5); ctemp39 = *(aoffset5 + 6); ctemp40 = *(aoffset5 + 7); aoffset5 += 8; ctemp41 = *(aoffset6 + 0); ctemp42 = *(aoffset6 + 1); ctemp43 = *(aoffset6 + 2); ctemp44 = *(aoffset6 + 3); ctemp45 = *(aoffset6 + 4); ctemp46 = *(aoffset6 + 5); ctemp47 = *(aoffset6 + 6); ctemp48 = *(aoffset6 + 7); aoffset6 += 8; ctemp49 = *(aoffset7 + 0); ctemp50 = *(aoffset7 + 1); ctemp51 = *(aoffset7 + 2); ctemp52 = *(aoffset7 + 3); ctemp53 = *(aoffset7 + 4); ctemp54 = *(aoffset7 + 5); ctemp55 = *(aoffset7 + 6); ctemp56 = *(aoffset7 + 7); aoffset7 += 8; ctemp57 = *(aoffset8 + 0); ctemp58 = *(aoffset8 + 1); ctemp59 = *(aoffset8 + 2); ctemp60 = *(aoffset8 + 3); ctemp61 = *(aoffset8 + 4); ctemp62 = *(aoffset8 + 5); ctemp63 = *(aoffset8 + 6); ctemp64 = *(aoffset8 + 7); aoffset8 += 8; *(boffset1 + 0) = -ctemp01; *(boffset1 + 1) = -ctemp02; *(boffset1 + 2) = -ctemp03; *(boffset1 + 3) = -ctemp04; *(boffset1 + 4) = -ctemp05; *(boffset1 + 5) = -ctemp06; *(boffset1 
+ 6) = -ctemp07; *(boffset1 + 7) = -ctemp08; *(boffset1 + 8) = -ctemp09; *(boffset1 + 9) = -ctemp10; *(boffset1 + 10) = -ctemp11; *(boffset1 + 11) = -ctemp12; *(boffset1 + 12) = -ctemp13; *(boffset1 + 13) = -ctemp14; *(boffset1 + 14) = -ctemp15; *(boffset1 + 15) = -ctemp16; *(boffset1 + 16) = -ctemp17; *(boffset1 + 17) = -ctemp18; *(boffset1 + 18) = -ctemp19; *(boffset1 + 19) = -ctemp20; *(boffset1 + 20) = -ctemp21; *(boffset1 + 21) = -ctemp22; *(boffset1 + 22) = -ctemp23; *(boffset1 + 23) = -ctemp24; *(boffset1 + 24) = -ctemp25; *(boffset1 + 25) = -ctemp26; *(boffset1 + 26) = -ctemp27; *(boffset1 + 27) = -ctemp28; *(boffset1 + 28) = -ctemp29; *(boffset1 + 29) = -ctemp30; *(boffset1 + 30) = -ctemp31; *(boffset1 + 31) = -ctemp32; *(boffset1 + 32) = -ctemp33; *(boffset1 + 33) = -ctemp34; *(boffset1 + 34) = -ctemp35; *(boffset1 + 35) = -ctemp36; *(boffset1 + 36) = -ctemp37; *(boffset1 + 37) = -ctemp38; *(boffset1 + 38) = -ctemp39; *(boffset1 + 39) = -ctemp40; *(boffset1 + 40) = -ctemp41; *(boffset1 + 41) = -ctemp42; *(boffset1 + 42) = -ctemp43; *(boffset1 + 43) = -ctemp44; *(boffset1 + 44) = -ctemp45; *(boffset1 + 45) = -ctemp46; *(boffset1 + 46) = -ctemp47; *(boffset1 + 47) = -ctemp48; *(boffset1 + 48) = -ctemp49; *(boffset1 + 49) = -ctemp50; *(boffset1 + 50) = -ctemp51; *(boffset1 + 51) = -ctemp52; *(boffset1 + 52) = -ctemp53; *(boffset1 + 53) = -ctemp54; *(boffset1 + 54) = -ctemp55; *(boffset1 + 55) = -ctemp56; *(boffset1 + 56) = -ctemp57; *(boffset1 + 57) = -ctemp58; *(boffset1 + 58) = -ctemp59; *(boffset1 + 59) = -ctemp60; *(boffset1 + 60) = -ctemp61; *(boffset1 + 61) = -ctemp62; *(boffset1 + 62) = -ctemp63; *(boffset1 + 63) = -ctemp64; boffset1 += m * 8; i --; }while(i > 0); } if (n & 4){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); aoffset1 += 4; ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); aoffset2 += 4; ctemp09 = *(aoffset3 + 0); ctemp10 = *(aoffset3 + 1); ctemp11 = *(aoffset3 + 2); ctemp12 = *(aoffset3 + 3); aoffset3 += 4; ctemp13 = *(aoffset4 + 0); ctemp14 = *(aoffset4 + 1); ctemp15 = *(aoffset4 + 2); ctemp16 = *(aoffset4 + 3); aoffset4 += 4; ctemp17 = *(aoffset5 + 0); ctemp18 = *(aoffset5 + 1); ctemp19 = *(aoffset5 + 2); ctemp20 = *(aoffset5 + 3); aoffset5 += 4; ctemp21 = *(aoffset6 + 0); ctemp22 = *(aoffset6 + 1); ctemp23 = *(aoffset6 + 2); ctemp24 = *(aoffset6 + 3); aoffset6 += 4; ctemp25 = *(aoffset7 + 0); ctemp26 = *(aoffset7 + 1); ctemp27 = *(aoffset7 + 2); ctemp28 = *(aoffset7 + 3); aoffset7 += 4; ctemp29 = *(aoffset8 + 0); ctemp30 = *(aoffset8 + 1); ctemp31 = *(aoffset8 + 2); ctemp32 = *(aoffset8 + 3); aoffset8 += 4; *(boffset2 + 0) = -ctemp01; *(boffset2 + 1) = -ctemp02; *(boffset2 + 2) = -ctemp03; *(boffset2 + 3) = -ctemp04; *(boffset2 + 4) = -ctemp05; *(boffset2 + 5) = -ctemp06; *(boffset2 + 6) = -ctemp07; *(boffset2 + 7) = -ctemp08; *(boffset2 + 8) = -ctemp09; *(boffset2 + 9) = -ctemp10; *(boffset2 + 10) = -ctemp11; *(boffset2 + 11) = -ctemp12; *(boffset2 + 12) = -ctemp13; *(boffset2 + 13) = -ctemp14; *(boffset2 + 14) = -ctemp15; *(boffset2 + 15) = -ctemp16; *(boffset2 + 16) = -ctemp17; *(boffset2 + 17) = -ctemp18; *(boffset2 + 18) = -ctemp19; *(boffset2 + 19) = -ctemp20; *(boffset2 + 20) = -ctemp21; *(boffset2 + 21) = -ctemp22; *(boffset2 + 22) = -ctemp23; *(boffset2 + 23) = -ctemp24; *(boffset2 + 24) = -ctemp25; *(boffset2 + 25) = -ctemp26; *(boffset2 + 26) = -ctemp27; *(boffset2 + 27) = -ctemp28; *(boffset2 + 28) = -ctemp29; *(boffset2 + 29) = 
-ctemp30; *(boffset2 + 30) = -ctemp31; *(boffset2 + 31) = -ctemp32; boffset2 += 32; } if (n & 2){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); aoffset1 += 2; ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); aoffset2 += 2; ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); aoffset3 += 2; ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); aoffset4 += 2; ctemp09 = *(aoffset5 + 0); ctemp10 = *(aoffset5 + 1); aoffset5 += 2; ctemp11 = *(aoffset6 + 0); ctemp12 = *(aoffset6 + 1); aoffset6 += 2; ctemp13 = *(aoffset7 + 0); ctemp14 = *(aoffset7 + 1); aoffset7 += 2; ctemp15 = *(aoffset8 + 0); ctemp16 = *(aoffset8 + 1); aoffset8 += 2; *(boffset3 + 0) = -ctemp01; *(boffset3 + 1) = -ctemp02; *(boffset3 + 2) = -ctemp03; *(boffset3 + 3) = -ctemp04; *(boffset3 + 4) = -ctemp05; *(boffset3 + 5) = -ctemp06; *(boffset3 + 6) = -ctemp07; *(boffset3 + 7) = -ctemp08; *(boffset3 + 8) = -ctemp09; *(boffset3 + 9) = -ctemp10; *(boffset3 + 10) = -ctemp11; *(boffset3 + 11) = -ctemp12; *(boffset3 + 12) = -ctemp13; *(boffset3 + 13) = -ctemp14; *(boffset3 + 14) = -ctemp15; *(boffset3 + 15) = -ctemp16; boffset3 += 16; } if (n & 1){ ctemp01 = *(aoffset1 + 0); aoffset1 ++; ctemp02 = *(aoffset2 + 0); aoffset2 ++; ctemp03 = *(aoffset3 + 0); aoffset3 ++; ctemp04 = *(aoffset4 + 0); aoffset4 ++; ctemp05 = *(aoffset5 + 0); aoffset5 ++; ctemp06 = *(aoffset6 + 0); aoffset6 ++; ctemp07 = *(aoffset7 + 0); aoffset7 ++; ctemp08 = *(aoffset8 + 0); aoffset8 ++; *(boffset4 + 0) = -ctemp01; *(boffset4 + 1) = -ctemp02; *(boffset4 + 2) = -ctemp03; *(boffset4 + 3) = -ctemp04; *(boffset4 + 4) = -ctemp05; *(boffset4 + 5) = -ctemp06; *(boffset4 + 6) = -ctemp07; *(boffset4 + 7) = -ctemp08; boffset4 += 8; } j--; }while(j > 0); } if (m & 4){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset3 = aoffset2 + lda; aoffset4 = aoffset3 + lda; aoffset += 4 * lda; boffset1 = boffset; boffset += 32; i = (n >> 3); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); aoffset1 += 8; ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); ctemp12 = *(aoffset2 + 3); ctemp13 = *(aoffset2 + 4); ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); aoffset2 += 8; ctemp17 = *(aoffset3 + 0); ctemp18 = *(aoffset3 + 1); ctemp19 = *(aoffset3 + 2); ctemp20 = *(aoffset3 + 3); ctemp21 = *(aoffset3 + 4); ctemp22 = *(aoffset3 + 5); ctemp23 = *(aoffset3 + 6); ctemp24 = *(aoffset3 + 7); aoffset3 += 8; ctemp25 = *(aoffset4 + 0); ctemp26 = *(aoffset4 + 1); ctemp27 = *(aoffset4 + 2); ctemp28 = *(aoffset4 + 3); ctemp29 = *(aoffset4 + 4); ctemp30 = *(aoffset4 + 5); ctemp31 = *(aoffset4 + 6); ctemp32 = *(aoffset4 + 7); aoffset4 += 8; *(boffset1 + 0) = -ctemp01; *(boffset1 + 1) = -ctemp02; *(boffset1 + 2) = -ctemp03; *(boffset1 + 3) = -ctemp04; *(boffset1 + 4) = -ctemp05; *(boffset1 + 5) = -ctemp06; *(boffset1 + 6) = -ctemp07; *(boffset1 + 7) = -ctemp08; *(boffset1 + 8) = -ctemp09; *(boffset1 + 9) = -ctemp10; *(boffset1 + 10) = -ctemp11; *(boffset1 + 11) = -ctemp12; *(boffset1 + 12) = -ctemp13; *(boffset1 + 13) = -ctemp14; *(boffset1 + 14) = -ctemp15; *(boffset1 + 15) = -ctemp16; *(boffset1 + 16) = -ctemp17; *(boffset1 + 17) = -ctemp18; *(boffset1 + 18) = -ctemp19; *(boffset1 + 19) = -ctemp20; *(boffset1 + 20) = -ctemp21; *(boffset1 + 21) = -ctemp22; *(boffset1 + 22) = -ctemp23; *(boffset1 + 23) = -ctemp24; *(boffset1 + 24) = -ctemp25; 
*(boffset1 + 25) = -ctemp26; *(boffset1 + 26) = -ctemp27; *(boffset1 + 27) = -ctemp28; *(boffset1 + 28) = -ctemp29; *(boffset1 + 29) = -ctemp30; *(boffset1 + 30) = -ctemp31; *(boffset1 + 31) = -ctemp32; boffset1 += 8 * m; i --; }while(i > 0); } if (n & 4) { ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); aoffset1 += 4; ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); aoffset2 += 4; ctemp09 = *(aoffset3 + 0); ctemp10 = *(aoffset3 + 1); ctemp11 = *(aoffset3 + 2); ctemp12 = *(aoffset3 + 3); aoffset3 += 4; ctemp13 = *(aoffset4 + 0); ctemp14 = *(aoffset4 + 1); ctemp15 = *(aoffset4 + 2); ctemp16 = *(aoffset4 + 3); aoffset4 += 4; *(boffset2 + 0) = -ctemp01; *(boffset2 + 1) = -ctemp02; *(boffset2 + 2) = -ctemp03; *(boffset2 + 3) = -ctemp04; *(boffset2 + 4) = -ctemp05; *(boffset2 + 5) = -ctemp06; *(boffset2 + 6) = -ctemp07; *(boffset2 + 7) = -ctemp08; *(boffset2 + 8) = -ctemp09; *(boffset2 + 9) = -ctemp10; *(boffset2 + 10) = -ctemp11; *(boffset2 + 11) = -ctemp12; *(boffset2 + 12) = -ctemp13; *(boffset2 + 13) = -ctemp14; *(boffset2 + 14) = -ctemp15; *(boffset2 + 15) = -ctemp16; boffset2 += 16; } if (n & 2){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); aoffset1 += 2; ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); aoffset2 += 2; ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); aoffset3 += 2; ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); aoffset4 += 2; *(boffset3 + 0) = -ctemp01; *(boffset3 + 1) = -ctemp02; *(boffset3 + 2) = -ctemp03; *(boffset3 + 3) = -ctemp04; *(boffset3 + 4) = -ctemp05; *(boffset3 + 5) = -ctemp06; *(boffset3 + 6) = -ctemp07; *(boffset3 + 7) = -ctemp08; boffset3 += 8; } if (n & 1){ ctemp01 = *(aoffset1 + 0); aoffset1 ++; ctemp02 = *(aoffset2 + 0); aoffset2 ++; ctemp03 = *(aoffset3 + 0); aoffset3 ++; ctemp04 = *(aoffset4 + 0); aoffset4 ++; *(boffset4 + 0) = -ctemp01; *(boffset4 + 1) = -ctemp02; *(boffset4 + 2) = -ctemp03; *(boffset4 + 3) = -ctemp04; boffset4 += 4; } } if (m & 2){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset += 2 * lda; boffset1 = boffset; boffset += 16; i = (n >> 3); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); aoffset1 += 8; ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); ctemp12 = *(aoffset2 + 3); ctemp13 = *(aoffset2 + 4); ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); aoffset2 += 8; *(boffset1 + 0) = -ctemp01; *(boffset1 + 1) = -ctemp02; *(boffset1 + 2) = -ctemp03; *(boffset1 + 3) = -ctemp04; *(boffset1 + 4) = -ctemp05; *(boffset1 + 5) = -ctemp06; *(boffset1 + 6) = -ctemp07; *(boffset1 + 7) = -ctemp08; *(boffset1 + 8) = -ctemp09; *(boffset1 + 9) = -ctemp10; *(boffset1 + 10) = -ctemp11; *(boffset1 + 11) = -ctemp12; *(boffset1 + 12) = -ctemp13; *(boffset1 + 13) = -ctemp14; *(boffset1 + 14) = -ctemp15; *(boffset1 + 15) = -ctemp16; boffset1 += 8 * m; i --; }while(i > 0); } if (n & 4){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); aoffset1 += 4; ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); aoffset2 += 4; *(boffset2 + 0) = -ctemp01; *(boffset2 + 1) = -ctemp02; *(boffset2 + 2) = -ctemp03; *(boffset2 + 3) = -ctemp04; *(boffset2 + 4) = -ctemp05; 
*(boffset2 + 5) = -ctemp06; *(boffset2 + 6) = -ctemp07; *(boffset2 + 7) = -ctemp08; boffset2 += 8; } if (n & 2){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); aoffset1 += 2; ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); aoffset2 += 2; *(boffset3 + 0) = -ctemp01; *(boffset3 + 1) = -ctemp02; *(boffset3 + 2) = -ctemp03; *(boffset3 + 3) = -ctemp04; boffset3 += 4; } if (n & 1){ ctemp01 = *(aoffset1 + 0); aoffset1 ++; ctemp02 = *(aoffset2 + 0); aoffset2 ++; *(boffset4 + 0) = -ctemp01; *(boffset4 + 1) = -ctemp02; boffset4 += 2; } } if (m & 1){ aoffset1 = aoffset; aoffset += lda; boffset1 = boffset; boffset += 8; i = (n >> 3); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); aoffset1 += 8; *(boffset1 + 0) = -ctemp01; *(boffset1 + 1) = -ctemp02; *(boffset1 + 2) = -ctemp03; *(boffset1 + 3) = -ctemp04; *(boffset1 + 4) = -ctemp05; *(boffset1 + 5) = -ctemp06; *(boffset1 + 6) = -ctemp07; *(boffset1 + 7) = -ctemp08; boffset1 += 8 * m; i --; }while(i > 0); } if (n & 4){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); aoffset1 += 4; *(boffset2 + 0) = -ctemp01; *(boffset2 + 1) = -ctemp02; *(boffset2 + 2) = -ctemp03; *(boffset2 + 3) = -ctemp04; boffset2 += 4; } if (n & 2){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); aoffset1 += 2; *(boffset3 + 0) = -ctemp01; *(boffset3 + 1) = -ctemp02; boffset3 += 2; } if (n & 1){ ctemp01 = *(aoffset1 + 0); aoffset1 ++; *(boffset4 + 0) = -ctemp01; boffset4 ++; } } return 0; } OpenBLAS-0.2.20/kernel/generic/symm_lcopy_1.c000066400000000000000000000062571313527062700206310ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01; FLOAT *ao1; js = n; while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); if (offset > 0) ao1 += lda; else ao1 ++; b[ 0] = data01; b += 1; offset --; i --; } posX += 1; js --; } return 0; } OpenBLAS-0.2.20/kernel/generic/symm_lcopy_16.c000066400000000000000000000233611313527062700207120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; js = (n >> 4); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; if (offset > -4) ao5 = a + posX + 4 + posY * lda; else ao5 = a + posY + (posX + 4) * lda; if (offset > -5) ao6 = a + posX + 5 + posY * lda; else ao6 = a + posY + (posX + 5) * lda; if (offset > -6) ao7 = a + posX + 6 + posY * lda; else ao7 = a + posY + (posX + 6) * lda; if (offset > -7) ao8 = a + posX + 7 + posY * lda; else ao8 = a + posY + (posX + 7) * lda; if (offset > -8) ao9 = a + posX + 8 + posY * lda; else ao9 = a + posY + (posX + 8) * lda; if (offset > -9) ao10 = a + posX + 9 + posY * lda; else ao10 = a + posY + (posX + 9) * lda; if (offset > -10) ao11 = a + posX + 10 + posY * lda; else ao11 = a + posY + (posX + 10) * lda; if (offset > -11) ao12 = a + posX + 11 + posY * lda; else ao12 = a + posY + (posX + 11) * lda; if (offset > -12) ao13 = a + posX + 12 + posY * lda; else ao13 = a + posY + (posX + 12) * lda; if (offset > -13) ao14 = a + posX + 13 + posY * lda; else ao14 = a + posY + (posX + 13) * lda; if (offset > -14) ao15 = a + posX + 14 + posY * lda; else ao15 = a + posY + (posX + 14) * lda; if (offset > -15) ao16 = a + posX + 15 + posY * lda; else ao16 = a + posY + (posX + 15) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); data05 = *(ao5 + 0); data06 = *(ao6 + 0); data07 = *(ao7 + 0); data08 = *(ao8 + 0); data09 = *(ao9 + 0); data10 = *(ao10 + 0); data11 = *(ao11 + 0); data12 = *(ao12 + 0); data13 = *(ao13 + 0); data14 = *(ao14 + 0); data15 = *(ao15 + 0); data16 = *(ao16 + 0); if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; if (offset > -2) ao3 += lda; else ao3 ++; if (offset > -3) ao4 += lda; else ao4 ++; if (offset > -4) ao5 += lda; else ao5 ++; if (offset > -5) ao6 += lda; else ao6 ++; if (offset > -6) ao7 += lda; else ao7 ++; if (offset > -7) ao8 += lda; else ao8 ++; if (offset > -8) ao9 += lda; else ao9 ++; if (offset > -9) ao10 += lda; else ao10 ++; if (offset > -10) ao11 += lda; else ao11 ++; if (offset > -11) ao12 += lda; else ao12 ++; if (offset > -12) ao13 += lda; else ao13 ++; if (offset > -13) ao14 += lda; else ao14 ++; if (offset > -14) ao15 += lda; else ao15 ++; if (offset > -15) ao16 += lda; else ao16 ++; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; b += 16; offset --; i --; } posX += 16; js --; } if (n & 8) { offset = posX - posY; if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a 
+ posY + (posX + 1) * lda; if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; if (offset > -4) ao5 = a + posX + 4 + posY * lda; else ao5 = a + posY + (posX + 4) * lda; if (offset > -5) ao6 = a + posX + 5 + posY * lda; else ao6 = a + posY + (posX + 5) * lda; if (offset > -6) ao7 = a + posX + 6 + posY * lda; else ao7 = a + posY + (posX + 6) * lda; if (offset > -7) ao8 = a + posX + 7 + posY * lda; else ao8 = a + posY + (posX + 7) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); data05 = *(ao5 + 0); data06 = *(ao6 + 0); data07 = *(ao7 + 0); data08 = *(ao8 + 0); if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; if (offset > -2) ao3 += lda; else ao3 ++; if (offset > -3) ao4 += lda; else ao4 ++; if (offset > -4) ao5 += lda; else ao5 ++; if (offset > -5) ao6 += lda; else ao6 ++; if (offset > -6) ao7 += lda; else ao7 ++; if (offset > -7) ao8 += lda; else ao8 ++; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b += 8; offset --; i --; } posX += 8; } if (n & 4) { offset = posX - posY; if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; if (offset > -2) ao3 += lda; else ao3 ++; if (offset > -3) ao4 += lda; else ao4 ++; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 4; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); if (offset > 0) ao1 += lda; else ao1 ++; b[ 0] = data01; b ++; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/symm_lcopy_2.c000066400000000000000000000073141313527062700206250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02; FLOAT *ao1, *ao2; js = (n >> 1); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; js --; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); if (offset > 0) ao1 += lda; else ao1 ++; b[ 0] = data01; b ++; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/symm_lcopy_4.c000066400000000000000000000112021313527062700206160ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04; FLOAT *ao1, *ao2, *ao3, *ao4; js = (n >> 2); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; if (offset > -2) ao3 += lda; else ao3 ++; if (offset > -3) ao4 += lda; else ao4 ++; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 4; js --; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); if (offset > 0) ao1 += lda; else ao1 ++; b[ 0] = data01; b ++; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/symm_lcopy_6.c000066400000000000000000000112021313527062700206200ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
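Note on the remaining lower-storage variants: symm_lcopy_2.c, symm_lcopy_4.c, symm_lcopy_6.c and symm_lcopy_8.c are the same packing loop unrolled to narrower panel widths, with matching n & ... tails. As shipped, the _6 variant actually packs four columns per pass (js = n >> 2 with four pointers ao1 .. ao4), so it is effectively identical to the _4 file.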
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04; FLOAT *ao1, *ao2, *ao3, *ao4; js = (n >> 2); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; if (offset > -2) ao3 += lda; else ao3 ++; if (offset > -3) ao4 += lda; else ao4 ++; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 4; js --; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); if (offset > 0) ao1 += lda; else ao1 ++; b[ 0] = data01; b ++; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/symm_lcopy_8.c000066400000000000000000000145221313527062700206320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; js = (n >> 3); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; if (offset > -4) ao5 = a + posX + 4 + posY * lda; else ao5 = a + posY + (posX + 4) * lda; if (offset > -5) ao6 = a + posX + 5 + posY * lda; else ao6 = a + posY + (posX + 5) * lda; if (offset > -6) ao7 = a + posX + 6 + posY * lda; else ao7 = a + posY + (posX + 6) * lda; if (offset > -7) ao8 = a + posX + 7 + posY * lda; else ao8 = a + posY + (posX + 7) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); data05 = *(ao5 + 0); data06 = *(ao6 + 0); data07 = *(ao7 + 0); data08 = *(ao8 + 0); if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; if (offset > -2) ao3 += lda; else ao3 ++; if (offset > -3) ao4 += lda; else ao4 ++; if (offset > -4) ao5 += lda; else ao5 ++; if (offset > -5) ao6 += lda; else ao6 ++; if (offset > -6) ao7 += lda; else ao7 ++; if (offset > -7) ao8 += lda; else ao8 ++; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b += 8; offset --; i --; } posX += 8; js --; } if (n & 4) { offset = posX - posY; if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; i 
= m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; if (offset > -2) ao3 += lda; else ao3 ++; if (offset > -3) ao4 += lda; else ao4 ++; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 4; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); if (offset > 0) ao1 += lda; else ao1 ++; b[ 0] = data01; b ++; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/symm_ucopy_1.c000066400000000000000000000062531313527062700206360ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
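Note on this file (symm_ucopy_1.c): the upper-storage counterpart of symm_lcopy_1.c. Here offset > 0 means the element lies above the diagonal and is read directly from the stored upper triangle with unit stride down the column; on and below the diagonal the value is fetched from its mirror above the diagonal, stepping by lda. The _2, _4, _6, _8 and _16 files that follow are the same loop at wider unrollings.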
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01; FLOAT *ao1; js = n; while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); if (offset > 0) ao1 ++; else ao1 += lda; b[ 0] = data01; b ++; offset --; i --; } posX ++; js --; } return 0; } OpenBLAS-0.2.20/kernel/generic/symm_ucopy_16.c000066400000000000000000000233731313527062700207260ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; js = (n >> 4); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; if (offset > -4) ao5 = a + posY + (posX + 4) * lda; else ao5 = a + posX + 4 + posY * lda; if (offset > -5) ao6 = a + posY + (posX + 5) * lda; else ao6 = a + posX + 5 + posY * lda; if (offset > -6) ao7 = a + posY + (posX + 6) * lda; else ao7 = a + posX + 6 + posY * lda; if (offset > -7) ao8 = a + posY + (posX + 7) * lda; else ao8 = a + posX + 7 + posY * lda; if (offset > -8) ao9 = a + posY + (posX + 8) * lda; else ao9 = a + posX + 8 + posY * lda; if (offset > -9) ao10 = a + posY + (posX + 9) * lda; else ao10 = a + posX + 9 + posY * lda; if (offset > -10) ao11 = a + posY + (posX + 10) * lda; else ao11 = a + posX + 10 + posY * lda; if (offset > -11) ao12 = a + posY + (posX + 11) * lda; else ao12 = a + posX + 11 + posY * lda; if (offset > -12) ao13 = a + posY + (posX + 12) * lda; else ao13 = a + posX + 12 + posY * lda; if (offset > -13) ao14 = a + posY + (posX + 13) * lda; else ao14 = a + posX + 13 + posY * lda; if (offset > -14) ao15 = a + posY + (posX + 14) * lda; else ao15 = a + posX + 14 + posY * lda; if (offset > -15) ao16 = a + posY + (posX + 15) * lda; else ao16 = a + posX + 15 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); data05 = *(ao5 + 0); data06 = *(ao6 + 0); data07 = *(ao7 + 0); data08 = *(ao8 + 0); data09 = *(ao9 + 0); data10 = *(ao10 + 0); data11 = *(ao11 + 0); data12 = *(ao12 + 0); data13 = *(ao13 + 0); data14 = *(ao14 + 0); data15 = *(ao15 + 0); data16 = *(ao16 + 0); if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; if (offset > -2) ao3 ++; else ao3 += lda; if (offset > -3) ao4 ++; else ao4 += lda; if (offset > -4) ao5 ++; else ao5 += lda; if (offset > -5) ao6 ++; else ao6 += lda; if (offset > -6) ao7 ++; else ao7 += lda; if (offset > -7) ao8 ++; else ao8 += lda; if (offset > -8) ao9 ++; else ao9 += lda; if (offset > -9) ao10 ++; else ao10 += lda; if (offset > -10) ao11 ++; else ao11 += lda; if (offset > -11) ao12 ++; else ao12 += lda; if (offset > -12) ao13 ++; else ao13 += lda; if (offset > -13) ao14 ++; else ao14 += lda; if (offset > -14) ao15 ++; else ao15 += lda; if (offset > -15) ao16 ++; else ao16 += lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; b += 16; offset --; i --; } posX += 16; js --; } if (n & 8) { offset = posX - posY; if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = 
a + posX + 1 + posY * lda; if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; if (offset > -4) ao5 = a + posY + (posX + 4) * lda; else ao5 = a + posX + 4 + posY * lda; if (offset > -5) ao6 = a + posY + (posX + 5) * lda; else ao6 = a + posX + 5 + posY * lda; if (offset > -6) ao7 = a + posY + (posX + 6) * lda; else ao7 = a + posX + 6 + posY * lda; if (offset > -7) ao8 = a + posY + (posX + 7) * lda; else ao8 = a + posX + 7 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); data05 = *(ao5 + 0); data06 = *(ao6 + 0); data07 = *(ao7 + 0); data08 = *(ao8 + 0); if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; if (offset > -2) ao3 ++; else ao3 += lda; if (offset > -3) ao4 ++; else ao4 += lda; if (offset > -4) ao5 ++; else ao5 += lda; if (offset > -5) ao6 ++; else ao6 += lda; if (offset > -6) ao7 ++; else ao7 += lda; if (offset > -7) ao8 ++; else ao8 += lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b += 8; offset --; i --; } posX += 8; } if (n & 4) { offset = posX - posY; if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; if (offset > -2) ao3 ++; else ao3 += lda; if (offset > -3) ao4 ++; else ao4 += lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 4; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); if (offset > 0) ao1 ++; else ao1 += lda; b[ 0] = data01; b ++; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/symm_ucopy_2.c000066400000000000000000000073131313527062700206350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02; FLOAT *ao1, *ao2; js = (n >> 1); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; js --; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); if (offset > 0) ao1 ++; else ao1 += lda; b[ 0] = data01; b ++; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/symm_ucopy_4.c000066400000000000000000000112001313527062700206250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04; FLOAT *ao1, *ao2, *ao3, *ao4; js = (n >> 2); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; if (offset > -2) ao3 ++; else ao3 += lda; if (offset > -3) ao4 ++; else ao4 += lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 4; js --; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); if (offset > 0) ao1 ++; else ao1 += lda; b[ 0] = data01; b ++; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/symm_ucopy_6.c000066400000000000000000000112001313527062700206270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04; FLOAT *ao1, *ao2, *ao3, *ao4; js = (n >> 2); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; if (offset > -2) ao3 ++; else ao3 += lda; if (offset > -3) ao4 ++; else ao4 += lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 4; js --; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); if (offset > 0) ao1 ++; else ao1 += lda; b[ 0] = data01; b ++; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/symm_ucopy_8.c000066400000000000000000000145221313527062700206430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; js = (n >> 3); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; if (offset > -4) ao5 = a + posY + (posX + 4) * lda; else ao5 = a + posX + 4 + posY * lda; if (offset > -5) ao6 = a + posY + (posX + 5) * lda; else ao6 = a + posX + 5 + posY * lda; if (offset > -6) ao7 = a + posY + (posX + 6) * lda; else ao7 = a + posX + 6 + posY * lda; if (offset > -7) ao8 = a + posY + (posX + 7) * lda; else ao8 = a + posX + 7 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); data05 = *(ao5 + 0); data06 = *(ao6 + 0); data07 = *(ao7 + 0); data08 = *(ao8 + 0); if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; if (offset > -2) ao3 ++; else ao3 += lda; if (offset > -3) ao4 ++; else ao4 += lda; if (offset > -4) ao5 ++; else ao5 += lda; if (offset > -5) ao6 ++; else ao6 += lda; if (offset > -6) ao7 ++; else ao7 += lda; if (offset > -7) ao8 ++; else ao8 += lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b += 8; offset --; i --; } posX += 8; js --; } if (n & 4) { offset = posX - posY; if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; i 
= m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; if (offset > -2) ao3 ++; else ao3 += lda; if (offset > -3) ao4 ++; else ao4 += lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 4; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); if (offset > 0) ao1 ++; else ao1 += lda; b[ 0] = data01; b ++; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/symv_k.c000066400000000000000000000110331313527062700175120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
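Note on this file (symv_k.c): the blocked SYMV driver for the generic kernels. When incx or incy is not 1, x and y are first copied into unit-stride work areas carved out of buffer (the pointer arithmetic with + 4095 and & ~4095 appears to round each area up to a 4 KB boundary). The matrix is then swept in SYMV_P wide blocks: SYMCOPY_L or SYMCOPY_U expands the diagonal block into a dense SYMV_P by SYMV_P tile for a GEMV_N call, and the off-diagonal strip is applied with a GEMV_T and GEMV_N pair; y is copied back at the end when incy != 1.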
*/ /*********************************************************************/ #include #include #include "common.h" #include "symcopy.h" int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ BLASLONG is, min_i; FLOAT *X = x; FLOAT *Y = y; FLOAT *symbuffer = buffer; FLOAT *gemvbuffer = (FLOAT *)(((BLASLONG)buffer + SYMV_P * SYMV_P * sizeof(FLOAT) + 4095) & ~4095); FLOAT *bufferY = gemvbuffer; FLOAT *bufferX = gemvbuffer; if (incy != 1) { Y = bufferY; bufferX = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) + 4095) & ~4095); gemvbuffer = bufferX; COPY_K(m, y, incy, Y, 1); } if (incx != 1) { X = bufferX; gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + m * sizeof(FLOAT) + 4095) & ~4095); COPY_K(m, x, incx, X, 1); } #ifndef LOWER for(is = m - offset; is < m; is += SYMV_P){ min_i = MIN(m - is, SYMV_P); #else for(is = 0; is < offset; is += SYMV_P){ min_i = MIN(offset - is, SYMV_P); #endif #ifndef LOWER if (is >0){ GEMV_T(is, min_i, 0, alpha, a + is * lda, lda, X, 1, Y + is, 1, gemvbuffer); GEMV_N(is, min_i, 0, alpha, a + is * lda, lda, X + is, 1, Y, 1, gemvbuffer); } #endif #ifdef LOWER SYMCOPY_L(min_i, a + is + is * lda, lda, symbuffer); #else SYMCOPY_U(min_i, a + is + is * lda, lda, symbuffer); #endif GEMV_N(min_i, min_i, 0, alpha, symbuffer, min_i, X + is, 1, Y + is, 1, gemvbuffer); #ifdef LOWER if (m - is > min_i){ GEMV_T(m - is - min_i, min_i, 0, alpha, a + (is + min_i) + is * lda, lda, X + (is + min_i), 1, Y + is, 1, gemvbuffer); GEMV_N(m - is - min_i, min_i, 0, alpha, a + (is + min_i) + is * lda, lda, X + is, 1, Y + (is + min_i), 1, gemvbuffer); } #endif } /* end of is */ if (incy != 1) { COPY_K(m, Y, 1, y, incy); } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_lncopy_1.c000066400000000000000000000065741313527062700210030ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
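Note on this file (trmm_lncopy_1.c): packs a lower triangular, non-transposed matrix for TRMM one column at a time. Entries below the diagonal are copied as stored, positions above the diagonal are skipped with the pointers simply advanced, and the diagonal entry is copied, or emitted as ONE when UNIT is defined.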
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, X; FLOAT data01; FLOAT *ao1; while (n > 0) { X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; } else { ao1 = a + posX + (posY + 0) * lda; } i = m; if (i > 0) { do { if (X > posY) { data01 = *(ao1 + 0); b[ 0] = data01; ao1 += 1; b += 1; } else if (X < posY) { ao1 += lda; b += 1; } else { #ifdef UNIT b[ 0] = ONE; #else data01 = *(ao1 + 0); b[ 0] = data01; #endif b += 1; ao1 += 1; } X ++; i --; } while (i > 0); } posY += 1; n --; } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_lncopy_16.c000066400000000000000000000775361313527062700210770ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
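Note on this file (trmm_lncopy_16.c): the 16-column unrolled form of the same TRMM packing. Full 16 by 16 blocks strictly below the diagonal are copied, blocks strictly above it are skipped, and the block that straddles the diagonal is written out explicitly, with ZERO above the diagonal and, when UNIT is defined, ONE on the diagonal itself.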
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X, ii; FLOAT *a01, *a02, *a03 ,*a04, *a05, *a06, *a07, *a08; FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; js = (n >> 4); if (js > 0){ do { X = posX; if (posX <= posY) { a01 = a + posY + (posX + 0) * lda; a02 = a + posY + (posX + 1) * lda; a03 = a + posY + (posX + 2) * lda; a04 = a + posY + (posX + 3) * lda; a05 = a + posY + (posX + 4) * lda; a06 = a + posY + (posX + 5) * lda; a07 = a + posY + (posX + 6) * lda; a08 = a + posY + (posX + 7) * lda; a09 = a + posY + (posX + 8) * lda; a10 = a + posY + (posX + 9) * lda; a11 = a + posY + (posX + 10) * lda; a12 = a + posY + (posX + 11) * lda; a13 = a + posY + (posX + 12) * lda; a14 = a + posY + (posX + 13) * lda; a15 = a + posY + (posX + 14) * lda; a16 = a + posY + (posX + 15) * lda; } else { a01 = a + posX + (posY + 0) * lda; a02 = a + posX + (posY + 1) * lda; a03 = a + posX + (posY + 2) * lda; a04 = a + posX + (posY + 3) * lda; a05 = a + posX + (posY + 4) * lda; a06 = a + posX + (posY + 5) * lda; a07 = a + posX + (posY + 6) * lda; a08 = a + posX + (posY + 7) * lda; a09 = a + posX + (posY + 8) * lda; a10 = a + posX + (posY + 9) * lda; a11 = a + posX + (posY + 10) * lda; a12 = a + posX + (posY + 11) * lda; a13 = a + posX + (posY + 12) * lda; a14 = a + posX + (posY + 13) * lda; a15 = a + posX + (posY + 14) * lda; a16 = a + posX + (posY + 15) * lda; } i = (m >> 4); if (i > 0) { do { if (X > posY) { for (ii = 0; ii < 16; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); b[ 4] = *(a05 + 0); b[ 5] = *(a06 + 0); b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); b[ 8] = *(a09 + 0); b[ 9] = *(a10 + 0); b[ 10] = *(a11 + 0); b[ 11] = *(a12 + 0); b[ 12] = *(a13 + 0); b[ 13] = *(a14 + 0); b[ 14] = *(a15 + 0); b[ 15] = *(a16 + 0); a01 ++; a02 ++; a03 ++; a04 ++; a05 ++; a06 ++; a07 ++; a08 ++; a09 ++; a10 ++; a11 ++; a12 ++; a13 ++; a14 ++; a15 ++; a16 ++; b += 16; } } else if (X < posY) { a01 += 16 * lda; a02 += 16 * lda; a03 += 16 * lda; a04 += 16 * lda; a05 += 16 * lda; a06 += 16 * lda; a07 += 16 * lda; a08 += 16 * lda; a09 += 16 * lda; a10 += 16 * lda; a11 += 16 * lda; a12 += 16 * lda; a13 += 16 * lda; a14 += 16 * lda; a15 += 16 * lda; a16 += 16 * lda; b += 256; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[ 10] = ZERO; b[ 11] = ZERO; b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; b[ 16] = *(a01 + 1); #ifdef UNIT b[ 17] = ONE; #else b[ 17] = *(a02 + 1); #endif b[ 18] = ZERO; b[ 19] = ZERO; b[ 20] = ZERO; b[ 21] = ZERO; b[ 22] = ZERO; b[ 23] = ZERO; b[ 24] = ZERO; b[ 25] = ZERO; b[ 26] = ZERO; b[ 27] = ZERO; b[ 28] = ZERO; b[ 29] = ZERO; b[ 30] = ZERO; b[ 31] = ZERO; b[ 32] = *(a01 + 2); b[ 33] = *(a02 + 2); #ifdef UNIT b[ 34] = ONE; #else b[ 34] = *(a03 + 2); #endif b[ 35] = ZERO; b[ 36] = ZERO; b[ 37] = ZERO; b[ 38] = ZERO; b[ 39] = ZERO; b[ 40] = ZERO; b[ 41] = ZERO; b[ 42] = ZERO; b[ 43] = ZERO; b[ 44] = ZERO; b[ 45] = ZERO; b[ 46] = ZERO; b[ 47] = ZERO; b[ 48] = *(a01 + 3); b[ 49] = *(a02 + 3); b[ 50] = *(a03 + 3); #ifdef UNIT b[ 51] = ONE; #else b[ 51] = *(a04 + 3); #endif b[ 52] = ZERO; b[ 53] = ZERO; b[ 54] = ZERO; b[ 55] = ZERO; b[ 56] = ZERO; b[ 57] = ZERO; b[ 58] = ZERO; b[ 59] = ZERO; b[ 60] = ZERO; b[ 61] = ZERO; b[ 62] = ZERO; 
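/* Diagonal block of the packed tile: each 16-wide row r copies its r leading entries from a01 .. a16 at row offset r, places ONE (or the stored diagonal value when UNIT is not defined) on the diagonal, and fills the remaining positions with ZERO. */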
b[ 63] = ZERO; b[ 64] = *(a01 + 4); b[ 65] = *(a02 + 4); b[ 66] = *(a03 + 4); b[ 67] = *(a04 + 4); #ifdef UNIT b[ 68] = ONE; #else b[ 68] = *(a05 + 4); #endif b[ 69] = ZERO; b[ 70] = ZERO; b[ 71] = ZERO; b[ 72] = ZERO; b[ 73] = ZERO; b[ 74] = ZERO; b[ 75] = ZERO; b[ 76] = ZERO; b[ 77] = ZERO; b[ 78] = ZERO; b[ 79] = ZERO; b[ 80] = *(a01 + 5); b[ 81] = *(a02 + 5); b[ 82] = *(a03 + 5); b[ 83] = *(a04 + 5); b[ 84] = *(a05 + 5); #ifdef UNIT b[ 85] = ONE; #else b[ 85] = *(a06 + 5); #endif b[ 86] = ZERO; b[ 87] = ZERO; b[ 88] = ZERO; b[ 89] = ZERO; b[ 90] = ZERO; b[ 91] = ZERO; b[ 92] = ZERO; b[ 93] = ZERO; b[ 94] = ZERO; b[ 95] = ZERO; b[ 96] = *(a01 + 6); b[ 97] = *(a02 + 6); b[ 98] = *(a03 + 6); b[ 99] = *(a04 + 6); b[100] = *(a05 + 6); b[101] = *(a06 + 6); #ifdef UNIT b[102] = ONE; #else b[102] = *(a07 + 6); #endif b[103] = ZERO; b[104] = ZERO; b[105] = ZERO; b[106] = ZERO; b[107] = ZERO; b[108] = ZERO; b[109] = ZERO; b[110] = ZERO; b[111] = ZERO; b[112] = *(a01 + 7); b[113] = *(a02 + 7); b[114] = *(a03 + 7); b[115] = *(a04 + 7); b[116] = *(a05 + 7); b[117] = *(a06 + 7); b[118] = *(a07 + 7); #ifdef UNIT b[119] = ONE; #else b[119] = *(a08 + 7); #endif b[120] = ZERO; b[121] = ZERO; b[122] = ZERO; b[123] = ZERO; b[124] = ZERO; b[125] = ZERO; b[126] = ZERO; b[127] = ZERO; b[128] = *(a01 + 8); b[129] = *(a02 + 8); b[130] = *(a03 + 8); b[131] = *(a04 + 8); b[132] = *(a05 + 8); b[133] = *(a06 + 8); b[134] = *(a07 + 8); b[135] = *(a08 + 8); #ifdef UNIT b[136] = ONE; #else b[136] = *(a09 + 8); #endif b[137] = ZERO; b[138] = ZERO; b[139] = ZERO; b[140] = ZERO; b[141] = ZERO; b[142] = ZERO; b[143] = ZERO; b[144] = *(a01 + 9); b[145] = *(a02 + 9); b[146] = *(a03 + 9); b[147] = *(a04 + 9); b[148] = *(a05 + 9); b[149] = *(a06 + 9); b[150] = *(a07 + 9); b[151] = *(a08 + 9); b[152] = *(a09 + 9); #ifdef UNIT b[153] = ONE; #else b[153] = *(a10 + 9); #endif b[154] = ZERO; b[155] = ZERO; b[156] = ZERO; b[157] = ZERO; b[158] = ZERO; b[159] = ZERO; b[160] = *(a01 + 10); b[161] = *(a02 + 10); b[162] = *(a03 + 10); b[163] = *(a04 + 10); b[164] = *(a05 + 10); b[165] = *(a06 + 10); b[166] = *(a07 + 10); b[167] = *(a08 + 10); b[168] = *(a09 + 10); b[169] = *(a10 + 10); #ifdef UNIT b[170] = ONE; #else b[170] = *(a11 + 10); #endif b[171] = ZERO; b[172] = ZERO; b[173] = ZERO; b[174] = ZERO; b[175] = ZERO; b[176] = *(a01 + 11); b[177] = *(a02 + 11); b[178] = *(a03 + 11); b[179] = *(a04 + 11); b[180] = *(a05 + 11); b[181] = *(a06 + 11); b[182] = *(a07 + 11); b[183] = *(a08 + 11); b[184] = *(a09 + 11); b[185] = *(a10 + 11); b[186] = *(a11 + 11); #ifdef UNIT b[187] = ONE; #else b[187] = *(a12 + 11); #endif b[188] = ZERO; b[189] = ZERO; b[190] = ZERO; b[191] = ZERO; b[192] = *(a01 + 12); b[193] = *(a02 + 12); b[194] = *(a03 + 12); b[195] = *(a04 + 12); b[196] = *(a05 + 12); b[197] = *(a06 + 12); b[198] = *(a07 + 12); b[199] = *(a08 + 12); b[200] = *(a09 + 12); b[201] = *(a10 + 12); b[202] = *(a11 + 12); b[203] = *(a12 + 12); #ifdef UNIT b[204] = ONE; #else b[204] = *(a13 + 12); #endif b[205] = ZERO; b[206] = ZERO; b[207] = ZERO; b[208] = *(a01 + 13); b[209] = *(a02 + 13); b[210] = *(a03 + 13); b[211] = *(a04 + 13); b[212] = *(a05 + 13); b[213] = *(a06 + 13); b[214] = *(a07 + 13); b[215] = *(a08 + 13); b[216] = *(a09 + 13); b[217] = *(a10 + 13); b[218] = *(a11 + 13); b[219] = *(a12 + 13); b[220] = *(a13 + 13); #ifdef UNIT b[221] = ONE; #else b[221] = *(a14 + 13); #endif b[222] = ZERO; b[223] = ZERO; b[224] = *(a01 + 14); b[225] = *(a02 + 14); b[226] = *(a03 + 14); b[227] = *(a04 + 14); b[228] = *(a05 + 14); b[229] = *(a06 + 
14); b[230] = *(a07 + 14); b[231] = *(a08 + 14); b[232] = *(a09 + 14); b[233] = *(a10 + 14); b[234] = *(a11 + 14); b[235] = *(a12 + 14); b[236] = *(a13 + 14); b[237] = *(a14 + 14); #ifdef UNIT b[238] = ONE; #else b[238] = *(a15 + 14); #endif b[239] = ZERO; b[240] = *(a01 + 15); b[241] = *(a02 + 15); b[242] = *(a03 + 15); b[243] = *(a04 + 15); b[244] = *(a05 + 15); b[245] = *(a06 + 15); b[246] = *(a07 + 15); b[247] = *(a08 + 15); b[248] = *(a09 + 15); b[249] = *(a10 + 15); b[250] = *(a11 + 15); b[251] = *(a12 + 15); b[252] = *(a13 + 15); b[253] = *(a14 + 15); b[254] = *(a15 + 15); #ifdef UNIT b[255] = ONE; #else b[255] = *(a16 + 15); #endif a01 += 16; a02 += 16; a03 += 16; a04 += 16; a05 += 16; a06 += 16; a07 += 16; a08 += 16; a09 += 16; a10 += 16; a11 += 16; a12 += 16; a13 += 16; a14 += 16; a15 += 16; a16 += 16; b += 256; } X += 16; i --; } while (i > 0); } i = (m & 15); if (i) { if (X > posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); b[ 4] = *(a05 + 0); b[ 5] = *(a06 + 0); b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); b[ 8] = *(a09 + 0); b[ 9] = *(a10 + 0); b[ 10] = *(a11 + 0); b[ 11] = *(a12 + 0); b[ 12] = *(a13 + 0); b[ 13] = *(a14 + 0); b[ 14] = *(a15 + 0); b[ 15] = *(a16 + 0); a01 ++; a02 ++; a03 ++; a04 ++; a05 ++; a06 ++; a07 ++; a08 ++; a09 ++; a10 ++; a11 ++; a12 ++; a13 ++; a14 ++; a15 ++; a16 ++; b += 16; } } else if (X < posY) { a01 += i * lda; a02 += i * lda; a03 += i * lda; a04 += i * lda; a05 += i * lda; a06 += i * lda; a07 += i * lda; a08 += i * lda; a09 += i * lda; a10 += i * lda; a11 += i * lda; a12 += i * lda; a13 += i * lda; a14 += i * lda; a15 += i * lda; a16 += i * lda; b += 16 * i; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[ 10] = ZERO; b[ 11] = ZERO; b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; b += 16; if (i >= 2) { b[ 0] = *(a01 + 1); #ifdef UNIT b[ 1] = ONE; #else b[ 1] = *(a02 + 1); #endif b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[ 10] = ZERO; b[ 11] = ZERO; b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; b += 16; } if (i >= 3) { b[ 0] = *(a01 + 2); b[ 1] = *(a02 + 2); #ifdef UNIT b[ 2] = ONE; #else b[ 2] = *(a03 + 2); #endif b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[ 10] = ZERO; b[ 11] = ZERO; b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; b += 16; } if (i >= 4) { b[ 0] = *(a01 + 3); b[ 1] = *(a02 + 3); b[ 2] = *(a03 + 3); #ifdef UNIT b[ 3] = ONE; #else b[ 3] = *(a04 + 3); #endif b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[ 10] = ZERO; b[ 11] = ZERO; b[ 11] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; b += 16; } if (i >= 5) { b[ 0] = *(a01 + 4); b[ 1] = *(a02 + 4); b[ 2] = *(a03 + 4); b[ 3] = *(a04 + 4); #ifdef UNIT b[ 4] = ONE; #else b[ 4] = *(a05 + 4); #endif b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[ 10] = ZERO; b[ 11] = ZERO; b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; b += 16; } if (i >= 6) { b[ 0] = *(a01 + 5); b[ 1] = *(a02 + 5); b[ 2] = *(a03 + 5); b[ 3] = *(a04 + 5); b[ 4] = *(a05 + 5); #ifdef UNIT b[ 5] = ONE; #else b[ 5] = *(a06 + 5); #endif b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[ 10] = ZERO; b[ 11] = ZERO; b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = 
ZERO; b += 16; } if (i >= 7) { b[ 0] = *(a01 + 6); b[ 1] = *(a02 + 6); b[ 2] = *(a03 + 6); b[ 3] = *(a04 + 6); b[ 4] = *(a05 + 6); b[ 5] = *(a06 + 6); #ifdef UNIT b[ 6] = ONE; #else b[ 6] = *(a07 + 6); #endif b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[ 10] = ZERO; b[ 11] = ZERO; b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; b += 16; } if (i >= 8) { b[ 0] = *(a01 + 7); b[ 1] = *(a02 + 7); b[ 2] = *(a03 + 7); b[ 3] = *(a04 + 7); b[ 4] = *(a05 + 7); b[ 5] = *(a06 + 7); b[ 6] = *(a07 + 7); #ifdef UNIT b[ 7] = ONE; #else b[ 7] = *(a08 + 7); #endif b[ 8] = ZERO; b[ 9] = ZERO; b[ 10] = ZERO; b[ 11] = ZERO; b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; b += 16; } if (i >= 9) { b[ 0] = *(a01 + 8); b[ 1] = *(a02 + 8); b[ 2] = *(a03 + 8); b[ 3] = *(a04 + 8); b[ 4] = *(a05 + 8); b[ 5] = *(a06 + 8); b[ 6] = *(a07 + 8); b[ 7] = *(a08 + 8); #ifdef UNIT b[ 8] = ONE; #else b[ 8] = *(a09 + 8); #endif b[ 9] = ZERO; b[ 10] = ZERO; b[ 11] = ZERO; b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; b += 16; } if (i >= 10) { b[ 0] = *(a01 + 9); b[ 1] = *(a02 + 9); b[ 2] = *(a03 + 9); b[ 3] = *(a04 + 9); b[ 4] = *(a05 + 9); b[ 5] = *(a06 + 9); b[ 6] = *(a07 + 9); b[ 7] = *(a08 + 9); b[ 8] = *(a09 + 9); #ifdef UNIT b[ 9] = ONE; #else b[ 9] = *(a10 + 9); #endif b[ 10] = ZERO; b[ 11] = ZERO; b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; b += 16; } if (i >= 11) { b[ 0] = *(a01 + 10); b[ 1] = *(a02 + 10); b[ 2] = *(a03 + 10); b[ 3] = *(a04 + 10); b[ 4] = *(a05 + 10); b[ 5] = *(a06 + 10); b[ 6] = *(a07 + 10); b[ 7] = *(a08 + 10); b[ 8] = *(a09 + 10); b[ 9] = *(a10 + 10); #ifdef UNIT b[ 10] = ONE; #else b[ 10] = *(a11 + 10); #endif b[ 11] = ZERO; b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; b += 16; } if (i >= 12) { b[ 0] = *(a01 + 11); b[ 1] = *(a02 + 11); b[ 2] = *(a03 + 11); b[ 3] = *(a04 + 11); b[ 4] = *(a05 + 11); b[ 5] = *(a06 + 11); b[ 6] = *(a07 + 11); b[ 7] = *(a08 + 11); b[ 8] = *(a09 + 11); b[ 9] = *(a10 + 11); b[ 10] = *(a11 + 11); #ifdef UNIT b[ 11] = ONE; #else b[ 11] = *(a12 + 11); #endif b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; b += 16; } if (i >= 13) { b[ 0] = *(a01 + 12); b[ 1] = *(a02 + 12); b[ 2] = *(a03 + 12); b[ 3] = *(a04 + 12); b[ 4] = *(a05 + 12); b[ 5] = *(a06 + 12); b[ 6] = *(a07 + 12); b[ 7] = *(a08 + 12); b[ 8] = *(a09 + 12); b[ 9] = *(a10 + 12); b[ 10] = *(a11 + 12); b[ 11] = *(a12 + 12); #ifdef UNIT b[ 12] = ONE; #else b[ 12] = *(a13 + 12); #endif b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; b += 16; } if (i >= 14) { b[ 0] = *(a01 + 13); b[ 1] = *(a02 + 13); b[ 2] = *(a03 + 13); b[ 3] = *(a04 + 13); b[ 4] = *(a05 + 13); b[ 5] = *(a06 + 13); b[ 6] = *(a07 + 13); b[ 7] = *(a08 + 13); b[ 8] = *(a09 + 13); b[ 9] = *(a10 + 13); b[ 10] = *(a11 + 13); b[ 11] = *(a12 + 13); b[ 12] = *(a13 + 13); #ifdef UNIT b[ 13] = ONE; #else b[ 13] = *(a14 + 13); #endif b[ 14] = ZERO; b[ 15] = ZERO; b += 16; } if (i >= 15) { b[ 0] = *(a01 + 14); b[ 1] = *(a02 + 14); b[ 2] = *(a03 + 14); b[ 3] = *(a04 + 14); b[ 4] = *(a05 + 14); b[ 5] = *(a06 + 14); b[ 6] = *(a07 + 14); b[ 7] = *(a08 + 14); b[ 8] = *(a09 + 14); b[ 9] = *(a10 + 14); b[ 10] = *(a11 + 14); b[ 11] = *(a12 + 14); b[ 12] = *(a13 + 14); b[ 13] = *(a14 + 14); #ifdef UNIT b[ 14] = ONE; #else b[ 14] = *(a15 + 14); #endif b[ 15] = ZERO; b += 16; } } } posY += 16; js --; } while (js > 0); } /* End of main loop */ if (n & 8){ X = posX; if (posX <= posY) { a01 = a + posY + (posX + 0) * lda; a02 = a + posY + (posX + 1) * lda; a03 = a + posY + (posX + 2) * lda; a04 = a + posY + 
(posX + 3) * lda; a05 = a + posY + (posX + 4) * lda; a06 = a + posY + (posX + 5) * lda; a07 = a + posY + (posX + 6) * lda; a08 = a + posY + (posX + 7) * lda; } else { a01 = a + posX + (posY + 0) * lda; a02 = a + posX + (posY + 1) * lda; a03 = a + posX + (posY + 2) * lda; a04 = a + posX + (posY + 3) * lda; a05 = a + posX + (posY + 4) * lda; a06 = a + posX + (posY + 5) * lda; a07 = a + posX + (posY + 6) * lda; a08 = a + posX + (posY + 7) * lda; } i = (m >> 3); if (i > 0) { do { if (X > posY) { for (ii = 0; ii < 8; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); b[ 4] = *(a05 + 0); b[ 5] = *(a06 + 0); b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); a01 ++; a02 ++; a03 ++; a04 ++; a05 ++; a06 ++; a07 ++; a08 ++; b += 8; } } else if (X < posY) { a01 += 8 * lda; a02 += 8 * lda; a03 += 8 * lda; a04 += 8 * lda; a05 += 8 * lda; a06 += 8 * lda; a07 += 8 * lda; a08 += 8 * lda; b += 64; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = *(a01 + 1); #ifdef UNIT b[ 9] = ONE; #else b[ 9] = *(a02 + 1); #endif b[ 10] = ZERO; b[ 11] = ZERO; b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; b[ 16] = *(a01 + 2); b[ 17] = *(a02 + 2); #ifdef UNIT b[ 18] = ONE; #else b[ 18] = *(a03 + 2); #endif b[ 19] = ZERO; b[ 20] = ZERO; b[ 21] = ZERO; b[ 22] = ZERO; b[ 23] = ZERO; b[ 24] = *(a01 + 3); b[ 25] = *(a02 + 3); b[ 26] = *(a03 + 3); #ifdef UNIT b[ 27] = ONE; #else b[ 27] = *(a04 + 3); #endif b[ 28] = ZERO; b[ 29] = ZERO; b[ 30] = ZERO; b[ 31] = ZERO; b[ 32] = *(a01 + 4); b[ 33] = *(a02 + 4); b[ 34] = *(a03 + 4); b[ 35] = *(a04 + 4); #ifdef UNIT b[ 36] = ONE; #else b[ 36] = *(a05 + 4); #endif b[ 37] = ZERO; b[ 38] = ZERO; b[ 39] = ZERO; b[ 40] = *(a01 + 5); b[ 41] = *(a02 + 5); b[ 42] = *(a03 + 5); b[ 43] = *(a04 + 5); b[ 44] = *(a05 + 5); #ifdef UNIT b[ 45] = ONE; #else b[ 45] = *(a06 + 5); #endif b[ 46] = ZERO; b[ 47] = ZERO; b[ 48] = *(a01 + 6); b[ 49] = *(a02 + 6); b[ 50] = *(a03 + 6); b[ 51] = *(a04 + 6); b[ 52] = *(a05 + 6); b[ 53] = *(a06 + 6); #ifdef UNIT b[ 54] = ONE; #else b[ 54] = *(a07 + 6); #endif b[ 55] = ZERO; b[ 56] = *(a01 + 7); b[ 57] = *(a02 + 7); b[ 58] = *(a03 + 7); b[ 59] = *(a04 + 7); b[ 60] = *(a05 + 7); b[ 61] = *(a06 + 7); b[ 62] = *(a07 + 7); #ifdef UNIT b[ 63] = ONE; #else b[ 63] = *(a08 + 7); #endif a01 += 8; a02 += 8; a03 += 8; a04 += 8; a05 += 8; a06 += 8; a07 += 8; a08 += 8; b += 64; } X += 8; i --; } while (i > 0); } i = (m & 7); if (i) { if (X > posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); b[ 4] = *(a05 + 0); b[ 5] = *(a06 + 0); b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); a01 ++; a02 ++; a03 ++; a04 ++; a05 ++; a06 ++; a07 ++; a08 ++; b += 8; } } else if (X < posY) { a01 += i * lda; a02 += i * lda; a03 += i * lda; a04 += i * lda; a05 += i * lda; a06 += i * lda; a07 += i * lda; a08 += i * lda; b += 8 * i; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; if (i >= 2) { b[ 0] = *(a01 + 1); #ifdef UNIT b[ 1] = ONE; #else b[ 1] = *(a02 + 1); #endif b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 3) { b[ 0] = *(a01 + 2); b[ 1] = *(a02 + 2); #ifdef UNIT b[ 2] = ONE; #else b[ 2] = *(a03 + 2); #endif b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 4) 
{ b[ 0] = *(a01 + 3); b[ 1] = *(a02 + 3); b[ 2] = *(a03 + 3); #ifdef UNIT b[ 3] = ONE; #else b[ 3] = *(a04 + 3); #endif b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 5) { b[ 0] = *(a01 + 4); b[ 1] = *(a02 + 4); b[ 2] = *(a03 + 4); b[ 3] = *(a04 + 4); #ifdef UNIT b[ 4] = ONE; #else b[ 4] = *(a05 + 4); #endif b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 6) { b[ 0] = *(a01 + 5); b[ 1] = *(a02 + 5); b[ 2] = *(a03 + 5); b[ 3] = *(a04 + 5); b[ 4] = *(a05 + 5); #ifdef UNIT b[ 5] = ONE; #else b[ 5] = *(a06 + 5); #endif b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 7) { b[ 0] = *(a01 + 6); b[ 1] = *(a02 + 6); b[ 2] = *(a03 + 6); b[ 3] = *(a04 + 6); b[ 4] = *(a05 + 6); b[ 5] = *(a06 + 6); #ifdef UNIT b[ 6] = ONE; #else b[ 6] = *(a07 + 6); #endif b[ 7] = ZERO; b += 8; } } } posY += 8; } if (n & 4){ X = posX; if (posX <= posY) { a01 = a + posY + (posX + 0) * lda; a02 = a + posY + (posX + 1) * lda; a03 = a + posY + (posX + 2) * lda; a04 = a + posY + (posX + 3) * lda; } else { a01 = a + posX + (posY + 0) * lda; a02 = a + posX + (posY + 1) * lda; a03 = a + posX + (posY + 2) * lda; a04 = a + posX + (posY + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X > posY) { for (ii = 0; ii < 4; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); a01 ++; a02 ++; a03 ++; a04 ++; b += 4; } } else if (X < posY) { a01 += 4 * lda; a02 += 4 * lda; a03 += 4 * lda; a04 += 4 * lda; b += 16; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = *(a01 + 1); #ifdef UNIT b[ 5] = ONE; #else b[ 5] = *(a02 + 1); #endif b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = *(a01 + 2); b[ 9] = *(a02 + 2); #ifdef UNIT b[ 10] = ONE; #else b[ 10] = *(a03 + 2); #endif b[ 11] = ZERO; b[ 12] = *(a01 + 3); b[ 13] = *(a02 + 3); b[ 14] = *(a03 + 3); #ifdef UNIT b[ 15] = ONE; #else b[ 15] = *(a04 + 3); #endif a01 += 4; a02 += 4; a03 += 4; a04 += 4; b += 16; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i) { if (X > posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); a01 ++; a02 ++; a03 ++; a04 ++; b += 4; } } else if (X < posY) { a01 += i * lda; a02 += i * lda; a03 += i * lda; a04 += i * lda; b += 4 * i; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b += 4; if (i >= 2) { b[ 0] = *(a01 + 1); #ifdef UNIT b[ 1] = ONE; #else b[ 1] = *(a02 + 1); #endif b[ 2] = ZERO; b[ 3] = ZERO; b += 4; } if (i >= 3) { b[ 0] = *(a01 + 2); b[ 1] = *(a02 + 2); #ifdef UNIT b[ 2] = ONE; #else b[ 2] = *(a03 + 2); #endif b[ 3] = ZERO; b += 4; } } } posY += 4; } if (n & 2){ X = posX; if (posX <= posY) { a01 = a + posY + (posX + 0) * lda; a02 = a + posY + (posX + 1) * lda; } else { a01 = a + posX + (posY + 0) * lda; a02 = a + posX + (posY + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X > posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a01 + 1); b[ 3] = *(a02 + 1); a01 += 2; a02 += 2; b += 4; } else if (X < posY) { a01 += 2 * lda; a02 += 2 * lda; b += 4; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = ZERO; b[ 2] = *(a01 + 1); #ifdef UNIT b[ 3] = ONE; #else b[ 3] = *(a02 + 1); #endif a01 += 2; a02 += 2; b += 4; } X += 2; i --; } while (i > 0); } if (m & 1) { if (X > posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); a01 ++; a02 ++; b += 2; } else if (X < posY) { a01 += lda; a02 += lda; b += 2; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = ZERO; b += 2; 
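/* single leftover diagonal row of the 2-column panel handled above: the (possibly unit) diagonal followed by one explicit zero */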
} } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { a01 = a + posY + (posX + 0) * lda; } else { a01 = a + posX + (posY + 0) * lda; } i = m; if (m > 0) { do { if (X > posY) { b[ 0] = *(a01 + 0); a01 += 1; b += 1; } else if (X < posY) { a01 += lda; b += 1; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b += 1; } X += 1; i --; } while (i > 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_lncopy_2.c000066400000000000000000000117661313527062700210030ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02, data03, data04; FLOAT *ao1, *ao2; js = (n >> 1); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; } else { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data03; b[ 2] = data02; b[ 3] = data04; ao1 += 2; ao2 += 2; b += 4; } else if (X < posY) { ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } else { #ifdef UNIT data02 = *(ao1 + 1); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data02; b[ 3] = ONE; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data04 = *(ao2 + 1); b[ 0] = data01; b[ 1] = ZERO; b[ 2] = data02; b[ 3] = data04; #endif ao1 += 2; ao2 += 2; b += 4; } X += 2; i --; } while (i > 0); } if (m & 1) { if (X > posY) { data01 = *(ao1 + 0); data03 = *(ao2 + 0); b[ 0] = data01; b[ 1] = data03; ao1 += 1; ao2 += 1; b += 2; } else if (X < posY) { ao1 += lda; b += 2; } else { #ifdef UNIT data03 = *(ao2 + 0); b[ 0] = ONE; b[ 1] = data03; #else data01 = *(ao1 + 0); data03 = *(ao2 + 0); b[ 0] = data01; b[ 1] = data03; #endif ao1 += 1; ao2 += 1; b += 2; } } posY += 2; js --; } while (js > 0); } /* End of main loop */ if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; } else { ao1 = a + posX + (posY + 0) * lda; } i = m; if (i > 0) { do { if (X > posY) { data01 = *(ao1 + 0); b[ 0] = data01; ao1 += 1; b += 1; } else if (X < posY) { ao1 += lda; b += 1; } else { #ifdef UNIT b[ 0] = ONE; #else data01 = *(ao1 + 0); b[ 0] = data01; #endif b += 1; ao1 += 1; } X ++; i --; } while (i > 0); } posY += 1; } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_lncopy_4.c000066400000000000000000000237551313527062700210060ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT *ao1, *ao2, *ao3, *ao4; js = (n >> 2); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; ao3 = a + posY + (posX + 2) * lda; ao4 = a + posY + (posX + 3) * lda; } else { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; ao3 = a + posX + (posY + 2) * lda; ao4 = a + posX + (posY + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); data12 = *(ao3 + 3); data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); b[ 0] = data01; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b[ 4] = data02; b[ 5] = data06; b[ 6] = data10; b[ 7] = data14; b[ 8] = data03; b[ 9] = data07; b[10] = data11; b[11] = data15; b[12] = data04; b[13] = data08; b[14] = data12; b[15] = data16; ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } else if (X < posY) { ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 16; } else { #ifdef UNIT data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data07 = *(ao2 + 2); data08 = *(ao2 + 3); data12 = *(ao3 + 3); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data02; b[ 5] = ONE; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = data03; b[ 9] = data07; b[10] = ONE; b[11] = ZERO; b[12] = data04; b[13] = data08; b[14] = data12; b[15] = ONE; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); data11 = *(ao3 + 2); data12 = *(ao3 + 3); data16 = *(ao4 + 3); b[ 0] = data01; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data02; b[ 5] = data06; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = data03; b[ 9] = data07; b[10] = data11; b[11] = ZERO; b[12] = data04; b[13] = data08; b[14] = data12; b[15] = data16; #endif ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i) { if (X > posY) { if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); data05 = *(ao3 + 0); data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); b[ 0] = data01; b[ 1] = 
data03; b[ 2] = data05; b[ 3] = data07; b[ 4] = data02; b[ 5] = data04; b[ 6] = data06; b[ 7] = data08; ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; ao1 += 1; ao2 += 1; ao3 += 1; ao4 += 1; b += 4; } } else if (X < posY) { if (m & 2) { ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } if (m & 1) { ao1 += lda; b += 4; } } else { #ifdef UNIT data05 = *(ao2 + 0); data09 = *(ao3 + 0); data13 = *(ao4 + 0); if (i >= 2) { data10 = *(ao3 + 1); data14 = *(ao4 + 1); } if (i >= 3) { data15 = *(ao4 + 2); } b[ 0] = ONE; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b += 4; if(i >= 2) { b[ 0] = ZERO; b[ 1] = ONE; b[ 2] = data10; b[ 3] = data14; b += 4; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ONE; b[ 3] = data15; b += 4; } #else data01 = *(ao1 + 0); data05 = *(ao2 + 0); data09 = *(ao3 + 0); data13 = *(ao4 + 0); if (i >= 2) { data06 = *(ao2 + 1); data10 = *(ao3 + 1); data14 = *(ao4 + 1); } if (i >= 3) { data11 = *(ao3 + 2); data15 = *(ao4 + 2); } b[ 0] = data01; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b += 4; if(i >= 2) { b[ 0] = ZERO; b[ 1] = data06; b[ 2] = data10; b[ 3] = data14; b += 4; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = data11; b[ 3] = data15; b += 4; } #endif } } posY += 4; js --; } while (js > 0); } /* End of main loop */ if (n & 2){ X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; } else { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data05 = *(ao2 + 0); data06 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data05; b[ 2] = data02; b[ 3] = data06; ao1 += 2; ao2 += 2; b += 4; } else if (X < posY) { ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } else { #ifdef UNIT data02 = *(ao1 + 1); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data02; b[ 3] = ONE; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data06 = *(ao2 + 1); b[ 0] = data01; b[ 1] = ZERO; b[ 2] = data02; b[ 3] = data06; #endif ao1 += 2; ao2 += 2; b += 4; } X += 2; i --; } while (i > 0); } i = (m & 1); if (i) { if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); b[ 0] = data01; b[ 1] = data02; ao1 += 1; ao2 += 1; b += 2; } else if (X < posY) { ao1 += lda; b += 2; } else { #ifdef UNIT data05 = *(ao2 + 0); b[ 0] = ONE; b[ 1] = data05; #else data01 = *(ao1 + 0); data05 = *(ao2 + 0); b[ 0] = data01; b[ 1] = data05; #endif b += 2; } } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; } else { ao1 = a + posX + (posY + 0) * lda; } i = m; if (i > 0) { do { if (X > posY) { data01 = *(ao1 + 0); b[ 0] = data01; b += 1; ao1 += 1; } else if (X < posY) { b += 1; ao1 += lda; } else { #ifdef UNIT b[ 0] = ONE; #else data01 = *(ao1 + 0); b[ 0] = data01; #endif b += 1; ao1 += 1; } X ++; i --; } while (i > 0); } posY += 1; } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_lncopy_6.c000066400000000000000000000237551313527062700210100ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT *ao1, *ao2, *ao3, *ao4; js = (n >> 2); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; ao3 = a + posY + (posX + 2) * lda; ao4 = a + posY + (posX + 3) * lda; } else { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; ao3 = a + posX + (posY + 2) * lda; ao4 = a + posX + (posY + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); data12 = *(ao3 + 3); data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); b[ 0] = data01; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b[ 4] = data02; b[ 5] = data06; b[ 6] = data10; b[ 7] = data14; b[ 8] = data03; b[ 9] = data07; b[10] = data11; b[11] = data15; b[12] = data04; b[13] = data08; b[14] = data12; b[15] = data16; ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } else if (X < posY) { ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 16; } else { #ifdef UNIT data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data07 = *(ao2 + 2); data08 = *(ao2 + 3); data12 = *(ao3 + 3); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data02; b[ 5] = ONE; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = data03; b[ 9] = data07; b[10] = ONE; b[11] = ZERO; b[12] = data04; b[13] = data08; b[14] = data12; b[15] = ONE; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 
3); data11 = *(ao3 + 2); data12 = *(ao3 + 3); data16 = *(ao4 + 3); b[ 0] = data01; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data02; b[ 5] = data06; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = data03; b[ 9] = data07; b[10] = data11; b[11] = ZERO; b[12] = data04; b[13] = data08; b[14] = data12; b[15] = data16; #endif ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i) { if (X > posY) { if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); data05 = *(ao3 + 0); data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); b[ 0] = data01; b[ 1] = data03; b[ 2] = data05; b[ 3] = data07; b[ 4] = data02; b[ 5] = data04; b[ 6] = data06; b[ 7] = data08; ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; ao1 += 1; ao2 += 1; ao3 += 1; ao4 += 1; b += 4; } } else if (X < posY) { if (m & 2) { ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } if (m & 1) { ao1 += lda; b += 4; } } else { #ifdef UNIT data05 = *(ao2 + 0); data09 = *(ao3 + 0); data13 = *(ao4 + 0); if (i >= 2) { data10 = *(ao3 + 1); data14 = *(ao4 + 1); } if (i >= 3) { data15 = *(ao4 + 2); } b[ 0] = ONE; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b += 4; if(i >= 2) { b[ 0] = ZERO; b[ 1] = ONE; b[ 2] = data10; b[ 3] = data14; b += 4; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ONE; b[ 3] = data15; b += 4; } #else data01 = *(ao1 + 0); data05 = *(ao2 + 0); data09 = *(ao3 + 0); data13 = *(ao4 + 0); if (i >= 2) { data06 = *(ao2 + 1); data10 = *(ao3 + 1); data14 = *(ao4 + 1); } if (i >= 3) { data11 = *(ao3 + 2); data15 = *(ao4 + 2); } b[ 0] = data01; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b += 4; if(i >= 2) { b[ 0] = ZERO; b[ 1] = data06; b[ 2] = data10; b[ 3] = data14; b += 4; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = data11; b[ 3] = data15; b += 4; } #endif } } posY += 4; js --; } while (js > 0); } /* End of main loop */ if (n & 2){ X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; } else { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data05 = *(ao2 + 0); data06 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data05; b[ 2] = data02; b[ 3] = data06; ao1 += 2; ao2 += 2; b += 4; } else if (X < posY) { ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } else { #ifdef UNIT data02 = *(ao1 + 1); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data02; b[ 3] = ONE; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data06 = *(ao2 + 1); b[ 0] = data01; b[ 1] = ZERO; b[ 2] = data02; b[ 3] = data06; #endif ao1 += 2; ao2 += 2; b += 4; } X += 2; i --; } while (i > 0); } i = (m & 1); if (i) { if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); b[ 0] = data01; b[ 1] = data02; ao1 += 1; ao2 += 1; b += 2; } else if (X < posY) { ao1 += lda; b += 2; } else { #ifdef UNIT data05 = *(ao2 + 0); b[ 0] = ONE; b[ 1] = data05; #else data01 = *(ao1 + 0); data05 = *(ao2 + 0); b[ 0] = data01; b[ 1] = data05; #endif b += 2; } } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; } else { ao1 = a + posX + (posY + 0) * lda; } i = m; if (i > 0) { do { if (X > posY) { data01 = *(ao1 + 0); b[ 0] = data01; b += 1; ao1 += 1; } else if (X < posY) { b += 1; ao1 += lda; } else { #ifdef UNIT b[ 0] = ONE; #else data01 = *(ao1 + 0); b[ 0] = 
data01; #endif b += 1; ao1 += 1; } X ++; i --; } while (i > 0); } posY += 1; } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_lncopy_8.c000066400000000000000000000564501313527062700210100ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT data17, data18, data19, data20, data21, data22, data23, data24; FLOAT data25, data26, data27, data28, data29, data30, data31, data32; FLOAT data33, data34, data35, data36, data37, data38, data39, data40; FLOAT data41, data42, data43, data44, data45, data46, data47, data48; FLOAT data49, data50, data51, data52, data53, data54, data55, data56; FLOAT data57, data58, data59, data60, data61, data62, data63, data64; FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; js = (n >> 3); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; ao3 = a + posY + (posX + 2) * lda; ao4 = a + posY + (posX + 3) * lda; ao5 = a + posY + (posX + 4) * lda; ao6 = a + posY + (posX + 5) * lda; ao7 = a + posY + (posX + 6) * lda; ao8 = a + posY + (posX + 7) * lda; } else { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; ao3 = a + posX + (posY + 2) * lda; ao4 = a + posX + (posY + 3) * lda; ao5 = a + posX + (posY + 4) * lda; ao6 = a + posX + (posY + 5) * lda; ao7 = a + posX + (posY + 6) * lda; ao8 = a + posX + (posY + 7) * lda; } i = (m >> 3); if (i > 0) { do { if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data21 = *(ao3 + 4); data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); data29 = *(ao4 + 4); data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); data33 = *(ao5 + 0); data34 = *(ao5 + 1); data35 = *(ao5 + 2); data36 = *(ao5 + 3); data37 = *(ao5 + 4); data38 = *(ao5 + 5); data39 = *(ao5 + 6); data40 = *(ao5 + 7); data41 = *(ao6 + 0); data42 = *(ao6 + 1); data43 = *(ao6 + 2); data44 = *(ao6 + 3); data45 = *(ao6 + 4); data46 = *(ao6 + 5); data47 = *(ao6 + 6); data48 = *(ao6 + 7); data49 = *(ao7 + 0); data50 = *(ao7 + 1); data51 = *(ao7 + 2); data52 = *(ao7 + 3); data53 = *(ao7 + 4); data54 = *(ao7 + 5); data55 = *(ao7 + 6); data56 = *(ao7 + 7); data57 = *(ao8 + 0); data58 = *(ao8 + 1); data59 = *(ao8 + 2); data60 = *(ao8 + 3); data61 = *(ao8 + 4); data62 = *(ao8 + 5); data63 = *(ao8 + 6); data64 = *(ao8 + 7); b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; b[ 4] = data33; b[ 5] = data41; b[ 6] = data49; b[ 7] = data57; b[ 8] = data02; b[ 9] = data10; b[10] = data18; b[11] = data26; b[12] = data34; b[13] = data42; b[14] = data50; b[15] = data58; b[16] = data03; b[17] = data11; b[18] = data19; b[19] = data27; b[20] = data35; b[21] = data43; b[22] = data51; b[23] = data59; b[24] = data04; b[25] = data12; b[26] = data20; b[27] = data28; b[28] = data36; b[29] = data44; b[30] = data52; b[31] = data60; b[32] = data05; b[33] = data13; b[34] = data21; b[35] = data29; b[36] = data37; b[37] = data45; b[38] = data53; b[39] = data61; b[40] = data06; b[41] = data14; b[42] = 
data22; b[43] = data30; b[44] = data38; b[45] = data46; b[46] = data54; b[47] = data62; b[48] = data07; b[49] = data15; b[50] = data23; b[51] = data31; b[52] = data39; b[53] = data47; b[54] = data55; b[55] = data63; b[56] = data08; b[57] = data16; b[58] = data24; b[59] = data32; b[60] = data40; b[61] = data48; b[62] = data56; b[63] = data64; ao1 += 8; ao2 += 8; ao3 += 8; ao4 += 8; ao5 += 8; ao6 += 8; ao7 += 8; ao8 += 8; b += 64; } else if (X < posY) { ao1 += 8 * lda; ao2 += 8 * lda; ao3 += 8 * lda; ao4 += 8 * lda; ao5 += 8 * lda; ao6 += 8 * lda; ao7 += 8 * lda; ao8 += 8 * lda; b += 64; } else { #ifndef UNIT data01 = *(ao1 + 0); #endif data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); #ifndef UNIT data10 = *(ao2 + 1); #endif data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); #ifndef UNIT data19 = *(ao3 + 2); #endif data20 = *(ao3 + 3); data21 = *(ao3 + 4); data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); #ifndef UNIT data28 = *(ao4 + 3); #endif data29 = *(ao4 + 4); data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); #ifndef UNIT data37 = *(ao5 + 4); #endif data38 = *(ao5 + 5); data39 = *(ao5 + 6); data40 = *(ao5 + 7); #ifndef UNIT data46 = *(ao6 + 5); #endif data47 = *(ao6 + 6); data48 = *(ao6 + 7); #ifndef UNIT data55 = *(ao7 + 6); #endif data56 = *(ao7 + 7); #ifndef UNIT data64 = *(ao8 + 7); #endif #ifdef UNIT b[ 0] = ONE; #else b[ 0] = data01; #endif b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = data02; #ifdef UNIT b[ 9] = ONE; #else b[ 9] = data10; #endif b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b[16] = data03; b[17] = data11; #ifdef UNIT b[18] = ONE; #else b[18] = data19; #endif b[19] = ZERO; b[20] = ZERO; b[21] = ZERO; b[22] = ZERO; b[23] = ZERO; b[24] = data04; b[25] = data12; b[26] = data20; #ifdef UNIT b[27] = ONE; #else b[27] = data28; #endif b[28] = ZERO; b[29] = ZERO; b[30] = ZERO; b[31] = ZERO; b[32] = data05; b[33] = data13; b[34] = data21; b[35] = data29; #ifdef UNIT b[36] = ONE; #else b[36] = data37; #endif b[37] = ZERO; b[38] = ZERO; b[39] = ZERO; b[40] = data06; b[41] = data14; b[42] = data22; b[43] = data30; b[44] = data38; #ifdef UNIT b[45] = ONE; #else b[45] = data46; #endif b[46] = ZERO; b[47] = ZERO; b[48] = data07; b[49] = data15; b[50] = data23; b[51] = data31; b[52] = data39; b[53] = data47; #ifdef UNIT b[54] = ONE; #else b[54] = data55; #endif b[55] = ZERO; b[56] = data08; b[57] = data16; b[58] = data24; b[59] = data32; b[60] = data40; b[61] = data48; b[62] = data56; #ifdef UNIT b[63] = ONE; #else b[63] = data64; #endif ao1 += 8; ao2 += 8; ao3 += 8; ao4 += 8; ao5 += 8; ao6 += 8; ao7 += 8; ao8 += 8; b += 64; } X += 8; i --; } while (i > 0); } i = (m & 7); if (i) { if (X > posY) { if (m & 4) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); data33 = *(ao5 + 0); data34 = *(ao5 + 1); data35 = *(ao5 + 2); data36 = *(ao5 + 3); data41 = *(ao6 + 0); data42 = *(ao6 + 1); data43 = *(ao6 + 2); data44 = *(ao6 + 3); data49 = *(ao7 + 0); data50 = *(ao7 + 1); data51 = *(ao7 + 2); data52 = *(ao7 + 3); 
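/* ao8 is the last of the eight source columns; the gathered 4x8 block is then written into b one row at a time, i.e. transposed relative to A's column-major storage */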
data57 = *(ao8 + 0); data58 = *(ao8 + 1); data59 = *(ao8 + 2); data60 = *(ao8 + 3); b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; b[ 4] = data33; b[ 5] = data41; b[ 6] = data49; b[ 7] = data57; b[ 8] = data02; b[ 9] = data10; b[10] = data18; b[11] = data26; b[12] = data34; b[13] = data42; b[14] = data50; b[15] = data58; b[16] = data03; b[17] = data11; b[18] = data19; b[19] = data27; b[20] = data35; b[21] = data43; b[22] = data51; b[23] = data59; b[24] = data04; b[25] = data12; b[26] = data20; b[27] = data28; b[28] = data36; b[29] = data44; b[30] = data52; b[31] = data60; ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; ao5 += 4; ao6 += 4; ao7 += 4; ao8 += 4; b += 32; } if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data33 = *(ao5 + 0); data34 = *(ao5 + 1); data41 = *(ao6 + 0); data42 = *(ao6 + 1); data49 = *(ao7 + 0); data50 = *(ao7 + 1); data57 = *(ao8 + 0); data58 = *(ao8 + 1); b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; b[ 4] = data33; b[ 5] = data41; b[ 6] = data49; b[ 7] = data57; b[ 8] = data02; b[ 9] = data10; b[10] = data18; b[11] = data26; b[12] = data34; b[13] = data42; b[14] = data50; b[15] = data58; ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; ao5 += 2; ao6 += 2; ao7 += 2; ao8 += 2; b += 16; } if (m & 1) { data01 = *(ao1 + 0); data09 = *(ao2 + 0); data17 = *(ao3 + 0); data25 = *(ao4 + 0); data33 = *(ao5 + 0); data41 = *(ao6 + 0); data49 = *(ao7 + 0); data57 = *(ao8 + 0); b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; b[ 4] = data33; b[ 5] = data41; b[ 6] = data49; b[ 7] = data57; b += 8; } } else if (X < posY) { if (m & 4) { ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 32; } if (m & 2) { ao1 += 2 * lda; b += 16; } if (m & 1) { b += 8; } } else { #ifndef UNIT data01 = *(ao1 + 0); #endif data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); if (i >= 2) { #ifndef UNIT data10 = *(ao2 + 1); #endif data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); } if (i >= 3) { #ifndef UNIT data19 = *(ao3 + 2); #endif data20 = *(ao3 + 3); data21 = *(ao3 + 4); data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); } if (i >= 4) { #ifndef UNIT data28 = *(ao4 + 3); #endif data29 = *(ao4 + 4); data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); } if (i >= 5) { #ifndef UNIT data37 = *(ao5 + 4); #endif data38 = *(ao5 + 5); data39 = *(ao5 + 6); data40 = *(ao5 + 7); } if (i >= 6) { #ifndef UNIT data46 = *(ao6 + 5); #endif data47 = *(ao6 + 6); data48 = *(ao6 + 7); } if (i >= 7) { #ifndef UNIT data55 = *(ao7 + 6); #endif data56 = *(ao7 + 7); } #ifdef UNIT b[ 0] = ONE; #else b[ 0] = data01; #endif b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; if(i >= 2) { b[ 0] = data02; #ifdef UNIT b[ 1] = ONE; #else b[ 1] = data10; #endif b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 3) { b[ 0] = data03; b[ 1] = data11; #ifdef UNIT b[ 2] = ONE; #else b[ 2] = data19; #endif b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 4) { b[ 0] = data04; b[ 1] = data12; b[ 2] = data20; #ifdef UNIT b[ 3] = ONE; #else b[ 3] = data28; #endif b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } 
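/* rows 5..7 of the partial diagonal block (when i >= 5) follow the same pattern: copy the sub-diagonal entries, store the diagonal (ONE when UNIT), zero-fill the rest of the row */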
if (i >= 5) { b[ 0] = data05; b[ 1] = data13; b[ 2] = data21; b[ 3] = data29; #ifdef UNIT b[ 4] = ONE; #else b[ 4] = data37; #endif b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 6) { b[ 0] = data06; b[ 1] = data14; b[ 2] = data22; b[ 3] = data30; b[ 4] = data38; #ifdef UNIT b[ 5] = ONE; #else b[ 5] = data46; #endif b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 7) { b[ 0] = data07; b[ 1] = data15; b[ 2] = data23; b[ 3] = data31; b[ 4] = data39; b[ 5] = data47; #ifdef UNIT b[ 6] = ONE; #else b[ 6] = data55; #endif b[ 7] = ZERO; b += 8; } } } posY += 8; js --; } while (js > 0); } /* End of main loop */ if (n & 4){ X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; ao3 = a + posY + (posX + 2) * lda; ao4 = a + posY + (posX + 3) * lda; } else { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; ao3 = a + posX + (posY + 2) * lda; ao4 = a + posX + (posY + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; b[ 4] = data02; b[ 5] = data10; b[ 6] = data18; b[ 7] = data26; b[ 8] = data03; b[ 9] = data11; b[10] = data19; b[11] = data27; b[12] = data04; b[13] = data12; b[14] = data20; b[15] = data28; ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } else if (X < posY) { ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 16; } else { #ifdef UNIT data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data20 = *(ao3 + 3); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data02; b[ 5] = ONE; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = data03; b[ 9] = data11; b[10] = ONE; b[11] = ZERO; b[12] = data04; b[13] = data12; b[14] = data20; b[15] = ONE; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data28 = *(ao4 + 3); b[ 0] = data01; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data02; b[ 5] = data10; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = data03; b[ 9] = data11; b[10] = data19; b[11] = ZERO; b[12] = data04; b[13] = data12; b[14] = data20; b[15] = data28; #endif ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i) { if (X > posY) { if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data25 = *(ao4 + 0); data26 = *(ao4 + 1); b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; b[ 4] = data02; b[ 5] = data10; b[ 6] = data18; b[ 7] = data26; ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } if (m & 1) { data01 = *(ao1 + 0); data09 = *(ao2 + 0); data17 = *(ao3 + 0); data25 = *(ao4 + 0); b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; b += 4; } } else if (X < posY) { if (m & 2) { ao1 += 2 * lda; b += 8; } if (m & 1) { b += 4; } } else { #ifndef UNIT data01 = *(ao1 + 0); #endif data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); if (i >= 2) { #ifndef UNIT data10 = *(ao2 + 1); #endif data11 = *(ao2 + 2); data12 = *(ao2 + 3); } if (i >= 3) { 
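/* entries needed for the third leftover row come from the ao3 column; its diagonal is implicit when UNIT is defined */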
#ifndef UNIT data19 = *(ao3 + 2); #endif data20 = *(ao3 + 3); } #ifdef UNIT b[ 0] = ONE; #else b[ 0] = data01; #endif b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b += 4; if(i >= 2) { b[ 0] = data02; #ifdef UNIT b[ 1] = ONE; #else b[ 1] = data10; #endif b[ 2] = ZERO; b[ 3] = ZERO; b += 4; } if (i >= 3) { b[ 0] = data03; b[ 1] = data11; #ifdef UNIT b[ 2] = ONE; #else b[ 2] = data19; #endif b[ 3] = ZERO; b += 4; } } } posY += 4; } if (n & 2){ X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; } else { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data09; b[ 2] = data02; b[ 3] = data10; ao1 += 2; ao2 += 2; b += 4; } else if (X < posY) { ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } else { #ifdef UNIT data02 = *(ao1 + 1); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data02; b[ 3] = ONE; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data10 = *(ao2 + 1); b[ 0] = data01; b[ 1] = ZERO; b[ 2] = data02; b[ 3] = data10; #endif ao1 += 2; ao2 += 2; b += 4; } X += 2; i --; } while (i > 0); } if (m & 1) { if (X > posY) { data01 = *(ao1 + 0); data09 = *(ao2 + 0); b[ 0] = data01; b[ 1] = data09; b += 2; } else if (X < posY) { b += 2; } else { #ifdef UNIT data09 = *(ao2 + 0); b[ 0] = ONE; b[ 1] = data09; #else data01 = *(ao1 + 0); data09 = *(ao2 + 0); b[ 0] = data01; b[ 1] = data09; #endif b += 2; } } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; } else { ao1 = a + posX + (posY + 0) * lda; } i = m; if (m > 0) { do { if (X > posY) { data01 = *(ao1 + 0); b[ 0] = data01; ao1 += 1; b += 1; } else if (X < posY) { ao1 += lda; b += 1; } else { #ifdef UNIT b[ 0] = ONE; #else data01 = *(ao1 + 0); b[ 0] = data01; #endif ao1 ++; b ++; } X += 1; i --; } while (i > 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_ltcopy_1.c000066400000000000000000000066001313527062700207770ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, X; FLOAT data01; FLOAT *ao1; while (n > 0) { X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; } else { ao1 = a + posX + (posY + 0) * lda; } i = m; if (i > 0) { do { if (X > posY) { ao1 += 1; b += 1; } else if (X < posY) { data01 = *(ao1 + 0); b[ 0] = data01; ao1 += lda; b += 1; } else { #ifdef UNIT b[ 0] = ONE; #else data01 = *(ao1 + 0); b[ 0] = data01; #endif b += 1; ao1 += 1; } X ++; i --; } while (i > 0); } posY += 1; n --; } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_ltcopy_16.c000066400000000000000000000763711313527062700211010ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, ii; BLASLONG X; FLOAT *a01, *a02, *a03 ,*a04, *a05, *a06, *a07, *a08; FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; js = (n >> 4); if (js > 0){ do { X = posX; if (posX <= posY) { a01 = a + posY + (posX + 0) * lda; a02 = a + posY + (posX + 1) * lda; a03 = a + posY + (posX + 2) * lda; a04 = a + posY + (posX + 3) * lda; a05 = a + posY + (posX + 4) * lda; a06 = a + posY + (posX + 5) * lda; a07 = a + posY + (posX + 6) * lda; a08 = a + posY + (posX + 7) * lda; a09 = a + posY + (posX + 8) * lda; a10 = a + posY + (posX + 9) * lda; a11 = a + posY + (posX + 10) * lda; a12 = a + posY + (posX + 11) * lda; a13 = a + posY + (posX + 12) * lda; a14 = a + posY + (posX + 13) * lda; a15 = a + posY + (posX + 14) * lda; a16 = a + posY + (posX + 15) * lda; } else { a01 = a + posX + (posY + 0) * lda; a02 = a + posX + (posY + 1) * lda; a03 = a + posX + (posY + 2) * lda; a04 = a + posX + (posY + 3) * lda; a05 = a + posX + (posY + 4) * lda; a06 = a + posX + (posY + 5) * lda; a07 = a + posX + (posY + 6) * lda; a08 = a + posX + (posY + 7) * lda; a09 = a + posX + (posY + 8) * lda; a10 = a + posX + (posY + 9) * lda; a11 = a + posX + (posY + 10) * lda; a12 = a + posX + (posY + 11) * lda; a13 = a + posX + (posY + 12) * lda; a14 = a + posX + (posY + 13) * lda; a15 = a + posX + (posY + 14) * lda; a16 = a + posX + (posY + 15) * lda; } i = (m >> 4); if (i > 0) { do { if (X > posY) { a01 += 16; a02 += 16; a03 += 16; a04 += 16; a05 += 16; a06 += 16; a07 += 16; a08 += 16; a09 += 16; a10 += 16; a11 += 16; a12 += 16; a13 += 16; a14 += 16; a15 += 16; a16 += 16; b += 256; } else if (X < posY) { for (ii = 0; ii < 16; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); b[ 8] = *(a01 + 8); b[ 9] = *(a01 + 9); b[ 10] = *(a01 + 10); b[ 11] = *(a01 + 11); b[ 12] = *(a01 + 12); b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); a01 += lda; b += 16; } a02 += 16 * lda; a03 += 16 * lda; a04 += 16 * lda; a05 += 16 * lda; a06 += 16 * lda; a07 += 16 * lda; a08 += 16 * lda; a09 += 16 * lda; a10 += 16 * lda; a11 += 16 * lda; a12 += 16 * lda; a13 += 16 * lda; a14 += 16 * lda; a15 += 16 * lda; a16 += 16 * lda; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); b[ 8] = *(a01 + 8); b[ 9] = *(a01 + 9); b[ 10] = *(a01 + 10); b[ 11] = *(a01 + 11); b[ 12] = *(a01 + 12); b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); b[ 16] = ZERO; #ifdef UNIT b[ 17] = ONE; #else b[ 17] = *(a02 + 1); #endif b[ 18] = *(a02 + 2); b[ 19] = *(a02 + 3); b[ 20] = *(a02 + 4); b[ 21] = *(a02 + 5); b[ 22] = *(a02 + 6); b[ 23] = *(a02 + 7); b[ 24] = *(a02 + 8); b[ 25] = *(a02 + 9); b[ 26] = *(a02 + 10); b[ 27] = *(a02 + 11); b[ 28] = *(a02 + 12); b[ 29] = *(a02 + 13); b[ 30] = *(a02 + 14); b[ 31] = *(a02 + 15); b[ 32] = ZERO; b[ 33] = ZERO; #ifdef UNIT b[ 34] = ONE; #else b[ 34] = *(a03 + 2); #endif b[ 35] = *(a03 + 3); b[ 36] = *(a03 + 4); b[ 37] = *(a03 + 5); b[ 38] = *(a03 + 6); b[ 39] = *(a03 + 7); b[ 40] = *(a03 + 8); b[ 41] = *(a03 + 9); b[ 42] = *(a03 + 10); b[ 43] = *(a03 + 11); b[ 44] = *(a03 + 12); b[ 45] = *(a03 + 13); b[ 46] = *(a03 + 14); b[ 47] = 
*(a03 + 15); b[ 48] = ZERO; b[ 49] = ZERO; b[ 50] = ZERO; #ifdef UNIT b[ 51] = ONE; #else b[ 51] = *(a04 + 3); #endif b[ 52] = *(a04 + 4); b[ 53] = *(a04 + 5); b[ 54] = *(a04 + 6); b[ 55] = *(a04 + 7); b[ 56] = *(a04 + 8); b[ 57] = *(a04 + 9); b[ 58] = *(a04 + 10); b[ 59] = *(a04 + 11); b[ 60] = *(a04 + 12); b[ 61] = *(a04 + 13); b[ 62] = *(a04 + 14); b[ 63] = *(a04 + 15); b[ 64] = ZERO; b[ 65] = ZERO; b[ 66] = ZERO; b[ 67] = ZERO; #ifdef UNIT b[ 68] = ONE; #else b[ 68] = *(a05 + 4); #endif b[ 69] = *(a05 + 5); b[ 70] = *(a05 + 6); b[ 71] = *(a05 + 7); b[ 72] = *(a05 + 8); b[ 73] = *(a05 + 9); b[ 74] = *(a05 + 10); b[ 75] = *(a05 + 11); b[ 76] = *(a05 + 12); b[ 77] = *(a05 + 13); b[ 78] = *(a05 + 14); b[ 79] = *(a05 + 15); b[ 80] = ZERO; b[ 81] = ZERO; b[ 82] = ZERO; b[ 83] = ZERO; b[ 84] = ZERO; #ifdef UNIT b[ 85] = ONE; #else b[ 85] = *(a06 + 5); #endif b[ 86] = *(a06 + 6); b[ 87] = *(a06 + 7); b[ 88] = *(a06 + 8); b[ 89] = *(a06 + 9); b[ 90] = *(a06 + 10); b[ 91] = *(a06 + 11); b[ 92] = *(a06 + 12); b[ 93] = *(a06 + 13); b[ 94] = *(a06 + 14); b[ 95] = *(a06 + 15); b[ 96] = ZERO; b[ 97] = ZERO; b[ 98] = ZERO; b[ 99] = ZERO; b[100] = ZERO; b[101] = ZERO; #ifdef UNIT b[102] = ONE; #else b[102] = *(a07 + 6); #endif b[103] = *(a07 + 7); b[104] = *(a07 + 8); b[105] = *(a07 + 9); b[106] = *(a07 + 10); b[107] = *(a07 + 11); b[108] = *(a07 + 12); b[109] = *(a07 + 13); b[110] = *(a07 + 14); b[111] = *(a07 + 15); b[112] = ZERO; b[113] = ZERO; b[114] = ZERO; b[115] = ZERO; b[116] = ZERO; b[117] = ZERO; b[118] = ZERO; #ifdef UNIT b[119] = ONE; #else b[119] = *(a08 + 7); #endif b[120] = *(a08 + 8); b[121] = *(a08 + 9); b[122] = *(a08 + 10); b[123] = *(a08 + 11); b[124] = *(a08 + 12); b[125] = *(a08 + 13); b[126] = *(a08 + 14); b[127] = *(a08 + 15); b[128] = ZERO; b[129] = ZERO; b[130] = ZERO; b[131] = ZERO; b[132] = ZERO; b[133] = ZERO; b[134] = ZERO; b[135] = ZERO; #ifdef UNIT b[136] = ONE; #else b[136] = *(a09 + 8); #endif b[137] = *(a09 + 9); b[138] = *(a09 + 10); b[139] = *(a09 + 11); b[140] = *(a09 + 12); b[141] = *(a09 + 13); b[142] = *(a09 + 14); b[143] = *(a09 + 15); b[144] = ZERO; b[145] = ZERO; b[146] = ZERO; b[147] = ZERO; b[148] = ZERO; b[149] = ZERO; b[150] = ZERO; b[151] = ZERO; b[152] = ZERO; #ifdef UNIT b[153] = ONE; #else b[153] = *(a10 + 9); #endif b[154] = *(a10 + 10); b[155] = *(a10 + 11); b[156] = *(a10 + 12); b[157] = *(a10 + 13); b[158] = *(a10 + 14); b[159] = *(a10 + 15); b[160] = ZERO; b[161] = ZERO; b[162] = ZERO; b[163] = ZERO; b[164] = ZERO; b[165] = ZERO; b[166] = ZERO; b[167] = ZERO; b[168] = ZERO; b[169] = ZERO; #ifdef UNIT b[170] = ONE; #else b[170] = *(a11 + 10); #endif b[171] = *(a11 + 11); b[172] = *(a11 + 12); b[173] = *(a11 + 13); b[174] = *(a11 + 14); b[175] = *(a11 + 15); b[176] = ZERO; b[177] = ZERO; b[178] = ZERO; b[179] = ZERO; b[180] = ZERO; b[181] = ZERO; b[182] = ZERO; b[183] = ZERO; b[184] = ZERO; b[185] = ZERO; b[186] = ZERO; #ifdef UNIT b[187] = ONE; #else b[187] = *(a12 + 11); #endif b[188] = *(a12 + 12); b[189] = *(a12 + 13); b[190] = *(a12 + 14); b[191] = *(a12 + 15); b[192] = ZERO; b[193] = ZERO; b[194] = ZERO; b[195] = ZERO; b[196] = ZERO; b[197] = ZERO; b[198] = ZERO; b[199] = ZERO; b[200] = ZERO; b[201] = ZERO; b[202] = ZERO; b[203] = ZERO; #ifdef UNIT b[204] = ONE; #else b[204] = *(a13 + 12); #endif b[205] = *(a13 + 13); b[206] = *(a13 + 14); b[207] = *(a13 + 15); b[208] = ZERO; b[209] = ZERO; b[210] = ZERO; b[211] = ZERO; b[212] = ZERO; b[213] = ZERO; b[214] = ZERO; b[215] = ZERO; b[216] = ZERO; b[217] = ZERO; b[218] = ZERO; b[219] = ZERO; 
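/* Diagonal (X == posY) block, continued: packed row k of this 16x16 tile appears to hold k leading ZERO entries for the part of the column that is not stored, then the diagonal element (ONE when UNIT is defined, the stored value otherwise), then the remaining entries of that source column. */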
b[220] = ZERO; #ifdef UNIT b[221] = ONE; #else b[221] = *(a14 + 13); #endif b[222] = *(a14 + 14); b[223] = *(a14 + 15); b[224] = ZERO; b[225] = ZERO; b[226] = ZERO; b[227] = ZERO; b[228] = ZERO; b[229] = ZERO; b[230] = ZERO; b[231] = ZERO; b[232] = ZERO; b[233] = ZERO; b[234] = ZERO; b[235] = ZERO; b[236] = ZERO; b[237] = ZERO; #ifdef UNIT b[238] = ONE; #else b[238] = *(a15 + 14); #endif b[239] = *(a15 + 15); b[240] = ZERO; b[241] = ZERO; b[242] = ZERO; b[243] = ZERO; b[244] = ZERO; b[245] = ZERO; b[246] = ZERO; b[247] = ZERO; b[248] = ZERO; b[249] = ZERO; b[250] = ZERO; b[251] = ZERO; b[252] = ZERO; b[253] = ZERO; b[254] = ZERO; #ifdef UNIT b[255] = ONE; #else b[255] = *(a16 + 15); #endif a01 += 16; a02 += 16; a03 += 16; a04 += 16; a05 += 16; a06 += 16; a07 += 16; a08 += 16; a09 += 16; a10 += 16; a11 += 16; a12 += 16; a13 += 16; a14 += 16; a15 += 16; a16 += 16; b += 256; } X += 16; i --; } while (i > 0); } i = (m & 15); if (i > 0) { if (X > posY) { a01 += i; a02 += i; a03 += i; a04 += i; a05 += i; a06 += i; a07 += i; a08 += i; a09 += i; a10 += i; a11 += i; a12 += i; a13 += i; a14 += i; a15 += i; a16 += i; b += 16 * i; } else if (X < posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); b[ 8] = *(a01 + 8); b[ 9] = *(a01 + 9); b[ 10] = *(a01 + 10); b[ 11] = *(a01 + 11); b[ 12] = *(a01 + 12); b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); a01 += lda; a02 += lda; a03 += lda; a04 += lda; a05 += lda; a06 += lda; a07 += lda; a08 += lda; a09 += lda; a10 += lda; a11 += lda; a12 += lda; a13 += lda; a14 += lda; a15 += lda; a16 += lda; b += 16; } } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); b[ 8] = *(a01 + 8); b[ 9] = *(a01 + 9); b[ 10] = *(a01 + 10); b[ 11] = *(a01 + 11); b[ 12] = *(a01 + 12); b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); b += 16; if (i >= 2) { b[ 0] = ZERO; #ifdef UNIT b[ 1] = ONE; #else b[ 1] = *(a02 + 1); #endif b[ 2] = *(a02 + 2); b[ 3] = *(a02 + 3); b[ 4] = *(a02 + 4); b[ 5] = *(a02 + 5); b[ 6] = *(a02 + 6); b[ 7] = *(a02 + 7); b[ 8] = *(a02 + 8); b[ 9] = *(a02 + 9); b[10] = *(a02 + 10); b[11] = *(a02 + 11); b[12] = *(a02 + 12); b[13] = *(a02 + 13); b[14] = *(a02 + 14); b[15] = *(a02 + 15); b += 16; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; #ifdef UNIT b[ 2] = ONE; #else b[ 2] = *(a03 + 2); #endif b[ 3] = *(a03 + 3); b[ 4] = *(a03 + 4); b[ 5] = *(a03 + 5); b[ 6] = *(a03 + 6); b[ 7] = *(a03 + 7); b[ 8] = *(a03 + 8); b[ 9] = *(a03 + 9); b[10] = *(a03 + 10); b[11] = *(a03 + 11); b[12] = *(a03 + 12); b[13] = *(a03 + 13); b[14] = *(a03 + 14); b[15] = *(a03 + 15); b += 16; } if (i >= 4) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; #ifdef UNIT b[ 3] = ONE; #else b[ 3] = *(a04 + 3); #endif b[ 4] = *(a04 + 4); b[ 5] = *(a04 + 5); b[ 6] = *(a04 + 6); b[ 7] = *(a04 + 7); b[ 8] = *(a04 + 8); b[ 9] = *(a04 + 9); b[10] = *(a04 + 10); b[11] = *(a04 + 11); b[12] = *(a04 + 12); b[13] = *(a04 + 13); b[14] = *(a04 + 14); b[15] = *(a04 + 15); b += 16; } if (i >= 5) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; #ifdef UNIT b[ 4] = ONE; #else b[ 4] = *(a05 + 4); #endif b[ 5] = *(a05 + 5); b[ 6] = *(a05 + 6); b[ 7] = *(a05 + 7); b[ 8] = *(a05 + 8); b[ 9] = *(a05 + 9); b[10] = *(a05 + 10); b[11] = *(a05 + 11); b[12] = *(a05 + 12); b[13] = *(a05 + 13); 
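/* Leftover diagonal rows (m & 15): row k is emitted only when i >= k+1, with the same leading-ZERO / unit-diagonal layout as the full 16x16 block above. */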
b[14] = *(a05 + 14); b[15] = *(a05 + 15); b += 16; } if (i >= 6) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; #ifdef UNIT b[ 5] = ONE; #else b[ 5] = *(a06 + 5); #endif b[ 6] = *(a06 + 6); b[ 7] = *(a06 + 7); b[ 8] = *(a06 + 8); b[ 9] = *(a06 + 9); b[10] = *(a06 + 10); b[11] = *(a06 + 11); b[12] = *(a06 + 12); b[13] = *(a06 + 13); b[14] = *(a06 + 14); b[15] = *(a06 + 15); b += 16; } if (i >= 7) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; #ifdef UNIT b[ 6] = ONE; #else b[ 6] = *(a07 + 6); #endif b[ 7] = *(a07 + 7); b[ 8] = *(a07 + 8); b[ 9] = *(a07 + 9); b[10] = *(a07 + 10); b[11] = *(a07 + 11); b[12] = *(a07 + 12); b[13] = *(a07 + 13); b[14] = *(a07 + 14); b[15] = *(a07 + 15); b += 16; } if (i >= 8) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; #ifdef UNIT b[ 7] = ONE; #else b[ 7] = *(a08 + 7); #endif b[ 8] = *(a08 + 8); b[ 9] = *(a08 + 9); b[10] = *(a08 + 10); b[11] = *(a08 + 11); b[12] = *(a08 + 12); b[13] = *(a08 + 13); b[14] = *(a08 + 14); b[15] = *(a08 + 15); b += 16; } if (i >= 9) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; #ifdef UNIT b[ 8] = ONE; #else b[ 8] = *(a09 + 8); #endif b[ 9] = *(a09 + 9); b[10] = *(a09 + 10); b[11] = *(a09 + 11); b[12] = *(a09 + 12); b[13] = *(a09 + 13); b[14] = *(a09 + 14); b[15] = *(a09 + 15); b += 16; } if (i >= 10) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; #ifdef UNIT b[ 9] = ONE; #else b[ 9] = *(a10 + 9); #endif b[10] = *(a10 + 10); b[11] = *(a10 + 11); b[12] = *(a10 + 12); b[13] = *(a10 + 13); b[14] = *(a10 + 14); b[15] = *(a10 + 15); b += 16; } if (i >= 11) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; #ifdef UNIT b[10] = ONE; #else b[10] = *(a11 + 10); #endif b[11] = *(a11 + 11); b[12] = *(a11 + 12); b[13] = *(a11 + 13); b[14] = *(a11 + 14); b[15] = *(a11 + 15); b += 16; } if (i >= 12) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; #ifdef UNIT b[11] = ONE; #else b[11] = *(a12 + 11); #endif b[12] = *(a12 + 12); b[13] = *(a12 + 13); b[14] = *(a12 + 14); b[15] = *(a12 + 15); b += 16; } if (i >= 13) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; #ifdef UNIT b[12] = ONE; #else b[12] = *(a13 + 12); #endif b[13] = *(a13 + 13); b[14] = *(a13 + 14); b[15] = *(a13 + 15); b += 16; } if (i >= 14) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; #ifdef UNIT b[13] = ONE; #else b[13] = *(a14 + 13); #endif b[14] = *(a14 + 14); b[15] = *(a14 + 15); b += 16; } if (i >= 15) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; #ifdef UNIT b[14] = ONE; #else b[14] = *(a15 + 14); #endif b[15] = *(a15 + 15); b += 16; } } } posY += 16; js --; } while (js > 0); } /* End of main loop */ if (n & 8){ X = posX; if (posX <= posY) { a01 = a + posY + (posX + 0) * lda; a02 = a + posY + (posX + 1) * lda; a03 = a 
+ posY + (posX + 2) * lda; a04 = a + posY + (posX + 3) * lda; a05 = a + posY + (posX + 4) * lda; a06 = a + posY + (posX + 5) * lda; a07 = a + posY + (posX + 6) * lda; a08 = a + posY + (posX + 7) * lda; } else { a01 = a + posX + (posY + 0) * lda; a02 = a + posX + (posY + 1) * lda; a03 = a + posX + (posY + 2) * lda; a04 = a + posX + (posY + 3) * lda; a05 = a + posX + (posY + 4) * lda; a06 = a + posX + (posY + 5) * lda; a07 = a + posX + (posY + 6) * lda; a08 = a + posX + (posY + 7) * lda; } i = (m >> 3); if (i > 0) { do { if (X > posY) { a01 += 8; a02 += 8; a03 += 8; a04 += 8; a05 += 8; a06 += 8; a07 += 8; a08 += 8; b += 64; } else if (X < posY) { for (ii = 0; ii < 8; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); a01 += lda; b += 8; } a02 += 8 * lda; a03 += 8 * lda; a04 += 8 * lda; a05 += 8 * lda; a06 += 8 * lda; a07 += 8 * lda; a08 += 8 * lda; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); b[ 8] = ZERO; #ifdef UNIT b[ 9] = ONE; #else b[ 9] = *(a02 + 1); #endif b[ 10] = *(a02 + 2); b[ 11] = *(a02 + 3); b[ 12] = *(a02 + 4); b[ 13] = *(a02 + 5); b[ 14] = *(a02 + 6); b[ 15] = *(a02 + 7); b[ 16] = ZERO; b[ 17] = ZERO; #ifdef UNIT b[ 18] = ONE; #else b[ 18] = *(a03 + 2); #endif b[ 19] = *(a03 + 3); b[ 20] = *(a03 + 4); b[ 21] = *(a03 + 5); b[ 22] = *(a03 + 6); b[ 23] = *(a03 + 7); b[ 24] = ZERO; b[ 25] = ZERO; b[ 26] = ZERO; #ifdef UNIT b[ 27] = ONE; #else b[ 27] = *(a04 + 3); #endif b[ 28] = *(a04 + 4); b[ 29] = *(a04 + 5); b[ 30] = *(a04 + 6); b[ 31] = *(a04 + 7); b[ 32] = ZERO; b[ 33] = ZERO; b[ 34] = ZERO; b[ 35] = ZERO; #ifdef UNIT b[ 36] = ONE; #else b[ 36] = *(a05 + 4); #endif b[ 37] = *(a05 + 5); b[ 38] = *(a05 + 6); b[ 39] = *(a05 + 7); b[ 40] = ZERO; b[ 41] = ZERO; b[ 42] = ZERO; b[ 43] = ZERO; b[ 44] = ZERO; #ifdef UNIT b[ 45] = ONE; #else b[ 45] = *(a06 + 5); #endif b[ 46] = *(a06 + 6); b[ 47] = *(a06 + 7); b[ 48] = ZERO; b[ 49] = ZERO; b[ 50] = ZERO; b[ 51] = ZERO; b[ 52] = ZERO; b[ 53] = ZERO; #ifdef UNIT b[ 54] = ONE; #else b[ 54] = *(a07 + 6); #endif b[ 55] = *(a07 + 7); b[ 56] = ZERO; b[ 57] = ZERO; b[ 58] = ZERO; b[ 59] = ZERO; b[ 60] = ZERO; b[ 61] = ZERO; b[ 62] = ZERO; #ifdef UNIT b[ 63] = ONE; #else b[ 63] = *(a08 + 7); #endif a01 += 8; a02 += 8; a03 += 8; a04 += 8; a05 += 8; a06 += 8; a07 += 8; a08 += 8; b += 64; } X += 8; i --; } while (i > 0); } i = (m & 7); if (i > 0) { if (X > posY) { a01 += i; a02 += i; a03 += i; a04 += i; a05 += i; a06 += i; a07 += i; a08 += i; b += 8 * i; } else if (X < posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); a01 += lda; a02 += lda; a03 += lda; a04 += lda; a05 += lda; a06 += lda; a07 += lda; a08 += lda; b += 8; } } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); b += 8; if (i >= 2) { b[ 0] = ZERO; #ifdef UNIT b[ 1] = ONE; #else b[ 1] = *(a02 + 1); #endif b[ 2] = *(a02 + 2); b[ 3] = *(a02 + 3); b[ 4] = *(a02 + 4); b[ 5] = *(a02 + 5); b[ 6] = *(a02 + 6); b[ 7] = *(a02 + 7); b += 8; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; #ifdef UNIT b[ 2] = ONE; #else b[ 2] = *(a03 + 
2); #endif b[ 3] = *(a03 + 3); b[ 4] = *(a03 + 4); b[ 5] = *(a03 + 5); b[ 6] = *(a03 + 6); b[ 7] = *(a03 + 7); b += 8; } if (i >= 4) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; #ifdef UNIT b[ 3] = ONE; #else b[ 3] = *(a04 + 3); #endif b[ 4] = *(a04 + 4); b[ 5] = *(a04 + 5); b[ 6] = *(a04 + 6); b[ 7] = *(a04 + 7); b += 8; } if (i >= 5) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; #ifdef UNIT b[ 4] = ONE; #else b[ 4] = *(a05 + 4); #endif b[ 5] = *(a05 + 5); b[ 6] = *(a05 + 6); b[ 7] = *(a05 + 7); b += 8; } if (i >= 6) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; #ifdef UNIT b[ 5] = ONE; #else b[ 5] = *(a06 + 5); #endif b[ 6] = *(a06 + 6); b[ 7] = *(a06 + 7); b += 8; } if (i >= 7) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; #ifdef UNIT b[ 6] = ONE; #else b[ 6] = *(a07 + 6); #endif b[ 7] = *(a07 + 7); b += 8; } } } posY += 8; } if (n & 4){ X = posX; if (posX <= posY) { a01 = a + posY + (posX + 0) * lda; a02 = a + posY + (posX + 1) * lda; a03 = a + posY + (posX + 2) * lda; a04 = a + posY + (posX + 3) * lda; } else { a01 = a + posX + (posY + 0) * lda; a02 = a + posX + (posY + 1) * lda; a03 = a + posX + (posY + 2) * lda; a04 = a + posX + (posY + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X > posY) { a01 += 4; a02 += 4; a03 += 4; a04 += 4; b += 16; } else if (X < posY) { for (ii = 0; ii < 4; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); a01 += lda; b += 4; } a02 += 4 * lda; a03 += 4 * lda; a04 += 4 * lda; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = ZERO; #ifdef UNIT b[ 5] = ONE; #else b[ 5] = *(a02 + 1); #endif b[ 6] = *(a02 + 2); b[ 7] = *(a02 + 3); b[ 8] = ZERO; b[ 9] = ZERO; #ifdef UNIT b[ 10] = ONE; #else b[ 10] = *(a03 + 2); #endif b[ 11] = *(a03 + 3); b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; #ifdef UNIT b[ 15] = ONE; #else b[ 15] = *(a04 + 3); #endif a01 += 4; a02 += 4; a03 += 4; a04 += 4; b += 16; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i > 0) { if (X > posY) { a01 += i; a02 += i; a03 += i; a04 += i; b += 4 * i; } else if (X < posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); a01 += lda; a02 += lda; a03 += lda; a04 += lda; b += 4; } } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b += 4; if (i >= 2) { b[ 0] = ZERO; #ifdef UNIT b[ 1] = ONE; #else b[ 1] = *(a02 + 1); #endif b[ 2] = *(a02 + 2); b[ 3] = *(a02 + 3); b += 4; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; #ifdef UNIT b[ 2] = ONE; #else b[ 2] = *(a03 + 2); #endif b[ 3] = *(a03 + 3); b += 4; } } } posY += 4; } if (n & 2){ X = posX; if (posX <= posY) { a01 = a + posY + (posX + 0) * lda; a02 = a + posY + (posX + 1) * lda; } else { a01 = a + posX + (posY + 0) * lda; a02 = a + posX + (posY + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X > posY) { a01 += 2; a02 += 2; b += 4; } else if (X < posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a02 + 0); b[ 3] = *(a02 + 1); a01 += 2 * lda; a02 += 2 * lda; b += 4; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = *(a01 + 1); b[ 2] = ZERO; #ifdef UNIT b[ 3] = ONE; #else b[ 3] = *(a02 + 1); #endif a01 += 2; a02 += 2; b += 4; } X += 2; i --; } while (i > 0); } if (m & 1) { if (X > posY) { a01 ++; a02 ++; b += 2; } else if (X < posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); a01 += lda; a02 
+= lda; b += 2; } } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = *(a01 + 1); b += 2; } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { a01 = a + posY + (posX + 0) * lda; } else { a01 = a + posX + (posY + 0) * lda; } i = m; if (i > 0) { do { if (X > posY) { b ++; a01 ++; } else if (X < posY) { b[ 0] = *(a01 + 0); a01 += lda; b ++; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif a01 ++; b ++; } X += 1; i --; } while (i > 0); } posY += 1; } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_ltcopy_2.c000066400000000000000000000120031313527062700207720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02, data03, data04; FLOAT *ao1, *ao2; js = (n >> 1); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; } else { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X > posY) { ao1 += 2; ao2 += 2; b += 4; } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } else { #ifdef UNIT data02 = *(ao1 + 1); b[ 0] = ONE; b[ 1] = data02; b[ 2] = ZERO; b[ 3] = ONE; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data04 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data02; b[ 2] = ZERO; b[ 3] = data04; #endif ao1 += 2; ao2 += 2; b += 4; } X += 2; i --; } while (i > 0); } if (m & 1) { if (X > posY) { ao1 += 1; ao2 += 1; b += 2; } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; ao1 += lda; b += 2; } else { #ifdef UNIT data02 = *(ao1 + 1); b[ 0] = ONE; b[ 1] = data02; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; #endif ao1 += 2; b += 2; } } posY += 2; js --; } while (js > 0); } /* End of main loop */ if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; } else { ao1 = a + posX + (posY + 0) * lda; } i = m; if (i > 0) { do { if (X > posY) { ao1 += 1; b += 1; } else if (X < posY) { data01 = *(ao1 + 0); b[ 0] = data01; ao1 += lda; b += 1; } else { #ifdef UNIT b[ 0] = ONE; #else data01 = *(ao1 + 0); b[ 0] = data01; #endif b += 1; ao1 += 1; } X ++; i --; } while (i > 0); } posY += 1; } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_ltcopy_4.c000066400000000000000000000242071313527062700210050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT *ao1, *ao2, *ao3, *ao4; js = (n >> 2); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; ao3 = a + posY + (posX + 2) * lda; ao4 = a + posY + (posX + 3) * lda; } else { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; ao3 = a + posX + (posY + 2) * lda; ao4 = a + posX + (posY + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X > posY) { ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); data12 = *(ao3 + 3); data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 16; } else { #ifdef UNIT data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data07 = *(ao2 + 2); data08 = *(ao2 + 3); data12 = *(ao3 + 3); b[ 0] = ONE; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = ZERO; b[ 5] = ONE; b[ 6] = data07; b[ 7] = data08; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ONE; b[11] = data12; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ONE; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); data11 = *(ao3 + 2); data12 = *(ao3 + 3); data16 = *(ao4 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = ZERO; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = data11; b[11] = data12; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = data16; #endif ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i) { if (X > posY) { if (m & 2) { ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } if (m & 1) { ao1 += 1; ao2 += 1; ao3 += 1; ao4 += 1; b += 4; } } else if (X < posY) { if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); 
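/* As in the main loop above, the comparison of X with posY picks one of three cases: X > posY blocks are skipped (pointers advance, nothing is stored, since those entries are presumably never referenced by the consuming TRMM kernel), X < posY blocks are copied verbatim, and X == posY packs the triangular diagonal block with ZERO fill and ONE on the diagonal when UNIT is defined. */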
data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; ao1 += lda; b += 4; } } else { #ifdef UNIT data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); if (i >= 2) { data07 = *(ao2 + 2); data08 = *(ao2 + 3); } if (i >= 3) { data12 = *(ao3 + 3); } b[ 0] = ONE; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; if(i >= 2) { b[ 0] = ZERO; b[ 1] = ONE; b[ 2] = data07; b[ 3] = data08; b += 4; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ONE; b[ 3] = data12; b += 4; } #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); if (i >= 2) { data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); } if (i >= 3) { data11 = *(ao3 + 2); data12 = *(ao3 + 3); } b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; if(i >= 2) { b[ 0] = ZERO; b[ 1] = data06; b[ 2] = data07; b[ 3] = data08; b += 4; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = data11; b[ 3] = data12; b += 4; } #endif } } posY += 4; js --; } while (js > 0); } /* End of main loop */ if (n & 2){ X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; } else { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X > posY) { ao1 += 2; ao2 += 2; b += 4; } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data05 = *(ao2 + 0); data06 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data02; b[ 2] = data05; b[ 3] = data06; ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } else { #ifdef UNIT data02 = *(ao1 + 1); b[ 0] = ONE; b[ 1] = data02; b[ 2] = ZERO; b[ 3] = ONE; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data06 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data02; b[ 2] = ZERO; b[ 3] = data06; #endif ao1 += 2; ao2 += 2; b += 4; } X += 2; i --; } while (i > 0); } i = (m & 1); if (i) { if (X > posY) { ao1 += 1; ao2 += 1; b += 2; } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; ao1 += lda; b += 2; } else { #ifdef UNIT data02 = *(ao1 + 1); b[ 0] = ONE; b[ 1] = data02; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; #endif b += 2; } } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; } else { ao1 = a + posX + (posY + 0) * lda; } i = m; if (i > 0) { do { if (X > posY) { b += 1; ao1 += 1; } else if (X < posY) { data01 = *(ao1 + 0); b[ 0] = data01; ao1 += lda; b += 1; } else { #ifdef UNIT b[ 0] = ONE; #else data01 = *(ao1 + 0); b[ 0] = data01; #endif ao1 += 1; b += 1; } X ++; i --; } while (i > 0); } posY += 1; } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_ltcopy_6.c000066400000000000000000000242071313527062700210070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT *ao1, *ao2, *ao3, *ao4; js = (n >> 2); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; ao3 = a + posY + (posX + 2) * lda; ao4 = a + posY + (posX + 3) * lda; } else { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; ao3 = a + posX + (posY + 2) * lda; ao4 = a + posX + (posY + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X > posY) { ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); data12 = *(ao3 + 3); data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 16; } else { #ifdef UNIT data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data07 = *(ao2 + 2); data08 = *(ao2 + 3); data12 = *(ao3 + 3); b[ 0] = ONE; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = ZERO; b[ 5] = ONE; b[ 6] = data07; b[ 7] = data08; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ONE; b[11] = data12; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ONE; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 
3); data11 = *(ao3 + 2); data12 = *(ao3 + 3); data16 = *(ao4 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = ZERO; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = data11; b[11] = data12; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = data16; #endif ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i) { if (X > posY) { if (m & 2) { ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } if (m & 1) { ao1 += 1; ao2 += 1; ao3 += 1; ao4 += 1; b += 4; } } else if (X < posY) { if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; ao1 += lda; b += 4; } } else { #ifdef UNIT data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); if (i >= 2) { data07 = *(ao2 + 2); data08 = *(ao2 + 3); } if (i >= 3) { data12 = *(ao3 + 3); } b[ 0] = ONE; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; if(i >= 2) { b[ 0] = ZERO; b[ 1] = ONE; b[ 2] = data07; b[ 3] = data08; b += 4; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ONE; b[ 3] = data12; b += 4; } #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); if (i >= 2) { data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); } if (i >= 3) { data11 = *(ao3 + 2); data12 = *(ao3 + 3); } b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; if(i >= 2) { b[ 0] = ZERO; b[ 1] = data06; b[ 2] = data07; b[ 3] = data08; b += 4; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = data11; b[ 3] = data12; b += 4; } #endif } } posY += 4; js --; } while (js > 0); } /* End of main loop */ if (n & 2){ X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; } else { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X > posY) { ao1 += 2; ao2 += 2; b += 4; } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data05 = *(ao2 + 0); data06 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data02; b[ 2] = data05; b[ 3] = data06; ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } else { #ifdef UNIT data02 = *(ao1 + 1); b[ 0] = ONE; b[ 1] = data02; b[ 2] = ZERO; b[ 3] = ONE; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data06 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data02; b[ 2] = ZERO; b[ 3] = data06; #endif ao1 += 2; ao2 += 2; b += 4; } X += 2; i --; } while (i > 0); } i = (m & 1); if (i) { if (X > posY) { ao1 += 1; ao2 += 1; b += 2; } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; ao1 += lda; b += 2; } else { #ifdef UNIT data02 = *(ao1 + 1); b[ 0] = ONE; b[ 1] = data02; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; #endif b += 2; } } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; } else { ao1 = a + posX + (posY + 0) * lda; } i = m; if (i > 0) { do { if (X > posY) { b += 1; ao1 += 1; } else if (X < posY) { data01 = *(ao1 + 0); b[ 0] = data01; ao1 += lda; b += 1; } else { #ifdef UNIT b[ 0] = ONE; #else data01 = *(ao1 + 0); b[ 0] = 
data01; #endif ao1 += 1; b += 1; } X ++; i --; } while (i > 0); } posY += 1; } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_ltcopy_8.c000066400000000000000000000576401313527062700210200ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT data17, data18, data19, data20, data21, data22, data23, data24; FLOAT data25, data26, data27, data28, data29, data30, data31, data32; FLOAT data33, data34, data35, data36, data37, data38, data39, data40; FLOAT data41, data42, data43, data44, data45, data46, data47, data48; FLOAT data49, data50, data51, data52, data53, data54, data55, data56; FLOAT data57, data58, data59, data60, data61, data62, data63, data64; FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; js = (n >> 3); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; ao3 = a + posY + (posX + 2) * lda; ao4 = a + posY + (posX + 3) * lda; ao5 = a + posY + (posX + 4) * lda; ao6 = a + posY + (posX + 5) * lda; ao7 = a + posY + (posX + 6) * lda; ao8 = a + posY + (posX + 7) * lda; } else { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; ao3 = a + posX + (posY + 2) * lda; ao4 = a + posX + (posY + 3) * lda; ao5 = a + posX + (posY + 4) * lda; ao6 = a + posX + (posY + 5) * lda; ao7 = a + posX + (posY + 6) * lda; ao8 = a + posX + (posY + 7) * lda; } i = (m >> 3); if (i > 0) { do { if (X > posY) { ao1 += 8; ao2 += 8; ao3 += 8; ao4 += 8; ao5 += 8; ao6 += 8; ao7 += 8; ao8 += 8; b += 64; } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data21 = *(ao3 + 4); data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); data29 = *(ao4 + 4); data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); data33 = *(ao5 + 0); data34 = *(ao5 + 1); data35 = *(ao5 + 2); data36 = *(ao5 + 3); data37 = *(ao5 + 4); data38 = *(ao5 + 5); data39 = *(ao5 + 6); data40 = *(ao5 + 7); data41 = *(ao6 + 0); data42 = *(ao6 + 1); data43 = *(ao6 + 2); data44 = *(ao6 + 3); data45 = *(ao6 + 4); data46 = *(ao6 + 5); data47 = *(ao6 + 6); data48 = *(ao6 + 7); data49 = *(ao7 + 0); data50 = *(ao7 + 1); data51 = *(ao7 + 2); data52 = *(ao7 + 3); data53 = *(ao7 + 4); data54 = *(ao7 + 5); data55 = *(ao7 + 6); data56 = *(ao7 + 7); data57 = *(ao8 + 0); data58 = *(ao8 + 1); data59 = *(ao8 + 2); data60 = *(ao8 + 3); data61 = *(ao8 + 4); data62 = *(ao8 + 5); data63 = *(ao8 + 6); data64 = *(ao8 + 7); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; b[16] = data17; b[17] = data18; b[18] = data19; b[19] = data20; b[20] = data21; b[21] = data22; b[22] = data23; b[23] = data24; b[24] = data25; b[25] = data26; b[26] = data27; b[27] = data28; b[28] = data29; b[29] = data30; b[30] = data31; b[31] = data32; b[32] = data33; b[33] = data34; b[34] = data35; b[35] = 
data36; b[36] = data37; b[37] = data38; b[38] = data39; b[39] = data40; b[40] = data41; b[41] = data42; b[42] = data43; b[43] = data44; b[44] = data45; b[45] = data46; b[46] = data47; b[47] = data48; b[48] = data49; b[49] = data50; b[50] = data51; b[51] = data52; b[52] = data53; b[53] = data54; b[54] = data55; b[55] = data56; b[56] = data57; b[57] = data58; b[58] = data59; b[59] = data60; b[60] = data61; b[61] = data62; b[62] = data63; b[63] = data64; ao1 += 8 * lda; ao2 += 8 * lda; ao3 += 8 * lda; ao4 += 8 * lda; ao5 += 8 * lda; ao6 += 8 * lda; ao7 += 8 * lda; ao8 += 8 * lda; b += 64; } else { #ifndef UNIT data01 = *(ao1 + 0); #endif data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); #ifndef UNIT data10 = *(ao2 + 1); #endif data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); #ifndef UNIT data19 = *(ao3 + 2); #endif data20 = *(ao3 + 3); data21 = *(ao3 + 4); data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); #ifndef UNIT data28 = *(ao4 + 3); #endif data29 = *(ao4 + 4); data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); #ifndef UNIT data37 = *(ao5 + 4); #endif data38 = *(ao5 + 5); data39 = *(ao5 + 6); data40 = *(ao5 + 7); #ifndef UNIT data46 = *(ao6 + 5); #endif data47 = *(ao6 + 6); data48 = *(ao6 + 7); #ifndef UNIT data55 = *(ao7 + 6); #endif data56 = *(ao7 + 7); #ifndef UNIT data64 = *(ao8 + 7); #endif #ifdef UNIT b[ 0] = ONE; #else b[ 0] = data01; #endif b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = ZERO; #ifdef UNIT b[ 9] = ONE; #else b[ 9] = data10; #endif b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; b[16] = ZERO; b[17] = ZERO; #ifdef UNIT b[18] = ONE; #else b[18] = data19; #endif b[19] = data20; b[20] = data21; b[21] = data22; b[22] = data23; b[23] = data24; b[24] = ZERO; b[25] = ZERO; b[26] = ZERO; #ifdef UNIT b[27] = ONE; #else b[27] = data28; #endif b[28] = data29; b[29] = data30; b[30] = data31; b[31] = data32; b[32] = ZERO; b[33] = ZERO; b[34] = ZERO; b[35] = ZERO; #ifdef UNIT b[36] = ONE; #else b[36] = data37; #endif b[37] = data38; b[38] = data39; b[39] = data40; b[40] = ZERO; b[41] = ZERO; b[42] = ZERO; b[43] = ZERO; b[44] = ZERO; #ifdef UNIT b[45] = ONE; #else b[45] = data46; #endif b[46] = data47; b[47] = data48; b[48] = ZERO; b[49] = ZERO; b[50] = ZERO; b[51] = ZERO; b[52] = ZERO; b[53] = ZERO; #ifdef UNIT b[54] = ONE; #else b[54] = data55; #endif b[55] = data56; b[56] = ZERO; b[57] = ZERO; b[58] = ZERO; b[59] = ZERO; b[60] = ZERO; b[61] = ZERO; b[62] = ZERO; #ifdef UNIT b[63] = ONE; #else b[63] = data64; #endif ao1 += 8; ao2 += 8; ao3 += 8; ao4 += 8; ao5 += 8; ao6 += 8; ao7 += 8; ao8 += 8; b += 64; } X += 8; i --; } while (i > 0); } i = (m & 7); if (i) { if (X > posY) { if (m & 4) { ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; ao5 += 4; ao6 += 4; ao7 += 4; ao8 += 4; b += 32; } if (m & 2) { ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; ao5 += 2; ao6 += 2; ao7 += 2; ao8 += 2; b += 16; } if (m & 1) { b += 8; } } else if (X < posY) { if (m & 4) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); 
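/* X < posY: off-diagonal strips are copied through unchanged, eight consecutive elements from each source pointer, so only the X == posY case below needs the ZERO / unit-diagonal triangular fill. */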
data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data21 = *(ao3 + 4); data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); data29 = *(ao4 + 4); data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; b[16] = data17; b[17] = data18; b[18] = data19; b[19] = data20; b[20] = data21; b[21] = data22; b[22] = data23; b[23] = data24; b[24] = data25; b[25] = data26; b[26] = data27; b[27] = data28; b[28] = data29; b[29] = data30; b[30] = data31; b[31] = data32; ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 32; } if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; ao1 += 2 * lda; b += 16; } if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b += 8; } } else { #ifndef UNIT data01 = *(ao1 + 0); #endif data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); if (i >= 2) { #ifndef UNIT data10 = *(ao2 + 1); #endif data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); } if (i >= 3) { #ifndef UNIT data19 = *(ao3 + 2); #endif data20 = *(ao3 + 3); data21 = *(ao3 + 4); data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); } if (i >= 4) { #ifndef UNIT data28 = *(ao4 + 3); #endif data29 = *(ao4 + 4); data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); } if (i >= 5) { #ifndef UNIT data37 = *(ao5 + 4); #endif data38 = *(ao5 + 5); data39 = *(ao5 + 6); data40 = *(ao5 + 7); } if (i >= 6) { #ifndef UNIT data46 = *(ao6 + 5); #endif data47 = *(ao6 + 6); data48 = *(ao6 + 7); } if (i >= 7) { #ifndef UNIT data55 = *(ao7 + 6); #endif data56 = *(ao7 + 7); } #ifdef UNIT b[ 0] = ONE; #else b[ 0] = data01; #endif b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b += 8; if(i >= 2) { b[ 0] = ZERO; #ifdef UNIT b[ 1] = ONE; #else b[ 1] = data10; #endif b[ 2] = data11; b[ 3] = data12; b[ 4] = data13; b[ 5] = data14; b[ 6] = data15; b[ 7] = data16; b += 8; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; #ifdef UNIT b[ 2] = ONE; #else b[ 2] = data19; #endif b[ 3] = data20; b[ 4] = data21; b[ 5] = data22; b[ 6] = data23; b[ 7] = data24; b += 8; } if (i >= 4) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; #ifdef UNIT b[ 3] = ONE; #else b[ 3] = data28; #endif b[ 4] = data29; b[ 5] = data30; b[ 6] = 
data31; b[ 7] = data32; b += 8; } if (i >= 5) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; #ifdef UNIT b[ 4] = ONE; #else b[ 4] = data37; #endif b[ 5] = data38; b[ 6] = data39; b[ 7] = data40; b += 8; } if (i >= 6) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; #ifdef UNIT b[ 5] = ONE; #else b[ 5] = data46; #endif b[ 6] = data47; b[ 7] = data48; b += 8; } if (i >= 7) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; #ifdef UNIT b[ 6] = ONE; #else b[ 6] = data55; #endif b[ 7] = data56; b += 8; } } } posY += 8; js --; } while (js > 0); } /* End of main loop */ if (n & 4){ X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; ao3 = a + posY + (posX + 2) * lda; ao4 = a + posY + (posX + 3) * lda; } else { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; ao3 = a + posX + (posY + 2) * lda; ao4 = a + posX + (posY + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X > posY) { ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data09; b[ 5] = data10; b[ 6] = data11; b[ 7] = data12; b[ 8] = data17; b[ 9] = data18; b[10] = data19; b[11] = data20; b[12] = data25; b[13] = data26; b[14] = data27; b[15] = data28; b += 16; } else { #ifdef UNIT data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data20 = *(ao3 + 3); b[ 0] = ONE; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = ZERO; b[ 5] = ONE; b[ 6] = data11; b[ 7] = data12; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ONE; b[11] = data20; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ONE; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data28 = *(ao4 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = ZERO; b[ 5] = data10; b[ 6] = data11; b[ 7] = data12; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = data19; b[11] = data20; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = data28; #endif ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i) { if (X > posY) { if (m & 2) { ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } if (m & 1) { b += 4; } } else if (X < posY) { if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data09; b[ 5] = data10; b[ 6] = data11; b[ 7] = data12; ao1 += 2 * lda; b += 8; } if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; } } else { #ifndef UNIT data01 = *(ao1 + 0); #endif data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); if (i >= 2) { #ifndef UNIT data10 = *(ao2 + 1); #endif data11 = *(ao2 + 2); data12 = *(ao2 + 3); } if 
(i >= 3) { #ifndef UNIT data19 = *(ao3 + 2); #endif data20 = *(ao3 + 3); } #ifdef UNIT b[ 0] = ONE; #else b[ 0] = data01; #endif b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; if(i >= 2) { b[ 0] = ZERO; #ifdef UNIT b[ 1] = ONE; #else b[ 1] = data10; #endif b[ 2] = data11; b[ 3] = data12; b += 4; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; #ifdef UNIT b[ 2] = ONE; #else b[ 2] = data19; #endif b[ 3] = data20; b += 4; } } } posY += 4; } if (n & 2){ X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; } else { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X > posY) { ao1 += 2; ao2 += 2; b += 4; } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); ao1 += 2 * lda; ao2 += 2 * lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; b[ 3] = data10; b += 4; } else { #ifdef UNIT data02 = *(ao1 + 1); b[ 0] = ONE; b[ 1] = data02; b[ 2] = ZERO; b[ 3] = ONE; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data10 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data02; b[ 2] = ZERO; b[ 3] = data10; #endif ao1 += 2; ao2 += 2; b += 4; } X += 2; i --; } while (i > 0); } if (m & 1) { if (X > posY) { b += 2; } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; b += 2; } else { #ifdef UNIT data09 = *(ao2 + 0); b[ 0] = ONE; b[ 1] = data09; #else data01 = *(ao1 + 0); data09 = *(ao2 + 0); b[ 0] = data01; b[ 1] = data09; #endif b += 2; } } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posY + (posX + 0) * lda; } else { ao1 = a + posX + (posY + 0) * lda; } i = m; if (m > 0) { do { if (X > posY) { ao1 += 1; b += 1; } else if (X < posY) { data01 = *(ao1 + 0); ao1 += lda; b[ 0] = data01; b += 1; } else { #ifdef UNIT b[ 0] = ONE; #else data01 = *(ao1 + 0); b[ 0] = data01; #endif ao1 ++; b ++; } X += 1; i --; } while (i > 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_uncopy_1.c000066400000000000000000000065331313527062700210070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, X; FLOAT data01; FLOAT *ao1; while (n > 0) { X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; } else { ao1 = a + posY + (posX + 0) * lda; } i = m; if (m > 0) { do { if (X < posY) { data01 = *(ao1 + 0); b[ 0] = data01; ao1 += 1; b += 1; } else if (X > posY) { ao1 += lda; b += 1; } else { #ifdef UNIT b[ 0] = ONE; #else data01 = *(ao1 + 0); b[ 0] = data01; #endif b += 1; ao1 += lda; } X += 1; i --; } while (i > 0); } posY ++; n --; } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_uncopy_16.c000066400000000000000000000754271313527062700211050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X, ii; FLOAT *a01, *a02, *a03 ,*a04, *a05, *a06, *a07, *a08; FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; js = (n >> 4); if (js > 0){ do { X = posX; if (posX <= posY) { a01 = a + posX + (posY + 0) * lda; a02 = a + posX + (posY + 1) * lda; a03 = a + posX + (posY + 2) * lda; a04 = a + posX + (posY + 3) * lda; a05 = a + posX + (posY + 4) * lda; a06 = a + posX + (posY + 5) * lda; a07 = a + posX + (posY + 6) * lda; a08 = a + posX + (posY + 7) * lda; a09 = a + posX + (posY + 8) * lda; a10 = a + posX + (posY + 9) * lda; a11 = a + posX + (posY + 10) * lda; a12 = a + posX + (posY + 11) * lda; a13 = a + posX + (posY + 12) * lda; a14 = a + posX + (posY + 13) * lda; a15 = a + posX + (posY + 14) * lda; a16 = a + posX + (posY + 15) * lda; } else { a01 = a + posY + (posX + 0) * lda; a02 = a + posY + (posX + 1) * lda; a03 = a + posY + (posX + 2) * lda; a04 = a + posY + (posX + 3) * lda; a05 = a + posY + (posX + 4) * lda; a06 = a + posY + (posX + 5) * lda; a07 = a + posY + (posX + 6) * lda; a08 = a + posY + (posX + 7) * lda; a09 = a + posY + (posX + 8) * lda; a10 = a + posY + (posX + 9) * lda; a11 = a + posY + (posX + 10) * lda; a12 = a + posY + (posX + 11) * lda; a13 = a + posY + (posX + 12) * lda; a14 = a + posY + (posX + 13) * lda; a15 = a + posY + (posX + 14) * lda; a16 = a + posY + (posX + 15) * lda; } i = (m >> 4); if (i > 0) { do { if (X < posY) { for (ii = 0; ii < 16; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); b[ 4] = *(a05 + 0); b[ 5] = *(a06 + 0); b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); b[ 8] = *(a09 + 0); b[ 9] = *(a10 + 0); b[ 10] = *(a11 + 0); b[ 11] = *(a12 + 0); b[ 12] = *(a13 + 0); b[ 13] = *(a14 + 0); b[ 14] = *(a15 + 0); b[ 15] = *(a16 + 0); a01 ++; a02 ++; a03 ++; a04 ++; a05 ++; a06 ++; a07 ++; a08 ++; a09 ++; a10 ++; a11 ++; a12 ++; a13 ++; a14 ++; a15 ++; a16 ++; b += 16; } } else if (X > posY) { a01 += 16 * lda; a02 += 16 * lda; a03 += 16 * lda; a04 += 16 * lda; a05 += 16 * lda; a06 += 16 * lda; a07 += 16 * lda; a08 += 16 * lda; a09 += 16 * lda; a10 += 16 * lda; a11 += 16 * lda; a12 += 16 * lda; a13 += 16 * lda; a14 += 16 * lda; a15 += 16 * lda; a16 += 16 * lda; b += 256; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); b[ 4] = *(a05 + 0); b[ 5] = *(a06 + 0); b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); b[ 8] = *(a09 + 0); b[ 9] = *(a10 + 0); b[ 10] = *(a11 + 0); b[ 11] = *(a12 + 0); b[ 12] = *(a13 + 0); b[ 13] = *(a14 + 0); b[ 14] = *(a15 + 0); b[ 15] = *(a16 + 0); b[ 16] = ZERO; #ifdef UNIT b[ 17] = ONE; #else b[ 17] = *(a02 + 1); #endif b[ 18] = *(a03 + 1); b[ 19] = *(a04 + 1); b[ 20] = *(a05 + 1); b[ 21] = *(a06 + 1); b[ 22] = *(a07 + 1); b[ 23] = *(a08 + 1); b[ 24] = *(a09 + 1); b[ 25] = *(a10 + 1); b[ 26] = *(a11 + 1); b[ 27] = *(a12 + 1); b[ 28] = *(a13 + 1); b[ 29] = *(a14 + 1); b[ 30] = *(a15 + 1); b[ 31] = *(a16 + 1); b[ 32] = ZERO; b[ 33] = ZERO; #ifdef UNIT b[ 34] = ONE; #else b[ 34] = *(a03 + 2); #endif b[ 35] = *(a04 + 2); b[ 36] = *(a05 + 2); b[ 37] = *(a06 + 2); b[ 38] = *(a07 + 2); b[ 39] = *(a08 + 2); b[ 40] = *(a09 + 2); b[ 41] = *(a10 + 2); b[ 42] = *(a11 + 2); b[ 43] = *(a12 + 2); b[ 44] = *(a13 + 2); b[ 45] = *(a14 + 2); b[ 46] = *(a15 + 2); b[ 47] = *(a16 + 2); b[ 48] = ZERO; b[ 49] = ZERO; b[ 50] = ZERO; #ifdef 
UNIT b[ 51] = ONE; #else b[ 51] = *(a04 + 3); #endif b[ 52] = *(a05 + 3); b[ 53] = *(a06 + 3); b[ 54] = *(a07 + 3); b[ 55] = *(a08 + 3); b[ 56] = *(a09 + 3); b[ 57] = *(a10 + 3); b[ 58] = *(a11 + 3); b[ 59] = *(a12 + 3); b[ 60] = *(a13 + 3); b[ 61] = *(a14 + 3); b[ 62] = *(a15 + 3); b[ 63] = *(a16 + 3); b[ 64] = ZERO; b[ 65] = ZERO; b[ 66] = ZERO; b[ 67] = ZERO; #ifdef UNIT b[ 68] = ONE; #else b[ 68] = *(a05 + 4); #endif b[ 69] = *(a06 + 4); b[ 70] = *(a07 + 4); b[ 71] = *(a08 + 4); b[ 72] = *(a09 + 4); b[ 73] = *(a10 + 4); b[ 74] = *(a11 + 4); b[ 75] = *(a12 + 4); b[ 76] = *(a13 + 4); b[ 77] = *(a14 + 4); b[ 78] = *(a15 + 4); b[ 79] = *(a16 + 4); b[ 80] = ZERO; b[ 81] = ZERO; b[ 82] = ZERO; b[ 83] = ZERO; b[ 84] = ZERO; #ifdef UNIT b[ 85] = ONE; #else b[ 85] = *(a06 + 5); #endif b[ 86] = *(a07 + 5); b[ 87] = *(a08 + 5); b[ 88] = *(a09 + 5); b[ 89] = *(a10 + 5); b[ 90] = *(a11 + 5); b[ 91] = *(a12 + 5); b[ 92] = *(a13 + 5); b[ 93] = *(a14 + 5); b[ 94] = *(a15 + 5); b[ 95] = *(a16 + 5); b[ 96] = ZERO; b[ 97] = ZERO; b[ 98] = ZERO; b[ 99] = ZERO; b[100] = ZERO; b[101] = ZERO; #ifdef UNIT b[102] = ONE; #else b[102] = *(a07 + 6); #endif b[103] = *(a08 + 6); b[104] = *(a09 + 6); b[105] = *(a10 + 6); b[106] = *(a11 + 6); b[107] = *(a12 + 6); b[108] = *(a13 + 6); b[109] = *(a14 + 6); b[110] = *(a15 + 6); b[111] = *(a16 + 6); b[112] = ZERO; b[113] = ZERO; b[114] = ZERO; b[115] = ZERO; b[116] = ZERO; b[117] = ZERO; b[118] = ZERO; #ifdef UNIT b[119] = ONE; #else b[119] = *(a08 + 7); #endif b[120] = *(a09 + 7); b[121] = *(a10 + 7); b[122] = *(a11 + 7); b[123] = *(a12 + 7); b[124] = *(a13 + 7); b[125] = *(a14 + 7); b[126] = *(a15 + 7); b[127] = *(a16 + 7); b[128] = ZERO; b[129] = ZERO; b[130] = ZERO; b[131] = ZERO; b[132] = ZERO; b[133] = ZERO; b[134] = ZERO; b[135] = ZERO; #ifdef UNIT b[136] = ONE; #else b[136] = *(a09 + 8); #endif b[137] = *(a10 + 8); b[138] = *(a11 + 8); b[139] = *(a12 + 8); b[140] = *(a13 + 8); b[141] = *(a14 + 8); b[142] = *(a15 + 8); b[143] = *(a16 + 8); b[144] = ZERO; b[145] = ZERO; b[146] = ZERO; b[147] = ZERO; b[148] = ZERO; b[149] = ZERO; b[150] = ZERO; b[151] = ZERO; b[152] = ZERO; #ifdef UNIT b[153] = ONE; #else b[153] = *(a10 + 9); #endif b[154] = *(a11 + 9); b[155] = *(a12 + 9); b[156] = *(a13 + 9); b[157] = *(a14 + 9); b[158] = *(a15 + 9); b[159] = *(a16 + 9); b[160] = ZERO; b[161] = ZERO; b[162] = ZERO; b[163] = ZERO; b[164] = ZERO; b[165] = ZERO; b[166] = ZERO; b[167] = ZERO; b[168] = ZERO; b[169] = ZERO; #ifdef UNIT b[170] = ONE; #else b[170] = *(a11 + 10); #endif b[171] = *(a12 + 10); b[172] = *(a13 + 10); b[173] = *(a14 + 10); b[174] = *(a15 + 10); b[175] = *(a16 + 10); b[176] = ZERO; b[177] = ZERO; b[178] = ZERO; b[179] = ZERO; b[180] = ZERO; b[181] = ZERO; b[182] = ZERO; b[183] = ZERO; b[184] = ZERO; b[185] = ZERO; b[186] = ZERO; #ifdef UNIT b[187] = ONE; #else b[187] = *(a12 + 11); #endif b[188] = *(a13 + 11); b[189] = *(a14 + 11); b[190] = *(a15 + 11); b[191] = *(a16 + 11); b[192] = ZERO; b[193] = ZERO; b[194] = ZERO; b[195] = ZERO; b[196] = ZERO; b[197] = ZERO; b[198] = ZERO; b[199] = ZERO; b[200] = ZERO; b[201] = ZERO; b[202] = ZERO; b[203] = ZERO; #ifdef UNIT b[204] = ONE; #else b[204] = *(a13 + 12); #endif b[205] = *(a14 + 12); b[206] = *(a15 + 12); b[207] = *(a16 + 12); b[208] = ZERO; b[209] = ZERO; b[210] = ZERO; b[211] = ZERO; b[212] = ZERO; b[213] = ZERO; b[214] = ZERO; b[215] = ZERO; b[216] = ZERO; b[217] = ZERO; b[218] = ZERO; b[219] = ZERO; b[220] = ZERO; #ifdef UNIT b[221] = ONE; #else b[221] = *(a14 + 13); #endif b[222] = *(a15 + 13); b[223] = 
*(a16 + 13); b[224] = ZERO; b[225] = ZERO; b[226] = ZERO; b[227] = ZERO; b[228] = ZERO; b[229] = ZERO; b[230] = ZERO; b[231] = ZERO; b[232] = ZERO; b[233] = ZERO; b[234] = ZERO; b[235] = ZERO; b[236] = ZERO; b[237] = ZERO; #ifdef UNIT b[238] = ONE; #else b[238] = *(a15 + 14); #endif b[239] = *(a16 + 14); b[240] = ZERO; b[241] = ZERO; b[242] = ZERO; b[243] = ZERO; b[244] = ZERO; b[245] = ZERO; b[246] = ZERO; b[247] = ZERO; b[248] = ZERO; b[249] = ZERO; b[250] = ZERO; b[251] = ZERO; b[252] = ZERO; b[253] = ZERO; b[254] = ZERO; #ifdef UNIT b[255] = ONE; #else b[255] = *(a16 + 15); #endif a01 += 16 * lda; a02 += 16 * lda; a03 += 16 * lda; a04 += 16 * lda; a05 += 16 * lda; a06 += 16 * lda; a07 += 16 * lda; a08 += 16 * lda; a09 += 16 * lda; a10 += 16 * lda; a11 += 16 * lda; a12 += 16 * lda; a13 += 16 * lda; a14 += 16 * lda; a15 += 16 * lda; a16 += 16 * lda; b += 256; } X += 16; i --; } while (i > 0); } i = (m & 15); if (i) { if (X < posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); b[ 4] = *(a05 + 0); b[ 5] = *(a06 + 0); b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); b[ 8] = *(a09 + 0); b[ 9] = *(a10 + 0); b[ 10] = *(a11 + 0); b[ 11] = *(a12 + 0); b[ 12] = *(a13 + 0); b[ 13] = *(a14 + 0); b[ 14] = *(a15 + 0); b[ 15] = *(a16 + 0); a01 ++; a02 ++; a03 ++; a04 ++; a05 ++; a06 ++; a07 ++; a08 ++; a09 ++; a10 ++; a11 ++; a12 ++; a13 ++; a14 ++; a15 ++; a16 ++; b += 16; } } else if (X > posY) { a01 += i * lda; a02 += i * lda; a03 += i * lda; a04 += i * lda; a05 += i * lda; a06 += i * lda; a07 += i * lda; a08 += i * lda; a09 += i * lda; a10 += i * lda; a11 += i * lda; a12 += i * lda; a13 += i * lda; a14 += i * lda; a15 += i * lda; a16 += i * lda; b += 16 * i; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); b[ 4] = *(a05 + 0); b[ 5] = *(a06 + 0); b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); b[ 8] = *(a09 + 0); b[ 9] = *(a10 + 0); b[ 10] = *(a11 + 0); b[ 11] = *(a12 + 0); b[ 12] = *(a13 + 0); b[ 13] = *(a14 + 0); b[ 14] = *(a15 + 0); b[ 15] = *(a16 + 0); b += 16; if (i >= 2) { b[ 0] = ZERO; #ifdef UNIT b[ 1] = ONE; #else b[ 1] = *(a02 + 1); #endif b[ 2] = *(a03 + 1); b[ 3] = *(a04 + 1); b[ 4] = *(a05 + 1); b[ 5] = *(a06 + 1); b[ 6] = *(a07 + 1); b[ 7] = *(a08 + 1); b[ 8] = *(a09 + 1); b[ 9] = *(a10 + 1); b[ 10] = *(a11 + 1); b[ 11] = *(a12 + 1); b[ 12] = *(a13 + 1); b[ 13] = *(a14 + 1); b[ 14] = *(a15 + 1); b[ 15] = *(a16 + 1); b += 16; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; #ifdef UNIT b[ 2] = ONE; #else b[ 2] = *(a03 + 2); #endif b[ 3] = *(a04 + 2); b[ 4] = *(a05 + 2); b[ 5] = *(a06 + 2); b[ 6] = *(a07 + 2); b[ 7] = *(a08 + 2); b[ 8] = *(a09 + 2); b[ 9] = *(a10 + 2); b[ 10] = *(a11 + 2); b[ 11] = *(a12 + 2); b[ 12] = *(a13 + 2); b[ 13] = *(a14 + 2); b[ 14] = *(a15 + 2); b[ 15] = *(a16 + 2); b += 16; } if (i >= 4) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; #ifdef UNIT b[ 3] = ONE; #else b[ 3] = *(a04 + 3); #endif b[ 4] = *(a05 + 3); b[ 5] = *(a06 + 3); b[ 6] = *(a07 + 3); b[ 7] = *(a08 + 3); b[ 8] = *(a09 + 3); b[ 9] = *(a10 + 3); b[ 10] = *(a11 + 3); b[ 11] = *(a12 + 3); b[ 12] = *(a13 + 3); b[ 13] = *(a14 + 3); b[ 14] = *(a15 + 3); b[ 15] = *(a16 + 3); b += 16; } if (i >= 5) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; #ifdef UNIT b[ 4] = ONE; #else b[ 4] = *(a05 + 4); #endif b[ 5] = *(a06 + 4); b[ 6] = *(a07 + 4); b[ 7] = *(a08 + 4); b[ 8] = *(a09 + 4); b[ 9] = *(a10 + 4); b[ 10] = *(a11 + 4); b[ 11] = *(a12 + 4); b[ 12] = *(a13 + 4); b[ 13] = *(a14 + 
4); b[ 14] = *(a15 + 4); b[ 15] = *(a16 + 4); b += 16; } if (i >= 6) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; #ifdef UNIT b[ 5] = ONE; #else b[ 5] = *(a06 + 5); #endif b[ 6] = *(a07 + 5); b[ 7] = *(a08 + 5); b[ 8] = *(a09 + 5); b[ 9] = *(a10 + 5); b[ 10] = *(a11 + 5); b[ 11] = *(a12 + 5); b[ 12] = *(a13 + 5); b[ 13] = *(a14 + 5); b[ 14] = *(a15 + 5); b[ 15] = *(a16 + 5); b += 16; } if (i >= 7) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; #ifdef UNIT b[ 6] = ONE; #else b[ 6] = *(a07 + 6); #endif b[ 7] = *(a08 + 6); b[ 8] = *(a09 + 6); b[ 9] = *(a10 + 6); b[ 10] = *(a11 + 6); b[ 11] = *(a12 + 6); b[ 12] = *(a13 + 6); b[ 13] = *(a14 + 6); b[ 14] = *(a15 + 6); b[ 15] = *(a16 + 6); b += 16; } if (i >= 8) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; #ifdef UNIT b[ 7] = ONE; #else b[ 7] = *(a08 + 7); #endif b[ 8] = *(a09 + 7); b[ 9] = *(a10 + 7); b[ 10] = *(a11 + 7); b[ 11] = *(a12 + 7); b[ 12] = *(a13 + 7); b[ 13] = *(a14 + 7); b[ 14] = *(a15 + 7); b[ 15] = *(a16 + 7); b += 16; } if (i >= 9) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; #ifdef UNIT b[ 8] = ONE; #else b[ 8] = *(a09 + 8); #endif b[ 9] = *(a10 + 8); b[ 10] = *(a11 + 8); b[ 11] = *(a12 + 8); b[ 12] = *(a13 + 8); b[ 13] = *(a14 + 8); b[ 14] = *(a15 + 8); b[ 15] = *(a16 + 8); b += 16; } if (i >= 10) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; #ifdef UNIT b[ 9] = ONE; #else b[ 9] = *(a10 + 9); #endif b[ 10] = *(a11 + 9); b[ 11] = *(a12 + 9); b[ 12] = *(a13 + 9); b[ 13] = *(a14 + 9); b[ 14] = *(a15 + 9); b[ 15] = *(a16 + 9); b += 16; } if (i >= 11) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; #ifdef UNIT b[ 10] = ONE; #else b[ 10] = *(a11 + 10); #endif b[ 11] = *(a12 + 10); b[ 12] = *(a13 + 10); b[ 13] = *(a14 + 10); b[ 14] = *(a15 + 10); b[ 15] = *(a16 + 10); b += 16; } if (i >= 12) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[ 10] = ZERO; #ifdef UNIT b[ 11] = ONE; #else b[ 11] = *(a12 + 11); #endif b[ 12] = *(a13 + 11); b[ 13] = *(a14 + 11); b[ 14] = *(a15 + 11); b[ 15] = *(a16 + 11); b += 16; } if (i >= 13) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[ 10] = ZERO; b[ 11] = ZERO; #ifdef UNIT b[ 12] = ONE; #else b[ 12] = *(a13 + 12); #endif b[ 13] = *(a14 + 12); b[ 14] = *(a15 + 12); b[ 15] = *(a16 + 12); b += 16; } if (i >= 14) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[ 10] = ZERO; b[ 11] = ZERO; b[ 12] = ZERO; #ifdef UNIT b[ 13] = ONE; #else b[ 13] = *(a14 + 13); #endif b[ 14] = *(a15 + 13); b[ 15] = *(a16 + 13); b += 16; } if (i >= 15) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[ 10] = ZERO; b[ 11] = ZERO; b[ 12] = ZERO; b[ 13] = ZERO; #ifdef UNIT b[ 14] = ONE; #else b[ 14] = *(a15 + 14); #endif b[ 15] = *(a16 + 14); b += 16; } } } posY += 16; js --; } while (js > 0); } /* End of main loop */ if (n & 8){ X = posX; if (posX <= posY) { a01 = a + posX + (posY + 0) * lda; a02 
= a + posX + (posY + 1) * lda; a03 = a + posX + (posY + 2) * lda; a04 = a + posX + (posY + 3) * lda; a05 = a + posX + (posY + 4) * lda; a06 = a + posX + (posY + 5) * lda; a07 = a + posX + (posY + 6) * lda; a08 = a + posX + (posY + 7) * lda; } else { a01 = a + posY + (posX + 0) * lda; a02 = a + posY + (posX + 1) * lda; a03 = a + posY + (posX + 2) * lda; a04 = a + posY + (posX + 3) * lda; a05 = a + posY + (posX + 4) * lda; a06 = a + posY + (posX + 5) * lda; a07 = a + posY + (posX + 6) * lda; a08 = a + posY + (posX + 7) * lda; } i = (m >> 3); if (i > 0) { do { if (X < posY) { for (ii = 0; ii < 8; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); b[ 4] = *(a05 + 0); b[ 5] = *(a06 + 0); b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); a01 ++; a02 ++; a03 ++; a04 ++; a05 ++; a06 ++; a07 ++; a08 ++; b += 8; } } else if (X > posY) { a01 += 8 * lda; a02 += 8 * lda; a03 += 8 * lda; a04 += 8 * lda; a05 += 8 * lda; a06 += 8 * lda; a07 += 8 * lda; a08 += 8 * lda; b += 64; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); b[ 4] = *(a05 + 0); b[ 5] = *(a06 + 0); b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); b[ 8] = ZERO; #ifdef UNIT b[ 9] = ONE; #else b[ 9] = *(a02 + 1); #endif b[ 10] = *(a03 + 1); b[ 11] = *(a04 + 1); b[ 12] = *(a05 + 1); b[ 13] = *(a06 + 1); b[ 14] = *(a07 + 1); b[ 15] = *(a08 + 1); b[ 16] = ZERO; b[ 17] = ZERO; #ifdef UNIT b[ 18] = ONE; #else b[ 18] = *(a03 + 2); #endif b[ 19] = *(a04 + 2); b[ 20] = *(a05 + 2); b[ 21] = *(a06 + 2); b[ 22] = *(a07 + 2); b[ 23] = *(a08 + 2); b[ 24] = ZERO; b[ 25] = ZERO; b[ 26] = ZERO; #ifdef UNIT b[ 27] = ONE; #else b[ 27] = *(a04 + 3); #endif b[ 28] = *(a05 + 3); b[ 29] = *(a06 + 3); b[ 30] = *(a07 + 3); b[ 31] = *(a08 + 3); b[ 32] = ZERO; b[ 33] = ZERO; b[ 34] = ZERO; b[ 35] = ZERO; #ifdef UNIT b[ 36] = ONE; #else b[ 36] = *(a05 + 4); #endif b[ 37] = *(a06 + 4); b[ 38] = *(a07 + 4); b[ 39] = *(a08 + 4); b[ 40] = ZERO; b[ 41] = ZERO; b[ 42] = ZERO; b[ 43] = ZERO; b[ 44] = ZERO; #ifdef UNIT b[ 45] = ONE; #else b[ 45] = *(a06 + 5); #endif b[ 46] = *(a07 + 5); b[ 47] = *(a08 + 5); b[ 48] = ZERO; b[ 49] = ZERO; b[ 50] = ZERO; b[ 51] = ZERO; b[ 52] = ZERO; b[ 53] = ZERO; #ifdef UNIT b[ 54] = ONE; #else b[ 54] = *(a07 + 6); #endif b[ 55] = *(a08 + 6); b[ 56] = ZERO; b[ 57] = ZERO; b[ 58] = ZERO; b[ 59] = ZERO; b[ 60] = ZERO; b[ 61] = ZERO; b[ 62] = ZERO; #ifdef UNIT b[ 63] = ONE; #else b[ 63] = *(a08 + 7); #endif a01 += 8 * lda; a02 += 8 * lda; a03 += 8 * lda; a04 += 8 * lda; a05 += 8 * lda; a06 += 8 * lda; a07 += 8 * lda; a08 += 8 * lda; b += 64; } X += 8; i --; } while (i > 0); } i = (m & 7); if (i) { if (X < posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); b[ 4] = *(a05 + 0); b[ 5] = *(a06 + 0); b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); a01 ++; a02 ++; a03 ++; a04 ++; a05 ++; a06 ++; a07 ++; a08 ++; b += 8; } } else if (X > posY) { a01 += i * lda; a02 += i * lda; a03 += i * lda; a04 += i * lda; a05 += i * lda; a06 += i * lda; a07 += i * lda; a08 += i * lda; b += 8 * i; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); b[ 4] = *(a05 + 0); b[ 5] = *(a06 + 0); b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); b += 8; if (i >= 2) { b[ 0] = ZERO; #ifdef UNIT b[ 1] = ONE; #else b[ 1] = *(a02 + 1); #endif b[ 2] = *(a03 + 1); b[ 3] = *(a04 + 1); b[ 4] = *(a05 + 1); b[ 5] = *(a06 + 1); b[ 6] = *(a07 + 1); b[ 7] = *(a08 + 1); b += 8; } 
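/* Descriptive note (summarizing the code around it): for the tail rows (m & 7) of a diagonal 8-column block, row r is emitted with r leading ZEROs, then the diagonal entry (ONE when UNIT is defined, otherwise loaded from A), then the entries to its right; the i >= 3 .. i >= 7 cases that follow produce rows 2..6 of this pattern. */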
if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; #ifdef UNIT b[ 2] = ONE; #else b[ 2] = *(a03 + 2); #endif b[ 3] = *(a04 + 2); b[ 4] = *(a05 + 2); b[ 5] = *(a06 + 2); b[ 6] = *(a07 + 2); b[ 7] = *(a08 + 2); b += 8; } if (i >= 4) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; #ifdef UNIT b[ 3] = ONE; #else b[ 3] = *(a04 + 3); #endif b[ 4] = *(a05 + 3); b[ 5] = *(a06 + 3); b[ 6] = *(a07 + 3); b[ 7] = *(a08 + 3); b += 8; } if (i >= 5) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; #ifdef UNIT b[ 4] = ONE; #else b[ 4] = *(a05 + 4); #endif b[ 5] = *(a06 + 4); b[ 6] = *(a07 + 4); b[ 7] = *(a08 + 4); b += 8; } if (i >= 6) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; #ifdef UNIT b[ 5] = ONE; #else b[ 5] = *(a06 + 5); #endif b[ 6] = *(a07 + 5); b[ 7] = *(a08 + 5); b += 8; } if (i >= 7) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; #ifdef UNIT b[ 6] = ONE; #else b[ 6] = *(a07 + 6); #endif b[ 7] = *(a08 + 6); b += 8; } } } posY += 8; } if (n & 4){ X = posX; if (posX <= posY) { a01 = a + posX + (posY + 0) * lda; a02 = a + posX + (posY + 1) * lda; a03 = a + posX + (posY + 2) * lda; a04 = a + posX + (posY + 3) * lda; } else { a01 = a + posY + (posX + 0) * lda; a02 = a + posY + (posX + 1) * lda; a03 = a + posY + (posX + 2) * lda; a04 = a + posY + (posX + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X < posY) { for (ii = 0; ii < 4; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); a01 ++; a02 ++; a03 ++; a04 ++; b += 4; } } else if (X > posY) { a01 += 4 * lda; a02 += 4 * lda; a03 += 4 * lda; a04 += 4 * lda; b += 16; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); b[ 4] = ZERO; #ifdef UNIT b[ 5] = ONE; #else b[ 5] = *(a02 + 1); #endif b[ 6] = *(a03 + 1); b[ 7] = *(a04 + 1); b[ 8] = ZERO; b[ 9] = ZERO; #ifdef UNIT b[ 10] = ONE; #else b[ 10] = *(a03 + 2); #endif b[ 11] = *(a04 + 2); b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; #ifdef UNIT b[ 15] = ONE; #else b[ 15] = *(a04 + 3); #endif a01 += 4 * lda; a02 += 4 * lda; a03 += 4 * lda; a04 += 4 * lda; b += 16; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i) { if (X < posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); a01 ++; a02 ++; a03 ++; a04 ++; b += 4; } } else if (X > posY) { a01 += i * lda; a02 += i * lda; a03 += i * lda; a04 += i * lda; b += 4 * i; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); b += 4; if (i >= 2) { b[ 0] = ZERO; #ifdef UNIT b[ 1] = ONE; #else b[ 1] = *(a02 + 1); #endif b[ 2] = *(a03 + 1); b[ 3] = *(a04 + 1); b += 4; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; #ifdef UNIT b[ 2] = ONE; #else b[ 2] = *(a03 + 2); #endif b[ 3] = *(a04 + 2); b += 4; } } } posY += 4; } if (n & 2){ X = posX; if (posX <= posY) { a01 = a + posX + (posY + 0) * lda; a02 = a + posX + (posY + 1) * lda; } else { a01 = a + posY + (posX + 0) * lda; a02 = a + posY + (posX + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X < posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a01 + 1); b[ 3] = *(a02 + 1); a01 += 2; a02 += 2; b += 4; } else if (X > posY) { a01 += 2 * lda; a02 += 2 * lda; b += 4; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = *(a02 + 0); b[ 2] = ZERO; #ifdef UNIT b[ 3] = ONE; #else b[ 3] = *(a02 + 1); #endif a01 += 2 * lda; a02 += 2 * lda; b += 4; } X += 2; i --; } while (i > 0); } if (m & 
1) { if (X < posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); a01 ++; a02 ++; b += 2; } else if (X > posY) { a01 += lda; a02 += lda; b += 2; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = *(a02 + 0); b += 2; } } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { a01 = a + posX + (posY + 0) * lda; } else { a01 = a + posY + (posX + 0) * lda; } i = m; if (m > 0) { do { if (X < posY) { b[ 0] = *(a01 + 0); a01 += 1; b += 1; } else if (X > posY) { a01 += lda; b += 1; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b += 1; } X += 1; i --; } while (i > 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_uncopy_2.c000066400000000000000000000117131313527062700210040ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
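Overview of the routine below: CNAME packs 2-column panels of a column-major triangular matrix A (leading dimension lda) into the contiguous work buffer b for the TRMM kernels; judging by the file name, this is the upper-triangular, non-transposed variant. Blocks with X < posY lie strictly inside the stored triangle and are copied whole, blocks with X > posY only advance the pointers and b, and 2x2 diagonal blocks are emitted row-major with an explicit ZERO below the diagonal and, when UNIT is defined, ONE on the diagonal. For example, a diagonal block with entries a00, a01 on the first row and 0, a11 on the second is stored as b = { a00, a01, ZERO, a11 }, or { ONE, a01, ZERO, ONE } in the unit-diagonal case.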
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02, data03, data04; FLOAT *ao1, *ao2; js = (n >> 1); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; } else { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data03; b[ 2] = data02; b[ 3] = data04; ao1 += 2; ao2 += 2; b += 4; } else if (X > posY) { ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } else { #ifdef UNIT data03 = *(ao2 + 0); b[ 0] = ONE; b[ 1] = data03; b[ 2] = ZERO; b[ 3] = ONE; #else data01 = *(ao1 + 0); data03 = *(ao2 + 0); data04 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data03; b[ 2] = ZERO; b[ 3] = data04; #endif ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } X += 2; i --; } while (i > 0); } if (m & 1) { if (X < posY) { data01 = *(ao1 + 0); data03 = *(ao2 + 0); b[ 0] = data01; b[ 1] = data03; ao1 += 1; ao2 += 1; b += 2; } else if (X > posY) { ao1 += lda; b += 2; } else { #ifdef UNIT data03 = *(ao2 + 0); b[ 0] = ONE; b[ 1] = data03; #else data01 = *(ao1 + 0); data03 = *(ao2 + 0); b[ 0] = data01; b[ 1] = data03; #endif ao1 += lda; b += 2; } } posY += 2; js --; } while (js > 0); } /* End of main loop */ if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; } else { ao1 = a + posY + (posX + 0) * lda; } i = m; if (m > 0) { do { if (X < posY) { data01 = *(ao1 + 0); b[ 0] = data01; ao1 += 1; b += 1; } else if (X > posY) { ao1 += lda; b += 1; } else { #ifdef UNIT b[ 0] = ONE; #else data01 = *(ao1 + 0); b[ 0] = data01; #endif b += 1; ao1 += lda; } X += 1; i --; } while (i > 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_uncopy_4.c000066400000000000000000000240461313527062700210110ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT *ao1, *ao2, *ao3, *ao4; js = (n >> 2); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; ao3 = a + posX + (posY + 2) * lda; ao4 = a + posX + (posY + 3) * lda; } else { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; ao3 = a + posY + (posX + 2) * lda; ao4 = a + posY + (posX + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); data12 = *(ao3 + 3); data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); b[ 0] = data01; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b[ 4] = data02; b[ 5] = data06; b[ 6] = data10; b[ 7] = data14; b[ 8] = data03; b[ 9] = data07; b[10] = data11; b[11] = data15; b[12] = data04; b[13] = data08; b[14] = data12; b[15] = data16; ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } else if (X > posY) { ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 16; } else { #ifdef UNIT data05 = *(ao2 + 0); data09 = *(ao3 + 0); data10 = *(ao3 + 1); data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); b[ 0] = ONE; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b[ 4] = ZERO; b[ 5] = ONE; b[ 6] = data10; b[ 7] = data14; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ONE; b[11] = data15; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ONE; #else data01 = *(ao1 + 0); data05 = *(ao2 + 0); data06 = *(ao2 + 1); data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); b[ 0] = data01; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b[ 4] = ZERO; b[ 5] = data06; b[ 6] = data10; b[ 7] = data14; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = data11; b[11] = data15; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = data16; #endif ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i) { if (X < posY) { if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); data05 = *(ao3 + 0); data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); b[ 0] = data01; b[ 1] = 
data03; b[ 2] = data05; b[ 3] = data07; b[ 4] = data02; b[ 5] = data04; b[ 6] = data06; b[ 7] = data08; ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } if (m & 1) { data01 = *(ao1 + 0); data03 = *(ao2 + 0); data05 = *(ao3 + 0); data07 = *(ao4 + 0); b[ 0] = data01; b[ 1] = data03; b[ 2] = data05; b[ 3] = data07; ao1 += 1; ao2 += 1; ao3 += 1; ao4 += 1; b += 4; } } else if (X > posY) { if (m & 2) { ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } if (m & 1) { ao1 += lda; b += 4; } } else { #ifdef UNIT data05 = *(ao2 + 0); data09 = *(ao3 + 0); data13 = *(ao4 + 0); if (i >= 2) { data10 = *(ao3 + 1); data14 = *(ao4 + 1); } if (i >= 3) { data15 = *(ao4 + 2); } b[ 0] = ONE; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b += 4; if(i >= 2) { b[ 0] = ZERO; b[ 1] = ONE; b[ 2] = data10; b[ 3] = data14; b += 4; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ONE; b[ 3] = data15; b += 4; } #else data01 = *(ao1 + 0); data05 = *(ao2 + 0); data09 = *(ao3 + 0); data13 = *(ao4 + 0); if (i >= 2) { data06 = *(ao2 + 1); data10 = *(ao3 + 1); data14 = *(ao4 + 1); } if (i >= 3) { data11 = *(ao3 + 2); data15 = *(ao4 + 2); } b[ 0] = data01; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b += 4; if(i >= 2) { b[ 0] = ZERO; b[ 1] = data06; b[ 2] = data10; b[ 3] = data14; b += 4; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = data11; b[ 3] = data15; b += 4; } #endif } } posY += 4; js --; } while (js > 0); } /* End of main loop */ if (n & 2){ X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; } else { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data05 = *(ao2 + 0); data06 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data05; b[ 2] = data02; b[ 3] = data06; ao1 += 2; ao2 += 2; b += 4; } else if (X > posY) { ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } else { #ifdef UNIT data05 = *(ao2 + 0); b[ 0] = ONE; b[ 1] = data05; b[ 2] = ZERO; b[ 3] = ONE; #else data01 = *(ao1 + 0); data05 = *(ao2 + 0); data06 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data05; b[ 2] = ZERO; b[ 3] = data06; #endif ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } X += 2; i --; } while (i > 0); } i = (m & 1); if (i) { if (X < posY) { data01 = *(ao1 + 0); data05 = *(ao2 + 0); b[ 0] = data01; b[ 1] = data05; ao1 += 1; ao2 += 1; b += 2; } else if (X > posY) { ao1 += lda; ao2 += lda; b += 2; } else { #ifdef UNIT data05 = *(ao2 + 0); b[ 0] = ONE; b[ 1] = data05; #else data01 = *(ao1 + 0); data05 = *(ao2 + 0); b[ 0] = data01; b[ 1] = data05; #endif ao1 += lda; ao2 += lda; b += 2; } } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; } else { ao1 = a + posY + (posX + 0) * lda; } i = m; if (m > 0) { do { if (X < posY) { data01 = *(ao1 + 0); b[ 0] = data01; ao1 += 1; b += 1; } else if (X > posY) { ao1 += lda; b += 1; } else { #ifdef UNIT b[ 0] = ONE; #else data01 = *(ao1 + 0); b[ 0] = data01; #endif ao1 += lda; b += 1; } X += 1; i --; } while (i > 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_uncopy_6.c000066400000000000000000000377311313527062700210200ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X, mm; FLOAT data01, data02, data03, data04, data05, data06; FLOAT data07, data08, data09, data10, data11, data12; FLOAT data13, data14, data15, data16, data17, data18; FLOAT data19, data20, data21, data22, data23, data24; FLOAT data25, data26, data27, data28, data29, data30; FLOAT data31, data32, data33, data34, data35, data36; FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6; //js = (n >> 2); js = n/6; if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; ao3 = a + posX + (posY + 2) * lda; ao4 = a + posX + (posY + 3) * lda; ao5 = a + posX + (posY + 4) * lda; ao6 = a + posX + (posY + 5) * lda; } else { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; ao3 = a + posY + (posX + 2) * lda; ao4 = a + posY + (posX + 3) * lda; ao5 = a + posY + (posX + 4) * lda; ao6 = a + posY + (posX + 5) * lda; } i = m/6; if (i > 0) { do { if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao2 + 0); data08 = *(ao2 + 1); data09 = *(ao2 + 2); data10 = *(ao2 + 3); data11 = *(ao2 + 4); data12 = *(ao2 + 5); data13 = *(ao3 + 0); data14 = *(ao3 + 1); data15 = *(ao3 + 2); data16 = *(ao3 + 3); data17 = *(ao3 + 4); data18 = *(ao3 + 5); data19 = *(ao4 + 0); data20 = *(ao4 + 1); data21 = *(ao4 + 2); data22 = *(ao4 + 3); data23 = *(ao4 + 4); data24 = *(ao4 + 5); data25 = *(ao5 + 0); data26 = *(ao5 + 1); data27 = *(ao5 + 2); data28 = *(ao5 + 3); data29 = *(ao5 + 4); data30 = *(ao5 + 5); data31 = *(ao6 + 0); data32 = *(ao6 + 1); data33 = *(ao6 + 2); data34 = *(ao6 + 3); data35 = *(ao6 + 4); data36 = *(ao6 + 5); b[ 0] = data01; b[ 1] = data07; b[ 2] = data13; b[ 3] = data19; b[ 4] = data25; b[ 5] = data31; b[ 6] = data02; b[ 7] = data08; b[ 8] = data14; b[ 9] = data20; 
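/* Descriptive note: this X < posY case handles a dense 6x6 block strictly above the diagonal; the stores that follow finish copying it into b transposed, i.e. row-major within the block. */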
b[10] = data26; b[11] = data32; b[12] = data03; b[13] = data09; b[14] = data15; b[15] = data21; b[16] = data27; b[17] = data33; b[18] = data04; b[19] = data10; b[20] = data16; b[21] = data22; b[22] = data28; b[23] = data34; b[24] = data05; b[25] = data11; b[26] = data17; b[27] = data23; b[28] = data29; b[29] = data35; b[30] = data06; b[31] = data12; b[32] = data18; b[33] = data24; b[34] = data30; b[35] = data36; ao1 += 6; ao2 += 6; ao3 += 6; ao4 += 6; ao5 += 6; ao6 += 6; b += 36; } else if (X > posY) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b[16] = ZERO; b[17] = ZERO; b[18] = ZERO; b[19] = ZERO; b[20] = ZERO; b[21] = ZERO; b[22] = ZERO; b[23] = ZERO; b[24] = ZERO; b[25] = ZERO; b[26] = ZERO; b[27] = ZERO; b[28] = ZERO; b[29] = ZERO; b[30] = ZERO; b[31] = ZERO; b[32] = ZERO; b[33] = ZERO; b[34] = ZERO; b[35] = ZERO; ao1 += 6 * lda; ao2 += 6 * lda; ao3 += 6 * lda; ao4 += 6 * lda; ao5 += 6 * lda; ao6 += 6 * lda; b += 36; } else { data01 = *(ao1 + 0); data07 = *(ao2 + 0); data13 = *(ao3 + 0); data19 = *(ao4 + 0); data25 = *(ao5 + 0); data31 = *(ao6 + 0); data08 = *(ao2 + 1); data14 = *(ao3 + 1); data20 = *(ao4 + 1); data26 = *(ao5 + 1); data32 = *(ao6 + 1); data15 = *(ao3 + 2); data21 = *(ao4 + 2); data27 = *(ao5 + 2); data33 = *(ao6 + 2); data22 = *(ao4 + 3); data28 = *(ao5 + 3); data34 = *(ao6 + 3); data29 = *(ao5 + 4); data35 = *(ao6 + 4); data36 = *(ao6 + 5); #ifdef UNIT b[ 0] = ONE; b[ 1] = data07; b[ 2] = data13; b[ 3] = data19; b[ 4] = data25; b[ 5] = data31; b[ 6] = ZERO; b[ 7] = ONE; b[ 8] = data14; b[ 9] = data20; b[10] = data26; b[11] = data32; b[12] = ZERO; b[13] = ZERO; b[14] = ONE; b[15] = data21; b[16] = data27; b[17] = data33; b[18] = ZERO; b[19] = ZERO; b[20] = ZERO; b[21] = ONE; b[22] = data28; b[23] = data34; b[24] = ZERO; b[25] = ZERO; b[26] = ZERO; b[27] = ZERO; b[28] = ONE; b[29] = data35; b[30] = ZERO; b[31] = ZERO; b[32] = ZERO; b[33] = ZERO; b[34] = ZERO; b[35] = ONE; #else b[ 0] = data01; b[ 1] = data07; b[ 2] = data13; b[ 3] = data19; b[ 4] = data25; b[ 5] = data31; b[ 6] = ZERO; b[ 7] = data08; b[ 8] = data14; b[ 9] = data20; b[10] = data26; b[11] = data32; b[12] = ZERO; b[13] = ZERO; b[14] = data15; b[15] = data21; b[16] = data27; b[17] = data33; b[18] = ZERO; b[19] = ZERO; b[20] = ZERO; b[21] = data22; b[22] = data28; b[23] = data34; b[24] = ZERO; b[25] = ZERO; b[26] = ZERO; b[27] = ZERO; b[28] = data29; b[29] = data35; b[30] = ZERO; b[31] = ZERO; b[32] = ZERO; b[33] = ZERO; b[34] = ZERO; b[35] = data36; #endif ao1 += 6; ao2 += 6; ao3 += 6; ao4 += 6; ao5 += 6; ao6 += 7; b += 36; } X += 6; i --; } while (i > 0); } mm = m - m/6; if (mm & 4) { if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); data12 = *(ao3 + 3); data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); b[ 0] = data01; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b[ 4] = data02; b[ 5] = data06; b[ 6] = data10; b[ 7] = data14; b[ 8] = data03; b[ 9] = data07; b[10] = data11; b[11] = data15; b[12] = data04; b[13] = data08; b[14] = data12; b[15] = data16; ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } else if (X > posY) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] 
= ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b[16] = ZERO; b[17] = ZERO; b[18] = ZERO; b[19] = ZERO; b[20] = ZERO; b[21] = ZERO; b[22] = ZERO; b[23] = ZERO; ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 16; } else { #ifdef UNIT data05 = *(ao2 + 0); data09 = *(ao3 + 0); data10 = *(ao3 + 1); data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); b[ 0] = ONE; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b[ 4] = ZERO; b[ 5] = ONE; b[ 6] = data10; b[ 7] = data14; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ONE; b[11] = data15; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ONE; #else data01 = *(ao1 + 0); data05 = *(ao2 + 0); data06 = *(ao2 + 1); data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); b[ 0] = data01; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b[ 4] = ZERO; b[ 5] = data06; b[ 6] = data10; b[ 7] = data14; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = data11; b[11] = data15; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = data16; #endif ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } X += 4; } if (mm & 3) { if (X < posY) { if (mm & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); data05 = *(ao3 + 0); data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); b[ 0] = data01; b[ 1] = data03; b[ 2] = data05; b[ 3] = data07; b[ 4] = data02; b[ 5] = data04; b[ 6] = data06; b[ 7] = data08; ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } if (mm & 1) { data01 = *(ao1 + 0); data03 = *(ao2 + 0); data05 = *(ao3 + 0); data07 = *(ao4 + 0); b[ 0] = data01; b[ 1] = data03; b[ 2] = data05; b[ 3] = data07; ao1 += 1; ao2 += 1; ao3 += 1; ao4 += 1; b += 4; } } else if (X > posY) { if (m & 2) { ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } if (m & 1) { ao1 += lda; b += 4; } } else { #ifdef UNIT data05 = *(ao2 + 0); data09 = *(ao3 + 0); data13 = *(ao4 + 0); if (i >= 2) { data10 = *(ao3 + 1); data14 = *(ao4 + 1); } if (i >= 3) { data15 = *(ao4 + 2); } b[ 0] = ONE; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b += 4; if(i >= 2) { b[ 0] = ZERO; b[ 1] = ONE; b[ 2] = data10; b[ 3] = data14; b += 4; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ONE; b[ 3] = data15; b += 4; } #else data01 = *(ao1 + 0); data05 = *(ao2 + 0); data09 = *(ao3 + 0); data13 = *(ao4 + 0); if (i >= 2) { data06 = *(ao2 + 1); data10 = *(ao3 + 1); data14 = *(ao4 + 1); } if (i >= 3) { data11 = *(ao3 + 2); data15 = *(ao4 + 2); } b[ 0] = data01; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b += 4; if(i >= 2) { b[ 0] = ZERO; b[ 1] = data06; b[ 2] = data10; b[ 3] = data14; b += 4; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = data11; b[ 3] = data15; b += 4; } #endif } } posY += 4; js --; } while (js > 0); } /* End of main loop */ if (n & 2){ X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; } else { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data05 = *(ao2 + 0); data06 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data05; b[ 2] = data02; b[ 3] = data06; ao1 += 2; ao2 += 2; b += 4; } else if (X > posY) { ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } else { #ifdef UNIT data05 = *(ao2 + 0); b[ 0] = ONE; b[ 1] = data05; b[ 2] = ZERO; b[ 3] = ONE; #else data01 = *(ao1 + 0); data05 = *(ao2 + 0); data06 = *(ao2 + 1); 
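/* Non-unit 2x2 diagonal block: stored row-major as the diagonal element, the element to its right, an explicit ZERO below the diagonal, and the next diagonal element. */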
b[ 0] = data01; b[ 1] = data05; b[ 2] = ZERO; b[ 3] = data06; #endif ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } X += 2; i --; } while (i > 0); } i = (m & 1); if (i) { if (X < posY) { data01 = *(ao1 + 0); data05 = *(ao2 + 0); b[ 0] = data01; b[ 1] = data05; ao1 += 1; ao2 += 1; b += 2; } else if (X > posY) { ao1 += lda; ao2 += lda; b += 2; } else { #ifdef UNIT data05 = *(ao2 + 0); b[ 0] = ONE; b[ 1] = data05; #else data01 = *(ao1 + 0); data05 = *(ao2 + 0); b[ 0] = data01; b[ 1] = data05; #endif ao1 += lda; ao2 += lda; b += 2; } } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; } else { ao1 = a + posY + (posX + 0) * lda; } i = m; if (m > 0) { do { if (X < posY) { data01 = *(ao1 + 0); b[ 0] = data01; ao1 += 1; b += 1; } else if (X > posY) { ao1 += lda; b += 1; } else { #ifdef UNIT b[ 0] = ONE; #else data01 = *(ao1 + 0); b[ 0] = data01; #endif ao1 += lda; b += 1; } X += 1; i --; } while (i > 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_uncopy_8.c000066400000000000000000000566251313527062700210250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
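Overview of the routine below: CNAME packs 8-column panels of a column-major triangular matrix A into the buffer b for the TRMM kernels; by its file name this is the upper-triangular, non-transposed variant with 8-wide blocking. Dense blocks strictly inside the triangle (X < posY) are copied to b transposed, one 8-element row at a time; blocks strictly outside it (X > posY) only advance the pointers and b; 8x8 diagonal blocks are written with explicit ZERO fill below the diagonal and, when UNIT is defined, ONE on the diagonal. A hypothetical call from a packing driver would look like CNAME(m, n, a, lda, posX, posY, sb), where sb is the packed panel buffer and CNAME expands to the precision-specific symbol chosen by the build system.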
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT data17, data18, data19, data20, data21, data22, data23, data24; FLOAT data25, data26, data27, data28, data29, data30, data31, data32; FLOAT data33, data34, data35, data36, data37, data38, data39, data40; FLOAT data41, data42, data43, data44, data45, data46, data47, data48; FLOAT data49, data50, data51, data52, data53, data54, data55, data56; FLOAT data57, data58, data59, data60, data61, data62, data63, data64; FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; js = (n >> 3); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; ao3 = a + posX + (posY + 2) * lda; ao4 = a + posX + (posY + 3) * lda; ao5 = a + posX + (posY + 4) * lda; ao6 = a + posX + (posY + 5) * lda; ao7 = a + posX + (posY + 6) * lda; ao8 = a + posX + (posY + 7) * lda; } else { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; ao3 = a + posY + (posX + 2) * lda; ao4 = a + posY + (posX + 3) * lda; ao5 = a + posY + (posX + 4) * lda; ao6 = a + posY + (posX + 5) * lda; ao7 = a + posY + (posX + 6) * lda; ao8 = a + posY + (posX + 7) * lda; } i = (m >> 3); if (i > 0) { do { if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data21 = *(ao3 + 4); data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); data29 = *(ao4 + 4); data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); data33 = *(ao5 + 0); data34 = *(ao5 + 1); data35 = *(ao5 + 2); data36 = *(ao5 + 3); data37 = *(ao5 + 4); data38 = *(ao5 + 5); data39 = *(ao5 + 6); data40 = *(ao5 + 7); data41 = *(ao6 + 0); data42 = *(ao6 + 1); data43 = *(ao6 + 2); data44 = *(ao6 + 3); data45 = *(ao6 + 4); data46 = *(ao6 + 5); data47 = *(ao6 + 6); data48 = *(ao6 + 7); data49 = *(ao7 + 0); data50 = *(ao7 + 1); data51 = *(ao7 + 2); data52 = *(ao7 + 3); data53 = *(ao7 + 4); data54 = *(ao7 + 5); data55 = *(ao7 + 6); data56 = *(ao7 + 7); data57 = *(ao8 + 0); data58 = *(ao8 + 1); data59 = *(ao8 + 2); data60 = *(ao8 + 3); data61 = *(ao8 + 4); data62 = *(ao8 + 5); data63 = *(ao8 + 6); data64 = *(ao8 + 7); b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; b[ 4] = data33; b[ 5] = data41; b[ 6] = data49; b[ 7] = data57; b[ 8] = data02; b[ 9] = data10; b[10] = data18; b[11] = data26; b[12] = data34; b[13] = data42; b[14] = data50; b[15] = data58; b[16] = data03; b[17] = data11; b[18] = data19; b[19] = data27; b[20] = data35; b[21] = data43; b[22] = data51; b[23] = data59; b[24] = data04; b[25] = data12; b[26] = data20; b[27] = data28; b[28] = data36; b[29] = data44; b[30] = data52; b[31] = data60; b[32] = data05; b[33] = data13; b[34] = data21; b[35] = data29; b[36] = data37; b[37] = data45; b[38] = data53; b[39] = data61; b[40] = data06; b[41] = data14; b[42] = 
data22; b[43] = data30; b[44] = data38; b[45] = data46; b[46] = data54; b[47] = data62; b[48] = data07; b[49] = data15; b[50] = data23; b[51] = data31; b[52] = data39; b[53] = data47; b[54] = data55; b[55] = data63; b[56] = data08; b[57] = data16; b[58] = data24; b[59] = data32; b[60] = data40; b[61] = data48; b[62] = data56; b[63] = data64; ao1 += 8; ao2 += 8; ao3 += 8; ao4 += 8; ao5 += 8; ao6 += 8; ao7 += 8; ao8 += 8; b += 64; } else if (X > posY) { ao1 += 8 * lda; ao2 += 8 * lda; ao3 += 8 * lda; ao4 += 8 * lda; ao5 += 8 * lda; ao6 += 8 * lda; ao7 += 8 * lda; ao8 += 8 * lda; b += 64; } else { #ifndef UNIT data01 = *(ao1 + 0); #endif data09 = *(ao2 + 0); #ifndef UNIT data10 = *(ao2 + 1); #endif data17 = *(ao3 + 0); data18 = *(ao3 + 1); #ifndef UNIT data19 = *(ao3 + 2); #endif data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); #ifndef UNIT data28 = *(ao4 + 3); #endif data33 = *(ao5 + 0); data34 = *(ao5 + 1); data35 = *(ao5 + 2); data36 = *(ao5 + 3); #ifndef UNIT data37 = *(ao5 + 4); #endif data41 = *(ao6 + 0); data42 = *(ao6 + 1); data43 = *(ao6 + 2); data44 = *(ao6 + 3); data45 = *(ao6 + 4); #ifndef UNIT data46 = *(ao6 + 5); #endif data49 = *(ao7 + 0); data50 = *(ao7 + 1); data51 = *(ao7 + 2); data52 = *(ao7 + 3); data53 = *(ao7 + 4); data54 = *(ao7 + 5); #ifndef UNIT data55 = *(ao7 + 6); #endif data57 = *(ao8 + 0); data58 = *(ao8 + 1); data59 = *(ao8 + 2); data60 = *(ao8 + 3); data61 = *(ao8 + 4); data62 = *(ao8 + 5); data63 = *(ao8 + 6); #ifndef UNIT data64 = *(ao8 + 7); #endif #ifdef UNIT b[ 0] = ONE; #else b[ 0] = data01; #endif b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; b[ 4] = data33; b[ 5] = data41; b[ 6] = data49; b[ 7] = data57; b[ 8] = ZERO; #ifdef UNIT b[ 9] = ONE; #else b[ 9] = data10; #endif b[10] = data18; b[11] = data26; b[12] = data34; b[13] = data42; b[14] = data50; b[15] = data58; b[16] = ZERO; b[17] = ZERO; #ifdef UNIT b[18] = ONE; #else b[18] = data19; #endif b[19] = data27; b[20] = data35; b[21] = data43; b[22] = data51; b[23] = data59; b[24] = ZERO; b[25] = ZERO; b[26] = ZERO; #ifdef UNIT b[27] = ONE; #else b[27] = data28; #endif b[28] = data36; b[29] = data44; b[30] = data52; b[31] = data60; b[32] = ZERO; b[33] = ZERO; b[34] = ZERO; b[35] = ZERO; #ifdef UNIT b[36] = ONE; #else b[36] = data37; #endif b[37] = data45; b[38] = data53; b[39] = data61; b[40] = ZERO; b[41] = ZERO; b[42] = ZERO; b[43] = ZERO; b[44] = ZERO; #ifdef UNIT b[45] = ONE; #else b[45] = data46; #endif b[46] = data54; b[47] = data62; b[48] = ZERO; b[49] = ZERO; b[50] = ZERO; b[51] = ZERO; b[52] = ZERO; b[53] = ZERO; #ifdef UNIT b[54] = ONE; #else b[54] = data55; #endif b[55] = data63; b[56] = ZERO; b[57] = ZERO; b[58] = ZERO; b[59] = ZERO; b[60] = ZERO; b[61] = ZERO; b[62] = ZERO; #ifdef UNIT b[63] = ONE; #else b[63] = data64; #endif ao1 += 8 * lda; ao2 += 8 * lda; ao3 += 8 * lda; ao4 += 8 * lda; ao5 += 8 * lda; ao6 += 8 * lda; ao7 += 8 * lda; ao8 += 8 * lda; b += 64; } X += 8; i --; } while (i > 0); } i = (m & 7); if (i) { if (X < posY) { if (m & 4) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); data33 = *(ao5 + 0); data34 = *(ao5 + 1); data35 = *(ao5 + 2); data36 = *(ao5 + 3); data41 = *(ao6 + 0); data42 = *(ao6 + 1); data43 = *(ao6 + 2); data44 = *(ao6 + 3); data49 = *(ao7 + 0); data50 = *(ao7 + 
1); data51 = *(ao7 + 2); data52 = *(ao7 + 3); data57 = *(ao8 + 0); data58 = *(ao8 + 1); data59 = *(ao8 + 2); data60 = *(ao8 + 3); b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; b[ 4] = data33; b[ 5] = data41; b[ 6] = data49; b[ 7] = data57; b[ 8] = data02; b[ 9] = data10; b[10] = data18; b[11] = data26; b[12] = data34; b[13] = data42; b[14] = data50; b[15] = data58; b[16] = data03; b[17] = data11; b[18] = data19; b[19] = data27; b[20] = data35; b[21] = data43; b[22] = data51; b[23] = data59; b[24] = data04; b[25] = data12; b[26] = data20; b[27] = data28; b[28] = data36; b[29] = data44; b[30] = data52; b[31] = data60; ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; ao5 += 4; ao6 += 4; ao7 += 4; ao8 += 4; b += 32; } if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data33 = *(ao5 + 0); data34 = *(ao5 + 1); data41 = *(ao6 + 0); data42 = *(ao6 + 1); data49 = *(ao7 + 0); data50 = *(ao7 + 1); data57 = *(ao8 + 0); data58 = *(ao8 + 1); b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; b[ 4] = data33; b[ 5] = data41; b[ 6] = data49; b[ 7] = data57; b[ 8] = data02; b[ 9] = data10; b[10] = data18; b[11] = data26; b[12] = data34; b[13] = data42; b[14] = data50; b[15] = data58; ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; ao5 += 2; ao6 += 2; ao7 += 2; ao8 += 2; b += 16; } if (m & 1) { data01 = *(ao1 + 0); data09 = *(ao2 + 0); data17 = *(ao3 + 0); data25 = *(ao4 + 0); data33 = *(ao5 + 0); data41 = *(ao6 + 0); data49 = *(ao7 + 0); data57 = *(ao8 + 0); b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; b[ 4] = data33; b[ 5] = data41; b[ 6] = data49; b[ 7] = data57; b += 8; } } else if (X > posY) { if (m & 4) { ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 32; } if (m & 2) { ao1 += 2 * lda; b += 16; } if (m & 1) { b += 8; } } else { #ifndef UNIT data01 = *(ao1 + 0); #endif data09 = *(ao2 + 0); data17 = *(ao3 + 0); data25 = *(ao4 + 0); data33 = *(ao5 + 0); data41 = *(ao6 + 0); data49 = *(ao7 + 0); data57 = *(ao8 + 0); if (i >= 2) { #ifndef UNIT data10 = *(ao2 + 1); #endif data18 = *(ao3 + 1); data26 = *(ao4 + 1); data34 = *(ao5 + 1); data42 = *(ao6 + 1); data50 = *(ao7 + 1); data58 = *(ao8 + 1); } if (i >= 3) { #ifndef UNIT data19 = *(ao3 + 2); #endif data27 = *(ao4 + 2); data35 = *(ao5 + 2); data43 = *(ao6 + 2); data51 = *(ao7 + 2); data59 = *(ao8 + 2); } if (i >= 4) { #ifndef UNIT data28 = *(ao4 + 3); #endif data36 = *(ao5 + 3); data44 = *(ao6 + 3); data52 = *(ao7 + 3); data60 = *(ao8 + 3); } if (i >= 5) { #ifndef UNIT data37 = *(ao5 + 4); #endif data45 = *(ao6 + 4); data53 = *(ao7 + 4); data61 = *(ao8 + 4); } if (i >= 6) { #ifndef UNIT data46 = *(ao6 + 5); #endif data54 = *(ao7 + 5); data62 = *(ao8 + 5); } if (i >= 7) { #ifndef UNIT data55 = *(ao7 + 6); #endif data63 = *(ao8 + 6); } #ifdef UNIT b[ 0] = ONE; #else b[ 0] = data01; #endif b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; b[ 4] = data33; b[ 5] = data41; b[ 6] = data49; b[ 7] = data57; b += 8; if(i >= 2) { b[ 0] = ZERO; #ifdef UNIT b[ 1] = ONE; #else b[ 1] = data10; #endif b[ 2] = data18; b[ 3] = data26; b[ 4] = data34; b[ 5] = data42; b[ 6] = data50; b[ 7] = data58; b += 8; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; #ifdef UNIT b[ 2] = ONE; #else b[ 2] = data19; #endif b[ 3] = data27; b[ 4] = data35; b[ 5] = data43; b[ 6] = data51; b[ 7] = data59; b += 8; } if (i >= 4) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; #ifdef UNIT b[ 3] = ONE; #else b[ 3] = data28; 
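/* diagonal entry of this leftover row: the stored value (data28) in the default build, ONE when UNIT is defined (branch above) */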
#endif b[ 4] = data36; b[ 5] = data44; b[ 6] = data52; b[ 7] = data60; b += 8; } if (i >= 5) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; #ifdef UNIT b[ 4] = ONE; #else b[ 4] = data37; #endif b[ 5] = data45; b[ 6] = data53; b[ 7] = data61; b += 8; } if (i >= 6) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; #ifdef UNIT b[ 5] = ONE; #else b[ 5] = data46; #endif b[ 6] = data54; b[ 7] = data62; b += 8; } if (i >= 7) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; #ifdef UNIT b[ 6] = ONE; #else b[ 6] = data55; #endif b[ 7] = data63; b += 8; } } } posY += 8; js --; } while (js > 0); } /* End of main loop */ if (n & 4){ X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; ao3 = a + posX + (posY + 2) * lda; ao4 = a + posX + (posY + 3) * lda; } else { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; ao3 = a + posY + (posX + 2) * lda; ao4 = a + posY + (posX + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; b[ 4] = data02; b[ 5] = data10; b[ 6] = data18; b[ 7] = data26; b[ 8] = data03; b[ 9] = data11; b[10] = data19; b[11] = data27; b[12] = data04; b[13] = data12; b[14] = data20; b[15] = data28; ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } else if (X > posY) { ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 16; } else { #ifdef UNIT data09 = *(ao2 + 0); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); b[ 0] = ONE; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; b[ 4] = ZERO; b[ 5] = ONE; b[ 6] = data18; b[ 7] = data26; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ONE; b[11] = data27; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ONE; #else data01 = *(ao1 + 0); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; b[ 4] = ZERO; b[ 5] = data10; b[ 6] = data18; b[ 7] = data26; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = data19; b[11] = data27; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = data28; #endif ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 16; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i) { if (X < posY) { if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data25 = *(ao4 + 0); data26 = *(ao4 + 1); b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; b[ 4] = data02; b[ 5] = data10; b[ 6] = data18; b[ 7] = data26; ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } if (m & 1) { data01 = *(ao1 + 0); data09 = *(ao2 + 0); data17 = *(ao3 + 0); data25 = *(ao4 + 0); b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; b += 4; } } else if (X > posY) { if (m & 2) { ao1 += 2 * lda; b += 8; } if (m & 1) { b += 4; } } else { #ifndef UNIT data01 = *(ao1 + 0); #endif data09 = *(ao2 + 0); data17 = *(ao3 + 0); data25 = *(ao4 + 0); if (i >= 2) { #ifndef UNIT 
data10 = *(ao2 + 1); #endif data18 = *(ao3 + 1); data26 = *(ao4 + 1); } if (i >= 3) { #ifndef UNIT data19 = *(ao3 + 2); #endif data27 = *(ao4 + 2); } #ifdef UNIT b[ 0] = ONE; #else b[ 0] = data01; #endif b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; b += 4; if(i >= 2) { b[ 0] = ZERO; #ifdef UNIT b[ 1] = ONE; #else b[ 1] = data10; #endif b[ 2] = data18; b[ 3] = data26; b += 4; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; #ifdef UNIT b[ 2] = ONE; #else b[ 2] = data19; #endif b[ 3] = data27; b += 4; } } } posY += 4; } if (n & 2){ X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; } else { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data09; b[ 2] = data02; b[ 3] = data10; ao1 += 2; ao2 += 2; b += 4; } else if (X > posY) { ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } else { #ifdef UNIT data09 = *(ao2 + 0); b[ 0] = ONE; b[ 1] = data09; b[ 2] = ZERO; b[ 3] = ONE; #else data01 = *(ao1 + 0); data09 = *(ao2 + 0); data10 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data09; b[ 2] = ZERO; b[ 3] = data10; #endif ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } X += 2; i --; } while (i > 0); } if (m & 1) { if (X < posY) { data01 = *(ao1 + 0); data09 = *(ao2 + 0); b[ 0] = data01; b[ 1] = data09; b += 2; } else if (X > posY) { b += 2; } else { #ifdef UNIT data09 = *(ao2 + 0); b[ 0] = ONE; b[ 1] = data09; #else data01 = *(ao1 + 0); data09 = *(ao2 + 0); b[ 0] = data01; b[ 1] = data09; #endif b += 2; } } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; } else { ao1 = a + posY + (posX + 0) * lda; } i = m; if (m > 0) { do { if (X < posY) { data01 = *(ao1 + 0); b[ 0] = data01; ao1 += 1; b += 1; } else if (X > posY) { ao1 += lda; b += 1; } else { #ifdef UNIT b[ 0] = ONE; #else data01 = *(ao1 + 0); b[ 0] = data01; #endif ao1 += lda; b += 1; } X += 1; i --; } while (i > 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_utcopy_1.c000066400000000000000000000065401313527062700210130ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, X; FLOAT data01; FLOAT *ao1; while (n > 0) { X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; } else { ao1 = a + posY + (posX + 0) * lda; } i = m; if (m > 0) { do { if (X < posY) { b += 1; ao1 += 1; } else if (X > posY) { data01 = *(ao1 + 0); b[ 0] = data01; b += 1; ao1 += lda; } else { #ifdef UNIT b[ 0] = ONE; #else data01 = *(ao1 + 0); b[ 0] = data01; #endif b += 1; ao1 += lda; } X += 1; i --; } while (i > 0); } posY ++; n --; } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_utcopy_16.c000066400000000000000000000744621313527062700211110ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, ii; BLASLONG X; FLOAT *a01, *a02, *a03 ,*a04, *a05, *a06, *a07, *a08; FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; js = (n >> 4); if (js > 0){ do { X = posX; if (posX <= posY) { a01 = a + posX + (posY + 0) * lda; a02 = a + posX + (posY + 1) * lda; a03 = a + posX + (posY + 2) * lda; a04 = a + posX + (posY + 3) * lda; a05 = a + posX + (posY + 4) * lda; a06 = a + posX + (posY + 5) * lda; a07 = a + posX + (posY + 6) * lda; a08 = a + posX + (posY + 7) * lda; a09 = a + posX + (posY + 8) * lda; a10 = a + posX + (posY + 9) * lda; a11 = a + posX + (posY + 10) * lda; a12 = a + posX + (posY + 11) * lda; a13 = a + posX + (posY + 12) * lda; a14 = a + posX + (posY + 13) * lda; a15 = a + posX + (posY + 14) * lda; a16 = a + posX + (posY + 15) * lda; } else { a01 = a + posY + (posX + 0) * lda; a02 = a + posY + (posX + 1) * lda; a03 = a + posY + (posX + 2) * lda; a04 = a + posY + (posX + 3) * lda; a05 = a + posY + (posX + 4) * lda; a06 = a + posY + (posX + 5) * lda; a07 = a + posY + (posX + 6) * lda; a08 = a + posY + (posX + 7) * lda; a09 = a + posY + (posX + 8) * lda; a10 = a + posY + (posX + 9) * lda; a11 = a + posY + (posX + 10) * lda; a12 = a + posY + (posX + 11) * lda; a13 = a + posY + (posX + 12) * lda; a14 = a + posY + (posX + 13) * lda; a15 = a + posY + (posX + 14) * lda; a16 = a + posY + (posX + 15) * lda; } i = (m >> 4); if (i > 0) { do { if (X < posY) { a01 += 16; a02 += 16; a03 += 16; a04 += 16; a05 += 16; a06 += 16; a07 += 16; a08 += 16; a09 += 16; a10 += 16; a11 += 16; a12 += 16; a13 += 16; a14 += 16; a15 += 16; a16 += 16; b += 256; } else if (X > posY) { for (ii = 0; ii < 16; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); b[ 8] = *(a01 + 8); b[ 9] = *(a01 + 9); b[ 10] = *(a01 + 10); b[ 11] = *(a01 + 11); b[ 12] = *(a01 + 12); b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); a01 += lda; b += 16; } a02 += 16 * lda; a03 += 16 * lda; a04 += 16 * lda; a05 += 16 * lda; a06 += 16 * lda; a07 += 16 * lda; a08 += 16 * lda; a09 += 16 * lda; a10 += 16 * lda; a11 += 16 * lda; a12 += 16 * lda; a13 += 16 * lda; a14 += 16 * lda; a15 += 16 * lda; a16 += 16 * lda; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[ 10] = ZERO; b[ 11] = ZERO; b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; b[ 16] = *(a02 + 0); #ifdef UNIT b[ 17] = ONE; #else b[ 17] = *(a02 + 1); #endif b[ 18] = ZERO; b[ 19] = ZERO; b[ 20] = ZERO; b[ 21] = ZERO; b[ 22] = ZERO; b[ 23] = ZERO; b[ 24] = ZERO; b[ 25] = ZERO; b[ 26] = ZERO; b[ 27] = ZERO; b[ 28] = ZERO; b[ 29] = ZERO; b[ 30] = ZERO; b[ 31] = ZERO; b[ 32] = *(a03 + 0); b[ 33] = *(a03 + 1); #ifdef UNIT b[ 34] = ONE; #else b[ 34] = *(a03 + 2); #endif b[ 35] = ZERO; b[ 36] = ZERO; b[ 37] = ZERO; b[ 38] = ZERO; b[ 39] = ZERO; b[ 40] = ZERO; b[ 41] = ZERO; b[ 42] = ZERO; b[ 43] = ZERO; b[ 44] = ZERO; b[ 45] = ZERO; b[ 46] = ZERO; b[ 47] = ZERO; b[ 48] = *(a04 + 0); b[ 49] = *(a04 + 1); b[ 50] = *(a04 + 2); #ifdef UNIT b[ 51] = ONE; #else b[ 51] = *(a04 + 3); #endif b[ 52] = ZERO; b[ 53] = ZERO; b[ 54] = ZERO; b[ 55] = ZERO; b[ 56] = ZERO; b[ 57] = ZERO; b[ 58] = ZERO; b[ 59] = 
ZERO; b[ 60] = ZERO; b[ 61] = ZERO; b[ 62] = ZERO; b[ 63] = ZERO; b[ 64] = *(a05 + 0); b[ 65] = *(a05 + 1); b[ 66] = *(a05 + 2); b[ 67] = *(a05 + 3); #ifdef UNIT b[ 68] = ONE; #else b[ 68] = *(a05 + 4); #endif b[ 69] = ZERO; b[ 70] = ZERO; b[ 71] = ZERO; b[ 72] = ZERO; b[ 73] = ZERO; b[ 74] = ZERO; b[ 75] = ZERO; b[ 76] = ZERO; b[ 77] = ZERO; b[ 78] = ZERO; b[ 79] = ZERO; b[ 80] = *(a06 + 0); b[ 81] = *(a06 + 1); b[ 82] = *(a06 + 2); b[ 83] = *(a06 + 3); b[ 84] = *(a06 + 4); #ifdef UNIT b[ 85] = ONE; #else b[ 85] = *(a06 + 5); #endif b[ 86] = ZERO; b[ 87] = ZERO; b[ 88] = ZERO; b[ 89] = ZERO; b[ 90] = ZERO; b[ 91] = ZERO; b[ 92] = ZERO; b[ 93] = ZERO; b[ 94] = ZERO; b[ 95] = ZERO; b[ 96] = *(a07 + 0); b[ 97] = *(a07 + 1); b[ 98] = *(a07 + 2); b[ 99] = *(a07 + 3); b[100] = *(a07 + 4); b[101] = *(a07 + 5); #ifdef UNIT b[102] = ONE; #else b[102] = *(a07 + 6); #endif b[103] = ZERO; b[104] = ZERO; b[105] = ZERO; b[106] = ZERO; b[107] = ZERO; b[108] = ZERO; b[109] = ZERO; b[110] = ZERO; b[111] = ZERO; b[112] = *(a08 + 0); b[113] = *(a08 + 1); b[114] = *(a08 + 2); b[115] = *(a08 + 3); b[116] = *(a08 + 4); b[117] = *(a08 + 5); b[118] = *(a08 + 6); #ifdef UNIT b[119] = ONE; #else b[119] = *(a08 + 7); #endif b[120] = ZERO; b[121] = ZERO; b[122] = ZERO; b[123] = ZERO; b[124] = ZERO; b[125] = ZERO; b[126] = ZERO; b[127] = ZERO; b[128] = *(a09 + 0); b[129] = *(a09 + 1); b[130] = *(a09 + 2); b[131] = *(a09 + 3); b[132] = *(a09 + 4); b[133] = *(a09 + 5); b[134] = *(a09 + 6); b[135] = *(a09 + 7); #ifdef UNIT b[136] = ONE; #else b[136] = *(a09 + 8); #endif b[137] = ZERO; b[138] = ZERO; b[139] = ZERO; b[140] = ZERO; b[141] = ZERO; b[142] = ZERO; b[143] = ZERO; b[144] = *(a10 + 0); b[145] = *(a10 + 1); b[146] = *(a10 + 2); b[147] = *(a10 + 3); b[148] = *(a10 + 4); b[149] = *(a10 + 5); b[150] = *(a10 + 6); b[151] = *(a10 + 7); b[152] = *(a10 + 8); #ifdef UNIT b[153] = ONE; #else b[153] = *(a10 + 9); #endif b[154] = ZERO; b[155] = ZERO; b[156] = ZERO; b[157] = ZERO; b[158] = ZERO; b[159] = ZERO; b[160] = *(a11 + 0); b[161] = *(a11 + 1); b[162] = *(a11 + 2); b[163] = *(a11 + 3); b[164] = *(a11 + 4); b[165] = *(a11 + 5); b[166] = *(a11 + 6); b[167] = *(a11 + 7); b[168] = *(a11 + 8); b[169] = *(a11 + 9); #ifdef UNIT b[170] = ONE; #else b[170] = *(a11 + 10); #endif b[171] = ZERO; b[172] = ZERO; b[173] = ZERO; b[174] = ZERO; b[175] = ZERO; b[176] = *(a12 + 0); b[177] = *(a12 + 1); b[178] = *(a12 + 2); b[179] = *(a12 + 3); b[180] = *(a12 + 4); b[181] = *(a12 + 5); b[182] = *(a12 + 6); b[183] = *(a12 + 7); b[184] = *(a12 + 8); b[185] = *(a12 + 9); b[186] = *(a12 + 10); #ifdef UNIT b[187] = ONE; #else b[187] = *(a12 + 11); #endif b[188] = ZERO; b[189] = ZERO; b[190] = ZERO; b[191] = ZERO; b[192] = *(a13 + 0); b[193] = *(a13 + 1); b[194] = *(a13 + 2); b[195] = *(a13 + 3); b[196] = *(a13 + 4); b[197] = *(a13 + 5); b[198] = *(a13 + 6); b[199] = *(a13 + 7); b[200] = *(a13 + 8); b[201] = *(a13 + 9); b[202] = *(a13 + 10); b[203] = *(a13 + 11); #ifdef UNIT b[204] = ONE; #else b[204] = *(a13 + 12); #endif b[205] = ZERO; b[206] = ZERO; b[207] = ZERO; b[208] = *(a14 + 0); b[209] = *(a14 + 1); b[210] = *(a14 + 2); b[211] = *(a14 + 3); b[212] = *(a14 + 4); b[213] = *(a14 + 5); b[214] = *(a14 + 6); b[215] = *(a14 + 7); b[216] = *(a14 + 8); b[217] = *(a14 + 9); b[218] = *(a14 + 10); b[219] = *(a14 + 11); b[220] = *(a14 + 12); #ifdef UNIT b[221] = ONE; #else b[221] = *(a14 + 13); #endif b[222] = ZERO; b[223] = ZERO; b[224] = *(a15 + 0); b[225] = *(a15 + 1); b[226] = *(a15 + 2); b[227] = *(a15 + 3); b[228] = *(a15 + 4); b[229] = 
*(a15 + 5); b[230] = *(a15 + 6); b[231] = *(a15 + 7); b[232] = *(a15 + 8); b[233] = *(a15 + 9); b[234] = *(a15 + 10); b[235] = *(a15 + 11); b[236] = *(a15 + 12); b[237] = *(a15 + 13); #ifdef UNIT b[238] = ONE; #else b[238] = *(a15 + 14); #endif b[239] = ZERO; b[240] = *(a16 + 0); b[241] = *(a16 + 1); b[242] = *(a16 + 2); b[243] = *(a16 + 3); b[244] = *(a16 + 4); b[245] = *(a16 + 5); b[246] = *(a16 + 6); b[247] = *(a16 + 7); b[248] = *(a16 + 8); b[249] = *(a16 + 9); b[250] = *(a16 + 10); b[251] = *(a16 + 11); b[252] = *(a16 + 12); b[253] = *(a16 + 13); b[254] = *(a16 + 14); #ifdef UNIT b[255] = ONE; #else b[255] = *(a16 + 15); #endif a01 += 16 * lda; a02 += 16 * lda; a03 += 16 * lda; a04 += 16 * lda; a05 += 16 * lda; a06 += 16 * lda; a07 += 16 * lda; a08 += 16 * lda; a09 += 16 * lda; a10 += 16 * lda; a11 += 16 * lda; a12 += 16 * lda; a13 += 16 * lda; a14 += 16 * lda; a15 += 16 * lda; a16 += 16 * lda; b += 256; } X += 16; i --; } while (i > 0); } i = (m & 15); if (i > 0) { if (X < posY) { a01 += i; a02 += i; a03 += i; a04 += i; a05 += i; a06 += i; a07 += i; a08 += i; a09 += i; a10 += i; a11 += i; a12 += i; a13 += i; a14 += i; a15 += i; a16 += i; b += 16 * i; } else if (X > posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); b[ 8] = *(a01 + 8); b[ 9] = *(a01 + 9); b[ 10] = *(a01 + 10); b[ 11] = *(a01 + 11); b[ 12] = *(a01 + 12); b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); a01 += lda; a02 += lda; a03 += lda; a04 += lda; a05 += lda; a06 += lda; a07 += lda; a08 += lda; a09 += lda; a10 += lda; a11 += lda; a12 += lda; a13 += lda; a14 += lda; a15 += lda; a16 += lda; b += 16; } } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; if (i >= 2) { b[ 0] = *(a02 + 0); #ifdef UNIT b[ 1] = ONE; #else b[ 1] = *(a02 + 1); #endif b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b += 16; } if (i >= 3) { b[ 0] = *(a03 + 0); b[ 1] = *(a03 + 1); #ifdef UNIT b[ 2] = ONE; #else b[ 2] = *(a03 + 2); #endif b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b += 16; } if (i >= 4) { b[ 0] = *(a04 + 0); b[ 1] = *(a04 + 1); b[ 2] = *(a04 + 2); #ifdef UNIT b[ 3] = ONE; #else b[ 3] = *(a04 + 3); #endif b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b[16] = ZERO; b += 16; } if (i >= 5) { b[ 0] = *(a05 + 0); b[ 1] = *(a05 + 1); b[ 2] = *(a05 + 2); b[ 3] = *(a05 + 3); #ifdef UNIT b[ 4] = ONE; #else b[ 4] = *(a05 + 4); #endif b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b += 16; } if (i >= 6) { b[ 0] = *(a06 + 0); b[ 1] = *(a06 + 1); b[ 2] = *(a06 + 2); b[ 3] = *(a06 + 3); b[ 4] = *(a06 + 4); #ifdef UNIT b[ 5] = ONE; #else b[ 5] = *(a06 + 5); #endif b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; 
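/* zero padding out to the full 16-wide panel row; every leftover row of the diagonal block follows the same pattern: its stored entries, then the diagonal (ONE when UNIT is defined), then ZERO fill */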
b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b += 16; } if (i >= 7) { b[ 0] = *(a07 + 0); b[ 1] = *(a07 + 1); b[ 2] = *(a07 + 2); b[ 3] = *(a07 + 3); b[ 4] = *(a07 + 4); b[ 5] = *(a07 + 5); #ifdef UNIT b[ 6] = ONE; #else b[ 6] = *(a07 + 6); #endif b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b += 16; } if (i >= 8) { b[ 0] = *(a08 + 0); b[ 1] = *(a08 + 1); b[ 2] = *(a08 + 2); b[ 3] = *(a08 + 3); b[ 4] = *(a08 + 4); b[ 5] = *(a08 + 5); b[ 6] = *(a08 + 6); #ifdef UNIT b[ 7] = ONE; #else b[ 7] = *(a08 + 7); #endif b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b[16] = ZERO; b += 16; } if (i >= 9) { b[ 0] = *(a09 + 0); b[ 1] = *(a09 + 1); b[ 2] = *(a09 + 2); b[ 3] = *(a09 + 3); b[ 4] = *(a09 + 4); b[ 5] = *(a09 + 5); b[ 6] = *(a09 + 6); b[ 7] = *(a09 + 7); #ifdef UNIT b[ 8] = ONE; #else b[ 8] = *(a09 + 8); #endif b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b += 16; } if (i >= 10) { b[ 0] = *(a10 + 0); b[ 1] = *(a10 + 1); b[ 2] = *(a10 + 2); b[ 3] = *(a10 + 3); b[ 4] = *(a10 + 4); b[ 5] = *(a10 + 5); b[ 6] = *(a10 + 6); b[ 7] = *(a10 + 7); b[ 8] = *(a10 + 8); #ifdef UNIT b[ 9] = ONE; #else b[ 9] = *(a10 + 9); #endif b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b += 16; } if (i >= 11) { b[ 0] = *(a11 + 0); b[ 1] = *(a11 + 1); b[ 2] = *(a11 + 2); b[ 3] = *(a11 + 3); b[ 4] = *(a11 + 4); b[ 5] = *(a11 + 5); b[ 6] = *(a11 + 6); b[ 7] = *(a11 + 7); b[ 8] = *(a11 + 8); b[ 9] = *(a11 + 9); #ifdef UNIT b[10] = ONE; #else b[10] = *(a11 + 10); #endif b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b += 16; } if (i >= 12) { b[ 0] = *(a12 + 0); b[ 1] = *(a12 + 1); b[ 2] = *(a12 + 2); b[ 3] = *(a12 + 3); b[ 4] = *(a12 + 4); b[ 5] = *(a12 + 5); b[ 6] = *(a12 + 6); b[ 7] = *(a12 + 7); b[ 8] = *(a12 + 8); b[ 9] = *(a12 + 9); b[10] = *(a12 + 10); #ifdef UNIT b[11] = ONE; #else b[11] = *(a12 + 11); #endif b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b += 16; } if (i >= 13) { b[ 0] = *(a13 + 0); b[ 1] = *(a13 + 1); b[ 2] = *(a13 + 2); b[ 3] = *(a13 + 3); b[ 4] = *(a13 + 4); b[ 5] = *(a13 + 5); b[ 6] = *(a13 + 6); b[ 7] = *(a13 + 7); b[ 8] = *(a13 + 8); b[ 9] = *(a13 + 9); b[10] = *(a13 + 10); b[11] = *(a13 + 11); #ifdef UNIT b[12] = ONE; #else b[12] = *(a13 + 12); #endif b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b += 16; } if (i >= 14) { b[ 0] = *(a14 + 0); b[ 1] = *(a14 + 1); b[ 2] = *(a14 + 2); b[ 3] = *(a14 + 3); b[ 4] = *(a14 + 4); b[ 5] = *(a14 + 5); b[ 6] = *(a14 + 6); b[ 7] = *(a14 + 7); b[ 8] = *(a14 + 8); b[ 9] = *(a14 + 9); b[10] = *(a14 + 10); b[11] = *(a14 + 11); b[12] = *(a14 + 12); #ifdef UNIT b[13] = ONE; #else b[13] = *(a14 + 13); #endif b[14] = ZERO; b[15] = ZERO; b += 16; } if (i >= 15) { b[ 0] = *(a15 + 0); b[ 1] = *(a15 + 1); b[ 2] = *(a15 + 2); b[ 3] = *(a15 + 3); b[ 4] = *(a15 + 4); b[ 5] = *(a15 + 5); b[ 6] = *(a15 + 6); b[ 7] = *(a15 + 7); b[ 8] = *(a15 + 8); b[ 9] = *(a15 + 9); b[10] = *(a15 + 10); b[11] = *(a15 + 11); b[12] = *(a15 + 12); b[13] = *(a15 + 13); #ifdef UNIT b[14] = ONE; #else b[14] = *(a15 + 14); #endif b[15] = ZERO; } } } posY += 16; js --; } while (js > 0); } /* End of main loop */ if (n & 8){ X = posX; if (posX <= posY) { a01 = a + posX + (posY + 0) * lda; a02 = a + posX + (posY + 1) * lda; a03 = a + posX + (posY + 2) * lda; a04 = a + posX + (posY + 3) * lda; a05 = a + posX + (posY + 4) * lda; a06 = a + posX 
+ (posY + 5) * lda; a07 = a + posX + (posY + 6) * lda; a08 = a + posX + (posY + 7) * lda; } else { a01 = a + posY + (posX + 0) * lda; a02 = a + posY + (posX + 1) * lda; a03 = a + posY + (posX + 2) * lda; a04 = a + posY + (posX + 3) * lda; a05 = a + posY + (posX + 4) * lda; a06 = a + posY + (posX + 5) * lda; a07 = a + posY + (posX + 6) * lda; a08 = a + posY + (posX + 7) * lda; } i = (m >> 3); if (i > 0) { do { if (X < posY) { a01 += 8; a02 += 8; a03 += 8; a04 += 8; a05 += 8; a06 += 8; a07 += 8; a08 += 8; b += 64; } else if (X > posY) { for (ii = 0; ii < 8; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); a01 += lda; b += 8; } a02 += 8 * lda; a03 += 8 * lda; a04 += 8 * lda; a05 += 8 * lda; a06 += 8 * lda; a07 += 8 * lda; a08 += 8 * lda; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = *(a02 + 0); #ifdef UNIT b[ 9] = ONE; #else b[ 9] = *(a02 + 1); #endif b[ 10] = ZERO; b[ 11] = ZERO; b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; b[ 16] = *(a03 + 0); b[ 17] = *(a03 + 1); #ifdef UNIT b[ 18] = ONE; #else b[ 18] = *(a03 + 2); #endif b[ 19] = ZERO; b[ 20] = ZERO; b[ 21] = ZERO; b[ 22] = ZERO; b[ 23] = ZERO; b[ 24] = *(a04 + 0); b[ 25] = *(a04 + 1); b[ 26] = *(a04 + 2); #ifdef UNIT b[ 27] = ONE; #else b[ 27] = *(a04 + 3); #endif b[ 28] = ZERO; b[ 29] = ZERO; b[ 30] = ZERO; b[ 31] = ZERO; b[ 32] = *(a05 + 0); b[ 33] = *(a05 + 1); b[ 34] = *(a05 + 2); b[ 35] = *(a05 + 3); #ifdef UNIT b[ 36] = ONE; #else b[ 36] = *(a05 + 4); #endif b[ 37] = ZERO; b[ 38] = ZERO; b[ 39] = ZERO; b[ 40] = *(a06 + 0); b[ 41] = *(a06 + 1); b[ 42] = *(a06 + 2); b[ 43] = *(a06 + 3); b[ 44] = *(a06 + 4); #ifdef UNIT b[ 45] = ONE; #else b[ 45] = *(a06 + 5); #endif b[ 46] = ZERO; b[ 47] = ZERO; b[ 48] = *(a07 + 0); b[ 49] = *(a07 + 1); b[ 50] = *(a07 + 2); b[ 51] = *(a07 + 3); b[ 52] = *(a07 + 4); b[ 53] = *(a07 + 5); #ifdef UNIT b[ 54] = ONE; #else b[ 54] = *(a07 + 6); #endif b[ 55] = ZERO; b[ 56] = *(a08 + 0); b[ 57] = *(a08 + 1); b[ 58] = *(a08 + 2); b[ 59] = *(a08 + 3); b[ 60] = *(a08 + 4); b[ 61] = *(a08 + 5); b[ 62] = *(a08 + 6); #ifdef UNIT b[ 63] = ONE; #else b[ 63] = *(a08 + 7); #endif a01 += 8 * lda; a02 += 8 * lda; a03 += 8 * lda; a04 += 8 * lda; a05 += 8 * lda; a06 += 8 * lda; a07 += 8 * lda; a08 += 8 * lda; b += 64; } X += 8; i --; } while (i > 0); } i = (m & 7); if (i > 0) { if (X < posY) { a01 += i; a02 += i; a03 += i; a04 += i; a05 += i; a06 += i; a07 += i; a08 += i; b += 8 * i; } else if (X > posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); a01 += lda; b += 8; } a02 += i * lda; a03 += i * lda; a04 += i * lda; a05 += i * lda; a06 += i * lda; a07 += i * lda; a08 += i * lda; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; if (i >= 2) { b[ 0] = *(a02 + 0); #ifdef UNIT b[ 1] = ONE; #else b[ 1] = *(a02 + 1); #endif b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 3) { b[ 0] = *(a03 + 0); b[ 1] = *(a03 + 1); #ifdef UNIT b[ 2] = ONE; #else b[ 2] = *(a03 + 2); #endif b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if 
(i >= 4) { b[ 0] = *(a04 + 0); b[ 1] = *(a04 + 1); b[ 2] = *(a04 + 2); #ifdef UNIT b[ 3] = ONE; #else b[ 3] = *(a04 + 3); #endif b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 5) { b[ 0] = *(a05 + 0); b[ 1] = *(a05 + 1); b[ 2] = *(a05 + 2); b[ 3] = *(a05 + 3); #ifdef UNIT b[ 4] = ONE; #else b[ 4] = *(a05 + 4); #endif b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 6) { b[ 0] = *(a06 + 0); b[ 1] = *(a06 + 1); b[ 2] = *(a06 + 2); b[ 3] = *(a06 + 3); b[ 4] = *(a06 + 4); #ifdef UNIT b[ 5] = ONE; #else b[ 5] = *(a06 + 5); #endif b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 7) { b[ 0] = *(a07 + 0); b[ 1] = *(a07 + 1); b[ 2] = *(a07 + 2); b[ 3] = *(a07 + 3); b[ 4] = *(a07 + 4); b[ 5] = *(a07 + 5); #ifdef UNIT b[ 6] = ONE; #else b[ 6] = *(a07 + 6); #endif b[ 7] = ZERO; b += 8; } } } posY += 8; } if (n & 4){ X = posX; if (posX <= posY) { a01 = a + posX + (posY + 0) * lda; a02 = a + posX + (posY + 1) * lda; a03 = a + posX + (posY + 2) * lda; a04 = a + posX + (posY + 3) * lda; } else { a01 = a + posY + (posX + 0) * lda; a02 = a + posY + (posX + 1) * lda; a03 = a + posY + (posX + 2) * lda; a04 = a + posY + (posX + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X < posY) { a01 += 4; a02 += 4; a03 += 4; a04 += 4; b += 16; } else if (X > posY) { for (ii = 0; ii < 4; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); a01 += lda; b += 4; } a02 += 4 * lda; a03 += 4 * lda; a04 += 4 * lda; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = *(a02 + 0); #ifdef UNIT b[ 5] = ONE; #else b[ 5] = *(a02 + 1); #endif b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = *(a03 + 0); b[ 9] = *(a03 + 1); #ifdef UNIT b[ 10] = ONE; #else b[ 10] = *(a03 + 2); #endif b[ 11] = ZERO; b[ 12] = *(a04 + 0); b[ 13] = *(a04 + 1); b[ 14] = *(a04 + 2); #ifdef UNIT b[ 15] = ONE; #else b[ 15] = *(a04 + 3); #endif a01 += 4 * lda; a02 += 4 * lda; a03 += 4 * lda; a04 += 4 * lda; b += 16; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i > 0) { if (X < posY) { a01 += i; a02 += i; a03 += i; a04 += i; b += 4 * i; } else if (X > posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); a01 += lda; b += 4; } a02 += lda; a03 += lda; a04 += lda; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b += 4; if (i >= 2) { b[ 0] = *(a02 + 0); #ifdef UNIT b[ 1] = ONE; #else b[ 1] = *(a02 + 1); #endif b[ 2] = ZERO; b[ 3] = ZERO; b += 4; } if (i >= 3) { b[ 0] = *(a03 + 0); b[ 1] = *(a03 + 1); #ifdef UNIT b[ 2] = ONE; #else b[ 2] = *(a03 + 2); #endif b[ 3] = ZERO; b += 4; } } } posY += 4; } if (n & 2){ X = posX; if (posX <= posY) { a01 = a + posX + (posY + 0) * lda; a02 = a + posX + (posY + 1) * lda; } else { a01 = a + posY + (posX + 0) * lda; a02 = a + posY + (posX + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X < posY) { a01 += 2; a02 += 2; b += 4; } else if (X > posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a02 + 0); b[ 3] = *(a02 + 1); a01 += 2 * lda; a02 += 2 * lda; b += 4; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif b[ 1] = ZERO; b[ 2] = *(a02 + 0); #ifdef UNIT b[ 3] = ONE; #else b[ 3] = *(a02 + 1); #endif a01 += 2 * lda; a02 += 2 * lda; b += 4; } X += 2; i --; } while (i > 0); } if (m & 1) { if (X < posY) { a01 ++; a02 ++; b += 2; } else if (X > posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); a01 += lda; b += 2; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = *(a01 + 1); 
#else b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); #endif b += 2; } } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { a01 = a + posX + (posY + 0) * lda; } else { a01 = a + posY + (posX + 0) * lda; } i = m; if (i > 0) { do { if (X < posY) { a01 += 1; b ++; } else if (X > posY) { b[ 0] = *(a01 + 0); a01 += lda; b ++; } else { #ifdef UNIT b[ 0] = ONE; #else b[ 0] = *(a01 + 0); #endif a01 += lda; b ++; } X += 1; i --; } while (i > 0); } posY += 1; } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_utcopy_2.c000066400000000000000000000116571313527062700210210ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02, data03, data04; FLOAT *ao1, *ao2; js = (n >> 1); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; } else { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X < posY) { ao1 += 2; ao2 += 2; b += 4; } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } else { #ifdef UNIT data03 = *(ao2 + 0); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data03; b[ 3] = ONE; #else data01 = *(ao1 + 0); data03 = *(ao2 + 0); data04 = *(ao2 + 1); b[ 0] = data01; b[ 1] = ZERO; b[ 2] = data03; b[ 3] = data04; #endif ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } X += 2; i --; } while (i > 0); } if (m & 1) { if (X < posY) { ao1 += 1; ao2 += 1; b += 2; } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; ao1 += lda; b += 2; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else data01 = *(ao1 + 0); b[ 0] = data01; b[ 1] = ZERO; #endif ao1 += lda; b += 2; } } posY += 2; js --; } while (js > 0); } /* End of main loop */ if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; } else { ao1 = a + posY + (posX + 0) * lda; } i = m; if (m > 0) { do { if (X < posY) { b += 1; ao1 += 1; } else if (X > posY) { data01 = *(ao1 + 0); b[ 0] = data01; b += 1; ao1 += lda; } else { #ifdef UNIT b[ 0] = ONE; #else data01 = *(ao1 + 0); b[ 0] = data01; #endif b += 1; ao1 += lda; } X += 1; i --; } while (i > 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_utcopy_4.c000066400000000000000000000236661313527062700210260ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT *ao1, *ao2, *ao3, *ao4; js = (n >> 2); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; ao3 = a + posX + (posY + 2) * lda; ao4 = a + posX + (posY + 3) * lda; } else { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; ao3 = a + posY + (posX + 2) * lda; ao4 = a + posY + (posX + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X < posY) { ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); data12 = *(ao3 + 3); data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 16; } else { #ifdef UNIT data05 = *(ao2 + 0); data09 = *(ao3 + 0); data10 = *(ao3 + 1); data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data05; b[ 5] = ONE; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = data09; b[ 9] = data10; b[10] = ONE; b[11] = ZERO; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = ONE; #else data01 = *(ao1 + 0); data05 = *(ao2 + 0); data06 = *(ao2 + 1); data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); b[ 0] = data01; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data05; b[ 5] = data06; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = ZERO; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; #endif ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 16; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i) { if (X < posY) { if (m & 2) { ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } if (m & 1) { ao1 += 1; ao2 += 1; ao3 += 1; ao4 += 1; b += 4; } } else if (X > posY) { if (m & 2) { data01 = *(ao1 + 0); 
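/* X > posY: these leftover rows lie entirely inside the stored triangle, so all four columns are copied through unchanged (a two-row block here, a single row below) */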
data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; ao1 += lda; b += 4; } } else { #ifdef UNIT if (i >= 2) { data05 = *(ao2 + 0); } if (i >= 3) { data09 = *(ao3 + 0); data10 = *(ao3 + 1); } b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b += 4; if(i >= 2) { b[ 0] = data05; b[ 1] = ONE; b[ 2] = ZERO; b[ 3] = ZERO; b += 4; } if (i >= 3) { b[ 0] = data09; b[ 1] = data10; b[ 2] = ONE; b[ 3] = ZERO; b += 4; } #else data01 = *(ao1 + 0); if (i >= 2) { data05 = *(ao2 + 0); data06 = *(ao2 + 1); } if (i >= 3) { data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); } b[ 0] = data01; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b += 4; if(i >= 2) { b[ 0] = data05; b[ 1] = data06; b[ 2] = ZERO; b[ 3] = ZERO; b += 4; } if (i >= 3) { b[ 0] = data09; b[ 1] = data10; b[ 2] = data11; b[ 3] = ZERO; b += 4; } #endif } } posY += 4; js --; } while (js > 0); } /* End of main loop */ if (n & 2){ X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; } else { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X < posY) { ao1 += 2; ao2 += 2; b += 4; } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data05 = *(ao2 + 0); data06 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data02; b[ 2] = data05; b[ 3] = data06; ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } else { #ifdef UNIT data05 = *(ao2 + 0); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data05; b[ 3] = ONE; #else data01 = *(ao1 + 0); data05 = *(ao2 + 0); data06 = *(ao2 + 1); b[ 0] = data01; b[ 1] = ZERO; b[ 2] = data05; b[ 3] = data06; #endif ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } X += 2; i --; } while (i > 0); } i = (m & 1); if (i) { if (X < posY) { ao1 += 2; b += 2; } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; ao1 += lda; b += 2; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else data01 = *(ao1 + 0); b[ 0] = data01; b[ 1] = ZERO; #endif b += 2; } } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; } else { ao1 = a + posY + (posX + 0) * lda; } i = m; if (m > 0) { do { if (X < posY) { b += 1; ao1 += 1; } else if (X > posY) { data01 = *(ao1 + 0); b[ 0] = data01; ao1 += lda; b += 1; } else { #ifdef UNIT b[ 0] = ONE; #else data01 = *(ao1 + 0); b[ 0] = data01; #endif ao1 += lda; b += 1; } X += 1; i --; } while (i > 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_utcopy_6.c000066400000000000000000000236661313527062700210300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT *ao1, *ao2, *ao3, *ao4; js = (n >> 2); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; ao3 = a + posX + (posY + 2) * lda; ao4 = a + posX + (posY + 3) * lda; } else { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; ao3 = a + posY + (posX + 2) * lda; ao4 = a + posY + (posX + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X < posY) { ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); data12 = *(ao3 + 3); data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 16; } else { #ifdef UNIT data05 = *(ao2 + 0); data09 = *(ao3 + 0); data10 = *(ao3 + 1); data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data05; b[ 5] = ONE; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = data09; b[ 9] = data10; b[10] = ONE; b[11] = ZERO; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = ONE; #else data01 = *(ao1 + 0); data05 = *(ao2 + 0); data06 = *(ao2 + 1); data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); b[ 0] = data01; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data05; b[ 5] = 
data06; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = ZERO; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; #endif ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 16; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i) { if (X < posY) { if (m & 2) { ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } if (m & 1) { ao1 += 1; ao2 += 1; ao3 += 1; ao4 += 1; b += 4; } } else if (X > posY) { if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; ao1 += lda; b += 4; } } else { #ifdef UNIT if (i >= 2) { data05 = *(ao2 + 0); } if (i >= 3) { data09 = *(ao3 + 0); data10 = *(ao3 + 1); } b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b += 4; if(i >= 2) { b[ 0] = data05; b[ 1] = ONE; b[ 2] = ZERO; b[ 3] = ZERO; b += 4; } if (i >= 3) { b[ 0] = data09; b[ 1] = data10; b[ 2] = ONE; b[ 3] = ZERO; b += 4; } #else data01 = *(ao1 + 0); if (i >= 2) { data05 = *(ao2 + 0); data06 = *(ao2 + 1); } if (i >= 3) { data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); } b[ 0] = data01; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b += 4; if(i >= 2) { b[ 0] = data05; b[ 1] = data06; b[ 2] = ZERO; b[ 3] = ZERO; b += 4; } if (i >= 3) { b[ 0] = data09; b[ 1] = data10; b[ 2] = data11; b[ 3] = ZERO; b += 4; } #endif } } posY += 4; js --; } while (js > 0); } /* End of main loop */ if (n & 2){ X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; } else { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X < posY) { ao1 += 2; ao2 += 2; b += 4; } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data05 = *(ao2 + 0); data06 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data02; b[ 2] = data05; b[ 3] = data06; ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } else { #ifdef UNIT data05 = *(ao2 + 0); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data05; b[ 3] = ONE; #else data01 = *(ao1 + 0); data05 = *(ao2 + 0); data06 = *(ao2 + 1); b[ 0] = data01; b[ 1] = ZERO; b[ 2] = data05; b[ 3] = data06; #endif ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } X += 2; i --; } while (i > 0); } i = (m & 1); if (i) { if (X < posY) { ao1 += 2; b += 2; } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; ao1 += lda; b += 2; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else data01 = *(ao1 + 0); b[ 0] = data01; b[ 1] = ZERO; #endif b += 2; } } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; } else { ao1 = a + posY + (posX + 0) * lda; } i = m; if (m > 0) { do { if (X < posY) { b += 1; ao1 += 1; } else if (X > posY) { data01 = *(ao1 + 0); b[ 0] = data01; ao1 += lda; b += 1; } else { #ifdef UNIT b[ 0] = ONE; #else data01 = *(ao1 + 0); b[ 0] = data01; #endif ao1 += lda; b += 1; } X += 1; i --; } while (i > 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/trmm_utcopy_8.c000066400000000000000000000632031313527062700210210ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The 
University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT data17, data18, data19, data20, data21, data22, data23, data24; FLOAT data25, data26, data27, data28, data29, data30, data31, data32; FLOAT data33, data34, data35, data36, data37, data38, data39, data40; FLOAT data41, data42, data43, data44, data45, data46, data47, data48; FLOAT data49, data50, data51, data52, data53, data54, data55, data56; FLOAT data57, data58, data59, data60, data61, data62, data63, data64; FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; js = (n >> 3); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; ao3 = a + posX + (posY + 2) * lda; ao4 = a + posX + (posY + 3) * lda; ao5 = a + posX + (posY + 4) * lda; ao6 = a + posX + (posY + 5) * lda; ao7 = a + posX + (posY + 6) * lda; ao8 = a + posX + (posY + 7) * lda; } else { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; ao3 = a + posY + (posX + 2) * lda; ao4 = a + posY + (posX + 3) * lda; ao5 = a + posY + (posX + 4) * lda; ao6 = a + posY + (posX + 5) * lda; ao7 = a + posY + (posX + 6) * lda; ao8 = a + posY + (posX + 7) * lda; } i = (m >> 3); if (i > 0) { do { if (X < posY) { ao1 += 8; ao2 += 8; ao3 += 8; ao4 += 8; ao5 += 8; ao6 += 8; ao7 += 8; ao8 += 8; b += 64; } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); data09 = *(ao2 + 0); 
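/* Packed-panel copy for TRMM with 8-column blocking: inside this 8x8 inner loop the running
   row index X is compared with posY.  Blocks with X < posY are skipped (only the source
   pointers and b advance, b by 64 elements, presumably because the TRMM kernel never reads
   that region of the packed buffer); blocks with X > posY are copied in full, each of the
   eight source columns ao1..ao8 landing in a consecutive run of eight elements of b; and the
   X == posY diagonal block is packed as a triangle, one side zero-filled and the diagonal
   replaced by ONE when UNIT is defined (otherwise the stored diagonal element is kept). */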
data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data21 = *(ao3 + 4); data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); data29 = *(ao4 + 4); data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); data33 = *(ao5 + 0); data34 = *(ao5 + 1); data35 = *(ao5 + 2); data36 = *(ao5 + 3); data37 = *(ao5 + 4); data38 = *(ao5 + 5); data39 = *(ao5 + 6); data40 = *(ao5 + 7); data41 = *(ao6 + 0); data42 = *(ao6 + 1); data43 = *(ao6 + 2); data44 = *(ao6 + 3); data45 = *(ao6 + 4); data46 = *(ao6 + 5); data47 = *(ao6 + 6); data48 = *(ao6 + 7); data49 = *(ao7 + 0); data50 = *(ao7 + 1); data51 = *(ao7 + 2); data52 = *(ao7 + 3); data53 = *(ao7 + 4); data54 = *(ao7 + 5); data55 = *(ao7 + 6); data56 = *(ao7 + 7); data57 = *(ao8 + 0); data58 = *(ao8 + 1); data59 = *(ao8 + 2); data60 = *(ao8 + 3); data61 = *(ao8 + 4); data62 = *(ao8 + 5); data63 = *(ao8 + 6); data64 = *(ao8 + 7); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; b[16] = data17; b[17] = data18; b[18] = data19; b[19] = data20; b[20] = data21; b[21] = data22; b[22] = data23; b[23] = data24; b[24] = data25; b[25] = data26; b[26] = data27; b[27] = data28; b[28] = data29; b[29] = data30; b[30] = data31; b[31] = data32; b[32] = data33; b[33] = data34; b[34] = data35; b[35] = data36; b[36] = data37; b[37] = data38; b[38] = data39; b[39] = data40; b[40] = data41; b[41] = data42; b[42] = data43; b[43] = data44; b[44] = data45; b[45] = data46; b[46] = data47; b[47] = data48; b[48] = data49; b[49] = data50; b[50] = data51; b[51] = data52; b[52] = data53; b[53] = data54; b[54] = data55; b[55] = data56; b[56] = data57; b[57] = data58; b[58] = data59; b[59] = data60; b[60] = data61; b[61] = data62; b[62] = data63; b[63] = data64; ao1 += 8 * lda; ao2 += 8 * lda; ao3 += 8 * lda; ao4 += 8 * lda; ao5 += 8 * lda; ao6 += 8 * lda; ao7 += 8 * lda; ao8 += 8 * lda; b += 64; } else { #ifdef UNIT data09 = *(ao2 + 0); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data33 = *(ao5 + 0); data34 = *(ao5 + 1); data35 = *(ao5 + 2); data36 = *(ao5 + 3); data41 = *(ao6 + 0); data42 = *(ao6 + 1); data43 = *(ao6 + 2); data44 = *(ao6 + 3); data45 = *(ao6 + 4); data49 = *(ao7 + 0); data50 = *(ao7 + 1); data51 = *(ao7 + 2); data52 = *(ao7 + 3); data53 = *(ao7 + 4); data54 = *(ao7 + 5); data57 = *(ao8 + 0); data58 = *(ao8 + 1); data59 = *(ao8 + 2); data60 = *(ao8 + 3); data61 = *(ao8 + 4); data62 = *(ao8 + 5); data63 = *(ao8 + 6); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = data09; b[ 9] = ONE; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b[16] = data17; b[17] = data18; b[18] = ONE; b[19] = ZERO; b[20] = ZERO; b[21] = ZERO; b[22] = ZERO; b[23] = ZERO; b[24] = data25; b[25] = data26; b[26] = data27; b[27] = ONE; b[28] = ZERO; b[29] = ZERO; b[30] = ZERO; b[31] = ZERO; b[32] = data33; b[33] = data34; b[34] = data35; b[35] = data36; b[36] = ONE; b[37] = ZERO; b[38] = ZERO; b[39] = ZERO; b[40] = data41; b[41] = data42; b[42] = data43; b[43] 
= data44; b[44] = data45; b[45] = ONE; b[46] = ZERO; b[47] = ZERO; b[48] = data49; b[49] = data50; b[50] = data51; b[51] = data52; b[52] = data53; b[53] = data54; b[54] = ONE; b[55] = ZERO; b[56] = data57; b[57] = data58; b[58] = data59; b[59] = data60; b[60] = data61; b[61] = data62; b[62] = data63; b[63] = ONE; #else data01 = *(ao1 + 0); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); data33 = *(ao5 + 0); data34 = *(ao5 + 1); data35 = *(ao5 + 2); data36 = *(ao5 + 3); data37 = *(ao5 + 4); data41 = *(ao6 + 0); data42 = *(ao6 + 1); data43 = *(ao6 + 2); data44 = *(ao6 + 3); data45 = *(ao6 + 4); data46 = *(ao6 + 5); data49 = *(ao7 + 0); data50 = *(ao7 + 1); data51 = *(ao7 + 2); data52 = *(ao7 + 3); data53 = *(ao7 + 4); data54 = *(ao7 + 5); data55 = *(ao7 + 6); data57 = *(ao8 + 0); data58 = *(ao8 + 1); data59 = *(ao8 + 2); data60 = *(ao8 + 3); data61 = *(ao8 + 4); data62 = *(ao8 + 5); data63 = *(ao8 + 6); data64 = *(ao8 + 7); b[ 0] = data01; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = data09; b[ 9] = data10; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b[16] = data17; b[17] = data18; b[18] = data19; b[19] = ZERO; b[20] = ZERO; b[21] = ZERO; b[22] = ZERO; b[23] = ZERO; b[24] = data25; b[25] = data26; b[26] = data27; b[27] = data28; b[28] = ZERO; b[29] = ZERO; b[30] = ZERO; b[31] = ZERO; b[32] = data33; b[33] = data34; b[34] = data35; b[35] = data36; b[36] = data37; b[37] = ZERO; b[38] = ZERO; b[39] = ZERO; b[40] = data41; b[41] = data42; b[42] = data43; b[43] = data44; b[44] = data45; b[45] = data46; b[46] = ZERO; b[47] = ZERO; b[48] = data49; b[49] = data50; b[50] = data51; b[51] = data52; b[52] = data53; b[53] = data54; b[54] = data55; b[55] = ZERO; b[56] = data57; b[57] = data58; b[58] = data59; b[59] = data60; b[60] = data61; b[61] = data62; b[62] = data63; b[63] = data64; #endif ao1 += 8 * lda; ao2 += 8 * lda; ao3 += 8 * lda; ao4 += 8 * lda; ao5 += 8 * lda; ao6 += 8 * lda; ao7 += 8 * lda; ao8 += 8 * lda; b += 64; } X += 8; i --; } while (i > 0); } i = (m & 7); if (i) { if (X < posY) { if (m & 4) { ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; ao5 += 4; ao6 += 4; ao7 += 4; ao8 += 4; b += 32; } if (m & 2) { ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; ao5 += 2; ao6 += 2; ao7 += 2; ao8 += 2; b += 16; } if (m & 1) { b += 8; } } else if (X > posY) { if (m & 4) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data21 = *(ao3 + 4); data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); data29 = *(ao4 + 4); data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; b[16] = data17; b[17] = data18; b[18] = data19; b[19] = data20; b[20] = data21; b[21] = data22; 
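/* Tail handling for m not a multiple of 8: the remaining m & 7 rows are processed in
   sub-steps of 4, 2 and 1, with the same three-way split on X versus posY as the unrolled
   loop above; in the diagonal case the i >= 2 ... i >= 7 guards pack only the rows that
   actually remain. */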
b[22] = data23; b[23] = data24; b[24] = data25; b[25] = data26; b[26] = data27; b[27] = data28; b[28] = data29; b[29] = data30; b[30] = data31; b[31] = data32; ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 32; } if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; ao1 += 2 * lda; b += 16; } if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b += 8; } } else { #ifndef UNIT data01 = *(ao1 + 0); #endif data09 = *(ao2 + 0); data17 = *(ao3 + 0); data25 = *(ao4 + 0); data33 = *(ao5 + 0); data41 = *(ao6 + 0); data49 = *(ao7 + 0); data57 = *(ao8 + 0); if (i >= 2) { #ifndef UNIT data10 = *(ao2 + 1); #endif data18 = *(ao3 + 1); data26 = *(ao4 + 1); data34 = *(ao5 + 1); data42 = *(ao6 + 1); data50 = *(ao7 + 1); data58 = *(ao8 + 1); } if (i >= 3) { #ifndef UNIT data19 = *(ao3 + 2); #endif data27 = *(ao4 + 2); data35 = *(ao5 + 2); data43 = *(ao6 + 2); data51 = *(ao7 + 2); data59 = *(ao8 + 2); } if (i >= 4) { #ifndef UNIT data28 = *(ao4 + 3); #endif data36 = *(ao5 + 3); data44 = *(ao6 + 3); data52 = *(ao7 + 3); data60 = *(ao8 + 3); } if (i >= 5) { #ifndef UNIT data37 = *(ao5 + 4); #endif data45 = *(ao6 + 4); data53 = *(ao7 + 4); data61 = *(ao8 + 4); } if (i >= 6) { #ifndef UNIT data46 = *(ao6 + 5); #endif data54 = *(ao7 + 5); data62 = *(ao8 + 5); } if (i >= 7) { #ifndef UNIT data55 = *(ao7 + 6); #endif data63 = *(ao8 + 6); } #ifdef UNIT b[ 0] = ONE; #else b[ 0] = data01; #endif b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; if(i >= 2) { b[ 0] = data09; #ifdef UNIT b[ 1] = ONE; #else b[ 1] = data10; #endif b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 3) { b[ 0] = data17; b[ 1] = data18; #ifdef UNIT b[ 2] = ONE; #else b[ 2] = data19; #endif b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 4) { b[ 0] = data25; b[ 1] = data26; b[ 2] = data27; #ifdef UNIT b[ 3] = ONE; #else b[ 3] = data28; #endif b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 5) { b[ 0] = data33; b[ 1] = data34; b[ 2] = data35; b[ 3] = data36; #ifdef UNIT b[ 4] = ONE; #else b[ 4] = data37; #endif b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 6) { b[ 0] = data41; b[ 1] = data42; b[ 2] = data43; b[ 3] = data44; b[ 4] = data45; #ifdef UNIT b[ 5] = ONE; #else b[ 5] = data46; #endif b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 7) { b[ 0] = data49; b[ 1] = data50; b[ 2] = data51; b[ 3] = data52; b[ 4] = data53; b[ 5] = data54; #ifdef UNIT b[ 6] = ONE; #else b[ 6] = data55; #endif b[ 7] = ZERO; b += 8; } } } posY += 8; js --; } while (js > 0); } /* End of main loop */ if (n & 4){ X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 
0) * lda; ao2 = a + posX + (posY + 1) * lda; ao3 = a + posX + (posY + 2) * lda; ao4 = a + posX + (posY + 3) * lda; } else { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; ao3 = a + posY + (posX + 2) * lda; ao4 = a + posY + (posX + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X < posY) { ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data09; b[ 5] = data10; b[ 6] = data11; b[ 7] = data12; b[ 8] = data17; b[ 9] = data18; b[10] = data19; b[11] = data20; b[12] = data25; b[13] = data26; b[14] = data27; b[15] = data28; b += 16; } else { #ifdef UNIT data09 = *(ao2 + 0); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data09; b[ 5] = ONE; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = data17; b[ 9] = data18; b[10] = ONE; b[11] = ZERO; b[12] = data25; b[13] = data26; b[14] = data27; b[15] = ONE; #else data01 = *(ao1 + 0); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); b[ 0] = data01; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data09; b[ 5] = data10; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = data17; b[ 9] = data18; b[10] = data19; b[11] = ZERO; b[12] = data25; b[13] = data26; b[14] = data27; b[15] = data28; #endif ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 16; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i) { if (X < posY) { if (m & 2) { ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } if (m & 1) { b += 4; } } else if (X > posY) { if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data09; b[ 5] = data10; b[ 6] = data11; b[ 7] = data12; ao1 += 2 * lda; b += 8; } if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; } } else { #ifndef UNIT data01 = *(ao1 + 0); #endif data09 = *(ao2 + 0); data17 = *(ao3 + 0); data25 = *(ao4 + 0); if (i >= 2) { #ifndef UNIT data10 = *(ao2 + 1); #endif data18 = *(ao3 + 1); data26 = *(ao4 + 1); } if (i >= 3) { #ifndef UNIT data19 = *(ao3 + 2); #endif data27 = *(ao4 + 2); } #ifndef UNIT b[ 0] = ONE; #else b[ 0] = data01; #endif b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b += 4; if(i >= 2) { b[ 0] = data09; #ifndef UNIT b[ 1] = ONE; #else b[ 1] = data10; #endif b[ 2] = ZERO; b[ 3] = ZERO; b += 4; } if (i >= 3) { b[ 0] = data17; b[ 1] = data18; #ifndef UNIT b[ 2] = ONE; #else b[ 2] = data19; #endif b[ 3] = ZERO; b += 4; } } } posY += 4; } if (n & 2){ X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; ao2 = a + posX + (posY + 1) * lda; } else { ao1 = a + posY + (posX + 0) * lda; ao2 = a + posY + (posX + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X < 
posY) { ao1 += 2; ao2 += 2; b += 4; } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); ao1 += 2 * lda; ao2 += 2 * lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; b[ 3] = data10; b += 4; } else { #ifdef UNIT data09 = *(ao2 + 0); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data09; b[ 3] = ONE; #else data01 = *(ao1 + 0); data09 = *(ao2 + 0); data10 = *(ao2 + 1); b[ 0] = data01; b[ 1] = ZERO; b[ 2] = data09; b[ 3] = data10; #endif ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } X += 2; i --; } while (i > 0); } if (m & 1) { if (X < posY) { b += 2; } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; b += 2; } else { #ifdef UNIT data09 = *(ao2 + 0); b[ 0] = ONE; b[ 1] = data09; #else data01 = *(ao1 + 0); data09 = *(ao2 + 0); b[ 0] = data01; b[ 1] = data09; #endif b += 2; } } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; } else { ao1 = a + posY + (posX + 0) * lda; } i = m; if (m > 0) { do { if (X < posY) { ao1 += 1; b += 1; } else if (X > posY) { data01 = *(ao1 + 0); ao1 += lda; b[ 0] = data01; b += 1; } else { #ifdef UNIT b[ 0] = ONE; #else data01 = *(ao1 + 0); b[ 0] = data01; #endif ao1 += lda; b += 1; } X += 1; i --; } while (i > 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/trmmkernel_16x2.c000066400000000000000000000427721313527062700211600ustar00rootroot00000000000000#include "common.h" int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) { BLASLONG i,j,k; FLOAT *C0,*C1,*ptrba,*ptrbb; FLOAT res0_0; FLOAT res0_1; FLOAT res0_2; FLOAT res0_3; FLOAT res0_4; FLOAT res0_5; FLOAT res0_6; FLOAT res0_7; FLOAT res0_8; FLOAT res0_9; FLOAT res0_10; FLOAT res0_11; FLOAT res0_12; FLOAT res0_13; FLOAT res0_14; FLOAT res0_15; FLOAT res1_0; FLOAT res1_1; FLOAT res1_2; FLOAT res1_3; FLOAT res1_4; FLOAT res1_5; FLOAT res1_6; FLOAT res1_7; FLOAT res1_8; FLOAT res1_9; FLOAT res1_10; FLOAT res1_11; FLOAT res1_12; FLOAT res1_13; FLOAT res1_14; FLOAT res1_15; FLOAT a0; FLOAT a1; FLOAT b0; FLOAT b1; BLASLONG off, temp; #if !defined(LEFT) off = -offset; #endif for (j=0; j int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) { BLASLONG i,j,k; FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; FLOAT res0_0; FLOAT res0_1; FLOAT res0_2; FLOAT res0_3; FLOAT res1_0; FLOAT res1_1; FLOAT res1_2; FLOAT res1_3; FLOAT res2_0; FLOAT res2_1; FLOAT res2_2; FLOAT res2_3; FLOAT res3_0; FLOAT res3_1; FLOAT res3_2; FLOAT res3_3; FLOAT a0; FLOAT a1; FLOAT b0; FLOAT b1; FLOAT b2; FLOAT b3; BLASLONG off, temp; bool left; bool transposed; bool backwards; #ifdef LEFT left = true; #else left = false; #endif #ifdef TRANSA transposed = true; #else transposed = false; #endif backwards = left != transposed; if (!left) { off = -offset; } for (j=0; j int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) { BLASLONG i,j,k; FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb; FLOAT res0_0; FLOAT res0_1; FLOAT res0_2; FLOAT res0_3; FLOAT res1_0; FLOAT res1_1; FLOAT res1_2; FLOAT res1_3; FLOAT res2_0; FLOAT res2_1; FLOAT res2_2; FLOAT res2_3; FLOAT res3_0; FLOAT res3_1; FLOAT res3_2; FLOAT res3_3; FLOAT res4_0; FLOAT res4_1; FLOAT res4_2; FLOAT res4_3; FLOAT res5_0; FLOAT res5_1; FLOAT res5_2; FLOAT res5_3; FLOAT res6_0; FLOAT res6_1; FLOAT res6_2; FLOAT res6_3; FLOAT res7_0; FLOAT res7_1; FLOAT res7_2; FLOAT res7_3; FLOAT a0; FLOAT a1; 
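/* Scalar register blocking for this TRMM kernel: the res*_* variables accumulate what
   appears to be a 4-by-8 tile of C (four values for each of the eight columns addressed
   through C0..C7) entirely in local variables, a0/a1 and b0..b7 hold the elements currently
   streamed from the packed A and B panels, and off/temp together with the LEFT/TRANSA-derived
   flags appear to control how many k iterations each tile performs, so that only the
   triangular part of the packed operand is consumed. */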
FLOAT b0; FLOAT b1; FLOAT b2; FLOAT b3; FLOAT b4; FLOAT b5; FLOAT b6; FLOAT b7; BLASLONG off, temp; bool left; bool transposed; bool backwards; #ifdef LEFT left = true; #else left = false; #endif #ifdef TRANSA transposed = true; #else transposed = false; #endif backwards = left != transposed; if (!left) { off = -offset; } for (j=0; j= 0; i--) { aa = *(a + i); for (j = 0; j < n; j ++) { bb = *(c + i + j * ldc); bb *= aa; *b = bb; *(c + i + j * ldc) = bb; b ++; for (k = 0; k < i; k ++){ *(c + k + j * ldc) -= bb * *(a + k); } } a -= m; b -= 2 * n; } } #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa1, aa2; FLOAT bb1, bb2; FLOAT cc1, cc2; int i, j, k; ldc *= 2; a += (m - 1) * m * 2; b += (m - 1) * n * 2; for (i = m - 1; i >= 0; i--) { aa1 = *(a + i * 2 + 0); aa2 = *(a + i * 2 + 1); for (j = 0; j < n; j ++) { bb1 = *(c + i * 2 + 0 + j * ldc); bb2 = *(c + i * 2 + 1 + j * ldc); #ifndef CONJ cc1 = aa1 * bb1 - aa2 * bb2; cc2 = aa1 * bb2 + aa2 * bb1; #else cc1 = aa1 * bb1 + aa2 * bb2; cc2 = aa1 * bb2 - aa2 * bb1; #endif *(b + 0) = cc1; *(b + 1) = cc2; *(c + i * 2 + 0 + j * ldc) = cc1; *(c + i * 2 + 1 + j * ldc) = cc2; b += 2; for (k = 0; k < i; k ++){ #ifndef CONJ *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); #else *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); #endif } } a -= m * 2; b -= 4 * n; } } #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #ifdef COMPLEX FLOAT dummy2, #endif FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ BLASLONG i, j; FLOAT *aa, *cc; BLASLONG kk; #if 0 fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", m, n, k, offset); #endif j = (n >> GEMM_UNROLL_N_SHIFT); while (j > 0) { kk = m + offset; if (m & (GEMM_UNROLL_M - 1)) { for (i = 1; i < GEMM_UNROLL_M; i *= 2){ if (m & i) { aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; if (k - kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } solve(i, GEMM_UNROLL_N, aa + (kk - i) * i * COMPSIZE, b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); kk -= i; } } } i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; do { if (k - kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); aa -= GEMM_UNROLL_M * k * COMPSIZE; cc -= GEMM_UNROLL_M * COMPSIZE; kk -= GEMM_UNROLL_M; i --; } while (i > 0); } b += GEMM_UNROLL_N * k * COMPSIZE; c += GEMM_UNROLL_N * ldc * COMPSIZE; j --; } if (n & (GEMM_UNROLL_N - 1)) { j = (GEMM_UNROLL_N >> 1); while (j > 0) { if (n & j) { kk = m + offset; if (m & (GEMM_UNROLL_M - 1)) { for (i = 1; i < GEMM_UNROLL_M; i *= 2){ if (m & i) { aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; if (k - kk > 0) { GEMM_KERNEL(i, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, b + j * kk * COMPSIZE, cc, 
ldc); } solve(i, j, aa + (kk - i) * i * COMPSIZE, b + (kk - i) * j * COMPSIZE, cc, ldc); kk -= i; } } } i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; do { if (k - kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + GEMM_UNROLL_M * kk * COMPSIZE, b + j * kk * COMPSIZE, cc, ldc); } solve(GEMM_UNROLL_M, j, aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_M) * j * COMPSIZE, cc, ldc); aa -= GEMM_UNROLL_M * k * COMPSIZE; cc -= GEMM_UNROLL_M * COMPSIZE; kk -= GEMM_UNROLL_M; i --; } while (i > 0); } b += j * k * COMPSIZE; c += j * ldc * COMPSIZE; } j >>= 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_kernel_LT.c000066400000000000000000000170221313527062700211320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include "common.h" static FLOAT dm1 = -1.; #ifdef CONJ #define GEMM_KERNEL GEMM_KERNEL_L #else #define GEMM_KERNEL GEMM_KERNEL_N #endif #if GEMM_DEFAULT_UNROLL_M == 1 #define GEMM_UNROLL_M_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_M == 2 #define GEMM_UNROLL_M_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_M == 4 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 6 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_M == 16 #define GEMM_UNROLL_M_SHIFT 4 #endif #if GEMM_DEFAULT_UNROLL_N == 1 #define GEMM_UNROLL_N_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_N == 2 #define GEMM_UNROLL_N_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_N == 4 #define GEMM_UNROLL_N_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_N == 8 #define GEMM_UNROLL_N_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_N == 16 #define GEMM_UNROLL_N_SHIFT 4 #endif #ifndef COMPLEX static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa, bb; int i, j, k; for (i = 0; i < m; i++) { aa = *(a + i); for (j = 0; j < n; j ++) { bb = *(c + i + j * ldc); bb *= aa; *b = bb; *(c + i + j * ldc) = bb; b ++; for (k = i + 1; k < m; k ++){ *(c + k + j * ldc) -= bb * *(a + k); } } a += m; } } #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa1, aa2; FLOAT bb1, bb2; FLOAT cc1, cc2; int i, j, k; ldc *= 2; for (i = 0; i < m; i++) { aa1 = *(a + i * 2 + 0); aa2 = *(a + i * 2 + 1); for (j = 0; j < n; j ++) { bb1 = *(c + i * 2 + 0 + j * ldc); bb2 = *(c + i * 2 + 1 + j * ldc); #ifndef CONJ cc1 = aa1 * bb1 - aa2 * bb2; cc2 = aa1 * bb2 + aa2 * bb1; #else cc1 = aa1 * bb1 + aa2 * bb2; cc2 = aa1 * bb2 - aa2 * bb1; #endif *(b + 0) = cc1; *(b + 1) = cc2; *(c + i * 2 + 0 + j * ldc) = cc1; *(c + i * 2 + 1 + j * ldc) = cc2; b += 2; for (k = i + 1; k < m; k ++){ #ifndef CONJ *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); #else *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); #endif } } a += m * 2; } } #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #ifdef COMPLEX FLOAT dummy2, #endif FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ FLOAT *aa, *cc; BLASLONG kk; BLASLONG i, j, jj; #if 0 fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", m, n, k, offset); #endif jj = 0; j = (n >> GEMM_UNROLL_N_SHIFT); while (j > 0) { kk = offset; aa = a; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); while (i > 0) { if (kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; kk += GEMM_UNROLL_M; i --; } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(i, GEMM_UNROLL_N, aa + kk * i * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; kk += i; } i >>= 1; } } b += GEMM_UNROLL_N * k * COMPSIZE; c += GEMM_UNROLL_N * ldc * COMPSIZE; j --; jj += GEMM_UNROLL_M; } if (n & 
(GEMM_UNROLL_N - 1)) { j = (GEMM_UNROLL_N >> 1); while (j > 0) { if (n & j) { kk = offset; aa = a; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); while (i > 0) { if (kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(GEMM_UNROLL_M, j, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; kk += GEMM_UNROLL_M; i --; } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { GEMM_KERNEL(i, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(i, j, aa + kk * i * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; kk += i; } i >>= 1; } } b += j * k * COMPSIZE; c += j * ldc * COMPSIZE; } j >>= 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_kernel_RN.c000066400000000000000000000167361313527062700211450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include "common.h" static FLOAT dm1 = -1.; #ifdef CONJ #define GEMM_KERNEL GEMM_KERNEL_R #else #define GEMM_KERNEL GEMM_KERNEL_N #endif #if GEMM_DEFAULT_UNROLL_M == 1 #define GEMM_UNROLL_M_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_M == 2 #define GEMM_UNROLL_M_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_M == 4 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 6 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_M == 16 #define GEMM_UNROLL_M_SHIFT 4 #endif #if GEMM_DEFAULT_UNROLL_N == 1 #define GEMM_UNROLL_N_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_N == 2 #define GEMM_UNROLL_N_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_N == 4 #define GEMM_UNROLL_N_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_N == 8 #define GEMM_UNROLL_N_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_N == 16 #define GEMM_UNROLL_N_SHIFT 4 #endif #ifndef COMPLEX static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa, bb; int i, j, k; for (i = 0; i < n; i++) { bb = *(b + i); for (j = 0; j < m; j ++) { aa = *(c + j + i * ldc); aa *= bb; *a = aa; *(c + j + i * ldc) = aa; a ++; for (k = i + 1; k < n; k ++){ *(c + j + k * ldc) -= aa * *(b + k); } } b += n; } } #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa1, aa2; FLOAT bb1, bb2; FLOAT cc1, cc2; int i, j, k; ldc *= 2; for (i = 0; i < n; i++) { bb1 = *(b + i * 2 + 0); bb2 = *(b + i * 2 + 1); for (j = 0; j < m; j ++) { aa1 = *(c + j * 2 + 0 + i * ldc); aa2 = *(c + j * 2 + 1 + i * ldc); #ifndef CONJ cc1 = aa1 * bb1 - aa2 * bb2; cc2 = aa1 * bb2 + aa2 * bb1; #else cc1 = aa1 * bb1 + aa2 * bb2; cc2 = -aa1 * bb2 + aa2 * bb1; #endif *(a + 0) = cc1; *(a + 1) = cc2; *(c + j * 2 + 0 + i * ldc) = cc1; *(c + j * 2 + 1 + i * ldc) = cc2; a += 2; for (k = i + 1; k < n; k ++){ #ifndef CONJ *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); #else *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); #endif } } b += n * 2; } } #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #ifdef COMPLEX FLOAT dummy2, #endif FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ FLOAT *aa, *cc; BLASLONG kk; BLASLONG i, j, jj; #if 0 fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", m, n, k, offset); #endif jj = 0; j = (n >> GEMM_UNROLL_N_SHIFT); kk = -offset; while (j > 0) { aa = a; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { do { if (kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } while (i > 0); } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(i, GEMM_UNROLL_N, aa + kk * i * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; } i >>= 1; } } kk += GEMM_UNROLL_N; b += GEMM_UNROLL_N * k * COMPSIZE; c += GEMM_UNROLL_N * ldc * COMPSIZE; j --; jj += GEMM_UNROLL_M; } if 
(n & (GEMM_UNROLL_N - 1)) { j = (GEMM_UNROLL_N >> 1); while (j > 0) { if (n & j) { aa = a; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); while (i > 0) { if (kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(GEMM_UNROLL_M, j, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { GEMM_KERNEL(i, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(i, j, aa + kk * i * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; } i >>= 1; } } b += j * k * COMPSIZE; c += j * ldc * COMPSIZE; kk += j; } j >>= 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_kernel_RT.c000066400000000000000000000201251313527062700211360ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include "common.h" static FLOAT dm1 = -1.; #ifdef CONJ #define GEMM_KERNEL GEMM_KERNEL_R #else #define GEMM_KERNEL GEMM_KERNEL_N #endif #if GEMM_DEFAULT_UNROLL_M == 1 #define GEMM_UNROLL_M_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_M == 2 #define GEMM_UNROLL_M_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_M == 4 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 6 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_M == 16 #define GEMM_UNROLL_M_SHIFT 4 #endif #if GEMM_DEFAULT_UNROLL_N == 1 #define GEMM_UNROLL_N_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_N == 2 #define GEMM_UNROLL_N_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_N == 4 #define GEMM_UNROLL_N_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_N == 8 #define GEMM_UNROLL_N_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_N == 16 #define GEMM_UNROLL_N_SHIFT 4 #endif #ifndef COMPLEX static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa, bb; int i, j, k; a += (n - 1) * m; b += (n - 1) * n; for (i = n - 1; i >= 0; i--) { bb = *(b + i); for (j = 0; j < m; j ++) { aa = *(c + j + i * ldc); aa *= bb; *a = aa; *(c + j + i * ldc) = aa; a ++; for (k = 0; k < i; k ++){ *(c + j + k * ldc) -= aa * *(b + k); } } b -= n; a -= 2 * m; } } #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa1, aa2; FLOAT bb1, bb2; FLOAT cc1, cc2; int i, j, k; ldc *= 2; a += (n - 1) * m * 2; b += (n - 1) * n * 2; for (i = n - 1; i >= 0; i--) { bb1 = *(b + i * 2 + 0); bb2 = *(b + i * 2 + 1); for (j = 0; j < m; j ++) { aa1 = *(c + j * 2 + 0 + i * ldc); aa2 = *(c + j * 2 + 1 + i * ldc); #ifndef CONJ cc1 = aa1 * bb1 - aa2 * bb2; cc2 = aa1 * bb2 + aa2 * bb1; #else cc1 = aa1 * bb1 + aa2 * bb2; cc2 = - aa1 * bb2 + aa2 * bb1; #endif *(a + 0) = cc1; *(a + 1) = cc2; *(c + j * 2 + 0 + i * ldc) = cc1; *(c + j * 2 + 1 + i * ldc) = cc2; a += 2; for (k = 0; k < i; k ++){ #ifndef CONJ *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); #else *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); #endif } } b -= n * 2; a -= 4 * m; } } #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #ifdef COMPLEX FLOAT dummy2, #endif FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ BLASLONG i, j; FLOAT *aa, *cc; BLASLONG kk; #if 0 fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", m, n, k, offset); #endif kk = n - offset; c += n * ldc * COMPSIZE; b += n * k * COMPSIZE; if (n & (GEMM_UNROLL_N - 1)) { j = 1; while (j < GEMM_UNROLL_N) { if (n & j) { aa = a; b -= j * k * COMPSIZE; c -= j * ldc* COMPSIZE; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { do { if (k - kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + GEMM_UNROLL_M * kk * COMPSIZE, b + j * kk * COMPSIZE, cc, ldc); } solve(GEMM_UNROLL_M, j, aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, b + (kk - j) * j * COMPSIZE, cc, ldc); aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } while (i > 0); } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); do { if (m & i) { if (k - kk > 0) { GEMM_KERNEL(i, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, b + j * kk * COMPSIZE, cc, 
ldc); } solve(i, j, aa + (kk - j) * i * COMPSIZE, b + (kk - j) * j * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; } i >>= 1; } while (i > 0); } kk -= j; } j <<= 1; } } j = (n >> GEMM_UNROLL_N_SHIFT); if (j > 0) { do { aa = a; b -= GEMM_UNROLL_N * k * COMPSIZE; c -= GEMM_UNROLL_N * ldc * COMPSIZE; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { do { if (k - kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } while (i > 0); } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); do { if (m & i) { if (k - kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } solve(i, GEMM_UNROLL_N, aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; } i >>= 1; } while (i > 0); } kk -= GEMM_UNROLL_N; j --; } while (j > 0); } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_lncopy_1.c000066400000000000000000000063541313527062700210050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; #ifndef UNIT FLOAT data01; #endif FLOAT *a1; jj = offset; j = n; while (j > 0){ a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) *(b + 0) = *(a1 + 0); a1 ++; b ++; i --; ii ++; } a += lda; jj ++; j --; } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_lncopy_16.c000066400000000000000000000150341313527062700210660ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj, k; FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16; jj = offset; j = (n >> 4); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; a5 = a + 4 * lda; a6 = a + 5 * lda; a7 = a + 6 * lda; a8 = a + 7 * lda; a9 = a + 8 * lda; a10 = a + 9 * lda; a11 = a + 10 * lda; a12 = a + 11 * lda; a13 = a + 12 * lda; a14 = a + 13 * lda; a15 = a + 14 * lda; a16 = a + 15 * lda; a += 16 * lda; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 16)) { for (k = 0; k < ii - jj; k ++) { *(b + k) = *(a1 + k * lda); } *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); } if (ii - jj >= 16) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a2 + 0); *(b + 2) = *(a3 + 0); *(b + 3) = *(a4 + 0); *(b + 4) = *(a5 + 0); *(b + 5) = *(a6 + 0); *(b + 6) = *(a7 + 0); *(b + 7) = *(a8 + 0); *(b + 8) = *(a9 + 0); *(b + 9) = *(a10 + 0); *(b + 10) = *(a11 + 0); *(b + 11) = *(a12 + 0); *(b + 12) = *(a13 + 0); *(b + 13) = *(a14 + 0); *(b + 14) = *(a15 + 0); *(b + 15) = *(a16 + 0); } a1 ++; a2 ++; a3 ++; a4 ++; a5 ++; a6 ++; a7 ++; a8 ++; a9 ++; a10 ++; a11 ++; a12 ++; a13 ++; a14 ++; a15 ++; a16 ++; b += 16; ii ++; } jj += 16; j --; } if (n & 8) { a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; a5 = a + 4 * lda; a6 = a + 5 * lda; a7 = a + 6 * lda; a8 = a + 7 * lda; a += 8 * lda; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 8)) { for (k = 0; k < ii - jj; k ++) { *(b + k) = *(a1 + k * lda); } *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); } if (ii - jj >= 8) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a2 + 0); *(b + 2) = *(a3 + 0); *(b + 3) = *(a4 + 0); *(b + 4) = *(a5 + 0); *(b + 5) = *(a6 + 0); *(b + 6) = *(a7 + 0); *(b + 7) = *(a8 + 0); } a1 ++; a2 ++; a3 ++; a4 ++; a5 ++; a6 ++; a7 ++; a8 ++; b += 8; ii ++; } jj += 8; } if (n & 4) { a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; a += 4 * lda; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 4)) { for (k = 0; k < ii - jj; k ++) { *(b + k) = *(a1 + k * lda); } *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); } if (ii - jj >= 4) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a2 + 0); *(b + 2) = *(a3 + 0); *(b + 3) = *(a4 + 0); } a1 ++; a2 ++; a3 ++; a4 ++; b += 4; ii ++; } jj += 4; } if (n & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; a += 2 * lda; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 2)) { for (k = 0; k < ii - jj; k ++) { *(b + k) = *(a1 + k * lda); } *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); } if (ii - jj >= 2) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a2 + 0); } a1 ++; a2 ++; b += 2; ii ++; } jj += 2; } if (n & 1) { a1 = a + 0 * lda; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 1)) { for (k = 0; k < ii - jj; k ++) { *(b + k) = *(a1 + k * lda); } *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); } if (ii - jj >= 1) { *(b + 0) = *(a1 + 0); } a1 ++; b += 1; ii ++; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_lncopy_2.c000066400000000000000000000102011313527062700207700ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04; FLOAT *a1, *a2; jj = offset; j = (n >> 1); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; i = (m >> 1); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); #ifndef UNIT data04 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 2) = data02; *(b + 3) = INV(data04); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a2 + 0); data04 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data03; *(b + 2) = data02; *(b + 3) = data04; } a1 += 2; a2 += 2; b += 4; i --; ii += 2; } if ((m & 1) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a2 + 0); *(b + 0) = data01; *(b + 1) = data02; } b += 2; } a += 2 * lda; jj += 2; j --; } if (n & 1) { a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); *(b + 0) = data01; } a1+= 1; b += 1; i --; ii += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_lncopy_4.c000066400000000000000000000154411313527062700210050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT *a1, *a2, *a3, *a4; jj = offset; j = (n >> 2); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; i = (m >> 2); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); #ifndef UNIT data06 = *(a2 + 1); #endif data07 = *(a2 + 2); data08 = *(a2 + 3); #ifndef UNIT data11 = *(a3 + 2); #endif data12 = *(a3 + 3); #ifndef UNIT data16 = *(a4 + 3); #endif *(b + 0) = INV(data01); *(b + 4) = data02; *(b + 5) = INV(data06); *(b + 8) = data03; *(b + 9) = data07; *(b + 10) = INV(data11); *(b + 12) = data04; *(b + 13) = data08; *(b + 14) = data12; *(b + 15) = INV(data16); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a2 + 0); data06 = *(a2 + 1); data07 = *(a2 + 2); data08 = *(a2 + 3); data09 = *(a3 + 0); data10 = *(a3 + 1); data11 = *(a3 + 2); data12 = *(a3 + 3); data13 = *(a4 + 0); data14 = *(a4 + 1); data15 = *(a4 + 2); data16 = *(a4 + 3); *(b + 0) = data01; *(b + 1) = data05; *(b + 2) = data09; *(b + 3) = data13; *(b + 4) = data02; *(b + 5) = data06; *(b + 6) = data10; *(b + 7) = data14; *(b + 8) = data03; *(b + 9) = data07; *(b + 10) = data11; *(b + 11) = data15; *(b + 12) = data04; *(b + 13) = data08; *(b + 14) = data12; *(b + 15) = data16; } a1 += 4; a2 += 4; a3 += 4; a4 += 4; b += 16; i --; ii += 4; } if ((m & 2) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); #ifndef UNIT data06 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 4) = data02; *(b + 5) = INV(data06); } if (ii > jj) { data01 = *(a1 + 0); 
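/* ii and jj track the absolute row and column of the element being packed; diagonal entries
   (ii == jj) are stored through INV(), i.e. pre-inverted (or forced to ONE for unit-diagonal
   matrices), so the TRSM solve kernels above can multiply by the packed diagonal value
   instead of dividing by it.  Entries with ii > jj are copied unchanged, while positions
   above the diagonal are skipped and their slots in b left unwritten. */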
data02 = *(a1 + 1); data03 = *(a2 + 0); data04 = *(a2 + 1); data05 = *(a3 + 0); data06 = *(a3 + 1); data07 = *(a4 + 0); data08 = *(a4 + 1); *(b + 0) = data01; *(b + 1) = data03; *(b + 2) = data05; *(b + 3) = data07; *(b + 4) = data02; *(b + 5) = data04; *(b + 6) = data06; *(b + 7) = data08; } a1 += 2; a2 += 2; a3 += 2; a4 += 2; b += 8; ii += 2; } if ((m & 1) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a2 + 0); data03 = *(a3 + 0); data04 = *(a4 + 0); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } b += 4; } a += 4 * lda; jj += 4; j --; } if (n & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; i = (m >> 1); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); #ifndef UNIT data04 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 2) = data02; *(b + 3) = INV(data04); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a2 + 0); data04 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data03; *(b + 2) = data02; *(b + 3) = data04; } a1 += 2; a2 += 2; b += 4; i --; ii += 2; } if ((m & 1) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a2 + 0); *(b + 0) = data01; *(b + 1) = data02; } b += 2; } a += 2 * lda; jj += 2; } if (n & 1) { a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); *(b + 0) = data01; } a1+= 1; b += 1; i --; ii += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_lncopy_6.c000066400000000000000000000154411313527062700210070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT *a1, *a2, *a3, *a4; jj = offset; j = (n >> 2); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; i = (m >> 2); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); #ifndef UNIT data06 = *(a2 + 1); #endif data07 = *(a2 + 2); data08 = *(a2 + 3); #ifndef UNIT data11 = *(a3 + 2); #endif data12 = *(a3 + 3); #ifndef UNIT data16 = *(a4 + 3); #endif *(b + 0) = INV(data01); *(b + 4) = data02; *(b + 5) = INV(data06); *(b + 8) = data03; *(b + 9) = data07; *(b + 10) = INV(data11); *(b + 12) = data04; *(b + 13) = data08; *(b + 14) = data12; *(b + 15) = INV(data16); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a2 + 0); data06 = *(a2 + 1); data07 = *(a2 + 2); data08 = *(a2 + 3); data09 = *(a3 + 0); data10 = *(a3 + 1); data11 = *(a3 + 2); data12 = *(a3 + 3); data13 = *(a4 + 0); data14 = *(a4 + 1); data15 = *(a4 + 2); data16 = *(a4 + 3); *(b + 0) = data01; *(b + 1) = data05; *(b + 2) = data09; *(b + 3) = data13; *(b + 4) = data02; *(b + 5) = data06; *(b + 6) = data10; *(b + 7) = data14; *(b + 8) = data03; *(b + 9) = data07; *(b + 10) = data11; *(b + 11) = data15; *(b + 12) = data04; *(b + 13) = data08; *(b + 14) = data12; *(b + 15) = data16; } a1 += 4; a2 += 4; a3 += 4; a4 += 4; b += 16; i --; ii += 4; } if ((m & 2) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); #ifndef UNIT data06 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 4) = data02; *(b + 5) = INV(data06); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a2 + 0); data04 = *(a2 + 1); data05 = *(a3 + 0); data06 = *(a3 + 1); data07 = *(a4 + 0); data08 = *(a4 + 1); *(b + 0) = data01; *(b + 1) = data03; *(b + 2) = data05; *(b + 3) = data07; *(b + 4) = data02; *(b + 5) = data04; *(b + 6) = data06; *(b + 7) = data08; } a1 += 2; a2 += 2; a3 += 2; a4 += 2; b += 8; ii += 2; } if ((m & 1) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a2 + 0); data03 = *(a3 + 0); data04 = *(a4 + 0); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } b += 4; } a += 4 * lda; jj += 4; j --; } if (n & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; i = (m >> 1); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); #ifndef UNIT data04 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 2) = data02; *(b + 3) = INV(data04); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a2 + 0); data04 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data03; *(b + 2) = data02; *(b + 3) = data04; } a1 += 2; a2 += 2; b += 4; i --; ii += 2; } if ((m & 1) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b 
+ 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a2 + 0); *(b + 0) = data01; *(b + 1) = data02; } b += 2; } a += 2 * lda; jj += 2; } if (n & 1) { a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); *(b + 0) = data01; } a1+= 1; b += 1; i --; ii += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_lncopy_8.c000066400000000000000000000372401313527062700210120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT data17, data18, data19, data20, data21, data22, data23, data24; FLOAT data25, data26, data27, data28, data29, data30, data31, data32; FLOAT data33, data34, data35, data36, data37, data38, data39, data40; FLOAT data41, data42, data43, data44, data45, data46, data47, data48; FLOAT data49, data50, data51, data52, data53, data54, data55, data56; FLOAT data57, data58, data59, data60, data61, data62, data63, data64; FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; jj = offset; j = (n >> 3); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; a5 = a + 4 * lda; a6 = a + 5 * lda; a7 = a + 6 * lda; a8 = a + 7 * lda; ii = 0; i = (m >> 3); while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); #ifndef UNIT data10 = *(a2 + 1); #endif data11 = *(a2 + 2); data12 = *(a2 + 3); data13 = *(a2 + 4); data14 = *(a2 + 5); data15 = *(a2 + 6); data16 = *(a2 + 7); #ifndef UNIT data19 = *(a3 + 2); #endif data20 = *(a3 + 3); data21 = *(a3 + 4); data22 = *(a3 + 5); data23 = *(a3 + 6); data24 = *(a3 + 7); #ifndef UNIT data28 = *(a4 + 3); #endif data29 = *(a4 + 4); data30 = *(a4 + 5); data31 = *(a4 + 6); data32 = *(a4 + 7); #ifndef UNIT data37 = *(a5 + 4); #endif data38 = *(a5 + 5); data39 = *(a5 + 6); data40 = *(a5 + 7); #ifndef UNIT data46 = *(a6 + 5); #endif data47 = *(a6 + 6); data48 = *(a6 + 7); #ifndef UNIT data55 = *(a7 + 6); #endif data56 = *(a7 + 7); #ifndef UNIT data64 = *(a8 + 7); #endif *(b + 0) = INV(data01); *(b + 8) = data02; *(b + 9) = INV(data10); *(b + 16) = data03; *(b + 17) = data11; *(b + 18) = INV(data19); *(b + 24) = data04; *(b + 25) = data12; *(b + 26) = data20; *(b + 27) = INV(data28); *(b + 32) = data05; *(b + 33) = data13; *(b + 34) = data21; *(b + 35) = data29; *(b + 36) = INV(data37); *(b + 40) = data06; *(b + 41) = data14; *(b + 42) = data22; *(b + 43) = data30; *(b + 44) = data38; *(b + 45) = INV(data46); *(b + 48) = data07; *(b + 49) = data15; *(b + 50) = data23; *(b + 51) = data31; *(b + 52) = data39; *(b + 53) = data47; *(b + 54) = INV(data55); *(b + 56) = data08; *(b + 57) = data16; *(b + 58) = data24; *(b + 59) = data32; *(b + 60) = data40; *(b + 61) = data48; *(b + 62) = data56; *(b + 63) = INV(data64); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data13 = *(a2 + 4); data14 = *(a2 + 5); data15 = *(a2 + 6); data16 = *(a2 + 7); data17 = *(a3 + 0); data18 = *(a3 + 1); data19 = *(a3 + 2); data20 = *(a3 + 3); data21 = *(a3 + 4); data22 = *(a3 + 5); data23 = *(a3 + 6); data24 = *(a3 + 7); data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); data28 = *(a4 + 3); data29 = *(a4 + 4); data30 = *(a4 + 5); data31 = *(a4 + 6); data32 = *(a4 + 7); data33 = *(a5 + 0); data34 = *(a5 + 1); data35 = *(a5 + 2); data36 = *(a5 + 3); data37 = *(a5 + 4); data38 = *(a5 + 5); data39 
= *(a5 + 6); data40 = *(a5 + 7); data41 = *(a6 + 0); data42 = *(a6 + 1); data43 = *(a6 + 2); data44 = *(a6 + 3); data45 = *(a6 + 4); data46 = *(a6 + 5); data47 = *(a6 + 6); data48 = *(a6 + 7); data49 = *(a7 + 0); data50 = *(a7 + 1); data51 = *(a7 + 2); data52 = *(a7 + 3); data53 = *(a7 + 4); data54 = *(a7 + 5); data55 = *(a7 + 6); data56 = *(a7 + 7); data57 = *(a8 + 0); data58 = *(a8 + 1); data59 = *(a8 + 2); data60 = *(a8 + 3); data61 = *(a8 + 4); data62 = *(a8 + 5); data63 = *(a8 + 6); data64 = *(a8 + 7); *(b + 0) = data01; *(b + 1) = data09; *(b + 2) = data17; *(b + 3) = data25; *(b + 4) = data33; *(b + 5) = data41; *(b + 6) = data49; *(b + 7) = data57; *(b + 8) = data02; *(b + 9) = data10; *(b + 10) = data18; *(b + 11) = data26; *(b + 12) = data34; *(b + 13) = data42; *(b + 14) = data50; *(b + 15) = data58; *(b + 16) = data03; *(b + 17) = data11; *(b + 18) = data19; *(b + 19) = data27; *(b + 20) = data35; *(b + 21) = data43; *(b + 22) = data51; *(b + 23) = data59; *(b + 24) = data04; *(b + 25) = data12; *(b + 26) = data20; *(b + 27) = data28; *(b + 28) = data36; *(b + 29) = data44; *(b + 30) = data52; *(b + 31) = data60; *(b + 32) = data05; *(b + 33) = data13; *(b + 34) = data21; *(b + 35) = data29; *(b + 36) = data37; *(b + 37) = data45; *(b + 38) = data53; *(b + 39) = data61; *(b + 40) = data06; *(b + 41) = data14; *(b + 42) = data22; *(b + 43) = data30; *(b + 44) = data38; *(b + 45) = data46; *(b + 46) = data54; *(b + 47) = data62; *(b + 48) = data07; *(b + 49) = data15; *(b + 50) = data23; *(b + 51) = data31; *(b + 52) = data39; *(b + 53) = data47; *(b + 54) = data55; *(b + 55) = data63; *(b + 56) = data08; *(b + 57) = data16; *(b + 58) = data24; *(b + 59) = data32; *(b + 60) = data40; *(b + 61) = data48; *(b + 62) = data56; *(b + 63) = data64; } a1 += 8; a2 += 8; a3 += 8; a4 += 8; a5 += 8; a6 += 8; a7 += 8; a8 += 8; b += 64; i --; ii += 8; } if (m & 4) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); #ifndef UNIT data10 = *(a2 + 1); #endif data11 = *(a2 + 2); data12 = *(a2 + 3); #ifndef UNIT data19 = *(a3 + 2); #endif data20 = *(a3 + 3); #ifndef UNIT data28 = *(a4 + 3); #endif *(b + 0) = INV(data01); *(b + 8) = data02; *(b + 9) = INV(data10); *(b + 16) = data03; *(b + 17) = data11; *(b + 18) = INV(data19); *(b + 24) = data04; *(b + 25) = data12; *(b + 26) = data20; *(b + 27) = INV(data28); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data17 = *(a3 + 0); data18 = *(a3 + 1); data19 = *(a3 + 2); data20 = *(a3 + 3); data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); data28 = *(a4 + 3); data33 = *(a5 + 0); data34 = *(a5 + 1); data35 = *(a5 + 2); data36 = *(a5 + 3); data41 = *(a6 + 0); data42 = *(a6 + 1); data43 = *(a6 + 2); data44 = *(a6 + 3); data49 = *(a7 + 0); data50 = *(a7 + 1); data51 = *(a7 + 2); data52 = *(a7 + 3); data57 = *(a8 + 0); data58 = *(a8 + 1); data59 = *(a8 + 2); data60 = *(a8 + 3); *(b + 0) = data01; *(b + 1) = data09; *(b + 2) = data17; *(b + 3) = data25; *(b + 4) = data33; *(b + 5) = data41; *(b + 6) = data49; *(b + 7) = data57; *(b + 8) = data02; *(b + 9) = data10; *(b + 10) = data18; *(b + 11) = data26; *(b + 12) = data34; *(b + 13) = data42; *(b + 14) = data50; *(b + 15) = data58; *(b + 16) = data03; *(b + 17) = data11; *(b + 18) = data19; *(b + 19) = data27; *(b + 20) = data35; *(b + 21) = data43; *(b + 22) = data51; *(b + 23) = data59; *(b + 24) = 
data04; *(b + 25) = data12; *(b + 26) = data20; *(b + 27) = data28; *(b + 28) = data36; *(b + 29) = data44; *(b + 30) = data52; *(b + 31) = data60; } a1 += 4; a2 += 4; a3 += 4; a4 += 4; a5 += 4; a6 += 4; a7 += 4; a8 += 4; b += 32; ii += 4; } if (m & 2) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); #ifndef UNIT data10 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 8) = data02; *(b + 9) = INV(data10); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data09 = *(a2 + 0); data10 = *(a2 + 1); data17 = *(a3 + 0); data18 = *(a3 + 1); data25 = *(a4 + 0); data26 = *(a4 + 1); data33 = *(a5 + 0); data34 = *(a5 + 1); data41 = *(a6 + 0); data42 = *(a6 + 1); data49 = *(a7 + 0); data50 = *(a7 + 1); data57 = *(a8 + 0); data58 = *(a8 + 1); *(b + 0) = data01; *(b + 1) = data09; *(b + 2) = data17; *(b + 3) = data25; *(b + 4) = data33; *(b + 5) = data41; *(b + 6) = data49; *(b + 7) = data57; *(b + 8) = data02; *(b + 9) = data10; *(b + 10) = data18; *(b + 11) = data26; *(b + 12) = data34; *(b + 13) = data42; *(b + 14) = data50; *(b + 15) = data58; } a1 += 2; a2 += 2; a3 += 2; a4 += 2; a5 += 2; a6 += 2; a7 += 2; a8 += 2; b += 16; ii += 2; } if (m & 1) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); data09 = *(a2 + 0); data17 = *(a3 + 0); data25 = *(a4 + 0); data33 = *(a5 + 0); data41 = *(a6 + 0); data49 = *(a7 + 0); data57 = *(a8 + 0); *(b + 0) = data01; *(b + 1) = data09; *(b + 2) = data17; *(b + 3) = data25; *(b + 4) = data33; *(b + 5) = data41; *(b + 6) = data49; *(b + 7) = data57; } b += 8; } a += 8 * lda; jj += 8; j --; } if (n & 4) { a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; ii = 0; i = (m >> 2); while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); #ifndef UNIT data10 = *(a2 + 1); #endif data11 = *(a2 + 2); data12 = *(a2 + 3); #ifndef UNIT data19 = *(a3 + 2); #endif data20 = *(a3 + 3); #ifndef UNIT data28 = *(a4 + 3); #endif *(b + 0) = INV(data01); *(b + 4) = data02; *(b + 5) = INV(data10); *(b + 8) = data03; *(b + 9) = data11; *(b + 10) = INV(data19); *(b + 12) = data04; *(b + 13) = data12; *(b + 14) = data20; *(b + 15) = INV(data28); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data17 = *(a3 + 0); data18 = *(a3 + 1); data19 = *(a3 + 2); data20 = *(a3 + 3); data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); data28 = *(a4 + 3); *(b + 0) = data01; *(b + 1) = data09; *(b + 2) = data17; *(b + 3) = data25; *(b + 4) = data02; *(b + 5) = data10; *(b + 6) = data18; *(b + 7) = data26; *(b + 8) = data03; *(b + 9) = data11; *(b + 10) = data19; *(b + 11) = data27; *(b + 12) = data04; *(b + 13) = data12; *(b + 14) = data20; *(b + 15) = data28; } a1 += 4; a2 += 4; a3 += 4; a4 += 4; b += 16; i --; ii += 4; } if (m & 2) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); #ifndef UNIT data10 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 4) = data02; *(b + 5) = INV(data10); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data09 = *(a2 + 0); data10 = *(a2 + 1); data17 = *(a3 + 0); data18 = *(a3 + 1); data25 = *(a4 + 0); data26 = *(a4 + 1); *(b + 0) = data01; *(b + 1) = data09; *(b + 2) = data17; *(b + 3) = data25; *(b + 4) = data02; *(b + 5) = data10; *(b + 6) = data18; *(b + 7) = data26; } a1 += 2; a2 += 2; a3 += 2; a4 += 2; b 
+= 8; ii += 2; } if (m & 1) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); data09 = *(a2 + 0); data17 = *(a3 + 0); data25 = *(a4 + 0); *(b + 0) = data01; *(b + 1) = data09; *(b + 2) = data17; *(b + 3) = data25; } b += 4; } a += 4 * lda; jj += 4; } if (n & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; ii = 0; i = (m >> 1); while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); #ifndef UNIT data10 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 2) = data02; *(b + 3) = INV(data10); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data09 = *(a2 + 0); data10 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data09; *(b + 2) = data02; *(b + 3) = data10; } a1 += 2; a2 += 2; b += 4; i --; ii += 2; } if (m & 1) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); data09 = *(a2 + 0); *(b + 0) = data01; *(b + 1) = data09; } b += 2; } a += 2 * lda; jj += 2; } if (n & 1) { a1 = a + 0 * lda; ii = 0; i = m; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); *(b + 0) = data01; } a1 += 1; b += 1; i --; ii += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_ltcopy_1.c000066400000000000000000000063541313527062700210130ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; #ifndef UNIT FLOAT data01; #endif FLOAT *a1; jj = offset; j = n; while (j > 0){ a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii < jj) *(b + 0) = *(a1 + 0); a1 += lda; b ++; i --; ii ++; } a ++; jj ++; j --; } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_ltcopy_16.c000066400000000000000000000125301313527062700210720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj, k; FLOAT *a1; jj = offset; j = (n >> 4); while (j > 0){ a1 = a; a += 16; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 16)) { *(b + ii - jj) = INV(*(a1 + ii - jj)); for (k = ii - jj + 1; k < 16; k ++) { *(b + k) = *(a1 + k); } } if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); *(b + 2) = *(a1 + 2); *(b + 3) = *(a1 + 3); *(b + 4) = *(a1 + 4); *(b + 5) = *(a1 + 5); *(b + 6) = *(a1 + 6); *(b + 7) = *(a1 + 7); *(b + 8) = *(a1 + 8); *(b + 9) = *(a1 + 9); *(b + 10) = *(a1 + 10); *(b + 11) = *(a1 + 11); *(b + 12) = *(a1 + 12); *(b + 13) = *(a1 + 13); *(b + 14) = *(a1 + 14); *(b + 15) = *(a1 + 15); } b += 16; a1 += lda; ii ++; } jj += 16; j --; } j = (n & 8); if (j > 0) { a1 = a; a += 8; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 8)) { *(b + ii - jj) = INV(*(a1 + ii - jj)); for (k = ii - jj + 1; k < 8; k ++) { *(b + k) = *(a1 + k); } } if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); *(b + 2) = *(a1 + 2); *(b + 3) = *(a1 + 3); *(b + 4) = *(a1 + 4); *(b + 5) = *(a1 + 5); *(b + 6) = *(a1 + 6); *(b + 7) = *(a1 + 7); } b += 8; a1 += lda; ii ++; } jj += 8; } j = (n & 4); if (j > 0) { a1 = a; a += 4; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 4)) { *(b + ii - jj) = INV(*(a1 + ii - jj)); for (k = ii - jj + 1; k < 4; k ++) { *(b + k) = *(a1 + k); } } if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); *(b + 2) = *(a1 + 2); *(b + 3) = *(a1 + 3); } b += 4; a1 += lda; ii ++; } jj += 4; } j = (n & 2); if (j > 0) { a1 = a; a += 2; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 2)) { *(b + ii - jj) = INV(*(a1 + ii - jj)); for (k = ii - jj + 1; k < 2; k ++) { *(b + k) = *(a1 + k); } } if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); } b += 2; a1 += lda; ii ++; } jj += 2; } j = (n & 1); if (j > 0) { a1 = a; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 1)) { *(b + ii - jj) = INV(*(a1 + ii - jj)); } if (ii - jj < 0) { *(b + 0) = *(a1 + 0); } b += 1; a1 += lda; ii ++; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_ltcopy_2.c000066400000000000000000000102741313527062700210100ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04; FLOAT *a1, *a2; jj = offset; j = (n >> 1); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; i = (m >> 1); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); #ifndef UNIT data04 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 3) = INV(data04); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a2 + 0); data04 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } a1 += 2 * lda; a2 += 2 * lda; b += 4; i --; ii += 2; } if ((m & 1) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); *(b + 0) = INV(data01); *(b + 1) = data02; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); *(b + 0) = data01; *(b + 1) = data02; } b += 2; } a += 2; jj += 2; j --; } if (n & 1) { a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii < jj) { data01 = *(a1 + 0); *(b + 0) = data01; } a1 += 1 * lda; b += 1; i --; ii += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_ltcopy_4.c000066400000000000000000000161341313527062700210130ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT *a1, *a2, *a3, *a4; jj = offset; j = (n >> 2); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; i = (m >> 2); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); #ifndef UNIT data06 = *(a2 + 1); #endif data07 = *(a2 + 2); data08 = *(a2 + 3); #ifndef UNIT data11 = *(a3 + 2); #endif data12 = *(a3 + 3); #ifndef UNIT data16 = *(a4 + 3); #endif *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 5) = INV(data06); *(b + 6) = data07; *(b + 7) = data08; *(b + 10) = INV(data11); *(b + 11) = data12; *(b + 15) = INV(data16); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a2 + 0); data06 = *(a2 + 1); data07 = *(a2 + 2); data08 = *(a2 + 3); data09 = *(a3 + 0); data10 = *(a3 + 1); data11 = *(a3 + 2); data12 = *(a3 + 3); data13 = *(a4 + 0); data14 = *(a4 + 1); data15 = *(a4 + 2); data16 = *(a4 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; *(b + 8) = data09; *(b + 9) = data10; *(b + 10) = data11; *(b + 11) = data12; *(b + 12) = data13; *(b + 13) = data14; *(b + 14) = data15; *(b + 15) = data16; } a1 += 4 * lda; a2 += 4 * lda; a3 += 4 * lda; a4 += 4 * lda; b += 16; i --; ii += 4; } if ((m & 2) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); #ifndef UNIT data06 = *(a2 + 1); #endif data07 = *(a2 + 2); data08 = *(a2 + 3); *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 5) = INV(data06); *(b + 6) = data07; *(b + 7) = data08; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a2 + 0); data06 = *(a2 + 1); data07 = *(a2 + 2); data08 = *(a2 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; } a1 += 2 * lda; a2 += 2 * lda; b += 8; ii += 2; } if ((m & 1) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 
+ 3); *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } b += 4; } a += 4; jj += 4; j --; } if (n & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; i = (m >> 1); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); #ifndef UNIT data04 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 3) = INV(data04); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a2 + 0); data04 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } a1 += 2 * lda; a2 += 2 * lda; b += 4; i --; ii += 2; } if ((m & 1) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); *(b + 0) = data01; *(b + 1) = data02; } b += 2; } a += 2; jj += 2; } if (n & 1) { a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii < jj) { data01 = *(a1 + 0); *(b + 0) = data01; } a1 += 1 * lda; b += 1; i --; ii += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_ltcopy_6.c000066400000000000000000000161341313527062700210150ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT *a1, *a2, *a3, *a4; jj = offset; j = (n >> 2); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; i = (m >> 2); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); #ifndef UNIT data06 = *(a2 + 1); #endif data07 = *(a2 + 2); data08 = *(a2 + 3); #ifndef UNIT data11 = *(a3 + 2); #endif data12 = *(a3 + 3); #ifndef UNIT data16 = *(a4 + 3); #endif *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 5) = INV(data06); *(b + 6) = data07; *(b + 7) = data08; *(b + 10) = INV(data11); *(b + 11) = data12; *(b + 15) = INV(data16); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a2 + 0); data06 = *(a2 + 1); data07 = *(a2 + 2); data08 = *(a2 + 3); data09 = *(a3 + 0); data10 = *(a3 + 1); data11 = *(a3 + 2); data12 = *(a3 + 3); data13 = *(a4 + 0); data14 = *(a4 + 1); data15 = *(a4 + 2); data16 = *(a4 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; *(b + 8) = data09; *(b + 9) = data10; *(b + 10) = data11; *(b + 11) = data12; *(b + 12) = data13; *(b + 13) = data14; *(b + 14) = data15; *(b + 15) = data16; } a1 += 4 * lda; a2 += 4 * lda; a3 += 4 * lda; a4 += 4 * lda; b += 16; i --; ii += 4; } if ((m & 2) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); #ifndef UNIT data06 = *(a2 + 1); #endif data07 = *(a2 + 2); data08 = *(a2 + 3); *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 5) = INV(data06); *(b + 6) = data07; *(b + 7) = data08; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a2 + 0); data06 = *(a2 + 1); data07 = *(a2 + 2); data08 = *(a2 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; } a1 += 2 * lda; a2 += 2 * lda; b += 8; ii += 2; } if ((m & 1) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } b += 4; } a += 4; jj += 4; j --; } if (n & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; i = (m >> 1); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); #ifndef UNIT data04 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 3) = INV(data04); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a2 + 0); data04 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } a1 += 2 * lda; a2 += 2 * lda; b += 4; i --; ii += 2; } if ((m & 1) != 0) { if (ii== jj) { 
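/* last odd row lands on the diagonal: store the reciprocal of the diagonal entry (or ONE when UNIT is defined) */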
#ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); *(b + 0) = data01; *(b + 1) = data02; } b += 2; } a += 2; jj += 2; } if (n & 1) { a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii < jj) { data01 = *(a1 + 0); *(b + 0) = data01; } a1 += 1 * lda; b += 1; i --; ii += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_ltcopy_8.c000066400000000000000000000426051313527062700210210ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT data17, data18, data19, data20, data21, data22, data23, data24; FLOAT data25, data26, data27, data28, data29, data30, data31, data32; FLOAT data33, data34, data35, data36, data37, data38, data39, data40; FLOAT data41, data42, data43, data44, data45, data46, data47, data48; FLOAT data49, data50, data51, data52, data53, data54, data55, data56; FLOAT data57, data58, data59, data60, data61, data62, data63, data64; FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; jj = offset; j = (n >> 3); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; a5 = a + 4 * lda; a6 = a + 5 * lda; a7 = a + 6 * lda; a8 = a + 7 * lda; ii = 0; i = (m >> 3); while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); #ifndef UNIT data10 = *(a2 + 1); #endif data11 = *(a2 + 2); data12 = *(a2 + 3); data13 = *(a2 + 4); data14 = *(a2 + 5); data15 = *(a2 + 6); data16 = *(a2 + 7); #ifndef UNIT data19 = *(a3 + 2); #endif data20 = *(a3 + 3); data21 = *(a3 + 4); data22 = *(a3 + 5); data23 = *(a3 + 6); data24 = *(a3 + 7); #ifndef UNIT data28 = *(a4 + 3); #endif data29 = *(a4 + 4); data30 = *(a4 + 5); data31 = *(a4 + 6); data32 = *(a4 + 7); #ifndef UNIT data37 = *(a5 + 4); #endif data38 = *(a5 + 5); data39 = *(a5 + 6); data40 = *(a5 + 7); #ifndef UNIT data46 = *(a6 + 5); #endif data47 = *(a6 + 6); data48 = *(a6 + 7); #ifndef UNIT data55 = *(a7 + 6); #endif data56 = *(a7 + 7); #ifndef UNIT data64 = *(a8 + 7); #endif *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; *(b + 9) = INV(data10); *(b + 10) = data11; *(b + 11) = data12; *(b + 12) = data13; *(b + 13) = data14; *(b + 14) = data15; *(b + 15) = data16; *(b + 18) = INV(data19); *(b + 19) = data20; *(b + 20) = data21; *(b + 21) = data22; *(b + 22) = data23; *(b + 23) = data24; *(b + 27) = INV(data28); *(b + 28) = data29; *(b + 29) = data30; *(b + 30) = data31; *(b + 31) = data32; *(b + 36) = INV(data37); *(b + 37) = data38; *(b + 38) = data39; *(b + 39) = data40; *(b + 45) = INV(data46); *(b + 46) = data47; *(b + 47) = data48; *(b + 54) = INV(data55); *(b + 55) = data56; *(b + 63) = INV(data64); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data13 = *(a2 + 4); data14 = *(a2 + 5); data15 = *(a2 + 6); data16 = *(a2 + 7); data17 = *(a3 + 0); data18 = *(a3 + 1); data19 = *(a3 + 2); data20 = *(a3 + 3); data21 = *(a3 + 4); data22 = *(a3 + 5); data23 = *(a3 + 6); data24 = *(a3 + 7); data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); data28 = *(a4 + 3); data29 = *(a4 + 4); data30 = *(a4 + 5); data31 = *(a4 + 6); data32 = *(a4 + 7); data33 = *(a5 + 0); data34 = *(a5 + 1); data35 = *(a5 + 2); data36 = *(a5 + 3); data37 = *(a5 + 4); data38 = *(a5 + 5); data39 = *(a5 
+ 6); data40 = *(a5 + 7); data41 = *(a6 + 0); data42 = *(a6 + 1); data43 = *(a6 + 2); data44 = *(a6 + 3); data45 = *(a6 + 4); data46 = *(a6 + 5); data47 = *(a6 + 6); data48 = *(a6 + 7); data49 = *(a7 + 0); data50 = *(a7 + 1); data51 = *(a7 + 2); data52 = *(a7 + 3); data53 = *(a7 + 4); data54 = *(a7 + 5); data55 = *(a7 + 6); data56 = *(a7 + 7); data57 = *(a8 + 0); data58 = *(a8 + 1); data59 = *(a8 + 2); data60 = *(a8 + 3); data61 = *(a8 + 4); data62 = *(a8 + 5); data63 = *(a8 + 6); data64 = *(a8 + 7); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; *(b + 8) = data09; *(b + 9) = data10; *(b + 10) = data11; *(b + 11) = data12; *(b + 12) = data13; *(b + 13) = data14; *(b + 14) = data15; *(b + 15) = data16; *(b + 16) = data17; *(b + 17) = data18; *(b + 18) = data19; *(b + 19) = data20; *(b + 20) = data21; *(b + 21) = data22; *(b + 22) = data23; *(b + 23) = data24; *(b + 24) = data25; *(b + 25) = data26; *(b + 26) = data27; *(b + 27) = data28; *(b + 28) = data29; *(b + 29) = data30; *(b + 30) = data31; *(b + 31) = data32; *(b + 32) = data33; *(b + 33) = data34; *(b + 34) = data35; *(b + 35) = data36; *(b + 36) = data37; *(b + 37) = data38; *(b + 38) = data39; *(b + 39) = data40; *(b + 40) = data41; *(b + 41) = data42; *(b + 42) = data43; *(b + 43) = data44; *(b + 44) = data45; *(b + 45) = data46; *(b + 46) = data47; *(b + 47) = data48; *(b + 48) = data49; *(b + 49) = data50; *(b + 50) = data51; *(b + 51) = data52; *(b + 52) = data53; *(b + 53) = data54; *(b + 54) = data55; *(b + 55) = data56; *(b + 56) = data57; *(b + 57) = data58; *(b + 58) = data59; *(b + 59) = data60; *(b + 60) = data61; *(b + 61) = data62; *(b + 62) = data63; *(b + 63) = data64; } a1 += 8 * lda; a2 += 8 * lda; a3 += 8 * lda; a4 += 8 * lda; a5 += 8 * lda; a6 += 8 * lda; a7 += 8 * lda; a8 += 8 * lda; b += 64; i --; ii += 8; } if (m & 4) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); #ifndef UNIT data10 = *(a2 + 1); #endif data11 = *(a2 + 2); data12 = *(a2 + 3); data13 = *(a2 + 4); data14 = *(a2 + 5); data15 = *(a2 + 6); data16 = *(a2 + 7); #ifndef UNIT data19 = *(a3 + 2); #endif data20 = *(a3 + 3); data21 = *(a3 + 4); data22 = *(a3 + 5); data23 = *(a3 + 6); data24 = *(a3 + 7); #ifndef UNIT data28 = *(a4 + 3); #endif data29 = *(a4 + 4); data30 = *(a4 + 5); data31 = *(a4 + 6); data32 = *(a4 + 7); *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; *(b + 9) = INV(data10); *(b + 10) = data11; *(b + 11) = data12; *(b + 12) = data13; *(b + 13) = data14; *(b + 14) = data15; *(b + 15) = data16; *(b + 18) = INV(data19); *(b + 19) = data20; *(b + 20) = data21; *(b + 21) = data22; *(b + 22) = data23; *(b + 23) = data24; *(b + 27) = INV(data28); *(b + 28) = data29; *(b + 29) = data30; *(b + 30) = data31; *(b + 31) = data32; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data13 = *(a2 + 4); data14 = *(a2 + 5); data15 = *(a2 + 6); data16 = *(a2 + 7); data17 = *(a3 + 0); data18 = *(a3 + 1); data19 = *(a3 + 2); data20 = *(a3 + 3); data21 = *(a3 + 4); data22 = *(a3 + 5); data23 = *(a3 
+ 6); data24 = *(a3 + 7); data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); data28 = *(a4 + 3); data29 = *(a4 + 4); data30 = *(a4 + 5); data31 = *(a4 + 6); data32 = *(a4 + 7); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; *(b + 8) = data09; *(b + 9) = data10; *(b + 10) = data11; *(b + 11) = data12; *(b + 12) = data13; *(b + 13) = data14; *(b + 14) = data15; *(b + 15) = data16; *(b + 16) = data17; *(b + 17) = data18; *(b + 18) = data19; *(b + 19) = data20; *(b + 20) = data21; *(b + 21) = data22; *(b + 22) = data23; *(b + 23) = data24; *(b + 24) = data25; *(b + 25) = data26; *(b + 26) = data27; *(b + 27) = data28; *(b + 28) = data29; *(b + 29) = data30; *(b + 30) = data31; *(b + 31) = data32; } a1 += 4 * lda; a2 += 4 * lda; a3 += 4 * lda; a4 += 4 * lda; b += 32; ii += 4; } if (m & 2) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); #ifndef UNIT data10 = *(a2 + 1); #endif data11 = *(a2 + 2); data12 = *(a2 + 3); data13 = *(a2 + 4); data14 = *(a2 + 5); data15 = *(a2 + 6); data16 = *(a2 + 7); *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; *(b + 9) = INV(data10); *(b + 10) = data11; *(b + 11) = data12; *(b + 12) = data13; *(b + 13) = data14; *(b + 14) = data15; *(b + 15) = data16; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data13 = *(a2 + 4); data14 = *(a2 + 5); data15 = *(a2 + 6); data16 = *(a2 + 7); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; *(b + 8) = data09; *(b + 9) = data10; *(b + 10) = data11; *(b + 11) = data12; *(b + 12) = data13; *(b + 13) = data14; *(b + 14) = data15; *(b + 15) = data16; } a1 += 2 * lda; a2 += 2 * lda; b += 16; ii += 2; } if (m & 1) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; } b += 8; } a += 8; jj += 8; j --; } if (n & 4) { a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; ii = 0; i = (m >> 2); while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); #ifndef UNIT data10 = *(a2 + 1); #endif data11 = *(a2 + 2); data12 = *(a2 + 3); #ifndef UNIT data19 = *(a3 + 2); #endif data20 = *(a3 + 3); #ifndef UNIT data28 = *(a4 + 3); #endif *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 5) = INV(data10); *(b + 6) = data11; *(b + 7) = 
data12; *(b + 10) = INV(data19); *(b + 11) = data20; *(b + 15) = INV(data28); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data17 = *(a3 + 0); data18 = *(a3 + 1); data19 = *(a3 + 2); data20 = *(a3 + 3); data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); data28 = *(a4 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data09; *(b + 5) = data10; *(b + 6) = data11; *(b + 7) = data12; *(b + 8) = data17; *(b + 9) = data18; *(b + 10) = data19; *(b + 11) = data20; *(b + 12) = data25; *(b + 13) = data26; *(b + 14) = data27; *(b + 15) = data28; } a1 += 4 * lda; a2 += 4 * lda; a3 += 4 * lda; a4 += 4 * lda; b += 16; i --; ii += 4; } if (m & 2) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); #ifndef UNIT data10 = *(a2 + 1); #endif data11 = *(a2 + 2); data12 = *(a2 + 3); *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 6) = INV(data10); *(b + 7) = data11; *(b + 8) = data12; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data09; *(b + 5) = data10; *(b + 6) = data11; *(b + 7) = data12; } a1 += 2 * lda; a2 += 2 * lda; b += 8; ii += 2; } if (m & 1) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; // *(b + 4) = data05; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } b += 4; } a += 4; jj += 4; } if (n & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; ii = 0; i = (m >> 1); while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); #ifndef UNIT data10 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 3) = INV(data10); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data09 = *(a2 + 0); data10 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data09; *(b + 3) = data10; } a1 += 2 * lda; a2 += 2 * lda; b += 4; i --; ii += 2; } if (m & 1) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); *(b + 0) = INV(data01); *(b + 1) = data02; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); *(b + 0) = data01; *(b + 1) = data02; } b += 2; } a += 2; jj += 2; } if (n & 1) { a1 = a + 0 * lda; ii = 0; i = m; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii < jj) { data01 = *(a1 + 0); *(b + 0) = data01; } a1 += lda; b += 1; i --; ii += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_uncopy_1.c000066400000000000000000000063541313527062700210160ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; #ifndef UNIT FLOAT data01; #endif FLOAT *a1; jj = offset; j = n; while (j > 0){ a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii < jj) *(b + 0) = *(a1 + 0); a1 ++; b ++; i --; ii ++; } a += lda; jj ++; j --; } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_uncopy_16.c000066400000000000000000000150531313527062700211000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj, k; FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16; jj = offset; j = (n >> 4); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; a5 = a + 4 * lda; a6 = a + 5 * lda; a7 = a + 6 * lda; a8 = a + 7 * lda; a9 = a + 8 * lda; a10 = a + 9 * lda; a11 = a + 10 * lda; a12 = a + 11 * lda; a13 = a + 12 * lda; a14 = a + 13 * lda; a15 = a + 14 * lda; a16 = a + 15 * lda; a += 16 * lda; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 16)) { *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); for (k = ii - jj + 1; k < 16; k ++) { *(b + k) = *(a1 + k * lda); } } if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a2 + 0); *(b + 2) = *(a3 + 0); *(b + 3) = *(a4 + 0); *(b + 4) = *(a5 + 0); *(b + 5) = *(a6 + 0); *(b + 6) = *(a7 + 0); *(b + 7) = *(a8 + 0); *(b + 8) = *(a9 + 0); *(b + 9) = *(a10 + 0); *(b + 10) = *(a11 + 0); *(b + 11) = *(a12 + 0); *(b + 12) = *(a13 + 0); *(b + 13) = *(a14 + 0); *(b + 14) = *(a15 + 0); *(b + 15) = *(a16 + 0); } a1 ++; a2 ++; a3 ++; a4 ++; a5 ++; a6 ++; a7 ++; a8 ++; a9 ++; a10 ++; a11 ++; a12 ++; a13 ++; a14 ++; a15 ++; a16 ++; b += 16; ii ++; } jj += 16; j --; } if (n & 8) { a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; a5 = a + 4 * lda; a6 = a + 5 * lda; a7 = a + 6 * lda; a8 = a + 7 * lda; a += 8 * lda; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 8)) { *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); for (k = ii - jj + 1; k < 8; k ++) { *(b + k) = *(a1 + k * lda); } } if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a2 + 0); *(b + 2) = *(a3 + 0); *(b + 3) = *(a4 + 0); *(b + 4) = *(a5 + 0); *(b + 5) = *(a6 + 0); *(b + 6) = *(a7 + 0); *(b + 7) = *(a8 + 0); } a1 ++; a2 ++; a3 ++; a4 ++; a5 ++; a6 ++; a7 ++; a8 ++; b += 8; ii ++; } jj += 8; } if (n & 4) { a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; a += 4 * lda; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 4)) { *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); for (k = ii - jj + 1; k < 4; k ++) { *(b + k) = *(a1 + k * lda); } } if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a2 + 0); *(b + 2) = *(a3 + 0); *(b + 3) = *(a4 + 0); } a1 ++; a2 ++; a3 ++; a4 ++; b += 4; ii ++; } jj += 4; } if (n & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; a += 2 * lda; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 2)) { *(b + ii - jj) = INV(*(a1 + 
(ii - jj) * lda)); for (k = ii - jj + 1; k < 2; k ++) { *(b + k) = *(a1 + k * lda); } } if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a2 + 0); } a1 ++; a2 ++; b += 2; ii ++; } jj += 2; } if (n & 1) { a1 = a + 0 * lda; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 1)) { *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); for (k = ii - jj + 1; k < 1; k ++) { *(b + k) = *(a1 + k * lda); } } if (ii - jj < 0) { *(b + 0) = *(a1 + 0); } a1 ++; b += 1; ii ++; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_uncopy_2.c000066400000000000000000000102571313527062700210140ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04; FLOAT *a1, *a2; jj = offset; j = (n >> 1); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; i = (m >> 1); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data03 = *(a2 + 0); #ifndef UNIT data04 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 1) = data03; *(b + 3) = INV(data04); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a2 + 0); data04 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data03; *(b + 2) = data02; *(b + 3) = data04; } a1 += 2; a2 += 2; b += 4; i --; ii += 2; } if ((m & 1) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a2 + 0); *(b + 0) = INV(data01); *(b + 1) = data02; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a2 + 0); *(b + 0) = data01; *(b + 1) = data02; } b += 2; } a += 2 * lda; jj += 2; j --; } if (n & 1) { a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii < jj) { data01 = *(a1 + 0); *(b + 0) = data01; } a1+= 1; b += 1; i --; ii += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_uncopy_4.c000066400000000000000000000161351313527062700210170ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT *a1, *a2, *a3, *a4; jj = offset; j = (n >> 2); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; i = (m >> 2); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data05 = *(a2 + 0); #ifndef UNIT data06 = *(a2 + 1); #endif data09 = *(a3 + 0); data10 = *(a3 + 1); #ifndef UNIT data11 = *(a3 + 2); #endif data13 = *(a4 + 0); data14 = *(a4 + 1); data15 = *(a4 + 2); #ifndef UNIT data16 = *(a4 + 3); #endif *(b + 0) = INV(data01); *(b + 1) = data05; *(b + 2) = data09; *(b + 3) = data13; *(b + 5) = INV(data06); *(b + 6) = data10; *(b + 7) = data14; *(b + 10) = INV(data11); *(b + 11) = data15; *(b + 15) = INV(data16); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a2 + 0); data06 = *(a2 + 1); data07 = *(a2 + 2); data08 = *(a2 + 3); data09 = *(a3 + 0); data10 = *(a3 + 1); data11 = *(a3 + 2); data12 = *(a3 + 3); data13 = *(a4 + 0); data14 = *(a4 + 1); data15 = *(a4 + 2); data16 = *(a4 + 3); *(b + 0) = data01; *(b + 1) = data05; *(b + 2) = data09; *(b + 3) = data13; *(b + 4) = data02; *(b + 5) = data06; *(b + 6) = data10; *(b + 7) = data14; *(b + 8) = data03; *(b + 9) = data07; *(b + 10) = data11; *(b + 11) = data15; *(b + 12) = data04; *(b + 13) = data08; *(b + 14) = data12; *(b + 15) = data16; } a1 += 4; a2 += 4; a3 += 4; a4 += 4; b += 16; i --; ii += 4; } if ((m & 2) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data05 = *(a2 + 0); #ifndef UNIT data06 = *(a2 + 1); #endif data09 = *(a3 + 0); data10 = *(a3 + 1); data13 = *(a4 + 0); data14 = *(a4 + 1); *(b + 0) = INV(data01); *(b + 1) = data05; *(b + 2) = data09; *(b + 3) = data13; *(b + 5) = INV(data06); *(b + 6) = data10; *(b + 7) = data14; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a2 + 0); data04 = *(a2 + 1); data05 = *(a3 + 0); data06 = *(a3 + 1); data07 = *(a4 + 0); data08 = *(a4 + 1); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; } a1 += 2; a2 += 2; b += 8; ii += 2; } if ((m & 1) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data05 = *(a2 + 0); data09 = *(a3 + 0); data13 = *(a4 + 0); *(b + 0) = INV(data01); *(b + 1) = data05; *(b + 2) = data09; *(b + 3) = data13; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a2 + 0); data03 = *(a3 + 0); data04 = *(a4 + 0); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } b += 4; } a += 4 * lda; jj += 4; j --; } if (n & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; i = (m >> 1); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data03 = *(a2 + 0); #ifndef UNIT data04 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 1) = data03; *(b + 3) = INV(data04); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a2 + 0); data04 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data03; *(b + 2) = data02; *(b + 3) = data04; } a1 += 2; a2 += 2; b += 4; i --; ii += 2; } if ((m & 1) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data03 = 
*(a2 + 0); *(b + 0) = INV(data01); *(b + 1) = data03; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a2 + 0); *(b + 0) = data01; *(b + 1) = data02; } b += 2; } a += 2 * lda; jj += 2; } if (n & 1) { a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii < jj) { data01 = *(a1 + 0); *(b + 0) = data01; } a1+= 1; b += 1; i --; ii += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_uncopy_6.c000066400000000000000000000161351313527062700210210ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT *a1, *a2, *a3, *a4; jj = offset; j = (n >> 2); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; i = (m >> 2); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data05 = *(a2 + 0); #ifndef UNIT data06 = *(a2 + 1); #endif data09 = *(a3 + 0); data10 = *(a3 + 1); #ifndef UNIT data11 = *(a3 + 2); #endif data13 = *(a4 + 0); data14 = *(a4 + 1); data15 = *(a4 + 2); #ifndef UNIT data16 = *(a4 + 3); #endif *(b + 0) = INV(data01); *(b + 1) = data05; *(b + 2) = data09; *(b + 3) = data13; *(b + 5) = INV(data06); *(b + 6) = data10; *(b + 7) = data14; *(b + 10) = INV(data11); *(b + 11) = data15; *(b + 15) = INV(data16); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a2 + 0); data06 = *(a2 + 1); data07 = *(a2 + 2); data08 = *(a2 + 3); data09 = *(a3 + 0); data10 = *(a3 + 1); data11 = *(a3 + 2); data12 = *(a3 + 3); data13 = *(a4 + 0); data14 = *(a4 + 1); data15 = *(a4 + 2); data16 = *(a4 + 3); *(b + 0) = data01; *(b + 1) = data05; *(b + 2) = data09; *(b + 3) = data13; *(b + 4) = data02; *(b + 5) = data06; *(b + 6) = data10; *(b + 7) = data14; *(b + 8) = data03; *(b + 9) = data07; *(b + 10) = data11; *(b + 11) = data15; *(b + 12) = data04; *(b + 13) = data08; *(b + 14) = data12; *(b + 15) = data16; } a1 += 4; a2 += 4; a3 += 4; a4 += 4; b += 16; i --; ii += 4; } if ((m & 2) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data05 = *(a2 + 0); #ifndef UNIT data06 = *(a2 + 1); #endif data09 = *(a3 + 0); data10 = *(a3 + 1); data13 = *(a4 + 0); data14 = *(a4 + 1); *(b + 0) = INV(data01); *(b + 1) = data05; *(b + 2) = data09; *(b + 3) = data13; *(b + 5) = INV(data06); *(b + 6) = data10; *(b + 7) = data14; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a2 + 0); data04 = *(a2 + 1); data05 = *(a3 + 0); data06 = *(a3 + 1); data07 = *(a4 + 0); data08 = *(a4 + 1); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; } a1 += 2; a2 += 2; b += 8; ii += 2; } if ((m & 1) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data05 = *(a2 + 0); data09 = *(a3 + 0); data13 = *(a4 + 0); *(b + 0) = INV(data01); *(b + 1) = data05; *(b + 2) = data09; *(b + 3) = data13; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a2 + 0); data03 = *(a3 + 0); data04 = *(a4 + 0); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } b += 4; } a += 4 * lda; jj += 4; j --; } if (n & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; i = (m >> 1); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data03 = *(a2 + 0); #ifndef UNIT data04 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 1) = data03; *(b + 3) = INV(data04); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a2 + 0); data04 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data03; *(b + 2) = data02; *(b + 3) = data04; } a1 += 2; a2 += 2; b += 4; i --; ii += 2; } if ((m & 1) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data03 = 
*(a2 + 0); *(b + 0) = INV(data01); *(b + 1) = data03; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a2 + 0); *(b + 0) = data01; *(b + 1) = data02; } b += 2; } a += 2 * lda; jj += 2; } if (n & 1) { a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii < jj) { data01 = *(a1 + 0); *(b + 0) = data01; } a1+= 1; b += 1; i --; ii += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_uncopy_8.c000066400000000000000000000427271313527062700210310ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT data17, data18, data19, data20, data21, data22, data23, data24; FLOAT data25, data26, data27, data28, data29, data30, data31, data32; FLOAT data33, data34, data35, data36, data37, data38, data39, data40; FLOAT data41, data42, data43, data44, data45, data46, data47, data48; FLOAT data49, data50, data51, data52, data53, data54, data55, data56; FLOAT data57, data58, data59, data60, data61, data62, data63, data64; FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; jj = offset; j = (n >> 3); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; a5 = a + 4 * lda; a6 = a + 5 * lda; a7 = a + 6 * lda; a8 = a + 7 * lda; ii = 0; i = (m >> 3); while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data09 = *(a2 + 0); #ifndef UNIT data10 = *(a2 + 1); #endif data17 = *(a3 + 0); data18 = *(a3 + 1); #ifndef UNIT data19 = *(a3 + 2); #endif data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); #ifndef UNIT data28 = *(a4 + 3); #endif data33 = *(a5 + 0); data34 = *(a5 + 1); data35 = *(a5 + 2); data36 = *(a5 + 3); #ifndef UNIT data37 = *(a5 + 4); #endif data41 = *(a6 + 0); data42 = *(a6 + 1); data43 = *(a6 + 2); data44 = *(a6 + 3); data45 = *(a6 + 4); #ifndef UNIT data46 = *(a6 + 5); #endif data49 = *(a7 + 0); data50 = *(a7 + 1); data51 = *(a7 + 2); data52 = *(a7 + 3); data53 = *(a7 + 4); data54 = *(a7 + 5); #ifndef UNIT data55 = *(a7 + 6); #endif data57 = *(a8 + 0); data58 = *(a8 + 1); data59 = *(a8 + 2); data60 = *(a8 + 3); data61 = *(a8 + 4); data62 = *(a8 + 5); data63 = *(a8 + 6); #ifndef UNIT data64 = *(a8 + 7); #endif *(b + 0) = INV(data01); *(b + 1) = data09; *(b + 2) = data17; *(b + 3) = data25; *(b + 4) = data33; *(b + 5) = data41; *(b + 6) = data49; *(b + 7) = data57; *(b + 9) = INV(data10); *(b + 10) = data18; *(b + 11) = data26; *(b + 12) = data34; *(b + 13) = data42; *(b + 14) = data50; *(b + 15) = data58; *(b + 18) = INV(data19); *(b + 19) = data27; *(b + 20) = data35; *(b + 21) = data43; *(b + 22) = data51; *(b + 23) = data59; *(b + 27) = INV(data28); *(b + 28) = data36; *(b + 29) = data44; *(b + 30) = data52; *(b + 31) = data60; *(b + 36) = INV(data37); *(b + 37) = data45; *(b + 38) = data53; *(b + 39) = data61; *(b + 45) = INV(data46); *(b + 46) = data54; *(b + 47) = data62; *(b + 54) = INV(data55); *(b + 55) = data63; *(b + 63) = INV(data64); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data13 = *(a2 + 4); data14 = *(a2 + 5); data15 = *(a2 + 6); data16 = *(a2 + 7); data17 = *(a3 + 0); data18 = *(a3 + 1); data19 = *(a3 + 2); data20 = *(a3 + 3); data21 = *(a3 + 4); data22 = *(a3 + 5); data23 = *(a3 + 6); data24 = *(a3 + 7); data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); data28 = *(a4 + 3); data29 = *(a4 + 4); data30 = *(a4 + 5); data31 = *(a4 + 6); data32 = *(a4 + 7); data33 = *(a5 + 0); data34 = *(a5 + 1); data35 = *(a5 + 2); data36 = *(a5 + 3); data37 = *(a5 + 4); data38 = *(a5 + 5); data39 = *(a5 + 6); 
data40 = *(a5 + 7); data41 = *(a6 + 0); data42 = *(a6 + 1); data43 = *(a6 + 2); data44 = *(a6 + 3); data45 = *(a6 + 4); data46 = *(a6 + 5); data47 = *(a6 + 6); data48 = *(a6 + 7); data49 = *(a7 + 0); data50 = *(a7 + 1); data51 = *(a7 + 2); data52 = *(a7 + 3); data53 = *(a7 + 4); data54 = *(a7 + 5); data55 = *(a7 + 6); data56 = *(a7 + 7); data57 = *(a8 + 0); data58 = *(a8 + 1); data59 = *(a8 + 2); data60 = *(a8 + 3); data61 = *(a8 + 4); data62 = *(a8 + 5); data63 = *(a8 + 6); data64 = *(a8 + 7); *(b + 0) = data01; *(b + 1) = data09; *(b + 2) = data17; *(b + 3) = data25; *(b + 4) = data33; *(b + 5) = data41; *(b + 6) = data49; *(b + 7) = data57; *(b + 8) = data02; *(b + 9) = data10; *(b + 10) = data18; *(b + 11) = data26; *(b + 12) = data34; *(b + 13) = data42; *(b + 14) = data50; *(b + 15) = data58; *(b + 16) = data03; *(b + 17) = data11; *(b + 18) = data19; *(b + 19) = data27; *(b + 20) = data35; *(b + 21) = data43; *(b + 22) = data51; *(b + 23) = data59; *(b + 24) = data04; *(b + 25) = data12; *(b + 26) = data20; *(b + 27) = data28; *(b + 28) = data36; *(b + 29) = data44; *(b + 30) = data52; *(b + 31) = data60; *(b + 32) = data05; *(b + 33) = data13; *(b + 34) = data21; *(b + 35) = data29; *(b + 36) = data37; *(b + 37) = data45; *(b + 38) = data53; *(b + 39) = data61; *(b + 40) = data06; *(b + 41) = data14; *(b + 42) = data22; *(b + 43) = data30; *(b + 44) = data38; *(b + 45) = data46; *(b + 46) = data54; *(b + 47) = data62; *(b + 48) = data07; *(b + 49) = data15; *(b + 50) = data23; *(b + 51) = data31; *(b + 52) = data39; *(b + 53) = data47; *(b + 54) = data55; *(b + 55) = data63; *(b + 56) = data08; *(b + 57) = data16; *(b + 58) = data24; *(b + 59) = data32; *(b + 60) = data40; *(b + 61) = data48; *(b + 62) = data56; *(b + 63) = data64; } a1 += 8; a2 += 8; a3 += 8; a4 += 8; a5 += 8; a6 += 8; a7 += 8; a8 += 8; b += 64; i --; ii += 8; } if (m & 4) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data09 = *(a2 + 0); #ifndef UNIT data10 = *(a2 + 1); #endif data17 = *(a3 + 0); data18 = *(a3 + 1); #ifndef UNIT data19 = *(a3 + 2); #endif data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); #ifndef UNIT data28 = *(a4 + 3); #endif data33 = *(a5 + 0); data34 = *(a5 + 1); data35 = *(a5 + 2); data36 = *(a5 + 3); data41 = *(a6 + 0); data42 = *(a6 + 1); data43 = *(a6 + 2); data44 = *(a6 + 3); data49 = *(a7 + 0); data50 = *(a7 + 1); data51 = *(a7 + 2); data52 = *(a7 + 3); data57 = *(a8 + 0); data58 = *(a8 + 1); data59 = *(a8 + 2); data60 = *(a8 + 3); *(b + 0) = INV(data01); *(b + 1) = data09; *(b + 2) = data17; *(b + 3) = data25; *(b + 4) = data33; *(b + 5) = data41; *(b + 6) = data49; *(b + 7) = data57; *(b + 9) = INV(data10); *(b + 10) = data18; *(b + 11) = data26; *(b + 12) = data34; *(b + 13) = data42; *(b + 14) = data50; *(b + 15) = data58; *(b + 18) = INV(data19); *(b + 19) = data27; *(b + 20) = data35; *(b + 21) = data43; *(b + 22) = data51; *(b + 23) = data59; *(b + 27) = INV(data28); *(b + 28) = data36; *(b + 29) = data44; *(b + 30) = data52; *(b + 31) = data60; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data17 = *(a3 + 0); data18 = *(a3 + 1); data19 = *(a3 + 2); data20 = *(a3 + 3); data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); data28 = *(a4 + 3); data33 = *(a5 + 0); data34 = *(a5 + 1); data35 = *(a5 + 2); data36 = *(a5 + 3); data41 = *(a6 + 0); data42 = *(a6 + 1); data43 = *(a6 + 2); data44 = *(a6 + 3); data49 = *(a7 + 0); data50 
= *(a7 + 1); data51 = *(a7 + 2); data52 = *(a7 + 3); data57 = *(a8 + 0); data58 = *(a8 + 1); data59 = *(a8 + 2); data60 = *(a8 + 3); *(b + 0) = data01; *(b + 1) = data09; *(b + 2) = data17; *(b + 3) = data25; *(b + 4) = data33; *(b + 5) = data41; *(b + 6) = data49; *(b + 7) = data57; *(b + 8) = data02; *(b + 9) = data10; *(b + 10) = data18; *(b + 11) = data26; *(b + 12) = data34; *(b + 13) = data42; *(b + 14) = data50; *(b + 15) = data58; *(b + 16) = data03; *(b + 17) = data11; *(b + 18) = data19; *(b + 19) = data27; *(b + 20) = data35; *(b + 21) = data43; *(b + 22) = data51; *(b + 23) = data59; *(b + 24) = data04; *(b + 25) = data12; *(b + 26) = data20; *(b + 27) = data28; *(b + 28) = data36; *(b + 29) = data44; *(b + 30) = data52; *(b + 31) = data60; } a1 += 4; a2 += 4; a3 += 4; a4 += 4; a5 += 4; a6 += 4; a7 += 4; a8 += 4; b += 32; ii += 4; } if (m & 2) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data09 = *(a2 + 0); #ifndef UNIT data10 = *(a2 + 1); #endif data17 = *(a3 + 0); data18 = *(a3 + 1); data25 = *(a4 + 0); data26 = *(a4 + 1); data33 = *(a5 + 0); data34 = *(a5 + 1); data41 = *(a6 + 0); data42 = *(a6 + 1); data49 = *(a7 + 0); data50 = *(a7 + 1); data57 = *(a8 + 0); data58 = *(a8 + 1); *(b + 0) = INV(data01); *(b + 1) = data09; *(b + 2) = data17; *(b + 3) = data25; *(b + 4) = data33; *(b + 5) = data41; *(b + 6) = data49; *(b + 7) = data57; *(b + 9) = INV(data10); *(b + 10) = data18; *(b + 11) = data26; *(b + 12) = data34; *(b + 13) = data42; *(b + 14) = data50; *(b + 15) = data58; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data09 = *(a2 + 0); data10 = *(a2 + 1); data17 = *(a3 + 0); data18 = *(a3 + 1); data25 = *(a4 + 0); data26 = *(a4 + 1); data33 = *(a5 + 0); data34 = *(a5 + 1); data41 = *(a6 + 0); data42 = *(a6 + 1); data49 = *(a7 + 0); data50 = *(a7 + 1); data57 = *(a8 + 0); data58 = *(a8 + 1); *(b + 0) = data01; *(b + 1) = data09; *(b + 2) = data17; *(b + 3) = data25; *(b + 4) = data33; *(b + 5) = data41; *(b + 6) = data49; *(b + 7) = data57; *(b + 8) = data02; *(b + 9) = data10; *(b + 10) = data18; *(b + 11) = data26; *(b + 12) = data34; *(b + 13) = data42; *(b + 14) = data50; *(b + 15) = data58; } a1 += 2; a2 += 2; a3 += 2; a4 += 2; a5 += 2; a6 += 2; a7 += 2; a8 += 2; b += 16; ii += 2; } if (m & 1) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data09 = *(a2 + 0); data17 = *(a3 + 0); data25 = *(a4 + 0); data33 = *(a5 + 0); data41 = *(a6 + 0); data49 = *(a7 + 0); data57 = *(a8 + 0); *(b + 0) = INV(data01); *(b + 1) = data09; *(b + 2) = data17; *(b + 3) = data25; *(b + 4) = data33; *(b + 5) = data41; *(b + 6) = data49; *(b + 7) = data57; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data09 = *(a2 + 0); data10 = *(a2 + 1); data17 = *(a3 + 0); data18 = *(a3 + 1); data25 = *(a4 + 0); data26 = *(a4 + 1); *(b + 0) = data01; *(b + 1) = data09; *(b + 2) = data17; *(b + 3) = data25; *(b + 4) = data33; *(b + 5) = data41; *(b + 6) = data49; *(b + 7) = data57; } b += 8; ii += 1; } a += 8 * lda; jj += 8; j --; } if (n & 4) { a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; ii = 0; i = (m >> 2); while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data09 = *(a2 + 0); #ifndef UNIT data10 = *(a2 + 1); #endif data17 = *(a3 + 0); data18 = *(a3 + 1); #ifndef UNIT data19 = *(a3 + 2); #endif data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); #ifndef UNIT data28 = *(a4 + 3); #endif *(b + 0) = INV(data01); *(b + 1) = data09; *(b + 2) = data17; *(b + 3) = data25; *(b + 5) = INV(data10); *(b + 6) = 
data18; *(b + 7) = data26; *(b + 10) = INV(data19); *(b + 11) = data27; *(b + 15) = INV(data28); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data17 = *(a3 + 0); data18 = *(a3 + 1); data19 = *(a3 + 2); data20 = *(a3 + 3); data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); data28 = *(a4 + 3); *(b + 0) = data01; *(b + 1) = data09; *(b + 2) = data17; *(b + 3) = data25; *(b + 4) = data02; *(b + 5) = data10; *(b + 6) = data18; *(b + 7) = data26; *(b + 8) = data03; *(b + 9) = data11; *(b + 10) = data19; *(b + 11) = data27; *(b + 12) = data04; *(b + 13) = data12; *(b + 14) = data20; *(b + 15) = data28; } a1 += 4; a2 += 4; a3 += 4; a4 += 4; b += 16; i --; ii += 4; } if (m & 2) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data09 = *(a2 + 0); #ifndef UNIT data10 = *(a2 + 1); #endif data17 = *(a3 + 0); data18 = *(a3 + 1); data25 = *(a4 + 0); data26 = *(a4 + 1); *(b + 0) = INV(data01); *(b + 1) = data09; *(b + 2) = data17; *(b + 3) = data25; *(b + 5) = INV(data10); *(b + 6) = data18; *(b + 7) = data26; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data09 = *(a2 + 0); data10 = *(a2 + 1); data17 = *(a3 + 0); data18 = *(a3 + 1); data25 = *(a4 + 0); data26 = *(a4 + 1); *(b + 0) = data01; *(b + 1) = data09; *(b + 2) = data17; *(b + 3) = data25; *(b + 4) = data02; *(b + 5) = data10; *(b + 6) = data18; *(b + 7) = data26; } a1 += 2; a2 += 2; a3 += 2; a4 += 2; b += 8; ii += 2; } if (m & 1) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data09 = *(a2 + 0); data17 = *(a3 + 0); data25 = *(a4 + 0); *(b + 0) = INV(data01); *(b + 1) = data09; *(b + 2) = data17; *(b + 3) = data25; } if (ii < jj) { data01 = *(a1 + 0); data09 = *(a2 + 0); data17 = *(a3 + 0); data25 = *(a4 + 0); *(b + 0) = data01; *(b + 1) = data09; *(b + 2) = data17; *(b + 3) = data25; } b += 4; ii += 1; } a += 4 * lda; jj += 4; } if (n & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; ii = 0; i = (m >> 1); while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data09 = *(a2 + 0); #ifndef UNIT data10 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 1) = data09; *(b + 3) = INV(data10); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data09 = *(a2 + 0); data10 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data09; *(b + 2) = data02; *(b + 3) = data10; } a1 += 2; a2 += 2; b += 4; i --; ii += 2; } if (m & 1) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data09 = *(a2 + 0); *(b + 0) = INV(data01); *(b + 1) = data09; } if (ii < jj) { data01 = *(a1 + 0); data09 = *(a2 + 0); *(b + 0) = data01; *(b + 1) = data09; } b += 2; ii += 1; } a += 2 * lda; jj += 2; } if (n & 1) { a1 = a + 0 * lda; ii = 0; i = m; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii < jj) { data01 = *(a1 + 0); *(b + 0) = data01; } a1 += 1; b += 1; i --; ii ++; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_utcopy_1.c000066400000000000000000000063531313527062700210230ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; #ifndef UNIT FLOAT data01; #endif FLOAT *a1; jj = offset; j = n; while (j > 0){ a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) *(b + 0) = *(a1 + 0); a1 += lda; b ++; i --; ii ++; } a ++; jj ++; j --; } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_utcopy_16.c000066400000000000000000000126071313527062700211100ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj, k; FLOAT *a1; jj = offset; j = (n >> 4); while (j > 0){ a1 = a; a += 16; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 16)) { for (k = 0; k < ii - jj; k ++) { *(b + k) = *(a1 + k); } *(b + ii - jj) = INV(*(a1 + ii - jj)); } if (ii - jj >= 16) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); *(b + 2) = *(a1 + 2); *(b + 3) = *(a1 + 3); *(b + 4) = *(a1 + 4); *(b + 5) = *(a1 + 5); *(b + 6) = *(a1 + 6); *(b + 7) = *(a1 + 7); *(b + 8) = *(a1 + 8); *(b + 9) = *(a1 + 9); *(b + 10) = *(a1 + 10); *(b + 11) = *(a1 + 11); *(b + 12) = *(a1 + 12); *(b + 13) = *(a1 + 13); *(b + 14) = *(a1 + 14); *(b + 15) = *(a1 + 15); } b += 16; a1 += lda; ii ++; } jj += 16; j --; } j = (n & 8); if (j > 0) { a1 = a; a += 8; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 8)) { for (k = 0; k < ii - jj; k ++) { *(b + k) = *(a1 + k); } *(b + ii - jj) = INV(*(a1 + ii - jj)); } if (ii - jj >= 8) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); *(b + 2) = *(a1 + 2); *(b + 3) = *(a1 + 3); *(b + 4) = *(a1 + 4); *(b + 5) = *(a1 + 5); *(b + 6) = *(a1 + 6); *(b + 7) = *(a1 + 7); } b += 8; a1 += lda; ii ++; } jj += 8; } j = (n & 4); if (j > 0) { a1 = a; a += 4; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 4)) { for (k = 0; k < ii - jj; k ++) { *(b + k) = *(a1 + k); } *(b + ii - jj) = INV(*(a1 + ii - jj)); } if (ii - jj >= 4) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); *(b + 2) = *(a1 + 2); *(b + 3) = *(a1 + 3); } b += 4; a1 += lda; ii ++; } jj += 4; } j = (n & 2); if (j > 0) { a1 = a; a += 2; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 2)) { for (k = 0; k < ii - jj; k ++) { *(b + k) = *(a1 + k); } *(b + ii - jj) = INV(*(a1 + ii - jj)); } if (ii - jj >= 2) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); } b += 2; a1 += lda; ii ++; } jj += 2; } j = (n & 1); if (j > 0) { a1 = a; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 1)) { for (k = 0; k < ii - jj; k ++) { *(b + k) = *(a1 + k); } *(b + ii - jj) = INV(*(a1 + ii - jj)); } if (ii - jj >= 1) { *(b + 0) = *(a1 + 0); } b += 1; a1 += lda; ii ++; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_utcopy_2.c000066400000000000000000000102171313527062700210160ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04; FLOAT *a1, *a2; jj = offset; j = (n >> 1); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; i = (m >> 1); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data03 = *(a2 + 0); #ifndef UNIT data04 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 2) = data03; *(b + 3) = INV(data04); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a2 + 0); data04 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } a1 += 2 * lda; a2 += 2 * lda; b += 4; i --; ii += 2; } if ((m & 1) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); *(b + 0) = data01; *(b + 1) = data02; } b += 2; } a += 2; jj += 2; j --; } if (n & 1) { a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); *(b + 0) = data01; } a1 += 1 * lda; b += 1; i --; ii += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_utcopy_4.c000066400000000000000000000154541313527062700210300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT *a1, *a2, *a3, *a4; jj = offset; j = (n >> 2); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; i = (m >> 2); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data05 = *(a2 + 0); #ifndef UNIT data06 = *(a2 + 1); #endif data09 = *(a3 + 0); data10 = *(a3 + 1); #ifndef UNIT data11 = *(a3 + 2); #endif data13 = *(a4 + 0); data14 = *(a4 + 1); data15 = *(a4 + 2); #ifndef UNIT data16 = *(a4 + 3); #endif *(b + 0) = INV(data01); *(b + 4) = data05; *(b + 5) = INV(data06); *(b + 8) = data09; *(b + 9) = data10; *(b + 10) = INV(data11); *(b + 12) = data13; *(b + 13) = data14; *(b + 14) = data15; *(b + 15) = INV(data16); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a2 + 0); data06 = *(a2 + 1); data07 = *(a2 + 2); data08 = *(a2 + 3); data09 = *(a3 + 0); data10 = *(a3 + 1); data11 = *(a3 + 2); data12 = *(a3 + 3); data13 = *(a4 + 0); data14 = *(a4 + 1); data15 = *(a4 + 2); data16 = *(a4 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; *(b + 8) = data09; *(b + 9) = data10; *(b + 10) = data11; *(b + 11) = data12; *(b + 12) = data13; *(b + 13) = data14; *(b + 14) = data15; *(b + 15) = data16; } a1 += 4 * lda; a2 += 4 * lda; a3 += 4 * lda; a4 += 4 * lda; b += 16; i --; ii += 4; } if ((m & 2) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data05 = *(a2 + 0); #ifndef UNIT data06 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 4) = data05; *(b + 5) = INV(data06); } if (ii > jj) 
{ data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a2 + 0); data06 = *(a2 + 1); data07 = *(a2 + 2); data08 = *(a2 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; } a1 += 2 * lda; a2 += 2 * lda; b += 8; ii += 2; } if ((m & 1) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } b += 4; } a += 4; jj += 4; j --; } if (n & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; i = (m >> 1); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data03 = *(a2 + 0); #ifndef UNIT data04 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 2) = data03; *(b + 3) = INV(data04); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a2 + 0); data04 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } a1 += 2 * lda; a2 += 2 * lda; b += 4; i --; ii += 2; } if ((m & 1) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); *(b + 0) = data01; *(b + 1) = data02; } b += 2; } a += 2; jj += 2; } if (n & 1) { a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); *(b + 0) = data01; } a1 += 1 * lda; b += 1; i --; ii += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_utcopy_6.c000066400000000000000000000154541313527062700210320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT *a1, *a2, *a3, *a4; jj = offset; j = (n >> 2); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; i = (m >> 2); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data05 = *(a2 + 0); #ifndef UNIT data06 = *(a2 + 1); #endif data09 = *(a3 + 0); data10 = *(a3 + 1); #ifndef UNIT data11 = *(a3 + 2); #endif data13 = *(a4 + 0); data14 = *(a4 + 1); data15 = *(a4 + 2); #ifndef UNIT data16 = *(a4 + 3); #endif *(b + 0) = INV(data01); *(b + 4) = data05; *(b + 5) = INV(data06); *(b + 8) = data09; *(b + 9) = data10; *(b + 10) = INV(data11); *(b + 12) = data13; *(b + 13) = data14; *(b + 14) = data15; *(b + 15) = INV(data16); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a2 + 0); data06 = *(a2 + 1); data07 = *(a2 + 2); data08 = *(a2 + 3); data09 = *(a3 + 0); data10 = *(a3 + 1); data11 = *(a3 + 2); data12 = *(a3 + 3); data13 = *(a4 + 0); data14 = *(a4 + 1); data15 = *(a4 + 2); data16 = *(a4 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; *(b + 8) = data09; *(b + 9) = data10; *(b + 10) = data11; *(b + 11) = data12; *(b + 12) = data13; *(b + 13) = data14; *(b + 14) = data15; *(b + 15) = data16; } a1 += 4 * lda; a2 += 4 * lda; a3 += 4 * lda; a4 += 4 * lda; b += 16; i --; ii += 4; } if ((m & 2) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data05 = *(a2 + 0); #ifndef UNIT data06 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 4) = data05; *(b + 5) = INV(data06); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a2 + 0); data06 = *(a2 + 1); data07 = *(a2 + 2); data08 = *(a2 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; } a1 += 2 * lda; a2 += 2 * lda; b += 8; ii += 2; } if ((m & 1) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } b += 4; } a += 4; jj += 4; j --; } if (n & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; i = (m >> 1); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data03 = *(a2 + 0); #ifndef UNIT data04 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 2) = data03; *(b + 3) = INV(data04); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a2 + 0); data04 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } a1 += 2 * lda; a2 += 2 * lda; b += 4; i --; ii += 2; } if ((m & 1) != 0) { if (ii== jj) { #ifndef UNIT data01 
= *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); *(b + 0) = data01; *(b + 1) = data02; } b += 2; } a += 2; jj += 2; } if (n & 1) { a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); *(b + 0) = data01; } a1 += 1 * lda; b += 1; i --; ii += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/trsm_utcopy_8.c000066400000000000000000000367761313527062700210460ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef UNIT #define INV(a) (ONE / (a)) #else #define INV(a) (ONE) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT data17, data18, data19, data20, data21, data22, data23, data24; FLOAT data25, data26, data27, data28, data29, data30, data31, data32; FLOAT data33, data34, data35, data36, data37, data38, data39, data40; FLOAT data41, data42, data43, data44, data45, data46, data47, data48; FLOAT data49, data50, data51, data52, data53, data54, data55, data56; FLOAT data57, data58, data59, data60, data61, data62, data63, data64; FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; jj = offset; j = (n >> 3); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; a5 = a + 4 * lda; a6 = a + 5 * lda; a7 = a + 6 * lda; a8 = a + 7 * lda; i = (m >> 3); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data09 = *(a2 + 0); #ifndef UNIT data10 = *(a2 + 1); #endif data17 = *(a3 + 0); data18 = *(a3 + 1); #ifndef UNIT data19 = *(a3 + 2); #endif data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); #ifndef UNIT data28 = *(a4 + 3); #endif data33 = *(a5 + 0); data34 = *(a5 + 1); data35 = *(a5 + 2); data36 = *(a5 + 3); #ifndef UNIT data37 = *(a5 + 4); #endif data41 = *(a6 + 0); data42 = *(a6 + 1); data43 = *(a6 + 2); data44 = *(a6 + 3); data45 = *(a6 + 4); #ifndef UNIT data46 = *(a6 + 5); #endif data49 = *(a7 + 0); data50 = *(a7 + 1); data51 = *(a7 + 2); data52 = *(a7 + 3); data53 = *(a7 + 4); data54 = *(a7 + 5); #ifndef UNIT data55 = *(a7 + 6); #endif data57 = *(a8 + 0); data58 = *(a8 + 1); data59 = *(a8 + 2); data60 = *(a8 + 3); data61 = *(a8 + 4); data62 = *(a8 + 5); data63 = *(a8 + 6); #ifndef UNIT data64 = *(a8 + 7); #endif *(b + 0) = INV(data01); *(b + 8) = data09; *(b + 9) = INV(data10); *(b + 16) = data17; *(b + 17) = data18; *(b + 18) = INV(data19); *(b + 24) = data25; *(b + 25) = data26; *(b + 26) = data27; *(b + 27) = INV(data28); *(b + 32) = data33; *(b + 33) = data34; *(b + 34) = data35; *(b + 35) = data36; *(b + 36) = INV(data37); *(b + 40) = data41; *(b + 41) = data42; *(b + 42) = data43; *(b + 43) = data44; *(b + 44) = data45; *(b + 45) = INV(data46); *(b + 48) = data49; *(b + 49) = data50; *(b + 50) = data51; *(b + 51) = data52; *(b + 52) = data53; *(b + 53) = data54; *(b + 54) = INV(data55); *(b + 56) = data57; *(b + 57) = data58; *(b + 58) = data59; *(b + 59) = data60; *(b + 60) = data61; *(b + 61) = data62; *(b + 62) = data63; *(b + 63) = INV(data64); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data13 = *(a2 + 4); data14 = *(a2 + 5); data15 = *(a2 + 6); data16 = *(a2 + 7); data17 = *(a3 + 0); data18 = *(a3 + 1); data19 = *(a3 + 2); data20 = *(a3 + 3); data21 = *(a3 + 4); data22 = *(a3 + 5); data23 = *(a3 + 6); data24 = *(a3 + 7); data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); data28 = *(a4 + 3); data29 = *(a4 + 4); data30 = *(a4 + 5); data31 = *(a4 + 6); data32 = *(a4 + 7); data33 = *(a5 + 0); data34 = *(a5 + 1); data35 = *(a5 + 2); data36 = *(a5 + 3); data37 = *(a5 + 4); data38 = *(a5 + 5); data39 
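/* ii > jj: this 8x8 block is away from the diagonal, so all 64 elements are
   copied through unchanged; INV is only applied to diagonal elements in the
   ii == jj case above. */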
= *(a5 + 6); data40 = *(a5 + 7); data41 = *(a6 + 0); data42 = *(a6 + 1); data43 = *(a6 + 2); data44 = *(a6 + 3); data45 = *(a6 + 4); data46 = *(a6 + 5); data47 = *(a6 + 6); data48 = *(a6 + 7); data49 = *(a7 + 0); data50 = *(a7 + 1); data51 = *(a7 + 2); data52 = *(a7 + 3); data53 = *(a7 + 4); data54 = *(a7 + 5); data55 = *(a7 + 6); data56 = *(a7 + 7); data57 = *(a8 + 0); data58 = *(a8 + 1); data59 = *(a8 + 2); data60 = *(a8 + 3); data61 = *(a8 + 4); data62 = *(a8 + 5); data63 = *(a8 + 6); data64 = *(a8 + 7); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; *(b + 8) = data09; *(b + 9) = data10; *(b + 10) = data11; *(b + 11) = data12; *(b + 12) = data13; *(b + 13) = data14; *(b + 14) = data15; *(b + 15) = data16; *(b + 16) = data17; *(b + 17) = data18; *(b + 18) = data19; *(b + 19) = data20; *(b + 20) = data21; *(b + 21) = data22; *(b + 22) = data23; *(b + 23) = data24; *(b + 24) = data25; *(b + 25) = data26; *(b + 26) = data27; *(b + 27) = data28; *(b + 28) = data29; *(b + 29) = data30; *(b + 30) = data31; *(b + 31) = data32; *(b + 32) = data33; *(b + 33) = data34; *(b + 34) = data35; *(b + 35) = data36; *(b + 36) = data37; *(b + 37) = data38; *(b + 38) = data39; *(b + 39) = data40; *(b + 40) = data41; *(b + 41) = data42; *(b + 42) = data43; *(b + 43) = data44; *(b + 44) = data45; *(b + 45) = data46; *(b + 46) = data47; *(b + 47) = data48; *(b + 48) = data49; *(b + 49) = data50; *(b + 50) = data51; *(b + 51) = data52; *(b + 52) = data53; *(b + 53) = data54; *(b + 54) = data55; *(b + 55) = data56; *(b + 56) = data57; *(b + 57) = data58; *(b + 58) = data59; *(b + 59) = data60; *(b + 60) = data61; *(b + 61) = data62; *(b + 62) = data63; *(b + 63) = data64; } a1 += 8 * lda; a2 += 8 * lda; a3 += 8 * lda; a4 += 8 * lda; a5 += 8 * lda; a6 += 8 * lda; a7 += 8 * lda; a8 += 8 * lda; b += 64; i --; ii += 8; } if (m & 4) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data09 = *(a2 + 0); #ifndef UNIT data10 = *(a2 + 1); #endif data17 = *(a3 + 0); data18 = *(a3 + 1); #ifndef UNIT data19 = *(a3 + 2); #endif data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); #ifndef UNIT data28 = *(a4 + 3); #endif *(b + 0) = INV(data01); *(b + 8) = data09; *(b + 9) = INV(data10); *(b + 16) = data17; *(b + 17) = data18; *(b + 18) = INV(data19); *(b + 24) = data25; *(b + 25) = data26; *(b + 26) = data27; *(b + 27) = INV(data28); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data13 = *(a2 + 4); data14 = *(a2 + 5); data15 = *(a2 + 6); data16 = *(a2 + 7); data17 = *(a3 + 0); data18 = *(a3 + 1); data19 = *(a3 + 2); data20 = *(a3 + 3); data21 = *(a3 + 4); data22 = *(a3 + 5); data23 = *(a3 + 6); data24 = *(a3 + 7); data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); data28 = *(a4 + 3); data29 = *(a4 + 4); data30 = *(a4 + 5); data31 = *(a4 + 6); data32 = *(a4 + 7); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; *(b + 8) = data09; *(b + 9) = data10; *(b + 10) = data11; *(b + 11) = data12; *(b + 12) = data13; *(b + 13) = data14; *(b + 14) = data15; *(b + 15) = data16; *(b + 16) = data17; *(b + 17) = data18; *(b + 18) = data19; *(b + 19) = data20; *(b + 20) = data21; *(b + 21) = data22; *(b + 22) 
= data23; *(b + 23) = data24; *(b + 24) = data25; *(b + 25) = data26; *(b + 26) = data27; *(b + 27) = data28; *(b + 28) = data29; *(b + 29) = data30; *(b + 30) = data31; *(b + 31) = data32; } a1 += 4 * lda; a2 += 4 * lda; a3 += 4 * lda; a4 += 4 * lda; b += 32; ii += 4; } if (m & 2) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data09 = *(a2 + 0); #ifndef UNIT data10 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 8) = data09; *(b + 9) = INV(data10); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data13 = *(a2 + 4); data14 = *(a2 + 5); data15 = *(a2 + 6); data16 = *(a2 + 7); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; *(b + 8) = data09; *(b + 9) = data10; *(b + 10) = data11; *(b + 11) = data12; *(b + 12) = data13; *(b + 13) = data14; *(b + 14) = data15; *(b + 15) = data16; } a1 += 2 * lda; a2 += 2 * lda; b += 16; ii += 2; } if (m & 1) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; } b += 8; } a += 8; jj += 8; j --; } if (n & 4) { a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; i = (m >> 2); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data09 = *(a2 + 0); #ifndef UNIT data10 = *(a2 + 1); #endif data17 = *(a3 + 0); data18 = *(a3 + 1); #ifndef UNIT data19 = *(a3 + 2); #endif data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); #ifndef UNIT data28 = *(a4 + 3); #endif *(b + 0) = INV(data01); *(b + 4) = data09; *(b + 5) = INV(data10); *(b + 8) = data17; *(b + 9) = data18; *(b + 10) = INV(data19); *(b + 12) = data25; *(b + 13) = data26; *(b + 14) = data27; *(b + 15) = INV(data28); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data17 = *(a3 + 0); data18 = *(a3 + 1); data19 = *(a3 + 2); data20 = *(a3 + 3); data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); data28 = *(a4 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data09; *(b + 5) = data10; *(b + 6) = data11; *(b + 7) = data12; *(b + 8) = data17; *(b + 9) = data18; *(b + 10) = data19; *(b + 11) = data20; *(b + 12) = data25; *(b + 13) = data26; *(b + 14) = data27; *(b + 15) = data28; } a1 += 4 * lda; a2 += 4 * lda; a3 += 4 * lda; a4 += 4 * lda; b += 16; i --; ii += 4; } if (m & 2) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data09 = *(a2 + 0); #ifndef UNIT data10 = *(a2 + 1); #endif } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data09; *(b + 5) = data10; *(b + 6) = data11; *(b + 7) = data12; } a1 += 2 * lda; a2 += 2 * lda; b += 8; ii += 2; } if (m & 1) { if (ii == jj) { #ifndef UNIT data01 = 
*(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } b += 4; } a += 4; jj += 4; } if (n & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; i = (m >> 1); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif data09 = *(a2 + 0); #ifndef UNIT data10 = *(a2 + 1); #endif *(b + 0) = INV(data01); *(b + 2) = data09; *(b + 3) = INV(data10); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data09 = *(a2 + 0); data10 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data09; *(b + 3) = data10; } a1 += 2 * lda; a2 += 2 * lda; b += 4; i --; ii += 2; } if (m & 1) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); *(b + 0) = data01; *(b + 1) = data02; } b += 2; } a += 2; jj += 2; } if (n & 1) { a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); #endif *(b + 0) = INV(data01); } if (ii > jj) { data01 = *(a1 + 0); *(b + 0) = data01; } a1 += lda; b += 1; i --; ii += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zgeadd.c000066400000000000000000000043551313527062700174510ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alphar, FLOAT alphai, FLOAT *a, BLASLONG lda, FLOAT betar, FLOAT betai , FLOAT *b, BLASLONG ldb) { BLASLONG i; FLOAT *aptr,*bptr; if ( rows <= 0 ) return(0); if ( cols <= 0 ) return(0); aptr = a; bptr = b; lda *= 2; ldb *= 2; if ( alphar == 0.0 && alphai == 0.0 ) { for ( i=0; i #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i; FLOAT *a_offset, a1, a2; lda *= 2; while (n > 0) { a_offset = a; a += lda; for (i = 0; i < m; i ++) { a1 = *(a_offset + 0); a2 = *(a_offset + 1); *(b + 0) = CMULT(a1, a2); a_offset += 2; b ++; } n --; } return 0; } OpenBLAS-0.2.20/kernel/generic/zgemm3m_ncopy_2.c000066400000000000000000000077671313527062700212350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i, j; FLOAT *a_offset, *a_offset1, *a_offset2; FLOAT *b_offset; FLOAT a1, a2, a3, a4; lda *= 2; a_offset = a; b_offset = b; j = (n >> 1); if (j > 0){ do{ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; for (i = 0; i < m; i ++) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset2 + 0); a4 = *(a_offset2 + 1); *(b_offset + 0) = CMULT(a1, a2); *(b_offset + 1) = CMULT(a3, a4); a_offset1 += 2; a_offset2 += 2; b_offset += 2; } j--; }while(j > 0); } /* end of if(j > 0) */ if (n & 1) { a_offset1 = a_offset; for (i = 0; i < m; i ++) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); *(b_offset + 0) = CMULT(a1, a2); a_offset1 += 2; b_offset += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zgemm3m_ncopy_4.c000066400000000000000000000112411313527062700212150ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i, j; FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; FLOAT *b_offset; FLOAT a1, a2, a3, a4, a5, a6, a7, a8; lda *= 2; a_offset = a; b_offset = b; j = (n >> 2); if (j > 0){ do{ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset3 = a_offset2 + lda; a_offset4 = a_offset3 + lda; a_offset += 4 * lda; for (i = 0; i < m; i ++) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset2 + 0); a4 = *(a_offset2 + 1); a5 = *(a_offset3 + 0); a6 = *(a_offset3 + 1); a7 = *(a_offset4 + 0); a8 = *(a_offset4 + 1); *(b_offset + 0) = CMULT(a1, a2); *(b_offset + 1) = CMULT(a3, a4); *(b_offset + 2) = CMULT(a5, a6); *(b_offset + 3) = CMULT(a7, a8); a_offset1 += 2; a_offset2 += 2; a_offset3 += 2; a_offset4 += 2; b_offset += 4; } j--; }while(j > 0); } /* end of if(j > 0) */ if (n & 2) { a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; for (i = 0; i < m; i ++) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset2 + 0); a4 = *(a_offset2 + 1); *(b_offset + 0) = CMULT(a1, a2); *(b_offset + 1) = CMULT(a3, a4); a_offset1 += 2; a_offset2 += 2; b_offset += 2; } } if (n & 1) { a_offset1 = a_offset; for (i = 0; i < m; i ++) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); *(b_offset + 0) = CMULT(a1, a2); a_offset1 += 2; b_offset += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zgemm3m_ncopy_8.c000066400000000000000000000145351313527062700212320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i, j; FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8; FLOAT *b_offset; FLOAT a1, a2, a3, a4, a5, a6, a7, a8; FLOAT a9, a10, a11, a12, a13, a14, a15, a16; #if 0 #ifdef REAL_ONLY fprintf(stderr, "NON Real "); #elif defined(IMAGE_ONLY) fprintf(stderr, "NON Image "); #else fprintf(stderr, "NON Both "); #endif #ifdef ICOPY fprintf(stderr, " ICOPY %ld x %ld\n", m, n); #else fprintf(stderr, " OCOPY %ld x %ld\n", m, n); #endif #endif lda *= 2; a_offset = a; b_offset = b; j = (n >> 3); if (j > 0){ do{ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset3 = a_offset2 + lda; a_offset4 = a_offset3 + lda; a_offset5 = a_offset4 + lda; a_offset6 = a_offset5 + lda; a_offset7 = a_offset6 + lda; a_offset8 = a_offset7 + lda; a_offset += 8 * lda; for (i = 0; i < m; i ++) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset2 + 0); a4 = *(a_offset2 + 1); a5 = *(a_offset3 + 0); a6 = *(a_offset3 + 1); a7 = *(a_offset4 + 0); a8 = *(a_offset4 + 1); a9 = *(a_offset5 + 0); a10 = *(a_offset5 + 1); a11 = *(a_offset6 + 0); a12 = *(a_offset6 + 1); a13 = *(a_offset7 + 0); a14 = *(a_offset7 + 1); a15 = *(a_offset8 + 0); a16 = *(a_offset8 + 1); *(b_offset + 0) = CMULT(a1, a2); *(b_offset + 1) = CMULT(a3, a4); *(b_offset + 2) = CMULT(a5, a6); *(b_offset + 3) = CMULT(a7, a8); *(b_offset + 4) = CMULT(a9, a10); *(b_offset + 5) = CMULT(a11, a12); *(b_offset + 6) = CMULT(a13, a14); *(b_offset + 7) = CMULT(a15, a16); a_offset1 += 2; a_offset2 += 2; a_offset3 += 2; a_offset4 += 2; a_offset5 += 2; a_offset6 += 2; a_offset7 += 2; a_offset8 += 2; b_offset += 8; } j--; }while(j > 0); } if (n & 4){ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset3 = a_offset2 + lda; a_offset4 = a_offset3 + lda; a_offset += 4 * lda; for (i = 0; i < m; i ++) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset2 + 0); a4 = *(a_offset2 + 1); a5 = *(a_offset3 + 0); a6 = *(a_offset3 + 1); a7 = *(a_offset4 + 0); a8 = *(a_offset4 + 1); *(b_offset + 0) = CMULT(a1, a2); *(b_offset + 1) = CMULT(a3, a4); 
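/* GEMM3M packing: each complex entry (re, im) of A is collapsed to a single
   real value through CMULT before the three real GEMM passes of the 3M
   algorithm.  From the REAL_PART / IMAGE_PART / CMULT macros at the top of
   this file, CMULT(re, im) evaluates to
       re                              REAL_ONLY, no USE_ALPHA
       im                              IMAGE_ONLY, no USE_ALPHA
       re + im                         neither, no USE_ALPHA
       alpha_r * re - alpha_i * im     REAL_ONLY with USE_ALPHA
   The two stores below complete columns 3 and 4 of the current row of this
   4-column tail. */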
*(b_offset + 2) = CMULT(a5, a6); *(b_offset + 3) = CMULT(a7, a8); a_offset1 += 2; a_offset2 += 2; a_offset3 += 2; a_offset4 += 2; b_offset += 4; } } if (n & 2){ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; for (i = 0; i < m; i ++) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset2 + 0); a4 = *(a_offset2 + 1); *(b_offset + 0) = CMULT(a1, a2); *(b_offset + 1) = CMULT(a3, a4); a_offset1 += 2; a_offset2 += 2; b_offset += 2; } } if (n & 1){ a_offset1 = a_offset; for (i = 0; i < m; i ++) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); *(b_offset + 0) = CMULT(a1, a2); a_offset1 += 2; b_offset += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zgemm3m_tcopy_1.c000066400000000000000000000067211313527062700212270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i; FLOAT *a_offset, a1, a2; lda *= 2; while (n > 0) { a_offset = a; a += 2; for (i = 0; i < m; i ++) { a1 = *(a_offset + 0); a2 = *(a_offset + 1); *(b + 0) = CMULT(a1, a2); a_offset += lda; b ++; } n --; } return 0; } OpenBLAS-0.2.20/kernel/generic/zgemm3m_tcopy_2.c000066400000000000000000000114511313527062700212240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i, j; FLOAT *a_offset, *a_offset1, *a_offset2; FLOAT *b_offset, *b_offset1, *b_offset2; FLOAT a1, a2, a3, a4, a5, a6, a7, a8; a_offset = a; b_offset = b; lda *= 2; b_offset2 = b + m * (n & ~1); j = (m >> 1); if (j > 0){ do{ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; b_offset1 = b_offset; b_offset += 4; i = (n >> 1); if (i > 0){ do{ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); a5 = *(a_offset2 + 0); a6 = *(a_offset2 + 1); a7 = *(a_offset2 + 2); a8 = *(a_offset2 + 3); *(b_offset1 + 0) = CMULT(a1, a2); *(b_offset1 + 1) = CMULT(a3, a4); *(b_offset1 + 2) = CMULT(a5, a6); *(b_offset1 + 3) = CMULT(a7, a8); a_offset1 += 4; a_offset2 += 4; b_offset1 += m * 2; i --; }while(i > 0); } if (n & 1) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset2 + 0); a4 = *(a_offset2 + 1); *(b_offset2 + 0) = CMULT(a1, a2); *(b_offset2 + 1) = CMULT(a3, a4); b_offset2 += 2; } j--; }while(j > 0); } if (m & 1){ a_offset1 = a_offset; b_offset1 = b_offset; i = (n >> 1); if (i > 0){ do{ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); *(b_offset1 + 0) = CMULT(a1, a2); *(b_offset1 + 1) = CMULT(a3, a4); a_offset1 += 4; b_offset1 += 2 * m; i --; }while(i > 0); } if (n & 1) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); *(b_offset2 + 0) = CMULT(a1, a2); } } return 0; } OpenBLAS-0.2.20/kernel/generic/zgemm3m_tcopy_4.c000066400000000000000000000220351313527062700212260ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i, j; FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; FLOAT a1, a2, a3, a4, a5, a6, a7, a8; a_offset = a; b_offset = b; lda *= 2; b_offset2 = b + m * (n & ~3); b_offset3 = b + m * (n & ~1); j = (m >> 2); if (j > 0){ do{ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset3 = a_offset2 + lda; a_offset4 = a_offset3 + lda; a_offset += 4 * lda; b_offset1 = b_offset; b_offset += 16; i = (n >> 2); if (i > 0){ do{ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); a5 = *(a_offset1 + 4); a6 = *(a_offset1 + 5); a7 = *(a_offset1 + 6); a8 = *(a_offset1 + 7); *(b_offset1 + 0) = CMULT(a1, a2); *(b_offset1 + 1) = CMULT(a3, a4); *(b_offset1 + 2) = CMULT(a5, a6); *(b_offset1 + 3) = CMULT(a7, a8); a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); a3 = *(a_offset2 + 2); a4 = *(a_offset2 + 3); a5 = *(a_offset2 + 4); a6 = *(a_offset2 + 5); a7 = *(a_offset2 + 6); a8 = *(a_offset2 + 7); *(b_offset1 + 4) = CMULT(a1, a2); *(b_offset1 + 5) = CMULT(a3, a4); *(b_offset1 + 6) = CMULT(a5, a6); *(b_offset1 + 7) = CMULT(a7, a8); a1 = *(a_offset3 + 0); a2 = *(a_offset3 + 1); a3 = *(a_offset3 + 2); a4 = *(a_offset3 + 3); a5 = *(a_offset3 + 4); a6 = *(a_offset3 + 5); a7 = *(a_offset3 + 6); a8 = *(a_offset3 + 7); *(b_offset1 + 8) = CMULT(a1, a2); *(b_offset1 + 9) = CMULT(a3, a4); *(b_offset1 + 10) = CMULT(a5, a6); *(b_offset1 + 11) = CMULT(a7, a8); a1 = *(a_offset4 + 0); a2 = *(a_offset4 + 1); a3 = *(a_offset4 + 2); a4 = *(a_offset4 + 3); a5 = *(a_offset4 + 4); a6 = *(a_offset4 + 5); a7 = *(a_offset4 + 6); a8 = *(a_offset4 + 7); *(b_offset1 + 12) = CMULT(a1, a2); *(b_offset1 + 13) = CMULT(a3, a4); *(b_offset1 + 14) = CMULT(a5, a6); *(b_offset1 + 15) = CMULT(a7, a8); a_offset1 += 8; a_offset2 += 8; a_offset3 += 8; a_offset4 += 8; b_offset1 += m * 4; i --; }while(i > 0); } if (n & 2) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); a5 = *(a_offset2 + 0); a6 = *(a_offset2 + 1); a7 = *(a_offset2 + 2); a8 = *(a_offset2 + 3); 
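/* n % 4 == 2 tail of this 4-row panel: the two leftover columns are packed
   through CMULT into the b_offset2 region that follows all full 4-column
   panels (b_offset2 = b + m * (n & ~3)); the stores below handle rows 1 and
   2, then rows 3 and 4. */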
*(b_offset2 + 0) = CMULT(a1, a2); *(b_offset2 + 1) = CMULT(a3, a4); *(b_offset2 + 2) = CMULT(a5, a6); *(b_offset2 + 3) = CMULT(a7, a8); a1 = *(a_offset3 + 0); a2 = *(a_offset3 + 1); a3 = *(a_offset3 + 2); a4 = *(a_offset3 + 3); a5 = *(a_offset4 + 0); a6 = *(a_offset4 + 1); a7 = *(a_offset4 + 2); a8 = *(a_offset4 + 3); *(b_offset2 + 4) = CMULT(a1, a2); *(b_offset2 + 5) = CMULT(a3, a4); *(b_offset2 + 6) = CMULT(a5, a6); *(b_offset2 + 7) = CMULT(a7, a8); a_offset1 += 4; a_offset2 += 4; a_offset3 += 4; a_offset4 += 4; b_offset2 += 8; } if (n & 1) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset2 + 0); a4 = *(a_offset2 + 1); a5 = *(a_offset3 + 0); a6 = *(a_offset3 + 1); a7 = *(a_offset4 + 0); a8 = *(a_offset4 + 1); *(b_offset3 + 0) = CMULT(a1, a2); *(b_offset3 + 1) = CMULT(a3, a4); *(b_offset3 + 2) = CMULT(a5, a6); *(b_offset3 + 3) = CMULT(a7, a8); b_offset3 += 4; } j--; }while(j > 0); } if (m & 2){ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; b_offset1 = b_offset; b_offset += 8; i = (n >> 2); if (i > 0){ do{ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); a5 = *(a_offset1 + 4); a6 = *(a_offset1 + 5); a7 = *(a_offset1 + 6); a8 = *(a_offset1 + 7); *(b_offset1 + 0) = CMULT(a1, a2); *(b_offset1 + 1) = CMULT(a3, a4); *(b_offset1 + 2) = CMULT(a5, a6); *(b_offset1 + 3) = CMULT(a7, a8); a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); a3 = *(a_offset2 + 2); a4 = *(a_offset2 + 3); a5 = *(a_offset2 + 4); a6 = *(a_offset2 + 5); a7 = *(a_offset2 + 6); a8 = *(a_offset2 + 7); *(b_offset1 + 4) = CMULT(a1, a2); *(b_offset1 + 5) = CMULT(a3, a4); *(b_offset1 + 6) = CMULT(a5, a6); *(b_offset1 + 7) = CMULT(a7, a8); a_offset1 += 8; a_offset2 += 8; b_offset1 += m * 4; i --; }while(i > 0); } if (n & 2) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); a5 = *(a_offset2 + 0); a6 = *(a_offset2 + 1); a7 = *(a_offset2 + 2); a8 = *(a_offset2 + 3); *(b_offset2 + 0) = CMULT(a1, a2); *(b_offset2 + 1) = CMULT(a3, a4); *(b_offset2 + 2) = CMULT(a5, a6); *(b_offset2 + 3) = CMULT(a7, a8); a_offset1 += 4; a_offset2 += 4; b_offset2 += 4; } if (n & 1) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset2 + 0); a4 = *(a_offset2 + 1); *(b_offset3 + 0) = CMULT(a1, a2); *(b_offset3 + 1) = CMULT(a3, a4); b_offset3 += 2; } } if (m & 1){ a_offset1 = a_offset; b_offset1 = b_offset; i = (n >> 2); if (i > 0){ do{ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); a5 = *(a_offset1 + 4); a6 = *(a_offset1 + 5); a7 = *(a_offset1 + 6); a8 = *(a_offset1 + 7); *(b_offset1 + 0) = CMULT(a1, a2); *(b_offset1 + 1) = CMULT(a3, a4); *(b_offset1 + 2) = CMULT(a5, a6); *(b_offset1 + 3) = CMULT(a7, a8); a_offset1 += 8; b_offset1 += 4 * m; i --; }while(i > 0); } if (n & 2) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); *(b_offset2 + 0) = CMULT(a1, a2); *(b_offset2 + 1) = CMULT(a3, a4); a_offset1 += 4; } if (n & 1) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); *(b_offset3 + 0) = CMULT(a1, a2); } } return 0; } OpenBLAS-0.2.20/kernel/generic/zgemm3m_tcopy_8.c000066400000000000000000000656711313527062700212470ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i, j; FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8; FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4; FLOAT a1, a2, a3, a4, a5, a6, a7, a8; FLOAT a9, a10, a11, a12, a13, a14, a15, a16; #if 0 #ifdef REAL_ONLY fprintf(stderr, "TNS Real "); #elif defined(IMAGE_ONLY) fprintf(stderr, "TNS Image "); #else fprintf(stderr, "TNS Both "); #endif #ifdef ICOPY fprintf(stderr, " ICOPY %ld x %ld\n", m, n); #else fprintf(stderr, " OCOPY %ld x %ld\n", m, n); #endif #endif a_offset = a; b_offset = b; lda *= 2; b_offset2 = b + m * (n & ~7); b_offset3 = b + m * (n & ~3); b_offset4 = b + m * (n & ~1); j = (m >> 3); if (j > 0){ do{ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset3 = a_offset2 + lda; a_offset4 = a_offset3 + lda; a_offset5 = a_offset4 + lda; a_offset6 = a_offset5 + lda; a_offset7 = a_offset6 + lda; a_offset8 = a_offset7 + lda; a_offset += 8 * lda; b_offset1 = b_offset; b_offset += 64; i = (n >> 3); if (i > 0){ do{ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); a5 = *(a_offset1 + 4); a6 = *(a_offset1 + 5); a7 = *(a_offset1 + 6); a8 = *(a_offset1 + 7); a9 = *(a_offset1 + 8); a10 = *(a_offset1 + 9); 
a11 = *(a_offset1 + 10); a12 = *(a_offset1 + 11); a13 = *(a_offset1 + 12); a14 = *(a_offset1 + 13); a15 = *(a_offset1 + 14); a16 = *(a_offset1 + 15); *(b_offset1 + 0) = CMULT(a1, a2); *(b_offset1 + 1) = CMULT(a3, a4); *(b_offset1 + 2) = CMULT(a5, a6); *(b_offset1 + 3) = CMULT(a7, a8); *(b_offset1 + 4) = CMULT(a9, a10); *(b_offset1 + 5) = CMULT(a11, a12); *(b_offset1 + 6) = CMULT(a13, a14); *(b_offset1 + 7) = CMULT(a15, a16); a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); a3 = *(a_offset2 + 2); a4 = *(a_offset2 + 3); a5 = *(a_offset2 + 4); a6 = *(a_offset2 + 5); a7 = *(a_offset2 + 6); a8 = *(a_offset2 + 7); a9 = *(a_offset2 + 8); a10 = *(a_offset2 + 9); a11 = *(a_offset2 + 10); a12 = *(a_offset2 + 11); a13 = *(a_offset2 + 12); a14 = *(a_offset2 + 13); a15 = *(a_offset2 + 14); a16 = *(a_offset2 + 15); *(b_offset1 + 8) = CMULT(a1, a2); *(b_offset1 + 9) = CMULT(a3, a4); *(b_offset1 + 10) = CMULT(a5, a6); *(b_offset1 + 11) = CMULT(a7, a8); *(b_offset1 + 12) = CMULT(a9, a10); *(b_offset1 + 13) = CMULT(a11, a12); *(b_offset1 + 14) = CMULT(a13, a14); *(b_offset1 + 15) = CMULT(a15, a16); a1 = *(a_offset3 + 0); a2 = *(a_offset3 + 1); a3 = *(a_offset3 + 2); a4 = *(a_offset3 + 3); a5 = *(a_offset3 + 4); a6 = *(a_offset3 + 5); a7 = *(a_offset3 + 6); a8 = *(a_offset3 + 7); a9 = *(a_offset3 + 8); a10 = *(a_offset3 + 9); a11 = *(a_offset3 + 10); a12 = *(a_offset3 + 11); a13 = *(a_offset3 + 12); a14 = *(a_offset3 + 13); a15 = *(a_offset3 + 14); a16 = *(a_offset3 + 15); *(b_offset1 + 16) = CMULT(a1, a2); *(b_offset1 + 17) = CMULT(a3, a4); *(b_offset1 + 18) = CMULT(a5, a6); *(b_offset1 + 19) = CMULT(a7, a8); *(b_offset1 + 20) = CMULT(a9, a10); *(b_offset1 + 21) = CMULT(a11, a12); *(b_offset1 + 22) = CMULT(a13, a14); *(b_offset1 + 23) = CMULT(a15, a16); a1 = *(a_offset4 + 0); a2 = *(a_offset4 + 1); a3 = *(a_offset4 + 2); a4 = *(a_offset4 + 3); a5 = *(a_offset4 + 4); a6 = *(a_offset4 + 5); a7 = *(a_offset4 + 6); a8 = *(a_offset4 + 7); a9 = *(a_offset4 + 8); a10 = *(a_offset4 + 9); a11 = *(a_offset4 + 10); a12 = *(a_offset4 + 11); a13 = *(a_offset4 + 12); a14 = *(a_offset4 + 13); a15 = *(a_offset4 + 14); a16 = *(a_offset4 + 15); *(b_offset1 + 24) = CMULT(a1, a2); *(b_offset1 + 25) = CMULT(a3, a4); *(b_offset1 + 26) = CMULT(a5, a6); *(b_offset1 + 27) = CMULT(a7, a8); *(b_offset1 + 28) = CMULT(a9, a10); *(b_offset1 + 29) = CMULT(a11, a12); *(b_offset1 + 30) = CMULT(a13, a14); *(b_offset1 + 31) = CMULT(a15, a16); a1 = *(a_offset5 + 0); a2 = *(a_offset5 + 1); a3 = *(a_offset5 + 2); a4 = *(a_offset5 + 3); a5 = *(a_offset5 + 4); a6 = *(a_offset5 + 5); a7 = *(a_offset5 + 6); a8 = *(a_offset5 + 7); a9 = *(a_offset5 + 8); a10 = *(a_offset5 + 9); a11 = *(a_offset5 + 10); a12 = *(a_offset5 + 11); a13 = *(a_offset5 + 12); a14 = *(a_offset5 + 13); a15 = *(a_offset5 + 14); a16 = *(a_offset5 + 15); *(b_offset1 + 32) = CMULT(a1, a2); *(b_offset1 + 33) = CMULT(a3, a4); *(b_offset1 + 34) = CMULT(a5, a6); *(b_offset1 + 35) = CMULT(a7, a8); *(b_offset1 + 36) = CMULT(a9, a10); *(b_offset1 + 37) = CMULT(a11, a12); *(b_offset1 + 38) = CMULT(a13, a14); *(b_offset1 + 39) = CMULT(a15, a16); a1 = *(a_offset6 + 0); a2 = *(a_offset6 + 1); a3 = *(a_offset6 + 2); a4 = *(a_offset6 + 3); a5 = *(a_offset6 + 4); a6 = *(a_offset6 + 5); a7 = *(a_offset6 + 6); a8 = *(a_offset6 + 7); a9 = *(a_offset6 + 8); a10 = *(a_offset6 + 9); a11 = *(a_offset6 + 10); a12 = *(a_offset6 + 11); a13 = *(a_offset6 + 12); a14 = *(a_offset6 + 13); a15 = *(a_offset6 + 14); a16 = *(a_offset6 + 15); *(b_offset1 + 40) = CMULT(a1, a2); *(b_offset1 + 41) = CMULT(a3, a4); 
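/* Within this 8x8 panel each source row occupies eight consecutive packed
   reals: row r (counting from 0) is written to b_offset1 + 8 * r .. 8 * r + 7.
   The stores below finish the a_offset6 row (offsets 42 .. 47). */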
*(b_offset1 + 42) = CMULT(a5, a6); *(b_offset1 + 43) = CMULT(a7, a8); *(b_offset1 + 44) = CMULT(a9, a10); *(b_offset1 + 45) = CMULT(a11, a12); *(b_offset1 + 46) = CMULT(a13, a14); *(b_offset1 + 47) = CMULT(a15, a16); a1 = *(a_offset7 + 0); a2 = *(a_offset7 + 1); a3 = *(a_offset7 + 2); a4 = *(a_offset7 + 3); a5 = *(a_offset7 + 4); a6 = *(a_offset7 + 5); a7 = *(a_offset7 + 6); a8 = *(a_offset7 + 7); a9 = *(a_offset7 + 8); a10 = *(a_offset7 + 9); a11 = *(a_offset7 + 10); a12 = *(a_offset7 + 11); a13 = *(a_offset7 + 12); a14 = *(a_offset7 + 13); a15 = *(a_offset7 + 14); a16 = *(a_offset7 + 15); *(b_offset1 + 48) = CMULT(a1, a2); *(b_offset1 + 49) = CMULT(a3, a4); *(b_offset1 + 50) = CMULT(a5, a6); *(b_offset1 + 51) = CMULT(a7, a8); *(b_offset1 + 52) = CMULT(a9, a10); *(b_offset1 + 53) = CMULT(a11, a12); *(b_offset1 + 54) = CMULT(a13, a14); *(b_offset1 + 55) = CMULT(a15, a16); a1 = *(a_offset8 + 0); a2 = *(a_offset8 + 1); a3 = *(a_offset8 + 2); a4 = *(a_offset8 + 3); a5 = *(a_offset8 + 4); a6 = *(a_offset8 + 5); a7 = *(a_offset8 + 6); a8 = *(a_offset8 + 7); a9 = *(a_offset8 + 8); a10 = *(a_offset8 + 9); a11 = *(a_offset8 + 10); a12 = *(a_offset8 + 11); a13 = *(a_offset8 + 12); a14 = *(a_offset8 + 13); a15 = *(a_offset8 + 14); a16 = *(a_offset8 + 15); *(b_offset1 + 56) = CMULT(a1, a2); *(b_offset1 + 57) = CMULT(a3, a4); *(b_offset1 + 58) = CMULT(a5, a6); *(b_offset1 + 59) = CMULT(a7, a8); *(b_offset1 + 60) = CMULT(a9, a10); *(b_offset1 + 61) = CMULT(a11, a12); *(b_offset1 + 62) = CMULT(a13, a14); *(b_offset1 + 63) = CMULT(a15, a16); a_offset1 += 16; a_offset2 += 16; a_offset3 += 16; a_offset4 += 16; a_offset5 += 16; a_offset6 += 16; a_offset7 += 16; a_offset8 += 16; b_offset1 += m * 8; i --; }while(i > 0); } if (n & 4){ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); a5 = *(a_offset1 + 4); a6 = *(a_offset1 + 5); a7 = *(a_offset1 + 6); a8 = *(a_offset1 + 7); *(b_offset2 + 0) = CMULT(a1, a2); *(b_offset2 + 1) = CMULT(a3, a4); *(b_offset2 + 2) = CMULT(a5, a6); *(b_offset2 + 3) = CMULT(a7, a8); a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); a3 = *(a_offset2 + 2); a4 = *(a_offset2 + 3); a5 = *(a_offset2 + 4); a6 = *(a_offset2 + 5); a7 = *(a_offset2 + 6); a8 = *(a_offset2 + 7); *(b_offset2 + 4) = CMULT(a1, a2); *(b_offset2 + 5) = CMULT(a3, a4); *(b_offset2 + 6) = CMULT(a5, a6); *(b_offset2 + 7) = CMULT(a7, a8); a1 = *(a_offset3 + 0); a2 = *(a_offset3 + 1); a3 = *(a_offset3 + 2); a4 = *(a_offset3 + 3); a5 = *(a_offset3 + 4); a6 = *(a_offset3 + 5); a7 = *(a_offset3 + 6); a8 = *(a_offset3 + 7); *(b_offset2 + 8) = CMULT(a1, a2); *(b_offset2 + 9) = CMULT(a3, a4); *(b_offset2 + 10) = CMULT(a5, a6); *(b_offset2 + 11) = CMULT(a7, a8); a1 = *(a_offset4 + 0); a2 = *(a_offset4 + 1); a3 = *(a_offset4 + 2); a4 = *(a_offset4 + 3); a5 = *(a_offset4 + 4); a6 = *(a_offset4 + 5); a7 = *(a_offset4 + 6); a8 = *(a_offset4 + 7); *(b_offset2 + 12) = CMULT(a1, a2); *(b_offset2 + 13) = CMULT(a3, a4); *(b_offset2 + 14) = CMULT(a5, a6); *(b_offset2 + 15) = CMULT(a7, a8); a1 = *(a_offset5 + 0); a2 = *(a_offset5 + 1); a3 = *(a_offset5 + 2); a4 = *(a_offset5 + 3); a5 = *(a_offset5 + 4); a6 = *(a_offset5 + 5); a7 = *(a_offset5 + 6); a8 = *(a_offset5 + 7); *(b_offset2 + 16) = CMULT(a1, a2); *(b_offset2 + 17) = CMULT(a3, a4); *(b_offset2 + 18) = CMULT(a5, a6); *(b_offset2 + 19) = CMULT(a7, a8); a1 = *(a_offset6 + 0); a2 = *(a_offset6 + 1); a3 = *(a_offset6 + 2); a4 = *(a_offset6 + 3); a5 = *(a_offset6 + 4); a6 = *(a_offset6 + 5); a7 = *(a_offset6 + 6); a8 = *(a_offset6 + 7); *(b_offset2 + 
20) = CMULT(a1, a2); *(b_offset2 + 21) = CMULT(a3, a4); *(b_offset2 + 22) = CMULT(a5, a6); *(b_offset2 + 23) = CMULT(a7, a8); a1 = *(a_offset7 + 0); a2 = *(a_offset7 + 1); a3 = *(a_offset7 + 2); a4 = *(a_offset7 + 3); a5 = *(a_offset7 + 4); a6 = *(a_offset7 + 5); a7 = *(a_offset7 + 6); a8 = *(a_offset7 + 7); *(b_offset2 + 24) = CMULT(a1, a2); *(b_offset2 + 25) = CMULT(a3, a4); *(b_offset2 + 26) = CMULT(a5, a6); *(b_offset2 + 27) = CMULT(a7, a8); a1 = *(a_offset8 + 0); a2 = *(a_offset8 + 1); a3 = *(a_offset8 + 2); a4 = *(a_offset8 + 3); a5 = *(a_offset8 + 4); a6 = *(a_offset8 + 5); a7 = *(a_offset8 + 6); a8 = *(a_offset8 + 7); *(b_offset2 + 28) = CMULT(a1, a2); *(b_offset2 + 29) = CMULT(a3, a4); *(b_offset2 + 30) = CMULT(a5, a6); *(b_offset2 + 31) = CMULT(a7, a8); a_offset1 += 8; a_offset2 += 8; a_offset3 += 8; a_offset4 += 8; a_offset5 += 8; a_offset6 += 8; a_offset7 += 8; a_offset8 += 8; b_offset2 += 32; } if (n & 2){ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); *(b_offset3 + 0) = CMULT(a1, a2); *(b_offset3 + 1) = CMULT(a3, a4); a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); a3 = *(a_offset2 + 2); a4 = *(a_offset2 + 3); *(b_offset3 + 2) = CMULT(a1, a2); *(b_offset3 + 3) = CMULT(a3, a4); a1 = *(a_offset3 + 0); a2 = *(a_offset3 + 1); a3 = *(a_offset3 + 2); a4 = *(a_offset3 + 3); *(b_offset3 + 4) = CMULT(a1, a2); *(b_offset3 + 5) = CMULT(a3, a4); a1 = *(a_offset4 + 0); a2 = *(a_offset4 + 1); a3 = *(a_offset4 + 2); a4 = *(a_offset4 + 3); *(b_offset3 + 6) = CMULT(a1, a2); *(b_offset3 + 7) = CMULT(a3, a4); a1 = *(a_offset5 + 0); a2 = *(a_offset5 + 1); a3 = *(a_offset5 + 2); a4 = *(a_offset5 + 3); *(b_offset3 + 8) = CMULT(a1, a2); *(b_offset3 + 9) = CMULT(a3, a4); a1 = *(a_offset6 + 0); a2 = *(a_offset6 + 1); a3 = *(a_offset6 + 2); a4 = *(a_offset6 + 3); *(b_offset3 + 10) = CMULT(a1, a2); *(b_offset3 + 11) = CMULT(a3, a4); a1 = *(a_offset7 + 0); a2 = *(a_offset7 + 1); a3 = *(a_offset7 + 2); a4 = *(a_offset7 + 3); *(b_offset3 + 12) = CMULT(a1, a2); *(b_offset3 + 13) = CMULT(a3, a4); a1 = *(a_offset8 + 0); a2 = *(a_offset8 + 1); a3 = *(a_offset8 + 2); a4 = *(a_offset8 + 3); *(b_offset3 + 14) = CMULT(a1, a2); *(b_offset3 + 15) = CMULT(a3, a4); a_offset1 += 4; a_offset2 += 4; a_offset3 += 4; a_offset4 += 4; a_offset5 += 4; a_offset6 += 4; a_offset7 += 4; a_offset8 += 4; b_offset3 += 16; } if (n & 1){ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); *(b_offset4 + 0) = CMULT(a1, a2); a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); *(b_offset4 + 1) = CMULT(a1, a2); a1 = *(a_offset3 + 0); a2 = *(a_offset3 + 1); *(b_offset4 + 2) = CMULT(a1, a2); a1 = *(a_offset4 + 0); a2 = *(a_offset4 + 1); *(b_offset4 + 3) = CMULT(a1, a2); a1 = *(a_offset5 + 0); a2 = *(a_offset5 + 1); *(b_offset4 + 4) = CMULT(a1, a2); a1 = *(a_offset6 + 0); a2 = *(a_offset6 + 1); *(b_offset4 + 5) = CMULT(a1, a2); a1 = *(a_offset7 + 0); a2 = *(a_offset7 + 1); *(b_offset4 + 6) = CMULT(a1, a2); a1 = *(a_offset8 + 0); a2 = *(a_offset8 + 1); *(b_offset4 + 7) = CMULT(a1, a2); b_offset4 += 8; } j--; }while(j > 0); } if (m & 4){ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset3 = a_offset2 + lda; a_offset4 = a_offset3 + lda; a_offset += 4 * lda; b_offset1 = b_offset; b_offset += 32; i = (n >> 3); if (i > 0){ do{ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); a5 = *(a_offset1 + 4); a6 = *(a_offset1 + 5); a7 = *(a_offset1 + 6); a8 = *(a_offset1 + 7); a9 = *(a_offset1 + 8); a10 = *(a_offset1 + 9); a11 = *(a_offset1 + 10); a12 = *(a_offset1 + 11); a13 = 
*(a_offset1 + 12); a14 = *(a_offset1 + 13); a15 = *(a_offset1 + 14); a16 = *(a_offset1 + 15); *(b_offset1 + 0) = CMULT(a1, a2); *(b_offset1 + 1) = CMULT(a3, a4); *(b_offset1 + 2) = CMULT(a5, a6); *(b_offset1 + 3) = CMULT(a7, a8); *(b_offset1 + 4) = CMULT(a9, a10); *(b_offset1 + 5) = CMULT(a11, a12); *(b_offset1 + 6) = CMULT(a13, a14); *(b_offset1 + 7) = CMULT(a15, a16); a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); a3 = *(a_offset2 + 2); a4 = *(a_offset2 + 3); a5 = *(a_offset2 + 4); a6 = *(a_offset2 + 5); a7 = *(a_offset2 + 6); a8 = *(a_offset2 + 7); a9 = *(a_offset2 + 8); a10 = *(a_offset2 + 9); a11 = *(a_offset2 + 10); a12 = *(a_offset2 + 11); a13 = *(a_offset2 + 12); a14 = *(a_offset2 + 13); a15 = *(a_offset2 + 14); a16 = *(a_offset2 + 15); *(b_offset1 + 8) = CMULT(a1, a2); *(b_offset1 + 9) = CMULT(a3, a4); *(b_offset1 + 10) = CMULT(a5, a6); *(b_offset1 + 11) = CMULT(a7, a8); *(b_offset1 + 12) = CMULT(a9, a10); *(b_offset1 + 13) = CMULT(a11, a12); *(b_offset1 + 14) = CMULT(a13, a14); *(b_offset1 + 15) = CMULT(a15, a16); a1 = *(a_offset3 + 0); a2 = *(a_offset3 + 1); a3 = *(a_offset3 + 2); a4 = *(a_offset3 + 3); a5 = *(a_offset3 + 4); a6 = *(a_offset3 + 5); a7 = *(a_offset3 + 6); a8 = *(a_offset3 + 7); a9 = *(a_offset3 + 8); a10 = *(a_offset3 + 9); a11 = *(a_offset3 + 10); a12 = *(a_offset3 + 11); a13 = *(a_offset3 + 12); a14 = *(a_offset3 + 13); a15 = *(a_offset3 + 14); a16 = *(a_offset3 + 15); *(b_offset1 + 16) = CMULT(a1, a2); *(b_offset1 + 17) = CMULT(a3, a4); *(b_offset1 + 18) = CMULT(a5, a6); *(b_offset1 + 19) = CMULT(a7, a8); *(b_offset1 + 20) = CMULT(a9, a10); *(b_offset1 + 21) = CMULT(a11, a12); *(b_offset1 + 22) = CMULT(a13, a14); *(b_offset1 + 23) = CMULT(a15, a16); a1 = *(a_offset4 + 0); a2 = *(a_offset4 + 1); a3 = *(a_offset4 + 2); a4 = *(a_offset4 + 3); a5 = *(a_offset4 + 4); a6 = *(a_offset4 + 5); a7 = *(a_offset4 + 6); a8 = *(a_offset4 + 7); a9 = *(a_offset4 + 8); a10 = *(a_offset4 + 9); a11 = *(a_offset4 + 10); a12 = *(a_offset4 + 11); a13 = *(a_offset4 + 12); a14 = *(a_offset4 + 13); a15 = *(a_offset4 + 14); a16 = *(a_offset4 + 15); *(b_offset1 + 24) = CMULT(a1, a2); *(b_offset1 + 25) = CMULT(a3, a4); *(b_offset1 + 26) = CMULT(a5, a6); *(b_offset1 + 27) = CMULT(a7, a8); *(b_offset1 + 28) = CMULT(a9, a10); *(b_offset1 + 29) = CMULT(a11, a12); *(b_offset1 + 30) = CMULT(a13, a14); *(b_offset1 + 31) = CMULT(a15, a16); a_offset1 += 16; a_offset2 += 16; a_offset3 += 16; a_offset4 += 16; b_offset1 += m * 8; i --; }while(i > 0); } if (n & 4){ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); a5 = *(a_offset1 + 4); a6 = *(a_offset1 + 5); a7 = *(a_offset1 + 6); a8 = *(a_offset1 + 7); *(b_offset2 + 0) = CMULT(a1, a2); *(b_offset2 + 1) = CMULT(a3, a4); *(b_offset2 + 2) = CMULT(a5, a6); *(b_offset2 + 3) = CMULT(a7, a8); a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); a3 = *(a_offset2 + 2); a4 = *(a_offset2 + 3); a5 = *(a_offset2 + 4); a6 = *(a_offset2 + 5); a7 = *(a_offset2 + 6); a8 = *(a_offset2 + 7); *(b_offset2 + 4) = CMULT(a1, a2); *(b_offset2 + 5) = CMULT(a3, a4); *(b_offset2 + 6) = CMULT(a5, a6); *(b_offset2 + 7) = CMULT(a7, a8); a1 = *(a_offset3 + 0); a2 = *(a_offset3 + 1); a3 = *(a_offset3 + 2); a4 = *(a_offset3 + 3); a5 = *(a_offset3 + 4); a6 = *(a_offset3 + 5); a7 = *(a_offset3 + 6); a8 = *(a_offset3 + 7); *(b_offset2 + 8) = CMULT(a1, a2); *(b_offset2 + 9) = CMULT(a3, a4); *(b_offset2 + 10) = CMULT(a5, a6); *(b_offset2 + 11) = CMULT(a7, a8); a1 = *(a_offset4 + 0); a2 = *(a_offset4 + 1); a3 = *(a_offset4 + 2); a4 = *(a_offset4 + 3); a5 
= *(a_offset4 + 4); a6 = *(a_offset4 + 5); a7 = *(a_offset4 + 6); a8 = *(a_offset4 + 7); *(b_offset2 + 12) = CMULT(a1, a2); *(b_offset2 + 13) = CMULT(a3, a4); *(b_offset2 + 14) = CMULT(a5, a6); *(b_offset2 + 15) = CMULT(a7, a8); a_offset1 += 8; a_offset2 += 8; a_offset3 += 8; a_offset4 += 8; b_offset2 += 16; } if (n & 2){ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); *(b_offset3 + 0) = CMULT(a1, a2); *(b_offset3 + 1) = CMULT(a3, a4); a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); a3 = *(a_offset2 + 2); a4 = *(a_offset2 + 3); *(b_offset3 + 2) = CMULT(a1, a2); *(b_offset3 + 3) = CMULT(a3, a4); a1 = *(a_offset3 + 0); a2 = *(a_offset3 + 1); a3 = *(a_offset3 + 2); a4 = *(a_offset3 + 3); *(b_offset3 + 4) = CMULT(a1, a2); *(b_offset3 + 5) = CMULT(a3, a4); a1 = *(a_offset4 + 0); a2 = *(a_offset4 + 1); a3 = *(a_offset4 + 2); a4 = *(a_offset4 + 3); *(b_offset3 + 6) = CMULT(a1, a2); *(b_offset3 + 7) = CMULT(a3, a4); a_offset1 += 4; a_offset2 += 4; a_offset3 += 4; a_offset4 += 4; b_offset3 += 8; } if (n & 1){ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); *(b_offset4 + 0) = CMULT(a1, a2); a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); *(b_offset4 + 1) = CMULT(a1, a2); a1 = *(a_offset3 + 0); a2 = *(a_offset3 + 1); *(b_offset4 + 2) = CMULT(a1, a2); a1 = *(a_offset4 + 0); a2 = *(a_offset4 + 1); *(b_offset4 + 3) = CMULT(a1, a2); b_offset4 += 4; } } if (m & 2){ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; b_offset1 = b_offset; b_offset += 16; i = (n >> 3); if (i > 0){ do{ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); a5 = *(a_offset1 + 4); a6 = *(a_offset1 + 5); a7 = *(a_offset1 + 6); a8 = *(a_offset1 + 7); a9 = *(a_offset1 + 8); a10 = *(a_offset1 + 9); a11 = *(a_offset1 + 10); a12 = *(a_offset1 + 11); a13 = *(a_offset1 + 12); a14 = *(a_offset1 + 13); a15 = *(a_offset1 + 14); a16 = *(a_offset1 + 15); *(b_offset1 + 0) = CMULT(a1, a2); *(b_offset1 + 1) = CMULT(a3, a4); *(b_offset1 + 2) = CMULT(a5, a6); *(b_offset1 + 3) = CMULT(a7, a8); *(b_offset1 + 4) = CMULT(a9, a10); *(b_offset1 + 5) = CMULT(a11, a12); *(b_offset1 + 6) = CMULT(a13, a14); *(b_offset1 + 7) = CMULT(a15, a16); a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); a3 = *(a_offset2 + 2); a4 = *(a_offset2 + 3); a5 = *(a_offset2 + 4); a6 = *(a_offset2 + 5); a7 = *(a_offset2 + 6); a8 = *(a_offset2 + 7); a9 = *(a_offset2 + 8); a10 = *(a_offset2 + 9); a11 = *(a_offset2 + 10); a12 = *(a_offset2 + 11); a13 = *(a_offset2 + 12); a14 = *(a_offset2 + 13); a15 = *(a_offset2 + 14); a16 = *(a_offset2 + 15); *(b_offset1 + 8) = CMULT(a1, a2); *(b_offset1 + 9) = CMULT(a3, a4); *(b_offset1 + 10) = CMULT(a5, a6); *(b_offset1 + 11) = CMULT(a7, a8); *(b_offset1 + 12) = CMULT(a9, a10); *(b_offset1 + 13) = CMULT(a11, a12); *(b_offset1 + 14) = CMULT(a13, a14); *(b_offset1 + 15) = CMULT(a15, a16); a_offset1 += 16; a_offset2 += 16; b_offset1 += m * 8; i --; }while(i > 0); } if (n & 4){ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); a5 = *(a_offset1 + 4); a6 = *(a_offset1 + 5); a7 = *(a_offset1 + 6); a8 = *(a_offset1 + 7); *(b_offset2 + 0) = CMULT(a1, a2); *(b_offset2 + 1) = CMULT(a3, a4); *(b_offset2 + 2) = CMULT(a5, a6); *(b_offset2 + 3) = CMULT(a7, a8); a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); a3 = *(a_offset2 + 2); a4 = *(a_offset2 + 3); a5 = *(a_offset2 + 4); a6 = *(a_offset2 + 5); a7 = *(a_offset2 + 6); a8 = *(a_offset2 + 7); *(b_offset2 + 4) = CMULT(a1, a2); *(b_offset2 + 5) = CMULT(a3, a4); *(b_offset2 + 6) 
= CMULT(a5, a6); *(b_offset2 + 7) = CMULT(a7, a8); a_offset1 += 8; a_offset2 += 8; b_offset2 += 8; } if (n & 2){ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); *(b_offset3 + 0) = CMULT(a1, a2); *(b_offset3 + 1) = CMULT(a3, a4); a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); a3 = *(a_offset2 + 2); a4 = *(a_offset2 + 3); *(b_offset3 + 2) = CMULT(a1, a2); *(b_offset3 + 3) = CMULT(a3, a4); a_offset1 += 4; a_offset2 += 4; b_offset3 += 4; } if (n & 1){ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); *(b_offset4 + 0) = CMULT(a1, a2); a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); *(b_offset4 + 1) = CMULT(a1, a2); b_offset4 += 2; } } if (m & 1){ a_offset1 = a_offset; b_offset1 = b_offset; i = (n >> 3); if (i > 0){ do{ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); a5 = *(a_offset1 + 4); a6 = *(a_offset1 + 5); a7 = *(a_offset1 + 6); a8 = *(a_offset1 + 7); a9 = *(a_offset1 + 8); a10 = *(a_offset1 + 9); a11 = *(a_offset1 + 10); a12 = *(a_offset1 + 11); a13 = *(a_offset1 + 12); a14 = *(a_offset1 + 13); a15 = *(a_offset1 + 14); a16 = *(a_offset1 + 15); *(b_offset1 + 0) = CMULT(a1, a2); *(b_offset1 + 1) = CMULT(a3, a4); *(b_offset1 + 2) = CMULT(a5, a6); *(b_offset1 + 3) = CMULT(a7, a8); *(b_offset1 + 4) = CMULT(a9, a10); *(b_offset1 + 5) = CMULT(a11, a12); *(b_offset1 + 6) = CMULT(a13, a14); *(b_offset1 + 7) = CMULT(a15, a16); a_offset1 += 16; b_offset1 += m * 8; i --; }while(i > 0); } if (n & 4){ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); a5 = *(a_offset1 + 4); a6 = *(a_offset1 + 5); a7 = *(a_offset1 + 6); a8 = *(a_offset1 + 7); *(b_offset2 + 0) = CMULT(a1, a2); *(b_offset2 + 1) = CMULT(a3, a4); *(b_offset2 + 2) = CMULT(a5, a6); *(b_offset2 + 3) = CMULT(a7, a8); a_offset1 += 8; b_offset2 += 4; } if (n & 2){ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); *(b_offset3 + 0) = CMULT(a1, a2); *(b_offset3 + 1) = CMULT(a3, a4); a_offset1 += 4; b_offset3 += 2; } if (n & 1){ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); *(b_offset4 + 0) = CMULT(a1, a2); } } return 0; } OpenBLAS-0.2.20/kernel/generic/zgemm3mkernel_dump.c000066400000000000000000000034241313527062700220140ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2011-2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alphar, FLOAT alphai, FLOAT * ba, FLOAT * bb, FLOAT * C, BLASLONG ldc) { return 0; } OpenBLAS-0.2.20/kernel/generic/zgemm_beta.c000066400000000000000000000115431313527062700203220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta_r, FLOAT beta_i, FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, FLOAT *c, BLASLONG ldc){ BLASLONG i, j; FLOAT *c_offset, *c_offset1; FLOAT atemp1, atemp2, atemp3, atemp4; FLOAT btemp1, btemp2, btemp3, btemp4; FLOAT ctemp1, ctemp2, ctemp3, ctemp4; ldc *= 2; c_offset = c; if (beta_r == 0. && beta_i == 0.) 
{ j = n; do { c_offset1 = c_offset; c_offset += ldc; i = (m >> 2); if (i > 0){ do { *(c_offset1 + 0) = ZERO; *(c_offset1 + 1) = ZERO; *(c_offset1 + 2) = ZERO; *(c_offset1 + 3) = ZERO; *(c_offset1 + 4) = ZERO; *(c_offset1 + 5) = ZERO; *(c_offset1 + 6) = ZERO; *(c_offset1 + 7) = ZERO; c_offset1 += 8; i--; } while (i > 0); } i = (m & 3); if (i > 0){ do { *(c_offset1 + 0) = ZERO; *(c_offset1 + 1) = ZERO; c_offset1 += 2; i--; } while (i > 0); } j --; } while (j > 0); } else { j = n; do { c_offset1 = c_offset; c_offset += ldc; i = (m >> 1); if (i > 0){ do { atemp1 = *(c_offset1 + 0); atemp2 = *(c_offset1 + 1); atemp3 = *(c_offset1 + 2); atemp4 = *(c_offset1 + 3); btemp1 = beta_r * atemp1; btemp2 = beta_i * atemp2; btemp3 = beta_r * atemp2; btemp4 = beta_i * atemp1; ctemp1 = btemp1 - btemp2; ctemp2 = btemp3 + btemp4; btemp1 = beta_r * atemp3; btemp2 = beta_i * atemp4; btemp3 = beta_r * atemp4; btemp4 = beta_i * atemp3; ctemp3 = btemp1 - btemp2; ctemp4 = btemp3 + btemp4; *(c_offset1 + 0) = ctemp1; *(c_offset1 + 1) = ctemp2; *(c_offset1 + 2) = ctemp3; *(c_offset1 + 3) = ctemp4; c_offset1 += 4; i --; } while (i > 0); } i = (m & 1); if (i > 0){ do { atemp1 = *(c_offset1 + 0); atemp2 = *(c_offset1 + 1); btemp1 = beta_r * atemp1; btemp2 = beta_i * atemp2; btemp3 = beta_r * atemp2; btemp4 = beta_i * atemp1; ctemp1 = btemp1 - btemp2; ctemp2 = btemp3 + btemp4; *(c_offset1 + 0) = ctemp1; *(c_offset1 + 1) = ctemp2; c_offset1 += 2; i --; } while (i > 0); } j --; } while (j > 0); } return 0; } OpenBLAS-0.2.20/kernel/generic/zgemm_ncopy_1.c000066400000000000000000000074711313527062700207640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *a_offset; FLOAT *b_offset; FLOAT ctemp1, ctemp2, ctemp3, ctemp4; FLOAT ctemp5, ctemp6, ctemp7, ctemp8; a_offset = a; b_offset = b; lda *= 2; i = n; if (i > 0){ do { j = (m >> 2); if (j > 0){ do{ ctemp1 = *(a_offset + 0); ctemp2 = *(a_offset + 1); ctemp3 = *(a_offset + 2); ctemp4 = *(a_offset + 3); ctemp5 = *(a_offset + 4); ctemp6 = *(a_offset + 5); ctemp7 = *(a_offset + 6); ctemp8 = *(a_offset + 7); *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp2; *(b_offset + 2) = ctemp3; *(b_offset + 3) = ctemp4; *(b_offset + 4) = ctemp5; *(b_offset + 5) = ctemp6; *(b_offset + 6) = ctemp7; *(b_offset + 7) = ctemp8; a_offset += 8; b_offset += 8; j --; } while(j>0); } j = (m & 3); if (j > 0){ do{ ctemp1 = *(a_offset + 0); ctemp2 = *(a_offset + 1); *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp2; a_offset += 2; b_offset += 2; j --; } while(j>0); } a_offset += lda - m * 2; i--; } while (i > 0); } return 0; } OpenBLAS-0.2.20/kernel/generic/zgemm_ncopy_2.c000066400000000000000000000126341313527062700207620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *a_offset, *a_offset1, *a_offset2; FLOAT *b_offset; FLOAT ctemp1, ctemp2, ctemp3, ctemp4; FLOAT ctemp5, ctemp6, ctemp7, ctemp8; FLOAT ctemp9, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; a_offset = a; b_offset = b; lda *= 2; i = (n >> 1); if (i > 0){ do{ a_offset1 = a_offset; a_offset2 = a_offset + lda; a_offset += 2 * lda; j = (m >> 2); if (j > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset2 + 0); ctemp4 = *(a_offset2 + 1); ctemp5 = *(a_offset1 + 2); ctemp6 = *(a_offset1 + 3); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); ctemp9 = *(a_offset1 + 4); ctemp10 = *(a_offset1 + 5); ctemp11 = *(a_offset2 + 4); ctemp12 = *(a_offset2 + 5); ctemp13 = *(a_offset1 + 6); ctemp14 = *(a_offset1 + 7); ctemp15 = *(a_offset2 + 6); ctemp16 = *(a_offset2 + 7); *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp2; *(b_offset + 2) = ctemp3; *(b_offset + 3) = ctemp4; *(b_offset + 4) = ctemp5; *(b_offset + 5) = ctemp6; *(b_offset + 6) = ctemp7; *(b_offset + 7) = ctemp8; *(b_offset + 8) = ctemp9; *(b_offset + 9) = ctemp10; *(b_offset +10) = ctemp11; *(b_offset +11) = ctemp12; *(b_offset +12) = ctemp13; *(b_offset +13) = ctemp14; *(b_offset +14) = ctemp15; *(b_offset +15) = ctemp16; a_offset1 += 8; a_offset2 += 8; b_offset += 16; j --; } while(j>0); } j = (m & 3); if (j > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset2 + 0); ctemp4 = *(a_offset2 + 1); *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp2; *(b_offset + 2) = ctemp3; *(b_offset + 3) = ctemp4; a_offset1 += 2; a_offset2 += 2; b_offset += 4; j --; } while(j>0); } i --; } while(i>0); } if (n & 1){ j = (m >> 2); if (j > 0){ do{ ctemp1 = *(a_offset + 0); ctemp2 = *(a_offset + 1); ctemp5 = *(a_offset + 2); ctemp6 = *(a_offset + 3); ctemp9 = *(a_offset + 4); ctemp10 = *(a_offset + 5); ctemp13 = *(a_offset + 6); ctemp14 = *(a_offset + 7); *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp2; *(b_offset + 2) = ctemp5; *(b_offset + 3) = ctemp6; *(b_offset + 4) = ctemp9; *(b_offset + 5) = ctemp10; *(b_offset + 6) = ctemp13; *(b_offset + 7) = ctemp14; a_offset += 8; b_offset += 8; j --; } while(j>0); } j = (m & 3); if (j > 0){ do{ ctemp1 = *(a_offset + 0); ctemp2 = *(a_offset + 1); *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp2; a_offset += 2; b_offset += 2; j --; } while(j > 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/zgemm_ncopy_4.c000066400000000000000000000245321313527062700207640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *aoffset; FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; FLOAT *boffset; FLOAT ctemp01, ctemp02, ctemp03, ctemp04; FLOAT ctemp05, ctemp06, ctemp07, ctemp08; FLOAT ctemp09, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; FLOAT ctemp17, ctemp18, ctemp19, ctemp20; FLOAT ctemp21, ctemp22, ctemp23, ctemp24; FLOAT ctemp25, ctemp26, ctemp27, ctemp28; FLOAT ctemp29, ctemp30, ctemp31, ctemp32; aoffset = a; boffset = b; lda *= 2; #if 0 fprintf(stderr, "m = %d n = %d\n", m,n ); #endif j = (n >> 2); if (j > 0){ do{ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset3 = aoffset2 + lda; aoffset4 = aoffset3 + lda; aoffset += 4 * lda; i = (m >> 2); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); ctemp12 = *(aoffset2 + 3); ctemp13 = *(aoffset2 + 4); ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); ctemp17 = *(aoffset3 + 0); ctemp18 = *(aoffset3 + 1); ctemp19 = *(aoffset3 + 2); ctemp20 = *(aoffset3 + 3); ctemp21 = *(aoffset3 + 4); ctemp22 = *(aoffset3 + 5); ctemp23 = *(aoffset3 + 6); ctemp24 = *(aoffset3 + 7); ctemp25 = *(aoffset4 + 0); ctemp26 = *(aoffset4 + 1); ctemp27 = *(aoffset4 + 2); ctemp28 = *(aoffset4 + 3); ctemp29 = *(aoffset4 + 4); ctemp30 = *(aoffset4 + 5); ctemp31 = *(aoffset4 + 6); ctemp32 = *(aoffset4 + 7); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp09; *(boffset + 3) = ctemp10; *(boffset + 4) = ctemp17; *(boffset + 5) = ctemp18; *(boffset + 6) = ctemp25; *(boffset + 7) = ctemp26; *(boffset + 8) = ctemp03; *(boffset + 9) = ctemp04; *(boffset + 10) = ctemp11; *(boffset + 11) = ctemp12; *(boffset + 12) = ctemp19; *(boffset + 13) = ctemp20; *(boffset + 14) = ctemp27; *(boffset + 15) = ctemp28; *(boffset + 16) = ctemp05; *(boffset + 17) = ctemp06; *(boffset + 18) = ctemp13; *(boffset + 19) = ctemp14; *(boffset + 20) = ctemp21; *(boffset + 21) = ctemp22; *(boffset + 22) = ctemp29; *(boffset + 23) = ctemp30; *(boffset + 24) = ctemp07; *(boffset + 25) = ctemp08; 
*(boffset + 26) = ctemp15; *(boffset + 27) = ctemp16; *(boffset + 28) = ctemp23; *(boffset + 29) = ctemp24; *(boffset + 30) = ctemp31; *(boffset + 31) = ctemp32; aoffset1 += 8; aoffset2 += 8; aoffset3 += 8; aoffset4 += 8; boffset += 32; i --; }while(i > 0); } if (m & 2) { ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); ctemp09 = *(aoffset3 + 0); ctemp10 = *(aoffset3 + 1); ctemp11 = *(aoffset3 + 2); ctemp12 = *(aoffset3 + 3); ctemp13 = *(aoffset4 + 0); ctemp14 = *(aoffset4 + 1); ctemp15 = *(aoffset4 + 2); ctemp16 = *(aoffset4 + 3); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp05; *(boffset + 3) = ctemp06; *(boffset + 4) = ctemp09; *(boffset + 5) = ctemp10; *(boffset + 6) = ctemp13; *(boffset + 7) = ctemp14; *(boffset + 8) = ctemp03; *(boffset + 9) = ctemp04; *(boffset + 10) = ctemp07; *(boffset + 11) = ctemp08; *(boffset + 12) = ctemp11; *(boffset + 13) = ctemp12; *(boffset + 14) = ctemp15; *(boffset + 15) = ctemp16; aoffset1 += 4; aoffset2 += 4; aoffset3 += 4; aoffset4 += 4; boffset += 16; } if (m & 1) { ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; *(boffset + 4) = ctemp05; *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; aoffset1 += 2; aoffset2 += 2; aoffset3 += 2; aoffset4 += 2; boffset += 8; } j--; }while(j > 0); } /* end of if(j > 0) */ if (n & 2){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset += 2 * lda; i = (m >> 2); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); ctemp12 = *(aoffset2 + 3); ctemp13 = *(aoffset2 + 4); ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp09; *(boffset + 3) = ctemp10; *(boffset + 4) = ctemp03; *(boffset + 5) = ctemp04; *(boffset + 6) = ctemp11; *(boffset + 7) = ctemp12; *(boffset + 8) = ctemp05; *(boffset + 9) = ctemp06; *(boffset + 10) = ctemp13; *(boffset + 11) = ctemp14; *(boffset + 12) = ctemp07; *(boffset + 13) = ctemp08; *(boffset + 14) = ctemp15; *(boffset + 15) = ctemp16; aoffset1 += 8; aoffset2 += 8; boffset += 16; i --; }while(i > 0); } if (m & 2) { ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp05; *(boffset + 3) = ctemp06; *(boffset + 4) = ctemp03; *(boffset + 5) = ctemp04; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; aoffset1 += 4; aoffset2 += 4; boffset += 8; } if (m & 1) { ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; aoffset1 += 2; aoffset2 += 2; boffset += 4; } } if (n 
& 1){ aoffset1 = aoffset; i = (m >> 2); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; *(boffset + 4) = ctemp05; *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; aoffset1 += 8; boffset += 8; i --; }while(i > 0); } if (m & 2) { ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; aoffset1 += 4; boffset += 4; } if (m & 1) { ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zgemm_ncopy_4_sandy.c000066400000000000000000000162161313527062700221620ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ #include #include "common.h" int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) { BLASLONG i,j; BLASLONG idx=0; BLASLONG ii; FLOAT *src0,*src1,*src2,*src3,*dest0; for (j=0; j #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *aoffset; FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; FLOAT *boffset; FLOAT ctemp01, ctemp02, ctemp03, ctemp04; FLOAT ctemp05, ctemp06, ctemp07, ctemp08; FLOAT ctemp09, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; aoffset = a; boffset = b; lda *= 2; j = (n >> 3); if (j > 0){ do{ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset3 = aoffset2 + lda; aoffset4 = aoffset3 + lda; aoffset5 = aoffset4 + lda; aoffset6 = aoffset5 + lda; aoffset7 = aoffset6 + lda; aoffset8 = aoffset7 + lda; aoffset += 8 * lda; i = m; if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); ctemp09 = *(aoffset5 + 0); ctemp10 = *(aoffset5 + 1); ctemp11 = *(aoffset6 + 0); ctemp12 = *(aoffset6 + 1); ctemp13 = *(aoffset7 + 0); ctemp14 = *(aoffset7 + 1); ctemp15 = *(aoffset8 + 0); ctemp16 = *(aoffset8 + 1); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; *(boffset + 4) = ctemp05; *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; *(boffset + 8) = ctemp09; *(boffset + 9) = ctemp10; *(boffset + 10) = ctemp11; *(boffset + 11) = ctemp12; *(boffset + 12) = ctemp13; *(boffset + 13) = ctemp14; *(boffset + 14) = ctemp15; *(boffset + 15) = ctemp16; aoffset1 += 2; aoffset2 += 2; aoffset3 += 2; aoffset4 += 2; aoffset5 += 2; aoffset6 += 2; aoffset7 += 2; aoffset8 += 2; boffset += 16; i --; }while(i > 0); } j--; }while(j > 0); } /* end of if(j > 0) */ if (n & 4){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset3 = aoffset2 + lda; aoffset4 = aoffset3 + lda; aoffset += 4 * lda; i = m; if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; *(boffset + 4) = ctemp05; *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; aoffset1 += 2; aoffset2 += 2; aoffset3 += 2; aoffset4 += 2; boffset += 8; i --; }while(i > 0); } } /* end of if(j > 0) */ if (n & 2){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset += 2 * lda; i = m; if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; aoffset1 += 2; aoffset2 += 2; boffset += 4; i --; }while(i > 0); } } /* end of if(j > 0) */ if (n & 1){ aoffset1 = aoffset; i = m; if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; aoffset1 += 2; boffset += 2; i --; }while(i > 0); } } /* end of if(j > 0) */ return 0; } 
OpenBLAS-0.2.20/kernel/generic/zgemm_ncopy_8_sandy.c000066400000000000000000000302461313527062700221650ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include #include "common.h" int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) { BLASLONG i,j; BLASLONG idx=0; BLASLONG ii; FLOAT *src0,*src1,*src2,*src3,*src4,*src5,*src6,*src7,*dest0; for (j=0; j #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *a_offset; FLOAT *b_offset, *b_offset1; FLOAT ctemp1, ctemp2, ctemp3, ctemp4; FLOAT ctemp5, ctemp6, ctemp7, ctemp8; a_offset = a; b_offset = b; lda *= 2; j = m; m *= 2; if (j > 0){ do { b_offset1 = b_offset; b_offset += 2; i = (n >> 2); if (i > 0){ do{ ctemp1 = *(a_offset + 0); ctemp2 = *(a_offset + 1); ctemp3 = *(a_offset + 2); ctemp4 = *(a_offset + 3); ctemp5 = *(a_offset + 4); ctemp6 = *(a_offset + 5); ctemp7 = *(a_offset + 6); ctemp8 = *(a_offset + 7); *(b_offset1 + 0) = ctemp1; *(b_offset1 + 1) = ctemp2; b_offset1 += m; *(b_offset1 + 0) = ctemp3; *(b_offset1 + 1) = ctemp4; b_offset1 += m; *(b_offset1 + 0) = ctemp5; *(b_offset1 + 1) = ctemp6; b_offset1 += m; *(b_offset1 + 0) = ctemp7; *(b_offset1 + 1) = ctemp8; b_offset1 += m; a_offset += 8; i --; } while(i>0); } i = (n & 3); if (i > 0){ do { ctemp1 = *(a_offset + 0); ctemp2 = *(a_offset + 1); *(b_offset1 + 0) = ctemp1; *(b_offset1 + 1) = ctemp2; b_offset1 += m; a_offset += 2; i --; } while(i > 0); } a_offset += lda - n * 2; j --; } while (j > 0); } return 0; } OpenBLAS-0.2.20/kernel/generic/zgemm_tcopy_2.c000066400000000000000000000145011313527062700207630ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *a_offset, *a_offset1, *a_offset2; FLOAT *b_offset, *b_offset1, *b_offset2; FLOAT ctemp1, ctemp2, ctemp3, ctemp4; FLOAT ctemp5, ctemp6, ctemp7, ctemp8; FLOAT ctemp9, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; a_offset = a; b_offset = b; b_offset2 = b + m * (n & ~1) * 2; lda *= 2; j = (m >> 1); if (j > 0){ do{ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; b_offset1 = b_offset; b_offset += 8; i = (n >> 2); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); ctemp5 = *(a_offset1 + 4); ctemp6 = *(a_offset1 + 5); ctemp7 = *(a_offset1 + 6); ctemp8 = *(a_offset1 + 7); ctemp9 = *(a_offset2 + 0); ctemp10 = *(a_offset2 + 1); ctemp11 = *(a_offset2 + 2); ctemp12 = *(a_offset2 + 3); ctemp13 = *(a_offset2 + 4); ctemp14 = *(a_offset2 + 5); ctemp15 = *(a_offset2 + 6); ctemp16 = *(a_offset2 + 7); *(b_offset1 + 0) = ctemp1; *(b_offset1 + 1) = ctemp2; *(b_offset1 + 2) = ctemp3; *(b_offset1 + 3) = ctemp4; *(b_offset1 + 4) = ctemp9; *(b_offset1 + 5) = ctemp10; *(b_offset1 + 6) = ctemp11; *(b_offset1 + 7) = ctemp12; b_offset1 += m * 4; *(b_offset1 + 0) = ctemp5; *(b_offset1 + 1) = ctemp6; *(b_offset1 + 2) = ctemp7; *(b_offset1 + 3) = ctemp8; *(b_offset1 + 4) = ctemp13; *(b_offset1 + 5) = ctemp14; *(b_offset1 + 6) = ctemp15; *(b_offset1 + 7) = ctemp16; b_offset1 += m * 4; a_offset1 += 8; a_offset2 += 8; i --; } while(i>0); } if (n & 2){ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); ctemp9 = *(a_offset2 + 0); ctemp10 = *(a_offset2 + 1); ctemp11 = *(a_offset2 + 2); ctemp12 = *(a_offset2 + 3); *(b_offset1 + 
0) = ctemp1; *(b_offset1 + 1) = ctemp2; *(b_offset1 + 2) = ctemp3; *(b_offset1 + 3) = ctemp4; *(b_offset1 + 4) = ctemp9; *(b_offset1 + 5) = ctemp10; *(b_offset1 + 6) = ctemp11; *(b_offset1 + 7) = ctemp12; b_offset1 += m * 4; a_offset1 += 4; a_offset2 += 4; } if (n & 1){ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp9 = *(a_offset2 + 0); ctemp10 = *(a_offset2 + 1); *(b_offset2 + 0) = ctemp1; *(b_offset2 + 1) = ctemp2; *(b_offset2 + 2) = ctemp9; *(b_offset2 + 3) = ctemp10; b_offset2 += 4; } j--; } while(j > 0); } if (m & 1){ i = (n >> 2); if (i > 0){ do{ ctemp1 = *(a_offset + 0); ctemp2 = *(a_offset + 1); ctemp3 = *(a_offset + 2); ctemp4 = *(a_offset + 3); ctemp5 = *(a_offset + 4); ctemp6 = *(a_offset + 5); ctemp7 = *(a_offset + 6); ctemp8 = *(a_offset + 7); *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp2; *(b_offset + 2) = ctemp3; *(b_offset + 3) = ctemp4; b_offset += m * 4; *(b_offset + 0) = ctemp5; *(b_offset + 1) = ctemp6; *(b_offset + 2) = ctemp7; *(b_offset + 3) = ctemp8; b_offset += m * 4; a_offset += 8; i --; } while(i > 0); } if (n & 2){ ctemp1 = *(a_offset + 0); ctemp2 = *(a_offset + 1); ctemp3 = *(a_offset + 2); ctemp4 = *(a_offset + 3); *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp2; *(b_offset + 2) = ctemp3; *(b_offset + 3) = ctemp4; b_offset += m * 4; a_offset += 4; } if (n & 1){ ctemp1 = *(a_offset + 0); ctemp2 = *(a_offset + 1); *(b_offset2 + 0) = ctemp1; *(b_offset2 + 1) = ctemp2; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zgemm_tcopy_4.c000066400000000000000000000252441313527062700207730ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *aoffset; FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; FLOAT *boffset, *boffset1, *boffset2, *boffset3; FLOAT ctemp01, ctemp02, ctemp03, ctemp04; FLOAT ctemp05, ctemp06, ctemp07, ctemp08; FLOAT ctemp09, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; FLOAT ctemp17, ctemp18, ctemp19, ctemp20; FLOAT ctemp21, ctemp22, ctemp23, ctemp24; FLOAT ctemp25, ctemp26, ctemp27, ctemp28; FLOAT ctemp29, ctemp30, ctemp31, ctemp32; aoffset = a; boffset = b; lda *= 2; boffset2 = b + 2 * m * (n & ~3); boffset3 = b + 2 * m * (n & ~1); #if 0 fprintf(stderr, "m = %d n = %d\n", m,n ); #endif j = (m >> 2); if (j > 0){ do{ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset3 = aoffset2 + lda; aoffset4 = aoffset3 + lda; aoffset += 4 * lda; boffset1 = boffset; boffset += 32; i = (n >> 2); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); ctemp12 = *(aoffset2 + 3); ctemp13 = *(aoffset2 + 4); ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); ctemp17 = *(aoffset3 + 0); ctemp18 = *(aoffset3 + 1); ctemp19 = *(aoffset3 + 2); ctemp20 = *(aoffset3 + 3); ctemp21 = *(aoffset3 + 4); ctemp22 = *(aoffset3 + 5); ctemp23 = *(aoffset3 + 6); ctemp24 = *(aoffset3 + 7); ctemp25 = *(aoffset4 + 0); ctemp26 = *(aoffset4 + 1); ctemp27 = *(aoffset4 + 2); ctemp28 = *(aoffset4 + 3); ctemp29 = *(aoffset4 + 4); ctemp30 = *(aoffset4 + 5); ctemp31 = *(aoffset4 + 6); ctemp32 = *(aoffset4 + 7); *(boffset1 + 0) = ctemp01; *(boffset1 + 1) = ctemp02; *(boffset1 + 2) = ctemp03; *(boffset1 + 3) = ctemp04; *(boffset1 + 4) = ctemp05; *(boffset1 + 5) = ctemp06; *(boffset1 + 6) = ctemp07; *(boffset1 + 7) = ctemp08; *(boffset1 + 8) = ctemp09; *(boffset1 + 9) = ctemp10; *(boffset1 + 10) = ctemp11; *(boffset1 + 11) = ctemp12; *(boffset1 + 12) = ctemp13; *(boffset1 + 13) = ctemp14; *(boffset1 + 14) = ctemp15; *(boffset1 + 15) = ctemp16; *(boffset1 + 16) = ctemp17; *(boffset1 + 17) = ctemp18; *(boffset1 + 18) = ctemp19; *(boffset1 + 19) = ctemp20; *(boffset1 + 20) = ctemp21; *(boffset1 + 21) = ctemp22; *(boffset1 + 22) = ctemp23; *(boffset1 + 23) = ctemp24; *(boffset1 + 24) = ctemp25; *(boffset1 + 25) = ctemp26; *(boffset1 + 26) = ctemp27; *(boffset1 + 27) = ctemp28; *(boffset1 + 28) = ctemp29; *(boffset1 + 29) = ctemp30; *(boffset1 + 30) = ctemp31; *(boffset1 + 31) = ctemp32; aoffset1 += 8; aoffset2 += 8; aoffset3 += 8; aoffset4 += 8; boffset1 += m * 8; i --; }while(i > 0); } if (n & 2){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); ctemp09 = *(aoffset3 + 0); ctemp10 = *(aoffset3 + 1); ctemp11 = *(aoffset3 + 2); ctemp12 = *(aoffset3 + 3); ctemp13 = *(aoffset4 + 0); ctemp14 = *(aoffset4 + 1); ctemp15 = *(aoffset4 + 2); ctemp16 = *(aoffset4 + 3); *(boffset2 + 0) = ctemp01; *(boffset2 + 1) = ctemp02; *(boffset2 + 2) = ctemp03; *(boffset2 + 3) = ctemp04; *(boffset2 + 4) = ctemp05; *(boffset2 + 5) = ctemp06; *(boffset2 + 6) = ctemp07; *(boffset2 + 7) = ctemp08; *(boffset2 + 8) = ctemp09; 
*(boffset2 + 9) = ctemp10; *(boffset2 + 10) = ctemp11; *(boffset2 + 11) = ctemp12; *(boffset2 + 12) = ctemp13; *(boffset2 + 13) = ctemp14; *(boffset2 + 14) = ctemp15; *(boffset2 + 15) = ctemp16; aoffset1 += 4; aoffset2 += 4; aoffset3 += 4; aoffset4 += 4; boffset2 += 16; } if (n & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); *(boffset3 + 0) = ctemp01; *(boffset3 + 1) = ctemp02; *(boffset3 + 2) = ctemp03; *(boffset3 + 3) = ctemp04; *(boffset3 + 4) = ctemp05; *(boffset3 + 5) = ctemp06; *(boffset3 + 6) = ctemp07; *(boffset3 + 7) = ctemp08; aoffset1 += 2; aoffset2 += 2; aoffset3 += 2; aoffset4 += 2; boffset3 += 8; } j--; }while(j > 0); } if (m & 2){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset += 2 * lda; boffset1 = boffset; boffset += 16; i = (n >> 2); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); ctemp12 = *(aoffset2 + 3); ctemp13 = *(aoffset2 + 4); ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); *(boffset1 + 0) = ctemp01; *(boffset1 + 1) = ctemp02; *(boffset1 + 2) = ctemp03; *(boffset1 + 3) = ctemp04; *(boffset1 + 4) = ctemp05; *(boffset1 + 5) = ctemp06; *(boffset1 + 6) = ctemp07; *(boffset1 + 7) = ctemp08; *(boffset1 + 8) = ctemp09; *(boffset1 + 9) = ctemp10; *(boffset1 + 10) = ctemp11; *(boffset1 + 11) = ctemp12; *(boffset1 + 12) = ctemp13; *(boffset1 + 13) = ctemp14; *(boffset1 + 14) = ctemp15; *(boffset1 + 15) = ctemp16; aoffset1 += 8; aoffset2 += 8; // aoffset3 += 8; // aoffset4 += 8; boffset1 += m * 8; i --; }while(i > 0); } if (n & 2){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); *(boffset2 + 0) = ctemp01; *(boffset2 + 1) = ctemp02; *(boffset2 + 2) = ctemp03; *(boffset2 + 3) = ctemp04; *(boffset2 + 4) = ctemp05; *(boffset2 + 5) = ctemp06; *(boffset2 + 6) = ctemp07; *(boffset2 + 7) = ctemp08; aoffset1 += 4; aoffset2 += 4; boffset2 += 8; } if (n & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); *(boffset3 + 0) = ctemp01; *(boffset3 + 1) = ctemp02; *(boffset3 + 2) = ctemp03; *(boffset3 + 3) = ctemp04; aoffset1 += 2; aoffset2 += 2; boffset3 += 4; } } if (m & 1){ aoffset1 = aoffset; boffset1 = boffset; i = (n >> 2); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); *(boffset1 + 0) = ctemp01; *(boffset1 + 1) = ctemp02; *(boffset1 + 2) = ctemp03; *(boffset1 + 3) = ctemp04; *(boffset1 + 4) = ctemp05; *(boffset1 + 5) = ctemp06; *(boffset1 + 6) = ctemp07; *(boffset1 + 7) = ctemp08; aoffset1 += 8; boffset1 += m * 8; i --; }while(i > 0); } if (n & 2){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); *(boffset2 + 0) = ctemp01; *(boffset2 + 1) = ctemp02; *(boffset2 + 2) = ctemp03; *(boffset2 + 3) = ctemp04; aoffset1 += 4; boffset2 += 4; } if (n & 1){ 
ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); *(boffset3 + 0) = ctemp01; *(boffset3 + 1) = ctemp02; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zgemm_tcopy_4_sandy.c000066400000000000000000000161661313527062700221740ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ #include #include "common.h" int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) { BLASLONG i,j; BLASLONG idx=0; BLASLONG ii; FLOAT *src0,*src1,*src2,*src3,*dest0; FLOAT *dest1,*dest2; ii = col&-4; ii = ii*(2*row); dest2 = dest+ii; ii = col&-2; ii = ii*(2*row); dest1 = dest+ii; for (j=0; j #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *aoffset; FLOAT *aoffset1, *aoffset2; FLOAT *boffset; FLOAT ctemp01, ctemp02, ctemp03, ctemp04; FLOAT ctemp05, ctemp06, ctemp07, ctemp08; FLOAT ctemp09, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; FLOAT ctemp17, ctemp18, ctemp19, ctemp20; FLOAT ctemp21, ctemp22, ctemp23, ctemp24; FLOAT ctemp25, ctemp26, ctemp27, ctemp28; FLOAT ctemp29, ctemp30, ctemp31, ctemp32; aoffset = a; boffset = b; lda *= 2; #if 0 fprintf(stderr, "M = %d N = %d\n", m, n); #endif j = (n >> 3); if (j > 0){ do{ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 16; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); ctemp09 = *(aoffset1 + 8); ctemp10 = *(aoffset1 + 9); ctemp11 = *(aoffset1 + 10); ctemp12 = *(aoffset1 + 11); ctemp13 = *(aoffset1 + 12); ctemp14 = *(aoffset1 + 13); ctemp15 = *(aoffset1 + 14); ctemp16 = *(aoffset1 + 15); ctemp17 = *(aoffset2 + 0); ctemp18 = *(aoffset2 + 1); ctemp19 = *(aoffset2 + 2); ctemp20 = *(aoffset2 + 3); ctemp21 = *(aoffset2 + 4); ctemp22 = *(aoffset2 + 5); ctemp23 = *(aoffset2 + 6); ctemp24 = *(aoffset2 + 7); ctemp25 = *(aoffset2 + 8); ctemp26 = *(aoffset2 + 9); ctemp27 = *(aoffset2 + 10); ctemp28 = *(aoffset2 + 11); ctemp29 = *(aoffset2 + 12); ctemp30 = *(aoffset2 + 13); ctemp31 = *(aoffset2 + 14); ctemp32 = *(aoffset2 + 15); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; *(boffset + 4) = ctemp05; *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; *(boffset + 8) = ctemp09; *(boffset + 9) = ctemp10; *(boffset + 10) = ctemp11; *(boffset + 11) = ctemp12; *(boffset + 12) = ctemp13; *(boffset + 13) = ctemp14; *(boffset + 14) = ctemp15; *(boffset + 15) = ctemp16; *(boffset + 16) = ctemp17; *(boffset + 17) = ctemp18; *(boffset + 18) = ctemp19; *(boffset + 19) = ctemp20; *(boffset + 20) = ctemp21; *(boffset + 21) = ctemp22; *(boffset + 22) = ctemp23; *(boffset + 23) = ctemp24; *(boffset + 24) = ctemp25; *(boffset + 25) = ctemp26; *(boffset + 26) = ctemp27; *(boffset + 27) = ctemp28; *(boffset + 28) = ctemp29; *(boffset + 29) = ctemp30; *(boffset + 30) = ctemp31; *(boffset + 31) = ctemp32; aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 32; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); ctemp09 = *(aoffset1 + 8); ctemp10 = *(aoffset1 + 9); ctemp11 = *(aoffset1 + 10); ctemp12 = *(aoffset1 + 11); ctemp13 = *(aoffset1 + 12); ctemp14 = *(aoffset1 + 13); ctemp15 = *(aoffset1 + 14); ctemp16 = *(aoffset1 + 15); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; *(boffset + 4) = ctemp05; *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; 
*(boffset + 7) = ctemp08; *(boffset + 8) = ctemp09; *(boffset + 9) = ctemp10; *(boffset + 10) = ctemp11; *(boffset + 11) = ctemp12; *(boffset + 12) = ctemp13; *(boffset + 13) = ctemp14; *(boffset + 14) = ctemp15; *(boffset + 15) = ctemp16; boffset += 16; } j--; }while(j > 0); } /* end of if(j > 0) */ if (n & 4){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 8; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); ctemp12 = *(aoffset2 + 3); ctemp13 = *(aoffset2 + 4); ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; *(boffset + 4) = ctemp05; *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; *(boffset + 8) = ctemp09; *(boffset + 9) = ctemp10; *(boffset + 10) = ctemp11; *(boffset + 11) = ctemp12; *(boffset + 12) = ctemp13; *(boffset + 13) = ctemp14; *(boffset + 14) = ctemp15; *(boffset + 15) = ctemp16; aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 16; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; *(boffset + 4) = ctemp05; *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; boffset += 8; } } if (n & 2){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 4; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; *(boffset + 4) = ctemp05; *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 8; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; boffset += 4; } } if (n & 1){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 2; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 4; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; boffset += 2; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zgemm_tcopy_8_sandy.c000066400000000000000000000265101313527062700221720ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include #include "common.h" int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) { BLASLONG i,j; BLASLONG idx=0; BLASLONG ii; FLOAT *src0,*src1,*src2,*src3,*dest0; FLOAT *dest1,*dest2,*dest4; ii = col&-8; ii = ii*(2*row); dest4 = dest+ii; ii = col&-4; ii = ii*(2*row); dest2 = dest+ii; ii = col&-2; ii = ii*(2*row); dest1 = dest+ii; for (j=0; j #include "common.h" int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){ FLOAT *X = x; if (incx != 1) { X = buffer; COPY_K(m, x, incx, X, 1); } lda *= 2; incy *= 2; while (n > 0) { FLOAT beta_r = y[0]; FLOAT beta_i = y[1]; #ifndef XCONJ AXPYU_K #else AXPYC_K #endif (m, 0, 0, #ifndef CONJ alpha_r * beta_r - alpha_i * beta_i, alpha_r * beta_i + alpha_i * beta_r, #else alpha_r * beta_r + alpha_i * beta_i, -alpha_r * beta_i + alpha_i * beta_r, #endif X, 1, a, 1, NULL, 0); a += lda; y += incy; n --; } return 0; } OpenBLAS-0.2.20/kernel/generic/zhemm3m_lcopy_1.c000066400000000000000000000074671313527062700212300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i, js, offset; FLOAT data01; FLOAT *ao1; lda *= 2; js = n; while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); } else if (offset < 0) { data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); } else { data01 = CMULT(*(ao1 + 0), ZERO); } if (offset > 0) ao1 += lda; else ao1 += 2; b[ 0] = data01; b ++; offset --; i --; } posX ++; js --; } return 0; } OpenBLAS-0.2.20/kernel/generic/zhemm3m_lcopy_2.c000066400000000000000000000114311313527062700212130ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02; FLOAT *ao1, *ao2; lda *= 2; js = (n >> 1); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; i = m; while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); } else if (offset < -1) { data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); } else { switch (offset) { case 0 : data01 = CMULT(*(ao1 + 0), ZERO); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); break; case -1 : data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), ZERO); break; } } if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; js --; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); } else if (offset < 0) { data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); } else { data01 = CMULT(*(ao1 + 0), ZERO); } if (offset > 0) ao1 += lda; else ao1 += 2; b[ 0] = data01; b ++; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zhemm3m_lcopy_4.c000066400000000000000000000155401313527062700212220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04; FLOAT *ao1, *ao2, *ao3, *ao4; lda *= 2; js = (n >> 2); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; i = m; while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); } else if (offset < -3) { data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); } else { switch (offset) { case 0 : data01 = CMULT(*(ao1 + 0), ZERO); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); break; case -1 : data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), ZERO); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); break; case -2 : data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); data03 = CMULT(*(ao3 + 0), ZERO); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); break; case -3 : data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), 
-*(ao2 + 1)); data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); data04 = CMULT(*(ao4 + 0), ZERO); break; } } if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; if (offset > -2) ao3 += lda; else ao3 += 2; if (offset > -3) ao4 += lda; else ao4 += 2; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 4; js --; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; i = m; while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); } else if (offset < -1) { data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); } else { switch (offset) { case 0 : data01 = CMULT(*(ao1 + 0), ZERO); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); break; case -1 : data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), ZERO); break; } } if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); } else if (offset < 0) { data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); } else { data01 = CMULT(*(ao1 + 0), ZERO); } if (offset > 0) ao1 += lda; else ao1 += 2; b[ 0] = data01; b ++; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zhemm3m_lcopy_8.c000066400000000000000000000303051313527062700212220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; lda *= 2; js = (n >> 3); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; i = m; while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); } else if (offset < -7) { data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); } else { switch (offset) { case 0 : data01 = CMULT(*(ao1 + 0), ZERO); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); break; case -1 : data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), ZERO); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); break; case -2 : data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); data03 = CMULT(*(ao3 + 0), ZERO); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); data06 = CMULT(*(ao6 + 
0), *(ao6 + 1)); data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); break; case -3 : data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); data04 = CMULT(*(ao4 + 0), ZERO); data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); break; case -4 : data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); data05 = CMULT(*(ao5 + 0), ZERO); data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); break; case -5 : data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); data06 = CMULT(*(ao6 + 0), ZERO); data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); break; case -6 : data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); data07 = CMULT(*(ao7 + 0), ZERO); data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); break; case -7 : data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); data08 = CMULT(*(ao8 + 0), ZERO); break; } } if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; if (offset > -2) ao3 += lda; else ao3 += 2; if (offset > -3) ao4 += lda; else ao4 += 2; if (offset > -4) ao5 += lda; else ao5 += 2; if (offset > -5) ao6 += lda; else ao6 += 2; if (offset > -6) ao7 += lda; else ao7 += 2; if (offset > -7) ao8 += lda; else ao8 += 2; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b += 8; offset --; i --; } posX += 8; js --; } if (n & 4) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; i = m; while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); } else if (offset < -3) { data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); } else { switch (offset) { case 0 : data01 = CMULT(*(ao1 + 0), ZERO); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); break; case -1 : data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), ZERO); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); break; case -2 : data01 = CMULT(*(ao1 + 0), -*(ao1 + 
1)); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); data03 = CMULT(*(ao3 + 0), ZERO); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); break; case -3 : data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); data04 = CMULT(*(ao4 + 0), ZERO); break; } } if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; if (offset > -2) ao3 += lda; else ao3 += 2; if (offset > -3) ao4 += lda; else ao4 += 2; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 4; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; i = m; while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); } else if (offset < -1) { data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); } else { switch (offset) { case 0 : data01 = CMULT(*(ao1 + 0), ZERO); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); break; case -1 : data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), ZERO); break; } } if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); } else if (offset < 0) { data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); } else { data01 = CMULT(*(ao1 + 0), ZERO); } if (offset > 0) ao1 += lda; else ao1 += 2; b[ 0] = data01; b ++; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zhemm3m_ucopy_1.c000066400000000000000000000074701313527062700212330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i, js, offset; FLOAT data01; FLOAT *ao1; lda *= 2; js = n; while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); } else if (offset < 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); } else { data01 = CMULT(*(ao1 + 0), ZERO); } if (offset > 0) ao1 += 2; else ao1 += lda; b[ 0] = data01; b ++; offset --; i --; } posX ++; js --; } return 0; } OpenBLAS-0.2.20/kernel/generic/zhemm3m_ucopy_2.c000066400000000000000000000114371313527062700212320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02; FLOAT *ao1, *ao2; lda *= 2; js = (n >> 1); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; i = m; while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); } else if (offset < -1) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); } else { switch (offset) { case 0 : data01 = CMULT(*(ao1 + 0), ZERO); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); break; case -1 : data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), ZERO); break; } } if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; js --; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); } else if (offset < 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); } else { data01 = CMULT(*(ao1 + 0), ZERO); } if (offset > 0) ao1 += 2; else ao1 += lda; b[ 0] = data01; b ++; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zhemm3m_ucopy_4.c000066400000000000000000000155571313527062700212430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04; FLOAT *ao1, *ao2, *ao3, *ao4; lda *= 2; js = (n >> 2); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; i = m; while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); } else if (offset < -3) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); } else { switch (offset) { case 0 : data01 = CMULT(*(ao1 + 0), ZERO); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); break; case -1 : data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), ZERO); data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); break; case -2 : data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), ZERO); data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); break; case -3 : data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), ZERO); break; } } if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; if (offset > -3) ao4 += 2; else ao4 += lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 4; js --; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX 
+ 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; i = m; while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); } else if (offset < -1) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); } else { switch (offset) { case 0 : data01 = CMULT(*(ao1 + 0), ZERO); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); break; case -1 : data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), ZERO); break; } } if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); } else if (offset < 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); } else { data01 = CMULT(*(ao1 + 0), ZERO); } if (offset > 0) ao1 += 2; else ao1 += lda; b[ 0] = data01; b ++; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zhemm3m_ucopy_8.c000066400000000000000000000303051313527062700212330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; lda *= 2; js = (n >> 3); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; i = m; while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); } else if (offset < -7) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); } else { switch (offset) { case 0 : data01 = CMULT(*(ao1 + 0), ZERO); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); break; case -1 : data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), ZERO); data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); break; case -2 : data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), ZERO); data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); break; case -3 : data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = 
CMULT(*(ao4 + 0), ZERO); data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); break; case -4 : data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); data05 = CMULT(*(ao5 + 0), ZERO); data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); break; case -5 : data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); data06 = CMULT(*(ao6 + 0), ZERO); data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); break; case -6 : data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); data07 = CMULT(*(ao7 + 0), ZERO); data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); break; case -7 : data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); data08 = CMULT(*(ao8 + 0), ZERO); break; } } if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; if (offset > -3) ao4 += 2; else ao4 += lda; if (offset > -4) ao5 += 2; else ao5 += lda; if (offset > -5) ao6 += 2; else ao6 += lda; if (offset > -6) ao7 += 2; else ao7 += lda; if (offset > -7) ao8 += 2; else ao8 += lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b += 8; offset --; i --; } posX += 8; js --; } if (n & 4) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; i = m; while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); } else if (offset < -3) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); } else { switch (offset) { case 0 : data01 = CMULT(*(ao1 + 0), ZERO); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); break; case -1 : data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), ZERO); data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); break; case -2 : data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), ZERO); data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); break; case -3 : data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 
1)); data04 = CMULT(*(ao4 + 0), ZERO); break; } } if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; if (offset > -3) ao4 += 2; else ao4 += lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 4; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; i = m; while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); } else if (offset < -1) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); } else { switch (offset) { case 0 : data01 = CMULT(*(ao1 + 0), ZERO); data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); break; case -1 : data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), ZERO); break; } } if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); } else if (offset < 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); } else { data01 = CMULT(*(ao1 + 0), ZERO); } if (offset > 0) ao1 += 2; else ao1 += lda; b[ 0] = data01; b ++; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zhemm_ltcopy_1.c000066400000000000000000000066121313527062700211430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02; FLOAT *ao1; lda *= 2; js = n; while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > 0) { b[ 0] = data01; b[ 1] = data02; } else if (offset < 0) { b[ 0] = data01; b[ 1] = -data02; } else { b[ 0] = data01; b[ 1] = ZERO; } b += 2; offset --; i --; } posX ++; js --; } return 0; } OpenBLAS-0.2.20/kernel/generic/zhemm_ltcopy_2.c000066400000000000000000000106371313527062700211460ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04; FLOAT *ao1, *ao2; lda *= 2; js = (n >> 1); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; if (offset > 0) { b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; } else if (offset < -1) { b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; } else { switch (offset) { case 0 : b[ 0] = data01; b[ 1] = ZERO; b[ 2] = data03; b[ 3] = data04; break; case -1 : b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = ZERO; break; } } b += 4; offset --; i --; } posX += 2; js --; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > 0) { b[ 0] = data01; b[ 1] = data02; } else if (offset < 0) { b[ 0] = data01; b[ 1] = -data02; } else { b[ 0] = data01; b[ 1] = ZERO; } b += 2; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zhemm_ltcopy_4.c000066400000000000000000000150601313527062700211430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT *ao1, *ao2, *ao3, *ao4; lda *= 2; js = (n >> 2); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); data05 = *(ao3 + 0); data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; if (offset > -2) ao3 += lda; else ao3 += 2; if (offset > -3) ao4 += lda; else ao4 += 2; if (offset > 0) { b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; } else if (offset < -3) { b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; b[ 4] = data05; b[ 5] = -data06; b[ 6] = data07; b[ 7] = -data08; } else { switch (offset) { case 0 : b[ 0] = data01; b[ 1] = ZERO; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; break; case -1 : b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = ZERO; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; break; case -2 : b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; b[ 4] = data05; b[ 5] = ZERO; b[ 6] = data07; b[ 7] = data08; break; case -3 : b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; b[ 4] = data05; b[ 5] = -data06; b[ 6] = data07; b[ 7] = ZERO; break; } } b += 8; offset --; i --; } posX += 4; js --; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; if (offset > 0) { b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; } else if (offset < -1) { b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; } else { switch (offset) { case 0 : b[ 0] = data01; b[ 1] = ZERO; b[ 2] = data03; b[ 3] = data04; break; case -1 : b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = ZERO; break; } } b += 4; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > 0) { b[ 0] = data01; b[ 1] = data02; } else if (offset < 0) { b[ 0] = data01; b[ 1] = -data02; } else { b[ 0] = data01; b[ 1] = ZERO; } b += 2; offset --; i --; } } return 0; } 
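/* ------------------------------------------------------------------------- */
/* Illustrative sketch, not part of OpenBLAS: a plain C reference for the     */
/* zhemm_ltcopy_{1,2,4}.c kernels above, which pack a panel of a Hermitian    */
/* matrix whose lower triangle is stored column-major.  Entries above the     */
/* diagonal are fetched from the mirrored stored element unchanged, entries   */
/* below the diagonal are fetched in place with the imaginary part negated,   */
/* and the diagonal imaginary part is written as zero, matching the           */
/* offset > 0, offset < 0 and offset == 0 branches of the kernels.  Element   */
/* order follows the unroll-by-1 variant; the _2 and _4 kernels interleave    */
/* 2 or 4 columns per row step.  double and long stand in for the FLOAT and   */
/* BLASLONG build macros, and all names here are hypothetical.  Compile this  */
/* fragment on its own to run the small driver at the end.                    */
/* ------------------------------------------------------------------------- */
#include <stdio.h>

static void zhemm_ltcopy_ref(long m, long n, const double *a, long lda,
                             long posX, long posY, double *b) {
  for (long js = 0; js < n; js++) {        /* one packed column per iteration */
    long col = posX + js;
    for (long i = 0; i < m; i++) {
      long row = posY + i;
      const double *s;
      if (col > row) {                     /* above diagonal: mirrored entry  */
        s = a + 2 * (col + row * lda);
        b[0] = s[0]; b[1] = s[1];
      } else if (col < row) {              /* below diagonal: conjugated      */
        s = a + 2 * (row + col * lda);
        b[0] = s[0]; b[1] = -s[1];
      } else {                             /* diagonal: imaginary part -> 0   */
        s = a + 2 * (row + col * lda);
        b[0] = s[0]; b[1] = 0.0;
      }
      b += 2;
    }
  }
}

/* Driver: pack a full 3x3 panel (posX = posY = 0) of a Hermitian matrix      */
/* stored as interleaved (re,im) pairs in its lower triangle.  The junk       */
/* imaginary parts on the diagonal (9.0) are zeroed by the packing.           */
int main(void) {
  double a[2 * 3 * 3] = {
    /* column 0 */ 1.0, 9.0,  2.0, 1.0,  3.0, 2.0,
    /* column 1 */ 0.0, 0.0,  4.0, 9.0,  5.0, 3.0,
    /* column 2 */ 0.0, 0.0,  0.0, 0.0,  6.0, 9.0,
  };
  double b[2 * 3 * 3];
  zhemm_ltcopy_ref(3, 3, a, 3, 0, 0, b);
  for (int k = 0; k < 9; k++)
    printf("(% .1f,% .1f)%s", b[2 * k], b[2 * k + 1], (k % 3 == 2) ? "\n" : " ");
  return 0;
}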
OpenBLAS-0.2.20/kernel/generic/zhemm_ltcopy_8.c000066400000000000000000000276651313527062700211650ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; lda *= 2; js = (n >> 3); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); data05 = *(ao3 + 0); data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); data09 = *(ao5 + 0); data10 = *(ao5 + 1); data11 = *(ao6 + 0); data12 = *(ao6 + 1); data13 = *(ao7 + 0); data14 = *(ao7 + 1); data15 = *(ao8 + 0); data16 = *(ao8 + 1); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; if (offset > -2) ao3 += lda; else ao3 += 2; if (offset > -3) ao4 += lda; else ao4 += 2; if (offset > -4) ao5 += lda; else ao5 += 2; if (offset > -5) ao6 += lda; else ao6 += 2; if (offset > -6) ao7 += lda; else ao7 += 2; if (offset > -7) ao8 += lda; else ao8 += 2; if (offset > 0) { b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; } else if (offset < -7) { b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; b[ 4] = data05; b[ 5] = -data06; b[ 6] = data07; b[ 7] = -data08; b[ 8] = data09; b[ 9] = -data10; b[10] = data11; b[11] = -data12; b[12] = data13; b[13] = -data14; b[14] = data15; b[15] = -data16; } else { switch (offset) { case 0 : b[ 0] = data01; b[ 1] = ZERO; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; break; case -1 : b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = ZERO; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; break; case -2 : b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; b[ 4] = data05; b[ 5] = ZERO; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; break; case -3 : b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; b[ 4] = data05; b[ 5] = -data06; b[ 6] = data07; b[ 7] = ZERO; b[ 8] = data09; 
b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; break; case -4 : b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; b[ 4] = data05; b[ 5] = -data06; b[ 6] = data07; b[ 7] = -data08; b[ 8] = data09; b[ 9] = ZERO; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; break; case -5 : b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; b[ 4] = data05; b[ 5] = -data06; b[ 6] = data07; b[ 7] = -data08; b[ 8] = data09; b[ 9] = -data10; b[10] = data11; b[11] = ZERO; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; break; case -6 : b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; b[ 4] = data05; b[ 5] = -data06; b[ 6] = data07; b[ 7] = -data08; b[ 8] = data09; b[ 9] = -data10; b[10] = data11; b[11] = -data12; b[12] = data13; b[13] = ZERO; b[14] = data15; b[15] = data16; break; case -7 : b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; b[ 4] = data05; b[ 5] = -data06; b[ 6] = data07; b[ 7] = -data08; b[ 8] = data09; b[ 9] = -data10; b[10] = data11; b[11] = -data12; b[12] = data13; b[13] = -data14; b[14] = data15; b[15] = ZERO; break; } } b += 16; offset --; i --; } posX += 8; js --; } if (n & 4) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); data05 = *(ao3 + 0); data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; if (offset > -2) ao3 += lda; else ao3 += 2; if (offset > -3) ao4 += lda; else ao4 += 2; if (offset > 0) { b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; } else if (offset < -3) { b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; b[ 4] = data05; b[ 5] = -data06; b[ 6] = data07; b[ 7] = -data08; } else { switch (offset) { case 0 : b[ 0] = data01; b[ 1] = ZERO; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; break; case -1 : b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = ZERO; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; break; case -2 : b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; b[ 4] = data05; b[ 5] = ZERO; b[ 6] = data07; b[ 7] = data08; break; case -3 : b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; b[ 4] = data05; b[ 5] = -data06; b[ 6] = data07; b[ 7] = ZERO; break; } } b += 8; offset --; i --; } posX += 4; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; if (offset > 0) { b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; } else if 
(offset < -1) { b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; } else { switch (offset) { case 0 : b[ 0] = data01; b[ 1] = ZERO; b[ 2] = data03; b[ 3] = data04; break; case -1 : b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = ZERO; break; } } b += 4; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > 0) { b[ 0] = data01; b[ 1] = data02; } else if (offset < 0) { b[ 0] = data01; b[ 1] = -data02; } else { b[ 0] = data01; b[ 1] = ZERO; } b += 2; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zhemm_utcopy_1.c000066400000000000000000000066101313527062700211520ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02; FLOAT *ao1; lda *= 2; js = n; while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > 0) { b[ 0] = data01; b[ 1] = -data02; } else if (offset < 0) { b[ 0] = data01; b[ 1] = data02; } else { b[ 0] = data01; b[ 1] = ZERO; } b += 2; offset --; i --; } posX ++; js --; } return 0; } OpenBLAS-0.2.20/kernel/generic/zhemm_utcopy_2.c000066400000000000000000000106351313527062700211550ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04; FLOAT *ao1, *ao2; lda *= 2; js = (n >> 1); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > 0) { b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; } else if (offset < -1) { b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; } else { switch (offset) { case 0 : b[ 0] = data01; b[ 1] = ZERO; b[ 2] = data03; b[ 3] = -data04; break; case -1 : b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = ZERO; break; } } b += 4; offset --; i --; } posX += 2; js --; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > 0) { b[ 0] = data01; b[ 1] = -data02; } else if (offset < 0) { b[ 0] = data01; b[ 1] = data02; } else { b[ 0] = data01; b[ 1] = ZERO; } b += 2; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zhemm_utcopy_4.c000066400000000000000000000150561313527062700211610ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT *ao1, *ao2, *ao3, *ao4; lda *= 2; js = (n >> 2); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); data05 = *(ao3 + 0); data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; if (offset > -3) ao4 += 2; else ao4 += lda; if (offset > 0) { b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; b[ 4] = data05; b[ 5] = -data06; b[ 6] = data07; b[ 7] = -data08; } else if (offset < -3) { b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; } else { switch (offset) { case 0 : b[ 0] = data01; b[ 1] = ZERO; b[ 2] = data03; b[ 3] = -data04; b[ 4] = data05; b[ 5] = -data06; b[ 6] = data07; b[ 7] = -data08; break; case -1 : b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = ZERO; b[ 4] = data05; b[ 5] = -data06; b[ 6] = data07; b[ 7] = -data08; break; case -2 : b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = ZERO; b[ 6] = data07; b[ 7] = -data08; break; case -3 : b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = ZERO; break; } } b += 8; offset --; i --; } posX += 4; js --; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > 0) { b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; } else if (offset < -1) { b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; } else { switch (offset) { case 0 : b[ 0] = data01; b[ 1] = ZERO; b[ 2] = data03; b[ 3] = -data04; break; case -1 : b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = ZERO; break; } } b += 4; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > 0) { b[ 0] = data01; b[ 1] = -data02; } else if (offset < 0) { b[ 0] = data01; b[ 1] = data02; } else { b[ 0] = data01; b[ 1] = ZERO; } b += 2; offset --; i --; } } return 0; } 
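/* ------------------------------------------------------------------------- */
/* Illustrative sketch, not part of OpenBLAS: the upper-triangle counterpart  */
/* of the reference routine sketched after zhemm_ltcopy_4.c above.  With the  */
/* Hermitian matrix stored in its upper triangle the branches flip: entries   */
/* above the diagonal are read in place and conjugated, entries below the     */
/* diagonal are read from the mirrored stored element unchanged, and the      */
/* diagonal imaginary part is written as zero, mirroring the offset tests in  */
/* zhemm_utcopy_{1,2,4}.c.  double and long stand in for FLOAT and BLASLONG;  */
/* element order follows the unroll-by-1 kernel (the _2 and _4 variants       */
/* interleave columns).  Names are hypothetical; the earlier driver shows one */
/* way to exercise it.                                                        */
/* ------------------------------------------------------------------------- */
static void zhemm_utcopy_ref(long m, long n, const double *a, long lda,
                             long posX, long posY, double *b) {
  for (long js = 0; js < n; js++) {
    long col = posX + js;
    for (long i = 0; i < m; i++) {
      long row = posY + i;
      const double *s;
      if (col > row) {                  /* above diagonal: stored, conjugated */
        s = a + 2 * (row + col * lda);
        b[0] = s[0]; b[1] = -s[1];
      } else if (col < row) {           /* below diagonal: mirrored, as-is    */
        s = a + 2 * (col + row * lda);
        b[0] = s[0]; b[1] = s[1];
      } else {                          /* diagonal: imaginary part -> 0      */
        s = a + 2 * (row + col * lda);
        b[0] = s[0]; b[1] = 0.0;
      }
      b += 2;
    }
  }
}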
OpenBLAS-0.2.20/kernel/generic/zhemm_utcopy_8.c000066400000000000000000000276621313527062700211730ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; lda *= 2; js = (n >> 3); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); data05 = *(ao3 + 0); data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); data09 = *(ao5 + 0); data10 = *(ao5 + 1); data11 = *(ao6 + 0); data12 = *(ao6 + 1); data13 = *(ao7 + 0); data14 = *(ao7 + 1); data15 = *(ao8 + 0); data16 = *(ao8 + 1); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; if (offset > -3) ao4 += 2; else ao4 += lda; if (offset > -4) ao5 += 2; else ao5 += lda; if (offset > -5) ao6 += 2; else ao6 += lda; if (offset > -6) ao7 += 2; else ao7 += lda; if (offset > -7) ao8 += 2; else ao8 += lda; if (offset > 0) { b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; b[ 4] = data05; b[ 5] = -data06; b[ 6] = data07; b[ 7] = -data08; b[ 8] = data09; b[ 9] = -data10; b[10] = data11; b[11] = -data12; b[12] = data13; b[13] = -data14; b[14] = data15; b[15] = -data16; } else if (offset < -7) { b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; } else { switch (offset) { case 0 : b[ 0] = data01; b[ 1] = ZERO; b[ 2] = data03; b[ 3] = -data04; b[ 4] = data05; b[ 5] = -data06; b[ 6] = data07; b[ 7] = -data08; b[ 8] = data09; b[ 9] = -data10; b[10] = data11; b[11] = -data12; b[12] = data13; b[13] = -data14; b[14] = data15; b[15] = -data16; break; case -1 : b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = ZERO; b[ 4] = data05; b[ 5] = -data06; b[ 6] = data07; b[ 7] = -data08; b[ 8] = data09; b[ 9] = -data10; b[10] = data11; b[11] = -data12; b[12] = data13; b[13] = -data14; b[14] = data15; b[15] = -data16; break; case -2 : b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = ZERO; b[ 6] = data07; b[ 7] = -data08; b[ 8] = data09; b[ 9] = -data10; b[10] = data11; b[11] = -data12; b[12] = data13; b[13] = -data14; b[14] = data15; b[15] = -data16; break; case -3 : b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = ZERO; b[ 
8] = data09; b[ 9] = -data10; b[10] = data11; b[11] = -data12; b[12] = data13; b[13] = -data14; b[14] = data15; b[15] = -data16; break; case -4 : b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = ZERO; b[10] = data11; b[11] = -data12; b[12] = data13; b[13] = -data14; b[14] = data15; b[15] = -data16; break; case -5 : b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = ZERO; b[12] = data13; b[13] = -data14; b[14] = data15; b[15] = -data16; break; case -6 : b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = ZERO; b[14] = data15; b[15] = -data16; break; case -7 : b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = ZERO; break; } } b += 16; offset --; i --; } posX += 8; js --; } if (n & 4) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); data05 = *(ao3 + 0); data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; if (offset > -3) ao4 += 2; else ao4 += lda; if (offset > 0) { b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; b[ 4] = data05; b[ 5] = -data06; b[ 6] = data07; b[ 7] = -data08; } else if (offset < -3) { b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; } else { switch (offset) { case 0 : b[ 0] = data01; b[ 1] = ZERO; b[ 2] = data03; b[ 3] = -data04; b[ 4] = data05; b[ 5] = -data06; b[ 6] = data07; b[ 7] = -data08; break; case -1 : b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = ZERO; b[ 4] = data05; b[ 5] = -data06; b[ 6] = data07; b[ 7] = -data08; break; case -2 : b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = ZERO; b[ 6] = data07; b[ 7] = -data08; break; case -3 : b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = ZERO; break; } } b += 8; offset --; i --; } posX += 4; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > 0) { b[ 0] = data01; b[ 1] = -data02; b[ 2] = data03; b[ 3] = -data04; } else if 
(offset < -1) { b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; } else { switch (offset) { case 0 : b[ 0] = data01; b[ 1] = ZERO; b[ 2] = data03; b[ 3] = -data04; break; case -1 : b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = ZERO; break; } } b += 4; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > 0) { b[ 0] = data01; b[ 1] = -data02; } else if (offset < 0) { b[ 0] = data01; b[ 1] = data02; } else { b[ 0] = data01; b[ 1] = ZERO; } b += 2; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zhemv_k.c000066400000000000000000000130551313527062700176530ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #include "symcopy.h" int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ BLASLONG is, min_i; FLOAT *X = x; FLOAT *Y = y; FLOAT *symbuffer = buffer; FLOAT *gemvbuffer = (FLOAT *)(((BLASLONG)buffer + SYMV_P * SYMV_P * sizeof(FLOAT) * 2 + 4095) & ~4095); FLOAT *bufferY = gemvbuffer; FLOAT *bufferX = gemvbuffer; if (incy != 1) { Y = bufferY; bufferX = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) * 2 + 4095) & ~4095); gemvbuffer = bufferX; COPY_K(m, y, incy, Y, 1); } if (incx != 1) { X = bufferX; gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + m * sizeof(FLOAT) * 2 + 4095) & ~4095); COPY_K(m, x, incx, X, 1); } #ifndef LOWER for(is = m - offset; is < m; is += SYMV_P){ min_i = MIN(m - is, SYMV_P); #else for(is = 0; is < offset; is += SYMV_P){ min_i = MIN(offset - is, SYMV_P); #endif #ifndef LOWER if (is > 0){ #ifndef HEMVREV GEMV_C(is, min_i, 0, alpha_r, alpha_i, a + is * lda * 2, lda, X, 1, Y + is * 2, 1, gemvbuffer); GEMV_N(is, min_i, 0, alpha_r, alpha_i, a + is * lda * 2, lda, X + is * 2, 1, Y, 1, gemvbuffer); #else GEMV_T(is, min_i, 0, alpha_r, alpha_i, a + is * lda * 2, lda, X, 1, Y + is * 2, 1, gemvbuffer); GEMV_R(is, min_i, 0, alpha_r, alpha_i, a + is * lda * 2, lda, X + is * 2, 1, Y, 1, gemvbuffer); #endif } #endif #ifndef HEMVREV #ifdef LOWER ZHEMCOPY_L(min_i, a + (is + is * lda) * 2, lda, symbuffer); #else ZHEMCOPY_U(min_i, a + (is + is * lda) * 2, lda, symbuffer); #endif #else #ifdef LOWER ZHEMCOPY_M(min_i, a + (is + is * lda) * 2, lda, symbuffer); #else ZHEMCOPY_V(min_i, a + (is + is * lda) * 2, lda, symbuffer); #endif #endif GEMV_N(min_i, min_i, 0, alpha_r, alpha_i, symbuffer, min_i, X + is * 2, 1, Y + is * 2, 1, gemvbuffer); #ifdef LOWER if (m - is - min_i > 0){ #ifndef HEMVREV GEMV_C(m - is - min_i, min_i, 0, alpha_r, alpha_i, a + ((is + min_i) + is * lda) * 2, lda, X + (is + min_i) * 2, 1, Y + is * 2, 1, gemvbuffer); GEMV_N(m - is - min_i, min_i, 0, alpha_r, alpha_i, a + ((is + min_i) + is * lda) * 2, lda, X + is * 2, 1, Y + (is + min_i) * 2, 1, gemvbuffer); #else GEMV_T(m - is - min_i, min_i, 0, alpha_r, alpha_i, a + ((is + min_i) + is * lda) * 2, lda, X + (is + min_i) * 2, 1, Y + is * 2, 1, gemvbuffer); GEMV_R(m - is - min_i, min_i, 0, alpha_r, alpha_i, a + ((is + min_i) + is * lda) * 2, lda, X + is * 2, 1, Y + (is + min_i) * 2, 1, gemvbuffer); #endif } #endif } /* end of is */ if (incy != 1) { COPY_K(m, Y, 1, y, incy); } return 0; } OpenBLAS-0.2.20/kernel/generic/zimatcopy_cn.c000066400000000000000000000045051313527062700207070ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" /***************************************************** * 2015-09-07 grisuthedragon ******************************************************/ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a , BLASLONG lda) { BLASLONG i,j,ia; FLOAT *aptr; FLOAT a0, a1; if ( rows <= 0 ) return(0); if ( cols <= 0 ) return(0); if ( alpha_r == 1.0 && alpha_i == 0.0) return(0); aptr = a; lda *= 2; for ( i=0; i #include "common.h" #define a2 (a1 + 2) int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ BLASLONG i, j, ip1, ip2; blasint *piv; FLOAT *a1; FLOAT *b1, *b2; FLOAT A1, A2, A3, A4; FLOAT B1, B2, B3, B4; a -= 2; lda *= 2; k1 --; ipiv += k1; if (n <= 0) return 0; j = n; do { piv = ipiv; a1 = a + (k1 + 1) * 2; ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; b1 = a + ip1; b2 = a + ip2; i = ((k2 - k1) >> 1); if (i > 0) { do { A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); A4 = *(a2 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; *(buffer + 1) = A2; *(buffer + 2) = A3; *(buffer + 3) = A4; } else { *(buffer + 0) = A1; *(buffer + 1) = A2; *(buffer + 2) = B3; *(buffer + 3) = B4; *(b2 + 0) = A3; *(b2 + 1) = A4; } } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A3; *(buffer + 1) = A4; *(buffer + 2) = A1; *(buffer + 3) = A2; } else { *(buffer + 0) = A3; *(buffer + 1) = A4; *(buffer + 2) = B3; *(buffer + 3) = B4; *(b2 + 0) = A1; *(b2 + 1) = A2; } } else { if (b2 == a2) { *(buffer + 0) = B1; *(buffer + 1) = B2; *(buffer + 2) = A3; *(buffer + 3) = A4; *(b1 + 0) = A1; *(b1 + 1) = A2; } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = B2; *(buffer + 2) = A1; *(buffer + 3) = A2; *(b1 + 0) = A3; *(b1 + 1) = A4; } else { *(buffer + 0) = B1; *(buffer + 1) = B2; *(buffer + 2) = B3; *(buffer + 3) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b2 + 0) = A3; *(b2 + 1) = A4; } } buffer += 4; b1 = a + ip1; b2 = a + ip2; a1 += 4; i --; } while (i > 0); } i = ((k2 - k1) & 1); if (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); if (a1 == b1) { *(buffer + 0) = A1; *(buffer + 1) = A2; } else { *(buffer + 0) = B1; *(buffer + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; } buffer += 2; } a += lda; j --; } while (j > 0); return 0; } OpenBLAS-0.2.20/kernel/generic/zlaswp_ncopy_2.c000066400000000000000000000207421313527062700211620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #define a2 (a1 + 2) #define a4 (a3 + 2) int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ BLASLONG i, j, ip1, ip2; blasint *piv; FLOAT *a1, *a3; FLOAT *b1, *b2, *b3, *b4; FLOAT A1, A2, A3, A4; FLOAT A5, A6, A7, A8; FLOAT B1, B2, B3, B4; FLOAT B5, B6, B7, B8; a -= 2; lda *= 2; k1 --; ipiv += k1; if (n <= 0) return 0; j = (n >> 1); if (j > 0) { do { piv = ipiv; a1 = a + (k1 + 1) * 2; a3 = a1 + lda; ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; b1 = a + ip1; b2 = a + ip2; b3 = b1 + lda; b4 = b2 + lda; i = ((k2 - k1) >> 1); if (i > 0) { do { A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); A4 = *(a2 + 1); A5 = *(a3 + 0); A6 = *(a3 + 1); A7 = *(a4 + 0); A8 = *(a4 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); B5 = *(b3 + 0); B6 = *(b3 + 1); B7 = *(b4 + 0); B8 = *(b4 + 1); ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; *(buffer + 1) = A2; *(buffer + 2) = A5; *(buffer + 3) = A6; *(buffer + 4) = A3; *(buffer + 5) = A4; *(buffer + 6) = A7; *(buffer + 7) = A8; } else { *(buffer + 0) = A1; *(buffer + 1) = A2; *(buffer + 2) = A5; *(buffer + 3) = A6; *(buffer + 4) = B3; *(buffer + 5) = B4; *(buffer + 6) = B7; *(buffer + 7) = B8; *(b2 + 0) = A3; *(b2 + 1) = A4; *(b4 + 0) = A7; *(b4 + 1) = A8; } } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A3; *(buffer + 1) = A4; *(buffer + 2) = A7; *(buffer + 3) = A8; *(buffer + 4) = A1; *(buffer + 5) = A2; *(buffer + 6) = A5; *(buffer + 7) = A6; } else { *(buffer + 0) = A3; *(buffer + 1) = A4; *(buffer + 2) = A7; *(buffer + 3) = A8; *(buffer + 4) = B3; *(buffer + 5) = B4; *(buffer + 6) = B7; *(buffer + 7) = B8; *(b2 + 0) = A1; *(b2 + 1) = A2; *(b4 + 0) = A5; *(b4 + 1) = A6; } } else 
{ if (b2 == a2) { *(buffer + 0) = B1; *(buffer + 1) = B2; *(buffer + 2) = B5; *(buffer + 3) = B6; *(buffer + 4) = A3; *(buffer + 5) = A4; *(buffer + 6) = A7; *(buffer + 7) = A8; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b3 + 0) = A5; *(b3 + 1) = A6; } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = B2; *(buffer + 2) = B5; *(buffer + 3) = B6; *(buffer + 4) = A1; *(buffer + 5) = A2; *(buffer + 6) = A5; *(buffer + 7) = A6; *(b1 + 0) = A3; *(b1 + 1) = A4; *(b3 + 0) = A7; *(b3 + 1) = A8; } else { *(buffer + 0) = B1; *(buffer + 1) = B2; *(buffer + 2) = B5; *(buffer + 3) = B6; *(buffer + 4) = B3; *(buffer + 5) = B4; *(buffer + 6) = B7; *(buffer + 7) = B8; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b2 + 0) = A3; *(b2 + 1) = A4; *(b3 + 0) = A5; *(b3 + 1) = A6; *(b4 + 0) = A7; *(b4 + 1) = A8; } } buffer += 8; b1 = a + ip1; b2 = a + ip2; b3 = b1 + lda; b4 = b2 + lda; a1 += 4; a3 += 4; i --; } while (i > 0); } i = ((k2 - k1) & 1); if (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); A3 = *(a3 + 0); A4 = *(a3 + 1); B3 = *(b3 + 0); B4 = *(b3 + 1); if (a1 == b1) { *(buffer + 0) = A1; *(buffer + 1) = A2; *(buffer + 2) = A3; *(buffer + 3) = A4; } else { *(buffer + 0) = B1; *(buffer + 1) = B2; *(buffer + 2) = B3; *(buffer + 3) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b3 + 0) = A3; *(b3 + 1) = A4; } buffer += 4; } a += 2 * lda; j --; } while (j > 0); } if (n & 1) { piv = ipiv; a1 = a + (k1 + 1) * 2; ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; b1 = a + ip1; b2 = a + ip2; i = ((k2 - k1) >> 1); if (i > 0) { do { A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); A4 = *(a2 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; *(buffer + 1) = A2; *(buffer + 2) = A3; *(buffer + 3) = A4; } else { *(buffer + 0) = A1; *(buffer + 1) = A2; *(buffer + 2) = B3; *(buffer + 3) = B4; *(b2 + 0) = A3; *(b2 + 1) = A4; } } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A3; *(buffer + 1) = A4; *(buffer + 2) = A1; *(buffer + 3) = A2; } else { *(buffer + 0) = A3; *(buffer + 1) = A4; *(buffer + 2) = B3; *(buffer + 3) = B4; *(b2 + 0) = A1; *(b2 + 1) = A2; } } else { if (b2 == a2) { *(buffer + 0) = B1; *(buffer + 1) = B2; *(buffer + 2) = A3; *(buffer + 3) = A4; *(b1 + 0) = A1; *(b1 + 1) = A2; } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = B2; *(buffer + 2) = A1; *(buffer + 3) = A2; *(b1 + 0) = A3; *(b1 + 1) = A4; } else { *(buffer + 0) = B1; *(buffer + 1) = B2; *(buffer + 2) = B3; *(buffer + 3) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b2 + 0) = A3; *(b2 + 1) = A4; } } buffer += 4; b1 = a + ip1; b2 = a + ip2; a1 += 4; i --; } while (i > 0); } i = ((k2 - k1) & 1); if (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); if (a1 == b1) { *(buffer + 0) = A1; *(buffer + 1) = A2; } else { *(buffer + 0) = B1; *(buffer + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; } buffer += 2; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zlaswp_ncopy_4.c000066400000000000000000000355231313527062700211670ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #define a2 (a1 + 2) #define a4 (a3 + 2) #define a6 (a5 + 2) #define a8 (a7 + 2) int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ BLASLONG i, j, ip1, ip2; blasint *piv; FLOAT *a1, *a3, *a5, *a7; FLOAT *b1, *b2, *b3, *b4; FLOAT *b5, *b6, *b7, *b8; FLOAT A1, A2, A3, A4, A5, A6, A7, A8; FLOAT B1, B2, B3, B4, B5, B6, B7, B8; FLOAT A9, A10, A11, A12, A13, A14, A15, A16; FLOAT B9, B10, B11, B12, B13, B14, B15, B16; a -= 2; lda *= 2; k1 --; ipiv += k1; if (n <= 0) return 0; j = (n >> 2); if (j > 0) { do { piv = ipiv; a1 = a + (k1 + 1) * 2; a3 = a1 + 1 * lda; a5 = a1 + 2 * lda; a7 = a1 + 3 * lda; ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; b1 = a + ip1; b2 = a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; b5 = b1 + 2 * lda; b6 = b2 + 2 * lda; b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; i = ((k2 - k1) >> 1); if (i > 0) { do { A1 = *(a1 + 0); A9 = *(a1 + 1); A2 = *(a2 + 0); A10 = *(a2 + 1); A3 = *(a3 + 0); A11 = *(a3 + 1); A4 = *(a4 + 0); A12 = *(a4 + 1); A5 = *(a5 + 0); A13 = *(a5 + 1); A6 = *(a6 + 0); A14 = *(a6 + 1); A7 = *(a7 + 0); A15 = *(a7 + 1); A8 = *(a8 + 0); A16 = *(a8 + 1); B1 = *(b1 + 0); B9 = *(b1 + 1); B2 = *(b2 + 0); B10 = *(b2 + 1); B3 = *(b3 + 0); B11 = *(b3 + 1); B4 = *(b4 + 0); B12 = *(b4 + 1); B5 = *(b5 + 0); B13 = *(b5 + 1); B6 = *(b6 + 0); B14 = *(b6 + 1); B7 = *(b7 + 0); B15 = *(b7 + 1); B8 = *(b8 + 0); B16 = *(b8 + 1); ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; *(buffer + 1) = A9; *(buffer + 2) = A3; *(buffer + 3) = A11; *(buffer + 4) = A5; *(buffer + 5) = A13; *(buffer + 6) = A7; *(buffer + 7) = A15; *(buffer + 8) = A2; *(buffer + 9) = A10; *(buffer + 10) = A4; *(buffer + 11) = A12; *(buffer + 12) = A6; *(buffer + 13) = A14; *(buffer + 14) = A8; *(buffer + 15) = A16; } else { *(buffer + 0) = A1; *(buffer + 1) = A9; *(buffer + 2) = A3; *(buffer + 3) = A11; *(buffer + 4) = A5; *(buffer + 5) = A13; 
*(buffer + 6) = A7; *(buffer + 7) = A15; *(buffer + 8) = B2; *(buffer + 9) = B10; *(buffer + 10) = B4; *(buffer + 11) = B12; *(buffer + 12) = B6; *(buffer + 13) = B14; *(buffer + 14) = B8; *(buffer + 15) = B16; *(b2 + 0) = A2; *(b2 + 1) = A10; *(b4 + 0) = A4; *(b4 + 1) = A12; *(b6 + 0) = A6; *(b6 + 1) = A14; *(b8 + 0) = A8; *(b8 + 1) = A16; } } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A2; *(buffer + 1) = A10; *(buffer + 2) = A4; *(buffer + 3) = A12; *(buffer + 4) = A6; *(buffer + 5) = A14; *(buffer + 6) = A8; *(buffer + 7) = A16; *(buffer + 8) = A1; *(buffer + 9) = A9; *(buffer + 10) = A3; *(buffer + 11) = A11; *(buffer + 12) = A5; *(buffer + 13) = A13; *(buffer + 14) = A7; *(buffer + 15) = A15; } else { *(buffer + 0) = A2; *(buffer + 1) = A10; *(buffer + 2) = A4; *(buffer + 3) = A12; *(buffer + 4) = A6; *(buffer + 5) = A14; *(buffer + 6) = A8; *(buffer + 7) = A16; *(buffer + 8) = B2; *(buffer + 9) = B10; *(buffer + 10) = B4; *(buffer + 11) = B12; *(buffer + 12) = B6; *(buffer + 13) = B14; *(buffer + 14) = B8; *(buffer + 15) = B16; *(b2 + 0) = A1; *(b2 + 1) = A9; *(b4 + 0) = A3; *(b4 + 1) = A11; *(b6 + 0) = A5; *(b6 + 1) = A13; *(b8 + 0) = A7; *(b8 + 1) = A15; } } else { if (b2 == a2) { *(buffer + 0) = B1; *(buffer + 1) = B9; *(buffer + 2) = B3; *(buffer + 3) = B11; *(buffer + 4) = B5; *(buffer + 5) = B13; *(buffer + 6) = B7; *(buffer + 7) = B15; *(buffer + 8) = A2; *(buffer + 9) = A10; *(buffer + 10) = A4; *(buffer + 11) = A12; *(buffer + 12) = A6; *(buffer + 13) = A14; *(buffer + 14) = A8; *(buffer + 15) = A16; *(b1 + 0) = A1; *(b1 + 1) = A9; *(b3 + 0) = A3; *(b3 + 1) = A11; *(b5 + 0) = A5; *(b5 + 1) = A13; *(b7 + 0) = A7; *(b7 + 1) = A15; } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = B9; *(buffer + 2) = B3; *(buffer + 3) = B11; *(buffer + 4) = B5; *(buffer + 5) = B13; *(buffer + 6) = B7; *(buffer + 7) = B15; *(buffer + 8) = A1; *(buffer + 9) = A9; *(buffer + 10) = A3; *(buffer + 11) = A11; *(buffer + 12) = A5; *(buffer + 13) = A13; *(buffer + 14) = A7; *(buffer + 15) = A15; *(b1 + 0) = A2; *(b1 + 1) = A10; *(b3 + 0) = A4; *(b3 + 1) = A12; *(b5 + 0) = A6; *(b5 + 1) = A14; *(b7 + 0) = A8; *(b7 + 1) = A16; } else { *(buffer + 0) = B1; *(buffer + 1) = B9; *(buffer + 2) = B3; *(buffer + 3) = B11; *(buffer + 4) = B5; *(buffer + 5) = B13; *(buffer + 6) = B7; *(buffer + 7) = B15; *(buffer + 8) = B2; *(buffer + 9) = B10; *(buffer + 10) = B4; *(buffer + 11) = B12; *(buffer + 12) = B6; *(buffer + 13) = B14; *(buffer + 14) = B8; *(buffer + 15) = B16; *(b1 + 0) = A1; *(b1 + 1) = A9; *(b2 + 0) = A2; *(b2 + 1) = A10; *(b3 + 0) = A3; *(b3 + 1) = A11; *(b4 + 0) = A4; *(b4 + 1) = A12; *(b5 + 0) = A5; *(b5 + 1) = A13; *(b6 + 0) = A6; *(b6 + 1) = A14; *(b7 + 0) = A7; *(b7 + 1) = A15; *(b8 + 0) = A8; *(b8 + 1) = A16; } } buffer += 16; b1 = a + ip1; b2 = a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; b5 = b1 + 2 * lda; b6 = b2 + 2 * lda; b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; a1 += 4; a3 += 4; a5 += 4; a7 += 4; i --; } while (i > 0); } i = ((k2 - k1) & 1); if (i > 0) { A1 = *(a1 + 0); A9 = *(a1 + 1); B1 = *(b1 + 0); B9 = *(b1 + 1); A3 = *(a3 + 0); A11 = *(a3 + 1); B3 = *(b3 + 0); B11 = *(b3 + 1); A5 = *(a5 + 0); A13 = *(a5 + 1); B5 = *(b5 + 0); B13 = *(b5 + 1); A7 = *(a7 + 0); A15 = *(a7 + 1); B7 = *(b7 + 0); B15 = *(b7 + 1); if (a1 == b1) { *(buffer + 0) = A1; *(buffer + 1) = A9; *(buffer + 2) = A3; *(buffer + 3) = A11; *(buffer + 4) = A5; *(buffer + 5) = A13; *(buffer + 6) = A7; *(buffer + 7) = A15; } else { *(buffer + 0) = B1; *(buffer + 1) = B9; *(buffer + 2) = B3; *(buffer + 3) = 
B11; *(buffer + 4) = B5; *(buffer + 5) = B13; *(buffer + 6) = B7; *(buffer + 7) = B15; *(b1 + 0) = A1; *(b1 + 1) = A9; *(b3 + 0) = A3; *(b3 + 1) = A11; *(b5 + 0) = A5; *(b5 + 1) = A13; *(b7 + 0) = A7; *(b7 + 1) = A15; } buffer += 8; } a += 4 * lda; j --; } while (j > 0); } if (n & 2) { piv = ipiv; a1 = a + (k1 + 1) * 2; a3 = a1 + lda; ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; b1 = a + ip1; b2 = a + ip2; b3 = b1 + lda; b4 = b2 + lda; i = ((k2 - k1) >> 1); if (i > 0) { do { A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); A4 = *(a2 + 1); A5 = *(a3 + 0); A6 = *(a3 + 1); A7 = *(a4 + 0); A8 = *(a4 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); B5 = *(b3 + 0); B6 = *(b3 + 1); B7 = *(b4 + 0); B8 = *(b4 + 1); ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; *(buffer + 1) = A2; *(buffer + 2) = A5; *(buffer + 3) = A6; *(buffer + 4) = A3; *(buffer + 5) = A4; *(buffer + 6) = A7; *(buffer + 7) = A8; } else { *(buffer + 0) = A1; *(buffer + 1) = A2; *(buffer + 2) = A5; *(buffer + 3) = A6; *(buffer + 4) = B3; *(buffer + 5) = B4; *(buffer + 6) = B7; *(buffer + 7) = B8; *(b2 + 0) = A3; *(b2 + 1) = A4; *(b4 + 0) = A7; *(b4 + 1) = A8; } } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A3; *(buffer + 1) = A4; *(buffer + 2) = A7; *(buffer + 3) = A8; *(buffer + 4) = A1; *(buffer + 5) = A2; *(buffer + 6) = A5; *(buffer + 7) = A6; } else { *(buffer + 0) = A3; *(buffer + 1) = A4; *(buffer + 2) = A7; *(buffer + 3) = A8; *(buffer + 4) = B3; *(buffer + 5) = B4; *(buffer + 6) = B7; *(buffer + 7) = B8; *(b2 + 0) = A1; *(b2 + 1) = A2; *(b4 + 0) = A5; *(b4 + 1) = A6; } } else { if (b2 == a2) { *(buffer + 0) = B1; *(buffer + 1) = B2; *(buffer + 2) = B5; *(buffer + 3) = B6; *(buffer + 4) = A3; *(buffer + 5) = A4; *(buffer + 6) = A7; *(buffer + 7) = A8; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b3 + 0) = A5; *(b3 + 1) = A6; } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = B2; *(buffer + 2) = B5; *(buffer + 3) = B6; *(buffer + 4) = A1; *(buffer + 5) = A2; *(buffer + 6) = A5; *(buffer + 7) = A6; *(b1 + 0) = A3; *(b1 + 1) = A4; *(b3 + 0) = A7; *(b3 + 1) = A8; } else { *(buffer + 0) = B1; *(buffer + 1) = B2; *(buffer + 2) = B5; *(buffer + 3) = B6; *(buffer + 4) = B3; *(buffer + 5) = B4; *(buffer + 6) = B7; *(buffer + 7) = B8; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b2 + 0) = A3; *(b2 + 1) = A4; *(b3 + 0) = A5; *(b3 + 1) = A6; *(b4 + 0) = A7; *(b4 + 1) = A8; } } buffer += 8; b1 = a + ip1; b2 = a + ip2; b3 = b1 + lda; b4 = b2 + lda; a1 += 4; a3 += 4; i --; } while (i > 0); } i = ((k2 - k1) & 1); if (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); A3 = *(a3 + 0); A4 = *(a3 + 1); B3 = *(b3 + 0); B4 = *(b3 + 1); if (a1 == b1) { *(buffer + 0) = A1; *(buffer + 1) = A2; *(buffer + 2) = A3; *(buffer + 3) = A4; } else { *(buffer + 0) = B1; *(buffer + 1) = B2; *(buffer + 2) = B3; *(buffer + 3) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b3 + 0) = A3; *(b3 + 1) = A4; } buffer += 4; } a += 2 * lda; } if (n & 1) { piv = ipiv; a1 = a + (k1 + 1) * 2; ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; b1 = a + ip1; b2 = a + ip2; i = ((k2 - k1) >> 1); if (i > 0) { do { A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); A4 = *(a2 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; *(buffer + 1) = A2; *(buffer + 2) = A3; *(buffer + 3) = A4; } else { *(buffer + 0) = A1; *(buffer + 1) = A2; *(buffer + 2) = B3; 
*(buffer + 3) = B4; *(b2 + 0) = A3; *(b2 + 1) = A4; } } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A3; *(buffer + 1) = A4; *(buffer + 2) = A1; *(buffer + 3) = A2; } else { *(buffer + 0) = A3; *(buffer + 1) = A4; *(buffer + 2) = B3; *(buffer + 3) = B4; *(b2 + 0) = A1; *(b2 + 1) = A2; } } else { if (b2 == a2) { *(buffer + 0) = B1; *(buffer + 1) = B2; *(buffer + 2) = A3; *(buffer + 3) = A4; *(b1 + 0) = A1; *(b1 + 1) = A2; } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = B2; *(buffer + 2) = A1; *(buffer + 3) = A2; *(b1 + 0) = A3; *(b1 + 1) = A4; } else { *(buffer + 0) = B1; *(buffer + 1) = B2; *(buffer + 2) = B3; *(buffer + 3) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b2 + 0) = A3; *(b2 + 1) = A4; } } buffer += 4; b1 = a + ip1; b2 = a + ip2; a1 += 4; i --; } while (i > 0); } i = ((k2 - k1) & 1); if (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); if (a1 == b1) { *(buffer + 0) = A1; *(buffer + 1) = A2; } else { *(buffer + 0) = B1; *(buffer + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; } buffer += 2; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zneg_tcopy_1.c000066400000000000000000000077451313527062700206220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *a_offset; FLOAT *b_offset, *b_offset1; FLOAT ctemp1, ctemp2, ctemp3, ctemp4; FLOAT ctemp5, ctemp6, ctemp7, ctemp8; a_offset = a; b_offset = b; lda *= 2; j = m; m *= 2; if (j > 0){ do { b_offset1 = b_offset; b_offset += 2; i = (n >> 2); if (i > 0){ do{ ctemp1 = *(a_offset + 0); ctemp2 = *(a_offset + 1); ctemp3 = *(a_offset + 2); ctemp4 = *(a_offset + 3); ctemp5 = *(a_offset + 4); ctemp6 = *(a_offset + 5); ctemp7 = *(a_offset + 6); ctemp8 = *(a_offset + 7); *(b_offset1 + 0) = -ctemp1; *(b_offset1 + 1) = -ctemp2; b_offset1 += m; *(b_offset1 + 0) = -ctemp3; *(b_offset1 + 1) = -ctemp4; b_offset1 += m; *(b_offset1 + 0) = -ctemp5; *(b_offset1 + 1) = -ctemp6; b_offset1 += m; *(b_offset1 + 0) = -ctemp7; *(b_offset1 + 1) = -ctemp8; b_offset1 += m; a_offset += 8; i --; } while(i>0); } i = (n & 3); if (i > 0){ do { ctemp1 = *(a_offset + 0); ctemp2 = *(a_offset + 1); *(b_offset1 + 0) = -ctemp1; *(b_offset1 + 1) = -ctemp2; b_offset1 += m; a_offset += 2; i --; } while(i > 0); } a_offset += lda - n * 2; j --; } while (j > 0); } return 0; } OpenBLAS-0.2.20/kernel/generic/zneg_tcopy_2.c000066400000000000000000000145531313527062700206160ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *a_offset, *a_offset1, *a_offset2; FLOAT *b_offset, *b_offset1, *b_offset2; FLOAT ctemp1, ctemp2, ctemp3, ctemp4; FLOAT ctemp5, ctemp6, ctemp7, ctemp8; FLOAT ctemp9, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; a_offset = a; b_offset = b; b_offset2 = b + m * (n & ~1) * 2; lda *= 2; j = (m >> 1); if (j > 0){ do{ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; b_offset1 = b_offset; b_offset += 8; i = (n >> 2); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); ctemp5 = *(a_offset1 + 4); ctemp6 = *(a_offset1 + 5); ctemp7 = *(a_offset1 + 6); ctemp8 = *(a_offset1 + 7); ctemp9 = *(a_offset2 + 0); ctemp10 = *(a_offset2 + 1); ctemp11 = *(a_offset2 + 2); ctemp12 = *(a_offset2 + 3); ctemp13 = *(a_offset2 + 4); ctemp14 = *(a_offset2 + 5); ctemp15 = *(a_offset2 + 6); ctemp16 = *(a_offset2 + 7); *(b_offset1 + 0) = -ctemp1; *(b_offset1 + 1) = -ctemp2; *(b_offset1 + 2) = -ctemp3; *(b_offset1 + 3) = -ctemp4; *(b_offset1 + 4) = -ctemp9; *(b_offset1 + 5) = -ctemp10; *(b_offset1 + 6) = -ctemp11; *(b_offset1 + 7) = -ctemp12; b_offset1 += m * 4; *(b_offset1 + 0) = -ctemp5; *(b_offset1 + 1) = -ctemp6; *(b_offset1 + 2) = -ctemp7; *(b_offset1 + 3) = -ctemp8; *(b_offset1 + 4) = -ctemp13; *(b_offset1 + 5) = -ctemp14; *(b_offset1 + 6) = -ctemp15; *(b_offset1 + 7) = -ctemp16; b_offset1 += m * 4; a_offset1 += 8; a_offset2 += 8; i --; } while(i>0); } if (n & 2){ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); ctemp9 = *(a_offset2 + 0); ctemp10 = *(a_offset2 + 1); ctemp11 = *(a_offset2 + 2); ctemp12 = *(a_offset2 + 3); *(b_offset1 + 0) = -ctemp1; *(b_offset1 + 1) = -ctemp2; *(b_offset1 + 2) = -ctemp3; *(b_offset1 + 3) = -ctemp4; *(b_offset1 + 4) = -ctemp9; *(b_offset1 + 5) = -ctemp10; *(b_offset1 + 6) = -ctemp11; *(b_offset1 + 7) = -ctemp12; b_offset1 += m * 4; a_offset1 += 4; a_offset2 += 4; } if (n & 1){ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp9 = *(a_offset2 + 0); ctemp10 = *(a_offset2 + 1); *(b_offset2 + 0) = -ctemp1; *(b_offset2 + 1) = -ctemp2; *(b_offset2 + 2) = -ctemp9; *(b_offset2 + 3) = -ctemp10; b_offset2 += 4; } j--; } while(j > 0); } if (m & 1){ i = (n >> 2); if (i > 0){ do{ ctemp1 = *(a_offset + 0); ctemp2 = *(a_offset + 1); ctemp3 = *(a_offset + 2); ctemp4 = *(a_offset + 3); ctemp5 = *(a_offset + 4); ctemp6 = *(a_offset + 5); ctemp7 = *(a_offset + 6); ctemp8 = *(a_offset + 7); *(b_offset + 0) = -ctemp1; *(b_offset + 1) = -ctemp2; *(b_offset + 2) = -ctemp3; *(b_offset + 3) = -ctemp4; b_offset += m * 4; *(b_offset + 0) = -ctemp5; *(b_offset + 1) = -ctemp6; *(b_offset + 2) = -ctemp7; *(b_offset + 3) = -ctemp8; b_offset += m * 4; a_offset += 8; i --; } while(i > 0); } if (n & 2){ ctemp1 = *(a_offset + 0); ctemp2 = *(a_offset + 1); ctemp3 = *(a_offset + 2); ctemp4 = *(a_offset + 3); *(b_offset + 0) = -ctemp1; *(b_offset + 1) = -ctemp2; *(b_offset + 2) = -ctemp3; *(b_offset + 3) = -ctemp4; b_offset += m * 4; a_offset += 4; } if (n & 1){ ctemp1 = *(a_offset + 0); ctemp2 = *(a_offset + 1); *(b_offset2 + 0) = -ctemp1; *(b_offset2 + 1) = -ctemp2; } } return 0; } 
OpenBLAS-0.2.20/kernel/generic/zneg_tcopy_4.c000066400000000000000000000254001313527062700206110ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *aoffset; FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; FLOAT *boffset, *boffset1, *boffset2, *boffset3; FLOAT ctemp01, ctemp02, ctemp03, ctemp04; FLOAT ctemp05, ctemp06, ctemp07, ctemp08; FLOAT ctemp09, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; FLOAT ctemp17, ctemp18, ctemp19, ctemp20; FLOAT ctemp21, ctemp22, ctemp23, ctemp24; FLOAT ctemp25, ctemp26, ctemp27, ctemp28; FLOAT ctemp29, ctemp30, ctemp31, ctemp32; aoffset = a; boffset = b; lda *= 2; boffset2 = b + 2 * m * (n & ~3); boffset3 = b + 2 * m * (n & ~1); #if 0 fprintf(stderr, "m = %d n = %d\n", m,n ); #endif j = (m >> 2); if (j > 0){ do{ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset3 = aoffset2 + lda; aoffset4 = aoffset3 + lda; aoffset += 4 * lda; boffset1 = boffset; boffset += 32; i = (n >> 2); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); ctemp12 = *(aoffset2 + 3); ctemp13 = *(aoffset2 + 4); ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); ctemp17 = *(aoffset3 + 0); ctemp18 = *(aoffset3 + 1); ctemp19 = *(aoffset3 + 2); ctemp20 = *(aoffset3 + 3); ctemp21 = *(aoffset3 + 4); ctemp22 = *(aoffset3 + 5); ctemp23 = *(aoffset3 + 6); ctemp24 = *(aoffset3 + 7); ctemp25 = *(aoffset4 + 0); ctemp26 = *(aoffset4 + 1); ctemp27 = *(aoffset4 + 2); ctemp28 = *(aoffset4 + 3); ctemp29 = *(aoffset4 + 4); ctemp30 = *(aoffset4 + 5); ctemp31 = *(aoffset4 + 6); ctemp32 = *(aoffset4 + 7); *(boffset1 + 0) = -ctemp01; *(boffset1 + 1) = -ctemp02; *(boffset1 + 2) = -ctemp03; *(boffset1 + 3) = -ctemp04; *(boffset1 + 4) = -ctemp05; *(boffset1 + 5) = -ctemp06; *(boffset1 + 6) = -ctemp07; *(boffset1 + 7) = -ctemp08; *(boffset1 + 8) = -ctemp09; *(boffset1 + 9) = -ctemp10; *(boffset1 + 10) = -ctemp11; *(boffset1 + 11) = -ctemp12; *(boffset1 + 12) = -ctemp13; *(boffset1 + 13) = -ctemp14; *(boffset1 + 14) = -ctemp15; *(boffset1 + 15) = -ctemp16; *(boffset1 + 16) = -ctemp17; *(boffset1 + 17) = -ctemp18; *(boffset1 + 18) = -ctemp19; *(boffset1 + 19) = -ctemp20; *(boffset1 + 20) = -ctemp21; *(boffset1 + 21) = -ctemp22; *(boffset1 + 22) = -ctemp23; *(boffset1 + 23) = -ctemp24; *(boffset1 + 24) = -ctemp25; *(boffset1 + 25) = -ctemp26; *(boffset1 + 26) = -ctemp27; *(boffset1 + 27) = -ctemp28; *(boffset1 + 28) = -ctemp29; *(boffset1 + 29) = -ctemp30; *(boffset1 + 30) = -ctemp31; *(boffset1 + 31) = -ctemp32; aoffset1 += 8; aoffset2 += 8; aoffset3 += 8; aoffset4 += 8; boffset1 += m * 8; i --; }while(i > 0); } if (n & 2){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); ctemp09 = *(aoffset3 + 0); ctemp10 = *(aoffset3 + 1); ctemp11 = *(aoffset3 + 2); ctemp12 = *(aoffset3 + 3); ctemp13 = *(aoffset4 + 0); ctemp14 = *(aoffset4 + 1); ctemp15 = *(aoffset4 + 2); ctemp16 = *(aoffset4 + 3); *(boffset2 + 0) = -ctemp01; *(boffset2 + 1) = -ctemp02; *(boffset2 + 2) = -ctemp03; *(boffset2 + 3) = -ctemp04; *(boffset2 + 4) = -ctemp05; *(boffset2 + 5) = -ctemp06; *(boffset2 + 6) = -ctemp07; *(boffset2 + 7) = 
-ctemp08; *(boffset2 + 8) = -ctemp09; *(boffset2 + 9) = -ctemp10; *(boffset2 + 10) = -ctemp11; *(boffset2 + 11) = -ctemp12; *(boffset2 + 12) = -ctemp13; *(boffset2 + 13) = -ctemp14; *(boffset2 + 14) = -ctemp15; *(boffset2 + 15) = -ctemp16; aoffset1 += 4; aoffset2 += 4; aoffset3 += 4; aoffset4 += 4; boffset2 += 16; } if (n & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); *(boffset3 + 0) = -ctemp01; *(boffset3 + 1) = -ctemp02; *(boffset3 + 2) = -ctemp03; *(boffset3 + 3) = -ctemp04; *(boffset3 + 4) = -ctemp05; *(boffset3 + 5) = -ctemp06; *(boffset3 + 6) = -ctemp07; *(boffset3 + 7) = -ctemp08; aoffset1 += 2; aoffset2 += 2; aoffset3 += 2; aoffset4 += 2; boffset3 += 8; } j--; }while(j > 0); } if (m & 2){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset += 2 * lda; boffset1 = boffset; boffset += 16; i = (n >> 2); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); ctemp12 = *(aoffset2 + 3); ctemp13 = *(aoffset2 + 4); ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); *(boffset1 + 0) = -ctemp01; *(boffset1 + 1) = -ctemp02; *(boffset1 + 2) = -ctemp03; *(boffset1 + 3) = -ctemp04; *(boffset1 + 4) = -ctemp05; *(boffset1 + 5) = -ctemp06; *(boffset1 + 6) = -ctemp07; *(boffset1 + 7) = -ctemp08; *(boffset1 + 8) = -ctemp09; *(boffset1 + 9) = -ctemp10; *(boffset1 + 10) = -ctemp11; *(boffset1 + 11) = -ctemp12; *(boffset1 + 12) = -ctemp13; *(boffset1 + 13) = -ctemp14; *(boffset1 + 14) = -ctemp15; *(boffset1 + 15) = -ctemp16; aoffset1 += 8; aoffset2 += 8; aoffset3 += 8; aoffset4 += 8; boffset1 += m * 8; i --; }while(i > 0); } if (n & 2){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); *(boffset2 + 0) = -ctemp01; *(boffset2 + 1) = -ctemp02; *(boffset2 + 2) = -ctemp03; *(boffset2 + 3) = -ctemp04; *(boffset2 + 4) = -ctemp05; *(boffset2 + 5) = -ctemp06; *(boffset2 + 6) = -ctemp07; *(boffset2 + 7) = -ctemp08; aoffset1 += 4; aoffset2 += 4; boffset2 += 8; } if (n & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); *(boffset3 + 0) = -ctemp01; *(boffset3 + 1) = -ctemp02; *(boffset3 + 2) = -ctemp03; *(boffset3 + 3) = -ctemp04; aoffset1 += 2; aoffset2 += 2; boffset3 += 4; } } if (m & 1){ aoffset1 = aoffset; boffset1 = boffset; i = (n >> 2); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); *(boffset1 + 0) = -ctemp01; *(boffset1 + 1) = -ctemp02; *(boffset1 + 2) = -ctemp03; *(boffset1 + 3) = -ctemp04; *(boffset1 + 4) = -ctemp05; *(boffset1 + 5) = -ctemp06; *(boffset1 + 6) = -ctemp07; *(boffset1 + 7) = -ctemp08; aoffset1 += 8; boffset1 += m * 8; i --; }while(i > 0); } if (n & 2){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); *(boffset2 + 0) = -ctemp01; *(boffset2 + 1) = -ctemp02; *(boffset2 + 2) = 
-ctemp03; *(boffset2 + 3) = -ctemp04; aoffset1 += 4; boffset2 += 4; } if (n & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); *(boffset3 + 0) = -ctemp01; *(boffset3 + 1) = -ctemp02; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zneg_tcopy_8.c000066400000000000000000000235131313527062700206200ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; FLOAT *aoffset; FLOAT *aoffset1, *aoffset2; FLOAT *boffset; FLOAT ctemp01, ctemp02, ctemp03, ctemp04; FLOAT ctemp05, ctemp06, ctemp07, ctemp08; FLOAT ctemp09, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; FLOAT ctemp17, ctemp18, ctemp19, ctemp20; FLOAT ctemp21, ctemp22, ctemp23, ctemp24; FLOAT ctemp25, ctemp26, ctemp27, ctemp28; FLOAT ctemp29, ctemp30, ctemp31, ctemp32; aoffset = a; boffset = b; lda *= 2; #if 0 fprintf(stderr, "M = %d N = %d\n", m, n); #endif j = (n >> 3); if (j > 0){ do{ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 16; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); ctemp09 = *(aoffset1 + 8); ctemp10 = *(aoffset1 + 9); ctemp11 = *(aoffset1 + 10); ctemp12 = *(aoffset1 + 11); ctemp13 = *(aoffset1 + 12); ctemp14 = *(aoffset1 + 13); ctemp15 = *(aoffset1 + 14); ctemp16 = *(aoffset1 + 15); ctemp17 = *(aoffset2 + 0); ctemp18 = *(aoffset2 + 1); ctemp19 = *(aoffset2 + 2); ctemp20 = *(aoffset2 + 3); ctemp21 = *(aoffset2 + 4); ctemp22 = *(aoffset2 + 5); ctemp23 = *(aoffset2 + 6); ctemp24 = *(aoffset2 + 7); ctemp25 = *(aoffset2 + 8); ctemp26 = *(aoffset2 + 9); ctemp27 = *(aoffset2 + 10); ctemp28 = *(aoffset2 + 11); ctemp29 = *(aoffset2 + 12); ctemp30 = *(aoffset2 + 13); ctemp31 = *(aoffset2 + 14); ctemp32 = *(aoffset2 + 15); *(boffset + 0) = -ctemp01; *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; *(boffset + 3) = -ctemp04; *(boffset + 4) = -ctemp05; *(boffset + 5) = -ctemp06; *(boffset + 6) = -ctemp07; *(boffset + 7) = -ctemp08; *(boffset + 8) = -ctemp09; *(boffset + 9) = -ctemp10; *(boffset + 10) = -ctemp11; *(boffset + 11) = -ctemp12; *(boffset + 12) = -ctemp13; *(boffset + 13) = -ctemp14; *(boffset + 14) = -ctemp15; *(boffset + 15) = -ctemp16; *(boffset + 16) = -ctemp17; *(boffset + 17) = -ctemp18; *(boffset + 18) = -ctemp19; *(boffset + 19) = -ctemp20; *(boffset + 20) = -ctemp21; *(boffset + 21) = -ctemp22; *(boffset + 22) = -ctemp23; *(boffset + 23) = -ctemp24; *(boffset + 24) = -ctemp25; *(boffset + 25) = -ctemp26; *(boffset + 26) = -ctemp27; *(boffset + 27) = -ctemp28; *(boffset + 28) = -ctemp29; *(boffset + 29) = -ctemp30; *(boffset + 30) = -ctemp31; *(boffset + 31) = -ctemp32; aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 32; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); ctemp09 = *(aoffset1 + 8); ctemp10 = *(aoffset1 + 9); ctemp11 = *(aoffset1 + 10); ctemp12 = *(aoffset1 + 11); ctemp13 = *(aoffset1 + 12); ctemp14 = *(aoffset1 + 13); ctemp15 = *(aoffset1 + 14); ctemp16 = *(aoffset1 + 15); *(boffset + 0) = -ctemp01; *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; *(boffset + 3) = -ctemp04; *(boffset + 4) = -ctemp05; *(boffset + 5) = -ctemp06; *(boffset + 6) = -ctemp07; *(boffset + 7) = -ctemp08; *(boffset + 8) = -ctemp09; *(boffset + 9) = -ctemp10; *(boffset + 10) = -ctemp11; *(boffset + 11) = -ctemp12; *(boffset + 12) = -ctemp13; *(boffset + 13) = -ctemp14; *(boffset + 14) = -ctemp15; *(boffset + 15) = -ctemp16; boffset += 16; } j--; 
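/* advance to the next block of 8 along n; j = (n >> 3) counts the full blocks */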
}while(j > 0); } /* end of if(j > 0) */ if (n & 4){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 8; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); ctemp12 = *(aoffset2 + 3); ctemp13 = *(aoffset2 + 4); ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); *(boffset + 0) = -ctemp01; *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; *(boffset + 3) = -ctemp04; *(boffset + 4) = -ctemp05; *(boffset + 5) = -ctemp06; *(boffset + 6) = -ctemp07; *(boffset + 7) = -ctemp08; *(boffset + 8) = -ctemp09; *(boffset + 9) = -ctemp10; *(boffset + 10) = -ctemp11; *(boffset + 11) = -ctemp12; *(boffset + 12) = -ctemp13; *(boffset + 13) = -ctemp14; *(boffset + 14) = -ctemp15; *(boffset + 15) = -ctemp16; aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 16; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset1 + 4); ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); *(boffset + 0) = -ctemp01; *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; *(boffset + 3) = -ctemp04; *(boffset + 4) = -ctemp05; *(boffset + 5) = -ctemp06; *(boffset + 6) = -ctemp07; *(boffset + 7) = -ctemp08; boffset += 8; } } if (n & 2){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 4; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); *(boffset + 0) = -ctemp01; *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; *(boffset + 3) = -ctemp04; *(boffset + 4) = -ctemp05; *(boffset + 5) = -ctemp06; *(boffset + 6) = -ctemp07; *(boffset + 7) = -ctemp08; aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 8; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); *(boffset + 0) = -ctemp01; *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; *(boffset + 3) = -ctemp04; boffset += 4; } } if (n & 1){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 2; i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); *(boffset + 0) = -ctemp01; *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; *(boffset + 3) = -ctemp04; aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 4; i --; }while(i > 0); } if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); *(boffset + 0) = -ctemp01; *(boffset + 1) = -ctemp02; boffset += 2; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zsymm3m_lcopy_1.c000066400000000000000000000072471313527062700212630ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ lda *= 2; BLASLONG i, js, offset; FLOAT data01; FLOAT *ao1; js = n; while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); if (offset > 0) ao1 += lda; else ao1 += 2; b[ 0] = data01; b ++; offset --; i --; } posX ++; js --; } return 0; } OpenBLAS-0.2.20/kernel/generic/zsymm3m_lcopy_2.c000066400000000000000000000104051313527062700212520ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ lda *= 2; BLASLONG i, js, offset; FLOAT data01, data02; FLOAT *ao1, *ao2; js = (n >> 1); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; js --; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); if (offset > 0) ao1 += lda; else ao1 += 2; b[ 0] = data01; b ++; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zsymm3m_lcopy_4.c000066400000000000000000000124641313527062700212630ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04; FLOAT *ao1, *ao2, *ao3, *ao4; lda *= 2; js = (n >> 2); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; if (offset > -2) ao3 += lda; else ao3 += 2; if (offset > -3) ao4 += lda; else ao4 += 2; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 4; js --; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); if (offset > 0) ao1 += lda; else ao1 += 2; b[ 0] = data01; b ++; offset 
--; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zsymm3m_lcopy_8.c000066400000000000000000000163761313527062700212750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; lda *= 2; js = (n >> 3); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; if (offset > -2) ao3 += lda; else ao3 += 2; if (offset > -3) ao4 += lda; else ao4 += 2; if (offset > -4) ao5 += lda; else ao5 += 2; if (offset > -5) ao6 += lda; else ao6 += 2; if (offset > -6) ao7 += lda; else ao7 += 2; if (offset > -7) ao8 += lda; else ao8 += 2; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b += 8; offset --; i --; } posX += 8; js --; } if (n & 4) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; if (offset > -2) ao3 += lda; else ao3 += 2; if (offset > -3) ao4 += lda; else ao4 += 2; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 4; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 
0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); if (offset > 0) ao1 += lda; else ao1 += 2; b[ 0] = data01; b ++; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zsymm3m_ucopy_1.c000066400000000000000000000072461313527062700212730ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i, js, offset; FLOAT data01; FLOAT *ao1; lda *= 2; js = n; while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); if (offset > 0) ao1 += 2; else ao1 += lda; b[ 0] = data01; b ++; offset --; i --; } posX ++; js --; } return 0; } OpenBLAS-0.2.20/kernel/generic/zsymm3m_ucopy_2.c000066400000000000000000000104041313527062700212620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02; FLOAT *ao1, *ao2; lda *= 2; js = (n >> 1); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; js --; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); if (offset > 0) ao1 += 2; else ao1 += lda; b[ 0] = data01; b ++; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zsymm3m_ucopy_4.c000066400000000000000000000124651313527062700212750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04; FLOAT *ao1, *ao2, *ao3, *ao4; lda *= 2; js = (n >> 2); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; if (offset > -3) ao4 += 2; else ao4 += lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 4; js --; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); if (offset > 0) ao1 += 2; else ao1 += lda; b[ 0] = data01; b ++; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zsymm3m_ucopy_8.c000066400000000000000000000163771313527062700213070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
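The zsymm3m_ucopy_1, _2 and _4 kernels above gather the same values; the unrolling only changes how many columns are handled per pass and how the results are interleaved in b. A plain reference of what the width-1 variant produces, written as a sketch under the assumption that the block covers rows posY..posY+m-1 and columns posX..posX+n-1 of an upper-stored complex symmetric matrix, with lda the original (non-doubled) leading dimension and a stand-in CMULT:

typedef double FLOAT;
typedef long   BLASLONG;

/* Stand-in for the kernel's CMULT (the no-alpha, combined case). */
#define CMULT(re, im) ((re) + (im))

/* Pack an m x n block, column by column: entries with row <= column are
 * read from the stored upper triangle, the rest are mirrored across the
 * diagonal; one real value is stored per complex entry.                 */
static void zsymm3m_ucopy_ref(BLASLONG m, BLASLONG n,
                              const FLOAT *a, BLASLONG lda,
                              BLASLONG posX, BLASLONG posY, FLOAT *b) {
  for (BLASLONG j = 0; j < n; j++) {
    BLASLONG c = posX + j;
    for (BLASLONG r = posY; r < posY + m; r++) {
      const FLOAT *src = (r <= c) ? &a[2 * (r + c * lda)]   /* stored   */
                                  : &a[2 * (c + r * lda)];  /* mirrored */
      *b++ = CMULT(src[0], src[1]);
    }
  }
}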
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef USE_ALPHA #define REAL_PART(a, b) (a) #define IMAGE_PART(a, b) (b) #else #define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) #define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) #endif #if defined(REAL_ONLY) #define CMULT(a, b) (REAL_PART(a, b)) #elif defined(IMAGE_ONLY) #define CMULT(a, b) (IMAGE_PART(a, b)) #else #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, #ifdef USE_ALPHA FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; lda *= 2; js = (n >> 3); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; if (offset > -3) ao4 += 2; else ao4 += lda; if (offset > -4) ao5 += 2; else ao5 += lda; if (offset > -5) ao6 += 2; else ao6 += lda; if (offset > -6) ao7 += 2; else ao7 += lda; if (offset > -7) ao8 += 2; else ao8 += lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 
5] = data06; b[ 6] = data07; b[ 7] = data08; b += 8; offset --; i --; } posX += 8; js --; } if (n & 4) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; if (offset > -3) ao4 += 2; else ao4 += lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 4; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); if (offset > 0) ao1 += 2; else ao1 += lda; b[ 0] = data01; b ++; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zsymm_lcopy_1.c000066400000000000000000000064001313527062700210110ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
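For the wider zsymm3m_ucopy variants above, each pass over the rows of the block emits one value per column of the current column group, so b holds the block in row-interleaved panels of width 2, 4 or 8. A small sketch of the resulting index mapping (the 3m copies store one real value per complex entry; for the plain zsymm copies further down, which keep both components, the index is doubled):

typedef long BLASLONG;

/* Index in b of the value for block row r (0-based within the block) and
 * column c, inside a packed panel that starts at column c0 and has width
 * W (W = 2, 4 or 8 in the kernels above).                               */
static BLASLONG panel_index(BLASLONG r, BLASLONG c, BLASLONG c0, BLASLONG W) {
  return r * W + (c - c0);
}

For example, with W = 4 the value for row 2 and the second column of the group (c = c0 + 1) lands at b[2 * 4 + 1] = b[9].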
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02; FLOAT *ao1; lda *= 2; js = n; while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); if (offset > 0) ao1 += lda; else ao1 += 2; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX ++; js --; } return 0; } OpenBLAS-0.2.20/kernel/generic/zsymm_lcopy_2.c000066400000000000000000000076441313527062700210250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04; FLOAT *ao1, *ao2; lda *= 2; js = (n >> 1); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 2; js --; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); if (offset > 0) ao1 += lda; else ao1 += 2; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zsymm_lcopy_4.c000066400000000000000000000121371313527062700210200ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT *ao1, *ao2, *ao3, *ao4; lda *= 2; js = (n >> 2); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); data05 = *(ao3 + 0); data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; if (offset > -2) ao3 += lda; else ao3 += 2; if (offset > -3) ao4 += lda; else ao4 += 2; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b += 8; offset --; i --; } posX += 4; js --; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); if (offset > 0) ao1 += lda; else ao1 += 2; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zsymm_lcopy_8.c000066400000000000000000000165061313527062700210300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
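A detail of the zsymm_lcopy loops above (shared by their ucopy counterparts): a single pointer walks each packed column, and the running offset, initialised to posX - posY and decremented once per row, marks exactly where the walk crosses the diagonal. While the row index is still above the diagonal the pointer advances by lda (a mirrored read along the stored row); from the diagonal on it advances by 2 (a direct read down the stored column). A small trace of that bookkeeping, with made-up block indices (each printed slot stands for one complex pair in the real kernels):

#include <stdio.h>

typedef long BLASLONG;

/* Trace the source element visited for one packed column of the lower
 * copy: posY = 0, posX = 3, m = 6, i.e. rows 0..5 of column 3.        */
int main(void) {
  BLASLONG posX = 3, posY = 0, m = 6;
  BLASLONG offset = posX - posY;          /* same bookkeeping as the kernel */

  for (BLASLONG k = 0; k < m; k++) {
    BLASLONG r = posY + k;                /* row of the block entry */
    if (offset > 0)                       /* above the diagonal: mirrored  */
      printf("slot %ld <- A(%ld,%ld)  (mirror of A(%ld,%ld))\n",
             k, posX, r, r, posX);
    else                                  /* on/below the diagonal: direct */
      printf("slot %ld <- A(%ld,%ld)\n", k, r, posX);
    offset--;
  }
  return 0;
}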
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; lda *= 2; js = (n >> 3); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); data05 = *(ao3 + 0); data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); data09 = *(ao5 + 0); data10 = *(ao5 + 1); data11 = *(ao6 + 0); data12 = *(ao6 + 1); data13 = *(ao7 + 0); data14 = *(ao7 + 1); data15 = *(ao8 + 0); data16 = *(ao8 + 1); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; if (offset > -2) ao3 += lda; else ao3 += 2; if (offset > -3) ao4 += lda; else ao4 += 2; if (offset > -4) ao5 += lda; else ao5 += 2; if (offset > -5) ao6 += lda; else ao6 += 2; if (offset > -6) ao7 += lda; else ao7 += 2; if (offset > -7) ao8 += lda; else ao8 += 2; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; b += 16; offset --; i --; } posX += 8; js --; } if (n & 4) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + 
(posX + 3) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); data05 = *(ao3 + 0); data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; if (offset > -2) ao3 += lda; else ao3 += 2; if (offset > -3) ao4 += lda; else ao4 += 2; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b += 8; offset --; i --; } posX += 4; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); if (offset > 0) ao1 += lda; else ao1 += 2; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zsymm_ucopy_1.c000066400000000000000000000063771313527062700210370ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
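Unlike the 3m copies, the zsymm_lcopy_{1,2,4,8} routines above keep both the real and imaginary components of every entry. A compact reference of the values they gather (lower-stored complex symmetric matrix; the unrolled variants interleave columns in panels as described earlier, while this sketch follows the width-1 ordering; lda is again the original, non-doubled leading dimension):

typedef double FLOAT;
typedef long   BLASLONG;

/* Pack an m x n block starting at row posY, column posX: entries on or
 * below the diagonal come from the stored lower triangle, entries above
 * it are mirrored; both components of each complex entry are kept.     */
static void zsymm_lcopy_ref(BLASLONG m, BLASLONG n,
                            const FLOAT *a, BLASLONG lda,
                            BLASLONG posX, BLASLONG posY, FLOAT *b) {
  for (BLASLONG j = 0; j < n; j++) {
    BLASLONG c = posX + j;
    for (BLASLONG r = posY; r < posY + m; r++) {
      const FLOAT *src = (r >= c) ? &a[2 * (r + c * lda)]   /* stored   */
                                  : &a[2 * (c + r * lda)];  /* mirrored */
      *b++ = src[0];                      /* real part      */
      *b++ = src[1];                      /* imaginary part */
    }
  }
}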
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02; FLOAT *ao1; lda *= 2; js = n; while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); if (offset > 0) ao1 += 2; else ao1 += lda; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } posX ++; js --; } return 0; } OpenBLAS-0.2.20/kernel/generic/zsymm_ucopy_2.c000066400000000000000000000076431313527062700210350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04; FLOAT *ao1, *ao2; lda *= 2; js = (n >> 1); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 2; js --; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); if (offset > 0) ao1 += 2; else ao1 += lda; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zsymm_ucopy_4.c000066400000000000000000000121351313527062700210270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT *ao1, *ao2, *ao3, *ao4; lda *= 2; js = (n >> 2); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); data05 = *(ao3 + 0); data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; if (offset > -3) ao4 += 2; else ao4 += lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b += 8; offset --; i --; } posX += 4; js --; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); if (offset > 0) ao1 += 2; else ao1 += lda; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zsymm_ucopy_8.c000066400000000000000000000165061313527062700210410ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, offset; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; lda *= 2; js = (n >> 3); while (js > 0){ offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); data05 = *(ao3 + 0); data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); data09 = *(ao5 + 0); data10 = *(ao5 + 1); data11 = *(ao6 + 0); data12 = *(ao6 + 1); data13 = *(ao7 + 0); data14 = *(ao7 + 1); data15 = *(ao8 + 0); data16 = *(ao8 + 1); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; if (offset > -3) ao4 += 2; else ao4 += lda; if (offset > -4) ao5 += 2; else ao5 += lda; if (offset > -5) ao6 += 2; else ao6 += lda; if (offset > -6) ao7 += 2; else ao7 += lda; if (offset > -7) ao8 += 2; else ao8 += lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; b += 16; offset --; i --; } posX += 8; js --; } if (n & 4) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 
2 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); data05 = *(ao3 + 0); data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; if (offset > -3) ao4 += 2; else ao4 += lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b += 8; offset --; i --; } posX += 4; } if (n & 2) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; offset --; i --; } posX += 2; } if (n & 1) { offset = posX - posY; if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); if (offset > 0) ao1 += 2; else ao1 += lda; b[ 0] = data01; b[ 1] = data02; b += 2; offset --; i --; } } return 0; } OpenBLAS-0.2.20/kernel/generic/zsymv_k.c000066400000000000000000000114671313527062700177170ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #include "symcopy.h" int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ BLASLONG is, min_i; FLOAT *X = x; FLOAT *Y = y; FLOAT *symbuffer = buffer; FLOAT *gemvbuffer = (FLOAT *)(((BLASLONG)buffer + SYMV_P * SYMV_P * sizeof(FLOAT) * 2 + 4095) & ~4095); FLOAT *bufferY = gemvbuffer; FLOAT *bufferX = gemvbuffer; if (incy != 1) { Y = bufferY; bufferX = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) * 2 + 4095) & ~4095); gemvbuffer = bufferX; COPY_K(m, y, incy, Y, 1); } if (incx != 1) { X = bufferX; gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + m * sizeof(FLOAT) * 2 + 4095) & ~4095); COPY_K(m, x, incx, X, 1); } #ifndef LOWER for(is = m - offset; is < m; is += SYMV_P){ min_i = MIN(m - is, SYMV_P); #else for(is = 0; is < offset; is += SYMV_P){ min_i = MIN(offset - is, SYMV_P); #endif #ifndef LOWER if (is >0){ GEMV_T(is, min_i, 0, alpha_r, alpha_i, a + is * lda * COMPSIZE, lda, X, 1, Y + is * COMPSIZE, 1, gemvbuffer); GEMV_N(is, min_i, 0, alpha_r, alpha_i, a + is * lda * COMPSIZE, lda, X + is * COMPSIZE, 1, Y, 1, gemvbuffer); } #endif #ifdef LOWER ZSYMCOPY_L(min_i, a + (is + is * lda) * COMPSIZE, lda, symbuffer); #else ZSYMCOPY_U(min_i, a + (is + is * lda) * COMPSIZE, lda, symbuffer); #endif GEMV_N(min_i, min_i, 0, alpha_r, alpha_i, symbuffer, min_i, X + is * COMPSIZE, 1, Y + is * COMPSIZE, 1, gemvbuffer); #ifdef LOWER if (m - is > min_i){ GEMV_T(m - is - min_i, min_i, 0, alpha_r, alpha_i, a + ((is + min_i) + is * lda) * COMPSIZE, lda, X + (is + min_i) * COMPSIZE, 1, Y + is * COMPSIZE, 1, gemvbuffer); GEMV_N(m - is - min_i, min_i, 0, alpha_r, alpha_i, a + ((is + min_i) + is * lda) * COMPSIZE, lda, X + is * COMPSIZE, 1, Y + (is + min_i) * COMPSIZE, 1, gemvbuffer); } #endif } /* end of is */ if (incy != 1) { COPY_K(m, Y, 1, y, incy); } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrmm_lncopy_1.c000066400000000000000000000071411313527062700211640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
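zsymv_k.c above is the blocked driver for the complex symmetric matrix-vector product: it copies x and y into unit-stride scratch space when the increments are not 1 (the `(... + 4095) & ~4095` arithmetic simply rounds the scratch pointer up to the next 4 KB boundary), packs each SYMV_P x SYMV_P diagonal block with ZSYMCOPY_L/U, and covers the off-diagonal rectangles with GEMV_N and GEMV_T calls. The net effect is to accumulate alpha * A * x into y, with A complex symmetric (not Hermitian) and only one triangle referenced; beta scaling, if any, appears to be applied before this kernel is reached. An unblocked reference of that operation, as a sketch for the upper-stored case with unit strides and double precision:

typedef double FLOAT;
typedef long   BLASLONG;

/* y += alpha * A * x, with A complex symmetric, column-major, complex
 * components interleaved, and only the upper triangle referenced.     */
static void zsymv_upper_ref(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
                            const FLOAT *a, BLASLONG lda,
                            const FLOAT *x, FLOAT *y) {
  for (BLASLONG i = 0; i < m; i++) {
    FLOAT sr = 0.0, si = 0.0;
    for (BLASLONG j = 0; j < m; j++) {
      /* A(i,j) == A(j,i): mirror the read when (i,j) is below the diagonal. */
      const FLOAT *aij = (i <= j) ? &a[2 * (i + j * lda)]
                                  : &a[2 * (j + i * lda)];
      sr += aij[0] * x[2 * j]     - aij[1] * x[2 * j + 1];
      si += aij[0] * x[2 * j + 1] + aij[1] * x[2 * j];
    }
    y[2 * i]     += alpha_r * sr - alpha_i * si;
    y[2 * i + 1] += alpha_r * si + alpha_i * sr;
  }
}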
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02; FLOAT *ao1; lda += lda; js = n; if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posY * 2 + (posX + 0) * lda; } else { ao1 = a + posX * 2 + (posY + 0) * lda; } i = m; if (i > 0) { do { if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; ao1 += 2; b += 2; } else if (X < posY) { ao1 += lda; b += 2; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; #endif ao1 += 2; b += 2; } X ++; i --; } while (i > 0); } posY ++; js --; } while (js > 0); } /* End of main loop */ return 0; } OpenBLAS-0.2.20/kernel/generic/ztrmm_lncopy_2.c000066400000000000000000000133771313527062700211750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
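ztrmm_lncopy_1.c above packs a block of a lower, non-transposed triangular matrix: strictly-lower entries are copied as-is, strictly-upper entries are skipped (b is advanced but not written, since that region is apparently never read by the matching TRMM compute kernel), and diagonal entries become 1 + 0i when UNIT is defined. A width-1 sketch of that per-entry rule, using a hypothetical helper with a runtime unit_diag flag in place of the compile-time UNIT switch (rows taken as posX..posX+m-1, columns as posY..posY+n-1, lda non-doubled):

typedef double FLOAT;
typedef long   BLASLONG;

/* Per-entry rule of the lower/non-transposed TRMM packing, column by
 * column over an m x n block.  unit_diag mirrors the UNIT switch.    */
static void ztrmm_lncopy_ref(BLASLONG m, BLASLONG n, int unit_diag,
                             const FLOAT *a, BLASLONG lda,
                             BLASLONG posX, BLASLONG posY, FLOAT *b) {
  for (BLASLONG j = 0; j < n; j++) {
    BLASLONG c = posY + j;                       /* column of A */
    for (BLASLONG k = 0; k < m; k++) {
      BLASLONG r = posX + k;                     /* row of A    */
      if (r > c) {                               /* strictly lower: copy  */
        b[0] = a[2 * (r + c * lda)];
        b[1] = a[2 * (r + c * lda) + 1];
      } else if (r == c) {                       /* diagonal              */
        b[0] = unit_diag ? 1.0 : a[2 * (r + c * lda)];
        b[1] = unit_diag ? 0.0 : a[2 * (r + c * lda) + 1];
      }                                          /* r < c: left untouched */
      b += 2;
    }
  }
}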
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02, data03, data04; FLOAT data05, data06, data07, data08; FLOAT *ao1, *ao2; lda += lda; js = (n >> 1); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posY * 2 + (posX + 0) * lda; ao2 = a + posY * 2 + (posX + 1) * lda; } else { ao1 = a + posX * 2 + (posY + 0) * lda; ao2 = a + posX * 2 + (posY + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data05; b[ 3] = data06; b[ 4] = data03; b[ 5] = data04; b[ 6] = data07; b[ 7] = data08; ao1 += 4; ao2 += 4; b += 8; } else if (X < posY) { ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } else { #ifdef UNIT data03 = *(ao1 + 2); data04 = *(ao1 + 3); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data03; b[ 5] = data04; b[ 6] = ONE; b[ 7] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data07 = *(ao2 + 2); data08 = *(ao2 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data03; b[ 5] = data04; b[ 6] = data07; b[ 7] = data08; #endif ao1 += 4; ao2 += 4; b += 8; } X += 2; i --; } while (i > 0); } if (m & 1) { if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; ao1 += 2; ao2 += 2; b += 4; } else if (X < posY) { ao1 += lda; b += 4; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; b[ 0] = ZERO; b[ 1] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; b[ 2] = ZERO; b[ 3] = ZERO; #endif b += 4; } } posY += 2; js --; } while (js > 0); } /* End of main loop */ if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posY * 2 + (posX + 0) * lda; } else { ao1 = a + posX * 2 + (posY + 0) * lda; } i = m; if (i > 0) { do { if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; b += 2; ao1 += 2; } else if (X < posY) { b += 2; ao1 += lda; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; #endif b += 2; ao1 += 2; } X ++; i --; } while (i > 0); } posY += 1; } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrmm_lncopy_4.c000066400000000000000000000337001313527062700211670ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT data17, data18, data19, data20, data21, data22, data23, data24; FLOAT data25, data26, data27, data28, data29, data30, data31, data32; FLOAT *ao1, *ao2, *ao3, *ao4; lda += lda; js = (n >> 2); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posY * 2 + (posX + 0) * lda; ao2 = a + posY * 2 + (posX + 1) * lda; ao3 = a + posY * 2 + (posX + 2) * lda; ao4 = a + posY * 2 + (posX + 3) * lda; } else { ao1 = a + posX * 2 + (posY + 0) * lda; ao2 = a + posX * 2 + (posY + 1) * lda; ao3 = a + posX * 2 + (posY + 2) * lda; ao4 = a + posX * 2 + (posY + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data21 = *(ao3 + 4); data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); data29 = *(ao4 + 4); data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; b[ 3] = data10; b[ 4] = data17; b[ 5] = data18; b[ 6] = data25; b[ 7] = data26; b[ 8] = data03; b[ 9] = data04; b[10] = data11; b[11] = data12; b[12] = data19; b[13] = data20; b[14] = data27; b[15] = data28; b[16] = data05; b[17] = data06; b[18] = data13; b[19] = data14; b[20] = data21; b[21] = data22; b[22] = data29; b[23] = data30; b[24] = data07; b[25] = data08; b[26] = data15; b[27] = data16; b[28] = data23; b[29] = data24; b[30] = data31; b[31] = data32; ao1 += 8; ao2 += 8; ao3 += 8; ao4 += 8; b += 32; 
} else if (X < posY) { ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 32; } else { #ifdef UNIT data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); data23 = *(ao3 + 6); data24 = *(ao3 + 7); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = data03; b[ 9] = data04; b[10] = ONE; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b[16] = data05; b[17] = data06; b[18] = data13; b[19] = data14; b[20] = ONE; b[21] = ZERO; b[22] = ZERO; b[23] = ZERO; b[24] = data07; b[25] = data08; b[26] = data15; b[27] = data16; b[28] = data23; b[29] = data24; b[30] = ONE; b[31] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); data21 = *(ao3 + 4); data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); data31 = *(ao4 + 6); data32 = *(ao4 + 7); b[ 0] = data01; b[ 1] = data02; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = data03; b[ 9] = data04; b[10] = data11; b[11] = data12; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b[16] = data05; b[17] = data06; b[18] = data13; b[19] = data14; b[20] = data21; b[21] = data22; b[22] = ZERO; b[23] = ZERO; b[24] = data07; b[25] = data08; b[26] = data15; b[27] = data16; b[28] = data23; b[29] = data24; b[30] = data31; b[31] = data32; #endif ao1 += 8; ao2 += 8; ao3 += 8; ao4 += 8; b += 32; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i) { if (X > posY) { if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; b[ 3] = data10; b[ 4] = data17; b[ 5] = data18; b[ 6] = data25; b[ 7] = data26; b[ 8] = data03; b[ 9] = data04; b[10] = data11; b[11] = data12; b[12] = data19; b[13] = data20; b[14] = data27; b[15] = data28; ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data25 = *(ao4 + 0); data26 = *(ao4 + 1); b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; b[ 3] = data10; b[ 4] = data17; b[ 5] = data18; b[ 6] = data25; b[ 7] = data26; ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } } else if (X < posY) { if (m & 2) { ao1 += 2 * lda; ao2 += 2 * lda; b += 16; } if (m & 1) { ao1 += lda; b += 8; } } else { #ifdef UNIT if (i >= 2) { data03 = *(ao1 + 2); data04 = *(ao1 + 3); } if (i >= 3) { data05 = *(ao1 + 4); data06 = *(ao1 + 5); data13 = *(ao2 + 4); data14 = *(ao2 + 5); } b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; if (i >= 2) { b[ 0] = data03; b[ 1] = data04; b[ 2] = ONE; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 3) { b[ 0] = data05; b[ 1] = data06; b[ 2] = data13; b[ 3] = data14; b[ 4] = ONE; b[ 5] = ZERO; b[ 
6] = ZERO; b[ 7] = ZERO; b += 8; } #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); if (i >= 2) { data03 = *(ao1 + 2); data04 = *(ao1 + 3); data11 = *(ao2 + 2); data12 = *(ao2 + 3); } if (i >= 3) { data05 = *(ao1 + 4); data06 = *(ao1 + 5); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data21 = *(ao3 + 4); data22 = *(ao3 + 5); } b[ 0] = data01; b[ 1] = data02; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; if (i >= 2) { b[ 0] = data03; b[ 1] = data04; b[ 2] = data11; b[ 3] = data12; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 3) { b[ 0] = data05; b[ 1] = data06; b[ 2] = data13; b[ 3] = data14; b[ 4] = data21; b[ 5] = data22; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } #endif } } posY += 4; js --; } while (js > 0); } /* End of main loop */ if (n & 2){ X = posX; if (posX <= posY) { ao1 = a + posY * 2 + (posX + 0) * lda; ao2 = a + posY * 2 + (posX + 1) * lda; } else { ao1 = a + posX * 2 + (posY + 0) * lda; ao2 = a + posX * 2 + (posY + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; b[ 3] = data10; b[ 4] = data03; b[ 5] = data04; b[ 6] = data11; b[ 7] = data12; ao1 += 4; ao2 += 4; b += 8; } else if (X < posY) { ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } else { #ifdef UNIT data03 = *(ao1 + 2); data04 = *(ao1 + 3); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data03; b[ 5] = data04; b[ 6] = ONE; b[ 7] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data11 = *(ao2 + 2); data12 = *(ao2 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data03; b[ 5] = data04; b[ 6] = data11; b[ 7] = data12; #endif ao1 += 4; ao2 += 4; b += 8; } X += 2; i --; } while (i > 0); } i = (m & 1); if (i) { if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; b[ 3] = data10; ao1 += 2; ao2 += 2; b += 4; } else if (X < posY) { ao1 += lda; b += 4; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; b[ 2] = ZERO; b[ 3] = ZERO; #endif b += 2; } } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posY * 2 + (posX + 0) * lda; } else { ao1 = a + posX * 2 + (posY + 0) * lda; } i = m; if (i > 0) { do { if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; ao1 += 2; b += 2; } else if (X < posY) { ao1 += lda; b += 2; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; #endif ao1 += lda; b += 2; } X ++; i --; } while (i > 0); } posY += 1; } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrmm_lncopy_8.c000066400000000000000000000441361313527062700212000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X, ii; FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; lda += lda; js = (n >> 3); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posY * 2 + (posX + 0) * lda; ao2 = a + posY * 2 + (posX + 1) * lda; ao3 = a + posY * 2 + (posX + 2) * lda; ao4 = a + posY * 2 + (posX + 3) * lda; ao5 = a + posY * 2 + (posX + 4) * lda; ao6 = a + posY * 2 + (posX + 5) * lda; ao7 = a + posY * 2 + (posX + 6) * lda; ao8 = a + posY * 2 + (posX + 7) * lda; } else { ao1 = a + posX * 2 + (posY + 0) * lda; ao2 = a + posX * 2 + (posY + 1) * lda; ao3 = a + posX * 2 + (posY + 2) * lda; ao4 = a + posX * 2 + (posY + 3) * lda; ao5 = a + posX * 2 + (posY + 4) * lda; ao6 = a + posX * 2 + (posY + 5) * lda; ao7 = a + posX * 2 + (posY + 6) * lda; ao8 = a + posX * 2 + (posY + 7) * lda; } i = (m >> 3); if (i > 0) { do { if (X > posY) { for (ii = 0; ii < 8; ii++){ b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); b[ 2] = *(ao2 + 0); b[ 3] = *(ao2 + 1); b[ 4] = *(ao3 + 0); b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); b[ 8] = *(ao5 + 0); b[ 9] = *(ao5 + 1); b[ 10] = *(ao6 + 0); b[ 11] = *(ao6 + 1); b[ 12] = *(ao7 + 0); b[ 13] = *(ao7 + 1); b[ 14] = *(ao8 + 0); b[ 15] = *(ao8 + 1); ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; ao5 += 2; ao6 += 2; ao7 += 2; ao8 += 2; b += 16; } } else if (X < posY) { ao1 += 8 * lda; ao2 += 8 * lda; ao3 += 8 * lda; ao4 += 8 * lda; ao5 += 8 * lda; ao6 += 8 * lda; ao7 += 8 * lda; ao8 += 8 * lda; b += 128; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); #endif b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[ 10] = ZERO; b[ 11] = ZERO; b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; b[ 16] = *(ao1 + 2); b[ 17] = *(ao1 + 3); #ifdef UNIT b[ 18] = ONE; b[ 19] = ZERO; #else b[ 18] = *(ao2 + 2); b[ 
19] = *(ao2 + 3); #endif b[ 20] = ZERO; b[ 21] = ZERO; b[ 22] = ZERO; b[ 23] = ZERO; b[ 24] = ZERO; b[ 25] = ZERO; b[ 26] = ZERO; b[ 27] = ZERO; b[ 28] = ZERO; b[ 29] = ZERO; b[ 30] = ZERO; b[ 31] = ZERO; b[ 32] = *(ao1 + 4); b[ 33] = *(ao1 + 5); b[ 34] = *(ao2 + 4); b[ 35] = *(ao2 + 5); #ifdef UNIT b[ 36] = ONE; b[ 37] = ZERO; #else b[ 36] = *(ao3 + 4); b[ 37] = *(ao3 + 5); #endif b[ 38] = ZERO; b[ 39] = ZERO; b[ 40] = ZERO; b[ 41] = ZERO; b[ 42] = ZERO; b[ 43] = ZERO; b[ 44] = ZERO; b[ 45] = ZERO; b[ 46] = ZERO; b[ 47] = ZERO; b[ 48] = *(ao1 + 6); b[ 49] = *(ao1 + 7); b[ 50] = *(ao2 + 6); b[ 51] = *(ao2 + 7); b[ 52] = *(ao3 + 6); b[ 53] = *(ao3 + 7); #ifdef UNIT b[ 54] = ONE; b[ 55] = ZERO; #else b[ 54] = *(ao4 + 6); b[ 55] = *(ao4 + 7); #endif b[ 56] = ZERO; b[ 57] = ZERO; b[ 58] = ZERO; b[ 59] = ZERO; b[ 60] = ZERO; b[ 61] = ZERO; b[ 62] = ZERO; b[ 63] = ZERO; b[ 64] = *(ao1 + 8); b[ 65] = *(ao1 + 9); b[ 66] = *(ao2 + 8); b[ 67] = *(ao2 + 9); b[ 68] = *(ao3 + 8); b[ 69] = *(ao3 + 9); b[ 70] = *(ao4 + 8); b[ 71] = *(ao4 + 9); #ifdef UNIT b[ 72] = ONE; b[ 73] = ZERO; #else b[ 72] = *(ao5 + 8); b[ 73] = *(ao5 + 9); #endif b[ 74] = ZERO; b[ 75] = ZERO; b[ 76] = ZERO; b[ 77] = ZERO; b[ 78] = ZERO; b[ 79] = ZERO; b[ 80] = *(ao1 + 10); b[ 81] = *(ao1 + 11); b[ 82] = *(ao2 + 10); b[ 83] = *(ao2 + 11); b[ 84] = *(ao3 + 10); b[ 85] = *(ao3 + 11); b[ 86] = *(ao4 + 10); b[ 87] = *(ao4 + 11); b[ 88] = *(ao5 + 10); b[ 89] = *(ao5 + 11); #ifdef UNIT b[ 90] = ONE; b[ 91] = ZERO; #else b[ 90] = *(ao6 + 10); b[ 91] = *(ao6 + 11); #endif b[ 92] = ZERO; b[ 93] = ZERO; b[ 94] = ZERO; b[ 95] = ZERO; b[ 96] = *(ao1 + 12); b[ 97] = *(ao1 + 13); b[ 98] = *(ao2 + 12); b[ 99] = *(ao2 + 13); b[100] = *(ao3 + 12); b[101] = *(ao3 + 13); b[102] = *(ao4 + 12); b[103] = *(ao4 + 13); b[104] = *(ao5 + 12); b[105] = *(ao5 + 13); b[106] = *(ao6 + 12); b[107] = *(ao6 + 13); #ifdef UNIT b[108] = ONE; b[109] = ZERO; #else b[108] = *(ao7 + 12); b[109] = *(ao7 + 13); #endif b[110] = ZERO; b[111] = ZERO; b[112] = *(ao1 + 14); b[113] = *(ao1 + 15); b[114] = *(ao2 + 14); b[115] = *(ao2 + 15); b[116] = *(ao3 + 14); b[117] = *(ao3 + 15); b[118] = *(ao4 + 14); b[119] = *(ao4 + 15); b[120] = *(ao5 + 14); b[121] = *(ao5 + 15); b[122] = *(ao6 + 14); b[123] = *(ao6 + 15); b[124] = *(ao7 + 14); b[125] = *(ao7 + 15); #ifdef UNIT b[126] = ONE; b[127] = ZERO; #else b[126] = *(ao8 + 14); b[127] = *(ao8 + 15); #endif ao1 += 16; ao2 += 16; ao3 += 16; ao4 += 16; ao5 += 16; ao6 += 16; ao7 += 16; ao8 += 16; b += 128; } X += 8; i --; } while (i > 0); } i = (m & 7); if (i) { if (X > posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); b[ 2] = *(ao2 + 0); b[ 3] = *(ao2 + 1); b[ 4] = *(ao3 + 0); b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); b[ 8] = *(ao5 + 0); b[ 9] = *(ao5 + 1); b[ 10] = *(ao6 + 0); b[ 11] = *(ao6 + 1); b[ 12] = *(ao7 + 0); b[ 13] = *(ao7 + 1); b[ 14] = *(ao8 + 0); b[ 15] = *(ao8 + 1); ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; ao5 += 2; ao6 += 2; ao7 += 2; ao8 += 2; b += 16; } } else if (X < posY) { ao1 += i * lda; ao2 += i * lda; ao3 += i * lda; ao4 += i * lda; ao5 += i * lda; ao6 += i * lda; ao7 += i * lda; ao8 += i * lda; b += 16 * i; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); #endif b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[ 10] = ZERO; b[ 11] = ZERO; b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; b += 16; if (i >= 2) { b[ 0] = *(ao1 + 2); b[ 1] = *(ao1 + 3); 
#ifdef UNIT b[ 2] = ONE; b[ 3] = ZERO; #else b[ 2] = *(ao2 + 2); b[ 3] = *(ao2 + 3); #endif b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b += 16; } if (i >= 3) { b[ 0] = *(ao1 + 4); b[ 1] = *(ao1 + 5); b[ 2] = *(ao2 + 4); b[ 3] = *(ao2 + 5); #ifdef UNIT b[ 4] = ONE; b[ 5] = ZERO; #else b[ 4] = *(ao3 + 4); b[ 5] = *(ao3 + 5); #endif b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b += 16; } if (i >= 4) { b[ 0] = *(ao1 + 6); b[ 1] = *(ao1 + 7); b[ 2] = *(ao2 + 6); b[ 3] = *(ao2 + 7); b[ 4] = *(ao3 + 6); b[ 5] = *(ao3 + 7); #ifdef UNIT b[ 6] = ONE; b[ 7] = ZERO; #else b[ 6] = *(ao4 + 6); b[ 7] = *(ao4 + 7); #endif b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b += 16; } if (i >= 5) { b[ 0] = *(ao1 + 8); b[ 1] = *(ao1 + 9); b[ 2] = *(ao2 + 8); b[ 3] = *(ao2 + 9); b[ 4] = *(ao3 + 8); b[ 5] = *(ao3 + 9); b[ 6] = *(ao4 + 8); b[ 7] = *(ao4 + 9); #ifdef UNIT b[ 8] = ONE; b[ 9] = ZERO; #else b[ 8] = *(ao5 + 8); b[ 9] = *(ao5 + 9); #endif b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b += 16; } if (i >= 6) { b[ 0] = *(ao1 + 10); b[ 1] = *(ao1 + 11); b[ 2] = *(ao2 + 10); b[ 3] = *(ao2 + 11); b[ 4] = *(ao3 + 10); b[ 5] = *(ao3 + 11); b[ 6] = *(ao4 + 10); b[ 7] = *(ao4 + 11); b[ 8] = *(ao5 + 10); b[ 9] = *(ao5 + 11); #ifdef UNIT b[10] = ONE; b[11] = ZERO; #else b[10] = *(ao6 + 10); b[11] = *(ao6 + 11); #endif b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b += 16; } if (i >= 7) { b[ 0] = *(ao1 + 12); b[ 1] = *(ao1 + 13); b[ 2] = *(ao2 + 12); b[ 3] = *(ao2 + 13); b[ 4] = *(ao3 + 12); b[ 5] = *(ao3 + 13); b[ 6] = *(ao4 + 12); b[ 7] = *(ao4 + 13); b[ 8] = *(ao5 + 12); b[ 9] = *(ao5 + 13); b[10] = *(ao6 + 12); b[11] = *(ao6 + 13); #ifdef UNIT b[12] = ONE; b[13] = ZERO; #else b[12] = *(ao7 + 12); b[13] = *(ao7 + 13); #endif b[14] = ZERO; b[15] = ZERO; b += 16; } } } posY += 8; js --; } while (js > 0); } /* End of main loop */ if (n & 4){ X = posX; if (posX <= posY) { ao1 = a + posY * 2 + (posX + 0) * lda; ao2 = a + posY * 2 + (posX + 1) * lda; ao3 = a + posY * 2 + (posX + 2) * lda; ao4 = a + posY * 2 + (posX + 3) * lda; } else { ao1 = a + posX * 2 + (posY + 0) * lda; ao2 = a + posX * 2 + (posY + 1) * lda; ao3 = a + posX * 2 + (posY + 2) * lda; ao4 = a + posX * 2 + (posY + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X > posY) { for (ii = 0; ii < 4; ii++){ b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); b[ 2] = *(ao2 + 0); b[ 3] = *(ao2 + 1); b[ 4] = *(ao3 + 0); b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } } else if (X < posY) { ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 32; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); #endif b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = *(ao1 + 2); b[ 9] = *(ao1 + 3); #ifdef UNIT b[ 10] = ONE; b[ 11] = ZERO; #else b[ 10] = *(ao2 + 2); b[ 11] = *(ao2 + 3); #endif b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; b[ 16] = *(ao1 + 4); b[ 17] = *(ao1 + 5); b[ 18] = *(ao2 + 4); b[ 19] = *(ao2 + 5); #ifdef UNIT b[ 20] = ONE; b[ 21] = ZERO; #else b[ 20] = *(ao3 + 4); b[ 21] = *(ao3 + 5); #endif b[ 22] = ZERO; b[ 23] = ZERO; b[ 24] = *(ao1 + 6); b[ 25] = *(ao1 + 7); b[ 26] 
= *(ao2 + 6); b[ 27] = *(ao2 + 7); b[ 28] = *(ao3 + 6); b[ 29] = *(ao3 + 7); #ifdef UNIT b[ 30] = ONE; b[ 31] = ZERO; #else b[ 30] = *(ao4 + 6); b[ 31] = *(ao4 + 7); #endif ao1 += 8; ao2 += 8; ao3 += 8; ao4 += 8; b += 32; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i) { if (X > posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); b[ 2] = *(ao2 + 0); b[ 3] = *(ao2 + 1); b[ 4] = *(ao3 + 0); b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } } else if (X < posY) { ao1 += i * lda; ao2 += i * lda; ao3 += i * lda; ao4 += i * lda; b += 8 * i; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); #endif b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; if (i >= 2) { b[ 0] = *(ao1 + 2); b[ 1] = *(ao1 + 3); #ifdef UNIT b[ 2] = ONE; b[ 3] = ZERO; #else b[ 2] = *(ao2 + 2); b[ 3] = *(ao2 + 3); #endif b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 3) { b[ 0] = *(ao1 + 4); b[ 1] = *(ao1 + 5); b[ 2] = *(ao2 + 4); b[ 3] = *(ao2 + 5); #ifdef UNIT b[ 4] = ONE; b[ 5] = ZERO; #else b[ 4] = *(ao3 + 4); b[ 5] = *(ao3 + 5); #endif b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } } } posY += 4; } if (n & 2){ X = posX; if (posX <= posY) { ao1 = a + posY * 2 + (posX + 0) * lda; ao2 = a + posY * 2 + (posX + 1) * lda; } else { ao1 = a + posX * 2 + (posY + 0) * lda; ao2 = a + posX * 2 + (posY + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X > posY) { b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); b[ 2] = *(ao2 + 0); b[ 3] = *(ao2 + 1); b[ 4] = *(ao1 + 2); b[ 5] = *(ao1 + 3); b[ 6] = *(ao2 + 2); b[ 7] = *(ao2 + 3); ao1 += 4; ao2 += 4; b += 8; } else if (X < posY) { ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); #endif b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = *(ao1 + 2); b[ 5] = *(ao1 + 3); #ifdef UNIT b[ 6] = ONE; b[ 7] = ZERO; #else b[ 6] = *(ao2 + 2); b[ 7] = *(ao2 + 3); #endif ao1 += 4; ao2 += 4; b += 8; } X += 2; i --; } while (i > 0); } if (m & 1) { if (X > posY) { b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); b[ 2] = *(ao2 + 0); b[ 3] = *(ao2 + 1); ao1 += 2; ao2 += 2; b += 4; } else if (X < posY) { ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); #endif b[ 2] = ZERO; b[ 3] = ZERO; b += 4; } } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posY * 2 + (posX + 0) * lda; } else { ao1 = a + posX * 2 + (posY + 0) * lda; } i = m; if (m > 0) { do { if (X > posY) { b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); ao1 += 2; b += 2; } else if (X < posY) { ao1 += lda; b += 2; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); #endif ao1 += 2; b += 2; } X += 1; i --; } while (i > 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrmm_ltcopy_1.c000066400000000000000000000071461313527062700211770ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02; FLOAT *ao1; lda += lda; js = n; if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posY * 2 + (posX + 0) * lda; } else { ao1 = a + posX * 2 + (posY + 0) * lda; } i = m; if (i > 0) { do { if (X > posY) { ao1 += 2; b += 2; } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; ao1 += lda; b += 2; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; #endif ao1 += 2; b += 2; } X ++; i --; } while (i > 0); } posY ++; js --; } while (js > 0); } /* End of main loop */ return 0; } OpenBLAS-0.2.20/kernel/generic/ztrmm_ltcopy_2.c000066400000000000000000000135421313527062700211750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data1, data2, data3, data4, data5, data6, data7, data8; FLOAT *ao1, *ao2; lda += lda; js = (n >> 1); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posY * 2 + (posX + 0) * lda; ao2 = a + posY * 2 + (posX + 1) * lda; } else { ao1 = a + posX * 2 + (posY + 0) * lda; ao2 = a + posX * 2 + (posY + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X > posY) { ao1 += 4; ao2 += 4; b += 8; } else if (X < posY) { data1 = *(ao1 + 0); data2 = *(ao1 + 1); data3 = *(ao1 + 2); data4 = *(ao1 + 3); data5 = *(ao2 + 0); data6 = *(ao2 + 1); data7 = *(ao2 + 2); data8 = *(ao2 + 3); b[ 0] = data1; b[ 1] = data2; b[ 2] = data3; b[ 3] = data4; b[ 4] = data5; b[ 5] = data6; b[ 6] = data7; b[ 7] = data8; ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } else { #ifdef UNIT data3 = *(ao1 + 2); data4 = *(ao1 + 3); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data3; b[ 3] = data4; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ONE; b[ 7] = ZERO; #else data1 = *(ao1 + 0); data2 = *(ao1 + 1); data3 = *(ao1 + 2); data4 = *(ao1 + 3); data7 = *(ao2 + 2); data8 = *(ao2 + 3); b[ 0] = data1; b[ 1] = data2; b[ 2] = data3; b[ 3] = data4; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = data7; b[ 7] = data8; #endif ao1 += 4; ao2 += 4; b += 8; } X += 2; i --; } while (i > 0); } if (m & 1) { if (X > posY) { ao1 += 2; ao2 += 2; b += 4; } else if (X < posY) { data1 = *(ao1 + 0); data2 = *(ao1 + 1); data3 = *(ao1 + 2); data4 = *(ao1 + 3); b[ 0] = data1; b[ 1] = data2; b[ 2] = data3; b[ 3] = data4; ao1 += lda; b += 4; } else { #ifdef UNIT data3 = *(ao1 + 2); data4 = *(ao1 + 3); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data3; b[ 3] = data4; #else data1 = *(ao1 + 0); data2 = *(ao1 + 1); data3 = *(ao1 + 2); data4 = *(ao1 + 3); b[ 0] = data1; b[ 1] = data2; b[ 2] = data3; b[ 3] = data4; #endif b += 4; } } posY += 2; js --; } while (js > 0); } /* End of main loop */ if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posY * 2 + (posX + 0) * lda; } else { ao1 = a + posX * 2 + (posY + 0) * lda; } i = m; if (i > 0) { do { if (X > posY) { b += 2; ao1 += 2; } else if (X < posY) { data1 = *(ao1 + 0); data2 = *(ao1 + 1); b[ 0] = data1; b[ 1] = data2; b += 2; ao1 += lda; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else data1 = *(ao1 + 0); data2 = *(ao1 + 1); b[ 0] = data1; b[ 1] = data2; #endif b += 2; ao1 += 2; } X ++; i --; } while (i > 0); } posY += 1; } return 0; } 
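
All of the ztrmm_*copy_{1,2,4,8} kernels in this directory follow the same pattern: they walk an m-by-n panel of a triangular complex matrix stored column-major (the stride is doubled at the top of each routine, via lda += lda or lda *= 2, so it counts FLOAT slots rather than complex pairs), and pack it into the contiguous buffer b in the blocked order expected by the TRMM macro-kernel. Blocks lying entirely on the side of the diagonal that the triangular operand does not cover are skipped (b is advanced without stores, since TRMM never reads that region), diagonal blocks get explicit ZERO padding for their unused half, and with UNIT defined the diagonal entries are emitted as 1+0i. The unrolled 1/2/4/8 variants differ only in tile width. As a rough orientation aid, here is a minimal reference sketch of the lower, non-transposed ("lncopy") case in plain column-by-column order; the function name ztrmm_lncopy_ref, the unit_diag flag, and the use of double literals are illustrative assumptions, and the packing order shown is deliberately simpler than the blocked layout the real kernels produce.

#include <stddef.h>

/* Illustrative sketch only, not the OpenBLAS kernel: pack an m x n panel of a
 * column-major complex matrix, stored as interleaved (re, im) doubles, whose
 * logical content is lower triangular.  lda is given here in complex elements
 * (the factor of 2 is applied in the index instead of doubling lda up front).
 * Entries above the diagonal are written as zeros for clarity, where the real
 * kernels simply skip them; with unit_diag != 0 the diagonal becomes 1+0i,
 * mirroring the #ifdef UNIT branches in the kernels above. */
void ztrmm_lncopy_ref(size_t m, size_t n,
                      const double *a, size_t lda,
                      size_t pos_x, size_t pos_y,
                      int unit_diag, double *b)
{
  for (size_t j = 0; j < n; j++) {        /* column inside the panel */
    for (size_t i = 0; i < m; i++) {      /* row inside the panel    */
      size_t row = pos_x + i, col = pos_y + j;
      const double *src = a + 2 * (row + col * lda);
      if (row > col) {                    /* strictly below the diagonal: copy */
        *b++ = src[0];
        *b++ = src[1];
      } else if (row < col) {             /* above the diagonal: unused by TRMM */
        *b++ = 0.0;
        *b++ = 0.0;
      } else {                            /* diagonal element */
        *b++ = unit_diag ? 1.0 : src[0];
        *b++ = unit_diag ? 0.0 : src[1];
      }
    }
  }
}

A real kernel additionally interleaves the rows of several columns at a time (2, 4 or 8, matching the register blocking of the TRMM micro-kernel), so the packed tile layout differs from this sketch even though the emitted values follow the same copy/skip/unit-diagonal rules.
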
OpenBLAS-0.2.20/kernel/generic/ztrmm_ltcopy_4.c000066400000000000000000000347211313527062700212010ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT data17, data18, data19, data20, data21, data22, data23, data24; FLOAT data25, data26, data27, data28, data29, data30, data31, data32; FLOAT *ao1, *ao2, *ao3, *ao4; lda += lda; js = (n >> 2); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posY * 2 + (posX + 0) * lda; ao2 = a + posY * 2 + (posX + 1) * lda; ao3 = a + posY * 2 + (posX + 2) * lda; ao4 = a + posY * 2 + (posX + 3) * lda; } else { ao1 = a + posX * 2 + (posY + 0) * lda; ao2 = a + posX * 2 + (posY + 1) * lda; ao3 = a + posX * 2 + (posY + 2) * lda; ao4 = a + posX * 2 + (posY + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X > posY) { ao1 += 8; ao2 += 8; ao3 += 8; ao4 += 8; b += 32; } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data21 = *(ao3 + 4); data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); data29 = *(ao4 + 4); data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; b[16] = data17; b[17] = data18; b[18] = data19; b[19] = data20; b[20] = data21; b[21] = data22; b[22] = data23; b[23] = data24; b[24] = data25; b[25] = data26; b[26] = data27; b[27] = data28; b[28] = data29; b[29] = data30; b[30] = data31; b[31] = data32; ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 32; } else { #ifdef UNIT data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); data23 = *(ao3 + 6); data24 = *(ao3 + 7); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ONE; b[11] = ZERO; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; b[16] = ZERO; b[17] = ZERO; b[18] = ZERO; b[19] = ZERO; b[20] = ONE; b[21] = ZERO; b[22] = data23; b[23] = data24; b[24] = ZERO; b[25] = ZERO; b[26] = ZERO; b[27] = ZERO; b[28] = ZERO; b[29] = ZERO; b[30] = ONE; b[31] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); data21 = *(ao3 + 4); data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); data31 = *(ao4 + 6); data32 = *(ao4 + 7); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = 
data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; b[16] = ZERO; b[17] = ZERO; b[18] = ZERO; b[19] = ZERO; b[20] = data21; b[21] = data22; b[22] = data23; b[23] = data24; b[24] = ZERO; b[25] = ZERO; b[26] = ZERO; b[27] = ZERO; b[28] = ZERO; b[29] = ZERO; b[30] = data31; b[31] = data32; #endif ao1 += 8; ao2 += 8; ao3 += 8; ao4 += 8; b += 32; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i) { if (X > posY) { if (m & 2) { ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } if (m & 1) { ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } } else if (X < posY) { if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; ao1 += 2 * lda; ao2 += 2 * lda; b += 16; } if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; ao1 += lda; b += 8; } } else { #ifdef UNIT data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); if (i >= 2) { data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); } if (i >= 3) { data23 = *(ao3 + 6); data24 = *(ao3 + 7); } b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b += 8; if (i >= 2) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ONE; b[ 3] = ZERO; b[ 4] = data13; b[ 5] = data14; b[ 6] = data15; b[ 7] = data16; b += 8; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ONE; b[ 5] = ZERO; b[ 6] = data23; b[ 7] = data24; b += 8; } #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); if (i >= 2) { data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); } if (i >= 3) { data21 = *(ao3 + 4); data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); } b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b += 8; if (i >= 2) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = data11; b[ 3] = data12; b[ 4] = data13; b[ 5] = data14; b[ 6] = data15; b[ 7] = data16; b += 8; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data21; b[ 5] = data22; b[ 6] = data23; b[ 7] = data24; b += 8; } #endif } } posY += 4; js --; } while (js > 0); } /* End of main loop */ if (n & 2){ X = posX; if (posX <= posY) { ao1 = a + posY * 2 + (posX + 0) * lda; ao2 = a + posY * 2 + (posX + 1) * lda; } else { ao1 = a + posX * 2 + (posY + 0) * lda; ao2 = a + posX * 2 + (posY + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X 
> posY) { ao1 += 4; ao2 += 4; b += 8; } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data09; b[ 5] = data10; b[ 6] = data11; b[ 7] = data12; ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } else { #ifdef UNIT data03 = *(ao1 + 2); data04 = *(ao1 + 3); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data03; b[ 3] = data04; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ONE; b[ 7] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data11 = *(ao2 + 2); data12 = *(ao2 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = data11; b[ 7] = data12; #endif ao1 += 4; ao2 += 4; b += 8; } X += 2; i --; } while (i > 0); } i = (m & 1); if (i) { if (X > posY) { ao1 += 2; ao2 += 2; b += 4; } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; ao1 += lda; b += 4; } else { #ifdef UNIT data03 = *(ao1 + 2); data04 = *(ao1 + 3); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data03; b[ 3] = data04; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; #endif b += 2; } } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posY * 2 + (posX + 0) * lda; } else { ao1 = a + posX * 2 + (posY + 0) * lda; } i = m; if (i > 0) { do { if (X > posY) { b += 2; ao1 += 2; } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; ao1 += lda; b += 2; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; #endif b += 2; } X ++; i --; } while (i > 0); } posY += 1; } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrmm_ltcopy_8.c000066400000000000000000000447671313527062700212200ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, ii; BLASLONG X; FLOAT *a01, *a02, *a03 ,*a04, *a05, *a06, *a07, *a08; lda *= 2; js = (n >> 3); if (js > 0){ do { X = posX; if (posX <= posY) { a01 = a + posY * 2 + (posX + 0) * lda; a02 = a + posY * 2 + (posX + 1) * lda; a03 = a + posY * 2 + (posX + 2) * lda; a04 = a + posY * 2 + (posX + 3) * lda; a05 = a + posY * 2 + (posX + 4) * lda; a06 = a + posY * 2 + (posX + 5) * lda; a07 = a + posY * 2 + (posX + 6) * lda; a08 = a + posY * 2 + (posX + 7) * lda; } else { a01 = a + posX * 2 + (posY + 0) * lda; a02 = a + posX * 2 + (posY + 1) * lda; a03 = a + posX * 2 + (posY + 2) * lda; a04 = a + posX * 2 + (posY + 3) * lda; a05 = a + posX * 2 + (posY + 4) * lda; a06 = a + posX * 2 + (posY + 5) * lda; a07 = a + posX * 2 + (posY + 6) * lda; a08 = a + posX * 2 + (posY + 7) * lda; } i = (m >> 3); if (i > 0) { do { if (X > posY) { a01 += 16; a02 += 16; a03 += 16; a04 += 16; a05 += 16; a06 += 16; a07 += 16; a08 += 16; b += 128; } else if (X < posY) { for (ii = 0; ii < 8; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); b[ 8] = *(a01 + 8); b[ 9] = *(a01 + 9); b[ 10] = *(a01 + 10); b[ 11] = *(a01 + 11); b[ 12] = *(a01 + 12); b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); a01 += lda; b += 16; } a02 += 8 * lda; a03 += 8 * lda; a04 += 8 * lda; a05 += 8 * lda; a06 += 8 * lda; a07 += 8 * lda; a08 += 8 * lda; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); #endif b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); b[ 8] = *(a01 + 8); b[ 9] = *(a01 + 9); b[ 10] = *(a01 + 10); b[ 11] = *(a01 + 11); b[ 12] = *(a01 + 12); b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); b[ 16] = ZERO; b[ 17] = ZERO; #ifdef UNIT b[ 18] = ONE; b[ 19] = ZERO; #else b[ 18] = *(a02 + 2); b[ 19] = *(a02 + 3); #endif b[ 20] = *(a02 + 4); b[ 21] = *(a02 + 5); b[ 22] = *(a02 + 6); b[ 23] = *(a02 + 7); b[ 24] = *(a02 + 8); b[ 25] = *(a02 + 9); b[ 26] = *(a02 + 10); b[ 27] = *(a02 + 11); b[ 28] = *(a02 + 12); b[ 29] = *(a02 + 13); b[ 30] = *(a02 + 14); b[ 31] = *(a02 + 15); b[ 32] = ZERO; b[ 33] = ZERO; b[ 34] = ZERO; b[ 35] = ZERO; #ifdef UNIT b[ 36] = ONE; b[ 37] = ZERO; #else b[ 36] = *(a03 + 4); b[ 37] = *(a03 + 5); #endif b[ 38] = *(a03 + 6); b[ 39] = *(a03 + 7); b[ 40] = *(a03 + 8); b[ 41] = *(a03 + 9); b[ 42] = *(a03 + 10); b[ 43] = *(a03 
+ 11); b[ 44] = *(a03 + 12); b[ 45] = *(a03 + 13); b[ 46] = *(a03 + 14); b[ 47] = *(a03 + 15); b[ 48] = ZERO; b[ 49] = ZERO; b[ 50] = ZERO; b[ 51] = ZERO; b[ 52] = ZERO; b[ 53] = ZERO; #ifdef UNIT b[ 54] = ONE; b[ 55] = ZERO; #else b[ 54] = *(a04 + 6); b[ 55] = *(a04 + 7); #endif b[ 56] = *(a04 + 8); b[ 57] = *(a04 + 9); b[ 58] = *(a04 + 10); b[ 59] = *(a04 + 11); b[ 60] = *(a04 + 12); b[ 61] = *(a04 + 13); b[ 62] = *(a04 + 14); b[ 63] = *(a04 + 15); b[ 64] = ZERO; b[ 65] = ZERO; b[ 66] = ZERO; b[ 67] = ZERO; b[ 68] = ZERO; b[ 69] = ZERO; b[ 70] = ZERO; b[ 71] = ZERO; #ifdef UNIT b[ 72] = ONE; b[ 73] = ZERO; #else b[ 72] = *(a05 + 8); b[ 73] = *(a05 + 9); #endif b[ 74] = *(a05 + 10); b[ 75] = *(a05 + 11); b[ 76] = *(a05 + 12); b[ 77] = *(a05 + 13); b[ 78] = *(a05 + 14); b[ 79] = *(a05 + 15); b[ 80] = ZERO; b[ 81] = ZERO; b[ 82] = ZERO; b[ 83] = ZERO; b[ 84] = ZERO; b[ 85] = ZERO; b[ 86] = ZERO; b[ 87] = ZERO; b[ 88] = ZERO; b[ 89] = ZERO; #ifdef UNIT b[ 90] = ONE; b[ 91] = ZERO; #else b[ 90] = *(a06 + 10); b[ 91] = *(a06 + 11); #endif b[ 92] = *(a06 + 12); b[ 93] = *(a06 + 13); b[ 94] = *(a06 + 14); b[ 95] = *(a06 + 15); b[ 96] = ZERO; b[ 97] = ZERO; b[ 98] = ZERO; b[ 99] = ZERO; b[100] = ZERO; b[101] = ZERO; b[102] = ZERO; b[103] = ZERO; b[104] = ZERO; b[105] = ZERO; b[106] = ZERO; b[107] = ZERO; #ifdef UNIT b[108] = ONE; b[109] = ZERO; #else b[108] = *(a07 + 12); b[109] = *(a07 + 13); #endif b[110] = *(a07 + 14); b[111] = *(a07 + 15); b[112] = ZERO; b[113] = ZERO; b[114] = ZERO; b[115] = ZERO; b[116] = ZERO; b[117] = ZERO; b[118] = ZERO; b[119] = ZERO; b[120] = ZERO; b[121] = ZERO; b[122] = ZERO; b[123] = ZERO; b[124] = ZERO; b[125] = ZERO; #ifdef UNIT b[126] = ONE; b[127] = ZERO; #else b[126] = *(a08 + 14); b[127] = *(a08 + 15); #endif a01 += 16; a02 += 16; a03 += 16; a04 += 16; a05 += 16; a06 += 16; a07 += 16; a08 += 16; b += 128; } X += 8; i --; } while (i > 0); } i = (m & 7); if (i > 0) { if (X > posY) { a01 += 2 * i; a02 += 2 * i; a03 += 2 * i; a04 += 2 * i; a05 += 2 * i; a06 += 2 * i; a07 += 2 * i; a08 += 2 * i; b += 16 * i; } else if (X < posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); b[ 8] = *(a01 + 8); b[ 9] = *(a01 + 9); b[ 10] = *(a01 + 10); b[ 11] = *(a01 + 11); b[ 12] = *(a01 + 12); b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); a01 += lda; a02 += lda; a03 += lda; a04 += lda; a05 += lda; a06 += lda; a07 += lda; a08 += lda; b += 16; } } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); #endif b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); b[ 8] = *(a01 + 8); b[ 9] = *(a01 + 9); b[ 10] = *(a01 + 10); b[ 11] = *(a01 + 11); b[ 12] = *(a01 + 12); b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); b += 16; if (i >= 2) { b[ 0] = ZERO; b[ 1] = ZERO; #ifdef UNIT b[ 2] = ONE; b[ 3] = ZERO; #else b[ 2] = *(a02 + 2); b[ 3] = *(a02 + 3); #endif b[ 4] = *(a02 + 4); b[ 5] = *(a02 + 5); b[ 6] = *(a02 + 6); b[ 7] = *(a02 + 7); b[ 8] = *(a02 + 8); b[ 9] = *(a02 + 9); b[10] = *(a02 + 10); b[11] = *(a02 + 11); b[12] = *(a02 + 12); b[13] = *(a02 + 13); b[14] = *(a02 + 14); b[15] = *(a02 + 15); b += 16; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; #ifdef UNIT b[ 4] = ONE; b[ 5] = ZERO; #else b[ 4] = *(a03 + 4); b[ 5] = *(a03 + 5); #endif b[ 6] = *(a03 + 6); b[ 7] = *(a03 + 7); 
b[ 8] = *(a03 + 8); b[ 9] = *(a03 + 9); b[10] = *(a03 + 10); b[11] = *(a03 + 11); b[12] = *(a03 + 12); b[13] = *(a03 + 13); b[14] = *(a03 + 14); b[15] = *(a03 + 15); b += 16; } if (i >= 4) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; #ifdef UNIT b[ 6] = ONE; b[ 7] = ZERO; #else b[ 6] = *(a04 + 6); b[ 7] = *(a04 + 7); #endif b[ 8] = *(a04 + 8); b[ 9] = *(a04 + 9); b[10] = *(a04 + 10); b[11] = *(a04 + 11); b[12] = *(a04 + 12); b[13] = *(a04 + 13); b[14] = *(a04 + 14); b[15] = *(a04 + 15); b += 16; } if (i >= 5) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; #ifdef UNIT b[ 8] = ONE; b[ 9] = ZERO; #else b[ 8] = *(a05 + 8); b[ 9] = *(a05 + 9); #endif b[10] = *(a05 + 10); b[11] = *(a05 + 11); b[12] = *(a05 + 12); b[13] = *(a05 + 13); b[14] = *(a05 + 14); b[15] = *(a05 + 15); b += 16; } if (i >= 6) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; #ifdef UNIT b[10] = ONE; b[11] = ZERO; #else b[10] = *(a06 + 10); b[11] = *(a06 + 11); #endif b[12] = *(a06 + 12); b[13] = *(a06 + 13); b[14] = *(a06 + 14); b[15] = *(a06 + 15); b += 16; } if (i >= 7) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; #ifdef UNIT b[12] = ONE; b[13] = ZERO; #else b[12] = *(a07 + 12); b[13] = *(a07 + 13); #endif b[14] = *(a07 + 14); b[15] = *(a07 + 15); b += 16; } } } posY += 8; js --; } while (js > 0); } /* End of main loop */ if (n & 4){ X = posX; if (posX <= posY) { a01 = a + posY * 2 + (posX + 0) * lda; a02 = a + posY * 2 + (posX + 1) * lda; a03 = a + posY * 2 + (posX + 2) * lda; a04 = a + posY * 2 + (posX + 3) * lda; } else { a01 = a + posX * 2 + (posY + 0) * lda; a02 = a + posX * 2 + (posY + 1) * lda; a03 = a + posX * 2 + (posY + 2) * lda; a04 = a + posX * 2 + (posY + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X > posY) { a01 += 8; a02 += 8; a03 += 8; a04 += 8; b += 32; } else if (X < posY) { for (ii = 0; ii < 4; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); a01 += lda; b += 8; } a02 += 4 * lda; a03 += 4 * lda; a04 += 4 * lda; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); #endif b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); b[ 8] = ZERO; b[ 9] = ZERO; #ifdef UNIT b[ 10] = ONE; b[ 11] = ZERO; #else b[ 10] = *(a02 + 2); b[ 11] = *(a02 + 3); #endif b[ 12] = *(a02 + 4); b[ 13] = *(a02 + 5); b[ 14] = *(a02 + 6); b[ 15] = *(a02 + 7); b[ 16] = ZERO; b[ 17] = ZERO; b[ 18] = ZERO; b[ 19] = ZERO; #ifdef UNIT b[ 20] = ONE; b[ 21] = ZERO; #else b[ 20] = *(a03 + 4); b[ 21] = *(a03 + 5); #endif b[ 22] = *(a03 + 6); b[ 23] = *(a03 + 7); b[ 24] = ZERO; b[ 25] = ZERO; b[ 26] = ZERO; b[ 27] = ZERO; b[ 28] = ZERO; b[ 29] = ZERO; #ifdef UNIT b[ 30] = ONE; b[ 31] = ZERO; #else b[ 30] = *(a04 + 6); b[ 31] = *(a04 + 7); #endif a01 += 8; a02 += 8; a03 += 8; a04 += 8; b += 32; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i > 0) { if (X > posY) { a01 += 2 * i; a02 += 2 * i; a03 += 2 * i; a04 += 2 * i; b += 8 * i; } else if (X < posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 
5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); a01 += lda; a02 += lda; a03 += lda; a04 += lda; b += 8; } } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); #endif b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); b += 8; if (i >= 2) { b[ 0] = ZERO; b[ 1] = ZERO; #ifdef UNIT b[ 2] = ONE; b[ 3] = ZERO; #else b[ 2] = *(a02 + 2); b[ 3] = *(a02 + 3); #endif b[ 4] = *(a02 + 4); b[ 5] = *(a02 + 5); b[ 6] = *(a02 + 6); b[ 7] = *(a02 + 7); b += 8; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; #ifdef UNIT b[ 4] = ONE; b[ 5] = ZERO; #else b[ 4] = *(a03 + 4); b[ 5] = *(a03 + 5); #endif b[ 6] = *(a03 + 6); b[ 7] = *(a03 + 7); b += 8; } } } posY += 4; } if (n & 2){ X = posX; if (posX <= posY) { a01 = a + posY * 2 + (posX + 0) * lda; a02 = a + posY * 2 + (posX + 1) * lda; } else { a01 = a + posX * 2 + (posY + 0) * lda; a02 = a + posX * 2 + (posY + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X > posY) { a01 += 4; a02 += 4; b += 8; } else if (X < posY) { b[0] = *(a01 + 0); b[1] = *(a01 + 1); b[2] = *(a01 + 2); b[3] = *(a01 + 3); b[4] = *(a02 + 0); b[5] = *(a02 + 1); b[6] = *(a02 + 2); b[7] = *(a02 + 3); a01 += 2 * lda; a02 += 2 * lda; b += 8; } else { #ifdef UNIT b[0] = ONE; b[1] = ZERO; #else b[0] = *(a01 + 0); b[1] = *(a01 + 1); #endif b[2] = *(a01 + 2); b[3] = *(a01 + 3); b[4] = ZERO; b[5] = ZERO; #ifdef UNIT b[6] = ONE; b[7] = ZERO; #else b[6] = *(a02 + 2); b[7] = *(a02 + 3); #endif a01 += 4; a02 += 4; b += 8; } X += 2; i --; } while (i > 0); } i = (m & 1); if (i > 0) { if (X > posY) { a01 += 2; a02 += 2; b += 4; } else if (X < posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); a01 += lda; a02 += lda; b += 4; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); #endif b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b += 4; } } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { a01 = a + posY * 2 + (posX + 0) * lda; } else { a01 = a + posX * 2 + (posY + 0) * lda; } i = m; if (i > 0) { do { if (X > posY) { a01 += 2; b += 2; } else if (X < posY) { b[0] = *(a01 + 0); b[1] = *(a01 + 1); a01 += lda; b += 2; } else { #ifdef UNIT b[0] = ONE; b[1] = ZERO; #else b[0] = *(a01 + 0); b[1] = *(a01 + 1); #endif a01 += 2; b += 2; } X += 1; i --; } while (i > 0); } posY += 1; } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrmm_uncopy_1.c000066400000000000000000000071471313527062700212030ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02; FLOAT *ao1; lda += lda; js = n; if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posX * 2 + (posY + 0) * lda; } else { ao1 = a + posY * 2 + (posX + 0) * lda; } i = m; if (i > 0) { do { if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; ao1 += 2; b += 2; } else if (X > posY) { ao1 += lda; b += 2; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; #endif ao1 += lda; b += 2; } X ++; i --; } while (i > 0); } posY ++; js --; } while (js > 0); } /* End of main loop */ return 0; } OpenBLAS-0.2.20/kernel/generic/ztrmm_uncopy_2.c000066400000000000000000000136061313527062700212010ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02, data03, data04; FLOAT data05, data06, data07, data08; FLOAT *ao1, *ao2; lda += lda; js = (n >> 1); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posX * 2 + (posY + 0) * lda; ao2 = a + posX * 2 + (posY + 1) * lda; } else { ao1 = a + posY * 2 + (posX + 0) * lda; ao2 = a + posY * 2 + (posX + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data05; b[ 3] = data06; b[ 4] = data03; b[ 5] = data04; b[ 6] = data07; b[ 7] = data08; ao1 += 4; ao2 += 4; b += 8; } else if (X > posY) { ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } else { #ifdef UNIT data05 = *(ao2 + 0); data06 = *(ao2 + 1); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data05; b[ 3] = data06; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ONE; b[ 7] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data05; b[ 3] = data06; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = data07; b[ 7] = data08; #endif ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } X += 2; i --; } while (i > 0); } if (m & 1) { if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; ao1 += 2; ao2 += 2; b += 4; } else if (X > posY) { ao1 += lda; b += 4; } else { #ifdef UNIT data03 = *(ao2 + 0); data04 = *(ao2 + 1); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data03; b[ 3] = data04; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; #endif b += 4; } } posY += 2; js --; } while (js > 0); } /* End of main loop */ if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posX * 2 + (posY + 0) * lda; } else { ao1 = a + posY * 2 + (posX + 0) * lda; } i = m; if (m > 0) { do { if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; ao1 += 2; b += 2; } else if (X > posY) { b += 2; ao1 += lda; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; #endif b += 2; ao1 += lda; } X += 1; i --; } while (i > 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrmm_uncopy_4.c000066400000000000000000000343441313527062700212050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT data17, data18, data19, data20, data21, data22, data23, data24; FLOAT data25, data26, data27, data28, data29, data30, data31, data32; FLOAT *ao1, *ao2, *ao3, *ao4; lda += lda; js = (n >> 2); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posX * 2 + (posY + 0) * lda; ao2 = a + posX * 2 + (posY + 1) * lda; ao3 = a + posX * 2 + (posY + 2) * lda; ao4 = a + posX * 2 + (posY + 3) * lda; } else { ao1 = a + posY * 2 + (posX + 0) * lda; ao2 = a + posY * 2 + (posX + 1) * lda; ao3 = a + posY * 2 + (posX + 2) * lda; ao4 = a + posY * 2 + (posX + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data21 = *(ao3 + 4); data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); data29 = *(ao4 + 4); data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; b[ 3] = data10; b[ 4] = data17; b[ 5] = data18; b[ 6] = data25; b[ 7] = data26; b[ 8] = data03; b[ 9] = data04; b[10] = data11; b[11] = data12; b[12] = data19; b[13] = data20; b[14] = data27; b[15] = data28; b[16] = data05; b[17] = data06; b[18] = data13; b[19] = data14; b[20] = data21; b[21] = data22; b[22] = data29; b[23] = data30; b[24] = data07; b[25] = data08; b[26] = data15; b[27] = data16; b[28] = data23; b[29] = data24; b[30] = data31; b[31] = data32; ao1 += 8; ao2 += 8; ao3 += 8; ao4 += 8; b += 32; 
} else if (X > posY) { ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 32; } else { #ifdef UNIT data09 = *(ao2 + 0); data10 = *(ao2 + 1); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); data29 = *(ao4 + 4); data30 = *(ao4 + 5); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data09; b[ 3] = data10; b[ 4] = data17; b[ 5] = data18; b[ 6] = data25; b[ 7] = data26; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ONE; b[11] = ZERO; b[12] = data19; b[13] = data20; b[14] = data27; b[15] = data28; b[16] = ZERO; b[17] = ZERO; b[18] = ZERO; b[19] = ZERO; b[20] = ONE; b[21] = ZERO; b[22] = data29; b[23] = data30; b[24] = ZERO; b[25] = ZERO; b[26] = ZERO; b[27] = ZERO; b[28] = ZERO; b[29] = ZERO; b[30] = ONE; b[31] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data21 = *(ao3 + 4); data22 = *(ao3 + 5); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); data29 = *(ao4 + 4); data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; b[ 3] = data10; b[ 4] = data17; b[ 5] = data18; b[ 6] = data25; b[ 7] = data26; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = data11; b[11] = data12; b[12] = data19; b[13] = data20; b[14] = data27; b[15] = data28; b[16] = ZERO; b[17] = ZERO; b[18] = ZERO; b[19] = ZERO; b[20] = data21; b[21] = data22; b[22] = data29; b[23] = data30; b[24] = ZERO; b[25] = ZERO; b[26] = ZERO; b[27] = ZERO; b[28] = ZERO; b[29] = ZERO; b[30] = data31; b[31] = data32; #endif ao1 += 8; ao2 += 8; ao3 += 8; ao4 += 8; b += 32; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i) { if (X < posY) { if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; b[ 3] = data10; b[ 4] = data17; b[ 5] = data18; b[ 6] = data25; b[ 7] = data26; b[ 8] = data03; b[ 9] = data04; b[10] = data11; b[11] = data12; b[12] = data19; b[13] = data20; b[14] = data27; b[15] = data28; ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data25 = *(ao4 + 0); data26 = *(ao4 + 1); b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; b[ 3] = data10; b[ 4] = data17; b[ 5] = data18; b[ 6] = data25; b[ 7] = data26; ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } } else if (X > posY) { if (m & 2) { ao1 += 2 * lda; ao2 += 2 * lda; b += 16; } if (m & 1) { ao1 += lda; b += 8; } } else { #ifdef UNIT data09 = *(ao2 + 0); data10 = *(ao2 + 1); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data25 = *(ao4 + 0); data26 = *(ao4 + 1); if (i >= 2) { data19 = *(ao3 + 2); data20 = *(ao3 + 3); data27 = *(ao4 + 2); data28 = *(ao4 + 3); } if (i >= 3) { data29 = *(ao4 + 4); data30 = *(ao4 + 5); } b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data09; b[ 3] = data10; b[ 4] = data17; b[ 5] = data18; b[ 6] = data25; b[ 7] = data26; b += 8; if (i >= 2) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ONE; b[ 3] = ZERO; b[ 4] = data19; b[ 5] = data20; b[ 6] = 
data27; b[ 7] = data28; b += 8; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ONE; b[ 5] = ZERO; b[ 6] = data29; b[ 7] = data30; b += 8; } #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data25 = *(ao4 + 0); data26 = *(ao4 + 1); if (i >= 2) { data11 = *(ao2 + 2); data12 = *(ao2 + 3); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data27 = *(ao4 + 2); data28 = *(ao4 + 3); } if (i >= 3) { data21 = *(ao3 + 4); data22 = *(ao3 + 5); data29 = *(ao4 + 4); data30 = *(ao4 + 5); } b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; b[ 3] = data10; b[ 4] = data17; b[ 5] = data18; b[ 6] = data25; b[ 7] = data26; b += 8; if (i >= 2) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = data11; b[ 3] = data12; b[ 4] = data19; b[ 5] = data20; b[ 6] = data27; b[ 7] = data28; b += 8; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data21; b[ 5] = data22; b[ 6] = data29; b[ 7] = data30; b += 8; } #endif } } posY += 4; js --; } while (js > 0); } /* End of main loop */ if (n & 2){ X = posX; if (posX <= posY) { ao1 = a + posX * 2 + (posY + 0) * lda; ao2 = a + posX * 2 + (posY + 1) * lda; } else { ao1 = a + posY * 2 + (posX + 0) * lda; ao2 = a + posY * 2 + (posX + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; b[ 3] = data10; b[ 4] = data03; b[ 5] = data04; b[ 6] = data11; b[ 7] = data12; ao1 += 4; ao2 += 4; b += 8; } else if (X > posY) { ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } else { #ifdef UNIT data09 = *(ao2 + 0); data10 = *(ao2 + 1); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data09; b[ 3] = data10; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ONE; b[ 7] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; b[ 3] = data10; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = data11; b[ 7] = data12; #endif ao1 += 4; ao2 += 4; b += 8; } X += 2; i --; } while (i > 0); } i = (m & 1); if (i) { if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; b[ 3] = data10; ao1 += 2; ao2 += 2; b += 4; } else if (X > posY) { ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } else { #ifdef UNIT data09 = *(ao2 + 0); data10 = *(ao2 + 1); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data09; b[ 3] = data10; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; b[ 3] = data10; #endif ao1 += 2; ao2 += 2; b += 4; } } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posX * 2 + (posY + 0) * lda; } else { ao1 = a + posY * 2 + (posX + 0) * lda; } i = m; if (m > 0) { do { if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; ao1 += 2; b += 2; } else if (X > posY) { ao1 += lda; b += 2; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; #endif ao1 += 2; b += 2; } X += 1; i --; } while (i > 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrmm_uncopy_8.c000066400000000000000000000452451313527062700212130ustar00rootroot00000000000000/*********************************************************************/ /* 
Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, ii; BLASLONG X; FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; lda += lda; js = (n >> 3); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posX * 2 + (posY + 0) * lda; ao2 = a + posX * 2 + (posY + 1) * lda; ao3 = a + posX * 2 + (posY + 2) * lda; ao4 = a + posX * 2 + (posY + 3) * lda; ao5 = a + posX * 2 + (posY + 4) * lda; ao6 = a + posX * 2 + (posY + 5) * lda; ao7 = a + posX * 2 + (posY + 6) * lda; ao8 = a + posX * 2 + (posY + 7) * lda; } else { ao1 = a + posY * 2 + (posX + 0) * lda; ao2 = a + posY * 2 + (posX + 1) * lda; ao3 = a + posY * 2 + (posX + 2) * lda; ao4 = a + posY * 2 + (posX + 3) * lda; ao5 = a + posY * 2 + (posX + 4) * lda; ao6 = a + posY * 2 + (posX + 5) * lda; ao7 = a + posY * 2 + (posX + 6) * lda; ao8 = a + posY * 2 + (posX + 7) * lda; } i = (m >> 3); if (i > 0) { do { if (X < posY) { for (ii = 0; ii < 8; ii++){ b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); b[ 2] = *(ao2 + 0); b[ 3] = *(ao2 + 1); b[ 4] = *(ao3 + 0); b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); b[ 8] = *(ao5 + 0); b[ 9] = *(ao5 + 1); b[ 10] = *(ao6 + 0); b[ 11] = *(ao6 + 1); b[ 12] = *(ao7 + 0); b[ 13] = *(ao7 + 1); b[ 14] = *(ao8 + 0); b[ 15] = *(ao8 + 1); ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; ao5 += 2; ao6 += 2; ao7 += 2; ao8 += 2; b += 16; } } else if (X > posY) { ao1 += 8 * lda; ao2 += 8 * lda; ao3 += 8 * lda; ao4 += 8 * lda; ao5 += 8 * lda; ao6 += 8 * lda; ao7 += 8 * lda; ao8 += 8 * lda; b += 128; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); #endif b[ 2] = *(ao2 + 0); b[ 3] = *(ao2 + 1); b[ 4] = 
*(ao3 + 0); b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); b[ 8] = *(ao5 + 0); b[ 9] = *(ao5 + 1); b[ 10] = *(ao6 + 0); b[ 11] = *(ao6 + 1); b[ 12] = *(ao7 + 0); b[ 13] = *(ao7 + 1); b[ 14] = *(ao8 + 0); b[ 15] = *(ao8 + 1); b[ 16] = ZERO; b[ 17] = ZERO; #ifdef UNIT b[ 18] = ONE; b[ 19] = ZERO; #else b[ 18] = *(ao2 + 2); b[ 19] = *(ao2 + 3); #endif b[ 20] = *(ao3 + 2); b[ 21] = *(ao3 + 3); b[ 22] = *(ao4 + 2); b[ 23] = *(ao4 + 3); b[ 24] = *(ao5 + 2); b[ 25] = *(ao5 + 3); b[ 26] = *(ao6 + 2); b[ 27] = *(ao6 + 3); b[ 28] = *(ao7 + 2); b[ 29] = *(ao7 + 3); b[ 30] = *(ao8 + 2); b[ 31] = *(ao8 + 3); b[ 32] = ZERO; b[ 33] = ZERO; b[ 34] = ZERO; b[ 35] = ZERO; #ifdef UNIT b[ 36] = ONE; b[ 37] = ZERO; #else b[ 36] = *(ao3 + 4); b[ 37] = *(ao3 + 5); #endif b[ 38] = *(ao4 + 4); b[ 39] = *(ao4 + 5); b[ 40] = *(ao5 + 4); b[ 41] = *(ao5 + 5); b[ 42] = *(ao6 + 4); b[ 43] = *(ao6 + 5); b[ 44] = *(ao7 + 4); b[ 45] = *(ao7 + 5); b[ 46] = *(ao8 + 4); b[ 47] = *(ao8 + 5); b[ 48] = ZERO; b[ 49] = ZERO; b[ 50] = ZERO; b[ 51] = ZERO; b[ 52] = ZERO; b[ 53] = ZERO; #ifdef UNIT b[ 54] = ONE; b[ 55] = ZERO; #else b[ 54] = *(ao4 + 6); b[ 55] = *(ao4 + 7); #endif b[ 56] = *(ao5 + 6); b[ 57] = *(ao5 + 7); b[ 58] = *(ao6 + 6); b[ 59] = *(ao6 + 7); b[ 60] = *(ao7 + 6); b[ 61] = *(ao7 + 7); b[ 62] = *(ao8 + 6); b[ 63] = *(ao8 + 7); b[ 64] = ZERO; b[ 65] = ZERO; b[ 66] = ZERO; b[ 67] = ZERO; b[ 68] = ZERO; b[ 69] = ZERO; b[ 70] = ZERO; b[ 71] = ZERO; #ifdef UNIT b[ 72] = ONE; b[ 73] = ZERO; #else b[ 72] = *(ao5 + 8); b[ 73] = *(ao5 + 9); #endif b[ 74] = *(ao6 + 8); b[ 75] = *(ao6 + 9); b[ 76] = *(ao7 + 8); b[ 77] = *(ao7 + 9); b[ 78] = *(ao8 + 8); b[ 79] = *(ao8 + 9); b[ 80] = ZERO; b[ 81] = ZERO; b[ 82] = ZERO; b[ 83] = ZERO; b[ 84] = ZERO; b[ 85] = ZERO; b[ 86] = ZERO; b[ 87] = ZERO; b[ 88] = ZERO; b[ 89] = ZERO; #ifdef UNIT b[ 90] = ONE; b[ 91] = ZERO; #else b[ 90] = *(ao6 + 10); b[ 91] = *(ao6 + 11); #endif b[ 92] = *(ao7 + 10); b[ 93] = *(ao7 + 11); b[ 94] = *(ao8 + 10); b[ 95] = *(ao8 + 11); b[ 96] = ZERO; b[ 97] = ZERO; b[ 98] = ZERO; b[ 99] = ZERO; b[100] = ZERO; b[101] = ZERO; b[102] = ZERO; b[103] = ZERO; b[104] = ZERO; b[105] = ZERO; b[106] = ZERO; b[107] = ZERO; #ifdef UNIT b[108] = ONE; b[109] = ZERO; #else b[108] = *(ao7 + 12); b[109] = *(ao7 + 13); #endif b[110] = *(ao8 + 12); b[111] = *(ao8 + 13); b[112] = ZERO; b[113] = ZERO; b[114] = ZERO; b[115] = ZERO; b[116] = ZERO; b[117] = ZERO; b[118] = ZERO; b[119] = ZERO; b[120] = ZERO; b[121] = ZERO; b[122] = ZERO; b[123] = ZERO; b[124] = ZERO; b[125] = ZERO; #ifdef UNIT b[126] = ONE; b[127] = ZERO; #else b[126] = *(ao8 + 14); b[127] = *(ao8 + 15); #endif ao1 += 8 * lda; ao2 += 8 * lda; ao3 += 8 * lda; ao4 += 8 * lda; ao5 += 8 * lda; ao6 += 8 * lda; ao7 += 8 * lda; ao8 += 8 * lda; b += 128; } X += 8; i --; } while (i > 0); } i = (m & 7); if (i) { if (X < posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); b[ 2] = *(ao2 + 0); b[ 3] = *(ao2 + 1); b[ 4] = *(ao3 + 0); b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); b[ 8] = *(ao5 + 0); b[ 9] = *(ao5 + 1); b[ 10] = *(ao6 + 0); b[ 11] = *(ao6 + 1); b[ 12] = *(ao7 + 0); b[ 13] = *(ao7 + 1); b[ 14] = *(ao8 + 0); b[ 15] = *(ao8 + 1); ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; ao5 += 2; ao6 += 2; ao7 += 2; ao8 += 2; b += 16; } } else if (X > posY) { ao1 += i * lda; ao2 += i * lda; ao3 += i * lda; ao4 += i * lda; ao5 += i * lda; ao6 += i * lda; ao7 += i * lda; ao8 += i * lda; b += 16 * i; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 
1); #endif b[ 2] = *(ao2 + 0); b[ 3] = *(ao2 + 1); b[ 4] = *(ao3 + 0); b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); b[ 8] = *(ao5 + 0); b[ 9] = *(ao5 + 1); b[10] = *(ao6 + 0); b[11] = *(ao6 + 1); b[12] = *(ao7 + 0); b[13] = *(ao7 + 1); b[14] = *(ao8 + 0); b[15] = *(ao8 + 1); b += 16; if(i >= 2) { b[ 0] = ZERO; b[ 1] = ZERO; #ifdef UNIT b[ 2] = ONE; b[ 3] = ZERO; #else b[ 2] = *(ao2 + 2); b[ 3] = *(ao2 + 3); #endif b[ 4] = *(ao3 + 2); b[ 5] = *(ao3 + 3); b[ 6] = *(ao4 + 2); b[ 7] = *(ao4 + 3); b[ 8] = *(ao5 + 2); b[ 9] = *(ao5 + 3); b[10] = *(ao6 + 2); b[11] = *(ao6 + 3); b[12] = *(ao7 + 2); b[13] = *(ao7 + 3); b[14] = *(ao8 + 2); b[15] = *(ao8 + 3); b += 16; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; #ifdef UNIT b[ 4] = ONE; b[ 5] = ZERO; #else b[ 4] = *(ao3 + 4); b[ 5] = *(ao3 + 5); #endif b[ 6] = *(ao4 + 4); b[ 7] = *(ao4 + 5); b[ 8] = *(ao5 + 4); b[ 9] = *(ao5 + 5); b[10] = *(ao6 + 4); b[11] = *(ao6 + 5); b[12] = *(ao7 + 4); b[13] = *(ao7 + 5); b[14] = *(ao8 + 4); b[15] = *(ao8 + 5); b += 16; } if (i >= 4) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; #ifdef UNIT b[ 6] = ONE; b[ 7] = ZERO; #else b[ 6] = *(ao4 + 6); b[ 7] = *(ao4 + 7); #endif b[ 8] = *(ao5 + 6); b[ 9] = *(ao5 + 7); b[10] = *(ao6 + 6); b[11] = *(ao6 + 7); b[12] = *(ao7 + 6); b[13] = *(ao7 + 7); b[14] = *(ao8 + 6); b[15] = *(ao8 + 7); b += 16; } if (i >= 5) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; #ifdef UNIT b[ 8] = ONE; b[ 9] = ZERO; #else b[ 8] = *(ao5 + 8); b[ 9] = *(ao5 + 9); #endif b[10] = *(ao6 + 8); b[11] = *(ao6 + 9); b[12] = *(ao7 + 8); b[13] = *(ao7 + 9); b[14] = *(ao8 + 8); b[15] = *(ao8 + 9); b += 16; } if (i >= 6) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; #ifdef UNIT b[10] = ONE; b[11] = ZERO; #else b[10] = *(ao6 + 10); b[11] = *(ao6 + 11); #endif b[12] = *(ao7 + 10); b[13] = *(ao7 + 11); b[14] = *(ao8 + 10); b[15] = *(ao8 + 11); b += 16; } if (i >= 7) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; #ifdef UNIT b[12] = ONE; b[13] = ZERO; #else b[12] = *(ao7 + 12); b[13] = *(ao7 + 13); #endif b[14] = *(ao8 + 12); b[15] = *(ao8 + 13); b += 16; } } } posY += 8; js --; } while (js > 0); } /* End of main loop */ if (n & 4){ X = posX; if (posX <= posY) { ao1 = a + posX * 2 + (posY + 0) * lda; ao2 = a + posX * 2 + (posY + 1) * lda; ao3 = a + posX * 2 + (posY + 2) * lda; ao4 = a + posX * 2 + (posY + 3) * lda; } else { ao1 = a + posY * 2 + (posX + 0) * lda; ao2 = a + posY * 2 + (posX + 1) * lda; ao3 = a + posY * 2 + (posX + 2) * lda; ao4 = a + posY * 2 + (posX + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X < posY) { for (ii = 0; ii < 4; ii++){ b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); b[ 2] = *(ao2 + 0); b[ 3] = *(ao2 + 1); b[ 4] = *(ao3 + 0); b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } } else if (X > posY) { ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 32; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); #endif b[ 2] = *(ao2 + 0); b[ 3] = *(ao2 + 1); b[ 4] = *(ao3 + 0); b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); b[ 8] = ZERO; b[ 9] = ZERO; #ifdef UNIT b[ 10] = ONE; b[ 11] = ZERO; #else b[ 10] 
= *(ao2 + 2); b[ 11] = *(ao2 + 3); #endif b[ 12] = *(ao3 + 2); b[ 13] = *(ao3 + 3); b[ 14] = *(ao4 + 2); b[ 15] = *(ao4 + 3); b[ 16] = ZERO; b[ 17] = ZERO; b[ 18] = ZERO; b[ 19] = ZERO; #ifdef UNIT b[ 20] = ONE; b[ 21] = ZERO; #else b[ 20] = *(ao3 + 4); b[ 21] = *(ao3 + 5); #endif b[ 22] = *(ao4 + 4); b[ 23] = *(ao4 + 5); b[ 24] = ZERO; b[ 25] = ZERO; b[ 26] = ZERO; b[ 27] = ZERO; b[ 28] = ZERO; b[ 29] = ZERO; #ifdef UNIT b[ 30] = ONE; b[ 31] = ZERO; #else b[ 30] = *(ao4 + 6); b[ 31] = *(ao4 + 7); #endif ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 32; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i) { if (X < posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); b[ 2] = *(ao2 + 0); b[ 3] = *(ao2 + 1); b[ 4] = *(ao3 + 0); b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } } else if (X > posY) { ao1 += i * lda; ao2 += i * lda; ao3 += i * lda; ao4 += i * lda; b += 8 * i; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); #endif b[ 2] = *(ao2 + 0); b[ 3] = *(ao2 + 1); b[ 4] = *(ao3 + 0); b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); b += 8; if(i >= 2) { b[ 0] = ZERO; b[ 1] = ZERO; #ifdef UNIT b[ 2] = ONE; b[ 3] = ZERO; #else b[ 2] = *(ao2 + 2); b[ 3] = *(ao2 + 3); #endif b[ 4] = *(ao3 + 2); b[ 5] = *(ao3 + 3); b[ 6] = *(ao4 + 2); b[ 7] = *(ao4 + 3); b += 8; } if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; #ifdef UNIT b[ 4] = ONE; b[ 5] = ZERO; #else b[ 4] = *(ao3 + 4); b[ 5] = *(ao3 + 5); #endif b[ 6] = *(ao4 + 4); b[ 7] = *(ao4 + 5); b += 8; } } } posY += 4; } if (n & 2){ X = posX; if (posX <= posY) { ao1 = a + posX * 2 + (posY + 0) * lda; ao2 = a + posX * 2 + (posY + 1) * lda; } else { ao1 = a + posY * 2 + (posX + 0) * lda; ao2 = a + posY * 2 + (posX + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X < posY) { b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); b[ 2] = *(ao2 + 0); b[ 3] = *(ao2 + 1); b[ 4] = *(ao1 + 2); b[ 5] = *(ao1 + 3); b[ 6] = *(ao2 + 2); b[ 7] = *(ao2 + 3); ao1 += 4; ao2 += 4; b += 8; } else if (X > posY) { ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); #endif b[ 2] = *(ao2 + 0); b[ 3] = *(ao2 + 1); b[ 4] = ZERO; b[ 5] = ZERO; #ifdef UNIT b[ 6] = ONE; b[ 7] = ZERO; #else b[ 6] = *(ao2 + 2); b[ 7] = *(ao2 + 3); #endif ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } X += 2; i --; } while (i > 0); } if (m & 1) { if (X < posY) { b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); b[ 2] = *(ao2 + 0); b[ 3] = *(ao2 + 1); ao1 += 2; ao2 += 2; b += 4; } else if (X > posY) { ao1 += 2 * lda; ao2 += 2 * lda; b += 4; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = *(ao2 + 0); b[ 3] = *(ao2 + 1); #else b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); b[ 2] = *(ao2 + 0); b[ 3] = *(ao2 + 1); #endif b += 2; } } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posX * 2 + (posY + 0) * lda; } else { ao1 = a + posY * 2 + (posX + 0) * lda; } i = m; if (m > 0) { do { if (X < posY) { b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); ao1 += 2; b += 2; } else if (X > posY) { ao1 += lda; b += 2; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); #endif ao1 += lda; b += 2; } X += 1; i --; } while (i > 0); } } return 0; } 
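/* ------------------------------------------------------------------------- */
/* Editorial note (not part of the OpenBLAS sources): the ztrmm_uncopy_{1,2,  */
/* 4,8}.c kernels above and the ztrmm_utcopy_*.c kernels that follow all      */
/* perform the same packing step used by the complex TRMM routines: an m x n  */
/* window of a column-major triangular matrix A, stored as interleaved        */
/* (real, imag) pairs, is copied into a contiguous buffer b, with ONE/ZERO    */
/* substituted on the diagonal when UNIT is defined, and the unreferenced     */
/* triangle either zero-filled (inside diagonal blocks) or skipped outright.  */
/* The standalone sketch below restates the logic of the simplest (_1)        */
/* variant; the function name, the unit_diag flag and the small main() are    */
/* illustrative additions, not OpenBLAS code.                                  */
/* ------------------------------------------------------------------------- */

#include <stdio.h>

/* Pack the window A(posX .. posX+m-1, posY .. posY+n-1) of an upper          */
/* triangular complex matrix held column-major with leading dimension lda.    */
static void ztrmm_uncopy_sketch(long m, long n, const double *a, long lda,
                                long posX, long posY, double *b, int unit_diag)
{
  long lda2 = lda * 2;                      /* two doubles per complex entry */

  for (long j = 0; j < n; j++) {
    long col = posY + j;                    /* current column of the window  */
    /* same pointer choice as the kernels: start from the stored triangle    */
    const double *ao = (posX <= col) ? a + posX * 2 + col  * lda2
                                     : a + col  * 2 + posX * lda2;
    for (long X = posX; X < posX + m; X++) {
      if (X < col) {                        /* above the diagonal: copy      */
        b[0] = ao[0]; b[1] = ao[1];
        ao += 2;                            /* step down the column          */
      } else if (X > col) {                 /* below: never referenced       */
        ao += lda2;                         /* skip, b slot is left as-is    */
      } else {                              /* diagonal element              */
        if (unit_diag) { b[0] = 1.0; b[1] = 0.0; }
        else           { b[0] = ao[0]; b[1] = ao[1]; }
        ao += lda2;
      }
      b += 2;
    }
  }
}

/* Tiny usage example: pack a 3x3 upper triangular complex matrix.            */
int main(void)
{
  double a[3 * 3 * 2] = {                   /* column-major, (re,im) pairs   */
    1, 0,   0, 0,   0, 0,                   /* column 0                      */
    2, 1,   3, 0,   0, 0,                   /* column 1                      */
    4, 2,   5, 1,   6, 0,                   /* column 2                      */
  };
  double b[3 * 3 * 2] = { 0 };

  ztrmm_uncopy_sketch(3, 3, a, 3, 0, 0, b, 0);

  for (int k = 0; k < 9; k++)
    printf("(%g,%g) ", b[2 * k], b[2 * k + 1]);
  printf("\n");                             /* (1,0) (0,0) (0,0) (2,1) ...   */
  return 0;
}
/* ------------------------------------------------------------------------- */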
OpenBLAS-0.2.20/kernel/generic/ztrmm_utcopy_1.c000066400000000000000000000071511313527062700212040ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02; FLOAT *ao1; lda += lda; js = n; if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posX * 2 + (posY + 0) * lda; } else { ao1 = a + posY * 2 + (posX + 0) * lda; } i = m; if (i > 0) { do { if (X < posY) { ao1 += 2; b += 2; } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; ao1 += lda; b += 2; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; #endif ao1 += lda; b += 2; } X ++; i --; } while (i > 0); } posY ++; js --; } while (js > 0); } /* End of main loop */ return 0; } OpenBLAS-0.2.20/kernel/generic/ztrmm_utcopy_2.c000066400000000000000000000135511313527062700212060ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data1, data2, data3, data4, data5, data6, data7, data8; FLOAT *ao1, *ao2; lda += lda; js = (n >> 1); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posX * 2 + (posY + 0) * lda; ao2 = a + posX * 2 + (posY + 1) * lda; } else { ao1 = a + posY * 2 + (posX + 0) * lda; ao2 = a + posY * 2 + (posX + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X < posY) { ao1 += 4; ao2 += 4; b += 8; } else if (X > posY) { data1 = *(ao1 + 0); data2 = *(ao1 + 1); data3 = *(ao1 + 2); data4 = *(ao1 + 3); data5 = *(ao2 + 0); data6 = *(ao2 + 1); data7 = *(ao2 + 2); data8 = *(ao2 + 3); b[ 0] = data1; b[ 1] = data2; b[ 2] = data3; b[ 3] = data4; b[ 4] = data5; b[ 5] = data6; b[ 6] = data7; b[ 7] = data8; ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } else { #ifdef UNIT data5 = *(ao2 + 0); data6 = *(ao2 + 1); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data5; b[ 5] = data6; b[ 6] = ONE; b[ 7] = ZERO; #else data1 = *(ao1 + 0); data2 = *(ao1 + 1); data5 = *(ao2 + 0); data6 = *(ao2 + 1); data7 = *(ao2 + 2); data8 = *(ao2 + 3); b[ 0] = data1; b[ 1] = data2; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data5; b[ 5] = data6; b[ 6] = data7; b[ 7] = data8; #endif ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } X += 2; i --; } while (i > 0); } if (m & 1) { if (X < posY) { ao1 += 2; ao2 += 2; b += 4; } else if (X > posY) { data1 = *(ao1 + 0); data2 = *(ao1 + 1); data3 = *(ao1 + 2); data4 = *(ao1 + 3); b[ 0] = data1; b[ 1] = data2; b[ 2] = data3; b[ 3] = data4; ao1 += lda; b += 4; } else { #ifdef UNIT data5 = *(ao2 + 0); data6 = *(ao2 + 1); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data5; b[ 3] = data6; #else data1 = *(ao1 + 0); data2 = *(ao1 + 1); data5 = *(ao2 + 0); data6 = *(ao2 + 1); b[ 0] = data1; b[ 1] = data2; b[ 2] = data5; b[ 3] = data6; #endif b += 4; } } posY += 2; js --; } while (js > 0); } /* End of main loop */ if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posX * 2 + (posY + 0) * lda; } else { ao1 = a + posY * 2 + (posX + 0) * lda; } i = m; if (m > 0) { do { if (X < 
posY) { ao1 += 2; b += 2; } else if (X > posY) { data1 = *(ao1 + 0); data2 = *(ao1 + 1); b[ 0] = data1; b[ 1] = data2; ao1 += lda; b += 2; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else data1 = *(ao1 + 0); data2 = *(ao1 + 1); b[ 0] = data1; b[ 1] = data2; #endif ao1 += lda; b += 2; } X += 1; i --; } while (i > 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrmm_utcopy_4.c000066400000000000000000000337701313527062700212150ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js; BLASLONG X; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; FLOAT data09, data10, data11, data12, data13, data14, data15, data16; FLOAT data17, data18, data19, data20, data21, data22, data23, data24; FLOAT data25, data26, data27, data28, data29, data30, data31, data32; FLOAT *ao1, *ao2, *ao3, *ao4; lda += lda; js = (n >> 2); if (js > 0){ do { X = posX; if (posX <= posY) { ao1 = a + posX * 2 + (posY + 0) * lda; ao2 = a + posX * 2 + (posY + 1) * lda; ao3 = a + posX * 2 + (posY + 2) * lda; ao4 = a + posX * 2 + (posY + 3) * lda; } else { ao1 = a + posY * 2 + (posX + 0) * lda; ao2 = a + posY * 2 + (posX + 1) * lda; ao3 = a + posY * 2 + (posX + 2) * lda; ao4 = a + posY * 2 + (posX + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X < posY) { ao1 += 8; ao2 += 8; ao3 += 8; ao4 += 8; b += 32; } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data21 = *(ao3 + 4); data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); data29 = *(ao4 + 4); data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; b[16] = data17; b[17] = data18; b[18] = data19; b[19] = data20; b[20] = data21; b[21] = data22; b[22] = data23; b[23] = data24; b[24] = data25; b[25] = data26; b[26] = data27; b[27] = data28; b[28] = data29; b[29] = data30; b[30] = data31; b[31] = data32; ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 32; } else { #ifdef UNIT data09 = *(ao2 + 0); data10 = *(ao2 + 1); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); data29 = *(ao4 + 4); data30 = *(ao4 + 5); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = data09; b[ 9] = data10; b[10] = ONE; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b[16] = data17; b[17] = data18; b[18] = data19; b[19] = data20; b[20] = ONE; b[21] = ZERO; b[22] = ZERO; b[23] = ZERO; b[24] = data25; b[25] = data26; b[26] = data27; b[27] = data28; b[28] = data29; b[29] = data30; b[30] = ONE; b[31] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data21 = *(ao3 + 4); data22 = *(ao3 + 5); data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); data29 = *(ao4 + 4); data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); b[ 0] = data01; b[ 1] = data02; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = 
ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b[16] = data17; b[17] = data18; b[18] = data19; b[19] = data20; b[20] = data21; b[21] = data22; b[22] = ZERO; b[23] = ZERO; b[24] = data25; b[25] = data26; b[26] = data27; b[27] = data28; b[28] = data29; b[29] = data30; b[30] = data31; b[31] = data32; #endif ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; b += 32; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i) { if (X < posY) { if (m & 2) { ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } if (m & 1) { ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } } else if (X > posY) { if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; b[ 8] = data09; b[ 9] = data10; b[10] = data11; b[11] = data12; b[12] = data13; b[13] = data14; b[14] = data15; b[15] = data16; ao1 += 2 * lda; ao2 += 2 * lda; b += 16; } if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data05 = *(ao1 + 4); data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data05; b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; ao1 += lda; b += 8; } } else { #ifdef UNIT if (i >= 2) { data09 = *(ao2 + 0); data10 = *(ao2 + 1); } if (i >= 3) { data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); } b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; if (i >= 2) { b[ 0] = data09; b[ 1] = data10; b[ 2] = ONE; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 3) { b[ 0] = data17; b[ 1] = data18; b[ 2] = data19; b[ 3] = data20; b[ 4] = ONE; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); if (i >= 2) { data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); } if (i >= 3) { data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data21 = *(ao3 + 4); data22 = *(ao3 + 5); } b[ 0] = data01; b[ 1] = data02; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; if (i >= 2) { b[ 0] = data09; b[ 1] = data10; b[ 2] = data11; b[ 3] = data12; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 3) { b[ 0] = data17; b[ 1] = data18; b[ 2] = data19; b[ 3] = data20; b[ 4] = data21; b[ 5] = data22; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } #endif } } posY += 4; js --; } while (js > 0); } /* End of main loop */ if (n & 2){ X = posX; if (posX <= posY) { ao1 = a + posX * 2 + (posY + 0) * lda; ao2 = a + posX * 2 + (posY + 1) * lda; } else { ao1 = a + posY * 2 + (posX + 0) * lda; ao2 = a + posY * 2 + (posX + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X < posY) { ao1 += 4; ao2 += 4; b += 8; } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); b[ 0] = data01; b[ 
1] = data02; b[ 2] = data03; b[ 3] = data04; b[ 4] = data09; b[ 5] = data10; b[ 6] = data11; b[ 7] = data12; ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } else { #ifdef UNIT data09 = *(ao2 + 0); data10 = *(ao2 + 1); b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data09; b[ 5] = data10; b[ 6] = ONE; b[ 7] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = data09; b[ 5] = data10; b[ 6] = data11; b[ 7] = data12; #endif ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } X += 2; i --; } while (i > 0); } i = (m & 1); if (i) { if (X < posY) { b += 4; } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; b[ 2] = ZERO; b[ 3] = ZERO; #endif b += 4; } } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { ao1 = a + posX * 2 + (posY + 0) * lda; } else { ao1 = a + posY * 2 + (posX + 0) * lda; } i = m; if (m > 0) { do { if (X < posY) { b += 2; ao1 += 2; } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; ao1 += lda; b += 2; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); b[ 0] = data01; b[ 1] = data02; #endif b += 2; ao1 += lda; } X += 1; i --; } while (i > 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrmm_utcopy_8.c000066400000000000000000000444761313527062700212260ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ BLASLONG i, js, ii; BLASLONG X; FLOAT *a01, *a02, *a03 ,*a04, *a05, *a06, *a07, *a08; lda *= 2; js = (n >> 3); if (js > 0){ do { X = posX; if (posX <= posY) { a01 = a + posX * 2 + (posY + 0) * lda; a02 = a + posX * 2 + (posY + 1) * lda; a03 = a + posX * 2 + (posY + 2) * lda; a04 = a + posX * 2 + (posY + 3) * lda; a05 = a + posX * 2 + (posY + 4) * lda; a06 = a + posX * 2 + (posY + 5) * lda; a07 = a + posX * 2 + (posY + 6) * lda; a08 = a + posX * 2 + (posY + 7) * lda; } else { a01 = a + posY * 2 + (posX + 0) * lda; a02 = a + posY * 2 + (posX + 1) * lda; a03 = a + posY * 2 + (posX + 2) * lda; a04 = a + posY * 2 + (posX + 3) * lda; a05 = a + posY * 2 + (posX + 4) * lda; a06 = a + posY * 2 + (posX + 5) * lda; a07 = a + posY * 2 + (posX + 6) * lda; a08 = a + posY * 2 + (posX + 7) * lda; } i = (m >> 3); if (i > 0) { do { if (X < posY) { a01 += 16; a02 += 16; a03 += 16; a04 += 16; a05 += 16; a06 += 16; a07 += 16; a08 += 16; b += 128; } else if (X > posY) { for (ii = 0; ii < 8; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); b[ 8] = *(a01 + 8); b[ 9] = *(a01 + 9); b[ 10] = *(a01 + 10); b[ 11] = *(a01 + 11); b[ 12] = *(a01 + 12); b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); a01 += lda; b += 16; } a02 += 8 * lda; a03 += 8 * lda; a04 += 8 * lda; a05 += 8 * lda; a06 += 8 * lda; a07 += 8 * lda; a08 += 8 * lda; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); #endif b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[ 10] = ZERO; b[ 11] = ZERO; b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; b[ 16] = *(a02 + 0); b[ 17] = *(a02 + 1); #ifdef UNIT b[ 18] = ONE; b[ 19] = ZERO; #else b[ 18] = *(a02 + 2); b[ 19] = *(a02 + 3); #endif b[ 20] = ZERO; b[ 21] = ZERO; b[ 22] = ZERO; b[ 23] = ZERO; b[ 24] = ZERO; b[ 25] = ZERO; b[ 26] = ZERO; b[ 27] = ZERO; b[ 28] = ZERO; b[ 29] = ZERO; b[ 30] = ZERO; b[ 31] = ZERO; b[ 32] = *(a03 + 0); b[ 33] = *(a03 + 1); b[ 34] = *(a03 + 2); b[ 35] = *(a03 + 3); #ifdef UNIT b[ 36] = ONE; b[ 37] = ZERO; #else b[ 36] = *(a03 + 4); b[ 37] = *(a03 + 5); #endif b[ 38] = ZERO; b[ 39] = ZERO; b[ 40] = ZERO; b[ 41] = ZERO; b[ 42] = ZERO; b[ 43] = ZERO; b[ 44] = ZERO; b[ 45] = ZERO; b[ 46] = ZERO; b[ 47] = ZERO; b[ 48] = *(a04 + 0); b[ 49] = *(a04 + 1); b[ 50] = *(a04 + 2); b[ 51] = *(a04 + 3); b[ 52] = *(a04 + 4); b[ 53] = *(a04 + 5); #ifdef UNIT b[ 54] = ONE; b[ 55] = ZERO; #else b[ 54] = *(a04 + 6); b[ 55] = *(a04 + 7); #endif b[ 56] = ZERO; b[ 57] = ZERO; b[ 58] = ZERO; b[ 59] = ZERO; b[ 60] = ZERO; b[ 61] = ZERO; b[ 62] = ZERO; b[ 63] = ZERO; b[ 64] = *(a05 + 0); b[ 65] = *(a05 + 1); b[ 66] = *(a05 + 2); b[ 67] = *(a05 + 3); b[ 68] = *(a05 + 4); b[ 69] = *(a05 + 5); b[ 70] = *(a05 + 6); b[ 71] = *(a05 + 7); #ifdef UNIT b[ 72] = ONE; b[ 73] = ZERO; #else b[ 72] = *(a05 + 8); b[ 73] = *(a05 + 9); #endif b[ 74] = ZERO; b[ 75] = ZERO; b[ 76] = ZERO; b[ 77] = ZERO; b[ 78] = ZERO; b[ 
79] = ZERO; b[ 80] = *(a06 + 0); b[ 81] = *(a06 + 1); b[ 82] = *(a06 + 2); b[ 83] = *(a06 + 3); b[ 84] = *(a06 + 4); b[ 85] = *(a06 + 5); b[ 86] = *(a06 + 6); b[ 87] = *(a06 + 7); b[ 88] = *(a06 + 8); b[ 89] = *(a06 + 9); #ifdef UNIT b[ 90] = ONE; b[ 91] = ZERO; #else b[ 90] = *(a06 + 10); b[ 91] = *(a06 + 11); #endif b[ 92] = ZERO; b[ 93] = ZERO; b[ 94] = ZERO; b[ 95] = ZERO; b[ 96] = *(a07 + 0); b[ 97] = *(a07 + 1); b[ 98] = *(a07 + 2); b[ 99] = *(a07 + 3); b[100] = *(a07 + 4); b[101] = *(a07 + 5); b[102] = *(a07 + 6); b[103] = *(a07 + 7); b[104] = *(a07 + 8); b[105] = *(a07 + 9); b[106] = *(a07 + 10); b[107] = *(a07 + 11); #ifdef UNIT b[108] = ONE; b[109] = ZERO; #else b[108] = *(a07 + 12); b[109] = *(a07 + 13); #endif b[110] = ZERO; b[111] = ZERO; b[112] = *(a08 + 0); b[113] = *(a08 + 1); b[114] = *(a08 + 2); b[115] = *(a08 + 3); b[116] = *(a08 + 4); b[117] = *(a08 + 5); b[118] = *(a08 + 6); b[119] = *(a08 + 7); b[120] = *(a08 + 8); b[121] = *(a08 + 9); b[122] = *(a08 + 10); b[123] = *(a08 + 11); b[124] = *(a08 + 12); b[125] = *(a08 + 13); #ifdef UNIT b[126] = ONE; b[127] = ZERO; #else b[126] = *(a08 + 14); b[127] = *(a08 + 15); #endif a01 += 8 * lda; a02 += 8 * lda; a03 += 8 * lda; a04 += 8 * lda; a05 += 8 * lda; a06 += 8 * lda; a07 += 8 * lda; a08 += 8 * lda; b += 128; } X += 8; i --; } while (i > 0); } i = (m & 7); if (i) { if (X < posY) { a01 += 2 * i; a02 += 2 * i; a03 += 2 * i; a04 += 2 * i; a05 += 2 * i; a06 += 2 * i; a07 += 2 * i; a08 += 2 * i; b += 16 * i; } else if (X > posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); b[ 8] = *(a01 + 8); b[ 9] = *(a01 + 9); b[ 10] = *(a01 + 10); b[ 11] = *(a01 + 11); b[ 12] = *(a01 + 12); b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); a01 += lda; a02 += lda; a03 += lda; a04 += lda; a05 += lda; a06 += lda; a07 += lda; a08 += lda; b += 16; } } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); #endif b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b += 16; if(i >= 2) { b[ 0] = *(a02 + 0); b[ 1] = *(a02 + 1); #ifdef UNIT b[ 2] = ONE; b[ 3] = ZERO; #else b[ 2] = *(a02 + 2); b[ 3] = *(a02 + 3); #endif b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b += 16; } if (i >= 3) { b[ 0] = *(a03 + 0); b[ 1] = *(a03 + 1); b[ 2] = *(a03 + 2); b[ 3] = *(a03 + 3); #ifdef UNIT b[ 4] = ONE; b[ 5] = ZERO; #else b[ 4] = *(a03 + 4); b[ 5] = *(a03 + 5); #endif b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b += 16; } if (i >= 4) { b[ 0] = *(a04 + 0); b[ 1] = *(a04 + 1); b[ 2] = *(a04 + 2); b[ 3] = *(a04 + 3); b[ 4] = *(a04 + 4); b[ 5] = *(a04 + 5); #ifdef UNIT b[ 6] = ONE; b[ 7] = ZERO; #else b[ 6] = *(a04 + 6); b[ 7] = *(a04 + 7); #endif b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b += 16; } if (i >= 5) { b[ 0] = *(a05 + 0); b[ 1] = *(a05 + 1); b[ 2] = *(a05 + 2); b[ 3] = *(a05 + 3); b[ 4] = *(a05 + 4); b[ 5] = *(a05 + 5); b[ 6] = *(a05 + 6); b[ 7] = *(a05 + 7); #ifdef UNIT b[ 8] = ONE; b[ 9] = ZERO; #else b[ 8] = *(a05 + 8); b[ 9] = *(a05 + 
9); #endif b[10] = ZERO; b[11] = ZERO; b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b += 16; } if (i >= 6) { b[ 0] = *(a06 + 0); b[ 1] = *(a06 + 1); b[ 2] = *(a06 + 2); b[ 3] = *(a06 + 3); b[ 4] = *(a06 + 4); b[ 5] = *(a06 + 5); b[ 6] = *(a06 + 6); b[ 7] = *(a06 + 7); b[ 8] = *(a06 + 8); b[ 9] = *(a06 + 9); #ifdef UNIT b[10] = ONE; b[11] = ZERO; #else b[10] = *(a06 + 10); b[11] = *(a06 + 11); #endif b[12] = ZERO; b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; b += 16; } if (i >= 7) { b[ 0] = *(a07 + 0); b[ 1] = *(a07 + 1); b[ 2] = *(a07 + 2); b[ 3] = *(a07 + 3); b[ 4] = *(a07 + 4); b[ 5] = *(a07 + 5); b[ 6] = *(a07 + 6); b[ 7] = *(a07 + 7); b[ 8] = *(a07 + 8); b[ 9] = *(a07 + 9); b[10] = *(a07 + 10); b[11] = *(a07 + 11); #ifdef UNIT b[12] = ONE; b[13] = ZERO; #else b[12] = *(a07 + 12); b[13] = *(a07 + 13); #endif b[14] = ZERO; b[15] = ZERO; b += 16; } } } posY += 8; js --; } while (js > 0); } /* End of main loop */ if (n & 4){ X = posX; if (posX <= posY) { a01 = a + posX * 2 + (posY + 0) * lda; a02 = a + posX * 2 + (posY + 1) * lda; a03 = a + posX * 2 + (posY + 2) * lda; a04 = a + posX * 2 + (posY + 3) * lda; } else { a01 = a + posY * 2 + (posX + 0) * lda; a02 = a + posY * 2 + (posX + 1) * lda; a03 = a + posY * 2 + (posX + 2) * lda; a04 = a + posY * 2 + (posX + 3) * lda; } i = (m >> 2); if (i > 0) { do { if (X < posY) { a01 += 8; a02 += 8; a03 += 8; a04 += 8; b += 32; } else if (X > posY) { for (ii = 0; ii < 4; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); a01 += lda; b += 8; } a02 += 4 * lda; a03 += 4 * lda; a04 += 4 * lda; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); #endif b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b[ 8] = *(a02 + 0); b[ 9] = *(a02 + 1); #ifdef UNIT b[ 10] = ONE; b[ 11] = ZERO; #else b[ 10] = *(a02 + 2); b[ 11] = *(a02 + 3); #endif b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; b[ 16] = *(a03 + 0); b[ 17] = *(a03 + 1); b[ 18] = *(a03 + 2); b[ 19] = *(a03 + 3); #ifdef UNIT b[ 20] = ONE; b[ 21] = ZERO; #else b[ 20] = *(a03 + 4); b[ 21] = *(a03 + 5); #endif b[ 22] = ZERO; b[ 23] = ZERO; b[ 24] = *(a04 + 0); b[ 25] = *(a04 + 1); b[ 26] = *(a04 + 2); b[ 27] = *(a04 + 3); b[ 28] = *(a04 + 4); b[ 29] = *(a04 + 5); #ifdef UNIT b[ 30] = ONE; b[ 31] = ZERO; #else b[ 30] = *(a04 + 6); b[ 31] = *(a04 + 7); #endif a01 += 4 * lda; a02 += 4 * lda; a03 += 4 * lda; a04 += 4 * lda; b += 32; } X += 4; i --; } while (i > 0); } i = (m & 3); if (i) { if (X < posY) { a01 += 2 * i; a02 += 2 * i; a03 += 2 * i; a04 += 2 * i; b += 8 * i; } else if (X > posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a01 + 4); b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); a01 += lda; a02 += lda; a03 += lda; a04 += lda; b += 8; } } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); #endif b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; if(i >= 2) { b[ 0] = *(a02 + 0); b[ 1] = *(a02 + 1); #ifdef UNIT b[ 2] = ONE; b[ 3] = ZERO; #else b[ 2] = *(a02 + 2); b[ 3] = *(a02 + 3); #endif b[ 4] = ZERO; b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } if (i >= 3) { b[ 0] = *(a03 + 0); b[ 1] = *(a03 + 1); b[ 2] = *(a03 + 2); b[ 3] = *(a03 + 3); #ifdef UNIT b[ 4] = ONE; b[ 5] = ZERO; #else b[ 4] = *(a03 + 4); b[ 5] = *(a03 + 
5); #endif b[ 6] = ZERO; b[ 7] = ZERO; b += 8; } } } posY += 4; } if (n & 2){ X = posX; if (posX <= posY) { a01 = a + posX * 2 + (posY + 0) * lda; a02 = a + posX * 2 + (posY + 1) * lda; } else { a01 = a + posY * 2 + (posX + 0) * lda; a02 = a + posY * 2 + (posX + 1) * lda; } i = (m >> 1); if (i > 0) { do { if (X < posY) { a01 += 4; a02 += 4; b += 8; } else if (X > posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b[ 4] = *(a02 + 0); b[ 5] = *(a02 + 1); b[ 6] = *(a02 + 2); b[ 7] = *(a02 + 3); a01 += 2 * lda; a02 += 2 * lda; b += 8; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); #endif b[ 2] = ZERO; b[ 3] = ZERO; b[ 4] = *(a02 + 0); b[ 5] = *(a02 + 1); #ifdef UNIT b[ 6] = ONE; b[ 7] = ZERO; #else b[ 6] = *(a02 + 2); b[ 7] = *(a02 + 3); #endif a01 += 2 * lda; a02 += 2 * lda; b += 8; } X += 2; i --; } while (i > 0); } i = (m & 1); if (i) { if (X < posY) { b += 4; } else if (X > posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b += 4; } } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); #endif b[ 2] = *(a02 + 0); b[ 3] = *(a02 + 1); b += 4; } posY += 2; } if (n & 1){ X = posX; if (posX <= posY) { a01 = a + posX * 2 + (posY + 0) * lda; } else { a01 = a + posY * 2 + (posX + 0) * lda; } i = m; if (m > 0) { do { if (X < posY) { a01 += 2; b += 2; } else if (X > posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); a01 += lda; b += 2; } else { #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); #endif a01 += lda; b += 2; } X += 1; i --; } while (i > 0); } } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrmmkernel_2x2.c000066400000000000000000000571111313527062700212560ustar00rootroot00000000000000#include "common.h" /******************************** ADD1 a*c ADD2 b*c ADD3 a*d ADD4 b*d *********************************/ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb, FLOAT* C,BLASLONG ldc, BLASLONG offset) { BLASLONG i,j,k; FLOAT *C0,*C1,*ptrba,*ptrbb; FLOAT res0,res1,res2,res3,res4,res5,res6,res7,load0,load1,load2,load3,load4,load5,load6,load7,load8,load9,load10,load11,load12,load13,load14,load15; BLASLONG off, temp; #if defined(TRMMKERNEL) && !defined(LEFT) off = -offset; #endif for (j=0; j #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02; FLOAT *a1; lda *= 2; jj = offset; j = n; while (j > 0){ a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif compinv(b + 0, data01, data02); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); *(b + 0) = data01; *(b + 1) = data02; } a1 += 2; b += 2; i --; ii ++; } a += lda; jj ++; j --; } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrsm_lncopy_2.c000066400000000000000000000110441313527062700211700ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04; FLOAT data05, data06, data07, data08; FLOAT *a1, *a2; lda *= 2; jj = offset; j = (n >> 1); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; i = (m >> 1); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data03 = *(a1 + 2); data04 = *(a1 + 3); #ifndef UNIT data07 = *(a2 + 2); data08 = *(a2 + 3); #endif compinv(b + 0, data01, data02); *(b + 4) = data03; *(b + 5) = data04; compinv(b + 6, data07, data08); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a2 + 0); data06 = *(a2 + 1); data07 = *(a2 + 2); data08 = *(a2 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data05; *(b + 3) = data06; *(b + 4) = data03; *(b + 5) = data04; *(b + 6) = data07; *(b + 7) = data08; } a1 += 4; a2 += 4; b += 8; i --; ii += 2; } if (m & 1) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif compinv(b + 0, data01, data02); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data05 = *(a2 + 0); data06 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data05; *(b + 3) = data06; } b += 4; } a += 2 * lda; jj += 2; j --; } if (n & 1) { a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif compinv(b + 0, data01, data02); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); *(b + 0) = data01; *(b + 1) = data02; } a1+= 2; b += 2; i --; ii += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrsm_lncopy_4.c000066400000000000000000000221661313527062700212010ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04; FLOAT data05, data06, data07, data08; FLOAT data09, data10, data11, data12; FLOAT data13, data14, data15, data16; FLOAT data17, data18, data19, data20; FLOAT data21, data22, data23, data24; FLOAT data25, data26, data27, data28; FLOAT data29, data30, data31, data32; FLOAT *a1, *a2, *a3, *a4; lda *= 2; jj = offset; j = (n >> 2); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; ii = 0; i = (m >> 2); while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); #ifndef UNIT data11 = *(a2 + 2); data12 = *(a2 + 3); #endif data13 = *(a2 + 4); data14 = *(a2 + 5); data15 = *(a2 + 6); data16 = *(a2 + 7); #ifndef UNIT data21 = *(a3 + 4); data22 = *(a3 + 5); #endif data23 = *(a3 + 6); data24 = *(a3 + 7); #ifndef UNIT data31 = *(a4 + 6); data32 = *(a4 + 7); #endif compinv(b + 0, data01, data02); *(b + 8) = data03; *(b + 9) = data04; compinv(b + 10, data11, data12); *(b + 16) = data05; *(b + 17) = data06; *(b + 18) = data13; *(b + 19) = data14; compinv(b + 20, data21, data22); *(b + 24) = data07; *(b + 25) = data08; *(b + 26) = data15; *(b + 27) = data16; *(b + 28) = data23; *(b + 29) = data24; compinv(b + 30, data31, data32); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data13 = *(a2 + 4); data14 = *(a2 + 5); data15 = *(a2 + 6); data16 = *(a2 + 7); data17 = *(a3 + 0); data18 = *(a3 + 1); data19 = *(a3 + 2); data20 = *(a3 + 3); data21 = *(a3 + 4); data22 = *(a3 + 5); data23 = *(a3 + 6); data24 = *(a3 + 7); 
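/* Annotation (editor's note, not in the original source): in this ii > jj
   branch the routine reads a full 4x4 tile of complex elements column by
   column from a1..a4 and, in the stores that follow, writes it into the
   packed buffer b with the element order transposed, so the TRSM kernel can
   walk b contiguously. */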
data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); data28 = *(a4 + 3); data29 = *(a4 + 4); data30 = *(a4 + 5); data31 = *(a4 + 6); data32 = *(a4 + 7); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data09; *(b + 3) = data10; *(b + 4) = data17; *(b + 5) = data18; *(b + 6) = data25; *(b + 7) = data26; *(b + 8) = data03; *(b + 9) = data04; *(b + 10) = data11; *(b + 11) = data12; *(b + 12) = data19; *(b + 13) = data20; *(b + 14) = data27; *(b + 15) = data28; *(b + 16) = data05; *(b + 17) = data06; *(b + 18) = data13; *(b + 19) = data14; *(b + 20) = data21; *(b + 21) = data22; *(b + 22) = data29; *(b + 23) = data30; *(b + 24) = data07; *(b + 25) = data08; *(b + 26) = data15; *(b + 27) = data16; *(b + 28) = data23; *(b + 29) = data24; *(b + 30) = data31; *(b + 31) = data32; } a1 += 8; a2 += 8; a3 += 8; a4 += 8; b += 32; i --; ii += 4; } if (m & 2) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data03 = *(a1 + 2); data04 = *(a1 + 3); #ifndef UNIT data11 = *(a2 + 2); data12 = *(a2 + 3); #endif compinv(b + 0, data01, data02); *(b + 4) = data03; *(b + 5) = data04; compinv(b + 6, data11, data12); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data17 = *(a3 + 0); data18 = *(a3 + 1); data19 = *(a3 + 2); data20 = *(a3 + 3); data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); data28 = *(a4 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data09; *(b + 3) = data10; *(b + 4) = data17; *(b + 5) = data18; *(b + 6) = data25; *(b + 7) = data26; *(b + 8) = data03; *(b + 9) = data04; *(b + 10) = data11; *(b + 11) = data12; *(b + 12) = data19; *(b + 13) = data20; *(b + 14) = data27; *(b + 15) = data28; } a1 += 4; a2 += 4; a3 += 4; a4 += 4; b += 16; ii += 2; } if (m & 1) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif compinv(b + 0, data01, data02); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data09 = *(a2 + 0); data10 = *(a2 + 1); data17 = *(a3 + 0); data18 = *(a3 + 1); data25 = *(a4 + 0); data26 = *(a4 + 1); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data09; *(b + 3) = data10; *(b + 4) = data17; *(b + 5) = data18; *(b + 6) = data25; *(b + 7) = data26; } a1 += 2; a2 += 2; a3 += 2; a4 += 2; b += 8; ii += 1; } a += 4 * lda; jj += 4; j --; } if (n & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; ii = 0; i = (m >> 1); while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data03 = *(a1 + 2); data04 = *(a1 + 3); #ifndef UNIT data11 = *(a2 + 2); data12 = *(a2 + 3); #endif compinv(b + 0, data01, data02); *(b + 4) = data03; *(b + 5) = data04; compinv(b + 6, data11, data12); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data09; *(b + 3) = data10; *(b + 4) = data03; *(b + 5) = data04; *(b + 6) = data11; *(b + 7) = data12; } a1 += 4; a2 += 4; b += 8; i --; ii += 2; } if (m & 1) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif compinv(b + 0, data01, data02); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data09 = *(a2 + 0); data10 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data09; *(b + 3) = data10; } a1 += 2; a2 += 2; b += 4; ii += 1; } a += 2 * lda; jj += 2; } if (n & 1) { a1 = a + 0 * lda; ii = 0; i = m; while (i > 0) { if (ii 
== jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif compinv(b + 0, data01, data02); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); *(b + 0) = data01; *(b + 1) = data02; } a1 += 2; b += 2; i --; ii += 1; } a += lda; jj += 1; } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrsm_lncopy_8.c000066400000000000000000000140421313527062700211770ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj, k; FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; FLOAT data1, data2; lda *= 2; jj = offset; j = (n >> 3); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; a5 = a + 4 * lda; a6 = a + 5 * lda; a7 = a + 6 * lda; a8 = a + 7 * lda; a += 8 * lda; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 8)) { for (k = 0; k < ii - jj; k ++) { *(b + k * 2 + 0) = *(a1 + k * lda + 0); *(b + k * 2 + 1) = *(a1 + k * lda + 1); } data1 = *(a1 + (ii - jj) * lda + 0); data2 = *(a1 + (ii - jj) * lda + 1); compinv(b + (ii - jj) * 2, data1, data2); } if (ii - jj >= 8) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); *(b + 2) = *(a2 + 0); *(b + 3) = *(a2 + 1); *(b + 4) = *(a3 + 0); *(b + 5) = *(a3 + 1); *(b + 6) = *(a4 + 0); *(b + 7) = *(a4 + 1); *(b + 8) = *(a5 + 0); *(b + 9) = *(a5 + 1); *(b + 10) = *(a6 + 0); *(b + 11) = *(a6 + 1); *(b + 12) = *(a7 + 0); *(b + 13) = *(a7 + 1); *(b + 14) = *(a8 + 0); *(b + 15) = *(a8 + 1); } a1 += 2; a2 += 2; a3 += 2; a4 += 2; a5 += 2; a6 += 2; a7 += 2; a8 += 2; b += 16; ii ++; } jj += 8; j --; } if (n & 4) { a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; a += 4 * lda; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 4)) { for (k = 0; k < ii - jj; k ++) { *(b + k * 2 + 0) = *(a1 + k * lda + 0); *(b + k * 2 + 1) = *(a1 + k * lda + 1); } data1 = *(a1 + (ii - jj) * lda + 0); data2 = *(a1 + (ii - jj) * lda + 1); compinv(b + (ii - jj) * 2, data1, data2); } if (ii - jj >= 4) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); *(b + 2) = *(a2 + 0); *(b + 3) = *(a2 + 1); *(b + 4) = *(a3 + 0); *(b + 5) = *(a3 + 1); *(b + 6) = *(a4 + 0); *(b + 7) = *(a4 + 1); } a1 += 2; a2 += 2; a3 += 2; a4 += 2; b += 8; ii ++; } jj += 4; } if (n & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; a += 2 * lda; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 2)) { for (k = 0; k < ii - jj; k ++) { *(b + k * 2 + 0) = *(a1 + k * lda + 0); *(b + k * 2 + 1) = *(a1 + k * lda + 1); } data1 = *(a1 + (ii - jj) * lda + 0); data2 = *(a1 + (ii - jj) * lda + 1); compinv(b + (ii - jj) * 2, data1, data2); } if (ii - jj >= 2) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); *(b + 2) = *(a2 + 0); *(b + 3) = *(a2 + 1); } a1 += 2; a2 += 2; b += 4; ii ++; } jj += 2; } if (n & 1) { a1 = a + 0 * lda; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 1)) { for (k = 0; k < ii - jj; k ++) { *(b + k * 2 + 0) = *(a1 + k * lda + 0); *(b + k * 2 + 1) = *(a1 + k * lda + 1); } data1 = *(a1 + (ii - jj) * lda + 0); data2 = *(a1 + (ii - jj) * lda + 1); compinv(b + (ii - jj) * 2, data1, data2); } if (ii - jj >= 1) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); } a1 += 2; b += 2; ii ++; } } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrsm_ltcopy_1.c000066400000000000000000000064121313527062700212000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02; FLOAT *a1; lda *= 2; jj = offset; j = n; while (j > 0){ a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif compinv(b + 0, data01, data02); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); *(b + 0) = data01; *(b + 1) = data02; } a1 += lda; b += 2; i --; ii ++; } a += 2; jj ++; j --; } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrsm_ltcopy_2.c000066400000000000000000000112071313527062700211770ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04; FLOAT data05, data06, data07, data08; FLOAT *a1, *a2; lda *= 2; jj = offset; j = (n >> 1); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; i = (m >> 1); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data03 = *(a1 + 2); data04 = *(a1 + 3); #ifndef UNIT data07 = *(a2 + 2); data08 = *(a2 + 3); #endif compinv(b + 0, data01, data02); *(b + 2) = data03; *(b + 3) = data04; compinv(b + 6, data07, data08); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a2 + 0); data06 = *(a2 + 1); data07 = *(a2 + 2); data08 = *(a2 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; } a1 += 2 * lda; a2 += 2 * lda; b += 8; i --; ii += 2; } if (m & 1) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data03 = *(a1 + 2); data04 = *(a1 + 3); compinv(b + 0, data01, data02); *(b + 2) = data03; *(b + 3) = data04; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } b += 4; } a += 4; jj += 2; j --; } if (n & 1) { a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif compinv(b + 0, data01, data02); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); *(b + 0) = data01; *(b + 1) = data02; } a1 += 1 * lda; b += 2; i --; ii += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrsm_ltcopy_4.c000066400000000000000000000233431313527062700212050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04; FLOAT data05, data06, data07, data08; FLOAT data09, data10, data11, data12; FLOAT data13, data14, data15, data16; FLOAT data17, data18, data19, data20; FLOAT data21, data22, data23, data24; FLOAT data25, data26, data27, data28; FLOAT data29, data30, data31, data32; FLOAT *a1, *a2, *a3, *a4; lda *= 2; jj = offset; j = (n >> 2); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; ii = 0; i = (m >> 2); while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); #ifndef UNIT data11 = *(a2 + 2); data12 = *(a2 + 3); #endif data13 = *(a2 + 4); data14 = *(a2 + 5); data15 = *(a2 + 6); data16 = *(a2 + 7); #ifndef UNIT data21 = *(a3 + 4); data22 = *(a3 + 5); #endif data23 = *(a3 + 6); data24 = *(a3 + 7); #ifndef UNIT data31 = *(a4 + 6); data32 = *(a4 + 7); #endif compinv(b + 0, data01, data02); *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; compinv(b + 10, data11, data12); *(b + 12) = data13; *(b + 13) = data14; *(b + 14) = data15; *(b + 15) = data16; compinv(b + 20, data21, data22); *(b + 22) = data23; *(b + 23) = data24; compinv(b + 30, data31, data32); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data13 = *(a2 + 4); data14 = *(a2 + 5); data15 = *(a2 + 6); data16 = *(a2 + 7); data17 = *(a3 + 0); data18 = *(a3 + 1); data19 = *(a3 + 2); data20 = *(a3 + 3); data21 = *(a3 + 4); data22 = *(a3 + 5); data23 = *(a3 + 6); data24 = *(a3 + 7); data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); data28 = *(a4 + 3); data29 = *(a4 + 4); data30 = *(a4 + 5); data31 = *(a4 + 6); data32 = *(a4 + 7); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; *(b + 8) = data09; *(b + 9) = data10; *(b + 10) = data11; *(b + 
11) = data12; *(b + 12) = data13; *(b + 13) = data14; *(b + 14) = data15; *(b + 15) = data16; *(b + 16) = data17; *(b + 17) = data18; *(b + 18) = data19; *(b + 19) = data20; *(b + 20) = data21; *(b + 21) = data22; *(b + 22) = data23; *(b + 23) = data24; *(b + 24) = data25; *(b + 25) = data26; *(b + 26) = data27; *(b + 27) = data28; *(b + 28) = data29; *(b + 29) = data30; *(b + 30) = data31; *(b + 31) = data32; } a1 += 4 * lda; a2 += 4 * lda; a3 += 4 * lda; a4 += 4 * lda; b += 32; i --; ii += 4; } if (m & 2) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); #ifndef UNIT data11 = *(a2 + 2); data12 = *(a2 + 3); #endif data13 = *(a2 + 4); data14 = *(a2 + 5); data15 = *(a2 + 6); data16 = *(a2 + 7); compinv(b + 0, data01, data02); *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; compinv(b + 10, data11, data12); *(b + 12) = data13; *(b + 13) = data14; *(b + 14) = data15; *(b + 15) = data16; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data13 = *(a2 + 4); data14 = *(a2 + 5); data15 = *(a2 + 6); data16 = *(a2 + 7); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; *(b + 8) = data09; *(b + 9) = data10; *(b + 10) = data11; *(b + 11) = data12; *(b + 12) = data13; *(b + 13) = data14; *(b + 14) = data15; *(b + 15) = data16; } a1 += 2 * lda; a2 += 2 * lda; b += 16; ii += 2; } if (m & 1) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); compinv(b + 0, data01, data02); *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; } a1 += lda; b += 8; ii += 1; } a += 8; jj += 4; j --; } if (n & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; ii = 0; i = (m >> 1); while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data03 = *(a1 + 2); data04 = *(a1 + 3); #ifndef UNIT data11 = *(a2 + 2); data12 = *(a2 + 3); #endif compinv(b + 0, data01, data02); *(b + 2) = data03; *(b + 3) = data04; compinv(b + 6, data11, data12); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data09; *(b + 5) = data10; *(b + 6) = data11; *(b + 7) = data12; } a1 += 2 * lda; a2 += 2 * lda; b += 8; i --; ii += 2; } if (m & 1) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data03 = *(a1 + 2); data04 = *(a1 + 3); compinv(b + 0, data01, data02); *(b + 2) = data03; *(b + 3) = data04; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 
1); data03 = *(a1 + 2); data04 = *(a1 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } a1 += lda; b += 4; ii += 1; } a += 4; jj += 2; } if (n & 1) { a1 = a + 0 * lda; ii = 0; i = m; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif compinv(b + 0, data01, data02); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); *(b + 0) = data01; *(b + 1) = data02; } a1 += lda; b += 2; i --; ii += 1; } a += 2; jj += 1; } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrsm_ltcopy_8.c000066400000000000000000000126101313527062700212040ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj, k; FLOAT *a1; FLOAT data1, data2; lda *= 2; jj = offset; j = (n >> 3); while (j > 0){ a1 = a; a += 16; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 8)) { data1 = *(a1 + (ii - jj) * 2 + 0); data2 = *(a1 + (ii - jj) * 2 + 1); compinv(b + (ii - jj) * 2, data1, data2); for (k = ii - jj + 1; k < 8; k ++) { *(b + k * 2 + 0) = *(a1 + k * 2 + 0); *(b + k * 2 + 1) = *(a1 + k * 2 + 1); } } if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); *(b + 2) = *(a1 + 2); *(b + 3) = *(a1 + 3); *(b + 4) = *(a1 + 4); *(b + 5) = *(a1 + 5); *(b + 6) = *(a1 + 6); *(b + 7) = *(a1 + 7); *(b + 8) = *(a1 + 8); *(b + 9) = *(a1 + 9); *(b + 10) = *(a1 + 10); *(b + 11) = *(a1 + 11); *(b + 12) = *(a1 + 12); *(b + 13) = *(a1 + 13); *(b + 14) = *(a1 + 14); *(b + 15) = *(a1 + 15); } b += 16; a1 += lda; ii ++; } jj += 8; j --; } j = (n & 4); if (j > 0) { a1 = a; a += 8; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 4)) { data1 = *(a1 + (ii - jj) * 2 + 0); data2 = *(a1 + (ii - jj) * 2 + 1); compinv(b + (ii - jj) * 2, data1, data2); for (k = ii - jj + 1; k < 4; k ++) { *(b + k * 2 + 0) = *(a1 + k * 2 + 0); *(b + k * 2 + 1) = *(a1 + k * 2 + 1); } } if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); *(b + 2) = *(a1 + 2); *(b + 3) = *(a1 + 3); *(b + 4) = *(a1 + 4); *(b + 5) = *(a1 + 5); *(b + 6) = *(a1 + 6); *(b + 7) = *(a1 + 7); } b += 8; a1 += lda; ii ++; } jj += 4; } j = (n & 2); if (j > 0) { a1 = a; a += 4; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 2)) { data1 = *(a1 + (ii - jj) * 2 + 0); data2 = *(a1 + (ii - jj) * 2 + 1); compinv(b + (ii - jj) * 2, data1, data2); for (k = ii - jj + 1; k < 2; k ++) { *(b + k * 2 + 0) = *(a1 + k * 2 + 0); *(b + k * 2 + 1) = *(a1 + k * 2 + 1); } } if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); *(b + 2) = *(a1 + 2); *(b + 3) = *(a1 + 3); } b += 4; a1 += lda; ii ++; } jj += 2; } j = (n & 1); if (j > 0) { a1 = a; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 1)) { data1 = *(a1 + (ii - jj) * 2 + 0); data2 = *(a1 + (ii - jj) * 2 + 1); compinv(b + (ii - jj) * 2, data1, data2); } if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); } b += 2; a1 += lda; ii ++; } } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrsm_uncopy_1.c000066400000000000000000000064111313527062700212020ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02; FLOAT *a1; lda *= 2; jj = offset; j = n; while (j > 0){ a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif compinv(b + 0, data01, data02); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); *(b + 0) = data01; *(b + 1) = data02; } a1 += 2; b += 2; i --; ii ++; } a += lda; jj ++; j --; } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrsm_uncopy_2.c000066400000000000000000000111711313527062700212020ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04; FLOAT data05, data06, data07, data08; FLOAT *a1, *a2; lda *= 2; jj = offset; j = (n >> 1); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; i = (m >> 1); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data05 = *(a2 + 0); data06 = *(a2 + 1); #ifndef UNIT data07 = *(a2 + 2); data08 = *(a2 + 3); #endif compinv(b + 0, data01, data02); *(b + 2) = data05; *(b + 3) = data06; compinv(b + 6, data07, data08); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a2 + 0); data06 = *(a2 + 1); data07 = *(a2 + 2); data08 = *(a2 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data05; *(b + 3) = data06; *(b + 4) = data03; *(b + 5) = data04; *(b + 6) = data07; *(b + 7) = data08; } a1 += 4; a2 += 4; b += 8; i --; ii += 2; } if (m & 1) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data05 = *(a2 + 0); data06 = *(a2 + 1); compinv(b + 0, data01, data02); *(b + 2) = data05; *(b + 3) = data06; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a2 + 0); data04 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } b += 4; } a += 2 * lda; jj += 2; j --; } if (n & 1) { a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif compinv(b + 0, data01, data02); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); *(b + 0) = data01; *(b + 1) = data02; } a1+= 2; b += 2; i --; ii += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrsm_uncopy_4.c000066400000000000000000000234371313527062700212140ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04; FLOAT data05, data06, data07, data08; FLOAT data09, data10, data11, data12; FLOAT data13, data14, data15, data16; FLOAT data17, data18, data19, data20; FLOAT data21, data22, data23, data24; FLOAT data25, data26, data27, data28; FLOAT data29, data30, data31, data32; FLOAT *a1, *a2, *a3, *a4; lda *= 2; jj = offset; j = (n >> 2); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; ii = 0; i = (m >> 2); while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data09 = *(a2 + 0); data10 = *(a2 + 1); #ifndef UNIT data11 = *(a2 + 2); data12 = *(a2 + 3); #endif data17 = *(a3 + 0); data18 = *(a3 + 1); data19 = *(a3 + 2); data20 = *(a3 + 3); #ifndef UNIT data21 = *(a3 + 4); data22 = *(a3 + 5); #endif data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); data28 = *(a4 + 3); data29 = *(a4 + 4); data30 = *(a4 + 5); #ifndef UNIT data31 = *(a4 + 6); data32 = *(a4 + 7); #endif compinv(b + 0, data01, data02); *(b + 2) = data09; *(b + 3) = data10; *(b + 4) = data17; *(b + 5) = data18; *(b + 6) = data25; *(b + 7) = data26; compinv(b + 10, data11, data12); *(b + 12) = data19; *(b + 13) = data20; *(b + 14) = data27; *(b + 15) = data28; compinv(b + 20, data21, data22); *(b + 22) = data29; *(b + 23) = data30; compinv(b + 30, data31, data32); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data13 = *(a2 + 4); data14 = *(a2 + 5); data15 = *(a2 + 6); data16 = *(a2 + 7); data17 = *(a3 + 0); data18 = *(a3 + 1); data19 = *(a3 + 2); data20 = *(a3 + 3); data21 = *(a3 + 4); data22 = *(a3 + 5); data23 = *(a3 + 6); data24 = *(a3 + 7); data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); data28 = *(a4 + 3); data29 = *(a4 + 4); data30 = *(a4 + 5); data31 = *(a4 + 6); data32 = *(a4 + 7); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data09; *(b + 3) = data10; *(b + 4) = data17; *(b + 5) = data18; *(b + 6) = data25; *(b + 7) = data26; *(b + 8) = data03; *(b + 9) = data04; *(b + 10) = data11; *(b + 11) = data12; *(b + 12) = data19; *(b + 13) = data20; *(b + 14) = data27; *(b + 15) = data28; *(b + 16) = data05; *(b + 17) = data06; *(b + 18) = data13; *(b + 19) = data14; *(b + 20) = data21; *(b + 21) = data22; *(b + 22) = data29; *(b + 23) = data30; *(b + 24) = data07; *(b + 25) = data08; *(b + 26) = data15; *(b + 27) = data16; *(b + 28) = data23; *(b + 29) = data24; *(b + 30) = data31; *(b + 31) = data32; } a1 += 8; a2 += 8; a3 += 8; a4 += 8; b += 32; i --; ii += 4; } if (m & 2) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data09 = *(a2 + 0); data10 = *(a2 + 1); #ifndef UNIT data11 = *(a2 + 2); data12 = *(a2 + 3); #endif data17 = *(a3 + 0); data18 = *(a3 + 1); data19 = *(a3 + 2); data20 = *(a3 + 3); data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); data28 = *(a4 + 3); compinv(b + 0, data01, data02); 
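/* Annotation (editor's note, not in the original source): compinv() packs the
   diagonal element in inverted form -- in non-UNIT builds it appears to store
   the complex reciprocal of (data01, data02), while in UNIT builds the loads
   above are compiled out and the diagonal is presumably treated as one -- so
   the solve kernel can multiply by the packed value instead of dividing. */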
*(b + 2) = data09; *(b + 3) = data10; *(b + 4) = data17; *(b + 5) = data18; *(b + 6) = data25; *(b + 7) = data26; compinv(b + 10, data11, data12); *(b + 12) = data19; *(b + 13) = data20; *(b + 14) = data27; *(b + 15) = data28; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data17 = *(a3 + 0); data18 = *(a3 + 1); data19 = *(a3 + 2); data20 = *(a3 + 3); data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); data28 = *(a4 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data09; *(b + 3) = data10; *(b + 4) = data17; *(b + 5) = data18; *(b + 6) = data25; *(b + 7) = data26; *(b + 8) = data03; *(b + 9) = data04; *(b + 10) = data11; *(b + 11) = data12; *(b + 12) = data19; *(b + 13) = data20; *(b + 14) = data27; *(b + 15) = data28; } a1 += 4; a2 += 4; a3 += 4; a4 += 4; b += 16; ii += 2; } if (m & 1) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data09 = *(a2 + 0); data10 = *(a2 + 1); data17 = *(a3 + 0); data18 = *(a3 + 1); data25 = *(a4 + 0); data26 = *(a4 + 1); compinv(b + 0, data01, data02); *(b + 2) = data09; *(b + 3) = data10; *(b + 4) = data17; *(b + 5) = data18; *(b + 6) = data25; *(b + 7) = data26; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data09 = *(a2 + 0); data10 = *(a2 + 1); data17 = *(a3 + 0); data18 = *(a3 + 1); data25 = *(a4 + 0); data26 = *(a4 + 1); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data09; *(b + 3) = data10; *(b + 4) = data17; *(b + 5) = data18; *(b + 6) = data25; *(b + 7) = data26; } a1 += 2; a2 += 2; a3 += 2; a4 += 2; b += 8; ii += 1; } a += 4 * lda; jj += 4; j --; } if (n & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; ii = 0; i = (m >> 1); while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data09 = *(a2 + 0); data10 = *(a2 + 1); #ifndef UNIT data11 = *(a2 + 2); data12 = *(a2 + 3); #endif compinv(b + 0, data01, data02); *(b + 2) = data09; *(b + 3) = data10; compinv(b + 6, data11, data12); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data09; *(b + 3) = data10; *(b + 4) = data03; *(b + 5) = data04; *(b + 6) = data11; *(b + 7) = data12; } a1 += 4; a2 += 4; b += 8; i --; ii += 2; } if (m & 1) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data09 = *(a2 + 0); data10 = *(a2 + 1); compinv(b + 0, data01, data02); *(b + 2) = data09; *(b + 3) = data10; } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data09 = *(a2 + 0); data10 = *(a2 + 1); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data09; *(b + 3) = data10; } a1 += 2; a2 += 2; b += 4; ii += 1; } a += 2 *lda; jj += 2; } if (n & 1) { a1 = a + 0 * lda; ii = 0; i = m; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif compinv(b + 0, data01, data02); } if (ii < jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); *(b + 0) = data01; *(b + 1) = data02; } a1 += 2; b += 2; i --; ii += 1; } a += lda; jj += 1; } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrsm_uncopy_8.c000066400000000000000000000140611313527062700212110ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj, k; FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; FLOAT data1, data2; lda *= 2; jj = offset; j = (n >> 3); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; a5 = a + 4 * lda; a6 = a + 5 * lda; a7 = a + 6 * lda; a8 = a + 7 * lda; a += 8 * lda; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 8)) { data1 = *(a1 + (ii - jj) * lda + 0); data2 = *(a1 + (ii - jj) * lda + 1); compinv(b + (ii - jj) * 2, data1, data2); for (k = ii - jj + 1; k < 8; k ++) { *(b + k * 2 + 0) = *(a1 + k * lda + 0); *(b + k * 2 + 1) = *(a1 + k * lda + 1); } } if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); *(b + 2) = *(a2 + 0); *(b + 3) = *(a2 + 1); *(b + 4) = *(a3 + 0); *(b + 5) = *(a3 + 1); *(b + 6) = *(a4 + 0); *(b + 7) = *(a4 + 1); *(b + 8) = *(a5 + 0); *(b + 9) = *(a5 + 1); *(b + 10) = *(a6 + 0); *(b + 11) = *(a6 + 1); *(b + 12) = *(a7 + 0); *(b + 13) = *(a7 + 1); *(b + 14) = *(a8 + 0); *(b + 15) = *(a8 + 1); } a1 += 2; a2 += 2; a3 += 2; a4 += 2; a5 += 2; a6 += 2; a7 += 2; a8 += 2; b += 16; ii ++; } jj += 8; j --; } if (n & 4) { a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; a += 4 * lda; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 4)) { data1 = *(a1 + (ii - jj) * lda + 0); data2 = *(a1 + (ii - jj) * lda + 1); compinv(b + (ii - jj) * 2, data1, data2); for (k = ii - jj + 1; k < 4; k ++) { *(b + k * 2 + 0) = *(a1 + k * lda + 0); *(b + k * 2 + 1) = *(a1 + k * lda + 1); } } if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); *(b + 2) = *(a2 + 0); *(b + 3) = *(a2 + 1); *(b + 4) = *(a3 + 0); *(b + 5) = *(a3 + 1); *(b + 6) = *(a4 + 0); *(b + 7) = *(a4 + 1); 
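/* Annotation (editor's note, not in the original source): when ii - jj < 0
   the current row lies entirely inside the stored triangle of this 4-column
   panel, so one complex element from each of a1..a4 is copied to b unchanged;
   rows that intersect the diagonal are handled by the compinv() branch above. */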
} a1 += 2; a2 += 2; a3 += 2; a4 += 2; b += 8; ii ++; } jj += 4; } if (n & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; a += 2 * lda; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 2)) { data1 = *(a1 + (ii - jj) * lda + 0); data2 = *(a1 + (ii - jj) * lda + 1); compinv(b + (ii - jj) * 2, data1, data2); for (k = ii - jj + 1; k < 2; k ++) { *(b + k * 2 + 0) = *(a1 + k * lda + 0); *(b + k * 2 + 1) = *(a1 + k * lda + 1); } } if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); *(b + 2) = *(a2 + 0); *(b + 3) = *(a2 + 1); } a1 += 2; a2 += 2; b += 4; ii ++; } jj += 2; } if (n & 1) { a1 = a + 0 * lda; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 1)) { data1 = *(a1 + (ii - jj) * lda + 0); data2 = *(a1 + (ii - jj) * lda + 1); compinv(b + (ii - jj) * 2, data1, data2); for (k = ii - jj + 1; k < 1; k ++) { *(b + k * 2 + 0) = *(a1 + k * lda + 0); *(b + k * 2 + 1) = *(a1 + k * lda + 1); } } if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); } a1 += 2; b += 2; ii ++; } } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrsm_utcopy_1.c000066400000000000000000000064131313527062700212120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02; FLOAT *a1; lda *= 2; jj = offset; j = (n); while (j > 0){ a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif compinv(b + 0, data01, data02); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); *(b + 0) = data01; *(b + 1) = data02; } a1 += lda; b += 2; i --; ii ++; } a += 2; jj ++; j --; } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrsm_utcopy_2.c000066400000000000000000000110601313527062700212050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04; FLOAT data05, data06, data07, data08; FLOAT *a1, *a2; lda *= 2; jj = offset; j = (n >> 1); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; i = (m >> 1); ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data05 = *(a2 + 0); data06 = *(a2 + 1); #ifndef UNIT data07 = *(a2 + 2); data08 = *(a2 + 3); #endif compinv(b + 0, data01, data02); *(b + 4) = data05; *(b + 5) = data06; compinv(b + 6, data07, data08); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a2 + 0); data06 = *(a2 + 1); data07 = *(a2 + 2); data08 = *(a2 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; } a1 += 2 * lda; a2 += 2 * lda; b += 8; i --; ii += 2; } if ((m & 1) != 0) { if (ii== jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif compinv(b, data01, data02); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } b += 4; } a += 4; jj += 2; j --; } if (n & 1) { a1 = a + 0 * lda; i = m; ii = 0; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif compinv(b, data01, data02); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); *(b + 0) = data01; *(b + 1) = data02; } a1 += 1 * lda; b += 2; i --; ii += 1; } } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrsm_utcopy_4.c000066400000000000000000000221141313527062700212110ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj; FLOAT data01, data02, data03, data04; FLOAT data05, data06, data07, data08; FLOAT data09, data10, data11, data12; FLOAT data13, data14, data15, data16; FLOAT data17, data18, data19, data20; FLOAT data21, data22, data23, data24; FLOAT data25, data26, data27, data28; FLOAT data29, data30, data31, data32; FLOAT *a1, *a2, *a3, *a4; lda *= 2; jj = offset; j = (n >> 2); while (j > 0){ a1 = a + 0 * lda; a2 = a + 1 * lda; a3 = a + 2 * lda; a4 = a + 3 * lda; ii = 0; i = (m >> 2); while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data09 = *(a2 + 0); data10 = *(a2 + 1); #ifndef UNIT data11 = *(a2 + 2); data12 = *(a2 + 3); #endif data17 = *(a3 + 0); data18 = *(a3 + 1); data19 = *(a3 + 2); data20 = *(a3 + 3); #ifndef UNIT data21 = *(a3 + 4); data22 = *(a3 + 5); #endif data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); data28 = *(a4 + 3); data29 = *(a4 + 4); data30 = *(a4 + 5); #ifndef UNIT data31 = *(a4 + 6); data32 = *(a4 + 7); #endif compinv(b + 0, data01, data02); *(b + 8) = data09; *(b + 9) = data10; compinv(b + 10, data11, data12); *(b + 16) = data17; *(b + 17) = data18; *(b + 18) = data19; *(b + 19) = data20; compinv(b + 20, data21, data22); *(b + 24) = data25; *(b + 25) = data26; *(b + 26) = data27; *(b + 27) = data28; *(b + 28) = data29; *(b + 29) = data30; compinv(b + 30, data31, data32); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data13 = *(a2 + 4); data14 = *(a2 + 5); data15 = *(a2 + 6); data16 = *(a2 + 7); data17 = *(a3 + 0); data18 = *(a3 + 1); data19 = *(a3 + 2); data20 = *(a3 + 3); data21 = *(a3 + 4); data22 = *(a3 + 5); data23 = *(a3 + 6); data24 = *(a3 + 7); data25 = *(a4 + 0); data26 = *(a4 + 1); data27 = *(a4 + 2); data28 = *(a4 + 3); data29 = *(a4 + 4); data30 = *(a4 + 5); data31 = *(a4 + 6); data32 = *(a4 + 7); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; *(b + 8) = data09; *(b + 9) = data10; *(b + 10) = data11; *(b + 11) = data12; *(b + 12) = data13; *(b + 13) = data14; *(b + 14) = data15; *(b + 15) = data16; *(b + 16) = data17; *(b + 17) = data18; *(b + 18) = data19; *(b + 19) = data20; *(b + 20) = data21; *(b + 21) = data22; *(b + 22) = data23; *(b + 23) = data24; *(b + 24) = data25; *(b + 25) = data26; *(b + 26) = data27; *(b + 27) = data28; *(b + 28) = data29; *(b + 29) = data30; *(b + 30) = data31; *(b + 31) = data32; } a1 += 4 * lda; a2 += 4 * lda; a3 += 4 * lda; a4 += 4 * lda; b += 32; i --; ii += 4; } if (m & 2) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data09 = *(a2 + 0); data10 = *(a2 + 1); #ifndef UNIT data11 = *(a2 + 2); data12 = *(a2 + 3); #endif compinv(b + 0, data01, data02); *(b + 8) = data09; *(b + 9) = data10; compinv(b + 10, data11, data12); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); 
data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); data13 = *(a2 + 4); data14 = *(a2 + 5); data15 = *(a2 + 6); data16 = *(a2 + 7); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; *(b + 8) = data09; *(b + 9) = data10; *(b + 10) = data11; *(b + 11) = data12; *(b + 12) = data13; *(b + 13) = data14; *(b + 14) = data15; *(b + 15) = data16; } a1 += 2 * lda; a2 += 2 * lda; b += 16; ii += 2; } if (m & 1) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif compinv(b + 0, data01, data02); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data05 = *(a1 + 4); data06 = *(a1 + 5); data07 = *(a1 + 6); data08 = *(a1 + 7); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data05; *(b + 5) = data06; *(b + 6) = data07; *(b + 7) = data08; } a1 += lda; b += 8; ii += 1; } a += 8; jj += 4; j --; } if (n & 2) { a1 = a + 0 * lda; a2 = a + 1 * lda; ii = 0; i = (m >> 1); while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif data09 = *(a2 + 0); data10 = *(a2 + 1); #ifndef UNIT data11 = *(a2 + 2); data12 = *(a2 + 3); #endif compinv(b + 0, data01, data02); *(b + 4) = data09; *(b + 5) = data10; compinv(b + 6, data11, data12); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); data09 = *(a2 + 0); data10 = *(a2 + 1); data11 = *(a2 + 2); data12 = *(a2 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; *(b + 4) = data09; *(b + 5) = data10; *(b + 6) = data11; *(b + 7) = data12; } a1 += 2 * lda; a2 += 2 * lda; b += 8; i --; ii += 2; } if (m & 1) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif compinv(b + 0, data01, data02); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); *(b + 0) = data01; *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; } a1 += lda; b += 4; ii += 1; } a += 4; jj += 2; j --; } if (n & 1) { a1 = a + 0 * lda; ii = 0; i = m; while (i > 0) { if (ii == jj) { #ifndef UNIT data01 = *(a1 + 0); data02 = *(a1 + 1); #endif compinv(b + 0, data01, data02); } if (ii > jj) { data01 = *(a1 + 0); data02 = *(a1 + 1); *(b + 0) = data01; *(b + 1) = data02; } a1 += lda; b += 2; i --; ii += 1; } a += 2; jj += 1; } return 0; } OpenBLAS-0.2.20/kernel/generic/ztrsm_utcopy_8.c000066400000000000000000000127571313527062700212310ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ BLASLONG i, ii, j, jj, k; FLOAT *a1, data1, data2; lda *= 2; jj = offset; j = (n >> 3); while (j > 0){ a1 = a; a += 16; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 8)) { for (k = 0; k < ii - jj; k ++) { *(b + k * 2 + 0) = *(a1 + k * 2 + 0); *(b + k * 2 + 1) = *(a1 + k * 2 + 1); } data1 = *(a1 + (ii - jj) * 2 + 0); data2 = *(a1 + (ii - jj) * 2 + 1); compinv(b + (ii - jj) * 2, data1, data2); } if (ii - jj >= 8) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); *(b + 2) = *(a1 + 2); *(b + 3) = *(a1 + 3); *(b + 4) = *(a1 + 4); *(b + 5) = *(a1 + 5); *(b + 6) = *(a1 + 6); *(b + 7) = *(a1 + 7); *(b + 8) = *(a1 + 8); *(b + 9) = *(a1 + 9); *(b + 10) = *(a1 + 10); *(b + 11) = *(a1 + 11); *(b + 12) = *(a1 + 12); *(b + 13) = *(a1 + 13); *(b + 14) = *(a1 + 14); *(b + 15) = *(a1 + 15); } b += 16; a1 += lda; ii ++; } jj += 8; j --; } j = (n & 4); if (j > 0) { a1 = a; a += 8; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 4)) { for (k = 0; k < ii - jj; k ++) { *(b + k * 2 + 0) = *(a1 + k * 2 + 0); *(b + k * 2 + 1) = *(a1 + k * 2 + 1); } data1 = *(a1 + (ii - jj) * 2 + 0); data2 = *(a1 + (ii - jj) * 2 + 1); compinv(b + (ii - jj) * 2, data1, data2); } if (ii - jj >= 4) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); *(b + 2) = *(a1 + 2); *(b + 3) = *(a1 + 3); *(b + 4) = *(a1 + 4); *(b + 5) = *(a1 + 5); *(b + 6) = *(a1 + 6); *(b + 7) = *(a1 + 7); } b += 8; a1 += lda; ii ++; } jj += 4; } j = (n & 2); if (j > 0) { a1 = a; a += 4; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 2)) { for (k = 0; k < ii - jj; k ++) { *(b + k * 2 + 0) = *(a1 + k * 2 + 0); *(b + k * 2 + 1) = *(a1 + k * 2 + 1); } data1 = *(a1 + (ii - jj) * 2 + 0); data2 = *(a1 + (ii - jj) * 2 + 1); compinv(b + (ii - jj) * 2, data1, data2); } if (ii - jj >= 2) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); *(b + 2) = *(a1 + 2); *(b + 3) = *(a1 + 3); } b += 4; a1 += lda; ii ++; } jj += 2; } j = (n & 1); if (j > 0) { a1 = a; ii = 0; for (i = 0; i < m; i++) { if ((ii >= jj ) && (ii - jj < 1)) { for (k = 0; k < ii - jj; k ++) { *(b + k * 2 + 0) = *(a1 + k * 2 + 0); *(b + k * 2 + 1) = *(a1 + k * 2 + 1); } data1 = *(a1 + (ii - jj) * 2 + 0); data2 = *(a1 + (ii - jj) * 2 + 1); compinv(b + (ii - jj) * 2, data1, data2); } if (ii - jj >= 
1) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); } b += 2; a1 += lda; ii ++; } } return 0; } OpenBLAS-0.2.20/kernel/ia64/000077500000000000000000000000001313527062700151675ustar00rootroot00000000000000OpenBLAS-0.2.20/kernel/ia64/KERNEL000066400000000000000000000064071313527062700161010ustar00rootroot00000000000000SAXPYKERNEL = saxpy.S DAXPYKERNEL = daxpy.S QAXPYKERNEL = qaxpy.S CAXPYKERNEL = caxpy.S ZAXPYKERNEL = zaxpy.S XAXPYKERNEL = zaxpy.S SDOTKERNEL = sdot.S DDOTKERNEL = ddot.S QDOTKERNEL = qdot.S CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S XDOTKERNEL = xdot.S SAMAXKERNEL = amax.S DAMAXKERNEL = amax.S QAMAXKERNEL = amax.S CAMAXKERNEL = izamax.S ZAMAXKERNEL = izamax.S XAMAXKERNEL = izamax.S SAMINKERNEL = amax.S DAMINKERNEL = amax.S QAMINKERNEL = amax.S CAMINKERNEL = izamax.S ZAMINKERNEL = izamax.S XAMINKERNEL = izamax.S SMAXKERNEL = amax.S DMAXKERNEL = amax.S QMAXKERNEL = amax.S SMINKERNEL = amax.S DMINKERNEL = amax.S QMINKERNEL = amax.S ISAMAXKERNEL = iamax.S IDAMAXKERNEL = iamax.S IQAMAXKERNEL = iamax.S ICAMAXKERNEL = izamax.S IZAMAXKERNEL = izamax.S IXAMAXKERNEL = izamax.S ISAMINKERNEL = iamax.S IDAMINKERNEL = iamax.S IQAMINKERNEL = iamax.S ICAMINKERNEL = izamax.S IZAMINKERNEL = izamax.S IXAMINKERNEL = izamax.S ISMAXKERNEL = iamax.S IDMAXKERNEL = iamax.S IQMAXKERNEL = iamax.S ISMINKERNEL = iamax.S IDMINKERNEL = iamax.S IQMINKERNEL = iamax.S CASUMKERNEL = asum.S ZASUMKERNEL = asum.S XASUMKERNEL = asum.S CNRM2KERNEL = nrm2.S ZNRM2KERNEL = nrm2.S XNRM2KERNEL = nrm2.S QCOPYKERNEL = qcopy.S XCOPYKERNEL = xcopy.S QSCALKERNEL = qscal.S QGEMVNKERNEL = qgemv_n.S QGEMVTKERNEL = qgemv_t.S XGEMVNKERNEL = xgemv_n.S XGEMVTKERNEL = xgemv_t.S SGEMMKERNEL = gemm_kernel.S SGEMM_BETA = gemm_beta.S SGEMMONCOPY = gemm_ncopy.S SGEMMOTCOPY = gemm_tcopy.S SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX) DGEMMKERNEL = gemm_kernel.S DGEMM_BETA = gemm_beta.S DGEMMONCOPY = gemm_ncopy.S DGEMMOTCOPY = gemm_tcopy.S DGEMMONCOPYOBJ = dgemm_oncopy.$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy.$(SUFFIX) QGEMMKERNEL = qgemm_kernel.S QGEMM_BETA = ../generic/gemm_beta.c QGEMMONCOPY = ../generic/gemm_ncopy_8.c QGEMMOTCOPY = ../generic/gemm_tcopy_8.c QGEMMONCOPYOBJ = qgemm_oncopy.$(SUFFIX) QGEMMOTCOPYOBJ = qgemm_otcopy.$(SUFFIX) CGEMMKERNEL = zgemm_kernel.S CGEMM_BETA = zgemm_beta.S CGEMMONCOPY = zgemm_ncopy.S CGEMMOTCOPY = zgemm_tcopy.S CGEMMONCOPYOBJ = cgemm_oncopy.$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy.$(SUFFIX) ZGEMMKERNEL = zgemm_kernel.S ZGEMM_BETA = zgemm_beta.S ZGEMMONCOPY = zgemm_ncopy.S ZGEMMOTCOPY = zgemm_tcopy.S ZGEMMONCOPYOBJ = zgemm_oncopy.$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy.$(SUFFIX) XGEMMKERNEL = zgemm_kernel.S XGEMM_BETA = ../generic/zgemm_beta.c XGEMMONCOPY = ../generic/zgemm_ncopy_4.c XGEMMOTCOPY = ../generic/zgemm_tcopy_4.c XGEMMONCOPYOBJ = xgemm_oncopy.$(SUFFIX) XGEMMOTCOPYOBJ = xgemm_otcopy.$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN.S STRSMKERNEL_LT = trsm_kernel_LT.S STRSMKERNEL_RN = trsm_kernel_LT.S STRSMKERNEL_RT = trsm_kernel_RT.S DTRSMKERNEL_LN = trsm_kernel_LN.S DTRSMKERNEL_LT = trsm_kernel_LT.S DTRSMKERNEL_RN = trsm_kernel_LT.S DTRSMKERNEL_RT = trsm_kernel_RT.S CTRSMKERNEL_LN = ztrsm_kernel_LN.S CTRSMKERNEL_LT = ztrsm_kernel_LT.S CTRSMKERNEL_RN = ztrsm_kernel_LT.S CTRSMKERNEL_RT = ztrsm_kernel_RT.S ZTRSMKERNEL_LN = ztrsm_kernel_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_RT.S CGEMM3MKERNEL = zgemm3m_kernel.S ZGEMM3MKERNEL = zgemm3m_kernel.S 
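The generic ztrsm_utcopy_*.c kernels above all follow one packing pattern: walk an m x n panel of a column-major complex matrix, copy the ii > jj elements into a contiguous buffer, and run each diagonal element (ii == jj) through the compinv() macro pulled in via common.h. The sketch below restates the unblocked variant (ztrsm_utcopy_1.c) in plain C as a minimal illustration, not as OpenBLAS code: compinv_sketch() and ztrsm_utcopy_sketch() are hypothetical names, compinv_sketch() is only an assumption of what compinv() computes (a complex reciprocal, ignoring the UNIT case), and FLOAT/BLASLONG are simplified to double/long.

/* Assumed behaviour of compinv(): store 1/(re + i*im) as an interleaved
   real/imaginary pair.  The real macro lives in common.h and may differ. */
static void compinv_sketch(double *b, double re, double im) {
    double d = re * re + im * im;
    b[0] =  re / d;
    b[1] = -im / d;
}

/* Unblocked packing loop mirroring ztrsm_utcopy_1.c: elements with ii > jj
   are copied, the diagonal is replaced by its reciprocal, and slots with
   ii < jj are skipped but still advance b by one complex element. */
static int ztrsm_utcopy_sketch(long m, long n, const double *a, long lda,
                               long offset, double *b) {
    long jj = offset;
    lda *= 2;                               /* two doubles per complex element  */
    for (long j = 0; j < n; j++, a += 2, jj++) {
        const double *a1 = a;               /* a1 steps one column per inner pass */
        long ii = 0;
        for (long i = 0; i < m; i++, ii++, a1 += lda, b += 2) {
            if (ii == jj)
                compinv_sketch(b, a1[0], a1[1]);
            else if (ii > jj) {
                b[0] = a1[0];
                b[1] = a1[1];
            }
        }
    }
    return 0;
}

The _2, _4 and _8 variants above unroll this same walk over 2-, 4- and 8-column strips, which is why they repeat the ii/jj bookkeeping with wider blocks of data01..data32 loads and stores.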
OpenBLAS-0.2.20/kernel/ia64/Makefile000066400000000000000000000000111313527062700166170ustar00rootroot00000000000000clean :: OpenBLAS-0.2.20/kernel/ia64/amax.S000066400000000000000000000173211313527062700162450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef XDOUBLE #define PREFETCH_SIZE ( 8 * 16 + 4) #elif defined(DOUBLE) #define PREFETCH_SIZE (16 * 16 + 8) #else #define PREFETCH_SIZE (32 * 16 + 16) #endif #if !defined(USE_MIN) && defined(USE_ABS) #define FMAX famax #elif !defined(USE_MIN) && !defined(USE_ABS) #define FMAX fmax #elif defined(USE_MIN) && defined(USE_ABS) #define FMAX famin #else #define FMAX fmin #endif #define RET r8 #define N r32 #define DX r33 #define INCX r34 #define PRE1 r2 #define J r14 #define K r15 #define X2 r16 #define X3 r17 #define INCX5 r18 #define INCX16 r19 #define DMAX1 f8 #define DMAX2 f9 #define DMAX3 f10 #define DMAX4 f11 #define DMAX5 f12 #define DMAX6 f13 #define DMAX7 f14 #define DMAX8 f15 #define PR r30 #define ARLC r31 PROLOGUE .prologue PROFCODE { .mfi mov RET = 0 mov DMAX1 = f0 .save ar.lc, ARLC mov ARLC = ar.lc } ;; .body #ifdef F_INTERFACE { .mmi LDINT N = [N] LDINT INCX = [INCX] nop.i 0 } ;; #ifndef USE64BITINT { .mii nop.m 0 sxt4 N = N sxt4 INCX = INCX } ;; #endif #endif { .mii mov PR = pr cmp.ge p6, p0 = 0, INCX } { .mbb cmp.ge p8, p0 = 0, N (p8) br.ret.sptk.many b0 (p6) br.ret.sptk.many b0 } ;; { .mmi LDFD DMAX1 = [DX] shladd INCX = INCX, BASE_SHIFT, r0 mov pr.rot= 0 } ;; { .mmf add DX = DX, INCX adds K = -1, N mov DMAX2 = DMAX1 } ;; { .mfi shladd X2 = INCX, 2, DX mov DMAX5 = DMAX1 shr J = K, 4 } { .mmf cmp.eq p16, p0 = r0, r0 nop.m 0 mov DMAX6 = DMAX1 } ;; { .mfi shladd INCX5 = INCX, 2, INCX mov DMAX3 = DMAX1 mov ar.ec= 4 } { .mmf #ifdef XDOUBLE shladd INCX16= INCX, 3, r0 #else shladd INCX16= INCX, 4, r0 #endif adds J = -1, J mov DMAX7 = DMAX1 } ;; { .mfi adds PRE1 = PREFETCH_SIZE * SIZE, DX mov DMAX4 = DMAX1 mov ar.lc = J } { .mfb cmp.eq p7 ,p0 = -1, J mov DMAX8 = DMAX1 (p7) br.cond.dpnt .L15 } .align 32 ;; .L10: { .mmf (p16) lfetch.nt1 [PRE1], INCX16 (p16) LDFD f32 = [DX], INCX (p19) FMAX DMAX1 = f35, DMAX1 } { .mmf (p16) LDFD f48 = [X2], INCX nop.m 0 (p19) FMAX DMAX5 = f51, DMAX5 } ;; { .mmf (p16) LDFD f36 = [DX], INCX nop.m 0 (p19) FMAX DMAX2 = f39, DMAX2 } { .mmf (p16) LDFD f52 = [X2], INCX nop.m 0 (p19) FMAX DMAX6 = f55, DMAX6 } ;; { .mmf (p16) LDFD f40 = [DX], INCX nop.m 0 (p19) FMAX DMAX3 = f43, DMAX3 } { .mmf (p16) LDFD f56 = [X2], INCX nop.m 0 (p19) FMAX DMAX7 = f59, DMAX7 } ;; { .mmf (p16) LDFD f44 = [DX], INCX5 nop.m 0 (p19) FMAX DMAX4 = f47, DMAX4 } { .mmf (p16) LDFD f60 = [X2], INCX5 nop.m 0 (p19) FMAX DMAX8 = f63, DMAX8 } ;; { .mmf #ifdef XDOUBLE (p16) lfetch.nt1 [PRE1], INCX16 #endif (p16) LDFD f64 = [DX], INCX #ifndef XDOUBLE nop.m 0 #endif (p19) FMAX DMAX1 = f67, DMAX1 } { .mmf (p16) LDFD f80 = [X2], INCX nop.m 0 (p19) FMAX DMAX5 = f83, DMAX5 } ;; { .mmf (p16) LDFD f68 = [DX], INCX nop.m 0 (p19) FMAX DMAX2 = f71, DMAX2 } { .mmf (p16) LDFD f84 = [X2], INCX nop.m 0 (p19) FMAX DMAX6 = f87, DMAX6 } ;; { .mmf (p16) LDFD f72 = [DX], INCX nop.m 0 (p19) FMAX DMAX3 = f75, DMAX3 } { .mmf (p16) LDFD f88 = [X2], INCX nop.m 0 (p19) FMAX DMAX7 = f91, DMAX7 } ;; { .mmf (p16) LDFD f76 = [DX], INCX5 nop.m 0 (p19) FMAX DMAX4 = f79, DMAX4 } { .mfb (p16) LDFD f92 = [X2], INCX5 (p19) FMAX DMAX8 = f95, DMAX8 br.ctop.sptk.few .L10 } .align 32 ;; .L15: and J = 15, K tbit.z p0, p12 = K, 3 mov X3 = DX ;; { .mmi (p12) LDFD f32 = [DX], INCX (p12) LDFD f36 = [X2], INCX tbit.z p0, p13 = K, 2 } { .mib cmp.eq p8 ,p0 = r0, J tbit.z p0, p14 = K, 1 (p8) br.cond.dpnt .L99 } ;; { .mmi (p12) LDFD f33 = [DX], INCX (p12) LDFD f37 = [X2], INCX tbit.z p0, p15 = K, 0 } ;; { .mmi (p12) LDFD f34 = [DX], 
INCX (p12) LDFD f38 = [X2], INCX (p12) shladd X3 = INCX, 3, X3 } ;; { .mmi (p12) LDFD f35 = [DX], INCX5 (p12) LDFD f39 = [X2], INCX5 (p13) shladd X3 = INCX, 2, X3 } ;; { .mmi (p13) LDFD f40 = [DX], INCX (p14) LDFD f44 = [X3], INCX nop.i 0 } ;; { .mmi (p13) LDFD f41 = [DX], INCX (p14) LDFD f45 = [X3], INCX nop.i 0 } ;; { .mmf (p13) LDFD f42 = [DX], INCX nop.m 0 (p12) FMAX DMAX1 = f32, DMAX1 } { .mmf (p15) LDFD f46 = [X3], INCX nop.m 0 (p12) FMAX DMAX5 = f36, DMAX5 } ;; { .mmf (p13) LDFD f43 = [DX], INCX nop.m 0 (p12) FMAX DMAX2 = f33, DMAX2 } (p12) FMAX DMAX6 = f37, DMAX6 (p12) FMAX DMAX3 = f34, DMAX3 (p12) FMAX DMAX7 = f38, DMAX7 (p12) FMAX DMAX4 = f35, DMAX4 (p12) FMAX DMAX8 = f39, DMAX8 ;; (p13) FMAX DMAX1 = f40, DMAX1 (p14) FMAX DMAX5 = f44, DMAX5 (p13) FMAX DMAX2 = f41, DMAX2 (p14) FMAX DMAX6 = f45, DMAX6 (p13) FMAX DMAX3 = f42, DMAX3 (p15) FMAX DMAX7 = f46, DMAX7 (p13) FMAX DMAX4 = f43, DMAX4 ;; .align 32 .L99: { .mfi nop.m 0 FMAX DMAX1 = DMAX5, DMAX1 mov ar.lc = ARLC } { .mmf nop.m 0 nop.m 0 FMAX DMAX2 = DMAX6, DMAX2 } ;; { .mfi nop.m 0 FMAX DMAX3 = DMAX7, DMAX3 mov pr = PR, -65474 } { .mmf nop.m 0 nop.m 0 FMAX DMAX4 = DMAX8, DMAX4 } ;; { .mmf FMAX DMAX1 = DMAX2, DMAX1 } { .mmf FMAX DMAX3 = DMAX4, DMAX3 } ;; #ifndef USE_ABS { .mfb FMAX DMAX1 = DMAX3, DMAX1 br.ret.sptk.many b0 } #else { .mmf FMAX DMAX1 = DMAX3, DMAX1 } ;; { .mfb fabs DMAX1 = DMAX1 br.ret.sptk.many b0 } #endif ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/asum.S000066400000000000000000000172321313527062700162650ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef XDOUBLE #define PREFETCH_SIZE ( 8 * 16 + 4) #elif defined(DOUBLE) #define PREFETCH_SIZE (16 * 16 + 8) #else #define PREFETCH_SIZE (32 * 16 + 16) #endif #ifndef COMPLEX #define COMPADD 0 #define STRIDE INCX #else #define COMPADD 1 #define STRIDE SIZE #endif #define PRE1 r2 #define I r17 #define J r18 #define INCX16 r21 #define PR r30 #define ARLC r31 #define N r32 #define X r33 #define INCX r34 PROLOGUE .prologue PROFCODE { .mfi adds PRE1 = PREFETCH_SIZE * SIZE, X mov f8 = f0 .save ar.lc, ARLC mov ARLC = ar.lc } ;; .body #ifdef F_INTERFACE { .mmi LDINT N = [N] LDINT INCX = [INCX] nop.i 0 } ;; #ifndef USE64BITINT { .mii nop.m 0 sxt4 N = N sxt4 INCX = INCX } ;; #endif #endif { .mmi cmp.lt p0, p6 = r0, INCX cmp.lt p0, p7 = r0, N shr I = N, (4 - COMPADD) } { .mbb and J = ((1 << (4 - COMPADD)) - 1), N (p6) br.ret.sptk.many b0 (p7) br.ret.sptk.many b0 } ;; { .mfi adds I = -1, I mov f10 = f0 mov PR = pr } { .mfi cmp.eq p9, p0 = r0, J mov f9 = f0 tbit.z p0, p12 = N, 3 - COMPADD } ;; { .mmi cmp.eq p16, p0 = r0, r0 cmp.ne p17, p0 = r0, r0 mov ar.ec= 3 } { .mfi cmp.ne p18, p0 = r0, r0 mov f11 = f0 shl INCX = INCX, BASE_SHIFT + COMPADD } ;; { .mmi #ifdef XDOUBLE shladd INCX16 = INCX, (3 - COMPADD), r0 #else shladd INCX16 = INCX, (4 - COMPADD), r0 #endif cmp.ne p19, p0 = r0, r0 mov ar.lc = I } { .mmb cmp.gt p8 ,p0 = r0, I #ifdef COMPLEX adds INCX = - SIZE, INCX #else nop.m 0 #endif (p8) br.cond.dpnt .L55 } ;; .align 32 .L52: { .mmf (p16) lfetch.nt1 [PRE1], INCX16 (p16) LDFD f32 = [X], STRIDE (p18) fabs f34 = f34 } { .mfb (p19) FADD f8 = f8, f71 } ;; { .mmf (p16) LDFD f35 = [X], INCX (p18) fabs f37 = f37 } { .mfb (p19) FADD f9 = f9, f74 } ;; { .mmf (p16) LDFD f38 = [X], STRIDE (p18) fabs f40 = f40 } { .mfb (p19) FADD f10 = f10, f77 } ;; { .mmf (p16) LDFD f41 = [X], INCX (p18) fabs f43 = f43 } { .mfb (p19) FADD f11 = f11, f80 } ;; { .mmf (p16) LDFD f44 = [X], STRIDE (p18) fabs f46 = f46 } { .mfb (p18) FADD f8 = f8, f34 } ;; { .mmf (p16) LDFD f47 = [X], INCX (p18) fabs f49 = f49 } { .mfb (p18) FADD f9 = f9, f37 } ;; { .mmf (p16) LDFD f50 = [X], STRIDE (p18) fabs f52 = f52 } { .mfb (p18) FADD f10 = f10, f40 } ;; { .mmf (p16) LDFD f53 = [X], INCX (p18) fabs f55 = f55 } { .mfb (p18) FADD f11 = f11, f43 } ;; { .mmf #ifdef XDOUBLE (p16) lfetch.nt1 [PRE1], INCX16 #endif (p16) LDFD f56 = [X], STRIDE (p18) fabs f58 = f58 } { .mfb (p18) FADD f8 = f8, f46 } ;; { .mmf (p16) LDFD f59 = [X], INCX (p18) fabs f61 = f61 } { .mfb (p18) FADD f9 = f9, f49 } ;; { .mmf (p16) LDFD f62 = [X], STRIDE (p18) fabs f64 = f64 } { .mfb (p18) FADD f10 = f10, f52 } ;; { .mmf (p16) LDFD f65 = [X], INCX (p18) fabs f67 = f67 } { .mfb (p18) FADD f11 = f11, f55 } ;; { .mmf (p16) LDFD f68 = [X], STRIDE (p18) fabs f70 = f70 } { .mfb (p18) FADD f8 = f8, f58 } ;; { .mmf (p16) LDFD f71 = [X], INCX (p18) fabs f73 = f73 } { .mfb (p18) FADD f9 = f9, f61 } ;; { .mmf (p16) LDFD f74 = [X], STRIDE (p18) fabs f76 = f76 } { .mfb (p18) FADD f10 = f10, f64 } ;; { .mmf (p16) LDFD f77 = [X], INCX (p18) fabs f79 = f79 } { .mfb (p18) FADD f11 = f11, f67 br.ctop.sptk.few .L52 } ;; FADD f8 = f8, f71 FADD f9 = f9, f74 FADD f10 = f10, f77 FADD f11 = f11, f80 .align 32 ;; .L55: (p12) LDFD f32 = [X], STRIDE (p9) br.cond.dptk .L998 ;; (p12) LDFD f33 = [X], INCX ;; (p12) LDFD f34 = [X], STRIDE ;; (p12) LDFD f35 = [X], INCX tbit.z p0, p13 = N, (2 - COMPADD) ;; (p12) LDFD f36 = [X], STRIDE tbit.z p0, p14 = N, (1 - COMPADD) ;; (p12) LDFD f37 = [X], INCX 
#ifndef COMPLEX tbit.z p0, p15 = N, 0 #endif ;; (p12) LDFD f38 = [X], STRIDE (p12) fabs f32 = f32 ;; (p12) LDFD f39 = [X], INCX (p12) fabs f33 = f33 ;; (p13) LDFD f40 = [X], STRIDE (p12) fabs f34 = f34 ;; (p13) LDFD f41 = [X], INCX (p12) fabs f35 = f35 ;; (p13) LDFD f42 = [X], STRIDE (p12) fabs f36 = f36 (p12) FADD f8 = f8, f32 ;; (p13) LDFD f43 = [X], INCX (p12) fabs f37 = f37 (p12) FADD f9 = f9, f33 ;; (p14) LDFD f44 = [X], STRIDE (p12) fabs f38 = f38 (p12) FADD f10 = f10, f34 ;; (p14) LDFD f45 = [X], INCX (p12) fabs f39 = f39 (p12) FADD f11 = f11, f35 ;; #ifndef COMPLEX (p15) LDFD f46 = [X] #endif (p13) fabs f40 = f40 (p12) FADD f8 = f8, f36 ;; (p13) fabs f41 = f41 (p12) FADD f9 = f9, f37 (p13) fabs f42 = f42 (p12) FADD f10 = f10, f38 (p13) fabs f43 = f43 (p12) FADD f11 = f11, f39 ;; (p14) fabs f44 = f44 (p13) FADD f8 = f8, f40 (p14) fabs f45 = f45 (p13) FADD f9 = f9, f41 #ifndef COMPLEX (p15) fabs f46 = f46 #endif (p13) FADD f10 = f10, f42 ;; (p13) FADD f11 = f11, f43 (p14) FADD f8 = f8, f44 (p14) FADD f9 = f9, f45 #ifndef COMPLEX (p15) FADD f10 = f10, f46 #endif ;; .align 32 .L998: { .mfi FADD f8 = f8, f9 mov ar.lc = ARLC } { .mmf FADD f10 = f10, f11 } ;; { .mii mov pr = PR, -65474 } ;; { .mfb FADD f8 = f8, f10 br.ret.sptk.many b0 } EPILOGUE OpenBLAS-0.2.20/kernel/ia64/cabs.S000066400000000000000000000054741313527062700162350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" PROLOGUE PROFCODE .prologue .body LDFD f8 = [r32], SIZE ;; LDFD f6 = [r32] ;; fabs f8 = f8 fabs f6 = f6 ;; FADD f8 = f6, f8 br.ret.sptk.many b0 EPILOGUE OpenBLAS-0.2.20/kernel/ia64/caxpy.S000066400000000000000000000260261313527062700164450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCH_SIZE (32 * 16) #ifndef CONJ #define FMA1 FNMA #define FMA2 FMA #else #define FMA1 FMA #define FMA2 FNMA #endif #define SP r12 #define N r32 #define X1 r37 #define INCX r38 #define Y1 r39 #define INCY r36 #define PREX1 r2 #define PREY1 r3 #define I r33 #define J r34 #define Y2 r35 #define X2 r14 #define YY1 r15 #define YY2 r16 #define YY3 r17 #define YY4 r18 #define INCXM1 r19 #define INCYM1 r20 #define INCX3M1 r21 #define INCY3M1 r22 #define INCX7M1 r23 #define INCY7M1 r24 #define X3 r8 #define Y3 r9 #define X4 r10 #define Y4 r11 #define INCX8 r25 #define INCY8 r26 #define ARLC r29 #define PR r30 #define ALPHA_R f8 #define ALPHA_I f9 PROLOGUE .prologue PROFCODE { .mmi adds r14 = 16, SP and J = 7, N .save ar.lc, ARLC mov ARLC = ar.lc } { .mib cmp.gt p15, p0 = r0, N shr I = N, 3 (p15) br.ret.sptk.many b0 } ;; { .mmi ld8 INCY = [r14] nop __LINE__ mov PR = pr } { .mmi adds PREX1 = (PREFETCH_SIZE + 0) * SIZE, X1 adds PREY1 = (PREFETCH_SIZE + 0) * SIZE, Y1 shl INCX = INCX, ZBASE_SHIFT } ;; { .mii adds I = -1, I mov pr.rot= 0 shl INCY = INCY, ZBASE_SHIFT } ;; { .mmi adds INCXM1 = -SIZE, INCX adds INCYM1 = -SIZE, INCY mov ar.ec = 3 } { .mmi shladd X2 = INCX, 1, X1 shladd Y2 = INCY, 1, Y1 cmp.eq p16, p0 = r0, r0 } ;; { .mmi shladd INCX3M1 = INCX, 1, INCXM1 shladd INCY3M1 = INCY, 1, INCYM1 shladd INCX8 = INCX, 3, r0 } { .mmi shladd X3 = INCX, 1, X2 shladd Y3 = INCY, 1, Y2 shladd INCY8 = INCY, 3, r0 } ;; { .mmi shladd X4 = INCX, 1, X3 shladd Y4 = INCY, 1, Y3 shladd INCX7M1 = INCX, 2, INCX3M1 } { .mmi mov YY1 = Y1 mov YY2 = Y2 shladd INCY7M1 = INCY, 2, INCY3M1 } ;; { .mmi mov YY3 = Y3 mov YY4 = Y4 mov ar.lc = I } { .mib cmp.eq p11 ,p0 = -1, I tbit.z p0, p13 = N, 2 (p11) br.cond.dpnt .L25 } ;; .align 32 .L22: { .mmf (p19) STFD [YY3] = f14 (p19) STFD [YY4] = f15 (p18) FMA2 f14 = ALPHA_R, f64, f112 } { .mmf (p16) LDFD f80 = [Y1], 1 * SIZE (p16) LDFD f92 = [Y2], 1 * SIZE (p18) FMA2 f15 = ALPHA_R, f76, f124 } ;; { .mmf (p16) lfetch.excl.nt1 [PREY1], INCY8 (p16) LDFD f104 = [Y3], 1 * SIZE (p18) FMA1 f6 = ALPHA_I, f40, f6 } { .mmf (p16) LDFD f116 = [Y4], 1 * SIZE nop __LINE__ (p18) FMA1 f7 = ALPHA_I, f52, f7 } ;; { .mmf (p16) LDFD f86 = [Y1], INCYM1 (p16) LDFD f98 = [Y2], INCYM1 (p18) FMA1 f10 = ALPHA_I, f64, f10 } { .mmf nop __LINE__ nop __LINE__ (p18) FMA1 f11 = ALPHA_I, f76, f11 } ;; { .mmf (p16) LDFD f110 = [Y3], INCYM1 (p16) LDFD f122 = [Y4], INCYM1 (p18) FMA f12 = ALPHA_I, f34, f12 } { .mmf (p19) add YY1 = YY1, INCY7M1 (p19) add YY2 = YY2, INCY7M1 (p18) FMA f13 = ALPHA_I, f46, f13 } ;; { .mmf (p16) LDFD f32 = [X1], 1 * SIZE (p16) LDFD f44 = [X2], 1 * SIZE (p18) FMA f14 = ALPHA_I, f58, f14 } { .mmf (p19) add YY3 = YY3, INCY7M1 (p19) add YY4 = YY4, INCY7M1 (p18) FMA f15 = ALPHA_I, f70, f15 } ;; { .mmf (p18) STFD [YY1] = f6, 1 * SIZE (p18) STFD [YY2] = f7, 1 * SIZE (p18) FMA f6 = ALPHA_R, f37, f85 } { .mmf (p16) LDFD f56 = [X3], 1 * SIZE (p16) LDFD f68 = [X4], 1 * SIZE (p18) FMA f7 = ALPHA_R, f49, f97 } ;; { .mmf (p18) STFD [YY3] = f10, 1 * SIZE (p18) STFD [YY4] = f11, 1 * SIZE (p18) FMA f10 = ALPHA_R, f61, f109 } { .mmf (p16) LDFD f38 = [X1], INCXM1 (p16) LDFD f50 = [X2], INCXM1 (p18) FMA f11 = ALPHA_R, f73, f121 } ;; { .mmf (p18) STFD [YY1] = f12 (p18) STFD [YY2] = f13 (p18) FMA2 f12 = ALPHA_R, f43, f91 } { .mmf (p16) LDFD f62 = [X3], INCXM1 (p16) LDFD f74 = [X4], INCXM1 (p18) FMA2 f13 = ALPHA_R, f55, f103 } ;; { .mmf (p18) STFD [YY3] = f14 (p18) STFD [YY4] = f15 (p18) FMA2 
f14 = ALPHA_R, f67, f115 } { .mmf (p16) LDFD f83 = [Y1], 1 * SIZE (p16) LDFD f95 = [Y2], 1 * SIZE (p18) FMA2 f15 = ALPHA_R, f79, f127 } ;; { .mmf (p16) LDFD f107 = [Y3], 1 * SIZE (p16) LDFD f119 = [Y4], 1 * SIZE (p18) FMA1 f6 = ALPHA_I, f43, f6 } { .mmf nop __LINE__ nop __LINE__ (p18) FMA1 f7 = ALPHA_I, f55, f7 } ;; { .mmf (p16) LDFD f89 = [Y1], INCY7M1 (p16) LDFD f101 = [Y2], INCY7M1 (p18) FMA1 f10 = ALPHA_I, f67, f10 } { .mmf (p18) add YY1 = YY1, INCYM1 (p18) add YY2 = YY2, INCYM1 (p18) FMA1 f11 = ALPHA_I, f79, f11 } ;; { .mmf (p16) LDFD f113 = [Y3], INCY7M1 (p16) LDFD f125 = [Y4], INCY7M1 (p18) FMA f12 = ALPHA_I, f37, f12 } { .mmf (p18) add YY3 = YY3, INCYM1 (p18) add YY4 = YY4, INCYM1 (p18) FMA f13 = ALPHA_I, f49, f13 } ;; { .mmf (p16) LDFD f35 = [X1], 1 * SIZE (p16) LDFD f47 = [X2], 1 * SIZE (p18) FMA f14 = ALPHA_I, f61, f14 } { .mmf (p16) LDFD f59 = [X3], 1 * SIZE (p16) LDFD f71 = [X4], 1 * SIZE (p18) FMA f15 = ALPHA_I, f73, f15 } ;; { .mmf (p18) STFD [YY1] = f6, 1 * SIZE (p18) STFD [YY2] = f7, 1 * SIZE (p17) FMA f6 = ALPHA_R, f33, f81 } { .mmf (p16) LDFD f41 = [X1], INCX7M1 (p16) LDFD f53 = [X2], INCX7M1 (p17) FMA f7 = ALPHA_R, f45, f93 } ;; { .mmf (p18) STFD [YY3] = f10, 1 * SIZE (p18) STFD [YY4] = f11, 1 * SIZE (p17) FMA f10 = ALPHA_R, f57, f105 } { .mmf (p16) LDFD f65 = [X3], INCX7M1 (p16) LDFD f77 = [X4], INCX7M1 (p17) FMA f11 = ALPHA_R, f69, f117 } ;; { .mmf (p18) STFD [YY1] = f12 (p18) STFD [YY2] = f13 (p17) FMA2 f12 = ALPHA_R, f39, f87 } { .mfb (p16) lfetch.nt1 [PREX1], INCX8 (p17) FMA2 f13 = ALPHA_R, f51, f99 br.ctop.sptk.few .L22 } ;; (p19) add YY1 = YY1, INCY7M1 (p19) add YY2 = YY2, INCY7M1 ;; { .mmf (p19) STFD [YY3] = f14 (p19) STFD [YY4] = f15 } { .mmf (p19) add YY3 = YY3, INCY7M1 (p19) add YY4 = YY4, INCY7M1 } ;; .align 32 .L25: { .mmi (p13) LDFD f32 = [X1], 1 * SIZE (p13) LDFD f36 = [X2], 1 * SIZE mov ar.lc = ARLC } ;; { .mmi (p13) LDFD f80 = [Y1], 1 * SIZE (p13) LDFD f84 = [Y2], 1 * SIZE mov pr = PR, -65474 } ;; { .mmi (p13) LDFD f33 = [X1], INCXM1 (p13) LDFD f37 = [X2], INCXM1 cmp.eq p12, p0 = r0, J } ;; { .mmb (p13) LDFD f81 = [Y1], INCYM1 (p13) LDFD f85 = [Y2], INCYM1 (p12) br.ret.sptk.many b0 } ;; { .mmi (p13) LDFD f34 = [X1], 1 * SIZE (p13) LDFD f38 = [X2], 1 * SIZE tbit.z p0, p14 = N, 1 } ;; { .mmi (p13) LDFD f82 = [Y1], 1 * SIZE (p13) LDFD f86 = [Y2], 1 * SIZE tbit.z p0, p15 = N, 0 } ;; { .mmf (p13) LDFD f35 = [X1], INCX3M1 (p13) LDFD f39 = [X2], INCX3M1 (p13) FMA f80 = ALPHA_R, f32, f80 } ;; { .mmf (p13) LDFD f83 = [Y1], INCY3M1 (p13) LDFD f87 = [Y2], INCY3M1 (p13) FMA f84 = ALPHA_R, f36, f84 } ;; { .mmf (p14) LDFD f40 = [X1], 1 * SIZE (p14) LDFD f88 = [Y1], 1 * SIZE (p13) FMA2 f81 = ALPHA_R, f33, f81 } ;; { .mmf (p14) LDFD f41 = [X1], INCXM1 (p14) LDFD f89 = [Y1], INCYM1 (p13) FMA2 f85 = ALPHA_R, f37, f85 } ;; { .mmf (p14) LDFD f42 = [X1], 1 * SIZE (p14) LDFD f90 = [Y1], 1 * SIZE (p13) FMA f82 = ALPHA_R, f34, f82 } ;; { .mmf (p14) LDFD f43 = [X1], INCXM1 (p14) LDFD f91 = [Y1], INCYM1 (p13) FMA f86 = ALPHA_R, f38, f86 } ;; { .mmf (p15) LDFD f44 = [X1], 1 * SIZE (p15) LDFD f92 = [Y1], 1 * SIZE (p13) FMA2 f83 = ALPHA_R, f35, f83 } ;; { .mmf (p15) LDFD f45 = [X1] (p15) LDFD f93 = [Y1] (p13) FMA2 f87 = ALPHA_R, f39, f87 } ;; (p13) FMA1 f80 = ALPHA_I, f33, f80 (p13) FMA1 f84 = ALPHA_I, f37, f84 (p13) FMA f81 = ALPHA_I, f32, f81 (p13) FMA f85 = ALPHA_I, f36, f85 (p13) FMA1 f82 = ALPHA_I, f35, f82 (p13) FMA1 f86 = ALPHA_I, f39, f86 (p13) FMA f83 = ALPHA_I, f34, f83 (p13) FMA f87 = ALPHA_I, f38, f87 ;; { .mmf (p13) STFD [YY1] = f80, 1 * SIZE (p13) STFD [YY2] = f84, 1 
* SIZE (p14) FMA f88 = ALPHA_R, f40, f88 } ;; { .mmf (p13) STFD [YY1] = f81 (p13) STFD [YY2] = f85 (p14) FMA2 f89 = ALPHA_R, f41, f89 } { .mmf (p13) add YY1 = YY1, INCYM1 (p13) add YY2 = YY2, INCYM1 (p14) FMA f90 = ALPHA_R, f42, f90 } ;; { .mmf (p13) STFD [YY1] = f82, 1 * SIZE (p13) STFD [YY2] = f86, 1 * SIZE (p14) FMA2 f91 = ALPHA_R, f43, f91 } ;; { .mmf (p13) STFD [YY1] = f83 (p13) STFD [YY2] = f87 (p15) FMA f92 = ALPHA_R, f44, f92 } { .mmf (p13) add YY1 = YY1, INCY3M1 nop __LINE__ (p15) FMA2 f93 = ALPHA_R, f45, f93 } ;; (p14) FMA1 f88 = ALPHA_I, f41, f88 (p14) FMA f89 = ALPHA_I, f40, f89 (p14) FMA1 f90 = ALPHA_I, f43, f90 (p14) FMA f91 = ALPHA_I, f42, f91 ;; { .mmf (p14) STFD [YY1] = f88, 1 * SIZE (p15) FMA1 f92 = ALPHA_I, f45, f92 } ;; { .mmf (p14) STFD [YY1] = f89 (p14) add YY1 = YY1, INCYM1 (p15) FMA f93 = ALPHA_I, f44, f93 } ;; (p14) STFD [YY1] = f90, 1 * SIZE ;; (p14) STFD [YY1] = f91 (p14) add YY1 = YY1, INCYM1 ;; (p15) STFD [YY1] = f92, 1 * SIZE ;; { .mmb (p15) STFD [YY1] = f93 nop __LINE__ br.ret.sptk.many b0 } ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/copy.S000066400000000000000000000357571313527062700163060ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r32 #define X1 r33 #define INCX r34 #define Y1 r35 #define INCY r36 #define PREA r2 #define PREB r3 #define I r14 #define J r15 #define X2 r16 #define Y2 r17 #define INCX3 r18 #define INCY3 r19 #define INCX5 r20 #define INCY5 r21 #define INCX16 r22 #define INCY16 r23 #define XX r24 #define YY r25 #define XA r26 #define YA r27 #define PR r30 #define ARLC r31 #ifdef DOUBLE #define PREFETCH_SIZE (4 * 32) #else #define PREFETCH_SIZE (4 * 64) #endif PROLOGUE .prologue PROFCODE { .mmi shladd INCX = INCX, BASE_SHIFT, r0 shladd INCY = INCY, BASE_SHIFT, r0 .save ar.lc, ARLC mov ARLC = ar.lc } { .mib cmp.lt p0, p6 = r0, N tbit.z p0, p7 = X1, BASE_SHIFT (p6) br.ret.sptk.many b0 } ;; .body { .mmi sub XA = Y1, X1 (p7) LDFD f32 = [X1], INCX mov PR = pr } { .mmi mov YY = Y1 (p7) adds N = -1, N (p7) add Y1 = Y1, INCY } ;; { .mmi shladd INCX5 = INCX, 2, INCX shladd INCY5 = INCY, 2, INCY mov pr.rot = 0 } { .mmi mov XX = X1 nop.m 0 shr.u XA = XA, BASE_SHIFT } ;; { .mmi and J = 15, N cmp.eq p16, p0 = r0, r0 shr I = N, 4 } { .mmb cmp.ne p6, p0 = SIZE, INCX #ifdef DOUBLE adds XA = 2, XA #else nop.m 0 #endif (p6) br.cond.dpnt .L100 } ;; /* INCX == 1 */ { .mmi shladd INCX16 = INCX, 4, r0 shladd INCY16 = INCY, 4, r0 tbit.z p0, p12 = N, 3 } { .mmi #ifdef DOUBLE and XA = 31, XA #else and XA = 63, XA #endif adds I = -1, I tbit.z p0, p13 = N, 2 } ;; { .mmi shladd X2 = INCX, 2, X1 shladd Y2 = INCY, 2, Y1 mov ar.lc = I } { .mib #ifdef DOUBLE cmp.gt p8, p0 = 15, XA #else cmp.gt p8, p0 = 30, XA #endif cmp.eq p9, p0 = r0, J (p8)br.cond.dpnt .L30 } ;; { .mmi (p7) STFD [YY] = f32 cmp.gt p8 ,p0 = r0, I mov ar.ec = 5 } { .mmb adds PREA = PREFETCH_SIZE * SIZE + 32, X1 #ifdef DOUBLE adds PREB = PREFETCH_SIZE * SIZE + 32, Y1 #else adds PREB = PREFETCH_SIZE * SIZE - 40, Y1 #endif (p8) br.cond.dpnt .L25 } ;; .align 32 .L22: { .mmi (p20) STFD [Y1] = f36 (p20) STFD [Y2] = f56 (p20) add Y1 = INCY, Y1 } { .mmi (p16) lfetch.nt1 [PREA], INCX16 (p16) LDFPD f32, f37 = [X1], 2 * SIZE (p20) add Y2 = INCY, Y2 } ;; { .mmi (p20) STFD [Y1] = f41 (p20) STFD [Y2] = f61 (p20) add Y1 = INCY, Y1 } { .mmi (p16) lfetch.excl.nt1 [PREB], INCY16 (p16) LDFPD f42, f47 = [X1], 2 * SIZE (p20) add Y2 = INCY, Y2 } ;; { .mmi (p20) STFD [Y1] = f46 (p20) STFD [Y2] = f66 (p20) add Y1 = INCY, Y1 } { .mmi (p16) LDFPD f52, f57 = [X1], 2 * SIZE nop.m 0 (p20) add Y2 = INCY, Y2 } ;; { .mmi (p20) STFD [Y1] = f51 (p20) STFD [Y2] = f71 (p20) add Y1 = INCY5, Y1 } { .mmi (p16) LDFPD f62, f67 = [X1], 2 * SIZE nop.m 0 (p20) add Y2 = INCY5, Y2 } ;; { .mmi (p20) STFD [Y1] = f76 (p20) STFD [Y2] = f96 (p16) adds XX = 8 * SIZE, X1 } { .mmi (p16) LDFPD f72, f77 = [X1], 2 * SIZE (p20) add Y1 = INCY, Y1 (p20) add Y2 = INCY, Y2 } ;; { .mmi (p20) STFD [Y1] = f81 (p20) STFD [Y2] = f101 (p20) add Y1 = INCY, Y1 } { .mmi (p16) LDFPD f82, f87 = [X1], 2 * SIZE nop.m 0 (p20) add Y2 = INCY, Y2 } ;; { .mmi (p20) STFD [Y1] = f86 (p20) STFD [Y2] = f106 (p16) shladd X2 = INCX, 2, XX } { .mmi (p16) LDFPD f92, f97 = [X1], 2 * SIZE (p20) add Y1 = INCY, Y1 (p20) add Y2 = INCY, Y2 } ;; { .mmi (p20) STFD [Y1] = f91 (p20) STFD [Y2] = f111 (p20) add Y1 = INCY5, Y1 } { .mmb (p16) LDFPD f102, f107 = [X1], 2 * SIZE (p20) add Y2 = INCY5, Y2 br.ctop.sptk.few .L22 } ;; .align 32 .L25: { .mmi (p12) LDFPD f48, f49 = [X1], 2 * SIZE (p12) LDFPD f52, f53 = [X2], 2 * SIZE mov ar.lc = ARLC } { .mmi (p12) adds XX = 8 * SIZE, XX nop.m 0 tbit.z p0, p14 = N, 1 } ;; { .mmi (p12) LDFPD f50, f51 = [X1] 
(p12) LDFPD f54, f55 = [X2] mov pr = PR, -65474 } { .mmb (p12) adds X1 = 6 * SIZE, X1 (p13) adds XX = 4 * SIZE, XX (p9) br.ret.sptk.many b0 } ;; { .mmi (p13) LDFPD f56, f57 = [X1], 2 * SIZE (p14) LDFPD f60, f61 = [XX], 2 * SIZE tbit.z p0, p15 = N, 0 } ;; { .mmi (p13) LDFPD f58, f59 = [X1], 2 * SIZE (p15) LDFD f62 = [XX] nop.i 0 } ;; { .mmi (p12) STFD [Y1] = f48 (p12) STFD [Y2] = f52 mov YY = Y1 } { .mmi (p12) add Y1 = INCY, Y1 (p12) add Y2 = INCY, Y2 nop.i 0 } ;; { .mmi (p12) STFD [Y1] = f49 (p12) STFD [Y2] = f53 (p12) add Y1 = INCY, Y1 } { .mmi (p12) add Y2 = INCY, Y2 (p12) shladd YY = INCY, 3, YY nop.i 0 } ;; { .mmi (p12) STFD [Y1] = f50 (p12) STFD [Y2] = f54 (p12) add Y1 = INCY, Y1 } { .mmi (p12) add Y2 = INCY, Y2 (p13) shladd YY = INCY, 2, YY nop.i 0 } ;; { .mmi (p12) STFD [Y1] = f51 (p12) STFD [Y2] = f55 (p12) add Y1 = INCY5, Y1 } { .mmi (p12) add Y2 = INCY5, Y2 nop.m 0 nop.i 0 } ;; { .mmi (p13) STFD [Y1] = f56 (p14) STFD [YY] = f60 (p13) add Y1 = INCY, Y1 } { .mmi (p14) add YY = INCY, YY nop.m 0 nop.i 0 } ;; { .mmi (p13) STFD [Y1] = f57 (p14) STFD [YY] = f61 (p13) add Y1 = INCY, Y1 } { .mmi (p14) add YY = INCY, YY nop.m 0 nop.i 0 } ;; { .mmi (p13) STFD [Y1] = f58 (p15) STFD [YY] = f62 (p13) add Y1 = INCY, Y1 } ;; { .mmb (p13) STFD [Y1] = f59 nop.m 0 br.ret.sptk.many b0 } .align 32 ;; .L30: { .mmi (p7) STFD [YY] = f32 cmp.gt p8 ,p0 = r0, I mov ar.ec = 4 } { .mmb adds PREA = PREFETCH_SIZE * SIZE + 24, X1 #ifdef DOUBLE adds PREB = PREFETCH_SIZE * SIZE + 64, Y1 #else adds PREB = PREFETCH_SIZE * SIZE + 72, Y1 #endif (p8) br.cond.dpnt .L35 } ;; .align 32 .L32: { .mmi (p19) STFD [Y1] = f35 (p19) STFD [Y2] = f55 (p19) add Y1 = INCY, Y1 } { .mmi (p16) lfetch.nt1 [PREA], INCX16 (p16) LDFPD f32, f37 = [X1], 2 * SIZE (p19) add Y2 = INCY, Y2 } ;; { .mmi (p19) STFD [Y1] = f40 (p19) STFD [Y2] = f60 (p19) add Y1 = INCY, Y1 } { .mmi (p16) lfetch.excl.nt1 [PREB], INCY16 (p16) LDFPD f42, f47 = [X1], 2 * SIZE (p19) add Y2 = INCY, Y2 } ;; { .mmi (p19) STFD [Y1] = f45 (p19) STFD [Y2] = f65 (p19) add Y1 = INCY, Y1 } { .mmi (p16) LDFPD f52, f57 = [X1], 2 * SIZE nop.m 0 (p19) add Y2 = INCY, Y2 } ;; { .mmi (p19) STFD [Y1] = f50 (p19) STFD [Y2] = f70 (p19) add Y1 = INCY5, Y1 } { .mmi (p16) LDFPD f62, f67 = [X1], 2 * SIZE nop.m 0 (p19) add Y2 = INCY5, Y2 } ;; { .mmi (p19) STFD [Y1] = f75 (p19) STFD [Y2] = f95 (p16) adds XX = 8 * SIZE, X1 } { .mmi (p16) LDFPD f72, f77 = [X1], 2 * SIZE (p19) add Y1 = INCY, Y1 (p19) add Y2 = INCY, Y2 } ;; { .mmi (p19) STFD [Y1] = f80 (p19) STFD [Y2] = f100 (p19) add Y1 = INCY, Y1 } { .mmi (p16) LDFPD f82, f87 = [X1], 2 * SIZE nop.m 0 (p19) add Y2 = INCY, Y2 } ;; { .mmi (p19) STFD [Y1] = f85 (p19) STFD [Y2] = f105 (p16) shladd X2 = INCX, 2, XX } { .mmi (p16) LDFPD f92, f97 = [X1], 2 * SIZE (p19) add Y1 = INCY, Y1 (p19) add Y2 = INCY, Y2 } ;; { .mmi (p19) STFD [Y1] = f90 (p19) STFD [Y2] = f110 (p19) add Y1 = INCY5, Y1 } { .mmb (p16) LDFPD f102, f107 = [X1], 2 * SIZE (p19) add Y2 = INCY5, Y2 br.ctop.sptk.few .L32 } ;; .align 32 .L35: { .mmi (p12) LDFPD f48, f49 = [X1], 2 * SIZE (p12) LDFPD f52, f53 = [X2], 2 * SIZE mov ar.lc = ARLC } { .mmi (p12) adds XX = 8 * SIZE, XX nop.m 0 tbit.z p0, p14 = N, 1 } ;; { .mmi (p12) LDFPD f50, f51 = [X1] (p12) LDFPD f54, f55 = [X2] mov pr = PR, -65474 } { .mmi (p12) adds X1 = 6 * SIZE, X1 (p12) adds X2 = 6 * SIZE, X2 (p13) adds XX = 4 * SIZE, XX } ;; { .mmi (p13) LDFPD f56, f57 = [X1], 2 * SIZE (p14) LDFPD f60, f61 = [XX], 2 * SIZE tbit.z p0, p15 = N, 0 } ;; { .mmb (p13) LDFPD f58, f59 = [X1], 2 * SIZE (p15) LDFD f62 = [XX] (p9) br.ret.sptk.many b0 } 
;; { .mmi (p12) STFD [Y1] = f48 (p12) STFD [Y2] = f52 mov YY = Y1 } { .mmi (p12) add Y1 = INCY, Y1 (p12) add Y2 = INCY, Y2 nop.i 0 } ;; { .mmi (p12) STFD [Y1] = f49 (p12) STFD [Y2] = f53 (p12) add Y1 = INCY, Y1 } { .mmi (p12) add Y2 = INCY, Y2 (p12) shladd YY = INCY, 3, YY nop.i 0 } ;; { .mmi (p12) STFD [Y1] = f50 (p12) STFD [Y2] = f54 (p12) add Y1 = INCY, Y1 } { .mmi (p12) add Y2 = INCY, Y2 (p13) shladd YY = INCY, 2, YY nop.i 0 } ;; { .mmi (p12) STFD [Y1] = f51 (p12) STFD [Y2] = f55 nop.i 0 } { .mmi (p12) add Y1 = INCY5, Y1 (p12) add Y2 = INCY5, Y2 nop.i 0 } ;; { .mmi (p13) STFD [Y1] = f56 (p14) STFD [YY] = f60 nop.i 0 } { .mmi (p13) add Y1 = INCY, Y1 (p14) add YY = INCY, YY nop.i 0 } ;; { .mmi (p13) STFD [Y1] = f57 (p14) STFD [YY] = f61 nop.i 0 } { .mmi (p13) add Y1 = INCY, Y1 (p14) add YY = INCY, YY nop.i 0 } ;; { .mmi (p13) STFD [Y1] = f58 (p15) STFD [YY] = f62 (p13) add Y1 = INCY, Y1 } ;; { .mib (p13) STFD [Y1] = f59 nop.i 0 br.ret.sptk.many b0 } .align 32 ;; /* INCX != 1 */ .L100: { .mmi shladd INCX16 = INCX, 4, r0 shladd INCY16 = INCY, 4, r0 tbit.z p0, p12 = N, 3 } { .mmi nop.m 0 nop.m 0 nop.i 0 } ;; { .mmi adds PREA = PREFETCH_SIZE * SIZE, X1 adds PREB = PREFETCH_SIZE * SIZE, Y1 mov ar.ec = 6 } { .mmi cmp.eq p8 ,p0 = r0, I cmp.eq p9, p0 = r0, J adds I = -1, I } ;; { .mmi (p7) STFD [YY] = f32 shladd X2 = INCX, 2, X1 mov ar.lc = I } { .mib shladd Y2 = INCY, 2, Y1 cmp.eq p16, p0 = r0, r0 (p8) br.cond.dpnt .L120 } ;; .align 32 .L110: { .mmi (p21) STFD [Y1] = f37 (p21) STFD [Y2] = f61 (p21) add Y1 = INCY, Y1 } { .mmi (p16) lfetch.nt1 [PREA], INCX16 (p16) lfetch.excl.nt1 [PREB], INCY16 (p21) add Y2 = INCY, Y2 } ;; { .mmi (p21) STFD [Y1] = f43 (p21) STFD [Y2] = f67 (p21) add Y1 = INCY, Y1 } { .mmi (p16) LDFD f56 = [X2], INCX (p16) LDFD f32 = [X1], INCX (p21) add Y2 = INCY, Y2 } ;; { .mmi (p21) STFD [Y1] = f49 (p21) STFD [Y2] = f73 (p21) add Y1 = INCY, Y1 } { .mmi (p16) LDFD f38 = [X1], INCX (p16) LDFD f62 = [X2], INCX (p21) add Y2 = INCY, Y2 } ;; { .mmi (p21) STFD [Y1] = f55 (p21) STFD [Y2] = f79 (p21) add Y1 = INCY5, Y1 } { .mmi (p16) LDFD f44 = [X1], INCX (p16) LDFD f68 = [X2], INCX (p21) add Y2 = INCY5, Y2 } ;; { .mmi (p21) STFD [Y1] = f85 (p21) STFD [Y2] = f109 (p21) add Y1 = INCY, Y1 } { .mmi (p16) LDFD f50 = [X1], INCX5 (p16) LDFD f74 = [X2], INCX5 (p21) add Y2 = INCY, Y2 } ;; { .mmi (p21) STFD [Y1] = f91 (p21) STFD [Y2] = f115 (p21) add Y1 = INCY, Y1 } { .mmi (p16) LDFD f80 = [X1], INCX (p16) LDFD f104 = [X2], INCX (p21) add Y2 = INCY, Y2 } ;; { .mmi (p21) STFD [Y1] = f97 (p21) STFD [Y2] = f121 (p21) add Y1 = INCY, Y1 } { .mmi (p16) LDFD f86 = [X1], INCX (p16) LDFD f110 = [X2], INCX (p21) add Y2 = INCY, Y2 } ;; { .mmi (p21) STFD [Y1] = f103 (p21) STFD [Y2] = f127 (p21) add Y1 = INCY5, Y1 } { .mmi (p16) LDFD f92 = [X1], INCX (p16) LDFD f116 = [X2], INCX (p21) add Y2 = INCY5, Y2 } ;; { .mmi nop.m 0 (p16) add XX = INCX5, X1 nop.i 0 } { .mmb (p16) LDFD f98 = [X1], INCX5 (p16) LDFD f122 = [X2], INCX5 br.ctop.sptk.few .L110 } ;; .align 32 .L120: { .mmi (p12) LDFD f48 = [X1], INCX (p12) LDFD f52 = [X2], INCX mov ar.lc = ARLC } ;; { .mmi (p12) LDFD f49 = [X1], INCX (p12) LDFD f53 = [X2], INCX mov pr = PR, -65474 } ;; { .mmi (p12) LDFD f50 = [X1], INCX (p12) LDFD f54 = [X2], INCX tbit.z p0, p13 = N, 2 } { .mmb nop.m 0 nop.m 0 (p9) br.ret.sptk.many b0 } ;; { .mmi (p12) LDFD f51 = [X1], INCX5 (p12) LDFD f55 = [X2], INCX5 (p12) shladd XX = INCX, 3, XX } ;; { .mmi (p13) LDFD f56 = [X1], INCX (p13) shladd XX = INCX, 2, XX tbit.z p0, p14 = N, 1 } ;; { .mmi (p13) LDFD f57 = [X1], INCX (p14) LDFD 
f60 = [XX], INCX } ;; { .mmi (p13) LDFD f58 = [X1], INCX (p14) LDFD f61 = [XX], INCX tbit.z p0, p15 = N, 0 } ;; { .mmi (p13) LDFD f59 = [X1], INCX (p15) LDFD f62 = [XX] mov YY = Y1 } ;; { .mmi (p12) STFD [Y1] = f48 (p12) STFD [Y2] = f52 nop.i 0 } { .mmi (p12) add Y1 = INCY, Y1 (p12) add Y2 = INCY, Y2 nop.i 0 } ;; { .mmi (p12) STFD [Y1] = f49 (p12) STFD [Y2] = f53 nop.i 0 } { .mmi (p12) add Y1 = INCY, Y1 (p12) add Y2 = INCY, Y2 nop.i 0 } ;; { .mmi (p12) STFD [Y1] = f50 (p12) STFD [Y2] = f54 nop.i 0 } { .mmi (p12) add Y1 = INCY, Y1 (p12) add Y2 = INCY, Y2 nop.i 0 } ;; { .mmi (p12) STFD [Y1] = f51 (p12) STFD [Y2] = f55 (p12) add Y1 = INCY5, Y1 } { .mmi (p12) add Y2 = INCY5, Y2 (p12) shladd YY = INCY, 3, YY nop.i 0 } ;; { .mmi (p13) STFD [Y1] = f56 (p13) add Y1 = INCY, Y1 (p13) shladd YY =INCY, 2, YY } ;; { .mmi (p13) STFD [Y1] = f57 (p14) STFD [YY] = f60 nop.i 0 } { .mmi (p13) add Y1 = INCY, Y1 (p14) add YY = INCY, YY nop.i 0 } ;; { .mmi (p13) STFD [Y1] = f58 (p14) STFD [YY] = f61 nop.i 0 } { .mmi (p13) add Y1 = INCY, Y1 (p14) add YY = INCY, YY nop.i 0 } ;; { .mmb (p13) STFD [Y1] = f59 (p15) STFD [YY] = f62 br.ret.sptk.many b0 } ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/daxpy.S000066400000000000000000000713131313527062700164450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCHSIZE (16 * 16) #define N r32 #define X1 r36 #define INCX r37 #define Y1 r38 #define INCY r39 #define PREX r2 #define PREY r3 #define I r14 #define J r15 #define X2 r16 #define Y2 r17 #define YY1 r18 #define YY2 r19 #define INCX16 r20 #define INCY16 r21 #define X3 r26 #define YY r27 #define PR r30 #define ARLC r31 #define ALPHA f8 PROLOGUE PROFCODE .prologue { .mmi shladd INCX = INCX, BASE_SHIFT, r0 shladd INCY = INCY, BASE_SHIFT, r0 .save ar.lc, ARLC mov ARLC = ar.lc } { .mib cmp.lt p0, p6 = r0, N tbit.nz p10, p0 = Y1, BASE_SHIFT (p6) br.ret.dpnt.many b0 } ;; .body { .mmi (p10) LDFD f32 = [X1], INCX (p10) LDFD f33 = [Y1] mov PR = pr } { .mmi (p10) adds N = -1, N mov YY = Y1 (p10) add Y1 = Y1, INCY } ;; { .mmi mov YY1 = Y1 shladd YY2 = INCY, 1, Y1 mov pr.rot= 0 } { .mmi sub r8 = X1, Y1 mov r9 = 0xf0 nop __LINE__ } ;; { .mmi cmp.ne p6, p0 = SIZE, INCX cmp.ne p7, p0 = SIZE, INCY tbit.nz p8, p0 = X1, BASE_SHIFT } { .mbb and J = 15, N (p6) br.cond.dpnt .L100 (p7) br.cond.dpnt .L100 } ;; { .mfi cmp.eq p16, p0 = r0, r0 (p10) FMA f9 = ALPHA, f32, f33 shr I = N, 4 } { .mmb add X3 = X1, INCX and r8 = r9, r8 (p8) br.cond.dpnt.many .L30 } ;; { .mmi cmp.eq p11, p0 = r0, J adds I = -1, I mov ar.ec = 3 } { .mib cmp.lt p9, p0 = 127, r8 tbit.nz p12, p0 = N, 3 (p9) br.cond.dpnt.many .L20 } ;; { .mmi (p10) STFD [YY] = f9 cmp.eq p7 ,p0 = -1, I mov ar.lc = I } { .mib adds PREX = (PREFETCHSIZE + 2) * SIZE, X1 adds PREY = (PREFETCHSIZE + 2) * SIZE, Y1 (p7) br.cond.dpnt .L15 } ;; .align 32 .L12: { .mmf (p18) STFD [YY1] = f6, 1 * SIZE (p18) STFD [YY2] = f7, 1 * SIZE (p18) FMA f6 = ALPHA, f58, f106 } { .mmf (p16) lfetch.fault.nt1 [PREX], 16 * SIZE (p16) LDFPD f32, f35 = [X1], 2 * SIZE (p18) FMA f7 = ALPHA, f64, f112 } ;; { .mmf (p18) STFD [YY1] = f10, 3 * SIZE (p18) STFD [YY2] = f11, 3 * SIZE (p18) FMA f10 = ALPHA, f61, f109 } { .mmf (p16) LDFPD f38, f41 = [X1], 2 * SIZE (p16) LDFPD f80, f83 = [Y1], 2 * SIZE (p18) FMA f11 = ALPHA, f67, f115 } ;; { .mmf (p18) STFD [YY1] = f12, 1 * SIZE (p18) STFD [YY2] = f13, 1 * SIZE (p18) FMA f12 = ALPHA, f70, f118 } { .mmf (p16) LDFPD f44, f47 = [X1], 2 * SIZE (p16) LDFPD f86, f89 = [Y1], 2 * SIZE (p18) FMA f13 = ALPHA, f76, f124 } ;; { .mmf (p18) STFD [YY1] = f14, 3 * SIZE (p18) STFD [YY2] = f15, 3 * SIZE (p18) FMA f14 = ALPHA, f73, f121 } { .mmf (p16) LDFPD f50, f53 = [X1], 2 * SIZE (p16) LDFPD f92, f95 = [Y1], 2 * SIZE (p18) FMA f15 = ALPHA, f79, f127 } ;; { .mmf (p18) STFD [YY1] = f6, 1 * SIZE (p18) STFD [YY2] = f7, 1 * SIZE (p17) FMA f6 = ALPHA, f33, f81 } { .mmf (p16) LDFPD f56, f59 = [X1], 2 * SIZE (p16) LDFPD f98, f101 = [Y1], 2 * SIZE (p17) FMA f7 = ALPHA, f39, f87 } ;; { .mmf (p18) STFD [YY1] = f10, 3 * SIZE (p18) STFD [YY2] = f11, 3 * SIZE (p17) FMA f10 = ALPHA, f36, f84 } { .mmf (p16) LDFPD f62, f65 = [X1], 2 * SIZE (p16) LDFPD f104, f107 = [Y1], 2 * SIZE (p17) FMA f11 = ALPHA, f42, f90 } ;; { .mmf (p18) STFD [YY1] = f12, 1 * SIZE (p18) STFD [YY2] = f13, 1 * SIZE (p17) FMA f12 = ALPHA, f45, f93 } { .mmf (p16) LDFPD f68, f71 = [X1], 2 * SIZE (p16) LDFPD f110, f113 = [Y1], 2 * SIZE (p17) FMA f13 = ALPHA, f51, f99 } ;; { .mmf (p18) STFD [YY1] = f14, 3 * SIZE (p18) STFD [YY2] = f15, 3 * SIZE (p17) FMA f14 = ALPHA, f48, f96 } { .mmf (p16) LDFPD f74, f77 = [X1], 2 * SIZE (p16) LDFPD f116, f119 = [Y1], 2 * SIZE (p17) FMA f15 = ALPHA, f54, f102 } ;; { .mmi (p16) lfetch.fault.excl.nt1 [PREY], 16 * SIZE (p16) LDFPD f122, f125 = [Y1], 2 * SIZE nop 
__LINE__ } { .mmb nop __LINE__ nop __LINE__ br.ctop.sptk.few .L12 } ;; .align 32 .L15: { .mmi (p12) LDFPD f32, f33 = [X1], 2 * SIZE (p12) LDFPD f34, f35 = [Y1], 2 * SIZE mov pr = PR, -65474 } ;; { .mmi (p12) LDFPD f36, f37 = [X1], 2 * SIZE (p12) LDFPD f38, f39 = [Y1], 2 * SIZE mov ar.lc = ARLC } ;; { .mmb (p12) LDFPD f40, f41 = [X1], 2 * SIZE (p12) LDFPD f42, f43 = [Y1], 2 * SIZE (p11) br.ret.dpnt.many b0 } ;; { .mmi (p12) LDFPD f44, f45 = [X1], 2 * SIZE (p12) LDFPD f46, f47 = [Y1], 2 * SIZE tbit.nz p13, p0 = N, 2 } ;; { .mmi (p13) LDFPD f48, f49 = [X1], 2 * SIZE (p13) LDFPD f50, f51 = [Y1], 2 * SIZE tbit.nz p14, p0 = N, 1 } ;; { .mmi (p13) LDFPD f52, f53 = [X1], 2 * SIZE (p13) LDFPD f54, f55 = [Y1], 2 * SIZE tbit.nz p15, p0 = N, 0 } ;; { .mmi (p14) LDFPD f56, f57 = [X1], 2 * SIZE (p14) LDFPD f58, f59 = [Y1], 2 * SIZE nop __LINE__ } ;; { .mmi (p15) LDFD f60 = [X1] (p15) LDFD f61 = [Y1] nop __LINE__ } ;; (p12) FMA f6 = ALPHA, f32, f34 (p12) FMA f7 = ALPHA, f36, f38 (p12) FMA f10 = ALPHA, f33, f35 (p12) FMA f11 = ALPHA, f37, f39 (p12) FMA f12 = ALPHA, f40, f42 (p12) FMA f13 = ALPHA, f44, f46 (p12) FMA f14 = ALPHA, f41, f43 (p12) FMA f15 = ALPHA, f45, f47 ;; { .mmf (p12) STFD [YY1] = f6, 1 * SIZE (p12) STFD [YY2] = f7, 1 * SIZE (p13) FMA f6 = ALPHA, f48, f50 } { .mmf nop __LINE__ nop __LINE__ (p13) FMA f7 = ALPHA, f52, f54 } ;; { .mmf (p12) STFD [YY1] = f10, 3 * SIZE (p12) STFD [YY2] = f11, 3 * SIZE (p13) FMA f10 = ALPHA, f49, f51 } { .mmf nop __LINE__ nop __LINE__ (p13) FMA f11 = ALPHA, f53, f55 } ;; { .mmf (p12) STFD [YY1] = f12, 1 * SIZE (p12) STFD [YY2] = f13, 1 * SIZE (p14) FMA f12 = ALPHA, f56, f58 } ;; { .mmf (p12) STFD [YY1] = f14, 3 * SIZE (p12) STFD [YY2] = f15, 3 * SIZE (p14) FMA f13 = ALPHA, f57, f59 } ;; { .mmf (p13) STFD [YY1] = f6, 1 * SIZE (p13) STFD [YY2] = f7, 1 * SIZE (p15) FMA f14 = ALPHA, f60, f61 } ;; { .mmi (p13) STFD [YY1] = f10, 3 * SIZE (p13) STFD [YY2] = f11, 3 * SIZE } ;; { .mmi (p14) STFD [YY1] = f12, 1 * SIZE ;; (p14) STFD [YY1] = f13, 1 * SIZE nop __LINE__ } ;; { .mmb (p15) STFD [YY1] = f14 nop __LINE__ br.ret.sptk.many b0 } ;; .align 32 .L20: { .mmi (p10) STFD [YY] = f9 cmp.eq p7 ,p0 = -1, I mov ar.lc = I } { .mib adds PREX = (PREFETCHSIZE - 4) * SIZE, X1 adds PREY = (PREFETCHSIZE + 2) * SIZE, Y1 (p7) br.cond.dpnt .L25 } ;; .align 32 .L22: { .mmf (p18) STFD [YY1] = f6, 1 * SIZE (p18) STFD [YY2] = f7, 1 * SIZE (p18) FMA f6 = ALPHA, f58, f106 } { .mmf (p16) lfetch.fault.nt1 [PREX], 16 * SIZE (p17) LDFPD f57, f60 = [X1], 2 * SIZE (p18) FMA f7 = ALPHA, f64, f112 } ;; { .mmf (p18) STFD [YY1] = f10, 3 * SIZE (p18) STFD [YY2] = f11, 3 * SIZE (p18) FMA f10 = ALPHA, f61, f109 } { .mmf (p16) lfetch.fault.excl.nt1 [PREY], 16 * SIZE (p16) LDFPD f80, f83 = [Y1], 2 * SIZE (p18) FMA f11 = ALPHA, f67, f115 } ;; { .mmf (p18) STFD [YY1] = f12, 1 * SIZE (p18) STFD [YY2] = f13, 1 * SIZE (p18) FMA f12 = ALPHA, f70, f118 } { .mmf (p17) LDFPD f63, f66 = [X1], 2 * SIZE (p16) LDFPD f86, f89 = [Y1], 2 * SIZE (p18) FMA f13 = ALPHA, f76, f124 } ;; { .mmf (p18) STFD [YY1] = f14, 3 * SIZE (p18) STFD [YY2] = f15, 3 * SIZE (p18) FMA f14 = ALPHA, f73, f121 } { .mmf (p17) LDFPD f69, f72 = [X1], 2 * SIZE (p16) LDFPD f92, f95 = [Y1], 2 * SIZE (p18) FMA f15 = ALPHA, f79, f127 } ;; { .mmf (p18) STFD [YY1] = f6, 1 * SIZE (p18) STFD [YY2] = f7, 1 * SIZE (p17) FMA f6 = ALPHA, f33, f81 } { .mmf (p17) LDFPD f75, f78 = [X1], 2 * SIZE (p16) LDFPD f98, f101 = [Y1], 2 * SIZE (p17) FMA f7 = ALPHA, f39, f87 } ;; { .mmf (p18) STFD [YY1] = f10, 3 * SIZE (p18) STFD [YY2] = f11, 3 * SIZE (p17) FMA f10 = ALPHA, 
f36, f84 } { .mmf (p16) LDFPD f32, f35 = [X1], 2 * SIZE (p16) LDFPD f104, f107 = [Y1], 2 * SIZE (p17) FMA f11 = ALPHA, f42, f90 } ;; { .mmf (p18) STFD [YY1] = f12, 1 * SIZE (p18) STFD [YY2] = f13, 1 * SIZE (p17) FMA f12 = ALPHA, f45, f93 } { .mmf (p16) LDFPD f38, f41 = [X1], 2 * SIZE (p16) LDFPD f110, f113 = [Y1], 2 * SIZE (p17) FMA f13 = ALPHA, f51, f99 } ;; { .mmf (p18) STFD [YY1] = f14, 3 * SIZE (p18) STFD [YY2] = f15, 3 * SIZE (p17) FMA f14 = ALPHA, f48, f96 } { .mmf (p16) LDFPD f44, f47 = [X1], 2 * SIZE (p16) LDFPD f116, f119 = [Y1], 2 * SIZE (p17) FMA f15 = ALPHA, f54, f102 } ;; { .mmi (p16) LDFPD f50, f53 = [X1], 2 * SIZE (p16) LDFPD f122, f125 = [Y1], 2 * SIZE nop __LINE__ } { .mmb nop __LINE__ nop __LINE__ br.ctop.sptk.few .L22 } ;; .align 32 .L25: { .mmi (p12) LDFPD f32, f33 = [X1], 2 * SIZE (p12) LDFPD f34, f35 = [Y1], 2 * SIZE mov pr = PR, -65474 } ;; { .mmi (p12) LDFPD f36, f37 = [X1], 2 * SIZE (p12) LDFPD f38, f39 = [Y1], 2 * SIZE mov ar.lc = ARLC } ;; { .mmb (p12) LDFPD f40, f41 = [X1], 2 * SIZE (p12) LDFPD f42, f43 = [Y1], 2 * SIZE (p11) br.ret.dpnt.many b0 } ;; { .mmi (p12) LDFPD f44, f45 = [X1], 2 * SIZE (p12) LDFPD f46, f47 = [Y1], 2 * SIZE tbit.nz p13, p0 = N, 2 } ;; { .mmi (p13) LDFPD f48, f49 = [X1], 2 * SIZE (p13) LDFPD f50, f51 = [Y1], 2 * SIZE tbit.nz p14, p0 = N, 1 } ;; { .mmi (p13) LDFPD f52, f53 = [X1], 2 * SIZE (p13) LDFPD f54, f55 = [Y1], 2 * SIZE tbit.nz p15, p0 = N, 0 } ;; { .mmi (p14) LDFPD f56, f57 = [X1], 2 * SIZE (p14) LDFPD f58, f59 = [Y1], 2 * SIZE nop __LINE__ } ;; { .mmi (p15) LDFD f60 = [X1] (p15) LDFD f61 = [Y1] nop __LINE__ } ;; (p12) FMA f6 = ALPHA, f32, f34 (p12) FMA f7 = ALPHA, f36, f38 (p12) FMA f10 = ALPHA, f33, f35 (p12) FMA f11 = ALPHA, f37, f39 (p12) FMA f12 = ALPHA, f40, f42 (p12) FMA f13 = ALPHA, f44, f46 (p12) FMA f14 = ALPHA, f41, f43 (p12) FMA f15 = ALPHA, f45, f47 ;; { .mmf (p12) STFD [YY1] = f6, 1 * SIZE (p12) STFD [YY2] = f7, 1 * SIZE (p13) FMA f6 = ALPHA, f48, f50 } { .mmf nop __LINE__ nop __LINE__ (p13) FMA f7 = ALPHA, f52, f54 } ;; { .mmf (p12) STFD [YY1] = f10, 3 * SIZE (p12) STFD [YY2] = f11, 3 * SIZE (p13) FMA f10 = ALPHA, f49, f51 } { .mmf nop __LINE__ nop __LINE__ (p13) FMA f11 = ALPHA, f53, f55 } ;; { .mmf (p12) STFD [YY1] = f12, 1 * SIZE (p12) STFD [YY2] = f13, 1 * SIZE (p14) FMA f12 = ALPHA, f56, f58 } ;; { .mmf (p12) STFD [YY1] = f14, 3 * SIZE (p12) STFD [YY2] = f15, 3 * SIZE (p14) FMA f13 = ALPHA, f57, f59 } ;; { .mmf (p13) STFD [YY1] = f6, 1 * SIZE (p13) STFD [YY2] = f7, 1 * SIZE (p15) FMA f14 = ALPHA, f60, f61 } ;; { .mmi (p13) STFD [YY1] = f10, 3 * SIZE (p13) STFD [YY2] = f11, 3 * SIZE } ;; { .mmi (p14) STFD [YY1] = f12, 1 * SIZE ;; (p14) STFD [YY1] = f13, 1 * SIZE nop __LINE__ } ;; { .mmb (p15) STFD [YY1] = f14 nop __LINE__ br.ret.sptk.many b0 } ;; .align 32 .L30: { .mmi cmp.eq p11, p0 = r0, J adds I = -1, I mov ar.ec = 3 } { .mib cmp.lt p9, p0 = 127, r8 tbit.nz p12, p0 = N, 3 (p9) br.cond.dptk.many .L40 } ;; { .mmi (p10) STFD [YY] = f9 cmp.eq p7 ,p0 = -1, I mov ar.lc = I } { .mib adds PREX = (PREFETCHSIZE + 2) * SIZE, X1 adds PREY = (PREFETCHSIZE + 2) * SIZE, Y1 (p7) br.cond.dpnt .L35 } ;; .align 32 .L32: { .mmf (p18) STFD [YY1] = f6, 1 * SIZE (p18) STFD [YY2] = f7, 1 * SIZE (p18) FMA f6 = ALPHA, f58, f106 } { .mmf (p16) lfetch.fault.nt1 [PREX], 16 * SIZE (p16) LDFD f32 = [X1], 1 * SIZE (p18) FMA f7 = ALPHA, f64, f112 } ;; { .mmf (p18) STFD [YY1] = f10, 3 * SIZE (p18) STFD [YY2] = f11, 3 * SIZE (p18) FMA f10 = ALPHA, f61, f109 } { .mmf (p16) LDFPD f35, f38 = [X1], 2 * SIZE (p16) LDFPD f80, f83 = [Y1], 2 * SIZE 
(p18) FMA f11 = ALPHA, f67, f115 } ;; { .mmf (p18) STFD [YY1] = f12, 1 * SIZE (p18) STFD [YY2] = f13, 1 * SIZE (p18) FMA f12 = ALPHA, f70, f118 } { .mmf (p16) LDFPD f41, f44 = [X1], 2 * SIZE (p16) LDFPD f86, f89 = [Y1], 2 * SIZE (p18) FMA f13 = ALPHA, f76, f124 } ;; { .mmf (p18) STFD [YY1] = f14, 3 * SIZE (p18) STFD [YY2] = f15, 3 * SIZE (p18) FMA f14 = ALPHA, f73, f121 } { .mmf (p16) LDFPD f47, f50 = [X1], 2 * SIZE (p16) LDFPD f92, f95 = [Y1], 2 * SIZE (p18) FMA f15 = ALPHA, f79, f127 } ;; { .mmf (p18) STFD [YY1] = f6, 1 * SIZE (p18) STFD [YY2] = f7, 1 * SIZE (p17) FMA f6 = ALPHA, f33, f81 } { .mmf (p16) LDFPD f53, f56 = [X1], 2 * SIZE (p16) LDFPD f98, f101 = [Y1], 2 * SIZE (p17) FMA f7 = ALPHA, f39, f87 } ;; { .mmf (p18) STFD [YY1] = f10, 3 * SIZE (p18) STFD [YY2] = f11, 3 * SIZE (p17) FMA f10 = ALPHA, f36, f84 } { .mmf (p16) LDFPD f59, f62 = [X1], 2 * SIZE (p16) LDFPD f104, f107 = [Y1], 2 * SIZE (p17) FMA f11 = ALPHA, f42, f90 } ;; { .mmf (p18) STFD [YY1] = f12, 1 * SIZE (p18) STFD [YY2] = f13, 1 * SIZE (p17) FMA f12 = ALPHA, f45, f93 } { .mmf (p16) LDFPD f65, f68 = [X1], 2 * SIZE (p16) LDFPD f110, f113 = [Y1], 2 * SIZE (p17) FMA f13 = ALPHA, f51, f99 } ;; { .mmf (p18) STFD [YY1] = f14, 3 * SIZE (p18) STFD [YY2] = f15, 3 * SIZE (p17) FMA f14 = ALPHA, f48, f96 } { .mmf (p16) LDFPD f71, f74 = [X1], 2 * SIZE (p16) LDFPD f116, f119 = [Y1], 2 * SIZE (p17) FMA f15 = ALPHA, f54, f102 } ;; { .mmi (p16) lfetch.fault.excl.nt1 [PREY], 16 * SIZE (p16) LDFPD f122, f125 = [Y1], 2 * SIZE adds X3 = 1 * SIZE, X1 } { .mmb (p16) LDFD f77 = [X1], 1 * SIZE nop __LINE__ br.ctop.sptk.few .L32 } ;; .align 32 .L35: { .mmi (p12) LDFPD f33, f36 = [X3] (p12) LDFPD f34, f35 = [Y1], 2 * SIZE mov pr = PR, -65474 } { .mmi (p12) LDFD f32 = [X1], 3 * SIZE (p12) adds X3 = 8 * SIZE, X3 nop __LINE__ } ;; { .mmi (p12) LDFPD f37, f40 = [X1], 2 * SIZE (p12) LDFPD f38, f39 = [Y1], 2 * SIZE mov ar.lc = ARLC } ;; { .mmb (p12) LDFPD f41, f44 = [X1], 2 * SIZE (p12) LDFPD f42, f43 = [Y1], 2 * SIZE (p11) br.ret.dpnt.many b0 } ;; { .mmi (p12) LDFD f45 = [X1], 1 * SIZE (p12) LDFPD f46, f47 = [Y1], 2 * SIZE tbit.nz p13, p0 = N, 2 } ;; { .mmi (p13) LDFPD f49, f52 = [X3] (p13) LDFPD f50, f51 = [Y1], 2 * SIZE tbit.nz p14, p0 = N, 1 } { .mmi (p13) LDFD f48 = [X1], 3 * SIZE (p13) adds X3 = 4 * SIZE, X3 nop __LINE__ } ;; { .mmi (p13) LDFD f53 = [X1], 1 * SIZE (p13) LDFPD f54, f55 = [Y1], 2 * SIZE tbit.nz p15, p0 = N, 0 } ;; { .mmi (p14) LDFD f56 = [X1], 2 * SIZE (p14) LDFPD f58, f59 = [Y1], 2 * SIZE nop __LINE__ } { .mmi (p14) LDFD f57 = [X3] nop __LINE__ nop __LINE__ } ;; { .mmi (p15) LDFD f60 = [X1] (p15) LDFD f61 = [Y1] nop __LINE__ } ;; (p12) FMA f6 = ALPHA, f32, f34 (p12) FMA f7 = ALPHA, f36, f38 (p12) FMA f10 = ALPHA, f33, f35 (p12) FMA f11 = ALPHA, f37, f39 (p12) FMA f12 = ALPHA, f40, f42 (p12) FMA f13 = ALPHA, f44, f46 (p12) FMA f14 = ALPHA, f41, f43 (p12) FMA f15 = ALPHA, f45, f47 ;; { .mmf (p12) STFD [YY1] = f6, 1 * SIZE (p12) STFD [YY2] = f7, 1 * SIZE (p13) FMA f6 = ALPHA, f48, f50 } { .mmf nop __LINE__ nop __LINE__ (p13) FMA f7 = ALPHA, f52, f54 } ;; { .mmf (p12) STFD [YY1] = f10, 3 * SIZE (p12) STFD [YY2] = f11, 3 * SIZE (p13) FMA f10 = ALPHA, f49, f51 } { .mmf nop __LINE__ nop __LINE__ (p13) FMA f11 = ALPHA, f53, f55 } ;; { .mmf (p12) STFD [YY1] = f12, 1 * SIZE (p12) STFD [YY2] = f13, 1 * SIZE (p14) FMA f12 = ALPHA, f56, f58 } ;; { .mmf (p12) STFD [YY1] = f14, 3 * SIZE (p12) STFD [YY2] = f15, 3 * SIZE (p14) FMA f13 = ALPHA, f57, f59 } ;; { .mmf (p13) STFD [YY1] = f6, 1 * SIZE (p13) STFD [YY2] = f7, 1 * SIZE (p15) FMA f14 = 
ALPHA, f60, f61 } ;; { .mmi (p13) STFD [YY1] = f10, 3 * SIZE (p13) STFD [YY2] = f11, 3 * SIZE } ;; { .mmi (p14) STFD [YY1] = f12, 1 * SIZE ;; (p14) STFD [YY1] = f13, 1 * SIZE nop __LINE__ } ;; { .mmb (p15) STFD [YY1] = f14 nop __LINE__ br.ret.sptk.many b0 } ;; .align 32 .L40: { .mmi (p10) STFD [YY] = f9 cmp.eq p7 ,p0 = -1, I mov ar.lc = I } { .mib adds PREX = (PREFETCHSIZE + 2) * SIZE, X1 adds PREY = (PREFETCHSIZE + 8) * SIZE, Y1 (p7) br.cond.dpnt .L45 } ;; .align 32 .L42: { .mmf (p18) STFD [YY1] = f6, 1 * SIZE (p18) STFD [YY2] = f7, 1 * SIZE (p18) FMA f6 = ALPHA, f58, f106 } { .mmf (p17) LDFPD f54, f57 = [X1], 2 * SIZE (p16) LDFPD f80, f83 = [Y1], 2 * SIZE (p18) FMA f7 = ALPHA, f64, f112 } ;; { .mmf (p18) STFD [YY1] = f10, 3 * SIZE (p18) STFD [YY2] = f11, 3 * SIZE (p18) FMA f64 = ALPHA, f61, f109 } { .mmf (p17) LDFPD f60, f63 = [X1], 2 * SIZE (p16) LDFPD f86, f89 = [Y1], 2 * SIZE (p18) FMA f11 = ALPHA, f67, f115 } ;; { .mmf (p18) STFD [YY1] = f12, 1 * SIZE (p18) STFD [YY2] = f13, 1 * SIZE (p18) FMA f67 = ALPHA, f70, f118 } { .mmf (p17) LDFPD f66, f69 = [X1], 2 * SIZE (p16) LDFPD f92, f95 = [Y1], 2 * SIZE (p18) FMA f13 = ALPHA, f76, f124 } ;; { .mmf (p18) STFD [YY1] = f14, 3 * SIZE (p18) STFD [YY2] = f15, 3 * SIZE (p18) FMA f9 = ALPHA, f73, f121 } { .mmf (p17) LDFPD f72, f75 = [X1], 2 * SIZE (p16) LDFPD f98, f101 = [Y1], 2 * SIZE (p18) FMA f15 = ALPHA, f79, f127 } ;; { .mmi (p18) STFD [YY1] = f6, 1 * SIZE (p18) STFD [YY2] = f7, 1 * SIZE (p17) adds X3 = 2 * SIZE, X1 } { .mmf (p16) LDFPD f104, f107 = [Y1], 2 * SIZE (p17) LDFD f78 = [X1], 1 * SIZE (p17) FMA f6 = ALPHA, f33, f81 } ;; { .mmf (p16) LDFPD f110, f113 = [Y1], 2 * SIZE (p16) lfetch.fault.nt1 [PREX], 16 * SIZE (p17) FMA f7 = ALPHA, f39, f87 } { .mmf (p16) LDFD f32 = [X1], 1 * SIZE (p17) FMA f10 = ALPHA, f36, f84 } ;; { .mmf (p18) STFD [YY1] = f64, 3 * SIZE (p18) STFD [YY2] = f11, 3 * SIZE (p17) FMA f11 = ALPHA, f42, f90 } { .mmf (p16) LDFPD f35, f38 = [X1], 2 * SIZE (p16) LDFPD f116, f119 = [Y1], 2 * SIZE (p17) FMA f12 = ALPHA, f45, f93 } ;; { .mmf (p18) STFD [YY1] = f67, 1 * SIZE (p18) STFD [YY2] = f13, 1 * SIZE (p17) FMA f13 = ALPHA, f51, f99 } { .mmf (p16) LDFPD f41, f44 = [X1], 2 * SIZE (p16) LDFPD f122, f125 = [Y1], 2 * SIZE (p17) FMA f14 = ALPHA, f48, f96 } ;; { .mmf (p18) STFD [YY1] = f9, 3 * SIZE (p18) STFD [YY2] = f15, 3 * SIZE (p17) FMA f15 = ALPHA, f54, f102 } { .mmb (p16) lfetch.fault.excl.nt1 [PREY], 16 * SIZE (p16) LDFPD f47, f50 = [X1], 2 * SIZE br.ctop.sptk.few .L42 } ;; .align 32 .L45: { .mmi (p12) LDFPD f33, f36 = [X3] (p12) LDFPD f34, f35 = [Y1], 2 * SIZE mov pr = PR, -65474 } { .mmi (p12) LDFD f32 = [X1], 3 * SIZE (p12) adds X3 = 8 * SIZE, X3 nop __LINE__ } ;; { .mmi (p12) LDFPD f37, f40 = [X1], 2 * SIZE (p12) LDFPD f38, f39 = [Y1], 2 * SIZE mov ar.lc = ARLC } ;; { .mmb (p12) LDFPD f41, f44 = [X1], 2 * SIZE (p12) LDFPD f42, f43 = [Y1], 2 * SIZE (p11) br.ret.dpnt.many b0 } ;; { .mmi (p12) LDFD f45 = [X1], 1 * SIZE (p12) LDFPD f46, f47 = [Y1], 2 * SIZE tbit.nz p13, p0 = N, 2 } ;; { .mmi (p13) LDFPD f49, f52 = [X3] (p13) LDFPD f50, f51 = [Y1], 2 * SIZE tbit.nz p14, p0 = N, 1 } { .mmi (p13) LDFD f48 = [X1], 3 * SIZE (p13) adds X3 = 4 * SIZE, X3 nop __LINE__ } ;; { .mmi (p13) LDFD f53 = [X1], 1 * SIZE (p13) LDFPD f54, f55 = [Y1], 2 * SIZE tbit.nz p15, p0 = N, 0 } ;; { .mmi (p14) LDFD f56 = [X1], 2 * SIZE (p14) LDFPD f58, f59 = [Y1], 2 * SIZE nop __LINE__ } { .mmi (p14) LDFD f57 = [X3] nop __LINE__ nop __LINE__ } ;; { .mmi (p15) LDFD f60 = [X1] (p15) LDFD f61 = [Y1] nop __LINE__ } ;; (p12) FMA f6 = ALPHA, f32, f34 (p12) 
FMA f7 = ALPHA, f36, f38 (p12) FMA f10 = ALPHA, f33, f35 (p12) FMA f11 = ALPHA, f37, f39 (p12) FMA f12 = ALPHA, f40, f42 (p12) FMA f13 = ALPHA, f44, f46 (p12) FMA f14 = ALPHA, f41, f43 (p12) FMA f15 = ALPHA, f45, f47 ;; { .mmf (p12) STFD [YY1] = f6, 1 * SIZE (p12) STFD [YY2] = f7, 1 * SIZE (p13) FMA f6 = ALPHA, f48, f50 } { .mmf nop __LINE__ nop __LINE__ (p13) FMA f7 = ALPHA, f52, f54 } ;; { .mmf (p12) STFD [YY1] = f10, 3 * SIZE (p12) STFD [YY2] = f11, 3 * SIZE (p13) FMA f10 = ALPHA, f49, f51 } { .mmf nop __LINE__ nop __LINE__ (p13) FMA f11 = ALPHA, f53, f55 } ;; { .mmf (p12) STFD [YY1] = f12, 1 * SIZE (p12) STFD [YY2] = f13, 1 * SIZE (p14) FMA f12 = ALPHA, f56, f58 } ;; { .mmf (p12) STFD [YY1] = f14, 3 * SIZE (p12) STFD [YY2] = f15, 3 * SIZE (p14) FMA f13 = ALPHA, f57, f59 } ;; { .mmf (p13) STFD [YY1] = f6, 1 * SIZE (p13) STFD [YY2] = f7, 1 * SIZE (p15) FMA f14 = ALPHA, f60, f61 } ;; { .mmi (p13) STFD [YY1] = f10, 3 * SIZE (p13) STFD [YY2] = f11, 3 * SIZE } ;; { .mmi (p14) STFD [YY1] = f12, 1 * SIZE ;; (p14) STFD [YY1] = f13, 1 * SIZE nop __LINE__ } ;; { .mmb (p15) STFD [YY1] = f14 nop __LINE__ br.ret.sptk.many b0 } ;; .align 32 .L100: { .mfi cmp.eq p16, p0 = r0, r0 (p10) FMA f9 = ALPHA, f32, f33 shr I = N, 4 } ;; { .mmi cmp.eq p11, p0 = r0, J adds I = -1, I mov ar.ec = 3 } { .mmi shladd INCX16 = INCX, 4, r0 shladd INCY16 = INCY, 4, r0 tbit.nz p12, p0 = N, 3 } ;; { .mmi (p10) STFD [YY] = f9 cmp.eq p7 ,p0 = -1, I mov ar.lc = I } { .mib adds PREX = (PREFETCHSIZE + 2) * SIZE, X1 adds PREY = (PREFETCHSIZE + 2) * SIZE, Y1 (p7) br.cond.dpnt .L115 } ;; .align 32 .L112: { .mmf (p18) STFD [YY1] = f6 (p16) lfetch.fault.nt1 [PREX], INCX16 (p18) FMA f12 = ALPHA, f46, f94 } { .mmi (p16) LDFD f32 = [X1], INCX (p16) LDFD f80 = [Y1], INCY (p18) add YY1 = YY1, INCY } ;; { .mmf (p18) STFD [YY1] = f7 (p18) add YY1 = YY1, INCY (p18) FMA f13 = ALPHA, f49, f97 } { .mmi (p16) LDFD f35 = [X1], INCX (p16) LDFD f83 = [Y1], INCY nop __LINE__ } ;; { .mmf (p18) STFD [YY1] = f10 (p18) add YY1 = YY1, INCY (p18) FMA f14 = ALPHA, f52, f100 } { .mmi (p16) LDFD f38 = [X1], INCX (p16) LDFD f86 = [Y1], INCY nop __LINE__ } ;; { .mmf (p18) STFD [YY1] = f11 (p18) add YY1 = YY1, INCY (p18) FMA f15 = ALPHA, f55, f103 } { .mmi (p16) LDFD f41 = [X1], INCX (p16) LDFD f89 = [Y1], INCY nop __LINE__ } ;; { .mmf (p18) STFD [YY1] = f12 (p18) add YY1 = YY1, INCY (p18) FMA f6 = ALPHA, f58, f106 } { .mmi (p16) LDFD f44 = [X1], INCX (p16) LDFD f92 = [Y1], INCY nop __LINE__ } ;; { .mmf (p18) STFD [YY1] = f13 (p18) add YY1 = YY1, INCY (p18) FMA f7 = ALPHA, f61, f109 } { .mmi (p16) LDFD f47 = [X1], INCX (p16) LDFD f95 = [Y1], INCY nop __LINE__ } ;; { .mmf (p18) STFD [YY1] = f14 (p18) add YY1 = YY1, INCY (p18) FMA f10 = ALPHA, f64, f112 } { .mmi (p16) LDFD f50 = [X1], INCX (p16) LDFD f98 = [Y1], INCY nop __LINE__ } ;; { .mmf (p18) STFD [YY1] = f15 (p18) add YY1 = YY1, INCY (p18) FMA f11 = ALPHA, f67, f115 } { .mmi (p16) LDFD f53 = [X1], INCX (p16) LDFD f101 = [Y1], INCY nop __LINE__ } ;; { .mmf (p18) STFD [YY1] = f6 (p16) lfetch.fault.excl.nt1 [PREY], INCY16 (p18) FMA f12 = ALPHA, f70, f118 } { .mmi (p16) LDFD f56 = [X1], INCX (p16) LDFD f104 = [Y1], INCY (p18) add YY1 = YY1, INCY } ;; { .mmf (p18) STFD [YY1] = f7 (p18) add YY1 = YY1, INCY (p18) FMA f13 = ALPHA, f73, f121 } { .mmi (p16) LDFD f59 = [X1], INCX (p16) LDFD f107 = [Y1], INCY nop __LINE__ } ;; { .mmf (p18) STFD [YY1] = f10 (p18) add YY1 = YY1, INCY (p18) FMA f14 = ALPHA, f76, f124 } { .mmi (p16) LDFD f62 = [X1], INCX (p16) LDFD f110 = [Y1], INCY nop __LINE__ } ;; { .mmf (p18) STFD 
[YY1] = f11 (p18) add YY1 = YY1, INCY (p18) FMA f15 = ALPHA, f79, f127 } { .mmi (p16) LDFD f65 = [X1], INCX (p16) LDFD f113 = [Y1], INCY nop __LINE__ } ;; { .mmf (p18) STFD [YY1] = f12 (p18) add YY1 = YY1, INCY (p17) FMA f6 = ALPHA, f33, f81 } { .mmi (p16) LDFD f68 = [X1], INCX (p16) LDFD f116 = [Y1], INCY nop __LINE__ } ;; { .mmf (p18) STFD [YY1] = f13 (p18) add YY1 = YY1, INCY (p17) FMA f7 = ALPHA, f36, f84 } { .mmi (p16) LDFD f71 = [X1], INCX (p16) LDFD f119 = [Y1], INCY nop __LINE__ } ;; { .mmf (p18) STFD [YY1] = f14 (p18) add YY1 = YY1, INCY (p17) FMA f10 = ALPHA, f39, f87 } { .mmi (p16) LDFD f74 = [X1], INCX (p16) LDFD f122 = [Y1], INCY nop __LINE__ } ;; { .mmf (p18) STFD [YY1] = f15 (p18) add YY1 = YY1, INCY (p17) FMA f11 = ALPHA, f42, f90 } { .mmb (p16) LDFD f77 = [X1], INCX (p16) LDFD f125 = [Y1], INCY br.ctop.sptk.few .L112 } ;; .align 32 .L115: { .mmi (p12) LDFD f32 = [X1], INCX (p12) LDFD f34 = [Y1], INCY mov pr = PR, -65474 } ;; { .mmi (p12) LDFD f33 = [X1], INCX (p12) LDFD f35 = [Y1], INCY mov ar.lc = ARLC } ;; { .mmb (p12) LDFD f36 = [X1], INCX (p12) LDFD f38 = [Y1], INCY (p11) br.ret.dpnt.many b0 } ;; { .mmi (p12) LDFD f37 = [X1], INCX (p12) LDFD f39 = [Y1], INCY tbit.nz p13, p0 = N, 2 } ;; { .mmi (p12) LDFD f40 = [X1], INCX (p12) LDFD f42 = [Y1], INCY tbit.nz p14, p0 = N, 1 } ;; { .mmi (p12) LDFD f41 = [X1], INCX (p12) LDFD f43 = [Y1], INCY tbit.nz p15, p0 = N, 0 } ;; { .mmf (p12) LDFD f44 = [X1], INCX (p12) LDFD f46 = [Y1], INCY (p12) FMA f6 = ALPHA, f32, f34 } ;; { .mmf (p12) LDFD f45 = [X1], INCX (p12) LDFD f47 = [Y1], INCY (p12) FMA f7 = ALPHA, f33, f35 } ;; { .mmf (p13) LDFD f48 = [X1], INCX (p13) LDFD f50 = [Y1], INCY (p12) FMA f10 = ALPHA, f36, f38 } ;; { .mmf (p13) LDFD f49 = [X1], INCX (p13) LDFD f51 = [Y1], INCY (p12) FMA f11 = ALPHA, f37, f39 } ;; { .mmf (p12) STFD [YY1] = f6 (p12) add YY1 = YY1, INCY (p12) FMA f12 = ALPHA, f40, f42 } { .mmi (p13) LDFD f52 = [X1], INCX (p13) LDFD f54 = [Y1], INCY nop __LINE__ } ;; { .mmf (p12) STFD [YY1] = f7 (p12) add YY1 = YY1, INCY (p12) FMA f13 = ALPHA, f41, f43 } { .mmi (p13) LDFD f53 = [X1], INCX (p13) LDFD f55 = [Y1], INCY nop __LINE__ } ;; { .mmf (p12) STFD [YY1] = f10 (p12) add YY1 = YY1, INCY (p12) FMA f14 = ALPHA, f44, f46 } { .mmi (p14) LDFD f56 = [X1], INCX (p14) LDFD f58 = [Y1], INCY nop __LINE__ } ;; { .mmf (p12) STFD [YY1] = f11 (p12) add YY1 = YY1, INCY (p12) FMA f15 = ALPHA, f45, f47 } { .mmi (p14) LDFD f57 = [X1], INCX (p14) LDFD f59 = [Y1], INCY nop __LINE__ } ;; { .mmf (p12) STFD [YY1] = f12 (p12) add YY1 = YY1, INCY (p13) FMA f6 = ALPHA, f48, f50 } { .mmi (p15) LDFD f60 = [X1], INCX (p15) LDFD f61 = [Y1], INCY nop __LINE__ } ;; { .mmf (p12) STFD [YY1] = f13 (p12) add YY1 = YY1, INCY (p13) FMA f7 = ALPHA, f49, f51 } ;; { .mmf (p12) STFD [YY1] = f14 (p12) add YY1 = YY1, INCY (p13) FMA f10 = ALPHA, f52, f54 } ;; { .mmf (p12) STFD [YY1] = f15 (p12) add YY1 = YY1, INCY (p13) FMA f11 = ALPHA, f53, f55 } ;; ;; { .mmf (p13) STFD [YY1] = f6 (p13) add YY1 = YY1, INCY (p14) FMA f12 = ALPHA, f56, f58 } ;; { .mmf (p13) STFD [YY1] = f7 (p13) add YY1 = YY1, INCY (p14) FMA f13 = ALPHA, f57, f59 } ;; { .mmf (p13) STFD [YY1] = f10 (p13) add YY1 = YY1, INCY (p15) FMA f14 = ALPHA, f60, f61 } ;; { .mmi (p13) STFD [YY1] = f11 (p13) add YY1 = YY1, INCY nop __LINE__ } ;; { .mmi (p14) STFD [YY1] = f12 (p14) add YY1 = YY1, INCY nop __LINE__ } ;; { .mmi (p14) STFD [YY1] = f13 (p14) add YY1 = YY1, INCY nop __LINE__ } ;; { .mmb (p15) STFD [YY1] = f14 nop __LINE__ br.ret.sptk.many b0 } ;; EPILOGUE 
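The file above (kernel/ia64/daxpy.S) is the hand-scheduled IA-64 implementation of the BLAS DAXPY operation, y := alpha*x + y. It peels one leading element to align the destination, then splits a unit-stride fast path (software-pipelined with rotating registers and br.ctop, 16 elements per iteration, with lfetch prefetching of X and Y) from a generic strided path at .L100 that uses scalar LDFD/STFD with INCX/INCY post-increment. As a reading aid, here is a minimal plain-C sketch of the operation the kernel computes; the function name and simplified signature are illustrative only, do not match the kernel's actual entry point or argument layout, and assume positive increments.

/* Illustrative reference only -- not part of the OpenBLAS sources. */
#include <stddef.h>

/* daxpy_ref: y[i*incy] += alpha * x[i*incx] for i = 0 .. n-1 */
void daxpy_ref(size_t n, double alpha,
               const double *x, size_t incx,
               double *y, size_t incy)
{
    for (size_t i = 0; i < n; i++)
        y[i * incy] += alpha * x[i * incx];
}

The assembly produces the same result; the extra complexity exists only to keep the Itanium FPU busy (modulo scheduling across loop iterations) and to hide memory latency with prefetches.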
OpenBLAS-0.2.20/kernel/ia64/ddot.S000066400000000000000000000562301313527062700162530ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCH_SIZE (16 * 16 + 2) #define N r32 #define X1 r33 #define INCX r34 #define Y1 r35 #define INCY r36 #define PREX r2 #define PREY r3 #define I r14 #define J r15 #define Y2 r16 #define X2 r17 #define INCX16 r18 #define INCY16 r19 #define INCX3 r20 #define INCY3 r21 #define YY r22 #define XA r23 #define YA r24 #define XX r25 #define PR r30 #define ARLC r31 PROLOGUE .prologue PROFCODE { .mfi nop.m 0 mov f8 = f0 .save ar.lc, ARLC mov ARLC = ar.lc } { .mfi mov r26 = 1 mov f9 = f0 shr XA = X1, 4 } ;; .body #ifdef F_INTERFACE LDINT N = [N] LDINT INCX = [INCX] LDINT INCY = [INCY] ;; #ifndef USE64BITINT sxt4 N = N sxt4 INCX = INCX sxt4 INCY = INCY ;; #endif cmp.le p0, p6 = r0, INCX cmp.le p0, p7 = r0, INCY sub r26 = r26, N ;; setf.sig f32 = r26 setf.sig f33 = INCX setf.sig f34 = INCY ;; xmpy.l f33 = f32, f33 xmpy.l f34 = f32, f34 ;; getf.sig r26 = f33 getf.sig r27 = f34 ;; (p6) shladd X1 = r26, BASE_SHIFT, X1 (p7) shladd Y1 = r27, BASE_SHIFT, Y1 ;; #endif { .mfi shladd INCX = INCX, BASE_SHIFT, r0 mov f32 = f0 mov PR = pr } { .mfb cmp.lt p0, p6 = r0, N mov f80 = f0 (p6) br.ret.sptk.many b0 } ;; { .mfi shladd INCY = INCY, BASE_SHIFT, r0 mov f10 = f0 tbit.nz p15, p0 = X1, BASE_SHIFT } { .mfb cmp.ne p6, p0 = SIZE, INCX mov f11 = f0 (p6) br.cond.dptk .L100 } ;; { .mfi (p15) LDFD f32 = [X1], INCX mov f12 = f0 mov pr.rot= 0 } { .mfi (p15) adds N = -1, N mov f13 = f0 shr YA = Y1, 4 } ;; { .mfi (p15) LDFD f80 = [Y1], INCY mov f14 = f0 shr I = N, 4 } { .mmi and J = 15, N and XA = 0xf, XA and YA = 0xf, YA } ;; { .mmi shladd INCX3 = INCX, 1, INCX shladd INCY3 = INCY, 1, INCY sub XA = YA, XA } { .mmi shladd INCX16 = INCX, 4, r0 shladd INCY16 = INCY, 4, r0 tbit.z p0, p12 = N, 3 } ;; { .mmi shladd Y2 = INCY, 1, Y1 cmp.eq p7, p0 = r0, J mov ar.ec= 3 } { .mmi adds I = -1, I cmp.ge p8, p0 = 2, XA cmp.eq p16, p0 = r0, r0 } ;; { .mbb cmp.le p9, p0 = 12, XA (p8) br.cond.dpnt .L20 (p9) br.cond.dpnt .L20 } ;; { .mmi adds PREX = PREFETCH_SIZE * SIZE, X1 adds PREY = (PREFETCH_SIZE + 3) * SIZE, Y1 mov ar.lc = I } { .mfb cmp.eq p6 ,p0 = -1, I FMA f15 = f32, f80, f0 (p6) br.cond.dpnt .L15 } ;; .align 32 /* INCX == 1 && X is aligned */ .L12: { .mmf (p16) LDFPD f32, f35 = [X1], 2 * SIZE (p16) lfetch.nt1 [PREX], INCX16 (p18) FMA f8 = f34, f82, f8 } { .mmf (p16) LDFD f80 = [Y1], INCY (p16) LDFD f86 = [Y2], INCY (p18) FMA f9 = f37, f85, f9 } ;; { .mmf (p16) LDFPD f38, f41 = [X1], 2 * SIZE (p16) lfetch.nt1 [PREY], INCY16 (p18) FMA f10 = f40, f88, f10 } { .mmf (p16) LDFD f83 = [Y1], INCY3 (p16) LDFD f89 = [Y2], INCY3 (p18) FMA f11 = f43, f91, f11 } ;; { .mmf (p16) LDFPD f44, f47 = [X1], 2 * SIZE (p18) FMA f12 = f46, f94, f12 } { .mmf (p16) LDFD f92 = [Y1], INCY (p16) LDFD f98 = [Y2], INCY (p18) FMA f13 = f49, f97, f13 } ;; { .mmf (p16) LDFPD f50, f53 = [X1], 2 * SIZE (p18) FMA f14 = f52, f100, f14 } { .mmf (p16) LDFD f95 = [Y1], INCY3 (p16) LDFD f101 = [Y2], INCY3 (p18) FMA f15 = f55, f103, f15 } ;; { .mmf (p16) LDFPD f56, f59 = [X1], 2 * SIZE (p18) FMA f8 = f58, f106, f8 } { .mmf (p16) LDFD f104 = [Y1], INCY (p16) LDFD f110 = [Y2], INCY (p18) FMA f9 = f61, f109, f9 } ;; { .mmf (p16) LDFPD f62, f65 = [X1], 2 * SIZE (p18) FMA f10 = f64, f112, f10 } { .mmf (p16) LDFD f107 = [Y1], INCY3 (p16) LDFD f113 = [Y2], INCY3 (p18) FMA f11 = f67, f115, f11 } ;; { .mmf (p16) LDFPD f68, f71 = [X1], 2 * SIZE (p18) FMA f12 = f70, f118, f12 } { .mmf (p16) LDFD f116 = [Y1], INCY (p16) LDFD f122 = [Y2], INCY (p18) FMA f13 = f73, 
f121, f13 } ;; { .mmf (p16) LDFPD f74, f77 = [X1], 2 * SIZE (p16) LDFD f119 = [Y1], INCY3 (p18) FMA f14 = f76, f124, f14 } { .mfb (p16) LDFD f125 = [Y2], INCY3 (p18) FMA f15 = f79, f127, f15 br.ctop.sptk.few .L12 } ;; .align 32 .L15: { .mmi (p12) LDFPD f32, f33 = [X1], 2 * SIZE mov YY = Y1 tbit.z p0, p13 = N, 2 } { .mmb (p12) LDFD f34 = [Y1], INCY (p12) LDFD f38 = [Y2], INCY (p7) br.cond.dptk .L999 } ;; { .mmi (p12) LDFPD f36, f37 = [X1], 2 * SIZE (p12) shladd YY = INCY, 3, YY tbit.z p0, p14 = N, 1 } { .mmi (p12) LDFD f35 = [Y1], INCY3 (p12) LDFD f39 = [Y2], INCY3 tbit.z p0, p15 = N, 0 } ;; { .mmi (p12) LDFPD f40, f41 = [X1], 2 * SIZE (p13) shladd YY = INCY, 2, YY } { .mmi (p12) LDFD f42 = [Y1], INCY (p12) LDFD f46 = [Y2], INCY } ;; (p12) LDFPD f44, f45 = [X1], 2 * SIZE (p12) LDFD f43 = [Y1], INCY3 (p12) LDFD f47 = [Y2], INCY3 (p14) shladd YY = INCY, 1, YY ;; (p13) LDFPD f48, f49 = [X1], 2 * SIZE (p13) LDFD f50 = [Y1], INCY (p13) LDFD f54 = [Y2], INCY ;; (p13) LDFPD f52, f53 = [X1], 2 * SIZE (p13) LDFD f51 = [Y1], INCY3 (p13) LDFD f55 = [Y2], INCY3 ;; (p14) LDFPD f56, f57 = [X1], 2 * SIZE (p14) LDFD f58 = [Y1], INCY (p15) LDFD f61 = [YY] ;; (p14) LDFD f59 = [Y1] (p15) LDFD f60 = [X1] ;; (p12) FMA f8 = f32, f34, f8 (p12) FMA f9 = f33, f35, f9 (p12) FMA f10 = f36, f38, f10 (p12) FMA f11 = f37, f39, f11 (p12) FMA f12 = f40, f42, f12 (p12) FMA f13 = f41, f43, f13 (p12) FMA f14 = f44, f46, f14 (p12) FMA f15 = f45, f47, f15 ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f49, f51, f9 (p13) FMA f10 = f52, f54, f10 (p13) FMA f11 = f53, f55, f11 (p14) FMA f12 = f56, f58, f12 (p14) FMA f13 = f57, f59, f13 (p15) FMA f14 = f60, f61, f14 br .L999 ;; .align 32 .L20: { .mmi adds PREX = PREFETCH_SIZE * SIZE, X1 adds PREY = (PREFETCH_SIZE + 18) * SIZE, Y1 mov ar.lc = I } { .mfb cmp.eq p6 ,p0 = -1, I FMA f15 = f32, f80, f0 (p6) br.cond.dpnt .L25 } ;; .align 32 .L22: { .mmf (p16) LDFPD f32, f35 = [X1], 2 * SIZE (p16) lfetch.nt1 [PREX], INCX16 (p18) FMA f8 = f34, f82, f8 } { .mmf (p17) LDFD f105 = [Y1], INCY (p17) LDFD f111 = [Y2], INCY (p18) FMA f9 = f37, f85, f9 } ;; { .mmf (p16) LDFPD f38, f41 = [X1], 2 * SIZE (p16) lfetch.nt1 [PREY], INCY16 (p18) FMA f10 = f40, f88, f10 } { .mmf (p17) LDFD f108 = [Y1], INCY3 (p17) LDFD f114 = [Y2], INCY3 (p18) FMA f11 = f43, f91, f11 } ;; { .mmf (p16) LDFPD f44, f47 = [X1], 2 * SIZE (p18) FMA f12 = f46, f94, f12 } { .mmf (p17) LDFD f117 = [Y1], INCY (p17) LDFD f123 = [Y2], INCY (p18) FMA f13 = f49, f97, f13 } ;; { .mmf (p16) LDFPD f50, f53 = [X1], 2 * SIZE (p18) FMA f14 = f52, f100, f14 } { .mmf (p17) LDFD f120 = [Y1], INCY3 (p17) LDFD f126 = [Y2], INCY3 (p18) FMA f15 = f55, f103, f15 } ;; { .mmf (p16) LDFPD f56, f59 = [X1], 2 * SIZE (p18) FMA f8 = f58, f106, f8 } { .mmf (p16) LDFD f80 = [Y1], INCY (p16) LDFD f86 = [Y2], INCY (p18) FMA f9 = f61, f109, f9 } ;; { .mmf (p16) LDFPD f62, f65 = [X1], 2 * SIZE (p18) FMA f10 = f64, f112, f10 } { .mmf (p16) LDFD f83 = [Y1], INCY3 (p16) LDFD f89 = [Y2], INCY3 (p18) FMA f11 = f67, f115, f11 } ;; { .mmf (p16) LDFPD f68, f71 = [X1], 2 * SIZE (p18) FMA f12 = f70, f118, f12 } { .mmf (p16) LDFD f92 = [Y1], INCY (p16) LDFD f98 = [Y2], INCY (p18) FMA f13 = f73, f121, f13 } ;; { .mmf (p16) LDFPD f74, f77 = [X1], 2 * SIZE (p16) LDFD f95 = [Y1], INCY3 (p18) FMA f14 = f76, f124, f14 } { .mfb (p16) LDFD f101 = [Y2], INCY3 (p18) FMA f15 = f79, f127, f15 br.ctop.sptk.few .L22 } ;; .align 32 .L25: { .mmi (p12) LDFPD f32, f33 = [X1], 2 * SIZE mov YY = Y1 tbit.z p0, p13 = N, 2 } { .mmb (p12) LDFD f34 = [Y1], INCY (p12) LDFD f38 = [Y2], INCY (p7) 
br.cond.dptk .L999 } ;; { .mmi (p12) LDFPD f36, f37 = [X1], 2 * SIZE (p12) shladd YY = INCY, 3, YY tbit.z p0, p14 = N, 1 } { .mmi (p12) LDFD f35 = [Y1], INCY3 (p12) LDFD f39 = [Y2], INCY3 tbit.z p0, p15 = N, 0 } ;; { .mmi (p12) LDFPD f40, f41 = [X1], 2 * SIZE (p13) shladd YY = INCY, 2, YY } { .mmi (p12) LDFD f42 = [Y1], INCY (p12) LDFD f46 = [Y2], INCY } ;; (p12) LDFPD f44, f45 = [X1], 2 * SIZE (p12) LDFD f43 = [Y1], INCY3 (p12) LDFD f47 = [Y2], INCY3 (p14) shladd YY = INCY, 1, YY ;; (p13) LDFPD f48, f49 = [X1], 2 * SIZE (p13) LDFD f50 = [Y1], INCY (p13) LDFD f54 = [Y2], INCY ;; (p13) LDFPD f52, f53 = [X1], 2 * SIZE (p13) LDFD f51 = [Y1], INCY3 (p13) LDFD f55 = [Y2], INCY3 ;; (p14) LDFPD f56, f57 = [X1], 2 * SIZE (p14) LDFD f58 = [Y1], INCY (p15) LDFD f61 = [YY] ;; (p14) LDFD f59 = [Y1] (p15) LDFD f60 = [X1] ;; (p12) FMA f8 = f32, f34, f8 (p12) FMA f9 = f33, f35, f9 (p12) FMA f10 = f36, f38, f10 (p12) FMA f11 = f37, f39, f11 (p12) FMA f12 = f40, f42, f12 (p12) FMA f13 = f41, f43, f13 (p12) FMA f14 = f44, f46, f14 (p12) FMA f15 = f45, f47, f15 ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f49, f51, f9 (p13) FMA f10 = f52, f54, f10 (p13) FMA f11 = f53, f55, f11 (p14) FMA f12 = f56, f58, f12 (p14) FMA f13 = f57, f59, f13 (p15) FMA f14 = f60, f61, f14 br .L999 ;; .align 32 .L100: { .mmi shladd X2 = INCX, 1, X1 } { .mib cmp.ne p6, p0 = SIZE, INCY tbit.nz p15, p0 = Y1, BASE_SHIFT (p6) br.cond.dptk .L200 } ;; { .mfi (p15) LDFD f32 = [X1], INCX mov f12 = f0 mov pr.rot= 0 } { .mfi (p15) adds N = -1, N mov f13 = f0 shr YA = Y1, 4 } ;; { .mfi (p15) LDFD f80 = [Y1], INCY mov f14 = f0 shr I = N, 4 } { .mmi and J = 15, N and XA = 0xf, XA and YA = 0xf, YA } ;; { .mmi shladd INCX3 = INCX, 1, INCX shladd INCY3 = INCY, 1, INCY sub XA = YA, XA } { .mmi shladd INCX16 = INCX, 4, r0 shladd INCY16 = INCY, 4, r0 tbit.z p0, p12 = N, 3 } ;; { .mmi shladd X2 = INCX, 1, X1 cmp.eq p7, p0 = r0, J mov ar.ec= 3 } { .mmi adds I = -1, I cmp.ge p8, p0 = 4, XA cmp.eq p16, p0 = r0, r0 } ;; { .mbb cmp.le p9, p0 = 14, XA (p8) br.cond.dpnt .L120 (p9) br.cond.dpnt .L120 } ;; { .mmi adds PREX = (PREFETCH_SIZE + 5) * SIZE, X1 adds PREY = (PREFETCH_SIZE + 3) * SIZE, Y1 mov ar.lc = I } { .mfb cmp.eq p6 ,p0 = -1, I FMA f15 = f32, f80, f0 (p6) br.cond.dpnt .L115 } ;; .align 32 /* INCY == 1 */ .L112: { .mmf (p16) LDFPD f32, f35 = [Y1], 2 * SIZE (p16) lfetch.nt1 [PREX], INCX16 (p18) FMA f8 = f34, f82, f8 } { .mmf (p16) LDFD f80 = [X1], INCX (p16) LDFD f86 = [X2], INCX (p18) FMA f9 = f37, f85, f9 } ;; { .mmf (p16) LDFPD f38, f41 = [Y1], 2 * SIZE (p16) lfetch.nt1 [PREY], INCY16 (p18) FMA f10 = f40, f88, f10 } { .mmf (p16) LDFD f83 = [X1], INCX3 (p16) LDFD f89 = [X2], INCX3 (p18) FMA f11 = f43, f91, f11 } ;; { .mmf (p16) LDFPD f44, f47 = [Y1], 2 * SIZE (p18) FMA f12 = f46, f94, f12 } { .mmf (p16) LDFD f92 = [X1], INCX (p16) LDFD f98 = [X2], INCX (p18) FMA f13 = f49, f97, f13 } ;; { .mmf (p16) LDFPD f50, f53 = [Y1], 2 * SIZE (p18) FMA f14 = f52, f100, f14 } { .mmf (p16) LDFD f95 = [X1], INCX3 (p16) LDFD f101 = [X2], INCX3 (p18) FMA f15 = f55, f103, f15 } ;; { .mmf (p16) LDFPD f56, f59 = [Y1], 2 * SIZE (p18) FMA f8 = f58, f106, f8 } { .mmf (p16) LDFD f104 = [X1], INCX (p16) LDFD f110 = [X2], INCX (p18) FMA f9 = f61, f109, f9 } ;; { .mmf (p16) LDFPD f62, f65 = [Y1], 2 * SIZE (p18) FMA f10 = f64, f112, f10 } { .mmf (p16) LDFD f107 = [X1], INCX3 (p16) LDFD f113 = [X2], INCX3 (p18) FMA f11 = f67, f115, f11 } ;; { .mmf (p16) LDFPD f68, f71 = [Y1], 2 * SIZE (p18) FMA f12 = f70, f118, f12 } { .mmf (p16) LDFD f116 = [X1], INCX (p16) LDFD f122 = [X2], 
INCX (p18) FMA f13 = f73, f121, f13 } ;; { .mmf (p16) LDFPD f74, f77 = [Y1], 2 * SIZE (p16) LDFD f119 = [X1], INCX3 (p18) FMA f14 = f76, f124, f14 } { .mfb (p16) LDFD f125 = [X2], INCX3 (p18) FMA f15 = f79, f127, f15 br.ctop.sptk.few .L112 } ;; .align 32 .L115: { .mmi (p12) LDFPD f32, f33 = [Y1], 2 * SIZE mov XX = X1 tbit.z p0, p13 = N, 2 } { .mmb (p12) LDFD f34 = [X1], INCX (p12) LDFD f38 = [X2], INCX (p7) br.cond.dptk .L999 } ;; { .mmi (p12) LDFPD f36, f37 = [Y1], 2 * SIZE (p12) shladd XX = INCX, 3, XX tbit.z p0, p14 = N, 1 } { .mmi (p12) LDFD f35 = [X1], INCX3 (p12) LDFD f39 = [X2], INCX3 tbit.z p0, p15 = N, 0 } ;; { .mmi (p12) LDFPD f40, f41 = [Y1], 2 * SIZE (p13) shladd XX = INCX, 2, XX } { .mmi (p12) LDFD f42 = [X1], INCX (p12) LDFD f46 = [X2], INCX } ;; (p12) LDFPD f44, f45 = [Y1], 2 * SIZE (p12) LDFD f43 = [X1], INCX3 (p12) LDFD f47 = [X2], INCX3 (p14) shladd XX = INCX, 1, XX ;; (p13) LDFPD f48, f49 = [Y1], 2 * SIZE (p13) LDFD f50 = [X1], INCX (p13) LDFD f54 = [X2], INCX ;; (p13) LDFPD f52, f53 = [Y1], 2 * SIZE (p13) LDFD f51 = [X1], INCX3 (p13) LDFD f55 = [X2], INCX3 ;; (p14) LDFPD f56, f57 = [Y1], 2 * SIZE (p14) LDFD f58 = [X1], INCX (p15) LDFD f61 = [XX] ;; (p14) LDFD f59 = [X1] (p15) LDFD f60 = [Y1] ;; (p12) FMA f8 = f32, f34, f8 (p12) FMA f9 = f33, f35, f9 (p12) FMA f10 = f36, f38, f10 (p12) FMA f11 = f37, f39, f11 (p12) FMA f12 = f40, f42, f12 (p12) FMA f13 = f41, f43, f13 (p12) FMA f14 = f44, f46, f14 (p12) FMA f15 = f45, f47, f15 ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f49, f51, f9 (p13) FMA f10 = f52, f54, f10 (p13) FMA f11 = f53, f55, f11 (p14) FMA f12 = f56, f58, f12 (p14) FMA f13 = f57, f59, f13 (p15) FMA f14 = f60, f61, f14 br .L999 ;; .align 32 .L120: { .mmi adds PREX = (PREFETCH_SIZE + 17) * SIZE, X1 adds PREY = (PREFETCH_SIZE + 19) * SIZE, X1 mov ar.lc = I } { .mfb cmp.eq p6 ,p0 = -1, I FMA f15 = f32, f80, f0 (p6) br.cond.dpnt .L125 } ;; .align 32 .L122: { .mmf (p16) LDFPD f32, f35 = [Y1], 2 * SIZE (p16) lfetch.nt1 [PREX], INCX16 (p18) FMA f8 = f34, f82, f8 } { .mmf (p17) LDFD f105 = [X1], INCX (p17) LDFD f111 = [X2], INCX (p18) FMA f9 = f37, f85, f9 } ;; { .mmf (p16) LDFPD f38, f41 = [Y1], 2 * SIZE (p16) lfetch.nt1 [PREY], INCY16 (p18) FMA f10 = f40, f88, f10 } { .mmf (p17) LDFD f108 = [X1], INCX3 (p17) LDFD f114 = [X2], INCX3 (p18) FMA f11 = f43, f91, f11 } ;; { .mmf (p16) LDFPD f44, f47 = [Y1], 2 * SIZE (p18) FMA f12 = f46, f94, f12 } { .mmf (p17) LDFD f117 = [X1], INCX (p17) LDFD f123 = [X2], INCX (p18) FMA f13 = f49, f97, f13 } ;; { .mmf (p16) LDFPD f50, f53 = [Y1], 2 * SIZE (p18) FMA f14 = f52, f100, f14 } { .mmf (p17) LDFD f120 = [X1], INCX3 (p17) LDFD f126 = [X2], INCX3 (p18) FMA f15 = f55, f103, f15 } ;; { .mmf (p16) LDFPD f56, f59 = [Y1], 2 * SIZE (p18) FMA f8 = f58, f106, f8 } { .mmf (p16) LDFD f80 = [X1], INCX (p16) LDFD f86 = [X2], INCX (p18) FMA f9 = f61, f109, f9 } ;; { .mmf (p16) LDFPD f62, f65 = [Y1], 2 * SIZE (p18) FMA f10 = f64, f112, f10 } { .mmf (p16) LDFD f83 = [X1], INCX3 (p16) LDFD f89 = [X2], INCX3 (p18) FMA f11 = f67, f115, f11 } ;; { .mmf (p16) LDFPD f68, f71 = [Y1], 2 * SIZE (p18) FMA f12 = f70, f118, f12 } { .mmf (p16) LDFD f92 = [X1], INCX (p16) LDFD f98 = [X2], INCX (p18) FMA f13 = f73, f121, f13 } ;; { .mmf (p16) LDFPD f74, f77 = [Y1], 2 * SIZE (p16) LDFD f95 = [X1], INCX3 (p18) FMA f14 = f76, f124, f14 } { .mfb (p16) LDFD f101 = [X2], INCX3 (p18) FMA f15 = f79, f127, f15 br.ctop.sptk.few .L122 } ;; .align 32 .L125: { .mmi (p12) LDFPD f32, f33 = [Y1], 2 * SIZE mov XX = X1 tbit.z p0, p13 = N, 2 } { .mmb (p12) LDFD f34 = [X1], 
INCX (p12) LDFD f38 = [X2], INCX (p7) br.cond.dptk .L999 } ;; { .mmi (p12) LDFPD f36, f37 = [Y1], 2 * SIZE (p12) shladd XX = INCX, 3, XX tbit.z p0, p14 = N, 1 } { .mmi (p12) LDFD f35 = [X1], INCX3 (p12) LDFD f39 = [X2], INCX3 tbit.z p0, p15 = N, 0 } ;; { .mmi (p12) LDFPD f40, f41 = [Y1], 2 * SIZE (p13) shladd XX = INCX, 2, XX } { .mmi (p12) LDFD f42 = [X1], INCX (p12) LDFD f46 = [X2], INCX } ;; (p12) LDFPD f44, f45 = [Y1], 2 * SIZE (p12) LDFD f43 = [X1], INCX3 (p12) LDFD f47 = [X2], INCX3 (p14) shladd XX = INCX, 1, XX ;; (p13) LDFPD f48, f49 = [Y1], 2 * SIZE (p13) LDFD f50 = [X1], INCX (p13) LDFD f54 = [X2], INCX ;; (p13) LDFPD f52, f53 = [Y1], 2 * SIZE (p13) LDFD f51 = [X1], INCX3 (p13) LDFD f55 = [X2], INCX3 ;; (p14) LDFPD f56, f57 = [Y1], 2 * SIZE (p14) LDFD f58 = [X1], INCX (p15) LDFD f61 = [XX] ;; (p14) LDFD f59 = [X1] (p15) LDFD f60 = [Y1] ;; (p12) FMA f8 = f32, f34, f8 (p12) FMA f9 = f33, f35, f9 (p12) FMA f10 = f36, f38, f10 (p12) FMA f11 = f37, f39, f11 (p12) FMA f12 = f40, f42, f12 (p12) FMA f13 = f41, f43, f13 (p12) FMA f14 = f44, f46, f14 (p12) FMA f15 = f45, f47, f15 ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f49, f51, f9 (p13) FMA f10 = f52, f54, f10 (p13) FMA f11 = f53, f55, f11 (p14) FMA f12 = f56, f58, f12 (p14) FMA f13 = f57, f59, f13 (p15) FMA f14 = f60, f61, f14 br .L999 ;; .align 32 .L200: { .mfi shladd INCX3 = INCX, 1, INCX mov f12 = f0 mov pr.rot= 0 } { .mfi and J = 15, N mov f13 = f0 shr I = N, 4 } ;; { .mmf cmp.eq p16, p0 = r0, r0 shladd INCY3 = INCY, 1, INCY mov f14 = f0 } { .mmi shladd INCX16 = INCX, 4, r0 shladd INCY16 = INCY, 4, r0 tbit.z p0, p12 = N, 3 } ;; { .mmi cmp.eq p7, p0 = r0, J adds I = -1, I mov ar.ec= 3 } { .mmi shladd Y2 = INCY, 1, Y1 mov XX = X1 mov YY = Y1 } ;; { .mmi adds PREX = (PREFETCH_SIZE + 5) * SIZE, X1 adds PREY = (PREFETCH_SIZE + 3) * SIZE, Y1 mov ar.lc = I } { .mfb cmp.eq p6 ,p0 = -1, I mov f15 = f0 (p6) br.cond.dpnt .L215 } ;; .align 32 /* INCY == 1 */ .L212: { .mmf (p16) lfetch.nt1 [PREX], INCX16 (p16) lfetch.nt1 [PREY], INCY16 (p18) FMA f8 = f34, f82, f8 } { .mmf (p16) LDFD f32 = [Y1], INCY (p16) LDFD f38 = [Y2], INCY (p18) FMA f9 = f37, f85, f9 } ;; { .mmf (p16) LDFD f80 = [X1], INCX (p16) LDFD f86 = [X2], INCX (p18) FMA f10 = f40, f88, f10 } { .mmf (p16) LDFD f35 = [Y1], INCY3 (p16) LDFD f41 = [Y2], INCY3 (p18) FMA f11 = f43, f91, f11 } ;; { .mmf (p16) LDFD f83 = [X1], INCX3 (p16) LDFD f89 = [X2], INCX3 (p18) FMA f12 = f46, f94, f12 } { .mmf (p16) LDFD f44 = [Y1], INCY (p16) LDFD f50 = [Y2], INCY (p18) FMA f13 = f49, f97, f13 } ;; { .mmf (p16) LDFD f92 = [X1], INCX (p16) LDFD f98 = [X2], INCX (p18) FMA f14 = f52, f100, f14 } { .mmf (p16) LDFD f47 = [Y1], INCY3 (p16) LDFD f53 = [Y2], INCY3 (p18) FMA f15 = f55, f103, f15 } ;; { .mmf (p16) LDFD f95 = [X1], INCX3 (p16) LDFD f101 = [X2], INCX3 (p18) FMA f8 = f58, f106, f8 } { .mmf (p16) LDFD f56 = [Y1], INCY (p16) LDFD f62 = [Y2], INCY (p18) FMA f9 = f61, f109, f9 } ;; { .mmf (p16) LDFD f104 = [X1], INCX (p16) LDFD f110 = [X2], INCX (p18) FMA f10 = f64, f112, f10 } { .mmf (p16) LDFD f59 = [Y1], INCY3 (p16) LDFD f65 = [Y2], INCY3 (p18) FMA f11 = f67, f115, f11 } ;; { .mmf (p16) LDFD f107 = [X1], INCX3 (p16) LDFD f113 = [X2], INCX3 (p18) FMA f12 = f70, f118, f12 } { .mmf (p16) LDFD f68 = [Y1], INCY (p16) LDFD f74 = [Y2], INCY (p18) FMA f13 = f73, f121, f13 } ;; { .mmf (p16) LDFD f116 = [X1], INCX (p16) LDFD f122 = [X2], INCX (p18) FMA f14 = f76, f124, f14 } { .mmf (p16) LDFD f71 = [Y1], INCY3 (p16) LDFD f77 = [Y2], INCY3 (p18) FMA f15 = f79, f127, f15 } ;; { .mmi (p16) LDFD f119 = [X1], 
INCX3 (p16) LDFD f125 = [X2], INCX3 } { .mmb (p16) add XX = INCX16, XX (p16) add YY = INCY16, YY br.ctop.sptk.few .L212 } ;; .align 32 .L215: { .mmi (p12) LDFD f34 = [X1], INCX (p12) LDFD f38 = [X2], INCX tbit.z p0, p13 = N, 2 } { .mmb (p12) LDFD f32 = [Y1], INCY (p12) LDFD f36 = [Y2], INCY (p7) br.cond.dptk .L999 } ;; { .mmi (p12) LDFD f35 = [X1], INCX3 (p12) LDFD f39 = [X2], INCX3 tbit.z p0, p14 = N, 1 } { .mmi (p12) LDFD f33 = [Y1], INCY3 (p12) LDFD f37 = [Y2], INCY3 tbit.z p0, p15 = N, 0 } ;; { .mmi (p12) LDFD f42 = [X1], INCX (p12) LDFD f46 = [X2], INCX (p12) shladd XX = INCX, 3, XX } { .mmi (p12) LDFD f40 = [Y1], INCY (p12) LDFD f44 = [Y2], INCY (p12) shladd YY = INCY, 3, YY } ;; { .mmi (p12) LDFD f43 = [X1], INCX3 (p12) LDFD f47 = [X2], INCX3 (p13) shladd XX = INCX, 2, XX } { .mmi (p12) LDFD f41 = [Y1], INCY3 (p12) LDFD f45 = [Y2], INCY3 (p13) shladd YY = INCY, 2, YY } ;; (p13) LDFD f50 = [X1], INCX (p13) LDFD f54 = [X2], INCX (p14) shladd XX = INCX, 1, XX (p13) LDFD f48 = [Y1], INCY (p13) LDFD f52 = [Y2], INCY (p14) shladd YY = INCY, 1, YY ;; (p13) LDFD f51 = [X1], INCX3 (p13) LDFD f55 = [X2] (p13) LDFD f49 = [Y1], INCY3 (p13) LDFD f53 = [Y2] ;; (p14) LDFD f58 = [X1], INCX (p15) LDFD f61 = [XX] (p14) LDFD f56 = [Y1], INCY (p15) LDFD f60 = [YY] ;; (p14) LDFD f59 = [X1] (p14) LDFD f57 = [Y1] ;; ;; ;; (p12) FMA f8 = f32, f34, f8 (p12) FMA f9 = f33, f35, f9 (p12) FMA f10 = f36, f38, f10 (p12) FMA f11 = f37, f39, f11 (p12) FMA f12 = f40, f42, f12 (p12) FMA f13 = f41, f43, f13 (p12) FMA f14 = f44, f46, f14 (p12) FMA f15 = f45, f47, f15 ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f49, f51, f9 (p13) FMA f10 = f52, f54, f10 (p13) FMA f11 = f53, f55, f11 (p14) FMA f12 = f56, f58, f12 (p14) FMA f13 = f57, f59, f13 (p15) FMA f14 = f60, f61, f14 ;; .align 32 .L999: FADD f8 = f8, f9 FADD f10 = f10, f11 FADD f12 = f12, f13 FADD f14 = f14, f15 ;; FADD f8 = f8, f10 FADD f12 = f12, f14 mov ar.lc = ARLC ;; FADD f8 = f8, f12 mov pr = PR, -65474 br.ret.sptk.many b0 EPILOGUE OpenBLAS-0.2.20/kernel/ia64/gemm_beta.S000066400000000000000000000233201313527062700172330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCHSIZE 140 #define CO1 r14 #define CO2 r15 #define CO3 r16 #define DO1 r17 #define DO2 r18 #define DO3 r19 #define I r22 #define I_AND_15 r23 #define PRE1 r24 #define PR r30 #define ARLC r31 #define M r32 #define N r33 #define C r34 #define LDC r35 #define J r36 #define BETA f8 PROLOGUE .prologue PROFCODE { .mmi #ifndef XDOUBLE adds CO1 = 16, r12 adds CO2 = 24, r12 #else adds CO1 = 32, r12 adds CO2 = 40, r12 #endif .save ar.lc, ARLC mov ARLC = ar.lc } { .mfb cmp.ge p6, p0 = 0, N fcmp.eq p0, p15 = BETA, f0 (p6) br.ret.sptk.many b0 } ;; .body { .mmi ld8 C = [CO1], 8 ld8 LDC = [CO2] mov PR = pr } { .mmi mov J = N shr I = M, 4 } ;; { .mmb shladd LDC = LDC, BASE_SHIFT, r0 adds I = -1, I (p15) br.cond.dpnt .L100 // if (beta != 0) goto L100 } ;; .align 32 .L60: { .mmi mov CO1 = C mov CO3 = C add CO2 = 4 * SIZE, C } { .mmi adds PRE1 = PREFETCHSIZE * SIZE, C add C = C, LDC tbit.nz p12, p0 = M, 3 } ;; { .mmi and I_AND_15 = 15, M mov ar.lc = I } { .mib cmp.gt p8, p0 = 0, I (p8) br.cond.dpnt .L80 } ;; .align 32 .L70: { .mmi STFD [CO1] = f0, 1 * SIZE STFD [CO2] = f0, 1 * SIZE } { .mmi lfetch.excl.nt1 [PRE1] nop.m 0 adds PRE1 = 16 * SIZE, PRE1 } ;; { .mmi STFD [CO1] = f0, 1 * SIZE STFD [CO2] = f0, 1 * SIZE adds CO3 = 16 * SIZE, CO3 } ;; { .mmi STFD [CO1] = f0, 1 * SIZE STFD [CO2] = f0, 1 * SIZE } ;; { .mmi STFD [CO1] = f0, 5 * SIZE STFD [CO2] = f0, 5 * SIZE } ;; { .mmi STFD [CO1] = f0, 1 * SIZE STFD [CO2] = f0, 1 * SIZE } ;; { .mmi STFD [CO1] = f0, 1 * SIZE STFD [CO2] = f0, 1 * SIZE } ;; { .mmi STFD [CO1] = f0, 1 * SIZE STFD [CO2] = f0, 1 * SIZE } ;; { .mmb STFD [CO1] = f0, 5 * SIZE STFD [CO2] = f0, 5 * SIZE br.cloop.sptk.few .L70 } ;; .align 32 .L80: { .mmi (p12) STFD [CO1] = f0, 1 * SIZE (p12) STFD [CO2] = f0, 1 * SIZE tbit.nz p13, p0 = M, 2 } { .mmb cmp.eq p9, p0 = 0, I_AND_15 adds J = -1, J (p9) br.cond.dptk .L99 } ;; { .mmi (p12) STFD [CO1] = f0, 1 * SIZE (p12) STFD [CO2] = f0, 1 * SIZE tbit.nz p14, p0 = M, 1 } ;; { .mmi (p12) STFD [CO1] = f0, 1 * SIZE (p12) STFD [CO2] = f0, 1 * SIZE (p12) adds CO3 = 8 * SIZE, CO3 } ;; { .mmi (p12) STFD [CO1] = f0, 5 * SIZE (p12) STFD [CO2] = f0 (p13) adds CO3 = 4 * SIZE, CO3 } ;; { .mmi (p13) STFD [CO1] = f0, 1 * SIZE (p14) STFD [CO3] = f0, 1 * SIZE } ;; { .mmi (p13) STFD [CO1] = f0, 1 * SIZE (p14) STFD [CO3] = f0, 1 * SIZE tbit.nz p15, p0 = M, 0 } ;; { .mmi (p13) STFD [CO1] = f0, 1 * SIZE (p15) STFD [CO3] = f0 } ;; { .mmi (p13) STFD [CO1] = f0 } ;; .align 32 .L99: { .mib cmp.lt p6, p0 = 0, J mov ar.lc = ARLC } { .mbb (p6) br.cond.dptk .L60 br.ret.sptk.many b0 } ;; .align 32 .L100: { .mmi mov CO1 = C mov CO3 = C mov 
pr.rot = 0 } { .mmi adds PRE1 = PREFETCHSIZE * SIZE, C add CO2 = 4 * SIZE, C mov DO1 = C } ;; { .mmi mov ar.ec = 6 } { .mmi adds DO2 = 4 * SIZE, C mov DO3 = C add C = C, LDC } ;; { .mmi and I_AND_15 = 15, M cmp.eq p16, p0 = r0, r0 mov ar.lc = I } { .mib cmp.gt p8, p0 = 0, I tbit.nz p12, p0 = M, 3 (p8) br.cond.dpnt .L180 } ;; .align 32 .L170: { .mmf (p21) STFD [DO1] = f6, 1 * SIZE (p21) STFD [DO2] = f7, 1 * SIZE (p21) FMPY f6 = BETA, f85 } { .mmf (p16) lfetch.excl.nt1 [PRE1] (p16) adds CO3 = 16 * SIZE, CO3 (p21) FMPY f7 = BETA, f91 } ;; { .mmf (p21) STFD [DO1] = f10, 1 * SIZE (p21) STFD [DO2] = f11, 1 * SIZE (p21) FMPY f10 = BETA, f97 } { .mmf (p16) LDFD f32 = [CO1], 1 * SIZE (p16) LDFD f38 = [CO2], 1 * SIZE (p21) FMPY f11 = BETA, f103 } ;; { .mmf (p21) STFD [DO1] = f12, 1 * SIZE (p21) STFD [DO2] = f13, 1 * SIZE (p21) FMPY f12 = BETA, f109 } { .mmf (p16) LDFD f44 = [CO1], 1 * SIZE (p16) LDFD f50 = [CO2], 1 * SIZE (p21) FMPY f13 = BETA, f115 } ;; { .mmf (p21) STFD [DO1] = f14, 5 * SIZE (p21) STFD [DO2] = f15, 5 * SIZE (p21) FMPY f14 = BETA, f121 } { .mmf (p16) LDFD f56 = [CO1], 1 * SIZE (p16) LDFD f62 = [CO2], 1 * SIZE (p21) FMPY f15 = BETA, f127 } ;; { .mmf (p21) STFD [DO1] = f6, 1 * SIZE (p21) STFD [DO2] = f7, 1 * SIZE (p20) FMPY f6 = BETA, f36 } { .mmf (p16) LDFD f68 = [CO1], 5 * SIZE (p16) LDFD f74 = [CO2], 5 * SIZE (p20) FMPY f7 = BETA, f42 } ;; { .mmf (p21) STFD [DO1] = f10, 1 * SIZE (p21) STFD [DO2] = f11, 1 * SIZE (p20) FMPY f10 = BETA, f48 } { .mmf (p16) LDFD f80 = [CO1], 1 * SIZE (p16) LDFD f86 = [CO2], 1 * SIZE (p20) FMPY f11 = BETA, f54 } ;; { .mmf (p21) STFD [DO1] = f12, 1 * SIZE (p21) STFD [DO2] = f13, 1 * SIZE (p20) FMPY f12 = BETA, f60 } { .mmf (p16) LDFD f92 = [CO1], 1 * SIZE (p16) LDFD f98 = [CO2], 1 * SIZE (p20) FMPY f13 = BETA, f66 } ;; { .mmf (p21) STFD [DO1] = f14, 5 * SIZE (p21) STFD [DO2] = f15, 5 * SIZE (p20) FMPY f14 = BETA, f72 } { .mmf (p16) LDFD f104 = [CO1], 1 * SIZE (p16) LDFD f110 = [CO2], 1 * SIZE (p20) FMPY f15 = BETA, f78 } ;; { .mmi (p16) LDFD f116 = [CO1], 5 * SIZE (p16) LDFD f122 = [CO2], 5 * SIZE adds PRE1 = 16 * SIZE, PRE1 } { .mmb (p16) adds DO3 = 16 * SIZE, DO3 nop.m 0 br.ctop.sptk.few .L170 } ;; .align 32 .L180: { .mmi (p12) LDFD f32 = [CO1], 1 * SIZE (p12) LDFD f36 = [CO2], 1 * SIZE tbit.nz p13, p0 = M, 2 } { .mmb cmp.eq p9, p0 = 0, I_AND_15 adds J = -1, J (p9) br.cond.dptk .L199 } ;; { .mmi (p12) LDFD f33 = [CO1], 1 * SIZE (p12) LDFD f37 = [CO2], 1 * SIZE tbit.nz p14, p0 = M, 1 } ;; { .mmi (p12) LDFD f34 = [CO1], 1 * SIZE (p12) LDFD f38 = [CO2], 1 * SIZE (p12) adds CO3 = 8 * SIZE, CO3 } ;; { .mmi (p12) LDFD f35 = [CO1], 5 * SIZE (p12) LDFD f39 = [CO2] (p13) adds CO3 = 4 * SIZE, CO3 } ;; { .mmi (p13) LDFD f40 = [CO1], 1 * SIZE (p14) LDFD f44 = [CO3], 1 * SIZE } ;; { .mmi (p13) LDFD f41 = [CO1], 1 * SIZE (p14) LDFD f45 = [CO3], 1 * SIZE tbit.nz p15, p0 = M, 0 } ;; { .mmf (p13) LDFD f42 = [CO1], 1 * SIZE (p15) LDFD f46 = [CO3] (p12) FMPY f32 = BETA, f32 } { .mmf (p12) FMPY f36 = BETA, f36 } ;; { .mmf (p13) LDFD f43 = [CO1] (p12) FMPY f33 = BETA, f33 } { .mmf (p12) FMPY f37 = BETA, f37 } ;; (p12) FMPY f34 = BETA, f34 (p12) FMPY f38 = BETA, f38 (p12) FMPY f35 = BETA, f35 (p12) FMPY f39 = BETA, f39 ;; { .mmf (p12) STFD [DO1] = f32, 1 * SIZE (p12) STFD [DO2] = f36, 1 * SIZE (p13) FMPY f40 = BETA, f40 } { .mmf (p12) adds DO3 = 8 * SIZE, DO3 (p14) FMPY f44 = BETA, f44 } ;; { .mmf (p12) STFD [DO1] = f33, 1 * SIZE (p12) STFD [DO2] = f37, 1 * SIZE (p13) FMPY f41 = BETA, f41 } { .mmf (p13) adds DO3 = 4 * SIZE, DO3 (p14) FMPY f45 = BETA, f45 } ;; { .mmf (p12) 
STFD [DO1] = f34, 1 * SIZE (p12) STFD [DO2] = f38, 1 * SIZE (p13) FMPY f42 = BETA, f42 } { .mmf (p15) FMPY f46 = BETA, f46 } ;; { .mmf (p12) STFD [DO1] = f35, 5 * SIZE (p12) STFD [DO2] = f39 (p13) FMPY f43 = BETA, f43 } ;; { .mmi (p13) STFD [DO1] = f40, 1 * SIZE (p14) STFD [DO3] = f44, 1 * SIZE } ;; { .mmi (p13) STFD [DO1] = f41, 1 * SIZE (p14) STFD [DO3] = f45, 1 * SIZE } ;; { .mmi (p13) STFD [DO1] = f42, 1 * SIZE (p15) STFD [DO3] = f46 } ;; { .mmi (p13) STFD [DO1] = f43 } ;; .align 32 .L199: { .mib cmp.lt p6, p0 = 0, J mov ar.lc = ARLC (p6) br.cond.dptk .L100 } ;; { .mib mov pr = PR, -1 br.ret.sptk.many b0 } ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/gemm_kernel.S000066400000000000000000004370011313527062700176050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef DOUBLE #define PREFETCHSIZE (16 * 8) #else #define PREFETCHSIZE (32 * 8) #endif #define CPREFETCHSIZE 7 #define CPREFETCH lfetch.excl.nt1 #define M r32 #define N r33 #define K r34 #define A r36 #define B r37 #define C r38 #define LDC r39 #define I r15 #define J r16 #define AOFFSET r17 #define BOFFSET r18 #define BB r19 #define L r20 #define C1 r21 #define C2 r22 #define C3 r23 #define C4 r24 #define C5 r25 #define C6 r26 #define C7 r27 #define C8 r28 #define C9 loc0 #define C10 loc1 #define C11 loc2 #define C12 loc3 #define C13 loc4 #define C14 loc5 #define C15 loc6 #define C16 loc7 #define PREA r8 #define PREB r9 #define PREC r10 #define SP r12 #define ARLC r29 #define PR r30 #define ARPFS r31 #define ALPHA f8 #define AORIG loc8 #define KK loc9 #define KK8 loc10 #define OFFSET loc11 PROLOGUE .prologue PROFCODE { .mmi .save ar.pfs, ARPFS #ifdef TRMMKERNEL alloc ARPFS = ar.pfs, 8, 16, 0, 0 #else alloc ARPFS = ar.pfs, 8, 8, 0, 0 #endif adds r14 = 16, SP mov ARLC = ar.lc } { .mmi adds r8 = -16 * 16, SP adds r9 = -15 * 16, SP adds SP = -16 * 16, SP } ;; stf.spill [r8] = f16, 32 stf.spill [r9] = f17, 32 mov PR = pr ;; stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 shladd LDC = LDC, BASE_SHIFT, r0 ;; stf.spill [r8] = f20, 32 stf.spill [r9] = f21, 32 shr J = N, 3 ;; stf.spill [r8] = f22, 32 stf.spill [r9] = f23, 32 mov AOFFSET = A ;; stf.spill [r8] = f24, 32 stf.spill [r9] = f25, 32 cmp.ge p6, p0 = 0, J ;; stf.spill [r8] = f26, 32 stf.spill [r9] = f27, 32 shr BB = K, 3 ;; stf.spill [r8] = f28, 32 stf.spill [r9] = f29, 32 ;; stf.spill [r8] = f30 stf.spill [r9] = f31 #ifndef TRMMKERNEL (p6) br.cond.dpnt .L050 .body ;; #else .body ;; ld8 OFFSET = [r14] #if defined(TRMMKERNEL) && !defined(LEFT) ;; sub KK = r0, OFFSET #endif (p6) br.cond.dpnt .L050 ;; #endif .align 32 .L010: { .mfi adds J = -1, J mov f64 = f0 shr I = M, 3 } { .mfi mov C1 = C // coffset1 = c + 0 * ldc mov f72 = f0 shladd BB = BB, BASE_SHIFT, B } ;; { .mmf cmp.eq p6, p7 = 0, I #if defined(TRMMKERNEL) && defined(LEFT) mov KK = OFFSET #else nop __LINE__ #endif mov f80 = f0 } { .mmf add C2 = LDC, C // coffset2 = c + 1 * ldc shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc mov f88 = f0 } ;; { .mmf shladd C5 = LDC, 2, C // coffset5 = c + 4 * ldc shladd C = LDC, 3, C // coffset += 8 * ldc mov f96 = f0 } { .mmf shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc shladd C6 = LDC, 2, C2 // coffset6 = c + 5 * ldc mov f104 = f0 } ;; { .mfi shladd C7 = LDC, 2, C3 // coffset7 = c + 6 * ldc mov f112 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif }{ .mfb sub C8 = C, LDC // coffset8 = c + 7 * ldc mov f120 = f0 (p6) br.cond.dpnt .L020 } ;; .align 16 .L011: #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mfb LDFPD f48, f49 = [B] mov f65 = f0 nop __LINE__ } { .mfb adds BOFFSET = 2 * SIZE, B mov f73 = f0 nop __LINE__ } ;; #else { .mfi shladd BOFFSET = KK8, 3, B mov f65 = f0 shladd AOFFSET = KK8, 3, AOFFSET } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f73 = f0 nop __LINE__ } ;; #endif { .mfb LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f81 = f0 nop __LINE__ } { .mfb LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f89 = f0 nop __LINE__ } ;; { .mmf LDFPD f52, f53 = [BOFFSET], 2 * SIZE setf.d f97 = r0 mov f105 = f0 } { .mmf lfetch.nt1 [BB] setf.d f113 = r0 mov f121 = f0 } ;; { .mmf LDFPD f54, f55 = [BOFFSET], 2 * SIZE setf.d 
f66 = r0 mov f74 = f0 } { .mfi setf.d f82 = r0 mov f90 = f0 adds BB = 16 * SIZE, BB } ;; { .mmf LDFPD f34, f35 = [AOFFSET], 2 * SIZE setf.d f98 = r0 mov f106 = f0 } { .mfb setf.d f114 = r0 mov f122 = f0 nop __LINE__ } ;; { .mmf LDFPD f36, f37 = [AOFFSET], 2 * SIZE setf.d f67 = r0 mov f75 = f0 } { .mfi setf.d f83 = r0 mov f91 = f0 #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 8, KK #else adds L = 8, KK #endif #endif } ;; { .mmf LDFPD f38, f39 = [AOFFSET], 2 * SIZE setf.d f99 = r0 mov f107 = f0 } { .mfi setf.d f115 = r0 mov f123 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f68 = r0 mov f76 = f0 } { .mfi setf.d f84 = r0 mov f92 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; { .mmf CPREFETCH [PREC], LDC setf.d f100 = r0 mov f108 = f0 } { .mfi setf.d f116 = r0 mov f124 = f0 adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } ;; { .mmf CPREFETCH [PREC], LDC setf.d f69 = r0 mov f77 = f0 } { .mfi setf.d f85 = r0 mov f93 = f0 adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET } ;; { .mmf CPREFETCH [PREC], LDC setf.d f101 = r0 mov f109 = f0 } { .mfi setf.d f117 = r0 mov f125 = f0 tbit.z p12, p0 = L, 0 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f70 = r0 mov f78 = f0 } { .mfi setf.d f86 = r0 mov f94 = f0 shr L = L, 1 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f102 = r0 mov f110 = f0 } { .mfi setf.d f118 = r0 mov f126 = f0 adds L = -1, L } ;; { .mmf CPREFETCH [PREC], LDC setf.d f71 = r0 mov f79 = f0 } { .mfi setf.d f87 = r0 mov f95 = f0 mov ar.lc = L } ;; { .mmf CPREFETCH [PREC] setf.d f103 = r0 mov f111 = f0 } { .mfi setf.d f119 = r0 mov f127 = f0 cmp.eq p3, p0 = r0, r0 } ;; .align 16 .L012: /* 1 */ { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi (p12) cmp.ne p3, p0 = 0, L FMA f72 = f32, f49, f72 // A1 * B2 nop __LINE__ } ;; /* 2 */ { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfi cmp.ne p4, p5 = 0, L FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; /* 3 */ { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfi adds C9 = 4 * SIZE, C1 FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;; /* 4 */ { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfi adds C10 = 4 * SIZE, C2 FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;; /* 5 */ { .mfi (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfi adds C11 = 4 * SIZE, C3 FMA f73 = f33, f49, f73 // A2 * B2 nop __LINE__ } ;; /* 6 */ { .mfi (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfi adds C12 = 4 * SIZE, C4 FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; /* 7 */ { .mfi (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfi adds C13 = 4 * SIZE, C5 FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } ;; /* 8 */ { .mfi (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfi adds C14 = 4 * SIZE, C6 FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;; /* 9 */ { .mfi (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfi adds C15 = 4 * SIZE, C7 FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; /* 10 */ { .mfi (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, 
f82 // A3 * B3 nop __LINE__ } { .mfi adds C16 = 4 * SIZE, C8 FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; /* 11 */ { .mfi FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfi nop __LINE__ FMA f106 = f34, f53, f106 // A3 * B6 nop __LINE__ } ;; /* 12 */ { .mfi FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfi nop __LINE__ FMA f122 = f34, f55, f122 // A3 * B8 nop __LINE__ } ;; /* 13 */ { .mfi nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 } { .mfi nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; /* 14 */ { .mfi FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfi nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } ;; /* 15 */ { .mfi FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfi nop __LINE__ FMA f107 = f35, f53, f107 // A4 * B6 nop __LINE__ } ;; /* 16 */ { .mfi FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfi nop __LINE__ FMA f123 = f35, f55, f123 // A4 * B8 nop __LINE__ } ;; /* 17 */ { .mfi nop __LINE__ FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f76 = f36, f49, f76 // A5 * B2 nop __LINE__ } ;; /* 18 */ { .mfi nop __LINE__ FMA f84 = f36, f50, f84 // A5 * B3 nop __LINE__ } { .mfi nop __LINE__ FMA f92 = f36, f51, f92 // A5 * B4 nop __LINE__ } ;; /* 19 */ { .mfi nop __LINE__ FMA f100 = f36, f52, f100 // A5 * B5 nop __LINE__ } { .mfi nop __LINE__ FMA f108 = f36, f53, f108 // A5 * B6 nop __LINE__ } ;; /* 20 */ { .mfi nop __LINE__ FMA f116 = f36, f54, f116 // A5 * B7 nop __LINE__ } { .mfi nop __LINE__ FMA f124 = f36, f55, f124 // A5 * B8 nop __LINE__ } ;; /* 21 */ { .mfi nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f77 = f37, f49, f77 // A6 * B2 nop __LINE__ } ;; /* 22 */ { .mfi nop __LINE__ FMA f85 = f37, f50, f85 // A6 * B3 nop __LINE__ } { .mfi nop __LINE__ FMA f93 = f37, f51, f93 // A6 * B4 nop __LINE__ } ;; /* 23 */ { .mfi nop __LINE__ FMA f101 = f37, f52, f101 // A6 * B5 nop __LINE__ } { .mfi nop __LINE__ FMA f109 = f37, f53, f109 // A6 * B6 nop __LINE__ } ;; /* 24 */ { .mfi nop __LINE__ FMA f117 = f37, f54, f117 // A6 * B7 nop __LINE__ } { .mfi nop __LINE__ FMA f125 = f37, f55, f125 // A6 * B8 nop __LINE__ } ;; /* 25 */ { .mfi nop __LINE__ FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f78 = f38, f49, f78 // A7 * B2 nop __LINE__ } ;; /* 26 */ { .mfi nop __LINE__ FMA f86 = f38, f50, f86 // A7 * B3 nop __LINE__ } { .mfi nop __LINE__ FMA f94 = f38, f51, f94 // A7 * B4 nop __LINE__ } ;; /* 27 */ { .mfi nop __LINE__ FMA f102 = f38, f52, f102 // A7 * B5 nop __LINE__ } { .mfi nop __LINE__ FMA f110 = f38, f53, f110 // A7 * B6 nop __LINE__ } ;; /* 28 */ { .mfi nop __LINE__ FMA f118 = f38, f54, f118 // A7 * B7 nop __LINE__ } { .mfi nop __LINE__ FMA f126 = f38, f55, f126 // A7 * B8 nop __LINE__ } ;; /* 29 */ { .mfi nop __LINE__ FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f79 = f39, f49, f79 // A8 * B2 nop __LINE__ } ;; /* 30 */ { .mfi (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f87 = f39, f50, f87 // A8 * B3 nop __LINE__ } { .mfi nop __LINE__ FMA f95 = f39, f51, f95 // A8 * B4 nop __LINE__ } ;; /* 31 */ { .mfi (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f103 = f39, f52, f103 // A8 * B5 nop __LINE__ } { .mfi nop __LINE__ FMA f111 = f39, f53, f111 // A8 * B6 nop __LINE__ } ;; /* 32 */ { .mfi nop __LINE__ FMA f119 = f39, f54, f119 // A8 * B7 nop __LINE__ } { .mfi nop __LINE__ FMA f127 = f39, f55, f127 // A8 * B8 nop __LINE__ } ;; /* 33 */ { .mfi nop __LINE__ (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop 
__LINE__ } { .mfi nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; /* 34 */ { .mfi (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfi nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; /* 35 */ { .mfi (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfi nop __LINE__ (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; /* 36 */ { .mfi (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfi nop __LINE__ (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; /* 37 */ { .mfi (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfi nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; /* 38 */ { .mfi (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfi nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; /* 39 */ { .mfi (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfi nop __LINE__ (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; /* 40 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f6 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f7 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f121 = f41, f63, f121 // A2 * B8 nop __LINE__ } ;; /* 41 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f10 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f11 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; /* 42 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f12 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f13 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; /* 43 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f14 = [C1 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f98 = f42, f60, f98 // A3 * B5 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f15 = [C9 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f106 = f42, f61, f106 // A3 * B6 nop __LINE__ } ;; /* 44 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f16 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f17 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f122 = f42, f63, f122 // A3 * B8 nop __LINE__ } ;; /* 45 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f18 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f19 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; /* 46 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f20 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfi #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f21 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f91 = f43, f59, f91 // A4 * B4 nop __LINE__ } ;; /* 47 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f22 = [C2 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f23 = [C10], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f107 = f43, f61, f107 // A4 * B6 nop __LINE__ } ;; /* 48 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f24 = [C3 ], SIZE #else nop __LINE__ #endif (p3) FMA f115 = f43, f62, f115 // A4 * B7 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f25 = [C11], SIZE #else nop __LINE__ #endif (p3) FMA f123 = f43, f63, f123 // A4 * B8 nop __LINE__ } ;; /* 49 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f26 = [C3 ], SIZE #else nop __LINE__ #endif (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f27 = [C11], SIZE #else nop __LINE__ #endif (p3) FMA f76 = f44, f57, f76 // A5 * B2 nop __LINE__ } ;; /* 50 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f28 = [C3 ], SIZE #else nop __LINE__ #endif (p3) FMA f84 = f44, f58, f84 // A5 * B3 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f29 = [C11], SIZE #else nop __LINE__ #endif (p3) FMA f92 = f44, f59, f92 // A5 * B4 nop __LINE__ } ;; /* 51 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f30 = [C3 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f100 = f44, f60, f100 // A5 * B5 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f31 = [C11], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f108 = f44, f61, f108 // A5 * B6 nop __LINE__ } ;; /* 52 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f32 = [C4 ], SIZE #else nop __LINE__ #endif (p3) FMA f116 = f44, f62, f116 // A5 * B7 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f33 = [C12], SIZE #else nop __LINE__ #endif (p3) FMA f124 = f44, f63, f124 // A5 * B8 nop __LINE__ } ;; /* 53 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f34 = [C4 ], SIZE #else nop __LINE__ #endif (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f35 = [C12], SIZE #else nop __LINE__ #endif (p3) FMA f77 = f45, f57, f77 // A6 * B2 nop __LINE__ } ;; /* 54 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f36 = [C4 ], SIZE #else nop __LINE__ #endif (p3) FMA f85 = f45, f58, f85 // A6 * B3 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f37 = [C12], SIZE #else nop __LINE__ #endif (p3) FMA f93 = f45, f59, f93 // A6 * B4 nop __LINE__ } ;; /* 55 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f38 = [C4 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f101 = f45, f60, f101 // A6 * B5 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f39 = [C12], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f109 = f45, f61, f109 // A6 * B6 nop __LINE__ } ;; /* 56 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f48 = [C5 ], SIZE #else nop __LINE__ #endif (p3) FMA f117 = f45, f62, f117 // A6 * B7 nop __LINE__ } { .mfi #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f49 = [C13], SIZE #else nop __LINE__ #endif (p3) FMA f125 = f45, f63, f125 // A6 * B8 nop __LINE__ } ;; /* 57 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f50 = [C5 ], SIZE #else nop __LINE__ #endif (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f51 = [C13], SIZE #else nop __LINE__ #endif (p3) FMA f78 = f46, f57, f78 // A7 * B2 nop __LINE__ } ;; /* 58 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f52 = [C5 ], SIZE #else nop __LINE__ #endif (p3) FMA f86 = f46, f58, f86 // A7 * B3 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f53 = [C13], SIZE #else nop __LINE__ #endif (p3) FMA f94 = f46, f59, f94 // A7 * B4 nop __LINE__ } ;; /* 59 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f54 = [C5 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f102 = f46, f60, f102 // A7 * B5 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f55 = [C13], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f110 = f46, f61, f110 // A7 * B6 nop __LINE__ } ;; /* 60 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f40 = [C6 ], SIZE #else nop __LINE__ #endif (p3) FMA f118 = f46, f62, f118 // A7 * B7 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f41 = [C14], SIZE #else nop __LINE__ #endif (p3) FMA f126 = f46, f63, f126 // A7 * B8 nop __LINE__ } ;; /* 61 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f42 = [C6 ], SIZE #else nop __LINE__ #endif (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f43 = [C14], SIZE #else nop __LINE__ #endif (p3) FMA f79 = f47, f57, f79 // A8 * B2 nop __LINE__ } ;; /* 62 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f44 = [C6 ], SIZE #else nop __LINE__ #endif (p3) FMA f87 = f47, f58, f87 // A8 * B3 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f45 = [C14], SIZE #else nop __LINE__ #endif (p3) FMA f95 = f47, f59, f95 // A8 * B4 nop __LINE__ } ;; /* 63 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f59 = [C6 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f103 = f47, f60, f103 // A8 * B5 nop __LINE__ } { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f60 = [C14], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f111 = f47, f61, f111 // A8 * B6 nop __LINE__ } ;; /* 64 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f61 = [C7 ], SIZE #else nop __LINE__ #endif (p3) FMA f119 = f47, f62, f119 // A8 * B7 adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f62 = [C15], SIZE #else nop __LINE__ #endif (p3) FMA f127 = f47, f63, f127 // A8 * B8 br.cloop.sptk.few .L012 } ;; .L013: #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) { .mfi (p5) LDFD f63 = [C7 ], SIZE FMA f64 = ALPHA, f64, f6 cmp.ne p6, p0 = 1, I } { .mfb (p5) LDFD f6 = [C15], SIZE FMA f68 = ALPHA, f68, f7 nop __LINE__ } ;; { .mfi (p5) LDFD f7 = [C7 ], SIZE FMA f65 = ALPHA, f65, f10 adds I = -1, I } { .mfb (p5) LDFD f10 = [C15], SIZE FMA f69 = ALPHA, f69, f11 nop __LINE__ } ;; { .mfb (p5) LDFD f11 = [C7 ], -3 * SIZE FMA f66 = ALPHA, f66, f12 nop __LINE__ } { .mfb (p5) LDFD f12 = [C15], -3 * SIZE FMA f70 = ALPHA, f70, f13 nop __LINE__ } ;; { .mfb LDFD f13 = [C8 ], SIZE FMA f67 = ALPHA, f67, f14 nop __LINE__ } { .mfb LDFD f14 = [C16], SIZE FMA f71 = ALPHA, f71, f15 nop __LINE__ } ;; { .mmf STFD [C1 ] = f64, SIZE STFD [C9 ] = f68, SIZE FMA f72 = ALPHA, f72, f16 } { .mmf LDFD f15 = [C8 ], SIZE LDFD f16 = [C16], SIZE FMA f76 = ALPHA, f76, f17 } ;; { .mmf STFD [C1 ] = f65, SIZE STFD [C9 ] = f69, SIZE FMA f73 = ALPHA, f73, f18 } { .mmf LDFD f17 = [C8 ], SIZE LDFD f18 = [C16], SIZE FMA f77 = ALPHA, f77, f19 } ;; { .mmf STFD [C1 ] = f66, SIZE STFD [C9 ] = f70, SIZE FMA f74 = ALPHA, f74, f20 } { .mmf LDFD f19 = [C8 ], -3 * SIZE LDFD f20 = [C16], -3 * SIZE FMA f78 = ALPHA, f78, f21 } ;; { .mfb STFD [C1 ] = f67, 5 * SIZE FMA f75 = ALPHA, f75, f22 nop __LINE__ } { .mfb STFD [C9 ] = f71, 5 * SIZE FMA f79 = ALPHA, f79, f23 nop __LINE__ } ;; { .mfb STFD [C2 ] = f72, SIZE FMA f80 = ALPHA, f80, f24 nop __LINE__ } { .mfb STFD [C10] = f76, SIZE FMA f84 = ALPHA, f84, f25 nop __LINE__ } ;; { .mfb STFD [C2 ] = f73, SIZE FMA f81 = ALPHA, f81, f26 nop __LINE__ } { .mfb STFD [C10] = f77, SIZE FMA f85 = ALPHA, f85, f27 nop __LINE__ } ;; { .mfb STFD [C2 ] = f74, SIZE FMA f82 = ALPHA, f82, f28 nop __LINE__ } { .mfb STFD [C10] = f78, SIZE FMA f86 = ALPHA, f86, f29 nop __LINE__ } ;; { .mfb STFD [C2 ] = f75, 5 * SIZE FMA f83 = ALPHA, f83, f30 nop __LINE__ } { .mfb STFD [C10] = f79, 5 * SIZE FMA f87 = ALPHA, f87, f31 nop __LINE__ } ;; { .mfb STFD [C3 ] = f80, SIZE FMA f88 = ALPHA, f88, f32 nop __LINE__ } { .mfb STFD [C11] = f84, SIZE FMA f92 = ALPHA, f92, f33 nop __LINE__ } ;; { .mfb STFD [C3 ] = f81, SIZE FMA f89 = ALPHA, f89, f34 nop __LINE__ } { .mfb STFD [C11] = f85, SIZE FMA f93 = ALPHA, f93, f35 nop __LINE__ } ;; { .mfb STFD [C3 ] = f82, SIZE FMA f90 = ALPHA, f90, f36 nop __LINE__ } { .mfb STFD [C11] = f86, SIZE FMA f94 = ALPHA, f94, f37 nop __LINE__ } ;; { .mfb STFD [C3 ] = f83, 5 * SIZE FMA f91 = ALPHA, f91, f38 nop __LINE__ } { .mfb STFD [C11] = f87, 5 * SIZE FMA f95 = ALPHA, f95, f39 nop __LINE__ } ;; { .mfb STFD [C4 ] = f88, SIZE FMA f96 = ALPHA, f96, f48 nop __LINE__ } { .mfb STFD [C12] = f92, SIZE FMA f100 = ALPHA, f100, f49 nop __LINE__ } ;; { .mfb STFD [C4 ] = f89, SIZE FMA f97 = ALPHA, f97, f50 nop __LINE__ } { .mfb STFD [C12] = f93, SIZE FMA f101 = ALPHA, f101, f51 nop __LINE__ } ;; { .mfb STFD [C4 ] = f90, SIZE FMA f98 = ALPHA, f98, f52 nop __LINE__ } { .mfb STFD [C12] = f94, SIZE FMA f102 = ALPHA, f102, f53 nop __LINE__ } ;; { .mfb STFD [C4 ] = f91, 5 * SIZE FMA f99 = ALPHA, f99, f54 nop __LINE__ } { .mfb STFD [C12] = f95, 5 * SIZE FMA f103 = ALPHA, f103, f55 nop __LINE__ } ;; { .mfb STFD [C5 ] = f96, SIZE FMA f104 = ALPHA, f104, f40 nop __LINE__ } { .mfb STFD [C13] = f100, SIZE FMA f108 = ALPHA, f108, f41 nop __LINE__ } ;; { .mfb STFD [C5 ] = f97, SIZE FMA f105 = ALPHA, f105, f42 nop __LINE__ } { .mfb STFD [C13] = f101, SIZE FMA f109 = ALPHA, f109, f43 nop __LINE__ } ;; { .mfb STFD [C5 ] = f98, SIZE FMA f106 = ALPHA, f106, f44 nop __LINE__ } { .mfb STFD [C13] = f102, SIZE FMA f110 = ALPHA, f110, f45 nop __LINE__ } ;; { .mfb 
STFD [C5 ] = f99, 5 * SIZE FMA f107 = ALPHA, f107, f59 nop __LINE__ } { .mfb STFD [C13] = f103, 5 * SIZE FMA f111 = ALPHA, f111, f60 nop __LINE__ } ;; { .mfb STFD [C6 ] = f104, SIZE FMA f112 = ALPHA, f112, f61 nop __LINE__ } { .mfb STFD [C14] = f108, SIZE FMA f116 = ALPHA, f116, f62 nop __LINE__ } ;; { .mfb STFD [C6 ] = f105, SIZE FMA f113 = ALPHA, f113, f63 nop __LINE__ } { .mfb STFD [C14] = f109, SIZE FMA f117 = ALPHA, f117, f6 nop __LINE__ } ;; { .mfb STFD [C6 ] = f106, SIZE FMA f114 = ALPHA, f114, f7 nop __LINE__ } { .mfb STFD [C14] = f110, SIZE FMA f118 = ALPHA, f118, f10 nop __LINE__ } ;; { .mfb STFD [C6 ] = f107, 5 * SIZE FMA f115 = ALPHA, f115, f11 nop __LINE__ } { .mfb STFD [C14] = f111, 5 * SIZE FMA f119 = ALPHA, f119, f12 nop __LINE__ } ;; { .mfb STFD [C7 ] = f112, SIZE FMA f120 = ALPHA, f120, f13 nop __LINE__ } { .mfb STFD [C15] = f116, SIZE FMA f124 = ALPHA, f124, f14 nop __LINE__ } ;; { .mfb STFD [C7 ] = f113, SIZE FMA f121 = ALPHA, f121, f15 nop __LINE__ } { .mfb STFD [C15] = f117, SIZE FMA f125 = ALPHA, f125, f16 nop __LINE__ } ;; { .mfb STFD [C7 ] = f114, SIZE FMA f122 = ALPHA, f122, f17 nop __LINE__ } { .mfb STFD [C15] = f118, SIZE FMA f126 = ALPHA, f126, f18 nop __LINE__ } ;; { .mfb STFD [C7 ] = f115, 5 * SIZE FMA f123 = ALPHA, f123, f19 nop __LINE__ } { .mfb STFD [C15] = f119, 5 * SIZE FMA f127 = ALPHA, f127, f20 nop __LINE__ } ;; { .mfb STFD [C8 ] = f120, SIZE mov f64 = f0 nop __LINE__ } { .mfb STFD [C16] = f124, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C8 ] = f121, SIZE mov f80 = f0 nop __LINE__ } { .mfb STFD [C16] = f125, SIZE mov f88 = f0 nop __LINE__ } ;; { .mfi STFD [C8 ] = f122, SIZE mov f96 = f0 nop __LINE__ } { .mfb STFD [C16] = f126, SIZE mov f104 = f0 nop __LINE__ } ;; { .mfi STFD [C8 ] = f123, 5 * SIZE mov f112 = f0 nop __LINE__ } { .mfb STFD [C16] = f127, 5 * SIZE mov f120 = f0 (p6) br.cond.dptk .L011 } ;; #else { .mfi nop __LINE__ FMPY f64 = ALPHA, f64 cmp.ne p6, p0 = 1, I } { .mfb nop __LINE__ FMPY f68 = ALPHA, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f65 = ALPHA, f65 adds I = -1, I } { .mfb nop __LINE__ FMPY f69 = ALPHA, f69 nop __LINE__ } ;; { .mfb nop __LINE__ FMPY f66 = ALPHA, f66 nop __LINE__ } { .mfb nop __LINE__ FMPY f70 = ALPHA, f70 nop __LINE__ } ;; { .mfb nop __LINE__ FMPY f67 = ALPHA, f67 nop __LINE__ } { .mfb nop __LINE__ FMPY f71 = ALPHA, f71 nop __LINE__ } ;; { .mmf STFD [C1 ] = f64, SIZE STFD [C9 ] = f68, SIZE FMPY f72 = ALPHA, f72 } { .mmf nop __LINE__ nop __LINE__ FMPY f76 = ALPHA, f76 } ;; { .mmf STFD [C1 ] = f65, SIZE STFD [C9 ] = f69, SIZE FMPY f73 = ALPHA, f73 } { .mmf nop __LINE__ nop __LINE__ FMPY f77 = ALPHA, f77 } ;; { .mmf STFD [C1 ] = f66, SIZE STFD [C9 ] = f70, SIZE FMPY f74 = ALPHA, f74 } { .mmf nop __LINE__ nop __LINE__ FMPY f78 = ALPHA, f78 } ;; { .mfb STFD [C1 ] = f67, 5 * SIZE FMPY f75 = ALPHA, f75 nop __LINE__ } { .mfb STFD [C9 ] = f71, 5 * SIZE FMPY f79 = ALPHA, f79 nop __LINE__ } ;; { .mfb STFD [C2 ] = f72, SIZE FMPY f80 = ALPHA, f80 nop __LINE__ } { .mfb STFD [C10] = f76, SIZE FMPY f84 = ALPHA, f84 nop __LINE__ } ;; { .mfb STFD [C2 ] = f73, SIZE FMPY f81 = ALPHA, f81 nop __LINE__ } { .mfb STFD [C10] = f77, SIZE FMPY f85 = ALPHA, f85 nop __LINE__ } ;; { .mfb STFD [C2 ] = f74, SIZE FMPY f82 = ALPHA, f82 nop __LINE__ } { .mfb STFD [C10] = f78, SIZE FMPY f86 = ALPHA, f86 nop __LINE__ } ;; { .mfb STFD [C2 ] = f75, 5 * SIZE FMPY f83 = ALPHA, f83 nop __LINE__ } { .mfb STFD [C10] = f79, 5 * SIZE FMPY f87 = ALPHA, f87 nop __LINE__ } ;; { .mfb STFD [C3 ] = f80, SIZE FMPY f88 = ALPHA, f88 nop __LINE__ } { .mfb STFD 
[C11] = f84, SIZE FMPY f92 = ALPHA, f92 nop __LINE__ } ;; { .mfb STFD [C3 ] = f81, SIZE FMPY f89 = ALPHA, f89 nop __LINE__ } { .mfb STFD [C11] = f85, SIZE FMPY f93 = ALPHA, f93 nop __LINE__ } ;; { .mfb STFD [C3 ] = f82, SIZE FMPY f90 = ALPHA, f90 nop __LINE__ } { .mfb STFD [C11] = f86, SIZE FMPY f94 = ALPHA, f94 nop __LINE__ } ;; { .mfb STFD [C3 ] = f83, 5 * SIZE FMPY f91 = ALPHA, f91 nop __LINE__ } { .mfb STFD [C11] = f87, 5 * SIZE FMPY f95 = ALPHA, f95 nop __LINE__ } ;; { .mfb STFD [C4 ] = f88, SIZE FMPY f96 = ALPHA, f96 nop __LINE__ } { .mfb STFD [C12] = f92, SIZE FMPY f100 = ALPHA, f100 nop __LINE__ } ;; { .mfb STFD [C4 ] = f89, SIZE FMPY f97 = ALPHA, f97 nop __LINE__ } { .mfb STFD [C12] = f93, SIZE FMPY f101 = ALPHA, f101 nop __LINE__ } ;; { .mfb STFD [C4 ] = f90, SIZE FMPY f98 = ALPHA, f98 nop __LINE__ } { .mfb STFD [C12] = f94, SIZE FMPY f102 = ALPHA, f102 nop __LINE__ } ;; { .mfb STFD [C4 ] = f91, 5 * SIZE FMPY f99 = ALPHA, f99 nop __LINE__ } { .mfb STFD [C12] = f95, 5 * SIZE FMPY f103 = ALPHA, f103 nop __LINE__ } ;; { .mfb STFD [C5 ] = f96, SIZE FMPY f104 = ALPHA, f104 nop __LINE__ } { .mfb STFD [C13] = f100, SIZE FMPY f108 = ALPHA, f108 nop __LINE__ } ;; { .mfb STFD [C5 ] = f97, SIZE FMPY f105 = ALPHA, f105 nop __LINE__ } { .mfb STFD [C13] = f101, SIZE FMPY f109 = ALPHA, f109 nop __LINE__ } ;; { .mfb STFD [C5 ] = f98, SIZE FMPY f106 = ALPHA, f106 nop __LINE__ } { .mfb STFD [C13] = f102, SIZE FMPY f110 = ALPHA, f110 nop __LINE__ } ;; { .mfb STFD [C5 ] = f99, 5 * SIZE FMPY f107 = ALPHA, f107 nop __LINE__ } { .mfb STFD [C13] = f103, 5 * SIZE FMPY f111 = ALPHA, f111 nop __LINE__ } ;; { .mfb STFD [C6 ] = f104, SIZE FMPY f112 = ALPHA, f112 nop __LINE__ } { .mfb STFD [C14] = f108, SIZE FMPY f116 = ALPHA, f116 nop __LINE__ } ;; { .mfb STFD [C6 ] = f105, SIZE FMPY f113 = ALPHA, f113 nop __LINE__ } { .mfb STFD [C14] = f109, SIZE FMPY f117 = ALPHA, f117 nop __LINE__ } ;; { .mfb STFD [C6 ] = f106, SIZE FMPY f114 = ALPHA, f114 nop __LINE__ } { .mfb STFD [C14] = f110, SIZE FMPY f118 = ALPHA, f118 nop __LINE__ } ;; { .mfb STFD [C6 ] = f107, 5 * SIZE FMPY f115 = ALPHA, f115 nop __LINE__ } { .mfb STFD [C14] = f111, 5 * SIZE FMPY f119 = ALPHA, f119 nop __LINE__ } ;; { .mfb STFD [C7 ] = f112, SIZE FMPY f120 = ALPHA, f120 nop __LINE__ } { .mfb STFD [C15] = f116, SIZE FMPY f124 = ALPHA, f124 nop __LINE__ } ;; { .mfb STFD [C7 ] = f113, SIZE FMPY f121 = ALPHA, f121 nop __LINE__ } { .mfb STFD [C15] = f117, SIZE FMPY f125 = ALPHA, f125 nop __LINE__ } ;; { .mfi STFD [C7 ] = f114, SIZE FMPY f122 = ALPHA, f122 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfb STFD [C15] = f118, SIZE FMPY f126 = ALPHA, f126 nop __LINE__ } ;; { .mfi STFD [C7 ] = f115, 5 * SIZE FMPY f123 = ALPHA, f123 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -8, L #else nop __LINE__ #endif } { .mfi STFD [C15] = f119, 5 * SIZE FMPY f127 = ALPHA, f127 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -8, L #else nop __LINE__ #endif } ;; { .mfi STFD [C8 ] = f120, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C16] = f124, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C8 ] = f121, SIZE mov f80 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 3, AOFFSET #else 
nop __LINE__ #endif } { .mfi STFD [C16] = f125, SIZE mov f88 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 3, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C8 ] = f122, SIZE mov f96 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 8, KK #else nop __LINE__ #endif } { .mfb STFD [C16] = f126, SIZE mov f104 = f0 nop __LINE__ } ;; { .mfi STFD [C8 ] = f123, 5 * SIZE mov f112 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C16] = f127, 5 * SIZE mov f120 = f0 (p6) br.cond.dptk .L011 } ;; #endif .L020: { .mfi cmp.eq p3, p0 = r0, r0 mov f89 = f0 tbit.z p6, p7 = M, 2 } { .mfb #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 4, KK #else adds L = 8, KK #endif #endif mov f81 = f0 (p6) br.cond.dptk .L030 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mfi LDFPD f48, f49 = [B] mov f65 = f0 nop __LINE__ } { .mfi adds BOFFSET = 2 * SIZE, B mov f73 = f0 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET } ;; #else { .mfi shladd BOFFSET = KK8, 3, B mov f65 = f0 shladd AOFFSET = KK8, 2, AOFFSET } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f73 = f0 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET } ;; #endif { .mmf LDFPD f32, f33 = [AOFFSET], 2 * SIZE setf.d f97 = r0 mov f105 = f0 } { .mfi setf.d f113 = r0 mov f121 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; { .mmf LDFPD f50, f51 = [BOFFSET], 2 * SIZE setf.d f66 = r0 mov f74 = f0 } { .mfi setf.d f82 = r0 mov f90 = f0 tbit.z p12, p0 = L, 0 } ;; { .mmf LDFPD f52, f53 = [BOFFSET], 2 * SIZE setf.d f98 = r0 mov f106 = f0 } { .mfi setf.d f114 = r0 mov f122 = f0 shr L = L, 1 } ;; { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f75 = f0 adds L = -1, L } { .mmf setf.d f67 = r0 setf.d f83 = r0 mov f91 = f0 } ;; { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f107 = f0 mov ar.lc = L } { .mmf setf.d f99 = r0 setf.d f115 = r0 mov f123 = f0 } ;; .align 32 .L022: { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 (p5) adds C9 = 2 * SIZE, C1 } { .mfi nop __LINE__ FMA f104 = f32, f53, f104 // A1 * B6 (p5) adds C10 = 2 * SIZE, C2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 (p5) adds C11 = 2 * SIZE, C3 } { .mfi nop __LINE__ FMA f120 = f32, f55, f120 // A1 * B8 (p5) adds C12 = 2 * SIZE, C4 } ;; { .mfi (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 (p5) adds C13 = 2 * SIZE, C5 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 (p5) adds C14 = 2 * SIZE, C6 } ;; { .mfi (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 (p5) adds C15 = 2 * SIZE, C7 } { .mfi nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 (p5) adds C16 = 2 * SIZE, C8 } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } 
;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f106 = f34, f53, f106 // A3 * B6 nop __LINE__ } { .mfb nop __LINE__ FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f122 = f34, f55, f122 // A3 * B8 nop __LINE__ } { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } { .mfb nop __LINE__ FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f107 = f35, f53, f107 // A4 * B6 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f123 = f35, f55, f123 // A4 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f70 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f69 = [C1 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f71 = [C9 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f76 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f78 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f77 = [C2 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f79 = [C10], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f121 = f41, f63, f121 // A2 * B8 nop __LINE__ } ;; { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f84 = [C3 ], SIZE #else nop __LINE__ #endif (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f86 = [C11], SIZE #else nop __LINE__ #endif (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f85 = [C3 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f87 = [C11], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f92 = [C4 ], SIZE #else nop __LINE__ #endif (p3) FMA f98 = f42, f60, f98 // A3 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f94 = [C12], SIZE #else nop __LINE__ #endif (p3) FMA f106 = f42, f61, f106 // A3 * B6 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f93 = [C4 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f95 = [C12], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f122 = f42, f63, f122 // A3 * B8 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f100 = [C5 ], SIZE #else nop __LINE__ #endif (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f102 = [C13], SIZE #else nop __LINE__ #endif (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f101 = [C5 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f103 = [C13], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f91 = f43, f59, f91 // A4 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f108 = [C6 ], SIZE #else nop __LINE__ #endif (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f110 = [C14], SIZE #else nop __LINE__ #endif (p3) FMA f107 = f43, f61, f107 // A4 * B6 nop __LINE__ } ;; { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f109 = [C6 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f115 = f43, f62, f115 // A4 * B7 adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f111 = [C14], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f123 = f43, f63, f123 // A4 * B8 br.cloop.sptk.few .L022 } ;; .L028: #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) { .mfb LDFD f116 = [C7 ], SIZE FMA f64 = ALPHA, f64, f68 nop __LINE__ } { .mfb LDFD f118 = [C15], SIZE FMA f66 = ALPHA, f66, f70 nop __LINE__ } ;; { .mfb LDFD f117 = [C7 ], -1 * SIZE FMA f65 = ALPHA, f65, f69 nop __LINE__ } { .mfb LDFD f119 = [C15], -1 * SIZE FMA f67 = ALPHA, f67, f71 nop __LINE__ } ;; { .mfb LDFD f124 = [C8], SIZE FMA f72 = ALPHA, f72, f76 nop __LINE__ } { .mfb LDFD f126 = [C16], SIZE FMA f74 = ALPHA, f74, f78 nop __LINE__ } ;; { .mfb LDFD f125 = [C8], -1 * SIZE FMA f73 = ALPHA, f73, f77 nop __LINE__ } { .mfb LDFD f127 = [C16], -1 * SIZE FMA f75 = ALPHA, f75, f79 nop __LINE__ } ;; { .mfb STFD [C1 ] = f64, SIZE FMA f80 = ALPHA, f80, f84 nop __LINE__ } { .mfb STFD [C9 ] = f66, SIZE FMA f82 = ALPHA, f82, f86 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, 3 * SIZE FMA f81 = ALPHA, f81, f85 nop __LINE__ } { .mfb STFD [C9 ] = f67, 3 * SIZE FMA f83 = ALPHA, f83, f87 nop __LINE__ } ;; { .mfb STFD [C2 ] = f72, SIZE FMA f88 = ALPHA, f88, f92 nop __LINE__ } { .mfb STFD [C10] = f74, SIZE FMA f90 = ALPHA, f90, f94 nop __LINE__ } ;; { .mfb STFD [C2 ] = f73, 3 * SIZE FMA f89 = ALPHA, f89, f93 nop __LINE__ } { .mfb STFD [C10] = f75, 3 * SIZE FMA f91 = ALPHA, f91, f95 nop __LINE__ } ;; { .mfb STFD [C3 ] = f80, SIZE FMA f96 = ALPHA, f96, f100 nop __LINE__ } { .mfb STFD [C11] = f82, SIZE FMA f98 = ALPHA, f98, f102 nop __LINE__ } ;; { .mfb STFD [C3 ] = f81, 3 * SIZE FMA f97 = ALPHA, f97, f101 nop __LINE__ } { .mfb STFD [C11] = f83, 3 * SIZE FMA f99 = ALPHA, f99, f103 nop __LINE__ } ;; { .mfb STFD [C4 ] = f88, SIZE FMA f104 = ALPHA, f104, f108 nop __LINE__ } { .mfb STFD [C12] = f90, SIZE FMA f106 = ALPHA, f106, f110 nop __LINE__ } ;; { .mfb STFD [C4 ] = f89, 3 * SIZE FMA f105 = ALPHA, f105, f109 nop __LINE__ } { .mfb STFD [C12] = f91, 3 * SIZE FMA f107 = ALPHA, f107, f111 nop __LINE__ } ;; { .mfb STFD [C5 ] = f96, SIZE FMA f112 = ALPHA, f112, f116 nop __LINE__ } { .mfb STFD [C13] = f98, SIZE FMA f114 = ALPHA, f114, f118 nop __LINE__ } ;; { .mfb STFD [C5 ] = f97, 3 * SIZE FMA f113 = ALPHA, f113, f117 nop __LINE__ } { .mfb STFD [C13] = f99, 3 * SIZE FMA f115 = ALPHA, f115, f119 nop __LINE__ } ;; { .mfb STFD [C6 ] = f104, SIZE FMA f120 = ALPHA, f120, f124 nop __LINE__ } { .mfb STFD [C14] = f106, SIZE FMA f122 = ALPHA, f122, f126 nop __LINE__ } ;; { .mfb STFD [C6 ] = f105, 3 * SIZE FMA f121 = ALPHA, f121, f125 nop __LINE__ } { .mfb STFD [C14] = f107, 3 * SIZE FMA f123 = ALPHA, f123, f127 nop __LINE__ } ;; { .mfb STFD [C7 ] = f112, SIZE mov f64 = f0 nop __LINE__ } { .mfb STFD [C15] = f114, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfb STFD [C7 ] = f113, 3 * SIZE mov f80 = f0 nop __LINE__ } { .mfb STFD [C15] = f115, 3 * SIZE mov f88 = f0 nop __LINE__ } ;; { .mfb STFD [C8 ] = f120, SIZE mov f96 = f0 nop __LINE__ } { .mfb STFD [C16] = f122, SIZE mov f104 = f0 nop __LINE__ } ;; { .mfb STFD [C8 ] = f121, 3 * SIZE mov f112 = f0 nop __LINE__ } { .mfb STFD [C16] = f123, 3 * SIZE mov f120 = f0 nop __LINE__ } ;; #else { .mfb FMPY f64 = ALPHA, f64 nop __LINE__ } { .mfb FMPY f66 = ALPHA, f66 nop __LINE__ } ;; { .mfb FMPY f65 = ALPHA, f65 nop __LINE__ } { .mfb FMPY f67 = ALPHA, f67 nop __LINE__ } ;; { .mfb FMPY f72 = ALPHA, f72 nop __LINE__ } { .mfb FMPY f74 = ALPHA, f74 nop __LINE__ } ;; { .mfb FMPY f73 = ALPHA, f73 nop __LINE__ } { .mfb FMPY f75 = ALPHA, f75 nop __LINE__ } ;; { .mfb STFD [C1 ] = f64, SIZE FMPY f80 = ALPHA, f80 nop __LINE__ } { .mfb STFD [C9 ] = f66, SIZE FMPY f82 = ALPHA, f82 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, 3 * SIZE FMPY f81 = ALPHA, f81 nop 
__LINE__ } { .mfb STFD [C9 ] = f67, 3 * SIZE FMPY f83 = ALPHA, f83 nop __LINE__ } ;; { .mfb STFD [C2 ] = f72, SIZE FMPY f88 = ALPHA, f88 nop __LINE__ } { .mfb STFD [C10] = f74, SIZE FMPY f90 = ALPHA, f90 nop __LINE__ } ;; { .mfb STFD [C2 ] = f73, 3 * SIZE FMPY f89 = ALPHA, f89 nop __LINE__ } { .mfb STFD [C10] = f75, 3 * SIZE FMPY f91 = ALPHA, f91 nop __LINE__ } ;; { .mfb STFD [C3 ] = f80, SIZE FMPY f96 = ALPHA, f96 nop __LINE__ } { .mfb STFD [C11] = f82, SIZE FMPY f98 = ALPHA, f98 nop __LINE__ } ;; { .mfb STFD [C3 ] = f81, 3 * SIZE FMPY f97 = ALPHA, f97 nop __LINE__ } { .mfb STFD [C11] = f83, 3 * SIZE FMPY f99 = ALPHA, f99 nop __LINE__ } ;; { .mfb STFD [C4 ] = f88, SIZE FMPY f104 = ALPHA, f104 nop __LINE__ } { .mfb STFD [C12] = f90, SIZE FMPY f106 = ALPHA, f106 nop __LINE__ } ;; { .mfb STFD [C4 ] = f89, 3 * SIZE FMPY f105 = ALPHA, f105 nop __LINE__ } { .mfb STFD [C12] = f91, 3 * SIZE FMPY f107 = ALPHA, f107 nop __LINE__ } ;; { .mfb STFD [C5 ] = f96, SIZE FMPY f112 = ALPHA, f112 nop __LINE__ } { .mfb STFD [C13] = f98, SIZE FMPY f114 = ALPHA, f114 nop __LINE__ } ;; { .mfb STFD [C5 ] = f97, 3 * SIZE FMPY f113 = ALPHA, f113 nop __LINE__ } { .mfb STFD [C13] = f99, 3 * SIZE FMPY f115 = ALPHA, f115 nop __LINE__ } ;; { .mfi STFD [C6 ] = f104, SIZE FMPY f120 = ALPHA, f120 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfb STFD [C14] = f106, SIZE FMPY f122 = ALPHA, f122 nop __LINE__ } ;; { .mfi STFD [C6 ] = f105, 3 * SIZE FMPY f121 = ALPHA, f121 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -4, L #else nop __LINE__ #endif } { .mfi STFD [C14] = f107, 3 * SIZE FMPY f123 = ALPHA, f123 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -8, L #else nop __LINE__ #endif } ;; { .mfi STFD [C7 ] = f112, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C15] = f114, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C7 ] = f113, 3 * SIZE mov f80 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 2, AOFFSET #else nop __LINE__ #endif } { .mfi STFD [C15] = f115, 3 * SIZE mov f88 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 3, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C8 ] = f120, SIZE mov f96 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 4, KK #else nop __LINE__ #endif } { .mfb STFD [C16] = f122, SIZE mov f104 = f0 nop __LINE__ } ;; { .mfi STFD [C8 ] = f121, 3 * SIZE mov f112 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C16] = f123, 3 * SIZE mov f120 = f0 nop __LINE__ } ;; #endif .align 32 .L030: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 2, KK #else adds L = 8, KK #endif #endif tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L040 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mfi LDFPD f48, f49 = [B] mov f65 = f0 nop __LINE__ } { .mfi adds BOFFSET = 2 * SIZE, B mov f73 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } #else { .mmf shladd BOFFSET = KK8, 3, B shladd 
AOFFSET = KK8, 1, AOFFSET mov f65 = f0 } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f73 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } #endif ;; { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f81 = f0 tbit.z p12, p0 = L, 0 } { .mfi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f89 = f0 shr L = L, 1 } ;; { .mfi LDFPD f52, f53 = [BOFFSET], 2 * SIZE mov f97 = f0 adds L = -1, L } { .mfi nop __LINE__ mov f105 = f0 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET } ;; { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET mov f113 = f0 mov ar.lc = L } { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f121 = f0 cmp.eq p3, p0 = r0, r0 } ;; .align 32 .L032: { .mfb lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1], SIZE #else nop __LINE__ #endif (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f76 = [C2], SIZE #else nop __LINE__ #endif (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f69 = [C1], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f77 = [C2], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f84 = [C3], SIZE #else nop __LINE__ #endif (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f92 = [C4], SIZE #else nop __LINE__ #endif (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f85 = [C3], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f113 = f41, f62, f113 // A2 * B7 adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f93 = [C4], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f121 = f41, f63, f121 // A2 * B8 br.cloop.sptk.few .L032 } ;; .L038: #if! defined(TRMMKERNEL) && !defined(BETAZERO) { .mfb LDFD f100 = [C5], SIZE FMA f64 = ALPHA, f64, f68 nop __LINE__ } { .mfb LDFD f108 = [C6], SIZE FMA f65 = ALPHA, f65, f69 nop __LINE__ } ;; { .mfb LDFD f101 = [C5], -1 * SIZE FMA f72 = ALPHA, f72, f76 nop __LINE__ } { .mfb LDFD f109 = [C6], -1 * SIZE FMA f73 = ALPHA, f73, f77 nop __LINE__ } ;; { .mfb LDFD f116 = [C7], SIZE FMA f80 = ALPHA, f80, f84 nop __LINE__ } { .mfb LDFD f124 = [C8], SIZE FMA f81 = ALPHA, f81, f85 nop __LINE__ } ;; { .mfb LDFD f117 = [C7], -1 * SIZE FMA f88 = ALPHA, f88, f92 nop __LINE__ } { .mfb LDFD f125 = [C8], -1 * SIZE FMA f89 = ALPHA, f89, f93 nop __LINE__ } ;; { .mfb STFD [C1 ] = f64, SIZE FMA f96 = ALPHA, f96, f100 nop __LINE__ } { .mfb STFD [C2 ] = f72, SIZE FMA f104 = ALPHA, f104, f108 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, SIZE FMA f97 = ALPHA, f97, f101 nop __LINE__ } { .mfb STFD [C2 ] = f73, SIZE FMA f105 = ALPHA, f105, f109 nop __LINE__ } ;; { .mfb STFD [C3 ] = f80, SIZE FMA f112 = ALPHA, f112, f116 nop __LINE__ } { .mfb STFD [C4 ] = f88, SIZE FMA f120 = ALPHA, f120, f124 nop __LINE__ } ;; { .mfb STFD [C3 ] = f81, SIZE FMA f113 = ALPHA, f113, f117 nop __LINE__ } { .mfb STFD [C4 ] = f89, SIZE FMA f121 = ALPHA, f121, f125 nop __LINE__ } ;; { .mfb STFD [C5 ] = f96, SIZE mov f64 = f0 nop __LINE__ } { .mfb STFD [C6 ] = f104, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfb STFD [C5 ] = f97, SIZE mov f80 = f0 nop __LINE__ } { .mfb STFD [C6 ] = f105, SIZE mov f88 = f0 nop __LINE__ } ;; { .mfb STFD [C7 ] = f112, SIZE mov f96 = f0 nop __LINE__ } { .mfb STFD [C8 ] = f120, SIZE mov f104 = f0 nop __LINE__ } ;; { .mfb STFD [C7 ] = f113, SIZE mov f112 = f0 nop __LINE__ } { .mfb STFD [C8 ] = f121, SIZE mov f120 = f0 nop __LINE__ } ;; #else { .mfb nop __LINE__ FMPY f64 = ALPHA, f64 nop __LINE__ } { .mfb nop __LINE__ FMPY f65 = ALPHA, f65 nop __LINE__ } ;; { .mfb nop __LINE__ FMPY f72 = ALPHA, f72 nop __LINE__ } { .mfb nop __LINE__ FMPY f73 = ALPHA, f73 nop __LINE__ } ;; { .mfb nop __LINE__ FMPY f80 = ALPHA, f80 nop __LINE__ } { .mfb nop __LINE__ FMPY f81 = ALPHA, f81 nop __LINE__ } ;; { .mfb nop __LINE__ FMPY f88 = ALPHA, f88 nop __LINE__ } { .mfb nop __LINE__ FMPY f89 = ALPHA, f89 nop __LINE__ } ;; { .mfb STFD [C1 ] = f64, SIZE FMPY f96 = ALPHA, f96 nop __LINE__ } { .mfb STFD [C2 ] = f72, SIZE FMPY f104 = ALPHA, f104 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, SIZE FMPY f97 = ALPHA, f97 nop __LINE__ } { .mfb STFD [C2 ] = f73, SIZE FMPY f105 = ALPHA, f105 nop __LINE__ } ;; { .mfi STFD [C3 ] = f80, SIZE FMPY f112 = ALPHA, f112 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfb STFD [C4 ] = f88, SIZE FMPY f120 = ALPHA, f120 nop __LINE__ } ;; { .mfi STFD [C3 ] = f81, SIZE FMPY f113 = ALPHA, f113 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) 
adds L = -2, L #else nop __LINE__ #endif } { .mfi STFD [C4 ] = f89, SIZE FMPY f121 = ALPHA, f121 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -8, L #else nop __LINE__ #endif } ;; { .mfi STFD [C5 ] = f96, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C6 ] = f104, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C5 ] = f97, SIZE mov f80 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 1, AOFFSET #else nop __LINE__ #endif } { .mfi STFD [C6 ] = f105, SIZE mov f88 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 3, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C7 ] = f112, SIZE mov f96 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 2, KK #else nop __LINE__ #endif } { .mfb STFD [C8 ] = f120, SIZE mov f104 = f0 nop __LINE__ } ;; { .mfi STFD [C7 ] = f113, SIZE mov f112 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C8 ] = f121, SIZE mov f120 = f0 nop __LINE__ } ;; #endif .align 32 .L040: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 1, KK #else adds L = 8, KK #endif #endif tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L049 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmi LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } #else { .mmi shladd BOFFSET = KK8, 3, B add AOFFSET = KK8, AOFFSET nop __LINE__ } ;; { .mmi LDFPD f48, f49 = [BOFFSET], 2 * SIZE nop __LINE__ #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } #endif ;; { .mii LDFPD f50, f51 = [BOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi LDFPD f52, f53 = [BOFFSET], 2 * SIZE LDFD f32 = [AOFFSET], 1 * SIZE adds L = -1, L } ;; { .mmi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET cmp.eq p3, p0 = r0, r0 mov ar.lc = L } { .mmi LDFPD f54, f55 = [BOFFSET], 2 * SIZE adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET nop __LINE__ } ;; .align 32 .L042: { .mfb lfetch.nt1 [PREB], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfb (p12) cmp.ne p3, p0 = 0, L FMA f72 = f32, f49, f72 // A1 * B2 nop __LINE__ } ;; { .mfi (p3) LDFD f40 = [AOFFSET], 1 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1] #else nop __LINE__ #endif FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f76 = [C2] #else nop __LINE__ #endif FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f84 = [C3] #else nop __LINE__ #endif (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f92 = [C4] #else nop __LINE__ #endif (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfi (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 adds L = -1, L } { .mmb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f100 = [C5] (p5) LDFD f108 = [C6] #else nop __LINE__ nop __LINE__ #endif nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } { .mmb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f116 = [C7] (p5) LDFD f124 = [C8] #else nop __LINE__ nop __LINE__ #endif br.cloop.sptk.few .L042 } ;; #if! defined(TRMMKERNEL) && !defined(BETAZERO) FMA f64 = ALPHA, f64, f68 FMA f72 = ALPHA, f72, f76 FMA f80 = ALPHA, f80, f84 FMA f88 = ALPHA, f88, f92 FMA f96 = ALPHA, f96, f100 FMA f104 = ALPHA, f104, f108 FMA f112 = ALPHA, f112, f116 FMA f120 = ALPHA, f120, f124 ;; STFD [C1 ] = f64, SIZE mov f64 = f0 STFD [C2 ] = f72, SIZE mov f72 = f0 ;; STFD [C3 ] = f80, SIZE mov f80 = f0 STFD [C4 ] = f88, SIZE mov f88 = f0 ;; STFD [C5 ] = f96, SIZE mov f96 = f0 STFD [C6 ] = f104, SIZE mov f104 = f0 ;; STFD [C7 ] = f112, SIZE mov f112 = f0 STFD [C8 ] = f120, SIZE mov f120 = f0 ;; #else FMPY f64 = ALPHA, f64 FMPY f72 = ALPHA, f72 FMPY f80 = ALPHA, f80 FMPY f88 = ALPHA, f88 { .mfi FMPY f96 = ALPHA, f96 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f104 = ALPHA, f104 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f112 = ALPHA, f112 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -1, L #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f120 = ALPHA, f120 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -8, L #else nop __LINE__ #endif } ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfi STFD [C2 ] = f72, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C3 ] = f80, SIZE mov f80 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) add AOFFSET = KK8, AOFFSET #else nop __LINE__ #endif } { .mfi STFD [C4 ] = f88, SIZE mov f88 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 3, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C5 ] = f96, SIZE mov f96 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 1, KK #else nop __LINE__ #endif } { .mfi STFD [C6 ] = f104, SIZE mov f104 = f0 nop __LINE__ } ;; { .mfi STFD [C7 ] = f112, SIZE mov f112 = f0 #ifdef 
TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfi STFD [C8 ] = f120, SIZE mov f120 = f0 nop __LINE__ } ;; #endif .align 32 .L049: { .mmi mov B = BOFFSET mov AOFFSET = A #if defined(TRMMKERNEL) && !defined(LEFT) adds KK = 8, KK #else nop __LINE__ #endif } ;; { .mib cmp.lt p6, p0 = 0, J shr BB = K, 3 (p6) br.cond.dptk .L010 } ;; .align 32 .L050: { .mfi mov C1 = C mov f64 = f0 tbit.z p6, p0 = N, 2 } { .mfi add C2 = LDC, C mov f72 = f0 shr I = M, 3 } ;; { .mfi shladd C3 = LDC, 1, C mov f80 = f0 nop __LINE__ } { .mfb mov AOFFSET = A mov f88 = f0 (p6) br.cond.dpnt .L090 } ;; { .mfi cmp.eq p6, p7 = 0, I mov f65 = f0 #if defined(TRMMKERNEL) && defined(LEFT) mov KK = OFFSET #else nop __LINE__ #endif } { .mfi shladd C4 = LDC, 1, C2 mov f73 = f0 nop __LINE__ } ;; { .mfi nop __LINE__ mov f81 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb shladd C = LDC, 2, C mov f89 = f0 (p6) br.cond.dpnt .L060 } ;; .align 32 .L052: #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mfb LDFPD f48, f49 = [B] mov f66 = f0 nop __LINE__ } { .mfb adds BOFFSET = 2 * SIZE, B mov f74 = f0 nop __LINE__ } ;; #else { .mfi shladd BOFFSET = KK8, 2, B mov f66 = f0 shladd AOFFSET = KK8, 3, AOFFSET } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f74 = f0 nop __LINE__ } ;; #endif ;; { .mfi LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f82 = f0 #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 8, KK #else adds L = 4, KK #endif #endif } { .mfi setf.d f84 = r0 mov f90 = f0 nop __LINE__ } ;; { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f67 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f75 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; { .mfi LDFPD f36, f37 = [AOFFSET], 2 * SIZE mov f83 = f0 tbit.z p12, p0 = L, 0 } { .mfi setf.d f91 = r0 mov f68 = f0 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } ;; { .mfi CPREFETCH [PREC], LDC mov f76 = f0 adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } { .mfi LDFPD f38, f39 = [AOFFSET], 2 * SIZE mov f92 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi CPREFETCH [PREC], LDC mov f69 = f0 shr L = L, 1 } { .mmf setf.d f77 = r0 setf.d f85 = r0 mov f93 = f0 } ;; { .mfi CPREFETCH [PREC], LDC mov f70 = f0 adds L = -1, L } { .mmf setf.d f78 = r0 setf.d f86 = r0 mov f94 = f0 } ;; { .mfi CPREFETCH [PREC] mov f71 = f0 mov ar.lc = L } { .mmf setf.d f79 = r0 setf.d f87 = r0 mov f95 = f0 } ;; .align 32 .L053: { .mfb lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 adds C9 = 4 * SIZE, C1 } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 adds C10 = 4 * SIZE, C2 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 adds C11 = 4 * SIZE, C3 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 adds C12 = 4 * SIZE, C4 } { .mfb nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { 
.mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f76 = f36, f49, f76 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f84 = f36, f50, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f92 = f36, f51, f92 // A5 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = f37, f49, f77 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f85 = f37, f50, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f93 = f37, f51, f93 // A6 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f78 = f38, f49, f78 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f86 = f38, f50, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f94 = f38, f51, f94 // A7 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f79 = f39, f49, f79 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f87 = f39, f50, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f95 = f39, f51, f95 // A8 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f96 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f97 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f98 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f99 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f100 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f101 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f102 = [C1 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f103 = [C9 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f91 = f43, f59, f91 // A4 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f104 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f105 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f76 = f44, f57, f76 // A5 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f106 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f84 = f44, f58, f84 // A5 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f107 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f92 = f44, f59, f92 // A5 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f108 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f109 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f77 = f45, f57, f77 // A6 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f110 = [C2 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f85 = f45, f58, f85 // A6 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f111 = [C10], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f93 = f45, f59, f93 // A6 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f112 = [C3 ], SIZE #else nop __LINE__ #endif (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f113 = [C11], SIZE #else nop __LINE__ #endif (p3) FMA f78 = f46, f57, f78 // A7 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f114 = [C3 ], SIZE #else nop __LINE__ #endif (p3) FMA f86 = f46, f58, f86 // A7 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f115 = [C11], SIZE #else nop __LINE__ #endif (p3) FMA f94 = f46, f59, f94 // A7 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f116 = [C3 ], SIZE #else nop __LINE__ #endif (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f117 = [C11], SIZE #else nop __LINE__ #endif (p3) FMA f79 = f47, f57, f79 // A8 * B2 nop __LINE__ } ;; { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f118 = [C3 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f87 = f47, f58, f87 // A8 * B3 adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f119 = [C11], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f95 = f47, f59, f95 // A8 * B4 br.cloop.sptk.few .L053 } ;; .align 32 .L058: #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) { .mfi LDFD f120 = [C4 ], SIZE FMA f64 = ALPHA, f64, f96 cmp.ne p6, p0 = 1, I } { .mfb LDFD f121 = [C12], SIZE FMA f68 = ALPHA, f68, f97 nop __LINE__ } ;; { .mfi LDFD f122 = [C4 ], SIZE FMA f65 = ALPHA, f65, f98 adds I = -1, I } { .mfb LDFD f123 = [C12], SIZE FMA f69 = ALPHA, f69, f99 nop __LINE__ } ;; { .mfb LDFD f124 = [C4 ], SIZE FMA f66 = ALPHA, f66, f100 nop __LINE__ } { .mfb LDFD f125 = [C12], SIZE FMA f70 = ALPHA, f70, f101 nop __LINE__ } ;; { .mfb LDFD f126 = [C4 ], -3 * SIZE FMA f67 = ALPHA, f67, f102 nop __LINE__ } { .mfb LDFD f127 = [C12], -3 * SIZE FMA f71 = ALPHA, f71, f103 nop __LINE__ } ;; { .mfb STFD [C1 ] = f64, SIZE FMA f72 = ALPHA, f72, f104 nop __LINE__ } { .mfb STFD [C9 ] = f68, SIZE FMA f76 = ALPHA, f76, f105 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, SIZE FMA f73 = ALPHA, f73, f106 nop __LINE__ } { .mfb STFD [C9 ] = f69, SIZE FMA f77 = ALPHA, f77, f107 nop __LINE__ } ;; { .mfb STFD [C1 ] = f66, SIZE FMA f74 = ALPHA, f74, f108 nop __LINE__ } { .mfb STFD [C9 ] = f70, SIZE FMA f78 = ALPHA, f78, f109 nop __LINE__ } ;; { .mfb STFD [C1 ] = f67, 5 * SIZE FMA f75 = ALPHA, f75, f110 nop __LINE__ } { .mfb STFD [C9 ] = f71, 5 * SIZE FMA f79 = ALPHA, f79, f111 nop __LINE__ } ;; { .mfb STFD [C2 ] = f72, SIZE FMA f80 = ALPHA, f80, f112 nop __LINE__ } { .mfb STFD [C10] = f76, SIZE FMA f84 = ALPHA, f84, f113 nop __LINE__ } ;; { .mfb STFD [C2 ] = f73, SIZE FMA f81 = ALPHA, f81, f114 nop __LINE__ } { .mfb STFD [C10] = f77, SIZE FMA f85 = ALPHA, f85, f115 nop __LINE__ } ;; { .mfb STFD [C2 ] = f74, SIZE FMA f82 = ALPHA, f82, f116 nop __LINE__ } { .mfb STFD [C10] = f78, SIZE FMA f86 = ALPHA, f86, f117 nop __LINE__ } ;; { .mfb STFD [C2 ] = f75, 5 * SIZE FMA f83 = ALPHA, f83, f118 nop __LINE__ } { .mfb STFD [C10] = f79, 5 * SIZE FMA f87 = ALPHA, f87, f119 nop __LINE__ } ;; { .mfb STFD [C3 ] = f80, SIZE FMA f88 = ALPHA, f88, f120 nop __LINE__ } { .mfb STFD [C11] = f84, SIZE FMA f92 = ALPHA, f92, f121 nop __LINE__ } ;; { .mfb STFD [C3 ] = f81, SIZE FMA f89 = ALPHA, f89, f122 nop __LINE__ } { .mfb STFD [C11] = f85, SIZE FMA f93 = ALPHA, f93, f123 nop __LINE__ } ;; { .mfb STFD [C3 ] = f82, SIZE FMA f90 = ALPHA, f90, f124 nop __LINE__ } { .mfb STFD [C11] = f86, SIZE FMA f94 = ALPHA, f94, f125 nop __LINE__ } ;; { .mfb STFD [C3 ] = f83, 5 * SIZE FMA f91 = ALPHA, f91, f126 nop __LINE__ } { .mfb STFD [C11] = f87, 5 * SIZE FMA f95 = ALPHA, f95, f127 nop __LINE__ } ;; { .mfb STFD [C4 ] = f88, SIZE mov f64 = f0 nop __LINE__ } { .mfb STFD [C12] = f92, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfb STFD [C4 ] = f89, SIZE mov f80 = f0 nop __LINE__ } { .mfb STFD [C12] = f93, SIZE mov f88 = f0 nop __LINE__ } ;; { .mfb STFD [C4 ] = f90, SIZE mov f65 = f0 nop __LINE__ } { .mfb STFD [C12] = f94, SIZE mov f73 = f0 nop __LINE__ } ;; { .mfb STFD [C4 ] = f91, 5 * SIZE mov f81 = f0 nop __LINE__ } { .mfb STFD [C12] = f95, 5 * SIZE mov f89 = f0 (p6) br.cond.dptk .L052 } ;; #else { .mfi nop __LINE__ FMPY f64 = ALPHA, f64 cmp.ne p6, p0 = 1, I } { .mfb nop __LINE__ FMPY f68 = ALPHA, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f65 = ALPHA, f65 adds I = -1, I } { .mfb nop __LINE__ FMPY f69 = ALPHA, f69 nop __LINE__ } ;; { .mfb nop __LINE__ FMPY f66 = ALPHA, f66 nop __LINE__ } { .mfb nop __LINE__ FMPY f70 = ALPHA, f70 nop __LINE__ } ;; { .mfb nop __LINE__ FMPY f67 = ALPHA, f67 nop __LINE__ } { .mfb nop __LINE__ FMPY f71 = ALPHA, f71 nop __LINE__ } ;; { .mfb STFD [C1 ] = f64, SIZE FMPY f72 = ALPHA, f72 nop __LINE__ } { .mfb STFD [C9 ] = f68, SIZE FMPY f76 = ALPHA, f76 nop __LINE__ } 
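// note: this TRMMKERNEL/BETAZERO path only scales the 8x4 tile by ALPHA
// (FMPY) and stores it; C is never reloaded here, and the TRMM KK/AOFFSET/
// BOFFSET bookkeeping is folded into the store bundles that follow.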
;; { .mfb STFD [C1 ] = f65, SIZE FMPY f73 = ALPHA, f73 nop __LINE__ } { .mfb STFD [C9 ] = f69, SIZE FMPY f77 = ALPHA, f77 nop __LINE__ } ;; { .mfb STFD [C1 ] = f66, SIZE FMPY f74 = ALPHA, f74 nop __LINE__ } { .mfb STFD [C9 ] = f70, SIZE FMPY f78 = ALPHA, f78 nop __LINE__ } ;; { .mfb STFD [C1 ] = f67, 5 * SIZE FMPY f75 = ALPHA, f75 nop __LINE__ } { .mfb STFD [C9 ] = f71, 5 * SIZE FMPY f79 = ALPHA, f79 nop __LINE__ } ;; { .mfb STFD [C2 ] = f72, SIZE FMPY f80 = ALPHA, f80 nop __LINE__ } { .mfb STFD [C10] = f76, SIZE FMPY f84 = ALPHA, f84 nop __LINE__ } ;; { .mfb STFD [C2 ] = f73, SIZE FMPY f81 = ALPHA, f81 nop __LINE__ } { .mfb STFD [C10] = f77, SIZE FMPY f85 = ALPHA, f85 nop __LINE__ } ;; { .mfb STFD [C2 ] = f74, SIZE FMPY f82 = ALPHA, f82 nop __LINE__ } { .mfb STFD [C10] = f78, SIZE FMPY f86 = ALPHA, f86 nop __LINE__ } ;; { .mfb STFD [C2 ] = f75, 5 * SIZE FMPY f83 = ALPHA, f83 nop __LINE__ } { .mfb STFD [C10] = f79, 5 * SIZE FMPY f87 = ALPHA, f87 nop __LINE__ } ;; { .mfb STFD [C3 ] = f80, SIZE FMPY f88 = ALPHA, f88 nop __LINE__ } { .mfb STFD [C11] = f84, SIZE FMPY f92 = ALPHA, f92 nop __LINE__ } ;; { .mfb STFD [C3 ] = f81, SIZE FMPY f89 = ALPHA, f89 nop __LINE__ } { .mfb STFD [C11] = f85, SIZE FMPY f93 = ALPHA, f93 nop __LINE__ } ;; { .mfi STFD [C3 ] = f82, SIZE FMPY f90 = ALPHA, f90 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfb STFD [C11] = f86, SIZE FMPY f94 = ALPHA, f94 nop __LINE__ } ;; { .mfi STFD [C3 ] = f83, 5 * SIZE FMPY f91 = ALPHA, f91 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -8, L #else nop __LINE__ #endif } { .mfi STFD [C11] = f87, 5 * SIZE FMPY f95 = ALPHA, f95 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -4, L #else nop __LINE__ #endif } ;; { .mfi STFD [C4 ] = f88, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C12] = f92, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C4 ] = f89, SIZE mov f80 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 3, AOFFSET #else nop __LINE__ #endif } { .mfi STFD [C12] = f93, SIZE mov f88 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 2, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C4 ] = f90, SIZE mov f65 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 8, KK #else nop __LINE__ #endif } { .mfb STFD [C12] = f94, SIZE mov f73 = f0 nop __LINE__ } ;; { .mfi STFD [C4 ] = f91, 5 * SIZE mov f81 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C12] = f95, 5 * SIZE mov f89 = f0 (p6) br.cond.dptk .L052 } ;; #endif .align 32 .L060: { .mfi nop __LINE__ mov f66 = f0 tbit.z p6, p7 = M, 2 } { .mfb #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 4, KK #else adds L = 4, KK #endif #endif mov f74 = f0 (p6) br.cond.dptk .L070 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mfb LDFPD f48, f49 = [B] mov f82 = f0 nop __LINE__ } { .mfi adds BOFFSET = 2 * SIZE, B mov f90 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L 
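// note: trip-count setup for the 4x4 tile: L gets +1 just above, its low bit
// is tested into p12 below, and L is then halved and decremented before
// loading ar.lc, since the inner loop body is software-unrolled by two.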
#endif } ;; #else { .mfi shladd BOFFSET = KK8, 2, B mov f82 = f0 shladd AOFFSET = KK8, 2, AOFFSET } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f90 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #endif ;; { .mii LDFPD f32, f33 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f67 = f0 adds L = -1, L } { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET mov f75 = f0 nop __LINE__ } ;; { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 mov ar.lc = L } { .mfi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET mov f91 = f0 cmp.eq p3, p0 = r0, r0 } ;; .align 32 .L062: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 (p5) adds C9 = 2 * SIZE, C1 } { .mfi nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 (p5) adds C10 = 2 * SIZE, C2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 (p5) adds C11 = 2 * SIZE, C3 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 (p5) adds C12 = 2 * SIZE, C4 } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1 ], SIZE #else nop __LINE__ #endif FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f70 = [C9 ], SIZE #else nop __LINE__ #endif FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f69 = [C1 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f71 = [C9 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f76 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f78 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f77 = [C2 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f79 = [C10], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f84 = [C3 ], SIZE #else nop __LINE__ #endif (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f86 = [C11], SIZE #else nop __LINE__ #endif (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f85 = [C3 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f87 = [C11], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f92 = [C4 ], SIZE #else nop __LINE__ #endif (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f94 = [C12], SIZE #else nop __LINE__ #endif (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f93 = [C4 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f83 = f43, f58, f83 // A4 * B3 adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f95 = [C12], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f91 = f43, f59, f91 // A4 * B4 br.cloop.sptk.few .L062 } ;; #if! defined(TRMMKERNEL) && !defined(BETAZERO) FMA f64 = ALPHA, f64, f68 FMA f66 = ALPHA, f66, f70 FMA f65 = ALPHA, f65, f69 FMA f67 = ALPHA, f67, f71 FMA f72 = ALPHA, f72, f76 FMA f74 = ALPHA, f74, f78 FMA f73 = ALPHA, f73, f77 FMA f75 = ALPHA, f75, f79 ;; { .mfb STFD [C1 ] = f64, SIZE FMA f80 = ALPHA, f80, f84 nop __LINE__ } { .mfb STFD [C9 ] = f66, SIZE FMA f82 = ALPHA, f82, f86 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, 3 * SIZE FMA f81 = ALPHA, f81, f85 nop __LINE__ } { .mfb STFD [C9 ] = f67, 3 * SIZE FMA f83 = ALPHA, f83, f87 nop __LINE__ } ;; { .mfb STFD [C2 ] = f72, SIZE FMA f88 = ALPHA, f88, f92 nop __LINE__ } { .mfb STFD [C10] = f74, SIZE FMA f90 = ALPHA, f90, f94 nop __LINE__ } ;; { .mfb STFD [C2 ] = f73, 3 * SIZE FMA f89 = ALPHA, f89, f93 nop __LINE__ } { .mfb STFD [C10] = f75, 3 * SIZE FMA f91 = ALPHA, f91, f95 nop __LINE__ } ;; { .mfb STFD [C3 ] = f80, SIZE mov f80 = f0 nop __LINE__ } { .mfb STFD [C11] = f82, SIZE mov f64 = f0 nop __LINE__ } ;; { .mfb STFD [C3 ] = f81, 3 * SIZE mov f81 = f0 nop __LINE__ } { .mfb STFD [C11] = f83, 3 * SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C4 ] = f88, SIZE mov f88 = f0 adds L = 1, K } { .mfb STFD [C12] = f90, SIZE mov f65 = f0 nop __LINE__ } ;; { .mfi STFD [C4 ] = f89, 3 * SIZE mov f89 = f0 shr L = L, 1 } { .mfb STFD [C12] = f91, 3 * SIZE mov f73 = f0 nop __LINE__ } ;; #else FMPY f64 = ALPHA, f64 FMPY f66 = ALPHA, f66 FMPY f65 = ALPHA, f65 FMPY f67 = ALPHA, f67 FMPY f72 = ALPHA, f72 FMPY f74 = ALPHA, f74 FMPY f73 = ALPHA, f73 FMPY f75 = ALPHA, f75 ;; { .mfb STFD [C1 ] = f64, SIZE FMPY f80 = ALPHA, f80 nop __LINE__ } { .mfb STFD [C9 ] = f66, SIZE FMPY f82 = ALPHA, f82 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, 3 * SIZE FMPY f81 = ALPHA, f81 nop __LINE__ } { .mfb STFD [C9 ] = f67, 3 * SIZE FMPY f83 = ALPHA, f83 nop __LINE__ } ;; { .mfi STFD [C2 ] = f72, SIZE FMPY f88 = ALPHA, f88 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfb STFD [C10] = f74, SIZE FMPY f90 = ALPHA, f90 nop __LINE__ } ;; { .mfi STFD [C2 ] = f73, 3 * SIZE FMPY f89 = ALPHA, f89 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -4, L 
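// note: TRMM (LEFT && TRANSA): L was set to K - KK above; the -4 accounts for
// this 4x4 tile so the KK8 / AOFFSET / BOFFSET updates below step past it.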
#else nop __LINE__ #endif } { .mfi STFD [C10] = f75, 3 * SIZE FMPY f91 = ALPHA, f91 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -4, L #else nop __LINE__ #endif } ;; { .mfi STFD [C3 ] = f80, SIZE mov f80 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C11] = f82, SIZE mov f64 = f0 nop __LINE__ } ;; { .mfi STFD [C3 ] = f81, 3 * SIZE mov f81 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 2, AOFFSET #else nop __LINE__ #endif } { .mfi STFD [C11] = f83, 3 * SIZE mov f72 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 2, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C4 ] = f88, SIZE mov f88 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 4, KK #else nop __LINE__ #endif } { .mfb STFD [C12] = f90, SIZE mov f65 = f0 nop __LINE__ } ;; { .mfi STFD [C4 ] = f89, 3 * SIZE mov f89 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C12] = f91, 3 * SIZE mov f73 = f0 nop __LINE__ } ;; #endif .align 32 .L070: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 2, KK #else adds L = 4, KK #endif #endif tbit.z p6,p7 = M, 1 (p6) br.cond.dptk .L080 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmi LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #else { .mmi shladd BOFFSET = KK8, 2, B shladd AOFFSET = KK8, 1, AOFFSET nop __LINE__ } ;; { .mmi LDFPD f48, f49 = [BOFFSET], 2 * SIZE #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif nop __LINE__ } ;; #endif { .mii cmp.eq p3, p0 = r0, r0 tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE adds L = -1, L adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } ;; { .mmi LDFPD f50, f51 = [BOFFSET], 2 * SIZE adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET mov ar.lc = L } ;; .align 32 .L072: { .mfb lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 } { .mmf #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1 ], SIZE (p5) LDFD f76 = [C2 ], SIZE #else nop __LINE__ nop __LINE__ #endif FMA f89 = f33, f51, f89 // A2 * B4 } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mmf #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f69 = [C1 ], -1 * SIZE (p5) LDFD f77 = [C2 ], -1 * SIZE #else nop __LINE__ nop __LINE__ #endif (p3) FMA f72 = f40, f57, f72 // A1 * B2 } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mmf #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f84 = [C3 ], SIZE (p5) LDFD f92 = [C4 ], SIZE #else nop __LINE__ nop __LINE__ #endif (p3) FMA f88 = f40, f59, f88 // A1 * B4 } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f85 = [C3 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f93 = [C4 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f89 = f41, f59, f89 // A2 * B4 br.cloop.sptk.few .L072 } ;; #if! defined(TRMMKERNEL) && !defined(BETAZERO) FMA f64 = ALPHA, f64, f68 FMA f65 = ALPHA, f65, f69 FMA f72 = ALPHA, f72, f76 FMA f73 = ALPHA, f73, f77 FMA f80 = ALPHA, f80, f84 FMA f81 = ALPHA, f81, f85 FMA f88 = ALPHA, f88, f92 FMA f89 = ALPHA, f89, f93 ;; { .mfb STFD [C1 ] = f64, SIZE mov f64 = f0 nop __LINE__ } { .mfb STFD [C2 ] = f72, SIZE mov f72 = f0 nop __LINE__ } ;; { .mmi STFD [C1 ] = f65, SIZE STFD [C2 ] = f73, SIZE nop __LINE__ } ;; { .mfi STFD [C3 ] = f80, SIZE mov f80 = f0 adds L = 1, K } { .mfb STFD [C4 ] = f88, SIZE mov f88 = f0 nop __LINE__ } ;; { .mmi STFD [C3 ] = f81, SIZE STFD [C4 ] = f89, SIZE shr L = L, 1 } ;; #else FMPY f64 = ALPHA, f64 FMPY f65 = ALPHA, f65 ;; { .mfi nop __LINE__ FMPY f72 = ALPHA, f72 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f73 = ALPHA, f73 nop __LINE__ } ;; { .mfi FMPY f80 = ALPHA, f80 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -2, L #else nop __LINE__ #endif } { .mfi FMPY f81 = ALPHA, f81 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -4, L #else nop __LINE__ #endif } ;; { .mfi nop __LINE__ FMPY f88 = ALPHA, f88 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f89 = ALPHA, f89 nop __LINE__ } ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 1, AOFFSET #else nop __LINE__ #endif } { .mfb STFD [C2 ] = f72, SIZE mov f72 = f0 nop __LINE__ } ;; { .mmi STFD [C1 ] = f65, SIZE STFD [C2 ] = f73, SIZE #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 2, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C3 ] = f80, SIZE mov f80 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 2, KK #else nop __LINE__ #endif } { .mfb STFD [C4 ] = f88, SIZE mov f88 = f0 nop __LINE__ } ;; { .mmi STFD [C3 ] = f81, SIZE STFD [C4 ] = f89, SIZE #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; #endif .align 32 .L080: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 1, KK #else adds L = 4, KK #endif #endif tbit.z p6,p7 = M, 0 (p6) br.cond.dptk .L089 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmi LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B #ifndef TRMMKERNEL adds L = 1, K #else 
adds L = 1, L #endif } ;; #else { .mmi shladd BOFFSET = KK8, 2, B add AOFFSET = KK8, AOFFSET nop __LINE__ } ;; { .mmi LDFPD f48, f49 = [BOFFSET], 2 * SIZE #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif nop __LINE__ } ;; #endif { .mii LDFD f32 = [AOFFSET], 1 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi nop __LINE__ nop __LINE__ adds L = -1, L } ;; { .mmi LDFPD f50, f51 = [BOFFSET], 2 * SIZE cmp.eq p3, p0 = r0, r0 mov ar.lc = L } ;; .align 32 .L082: { .mfb cmp.ne p4, p5 = 0, L FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi (p12) cmp.ne p3, p0 = 0, L FMA f72 = f32, f49, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfb (p3) LDFD f40 = [AOFFSET], 1 * SIZE FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1] #else nop __LINE__ #endif (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mmf (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 } { .mmf #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f76 = [C2] (p5) LDFD f84 = [C3] #else nop __LINE__ nop __LINE__ #endif (p3) FMA f88 = f40, f59, f88 // A1 * B4 } ;; { .mib (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE nop __LINE__ nop __LINE__ } { .mmb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f92 = [C4] #else nop __LINE__ #endif adds L = -1, L br.cloop.sptk.few .L082 } ;; #if! defined(TRMMKERNEL) && !defined(BETAZERO) FMA f64 = ALPHA, f64, f68 FMA f72 = ALPHA, f72, f76 FMA f80 = ALPHA, f80, f84 FMA f88 = ALPHA, f88, f92 ;; STFD [C1 ] = f64, SIZE STFD [C2 ] = f72, SIZE STFD [C3 ] = f80, SIZE STFD [C4 ] = f88, SIZE ;; #else { .mfi nop __LINE__ FMPY f64 = ALPHA, f64 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f72 = ALPHA, f72 nop __LINE__ } ;; { .mfi FMPY f80 = ALPHA, f80 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -1, L #else nop __LINE__ #endif } { .mfi FMPY f88 = ALPHA, f88 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -4, L #else nop __LINE__ #endif } ;; { .mmi #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif ;; #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) add AOFFSET = KK8, AOFFSET #else nop __LINE__ #endif #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 2, BOFFSET #else nop __LINE__ #endif } ;; { .mmi STFD [C1 ] = f64, SIZE STFD [C2 ] = f72, SIZE #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 1, KK #else nop __LINE__ #endif } ;; { .mmi STFD [C3 ] = f80, SIZE STFD [C4 ] = f88, SIZE #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; #endif .align 32 .L089: { .mmi mov B = BOFFSET mov AOFFSET = A #if defined(TRMMKERNEL) && !defined(LEFT) adds KK = 4, KK #else nop __LINE__ #endif } ;; .align 16 .L090: { .mfi mov C1 = C mov f64 = f0 tbit.z p6, p0 = N, 1 } { .mfi add C2 = LDC, C mov f72 = f0 shr I = M, 3 } ;; { .mfi setf.d f66 = r0 mov f65 = f0 #if defined(TRMMKERNEL) && defined(LEFT) mov KK = OFFSET 
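// note: TRMM with LEFT resets KK to OFFSET at the start of this column panel
// (the N & 2 case handled from .L090 on), mirroring the reset done at .L050.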
#else nop __LINE__ #endif } { .mfb mov AOFFSET = A mov f73 = f0 (p6) br.cond.dpnt .L130 } ;; { .mfi #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif mov f67 = f0 shladd C = LDC, 1, C } { .mfb cmp.eq p6, p7 = 0, I mov f74 = f0 (p6) br.cond.dpnt .L100 } ;; .align 32 .L092: #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mfb LDFPD f48, f49 = [B] mov f68 = f0 nop __LINE__ } { .mfb adds BOFFSET = 2 * SIZE, B mov f79 = f0 nop __LINE__ } ;; #else { .mfi shladd BOFFSET = KK8, 1, B mov f68 = f0 shladd AOFFSET = KK8, 3, AOFFSET } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f79 = f0 nop __LINE__ } ;; #endif { .mfi LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f75 = f0 #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 8, KK #else adds L = 2, KK #endif #endif } ;; { .mfi adds PREC = CPREFETCHSIZE * SIZE, C1 mov f76 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f69 = f0 tbit.z p12, p0 = L, 0 } { .mfi cmp.eq p3, p0 = r0, r0 mov f77 = f0 shr L = L, 1 } ;; { .mfi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET adds L = -1, L } { .mmf LDFPD f36, f37 = [AOFFSET], 2 * SIZE CPREFETCH [PREC], LDC mov f70 = f0 } ;; { .mfi LDFPD f38, f39 = [AOFFSET], 2 * SIZE mov f78 = f0 mov ar.lc = L } { .mfi CPREFETCH [PREC] mov f71 = f0 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } ;; .align 32 .L093: /* 1 */ { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 adds C9 = 4 * SIZE, C1 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 adds C10 = 4 * SIZE, C2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 adds C11 = 4 * SIZE, C3 } { .mfi nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 adds C12 = 4 * SIZE, C4 } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f96 = [C1 ], SIZE #else nop __LINE__ #endif FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f97 = [C9 ], SIZE #else nop __LINE__ #endif FMA f76 = f36, f49, f76 // A5 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f98 = [C1 ], SIZE #else nop __LINE__ #endif FMA f77 = f37, f49, f77 // A6 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f99 = [C9 ], SIZE #else nop __LINE__ #endif FMA f78 = f38, f49, f78 // A7 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f100 = [C1 ], SIZE #else nop __LINE__ #endif FMA f79 = f39, f49, f79 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f101 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f102 = [C1 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f103 = [C9 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f104 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f105 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f76 = f44, f57, f76 // A5 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f106 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f107 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f77 = f45, f57, f77 // A6 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f108 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f109 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f78 = f46, f57, f78 // A7 * B2 nop __LINE__ } ;; { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f110 = [C2 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f71 = f47, f56, f71 // A8 * B1 adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f111 = [C10], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f79 = f47, f57, f79 // A8 * B2 br.cloop.sptk.few .L093 } ;; #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) { .mfi nop __LINE__ FMA f64 = ALPHA, f64, f96 cmp.ne p6, p0 = 1, I } { .mfb nop __LINE__ FMA f68 = ALPHA, f68, f97 nop __LINE__ } ;; { .mfi nop __LINE__ FMA f65 = ALPHA, f65, f98 adds I = -1, I } { .mfb nop __LINE__ FMA f69 = ALPHA, f69, f99 nop __LINE__ } ;; { .mfi nop __LINE__ FMA f66 = ALPHA, f66, f100 nop __LINE__ } { .mfb nop __LINE__ FMA f70 = ALPHA, f70, f101 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f67 = ALPHA, f67, f102 nop __LINE__ } { .mfb nop __LINE__ FMA f71 = ALPHA, f71, f103 nop __LINE__ } ;; { .mfb STFD [C1 ] = f64, SIZE FMA f72 = ALPHA, f72, f104 nop __LINE__ } { .mfb STFD [C9 ] = f68, SIZE FMA f76 = ALPHA, f76, f105 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, SIZE FMA f73 = ALPHA, f73, f106 nop __LINE__ } { .mfb STFD [C9 ] = f69, SIZE FMA f77 = ALPHA, f77, f107 nop __LINE__ } ;; { .mfb STFD [C1 ] = f66, SIZE FMA f74 = ALPHA, f74, f108 nop __LINE__ } { .mfb STFD [C9 ] = f70, SIZE FMA f78 = ALPHA, f78, f109 nop __LINE__ } ;; { .mfb STFD [C1 ] = f67, 5 * SIZE FMA f75 = ALPHA, f75, f110 nop __LINE__ } { .mfb STFD [C9 ] = f71, 5 * SIZE FMA f79 = ALPHA, f79, f111 nop __LINE__ } ;; { .mfb STFD [C2 ] = f72, SIZE mov f64 = f0 nop __LINE__ } { .mfb STFD [C10] = f76, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfb STFD [C2 ] = f73, SIZE mov f65 = f0 nop __LINE__ } { .mfb STFD [C10] = f77, SIZE mov f73 = f0 nop __LINE__ } ;; { .mfb STFD [C2 ] = f74, SIZE mov f66 = f0 nop __LINE__ } { .mfb STFD [C10] = f78, SIZE mov f74 = f0 nop __LINE__ } ;; { .mfb STFD [C2 ] = f75, 5 * SIZE mov f67 = f0 nop __LINE__ } { .mfb STFD [C10] = f79, 5 * SIZE (p6) br.cond.dptk .L092 } ;; #else { .mfi nop __LINE__ FMPY f64 = ALPHA, f64 cmp.ne p6, p0 = 1, I } { .mfb nop __LINE__ FMPY f68 = ALPHA, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f65 = ALPHA, f65 adds I = -1, I } { .mfb nop __LINE__ FMPY f69 = ALPHA, f69 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f66 = ALPHA, f66 nop __LINE__ } { .mfb nop __LINE__ FMPY f70 = ALPHA, f70 nop __LINE__ } ;; { .mfb nop __LINE__ FMPY f67 = ALPHA, f67 nop __LINE__ } { .mfb nop __LINE__ FMPY f71 = ALPHA, f71 nop __LINE__ } ;; { .mfb STFD [C1 ] = f64, SIZE FMPY f72 = ALPHA, f72 nop __LINE__ } { .mfb STFD [C9 ] = f68, SIZE FMPY f76 = ALPHA, f76 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, SIZE FMPY f73 = ALPHA, f73 nop __LINE__ } { .mfb STFD [C9 ] = f69, SIZE FMPY f77 = ALPHA, f77 nop __LINE__ } ;; { .mfi STFD [C1 ] = f66, SIZE FMPY f74 = ALPHA, f74 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfb STFD [C9 ] = f70, SIZE FMPY f78 = ALPHA, f78 nop __LINE__ } ;; { .mfi STFD [C1 ] = f67, 5 * SIZE FMPY f75 = ALPHA, f75 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -8, L #else nop __LINE__ #endif } { .mfi STFD [C9 ] = f71, 5 * SIZE FMPY f79 = ALPHA, f79 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -2, L #else nop __LINE__ #endif } ;; { .mfi STFD [C2 ] = f72, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C10] = f76, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C2 ] = f73, SIZE mov f65 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 3, AOFFSET #else nop __LINE__ #endif } { .mfi STFD [C10] = f77, SIZE mov f73 = f0 #if defined(TRMMKERNEL) && \ 
((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 1, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C2 ] = f74, SIZE mov f66 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 8, KK #else nop __LINE__ #endif } { .mfb STFD [C10] = f78, SIZE mov f74 = f0 nop __LINE__ } ;; { .mfi STFD [C2 ] = f75, 5 * SIZE mov f67 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mib STFD [C10] = f79, 5 * SIZE nop __LINE__ (p6) br.cond.dptk .L092 } ;; #endif .align 32 .L100: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 4, KK #else adds L = 2, KK #endif #endif tbit.z p6, p7 = M, 2 (p6) br.cond.dptk .L110 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmf LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B mov f75 = f0 } { .mii nop __LINE__ #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #else { .mfi shladd BOFFSET = KK8, 1, B mov f75 = f0 shladd AOFFSET = KK8, 2, AOFFSET } ;; { .mmi LDFPD f48, f49 = [BOFFSET], 2 * SIZE nop __LINE__ adds L = 1, L } ;; #endif ;; { .mii adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi LDFPD f32, f33 = [AOFFSET], 2 * SIZE nop __LINE__ adds L = -1, L } ;; { .mmi LDFPD f34, f35 = [AOFFSET], 2 * SIZE cmp.eq p3, p0 = r0, r0 mov ar.lc = L } ;; .align 32 .L102: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 adds C9 = 2 * SIZE, C1 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 adds C10 = 2 * SIZE, C2 } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1 ], SIZE #else nop __LINE__ #endif FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f70 = [C9 ], SIZE #else nop __LINE__ #endif FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f69 = [C1 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f71 = [C9 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f76 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 adds L = -1, L } { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f78 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f75 = f43, f57, f75 // A4 * B2 br.cloop.sptk.few .L102 } ;; #if! defined(TRMMKERNEL) && !defined(BETAZERO) { .mfb LDFD f77 = [C2 ], -1 * SIZE FMA f64 = ALPHA, f64, f68 nop __LINE__ } { .mfb LDFD f79 = [C10], -1 * SIZE FMA f66 = ALPHA, f66, f70 nop __LINE__ } ;; FMA f65 = ALPHA, f65, f69 adds L = 1, K FMA f67 = ALPHA, f67, f71 ;; FMA f72 = ALPHA, f72, f76 shr L = L, 1 FMA f74 = ALPHA, f74, f78 FMA f73 = ALPHA, f73, f77 FMA f75 = ALPHA, f75, f79 ;; { .mmf STFD [C1 ] = f64, SIZE STFD [C9 ] = f66, SIZE mov f64 = f0 } ;; { .mmf STFD [C1 ] = f65, 3 * SIZE STFD [C9 ] = f67, 3 * SIZE mov f65 = f0 } ;; { .mmf STFD [C2 ] = f72, SIZE STFD [C10] = f74, SIZE mov f72 = f0 } ;; { .mmf STFD [C2 ] = f73, 3 * SIZE STFD [C10] = f75, 3 * SIZE mov f73 = f0 } ;; #else { .mfb nop __LINE__ FMPY f64 = ALPHA, f64 nop __LINE__ } { .mfb nop __LINE__ FMPY f66 = ALPHA, f66 nop __LINE__ } ;; FMPY f65 = ALPHA, f65 FMPY f67 = ALPHA, f67 ;; { .mfi nop __LINE__ FMPY f72 = ALPHA, f72 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f74 = ALPHA, f74 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f73 = ALPHA, f73 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -4, L #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f75 = ALPHA, f75 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -2, L #else nop __LINE__ #endif } ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mmi STFD [C9 ] = f66, SIZE nop __LINE__ nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, 3 * SIZE mov f65 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 2, AOFFSET #else nop __LINE__ #endif } { .mmi STFD [C9 ] = f67, 3 * SIZE nop __LINE__ #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 1, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C2 ] = f72, SIZE mov f72 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 4, KK #else nop __LINE__ #endif } { .mmi STFD [C10] = f74, SIZE nop __LINE__ nop __LINE__ } ;; { .mfi STFD [C2 ] = f73, 3 * SIZE mov f73 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mib STFD [C10] = f75, 3 * SIZE nop __LINE__ nop __LINE__ } ;; #endif .align 32 .L110: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 2, KK #else adds L = 2, KK #endif #endif tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L120 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmi LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #else { .mmi shladd BOFFSET = KK8, 1, B shladd AOFFSET = KK8, 1, AOFFSET } ;; { .mmi LDFPD f48, f49 = [BOFFSET], 2 * SIZE nop __LINE__ adds L = 1, L } ;; #endif ;; { .mii adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi LDFPD f32, f33 = [AOFFSET], 2 * SIZE nop __LINE__ adds L = -1, L } ;; { .mmi cmp.eq p3, p0 = r0, r0 
adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET mov ar.lc = L } ;; .align 32 .L112: { .mfi lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mmf (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } { .mmf #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1 ], SIZE (p5) LDFD f76 = [C2 ], SIZE #else nop __LINE__ nop __LINE__ #endif FMA f73 = f33, f49, f73 // A2 * B2 } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f69 = [C1 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f77 = [C2 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f73 = f41, f57, f73 // A2 * B2 br.cloop.sptk.few .L112 } ;; #if! defined(TRMMKERNEL) && !defined(BETAZERO) FMA f64 = ALPHA, f64, f68 FMA f65 = ALPHA, f65, f69 FMA f72 = ALPHA, f72, f76 FMA f73 = ALPHA, f73, f77 ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 nop __LINE__ } { .mfb STFD [C2 ] = f72, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, SIZE mov f65 = f0 nop __LINE__ } { .mfb STFD [C2 ] = f73, SIZE mov f73 = f0 nop __LINE__ } ;; #else { .mfi nop __LINE__ FMPY f64 = ALPHA, f64 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f65 = ALPHA, f65 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f72 = ALPHA, f72 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -2, L #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f73 = ALPHA, f73 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -2, L #else nop __LINE__ #endif } ;; { .mmi #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif ;; #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 1, AOFFSET #else nop __LINE__ #endif #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 1, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 2, KK #else nop __LINE__ #endif } { .mfb STFD [C2 ] = f72, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, SIZE mov f65 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C2 ] = f73, SIZE mov f73 = f0 nop __LINE__ } ;; #endif .align 32 .L120: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 1, KK #else adds L = 2, KK #endif #endif tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L129 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmi LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #else { .mmi shladd BOFFSET = KK8, 
1, B add AOFFSET = KK8, AOFFSET } ;; { .mmi LDFPD f48, f49 = [BOFFSET], 2 * SIZE nop __LINE__ adds L = 1, L } ;; #endif { .mii nop __LINE__ tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi LDFD f32 = [AOFFSET], 1 * SIZE nop __LINE__ adds L = -1, L } ;; { .mmi cmp.eq p3, p0 = r0, r0 nop __LINE__ mov ar.lc = L } ;; .align 32 .L122: { .mfi FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mmi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE (p3) LDFD f40 = [AOFFSET], 1 * SIZE nop __LINE__ } { .mmi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1] (p5) LDFD f76 = [C2] #else nop __LINE__ nop __LINE__ #endif nop __LINE__ } ;; { .mfi (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 adds L = -1, L } { .mfb (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 br.cloop.sptk.few .L122 } ;; .L128: #if! defined(TRMMKERNEL) && !defined(BETAZERO) FMA f64 = ALPHA, f64, f68 FMA f72 = ALPHA, f72, f76 ;; { .mfi STFD [C1 ] = f64 mov f64 = f0 } { .mfb STFD [C2 ] = f72 mov f72 = f0 } ;; #else { .mfi nop __LINE__ FMPY f64 = ALPHA, f64 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f72 = ALPHA, f72 nop __LINE__ } ;; { .mmi nop __LINE__ #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -1, L #else nop __LINE__ #endif #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -2, L #else nop __LINE__ #endif } ;; { .mmi #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif ;; #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) add AOFFSET = KK8, AOFFSET #else nop __LINE__ #endif #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 1, BOFFSET #else nop __LINE__ #endif } ;; #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 1, KK #else nop __LINE__ #endif ;; { .mfi STFD [C1 ] = f64 mov f64 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C2 ] = f72 mov f72 = f0 } ;; #endif .align 32 .L129: { .mmi mov B = BOFFSET mov AOFFSET = A #if defined(TRMMKERNEL) && !defined(LEFT) adds KK = 2, KK #else nop __LINE__ #endif } ;; .align 16 .L130: { .mfi #if defined(TRMMKERNEL) && defined(LEFT) mov KK = OFFSET #else nop __LINE__ #endif mov f64 = f0 tbit.z p6, p0 = N, 0 } { .mib mov AOFFSET = A shr I = M, 3 (p6) br.cond.dpnt .L999 } ;; { .mfi mov C1 = C mov f65 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; { .mfi nop __LINE__ mov f66 = f0 nop __LINE__ } { .mfb cmp.eq p7, p0 = 0, I mov f67 = f0 (p7) br.cond.dpnt .L140 } ;; .align 32 .L132: #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mfb LDFD f48 = [B] mov f68 = f0 nop __LINE__ } { .mfi adds BOFFSET = 1 * SIZE, B mov f69 = f0 #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 8, KK #else adds L = 1, KK #endif #endif } ;; #else { .mfi add BOFFSET = KK8, B mov f68 = f0 shladd AOFFSET = KK8, 3, AOFFSET } ;; { .mfi LDFD f48 = [BOFFSET], 1 * SIZE mov f69 = f0 
#ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 8, KK #else adds L = 1, KK #endif #endif } ;; #endif { .mfi LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f70 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; { .mii LDFPD f34, f35 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mfi LDFPD f36, f37 = [AOFFSET], 2 * SIZE mov f71 = f0 adds L = -1, L } ;; { .mmi LDFPD f38, f39 = [AOFFSET], 2 * SIZE adds PREC = CPREFETCHSIZE * SIZE, C1 cmp.eq p3, p0 = r0, r0 } ;; { .mmi CPREFETCH [PREC] adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET mov ar.lc = L } ;; .align 32 .L133: { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET FMA f65 = f33, f48, f65 // A2 * B1 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 adds C9 = 4 * SIZE, C1 } { .mmf (p3) LDFD f56 = [BOFFSET], 1 * SIZE #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f6 = [C1 ], SIZE #else nop __LINE__ #endif FMA f67 = f35, f48, f67 // A4 * B1 } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f7 = [C9 ], SIZE #else nop __LINE__ #endif FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f10 = [C1 ], SIZE #else nop __LINE__ #endif FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f11 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mmf (p4) LDFD f48 = [BOFFSET], 1 * SIZE #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f12 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f67 = f43, f56, f67 // A4 * B1 } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f13 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } ;; { .mfi (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f70 = f46, f56, f70 // A7 * B1 adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f14 = [C1 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE nop __LINE__ nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f15 = [C9 ], -3 * SIZE #else nop __LINE__ #endif nop __LINE__ br.cloop.sptk.few .L133 } ;; .L138: #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) { .mfi FMA f64 = ALPHA, f64, f6 cmp.ne p6, p0 = 1, I } { .mfb FMA f68 = ALPHA, f68, f7 } ;; { .mfi FMA f65 = ALPHA, f65, f10 adds I = -1, I } { .mfb FMA f69 = ALPHA, f69, f11 } ;; { .mfi FMA f66 = ALPHA, f66, f12 } { .mfb FMA f70 = ALPHA, f70, f13 } ;; { .mfb FMA f67 = ALPHA, f67, f14 } { .mfb FMA f71 = ALPHA, f71, f15 } ;; { .mmf STFD [C1 ] = f64, SIZE STFD [C9 ] = f68, SIZE mov f64 = f0 } ;; { .mmf STFD [C1 ] = f65, SIZE STFD [C9 ] = f69, SIZE mov f65 = f0 } ;; { .mmf STFD [C1 ] = f66, SIZE STFD [C9 ] = f70, SIZE mov f66 = f0 } ;; { .mmf STFD [C1 ] = f67, 5 * SIZE nop __LINE__ mov f67 = f0 } { .mmb STFD [C9 ] = f71, 5 * SIZE nop __LINE__ (p6) br.cond.dptk .L132 } ;; #else { .mfi FMPY f64 = ALPHA, f64 cmp.ne p6, p0 = 1, I } { .mfb FMPY f68 = ALPHA, f68 } ;; { .mfi FMPY f65 = ALPHA, f65 adds I = -1, I } { .mfb FMPY f69 = ALPHA, f69 } ;; { .mfi FMPY f66 = ALPHA, f66 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfb FMPY f70 = ALPHA, f70 } ;; { .mfi FMPY f67 = ALPHA, f67 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -8, L #else nop __LINE__ #endif } { .mfi FMPY f71 = ALPHA, f71 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -1, L #else nop __LINE__ #endif } ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mmi STFD [C9 ] = f68, SIZE nop __LINE__ nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, SIZE mov f65 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 3, AOFFSET #else nop __LINE__ #endif } { .mmi STFD [C9 ] = f69, SIZE nop __LINE__ #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) add BOFFSET = KK8, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C1 ] = f66, SIZE mov f66 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 8, KK #else nop __LINE__ #endif } { .mmi STFD [C9 ] = f70, SIZE nop __LINE__ nop __LINE__ } ;; { .mfi STFD [C1 ] = f67, 5 * SIZE mov f67 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mmb STFD [C9 ] = f71, 5 * SIZE nop __LINE__ (p6) br.cond.dptk .L132 } ;; #endif .align 32 .L140: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 4, KK #else adds L = 1, KK #endif #endif tbit.z p6, p7 = M, 2 (p6) br.cond.dptk .L150 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmi LDFD f48 = [B] adds BOFFSET = 1 * SIZE, B #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #else { .mmi add BOFFSET = KK8, B shladd AOFFSET = KK8, 2, AOFFSET nop __LINE__ } ;; { .mmi LDFD f48 = [BOFFSET], 1 * SIZE nop __LINE__ #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #endif { .mii (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi LDFPD f34, f35 = [AOFFSET], 2 * SIZE adds L = -1, L nop __LINE__ } ;; { .mmi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 mov ar.lc = L } ;; .align 32 .L142: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, 
L } { .mfi nop __LINE__ FMA f65 = f33, f48, f65 // A2 * B1 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 (p5) adds C9 = 2 * SIZE, C1 } { .mmf #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1 ], SIZE #else nop __LINE__ #endif (p3) LDFD f56 = [BOFFSET], 1 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 } ;; { .mfi (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 (p5) adds C10 = 2 * SIZE, C2 } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f70 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mmf (p4) LDFD f48 = [BOFFSET], 1 * SIZE #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f69 = [C1 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f67 = f43, f56, f67 // A4 * B1 } ;; { .mfi (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE nop __LINE__ adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f71 = [C9 ], -1 * SIZE #else nop __LINE__ #endif nop.f 0 br.cloop.sptk.few .L142 } ;; .L148: #if! defined(TRMMKERNEL) && !defined(BETAZERO) FMA f64 = ALPHA, f64, f68 FMA f66 = ALPHA, f66, f70 FMA f65 = ALPHA, f65, f69 FMA f67 = ALPHA, f67, f71 ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 adds L = 1, K } { .mfb STFD [C9 ] = f66, SIZE mov f66 = f0 nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, 3 * SIZE mov f65 = f0 shr L = L, 1 } { .mfb STFD [C9 ] = f67, 3 * SIZE mov f67 = f0 nop __LINE__ } ;; #else { .mfi nop __LINE__ FMPY f64 = ALPHA, f64 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f66 = ALPHA, f66 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f65 = ALPHA, f65 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -4, L #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f67 = ALPHA, f67 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -1, L #else nop __LINE__ #endif } ;; { .mmi #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif ;; #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 2, AOFFSET #else nop __LINE__ #endif #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) add BOFFSET = KK8, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 4, KK #else nop __LINE__ #endif } { .mfb STFD [C9 ] = f66, SIZE mov f66 = f0 nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, 3 * SIZE mov f65 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C9 ] = f67, 3 * SIZE mov f67 = f0 nop __LINE__ } ;; #endif .align 32 .L150: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 2, KK #else adds L = 1, KK #endif #endif tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L160 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmi LDFD f48 = [B] adds BOFFSET = 1 * SIZE, B #ifndef TRMMKERNEL adds L = 1, K #else adds L 
= 1, L #endif } ;; #else { .mmi add BOFFSET = KK8, B shladd AOFFSET = KK8, 1, AOFFSET nop __LINE__ } ;; { .mmi LDFD f48 = [BOFFSET], 1 * SIZE nop __LINE__ #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #endif { .mii cmp.eq p3, p0 = r0, r0 tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mii LDFPD f32, f33 = [AOFFSET], 2 * SIZE adds L = -1, L ;; mov ar.lc = L } ;; .align 32 .L152: { .mfi cmp.ne p4, p5 = 0, L FMA f64 = f32, f48, f64 // A1 * B1 (p12) cmp.ne p3, p0 = 0, L } ;; { .mmf (p3) LDFD f56 = [BOFFSET], 1 * SIZE (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } ;; { .mfi (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 adds L = -1, L } ;; { .mfb (p4) LDFD f48 = [BOFFSET], 1 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 br.cloop.sptk.few .L152 } ;; .L158: #if! defined(TRMMKERNEL) && !defined(BETAZERO) LDFD f68 = [C1 ], SIZE ;; LDFD f69 = [C1 ], -1 * SIZE ;; FMA f64 = ALPHA, f64, f68 FMA f65 = ALPHA, f65, f69 ;; STFD [C1 ] = f64, SIZE mov f64 = f0 ;; { .mfi STFD [C1 ] = f65, SIZE mov f65 = f0 } ;; #else { .mfi nop __LINE__ FMPY f64 = ALPHA, f64 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f65 = ALPHA, f65 nop __LINE__ } ;; { .mii nop __LINE__ #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -2, L #else nop __LINE__ #endif #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -1, L #else nop __LINE__ #endif } ;; { .mmi #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif ;; #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 1, AOFFSET #else nop __LINE__ #endif #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) add BOFFSET = KK8, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 2, KK #else nop __LINE__ #endif } ;; { .mfi STFD [C1 ] = f65, SIZE mov f65 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; #endif .align 32 .L160: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 1, KK #else adds L = 1, KK #endif #endif tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L169 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmi LDFD f48 = [B] adds BOFFSET = 1 * SIZE, B #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #else { .mmi add BOFFSET = KK8, B add AOFFSET = KK8, AOFFSET nop __LINE__ } ;; { .mmi LDFD f48 = [BOFFSET], 1 * SIZE nop __LINE__ #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #endif ;; { .mii LDFD f32 = [AOFFSET], 1 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mii adds L = -1, L cmp.eq p3, p0 = r0, r0 ;; mov ar.lc = L } ;; .align 32 .L162: { .mmf cmp.ne p4, p5 = 0, L (p12) cmp.ne p3, p0 = 0, L FMA f64 = f32, f48, f64 // A1 * B1 } ;; { .mmi (p3) LDFD f56 = [BOFFSET], 1 * SIZE (p3) LDFD f40 = [AOFFSET], 1 * SIZE nop __LINE__ } ;; { .mmi (p4) LDFD f32 = [AOFFSET], 1 * SIZE #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1] #else nop __LINE__ #endif adds L = -1, L } { .mfb (p4) LDFD f48 = [BOFFSET], 1 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 br.cloop.sptk.few .L162 } ;; #if! defined(TRMMKERNEL) && !defined(BETAZERO) FMA f64 = ALPHA, f64, f68 #else FMPY f64 = ALPHA, f64 #endif ;; STFD [C1 ] = f64 ;; .align 32 .L169: { .mmi mov B = BOFFSET mov AOFFSET = A #if defined(TRMMKERNEL) && !defined(LEFT) adds KK = 1, KK #else nop __LINE__ #endif } ;; .align 16 .L999: mov r8 = r0 adds r9 = 1 * 16, SP ;; ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 ;; ldf.fill f22 = [SP], 32 ldf.fill f23 = [r9], 32 mov ar.lc = ARLC ;; ldf.fill f24 = [SP], 32 ldf.fill f25 = [r9], 32 mov pr = PR, -1 ;; ldf.fill f26 = [SP], 32 ldf.fill f27 = [r9], 32 mov ar.pfs = ARPFS ;; ldf.fill f28 = [SP], 32 ldf.fill f29 = [r9], 32 ;; ldf.fill f30 = [SP], 32 ldf.fill f31 = [r9] br.ret.sptk.many b0 EPILOGUE OpenBLAS-0.2.20/kernel/ia64/gemm_ncopy.S000066400000000000000000000216661313527062700174630ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCHSIZE 64 #define WPREFETCHSIZE 32 #ifndef XDOUBLE #define LD LDF8 #define ST STF8_NTA #else #define LD LDFD #define ST STFD_NTA #endif #define J r15 #define PREB r17 #define PREA r18 #define A1 r19 #define A2 r20 #define A3 r21 #define A4 r22 #define A5 r23 #define A6 r24 #define A7 r25 #define A8 r26 #define B1 r27 #define B2 r28 #define COUNT r9 #define I r10 #define II r11 #define ARLC r29 #define PR r30 #define M r32 #define N r33 #define A r34 #define LDA r35 #define B r36 PROLOGUE .prologue PROFCODE .body { .mii shladd LDA = LDA, BASE_SHIFT, r0 mov PR = pr shr J = N, 3 } ;; { .mib cmp.eq p8, p0 = 0, J mov ARLC = ar.lc (p8) br.cond.dpnt .L20 } ;; .align 32 .L11: { .mmi mov A1 = A add A2 = A, LDA mov pr.rot = 0 } { .mmi shladd A3 = LDA, 1, A shladd A5 = LDA, 2, A adds I = 1, M } ;; { .mmi shladd A4 = LDA, 1, A2 shladd A6 = LDA, 2, A2 mov ar.ec = 6 } { .mmi cmp.eq p16, p0 = r0, r0 shladd A7 = LDA, 2, A3 shr I = I, 1 } ;; { .mmi adds B1 = 8 * SIZE, B shladd A8 = LDA, 2, A4 shladd A = LDA, 3, A } { .mmi adds I = -1, I mov COUNT = 0 adds J = -1, J } ;; { .mmi adds PREA = PREFETCHSIZE * SIZE, A adds PREB = WPREFETCHSIZE * SIZE, B mov ar.lc = I } { .mmi mov I = M mov II = M cmp.ne p14, p0 = r0, r0 } ;; .align 32 .L12: { .mmi (p21) ST [B ] = f37, 1 * SIZE (p14) ST [B1] = f49, 1 * SIZE (p16) cmp.ne.unc p13, p0 = 1, I } { .mmi lfetch.nt1 [PREA], LDA lfetch.excl.nt1 [PREB] adds PREB = 16 * SIZE, PREB } ;; { .mmi (p21) ST [B ] = f43, 1 * SIZE (p14) ST [B1] = f55, 1 * SIZE cmp.eq p9, p0 = 8, COUNT } { .mmi (p16) LD f32 = [A1], SIZE (p16) LD f38 = [A2], SIZE (p16) adds I = -2, I } ;; { .mmi (p21) ST [B ] = f61, 1 * SIZE (p14) ST [B1] = f73, 1 * SIZE (p9) mov COUNT = 0 } { .mmi (p13) LD f44 = [A1], SIZE (p13) LD f50 = [A2], SIZE (p21) adds II = -2, II } ;; { .mmb (p21) ST [B ] = f67, 1 * SIZE (p14) ST [B1] = f79, 1 * SIZE nop __LINE__ } { .mmb (p16) LD f56 = [A3], SIZE (p16) LD f62 = [A4], SIZE nop __LINE__ } ;; { .mmi (p21) ST [B ] = f85, 1 * SIZE (p14) ST [B1] = f97, 1 * SIZE (p9) adds PREA = (PREFETCHSIZE - 2)* SIZE, A1 } { .mmb (p13) LD f68 = [A3], SIZE (p13) LD f74 = [A4], SIZE nop __LINE__ } ;; { .mmb (p21) ST [B ] = f91, 1 * SIZE (p14) ST [B1] = f103, 1 * SIZE nop __LINE__ } { .mmb (p16) LD f80 = [A5], SIZE (p16) LD f86 = [A6], SIZE nop __LINE__ } ;; { .mmb (p21) ST [B ] = f109, 1 * SIZE (p14) ST [B1] = f121, 1 * SIZE nop __LINE__ } { .mmb (p13) LD f92 = [A5], SIZE (p13) LD f98 = [A6], SIZE nop __LINE__ } ;; { .mmi (p21) ST [B ] = f115, 1 * SIZE (p14) ST [B1] = f127, 9 * SIZE (p16) adds COUNT = 1, COUNT } { .mmb (p16) LD f104 = [A7], SIZE (p16) LD f110 = [A8], SIZE nop __LINE__ } ;; { .mmi (p13) LD f116 = [A7], SIZE (p13) LD f122 = [A8], SIZE (p14) adds B = 8 * SIZE, B } { .mmb (p20) cmp.ne.unc p14, p0 = 1, II nop __LINE__ br.ctop.sptk.few .L12 } ;; { .mmb cmp.ne p6, p0 = 0, J nop __LINE__ (p6) br.cond.dptk .L11 } ;; .align 32 .L20: { .mmi adds I = 1, M mov A1 = A mov pr.rot = 0 } { .mmi add A2 = A, LDA shladd A3 = LDA, 1, A tbit.z p6, p0 = N, 2 } ;; { .mmi shladd A4 = LDA, 1, A2 adds B1 = 4 * SIZE, B mov ar.ec = 6 } { .mib cmp.eq p16, p0 = r0, r0 shr I = I, 1 (p6) br.cond.dpnt .L30 } ;; { .mmi shladd A = LDA, 2, A nop __LINE__ nop __LINE__ } { .mmi adds I = -1, I mov COUNT = 0 adds J = -1, J } ;; { .mmi adds PREA = PREFETCHSIZE * SIZE, A adds PREB = WPREFETCHSIZE * SIZE, B mov ar.lc = I } { .mmi mov I = M mov II = M cmp.ne p14, p0 = r0, r0 } ;; .align 32 .L22: { 
.mmi (p21) ST [B ] = f37, 1 * SIZE (p14) ST [B1] = f49, 1 * SIZE (p16) cmp.ne.unc p13, p0 = 1, I } { .mmi lfetch.nt1 [PREA], LDA lfetch.excl.nt1 [PREB], 8 * SIZE cmp.eq p9, p0 = 4, COUNT } ;; { .mmi (p21) ST [B ] = f43, 1 * SIZE (p14) ST [B1] = f55, 1 * SIZE (p16) adds I = -2, I } { .mmi (p16) LD f32 = [A1], SIZE (p16) LD f38 = [A2], SIZE (p21) adds II = -2, II } ;; { .mmi (p21) ST [B ] = f61, 1 * SIZE (p14) ST [B1] = f73, 1 * SIZE (p9) mov COUNT = 0 } { .mmi (p13) LD f44 = [A1], SIZE (p13) LD f50 = [A2], SIZE nop __LINE__ } ;; { .mmi (p21) ST [B ] = f67, 1 * SIZE (p14) ST [B1] = f79, 5 * SIZE (p9) adds PREA = PREFETCHSIZE * SIZE, A1 } { .mmb (p16) LD f56 = [A3], SIZE (p16) LD f62 = [A4], SIZE nop __LINE__ } ;; { .mmi (p13) LD f68 = [A3], SIZE (p13) LD f74 = [A4], SIZE (p16) adds COUNT = 1, COUNT } { .mmb (p14) adds B = 4 * SIZE, B (p20) cmp.ne.unc p14, p0 = 1, II br.ctop.sptk.few .L22 } ;; .align 32 .L30: { .mmi adds I = 1, M mov A1 = A mov pr.rot = 0 } { .mmi add A2 = A, LDA adds B1 = 2 * SIZE, B tbit.z p6, p0 = N, 1 } ;; { .mmi nop __LINE__ nop __LINE__ mov ar.ec = 6 } { .mib cmp.eq p16, p0 = r0, r0 shr I = I, 1 (p6) br.cond.dpnt .L40 } ;; { .mmi adds I = -1, I ;; shladd A = LDA, 1, A mov ar.lc = I } { .mmi mov I = M mov II = M cmp.ne p14, p0 = r0, r0 } ;; .align 32 .L32: { .mmi (p21) ST [B ] = f37, 1 * SIZE (p14) ST [B1] = f49, 1 * SIZE (p16) cmp.ne.unc p13, p0 = 1, I } { .mmi nop __LINE__ nop __LINE__ (p21) adds II = -2, II } ;; { .mmi (p21) ST [B ] = f43, 1 * SIZE (p14) ST [B1] = f55, 3 * SIZE nop __LINE__ } { .mmi (p16) LD f32 = [A1], SIZE (p16) LD f38 = [A2], SIZE nop __LINE__ } ;; { .mmi (p13) LD f44 = [A1], SIZE (p13) LD f50 = [A2], SIZE (p16) adds I = -2, I } { .mmb (p14) adds B = 2 * SIZE, B (p20) cmp.ne.unc p14, p0 = 1, II br.ctop.sptk.few .L32 } ;; .align 32 .L40: { .mmi adds I = 1, M mov A1 = A mov pr.rot = 0 } { .mmi tbit.z p6, p0 = N, 0 } ;; { .mmi nop __LINE__ nop __LINE__ mov ar.ec = 6 } { .mib cmp.eq p16, p0 = r0, r0 shr I = I, 1 (p6) br.cond.dpnt .L999 } ;; { .mmi adds I = -1, I ;; mov ar.lc = I } { .mmi mov I = M mov II = M cmp.ne p14, p0 = r0, r0 } ;; .align 32 .L42: { .mmi (p21) ST [B ] = f37, 1 * SIZE (p16) cmp.ne.unc p13, p0 = 1, I (p21) adds II = -2, II } ;; { .mmi (p14) ST [B ] = f49, 1 * SIZE (p16) LD f32 = [A1], SIZE (p16) adds I = -2, I } ;; { .mmb (p13) LD f44 = [A1], SIZE (p20) cmp.ne.unc p14, p0 = 1, II br.ctop.sptk.few .L42 } ;; .align 32 .L999: mov pr = PR, -1 mov ar.lc = ARLC br.ret.sptk.many b0 EPILOGUE OpenBLAS-0.2.20/kernel/ia64/gemm_tcopy.S000066400000000000000000000672431313527062700174720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCHSIZE 24 #define WPREFETCHSIZE 32 #ifndef XDOUBLE #define LD LDFD #define ST STFD_NTA #else #define LD LDFD #define ST STFD_NTA #endif #define PREA r2 #define PREB r3 #define A1 r14 #define A2 r15 #define B1 r16 #define B2 r17 #define I r18 #define J r19 #define BO2 r20 #define BO3 r21 #define BO4 r22 #define LDB r23 #define II r24 #define TEMP1 r25 #define TEMP2 r26 #define TEMP3 r27 #define LCOUNT r28 #define SCOUNT r29 #define ARLC r30 #define PR r31 #define MLDA8 r8 #define M r32 #define N r33 #define A r34 #define LDA r35 #define B r36 PROLOGUE .prologue PROFCODE .body { .mmi setf.sig f32 = M and r8 = -8, N mov ARLC = ar.lc } ;; { .mmi setf.sig f33 = r8 and r9 = -4, N mov PR = pr } ;; { .mmi setf.sig f34 = r9 and r10 = -2, N shladd LDA = LDA, BASE_SHIFT, r0 } ;; { .mmi setf.sig f35 = r10 shladd MLDA8 = LDA, 3, r0 shl LDB = M, BASE_SHIFT + 3 } ;; { .mfi sub MLDA8 = r0, MLDA8 xmpy.l f33 = f32, f33 shr J = M, 3 } { .mfi xmpy.l f34 = f32, f34 } ;; { .mmf getf.sig BO2 = f33 adds MLDA8 = 16 * SIZE, MLDA8 xmpy.l f35 = f32, f35 } ;; { .mmi getf.sig BO3 = f34 getf.sig BO4 = f35 nop __LINE__ } ;; { .mmi shladd BO2 = BO2, BASE_SHIFT, B shladd BO3 = BO3, BASE_SHIFT, B shladd BO4 = BO4, BASE_SHIFT, B } { .mib cmp.eq p6, p0 = 0, J nop __LINE__ (p6) br.cond.dpnt .L100 } ;; .align 32 .L11: { .mmi add I = 8, N mov A1 = A mov pr.rot = 0 } { .mmi adds A2 = 4 * SIZE, A shladd A = LDA, 3, A shr II = N, 3 } ;; { .mmi mov B1 = B cmp.eq p16, p0 = r0, r0 mov ar.ec = 3 } { .mmi adds B2 = 4 * SIZE, B adds B = 64 * SIZE, B shr I = I, 4 } ;; { .mmi cmp.eq p8, p0 = 0, I shladd I = I, 2, r0 nop __LINE__ } ;; { .mmi mov LCOUNT = 0 mov SCOUNT = 0 adds I = -1, I } ;; { .mmi adds PREA = PREFETCHSIZE * SIZE, A1 adds PREB = WPREFETCHSIZE * SIZE, B1 mov ar.lc = I } { .mib adds J = -1, J mov I = II (p8) br.cond.dpnt .L20 } ;; .align 32 .L12: { .mmi (p18) ST [B1] = f34, 1 * SIZE (p18) ST [B2] = f46, 1 * SIZE (p18) cmp.ne.unc p13, p0 = 1, II } { .mmi (p16) lfetch.nt1 [PREA], LDA (p16) lfetch.excl.nt1 [PREB], LDB (p16) cmp.ne.unc p12, p0 = 1, I } ;; { .mmi (p18) ST [B1] = f37, 1 * SIZE (p18) ST [B2] = f49, 1 * SIZE (p18) adds SCOUNT = 1, SCOUNT } { .mmi (p16) LD f32 = [A1], SIZE (p16) LD f44 = [A2], SIZE (p16) adds LCOUNT = 1, LCOUNT } ;; { .mmi (p18) ST [B1] = f40, 1 * SIZE (p18) ST [B2] = f52, 1 * SIZE 
(p16) cmp.eq.unc p14, p0 = 4, LCOUNT } { .mmi (p16) LD f35 = [A1], SIZE (p16) LD f47 = [A2], SIZE adds TEMP1 = -3 * SIZE, LDA } ;; { .mmi (p18) ST [B1] = f43, 5 * SIZE (p18) ST [B2] = f55, 5 * SIZE (p18) cmp.eq.unc p15, p0 = 4, SCOUNT } { .mmi (p16) LD f38 = [A1], SIZE (p16) LD f50 = [A2], SIZE (p12) mov TEMP1 = 5 * SIZE } ;; { .mmi (p18) ST [B1] = f82, 1 * SIZE (p18) ST [B2] = f94, 1 * SIZE } { .mmi (p16) LD f41 = [A1], TEMP1 (p16) LD f53 = [A2], TEMP1 } ;; { .mmi (p18) ST [B1] = f85, 1 * SIZE (p18) ST [B2] = f97, 1 * SIZE mov TEMP2 = 5 * SIZE } { .mmi (p12) LD f56 = [A1], SIZE (p12) LD f68 = [A2], SIZE shladd TEMP3 = LDA, 3, r0 } ;; { .mmi (p18) ST [B1] = f88, 1 * SIZE (p18) ST [B2] = f100, 1 * SIZE (p13) adds TEMP2 = - 11 * SIZE, LDB } { .mmi (p12) LD f59 = [A1], SIZE (p12) LD f71 = [A2], SIZE (p12) adds TEMP1 = - 11 * SIZE, LDA } ;; { .mmi (p18) ST [B1] = f91 (p18) ST [B2] = f103 (p18) add B1 = B1, TEMP2 } { .mmi (p12) LD f62 = [A1], SIZE (p12) LD f74 = [A2], SIZE (p18) add B2 = B2, TEMP2 } ;; { .mmi (p13) ST [B1] = f58, 1 * SIZE (p13) ST [B2] = f70, 1 * SIZE } { .mmi (p12) LD f65 = [A1], TEMP1 (p12) LD f77 = [A2], TEMP1 sub TEMP3 = LDA, TEMP3 } ;; { .mmi (p13) ST [B1] = f61, 1 * SIZE (p13) ST [B2] = f73, 1 * SIZE } { .mmi (p16) lfetch.nt1 [PREA], LDA (p16) lfetch.excl.nt1 [PREB] adds TEMP3 = 5 * SIZE, TEMP3 } ;; { .mmi (p13) ST [B1] = f64, 1 * SIZE (p13) ST [B2] = f76, 1 * SIZE } { .mmi (p16) LD f80 = [A1], SIZE (p16) LD f92 = [A2], SIZE adds TEMP1 = -3 * SIZE, LDA } ;; { .mmi (p13) ST [B1] = f67, 5 * SIZE (p13) ST [B2] = f79, 5 * SIZE } { .mmi (p16) LD f83 = [A1], SIZE (p16) LD f95 = [A2], SIZE (p14) mov TEMP1 = TEMP3 } ;; { .mmi (p13) ST [B1] = f106, 1 * SIZE (p13) ST [B2] = f118, 1 * SIZE mov TEMP2 = 5 * SIZE } { .mmi (p16) LD f86 = [A1], SIZE (p16) LD f98 = [A2], SIZE (p12) mov TEMP1 = 5 * SIZE } ;; { .mmi (p13) ST [B1] = f109, 1 * SIZE (p13) ST [B2] = f121, 1 * SIZE sub TEMP2 = TEMP2, LDB } { .mmi (p16) LD f89 = [A1], TEMP1 (p16) LD f101 = [A2], TEMP1 } ;; { .mmi (p13) ST [B1] = f112, 1 * SIZE (p13) ST [B2] = f124, 1 * SIZE (p15) adds TEMP2 = -59 * SIZE, LDB } { .mmi (p12) LD f104 = [A1], SIZE (p12) LD f116 = [A2], SIZE (p14) add PREA = PREA, MLDA8 } ;; { .mmi (p13) ST [B1] = f115 (p13) ST [B2] = f127 (p13) add B1 = B1, TEMP2 } { .mmi (p12) LD f107 = [A1], SIZE (p12) LD f119 = [A2], SIZE adds TEMP1 = -11 * SIZE, LDA } ;; { .mmi (p12) LD f110 = [A1], SIZE (p12) LD f122 = [A2], SIZE (p14) mov TEMP1 = TEMP3 } { .mmi (p14) mov LCOUNT = 0 (p15) mov SCOUNT = 0 adds PREB = WPREFETCHSIZE * SIZE, B1 } ;; { .mmi (p12) LD f113 = [A1], TEMP1 (p12) LD f125 = [A2], TEMP1 (p13) add B2 = B2, TEMP2 } { .mib (p14) adds I = -2, I (p15) adds II = -2, II br.ctop.sptk .L12 } ;; .align 32 .L20: { .mmi add A2 = A1, LDA and TEMP3 = 7, N tbit.nz p7, p0 = N, 2 } ;; { .mmi (p7) LD f32 = [A1], SIZE (p7) LD f36 = [A2], SIZE cmp.eq p6, p0 = 0, TEMP3 } ;; { .mmi (p7) LD f33 = [A1], SIZE (p7) LD f37 = [A2], SIZE adds TEMP1 = -3 * SIZE, LDA } ;; { .mmi (p7) LD f34 = [A1], SIZE (p7) LD f38 = [A2], SIZE add TEMP1 = TEMP1, LDA } ;; { .mmi (p7) LD f35 = [A1], TEMP1 (p7) LD f39 = [A2], TEMP1 (p6) cmp.ne.unc p10, p0 = 0, J } ;; { .mmb (p7) LD f40 = [A1], SIZE (p7) LD f44 = [A2], SIZE (p10) br.cond.dptk .L11 } ;; { .mmi (p7) LD f41 = [A1], SIZE (p7) LD f45 = [A2], SIZE nop __LINE__ } ;; { .mmi (p7) LD f42 = [A1], SIZE (p7) LD f46 = [A2], SIZE tbit.nz p8, p0 = N, 1 } ;; { .mmi (p7) LD f43 = [A1], TEMP1 (p7) LD f47 = [A2], TEMP1 adds B2 = 4 * SIZE, BO2 } ;; { .mmi (p7) ST [BO2] = f32, 1 * SIZE (p7) ST [B2 ] = f36, 1 * 
SIZE tbit.nz p9, p0 = N, 0 } { .mmi (p7) LD f48 = [A1], SIZE (p7) LD f52 = [A2], SIZE nop __LINE__ } ;; { .mmi (p7) ST [BO2] = f33, 1 * SIZE (p7) ST [B2 ] = f37, 1 * SIZE nop __LINE__ } { .mmi (p7) LD f49 = [A1], SIZE (p7) LD f53 = [A2], SIZE nop __LINE__ } ;; { .mmi (p7) ST [BO2] = f34, 1 * SIZE (p7) ST [B2 ] = f38, 1 * SIZE nop __LINE__ } { .mmi (p7) LD f50 = [A1], SIZE (p7) LD f54 = [A2], SIZE nop __LINE__ } ;; { .mmi (p7) ST [BO2] = f35, 5 * SIZE (p7) ST [B2 ] = f39, 5 * SIZE nop __LINE__ } { .mmi (p7) LD f51 = [A1], TEMP1 (p7) LD f55 = [A2], TEMP1 mov TEMP1 = -1 * SIZE } ;; { .mmi (p7) ST [BO2] = f40, 1 * SIZE (p7) ST [B2 ] = f44, 1 * SIZE nop __LINE__ } { .mmi (p7) LD f56 = [A1], SIZE (p7) LD f60 = [A2], SIZE shladd TEMP1 = LDA, 3, TEMP1 } ;; { .mmi (p7) ST [BO2] = f41, 1 * SIZE (p7) ST [B2 ] = f45, 1 * SIZE nop __LINE__ } { .mmi (p7) LD f57 = [A1], SIZE (p7) LD f61 = [A2], SIZE sub TEMP1 = 0, TEMP1 } ;; { .mmi (p7) ST [BO2] = f42, 1 * SIZE (p7) ST [B2 ] = f46, 1 * SIZE nop __LINE__ } { .mmi (p7) LD f58 = [A1], SIZE (p7) LD f62 = [A2], SIZE shladd TEMP1 = LDA, 1, TEMP1 } ;; { .mmi (p7) ST [BO2] = f43, 5 * SIZE (p7) ST [B2 ] = f47, 5 * SIZE nop __LINE__ } { .mmi (p7) LD f59 = [A1], TEMP1 (p7) LD f63 = [A2], TEMP1 nop __LINE__ } ;; { .mmi (p7) ST [BO2] = f48, 1 * SIZE (p7) ST [B2 ] = f52, 1 * SIZE nop __LINE__ } { .mmi add A2 = A1, LDA adds TEMP1 = -1 * SIZE, LDA nop __LINE__ } ;; { .mmi (p7) ST [BO2] = f49, 1 * SIZE (p7) ST [B2 ] = f53, 1 * SIZE nop __LINE__ } { .mmi (p8) LD f64 = [A1], SIZE (p8) LD f66 = [A2], SIZE add TEMP1 = TEMP1, LDA } ;; { .mmi (p7) ST [BO2] = f50, 1 * SIZE (p7) ST [B2 ] = f54, 1 * SIZE nop __LINE__ } { .mmi (p8) LD f65 = [A1], TEMP1 (p8) LD f67 = [A2], TEMP1 nop __LINE__ } ;; { .mmi (p7) ST [BO2] = f51, 5 * SIZE (p7) ST [B2 ] = f55, 5 * SIZE nop __LINE__ } { .mmi (p8) LD f68 = [A1], SIZE (p8) LD f70 = [A2], SIZE nop __LINE__ } ;; { .mmi (p7) ST [BO2] = f56, 1 * SIZE (p7) ST [B2 ] = f60, 1 * SIZE nop __LINE__ } { .mmi (p8) LD f69 = [A1], TEMP1 (p8) LD f71 = [A2], TEMP1 mov TEMP3 = -1 * SIZE } ;; { .mmi (p7) ST [BO2] = f57, 1 * SIZE (p7) ST [B2 ] = f61, 1 * SIZE nop __LINE__ } { .mmi (p8) LD f72 = [A1], SIZE (p8) LD f74 = [A2], SIZE shladd TEMP3 = LDA, 3, TEMP3 } ;; { .mmi (p7) ST [BO2] = f58, 1 * SIZE (p7) ST [B2 ] = f62, 1 * SIZE nop __LINE__ } { .mmi (p8) LD f73 = [A1], TEMP1 (p8) LD f75 = [A2], TEMP1 sub TEMP3 = 0, TEMP3 } ;; { .mmi (p7) ST [BO2] = f59, 5 * SIZE (p7) ST [B2 ] = f63 adds B2 = 4 * SIZE, BO3 } { .mmi (p8) LD f76 = [A1], SIZE (p8) LD f78 = [A2], SIZE shladd TEMP3 = LDA, 1, TEMP3 } ;; { .mmi (p8) ST [BO3] = f64, 1 * SIZE (p8) ST [B2 ] = f68, 1 * SIZE nop __LINE__ } { .mmi (p8) LD f77 = [A1], TEMP3 (p8) LD f79 = [A2], TEMP3 nop __LINE__ } ;; { .mmi (p8) ST [BO3] = f65, 1 * SIZE (p8) ST [B2 ] = f69, 1 * SIZE nop __LINE__ } { .mmi add A2 = A1, LDA shladd TEMP3 = LDA, 1, r0 nop __LINE__ } ;; { .mmi (p8) ST [BO3] = f66, 1 * SIZE (p8) ST [B2 ] = f70, 1 * SIZE nop __LINE__ } { .mmi (p9) LD f80 = [A1], TEMP3 (p9) LD f81 = [A2], TEMP3 nop __LINE__ } ;; { .mmi (p8) ST [BO3] = f67, 5 * SIZE (p8) ST [B2 ] = f71, 5 * SIZE nop __LINE__ } { .mmi (p9) LD f82 = [A1], TEMP3 (p9) LD f83 = [A2], TEMP3 nop __LINE__ } ;; { .mmi (p8) ST [BO3] = f72, 1 * SIZE (p8) ST [B2 ] = f76, 1 * SIZE nop __LINE__ } { .mmi (p9) LD f84 = [A1], TEMP3 (p9) LD f85 = [A2], TEMP3 nop __LINE__ } ;; { .mmi (p8) ST [BO3] = f73, 1 * SIZE (p8) ST [B2 ] = f77, 1 * SIZE nop __LINE__ } { .mmi (p9) LD f86 = [A1] (p9) LD f87 = [A2] nop __LINE__ } ;; { .mmi (p8) ST [BO3] = f74, 1 * SIZE (p8) ST [B2 ] 
= f78, 1 * SIZE nop __LINE__ } ;; { .mmi (p8) ST [BO3] = f75, 5 * SIZE (p8) ST [B2 ] = f79 adds B2 = 4 * SIZE, BO4 } ;; { .mmi (p9) ST [BO4] = f80, 1 * SIZE (p9) ST [B2 ] = f84, 1 * SIZE nop __LINE__ } ;; { .mmi (p9) ST [BO4] = f81, 1 * SIZE (p9) ST [B2 ] = f85, 1 * SIZE nop __LINE__ } ;; { .mmi (p9) ST [BO4] = f82, 1 * SIZE (p9) ST [B2 ] = f86, 1 * SIZE cmp.ne p8, p0 = 0, J } ;; { .mmb (p9) ST [BO4] = f83, 5 * SIZE (p9) ST [B2 ] = f87, 5 * SIZE (p8) br.cond.dptk .L11 } ;; .align 32 .L100: { .mmi mov A1 = A add I = 8, N mov pr.rot = 0 } { .mmi adds A2 = 4 * SIZE, A tbit.z p6, p0 = M, 2 } ;; { .mmi mov B1 = B adds B2 = 4 * SIZE, B mov ar.ec = 3 } { .mib cmp.eq p16, p0 = r0, r0 shr I = I, 4 (p6) br.cond.dpnt .L200 } ;; { .mmi cmp.eq p8, p0 = 0, I shladd I = I, 1, r0 shladd A = LDA, 2, A } ;; { .mmi adds B = 32 * SIZE, B adds I = -1, I shr II = N, 3 } ;; { .mmi mov LCOUNT = 0 mov SCOUNT = 0 mov ar.lc = I } { .mib nop __LINE__ mov I = II (p8) br.cond.dpnt .L120 } ;; .align 32 .L112: { .mmi (p18) ST [B1] = f34, 1 * SIZE (p18) ST [B2] = f46, 1 * SIZE (p16) cmp.ne.unc p12, p0 = 1, I } { .mmi (p16) LD f32 = [A1], SIZE (p16) LD f44 = [A2], SIZE (p18) cmp.ne.unc p13, p0 = 1, II } ;; { .mmi (p18) ST [B1] = f37, 1 * SIZE (p18) ST [B2] = f49, 1 * SIZE nop __LINE__ } { .mmi (p16) LD f35 = [A1], SIZE (p16) LD f47 = [A2], SIZE adds TEMP1 = -3 * SIZE, LDA } ;; { .mmi (p18) ST [B1] = f40, 1 * SIZE (p18) ST [B2] = f52, 1 * SIZE shladd TEMP3 = LDA, 2, r0 } { .mmi (p16) LD f38 = [A1], SIZE (p16) LD f50 = [A2], SIZE (p12) mov TEMP1 = 5 * SIZE } ;; { .mmi (p18) ST [B1] = f43, 5 * SIZE (p18) ST [B2] = f55, 5 * SIZE (p16) adds LCOUNT = 1, LCOUNT } { .mmi (p16) LD f41 = [A1], TEMP1 (p16) LD f53 = [A2], TEMP1 (p18) adds SCOUNT = 1, SCOUNT } ;; { .mmi (p18) ST [B1] = f82, 1 * SIZE (p18) ST [B2] = f94, 1 * SIZE (p16) cmp.eq.unc p14, p0 = 2, LCOUNT } { .mmi (p12) LD f56 = [A1], SIZE (p12) LD f68 = [A2], SIZE (p18) cmp.eq.unc p15, p0 = 2, SCOUNT } ;; { .mmi (p18) ST [B1] = f85, 1 * SIZE (p18) ST [B2] = f97, 1 * SIZE mov TEMP2 = 5 * SIZE } { .mmi (p12) LD f59 = [A1], SIZE (p12) LD f71 = [A2], SIZE sub TEMP3 = LDA, TEMP3 } ;; { .mmi (p18) ST [B1] = f88, 1 * SIZE (p18) ST [B2] = f100, 1 * SIZE (p13) adds TEMP2 = - 11 * SIZE, LDB } { .mmi (p12) LD f62 = [A1], SIZE (p12) LD f74 = [A2], SIZE (p12) adds TEMP1 = - 11 * SIZE, LDA } ;; { .mmi (p18) ST [B1] = f91 (p18) ST [B2] = f103 (p18) add B1 = B1, TEMP2 } { .mmi (p12) LD f65 = [A1], TEMP1 (p12) LD f77 = [A2], TEMP1 (p18) add B2 = B2, TEMP2 } ;; { .mmi (p13) ST [B1] = f58, 1 * SIZE (p13) ST [B2] = f70, 1 * SIZE adds TEMP3 = 5 * SIZE, TEMP3 } { .mmi (p16) LD f80 = [A1], SIZE (p16) LD f92 = [A2], SIZE adds TEMP1 = -3 * SIZE, LDA } ;; { .mmi (p13) ST [B1] = f61, 1 * SIZE (p13) ST [B2] = f73, 1 * SIZE nop __LINE__ } { .mmi (p16) LD f83 = [A1], SIZE (p16) LD f95 = [A2], SIZE (p14) mov TEMP1 = TEMP3 } ;; { .mmi (p13) ST [B1] = f64, 1 * SIZE (p13) ST [B2] = f76, 1 * SIZE nop __LINE__ } { .mmi (p16) LD f86 = [A1], SIZE (p16) LD f98 = [A2], SIZE (p12) mov TEMP1 = 5 * SIZE } ;; { .mmi (p13) ST [B1] = f67, 5 * SIZE (p13) ST [B2] = f79, 5 * SIZE (p14) mov LCOUNT = 0 } { .mmi (p16) LD f89 = [A1], TEMP1 (p16) LD f101 = [A2], TEMP1 (p15) mov SCOUNT = 0 } ;; { .mmi (p13) ST [B1] = f106, 1 * SIZE (p13) ST [B2] = f118, 1 * SIZE mov TEMP2 = 5 * SIZE } { .mmi (p12) LD f104 = [A1], SIZE (p12) LD f116 = [A2], SIZE nop __LINE__ } ;; { .mmi (p13) ST [B1] = f109, 1 * SIZE (p13) ST [B2] = f121, 1 * SIZE sub TEMP2 = TEMP2, LDB } { .mmi (p12) LD f107 = [A1], SIZE (p12) LD f119 = [A2], SIZE adds TEMP1 = 
-11 * SIZE, LDA } ;; { .mmi (p13) ST [B1] = f112, 1 * SIZE (p13) ST [B2] = f124, 1 * SIZE (p15) adds TEMP2 = -27 * SIZE, LDB } { .mmi (p12) LD f110 = [A1], SIZE (p12) LD f122 = [A2], SIZE (p14) mov TEMP1 = TEMP3 } ;; { .mmi (p13) ST [B1] = f115 (p13) ST [B2] = f127 (p13) add B1 = B1, TEMP2 } { .mmi (p12) LD f113 = [A1], TEMP1 (p12) LD f125 = [A2], TEMP1 (p13) add B2 = B2, TEMP2 } ;; { .mmb (p14) adds I = -2, I (p15) adds II = -2, II br.ctop.sptk .L112 } ;; .align 32 .L120: { .mmi add A2 = A1, LDA nop __LINE__ tbit.nz p7, p0 = N, 2 } ;; { .mmi (p7) LD f32 = [A1], SIZE (p7) LD f36 = [A2], SIZE tbit.nz p8, p0 = N, 1 } ;; { .mmi (p7) LD f33 = [A1], SIZE (p7) LD f37 = [A2], SIZE adds TEMP1 = -3 * SIZE, LDA } ;; { .mmi (p7) LD f34 = [A1], SIZE (p7) LD f38 = [A2], SIZE add TEMP1 = TEMP1, LDA } ;; { .mmi (p7) LD f35 = [A1], TEMP1 (p7) LD f39 = [A2], TEMP1 tbit.nz p9, p0 = N, 0 } ;; { .mmi (p7) LD f40 = [A1], SIZE (p7) LD f44 = [A2], SIZE mov TEMP2 = -1 * SIZE } ;; { .mmi (p7) LD f41 = [A1], SIZE (p7) LD f45 = [A2], SIZE shladd TEMP2 = LDA, 1, TEMP2 } ;; { .mmi (p7) LD f42 = [A1], SIZE (p7) LD f46 = [A2], SIZE sub TEMP2 = 0, TEMP2 } ;; { .mmi (p7) LD f43 = [A1], TEMP2 (p7) LD f47 = [A2] nop __LINE__ } ;; { .mmi add A2 = A1, LDA adds TEMP1 = -1 * SIZE, LDA mov TEMP2 = -1 * SIZE } ;; { .mmi (p8) LD f48 = [A1], SIZE (p8) LD f50 = [A2], SIZE add TEMP1 = TEMP1, LDA } ;; { .mmi (p8) LD f49 = [A1], TEMP1 (p8) LD f51 = [A2], TEMP1 shladd TEMP2 = LDA, 1, TEMP2 } ;; { .mmi (p8) LD f52 = [A1], SIZE (p8) LD f54 = [A2], SIZE sub TEMP2 = r0, TEMP2 } ;; { .mmi (p8) LD f53 = [A1], TEMP2 (p8) LD f55 = [A2], TEMP2 nop __LINE__ } ;; { .mmi add A2 = A1, LDA adds B2 = 4 * SIZE, BO2 nop __LINE__ } ;; { .mmi (p9) LD f56 = [A1] nop __LINE__ (p9) shladd A1 = LDA, 1, A1 } { .mmi (p9) LD f57 = [A2] nop __LINE__ (p9) shladd A2 = LDA, 1, A2 } ;; { .mmi (p7) ST [BO2] = f32, 1 * SIZE (p7) ST [B2 ] = f36, 1 * SIZE nop __LINE__ } { .mmi (p9) LD f58 = [A1] (p9) LD f59 = [A2] nop __LINE__ } ;; ;; { .mmi (p7) ST [BO2] = f33, 1 * SIZE (p7) ST [B2 ] = f37, 1 * SIZE nop __LINE__ } ;; { .mmi (p7) ST [BO2] = f34, 1 * SIZE (p7) ST [B2 ] = f38, 1 * SIZE nop __LINE__ } ;; { .mmi (p7) ST [BO2] = f35, 5 * SIZE (p7) ST [B2 ] = f39, 5 * SIZE nop __LINE__ } ;; { .mmi (p7) ST [BO2] = f40, 1 * SIZE (p7) ST [B2 ] = f44, 1 * SIZE nop __LINE__ } ;; { .mmi (p7) ST [BO2] = f41, 1 * SIZE (p7) ST [B2 ] = f45, 1 * SIZE nop __LINE__ } ;; { .mmi (p7) ST [BO2] = f42, 1 * SIZE (p7) ST [B2 ] = f46, 1 * SIZE nop __LINE__ } ;; { .mmi (p7) ST [BO2] = f43, 5 * SIZE (p7) ST [B2 ] = f47 adds B2 = 4 * SIZE, BO3 } ;; { .mmi (p8) ST [BO3] = f48, 1 * SIZE (p8) ST [B2 ] = f52, 1 * SIZE nop __LINE__ } ;; { .mmi (p8) ST [BO3] = f49, 1 * SIZE (p8) ST [B2 ] = f53, 1 * SIZE nop __LINE__ } ;; { .mmi (p8) ST [BO3] = f50, 1 * SIZE (p8) ST [B2 ] = f54, 1 * SIZE nop __LINE__ } ;; { .mmi (p8) ST [BO3] = f51, 5 * SIZE (p8) ST [B2 ] = f55 adds B2 = 2 * SIZE, BO4 } ;; { .mmi (p9) ST [BO4] = f56, 1 * SIZE (p9) ST [B2 ] = f58, 1 * SIZE nop __LINE__ } ;; { .mmi (p9) ST [BO4] = f57, 3 * SIZE (p9) ST [B2 ] = f59 nop __LINE__ } ;; .align 32 .L200: { .mmi add I = 8, N mov A1 = A mov pr.rot = 0 } { .mmi adds A2 = 4 * SIZE, A nop __LINE__ tbit.z p6, p0 = M, 1 } ;; { .mmi mov B1 = B cmp.eq p16, p0 = r0, r0 mov ar.ec = 3 } { .mib adds B2 = 4 * SIZE, B shr I = I, 4 (p6) br.cond.dpnt .L300 } ;; { .mmi shladd A = LDA, 1, A adds B = 16 * SIZE, B shr II = N, 3 } { .mmi cmp.eq p8, p0 = 0, I adds I = -1, I nop __LINE__ } ;; { .mmi nop __LINE__ nop __LINE__ mov ar.lc = I } { .mib mov I = II nop __LINE__ 
(p8) br.cond.dpnt .L220 } ;; .align 32 .L212: { .mmi (p18) ST [B1] = f34, 1 * SIZE (p18) ST [B2] = f46, 1 * SIZE (p16) cmp.ne.unc p12, p0 = 1, I } { .mmi (p16) LD f32 = [A1], SIZE (p16) LD f44 = [A2], SIZE (p18) cmp.ne.unc p13, p0 = 1, II } ;; { .mmi (p18) ST [B1] = f37, 1 * SIZE (p18) ST [B2] = f49, 1 * SIZE adds TEMP1 = -3 * SIZE, LDA } { .mmi (p16) LD f35 = [A1], SIZE (p16) LD f47 = [A2], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B1] = f40, 1 * SIZE (p18) ST [B2] = f52, 1 * SIZE (p12) mov TEMP1 = 5 * SIZE } { .mmi (p16) LD f38 = [A1], SIZE (p16) LD f50 = [A2], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B1] = f43, 5 * SIZE (p18) ST [B2] = f55, 5 * SIZE nop __LINE__ } { .mmi (p16) LD f41 = [A1], TEMP1 (p16) LD f53 = [A2], TEMP1 nop __LINE__ } ;; { .mmi (p18) ST [B1] = f82, 1 * SIZE (p18) ST [B2] = f94, 1 * SIZE nop __LINE__ } { .mmi (p12) LD f56 = [A1], SIZE (p12) LD f68 = [A2], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B1] = f85, 1 * SIZE (p18) ST [B2] = f97, 1 * SIZE mov TEMP2 = 5 * SIZE } { .mmi (p12) LD f59 = [A1], SIZE (p12) LD f71 = [A2], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B1] = f88, 1 * SIZE (p18) ST [B2] = f100, 1 * SIZE (p13) adds TEMP2 = - 11 * SIZE, LDB } { .mmi (p12) LD f62 = [A1], SIZE (p12) LD f74 = [A2], SIZE (p12) adds TEMP1 = - 11 * SIZE, LDA } ;; { .mmi (p18) ST [B1] = f91 (p18) ST [B2] = f103 (p18) add B1 = B1, TEMP2 } { .mmi (p12) LD f65 = [A1], TEMP1 (p12) LD f77 = [A2], TEMP1 (p18) add B2 = B2, TEMP2 } ;; { .mmi (p13) ST [B1] = f58, 1 * SIZE (p13) ST [B2] = f70, 1 * SIZE nop __LINE__ } { .mmi (p16) LD f80 = [A1], SIZE (p16) LD f92 = [A2], SIZE sub TEMP1 = r0, LDA } ;; { .mmi (p13) ST [B1] = f61, 1 * SIZE (p13) ST [B2] = f73, 1 * SIZE nop __LINE__ } { .mmi (p16) LD f83 = [A1], SIZE (p16) LD f95 = [A2], SIZE (p16) adds TEMP1 = 5 * SIZE, TEMP1 } ;; { .mmi (p13) ST [B1] = f64, 1 * SIZE (p13) ST [B2] = f76, 1 * SIZE nop __LINE__ } { .mmi (p16) LD f86 = [A1], SIZE (p16) LD f98 = [A2], SIZE (p12) mov TEMP1 = 5 * SIZE } ;; { .mmi (p13) ST [B1] = f67, 5 * SIZE (p13) ST [B2] = f79, 5 * SIZE nop __LINE__ } { .mmi (p16) LD f89 = [A1], TEMP1 (p16) LD f101 = [A2], TEMP1 adds TEMP1 = -11 * SIZE, LDA } ;; { .mmi (p13) ST [B1] = f106, 1 * SIZE (p13) ST [B2] = f118, 1 * SIZE mov TEMP2 = 5 * SIZE } { .mmi (p12) LD f104 = [A1], SIZE (p12) LD f116 = [A2], SIZE (p16) shladd TEMP1 = LDA, 1, r0 } ;; { .mmi (p13) ST [B1] = f109, 1 * SIZE (p13) ST [B2] = f121, 1 * SIZE sub TEMP2 = TEMP2, LDB } { .mmi (p12) LD f107 = [A1], SIZE (p12) LD f119 = [A2], SIZE (p16) sub TEMP1 = LDA, TEMP1 } ;; { .mmi (p13) ST [B1] = f112, 1 * SIZE (p13) ST [B2] = f124, 1 * SIZE (p18) adds TEMP2 = -11 * SIZE, LDB } { .mmi (p12) LD f110 = [A1], SIZE (p12) LD f122 = [A2], SIZE (p16) adds TEMP1 = 5 * SIZE, TEMP1 } ;; { .mmi (p13) ST [B1] = f115 (p13) ST [B2] = f127 (p13) add B1 = B1, TEMP2 } { .mmi (p12) LD f113 = [A1], TEMP1 (p12) LD f125 = [A2], TEMP1 (p13) add B2 = B2, TEMP2 } ;; { .mmb (p16) adds I = -2, I (p18) adds II = -2, II br.ctop.sptk .L212 } ;; .align 32 .L220: { .mmi add A2 = A1, LDA nop __LINE__ tbit.nz p7, p0 = N, 2 } ;; { .mmi (p7) LD f32 = [A1], SIZE (p7) LD f36 = [A2], SIZE tbit.nz p8, p0 = N, 1 } ;; { .mmi (p7) LD f33 = [A1], SIZE (p7) LD f37 = [A2], SIZE tbit.nz p9, p0 = N, 0 } ;; { .mmi (p7) LD f34 = [A1], SIZE (p7) LD f38 = [A2], SIZE nop __LINE__ } ;; { .mmi (p7) LD f35 = [A1], SIZE (p7) LD f39 = [A2] nop __LINE__ } ;; { .mmi add A2 = A1, LDA nop __LINE__ nop __LINE__ } ;; { .mmi (p8) LD f40 = [A1], SIZE (p8) LD f42 = [A2], SIZE nop __LINE__ } ;; { .mmi (p8) LD f41 = [A1], SIZE (p8) LD f43 = [A2] 
nop __LINE__ } ;; { .mmi add A2 = A1, LDA nop __LINE__ nop __LINE__ } ;; { .mmi (p9) LD f44 = [A1] (p9) LD f45 = [A2] adds B2 = 4 * SIZE, BO2 } ;; { .mmi (p7) ST [BO2] = f32, 1 * SIZE (p7) ST [B2 ] = f36, 1 * SIZE nop __LINE__ } ;; { .mmi (p7) ST [BO2] = f33, 1 * SIZE (p7) ST [B2 ] = f37, 1 * SIZE nop __LINE__ } ;; { .mmi (p7) ST [BO2] = f34, 1 * SIZE (p7) ST [B2 ] = f38, 1 * SIZE nop __LINE__ } ;; { .mmi (p7) ST [BO2] = f35, 5 * SIZE (p7) ST [B2 ] = f39 adds B2 = 2 * SIZE, BO3 } ;; { .mmi (p8) ST [BO3] = f40, 1 * SIZE (p8) ST [B2 ] = f42, 1 * SIZE nop __LINE__ } ;; { .mmi (p8) ST [BO3] = f41, 3 * SIZE (p8) ST [B2 ] = f43 adds B2 = 1 * SIZE, BO4 } ;; { .mmi (p9) ST [BO4] = f44, 2 * SIZE (p9) ST [B2 ] = f45 nop __LINE__ } ;; .align 32 .L300: { .mmi add I = 8, N mov A1 = A mov pr.rot = 0 } { .mmi mov B1 = B adds A2 = 4 * SIZE, A tbit.z p6, p0 = M, 0 } ;; { .mmi adds B2 = 4 * SIZE, B cmp.eq p16, p0 = r0, r0 mov ar.ec = 3 } { .mib nop __LINE__ shr I = I, 4 (p6) br.cond.dpnt .L999 } ;; { .mmi cmp.eq p8, p0 = 0, I adds I = -1, I shr II = N, 3 } ;; { .mmi nop __LINE__ nop __LINE__ mov ar.lc = I } { .mib nop __LINE__ mov I = II (p8) br.cond.dpnt .L320 } ;; .align 32 .L312: { .mmi (p18) ST [B1] = f34, 1 * SIZE (p18) ST [B2] = f46, 1 * SIZE (p16) cmp.ne.unc p12, p0 = 1, I } { .mmi (p16) LD f32 = [A1], SIZE (p16) LD f44 = [A2], SIZE (p18) cmp.ne.unc p13, p0 = 1, II } ;; { .mmi (p18) ST [B1] = f37, 1 * SIZE (p18) ST [B2] = f49, 1 * SIZE adds TEMP2 = - 3 * SIZE, LDB } { .mmi (p16) LD f35 = [A1], SIZE (p16) LD f47 = [A2], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B1] = f40, 1 * SIZE (p18) ST [B2] = f52, 1 * SIZE nop __LINE__ } { .mmi (p16) LD f38 = [A1], SIZE (p16) LD f50 = [A2], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B1] = f43 (p18) ST [B2] = f55 (p18) add B1 = B1, TEMP2 } { .mmi (p16) LD f41 = [A1], 5 * SIZE (p16) LD f53 = [A2], 5 * SIZE (p18) add B2 = B2, TEMP2 } ;; { .mmi (p13) ST [B1] = f58, 1 * SIZE (p13) ST [B2] = f70, 1 * SIZE (p16) adds I = -2, I } { .mmi (p12) LD f56 = [A1], SIZE (p12) LD f68 = [A2], SIZE (p18) adds II = -2, II } ;; { .mmi (p13) ST [B1] = f61, 1 * SIZE (p13) ST [B2] = f73, 1 * SIZE nop __LINE__ } { .mmi (p12) LD f59 = [A1], SIZE (p12) LD f71 = [A2], SIZE nop __LINE__ } ;; { .mmi (p13) ST [B1] = f64, 1 * SIZE (p13) ST [B2] = f76, 1 * SIZE nop __LINE__ } { .mmi (p12) LD f62 = [A1], SIZE (p12) LD f74 = [A2], SIZE nop __LINE__ } ;; { .mmi (p13) ST [B1] = f67 (p13) ST [B2] = f79 (p13) add B1 = B1, TEMP2 } { .mmi (p12) LD f65 = [A1], 5 * SIZE (p12) LD f77 = [A2], 5 * SIZE (p13) add B2 = B2, TEMP2 } ;; { .mmb nop __LINE__ nop __LINE__ br.ctop.sptk .L312 } ;; .align 32 .L320: { .mmi adds A2 = 2 * SIZE, A1 adds B2 = 2 * SIZE, BO2 tbit.nz p7, p0 = N, 2 } ;; { .mmi (p7) LD f32 = [A1], SIZE (p7) LD f34 = [A2], SIZE tbit.nz p8, p0 = N, 1 } ;; { .mmi (p7) LD f33 = [A1], 3 * SIZE (p7) LD f35 = [A2] nop __LINE__ } ;; { .mmi adds A2 = SIZE, A1 nop __LINE__ nop __LINE__ } ;; { .mmi (p8) LD f36 = [A1], 2 * SIZE (p8) LD f37 = [A2] tbit.nz p9, p0 = N, 0 } ;; { .mmi (p9) LD f38 = [A1] nop __LINE__ nop __LINE__ } ;; { .mmi (p7) ST [BO2] = f32, 1 * SIZE (p7) ST [B2 ] = f34, 1 * SIZE nop __LINE__ } ;; { .mmi (p7) ST [BO2] = f33, 3 * SIZE (p7) ST [B2 ] = f35 adds B2 = SIZE, BO3 } ;; { .mmi (p8) ST [BO3] = f36, 2 * SIZE (p8) ST [B2 ] = f37 nop __LINE__ } ;; { .mmi (p9) ST [BO4] = f38, 1 * SIZE nop __LINE__ nop __LINE__ } ;; .align 32 .L999: mov pr = PR, -1 mov ar.lc = ARLC br.ret.sptk.many b0 EPILOGUE 
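The two copy kernels above (gemm_ncopy.S and gemm_tcopy.S) only rearrange data; the rotating registers, predicates and lfetch hints all serve one simple packing order. The plain-C sketch below is an illustrative reference for the ncopy case as read from the generic C copy kernels: the m x n column-major panel is written to the contiguous buffer in column groups of 8, then power-of-two tail groups of 4, 2 and 1, with each group's elements of one row stored next to each other. It is not part of the distribution; the name gemm_ncopy_ref, the use of double precision and the fixed unroll of 8 are assumptions for this sketch only. gemm_tcopy.S appears to produce the corresponding layout for the transposed panel (its prologue takes J = M >> 3 and keeps the 4-, 2- and 1-column leftover offsets in BO2, BO3 and BO4), which the sketch does not attempt to reproduce.

/*
 * Reference sketch (plain C, double precision assumed) of the packing
 * order produced by a GEMM "ncopy" routine with column unroll 8:
 * the m x n column-major panel a (leading dimension lda) is copied to
 * the contiguous buffer b so that, within each column group, the
 * group's elements of one row sit next to each other.  Full groups of
 * 8 columns come first, then power-of-two tail groups of 4, 2 and 1.
 * Only the data movement is shown; the assembly adds software
 * pipelining and prefetching on top of this order.
 */
void gemm_ncopy_ref(long m, long n, const double *a, long lda, double *b)
{
    long j = 0;                            /* next unpacked column     */

    for (int u = 8; u >= 1; u >>= 1) {     /* group width: 8, 4, 2, 1  */
        while (n - j >= u) {
            for (long i = 0; i < m; i++)   /* one row of this group    */
                for (int jj = 0; jj < u; jj++)
                    *b++ = a[i + (j + jj) * lda];
            j += u;
        }
    }
}

The point of this layout is that the compute kernel can then stream the packed operand with unit stride, which is what the sequential LDFPD/LDFD loads from AOFFSET and BOFFSET in the kernels above rely on.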
OpenBLAS-0.2.20/kernel/ia64/gemv_n.S000066400000000000000000001725261313527062700166030ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define SP r12 #define M r32 #define N r33 #define A r36 #define LDA r37 #define X r38 #define INCX r39 #define Y r34 #define INCY r35 #define BUFFER r11 #define I r14 #define J r15 #define AO1 r16 #define AO2 r17 #define AO3 r18 #define AO4 r19 #define AO5 r20 #define AO6 r21 #define AO7 r22 #define AO8 r23 #define YLD1 r24 #define YST1 r25 #define YST2 r27 #define MM r28 #define YY r9 #define RPRE1 loc0 #define RPRE2 loc1 #define RPRE3 loc2 #define RPRE4 loc3 #define RPRE5 loc4 #define RPRE6 loc5 #define RPRE7 loc6 #define RPRE8 loc7 #define AO11 loc8 #define AO21 loc9 #define AO31 loc10 #define AO41 loc11 #define AO51 loc12 #define AO61 loc13 #define AO71 loc14 #define AO81 loc15 #define PREB r8 #define ARLC r29 #define PR r30 #define ARPFS r31 #ifdef DOUBLE #define RPREFETCH (16 * 3 + 8) #else #define RPREFETCH (16 * 3 + 16) #endif #define PREFETCH lfetch.nt1 #define ALPHA f6 PROLOGUE .prologue PROFCODE { .mmi .save ar.pfs, ARPFS alloc ARPFS = ar.pfs, 8, 16, 8, 0 mov ARLC = ar.lc } ;; mov PR = pr adds r14 = 16, SP adds r15 = 24, SP adds r16 = 32, SP ;; adds r8 = -8 * 16, SP adds r9 = -7 * 16, SP adds SP = -8 * 16, SP ;; stf.spill [r8] = f16, 32 stf.spill [r9] = f17, 32 ;; stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 ;; stf.spill [r8] = f20, 32 stf.spill [r9] = f21, 32 ;; stf.spill [r8] = f22 stf.spill [r9] = f23 .body ;; ld8 Y = [r14] ld8 INCY = [r15] ld8 BUFFER = [r16] mov ALPHA = f8 cmp.ge p7, p0 = 0, M cmp.ge p6, p0 = 0, N ;; shladd INCX = INCX, BASE_SHIFT, r0 shladd LDA = LDA, BASE_SHIFT, r0 shladd INCY = INCY, BASE_SHIFT, r0 ;; tbit.nz p8, p0 = A, BASE_SHIFT tbit.nz p9, p0 = LDA, BASE_SHIFT mov MM = M ;; (p8) adds MM = -1, M ;; (p7) br.cond.dpnt .L999 (p6) br.cond.dpnt .L999 ;; sub I = A, Y cmp.eq p10, p0 = SIZE, INCY mov YY = Y ;; (p10) tbit.z.unc p10, p0 = I, BASE_SHIFT ;; (p10) br.cond.dptk .L10 ;; shr J = M, 3 mov YY = BUFFER ;; (p8) adds YY = SIZE, BUFFER ;; mov ar.lc = J mov YST1 = YY adds YST2 = 4 * SIZE, YY ;; .L02: STFD [YST1] = f0, 1 * SIZE STFD [YST2] = f0, 1 * SIZE ;; STFD [YST1] = f0, 1 * SIZE STFD [YST2] = f0, 1 * SIZE ;; STFD [YST1] = f0, 1 * SIZE STFD [YST2] = f0, 1 * SIZE ;; STFD [YST1] = f0, 5 * SIZE STFD [YST2] = f0, 5 * SIZE br.cloop.sptk.few .L02 ;; .L10: { .mib nop __LINE__ shr J = N, 3 (p9) br.cond.dptk .L100 } ;; { .mib nop __LINE__ cmp.eq p6, p0 = r0, J (p6) br.cond.dpnt .L20 } ;; .align 16 .L11: mov YLD1 = YY mov YST1 = YY adds YST2 = 4 * SIZE, YY ;; LDFD f8 = [X], INCX ;; LDFD f9 = [X], INCX ;; LDFD f10 = [X], INCX ;; LDFD f11 = [X], INCX ;; LDFD f12 = [X], INCX ;; LDFD f13 = [X], INCX ;; LDFD f14 = [X], INCX ;; LDFD f15 = [X], INCX ;; FMPY f8 = ALPHA, f8 FMPY f9 = ALPHA, f9 FMPY f10 = ALPHA, f10 FMPY f11 = ALPHA, f11 FMPY f12 = ALPHA, f12 FMPY f13 = ALPHA, f13 FMPY f14 = ALPHA, f14 FMPY f15 = ALPHA, f15 ;; mov AO1 = A add AO2 = LDA, A ;; shladd AO3 = LDA, 1, A shladd AO4 = LDA, 1, AO2 ;; shladd AO5 = LDA, 1, AO3 shladd AO6 = LDA, 1, AO4 ;; shladd AO7 = LDA, 1, AO5 shladd AO8 = LDA, 1, AO6 shladd A = LDA, 3, A ;; ;; adds PREB = RPREFETCH * SIZE, YLD1 adds RPRE1 = RPREFETCH * SIZE, AO1 adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 adds RPRE3 = RPREFETCH * SIZE, AO3 adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 adds RPRE5 = RPREFETCH * SIZE, AO5 adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6 adds RPRE7 = RPREFETCH * SIZE, AO7 adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8 (p8) LDFD f80 = [AO1], 1 * SIZE (p8) LDFD f81 = [AO2], 1 * SIZE (p8) LDFD f82 = [AO3], 1 * 
SIZE (p8) LDFD f83 = [AO4], 1 * SIZE (p8) LDFD f84 = [AO5], 1 * SIZE (p8) LDFD f85 = [AO6], 1 * SIZE (p8) LDFD f86 = [AO7], 1 * SIZE (p8) LDFD f87 = [AO8], 1 * SIZE (p8) LDFD f106 = [YLD1], 1 * SIZE ;; (p8) FMPY f32 = f8, f80 (p8) FMPY f33 = f9, f81 (p8) FMPY f34 = f10, f82 (p8) FMA f35 = f11, f83, f106 ;; (p8) FMA f32 = f12, f84, f32 (p8) FMA f33 = f13, f85, f33 (p8) FMA f34 = f14, f86, f34 (p8) FMA f35 = f15, f87, f35 ;; (p8) FADD f32 = f32, f33 (p8) FADD f34 = f34, f35 ;; (p8) FADD f32 = f32, f34 ;; (p8) STFD [YST1] = f32, 1 * SIZE (p8) adds YST2 = 1 * SIZE, YST2 ;; shr I = MM, 3 mov pr.rot= 0 ;; cmp.eq p6, p0 = 0, I cmp.eq p16, p0 = r0, r0 ;; adds I = -1, I tbit.nz p13, p0 = MM, 2 ;; mov ar.lc = I mov ar.ec = 2 (p6) br.cond.dpnt .L15 ;; .align 16 .L12: { .mmf (p18) STFD [YST1] = f16, 1 * SIZE (p18) STFD [YST2] = f17, 1 * SIZE (p17) FMA f16 = f8, f33, f101 } { .mfi (p17) LDFPD f93, f94 = [AO8], 2 * SIZE (p17) FMA f17 = f8, f37, f113 (p16) tbit.nz.unc p14, p15 = I, 0 } ;; { .mmf (p18) STFD [YST1] = f18, 1 * SIZE (p18) STFD [YST2] = f19, 1 * SIZE (p17) FMA f18 = f8, f34, f104 } { .mmf (p14) lfetch.excl.nt1 [PREB], 16 * SIZE (p17) LDFPD f95, f96 = [AO8], 2 * SIZE (p17) FMA f19 = f8, f38, f116 } ;; { .mmf (p18) STFD [YST1] = f20, 1 * SIZE (p18) STFD [YST2] = f21, 1 * SIZE (p17) FMA f20 = f8, f35, f107 } { .mfi (p16) LDFPD f32, f33 = [AO1], 2 * SIZE (p17) FMA f21 = f8, f39, f119 nop __LINE__ } ;; { .mmf (p18) STFD [YST1] = f22, 5 * SIZE (p18) STFD [YST2] = f23, 5 * SIZE (p17) FMA f22 = f8, f36, f110 } { .mmf (p16) LDFPD f34, f35 = [AO1], 2 * SIZE (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE (p17) FMA f23 = f8, f40, f122 } ;; { .mmf (p14) PREFETCH [RPRE1], 16 * SIZE (p16) LDFPD f36, f37 = [AO1], 2 * SIZE (p17) FMA f16 = f9, f41, f16 } { .mfi (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE (p17) FMA f17 = f9, f45, f17 nop __LINE__ } ;; { .mfi (p16) LDFPD f38, f39 = [AO1], 2 * SIZE (p17) FMA f18 = f9, f42, f18 nop __LINE__ } { .mfi (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE (p17) FMA f19 = f9, f46, f19 nop __LINE__ } ;; { .mfi (p16) LDFPD f40, f41 = [AO2], 2 * SIZE (p17) FMA f20 = f9, f43, f20 nop __LINE__ } { .mfi (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE (p17) FMA f21 = f9, f47, f21 nop __LINE__ } ;; { .mfi (p16) LDFPD f42, f43 = [AO2], 2 * SIZE (p17) FMA f22 = f9, f44, f22 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f23 = f9, f48, f23 nop __LINE__ } ;; { .mfi (p16) LDFPD f44, f45 = [AO2], 2 * SIZE (p17) FMA f16 = f10, f49, f16 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f17 = f10, f53, f17 nop __LINE__ } ;; { .mmf (p15) PREFETCH [RPRE2], 16 * SIZE (p16) LDFPD f46, f47 = [AO2], 2 * SIZE (p17) FMA f18 = f10, f50, f18 } { .mfi nop __LINE__ (p17) FMA f19 = f10, f54, f19 nop __LINE__ } ;; { .mfi (p16) LDFPD f48, f49 = [AO3], 2 * SIZE (p17) FMA f20 = f10, f51, f20 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f21 = f10, f55, f21 nop __LINE__ } ;; { .mfi (p16) LDFPD f50, f51 = [AO3], 2 * SIZE (p17) FMA f22 = f10, f52, f22 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f23 = f10, f56, f23 nop __LINE__ } ;; { .mmf (p14) PREFETCH [RPRE3], 16 * SIZE (p16) LDFPD f52, f53 = [AO3], 2 * SIZE (p17) FMA f16 = f11, f57, f16 } { .mfi nop __LINE__ (p17) FMA f17 = f11, f61, f17 nop __LINE__ } ;; { .mfi (p16) LDFPD f54, f55 = [AO3], 2 * SIZE (p17) FMA f18 = f11, f58, f18 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f19 = f11, f62, f19 nop __LINE__ } ;; { .mfi (p16) LDFPD f56, f57 = [AO4], 2 * SIZE (p17) FMA f20 = f11, f59, f20 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f21 = f11, f63, f21 nop __LINE__ } ;; { .mmf (p15) 
PREFETCH [RPRE4], 16 * SIZE (p16) LDFPD f58, f59 = [AO4], 2 * SIZE (p17) FMA f22 = f11, f60, f22 } { .mfi nop __LINE__ (p17) FMA f23 = f11, f64, f23 nop __LINE__ } ;; { .mfi (p16) LDFPD f60, f61 = [AO4], 2 * SIZE (p17) FMA f16 = f12, f65, f16 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f17 = f12, f69, f17 nop __LINE__ } ;; { .mfi (p16) LDFPD f62, f63 = [AO4], 2 * SIZE (p17) FMA f18 = f12, f66, f18 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f19 = f12, f70, f19 nop __LINE__ } ;; { .mfi (p16) LDFPD f64, f65 = [AO5], 2 * SIZE (p17) FMA f20 = f12, f67, f20 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f21 = f12, f71, f21 nop __LINE__ } ;; { .mmf (p14) PREFETCH [RPRE5], 16 * SIZE (p16) LDFPD f66, f67 = [AO5], 2 * SIZE (p17) FMA f22 = f12, f68, f22 } { .mfi nop __LINE__ (p17) FMA f23 = f12, f72, f23 nop __LINE__ } ;; { .mfi (p16) LDFPD f68, f69 = [AO5], 2 * SIZE (p17) FMA f16 = f13, f73, f16 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f17 = f13, f77, f17 nop __LINE__ } ;; { .mfi (p16) LDFPD f70, f71 = [AO5], 2 * SIZE (p17) FMA f18 = f13, f74, f18 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f19 = f13, f78, f19 nop __LINE__ } ;; { .mfi (p16) LDFPD f72, f73 = [AO6], 2 * SIZE (p17) FMA f20 = f13, f75, f20 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f21 = f13, f79, f21 nop __LINE__ } ;; { .mmf (p15) PREFETCH [RPRE6], 16 * SIZE (p16) LDFPD f74, f75 = [AO6], 2 * SIZE (p17) FMA f22 = f13, f76, f22 } { .mfi nop __LINE__ (p17) FMA f23 = f13, f80, f23 nop __LINE__ } ;; { .mfi (p16) LDFPD f76, f77 = [AO6], 2 * SIZE (p17) FMA f16 = f14, f81, f16 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f17 = f14, f85, f17 nop __LINE__ } ;; { .mfi (p16) LDFPD f78, f79 = [AO6], 2 * SIZE (p17) FMA f18 = f14, f82, f18 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f19 = f14, f86, f19 nop __LINE__ } ;; { .mfi (p16) LDFPD f80, f81 = [AO7], 2 * SIZE (p17) FMA f20 = f14, f83, f20 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f21 = f14, f87, f21 nop __LINE__ } ;; { .mmf (p14) PREFETCH [RPRE7], 16 * SIZE (p16) LDFPD f82, f83 = [AO7], 2 * SIZE (p17) FMA f22 = f14, f84, f22 } { .mfi nop __LINE__ (p17) FMA f23 = f14, f88, f23 nop __LINE__ } ;; { .mfi (p16) LDFPD f84, f85 = [AO7], 2 * SIZE (p17) FMA f16 = f15, f89, f16 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f17 = f15, f93, f17 nop __LINE__ } ;; { .mfi (p16) LDFPD f86, f87 = [AO7], 2 * SIZE (p17) FMA f18 = f15, f90, f18 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f19 = f15, f94, f19 nop __LINE__ } ;; { .mfi (p16) LDFPD f88, f89 = [AO8], 2 * SIZE (p17) FMA f20 = f15, f91, f20 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f21 = f15, f95, f21 (p16) adds I = -1, I } ;; { .mmf (p15) PREFETCH [RPRE8], 16 * SIZE (p16) LDFPD f90, f91 = [AO8], 2 * SIZE (p17) FMA f22 = f15, f92, f22 } { .mfb nop __LINE__ (p17) FMA f23 = f15, f96, f23 br.ctop.sptk.few .L12 } ;; .align 16 .L15: { .mmi (p18) STFD [YST1] = f16, 1 * SIZE (p18) STFD [YST2] = f17, 1 * SIZE tbit.nz p14, p0 = MM, 1 } { .mmi (p13) LDFPD f32, f33 = [AO1], 2 * SIZE (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE cmp.lt p6, p0 = 1, J } ;; { .mmi (p18) STFD [YST1] = f18, 1 * SIZE (p18) STFD [YST2] = f19, 1 * SIZE tbit.nz p15, p0 = MM, 0 } { .mmi (p13) LDFPD f48, f49 = [AO1], 2 * SIZE (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE adds J = -1, J } ;; { .mmi (p18) STFD [YST1] = f20, 1 * SIZE (p18) STFD [YST2] = f21, 1 * SIZE nop __LINE__ } { .mmi (p14) LDFPD f64, f65 = [AO1], 2 * SIZE (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE nop __LINE__ } ;; { .mmi (p18) STFD [YST1] = f22, 5 * SIZE (p18) STFD [YST2] = f23, 5 * SIZE nop __LINE__ } { .mmi (p13) 
LDFPD f34, f35 = [AO2], 2 * SIZE (p13) LDFPD f36, f37 = [AO3], 2 * SIZE nop __LINE__ } ;; { .mmi (p15) LDFD f80 = [AO1] (p15) LDFD f106 = [YLD1], 1 * SIZE nop __LINE__ } { .mmi nop __LINE__ nop __LINE__ } ;; { .mmi (p13) LDFPD f50, f51 = [AO2], 2 * SIZE (p13) LDFPD f52, f53 = [AO3], 2 * SIZE nop __LINE__ } { .mmi nop __LINE__ nop __LINE__ } ;; { .mmi (p14) LDFPD f66, f67 = [AO2], 2 * SIZE (p14) LDFPD f68, f69 = [AO3], 2 * SIZE nop __LINE__ } { .mmi nop __LINE__ nop __LINE__ } ;; { .mmi (p15) LDFD f81 = [AO2] (p15) LDFD f82 = [AO3] nop __LINE__ } { .mmi nop __LINE__ nop __LINE__ } ;; { .mfi (p13) LDFPD f38, f39 = [AO4], 2 * SIZE (p13) FMA f100 = f8, f32, f100 nop __LINE__ } { .mfi (p13) LDFPD f40, f41 = [AO5], 2 * SIZE (p13) FMA f101 = f8, f33, f101 nop __LINE__ } ;; { .mfi (p13) LDFPD f54, f55 = [AO4], 2 * SIZE (p13) FMA f102 = f8, f48, f102 nop __LINE__ } { .mfi (p13) LDFPD f56, f57 = [AO5], 2 * SIZE (p13) FMA f103 = f8, f49, f103 nop __LINE__ } ;; { .mfi (p14) LDFPD f70, f71 = [AO4], 2 * SIZE (p14) FMA f104 = f8, f64, f104 nop __LINE__ } { .mfi (p14) LDFPD f72, f73 = [AO5], 2 * SIZE (p14) FMA f105 = f8, f65, f105 nop __LINE__ } ;; { .mfi (p15) LDFD f83 = [AO4] (p15) FMA f106 = f8, f80, f106 nop __LINE__ } { .mfi (p15) LDFD f84 = [AO5] nop __LINE__ nop __LINE__ } ;; { .mfi (p13) LDFPD f42, f43 = [AO6], 2 * SIZE (p13) FMA f100 = f9, f34, f100 nop __LINE__ } { .mfi (p13) LDFPD f44, f45 = [AO7], 2 * SIZE (p13) FMA f101 = f9, f35, f101 nop __LINE__ } ;; { .mfi (p13) LDFPD f58, f59 = [AO6], 2 * SIZE (p13) FMA f102 = f9, f50, f102 nop __LINE__ } { .mfi (p13) LDFPD f60, f61 = [AO7], 2 * SIZE (p13) FMA f103 = f9, f51, f103 nop __LINE__ } ;; { .mfi (p14) LDFPD f74, f75 = [AO6], 2 * SIZE (p14) FMA f104 = f9, f66, f104 nop __LINE__ } { .mfi (p14) LDFPD f76, f77 = [AO7], 2 * SIZE (p14) FMA f105 = f9, f67, f105 nop __LINE__ } ;; { .mfi (p15) LDFD f85 = [AO6] (p15) FMA f106 = f9, f81, f106 nop __LINE__ } { .mfi (p15) LDFD f86 = [AO7] nop __LINE__ nop __LINE__ } ;; { .mfi (p13) LDFPD f46, f47 = [AO8], 2 * SIZE (p13) FMA f100 = f10, f36, f100 nop __LINE__ } { .mfi (p13) FMA f101 = f10, f37, f101 nop __LINE__ } ;; { .mfi (p13) LDFPD f62, f63 = [AO8], 2 * SIZE (p13) FMA f102 = f10, f52, f102 nop __LINE__ } { .mfi (p13) FMA f103 = f10, f53, f103 nop __LINE__ } ;; { .mfi (p14) LDFPD f78, f79 = [AO8], 2 * SIZE (p14) FMA f104 = f10, f68, f104 nop __LINE__ } { .mfi (p14) FMA f105 = f10, f69, f105 nop __LINE__ } ;; { .mfi (p15) LDFD f87 = [AO8] (p15) FMA f106 = f10, f82, f106 nop __LINE__ } ;; (p13) FMA f100 = f11, f38, f100 (p13) FMA f101 = f11, f39, f101 (p13) FMA f102 = f11, f54, f102 (p13) FMA f103 = f11, f55, f103 (p14) FMA f104 = f11, f70, f104 (p14) FMA f105 = f11, f71, f105 (p15) FMA f106 = f11, f83, f106 ;; (p13) FMA f100 = f12, f40, f100 (p13) FMA f101 = f12, f41, f101 (p13) FMA f102 = f12, f56, f102 (p13) FMA f103 = f12, f57, f103 (p14) FMA f104 = f12, f72, f104 (p14) FMA f105 = f12, f73, f105 (p15) FMA f106 = f12, f84, f106 ;; (p13) FMA f100 = f13, f42, f100 (p13) FMA f101 = f13, f43, f101 (p13) FMA f102 = f13, f58, f102 (p13) FMA f103 = f13, f59, f103 (p14) FMA f104 = f13, f74, f104 (p14) FMA f105 = f13, f75, f105 (p15) FMA f106 = f13, f85, f106 ;; (p13) FMA f100 = f14, f44, f100 (p13) FMA f101 = f14, f45, f101 (p13) FMA f102 = f14, f60, f102 (p13) FMA f103 = f14, f61, f103 (p14) FMA f104 = f14, f76, f104 (p14) FMA f105 = f14, f77, f105 (p15) FMA f106 = f14, f86, f106 ;; (p13) FMA f100 = f15, f46, f100 (p13) FMA f101 = f15, f47, f101 (p13) FMA f102 = f15, f62, f102 (p13) FMA f103 = f15, f63, f103 
(p14) FMA f104 = f15, f78, f104 (p14) FMA f105 = f15, f79, f105 (p15) FMA f106 = f15, f87, f106 ;; (p13) STFD [YST1] = f100, 1 * SIZE ;; (p13) STFD [YST1] = f101, 1 * SIZE ;; (p13) STFD [YST1] = f102, 1 * SIZE ;; (p13) STFD [YST1] = f103, 1 * SIZE ;; (p14) STFD [YST1] = f104, 1 * SIZE ;; (p14) STFD [YST1] = f105, 1 * SIZE ;; (p15) STFD [YST1] = f106, 1 * SIZE (p6) br.cond.dptk .L11 ;; .align 16 .L20: { .mmi mov YLD1 = YY mov YST1 = YY tbit.z p6, p0 = N, 2 } ;; { .mib mov AO1 = A mov pr.rot= 0 (p6) br.cond.dpnt .L30 } ;; { .mmi LDFD f8 = [X], INCX (p8) LDFD f106 = [YLD1], 1 * SIZE add AO2 = LDA, A } ;; { .mmi LDFD f9 = [X], INCX (p8) LDFD f80 = [AO1], 1 * SIZE shladd AO4 = LDA, 1, AO2 } ;; { .mmi LDFD f10 = [X], INCX (p8) LDFD f81 = [AO2], 1 * SIZE shladd AO3 = LDA, 1, A } ;; { .mmi LDFD f11 = [X], INCX (p8) LDFD f82 = [AO3], 1 * SIZE } ;; { .mfi (p8) LDFD f83 = [AO4], 1 * SIZE FMPY f8 = ALPHA, f8 adds PREB = RPREFETCH * SIZE, YLD1 } { .mfi adds RPRE1 = RPREFETCH * SIZE, AO1 FMPY f9 = ALPHA, f9 adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 } ;; FMPY f10 = ALPHA, f10 shladd A = LDA, 2, A FMPY f11 = ALPHA, f11 ;; { .mfi adds RPRE3 = RPREFETCH * SIZE, AO3 (p8) FMA f106 = f8, f80, f106 mov ar.ec= 2 } ;; adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 (p8) FMA f106 = f9, f81, f106 shr I = MM, 3 ;; { .mmf cmp.eq p6, p0 = 0, I cmp.eq p16, p0 = r0, r0 (p8) FMA f106 = f10, f82, f106 } ;; { .mfi adds I = -1, I (p8) FMA f106 = f11, f83, f106 tbit.nz p13, p0 = MM, 2 } ;; { .mib (p8) STFD [YST1] = f106, 1 * SIZE mov ar.lc = I (p6) br.cond.dpnt .L25 } ;; .align 16 .L22: { .mfi (p17) LDFPD f63, f64 = [AO4], 2 * SIZE (p17) FMA f101 = f8, f33, f101 (p16) tbit.nz.unc p14, p15 = I, 0 } { .mfi (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE (p17) FMA f104 = f8, f34, f104 } ;; { .mfi (p16) LDFPD f32, f33 = [AO1], 2 * SIZE (p17) FMA f107 = f8, f35, f107 (p16) adds I = -1, I } { .mfi (p14) PREFETCH [RPRE1], 16 * SIZE (p17) FMA f110 = f8, f36, f110 } ;; { .mfi (p16) LDFPD f34, f35 = [AO1], 2 * SIZE (p17) FMA f113 = f8, f37, f113 } { .mfi (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE (p17) FMA f116 = f8, f38, f116 } ;; { .mfi (p16) LDFPD f36, f37 = [AO1], 2 * SIZE (p17) FMA f119 = f8, f39, f119 } { .mfi (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE (p17) FMA f122 = f8, f40, f122 } ;; { .mfi (p16) LDFPD f38, f39 = [AO1], 2 * SIZE (p17) FMA f101 = f9, f41, f101 } { .mfi (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE (p17) FMA f104 = f9, f42, f104 } ;; { .mmf (p16) LDFPD f40, f41 = [AO2], 2 * SIZE (p15) PREFETCH [RPRE2], 16 * SIZE (p17) FMA f107 = f9, f43, f107 } { .mfi (p18) STFD [YST1] = f16, 1 * SIZE (p17) FMA f110 = f9, f44, f110 } ;; { .mfi (p16) LDFPD f42, f43 = [AO2], 2 * SIZE (p17) FMA f113 = f9, f45, f113 } { .mfi (p18) STFD [YST1] = f17, 1 * SIZE (p17) FMA f116 = f9, f46, f116 } ;; { .mfi (p16) LDFPD f44, f45 = [AO2], 2 * SIZE (p17) FMA f119 = f9, f47, f119 } { .mfi (p18) STFD [YST1] = f18, 1 * SIZE (p17) FMA f122 = f9, f48, f122 } ;; { .mfi (p16) LDFPD f46, f47 = [AO2], 2 * SIZE (p17) FMA f101 = f10, f49, f101 } { .mfi (p14) lfetch.excl.nt2 [PREB], 16 * SIZE (p17) FMA f104 = f10, f50, f104 } ;; { .mfi (p16) LDFPD f48, f49 = [AO3], 2 * SIZE (p17) FMA f107 = f10, f51, f107 } { .mfi (p14) PREFETCH [RPRE3], 16 * SIZE (p17) FMA f110 = f10, f52, f110 } ;; { .mfi (p16) LDFPD f50, f51 = [AO3], 2 * SIZE (p17) FMA f113 = f10, f53, f113 } { .mfi (p18) STFD [YST1] = f19, 1 * SIZE (p17) FMA f116 = f10, f54, f116 } ;; { .mfi (p16) LDFPD f52, f53 = [AO3], 2 * SIZE (p17) FMA f119 = f10, f55, f119 } { .mfi (p18) STFD [YST1] = f20, 1 * SIZE (p17) FMA f122 
= f10, f56, f122 } ;; { .mfi (p16) LDFPD f54, f55 = [AO3], 2 * SIZE (p17) FMA f16 = f11, f57, f101 } { .mfi (p15) PREFETCH [RPRE4], 16 * SIZE (p17) FMA f17 = f11, f58, f104 } ;; { .mfi (p16) LDFPD f56, f57 = [AO4], 2 * SIZE (p17) FMA f18 = f11, f59, f107 } { .mfi (p18) STFD [YST1] = f21, 1 * SIZE (p17) FMA f19 = f11, f60, f110 } ;; { .mfi (p16) LDFPD f58, f59 = [AO4], 2 * SIZE (p17) FMA f20 = f11, f61, f113 } { .mfi (p18) STFD [YST1] = f22, 1 * SIZE (p17) FMA f21 = f11, f62, f116 } ;; { .mfi (p16) LDFPD f60, f61 = [AO4], 2 * SIZE (p17) FMA f22 = f11, f63, f119 } { .mfb (p18) STFD [YST1] = f23, 1 * SIZE (p17) FMA f23 = f11, f64, f122 br.ctop.sptk.few .L22 } ;; .align 16 .L25: { .mmi (p13) LDFPD f32, f33 = [AO1], 2 * SIZE (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE tbit.nz p14, p0 = MM, 1 } { .mmi (p18) STFD [YST1] = f16, 1 * SIZE } ;; { .mmi (p13) LDFPD f48, f49 = [AO1], 2 * SIZE (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE tbit.nz p15, p0 = MM, 0 } { .mmi (p18) STFD [YST1] = f17, 1 * SIZE } ;; { .mmi (p14) LDFPD f64, f65 = [AO1], 2 * SIZE (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE } { .mmi (p18) STFD [YST1] = f18, 1 * SIZE } ;; { .mmi (p15) LDFD f80 = [AO1] (p15) LDFD f106 = [YLD1], 1 * SIZE } { .mmi (p18) STFD [YST1] = f19, 1 * SIZE } ;; { .mmi (p13) LDFPD f34, f35 = [AO2], 2 * SIZE (p13) LDFPD f36, f37 = [AO3], 2 * SIZE } { .mmi (p18) STFD [YST1] = f20, 1 * SIZE } ;; { .mmi (p13) LDFPD f50, f51 = [AO2], 2 * SIZE (p13) LDFPD f52, f53 = [AO3], 2 * SIZE } { .mmi (p18) STFD [YST1] = f21, 1 * SIZE } ;; { .mmi (p14) LDFPD f66, f67 = [AO2], 2 * SIZE (p14) LDFPD f68, f69 = [AO3], 2 * SIZE } { .mmi (p18) STFD [YST1] = f22, 1 * SIZE } ;; { .mmf (p15) LDFD f81 = [AO2] (p15) LDFD f82 = [AO3] (p13) FMA f100 = f8, f32, f100 } { .mfi (p18) STFD [YST1] = f23, 1 * SIZE (p13) FMA f101 = f8, f33, f101 } ;; ;; { .mfi (p13) LDFPD f38, f39 = [AO4], 2 * SIZE (p13) FMA f102 = f8, f48, f102 } { .mfi (p13) FMA f103 = f8, f49, f103 } ;; { .mfi (p13) LDFPD f54, f55 = [AO4], 2 * SIZE (p14) FMA f104 = f8, f64, f104 } { .mfi (p14) FMA f105 = f8, f65, f105 } ;; { .mfi (p14) LDFPD f70, f71 = [AO4], 2 * SIZE (p15) FMA f106 = f8, f80, f106 } { .mfi (p13) FMA f100 = f9, f34, f100 } ;; { .mfi (p15) LDFD f83 = [AO4] (p13) FMA f101 = f9, f35, f101 } { .mfi (p13) FMA f102 = f9, f50, f102 } ;; (p13) FMA f103 = f9, f51, f103 (p14) FMA f104 = f9, f66, f104 (p14) FMA f105 = f9, f67, f105 (p15) FMA f106 = f9, f81, f106 ;; (p13) FMA f100 = f10, f36, f100 (p13) FMA f101 = f10, f37, f101 (p13) FMA f102 = f10, f52, f102 (p13) FMA f103 = f10, f53, f103 (p14) FMA f104 = f10, f68, f104 (p14) FMA f105 = f10, f69, f105 (p15) FMA f106 = f10, f82, f106 ;; (p13) FMA f100 = f11, f38, f100 (p13) FMA f101 = f11, f39, f101 ;; (p13) FMA f102 = f11, f54, f102 (p13) STFD [YST1] = f100, 1 * SIZE (p13) FMA f103 = f11, f55, f103 ;; (p13) STFD [YST1] = f101, 1 * SIZE (p14) FMA f104 = f11, f70, f104 ;; (p13) STFD [YST1] = f102, 1 * SIZE (p14) FMA f105 = f11, f71, f105 ;; (p13) STFD [YST1] = f103, 1 * SIZE (p15) FMA f106 = f11, f83, f106 ;; (p14) STFD [YST1] = f104, 1 * SIZE ;; (p14) STFD [YST1] = f105, 1 * SIZE ;; (p15) STFD [YST1] = f106, 1 * SIZE ;; .align 16 .L30: { .mmi mov YLD1 = YY mov YST1 = YY tbit.z p6, p0 = N, 1 } ;; { .mib mov AO1 = A mov pr.rot= 0 (p6) br.cond.dpnt .L40 } ;; { .mmi LDFD f8 = [X], INCX (p8) LDFD f106 = [YLD1], 1 * SIZE add AO2 = LDA, A } ;; { .mmi LDFD f9 = [X], INCX (p8) LDFD f80 = [AO1], 1 * SIZE shladd A = LDA, 1, A } ;; adds PREB = RPREFETCH * SIZE, YLD1 FMPY f8 = ALPHA, f8 mov ar.ec= 2 adds RPRE1 = RPREFETCH * SIZE, AO1 FMPY f9 
= ALPHA, f9 shr I = MM, 3 ;; (p8) LDFD f81 = [AO2], 1 * SIZE cmp.eq p6, p0 = 0, I ;; (p8) FMA f106 = f8, f80, f106 adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 tbit.nz p13, p0 = MM, 2 ;; (p8) FMA f106 = f9, f81, f106 cmp.eq p16, p0 = r0, r0 adds I = -1, I ;; { .mib (p8) STFD [YST1] = f106, 1 * SIZE mov ar.lc = I (p6) br.cond.dpnt .L35 } ;; .align 16 .L32: { .mfi (p17) LDFPD f47, f48 = [AO2], 2 * SIZE (p17) FMA f101 = f8, f33, f101 (p16) tbit.nz.unc p14, p15 = I, 0 } { .mmf (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE (p18) STFD [YST1] = f16, 1 * SIZE (p17) FMA f104 = f8, f34, f104 } ;; { .mfi (p16) LDFPD f32, f33 = [AO1], 2 * SIZE (p17) FMA f107 = f8, f35, f107 adds I = -1, I } { .mmf (p14) PREFETCH [RPRE1], 16 * SIZE (p18) STFD [YST1] = f17, 1 * SIZE (p17) FMA f110 = f8, f36, f110 } ;; { .mfi (p16) LDFPD f34, f35 = [AO1], 2 * SIZE (p17) FMA f113 = f8, f37, f113 } { .mmf (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE (p18) STFD [YST1] = f18, 1 * SIZE (p17) FMA f116 = f8, f38, f116 } ;; { .mfi (p16) LDFPD f36, f37 = [AO1], 2 * SIZE (p17) FMA f119 = f8, f39, f119 } { .mmf (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE (p18) STFD [YST1] = f19, 1 * SIZE (p17) FMA f122 = f8, f40, f122 } ;; { .mfi (p16) LDFPD f38, f39 = [AO1], 2 * SIZE (p17) FMA f16 = f9, f41, f101 } { .mmf (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE (p18) STFD [YST1] = f20, 1 * SIZE (p17) FMA f17 = f9, f42, f104 } ;; { .mfi (p16) LDFPD f40, f41 = [AO2], 2 * SIZE (p17) FMA f18 = f9, f43, f107 } { .mmf (p15) PREFETCH [RPRE2], 16 * SIZE (p18) STFD [YST1] = f21, 1 * SIZE (p17) FMA f19 = f9, f44, f110 } ;; { .mfi (p16) LDFPD f42, f43 = [AO2], 2 * SIZE (p17) FMA f20 = f9, f45, f113 } { .mmf (p14) PREFETCH [PREB], 16 * SIZE (p18) STFD [YST1] = f22, 1 * SIZE (p17) FMA f21 = f9, f46, f116 } ;; { .mfi (p16) LDFPD f44, f45 = [AO2], 2 * SIZE (p17) FMA f22 = f9, f47, f119 } { .mfb (p18) STFD [YST1] = f23, 1 * SIZE (p17) FMA f23 = f9, f48, f122 br.ctop.sptk.few .L32 } ;; .align 16 .L35: { .mmi (p13) LDFPD f32, f33 = [AO1], 2 * SIZE (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE tbit.nz p14, p0 = MM, 1 } { .mmi (p18) STFD [YST1] = f16, 1 * SIZE } ;; { .mmi (p13) LDFPD f48, f49 = [AO1], 2 * SIZE (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE tbit.nz p15, p0 = MM, 0 } { .mmi (p18) STFD [YST1] = f17, 1 * SIZE } ;; { .mmi (p14) LDFPD f64, f65 = [AO1], 2 * SIZE (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE } { .mmi (p18) STFD [YST1] = f18, 1 * SIZE } ;; { .mmi (p15) LDFD f80 = [AO1] (p15) LDFD f106 = [YLD1], 1 * SIZE } { .mmi (p18) STFD [YST1] = f19, 1 * SIZE } ;; { .mmi (p13) LDFPD f34, f35 = [AO2], 2 * SIZE (p18) STFD [YST1] = f20, 1 * SIZE } ;; { .mmi (p13) LDFPD f50, f51 = [AO2], 2 * SIZE (p18) STFD [YST1] = f21, 1 * SIZE } ;; { .mmi (p14) LDFPD f66, f67 = [AO2], 2 * SIZE (p18) STFD [YST1] = f22, 1 * SIZE } ;; { .mmi (p15) LDFD f81 = [AO2] (p18) STFD [YST1] = f23, 1 * SIZE } ;; (p13) FMA f100 = f8, f32, f100 (p13) FMA f101 = f8, f33, f101 (p13) FMA f102 = f8, f48, f102 (p13) FMA f103 = f8, f49, f103 (p14) FMA f104 = f8, f64, f104 (p14) FMA f105 = f8, f65, f105 (p15) FMA f106 = f8, f80, f106 ;; (p13) FMA f100 = f9, f34, f100 (p13) FMA f101 = f9, f35, f101 (p13) FMA f102 = f9, f50, f102 (p13) FMA f103 = f9, f51, f103 (p14) FMA f104 = f9, f66, f104 (p14) FMA f105 = f9, f67, f105 (p15) FMA f106 = f9, f81, f106 ;; (p13) STFD [YST1] = f100, 1 * SIZE ;; (p13) STFD [YST1] = f101, 1 * SIZE ;; (p13) STFD [YST1] = f102, 1 * SIZE ;; (p13) STFD [YST1] = f103, 1 * SIZE ;; (p14) STFD [YST1] = f104, 1 * SIZE ;; (p14) STFD [YST1] = f105, 1 * SIZE ;; (p15) STFD [YST1] = f106, 1 * SIZE ;; 
.align 16 .L40: { .mmi mov YLD1 = YY mov YST1 = YY tbit.z p6, p0 = N, 0 } ;; { .mib mov AO1 = A mov pr.rot= 0 (p6) br.cond.dpnt .L990 } ;; { .mmi LDFD f8 = [X], INCX (p8) LDFD f106 = [YLD1], 1 * SIZE adds RPRE1 = RPREFETCH * SIZE, AO1 } ;; { .mii (p8) LDFD f80 = [AO1], 1 * SIZE adds PREB = RPREFETCH * SIZE, YLD1 } ;; FMPY f8 = ALPHA, f8 shr I = MM, 3 ;; (p8) FMA f106 = f8, f80, f106 mov ar.ec= 3 ;; { .mmi cmp.eq p6, p0 = 0, I cmp.eq p16, p0 = r0, r0 tbit.nz p14, p15 = r0, 0 } ;; { .mmi adds YST2 = 4 * SIZE, YST1 adds I = -1, I tbit.nz p13, p0 = MM, 2 } ;; { .mmi (p8) STFD [YST1] = f106, 1 * SIZE (p8) adds YST2 = 1 * SIZE, YST2 } { .mib mov ar.lc = I (p6) br.cond.dpnt .L145 } ;; .align 16 .L42: { .mmf (p19) STFD [YST1] = f16, 1 * SIZE (p19) STFD [YST2] = f20, 1 * SIZE (p18) FMA f16 = f8, f34, f102 } { .mmf (p16) LDFPD f32, f35 = [AO1], 2 * SIZE (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE (p18) FMA f20 = f8, f46, f114 } ;; { .mmf (p19) STFD [YST1] = f17, 1 * SIZE (p19) STFD [YST2] = f21, 1 * SIZE (p18) FMA f17 = f8, f37, f105 } { .mmf (p16) LDFPD f38, f41 = [AO1], 2 * SIZE (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE (p18) FMA f21 = f8, f49, f117 } ;; { .mmf (p19) STFD [YST1] = f18, 1 * SIZE (p19) STFD [YST2] = f22, 1 * SIZE (p18) FMA f18 = f8, f40, f108 } { .mmf (p16) LDFPD f44, f47 = [AO1], 2 * SIZE (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE (p18) FMA f22 = f8, f52, f120 } ;; { .mmf (p19) STFD [YST1] = f19, 5 * SIZE (p19) STFD [YST2] = f23, 5 * SIZE (p18) FMA f19 = f8, f43, f111 } { .mmf (p16) LDFPD f50, f53 = [AO1], 2 * SIZE (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE (p18) FMA f23 = f8, f55, f123 } ;; { .mmi (p14) PREFETCH [RPRE1], 16 * SIZE (p14) PREFETCH [PREB], 16 * SIZE (p16) tbit.nz.unc p14, p15 = I, 0 } { .mib nop __LINE__ (p16) adds I = -1, I br.ctop.sptk.few .L42 } ;; .align 16 .L45: { .mmi (p19) STFD [YST1] = f16, 1 * SIZE (p19) STFD [YST2] = f20, 1 * SIZE tbit.nz p14, p0 = MM, 1 } { .mmi (p13) LDFPD f32, f33 = [AO1], 2 * SIZE (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE } ;; { .mmi (p19) STFD [YST1] = f17, 1 * SIZE (p19) STFD [YST2] = f21, 1 * SIZE tbit.nz p15, p0 = MM, 0 } { .mmi (p13) LDFPD f48, f49 = [AO1], 2 * SIZE (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE } ;; { .mmi (p19) STFD [YST1] = f18, 1 * SIZE (p19) STFD [YST2] = f22, 1 * SIZE } { .mmi (p14) LDFPD f64, f65 = [AO1], 2 * SIZE (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE } ;; { .mmi (p19) STFD [YST1] = f19, 5 * SIZE (p19) STFD [YST2] = f23, 5 * SIZE } { .mmi (p15) LDFD f80 = [AO1] (p15) LDFD f106 = [YLD1], 1 * SIZE } ;; (p13) FMA f100 = f8, f32, f100 (p13) FMA f101 = f8, f33, f101 (p13) FMA f102 = f8, f48, f102 (p13) FMA f103 = f8, f49, f103 ;; (p13) STFD [YST1] = f100, 1 * SIZE (p14) FMA f104 = f8, f64, f104 ;; (p13) STFD [YST1] = f101, 1 * SIZE (p14) FMA f105 = f8, f65, f105 ;; (p13) STFD [YST1] = f102, 1 * SIZE (p15) FMA f106 = f8, f80, f106 ;; (p13) STFD [YST1] = f103, 1 * SIZE ;; (p14) STFD [YST1] = f104, 1 * SIZE ;; (p14) STFD [YST1] = f105, 1 * SIZE ;; (p15) STFD [YST1] = f106, 1 * SIZE br .L990 ;; .align 16 .L100: shr J = N, 3 ;; cmp.eq p6, p0 = r0, J (p6) br.cond.dpnt .L120 ;; .align 16 .L111: mov YLD1 = YY mov YST1 = YY ;; LDFD f8 = [X], INCX ;; LDFD f9 = [X], INCX ;; LDFD f10 = [X], INCX ;; LDFD f11 = [X], INCX ;; LDFD f12 = [X], INCX ;; LDFD f13 = [X], INCX ;; LDFD f14 = [X], INCX ;; LDFD f15 = [X], INCX ;; FMPY f8 = ALPHA, f8 FMPY f9 = ALPHA, f9 FMPY f10 = ALPHA, f10 FMPY f11 = ALPHA, f11 FMPY f12 = ALPHA, f12 FMPY f13 = ALPHA, f13 FMPY f14 = ALPHA, f14 FMPY f15 = ALPHA, f15 ;; mov AO1 = A add AO2 = LDA, A ;; shladd AO3 
= LDA, 1, A shladd AO4 = LDA, 1, AO2 ;; shladd AO5 = LDA, 1, AO3 shladd AO6 = LDA, 1, AO4 ;; shladd AO7 = LDA, 1, AO5 shladd AO8 = LDA, 1, AO6 shladd A = LDA, 3, A ;; ;; adds PREB = RPREFETCH * SIZE, YLD1 adds RPRE1 = RPREFETCH * SIZE, AO1 adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 adds RPRE3 = RPREFETCH * SIZE, AO3 adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 adds RPRE5 = RPREFETCH * SIZE, AO5 adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6 adds RPRE7 = RPREFETCH * SIZE, AO7 adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8 (p8) LDFD f80 = [AO1], 1 * SIZE (p8) LDFD f81 = [AO2], 1 * SIZE (p8) LDFD f82 = [AO3], 1 * SIZE (p8) LDFD f83 = [AO4], 1 * SIZE (p8) LDFD f84 = [AO5], 1 * SIZE (p8) LDFD f85 = [AO6], 1 * SIZE (p8) LDFD f86 = [AO7], 1 * SIZE (p8) LDFD f87 = [AO8], 1 * SIZE (p8) LDFD f106 = [YLD1], 1 * SIZE ;; (p8) FMPY f32 = f8, f80 (p8) FMPY f33 = f9, f81 (p8) FMPY f34 = f10, f82 (p8) FMA f35 = f11, f83, f106 ;; (p8) FMA f32 = f12, f84, f32 (p8) FMA f33 = f13, f85, f33 (p8) FMA f34 = f14, f86, f34 (p8) FMA f35 = f15, f87, f35 ;; (p8) FADD f32 = f32, f33 (p8) FADD f34 = f34, f35 ;; (p8) FADD f32 = f32, f34 ;; (p8) STFD [YST1] = f32, 1 * SIZE shr I = MM, 3 mov pr.rot= 0 ;; cmp.eq p6, p0 = 0, I cmp.eq p16, p0 = r0, r0 ;; adds I = -1, I tbit.nz p13, p0 = MM, 2 ;; mov ar.lc = I mov ar.ec= 2 (p6) br.cond.dpnt .L115 ;; .align 16 .L112: { .mfi (p17) LDFD f96 = [AO8], 1 * SIZE (p17) FMA f101 = f8, f33, f101 (p16) tbit.nz.unc p14, p15 = I, 0 } { .mfi (p17) FMA f104 = f8, f34, f104 } ;; { .mfi (p16) LDFPD f32, f33 = [AO1], 2 * SIZE (p17) FMA f107 = f8, f35, f107 } { .mfi (p14) PREFETCH [RPRE1], 16 * SIZE (p17) FMA f110 = f8, f36, f110 } ;; { .mfi (p16) LDFPD f34, f35 = [AO1], 2 * SIZE (p17) FMA f113 = f8, f37, f113 } { .mfi (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE (p17) FMA f116 = f8, f38, f116 } ;; { .mfi (p16) LDFPD f36, f37 = [AO1], 2 * SIZE (p17) FMA f119 = f8, f39, f119 } { .mfi (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE (p17) FMA f122 = f8, f40, f122 } ;; { .mfi (p16) LDFPD f38, f39 = [AO1], 2 * SIZE (p17) FMA f101 = f9, f41, f101 } { .mmf (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE (p16) LDFD f40 = [AO2], 1 * SIZE (p17) FMA f104 = f9, f42, f104 } ;; { .mfi (p16) LDFPD f41, f42 = [AO2], 2 * SIZE (p17) FMA f107 = f9, f43, f107 } { .mfi (p15) PREFETCH [RPRE2], 16 * SIZE (p17) FMA f110 = f9, f44, f110 } ;; { .mfi (p16) LDFPD f43, f44 = [AO2], 2 * SIZE (p17) FMA f113 = f9, f45, f113 } { .mfi (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE (p17) FMA f116 = f9, f46, f116 } ;; { .mfi (p16) LDFPD f45, f46 = [AO2], 2 * SIZE (p17) FMA f119 = f9, f47, f119 } { .mfi (p18) STFD [YST1] = f16, 1 * SIZE (p17) FMA f122 = f9, f48, f122 } ;; { .mfi (p16) LDFD f47 = [AO2], 1 * SIZE (p17) FMA f101 = f10, f49, f101 } { .mfi (p18) STFD [YST1] = f17, 1 * SIZE (p17) FMA f104 = f10, f50, f104 } ;; { .mfi (p16) LDFPD f48, f49 = [AO3], 2 * SIZE (p17) FMA f107 = f10, f51, f107 } { .mfi (p14) PREFETCH [RPRE3], 16 * SIZE (p17) FMA f110 = f10, f52, f110 } ;; { .mfi (p16) LDFPD f50, f51 = [AO3], 2 * SIZE (p17) FMA f113 = f10, f53, f113 } { .mfi (p17) FMA f116 = f10, f54, f116 } ;; { .mfi (p16) LDFPD f52, f53 = [AO3], 2 * SIZE (p17) FMA f119 = f10, f55, f119 } { .mfi (p18) STFD [YST1] = f18, 1 * SIZE (p17) FMA f122 = f10, f56, f122 } ;; { .mfi (p16) LDFPD f54, f55 = [AO3], 2 * SIZE (p17) FMA f101 = f11, f57, f101 } { .mmf (p18) STFD [YST1] = f19, 1 * SIZE (p16) LDFD f56 = [AO4], 1 * SIZE (p17) FMA f104 = f11, f58, f104 } ;; { .mfi (p16) LDFPD f57, f58 = [AO4], 2 * SIZE (p17) FMA f107 = f11, f59, f107 } { .mfi (p15) PREFETCH [RPRE4], 16 * SIZE (p17) FMA 
f110 = f11, f60, f110 } ;; { .mfi (p16) LDFPD f59, f60 = [AO4], 2 * SIZE (p17) FMA f113 = f11, f61, f113 } { .mfi (p17) FMA f116 = f11, f62, f116 } ;; { .mfi (p16) LDFPD f61, f62 = [AO4], 2 * SIZE (p17) FMA f119 = f11, f63, f119 } { .mfi (p17) FMA f122 = f11, f64, f122 } ;; { .mfi (p16) LDFD f63 = [AO4], 1 * SIZE (p17) FMA f101 = f12, f65, f101 } { .mfi (p18) STFD [YST1] = f20, 1 * SIZE (p17) FMA f104 = f12, f66, f104 } ;; { .mfi (p16) LDFPD f64, f65 = [AO5], 2 * SIZE (p17) FMA f107 = f12, f67, f107 } { .mfi (p18) STFD [YST1] = f21, 1 * SIZE (p17) FMA f110 = f12, f68, f110 } ;; { .mfi (p16) LDFPD f66, f67 = [AO5], 2 * SIZE (p17) FMA f113 = f12, f69, f113 } { .mfi (p14) PREFETCH [RPRE5], 16 * SIZE (p17) FMA f116 = f12, f70, f116 } ;; { .mfi (p16) LDFPD f68, f69 = [AO5], 2 * SIZE (p17) FMA f119 = f12, f71, f119 } { .mfi (p18) STFD [YST1] = f22, 1 * SIZE (p17) FMA f122 = f12, f72, f122 } ;; { .mfi (p16) LDFPD f70, f71 = [AO5], 2 * SIZE (p17) FMA f101 = f13, f73, f101 } { .mmf (p18) STFD [YST1] = f23, 1 * SIZE (p16) LDFD f72 = [AO6], 1 * SIZE (p17) FMA f104 = f13, f74, f104 } ;; { .mfi (p16) LDFPD f73, f74 = [AO6], 2 * SIZE (p17) FMA f107 = f13, f75, f107 } { .mfi (p15) PREFETCH [RPRE6], 16 * SIZE (p17) FMA f110 = f13, f76, f110 } ;; { .mfi (p16) LDFPD f75, f76 = [AO6], 2 * SIZE (p17) FMA f113 = f13, f77, f113 } { .mfi (p17) FMA f116 = f13, f78, f116 } ;; { .mfi (p16) LDFPD f77, f78 = [AO6], 2 * SIZE (p17) FMA f119 = f13, f79, f119 } { .mfi (p17) FMA f122 = f13, f80, f122 } ;; { .mfi (p16) LDFD f79 = [AO6], 1 * SIZE (p17) FMA f101 = f14, f81, f101 } { .mfi (p17) FMA f104 = f14, f82, f104 } ;; { .mfi (p16) LDFPD f80, f81 = [AO7], 2 * SIZE (p17) FMA f107 = f14, f83, f107 } { .mfi (p14) PREFETCH [RPRE7], 16 * SIZE (p17) FMA f110 = f14, f84, f110 } ;; { .mfi (p16) LDFPD f82, f83 = [AO7], 2 * SIZE (p17) FMA f113 = f14, f85, f113 } { .mfi (p17) FMA f116 = f14, f86, f116 } ;; { .mfi (p16) LDFPD f84, f85 = [AO7], 2 * SIZE (p17) FMA f119 = f14, f87, f119 } { .mfi (p17) FMA f122 = f14, f88, f122 } ;; { .mfi (p16) LDFPD f86, f87 = [AO7], 2 * SIZE (p17) FMA f16 = f15, f89, f101 } { .mfi (p16) LDFD f88 = [AO8], 1 * SIZE (p17) FMA f17 = f15, f90, f104 } ;; { .mfi (p16) LDFPD f89, f90 = [AO8], 2 * SIZE (p17) FMA f18 = f15, f91, f107 } { .mfi (p15) PREFETCH [RPRE8], 16 * SIZE (p17) FMA f19 = f15, f92, f110 } ;; { .mfi (p16) LDFPD f91, f92 = [AO8], 2 * SIZE (p17) FMA f20 = f15, f93, f113 } { .mfi (p14) lfetch.excl.nt2 [PREB], 16 * SIZE (p17) FMA f21 = f15, f94, f116 } ;; { .mfi (p16) LDFPD f93, f94 = [AO8], 2 * SIZE (p17) FMA f22 = f15, f95, f119 } { .mfb (p16) adds I = -1, I (p17) FMA f23 = f15, f96, f122 br.ctop.sptk.few .L112 } ;; .align 16 .L115: { .mmi (p13) LDFPD f32, f33 = [AO1], 2 * SIZE (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE tbit.nz p14, p0 = MM, 1 } { .mmi (p18) STFD [YST1] = f16, 1 * SIZE cmp.lt p6, p0 = 1, J adds J = -1, J } ;; { .mmi (p13) LDFPD f48, f49 = [AO1], 2 * SIZE (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE tbit.nz p15, p0 = MM, 0 } { .mmi (p18) STFD [YST1] = f17, 1 * SIZE nop __LINE__ nop __LINE__ } ;; { .mmi (p14) LDFPD f64, f65 = [AO1], 2 * SIZE (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE nop __LINE__ } { .mmi (p18) STFD [YST1] = f18, 1 * SIZE (p13) LDFD f34 = [AO2], 1 * SIZE nop __LINE__ } ;; { .mmi (p13) LDFPD f35, f50 = [AO2], 2 * SIZE (p13) LDFPD f36, f37 = [AO3], 2 * SIZE nop __LINE__ } { .mmi (p18) STFD [YST1] = f19, 1 * SIZE nop __LINE__ nop __LINE__ } ;; { .mmi (p15) LDFD f80 = [AO1] (p15) LDFD f106 = [YLD1], 1 * SIZE nop __LINE__ } { .mmi (p18) STFD [YST1] = f20, 1 * SIZE nop 
__LINE__ nop __LINE__ } ;; { .mmi (p13) LDFD f51 = [AO2], 1 * SIZE (p13) LDFPD f52, f53 = [AO3], 2 * SIZE nop __LINE__ } { .mmi (p18) STFD [YST1] = f21, 1 * SIZE nop __LINE__ nop __LINE__ } ;; { .mmi (p14) LDFD f66 = [AO2], 1 * SIZE (p14) LDFPD f68, f69 = [AO3], 2 * SIZE nop __LINE__ } { .mmi (p18) STFD [YST1] = f22, 1 * SIZE nop __LINE__ nop __LINE__ } ;; { .mmi (p14) LDFD f67 = [AO2], 1 * SIZE (p15) LDFD f82 = [AO3] nop __LINE__ } { .mmi (p18) STFD [YST1] = f23, 1 * SIZE nop __LINE__ } ;; { .mmf (p15) LDFD f81 = [AO2] (p13) LDFD f38 = [AO4], 1 * SIZE (p13) FMA f100 = f8, f32, f100 } { .mfi (p13) LDFPD f40, f41 = [AO5], 2 * SIZE (p13) FMA f101 = f8, f33, f101 nop __LINE__ } ;; { .mfi (p13) LDFPD f39, f54 = [AO4], 2 * SIZE (p13) FMA f102 = f8, f48, f102 nop __LINE__ } { .mfi (p13) LDFPD f56, f57 = [AO5], 2 * SIZE (p13) FMA f103 = f8, f49, f103 nop __LINE__ } ;; { .mfi (p13) LDFD f55 = [AO4], 1 * SIZE (p14) FMA f104 = f8, f64, f104 nop __LINE__ } { .mfi (p14) LDFPD f72, f73 = [AO5], 2 * SIZE (p14) FMA f105 = f8, f65, f105 nop __LINE__ } ;; { .mfi (p14) LDFD f70 = [AO4], 1 * SIZE (p15) FMA f106 = f8, f80, f106 nop __LINE__ } { .mmi (p15) LDFD f84 = [AO5] (p13) LDFD f42 = [AO6], 1 * SIZE nop __LINE__ } ;; { .mmf (p13) LDFPD f43, f58 = [AO6], 2 * SIZE (p14) LDFD f71 = [AO4], 1 * SIZE (p13) FMA f100 = f9, f34, f100 } { .mfi (p13) LDFPD f44, f45 = [AO7], 2 * SIZE (p13) FMA f101 = f9, f35, f101 nop __LINE__ } ;; { .mmf (p13) LDFD f59 = [AO6], 1 * SIZE (p15) LDFD f83 = [AO4] (p13) FMA f102 = f9, f50, f102 } { .mfi (p13) LDFPD f60, f61 = [AO7], 2 * SIZE (p13) FMA f103 = f9, f51, f103 nop __LINE__ } ;; { .mfi (p14) LDFD f74 = [AO6], 1 * SIZE (p14) FMA f104 = f9, f66, f104 nop __LINE__ } { .mfi (p14) LDFPD f76, f77 = [AO7], 2 * SIZE (p14) FMA f105 = f9, f67, f105 nop __LINE__ } ;; { .mfi (p14) LDFD f75 = [AO6], 1 * SIZE (p15) FMA f106 = f9, f81, f106 nop __LINE__ } { .mmi (p15) LDFD f86 = [AO7] (p13) LDFD f46 = [AO8], 1 * SIZE nop __LINE__ } ;; { .mmf (p13) LDFPD f47, f62 = [AO8], 2 * SIZE (p15) LDFD f85 = [AO6] (p13) FMA f100 = f10, f36, f100 } { .mfi (p13) FMA f101 = f10, f37, f101 nop __LINE__ } ;; { .mfi (p13) LDFD f63 = [AO8], 1 * SIZE (p13) FMA f102 = f10, f52, f102 nop __LINE__ } { .mfi (p13) FMA f103 = f10, f53, f103 nop __LINE__ } ;; { .mfi (p14) LDFD f78 = [AO8], 1 * SIZE (p14) FMA f104 = f10, f68, f104 nop __LINE__ } { .mfi (p14) FMA f105 = f10, f69, f105 nop __LINE__ } ;; { .mfi (p14) LDFD f79 = [AO8], 1 * SIZE (p15) FMA f106 = f10, f82, f106 nop __LINE__ } ;; (p15) LDFD f87 = [AO8] (p13) FMA f100 = f11, f38, f100 (p13) FMA f101 = f11, f39, f101 (p13) FMA f102 = f11, f54, f102 (p13) FMA f103 = f11, f55, f103 (p14) FMA f104 = f11, f70, f104 (p14) FMA f105 = f11, f71, f105 (p15) FMA f106 = f11, f83, f106 ;; (p13) FMA f100 = f12, f40, f100 (p13) FMA f101 = f12, f41, f101 (p13) FMA f102 = f12, f56, f102 (p13) FMA f103 = f12, f57, f103 (p14) FMA f104 = f12, f72, f104 (p14) FMA f105 = f12, f73, f105 (p15) FMA f106 = f12, f84, f106 ;; (p13) FMA f100 = f13, f42, f100 (p13) FMA f101 = f13, f43, f101 (p13) FMA f102 = f13, f58, f102 (p13) FMA f103 = f13, f59, f103 (p14) FMA f104 = f13, f74, f104 (p14) FMA f105 = f13, f75, f105 (p15) FMA f106 = f13, f85, f106 ;; (p13) FMA f100 = f14, f44, f100 (p13) FMA f101 = f14, f45, f101 (p13) FMA f102 = f14, f60, f102 (p13) FMA f103 = f14, f61, f103 (p14) FMA f104 = f14, f76, f104 (p14) FMA f105 = f14, f77, f105 (p15) FMA f106 = f14, f86, f106 ;; (p13) FMA f100 = f15, f46, f100 (p13) FMA f101 = f15, f47, f101 (p13) FMA f102 = f15, f62, f102 (p13) FMA f103 = f15, 
f63, f103 (p14) FMA f104 = f15, f78, f104 (p14) FMA f105 = f15, f79, f105 (p15) FMA f106 = f15, f87, f106 ;; (p13) STFD [YST1] = f100, 1 * SIZE ;; (p13) STFD [YST1] = f101, 1 * SIZE ;; (p13) STFD [YST1] = f102, 1 * SIZE ;; (p13) STFD [YST1] = f103, 1 * SIZE ;; (p14) STFD [YST1] = f104, 1 * SIZE ;; (p14) STFD [YST1] = f105, 1 * SIZE ;; (p15) STFD [YST1] = f106, 1 * SIZE (p6) br.cond.dptk .L111 ;; .align 16 .L120: { .mmi mov YLD1 = YY mov YST1 = YY tbit.z p6, p0 = N, 2 } ;; { .mib mov AO1 = A mov pr.rot= 0 (p6) br.cond.dpnt .L130 } ;; { .mmi LDFD f8 = [X], INCX (p8) LDFD f106 = [YLD1], 1 * SIZE add AO2 = LDA, A } ;; { .mmi LDFD f9 = [X], INCX (p8) LDFD f80 = [AO1], 1 * SIZE shladd AO4 = LDA, 1, AO2 } ;; { .mmi LDFD f10 = [X], INCX (p8) LDFD f81 = [AO2], 1 * SIZE shladd AO3 = LDA, 1, A } ;; { .mmi LDFD f11 = [X], INCX (p8) LDFD f82 = [AO3], 1 * SIZE } ;; { .mfi (p8) LDFD f83 = [AO4], 1 * SIZE FMPY f8 = ALPHA, f8 adds PREB = RPREFETCH * SIZE, YLD1 } { .mfi adds RPRE1 = RPREFETCH * SIZE, AO1 FMPY f9 = ALPHA, f9 adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 } ;; FMPY f10 = ALPHA, f10 shladd A = LDA, 2, A FMPY f11 = ALPHA, f11 ;; { .mfi adds RPRE3 = RPREFETCH * SIZE, AO3 (p8) FMA f106 = f8, f80, f106 mov ar.ec= 2 } ;; adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 (p8) FMA f106 = f9, f81, f106 shr I = MM, 3 ;; { .mmf cmp.eq p6, p0 = 0, I cmp.eq p16, p0 = r0, r0 (p8) FMA f106 = f10, f82, f106 } ;; { .mfi adds I = -1, I (p8) FMA f106 = f11, f83, f106 tbit.nz p13, p0 = MM, 2 } ;; { .mib (p8) STFD [YST1] = f106, 1 * SIZE mov ar.lc = I (p6) br.cond.dpnt .L125 } ;; .align 16 .L122: { .mfi (p17) LDFD f64 = [AO4], 1 * SIZE (p17) FMA f101 = f8, f33, f101 (p16) tbit.nz.unc p14, p15 = I, 0 } { .mfi (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE (p17) FMA f104 = f8, f34, f104 } ;; { .mfi (p16) LDFPD f32, f33 = [AO1], 2 * SIZE (p17) FMA f107 = f8, f35, f107 (p16) adds I = -1, I } { .mfi (p14) PREFETCH [RPRE1], 16 * SIZE (p17) FMA f110 = f8, f36, f110 } ;; { .mfi (p16) LDFPD f34, f35 = [AO1], 2 * SIZE (p17) FMA f113 = f8, f37, f113 } { .mfi (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE (p17) FMA f116 = f8, f38, f116 } ;; { .mfi (p16) LDFPD f36, f37 = [AO1], 2 * SIZE (p17) FMA f119 = f8, f39, f119 } { .mfi (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE (p17) FMA f122 = f8, f40, f122 } ;; { .mfi (p16) LDFPD f38, f39 = [AO1], 2 * SIZE (p17) FMA f101 = f9, f41, f101 } { .mmf (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE (p16) LDFD f40 = [AO2], 1 * SIZE (p17) FMA f104 = f9, f42, f104 } ;; { .mmf (p16) LDFPD f41, f42 = [AO2], 2 * SIZE (p15) PREFETCH [RPRE2], 16 * SIZE (p17) FMA f107 = f9, f43, f107 } { .mfi (p18) STFD [YST1] = f16, 1 * SIZE (p17) FMA f110 = f9, f44, f110 } ;; { .mfi (p16) LDFPD f43, f44 = [AO2], 2 * SIZE (p17) FMA f113 = f9, f45, f113 } { .mfi (p18) STFD [YST1] = f17, 1 * SIZE (p17) FMA f116 = f9, f46, f116 } ;; { .mfi (p16) LDFPD f45, f46 = [AO2], 2 * SIZE (p17) FMA f119 = f9, f47, f119 } { .mfi (p18) STFD [YST1] = f18, 1 * SIZE (p17) FMA f122 = f9, f48, f122 } ;; { .mfi (p16) LDFD f47 = [AO2], 1 * SIZE (p17) FMA f101 = f10, f49, f101 } { .mfi (p14) lfetch.excl.nt2 [PREB], 16 * SIZE (p17) FMA f104 = f10, f50, f104 } ;; { .mfi (p16) LDFPD f48, f49 = [AO3], 2 * SIZE (p17) FMA f107 = f10, f51, f107 } { .mfi (p14) PREFETCH [RPRE3], 16 * SIZE (p17) FMA f110 = f10, f52, f110 } ;; { .mfi (p16) LDFPD f50, f51 = [AO3], 2 * SIZE (p17) FMA f113 = f10, f53, f113 } { .mfi (p18) STFD [YST1] = f19, 1 * SIZE (p17) FMA f116 = f10, f54, f116 } ;; { .mfi (p16) LDFPD f52, f53 = [AO3], 2 * SIZE (p17) FMA f119 = f10, f55, f119 } { .mfi (p18) STFD 
[YST1] = f20, 1 * SIZE (p17) FMA f122 = f10, f56, f122 } ;; { .mfi (p16) LDFPD f54, f55 = [AO3], 2 * SIZE (p17) FMA f16 = f11, f57, f101 } { .mmf (p15) PREFETCH [RPRE4], 16 * SIZE (p16) LDFD f56 = [AO4], 1 * SIZE (p17) FMA f17 = f11, f58, f104 } ;; { .mfi (p16) LDFPD f57, f58 = [AO4], 2 * SIZE (p17) FMA f18 = f11, f59, f107 } { .mfi (p18) STFD [YST1] = f21, 1 * SIZE (p17) FMA f19 = f11, f60, f110 } ;; { .mfi (p16) LDFPD f59, f60 = [AO4], 2 * SIZE (p17) FMA f20 = f11, f61, f113 } { .mfi (p18) STFD [YST1] = f22, 1 * SIZE (p17) FMA f21 = f11, f62, f116 } ;; { .mfi (p16) LDFPD f61, f62 = [AO4], 2 * SIZE (p17) FMA f22 = f11, f63, f119 } { .mfb (p18) STFD [YST1] = f23, 1 * SIZE (p17) FMA f23 = f11, f64, f122 br.ctop.sptk.few .L122 } ;; .align 16 .L125: { .mmi (p13) LDFPD f32, f33 = [AO1], 2 * SIZE (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE tbit.nz p14, p0 = MM, 1 } { .mmi (p18) STFD [YST1] = f16, 1 * SIZE } ;; { .mmi (p13) LDFPD f48, f49 = [AO1], 2 * SIZE (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE tbit.nz p15, p0 = MM, 0 } { .mmi (p18) STFD [YST1] = f17, 1 * SIZE } ;; { .mmi (p14) LDFPD f64, f65 = [AO1], 2 * SIZE (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE } { .mmi (p18) STFD [YST1] = f18, 1 * SIZE } ;; { .mmi (p18) STFD [YST1] = f19, 1 * SIZE (p15) LDFD f80 = [AO1] } { .mmi (p15) LDFD f106 = [YLD1], 1 * SIZE (p13) LDFD f34 = [AO2], 1 * SIZE } ;; { .mmi (p13) LDFPD f35, f50 = [AO2], 2 * SIZE (p13) LDFPD f36, f37 = [AO3], 2 * SIZE } { .mmi (p18) STFD [YST1] = f20, 1 * SIZE } ;; { .mmi (p13) LDFD f51 = [AO2], 1 * SIZE (p13) LDFPD f52, f53 = [AO3], 2 * SIZE } { .mmi (p18) STFD [YST1] = f21, 1 * SIZE } ;; { .mmi (p14) LDFD f66 = [AO2], 1 * SIZE (p14) LDFPD f68, f69 = [AO3], 2 * SIZE } { .mmi (p18) STFD [YST1] = f22, 1 * SIZE } ;; { .mmf (p18) STFD [YST1] = f23, 1 * SIZE (p14) LDFD f67 = [AO2], 1 * SIZE (p13) FMA f100 = f8, f32, f100 } { .mmf (p15) LDFD f82 = [AO3] (p13) LDFD f38 = [AO4], 1 * SIZE (p13) FMA f101 = f8, f33, f101 } ;; ;; { .mmf (p13) LDFPD f39, f54 = [AO4], 2 * SIZE (p15) LDFD f81 = [AO2] (p13) FMA f102 = f8, f48, f102 } { .mfi (p13) FMA f103 = f8, f49, f103 } ;; { .mfi (p13) LDFD f55 = [AO4], 1 * SIZE (p14) FMA f104 = f8, f64, f104 } { .mfi (p14) FMA f105 = f8, f65, f105 } ;; { .mfi (p14) LDFD f70 = [AO4], 1 * SIZE (p15) FMA f106 = f8, f80, f106 } { .mfi (p13) FMA f100 = f9, f34, f100 } ;; { .mfi (p14) LDFD f71 = [AO4], 1 * SIZE (p13) FMA f101 = f9, f35, f101 } { .mfi (p13) FMA f102 = f9, f50, f102 } ;; (p15) LDFD f83 = [AO4] (p13) FMA f103 = f9, f51, f103 (p14) FMA f104 = f9, f66, f104 (p14) FMA f105 = f9, f67, f105 (p15) FMA f106 = f9, f81, f106 ;; (p13) FMA f100 = f10, f36, f100 (p13) FMA f101 = f10, f37, f101 (p13) FMA f102 = f10, f52, f102 (p13) FMA f103 = f10, f53, f103 (p14) FMA f104 = f10, f68, f104 (p14) FMA f105 = f10, f69, f105 (p15) FMA f106 = f10, f82, f106 ;; (p13) FMA f100 = f11, f38, f100 (p13) FMA f101 = f11, f39, f101 ;; (p13) FMA f102 = f11, f54, f102 (p13) STFD [YST1] = f100, 1 * SIZE (p13) FMA f103 = f11, f55, f103 ;; (p13) STFD [YST1] = f101, 1 * SIZE (p14) FMA f104 = f11, f70, f104 ;; (p13) STFD [YST1] = f102, 1 * SIZE (p14) FMA f105 = f11, f71, f105 ;; (p13) STFD [YST1] = f103, 1 * SIZE (p15) FMA f106 = f11, f83, f106 ;; (p14) STFD [YST1] = f104, 1 * SIZE ;; (p14) STFD [YST1] = f105, 1 * SIZE ;; (p15) STFD [YST1] = f106, 1 * SIZE ;; .align 16 .L130: { .mmi mov YLD1 = YY mov YST1 = YY tbit.z p6, p0 = N, 1 } ;; { .mib mov AO1 = A mov pr.rot= 0 (p6) br.cond.dpnt .L140 } ;; { .mmi LDFD f8 = [X], INCX (p8) LDFD f106 = [YLD1], 1 * SIZE add AO2 = LDA, A } ;; { .mmi LDFD 
f9 = [X], INCX (p8) LDFD f80 = [AO1], 1 * SIZE shladd A = LDA, 1, A } ;; adds PREB = RPREFETCH * SIZE, YLD1 FMPY f8 = ALPHA, f8 mov ar.ec= 2 adds RPRE1 = RPREFETCH * SIZE, AO1 FMPY f9 = ALPHA, f9 shr I = MM, 3 ;; (p8) LDFD f81 = [AO2], 1 * SIZE cmp.eq p6, p0 = 0, I ;; (p8) FMA f106 = f8, f80, f106 adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 tbit.nz p13, p0 = MM, 2 ;; (p8) FMA f106 = f9, f81, f106 cmp.eq p16, p0 = r0, r0 adds I = -1, I ;; { .mib (p8) STFD [YST1] = f106, 1 * SIZE mov ar.lc = I (p6) br.cond.dpnt .L135 } ;; .align 16 .L132: { .mfi (p17) LDFD f48 = [AO2], 1 * SIZE (p17) FMA f101 = f8, f33, f101 (p16) tbit.nz.unc p14, p15 = I, 0 } { .mmf (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE (p18) STFD [YST1] = f16, 1 * SIZE (p17) FMA f104 = f8, f34, f104 } ;; { .mfi (p16) LDFPD f32, f33 = [AO1], 2 * SIZE (p17) FMA f107 = f8, f35, f107 adds I = -1, I } { .mmf (p14) PREFETCH [RPRE1], 16 * SIZE (p18) STFD [YST1] = f17, 1 * SIZE (p17) FMA f110 = f8, f36, f110 } ;; { .mfi (p16) LDFPD f34, f35 = [AO1], 2 * SIZE (p17) FMA f113 = f8, f37, f113 } { .mmf (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE (p18) STFD [YST1] = f18, 1 * SIZE (p17) FMA f116 = f8, f38, f116 } ;; { .mfi (p16) LDFPD f36, f37 = [AO1], 2 * SIZE (p17) FMA f119 = f8, f39, f119 } { .mmf (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE (p18) STFD [YST1] = f19, 1 * SIZE (p17) FMA f122 = f8, f40, f122 } ;; { .mmf (p16) LDFPD f38, f39 = [AO1], 2 * SIZE (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE (p17) FMA f16 = f9, f41, f101 } { .mmf (p18) STFD [YST1] = f20, 1 * SIZE (p16) LDFD f40 = [AO2], 1 * SIZE (p17) FMA f17 = f9, f42, f104 } ;; { .mfi (p16) LDFPD f41, f42 = [AO2], 2 * SIZE (p17) FMA f18 = f9, f43, f107 } { .mmf (p15) PREFETCH [RPRE2], 16 * SIZE (p18) STFD [YST1] = f21, 1 * SIZE (p17) FMA f19 = f9, f44, f110 } ;; { .mfi (p16) LDFPD f43, f44 = [AO2], 2 * SIZE (p17) FMA f20 = f9, f45, f113 } { .mmf (p14) PREFETCH [PREB], 16 * SIZE (p18) STFD [YST1] = f22, 1 * SIZE (p17) FMA f21 = f9, f46, f116 } ;; { .mfi (p16) LDFPD f45, f46 = [AO2], 2 * SIZE (p17) FMA f22 = f9, f47, f119 } { .mfb (p18) STFD [YST1] = f23, 1 * SIZE (p17) FMA f23 = f9, f48, f122 br.ctop.sptk.few .L132 } ;; .align 16 .L135: { .mmi (p13) LDFPD f32, f33 = [AO1], 2 * SIZE (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE tbit.nz p14, p0 = MM, 1 } { .mmi (p18) STFD [YST1] = f16, 1 * SIZE } ;; { .mmi (p13) LDFPD f48, f49 = [AO1], 2 * SIZE (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE tbit.nz p15, p0 = MM, 0 } { .mmi (p18) STFD [YST1] = f17, 1 * SIZE } ;; { .mmi (p14) LDFPD f64, f65 = [AO1], 2 * SIZE (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE } { .mmi (p18) STFD [YST1] = f18, 1 * SIZE } ;; { .mmi (p15) LDFD f80 = [AO1] (p15) LDFD f106 = [YLD1], 1 * SIZE } { .mmi (p18) STFD [YST1] = f19, 1 * SIZE } ;; { .mmi (p13) LDFD f34 = [AO2], 1 * SIZE (p18) STFD [YST1] = f20, 1 * SIZE } ;; { .mmi (p13) LDFD f35 = [AO2], 1 * SIZE (p18) STFD [YST1] = f21, 1 * SIZE } ;; { .mmi (p13) LDFD f50 = [AO2], 1 * SIZE (p18) STFD [YST1] = f22, 1 * SIZE } ;; { .mmi (p13) LDFD f51 = [AO2], 1 * SIZE (p18) STFD [YST1] = f23, 1 * SIZE } ;; (p14) LDFD f66 = [AO2], 1 * SIZE (p13) FMA f100 = f8, f32, f100 ;; (p14) LDFD f67 = [AO2], 1 * SIZE (p13) FMA f101 = f8, f33, f101 ;; (p15) LDFD f81 = [AO2] (p13) FMA f102 = f8, f48, f102 (p13) FMA f103 = f8, f49, f103 (p14) FMA f104 = f8, f64, f104 (p14) FMA f105 = f8, f65, f105 (p15) FMA f106 = f8, f80, f106 ;; (p13) FMA f100 = f9, f34, f100 (p13) FMA f101 = f9, f35, f101 (p13) FMA f102 = f9, f50, f102 (p13) FMA f103 = f9, f51, f103 (p14) FMA f104 = f9, f66, f104 (p14) FMA f105 = f9, f67, f105 
(p15) FMA f106 = f9, f81, f106 ;; (p13) STFD [YST1] = f100, 1 * SIZE ;; (p13) STFD [YST1] = f101, 1 * SIZE ;; (p13) STFD [YST1] = f102, 1 * SIZE ;; (p13) STFD [YST1] = f103, 1 * SIZE ;; (p14) STFD [YST1] = f104, 1 * SIZE ;; (p14) STFD [YST1] = f105, 1 * SIZE ;; (p15) STFD [YST1] = f106, 1 * SIZE ;; .align 16 .L140: { .mmi mov YLD1 = YY mov YST1 = YY tbit.z p6, p0 = N, 0 } ;; { .mib mov AO1 = A mov pr.rot= 0 (p6) br.cond.dpnt .L990 } ;; { .mmi LDFD f8 = [X], INCX (p8) LDFD f106 = [YLD1], 1 * SIZE adds RPRE1 = RPREFETCH * SIZE, AO1 } ;; { .mmi (p8) LDFD f80 = [AO1], 1 * SIZE adds PREB = RPREFETCH * SIZE, YLD1 } ;; FMPY f8 = ALPHA, f8 shr I = MM, 3 ;; (p8) FMA f106 = f8, f80, f106 mov ar.ec= 3 ;; { .mmi cmp.eq p6, p0 = 0, I cmp.eq p16, p0 = r0, r0 tbit.nz p14, p15 = r0, 0 } ;; { .mmi adds YST2 = 4 * SIZE, YST1 adds I = -1, I tbit.nz p13, p0 = MM, 2 } ;; { .mmi (p8) STFD [YST1] = f106, 1 * SIZE (p8) adds YST2 = 1 * SIZE, YST2 } { .mib mov ar.lc = I (p6) br.cond.dpnt .L145 } ;; .align 16 .L142: { .mmf (p19) STFD [YST1] = f16, 1 * SIZE (p19) STFD [YST2] = f20, 1 * SIZE (p18) FMA f16 = f8, f34, f102 } { .mmf (p16) LDFPD f32, f35 = [AO1], 2 * SIZE (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE (p18) FMA f20 = f8, f46, f114 } ;; { .mmf (p19) STFD [YST1] = f17, 1 * SIZE (p19) STFD [YST2] = f21, 1 * SIZE (p18) FMA f17 = f8, f37, f105 } { .mmf (p16) LDFPD f38, f41 = [AO1], 2 * SIZE (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE (p18) FMA f21 = f8, f49, f117 } ;; { .mmf (p19) STFD [YST1] = f18, 1 * SIZE (p19) STFD [YST2] = f22, 1 * SIZE (p18) FMA f18 = f8, f40, f108 } { .mmf (p16) LDFPD f44, f47 = [AO1], 2 * SIZE (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE (p18) FMA f22 = f8, f52, f120 } ;; { .mmf (p19) STFD [YST1] = f19, 5 * SIZE (p19) STFD [YST2] = f23, 5 * SIZE (p18) FMA f19 = f8, f43, f111 } { .mmf (p16) LDFPD f50, f53 = [AO1], 2 * SIZE (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE (p18) FMA f23 = f8, f55, f123 } ;; { .mmi (p14) PREFETCH [RPRE1], 16 * SIZE (p14) PREFETCH [PREB], 16 * SIZE (p16) tbit.nz.unc p14, p15 = I, 0 } { .mib nop __LINE__ (p16) adds I = -1, I br.ctop.sptk.few .L142 } ;; .align 16 .L145: { .mmi (p19) STFD [YST1] = f16, 1 * SIZE (p19) STFD [YST2] = f20, 1 * SIZE tbit.nz p14, p0 = MM, 1 } { .mmi (p13) LDFPD f32, f33 = [AO1], 2 * SIZE (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE } ;; { .mmi (p19) STFD [YST1] = f17, 1 * SIZE (p19) STFD [YST2] = f21, 1 * SIZE tbit.nz p15, p0 = MM, 0 } { .mmi (p13) LDFPD f48, f49 = [AO1], 2 * SIZE (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE } ;; { .mmi (p19) STFD [YST1] = f18, 1 * SIZE (p19) STFD [YST2] = f22, 1 * SIZE } { .mmi (p14) LDFPD f64, f65 = [AO1], 2 * SIZE (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE } ;; { .mmi (p19) STFD [YST1] = f19, 5 * SIZE (p19) STFD [YST2] = f23, 5 * SIZE } { .mmi (p15) LDFD f80 = [AO1] (p15) LDFD f106 = [YLD1], 1 * SIZE } ;; (p13) FMA f100 = f8, f32, f100 (p13) FMA f101 = f8, f33, f101 (p13) FMA f102 = f8, f48, f102 (p13) FMA f103 = f8, f49, f103 (p14) FMA f104 = f8, f64, f104 (p14) FMA f105 = f8, f65, f105 (p15) FMA f106 = f8, f80, f106 ;; (p13) STFD [YST1] = f100, 1 * SIZE ;; (p13) STFD [YST1] = f101, 1 * SIZE ;; (p13) STFD [YST1] = f102, 1 * SIZE ;; (p13) STFD [YST1] = f103, 1 * SIZE ;; (p14) STFD [YST1] = f104, 1 * SIZE ;; (p14) STFD [YST1] = f105, 1 * SIZE ;; (p15) STFD [YST1] = f106, 1 * SIZE ;; .align 16 .L990: { .mmi mov YLD1 = YY mov YST1 = Y mov pr.rot= 0 } { .mib mov YST2 = Y shr J = M, 3 (p10) br.cond.dptk .L999 } ;; { .mmi cmp.eq p6, p0 = r0, J adds J = -1, J mov ar.ec = 4 } { .mmi cmp.eq p16, p0 = r0, r0 nop __LINE__ tbit.nz 
p13, p0 = M, 2 } ;; { .mib nop __LINE__ mov ar.lc = J (p6) br.cond.dpnt .L995 } ;; .L992: { .mfi (p19) STFD [YST2] = f35 (p18) FADD f34 = f34, f66 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f64 = [YLD1], 1 * SIZE (p16) LDFD f32 = [YST1], INCY } ;; { .mfi (p19) STFD [YST2] = f39 (p18) FADD f38 = f38, f70 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f36 = [YST1], INCY (p16) LDFD f68 = [YLD1], 1 * SIZE } ;; { .mfi (p19) STFD [YST2] = f43 (p18) FADD f42 = f42, f74 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f72 = [YLD1], 1 * SIZE (p16) LDFD f40 = [YST1], INCY } ;; { .mfi (p19) STFD [YST2] = f47 (p18) FADD f46 = f46, f78 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f76 = [YLD1], 1 * SIZE (p16) LDFD f44 = [YST1], INCY } ;; { .mfi (p19) STFD [YST2] = f51 (p18) FADD f50 = f50, f82 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f80 = [YLD1], 1 * SIZE (p16) LDFD f48 = [YST1], INCY } ;; { .mfi (p19) STFD [YST2] = f55 (p18) FADD f54 = f54, f86 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f84 = [YLD1], 1 * SIZE (p16) LDFD f52 = [YST1], INCY } ;; { .mfi (p19) STFD [YST2] = f59 (p18) FADD f58 = f58, f90 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f88 = [YLD1], 1 * SIZE (p16) LDFD f56 = [YST1], INCY } ;; { .mfi (p19) STFD [YST2] = f63 (p18) FADD f62 = f62, f94 (p19) add YST2 = YST2, INCY } { .mmb (p16) LDFD f92 = [YLD1], 1 * SIZE (p16) LDFD f60 = [YST1], INCY br.ctop.sptk.few .L992 } ;; .L995: (p13) LDFD f32 = [YST1], INCY (p13) LDFD f40 = [YLD1], 1 * SIZE tbit.nz p14, p0 = M, 1 ;; (p13) LDFD f33 = [YST1], INCY (p13) LDFD f41 = [YLD1], 1 * SIZE tbit.nz p15, p0 = M, 0 ;; (p13) LDFD f34 = [YST1], INCY (p13) LDFD f42 = [YLD1], 1 * SIZE ;; (p13) LDFD f35 = [YST1], INCY (p13) LDFD f43 = [YLD1], 1 * SIZE ;; (p14) LDFD f36 = [YST1], INCY (p14) LDFD f44 = [YLD1], 1 * SIZE ;; (p14) LDFD f37 = [YST1], INCY (p14) LDFD f45 = [YLD1], 1 * SIZE ;; (p15) LDFD f38 = [YST1], INCY (p15) LDFD f46 = [YLD1], 1 * SIZE ;; (p13) FADD f32 = f32, f40 (p13) FADD f33 = f33, f41 (p13) FADD f34 = f34, f42 (p13) FADD f35 = f35, f43 (p14) FADD f36 = f36, f44 (p14) FADD f37 = f37, f45 (p15) FADD f38 = f38, f46 ;; (p13) STFD [YST2] = f32 (p13) add YST2 = YST2, INCY ;; (p13) STFD [YST2] = f33 (p13) add YST2 = YST2, INCY ;; (p13) STFD [YST2] = f34 (p13) add YST2 = YST2, INCY ;; (p13) STFD [YST2] = f35 (p13) add YST2 = YST2, INCY ;; (p14) STFD [YST2] = f36 (p14) add YST2 = YST2, INCY ;; (p14) STFD [YST2] = f37 (p14) add YST2 = YST2, INCY ;; (p15) STFD [YST2] = f38 ;; .L999: mov r8 = r0 adds r9 = 1 * 16, SP ;; ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 mov ar.lc = ARLC ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 mov pr = PR, -1 ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 mov ar.pfs = ARPFS ;; ldf.fill f22 = [SP], 32 ldf.fill f23 = [r9] br.ret.sptk.many b0 ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/gemv_t.S000066400000000000000000001633761313527062700166140ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define P 4096 #define SP r12 #define M r32 #define N r33 #define A r36 #define LDA r37 #define X r38 #define INCX r39 #define Y r34 #define INCY r35 #define BUFFER r11 #define MIN_M r14 #define I r15 #define J r16 #define IS r17 #define AO1 r18 #define AO2 r19 #define AO3 r20 #define AO4 r21 #define AO5 r22 #define AO6 r23 #define AO7 r24 #define AO8 r25 #define BO r26 #define LDAP r27 #define RPRE1 loc0 #define RPRE2 loc1 #define RPRE3 loc2 #define RPRE4 loc3 #define RPRE5 loc4 #define RPRE6 loc5 #define RPRE7 loc6 #define RPRE8 loc7 #define AO21 loc8 #define AO41 loc9 #define AO61 loc10 #define AO81 loc11 #define PREB r8 #define WPRE r9 #define OFFSET PREB #define CO r10 #define ARLC r29 #define PR r30 #define ARPFS r31 #ifdef DOUBLE #define RPREFETCH (16 * 3 + 8) #else #define RPREFETCH (16 * 3 + 16) #endif #define PREFETCH lfetch.nt1 #define ALPHA f6 PROLOGUE .prologue PROFCODE { .mmi .save ar.pfs, ARPFS alloc ARPFS = ar.pfs, 8, 16, 8, 0 setf.sig f11 = LDA mov ARLC = ar.lc } { .mmi adds r15 = 24, SP adds r16 = 32, SP adds r14 = 16, SP } ;; { .mmi setf.sig f10 = N ld8 Y = [r14] mov PR = pr } { .mmi ld8 INCY = [r15] adds r8 = -8 * 16, SP adds r9 = -7 * 16, SP } ;; { .mmi stf.spill [r8] = f16, 32 stf.spill [r9] = f17, 32 adds SP = -8 * 16, SP } ;; { .mmf stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 mov ALPHA = f8 } ;; { .mmi stf.spill [r8] = f20, 32 stf.spill [r9] = f21, 32 mov IS = 0 } ;; { .mmf stf.spill [r8] = f22 stf.spill [r9] = f23 xmpy.l f10 = f10, f11 } .body ;; ;; { .mmi ld8 BUFFER = [r16] cmp.ge p7, p0 = r0, M cmp.ge p6, p0 = r0, N } ;; { .mmi shladd INCX = INCX, BASE_SHIFT, r0 shladd LDA = LDA, BASE_SHIFT, r0 shladd INCY = INCY, BASE_SHIFT, r0 } ;; { .mmi getf.sig LDAP = f10 mov r2 = P tbit.nz p8, p0 = A, BASE_SHIFT } { .mmi nop __LINE__ nop __LINE__ tbit.nz p9, p0 = LDA, BASE_SHIFT } ;; { .mbb sub LDAP = r2, LDAP (p7) br.cond.dpnt .L999 (p6) br.cond.dpnt .L999 } .align 16 ;; .LIs_loop: { .mmi sub MIN_M = M, IS (p8) LDFD f32 = [X], INCX mov pr.rot= 0 } { .mmi mov AO1 = BUFFER adds AO2 = 4 * SIZE, BUFFER } ;; cmp.le p6, p0 = r2, MIN_M ;; (p6) mov MIN_M = 
P ;; (p8) adds MIN_M = -1, MIN_M ;; { .mmi shladd OFFSET = INCX, 2, INCX shladd BO = INCX, 2, X shr I = MIN_M, 3 } ;; { .mmi adds I = -1, I cmp.eq p16, p0 = r0, r0 mov ar.ec= 5 } ;; { .mmi (p8) STFD [AO1] = f32, 2 * SIZE (p8) adds AO2 = 6 * SIZE, BUFFER mov ar.lc = I } { .mib cmp.gt p6, p0 = 0, I tbit.nz p13, p0 = MIN_M, 2 (p6) br.cond.dpnt .L05 } ;; .align 16 .L01: (p20) STFD [AO1] = f36, SIZE (p20) STFD [AO2] = f56, SIZE (p16) LDFD f32 = [X], INCX (p16) LDFD f52 = [BO], INCX ;; (p20) STFD [AO1] = f41, SIZE (p20) STFD [AO2] = f61, SIZE (p16) LDFD f37 = [X], INCX (p16) LDFD f57 = [BO], INCX ;; (p20) STFD [AO1] = f46, SIZE (p20) STFD [AO2] = f66, SIZE (p16) LDFD f42 = [X], INCX (p16) LDFD f62 = [BO], INCX ;; (p20) STFD [AO1] = f51, 5 * SIZE (p20) STFD [AO2] = f71, 5 * SIZE (p16) LDFD f47 = [X], OFFSET (p16) LDFD f67 = [BO], OFFSET br.ctop.sptk.few .L01 ;; .align 16 .L05: (p13) LDFD f32 = [X], INCX tbit.nz p14, p0 = MIN_M, 1 ;; (p13) LDFD f33 = [X], INCX tbit.nz p15, p0 = MIN_M, 0 ;; (p13) LDFD f34 = [X], INCX ;; (p13) LDFD f35 = [X], INCX ;; (p14) LDFD f36 = [X], INCX ;; (p13) STFD [AO1] = f32, SIZE (p14) LDFD f37 = [X], INCX ;; (p13) STFD [AO1] = f33, SIZE (p15) LDFD f38 = [X], INCX ;; (p13) STFD [AO1] = f34, SIZE ;; (p13) STFD [AO1] = f35, SIZE ;; (p14) STFD [AO1] = f36, SIZE ;; (p14) STFD [AO1] = f37, SIZE ;; (p15) STFD [AO1] = f38, SIZE (p9) br.cond.dpnt .L100 ;; .align 16 .L10: { .mmi mov CO = Y nop __LINE__ shr J = N, 3 } ;; { .mib nop __LINE__ cmp.eq p6, p0 = r0, J (p6) br.cond.dpnt .L20 } ;; .align 16 .L11: { .mfi mov AO1 = A mov f8 = f0 mov pr.rot= 0 } { .mfi add AO2 = LDA, A mov f10 = f0 shr I = MIN_M, 4 } ;; { .mmf shladd AO3 = LDA, 1, A shladd AO4 = LDA, 1, AO2 mov f12 = f0 } { .mmf (p8) LDFD f32 = [AO1], SIZE (p8) LDFD f33 = [AO2], SIZE mov f14 = f0 } ;; { .mmf shladd AO5 = LDA, 1, AO3 shladd AO6 = LDA, 1, AO4 mov f16 = f0 } { .mmf (p8) LDFD f34 = [AO3], SIZE (p8) LDFD f35 = [AO4], SIZE mov f18 = f0 } ;; { .mmf shladd AO7 = LDA, 1, AO5 shladd AO8 = LDA, 1, AO6 mov f20 = f0 } { .mmf (p8) LDFD f36 = [AO5], SIZE (p8) LDFD f37 = [AO6], SIZE mov f22 = f0 } ;; { .mfi (p8) LDFD f38 = [AO7], SIZE mov f9 = f0 mov ar.ec= 2 } { .mmf (p8) LDFD f39 = [AO8], SIZE mov BO = BUFFER mov f11 = f0 } ;; { .mmf (p8) LDFD f40 = [BO], 2 * SIZE cmp.eq p6, p0 = 0, I mov f13 = f0 } { .mmf shladd A = LDA, 3, A cmp.eq p16, p0 = r0, r0 mov f15 = f0 } ;; { .mmf add I = I, I nop __LINE__ mov f17 = f0 } { .mmf adds RPRE1 = RPREFETCH * SIZE, AO1 adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 mov f19 = f0 } ;; { .mmf adds I = -1, I nop __LINE__ mov f21 = f0 } { .mmf adds RPRE3 = RPREFETCH * SIZE, AO3 adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 mov f23 = f0 } ;; { .mmf nop __LINE__ nop __LINE__ (p8) FMPY f8 = f40, f32 } { .mmf adds RPRE5 = RPREFETCH * SIZE, AO5 adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6 (p8) FMPY f10 = f40, f33 } ;; { .mmf nop __LINE__ nop __LINE__ (p8) FMPY f12 = f40, f34 } { .mmf adds RPRE7 = RPREFETCH * SIZE, AO7 adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8 (p8) FMPY f14 = f40, f35 } ;; { .mfi nop __LINE__ (p8) FMPY f16 = f40, f36 mov ar.lc = I } { .mmf adds WPRE = 8 * SIZE, CO adds PREB = RPREFETCH * SIZE, BO (p8) FMPY f18 = f40, f37 } ;; { .mmf lfetch.excl.nt1 [WPRE] nop __LINE__ (p8) FMPY f20 = f40, f38 } { .mfb nop __LINE__ (p8) FMPY f22 = f40, f39 (p6) br.cond.dpnt .L15 } ;; .align 16 .L12: { .mfi (p17) LDFPD f95, f96 = [AO8], 2 * SIZE (p17) FMA f8 = f104, f33, f8 (p16) tbit.nz.unc p14, p15 = I, 0 } { .mfi (p17) LDFPD f110, f111 = [BO], 2 * SIZE (p17) FMA f9 = f105, f34, f9 nop __LINE__ } ;; { .mfi 
(p16) LDFPD f32, f33 = [AO1], 2 * SIZE (p17) FMA f10 = f104, f35, f10 nop __LINE__ } { .mfi (p14) PREFETCH [RPRE1], 16 * SIZE (p17) FMA f11 = f105, f36, f11 nop __LINE__ } ;; { .mfi (p16) LDFPD f34, f35 = [AO2], 2 * SIZE (p17) FMA f12 = f104, f37, f12 nop __LINE__ } { .mfi (p15) PREFETCH [RPRE2], 16 * SIZE (p17) FMA f13 = f105, f38, f13 nop __LINE__ } ;; { .mfi (p16) LDFPD f36, f37 = [AO3], 2 * SIZE (p17) FMA f14 = f104, f39, f14 nop __LINE__ } { .mfi (p14) PREFETCH [RPRE3], 16 * SIZE (p17) FMA f15 = f105, f40, f15 nop __LINE__ } ;; { .mfi (p16) LDFPD f38, f39 = [AO4], 2 * SIZE (p17) FMA f16 = f104, f41, f16 nop __LINE__ } { .mfi (p15) PREFETCH [RPRE4], 16 * SIZE (p17) FMA f17 = f105, f42, f17 nop __LINE__ } ;; { .mfi (p16) LDFPD f40, f41 = [AO5], 2 * SIZE (p17) FMA f18 = f104, f43, f18 nop __LINE__ } { .mfi (p14) PREFETCH [RPRE5], 16 * SIZE (p17) FMA f19 = f105, f44, f19 nop __LINE__ } ;; { .mfi (p16) LDFPD f42, f43 = [AO6], 2 * SIZE (p17) FMA f20 = f104, f45, f20 nop __LINE__ } { .mfi (p15) PREFETCH [RPRE6], 16 * SIZE (p17) FMA f21 = f105, f46, f21 nop __LINE__ } ;; { .mfi (p16) LDFPD f44, f45 = [AO7], 2 * SIZE (p17) FMA f22 = f104, f47, f22 nop __LINE__ } { .mfi (p14) PREFETCH [RPRE7], 16 * SIZE (p17) FMA f23 = f105, f48, f23 nop __LINE__ } ;; { .mfi (p16) LDFPD f46, f47 = [AO8], 2 * SIZE (p17) FMA f8 = f106, f49, f8 nop __LINE__ } { .mfi (p15) PREFETCH [RPRE8], 16 * SIZE (p17) FMA f9 = f107, f50, f9 nop __LINE__ } ;; { .mfi (p16) LDFPD f48, f49 = [AO1], 2 * SIZE (p17) FMA f10 = f106, f51, f10 nop __LINE__ } { .mfi (p14) PREFETCH [PREB], 16 * SIZE (p17) FMA f11 = f107, f52, f11 nop __LINE__ } ;; { .mfi (p16) LDFPD f50, f51 = [AO2], 2 * SIZE (p17) FMA f12 = f106, f53, f12 nop __LINE__ } { .mfi (p16) LDFPD f103, f104 = [BO], 2 * SIZE (p17) FMA f13 = f107, f54, f13 nop __LINE__ } ;; { .mfi (p16) LDFPD f52, f53 = [AO3], 2 * SIZE (p17) FMA f14 = f106, f55, f14 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f15 = f107, f56, f15 nop __LINE__ } ;; { .mfi (p16) LDFPD f54, f55 = [AO4], 2 * SIZE (p17) FMA f16 = f106, f57, f16 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f17 = f107, f58, f17 nop __LINE__ } ;; { .mfi (p16) LDFPD f56, f57 = [AO5], 2 * SIZE (p17) FMA f18 = f106, f59, f18 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f19 = f107, f60, f19 nop __LINE__ } ;; { .mfi (p16) LDFPD f58, f59 = [AO6], 2 * SIZE (p17) FMA f20 = f106, f61, f20 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f21 = f107, f62, f21 nop __LINE__ } ;; { .mfi (p16) LDFPD f60, f61 = [AO7], 2 * SIZE (p17) FMA f22 = f106, f63, f22 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f23 = f107, f64, f23 nop __LINE__ } ;; { .mfi (p16) LDFPD f62, f63 = [AO8], 2 * SIZE (p17) FMA f8 = f108, f65, f8 nop __LINE__ } { .mfi (p16) LDFPD f105, f106 = [BO], 2 * SIZE (p17) FMA f9 = f109, f66, f9 nop __LINE__ } ;; { .mfi (p16) LDFPD f64, f65 = [AO1], 2 * SIZE (p17) FMA f10 = f108, f67, f10 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f11 = f109, f68, f11 nop __LINE__ } ;; { .mfi (p16) LDFPD f66, f67 = [AO2], 2 * SIZE (p17) FMA f12 = f108, f69, f12 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f13 = f109, f70, f13 nop __LINE__ } ;; { .mfi (p16) LDFPD f68, f69 = [AO3], 2 * SIZE (p17) FMA f14 = f108, f71, f14 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f15 = f109, f72, f15 nop __LINE__ } ;; { .mfi (p16) LDFPD f70, f71 = [AO4], 2 * SIZE (p17) FMA f16 = f108, f73, f16 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f17 = f109, f74, f17 nop __LINE__ } ;; { .mfi (p16) LDFPD f72, f73 = [AO5], 2 * SIZE (p17) FMA f18 = f108, f75, f18 nop __LINE__ } { .mfi nop 
__LINE__ (p17) FMA f19 = f109, f76, f19 nop __LINE__ } ;; { .mfi (p16) LDFPD f74, f75 = [AO6], 2 * SIZE (p17) FMA f20 = f108, f77, f20 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f21 = f109, f78, f21 nop __LINE__ } ;; { .mfi (p16) LDFPD f76, f77 = [AO7], 2 * SIZE (p17) FMA f22 = f108, f79, f22 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f23 = f109, f80, f23 nop __LINE__ } ;; { .mfi (p16) LDFPD f107, f108 = [BO], 2 * SIZE (p17) FMA f8 = f110, f81, f8 nop __LINE__ } { .mfi (p16) LDFPD f78, f79 = [AO8], 2 * SIZE (p17) FMA f9 = f111, f82, f9 nop __LINE__ } ;; { .mfi (p16) LDFPD f80, f81 = [AO1], 2 * SIZE (p17) FMA f10 = f110, f83, f10 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f11 = f111, f84, f11 nop __LINE__ } ;; { .mfi (p16) LDFPD f82, f83 = [AO2], 2 * SIZE (p17) FMA f12 = f110, f85, f12 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f13 = f111, f86, f13 nop __LINE__ } ;; { .mfi (p16) LDFPD f84, f85 = [AO3], 2 * SIZE (p17) FMA f14 = f110, f87, f14 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f15 = f111, f88, f15 nop __LINE__ } ;; { .mfi (p16) LDFPD f86, f87 = [AO4], 2 * SIZE (p17) FMA f16 = f110, f89, f16 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f17 = f111, f90, f17 nop __LINE__ } ;; { .mfi (p16) LDFPD f88, f89 = [AO5], 2 * SIZE (p17) FMA f18 = f110, f91, f18 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f19 = f111, f92, f19 nop __LINE__ } ;; { .mfi (p16) LDFPD f90, f91 = [AO6], 2 * SIZE (p17) FMA f20 = f110, f93, f20 nop __LINE__ } { .mfi nop __LINE__ (p17) FMA f21 = f111, f94, f21 nop __LINE__ } ;; { .mfi (p16) LDFPD f92, f93 = [AO7], 2 * SIZE (p17) FMA f22 = f110, f95, f22 nop __LINE__ } { .mfb adds I = -1, I (p17) FMA f23 = f111, f96, f23 br.ctop.sptk.few .L12 } ;; .align 16 .L15: and I = 15, MIN_M mov pr.rot= 0 ;; cmp.eq p6, p0 = 0, I cmp.eq p16, p15 = r0, r0 ;; adds I = 1, I ;; shr I = I, 1 ;; adds I = -1, I ;; mov ar.lc = I mov ar.ec= 3 and I = 15, MIN_M (p6) br.cond.dpnt .L18 ;; .align 16 .L16: { .mfi (p16) LDFPD f104, f107 = [BO], 2 * SIZE (p18) FMA f8 = f106, f34, f8 nop __LINE__ } { .mfi (p16) LDFPD f32, f35 = [AO1], 2 * SIZE (p15) FMA f9 = f109, f37, f9 nop __LINE__ } ;; { .mfi (p16) LDFPD f38, f41 = [AO2], 2 * SIZE (p18) FMA f10 = f106, f40, f10 nop __LINE__ } { .mfi nop __LINE__ (p15) FMA f11 = f109, f43, f11 nop __LINE__ } ;; { .mfi (p16) LDFPD f44, f47 = [AO3], 2 * SIZE (p18) FMA f12 = f106, f46, f12 nop __LINE__ } { .mfi nop __LINE__ (p15) FMA f13 = f109, f49, f13 nop __LINE__ } ;; { .mfi (p16) LDFPD f50, f53 = [AO4], 2 * SIZE (p18) FMA f14 = f106, f52, f14 nop __LINE__ } { .mfi nop __LINE__ (p15) FMA f15 = f109, f55, f15 nop __LINE__ } ;; { .mfi (p16) LDFPD f56, f59 = [AO5], 2 * SIZE (p18) FMA f16 = f106, f58, f16 nop __LINE__ } { .mfi nop __LINE__ (p15) FMA f17 = f109, f61, f17 nop __LINE__ } ;; { .mfi (p16) LDFPD f62, f65 = [AO6], 2 * SIZE (p18) FMA f18 = f106, f64, f18 nop __LINE__ } { .mfi nop __LINE__ (p15) FMA f19 = f109, f67, f19 (p17) adds I = -2, I } ;; { .mfi (p16) LDFPD f68, f71 = [AO7], 2 * SIZE (p18) FMA f20 = f106, f70, f20 nop __LINE__ } { .mfi nop __LINE__ (p15) FMA f21 = f109, f73, f21 nop __LINE__ } ;; { .mfi (p16) LDFPD f74, f77 = [AO8], 2 * SIZE (p15) FMA f23 = f109, f79, f23 (p17) cmp.ne.unc p15, p0 = -1, I } { .mfb nop __LINE__ (p18) FMA f22 = f106, f76, f22 br.ctop.sptk.few .L16 } ;; .L18: { .mmf mov AO1 = CO LDFD f32 = [CO], INCY FADD f8 = f8, f9 } ;; { .mmf LDFD f33 = [CO], INCY nop __LINE__ FADD f10 = f10, f11 } ;; { .mmf LDFD f34 = [CO], INCY nop __LINE__ FADD f12 = f12, f13 } ;; { .mmf LDFD f35 = [CO], INCY nop __LINE__ FADD f14 = f14, f15 
} ;; { .mmf LDFD f36 = [CO], INCY nop __LINE__ FADD f16 = f16, f17 } ;; { .mmf LDFD f37 = [CO], INCY nop __LINE__ FADD f18 = f18, f19 } ;; { .mmf LDFD f38 = [CO], INCY nop __LINE__ FADD f20 = f20, f21 } ;; { .mmf LDFD f39 = [CO], INCY nop __LINE__ FADD f22 = f22, f23 } ;; { .mmf nop __LINE__ nop __LINE__ FMA f32 = ALPHA, f8, f32 } { .mmf nop __LINE__ nop __LINE__ FMA f33 = ALPHA, f10, f33 } { .mmf nop __LINE__ nop __LINE__ FMA f34 = ALPHA, f12, f34 } { .mmf nop __LINE__ nop __LINE__ FMA f35 = ALPHA, f14, f35 } ;; { .mmf STFD [AO1] = f32 add AO1 = AO1, INCY FMA f36 = ALPHA, f16, f36 } ;; { .mmf STFD [AO1] = f33 add AO1 = AO1, INCY FMA f37 = ALPHA, f18, f37 } ;; { .mmf STFD [AO1] = f34 add AO1 = AO1, INCY FMA f38 = ALPHA, f20, f38 } ;; { .mmf STFD [AO1] = f35 add AO1 = AO1, INCY FMA f39 = ALPHA, f22, f39 } ;; { .mmi STFD [AO1] = f36 add AO1 = AO1, INCY adds J = -1, J } ;; { .mmi STFD [AO1] = f37 add AO1 = AO1, INCY nop __LINE__ } ;; { .mmi STFD [AO1] = f38 add AO1 = AO1, INCY cmp4.lt p6, p0 = 0, J } ;; { .mib STFD [AO1] = f39 add AO1 = AO1, INCY (p6) br.cond.dptk .L11 } ;; .align 16 .L20: { .mfi mov AO1 = A mov f8 = f0 mov pr.rot= 0 } { .mfi add AO2 = LDA, A mov f10 = f0 tbit.z p6, p0 = N, 2 } ;; { .mfi shladd AO3 = LDA, 1, A mov f12 = f0 shr I = MIN_M, 4 } { .mfb shladd AO4 = LDA, 1, AO2 mov f14 = f0 (p6) br.cond.dpnt .L30 } ;; { .mmf (p8) LDFD f32 = [AO1], SIZE (p8) LDFD f33 = [AO2], SIZE mov f9 = f0 } { .mmf mov BO = BUFFER shladd A = LDA, 2, A mov f11 = f0 } ;; { .mmf (p8) LDFD f40 = [BO], 2 * SIZE cmp.eq p6, p0 = 0, I mov f13 = f0 } { .mmf (p8) LDFD f34 = [AO3], SIZE (p8) LDFD f35 = [AO4], SIZE mov f15 = f0 } ;; { .mmi adds RPRE1 = RPREFETCH * SIZE, AO1 adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 mov ar.ec= 2 } { .mmi cmp.eq p16, p0 = r0, r0 add I = I, I } ;; { .mmf adds WPRE = 4 * SIZE, CO adds PREB = RPREFETCH * SIZE, BO (p8) FMPY f8 = f40, f32 } { .mmf adds RPRE3 = RPREFETCH * SIZE, AO3 adds I = -1, I (p8) FMPY f10 = f40, f33 } ;; { .mfi lfetch.excl.nt1 [WPRE] (p8) FMPY f12 = f40, f34 mov ar.lc = I } { .mfb adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 (p8) FMPY f14 = f40, f35 (p6) br.cond.dpnt .L25 } ;; .align 16 .L22: { .mmf (p17) LDFPD f87, f88 = [AO4], 2 * SIZE (p17) LDFPD f110, f111 = [BO], 2 * SIZE (p17) FMA f8 = f104, f33, f8 } { .mfi nop __LINE__ (p17) FMA f9 = f105, f34, f9 (p16) tbit.nz.unc p14, p15 = I, 0 } ;; { .mmf (p14) PREFETCH [RPRE1], 16 * SIZE (p16) LDFPD f32, f33 = [AO1], 2 * SIZE (p17) FMA f10 = f104, f35, f10 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f11 = f105, f36, f11 } ;; { .mmf (p15) PREFETCH [RPRE2], 16 * SIZE (p16) LDFPD f34, f35 = [AO2], 2 * SIZE (p17) FMA f12 = f104, f37, f12 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f13 = f105, f38, f13 } ;; { .mmf (p14) PREFETCH [RPRE3], 16 * SIZE (p16) LDFPD f36, f37 = [AO3], 2 * SIZE (p17) FMA f14 = f104, f39, f14 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f15 = f105, f40, f15 } ;; { .mmf (p15) PREFETCH [RPRE4], 16 * SIZE (p16) LDFPD f38, f39 = [AO4], 2 * SIZE (p17) FMA f8 = f106, f49, f8 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f9 = f107, f50, f9 } ;; { .mmf (p14) PREFETCH [PREB], 16 * SIZE (p16) LDFPD f48, f49 = [AO1], 2 * SIZE (p17) FMA f10 = f106, f51, f10 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f11 = f107, f52, f11 } ;; { .mmf (p16) LDFPD f50, f51 = [AO2], 2 * SIZE (p16) LDFPD f103, f104 = [BO], 2 * SIZE (p17) FMA f12 = f106, f53, f12 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f13 = f107, f54, f13 } ;; { .mmf (p16) LDFPD f52, f53 = [AO3], 2 * SIZE nop __LINE__ (p17) FMA f14 = f106, f55, f14 } { .mmf 
nop __LINE__ nop __LINE__ (p17) FMA f15 = f107, f56, f15 } ;; { .mmf (p16) LDFPD f54, f55 = [AO4], 2 * SIZE (p16) LDFPD f105, f106 = [BO], 2 * SIZE (p17) FMA f8 = f108, f65, f8 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f9 = f109, f66, f9 } ;; { .mmf (p16) LDFPD f64, f65 = [AO1], 2 * SIZE nop __LINE__ (p17) FMA f10 = f108, f67, f10 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f11 = f109, f68, f11 } ;; { .mmf (p16) LDFPD f66, f67 = [AO2], 2 * SIZE nop __LINE__ (p17) FMA f12 = f108, f69, f12 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f13 = f109, f70, f13 } ;; { .mmf (p16) LDFPD f68, f69 = [AO3], 2 * SIZE nop __LINE__ (p17) FMA f14 = f108, f71, f14 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f15 = f109, f72, f15 } ;; { .mmf (p16) LDFPD f70, f71 = [AO4], 2 * SIZE (p16) LDFPD f107, f108 = [BO], 2 * SIZE (p17) FMA f8 = f110, f81, f8 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f9 = f111, f82, f9 } ;; { .mmf (p16) LDFPD f80, f81 = [AO1], 2 * SIZE nop __LINE__ (p17) FMA f10 = f110, f83, f10 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f11 = f111, f84, f11 } ;; { .mmf (p16) LDFPD f82, f83 = [AO2], 2 * SIZE nop __LINE__ (p17) FMA f12 = f110, f85, f12 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f13 = f111, f86, f13 } ;; { .mmf (p16) LDFPD f84, f85 = [AO3], 2 * SIZE nop __LINE__ (p17) FMA f14 = f110, f87, f14 } { .mfb adds I = -1, I (p17) FMA f15 = f111, f88, f15 br.ctop.sptk.few .L22 } ;; .align 16 .L25: and I = 15, MIN_M mov pr.rot= 0 ;; cmp.eq p6, p0 = 0, I cmp.eq p16, p15 = r0, r0 ;; adds I = 1, I ;; shr I = I, 1 ;; adds I = -1, I ;; mov ar.lc = I mov ar.ec= 3 and I = 15, MIN_M (p6) br.cond.dpnt .L28 ;; .align 16 .L26: { .mmf (p16) LDFPD f104, f107 = [BO], 2 * SIZE (p16) LDFPD f32, f35 = [AO1], 2 * SIZE (p18) FMA f8 = f106, f34, f8 } { .mmf nop __LINE__ nop __LINE__ (p15) FMA f9 = f109, f37, f9 } ;; { .mmf (p16) LDFPD f38, f41 = [AO2], 2 * SIZE nop __LINE__ (p18) FMA f10 = f106, f40, f10 } { .mmf nop __LINE__ nop __LINE__ (p15) FMA f11 = f109, f43, f11 } ;; { .mmf (p16) LDFPD f44, f47 = [AO3], 2 * SIZE nop __LINE__ (p18) FMA f12 = f106, f46, f12 } { .mmf nop __LINE__ (p17) adds I = -2, I (p15) FMA f13 = f109, f49, f13 } ;; { .mmf (p16) LDFPD f50, f53 = [AO4], 2 * SIZE nop __LINE__ (p15) FMA f15 = f109, f55, f15 } { .mfb (p17) cmp.ne.unc p15, p0 = -1, I (p18) FMA f14 = f106, f52, f14 br.ctop.sptk.few .L26 } ;; .L28: { .mmf mov AO1 = CO LDFD f32 = [CO], INCY FADD f8 = f8, f9 } ;; { .mmf LDFD f33 = [CO], INCY nop __LINE__ FADD f10 = f10, f11 } ;; { .mmf LDFD f34 = [CO], INCY nop __LINE__ FADD f12 = f12, f13 } ;; { .mmf LDFD f35 = [CO], INCY nop __LINE__ FADD f14 = f14, f15 } ;; { .mmf nop __LINE__ nop __LINE__ FMA f32 = ALPHA, f8, f32 } { .mmf nop __LINE__ nop __LINE__ FMA f33 = ALPHA, f10, f33 } { .mmf nop __LINE__ nop __LINE__ FMA f34 = ALPHA, f12, f34 } { .mmf nop __LINE__ nop __LINE__ FMA f35 = ALPHA, f14, f35 } ;; { .mmf STFD [AO1] = f32 add AO1 = AO1, INCY } ;; { .mmf STFD [AO1] = f33 add AO1 = AO1, INCY } ;; { .mmf STFD [AO1] = f34 add AO1 = AO1, INCY } ;; { .mmf STFD [AO1] = f35 add AO1 = AO1, INCY } ;; .align 16 .L30: { .mfi mov AO1 = A mov f8 = f0 mov pr.rot= 0 } { .mfi add AO2 = LDA, A mov f10 = f0 tbit.z p6, p0 = N, 1 } ;; { .mfi mov BO = BUFFER mov f12 = f0 shr I = MIN_M, 4 } { .mfb adds WPRE = 4 * SIZE, CO mov f14 = f0 (p6) br.cond.dpnt .L40 } ;; { .mmf (p8) LDFD f32 = [AO1], SIZE (p8) LDFD f33 = [AO2], SIZE mov f9 = f0 } { .mfi shladd A = LDA, 1, A mov f11 = f0 mov ar.ec= 2 } ;; { .mmf (p8) LDFD f40 = [BO], 2 * SIZE cmp.eq p6, p0 = 0, I mov f13 = f0 } { .mmf adds RPRE1 = 
RPREFETCH * SIZE, AO1 add I = I, I mov f15 = f0 } ;; { .mmi cmp.eq p16, p0 = r0, r0 adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 adds I = -1, I } ;; { .mfi lfetch.excl.nt1 [WPRE] (p8) FMPY f8 = f40, f32 mov ar.lc = I } { .mfb adds PREB = RPREFETCH * SIZE, BO (p8) FMPY f10 = f40, f33 (p6) br.cond.dpnt .L35 } ;; .align 16 .L32: { .mmf (p17) LDFPD f83, f84 = [AO2], 2 * SIZE (p17) LDFPD f110, f111 = [BO], 2 * SIZE (p17) FMA f8 = f104, f33, f8 } { .mfi nop __LINE__ (p17) FMA f9 = f105, f34, f9 (p16) tbit.nz.unc p14, p15 = I, 0 } ;; { .mmf (p14) PREFETCH [RPRE1], 16 * SIZE (p16) LDFPD f32, f33 = [AO1], 2 * SIZE (p17) FMA f10 = f104, f35, f10 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f11 = f105, f36, f11 } ;; { .mmf (p15) PREFETCH [RPRE2], 16 * SIZE (p16) LDFPD f34, f35 = [AO2], 2 * SIZE (p17) FMA f8 = f106, f49, f8 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f9 = f107, f50, f9 } ;; { .mmf (p14) PREFETCH [PREB], 16 * SIZE (p16) LDFPD f48, f49 = [AO1], 2 * SIZE (p17) FMA f10 = f106, f51, f10 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f11 = f107, f52, f11 } ;; { .mmf (p16) LDFPD f50, f51 = [AO2], 2 * SIZE (p16) LDFPD f103, f104 = [BO], 2 * SIZE (p17) FMA f8 = f108, f65, f8 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f9 = f109, f66, f9 } ;; { .mmf (p16) LDFPD f105, f106 = [BO], 2 * SIZE (p16) LDFPD f64, f65 = [AO1], 2 * SIZE (p17) FMA f10 = f108, f67, f10 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f11 = f109, f68, f11 } ;; { .mmf (p16) LDFPD f66, f67 = [AO2], 2 * SIZE (p16) LDFPD f107, f108 = [BO], 2 * SIZE (p17) FMA f8 = f110, f81, f8 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f9 = f111, f82, f9 } ;; { .mmf (p16) LDFPD f80, f81 = [AO1], 2 * SIZE nop __LINE__ (p17) FMA f10 = f110, f83, f10 } { .mfb adds I = -1, I (p17) FMA f11 = f111, f84, f11 br.ctop.sptk.few .L32 } ;; .align 16 .L35: and I = 15, MIN_M ;; cmp.eq p6, p0 = 0, I (p6) br.cond.dpnt .L38 ;; tbit.nz p12, p0 = MIN_M, 3 tbit.nz p13, p0 = MIN_M, 2 tbit.nz p14, p0 = MIN_M, 1 tbit.nz p15, p0 = MIN_M, 0 ;; (p12) LDFPD f32, f33 = [AO1], 2 * SIZE (p12) LDFPD f34, f35 = [AO2], 2 * SIZE (p12) LDFPD f100, f101 = [BO], 2 * SIZE ;; (p12) LDFPD f36, f37 = [AO1], 2 * SIZE (p12) LDFPD f38, f39 = [AO2], 2 * SIZE (p12) LDFPD f102, f103 = [BO], 2 * SIZE ;; (p12) LDFPD f40, f41 = [AO1], 2 * SIZE (p12) LDFPD f42, f43 = [AO2], 2 * SIZE (p12) LDFPD f104, f105 = [BO], 2 * SIZE ;; (p12) LDFPD f44, f45 = [AO1], 2 * SIZE (p12) LDFPD f46, f47 = [AO2], 2 * SIZE (p12) LDFPD f106, f107 = [BO], 2 * SIZE ;; (p13) LDFPD f48, f49 = [AO1], 2 * SIZE (p13) LDFPD f50, f51 = [AO2], 2 * SIZE (p13) LDFPD f108, f109 = [BO], 2 * SIZE ;; (p13) LDFPD f52, f53 = [AO1], 2 * SIZE (p13) LDFPD f54, f55 = [AO2], 2 * SIZE (p13) LDFPD f110, f111 = [BO], 2 * SIZE ;; (p14) LDFPD f56, f57 = [AO1], 2 * SIZE (p14) LDFPD f58, f59 = [AO2], 2 * SIZE (p14) LDFPD f112, f113 = [BO], 2 * SIZE ;; (p15) LDFD f60 = [AO1] (p15) LDFD f61 = [AO2] (p15) LDFD f114 = [BO] ;; (p12) FMA f8 = f100, f32, f8 (p12) FMA f9 = f101, f33, f9 (p12) FMA f10 = f100, f34, f10 (p12) FMA f11 = f101, f35, f11 ;; (p12) FMA f12 = f102, f36, f12 (p12) FMA f13 = f103, f37, f13 (p12) FMA f14 = f102, f38, f14 (p12) FMA f15 = f103, f39, f15 ;; (p12) FMA f8 = f104, f40, f8 (p12) FMA f9 = f105, f41, f9 (p12) FMA f10 = f104, f42, f10 (p12) FMA f11 = f105, f43, f11 ;; (p12) FMA f12 = f106, f44, f12 (p12) FMA f13 = f107, f45, f13 (p12) FMA f14 = f106, f46, f14 (p12) FMA f15 = f107, f47, f15 ;; (p13) FMA f8 = f108, f48, f8 (p13) FMA f9 = f109, f49, f9 (p13) FMA f10 = f108, f50, f10 (p13) FMA f11 = f109, f51, f11 ;; (p13) FMA f12 = 
f110, f52, f12 (p13) FMA f13 = f111, f53, f13 (p13) FMA f14 = f110, f54, f14 (p13) FMA f15 = f111, f55, f15 ;; (p14) FMA f8 = f112, f56, f8 (p14) FMA f9 = f113, f57, f9 (p14) FMA f10 = f112, f58, f10 (p14) FMA f11 = f113, f59, f11 ;; (p15) FMA f12 = f114, f60, f12 (p15) FMA f14 = f114, f61, f14 ;; .L38: FADD f8 = f8, f9 FADD f10 = f10, f11 FADD f12 = f12, f13 FADD f14 = f14, f15 ;; FADD f8 = f8, f12 FADD f10 = f10, f14 ;; { .mmf mov AO1 = CO LDFD f32 = [CO], INCY } ;; { .mmf LDFD f33 = [CO], INCY nop __LINE__ } ;; { .mmf nop __LINE__ nop __LINE__ FMA f32 = ALPHA, f8, f32 } { .mmf nop __LINE__ nop __LINE__ FMA f33 = ALPHA, f10, f33 } ;; { .mmf STFD [AO1] = f32 add AO1 = AO1, INCY } ;; { .mmf STFD [AO1] = f33 } ;; .align 16 .L40: { .mfi mov AO1 = A mov f8 = f0 shr I = MIN_M, 4 } { .mfi mov BO = BUFFER mov f10 = f0 tbit.z p7, p0 = N, 0 } ;; { .mfi cmp.eq p6, p0 = 0, I mov f12 = f0 mov pr.rot= 0 } { .mfb add I = I, I mov f14 = f0 (p7) br.cond.dpnt .L99 } ;; { .mfi (p8) LDFD f32 = [AO1], SIZE mov f9 = f0 mov ar.ec= 2 } { .mmf (p8) LDFD f40 = [BO], 2 * SIZE add A = A, LDA mov f11 = f0 } ;; { .mmf adds WPRE = 1 * SIZE, CO adds PREB = RPREFETCH * SIZE, BO mov f13 = f0 } { .mmf cmp.eq p16, p0 = r0, r0 adds I = -1, I mov f15 = f0 } ;; { .mfi lfetch.excl.nt1 [WPRE] (p8) FMPY f8 = f40, f32 mov ar.lc = I } { .mmb nop __LINE__ nop __LINE__ (p6) br.cond.dpnt .L45 } ;; .align 16 .L42: { .mmf (p17) LDFPD f81, f82 = [AO1], 2 * SIZE (p17) LDFPD f110, f111 = [BO], 2 * SIZE (p17) FMA f8 = f104, f33, f8 } { .mfi nop __LINE__ (p17) FMA f9 = f105, f34, f9 (p16) tbit.nz.unc p14, p15 = I, 0 } ;; { .mmf (p16) LDFPD f32, f33 = [AO1], 2 * SIZE (p16) LDFPD f103, f104 = [BO], 2 * SIZE (p17) FMA f8 = f106, f49, f8 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f9 = f107, f50, f9 } ;; { .mmf (p16) LDFPD f105, f106 = [BO], 2 * SIZE (p16) LDFPD f48, f49 = [AO1], 2 * SIZE (p17) FMA f8 = f108, f65, f8 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f9 = f109, f66, f9 } ;; { .mmf (p16) LDFPD f64, f65 = [AO1], 2 * SIZE (p16) LDFPD f107, f108 = [BO], 2 * SIZE (p17) FMA f8 = f110, f81, f8 } { .mfb adds I = -1, I (p17) FMA f9 = f111, f82, f9 br.ctop.sptk.few .L42 } ;; .align 16 .L45: and I = 15, MIN_M ;; cmp.eq p6, p0 = 0, I (p6) br.cond.dpnt .L48 ;; tbit.nz p12, p0 = MIN_M, 3 tbit.nz p13, p0 = MIN_M, 2 tbit.nz p14, p0 = MIN_M, 1 tbit.nz p15, p0 = MIN_M, 0 ;; (p12) LDFPD f32, f33 = [AO1], 2 * SIZE (p12) LDFPD f100, f101 = [BO], 2 * SIZE ;; (p12) LDFPD f36, f37 = [AO1], 2 * SIZE (p12) LDFPD f102, f103 = [BO], 2 * SIZE ;; (p12) LDFPD f40, f41 = [AO1], 2 * SIZE (p12) LDFPD f104, f105 = [BO], 2 * SIZE ;; (p12) LDFPD f44, f45 = [AO1], 2 * SIZE (p12) LDFPD f106, f107 = [BO], 2 * SIZE ;; (p13) LDFPD f48, f49 = [AO1], 2 * SIZE (p13) LDFPD f108, f109 = [BO], 2 * SIZE ;; (p13) LDFPD f52, f53 = [AO1], 2 * SIZE (p13) LDFPD f110, f111 = [BO], 2 * SIZE ;; (p14) LDFPD f56, f57 = [AO1], 2 * SIZE (p14) LDFPD f112, f113 = [BO], 2 * SIZE ;; (p15) LDFD f60 = [AO1] (p15) LDFD f114 = [BO] ;; (p12) FMA f8 = f100, f32, f8 (p12) FMA f9 = f101, f33, f9 (p12) FMA f10 = f102, f36, f10 (p12) FMA f11 = f103, f37, f11 (p12) FMA f12 = f104, f40, f12 (p12) FMA f13 = f105, f41, f13 (p12) FMA f14 = f106, f44, f14 (p12) FMA f15 = f107, f45, f15 ;; (p13) FMA f8 = f108, f48, f8 (p13) FMA f9 = f109, f49, f9 (p13) FMA f10 = f110, f52, f10 (p13) FMA f11 = f111, f53, f11 (p14) FMA f12 = f112, f56, f12 (p14) FMA f13 = f113, f57, f13 (p15) FMA f14 = f114, f60, f14 ;; .L48: { .mmf LDFD f32 = [CO] nop __LINE__ FADD f8 = f8, f9 } { .mmf nop __LINE__ nop __LINE__ FADD f10 = f10, f11 
} ;; { .mmf nop __LINE__ nop __LINE__ FADD f12 = f12, f13 } { .mmf nop __LINE__ nop __LINE__ FADD f14 = f14, f15 } ;; { .mmf nop __LINE__ nop __LINE__ FADD f8 = f8, f12 } { .mmf nop __LINE__ nop __LINE__ FADD f10 = f10, f14 } ;; { .mmf nop __LINE__ nop __LINE__ FADD f8 = f8, f10 } ;; { .mmf nop __LINE__ nop __LINE__ FMA f32 = ALPHA, f8, f32 } ;; { .mmf STFD [CO] = f32 } ;; .align 16 .L99: adds IS = P, IS shladd A = LDAP, BASE_SHIFT, A ;; cmp.gt p6, p0 = M, IS (p6) br.cond.dptk .LIs_loop br .L999 .align 4 ;; .L100: shr J = N, 3 mov CO = Y ;; cmp.eq p6, p0 = r0, J (p6) br.cond.dpnt .L120 ;; .align 16 .L111: { .mfi mov AO1 = A mov f8 = f0 mov pr.rot= 0 } { .mfi add AO2 = LDA, A mov f10 = f0 shr I = MIN_M, 4 } ;; { .mmf shladd AO3 = LDA, 1, A shladd AO4 = LDA, 1, AO2 mov f12 = f0 } { .mmf (p8) LDFD f32 = [AO1], SIZE (p8) LDFD f33 = [AO2], SIZE mov f14 = f0 } ;; { .mmf shladd AO5 = LDA, 1, AO3 shladd AO6 = LDA, 1, AO4 mov f16 = f0 } { .mmf (p8) LDFD f34 = [AO3], SIZE (p8) LDFD f35 = [AO4], SIZE mov f18 = f0 } ;; { .mmf shladd AO7 = LDA, 1, AO5 shladd AO8 = LDA, 1, AO6 mov f20 = f0 } { .mmf (p8) LDFD f36 = [AO5], SIZE (p8) LDFD f37 = [AO6], SIZE mov f22 = f0 } ;; { .mfi (p8) LDFD f38 = [AO7], SIZE mov f9 = f0 mov ar.ec= 2 } { .mmf (p8) LDFD f39 = [AO8], SIZE mov BO = BUFFER mov f11 = f0 } ;; { .mmf (p8) LDFD f40 = [BO], 2 * SIZE cmp.eq p6, p0 = 0, I mov f13 = f0 } { .mmf shladd A = LDA, 3, A cmp.eq p16, p0 = r0, r0 mov f15 = f0 } ;; { .mmf add I = I, I nop __LINE__ mov f17 = f0 } { .mmf adds RPRE1 = RPREFETCH * SIZE, AO1 adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 mov f19 = f0 } ;; { .mmf adds I = -1, I nop __LINE__ mov f21 = f0 } { .mmf adds RPRE3 = RPREFETCH * SIZE, AO3 adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 mov f23 = f0 } ;; { .mmf nop __LINE__ nop __LINE__ (p8) FMPY f8 = f40, f32 } { .mmf adds RPRE5 = RPREFETCH * SIZE, AO5 adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6 (p8) FMPY f10 = f40, f33 } ;; { .mmf adds AO21 = 7 * SIZE, AO2 adds AO41 = 7 * SIZE, AO4 (p8) FMPY f12 = f40, f34 } { .mmf adds RPRE7 = RPREFETCH * SIZE, AO7 adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8 (p8) FMPY f14 = f40, f35 } ;; { .mfi nop __LINE__ (p8) FMPY f16 = f40, f36 mov ar.lc = I } { .mmf adds WPRE = 8 * SIZE, CO adds PREB = RPREFETCH * SIZE, BO (p8) FMPY f18 = f40, f37 } ;; { .mmf lfetch.excl.nt1 [WPRE] adds AO61 = 7 * SIZE, AO6 (p8) FMPY f20 = f40, f38 } { .mfb adds AO81 = 7 * SIZE, AO8 (p8) FMPY f22 = f40, f39 (p6) br.cond.dpnt .L115 } ;; .align 16 .L112: { .mmf (p17) LDFPD f80, f95 = [AO8] (p17) LDFPD f110, f111 = [BO], 2 * SIZE (p17) FMA f8 = f104, f33, f8 } { .mfi (p17) adds AO8 = 3 * SIZE, AO8 (p17) FMA f9 = f105, f34, f9 (p16) tbit.nz.unc p14, p15 = I, 0 } ;; { .mmf (p14) PREFETCH [RPRE1], 16 * SIZE (p16) LDFPD f32, f33 = [AO1], 2 * SIZE (p17) FMA f10 = f104, f35, f10 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f11 = f105, f36, f11 } ;; { .mmf (p15) PREFETCH [RPRE2], 16 * SIZE (p16) LDFD f34 = [AO2], 1 * SIZE (p17) FMA f12 = f104, f37, f12 } { .mmf (p17) LDFD f84 = [AO21], 8 * SIZE nop __LINE__ (p17) FMA f13 = f105, f38, f13 } ;; { .mmf (p14) PREFETCH [RPRE3], 16 * SIZE (p16) LDFPD f36, f37 = [AO3], 2 * SIZE (p17) FMA f14 = f104, f39, f14 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f15 = f105, f40, f15 } ;; { .mmf (p15) PREFETCH [RPRE4], 16 * SIZE (p16) LDFD f38 = [AO4], 1 * SIZE (p17) FMA f16 = f104, f41, f16 } { .mmf (p17) LDFD f88 = [AO41], 8 * SIZE nop __LINE__ (p17) FMA f17 = f105, f42, f17 } ;; { .mmf (p14) PREFETCH [RPRE5], 16 * SIZE (p16) LDFPD f40, f41 = [AO5], 2 * SIZE (p17) FMA f18 = f104, f43, f18 } { 
.mmf nop __LINE__ nop __LINE__ (p17) FMA f19 = f105, f44, f19 } ;; { .mmf (p15) PREFETCH [RPRE6], 16 * SIZE (p16) LDFD f42 = [AO6], 1 * SIZE (p17) FMA f20 = f104, f45, f20 } { .mmf (p17) LDFD f92 = [AO61], 8 * SIZE nop __LINE__ (p17) FMA f21 = f105, f46, f21 } ;; { .mmf (p14) PREFETCH [RPRE7], 16 * SIZE (p16) LDFPD f44, f45 = [AO7], 2 * SIZE (p17) FMA f22 = f104, f47, f22 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f23 = f105, f48, f23 } ;; { .mmf (p15) PREFETCH [RPRE8], 16 * SIZE (p16) LDFD f46 = [AO8], 1 * SIZE (p17) FMA f8 = f106, f49, f8 } { .mmf (p17) LDFD f96 = [AO81], 8 * SIZE nop __LINE__ (p17) FMA f9 = f107, f50, f9 } ;; { .mmf (p14) PREFETCH [PREB], 16 * SIZE (p16) LDFPD f48, f49 = [AO1], 2 * SIZE (p17) FMA f10 = f106, f51, f10 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f11 = f107, f52, f11 } ;; { .mmf (p16) LDFPD f35, f50 = [AO2], 2 * SIZE (p16) LDFPD f103, f104 = [BO], 2 * SIZE (p17) FMA f12 = f106, f53, f12 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f13 = f107, f54, f13 } ;; { .mmf (p16) LDFPD f52, f53 = [AO3], 2 * SIZE nop __LINE__ (p17) FMA f14 = f106, f55, f14 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f15 = f107, f56, f15 } ;; { .mmf (p16) LDFPD f39, f54 = [AO4], 2 * SIZE nop __LINE__ (p17) FMA f16 = f106, f57, f16 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f17 = f107, f58, f17 } ;; { .mmf (p16) LDFPD f56, f57 = [AO5], 2 * SIZE nop __LINE__ (p17) FMA f18 = f106, f59, f18 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f19 = f107, f60, f19 } ;; { .mmf (p16) LDFPD f43, f58 = [AO6], 2 * SIZE nop __LINE__ (p17) FMA f20 = f106, f61, f20 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f21 = f107, f62, f21 } ;; { .mmf (p16) LDFPD f60, f61 = [AO7], 2 * SIZE nop __LINE__ (p17) FMA f22 = f106, f63, f22 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f23 = f107, f64, f23 } ;; { .mmf (p16) LDFPD f47, f62 = [AO8], 2 * SIZE (p16) LDFPD f105, f106 = [BO], 2 * SIZE (p17) FMA f8 = f108, f65, f8 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f9 = f109, f66, f9 } ;; { .mmf (p16) LDFPD f64, f65 = [AO1], 2 * SIZE nop __LINE__ (p17) FMA f10 = f108, f67, f10 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f11 = f109, f68, f11 } ;; { .mmf (p16) LDFPD f51, f66 = [AO2], 2 * SIZE nop __LINE__ (p17) FMA f12 = f108, f69, f12 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f13 = f109, f70, f13 } ;; { .mmf (p16) LDFPD f68, f69 = [AO3], 2 * SIZE nop __LINE__ (p17) FMA f14 = f108, f71, f14 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f15 = f109, f72, f15 } ;; { .mmf (p16) LDFPD f55, f70 = [AO4], 2 * SIZE nop __LINE__ (p17) FMA f16 = f108, f73, f16 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f17 = f109, f74, f17 } ;; { .mmf (p16) LDFPD f72, f73 = [AO5], 2 * SIZE nop __LINE__ (p17) FMA f18 = f108, f75, f18 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f19 = f109, f76, f19 } ;; { .mmf (p16) LDFPD f59, f74 = [AO6], 2 * SIZE nop __LINE__ (p17) FMA f20 = f108, f77, f20 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f21 = f109, f78, f21 } ;; { .mmf (p16) LDFPD f76, f77 = [AO7], 2 * SIZE nop __LINE__ (p17) FMA f22 = f108, f79, f22 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f23 = f109, f80, f23 } ;; { .mmf (p16) LDFPD f63, f78 = [AO8], 2 * SIZE (p16) LDFPD f107, f108 = [BO], 2 * SIZE (p17) FMA f8 = f110, f81, f8 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f9 = f111, f82, f9 } ;; { .mmf (p16) LDFPD f80, f81 = [AO1], 2 * SIZE nop __LINE__ (p17) FMA f10 = f110, f83, f10 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f11 = f111, f84, f11 } ;; { .mmf (p16) LDFPD f67, f82 = [AO2] nop __LINE__ (p17) FMA f12 = f110, f85, f12 } { 
.mmf nop __LINE__ (p16) adds AO2 = 3 * SIZE, AO2 (p17) FMA f13 = f111, f86, f13 } ;; { .mmf (p16) LDFPD f84, f85 = [AO3], 2 * SIZE nop __LINE__ (p17) FMA f14 = f110, f87, f14 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f15 = f111, f88, f15 } ;; { .mmf (p16) LDFPD f71, f86 = [AO4] nop __LINE__ (p17) FMA f16 = f110, f89, f16 } { .mmf nop __LINE__ (p16) adds AO4 = 3 * SIZE, AO4 (p17) FMA f17 = f111, f90, f17 } ;; { .mmf (p16) LDFPD f88, f89 = [AO5], 2 * SIZE nop __LINE__ (p17) FMA f18 = f110, f91, f18 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f19 = f111, f92, f19 } ;; { .mmf (p16) LDFPD f75, f90 = [AO6] nop __LINE__ (p17) FMA f20 = f110, f93, f20 } { .mmf nop __LINE__ (p16) adds AO6 = 3 * SIZE, AO6 (p17) FMA f21 = f111, f94, f21 } ;; { .mmf (p16) LDFPD f92, f93 = [AO7], 2 * SIZE nop __LINE__ (p17) FMA f22 = f110, f95, f22 } { .mfb adds I = -1, I (p17) FMA f23 = f111, f96, f23 br.ctop.sptk.few .L112 } ;; .align 16 .L115: and I = 15, MIN_M mov pr.rot= 0 ;; cmp.eq p6, p0 = 0, I cmp.eq p16, p15 = r0, r0 ;; adds I = 1, I ;; shr I = I, 1 ;; adds I = -1, I adds AO21 = 1 * SIZE, AO2 adds AO41 = 1 * SIZE, AO4 adds AO61 = 1 * SIZE, AO6 adds AO81 = 1 * SIZE, AO8 ;; mov ar.lc = I mov ar.ec= 3 and I = 15, MIN_M (p6) br.cond.dpnt .L118 ;; .align 16 .L116: { .mmf (p16) LDFPD f104, f107 = [BO], 2 * SIZE (p16) LDFPD f32, f35 = [AO1], 2 * SIZE (p18) FMA f8 = f106, f34, f8 } { .mmf nop __LINE__ nop __LINE__ (p15) FMA f9 = f109, f37, f9 } ;; { .mmf (p16) LDFD f38 = [AO2], 2 * SIZE (p17) LDFD f42 = [AO21], 2 * SIZE (p18) FMA f10 = f106, f40, f10 } { .mmf nop __LINE__ nop __LINE__ (p15) FMA f11 = f109, f43, f11 } ;; { .mmf (p16) LDFPD f44, f47 = [AO3], 2 * SIZE nop __LINE__ (p18) FMA f12 = f106, f46, f12 } { .mmf nop __LINE__ nop __LINE__ (p15) FMA f13 = f109, f49, f13 } ;; { .mmf (p16) LDFD f50 = [AO4], 2 * SIZE (p17) LDFD f54 = [AO41], 2 * SIZE (p18) FMA f14 = f106, f52, f14 } { .mmf nop __LINE__ nop __LINE__ (p15) FMA f15 = f109, f55, f15 } ;; { .mmf (p16) LDFPD f56, f59 = [AO5], 2 * SIZE nop __LINE__ (p18) FMA f16 = f106, f58, f16 } { .mmf nop __LINE__ nop __LINE__ (p15) FMA f17 = f109, f61, f17 } ;; { .mmf (p16) LDFD f62 = [AO6], 2 * SIZE (p17) LDFD f66 = [AO61], 2 * SIZE (p18) FMA f18 = f106, f64, f18 } { .mmf nop __LINE__ (p17) adds I = -2, I (p15) FMA f19 = f109, f67, f19 } ;; { .mmf (p16) LDFPD f68, f71 = [AO7], 2 * SIZE nop __LINE__ (p18) FMA f20 = f106, f70, f20 } { .mmf nop __LINE__ nop __LINE__ (p15) FMA f21 = f109, f73, f21 } ;; { .mmf (p16) LDFD f74 = [AO8], 2 * SIZE (p17) LDFD f78 = [AO81], 2 * SIZE (p15) FMA f23 = f109, f79, f23 } { .mfb (p17) cmp.ne.unc p15, p0 = -1, I (p18) FMA f22 = f106, f76, f22 br.ctop.sptk.few .L116 } ;; .L118: { .mmf mov AO1 = CO LDFD f32 = [CO], INCY FADD f8 = f8, f9 } ;; { .mmf LDFD f33 = [CO], INCY nop __LINE__ FADD f10 = f10, f11 } ;; { .mmf LDFD f34 = [CO], INCY nop __LINE__ FADD f12 = f12, f13 } ;; { .mmf LDFD f35 = [CO], INCY nop __LINE__ FADD f14 = f14, f15 } ;; { .mmf LDFD f36 = [CO], INCY nop __LINE__ FADD f16 = f16, f17 } ;; { .mmf LDFD f37 = [CO], INCY nop __LINE__ FADD f18 = f18, f19 } ;; { .mmf LDFD f38 = [CO], INCY nop __LINE__ FADD f20 = f20, f21 } ;; { .mmf LDFD f39 = [CO], INCY nop __LINE__ FADD f22 = f22, f23 } ;; { .mmf nop __LINE__ nop __LINE__ FMA f32 = ALPHA, f8, f32 } { .mmf nop __LINE__ nop __LINE__ FMA f33 = ALPHA, f10, f33 } { .mmf nop __LINE__ nop __LINE__ FMA f34 = ALPHA, f12, f34 } { .mmf nop __LINE__ nop __LINE__ FMA f35 = ALPHA, f14, f35 } ;; { .mmf STFD [AO1] = f32 add AO1 = AO1, INCY FMA f36 = ALPHA, f16, f36 } ;; { .mmf STFD [AO1] = 
f33 add AO1 = AO1, INCY FMA f37 = ALPHA, f18, f37 } ;; { .mmf STFD [AO1] = f34 add AO1 = AO1, INCY FMA f38 = ALPHA, f20, f38 } ;; { .mmf STFD [AO1] = f35 add AO1 = AO1, INCY FMA f39 = ALPHA, f22, f39 } ;; { .mmi STFD [AO1] = f36 add AO1 = AO1, INCY adds J = -1, J } ;; { .mmi STFD [AO1] = f37 add AO1 = AO1, INCY nop __LINE__ } ;; { .mmi STFD [AO1] = f38 add AO1 = AO1, INCY cmp4.lt p6, p0 = 0, J } ;; { .mib STFD [AO1] = f39 add AO1 = AO1, INCY (p6) br.cond.dptk .L111 } ;; .align 16 .L120: { .mfi mov AO1 = A mov f8 = f0 mov pr.rot= 0 } { .mfi add AO2 = LDA, A mov f10 = f0 tbit.z p6, p0 = N, 2 } ;; { .mfi shladd AO3 = LDA, 1, A mov f12 = f0 shr I = MIN_M, 4 } { .mfb shladd AO4 = LDA, 1, AO2 mov f14 = f0 (p6) br.cond.dpnt .L130 } ;; { .mmf (p8) LDFD f32 = [AO1], SIZE (p8) LDFD f33 = [AO2], SIZE mov f9 = f0 } { .mmf mov BO = BUFFER shladd A = LDA, 2, A mov f11 = f0 } ;; { .mmf (p8) LDFD f40 = [BO], 2 * SIZE cmp.eq p6, p0 = 0, I mov f13 = f0 } { .mmf (p8) LDFD f34 = [AO3], SIZE (p8) LDFD f35 = [AO4], SIZE mov f15 = f0 } ;; { .mmi adds RPRE1 = RPREFETCH * SIZE, AO1 adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 mov ar.ec= 2 } { .mmi cmp.eq p16, p0 = r0, r0 add I = I, I adds AO21 = 7 * SIZE, AO2 } ;; { .mmf adds WPRE = 4 * SIZE, CO adds PREB = RPREFETCH * SIZE, BO (p8) FMPY f8 = f40, f32 } { .mmf adds RPRE3 = RPREFETCH * SIZE, AO3 adds I = -1, I (p8) FMPY f10 = f40, f33 } ;; { .mfi adds AO41 = 7 * SIZE, AO4 (p8) FMPY f12 = f40, f34 mov ar.lc = I } { .mfb adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 (p8) FMPY f14 = f40, f35 (p6) br.cond.dpnt .L125 } ;; .align 16 .L122: { .mmf (p17) LDFPD f72, f87 = [AO4] (p17) LDFPD f110, f111 = [BO], 2 * SIZE (p17) FMA f8 = f104, f33, f8 } { .mfi (p17) adds AO4 = 3 * SIZE, AO4 (p17) FMA f9 = f105, f34, f9 (p16) tbit.nz.unc p14, p15 = I, 0 } ;; { .mmf (p14) PREFETCH [RPRE1], 16 * SIZE (p16) LDFPD f32, f33 = [AO1], 2 * SIZE (p17) FMA f10 = f104, f35, f10 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f11 = f105, f36, f11 } ;; { .mmf (p15) PREFETCH [RPRE2], 16 * SIZE (p16) LDFD f34 = [AO2], 1 * SIZE (p17) FMA f12 = f104, f37, f12 } { .mmf (p17) LDFD f84 = [AO21], 8 * SIZE nop __LINE__ (p17) FMA f13 = f105, f38, f13 } ;; { .mmf (p14) PREFETCH [RPRE3], 16 * SIZE (p16) LDFPD f36, f37 = [AO3], 2 * SIZE (p17) FMA f14 = f104, f39, f14 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f15 = f105, f40, f15 } ;; { .mmf (p15) PREFETCH [RPRE4], 16 * SIZE (p16) LDFD f38 = [AO4], 1 * SIZE (p17) FMA f8 = f106, f49, f8 } { .mmf (p17) LDFD f88 = [AO41], 8 * SIZE nop __LINE__ (p17) FMA f9 = f107, f50, f9 } ;; { .mmf (p14) PREFETCH [PREB], 16 * SIZE (p16) LDFPD f48, f49 = [AO1], 2 * SIZE (p17) FMA f10 = f106, f51, f10 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f11 = f107, f52, f11 } ;; { .mmf (p16) LDFPD f35, f50 = [AO2], 2 * SIZE (p16) LDFPD f103, f104 = [BO], 2 * SIZE (p17) FMA f12 = f106, f53, f12 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f13 = f107, f54, f13 } ;; { .mmf (p16) LDFPD f52, f53 = [AO3], 2 * SIZE nop __LINE__ (p17) FMA f14 = f106, f55, f14 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f15 = f107, f56, f15 } ;; { .mmf (p16) LDFPD f39, f54 = [AO4], 2 * SIZE (p16) LDFPD f105, f106 = [BO], 2 * SIZE (p17) FMA f8 = f108, f65, f8 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f9 = f109, f66, f9 } ;; { .mmf (p16) LDFPD f64, f65 = [AO1], 2 * SIZE nop __LINE__ (p17) FMA f10 = f108, f67, f10 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f11 = f109, f68, f11 } ;; { .mmf (p16) LDFPD f51, f66 = [AO2], 2 * SIZE nop __LINE__ (p17) FMA f12 = f108, f69, f12 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA 
f13 = f109, f70, f13 } ;; { .mmf (p16) LDFPD f68, f69 = [AO3], 2 * SIZE nop __LINE__ (p17) FMA f14 = f108, f71, f14 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f15 = f109, f72, f15 } ;; { .mmf (p16) LDFPD f55, f70 = [AO4], 2 * SIZE (p16) LDFPD f107, f108 = [BO], 2 * SIZE (p17) FMA f8 = f110, f81, f8 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f9 = f111, f82, f9 } ;; { .mmf (p16) LDFPD f80, f81 = [AO1], 2 * SIZE nop __LINE__ (p17) FMA f10 = f110, f83, f10 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f11 = f111, f84, f11 } ;; { .mmf (p16) LDFPD f67, f82 = [AO2] nop __LINE__ (p17) FMA f12 = f110, f85, f12 } { .mmf nop __LINE__ (p16) adds AO2 = 3 * SIZE, AO2 (p17) FMA f13 = f111, f86, f13 } ;; { .mmf (p16) LDFPD f84, f85 = [AO3], 2 * SIZE nop __LINE__ (p17) FMA f14 = f110, f87, f14 } { .mfb adds I = -1, I (p17) FMA f15 = f111, f88, f15 br.ctop.sptk.few .L122 } ;; .align 16 .L125: and I = 15, MIN_M mov pr.rot= 0 ;; cmp.eq p6, p0 = 0, I cmp.eq p16, p15 = r0, r0 ;; adds I = 1, I adds AO21 = 1 * SIZE, AO2 adds AO41 = 1 * SIZE, AO4 ;; shr I = I, 1 ;; adds I = -1, I ;; mov ar.lc = I mov ar.ec= 3 and I = 15, MIN_M (p6) br.cond.dpnt .L128 ;; .align 16 .L126: { .mmf (p16) LDFPD f104, f107 = [BO], 2 * SIZE (p16) LDFPD f32, f35 = [AO1], 2 * SIZE (p18) FMA f8 = f106, f34, f8 } { .mmf nop __LINE__ nop __LINE__ (p15) FMA f9 = f109, f37, f9 } ;; { .mmf (p17) LDFD f42 = [AO21], 2 * SIZE (p16) LDFD f38 = [AO2], 2 * SIZE (p18) FMA f10 = f106, f40, f10 } { .mmf nop __LINE__ nop __LINE__ (p15) FMA f11 = f109, f43, f11 } ;; { .mmf (p16) LDFPD f44, f47 = [AO3], 2 * SIZE nop __LINE__ (p18) FMA f12 = f106, f46, f12 } { .mmf nop __LINE__ (p17) adds I = -2, I (p15) FMA f13 = f109, f49, f13 } ;; { .mmf (p17) LDFD f54 = [AO41], 2 * SIZE (p16) LDFD f50 = [AO4], 2 * SIZE (p15) FMA f15 = f109, f55, f15 } { .mfb (p17) cmp.ne.unc p15, p0 = -1, I (p18) FMA f14 = f106, f52, f14 br.ctop.sptk.few .L126 } ;; .L128: { .mmf mov AO1 = CO LDFD f32 = [CO], INCY FADD f8 = f8, f9 } ;; { .mmf LDFD f33 = [CO], INCY nop __LINE__ FADD f10 = f10, f11 } ;; { .mmf LDFD f34 = [CO], INCY nop __LINE__ FADD f12 = f12, f13 } ;; { .mmf LDFD f35 = [CO], INCY nop __LINE__ FADD f14 = f14, f15 } ;; { .mmf nop __LINE__ nop __LINE__ FMA f32 = ALPHA, f8, f32 } { .mmf nop __LINE__ nop __LINE__ FMA f33 = ALPHA, f10, f33 } { .mmf nop __LINE__ nop __LINE__ FMA f34 = ALPHA, f12, f34 } { .mmf nop __LINE__ nop __LINE__ FMA f35 = ALPHA, f14, f35 } ;; { .mmf STFD [AO1] = f32 add AO1 = AO1, INCY } ;; { .mmf STFD [AO1] = f33 add AO1 = AO1, INCY } ;; { .mmf STFD [AO1] = f34 add AO1 = AO1, INCY } ;; { .mmf STFD [AO1] = f35 add AO1 = AO1, INCY } ;; .align 16 .L130: { .mfi mov AO1 = A mov f8 = f0 mov pr.rot= 0 } { .mfi add AO2 = LDA, A mov f10 = f0 tbit.z p6, p0 = N, 1 } ;; { .mfi mov BO = BUFFER mov f12 = f0 shr I = MIN_M, 4 } { .mfb adds WPRE = 4 * SIZE, CO mov f14 = f0 (p6) br.cond.dpnt .L140 } ;; { .mmf (p8) LDFD f32 = [AO1], SIZE (p8) LDFD f33 = [AO2], SIZE mov f9 = f0 } { .mfi shladd A = LDA, 1, A mov f11 = f0 mov ar.ec= 2 } ;; { .mmf (p8) LDFD f40 = [BO], 2 * SIZE cmp.eq p6, p0 = 0, I mov f13 = f0 } { .mmf adds RPRE1 = RPREFETCH * SIZE, AO1 add I = I, I mov f15 = f0 } ;; { .mmi cmp.eq p16, p0 = r0, r0 adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 adds I = -1, I } ;; { .mfi adds AO21 = 7 * SIZE, AO2 (p8) FMPY f8 = f40, f32 mov ar.lc = I } { .mfb adds PREB = RPREFETCH * SIZE, BO (p8) FMPY f10 = f40, f33 (p6) br.cond.dpnt .L135 } ;; .align 16 .L132: { .mmf (p17) LDFPD f68, f83 = [AO2] (p17) LDFPD f110, f111 = [BO], 2 * SIZE (p17) FMA f8 = f104, f33, f8 } { .mfi (p17) 
adds AO2 = 3 * SIZE, AO2 (p17) FMA f9 = f105, f34, f9 (p16) tbit.nz.unc p14, p15 = I, 0 } ;; { .mmf (p14) PREFETCH [RPRE1], 16 * SIZE (p16) LDFPD f32, f33 = [AO1], 2 * SIZE (p17) FMA f10 = f104, f35, f10 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f11 = f105, f36, f11 } ;; { .mmf (p15) PREFETCH [RPRE2], 16 * SIZE (p16) LDFD f34 = [AO2], 1 * SIZE (p17) FMA f8 = f106, f49, f8 } { .mmf (p17) LDFD f84 = [AO21], 8 * SIZE nop __LINE__ (p17) FMA f9 = f107, f50, f9 } ;; { .mmf (p14) PREFETCH [PREB], 16 * SIZE (p16) LDFPD f48, f49 = [AO1], 2 * SIZE (p17) FMA f10 = f106, f51, f10 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f11 = f107, f52, f11 } ;; { .mmf (p16) LDFPD f35, f50 = [AO2], 2 * SIZE (p16) LDFPD f103, f104 = [BO], 2 * SIZE (p17) FMA f8 = f108, f65, f8 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f9 = f109, f66, f9 } ;; { .mmf (p16) LDFPD f105, f106 = [BO], 2 * SIZE (p16) LDFPD f64, f65 = [AO1], 2 * SIZE (p17) FMA f10 = f108, f67, f10 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f11 = f109, f68, f11 } ;; { .mmf (p16) LDFPD f51, f66 = [AO2], 2 * SIZE (p16) LDFPD f107, f108 = [BO], 2 * SIZE (p17) FMA f8 = f110, f81, f8 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f9 = f111, f82, f9 } ;; { .mmf (p16) LDFPD f80, f81 = [AO1], 2 * SIZE nop __LINE__ (p17) FMA f10 = f110, f83, f10 } { .mfb adds I = -1, I (p17) FMA f11 = f111, f84, f11 br.ctop.sptk.few .L132 } ;; .align 16 .L135: and I = 15, MIN_M ;; cmp.eq p6, p0 = 0, I (p6) br.cond.dpnt .L138 ;; tbit.nz p12, p0 = MIN_M, 3 tbit.nz p13, p0 = MIN_M, 2 tbit.nz p14, p0 = MIN_M, 1 tbit.nz p15, p0 = MIN_M, 0 ;; (p12) LDFPD f100, f101 = [BO], 2 * SIZE (p12) LDFPD f32, f33 = [AO1], 2 * SIZE (p12) LDFD f34 = [AO2], 1 * SIZE ;; (p12) LDFPD f36, f37 = [AO1], 2 * SIZE (p12) LDFPD f35, f38 = [AO2], 2 * SIZE ;; (p12) LDFPD f102, f103 = [BO], 2 * SIZE (p12) LDFPD f39, f42 = [AO2], 2 * SIZE ;; (p12) LDFPD f40, f41 = [AO1], 2 * SIZE (p12) LDFPD f43, f46 = [AO2], 2 * SIZE ;; (p12) LDFPD f104, f105 = [BO], 2 * SIZE (p12) LDFPD f44, f45 = [AO1], 2 * SIZE (p12) LDFD f47 = [AO2], 1 * SIZE ;; (p12) LDFPD f106, f107 = [BO], 2 * SIZE (p13) LDFD f50 = [AO2], 1 * SIZE (p13) LDFPD f48, f49 = [AO1], 2 * SIZE ;; (p13) LDFPD f108, f109 = [BO], 2 * SIZE (p13) LDFPD f51, f54 = [AO2], 2 * SIZE ;; (p13) LDFPD f110, f111 = [BO], 2 * SIZE (p13) LDFPD f52, f53 = [AO1], 2 * SIZE (p13) LDFD f55 = [AO2], 1 * SIZE ;; (p14) LDFPD f56, f57 = [AO1], 2 * SIZE (p14) LDFD f58 = [AO2], 1 * SIZE ;; (p14) LDFPD f112, f113 = [BO], 2 * SIZE (p15) LDFD f60 = [AO1] (p14) LDFD f59 = [AO2], 1 * SIZE ;; (p15) LDFD f61 = [AO2] (p15) LDFD f114 = [BO] ;; (p12) FMA f8 = f100, f32, f8 (p12) FMA f9 = f101, f33, f9 (p12) FMA f10 = f100, f34, f10 (p12) FMA f11 = f101, f35, f11 ;; (p12) FMA f12 = f102, f36, f12 (p12) FMA f13 = f103, f37, f13 (p12) FMA f14 = f102, f38, f14 (p12) FMA f15 = f103, f39, f15 ;; (p12) FMA f8 = f104, f40, f8 (p12) FMA f9 = f105, f41, f9 (p12) FMA f10 = f104, f42, f10 (p12) FMA f11 = f105, f43, f11 ;; (p12) FMA f12 = f106, f44, f12 (p12) FMA f13 = f107, f45, f13 (p12) FMA f14 = f106, f46, f14 (p12) FMA f15 = f107, f47, f15 ;; (p13) FMA f8 = f108, f48, f8 (p13) FMA f9 = f109, f49, f9 (p13) FMA f10 = f108, f50, f10 (p13) FMA f11 = f109, f51, f11 ;; (p13) FMA f12 = f110, f52, f12 (p13) FMA f13 = f111, f53, f13 (p13) FMA f14 = f110, f54, f14 (p13) FMA f15 = f111, f55, f15 ;; (p14) FMA f8 = f112, f56, f8 (p14) FMA f9 = f113, f57, f9 (p14) FMA f10 = f112, f58, f10 (p14) FMA f11 = f113, f59, f11 ;; (p15) FMA f12 = f114, f60, f12 (p15) FMA f14 = f114, f61, f14 ;; .L138: FADD f8 = f8, f9 FADD f10 
= f10, f11 FADD f12 = f12, f13 FADD f14 = f14, f15 ;; FADD f8 = f8, f12 FADD f10 = f10, f14 ;; { .mmf mov AO1 = CO LDFD f32 = [CO], INCY } ;; { .mmf LDFD f33 = [CO], INCY nop __LINE__ } ;; { .mmf nop __LINE__ nop __LINE__ FMA f32 = ALPHA, f8, f32 } { .mmf nop __LINE__ nop __LINE__ FMA f33 = ALPHA, f10, f33 } ;; { .mmf STFD [AO1] = f32 add AO1 = AO1, INCY } ;; { .mmf STFD [AO1] = f33 } ;; .align 16 .L140: { .mfi mov AO1 = A mov f8 = f0 shr I = MIN_M, 4 } { .mfi mov BO = BUFFER mov f10 = f0 tbit.z p7, p0 = N, 0 } ;; { .mfi cmp.eq p6, p0 = 0, I mov f12 = f0 mov pr.rot= 0 } { .mfb add I = I, I mov f14 = f0 (p7) br.cond.dpnt .L199 } ;; { .mfi (p8) LDFD f32 = [AO1], SIZE mov f9 = f0 mov ar.ec= 2 } { .mmf (p8) LDFD f40 = [BO], 2 * SIZE add A = A, LDA mov f11 = f0 } ;; { .mmf adds WPRE = 1 * SIZE, CO adds PREB = RPREFETCH * SIZE, BO mov f13 = f0 } { .mmf cmp.eq p16, p0 = r0, r0 adds I = -1, I mov f15 = f0 } ;; { .mfi lfetch.excl.nt1 [WPRE] (p8) FMPY f8 = f40, f32 mov ar.lc = I } { .mmb nop __LINE__ nop __LINE__ (p6) br.cond.dpnt .L145 } ;; .align 16 .L142: { .mmf (p17) LDFPD f81, f82 = [AO1], 2 * SIZE (p17) LDFPD f110, f111 = [BO], 2 * SIZE (p17) FMA f8 = f104, f33, f8 } { .mfi nop __LINE__ (p17) FMA f9 = f105, f34, f9 (p16) tbit.nz.unc p14, p15 = I, 0 } ;; { .mmf (p16) LDFPD f32, f33 = [AO1], 2 * SIZE (p16) LDFPD f103, f104 = [BO], 2 * SIZE (p17) FMA f8 = f106, f49, f8 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f9 = f107, f50, f9 } ;; { .mmf (p16) LDFPD f105, f106 = [BO], 2 * SIZE (p16) LDFPD f48, f49 = [AO1], 2 * SIZE (p17) FMA f8 = f108, f65, f8 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f9 = f109, f66, f9 } ;; { .mmf (p16) LDFPD f64, f65 = [AO1], 2 * SIZE (p16) LDFPD f107, f108 = [BO], 2 * SIZE (p17) FMA f8 = f110, f81, f8 } { .mfb adds I = -1, I (p17) FMA f9 = f111, f82, f9 br.ctop.sptk.few .L142 } ;; .align 16 .L145: and I = 15, MIN_M ;; cmp.eq p6, p0 = 0, I (p6) br.cond.dpnt .L148 ;; tbit.nz p12, p0 = MIN_M, 3 tbit.nz p13, p0 = MIN_M, 2 tbit.nz p14, p0 = MIN_M, 1 tbit.nz p15, p0 = MIN_M, 0 ;; (p12) LDFPD f32, f33 = [AO1], 2 * SIZE (p12) LDFPD f100, f101 = [BO], 2 * SIZE ;; (p12) LDFPD f36, f37 = [AO1], 2 * SIZE (p12) LDFPD f102, f103 = [BO], 2 * SIZE ;; (p12) LDFPD f40, f41 = [AO1], 2 * SIZE (p12) LDFPD f104, f105 = [BO], 2 * SIZE ;; (p12) LDFPD f44, f45 = [AO1], 2 * SIZE (p12) LDFPD f106, f107 = [BO], 2 * SIZE ;; (p13) LDFPD f48, f49 = [AO1], 2 * SIZE (p13) LDFPD f108, f109 = [BO], 2 * SIZE ;; (p13) LDFPD f52, f53 = [AO1], 2 * SIZE (p13) LDFPD f110, f111 = [BO], 2 * SIZE ;; (p14) LDFPD f56, f57 = [AO1], 2 * SIZE (p14) LDFPD f112, f113 = [BO], 2 * SIZE ;; (p15) LDFD f60 = [AO1] (p15) LDFD f114 = [BO] ;; (p12) FMA f8 = f100, f32, f8 (p12) FMA f9 = f101, f33, f9 (p12) FMA f10 = f102, f36, f10 (p12) FMA f11 = f103, f37, f11 (p12) FMA f12 = f104, f40, f12 (p12) FMA f13 = f105, f41, f13 (p12) FMA f14 = f106, f44, f14 (p12) FMA f15 = f107, f45, f15 ;; (p13) FMA f8 = f108, f48, f8 (p13) FMA f9 = f109, f49, f9 (p13) FMA f10 = f110, f52, f10 (p13) FMA f11 = f111, f53, f11 (p14) FMA f12 = f112, f56, f12 (p14) FMA f13 = f113, f57, f13 (p15) FMA f14 = f114, f60, f14 ;; .L148: { .mmf LDFD f32 = [CO] nop __LINE__ FADD f8 = f8, f9 } { .mmf nop __LINE__ nop __LINE__ FADD f10 = f10, f11 } ;; { .mmf nop __LINE__ nop __LINE__ FADD f12 = f12, f13 } { .mmf nop __LINE__ nop __LINE__ FADD f14 = f14, f15 } ;; { .mmf nop __LINE__ nop __LINE__ FADD f8 = f8, f12 } { .mmf nop __LINE__ nop __LINE__ FADD f10 = f10, f14 } ;; { .mmf nop __LINE__ nop __LINE__ FADD f8 = f8, f10 } ;; { .mmf nop __LINE__ nop __LINE__ FMA 
f32 = ALPHA, f8, f32 } ;; { .mmf STFD [CO] = f32 nop __LINE__ nop __LINE__ } ;; .align 16 .L199: adds IS = P, IS shladd A = LDAP, BASE_SHIFT, A ;; cmp.gt p6, p0 = M, IS (p6) br.cond.dptk .LIs_loop .align 4 ;; .L999: mov r8 = r0 adds r9 = 1 * 16, SP ;; ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 mov ar.lc = ARLC ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 mov pr = PR, -1 ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 mov ar.pfs = ARPFS ;; ldf.fill f22 = [SP], 32 ldf.fill f23 = [r9] br.ret.sptk.many b0 ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/iamax.S000066400000000000000000000275341313527062700164250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef XDOUBLE #define PREFETCH_SIZE ( 8 * 16 + 4) #elif defined(DOUBLE) #define PREFETCH_SIZE (16 * 16 + 8) #else #define PREFETCH_SIZE (32 * 16 + 16) #endif #if !defined(USE_MIN) && defined(USE_ABS) #define FMAX famax #elif !defined(USE_MIN) && !defined(USE_ABS) #define FMAX fmax #elif defined(USE_MIN) && defined(USE_ABS) #define FMAX famin #else #define FMAX fmin #endif #define IMAX1 r8 #define IMAX2 r26 #define IMAX3 r27 #define IMAX4 r28 #define PRE1 r2 #define N r14 #define X1 r15 #define INCX r16 #define I r17 #define X2 r18 #define INCX5 r19 #define INCX16 r20 #define CURRENT r21 #define DMAX1 f8 #define DMAX2 f9 #define DMAX3 f10 #define DMAX4 f11 #define DMAX5 f12 #define DMAX6 f13 #define DMAX7 f14 #define DMAX8 f15 #define PR r30 #define ARLC r31 PROLOGUE .prologue PROFCODE { .mmi mov IMAX1 = 0 .save ar.lc, ARLC mov ARLC = ar.lc } ;; .body #ifdef F_INTERFACE { .mmi LDINT N = [r32] LDINT INCX = [r34] mov X1 = r33 } ;; #ifndef USE64BITINT { .mii nop.m 0 sxt4 N = N sxt4 INCX = INCX } ;; #endif #else { .mmi mov N = r32 mov X1 = r33 mov INCX = r34 } ;; #endif { .mii mov PR = pr cmp.ge p6, p0 = 0, INCX } { .mbb cmp.ge p8, p0 = 0, N (p8) br.ret.sptk.many b0 (p6) br.ret.sptk.many b0 } ;; { .mmi LDFD DMAX1 = [X1] shladd INCX = INCX, BASE_SHIFT, r0 mov pr.rot= 0 } ;; mov IMAX1 = 1 mov IMAX2 = 1 mov IMAX3 = 1 mov IMAX4 = 1 mov CURRENT = 1 adds N = -1, N ;; { .mmf add X1 = X1, INCX mov DMAX2 = DMAX1 } ;; { .mmf shladd X2 = INCX, 2, X1 } { .mfi cmp.eq p16, p0 = r0, r0 shr I = N, 4 } ;; { .mfi shladd INCX5 = INCX, 2, INCX mov DMAX3 = DMAX1 mov ar.ec= 4 } { .mmf #ifdef XDOUBLE shladd INCX16= INCX, 3, r0 #else shladd INCX16= INCX, 4, r0 #endif adds I = -1, I } ;; tbit.z p0, p7 = N, 3 ;; { .mfi adds PRE1 = PREFETCH_SIZE * SIZE, X1 mov DMAX4 = DMAX1 mov ar.lc = I } { .mfb cmp.eq p6 ,p0 = -1, I (p6) br.cond.dpnt .L15 } .align 32 ;; .L10: { .mmf (p16) lfetch.nt1 [PRE1], INCX16 (p16) LDFD f32 = [X1], INCX (p19) fcmp.neq.unc p12, p0 = DMAX1, DMAX5 } { .mmf (p8) adds IMAX1 = 1, CURRENT nop __LINE__ (p19) FMAX DMAX5 = f67, DMAX1 } ;; { .mmf (p16) LDFD f36 = [X1], INCX nop __LINE__ (p19) fcmp.neq.unc p13, p0 = DMAX2, DMAX6 } { .mmf (p9) adds IMAX2 = 2, CURRENT nop __LINE__ (p19) FMAX DMAX6 = f71, DMAX2 } ;; { .mmf (p16) LDFD f40 = [X1], INCX nop __LINE__ (p19) fcmp.neq.unc p14, p0 = DMAX3, DMAX7 } { .mmf (p10) adds IMAX3 = 3, CURRENT nop __LINE__ (p19) FMAX DMAX7 = f75, DMAX3 } ;; { .mmf (p16) LDFD f44 = [X1], INCX nop __LINE__ (p19) fcmp.neq.unc p15, p0 = DMAX4, DMAX8 } { .mmf (p11) adds IMAX4 = 4, CURRENT nop __LINE__ (p19) FMAX DMAX8 = f79, DMAX4 } ;; { .mmf (p16) LDFD f48 = [X1], INCX nop __LINE__ (p19) fcmp.neq.unc p8, p0 = DMAX1, DMAX5 } { .mmf (p12) adds IMAX1 = 5, CURRENT nop __LINE__ (p19) FMAX DMAX1 = f83, DMAX5 } ;; { .mmf (p16) LDFD f52 = [X1], INCX nop __LINE__ (p19) fcmp.neq.unc p9, p0 = DMAX2, DMAX6 } { .mmf (p13) adds IMAX2 = 6, CURRENT nop __LINE__ (p19) FMAX DMAX2 = f87, DMAX6 } ;; { .mmf (p16) LDFD f56 = [X1], INCX nop __LINE__ (p19) fcmp.neq.unc p10, p0 = DMAX3, DMAX7 } { .mmf (p14) adds IMAX3 = 7, CURRENT nop __LINE__ (p19) FMAX DMAX3 = f91, DMAX7 } ;; { .mmf (p16) LDFD f60 = [X1], INCX nop __LINE__ (p19) fcmp.neq.unc p11, p0 = DMAX4, DMAX8 } { .mmf (p15) adds IMAX4 = 8, CURRENT nop __LINE__ (p19) FMAX DMAX4 = f95, DMAX8 } ;; { .mmf #ifdef XDOUBLE (p16) lfetch.nt1 [PRE1], INCX16 #endif (p16) LDFD f64 = [X1], INCX #ifndef XDOUBLE nop __LINE__ #endif (p19) fcmp.neq.unc 
p12, p0 = DMAX1, DMAX5 } { .mmf (p8) adds IMAX1 = 9, CURRENT nop __LINE__ (p18) FMAX DMAX5 = f34, DMAX1 } ;; { .mmf (p16) LDFD f68 = [X1], INCX nop __LINE__ (p19) fcmp.neq.unc p13, p0 = DMAX2, DMAX6 } { .mmf (p9) adds IMAX2 = 10, CURRENT nop __LINE__ (p18) FMAX DMAX6 = f38, DMAX2 } ;; { .mmf (p16) LDFD f72 = [X1], INCX nop __LINE__ (p19) fcmp.neq.unc p14, p0 = DMAX3, DMAX7 } { .mmf (p10) adds IMAX3 = 11, CURRENT nop __LINE__ (p18) FMAX DMAX7 = f42, DMAX3 } ;; { .mmf (p16) LDFD f76 = [X1], INCX nop __LINE__ (p19) fcmp.neq.unc p15, p0 = DMAX4, DMAX8 } { .mmf (p11) adds IMAX4 = 12, CURRENT nop __LINE__ (p18) FMAX DMAX8 = f46, DMAX4 } ;; { .mmf (p16) LDFD f80 = [X1], INCX nop __LINE__ (p18) fcmp.neq.unc p8, p0 = DMAX1, DMAX5 } { .mmf (p12) adds IMAX1 = 13, CURRENT nop __LINE__ (p18) FMAX DMAX1 = f50, DMAX5 } ;; { .mmf (p16) LDFD f84 = [X1], INCX nop __LINE__ (p18) fcmp.neq.unc p9, p0 = DMAX2, DMAX6 } { .mmf (p13) adds IMAX2 = 14, CURRENT nop __LINE__ (p18) FMAX DMAX2 = f54, DMAX6 } ;; { .mmf (p16) LDFD f88 = [X1], INCX nop __LINE__ (p18) fcmp.neq.unc p10, p0 = DMAX3, DMAX7 } { .mmf (p14) adds IMAX3 = 15, CURRENT nop __LINE__ (p18) FMAX DMAX3 = f58, DMAX7 } ;; { .mmf (p16) LDFD f92 = [X1], INCX (p15) adds IMAX4 = 16, CURRENT (p18) fcmp.neq.unc p11, p0 = DMAX4, DMAX8 } { .mfb (p19) adds CURRENT = 16, CURRENT (p18) FMAX DMAX4 = f62, DMAX8 br.ctop.sptk.few .L10 } ;; .align 32 .L15: { .mmi (p7) LDFD f32 = [X1], INCX and I = 15, N cmp.ne p14, p0 = r0, r0 } ;; { .mmb (p7) LDFD f33 = [X1], INCX cmp.eq p6, p0 = 0, I (p6) br.cond.dptk .L999 } ;; { .mmi (p7) LDFD f34 = [X1], INCX ;; (p7) LDFD f35 = [X1], INCX tbit.z p0, p13 = N, 2 } ;; { .mmi (p7) LDFD f36 = [X1], INCX ;; (p7) LDFD f37 = [X1], INCX tbit.z p0, p14 = N, 1 } ;; { .mfi (p7) LDFD f38 = [X1], INCX (p7) FMAX DMAX5 = f32, DMAX1 tbit.z p0, p15 = N, 0 } ;; { .mmf (p7) LDFD f39 = [X1], INCX nop __LINE__ (p7) FMAX DMAX6 = f33, DMAX2 } ;; { .mmf (p13) LDFD f40 = [X1], INCX nop __LINE__ (p7) FMAX DMAX7 = f34, DMAX3 } ;; { .mmf (p13) LDFD f41 = [X1], INCX nop __LINE__ (p7) FMAX DMAX8 = f35, DMAX4 } ;; { .mmf (p13) LDFD f42 = [X1], INCX nop __LINE__ (p7) fcmp.neq.unc p8, p0 = DMAX1, DMAX5 } { .mmf nop __LINE__ nop __LINE__ (p7) FMAX DMAX1 = f36, DMAX5 } ;; { .mmf (p13) LDFD f43 = [X1], INCX nop __LINE__ (p7) fcmp.neq.unc p9, p0 = DMAX2, DMAX6 } { .mmf nop __LINE__ nop __LINE__ (p7) FMAX DMAX2 = f37, DMAX6 } ;; { .mmf (p14) LDFD f44 = [X1], INCX nop __LINE__ (p7) fcmp.neq.unc p10, p0 = DMAX3, DMAX7 } { .mmf nop __LINE__ nop __LINE__ (p7) FMAX DMAX3 = f38, DMAX7 } ;; { .mmf (p14) LDFD f45 = [X1], INCX nop __LINE__ (p7) fcmp.neq.unc p11, p0 = DMAX4, DMAX8 } { .mmf nop __LINE__ nop __LINE__ (p7) FMAX DMAX4 = f39, DMAX8 } ;; { .mmf (p15) LDFD f46 = [X1], INCX (p8) adds IMAX1 = 1, CURRENT (p7) fcmp.neq.unc p8, p0 = DMAX1, DMAX5 } { .mmf nop __LINE__ nop __LINE__ (p13) FMAX DMAX5 = f40, DMAX1 } { .mmf (p9) adds IMAX2 = 2, CURRENT nop __LINE__ (p7) fcmp.neq.unc p9, p0 = DMAX2, DMAX6 } { .mmf nop __LINE__ nop __LINE__ (p13) FMAX DMAX6 = f41, DMAX2 } { .mmf (p10) adds IMAX3 = 3, CURRENT nop __LINE__ (p7) fcmp.neq.unc p10, p0 = DMAX3, DMAX7 } { .mmf nop __LINE__ nop __LINE__ (p13) FMAX DMAX7 = f42, DMAX3 } { .mmf (p11) adds IMAX4 = 4, CURRENT nop __LINE__ (p7) fcmp.neq.unc p11, p0 = DMAX4, DMAX8 } { .mmf nop __LINE__ nop __LINE__ (p13) FMAX DMAX8 = f43, DMAX4 } ;; { .mmf (p8) adds IMAX1 = 5, CURRENT nop __LINE__ (p13) fcmp.neq.unc p8, p0 = DMAX1, DMAX5 } { .mmf nop __LINE__ nop __LINE__ (p13) mov DMAX1 = DMAX5 } { .mmf (p9) adds IMAX2 = 6, CURRENT nop __LINE__ 
(p13) fcmp.neq.unc p9, p0 = DMAX2, DMAX6 } { .mmf nop __LINE__ nop __LINE__ (p13) mov DMAX2 = DMAX6 } { .mmf (p10) adds IMAX3 = 7, CURRENT nop __LINE__ (p13) fcmp.neq.unc p10, p0 = DMAX3, DMAX7 } { .mmf nop __LINE__ nop __LINE__ (p13) mov DMAX3 = DMAX7 } { .mmf (p11) adds IMAX4 = 8, CURRENT nop __LINE__ (p13) fcmp.neq.unc p11, p0 = DMAX4, DMAX8 } { .mmf (p7) adds CURRENT = 8, CURRENT nop __LINE__ (p13) mov DMAX4 = DMAX8 } ;; { .mmf (p8) adds IMAX1 = 1, CURRENT nop __LINE__ (p14) FMAX DMAX5 = f44, DMAX1 } { .mmf (p9) adds IMAX2 = 2, CURRENT (p10) adds IMAX3 = 3, CURRENT (p14) FMAX DMAX6 = f45, DMAX2 } { .mmf (p11) adds IMAX4 = 4, CURRENT (p13) adds CURRENT = 4, CURRENT (p15) FMAX DMAX7 = f46, DMAX3 } ;; { .mmf nop __LINE__ nop __LINE__ (p14) fcmp.neq.unc p8, p0 = DMAX5, DMAX1 } { .mmf nop __LINE__ nop __LINE__ (p14) mov DMAX1 = DMAX5 } { .mmf nop __LINE__ nop __LINE__ (p14) fcmp.neq.unc p9, p0 = DMAX6, DMAX2 } { .mmf nop __LINE__ nop __LINE__ (p14) mov DMAX2 = DMAX6 } { .mmf nop __LINE__ nop __LINE__ (p15) fcmp.neq.unc p10, p0 = DMAX7, DMAX3 } { .mmf nop __LINE__ nop __LINE__ (p15) mov DMAX3 = DMAX7 } ;; .L999: { .mmf (p8) adds IMAX1 = 1, CURRENT nop __LINE__ FMAX DMAX5 = DMAX2, DMAX1 } { .mmf (p9) adds IMAX2 = 2, CURRENT (p14) adds CURRENT = 2, CURRENT FMAX DMAX6 = DMAX4, DMAX3 } ;; { .mmf nop __LINE__ nop __LINE__ fcmp.neq p12, p0 = DMAX5, DMAX1 } { .mmf (p10) adds IMAX3 = 1, CURRENT nop __LINE__ fcmp.neq p13, p0 = DMAX6, DMAX3 } ;; { .mmf (p12) mov IMAX1 = IMAX2 (p13) mov IMAX3 = IMAX4 FMAX DMAX1 = DMAX6, DMAX5 } ;; { .mfi nop __LINE__ fcmp.neq p12, p0 = DMAX1, DMAX5 mov ar.lc = ARLC } ;; { .mib (p12) mov IMAX1 = IMAX3 mov pr = PR, -65474 br.ret.sptk.many b0 } ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/izamax.S000066400000000000000000000234601313527062700166110ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef XDOUBLE #define PREFETCH_SIZE ( 8 * 16 + 4) #elif defined(DOUBLE) #define PREFETCH_SIZE (16 * 16 + 8) #else #define PREFETCH_SIZE (32 * 16 + 16) #endif #ifdef USE_MIN #define CMPUNC cmp.lt.unc #define CMP cmp.lt #else #define CMPUNC cmp.gt.unc #define CMP cmp.gt #endif #define RET r8 #define N r32 #define DX r33 #define INCX r34 #define PRE1 r2 #define I r14 #define J r15 #define K r16 #define TMP r17 #define INCXM1 r18 #define INCX8 r19 #define MAX1 r20 #define DMAX1 r21 #define DATA1 r22 #define DATA2 r23 #define DATA3 r24 #define DATA4 r25 #define DATA5 r26 #define DATA6 r27 #define DATA7 r28 #define DATA8 r29 #define PR r30 #define ARLC r31 PROLOGUE .prologue PROFCODE { .mmi mov MAX1 = -1 mov DMAX1 = 0 .save ar.lc, ARLC mov ARLC = ar.lc } .body #ifdef F_INTERFACE { .mmi LDINT N = [N] LDINT INCX = [INCX] nop.i 0 } ;; #ifndef USE64BITINT { .mii nop.m 0 sxt4 N = N sxt4 INCX = INCX } ;; #endif #endif { .mii adds K = -1, N shl INCX = INCX, ZBASE_SHIFT mov PR = pr } { .mmb cmp.ge p8, p0 = 0, N (p8) br.cond.dptk .L999 } ;; { .mib cmp.ge p6, p0 = 0, INCX mov pr.rot= 0 (p6) br.cond.dptk .L999 } ;; { .mmi LDFD f6 = [DX], SIZE adds INCXM1 = - SIZE, INCX mov ar.ec= 5 } ;; { .mmi LDFD f7 = [DX], INCXM1 mov MAX1 = 0 mov I = 1 } ;; { .mfi cmp.eq p16, p0 = r0, r0 fabs f6 = f6 shr J = K, 3 } { .mmf nop.m 0 nop.m 0 fabs f7 = f7 } ;; { .mmi cmp.ne p8, p0 = r0, r0 adds J = -1, J shladd INCX8 = INCX, 3, r0 } { .mmf nop.m 0 nop.m 0 FADD f6 = f6, f7 } ;; { .mmi getf.d DMAX1 = f6 adds PRE1 = PREFETCH_SIZE * SIZE, DX mov ar.lc = J } { .mib cmp.eq p7 ,p0 = -1, J tbit.z p0, p13 = K, 2 (p7) br.cond.dpnt .L15 } .align 32 ;; .L10: { .mmf (p16) lfetch.nt1 [PRE1], INCX8 (p16) LDFD f32 = [DX], SIZE (p19) fabs f35 = f35 } { .mmf (p8 ) mov DMAX1 = DATA1 nop.m 0 (p19) fabs f40 = f40 } ;; { .mmf (p20) getf.d DATA5 = f12 (p16) LDFD f37 = [DX], INCXM1 (p20) FADD f14 = f96, f101 } { .mmi (p8 ) adds MAX1 = 0, I (p20) CMPUNC p8, p0 = DATA2, DMAX1 nop.i 0 } ;; { .mmf (p16) LDFD f42 = [DX], SIZE (p8 ) mov DMAX1 = DATA2 (p19) fabs f45 = f45 } { .mmf nop.m 0 nop.m 0 (p19) fabs f50 = f50 } ;; { .mmf (p20) getf.d DATA6 = f13 (p16) LDFD f47 = [DX], INCXM1 (p20) FADD f15 = f106, f111 } { .mmi (p8 ) adds MAX1 = 1, I (p20) CMPUNC p8, p0 = DATA3, DMAX1 nop.i 0 } ;; { .mmf (p16) LDFD f52 = [DX], SIZE (p8 ) mov DMAX1 = DATA3 (p19) fabs f55 = f55 } { .mmf nop.m 0 nop.m 0 (p19) fabs f60 = f60 } ;; { .mmf (p20) getf.d DATA7 = f14 (p16) LDFD f57 = [DX], INCXM1 (p19) FADD f8 = f35, f40 } { .mmi (p8 ) adds MAX1 = 2, I (p20) CMPUNC p8, p0 = DATA4, DMAX1 nop.i 0 } ;; { .mmf (p16) LDFD f62 = [DX], SIZE (p8 ) mov DMAX1 = DATA4 (p19) fabs f65 = f65 } { .mmf nop.m 0 nop.m 0 (p19) fabs f70 = f70 } ;; { .mmf (p20) getf.d DATA8 = f15 (p16) LDFD f67 = [DX], INCXM1 (p19) FADD f9 = f45, f50 } { .mmi (p8 ) adds MAX1 = 3, I (p20) CMPUNC p8, p0 = DATA5, DMAX1 nop.i 0 } ;; { .mmf (p16) LDFD f72 = [DX], SIZE (p8 ) mov DMAX1 = DATA5 (p19) fabs f75 = f75 } { .mmf nop.m 0 nop.m 0 (p19) fabs f80 = f80 } ;; { .mmf (p19) getf.d DATA1 = f8 (p16) LDFD f77 = [DX], INCXM1 (p19) FADD f10 = f55, f60 } { .mmi (p8 ) adds MAX1 = 4, I (p20) CMPUNC p8, p0 = DATA6, DMAX1 nop.i 0 } ;; { .mmf (p16) LDFD f82 = 
[DX], SIZE (p8 ) mov DMAX1 = DATA6 (p19) fabs f85 = f85 } { .mmf nop.m 0 nop.m 0 (p19) fabs f90 = f90 } ;; { .mmf (p19) getf.d DATA2 = f9 (p16) LDFD f87 = [DX], INCXM1 (p19) FADD f11 = f65, f70 } { .mmi (p8 ) adds MAX1 = 5, I (p20) CMPUNC p8, p0 = DATA7, DMAX1 nop.i 0 } ;; { .mmf (p16) LDFD f92 = [DX], SIZE (p8 ) mov DMAX1 = DATA7 (p19) fabs f95 = f95 } { .mmf mov TMP = I nop.m 0 (p19) fabs f100 = f100 } ;; { .mmf (p19) getf.d DATA3 = f10 (p16) LDFD f97 = [DX], INCXM1 (p19) FADD f12 = f75, f80 } { .mmi (p8 ) adds MAX1 = 6, I (p20) CMPUNC p8, p0 = DATA8, DMAX1 nop.i 0 } ;; { .mmf (p16) LDFD f102 = [DX], SIZE (p8 ) mov DMAX1 = DATA8 (p19) fabs f105 = f105 } { .mmf (p20) adds I = 8, I nop.m 0 (p19) fabs f110 = f110 } ;; { .mmi (p19) getf.d DATA4 = f11 (p16) LDFD f107 = [DX], INCXM1 (p8 ) adds MAX1 = 7, TMP } { .mfb (p19) CMPUNC p8, p0 = DATA1, DMAX1 (p19) FADD f13 = f85, f90 br.ctop.sptk.few .L10 } ;; .align 32 .L15: { .mmi (p13) LDFD f32 = [DX], SIZE and J = 7, K mov pr = PR, -65474 } ;; { .mmb (p13) LDFD f33 = [DX], INCXM1 cmp.eq p8 ,p0 = r0, J (p8) br.cond.dpnt .L999 } ;; { .mmi (p13) LDFD f34 = [DX], SIZE ;; (p13) LDFD f35 = [DX], INCXM1 nop.i 0 } ;; { .mmi (p13) LDFD f36 = [DX], SIZE ;; (p13) LDFD f37 = [DX], INCXM1 nop.i 0 } ;; { .mfi (p13) LDFD f38 = [DX], SIZE (p13) fabs f32 = f32 tbit.z p0, p14 = K, 1 } ;; { .mmf (p13) LDFD f39 = [DX], INCXM1 nop.m 0 (p13) fabs f33 = f33 } ;; { .mmf (p14) LDFD f40 = [DX], SIZE nop.m 0 (p13) fabs f34 = f34 } ;; { .mfi (p14) LDFD f41 = [DX], INCXM1 (p13) fabs f35 = f35 tbit.z p0, p15 = K, 0 } ;; { .mmf (p14) LDFD f42 = [DX], SIZE nop.m 0 (p13) fabs f36 = f36 } ;; { .mmf (p14) LDFD f43 = [DX], INCXM1 nop.m 0 (p13) fabs f37 = f37 } { .mmf nop.m 0 nop.m 0 (p13) FADD f32 = f32, f33 } ;; { .mmf (p15) LDFD f44 = [DX], SIZE nop.m 0 (p13) fabs f38 = f38 } ;; { .mmf (p15) LDFD f45 = [DX], INCXM1 nop.m 0 (p13) fabs f39 = f39 } { .mmf nop.m 0 nop.m 0 (p13) FADD f34 = f34, f35 } ;; { .mmf nop.m 0 nop.m 0 (p14) fabs f40 = f40 } ;; { .mmf (p13) getf.d DATA1 = f32 nop.m 0 (p14) fabs f41 = f41 } { .mmf nop.m 0 nop.m 0 (p13) FADD f36 = f36, f37 } ;; { .mmf nop.m 0 nop.m 0 (p14) fabs f42 = f42 } ;; { .mmf (p13) getf.d DATA2 = f34 nop.m 0 (p14) fabs f43 = f43 } { .mmf nop.m 0 nop.m 0 (p13) FADD f38 = f38, f39 } ;; { .mmf nop.m 0 nop.m 0 (p15) fabs f44 = f44 } ;; { .mmf (p13) getf.d DATA3 = f36 nop.m 0 (p15) fabs f45 = f45 } { .mmf nop.m 0 nop.m 0 (p14) FADD f40 = f40, f41 } ;; { .mmf (p13) getf.d DATA4 = f38 nop.m 0 (p14) FADD f42 = f42, f43 } ;; { .mmf (p14) getf.d DATA5 = f40 nop.m 0 (p15) FADD f44 = f44, f45 } ;; { .mmi (p14) getf.d DATA6 = f42 nop.m 0 (p13) CMPUNC p8, p0 = DATA1, DMAX1 } ;; { .mmi (p15) getf.d DATA7 = f44 (p8 ) adds MAX1 = 0, I (p8 ) mov DMAX1 = DATA1 } ;; { .mmi (p13) CMPUNC p8, p0 = DATA2, DMAX1 ;; (p8 ) adds MAX1 = 1, I (p8 ) mov DMAX1 = DATA2 } ;; { .mmi (p13) CMPUNC p8, p0 = DATA3, DMAX1 ;; (p8 ) adds MAX1 = 2, I (p8 ) mov DMAX1 = DATA3 } ;; { .mmi (p13) CMPUNC p8, p0 = DATA4, DMAX1 ;; (p8 ) adds MAX1 = 3, I (p8 ) mov DMAX1 = DATA4 }{ .mmi (p13) adds I = 4, I nop.m 0 nop.i 0 } ;; { .mmi (p14) CMPUNC p8, p0 = DATA5, DMAX1 ;; (p8 ) adds MAX1 = 0, I (p8 ) mov DMAX1 = DATA5 } ;; { .mmi (p14) CMPUNC p8, p0 = DATA6, DMAX1 ;; (p8 ) adds MAX1 = 1, I (p8 ) mov DMAX1 = DATA6 }{ .mmi (p14) adds I = 2, I nop.m 0 nop.i 0 } ;; { .mmi (p15) CMPUNC p8, p0 = DATA7, DMAX1 ;; (p8) adds MAX1 = 0, I (p8) mov DMAX1 = DATA7 } ;; .align 32 .L999: { .mmi setf.d f8 = DMAX1 adds RET = 1, MAX1 mov ar.lc = ARLC } { .mmb nop.m 0 nop.m 0 br.ret.sptk.many b0 } EPILOGUE 
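
The izamax.S kernel above walks a complex vector in software-pipelined groups of eight elements, forming |Re| + |Im| for each entry (the paired fabs followed by FADD), keeping the running extreme in DMAX1 and its position in MAX1, and returning MAX1 + 1, i.e. a 1-based index, with 0 for a non-positive length or stride; building the same file with USE_MIN defined flips CMPUNC to cmp.lt.unc and yields the izamin variant. A minimal scalar C sketch of that reduction, given only as an illustration of the convention visible in the assembly (it is not part of the OpenBLAS sources), could look like:

#include <math.h>

/* Scalar reference for the izamax reduction: return the 1-based index of
 * the element of the complex vector x (stored as re,im pairs, stride incx
 * measured in complex elements) whose |Re| + |Im| is largest; return 0 when
 * n <= 0 or incx <= 0, matching the early exits in the assembly above.
 * Illustrative sketch only. */
long izamax_ref(long n, const double *x, long incx)
{
    if (n <= 0 || incx <= 0) return 0;

    long best = 1;
    double dmax = fabs(x[0]) + fabs(x[1]);   /* |re| + |im| of element 1 */

    const double *p = x;
    for (long i = 2; i <= n; i++) {
        p += 2 * incx;                       /* advance one complex element */
        double v = fabs(p[0]) + fabs(p[1]);
        if (v > dmax) {                      /* strict '>' keeps the first maximum */
            dmax = v;
            best = i;
        }
    }
    return best;
}

The strict comparison mirrors the unconditional compare-then-update pattern in the assembly, so the first occurrence of the extreme value wins, and replacing '>' with '<' gives the USE_MIN (izamin) behaviour.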
OpenBLAS-0.2.20/kernel/ia64/lsame.S000066400000000000000000000060121313527062700164130ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" PROLOGUE PROFCODE .prologue .body ld1 r14 = [r32] ld1 r15 = [r33] ;; adds r16 = -32, r14 // a1 = a - 32 adds r17 = -32, r15 // b1 = b - 32 ;; cmp4.ge p6, p7 = 96, r14 // if (a > 96) cmp4.ge p8, p9 = 96, r15 // if (b > 96) ;; (p7) mov r14 = r16 (p9) mov r15 = r17 ;; cmp4.eq p6, p7 = r15, r14 mov r8 = 1 ;; (p7) mov r8 = 0 br.ret.sptk.many b0 EPILOGUE OpenBLAS-0.2.20/kernel/ia64/nrm2.S000066400000000000000000000161671313527062700162040ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef XDOUBLE #define PREFETCH_SIZE ( 8 * 16) #elif defined(DOUBLE) #define PREFETCH_SIZE (16 * 16) #else #define PREFETCH_SIZE (32 * 16) #endif #ifndef COMPLEX #define COMPADD 0 #define STRIDE INCX #else #define COMPADD 1 #define STRIDE SIZE #endif #define PRE1 r2 #define I r17 #define J r18 #define X2 r19 #define INCX5 r20 #define INCX16 r21 #define N r32 #define X r33 #define INCX r34 #define PR r30 #define ARLC r31 PROLOGUE .prologue PROFCODE { .mfi adds PRE1 = PREFETCH_SIZE * SIZE, X mov f8 = f0 .save ar.lc, ARLC mov ARLC = ar.lc } ;; .body #ifdef F_INTERFACE LDINT N = [N] LDINT INCX = [INCX] ;; #ifndef USE64BITINT sxt4 N = N sxt4 INCX = INCX ;; #endif #endif { .mmi cmp.ge p6, p0 = r0, N cmp.ge p7, p0 = r0, INCX shr I = N, (4 - COMPADD) } { .mbb and J = ((1 << (4 - COMPADD)) - 1), N (p6) br.ret.sptk.many b0 (p7) br.ret.sptk.many b0 } ;; { .mfi mov f9 = f0 mov PR = pr } { .mfi adds I = -1, I mov f10 = f0 shl INCX = INCX, (BASE_SHIFT + COMPADD) } ;; { .mfi shladd X2 = INCX, (2 - COMPADD), X mov f11 = f0 mov pr.rot = 0 } { .mfi shladd INCX5 = INCX, (2 - COMPADD), INCX mov f12 = f0 tbit.z p0, p12 = N, (3 - COMPADD) } ;; { .mfi shladd INCX16 = INCX, (4 - COMPADD), r0 mov f13 = f0 mov ar.ec= 3 } { .mmf cmp.gt p8 ,p0 = r0, I cmp.eq p16, p0 = r0, r0 mov f14 = f0 } ;; { .mmf #ifdef COMPLEX adds INCX = - SIZE, INCX adds INCX5 = - SIZE, INCX5 #else nop.m 0 nop.m 0 #endif mov f15 = f0 } { .mib cmp.eq p9, p0 = r0, J mov ar.lc = I (p8) br.cond.dpnt .L52 } ;; .align 32 .L51: (p16) LDFD f32 = [X], STRIDE (p16) lfetch.nt1 [PRE1], INCX16 (p18) fma.d.s1 f8 = f34, f34, f8 (p16) LDFD f35 = [X2], STRIDE (p18) fma.d.s1 f9 = f37, f37, f9 nop.b 0 ;; (p16) LDFD f38 = [X], INCX (p18) fma.d.s1 f10 = f40, f40, f10 nop.b 0 (p16) LDFD f41 = [X2], INCX (p18) fma.d.s1 f11 = f43, f43, f11 nop.b 0 ;; (p16) LDFD f44 = [X], STRIDE (p18) fma.d.s1 f12 = f46, f46, f12 nop.b 0 (p16) LDFD f47 = [X2], STRIDE (p18) fma.d.s1 f13 = f49, f49, f13 nop.b 0 ;; (p16) LDFD f50 = [X], INCX5 (p18) fma.d.s1 f14 = f52, f52, f14 nop.b 0 (p16) LDFD f53 = [X2], INCX5 (p18) fma.d.s1 f15 = f55, f55, f15 nop.b 0 ;; (p16) LDFD f56 = [X], STRIDE (p18) fma.d.s1 f8 = f58, f58, f8 nop.b 0 (p16) LDFD f59 = [X2], STRIDE (p18) fma.d.s1 f9 = f61, f61, f9 nop.b 0 ;; (p16) LDFD f62 = [X], INCX (p18) fma.d.s1 f10 = f64, f64, f10 nop.b 0 (p16) LDFD f65 = [X2], INCX (p18) fma.d.s1 f11 = f67, f67, f11 nop.b 0 ;; (p16) LDFD f68 = [X], STRIDE (p18) fma.d.s1 f12 = f70, f70, f12 nop.b 0 (p16) LDFD f71 = [X2], STRIDE (p18) fma.d.s1 f13 = f73, f73, f13 nop.b 0 ;; (p16) LDFD f74 = [X], INCX5 (p18) fma.d.s1 f14 = f76, f76, f14 nop.b 
0 (p16) LDFD f77 = [X2], INCX5 (p18) fma.d.s1 f15 = f79, f79, f15 br.ctop.sptk.few .L51 ;; .align 32 .L52: { .mmb (p12) LDFD f32 = [X], STRIDE (p12) LDFD f33 = [X2], STRIDE (p9) br.cond.dptk .L998 } ;; { .mmi (p12) LDFD f34 = [X], INCX (p12) LDFD f35 = [X2], INCX tbit.z p0, p13 = N, (2 - COMPADD) } ;; { .mmi (p12) LDFD f36 = [X], STRIDE (p12) LDFD f37 = [X2], STRIDE tbit.z p0, p14 = N, (1 - COMPADD) } ;; { .mmi (p12) LDFD f38 = [X], INCX5 (p12) LDFD f39 = [X2], INCX5 #ifndef COMPLEX tbit.z p0, p15 = N, 0 #endif } ;; (p13) LDFD f40 = [X], STRIDE (p12) fma.d.s1 f8 = f32, f32, f8 (p12) fma.d.s1 f9 = f33, f33, f9 ;; (p13) LDFD f41 = [X], INCX (p12) fma.d.s1 f10 = f34, f34, f10 (p12) fma.d.s1 f11 = f35, f35, f11 ;; (p13) LDFD f42 = [X], STRIDE (p12) fma.d.s1 f12 = f36, f36, f12 (p12) fma.d.s1 f13 = f37, f37, f13 ;; (p13) LDFD f43 = [X], INCX (p12) fma.d.s1 f14 = f38, f38, f14 (p12) fma.d.s1 f15 = f39, f39, f15 ;; (p14) LDFD f44 = [X], STRIDE (p13) fma.d.s1 f8 = f40, f40, f8 (p13) fma.d.s1 f9 = f41, f41, f9 ;; (p14) LDFD f45 = [X], INCX (p13) fma.d.s1 f10 = f42, f42, f10 (p13) fma.d.s1 f11 = f43, f43, f11 ;; #ifndef COMPLEX (p15) LDFD f46 = [X] #endif (p14) fma.d.s1 f12 = f44, f44, f12 (p14) fma.d.s1 f13 = f45, f45, f13 ;; #ifndef COMPLEX (p15) fma.d.s1 f14 = f46, f46, f14 ;; #endif .align 32 .L998: { .mmf fadd.d.s1 f8 = f8, f9 } { .mmf fadd.d.s1 f10 = f10, f11 } { .mmf fadd.d.s1 f12 = f12, f13 } { .mfi fadd.d.s1 f14 = f14, f15 mov ar.lc = ARLC } ;; { .mmf fadd.d.s1 f8 = f8, f10 } { .mfi fadd.d.s1 f12 = f12, f14 mov pr = PR, -65474 } ;; { .mfb fadd.d.s1 f8 = f8, f12 br sqrt } ;; EPILOGUE .section .data .type sqrt, @function .global sqrt OpenBLAS-0.2.20/kernel/ia64/qaxpy.S000066400000000000000000000241541313527062700164630ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCHSIZE (8 * 16) #define N r32 #define X1 r38 #define INCX r39 #define Y1 r33 #define INCY r34 #define PRE1 r2 #define PRE2 r3 #define I r14 #define J r15 #define X2 r16 #define Y2 r17 #define X3 r18 #define Y3 r19 #define X4 r20 #define Y4 r21 #define YY1 r22 #define YY2 r23 #define YY3 r24 #define YY4 r25 #define INCX4 r8 #define INCY4 r9 #define INCX2 r10 #define INCY2 r11 #define INCX8 r26 #define INCY8 r27 #define PR r30 #define ARLC r31 #define ALPHA f8 #define SP r12 PROLOGUE .prologue PROFCODE { .mmi adds r8 = 16, SP adds r9 = 24, SP .save ar.lc, ARLC mov ARLC = ar.lc } { .mmb adds PRE1 = (PREFETCHSIZE + 2) * SIZE, X1 cmp.lt p0, p6 = r0, N (p6) br.ret.sptk.many b0 } ;; { .mmi ld8 Y1 = [r8] ld8 INCY = [r9] mov PR = pr } ;; .body { .mmi shladd INCX = INCX, BASE_SHIFT, r0 shladd INCY = INCY, BASE_SHIFT, r0 mov pr.rot = 0 } ;; { .mmi shladd INCX4 = INCX, 2, r0 shladd INCY4 = INCY, 2, r0 mov ar.ec = 3 } { .mmi shladd INCX8 = INCX, 3, r0 shladd INCY8 = INCY, 3, r0 shr I = N, 4 } ;; { .mmi add X2 = INCX, X1 add Y2 = INCY, Y1 add YY2 = INCY, Y1 } ;; { .mmi shladd X3 = INCX, 1, X1 shladd Y3 = INCY, 1, Y1 shladd YY3 = INCY, 1, Y1 } { .mmi shladd X4 = INCX, 1, X2 shladd Y4 = INCY, 1, Y2 shladd YY4 = INCY, 1, Y2 } ;; { .mmi cmp.eq p7 ,p0 = 0, I adds I = -1, I mov YY1 = Y1 } { .mmi and r28 = 127, Y1 and PRE1 = -128, PRE1 cmp.eq p16, p0 = r0, r0 } ;; { .mmi adds PRE2 = (PREFETCHSIZE + 2) * SIZE, Y1 or PRE1 = PRE1, r28 mov ar.lc = I } { .mib and J = 15, N tbit.z p0, p12 = N, 3 (p7) br.cond.dpnt .L115 } ;; .align 32 .L112: { .mmf (p18) STFD [YY1] = f6 (p18) STFD [YY2] = f7 (p18) FMA f6 = ALPHA, f58, f106 } { .mmf (p16) lfetch.excl.nt1 [PRE2], INCY8 nop __LINE__ (p18) FMA f7 = ALPHA, f61, f109 } ;; { .mmf (p18) STFD [YY3] = f10 (p18) STFD [YY4] = f11 (p18) FMA f10 = ALPHA, f64, f112 } { .mmf (p16) lfetch.nt1 [PRE1], INCX8 nop __LINE__ (p18) FMA f11 = ALPHA, f67, f115 } ;; { .mmi (p16) LDFD f32 = [X1], INCX4 (p16) LDFD f35 = [X2], INCX4 (p18) add YY1 = INCY4, YY1 } { .mmi (p16) LDFD f38 = [X3], INCX4 (p16) LDFD f41 = [X4], INCX4 (p18) add YY2 = INCY4, YY2 } ;; { .mmi (p17) LDFD f117 = [Y1], INCY4 (p17) LDFD f120 = [Y2], INCY4 (p18) add YY3 = INCY4, YY3 } { .mmi (p17) LDFD f123 = [Y3], INCY4 (p17) LDFD f126 = [Y4], INCY4 (p18) add YY4 = INCY4, YY4 } ;; { .mmf (p18) STFD [YY1] = f12 (p18) STFD [YY2] = f13 (p18) FMA f12 = ALPHA, f70, f118 } { .mmf (p18) add YY1 = INCY4, YY1 (p18) add YY2 = INCY4, YY2 (p18) FMA f13 = ALPHA, f73, f121 } ;; { .mmf (p18) STFD [YY3] = f14 (p18) STFD [YY4] = f15 (p18) FMA f14 = ALPHA, f76, f124 } { .mmf (p18) add YY3 = INCY4, YY3 (p18) add YY4 = INCY4, YY4 (p18) FMA f15 = ALPHA, f79, f127 } ;; { .mmi (p16) LDFD f44 = [X1], INCX4 (p16) LDFD f47 = [X2], INCX4 nop __LINE__ } { .mmi (p16) LDFD f50 = [X3], INCX4 (p16) LDFD f53 = [X4], INCX4 nop __LINE__ } ;; { .mmi (p16) LDFD f80 = [Y1], INCY4 (p16) LDFD f83 = [Y2], INCY4 nop __LINE__ } { .mmi (p16) LDFD f86 = [Y3], INCY4 (p16) LDFD f89 = [Y4], INCY4 nop __LINE__ } ;; { .mmf (p18) STFD [YY1] = f6 (p18) STFD [YY2] = f7 (p17) FMA f6 = ALPHA, f33, f81 } { .mmf (p16) lfetch.excl.nt1 [PRE2], INCY8 nop __LINE__ (p17) FMA f7 = ALPHA, f36, f84 } ;; { .mmf (p18) 
STFD [YY3] = f10 (p18) STFD [YY4] = f11 (p17) FMA f10 = ALPHA, f39, f87 } { .mmf (p16) lfetch.nt1 [PRE1], INCX8 nop __LINE__ (p17) FMA f11 = ALPHA, f42, f90 } ;; { .mmi (p16) LDFD f56 = [X1], INCX4 (p16) LDFD f59 = [X2], INCX4 (p18) add YY1 = INCY4, YY1 } { .mmi (p16) LDFD f62 = [X3], INCX4 (p16) LDFD f65 = [X4], INCX4 (p18) add YY2 = INCY4, YY2 } ;; { .mmi (p16) LDFD f92 = [Y1], INCY4 (p16) LDFD f95 = [Y2], INCY4 (p18) add YY3 = INCY4, YY3 } { .mmi (p16) LDFD f98 = [Y3], INCY4 (p16) LDFD f101 = [Y4], INCY4 (p18) add YY4 = INCY4, YY4 } ;; { .mmf (p18) STFD [YY1] = f12 (p18) STFD [YY2] = f13 (p17) FMA f12 = ALPHA, f45, f93 } { .mmf (p18) add YY1 = INCY4, YY1 (p18) add YY2 = INCY4, YY2 (p17) FMA f13 = ALPHA, f48, f96 } ;; { .mmf (p18) STFD [YY3] = f14 (p18) STFD [YY4] = f15 (p17) FMA f14 = ALPHA, f51, f99 } { .mmf (p18) add YY3 = INCY4, YY3 (p18) add YY4 = INCY4, YY4 (p17) FMA f15 = ALPHA, f54, f102 } ;; { .mmi (p16) LDFD f68 = [X1], INCX4 (p16) LDFD f71 = [X2], INCX4 nop __LINE__ } { .mmi (p16) LDFD f74 = [X3], INCX4 (p16) LDFD f77 = [X4], INCX4 nop __LINE__ } ;; { .mmi (p16) LDFD f104 = [Y1], INCY4 (p16) LDFD f107 = [Y2], INCY4 nop __LINE__ } { .mmb (p16) LDFD f110 = [Y3], INCY4 (p16) LDFD f113 = [Y4], INCY4 br.ctop.sptk.few .L112 } ;; .align 32 .L115: { .mmi (p12) LDFD f32 = [X1], INCX4 (p12) LDFD f33 = [X2], INCX4 mov pr = PR, -65474 } { .mmi (p12) LDFD f34 = [X3], INCX4 (p12) LDFD f35 = [X4], INCX4 cmp.eq p9, p0 = r0, J } ;; { .mmi (p12) LDFD f64 = [Y1], INCY4 (p12) LDFD f65 = [Y2], INCY4 mov ar.lc = ARLC } { .mmb (p12) LDFD f66 = [Y3], INCY4 (p12) LDFD f67 = [Y4], INCY4 (p9) br.ret.sptk.many b0 } ;; { .mmi (p12) LDFD f36 = [X1], INCX4 (p12) LDFD f37 = [X2], INCX4 tbit.z p0, p13 = N, 2 } { .mmi (p12) LDFD f38 = [X3], INCX4 (p12) LDFD f39 = [X4], INCX4 tbit.z p0, p14 = N, 1 } ;; { .mmi (p12) LDFD f68 = [Y1], INCY4 (p12) LDFD f69 = [Y2], INCY4 tbit.z p0, p15 = N, 0 } { .mmi (p12) LDFD f70 = [Y3], INCY4 (p12) LDFD f71 = [Y4], INCY4 nop __LINE__ } ;; { .mmi (p13) LDFD f40 = [X1], INCX4 (p13) LDFD f41 = [X2], INCX4 shladd INCX2 = INCX, 1, r0 } { .mmi (p13) LDFD f42 = [X3], INCX4 (p13) LDFD f43 = [X4], INCX4 shladd INCY2 = INCY, 1, r0 } ;; { .mmi (p13) LDFD f72 = [Y1], INCY4 (p13) LDFD f73 = [Y2], INCY4 nop __LINE__ } { .mmi (p13) LDFD f74 = [Y3], INCY4 (p13) LDFD f75 = [Y4], INCY4 nop __LINE__ } ;; { .mmi (p14) LDFD f44 = [X1], INCX2 (p14) LDFD f45 = [X2], INCX2 nop __LINE__ } ;; { .mmi (p14) LDFD f76 = [Y1], INCY2 (p14) LDFD f77 = [Y2], INCY2 nop __LINE__ } ;; { .mmi (p15) LDFD f46 = [X1] (p15) LDFD f78 = [Y1] nop __LINE__ } ;; (p12) FMA f32 = ALPHA, f32, f64 (p12) FMA f33 = ALPHA, f33, f65 (p12) FMA f34 = ALPHA, f34, f66 (p12) FMA f35 = ALPHA, f35, f67 (p12) FMA f36 = ALPHA, f36, f68 (p12) FMA f37 = ALPHA, f37, f69 (p12) FMA f38 = ALPHA, f38, f70 (p12) FMA f39 = ALPHA, f39, f71 ;; { .mmf (p12) STFD [YY1] = f32 (p12) STFD [YY2] = f33 (p13) FMA f40 = ALPHA, f40, f72 } { .mmf (p12) add YY1 = INCY4, YY1 (p12) add YY2 = INCY4, YY2 (p13) FMA f41 = ALPHA, f41, f73 } ;; { .mmf (p12) STFD [YY3] = f34 (p12) STFD [YY4] = f35 (p13) FMA f42 = ALPHA, f42, f74 } { .mmf (p12) add YY3 = INCY4, YY3 (p12) add YY4 = INCY4, YY4 (p13) FMA f43 = ALPHA, f43, f75 } ;; { .mmf (p12) STFD [YY1] = f36 (p12) STFD [YY2] = f37 (p14) FMA f44 = ALPHA, f44, f76 } { .mmf (p12) add YY1 = INCY4, YY1 (p12) add YY2 = INCY4, YY2 (p14) FMA f45 = ALPHA, f45, f77 } ;; { .mmf (p12) STFD [YY3] = f38 (p12) STFD [YY4] = f39 (p15) FMA f46 = ALPHA, f46, f78 } { .mmi (p12) add YY3 = INCY4, YY3 (p12) add YY4 = INCY4, YY4 nop __LINE__ } ;; { 
.mmi (p13) STFD [YY1] = f40 (p13) STFD [YY2] = f41 nop __LINE__ } { .mmi (p13) add YY1 = INCY4, YY1 (p13) add YY2 = INCY4, YY2 nop __LINE__ } ;; { .mmi (p13) STFD [YY3] = f42 (p13) STFD [YY4] = f43 nop __LINE__ } ;; { .mmi (p14) STFD [YY1] = f44 (p14) STFD [YY2] = f45 (p14) add YY1 = INCY2, YY1 } ;; { .mmb (p15) STFD [YY1] = f46 nop __LINE__ br.ret.sptk.many b0 } ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/qcopy.S000066400000000000000000000254741313527062700164620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r32 #define X1 r33 #define INCX r34 #define Y1 r35 #define INCY r36 #define PREX r2 #define PREY r3 #define I r14 #define J r15 #define X2 r16 #define Y2 r17 #define INCX2 r18 #define INCY2 r19 #define INCX8 r20 #define INCY8 r21 #define PR r30 #define ARLC r31 #define PREFETCH_SIZE (8 * 16) PROLOGUE .prologue PROFCODE { .mmi shladd INCX = INCX, BASE_SHIFT, r0 shladd INCY = INCY, BASE_SHIFT, r0 .save ar.lc, ARLC mov ARLC = ar.lc } { .mib cmp.lt p0, p6 = r0, N shr I = N, 4 (p6) br.ret.sptk.many b0 } ;; .body { .mmi sub r8 = X1, Y1 mov r9 = 0xf0 mov PR = pr } { .mmi shladd INCX2 = INCX, 1, r0 shladd INCY2 = INCY, 1, r0 and J = 15, N } ;; { .mmi shladd INCX8 = INCX, 3, r0 shladd INCY8 = INCY, 3, r0 mov pr.rot = 0 } { .mmi and r8 = r9, r8 cmp.eq p9, p0 = r0, J adds I = -1, I } ;; { .mmi add X2 = X1, INCX add Y2 = Y1, INCY mov ar.ec = 4 } { .mmb cmp.gt p6, p0 = 127, r8 cmp.eq p16, p0 = r0, r0 (p6) br.cond.dpnt .L20 } ;; { .mmi adds PREX = (PREFETCH_SIZE + 0) * SIZE, X1 adds PREY = (PREFETCH_SIZE + 2) * SIZE, Y1 mov ar.lc = I } { .mib cmp.eq p8 ,p0 = -1, I tbit.z p0, p12 = N, 3 (p8) br.cond.dpnt .L15 } ;; .align 16 .L12: { .mmi (p19) STFD [Y1] = f35 (p19) STFD [Y2] = f39 (p19) add Y1 = INCY2, Y1 } { .mmi (p17) LDFD f81 = [X1], INCX2 (p17) LDFD f85 = [X2], INCX2 (p19) add Y2 = INCY2, Y2 } ;; { .mmi (p19) STFD [Y1] = f43 (p19) STFD [Y2] = f47 (p19) add Y1 = INCY2, Y1 } { .mmi (p17) LDFD f89 = [X1], INCX2 (p17) LDFD f93 = [X2], INCX2 (p19) add Y2 = INCY2, Y2 } ;; { .mmi (p19) STFD [Y1] = f51 (p19) STFD [Y2] = f55 (p19) add Y1 = INCY2, Y1 } { .mmi (p16) LDFD f32 = [X1], INCX2 (p16) LDFD f36 = [X2], INCX2 (p19) add Y2 = INCY2, Y2 } ;; { .mmi (p19) STFD [Y1] = f59 (p19) STFD [Y2] = f63 (p19) add Y1 = INCY2, Y1 } { .mmi lfetch.fault.nt1 [PREX], INCX8 lfetch.fault.excl.nt1 [PREY], INCY8 (p19) add Y2 = INCY2, Y2 } ;; { .mmi (p16) LDFD f40 = [X1], INCX2 (p16) LDFD f44 = [X2], INCX2 nop __LINE__ } ;; { .mmi (p19) STFD [Y1] = f67 (p19) STFD [Y2] = f71 (p19) add Y1 = INCY2, Y1 } { .mmi (p16) LDFD f48 = [X1], INCX2 (p16) LDFD f52 = [X2], INCX2 (p19) add Y2 = INCY2, Y2 } ;; { .mmi (p19) STFD [Y1] = f75 (p19) STFD [Y2] = f79 (p19) add Y1 = INCY2, Y1 } { .mmi (p16) LDFD f56 = [X1], INCX2 (p16) LDFD f60 = [X2], INCX2 (p19) add Y2 = INCY2, Y2 } ;; { .mmi (p19) STFD [Y1] = f83 (p19) STFD [Y2] = f87 (p19) add Y1 = INCY2, Y1 } { .mmi lfetch.fault.nt1 [PREX], INCX8 lfetch.fault.excl.nt1 [PREY], INCY8 (p19) add Y2 = INCY2, Y2 } ;; { .mmi (p19) STFD [Y1] = f91 (p19) STFD [Y2] = f95 (p19) add Y1 = INCY2, Y1 } { .mmi (p16) LDFD f64 = [X1], INCX2 (p16) LDFD f68 = [X2], INCX2 (p19) add Y2 = INCY2, Y2 } ;; { .mmb (p16) LDFD f72 = [X1], INCX2 (p16) LDFD f76 = [X2], INCX2 br.ctop.sptk.few .L12 } ;; .align 32 .L15: { .mmi (p12) LDFD f48 = [X1], INCX2 (p12) LDFD f49 = [X2], INCX2 mov ar.lc = ARLC } ;; { .mmi (p12) LDFD f50 = [X1], INCX2 (p12) LDFD f51 = [X2], INCX2 mov pr = PR, -65474 } ;; { .mmb (p12) LDFD f52 = [X1], INCX2 (p12) LDFD f53 = [X2], INCX2 (p9) br.ret.sptk.many b0 } ;; { .mmi (p12) LDFD f54 = [X1], INCX2 (p12) LDFD f55 = [X2], INCX2 tbit.z p0, p13 = N, 2 } ;; { .mmi (p13) LDFD f56 = [X1], INCX2 (p13) LDFD f57 = [X2], INCX2 tbit.z p0, p14 = N, 1 } ;; { .mmi (p13) LDFD f58 = [X1], INCX2 (p13) LDFD f59 = [X2], INCX2 tbit.z p0, p15 = N, 0 } ;; { .mmi (p12) STFD [Y1] = f48 (p12) STFD [Y2] = f49 (p12) add Y1 = INCY2, Y1 } { .mmi (p14) LDFD f60 = [X1], INCX2 (p14) LDFD f61 = [X2], INCX2 (p12) add Y2 = 
INCY2, Y2 } ;; { .mmi (p12) STFD [Y1] = f50 (p12) STFD [Y2] = f51 (p12) add Y1 = INCY2, Y1 } { .mmi (p15) LDFD f62 = [X1] nop __LINE__ (p12) add Y2 = INCY2, Y2 } ;; { .mmi (p12) STFD [Y1] = f52 (p12) STFD [Y2] = f53 (p12) add Y1 = INCY2, Y1 } { .mmi nop __LINE__ nop __LINE__ (p12) add Y2 = INCY2, Y2 } ;; { .mmi (p12) STFD [Y1] = f54 (p12) STFD [Y2] = f55 (p12) add Y1 = INCY2, Y1 } { .mmi nop __LINE__ nop __LINE__ (p12) add Y2 = INCY2, Y2 } ;; { .mmi (p13) STFD [Y1] = f56 (p13) STFD [Y2] = f57 (p13) add Y1 = INCY2, Y1 } { .mmi nop __LINE__ nop __LINE__ (p13) add Y2 = INCY2, Y2 } ;; { .mmi (p13) STFD [Y1] = f58 (p13) STFD [Y2] = f59 (p13) add Y1 = INCY2, Y1 } { .mmi nop __LINE__ nop __LINE__ (p13) add Y2 = INCY2, Y2 } ;; { .mmi (p14) STFD [Y1] = f60 (p14) STFD [Y2] = f61 (p14) add Y1 = INCY2, Y1 } ;; { .mmb (p15) STFD [Y1] = f62 nop __LINE__ br.ret.sptk.many b0 } ;; .align 16 .L20: { .mmi adds PREX = (PREFETCH_SIZE + 0) * SIZE, X1 adds PREY = (PREFETCH_SIZE + 10) * SIZE, Y1 mov ar.lc = I } { .mib cmp.eq p8 ,p0 = -1, I tbit.z p0, p12 = N, 3 (p8) br.cond.dpnt .L25 } ;; .align 16 .L22: { .mmi (p19) STFD [Y1] = f67 (p19) STFD [Y2] = f71 (p19) add Y1 = INCY2, Y1 } { .mmi (p17) LDFD f81 = [X1], INCX2 (p17) LDFD f85 = [X2], INCX2 (p19) add Y2 = INCY2, Y2 } ;; { .mmi (p19) STFD [Y1] = f75 (p19) STFD [Y2] = f79 (p19) add Y1 = INCY2, Y1 } { .mmi (p17) LDFD f89 = [X1], INCX2 (p17) LDFD f93 = [X2], INCX2 (p19) add Y2 = INCY2, Y2 } ;; { .mmi (p19) STFD [Y1] = f83 (p19) STFD [Y2] = f87 (p19) add Y1 = INCY2, Y1 } { .mmi (p16) LDFD f32 = [X1], INCX2 (p16) LDFD f36 = [X2], INCX2 (p19) add Y2 = INCY2, Y2 } ;; { .mmi (p19) STFD [Y1] = f91 (p19) STFD [Y2] = f95 (p19) add Y1 = INCY2, Y1 } { .mmi lfetch.fault.nt1 [PREX], INCX8 lfetch.fault.excl.nt1 [PREY], INCY8 (p19) add Y2 = INCY2, Y2 } ;; { .mmi (p16) LDFD f40 = [X1], INCX2 (p16) LDFD f44 = [X2], INCX2 nop __LINE__ } ;; { .mmi (p18) STFD [Y1] = f34 (p18) STFD [Y2] = f38 (p18) add Y1 = INCY2, Y1 } { .mmi (p16) LDFD f48 = [X1], INCX2 (p16) LDFD f52 = [X2], INCX2 (p18) add Y2 = INCY2, Y2 } ;; { .mmi (p18) STFD [Y1] = f42 (p18) STFD [Y2] = f46 (p18) add Y1 = INCY2, Y1 } { .mmi (p16) LDFD f56 = [X1], INCX2 (p16) LDFD f60 = [X2], INCX2 (p18) add Y2 = INCY2, Y2 } ;; { .mmi (p18) STFD [Y1] = f50 (p18) STFD [Y2] = f54 (p18) add Y1 = INCY2, Y1 } { .mmi lfetch.fault.nt1 [PREX], INCX8 lfetch.fault.excl.nt1 [PREY], INCY8 (p18) add Y2 = INCY2, Y2 } ;; { .mmi (p18) STFD [Y1] = f58 (p18) STFD [Y2] = f62 (p18) add Y1 = INCY2, Y1 } { .mmi (p16) LDFD f64 = [X1], INCX2 (p16) LDFD f68 = [X2], INCX2 (p18) add Y2 = INCY2, Y2 } ;; { .mmb (p16) LDFD f72 = [X1], INCX2 (p16) LDFD f76 = [X2], INCX2 br.ctop.sptk.few .L22 } ;; .align 32 .L25: { .mmi (p12) LDFD f48 = [X1], INCX2 (p12) LDFD f49 = [X2], INCX2 mov ar.lc = ARLC } ;; { .mmi (p12) LDFD f50 = [X1], INCX2 (p12) LDFD f51 = [X2], INCX2 mov pr = PR, -65474 } ;; { .mmb (p12) LDFD f52 = [X1], INCX2 (p12) LDFD f53 = [X2], INCX2 (p9) br.ret.sptk.many b0 } ;; { .mmi (p12) LDFD f54 = [X1], INCX2 (p12) LDFD f55 = [X2], INCX2 tbit.z p0, p13 = N, 2 } ;; { .mmi (p13) LDFD f56 = [X1], INCX2 (p13) LDFD f57 = [X2], INCX2 tbit.z p0, p14 = N, 1 } ;; { .mmi (p13) LDFD f58 = [X1], INCX2 (p13) LDFD f59 = [X2], INCX2 tbit.z p0, p15 = N, 0 } ;; { .mmi (p12) STFD [Y1] = f48 (p12) STFD [Y2] = f49 (p12) add Y1 = INCY2, Y1 } { .mmi (p14) LDFD f60 = [X1], INCX2 (p14) LDFD f61 = [X2], INCX2 (p12) add Y2 = INCY2, Y2 } ;; { .mmi (p12) STFD [Y1] = f50 (p12) STFD [Y2] = f51 (p12) add Y1 = INCY2, Y1 } { .mmi (p15) LDFD f62 = [X1] nop __LINE__ (p12) add Y2 = INCY2, 
Y2 } ;; { .mmi (p12) STFD [Y1] = f52 (p12) STFD [Y2] = f53 (p12) add Y1 = INCY2, Y1 } { .mmi nop __LINE__ nop __LINE__ (p12) add Y2 = INCY2, Y2 } ;; { .mmi (p12) STFD [Y1] = f54 (p12) STFD [Y2] = f55 (p12) add Y1 = INCY2, Y1 } { .mmi nop __LINE__ nop __LINE__ (p12) add Y2 = INCY2, Y2 } ;; { .mmi (p13) STFD [Y1] = f56 (p13) STFD [Y2] = f57 (p13) add Y1 = INCY2, Y1 } { .mmi nop __LINE__ nop __LINE__ (p13) add Y2 = INCY2, Y2 } ;; { .mmi (p13) STFD [Y1] = f58 (p13) STFD [Y2] = f59 (p13) add Y1 = INCY2, Y1 } { .mmi nop __LINE__ nop __LINE__ (p13) add Y2 = INCY2, Y2 } ;; { .mmi (p14) STFD [Y1] = f60 (p14) STFD [Y2] = f61 (p14) add Y1 = INCY2, Y1 } ;; { .mmb (p15) STFD [Y1] = f62 nop __LINE__ br.ret.sptk.many b0 } ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/qdot.S000066400000000000000000000214511313527062700162650ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCH_SIZE (8 * 24) #define N r32 #define X1 r33 #define INCX r34 #define Y1 r35 #define INCY r36 #define PREX1 r2 #define PREY1 r3 #define I r14 #define J r15 #define Y2 r16 #define X2 r17 #define Y3 r18 #define X3 r19 #define Y4 r20 #define X4 r21 #define INCX2 r22 #define INCY2 r23 #define INCX4 r24 #define INCY4 r25 #define INCX16 r26 #define INCY16 r27 #define PREX2 r28 #define PREY2 r29 #define PR r30 #define ARLC r31 PROLOGUE .prologue PROFCODE { .mfi nop __LINE__ mov f8 = f0 .save ar.lc, ARLC mov ARLC = ar.lc } { .mfi mov r26 = 1 mov f9 = f0 nop __LINE__ } ;; .body #ifdef F_INTERFACE LDINT N = [N] LDINT INCX = [INCX] LDINT INCY = [INCY] ;; #ifndef USE64BITINT sxt4 N = N sxt4 INCX = INCX sxt4 INCY = INCY ;; #endif cmp.le p0, p6 = r0, INCX cmp.le p0, p7 = r0, INCY sub r26 = r26, N ;; setf.sig f32 = r26 setf.sig f33 = INCX setf.sig f34 = INCY ;; xmpy.l f33 = f32, f33 xmpy.l f34 = f32, f34 ;; getf.sig r26 = f33 getf.sig r27 = f34 ;; (p6) shladd X1 = r26, BASE_SHIFT, X1 (p7) shladd Y1 = r27, BASE_SHIFT, Y1 ;; #endif { .mmi adds PREX1 = (PREFETCH_SIZE + 2) * SIZE, X1 adds PREY1 = (PREFETCH_SIZE + 2) * SIZE, Y1 mov PR = pr } { .mib cmp.lt p0, p6 = r0, N shl INCX = INCX, BASE_SHIFT (p6) br.ret.sptk.many b0 } ;; { .mfi add X2 = INCX, X1 mov f10 = f0 shl INCY = INCY, BASE_SHIFT } { .mmf and r8 = 127, X1 shladd X3 = INCX, 1, X1 mov f11 = f0 } ;; { .mmi and PREY1 = -128, PREY1 shladd X4 = INCX, 1, X2 add INCX2 = INCX, INCX } { .mmi shladd INCX4 = INCX, 2, r0 add Y2 = INCY, Y1 shladd Y3 = INCY, 1, Y1 } ;; { .mmi shladd Y4 = INCY, 1, Y2 add INCY2 = INCY, INCY nop __LINE__ } { .mmi shladd INCY4 = INCY, 2, r0 shladd INCX16 = INCX, 4, r0 shladd INCY16 = INCY, 4, r0 } ;; { .mfi nop __LINE__ mov f12 = f0 mov pr.rot= 0 } { .mfi or PREY1 = PREY1, r8 mov f13 = f0 shr I = N, 4 } ;; { .mfi adds I = -1, I mov f14 = f0 mov ar.ec= 3 } { .mmf shladd PREX2 = INCX, 3, PREX1 shladd PREY2 = INCY, 3, PREY1 mov f15 = f0 } ;; { .mmi and J = 15, N cmp.eq p16, p0 = r0, r0 mov ar.lc = I } { .mib cmp.eq p6 ,p0 = -1, I tbit.nz p12, p0 = N, 3 (p6) br.cond.dpnt .L215 } ;; .align 32 .L212: { .mmf (p16) lfetch.nt1 [PREX1], INCX16 (p16) lfetch.nt1 [PREX2], INCX16 (p18) FMA f8 = f34, f82, f8 } { .mmf (p16) LDFD f80 = [X1], INCX4 (p16) LDFD f83 = [X2], INCX4 (p18) FMA f9 = f37, f85, f9 } ;; { .mmf (p16) LDFD f86 = [X3], INCX4 (p16) LDFD f89 = [X4], INCX4 (p18) FMA f10 = f40, f88, f10 } { .mmf (p16) LDFD f92 = [X1], INCX4 (p16) LDFD f95 = [X2], INCX4 (p18) FMA f11 = f43, f91, f11 } ;; { .mmf (p16) LDFD f32 = [Y1], INCY4 (p16) LDFD f35 = [Y2], INCY4 (p18) FMA f12 = f46, f94, f12 } { .mmf (p16) LDFD f38 = [Y3], INCY4 (p16) LDFD f41 = [Y4], INCY4 (p18) FMA f13 = f49, f97, f13 } ;; { .mmf (p16) LDFD f98 = [X3], INCX4 (p16) LDFD f101 = [X4], INCX4 (p18) FMA f14 = f52, f100, f14 } { .mmf (p16) LDFD f104 = [X1], INCX4 (p16) LDFD f107 = [X2], INCX4 (p18) FMA f15 = f55, f103, f15 } ;; { .mmf (p16) LDFD f44 = [Y1], INCY4 (p16) LDFD f47 = [Y2], INCY4 (p18) FMA f8 = f58, f106, f8 } { .mmf (p16) LDFD f50 = [Y3], INCY4 (p16) LDFD f53 = [Y4], INCY4 (p18) FMA f9 = f61, f109, f9 } ;; { .mmf (p16) lfetch.nt1 [PREY1], INCY16 (p16) lfetch.nt1 [PREY2], INCY16 (p18) FMA f10 = f64, f112, f10 } { .mmf (p16) LDFD f110 = [X3], INCX4 (p16) LDFD f113 = [X4], INCX4 (p18) FMA f11 = f67, f115, f11 } ;; { .mmf (p16) LDFD f56 = [Y1], INCY4 (p16) LDFD f59 = [Y2], INCY4 (p18) FMA f12 = f70, f118, f12 } { .mmf (p16) LDFD f62 = [Y3], INCY4 
(p16) LDFD f65 = [Y4], INCY4 (p18) FMA f13 = f73, f121, f13 } ;; { .mmf (p16) LDFD f116 = [X1], INCX4 (p16) LDFD f119 = [X2], INCX4 (p18) FMA f14 = f76, f124, f14 } { .mmf (p16) LDFD f122 = [X3], INCX4 (p16) LDFD f125 = [X4], INCX4 (p18) FMA f15 = f79, f127, f15 } ;; { .mmi (p16) LDFD f68 = [Y1], INCY4 (p16) LDFD f71 = [Y2], INCY4 nop __LINE__ } { .mmb (p16) LDFD f74 = [Y3], INCY4 (p16) LDFD f77 = [Y4], INCY4 br.ctop.sptk.few .L212 } ;; .align 32 .L215: { .mmi (p12) LDFD f48 = [X1], INCX4 (p12) LDFD f49 = [X2], INCX4 cmp.eq p7, p0 = r0, J } { .mmb (p12) LDFD f50 = [X3], INCX4 (p12) LDFD f51 = [X4], INCX4 (p7) br.cond.dptk .L999 } ;; { .mmi (p12) LDFD f32 = [Y1], INCY4 (p12) LDFD f33 = [Y2], INCY4 tbit.nz p13, p0 = N, 2 } { .mmi (p12) LDFD f34 = [Y3], INCY4 (p12) LDFD f35 = [Y4], INCY4 nop __LINE__ } ;; { .mmi (p12) LDFD f52 = [X1], INCX4 (p12) LDFD f53 = [X2], INCX4 tbit.nz p14, p0 = N, 1 } { .mmi (p12) LDFD f54 = [X3], INCX4 (p12) LDFD f55 = [X4], INCX4 nop __LINE__ } ;; { .mmi (p12) LDFD f36 = [Y1], INCY4 (p12) LDFD f37 = [Y2], INCY4 tbit.nz p15, p0 = N, 0 } { .mmi (p12) LDFD f38 = [Y3], INCY4 (p12) LDFD f39 = [Y4], INCY4 nop __LINE__ } ;; { .mmi (p13) LDFD f56 = [X1], INCX4 (p13) LDFD f57 = [X2], INCX4 nop __LINE__ } { .mmi (p13) LDFD f58 = [X3], INCX4 (p13) LDFD f59 = [X4], INCX4 nop __LINE__ } ;; { .mmi (p13) LDFD f40 = [Y1], INCY4 (p13) LDFD f41 = [Y2], INCY4 nop __LINE__ } { .mmi (p13) LDFD f42 = [Y3], INCY4 (p13) LDFD f43 = [Y4], INCY4 nop __LINE__ } ;; { .mmi (p14) LDFD f60 = [X1], INCX2 (p14) LDFD f61 = [X2], INCX2 nop __LINE__ } { .mmi (p14) LDFD f44 = [Y1], INCY2 (p14) LDFD f45 = [Y2], INCY2 nop __LINE__ } ;; { .mmi (p15) LDFD f62 = [X1] (p15) LDFD f46 = [Y1] nop __LINE__ } ;; (p12) FMA f8 = f32, f48, f8 (p12) FMA f9 = f33, f49, f9 (p12) FMA f10 = f34, f50, f10 (p12) FMA f11 = f35, f51, f11 ;; (p12) FMA f12 = f36, f52, f12 (p12) FMA f13 = f37, f53, f13 (p12) FMA f14 = f38, f54, f14 (p12) FMA f15 = f39, f55, f15 ;; (p13) FMA f8 = f40, f56, f8 (p13) FMA f9 = f41, f57, f9 (p13) FMA f10 = f42, f58, f10 (p13) FMA f11 = f43, f59, f11 ;; (p14) FMA f8 = f44, f60, f8 (p14) FMA f9 = f45, f61, f9 (p15) FMA f10 = f46, f62, f10 ;; .align 32 .L999: FADD f8 = f8, f9 FADD f10 = f10, f11 FADD f12 = f12, f13 FADD f14 = f14, f15 ;; FADD f8 = f8, f10 FADD f12 = f12, f14 mov ar.lc = ARLC ;; FADD f8 = f8, f12 mov pr = PR, -65474 br.ret.sptk.many b0 EPILOGUE OpenBLAS-0.2.20/kernel/ia64/qgemm_kernel.S000066400000000000000000004377411313527062700200010ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCHSIZE (8 * 16) #define CPREFETCHSIZE 7 #define CPREFETCH lfetch.excl.nt2 #define M r32 #define N r33 #define K r34 #define A r38 #define B r39 #define C r36 #define LDC r37 #define I r15 #define J r16 #define AOFFSET r17 #define BOFFSET r18 #define TEMP r19 #define L r20 #define C1 r21 #define C2 r22 #define C3 r23 #define C4 r24 #define C5 r25 #define C6 r26 #define C7 r27 #define C8 r28 #define C9 loc0 #define C10 loc1 #define C11 loc2 #define C12 loc3 #define C13 loc4 #define C14 loc5 #define C15 loc6 #define C16 loc7 #define PREA r8 #define PREB r9 #define PREC r10 #define SP r12 #define ARLC r29 #define PR r30 #define ARPFS r31 #define ALPHA f8 #define AORIG loc8 #define KK loc9 #define KK8 loc10 #define OFFSET loc11 PROLOGUE .prologue PROFCODE { .mmi .save ar.pfs, ARPFS #ifdef TRMMKERNEL alloc ARPFS = ar.pfs, 8, 16, 0, 0 #else alloc ARPFS = ar.pfs, 8, 8, 0, 0 #endif adds r14 = 16, SP mov ARLC = ar.lc } { .mmi adds r8 = -16 * 16, SP adds r9 = -15 * 16, SP adds SP = -16 * 16, SP } ;; stf.spill [r8] = f16, 32 stf.spill [r9] = f17, 32 mov PR = pr ;; stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 ;; stf.spill [r8] = f20, 32 stf.spill [r9] = f21, 32 shr J = N, 3 ;; stf.spill [r8] = f22, 32 stf.spill [r9] = f23, 32 mov AOFFSET = A ;; stf.spill [r8] = f24, 32 stf.spill [r9] = f25, 32 cmp.ge p6, p0 = 0, J ;; stf.spill [r8] = f26, 32 stf.spill [r9] = f27, 32 ;; stf.spill [r8] = f28, 32 stf.spill [r9] = f29, 32 ;; stf.spill [r8] = f30 stf.spill [r9] = f31 ld8 C = [r14], 8 ;; ld8 LDC = [r14], 8 ;; shladd LDC = LDC, BASE_SHIFT, r0 ;; #ifndef TRMMKERNEL (p6) br.cond.dpnt .L050 .body ;; #else .body ;; ld8 OFFSET = [r14], 8 ;; #if defined(TRMMKERNEL) && !defined(LEFT) ;; sub KK = r0, OFFSET #endif (p6) br.cond.dpnt .L050 ;; #endif .align 32 .L010: { .mfi adds J = -1, J mov f64 = f0 shr I = M, 3 } { .mfi mov C1 = C // coffset1 = c + 0 * ldc mov f72 = f0 } ;; { .mmf cmp.eq p6, p7 = 0, I #if defined(TRMMKERNEL) && defined(LEFT) mov KK = OFFSET #else nop __LINE__ #endif mov f80 = f0 } { .mmf add C2 = LDC, C // coffset2 = c + 1 * ldc shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc mov f88 = f0 } ;; { .mmf shladd C5 = LDC, 2, C // coffset5 = c + 4 * ldc shladd C = LDC, 3, C // coffset += 8 * ldc mov f96 = f0 } { .mmf shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc shladd C6 = LDC, 2, C2 // coffset6 = c + 5 * ldc mov f104 = f0 } ;; { .mfi shladd C7 = LDC, 2, C3 // coffset7 = c + 6 * ldc mov f112 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif }{ .mfb sub C8 = C, LDC // coffset8 = c + 7 * ldc mov f120 = f0 (p6) 
br.cond.dpnt .L020 } ;; .align 16 .L011: #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) mov BOFFSET = B ;; { .mfb LDFD f48 = [BOFFSET], SIZE mov f65 = f0 nop __LINE__ } ;; { .mfb LDFD f49 = [BOFFSET], SIZE mov f73 = f0 nop __LINE__ } ;; #else { .mfi shladd BOFFSET = KK8, 3, B mov f65 = f0 shladd AOFFSET = KK8, 3, AOFFSET } ;; LDFD f48 = [BOFFSET], SIZE ;; { .mfi LDFD f49 = [BOFFSET], SIZE mov f73 = f0 nop __LINE__ } ;; #endif LDFD f32 = [AOFFSET], SIZE LDFD f50 = [BOFFSET], SIZE ;; { .mfb LDFD f33 = [AOFFSET], SIZE mov f81 = f0 nop __LINE__ } { .mfb LDFD f51 = [BOFFSET], SIZE mov f89 = f0 nop __LINE__ } ;; LDFD f52 = [BOFFSET], SIZE ;; { .mmf LDFD f53 = [BOFFSET], SIZE setf.d f97 = r0 mov f105 = f0 } { .mfb setf.d f113 = r0 mov f121 = f0 nop __LINE__ } ;; LDFD f54 = [BOFFSET], SIZE ;; { .mmf LDFD f55 = [BOFFSET], SIZE setf.d f66 = r0 mov f74 = f0 } { .mfb setf.d f82 = r0 mov f90 = f0 nop __LINE__ } ;; LDFD f34 = [AOFFSET], SIZE ;; { .mmf LDFD f35 = [AOFFSET], SIZE setf.d f98 = r0 mov f106 = f0 } { .mfb setf.d f114 = r0 mov f122 = f0 nop __LINE__ } ;; LDFD f36 = [AOFFSET], SIZE ;; { .mmf LDFD f37 = [AOFFSET], SIZE setf.d f67 = r0 mov f75 = f0 } { .mfi setf.d f83 = r0 mov f91 = f0 #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 8, KK #else adds L = 8, KK #endif #endif } ;; LDFD f38 = [AOFFSET], SIZE ;; { .mmf LDFD f39 = [AOFFSET], SIZE setf.d f99 = r0 mov f107 = f0 } { .mfi setf.d f115 = r0 mov f123 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f68 = r0 mov f76 = f0 } { .mfi setf.d f84 = r0 mov f92 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; { .mmf CPREFETCH [PREC], LDC setf.d f100 = r0 mov f108 = f0 } { .mfi setf.d f116 = r0 mov f124 = f0 adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } ;; { .mmf CPREFETCH [PREC], LDC setf.d f69 = r0 mov f77 = f0 } { .mfi setf.d f85 = r0 mov f93 = f0 adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET } ;; { .mmf CPREFETCH [PREC], LDC setf.d f101 = r0 mov f109 = f0 } { .mfi setf.d f117 = r0 mov f125 = f0 tbit.z p12, p0 = L, 0 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f70 = r0 mov f78 = f0 } { .mfi setf.d f86 = r0 mov f94 = f0 shr L = L, 1 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f102 = r0 mov f110 = f0 } { .mfi setf.d f118 = r0 mov f126 = f0 adds L = -1, L } ;; { .mmf CPREFETCH [PREC], LDC setf.d f71 = r0 mov f79 = f0 } { .mfi setf.d f87 = r0 mov f95 = f0 mov ar.lc = L } ;; { .mmf CPREFETCH [PREC] setf.d f103 = r0 mov f111 = f0 } { .mfi setf.d f119 = r0 mov f127 = f0 cmp.eq p3, p0 = r0, r0 } ;; .align 16 .L012: /* 1 */ { .mfi lfetch.fault.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfb (p12) cmp.ne p3, p0 = 0, L FMA f72 = f32, f49, f72 // A1 * B2 nop __LINE__ } ;; /* 2 */ { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfb cmp.ne p4, p5 = 0, L FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; /* 3 */ { .mfb (p3) LDFD f40 = [AOFFSET], SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb adds C9 = 4 * SIZE, C1 FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;; /* 4 */ { .mfi (p3) LDFD f56 = [BOFFSET], SIZE FMA f112 = f32, f54, f112 // A1 * B7 adds C10 = 4 * SIZE, C2 } { .mfb (p3) LDFD f41 = [AOFFSET], SIZE FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;; /* 5 */ { .mfi (p3) LDFD f57 = [BOFFSET], SIZE FMA f65 = f33, f48, 
f65 // A2 * B1 adds C11 = 4 * SIZE, C3 } { .mfb (p3) LDFD f42 = [AOFFSET], SIZE FMA f73 = f33, f49, f73 // A2 * B2 nop __LINE__ } ;; /* 6 */ { .mfi (p3) LDFD f58 = [BOFFSET], SIZE FMA f81 = f33, f50, f81 // A2 * B3 adds C12 = 4 * SIZE, C4 } { .mfb (p3) LDFD f43 = [AOFFSET], SIZE FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; /* 7 */ { .mfi (p3) LDFD f59 = [BOFFSET], SIZE FMA f97 = f33, f52, f97 // A2 * B5 adds C13 = 4 * SIZE, C5 } { .mfb (p3) LDFD f44 = [AOFFSET], SIZE FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } ;; /* 8 */ { .mfi (p3) LDFD f60 = [BOFFSET], SIZE FMA f113 = f33, f54, f113 // A2 * B7 adds C14 = 4 * SIZE, C6 } { .mfb (p3) LDFD f45 = [AOFFSET], SIZE FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;; /* 9 */ { .mfi (p3) LDFD f61 = [BOFFSET], SIZE FMA f66 = f34, f48, f66 // A3 * B1 adds C15 = 4 * SIZE, C7 } { .mfb (p3) LDFD f46 = [AOFFSET], SIZE FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; /* 10 */ { .mfi (p3) LDFD f62 = [BOFFSET], SIZE FMA f82 = f34, f50, f82 // A3 * B3 adds C16 = 4 * SIZE, C8 } { .mfb (p3) LDFD f47 = [AOFFSET], SIZE FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; /* 11 */ { .mfb (p3) LDFD f63 = [BOFFSET], SIZE FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f106 = f34, f53, f106 // A3 * B6 nop __LINE__ } ;; /* 12 */ { .mfb nop __LINE__ FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f122 = f34, f55, f122 // A3 * B8 nop __LINE__ } ;; /* 13 */ { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; /* 14 */ { .mfb nop __LINE__ FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } ;; /* 15 */ { .mfb nop __LINE__ FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f107 = f35, f53, f107 // A4 * B6 nop __LINE__ } ;; /* 16 */ { .mfb nop __LINE__ FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f123 = f35, f55, f123 // A4 * B8 nop __LINE__ } ;; /* 17 */ { .mfb nop __LINE__ FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f76 = f36, f49, f76 // A5 * B2 nop __LINE__ } ;; /* 18 */ { .mfb nop __LINE__ FMA f84 = f36, f50, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f92 = f36, f51, f92 // A5 * B4 nop __LINE__ } ;; /* 19 */ { .mfb nop __LINE__ FMA f100 = f36, f52, f100 // A5 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f108 = f36, f53, f108 // A5 * B6 nop __LINE__ } ;; /* 20 */ { .mfb nop __LINE__ FMA f116 = f36, f54, f116 // A5 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f124 = f36, f55, f124 // A5 * B8 nop __LINE__ } ;; /* 21 */ { .mfb nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = f37, f49, f77 // A6 * B2 nop __LINE__ } ;; /* 22 */ { .mfb nop __LINE__ FMA f85 = f37, f50, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f93 = f37, f51, f93 // A6 * B4 nop __LINE__ } ;; /* 23 */ { .mfb nop __LINE__ FMA f101 = f37, f52, f101 // A6 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f109 = f37, f53, f109 // A6 * B6 nop __LINE__ } ;; /* 24 */ { .mfb nop __LINE__ FMA f117 = f37, f54, f117 // A6 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f125 = f37, f55, f125 // A6 * B8 nop __LINE__ } ;; /* 25 */ { .mfb nop __LINE__ FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f78 = f38, f49, f78 // A7 * B2 nop __LINE__ } ;; /* 26 */ { .mfb nop __LINE__ FMA f86 = f38, f50, f86 // A7 * B3 nop __LINE__ } { .mfb 
nop __LINE__ FMA f94 = f38, f51, f94 // A7 * B4 nop __LINE__ } ;; /* 27 */ { .mfb nop __LINE__ FMA f102 = f38, f52, f102 // A7 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f110 = f38, f53, f110 // A7 * B6 nop __LINE__ } ;; /* 28 */ { .mfb nop __LINE__ FMA f118 = f38, f54, f118 // A7 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f126 = f38, f55, f126 // A7 * B8 nop __LINE__ } ;; /* 29 */ { .mfb (p4) LDFD f32 = [AOFFSET], SIZE FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f79 = f39, f49, f79 // A8 * B2 nop __LINE__ } ;; /* 30 */ { .mfb (p4) LDFD f33 = [AOFFSET], SIZE FMA f87 = f39, f50, f87 // A8 * B3 nop __LINE__ } { .mfb (p4) LDFD f48 = [BOFFSET], SIZE FMA f95 = f39, f51, f95 // A8 * B4 nop __LINE__ } ;; /* 31 */ { .mfb (p4) LDFD f34 = [AOFFSET], SIZE FMA f103 = f39, f52, f103 // A8 * B5 nop __LINE__ } { .mfb (p4) LDFD f49 = [BOFFSET], SIZE FMA f111 = f39, f53, f111 // A8 * B6 nop __LINE__ } ;; /* 32 */ { .mfb lfetch.fault.nt1 [PREA], 8 * SIZE FMA f119 = f39, f54, f119 // A8 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f127 = f39, f55, f127 // A8 * B8 nop __LINE__ } ;; /* 33 */ { .mfb lfetch.nt1 [PREB], 8 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; /* 34 */ { .mfb (p4) LDFD f35 = [AOFFSET], SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb (p4) LDFD f50 = [BOFFSET], SIZE (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; /* 35 */ { .mfb (p4) LDFD f36 = [AOFFSET], SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb (p4) LDFD f51 = [BOFFSET], SIZE (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; /* 36 */ { .mfb (p4) LDFD f37 = [AOFFSET], SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb (p4) LDFD f52 = [BOFFSET], SIZE (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; /* 37 */ { .mfb (p4) LDFD f38 = [AOFFSET], SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p4) LDFD f53 = [BOFFSET], SIZE (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; /* 38 */ { .mfb (p4) LDFD f39 = [AOFFSET], SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb (p4) LDFD f54 = [BOFFSET], SIZE (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; /* 39 */ { .mfb (p4) LDFD f55 = [BOFFSET], SIZE (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; /* 40 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f6 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f7 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f121 = f41, f63, f121 // A2 * B8 nop __LINE__ } ;; /* 41 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f10 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f11 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; /* 42 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f12 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f13 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; /* 43 */ { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f14 = [C1 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f98 = f42, f60, f98 // A3 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f15 = [C9 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f106 = f42, f61, f106 // A3 * B6 nop __LINE__ } ;; /* 44 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f16 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f17 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f122 = f42, f63, f122 // A3 * B8 nop __LINE__ } ;; /* 45 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f18 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f19 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; /* 46 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f20 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f21 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f91 = f43, f59, f91 // A4 * B4 nop __LINE__ } ;; /* 47 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f22 = [C2 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f23 = [C10], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f107 = f43, f61, f107 // A4 * B6 nop __LINE__ } ;; /* 48 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f24 = [C3 ], SIZE #else nop __LINE__ #endif (p3) FMA f115 = f43, f62, f115 // A4 * B7 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f25 = [C11], SIZE #else nop __LINE__ #endif (p3) FMA f123 = f43, f63, f123 // A4 * B8 nop __LINE__ } ;; /* 49 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f26 = [C3 ], SIZE #else nop __LINE__ #endif (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f27 = [C11], SIZE #else nop __LINE__ #endif (p3) FMA f76 = f44, f57, f76 // A5 * B2 nop __LINE__ } ;; /* 50 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f28 = [C3 ], SIZE #else nop __LINE__ #endif (p3) FMA f84 = f44, f58, f84 // A5 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f29 = [C11], SIZE #else nop __LINE__ #endif (p3) FMA f92 = f44, f59, f92 // A5 * B4 nop __LINE__ } ;; /* 51 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f30 = [C3 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f100 = f44, f60, f100 // A5 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f31 = [C11], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f108 = f44, f61, f108 // A5 * B6 nop __LINE__ } ;; /* 52 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f32 = [C4 ], SIZE #else nop __LINE__ #endif (p3) FMA f116 = f44, f62, f116 // A5 * B7 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f33 = [C12], SIZE #else nop __LINE__ #endif (p3) FMA f124 = f44, f63, f124 // A5 * B8 nop __LINE__ } ;; /* 53 */ { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f34 = [C4 ], SIZE #else nop __LINE__ #endif (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f35 = [C12], SIZE #else nop __LINE__ #endif (p3) FMA f77 = f45, f57, f77 // A6 * B2 nop __LINE__ } ;; /* 54 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f36 = [C4 ], SIZE #else nop __LINE__ #endif (p3) FMA f85 = f45, f58, f85 // A6 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f37 = [C12], SIZE #else nop __LINE__ #endif (p3) FMA f93 = f45, f59, f93 // A6 * B4 nop __LINE__ } ;; /* 55 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f38 = [C4 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f101 = f45, f60, f101 // A6 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f39 = [C12], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f109 = f45, f61, f109 // A6 * B6 nop __LINE__ } ;; /* 56 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f48 = [C5 ], SIZE #else nop __LINE__ #endif (p3) FMA f117 = f45, f62, f117 // A6 * B7 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f49 = [C13], SIZE #else nop __LINE__ #endif (p3) FMA f125 = f45, f63, f125 // A6 * B8 nop __LINE__ } ;; /* 57 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f50 = [C5 ], SIZE #else nop __LINE__ #endif (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f51 = [C13], SIZE #else nop __LINE__ #endif (p3) FMA f78 = f46, f57, f78 // A7 * B2 nop __LINE__ } ;; /* 58 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f52 = [C5 ], SIZE #else nop __LINE__ #endif (p3) FMA f86 = f46, f58, f86 // A7 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f53 = [C13], SIZE #else nop __LINE__ #endif (p3) FMA f94 = f46, f59, f94 // A7 * B4 nop __LINE__ } ;; /* 59 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f54 = [C5 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f102 = f46, f60, f102 // A7 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f55 = [C13], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f110 = f46, f61, f110 // A7 * B6 nop __LINE__ } ;; /* 60 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f40 = [C6 ], SIZE #else nop __LINE__ #endif (p3) FMA f118 = f46, f62, f118 // A7 * B7 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f41 = [C14], SIZE #else nop __LINE__ #endif (p3) FMA f126 = f46, f63, f126 // A7 * B8 nop __LINE__ } ;; /* 61 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f42 = [C6 ], SIZE #else nop __LINE__ #endif (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f43 = [C14], SIZE #else nop __LINE__ #endif (p3) FMA f79 = f47, f57, f79 // A8 * B2 nop __LINE__ } ;; /* 62 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f44 = [C6 ], SIZE #else nop __LINE__ #endif (p3) FMA f87 = f47, f58, f87 // A8 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f45 = [C14], SIZE #else nop __LINE__ #endif (p3) FMA f95 = f47, f59, f95 // A8 * B4 nop __LINE__ } ;; /* 63 */ { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f59 = [C6 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f103 = f47, f60, f103 // A8 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f60 = [C14], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f111 = f47, f61, f111 // A8 * B6 nop __LINE__ } ;; /* 64 */ { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f61 = [C7 ], SIZE #else nop __LINE__ #endif (p3) FMA f119 = f47, f62, f119 // A8 * B7 adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f62 = [C15], SIZE #else nop __LINE__ #endif (p3) FMA f127 = f47, f63, f127 // A8 * B8 br.cloop.sptk.few .L012 } ;; .L013: #if! defined(TRMMKERNEL) && !defined(BETAZERO) { .mfi (p5) LDFD f63 = [C7 ], SIZE FMA f64 = ALPHA, f64, f6 cmp.ne p6, p0 = 1, I } { .mfb (p5) LDFD f6 = [C15], SIZE FMA f68 = ALPHA, f68, f7 nop __LINE__ } ;; { .mfi (p5) LDFD f7 = [C7 ], SIZE FMA f65 = ALPHA, f65, f10 adds I = -1, I } { .mfb (p5) LDFD f10 = [C15], SIZE FMA f69 = ALPHA, f69, f11 nop __LINE__ } ;; { .mfb (p5) LDFD f11 = [C7 ], -3 * SIZE FMA f66 = ALPHA, f66, f12 nop __LINE__ } { .mfb (p5) LDFD f12 = [C15], -3 * SIZE FMA f70 = ALPHA, f70, f13 nop __LINE__ } ;; { .mfb LDFD f13 = [C8 ], SIZE FMA f67 = ALPHA, f67, f14 nop __LINE__ } { .mfb LDFD f14 = [C16], SIZE FMA f71 = ALPHA, f71, f15 nop __LINE__ } ;; { .mmf STFD [C1 ] = f64, SIZE STFD [C9 ] = f68, SIZE FMA f72 = ALPHA, f72, f16 } { .mmf LDFD f15 = [C8 ], SIZE LDFD f16 = [C16], SIZE FMA f76 = ALPHA, f76, f17 } ;; { .mmf STFD [C1 ] = f65, SIZE STFD [C9 ] = f69, SIZE FMA f73 = ALPHA, f73, f18 } { .mmf LDFD f17 = [C8 ], SIZE LDFD f18 = [C16], SIZE FMA f77 = ALPHA, f77, f19 } ;; { .mmf STFD [C1 ] = f66, SIZE STFD [C9 ] = f70, SIZE FMA f74 = ALPHA, f74, f20 } { .mmf LDFD f19 = [C8 ], -3 * SIZE LDFD f20 = [C16], -3 * SIZE FMA f78 = ALPHA, f78, f21 } ;; { .mfb STFD [C1 ] = f67, 5 * SIZE FMA f75 = ALPHA, f75, f22 nop __LINE__ } { .mfb STFD [C9 ] = f71, 5 * SIZE FMA f79 = ALPHA, f79, f23 nop __LINE__ } ;; { .mfb STFD [C2 ] = f72, SIZE FMA f80 = ALPHA, f80, f24 nop __LINE__ } { .mfb STFD [C10] = f76, SIZE FMA f84 = ALPHA, f84, f25 nop __LINE__ } ;; { .mfb STFD [C2 ] = f73, SIZE FMA f81 = ALPHA, f81, f26 nop __LINE__ } { .mfb STFD [C10] = f77, SIZE FMA f85 = ALPHA, f85, f27 nop __LINE__ } ;; { .mfb STFD [C2 ] = f74, SIZE FMA f82 = ALPHA, f82, f28 nop __LINE__ } { .mfb STFD [C10] = f78, SIZE FMA f86 = ALPHA, f86, f29 nop __LINE__ } ;; { .mfb STFD [C2 ] = f75, 5 * SIZE FMA f83 = ALPHA, f83, f30 nop __LINE__ } { .mfb STFD [C10] = f79, 5 * SIZE FMA f87 = ALPHA, f87, f31 nop __LINE__ } ;; { .mfb STFD [C3 ] = f80, SIZE FMA f88 = ALPHA, f88, f32 nop __LINE__ } { .mfb STFD [C11] = f84, SIZE FMA f92 = ALPHA, f92, f33 nop __LINE__ } ;; { .mfb STFD [C3 ] = f81, SIZE FMA f89 = ALPHA, f89, f34 nop __LINE__ } { .mfb STFD [C11] = f85, SIZE FMA f93 = ALPHA, f93, f35 nop __LINE__ } ;; { .mfb STFD [C3 ] = f82, SIZE FMA f90 = ALPHA, f90, f36 nop __LINE__ } { .mfb STFD [C11] = f86, SIZE FMA f94 = ALPHA, f94, f37 nop __LINE__ } ;; { .mfb STFD [C3 ] = f83, 5 * SIZE FMA f91 = ALPHA, f91, f38 nop __LINE__ } { .mfb STFD [C11] = f87, 5 * SIZE FMA f95 = ALPHA, f95, f39 nop __LINE__ } ;; { .mfb STFD [C4 ] = f88, SIZE FMA f96 = ALPHA, f96, f48 nop __LINE__ } { .mfb STFD [C12] = f92, SIZE FMA f100 = ALPHA, f100, f49 nop __LINE__ } ;; { .mfb STFD [C4 ] = f89, SIZE FMA f97 = ALPHA, f97, f50 nop __LINE__ } { .mfb STFD [C12] = f93, SIZE FMA f101 = ALPHA, f101, f51 nop __LINE__ } ;; { .mfb STFD [C4 ] = f90, SIZE FMA f98 = ALPHA, f98, f52 nop 
__LINE__ } { .mfb STFD [C12] = f94, SIZE FMA f102 = ALPHA, f102, f53 nop __LINE__ } ;; { .mfb STFD [C4 ] = f91, 5 * SIZE FMA f99 = ALPHA, f99, f54 nop __LINE__ } { .mfb STFD [C12] = f95, 5 * SIZE FMA f103 = ALPHA, f103, f55 nop __LINE__ } ;; { .mfb STFD [C5 ] = f96, SIZE FMA f104 = ALPHA, f104, f40 nop __LINE__ } { .mfb STFD [C13] = f100, SIZE FMA f108 = ALPHA, f108, f41 nop __LINE__ } ;; { .mfb STFD [C5 ] = f97, SIZE FMA f105 = ALPHA, f105, f42 nop __LINE__ } { .mfb STFD [C13] = f101, SIZE FMA f109 = ALPHA, f109, f43 nop __LINE__ } ;; { .mfb STFD [C5 ] = f98, SIZE FMA f106 = ALPHA, f106, f44 nop __LINE__ } { .mfb STFD [C13] = f102, SIZE FMA f110 = ALPHA, f110, f45 nop __LINE__ } ;; { .mfb STFD [C5 ] = f99, 5 * SIZE FMA f107 = ALPHA, f107, f59 nop __LINE__ } { .mfb STFD [C13] = f103, 5 * SIZE FMA f111 = ALPHA, f111, f60 nop __LINE__ } ;; { .mfb STFD [C6 ] = f104, SIZE FMA f112 = ALPHA, f112, f61 nop __LINE__ } { .mfb STFD [C14] = f108, SIZE FMA f116 = ALPHA, f116, f62 nop __LINE__ } ;; { .mfb STFD [C6 ] = f105, SIZE FMA f113 = ALPHA, f113, f63 nop __LINE__ } { .mfb STFD [C14] = f109, SIZE FMA f117 = ALPHA, f117, f6 nop __LINE__ } ;; { .mfb STFD [C6 ] = f106, SIZE FMA f114 = ALPHA, f114, f7 nop __LINE__ } { .mfb STFD [C14] = f110, SIZE FMA f118 = ALPHA, f118, f10 nop __LINE__ } ;; { .mfb STFD [C6 ] = f107, 5 * SIZE FMA f115 = ALPHA, f115, f11 nop __LINE__ } { .mfb STFD [C14] = f111, 5 * SIZE FMA f119 = ALPHA, f119, f12 nop __LINE__ } ;; { .mfb STFD [C7 ] = f112, SIZE FMA f120 = ALPHA, f120, f13 nop __LINE__ } { .mfb STFD [C15] = f116, SIZE FMA f124 = ALPHA, f124, f14 nop __LINE__ } ;; { .mfb STFD [C7 ] = f113, SIZE FMA f121 = ALPHA, f121, f15 nop __LINE__ } { .mfb STFD [C15] = f117, SIZE FMA f125 = ALPHA, f125, f16 nop __LINE__ } ;; { .mfb STFD [C7 ] = f114, SIZE FMA f122 = ALPHA, f122, f17 nop __LINE__ } { .mfb STFD [C15] = f118, SIZE FMA f126 = ALPHA, f126, f18 nop __LINE__ } ;; { .mfb STFD [C7 ] = f115, 5 * SIZE FMA f123 = ALPHA, f123, f19 nop __LINE__ } { .mfb STFD [C15] = f119, 5 * SIZE FMA f127 = ALPHA, f127, f20 nop __LINE__ } ;; { .mfb STFD [C8 ] = f120, SIZE mov f64 = f0 nop __LINE__ } { .mfb STFD [C16] = f124, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C8 ] = f121, SIZE mov f80 = f0 nop __LINE__ } { .mfb STFD [C16] = f125, SIZE mov f88 = f0 nop __LINE__ } ;; { .mfi STFD [C8 ] = f122, SIZE mov f96 = f0 nop __LINE__ } { .mfb STFD [C16] = f126, SIZE mov f104 = f0 nop __LINE__ } ;; { .mfi STFD [C8 ] = f123, 5 * SIZE mov f112 = f0 nop __LINE__ } { .mfb STFD [C16] = f127, 5 * SIZE mov f120 = f0 (p6) br.cond.dptk .L011 } ;; #else { .mfi nop __LINE__ FMPY f64 = ALPHA, f64 cmp.ne p6, p0 = 1, I } { .mfb nop __LINE__ FMPY f68 = ALPHA, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f65 = ALPHA, f65 adds I = -1, I } { .mfb nop __LINE__ FMPY f69 = ALPHA, f69 nop __LINE__ } ;; { .mfb nop __LINE__ FMPY f66 = ALPHA, f66 nop __LINE__ } { .mfb nop __LINE__ FMPY f70 = ALPHA, f70 nop __LINE__ } ;; { .mfb nop __LINE__ FMPY f67 = ALPHA, f67 nop __LINE__ } { .mfb nop __LINE__ FMPY f71 = ALPHA, f71 nop __LINE__ } ;; { .mmf STFD [C1 ] = f64, SIZE STFD [C9 ] = f68, SIZE FMPY f72 = ALPHA, f72 } { .mmf nop __LINE__ nop __LINE__ FMPY f76 = ALPHA, f76 } ;; { .mmf STFD [C1 ] = f65, SIZE STFD [C9 ] = f69, SIZE FMPY f73 = ALPHA, f73 } { .mmf nop __LINE__ nop __LINE__ FMPY f77 = ALPHA, f77 } ;; { .mmf STFD [C1 ] = f66, SIZE STFD [C9 ] = f70, SIZE FMPY f74 = ALPHA, f74 } { .mmf nop __LINE__ nop __LINE__ FMPY f78 = ALPHA, f78 } ;; { .mfb STFD [C1 ] = f67, 5 * SIZE FMPY f75 = ALPHA, f75 nop __LINE__ } { .mfb 
STFD [C9 ] = f71, 5 * SIZE FMPY f79 = ALPHA, f79 nop __LINE__ } ;; { .mfb STFD [C2 ] = f72, SIZE FMPY f80 = ALPHA, f80 nop __LINE__ } { .mfb STFD [C10] = f76, SIZE FMPY f84 = ALPHA, f84 nop __LINE__ } ;; { .mfb STFD [C2 ] = f73, SIZE FMPY f81 = ALPHA, f81 nop __LINE__ } { .mfb STFD [C10] = f77, SIZE FMPY f85 = ALPHA, f85 nop __LINE__ } ;; { .mfb STFD [C2 ] = f74, SIZE FMPY f82 = ALPHA, f82 nop __LINE__ } { .mfb STFD [C10] = f78, SIZE FMPY f86 = ALPHA, f86 nop __LINE__ } ;; { .mfb STFD [C2 ] = f75, 5 * SIZE FMPY f83 = ALPHA, f83 nop __LINE__ } { .mfb STFD [C10] = f79, 5 * SIZE FMPY f87 = ALPHA, f87 nop __LINE__ } ;; { .mfb STFD [C3 ] = f80, SIZE FMPY f88 = ALPHA, f88 nop __LINE__ } { .mfb STFD [C11] = f84, SIZE FMPY f92 = ALPHA, f92 nop __LINE__ } ;; { .mfb STFD [C3 ] = f81, SIZE FMPY f89 = ALPHA, f89 nop __LINE__ } { .mfb STFD [C11] = f85, SIZE FMPY f93 = ALPHA, f93 nop __LINE__ } ;; { .mfb STFD [C3 ] = f82, SIZE FMPY f90 = ALPHA, f90 nop __LINE__ } { .mfb STFD [C11] = f86, SIZE FMPY f94 = ALPHA, f94 nop __LINE__ } ;; { .mfb STFD [C3 ] = f83, 5 * SIZE FMPY f91 = ALPHA, f91 nop __LINE__ } { .mfb STFD [C11] = f87, 5 * SIZE FMPY f95 = ALPHA, f95 nop __LINE__ } ;; { .mfb STFD [C4 ] = f88, SIZE FMPY f96 = ALPHA, f96 nop __LINE__ } { .mfb STFD [C12] = f92, SIZE FMPY f100 = ALPHA, f100 nop __LINE__ } ;; { .mfb STFD [C4 ] = f89, SIZE FMPY f97 = ALPHA, f97 nop __LINE__ } { .mfb STFD [C12] = f93, SIZE FMPY f101 = ALPHA, f101 nop __LINE__ } ;; { .mfb STFD [C4 ] = f90, SIZE FMPY f98 = ALPHA, f98 nop __LINE__ } { .mfb STFD [C12] = f94, SIZE FMPY f102 = ALPHA, f102 nop __LINE__ } ;; { .mfb STFD [C4 ] = f91, 5 * SIZE FMPY f99 = ALPHA, f99 nop __LINE__ } { .mfb STFD [C12] = f95, 5 * SIZE FMPY f103 = ALPHA, f103 nop __LINE__ } ;; { .mfb STFD [C5 ] = f96, SIZE FMPY f104 = ALPHA, f104 nop __LINE__ } { .mfb STFD [C13] = f100, SIZE FMPY f108 = ALPHA, f108 nop __LINE__ } ;; { .mfb STFD [C5 ] = f97, SIZE FMPY f105 = ALPHA, f105 nop __LINE__ } { .mfb STFD [C13] = f101, SIZE FMPY f109 = ALPHA, f109 nop __LINE__ } ;; { .mfb STFD [C5 ] = f98, SIZE FMPY f106 = ALPHA, f106 nop __LINE__ } { .mfb STFD [C13] = f102, SIZE FMPY f110 = ALPHA, f110 nop __LINE__ } ;; { .mfb STFD [C5 ] = f99, 5 * SIZE FMPY f107 = ALPHA, f107 nop __LINE__ } { .mfb STFD [C13] = f103, 5 * SIZE FMPY f111 = ALPHA, f111 nop __LINE__ } ;; { .mfb STFD [C6 ] = f104, SIZE FMPY f112 = ALPHA, f112 nop __LINE__ } { .mfb STFD [C14] = f108, SIZE FMPY f116 = ALPHA, f116 nop __LINE__ } ;; { .mfb STFD [C6 ] = f105, SIZE FMPY f113 = ALPHA, f113 nop __LINE__ } { .mfb STFD [C14] = f109, SIZE FMPY f117 = ALPHA, f117 nop __LINE__ } ;; { .mfb STFD [C6 ] = f106, SIZE FMPY f114 = ALPHA, f114 nop __LINE__ } { .mfb STFD [C14] = f110, SIZE FMPY f118 = ALPHA, f118 nop __LINE__ } ;; { .mfb STFD [C6 ] = f107, 5 * SIZE FMPY f115 = ALPHA, f115 nop __LINE__ } { .mfb STFD [C14] = f111, 5 * SIZE FMPY f119 = ALPHA, f119 nop __LINE__ } ;; { .mfb STFD [C7 ] = f112, SIZE FMPY f120 = ALPHA, f120 nop __LINE__ } { .mfb STFD [C15] = f116, SIZE FMPY f124 = ALPHA, f124 nop __LINE__ } ;; { .mfb STFD [C7 ] = f113, SIZE FMPY f121 = ALPHA, f121 nop __LINE__ } { .mfb STFD [C15] = f117, SIZE FMPY f125 = ALPHA, f125 nop __LINE__ } ;; { .mfi STFD [C7 ] = f114, SIZE FMPY f122 = ALPHA, f122 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfb STFD [C15] = f118, SIZE FMPY f126 = ALPHA, f126 nop __LINE__ } ;; { .mfi STFD [C7 ] = f115, 5 * SIZE FMPY f123 = ALPHA, f123 #if defined(TRMMKERNEL) && 
(defined(LEFT) && defined(TRANSA)) adds L = -8, L #else nop __LINE__ #endif } { .mfi STFD [C15] = f119, 5 * SIZE FMPY f127 = ALPHA, f127 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -8, L #else nop __LINE__ #endif } ;; { .mfi STFD [C8 ] = f120, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C16] = f124, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C8 ] = f121, SIZE mov f80 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 3, AOFFSET #else nop __LINE__ #endif } { .mfi STFD [C16] = f125, SIZE mov f88 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 3, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C8 ] = f122, SIZE mov f96 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 8, KK #else nop __LINE__ #endif } { .mfb STFD [C16] = f126, SIZE mov f104 = f0 nop __LINE__ } ;; { .mfi STFD [C8 ] = f123, 5 * SIZE mov f112 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C16] = f127, 5 * SIZE mov f120 = f0 (p6) br.cond.dptk .L011 } ;; #endif .L020: #if 0 { .mfi cmp.eq p3, p0 = r0, r0 mov f89 = f0 tbit.z p6, p7 = M, 2 } { .mfb #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 4, KK #else adds L = 8, KK #endif #endif mov f81 = f0 (p6) br.cond.dptk .L030 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mfi LDFPD f48, f49 = [B] mov f65 = f0 nop __LINE__ } { .mfi adds BOFFSET = 2 * SIZE, B mov f73 = f0 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET } ;; #else { .mfi shladd BOFFSET = KK8, 3, B mov f65 = f0 shladd AOFFSET = KK8, 2, AOFFSET } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f73 = f0 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET } ;; #endif { .mmf LDFPD f32, f33 = [AOFFSET], 2 * SIZE setf.d f97 = r0 mov f105 = f0 } { .mfi setf.d f113 = r0 mov f121 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; { .mmf LDFPD f50, f51 = [BOFFSET], 2 * SIZE setf.d f66 = r0 mov f74 = f0 } { .mfi setf.d f82 = r0 mov f90 = f0 tbit.z p12, p0 = L, 0 } ;; { .mmf LDFPD f52, f53 = [BOFFSET], 2 * SIZE setf.d f98 = r0 mov f106 = f0 } { .mfi setf.d f114 = r0 mov f122 = f0 shr L = L, 1 } ;; { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f75 = f0 adds L = -1, L } { .mmf setf.d f67 = r0 setf.d f83 = r0 mov f91 = f0 } ;; { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f107 = f0 mov ar.lc = L } { .mmf setf.d f99 = r0 setf.d f115 = r0 mov f123 = f0 } ;; .align 32 .L022: { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 (p5) adds C9 = 2 * SIZE, C1 } { .mfi nop __LINE__ FMA f104 = f32, f53, f104 // A1 * B6 (p5) adds C10 = 2 * SIZE, C2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 
// A1 * B7 (p5) adds C11 = 2 * SIZE, C3 } { .mfi nop __LINE__ FMA f120 = f32, f55, f120 // A1 * B8 (p5) adds C12 = 2 * SIZE, C4 } ;; { .mfi (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 (p5) adds C13 = 2 * SIZE, C5 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 (p5) adds C14 = 2 * SIZE, C6 } ;; { .mfi (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 (p5) adds C15 = 2 * SIZE, C7 } { .mfi nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 (p5) adds C16 = 2 * SIZE, C8 } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f106 = f34, f53, f106 // A3 * B6 nop __LINE__ } { .mfb nop __LINE__ FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f122 = f34, f55, f122 // A3 * B8 nop __LINE__ } { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } { .mfb nop __LINE__ FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f107 = f35, f53, f107 // A4 * B6 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f123 = f35, f55, f123 // A4 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f70 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f69 = [C1 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f71 = [C9 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f76 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f78 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f77 = [C2 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f79 = [C10], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f121 = f41, f63, f121 // A2 * B8 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f84 = [C3 ], SIZE #else nop __LINE__ #endif (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f86 = [C11], SIZE #else nop __LINE__ #endif (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f85 = [C3 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f87 = [C11], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f92 = [C4 ], SIZE #else nop __LINE__ #endif (p3) FMA f98 = f42, f60, f98 // A3 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f94 = [C12], SIZE #else nop __LINE__ #endif (p3) FMA f106 = f42, f61, f106 // A3 * B6 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f93 = [C4 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f95 = [C12], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f122 = f42, f63, f122 // A3 * B8 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f100 = [C5 ], SIZE #else nop __LINE__ #endif (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f102 = [C13], SIZE #else nop __LINE__ #endif (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f101 = [C5 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f103 = [C13], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f91 = f43, f59, f91 // A4 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f108 = [C6 ], SIZE #else nop __LINE__ #endif (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f110 = [C14], SIZE #else nop __LINE__ #endif (p3) FMA f107 = f43, f61, f107 // A4 * B6 nop __LINE__ } ;; { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f109 = [C6 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f115 = f43, f62, f115 // A4 * B7 adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f111 = [C14], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f123 = f43, f63, f123 // A4 * B8 br.cloop.sptk.few .L022 } ;; .L028: #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) { .mfb LDFD f116 = [C7 ], SIZE FMA f64 = ALPHA, f64, f68 nop __LINE__ } { .mfb LDFD f118 = [C15], SIZE FMA f66 = ALPHA, f66, f70 nop __LINE__ } ;; { .mfb LDFD f117 = [C7 ], -1 * SIZE FMA f65 = ALPHA, f65, f69 nop __LINE__ } { .mfb LDFD f119 = [C15], -1 * SIZE FMA f67 = ALPHA, f67, f71 nop __LINE__ } ;; { .mfb LDFD f124 = [C8], SIZE FMA f72 = ALPHA, f72, f76 nop __LINE__ } { .mfb LDFD f126 = [C16], SIZE FMA f74 = ALPHA, f74, f78 nop __LINE__ } ;; { .mfb LDFD f125 = [C8], -1 * SIZE FMA f73 = ALPHA, f73, f77 nop __LINE__ } { .mfb LDFD f127 = [C16], -1 * SIZE FMA f75 = ALPHA, f75, f79 nop __LINE__ } ;; { .mfb STFD [C1 ] = f64, SIZE FMA f80 = ALPHA, f80, f84 nop __LINE__ } { .mfb STFD [C9 ] = f66, SIZE FMA f82 = ALPHA, f82, f86 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, 3 * SIZE FMA f81 = ALPHA, f81, f85 nop __LINE__ } { .mfb STFD [C9 ] = f67, 3 * SIZE FMA f83 = ALPHA, f83, f87 nop __LINE__ } ;; { .mfb STFD [C2 ] = f72, SIZE FMA f88 = ALPHA, f88, f92 nop __LINE__ } { .mfb STFD [C10] = f74, SIZE FMA f90 = ALPHA, f90, f94 nop __LINE__ } ;; { .mfb STFD [C2 ] = f73, 3 * SIZE FMA f89 = ALPHA, f89, f93 nop __LINE__ } { .mfb STFD [C10] = f75, 3 * SIZE FMA f91 = ALPHA, f91, f95 nop __LINE__ } ;; { .mfb STFD [C3 ] = f80, SIZE FMA f96 = ALPHA, f96, f100 nop __LINE__ } { .mfb STFD [C11] = f82, SIZE FMA f98 = ALPHA, f98, f102 nop __LINE__ } ;; { .mfb STFD [C3 ] = f81, 3 * SIZE FMA f97 = ALPHA, f97, f101 nop __LINE__ } { .mfb STFD [C11] = f83, 3 * SIZE FMA f99 = ALPHA, f99, f103 nop __LINE__ } ;; { .mfb STFD [C4 ] = f88, SIZE FMA f104 = ALPHA, f104, f108 nop __LINE__ } { .mfb STFD [C12] = f90, SIZE FMA f106 = ALPHA, f106, f110 nop __LINE__ } ;; { .mfb STFD [C4 ] = f89, 3 * SIZE FMA f105 = ALPHA, f105, f109 nop __LINE__ } { .mfb STFD [C12] = f91, 3 * SIZE FMA f107 = ALPHA, f107, f111 nop __LINE__ } ;; { .mfb STFD [C5 ] = f96, SIZE FMA f112 = ALPHA, f112, f116 nop __LINE__ } { .mfb STFD [C13] = f98, SIZE FMA f114 = ALPHA, f114, f118 nop __LINE__ } ;; { .mfb STFD [C5 ] = f97, 3 * SIZE FMA f113 = ALPHA, f113, f117 nop __LINE__ } { .mfb STFD [C13] = f99, 3 * SIZE FMA f115 = ALPHA, f115, f119 nop __LINE__ } ;; { .mfb STFD [C6 ] = f104, SIZE FMA f120 = ALPHA, f120, f124 nop __LINE__ } { .mfb STFD [C14] = f106, SIZE FMA f122 = ALPHA, f122, f126 nop __LINE__ } ;; { .mfb STFD [C6 ] = f105, 3 * SIZE FMA f121 = ALPHA, f121, f125 nop __LINE__ } { .mfb STFD [C14] = f107, 3 * SIZE FMA f123 = ALPHA, f123, f127 nop __LINE__ } ;; { .mfb STFD [C7 ] = f112, SIZE mov f64 = f0 nop __LINE__ } { .mfb STFD [C15] = f114, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfb STFD [C7 ] = f113, 3 * SIZE mov f80 = f0 nop __LINE__ } { .mfb STFD [C15] = f115, 3 * SIZE mov f88 = f0 nop __LINE__ } ;; { .mfb STFD [C8 ] = f120, SIZE mov f96 = f0 nop __LINE__ } { .mfb STFD [C16] = f122, SIZE mov f104 = f0 nop __LINE__ } ;; { .mfb STFD [C8 ] = f121, 3 * SIZE mov f112 = f0 nop __LINE__ } { .mfb STFD [C16] = f123, 3 * SIZE mov f120 = f0 nop __LINE__ } ;; #else { .mfb FMPY f64 = ALPHA, f64 nop __LINE__ } { .mfb FMPY f66 = ALPHA, f66 nop __LINE__ } ;; { .mfb FMPY f65 = ALPHA, f65 nop __LINE__ } { .mfb FMPY f67 = ALPHA, f67 nop __LINE__ } ;; { .mfb FMPY f72 = ALPHA, f72 nop __LINE__ } { .mfb FMPY f74 = ALPHA, f74 nop __LINE__ } ;; { .mfb FMPY f73 = ALPHA, f73 nop __LINE__ } { .mfb FMPY f75 = ALPHA, f75 nop __LINE__ } ;; { .mfb STFD [C1 ] = f64, SIZE FMPY f80 = ALPHA, f80 nop __LINE__ } { .mfb STFD [C9 ] = f66, SIZE FMPY f82 = ALPHA, f82 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, 3 * SIZE FMPY f81 = ALPHA, f81 nop 
__LINE__ } { .mfb STFD [C9 ] = f67, 3 * SIZE FMPY f83 = ALPHA, f83 nop __LINE__ } ;; { .mfb STFD [C2 ] = f72, SIZE FMPY f88 = ALPHA, f88 nop __LINE__ } { .mfb STFD [C10] = f74, SIZE FMPY f90 = ALPHA, f90 nop __LINE__ } ;; { .mfb STFD [C2 ] = f73, 3 * SIZE FMPY f89 = ALPHA, f89 nop __LINE__ } { .mfb STFD [C10] = f75, 3 * SIZE FMPY f91 = ALPHA, f91 nop __LINE__ } ;; { .mfb STFD [C3 ] = f80, SIZE FMPY f96 = ALPHA, f96 nop __LINE__ } { .mfb STFD [C11] = f82, SIZE FMPY f98 = ALPHA, f98 nop __LINE__ } ;; { .mfb STFD [C3 ] = f81, 3 * SIZE FMPY f97 = ALPHA, f97 nop __LINE__ } { .mfb STFD [C11] = f83, 3 * SIZE FMPY f99 = ALPHA, f99 nop __LINE__ } ;; { .mfb STFD [C4 ] = f88, SIZE FMPY f104 = ALPHA, f104 nop __LINE__ } { .mfb STFD [C12] = f90, SIZE FMPY f106 = ALPHA, f106 nop __LINE__ } ;; { .mfb STFD [C4 ] = f89, 3 * SIZE FMPY f105 = ALPHA, f105 nop __LINE__ } { .mfb STFD [C12] = f91, 3 * SIZE FMPY f107 = ALPHA, f107 nop __LINE__ } ;; { .mfb STFD [C5 ] = f96, SIZE FMPY f112 = ALPHA, f112 nop __LINE__ } { .mfb STFD [C13] = f98, SIZE FMPY f114 = ALPHA, f114 nop __LINE__ } ;; { .mfb STFD [C5 ] = f97, 3 * SIZE FMPY f113 = ALPHA, f113 nop __LINE__ } { .mfb STFD [C13] = f99, 3 * SIZE FMPY f115 = ALPHA, f115 nop __LINE__ } ;; { .mfi STFD [C6 ] = f104, SIZE FMPY f120 = ALPHA, f120 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfb STFD [C14] = f106, SIZE FMPY f122 = ALPHA, f122 nop __LINE__ } ;; { .mfi STFD [C6 ] = f105, 3 * SIZE FMPY f121 = ALPHA, f121 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -4, L #else nop __LINE__ #endif } { .mfi STFD [C14] = f107, 3 * SIZE FMPY f123 = ALPHA, f123 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -8, L #else nop __LINE__ #endif } ;; { .mfi STFD [C7 ] = f112, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C15] = f114, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C7 ] = f113, 3 * SIZE mov f80 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 2, AOFFSET #else nop __LINE__ #endif } { .mfi STFD [C15] = f115, 3 * SIZE mov f88 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 3, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C8 ] = f120, SIZE mov f96 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 4, KK #else nop __LINE__ #endif } { .mfb STFD [C16] = f122, SIZE mov f104 = f0 nop __LINE__ } ;; { .mfi STFD [C8 ] = f121, 3 * SIZE mov f112 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C16] = f123, 3 * SIZE mov f120 = f0 nop __LINE__ } ;; #endif .align 32 .L030: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 2, KK #else adds L = 8, KK #endif #endif tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L040 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mfi LDFPD f48, f49 = [B] mov f65 = f0 nop __LINE__ } { .mfi adds BOFFSET = 2 * SIZE, B mov f73 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } #else { .mmf shladd BOFFSET = KK8, 3, B shladd 
AOFFSET = KK8, 1, AOFFSET mov f65 = f0 } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f73 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } #endif ;; { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f81 = f0 tbit.z p12, p0 = L, 0 } { .mfi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f89 = f0 shr L = L, 1 } ;; { .mfi LDFPD f52, f53 = [BOFFSET], 2 * SIZE mov f97 = f0 adds L = -1, L } { .mfi nop __LINE__ mov f105 = f0 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET } ;; { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET mov f113 = f0 mov ar.lc = L } { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f121 = f0 cmp.eq p3, p0 = r0, r0 } ;; .align 32 .L032: { .mfb lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1], SIZE #else nop __LINE__ #endif (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f76 = [C2], SIZE #else nop __LINE__ #endif (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f69 = [C1], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f77 = [C2], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f84 = [C3], SIZE #else nop __LINE__ #endif (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f92 = [C4], SIZE #else nop __LINE__ #endif (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f85 = [C3], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f113 = f41, f62, f113 // A2 * B7 adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f93 = [C4], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f121 = f41, f63, f121 // A2 * B8 br.cloop.sptk.few .L032 } ;; .L038: #if! defined(TRMMKERNEL) && !defined(BETAZERO) { .mfb LDFD f100 = [C5], SIZE FMA f64 = ALPHA, f64, f68 nop __LINE__ } { .mfb LDFD f108 = [C6], SIZE FMA f65 = ALPHA, f65, f69 nop __LINE__ } ;; { .mfb LDFD f101 = [C5], -1 * SIZE FMA f72 = ALPHA, f72, f76 nop __LINE__ } { .mfb LDFD f109 = [C6], -1 * SIZE FMA f73 = ALPHA, f73, f77 nop __LINE__ } ;; { .mfb LDFD f116 = [C7], SIZE FMA f80 = ALPHA, f80, f84 nop __LINE__ } { .mfb LDFD f124 = [C8], SIZE FMA f81 = ALPHA, f81, f85 nop __LINE__ } ;; { .mfb LDFD f117 = [C7], -1 * SIZE FMA f88 = ALPHA, f88, f92 nop __LINE__ } { .mfb LDFD f125 = [C8], -1 * SIZE FMA f89 = ALPHA, f89, f93 nop __LINE__ } ;; { .mfb STFD [C1 ] = f64, SIZE FMA f96 = ALPHA, f96, f100 nop __LINE__ } { .mfb STFD [C2 ] = f72, SIZE FMA f104 = ALPHA, f104, f108 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, SIZE FMA f97 = ALPHA, f97, f101 nop __LINE__ } { .mfb STFD [C2 ] = f73, SIZE FMA f105 = ALPHA, f105, f109 nop __LINE__ } ;; { .mfb STFD [C3 ] = f80, SIZE FMA f112 = ALPHA, f112, f116 nop __LINE__ } { .mfb STFD [C4 ] = f88, SIZE FMA f120 = ALPHA, f120, f124 nop __LINE__ } ;; { .mfb STFD [C3 ] = f81, SIZE FMA f113 = ALPHA, f113, f117 nop __LINE__ } { .mfb STFD [C4 ] = f89, SIZE FMA f121 = ALPHA, f121, f125 nop __LINE__ } ;; { .mfb STFD [C5 ] = f96, SIZE mov f64 = f0 nop __LINE__ } { .mfb STFD [C6 ] = f104, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfb STFD [C5 ] = f97, SIZE mov f80 = f0 nop __LINE__ } { .mfb STFD [C6 ] = f105, SIZE mov f88 = f0 nop __LINE__ } ;; { .mfb STFD [C7 ] = f112, SIZE mov f96 = f0 nop __LINE__ } { .mfb STFD [C8 ] = f120, SIZE mov f104 = f0 nop __LINE__ } ;; { .mfb STFD [C7 ] = f113, SIZE mov f112 = f0 nop __LINE__ } { .mfb STFD [C8 ] = f121, SIZE mov f120 = f0 nop __LINE__ } ;; #else { .mfb nop __LINE__ FMPY f64 = ALPHA, f64 nop __LINE__ } { .mfb nop __LINE__ FMPY f65 = ALPHA, f65 nop __LINE__ } ;; { .mfb nop __LINE__ FMPY f72 = ALPHA, f72 nop __LINE__ } { .mfb nop __LINE__ FMPY f73 = ALPHA, f73 nop __LINE__ } ;; { .mfb nop __LINE__ FMPY f80 = ALPHA, f80 nop __LINE__ } { .mfb nop __LINE__ FMPY f81 = ALPHA, f81 nop __LINE__ } ;; { .mfb nop __LINE__ FMPY f88 = ALPHA, f88 nop __LINE__ } { .mfb nop __LINE__ FMPY f89 = ALPHA, f89 nop __LINE__ } ;; { .mfb STFD [C1 ] = f64, SIZE FMPY f96 = ALPHA, f96 nop __LINE__ } { .mfb STFD [C2 ] = f72, SIZE FMPY f104 = ALPHA, f104 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, SIZE FMPY f97 = ALPHA, f97 nop __LINE__ } { .mfb STFD [C2 ] = f73, SIZE FMPY f105 = ALPHA, f105 nop __LINE__ } ;; { .mfi STFD [C3 ] = f80, SIZE FMPY f112 = ALPHA, f112 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfb STFD [C4 ] = f88, SIZE FMPY f120 = ALPHA, f120 nop __LINE__ } ;; { .mfi STFD [C3 ] = f81, SIZE FMPY f113 = ALPHA, f113 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) 
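// TRMM pointer fix-up for this 2-wide block: L = K - KK (set above), presumably trimmed
// by the M unroll (2) here for LEFT && TRANSA and by the N unroll (8) just below for
// !LEFT && !TRANSA; KK8 = L << BASE_SHIFT then advances AOFFSET / BOFFSET past the
// untouched tail of the current A and B panels.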
adds L = -2, L #else nop __LINE__ #endif } { .mfi STFD [C4 ] = f89, SIZE FMPY f121 = ALPHA, f121 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -8, L #else nop __LINE__ #endif } ;; { .mfi STFD [C5 ] = f96, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C6 ] = f104, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C5 ] = f97, SIZE mov f80 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 1, AOFFSET #else nop __LINE__ #endif } { .mfi STFD [C6 ] = f105, SIZE mov f88 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 3, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C7 ] = f112, SIZE mov f96 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 2, KK #else nop __LINE__ #endif } { .mfb STFD [C8 ] = f120, SIZE mov f104 = f0 nop __LINE__ } ;; { .mfi STFD [C7 ] = f113, SIZE mov f112 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C8 ] = f121, SIZE mov f120 = f0 nop __LINE__ } ;; #endif .align 32 .L040: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 1, KK #else adds L = 8, KK #endif #endif tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L049 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmi LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } #else { .mmi shladd BOFFSET = KK8, 3, B add AOFFSET = KK8, AOFFSET nop __LINE__ } ;; { .mmi LDFPD f48, f49 = [BOFFSET], 2 * SIZE nop __LINE__ #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } #endif ;; { .mii LDFPD f50, f51 = [BOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi LDFPD f52, f53 = [BOFFSET], 2 * SIZE LDFD f32 = [AOFFSET], 1 * SIZE adds L = -1, L } ;; { .mmi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET cmp.eq p3, p0 = r0, r0 mov ar.lc = L } { .mmi LDFPD f54, f55 = [BOFFSET], 2 * SIZE adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET nop __LINE__ } ;; .align 32 .L042: { .mfb lfetch.nt1 [PREB], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfb (p12) cmp.ne p3, p0 = 0, L FMA f72 = f32, f49, f72 // A1 * B2 nop __LINE__ } ;; { .mfi (p3) LDFD f40 = [AOFFSET], 1 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1] #else nop __LINE__ #endif FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f76 = [C2] #else nop __LINE__ #endif FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f84 = [C3] #else nop __LINE__ #endif (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f92 = [C4] #else nop __LINE__ #endif (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfi (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 adds L = -1, L } { .mmb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f100 = [C5] (p5) LDFD f108 = [C6] #else nop __LINE__ nop __LINE__ #endif nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } { .mmb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f116 = [C7] (p5) LDFD f124 = [C8] #else nop __LINE__ nop __LINE__ #endif br.cloop.sptk.few .L042 } ;; #if! defined(TRMMKERNEL) && !defined(BETAZERO) FMA f64 = ALPHA, f64, f68 FMA f72 = ALPHA, f72, f76 FMA f80 = ALPHA, f80, f84 FMA f88 = ALPHA, f88, f92 FMA f96 = ALPHA, f96, f100 FMA f104 = ALPHA, f104, f108 FMA f112 = ALPHA, f112, f116 FMA f120 = ALPHA, f120, f124 ;; STFD [C1 ] = f64, SIZE mov f64 = f0 STFD [C2 ] = f72, SIZE mov f72 = f0 ;; STFD [C3 ] = f80, SIZE mov f80 = f0 STFD [C4 ] = f88, SIZE mov f88 = f0 ;; STFD [C5 ] = f96, SIZE mov f96 = f0 STFD [C6 ] = f104, SIZE mov f104 = f0 ;; STFD [C7 ] = f112, SIZE mov f112 = f0 STFD [C8 ] = f120, SIZE mov f120 = f0 ;; #else FMPY f64 = ALPHA, f64 FMPY f72 = ALPHA, f72 FMPY f80 = ALPHA, f80 FMPY f88 = ALPHA, f88 { .mfi FMPY f96 = ALPHA, f96 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f104 = ALPHA, f104 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f112 = ALPHA, f112 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -1, L #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f120 = ALPHA, f120 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -8, L #else nop __LINE__ #endif } ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfi STFD [C2 ] = f72, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C3 ] = f80, SIZE mov f80 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) add AOFFSET = KK8, AOFFSET #else nop __LINE__ #endif } { .mfi STFD [C4 ] = f88, SIZE mov f88 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 3, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C5 ] = f96, SIZE mov f96 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 1, KK #else nop __LINE__ #endif } { .mfi STFD [C6 ] = f104, SIZE mov f104 = f0 nop __LINE__ } ;; { .mfi STFD [C7 ] = f112, SIZE mov f112 = f0 #ifdef 
TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfi STFD [C8 ] = f120, SIZE mov f120 = f0 nop __LINE__ } ;; #endif .align 32 #endif .L049: { .mmi mov B = BOFFSET mov AOFFSET = A #if defined(TRMMKERNEL) && !defined(LEFT) adds KK = 8, KK #else nop __LINE__ #endif } ;; { .mmb nop __LINE__ cmp.lt p6, p0 = 0, J (p6) br.cond.dptk .L010 } ;; .align 32 .L050: { .mfi mov C1 = C mov f64 = f0 tbit.z p6, p0 = N, 2 } { .mfi add C2 = LDC, C mov f72 = f0 shr I = M, 3 } ;; { .mfi shladd C3 = LDC, 1, C mov f80 = f0 nop __LINE__ } { .mfb mov AOFFSET = A mov f88 = f0 (p6) br.cond.dpnt .L090 } ;; #if 0 { .mfi cmp.eq p6, p7 = 0, I mov f65 = f0 #if defined(TRMMKERNEL) && defined(LEFT) mov KK = OFFSET #else nop __LINE__ #endif } { .mfi shladd C4 = LDC, 1, C2 mov f73 = f0 nop __LINE__ } ;; { .mfi nop __LINE__ mov f81 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb shladd C = LDC, 2, C mov f89 = f0 (p6) br.cond.dpnt .L060 } ;; .align 32 .L052: #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mfb LDFPD f48, f49 = [B] mov f66 = f0 nop __LINE__ } { .mfb adds BOFFSET = 2 * SIZE, B mov f74 = f0 nop __LINE__ } ;; #else { .mfi shladd BOFFSET = KK8, 2, B mov f66 = f0 shladd AOFFSET = KK8, 3, AOFFSET } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f74 = f0 nop __LINE__ } ;; #endif ;; { .mfi LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f82 = f0 #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 8, KK #else adds L = 4, KK #endif #endif } { .mfi setf.d f84 = r0 mov f90 = f0 nop __LINE__ } ;; { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f67 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f75 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; { .mfi LDFPD f36, f37 = [AOFFSET], 2 * SIZE mov f83 = f0 tbit.z p12, p0 = L, 0 } { .mfi setf.d f91 = r0 mov f68 = f0 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } ;; { .mfi CPREFETCH [PREC], LDC mov f76 = f0 adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } { .mfi LDFPD f38, f39 = [AOFFSET], 2 * SIZE mov f92 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi CPREFETCH [PREC], LDC mov f69 = f0 shr L = L, 1 } { .mmf setf.d f77 = r0 setf.d f85 = r0 mov f93 = f0 } ;; { .mfi CPREFETCH [PREC], LDC mov f70 = f0 adds L = -1, L } { .mmf setf.d f78 = r0 setf.d f86 = r0 mov f94 = f0 } ;; { .mfi CPREFETCH [PREC] mov f71 = f0 mov ar.lc = L } { .mmf setf.d f79 = r0 setf.d f87 = r0 mov f95 = f0 } ;; .align 32 .L053: { .mfb lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 adds C9 = 4 * SIZE, C1 } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 adds C10 = 4 * SIZE, C2 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 adds C11 = 4 * SIZE, C3 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 adds C12 = 4 * SIZE, C4 } { .mfb nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop 
__LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f76 = f36, f49, f76 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f84 = f36, f50, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f92 = f36, f51, f92 // A5 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = f37, f49, f77 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f85 = f37, f50, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f93 = f37, f51, f93 // A6 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f78 = f38, f49, f78 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f86 = f38, f50, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f94 = f38, f51, f94 // A7 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f79 = f39, f49, f79 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f87 = f39, f50, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f95 = f39, f51, f95 // A8 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f96 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f97 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f98 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f99 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f100 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f101 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f102 = [C1 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f103 = [C9 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f91 = f43, f59, f91 // A4 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f104 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f105 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f76 = f44, f57, f76 // A5 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f106 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f84 = f44, f58, f84 // A5 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f107 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f92 = f44, f59, f92 // A5 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f108 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f109 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f77 = f45, f57, f77 // A6 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f110 = [C2 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f85 = f45, f58, f85 // A6 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f111 = [C10], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f93 = f45, f59, f93 // A6 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f112 = [C3 ], SIZE #else nop __LINE__ #endif (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f113 = [C11], SIZE #else nop __LINE__ #endif (p3) FMA f78 = f46, f57, f78 // A7 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f114 = [C3 ], SIZE #else nop __LINE__ #endif (p3) FMA f86 = f46, f58, f86 // A7 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f115 = [C11], SIZE #else nop __LINE__ #endif (p3) FMA f94 = f46, f59, f94 // A7 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f116 = [C3 ], SIZE #else nop __LINE__ #endif (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f117 = [C11], SIZE #else nop __LINE__ #endif (p3) FMA f79 = f47, f57, f79 // A8 * B2 nop __LINE__ } ;; { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f118 = [C3 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f87 = f47, f58, f87 // A8 * B3 adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f119 = [C11], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f95 = f47, f59, f95 // A8 * B4 br.cloop.sptk.few .L053 } ;; .align 32 .L058: #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) { .mfi LDFD f120 = [C4 ], SIZE FMA f64 = ALPHA, f64, f96 cmp.ne p6, p0 = 1, I } { .mfb LDFD f121 = [C12], SIZE FMA f68 = ALPHA, f68, f97 nop __LINE__ } ;; { .mfi LDFD f122 = [C4 ], SIZE FMA f65 = ALPHA, f65, f98 adds I = -1, I } { .mfb LDFD f123 = [C12], SIZE FMA f69 = ALPHA, f69, f99 nop __LINE__ } ;; { .mfb LDFD f124 = [C4 ], SIZE FMA f66 = ALPHA, f66, f100 nop __LINE__ } { .mfb LDFD f125 = [C12], SIZE FMA f70 = ALPHA, f70, f101 nop __LINE__ } ;; { .mfb LDFD f126 = [C4 ], -3 * SIZE FMA f67 = ALPHA, f67, f102 nop __LINE__ } { .mfb LDFD f127 = [C12], -3 * SIZE FMA f71 = ALPHA, f71, f103 nop __LINE__ } ;; { .mfb STFD [C1 ] = f64, SIZE FMA f72 = ALPHA, f72, f104 nop __LINE__ } { .mfb STFD [C9 ] = f68, SIZE FMA f76 = ALPHA, f76, f105 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, SIZE FMA f73 = ALPHA, f73, f106 nop __LINE__ } { .mfb STFD [C9 ] = f69, SIZE FMA f77 = ALPHA, f77, f107 nop __LINE__ } ;; { .mfb STFD [C1 ] = f66, SIZE FMA f74 = ALPHA, f74, f108 nop __LINE__ } { .mfb STFD [C9 ] = f70, SIZE FMA f78 = ALPHA, f78, f109 nop __LINE__ } ;; { .mfb STFD [C1 ] = f67, 5 * SIZE FMA f75 = ALPHA, f75, f110 nop __LINE__ } { .mfb STFD [C9 ] = f71, 5 * SIZE FMA f79 = ALPHA, f79, f111 nop __LINE__ } ;; { .mfb STFD [C2 ] = f72, SIZE FMA f80 = ALPHA, f80, f112 nop __LINE__ } { .mfb STFD [C10] = f76, SIZE FMA f84 = ALPHA, f84, f113 nop __LINE__ } ;; { .mfb STFD [C2 ] = f73, SIZE FMA f81 = ALPHA, f81, f114 nop __LINE__ } { .mfb STFD [C10] = f77, SIZE FMA f85 = ALPHA, f85, f115 nop __LINE__ } ;; { .mfb STFD [C2 ] = f74, SIZE FMA f82 = ALPHA, f82, f116 nop __LINE__ } { .mfb STFD [C10] = f78, SIZE FMA f86 = ALPHA, f86, f117 nop __LINE__ } ;; { .mfb STFD [C2 ] = f75, 5 * SIZE FMA f83 = ALPHA, f83, f118 nop __LINE__ } { .mfb STFD [C10] = f79, 5 * SIZE FMA f87 = ALPHA, f87, f119 nop __LINE__ } ;; { .mfb STFD [C3 ] = f80, SIZE FMA f88 = ALPHA, f88, f120 nop __LINE__ } { .mfb STFD [C11] = f84, SIZE FMA f92 = ALPHA, f92, f121 nop __LINE__ } ;; { .mfb STFD [C3 ] = f81, SIZE FMA f89 = ALPHA, f89, f122 nop __LINE__ } { .mfb STFD [C11] = f85, SIZE FMA f93 = ALPHA, f93, f123 nop __LINE__ } ;; { .mfb STFD [C3 ] = f82, SIZE FMA f90 = ALPHA, f90, f124 nop __LINE__ } { .mfb STFD [C11] = f86, SIZE FMA f94 = ALPHA, f94, f125 nop __LINE__ } ;; { .mfb STFD [C3 ] = f83, 5 * SIZE FMA f91 = ALPHA, f91, f126 nop __LINE__ } { .mfb STFD [C11] = f87, 5 * SIZE FMA f95 = ALPHA, f95, f127 nop __LINE__ } ;; { .mfb STFD [C4 ] = f88, SIZE mov f64 = f0 nop __LINE__ } { .mfb STFD [C12] = f92, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfb STFD [C4 ] = f89, SIZE mov f80 = f0 nop __LINE__ } { .mfb STFD [C12] = f93, SIZE mov f88 = f0 nop __LINE__ } ;; { .mfb STFD [C4 ] = f90, SIZE mov f65 = f0 nop __LINE__ } { .mfb STFD [C12] = f94, SIZE mov f73 = f0 nop __LINE__ } ;; { .mfb STFD [C4 ] = f91, 5 * SIZE mov f81 = f0 nop __LINE__ } { .mfb STFD [C12] = f95, 5 * SIZE mov f89 = f0 (p6) br.cond.dptk .L052 } ;; #else { .mfi nop __LINE__ FMPY f64 = ALPHA, f64 cmp.ne p6, p0 = 1, I } { .mfb nop __LINE__ FMPY f68 = ALPHA, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f65 = ALPHA, f65 adds I = -1, I } { .mfb nop __LINE__ FMPY f69 = ALPHA, f69 nop __LINE__ } ;; { .mfb nop __LINE__ FMPY f66 = ALPHA, f66 nop __LINE__ } { .mfb nop __LINE__ FMPY f70 = ALPHA, f70 nop __LINE__ } ;; { .mfb nop __LINE__ FMPY f67 = ALPHA, f67 nop __LINE__ } { .mfb nop __LINE__ FMPY f71 = ALPHA, f71 nop __LINE__ } ;; { .mfb STFD [C1 ] = f64, SIZE FMPY f72 = ALPHA, f72 nop __LINE__ } { .mfb STFD [C9 ] = f68, SIZE FMPY f76 = ALPHA, f76 nop __LINE__ } 
;; { .mfb STFD [C1 ] = f65, SIZE FMPY f73 = ALPHA, f73 nop __LINE__ } { .mfb STFD [C9 ] = f69, SIZE FMPY f77 = ALPHA, f77 nop __LINE__ } ;; { .mfb STFD [C1 ] = f66, SIZE FMPY f74 = ALPHA, f74 nop __LINE__ } { .mfb STFD [C9 ] = f70, SIZE FMPY f78 = ALPHA, f78 nop __LINE__ } ;; { .mfb STFD [C1 ] = f67, 5 * SIZE FMPY f75 = ALPHA, f75 nop __LINE__ } { .mfb STFD [C9 ] = f71, 5 * SIZE FMPY f79 = ALPHA, f79 nop __LINE__ } ;; { .mfb STFD [C2 ] = f72, SIZE FMPY f80 = ALPHA, f80 nop __LINE__ } { .mfb STFD [C10] = f76, SIZE FMPY f84 = ALPHA, f84 nop __LINE__ } ;; { .mfb STFD [C2 ] = f73, SIZE FMPY f81 = ALPHA, f81 nop __LINE__ } { .mfb STFD [C10] = f77, SIZE FMPY f85 = ALPHA, f85 nop __LINE__ } ;; { .mfb STFD [C2 ] = f74, SIZE FMPY f82 = ALPHA, f82 nop __LINE__ } { .mfb STFD [C10] = f78, SIZE FMPY f86 = ALPHA, f86 nop __LINE__ } ;; { .mfb STFD [C2 ] = f75, 5 * SIZE FMPY f83 = ALPHA, f83 nop __LINE__ } { .mfb STFD [C10] = f79, 5 * SIZE FMPY f87 = ALPHA, f87 nop __LINE__ } ;; { .mfb STFD [C3 ] = f80, SIZE FMPY f88 = ALPHA, f88 nop __LINE__ } { .mfb STFD [C11] = f84, SIZE FMPY f92 = ALPHA, f92 nop __LINE__ } ;; { .mfb STFD [C3 ] = f81, SIZE FMPY f89 = ALPHA, f89 nop __LINE__ } { .mfb STFD [C11] = f85, SIZE FMPY f93 = ALPHA, f93 nop __LINE__ } ;; { .mfi STFD [C3 ] = f82, SIZE FMPY f90 = ALPHA, f90 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfb STFD [C11] = f86, SIZE FMPY f94 = ALPHA, f94 nop __LINE__ } ;; { .mfi STFD [C3 ] = f83, 5 * SIZE FMPY f91 = ALPHA, f91 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -8, L #else nop __LINE__ #endif } { .mfi STFD [C11] = f87, 5 * SIZE FMPY f95 = ALPHA, f95 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -4, L #else nop __LINE__ #endif } ;; { .mfi STFD [C4 ] = f88, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C12] = f92, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C4 ] = f89, SIZE mov f80 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 3, AOFFSET #else nop __LINE__ #endif } { .mfi STFD [C12] = f93, SIZE mov f88 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 2, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C4 ] = f90, SIZE mov f65 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 8, KK #else nop __LINE__ #endif } { .mfb STFD [C12] = f94, SIZE mov f73 = f0 nop __LINE__ } ;; { .mfi STFD [C4 ] = f91, 5 * SIZE mov f81 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C12] = f95, 5 * SIZE mov f89 = f0 (p6) br.cond.dptk .L052 } ;; #endif .align 32 .L060: { .mfi nop __LINE__ mov f66 = f0 tbit.z p6, p7 = M, 2 } { .mfb #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 4, KK #else adds L = 4, KK #endif #endif mov f74 = f0 (p6) br.cond.dptk .L070 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mfb LDFPD f48, f49 = [B] mov f82 = f0 nop __LINE__ } { .mfi adds BOFFSET = 2 * SIZE, B mov f90 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L 
#endif } ;; #else { .mfi shladd BOFFSET = KK8, 2, B mov f82 = f0 shladd AOFFSET = KK8, 2, AOFFSET } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f90 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #endif ;; { .mii LDFPD f32, f33 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f67 = f0 adds L = -1, L } { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET mov f75 = f0 nop __LINE__ } ;; { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 mov ar.lc = L } { .mfi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET mov f91 = f0 cmp.eq p3, p0 = r0, r0 } ;; .align 32 .L062: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 (p5) adds C9 = 2 * SIZE, C1 } { .mfi nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 (p5) adds C10 = 2 * SIZE, C2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 (p5) adds C11 = 2 * SIZE, C3 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 (p5) adds C12 = 2 * SIZE, C4 } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1 ], SIZE #else nop __LINE__ #endif FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f70 = [C9 ], SIZE #else nop __LINE__ #endif FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f69 = [C1 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f71 = [C9 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f76 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f78 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f77 = [C2 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f79 = [C10], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f84 = [C3 ], SIZE #else nop __LINE__ #endif (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f86 = [C11], SIZE #else nop __LINE__ #endif (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f85 = [C3 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f87 = [C11], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f92 = [C4 ], SIZE #else nop __LINE__ #endif (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f94 = [C12], SIZE #else nop __LINE__ #endif (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f93 = [C4 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f83 = f43, f58, f83 // A4 * B3 adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f95 = [C12], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f91 = f43, f59, f91 // A4 * B4 br.cloop.sptk.few .L062 } ;; #if! defined(TRMMKERNEL) && !defined(BETAZERO) FMA f64 = ALPHA, f64, f68 FMA f66 = ALPHA, f66, f70 FMA f65 = ALPHA, f65, f69 FMA f67 = ALPHA, f67, f71 FMA f72 = ALPHA, f72, f76 FMA f74 = ALPHA, f74, f78 FMA f73 = ALPHA, f73, f77 FMA f75 = ALPHA, f75, f79 ;; { .mfb STFD [C1 ] = f64, SIZE FMA f80 = ALPHA, f80, f84 nop __LINE__ } { .mfb STFD [C9 ] = f66, SIZE FMA f82 = ALPHA, f82, f86 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, 3 * SIZE FMA f81 = ALPHA, f81, f85 nop __LINE__ } { .mfb STFD [C9 ] = f67, 3 * SIZE FMA f83 = ALPHA, f83, f87 nop __LINE__ } ;; { .mfb STFD [C2 ] = f72, SIZE FMA f88 = ALPHA, f88, f92 nop __LINE__ } { .mfb STFD [C10] = f74, SIZE FMA f90 = ALPHA, f90, f94 nop __LINE__ } ;; { .mfb STFD [C2 ] = f73, 3 * SIZE FMA f89 = ALPHA, f89, f93 nop __LINE__ } { .mfb STFD [C10] = f75, 3 * SIZE FMA f91 = ALPHA, f91, f95 nop __LINE__ } ;; { .mfb STFD [C3 ] = f80, SIZE mov f80 = f0 nop __LINE__ } { .mfb STFD [C11] = f82, SIZE mov f64 = f0 nop __LINE__ } ;; { .mfb STFD [C3 ] = f81, 3 * SIZE mov f81 = f0 nop __LINE__ } { .mfb STFD [C11] = f83, 3 * SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C4 ] = f88, SIZE mov f88 = f0 adds L = 1, K } { .mfb STFD [C12] = f90, SIZE mov f65 = f0 nop __LINE__ } ;; { .mfi STFD [C4 ] = f89, 3 * SIZE mov f89 = f0 shr L = L, 1 } { .mfb STFD [C12] = f91, 3 * SIZE mov f73 = f0 nop __LINE__ } ;; #else FMPY f64 = ALPHA, f64 FMPY f66 = ALPHA, f66 FMPY f65 = ALPHA, f65 FMPY f67 = ALPHA, f67 FMPY f72 = ALPHA, f72 FMPY f74 = ALPHA, f74 FMPY f73 = ALPHA, f73 FMPY f75 = ALPHA, f75 ;; { .mfb STFD [C1 ] = f64, SIZE FMPY f80 = ALPHA, f80 nop __LINE__ } { .mfb STFD [C9 ] = f66, SIZE FMPY f82 = ALPHA, f82 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, 3 * SIZE FMPY f81 = ALPHA, f81 nop __LINE__ } { .mfb STFD [C9 ] = f67, 3 * SIZE FMPY f83 = ALPHA, f83 nop __LINE__ } ;; { .mfi STFD [C2 ] = f72, SIZE FMPY f88 = ALPHA, f88 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfb STFD [C10] = f74, SIZE FMPY f90 = ALPHA, f90 nop __LINE__ } ;; { .mfi STFD [C2 ] = f73, 3 * SIZE FMPY f89 = ALPHA, f89 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -4, L 
#else nop __LINE__ #endif } { .mfi STFD [C10] = f75, 3 * SIZE FMPY f91 = ALPHA, f91 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -4, L #else nop __LINE__ #endif } ;; { .mfi STFD [C3 ] = f80, SIZE mov f80 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C11] = f82, SIZE mov f64 = f0 nop __LINE__ } ;; { .mfi STFD [C3 ] = f81, 3 * SIZE mov f81 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 2, AOFFSET #else nop __LINE__ #endif } { .mfi STFD [C11] = f83, 3 * SIZE mov f72 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 2, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C4 ] = f88, SIZE mov f88 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 4, KK #else nop __LINE__ #endif } { .mfb STFD [C12] = f90, SIZE mov f65 = f0 nop __LINE__ } ;; { .mfi STFD [C4 ] = f89, 3 * SIZE mov f89 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C12] = f91, 3 * SIZE mov f73 = f0 nop __LINE__ } ;; #endif .align 32 .L070: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 2, KK #else adds L = 4, KK #endif #endif tbit.z p6,p7 = M, 1 (p6) br.cond.dptk .L080 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmi LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #else { .mmi shladd BOFFSET = KK8, 2, B shladd AOFFSET = KK8, 1, AOFFSET nop __LINE__ } ;; { .mmi LDFPD f48, f49 = [BOFFSET], 2 * SIZE #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif nop __LINE__ } ;; #endif { .mii cmp.eq p3, p0 = r0, r0 tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE adds L = -1, L adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } ;; { .mmi LDFPD f50, f51 = [BOFFSET], 2 * SIZE adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET mov ar.lc = L } ;; .align 32 .L072: { .mfb lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 } { .mmf #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1 ], SIZE (p5) LDFD f76 = [C2 ], SIZE #else nop __LINE__ nop __LINE__ #endif FMA f89 = f33, f51, f89 // A2 * B4 } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mmf #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f69 = [C1 ], -1 * SIZE (p5) LDFD f77 = [C2 ], -1 * SIZE #else nop __LINE__ nop __LINE__ #endif (p3) FMA f72 = f40, f57, f72 // A1 * B2 } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mmf #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f84 = [C3 ], SIZE (p5) LDFD f92 = [C4 ], SIZE #else nop __LINE__ nop __LINE__ #endif (p3) FMA f88 = f40, f59, f88 // A1 * B4 } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f85 = [C3 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f93 = [C4 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f89 = f41, f59, f89 // A2 * B4 br.cloop.sptk.few .L072 } ;; #if! defined(TRMMKERNEL) && !defined(BETAZERO) FMA f64 = ALPHA, f64, f68 FMA f65 = ALPHA, f65, f69 FMA f72 = ALPHA, f72, f76 FMA f73 = ALPHA, f73, f77 FMA f80 = ALPHA, f80, f84 FMA f81 = ALPHA, f81, f85 FMA f88 = ALPHA, f88, f92 FMA f89 = ALPHA, f89, f93 ;; { .mfb STFD [C1 ] = f64, SIZE mov f64 = f0 nop __LINE__ } { .mfb STFD [C2 ] = f72, SIZE mov f72 = f0 nop __LINE__ } ;; { .mmi STFD [C1 ] = f65, SIZE STFD [C2 ] = f73, SIZE nop __LINE__ } ;; { .mfi STFD [C3 ] = f80, SIZE mov f80 = f0 adds L = 1, K } { .mfb STFD [C4 ] = f88, SIZE mov f88 = f0 nop __LINE__ } ;; { .mmi STFD [C3 ] = f81, SIZE STFD [C4 ] = f89, SIZE shr L = L, 1 } ;; #else FMPY f64 = ALPHA, f64 FMPY f65 = ALPHA, f65 ;; { .mfi nop __LINE__ FMPY f72 = ALPHA, f72 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f73 = ALPHA, f73 nop __LINE__ } ;; { .mfi FMPY f80 = ALPHA, f80 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -2, L #else nop __LINE__ #endif } { .mfi FMPY f81 = ALPHA, f81 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -4, L #else nop __LINE__ #endif } ;; { .mfi nop __LINE__ FMPY f88 = ALPHA, f88 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f89 = ALPHA, f89 nop __LINE__ } ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 1, AOFFSET #else nop __LINE__ #endif } { .mfb STFD [C2 ] = f72, SIZE mov f72 = f0 nop __LINE__ } ;; { .mmi STFD [C1 ] = f65, SIZE STFD [C2 ] = f73, SIZE #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 2, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C3 ] = f80, SIZE mov f80 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 2, KK #else nop __LINE__ #endif } { .mfb STFD [C4 ] = f88, SIZE mov f88 = f0 nop __LINE__ } ;; { .mmi STFD [C3 ] = f81, SIZE STFD [C4 ] = f89, SIZE #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; #endif .align 32 .L080: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 1, KK #else adds L = 4, KK #endif #endif tbit.z p6,p7 = M, 0 (p6) br.cond.dptk .L089 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmi LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B #ifndef TRMMKERNEL adds L = 1, K #else 
adds L = 1, L #endif } ;; #else { .mmi shladd BOFFSET = KK8, 2, B add AOFFSET = KK8, AOFFSET nop __LINE__ } ;; { .mmi LDFPD f48, f49 = [BOFFSET], 2 * SIZE #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif nop __LINE__ } ;; #endif { .mii LDFD f32 = [AOFFSET], 1 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi nop __LINE__ nop __LINE__ adds L = -1, L } ;; { .mmi LDFPD f50, f51 = [BOFFSET], 2 * SIZE cmp.eq p3, p0 = r0, r0 mov ar.lc = L } ;; .align 32 .L082: { .mfb cmp.ne p4, p5 = 0, L FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi (p12) cmp.ne p3, p0 = 0, L FMA f72 = f32, f49, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfb (p3) LDFD f40 = [AOFFSET], 1 * SIZE FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1] #else nop __LINE__ #endif (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mmf (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 } { .mmf #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f76 = [C2] (p5) LDFD f84 = [C3] #else nop __LINE__ nop __LINE__ #endif (p3) FMA f88 = f40, f59, f88 // A1 * B4 } ;; { .mib (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE nop __LINE__ nop __LINE__ } { .mmb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f92 = [C4] #else nop __LINE__ #endif adds L = -1, L br.cloop.sptk.few .L082 } ;; #if! defined(TRMMKERNEL) && !defined(BETAZERO) FMA f64 = ALPHA, f64, f68 FMA f72 = ALPHA, f72, f76 FMA f80 = ALPHA, f80, f84 FMA f88 = ALPHA, f88, f92 ;; STFD [C1 ] = f64, SIZE STFD [C2 ] = f72, SIZE STFD [C3 ] = f80, SIZE STFD [C4 ] = f88, SIZE ;; #else { .mfi nop __LINE__ FMPY f64 = ALPHA, f64 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f72 = ALPHA, f72 nop __LINE__ } ;; { .mfi FMPY f80 = ALPHA, f80 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -1, L #else nop __LINE__ #endif } { .mfi FMPY f88 = ALPHA, f88 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -4, L #else nop __LINE__ #endif } ;; { .mmi #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif ;; #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) add AOFFSET = KK8, AOFFSET #else nop __LINE__ #endif #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 2, BOFFSET #else nop __LINE__ #endif } ;; { .mmi STFD [C1 ] = f64, SIZE STFD [C2 ] = f72, SIZE #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 1, KK #else nop __LINE__ #endif } ;; { .mmi STFD [C3 ] = f80, SIZE STFD [C4 ] = f88, SIZE #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; #endif .align 32 .L089: { .mmi mov B = BOFFSET mov AOFFSET = A #if defined(TRMMKERNEL) && !defined(LEFT) adds KK = 4, KK #else nop __LINE__ #endif } ;; .align 16 #endif .L090: { .mfi mov C1 = C mov f64 = f0 tbit.z p6, p0 = N, 1 } { .mfi add C2 = LDC, C mov f72 = f0 shr I = M, 3 } ;; { .mfi setf.d f66 = r0 mov f65 = f0 #if defined(TRMMKERNEL) && defined(LEFT) mov KK = 
OFFSET #else nop __LINE__ #endif } { .mfb mov AOFFSET = A mov f73 = f0 (p6) br.cond.dpnt .L130 } ;; #if 0 { .mfi #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif mov f67 = f0 shladd C = LDC, 1, C } { .mfb cmp.eq p6, p7 = 0, I mov f74 = f0 (p6) br.cond.dpnt .L100 } ;; .align 32 .L092: #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mfb LDFPD f48, f49 = [B] mov f68 = f0 nop __LINE__ } { .mfb adds BOFFSET = 2 * SIZE, B mov f79 = f0 nop __LINE__ } ;; #else { .mfi shladd BOFFSET = KK8, 1, B mov f68 = f0 shladd AOFFSET = KK8, 3, AOFFSET } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f79 = f0 nop __LINE__ } ;; #endif { .mfi LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f75 = f0 #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 8, KK #else adds L = 2, KK #endif #endif } ;; { .mfi adds PREC = CPREFETCHSIZE * SIZE, C1 mov f76 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f69 = f0 tbit.z p12, p0 = L, 0 } { .mfi cmp.eq p3, p0 = r0, r0 mov f77 = f0 shr L = L, 1 } ;; { .mfi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET adds L = -1, L } { .mmf LDFPD f36, f37 = [AOFFSET], 2 * SIZE CPREFETCH [PREC], LDC mov f70 = f0 } ;; { .mfi LDFPD f38, f39 = [AOFFSET], 2 * SIZE mov f78 = f0 mov ar.lc = L } { .mfi CPREFETCH [PREC] mov f71 = f0 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } ;; .align 32 .L093: /* 1 */ { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 adds C9 = 4 * SIZE, C1 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 adds C10 = 4 * SIZE, C2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 adds C11 = 4 * SIZE, C3 } { .mfi nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 adds C12 = 4 * SIZE, C4 } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f96 = [C1 ], SIZE #else nop __LINE__ #endif FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f97 = [C9 ], SIZE #else nop __LINE__ #endif FMA f76 = f36, f49, f76 // A5 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f98 = [C1 ], SIZE #else nop __LINE__ #endif FMA f77 = f37, f49, f77 // A6 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f99 = [C9 ], SIZE #else nop __LINE__ #endif FMA f78 = f38, f49, f78 // A7 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f100 = [C1 ], SIZE #else nop __LINE__ #endif FMA f79 = f39, f49, f79 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f101 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f102 = [C1 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f103 = [C9 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f104 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f105 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f76 = f44, f57, f76 // A5 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f106 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f107 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f77 = f45, f57, f77 // A6 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f108 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f109 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f78 = f46, f57, f78 // A7 * B2 nop __LINE__ } ;; { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f110 = [C2 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f71 = f47, f56, f71 // A8 * B1 adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f111 = [C10], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f79 = f47, f57, f79 // A8 * B2 br.cloop.sptk.few .L093 } ;; #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) { .mfi nop __LINE__ FMA f64 = ALPHA, f64, f96 cmp.ne p6, p0 = 1, I } { .mfb nop __LINE__ FMA f68 = ALPHA, f68, f97 nop __LINE__ } ;; { .mfi nop __LINE__ FMA f65 = ALPHA, f65, f98 adds I = -1, I } { .mfb nop __LINE__ FMA f69 = ALPHA, f69, f99 nop __LINE__ } ;; { .mfi nop __LINE__ FMA f66 = ALPHA, f66, f100 nop __LINE__ } { .mfb nop __LINE__ FMA f70 = ALPHA, f70, f101 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f67 = ALPHA, f67, f102 nop __LINE__ } { .mfb nop __LINE__ FMA f71 = ALPHA, f71, f103 nop __LINE__ } ;; { .mfb STFD [C1 ] = f64, SIZE FMA f72 = ALPHA, f72, f104 nop __LINE__ } { .mfb STFD [C9 ] = f68, SIZE FMA f76 = ALPHA, f76, f105 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, SIZE FMA f73 = ALPHA, f73, f106 nop __LINE__ } { .mfb STFD [C9 ] = f69, SIZE FMA f77 = ALPHA, f77, f107 nop __LINE__ } ;; { .mfb STFD [C1 ] = f66, SIZE FMA f74 = ALPHA, f74, f108 nop __LINE__ } { .mfb STFD [C9 ] = f70, SIZE FMA f78 = ALPHA, f78, f109 nop __LINE__ } ;; { .mfb STFD [C1 ] = f67, 5 * SIZE FMA f75 = ALPHA, f75, f110 nop __LINE__ } { .mfb STFD [C9 ] = f71, 5 * SIZE FMA f79 = ALPHA, f79, f111 nop __LINE__ } ;; { .mfb STFD [C2 ] = f72, SIZE mov f64 = f0 nop __LINE__ } { .mfb STFD [C10] = f76, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfb STFD [C2 ] = f73, SIZE mov f65 = f0 nop __LINE__ } { .mfb STFD [C10] = f77, SIZE mov f73 = f0 nop __LINE__ } ;; { .mfb STFD [C2 ] = f74, SIZE mov f66 = f0 nop __LINE__ } { .mfb STFD [C10] = f78, SIZE mov f74 = f0 nop __LINE__ } ;; { .mfb STFD [C2 ] = f75, 5 * SIZE mov f67 = f0 nop __LINE__ } { .mfb STFD [C10] = f79, 5 * SIZE (p6) br.cond.dptk .L092 } ;; #else { .mfi nop __LINE__ FMPY f64 = ALPHA, f64 cmp.ne p6, p0 = 1, I } { .mfb nop __LINE__ FMPY f68 = ALPHA, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f65 = ALPHA, f65 adds I = -1, I } { .mfb nop __LINE__ FMPY f69 = ALPHA, f69 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f66 = ALPHA, f66 nop __LINE__ } { .mfb nop __LINE__ FMPY f70 = ALPHA, f70 nop __LINE__ } ;; { .mfb nop __LINE__ FMPY f67 = ALPHA, f67 nop __LINE__ } { .mfb nop __LINE__ FMPY f71 = ALPHA, f71 nop __LINE__ } ;; { .mfb STFD [C1 ] = f64, SIZE FMPY f72 = ALPHA, f72 nop __LINE__ } { .mfb STFD [C9 ] = f68, SIZE FMPY f76 = ALPHA, f76 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, SIZE FMPY f73 = ALPHA, f73 nop __LINE__ } { .mfb STFD [C9 ] = f69, SIZE FMPY f77 = ALPHA, f77 nop __LINE__ } ;; { .mfi STFD [C1 ] = f66, SIZE FMPY f74 = ALPHA, f74 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfb STFD [C9 ] = f70, SIZE FMPY f78 = ALPHA, f78 nop __LINE__ } ;; { .mfi STFD [C1 ] = f67, 5 * SIZE FMPY f75 = ALPHA, f75 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -8, L #else nop __LINE__ #endif } { .mfi STFD [C9 ] = f71, 5 * SIZE FMPY f79 = ALPHA, f79 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -2, L #else nop __LINE__ #endif } ;; { .mfi STFD [C2 ] = f72, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C10] = f76, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C2 ] = f73, SIZE mov f65 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 3, AOFFSET #else nop __LINE__ #endif } { .mfi STFD [C10] = f77, SIZE mov f73 = f0 #if defined(TRMMKERNEL) && \ 
((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 1, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C2 ] = f74, SIZE mov f66 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 8, KK #else nop __LINE__ #endif } { .mfb STFD [C10] = f78, SIZE mov f74 = f0 nop __LINE__ } ;; { .mfi STFD [C2 ] = f75, 5 * SIZE mov f67 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mib STFD [C10] = f79, 5 * SIZE nop __LINE__ (p6) br.cond.dptk .L092 } ;; #endif .align 32 .L100: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 4, KK #else adds L = 2, KK #endif #endif tbit.z p6, p7 = M, 2 (p6) br.cond.dptk .L110 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmf LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B mov f75 = f0 } { .mii nop __LINE__ #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #else { .mfi shladd BOFFSET = KK8, 1, B mov f75 = f0 shladd AOFFSET = KK8, 2, AOFFSET } ;; { .mmi LDFPD f48, f49 = [BOFFSET], 2 * SIZE nop __LINE__ adds L = 1, L } ;; #endif ;; { .mii adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi LDFPD f32, f33 = [AOFFSET], 2 * SIZE nop __LINE__ adds L = -1, L } ;; { .mmi LDFPD f34, f35 = [AOFFSET], 2 * SIZE cmp.eq p3, p0 = r0, r0 mov ar.lc = L } ;; .align 32 .L102: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 adds C9 = 2 * SIZE, C1 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 adds C10 = 2 * SIZE, C2 } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1 ], SIZE #else nop __LINE__ #endif FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f70 = [C9 ], SIZE #else nop __LINE__ #endif FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f69 = [C1 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f71 = [C9 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f76 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 adds L = -1, L } { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f78 = [C10], SIZE #else nop __LINE__ #endif (p3) FMA f75 = f43, f57, f75 // A4 * B2 br.cloop.sptk.few .L102 } ;; #if! defined(TRMMKERNEL) && !defined(BETAZERO) { .mfb LDFD f77 = [C2 ], -1 * SIZE FMA f64 = ALPHA, f64, f68 nop __LINE__ } { .mfb LDFD f79 = [C10], -1 * SIZE FMA f66 = ALPHA, f66, f70 nop __LINE__ } ;; FMA f65 = ALPHA, f65, f69 adds L = 1, K FMA f67 = ALPHA, f67, f71 ;; FMA f72 = ALPHA, f72, f76 shr L = L, 1 FMA f74 = ALPHA, f74, f78 FMA f73 = ALPHA, f73, f77 FMA f75 = ALPHA, f75, f79 ;; { .mmf STFD [C1 ] = f64, SIZE STFD [C9 ] = f66, SIZE mov f64 = f0 } ;; { .mmf STFD [C1 ] = f65, 3 * SIZE STFD [C9 ] = f67, 3 * SIZE mov f65 = f0 } ;; { .mmf STFD [C2 ] = f72, SIZE STFD [C10] = f74, SIZE mov f72 = f0 } ;; { .mmf STFD [C2 ] = f73, 3 * SIZE STFD [C10] = f75, 3 * SIZE mov f73 = f0 } ;; #else { .mfb nop __LINE__ FMPY f64 = ALPHA, f64 nop __LINE__ } { .mfb nop __LINE__ FMPY f66 = ALPHA, f66 nop __LINE__ } ;; FMPY f65 = ALPHA, f65 FMPY f67 = ALPHA, f67 ;; { .mfi nop __LINE__ FMPY f72 = ALPHA, f72 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f74 = ALPHA, f74 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f73 = ALPHA, f73 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -4, L #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f75 = ALPHA, f75 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -2, L #else nop __LINE__ #endif } ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mmi STFD [C9 ] = f66, SIZE nop __LINE__ nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, 3 * SIZE mov f65 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 2, AOFFSET #else nop __LINE__ #endif } { .mmi STFD [C9 ] = f67, 3 * SIZE nop __LINE__ #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 1, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C2 ] = f72, SIZE mov f72 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 4, KK #else nop __LINE__ #endif } { .mmi STFD [C10] = f74, SIZE nop __LINE__ nop __LINE__ } ;; { .mfi STFD [C2 ] = f73, 3 * SIZE mov f73 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mib STFD [C10] = f75, 3 * SIZE nop __LINE__ nop __LINE__ } ;; #endif .align 32 .L110: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 2, KK #else adds L = 2, KK #endif #endif tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L120 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmi LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #else { .mmi shladd BOFFSET = KK8, 1, B shladd AOFFSET = KK8, 1, AOFFSET } ;; { .mmi LDFPD f48, f49 = [BOFFSET], 2 * SIZE nop __LINE__ adds L = 1, L } ;; #endif ;; { .mii adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi LDFPD f32, f33 = [AOFFSET], 2 * SIZE nop __LINE__ adds L = -1, L } ;; { .mmi cmp.eq p3, p0 = r0, r0 
adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET mov ar.lc = L } ;; .align 32 .L112: { .mfi lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mmf (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } { .mmf #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1 ], SIZE (p5) LDFD f76 = [C2 ], SIZE #else nop __LINE__ nop __LINE__ #endif FMA f73 = f33, f49, f73 // A2 * B2 } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f69 = [C1 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f77 = [C2 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f73 = f41, f57, f73 // A2 * B2 br.cloop.sptk.few .L112 } ;; #if! defined(TRMMKERNEL) && !defined(BETAZERO) FMA f64 = ALPHA, f64, f68 FMA f65 = ALPHA, f65, f69 FMA f72 = ALPHA, f72, f76 FMA f73 = ALPHA, f73, f77 ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 nop __LINE__ } { .mfb STFD [C2 ] = f72, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, SIZE mov f65 = f0 nop __LINE__ } { .mfb STFD [C2 ] = f73, SIZE mov f73 = f0 nop __LINE__ } ;; #else { .mfi nop __LINE__ FMPY f64 = ALPHA, f64 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f65 = ALPHA, f65 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f72 = ALPHA, f72 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -2, L #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f73 = ALPHA, f73 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -2, L #else nop __LINE__ #endif } ;; { .mmi #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif ;; #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 1, AOFFSET #else nop __LINE__ #endif #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 1, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 2, KK #else nop __LINE__ #endif } { .mfb STFD [C2 ] = f72, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, SIZE mov f65 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C2 ] = f73, SIZE mov f73 = f0 nop __LINE__ } ;; #endif .align 32 .L120: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 1, KK #else adds L = 2, KK #endif #endif tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L129 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmi LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #else { .mmi shladd BOFFSET = KK8, 
1, B add AOFFSET = KK8, AOFFSET } ;; { .mmi LDFPD f48, f49 = [BOFFSET], 2 * SIZE nop __LINE__ adds L = 1, L } ;; #endif { .mii nop __LINE__ tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi LDFD f32 = [AOFFSET], 1 * SIZE nop __LINE__ adds L = -1, L } ;; { .mmi cmp.eq p3, p0 = r0, r0 nop __LINE__ mov ar.lc = L } ;; .align 32 .L122: { .mfi FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mmi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE (p3) LDFD f40 = [AOFFSET], 1 * SIZE nop __LINE__ } { .mmi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1] (p5) LDFD f76 = [C2] #else nop __LINE__ nop __LINE__ #endif nop __LINE__ } ;; { .mfi (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 adds L = -1, L } { .mfb (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 br.cloop.sptk.few .L122 } ;; .L128: #if! defined(TRMMKERNEL) && !defined(BETAZERO) FMA f64 = ALPHA, f64, f68 FMA f72 = ALPHA, f72, f76 ;; { .mfi STFD [C1 ] = f64 mov f64 = f0 } { .mfb STFD [C2 ] = f72 mov f72 = f0 } ;; #else { .mfi nop __LINE__ FMPY f64 = ALPHA, f64 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f72 = ALPHA, f72 nop __LINE__ } ;; { .mmi nop __LINE__ #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -1, L #else nop __LINE__ #endif #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -2, L #else nop __LINE__ #endif } ;; { .mmi #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif ;; #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) add AOFFSET = KK8, AOFFSET #else nop __LINE__ #endif #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 1, BOFFSET #else nop __LINE__ #endif } ;; #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 1, KK #else nop __LINE__ #endif ;; { .mfi STFD [C1 ] = f64 mov f64 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C2 ] = f72 mov f72 = f0 } ;; #endif .align 32 .L129: { .mmi mov B = BOFFSET mov AOFFSET = A #if defined(TRMMKERNEL) && !defined(LEFT) adds KK = 2, KK #else nop __LINE__ #endif } ;; .align 16 #endif .L130: { .mfi #if defined(TRMMKERNEL) && defined(LEFT) mov KK = OFFSET #else nop __LINE__ #endif mov f64 = f0 tbit.z p6, p0 = N, 0 } { .mib mov AOFFSET = A shr I = M, 3 (p6) br.cond.dpnt .L999 } ;; #if 0 { .mfi mov C1 = C mov f65 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; { .mfi nop __LINE__ mov f66 = f0 nop __LINE__ } { .mfb cmp.eq p7, p0 = 0, I mov f67 = f0 (p7) br.cond.dpnt .L140 } ;; .align 32 .L132: #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mfb LDFD f48 = [B] mov f68 = f0 nop __LINE__ } { .mfi adds BOFFSET = 1 * SIZE, B mov f69 = f0 #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 8, KK #else adds L = 1, KK #endif #endif } ;; #else { .mfi add BOFFSET = KK8, B mov f68 = f0 shladd AOFFSET = KK8, 3, AOFFSET } ;; { .mfi LDFD f48 = [BOFFSET], 1 * SIZE 
mov f69 = f0 #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 8, KK #else adds L = 1, KK #endif #endif } ;; #endif { .mfi LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f70 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; { .mii LDFPD f34, f35 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mfi LDFPD f36, f37 = [AOFFSET], 2 * SIZE mov f71 = f0 adds L = -1, L } ;; { .mmi LDFPD f38, f39 = [AOFFSET], 2 * SIZE adds PREC = CPREFETCHSIZE * SIZE, C1 cmp.eq p3, p0 = r0, r0 } ;; { .mmi CPREFETCH [PREC] adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET mov ar.lc = L } ;; .align 32 .L133: { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET FMA f65 = f33, f48, f65 // A2 * B1 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 adds C9 = 4 * SIZE, C1 } { .mmf (p3) LDFD f56 = [BOFFSET], 1 * SIZE #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f6 = [C1 ], SIZE #else nop __LINE__ #endif FMA f67 = f35, f48, f67 // A4 * B1 } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f7 = [C9 ], SIZE #else nop __LINE__ #endif FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f10 = [C1 ], SIZE #else nop __LINE__ #endif FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f11 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mmf (p4) LDFD f48 = [BOFFSET], 1 * SIZE #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f12 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f67 = f43, f56, f67 // A4 * B1 } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f13 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } ;; { .mfi (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f70 = f46, f56, f70 // A7 * B1 adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f14 = [C1 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE nop __LINE__ nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f15 = [C9 ], -3 * SIZE #else nop __LINE__ #endif nop __LINE__ br.cloop.sptk.few .L133 } ;; .L138: #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) { .mfi FMA f64 = ALPHA, f64, f6 cmp.ne p6, p0 = 1, I } { .mfb FMA f68 = ALPHA, f68, f7 } ;; { .mfi FMA f65 = ALPHA, f65, f10 adds I = -1, I } { .mfb FMA f69 = ALPHA, f69, f11 } ;; { .mfi FMA f66 = ALPHA, f66, f12 } { .mfb FMA f70 = ALPHA, f70, f13 } ;; { .mfb FMA f67 = ALPHA, f67, f14 } { .mfb FMA f71 = ALPHA, f71, f15 } ;; { .mmf STFD [C1 ] = f64, SIZE STFD [C9 ] = f68, SIZE mov f64 = f0 } ;; { .mmf STFD [C1 ] = f65, SIZE STFD [C9 ] = f69, SIZE mov f65 = f0 } ;; { .mmf STFD [C1 ] = f66, SIZE STFD [C9 ] = f70, SIZE mov f66 = f0 } ;; { .mmf STFD [C1 ] = f67, 5 * SIZE nop __LINE__ mov f67 = f0 } { .mmb STFD [C9 ] = f71, 5 * SIZE nop __LINE__ (p6) br.cond.dptk .L132 } ;; #else { .mfi FMPY f64 = ALPHA, f64 cmp.ne p6, p0 = 1, I } { .mfb FMPY f68 = ALPHA, f68 } ;; { .mfi FMPY f65 = ALPHA, f65 adds I = -1, I } { .mfb FMPY f69 = ALPHA, f69 } ;; { .mfi FMPY f66 = ALPHA, f66 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfb FMPY f70 = ALPHA, f70 } ;; { .mfi FMPY f67 = ALPHA, f67 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -8, L #else nop __LINE__ #endif } { .mfi FMPY f71 = ALPHA, f71 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -1, L #else nop __LINE__ #endif } ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mmi STFD [C9 ] = f68, SIZE nop __LINE__ nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, SIZE mov f65 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 3, AOFFSET #else nop __LINE__ #endif } { .mmi STFD [C9 ] = f69, SIZE nop __LINE__ #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) add BOFFSET = KK8, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C1 ] = f66, SIZE mov f66 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 8, KK #else nop __LINE__ #endif } { .mmi STFD [C9 ] = f70, SIZE nop __LINE__ nop __LINE__ } ;; { .mfi STFD [C1 ] = f67, 5 * SIZE mov f67 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mmb STFD [C9 ] = f71, 5 * SIZE nop __LINE__ (p6) br.cond.dptk .L132 } ;; #endif .align 32 .L140: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 4, KK #else adds L = 1, KK #endif #endif tbit.z p6, p7 = M, 2 (p6) br.cond.dptk .L150 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmi LDFD f48 = [B] adds BOFFSET = 1 * SIZE, B #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #else { .mmi add BOFFSET = KK8, B shladd AOFFSET = KK8, 2, AOFFSET nop __LINE__ } ;; { .mmi LDFD f48 = [BOFFSET], 1 * SIZE nop __LINE__ #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #endif { .mii (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi LDFPD f34, f35 = [AOFFSET], 2 * SIZE adds L = -1, L nop __LINE__ } ;; { .mmi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 mov ar.lc = L } ;; .align 32 .L142: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, 
L } { .mfi nop __LINE__ FMA f65 = f33, f48, f65 // A2 * B1 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 (p5) adds C9 = 2 * SIZE, C1 } { .mmf #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1 ], SIZE #else nop __LINE__ #endif (p3) LDFD f56 = [BOFFSET], 1 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 } ;; { .mfi (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 (p5) adds C10 = 2 * SIZE, C2 } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f70 = [C9 ], SIZE #else nop __LINE__ #endif (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mmf (p4) LDFD f48 = [BOFFSET], 1 * SIZE #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f69 = [C1 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f67 = f43, f56, f67 // A4 * B1 } ;; { .mfi (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE nop __LINE__ adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f71 = [C9 ], -1 * SIZE #else nop __LINE__ #endif nop.f 0 br.cloop.sptk.few .L142 } ;; .L148: #if! defined(TRMMKERNEL) && !defined(BETAZERO) FMA f64 = ALPHA, f64, f68 FMA f66 = ALPHA, f66, f70 FMA f65 = ALPHA, f65, f69 FMA f67 = ALPHA, f67, f71 ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 adds L = 1, K } { .mfb STFD [C9 ] = f66, SIZE mov f66 = f0 nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, 3 * SIZE mov f65 = f0 shr L = L, 1 } { .mfb STFD [C9 ] = f67, 3 * SIZE mov f67 = f0 nop __LINE__ } ;; #else { .mfi nop __LINE__ FMPY f64 = ALPHA, f64 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f66 = ALPHA, f66 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f65 = ALPHA, f65 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -4, L #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f67 = ALPHA, f67 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -1, L #else nop __LINE__ #endif } ;; { .mmi #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif ;; #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 2, AOFFSET #else nop __LINE__ #endif #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) add BOFFSET = KK8, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 4, KK #else nop __LINE__ #endif } { .mfb STFD [C9 ] = f66, SIZE mov f66 = f0 nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, 3 * SIZE mov f65 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C9 ] = f67, 3 * SIZE mov f67 = f0 nop __LINE__ } ;; #endif .align 32 .L150: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 2, KK #else adds L = 1, KK #endif #endif tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L160 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmi LDFD f48 = [B] adds BOFFSET = 1 * SIZE, B #ifndef TRMMKERNEL adds L = 1, K #else adds L 
= 1, L #endif } ;; #else { .mmi add BOFFSET = KK8, B shladd AOFFSET = KK8, 1, AOFFSET nop __LINE__ } ;; { .mmi LDFD f48 = [BOFFSET], 1 * SIZE nop __LINE__ #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #endif { .mii cmp.eq p3, p0 = r0, r0 tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mii LDFPD f32, f33 = [AOFFSET], 2 * SIZE adds L = -1, L ;; mov ar.lc = L } ;; .align 32 .L152: { .mfi cmp.ne p4, p5 = 0, L FMA f64 = f32, f48, f64 // A1 * B1 (p12) cmp.ne p3, p0 = 0, L } ;; { .mmf (p3) LDFD f56 = [BOFFSET], 1 * SIZE (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } ;; { .mfi (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 adds L = -1, L } ;; { .mfb (p4) LDFD f48 = [BOFFSET], 1 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 br.cloop.sptk.few .L152 } ;; .L158: #if! defined(TRMMKERNEL) && !defined(BETAZERO) LDFD f68 = [C1 ], SIZE ;; LDFD f69 = [C1 ], -1 * SIZE ;; FMA f64 = ALPHA, f64, f68 FMA f65 = ALPHA, f65, f69 ;; STFD [C1 ] = f64, SIZE mov f64 = f0 ;; { .mfi STFD [C1 ] = f65, SIZE mov f65 = f0 } ;; #else { .mfi nop __LINE__ FMPY f64 = ALPHA, f64 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfi nop __LINE__ FMPY f65 = ALPHA, f65 nop __LINE__ } ;; { .mii nop __LINE__ #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -2, L #else nop __LINE__ #endif #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -1, L #else nop __LINE__ #endif } ;; { .mmi #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0 #else nop __LINE__ #endif ;; #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 1, AOFFSET #else nop __LINE__ #endif #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) add BOFFSET = KK8, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 2, KK #else nop __LINE__ #endif } ;; { .mfi STFD [C1 ] = f65, SIZE mov f65 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; #endif .align 32 .L160: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 1, KK #else adds L = 1, KK #endif #endif tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L169 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmi LDFD f48 = [B] adds BOFFSET = 1 * SIZE, B #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #else { .mmi add BOFFSET = KK8, B add AOFFSET = KK8, AOFFSET nop __LINE__ } ;; { .mmi LDFD f48 = [BOFFSET], 1 * SIZE nop __LINE__ #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #endif ;; { .mii LDFD f32 = [AOFFSET], 1 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mii adds L = -1, L cmp.eq p3, p0 = r0, r0 ;; mov ar.lc = L } ;; .align 32 .L162: { .mmf cmp.ne p4, p5 = 0, L (p12) cmp.ne p3, p0 = 0, L FMA f64 = f32, f48, f64 // A1 * B1 } ;; { .mmi (p3) LDFD f56 = [BOFFSET], 1 * SIZE (p3) LDFD f40 = [AOFFSET], 1 * SIZE nop __LINE__ } ;; { .mmi (p4) LDFD f32 = [AOFFSET], 1 * SIZE #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1] #else nop __LINE__ #endif adds L = -1, L } { .mfb (p4) LDFD f48 = [BOFFSET], 1 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 br.cloop.sptk.few .L162 } ;; #if! defined(TRMMKERNEL) && !defined(BETAZERO) FMA f64 = ALPHA, f64, f68 #else FMPY f64 = ALPHA, f64 #endif ;; STFD [C1 ] = f64 ;; .align 32 .L169: { .mmi mov B = BOFFSET mov AOFFSET = A #if defined(TRMMKERNEL) && !defined(LEFT) adds KK = 1, KK #else nop __LINE__ #endif } ;; .align 16 #endif .L999: mov r8 = r0 adds r9 = 1 * 16, SP ;; ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 ;; ldf.fill f22 = [SP], 32 ldf.fill f23 = [r9], 32 mov ar.lc = ARLC ;; ldf.fill f24 = [SP], 32 ldf.fill f25 = [r9], 32 mov pr = PR, -1 ;; ldf.fill f26 = [SP], 32 ldf.fill f27 = [r9], 32 mov ar.pfs = ARPFS ;; ldf.fill f28 = [SP], 32 ldf.fill f29 = [r9], 32 ;; ldf.fill f30 = [SP], 32 ldf.fill f31 = [r9] br.ret.sptk.many b0 EPILOGUE OpenBLAS-0.2.20/kernel/ia64/qgemv_n.S000066400000000000000000001046101313527062700167510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define SP r12 #define M r32 #define N r33 #ifndef XDOUBLE #define A r36 #define LDA r37 #define X r38 #define INCX r39 #define Y r34 #define INCY r35 #else #define A r38 #define LDA r39 #define X r34 #define INCX r35 #define Y r36 #define INCY r37 #endif #define BUFFER r11 #define I r14 #define J r15 #define AO1 r16 #define AO2 r17 #define AO3 r18 #define AO4 r19 #define AO5 r20 #define AO6 r21 #define AO7 r22 #define AO8 r23 #define YLD1 r24 #define YLD2 r25 #define YST1 r26 #define YST2 r27 #define II r28 #define YY r29 #define ARLC r30 #define PR r31 #define LDA7M8 r8 #define PREA r9 #define PREB r10 #define ALPHA1 f8 #define ALPHA2 f9 #define ALPHA3 f10 #define ALPHA4 f11 #define ALPHA5 f12 #define ALPHA6 f13 #define ALPHA7 f14 #define ALPHA8 f15 #define RPREFETCHSIZE ( 8 * 1 + 6) #define WPREFETCHSIZE ( 8 * 1 + 6) #define RPREFETCH lfetch.nt1 #define WPREFETCH lfetch.excl.nt1 #define ALPHA f6 PROLOGUE .prologue PROFCODE { .mmi mov ARLC = ar.lc } ;; mov PR = pr adds r14 = 16, SP adds r15 = 24, SP adds r16 = 32, SP .body ;; #ifdef XDOUBLE ld8 X = [r14], 16 ld8 INCX = [r15], 16 ;; #endif ld8 Y = [r14], 16 ld8 INCY = [r15], 16 ;; ld8 BUFFER = [r14] ;; mov ALPHA = f8 cmp.ge p7, p0 = 0, M cmp.ge p6, p0 = 0, N ;; shladd INCX = INCX, BASE_SHIFT, r0 shladd LDA = LDA, BASE_SHIFT, r0 shladd INCY = INCY, BASE_SHIFT, r0 ;; (p7) br.cond.dpnt .L999 (p6) br.cond.dpnt .L999 ;; sub I = A, Y mov YY = Y ;; cmp.eq p10, p0 = SIZE, INCY (p10) br.cond.dptk .L10 ;; shr J = M, 3 mov YY = BUFFER ;; (p8) adds YY = SIZE, BUFFER ;; mov ar.lc = J mov YST1 = YY adds YST2 = 4 * SIZE, YY ;; .L02: STFD [YST1] = f0, 1 * SIZE STFD [YST2] = f0, 1 * SIZE ;; STFD [YST1] = f0, 1 * SIZE STFD [YST2] = f0, 1 * SIZE ;; STFD [YST1] = f0, 1 * SIZE STFD [YST2] = f0, 1 * SIZE ;; STFD [YST1] = f0, 5 * SIZE STFD [YST2] = f0, 5 * SIZE br.cloop.sptk.few .L02 ;; .L10: shr J = N, 3 ;; cmp.eq p6, p0 = r0, J (p6) br.cond.dpnt .L20 ;; .align 16 .L11: shladd LDA7M8 = LDA, 3, r0 ;; sub LDA7M8 = LDA, LDA7M8 ;; adds LDA7M8 = 8 * SIZE, LDA7M8 ;; mov YLD1 = YY mov YST1 = YY adds YLD2 = 1 * SIZE, YY adds YST2 = 1 * SIZE, YY ;; LDFD ALPHA1 = [X], INCX ;; LDFD ALPHA2 = [X], INCX ;; LDFD ALPHA3 = [X], INCX ;; LDFD ALPHA4 = [X], INCX ;; LDFD ALPHA5 = [X], INCX ;; LDFD ALPHA6 = [X], INCX ;; LDFD ALPHA7 = [X], INCX ;; LDFD ALPHA8 = [X], INCX ;; FMPY ALPHA1 = ALPHA, ALPHA1 FMPY ALPHA2 = ALPHA, ALPHA2 FMPY ALPHA3 = ALPHA, ALPHA3 FMPY ALPHA4 = ALPHA, ALPHA4 FMPY ALPHA5 = ALPHA, ALPHA5 FMPY ALPHA6 = ALPHA, ALPHA6 ;; mov AO1 = A adds AO2 = 1 * SIZE, A adds AO3 = 2 * SIZE, A adds AO4 = 3 * SIZE, A adds AO5 = 4 * SIZE, A adds AO6 = 5 * SIZE, A adds AO7 = 6 * SIZE, A adds AO8 = 7 * SIZE, A shladd A = LDA, 3, A ;; shr I = M, 3 mov pr.rot= 0 ;; cmp.eq p16, p0 = r0, r0 ;; adds I = -1, I adds J = -1, J ;; adds PREB = (WPREFETCHSIZE) * SIZE, YY ;; cmp.lt p7, p8 = r0, J tbit.nz p13, p11 = M, 2 mov ar.ec= 2 ;; FMPY ALPHA7 = ALPHA, ALPHA7 ;; { .mfi and II = 7, M FMPY ALPHA8 = ALPHA, ALPHA8 mov ar.lc = I } { .mib cmp.eq p6, p0 = -1, I tbit.nz p14, p12 = M, 1 (p6) br.cond.dpnt .L15 } ;; .align 16 .L12: { .mmf (p17) LDFD f93 = [AO5], LDA7M8 (p17) LDFD f94 = [AO6], LDA7M8 (p17) FMA f101 = ALPHA1, f33, f101 } { .mmf (p17) LDFD f95 = [AO7], LDA7M8 (p17) LDFD f96 = [AO8], LDA7M8 (p17) FMA f104 = ALPHA1, f34, f104 } ;; { .mmf (p16) LDFD f32 = [AO1] (p16) LDFD f33 = [AO2], LDA (p17) FMA f107 = ALPHA1, f35, f107 } { .mmf (p16) LDFD f34 = [AO3], LDA (p16) LDFD f35 = 
[AO4], LDA (p17) FMA f110 = ALPHA1, f36, f110 } ;; { .mmf (p16) LDFD f100 = [YLD1], 2 * SIZE (p16) LDFD f103 = [YLD2], 2 * SIZE (p17) FMA f113 = ALPHA1, f37, f113 } { .mmf (p16) adds PREA = (RPREFETCHSIZE) * SIZE, AO1 (p16) add AO1 = AO1, LDA (p17) FMA f116 = ALPHA1, f38, f116 } ;; { .mmf (p18) STFD [YST1] = f102, 2 * SIZE (p18) STFD [YST2] = f105, 2 * SIZE (p17) FMA f119 = ALPHA1, f39, f119 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f122 = ALPHA1, f40, f122 } ;; { .mmf (p16) LDFD f36 = [AO5], LDA (p16) LDFD f37 = [AO6], LDA (p17) FMA f101 = ALPHA2, f41, f101 } { .mmf (p16) LDFD f38 = [AO7], LDA (p16) LDFD f39 = [AO8], LDA (p17) FMA f104 = ALPHA2, f42, f104 } ;; { .mmf (p16) LDFD f40 = [AO1], LDA (p16) LDFD f41 = [AO2], LDA (p17) FMA f107 = ALPHA2, f43, f107 } { .mmf (p16) LDFD f42 = [AO3], LDA (p16) LDFD f43 = [AO4], LDA (p17) FMA f110 = ALPHA2, f44, f110 } ;; { .mmf (p16) LDFD f106 = [YLD1], 2 * SIZE (p16) LDFD f109 = [YLD2], 2 * SIZE (p17) FMA f113 = ALPHA2, f45, f113 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f116 = ALPHA2, f46, f116 } ;; { .mmf (p18) STFD [YST1] = f108, 2 * SIZE (p18) STFD [YST2] = f111, 2 * SIZE (p17) FMA f119 = ALPHA2, f47, f119 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f122 = ALPHA2, f48, f122 } ;; { .mmf (p16) LDFD f44 = [AO5], LDA (p16) LDFD f45 = [AO6], LDA (p17) FMA f101 = ALPHA3, f49, f101 } { .mmf (p16) LDFD f46 = [AO7], LDA (p16) LDFD f47 = [AO8], LDA (p17) FMA f104 = ALPHA3, f50, f104 } ;; { .mmf (p16) LDFD f48 = [AO1], LDA (p16) LDFD f49 = [AO2], LDA (p17) FMA f107 = ALPHA3, f51, f107 } { .mmf (p16) LDFD f50 = [AO3], LDA (p16) LDFD f51 = [AO4], LDA (p17) FMA f110 = ALPHA3, f52, f110 } ;; { .mmf (p16) LDFD f112 = [YLD1], 2 * SIZE (p16) LDFD f115 = [YLD2], 2 * SIZE (p17) FMA f113 = ALPHA3, f53, f113 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f116 = ALPHA3, f54, f116 } ;; { .mmf (p18) STFD [YST1] = f114, 2 * SIZE (p18) STFD [YST2] = f117, 2 * SIZE (p17) FMA f119 = ALPHA3, f55, f119 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f122 = ALPHA3, f56, f122 } ;; { .mmf (p16) LDFD f52 = [AO5], LDA (p16) LDFD f53 = [AO6], LDA (p17) FMA f101 = ALPHA4, f57, f101 } { .mmf (p16) LDFD f54 = [AO7], LDA (p16) LDFD f55 = [AO8], LDA (p17) FMA f104 = ALPHA4, f58, f104 } ;; { .mmf (p16) LDFD f56 = [AO1], LDA (p16) LDFD f57 = [AO2], LDA (p17) FMA f107 = ALPHA4, f59, f107 } { .mmf (p16) LDFD f58 = [AO3], LDA (p16) LDFD f59 = [AO4], LDA (p17) FMA f110 = ALPHA4, f60, f110 } ;; { .mmf (p16) LDFD f118 = [YLD1], 2 * SIZE (p16) LDFD f121 = [YLD2], 2 * SIZE (p17) FMA f113 = ALPHA4, f61, f113 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f116 = ALPHA4, f62, f116 } ;; { .mmf (p18) STFD [YST1] = f120, 2 * SIZE (p18) STFD [YST2] = f123, 2 * SIZE (p17) FMA f119 = ALPHA4, f63, f119 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f122 = ALPHA4, f64, f122 } ;; { .mmf (p16) LDFD f60 = [AO5], LDA (p16) LDFD f61 = [AO6], LDA (p17) FMA f101 = ALPHA5, f65, f101 } { .mmf (p16) LDFD f62 = [AO7], LDA (p16) LDFD f63 = [AO8], LDA (p17) FMA f104 = ALPHA5, f66, f104 } ;; { .mmf (p16) LDFD f64 = [AO1], LDA (p16) LDFD f65 = [AO2], LDA (p17) FMA f107 = ALPHA5, f67, f107 } { .mmf (p16) LDFD f66 = [AO3], LDA (p16) LDFD f67 = [AO4], LDA (p17) FMA f110 = ALPHA5, f68, f110 } ;; { .mmf (p16) WPREFETCH [PREB], 8 * SIZE nop __LINE__ (p17) FMA f113 = ALPHA5, f69, f113 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f116 = ALPHA5, f70, f116 } ;; { .mmf (p16) RPREFETCH [PREA] nop __LINE__ (p17) FMA f119 = ALPHA5, f71, f119 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f122 = ALPHA5, f72, f122 } ;; { .mmf (p16) LDFD f68 
= [AO5], LDA (p16) LDFD f69 = [AO6], LDA (p17) FMA f101 = ALPHA6, f73, f101 } { .mmf (p16) LDFD f70 = [AO7], LDA (p16) LDFD f71 = [AO8], LDA (p17) FMA f104 = ALPHA6, f74, f104 } ;; { .mmf (p16) LDFD f72 = [AO1], LDA (p16) LDFD f73 = [AO2], LDA (p17) FMA f107 = ALPHA6, f75, f107 } { .mmf (p16) LDFD f74 = [AO3], LDA (p16) LDFD f75 = [AO4], LDA (p17) FMA f110 = ALPHA6, f76, f110 } ;; { .mmf nop __LINE__ nop __LINE__ (p17) FMA f113 = ALPHA6, f77, f113 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f116 = ALPHA6, f78, f116 } ;; { .mmf nop __LINE__ nop __LINE__ (p17) FMA f119 = ALPHA6, f79, f119 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f122 = ALPHA6, f80, f122 } ;; { .mmf (p16) LDFD f76 = [AO5], LDA (p16) LDFD f77 = [AO6], LDA (p17) FMA f101 = ALPHA7, f81, f101 } { .mmf (p16) LDFD f78 = [AO7], LDA (p16) LDFD f79 = [AO8], LDA (p17) FMA f104 = ALPHA7, f82, f104 } ;; { .mmf (p16) LDFD f80 = [AO1], LDA (p16) LDFD f81 = [AO2], LDA (p17) FMA f107 = ALPHA7, f83, f107 } { .mmf (p16) LDFD f82 = [AO3], LDA (p16) LDFD f83 = [AO4], LDA (p17) FMA f110 = ALPHA7, f84, f110 } ;; { .mmf nop __LINE__ nop __LINE__ (p17) FMA f113 = ALPHA7, f85, f113 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f116 = ALPHA7, f86, f116 } ;; { .mmf nop __LINE__ nop __LINE__ (p17) FMA f119 = ALPHA7, f87, f119 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f122 = ALPHA7, f88, f122 } ;; { .mmf (p16) LDFD f84 = [AO5], LDA (p16) LDFD f85 = [AO6], LDA (p17) FMA f101 = ALPHA8, f89, f101 } { .mmf (p16) LDFD f86 = [AO7], LDA (p16) LDFD f87 = [AO8], LDA (p17) FMA f104 = ALPHA8, f90, f104 } ;; { .mmf (p16) LDFD f88 = [AO1], LDA7M8 (p16) LDFD f89 = [AO2], LDA7M8 (p17) FMA f107 = ALPHA8, f91, f107 } { .mmf (p16) LDFD f90 = [AO3], LDA7M8 (p16) LDFD f91 = [AO4], LDA7M8 (p17) FMA f110 = ALPHA8, f92, f110 } ;; { .mmf nop __LINE__ nop __LINE__ (p17) FMA f113 = ALPHA8, f93, f113 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f116 = ALPHA8, f94, f116 } ;; { .mmf nop __LINE__ nop __LINE__ (p17) FMA f119 = ALPHA8, f95, f119 } { .mfb nop __LINE__ (p17) FMA f122 = ALPHA8, f96, f122 br.ctop.sptk.few .L12 } ;; { .mmi (p18) STFD [YST1] = f102, 2 * SIZE (p18) STFD [YST2] = f105, 2 * SIZE nop __LINE__ } ;; { .mmi (p18) STFD [YST1] = f108, 2 * SIZE (p18) STFD [YST2] = f111, 2 * SIZE nop __LINE__ } ;; { .mmi (p18) STFD [YST1] = f114, 2 * SIZE (p18) STFD [YST2] = f117, 2 * SIZE nop __LINE__ } ;; { .mmi (p18) STFD [YST1] = f120, 2 * SIZE (p18) STFD [YST2] = f123, 2 * SIZE nop __LINE__ } ;; .align 16 .L15: { .mmi (p7) cmp.eq.unc p9, p0 = r0, II (p8) cmp.eq.unc p10, p0 = r0, II (p11) adds AO5 = - 4 * SIZE, AO5 } { .mbb (p11) adds AO7 = - 4 * SIZE, AO7 (p9) br.cond.dptk .L11 (p10) br.cond.dptk .L20 } ;; { .mmi (p13) LDFD f32 = [AO1], LDA (p13) LDFD f33 = [AO2], LDA tbit.nz p15, p0 = M, 0 } { .mmi (p13) LDFD f34 = [AO3], LDA (p11) adds AO6 = - 4 * SIZE, AO6 (p12) adds AO7 = - 2 * SIZE, AO7 } ;; (p13) LDFD f35 = [AO4], LDA (p14) LDFD f36 = [AO5], LDA (p14) LDFD f37 = [AO6], LDA (p15) LDFD f38 = [AO7], LDA ;; (p13) LDFD f40 = [AO1], LDA (p13) LDFD f41 = [AO2], LDA (p13) LDFD f42 = [AO3], LDA (p13) LDFD f43 = [AO4], LDA ;; (p14) LDFD f44 = [AO5], LDA (p14) LDFD f45 = [AO6], LDA (p15) LDFD f46 = [AO7], LDA ;; (p13) LDFD f48 = [AO1], LDA (p13) LDFD f49 = [AO2], LDA (p13) LDFD f50 = [AO3], LDA (p13) LDFD f51 = [AO4], LDA ;; (p14) LDFD f52 = [AO5], LDA (p14) LDFD f53 = [AO6], LDA (p15) LDFD f54 = [AO7], LDA ;; (p13) LDFD f56 = [AO1], LDA (p13) LDFD f57 = [AO2], LDA (p13) LDFD f58 = [AO3], LDA (p13) LDFD f59 = [AO4], LDA ;; (p14) LDFD f60 = [AO5], LDA (p14) LDFD f61 = [AO6], LDA 
(p15) LDFD f62 = [AO7], LDA ;; (p13) LDFD f64 = [AO1], LDA (p13) LDFD f65 = [AO2], LDA (p13) LDFD f66 = [AO3], LDA (p13) LDFD f67 = [AO4], LDA ;; (p14) LDFD f68 = [AO5], LDA (p14) LDFD f69 = [AO6], LDA (p15) LDFD f70 = [AO7], LDA ;; (p13) LDFD f72 = [AO1], LDA (p13) LDFD f73 = [AO2], LDA (p13) LDFD f74 = [AO3], LDA (p13) LDFD f75 = [AO4], LDA ;; (p14) LDFD f76 = [AO5], LDA (p14) LDFD f77 = [AO6], LDA (p15) LDFD f78 = [AO7], LDA ;; (p13) LDFD f80 = [AO1], LDA (p13) LDFD f81 = [AO2], LDA (p13) LDFD f82 = [AO3], LDA (p13) LDFD f83 = [AO4], LDA ;; (p14) LDFD f84 = [AO5], LDA (p14) LDFD f85 = [AO6], LDA (p15) LDFD f86 = [AO7], LDA ;; (p13) LDFD f88 = [AO1] (p13) LDFD f89 = [AO2] (p13) LDFD f90 = [AO3] (p13) LDFD f91 = [AO4] ;; (p14) LDFD f92 = [AO5] (p14) LDFD f93 = [AO6] (p15) LDFD f94 = [AO7] ;; (p13) LDFD f96 = [YLD1], 2 * SIZE (p13) LDFD f97 = [YLD2], 2 * SIZE ;; (p13) LDFD f98 = [YLD1], 2 * SIZE (p13) LDFD f99 = [YLD2], 2 * SIZE ;; (p14) LDFD f100 = [YLD1], 1 * SIZE ;; (p14) LDFD f101 = [YLD1], 1 * SIZE ;; (p15) LDFD f102 = [YLD1], 1 * SIZE ;; (p13) FMA f96 = ALPHA1, f32, f96 (p13) FMA f97 = ALPHA1, f33, f97 (p13) FMA f98 = ALPHA1, f34, f98 (p13) FMA f99 = ALPHA1, f35, f99 (p14) FMA f100 = ALPHA1, f36, f100 (p14) FMA f101 = ALPHA1, f37, f101 (p15) FMA f102 = ALPHA1, f38, f102 ;; (p13) FMA f96 = ALPHA2, f40, f96 (p13) FMA f97 = ALPHA2, f41, f97 (p13) FMA f98 = ALPHA2, f42, f98 (p13) FMA f99 = ALPHA2, f43, f99 (p14) FMA f100 = ALPHA2, f44, f100 (p14) FMA f101 = ALPHA2, f45, f101 (p15) FMA f102 = ALPHA2, f46, f102 ;; (p13) FMA f96 = ALPHA3, f48, f96 (p13) FMA f97 = ALPHA3, f49, f97 (p13) FMA f98 = ALPHA3, f50, f98 (p13) FMA f99 = ALPHA3, f51, f99 (p14) FMA f100 = ALPHA3, f52, f100 (p14) FMA f101 = ALPHA3, f53, f101 (p15) FMA f102 = ALPHA3, f54, f102 ;; (p13) FMA f96 = ALPHA4, f56, f96 (p13) FMA f97 = ALPHA4, f57, f97 (p13) FMA f98 = ALPHA4, f58, f98 (p13) FMA f99 = ALPHA4, f59, f99 (p14) FMA f100 = ALPHA4, f60, f100 (p14) FMA f101 = ALPHA4, f61, f101 (p15) FMA f102 = ALPHA4, f62, f102 ;; (p13) FMA f96 = ALPHA5, f64, f96 (p13) FMA f97 = ALPHA5, f65, f97 (p13) FMA f98 = ALPHA5, f66, f98 (p13) FMA f99 = ALPHA5, f67, f99 (p14) FMA f100 = ALPHA5, f68, f100 (p14) FMA f101 = ALPHA5, f69, f101 (p15) FMA f102 = ALPHA5, f70, f102 ;; (p13) FMA f96 = ALPHA6, f72, f96 (p13) FMA f97 = ALPHA6, f73, f97 (p13) FMA f98 = ALPHA6, f74, f98 (p13) FMA f99 = ALPHA6, f75, f99 (p14) FMA f100 = ALPHA6, f76, f100 (p14) FMA f101 = ALPHA6, f77, f101 (p15) FMA f102 = ALPHA6, f78, f102 ;; (p13) FMA f96 = ALPHA7, f80, f96 (p13) FMA f97 = ALPHA7, f81, f97 (p13) FMA f98 = ALPHA7, f82, f98 (p13) FMA f99 = ALPHA7, f83, f99 (p14) FMA f100 = ALPHA7, f84, f100 (p14) FMA f101 = ALPHA7, f85, f101 (p15) FMA f102 = ALPHA7, f86, f102 ;; (p13) FMA f16 = ALPHA8, f88, f96 (p13) FMA f17 = ALPHA8, f89, f97 (p13) FMA f18 = ALPHA8, f90, f98 (p13) FMA f19 = ALPHA8, f91, f99 (p14) FMA f20 = ALPHA8, f92, f100 (p14) FMA f21 = ALPHA8, f93, f101 (p15) FMA f22 = ALPHA8, f94, f102 ;; { .mmi (p13) STFD [YST1] = f16, 2 * SIZE (p13) STFD [YST2] = f17, 2 * SIZE nop __LINE__ } ;; { .mmi (p13) STFD [YST1] = f18, 2 * SIZE (p13) STFD [YST2] = f19 nop __LINE__ } ;; { .mmi (p14) STFD [YST1] = f20, 1 * SIZE ;; (p14) STFD [YST1] = f21, 1 * SIZE nop __LINE__ } ;; { .mib (p15) STFD [YST1] = f22 cmp.lt p11, p12 = r0, J (p11) br.cond.dptk .L11 } ;; .align 16 .L20: tbit.z p6, p0 = N, 2 ;; (p6) br.cond.dpnt .L30 ;; shladd LDA7M8 = LDA, 2, r0 ;; sub LDA7M8 = LDA, LDA7M8 ;; adds LDA7M8 = 8 * SIZE, LDA7M8 ;; mov YLD1 = YY mov YST1 = YY adds YLD2 = 2 * SIZE, YY adds YST2 
= 2 * SIZE, YY ;; LDFD ALPHA1 = [X], INCX ;; LDFD ALPHA2 = [X], INCX ;; LDFD ALPHA3 = [X], INCX ;; LDFD ALPHA4 = [X], INCX ;; FMPY ALPHA1 = ALPHA, ALPHA1 FMPY ALPHA2 = ALPHA, ALPHA2 FMPY ALPHA3 = ALPHA, ALPHA3 FMPY ALPHA4 = ALPHA, ALPHA4 ;; mov AO1 = A adds AO2 = 1 * SIZE, A adds AO3 = 2 * SIZE, A adds AO4 = 3 * SIZE, A adds AO5 = 4 * SIZE, A adds AO6 = 5 * SIZE, A adds AO7 = 6 * SIZE, A adds AO8 = 7 * SIZE, A shladd A = LDA, 2, A ;; shr I = M, 3 mov pr.rot= 0 ;; cmp.eq p16, p0 = r0, r0 ;; adds I = -1, I adds J = -1, J ;; cmp.lt p7, p8 = r0, J tbit.nz p13, p11 = M, 2 tbit.nz p14, p12 = M, 1 mov ar.ec= 1 ;; { .mfi and II = 7, M mov ar.lc = I } { .mfb cmp.eq p6, p0 = -1, I (p6) br.cond.dpnt .L25 } ;; .align 16 .L22: (p16) LDFD f32 = [AO1], LDA (p16) LDFD f34 = [AO3], LDA (p16) LDFD f36 = [AO5], LDA (p16) LDFD f38 = [AO7], LDA ;; (p16) LDFD f33 = [AO2], LDA (p16) LDFD f35 = [AO4], LDA (p16) LDFD f37 = [AO6], LDA (p16) LDFD f39 = [AO8], LDA ;; (p16) LDFD f40 = [AO1], LDA (p16) LDFD f42 = [AO3], LDA (p16) LDFD f44 = [AO5], LDA (p16) LDFD f46 = [AO7], LDA ;; (p16) LDFD f41 = [AO2], LDA (p16) LDFD f43 = [AO4], LDA (p16) LDFD f45 = [AO6], LDA (p16) LDFD f47 = [AO8], LDA ;; (p16) LDFD f48 = [AO1], LDA (p16) LDFD f50 = [AO3], LDA (p16) LDFD f52 = [AO5], LDA (p16) LDFD f54 = [AO7], LDA ;; (p16) LDFD f49 = [AO2], LDA (p16) LDFD f51 = [AO4], LDA (p16) LDFD f53 = [AO6], LDA (p16) LDFD f55 = [AO8], LDA ;; (p16) LDFD f56 = [AO1], LDA7M8 (p16) LDFD f58 = [AO3], LDA7M8 (p16) LDFD f60 = [AO5], LDA7M8 (p16) LDFD f62 = [AO7], LDA7M8 ;; (p16) LDFD f57 = [AO2], LDA7M8 (p16) LDFD f59 = [AO4], LDA7M8 (p16) LDFD f61 = [AO6], LDA7M8 (p16) LDFD f63 = [AO8], LDA7M8 ;; (p16) LDFD f96 = [YLD1], 1 * SIZE (p16) LDFD f98 = [YLD2], 1 * SIZE ;; (p16) LDFD f97 = [YLD1], 3 * SIZE (p16) LDFD f99 = [YLD2], 3 * SIZE ;; (p16) LDFD f100 = [YLD1], 1 * SIZE (p16) LDFD f102 = [YLD2], 1 * SIZE ;; (p16) LDFD f101 = [YLD1], 3 * SIZE (p16) LDFD f103 = [YLD2], 3 * SIZE ;; (p16) FMA f96 = ALPHA1, f32, f96 (p16) FMA f98 = ALPHA1, f34, f98 (p16) FMA f97 = ALPHA1, f33, f97 (p16) FMA f99 = ALPHA1, f35, f99 (p16) FMA f100 = ALPHA1, f36, f100 (p16) FMA f102 = ALPHA1, f38, f102 (p16) FMA f101 = ALPHA1, f37, f101 (p16) FMA f103 = ALPHA1, f39, f103 ;; (p16) FMA f96 = ALPHA2, f40, f96 (p16) FMA f98 = ALPHA2, f42, f98 (p16) FMA f97 = ALPHA2, f41, f97 (p16) FMA f99 = ALPHA2, f43, f99 (p16) FMA f100 = ALPHA2, f44, f100 (p16) FMA f102 = ALPHA2, f46, f102 (p16) FMA f101 = ALPHA2, f45, f101 (p16) FMA f103 = ALPHA2, f47, f103 ;; (p16) FMA f96 = ALPHA3, f48, f96 (p16) FMA f98 = ALPHA3, f50, f98 (p16) FMA f97 = ALPHA3, f49, f97 (p16) FMA f99 = ALPHA3, f51, f99 (p16) FMA f100 = ALPHA3, f52, f100 (p16) FMA f102 = ALPHA3, f54, f102 (p16) FMA f101 = ALPHA3, f53, f101 (p16) FMA f103 = ALPHA3, f55, f103 ;; (p16) FMA f16 = ALPHA4, f56, f96 (p16) FMA f18 = ALPHA4, f58, f98 (p16) FMA f17 = ALPHA4, f57, f97 (p16) FMA f19 = ALPHA4, f59, f99 (p16) FMA f20 = ALPHA4, f60, f100 (p16) FMA f22 = ALPHA4, f62, f102 (p16) FMA f21 = ALPHA4, f61, f101 (p16) FMA f23 = ALPHA4, f63, f103 ;; (p16) STFD [YST1] = f16, 1 * SIZE (p16) STFD [YST2] = f18, 1 * SIZE ;; (p16) STFD [YST1] = f17, 3 * SIZE (p16) STFD [YST2] = f19, 3 * SIZE ;; (p16) STFD [YST1] = f20, 1 * SIZE (p16) STFD [YST2] = f22, 1 * SIZE ;; (p16) STFD [YST1] = f21, 3 * SIZE (p16) STFD [YST2] = f23, 3 * SIZE br.ctop.sptk.few .L22 ;; .align 16 .L25: { .mmi (p8) cmp.eq.unc p10, p0 = r0, II (p11) adds AO5 = - 4 * SIZE, AO5 } { .mbb (p11) adds AO7 = - 4 * SIZE, AO7 (p10) br.cond.dptk .L30 } ;; { .mmi (p13) LDFD f32 = [AO1], LDA 
(p13) LDFD f34 = [AO3], LDA tbit.nz p15, p0 = M, 0 } { .mmi (p14) LDFD f36 = [AO5], LDA (p11) adds AO6 = - 4 * SIZE, AO6 (p12) adds AO7 = - 2 * SIZE, AO7 } ;; (p13) LDFD f33 = [AO2], LDA (p13) LDFD f35 = [AO4], LDA (p14) LDFD f37 = [AO6], LDA (p15) LDFD f38 = [AO7], LDA ;; (p13) LDFD f40 = [AO1], LDA (p13) LDFD f42 = [AO3], LDA (p14) LDFD f44 = [AO5], LDA (p15) LDFD f46 = [AO7], LDA ;; (p13) LDFD f41 = [AO2], LDA (p13) LDFD f43 = [AO4], LDA (p14) LDFD f45 = [AO6], LDA ;; (p13) LDFD f48 = [AO1], LDA (p13) LDFD f50 = [AO3], LDA (p14) LDFD f52 = [AO5], LDA (p15) LDFD f54 = [AO7], LDA ;; (p13) LDFD f49 = [AO2], LDA (p13) LDFD f51 = [AO4], LDA (p14) LDFD f53 = [AO6], LDA ;; (p13) LDFD f56 = [AO1] (p13) LDFD f58 = [AO3] (p14) LDFD f60 = [AO5] (p15) LDFD f62 = [AO7] ;; (p13) LDFD f57 = [AO2] (p13) LDFD f59 = [AO4] (p14) LDFD f61 = [AO6] ;; (p13) LDFD f96 = [YLD1], 1 * SIZE (p13) LDFD f98 = [YLD2], 1 * SIZE ;; (p13) LDFD f97 = [YLD1], 3 * SIZE (p13) LDFD f99 = [YLD2], 3 * SIZE ;; (p14) LDFD f100 = [YLD1], 1 * SIZE ;; (p14) LDFD f101 = [YLD1], 1 * SIZE ;; (p15) LDFD f102 = [YLD1], 1 * SIZE ;; (p13) FMA f96 = ALPHA1, f32, f96 (p13) FMA f98 = ALPHA1, f34, f98 (p13) FMA f97 = ALPHA1, f33, f97 (p13) FMA f99 = ALPHA1, f35, f99 (p14) FMA f100 = ALPHA1, f36, f100 (p15) FMA f102 = ALPHA1, f38, f102 (p14) FMA f101 = ALPHA1, f37, f101 ;; (p13) FMA f96 = ALPHA2, f40, f96 (p13) FMA f98 = ALPHA2, f42, f98 (p13) FMA f97 = ALPHA2, f41, f97 (p13) FMA f99 = ALPHA2, f43, f99 (p14) FMA f100 = ALPHA2, f44, f100 (p15) FMA f102 = ALPHA2, f46, f102 (p14) FMA f101 = ALPHA2, f45, f101 ;; (p13) FMA f96 = ALPHA3, f48, f96 (p13) FMA f98 = ALPHA3, f50, f98 (p13) FMA f97 = ALPHA3, f49, f97 (p13) FMA f99 = ALPHA3, f51, f99 (p14) FMA f100 = ALPHA3, f52, f100 (p15) FMA f102 = ALPHA3, f54, f102 (p14) FMA f101 = ALPHA3, f53, f101 ;; (p13) FMA f16 = ALPHA4, f56, f96 (p13) FMA f18 = ALPHA4, f58, f98 (p13) FMA f17 = ALPHA4, f57, f97 (p13) FMA f19 = ALPHA4, f59, f99 (p14) FMA f20 = ALPHA4, f60, f100 (p15) FMA f22 = ALPHA4, f62, f102 (p14) FMA f21 = ALPHA4, f61, f101 ;; { .mmi (p13) STFD [YST1] = f16, 1 * SIZE (p13) STFD [YST2] = f18, 1 * SIZE nop __LINE__ } ;; { .mmi (p13) STFD [YST1] = f17, 3 * SIZE (p13) STFD [YST2] = f19 nop __LINE__ } ;; { .mmi (p14) STFD [YST1] = f20, 1 * SIZE ;; (p14) STFD [YST1] = f21, 1 * SIZE nop __LINE__ } ;; { .mib (p15) STFD [YST1] = f22 } ;; .align 16 .L30: tbit.z p6, p0 = N, 1 ;; (p6) br.cond.dpnt .L40 ;; shladd LDA7M8 = LDA, 1, r0 ;; sub LDA7M8 = LDA, LDA7M8 ;; adds LDA7M8 = 8 * SIZE, LDA7M8 ;; mov YLD1 = YY mov YST1 = YY adds YLD2 = 2 * SIZE, YY adds YST2 = 2 * SIZE, YY ;; LDFD ALPHA1 = [X], INCX ;; LDFD ALPHA2 = [X], INCX ;; FMPY ALPHA1 = ALPHA, ALPHA1 FMPY ALPHA2 = ALPHA, ALPHA2 ;; mov AO1 = A adds AO2 = 1 * SIZE, A adds AO3 = 2 * SIZE, A adds AO4 = 3 * SIZE, A adds AO5 = 4 * SIZE, A adds AO6 = 5 * SIZE, A adds AO7 = 6 * SIZE, A adds AO8 = 7 * SIZE, A shladd A = LDA, 1, A ;; shr I = M, 3 mov pr.rot= 0 ;; cmp.eq p16, p0 = r0, r0 ;; adds I = -1, I adds J = -1, J ;; cmp.lt p7, p8 = r0, J tbit.nz p13, p11 = M, 2 tbit.nz p14, p12 = M, 1 mov ar.ec= 1 ;; { .mfi and II = 7, M mov ar.lc = I } { .mfb cmp.eq p6, p0 = -1, I (p6) br.cond.dpnt .L35 } ;; .align 16 .L32: (p16) LDFD f32 = [AO1], LDA (p16) LDFD f34 = [AO3], LDA (p16) LDFD f36 = [AO5], LDA (p16) LDFD f38 = [AO7], LDA ;; (p16) LDFD f33 = [AO2], LDA (p16) LDFD f35 = [AO4], LDA (p16) LDFD f37 = [AO6], LDA (p16) LDFD f39 = [AO8], LDA ;; (p16) LDFD f40 = [AO1], LDA7M8 (p16) LDFD f42 = [AO3], LDA7M8 (p16) LDFD f44 = [AO5], LDA7M8 (p16) LDFD f46 = [AO7], LDA7M8 
;; (p16) LDFD f41 = [AO2], LDA7M8 (p16) LDFD f43 = [AO4], LDA7M8 (p16) LDFD f45 = [AO6], LDA7M8 (p16) LDFD f47 = [AO8], LDA7M8 ;; (p16) LDFD f96 = [YLD1], 1 * SIZE (p16) LDFD f98 = [YLD2], 1 * SIZE ;; (p16) LDFD f97 = [YLD1], 3 * SIZE (p16) LDFD f99 = [YLD2], 3 * SIZE ;; (p16) LDFD f100 = [YLD1], 1 * SIZE (p16) LDFD f102 = [YLD2], 1 * SIZE ;; (p16) LDFD f101 = [YLD1], 3 * SIZE (p16) LDFD f103 = [YLD2], 3 * SIZE ;; (p16) FMA f96 = ALPHA1, f32, f96 (p16) FMA f98 = ALPHA1, f34, f98 (p16) FMA f97 = ALPHA1, f33, f97 (p16) FMA f99 = ALPHA1, f35, f99 (p16) FMA f100 = ALPHA1, f36, f100 (p16) FMA f102 = ALPHA1, f38, f102 (p16) FMA f101 = ALPHA1, f37, f101 (p16) FMA f103 = ALPHA1, f39, f103 ;; (p16) FMA f16 = ALPHA2, f40, f96 (p16) FMA f18 = ALPHA2, f42, f98 (p16) FMA f17 = ALPHA2, f41, f97 (p16) FMA f19 = ALPHA2, f43, f99 (p16) FMA f20 = ALPHA2, f44, f100 (p16) FMA f22 = ALPHA2, f46, f102 (p16) FMA f21 = ALPHA2, f45, f101 (p16) FMA f23 = ALPHA2, f47, f103 ;; (p16) STFD [YST1] = f16, 1 * SIZE (p16) STFD [YST2] = f18, 1 * SIZE ;; (p16) STFD [YST1] = f17, 3 * SIZE (p16) STFD [YST2] = f19, 3 * SIZE ;; (p16) STFD [YST1] = f20, 1 * SIZE (p16) STFD [YST2] = f22, 1 * SIZE ;; (p16) STFD [YST1] = f21, 3 * SIZE (p16) STFD [YST2] = f23, 3 * SIZE br.ctop.sptk.few .L32 ;; .align 16 .L35: { .mmi (p8) cmp.eq.unc p10, p0 = r0, II (p11) adds AO5 = - 4 * SIZE, AO5 } { .mbb (p11) adds AO7 = - 4 * SIZE, AO7 (p10) br.cond.dptk .L40 } ;; { .mmi (p13) LDFD f32 = [AO1], LDA (p13) LDFD f34 = [AO3], LDA tbit.nz p15, p0 = M, 0 } { .mmi (p14) LDFD f36 = [AO5], LDA (p11) adds AO6 = - 4 * SIZE, AO6 (p12) adds AO7 = - 2 * SIZE, AO7 } ;; (p13) LDFD f33 = [AO2], LDA (p13) LDFD f35 = [AO4], LDA (p14) LDFD f37 = [AO6], LDA (p15) LDFD f38 = [AO7], LDA ;; (p13) LDFD f40 = [AO1], LDA (p13) LDFD f42 = [AO3], LDA (p14) LDFD f44 = [AO5], LDA (p15) LDFD f46 = [AO7], LDA ;; (p13) LDFD f41 = [AO2] (p13) LDFD f43 = [AO4] (p14) LDFD f45 = [AO6] ;; (p13) LDFD f96 = [YLD1], 1 * SIZE (p13) LDFD f98 = [YLD2], 1 * SIZE ;; (p13) LDFD f97 = [YLD1], 3 * SIZE (p13) LDFD f99 = [YLD2], 3 * SIZE ;; (p14) LDFD f100 = [YLD1], 1 * SIZE ;; (p14) LDFD f101 = [YLD1], 1 * SIZE ;; (p15) LDFD f102 = [YLD1], 1 * SIZE ;; (p13) FMA f96 = ALPHA1, f32, f96 (p13) FMA f98 = ALPHA1, f34, f98 (p13) FMA f97 = ALPHA1, f33, f97 (p13) FMA f99 = ALPHA1, f35, f99 (p14) FMA f100 = ALPHA1, f36, f100 (p15) FMA f102 = ALPHA1, f38, f102 (p14) FMA f101 = ALPHA1, f37, f101 ;; (p13) FMA f16 = ALPHA2, f40, f96 (p13) FMA f18 = ALPHA2, f42, f98 (p13) FMA f17 = ALPHA2, f41, f97 (p13) FMA f19 = ALPHA2, f43, f99 (p14) FMA f20 = ALPHA2, f44, f100 (p15) FMA f22 = ALPHA2, f46, f102 (p14) FMA f21 = ALPHA2, f45, f101 ;; { .mmi (p13) STFD [YST1] = f16, 1 * SIZE (p13) STFD [YST2] = f18, 1 * SIZE nop __LINE__ } ;; { .mmi (p13) STFD [YST1] = f17, 3 * SIZE (p13) STFD [YST2] = f19 nop __LINE__ } ;; { .mmi (p14) STFD [YST1] = f20, 1 * SIZE ;; (p14) STFD [YST1] = f21, 1 * SIZE nop __LINE__ } ;; { .mib (p15) STFD [YST1] = f22 } ;; .align 16 .L40: tbit.z p6, p0 = N, 0 ;; (p6) br.cond.dpnt .L990 ;; mov LDA7M8 = 8 * SIZE ;; mov YLD1 = YY mov YST1 = YY adds YLD2 = 2 * SIZE, YY adds YST2 = 2 * SIZE, YY ;; LDFD ALPHA1 = [X], INCX ;; LDFD ALPHA2 = [X], INCX ;; FMPY ALPHA1 = ALPHA, ALPHA1 FMPY ALPHA2 = ALPHA, ALPHA2 ;; mov AO1 = A adds AO2 = 1 * SIZE, A adds AO3 = 2 * SIZE, A adds AO4 = 3 * SIZE, A adds AO5 = 4 * SIZE, A adds AO6 = 5 * SIZE, A adds AO7 = 6 * SIZE, A adds AO8 = 7 * SIZE, A add A = LDA, A ;; shr I = M, 3 mov pr.rot= 0 ;; cmp.eq p16, p0 = r0, r0 ;; adds I = -1, I adds J = -1, J ;; cmp.lt p7, p8 = r0, 
J tbit.nz p13, p11 = M, 2 tbit.nz p14, p12 = M, 1 mov ar.ec= 1 ;; { .mfi and II = 7, M mov ar.lc = I } { .mfb cmp.eq p6, p0 = -1, I (p6) br.cond.dpnt .L45 } ;; .align 16 .L42: (p16) LDFD f32 = [AO1], 8 * SIZE (p16) LDFD f34 = [AO3], 8 * SIZE (p16) LDFD f36 = [AO5], 8 * SIZE (p16) LDFD f38 = [AO7], 8 * SIZE ;; (p16) LDFD f33 = [AO2], 8 * SIZE (p16) LDFD f35 = [AO4], 8 * SIZE (p16) LDFD f37 = [AO6], 8 * SIZE (p16) LDFD f39 = [AO8], 8 * SIZE ;; (p16) LDFD f96 = [YLD1], 1 * SIZE (p16) LDFD f98 = [YLD2], 1 * SIZE ;; (p16) LDFD f97 = [YLD1], 3 * SIZE (p16) LDFD f99 = [YLD2], 3 * SIZE ;; (p16) LDFD f100 = [YLD1], 1 * SIZE (p16) LDFD f102 = [YLD2], 1 * SIZE ;; (p16) LDFD f101 = [YLD1], 3 * SIZE (p16) LDFD f103 = [YLD2], 3 * SIZE ;; (p16) FMA f16 = ALPHA1, f32, f96 (p16) FMA f18 = ALPHA1, f34, f98 (p16) FMA f17 = ALPHA1, f33, f97 (p16) FMA f19 = ALPHA1, f35, f99 (p16) FMA f20 = ALPHA1, f36, f100 (p16) FMA f22 = ALPHA1, f38, f102 (p16) FMA f21 = ALPHA1, f37, f101 (p16) FMA f23 = ALPHA1, f39, f103 ;; (p16) STFD [YST1] = f16, 1 * SIZE (p16) STFD [YST2] = f18, 1 * SIZE ;; (p16) STFD [YST1] = f17, 3 * SIZE (p16) STFD [YST2] = f19, 3 * SIZE ;; (p16) STFD [YST1] = f20, 1 * SIZE (p16) STFD [YST2] = f22, 1 * SIZE ;; (p16) STFD [YST1] = f21, 3 * SIZE (p16) STFD [YST2] = f23, 3 * SIZE br.ctop.sptk.few .L42 ;; .align 16 .L45: { .mmi (p8) cmp.eq.unc p10, p0 = r0, II (p11) adds AO5 = - 4 * SIZE, AO5 } { .mbb (p11) adds AO7 = - 4 * SIZE, AO7 (p10) br.cond.dptk .L990 } ;; { .mmi (p13) LDFD f32 = [AO1], LDA (p13) LDFD f34 = [AO3], LDA tbit.nz p15, p0 = M, 0 } { .mmi (p14) LDFD f36 = [AO5], LDA (p11) adds AO6 = - 4 * SIZE, AO6 (p12) adds AO7 = - 2 * SIZE, AO7 } ;; (p13) LDFD f33 = [AO2], LDA (p13) LDFD f35 = [AO4], LDA (p14) LDFD f37 = [AO6], LDA (p15) LDFD f38 = [AO7], LDA ;; (p13) LDFD f96 = [YLD1], 1 * SIZE (p13) LDFD f98 = [YLD2], 1 * SIZE ;; (p13) LDFD f97 = [YLD1], 3 * SIZE (p13) LDFD f99 = [YLD2], 3 * SIZE ;; (p14) LDFD f100 = [YLD1], 1 * SIZE ;; (p14) LDFD f101 = [YLD1], 1 * SIZE ;; (p15) LDFD f102 = [YLD1], 1 * SIZE ;; (p13) FMA f16 = ALPHA1, f32, f96 (p13) FMA f18 = ALPHA1, f34, f98 (p13) FMA f17 = ALPHA1, f33, f97 (p13) FMA f19 = ALPHA1, f35, f99 (p14) FMA f20 = ALPHA1, f36, f100 (p15) FMA f22 = ALPHA1, f38, f102 (p14) FMA f21 = ALPHA1, f37, f101 ;; { .mmi (p13) STFD [YST1] = f16, 1 * SIZE (p13) STFD [YST2] = f18, 1 * SIZE nop __LINE__ } ;; { .mmi (p13) STFD [YST1] = f17, 3 * SIZE (p13) STFD [YST2] = f19 nop __LINE__ } ;; { .mmi (p14) STFD [YST1] = f20, 1 * SIZE ;; (p14) STFD [YST1] = f21, 1 * SIZE nop __LINE__ } ;; { .mib (p15) STFD [YST1] = f22 } ;; .align 16 .L990: cmp.eq p10, p0 = SIZE, INCY ;; { .mmi mov YLD1 = YY mov YST1 = Y mov pr.rot= 0 } { .mib mov YST2 = Y shr J = M, 3 (p10) br.cond.dptk .L999 } ;; { .mmi cmp.eq p6, p0 = r0, J adds J = -1, J mov ar.ec = 4 } { .mmi cmp.eq p16, p0 = r0, r0 nop __LINE__ tbit.nz p13, p0 = M, 2 } ;; { .mib nop __LINE__ mov ar.lc = J (p6) br.cond.dpnt .L995 } ;; .L992: { .mfi (p19) STFD [YST2] = f35 (p18) FADD f34 = f34, f66 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f64 = [YLD1], 1 * SIZE (p16) LDFD f32 = [YST1], INCY } ;; { .mfi (p19) STFD [YST2] = f39 (p18) FADD f38 = f38, f70 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f36 = [YST1], INCY (p16) LDFD f68 = [YLD1], 1 * SIZE } ;; { .mfi (p19) STFD [YST2] = f43 (p18) FADD f42 = f42, f74 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f72 = [YLD1], 1 * SIZE (p16) LDFD f40 = [YST1], INCY } ;; { .mfi (p19) STFD [YST2] = f47 (p18) FADD f46 = f46, f78 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f76 = [YLD1], 1 
* SIZE (p16) LDFD f44 = [YST1], INCY } ;; { .mfi (p19) STFD [YST2] = f51 (p18) FADD f50 = f50, f82 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f80 = [YLD1], 1 * SIZE (p16) LDFD f48 = [YST1], INCY } ;; { .mfi (p19) STFD [YST2] = f55 (p18) FADD f54 = f54, f86 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f84 = [YLD1], 1 * SIZE (p16) LDFD f52 = [YST1], INCY } ;; { .mfi (p19) STFD [YST2] = f59 (p18) FADD f58 = f58, f90 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f88 = [YLD1], 1 * SIZE (p16) LDFD f56 = [YST1], INCY } ;; { .mfi (p19) STFD [YST2] = f63 (p18) FADD f62 = f62, f94 (p19) add YST2 = YST2, INCY } { .mmb (p16) LDFD f92 = [YLD1], 1 * SIZE (p16) LDFD f60 = [YST1], INCY br.ctop.sptk.few .L992 } ;; .L995: (p13) LDFD f32 = [YST1], INCY (p13) LDFD f40 = [YLD1], 1 * SIZE tbit.nz p14, p0 = M, 1 ;; (p13) LDFD f33 = [YST1], INCY (p13) LDFD f41 = [YLD1], 1 * SIZE tbit.nz p15, p0 = M, 0 ;; (p13) LDFD f34 = [YST1], INCY (p13) LDFD f42 = [YLD1], 1 * SIZE ;; (p13) LDFD f35 = [YST1], INCY (p13) LDFD f43 = [YLD1], 1 * SIZE ;; (p14) LDFD f36 = [YST1], INCY (p14) LDFD f44 = [YLD1], 1 * SIZE ;; (p14) LDFD f37 = [YST1], INCY (p14) LDFD f45 = [YLD1], 1 * SIZE ;; (p15) LDFD f38 = [YST1], INCY (p15) LDFD f46 = [YLD1], 1 * SIZE ;; (p13) FADD f32 = f32, f40 (p13) FADD f33 = f33, f41 (p13) FADD f34 = f34, f42 (p13) FADD f35 = f35, f43 (p14) FADD f36 = f36, f44 (p14) FADD f37 = f37, f45 (p15) FADD f38 = f38, f46 ;; (p13) STFD [YST2] = f32 (p13) add YST2 = YST2, INCY ;; (p13) STFD [YST2] = f33 (p13) add YST2 = YST2, INCY ;; (p13) STFD [YST2] = f34 (p13) add YST2 = YST2, INCY ;; (p13) STFD [YST2] = f35 (p13) add YST2 = YST2, INCY ;; (p14) STFD [YST2] = f36 (p14) add YST2 = YST2, INCY ;; (p14) STFD [YST2] = f37 (p14) add YST2 = YST2, INCY ;; (p15) STFD [YST2] = f38 ;; .L999: mov ar.lc = ARLC mov pr = PR, -1 br.ret.sptk.many b0 ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/qgemv_t.S000066400000000000000000000662311313527062700167650ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define SP r12 #define M r32 #define N r33 #ifndef XDOUBLE #define A r36 #define LDA r37 #define X1 r38 #define INCX r39 #define Y1 r34 #define INCY r35 #else #define A r38 #define LDA r39 #define X1 r34 #define INCX r35 #define Y1 r36 #define INCY r37 #endif #define BUFFER r11 #define I r15 #define J r16 #define AO1 r17 #define AO2 r18 #define AO3 r19 #define AO4 r20 #define AO5 r21 #define AO6 r22 #define AO7 r23 #define AO8 r24 #define X2 r25 #define Y2 r26 #define LDA7M8 r27 #define INCX5 r28 #define INCY5 r29 #define YY1 r8 #define YY2 r9 #define ARLC r30 #define PR r31 #ifdef DOUBLE #define RPREFETCH (16 * 3 + 8) #else #define RPREFETCH (16 * 3 + 16) #endif #define PREFETCH lfetch.nt1 #define ALPHA f6 PROLOGUE .prologue PROFCODE { .mmi mov ARLC = ar.lc } { .mmi adds r15 = 24, SP adds r14 = 16, SP } ;; #ifdef XDOUBLE ld8 X1 = [r14], 16 ld8 INCX = [r15], 16 ;; #endif ld8 Y1 = [r14], 16 ld8 INCY = [r15], 16 ;; ld8 BUFFER = [r14] ;; mov PR = pr ;; mov ALPHA = f8 .body ;; { .mmi cmp.ge p7, p0 = r0, M cmp.ge p6, p0 = r0, N } ;; { .mmi shladd INCX = INCX, BASE_SHIFT, r0 shladd INCY = INCY, BASE_SHIFT, r0 shladd LDA = LDA, BASE_SHIFT, r0 } ;; { .mbb (p7) br.cond.dpnt .L999 (p6) br.cond.dpnt .L999 } .align 16 ;; shladd INCY5 = INCY, 2, INCY shladd INCX5 = INCX, 2, INCX cmp.eq p10, p0 = SIZE, INCX ;; (p10) mov BUFFER = X1 (p10) br.cond.dptk .L10 ;; mov pr.rot= 0 shladd X2 = INCX, 2, X1 mov YY1 = BUFFER adds YY2 = 4 * SIZE, BUFFER ;; shr I = M, 3 ;; { .mmi adds I = -1, I cmp.eq p16, p0 = r0, r0 mov ar.ec= 5 } ;; { .mmi mov ar.lc = I } { .mib cmp.gt p6, p0 = 0, I tbit.nz p13, p0 = M, 2 (p6) br.cond.dpnt .L05 } ;; .align 16 .L01: (p20) STFD [YY1] = f36, SIZE (p20) STFD [YY2] = f56, SIZE (p16) LDFD f32 = [X1], INCX (p16) LDFD f52 = [X2], INCX ;; (p20) STFD [YY1] = f41, SIZE (p20) STFD [YY2] = f61, SIZE (p16) LDFD f37 = [X1], INCX (p16) LDFD f57 = [X2], INCX ;; (p20) STFD [YY1] = f46, SIZE (p20) STFD [YY2] = f66, SIZE (p16) LDFD f42 = [X1], INCX (p16) LDFD f62 = [X2], INCX ;; (p20) STFD [YY1] = f51, 5 * SIZE (p20) STFD [YY2] = f71, 5 * SIZE (p16) LDFD f47 = [X1], INCX5 (p16) LDFD f67 = [X2], INCX5 br.ctop.sptk.few .L01 ;; .align 16 .L05: (p13) LDFD f32 = [X1], INCX tbit.nz p14, p0 = M, 1 ;; (p13) LDFD f33 = [X1], INCX tbit.nz p15, p0 = M, 0 ;; (p13) LDFD f34 = [X1], INCX ;; (p13) LDFD f35 = [X1], INCX ;; (p14) LDFD f36 = [X1], INCX ;; (p13) STFD [YY1] = f32, SIZE (p14) LDFD f37 = [X1], INCX ;; (p13) STFD [YY1] = f33, SIZE (p15) LDFD f38 = [X1], INCX ;; (p13) STFD [YY1] = f34, SIZE ;; (p13) STFD [YY1] = f35, SIZE ;; (p14) STFD [YY1] = f36, SIZE ;; (p14) STFD [YY1] = f37, SIZE ;; (p15) STFD [YY1] = f38, SIZE ;; .align 16 .L10: mov YY1 = Y1 shladd Y2 = INCY, 2, Y1 shladd YY2 = INCY, 2, Y1 ;; { .mmi nop __LINE__ shr J = N, 3 } ;; { .mib nop __LINE__ cmp.eq p6, p0 = r0, J (p6) br.cond.dpnt .L20 } ;; .align 16 .L11: mov AO1 = A adds AO2 = 1 * SIZE, A adds AO3 = 2 * SIZE, A adds AO4 = 3 * SIZE, A adds AO5 = 4 * SIZE, A adds AO6 = 5 * SIZE, A adds AO7 = 6 * SIZE, A adds AO8 = 7 * SIZE, A shladd A = LDA, 3, A ;; shladd LDA7M8 = LDA, 3, r0 ;; sub LDA7M8 = LDA, LDA7M8 ;; adds LDA7M8 = 8 * SIZE, LDA7M8 ;; mov f8 = f0 mov f9 = f0 mov f10 = f0 mov 
f11 = f0 mov f12 = f0 mov f13 = f0 mov f14 = f0 mov f15 = f0 mov pr.rot= 0 shr I = M, 3 mov ar.ec = 2 ;; mov X1 = BUFFER adds X2 = 2 * SIZE, BUFFER ;; cmp.eq p16, p0 = r0, r0 ;; adds I = -1, I ;; mov ar.lc = I cmp.eq p6, p0 = -1, I (p6) br.cond.dpnt .L15 ;; .align 16 .L12: (p16) LDFD f32 = [AO1], LDA (p16) LDFD f34 = [AO3], LDA (p16) LDFD f36 = [AO5], LDA (p16) LDFD f38 = [AO7], LDA ;; (p16) LDFD f33 = [AO2], LDA (p16) LDFD f35 = [AO4], LDA (p16) LDFD f37 = [AO6], LDA (p16) LDFD f39 = [AO8], LDA ;; (p16) LDFD f40 = [AO1], LDA (p16) LDFD f42 = [AO3], LDA (p16) LDFD f44 = [AO5], LDA (p16) LDFD f46 = [AO7], LDA ;; (p16) LDFD f41 = [AO2], LDA (p16) LDFD f43 = [AO4], LDA (p16) LDFD f45 = [AO6], LDA (p16) LDFD f47 = [AO8], LDA ;; (p16) LDFD f48 = [AO1], LDA (p16) LDFD f50 = [AO3], LDA (p16) LDFD f52 = [AO5], LDA (p16) LDFD f54 = [AO7], LDA ;; (p16) LDFD f49 = [AO2], LDA (p16) LDFD f51 = [AO4], LDA (p16) LDFD f53 = [AO6], LDA (p16) LDFD f55 = [AO8], LDA ;; (p16) LDFD f56 = [AO1], LDA (p16) LDFD f58 = [AO3], LDA (p16) LDFD f60 = [AO5], LDA (p16) LDFD f62 = [AO7], LDA ;; (p16) LDFD f57 = [AO2], LDA (p16) LDFD f59 = [AO4], LDA (p16) LDFD f61 = [AO6], LDA (p16) LDFD f63 = [AO8], LDA ;; (p16) LDFD f64 = [AO1], LDA (p16) LDFD f66 = [AO3], LDA (p16) LDFD f68 = [AO5], LDA (p16) LDFD f70 = [AO7], LDA ;; (p16) LDFD f65 = [AO2], LDA (p16) LDFD f67 = [AO4], LDA (p16) LDFD f69 = [AO6], LDA (p16) LDFD f71 = [AO8], LDA ;; (p16) LDFD f72 = [AO1], LDA (p16) LDFD f74 = [AO3], LDA (p16) LDFD f76 = [AO5], LDA (p16) LDFD f78 = [AO7], LDA ;; (p16) LDFD f73 = [AO2], LDA (p16) LDFD f75 = [AO4], LDA (p16) LDFD f77 = [AO6], LDA (p16) LDFD f79 = [AO8], LDA ;; (p16) LDFD f80 = [AO1], LDA (p16) LDFD f82 = [AO3], LDA (p16) LDFD f84 = [AO5], LDA (p16) LDFD f86 = [AO7], LDA ;; (p16) LDFD f81 = [AO2], LDA (p16) LDFD f83 = [AO4], LDA (p16) LDFD f85 = [AO6], LDA (p16) LDFD f87 = [AO8], LDA ;; (p16) LDFD f88 = [AO1], LDA7M8 (p16) LDFD f90 = [AO3], LDA7M8 (p16) LDFD f92 = [AO5], LDA7M8 (p16) LDFD f94 = [AO7], LDA7M8 ;; (p16) LDFD f89 = [AO2], LDA7M8 (p16) LDFD f91 = [AO4], LDA7M8 (p16) LDFD f93 = [AO6], LDA7M8 (p16) LDFD f95 = [AO8], LDA7M8 ;; (p16) LDFD f96 = [X1], 1 * SIZE (p16) LDFD f98 = [X2], 1 * SIZE ;; (p16) LDFD f97 = [X1], 3 * SIZE (p16) LDFD f99 = [X2], 3 * SIZE ;; (p16) LDFD f100 = [X1], 1 * SIZE (p16) LDFD f102 = [X2], 1 * SIZE ;; (p16) LDFD f101 = [X1], 3 * SIZE (p16) LDFD f103 = [X2], 3 * SIZE ;; (p16) FMA f8 = f96, f32, f8 (p16) FMA f9 = f96, f40, f9 (p16) FMA f10 = f96, f48, f10 (p16) FMA f11 = f96, f56, f11 (p16) FMA f12 = f96, f64, f12 (p16) FMA f13 = f96, f72, f13 (p16) FMA f14 = f96, f80, f14 (p16) FMA f15 = f96, f88, f15 ;; (p16) FMA f8 = f97, f33, f8 (p16) FMA f9 = f97, f41, f9 (p16) FMA f10 = f97, f49, f10 (p16) FMA f11 = f97, f57, f11 (p16) FMA f12 = f97, f65, f12 (p16) FMA f13 = f97, f73, f13 (p16) FMA f14 = f97, f81, f14 (p16) FMA f15 = f97, f89, f15 ;; (p16) FMA f8 = f98, f34, f8 (p16) FMA f9 = f98, f42, f9 (p16) FMA f10 = f98, f50, f10 (p16) FMA f11 = f98, f58, f11 (p16) FMA f12 = f98, f66, f12 (p16) FMA f13 = f98, f74, f13 (p16) FMA f14 = f98, f82, f14 (p16) FMA f15 = f98, f90, f15 ;; (p16) FMA f8 = f99, f35, f8 (p16) FMA f9 = f99, f43, f9 (p16) FMA f10 = f99, f51, f10 (p16) FMA f11 = f99, f59, f11 (p16) FMA f12 = f99, f67, f12 (p16) FMA f13 = f99, f75, f13 (p16) FMA f14 = f99, f83, f14 (p16) FMA f15 = f99, f91, f15 ;; (p16) FMA f8 = f100, f36, f8 (p16) FMA f9 = f100, f44, f9 (p16) FMA f10 = f100, f52, f10 (p16) FMA f11 = f100, f60, f11 (p16) FMA f12 = f100, f68, f12 (p16) FMA f13 = f100, f76, f13 (p16) 
FMA f14 = f100, f84, f14 (p16) FMA f15 = f100, f92, f15 ;; (p16) FMA f8 = f101, f37, f8 (p16) FMA f9 = f101, f45, f9 (p16) FMA f10 = f101, f53, f10 (p16) FMA f11 = f101, f61, f11 (p16) FMA f12 = f101, f69, f12 (p16) FMA f13 = f101, f77, f13 (p16) FMA f14 = f101, f85, f14 (p16) FMA f15 = f101, f93, f15 ;; (p16) FMA f8 = f102, f38, f8 (p16) FMA f9 = f102, f46, f9 (p16) FMA f10 = f102, f54, f10 (p16) FMA f11 = f102, f62, f11 (p16) FMA f12 = f102, f70, f12 (p16) FMA f13 = f102, f78, f13 (p16) FMA f14 = f102, f86, f14 (p16) FMA f15 = f102, f94, f15 ;; (p16) FMA f8 = f103, f39, f8 (p16) FMA f9 = f103, f47, f9 (p16) FMA f10 = f103, f55, f10 (p16) FMA f11 = f103, f63, f11 (p16) FMA f12 = f103, f71, f12 (p16) FMA f13 = f103, f79, f13 (p16) FMA f14 = f103, f87, f14 (p16) FMA f15 = f103, f95, f15 br.ctop.sptk.few .L12 ;; .align 16 .L15: tbit.nz p13, p11 = M, 2 tbit.nz p14, p12 = M, 1 ;; { .mmi (p11) adds AO5 = - 4 * SIZE, AO5 } { .mbb (p11) adds AO7 = - 4 * SIZE, AO7 } ;; { .mmi (p13) LDFD f32 = [AO1], LDA (p13) LDFD f34 = [AO3], LDA tbit.nz p15, p0 = M, 0 } { .mmi (p14) LDFD f36 = [AO5], LDA (p11) adds AO6 = - 4 * SIZE, AO6 (p12) adds AO7 = - 2 * SIZE, AO7 } ;; (p13) LDFD f33 = [AO2], LDA (p13) LDFD f35 = [AO4], LDA (p14) LDFD f37 = [AO6], LDA (p15) LDFD f38 = [AO7], LDA ;; (p13) LDFD f40 = [AO1], LDA (p13) LDFD f42 = [AO3], LDA (p14) LDFD f44 = [AO5], LDA (p15) LDFD f46 = [AO7], LDA ;; (p13) LDFD f41 = [AO2], LDA (p13) LDFD f43 = [AO4], LDA (p14) LDFD f45 = [AO6], LDA ;; (p13) LDFD f48 = [AO1], LDA (p13) LDFD f50 = [AO3], LDA (p14) LDFD f52 = [AO5], LDA (p15) LDFD f54 = [AO7], LDA ;; (p13) LDFD f49 = [AO2], LDA (p13) LDFD f51 = [AO4], LDA (p14) LDFD f53 = [AO6], LDA ;; (p13) LDFD f56 = [AO1], LDA (p13) LDFD f58 = [AO3], LDA (p14) LDFD f60 = [AO5], LDA (p15) LDFD f62 = [AO7], LDA ;; (p13) LDFD f57 = [AO2], LDA (p13) LDFD f59 = [AO4], LDA (p14) LDFD f61 = [AO6], LDA ;; (p13) LDFD f64 = [AO1], LDA (p13) LDFD f66 = [AO3], LDA (p14) LDFD f68 = [AO5], LDA (p15) LDFD f70 = [AO7], LDA ;; (p13) LDFD f65 = [AO2], LDA (p13) LDFD f67 = [AO4], LDA (p14) LDFD f69 = [AO6], LDA ;; (p13) LDFD f72 = [AO1], LDA (p13) LDFD f74 = [AO3], LDA (p14) LDFD f76 = [AO5], LDA (p15) LDFD f78 = [AO7], LDA ;; (p13) LDFD f73 = [AO2], LDA (p13) LDFD f75 = [AO4], LDA (p14) LDFD f77 = [AO6], LDA ;; (p13) LDFD f80 = [AO1], LDA (p13) LDFD f82 = [AO3], LDA (p14) LDFD f84 = [AO5], LDA (p15) LDFD f86 = [AO7], LDA ;; (p13) LDFD f81 = [AO2], LDA (p13) LDFD f83 = [AO4], LDA (p14) LDFD f85 = [AO6], LDA ;; (p13) LDFD f88 = [AO1] (p13) LDFD f90 = [AO3] (p14) LDFD f92 = [AO5] (p15) LDFD f94 = [AO7] ;; (p13) LDFD f89 = [AO2] (p13) LDFD f91 = [AO4] (p14) LDFD f93 = [AO6] ;; (p13) LDFD f96 = [X1], 1 * SIZE (p13) LDFD f98 = [X2], 1 * SIZE ;; (p13) LDFD f97 = [X1], 3 * SIZE (p13) LDFD f99 = [X2], 3 * SIZE ;; (p14) LDFD f100 = [X1], 1 * SIZE ;; (p14) LDFD f101 = [X1], 1 * SIZE ;; (p15) LDFD f102 = [X1], 1 * SIZE ;; (p13) FMA f8 = f96, f32, f8 (p13) FMA f9 = f96, f40, f9 (p13) FMA f10 = f96, f48, f10 (p13) FMA f11 = f96, f56, f11 (p13) FMA f12 = f96, f64, f12 (p13) FMA f13 = f96, f72, f13 (p13) FMA f14 = f96, f80, f14 (p13) FMA f15 = f96, f88, f15 ;; (p13) FMA f8 = f97, f33, f8 (p13) FMA f9 = f97, f41, f9 (p13) FMA f10 = f97, f49, f10 (p13) FMA f11 = f97, f57, f11 (p13) FMA f12 = f97, f65, f12 (p13) FMA f13 = f97, f73, f13 (p13) FMA f14 = f97, f81, f14 (p13) FMA f15 = f97, f89, f15 ;; (p13) FMA f8 = f98, f34, f8 (p13) FMA f9 = f98, f42, f9 (p13) FMA f10 = f98, f50, f10 (p13) FMA f11 = f98, f58, f11 (p13) FMA f12 = f98, f66, f12 (p13) FMA f13 = f98, 
f74, f13 (p13) FMA f14 = f98, f82, f14 (p13) FMA f15 = f98, f90, f15 ;; (p13) FMA f8 = f99, f35, f8 (p13) FMA f9 = f99, f43, f9 (p13) FMA f10 = f99, f51, f10 (p13) FMA f11 = f99, f59, f11 (p13) FMA f12 = f99, f67, f12 (p13) FMA f13 = f99, f75, f13 (p13) FMA f14 = f99, f83, f14 (p13) FMA f15 = f99, f91, f15 ;; (p14) FMA f8 = f100, f36, f8 (p14) FMA f9 = f100, f44, f9 (p14) FMA f10 = f100, f52, f10 (p14) FMA f11 = f100, f60, f11 (p14) FMA f12 = f100, f68, f12 (p14) FMA f13 = f100, f76, f13 (p14) FMA f14 = f100, f84, f14 (p14) FMA f15 = f100, f92, f15 ;; (p14) FMA f8 = f101, f37, f8 (p14) FMA f9 = f101, f45, f9 (p14) FMA f10 = f101, f53, f10 (p14) FMA f11 = f101, f61, f11 (p14) FMA f12 = f101, f69, f12 (p14) FMA f13 = f101, f77, f13 (p14) FMA f14 = f101, f85, f14 (p14) FMA f15 = f101, f93, f15 ;; (p15) FMA f8 = f102, f38, f8 (p15) FMA f9 = f102, f46, f9 (p15) FMA f10 = f102, f54, f10 (p15) FMA f11 = f102, f62, f11 (p15) FMA f12 = f102, f70, f12 (p15) FMA f13 = f102, f78, f13 (p15) FMA f14 = f102, f86, f14 (p15) FMA f15 = f102, f94, f15 ;; LDFD f32 = [Y1], INCY ;; LDFD f33 = [Y1], INCY ;; LDFD f34 = [Y1], INCY ;; LDFD f35 = [Y1], INCY5 ;; LDFD f36 = [Y2], INCY ;; LDFD f37 = [Y2], INCY ;; LDFD f38 = [Y2], INCY ;; LDFD f39 = [Y2], INCY5 ;; FMA f32 = ALPHA, f8, f32 FMA f33 = ALPHA, f9, f33 FMA f34 = ALPHA, f10, f34 FMA f35 = ALPHA, f11, f35 FMA f36 = ALPHA, f12, f36 FMA f37 = ALPHA, f13, f37 FMA f38 = ALPHA, f14, f38 FMA f39 = ALPHA, f15, f39 ;; STFD [YY1] = f32 add YY1 = YY1, INCY ;; STFD [YY1] = f33 add YY1 = YY1, INCY ;; STFD [YY1] = f34 add YY1 = YY1, INCY ;; STFD [YY1] = f35 add YY1 = YY1, INCY5 ;; STFD [YY2] = f36 add YY2 = YY2, INCY ;; STFD [YY2] = f37 add YY2 = YY2, INCY ;; STFD [YY2] = f38 add YY2 = YY2, INCY ;; STFD [YY2] = f39 add YY2 = YY2, INCY5 ;; adds J = -1, J ;; cmp.lt p6, p0 = 0, J (p6) br.cond.dptk .L11 ;; .align 16 .L20: tbit.z p6, p0 = N, 2 ;; (p6) br.cond.dpnt .L30 ;; mov AO1 = A adds AO2 = 1 * SIZE, A adds AO3 = 2 * SIZE, A adds AO4 = 3 * SIZE, A adds AO5 = 4 * SIZE, A adds AO6 = 5 * SIZE, A adds AO7 = 6 * SIZE, A adds AO8 = 7 * SIZE, A shladd A = LDA, 2, A ;; shladd LDA7M8 = LDA, 2, r0 ;; sub LDA7M8 = LDA, LDA7M8 ;; adds LDA7M8 = 8 * SIZE, LDA7M8 ;; mov f8 = f0 mov f9 = f0 mov f10 = f0 mov f11 = f0 mov f12 = f0 mov f13 = f0 mov f14 = f0 mov f15 = f0 mov pr.rot= 0 shr I = M, 3 mov ar.ec = 2 ;; mov X1 = BUFFER adds X2 = 2 * SIZE, BUFFER ;; cmp.eq p16, p0 = r0, r0 ;; adds I = -1, I ;; mov ar.lc = I cmp.eq p6, p0 = -1, I (p6) br.cond.dpnt .L25 ;; .align 16 .L22: (p16) LDFD f32 = [AO1], LDA (p16) LDFD f34 = [AO3], LDA (p16) LDFD f36 = [AO5], LDA (p16) LDFD f38 = [AO7], LDA ;; (p16) LDFD f33 = [AO2], LDA (p16) LDFD f35 = [AO4], LDA (p16) LDFD f37 = [AO6], LDA (p16) LDFD f39 = [AO8], LDA ;; (p16) LDFD f40 = [AO1], LDA (p16) LDFD f42 = [AO3], LDA (p16) LDFD f44 = [AO5], LDA (p16) LDFD f46 = [AO7], LDA ;; (p16) LDFD f41 = [AO2], LDA (p16) LDFD f43 = [AO4], LDA (p16) LDFD f45 = [AO6], LDA (p16) LDFD f47 = [AO8], LDA ;; (p16) LDFD f48 = [AO1], LDA (p16) LDFD f50 = [AO3], LDA (p16) LDFD f52 = [AO5], LDA (p16) LDFD f54 = [AO7], LDA ;; (p16) LDFD f49 = [AO2], LDA (p16) LDFD f51 = [AO4], LDA (p16) LDFD f53 = [AO6], LDA (p16) LDFD f55 = [AO8], LDA ;; (p16) LDFD f56 = [AO1], LDA7M8 (p16) LDFD f58 = [AO3], LDA7M8 (p16) LDFD f60 = [AO5], LDA7M8 (p16) LDFD f62 = [AO7], LDA7M8 ;; (p16) LDFD f57 = [AO2], LDA7M8 (p16) LDFD f59 = [AO4], LDA7M8 (p16) LDFD f61 = [AO6], LDA7M8 (p16) LDFD f63 = [AO8], LDA7M8 ;; (p16) LDFD f96 = [X1], 1 * SIZE (p16) LDFD f98 = [X2], 1 * SIZE ;; (p16) LDFD f97 = [X1], 
3 * SIZE (p16) LDFD f99 = [X2], 3 * SIZE ;; (p16) LDFD f100 = [X1], 1 * SIZE (p16) LDFD f102 = [X2], 1 * SIZE ;; (p16) LDFD f101 = [X1], 3 * SIZE (p16) LDFD f103 = [X2], 3 * SIZE ;; (p16) FMA f8 = f96, f32, f8 (p16) FMA f9 = f96, f40, f9 (p16) FMA f10 = f96, f48, f10 (p16) FMA f11 = f96, f56, f11 ;; (p16) FMA f8 = f97, f33, f8 (p16) FMA f9 = f97, f41, f9 (p16) FMA f10 = f97, f49, f10 (p16) FMA f11 = f97, f57, f11 ;; (p16) FMA f8 = f98, f34, f8 (p16) FMA f9 = f98, f42, f9 (p16) FMA f10 = f98, f50, f10 (p16) FMA f11 = f98, f58, f11 ;; (p16) FMA f8 = f99, f35, f8 (p16) FMA f9 = f99, f43, f9 (p16) FMA f10 = f99, f51, f10 (p16) FMA f11 = f99, f59, f11 ;; (p16) FMA f8 = f100, f36, f8 (p16) FMA f9 = f100, f44, f9 (p16) FMA f10 = f100, f52, f10 (p16) FMA f11 = f100, f60, f11 ;; (p16) FMA f8 = f101, f37, f8 (p16) FMA f9 = f101, f45, f9 (p16) FMA f10 = f101, f53, f10 (p16) FMA f11 = f101, f61, f11 ;; (p16) FMA f8 = f102, f38, f8 (p16) FMA f9 = f102, f46, f9 (p16) FMA f10 = f102, f54, f10 (p16) FMA f11 = f102, f62, f11 ;; (p16) FMA f8 = f103, f39, f8 (p16) FMA f9 = f103, f47, f9 (p16) FMA f10 = f103, f55, f10 (p16) FMA f11 = f103, f63, f11 br.ctop.sptk.few .L22 ;; .align 16 .L25: tbit.nz p13, p11 = M, 2 tbit.nz p14, p12 = M, 1 ;; { .mmi (p11) adds AO5 = - 4 * SIZE, AO5 } { .mbb (p11) adds AO7 = - 4 * SIZE, AO7 } ;; { .mmi (p13) LDFD f32 = [AO1], LDA (p13) LDFD f34 = [AO3], LDA tbit.nz p15, p0 = M, 0 } { .mmi (p14) LDFD f36 = [AO5], LDA (p11) adds AO6 = - 4 * SIZE, AO6 (p12) adds AO7 = - 2 * SIZE, AO7 } ;; (p13) LDFD f33 = [AO2], LDA (p13) LDFD f35 = [AO4], LDA (p14) LDFD f37 = [AO6], LDA (p15) LDFD f38 = [AO7], LDA ;; (p13) LDFD f40 = [AO1], LDA (p13) LDFD f42 = [AO3], LDA (p14) LDFD f44 = [AO5], LDA (p15) LDFD f46 = [AO7], LDA ;; (p13) LDFD f41 = [AO2], LDA (p13) LDFD f43 = [AO4], LDA (p14) LDFD f45 = [AO6], LDA ;; (p13) LDFD f48 = [AO1], LDA (p13) LDFD f50 = [AO3], LDA (p14) LDFD f52 = [AO5], LDA (p15) LDFD f54 = [AO7], LDA ;; (p13) LDFD f49 = [AO2], LDA (p13) LDFD f51 = [AO4], LDA (p14) LDFD f53 = [AO6], LDA ;; (p13) LDFD f56 = [AO1] (p13) LDFD f58 = [AO3] (p14) LDFD f60 = [AO5] (p15) LDFD f62 = [AO7] ;; (p13) LDFD f57 = [AO2] (p13) LDFD f59 = [AO4] (p14) LDFD f61 = [AO6] ;; (p13) LDFD f96 = [X1], 1 * SIZE (p13) LDFD f98 = [X2], 1 * SIZE ;; (p13) LDFD f97 = [X1], 3 * SIZE (p13) LDFD f99 = [X2], 3 * SIZE ;; (p14) LDFD f100 = [X1], 1 * SIZE ;; (p14) LDFD f101 = [X1], 1 * SIZE ;; (p15) LDFD f102 = [X1], 1 * SIZE ;; (p13) FMA f8 = f96, f32, f8 (p13) FMA f9 = f96, f40, f9 (p13) FMA f10 = f96, f48, f10 (p13) FMA f11 = f96, f56, f11 ;; (p13) FMA f8 = f97, f33, f8 (p13) FMA f9 = f97, f41, f9 (p13) FMA f10 = f97, f49, f10 (p13) FMA f11 = f97, f57, f11 ;; (p13) FMA f8 = f98, f34, f8 (p13) FMA f9 = f98, f42, f9 (p13) FMA f10 = f98, f50, f10 (p13) FMA f11 = f98, f58, f11 ;; (p13) FMA f8 = f99, f35, f8 (p13) FMA f9 = f99, f43, f9 (p13) FMA f10 = f99, f51, f10 (p13) FMA f11 = f99, f59, f11 ;; (p14) FMA f8 = f100, f36, f8 (p14) FMA f9 = f100, f44, f9 (p14) FMA f10 = f100, f52, f10 (p14) FMA f11 = f100, f60, f11 ;; (p14) FMA f8 = f101, f37, f8 (p14) FMA f9 = f101, f45, f9 (p14) FMA f10 = f101, f53, f10 (p14) FMA f11 = f101, f61, f11 ;; (p15) FMA f8 = f102, f38, f8 (p15) FMA f9 = f102, f46, f9 (p15) FMA f10 = f102, f54, f10 (p15) FMA f11 = f102, f62, f11 ;; LDFD f32 = [Y1], INCY ;; LDFD f33 = [Y1], INCY ;; LDFD f34 = [Y1], INCY ;; LDFD f35 = [Y1], INCY ;; FMA f32 = ALPHA, f8, f32 FMA f33 = ALPHA, f9, f33 FMA f34 = ALPHA, f10, f34 FMA f35 = ALPHA, f11, f35 ;; STFD [YY1] = f32 add YY1 = YY1, INCY ;; STFD [YY1] = f33 
add YY1 = YY1, INCY ;; STFD [YY1] = f34 add YY1 = YY1, INCY ;; STFD [YY1] = f35 add YY1 = YY1, INCY ;; .align 16 .L30: tbit.z p6, p0 = N, 1 ;; (p6) br.cond.dpnt .L40 ;; mov AO1 = A adds AO2 = 1 * SIZE, A adds AO3 = 2 * SIZE, A adds AO4 = 3 * SIZE, A adds AO5 = 4 * SIZE, A adds AO6 = 5 * SIZE, A adds AO7 = 6 * SIZE, A adds AO8 = 7 * SIZE, A shladd A = LDA, 1, A ;; shladd LDA7M8 = LDA, 1, r0 ;; sub LDA7M8 = LDA, LDA7M8 ;; adds LDA7M8 = 8 * SIZE, LDA7M8 ;; mov f8 = f0 mov f9 = f0 mov f10 = f0 mov f11 = f0 mov f12 = f0 mov f13 = f0 mov f14 = f0 mov f15 = f0 mov pr.rot= 0 shr I = M, 3 mov ar.ec = 2 ;; mov X1 = BUFFER adds X2 = 2 * SIZE, BUFFER ;; cmp.eq p16, p0 = r0, r0 ;; adds I = -1, I ;; mov ar.lc = I cmp.eq p6, p0 = -1, I (p6) br.cond.dpnt .L35 ;; .align 16 .L32: (p16) LDFD f32 = [AO1], LDA (p16) LDFD f34 = [AO3], LDA (p16) LDFD f36 = [AO5], LDA (p16) LDFD f38 = [AO7], LDA ;; (p16) LDFD f33 = [AO2], LDA (p16) LDFD f35 = [AO4], LDA (p16) LDFD f37 = [AO6], LDA (p16) LDFD f39 = [AO8], LDA ;; (p16) LDFD f40 = [AO1], LDA7M8 (p16) LDFD f42 = [AO3], LDA7M8 (p16) LDFD f44 = [AO5], LDA7M8 (p16) LDFD f46 = [AO7], LDA7M8 ;; (p16) LDFD f41 = [AO2], LDA7M8 (p16) LDFD f43 = [AO4], LDA7M8 (p16) LDFD f45 = [AO6], LDA7M8 (p16) LDFD f47 = [AO8], LDA7M8 ;; (p16) LDFD f96 = [X1], 1 * SIZE (p16) LDFD f98 = [X2], 1 * SIZE ;; (p16) LDFD f97 = [X1], 3 * SIZE (p16) LDFD f99 = [X2], 3 * SIZE ;; (p16) LDFD f100 = [X1], 1 * SIZE (p16) LDFD f102 = [X2], 1 * SIZE ;; (p16) LDFD f101 = [X1], 3 * SIZE (p16) LDFD f103 = [X2], 3 * SIZE ;; (p16) FMA f8 = f96, f32, f8 (p16) FMA f9 = f96, f40, f9 ;; (p16) FMA f8 = f97, f33, f8 (p16) FMA f9 = f97, f41, f9 ;; (p16) FMA f8 = f98, f34, f8 (p16) FMA f9 = f98, f42, f9 ;; (p16) FMA f8 = f99, f35, f8 (p16) FMA f9 = f99, f43, f9 ;; (p16) FMA f8 = f100, f36, f8 (p16) FMA f9 = f100, f44, f9 ;; (p16) FMA f8 = f101, f37, f8 (p16) FMA f9 = f101, f45, f9 ;; (p16) FMA f8 = f102, f38, f8 (p16) FMA f9 = f102, f46, f9 ;; (p16) FMA f8 = f103, f39, f8 (p16) FMA f9 = f103, f47, f9 br.ctop.sptk.few .L32 ;; .align 16 .L35: tbit.nz p13, p11 = M, 2 tbit.nz p14, p12 = M, 1 ;; { .mmi (p11) adds AO5 = - 4 * SIZE, AO5 } { .mbb (p11) adds AO7 = - 4 * SIZE, AO7 } ;; { .mmi (p13) LDFD f32 = [AO1], LDA (p13) LDFD f34 = [AO3], LDA tbit.nz p15, p0 = M, 0 } { .mmi (p14) LDFD f36 = [AO5], LDA (p11) adds AO6 = - 4 * SIZE, AO6 (p12) adds AO7 = - 2 * SIZE, AO7 } ;; (p13) LDFD f33 = [AO2], LDA (p13) LDFD f35 = [AO4], LDA (p14) LDFD f37 = [AO6], LDA (p15) LDFD f38 = [AO7], LDA ;; (p13) LDFD f40 = [AO1] (p13) LDFD f42 = [AO3] (p14) LDFD f44 = [AO5] (p15) LDFD f46 = [AO7] ;; (p13) LDFD f41 = [AO2] (p13) LDFD f43 = [AO4] (p14) LDFD f45 = [AO6] ;; (p13) LDFD f96 = [X1], 1 * SIZE (p13) LDFD f98 = [X2], 1 * SIZE ;; (p13) LDFD f97 = [X1], 3 * SIZE (p13) LDFD f99 = [X2], 3 * SIZE ;; (p14) LDFD f100 = [X1], 1 * SIZE ;; (p14) LDFD f101 = [X1], 1 * SIZE ;; (p15) LDFD f102 = [X1], 1 * SIZE ;; (p13) FMA f8 = f96, f32, f8 (p13) FMA f9 = f96, f40, f9 ;; (p13) FMA f8 = f97, f33, f8 (p13) FMA f9 = f97, f41, f9 ;; (p13) FMA f8 = f98, f34, f8 (p13) FMA f9 = f98, f42, f9 ;; (p13) FMA f8 = f99, f35, f8 (p13) FMA f9 = f99, f43, f9 ;; (p14) FMA f8 = f100, f36, f8 (p14) FMA f9 = f100, f44, f9 ;; (p14) FMA f8 = f101, f37, f8 (p14) FMA f9 = f101, f45, f9 ;; (p15) FMA f8 = f102, f38, f8 (p15) FMA f9 = f102, f46, f9 ;; LDFD f32 = [Y1], INCY ;; LDFD f33 = [Y1], INCY ;; FMA f32 = ALPHA, f8, f32 FMA f33 = ALPHA, f9, f33 ;; STFD [YY1] = f32 add YY1 = YY1, INCY ;; STFD [YY1] = f33 add YY1 = YY1, INCY ;; .align 16 .L40: tbit.z p6, p0 = N, 0 ;; (p6) 
br.cond.dpnt .L999 ;; mov AO1 = A adds AO2 = 1 * SIZE, A adds AO3 = 2 * SIZE, A adds AO4 = 3 * SIZE, A adds AO5 = 4 * SIZE, A adds AO6 = 5 * SIZE, A adds AO7 = 6 * SIZE, A adds AO8 = 7 * SIZE, A add A = LDA, A ;; mov f8 = f0 mov f9 = f0 mov f10 = f0 mov f11 = f0 mov f12 = f0 mov f13 = f0 mov f14 = f0 mov f15 = f0 mov pr.rot= 0 shr I = M, 3 mov ar.ec = 2 ;; mov X1 = BUFFER adds X2 = 2 * SIZE, BUFFER ;; cmp.eq p16, p0 = r0, r0 ;; adds I = -1, I ;; mov ar.lc = I cmp.eq p6, p0 = -1, I (p6) br.cond.dpnt .L45 ;; .align 16 .L42: (p16) LDFD f32 = [AO1], 8 * SIZE (p16) LDFD f34 = [AO3], 8 * SIZE (p16) LDFD f36 = [AO5], 8 * SIZE (p16) LDFD f38 = [AO7], 8 * SIZE ;; (p16) LDFD f33 = [AO2], 8 * SIZE (p16) LDFD f35 = [AO4], 8 * SIZE (p16) LDFD f37 = [AO6], 8 * SIZE (p16) LDFD f39 = [AO8], 8 * SIZE ;; (p16) LDFD f96 = [X1], 1 * SIZE (p16) LDFD f98 = [X2], 1 * SIZE ;; (p16) LDFD f97 = [X1], 3 * SIZE (p16) LDFD f99 = [X2], 3 * SIZE ;; (p16) LDFD f100 = [X1], 1 * SIZE (p16) LDFD f102 = [X2], 1 * SIZE ;; (p16) LDFD f101 = [X1], 3 * SIZE (p16) LDFD f103 = [X2], 3 * SIZE ;; (p16) FMA f8 = f96, f32, f8 ;; (p16) FMA f8 = f97, f33, f8 ;; (p16) FMA f8 = f98, f34, f8 ;; (p16) FMA f8 = f99, f35, f8 ;; (p16) FMA f8 = f100, f36, f8 ;; (p16) FMA f8 = f101, f37, f8 ;; (p16) FMA f8 = f102, f38, f8 ;; (p16) FMA f8 = f103, f39, f8 br.ctop.sptk.few .L42 ;; .align 16 .L45: tbit.nz p13, p11 = M, 2 tbit.nz p14, p12 = M, 1 ;; { .mmi (p11) adds AO5 = - 4 * SIZE, AO5 } { .mbb (p11) adds AO7 = - 4 * SIZE, AO7 } ;; { .mmi (p13) LDFD f32 = [AO1] (p13) LDFD f34 = [AO3] tbit.nz p15, p0 = M, 0 } { .mmi (p14) LDFD f36 = [AO5] (p11) adds AO6 = - 4 * SIZE, AO6 (p12) adds AO7 = - 2 * SIZE, AO7 } ;; (p13) LDFD f33 = [AO2] (p13) LDFD f35 = [AO4] (p14) LDFD f37 = [AO6] (p15) LDFD f38 = [AO7] ;; (p13) LDFD f96 = [X1], 1 * SIZE (p13) LDFD f98 = [X2], 1 * SIZE ;; (p13) LDFD f97 = [X1], 3 * SIZE (p13) LDFD f99 = [X2], 3 * SIZE ;; (p14) LDFD f100 = [X1], 1 * SIZE ;; (p14) LDFD f101 = [X1], 1 * SIZE ;; (p15) LDFD f102 = [X1], 1 * SIZE ;; (p13) FMA f8 = f96, f32, f8 ;; (p13) FMA f8 = f97, f33, f8 ;; (p13) FMA f8 = f98, f34, f8 ;; (p13) FMA f8 = f99, f35, f8 ;; (p14) FMA f8 = f100, f36, f8 ;; (p14) FMA f8 = f101, f37, f8 ;; (p15) FMA f8 = f102, f38, f8 ;; LDFD f32 = [Y1], INCY ;; FMA f32 = ALPHA, f8, f32 ;; STFD [YY1] = f32 .align 16 .L999: mov ar.lc = ARLC mov pr = PR, -1 br.ret.sptk.many b0 ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/qscal.S000066400000000000000000000265331313527062700164270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCH_SIZE (16 * 16) #define ALPHA f8 #define N r32 #define X1 r38 #define INCX r39 #define X2 r14 #define Y1 r15 #define Y2 r16 #define PRE1 r17 #define I r18 #define NAND15 r19 #define INCX5 r20 #define INCX8 r21 #define XX r22 #define PR r30 #define ARLC r31 PROLOGUE .prologue PROFCODE { .mfi shladd INCX = INCX, BASE_SHIFT, r0 fcmp.eq p0, p6 = ALPHA, f0 .save ar.lc, ARLC mov ARLC = ar.lc } .body { .mib cmp.ge p7, p0 = 0, N (p7) br.ret.sptk.many b0 } ;; { .mmi mov XX = X1 mov PR = pr } { .mmi shladd INCX5 = INCX, 2, INCX shladd INCX8 = INCX, 3, r0 } ;; { .mmi shladd X2 = INCX, 2, X1 nop.m 0 mov ar.ec = 5 } { .mmi and NAND15 = 15, N nop.m 0 shr I = N, 4 } ;; { .mmi adds I = -1, I nop.m 0 tbit.z p0, p12 = N, 3 } { .mmb cmp.ge p9, p0 = 0, NAND15 nop.m 0 (p6) br.cond.dptk .L100 // if (alpha != 0) goto L3 } ;; { .mmi adds PRE1 = (PREFETCH_SIZE + 4) * SIZE, X1 mov ar.lc = I } { .mmb cmp.gt p8, p0 = 0, I (p8) br.cond.dpnt .L30 } ;; .align 32 .L20: {.mmi STFD [X1] = f0 STFD [X2] = f0 nop.i 0 } {.mmi lfetch.excl.nt1 [PRE1], INCX8 add X1 = INCX, X1 add X2 = INCX, X2 } ;; {.mmi STFD [X1] = f0 STFD [X2] = f0 nop.i 0 } {.mmi add X1 = INCX, X1 add X2 = INCX, X2 nop.i 0 } ;; {.mmi STFD [X1] = f0 STFD [X2] = f0 nop.i 0 } {.mmi add X1 = INCX, X1 add X2 = INCX, X2 nop.i 0 } ;; {.mmi STFD [X1] = f0 STFD [X2] = f0 nop.i 0 } {.mmi add X1 = INCX5, X1 add X2 = INCX5, X2 nop.i 0 } ;; {.mmi STFD [X1] = f0 STFD [X2] = f0 nop.i 0 } {.mmi lfetch.excl.nt1 [PRE1], INCX8 add X1 = INCX, X1 add X2 = INCX, X2 } ;; {.mmi STFD [X1] = f0 STFD [X2] = f0 nop.i 0 } {.mmi add X1 = INCX, X1 add X2 = INCX, X2 nop.i 0 } ;; {.mmi STFD [X1] = f0 STFD [X2] = f0 nop.i 0 } {.mmi add X1 = INCX, X1 add X2 = INCX, X2 nop.i 0 } ;; {.mmi STFD [X1] = f0 STFD [X2] = f0 nop.i 0 } {.mmb add X1 = INCX5, X1 add X2 = INCX5, X2 br.cloop.sptk.few .L20 } ;; .align 16 .L30: { .mmi (p12) STFD [X1] = f0 (p12) STFD [X2] = f0 mov ar.lc = ARLC } { .mmb (p12) add X1 = INCX, X1 (p12) add X2 = INCX, X2 (p9) br.ret.sptk.many b0 } ;; { .mmi (p12) STFD [X1] = f0 (p12) add X1 = INCX, X1 tbit.z p0, p13 = N, 2 } { .mmi (p12) STFD [X2] = f0 (p12) add X2 = INCX, X2 tbit.z p0, p14 = N, 1 } ;; { .mmi (p12) STFD [X1] = f0 (p12) add X1 = INCX, X1 tbit.z p0, p15 = N, 0 } { .mmb (p12) STFD [X2] = f0 (p12) add X2 = INCX, X2 nop __LINE__ } ;; { .mmb (p12) STFD [X1] = f0 (p12) add X1 = INCX5, X1 nop __LINE__ } { .mmb (p12) STFD [X2] = f0 (p12) add X2 = INCX5, X2 nop __LINE__ } ;; { .mmb (p13) STFD [X1] = f0 (p13) add X1 = INCX, X1 nop __LINE__ } ;; { .mmb (p13) STFD [X1] = f0 (p13) add X1 = INCX, X1 nop __LINE__ } ;; { .mmb (p13) 
STFD [X1] = f0 (p13) add X1 = INCX, X1 nop __LINE__ } ;; { .mmb (p13) STFD [X1] = f0 (p13) add X1 = INCX, X1 nop __LINE__ } ;; { .mmb (p14) STFD [X1] = f0 (p14) add X1 = INCX, X1 nop __LINE__ } ;; { .mmb (p14) STFD [X1] = f0 (p14) add X1 = INCX, X1 nop __LINE__ } ;; { .mmb (p15) STFD [X1] = f0 nop.m 0 br.ret.sptk.many b0 } ;; .align 32 .L100: { .mmi mov Y1 = X1 shladd Y2 = INCX, 2, X1 mov pr.rot= 0 } ;; { .mmi mov ar.lc = I } cmp.eq p16, p0 = r0, r0 ;; { .mmi adds PRE1 = (PREFETCH_SIZE + 4) * SIZE, X1 nop.m 0 mov.i ar.ec = 6 } { .mmb cmp.gt p8, p0 = 0, I nop.m 0 (p8) br.cond.dpnt .L320 } ;; .align 32 .L310: { .mmf (p16) lfetch.excl.nt1 [PRE1], INCX8 (p22) STFD [Y1] = f12 (p21) FMPY f6 = ALPHA, f37 } { .mmi (p16) LDFD f32 = [X1], INCX nop __LINE__ (p22) add Y1 = INCX, Y1 } ;; { .mmf (p22) STFD [Y1] = f13 (p16) LDFD f38 = [X1], INCX (p21) FMPY f7 = ALPHA, f43 } { .mmi nop __LINE__ nop __LINE__ (p22) add Y1 = INCX, Y1 } ;; { .mmf (p22) STFD [Y1] = f14 (p16) LDFD f44 = [X1], INCX (p21) FMPY f10 = ALPHA, f49 } { .mmi nop __LINE__ nop __LINE__ (p22) add Y1 = INCX, Y1 } ;; { .mmf (p22) STFD [Y1] = f15 (p16) LDFD f50 = [X1], INCX (p21) FMPY f11 = ALPHA, f55 } { .mmi nop __LINE__ nop __LINE__ (p22) add Y1 = INCX, Y1 } ;; { .mmf (p21) STFD [Y1] = f6 (p16) LDFD f56 = [X1], INCX (p21) FMPY f12 = ALPHA, f61 } { .mmi nop __LINE__ nop __LINE__ (p21) add Y1 = INCX, Y1 } ;; { .mmf (p16) lfetch.excl.nt1 [PRE1], INCX8 (p21) STFD [Y1] = f7 (p21) FMPY f13 = ALPHA, f67 } { .mmi (p16) LDFD f62 = [X1], INCX nop __LINE__ (p21) add Y1 = INCX, Y1 } ;; { .mmf (p21) STFD [Y1] = f10 (p16) LDFD f68 = [X1], INCX (p21) FMPY f14 = ALPHA, f73 } { .mmi nop __LINE__ nop __LINE__ (p21) add Y1 = INCX, Y1 } ;; { .mmf (p21) STFD [Y1] = f11 (p16) LDFD f74 = [X1], INCX (p21) FMPY f15 = ALPHA, f79 } { .mmi nop __LINE__ nop __LINE__ (p21) add Y1 = INCX, Y1 } ;; { .mmf (p21) STFD [Y1] = f12 (p16) LDFD f80 = [X1], INCX (p21) FMPY f6 = ALPHA, f85 } { .mmi nop __LINE__ nop __LINE__ (p21) add Y1 = INCX, Y1 } ;; { .mmf (p21) STFD [Y1] = f13 (p16) LDFD f86 = [X1], INCX (p21) FMPY f7 = ALPHA, f91 } { .mmi nop __LINE__ nop __LINE__ (p21) add Y1 = INCX, Y1 } ;; { .mmf (p21) STFD [Y1] = f14 (p16) LDFD f92 = [X1], INCX (p21) FMPY f10 = ALPHA, f97 } { .mmi nop __LINE__ nop __LINE__ (p21) add Y1 = INCX, Y1 } ;; { .mmf (p21) STFD [Y1] = f15 (p16) LDFD f98 = [X1], INCX (p21) FMPY f11 = ALPHA, f103 } { .mmi nop __LINE__ nop __LINE__ (p21) add Y1 = INCX, Y1 } ;; { .mmf (p21) STFD [Y1] = f6 (p16) LDFD f104 = [X1], INCX (p21) FMPY f12 = ALPHA, f109 } { .mmi nop __LINE__ nop __LINE__ (p21) add Y1 = INCX, Y1 } ;; { .mmf (p21) STFD [Y1] = f7 (p16) LDFD f110 = [X1], INCX (p21) FMPY f13 = ALPHA, f115 } { .mmi nop __LINE__ nop __LINE__ (p21) add Y1 = INCX, Y1 } ;; { .mmf (p21) STFD [Y1] = f10 (p16) LDFD f116 = [X1], INCX (p21) FMPY f14 = ALPHA, f121 } { .mmi nop __LINE__ nop __LINE__ (p21) add Y1 = INCX, Y1 } ;; { .mmf (p21) STFD [Y1] = f11 (p16) LDFD f122 = [X1], INCX (p21) FMPY f15 = ALPHA, f127 } { .mmb nop __LINE__ (p21) add Y1 = INCX, Y1 br.ctop.sptk.few .L310 } ;; { .mmi STFD [Y1] = f12 add Y1 = INCX, Y1 shladd Y2 = INCX, 2, X1 } ;; { .mmi STFD [Y1] = f13 add Y1 = INCX, Y1 shladd X2 = INCX, 2, X1 } ;; { .mmi STFD [Y1] = f14 nop __LINE__ add Y1 = INCX, Y1 } ;; { .mmi STFD [Y1] = f15 nop __LINE__ add Y1 = INCX, Y1 } ;; .align 16 .L320: { .mmi (p12) LDFD f48 = [X1], INCX (p12) LDFD f52 = [X2], INCX mov ar.lc = ARLC } ;; { .mmi (p12) LDFD f49 = [X1], INCX (p12) LDFD f53 = [X2], INCX mov pr = PR, -65474 } { .mmb nop __LINE__ nop __LINE__ (p9) 
br.ret.sptk.many b0 } ;; { .mmi (p12) LDFD f50 = [X1], INCX (p12) LDFD f54 = [X2], INCX tbit.z p0, p13 = N, 2 } ;; { .mmi (p12) LDFD f51 = [X1], INCX5 (p12) LDFD f55 = [X2], INCX5 tbit.z p0, p14 = N, 1 } ;; (p13) LDFD f56 = [X1], INCX tbit.z p0, p15 = N, 0 ;; (p13) LDFD f57 = [X1], INCX ;; { .mmf (p13) LDFD f58 = [X1], INCX nop __LINE__ (p12) FMPY f48 = ALPHA, f48 } { .mmf nop __LINE__ nop __LINE__ (p12) FMPY f52 = ALPHA, f52 } ;; { .mmf (p13) LDFD f59 = [X1], INCX nop __LINE__ (p12) FMPY f49 = ALPHA, f49 } { .mmf nop __LINE__ nop __LINE__ (p12) FMPY f53 = ALPHA, f53 } ;; { .mmf (p14) LDFD f60 = [X1], INCX nop __LINE__ (p12) FMPY f50 = ALPHA, f50 } { .mmf nop __LINE__ nop __LINE__ (p12) FMPY f54 = ALPHA, f54 } ;; { .mmf (p14) LDFD f61 = [X1], INCX nop __LINE__ (p12) FMPY f51 = ALPHA, f51 } { .mmf nop __LINE__ nop __LINE__ (p12) FMPY f55 = ALPHA, f55 } ;; { .mmf (p12) STFD [Y1] = f48 (p12) STFD [Y2] = f52 (p13) FMPY f56 = ALPHA, f56 } { .mmi (p15) LDFD f62 = [X1] (p12) add Y1 = INCX, Y1 (p12) add Y2 = INCX, Y2 } ;; { .mmf (p12) STFD [Y1] = f49 (p12) STFD [Y2] = f53 (p13) FMPY f57 = ALPHA, f57 } { .mmi (p12) add Y1 = INCX, Y1 (p12) add Y2 = INCX, Y2 nop __LINE__ } ;; { .mmf (p12) STFD [Y1] = f50 (p12) STFD [Y2] = f54 (p13) FMPY f58 = ALPHA, f58 } { .mmi (p12) add Y1 = INCX, Y1 (p12) add Y2 = INCX, Y2 nop __LINE__ } ;; { .mmf (p12) STFD [Y1] = f51 (p12) STFD [Y2] = f55 (p13) FMPY f59 = ALPHA, f59 } { .mmi (p12) add Y1 = INCX5, Y1 (p12) add Y2 = INCX5, Y2 nop __LINE__ } ;; { .mfi (p13) STFD [Y1] = f56 (p14) FMPY f60 = ALPHA, f60 (p13) add Y1 = INCX, Y1 } ;; { .mfi (p13) STFD [Y1] = f57 (p14) FMPY f61 = ALPHA, f61 (p13) add Y1 = INCX, Y1 } ;; { .mfi (p13) STFD [Y1] = f58 (p15) FMPY f62 = ALPHA, f62 (p13) add Y1 = INCX, Y1 } ;; { .mmi (p13) STFD [Y1] = f59 nop __LINE__ (p13) add Y1 = INCX, Y1 } ;; { .mmi (p14) STFD [Y1] = f60 nop __LINE__ (p14) add Y1 = INCX, Y1 } ;; { .mmi (p14) STFD [Y1] = f61 nop __LINE__ (p14) add Y1 = INCX, Y1 } ;; { .mib (p15) STFD [Y1] = f62 mov pr = PR, -65474 br.ret.sptk.many b0 } EPILOGUE OpenBLAS-0.2.20/kernel/ia64/rot.S000066400000000000000000000370711313527062700161270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef XDOUBLE #define PREFETCH_SIZE ( 8 * 8 + 4) #elif defined(DOUBLE) #define PREFETCH_SIZE (16 * 8 + 8) #else #define PREFETCH_SIZE (32 * 8 + 16) #endif #define N r32 #define X1 r33 #define INCX r34 #define Y1 r35 #define INCY r36 #define PREX r2 #define PREY r3 #define I r14 #define J r15 #define Y2 r16 #define X2 r17 #define INCX16 r18 #define INCY16 r19 #define PR r30 #define ARLC r31 #define C f8 #define S f9 PROLOGUE .prologue PROFCODE { .mmi adds r29 = 16, r12 shladd INCX = INCX, BASE_SHIFT, r0 .save ar.lc, ARLC mov ARLC = ar.lc } { .mib cmp.lt p0, p6 = r0, N shr I = N, 4 (p6) br.ret.spnt.many b0 } .body ;; { .mmi #ifdef XDOUBLE LDFD S = [r29] #else nop __LINE__ #endif shladd INCY = INCY, BASE_SHIFT, r0 mov PR = pr } { .mmi mov X2 = X1 mov Y2 = Y1 mov pr.rot= 0 } ;; { .mmi #ifndef XDOUBLE shladd INCX16 = INCX, 4, r0 shladd INCY16 = INCY, 4, r0 #else shladd INCX16 = INCX, 3, r0 shladd INCY16 = INCY, 3, r0 #endif mov ar.ec= 3 } { .mmi adds I = -1, I cmp.eq p16, p0 = r0, r0 and J = 15, N } ;; { .mmi adds PREX = PREFETCH_SIZE * SIZE, X1 adds PREY = PREFETCH_SIZE * SIZE, Y1 mov ar.lc = I } { .mib cmp.eq p6 ,p0 = -1, I tbit.z p0, p12 = N, 3 (p6) br.cond.dpnt .L15 } ;; .align 32 .L12: { .mmf (p18) STFD [X2] = f6 (p16) lfetch.excl.nt1 [PREY], INCY16 (p18) FMA f12 = C, f40, f12 } { .mmf (p17) LDFD f120 = [Y1], INCY (p18) add X2 = X2, INCX (p18) FMPY f6 = S, f94 } ;; { .mmf (p18) STFD [Y2] = f7 (p16) lfetch.excl.nt1 [PREX], INCX16 (p18) FNMA f13 = S, f40, f13 } { .mmf (p16) LDFD f32 = [X1], INCX (p18) add Y2 = Y2, INCY (p18) FMPY f7 = C, f94 } ;; { .mmf (p18) STFD [X2] = f10 (p17) LDFD f123 = [Y1], INCY (p18) FMA f14 = C, f43, f14 } { .mmf (p18) add X2 = X2, INCX nop __LINE__ (p18) FMPY f10 = S, f97 } ;; { .mmf (p18) STFD [Y2] = f11 (p16) LDFD f35 = [X1], INCX (p18) FNMA f15 = S, f43, f15 } { .mmf (p18) add Y2 = Y2, INCY nop __LINE__ (p18) FMPY f11 = C, f97 } ;; { .mmf (p18) STFD [X2] = f12 (p17) LDFD f126 = [Y1], INCY (p18) FMPY f12 = S, f100 } { .mmf (p18) add X2 = X2, INCX nop __LINE__ (p18) FMA f6 = C, f46, f6 } ;; { .mmf (p18) STFD [Y2] = f13 (p16) LDFD f38 = [X1], INCX (p18) FMPY f13 = C, f100 } { .mmf (p18) add Y2 = Y2, INCY nop __LINE__ (p18) FNMA f7 = S, f46, f7 } ;; { .mmf (p18) STFD [X2] = f14 (p16) LDFD f80 = [Y1], INCY (p18) FMPY f14 = S, f103 } { .mmf (p18) add X2 = X2, INCX nop __LINE__ (p18) FMA f10 = C, f49, f10 } ;; { .mmf (p18) STFD [Y2] = f15 (p16) LDFD f41 = [X1], INCX (p18) FMPY f15 = C, f103 } { .mmf (p18) add Y2 = Y2, INCY nop __LINE__ (p18) FNMA f11 = S, f49, f11 } ;; { .mmf (p18) STFD [X2] = f6 (p16) LDFD f83 = [Y1], INCY (p18) FMA 
f12 = C, f52, f12 } { .mmf (p18) add X2 = X2, INCX nop __LINE__ (p18) FMPY f6 = S, f106 } ;; { .mmf (p18) STFD [Y2] = f7 (p16) LDFD f44 = [X1], INCX (p18) FNMA f13 = S, f52, f13 } { .mmf (p18) add Y2 = Y2, INCY nop __LINE__ (p18) FMPY f7 = C, f106 } ;; { .mmf (p18) STFD [X2] = f10 (p16) LDFD f86 = [Y1], INCY (p18) FMA f14 = C, f55, f14 } { .mmf (p18) add X2 = X2, INCX nop __LINE__ (p18) FMPY f10 = S, f109 } ;; { .mmf (p18) STFD [Y2] = f11 (p16) LDFD f47 = [X1], INCX (p18) FNMA f15 = S, f55, f15 } { .mmf (p18) add Y2 = Y2, INCY nop __LINE__ (p18) FMPY f11 = C, f109 } ;; { .mmf (p18) STFD [X2] = f12 (p16) LDFD f89 = [Y1], INCY (p18) FMPY f12 = S, f112 } { .mmf (p18) add X2 = X2, INCX nop __LINE__ (p18) FMA f6 = C, f58, f6 } ;; { .mmf (p18) STFD [Y2] = f13 (p16) LDFD f50 = [X1], INCX (p18) FMPY f13 = C, f112 } { .mmf (p18) add Y2 = Y2, INCY nop __LINE__ (p18) FNMA f7 = S, f58, f7 } ;; { .mmf (p18) STFD [X2] = f14 (p16) LDFD f92 = [Y1], INCY (p18) FMPY f14 = S, f115 } { .mmf (p18) add X2 = X2, INCX nop __LINE__ (p18) FMA f10 = C, f61, f10 } ;; { .mmf (p18) STFD [Y2] = f15 (p16) LDFD f53 = [X1], INCX (p18) FMPY f15 = C, f115 } { .mmf (p18) add Y2 = Y2, INCY nop __LINE__ (p18) FNMA f11 = S, f61, f11 } ;; #ifndef XDOUBLE { .mmf (p18) STFD [X2] = f6 (p16) LDFD f95 = [Y1], INCY (p18) FMA f12 = C, f64, f12 } { .mmf (p18) add X2 = X2, INCX nop __LINE__ (p18) FMPY f6 = S, f118 } ;; { .mmf (p18) STFD [Y2] = f7 (p16) LDFD f56 = [X1], INCX (p18) FNMA f13 = S, f64, f13 } { .mmf (p18) add Y2 = Y2, INCY nop __LINE__ (p18) FMPY f7 = C, f118 } ;; #else { .mmf (p18) STFD [X2] = f6 (p16) lfetch.excl.nt1 [PREY], INCY16 (p18) FMA f12 = C, f64, f12 } { .mmf (p16) LDFD f95 = [Y1], INCY (p18) add X2 = X2, INCX (p18) FMPY f6 = S, f118 } ;; { .mmf (p18) STFD [Y2] = f7 (p16) lfetch.excl.nt1 [PREX], INCX16 (p18) FNMA f13 = S, f64, f13 } { .mmf (p16) LDFD f56 = [X1], INCX (p18) add Y2 = Y2, INCY (p18) FMPY f7 = C, f118 } ;; #endif { .mmf (p18) STFD [X2] = f10 (p16) LDFD f98 = [Y1], INCY (p18) FMA f14 = C, f67, f14 } { .mmf (p18) add X2 = X2, INCX nop __LINE__ (p18) FMPY f10 = S, f121 } ;; { .mmf (p18) STFD [Y2] = f11 (p16) LDFD f59 = [X1], INCX (p18) FNMA f15 = S, f67, f15 } { .mmf (p18) add Y2 = Y2, INCY nop __LINE__ (p18) FMPY f11 = C, f121 } ;; { .mmf (p18) STFD [X2] = f12 (p16) LDFD f101 = [Y1], INCY (p18) FMPY f12 = S, f124 } { .mmf (p18) add X2 = X2, INCX nop __LINE__ (p18) FMA f6 = C, f70, f6 } ;; { .mmf (p18) STFD [Y2] = f13 (p16) LDFD f62 = [X1], INCX (p18) FMPY f13 = C, f124 } { .mmf (p18) add Y2 = Y2, INCY nop __LINE__ (p18) FNMA f7 = S, f70, f7 } ;; { .mmf (p18) STFD [X2] = f14 (p16) LDFD f104 = [Y1], INCY (p18) FMPY f14 = S, f127 } { .mmf (p18) add X2 = X2, INCX nop __LINE__ (p18) FMA f10 = C, f73, f10 } ;; { .mmf (p18) STFD [Y2] = f15 (p16) LDFD f65 = [X1], INCX (p18) FMPY f15 = C, f127 } { .mmf (p18) add Y2 = Y2, INCY nop __LINE__ (p18) FNMA f11 = S, f73, f11 } ;; { .mmf (p18) STFD [X2] = f6 (p16) LDFD f107 = [Y1], INCY (p18) FMA f12 = C, f76, f12 } { .mmf (p18) add X2 = X2, INCX nop __LINE__ (p17) FMPY f6 = S, f81 } ;; { .mmf (p18) STFD [Y2] = f7 (p16) LDFD f68 = [X1], INCX (p18) FNMA f13 = S, f76, f13 } { .mmf (p18) add Y2 = Y2, INCY nop __LINE__ (p17) FMPY f7 = C, f81 } ;; { .mmf (p18) STFD [X2] = f10 (p16) LDFD f110 = [Y1], INCY (p18) FMA f14 = C, f79, f14 } { .mmf (p18) add X2 = X2, INCX nop __LINE__ (p17) FMPY f10 = S, f84 } ;; { .mmf (p18) STFD [Y2] = f11 (p16) LDFD f71 = [X1], INCX (p18) FNMA f15 = S, f79, f15 } { .mmf (p18) add Y2 = Y2, INCY nop __LINE__ (p17) FMPY f11 = C, f84 } ;; { .mmf (p18) 
STFD [X2] = f12 (p16) LDFD f113 = [Y1], INCY (p17) FMPY f12 = S, f87 } { .mmf (p18) add X2 = X2, INCX nop __LINE__ (p17) FMA f6 = C, f33, f6 } ;; { .mmf (p18) STFD [Y2] = f13 (p16) LDFD f74 = [X1], INCX (p17) FMPY f13 = C, f87 } { .mmf (p18) add Y2 = Y2, INCY nop __LINE__ (p17) FNMA f7 = S, f33, f7 } ;; { .mmf (p18) STFD [X2] = f14 (p16) LDFD f116 = [Y1], INCY (p17) FMPY f14 = S, f90 } { .mmf (p18) add X2 = X2, INCX nop __LINE__ (p17) FMA f10 = C, f36, f10 } ;; { .mmf (p18) STFD [Y2] = f15 (p16) LDFD f77 = [X1], INCX (p17) FMPY f15 = C, f90 } { .mfb (p18) add Y2 = Y2, INCY (p17) FNMA f11 = S, f36, f11 br.ctop.sptk.few .L12 } ;; .align 32 .L15: { .mmi (p12) LDFD f40 = [Y1], INCY (p12) LDFD f32 = [X1], INCX mov ar.lc = ARLC } ;; { .mmi (p12) LDFD f41 = [Y1], INCY (p12) LDFD f33 = [X1], INCX mov pr = PR, -65474 } ;; { .mmb (p12) LDFD f42 = [Y1], INCY cmp.eq p7, p0 = r0, J (p7) br.ret.sptk.many b0 } ;; { .mmf (p12) LDFD f43 = [Y1], INCY nop __LINE__ (p12) FMPY f6 = S, f40 } ;; { .mmf (p12) LDFD f34 = [X1], INCX nop __LINE__ (p12) FMPY f7 = C, f40 } ;; { .mmf (p12) LDFD f44 = [Y1], INCY nop __LINE__ (p12) FMPY f10 = S, f41 } ;; { .mmf (p12) LDFD f35 = [X1], INCX nop __LINE__ (p12) FMPY f11 = C, f41 } ;; { .mmf (p12) LDFD f45 = [Y1], INCY nop __LINE__ (p12) FMPY f12 = S, f42 } { .mmf nop __LINE__ nop __LINE__ (p12) FMA f6 = C, f32, f6 } ;; { .mmf (p12) LDFD f36 = [X1], INCX nop __LINE__ (p12) FMPY f13 = C, f42 } { .mmf nop __LINE__ nop __LINE__ (p12) FNMA f7 = S, f32, f7 } ;; { .mmf (p12) LDFD f46 = [Y1], INCY nop __LINE__ (p12) FMPY f14 = S, f43 } { .mmf nop __LINE__ nop __LINE__ (p12) FMA f10 = C, f33, f10 } ;; { .mmf (p12) LDFD f37 = [X1], INCX nop __LINE__ (p12) FMPY f15 = C, f43 } { .mmf nop __LINE__ nop __LINE__ (p12) FNMA f11 = S, f33, f11 } ;; { .mmf (p12) STFD [X2] = f6 (p12) LDFD f47 = [Y1], INCY (p12) FMA f12 = C, f34, f12 } { .mfi (p12) add X2 = X2, INCX (p12) FMPY f6 = S, f44 tbit.z p0, p13 = N, 2 } ;; { .mmf (p12) STFD [Y2] = f7 (p12) LDFD f38 = [X1], INCX (p12) FNMA f13 = S, f34, f13 } { .mmf (p12) add Y2 = Y2, INCY nop __LINE__ (p12) FMPY f7 = C, f44 } ;; { .mmf (p12) STFD [X2] = f10 (p13) LDFD f52 = [Y1], INCY (p12) FMA f14 = C, f35, f14 } { .mmf (p12) add X2 = X2, INCX nop __LINE__ (p12) FMPY f10 = S, f45 } ;; { .mmf (p12) STFD [Y2] = f11 (p12) LDFD f39 = [X1], INCX (p12) FNMA f15 = S, f35, f15 } { .mmf (p12) add Y2 = Y2, INCY nop __LINE__ (p12) FMPY f11 = C, f45 } ;; { .mmf (p12) STFD [X2] = f12 (p13) LDFD f53 = [Y1], INCY (p12) FMPY f12 = S, f46 } { .mmf (p12) add X2 = X2, INCX nop __LINE__ (p12) FMA f6 = C, f36, f6 } ;; { .mmf (p12) STFD [Y2] = f13 (p13) LDFD f48 = [X1], INCX (p12) FMPY f13 = C, f46 } { .mmf (p12) add Y2 = Y2, INCY nop __LINE__ (p12) FNMA f7 = S, f36, f7 } ;; { .mmf (p12) STFD [X2] = f14 (p13) LDFD f54 = [Y1], INCY (p12) FMPY f14 = S, f47 } { .mmf (p12) add X2 = X2, INCX nop __LINE__ (p12) FMA f10 = C, f37, f10 } ;; { .mmf (p12) STFD [Y2] = f15 (p13) LDFD f49 = [X1], INCX (p12) FMPY f15 = C, f47 } { .mfi (p12) add Y2 = Y2, INCY (p12) FNMA f11 = S, f37, f11 tbit.z p0, p14 = N, 1 } ;; { .mmf (p12) STFD [X2] = f6 (p13) LDFD f55 = [Y1], INCY (p12) FMA f12 = C, f38, f12 } { .mmf (p12) add X2 = X2, INCX nop __LINE__ (p13) FMPY f6 = S, f52 } ;; { .mmf (p12) STFD [Y2] = f7 (p13) LDFD f50 = [X1], INCX (p12) FNMA f13 = S, f38, f13 } { .mmf (p12) add Y2 = Y2, INCY nop __LINE__ (p13) FMPY f7 = C, f52 } ;; { .mmf (p12) STFD [X2] = f10 (p14) LDFD f58 = [Y1], INCY (p12) FMA f14 = C, f39, f14 } { .mmf (p12) add X2 = X2, INCX nop __LINE__ (p13) FMPY f10 = S, f53 } ;; { .mmf 
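// Rotation update used throughout this kernel: for each element pair, x' = C*x + S*y and
// y' = C*y - S*x (an FMPY seeds the S*y or C*y product, then FMA/FNMA fold in the x term),
// with the results stored back through the X2 and Y2 pointers.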
(p12) STFD [Y2] = f11 (p13) LDFD f51 = [X1], INCX (p12) FNMA f15 = S, f39, f15 } { .mfi (p12) add Y2 = Y2, INCY (p13) FMPY f11 = C, f53 tbit.z p0, p15 = N, 0 } ;; { .mmf (p12) STFD [X2] = f12 (p14) LDFD f59 = [Y1], INCY (p13) FMPY f12 = S, f54 } { .mmf (p12) add X2 = X2, INCX nop __LINE__ (p13) FMA f6 = C, f48, f6 } ;; { .mmf (p12) STFD [Y2] = f13 (p14) LDFD f56 = [X1], INCX (p13) FMPY f13 = C, f54 } { .mmf (p12) add Y2 = Y2, INCY nop __LINE__ (p13) FNMA f7 = S, f48, f7 } ;; { .mmf (p12) STFD [X2] = f14 (p15) LDFD f61 = [Y1], INCY (p13) FMPY f14 = S, f55 } { .mmf (p12) add X2 = X2, INCX nop __LINE__ (p13) FMA f10 = C, f49, f10 } ;; { .mmf (p12) STFD [Y2] = f15 (p14) LDFD f57 = [X1], INCX (p13) FMPY f15 = C, f55 } { .mmf (p12) add Y2 = Y2, INCY nop __LINE__ (p13) FNMA f11 = S, f49, f11 } ;; { .mmf (p13) STFD [X2] = f6 nop __LINE__ (p13) FMA f12 = C, f50, f12 } { .mmf (p13) add X2 = X2, INCX nop __LINE__ (p14) FMPY f6 = S, f58 } ;; { .mmf (p13) STFD [Y2] = f7 (p15) LDFD f60 = [X1], INCX (p13) FNMA f13 = S, f50, f13 } { .mmf (p13) add Y2 = Y2, INCY nop __LINE__ (p14) FMPY f7 = C, f58 } ;; { .mmf (p13) STFD [X2] = f10 nop __LINE__ (p13) FMA f14 = C, f51, f14 } { .mmf (p13) add X2 = X2, INCX nop __LINE__ (p14) FMPY f10 = S, f59 } ;; { .mmf (p13) STFD [Y2] = f11 nop __LINE__ (p13) FNMA f15 = S, f51, f15 } { .mmf (p13) add Y2 = Y2, INCY nop __LINE__ (p14) FMPY f11 = C, f59 } ;; { .mmf (p13) STFD [X2] = f12 nop __LINE__ (p14) FMA f6 = C, f56, f6 } { .mmf (p13) add X2 = X2, INCX nop __LINE__ (p15) FMPY f12 = S, f61 } ;; { .mmf (p13) STFD [Y2] = f13 nop __LINE__ (p14) FNMA f7 = S, f56, f7 } { .mmf (p13) add Y2 = Y2, INCY nop __LINE__ (p15) FMPY f13 = C, f61 } ;; { .mmf (p13) STFD [X2] = f14 (p13) add X2 = X2, INCX (p14) FMA f10 = C, f57, f10 } ;; { .mmf (p13) STFD [Y2] = f15 (p13) add Y2 = Y2, INCY (p14) FNMA f11 = S, f57, f11 } ;; { .mmf (p14) STFD [X2] = f6 (p14) add X2 = X2, INCX (p15) FMA f12 = C, f60, f12 } ;; { .mmf (p14) STFD [Y2] = f7 (p14) add Y2 = Y2, INCY (p15) FNMA f13 = S, f60, f13 } ;; { .mmi (p14) STFD [X2] = f10 (p14) add X2 = X2, INCX nop __LINE__ } ;; { .mmi (p14) STFD [Y2] = f11 (p14) add Y2 = Y2, INCY nop __LINE__ } ;; { .mmi (p15) STFD [X2] = f12 (p15) add X2 = X2, INCX nop __LINE__ } ;; { .mmb (p15) STFD [Y2] = f13 (p15) add Y2 = Y2, INCY br.ret.sptk.many b0 } ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/saxpy.S000066400000000000000000000752671313527062700165000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCHSIZE 64 * 8 #define N r32 #define X r36 #define INCX r37 #define Y r38 #define INCY r39 #define PRE1 r2 #define PRE2 r3 #define I r14 #define J r15 #define Y1 r16 #define Y2 r17 #define X1 r18 #define X2 r19 #define INCX16 r20 #define INCY16 r21 #define YYY r25 #define YY r27 #define XA r28 #define XB r29 #define PR r30 #define ARLC r31 #define ALPHA f8 #define ALPHA_P f9 PROLOGUE .prologue PROFCODE { .mii shladd INCX = INCX, BASE_SHIFT, r0 .save ar.lc, ARLC mov ARLC = ar.lc tbit.nz p10, p0 = X, BASE_SHIFT } { .mfb cmp.lt p0, p6 = r0, N fcmp.eq p7, p0 = ALPHA, f0 (p6) br.ret.sptk.many b0 } ;; .body { .mmi (p10) LDFD f32 = [X], INCX shladd INCY = INCY, BASE_SHIFT, r0 mov PR = pr } { .mib (p10) adds N = -1, N mov YYY = Y (p7) br.ret.sptk.many b0 } ;; { .mmi (p10) LDFD f33 = [Y], INCY cmp.ne p13, p0 = SIZE, INCX shr XA = X, 2 } { .mmi shladd INCX16 = INCX, 4, r0 shladd INCY16 = INCY, 4, r0 nop.i 0 } ;; { .mii mov Y1 = Y tbit.nz p11, p0 = Y, BASE_SHIFT shr XB = Y, 2 } ;; { .mmf and XA = 0x3f, XA and XB = 0x3f, XB (p10) FMA f32 = ALPHA, f32, f33 } ;; { .mmi sub XA = XB, XA shladd Y2 = INCY, 2, Y mov pr.rot = 0x10000 } { .mbb cmp.ne p14, p0 = SIZE, INCY (p13) br.cond.dpnt .L100 (p14) br.cond.dpnt .L100 } ;; { .mmi cmp.gt p14, p0 = r0, XA ;; and J = 15, N shr I = N, 4 } { .mfb (p14) adds XA = 64, XA fpack ALPHA_P = f8, f8 (p11) br.cond.dpnt .L30 } ;; { .mmi cmp.gt p14, p0 = 32, XA cmp.lt p15, p0 = 58, XA mov ar.ec = 3 } { .mmi and J = 31, N cmp.eq p16, p0 = r0, r0 shr I = N, 5 } ;; { .mmi cmp.eq p9, p0 = r0, J cmp.eq p7 ,p0 = 0, I adds I = -1, I } { .mbb nop.m 0 (p14) br.cond.dpnt .L20 (p15) br.cond.dpnt .L20 } ;; { .mmi (p10) STFD [YYY] = f32 adds PRE1 = PREFETCHSIZE * SIZE, X mov ar.lc = I } { .mib adds PRE2 = (PREFETCHSIZE - 24) * SIZE, Y tbit.z p0, p11 = N, 4 (p7) br.cond.dpnt .L15 } ;; .align 32 .L12: /* 0 */ { .mmf (p18) stf8 [Y1] = f6, 2 * SIZE (p16) lfetch.nt1 [PRE1], 32 * SIZE (p18) fpma f12 = ALPHA_P, f46, f94 } { .mmi (p16) ldf8 f32 = [X], 2 * SIZE (p16) ldf8 f80 = [Y], 2 * SIZE } ;; /* 1 */ { .mmf (p18) stf8 [Y1] = f7, 2 * SIZE (p16) lfetch.excl.nt1 [PRE2], 32 * SIZE (p18) fpma f13 = ALPHA_P, f49, f97 } { .mmi (p16) ldf8 f35 = [X], 2 * SIZE (p16) ldf8 f83 = [Y], 2 * SIZE } ;; /* 2 */ { .mmf (p18) stf8 [Y1] = f10, 2 * SIZE (p18) fpma f14 = ALPHA_P, f52, f100 } { .mmi (p16) ldf8 f38 = [X], 2 * SIZE (p16) ldf8 f86 = [Y], 2 * SIZE } ;; /* 3 */ { .mmf (p18) stf8 [Y1] = f11, 2 * SIZE (p18) fpma f15 = ALPHA_P, f55, f103 } { .mmi (p16) ldf8 f41 = [X], 2 * SIZE (p16) ldf8 f89 = [Y], 2 * SIZE } ;; /* 4 */ { .mmf (p18) stf8 [Y1] = f12, 2 * SIZE 
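// ALPHA_P was built above with fpack so both single-precision halves hold ALPHA; each
// fpma therefore updates a packed pair at once, y(2i:2i+1) += alpha * x(2i:2i+1), with
// ldf8/stf8 moving two elements per access.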
(p18) fpma f6 = ALPHA_P, f58, f106 } { .mmi (p16) ldf8 f44 = [X], 2 * SIZE (p16) ldf8 f92 = [Y], 2 * SIZE } ;; /* 5 */ { .mmf (p18) stf8 [Y1] = f13, 2 * SIZE (p18) fpma f7 = ALPHA_P, f61, f109 } { .mmi (p16) ldf8 f47 = [X], 2 * SIZE (p16) ldf8 f95 = [Y], 2 * SIZE } ;; /* 6 */ { .mmf (p18) stf8 [Y1] = f14, 2 * SIZE (p18) fpma f10 = ALPHA_P, f64, f112 } { .mmi (p16) ldf8 f50 = [X], 2 * SIZE (p16) ldf8 f98 = [Y], 2 * SIZE } ;; /* 7 */ { .mmf (p18) stf8 [Y1] = f15, 2 * SIZE (p18) fpma f11 = ALPHA_P, f67, f115 } { .mmi (p16) ldf8 f53 = [X], 2 * SIZE (p16) ldf8 f101 = [Y], 2 * SIZE } ;; /* 8 */ { .mmf (p18) stf8 [Y1] = f6, 2 * SIZE (p18) fpma f12 = ALPHA_P, f70, f118 } { .mmi (p16) ldf8 f56 = [X], 2 * SIZE (p16) ldf8 f104 = [Y], 2 * SIZE } ;; /* 9 */ { .mmf (p18) stf8 [Y1] = f7, 2 * SIZE (p18) fpma f13 = ALPHA_P, f73, f121 } { .mmi (p16) ldf8 f59 = [X], 2 * SIZE (p16) ldf8 f107 = [Y], 2 * SIZE } ;; /* 10 */ { .mmf (p18) stf8 [Y1] = f10, 2 * SIZE (p18) fpma f14 = ALPHA_P, f76, f124 } { .mmi (p16) ldf8 f62 = [X], 2 * SIZE (p16) ldf8 f110 = [Y], 2 * SIZE } ;; /* 11 */ { .mmf (p18) stf8 [Y1] = f11, 2 * SIZE (p18) fpma f15 = ALPHA_P, f79, f127 } { .mmi (p16) ldf8 f65 = [X], 2 * SIZE (p16) ldf8 f113 = [Y], 2 * SIZE } ;; /* 12 */ { .mmf (p18) stf8 [Y1] = f12, 2 * SIZE (p17) fpma f6 = ALPHA_P, f33, f81 } { .mmi (p16) ldf8 f68 = [X], 2 * SIZE (p16) ldf8 f116 = [Y], 2 * SIZE } ;; /* 13 */ { .mmf (p18) stf8 [Y1] = f13, 2 * SIZE (p17) fpma f7 = ALPHA_P, f36, f84 } { .mmi (p16) ldf8 f71 = [X], 2 * SIZE (p16) ldf8 f119 = [Y], 2 * SIZE } ;; /* 14 */ { .mmf (p18) stf8 [Y1] = f14, 2 * SIZE (p17) fpma f10 = ALPHA_P, f39, f87 } { .mmi (p16) ldf8 f74 = [X], 2 * SIZE (p16) ldf8 f122 = [Y], 2 * SIZE } ;; /*15 */ { .mmf (p18) stf8 [Y1] = f15, 2 * SIZE (p17) fpma f11 = ALPHA_P, f42, f90 } { .mmb (p16) ldf8 f77 = [X], 2 * SIZE (p16) ldf8 f125 = [Y], 2 * SIZE br.ctop.sptk.few .L12 } ;; .align 32 .L15: { .mmi (p11) ldf8 f32 = [X], 2 * SIZE (p11) ldf8 f33 = [Y], 2 * SIZE mov pr = PR, -65474 } ;; { .mmi (p11) ldf8 f34 = [X], 2 * SIZE (p11) ldf8 f35 = [Y], 2 * SIZE mov ar.lc = ARLC } ;; { .mmb (p11) ldf8 f36 = [X], 2 * SIZE (p11) ldf8 f37 = [Y], 2 * SIZE (p9) br.ret.sptk.many b0 } ;; { .mmi (p11) ldf8 f38 = [X], 2 * SIZE (p11) ldf8 f39 = [Y], 2 * SIZE tbit.z p0, p12 = N, 3 } ;; { .mmi (p11) ldf8 f40 = [X], 2 * SIZE (p11) ldf8 f41 = [Y], 2 * SIZE tbit.z p0, p13 = N, 2 } ;; { .mmi (p11) ldf8 f42 = [X], 2 * SIZE (p11) ldf8 f43 = [Y], 2 * SIZE tbit.z p0, p14 = N, 1 } ;; { .mmf (p11) ldf8 f44 = [X], 2 * SIZE (p11) ldf8 f45 = [Y], 2 * SIZE (p11) fpma f6 = ALPHA_P, f32, f33 } ;; { .mmf (p11) ldf8 f46 = [X], 2 * SIZE (p11) ldf8 f47 = [Y], 2 * SIZE (p11) fpma f7 = ALPHA_P, f34, f35 } ;; { .mmf (p12) ldf8 f48 = [X], 2 * SIZE (p12) ldf8 f49 = [Y], 2 * SIZE (p11) fpma f10 = ALPHA_P, f36, f37 } ;; { .mmi (p11) stf8 [Y1] = f6, 2 * SIZE nop.m 0 tbit.z p0, p15 = N, 0 } { .mmf (p12) ldf8 f50 = [X], 2 * SIZE (p12) ldf8 f51 = [Y], 2 * SIZE (p11) fpma f11 = ALPHA_P, f38, f39 } ;; { .mmi (p11) stf8 [Y1] = f7, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p12) ldf8 f52 = [X], 2 * SIZE (p12) ldf8 f53 = [Y], 2 * SIZE } ;; { .mmi (p11) stf8 [Y1] = f10, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p12) ldf8 f54 = [X], 2 * SIZE (p12) ldf8 f55 = [Y], 2 * SIZE (p11) fpma f12 = ALPHA_P, f40, f41 } ;; { .mmi (p11) stf8 [Y1] = f11, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p13) ldf8 f56 = [X], 2 * SIZE (p13) ldf8 f57 = [Y], 2 * SIZE (p11) fpma f13 = ALPHA_P, f42, f43 } ;; { .mmi (p11) stf8 [Y1] = f12, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p13) ldf8 f58 = [X], 2 * SIZE (p13) ldf8 f59 = 
[Y], 2 * SIZE (p11) fpma f14 = ALPHA_P, f44, f45 } ;; { .mmi (p11) stf8 [Y1] = f13, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p14) ldf8 f60 = [X], 2 * SIZE (p14) ldf8 f61 = [Y], 2 * SIZE (p11) fpma f15 = ALPHA_P, f46, f47 } ;; { .mmi (p11) stf8 [Y1] = f14, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p15) ldfs f62 = [X] (p15) ldfs f63 = [Y] (p12) fpma f6 = ALPHA_P, f48, f49 } ;; (p12) fpma f7 = ALPHA_P, f50, f51 (p12) fpma f10 = ALPHA_P, f52, f53 ;; (p11) stf8 [Y1] = f15, 2 * SIZE (p12) fpma f11 = ALPHA_P, f54, f55 ;; (p12) stf8 [Y1] = f6, 2 * SIZE (p13) fpma f12 = ALPHA_P, f56, f57 ;; (p12) stf8 [Y1] = f7, 2 * SIZE (p13) fpma f13 = ALPHA_P, f58, f59 ;; (p12) stf8 [Y1] = f10, 2 * SIZE (p14) fpma f14 = ALPHA_P, f60, f61 ;; (p12) stf8 [Y1] = f11, 2 * SIZE (p15) FMA f15 = ALPHA, f62, f63 ;; (p13) stf8 [Y1] = f12, 2 * SIZE ;; (p13) stf8 [Y1] = f13, 2 * SIZE ;; (p14) stf8 [Y1] = f14, 2 * SIZE ;; (p15) stfs [Y1] = f15 br.ret.sptk.many b0 ;; .align 32 /* X is aligned; case 2 */ .L20: { .mmi (p10) STFD [YYY] = f32 adds PRE1 = (PREFETCHSIZE - 28) * SIZE, X mov ar.lc = I } { .mib adds PRE2 = (PREFETCHSIZE + 4) * SIZE, Y tbit.z p0, p11 = N, 4 (p7) br.cond.dpnt .L25 } ;; .align 32 .L22: /* 0 */ { .mmf (p18) stf8 [Y1] = f6, 2 * SIZE (p16) lfetch.nt1 [PRE1], 32 * SIZE (p18) fpma f12 = ALPHA_P, f46, f94 } { .mmi (p17) ldf8 f60 = [X], 2 * SIZE (p16) ldf8 f80 = [Y], 2 * SIZE } ;; /* 1 */ { .mmf (p18) stf8 [Y1] = f7, 2 * SIZE (p16) lfetch.excl.nt1 [PRE2], 32 * SIZE (p18) fpma f13 = ALPHA_P, f49, f97 } { .mmi (p17) ldf8 f63 = [X], 2 * SIZE (p16) ldf8 f83 = [Y], 2 * SIZE } ;; /* 2 */ { .mmf (p18) stf8 [Y1] = f10, 2 * SIZE (p18) fpma f14 = ALPHA_P, f52, f100 } { .mmi (p17) ldf8 f66 = [X], 2 * SIZE (p16) ldf8 f86 = [Y], 2 * SIZE } ;; /* 3 */ { .mmf (p18) stf8 [Y1] = f11, 2 * SIZE (p18) fpma f15 = ALPHA_P, f55, f103 } { .mmi (p17) ldf8 f69 = [X], 2 * SIZE (p16) ldf8 f89 = [Y], 2 * SIZE } ;; /* 4 */ { .mmf (p18) stf8 [Y1] = f12, 2 * SIZE (p18) fpma f6 = ALPHA_P, f58, f106 } { .mmi (p17) ldf8 f72 = [X], 2 * SIZE (p16) ldf8 f92 = [Y], 2 * SIZE } ;; /* 5 */ { .mmf (p18) stf8 [Y1] = f13, 2 * SIZE (p18) fpma f7 = ALPHA_P, f61, f109 } { .mmi (p17) ldf8 f75 = [X], 2 * SIZE (p16) ldf8 f95 = [Y], 2 * SIZE } ;; /* 6 */ { .mmf (p18) stf8 [Y1] = f14, 2 * SIZE (p18) fpma f10 = ALPHA_P, f64, f112 } { .mmi (p17) ldf8 f78 = [X], 2 * SIZE (p16) ldf8 f98 = [Y], 2 * SIZE } ;; /* 7 */ { .mmf (p18) stf8 [Y1] = f15, 2 * SIZE (p18) fpma f11 = ALPHA_P, f67, f115 } { .mmi (p16) ldf8 f32 = [X], 2 * SIZE (p16) ldf8 f101 = [Y], 2 * SIZE } ;; /* 8 */ { .mmf (p18) stf8 [Y1] = f6, 2 * SIZE (p18) fpma f12 = ALPHA_P, f70, f118 } { .mmi (p16) ldf8 f35 = [X], 2 * SIZE (p16) ldf8 f104 = [Y], 2 * SIZE } ;; /* 9 */ { .mmf (p18) stf8 [Y1] = f7, 2 * SIZE (p18) fpma f13 = ALPHA_P, f73, f121 } { .mmi (p16) ldf8 f38 = [X], 2 * SIZE (p16) ldf8 f107 = [Y], 2 * SIZE } ;; /* 10 */ { .mmf (p18) stf8 [Y1] = f10, 2 * SIZE (p18) fpma f14 = ALPHA_P, f76, f124 } { .mmi (p16) ldf8 f41 = [X], 2 * SIZE (p16) ldf8 f110 = [Y], 2 * SIZE } ;; /* 11 */ { .mmf (p18) stf8 [Y1] = f11, 2 * SIZE (p18) fpma f15 = ALPHA_P, f79, f127 } { .mmi (p16) ldf8 f44 = [X], 2 * SIZE (p16) ldf8 f113 = [Y], 2 * SIZE } ;; /* 12 */ { .mmf (p18) stf8 [Y1] = f12, 2 * SIZE (p17) fpma f6 = ALPHA_P, f33, f81 } { .mmi (p16) ldf8 f47 = [X], 2 * SIZE (p16) ldf8 f116 = [Y], 2 * SIZE } ;; /* 13 */ { .mmf (p18) stf8 [Y1] = f13, 2 * SIZE (p17) fpma f7 = ALPHA_P, f36, f84 } { .mmi (p16) ldf8 f50 = [X], 2 * SIZE (p16) ldf8 f119 = [Y], 2 * SIZE } ;; /* 14 */ { .mmf (p18) stf8 [Y1] = f14, 2 * SIZE (p17) fpma f10 = ALPHA_P, 
f39, f87 } { .mmi (p16) ldf8 f53 = [X], 2 * SIZE (p16) ldf8 f122 = [Y], 2 * SIZE } ;; /*15 */ { .mmf (p18) stf8 [Y1] = f15, 2 * SIZE (p17) fpma f11 = ALPHA_P, f42, f90 } { .mmb (p16) ldf8 f56 = [X], 2 * SIZE (p16) ldf8 f125 = [Y], 2 * SIZE br.ctop.sptk.few .L22 } ;; .align 32 .L25: { .mmi (p11) ldf8 f32 = [X], 2 * SIZE (p11) ldf8 f33 = [Y], 2 * SIZE mov pr = PR, -65474 } ;; { .mmi (p11) ldf8 f34 = [X], 2 * SIZE (p11) ldf8 f35 = [Y], 2 * SIZE mov ar.lc = ARLC } ;; { .mmb (p11) ldf8 f36 = [X], 2 * SIZE (p11) ldf8 f37 = [Y], 2 * SIZE (p9) br.ret.sptk.many b0 } ;; { .mmi (p11) ldf8 f38 = [X], 2 * SIZE (p11) ldf8 f39 = [Y], 2 * SIZE tbit.z p0, p12 = N, 3 } ;; { .mmi (p11) ldf8 f40 = [X], 2 * SIZE (p11) ldf8 f41 = [Y], 2 * SIZE tbit.z p0, p13 = N, 2 } ;; { .mmi (p11) ldf8 f42 = [X], 2 * SIZE (p11) ldf8 f43 = [Y], 2 * SIZE tbit.z p0, p14 = N, 1 } ;; { .mmf (p11) ldf8 f44 = [X], 2 * SIZE (p11) ldf8 f45 = [Y], 2 * SIZE (p11) fpma f6 = ALPHA_P, f32, f33 } ;; { .mmf (p11) ldf8 f46 = [X], 2 * SIZE (p11) ldf8 f47 = [Y], 2 * SIZE (p11) fpma f7 = ALPHA_P, f34, f35 } ;; { .mmf (p12) ldf8 f48 = [X], 2 * SIZE (p12) ldf8 f49 = [Y], 2 * SIZE (p11) fpma f10 = ALPHA_P, f36, f37 } ;; { .mmi (p11) stf8 [Y1] = f6, 2 * SIZE nop.m 0 tbit.z p0, p15 = N, 0 } { .mmf (p12) ldf8 f50 = [X], 2 * SIZE (p12) ldf8 f51 = [Y], 2 * SIZE (p11) fpma f11 = ALPHA_P, f38, f39 } ;; { .mmi (p11) stf8 [Y1] = f7, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p12) ldf8 f52 = [X], 2 * SIZE (p12) ldf8 f53 = [Y], 2 * SIZE } ;; { .mmi (p11) stf8 [Y1] = f10, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p12) ldf8 f54 = [X], 2 * SIZE (p12) ldf8 f55 = [Y], 2 * SIZE (p11) fpma f12 = ALPHA_P, f40, f41 } ;; { .mmi (p11) stf8 [Y1] = f11, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p13) ldf8 f56 = [X], 2 * SIZE (p13) ldf8 f57 = [Y], 2 * SIZE (p11) fpma f13 = ALPHA_P, f42, f43 } ;; { .mmi (p11) stf8 [Y1] = f12, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p13) ldf8 f58 = [X], 2 * SIZE (p13) ldf8 f59 = [Y], 2 * SIZE (p11) fpma f14 = ALPHA_P, f44, f45 } ;; { .mmi (p11) stf8 [Y1] = f13, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p14) ldf8 f60 = [X], 2 * SIZE (p14) ldf8 f61 = [Y], 2 * SIZE (p11) fpma f15 = ALPHA_P, f46, f47 } ;; { .mmi (p11) stf8 [Y1] = f14, 2 * SIZE nop.m 0 nop.i 0 } { .mmf (p15) ldfs f62 = [X] (p15) ldfs f63 = [Y] (p12) fpma f6 = ALPHA_P, f48, f49 } ;; (p12) fpma f7 = ALPHA_P, f50, f51 (p12) fpma f10 = ALPHA_P, f52, f53 ;; (p11) stf8 [Y1] = f15, 2 * SIZE (p12) fpma f11 = ALPHA_P, f54, f55 ;; (p12) stf8 [Y1] = f6, 2 * SIZE (p13) fpma f12 = ALPHA_P, f56, f57 ;; (p12) stf8 [Y1] = f7, 2 * SIZE (p13) fpma f13 = ALPHA_P, f58, f59 ;; (p12) stf8 [Y1] = f10, 2 * SIZE (p14) fpma f14 = ALPHA_P, f60, f61 ;; (p12) stf8 [Y1] = f11, 2 * SIZE (p15) FMA f15 = ALPHA, f62, f63 ;; (p13) stf8 [Y1] = f12, 2 * SIZE ;; (p13) stf8 [Y1] = f13, 2 * SIZE ;; (p14) stf8 [Y1] = f14, 2 * SIZE ;; (p15) stfs [Y1] = f15 br.ret.sptk.many b0 ;; .align 32 .L30: { .mmi cmp.eq p9, p0 = r0, J cmp.eq p7 ,p0 = 0, I mov ar.ec = 4 } { .mmi cmp.lt p12, p0 = 33, XA adds I = -1, I } ;; { .mmi cmp.gt p14, p0 = 15, XA cmp.lt p15, p0 = 60, XA (p12) cmp.gt.unc p13, p0 = 53, XA } { .bbb (p13) br.cond.dpnt .L40 (p14) br.cond.dpnt .L40 (p15) br.cond.dpnt .L40 } ;; { .mmi (p10) STFD [YYY] = f32 adds PRE1 = (PREFETCHSIZE + 6) * SIZE, X mov ar.lc = I } { .mib adds PRE2 = (PREFETCHSIZE + 0) * SIZE, Y tbit.z p0, p12 = N, 3 (p7) br.cond.dpnt .L35 } ;; .align 32 .L32: { .mmf (p19) STFD [Y1] = f6, 1 * SIZE (p19) STFD [Y2] = f7, 1 * SIZE (p18) FMA f6 = ALPHA, f34, f82 } { .mmf (p16) LDFPD f32, f35 = [X], 2 * SIZE (p16) LDFD f80 = [Y], 1 * SIZE 
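// This path appears to cover the Y alignment case that cannot use the packed stf8 stores:
// the update falls back to element-wise FMA, y[i] += alpha * x[i], using paired LDFPD
// loads from X and scalar STFD stores through Y1/Y2.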
(p18) FMA f7 = ALPHA, f46, f94 } ;; { .mmf (p19) STFD [Y1] = f10, 1 * SIZE (p19) STFD [Y2] = f11, 1 * SIZE (p18) FMA f10 = ALPHA, f37, f85 } { .mmf (p16) LDFPD f38, f41 = [X], 2 * SIZE (p16) LDFPD f83, f86 = [Y], 2 * SIZE (p18) FMA f11 = ALPHA, f49, f97 } ;; { .mmf (p19) STFD [Y1] = f12, 1 * SIZE (p19) STFD [Y2] = f13, 1 * SIZE (p18) FMA f12 = ALPHA, f40, f88 } { .mmf (p16) LDFPD f44, f47 = [X], 2 * SIZE (p16) LDFPD f89, f92 = [Y], 2 * SIZE (p18) FMA f13 = ALPHA, f52, f100 } ;; { .mmf (p19) STFD [Y1] = f14, 5 * SIZE (p19) STFD [Y2] = f15, 5 * SIZE (p18) FMA f14 = ALPHA, f43, f91 } { .mmf (p16) LDFPD f50, f53 = [X], 2 * SIZE (p16) LDFPD f95, f98 = [Y], 2 * SIZE (p18) FMA f15 = ALPHA, f55, f103 } ;; { .mmf (p18) STFD [Y1] = f6, 1 * SIZE (p18) STFD [Y2] = f7, 1 * SIZE (p18) FMA f6 = ALPHA, f58, f106 } { .mmf (p16) LDFPD f56, f59 = [X], 2 * SIZE (p16) LDFPD f101, f104 = [Y], 2 * SIZE (p18) FMA f7 = ALPHA, f70, f118 } ;; { .mmf (p18) STFD [Y1] = f10, 1 * SIZE (p18) STFD [Y2] = f11, 1 * SIZE (p18) FMA f10 = ALPHA, f61, f109 } { .mmf (p16) LDFPD f62, f65 = [X], 2 * SIZE (p16) LDFPD f107, f110 = [Y], 2 * SIZE (p18) FMA f11 = ALPHA, f73, f121 } ;; { .mmf (p18) STFD [Y1] = f12, 1 * SIZE (p18) STFD [Y2] = f13, 1 * SIZE (p18) FMA f12 = ALPHA, f64, f112 } { .mmf (p16) LDFPD f68, f71 = [X], 2 * SIZE (p16) LDFPD f113, f116 = [Y], 2 * SIZE (p18) FMA f13 = ALPHA, f76, f124 } ;; { .mmf (p18) STFD [Y1] = f14, 5 * SIZE (p18) STFD [Y2] = f15, 5 * SIZE (p18) FMA f14 = ALPHA, f67, f115 } { .mmf (p16) LDFPD f74, f77 = [X], 2 * SIZE (p16) LDFPD f119, f122 = [Y], 2 * SIZE (p18) FMA f15 = ALPHA, f79, f127 } ;; { .mmi (p16) lfetch.nt1 [PRE1], 16 * SIZE (p16) lfetch.excl.nt1 [PRE2], 16 * SIZE nop.i 0 } { .mmb (p16) LDFD f125 = [Y], 1 * SIZE nop.m 0 br.ctop.sptk.few .L32 } ;; .align 32 .L35: { .mmi (p12) LDFPD f32, f33 = [X], 2 * SIZE (p12) LDFD f34 = [Y], 1 * SIZE; mov pr = PR, -65474 } ;; { .mmi (p12) LDFPD f36, f37 = [X], 2 * SIZE (p12) LDFPD f35, f38 = [Y], 2 * SIZE mov ar.lc = ARLC } ;; { .mmb (p12) LDFPD f40, f41 = [X], 2 * SIZE (p12) LDFPD f39, f42 = [Y], 2 * SIZE (p9) br.ret.sptk.many b0 } ;; { .mmi (p12) LDFPD f44, f45 = [X], 2 * SIZE (p12) LDFPD f43, f46 = [Y], 2 * SIZE tbit.z p0, p13 = N, 2 } ;; { .mmi (p13) LDFPD f48, f49 = [X], 2 * SIZE (p12) LDFD f47 = [Y], 1 * SIZE tbit.z p0, p14 = N, 1 } ;; { .mmi (p13) LDFPD f52, f53 = [X], 2 * SIZE (p13) LDFD f50 = [Y], 1 * SIZE tbit.z p0, p15 = N, 0 } ;; { .mmi (p14) LDFPD f56, f57 = [X], 2 * SIZE (p13) LDFPD f51, f54 = [Y], 2 * SIZE mov YY = Y1; } ;; (p15) LDFD f60 = [X] (p13) LDFD f55 = [Y], 1 * SIZE ;; (p14) LDFD f58 = [Y], 1 * SIZE (p12) FMA f6 = ALPHA, f32, f34 (p12) FMA f7 = ALPHA, f40, f42 ;; (p14) LDFD f59 = [Y], 1 * SIZE (p12) shladd YY = INCY, 3, YY (p12) FMA f10 = ALPHA, f33, f35 (p12) FMA f11 = ALPHA, f41, f43 ;; (p15) LDFD f61 = [Y] (p13) shladd YY = INCY, 2, YY (p12) FMA f12 = ALPHA, f36, f38 (p12) FMA f13 = ALPHA, f44, f46 ;; (p12) STFD [Y1] = f6, 1 * SIZE (p12) FMA f14 = ALPHA, f37, f39 (p12) STFD [Y2] = f7, 1 * SIZE (p12) FMA f15 = ALPHA, f45, f47 ;; (p12) STFD [Y1] = f10, 1 * SIZE (p13) FMA f6 = ALPHA, f48, f50 (p12) STFD [Y2] = f11, 1 * SIZE (p14) FMA f7 = ALPHA, f56, f58 ;; (p12) STFD [Y1] = f12, 1 * SIZE (p13) FMA f10 = ALPHA, f49, f51 (p12) STFD [Y2] = f13, 1 * SIZE (p14) FMA f11 = ALPHA, f57, f59 ;; (p12) STFD [Y1] = f14, 5 * SIZE (p13) FMA f12 = ALPHA, f52, f54 (p12) STFD [Y2] = f15, 5 * SIZE (p15) FMA f13 = ALPHA, f60, f61 ;; (p13) STFD [Y1] = f6, 1 * SIZE (p14) STFD [YY] = f7, 1 * SIZE (p13) FMA f14 = ALPHA, f53, f55 ;; (p13) STFD [Y1] = 
f10, 1 * SIZE (p14) STFD [YY] = f11, 1 * SIZE ;; (p13) STFD [Y1] = f12, 1 * SIZE (p15) STFD [YY] = f13 ;; (p13) STFD [Y1] = f14 br.ret.sptk.many b0 ;; .align 32 .L40: { .mmi (p10) STFD [YYY] = f32 adds PRE1 = (PREFETCHSIZE + 38) * SIZE, X mov ar.lc = I } { .mib adds PRE2 = (PREFETCHSIZE + 14) * SIZE, Y tbit.z p0, p12 = N, 3 (p7) br.cond.dpnt .L45 } ;; .align 32 .L42: { .mmf (p19) STFD [Y1] = f6, 1 * SIZE (p19) STFD [Y2] = f7, 1 * SIZE (p18) FMA f6 = ALPHA, f34, f82 } { .mmf (p16) lfetch.nt1 [PRE1], 16 * SIZE (p17) LDFPD f102, f105 = [Y], 2 * SIZE (p18) FMA f7 = ALPHA, f46, f94 } ;; { .mmf (p19) STFD [Y1] = f10, 1 * SIZE (p19) STFD [Y2] = f11, 1 * SIZE (p18) FMA f10 = ALPHA, f37, f85 } { .mmf (p17) LDFPD f33, f36 = [X], 2 * SIZE (p17) LDFPD f108, f111 = [Y], 2 * SIZE (p18) FMA f11 = ALPHA, f49, f97 } ;; { .mmf (p19) STFD [Y1] = f12, 1 * SIZE (p19) STFD [Y2] = f13, 1 * SIZE (p18) FMA f12 = ALPHA, f40, f88 } { .mmf (p17) LDFPD f39, f42 = [X], 2 * SIZE (p17) LDFPD f114, f117 = [Y], 2 * SIZE (p18) FMA f13 = ALPHA, f52, f100 } ;; { .mmf (p19) STFD [Y1] = f14, 5 * SIZE (p19) STFD [Y2] = f15, 5 * SIZE (p18) FMA f14 = ALPHA, f43, f91 } { .mmf (p17) LDFPD f45, f48 = [X], 2 * SIZE (p17) LDFPD f120, f123 = [Y], 2 * SIZE (p18) FMA f15 = ALPHA, f55, f103 } ;; { .mmf (p18) STFD [Y1] = f6, 1 * SIZE (p18) STFD [Y2] = f7, 1 * SIZE (p18) FMA f6 = ALPHA, f58, f106 } { .mmf (p17) LDFPD f51, f54 = [X], 2 * SIZE (p17) LDFD f126 = [Y], 1 * SIZE (p18) FMA f7 = ALPHA, f70, f118 } ;; { .mmf (p18) STFD [Y1] = f10, 1 * SIZE (p18) STFD [Y2] = f11, 1 * SIZE (p18) FMA f10 = ALPHA, f61, f109 } { .mmf (p17) LDFPD f57, f60 = [X], 2 * SIZE (p16) LDFD f80 = [Y], 1 * SIZE (p18) FMA f11 = ALPHA, f73, f121 } ;; { .mmf (p18) STFD [Y1] = f12, 1 * SIZE (p18) STFD [Y2] = f13, 1 * SIZE (p18) FMA f12 = ALPHA, f64, f112 } { .mmf (p17) LDFPD f63, f66 = [X], 2 * SIZE (p16) LDFPD f83, f86 = [Y], 2 * SIZE (p18) FMA f13 = ALPHA, f76, f124 } ;; { .mmf (p18) STFD [Y1] = f14, 5 * SIZE (p18) STFD [Y2] = f15, 5 * SIZE (p18) FMA f14 = ALPHA, f67, f115 } { .mmf (p17) LDFPD f69, f72 = [X], 2 * SIZE (p16) LDFPD f89, f92 = [Y], 2 * SIZE (p18) FMA f15 = ALPHA, f79, f127 } ;; #if 0 (p16) lfetch.excl.nt1 [PRE2], 16 * SIZE #endif { .mmb (p17) LDFPD f75, f78 = [X], 2 * SIZE (p16) LDFPD f95, f98 = [Y], 2 * SIZE br.ctop.sptk.few .L42 } ;; { .mmf (p19) STFD [Y1] = f6, 1 * SIZE (p19) STFD [Y2] = f7, 1 * SIZE } ;; { .mmf (p19) STFD [Y1] = f10, 1 * SIZE (p19) STFD [Y2] = f11, 1 * SIZE } ;; { .mmf (p19) STFD [Y1] = f12, 1 * SIZE (p19) STFD [Y2] = f13, 1 * SIZE } ;; { .mmf (p19) STFD [Y1] = f14, 5 * SIZE (p19) STFD [Y2] = f15, 5 * SIZE } ;; .align 32 .L45: { .mmi (p12) LDFPD f32, f33 = [X], 2 * SIZE (p12) LDFD f34 = [Y], 1 * SIZE; mov pr = PR, -65474 } ;; { .mmi (p12) LDFPD f36, f37 = [X], 2 * SIZE (p12) LDFPD f35, f38 = [Y], 2 * SIZE mov ar.lc = ARLC } ;; { .mmb (p12) LDFPD f40, f41 = [X], 2 * SIZE (p12) LDFPD f39, f42 = [Y], 2 * SIZE (p9) br.ret.sptk.many b0 } ;; { .mmi (p12) LDFPD f44, f45 = [X], 2 * SIZE (p12) LDFPD f43, f46 = [Y], 2 * SIZE tbit.z p0, p13 = N, 2 } ;; { .mmi (p13) LDFPD f48, f49 = [X], 2 * SIZE (p12) LDFD f47 = [Y], 1 * SIZE tbit.z p0, p14 = N, 1 } ;; { .mmi (p13) LDFPD f52, f53 = [X], 2 * SIZE (p13) LDFD f50 = [Y], 1 * SIZE tbit.z p0, p15 = N, 0 } ;; { .mmi (p14) LDFPD f56, f57 = [X], 2 * SIZE (p13) LDFPD f51, f54 = [Y], 2 * SIZE mov YY = Y1; } ;; (p15) LDFD f60 = [X] (p13) LDFD f55 = [Y], 1 * SIZE ;; (p14) LDFD f58 = [Y], 1 * SIZE (p12) FMA f6 = ALPHA, f32, f34 (p12) FMA f7 = ALPHA, f40, f42 ;; (p14) LDFD f59 = [Y], 1 * SIZE (p12) shladd YY 
= INCY, 3, YY (p12) FMA f10 = ALPHA, f33, f35 (p12) FMA f11 = ALPHA, f41, f43 ;; (p15) LDFD f61 = [Y] (p13) shladd YY = INCY, 2, YY (p12) FMA f12 = ALPHA, f36, f38 (p12) FMA f13 = ALPHA, f44, f46 ;; (p12) STFD [Y1] = f6, 1 * SIZE (p12) FMA f14 = ALPHA, f37, f39 (p12) STFD [Y2] = f7, 1 * SIZE (p12) FMA f15 = ALPHA, f45, f47 ;; (p12) STFD [Y1] = f10, 1 * SIZE (p13) FMA f6 = ALPHA, f48, f50 (p12) STFD [Y2] = f11, 1 * SIZE (p14) FMA f7 = ALPHA, f56, f58 ;; (p12) STFD [Y1] = f12, 1 * SIZE (p13) FMA f10 = ALPHA, f49, f51 (p12) STFD [Y2] = f13, 1 * SIZE (p14) FMA f11 = ALPHA, f57, f59 ;; (p12) STFD [Y1] = f14, 5 * SIZE (p13) FMA f12 = ALPHA, f52, f54 (p12) STFD [Y2] = f15, 5 * SIZE (p15) FMA f13 = ALPHA, f60, f61 ;; (p13) STFD [Y1] = f6, 1 * SIZE (p14) STFD [YY] = f7, 1 * SIZE (p13) FMA f14 = ALPHA, f53, f55 ;; (p13) STFD [Y1] = f10, 1 * SIZE (p14) STFD [YY] = f11, 1 * SIZE ;; (p13) STFD [Y1] = f12, 1 * SIZE (p15) STFD [YY] = f13 ;; (p13) STFD [Y1] = f14 br.ret.sptk.many b0 ;; .align 32 .L100: { .mii and J = 15, N shr I = N, 4 mov ar.ec = 3 } ;; { .mmi cmp.eq p9, p0 = r0, J cmp.eq p7 ,p0 = 0, I adds I = -1, I } ;; { .mmi (p10) STFD [YYY] = f32 adds PRE1 = PREFETCHSIZE * SIZE, X mov ar.lc = I } { .mib adds PRE2 = PREFETCHSIZE * SIZE, Y tbit.z p0, p12 = N, 3 (p7) br.cond.dpnt .L115 } ;; .align 32 .L112: { .mmi (p18) STFD [Y1] = f6 (p16) lfetch.nt1 [PRE1], INCX16 (p18) add Y1 = INCY, Y1 } {.mmf (p16) LDFD f32 = [X], INCX (p16) LDFD f80 = [Y], INCY (p18) FMA f6 = ALPHA, f58, f106 } ;; { .mmi (p18) STFD [Y1] = f7 (p16) lfetch.excl.nt1 [PRE2], INCY16 (p18) add Y1 = INCY, Y1 } { .mmf (p16) LDFD f35 = [X], INCX (p16) LDFD f83 = [Y], INCY (p18) FMA f7 = ALPHA, f61, f109 } ;; { .mmi (p18) STFD [Y1] = f10 (p18) add Y1 = INCY, Y1 nop.i 0 } { .mmf (p16) LDFD f38 = [X], INCX (p16) LDFD f86 = [Y], INCY (p18) FMA f10 = ALPHA, f64, f112 } ;; { .mmi (p18) STFD [Y1] = f11 (p18) add Y1 = INCY, Y1 nop.i 0 } { .mmf (p16) LDFD f41 = [X], INCX (p16) LDFD f89 = [Y], INCY (p18) FMA f11 = ALPHA, f67, f115 } ;; { .mmi (p18) STFD [Y1] = f12 (p18) add Y1 = INCY, Y1 nop.i 0 } { .mmf (p16) LDFD f44 = [X], INCX (p16) LDFD f92 = [Y], INCY (p18) FMA f12 = ALPHA, f70, f118 } ;; { .mmi (p18) STFD [Y1] = f13 (p18) add Y1 = INCY, Y1 nop.i 0 } { .mmf (p16) LDFD f47 = [X], INCX (p16) LDFD f95 = [Y], INCY (p18) FMA f13 = ALPHA, f73, f121 } ;; { .mmi (p18) STFD [Y1] = f14 (p18) add Y1 = INCY, Y1 nop.i 0 } { .mmf (p16) LDFD f50 = [X], INCX (p16) LDFD f98 = [Y], INCY (p18) FMA f14 = ALPHA, f76, f124 } ;; { .mmi (p18) STFD [Y1] = f15 (p18) add Y1 = INCY, Y1 nop.i 0 } { .mmf (p16) LDFD f53 = [X], INCX (p16) LDFD f101 = [Y], INCY (p18) FMA f15 = ALPHA, f79, f127 } ;; { .mmi (p18) STFD [Y1] = f6 (p18) add Y1 = INCY, Y1 nop.i 0 } { .mmf (p16) LDFD f56 = [X], INCX (p16) LDFD f104 = [Y], INCY (p17) FMA f6 = ALPHA, f33, f81 } ;; { .mmi (p18) STFD [Y1] = f7 (p18) add Y1 = INCY, Y1 nop.i 0 } { .mmf (p16) LDFD f59 = [X], INCX (p16) LDFD f107 = [Y], INCY (p17) FMA f7 = ALPHA, f36, f84 } ;; { .mmi (p18) STFD [Y1] = f10 (p18) add Y1 = INCY, Y1 nop.i 0 } { .mmf (p16) LDFD f62 = [X], INCX (p16) LDFD f110 = [Y], INCY (p17) FMA f10 = ALPHA, f39, f87 } ;; { .mmi (p18) STFD [Y1] = f11 (p18) add Y1 = INCY, Y1 nop.i 0 } { .mmf (p16) LDFD f65 = [X], INCX (p16) LDFD f113 = [Y], INCY (p17) FMA f11 = ALPHA, f42, f90 } ;; { .mmi (p18) STFD [Y1] = f12 (p18) add Y1 = INCY, Y1 nop.i 0 } { .mmf (p16) LDFD f68 = [X], INCX (p16) LDFD f116 = [Y], INCY (p17) FMA f12 = ALPHA, f45, f93 } ;; { .mmi (p18) STFD [Y1] = f13 (p18) add Y1 = INCY, Y1 nop.i 0 } { .mmf (p16) LDFD f71 = 
[X], INCX (p16) LDFD f119 = [Y], INCY (p17) FMA f13 = ALPHA, f48, f96 } ;; { .mmi (p18) STFD [Y1] = f14 (p18) add Y1 = INCY, Y1 nop.i 0 } { .mmf (p16) LDFD f74 = [X], INCX (p16) LDFD f122 = [Y], INCY (p17) FMA f14 = ALPHA, f51, f99 } ;; { .mmf (p18) STFD [Y1] = f15 (p18) add Y1 = INCY, Y1 (p17) FMA f15 = ALPHA, f54, f102 } { .mmb (p16) LDFD f77 = [X], INCX (p16) LDFD f125 = [Y], INCY br.ctop.sptk.few .L112 } ;; .align 32 .L115: (p12) LDFD f32 = [X], INCX (p12) LDFD f34 = [Y], INCY mov pr = PR, -65474 ;; (p12) LDFD f33 = [X], INCX (p12) LDFD f35 = [Y], INCY mov ar.lc = ARLC ;; (p12) LDFD f36 = [X], INCX (p12) LDFD f38 = [Y], INCY (p9) br.ret.sptk.many b0 ;; (p12) LDFD f37 = [X], INCX (p12) LDFD f39 = [Y], INCY tbit.z p0, p13 = N, 2 ;; (p12) LDFD f40 = [X], INCX (p12) LDFD f42 = [Y], INCY tbit.z p0, p14 = N, 1 ;; (p12) LDFD f41 = [X], INCX (p12) LDFD f43 = [Y], INCY tbit.z p0, p15 = N, 0 ;; { .mmf (p12) LDFD f44 = [X], INCX (p12) LDFD f46 = [Y], INCY (p12) FMA f6 = ALPHA, f32, f34 } ;; { .mmf (p12) LDFD f45 = [X], INCX (p12) LDFD f47 = [Y], INCY (p12) FMA f7 = ALPHA, f33, f35 } ;; { .mmf (p13) LDFD f48 = [X], INCX (p13) LDFD f50 = [Y], INCY (p12) FMA f10 = ALPHA, f36, f38 } ;; { .mmf (p13) LDFD f49 = [X], INCX (p13) LDFD f51 = [Y], INCY (p12) FMA f11 = ALPHA, f37, f39 } ;; { .mmi (p12) STFD [Y1] = f6 (p12) add Y1 = INCY, Y1 nop.i 0 } { .mmf (p13) LDFD f52 = [X], INCX (p13) LDFD f54 = [Y], INCY (p12) FMA f12 = ALPHA, f40, f42 } ;; { .mmi (p12) STFD [Y1] = f7 (p12) add Y1 = INCY, Y1 nop.i 0 } { .mmf (p13) LDFD f53 = [X], INCX (p13) LDFD f55 = [Y], INCY (p12) FMA f13 = ALPHA, f41, f43 } ;; { .mmi (p12) STFD [Y1] = f10 (p12) add Y1 = INCY, Y1 nop.i 0 } { .mmf (p14) LDFD f56 = [X], INCX (p14) LDFD f58 = [Y], INCY (p12) FMA f14 = ALPHA, f44, f46 } ;; { .mmi (p12) STFD [Y1] = f11 (p12) add Y1 = INCY, Y1 nop.i 0 } { .mmf (p14) LDFD f57 = [X], INCX (p14) LDFD f59 = [Y], INCY (p12) FMA f15 = ALPHA, f45, f47 } ;; { .mmi (p12) STFD [Y1] = f12 (p12) add Y1 = INCY, Y1 nop.i 0 } { .mmf (p15) LDFD f60 = [X] (p15) LDFD f61 = [Y] (p13) FMA f6 = ALPHA, f48, f50 } ;; { .mmf (p12) STFD [Y1] = f13 (p12) add Y1 = INCY, Y1 (p13) FMA f7 = ALPHA, f49, f51 } ;; { .mmf (p12) STFD [Y1] = f14 (p12) add Y1 = INCY, Y1 (p13) FMA f10 = ALPHA, f52, f54 } ;; { .mmf (p12) STFD [Y1] = f15 (p12) add Y1 = INCY, Y1 (p13) FMA f11 = ALPHA, f53, f55 } ;; { .mmf (p13) STFD [Y1] = f6 (p13) add Y1 = INCY, Y1 (p14) FMA f12 = ALPHA, f56, f58 } ;; { .mmf (p13) STFD [Y1] = f7 (p13) add Y1 = INCY, Y1 (p14) FMA f13 = ALPHA, f57, f59 } ;; { .mmf (p13) STFD [Y1] = f10 (p13) add Y1 = INCY, Y1 (p15) FMA f14 = ALPHA, f60, f61 } ;; (p13) STFD [Y1] = f11 (p13) add Y1 = INCY, Y1 ;; (p14) STFD [Y1] = f12 (p14) add Y1 = INCY, Y1 ;; (p14) STFD [Y1] = f13 (p14) add Y1 = INCY, Y1 ;; (p15) STFD [Y1] = f14 br.ret.sptk.many b0 ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/scal.S000066400000000000000000000372741313527062700162520ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef DOUBLE #define PREFETCH_SIZE (8 * 16) #else #define PREFETCH_SIZE (1 * 64) #endif #define ALPHA f8 #define N r32 #define X1 r36 #define INCX r37 #define X2 r14 #define Y1 r15 #define Y2 r16 #define PRE1 r17 #define I r18 #define NAND15 r19 #define INCX5 r20 #define INCX16 r21 #define XX r22 #define PR r30 #define ARLC r31 PROLOGUE .prologue PROFCODE { .mfi shladd INCX = INCX, BASE_SHIFT, r0 fcmp.eq p0, p6 = ALPHA, f0 .save ar.lc, ARLC mov ARLC = ar.lc } { .mib cmp.ge p7, p0 = 0, N tbit.z p0, p10 = X1, BASE_SHIFT (p7) br.ret.sptk.many b0 } .body ;; { .mmi mov XX = X1 (p10) LDFD f32 = [X1], INCX mov PR = pr } { .mmi shladd INCX5 = INCX, 2, INCX shladd INCX16 = INCX, 4, r0 (p10) adds N = -1, N } ;; { .mmi shladd X2 = INCX, 2, X1 nop __LINE__ mov ar.ec = 5 } { .mmi and NAND15 = 15, N nop __LINE__ shr I = N, 4 } ;; { .mmi adds I = -1, I nop __LINE__ tbit.z p0, p12 = N, 3 } { .mmb cmp.ge p9, p0 = 0, NAND15 adds PRE1 = PREFETCH_SIZE * SIZE + 192, XX (p6) br.cond.dptk .L100 // if (alpha != 0) goto L3 } ;; { .mmi (p10) STFD [XX] = f0 nop __LINE__ mov ar.lc = I } { .mmb cmp.gt p8, p0 = 0, I (p8) br.cond.dpnt .L30 } ;; .align 32 .L20: {.mmi STFD [X1] = f0 STFD [X2] = f0 nop __LINE__ } {.mmi lfetch.excl.nt1 [PRE1], INCX16 add X1 = INCX, X1 add X2 = INCX, X2 } ;; {.mmi STFD [X1] = f0 STFD [X2] = f0 nop __LINE__ } {.mmi add X1 = INCX, X1 add X2 = INCX, X2 nop __LINE__ } ;; {.mmi STFD [X1] = f0 STFD [X2] = f0 nop __LINE__ } {.mmi add X1 = INCX, X1 add X2 = INCX, X2 nop __LINE__ } ;; {.mmi STFD [X1] = f0 STFD [X2] = f0 nop __LINE__ } {.mmi add X1 = INCX5, X1 add X2 = INCX5, X2 nop __LINE__ } ;; {.mmi STFD [X1] = f0 STFD [X2] = f0 nop __LINE__ } {.mmi add X1 = INCX, X1 add X2 = INCX, X2 nop __LINE__ } ;; {.mmi STFD [X1] = f0 STFD [X2] = f0 nop __LINE__ } {.mmi add X1 = INCX, X1 add X2 = INCX, X2 nop __LINE__ } ;; {.mmi STFD [X1] = f0 STFD [X2] = f0 nop __LINE__ } {.mmi add X1 = INCX, X1 add X2 = INCX, X2 nop __LINE__ } ;; {.mmi STFD [X1] = f0 STFD [X2] = f0 nop __LINE__ } {.mmb add X1 = INCX5, X1 add X2 = INCX5, X2 br.cloop.sptk.few .L20 } ;; .align 16 .L30: { .mmi (p12) STFD [X1] 
= f0 (p12) STFD [X2] = f0 mov ar.lc = ARLC } { .mmb (p12) add X1 = INCX, X1 (p12) add X2 = INCX, X2 (p9) br.ret.sptk.many b0 } ;; { .mmi (p12) STFD [X1] = f0 (p12) add X1 = INCX, X1 tbit.z p0, p13 = N, 2 } { .mmi (p12) STFD [X2] = f0 (p12) add X2 = INCX, X2 tbit.z p0, p14 = N, 1 } ;; { .mmi (p12) STFD [X1] = f0 (p12) add X1 = INCX, X1 tbit.z p0, p15 = N, 0 } { .mmb (p12) STFD [X2] = f0 (p12) add X2 = INCX, X2 nop __LINE__ } ;; { .mmb (p12) STFD [X1] = f0 (p12) add X1 = INCX5, X1 nop __LINE__ } { .mmb (p12) STFD [X2] = f0 (p12) add X2 = INCX5, X2 nop __LINE__ } ;; { .mmb (p13) STFD [X1] = f0 (p13) add X1 = INCX, X1 nop __LINE__ } ;; { .mmb (p13) STFD [X1] = f0 (p13) add X1 = INCX, X1 nop __LINE__ } ;; { .mmb (p13) STFD [X1] = f0 (p13) add X1 = INCX, X1 nop __LINE__ } ;; { .mmb (p13) STFD [X1] = f0 (p13) add X1 = INCX, X1 nop __LINE__ } ;; { .mmb (p14) STFD [X1] = f0 (p14) add X1 = INCX, X1 nop __LINE__ } ;; { .mmb (p14) STFD [X1] = f0 (p14) add X1 = INCX, X1 nop __LINE__ } ;; { .mmb (p15) STFD [X1] = f0 nop __LINE__ br.ret.sptk.many b0 } ;; .align 32 .L100: { .mmi mov Y1 = X1 shladd Y2 = INCX, 2, X1 mov pr.rot = 0 } { .mmf cmp.gt p8, p0 = 0, I shladd X2 = INCX, 2, X1 (p10) FMPY f32 = ALPHA, f32 } ;; { .mmi (p10) STFD [XX] = f32 cmp.eq p0, p7 = SIZE, INCX mov ar.lc = I } { .mbb cmp.eq p16, p0 = r0, r0 (p7) br.cond.dpnt .L300 (p8) br.cond.dpnt .L120 } ;; .align 32 .L110: { .mmf (p21) STFD [Y1] = f6, 1 * SIZE (p21) STFD [Y2] = f7, 1 * SIZE (p20) FMPY f112 = ALPHA, f36 } { .mmf (p16) lfetch.excl.nt1 [PRE1], 16 * SIZE (p16) LDFPD f32, f37 = [X1], 2 * SIZE (p20) FMPY f113 = ALPHA, f56 } ;; { .mmf (p21) STFD [Y1] = f10, 1 * SIZE (p21) STFD [Y2] = f11, 1 * SIZE (p20) FMPY f114 = ALPHA, f41 } { .mfi (p16) LDFPD f42, f47 = [X1], 2 * SIZE (p20) FMPY f115 = ALPHA, f61 nop __LINE__ } ;; { .mmf (p21) STFD [Y1] = f12, 1 * SIZE (p21) STFD [Y2] = f13, 1 * SIZE (p20) FMPY f116 = ALPHA, f46 } { .mfi (p16) LDFPD f52, f57 = [X1], 2 * SIZE (p20) FMPY f117 = ALPHA, f66 nop __LINE__ } ;; { .mmf (p21) STFD [Y1] = f14, 5 * SIZE (p21) STFD [Y2] = f15, 5 * SIZE (p20) FMPY f118 = ALPHA, f51 } { .mfi (p16) LDFPD f62, f67 = [X1], 2 * SIZE (p20) FMPY f119 = ALPHA, f71 nop __LINE__ } ;; { .mmf (p20) STFD [Y1] = f112, 1 * SIZE (p20) STFD [Y2] = f113, 1 * SIZE (p20) FMPY f6 = ALPHA, f76 } { .mfi (p16) LDFPD f72, f77 = [X1], 2 * SIZE (p20) FMPY f7 = ALPHA, f96 nop __LINE__ } ;; { .mmf (p20) STFD [Y1] = f114, 1 * SIZE (p20) STFD [Y2] = f115, 1 * SIZE (p20) FMPY f10 = ALPHA, f81 } { .mfi (p16) LDFPD f82, f87 = [X1], 2 * SIZE (p20) FMPY f11 = ALPHA, f101 nop __LINE__ } ;; { .mmf (p20) STFD [Y1] = f116, 1 * SIZE (p20) STFD [Y2] = f117, 1 * SIZE (p20) FMPY f12 = ALPHA, f86 } { .mfi (p16) LDFPD f92, f97 = [X1], 2 * SIZE (p20) FMPY f13 = ALPHA, f106 (p20) shladd X2 = INCX, 2, X1 } ;; { .mmf (p20) STFD [Y1] = f118, 5 * SIZE (p20) STFD [Y2] = f119, 5 * SIZE (p20) FMPY f14 = ALPHA, f91 } { .mfb (p16) LDFPD f102, f107 = [X1], 2 * SIZE (p20) FMPY f15 = ALPHA, f111 br.ctop.sptk.few .L110 } ;; .align 32 .L120: { .mmi (p21) STFD [Y1] = f6, 1 * SIZE (p21) STFD [Y2] = f7, 1 * SIZE tbit.z p0, p13 = N, 2 } { .mmi (p12) LDFPD f32, f33 = [X1], 2 * SIZE (p12) LDFPD f36, f37 = [X2], 2 * SIZE nop __LINE__ } ;; { .mmi (p21) STFD [Y1] = f10, 1 * SIZE (p21) STFD [Y2] = f11, 1 * SIZE mov ar.lc = ARLC } { .mmi (p12) LDFPD f34, f35 = [X1] (p12) LDFPD f38, f39 = [X2] (p12) adds X1 = 6 * SIZE,X1 } ;; { .mmi (p21) STFD [Y1] = f12, 1 * SIZE (p21) STFD [Y2] = f13, 1 * SIZE tbit.z p0, p14 = N, 1 } { .mmi (p13) LDFPD f40, f41 = [X1], 2 * SIZE nop __LINE__ nop 
__LINE__ } ;; { .mmi (p21) STFD [Y1] = f14, 5 * SIZE (p21) STFD [Y2] = f15, 5 * SIZE mov pr = PR, -65474 } { .mib (p13) LDFPD f42, f43 = [X1], 2 * SIZE nop __LINE__ (p9) br.ret.sptk.many b0 } ;; { .mmi (p14) LDFPD f44, f45 = [X1], 2 * SIZE nop __LINE__ tbit.z p0, p15 = N, 0 } ;; { .mmi (p15) LDFD f46 = [X1] nop __LINE__ nop __LINE__ } ;; { .mmf nop __LINE__ nop __LINE__ (p12) FMPY f32 = ALPHA, f32 } { .mmf nop __LINE__ nop __LINE__ (p12) FMPY f36 = ALPHA, f36 } ;; { .mmf nop __LINE__ nop __LINE__ (p12) FMPY f33 = ALPHA, f33 } { .mmf nop __LINE__ nop __LINE__ (p12) FMPY f37 = ALPHA, f37 } ;; { .mmf nop __LINE__ nop __LINE__ (p12) FMPY f34 = ALPHA, f34 } { .mmf nop __LINE__ nop __LINE__ (p12) FMPY f38 = ALPHA, f38 } ;; { .mmf nop __LINE__ nop __LINE__ (p12) FMPY f35 = ALPHA, f35 } { .mmf nop __LINE__ nop __LINE__ (p12) FMPY f39 = ALPHA, f39 } ;; { .mmf (p12) STFD [Y1] = f32, 1 * SIZE nop __LINE__ (p13) FMPY f40 = ALPHA, f40 } { .mmf (p12) STFD [Y2] = f36, 1 * SIZE nop __LINE__ (p13) FMPY f41 = ALPHA, f41 } ;; { .mmf (p12) STFD [Y1] = f33, 1 * SIZE nop __LINE__ (p13) FMPY f42 = ALPHA, f42 } { .mmf (p12) STFD [Y2] = f37, 1 * SIZE nop __LINE__ (p13) FMPY f43 = ALPHA, f43 } ;; { .mmf (p12) STFD [Y1] = f34, 1 * SIZE nop __LINE__ (p14) FMPY f44 = ALPHA, f44 } { .mmf (p12) STFD [Y2] = f38, 1 * SIZE nop __LINE__ (p14) FMPY f45 = ALPHA, f45 } ;; { .mmf (p12) STFD [Y1] = f35, 5 * SIZE (p12) STFD [Y2] = f39, 5 * SIZE (p15) FMPY f46 = ALPHA, f46 } ;; { .mmi (p13) STFD [Y1] = f40, 1 * SIZE ;; (p13) STFD [Y1] = f41, 1 * SIZE nop __LINE__ } ;; { .mmi (p13) STFD [Y1] = f42, 1 * SIZE ;; (p13) STFD [Y1] = f43, 1 * SIZE nop __LINE__ } ;; { .mmi (p14) STFD [Y1] = f44, 1 * SIZE ;; (p14) STFD [Y1] = f45, 1 * SIZE nop __LINE__ } ;; { .mmb (p15) STFD [Y1] = f46 nop __LINE__ br.ret.sptk.many b0 } ;; .align 32 .L300: { .mmi adds PRE1 = PREFETCH_SIZE * SIZE + 64, X1 nop __LINE__ mov.i ar.ec = 6 } { .mmb cmp.gt p8, p0 = 0, I nop __LINE__ (p8) br.cond.dpnt .L320 } ;; .align 32 .L310: { .mmf (p16) lfetch.excl.nt1 [PRE1], INCX16 (p16) LDFD f32 = [X1], INCX (p21) FMPY f6 = ALPHA, f37 } { .mmb (p22) STFD [Y1] = f12 (p22) add Y1 = INCX, Y1 nop __LINE__ } ;; { .mfb (p16) LDFD f38 = [X1], INCX (p21) FMPY f7 = ALPHA, f43 nop __LINE__ } { .mmb (p22) STFD [Y1] = f13 (p22) add Y1 = INCX, Y1 nop __LINE__ } ;; { .mfb (p16) LDFD f44 = [X1], INCX (p21) FMPY f10 = ALPHA, f49 nop __LINE__ } { .mmb (p22) STFD [Y1] = f14 (p22) add Y1 = INCX, Y1 nop __LINE__ } ;; { .mfb (p16) LDFD f50 = [X1], INCX (p21) FMPY f11 = ALPHA, f55 nop __LINE__ } { .mmb (p22) STFD [Y1] = f15 (p22) add Y1 = INCX, Y1 nop __LINE__ } ;; { .mfb (p16) LDFD f56 = [X1], INCX (p21) FMPY f12 = ALPHA, f61 nop __LINE__ } { .mmb (p21) STFD [Y1] = f6 (p21) add Y1 = INCX, Y1 nop __LINE__ } ;; { .mfb (p16) LDFD f62 = [X1], INCX (p21) FMPY f13 = ALPHA, f67 nop __LINE__ } { .mmb (p21) STFD [Y1] = f7 (p21) add Y1 = INCX, Y1 nop __LINE__ } ;; { .mfb (p16) LDFD f68 = [X1], INCX (p21) FMPY f14 = ALPHA, f73 nop __LINE__ } { .mmb (p21) STFD [Y1] = f10 (p21) add Y1 = INCX, Y1 nop __LINE__ } ;; { .mfb (p16) LDFD f74 = [X1], INCX (p21) FMPY f15 = ALPHA, f79 nop __LINE__ } { .mmb (p21) STFD [Y1] = f11 (p21) add Y1 = INCX, Y1 nop __LINE__ } ;; { .mfb (p16) LDFD f80 = [X1], INCX (p21) FMPY f6 = ALPHA, f85 nop __LINE__ } { .mmb (p21) STFD [Y1] = f12 (p21) add Y1 = INCX, Y1 nop __LINE__ } ;; { .mfb (p16) LDFD f86 = [X1], INCX (p21) FMPY f7 = ALPHA, f91 nop __LINE__ } { .mmb (p21) STFD [Y1] = f13 (p21) add Y1 = INCX, Y1 nop __LINE__ } ;; { .mfb (p16) LDFD f92 = [X1], INCX (p21) FMPY f10 = ALPHA, 
f97 nop __LINE__ } { .mmb (p21) STFD [Y1] = f14 (p21) add Y1 = INCX, Y1 nop __LINE__ } ;; { .mfb (p16) LDFD f98 = [X1], INCX (p21) FMPY f11 = ALPHA, f103 nop __LINE__ } { .mmb (p21) STFD [Y1] = f15 (p21) add Y1 = INCX, Y1 nop __LINE__ } ;; { .mfb (p16) LDFD f104 = [X1], INCX (p21) FMPY f12 = ALPHA, f109 nop __LINE__ } { .mmb (p21) STFD [Y1] = f6 (p21) add Y1 = INCX, Y1 nop __LINE__ } ;; { .mfb (p16) LDFD f110 = [X1], INCX (p21) FMPY f13 = ALPHA, f115 nop __LINE__ } { .mmb (p21) STFD [Y1] = f7 (p21) add Y1 = INCX, Y1 nop __LINE__ } ;; { .mfb (p16) LDFD f116 = [X1], INCX (p21) FMPY f14 = ALPHA, f121 nop __LINE__ } { .mmb (p21) STFD [Y1] = f10 (p21) add Y1 = INCX, Y1 nop __LINE__ } ;; { .mfb (p16) LDFD f122 = [X1], INCX (p21) FMPY f15 = ALPHA, f127 nop __LINE__ } { .mmb (p21) STFD [Y1] = f11 (p21) add Y1 = INCX, Y1 br.ctop.sptk.few .L310 } ;; STFD [Y1] = f12 add Y1 = INCX, Y1 shladd Y2 = INCX, 2, X1 ;; STFD [Y1] = f13 add Y1 = INCX, Y1 shladd X2 = INCX, 2, X1 ;; STFD [Y1] = f14 add Y1 = INCX, Y1 ;; STFD [Y1] = f15 add Y1 = INCX, Y1 ;; .align 16 .L320: { .mmi (p12) LDFD f48 = [X1], INCX (p12) LDFD f52 = [X2], INCX mov ar.lc = ARLC } ;; { .mmi (p12) LDFD f49 = [X1], INCX (p12) LDFD f53 = [X2], INCX mov pr = PR, -65474 } { .mmb nop.m 0 nop.m 0 (p9) br.ret.sptk.many b0 } ;; { .mmi (p12) LDFD f50 = [X1], INCX (p12) LDFD f54 = [X2], INCX tbit.z p0, p13 = N, 2 } ;; { .mmi (p12) LDFD f51 = [X1], INCX5 (p12) LDFD f55 = [X2], INCX5 tbit.z p0, p14 = N, 1 } ;; (p13) LDFD f56 = [X1], INCX tbit.z p0, p15 = N, 0 ;; (p13) LDFD f57 = [X1], INCX ;; { .mfi (p13) LDFD f58 = [X1], INCX (p12) FMPY f48 = ALPHA, f48 } { .mfi (p12) FMPY f52 = ALPHA, f52 } ;; { .mfi (p13) LDFD f59 = [X1], INCX (p12) FMPY f49 = ALPHA, f49 } { .mfi (p12) FMPY f53 = ALPHA, f53 } ;; { .mfi (p14) LDFD f60 = [X1], INCX (p12) FMPY f50 = ALPHA, f50 } { .mfi (p12) FMPY f54 = ALPHA, f54 } ;; { .mfi (p14) LDFD f61 = [X1], INCX (p12) FMPY f51 = ALPHA, f51 } { .mfi (p12) FMPY f55 = ALPHA, f55 } ;; { .mmf (p12) STFD [Y1] = f48 (p12) STFD [Y2] = f52 (p13) FMPY f56 = ALPHA, f56 } { .mmi (p15) LDFD f62 = [X1] (p12) add Y1 = INCX, Y1 (p12) add Y2 = INCX, Y2 } ;; { .mmf (p12) STFD [Y1] = f49 (p12) STFD [Y2] = f53 (p13) FMPY f57 = ALPHA, f57 } { .mmi (p12) add Y1 = INCX, Y1 (p12) add Y2 = INCX, Y2 nop __LINE__ } ;; { .mmf (p12) STFD [Y1] = f50 (p12) STFD [Y2] = f54 (p13) FMPY f58 = ALPHA, f58 } { .mmi (p12) add Y1 = INCX, Y1 (p12) add Y2 = INCX, Y2 nop __LINE__ } ;; { .mmf (p12) STFD [Y1] = f51 (p12) STFD [Y2] = f55 (p13) FMPY f59 = ALPHA, f59 } { .mmi (p12) add Y1 = INCX5, Y1 (p12) add Y2 = INCX5, Y2 nop __LINE__ } ;; { .mfi (p13) STFD [Y1] = f56 (p14) FMPY f60 = ALPHA, f60 (p13) add Y1 = INCX, Y1 } ;; { .mfi (p13) STFD [Y1] = f57 (p14) FMPY f61 = ALPHA, f61 (p13) add Y1 = INCX, Y1 } ;; { .mfi (p13) STFD [Y1] = f58 (p15) FMPY f62 = ALPHA, f62 (p13) add Y1 = INCX, Y1 } ;; { .mmi (p13) STFD [Y1] = f59 (p13) add Y1 = INCX, Y1 } ;; { .mmi (p14) STFD [Y1] = f60 (p14) add Y1 = INCX, Y1 } ;; { .mmi (p14) STFD [Y1] = f61 (p14) add Y1 = INCX, Y1 } ;; { .mib (p15) STFD [Y1] = f62 mov pr = PR, -65474 br.ret.sptk.many b0 } EPILOGUE OpenBLAS-0.2.20/kernel/ia64/sdot.S000066400000000000000000000557531313527062700163030ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCH_SIZE (8 * 16 + 4) #define N r32 #define X1 r33 #define INCX r34 #define Y1 r35 #define INCY r36 #define PREX r2 #define PREY r3 #define I r14 #define J r15 #define Y2 r16 #define X2 r17 #define INCX16 r18 #define INCY16 r19 #define INCX5 r20 #define INCY5 r21 #define YY r22 #define XA r23 #define YA r24 #define XX r25 #define PR r30 #define ARLC r31 PROLOGUE .prologue PROFCODE { .mfi nop.m 0 mov f8 = f0 .save ar.lc, ARLC mov ARLC = ar.lc } { .mfi mov r26 = 1 mov f9 = f0 shr XA = X1, 3 } ;; .body #ifdef F_INTERFACE LDINT N = [N] LDINT INCX = [INCX] LDINT INCY = [INCY] ;; #ifndef USE64BITINT sxt4 N = N sxt4 INCX = INCX sxt4 INCY = INCY ;; #endif cmp.le p0, p6 = r0, INCX cmp.le p0, p7 = r0, INCY sub r26 = r26, N ;; setf.sig f32 = r26 setf.sig f33 = INCX setf.sig f34 = INCY ;; xmpy.l f33 = f32, f33 xmpy.l f34 = f32, f34 ;; getf.sig r26 = f33 getf.sig r27 = f34 ;; (p6) shladd X1 = r26, BASE_SHIFT, X1 (p7) shladd Y1 = r27, BASE_SHIFT, Y1 ;; #endif { .mfi shladd INCX = INCX, BASE_SHIFT, r0 mov f32 = f0 mov PR = pr } { .mfb cmp.lt p0, p6 = r0, N mov f80 = f0 (p6) br.ret.sptk.many b0 } ;; { .mfi shladd INCY = INCY, BASE_SHIFT, r0 mov f10 = f0 tbit.nz p15, p0 = X1, BASE_SHIFT } { .mfb cmp.ne p6, p0 = SIZE, INCX mov f11 = f0 (p6) br.cond.dptk .L100 } ;; { .mfi (p15) LDFD f32 = [X1], INCX mov f12 = f0 mov pr.rot= 0 } { .mfi (p15) adds N = -1, N mov f13 = f0 shr YA = Y1, 3 } ;; { .mfi (p15) LDFD f80 = [Y1], INCY mov f14 = f0 shr I = N, 4 } { .mmi and J = 15, N and XA = 0x1f, XA and YA = 0x1f, YA } ;; { .mmi shladd INCX5 = INCX, 2, INCX shladd INCY5 = INCY, 2, INCY sub XA = YA, XA } { .mmi shladd INCX16 = INCX, 4, r0 shladd INCY16 = INCY, 4, r0 tbit.z p0, p12 = N, 3 } ;; { .mmi shladd Y2 = INCY, 2, Y1 cmp.eq p7, p0 = r0, J mov ar.ec= 3 } { 
.mmi adds I = -1, I cmp.ge p8, p0 = 4, XA cmp.eq p16, p0 = r0, r0 } ;; { .mbb cmp.le p9, p0 = 24, XA (p8) br.cond.dpnt .L20 (p9) br.cond.dpnt .L20 } ;; { .mmi adds PREX = PREFETCH_SIZE * SIZE, X1 adds PREY = (PREFETCH_SIZE + 6) * SIZE, Y1 mov ar.lc = I } { .mfb cmp.eq p6 ,p0 = -1, I FMA f15 = f32, f80, f0 (p6) br.cond.dpnt .L15 } ;; .align 32 /* INCX == 1 && X is aligned */ .L12: { .mmf (p16) LDFPD f32, f35 = [X1], 2 * SIZE (p16) lfetch.nt1 [PREX], INCX16 (p18) FMA f8 = f34, f82, f8 } { .mmf (p16) LDFD f80 = [Y1], INCY (p16) LDFD f92 = [Y2], INCY (p18) FMA f9 = f37, f85, f9 } ;; { .mmf (p16) LDFPD f38, f41 = [X1], 2 * SIZE (p16) lfetch.nt1 [PREY], INCY16 (p18) FMA f10 = f40, f88, f10 } { .mmf (p16) LDFD f83 = [Y1], INCY (p16) LDFD f95 = [Y2], INCY (p18) FMA f11 = f43, f91, f11 } ;; { .mmf (p16) LDFPD f44, f47 = [X1], 2 * SIZE (p18) FMA f12 = f46, f94, f12 } { .mmf (p16) LDFD f86 = [Y1], INCY (p16) LDFD f98 = [Y2], INCY (p18) FMA f13 = f49, f97, f13 } ;; { .mmf (p16) LDFPD f50, f53 = [X1], 2 * SIZE (p18) FMA f14 = f52, f100, f14 } { .mmf (p16) LDFD f89 = [Y1], INCY5 (p16) LDFD f101 = [Y2], INCY5 (p18) FMA f15 = f55, f103, f15 } ;; { .mmf (p16) LDFPD f56, f59 = [X1], 2 * SIZE (p18) FMA f8 = f58, f106, f8 } { .mmf (p16) LDFD f104 = [Y1], INCY (p16) LDFD f116 = [Y2], INCY (p18) FMA f9 = f61, f109, f9 } ;; { .mmf (p16) LDFPD f62, f65 = [X1], 2 * SIZE (p18) FMA f10 = f64, f112, f10 } { .mmf (p16) LDFD f107 = [Y1], INCY (p16) LDFD f119 = [Y2], INCY (p18) FMA f11 = f67, f115, f11 } ;; { .mmf (p16) LDFPD f68, f71 = [X1], 2 * SIZE (p18) FMA f12 = f70, f118, f12 } { .mmf (p16) LDFD f110 = [Y1], INCY (p16) LDFD f122 = [Y2], INCY (p18) FMA f13 = f73, f121, f13 } ;; { .mmf (p16) LDFPD f74, f77 = [X1], 2 * SIZE (p16) LDFD f113 = [Y1], INCY5 (p18) FMA f14 = f76, f124, f14 } { .mfb (p16) LDFD f125 = [Y2], INCY5 (p18) FMA f15 = f79, f127, f15 br.ctop.sptk.few .L12 } ;; .align 32 .L15: { .mmi (p12) LDFPD f32, f33 = [X1], 2 * SIZE mov YY = Y1 tbit.z p0, p13 = N, 2 } { .mmb (p12) LDFD f34 = [Y1], INCY (p12) LDFD f42 = [Y2], INCY (p7) br.cond.dptk .L999 } ;; { .mmi (p12) LDFPD f36, f37 = [X1], 2 * SIZE (p12) shladd YY = INCY, 3, YY tbit.z p0, p14 = N, 1 } { .mmi (p12) LDFD f35 = [Y1], INCY (p12) LDFD f43 = [Y2], INCY tbit.z p0, p15 = N, 0 } ;; { .mmi (p12) LDFPD f40, f41 = [X1], 2 * SIZE (p13) shladd YY = INCY, 2, YY } { .mmi (p12) LDFD f38 = [Y1], INCY (p12) LDFD f46 = [Y2], INCY } ;; (p12) LDFPD f44, f45 = [X1], 2 * SIZE (p12) LDFD f39 = [Y1], INCY5 (p12) LDFD f47 = [Y2], INCY5 ;; (p13) LDFPD f48, f49 = [X1], 2 * SIZE (p13) LDFD f50 = [Y1], INCY (p14) LDFD f58 = [YY], INCY ;; (p13) LDFPD f52, f53 = [X1], 2 * SIZE (p13) LDFD f51 = [Y1], INCY (p14) LDFD f59 = [YY], INCY ;; (p14) LDFPD f56, f57 = [X1], 2 * SIZE (p13) LDFD f54 = [Y1], INCY (p15) LDFD f61 = [YY] ;; (p13) LDFD f55 = [Y1], INCY (p15) LDFD f60 = [X1] ;; (p12) FMA f8 = f32, f34, f8 (p12) FMA f9 = f33, f35, f9 (p12) FMA f10 = f36, f38, f10 (p12) FMA f11 = f37, f39, f11 (p12) FMA f12 = f40, f42, f12 (p12) FMA f13 = f41, f43, f13 (p12) FMA f14 = f44, f46, f14 (p12) FMA f15 = f45, f47, f15 ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f49, f51, f9 (p13) FMA f10 = f52, f54, f10 (p13) FMA f11 = f53, f55, f11 (p14) FMA f12 = f56, f58, f12 (p14) FMA f13 = f57, f59, f13 (p15) FMA f14 = f60, f61, f14 br .L999 ;; .align 32 .L20: { .mmi adds PREX = PREFETCH_SIZE * SIZE, X1 adds PREY = (PREFETCH_SIZE + 38) * SIZE, Y1 mov ar.lc = I } { .mfb cmp.eq p6 ,p0 = -1, I FMA f15 = f32, f80, f0 (p6) br.cond.dpnt .L25 } ;; .align 32 .L22: { .mmf (p16) LDFPD f32, f35 = [X1], 2 
* SIZE (p16) lfetch.nt1 [PREX], INCX16 (p18) FMA f8 = f34, f82, f8 } { .mmf (p17) LDFD f81 = [Y1], INCY (p17) LDFD f93 = [Y2], INCY (p18) FMA f9 = f37, f85, f9 } ;; { .mmf (p16) LDFPD f38, f41 = [X1], 2 * SIZE (p16) lfetch.nt1 [PREY], INCY16 (p18) FMA f10 = f40, f88, f10 } { .mmf (p17) LDFD f84 = [Y1], INCY (p17) LDFD f96 = [Y2], INCY (p18) FMA f11 = f43, f91, f11 } ;; { .mmf (p16) LDFPD f44, f47 = [X1], 2 * SIZE (p18) FMA f12 = f46, f94, f12 } { .mmf (p17) LDFD f87 = [Y1], INCY (p17) LDFD f99 = [Y2], INCY (p18) FMA f13 = f49, f97, f13 } ;; { .mmf (p16) LDFPD f50, f53 = [X1], 2 * SIZE (p18) FMA f14 = f52, f100, f14 } { .mmf (p17) LDFD f90 = [Y1], INCY5 (p17) LDFD f102 = [Y2], INCY5 (p18) FMA f15 = f55, f103, f15 } ;; { .mmf (p16) LDFPD f56, f59 = [X1], 2 * SIZE (p18) FMA f8 = f58, f106, f8 } { .mmf (p17) LDFD f105 = [Y1], INCY (p17) LDFD f117 = [Y2], INCY (p18) FMA f9 = f61, f109, f9 } ;; { .mmf (p16) LDFPD f62, f65 = [X1], 2 * SIZE (p18) FMA f10 = f64, f112, f10 } { .mmf (p17) LDFD f108 = [Y1], INCY (p17) LDFD f120 = [Y2], INCY (p18) FMA f11 = f67, f115, f11 } ;; { .mmf (p16) LDFPD f68, f71 = [X1], 2 * SIZE (p18) FMA f12 = f70, f118, f12 } { .mmf (p17) LDFD f111 = [Y1], INCY (p17) LDFD f123 = [Y2], INCY (p18) FMA f13 = f73, f121, f13 } ;; { .mmf (p16) LDFPD f74, f77 = [X1], 2 * SIZE (p17) LDFD f114 = [Y1], INCY5 (p18) FMA f14 = f76, f124, f14 } { .mfb (p17) LDFD f126 = [Y2], INCY5 (p18) FMA f15 = f79, f127, f15 br.ctop.sptk.few .L22 } ;; .align 32 .L25: { .mmi (p12) LDFPD f32, f33 = [X1], 2 * SIZE mov YY = Y1 tbit.z p0, p13 = N, 2 } { .mmb (p12) LDFD f34 = [Y1], INCY (p12) LDFD f42 = [Y2], INCY (p7) br.cond.dptk .L999 } ;; { .mmi (p12) LDFPD f36, f37 = [X1], 2 * SIZE (p12) shladd YY = INCY, 3, YY tbit.z p0, p14 = N, 1 } { .mmi (p12) LDFD f35 = [Y1], INCY (p12) LDFD f43 = [Y2], INCY tbit.z p0, p15 = N, 0 } ;; { .mmi (p12) LDFPD f40, f41 = [X1], 2 * SIZE (p13) shladd YY = INCY, 2, YY } { .mmi (p12) LDFD f38 = [Y1], INCY (p12) LDFD f46 = [Y2], INCY } ;; (p12) LDFPD f44, f45 = [X1], 2 * SIZE (p12) LDFD f39 = [Y1], INCY5 (p12) LDFD f47 = [Y2], INCY5 ;; (p13) LDFPD f48, f49 = [X1], 2 * SIZE (p13) LDFD f50 = [Y1], INCY (p14) LDFD f58 = [YY], INCY ;; (p13) LDFPD f52, f53 = [X1], 2 * SIZE (p13) LDFD f51 = [Y1], INCY (p14) LDFD f59 = [YY], INCY ;; (p14) LDFPD f56, f57 = [X1], 2 * SIZE (p13) LDFD f54 = [Y1], INCY (p15) LDFD f61 = [YY] ;; (p13) LDFD f55 = [Y1], INCY (p15) LDFD f60 = [X1] ;; (p12) FMA f8 = f32, f34, f8 (p12) FMA f9 = f33, f35, f9 (p12) FMA f10 = f36, f38, f10 (p12) FMA f11 = f37, f39, f11 (p12) FMA f12 = f40, f42, f12 (p12) FMA f13 = f41, f43, f13 (p12) FMA f14 = f44, f46, f14 (p12) FMA f15 = f45, f47, f15 ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f49, f51, f9 (p13) FMA f10 = f52, f54, f10 (p13) FMA f11 = f53, f55, f11 (p14) FMA f12 = f56, f58, f12 (p14) FMA f13 = f57, f59, f13 (p15) FMA f14 = f60, f61, f14 br .L999 ;; .align 32 .L100: { .mmi shladd X2 = INCX, 2, X1 } { .mib cmp.ne p6, p0 = SIZE, INCY tbit.nz p15, p0 = Y1, BASE_SHIFT (p6) br.cond.dptk .L200 } ;; { .mfi (p15) LDFD f32 = [X1], INCX mov f12 = f0 mov pr.rot= 0 } { .mfi (p15) adds N = -1, N mov f13 = f0 shr YA = Y1, 3 } ;; { .mfi (p15) LDFD f80 = [Y1], INCY mov f14 = f0 shr I = N, 4 } { .mmi and J = 15, N and XA = 0x1f, XA and YA = 0x1f, YA } ;; { .mmi shladd INCX5 = INCX, 2, INCX shladd INCY5 = INCY, 2, INCY sub XA = YA, XA } { .mmi shladd INCX16 = INCX, 4, r0 shladd INCY16 = INCY, 4, r0 tbit.z p0, p12 = N, 3 } ;; { .mmi shladd X2 = INCX, 2, X1 cmp.eq p7, p0 = r0, J mov ar.ec= 3 } { .mmi adds I = -1, I cmp.ge p8, p0 = 
8, XA cmp.eq p16, p0 = r0, r0 } ;; { .mbb cmp.le p9, p0 = 28, XA (p8) br.cond.dpnt .L120 (p9) br.cond.dpnt .L120 } ;; { .mmi adds PREX = (PREFETCH_SIZE + 5) * SIZE, X1 adds PREY = (PREFETCH_SIZE + 3) * SIZE, Y1 mov ar.lc = I } { .mfb cmp.eq p6 ,p0 = -1, I FMA f15 = f32, f80, f0 (p6) br.cond.dpnt .L115 } ;; .align 32 /* INCY == 1 */ .L112: { .mmf (p16) LDFPD f32, f35 = [Y1], 2 * SIZE (p16) lfetch.nt1 [PREX], INCX16 (p18) FMA f8 = f34, f82, f8 } { .mmf (p16) LDFD f80 = [X1], INCX (p16) LDFD f92 = [X2], INCX (p18) FMA f9 = f37, f85, f9 } ;; { .mmf (p16) LDFPD f38, f41 = [Y1], 2 * SIZE (p16) lfetch.nt1 [PREY], INCY16 (p18) FMA f10 = f40, f88, f10 } { .mmf (p16) LDFD f83 = [X1], INCX (p16) LDFD f95 = [X2], INCX (p18) FMA f11 = f43, f91, f11 } ;; { .mmf (p16) LDFPD f44, f47 = [Y1], 2 * SIZE (p18) FMA f12 = f46, f94, f12 } { .mmf (p16) LDFD f86 = [X1], INCX (p16) LDFD f98 = [X2], INCX (p18) FMA f13 = f49, f97, f13 } ;; { .mmf (p16) LDFPD f50, f53 = [Y1], 2 * SIZE (p18) FMA f14 = f52, f100, f14 } { .mmf (p16) LDFD f89 = [X1], INCX5 (p16) LDFD f101 = [X2], INCX5 (p18) FMA f15 = f55, f103, f15 } ;; { .mmf (p16) LDFPD f56, f59 = [Y1], 2 * SIZE (p18) FMA f8 = f58, f106, f8 } { .mmf (p16) LDFD f104 = [X1], INCX (p16) LDFD f116 = [X2], INCX (p18) FMA f9 = f61, f109, f9 } ;; { .mmf (p16) LDFPD f62, f65 = [Y1], 2 * SIZE (p18) FMA f10 = f64, f112, f10 } { .mmf (p16) LDFD f107 = [X1], INCX (p16) LDFD f119 = [X2], INCX (p18) FMA f11 = f67, f115, f11 } ;; { .mmf (p16) LDFPD f68, f71 = [Y1], 2 * SIZE (p18) FMA f12 = f70, f118, f12 } { .mmf (p16) LDFD f110 = [X1], INCX (p16) LDFD f122 = [X2], INCX (p18) FMA f13 = f73, f121, f13 } ;; { .mmf (p16) LDFPD f74, f77 = [Y1], 2 * SIZE (p16) LDFD f113 = [X1], INCX5 (p18) FMA f14 = f76, f124, f14 } { .mfb (p16) LDFD f125 = [X2], INCX5 (p18) FMA f15 = f79, f127, f15 br.ctop.sptk.few .L112 } ;; .align 32 .L115: { .mmi (p12) LDFPD f32, f33 = [Y1], 2 * SIZE mov XX = X1 tbit.z p0, p13 = N, 2 } { .mmb (p12) LDFD f34 = [X1], INCX (p12) LDFD f42 = [X2], INCX (p7) br.cond.dptk .L999 } ;; { .mmi (p12) LDFPD f36, f37 = [Y1], 2 * SIZE (p12) shladd XX = INCX, 3, XX tbit.z p0, p14 = N, 1 } { .mmi (p12) LDFD f35 = [X1], INCX (p12) LDFD f43 = [X2], INCX tbit.z p0, p15 = N, 0 } ;; { .mmi (p12) LDFPD f40, f41 = [Y1], 2 * SIZE (p13) shladd XX = INCX, 2, XX } { .mmi (p12) LDFD f38 = [X1], INCX (p12) LDFD f46 = [X2], INCX } ;; (p12) LDFPD f44, f45 = [Y1], 2 * SIZE (p12) LDFD f39 = [X1], INCX5 (p12) LDFD f47 = [X2], INCX5 ;; (p13) LDFPD f48, f49 = [Y1], 2 * SIZE (p13) LDFD f50 = [X1], INCX (p14) LDFD f58 = [XX], INCX ;; (p13) LDFPD f52, f53 = [Y1], 2 * SIZE (p13) LDFD f51 = [X1], INCX (p14) LDFD f59 = [XX], INCX ;; (p14) LDFPD f56, f57 = [Y1], 2 * SIZE (p13) LDFD f54 = [X1], INCX (p15) LDFD f61 = [XX] ;; (p13) LDFD f55 = [X1], INCX (p15) LDFD f60 = [Y1] ;; (p12) FMA f8 = f32, f34, f8 (p12) FMA f9 = f33, f35, f9 (p12) FMA f10 = f36, f38, f10 (p12) FMA f11 = f37, f39, f11 (p12) FMA f12 = f40, f42, f12 (p12) FMA f13 = f41, f43, f13 (p12) FMA f14 = f44, f46, f14 (p12) FMA f15 = f45, f47, f15 ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f49, f51, f9 (p13) FMA f10 = f52, f54, f10 (p13) FMA f11 = f53, f55, f11 (p14) FMA f12 = f56, f58, f12 (p14) FMA f13 = f57, f59, f13 (p15) FMA f14 = f60, f61, f14 br .L999 ;; .align 32 .L120: { .mmi adds PREX = (PREFETCH_SIZE + 17) * SIZE, X1 adds PREY = (PREFETCH_SIZE + 19) * SIZE, X1 mov ar.lc = I } { .mfb cmp.eq p6 ,p0 = -1, I FMA f15 = f32, f80, f0 (p6) br.cond.dpnt .L125 } ;; .align 32 .L122: { .mmf (p16) LDFPD f32, f35 = [Y1], 2 * SIZE (p16) lfetch.nt1 
[PREX], INCX16 (p18) FMA f8 = f34, f82, f8 } { .mmf (p17) LDFD f81 = [X1], INCX (p17) LDFD f93 = [X2], INCX (p18) FMA f9 = f37, f85, f9 } ;; { .mmf (p16) LDFPD f38, f41 = [Y1], 2 * SIZE (p16) lfetch.nt1 [PREY], INCX16 (p18) FMA f10 = f40, f88, f10 } { .mmf (p17) LDFD f84 = [X1], INCX (p17) LDFD f96 = [X2], INCX (p18) FMA f11 = f43, f91, f11 } ;; { .mmf (p16) LDFPD f44, f47 = [Y1], 2 * SIZE (p18) FMA f12 = f46, f94, f12 } { .mmf (p17) LDFD f87 = [X1], INCX (p17) LDFD f99 = [X2], INCX (p18) FMA f13 = f49, f97, f13 } ;; { .mmf (p16) LDFPD f50, f53 = [Y1], 2 * SIZE (p18) FMA f14 = f52, f100, f14 } { .mmf (p17) LDFD f90 = [X1], INCX5 (p17) LDFD f102 = [X2], INCX5 (p18) FMA f15 = f55, f103, f15 } ;; { .mmf (p16) LDFPD f56, f59 = [Y1], 2 * SIZE (p18) FMA f8 = f58, f106, f8 } { .mmf (p17) LDFD f105 = [X1], INCX (p17) LDFD f117 = [X2], INCX (p18) FMA f9 = f61, f109, f9 } ;; { .mmf (p16) LDFPD f62, f65 = [Y1], 2 * SIZE (p18) FMA f10 = f64, f112, f10 } { .mmf (p17) LDFD f108 = [X1], INCX (p17) LDFD f120 = [X2], INCX (p18) FMA f11 = f67, f115, f11 } ;; { .mmf (p16) LDFPD f68, f71 = [Y1], 2 * SIZE (p18) FMA f12 = f70, f118, f12 } { .mmf (p17) LDFD f111 = [X1], INCX (p17) LDFD f123 = [X2], INCX (p18) FMA f13 = f73, f121, f13 } ;; { .mmf (p16) LDFPD f74, f77 = [Y1], 2 * SIZE (p17) LDFD f114 = [X1], INCX5 (p18) FMA f14 = f76, f124, f14 } { .mfb (p17) LDFD f126 = [X2], INCX5 (p18) FMA f15 = f79, f127, f15 br.ctop.sptk.few .L122 } ;; .align 32 .L125: { .mmi (p12) LDFPD f32, f33 = [Y1], 2 * SIZE mov XX = X1 tbit.z p0, p13 = N, 2 } { .mmb (p12) LDFD f34 = [X1], INCX (p12) LDFD f42 = [X2], INCX (p7) br.cond.dptk .L999 } ;; { .mmi (p12) LDFPD f36, f37 = [Y1], 2 * SIZE (p12) shladd XX = INCX, 3, XX tbit.z p0, p14 = N, 1 } { .mmi (p12) LDFD f35 = [X1], INCX (p12) LDFD f43 = [X2], INCX tbit.z p0, p15 = N, 0 } ;; { .mmi (p12) LDFPD f40, f41 = [Y1], 2 * SIZE (p13) shladd XX = INCX, 2, XX } { .mmi (p12) LDFD f38 = [X1], INCX (p12) LDFD f46 = [X2], INCX } ;; (p12) LDFPD f44, f45 = [Y1], 2 * SIZE (p12) LDFD f39 = [X1], INCX5 (p12) LDFD f47 = [X2], INCX5 ;; (p13) LDFPD f48, f49 = [Y1], 2 * SIZE (p13) LDFD f50 = [X1], INCX (p14) LDFD f58 = [XX], INCX ;; (p13) LDFPD f52, f53 = [Y1], 2 * SIZE (p13) LDFD f51 = [X1], INCX (p14) LDFD f59 = [XX], INCX ;; (p14) LDFPD f56, f57 = [Y1], 2 * SIZE (p13) LDFD f54 = [X1], INCX (p15) LDFD f61 = [XX] ;; (p13) LDFD f55 = [X1], INCX (p15) LDFD f60 = [Y1] ;; (p12) FMA f8 = f32, f34, f8 (p12) FMA f9 = f33, f35, f9 (p12) FMA f10 = f36, f38, f10 (p12) FMA f11 = f37, f39, f11 (p12) FMA f12 = f40, f42, f12 (p12) FMA f13 = f41, f43, f13 (p12) FMA f14 = f44, f46, f14 (p12) FMA f15 = f45, f47, f15 ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f49, f51, f9 (p13) FMA f10 = f52, f54, f10 (p13) FMA f11 = f53, f55, f11 (p14) FMA f12 = f56, f58, f12 (p14) FMA f13 = f57, f59, f13 (p15) FMA f14 = f60, f61, f14 br .L999 ;; .align 32 .L200: { .mfi shladd INCX5 = INCX, 2, INCX mov f12 = f0 mov pr.rot= 0 } { .mfi and J = 15, N mov f13 = f0 shr I = N, 4 } ;; { .mmf cmp.eq p16, p0 = r0, r0 shladd INCY5 = INCY, 2, INCY mov f14 = f0 } { .mmi shladd INCX16 = INCX, 4, r0 shladd INCY16 = INCY, 4, r0 tbit.z p0, p12 = N, 3 } ;; { .mmi cmp.eq p7, p0 = r0, J adds I = -1, I mov ar.ec= 3 } { .mmi shladd Y2 = INCY, 2, Y1 mov XX = X1 mov YY = Y1 } ;; { .mmi adds PREX = (PREFETCH_SIZE + 5) * SIZE, X1 adds PREY = (PREFETCH_SIZE + 3) * SIZE, Y1 mov ar.lc = I } { .mfb cmp.eq p6 ,p0 = -1, I mov f15 = f0 (p6) br.cond.dpnt .L215 } ;; .align 32 /* INCY == 1 */ .L212: { .mmf (p16) lfetch.nt1 [PREX], INCX16 (p16) lfetch.nt1 [PREY], 
INCY16 (p18) FMA f8 = f34, f82, f8 } { .mmf (p16) LDFD f32 = [Y1], INCY (p16) LDFD f44 = [Y2], INCY (p18) FMA f9 = f37, f85, f9 } ;; { .mmf (p16) LDFD f80 = [X1], INCX (p16) LDFD f92 = [X2], INCX (p18) FMA f10 = f40, f88, f10 } { .mmf (p16) LDFD f35 = [Y1], INCY (p16) LDFD f47 = [Y2], INCY (p18) FMA f11 = f43, f91, f11 } ;; { .mmf (p16) LDFD f83 = [X1], INCX (p16) LDFD f95 = [X2], INCX (p18) FMA f12 = f46, f94, f12 } { .mmf (p16) LDFD f38 = [Y1], INCY (p16) LDFD f50 = [Y2], INCY (p18) FMA f13 = f49, f97, f13 } ;; { .mmf (p16) LDFD f86 = [X1], INCX (p16) LDFD f98 = [X2], INCX (p18) FMA f14 = f52, f100, f14 } { .mmf (p16) LDFD f41 = [Y1], INCY5 (p16) LDFD f53 = [Y2], INCY5 (p18) FMA f15 = f55, f103, f15 } ;; { .mmf (p16) LDFD f89 = [X1], INCX5 (p16) LDFD f101 = [X2], INCX5 (p18) FMA f8 = f58, f106, f8 } { .mmf (p16) LDFD f56 = [Y1], INCY (p16) LDFD f68 = [Y2], INCY (p18) FMA f9 = f61, f109, f9 } ;; { .mmf (p16) LDFD f104 = [X1], INCX (p16) LDFD f116 = [X2], INCX (p18) FMA f10 = f64, f112, f10 } { .mmf (p16) LDFD f59 = [Y1], INCY (p16) LDFD f71 = [Y2], INCY (p18) FMA f11 = f67, f115, f11 } ;; { .mmf (p16) LDFD f107 = [X1], INCX (p16) LDFD f119 = [X2], INCX (p18) FMA f12 = f70, f118, f12 } { .mmf (p16) LDFD f62 = [Y1], INCY (p16) LDFD f74 = [Y2], INCY (p18) FMA f13 = f73, f121, f13 } ;; { .mmf (p16) LDFD f110 = [X1], INCX (p16) LDFD f122 = [X2], INCX (p18) FMA f14 = f76, f124, f14 } { .mmf (p16) LDFD f65 = [Y1], INCY5 (p16) LDFD f77 = [Y2], INCY5 (p18) FMA f15 = f79, f127, f15 } ;; { .mmi (p16) LDFD f113 = [X1], INCX5 (p16) LDFD f125 = [X2], INCX5 } { .mmb (p16) add XX = INCX16, XX (p16) add YY = INCY16, YY br.ctop.sptk.few .L212 } ;; .align 32 .L215: { .mmi (p12) LDFD f34 = [X1], INCX (p12) LDFD f42 = [X2], INCX tbit.z p0, p13 = N, 2 } { .mmb (p12) LDFD f32 = [Y1], INCY (p12) LDFD f40 = [Y2], INCY (p7) br.cond.dptk .L999 } ;; { .mmi (p12) LDFD f35 = [X1], INCX (p12) LDFD f43 = [X2], INCX tbit.z p0, p14 = N, 1 } { .mmi (p12) LDFD f33 = [Y1], INCY (p12) LDFD f41 = [Y2], INCY tbit.z p0, p15 = N, 0 } ;; { .mmi (p12) LDFD f38 = [X1], INCX (p12) LDFD f46 = [X2], INCX (p12) shladd XX = INCX, 3, XX } { .mmi (p12) LDFD f36 = [Y1], INCY (p12) LDFD f44 = [Y2], INCY (p12) shladd YY = INCY, 3, YY } ;; { .mmi (p12) LDFD f39 = [X1], INCX5 (p12) LDFD f47 = [X2], INCX5 (p13) shladd XX = INCX, 2, XX } { .mmi (p12) LDFD f37 = [Y1], INCY5 (p12) LDFD f45 = [Y2], INCY5 (p13) shladd YY = INCY, 2, YY } ;; (p13) LDFD f50 = [X1], INCX (p13) LDFD f48 = [Y1], INCY (p14) LDFD f58 = [XX], INCX (p14) LDFD f56 = [YY], INCY ;; (p13) LDFD f51 = [X1], INCX (p13) LDFD f49 = [Y1], INCY (p14) LDFD f59 = [XX], INCX (p14) LDFD f57 = [YY], INCY ;; (p13) LDFD f54 = [X1], INCX (p13) LDFD f52 = [Y1], INCY (p15) LDFD f61 = [XX] (p15) LDFD f60 = [YY] ;; (p13) LDFD f55 = [X1] (p13) LDFD f53 = [Y1] ;; (p12) FMA f8 = f32, f34, f8 (p12) FMA f9 = f33, f35, f9 (p12) FMA f10 = f36, f38, f10 (p12) FMA f11 = f37, f39, f11 (p12) FMA f12 = f40, f42, f12 (p12) FMA f13 = f41, f43, f13 (p12) FMA f14 = f44, f46, f14 (p12) FMA f15 = f45, f47, f15 ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f49, f51, f9 (p13) FMA f10 = f52, f54, f10 (p13) FMA f11 = f53, f55, f11 (p14) FMA f12 = f56, f58, f12 (p14) FMA f13 = f57, f59, f13 (p15) FMA f14 = f60, f61, f14 br .L999 ;; .align 32 .L999: FADD f8 = f8, f9 FADD f10 = f10, f11 FADD f12 = f12, f13 FADD f14 = f14, f15 ;; FADD f8 = f8, f10 FADD f12 = f12, f14 mov ar.lc = ARLC ;; FADD f8 = f8, f12 mov pr = PR, -65474 br.ret.sptk.many b0 EPILOGUE 
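For reference, the sdot.S kernel that ends above computes the single-precision strided dot product. A minimal portable C sketch of the same computation follows; the name sdot_ref, the ptrdiff_t parameter types, and the assumption of non-negative increments are illustrative only and not part of the OpenBLAS sources. Note that the assembly accumulates into several partial sums (f8..f15) and reduces them at .L999, so its floating-point rounding can differ slightly from this strictly sequential loop.

#include <stddef.h>

/* Scalar model of what the IA64 sdot kernel computes:
   sum over i of x[i*incx] * y[i*incy] for i = 0 .. n-1.
   The unrolling, software pipelining, and lfetch prefetching in
   sdot.S are performance details; the result is this accumulation. */
static float sdot_ref(ptrdiff_t n, const float *x, ptrdiff_t incx,
                      const float *y, ptrdiff_t incy) {
    float sum = 0.0f;
    for (ptrdiff_t i = 0; i < n; i++)
        sum += x[i * incx] * y[i * incy];
    return sum;
}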
OpenBLAS-0.2.20/kernel/ia64/sgemv_n.S000066400000000000000000001704161313527062700167620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define SP r12 #define M r32 #define N r33 #define A r36 #define LDA r37 #define X r38 #define INCX r39 #define Y r34 #define INCY r35 #define BUFFER r11 #define I r14 #define J r15 #define AO1 r16 #define AO2 r17 #define AO3 r18 #define AO4 r19 #define AO5 r20 #define AO6 r21 #define AO7 r22 #define AO8 r23 #define YLD1 r24 #define YST1 r25 #define YST2 r27 #define MM r28 #define YY r9 #define RPRE1 loc0 #define RPRE2 loc1 #define RPRE3 loc2 #define RPRE4 loc3 #define RPRE5 loc4 #define RPRE6 loc5 #define RPRE7 loc6 #define RPRE8 loc7 #define AO11 loc8 #define AO21 loc9 #define AO31 loc10 #define AO41 loc11 #define AO51 loc12 #define AO61 loc13 #define AO71 loc14 #define AO81 loc15 #define PREB r8 #define ARLC r29 #define PR r30 #define ARPFS r31 #ifdef DOUBLE #define RPREFETCH (16 * 3 + 8) #else #define RPREFETCH (16 * 3 + 16) #endif #define PREFETCH lfetch.nt1 #define ALPHA f6 PROLOGUE .prologue PROFCODE { .mmi .save ar.pfs, ARPFS alloc ARPFS = ar.pfs, 8, 16, 8, 0 mov ARLC = ar.lc } ;; mov PR = pr adds r14 = 16, SP adds r15 = 24, SP adds r16 = 32, SP ;; adds r8 = -8 * 16, SP adds r9 = -7 * 16, SP adds SP = -8 * 16, SP ;; stf.spill [r8] = f16, 32 stf.spill [r9] = f17, 32 ;; stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 ;; stf.spill [r8] = f20, 32 stf.spill [r9] = f21, 32 ;; stf.spill [r8] = f22 stf.spill [r9] = f23 .body ;; ld8 Y = [r14] ld8 INCY = [r15] ld8 BUFFER = [r16] mov ALPHA = f8 cmp.ge p7, p0 = 0, M cmp.ge p6, p0 = 0, N ;; shladd INCX = INCX, BASE_SHIFT, r0 shladd LDA = LDA, BASE_SHIFT, r0 shladd INCY = INCY, BASE_SHIFT, r0 ;; tbit.nz p8, p0 = A, BASE_SHIFT tbit.nz p9, p0 = LDA, BASE_SHIFT mov MM = M ;; (p8) adds MM = -1, M ;; (p7) br.cond.dpnt .L999 (p6) br.cond.dpnt .L999 ;; sub I = A, Y cmp.eq p10, p0 = SIZE, INCY mov YY = Y ;; (p10) tbit.z.unc p10, p0 = I, BASE_SHIFT ;; (p10) br.cond.dptk .L10 ;; shr J = M, 3 mov YY = BUFFER ;; (p8) adds YY = SIZE, BUFFER ;; mov ar.lc = J mov YST1 = YY adds YST2 = 4 * SIZE, YY ;; .L02: STFD [YST1] = f0, 1 * SIZE STFD [YST2] = f0, 1 * SIZE ;; STFD [YST1] = f0, 1 * SIZE STFD [YST2] = f0, 1 * SIZE ;; STFD [YST1] = f0, 1 * SIZE STFD [YST2] = f0, 1 * SIZE ;; STFD [YST1] = f0, 5 * SIZE STFD [YST2] = f0, 5 * SIZE br.cloop.sptk.few .L02 ;; .L10: (p9) br.cond.dptk .L100 shr J = N, 3 ;; cmp.eq p6, p0 = r0, J (p6) br.cond.dpnt .L20 ;; .align 16 .L11: mov YLD1 = YY mov YST1 = YY ;; LDFD f8 = [X], INCX ;; LDFD f9 = [X], INCX ;; LDFD f10 = [X], INCX ;; LDFD f11 = [X], INCX ;; LDFD f12 = [X], INCX ;; LDFD f13 = [X], INCX ;; LDFD f14 = [X], INCX ;; LDFD f15 = [X], INCX ;; FMPY f8 = ALPHA, f8 FMPY f9 = ALPHA, f9 FMPY f10 = ALPHA, f10 FMPY f11 = ALPHA, f11 FMPY f12 = ALPHA, f12 FMPY f13 = ALPHA, f13 FMPY f14 = ALPHA, f14 FMPY f15 = ALPHA, f15 ;; mov AO1 = A add AO2 = LDA, A ;; shladd AO3 = LDA, 1, A shladd AO4 = LDA, 1, AO2 ;; shladd AO5 = LDA, 1, AO3 shladd AO6 = LDA, 1, AO4 ;; shladd AO7 = LDA, 1, AO5 shladd AO8 = LDA, 1, AO6 shladd A = LDA, 3, A ;; ;; adds PREB = RPREFETCH * SIZE, YLD1 adds RPRE1 = RPREFETCH * SIZE, AO1 adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 adds RPRE3 = RPREFETCH * SIZE, AO3 adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 adds RPRE5 = RPREFETCH * SIZE, AO5 adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6 adds RPRE7 = RPREFETCH * SIZE, AO7 adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8 (p8) LDFD f80 = [AO1], 1 * SIZE (p8) LDFD f81 = [AO2], 1 * SIZE (p8) LDFD f82 = [AO3], 1 * SIZE (p8) LDFD f83 = [AO4], 1 * SIZE (p8) LDFD f84 = [AO5], 1 * SIZE 
(p8) LDFD f85 = [AO6], 1 * SIZE (p8) LDFD f86 = [AO7], 1 * SIZE (p8) LDFD f87 = [AO8], 1 * SIZE (p8) LDFD f106 = [YLD1], 1 * SIZE ;; (p8) FMPY f32 = f8, f80 (p8) FMPY f33 = f9, f81 (p8) FMPY f34 = f10, f82 (p8) FMA f35 = f11, f83, f106 ;; (p8) FMA f32 = f12, f84, f32 (p8) FMA f33 = f13, f85, f33 (p8) FMA f34 = f14, f86, f34 (p8) FMA f35 = f15, f87, f35 ;; (p8) FADD f32 = f32, f33 (p8) FADD f34 = f34, f35 ;; (p8) FADD f32 = f32, f34 ;; (p8) STFD [YST1] = f32, 1 * SIZE shr I = MM, 3 mov pr.rot= 0 ;; cmp.eq p6, p0 = 0, I cmp.eq p16, p0 = r0, r0 ;; adds I = -1, I tbit.nz p13, p0 = MM, 2 ;; mov ar.lc = I mov ar.ec= 2 (p6) br.cond.dpnt .L15 ;; .align 16 .L12: { .mfi (p17) LDFPD f95, f96 = [AO8], 2 * SIZE (p17) FMA f101 = f8, f33, f101 (p16) tbit.nz.unc p14, p15 = I, 0 } { .mfi (p17) FMA f104 = f8, f34, f104 } ;; { .mfi (p16) LDFPD f32, f33 = [AO1], 2 * SIZE (p17) FMA f107 = f8, f35, f107 } { .mfi (p14) PREFETCH [RPRE1], 16 * SIZE (p17) FMA f110 = f8, f36, f110 } ;; { .mfi (p16) LDFPD f34, f35 = [AO1], 2 * SIZE (p17) FMA f113 = f8, f37, f113 } { .mfi (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE (p17) FMA f116 = f8, f38, f116 } ;; { .mfi (p16) LDFPD f36, f37 = [AO1], 2 * SIZE (p17) FMA f119 = f8, f39, f119 } { .mfi (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE (p17) FMA f122 = f8, f40, f122 } ;; { .mfi (p16) LDFPD f38, f39 = [AO1], 2 * SIZE (p17) FMA f101 = f9, f41, f101 } { .mfi (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE (p17) FMA f104 = f9, f42, f104 } ;; { .mfi (p16) LDFPD f40, f41 = [AO2], 2 * SIZE (p17) FMA f107 = f9, f43, f107 } { .mfi (p15) PREFETCH [RPRE2], 16 * SIZE (p17) FMA f110 = f9, f44, f110 } ;; { .mfi (p16) LDFPD f42, f43 = [AO2], 2 * SIZE (p17) FMA f113 = f9, f45, f113 } { .mfi (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE (p17) FMA f116 = f9, f46, f116 } ;; { .mfi (p16) LDFPD f44, f45 = [AO2], 2 * SIZE (p17) FMA f119 = f9, f47, f119 } { .mfi (p18) STFD [YST1] = f16, 1 * SIZE (p17) FMA f122 = f9, f48, f122 } ;; { .mfi (p16) LDFPD f46, f47 = [AO2], 2 * SIZE (p17) FMA f101 = f10, f49, f101 } { .mfi (p18) STFD [YST1] = f17, 1 * SIZE (p17) FMA f104 = f10, f50, f104 } ;; { .mfi (p16) LDFPD f48, f49 = [AO3], 2 * SIZE (p17) FMA f107 = f10, f51, f107 } { .mfi (p14) PREFETCH [RPRE3], 16 * SIZE (p17) FMA f110 = f10, f52, f110 } ;; { .mfi (p16) LDFPD f50, f51 = [AO3], 2 * SIZE (p17) FMA f113 = f10, f53, f113 } { .mfi (p17) FMA f116 = f10, f54, f116 } ;; { .mfi (p16) LDFPD f52, f53 = [AO3], 2 * SIZE (p17) FMA f119 = f10, f55, f119 } { .mfi (p18) STFD [YST1] = f18, 1 * SIZE (p17) FMA f122 = f10, f56, f122 } ;; { .mfi (p16) LDFPD f54, f55 = [AO3], 2 * SIZE (p17) FMA f101 = f11, f57, f101 } { .mfi (p18) STFD [YST1] = f19, 1 * SIZE (p17) FMA f104 = f11, f58, f104 } ;; { .mfi (p16) LDFPD f56, f57 = [AO4], 2 * SIZE (p17) FMA f107 = f11, f59, f107 } { .mfi (p15) PREFETCH [RPRE4], 16 * SIZE (p17) FMA f110 = f11, f60, f110 } ;; { .mfi (p16) LDFPD f58, f59 = [AO4], 2 * SIZE (p17) FMA f113 = f11, f61, f113 } { .mfi (p17) FMA f116 = f11, f62, f116 } ;; { .mfi (p16) LDFPD f60, f61 = [AO4], 2 * SIZE (p17) FMA f119 = f11, f63, f119 } { .mfi (p17) FMA f122 = f11, f64, f122 } ;; { .mfi (p16) LDFPD f62, f63 = [AO4], 2 * SIZE (p17) FMA f101 = f12, f65, f101 } { .mfi (p18) STFD [YST1] = f20, 1 * SIZE (p17) FMA f104 = f12, f66, f104 } ;; { .mfi (p16) LDFPD f64, f65 = [AO5], 2 * SIZE (p17) FMA f107 = f12, f67, f107 } { .mfi (p18) STFD [YST1] = f21, 1 * SIZE (p17) FMA f110 = f12, f68, f110 } ;; { .mfi (p16) LDFPD f66, f67 = [AO5], 2 * SIZE (p17) FMA f113 = f12, f69, f113 } { .mfi (p14) PREFETCH [RPRE5], 16 * SIZE (p17) FMA 
f116 = f12, f70, f116 } ;; { .mfi (p16) LDFPD f68, f69 = [AO5], 2 * SIZE (p17) FMA f119 = f12, f71, f119 } { .mfi (p18) STFD [YST1] = f22, 1 * SIZE (p17) FMA f122 = f12, f72, f122 } ;; { .mfi (p16) LDFPD f70, f71 = [AO5], 2 * SIZE (p17) FMA f101 = f13, f73, f101 } { .mfi (p18) STFD [YST1] = f23, 1 * SIZE (p17) FMA f104 = f13, f74, f104 } ;; { .mfi (p16) LDFPD f72, f73 = [AO6], 2 * SIZE (p17) FMA f107 = f13, f75, f107 } { .mfi (p15) PREFETCH [RPRE6], 16 * SIZE (p17) FMA f110 = f13, f76, f110 } ;; { .mfi (p16) LDFPD f74, f75 = [AO6], 2 * SIZE (p17) FMA f113 = f13, f77, f113 } { .mfi (p17) FMA f116 = f13, f78, f116 } ;; { .mfi (p16) LDFPD f76, f77 = [AO6], 2 * SIZE (p17) FMA f119 = f13, f79, f119 } { .mfi (p17) FMA f122 = f13, f80, f122 } ;; { .mfi (p16) LDFPD f78, f79 = [AO6], 2 * SIZE (p17) FMA f101 = f14, f81, f101 } { .mfi (p17) FMA f104 = f14, f82, f104 } ;; { .mfi (p16) LDFPD f80, f81 = [AO7], 2 * SIZE (p17) FMA f107 = f14, f83, f107 } { .mfi (p14) PREFETCH [RPRE7], 16 * SIZE (p17) FMA f110 = f14, f84, f110 } ;; { .mfi (p16) LDFPD f82, f83 = [AO7], 2 * SIZE (p17) FMA f113 = f14, f85, f113 } { .mfi (p17) FMA f116 = f14, f86, f116 } ;; { .mfi (p16) LDFPD f84, f85 = [AO7], 2 * SIZE (p17) FMA f119 = f14, f87, f119 } { .mfi (p17) FMA f122 = f14, f88, f122 } ;; { .mfi (p16) LDFPD f86, f87 = [AO7], 2 * SIZE (p17) FMA f16 = f15, f89, f101 } { .mfi (p17) FMA f17 = f15, f90, f104 } ;; { .mfi (p16) LDFPD f88, f89 = [AO8], 2 * SIZE (p17) FMA f18 = f15, f91, f107 } { .mfi (p15) PREFETCH [RPRE8], 16 * SIZE (p17) FMA f19 = f15, f92, f110 } ;; { .mfi (p16) LDFPD f90, f91 = [AO8], 2 * SIZE (p17) FMA f20 = f15, f93, f113 } { .mfi (p14) lfetch.excl.nt2 [PREB], 16 * SIZE (p17) FMA f21 = f15, f94, f116 } ;; { .mfi (p16) LDFPD f92, f93 = [AO8], 2 * SIZE (p17) FMA f22 = f15, f95, f119 } { .mfb (p16) adds I = -1, I (p17) FMA f23 = f15, f96, f122 br.ctop.sptk.few .L12 } ;; .align 16 .L15: { .mmi (p13) LDFPD f32, f33 = [AO1], 2 * SIZE (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE tbit.nz p14, p0 = MM, 1 } { .mmi (p18) STFD [YST1] = f16, 1 * SIZE cmp.lt p6, p0 = 1, J adds J = -1, J } ;; { .mmi (p13) LDFPD f48, f49 = [AO1], 2 * SIZE (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE tbit.nz p15, p0 = MM, 0 } { .mmi (p18) STFD [YST1] = f17, 1 * SIZE nop __LINE__ nop __LINE__ } ;; { .mmi (p14) LDFPD f64, f65 = [AO1], 2 * SIZE (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE nop __LINE__ } { .mmi (p18) STFD [YST1] = f18, 1 * SIZE nop __LINE__ nop __LINE__ } ;; { .mmi (p13) LDFPD f34, f35 = [AO2], 2 * SIZE (p13) LDFPD f36, f37 = [AO3], 2 * SIZE nop __LINE__ } { .mmi (p18) STFD [YST1] = f19, 1 * SIZE nop __LINE__ nop __LINE__ } ;; { .mmi (p15) LDFD f80 = [AO1] (p15) LDFD f106 = [YLD1], 1 * SIZE nop __LINE__ } { .mmi (p18) STFD [YST1] = f20, 1 * SIZE nop __LINE__ nop __LINE__ } ;; { .mmi (p13) LDFPD f50, f51 = [AO2], 2 * SIZE (p13) LDFPD f52, f53 = [AO3], 2 * SIZE nop __LINE__ } { .mmi (p18) STFD [YST1] = f21, 1 * SIZE nop __LINE__ nop __LINE__ } ;; { .mmi (p14) LDFPD f66, f67 = [AO2], 2 * SIZE (p14) LDFPD f68, f69 = [AO3], 2 * SIZE nop __LINE__ } { .mmi (p18) STFD [YST1] = f22, 1 * SIZE nop __LINE__ nop __LINE__ } ;; { .mmi (p15) LDFD f81 = [AO2] (p15) LDFD f82 = [AO3] nop __LINE__ } { .mmi (p18) STFD [YST1] = f23, 1 * SIZE nop __LINE__ nop __LINE__ } ;; { .mfi (p13) LDFPD f38, f39 = [AO4], 2 * SIZE (p13) FMA f100 = f8, f32, f100 nop __LINE__ } { .mfi (p13) LDFPD f40, f41 = [AO5], 2 * SIZE (p13) FMA f101 = f8, f33, f101 nop __LINE__ } ;; { .mfi (p13) LDFPD f54, f55 = [AO4], 2 * SIZE (p13) FMA f102 = f8, f48, f102 nop __LINE__ } { .mfi 
(p13) LDFPD f56, f57 = [AO5], 2 * SIZE (p13) FMA f103 = f8, f49, f103 nop __LINE__ } ;; { .mfi (p14) LDFPD f70, f71 = [AO4], 2 * SIZE (p14) FMA f104 = f8, f64, f104 nop __LINE__ } { .mfi (p14) LDFPD f72, f73 = [AO5], 2 * SIZE (p14) FMA f105 = f8, f65, f105 nop __LINE__ } ;; { .mfi (p15) LDFD f83 = [AO4] (p15) FMA f106 = f8, f80, f106 nop __LINE__ } { .mfi (p15) LDFD f84 = [AO5] nop __LINE__ nop __LINE__ } ;; { .mfi (p13) LDFPD f42, f43 = [AO6], 2 * SIZE (p13) FMA f100 = f9, f34, f100 nop __LINE__ } { .mfi (p13) LDFPD f44, f45 = [AO7], 2 * SIZE (p13) FMA f101 = f9, f35, f101 nop __LINE__ } ;; { .mfi (p13) LDFPD f58, f59 = [AO6], 2 * SIZE (p13) FMA f102 = f9, f50, f102 nop __LINE__ } { .mfi (p13) LDFPD f60, f61 = [AO7], 2 * SIZE (p13) FMA f103 = f9, f51, f103 nop __LINE__ } ;; { .mfi (p14) LDFPD f74, f75 = [AO6], 2 * SIZE (p14) FMA f104 = f9, f66, f104 nop __LINE__ } { .mfi (p14) LDFPD f76, f77 = [AO7], 2 * SIZE (p14) FMA f105 = f9, f67, f105 nop __LINE__ } ;; { .mfi (p15) LDFD f85 = [AO6] (p15) FMA f106 = f9, f81, f106 nop __LINE__ } { .mfi (p15) LDFD f86 = [AO7] nop __LINE__ nop __LINE__ } ;; { .mfi (p13) LDFPD f46, f47 = [AO8], 2 * SIZE (p13) FMA f100 = f10, f36, f100 nop __LINE__ } { .mfi (p13) FMA f101 = f10, f37, f101 nop __LINE__ } ;; { .mfi (p13) LDFPD f62, f63 = [AO8], 2 * SIZE (p13) FMA f102 = f10, f52, f102 nop __LINE__ } { .mfi (p13) FMA f103 = f10, f53, f103 nop __LINE__ } ;; { .mfi (p14) LDFPD f78, f79 = [AO8], 2 * SIZE (p14) FMA f104 = f10, f68, f104 nop __LINE__ } { .mfi (p14) FMA f105 = f10, f69, f105 nop __LINE__ } ;; { .mfi (p15) LDFD f87 = [AO8] (p15) FMA f106 = f10, f82, f106 nop __LINE__ } ;; (p13) FMA f100 = f11, f38, f100 (p13) FMA f101 = f11, f39, f101 (p13) FMA f102 = f11, f54, f102 (p13) FMA f103 = f11, f55, f103 (p14) FMA f104 = f11, f70, f104 (p14) FMA f105 = f11, f71, f105 (p15) FMA f106 = f11, f83, f106 ;; (p13) FMA f100 = f12, f40, f100 (p13) FMA f101 = f12, f41, f101 (p13) FMA f102 = f12, f56, f102 (p13) FMA f103 = f12, f57, f103 (p14) FMA f104 = f12, f72, f104 (p14) FMA f105 = f12, f73, f105 (p15) FMA f106 = f12, f84, f106 ;; (p13) FMA f100 = f13, f42, f100 (p13) FMA f101 = f13, f43, f101 (p13) FMA f102 = f13, f58, f102 (p13) FMA f103 = f13, f59, f103 (p14) FMA f104 = f13, f74, f104 (p14) FMA f105 = f13, f75, f105 (p15) FMA f106 = f13, f85, f106 ;; (p13) FMA f100 = f14, f44, f100 (p13) FMA f101 = f14, f45, f101 (p13) FMA f102 = f14, f60, f102 (p13) FMA f103 = f14, f61, f103 (p14) FMA f104 = f14, f76, f104 (p14) FMA f105 = f14, f77, f105 (p15) FMA f106 = f14, f86, f106 ;; (p13) FMA f100 = f15, f46, f100 (p13) FMA f101 = f15, f47, f101 (p13) FMA f102 = f15, f62, f102 (p13) FMA f103 = f15, f63, f103 (p14) FMA f104 = f15, f78, f104 (p14) FMA f105 = f15, f79, f105 (p15) FMA f106 = f15, f87, f106 ;; (p13) STFD [YST1] = f100, 1 * SIZE ;; (p13) STFD [YST1] = f101, 1 * SIZE ;; (p13) STFD [YST1] = f102, 1 * SIZE ;; (p13) STFD [YST1] = f103, 1 * SIZE ;; (p14) STFD [YST1] = f104, 1 * SIZE ;; (p14) STFD [YST1] = f105, 1 * SIZE ;; (p15) STFD [YST1] = f106, 1 * SIZE (p6) br.cond.dptk .L11 ;; .align 16 .L20: { .mmi mov YLD1 = YY mov YST1 = YY tbit.z p6, p0 = N, 2 } ;; { .mib mov AO1 = A mov pr.rot= 0 (p6) br.cond.dpnt .L30 } ;; { .mmi LDFD f8 = [X], INCX (p8) LDFD f106 = [YLD1], 1 * SIZE add AO2 = LDA, A } ;; { .mmi LDFD f9 = [X], INCX (p8) LDFD f80 = [AO1], 1 * SIZE shladd AO4 = LDA, 1, AO2 } ;; { .mmi LDFD f10 = [X], INCX (p8) LDFD f81 = [AO2], 1 * SIZE shladd AO3 = LDA, 1, A } ;; { .mmi LDFD f11 = [X], INCX (p8) LDFD f82 = [AO3], 1 * SIZE } ;; { .mfi (p8) LDFD f83 = 
[AO4], 1 * SIZE FMPY f8 = ALPHA, f8 adds PREB = RPREFETCH * SIZE, YLD1 } { .mfi adds RPRE1 = RPREFETCH * SIZE, AO1 FMPY f9 = ALPHA, f9 adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 } ;; FMPY f10 = ALPHA, f10 shladd A = LDA, 2, A FMPY f11 = ALPHA, f11 ;; { .mfi adds RPRE3 = RPREFETCH * SIZE, AO3 (p8) FMA f106 = f8, f80, f106 mov ar.ec= 2 } ;; adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 (p8) FMA f106 = f9, f81, f106 shr I = MM, 3 ;; { .mmf cmp.eq p6, p0 = 0, I cmp.eq p16, p0 = r0, r0 (p8) FMA f106 = f10, f82, f106 } ;; { .mfi adds I = -1, I (p8) FMA f106 = f11, f83, f106 tbit.nz p13, p0 = MM, 2 } ;; { .mib (p8) STFD [YST1] = f106, 1 * SIZE mov ar.lc = I (p6) br.cond.dpnt .L25 } ;; .align 16 .L22: { .mfi (p17) LDFPD f63, f64 = [AO4], 2 * SIZE (p17) FMA f101 = f8, f33, f101 (p16) tbit.nz.unc p14, p15 = I, 0 } { .mfi (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE (p17) FMA f104 = f8, f34, f104 } ;; { .mfi (p16) LDFPD f32, f33 = [AO1], 2 * SIZE (p17) FMA f107 = f8, f35, f107 (p16) adds I = -1, I } { .mfi (p14) PREFETCH [RPRE1], 16 * SIZE (p17) FMA f110 = f8, f36, f110 } ;; { .mfi (p16) LDFPD f34, f35 = [AO1], 2 * SIZE (p17) FMA f113 = f8, f37, f113 } { .mfi (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE (p17) FMA f116 = f8, f38, f116 } ;; { .mfi (p16) LDFPD f36, f37 = [AO1], 2 * SIZE (p17) FMA f119 = f8, f39, f119 } { .mfi (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE (p17) FMA f122 = f8, f40, f122 } ;; { .mfi (p16) LDFPD f38, f39 = [AO1], 2 * SIZE (p17) FMA f101 = f9, f41, f101 } { .mfi (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE (p17) FMA f104 = f9, f42, f104 } ;; { .mmf (p16) LDFPD f40, f41 = [AO2], 2 * SIZE (p15) PREFETCH [RPRE2], 16 * SIZE (p17) FMA f107 = f9, f43, f107 } { .mfi (p18) STFD [YST1] = f16, 1 * SIZE (p17) FMA f110 = f9, f44, f110 } ;; { .mfi (p16) LDFPD f42, f43 = [AO2], 2 * SIZE (p17) FMA f113 = f9, f45, f113 } { .mfi (p18) STFD [YST1] = f17, 1 * SIZE (p17) FMA f116 = f9, f46, f116 } ;; { .mfi (p16) LDFPD f44, f45 = [AO2], 2 * SIZE (p17) FMA f119 = f9, f47, f119 } { .mfi (p18) STFD [YST1] = f18, 1 * SIZE (p17) FMA f122 = f9, f48, f122 } ;; { .mfi (p16) LDFPD f46, f47 = [AO2], 2 * SIZE (p17) FMA f101 = f10, f49, f101 } { .mfi (p14) lfetch.excl.nt2 [PREB], 16 * SIZE (p17) FMA f104 = f10, f50, f104 } ;; { .mfi (p16) LDFPD f48, f49 = [AO3], 2 * SIZE (p17) FMA f107 = f10, f51, f107 } { .mfi (p14) PREFETCH [RPRE3], 16 * SIZE (p17) FMA f110 = f10, f52, f110 } ;; { .mfi (p16) LDFPD f50, f51 = [AO3], 2 * SIZE (p17) FMA f113 = f10, f53, f113 } { .mfi (p18) STFD [YST1] = f19, 1 * SIZE (p17) FMA f116 = f10, f54, f116 } ;; { .mfi (p16) LDFPD f52, f53 = [AO3], 2 * SIZE (p17) FMA f119 = f10, f55, f119 } { .mfi (p18) STFD [YST1] = f20, 1 * SIZE (p17) FMA f122 = f10, f56, f122 } ;; { .mfi (p16) LDFPD f54, f55 = [AO3], 2 * SIZE (p17) FMA f16 = f11, f57, f101 } { .mfi (p15) PREFETCH [RPRE4], 16 * SIZE (p17) FMA f17 = f11, f58, f104 } ;; { .mfi (p16) LDFPD f56, f57 = [AO4], 2 * SIZE (p17) FMA f18 = f11, f59, f107 } { .mfi (p18) STFD [YST1] = f21, 1 * SIZE (p17) FMA f19 = f11, f60, f110 } ;; { .mfi (p16) LDFPD f58, f59 = [AO4], 2 * SIZE (p17) FMA f20 = f11, f61, f113 } { .mfi (p18) STFD [YST1] = f22, 1 * SIZE (p17) FMA f21 = f11, f62, f116 } ;; { .mfi (p16) LDFPD f60, f61 = [AO4], 2 * SIZE (p17) FMA f22 = f11, f63, f119 } { .mfb (p18) STFD [YST1] = f23, 1 * SIZE (p17) FMA f23 = f11, f64, f122 br.ctop.sptk.few .L22 } ;; .align 16 .L25: { .mmi (p13) LDFPD f32, f33 = [AO1], 2 * SIZE (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE tbit.nz p14, p0 = MM, 1 } { .mmi (p18) STFD [YST1] = f16, 1 * SIZE } ;; { .mmi (p13) LDFPD f48, f49 = 
[AO1], 2 * SIZE (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE tbit.nz p15, p0 = MM, 0 } { .mmi (p18) STFD [YST1] = f17, 1 * SIZE } ;; { .mmi (p14) LDFPD f64, f65 = [AO1], 2 * SIZE (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE } { .mmi (p18) STFD [YST1] = f18, 1 * SIZE } ;; { .mmi (p15) LDFD f80 = [AO1] (p15) LDFD f106 = [YLD1], 1 * SIZE } { .mmi (p18) STFD [YST1] = f19, 1 * SIZE } ;; { .mmi (p13) LDFPD f34, f35 = [AO2], 2 * SIZE (p13) LDFPD f36, f37 = [AO3], 2 * SIZE } { .mmi (p18) STFD [YST1] = f20, 1 * SIZE } ;; { .mmi (p13) LDFPD f50, f51 = [AO2], 2 * SIZE (p13) LDFPD f52, f53 = [AO3], 2 * SIZE } { .mmi (p18) STFD [YST1] = f21, 1 * SIZE } ;; { .mmi (p14) LDFPD f66, f67 = [AO2], 2 * SIZE (p14) LDFPD f68, f69 = [AO3], 2 * SIZE } { .mmi (p18) STFD [YST1] = f22, 1 * SIZE } ;; { .mmf (p15) LDFD f81 = [AO2] (p15) LDFD f82 = [AO3] (p13) FMA f100 = f8, f32, f100 } { .mfi (p18) STFD [YST1] = f23, 1 * SIZE (p13) FMA f101 = f8, f33, f101 } ;; ;; { .mfi (p13) LDFPD f38, f39 = [AO4], 2 * SIZE (p13) FMA f102 = f8, f48, f102 } { .mfi (p13) FMA f103 = f8, f49, f103 } ;; { .mfi (p13) LDFPD f54, f55 = [AO4], 2 * SIZE (p14) FMA f104 = f8, f64, f104 } { .mfi (p14) FMA f105 = f8, f65, f105 } ;; { .mfi (p14) LDFPD f70, f71 = [AO4], 2 * SIZE (p15) FMA f106 = f8, f80, f106 } { .mfi (p13) FMA f100 = f9, f34, f100 } ;; { .mfi (p15) LDFD f83 = [AO4] (p13) FMA f101 = f9, f35, f101 } { .mfi (p13) FMA f102 = f9, f50, f102 } ;; (p13) FMA f103 = f9, f51, f103 (p14) FMA f104 = f9, f66, f104 (p14) FMA f105 = f9, f67, f105 (p15) FMA f106 = f9, f81, f106 ;; (p13) FMA f100 = f10, f36, f100 (p13) FMA f101 = f10, f37, f101 (p13) FMA f102 = f10, f52, f102 (p13) FMA f103 = f10, f53, f103 (p14) FMA f104 = f10, f68, f104 (p14) FMA f105 = f10, f69, f105 (p15) FMA f106 = f10, f82, f106 ;; (p13) FMA f100 = f11, f38, f100 (p13) FMA f101 = f11, f39, f101 ;; (p13) FMA f102 = f11, f54, f102 (p13) STFD [YST1] = f100, 1 * SIZE (p13) FMA f103 = f11, f55, f103 ;; (p13) STFD [YST1] = f101, 1 * SIZE (p14) FMA f104 = f11, f70, f104 ;; (p13) STFD [YST1] = f102, 1 * SIZE (p14) FMA f105 = f11, f71, f105 ;; (p13) STFD [YST1] = f103, 1 * SIZE (p15) FMA f106 = f11, f83, f106 ;; (p14) STFD [YST1] = f104, 1 * SIZE ;; (p14) STFD [YST1] = f105, 1 * SIZE ;; (p15) STFD [YST1] = f106, 1 * SIZE ;; .align 16 .L30: { .mmi mov YLD1 = YY mov YST1 = YY tbit.z p6, p0 = N, 1 } ;; { .mib mov AO1 = A mov pr.rot= 0 (p6) br.cond.dpnt .L40 } ;; { .mmi LDFD f8 = [X], INCX (p8) LDFD f106 = [YLD1], 1 * SIZE add AO2 = LDA, A } ;; { .mmi LDFD f9 = [X], INCX (p8) LDFD f80 = [AO1], 1 * SIZE shladd A = LDA, 1, A } ;; adds PREB = RPREFETCH * SIZE, YLD1 FMPY f8 = ALPHA, f8 mov ar.ec= 2 adds RPRE1 = RPREFETCH * SIZE, AO1 FMPY f9 = ALPHA, f9 shr I = MM, 3 ;; (p8) LDFD f81 = [AO2], 1 * SIZE cmp.eq p6, p0 = 0, I ;; (p8) FMA f106 = f8, f80, f106 adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 tbit.nz p13, p0 = MM, 2 ;; (p8) FMA f106 = f9, f81, f106 cmp.eq p16, p0 = r0, r0 adds I = -1, I ;; { .mib (p8) STFD [YST1] = f106, 1 * SIZE mov ar.lc = I (p6) br.cond.dpnt .L35 } ;; .align 16 .L32: { .mfi (p17) LDFPD f47, f48 = [AO2], 2 * SIZE (p17) FMA f101 = f8, f33, f101 (p16) tbit.nz.unc p14, p15 = I, 0 } { .mmf (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE (p18) STFD [YST1] = f16, 1 * SIZE (p17) FMA f104 = f8, f34, f104 } ;; { .mfi (p16) LDFPD f32, f33 = [AO1], 2 * SIZE (p17) FMA f107 = f8, f35, f107 adds I = -1, I } { .mmf (p14) PREFETCH [RPRE1], 16 * SIZE (p18) STFD [YST1] = f17, 1 * SIZE (p17) FMA f110 = f8, f36, f110 } ;; { .mfi (p16) LDFPD f34, f35 = [AO1], 2 * SIZE (p17) FMA f113 = f8, f37, f113 } { 
.mmf (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE (p18) STFD [YST1] = f18, 1 * SIZE (p17) FMA f116 = f8, f38, f116 } ;; { .mfi (p16) LDFPD f36, f37 = [AO1], 2 * SIZE (p17) FMA f119 = f8, f39, f119 } { .mmf (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE (p18) STFD [YST1] = f19, 1 * SIZE (p17) FMA f122 = f8, f40, f122 } ;; { .mfi (p16) LDFPD f38, f39 = [AO1], 2 * SIZE (p17) FMA f16 = f9, f41, f101 } { .mmf (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE (p18) STFD [YST1] = f20, 1 * SIZE (p17) FMA f17 = f9, f42, f104 } ;; { .mfi (p16) LDFPD f40, f41 = [AO2], 2 * SIZE (p17) FMA f18 = f9, f43, f107 } { .mmf (p15) PREFETCH [RPRE2], 16 * SIZE (p18) STFD [YST1] = f21, 1 * SIZE (p17) FMA f19 = f9, f44, f110 } ;; { .mfi (p16) LDFPD f42, f43 = [AO2], 2 * SIZE (p17) FMA f20 = f9, f45, f113 } { .mmf (p14) PREFETCH [PREB], 16 * SIZE (p18) STFD [YST1] = f22, 1 * SIZE (p17) FMA f21 = f9, f46, f116 } ;; { .mfi (p16) LDFPD f44, f45 = [AO2], 2 * SIZE (p17) FMA f22 = f9, f47, f119 } { .mfb (p18) STFD [YST1] = f23, 1 * SIZE (p17) FMA f23 = f9, f48, f122 br.ctop.sptk.few .L32 } ;; .align 16 .L35: { .mmi (p13) LDFPD f32, f33 = [AO1], 2 * SIZE (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE tbit.nz p14, p0 = MM, 1 } { .mmi (p18) STFD [YST1] = f16, 1 * SIZE } ;; { .mmi (p13) LDFPD f48, f49 = [AO1], 2 * SIZE (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE tbit.nz p15, p0 = MM, 0 } { .mmi (p18) STFD [YST1] = f17, 1 * SIZE } ;; { .mmi (p14) LDFPD f64, f65 = [AO1], 2 * SIZE (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE } { .mmi (p18) STFD [YST1] = f18, 1 * SIZE } ;; { .mmi (p15) LDFD f80 = [AO1] (p15) LDFD f106 = [YLD1], 1 * SIZE } { .mmi (p18) STFD [YST1] = f19, 1 * SIZE } ;; { .mmi (p13) LDFPD f34, f35 = [AO2], 2 * SIZE (p18) STFD [YST1] = f20, 1 * SIZE } ;; { .mmi (p13) LDFPD f50, f51 = [AO2], 2 * SIZE (p18) STFD [YST1] = f21, 1 * SIZE } ;; { .mmi (p14) LDFPD f66, f67 = [AO2], 2 * SIZE (p18) STFD [YST1] = f22, 1 * SIZE } ;; { .mmi (p15) LDFD f81 = [AO2] (p18) STFD [YST1] = f23, 1 * SIZE } ;; (p13) FMA f100 = f8, f32, f100 (p13) FMA f101 = f8, f33, f101 (p13) FMA f102 = f8, f48, f102 (p13) FMA f103 = f8, f49, f103 (p14) FMA f104 = f8, f64, f104 (p14) FMA f105 = f8, f65, f105 (p15) FMA f106 = f8, f80, f106 ;; (p13) FMA f100 = f9, f34, f100 (p13) FMA f101 = f9, f35, f101 (p13) FMA f102 = f9, f50, f102 (p13) FMA f103 = f9, f51, f103 (p14) FMA f104 = f9, f66, f104 (p14) FMA f105 = f9, f67, f105 (p15) FMA f106 = f9, f81, f106 ;; (p13) STFD [YST1] = f100, 1 * SIZE ;; (p13) STFD [YST1] = f101, 1 * SIZE ;; (p13) STFD [YST1] = f102, 1 * SIZE ;; (p13) STFD [YST1] = f103, 1 * SIZE ;; (p14) STFD [YST1] = f104, 1 * SIZE ;; (p14) STFD [YST1] = f105, 1 * SIZE ;; (p15) STFD [YST1] = f106, 1 * SIZE ;; .align 16 .L40: { .mmi mov YLD1 = YY mov YST1 = YY tbit.z p6, p0 = N, 0 } ;; { .mib mov AO1 = A mov pr.rot= 0 (p6) br.cond.dpnt .L990 } ;; { .mmi LDFD f8 = [X], INCX (p8) LDFD f106 = [YLD1], 1 * SIZE adds RPRE1 = RPREFETCH * SIZE, AO1 } ;; { .mii (p8) LDFD f80 = [AO1], 1 * SIZE adds PREB = RPREFETCH * SIZE, YLD1 } ;; FMPY f8 = ALPHA, f8 shr I = MM, 3 ;; (p8) FMA f106 = f8, f80, f106 mov ar.ec= 3 ;; { .mmi cmp.eq p6, p0 = 0, I cmp.eq p16, p0 = r0, r0 tbit.nz p14, p15 = r0, 0 } ;; { .mmi adds YST2 = 4 * SIZE, YST1 adds I = -1, I tbit.nz p13, p0 = MM, 2 } ;; { .mmi (p8) STFD [YST1] = f106, 1 * SIZE (p8) adds YST2 = 1 * SIZE, YST2 } { .mib mov ar.lc = I (p6) br.cond.dpnt .L145 } ;; .align 16 .L42: { .mmf (p19) STFD [YST1] = f16, 1 * SIZE (p19) STFD [YST2] = f20, 1 * SIZE (p18) FMA f16 = f8, f34, f102 } { .mmf (p16) LDFPD f32, f35 = [AO1], 2 * SIZE (p16) LDFPD f100, 
f103 = [YLD1], 2 * SIZE (p18) FMA f20 = f8, f46, f114 } ;; { .mmf (p19) STFD [YST1] = f17, 1 * SIZE (p19) STFD [YST2] = f21, 1 * SIZE (p18) FMA f17 = f8, f37, f105 } { .mmf (p16) LDFPD f38, f41 = [AO1], 2 * SIZE (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE (p18) FMA f21 = f8, f49, f117 } ;; { .mmf (p19) STFD [YST1] = f18, 1 * SIZE (p19) STFD [YST2] = f22, 1 * SIZE (p18) FMA f18 = f8, f40, f108 } { .mmf (p16) LDFPD f44, f47 = [AO1], 2 * SIZE (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE (p18) FMA f22 = f8, f52, f120 } ;; { .mmf (p19) STFD [YST1] = f19, 5 * SIZE (p19) STFD [YST2] = f23, 5 * SIZE (p18) FMA f19 = f8, f43, f111 } { .mmf (p16) LDFPD f50, f53 = [AO1], 2 * SIZE (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE (p18) FMA f23 = f8, f55, f123 } ;; { .mmi (p14) PREFETCH [RPRE1], 16 * SIZE (p14) PREFETCH [PREB], 16 * SIZE (p16) tbit.nz.unc p14, p15 = I, 0 } { .mib nop __LINE__ (p16) adds I = -1, I br.ctop.sptk.few .L42 } ;; .align 16 .L45: { .mmi (p19) STFD [YST1] = f16, 1 * SIZE (p19) STFD [YST2] = f20, 1 * SIZE tbit.nz p14, p0 = MM, 1 } { .mmi (p13) LDFPD f32, f33 = [AO1], 2 * SIZE (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE } ;; { .mmi (p19) STFD [YST1] = f17, 1 * SIZE (p19) STFD [YST2] = f21, 1 * SIZE tbit.nz p15, p0 = MM, 0 } { .mmi (p13) LDFPD f48, f49 = [AO1], 2 * SIZE (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE } ;; { .mmi (p19) STFD [YST1] = f18, 1 * SIZE (p19) STFD [YST2] = f22, 1 * SIZE } { .mmi (p14) LDFPD f64, f65 = [AO1], 2 * SIZE (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE } ;; { .mmi (p19) STFD [YST1] = f19, 5 * SIZE (p19) STFD [YST2] = f23, 5 * SIZE } { .mmi (p15) LDFD f80 = [AO1] (p15) LDFD f106 = [YLD1], 1 * SIZE } ;; (p13) FMA f100 = f8, f32, f100 (p13) FMA f101 = f8, f33, f101 (p13) FMA f102 = f8, f48, f102 (p13) FMA f103 = f8, f49, f103 ;; (p13) STFD [YST1] = f100, 1 * SIZE (p14) FMA f104 = f8, f64, f104 ;; (p13) STFD [YST1] = f101, 1 * SIZE (p14) FMA f105 = f8, f65, f105 ;; (p13) STFD [YST1] = f102, 1 * SIZE (p15) FMA f106 = f8, f80, f106 ;; (p13) STFD [YST1] = f103, 1 * SIZE ;; (p14) STFD [YST1] = f104, 1 * SIZE ;; (p14) STFD [YST1] = f105, 1 * SIZE ;; (p15) STFD [YST1] = f106, 1 * SIZE br .L990 ;; .align 16 .L100: shr J = N, 3 ;; cmp.eq p6, p0 = r0, J (p6) br.cond.dpnt .L120 ;; .align 16 .L111: mov YLD1 = YY mov YST1 = YY ;; LDFD f8 = [X], INCX ;; LDFD f9 = [X], INCX ;; LDFD f10 = [X], INCX ;; LDFD f11 = [X], INCX ;; LDFD f12 = [X], INCX ;; LDFD f13 = [X], INCX ;; LDFD f14 = [X], INCX ;; LDFD f15 = [X], INCX ;; FMPY f8 = ALPHA, f8 FMPY f9 = ALPHA, f9 FMPY f10 = ALPHA, f10 FMPY f11 = ALPHA, f11 FMPY f12 = ALPHA, f12 FMPY f13 = ALPHA, f13 FMPY f14 = ALPHA, f14 FMPY f15 = ALPHA, f15 ;; mov AO1 = A add AO2 = LDA, A ;; shladd AO3 = LDA, 1, A shladd AO4 = LDA, 1, AO2 ;; shladd AO5 = LDA, 1, AO3 shladd AO6 = LDA, 1, AO4 ;; shladd AO7 = LDA, 1, AO5 shladd AO8 = LDA, 1, AO6 shladd A = LDA, 3, A ;; ;; adds PREB = RPREFETCH * SIZE, YLD1 adds RPRE1 = RPREFETCH * SIZE, AO1 adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 adds RPRE3 = RPREFETCH * SIZE, AO3 adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 adds RPRE5 = RPREFETCH * SIZE, AO5 adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6 adds RPRE7 = RPREFETCH * SIZE, AO7 adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8 (p8) LDFD f80 = [AO1], 1 * SIZE (p8) LDFD f81 = [AO2], 1 * SIZE (p8) LDFD f82 = [AO3], 1 * SIZE (p8) LDFD f83 = [AO4], 1 * SIZE (p8) LDFD f84 = [AO5], 1 * SIZE (p8) LDFD f85 = [AO6], 1 * SIZE (p8) LDFD f86 = [AO7], 1 * SIZE (p8) LDFD f87 = [AO8], 1 * SIZE (p8) LDFD f106 = [YLD1], 1 * SIZE ;; (p8) FMPY f32 = f8, f80 (p8) FMPY f33 = f9, f81 (p8) FMPY f34 = f10, 
f82 (p8) FMA f35 = f11, f83, f106 ;; (p8) FMA f32 = f12, f84, f32 (p8) FMA f33 = f13, f85, f33 (p8) FMA f34 = f14, f86, f34 (p8) FMA f35 = f15, f87, f35 ;; (p8) FADD f32 = f32, f33 (p8) FADD f34 = f34, f35 ;; (p8) FADD f32 = f32, f34 ;; (p8) STFD [YST1] = f32, 1 * SIZE shr I = MM, 3 mov pr.rot= 0 ;; cmp.eq p6, p0 = 0, I cmp.eq p16, p0 = r0, r0 ;; adds I = -1, I tbit.nz p13, p0 = MM, 2 ;; mov ar.lc = I mov ar.ec= 2 (p6) br.cond.dpnt .L115 ;; .align 16 .L112: { .mfi (p17) LDFD f96 = [AO8], 1 * SIZE (p17) FMA f101 = f8, f33, f101 (p16) tbit.nz.unc p14, p15 = I, 0 } { .mfi (p17) FMA f104 = f8, f34, f104 } ;; { .mfi (p16) LDFPD f32, f33 = [AO1], 2 * SIZE (p17) FMA f107 = f8, f35, f107 } { .mfi (p14) PREFETCH [RPRE1], 16 * SIZE (p17) FMA f110 = f8, f36, f110 } ;; { .mfi (p16) LDFPD f34, f35 = [AO1], 2 * SIZE (p17) FMA f113 = f8, f37, f113 } { .mfi (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE (p17) FMA f116 = f8, f38, f116 } ;; { .mfi (p16) LDFPD f36, f37 = [AO1], 2 * SIZE (p17) FMA f119 = f8, f39, f119 } { .mfi (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE (p17) FMA f122 = f8, f40, f122 } ;; { .mfi (p16) LDFPD f38, f39 = [AO1], 2 * SIZE (p17) FMA f101 = f9, f41, f101 } { .mmf (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE (p16) LDFD f40 = [AO2], 1 * SIZE (p17) FMA f104 = f9, f42, f104 } ;; { .mfi (p16) LDFPD f41, f42 = [AO2], 2 * SIZE (p17) FMA f107 = f9, f43, f107 } { .mfi (p15) PREFETCH [RPRE2], 16 * SIZE (p17) FMA f110 = f9, f44, f110 } ;; { .mfi (p16) LDFPD f43, f44 = [AO2], 2 * SIZE (p17) FMA f113 = f9, f45, f113 } { .mfi (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE (p17) FMA f116 = f9, f46, f116 } ;; { .mfi (p16) LDFPD f45, f46 = [AO2], 2 * SIZE (p17) FMA f119 = f9, f47, f119 } { .mfi (p18) STFD [YST1] = f16, 1 * SIZE (p17) FMA f122 = f9, f48, f122 } ;; { .mfi (p16) LDFD f47 = [AO2], 1 * SIZE (p17) FMA f101 = f10, f49, f101 } { .mfi (p18) STFD [YST1] = f17, 1 * SIZE (p17) FMA f104 = f10, f50, f104 } ;; { .mfi (p16) LDFPD f48, f49 = [AO3], 2 * SIZE (p17) FMA f107 = f10, f51, f107 } { .mfi (p14) PREFETCH [RPRE3], 16 * SIZE (p17) FMA f110 = f10, f52, f110 } ;; { .mfi (p16) LDFPD f50, f51 = [AO3], 2 * SIZE (p17) FMA f113 = f10, f53, f113 } { .mfi (p17) FMA f116 = f10, f54, f116 } ;; { .mfi (p16) LDFPD f52, f53 = [AO3], 2 * SIZE (p17) FMA f119 = f10, f55, f119 } { .mfi (p18) STFD [YST1] = f18, 1 * SIZE (p17) FMA f122 = f10, f56, f122 } ;; { .mfi (p16) LDFPD f54, f55 = [AO3], 2 * SIZE (p17) FMA f101 = f11, f57, f101 } { .mmf (p18) STFD [YST1] = f19, 1 * SIZE (p16) LDFD f56 = [AO4], 1 * SIZE (p17) FMA f104 = f11, f58, f104 } ;; { .mfi (p16) LDFPD f57, f58 = [AO4], 2 * SIZE (p17) FMA f107 = f11, f59, f107 } { .mfi (p15) PREFETCH [RPRE4], 16 * SIZE (p17) FMA f110 = f11, f60, f110 } ;; { .mfi (p16) LDFPD f59, f60 = [AO4], 2 * SIZE (p17) FMA f113 = f11, f61, f113 } { .mfi (p17) FMA f116 = f11, f62, f116 } ;; { .mfi (p16) LDFPD f61, f62 = [AO4], 2 * SIZE (p17) FMA f119 = f11, f63, f119 } { .mfi (p17) FMA f122 = f11, f64, f122 } ;; { .mfi (p16) LDFD f63 = [AO4], 1 * SIZE (p17) FMA f101 = f12, f65, f101 } { .mfi (p18) STFD [YST1] = f20, 1 * SIZE (p17) FMA f104 = f12, f66, f104 } ;; { .mfi (p16) LDFPD f64, f65 = [AO5], 2 * SIZE (p17) FMA f107 = f12, f67, f107 } { .mfi (p18) STFD [YST1] = f21, 1 * SIZE (p17) FMA f110 = f12, f68, f110 } ;; { .mfi (p16) LDFPD f66, f67 = [AO5], 2 * SIZE (p17) FMA f113 = f12, f69, f113 } { .mfi (p14) PREFETCH [RPRE5], 16 * SIZE (p17) FMA f116 = f12, f70, f116 } ;; { .mfi (p16) LDFPD f68, f69 = [AO5], 2 * SIZE (p17) FMA f119 = f12, f71, f119 } { .mfi (p18) STFD [YST1] = f22, 1 * SIZE (p17) 
FMA f122 = f12, f72, f122 } ;; { .mfi (p16) LDFPD f70, f71 = [AO5], 2 * SIZE (p17) FMA f101 = f13, f73, f101 } { .mmf (p18) STFD [YST1] = f23, 1 * SIZE (p16) LDFD f72 = [AO6], 1 * SIZE (p17) FMA f104 = f13, f74, f104 } ;; { .mfi (p16) LDFPD f73, f74 = [AO6], 2 * SIZE (p17) FMA f107 = f13, f75, f107 } { .mfi (p15) PREFETCH [RPRE6], 16 * SIZE (p17) FMA f110 = f13, f76, f110 } ;; { .mfi (p16) LDFPD f75, f76 = [AO6], 2 * SIZE (p17) FMA f113 = f13, f77, f113 } { .mfi (p17) FMA f116 = f13, f78, f116 } ;; { .mfi (p16) LDFPD f77, f78 = [AO6], 2 * SIZE (p17) FMA f119 = f13, f79, f119 } { .mfi (p17) FMA f122 = f13, f80, f122 } ;; { .mfi (p16) LDFD f79 = [AO6], 1 * SIZE (p17) FMA f101 = f14, f81, f101 } { .mfi (p17) FMA f104 = f14, f82, f104 } ;; { .mfi (p16) LDFPD f80, f81 = [AO7], 2 * SIZE (p17) FMA f107 = f14, f83, f107 } { .mfi (p14) PREFETCH [RPRE7], 16 * SIZE (p17) FMA f110 = f14, f84, f110 } ;; { .mfi (p16) LDFPD f82, f83 = [AO7], 2 * SIZE (p17) FMA f113 = f14, f85, f113 } { .mfi (p17) FMA f116 = f14, f86, f116 } ;; { .mfi (p16) LDFPD f84, f85 = [AO7], 2 * SIZE (p17) FMA f119 = f14, f87, f119 } { .mfi (p17) FMA f122 = f14, f88, f122 } ;; { .mfi (p16) LDFPD f86, f87 = [AO7], 2 * SIZE (p17) FMA f16 = f15, f89, f101 } { .mfi (p16) LDFD f88 = [AO8], 1 * SIZE (p17) FMA f17 = f15, f90, f104 } ;; { .mfi (p16) LDFPD f89, f90 = [AO8], 2 * SIZE (p17) FMA f18 = f15, f91, f107 } { .mfi (p15) PREFETCH [RPRE8], 16 * SIZE (p17) FMA f19 = f15, f92, f110 } ;; { .mfi (p16) LDFPD f91, f92 = [AO8], 2 * SIZE (p17) FMA f20 = f15, f93, f113 } { .mfi (p14) lfetch.excl.nt2 [PREB], 16 * SIZE (p17) FMA f21 = f15, f94, f116 } ;; { .mfi (p16) LDFPD f93, f94 = [AO8], 2 * SIZE (p17) FMA f22 = f15, f95, f119 } { .mfb (p16) adds I = -1, I (p17) FMA f23 = f15, f96, f122 br.ctop.sptk.few .L112 } ;; .align 16 .L115: { .mmi (p13) LDFPD f32, f33 = [AO1], 2 * SIZE (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE tbit.nz p14, p0 = MM, 1 } { .mmi (p18) STFD [YST1] = f16, 1 * SIZE cmp.lt p6, p0 = 1, J adds J = -1, J } ;; { .mmi (p13) LDFPD f48, f49 = [AO1], 2 * SIZE (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE tbit.nz p15, p0 = MM, 0 } { .mmi (p18) STFD [YST1] = f17, 1 * SIZE nop __LINE__ nop __LINE__ } ;; { .mmi (p14) LDFPD f64, f65 = [AO1], 2 * SIZE (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE nop __LINE__ } { .mmi (p18) STFD [YST1] = f18, 1 * SIZE (p13) LDFD f34 = [AO2], 1 * SIZE nop __LINE__ } ;; { .mmi (p13) LDFPD f35, f50 = [AO2], 2 * SIZE (p13) LDFPD f36, f37 = [AO3], 2 * SIZE nop __LINE__ } { .mmi (p18) STFD [YST1] = f19, 1 * SIZE nop __LINE__ nop __LINE__ } ;; { .mmi (p15) LDFD f80 = [AO1] (p15) LDFD f106 = [YLD1], 1 * SIZE nop __LINE__ } { .mmi (p18) STFD [YST1] = f20, 1 * SIZE nop __LINE__ nop __LINE__ } ;; { .mmi (p13) LDFD f51 = [AO2], 1 * SIZE (p13) LDFPD f52, f53 = [AO3], 2 * SIZE nop __LINE__ } { .mmi (p18) STFD [YST1] = f21, 1 * SIZE nop __LINE__ nop __LINE__ } ;; { .mmi (p14) LDFD f66 = [AO2], 1 * SIZE (p14) LDFPD f68, f69 = [AO3], 2 * SIZE nop __LINE__ } { .mmi (p18) STFD [YST1] = f22, 1 * SIZE nop __LINE__ nop __LINE__ } ;; { .mmi (p14) LDFD f67 = [AO2], 1 * SIZE (p15) LDFD f82 = [AO3] nop __LINE__ } { .mmi (p18) STFD [YST1] = f23, 1 * SIZE nop __LINE__ } ;; { .mmf (p15) LDFD f81 = [AO2] (p13) LDFD f38 = [AO4], 1 * SIZE (p13) FMA f100 = f8, f32, f100 } { .mfi (p13) LDFPD f40, f41 = [AO5], 2 * SIZE (p13) FMA f101 = f8, f33, f101 nop __LINE__ } ;; { .mfi (p13) LDFPD f39, f54 = [AO4], 2 * SIZE (p13) FMA f102 = f8, f48, f102 nop __LINE__ } { .mfi (p13) LDFPD f56, f57 = [AO5], 2 * SIZE (p13) FMA f103 = f8, f49, f103 nop __LINE__ } 
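// NOTE (editor annotation, not part of the upstream OpenBLAS source): judging from the
// tbit.nz tests on MM bits 2, 1 and 0, the surrounding .L115 block handles the M mod 8
// leftover rows of the current 8-column panel (p13 covers a 4-row remainder, p14 a 2-row
// remainder, p15 a single row) before the (p6) branch loops back to .L111 for the next panel.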
;; { .mfi (p13) LDFD f55 = [AO4], 1 * SIZE (p14) FMA f104 = f8, f64, f104 nop __LINE__ } { .mfi (p14) LDFPD f72, f73 = [AO5], 2 * SIZE (p14) FMA f105 = f8, f65, f105 nop __LINE__ } ;; { .mfi (p14) LDFD f70 = [AO4], 1 * SIZE (p15) FMA f106 = f8, f80, f106 nop __LINE__ } { .mmi (p15) LDFD f84 = [AO5] (p13) LDFD f42 = [AO6], 1 * SIZE nop __LINE__ } ;; { .mmf (p13) LDFPD f43, f58 = [AO6], 2 * SIZE (p14) LDFD f71 = [AO4], 1 * SIZE (p13) FMA f100 = f9, f34, f100 } { .mfi (p13) LDFPD f44, f45 = [AO7], 2 * SIZE (p13) FMA f101 = f9, f35, f101 nop __LINE__ } ;; { .mmf (p13) LDFD f59 = [AO6], 1 * SIZE (p15) LDFD f83 = [AO4] (p13) FMA f102 = f9, f50, f102 } { .mfi (p13) LDFPD f60, f61 = [AO7], 2 * SIZE (p13) FMA f103 = f9, f51, f103 nop __LINE__ } ;; { .mfi (p14) LDFD f74 = [AO6], 1 * SIZE (p14) FMA f104 = f9, f66, f104 nop __LINE__ } { .mfi (p14) LDFPD f76, f77 = [AO7], 2 * SIZE (p14) FMA f105 = f9, f67, f105 nop __LINE__ } ;; { .mfi (p14) LDFD f75 = [AO6], 1 * SIZE (p15) FMA f106 = f9, f81, f106 nop __LINE__ } { .mmi (p15) LDFD f86 = [AO7] (p13) LDFD f46 = [AO8], 1 * SIZE nop __LINE__ } ;; { .mmf (p13) LDFPD f47, f62 = [AO8], 2 * SIZE (p15) LDFD f85 = [AO6] (p13) FMA f100 = f10, f36, f100 } { .mfi (p13) FMA f101 = f10, f37, f101 nop __LINE__ } ;; { .mfi (p13) LDFD f63 = [AO8], 1 * SIZE (p13) FMA f102 = f10, f52, f102 nop __LINE__ } { .mfi (p13) FMA f103 = f10, f53, f103 nop __LINE__ } ;; { .mfi (p14) LDFD f78 = [AO8], 1 * SIZE (p14) FMA f104 = f10, f68, f104 nop __LINE__ } { .mfi (p14) FMA f105 = f10, f69, f105 nop __LINE__ } ;; { .mfi (p14) LDFD f79 = [AO8], 1 * SIZE (p15) FMA f106 = f10, f82, f106 nop __LINE__ } ;; (p15) LDFD f87 = [AO8] (p13) FMA f100 = f11, f38, f100 (p13) FMA f101 = f11, f39, f101 (p13) FMA f102 = f11, f54, f102 (p13) FMA f103 = f11, f55, f103 (p14) FMA f104 = f11, f70, f104 (p14) FMA f105 = f11, f71, f105 (p15) FMA f106 = f11, f83, f106 ;; (p13) FMA f100 = f12, f40, f100 (p13) FMA f101 = f12, f41, f101 (p13) FMA f102 = f12, f56, f102 (p13) FMA f103 = f12, f57, f103 (p14) FMA f104 = f12, f72, f104 (p14) FMA f105 = f12, f73, f105 (p15) FMA f106 = f12, f84, f106 ;; (p13) FMA f100 = f13, f42, f100 (p13) FMA f101 = f13, f43, f101 (p13) FMA f102 = f13, f58, f102 (p13) FMA f103 = f13, f59, f103 (p14) FMA f104 = f13, f74, f104 (p14) FMA f105 = f13, f75, f105 (p15) FMA f106 = f13, f85, f106 ;; (p13) FMA f100 = f14, f44, f100 (p13) FMA f101 = f14, f45, f101 (p13) FMA f102 = f14, f60, f102 (p13) FMA f103 = f14, f61, f103 (p14) FMA f104 = f14, f76, f104 (p14) FMA f105 = f14, f77, f105 (p15) FMA f106 = f14, f86, f106 ;; (p13) FMA f100 = f15, f46, f100 (p13) FMA f101 = f15, f47, f101 (p13) FMA f102 = f15, f62, f102 (p13) FMA f103 = f15, f63, f103 (p14) FMA f104 = f15, f78, f104 (p14) FMA f105 = f15, f79, f105 (p15) FMA f106 = f15, f87, f106 ;; (p13) STFD [YST1] = f100, 1 * SIZE ;; (p13) STFD [YST1] = f101, 1 * SIZE ;; (p13) STFD [YST1] = f102, 1 * SIZE ;; (p13) STFD [YST1] = f103, 1 * SIZE ;; (p14) STFD [YST1] = f104, 1 * SIZE ;; (p14) STFD [YST1] = f105, 1 * SIZE ;; (p15) STFD [YST1] = f106, 1 * SIZE (p6) br.cond.dptk .L111 ;; .align 16 .L120: { .mmi mov YLD1 = YY mov YST1 = YY tbit.z p6, p0 = N, 2 } ;; { .mib mov AO1 = A mov pr.rot= 0 (p6) br.cond.dpnt .L130 } ;; { .mmi LDFD f8 = [X], INCX (p8) LDFD f106 = [YLD1], 1 * SIZE add AO2 = LDA, A } ;; { .mmi LDFD f9 = [X], INCX (p8) LDFD f80 = [AO1], 1 * SIZE shladd AO4 = LDA, 1, AO2 } ;; { .mmi LDFD f10 = [X], INCX (p8) LDFD f81 = [AO2], 1 * SIZE shladd AO3 = LDA, 1, A } ;; { .mmi LDFD f11 = [X], INCX (p8) LDFD f82 = [AO3], 1 * SIZE } ;; { .mfi 
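// NOTE (editor annotation, not part of the upstream OpenBLAS source): this .L120 section is
// reached when N has bit 2 set (four columns remaining); f8..f11 are scaled by ALPHA just
// below and then folded into the buffered y values by the software-pipelined .L122 loop.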
(p8) LDFD f83 = [AO4], 1 * SIZE FMPY f8 = ALPHA, f8 adds PREB = RPREFETCH * SIZE, YLD1 } { .mfi adds RPRE1 = RPREFETCH * SIZE, AO1 FMPY f9 = ALPHA, f9 adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 } ;; FMPY f10 = ALPHA, f10 shladd A = LDA, 2, A FMPY f11 = ALPHA, f11 ;; { .mfi adds RPRE3 = RPREFETCH * SIZE, AO3 (p8) FMA f106 = f8, f80, f106 mov ar.ec= 2 } ;; adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 (p8) FMA f106 = f9, f81, f106 shr I = MM, 3 ;; { .mmf cmp.eq p6, p0 = 0, I cmp.eq p16, p0 = r0, r0 (p8) FMA f106 = f10, f82, f106 } ;; { .mfi adds I = -1, I (p8) FMA f106 = f11, f83, f106 tbit.nz p13, p0 = MM, 2 } ;; { .mib (p8) STFD [YST1] = f106, 1 * SIZE mov ar.lc = I (p6) br.cond.dpnt .L125 } ;; .align 16 .L122: { .mfi (p17) LDFD f64 = [AO4], 1 * SIZE (p17) FMA f101 = f8, f33, f101 (p16) tbit.nz.unc p14, p15 = I, 0 } { .mfi (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE (p17) FMA f104 = f8, f34, f104 } ;; { .mfi (p16) LDFPD f32, f33 = [AO1], 2 * SIZE (p17) FMA f107 = f8, f35, f107 (p16) adds I = -1, I } { .mfi (p14) PREFETCH [RPRE1], 16 * SIZE (p17) FMA f110 = f8, f36, f110 } ;; { .mfi (p16) LDFPD f34, f35 = [AO1], 2 * SIZE (p17) FMA f113 = f8, f37, f113 } { .mfi (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE (p17) FMA f116 = f8, f38, f116 } ;; { .mfi (p16) LDFPD f36, f37 = [AO1], 2 * SIZE (p17) FMA f119 = f8, f39, f119 } { .mfi (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE (p17) FMA f122 = f8, f40, f122 } ;; { .mfi (p16) LDFPD f38, f39 = [AO1], 2 * SIZE (p17) FMA f101 = f9, f41, f101 } { .mmf (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE (p16) LDFD f40 = [AO2], 1 * SIZE (p17) FMA f104 = f9, f42, f104 } ;; { .mmf (p16) LDFPD f41, f42 = [AO2], 2 * SIZE (p15) PREFETCH [RPRE2], 16 * SIZE (p17) FMA f107 = f9, f43, f107 } { .mfi (p18) STFD [YST1] = f16, 1 * SIZE (p17) FMA f110 = f9, f44, f110 } ;; { .mfi (p16) LDFPD f43, f44 = [AO2], 2 * SIZE (p17) FMA f113 = f9, f45, f113 } { .mfi (p18) STFD [YST1] = f17, 1 * SIZE (p17) FMA f116 = f9, f46, f116 } ;; { .mfi (p16) LDFPD f45, f46 = [AO2], 2 * SIZE (p17) FMA f119 = f9, f47, f119 } { .mfi (p18) STFD [YST1] = f18, 1 * SIZE (p17) FMA f122 = f9, f48, f122 } ;; { .mfi (p16) LDFD f47 = [AO2], 1 * SIZE (p17) FMA f101 = f10, f49, f101 } { .mfi (p14) lfetch.excl.nt2 [PREB], 16 * SIZE (p17) FMA f104 = f10, f50, f104 } ;; { .mfi (p16) LDFPD f48, f49 = [AO3], 2 * SIZE (p17) FMA f107 = f10, f51, f107 } { .mfi (p14) PREFETCH [RPRE3], 16 * SIZE (p17) FMA f110 = f10, f52, f110 } ;; { .mfi (p16) LDFPD f50, f51 = [AO3], 2 * SIZE (p17) FMA f113 = f10, f53, f113 } { .mfi (p18) STFD [YST1] = f19, 1 * SIZE (p17) FMA f116 = f10, f54, f116 } ;; { .mfi (p16) LDFPD f52, f53 = [AO3], 2 * SIZE (p17) FMA f119 = f10, f55, f119 } { .mfi (p18) STFD [YST1] = f20, 1 * SIZE (p17) FMA f122 = f10, f56, f122 } ;; { .mfi (p16) LDFPD f54, f55 = [AO3], 2 * SIZE (p17) FMA f16 = f11, f57, f101 } { .mmf (p15) PREFETCH [RPRE4], 16 * SIZE (p16) LDFD f56 = [AO4], 1 * SIZE (p17) FMA f17 = f11, f58, f104 } ;; { .mfi (p16) LDFPD f57, f58 = [AO4], 2 * SIZE (p17) FMA f18 = f11, f59, f107 } { .mfi (p18) STFD [YST1] = f21, 1 * SIZE (p17) FMA f19 = f11, f60, f110 } ;; { .mfi (p16) LDFPD f59, f60 = [AO4], 2 * SIZE (p17) FMA f20 = f11, f61, f113 } { .mfi (p18) STFD [YST1] = f22, 1 * SIZE (p17) FMA f21 = f11, f62, f116 } ;; { .mfi (p16) LDFPD f61, f62 = [AO4], 2 * SIZE (p17) FMA f22 = f11, f63, f119 } { .mfb (p18) STFD [YST1] = f23, 1 * SIZE (p17) FMA f23 = f11, f64, f122 br.ctop.sptk.few .L122 } ;; .align 16 .L125: { .mmi (p13) LDFPD f32, f33 = [AO1], 2 * SIZE (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE tbit.nz p14, p0 = MM, 1 } { 
.mmi (p18) STFD [YST1] = f16, 1 * SIZE } ;; { .mmi (p13) LDFPD f48, f49 = [AO1], 2 * SIZE (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE tbit.nz p15, p0 = MM, 0 } { .mmi (p18) STFD [YST1] = f17, 1 * SIZE } ;; { .mmi (p14) LDFPD f64, f65 = [AO1], 2 * SIZE (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE } { .mmi (p18) STFD [YST1] = f18, 1 * SIZE } ;; { .mmi (p18) STFD [YST1] = f19, 1 * SIZE (p15) LDFD f80 = [AO1] } { .mmi (p15) LDFD f106 = [YLD1], 1 * SIZE (p13) LDFD f34 = [AO2], 1 * SIZE } ;; { .mmi (p13) LDFPD f35, f50 = [AO2], 2 * SIZE (p13) LDFPD f36, f37 = [AO3], 2 * SIZE } { .mmi (p18) STFD [YST1] = f20, 1 * SIZE } ;; { .mmi (p13) LDFD f51 = [AO2], 1 * SIZE (p13) LDFPD f52, f53 = [AO3], 2 * SIZE } { .mmi (p18) STFD [YST1] = f21, 1 * SIZE } ;; { .mmi (p14) LDFD f66 = [AO2], 1 * SIZE (p14) LDFPD f68, f69 = [AO3], 2 * SIZE } { .mmi (p18) STFD [YST1] = f22, 1 * SIZE } ;; { .mmf (p18) STFD [YST1] = f23, 1 * SIZE (p14) LDFD f67 = [AO2], 1 * SIZE (p13) FMA f100 = f8, f32, f100 } { .mmf (p15) LDFD f82 = [AO3] (p13) LDFD f38 = [AO4], 1 * SIZE (p13) FMA f101 = f8, f33, f101 } ;; ;; { .mmf (p13) LDFPD f39, f54 = [AO4], 2 * SIZE (p15) LDFD f81 = [AO2] (p13) FMA f102 = f8, f48, f102 } { .mfi (p13) FMA f103 = f8, f49, f103 } ;; { .mfi (p13) LDFD f55 = [AO4], 1 * SIZE (p14) FMA f104 = f8, f64, f104 } { .mfi (p14) FMA f105 = f8, f65, f105 } ;; { .mfi (p14) LDFD f70 = [AO4], 1 * SIZE (p15) FMA f106 = f8, f80, f106 } { .mfi (p13) FMA f100 = f9, f34, f100 } ;; { .mfi (p14) LDFD f71 = [AO4], 1 * SIZE (p13) FMA f101 = f9, f35, f101 } { .mfi (p13) FMA f102 = f9, f50, f102 } ;; (p15) LDFD f83 = [AO4] (p13) FMA f103 = f9, f51, f103 (p14) FMA f104 = f9, f66, f104 (p14) FMA f105 = f9, f67, f105 (p15) FMA f106 = f9, f81, f106 ;; (p13) FMA f100 = f10, f36, f100 (p13) FMA f101 = f10, f37, f101 (p13) FMA f102 = f10, f52, f102 (p13) FMA f103 = f10, f53, f103 (p14) FMA f104 = f10, f68, f104 (p14) FMA f105 = f10, f69, f105 (p15) FMA f106 = f10, f82, f106 ;; (p13) FMA f100 = f11, f38, f100 (p13) FMA f101 = f11, f39, f101 ;; (p13) FMA f102 = f11, f54, f102 (p13) STFD [YST1] = f100, 1 * SIZE (p13) FMA f103 = f11, f55, f103 ;; (p13) STFD [YST1] = f101, 1 * SIZE (p14) FMA f104 = f11, f70, f104 ;; (p13) STFD [YST1] = f102, 1 * SIZE (p14) FMA f105 = f11, f71, f105 ;; (p13) STFD [YST1] = f103, 1 * SIZE (p15) FMA f106 = f11, f83, f106 ;; (p14) STFD [YST1] = f104, 1 * SIZE ;; (p14) STFD [YST1] = f105, 1 * SIZE ;; (p15) STFD [YST1] = f106, 1 * SIZE ;; .align 16 .L130: { .mmi mov YLD1 = YY mov YST1 = YY tbit.z p6, p0 = N, 1 } ;; { .mib mov AO1 = A mov pr.rot= 0 (p6) br.cond.dpnt .L140 } ;; { .mmi LDFD f8 = [X], INCX (p8) LDFD f106 = [YLD1], 1 * SIZE add AO2 = LDA, A } ;; { .mmi LDFD f9 = [X], INCX (p8) LDFD f80 = [AO1], 1 * SIZE shladd A = LDA, 1, A } ;; adds PREB = RPREFETCH * SIZE, YLD1 FMPY f8 = ALPHA, f8 mov ar.ec= 2 adds RPRE1 = RPREFETCH * SIZE, AO1 FMPY f9 = ALPHA, f9 shr I = MM, 3 ;; (p8) LDFD f81 = [AO2], 1 * SIZE cmp.eq p6, p0 = 0, I ;; (p8) FMA f106 = f8, f80, f106 adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 tbit.nz p13, p0 = MM, 2 ;; (p8) FMA f106 = f9, f81, f106 cmp.eq p16, p0 = r0, r0 adds I = -1, I ;; { .mib (p8) STFD [YST1] = f106, 1 * SIZE mov ar.lc = I (p6) br.cond.dpnt .L135 } ;; .align 16 .L132: { .mfi (p17) LDFD f48 = [AO2], 1 * SIZE (p17) FMA f101 = f8, f33, f101 (p16) tbit.nz.unc p14, p15 = I, 0 } { .mmf (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE (p18) STFD [YST1] = f16, 1 * SIZE (p17) FMA f104 = f8, f34, f104 } ;; { .mfi (p16) LDFPD f32, f33 = [AO1], 2 * SIZE (p17) FMA f107 = f8, f35, f107 adds I = -1, I } { .mmf (p14) 
PREFETCH [RPRE1], 16 * SIZE (p18) STFD [YST1] = f17, 1 * SIZE (p17) FMA f110 = f8, f36, f110 } ;; { .mfi (p16) LDFPD f34, f35 = [AO1], 2 * SIZE (p17) FMA f113 = f8, f37, f113 } { .mmf (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE (p18) STFD [YST1] = f18, 1 * SIZE (p17) FMA f116 = f8, f38, f116 } ;; { .mfi (p16) LDFPD f36, f37 = [AO1], 2 * SIZE (p17) FMA f119 = f8, f39, f119 } { .mmf (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE (p18) STFD [YST1] = f19, 1 * SIZE (p17) FMA f122 = f8, f40, f122 } ;; { .mmf (p16) LDFPD f38, f39 = [AO1], 2 * SIZE (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE (p17) FMA f16 = f9, f41, f101 } { .mmf (p18) STFD [YST1] = f20, 1 * SIZE (p16) LDFD f40 = [AO2], 1 * SIZE (p17) FMA f17 = f9, f42, f104 } ;; { .mfi (p16) LDFPD f41, f42 = [AO2], 2 * SIZE (p17) FMA f18 = f9, f43, f107 } { .mmf (p15) PREFETCH [RPRE2], 16 * SIZE (p18) STFD [YST1] = f21, 1 * SIZE (p17) FMA f19 = f9, f44, f110 } ;; { .mfi (p16) LDFPD f43, f44 = [AO2], 2 * SIZE (p17) FMA f20 = f9, f45, f113 } { .mmf (p14) PREFETCH [PREB], 16 * SIZE (p18) STFD [YST1] = f22, 1 * SIZE (p17) FMA f21 = f9, f46, f116 } ;; { .mfi (p16) LDFPD f45, f46 = [AO2], 2 * SIZE (p17) FMA f22 = f9, f47, f119 } { .mfb (p18) STFD [YST1] = f23, 1 * SIZE (p17) FMA f23 = f9, f48, f122 br.ctop.sptk.few .L132 } ;; .align 16 .L135: { .mmi (p13) LDFPD f32, f33 = [AO1], 2 * SIZE (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE tbit.nz p14, p0 = MM, 1 } { .mmi (p18) STFD [YST1] = f16, 1 * SIZE } ;; { .mmi (p13) LDFPD f48, f49 = [AO1], 2 * SIZE (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE tbit.nz p15, p0 = MM, 0 } { .mmi (p18) STFD [YST1] = f17, 1 * SIZE } ;; { .mmi (p14) LDFPD f64, f65 = [AO1], 2 * SIZE (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE } { .mmi (p18) STFD [YST1] = f18, 1 * SIZE } ;; { .mmi (p15) LDFD f80 = [AO1] (p15) LDFD f106 = [YLD1], 1 * SIZE } { .mmi (p18) STFD [YST1] = f19, 1 * SIZE } ;; { .mmi (p13) LDFD f34 = [AO2], 1 * SIZE (p18) STFD [YST1] = f20, 1 * SIZE } ;; { .mmi (p13) LDFD f35 = [AO2], 1 * SIZE (p18) STFD [YST1] = f21, 1 * SIZE } ;; { .mmi (p13) LDFD f50 = [AO2], 1 * SIZE (p18) STFD [YST1] = f22, 1 * SIZE } ;; { .mmi (p13) LDFD f51 = [AO2], 1 * SIZE (p18) STFD [YST1] = f23, 1 * SIZE } ;; (p14) LDFD f66 = [AO2], 1 * SIZE (p13) FMA f100 = f8, f32, f100 ;; (p14) LDFD f67 = [AO2], 1 * SIZE (p13) FMA f101 = f8, f33, f101 ;; (p15) LDFD f81 = [AO2] (p13) FMA f102 = f8, f48, f102 (p13) FMA f103 = f8, f49, f103 (p14) FMA f104 = f8, f64, f104 (p14) FMA f105 = f8, f65, f105 (p15) FMA f106 = f8, f80, f106 ;; (p13) FMA f100 = f9, f34, f100 (p13) FMA f101 = f9, f35, f101 (p13) FMA f102 = f9, f50, f102 (p13) FMA f103 = f9, f51, f103 (p14) FMA f104 = f9, f66, f104 (p14) FMA f105 = f9, f67, f105 (p15) FMA f106 = f9, f81, f106 ;; (p13) STFD [YST1] = f100, 1 * SIZE ;; (p13) STFD [YST1] = f101, 1 * SIZE ;; (p13) STFD [YST1] = f102, 1 * SIZE ;; (p13) STFD [YST1] = f103, 1 * SIZE ;; (p14) STFD [YST1] = f104, 1 * SIZE ;; (p14) STFD [YST1] = f105, 1 * SIZE ;; (p15) STFD [YST1] = f106, 1 * SIZE ;; .align 16 .L140: { .mmi mov YLD1 = YY mov YST1 = YY tbit.z p6, p0 = N, 0 } ;; { .mib mov AO1 = A mov pr.rot= 0 (p6) br.cond.dpnt .L990 } ;; { .mmi LDFD f8 = [X], INCX (p8) LDFD f106 = [YLD1], 1 * SIZE adds RPRE1 = RPREFETCH * SIZE, AO1 } ;; { .mmi (p8) LDFD f80 = [AO1], 1 * SIZE adds PREB = RPREFETCH * SIZE, YLD1 } ;; FMPY f8 = ALPHA, f8 shr I = MM, 3 ;; (p8) FMA f106 = f8, f80, f106 mov ar.ec= 3 ;; { .mmi cmp.eq p6, p0 = 0, I cmp.eq p16, p0 = r0, r0 tbit.nz p14, p15 = r0, 0 } ;; { .mmi adds YST2 = 4 * SIZE, YST1 adds I = -1, I tbit.nz p13, p0 = MM, 2 } ;; { .mmi (p8) 
STFD [YST1] = f106, 1 * SIZE (p8) adds YST2 = 1 * SIZE, YST2 } { .mib mov ar.lc = I (p6) br.cond.dpnt .L145 } ;; .align 16 .L142: { .mmf (p19) STFD [YST1] = f16, 1 * SIZE (p19) STFD [YST2] = f20, 1 * SIZE (p18) FMA f16 = f8, f34, f102 } { .mmf (p16) LDFPD f32, f35 = [AO1], 2 * SIZE (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE (p18) FMA f20 = f8, f46, f114 } ;; { .mmf (p19) STFD [YST1] = f17, 1 * SIZE (p19) STFD [YST2] = f21, 1 * SIZE (p18) FMA f17 = f8, f37, f105 } { .mmf (p16) LDFPD f38, f41 = [AO1], 2 * SIZE (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE (p18) FMA f21 = f8, f49, f117 } ;; { .mmf (p19) STFD [YST1] = f18, 1 * SIZE (p19) STFD [YST2] = f22, 1 * SIZE (p18) FMA f18 = f8, f40, f108 } { .mmf (p16) LDFPD f44, f47 = [AO1], 2 * SIZE (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE (p18) FMA f22 = f8, f52, f120 } ;; { .mmf (p19) STFD [YST1] = f19, 5 * SIZE (p19) STFD [YST2] = f23, 5 * SIZE (p18) FMA f19 = f8, f43, f111 } { .mmf (p16) LDFPD f50, f53 = [AO1], 2 * SIZE (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE (p18) FMA f23 = f8, f55, f123 } ;; { .mmi (p14) PREFETCH [RPRE1], 16 * SIZE (p14) PREFETCH [PREB], 16 * SIZE (p16) tbit.nz.unc p14, p15 = I, 0 } { .mib nop __LINE__ (p16) adds I = -1, I br.ctop.sptk.few .L142 } ;; .align 16 .L145: { .mmi (p19) STFD [YST1] = f16, 1 * SIZE (p19) STFD [YST2] = f20, 1 * SIZE tbit.nz p14, p0 = MM, 1 } { .mmi (p13) LDFPD f32, f33 = [AO1], 2 * SIZE (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE } ;; { .mmi (p19) STFD [YST1] = f17, 1 * SIZE (p19) STFD [YST2] = f21, 1 * SIZE tbit.nz p15, p0 = MM, 0 } { .mmi (p13) LDFPD f48, f49 = [AO1], 2 * SIZE (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE } ;; { .mmi (p19) STFD [YST1] = f18, 1 * SIZE (p19) STFD [YST2] = f22, 1 * SIZE } { .mmi (p14) LDFPD f64, f65 = [AO1], 2 * SIZE (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE } ;; { .mmi (p19) STFD [YST1] = f19, 5 * SIZE (p19) STFD [YST2] = f23, 5 * SIZE } { .mmi (p15) LDFD f80 = [AO1] (p15) LDFD f106 = [YLD1], 1 * SIZE } ;; (p13) FMA f100 = f8, f32, f100 (p13) FMA f101 = f8, f33, f101 (p13) FMA f102 = f8, f48, f102 (p13) FMA f103 = f8, f49, f103 (p14) FMA f104 = f8, f64, f104 (p14) FMA f105 = f8, f65, f105 (p15) FMA f106 = f8, f80, f106 ;; (p13) STFD [YST1] = f100, 1 * SIZE ;; (p13) STFD [YST1] = f101, 1 * SIZE ;; (p13) STFD [YST1] = f102, 1 * SIZE ;; (p13) STFD [YST1] = f103, 1 * SIZE ;; (p14) STFD [YST1] = f104, 1 * SIZE ;; (p14) STFD [YST1] = f105, 1 * SIZE ;; (p15) STFD [YST1] = f106, 1 * SIZE ;; .align 16 .L990: { .mmi mov YLD1 = YY mov YST1 = Y mov pr.rot= 0 } { .mib mov YST2 = Y shr J = M, 3 (p10) br.cond.dptk .L999 } ;; { .mmi cmp.eq p6, p0 = r0, J adds J = -1, J mov ar.ec = 4 } { .mmi cmp.eq p16, p0 = r0, r0 nop __LINE__ tbit.nz p13, p0 = M, 2 } ;; { .mib nop __LINE__ mov ar.lc = J (p6) br.cond.dpnt .L995 } ;; .L992: { .mfi (p19) STFD [YST2] = f35 (p18) FADD f34 = f34, f66 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f64 = [YLD1], 1 * SIZE (p16) LDFD f32 = [YST1], INCY } ;; { .mfi (p19) STFD [YST2] = f39 (p18) FADD f38 = f38, f70 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f36 = [YST1], INCY (p16) LDFD f68 = [YLD1], 1 * SIZE } ;; { .mfi (p19) STFD [YST2] = f43 (p18) FADD f42 = f42, f74 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f72 = [YLD1], 1 * SIZE (p16) LDFD f40 = [YST1], INCY } ;; { .mfi (p19) STFD [YST2] = f47 (p18) FADD f46 = f46, f78 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f76 = [YLD1], 1 * SIZE (p16) LDFD f44 = [YST1], INCY } ;; { .mfi (p19) STFD [YST2] = f51 (p18) FADD f50 = f50, f82 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f80 = [YLD1], 1 * SIZE (p16) 
LDFD f48 = [YST1], INCY } ;; { .mfi (p19) STFD [YST2] = f55 (p18) FADD f54 = f54, f86 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f84 = [YLD1], 1 * SIZE (p16) LDFD f52 = [YST1], INCY } ;; { .mfi (p19) STFD [YST2] = f59 (p18) FADD f58 = f58, f90 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f88 = [YLD1], 1 * SIZE (p16) LDFD f56 = [YST1], INCY } ;; { .mfi (p19) STFD [YST2] = f63 (p18) FADD f62 = f62, f94 (p19) add YST2 = YST2, INCY } { .mmb (p16) LDFD f92 = [YLD1], 1 * SIZE (p16) LDFD f60 = [YST1], INCY br.ctop.sptk.few .L992 } ;; .L995: (p13) LDFD f32 = [YST1], INCY (p13) LDFD f40 = [YLD1], 1 * SIZE tbit.nz p14, p0 = M, 1 ;; (p13) LDFD f33 = [YST1], INCY (p13) LDFD f41 = [YLD1], 1 * SIZE tbit.nz p15, p0 = M, 0 ;; (p13) LDFD f34 = [YST1], INCY (p13) LDFD f42 = [YLD1], 1 * SIZE ;; (p13) LDFD f35 = [YST1], INCY (p13) LDFD f43 = [YLD1], 1 * SIZE ;; (p14) LDFD f36 = [YST1], INCY (p14) LDFD f44 = [YLD1], 1 * SIZE ;; (p14) LDFD f37 = [YST1], INCY (p14) LDFD f45 = [YLD1], 1 * SIZE ;; (p15) LDFD f38 = [YST1], INCY (p15) LDFD f46 = [YLD1], 1 * SIZE ;; (p13) FADD f32 = f32, f40 (p13) FADD f33 = f33, f41 (p13) FADD f34 = f34, f42 (p13) FADD f35 = f35, f43 (p14) FADD f36 = f36, f44 (p14) FADD f37 = f37, f45 (p15) FADD f38 = f38, f46 ;; (p13) STFD [YST2] = f32 (p13) add YST2 = YST2, INCY ;; (p13) STFD [YST2] = f33 (p13) add YST2 = YST2, INCY ;; (p13) STFD [YST2] = f34 (p13) add YST2 = YST2, INCY ;; (p13) STFD [YST2] = f35 (p13) add YST2 = YST2, INCY ;; (p14) STFD [YST2] = f36 (p14) add YST2 = YST2, INCY ;; (p14) STFD [YST2] = f37 (p14) add YST2 = YST2, INCY ;; (p15) STFD [YST2] = f38 ;; .L999: mov r8 = r0 adds r9 = 1 * 16, SP ;; ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 mov ar.lc = ARLC ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 mov pr = PR, -1 ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 mov ar.pfs = ARPFS ;; ldf.fill f22 = [SP], 32 ldf.fill f23 = [r9] br.ret.sptk.many b0 ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/staticbuffer.S000066400000000000000000000053551313527062700200040ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef ALLOC_STATIC .align 1024 .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 4096 #endif OpenBLAS-0.2.20/kernel/ia64/swap.S000066400000000000000000000255011313527062700162700ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef XDOUBLE #define PREFETCH_SIZE ( 8 * 16) #elif defined(DOUBLE) #define PREFETCH_SIZE (16 * 16) #else #define PREFETCH_SIZE (32 * 16) #endif #define SP r12 #ifndef XDOUBLE #define N r32 #define X1 r36 #define INCX r37 #define Y1 r38 #define INCY r39 #else #define N r32 #define X1 r38 #define INCX r39 #define Y1 r33 #define INCY r34 #endif #define PRE1 r2 #define PRE2 r3 #define I r14 #define J r15 #define X2 r16 #define Y2 r17 #define X3 r18 #define Y3 r19 #define X4 r20 #define Y4 r21 #define YY r22 #define XX r23 #define INCX5 r24 #define INCY5 r25 #define INCX16 r26 #define INCY16 r27 #define XYSUB r28 #define PR r30 #define ARLC r31 PROLOGUE .prologue PROFCODE #ifdef XDOUBLE adds r8 = 16, SP adds r9 = 24, SP ;; ld8 Y1 = [r8] ld8 INCY = [r9] ;; #endif { .mmi shladd INCX = INCX, BASE_SHIFT, r0 shladd INCY = INCY, BASE_SHIFT, r0 .save ar.lc, ARLC mov ARLC = ar.lc } { .mib cmp.lt p0, p6 = r0, N tbit.z p0, p8 = Y1, BASE_SHIFT (p6) br.ret.sptk.many b0 } ;; .body { .mmi shladd INCX16 = INCX, 4, r0 shladd INCY16 = INCY, 4, r0 mov PR = pr } { .mmi sub XYSUB = X1, Y1 mov X3 = X1 shr I = N, 4 } ;; { .mmi shladd INCX5 = INCX, 2, INCX shladd INCY5 = INCY, 2, INCY mov pr.rot= 0 } { .mmi adds I = -1, I and J = 15, N extr XYSUB = XYSUB, BASE_SHIFT, 6 } ;; { .mmi shladd X2 = INCX, 2, X1 shladd Y2 = INCY, 2, Y1 mov ar.lc = I } { .mmi shladd X4 = INCX, 2, X1 shladd Y4 = INCY, 2, Y1 cmp.eq p16, p0 = r0, r0 } ;; { .mmi shladd PRE2 = XYSUB, BASE_SHIFT, Y1 cmp.lt p8 ,p0 = 28, XYSUB mov Y3 = Y1 } ;; { .mmi adds PRE1 = (PREFETCH_SIZE + 4) * SIZE, X1 adds PRE2 = (PREFETCH_SIZE - 12) * SIZE, PRE2 mov ar.ec= 2 } { .mib cmp.eq p9 ,p0 = -1, I tbit.z p0, p12 = N, 3 (p9) br.cond.dpnt .L15 } ;; .align 16 .L12: { .mmi (p18) STFD [Y3] = f56 (p18) STFD [Y4] = f64 (p18) add Y3 = Y3, INCY5 } { .mmi (p16) LDFD f32 = [X1], INCX (p16) LDFD f40 = [X2], INCX (p18) add Y4 = Y4, INCY5 } ;; { .mmi (p17) STFD [X3] = f65 (p17) STFD [X4] = f73 (p17) add X3 = X3, INCX } { .mmi (p16) LDFD f64 = [Y1], INCY (p16) LDFD f72 = [Y2], INCY (p17) add X4 = X4, INCX } ;; { .mmi (p17) STFD [Y3] = f33 (p17) STFD [Y4] = f41 (p17) add Y3 = Y3, INCY } { .mmi (p16) LDFD f34 = [X1], INCX (p16) LDFD f42 = [X2], INCX (p17) add Y4 = Y4, INCY } ;; { .mmi (p17) STFD [X3] = f67 (p17) STFD [X4] = f75 (p17) add X3 = X3, INCX } { .mmi (p16) LDFD f66 = [Y1], INCY (p16) LDFD f74 = [Y2], INCY (p17) add X4 = X4, INCX } ;; { .mmi (p17) STFD [Y3] = f35 (p17) STFD [Y4] = f43 (p17) add Y3 = Y3, INCY } { .mmi (p16) LDFD f36 = [X1], INCX (p16) LDFD f44 = [X2], INCX (p17) add Y4 = Y4, INCY } ;; { .mmi (p17) STFD [X3] = f69 (p17) STFD [X4] = f77 (p17) add X3 = X3, INCX } { .mmi (p16) LDFD f68 = [Y1], INCY (p16) LDFD f76 = [Y2], INCY (p17) add X4 = X4, INCX } ;; { .mmi (p17) STFD [Y3] = f37 (p17) STFD [Y4] = f45 (p17) add Y3 = Y3, INCY } { .mmi (p16) LDFD f38 = [X1], INCX5 (p16) LDFD f46 = [X2], INCX5 (p17) add Y4 = Y4, INCY } ;; { .mmi (p17) STFD [X3] = f71 (p17) STFD [X4] = f79 (p17) add X3 = X3, INCX5 } { .mmi (p16) LDFD f70 = [Y1], INCY5 (p16) LDFD f78 = [Y2], INCY5 (p17) add X4 = X4, INCX5 } ;; { .mmi (p17) STFD [Y3] = f39 (p17) STFD [Y4] = f47 (p17) add Y3 = Y3, INCY5 } { .mmi (p16) LDFD f48 = [X1], INCX (p16) LDFD f56 = [X2], INCX (p17) add Y4 = Y4, INCY5 } ;; { .mmi (p17) STFD [X3] = f81 (p17) STFD [X4] = f89 (p17) add X3 = X3, INCX } { .mmi (p16) LDFD f80 = [Y1], INCY (p16) LDFD f88 = [Y2], INCY (p17) add X4 = X4, INCX } ;; { .mmi (p17) STFD [Y3] = 
f49 (p17) STFD [Y4] = f57 (p17) add Y3 = Y3, INCY } { .mmi (p16) LDFD f50 = [X1], INCX (p16) LDFD f58 = [X2], INCX (p17) add Y4 = Y4, INCY } ;; { .mmi (p17) STFD [X3] = f83 (p17) STFD [X4] = f91 (p17) add X3 = X3, INCX } { .mmi (p16) LDFD f82 = [Y1], INCY (p16) LDFD f90 = [Y2], INCY (p17) add X4 = X4, INCX } ;; { .mmi (p17) STFD [Y3] = f51 (p17) STFD [Y4] = f59 (p17) add Y3 = Y3, INCY } { .mmi (p16) LDFD f52 = [X1], INCX (p16) LDFD f60 = [X2], INCX (p17) add Y4 = Y4, INCY } ;; { .mmi (p17) STFD [X3] = f85 (p17) STFD [X4] = f93 (p17) add X3 = X3, INCX } { .mmi (p16) LDFD f84 = [Y1], INCY (p16) LDFD f92 = [Y2], INCY (p17) add X4 = X4, INCX } ;; { .mmi (p16) lfetch.nt1 [PRE1] (p16) lfetch.nt1 [PRE2] (p16) shladd PRE1 = INCX, 4, PRE1 } { .mmi (p16) LDFD f54 = [X1], INCX5 (p16) LDFD f62 = [X2], INCX5 (p16) shladd PRE2 = INCX, 4, PRE2 } ;; { .mmi (p17) STFD [Y3] = f53 (p17) STFD [Y4] = f61 (p17) add Y3 = Y3, INCY } { .mmi (p16) LDFD f86 = [Y1], INCY5 (p16) LDFD f94 = [Y2], INCY5 (p17) add Y4 = Y4, INCY } ;; { .mmi (p17) STFD [X3] = f87 (p17) STFD [X4] = f95 (p17) add X3 = X3, INCX5 } { .mib nop __LINE__ (p17) add X4 = X4, INCX5 br.ctop.sptk.few .L12 } ;; .L15: { .mmi (p18) STFD [Y3] = f56 (p18) STFD [Y4] = f64 mov ar.lc = ARLC } { .mmi (p12) LDFD f32 = [X1], INCX (p12) LDFD f36 = [X2], INCX cmp.eq p10, p0 = r0, J } ;; { .mmi (p12) LDFD f80 = [Y1], INCY (p12) LDFD f84 = [Y2], INCY (p18) add Y3 = Y3, INCY5 } { .mmi (p12) LDFD f33 = [X1], INCX (p12) LDFD f37 = [X2], INCX (p18) add Y4 = Y4, INCY5 } ;; { .mmi (p12) LDFD f81 = [Y1], INCY (p12) LDFD f85 = [Y2], INCY mov pr = PR, -65474 } { .mmb (p12) LDFD f34 = [X1], INCX (p12) LDFD f38 = [X2], INCX (p10) br.ret.sptk.many b0 } ;; { .mmi (p12) LDFD f82 = [Y1], INCY (p12) LDFD f86 = [Y2], INCY tbit.z p0, p13 = N, 2 } { .mmi (p12) LDFD f35 = [X1], INCX5 (p12) LDFD f39 = [X2], INCX5 tbit.z p0, p14 = N, 1 } ;; { .mmi (p12) LDFD f83 = [Y1], INCY5 (p12) LDFD f87 = [Y2], INCY5 tbit.z p0, p15 = N, 0 } ;; { .mmi (p13) LDFD f40 = [X1], INCX (p13) LDFD f88 = [Y1], INCY } ;; { .mmi (p13) LDFD f41 = [X1], INCX (p13) LDFD f89 = [Y1], INCY } ;; { .mmi (p12) STFD [Y3] = f32 (p12) STFD [Y4] = f36 (p12) add Y3 = Y3, INCY } { .mmi (p13) LDFD f42 = [X1], INCX (p13) LDFD f90 = [Y1], INCY (p12) add Y4 = Y4, INCY } ;; { .mmi (p12) STFD [X3] = f80 (p12) STFD [X4] = f84 (p12) add X3 = X3, INCX } { .mmi (p13) LDFD f43 = [X1], INCX (p13) LDFD f91 = [Y1], INCY (p12) add X4 = X4, INCX } ;; { .mmi (p12) STFD [Y3] = f33 (p12) STFD [Y4] = f37 (p12) add Y3 = Y3, INCY } { .mmi (p14) LDFD f44 = [X1], INCX (p14) LDFD f92 = [Y1], INCY (p12) add Y4 = Y4, INCY } ;; { .mmi (p12) STFD [X3] = f81 (p12) STFD [X4] = f85 (p12) add X3 = X3, INCX } { .mmi (p14) LDFD f45 = [X1], INCX (p14) LDFD f93 = [Y1], INCY (p12) add X4 = X4, INCX } ;; { .mmi (p12) STFD [X3] = f82 (p12) STFD [X4] = f86 (p12) add X3 = X3, INCX } { .mmi (p15) LDFD f46 = [X1], INCX (p15) LDFD f94 = [Y1], INCY (p12) add X4 = X4, INCX } ;; { .mmi (p12) STFD [Y3] = f34 (p12) STFD [Y4] = f38 (p12) add Y3 = Y3, INCY } { .mmi nop __LINE__ nop __LINE__ (p12) add Y4 = Y4, INCY } ;; { .mmi (p12) STFD [X3] = f83 (p12) STFD [X4] = f87 (p12) add X3 = X3, INCX5 } { .mmi nop __LINE__ nop __LINE__ (p12) add X4 = X4, INCX5 } ;; { .mmi (p12) STFD [Y3] = f35 (p12) STFD [Y4] = f39 (p12) add Y3 = Y3, INCY5 } { .mmi nop __LINE__ nop __LINE__ (p12) add Y4 = Y4, INCY5 } ;; { .mmi (p13) STFD [X3] = f88 (p13) STFD [Y3] = f40 (p13) add X3 = X3, INCX } { .mmi nop __LINE__ nop __LINE__ (p13) add Y3 = Y3, INCY } ;; { .mmi (p13) STFD [X3] = f89 (p13) STFD [Y3] = 
f41 (p13) add X3 = X3, INCX } { .mmi nop __LINE__ nop __LINE__ (p13) add Y3 = Y3, INCY } ;; { .mmi (p13) STFD [X3] = f90 (p13) STFD [Y3] = f42 (p13) add X3 = X3, INCX } { .mmi nop __LINE__ nop __LINE__ (p13) add Y3 = Y3, INCY } ;; { .mmi (p13) STFD [X3] = f91 (p13) STFD [Y3] = f43 (p13) add X3 = X3, INCX } { .mmi nop __LINE__ nop __LINE__ (p13) add Y3 = Y3, INCY } ;; { .mmi (p14) STFD [X3] = f92 (p14) STFD [Y3] = f44 (p14) add X3 = X3, INCX } { .mmi nop __LINE__ nop __LINE__ (p14) add Y3 = Y3, INCY } ;; { .mmi (p14) STFD [X3] = f93 (p14) STFD [Y3] = f45 (p14) add X3 = X3, INCX } { .mmi nop __LINE__ nop __LINE__ (p14) add Y3 = Y3, INCY } ;; { .mmb (p15) STFD [X3] = f94 (p15) STFD [Y3] = f46 br.ret.sptk.many b0 } ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/symv_U.S000066400000000000000000000235051313527062700166020ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define SP r12 #define M r32 #define A r34 #define LDA r35 #define X r36 #define INCX r37 #define Y r38 #define INCY r39 #define BUFFER r33 #define I r14 #define IS r15 #define A1 r16 #define A2 r17 #define A3 r18 #define A4 r19 #define NEW_X r20 #define NEW_Y r21 #define XX r22 #define YY r23 #define TEMP r24 #define YYS r25 #define PREA1 loc0 #define PREA2 loc1 #define PREA3 loc2 #define PREA4 loc3 #define A11 loc4 #define A21 loc5 #define A31 loc6 #define A41 loc7 #define PREX r8 #define PREY r9 #define ARLC r29 #define PR r30 #define ARPFS r31 #ifdef DOUBLE #define RPREFETCH (16 * 3 + 4) #else #define RPREFETCH (16 * 3 + 16) #endif #define PREFETCH lfetch.nt1 #define PREFETCHW lfetch.excl.nt1 #define alpha f8 #define atemp1 f6 #define atemp2 f7 #define atemp3 f10 #define atemp4 f11 #define xsum1 f12 #define xsum2 f13 #define xsum3 f14 #define xsum4 f15 PROLOGUE .prologue PROFCODE { .mmi .save ar.pfs, ARPFS alloc ARPFS = ar.pfs, 8, 16, 8, 0 mov ARLC = ar.lc } ;; mov PR = pr adds r14 = 16, SP ;; adds r8 = -8 * 16, SP adds r9 = -7 * 16, SP adds SP = -8 * 16, SP ;; stf.spill [r8] = f16, 32 stf.spill [r9] = f17, 32 ;; stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 ;; stf.spill [r8] = f20, 32 stf.spill [r9] = f21, 32 ;; stf.spill [r8] = f22 stf.spill [r9] = f23 .body ;; ld8 BUFFER = [r14] ;; shladd LDA = LDA, BASE_SHIFT, r0 shladd INCX = INCX, BASE_SHIFT, r0 shladd INCY = INCY, BASE_SHIFT, r0 ;; cmp.ge p7, p0 = 0, M ;; (p7) br.cond.dpnt .L999 ;; mov NEW_X = X cmp.eq p10, p0 = SIZE, INCX (p10) br.cond.dptk .L10 ;; .L10: mov NEW_Y = Y cmp.eq p10, p0 = SIZE, INCY (p10) br.cond.dptk .L20 ;; .L20: mov IS = 0 cmp.gt p10, p0 = 4, M (p10) br.cond.dpnt .L30 ;; .L21: mov A1 = A add A2 = LDA, A ;; shladd A3 = LDA, 1, A shladd A4 = LDA, 1, A2 shladd A = LDA, 2, A ;; ;; adds PREX = RPREFETCH * SIZE, NEW_X adds PREY = RPREFETCH * SIZE, NEW_Y adds PREA1 = RPREFETCH * SIZE, A1 adds PREA2 = RPREFETCH * SIZE, A2 adds PREA3 = RPREFETCH * SIZE, A3 adds PREA4 = RPREFETCH * SIZE, A4 ;; shladd TEMP = IS, BASE_SHIFT, NEW_X ;; LDFD atemp1 = [TEMP], 1 * SIZE ;; LDFD atemp2 = [TEMP], 1 * SIZE ;; LDFD atemp3 = [TEMP], 1 * SIZE ;; LDFD atemp4 = [TEMP], 1 * SIZE ;; FMPY atemp1 = alpha, atemp1 FMPY atemp2 = alpha, atemp2 FMPY atemp3 = alpha, atemp3 FMPY atemp4 = alpha, atemp4 ;; mov xsum1 = f0 mov xsum2 = f0 mov xsum3 = f0 mov xsum4 = f0 ;; mov XX = NEW_X mov YY = NEW_Y mov YYS = NEW_Y ;; shr I = IS, 2 mov pr.rot = 0 ;; mov ar.ec = 3 cmp.eq p16, p0 = r0, r0 ;; cmp.eq p6, p0 = 0, I adds I = -1, I ;; mov ar.lc = I (p6) br.cond.dpnt .L28 ;; .align 16 .L22: { .mmf (p16) LDFPD f32, f35 = [A1], 2 * SIZE (p19) STFD [YYS] = f95, 1 * SIZE (p18) FMA xsum1 = f82, f34, xsum1 } { .mmf (p18) FMA f94 = atemp1, f34, f94 } ;; { .mmf (p17) LDFD f90 = [XX], 1 * SIZE (p18) FMA xsum2 = f82, f46, xsum2 } { .mmf (p18) FMA f98 = atemp1, f37, f98 } ;; { .mmf (p16) LDFPD f44, f47 = [A2], 2 * SIZE (p19) STFD [YYS] = f99, 1 * SIZE (p18) FMA xsum3 = f82, f58, xsum3 } { .mmf (p18) FMA f102 = atemp1, f40, f102 } ;; { .mmf (p16) PREFETCHW [PREY], 4 * SIZE (p16) LDFD f92 = [YY], 1 * SIZE (p18) FMA xsum4 = f82, f70, xsum4 } { .mmf (p18) FMA f106 = atemp1, f43, f106 } ;; { .mmf (p16) LDFPD f56, f59 = [A3], 2 * SIZE (p19) STFD [YYS] = f103, 1 * SIZE (p18) FMA xsum1 = f85, f37, xsum1 } { .mmf (p18) FMA f94 = atemp2, f46, f94 } ;; { .mmf (p16) LDFD f96 = [YY], 1 * SIZE (p18) FMA xsum2 = f85, f49, xsum2 } { .mmf (p18) FMA f98 = atemp2, f49, f98 } 
;; { .mmf (p16) LDFPD f68, f71 = [A4], 2 * SIZE (p19) STFD [YYS] = f107, 1 * SIZE (p18) FMA xsum3 = f85, f61, xsum3 } { .mmf (p18) FMA f102 = atemp2, f52, f102 } ;; { .mmf (p16) LDFD f100 = [YY], 1 * SIZE (p18) FMA xsum4 = f85, f73, xsum4 } { .mmf (p18) FMA f106 = atemp2, f55, f106 } ;; { .mmf (p16) PREFETCH [PREA1], 4 * SIZE (p16) LDFPD f38, f41 = [A1], 2 * SIZE (p18) FMA xsum1 = f88, f40, xsum1 } { .mmf (p18) FMA f94 = atemp3, f58, f94 } ;; { .mmf (p16) LDFD f104 = [YY], 1 * SIZE (p18) FMA xsum2 = f88, f52, xsum2 } { .mmf (p18) FMA f98 = atemp3, f61, f98 } ;; { .mmf (p16) PREFETCH [PREA2], 4 * SIZE (p16) LDFPD f50, f53 = [A2], 2 * SIZE (p18) FMA xsum3 = f88, f64, xsum3 } { .mmf (p18) FMA f102 = atemp3, f64, f102 } ;; { .mmf (p16) PREFETCH [PREX], 4 * SIZE (p16) LDFD f80 = [XX], 1 * SIZE (p18) FMA xsum4 = f88, f76, xsum4 } { .mmf (p18) FMA f106 = atemp3, f67, f106 } ;; { .mmf (p16) PREFETCH [PREA3], 4 * SIZE (p16) LDFPD f62, f65 = [A3], 2 * SIZE (p18) FMA xsum1 = f91, f43, xsum1 } { .mmf (p18) FMA f94 = atemp4, f70, f94 } ;; { .mmf (p16) LDFD f83 = [XX], 1 * SIZE (p18) FMA xsum2 = f91, f55, xsum2 } { .mmf (p18) FMA f98 = atemp4, f73, f98 } ;; { .mmf (p16) PREFETCH [PREA4], 4 * SIZE (p16) LDFPD f74, f77 = [A4], 2 * SIZE (p18) FMA xsum3 = f91, f67, xsum3 } { .mmf (p18) FMA f102 = atemp4, f76, f102 } ;; { .mmf (p16) LDFD f86 = [XX], 1 * SIZE (p18) FMA xsum4 = f91, f79, xsum4 } { .mfb (p18) FMA f106 = atemp4, f79, f106 br.ctop.sptk.few .L22 } ;; (p19) STFD [YYS] = f95, 1 * SIZE ;; (p19) STFD [YYS] = f99, 1 * SIZE ;; (p19) STFD [YYS] = f103, 1 * SIZE ;; (p19) STFD [YYS] = f107, 1 * SIZE ;; ;; .align 16 .L28: FMPY xsum1 = alpha, xsum1 FMPY xsum2 = alpha, xsum2 FMPY xsum3 = alpha, xsum3 FMPY xsum4 = alpha, xsum4 ;; LDFD f64 = [A1], 1 * SIZE LDFD f65 = [A2], 1 * SIZE LDFD f66 = [A3], 1 * SIZE LDFD f67 = [A4], 1 * SIZE ;; LDFD f68 = [A1], 1 * SIZE LDFD f69 = [A2], 1 * SIZE LDFD f70 = [A3], 1 * SIZE LDFD f71 = [A4], 1 * SIZE ;; LDFD f72 = [A1], 1 * SIZE LDFD f73 = [A2], 1 * SIZE LDFD f74 = [A3], 1 * SIZE LDFD f75 = [A4], 1 * SIZE ;; LDFD f76 = [A1], 1 * SIZE LDFD f77 = [A2], 1 * SIZE LDFD f78 = [A3], 1 * SIZE LDFD f79 = [A4], 1 * SIZE ;; FMA xsum1 = atemp1, f64, xsum1 FMA xsum2 = atemp1, f65, xsum2 FMA xsum3 = atemp1, f66, xsum3 FMA xsum4 = atemp1, f67, xsum4 ;; FMA xsum1 = atemp2, f65, xsum1 FMA xsum2 = atemp2, f69, xsum2 FMA xsum3 = atemp2, f70, xsum3 FMA xsum4 = atemp2, f71, xsum4 ;; FMA xsum1 = atemp3, f66, xsum1 FMA xsum2 = atemp3, f70, xsum2 FMA xsum3 = atemp3, f74, xsum3 FMA xsum4 = atemp3, f75, xsum4 ;; FMA xsum1 = atemp4, f67, xsum1 FMA xsum2 = atemp4, f71, xsum2 FMA xsum3 = atemp4, f75, xsum3 FMA xsum4 = atemp4, f79, xsum4 ;; LDFD f36 = [YY], 1 * SIZE ;; LDFD f37 = [YY], 1 * SIZE ;; LDFD f38 = [YY], 1 * SIZE ;; LDFD f39 = [YY], 1 * SIZE ;; FADD f36 = f36, xsum1 FADD f37 = f37, xsum2 FADD f38 = f38, xsum3 FADD f39 = f39, xsum4 ;; STFD [YYS] = f36, 1 * SIZE ;; STFD [YYS] = f37, 1 * SIZE ;; STFD [YYS] = f38, 1 * SIZE ;; STFD [YYS] = f39, 1 * SIZE ;; adds IS = 4, IS ;; adds TEMP = 4, IS ;; cmp.le p6, p0 = TEMP, M ;; (p6) br.cond.dpnt .L21 ;; .L30: .L990: .L999: mov r8 = r0 adds r9 = 1 * 16, SP ;; ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 mov ar.lc = ARLC ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 mov pr = PR, -1 ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 mov ar.pfs = ARPFS ;; ldf.fill f22 = [SP], 32 ldf.fill f23 = [r9] br.ret.sptk.many b0 ;; EPILOGUE 
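// NOTE (editor annotation, not part of the upstream OpenBLAS source): in the symv_U kernel
// above, atemp1..atemp4 hold alpha * x[IS..IS+3], and the pipelined .L22 loop streams the
// rows above the current 4x4 diagonal block, accumulating xsum1..xsum4 (the A^T * x terms,
// multiplied by alpha only afterwards at .L28) while updating the buffered y values that are
// read through YY and written back through YYS.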
OpenBLAS-0.2.20/kernel/ia64/trsm_kernel_LN.S000066400000000000000000007135151313527062700202450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef DOUBLE #define PREFETCHSIZE (16 * 8) #else #define PREFETCHSIZE (32 * 8) #endif #define CPREFETCHSIZE -7 #define CPREFETCH lfetch.excl.nt1 #define M r32 #define N r33 #define K r34 #define A r36 #define B r37 #define C r38 #define LDC r39 #define I r15 #define J r16 #define AOFFSET r17 #define BOFFSET r18 #define TEMP r19 #define L r20 #define C1 r21 #define C2 r22 #define C3 r23 #define C4 r24 #define C5 r25 #define C6 r26 #define C7 r27 #define C8 r28 #define C9 loc0 #define C10 loc1 #define C11 loc2 #define C12 loc3 #define C13 loc4 #define C14 loc5 #define C15 loc6 #define C16 loc7 #define PREA r8 #define PREB r9 #define PREC r10 #define SP r12 #define ARLC r29 #define PR r30 #define ARPFS r31 #define ALPHA f8 #define AORIG loc8 #define KK loc9 #define KK8 loc10 #define OFFSET loc11 #define AOFFSET2 loc12 #define BOFFSET2 loc13 PROLOGUE .prologue PROFCODE { .mmi .save ar.pfs, ARPFS alloc ARPFS = ar.pfs, 8, 16, 0, 0 adds r14 = 16, SP mov ARLC = ar.lc } { .mmi adds r8 = -6 * 16, SP adds r9 = -5 * 16, SP adds SP = -6 * 16, SP } ;; { .mmi setf.sig f32 = M setf.sig f33 = K mov PR = pr } ;; { .mmi stf.spill [r8] = f16, 32 stf.spill [r9] = f17, 32 shr J = N, 3 } ;; { .mmi stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 shladd LDC = LDC, BASE_SHIFT, r0 } ;; { .mmi stf.spill [r8] = f20 stf.spill [r9] = f21 mov AOFFSET = A } ;; .body { .mmf ld8 OFFSET = [r14] cmp.ge p6, p0 = 0, J xmpy.l f32 = f32, f33 } ;; { .mmi getf.sig r2 = f32 shladd C = M, BASE_SHIFT, C nop __LINE__ } ;; { .mmb shladd A = r2, BASE_SHIFT, A nop __LINE__ (p6) br.cond.dpnt .L050 } ;; .align 8 .L000: { .mmf mov C1 = C add KK = M, OFFSET } { .mmi mov AORIG = A add C2 = LDC, C shladd C3 = LDC, 1, C } ;; { .mmf shladd C5 = LDC, 2, C shladd C = LDC, 3, C } { .mmf shladd C4 = LDC, 1, C2 shladd C6 = LDC, 2, C2 } ;; { .mfi shladd C7 = LDC, 2, C3 shladd C8 = LDC, 2, C4 } ;; ;; mov f64 = f0 mov f72 = f0 mov f80 = f0 mov f88 = f0 mov f96 = f0 mov f104 = f0 mov f112 = f0 mov f120 = f0 .L040: { .mib sub L = K, KK tbit.z p6, p0 = M, 0 (p6) br.cond.dptk .L030 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 0 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; { .mfi shladd BOFFSET = r3, 3, B sub AORIG = AORIG, r2 } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE add AOFFSET = r3, AORIG } ;; { .mmi adds L = 1, L adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mii (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE adds L = -1, L } ;; { .mmi (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE cmp.eq p6, p0 = -1, L } ;; { .mib (p7) LDFD f32 = [AOFFSET], 1 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L048 } ;; .L042: { .mfb lfetch.nt1 [PREB], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfb (p12) cmp.ne p3, p0 = 0, L FMA f72 = f32, f49, f72 // A1 * B2 nop __LINE__ } ;; { .mfi (p3) LDFD f40 = [AOFFSET], 1 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb nop 
__LINE__ FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfi (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 adds L = -1, L } { .mmb nop __LINE__ nop __LINE__ nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } { .mmb nop __LINE__ nop __LINE__ br.cloop.sptk.few .L042 } ;; .L048: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -1, KK #else adds r2 = -8, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 3, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; { .mfi FSUB f64 = f32, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB f72 = f33, f72 nop __LINE__ } ;; { .mfi FSUB f80 = f34, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f88 = f35, f88 nop __LINE__ } ;; { .mfi FSUB f96 = f36, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f104 = f37, f104 nop __LINE__ } ;; { .mfi FSUB f112 = f38, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f120 = f39, f120 nop __LINE__ } ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f80 = f34, f80 FSUB f88 = f35, f88 FSUB f96 = f36, f96 FSUB f104 = f37, f104 FSUB f112 = f38, f112 FSUB f120 = f39, f120 ;; #endif #ifdef LN LDFD f32 = [AOFFSET] ;; FMPY f64 = f64, f32 FMPY f96 = f96, f32 FMPY f72 = f72, f32 FMPY f104 = f104, f32 FMPY f80 = f80, f32 FMPY f112 = f112, f32 FMPY f88 = f88, f32 FMPY f120 = f120, f32 ;; { .mmi STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE adds C1 = -1 * SIZE, C1 } ;; { .mmi STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f104, SIZE adds C2 = -1 * SIZE, C2 } ;; { .mmi STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f88, - 3 * SIZE STFD [BOFFSET2] = f120, - 3 * SIZE } ;; adds C3 = -1 * SIZE, C3 adds C4 = -1 * SIZE, C4 adds C5 = -1 * SIZE, C5 adds C6 = -1 * SIZE, C6 adds C7 = -1 * SIZE, C7 adds C8 = -1 * SIZE, C8 ;; #endif #ifdef LT LDFD f32 = [AOFFSET] ;; { .mfi FMPY f64 = f64, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f96 = f96, f32 nop __LINE__ } ;; { .mfi FMPY f72 = f72, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f104 = f104, f32 nop __LINE__ } ;; { .mfi FMPY f80 = f80, f32 } { .mfi nop __LINE__ FMPY f112 = f112, f32 nop __LINE__ } ;; { .mfi FMPY f88 = f88, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f120 = f120, f32 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f64, SIZE } { .mfi STFD [BOFFSET2] = f96, SIZE } ;; { .mfi STFD [BOFFSET] = f72, SIZE } { .mfi STFD [BOFFSET2] = f104, SIZE } ;; { .mfi STFD [BOFFSET] = f80, SIZE } { .mfi STFD [BOFFSET2] = 
f112, SIZE } ;; { .mfi STFD [BOFFSET] = f88, -3 * SIZE } { .mfi STFD [BOFFSET2] = f120, -3 * SIZE } ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f40 = [BOFFSET], 1 * SIZE ;; LDFPD f41, f42 = [BOFFSET], 2 * SIZE ;; LDFPD f43, f44 = [BOFFSET], 2 * SIZE ;; LDFPD f45, f46 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f47, f48 = [BOFFSET], 2 * SIZE ;; LDFPD f49, f50 = [BOFFSET], 2 * SIZE ;; LDFPD f51, f52 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f53 = [BOFFSET], 1 * SIZE ;; LDFPD f54, f55 = [BOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [BOFFSET] adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f58, f59 = [BOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [BOFFSET] adds BOFFSET = 7 * SIZE, BOFFSET ;; LDFD f16 = [BOFFSET], 1 * SIZE ;; LDFPD f17, f18 = [BOFFSET] adds BOFFSET = 8 * SIZE, BOFFSET ;; LDFPD f19, f20 = [BOFFSET] adds BOFFSET = 9 * SIZE, BOFFSET ;; LDFD f21 = [BOFFSET] adds BOFFSET = -63 * SIZE, BOFFSET ;; FMPY f64 = f64, f32 ;; FNMA f72 = f64, f33, f72 ;; FNMA f80 = f64, f34, f80 ;; FNMA f88 = f64, f35, f88 ;; FNMA f96 = f64, f36, f96 ;; FNMA f104 = f64, f37, f104 ;; FNMA f112 = f64, f38, f112 ;; FNMA f120 = f64, f39, f120 ;; FMPY f72 = f72, f40 ;; FNMA f80 = f72, f41, f80 ;; FNMA f88 = f72, f42, f88 ;; FNMA f96 = f72, f43, f96 ;; FNMA f104 = f72, f44, f104 ;; FNMA f112 = f72, f45, f112 ;; FNMA f120 = f72, f46, f120 ;; FMPY f80 = f80, f47 ;; FNMA f88 = f80, f48, f88 ;; FNMA f96 = f80, f49, f96 ;; FNMA f104 = f80, f50, f104 ;; FNMA f112 = f80, f51, f112 ;; FNMA f120 = f80, f52, f120 ;; FMPY f88 = f88, f53 ;; FNMA f96 = f88, f54, f96 ;; FNMA f104 = f88, f55, f104 ;; FNMA f112 = f88, f56, f112 ;; FNMA f120 = f88, f57, f120 ;; FMPY f96 = f96, f58 ;; FNMA f104 = f96, f59, f104 ;; FNMA f112 = f96, f60, f112 ;; FNMA f120 = f96, f61, f120 ;; FMPY f104 = f104, f16 ;; FNMA f112 = f104, f17, f112 ;; FNMA f120 = f104, f18, f120 ;; FMPY f112 = f112, f19 ;; FNMA f120 = f112, f20, f120 ;; FMPY f120 = f120, f21 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f96, SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f104, SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f88, -3 * SIZE STFD [AOFFSET2] = f120, - 3 * SIZE ;; #endif #ifdef RT adds BOFFSET = 62 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f35, f34 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f37, f36 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f39, f38 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f40 = [BOFFSET], -2 * SIZE ;; LDFPD f42, f41 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f44, f43 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f46, f45 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f48, f47 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f50, f49 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f52, f51 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFD f53 = [BOFFSET], -2 * SIZE ;; LDFPD f55, f54 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f57, f56 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFPD f59, f58 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f61, f60 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFD f16 = [BOFFSET], -2 * SIZE ;; LDFPD f18, f17 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFPD f20, f19 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFD f21 = 
[BOFFSET] ;; FMPY f120 = f120, f32 ;; FNMA f112 = f120, f33, f112 ;; FNMA f104 = f120, f34, f104 ;; FNMA f96 = f120, f35, f96 ;; FNMA f88 = f120, f36, f88 ;; FNMA f80 = f120, f37, f80 ;; FNMA f72 = f120, f38, f72 ;; FNMA f64 = f120, f39, f64 ;; FMPY f112 = f112, f40 ;; FNMA f104 = f112, f41, f104 ;; FNMA f96 = f112, f42, f96 ;; FNMA f88 = f112, f43, f88 ;; FNMA f80 = f112, f44, f80 ;; FNMA f72 = f112, f45, f72 ;; FNMA f64 = f112, f46, f64 ;; FMPY f104 = f104, f47 ;; FNMA f96 = f104, f48, f96 ;; FNMA f88 = f104, f49, f88 ;; FNMA f80 = f104, f50, f80 ;; FNMA f72 = f104, f51, f72 ;; FNMA f64 = f104, f52, f64 ;; FMPY f96 = f96, f53 ;; FNMA f88 = f96, f54, f88 ;; FNMA f80 = f96, f55, f80 ;; FNMA f72 = f96, f56, f72 ;; FNMA f64 = f96, f57, f64 ;; FMPY f88 = f88, f58 ;; FNMA f80 = f88, f59, f80 ;; FNMA f72 = f88, f60, f72 ;; FNMA f64 = f88, f61, f64 ;; FMPY f80 = f80, f16 ;; FNMA f72 = f80, f17, f72 ;; FNMA f64 = f80, f18, f64 ;; FMPY f72 = f72, f19 ;; FNMA f64 = f72, f20, f64 ;; FMPY f64 = f64, f21 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f96, SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f104, SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f88, - 3 * SIZE STFD [AOFFSET2] = f120, - 3 * SIZE ;; #endif #ifndef LN STFD [C1 ] = f64, SIZE #else STFD [C1 ] = f64 #endif #ifndef LN STFD [C2 ] = f72, SIZE #else STFD [C2 ] = f72 #endif #ifndef LN STFD [C3 ] = f80, SIZE #else STFD [C3 ] = f80 #endif #ifndef LN STFD [C4 ] = f88, SIZE #else STFD [C4 ] = f88 #endif #ifndef LN STFD [C5 ] = f96, SIZE #else STFD [C5 ] = f96 #endif #ifndef LN STFD [C6 ] = f104, SIZE #else STFD [C6 ] = f104 #endif #ifndef LN STFD [C7 ] = f112, SIZE #else STFD [C7 ] = f112 #endif #ifndef LN STFD [C8 ] = f120, SIZE #else STFD [C8 ] = f120 #endif ;; mov f64 = f0 mov f72 = f0 mov f80 = f0 mov f88 = f0 mov f96 = f0 mov f104 = f0 mov f112 = f0 mov f120 = f0 ;; shladd r2 = K, BASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT add AORIG = r2, AORIG #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) add AOFFSET = L, AOFFSET #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) shladd BOFFSET = L, 3, BOFFSET #else nop __LINE__ #endif ;; #ifdef LT adds KK = 1, KK #elif defined LN adds KK = -1, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; .align 8 .L030: { .mib sub L = K, KK tbit.z p6, p0 = M, 1 (p6) br.cond.dptk .L020 } ;; ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 1 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE setf.d f73 = r0 mov f65 = f0 } ;; #else { .mfi shladd BOFFSET = r3, 3, B mov f65 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f73 = f0 shladd AOFFSET = r3, 1, AORIG } ;; #endif { .mfi setf.d f105 = r0 mov f81 = f0 adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET mov f89 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f113 = f0 tbit.z p12, p0 = L, 0 } { .mfi setf.d f97 = r0 mov f121 = f0 shr L = L, 1 } ;; { .mmf (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE adds L = -1, L } ;; { .mmf (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE cmp.eq p6, p0 = -1, L } ;; { .mib (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L038 } ;; .L032: { .mfb 
lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f113 = f41, f62, f113 // A2 * B7 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f121 = f41, f63, f121 // A2 * B8 br.cloop.sptk.few .L032 } ;; .L038: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -2, KK #else adds r2 = -8, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 3, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [BOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [BOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [BOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [BOFFSET] adds BOFFSET = -14 * SIZE, BOFFSET ;; { .mfi FSUB f64 = f32, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB f72 = f33, f72 nop __LINE__ } ;; { .mfi FSUB f80 = f34, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f88 = f35, f88 
nop __LINE__ } ;; { .mfi FSUB f96 = f36, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f104 = f37, f104 nop __LINE__ } ;; { .mfi FSUB f112 = f38, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f120 = f39, f120 nop __LINE__ } ;; { .mfi FSUB f65 = f40, f65 nop __LINE__ } { .mfi nop __LINE__ FSUB f73 = f41, f73 nop __LINE__ } ;; { .mfi FSUB f81 = f42, f81 nop __LINE__ } { .mfi nop __LINE__ FSUB f89 = f43, f89 nop __LINE__ } ;; { .mfi FSUB f97 = f44, f97 nop __LINE__ } { .mfi nop __LINE__ FSUB f105 = f45, f105 nop __LINE__ } ;; { .mfi FSUB f113 = f46, f113 } { .mfi nop __LINE__ FSUB f121 = f47, f121 nop __LINE__ } ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [AOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [AOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [AOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [AOFFSET] adds AOFFSET = -14 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f72 = f34, f72 FSUB f73 = f35, f73 FSUB f80 = f36, f80 FSUB f81 = f37, f81 FSUB f88 = f38, f88 FSUB f89 = f39, f89 ;; FSUB f96 = f40, f96 FSUB f97 = f41, f97 ;; FSUB f104 = f42, f104 FSUB f105 = f43, f105 ;; FSUB f112 = f44, f112 FSUB f113 = f45, f113 ;; FSUB f120 = f46, f120 FSUB f121 = f47, f121 ;; #endif #ifdef LN adds AOFFSET = 2 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f34 = [AOFFSET] ;; FMPY f65 = f65, f32 FMPY f97 = f97, f32 FMPY f73 = f73, f32 FMPY f105 = f105, f32 FMPY f81 = f81, f32 FMPY f113 = f113, f32 FMPY f89 = f89, f32 FMPY f121 = f121, f32 ;; FNMA f64 = f65, f33, f64 FNMA f96 = f97, f33, f96 FNMA f72 = f73, f33, f72 FNMA f104 = f105, f33, f104 FNMA f80 = f81, f33, f80 FNMA f112 = f113, f33, f112 FNMA f88 = f89, f33, f88 FNMA f120 = f121, f33, f120 ;; FMPY f64 = f64, f34 FMPY f96 = f96, f34 FMPY f72 = f72, f34 FMPY f104 = f104, f34 FMPY f80 = f80, f34 FMPY f112 = f112, f34 FMPY f88 = f88, f34 FMPY f120 = f120, f34 ;; adds BOFFSET = 8 * SIZE, BOFFSET adds BOFFSET2 = 8 * SIZE, BOFFSET2 ;; { .mfi STFD [BOFFSET] = f65, SIZE } { .mfi STFD [BOFFSET2] = f97, SIZE } ;; { .mfi STFD [BOFFSET] = f73, SIZE } { .mfi STFD [BOFFSET2] = f105, SIZE } ;; { .mfi STFD [BOFFSET] = f81, SIZE } { .mfi STFD [BOFFSET2] = f113, SIZE } ;; { .mfi STFD [BOFFSET] = f89, - 11 * SIZE } { .mfi STFD [BOFFSET2] = f121, - 11 * SIZE } ;; { .mmi STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE adds C1 = -2 * SIZE, C1 } ;; { .mmi STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f104, SIZE adds C2 = -2 * SIZE, C2 } ;; { .mmi STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f88, - 3 * SIZE STFD [BOFFSET2] = f120, - 3 * SIZE } ;; adds C3 = -2 * SIZE, C3 adds C4 = -2 * SIZE, C4 adds C5 = -2 * SIZE, C5 adds C6 = -2 * SIZE, C6 adds C7 = -2 * SIZE, C7 adds C8 = -2 * SIZE, C8 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f34 = [AOFFSET], - 3 * SIZE ;; { .mfi FMPY f64 = f64, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f96 = f96, f32 nop __LINE__ } ;; { .mfi FMPY f72 = f72, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f104 = f104, f32 nop __LINE__ } ;; { .mfi FMPY f80 = f80, f32 } { .mfi nop __LINE__ FMPY f112 = f112, f32 nop __LINE__ } ;; { .mfi FMPY f88 = f88, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f120 = f120, f32 nop __LINE__ } ;; { .mfi FNMA f65 = f64, f33, f65 nop __LINE__ } { .mfi nop __LINE__ FNMA f97 = f96, f33, f97 nop __LINE__ } ;; { .mfi FNMA f73 = f72, f33, f73 nop __LINE__ 
} { .mfi nop __LINE__ FNMA f105 = f104, f33, f105 nop __LINE__ } ;; { .mfi FNMA f81 = f80, f33, f81 } { .mfi nop __LINE__ FNMA f113 = f112, f33, f113 nop __LINE__ } ;; { .mfi FNMA f89 = f88, f33, f89 nop __LINE__ } { .mfi nop __LINE__ FNMA f121 = f120, f33, f121 nop __LINE__ } ;; FMPY f65 = f65, f34 FMPY f97 = f97, f34 FMPY f73 = f73, f34 FMPY f105 = f105, f34 FMPY f81 = f81, f34 FMPY f113 = f113, f34 FMPY f89 = f89, f34 FMPY f121 = f121, f34 ;; { .mfi STFD [BOFFSET] = f64, SIZE } { .mfi STFD [BOFFSET2] = f96, SIZE } ;; { .mfi STFD [BOFFSET] = f72, SIZE } { .mfi STFD [BOFFSET2] = f104, SIZE } ;; { .mfi STFD [BOFFSET] = f80, SIZE } { .mfi STFD [BOFFSET2] = f112, SIZE } ;; { .mfi STFD [BOFFSET] = f88, 5 * SIZE } { .mfi STFD [BOFFSET2] = f120, 5 * SIZE } ;; { .mfi STFD [BOFFSET] = f65, SIZE } { .mfi STFD [BOFFSET2] = f97, SIZE } ;; { .mfi STFD [BOFFSET] = f73, SIZE } { .mfi STFD [BOFFSET2] = f105, SIZE } ;; { .mfi STFD [BOFFSET] = f81, SIZE } { .mfi STFD [BOFFSET2] = f113, SIZE } ;; { .mfi STFD [BOFFSET] = f89, -11 * SIZE } { .mfi STFD [BOFFSET2] = f121, -11 * SIZE } #endif #ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f40 = [BOFFSET], 1 * SIZE ;; LDFPD f41, f42 = [BOFFSET], 2 * SIZE ;; LDFPD f43, f44 = [BOFFSET], 2 * SIZE ;; LDFPD f45, f46 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f47, f48 = [BOFFSET], 2 * SIZE ;; LDFPD f49, f50 = [BOFFSET], 2 * SIZE ;; LDFPD f51, f52 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f53 = [BOFFSET], 1 * SIZE ;; LDFPD f54, f55 = [BOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [BOFFSET] adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f58, f59 = [BOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [BOFFSET] adds BOFFSET = 7 * SIZE, BOFFSET ;; LDFD f16 = [BOFFSET], 1 * SIZE ;; LDFPD f17, f18 = [BOFFSET] adds BOFFSET = 8 * SIZE, BOFFSET ;; LDFPD f19, f20 = [BOFFSET] adds BOFFSET = 9 * SIZE, BOFFSET ;; LDFD f21 = [BOFFSET] adds BOFFSET = -63 * SIZE, BOFFSET ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 ;; FNMA f72 = f64, f33, f72 FNMA f73 = f65, f33, f73 ;; FNMA f80 = f64, f34, f80 FNMA f81 = f65, f34, f81 ;; FNMA f88 = f64, f35, f88 FNMA f89 = f65, f35, f89 ;; FNMA f96 = f64, f36, f96 FNMA f97 = f65, f36, f97 ;; FNMA f104 = f64, f37, f104 FNMA f105 = f65, f37, f105 ;; FNMA f112 = f64, f38, f112 FNMA f113 = f65, f38, f113 ;; FNMA f120 = f64, f39, f120 FNMA f121 = f65, f39, f121 ;; FMPY f72 = f72, f40 FMPY f73 = f73, f40 ;; FNMA f80 = f72, f41, f80 FNMA f81 = f73, f41, f81 ;; FNMA f88 = f72, f42, f88 FNMA f89 = f73, f42, f89 ;; FNMA f96 = f72, f43, f96 FNMA f97 = f73, f43, f97 ;; FNMA f104 = f72, f44, f104 FNMA f105 = f73, f44, f105 ;; FNMA f112 = f72, f45, f112 FNMA f113 = f73, f45, f113 ;; FNMA f120 = f72, f46, f120 FNMA f121 = f73, f46, f121 ;; FMPY f80 = f80, f47 FMPY f81 = f81, f47 ;; FNMA f88 = f80, f48, f88 FNMA f89 = f81, f48, f89 ;; FNMA f96 = f80, f49, f96 FNMA f97 = f81, f49, f97 ;; FNMA f104 = f80, f50, f104 FNMA f105 = f81, f50, f105 ;; FNMA f112 = f80, f51, f112 FNMA f113 = f81, f51, f113 ;; FNMA f120 = f80, f52, f120 FNMA f121 = f81, f52, f121 ;; FMPY f88 = f88, f53 FMPY f89 = f89, f53 ;; FNMA f96 = f88, f54, f96 FNMA f97 = f89, f54, f97 ;; FNMA f104 = f88, f55, f104 FNMA f105 = f89, f55, f105 ;; FNMA f112 = f88, f56, f112 FNMA f113 = f89, f56, f113 ;; FNMA f120 = f88, f57, f120 FNMA f121 = f89, f57, f121 ;; FMPY f96 = f96, f58 FMPY f97 = f97, f58 ;; FNMA f104 = f96, f59, f104 FNMA f105 = f97, f59, f105 ;; FNMA 
f112 = f96, f60, f112 FNMA f113 = f97, f60, f113 ;; FNMA f120 = f96, f61, f120 FNMA f121 = f97, f61, f121 ;; FMPY f104 = f104, f16 FMPY f105 = f105, f16 ;; FNMA f112 = f104, f17, f112 FNMA f113 = f105, f17, f113 ;; FNMA f120 = f104, f18, f120 FNMA f121 = f105, f18, f121 ;; FMPY f112 = f112, f19 FMPY f113 = f113, f19 ;; FNMA f120 = f112, f20, f120 FNMA f121 = f113, f20, f121 ;; FMPY f120 = f120, f21 FMPY f121 = f121, f21 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f80, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f81, SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f88, SIZE ;; STFD [AOFFSET] = f73, 5 * SIZE STFD [AOFFSET2] = f89, 5 * SIZE ;; STFD [AOFFSET] = f96, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f97, SIZE STFD [AOFFSET2] = f113, SIZE ;; STFD [AOFFSET] = f104, SIZE STFD [AOFFSET2] = f120, SIZE ;; STFD [AOFFSET] = f105, -11 * SIZE STFD [AOFFSET2] = f121, - 11 * SIZE ;; #endif #ifdef RT adds BOFFSET = 62 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f35, f34 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f37, f36 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f39, f38 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f40 = [BOFFSET], -2 * SIZE ;; LDFPD f42, f41 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f44, f43 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f46, f45 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f48, f47 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f50, f49 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f52, f51 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFD f53 = [BOFFSET], -2 * SIZE ;; LDFPD f55, f54 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f57, f56 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFPD f59, f58 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f61, f60 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFD f16 = [BOFFSET], -2 * SIZE ;; LDFPD f18, f17 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFPD f20, f19 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFD f21 = [BOFFSET] ;; FMPY f120 = f120, f32 FMPY f121 = f121, f32 ;; FNMA f112 = f120, f33, f112 FNMA f113 = f121, f33, f113 ;; FNMA f104 = f120, f34, f104 FNMA f105 = f121, f34, f105 ;; FNMA f96 = f120, f35, f96 FNMA f97 = f121, f35, f97 ;; FNMA f88 = f120, f36, f88 FNMA f89 = f121, f36, f89 ;; FNMA f80 = f120, f37, f80 FNMA f81 = f121, f37, f81 ;; FNMA f72 = f120, f38, f72 FNMA f73 = f121, f38, f73 ;; FNMA f64 = f120, f39, f64 FNMA f65 = f121, f39, f65 ;; FMPY f112 = f112, f40 FMPY f113 = f113, f40 ;; FNMA f104 = f112, f41, f104 FNMA f105 = f113, f41, f105 ;; FNMA f96 = f112, f42, f96 FNMA f97 = f113, f42, f97 ;; FNMA f88 = f112, f43, f88 FNMA f89 = f113, f43, f89 ;; FNMA f80 = f112, f44, f80 FNMA f81 = f113, f44, f81 ;; FNMA f72 = f112, f45, f72 FNMA f73 = f113, f45, f73 ;; FNMA f64 = f112, f46, f64 FNMA f65 = f113, f46, f65 ;; FMPY f104 = f104, f47 FMPY f105 = f105, f47 ;; FNMA f96 = f104, f48, f96 FNMA f97 = f105, f48, f97 ;; FNMA f88 = f104, f49, f88 FNMA f89 = f105, f49, f89 ;; FNMA f80 = f104, f50, f80 FNMA f81 = f105, f50, f81 ;; FNMA f72 = f104, f51, f72 FNMA f73 = f105, f51, f73 ;; FNMA f64 = f104, f52, f64 FNMA f65 = f105, f52, f65 ;; FMPY f96 = f96, f53 FMPY f97 = f97, f53 ;; FNMA f88 = f96, f54, f88 FNMA f89 = f97, f54, f89 ;; FNMA f80 = f96, f55, f80 FNMA f81 = f97, f55, f81 ;; FNMA f72 = f96, f56, f72 FNMA f73 = f97, f56, f73 ;; FNMA f64 = f96, f57, f64 FNMA f65 = f97, f57, f65 ;; FMPY f88 = f88, 
f58 FMPY f89 = f89, f58 ;; FNMA f80 = f88, f59, f80 FNMA f81 = f89, f59, f81 ;; FNMA f72 = f88, f60, f72 FNMA f73 = f89, f60, f73 ;; FNMA f64 = f88, f61, f64 FNMA f65 = f89, f61, f65 ;; FMPY f80 = f80, f16 FMPY f81 = f81, f16 ;; FNMA f72 = f80, f17, f72 FNMA f73 = f81, f17, f73 ;; FNMA f64 = f80, f18, f64 FNMA f65 = f81, f18, f65 ;; FMPY f72 = f72, f19 FMPY f73 = f73, f19 ;; FNMA f64 = f72, f20, f64 FNMA f65 = f73, f20, f65 ;; FMPY f64 = f64, f21 FMPY f65 = f65, f21 ;; adds AOFFSET = 8 * SIZE, AOFFSET adds AOFFSET2 = 8 * SIZE, AOFFSET2 ;; STFD [AOFFSET] = f96, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f97, SIZE STFD [AOFFSET2] = f113, SIZE ;; STFD [AOFFSET] = f104, SIZE STFD [AOFFSET2] = f120, SIZE ;; STFD [AOFFSET] = f105, - 11 * SIZE STFD [AOFFSET2] = f121, - 11 * SIZE ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f80, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f81, SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f88, SIZE ;; STFD [AOFFSET] = f73, - 3 * SIZE STFD [AOFFSET2] = f89, - 3 * SIZE ;; #endif STFD [C1 ] = f64, SIZE mov f64 = f0 ;; #ifndef LN STFD [C1 ] = f65, SIZE #else STFD [C1 ] = f65, -SIZE #endif ;; STFD [C2 ] = f72, SIZE mov f72 = f0 ;; #ifndef LN STFD [C2 ] = f73, SIZE #else STFD [C2 ] = f73, -SIZE #endif ;; STFD [C3 ] = f80, SIZE mov f80 = f0 ;; #ifndef LN STFD [C3 ] = f81, SIZE #else STFD [C3 ] = f81, - SIZE #endif ;; STFD [C4 ] = f88, SIZE mov f88 = f0 ;; #ifndef LN STFD [C4 ] = f89, SIZE #else STFD [C4 ] = f89, -SIZE #endif ;; STFD [C5 ] = f96, SIZE mov f96 = f0 ;; #ifndef LN STFD [C5 ] = f97, SIZE #else STFD [C5 ] = f97, -SIZE #endif ;; STFD [C6 ] = f104, SIZE mov f104 = f0 ;; #ifndef LN STFD [C6 ] = f105, SIZE #else STFD [C6 ] = f105, -SIZE #endif ;; shladd r2 = K, BASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 1, AORIG #else nop __LINE__ #endif ;; STFD [C7 ] = f112, SIZE mov f112 = f0 ;; { .mmi #ifndef LN STFD [C7 ] = f113, SIZE #else STFD [C7 ] = f113, -SIZE #endif #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd AOFFSET = L, 1, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd BOFFSET = L, 3, BOFFSET #else nop __LINE__ #endif } ;; { .mmf STFD [C8 ] = f120, SIZE mov f120 = f0 } ;; { .mmi #ifndef LN STFD [C8 ] = f121, SIZE #else STFD [C8 ] = f121, -SIZE #endif #ifdef LT adds KK = 2, KK #elif defined LN adds KK = -2, KK #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; .align 8 .L020: { .mib sub L = K, KK tbit.z p6, p0 = M, 2 (p6) br.cond.dptk .L010 } ;; ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 2 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE setf.d f73 = r0 mov f65 = f0 } ;; #else { .mfi shladd BOFFSET = r3, 3, B mov f65 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f73 = f0 shladd AOFFSET = r3, 2, AORIG } ;; #endif { .mfi setf.d f105 = r0 mov f81 = f0 adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET mov f89 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f113 = f0 tbit.z p12, p0 = L, 0 } { .mfi setf.d f97 = r0 mov f121 = f0 shr L = L, 1 } ;; { .mmf (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE setf.d f66 = r0 mov f67 = f0 } { .mfi setf.d f74 = r0 mov f75 = f0 adds L = 
-1, L } ;; { .mmf (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE setf.d f82 = r0 mov f83 = f0 } { .mfi setf.d f90 = r0 mov f91 = f0 cmp.eq p6, p0 = -1, L } ;; { .mmf (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE setf.d f98 = r0 mov f99 = f0 } { .mfi setf.d f106 = r0 mov f107 = f0 mov ar.lc = L } ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE setf.d f114 = r0 mov f115 = f0 } { .mfb setf.d f122 = r0 mov f123 = f0 (p6) br.cond.dpnt .L028 } ;; .L022: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 (p5) adds C9 = 2 * SIZE, C1 } { .mfi nop __LINE__ FMA f104 = f32, f53, f104 // A1 * B6 (p5) adds C10 = 2 * SIZE, C2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 (p5) adds C11 = 2 * SIZE, C3 } { .mfi nop __LINE__ FMA f120 = f32, f55, f120 // A1 * B8 (p5) adds C12 = 2 * SIZE, C4 } ;; { .mfi (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 (p5) adds C13 = 2 * SIZE, C5 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 (p5) adds C14 = 2 * SIZE, C6 } ;; { .mfi (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 (p5) adds C15 = 2 * SIZE, C7 } { .mfi nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 (p5) adds C16 = 2 * SIZE, C8 } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f106 = f34, f53, f106 // A3 * B6 nop __LINE__ } { .mfb nop __LINE__ FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f122 = f34, f55, f122 // A3 * B8 nop __LINE__ } { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } { .mfb nop __LINE__ FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f107 = f35, f53, f107 // A4 * B6 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f123 = f35, f55, f123 // A4 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; 
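/*
   Note on the surrounding .L022 body (descriptive annotation, inferred from
   the counter/predicate setup above): this is the K-loop for the 4x8
   sub-block, unrolled by two and software-pipelined.  The (p3)-predicated
   FMAs consume the A values in f40-f43 and the B values in f56-f63 loaded
   earlier in the same trip, while the (p4)-predicated LDFPDs refill
   f32-f35 and f48-f55 for the next trip; p3/p4 are recomputed from the
   remaining count L so the odd tail iteration is handled correctly.
*/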
{ .mfb nop __LINE__ (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f121 = f41, f63, f121 // A2 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f98 = f42, f60, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f106 = f42, f61, f106 // A3 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f122 = f42, f63, f122 // A3 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f91 = f43, f59, f91 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f107 = f43, f61, f107 // A4 * B6 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f115 = f43, f62, f115 // A4 * B7 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f123 = f43, f63, f123 // A4 * B8 br.cloop.sptk.few .L022 } ;; .L028: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -4, KK #else adds r2 = -8, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 3, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [BOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [BOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [BOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [BOFFSET], 2 * SIZE ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE FSUB f64 = f32, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB f72 = f33, f72 nop __LINE__ } ;; { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE FSUB f80 = f34, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f88 = f35, f88 nop __LINE__ } ;; { .mfi LDFPD f52, f53 = [BOFFSET], 2 * SIZE FSUB f96 = f36, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f104 = f37, f104 nop __LINE__ } ;; { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE FSUB f112 = f38, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f120 = f39, f120 nop __LINE__ } ;; { .mfi LDFPD f56, f57 = [BOFFSET], 2 * SIZE FSUB f65 = f40, f65 nop __LINE__ } { .mfi nop __LINE__ FSUB f73 = f41, f73 nop __LINE__ } ;; { .mfi LDFPD f58, f59 = [BOFFSET], 
2 * SIZE FSUB f81 = f42, f81 nop __LINE__ } { .mfi nop __LINE__ FSUB f89 = f43, f89 nop __LINE__ } ;; { .mfi LDFPD f60, f61 = [BOFFSET], 2 * SIZE FSUB f97 = f44, f97 nop __LINE__ } { .mfi nop __LINE__ FSUB f105 = f45, f105 nop __LINE__ } ;; { .mfi LDFPD f62, f63 = [BOFFSET] FSUB f113 = f46, f113 adds BOFFSET = -30 * SIZE, BOFFSET } { .mfi nop __LINE__ FSUB f121 = f47, f121 nop __LINE__ } ;; FSUB f66 = f48, f66 FSUB f74 = f49, f74 FSUB f82 = f50, f82 FSUB f90 = f51, f90 FSUB f98 = f52, f98 FSUB f106 = f53, f106 FSUB f114 = f54, f114 FSUB f122 = f55, f122 ;; FSUB f67 = f56, f67 FSUB f75 = f57, f75 FSUB f83 = f58, f83 FSUB f91 = f59, f91 FSUB f99 = f60, f99 FSUB f107 = f61, f107 FSUB f115 = f62, f115 FSUB f123 = f63, f123 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [AOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [AOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [AOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [AOFFSET], 2 * SIZE ;; LDFPD f48, f49 = [AOFFSET], 2 * SIZE ;; LDFPD f50, f51 = [AOFFSET], 2 * SIZE ;; LDFPD f52, f53 = [AOFFSET], 2 * SIZE ;; LDFPD f54, f55 = [AOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [AOFFSET], 2 * SIZE ;; LDFPD f58, f59 = [AOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [AOFFSET], 2 * SIZE ;; LDFPD f62, f63 = [AOFFSET] adds AOFFSET = -30 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f72 = f36, f72 FSUB f73 = f37, f73 FSUB f74 = f38, f74 FSUB f75 = f39, f75 FSUB f80 = f40, f80 FSUB f81 = f41, f81 FSUB f82 = f42, f82 FSUB f83 = f43, f83 FSUB f88 = f44, f88 FSUB f89 = f45, f89 FSUB f90 = f46, f90 FSUB f91 = f47, f91 ;; FSUB f96 = f48, f96 FSUB f97 = f49, f97 FSUB f98 = f50, f98 FSUB f99 = f51, f99 ;; FSUB f104 = f52, f104 FSUB f105 = f53, f105 FSUB f106 = f54, f106 FSUB f107 = f55, f107 ;; FSUB f112 = f56, f112 FSUB f113 = f57, f113 FSUB f114 = f58, f114 FSUB f115 = f59, f115 ;; FSUB f120 = f60, f120 FSUB f121 = f61, f121 FSUB f122 = f62, f122 FSUB f123 = f63, f123 ;; #endif #ifdef LN adds AOFFSET = 14 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f35, f34 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f36 = [AOFFSET], - 2 * SIZE ;; LDFPD f38, f37 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f40, f39 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET] ;; FMPY f67 = f67, f32 FMPY f99 = f99, f32 FMPY f75 = f75, f32 FMPY f107 = f107, f32 FMPY f83 = f83, f32 FMPY f115 = f115, f32 FMPY f91 = f91, f32 FMPY f123 = f123, f32 ;; FNMA f66 = f67, f33, f66 FNMA f98 = f99, f33, f98 FNMA f74 = f75, f33, f74 FNMA f106 = f107, f33, f106 FNMA f82 = f83, f33, f82 FNMA f114 = f115, f33, f114 FNMA f90 = f91, f33, f90 FNMA f122 = f123, f33, f122 ;; FNMA f65 = f67, f34, f65 FNMA f97 = f99, f34, f97 FNMA f73 = f75, f34, f73 FNMA f105 = f107, f34, f105 FNMA f81 = f83, f34, f81 FNMA f113 = f115, f34, f113 FNMA f89 = f91, f34, f89 FNMA f121 = f123, f34, f121 ;; FNMA f64 = f67, f35, f64 FNMA f96 = f99, f35, f96 FNMA f72 = f75, f35, f72 FNMA f104 = f107, f35, f104 FNMA f80 = f83, f35, f80 FNMA f112 = f115, f35, f112 FNMA f88 = f91, f35, f88 FNMA f120 = f123, f35, f120 ;; FMPY f66 = f66, f36 FMPY f98 = f98, f36 FMPY f74 = f74, f36 FMPY f106 = f106, f36 FMPY f82 = f82, f36 FMPY f114 = f114, f36 FMPY f90 = f90, f36 FMPY f122 = f122, f36 ;; FNMA f65 = f66, f37, f65 FNMA f97 = f98, f37, f97 FNMA f73 = f74, f37, f73 FNMA f105 = f106, f37, f105 FNMA f81 = f82, f37, 
f81 FNMA f113 = f114, f37, f113 FNMA f89 = f90, f37, f89 FNMA f121 = f122, f37, f121 ;; FNMA f64 = f66, f38, f64 FNMA f96 = f98, f38, f96 FNMA f72 = f74, f38, f72 FNMA f104 = f106, f38, f104 FNMA f80 = f82, f38, f80 FNMA f112 = f114, f38, f112 FNMA f88 = f90, f38, f88 FNMA f120 = f122, f38, f120 ;; adds BOFFSET = 24 * SIZE, BOFFSET adds BOFFSET2 = 24 * SIZE, BOFFSET2 ;; { .mfi STFD [BOFFSET] = f67, SIZE FMPY f65 = f65, f39 } { .mfi STFD [BOFFSET2] = f99, SIZE FMPY f97 = f97, f39 } ;; { .mfi STFD [BOFFSET] = f75, SIZE FMPY f73 = f73, f39 } { .mfi STFD [BOFFSET2] = f107, SIZE FMPY f105 = f105, f39 } ;; { .mfi STFD [BOFFSET] = f83, SIZE FMPY f81 = f81, f39 } { .mfi STFD [BOFFSET2] = f115, SIZE FMPY f113 = f113, f39 } ;; { .mfi STFD [BOFFSET] = f91, - 11 * SIZE FMPY f89 = f89, f39 } { .mfi STFD [BOFFSET2] = f123, - 11 * SIZE FMPY f121 = f121, f39 } ;; { .mfi STFD [BOFFSET] = f66, SIZE FNMA f64 = f65, f40, f64 } { .mfi STFD [BOFFSET2] = f98, SIZE FNMA f96 = f97, f40, f96 } ;; { .mfi STFD [BOFFSET] = f74, SIZE FNMA f72 = f73, f40, f72 } { .mfi STFD [BOFFSET2] = f106, SIZE FNMA f104 = f105, f40, f104 } ;; { .mfi STFD [BOFFSET] = f82, SIZE FNMA f80 = f81, f40, f80 } { .mfi STFD [BOFFSET2] = f114, SIZE FNMA f112 = f113, f40, f112 } ;; { .mfi STFD [BOFFSET] = f90, -11 * SIZE FNMA f88 = f89, f40, f88 } { .mfi STFD [BOFFSET2] = f122, -11 * SIZE FNMA f120 = f121, f40, f120 } ;; { .mfi STFD [BOFFSET] = f65, SIZE FMPY f64 = f64, f41 } { .mfi STFD [BOFFSET2] = f97, SIZE FMPY f96 = f96, f41 } ;; { .mfi STFD [BOFFSET] = f73, SIZE FMPY f72 = f72, f41 } { .mfi STFD [BOFFSET2] = f105, SIZE FMPY f104 = f104, f41 } ;; { .mfi STFD [BOFFSET] = f81, SIZE FMPY f80 = f80, f41 } { .mfi STFD [BOFFSET2] = f113, SIZE FMPY f112 = f112, f41 } ;; { .mfi STFD [BOFFSET] = f89, - 11 * SIZE FMPY f88 = f88, f41 } { .mfi STFD [BOFFSET2] = f121, - 11 * SIZE FMPY f120 = f120, f41 } ;; { .mmi STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE adds C1 = -4 * SIZE, C1 } ;; { .mmi STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f104, SIZE adds C2 = -4 * SIZE, C2 } ;; { .mmi STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f88, - 3 * SIZE STFD [BOFFSET2] = f120, - 3 * SIZE } ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f36 = [AOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f39, f40 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET], -15 * SIZE ;; { .mfi FMPY f64 = f64, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f96 = f96, f32 nop __LINE__ } ;; { .mfi FMPY f72 = f72, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f104 = f104, f32 nop __LINE__ } ;; { .mfi FMPY f80 = f80, f32 } { .mfi nop __LINE__ FMPY f112 = f112, f32 nop __LINE__ } ;; { .mfi FMPY f88 = f88, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f120 = f120, f32 nop __LINE__ } ;; { .mfi FNMA f65 = f64, f33, f65 nop __LINE__ } { .mfi nop __LINE__ FNMA f97 = f96, f33, f97 nop __LINE__ } ;; { .mfi FNMA f73 = f72, f33, f73 nop __LINE__ } { .mfi nop __LINE__ FNMA f105 = f104, f33, f105 nop __LINE__ } ;; { .mfi FNMA f81 = f80, f33, f81 } { .mfi nop __LINE__ FNMA f113 = f112, f33, f113 nop __LINE__ } ;; { .mfi FNMA f89 = f88, f33, f89 nop __LINE__ } { .mfi nop __LINE__ FNMA f121 = f120, f33, f121 nop __LINE__ } ;; { .mfi FNMA f66 = f64, f34, f66 nop __LINE__ } { .mfi nop __LINE__ FNMA f98 = f96, f34, f98 nop __LINE__ } ;; { .mfi FNMA f74 = f72, f34, f74 } { .mfi nop __LINE__ FNMA f106 = f104, f34, 
f106 nop __LINE__ } ;; { .mfi FNMA f82 = f80, f34, f82 nop __LINE__ } { .mfi nop __LINE__ FNMA f114 = f112, f34, f114 nop __LINE__ } ;; { .mfi FNMA f90 = f88, f34, f90 nop __LINE__ } { .mfi nop __LINE__ FNMA f122 = f120, f34, f122 nop __LINE__ } ;; { .mfi FNMA f67 = f64, f35, f67 } { .mfi nop __LINE__ FNMA f99 = f96, f35, f99 nop __LINE__ } ;; { .mfi FNMA f75 = f72, f35, f75 nop __LINE__ } { .mfi nop __LINE__ FNMA f107 = f104, f35, f107 nop __LINE__ } ;; { .mfi FNMA f83 = f80, f35, f83 } { .mfi nop __LINE__ FNMA f115 = f112, f35, f115 nop __LINE__ } ;; { .mfi FNMA f91 = f88, f35, f91 nop __LINE__ } { .mfi nop __LINE__ FNMA f123 = f120, f35, f123 nop __LINE__ } ;; FMPY f65 = f65, f36 FMPY f97 = f97, f36 FMPY f73 = f73, f36 FMPY f105 = f105, f36 FMPY f81 = f81, f36 FMPY f113 = f113, f36 FMPY f89 = f89, f36 FMPY f121 = f121, f36 ;; FNMA f66 = f65, f37, f66 FNMA f98 = f97, f37, f98 FNMA f74 = f73, f37, f74 FNMA f106 = f105, f37, f106 FNMA f82 = f81, f37, f82 FNMA f114 = f113, f37, f114 FNMA f90 = f89, f37, f90 FNMA f122 = f121, f37, f122 ;; FNMA f67 = f65, f38, f67 FNMA f99 = f97, f38, f99 FNMA f75 = f73, f38, f75 FNMA f107 = f105, f38, f107 FNMA f83 = f81, f38, f83 FNMA f115 = f113, f38, f115 FNMA f91 = f89, f38, f91 FNMA f123 = f121, f38, f123 ;; FMPY f66 = f66, f39 FMPY f98 = f98, f39 FMPY f74 = f74, f39 FMPY f106 = f106, f39 FMPY f82 = f82, f39 FMPY f114 = f114, f39 FMPY f90 = f90, f39 FMPY f122 = f122, f39 ;; FNMA f67 = f66, f40, f67 FNMA f99 = f98, f40, f99 FNMA f75 = f74, f40, f75 FNMA f107 = f106, f40, f107 FNMA f83 = f82, f40, f83 FNMA f115 = f114, f40, f115 FNMA f91 = f90, f40, f91 FNMA f123 = f122, f40, f123 ;; FMPY f67 = f67, f41 FMPY f99 = f99, f41 FMPY f75 = f75, f41 FMPY f107 = f107, f41 FMPY f83 = f83, f41 FMPY f115 = f115, f41 FMPY f91 = f91, f41 FMPY f123 = f123, f41 ;; { .mfi STFD [BOFFSET] = f64, SIZE } { .mfi STFD [BOFFSET2] = f96, SIZE } ;; { .mfi STFD [BOFFSET] = f72, SIZE } { .mfi STFD [BOFFSET2] = f104, SIZE } ;; { .mfi STFD [BOFFSET] = f80, SIZE } { .mfi STFD [BOFFSET2] = f112, SIZE } ;; { .mfi STFD [BOFFSET] = f88, 5 * SIZE } { .mfi STFD [BOFFSET2] = f120, 5 * SIZE } ;; { .mfi STFD [BOFFSET] = f65, SIZE } { .mfi STFD [BOFFSET2] = f97, SIZE } ;; { .mfi STFD [BOFFSET] = f73, SIZE } { .mfi STFD [BOFFSET2] = f105, SIZE } ;; { .mfi STFD [BOFFSET] = f81, SIZE } { .mfi STFD [BOFFSET2] = f113, SIZE } ;; { .mfi STFD [BOFFSET] = f89, 5 * SIZE } { .mfi STFD [BOFFSET2] = f121, 5 * SIZE } ;; { .mfi STFD [BOFFSET] = f66, SIZE } { .mfi STFD [BOFFSET2] = f98, SIZE } ;; { .mfi STFD [BOFFSET] = f74, SIZE } { .mfi STFD [BOFFSET2] = f106, SIZE } ;; { .mfi STFD [BOFFSET] = f82, SIZE } { .mfi STFD [BOFFSET2] = f114, SIZE } ;; { .mfi STFD [BOFFSET] = f90, 5 * SIZE } { .mfi STFD [BOFFSET2] = f122, 5 * SIZE } ;; { .mfi STFD [BOFFSET] = f67, SIZE } { .mfi STFD [BOFFSET2] = f99, SIZE } ;; { .mfi STFD [BOFFSET] = f75, SIZE } { .mfi STFD [BOFFSET2] = f107, SIZE } ;; { .mfi STFD [BOFFSET] = f83, SIZE } { .mfi STFD [BOFFSET2] = f115, SIZE } ;; { .mfi STFD [BOFFSET] = f91, -27 * SIZE } { .mfi STFD [BOFFSET2] = f123, -27 * SIZE } ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f40 = [BOFFSET], 1 * SIZE ;; LDFPD f41, f42 = [BOFFSET], 2 * SIZE ;; LDFPD f43, f44 = [BOFFSET], 2 * SIZE ;; LDFPD f45, f46 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f47, f48 = [BOFFSET], 2 * SIZE ;; LDFPD f49, f50 = [BOFFSET], 2 * SIZE ;; LDFPD f51, f52 = 
[BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f53 = [BOFFSET], 1 * SIZE ;; LDFPD f54, f55 = [BOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [BOFFSET] adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f58, f59 = [BOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [BOFFSET] adds BOFFSET = 7 * SIZE, BOFFSET ;; LDFD f16 = [BOFFSET], 1 * SIZE ;; LDFPD f17, f18 = [BOFFSET] adds BOFFSET = 8 * SIZE, BOFFSET ;; LDFPD f19, f20 = [BOFFSET] adds BOFFSET = 9 * SIZE, BOFFSET ;; LDFD f21 = [BOFFSET] adds BOFFSET = -63 * SIZE, BOFFSET ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 FMPY f66 = f66, f32 FMPY f67 = f67, f32 ;; FNMA f72 = f64, f33, f72 FNMA f73 = f65, f33, f73 FNMA f74 = f66, f33, f74 FNMA f75 = f67, f33, f75 ;; FNMA f80 = f64, f34, f80 FNMA f81 = f65, f34, f81 FNMA f82 = f66, f34, f82 FNMA f83 = f67, f34, f83 ;; FNMA f88 = f64, f35, f88 FNMA f89 = f65, f35, f89 FNMA f90 = f66, f35, f90 FNMA f91 = f67, f35, f91 ;; FNMA f96 = f64, f36, f96 FNMA f97 = f65, f36, f97 FNMA f98 = f66, f36, f98 FNMA f99 = f67, f36, f99 ;; FNMA f104 = f64, f37, f104 FNMA f105 = f65, f37, f105 FNMA f106 = f66, f37, f106 FNMA f107 = f67, f37, f107 ;; FNMA f112 = f64, f38, f112 FNMA f113 = f65, f38, f113 FNMA f114 = f66, f38, f114 FNMA f115 = f67, f38, f115 ;; FNMA f120 = f64, f39, f120 FNMA f121 = f65, f39, f121 FNMA f122 = f66, f39, f122 FNMA f123 = f67, f39, f123 ;; FMPY f72 = f72, f40 FMPY f73 = f73, f40 FMPY f74 = f74, f40 FMPY f75 = f75, f40 ;; FNMA f80 = f72, f41, f80 FNMA f81 = f73, f41, f81 FNMA f82 = f74, f41, f82 FNMA f83 = f75, f41, f83 ;; FNMA f88 = f72, f42, f88 FNMA f89 = f73, f42, f89 FNMA f90 = f74, f42, f90 FNMA f91 = f75, f42, f91 ;; FNMA f96 = f72, f43, f96 FNMA f97 = f73, f43, f97 FNMA f98 = f74, f43, f98 FNMA f99 = f75, f43, f99 ;; FNMA f104 = f72, f44, f104 FNMA f105 = f73, f44, f105 FNMA f106 = f74, f44, f106 FNMA f107 = f75, f44, f107 ;; FNMA f112 = f72, f45, f112 FNMA f113 = f73, f45, f113 FNMA f114 = f74, f45, f114 FNMA f115 = f75, f45, f115 ;; FNMA f120 = f72, f46, f120 FNMA f121 = f73, f46, f121 FNMA f122 = f74, f46, f122 FNMA f123 = f75, f46, f123 ;; FMPY f80 = f80, f47 FMPY f81 = f81, f47 FMPY f82 = f82, f47 FMPY f83 = f83, f47 ;; FNMA f88 = f80, f48, f88 FNMA f89 = f81, f48, f89 FNMA f90 = f82, f48, f90 FNMA f91 = f83, f48, f91 ;; FNMA f96 = f80, f49, f96 FNMA f97 = f81, f49, f97 FNMA f98 = f82, f49, f98 FNMA f99 = f83, f49, f99 ;; FNMA f104 = f80, f50, f104 FNMA f105 = f81, f50, f105 FNMA f106 = f82, f50, f106 FNMA f107 = f83, f50, f107 ;; FNMA f112 = f80, f51, f112 FNMA f113 = f81, f51, f113 FNMA f114 = f82, f51, f114 FNMA f115 = f83, f51, f115 ;; FNMA f120 = f80, f52, f120 FNMA f121 = f81, f52, f121 FNMA f122 = f82, f52, f122 FNMA f123 = f83, f52, f123 ;; FMPY f88 = f88, f53 FMPY f89 = f89, f53 FMPY f90 = f90, f53 FMPY f91 = f91, f53 ;; FNMA f96 = f88, f54, f96 FNMA f97 = f89, f54, f97 FNMA f98 = f90, f54, f98 FNMA f99 = f91, f54, f99 ;; FNMA f104 = f88, f55, f104 FNMA f105 = f89, f55, f105 FNMA f106 = f90, f55, f106 FNMA f107 = f91, f55, f107 ;; FNMA f112 = f88, f56, f112 FNMA f113 = f89, f56, f113 FNMA f114 = f90, f56, f114 FNMA f115 = f91, f56, f115 ;; FNMA f120 = f88, f57, f120 FNMA f121 = f89, f57, f121 FNMA f122 = f90, f57, f122 FNMA f123 = f91, f57, f123 ;; FMPY f96 = f96, f58 FMPY f97 = f97, f58 FMPY f98 = f98, f58 FMPY f99 = f99, f58 ;; FNMA f104 = f96, f59, f104 FNMA f105 = f97, f59, f105 FNMA f106 = f98, f59, f106 FNMA f107 = f99, f59, f107 ;; FNMA f112 = f96, f60, f112 FNMA f113 = f97, f60, f113 FNMA f114 = f98, f60, f114 FNMA f115 = f99, f60, f115 ;; FNMA f120 = f96, f61, f120 FNMA f121 = f97, f61, 
f121 FNMA f122 = f98, f61, f122 FNMA f123 = f99, f61, f123 ;; FMPY f104 = f104, f16 FMPY f105 = f105, f16 FMPY f106 = f106, f16 FMPY f107 = f107, f16 ;; FNMA f112 = f104, f17, f112 FNMA f113 = f105, f17, f113 FNMA f114 = f106, f17, f114 FNMA f115 = f107, f17, f115 ;; FNMA f120 = f104, f18, f120 FNMA f121 = f105, f18, f121 FNMA f122 = f106, f18, f122 FNMA f123 = f107, f18, f123 ;; FMPY f112 = f112, f19 FMPY f113 = f113, f19 FMPY f114 = f114, f19 FMPY f115 = f115, f19 ;; FNMA f120 = f112, f20, f120 FNMA f121 = f113, f20, f121 FNMA f122 = f114, f20, f122 FNMA f123 = f115, f20, f123 ;; FMPY f120 = f120, f21 FMPY f121 = f121, f21 FMPY f122 = f122, f21 FMPY f123 = f123, f21 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f72, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f73, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f74, SIZE ;; STFD [AOFFSET] = f67, 5 * SIZE STFD [AOFFSET2] = f75, 5 * SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f88, SIZE ;; STFD [AOFFSET] = f81, SIZE STFD [AOFFSET2] = f89, SIZE ;; STFD [AOFFSET] = f82, SIZE STFD [AOFFSET2] = f90, SIZE ;; STFD [AOFFSET] = f83, 5 * SIZE STFD [AOFFSET2] = f91, 5 * SIZE ;; STFD [AOFFSET] = f96, SIZE STFD [AOFFSET2] = f104, SIZE ;; STFD [AOFFSET] = f97, SIZE STFD [AOFFSET2] = f105, SIZE ;; STFD [AOFFSET] = f98, SIZE STFD [AOFFSET2] = f106, SIZE ;; STFD [AOFFSET] = f99, 5 * SIZE STFD [AOFFSET2] = f107, 5 * SIZE ;; STFD [AOFFSET] = f112, SIZE STFD [AOFFSET2] = f120, SIZE ;; STFD [AOFFSET] = f113, SIZE STFD [AOFFSET2] = f121, SIZE ;; STFD [AOFFSET] = f114, SIZE STFD [AOFFSET2] = f122, SIZE ;; STFD [AOFFSET] = f115, -27 * SIZE STFD [AOFFSET2] = f123, - 27 * SIZE ;; #endif #ifdef RT adds BOFFSET = 62 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f35, f34 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f37, f36 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f39, f38 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f40 = [BOFFSET], -2 * SIZE ;; LDFPD f42, f41 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f44, f43 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f46, f45 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f48, f47 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f50, f49 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f52, f51 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFD f53 = [BOFFSET], -2 * SIZE ;; LDFPD f55, f54 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f57, f56 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFPD f59, f58 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f61, f60 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFD f16 = [BOFFSET], -2 * SIZE ;; LDFPD f18, f17 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFPD f20, f19 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFD f21 = [BOFFSET] ;; FMPY f120 = f120, f32 FMPY f121 = f121, f32 FMPY f122 = f122, f32 FMPY f123 = f123, f32 ;; FNMA f112 = f120, f33, f112 FNMA f113 = f121, f33, f113 FNMA f114 = f122, f33, f114 FNMA f115 = f123, f33, f115 ;; FNMA f104 = f120, f34, f104 FNMA f105 = f121, f34, f105 FNMA f106 = f122, f34, f106 FNMA f107 = f123, f34, f107 ;; FNMA f96 = f120, f35, f96 FNMA f97 = f121, f35, f97 FNMA f98 = f122, f35, f98 FNMA f99 = f123, f35, f99 ;; FNMA f88 = f120, f36, f88 FNMA f89 = f121, f36, f89 FNMA f90 = f122, f36, f90 FNMA f91 = f123, f36, f91 ;; FNMA f80 = f120, f37, f80 FNMA f81 = f121, f37, f81 FNMA f82 = f122, f37, f82 FNMA f83 = f123, f37, f83 ;; FNMA f72 = f120, f38, f72 FNMA 
f73 = f121, f38, f73 FNMA f74 = f122, f38, f74 FNMA f75 = f123, f38, f75 ;; FNMA f64 = f120, f39, f64 FNMA f65 = f121, f39, f65 FNMA f66 = f122, f39, f66 FNMA f67 = f123, f39, f67 ;; FMPY f112 = f112, f40 FMPY f113 = f113, f40 FMPY f114 = f114, f40 FMPY f115 = f115, f40 ;; FNMA f104 = f112, f41, f104 FNMA f105 = f113, f41, f105 FNMA f106 = f114, f41, f106 FNMA f107 = f115, f41, f107 ;; FNMA f96 = f112, f42, f96 FNMA f97 = f113, f42, f97 FNMA f98 = f114, f42, f98 FNMA f99 = f115, f42, f99 ;; FNMA f88 = f112, f43, f88 FNMA f89 = f113, f43, f89 FNMA f90 = f114, f43, f90 FNMA f91 = f115, f43, f91 ;; FNMA f80 = f112, f44, f80 FNMA f81 = f113, f44, f81 FNMA f82 = f114, f44, f82 FNMA f83 = f115, f44, f83 ;; FNMA f72 = f112, f45, f72 FNMA f73 = f113, f45, f73 FNMA f74 = f114, f45, f74 FNMA f75 = f115, f45, f75 ;; FNMA f64 = f112, f46, f64 FNMA f65 = f113, f46, f65 FNMA f66 = f114, f46, f66 FNMA f67 = f115, f46, f67 ;; FMPY f104 = f104, f47 FMPY f105 = f105, f47 FMPY f106 = f106, f47 FMPY f107 = f107, f47 ;; FNMA f96 = f104, f48, f96 FNMA f97 = f105, f48, f97 FNMA f98 = f106, f48, f98 FNMA f99 = f107, f48, f99 ;; FNMA f88 = f104, f49, f88 FNMA f89 = f105, f49, f89 FNMA f90 = f106, f49, f90 FNMA f91 = f107, f49, f91 ;; FNMA f80 = f104, f50, f80 FNMA f81 = f105, f50, f81 FNMA f82 = f106, f50, f82 FNMA f83 = f107, f50, f83 ;; FNMA f72 = f104, f51, f72 FNMA f73 = f105, f51, f73 FNMA f74 = f106, f51, f74 FNMA f75 = f107, f51, f75 ;; FNMA f64 = f104, f52, f64 FNMA f65 = f105, f52, f65 FNMA f66 = f106, f52, f66 FNMA f67 = f107, f52, f67 ;; FMPY f96 = f96, f53 FMPY f97 = f97, f53 FMPY f98 = f98, f53 FMPY f99 = f99, f53 ;; FNMA f88 = f96, f54, f88 FNMA f89 = f97, f54, f89 FNMA f90 = f98, f54, f90 FNMA f91 = f99, f54, f91 ;; FNMA f80 = f96, f55, f80 FNMA f81 = f97, f55, f81 FNMA f82 = f98, f55, f82 FNMA f83 = f99, f55, f83 ;; FNMA f72 = f96, f56, f72 FNMA f73 = f97, f56, f73 FNMA f74 = f98, f56, f74 FNMA f75 = f99, f56, f75 ;; FNMA f64 = f96, f57, f64 FNMA f65 = f97, f57, f65 FNMA f66 = f98, f57, f66 FNMA f67 = f99, f57, f67 ;; FMPY f88 = f88, f58 FMPY f89 = f89, f58 FMPY f90 = f90, f58 FMPY f91 = f91, f58 ;; FNMA f80 = f88, f59, f80 FNMA f81 = f89, f59, f81 FNMA f82 = f90, f59, f82 FNMA f83 = f91, f59, f83 ;; FNMA f72 = f88, f60, f72 FNMA f73 = f89, f60, f73 FNMA f74 = f90, f60, f74 FNMA f75 = f91, f60, f75 ;; FNMA f64 = f88, f61, f64 FNMA f65 = f89, f61, f65 FNMA f66 = f90, f61, f66 FNMA f67 = f91, f61, f67 ;; FMPY f80 = f80, f16 FMPY f81 = f81, f16 FMPY f82 = f82, f16 FMPY f83 = f83, f16 ;; FNMA f72 = f80, f17, f72 FNMA f73 = f81, f17, f73 FNMA f74 = f82, f17, f74 FNMA f75 = f83, f17, f75 ;; FNMA f64 = f80, f18, f64 FNMA f65 = f81, f18, f65 FNMA f66 = f82, f18, f66 FNMA f67 = f83, f18, f67 ;; FMPY f72 = f72, f19 FMPY f73 = f73, f19 FMPY f74 = f74, f19 FMPY f75 = f75, f19 ;; FNMA f64 = f72, f20, f64 FNMA f65 = f73, f20, f65 FNMA f66 = f74, f20, f66 FNMA f67 = f75, f20, f67 ;; FMPY f64 = f64, f21 FMPY f65 = f65, f21 FMPY f66 = f66, f21 FMPY f67 = f67, f21 ;; adds AOFFSET = 24 * SIZE, AOFFSET adds AOFFSET2 = 24 * SIZE, AOFFSET2 ;; STFD [AOFFSET] = f112, SIZE STFD [AOFFSET2] = f120, SIZE ;; STFD [AOFFSET] = f113, SIZE STFD [AOFFSET2] = f121, SIZE ;; STFD [AOFFSET] = f114, SIZE STFD [AOFFSET2] = f122, SIZE ;; STFD [AOFFSET] = f115, - 11 * SIZE STFD [AOFFSET2] = f123, - 11 * SIZE ;; STFD [AOFFSET] = f96, SIZE STFD [AOFFSET2] = f104, SIZE ;; STFD [AOFFSET] = f97, SIZE STFD [AOFFSET2] = f105, SIZE ;; STFD [AOFFSET] = f98, SIZE STFD [AOFFSET2] = f106, SIZE ;; STFD [AOFFSET] = f99, - 11 * SIZE STFD [AOFFSET2] = 
f107, - 11 * SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f88, SIZE ;; STFD [AOFFSET] = f81, SIZE STFD [AOFFSET2] = f89, SIZE ;; STFD [AOFFSET] = f82, SIZE STFD [AOFFSET2] = f90, SIZE ;; STFD [AOFFSET] = f83, - 11 * SIZE STFD [AOFFSET2] = f91, - 11 * SIZE ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f72, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f73, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f74, SIZE ;; STFD [AOFFSET] = f67, - 3 * SIZE STFD [AOFFSET2] = f75, - 3 * SIZE ;; #endif { .mmf STFD [C1 ] = f64, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE } ;; { .mmi STFD [C1 ] = f66, SIZE #ifdef LN adds C3 = -4 * SIZE, C3 #else nop __LINE__ #endif } ;; { .mmi #ifndef LN STFD [C1 ] = f67, SIZE #else STFD [C1 ] = f67, - 3 * SIZE #endif } ;; { .mmf STFD [C2 ] = f72, SIZE mov f72 = f0 } ;; { .mmi STFD [C2 ] = f73, SIZE #ifdef LN adds C4 = -4 * SIZE, C4 #else nop __LINE__ #endif } ;; { .mmi STFD [C2 ] = f74, SIZE } ;; { .mmi #ifndef LN STFD [C2 ] = f75, SIZE #else STFD [C2 ] = f75, - 3 * SIZE #endif #ifdef LN adds C5 = -4 * SIZE, C5 #else nop __LINE__ #endif } ;; { .mmf STFD [C3 ] = f80, SIZE mov f80 = f0 } ;; { .mmi STFD [C3 ] = f81, SIZE } ;; { .mmi STFD [C3 ] = f82, SIZE #ifdef LN adds C6 = -4 * SIZE, C6 #else nop __LINE__ #endif } ;; { .mmi #ifndef LN STFD [C3 ] = f83, SIZE #else STFD [C3 ] = f83, - 3 * SIZE #endif } ;; { .mmf STFD [C4 ] = f88, SIZE mov f88 = f0 } ;; { .mmi STFD [C4 ] = f89, SIZE #ifdef LN adds C8 = -4 * SIZE, C8 #else nop __LINE__ #endif } ;; { .mmi STFD [C4 ] = f90, SIZE } ;; { .mmi #ifndef LN STFD [C4 ] = f91, SIZE #else STFD [C4 ] = f91, - 3 * SIZE #endif nop __LINE__ } ;; { .mmf STFD [C5 ] = f96, SIZE mov f96 = f0 } ;; { .mmi STFD [C5 ] = f97, SIZE nop __LINE__ } ;; { .mmi STFD [C5 ] = f98, SIZE #ifdef LN adds C7 = -4 * SIZE, C7 #else nop __LINE__ #endif } ;; { .mmi #ifndef LN STFD [C5 ] = f99, SIZE #else STFD [C5 ] = f99, - 3 * SIZE #endif } ;; { .mmf STFD [C6 ] = f104, SIZE mov f104 = f0 } ;; { .mmi STFD [C6 ] = f105, SIZE shladd r2 = K, BASE_SHIFT, r0 } ;; { .mmi STFD [C6 ] = f106, SIZE sub L = K, KK } ;; { .mmi #ifndef LN STFD [C6 ] = f107, SIZE #else STFD [C6 ] = f107, - 3 * SIZE #endif #ifdef RT shladd AORIG = r2, 2, AORIG #else nop __LINE__ #endif } ;; { .mmf STFD [C7 ] = f112, SIZE mov f112 = f0 } ;; { .mmi STFD [C7 ] = f113, SIZE #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; { .mmi STFD [C7 ] = f114, SIZE #if defined(LT) || defined(RN) shladd AOFFSET = L, 2, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #ifndef LN STFD [C7 ] = f115, SIZE #else STFD [C7 ] = f115, - 3 * SIZE #endif #if defined(LT) || defined(RN) shladd BOFFSET = L, 3, BOFFSET #else nop __LINE__ #endif } ;; { .mmf STFD [C8 ] = f120, SIZE mov f120 = f0 } ;; { .mmi STFD [C8 ] = f121, SIZE #ifdef LT adds KK = 4, KK #elif defined LN adds KK = -4, KK #else nop __LINE__ #endif } ;; { .mmi STFD [C8 ] = f122, SIZE #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; { .mmb #ifndef LN STFD [C8 ] = f123, SIZE #else STFD [C8 ] = f123, - 3 * SIZE #endif } ;; .align 8 .L010: { .mib cmp.gt p6, p0 = 8, M shr I = M, 3 (p6) br.cond.dpnt .L049 } ;; .align 8 .L011: { .mmi cmp.ne p7, p0 = r0, L shladd r3 = KK, BASE_SHIFT, r0 shl r2 = K, 3 + BASE_SHIFT } ;; { .mmi shladd BOFFSET = r3, 3, B sub AORIG = AORIG, r2 nop __LINE__ } ;; { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE setf.d f64 = r0 mov f72 = f0 } { .mfi setf.d f80 = r0 mov f88 = f0 shladd AOFFSET = r3, 3, AORIG } ;; { .mmf (p7) LDFPD f32, f33 = [AOFFSET], 2 
* SIZE setf.d f96 = r0 mov f104 = f0 } { .mfb setf.d f112 = r0 mov f120 = f0 nop __LINE__ } ;; { .mmf (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE setf.d f65 = r0 mov f73 = f0 } { .mfb setf.d f89 = r0 mov f81 = f0 nop __LINE__ } ;; { .mmf (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE setf.d f97 = r0 mov f105 = f0 } { .mfb setf.d f113 = r0 mov f121 = f0 nop __LINE__ } ;; { .mmf (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE setf.d f66 = r0 mov f74 = f0 } { .mfb setf.d f82 = r0 mov f90 = f0 nop __LINE__ } ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE setf.d f98 = r0 mov f106 = f0 } { .mfi setf.d f114 = r0 mov f122 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mmf (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE setf.d f67 = r0 mov f75 = f0 } { .mfi setf.d f83 = r0 mov f91 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mmf (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE setf.d f99 = r0 mov f107 = f0 } { .mfi setf.d f115 = r0 mov f123 = f0 adds L = 1, L } ;; { .mmf CPREFETCH [PREC], LDC setf.d f68 = r0 mov f76 = f0 } { .mfi setf.d f84 = r0 mov f92 = f0 adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } ;; { .mmf CPREFETCH [PREC], LDC setf.d f100 = r0 mov f108 = f0 } { .mfi setf.d f116 = r0 mov f124 = f0 adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET } ;; { .mmf CPREFETCH [PREC], LDC setf.d f69 = r0 mov f77 = f0 } { .mfi setf.d f85 = r0 mov f93 = f0 tbit.z p12, p0 = L, 0 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f101 = r0 mov f109 = f0 } { .mfi setf.d f117 = r0 mov f125 = f0 shr L = L, 1 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f70 = r0 mov f78 = f0 } { .mfi setf.d f86 = r0 mov f94 = f0 adds L = -1, L } ;; { .mmf CPREFETCH [PREC], LDC setf.d f102 = r0 mov f110 = f0 } { .mfi setf.d f118 = r0 mov f126 = f0 mov ar.lc = L } ;; { .mmf CPREFETCH [PREC], LDC setf.d f71 = r0 mov f79 = f0 } { .mfi setf.d f87 = r0 mov f95 = f0 cmp.eq p6, p0 = -1, L } ;; { .mmf CPREFETCH [PREC] setf.d f103 = r0 mov f111 = f0 } { .mfb setf.d f119 = r0 mov f127 = f0 (p6) br.cond.dpnt .L018 } ;; .align 8 .L012: /* 1 */ { .mfi lfetch.fault.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi (p12) cmp.ne p3, p0 = 0, L FMA f72 = f32, f49, f72 // A1 * B2 nop __LINE__ } ;; /* 2 */ { .mfb lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfb cmp.ne p4, p5 = 0, L FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; /* 3 */ { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb adds C9 = 4 * SIZE, C1 FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;; /* 4 */ { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb adds C10 = 4 * SIZE, C2 FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;; /* 5 */ { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb adds C11 = 4 * SIZE, C3 FMA f73 = f33, f49, f73 // A2 * B2 nop __LINE__ } ;; /* 6 */ { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb adds C12 = 4 * SIZE, C4 FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; /* 7 */ { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb adds C13 = 4 * SIZE, C5 FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } ;; /* 8 */ { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb adds C14 = 4 * SIZE, C6 FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;; /* 9 */ { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * 
SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb adds C15 = 4 * SIZE, C7 FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; /* 10 */ { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb adds C16 = 4 * SIZE, C8 FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; /* 11 */ { .mfb FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f106 = f34, f53, f106 // A3 * B6 nop __LINE__ } ;; /* 12 */ { .mfb FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f122 = f34, f55, f122 // A3 * B8 nop __LINE__ } ;; /* 13 */ { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; /* 14 */ { .mfb FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } ;; /* 15 */ { .mfb FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f107 = f35, f53, f107 // A4 * B6 nop __LINE__ } ;; /* 16 */ { .mfb FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f123 = f35, f55, f123 // A4 * B8 nop __LINE__ } ;; /* 17 */ { .mfb nop __LINE__ FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f76 = f36, f49, f76 // A5 * B2 nop __LINE__ } ;; /* 18 */ { .mfb nop __LINE__ FMA f84 = f36, f50, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f92 = f36, f51, f92 // A5 * B4 nop __LINE__ } ;; /* 19 */ { .mfb nop __LINE__ FMA f100 = f36, f52, f100 // A5 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f108 = f36, f53, f108 // A5 * B6 nop __LINE__ } ;; /* 20 */ { .mfb nop __LINE__ FMA f116 = f36, f54, f116 // A5 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f124 = f36, f55, f124 // A5 * B8 nop __LINE__ } ;; /* 21 */ { .mfb nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = f37, f49, f77 // A6 * B2 nop __LINE__ } ;; /* 22 */ { .mfb nop __LINE__ FMA f85 = f37, f50, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f93 = f37, f51, f93 // A6 * B4 nop __LINE__ } ;; /* 23 */ { .mfb nop __LINE__ FMA f101 = f37, f52, f101 // A6 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f109 = f37, f53, f109 // A6 * B6 nop __LINE__ } ;; /* 24 */ { .mfb nop __LINE__ FMA f117 = f37, f54, f117 // A6 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f125 = f37, f55, f125 // A6 * B8 nop __LINE__ } ;; /* 25 */ { .mfb nop __LINE__ FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f78 = f38, f49, f78 // A7 * B2 nop __LINE__ } ;; /* 26 */ { .mfb nop __LINE__ FMA f86 = f38, f50, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f94 = f38, f51, f94 // A7 * B4 nop __LINE__ } ;; /* 27 */ { .mfb nop __LINE__ FMA f102 = f38, f52, f102 // A7 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f110 = f38, f53, f110 // A7 * B6 nop __LINE__ } ;; /* 28 */ { .mfb nop __LINE__ FMA f118 = f38, f54, f118 // A7 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f126 = f38, f55, f126 // A7 * B8 nop __LINE__ } ;; /* 29 */ { .mfb nop __LINE__ FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f79 = f39, f49, f79 // A8 * B2 nop __LINE__ } ;; /* 30 */ { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f87 = f39, f50, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f95 = f39, f51, f95 // A8 * B4 nop __LINE__ } ;; /* 31 */ { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f103 = f39, f52, f103 // A8 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f111 = f39, f53, f111 // A8 * B6 nop __LINE__ } ;; /* 32 */ { 
.mfb nop __LINE__ FMA f119 = f39, f54, f119 // A8 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f127 = f39, f55, f127 // A8 * B8 nop __LINE__ } ;; /* 33 */ { .mfb nop __LINE__ (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; /* 34 */ { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; /* 35 */ { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; /* 36 */ { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; /* 37 */ { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; /* 38 */ { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; /* 39 */ { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; /* 40 */ { .mfb nop __LINE__ (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f121 = f41, f63, f121 // A2 * B8 nop __LINE__ } ;; /* 41 */ { .mfb nop __LINE__ (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; /* 42 */ { .mfb nop __LINE__ (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; /* 43 */ { .mfb nop __LINE__ (p3) FMA f98 = f42, f60, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f106 = f42, f61, f106 // A3 * B6 nop __LINE__ } ;; /* 44 */ { .mfb nop __LINE__ (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f122 = f42, f63, f122 // A3 * B8 nop __LINE__ } ;; /* 45 */ { .mfb nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; /* 46 */ { .mfb nop __LINE__ (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f91 = f43, f59, f91 // A4 * B4 nop __LINE__ } ;; /* 47 */ { .mfb nop __LINE__ (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f107 = f43, f61, f107 // A4 * B6 nop __LINE__ } ;; /* 48 */ { .mfb nop __LINE__ (p3) FMA f115 = f43, f62, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f123 = f43, f63, f123 // A4 * B8 nop __LINE__ } ;; /* 49 */ { .mfb nop __LINE__ (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f76 = f44, f57, f76 // A5 * B2 nop __LINE__ } ;; /* 50 */ { .mfb nop __LINE__ (p3) FMA f84 = f44, f58, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f92 = f44, f59, f92 // A5 * B4 nop __LINE__ } ;; /* 51 */ { .mfb nop __LINE__ (p3) FMA f100 = f44, f60, f100 // A5 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f108 = f44, f61, f108 // A5 * B6 nop __LINE__ } ;; /* 52 */ { .mfb nop __LINE__ (p3) FMA f116 = f44, f62, f116 // A5 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f124 = f44, 
f63, f124 // A5 * B8 nop __LINE__ } ;; /* 53 */ { .mfb nop __LINE__ (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f77 = f45, f57, f77 // A6 * B2 nop __LINE__ } ;; /* 54 */ { .mfb nop __LINE__ (p3) FMA f85 = f45, f58, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f93 = f45, f59, f93 // A6 * B4 nop __LINE__ } ;; /* 55 */ { .mfb nop __LINE__ (p3) FMA f101 = f45, f60, f101 // A6 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f109 = f45, f61, f109 // A6 * B6 nop __LINE__ } ;; /* 56 */ { .mfb nop __LINE__ (p3) FMA f117 = f45, f62, f117 // A6 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f125 = f45, f63, f125 // A6 * B8 nop __LINE__ } ;; /* 57 */ { .mfb nop __LINE__ (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f78 = f46, f57, f78 // A7 * B2 nop __LINE__ } ;; /* 58 */ { .mfb nop __LINE__ (p3) FMA f86 = f46, f58, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f94 = f46, f59, f94 // A7 * B4 nop __LINE__ } ;; /* 59 */ { .mfb nop __LINE__ (p3) FMA f102 = f46, f60, f102 // A7 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f110 = f46, f61, f110 // A7 * B6 nop __LINE__ } ;; /* 60 */ { .mfb nop __LINE__ (p3) FMA f118 = f46, f62, f118 // A7 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f126 = f46, f63, f126 // A7 * B8 nop __LINE__ } ;; /* 61 */ { .mfb nop __LINE__ (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f79 = f47, f57, f79 // A8 * B2 nop __LINE__ } ;; /* 62 */ { .mfb nop __LINE__ (p3) FMA f87 = f47, f58, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f95 = f47, f59, f95 // A8 * B4 nop __LINE__ } ;; /* 63 */ { .mfb nop __LINE__ (p3) FMA f103 = f47, f60, f103 // A8 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f111 = f47, f61, f111 // A8 * B6 nop __LINE__ } ;; /* 64 */ { .mfi nop __LINE__ (p3) FMA f119 = f47, f62, f119 // A8 * B7 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f127 = f47, f63, f127 // A8 * B8 br.cloop.sptk.few .L012 } ;; .L018: adds r2 = -8, KK ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 3, AORIG shladd BOFFSET = r2, 3, B ;; LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [BOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [BOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [BOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [BOFFSET], 2 * SIZE ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE FSUB f64 = f32, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB f72 = f33, f72 nop __LINE__ } ;; { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE FSUB f80 = f34, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f88 = f35, f88 nop __LINE__ } ;; { .mfi LDFPD f52, f53 = [BOFFSET], 2 * SIZE FSUB f96 = f36, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f104 = f37, f104 nop __LINE__ } ;; { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE FSUB f112 = f38, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f120 = f39, f120 nop __LINE__ } ;; { .mfi LDFPD f56, f57 = [BOFFSET], 2 * SIZE FSUB f65 = f40, f65 nop __LINE__ } { .mfi nop __LINE__ FSUB f73 = f41, f73 nop __LINE__ } ;; { .mfi LDFPD f58, f59 = [BOFFSET], 2 * SIZE FSUB f81 = f42, f81 nop __LINE__ } { .mfi nop __LINE__ FSUB f89 = f43, f89 nop __LINE__ } ;; { .mfi LDFPD f60, f61 = [BOFFSET], 2 * SIZE FSUB f97 = f44, f97 nop __LINE__ } { .mfi nop __LINE__ FSUB f105 = f45, f105 nop __LINE__ } ;; { .mfi LDFPD f62, f63 = [BOFFSET], 2 * SIZE FSUB f113 = f46, f113 nop __LINE__ } { .mfi nop __LINE__ FSUB f121 = f47, f121 nop 
__LINE__ } ;; { .mfi LDFPD f32, f33 = [BOFFSET], 2 * SIZE FSUB f66 = f48, f66 nop __LINE__ } { .mfi nop __LINE__ FSUB f74 = f49, f74 nop __LINE__ } ;; { .mfi LDFPD f34, f35 = [BOFFSET], 2 * SIZE FSUB f82 = f50, f82 nop __LINE__ } { .mfi nop __LINE__ FSUB f90 = f51, f90 nop __LINE__ } ;; { .mfi LDFPD f36, f37 = [BOFFSET], 2 * SIZE FSUB f98 = f52, f98 nop __LINE__ } { .mfi nop __LINE__ FSUB f106 = f53, f106 nop __LINE__ } ;; { .mfi LDFPD f38, f39 = [BOFFSET], 2 * SIZE FSUB f114 = f54, f114 nop __LINE__ } { .mfi nop __LINE__ FSUB f122 = f55, f122 nop __LINE__ } ;; { .mfi LDFPD f40, f41 = [BOFFSET], 2 * SIZE FSUB f67 = f56, f67 nop __LINE__ } { .mfi nop __LINE__ FSUB f75 = f57, f75 nop __LINE__ } ;; { .mfi LDFPD f42, f43 = [BOFFSET], 2 * SIZE FSUB f83 = f58, f83 nop __LINE__ } { .mfi nop __LINE__ FSUB f91 = f59, f91 nop __LINE__ } ;; { .mfi LDFPD f44, f45 = [BOFFSET], 2 * SIZE FSUB f99 = f60, f99 nop __LINE__ } { .mfi nop __LINE__ FSUB f107 = f61, f107 nop __LINE__ } ;; { .mfi LDFPD f46, f47 = [BOFFSET], 2 * SIZE FSUB f115 = f62, f115 nop __LINE__ } { .mfi nop __LINE__ FSUB f123 = f63, f123 nop __LINE__ } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE FSUB f68 = f32, f68 nop __LINE__ } { .mfi nop __LINE__ FSUB f76 = f33, f76 nop __LINE__ } ;; { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE FSUB f84 = f34, f84 nop __LINE__ } { .mfi nop __LINE__ FSUB f92 = f35, f92 nop __LINE__ } ;; { .mfi LDFPD f52, f53 = [BOFFSET], 2 * SIZE FSUB f100 = f36, f100 nop __LINE__ } { .mfi nop __LINE__ FSUB f108 = f37, f108 nop __LINE__ } ;; { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE FSUB f116 = f38, f116 nop __LINE__ } { .mfi nop __LINE__ FSUB f124 = f39, f124 nop __LINE__ } ;; { .mfi LDFPD f56, f57 = [BOFFSET], 2 * SIZE FSUB f69 = f40, f69 nop __LINE__ } { .mfi nop __LINE__ FSUB f77 = f41, f77 nop __LINE__ } ;; { .mfi LDFPD f58, f59 = [BOFFSET], 2 * SIZE FSUB f85 = f42, f85 nop __LINE__ } { .mfi nop __LINE__ FSUB f93 = f43, f93 nop __LINE__ } ;; { .mfi LDFPD f60, f61 = [BOFFSET], 2 * SIZE FSUB f101 = f44, f101 nop __LINE__ } { .mfi nop __LINE__ FSUB f109 = f45, f109 nop __LINE__ } ;; { .mfi LDFPD f62, f63 = [BOFFSET] FSUB f117 = f46, f117 adds BOFFSET = -62 * SIZE, BOFFSET } { .mfi nop __LINE__ FSUB f125 = f47, f125 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f70 = f48, f70 #ifdef LN adds AOFFSET = 62 * SIZE, AOFFSET #else nop __LINE__ #endif } { .mfi nop __LINE__ FSUB f78 = f49, f78 nop __LINE__ } { .mfi nop __LINE__ FSUB f86 = f50, f86 nop __LINE__ } { .mfi nop __LINE__ FSUB f94 = f51, f94 nop __LINE__ } ;; { .mfi #ifdef LN LDFPD f33, f32 = [AOFFSET] #else LDFPD f32, f33 = [AOFFSET] #endif FSUB f102 = f52, f102 nop __LINE__ } { .mfi nop __LINE__ FSUB f110 = f53, f110 nop __LINE__ } { .mfi nop __LINE__ FSUB f118 = f54, f118 nop __LINE__ } { .mfi nop __LINE__ FSUB f126 = f55, f126 #ifdef LN adds AOFFSET = - 2 * SIZE, AOFFSET #else adds AOFFSET = 2 * SIZE, AOFFSET #endif } ;; { .mfi nop __LINE__ FSUB f71 = f56, f71 nop __LINE__ } { .mfi nop __LINE__ FSUB f79 = f57, f79 nop __LINE__ } { .mfi nop __LINE__ FSUB f87 = f58, f87 nop __LINE__ } { .mfi nop __LINE__ FSUB f95 = f59, f95 nop __LINE__ } { .mfi nop __LINE__ FSUB f103 = f60, f103 nop __LINE__ } { .mfi nop __LINE__ FSUB f111 = f61, f111 nop __LINE__ } { .mfi nop __LINE__ FSUB f119 = f62, f119 nop __LINE__ } { .mfi nop __LINE__ FSUB f127 = f63, f127 nop __LINE__ } ;; { .mfi LDFPD f35, f34 = [AOFFSET] FMPY f71 = f71, f32 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f103 = f103, f32 adds BOFFSET2 = 4 * SIZE, BOFFSET } ;; { .mfi LDFPD f37, f36 = 
[AOFFSET] FMPY f79 = f79, f32 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f111 = f111, f32 nop __LINE__ } ;; { .mfi LDFPD f39, f38 = [AOFFSET] FMPY f87 = f87, f32 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f119 = f119, f32 nop __LINE__ } ;; { .mfi LDFD f40 = [AOFFSET], -2 * SIZE FMPY f95 = f95, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f127 = f127, f32 nop __LINE__ } ;; { .mfi LDFPD f42, f41 = [AOFFSET] FNMA f70 = f71, f33, f70 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f102 = f103, f33, f102 nop __LINE__ } ;; { .mfi LDFPD f44, f43 = [AOFFSET] FNMA f78 = f79, f33, f78 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f110 = f111, f33, f110 nop __LINE__ } ;; { .mfi LDFPD f46, f45 = [AOFFSET] FNMA f86 = f87, f33, f86 adds AOFFSET = - 4 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f118 = f119, f33, f118 nop __LINE__ } ;; { .mfi LDFPD f48, f47 = [AOFFSET] FNMA f94 = f95, f33, f94 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f126 = f127, f33, f126 nop __LINE__ } ;; { .mfi LDFPD f50, f49 = [AOFFSET] FNMA f69 = f71, f34, f69 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f101 = f103, f34, f101 nop __LINE__ } ;; { .mfi LDFPD f52, f51 = [AOFFSET] FNMA f77 = f79, f34, f77 adds AOFFSET = - 4 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f109 = f111, f34, f109 nop __LINE__ } ;; { .mfi LDFD f53 = [AOFFSET], -2 * SIZE FNMA f85 = f87, f34, f85 nop __LINE__ } { .mfi nop __LINE__ FNMA f117 = f119, f34, f117 nop __LINE__ } ;; { .mfi LDFPD f55, f54 = [AOFFSET] FNMA f93 = f95, f34, f93 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f125 = f127, f34, f125 nop __LINE__ } ;; { .mfi LDFPD f57, f56 = [AOFFSET] FNMA f68 = f71, f35, f68 adds AOFFSET = - 6 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f100 = f103, f35, f100 nop __LINE__ } ;; { .mfi LDFPD f59, f58 = [AOFFSET] FNMA f76 = f79, f35, f76 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f108 = f111, f35, f108 nop __LINE__ } ;; { .mfi LDFPD f61, f60 = [AOFFSET] FNMA f84 = f87, f35, f84 adds AOFFSET = - 6 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f116 = f119, f35, f116 nop __LINE__ } ;; { .mfi LDFD f16 = [AOFFSET], -2 * SIZE FNMA f92 = f95, f35, f92 nop __LINE__ } { .mfi nop __LINE__ FNMA f124 = f127, f35, f124 nop __LINE__ } ;; { .mfi LDFPD f18, f17 = [AOFFSET] FNMA f67 = f71, f36, f67 adds AOFFSET = - 8 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f99 = f103, f36, f99 nop __LINE__ } ;; { .mfi LDFPD f20, f19 = [AOFFSET] FNMA f75 = f79, f36, f75 adds AOFFSET = - 8 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f107 = f111, f36, f107 nop __LINE__ } ;; { .mfi LDFD f21 = [AOFFSET] FNMA f83 = f87, f36, f83 adds BOFFSET = 56 * SIZE, BOFFSET } { .mfi FNMA f115 = f119, f36, f115 adds BOFFSET2 = 56 * SIZE, BOFFSET2 } ;; FNMA f91 = f95, f36, f91 FNMA f123 = f127, f36, f123 ;; FNMA f66 = f71, f37, f66 FNMA f98 = f103, f37, f98 FNMA f74 = f79, f37, f74 FNMA f106 = f111, f37, f106 FNMA f82 = f87, f37, f82 FNMA f114 = f119, f37, f114 FNMA f90 = f95, f37, f90 FNMA f122 = f127, f37, f122 ;; FNMA f65 = f71, f38, f65 FNMA f97 = f103, f38, f97 FNMA f73 = f79, f38, f73 FNMA f105 = f111, f38, f105 FNMA f81 = f87, f38, f81 FNMA f113 = f119, f38, f113 FNMA f89 = f95, f38, f89 FNMA f121 = f127, f38, f121 ;; FNMA f64 = f71, f39, f64 FNMA f96 = f103, f39, f96 FNMA f72 = f79, f39, f72 FNMA f104 = f111, f39, f104 FNMA f80 = f87, f39, f80 FNMA f112 = f119, f39, f112 FNMA f88 = f95, f39, f88 FNMA f120 = f127, f39, f120 ;; FMPY f70 = f70, f40 FMPY f102 = f102, f40 FMPY f78 
= f78, f40 FMPY f110 = f110, f40 FMPY f86 = f86, f40 FMPY f118 = f118, f40 FMPY f94 = f94, f40 FMPY f126 = f126, f40 ;; FNMA f69 = f70, f41, f69 FNMA f101 = f102, f41, f101 FNMA f77 = f78, f41, f77 FNMA f109 = f110, f41, f109 FNMA f85 = f86, f41, f85 FNMA f117 = f118, f41, f117 FNMA f93 = f94, f41, f93 FNMA f125 = f126, f41, f125 ;; FNMA f68 = f70, f42, f68 FNMA f100 = f102, f42, f100 FNMA f76 = f78, f42, f76 FNMA f108 = f110, f42, f108 FNMA f84 = f86, f42, f84 FNMA f116 = f118, f42, f116 FNMA f92 = f94, f42, f92 FNMA f124 = f126, f42, f124 ;; FNMA f67 = f70, f43, f67 FNMA f99 = f102, f43, f99 FNMA f75 = f78, f43, f75 FNMA f107 = f110, f43, f107 FNMA f83 = f86, f43, f83 FNMA f115 = f118, f43, f115 FNMA f91 = f94, f43, f91 FNMA f123 = f126, f43, f123 ;; FNMA f66 = f70, f44, f66 FNMA f98 = f102, f44, f98 FNMA f74 = f78, f44, f74 FNMA f106 = f110, f44, f106 FNMA f82 = f86, f44, f82 FNMA f114 = f118, f44, f114 FNMA f90 = f94, f44, f90 FNMA f122 = f126, f44, f122 ;; FNMA f65 = f70, f45, f65 FNMA f97 = f102, f45, f97 FNMA f73 = f78, f45, f73 FNMA f105 = f110, f45, f105 FNMA f81 = f86, f45, f81 FNMA f113 = f118, f45, f113 FNMA f89 = f94, f45, f89 FNMA f121 = f126, f45, f121 ;; FNMA f64 = f70, f46, f64 FNMA f96 = f102, f46, f96 FNMA f72 = f78, f46, f72 FNMA f104 = f110, f46, f104 FNMA f80 = f86, f46, f80 FNMA f112 = f118, f46, f112 FNMA f88 = f94, f46, f88 FNMA f120 = f126, f46, f120 ;; FMPY f69 = f69, f47 FMPY f101 = f101, f47 FMPY f77 = f77, f47 FMPY f109 = f109, f47 FMPY f85 = f85, f47 FMPY f117 = f117, f47 FMPY f93 = f93, f47 FMPY f125 = f125, f47 ;; FNMA f68 = f69, f48, f68 FNMA f100 = f101, f48, f100 FNMA f76 = f77, f48, f76 FNMA f108 = f109, f48, f108 FNMA f84 = f85, f48, f84 FNMA f116 = f117, f48, f116 FNMA f92 = f93, f48, f92 FNMA f124 = f125, f48, f124 ;; FNMA f67 = f69, f49, f67 FNMA f99 = f101, f49, f99 FNMA f75 = f77, f49, f75 FNMA f107 = f109, f49, f107 FNMA f83 = f85, f49, f83 FNMA f115 = f117, f49, f115 FNMA f91 = f93, f49, f91 FNMA f123 = f125, f49, f123 ;; FNMA f66 = f69, f50, f66 FNMA f98 = f101, f50, f98 FNMA f74 = f77, f50, f74 FNMA f106 = f109, f50, f106 FNMA f82 = f85, f50, f82 FNMA f114 = f117, f50, f114 FNMA f90 = f93, f50, f90 FNMA f122 = f125, f50, f122 ;; FNMA f65 = f69, f51, f65 FNMA f97 = f101, f51, f97 FNMA f73 = f77, f51, f73 FNMA f105 = f109, f51, f105 FNMA f81 = f85, f51, f81 FNMA f113 = f117, f51, f113 FNMA f89 = f93, f51, f89 FNMA f121 = f125, f51, f121 ;; FNMA f64 = f69, f52, f64 FNMA f96 = f101, f52, f96 FNMA f72 = f77, f52, f72 FNMA f104 = f109, f52, f104 FNMA f80 = f85, f52, f80 FNMA f112 = f117, f52, f112 FNMA f88 = f93, f52, f88 FNMA f120 = f125, f52, f120 ;; FMPY f68 = f68, f53 FMPY f100 = f100, f53 FMPY f76 = f76, f53 FMPY f108 = f108, f53 FMPY f84 = f84, f53 FMPY f116 = f116, f53 FMPY f92 = f92, f53 FMPY f124 = f124, f53 ;; FNMA f67 = f68, f54, f67 FNMA f99 = f100, f54, f99 FNMA f75 = f76, f54, f75 FNMA f107 = f108, f54, f107 FNMA f83 = f84, f54, f83 FNMA f115 = f116, f54, f115 FNMA f91 = f92, f54, f91 FNMA f123 = f124, f54, f123 ;; FNMA f66 = f68, f55, f66 FNMA f98 = f100, f55, f98 FNMA f74 = f76, f55, f74 FNMA f106 = f108, f55, f106 FNMA f82 = f84, f55, f82 FNMA f114 = f116, f55, f114 FNMA f90 = f92, f55, f90 FNMA f122 = f124, f55, f122 ;; FNMA f65 = f68, f56, f65 FNMA f97 = f100, f56, f97 FNMA f73 = f76, f56, f73 FNMA f105 = f108, f56, f105 FNMA f81 = f84, f56, f81 FNMA f113 = f116, f56, f113 FNMA f89 = f92, f56, f89 FNMA f121 = f124, f56, f121 ;; FNMA f64 = f68, f57, f64 FNMA f96 = f100, f57, f96 FNMA f72 = f76, f57, f72 FNMA f104 = f108, f57, f104 
FNMA f80 = f84, f57, f80 FNMA f112 = f116, f57, f112 FNMA f88 = f92, f57, f88 FNMA f120 = f124, f57, f120 ;; FMPY f67 = f67, f58 FMPY f99 = f99, f58 FMPY f75 = f75, f58 FMPY f107 = f107, f58 FMPY f83 = f83, f58 FMPY f115 = f115, f58 FMPY f91 = f91, f58 FMPY f123 = f123, f58 ;; FNMA f66 = f67, f59, f66 FNMA f98 = f99, f59, f98 FNMA f74 = f75, f59, f74 FNMA f106 = f107, f59, f106 FNMA f82 = f83, f59, f82 FNMA f114 = f115, f59, f114 FNMA f90 = f91, f59, f90 FNMA f122 = f123, f59, f122 ;; FNMA f65 = f67, f60, f65 FNMA f97 = f99, f60, f97 FNMA f73 = f75, f60, f73 FNMA f105 = f107, f60, f105 FNMA f81 = f83, f60, f81 FNMA f113 = f115, f60, f113 FNMA f89 = f91, f60, f89 FNMA f121 = f123, f60, f121 ;; { .mfi STFD [BOFFSET] = f71, SIZE FNMA f64 = f67, f61, f64 } { .mfi STFD [BOFFSET2] = f103, SIZE FNMA f96 = f99, f61, f96 } ;; { .mfi STFD [BOFFSET] = f79, SIZE FNMA f72 = f75, f61, f72 } { .mfi STFD [BOFFSET2] = f111, SIZE FNMA f104 = f107, f61, f104 } ;; { .mfi STFD [BOFFSET] = f87, SIZE FNMA f80 = f83, f61, f80 } { .mfi STFD [BOFFSET2] = f119, SIZE FNMA f112 = f115, f61, f112 } ;; { .mfi STFD [BOFFSET] = f95, - 11 * SIZE FNMA f88 = f91, f61, f88 } { .mfi STFD [BOFFSET2] = f127, - 11 * SIZE FNMA f120 = f123, f61, f120 } ;; { .mfi STFD [BOFFSET] = f70, SIZE FMPY f66 = f66, f16 } { .mfi STFD [BOFFSET2] = f102, SIZE FMPY f98 = f98, f16 } ;; { .mfi STFD [BOFFSET] = f78, SIZE FMPY f74 = f74, f16 } { .mfi STFD [BOFFSET2] = f110, SIZE FMPY f106 = f106, f16 } ;; { .mfi STFD [BOFFSET] = f86, SIZE FMPY f82 = f82, f16 } { .mfi STFD [BOFFSET2] = f118, SIZE FMPY f114 = f114, f16 } ;; { .mfi STFD [BOFFSET] = f94, - 11 * SIZE FMPY f90 = f90, f16 } { .mfi STFD [BOFFSET2] = f126, - 11 * SIZE FMPY f122 = f122, f16 } ;; { .mfi STFD [BOFFSET] = f69, SIZE FNMA f65 = f66, f17, f65 } { .mfi STFD [BOFFSET2] = f101, SIZE FNMA f97 = f98, f17, f97 } ;; { .mfi STFD [BOFFSET] = f77, SIZE FNMA f73 = f74, f17, f73 } { .mfi STFD [BOFFSET2] = f109, SIZE FNMA f105 = f106, f17, f105 } ;; { .mfi STFD [BOFFSET] = f85, SIZE FNMA f81 = f82, f17, f81 } { .mfi STFD [BOFFSET2] = f117, SIZE FNMA f113 = f114, f17, f113 } ;; { .mfi STFD [BOFFSET] = f93, - 11 * SIZE FNMA f89 = f90, f17, f89 } { .mfi STFD [BOFFSET2] = f125, - 11 * SIZE FNMA f121 = f122, f17, f121 } ;; { .mfi STFD [BOFFSET] = f68, SIZE FNMA f64 = f66, f18, f64 } { .mfi STFD [BOFFSET2] = f100, SIZE FNMA f96 = f98, f18, f96 } ;; { .mfi STFD [BOFFSET] = f76, SIZE FNMA f72 = f74, f18, f72 } { .mfi STFD [BOFFSET2] = f108, SIZE FNMA f104 = f106, f18, f104 } ;; { .mfi STFD [BOFFSET] = f84, SIZE FNMA f80 = f82, f18, f80 } { .mfi STFD [BOFFSET2] = f116, SIZE FNMA f112 = f114, f18, f112 } ;; { .mfi STFD [BOFFSET] = f92, - 11 * SIZE FNMA f88 = f90, f18, f88 } { .mfi STFD [BOFFSET2] = f124, - 11 * SIZE FNMA f120 = f122, f18, f120 } ;; { .mfi STFD [BOFFSET] = f67, SIZE FMPY f65 = f65, f19 } { .mfi STFD [BOFFSET2] = f99, SIZE FMPY f97 = f97, f19 } ;; { .mfi STFD [BOFFSET] = f75, SIZE FMPY f73 = f73, f19 } { .mfi STFD [BOFFSET2] = f107, SIZE FMPY f105 = f105, f19 } ;; { .mfi STFD [BOFFSET] = f83, SIZE FMPY f81 = f81, f19 } { .mfi STFD [BOFFSET2] = f115, SIZE FMPY f113 = f113, f19 } ;; { .mfi STFD [BOFFSET] = f91, - 11 * SIZE FMPY f89 = f89, f19 } { .mfi STFD [BOFFSET2] = f123, - 11 * SIZE FMPY f121 = f121, f19 } ;; { .mfi STFD [BOFFSET] = f66, SIZE FNMA f64 = f65, f20, f64 } { .mfi STFD [BOFFSET2] = f98, SIZE FNMA f96 = f97, f20, f96 } ;; { .mfi STFD [BOFFSET] = f74, SIZE FNMA f72 = f73, f20, f72 } { .mfi STFD [BOFFSET2] = f106, SIZE FNMA f104 = f105, f20, f104 } ;; { .mfi STFD [BOFFSET] = f82, 
SIZE FNMA f80 = f81, f20, f80 } { .mfi STFD [BOFFSET2] = f114, SIZE FNMA f112 = f113, f20, f112 } ;; { .mfi STFD [BOFFSET] = f90, -11 * SIZE FNMA f88 = f89, f20, f88 } { .mfi STFD [BOFFSET2] = f122, -11 * SIZE FNMA f120 = f121, f20, f120 } ;; { .mfi STFD [BOFFSET] = f65, SIZE FMPY f64 = f64, f21 } { .mfi STFD [BOFFSET2] = f97, SIZE FMPY f96 = f96, f21 } ;; { .mfi STFD [BOFFSET] = f73, SIZE FMPY f72 = f72, f21 } { .mfi STFD [BOFFSET2] = f105, SIZE FMPY f104 = f104, f21 } ;; { .mfi STFD [BOFFSET] = f81, SIZE FMPY f80 = f80, f21 } { .mfi STFD [BOFFSET2] = f113, SIZE FMPY f112 = f112, f21 } ;; { .mfi STFD [BOFFSET] = f89, - 11 * SIZE FMPY f88 = f88, f21 } { .mfi STFD [BOFFSET2] = f121, - 11 * SIZE FMPY f120 = f120, f21 } ;; { .mmi STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE adds C1 = -8 * SIZE, C1 } ;; { .mmi STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f104, SIZE adds C2 = -8 * SIZE, C2 } ;; { .mmi STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f88, - 3 * SIZE STFD [BOFFSET2] = f120, - 3 * SIZE adds C9 = 4 * SIZE, C1 } ;; { .mmf STFD [C1 ] = f64, SIZE STFD [C9 ] = f68, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE STFD [C9 ] = f69, SIZE adds C10 = 4 * SIZE, C2 } ;; { .mmi STFD [C1 ] = f66, SIZE STFD [C9 ] = f70, SIZE adds C3 = -8 * SIZE, C3 } ;; { .mmi STFD [C1 ] = f67, - 3 * SIZE STFD [C9 ] = f71 adds C11 = 4 * SIZE, C3 } ;; { .mmf STFD [C2 ] = f72, SIZE STFD [C10] = f76, SIZE mov f72 = f0 } ;; { .mmi STFD [C2 ] = f73, SIZE STFD [C10] = f77, SIZE adds C4 = -8 * SIZE, C4 } ;; { .mmi STFD [C2 ] = f74, SIZE STFD [C10] = f78, SIZE adds C12 = 4 * SIZE, C4 } ;; { .mmi STFD [C2 ] = f75, - 3 * SIZE STFD [C10] = f79 adds C5 = -8 * SIZE, C5 } ;; { .mmf STFD [C3 ] = f80, SIZE STFD [C11] = f84, SIZE mov f80 = f0 } ;; { .mmi STFD [C3 ] = f81, SIZE STFD [C11] = f85, SIZE adds C13 = 4 * SIZE, C5 } ;; { .mmi STFD [C3 ] = f82, SIZE STFD [C11] = f86, SIZE adds C6 = -8 * SIZE, C6 } ;; { .mmi STFD [C3 ] = f83, - 3 * SIZE STFD [C11] = f87 adds C14 = 4 * SIZE, C6 } ;; { .mmf STFD [C4 ] = f88, SIZE STFD [C12] = f92, SIZE mov f88 = f0 } ;; { .mmi STFD [C4 ] = f89, SIZE STFD [C12] = f93, SIZE adds C8 = -8 * SIZE, C8 } ;; { .mmi STFD [C4 ] = f90, SIZE STFD [C12] = f94, SIZE adds C16 = 4 * SIZE, C8 } ;; { .mmi STFD [C4 ] = f91, - 3 * SIZE STFD [C12] = f95 cmp.ne p6, p0 = 1, I } ;; { .mmf STFD [C5 ] = f96, SIZE STFD [C13] = f100, SIZE mov f96 = f0 } ;; { .mmi STFD [C5 ] = f97, SIZE STFD [C13] = f101, SIZE adds I = -1, I } ;; { .mmi STFD [C5 ] = f98, SIZE STFD [C13] = f102, SIZE adds C7 = -8 * SIZE, C7 } ;; { .mmi STFD [C5 ] = f99, - 3 * SIZE STFD [C13] = f103 adds C15 = 4 * SIZE, C7 } ;; { .mmf STFD [C6 ] = f104, SIZE STFD [C14] = f108, SIZE mov f104 = f0 } ;; { .mmi STFD [C6 ] = f105, SIZE STFD [C14] = f109, SIZE shladd r2 = K, BASE_SHIFT, r0 } ;; { .mmi STFD [C6 ] = f106, SIZE STFD [C14] = f110, SIZE sub L = K, KK } ;; { .mmi STFD [C6 ] = f107, - 3 * SIZE STFD [C14] = f111 nop __LINE__ } ;; { .mmf STFD [C7 ] = f112, SIZE STFD [C15] = f116, SIZE mov f112 = f0 } ;; { .mmi STFD [C7 ] = f113, SIZE STFD [C15] = f117, SIZE nop __LINE__ } ;; { .mmi STFD [C7 ] = f114, SIZE STFD [C15] = f118, SIZE nop __LINE__ } ;; { .mmi STFD [C7 ] = f115, - 3 * SIZE STFD [C15] = f119 nop __LINE__ } ;; { .mmf STFD [C8 ] = f120, SIZE STFD [C16] = f124, SIZE mov f120 = f0 } ;; { .mmi STFD [C8 ] = f121, SIZE STFD [C16] = f125, SIZE adds KK = -8, KK } ;; { .mmi STFD [C8 ] = f122, SIZE STFD [C16] = f126, SIZE sub L = K, KK } ;; { .mmb STFD [C8 ] = f123, - 3 * SIZE STFD [C16] = 
f127 (p6) br.cond.dptk .L011 } ;; .L049: { .mmi adds J = -1, J mov AOFFSET = A shladd KK8 = K, BASE_SHIFT, r0 } ;; { .mmb shladd B = KK8, 3, B cmp.lt p6, p0 = 0, J (p6) br.cond.dptk .L000 } ;; .align 8 .L050: { .mib setf.d f64 = r0 tbit.z p6, p0 = N, 2 (p6) br.cond.dpnt .L090 } ;; #ifdef RT { .mmi shladd r3 = LDC, 2, r0 nop __LINE__ shl r2 = K, 2 + BASE_SHIFT } ;; { .mmi sub B = B, r2 sub C = C, r3 nop __LINE__ } #endif ;; { .mfi mov C1 = C // coffset1 = c + 0 * ldc #ifdef LN add KK = M, OFFSET #elif defined LT mov KK = OFFSET #else nop __LINE__ #endif } ;; { .mmf #if defined(LN) || defined(RT) mov AORIG = A #else mov AOFFSET = A #endif } { .mmf add C2 = LDC, C // coffset2 = c + 1 * ldc shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc } ;; { .mfi #ifndef RT shladd C = LDC, 2, C // coffset += 8 * ldc #else nop __LINE__ #endif #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif }{ .mfb shladd C4 = LDC, 1, C2 } ;; mov f72 = f0 mov f80 = f0 mov f88 = f0 mov f65 = f0 mov f73 = f0 mov f81 = f0 mov f89 = f0 tbit.z p6,p7 = M, 0 (p6) br.cond.dptk .L070 { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 0 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE } ;; #else { .mfi shladd BOFFSET = r3, 2, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE add AOFFSET = r3, AORIG } ;; #endif { .mmi adds L = 1, L adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mii (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi adds L = -1, L } ;; { .mmi cmp.eq p6, p0 = -1, L } ;; { .mib (p7) LDFD f32 = [AOFFSET], 1 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L088 } ;; .L082: { .mfb cmp.ne p4, p5 = 0, L FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi (p12) cmp.ne p3, p0 = 0, L FMA f72 = f32, f49, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfb (p3) LDFD f40 = [AOFFSET], 1 * SIZE FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mmf (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 } { .mmf nop __LINE__ nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 } ;; { .mib (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE nop __LINE__ nop __LINE__ } { .mmb nop __LINE__ adds L = -1, L br.cloop.sptk.few .L082 } ;; .L088: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -1, KK #else adds r2 = -4, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 2, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET] adds BOFFSET = -2 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f80 = f34, f80 FSUB f88 = f35, f88 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = -2 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f80 = f34, f80 FSUB f88 = f35, f88 ;; #endif #ifdef LN LDFD f32 = [AOFFSET] ;; FMPY f64 = f64, f32 FMPY f72 
= f72, f32 FMPY f80 = f80, f32 FMPY f88 = f88, f32 ;; { .mmi STFD [BOFFSET] = f64, SIZE adds C1 = -1 * SIZE, C1 } ;; { .mmi STFD [BOFFSET] = f72, SIZE adds C2 = -1 * SIZE, C2 } ;; { .mmi STFD [BOFFSET] = f80, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f88, - 3 * SIZE } ;; adds C3 = -1 * SIZE, C3 adds C4 = -1 * SIZE, C4 ;; #endif #ifdef LT LDFD f32 = [AOFFSET] ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 FMPY f80 = f80, f32 FMPY f88 = f88, f32 ;; STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f72, SIZE ;; STFD [BOFFSET] = f80, SIZE ;; STFD [BOFFSET] = f88, -3 * SIZE ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f36 = [BOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f39, f40 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f41 = [BOFFSET], -15 * SIZE FMPY f64 = f64, f32 ;; FNMA f72 = f64, f33, f72 ;; FNMA f80 = f64, f34, f80 ;; FNMA f88 = f64, f35, f88 ;; FMPY f72 = f72, f36 ;; FNMA f80 = f72, f37, f80 ;; FNMA f88 = f72, f38, f88 ;; FMPY f80 = f80, f39 ;; FNMA f88 = f80, f40, f88 ;; FMPY f88 = f88, f41 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f72, SIZE ;; STFD [AOFFSET] = f80, SIZE ;; STFD [AOFFSET] = f88, -3 * SIZE ;; #endif #ifdef RT adds BOFFSET = 14 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f35, f34 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f36 = [BOFFSET], - 2 * SIZE ;; LDFPD f38, f37 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f40, f39 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFD f41 = [BOFFSET] ;; FMPY f88 = f88, f32 ;; FNMA f80 = f88, f33, f80 ;; FNMA f72 = f88, f34, f72 ;; FNMA f64 = f88, f35, f64 ;; FMPY f80 = f80, f36 ;; FNMA f72 = f80, f37, f72 ;; FNMA f64 = f80, f38, f64 ;; FMPY f72 = f72, f39 ;; FNMA f64 = f72, f40, f64 ;; FMPY f64 = f64, f41 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f72, SIZE ;; STFD [AOFFSET] = f80, SIZE ;; STFD [AOFFSET] = f88, - 3 * SIZE ;; #endif #ifndef LN STFD [C1 ] = f64, SIZE #else STFD [C1 ] = f64 #endif #ifndef LN STFD [C2 ] = f72, SIZE #else STFD [C2 ] = f72 #endif #ifndef LN STFD [C3 ] = f80, SIZE #else STFD [C3 ] = f80 #endif #ifndef LN STFD [C4 ] = f88, SIZE #else STFD [C4 ] = f88 #endif ;; mov f64 = f0 mov f72 = f0 mov f80 = f0 mov f88 = f0 ;; shladd r2 = K, BASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT add AORIG = r2, AORIG #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) add AOFFSET = L, AOFFSET #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) shladd BOFFSET = L, 2, BOFFSET #else nop __LINE__ #endif ;; #ifdef LT adds KK = 1, KK #elif defined LN adds KK = -1, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; .align 8 .L070: tbit.z p6,p7 = M, 1 (p6) br.cond.dptk .L060 ;; { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 1 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE setf.d f73 = r0 mov f65 = f0 } ;; #else { .mfi shladd BOFFSET = r3, 2, B mov f65 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f73 = f0 shladd AOFFSET = r3, 1, AORIG } ;; #endif { .mfi mov f81 = f0 adds L = 
1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET mov f89 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mmf adds L = -1, L } ;; { .mmf cmp.eq p6, p0 = -1, L } ;; { .mib (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L078 } ;; .align 8 .L072: { .mfb lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 } { .mmf nop __LINE__ nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 br.cloop.sptk.few .L072 } ;; .L078: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -2, KK #else adds r2 = -4, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 2, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f80 = f34, f80 FSUB f88 = f35, f88 FSUB f65 = f36, f65 FSUB f73 = f37, f73 FSUB f81 = f38, f81 FSUB f89 = f39, f89 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f72 = f34, f72 FSUB f73 = f35, f73 FSUB f80 = f36, f80 FSUB f81 = f37, f81 FSUB f88 = f38, f88 FSUB f89 = f39, f89 ;; #endif #ifdef LN adds AOFFSET = 2 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f34 = [AOFFSET] ;; FMPY f65 = f65, f32 FMPY f73 = f73, f32 FMPY f81 = f81, f32 FMPY f89 = f89, f32 ;; FNMA f64 = f65, f33, f64 FNMA f72 = f73, f33, f72 FNMA f80 = f81, f33, f80 FNMA f88 = f89, f33, f88 ;; FMPY f64 = f64, f34 FMPY f72 = f72, f34 FMPY f80 = f80, f34 FMPY f88 = f88, f34 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f65, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f73, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f81, SIZE ;; STFD [BOFFSET] = f88, - 3 * SIZE STFD [BOFFSET2] = f89, - 3 * SIZE ;; adds C1 = -2 * SIZE, C1 adds C2 = -2 * SIZE, C2 adds C3 = -2 * SIZE, C3 adds C4 = -2 * SIZE, C4 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET] 
adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f34 = [AOFFSET], - 3 * SIZE ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 FMPY f80 = f80, f32 FMPY f88 = f88, f32 ;; FNMA f65 = f64, f33, f65 FNMA f73 = f72, f33, f73 FNMA f81 = f80, f33, f81 FNMA f89 = f88, f33, f89 ;; FMPY f65 = f65, f34 FMPY f73 = f73, f34 FMPY f81 = f81, f34 FMPY f89 = f89, f34 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f65, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f73, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f81, SIZE ;; STFD [BOFFSET] = f88, -3 * SIZE STFD [BOFFSET2] = f89, -3 * SIZE ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f36 = [BOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f39, f40 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f41 = [BOFFSET], -15 * SIZE ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 ;; FNMA f72 = f64, f33, f72 FNMA f73 = f65, f33, f73 ;; FNMA f80 = f64, f34, f80 FNMA f81 = f65, f34, f81 ;; FNMA f88 = f64, f35, f88 FNMA f89 = f65, f35, f89 ;; FMPY f72 = f72, f36 FMPY f73 = f73, f36 ;; FNMA f80 = f72, f37, f80 FNMA f81 = f73, f37, f81 ;; FNMA f88 = f72, f38, f88 FNMA f89 = f73, f38, f89 ;; FMPY f80 = f80, f39 FMPY f81 = f81, f39 ;; FNMA f88 = f80, f40, f88 FNMA f89 = f81, f40, f89 ;; FMPY f88 = f88, f41 FMPY f89 = f89, f41 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f80, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f81, SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f88, SIZE ;; STFD [AOFFSET] = f73, -3 * SIZE STFD [AOFFSET2] = f89, -3 * SIZE ;; #endif #ifdef RT adds BOFFSET = 14 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f35, f34 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f36 = [BOFFSET], - 2 * SIZE ;; LDFPD f38, f37 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f40, f39 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFD f41 = [BOFFSET] ;; FMPY f88 = f88, f32 FMPY f89 = f89, f32 ;; FNMA f80 = f88, f33, f80 FNMA f81 = f89, f33, f81 ;; FNMA f72 = f88, f34, f72 FNMA f73 = f89, f34, f73 ;; FNMA f64 = f88, f35, f64 FNMA f65 = f89, f35, f65 ;; FMPY f80 = f80, f36 FMPY f81 = f81, f36 ;; FNMA f72 = f80, f37, f72 FNMA f73 = f81, f37, f73 ;; FNMA f64 = f80, f38, f64 FNMA f65 = f81, f38, f65 ;; FMPY f72 = f72, f39 FMPY f73 = f73, f39 ;; FNMA f64 = f72, f40, f64 FNMA f65 = f73, f40, f65 ;; FMPY f64 = f64, f41 FMPY f65 = f65, f41 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f65, SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f73, SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f81, SIZE ;; STFD [AOFFSET] = f88, -3 * SIZE STFD [AOFFSET2] = f89, -3 * SIZE ;; #endif STFD [C1 ] = f64, SIZE mov f64 = f0 ;; #ifndef LN STFD [C1 ] = f65, SIZE #else STFD [C1 ] = f65, -SIZE #endif ;; STFD [C2 ] = f72, SIZE mov f72 = f0 ;; #ifndef LN STFD [C2 ] = f73, SIZE #else STFD [C2 ] = f73, -SIZE #endif ;; STFD [C3 ] = f80, SIZE mov f80 = f0 ;; #ifndef LN STFD [C3 ] = f81, SIZE #else STFD [C3 ] = f81, - SIZE #endif ;; STFD [C4 ] = f88, SIZE mov f88 = f0 ;; #ifndef LN STFD [C4 ] = f89, SIZE #else STFD [C4 ] = f89, -SIZE #endif ;; mov f96 = f0 ;; mov f104 = f0 ;; shladd r2 = K, BASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 1, AORIG #else nop __LINE__ #endif ;; mov f112 = f0 ;; { .mmi #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd AOFFSET = L, 1, AOFFSET #else nop __LINE__ 
#endif } ;; { .mmi #if defined(LT) || defined(RN) shladd BOFFSET = L, 2, BOFFSET #else nop __LINE__ #endif } ;; { .mmf mov f120 = f0 } ;; { .mmi #ifdef LT adds KK = 2, KK #elif defined LN adds KK = -2, KK #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; .align 8 .L060: tbit.z p6, p7 = M, 2 (p6) br.cond.dptk .L051 ;; { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 2 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE } ;; #else { .mfi shladd BOFFSET = r3, 2, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE shladd AOFFSET = r3, 2, AORIG } ;; #endif { .mfi adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mfi adds L = -1, L } ;; { .mfi cmp.eq p6, p0 = -1, L } ;; { .mmf (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE } { .mfi mov ar.lc = L } ;; mov f64 = f0 mov f65 = f0 mov f66 = f0 mov f67 = f0 mov f72 = f0 mov f73 = f0 mov f74 = f0 mov f75 = f0 mov f80 = f0 mov f81 = f0 mov f82 = f0 mov f83 = f0 mov f88 = f0 mov f89 = f0 mov f90 = f0 mov f91 = f0 ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE } { .mfb (p6) br.cond.dpnt .L068 } ;; .align 8 .L062: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 (p5) adds C9 = 2 * SIZE, C1 } { .mfi nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 (p5) adds C10 = 2 * SIZE, C2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 (p5) adds C11 = 2 * SIZE, C3 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 (p5) adds C12 = 2 * SIZE, C4 } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop 
__LINE__ } { .mfb nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f83 = f43, f58, f83 // A4 * B3 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f91 = f43, f59, f91 // A4 * B4 br.cloop.sptk.few .L062 } ;; .align 8 .L068: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -4, KK #else adds r2 = -4, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 2, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [BOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [BOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [BOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [BOFFSET] adds BOFFSET = -14 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f80 = f34, f80 FSUB f88 = f35, f88 ;; FSUB f65 = f36, f65 FSUB f73 = f37, f73 FSUB f81 = f38, f81 FSUB f89 = f39, f89 ;; FSUB f66 = f40, f66 FSUB f74 = f41, f74 FSUB f82 = f42, f82 FSUB f90 = f43, f90 ;; FSUB f67 = f44, f67 FSUB f75 = f45, f75 FSUB f83 = f46, f83 FSUB f91 = f47, f91 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [AOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [AOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [AOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [AOFFSET] adds AOFFSET = -14 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f72 = f36, f72 FSUB f73 = f37, f73 FSUB f74 = f38, f74 FSUB f75 = f39, f75 FSUB f80 = f40, f80 FSUB f81 = f41, f81 FSUB f82 = f42, f82 FSUB f83 = f43, f83 FSUB f88 = f44, f88 FSUB f89 = f45, f89 FSUB f90 = f46, f90 FSUB f91 = f47, f91 ;; #endif #ifdef LN adds AOFFSET = 14 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f35, f34 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f36 = [AOFFSET], - 2 * SIZE ;; LDFPD f38, f37 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f40, f39 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET] ;; FMPY f67 = f67, f32 FMPY f75 = f75, f32 FMPY f83 = f83, f32 FMPY f91 = f91, f32 ;; FNMA f66 = f67, f33, f66 FNMA f74 = f75, f33, f74 FNMA f82 = f83, f33, f82 FNMA f90 = f91, f33, f90 ;; FNMA f65 = f67, f34, f65 FNMA f73 = f75, f34, f73 FNMA f81 = f83, f34, f81 FNMA f89 = f91, f34, f89 ;; FNMA f64 = f67, f35, f64 FNMA f72 = f75, f35, f72 FNMA f80 = f83, f35, f80 FNMA f88 = f91, f35, f88 ;; FMPY f66 = f66, f36 FMPY f74 = f74, f36 FMPY f82 = f82, f36 FMPY f90 = f90, f36 ;; FNMA f65 = f66, f37, f65 FNMA f73 = f74, f37, f73 FNMA f81 = f82, f37, f81 FNMA f89 = f90, f37, f89 ;; FNMA f64 = f66, f38, f64 FNMA f72 = f74, f38, f72 FNMA f80 = f82, f38, f80 FNMA f88 = f90, f38, f88 ;; FMPY f65 = f65, f39 FMPY f73 = f73, f39 FMPY f81 = f81, f39 FMPY f89 = f89, f39 ;; FNMA f64 = f65, f40, f64 FNMA f72 = f73, 
f40, f72 FNMA f80 = f81, f40, f80 FNMA f88 = f89, f40, f88 ;; FMPY f64 = f64, f41 FMPY f72 = f72, f41 FMPY f80 = f80, f41 FMPY f88 = f88, f41 ;; adds BOFFSET = 8 * SIZE, BOFFSET adds BOFFSET2 = 8 * SIZE, BOFFSET2 ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f74, SIZE STFD [BOFFSET2] = f75, SIZE ;; STFD [BOFFSET] = f82, SIZE STFD [BOFFSET2] = f83, SIZE ;; STFD [BOFFSET] = f90, - 11 * SIZE STFD [BOFFSET2] = f91, - 11 * SIZE ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f65, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f73, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f81, SIZE ;; STFD [BOFFSET] = f88, -3 * SIZE STFD [BOFFSET2] = f89, -3 * SIZE ;; adds C1 = -4 * SIZE, C1 adds C2 = -4 * SIZE, C2 adds C3 = -4 * SIZE, C3 adds C4 = -4 * SIZE, C4 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f36 = [AOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f39, f40 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET], -15 * SIZE ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 FMPY f80 = f80, f32 FMPY f88 = f88, f32 ;; FNMA f65 = f64, f33, f65 FNMA f73 = f72, f33, f73 FNMA f81 = f80, f33, f81 FNMA f89 = f88, f33, f89 ;; FNMA f66 = f64, f34, f66 FNMA f74 = f72, f34, f74 FNMA f82 = f80, f34, f82 FNMA f90 = f88, f34, f90 ;; FNMA f67 = f64, f35, f67 FNMA f75 = f72, f35, f75 FNMA f83 = f80, f35, f83 FNMA f91 = f88, f35, f91 ;; FMPY f65 = f65, f36 FMPY f73 = f73, f36 FMPY f81 = f81, f36 FMPY f89 = f89, f36 ;; FNMA f66 = f65, f37, f66 FNMA f74 = f73, f37, f74 FNMA f82 = f81, f37, f82 FNMA f90 = f89, f37, f90 ;; FNMA f67 = f65, f38, f67 FNMA f75 = f73, f38, f75 FNMA f83 = f81, f38, f83 FNMA f91 = f89, f38, f91 ;; FMPY f66 = f66, f39 FMPY f74 = f74, f39 FMPY f82 = f82, f39 FMPY f90 = f90, f39 ;; FNMA f67 = f66, f40, f67 FNMA f75 = f74, f40, f75 FNMA f83 = f82, f40, f83 FNMA f91 = f90, f40, f91 ;; FMPY f67 = f67, f41 FMPY f75 = f75, f41 FMPY f83 = f83, f41 FMPY f91 = f91, f41 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f65, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f73, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f81, SIZE ;; STFD [BOFFSET] = f88, 5 * SIZE STFD [BOFFSET2] = f89, 5 * SIZE ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f74, SIZE STFD [BOFFSET2] = f75, SIZE ;; STFD [BOFFSET] = f82, SIZE STFD [BOFFSET2] = f83, SIZE ;; STFD [BOFFSET] = f90, -11 * SIZE STFD [BOFFSET2] = f91, -11 * SIZE ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f36 = [BOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f39, f40 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f41 = [BOFFSET], -15 * SIZE ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 FMPY f66 = f66, f32 FMPY f67 = f67, f32 ;; FNMA f72 = f64, f33, f72 FNMA f73 = f65, f33, f73 FNMA f74 = f66, f33, f74 FNMA f75 = f67, f33, f75 ;; FNMA f80 = f64, f34, f80 FNMA f81 = f65, f34, f81 FNMA f82 = f66, f34, f82 FNMA f83 = f67, f34, f83 ;; FNMA f88 = f64, f35, f88 FNMA f89 = f65, f35, f89 FNMA f90 = f66, f35, f90 FNMA f91 = f67, f35, f91 ;; FMPY f72 = f72, f36 FMPY f73 = f73, f36 FMPY f74 = f74, f36 FMPY f75 = f75, f36 ;; FNMA f80 = f72, f37, f80 FNMA f81 = f73, f37, f81 FNMA f82 = f74, f37, f82 FNMA f83 = f75, f37, f83 ;; FNMA f88 = f72, f38, f88 FNMA f89 = f73, f38, f89 FNMA f90 = f74, f38, f90 FNMA f91 = f75, f38, f91 ;; FMPY 
f80 = f80, f39 FMPY f81 = f81, f39 FMPY f82 = f82, f39 FMPY f83 = f83, f39 ;; FNMA f88 = f80, f40, f88 FNMA f89 = f81, f40, f89 FNMA f90 = f82, f40, f90 FNMA f91 = f83, f40, f91 ;; FMPY f88 = f88, f41 FMPY f89 = f89, f41 FMPY f90 = f90, f41 FMPY f91 = f91, f41 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f72, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f73, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f74, SIZE ;; STFD [AOFFSET] = f67, 5 * SIZE STFD [AOFFSET2] = f75, 5 * SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f88, SIZE ;; STFD [AOFFSET] = f81, SIZE STFD [AOFFSET2] = f89, SIZE ;; STFD [AOFFSET] = f82, SIZE STFD [AOFFSET2] = f90, SIZE ;; STFD [AOFFSET] = f83, -11 * SIZE STFD [AOFFSET2] = f91, -11 * SIZE ;; #endif #ifdef RT adds BOFFSET = 14 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f35, f34 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f36 = [BOFFSET], - 2 * SIZE ;; LDFPD f38, f37 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f40, f39 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFD f41 = [BOFFSET] ;; FMPY f88 = f88, f32 FMPY f89 = f89, f32 FMPY f90 = f90, f32 FMPY f91 = f91, f32 ;; FNMA f80 = f88, f33, f80 FNMA f81 = f89, f33, f81 FNMA f82 = f90, f33, f82 FNMA f83 = f91, f33, f83 ;; FNMA f72 = f88, f34, f72 FNMA f73 = f89, f34, f73 FNMA f74 = f90, f34, f74 FNMA f75 = f91, f34, f75 ;; FNMA f64 = f88, f35, f64 FNMA f65 = f89, f35, f65 FNMA f66 = f90, f35, f66 FNMA f67 = f91, f35, f67 ;; FMPY f80 = f80, f36 FMPY f81 = f81, f36 FMPY f82 = f82, f36 FMPY f83 = f83, f36 ;; FNMA f72 = f80, f37, f72 FNMA f73 = f81, f37, f73 FNMA f74 = f82, f37, f74 FNMA f75 = f83, f37, f75 ;; FNMA f64 = f80, f38, f64 FNMA f65 = f81, f38, f65 FNMA f66 = f82, f38, f66 FNMA f67 = f83, f38, f67 ;; FMPY f72 = f72, f39 FMPY f73 = f73, f39 FMPY f74 = f74, f39 FMPY f75 = f75, f39 ;; FNMA f64 = f72, f40, f64 FNMA f65 = f73, f40, f65 FNMA f66 = f74, f40, f66 FNMA f67 = f75, f40, f67 ;; FMPY f64 = f64, f41 FMPY f65 = f65, f41 FMPY f66 = f66, f41 FMPY f67 = f67, f41 ;; adds AOFFSET = 8 * SIZE, AOFFSET adds AOFFSET2 = 8 * SIZE, AOFFSET2 ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f88, SIZE ;; STFD [AOFFSET] = f81, SIZE STFD [AOFFSET2] = f89, SIZE ;; STFD [AOFFSET] = f82, SIZE STFD [AOFFSET2] = f90, SIZE ;; STFD [AOFFSET] = f83, - 11 * SIZE STFD [AOFFSET2] = f91, - 11 * SIZE ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f72, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f73, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f74, SIZE ;; STFD [AOFFSET] = f67, - 3 * SIZE STFD [AOFFSET2] = f75, - 3 * SIZE ;; #endif { .mmf STFD [C1 ] = f64, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE } ;; { .mmi STFD [C1 ] = f66, SIZE } ;; { .mmi #ifndef LN STFD [C1 ] = f67, SIZE #else STFD [C1 ] = f67, - 3 * SIZE #endif } ;; { .mmf STFD [C2 ] = f72, SIZE mov f72 = f0 } ;; { .mmi STFD [C2 ] = f73, SIZE } ;; { .mmi STFD [C2 ] = f74, SIZE } ;; { .mmi #ifndef LN STFD [C2 ] = f75, SIZE #else STFD [C2 ] = f75, - 3 * SIZE #endif } ;; { .mmf STFD [C3 ] = f80, SIZE mov f80 = f0 } ;; { .mmi STFD [C3 ] = f81, SIZE } ;; { .mmi STFD [C3 ] = f82, SIZE } ;; { .mmi #ifndef LN STFD [C3 ] = f83, SIZE #else STFD [C3 ] = f83, - 3 * SIZE #endif } ;; { .mmf STFD [C4 ] = f88, SIZE mov f88 = f0 } ;; { .mmi STFD [C4 ] = f89, SIZE } ;; { .mmi STFD [C4 ] = f90, SIZE } ;; { .mmi #ifndef LN STFD [C4 ] = f91, SIZE #else STFD [C4 ] = f91, - 3 * SIZE #endif nop __LINE__ } ;; mov f65 = f0 ;; mov f73 = f0 ;; shladd r2 = K, BASE_SHIFT, r0 ;; { .mmi sub L 
= K, KK } ;; { .mmi #ifdef RT shladd AORIG = r2, 2, AORIG #else nop __LINE__ #endif } ;; { .mmf mov f81 = f0 } ;; { .mmi #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd AOFFSET = L, 2, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd BOFFSET = L, 2, BOFFSET #else nop __LINE__ #endif } ;; { .mmf mov f89 = f0 } ;; { .mmi #ifdef LT adds KK = 4, KK #elif defined LN adds KK = -4, KK #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; .align 8 .L051: mov f72 = f0 mov f80 = f0 mov f88 = f0 mov f65 = f0 mov f73 = f0 mov f81 = f0 mov f89 = f0 shr I = M, 3 ;; cmp.eq p6, p7 = 0, I (p6) br.cond.dpnt .L089 ;; .align 16 .L052: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 3 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE nop __LINE__ nop __LINE__ } ;; #else { .mfi shladd BOFFSET = r3, 2, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE shladd AOFFSET = r3, 3, AORIG } ;; #endif { .mfi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f74 = f0 nop __LINE__ } ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE setf.d f82 = r0 mov f90 = f0 } ;; { .mmf (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE setf.d f67 = r0 mov f75 = f0 } { .mfi setf.d f83 = r0 mov f91 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mmf (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE } { .mfi adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f68 = r0 mov f76 = f0 } { .mfi setf.d f84 = r0 mov f92 = f0 adds L = 1, L } ;; { .mmf CPREFETCH [PREC], LDC } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } ;; { .mmf CPREFETCH [PREC], LDC setf.d f69 = r0 mov f77 = f0 } { .mfi setf.d f85 = r0 mov f93 = f0 adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET } ;; { .mmf CPREFETCH [PREC] } ;; { .mfi setf.d f70 = r0 mov f78 = f0 tbit.z p12, p0 = L, 0 } { .mfi setf.d f86 = r0 mov f94 = f0 shr L = L, 1 } ;; { .mfi setf.d f71 = r0 adds L = -1, L } ;; { .mfi setf.d f87 = r0 mov f79 = f0 mov ar.lc = L } { .mfb cmp.eq p6, p0 = -1, L mov f95 = f0 (p6) br.cond.dpnt .L058 } ;; .align 8 .L053: { .mfb lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 adds C9 = 4 * SIZE, C1 } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 adds C10 = 4 * SIZE, C2 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 adds C11 = 4 * SIZE, C3 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 adds C12 = 4 * SIZE, C4 } { .mfb nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f67 = 
f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f76 = f36, f49, f76 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f84 = f36, f50, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f92 = f36, f51, f92 // A5 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = f37, f49, f77 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f85 = f37, f50, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f93 = f37, f51, f93 // A6 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f78 = f38, f49, f78 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f86 = f38, f50, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f94 = f38, f51, f94 // A7 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f79 = f39, f49, f79 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f87 = f39, f50, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f95 = f39, f51, f95 // A8 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f91 = f43, f59, f91 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f76 = f44, f57, f76 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f84 = f44, f58, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f92 = f44, f59, f92 // A5 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f77 = f45, f57, f77 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f85 = f45, f58, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f93 = f45, f59, f93 // A6 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f78 = f46, 
f57, f78 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f86 = f46, f58, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f94 = f46, f59, f94 // A7 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f79 = f47, f57, f79 // A8 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f87 = f47, f58, f87 // A8 * B3 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f95 = f47, f59, f95 // A8 * B4 br.cloop.sptk.few .L053 } ;; .align 8 .L058: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -8, KK #else adds r2 = -4, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 3, AORIG shladd BOFFSET = r2, 2, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [BOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [BOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [BOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [BOFFSET], 2 * SIZE ;; LDFPD f48, f49 = [BOFFSET], 2 * SIZE ;; LDFPD f50, f51 = [BOFFSET], 2 * SIZE ;; LDFPD f52, f53 = [BOFFSET], 2 * SIZE ;; LDFPD f54, f55 = [BOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [BOFFSET], 2 * SIZE ;; LDFPD f58, f59 = [BOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [BOFFSET], 2 * SIZE ;; LDFPD f62, f63 = [BOFFSET] adds BOFFSET = -30 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f80 = f34, f80 FSUB f88 = f35, f88 FSUB f65 = f36, f65 FSUB f73 = f37, f73 FSUB f81 = f38, f81 FSUB f89 = f39, f89 FSUB f66 = f40, f66 FSUB f74 = f41, f74 FSUB f82 = f42, f82 FSUB f90 = f43, f90 FSUB f67 = f44, f67 FSUB f75 = f45, f75 FSUB f83 = f46, f83 FSUB f91 = f47, f91 FSUB f68 = f48, f68 FSUB f76 = f49, f76 FSUB f84 = f50, f84 FSUB f92 = f51, f92 FSUB f69 = f52, f69 FSUB f77 = f53, f77 FSUB f85 = f54, f85 FSUB f93 = f55, f93 FSUB f70 = f56, f70 FSUB f78 = f57, f78 FSUB f86 = f58, f86 FSUB f94 = f59, f94 FSUB f71 = f60, f71 FSUB f79 = f61, f79 FSUB f87 = f62, f87 FSUB f95 = f63, f95 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [AOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [AOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [AOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [AOFFSET], 2 * SIZE ;; LDFPD f48, f49 = [AOFFSET], 2 * SIZE ;; LDFPD f50, f51 = [AOFFSET], 2 * SIZE ;; LDFPD f52, f53 = [AOFFSET], 2 * SIZE ;; LDFPD f54, f55 = [AOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [AOFFSET], 2 * SIZE ;; LDFPD f58, f59 = [AOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [AOFFSET], 2 * SIZE ;; LDFPD f62, f63 = [AOFFSET] adds AOFFSET = -30 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f68 = f36, f68 FSUB f69 = f37, f69 FSUB f70 = f38, f70 FSUB f71 = f39, f71 ;; FSUB f72 = f40, f72 FSUB f73 = f41, f73 FSUB f74 = f42, f74 FSUB f75 = f43, f75 FSUB f76 = f44, f76 FSUB f77 = f45, f77 FSUB f78 = f46, f78 FSUB f79 = f47, f79 ;; FSUB f80 = f48, f80 FSUB f81 = f49, f81 FSUB f82 = f50, f82 FSUB f83 = f51, f83 FSUB f84 = f52, f84 FSUB f85 = f53, f85 FSUB f86 = f54, f86 FSUB f87 = f55, f87 FSUB f88 = f56, f88 FSUB f89 = f57, f89 FSUB f90 = f58, f90 FSUB f91 = f59, f91 FSUB f92 = f60, f92 FSUB f93 = f61, f93 FSUB f94 = f62, f94 FSUB f95 = f63, f95 ;; #endif #ifdef LN adds AOFFSET = 62 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, 
AOFFSET ;; LDFPD f35, f34 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f37, f36 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f39, f38 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f40 = [AOFFSET], -2 * SIZE ;; LDFPD f42, f41 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f44, f43 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f46, f45 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f48, f47 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f50, f49 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f52, f51 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFD f53 = [AOFFSET], -2 * SIZE ;; LDFPD f55, f54 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f57, f56 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; LDFPD f59, f58 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f61, f60 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; LDFD f16 = [AOFFSET], -2 * SIZE ;; LDFPD f18, f17 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFPD f20, f19 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFD f21 = [AOFFSET] ;; FMPY f71 = f71, f32 FMPY f79 = f79, f32 FMPY f87 = f87, f32 FMPY f95 = f95, f32 ;; FNMA f70 = f71, f33, f70 FNMA f78 = f79, f33, f78 FNMA f86 = f87, f33, f86 FNMA f94 = f95, f33, f94 ;; FNMA f69 = f71, f34, f69 FNMA f77 = f79, f34, f77 FNMA f85 = f87, f34, f85 FNMA f93 = f95, f34, f93 ;; FNMA f68 = f71, f35, f68 FNMA f76 = f79, f35, f76 FNMA f84 = f87, f35, f84 FNMA f92 = f95, f35, f92 ;; FNMA f67 = f71, f36, f67 FNMA f75 = f79, f36, f75 FNMA f83 = f87, f36, f83 FNMA f91 = f95, f36, f91 ;; FNMA f66 = f71, f37, f66 FNMA f74 = f79, f37, f74 FNMA f82 = f87, f37, f82 FNMA f90 = f95, f37, f90 ;; FNMA f65 = f71, f38, f65 FNMA f73 = f79, f38, f73 FNMA f81 = f87, f38, f81 FNMA f89 = f95, f38, f89 ;; FNMA f64 = f71, f39, f64 FNMA f72 = f79, f39, f72 FNMA f80 = f87, f39, f80 FNMA f88 = f95, f39, f88 ;; FMPY f70 = f70, f40 FMPY f78 = f78, f40 FMPY f86 = f86, f40 FMPY f94 = f94, f40 ;; FNMA f69 = f70, f41, f69 FNMA f77 = f78, f41, f77 FNMA f85 = f86, f41, f85 FNMA f93 = f94, f41, f93 ;; FNMA f68 = f70, f42, f68 FNMA f76 = f78, f42, f76 FNMA f84 = f86, f42, f84 FNMA f92 = f94, f42, f92 ;; FNMA f67 = f70, f43, f67 FNMA f75 = f78, f43, f75 FNMA f83 = f86, f43, f83 FNMA f91 = f94, f43, f91 ;; FNMA f66 = f70, f44, f66 FNMA f74 = f78, f44, f74 FNMA f82 = f86, f44, f82 FNMA f90 = f94, f44, f90 ;; FNMA f65 = f70, f45, f65 FNMA f73 = f78, f45, f73 FNMA f81 = f86, f45, f81 FNMA f89 = f94, f45, f89 ;; FNMA f64 = f70, f46, f64 FNMA f72 = f78, f46, f72 FNMA f80 = f86, f46, f80 FNMA f88 = f94, f46, f88 ;; FMPY f69 = f69, f47 FMPY f77 = f77, f47 FMPY f85 = f85, f47 FMPY f93 = f93, f47 ;; FNMA f68 = f69, f48, f68 FNMA f76 = f77, f48, f76 FNMA f84 = f85, f48, f84 FNMA f92 = f93, f48, f92 ;; FNMA f67 = f69, f49, f67 FNMA f75 = f77, f49, f75 FNMA f83 = f85, f49, f83 FNMA f91 = f93, f49, f91 ;; FNMA f66 = f69, f50, f66 FNMA f74 = f77, f50, f74 FNMA f82 = f85, f50, f82 FNMA f90 = f93, f50, f90 ;; FNMA f65 = f69, f51, f65 FNMA f73 = f77, f51, f73 FNMA f81 = f85, f51, f81 FNMA f89 = f93, f51, f89 ;; FNMA f64 = f69, f52, f64 FNMA f72 = f77, f52, f72 FNMA f80 = f85, f52, f80 FNMA f88 = f93, f52, f88 ;; FMPY f68 = f68, f53 FMPY f76 = f76, f53 FMPY f84 = f84, f53 FMPY f92 = f92, f53 ;; FNMA f67 = f68, f54, f67 FNMA f75 = f76, f54, f75 FNMA f83 = f84, f54, f83 FNMA f91 = f92, f54, f91 ;; FNMA f66 = f68, f55, f66 FNMA f74 = f76, f55, f74 FNMA f82 = f84, f55, f82 FNMA f90 = f92, f55, f90 ;; FNMA f65 = f68, f56, f65 FNMA f73 = f76, f56, 
f73 FNMA f81 = f84, f56, f81 FNMA f89 = f92, f56, f89 ;; FNMA f64 = f68, f57, f64 FNMA f72 = f76, f57, f72 FNMA f80 = f84, f57, f80 FNMA f88 = f92, f57, f88 ;; FMPY f67 = f67, f58 FMPY f75 = f75, f58 FMPY f83 = f83, f58 FMPY f91 = f91, f58 ;; FNMA f66 = f67, f59, f66 FNMA f74 = f75, f59, f74 FNMA f82 = f83, f59, f82 FNMA f90 = f91, f59, f90 ;; FNMA f65 = f67, f60, f65 FNMA f73 = f75, f60, f73 FNMA f81 = f83, f60, f81 FNMA f89 = f91, f60, f89 ;; FNMA f64 = f67, f61, f64 FNMA f72 = f75, f61, f72 FNMA f80 = f83, f61, f80 FNMA f88 = f91, f61, f88 ;; FMPY f66 = f66, f16 FMPY f74 = f74, f16 FMPY f82 = f82, f16 FMPY f90 = f90, f16 ;; FNMA f65 = f66, f17, f65 FNMA f73 = f74, f17, f73 FNMA f81 = f82, f17, f81 FNMA f89 = f90, f17, f89 ;; FNMA f64 = f66, f18, f64 FNMA f72 = f74, f18, f72 FNMA f80 = f82, f18, f80 FNMA f88 = f90, f18, f88 ;; FMPY f65 = f65, f19 FMPY f73 = f73, f19 FMPY f81 = f81, f19 FMPY f89 = f89, f19 ;; FNMA f64 = f65, f20, f64 FNMA f72 = f73, f20, f72 FNMA f80 = f81, f20, f80 FNMA f88 = f89, f20, f88 ;; FMPY f64 = f64, f21 FMPY f72 = f72, f21 FMPY f80 = f80, f21 FMPY f88 = f88, f21 ;; adds BOFFSET = 24 * SIZE, BOFFSET adds BOFFSET2 = 24 * SIZE, BOFFSET2 ;; STFD [BOFFSET] = f70, SIZE STFD [BOFFSET2] = f71, SIZE ;; STFD [BOFFSET] = f78, SIZE STFD [BOFFSET2] = f79, SIZE ;; STFD [BOFFSET] = f86, SIZE STFD [BOFFSET2] = f87, SIZE ;; STFD [BOFFSET] = f94, - 11 * SIZE STFD [BOFFSET2] = f95, - 11 * SIZE ;; STFD [BOFFSET] = f68, SIZE STFD [BOFFSET2] = f69, SIZE ;; STFD [BOFFSET] = f76, SIZE STFD [BOFFSET2] = f77, SIZE ;; STFD [BOFFSET] = f84, SIZE STFD [BOFFSET2] = f85, SIZE ;; STFD [BOFFSET] = f92, - 11 * SIZE STFD [BOFFSET2] = f93, - 11 * SIZE ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f74, SIZE STFD [BOFFSET2] = f75, SIZE ;; STFD [BOFFSET] = f82, SIZE STFD [BOFFSET2] = f83, SIZE ;; STFD [BOFFSET] = f90, - 11 * SIZE STFD [BOFFSET2] = f91, - 11 * SIZE ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f65, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f73, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f81, SIZE ;; STFD [BOFFSET] = f88, - 3 * SIZE STFD [BOFFSET2] = f89, - 3 * SIZE ;; adds C1 = -8 * SIZE, C1 adds C2 = -8 * SIZE, C2 adds C3 = -8 * SIZE, C3 adds C4 = -8 * SIZE, C4 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f40 = [AOFFSET], 1 * SIZE ;; LDFPD f41, f42 = [AOFFSET], 2 * SIZE ;; LDFPD f43, f44 = [AOFFSET], 2 * SIZE ;; LDFPD f45, f46 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f47, f48 = [AOFFSET], 2 * SIZE ;; LDFPD f49, f50 = [AOFFSET], 2 * SIZE ;; LDFPD f51, f52 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f53 = [AOFFSET], 1 * SIZE ;; LDFPD f54, f55 = [AOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [AOFFSET] adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f58, f59 = [AOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [AOFFSET] adds AOFFSET = 7 * SIZE, AOFFSET ;; LDFD f16 = [AOFFSET], 1 * SIZE ;; LDFPD f17, f18 = [AOFFSET] adds AOFFSET = 8 * SIZE, AOFFSET ;; LDFPD f19, f20 = [AOFFSET] adds AOFFSET = 9 * SIZE, AOFFSET ;; LDFD f21 = [AOFFSET] adds AOFFSET = -63 * SIZE, AOFFSET ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 FMPY f80 = f80, f32 FMPY f88 = f88, f32 ;; FNMA f65 = f64, f33, f65 FNMA f73 = f72, f33, f73 FNMA f81 = f80, f33, f81 FNMA f89 = f88, f33, f89 ;; FNMA f66 = f64, f34, f66 FNMA f74 = f72, f34, f74 FNMA f82 = f80, f34, f82 FNMA f90 = f88, f34, f90 ;; FNMA f67 = f64, f35, 
f67 FNMA f75 = f72, f35, f75 FNMA f83 = f80, f35, f83 FNMA f91 = f88, f35, f91 ;; FNMA f68 = f64, f36, f68 FNMA f76 = f72, f36, f76 FNMA f84 = f80, f36, f84 FNMA f92 = f88, f36, f92 ;; FNMA f69 = f64, f37, f69 FNMA f77 = f72, f37, f77 FNMA f85 = f80, f37, f85 FNMA f93 = f88, f37, f93 ;; FNMA f70 = f64, f38, f70 FNMA f78 = f72, f38, f78 FNMA f86 = f80, f38, f86 FNMA f94 = f88, f38, f94 ;; FNMA f71 = f64, f39, f71 FNMA f79 = f72, f39, f79 FNMA f87 = f80, f39, f87 FNMA f95 = f88, f39, f95 ;; FMPY f65 = f65, f40 FMPY f73 = f73, f40 FMPY f81 = f81, f40 FMPY f89 = f89, f40 ;; FNMA f66 = f65, f41, f66 FNMA f74 = f73, f41, f74 FNMA f82 = f81, f41, f82 FNMA f90 = f89, f41, f90 ;; FNMA f67 = f65, f42, f67 FNMA f75 = f73, f42, f75 FNMA f83 = f81, f42, f83 FNMA f91 = f89, f42, f91 ;; FNMA f68 = f65, f43, f68 FNMA f76 = f73, f43, f76 FNMA f84 = f81, f43, f84 FNMA f92 = f89, f43, f92 ;; FNMA f69 = f65, f44, f69 FNMA f77 = f73, f44, f77 FNMA f85 = f81, f44, f85 FNMA f93 = f89, f44, f93 ;; FNMA f70 = f65, f45, f70 FNMA f78 = f73, f45, f78 FNMA f86 = f81, f45, f86 FNMA f94 = f89, f45, f94 ;; FNMA f71 = f65, f46, f71 FNMA f79 = f73, f46, f79 FNMA f87 = f81, f46, f87 FNMA f95 = f89, f46, f95 ;; FMPY f66 = f66, f47 FMPY f74 = f74, f47 FMPY f82 = f82, f47 FMPY f90 = f90, f47 ;; FNMA f67 = f66, f48, f67 FNMA f75 = f74, f48, f75 FNMA f83 = f82, f48, f83 FNMA f91 = f90, f48, f91 ;; FNMA f68 = f66, f49, f68 FNMA f76 = f74, f49, f76 FNMA f84 = f82, f49, f84 FNMA f92 = f90, f49, f92 ;; FNMA f69 = f66, f50, f69 FNMA f77 = f74, f50, f77 FNMA f85 = f82, f50, f85 FNMA f93 = f90, f50, f93 ;; FNMA f70 = f66, f51, f70 FNMA f78 = f74, f51, f78 FNMA f86 = f82, f51, f86 FNMA f94 = f90, f51, f94 ;; FNMA f71 = f66, f52, f71 FNMA f79 = f74, f52, f79 FNMA f87 = f82, f52, f87 FNMA f95 = f90, f52, f95 ;; FMPY f67 = f67, f53 FMPY f75 = f75, f53 FMPY f83 = f83, f53 FMPY f91 = f91, f53 ;; FNMA f68 = f67, f54, f68 FNMA f76 = f75, f54, f76 FNMA f84 = f83, f54, f84 FNMA f92 = f91, f54, f92 ;; FNMA f69 = f67, f55, f69 FNMA f77 = f75, f55, f77 FNMA f85 = f83, f55, f85 FNMA f93 = f91, f55, f93 ;; FNMA f70 = f67, f56, f70 FNMA f78 = f75, f56, f78 FNMA f86 = f83, f56, f86 FNMA f94 = f91, f56, f94 ;; FNMA f71 = f67, f57, f71 FNMA f79 = f75, f57, f79 FNMA f87 = f83, f57, f87 FNMA f95 = f91, f57, f95 ;; FMPY f68 = f68, f58 FMPY f76 = f76, f58 FMPY f84 = f84, f58 FMPY f92 = f92, f58 ;; FNMA f69 = f68, f59, f69 FNMA f77 = f76, f59, f77 FNMA f85 = f84, f59, f85 FNMA f93 = f92, f59, f93 ;; FNMA f70 = f68, f60, f70 FNMA f78 = f76, f60, f78 FNMA f86 = f84, f60, f86 FNMA f94 = f92, f60, f94 ;; FNMA f71 = f68, f61, f71 FNMA f79 = f76, f61, f79 FNMA f87 = f84, f61, f87 FNMA f95 = f92, f61, f95 ;; FMPY f69 = f69, f16 FMPY f77 = f77, f16 FMPY f85 = f85, f16 FMPY f93 = f93, f16 ;; FNMA f70 = f69, f17, f70 FNMA f78 = f77, f17, f78 FNMA f86 = f85, f17, f86 FNMA f94 = f93, f17, f94 ;; FNMA f71 = f69, f18, f71 FNMA f79 = f77, f18, f79 FNMA f87 = f85, f18, f87 FNMA f95 = f93, f18, f95 ;; FMPY f70 = f70, f19 FMPY f78 = f78, f19 FMPY f86 = f86, f19 FMPY f94 = f94, f19 ;; FNMA f71 = f70, f20, f71 FNMA f79 = f78, f20, f79 FNMA f87 = f86, f20, f87 FNMA f95 = f94, f20, f95 ;; FMPY f71 = f71, f21 FMPY f79 = f79, f21 FMPY f87 = f87, f21 FMPY f95 = f95, f21 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f65, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f73, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f81, SIZE ;; STFD [BOFFSET] = f88, 5 * SIZE STFD [BOFFSET2] = f89, 5 * SIZE ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = 
f74, SIZE STFD [BOFFSET2] = f75, SIZE ;; STFD [BOFFSET] = f82, SIZE STFD [BOFFSET2] = f83, SIZE ;; STFD [BOFFSET] = f90, 5 * SIZE STFD [BOFFSET2] = f91, 5 * SIZE ;; STFD [BOFFSET] = f68, SIZE STFD [BOFFSET2] = f69, SIZE ;; STFD [BOFFSET] = f76, SIZE STFD [BOFFSET2] = f77, SIZE ;; STFD [BOFFSET] = f84, SIZE STFD [BOFFSET2] = f85, SIZE ;; STFD [BOFFSET] = f92, 5 * SIZE STFD [BOFFSET2] = f93, 5 * SIZE ;; STFD [BOFFSET] = f70, SIZE STFD [BOFFSET2] = f71, SIZE ;; STFD [BOFFSET] = f78, SIZE STFD [BOFFSET2] = f79, SIZE ;; STFD [BOFFSET] = f86, SIZE STFD [BOFFSET2] = f87, SIZE ;; STFD [BOFFSET] = f94 STFD [BOFFSET2] = f95 adds C9 = 4 * SIZE, C1 adds BOFFSET = - 27 * SIZE, BOFFSET adds BOFFSET2 = - 27 * SIZE, BOFFSET2 ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f36 = [BOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f39, f40 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f41 = [BOFFSET], -15 * SIZE ;; FMPY f64 = f64, f32 FMPY f68 = f68, f32 FMPY f65 = f65, f32 FMPY f69 = f69, f32 FMPY f66 = f66, f32 FMPY f70 = f70, f32 FMPY f67 = f67, f32 FMPY f71 = f71, f32 ;; FNMA f72 = f64, f33, f72 FNMA f76 = f68, f33, f76 FNMA f73 = f65, f33, f73 FNMA f77 = f69, f33, f77 FNMA f74 = f66, f33, f74 FNMA f78 = f70, f33, f78 FNMA f75 = f67, f33, f75 FNMA f79 = f71, f33, f79 ;; FNMA f80 = f64, f34, f80 FNMA f84 = f68, f34, f84 FNMA f81 = f65, f34, f81 FNMA f85 = f69, f34, f85 FNMA f82 = f66, f34, f82 FNMA f86 = f70, f34, f86 FNMA f83 = f67, f34, f83 FNMA f87 = f71, f34, f87 ;; FNMA f88 = f64, f35, f88 FNMA f92 = f68, f35, f92 FNMA f89 = f65, f35, f89 FNMA f93 = f69, f35, f93 FNMA f90 = f66, f35, f90 FNMA f94 = f70, f35, f94 FNMA f91 = f67, f35, f91 FNMA f95 = f71, f35, f95 ;; FMPY f72 = f72, f36 FMPY f76 = f76, f36 FMPY f73 = f73, f36 FMPY f77 = f77, f36 FMPY f74 = f74, f36 FMPY f78 = f78, f36 FMPY f75 = f75, f36 FMPY f79 = f79, f36 ;; FNMA f80 = f72, f37, f80 FNMA f84 = f76, f37, f84 FNMA f81 = f73, f37, f81 FNMA f85 = f77, f37, f85 FNMA f82 = f74, f37, f82 FNMA f86 = f78, f37, f86 FNMA f83 = f75, f37, f83 FNMA f87 = f79, f37, f87 ;; FNMA f88 = f72, f38, f88 FNMA f92 = f76, f38, f92 FNMA f89 = f73, f38, f89 FNMA f93 = f77, f38, f93 FNMA f90 = f74, f38, f90 FNMA f94 = f78, f38, f94 FNMA f91 = f75, f38, f91 FNMA f95 = f79, f38, f95 ;; FMPY f80 = f80, f39 FMPY f84 = f84, f39 FMPY f81 = f81, f39 FMPY f85 = f85, f39 FMPY f82 = f82, f39 FMPY f86 = f86, f39 FMPY f83 = f83, f39 FMPY f87 = f87, f39 ;; FNMA f88 = f80, f40, f88 FNMA f92 = f84, f40, f92 FNMA f89 = f81, f40, f89 FNMA f93 = f85, f40, f93 FNMA f90 = f82, f40, f90 FNMA f94 = f86, f40, f94 FNMA f91 = f83, f40, f91 FNMA f95 = f87, f40, f95 ;; FMPY f88 = f88, f41 FMPY f92 = f92, f41 FMPY f89 = f89, f41 FMPY f93 = f93, f41 FMPY f90 = f90, f41 FMPY f94 = f94, f41 FMPY f91 = f91, f41 FMPY f95 = f95, f41 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f68, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f69, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f70, SIZE ;; STFD [AOFFSET] = f67, 5 * SIZE STFD [AOFFSET2] = f71, 5 * SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f76, SIZE ;; STFD [AOFFSET] = f73, SIZE STFD [AOFFSET2] = f77, SIZE ;; STFD [AOFFSET] = f74, SIZE STFD [AOFFSET2] = f78, SIZE ;; STFD [AOFFSET] = f75, 5 * SIZE STFD [AOFFSET2] = f79, 5 * SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f84, SIZE ;; STFD [AOFFSET] = f81, SIZE STFD [AOFFSET2] = f85, SIZE ;; STFD [AOFFSET] = f82, SIZE STFD [AOFFSET2] = f86, 
SIZE ;; STFD [AOFFSET] = f83, 5 * SIZE STFD [AOFFSET2] = f87, 5 * SIZE ;; STFD [AOFFSET] = f88, SIZE STFD [AOFFSET2] = f92, SIZE ;; STFD [AOFFSET] = f89, SIZE STFD [AOFFSET2] = f93, SIZE ;; STFD [AOFFSET] = f90, SIZE STFD [AOFFSET2] = f94, SIZE ;; STFD [AOFFSET] = f91, -27 * SIZE STFD [AOFFSET2] = f95, -27 * SIZE ;; #endif #ifdef RT adds BOFFSET = 14 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f35, f34 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f36 = [BOFFSET], -2 * SIZE ;; LDFPD f38, f37 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f40, f39 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFD f41 = [BOFFSET] ;; FMPY f88 = f88, f32 FMPY f92 = f92, f32 FMPY f89 = f89, f32 FMPY f93 = f93, f32 FMPY f90 = f90, f32 FMPY f94 = f94, f32 FMPY f91 = f91, f32 FMPY f95 = f95, f32 ;; FNMA f80 = f88, f33, f80 FNMA f84 = f92, f33, f84 FNMA f81 = f89, f33, f81 FNMA f85 = f93, f33, f85 FNMA f82 = f90, f33, f82 FNMA f86 = f94, f33, f86 FNMA f83 = f91, f33, f83 FNMA f87 = f95, f33, f87 ;; FNMA f72 = f88, f34, f72 FNMA f76 = f92, f34, f76 FNMA f73 = f89, f34, f73 FNMA f77 = f93, f34, f77 FNMA f74 = f90, f34, f74 FNMA f78 = f94, f34, f78 FNMA f75 = f91, f34, f75 FNMA f79 = f95, f34, f79 ;; FNMA f64 = f88, f35, f64 FNMA f68 = f92, f35, f68 FNMA f65 = f89, f35, f65 FNMA f69 = f93, f35, f69 FNMA f66 = f90, f35, f66 FNMA f70 = f94, f35, f70 FNMA f67 = f91, f35, f67 FNMA f71 = f95, f35, f71 ;; FMPY f80 = f80, f36 FMPY f84 = f84, f36 FMPY f81 = f81, f36 FMPY f85 = f85, f36 FMPY f82 = f82, f36 FMPY f86 = f86, f36 FMPY f83 = f83, f36 FMPY f87 = f87, f36 ;; FNMA f72 = f80, f37, f72 FNMA f76 = f84, f37, f76 FNMA f73 = f81, f37, f73 FNMA f77 = f85, f37, f77 FNMA f74 = f82, f37, f74 FNMA f78 = f86, f37, f78 FNMA f75 = f83, f37, f75 FNMA f79 = f87, f37, f79 ;; FNMA f64 = f80, f38, f64 FNMA f68 = f84, f38, f68 FNMA f65 = f81, f38, f65 FNMA f69 = f85, f38, f69 FNMA f66 = f82, f38, f66 FNMA f70 = f86, f38, f70 FNMA f67 = f83, f38, f67 FNMA f71 = f87, f38, f71 ;; FMPY f72 = f72, f39 FMPY f76 = f76, f39 FMPY f73 = f73, f39 FMPY f77 = f77, f39 FMPY f74 = f74, f39 FMPY f78 = f78, f39 FMPY f75 = f75, f39 FMPY f79 = f79, f39 ;; FNMA f64 = f72, f40, f64 FNMA f68 = f76, f40, f68 FNMA f65 = f73, f40, f65 FNMA f69 = f77, f40, f69 FNMA f66 = f74, f40, f66 FNMA f70 = f78, f40, f70 FNMA f67 = f75, f40, f67 FNMA f71 = f79, f40, f71 ;; FMPY f64 = f64, f41 FMPY f68 = f68, f41 FMPY f65 = f65, f41 FMPY f69 = f69, f41 FMPY f66 = f66, f41 FMPY f70 = f70, f41 FMPY f67 = f67, f41 FMPY f71 = f71, f41 ;; adds AOFFSET = 24 * SIZE, AOFFSET adds AOFFSET2 = 24 * SIZE, AOFFSET2 ;; STFD [AOFFSET] = f88, SIZE STFD [AOFFSET2] = f92, SIZE ;; STFD [AOFFSET] = f89, SIZE STFD [AOFFSET2] = f93, SIZE ;; STFD [AOFFSET] = f90, SIZE STFD [AOFFSET2] = f94, SIZE ;; STFD [AOFFSET] = f91, - 11 * SIZE STFD [AOFFSET2] = f95, - 11 * SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f84, SIZE ;; STFD [AOFFSET] = f81, SIZE STFD [AOFFSET2] = f85, SIZE ;; STFD [AOFFSET] = f82, SIZE STFD [AOFFSET2] = f86, SIZE ;; STFD [AOFFSET] = f83, - 11 * SIZE STFD [AOFFSET2] = f87, - 11 * SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f76, SIZE ;; STFD [AOFFSET] = f73, SIZE STFD [AOFFSET2] = f77, SIZE ;; STFD [AOFFSET] = f74, SIZE STFD [AOFFSET2] = f78, SIZE ;; STFD [AOFFSET] = f75, - 11 * SIZE STFD [AOFFSET2] = f79, - 11 * SIZE ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f68, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f69, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f70, SIZE ;; STFD 
[AOFFSET] = f67, - 3 * SIZE STFD [AOFFSET2] = f71, - 3 * SIZE ;; #endif adds C9 = 4 * SIZE, C1 ;; { .mmf STFD [C1 ] = f64, SIZE STFD [C9 ] = f68, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE STFD [C9 ] = f69, SIZE adds C10 = 4 * SIZE, C2 } ;; { .mmi STFD [C1 ] = f66, SIZE STFD [C9 ] = f70, SIZE } ;; { .mmi #ifndef LN STFD [C1 ] = f67, 5 * SIZE #else STFD [C1 ] = f67, - 3 * SIZE #endif STFD [C9 ] = f71 adds C11 = 4 * SIZE, C3 } ;; { .mmf STFD [C2 ] = f72, SIZE STFD [C10] = f76, SIZE mov f72 = f0 } ;; { .mmi STFD [C2 ] = f73, SIZE STFD [C10] = f77, SIZE } ;; { .mmi STFD [C2 ] = f74, SIZE STFD [C10] = f78, SIZE adds C12 = 4 * SIZE, C4 } ;; { .mmi #ifndef LN STFD [C2 ] = f75, 5 * SIZE #else STFD [C2 ] = f75, - 3 * SIZE #endif STFD [C10] = f79 } ;; { .mmf STFD [C3 ] = f80, SIZE STFD [C11] = f84, SIZE } ;; { .mmi STFD [C3 ] = f81, SIZE STFD [C11] = f85, SIZE } ;; { .mmi STFD [C3 ] = f82, SIZE STFD [C11] = f86, SIZE } ;; { .mmi #ifndef LN STFD [C3 ] = f83, 5 * SIZE #else STFD [C3 ] = f83, - 3 * SIZE #endif STFD [C11] = f87 } ;; { .mmf STFD [C4 ] = f88, SIZE STFD [C12] = f92, SIZE } ;; { .mmi STFD [C4 ] = f89, SIZE STFD [C12] = f93, SIZE } ;; { .mmi STFD [C4 ] = f90, SIZE STFD [C12] = f94, SIZE } ;; { .mmi #ifndef LN STFD [C4 ] = f91, 5 * SIZE #else STFD [C4 ] = f91, - 3 * SIZE #endif STFD [C12] = f95 cmp.ne p6, p0 = 1, I } ;; adds I = -1, I ;; { .mmi shladd r2 = K, BASE_SHIFT, r0 } ;; { .mmi sub L = K, KK } ;; { .mmi #ifdef RT shladd AORIG = r2, 3, AORIG #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; ;; { .mmi #if defined(LT) || defined(RN) shladd AOFFSET = L, 3, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd BOFFSET = L, 2, BOFFSET #else nop __LINE__ #endif } ;; { .mmi #ifdef LT adds KK = 8, KK #elif defined LN adds KK = -8, KK #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; mov f64 = f0 mov f72 = f0 mov f80 = f0 mov f88 = f0 mov f65 = f0 mov f73 = f0 mov f81 = f0 mov f89 = f0 { .mmb (p6) br.cond.dptk .L052 } ;; .align 8 .L089: #ifdef LN shladd KK8 = K, BASE_SHIFT, r0 ;; shladd B = KK8, 2, B #endif #if defined(LT) || defined(RN) mov B = BOFFSET #endif #ifdef RN adds KK = 4, KK #endif #ifdef RT adds KK = -4, KK #endif ;; mov AOFFSET = A ;; .align 16 .L090: tbit.z p6, p0 = N, 1 (p6) br.cond.dpnt .L130 ;; #ifdef RT { .mmi shladd r3 = LDC, 1, r0 nop __LINE__ shl r2 = K, 1 + BASE_SHIFT } ;; { .mmi sub B = B, r2 sub C = C, r3 nop __LINE__ } #endif ;; mov f64 = f0 mov f65 = f0 mov f66 = f0 mov f67 = f0 mov f72 = f0 mov f73 = f0 mov f74 = f0 mov f75 = f0 ;; { .mfi mov C1 = C // coffset1 = c + 0 * ldc #ifdef LN add KK = M, OFFSET #elif defined LT mov KK = OFFSET #else nop __LINE__ #endif } ;; { .mmf #if defined(LN) || defined(RT) mov AORIG = A #else mov AOFFSET = A #endif } { .mmf add C2 = LDC, C // coffset2 = c + 1 * ldc } ;; { .mfi #ifndef RT shladd C = LDC, 1, C // coffset += 8 * ldc #else nop __LINE__ #endif mov f81 = f0 #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L110 ;; { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 0 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE } ;; #else { .mfi shladd BOFFSET = r3, 1, B #ifdef LN sub AORIG 
= AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE add AOFFSET = r3, AORIG } ;; #endif { .mmi adds L = 1, L adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mii tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi adds L = -1, L } ;; { .mmi cmp.eq p6, p0 = -1, L } ;; { .mib (p7) LDFD f32 = [AOFFSET], 1 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L128 } ;; .align 8 .L122: { .mfi FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mmi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE (p3) LDFD f40 = [AOFFSET], 1 * SIZE nop __LINE__ } { .mmi nop __LINE__ nop __LINE__ nop __LINE__ } ;; { .mfi (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 adds L = -1, L } { .mfb (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 br.cloop.sptk.few .L122 } ;; .L128: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -1, KK #else adds r2 = -2, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 1, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET] ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 ;; #else LDFPD f32, f33 = [AOFFSET] ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 ;; #endif #ifdef LN LDFD f32 = [AOFFSET] ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 ;; { .mmi STFD [BOFFSET] = f64, SIZE adds C1 = -1 * SIZE, C1 } ;; { .mmi STFD [BOFFSET] = f72, -SIZE adds C2 = -1 * SIZE, C2 } ;; #endif #ifdef LT LDFD f32 = [AOFFSET] ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 ;; STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f72, -SIZE ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET], -3 * SIZE ;; FMPY f64 = f64, f32 ;; FNMA f72 = f64, f33, f72 ;; FMPY f72 = f72, f34 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f72, -SIZE ;; #endif #ifdef RT adds BOFFSET = 2 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET] ;; FMPY f72 = f72, f32 ;; FNMA f64 = f72, f33, f64 ;; FMPY f64 = f64, f34 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f72, -SIZE ;; #endif #ifndef LN STFD [C1 ] = f64, SIZE #else STFD [C1 ] = f64 #endif #ifndef LN STFD [C2 ] = f72, SIZE #else STFD [C2 ] = f72 #endif mov f64 = f0 mov f72 = f0 ;; shladd r2 = K, BASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT add AORIG = r2, AORIG #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) add AOFFSET = L, AOFFSET #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) shladd BOFFSET = L, 1, BOFFSET #else nop __LINE__ #endif ;; #ifdef LT adds KK = 1, KK #elif defined LN adds KK = -1, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; .align 8 .L110: tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L100 ;; { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 1 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE } ;; #else { .mfi shladd BOFFSET = r3, 1, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE shladd AOFFSET = r3, 1, AORIG } ;; 
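// Annotation added for readability; not part of the original kernel source.
// In the LN/RT variants the packed-block pointers are rebuilt here before each tile:
// BOFFSET is recomputed from B and the KK offset (r3), AOFFSET from AORIG (with AORIG
// stepped back by a full row of A under LN). The LT/RN variants above only issue the
// predicated first load of B and let both pointers carry over from the previous tile.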
#endif { .mfi adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mfi tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mmf adds L = -1, L } ;; { .mmf cmp.eq p6, p0 = -1, L } ;; { .mib (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L118 } ;; .L112: { .mfi lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mmf (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } { .mmf nop __LINE__ nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 br.cloop.sptk.few .L112 } ;; .align 8 .L118: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -2, KK #else adds r2 = -2, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 1, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET] adds BOFFSET = -2 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f65 = f34, f65 FSUB f73 = f35, f73 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = -2 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f72 = f34, f72 FSUB f73 = f35, f73 ;; #endif #ifdef LN adds AOFFSET = 2 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f34 = [AOFFSET] ;; FMPY f65 = f65, f32 FMPY f73 = f73, f32 ;; FNMA f64 = f65, f33, f64 FNMA f72 = f73, f33, f72 ;; FMPY f64 = f64, f34 FMPY f72 = f72, f34 ;; STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f72, SIZE ;; STFD [BOFFSET] = f65, SIZE ;; STFD [BOFFSET] = f73, - 3 * SIZE ;; adds C1 = -2 * SIZE, C1 adds C2 = -2 * SIZE, C2 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f34 = [AOFFSET], - 3 * SIZE ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 ;; FNMA f65 = f64, f33, f65 FNMA f73 = f72, f33, f73 ;; FMPY f65 = f65, f34 FMPY f73 = f73, f34 ;; STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f72, SIZE ;; STFD [BOFFSET] = f65, SIZE ;; STFD [BOFFSET] = f73, -3 * SIZE ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET], -3 * SIZE ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 ;; FNMA f72 = f64, f33, f72 FNMA f73 = f65, f33, f73 ;; FMPY f72 = f72, f34 FMPY f73 = f73, f34 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, SIZE ;; STFD [AOFFSET] = f72, SIZE ;; STFD [AOFFSET] = f73, -3 * SIZE ;; #endif #ifdef RT adds BOFFSET = 2 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET] ;; FMPY f72 = f72, f32 FMPY f73 = f73, f32 ;; FNMA f64 = f72, f33, f64 FNMA f65 = f73, f33, f65 ;; FMPY f64 = f64, f34 FMPY f65 = f65, f34 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, SIZE ;; STFD [AOFFSET] = f72, SIZE ;; STFD [AOFFSET] = f73, -3 * SIZE ;; #endif STFD [C1 ] = f64, SIZE mov f64 = f0 ;; #ifndef LN STFD [C1 ] = f65, SIZE #else STFD [C1 ] = f65, -SIZE #endif ;; STFD [C2 ] = f72, 
SIZE mov f72 = f0 ;; #ifndef LN STFD [C2 ] = f73, SIZE #else STFD [C2 ] = f73, -SIZE #endif ;; mov f65 = f0 mov f73 = f0 ;; shladd r2 = K, BASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 1, AORIG #else nop __LINE__ #endif ;; { .mmi #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd AOFFSET = L, 1, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd BOFFSET = L, 1, BOFFSET #else nop __LINE__ #endif } ;; { .mmi #ifdef LT adds KK = 2, KK #elif defined LN adds KK = -2, KK #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; .align 8 .L100: tbit.z p6, p7 = M, 2 (p6) br.cond.dptk .L091 ;; { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 2 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f65 = f0 } ;; #else { .mfi shladd BOFFSET = r3, 1, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE shladd AOFFSET = r3, 2, AORIG } ;; #endif { .mfi adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mfi tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mfi adds L = -1, L } ;; { .mfi cmp.eq p6, p0 = -1, L } ;; { .mmf (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE } { .mfi mov ar.lc = L } ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE } { .mfb (p6) br.cond.dpnt .L108 } ;; .L102: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 adds C9 = 2 * SIZE, C1 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 adds C10 = 2 * SIZE, C2 } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 br.cloop.sptk.few .L102 } ;; .align 8 .L108: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -4, KK #else adds r2 = -2, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 1, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = 
[BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 ;; FSUB f65 = f34, f65 FSUB f73 = f35, f73 ;; FSUB f66 = f36, f66 FSUB f74 = f37, f74 ;; FSUB f67 = f38, f67 FSUB f75 = f39, f75 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f72 = f36, f72 FSUB f73 = f37, f73 FSUB f74 = f38, f74 FSUB f75 = f39, f75 ;; #endif #ifdef LN adds AOFFSET = 14 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f35, f34 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f36 = [AOFFSET], - 2 * SIZE ;; LDFPD f38, f37 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f40, f39 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET] ;; FMPY f67 = f67, f32 FMPY f75 = f75, f32 ;; FNMA f66 = f67, f33, f66 FNMA f74 = f75, f33, f74 ;; FNMA f65 = f67, f34, f65 FNMA f73 = f75, f34, f73 ;; FNMA f64 = f67, f35, f64 FNMA f72 = f75, f35, f72 ;; FMPY f66 = f66, f36 FMPY f74 = f74, f36 ;; FNMA f65 = f66, f37, f65 FNMA f73 = f74, f37, f73 ;; FNMA f64 = f66, f38, f64 FNMA f72 = f74, f38, f72 ;; FMPY f65 = f65, f39 FMPY f73 = f73, f39 ;; FNMA f64 = f65, f40, f64 FNMA f72 = f73, f40, f72 ;; FMPY f64 = f64, f41 FMPY f72 = f72, f41 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f66, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f74, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f73, -3 * SIZE STFD [BOFFSET2] = f75, -3 * SIZE ;; adds C1 = -4 * SIZE, C1 adds C2 = -4 * SIZE, C2 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f36 = [AOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f39, f40 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET], -15 * SIZE ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 ;; FNMA f65 = f64, f33, f65 FNMA f73 = f72, f33, f73 ;; FNMA f66 = f64, f34, f66 FNMA f74 = f72, f34, f74 ;; FNMA f67 = f64, f35, f67 FNMA f75 = f72, f35, f75 ;; FMPY f65 = f65, f36 FMPY f73 = f73, f36 ;; FNMA f66 = f65, f37, f66 FNMA f74 = f73, f37, f74 ;; FNMA f67 = f65, f38, f67 FNMA f75 = f73, f38, f75 ;; FMPY f66 = f66, f39 FMPY f74 = f74, f39 ;; FNMA f67 = f66, f40, f67 FNMA f75 = f74, f40, f75 ;; FMPY f67 = f67, f41 FMPY f75 = f75, f41 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f66, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f74, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f73, -3 * SIZE STFD [BOFFSET2] = f75, -3 * SIZE ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET], -3 * SIZE ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 FMPY f66 = f66, f32 FMPY f67 = f67, f32 ;; FNMA f72 = f64, f33, f72 FNMA f73 = f65, f33, f73 FNMA f74 = f66, f33, f74 FNMA f75 = f67, f33, f75 ;; FMPY f72 = f72, f34 FMPY f73 = f73, f34 FMPY f74 = f74, f34 FMPY f75 = f75, f34 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f72, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f73, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f74, SIZE ;; STFD [AOFFSET] = f67, -3 * SIZE STFD [AOFFSET2] = f75, -3 * SIZE ;; #endif #ifdef RT adds BOFFSET = 2 * SIZE, BOFFSET ;; LDFPD f33, f32 = 
[BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET] ;; FMPY f72 = f72, f32 FMPY f73 = f73, f32 FMPY f74 = f74, f32 FMPY f75 = f75, f32 ;; FNMA f64 = f72, f33, f64 FNMA f65 = f73, f33, f65 FNMA f66 = f74, f33, f66 FNMA f67 = f75, f33, f67 ;; FMPY f64 = f64, f34 FMPY f65 = f65, f34 FMPY f66 = f66, f34 FMPY f67 = f67, f34 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f72, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f73, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f74, SIZE ;; STFD [AOFFSET] = f67, - 3 * SIZE STFD [AOFFSET2] = f75, - 3 * SIZE ;; #endif { .mmf STFD [C1 ] = f64, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE } ;; { .mmi STFD [C1 ] = f66, SIZE } ;; { .mmi #ifndef LN STFD [C1 ] = f67, SIZE #else STFD [C1 ] = f67, - 3 * SIZE #endif } ;; { .mmf STFD [C2 ] = f72, SIZE mov f72 = f0 } ;; { .mmi STFD [C2 ] = f73, SIZE } ;; { .mmi STFD [C2 ] = f74, SIZE } ;; { .mmi #ifndef LN STFD [C2 ] = f75, SIZE #else STFD [C2 ] = f75, - 3 * SIZE #endif } ;; mov f65 = f0 mov f73 = f0 mov f66 = f0 mov f74 = f0 mov f67 = f0 mov f75 = f0 ;; shladd r2 = K, BASE_SHIFT, r0 ;; { .mmi sub L = K, KK } ;; { .mmi #ifdef RT shladd AORIG = r2, 2, AORIG #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd AOFFSET = L, 2, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd BOFFSET = L, 1, BOFFSET #else nop __LINE__ #endif } ;; { .mmi #ifdef LT adds KK = 4, KK #elif defined LN adds KK = -4, KK #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; .align 8 .L091: shr I = M, 3 ;; cmp.eq p6, p7 = 0, I (p6) br.cond.dpnt .L129 ;; .align 16 .L092: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 3 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE nop __LINE__ nop __LINE__ } ;; #else { .mfi shladd BOFFSET = r3, 1, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE shladd AOFFSET = r3, 3, AORIG } ;; #endif (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE } ;; { .mmf (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE } { .mfi cmp.eq p3, p0 = r0, r0 } ;; { .mmf (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE } { .mfi adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mmf CPREFETCH [PREC], LDC } { .mfi adds L = 1, L } ;; { .mmf CPREFETCH [PREC] } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } ;; { .mfi adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET } ;; { .mfi tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mfi adds L = -1, L } ;; { .mfi mov ar.lc = L } ;; mov f68 = f0 mov f69 = f0 mov f70 = f0 mov f71 = f0 mov f76 = f0 mov f77 = f0 mov f78 = f0 mov f79 = f0 ;; { .mfb cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L098 } ;; .align 8 .L093: /* 1 */ { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 adds C9 = 4 * SIZE, C1 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 adds C10 = 4 * SIZE, C2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 adds C11 = 4 * SIZE, C3 } { .mfi nop __LINE__ FMA f74 = f34, f49, f74 // 
A3 * B2 adds C12 = 4 * SIZE, C4 } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f76 = f36, f49, f76 // A5 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = f37, f49, f77 // A6 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f78 = f38, f49, f78 // A7 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f79 = f39, f49, f79 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f76 = f44, f57, f76 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f77 = f45, f57, f77 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f78 = f46, f57, f78 // A7 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f71 = f47, f56, f71 // A8 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f79 = f47, f57, f79 // A8 * B2 br.cloop.sptk.few .L093 } ;; .align 8 .L098: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -8, KK #else adds r2 = -2, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 3, AORIG shladd BOFFSET = r2, 1, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [BOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [BOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [BOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [BOFFSET] adds BOFFSET = -14 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f65 = f34, f65 FSUB f73 = f35, f73 FSUB f66 = f36, f66 FSUB f74 = f37, f74 FSUB f67 = f38, f67 FSUB f75 = f39, f75 FSUB f68 = f40, f68 FSUB f76 = f41, f76 FSUB f69 = f42, f69 FSUB f77 = f43, f77 FSUB f70 = f44, f70 FSUB f78 = f45, f78 FSUB f71 = f46, f71 FSUB f79 = f47, f79 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [AOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [AOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [AOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [AOFFSET] adds AOFFSET = 
-14 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f68 = f36, f68 FSUB f69 = f37, f69 FSUB f70 = f38, f70 FSUB f71 = f39, f71 ;; FSUB f72 = f40, f72 FSUB f73 = f41, f73 FSUB f74 = f42, f74 FSUB f75 = f43, f75 FSUB f76 = f44, f76 FSUB f77 = f45, f77 FSUB f78 = f46, f78 FSUB f79 = f47, f79 ;; #endif #ifdef LN adds AOFFSET = 62 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f35, f34 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f37, f36 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f39, f38 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f40 = [AOFFSET], -2 * SIZE ;; LDFPD f42, f41 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f44, f43 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f46, f45 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f48, f47 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f50, f49 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f52, f51 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFD f53 = [AOFFSET], -2 * SIZE ;; LDFPD f55, f54 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f57, f56 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; LDFPD f59, f58 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f61, f60 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; LDFD f16 = [AOFFSET], -2 * SIZE ;; LDFPD f18, f17 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFPD f20, f19 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFD f21 = [AOFFSET] ;; FMPY f71 = f71, f32 FMPY f79 = f79, f32 ;; FNMA f70 = f71, f33, f70 FNMA f78 = f79, f33, f78 ;; FNMA f69 = f71, f34, f69 FNMA f77 = f79, f34, f77 ;; FNMA f68 = f71, f35, f68 FNMA f76 = f79, f35, f76 ;; FNMA f67 = f71, f36, f67 FNMA f75 = f79, f36, f75 ;; FNMA f66 = f71, f37, f66 FNMA f74 = f79, f37, f74 ;; FNMA f65 = f71, f38, f65 FNMA f73 = f79, f38, f73 ;; FNMA f64 = f71, f39, f64 FNMA f72 = f79, f39, f72 ;; FMPY f70 = f70, f40 FMPY f78 = f78, f40 ;; FNMA f69 = f70, f41, f69 FNMA f77 = f78, f41, f77 ;; FNMA f68 = f70, f42, f68 FNMA f76 = f78, f42, f76 ;; FNMA f67 = f70, f43, f67 FNMA f75 = f78, f43, f75 ;; FNMA f66 = f70, f44, f66 FNMA f74 = f78, f44, f74 ;; FNMA f65 = f70, f45, f65 FNMA f73 = f78, f45, f73 ;; FNMA f64 = f70, f46, f64 FNMA f72 = f78, f46, f72 ;; FMPY f69 = f69, f47 FMPY f77 = f77, f47 ;; FNMA f68 = f69, f48, f68 FNMA f76 = f77, f48, f76 ;; FNMA f67 = f69, f49, f67 FNMA f75 = f77, f49, f75 ;; FNMA f66 = f69, f50, f66 FNMA f74 = f77, f50, f74 ;; FNMA f65 = f69, f51, f65 FNMA f73 = f77, f51, f73 ;; FNMA f64 = f69, f52, f64 FNMA f72 = f77, f52, f72 ;; FMPY f68 = f68, f53 FMPY f76 = f76, f53 ;; FNMA f67 = f68, f54, f67 FNMA f75 = f76, f54, f75 ;; FNMA f66 = f68, f55, f66 FNMA f74 = f76, f55, f74 ;; FNMA f65 = f68, f56, f65 FNMA f73 = f76, f56, f73 ;; FNMA f64 = f68, f57, f64 FNMA f72 = f76, f57, f72 ;; FMPY f67 = f67, f58 FMPY f75 = f75, f58 ;; FNMA f66 = f67, f59, f66 FNMA f74 = f75, f59, f74 ;; FNMA f65 = f67, f60, f65 FNMA f73 = f75, f60, f73 ;; FNMA f64 = f67, f61, f64 FNMA f72 = f75, f61, f72 ;; FMPY f66 = f66, f16 FMPY f74 = f74, f16 ;; FNMA f65 = f66, f17, f65 FNMA f73 = f74, f17, f73 ;; FNMA f64 = f66, f18, f64 FNMA f72 = f74, f18, f72 ;; FMPY f65 = f65, f19 FMPY f73 = f73, f19 ;; FNMA f64 = f65, f20, f64 FNMA f72 = f73, f20, f72 ;; FMPY f64 = f64, f21 FMPY f72 = f72, f21 ;; adds BOFFSET = 8 * SIZE, BOFFSET adds BOFFSET2 = 8 * SIZE, BOFFSET2 ;; STFD [BOFFSET] = f68, SIZE STFD [BOFFSET2] = f70, SIZE ;; STFD [BOFFSET] = f76, 
SIZE STFD [BOFFSET2] = f78, SIZE ;; STFD [BOFFSET] = f69, SIZE STFD [BOFFSET2] = f71, SIZE ;; STFD [BOFFSET] = f77, - 11 * SIZE STFD [BOFFSET2] = f79, - 11 * SIZE ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f66, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f74, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f73, - 3 * SIZE STFD [BOFFSET2] = f75, - 3 * SIZE ;; adds C1 = -8 * SIZE, C1 adds C2 = -8 * SIZE, C2 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f40 = [AOFFSET], 1 * SIZE ;; LDFPD f41, f42 = [AOFFSET], 2 * SIZE ;; LDFPD f43, f44 = [AOFFSET], 2 * SIZE ;; LDFPD f45, f46 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f47, f48 = [AOFFSET], 2 * SIZE ;; LDFPD f49, f50 = [AOFFSET], 2 * SIZE ;; LDFPD f51, f52 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f53 = [AOFFSET], 1 * SIZE ;; LDFPD f54, f55 = [AOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [AOFFSET] adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f58, f59 = [AOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [AOFFSET] adds AOFFSET = 7 * SIZE, AOFFSET ;; LDFD f16 = [AOFFSET], 1 * SIZE ;; LDFPD f17, f18 = [AOFFSET] adds AOFFSET = 8 * SIZE, AOFFSET ;; LDFPD f19, f20 = [AOFFSET] adds AOFFSET = 9 * SIZE, AOFFSET ;; LDFD f21 = [AOFFSET] adds AOFFSET = -63 * SIZE, AOFFSET ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 ;; FNMA f65 = f64, f33, f65 FNMA f73 = f72, f33, f73 ;; FNMA f66 = f64, f34, f66 FNMA f74 = f72, f34, f74 ;; FNMA f67 = f64, f35, f67 FNMA f75 = f72, f35, f75 ;; FNMA f68 = f64, f36, f68 FNMA f76 = f72, f36, f76 ;; FNMA f69 = f64, f37, f69 FNMA f77 = f72, f37, f77 ;; FNMA f70 = f64, f38, f70 FNMA f78 = f72, f38, f78 ;; FNMA f71 = f64, f39, f71 FNMA f79 = f72, f39, f79 ;; FMPY f65 = f65, f40 FMPY f73 = f73, f40 ;; FNMA f66 = f65, f41, f66 FNMA f74 = f73, f41, f74 ;; FNMA f67 = f65, f42, f67 FNMA f75 = f73, f42, f75 ;; FNMA f68 = f65, f43, f68 FNMA f76 = f73, f43, f76 ;; FNMA f69 = f65, f44, f69 FNMA f77 = f73, f44, f77 ;; FNMA f70 = f65, f45, f70 FNMA f78 = f73, f45, f78 ;; FNMA f71 = f65, f46, f71 FNMA f79 = f73, f46, f79 ;; FMPY f66 = f66, f47 FMPY f74 = f74, f47 ;; FNMA f67 = f66, f48, f67 FNMA f75 = f74, f48, f75 ;; FNMA f68 = f66, f49, f68 FNMA f76 = f74, f49, f76 ;; FNMA f69 = f66, f50, f69 FNMA f77 = f74, f50, f77 ;; FNMA f70 = f66, f51, f70 FNMA f78 = f74, f51, f78 ;; FNMA f71 = f66, f52, f71 FNMA f79 = f74, f52, f79 ;; FMPY f67 = f67, f53 FMPY f75 = f75, f53 ;; FNMA f68 = f67, f54, f68 FNMA f76 = f75, f54, f76 ;; FNMA f69 = f67, f55, f69 FNMA f77 = f75, f55, f77 ;; FNMA f70 = f67, f56, f70 FNMA f78 = f75, f56, f78 ;; FNMA f71 = f67, f57, f71 FNMA f79 = f75, f57, f79 ;; FMPY f68 = f68, f58 FMPY f76 = f76, f58 ;; FNMA f69 = f68, f59, f69 FNMA f77 = f76, f59, f77 ;; FNMA f70 = f68, f60, f70 FNMA f78 = f76, f60, f78 ;; FNMA f71 = f68, f61, f71 FNMA f79 = f76, f61, f79 ;; FMPY f69 = f69, f16 FMPY f77 = f77, f16 ;; FNMA f70 = f69, f17, f70 FNMA f78 = f77, f17, f78 ;; FNMA f71 = f69, f18, f71 FNMA f79 = f77, f18, f79 ;; FMPY f70 = f70, f19 FMPY f78 = f78, f19 ;; FNMA f71 = f70, f20, f71 FNMA f79 = f78, f20, f79 ;; FMPY f71 = f71, f21 FMPY f79 = f79, f21 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f66, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f74, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f73, 5 * SIZE STFD [BOFFSET2] = f75, 5 * SIZE ;; STFD [BOFFSET] = f68, SIZE STFD [BOFFSET2] = 
f70, SIZE ;; STFD [BOFFSET] = f76, SIZE STFD [BOFFSET2] = f78, SIZE ;; STFD [BOFFSET] = f69, SIZE STFD [BOFFSET2] = f71, SIZE ;; STFD [BOFFSET] = f77, -11 * SIZE STFD [BOFFSET2] = f79, -11 * SIZE ;; adds C9 = 4 * SIZE, C1 ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET], -3 * SIZE ;; FMPY f64 = f64, f32 FMPY f68 = f68, f32 FMPY f65 = f65, f32 FMPY f69 = f69, f32 FMPY f66 = f66, f32 FMPY f70 = f70, f32 FMPY f67 = f67, f32 FMPY f71 = f71, f32 ;; FNMA f72 = f64, f33, f72 FNMA f76 = f68, f33, f76 FNMA f73 = f65, f33, f73 FNMA f77 = f69, f33, f77 FNMA f74 = f66, f33, f74 FNMA f78 = f70, f33, f78 FNMA f75 = f67, f33, f75 FNMA f79 = f71, f33, f79 ;; FMPY f72 = f72, f34 FMPY f76 = f76, f34 FMPY f73 = f73, f34 FMPY f77 = f77, f34 FMPY f74 = f74, f34 FMPY f78 = f78, f34 FMPY f75 = f75, f34 FMPY f79 = f79, f34 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f68, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f69, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f70, SIZE ;; STFD [AOFFSET] = f67, 5 * SIZE STFD [AOFFSET2] = f71, 5 * SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f76, SIZE ;; STFD [AOFFSET] = f73, SIZE STFD [AOFFSET2] = f77, SIZE ;; STFD [AOFFSET] = f74, SIZE STFD [AOFFSET2] = f78, SIZE ;; STFD [AOFFSET] = f75, -11 * SIZE STFD [AOFFSET2] = f79, -11 * SIZE ;; #endif #ifdef RT adds BOFFSET = 2 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET] ;; FMPY f72 = f72, f32 FMPY f76 = f76, f32 FMPY f73 = f73, f32 FMPY f77 = f77, f32 FMPY f74 = f74, f32 FMPY f78 = f78, f32 FMPY f75 = f75, f32 FMPY f79 = f79, f32 ;; FNMA f64 = f72, f33, f64 FNMA f68 = f76, f33, f68 FNMA f65 = f73, f33, f65 FNMA f69 = f77, f33, f69 FNMA f66 = f74, f33, f66 FNMA f70 = f78, f33, f70 FNMA f67 = f75, f33, f67 FNMA f71 = f79, f33, f71 ;; FMPY f64 = f64, f34 FMPY f68 = f68, f34 FMPY f65 = f65, f34 FMPY f69 = f69, f34 FMPY f66 = f66, f34 FMPY f70 = f70, f34 FMPY f67 = f67, f34 FMPY f71 = f71, f34 ;; adds AOFFSET = 8 * SIZE, AOFFSET adds AOFFSET2 = 8 * SIZE, AOFFSET2 ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f76, SIZE ;; STFD [AOFFSET] = f73, SIZE STFD [AOFFSET2] = f77, SIZE ;; STFD [AOFFSET] = f74, SIZE STFD [AOFFSET2] = f78, SIZE ;; STFD [AOFFSET] = f75, - 11 * SIZE STFD [AOFFSET2] = f79, - 11 * SIZE ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f68, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f69, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f70, SIZE ;; STFD [AOFFSET] = f67, - 3 * SIZE STFD [AOFFSET2] = f71, - 3 * SIZE ;; #endif adds C9 = 4 * SIZE, C1 ;; { .mmf STFD [C1 ] = f64, SIZE STFD [C9 ] = f68, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE STFD [C9 ] = f69, SIZE adds C10 = 4 * SIZE, C2 } ;; { .mmi STFD [C1 ] = f66, SIZE STFD [C9 ] = f70, SIZE } ;; { .mmi #ifndef LN STFD [C1 ] = f67, 5 * SIZE #else STFD [C1 ] = f67, - 3 * SIZE #endif STFD [C9 ] = f71 adds C11 = 4 * SIZE, C3 } ;; { .mmf STFD [C2 ] = f72, SIZE STFD [C10] = f76, SIZE mov f72 = f0 } ;; { .mmi STFD [C2 ] = f73, SIZE STFD [C10] = f77, SIZE } ;; { .mmi STFD [C2 ] = f74, SIZE STFD [C10] = f78, SIZE adds C12 = 4 * SIZE, C4 } ;; { .mmi #ifndef LN STFD [C2 ] = f75, 5 * SIZE #else STFD [C2 ] = f75, - 3 * SIZE #endif STFD [C10] = f79 } ;; { .mmf cmp.ne p6, p0 = 1, I } ;; adds I = -1, I ;; { .mmi shladd r2 = K, BASE_SHIFT, r0 } ;; { .mmi sub L = K, KK } ;; { .mmi #ifdef RT shladd AORIG = r2, 3, AORIG #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop 
__LINE__ #endif } ;; ;; { .mmi #if defined(LT) || defined(RN) shladd AOFFSET = L, 3, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd BOFFSET = L, 1, BOFFSET #else nop __LINE__ #endif } ;; { .mmi #ifdef LT adds KK = 8, KK #elif defined LN adds KK = -8, KK #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; mov f64 = f0 mov f65 = f0 mov f66 = f0 mov f67 = f0 mov f72 = f0 mov f73 = f0 mov f74 = f0 mov f75 = f0 (p6) br.cond.dptk .L092 ;; .align 8 .L129: #ifdef LN shladd KK8 = K, BASE_SHIFT, r0 ;; shladd B = KK8, 1, B #endif #if defined(LT) || defined(RN) mov B = BOFFSET #endif #ifdef RN adds KK = 2, KK #endif #ifdef RT adds KK = -2, KK #endif ;; mov AOFFSET = A ;; .align 16 .L130: tbit.z p6, p0 = N, 0 (p6) br.cond.dpnt .L999 ;; #ifdef RT { .mmi nop __LINE__ shl r2 = K, BASE_SHIFT } ;; { .mmi sub B = B, r2 sub C = C, LDC nop __LINE__ } #endif ;; mov f64 = f0 mov f65 = f0 mov f66 = f0 mov f67 = f0 mov f68 = f0 mov f69 = f0 mov f70 = f0 mov f71 = f0 ;; { .mfi mov C1 = C // coffset1 = c + 0 * ldc #ifdef LN add KK = M, OFFSET #elif defined LT mov KK = OFFSET #else nop __LINE__ #endif } ;; { .mmf #if defined(LN) || defined(RT) mov AORIG = A #else mov AOFFSET = A #endif } ;; { .mfi #ifndef RT add C = C, LDC // coffset += 8 * ldc #else nop __LINE__ #endif #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; .L160: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L150 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 0 + BASE_SHIFT } ;; shladd r3 = KK, BASE_SHIFT, r0 ;; #if defined(LT) || defined(RN) { .mmi (p7) LDFD f48 = [BOFFSET], 1 * SIZE nop __LINE__ adds L = 1, L } ;; #else { .mmi shladd BOFFSET = KK, BASE_SHIFT, B nop __LINE__ #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mmi (p7) LDFD f48 = [BOFFSET], 1 * SIZE adds L = 1, L add AOFFSET = r3, AORIG } ;; #endif ;; { .mii tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi cmp.eq p6, p0 = 0, L adds L = -1, L cmp.eq p3, p0 = r0, r0 } ;; { .mib (p7) LDFD f32 = [AOFFSET], 1 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L168 } ;; .align 8 .L162: { .mmf cmp.ne p4, p5 = 0, L (p12) cmp.ne p3, p0 = 0, L FMA f64 = f32, f48, f64 // A1 * B1 } ;; { .mmi (p3) LDFD f56 = [BOFFSET], 1 * SIZE (p3) LDFD f40 = [AOFFSET], 1 * SIZE nop __LINE__ } ;; { .mmi (p4) LDFD f32 = [AOFFSET], 1 * SIZE nop __LINE__ adds L = -1, L } { .mfb (p4) LDFD f48 = [BOFFSET], 1 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 br.cloop.sptk.few .L162 } ;; .align 8 .L168: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -1, KK #else adds r2 = -1, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; add AOFFSET = r2, AORIG add BOFFSET = r2, B ;; #endif #if defined(LN) || defined(LT) { .mmi LDFD f32 = [BOFFSET] LDFD f33 = [AOFFSET] #ifdef LN adds C1 = -1 * SIZE, C1 #else nop __LINE__ #endif } ;; #else { .mmi LDFD f32 = [AOFFSET] LDFD f33 = [BOFFSET] nop __LINE__ } ;; #endif { .mmf sub L = K, KK #ifdef RT shladd AORIG = K, BASE_SHIFT, AORIG #else nop __LINE__ #endif FSUB f64 = f32, f64 } ;; #ifdef LT adds KK = 1, KK #elif defined LN adds KK = -1, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; FMPY f64 = f64, f33 ;; #if defined(LN) || defined(LT) { .mmf STFD [BOFFSET] = f64 #ifndef LN STFD [C1 ] = f64, SIZE #else STFD [C1 ] = f64 #endif mov f64 = f0 } ;; #else { .mmf STFD [AOFFSET] = f64 STFD [C1 ] = f64, SIZE mov f64 = f0 } ;; 
#endif #if defined(LT) || defined(RN) shladd AOFFSET = L, BASE_SHIFT, AOFFSET #else nop __LINE__ #endif #if defined(LT) || defined(RN) shladd BOFFSET = L, BASE_SHIFT, BOFFSET #else nop __LINE__ #endif ;; .align 8 .L150: tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L140 ;; { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 1 + BASE_SHIFT } ;; shladd r3 = KK, BASE_SHIFT, r0 ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFD f48 = [BOFFSET], 1 * SIZE } ;; #else { .mfi shladd BOFFSET = KK, BASE_SHIFT, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFD f48 = [BOFFSET], 1 * SIZE shladd AOFFSET = r3, 1, AORIG } ;; #endif { .mfi adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mfi tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mmf adds L = -1, L } ;; { .mmf cmp.eq p6, p0 = -1, L } ;; (p7) LDFD f32 = [AOFFSET], SIZE ;; (p7) LDFD f33 = [AOFFSET], SIZE ;; ;; { .mib mov ar.lc = L (p6) br.cond.dpnt .L158 } ;; .L152: { .mfi cmp.ne p4, p5 = 0, L FMA f64 = f32, f48, f64 // A1 * B1 (p12) cmp.ne p3, p0 = 0, L } ;; { .mmf (p3) LDFD f56 = [BOFFSET], 1 * SIZE (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } ;; { .mfi (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 adds L = -1, L } ;; { .mfb (p4) LDFD f48 = [BOFFSET], 1 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 br.cloop.sptk.few .L152 } ;; .L158: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -2, KK #else adds r2 = -1, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 1, AORIG add BOFFSET = r2, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET] ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 ;; #else LDFPD f32, f33 = [AOFFSET] ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 ;; #endif #ifdef LN adds AOFFSET = 2 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f34 = [AOFFSET] ;; FMPY f65 = f65, f32 ;; FNMA f64 = f65, f33, f64 ;; FMPY f64 = f64, f34 ;; STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f65, - SIZE ;; adds C1 = -2 * SIZE, C1 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f34 = [AOFFSET], - 3 * SIZE ;; FMPY f64 = f64, f32 ;; FNMA f65 = f64, f33, f65 ;; FMPY f65 = f65, f34 ;; STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f65, -SIZE ;; #endif #ifdef RN LDFD f32 = [BOFFSET] ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, - SIZE ;; #endif #ifdef RT LDFD f32 = [BOFFSET] ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, - SIZE ;; #endif STFD [C1 ] = f64, SIZE ;; #ifndef LN STFD [C1 ] = f65, SIZE #else STFD [C1 ] = f65, -SIZE #endif ;; mov f64 = f0 mov f65 = f0 ;; shladd r2 = K, BASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 1, AORIG #else nop __LINE__ #endif ;; { .mmi #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd AOFFSET = L, 1, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) add BOFFSET = L, BOFFSET #else nop __LINE__ #endif } ;; { .mmi #ifdef LT adds KK = 2, KK #elif defined LN adds KK = -2, KK #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } 
;; .align 8 .L140: tbit.z p6, p7 = M, 2 (p6) br.cond.dptk .L131 ;; { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 2 + BASE_SHIFT } ;; shladd r3 = KK, BASE_SHIFT, r0 ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFD f48 = [BOFFSET], 1 * SIZE mov f65 = f0 } ;; #else { .mfi shladd BOFFSET = KK, BASE_SHIFT, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFD f48 = [BOFFSET], 1 * SIZE shladd AOFFSET = r3, 2, AORIG } ;; #endif { .mfi adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mfi tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mfi adds L = -1, L } ;; { .mfi cmp.eq p6, p0 = -1, L } ;; { .mmf (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE } { .mfi mov ar.lc = L } ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE } { .mfb (p6) br.cond.dpnt .L148 } ;; .L142: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f65 = f33, f48, f65 // A2 * B1 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 (p5) adds C9 = 2 * SIZE, C1 } { .mmf nop __LINE__ (p3) LDFD f56 = [BOFFSET], 1 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 } ;; { .mfi (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 (p5) adds C10 = 2 * SIZE, C2 } { .mfb nop __LINE__ (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mmf (p4) LDFD f48 = [BOFFSET], 1 * SIZE nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 } ;; { .mfi (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE nop __LINE__ adds L = -1, L } { .mfb nop __LINE__ nop.f 0 br.cloop.sptk.few .L142 } ;; .L148: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -4, KK #else adds r2 = -1, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 2, AORIG add BOFFSET = r2, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET] adds BOFFSET = -2 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = -2 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 ;; #endif #ifdef LN adds AOFFSET = 14 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f35, f34 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f36 = [AOFFSET], - 2 * SIZE ;; LDFPD f38, f37 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f40, f39 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET] ;; FMPY f67 = f67, f32 ;; FNMA f66 = f67, f33, f66 ;; FNMA f65 = f67, f34, f65 ;; FNMA f64 = f67, f35, f64 ;; FMPY f66 = f66, f36 ;; FNMA f65 = f66, f37, f65 ;; FNMA f64 = f66, f38, f64 ;; FMPY f65 = f65, f39 ;; FNMA f64 = f65, f40, f64 ;; FMPY f64 = f64, f41 ;; STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f65, SIZE ;; STFD [BOFFSET] = f66, SIZE ;; STFD [BOFFSET] = f67, -3 * SIZE ;; adds C1 = -4 * SIZE, C1 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f36 = [AOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET 
;; LDFPD f39, f40 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET], -15 * SIZE ;; FMPY f64 = f64, f32 ;; FNMA f65 = f64, f33, f65 ;; FNMA f66 = f64, f34, f66 ;; FNMA f67 = f64, f35, f67 ;; FMPY f65 = f65, f36 ;; FNMA f66 = f65, f37, f66 ;; FNMA f67 = f65, f38, f67 ;; FMPY f66 = f66, f39 ;; FNMA f67 = f66, f40, f67 ;; FMPY f67 = f67, f41 ;; STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f65, SIZE ;; STFD [BOFFSET] = f66, SIZE ;; STFD [BOFFSET] = f67, -3 * SIZE ;; #endif #ifdef RN LDFD f32 = [BOFFSET] ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 FMPY f66 = f66, f32 FMPY f67 = f67, f32 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, SIZE ;; STFD [AOFFSET] = f66, SIZE ;; STFD [AOFFSET] = f67, -3 * SIZE ;; #endif #ifdef RT LDFD f32 = [BOFFSET] ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 FMPY f66 = f66, f32 FMPY f67 = f67, f32 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, SIZE ;; STFD [AOFFSET] = f66, SIZE ;; STFD [AOFFSET] = f67, - 3 * SIZE ;; #endif { .mmf STFD [C1 ] = f64, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE } ;; { .mmi STFD [C1 ] = f66, SIZE } ;; { .mmi #ifndef LN STFD [C1 ] = f67, SIZE #else STFD [C1 ] = f67, - 3 * SIZE #endif } ;; { .mmf mov f72 = f0 } ;; mov f65 = f0 mov f73 = f0 mov f66 = f0 mov f74 = f0 mov f67 = f0 mov f75 = f0 ;; shladd r2 = K, BASE_SHIFT, r0 ;; { .mmi sub L = K, KK } ;; { .mmi #ifdef RT shladd AORIG = r2, 2, AORIG #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd AOFFSET = L, 2, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) add BOFFSET = L, BOFFSET #else nop __LINE__ #endif } ;; { .mmi #ifdef LT adds KK = 4, KK #elif defined LN adds KK = -4, KK #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; .align 8 .L131: #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; shr I = M, 3 ;; cmp.eq p6, p7 = 0, I (p6) br.cond.dpnt .L169 ;; .align 16 .L132: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 3 + BASE_SHIFT } ;; shladd r3 = KK, BASE_SHIFT, r0 ;; #if defined(LT) || defined(RN) { .mmi (p7) LDFD f48 = [BOFFSET], 1 * SIZE nop __LINE__ nop __LINE__ } ;; #else { .mfi shladd BOFFSET = KK, BASE_SHIFT, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFD f48 = [BOFFSET], 1 * SIZE shladd AOFFSET = r3, 3, AORIG } ;; #endif (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE } ;; { .mmf (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE } { .mfi cmp.eq p3, p0 = r0, r0 } ;; { .mmf (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE } { .mfi adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mmf CPREFETCH [PREC] } { .mfi adds L = 1, L } ;; { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } ;; { .mfi adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET } ;; { .mfi tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mfi adds L = -1, L } ;; { .mfi mov ar.lc = L } ;; mov f64 = f0 mov f65 = f0 mov f66 = f0 mov f67 = f0 mov f68 = f0 mov f69 = f0 mov f70 = f0 mov f71 = f0 ;; { .mfb cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L138 } ;; .align 16 .L133: { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET FMA f65 = f33, f48, f65 // A2 * B1 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 adds 
C9 = 4 * SIZE, C1 } { .mmf (p3) LDFD f56 = [BOFFSET], 1 * SIZE nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mmf (p4) LDFD f48 = [BOFFSET], 1 * SIZE nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } ;; { .mfi (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f70 = f46, f56, f70 // A7 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE nop __LINE__ br.cloop.sptk.few .L133 } ;; .L138: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -8, KK #else adds r2 = -1, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 3, AORIG add BOFFSET = r2, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f68 = f36, f68 FSUB f69 = f37, f69 FSUB f70 = f38, f70 FSUB f71 = f39, f71 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f68 = f36, f68 FSUB f69 = f37, f69 FSUB f70 = f38, f70 FSUB f71 = f39, f71 ;; #endif #ifdef LN adds AOFFSET = 62 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f35, f34 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f37, f36 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f39, f38 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f40 = [AOFFSET], -2 * SIZE ;; LDFPD f42, f41 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f44, f43 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f46, f45 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f48, f47 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f50, f49 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f52, f51 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFD f53 = [AOFFSET], -2 * SIZE ;; LDFPD f55, f54 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f57, f56 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; LDFPD f59, f58 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f61, f60 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; LDFD f16 = [AOFFSET], -2 * SIZE ;; LDFPD f18, f17 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFPD f20, f19 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFD f21 = [AOFFSET] ;; FMPY f71 = f71, f32 ;; FNMA f70 = 
f71, f33, f70 ;; FNMA f69 = f71, f34, f69 ;; FNMA f68 = f71, f35, f68 ;; FNMA f67 = f71, f36, f67 ;; FNMA f66 = f71, f37, f66 ;; FNMA f65 = f71, f38, f65 ;; FNMA f64 = f71, f39, f64 ;; FMPY f70 = f70, f40 ;; FNMA f69 = f70, f41, f69 ;; FNMA f68 = f70, f42, f68 ;; FNMA f67 = f70, f43, f67 ;; FNMA f66 = f70, f44, f66 ;; FNMA f65 = f70, f45, f65 ;; FNMA f64 = f70, f46, f64 ;; FMPY f69 = f69, f47 ;; FNMA f68 = f69, f48, f68 ;; FNMA f67 = f69, f49, f67 ;; FNMA f66 = f69, f50, f66 ;; FNMA f65 = f69, f51, f65 ;; FNMA f64 = f69, f52, f64 ;; FMPY f68 = f68, f53 ;; FNMA f67 = f68, f54, f67 ;; FNMA f66 = f68, f55, f66 ;; FNMA f65 = f68, f56, f65 ;; FNMA f64 = f68, f57, f64 ;; FMPY f67 = f67, f58 ;; FNMA f66 = f67, f59, f66 ;; FNMA f65 = f67, f60, f65 ;; FNMA f64 = f67, f61, f64 ;; FMPY f66 = f66, f16 ;; FNMA f65 = f66, f17, f65 ;; FNMA f64 = f66, f18, f64 ;; FMPY f65 = f65, f19 ;; FNMA f64 = f65, f20, f64 ;; FMPY f64 = f64, f21 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f68, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f69, SIZE ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f70, SIZE ;; STFD [BOFFSET] = f67, - 3 * SIZE STFD [BOFFSET2] = f71, - 3 * SIZE ;; adds C1 = -8 * SIZE, C1 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f40 = [AOFFSET], 1 * SIZE ;; LDFPD f41, f42 = [AOFFSET], 2 * SIZE ;; LDFPD f43, f44 = [AOFFSET], 2 * SIZE ;; LDFPD f45, f46 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f47, f48 = [AOFFSET], 2 * SIZE ;; LDFPD f49, f50 = [AOFFSET], 2 * SIZE ;; LDFPD f51, f52 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f53 = [AOFFSET], 1 * SIZE ;; LDFPD f54, f55 = [AOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [AOFFSET] adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f58, f59 = [AOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [AOFFSET] adds AOFFSET = 7 * SIZE, AOFFSET ;; LDFD f16 = [AOFFSET], 1 * SIZE ;; LDFPD f17, f18 = [AOFFSET] adds AOFFSET = 8 * SIZE, AOFFSET ;; LDFPD f19, f20 = [AOFFSET] adds AOFFSET = 9 * SIZE, AOFFSET ;; LDFD f21 = [AOFFSET] adds AOFFSET = -63 * SIZE, AOFFSET ;; FMPY f64 = f64, f32 ;; FNMA f65 = f64, f33, f65 ;; FNMA f66 = f64, f34, f66 ;; FNMA f67 = f64, f35, f67 ;; FNMA f68 = f64, f36, f68 ;; FNMA f69 = f64, f37, f69 ;; FNMA f70 = f64, f38, f70 ;; FNMA f71 = f64, f39, f71 ;; FMPY f65 = f65, f40 ;; FNMA f66 = f65, f41, f66 ;; FNMA f67 = f65, f42, f67 ;; FNMA f68 = f65, f43, f68 ;; FNMA f69 = f65, f44, f69 ;; FNMA f70 = f65, f45, f70 ;; FNMA f71 = f65, f46, f71 ;; FMPY f66 = f66, f47 ;; FNMA f67 = f66, f48, f67 ;; FNMA f68 = f66, f49, f68 ;; FNMA f69 = f66, f50, f69 ;; FNMA f70 = f66, f51, f70 ;; FNMA f71 = f66, f52, f71 ;; FMPY f67 = f67, f53 ;; FNMA f68 = f67, f54, f68 ;; FNMA f69 = f67, f55, f69 ;; FNMA f70 = f67, f56, f70 ;; FNMA f71 = f67, f57, f71 ;; FMPY f68 = f68, f58 ;; FNMA f69 = f68, f59, f69 ;; FNMA f70 = f68, f60, f70 ;; FNMA f71 = f68, f61, f71 ;; FMPY f69 = f69, f16 ;; FNMA f70 = f69, f17, f70 ;; FNMA f71 = f69, f18, f71 ;; FMPY f70 = f70, f19 ;; FNMA f71 = f70, f20, f71 ;; FMPY f71 = f71, f21 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f68, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f69, SIZE ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f70, SIZE ;; STFD [BOFFSET] = f67, -3 * SIZE STFD [BOFFSET2] = f71, -3 * SIZE ;; adds C9 = 4 * SIZE, C1 ;; #endif #ifdef RN LDFD f32 = [BOFFSET] ;; FMPY f64 = f64, f32 FMPY f68 = f68, f32 FMPY f65 = f65, f32 FMPY f69 = f69, f32 FMPY f66 = 
f66, f32 FMPY f70 = f70, f32 FMPY f67 = f67, f32 FMPY f71 = f71, f32 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f68, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f69, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f70, SIZE ;; STFD [AOFFSET] = f67, -3 * SIZE STFD [AOFFSET2] = f71, -3 * SIZE ;; #endif #ifdef RT LDFD f32 = [BOFFSET] ;; FMPY f64 = f64, f32 FMPY f68 = f68, f32 FMPY f65 = f65, f32 FMPY f69 = f69, f32 FMPY f66 = f66, f32 FMPY f70 = f70, f32 FMPY f67 = f67, f32 FMPY f71 = f71, f32 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f68, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f69, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f70, SIZE ;; STFD [AOFFSET] = f67, -3 * SIZE STFD [AOFFSET2] = f71, -3 * SIZE ;; #endif adds C9 = 4 * SIZE, C1 ;; { .mmf STFD [C1 ] = f64, SIZE STFD [C9 ] = f68, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE STFD [C9 ] = f69, SIZE } ;; { .mmi STFD [C1 ] = f66, SIZE STFD [C9 ] = f70, SIZE } ;; { .mmi #ifndef LN STFD [C1 ] = f67, 5 * SIZE #else STFD [C1 ] = f67, - 3 * SIZE #endif STFD [C9 ] = f71 } ;; { .mmf cmp.ne p6, p0 = 1, I } ;; adds I = -1, I ;; { .mmi shladd r2 = K, BASE_SHIFT, r0 } ;; { .mmi sub L = K, KK } ;; { .mmi #ifdef RT shladd AORIG = r2, 3, AORIG #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; ;; { .mmi #if defined(LT) || defined(RN) shladd AOFFSET = L, 3, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) add BOFFSET = L, BOFFSET #else nop __LINE__ #endif } ;; { .mmi #ifdef LT adds KK = 8, KK #elif defined LN adds KK = -8, KK #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; mov f64 = f0 mov f65 = f0 mov f66 = f0 mov f67 = f0 mov f68 = f0 mov f69 = f0 mov f70 = f0 mov f71 = f0 (p6) br.cond.dptk .L132 .align 8 .L169: { .mii #ifdef LN shladd B = K, BASE_SHIFT, B #elif defined(LT) || defined(RN) mov B = BOFFSET #else nop __LINE__ #endif #ifdef RN adds KK = 1, KK #elif defined RT adds KK = -1, KK #else nop __LINE__ #endif mov AOFFSET = A } ;; .align 16 .L999: mov r8 = r0 adds r9 = 1 * 16, SP ;; ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 ;; mov ar.lc = ARLC ;; mov pr = PR, -1 ;; mov ar.pfs = ARPFS ;; br.ret.sptk.many b0 EPILOGUE OpenBLAS-0.2.20/kernel/ia64/trsm_kernel_LT.S000066400000000000000000005374661313527062700202640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef DOUBLE #define PREFETCHSIZE (16 * 8) #else #define PREFETCHSIZE (32 * 8) #endif #define CPREFETCHSIZE 7 #define CPREFETCH lfetch.excl.nt1 #define M r32 #define N r33 #define K r34 #define A r36 #define B r37 #define C r38 #define LDC r39 #define I r15 #define J r16 #define AOFFSET r17 #define BOFFSET r18 #define TEMP r19 #define L r20 #define C1 r21 #define C2 r22 #define C3 r23 #define C4 r24 #define C5 r25 #define C6 r26 #define C7 r27 #define C8 r28 #define C9 loc0 #define C10 loc1 #define C11 loc2 #define C12 loc3 #define C13 loc4 #define C14 loc5 #define C15 loc6 #define C16 loc7 #define PREA r8 #define PREB r9 #define PREC r10 #define SP r12 #define ARLC r29 #define PR r30 #define ARPFS r31 #define ALPHA f8 #define AORIG loc8 #define KK loc9 #define KK8 loc10 #define OFFSET loc11 #define AOFFSET2 loc12 #define BOFFSET2 loc13 PROLOGUE .prologue PROFCODE { .mmi .save ar.pfs, ARPFS alloc ARPFS = ar.pfs, 8, 16, 0, 0 adds r14 = 16, SP mov ARLC = ar.lc } { .mmi adds r8 = -6 * 16, SP adds r9 = -5 * 16, SP adds SP = -6 * 16, SP } ;; { .mmi ld8 OFFSET = [r14] mov AOFFSET = A mov PR = pr } ;; { .mmi stf.spill [r8] = f16, 32 stf.spill [r9] = f17, 32 shr J = N, 3 } ;; { .mmi stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 shladd LDC = LDC, BASE_SHIFT, r0 } ;; .body { .mmi stf.spill [r8] = f20 stf.spill [r9] = f21 cmp.ge p6, p0 = 0, J } { .mib nop __LINE__ #ifdef RN sub KK = r0, OFFSET #else nop __LINE__ #endif (p6) br.cond.dpnt .L050 } ;; .align 8 .L010: { .mfi adds J = -1, J mov f64 = f0 shr I = M, 3 } { .mfi mov C1 = C // coffset1 = c + 0 * ldc mov f72 = f0 #ifdef LT mov KK = OFFSET #else nop __LINE__ #endif } ;; { .mmf cmp.eq p6, p7 = 0, I mov AOFFSET = A mov f80 = f0 } { .mmf add C2 = LDC, C // coffset2 = c + 1 * ldc shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc mov f88 = f0 } ;; { .mmf shladd C5 = LDC, 2, C // coffset5 = c + 4 * ldc shladd C = LDC, 3, C // coffset += 8 * ldc mov f96 = f0 } { .mmf shladd C4 = LDC, 1, C2 shladd C6 = LDC, 2, C2 mov f104 = f0 } ;; { .mfi shladd C7 = LDC, 2, C3 mov f112 = f0 mov L = KK }{ .mfb shladd C8 = LDC, 2, C4 mov f120 = f0 (p6) br.cond.dpnt .L020 } ;; .align 16 .L011: { .mmf cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B mov f65 = f0 } ;; { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE setf.d f73 = r0 mov f81 = f0 } ;; { .mmf (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE setf.d f119 = r0 mov f89 = f0 } { .mmf (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE setf.d f97 = r0 mov f105 = f0 } ;; { .mmf (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE setf.d f113 = r0 mov f121 = f0 } ;; { .mmf (p7) LDFPD f54, f55 = [BOFFSET], 2 * 
SIZE setf.d f66 = r0 mov f74 = f0 } { .mfi setf.d f82 = r0 mov f90 = f0 nop __LINE__ } ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE setf.d f98 = r0 mov f106 = f0 } { .mfi setf.d f114 = r0 mov f122 = f0 adds L = 1, L } ;; { .mmf (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE setf.d f67 = r0 mov f75 = f0 } { .mfi setf.d f83 = r0 mov f91 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mmf (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE setf.d f99 = r0 mov f107 = f0 } { .mfi setf.d f115 = r0 mov f123 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f68 = r0 mov f76 = f0 } { .mfi setf.d f84 = r0 mov f92 = f0 adds AOFFSET2 = 4 * SIZE, AOFFSET } ;; { .mmf CPREFETCH [PREC], LDC setf.d f100 = r0 mov f108 = f0 } { .mfi setf.d f116 = r0 mov f124 = f0 adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } ;; { .mmf CPREFETCH [PREC], LDC setf.d f69 = r0 mov f77 = f0 } { .mfi setf.d f85 = r0 mov f93 = f0 adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET } ;; { .mmf CPREFETCH [PREC], LDC setf.d f101 = r0 mov f109 = f0 } { .mfi setf.d f117 = r0 mov f125 = f0 tbit.z p12, p0 = L, 0 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f70 = r0 mov f78 = f0 } { .mfi setf.d f86 = r0 mov f94 = f0 shr L = L, 1 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f102 = r0 mov f110 = f0 } { .mfi setf.d f118 = r0 mov f126 = f0 adds L = -1, L } ;; { .mmf CPREFETCH [PREC], LDC setf.d f71 = r0 mov f79 = f0 } { .mfi setf.d f87 = r0 mov f95 = f0 mov ar.lc = L } ;; { .mmf CPREFETCH [PREC] setf.d f103 = r0 mov f111 = f0 } { .mfb cmp.eq p6, p0 = -1, L mov f127 = f0 (p6) br.cond.dpnt .L018 } ;; .align 16 .L012: /* 1 */ { .mfi lfetch.fault.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi (p12) cmp.ne p3, p0 = 0, L FMA f72 = f32, f49, f72 // A1 * B2 nop __LINE__ } ;; /* 2 */ { .mfb lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfb cmp.ne p4, p5 = 0, L FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; /* 3 */ { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb adds C9 = 4 * SIZE, C1 FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;; /* 4 */ { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb adds C10 = 4 * SIZE, C2 FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;; /* 5 */ { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb adds C11 = 4 * SIZE, C3 FMA f73 = f33, f49, f73 // A2 * B2 nop __LINE__ } ;; /* 6 */ { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb adds C12 = 4 * SIZE, C4 FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; /* 7 */ { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb adds C13 = 4 * SIZE, C5 FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } ;; /* 8 */ { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb adds C14 = 4 * SIZE, C6 FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;; /* 9 */ { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb adds C15 = 4 * SIZE, C7 FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; /* 10 */ { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb adds C16 = 4 * SIZE, C8 FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; /* 11 */ { .mfb FMA f98 = f34, f52, f98 // A3 * B5 nop 
__LINE__ } { .mfb nop __LINE__ FMA f106 = f34, f53, f106 // A3 * B6 nop __LINE__ } ;; /* 12 */ { .mfb FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f122 = f34, f55, f122 // A3 * B8 nop __LINE__ } ;; /* 13 */ { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; /* 14 */ { .mfb FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } ;; /* 15 */ { .mfb FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f107 = f35, f53, f107 // A4 * B6 nop __LINE__ } ;; /* 16 */ { .mfb FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f123 = f35, f55, f123 // A4 * B8 nop __LINE__ } ;; /* 17 */ { .mfb nop __LINE__ FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f76 = f36, f49, f76 // A5 * B2 nop __LINE__ } ;; /* 18 */ { .mfb nop __LINE__ FMA f84 = f36, f50, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f92 = f36, f51, f92 // A5 * B4 nop __LINE__ } ;; /* 19 */ { .mfb nop __LINE__ FMA f100 = f36, f52, f100 // A5 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f108 = f36, f53, f108 // A5 * B6 nop __LINE__ } ;; /* 20 */ { .mfb nop __LINE__ FMA f116 = f36, f54, f116 // A5 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f124 = f36, f55, f124 // A5 * B8 nop __LINE__ } ;; /* 21 */ { .mfb nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = f37, f49, f77 // A6 * B2 nop __LINE__ } ;; /* 22 */ { .mfb nop __LINE__ FMA f85 = f37, f50, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f93 = f37, f51, f93 // A6 * B4 nop __LINE__ } ;; /* 23 */ { .mfb nop __LINE__ FMA f101 = f37, f52, f101 // A6 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f109 = f37, f53, f109 // A6 * B6 nop __LINE__ } ;; /* 24 */ { .mfb nop __LINE__ FMA f117 = f37, f54, f117 // A6 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f125 = f37, f55, f125 // A6 * B8 nop __LINE__ } ;; /* 25 */ { .mfb nop __LINE__ FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f78 = f38, f49, f78 // A7 * B2 nop __LINE__ } ;; /* 26 */ { .mfb nop __LINE__ FMA f86 = f38, f50, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f94 = f38, f51, f94 // A7 * B4 nop __LINE__ } ;; /* 27 */ { .mfb nop __LINE__ FMA f102 = f38, f52, f102 // A7 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f110 = f38, f53, f110 // A7 * B6 nop __LINE__ } ;; /* 28 */ { .mfb nop __LINE__ FMA f118 = f38, f54, f118 // A7 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f126 = f38, f55, f126 // A7 * B8 nop __LINE__ } ;; /* 29 */ { .mfb nop __LINE__ FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f79 = f39, f49, f79 // A8 * B2 nop __LINE__ } ;; /* 30 */ { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f87 = f39, f50, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f95 = f39, f51, f95 // A8 * B4 nop __LINE__ } ;; /* 31 */ { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f103 = f39, f52, f103 // A8 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f111 = f39, f53, f111 // A8 * B6 nop __LINE__ } ;; /* 32 */ { .mfb nop __LINE__ FMA f119 = f39, f54, f119 // A8 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f127 = f39, f55, f127 // A8 * B8 nop __LINE__ } ;; /* 33 */ { .mfb nop __LINE__ (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; /* 34 */ { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, 
f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; /* 35 */ { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; /* 36 */ { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; /* 37 */ { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; /* 38 */ { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; /* 39 */ { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; /* 40 */ { .mfb nop __LINE__ (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f121 = f41, f63, f121 // A2 * B8 nop __LINE__ } ;; /* 41 */ { .mfb nop __LINE__ (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; /* 42 */ { .mfb nop __LINE__ (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; /* 43 */ { .mfb nop __LINE__ (p3) FMA f98 = f42, f60, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f106 = f42, f61, f106 // A3 * B6 nop __LINE__ } ;; /* 44 */ { .mfb nop __LINE__ (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f122 = f42, f63, f122 // A3 * B8 nop __LINE__ } ;; /* 45 */ { .mfb nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; /* 46 */ { .mfb nop __LINE__ (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f91 = f43, f59, f91 // A4 * B4 nop __LINE__ } ;; /* 47 */ { .mfb nop __LINE__ (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f107 = f43, f61, f107 // A4 * B6 nop __LINE__ } ;; /* 48 */ { .mfb nop __LINE__ (p3) FMA f115 = f43, f62, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f123 = f43, f63, f123 // A4 * B8 nop __LINE__ } ;; /* 49 */ { .mfb nop __LINE__ (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f76 = f44, f57, f76 // A5 * B2 nop __LINE__ } ;; /* 50 */ { .mfb nop __LINE__ (p3) FMA f84 = f44, f58, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f92 = f44, f59, f92 // A5 * B4 nop __LINE__ } ;; /* 51 */ { .mfb nop __LINE__ (p3) FMA f100 = f44, f60, f100 // A5 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f108 = f44, f61, f108 // A5 * B6 nop __LINE__ } ;; /* 52 */ { .mfb nop __LINE__ (p3) FMA f116 = f44, f62, f116 // A5 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f124 = f44, f63, f124 // A5 * B8 nop __LINE__ } ;; /* 53 */ { .mfb nop __LINE__ (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f77 = f45, f57, f77 // A6 * B2 nop __LINE__ } ;; /* 54 */ { .mfb nop __LINE__ (p3) FMA f85 = f45, f58, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f93 = f45, f59, f93 // A6 * B4 nop __LINE__ } ;; /* 55 */ { .mfb nop 
__LINE__ (p3) FMA f101 = f45, f60, f101 // A6 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f109 = f45, f61, f109 // A6 * B6 nop __LINE__ } ;; /* 56 */ { .mfb nop __LINE__ (p3) FMA f117 = f45, f62, f117 // A6 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f125 = f45, f63, f125 // A6 * B8 nop __LINE__ } ;; /* 57 */ { .mfb nop __LINE__ (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f78 = f46, f57, f78 // A7 * B2 nop __LINE__ } ;; /* 58 */ { .mfb nop __LINE__ (p3) FMA f86 = f46, f58, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f94 = f46, f59, f94 // A7 * B4 nop __LINE__ } ;; /* 59 */ { .mfb nop __LINE__ (p3) FMA f102 = f46, f60, f102 // A7 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f110 = f46, f61, f110 // A7 * B6 nop __LINE__ } ;; /* 60 */ { .mfb nop __LINE__ (p3) FMA f118 = f46, f62, f118 // A7 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f126 = f46, f63, f126 // A7 * B8 nop __LINE__ } ;; /* 61 */ { .mfb nop __LINE__ (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f79 = f47, f57, f79 // A8 * B2 nop __LINE__ } ;; /* 62 */ { .mfb nop __LINE__ (p3) FMA f87 = f47, f58, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f95 = f47, f59, f95 // A8 * B4 nop __LINE__ } ;; /* 63 */ { .mfb nop __LINE__ (p3) FMA f103 = f47, f60, f103 // A8 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f111 = f47, f61, f111 // A8 * B6 nop __LINE__ } ;; /* 64 */ { .mfi nop __LINE__ (p3) FMA f119 = f47, f62, f119 // A8 * B7 adds L = -1, L } { .mfb adds AOFFSET2 = 4 * SIZE, AOFFSET (p3) FMA f127 = f47, f63, f127 // A8 * B8 br.cloop.sptk.few .L012 } ;; .L018: #ifdef LT LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [BOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [BOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [BOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [BOFFSET], 2 * SIZE ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE FSUB f64 = f32, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB f72 = f33, f72 nop __LINE__ } ;; { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE FSUB f80 = f34, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f88 = f35, f88 nop __LINE__ } ;; { .mfi LDFPD f52, f53 = [BOFFSET], 2 * SIZE FSUB f96 = f36, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f104 = f37, f104 nop __LINE__ } ;; { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE FSUB f112 = f38, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f120 = f39, f120 nop __LINE__ } ;; { .mfi LDFPD f56, f57 = [BOFFSET], 2 * SIZE FSUB f65 = f40, f65 nop __LINE__ } { .mfi nop __LINE__ FSUB f73 = f41, f73 nop __LINE__ } ;; { .mfi LDFPD f58, f59 = [BOFFSET], 2 * SIZE FSUB f81 = f42, f81 nop __LINE__ } { .mfi nop __LINE__ FSUB f89 = f43, f89 nop __LINE__ } ;; { .mfi LDFPD f60, f61 = [BOFFSET], 2 * SIZE FSUB f97 = f44, f97 nop __LINE__ } { .mfi nop __LINE__ FSUB f105 = f45, f105 nop __LINE__ } ;; { .mfi LDFPD f62, f63 = [BOFFSET], 2 * SIZE FSUB f113 = f46, f113 nop __LINE__ } { .mfi nop __LINE__ FSUB f121 = f47, f121 nop __LINE__ } ;; { .mfi LDFPD f32, f33 = [BOFFSET], 2 * SIZE FSUB f66 = f48, f66 nop __LINE__ } { .mfi nop __LINE__ FSUB f74 = f49, f74 nop __LINE__ } ;; { .mfi LDFPD f34, f35 = [BOFFSET], 2 * SIZE FSUB f82 = f50, f82 nop __LINE__ } { .mfi nop __LINE__ FSUB f90 = f51, f90 nop __LINE__ } ;; { .mfi LDFPD f36, f37 = [BOFFSET], 2 * SIZE FSUB f98 = f52, f98 nop __LINE__ } { .mfi nop __LINE__ FSUB f106 = f53, f106 nop __LINE__ } ;; { .mfi LDFPD f38, f39 = [BOFFSET], 2 * 
SIZE FSUB f114 = f54, f114 nop __LINE__ } { .mfi nop __LINE__ FSUB f122 = f55, f122 nop __LINE__ } ;; { .mfi LDFPD f40, f41 = [BOFFSET], 2 * SIZE FSUB f67 = f56, f67 nop __LINE__ } { .mfi nop __LINE__ FSUB f75 = f57, f75 nop __LINE__ } ;; { .mfi LDFPD f42, f43 = [BOFFSET], 2 * SIZE FSUB f83 = f58, f83 nop __LINE__ } { .mfi nop __LINE__ FSUB f91 = f59, f91 nop __LINE__ } ;; { .mfi LDFPD f44, f45 = [BOFFSET], 2 * SIZE FSUB f99 = f60, f99 nop __LINE__ } { .mfi nop __LINE__ FSUB f107 = f61, f107 nop __LINE__ } ;; { .mfi LDFPD f46, f47 = [BOFFSET], 2 * SIZE FSUB f115 = f62, f115 nop __LINE__ } { .mfi nop __LINE__ FSUB f123 = f63, f123 nop __LINE__ } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE FSUB f68 = f32, f68 nop __LINE__ } { .mfi nop __LINE__ FSUB f76 = f33, f76 nop __LINE__ } ;; { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE FSUB f84 = f34, f84 nop __LINE__ } { .mfi nop __LINE__ FSUB f92 = f35, f92 nop __LINE__ } ;; { .mfi LDFPD f52, f53 = [BOFFSET], 2 * SIZE FSUB f100 = f36, f100 nop __LINE__ } { .mfi nop __LINE__ FSUB f108 = f37, f108 nop __LINE__ } ;; { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE FSUB f116 = f38, f116 nop __LINE__ } { .mfi nop __LINE__ FSUB f124 = f39, f124 nop __LINE__ } ;; { .mfi LDFPD f56, f57 = [BOFFSET], 2 * SIZE FSUB f69 = f40, f69 nop __LINE__ } { .mfi nop __LINE__ FSUB f77 = f41, f77 nop __LINE__ } ;; { .mfi LDFPD f58, f59 = [BOFFSET], 2 * SIZE FSUB f85 = f42, f85 nop __LINE__ } { .mfi nop __LINE__ FSUB f93 = f43, f93 nop __LINE__ } ;; { .mfi LDFPD f60, f61 = [BOFFSET], 2 * SIZE FSUB f101 = f44, f101 nop __LINE__ } { .mfi nop __LINE__ FSUB f109 = f45, f109 nop __LINE__ } ;; { .mfi LDFPD f62, f63 = [BOFFSET] FSUB f117 = f46, f117 adds BOFFSET = -62 * SIZE, BOFFSET } { .mfi nop __LINE__ FSUB f125 = f47, f125 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f70 = f48, f70 nop __LINE__ } { .mfi nop __LINE__ FSUB f78 = f49, f78 nop __LINE__ } { .mfi nop __LINE__ FSUB f86 = f50, f86 nop __LINE__ } { .mfi nop __LINE__ FSUB f94 = f51, f94 nop __LINE__ } ;; { .mfi LDFPD f32, f33 = [AOFFSET] FSUB f102 = f52, f102 nop __LINE__ } { .mfi nop __LINE__ FSUB f110 = f53, f110 nop __LINE__ } { .mfi nop __LINE__ FSUB f118 = f54, f118 nop __LINE__ } { .mfi nop __LINE__ FSUB f126 = f55, f126 adds AOFFSET = 2 * SIZE, AOFFSET } ;; { .mfi nop __LINE__ FSUB f71 = f56, f71 nop __LINE__ } { .mfi nop __LINE__ FSUB f79 = f57, f79 nop __LINE__ } { .mfi nop __LINE__ FSUB f87 = f58, f87 nop __LINE__ } { .mfi nop __LINE__ FSUB f95 = f59, f95 nop __LINE__ } { .mfi nop __LINE__ FSUB f103 = f60, f103 nop __LINE__ } { .mfi nop __LINE__ FSUB f111 = f61, f111 nop __LINE__ } { .mfi nop __LINE__ FSUB f119 = f62, f119 nop __LINE__ } { .mfi nop __LINE__ FSUB f127 = f63, f127 nop __LINE__ } ;; { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE FMPY f64 = f64, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f96 = f96, f32 adds BOFFSET2 = 4 * SIZE, BOFFSET } ;; { .mfi LDFPD f36, f37 = [AOFFSET], 2 * SIZE FMPY f72 = f72, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f104 = f104, f32 nop __LINE__ } ;; { .mfi LDFPD f38, f39 = [AOFFSET] FMPY f80 = f80, f32 adds AOFFSET = 3 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f112 = f112, f32 nop __LINE__ } ;; { .mfi LDFD f40 = [AOFFSET], 1 * SIZE FMPY f88 = f88, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f120 = f120, f32 nop __LINE__ } ;; { .mfi LDFPD f41, f42 = [AOFFSET], 2 * SIZE FNMA f65 = f64, f33, f65 nop __LINE__ } { .mfi nop __LINE__ FNMA f97 = f96, f33, f97 nop __LINE__ } ;; { .mfi LDFPD f43, f44 = [AOFFSET], 2 * SIZE FNMA f73 = f72, f33, f73 nop __LINE__ } { .mfi nop __LINE__ FNMA 
f105 = f104, f33, f105 nop __LINE__ } ;; { .mfi LDFPD f45, f46 = [AOFFSET] FNMA f81 = f80, f33, f81 adds AOFFSET = 4 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f113 = f112, f33, f113 nop __LINE__ } ;; { .mfi LDFPD f47, f48 = [AOFFSET], 2 * SIZE FNMA f89 = f88, f33, f89 nop __LINE__ } { .mfi nop __LINE__ FNMA f121 = f120, f33, f121 nop __LINE__ } ;; { .mfi LDFPD f49, f50 = [AOFFSET], 2 * SIZE FNMA f66 = f64, f34, f66 nop __LINE__ } { .mfi nop __LINE__ FNMA f98 = f96, f34, f98 nop __LINE__ } ;; { .mfi LDFPD f51, f52 = [AOFFSET] FNMA f74 = f72, f34, f74 adds AOFFSET = 5 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f106 = f104, f34, f106 nop __LINE__ } ;; { .mfi LDFD f53 = [AOFFSET], 1 * SIZE FNMA f82 = f80, f34, f82 nop __LINE__ } { .mfi nop __LINE__ FNMA f114 = f112, f34, f114 nop __LINE__ } ;; { .mfi LDFPD f54, f55 = [AOFFSET], 2 * SIZE FNMA f90 = f88, f34, f90 nop __LINE__ } { .mfi nop __LINE__ FNMA f122 = f120, f34, f122 nop __LINE__ } ;; { .mfi LDFPD f56, f57 = [AOFFSET] FNMA f67 = f64, f35, f67 adds AOFFSET = 6 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f99 = f96, f35, f99 nop __LINE__ } ;; { .mfi LDFPD f58, f59 = [AOFFSET], 2 * SIZE FNMA f75 = f72, f35, f75 nop __LINE__ } { .mfi nop __LINE__ FNMA f107 = f104, f35, f107 nop __LINE__ } ;; { .mfi LDFPD f60, f61 = [AOFFSET] FNMA f83 = f80, f35, f83 adds AOFFSET = 7 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f115 = f112, f35, f115 nop __LINE__ } ;; { .mfi LDFD f16 = [AOFFSET], 1 * SIZE FNMA f91 = f88, f35, f91 nop __LINE__ } { .mfi nop __LINE__ FNMA f123 = f120, f35, f123 nop __LINE__ } ;; { .mfi LDFPD f17, f18 = [AOFFSET] FNMA f68 = f64, f36, f68 adds AOFFSET = 8 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f100 = f96, f36, f100 nop __LINE__ } ;; { .mfi LDFPD f19, f20 = [AOFFSET] FNMA f76 = f72, f36, f76 adds AOFFSET = 9 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f108 = f104, f36, f108 nop __LINE__ } ;; { .mfi LDFD f21 = [AOFFSET] FNMA f84 = f80, f36, f84 adds AOFFSET = -63 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f116 = f112, f36, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f92 = f88, f36, f92 nop __LINE__ } { .mfi nop __LINE__ FNMA f124 = f120, f36, f124 nop __LINE__ } ;; FNMA f69 = f64, f37, f69 FNMA f101 = f96, f37, f101 FNMA f77 = f72, f37, f77 FNMA f109 = f104, f37, f109 FNMA f85 = f80, f37, f85 FNMA f117 = f112, f37, f117 FNMA f93 = f88, f37, f93 FNMA f125 = f120, f37, f125 ;; FNMA f70 = f64, f38, f70 FNMA f102 = f96, f38, f102 FNMA f78 = f72, f38, f78 FNMA f110 = f104, f38, f110 FNMA f86 = f80, f38, f86 FNMA f118 = f112, f38, f118 FNMA f94 = f88, f38, f94 FNMA f126 = f120, f38, f126 ;; FNMA f71 = f64, f39, f71 FNMA f103 = f96, f39, f103 FNMA f79 = f72, f39, f79 FNMA f111 = f104, f39, f111 FNMA f87 = f80, f39, f87 FNMA f119 = f112, f39, f119 FNMA f95 = f88, f39, f95 FNMA f127 = f120, f39, f127 ;; FMPY f65 = f65, f40 FMPY f97 = f97, f40 FMPY f73 = f73, f40 FMPY f105 = f105, f40 FMPY f81 = f81, f40 FMPY f113 = f113, f40 FMPY f89 = f89, f40 FMPY f121 = f121, f40 ;; FNMA f66 = f65, f41, f66 FNMA f98 = f97, f41, f98 FNMA f74 = f73, f41, f74 FNMA f106 = f105, f41, f106 FNMA f82 = f81, f41, f82 FNMA f114 = f113, f41, f114 FNMA f90 = f89, f41, f90 FNMA f122 = f121, f41, f122 FNMA f67 = f65, f42, f67 FNMA f99 = f97, f42, f99 FNMA f75 = f73, f42, f75 FNMA f107 = f105, f42, f107 FNMA f83 = f81, f42, f83 FNMA f115 = f113, f42, f115 FNMA f91 = f89, f42, f91 FNMA f123 = f121, f42, f123 ;; FNMA f68 = f65, f43, f68 FNMA f100 = f97, f43, f100 FNMA f76 = f73, f43, f76 FNMA f108 = f105, f43, f108 FNMA f84 = f81, f43, f84 FNMA f116 = f113, f43, f116 
FNMA f92 = f89, f43, f92 FNMA f124 = f121, f43, f124 ;; FNMA f69 = f65, f44, f69 FNMA f101 = f97, f44, f101 FNMA f77 = f73, f44, f77 FNMA f109 = f105, f44, f109 FNMA f85 = f81, f44, f85 FNMA f117 = f113, f44, f117 FNMA f93 = f89, f44, f93 FNMA f125 = f121, f44, f125 ;; FNMA f70 = f65, f45, f70 FNMA f102 = f97, f45, f102 FNMA f78 = f73, f45, f78 FNMA f110 = f105, f45, f110 FNMA f86 = f81, f45, f86 FNMA f118 = f113, f45, f118 FNMA f94 = f89, f45, f94 FNMA f126 = f121, f45, f126 ;; FNMA f71 = f65, f46, f71 FNMA f103 = f97, f46, f103 FNMA f79 = f73, f46, f79 FNMA f111 = f105, f46, f111 FNMA f87 = f81, f46, f87 FNMA f119 = f113, f46, f119 FNMA f95 = f89, f46, f95 FNMA f127 = f121, f46, f127 ;; FMPY f66 = f66, f47 FMPY f98 = f98, f47 FMPY f74 = f74, f47 FMPY f106 = f106, f47 FMPY f82 = f82, f47 FMPY f114 = f114, f47 FMPY f90 = f90, f47 FMPY f122 = f122, f47 ;; FNMA f67 = f66, f48, f67 FNMA f99 = f98, f48, f99 FNMA f75 = f74, f48, f75 FNMA f107 = f106, f48, f107 FNMA f83 = f82, f48, f83 FNMA f115 = f114, f48, f115 FNMA f91 = f90, f48, f91 FNMA f123 = f122, f48, f123 FNMA f68 = f66, f49, f68 FNMA f100 = f98, f49, f100 FNMA f76 = f74, f49, f76 FNMA f108 = f106, f49, f108 FNMA f84 = f82, f49, f84 FNMA f116 = f114, f49, f116 FNMA f92 = f90, f49, f92 FNMA f124 = f122, f49, f124 ;; FNMA f69 = f66, f50, f69 FNMA f101 = f98, f50, f101 FNMA f77 = f74, f50, f77 FNMA f109 = f106, f50, f109 FNMA f85 = f82, f50, f85 FNMA f117 = f114, f50, f117 FNMA f93 = f90, f50, f93 FNMA f125 = f122, f50, f125 ;; FNMA f70 = f66, f51, f70 FNMA f102 = f98, f51, f102 FNMA f78 = f74, f51, f78 FNMA f110 = f106, f51, f110 FNMA f86 = f82, f51, f86 FNMA f118 = f114, f51, f118 FNMA f94 = f90, f51, f94 FNMA f126 = f122, f51, f126 ;; FNMA f71 = f66, f52, f71 FNMA f103 = f98, f52, f103 FNMA f79 = f74, f52, f79 FNMA f111 = f106, f52, f111 FNMA f87 = f82, f52, f87 FNMA f119 = f114, f52, f119 FNMA f95 = f90, f52, f95 FNMA f127 = f122, f52, f127 ;; FMPY f67 = f67, f53 FMPY f99 = f99, f53 FMPY f75 = f75, f53 FMPY f107 = f107, f53 FMPY f83 = f83, f53 FMPY f115 = f115, f53 FMPY f91 = f91, f53 FMPY f123 = f123, f53 ;; FNMA f68 = f67, f54, f68 FNMA f100 = f99, f54, f100 FNMA f76 = f75, f54, f76 FNMA f108 = f107, f54, f108 FNMA f84 = f83, f54, f84 FNMA f116 = f115, f54, f116 FNMA f92 = f91, f54, f92 FNMA f124 = f123, f54, f124 ;; FNMA f69 = f67, f55, f69 FNMA f101 = f99, f55, f101 FNMA f77 = f75, f55, f77 FNMA f109 = f107, f55, f109 FNMA f85 = f83, f55, f85 FNMA f117 = f115, f55, f117 FNMA f93 = f91, f55, f93 FNMA f125 = f123, f55, f125 ;; FNMA f70 = f67, f56, f70 FNMA f102 = f99, f56, f102 FNMA f78 = f75, f56, f78 FNMA f110 = f107, f56, f110 FNMA f86 = f83, f56, f86 FNMA f118 = f115, f56, f118 FNMA f94 = f91, f56, f94 FNMA f126 = f123, f56, f126 ;; FNMA f71 = f67, f57, f71 FNMA f103 = f99, f57, f103 FNMA f79 = f75, f57, f79 FNMA f111 = f107, f57, f111 FNMA f87 = f83, f57, f87 FNMA f119 = f115, f57, f119 FNMA f95 = f91, f57, f95 FNMA f127 = f123, f57, f127 ;; FMPY f68 = f68, f58 FMPY f100 = f100, f58 FMPY f76 = f76, f58 FMPY f108 = f108, f58 FMPY f84 = f84, f58 FMPY f116 = f116, f58 FMPY f92 = f92, f58 FMPY f124 = f124, f58 ;; FNMA f69 = f68, f59, f69 FNMA f101 = f100, f59, f101 FNMA f77 = f76, f59, f77 FNMA f109 = f108, f59, f109 FNMA f85 = f84, f59, f85 FNMA f117 = f116, f59, f117 FNMA f93 = f92, f59, f93 FNMA f125 = f124, f59, f125 ;; FNMA f70 = f68, f60, f70 FNMA f102 = f100, f60, f102 FNMA f78 = f76, f60, f78 FNMA f110 = f108, f60, f110 FNMA f86 = f84, f60, f86 FNMA f118 = f116, f60, f118 FNMA f94 = f92, f60, f94 FNMA f126 = f124, f60, f126 
;; { .mfi STFD [BOFFSET] = f64, SIZE FNMA f71 = f68, f61, f71 } { .mfi STFD [BOFFSET2] = f96, SIZE FNMA f103 = f100, f61, f103 } ;; { .mfi STFD [BOFFSET] = f72, SIZE FNMA f79 = f76, f61, f79 } { .mfi STFD [BOFFSET2] = f104, SIZE FNMA f111 = f108, f61, f111 } ;; { .mfi STFD [BOFFSET] = f80, SIZE FNMA f87 = f84, f61, f87 } { .mfi STFD [BOFFSET2] = f112, SIZE FNMA f119 = f116, f61, f119 } ;; { .mfi STFD [BOFFSET] = f88, 5 * SIZE FNMA f95 = f92, f61, f95 } { .mfi STFD [BOFFSET2] = f120, 5 * SIZE FNMA f127 = f124, f61, f127 } ;; { .mfi STFD [BOFFSET] = f65, SIZE FMPY f69 = f69, f16 } { .mfi STFD [BOFFSET2] = f97, SIZE FMPY f101 = f101, f16 } ;; { .mfi STFD [BOFFSET] = f73, SIZE FMPY f77 = f77, f16 } { .mfi STFD [BOFFSET2] = f105, SIZE FMPY f109 = f109, f16 } ;; { .mfi STFD [BOFFSET] = f81, SIZE FMPY f85 = f85, f16 } { .mfi STFD [BOFFSET2] = f113, SIZE FMPY f117 = f117, f16 } ;; { .mfi STFD [BOFFSET] = f89, 5 * SIZE FMPY f93 = f93, f16 } { .mfi STFD [BOFFSET2] = f121, 5 * SIZE FMPY f125 = f125, f16 } ;; { .mfi STFD [BOFFSET] = f66, SIZE FNMA f70 = f69, f17, f70 } { .mfi STFD [BOFFSET2] = f98, SIZE FNMA f102 = f101, f17, f102 } ;; { .mfi STFD [BOFFSET] = f74, SIZE FNMA f78 = f77, f17, f78 } { .mfi STFD [BOFFSET2] = f106, SIZE FNMA f110 = f109, f17, f110 } ;; { .mfi STFD [BOFFSET] = f82, SIZE FNMA f86 = f85, f17, f86 } { .mfi STFD [BOFFSET2] = f114, SIZE FNMA f118 = f117, f17, f118 } ;; { .mfi STFD [BOFFSET] = f90, 5 * SIZE FNMA f94 = f93, f17, f94 } { .mfi STFD [BOFFSET2] = f122, 5 * SIZE FNMA f126 = f125, f17, f126 } ;; { .mfi STFD [BOFFSET] = f67, SIZE FNMA f71 = f69, f18, f71 } { .mfi STFD [BOFFSET2] = f99, SIZE FNMA f103 = f101, f18, f103 } ;; { .mfi STFD [BOFFSET] = f75, SIZE FNMA f79 = f77, f18, f79 } { .mfi STFD [BOFFSET2] = f107, SIZE FNMA f111 = f109, f18, f111 } ;; { .mfi STFD [BOFFSET] = f83, SIZE FNMA f87 = f85, f18, f87 } { .mfi STFD [BOFFSET2] = f115, SIZE FNMA f119 = f117, f18, f119 } ;; { .mfi STFD [BOFFSET] = f91, 5 * SIZE FNMA f95 = f93, f18, f95 } { .mfi STFD [BOFFSET2] = f123, 5 * SIZE FNMA f127 = f125, f18, f127 } ;; { .mfi STFD [BOFFSET] = f68, SIZE FMPY f70 = f70, f19 } { .mfi STFD [BOFFSET2] = f100, SIZE FMPY f102 = f102, f19 } ;; { .mfi STFD [BOFFSET] = f76, SIZE FMPY f78 = f78, f19 } { .mfi STFD [BOFFSET2] = f108, SIZE FMPY f110 = f110, f19 } ;; { .mfi STFD [BOFFSET] = f84, SIZE FMPY f86 = f86, f19 } { .mfi STFD [BOFFSET2] = f116, SIZE FMPY f118 = f118, f19 } ;; { .mfi STFD [BOFFSET] = f92, 5 * SIZE FMPY f94 = f94, f19 } { .mfi STFD [BOFFSET2] = f124, 5 * SIZE FMPY f126 = f126, f19 } ;; { .mfi STFD [BOFFSET] = f69, SIZE FNMA f71 = f70, f20, f71 } { .mfi STFD [BOFFSET2] = f101, SIZE FNMA f103 = f102, f20, f103 } ;; { .mfi STFD [BOFFSET] = f77, SIZE FNMA f79 = f78, f20, f79 } { .mfi STFD [BOFFSET2] = f109, SIZE FNMA f111 = f110, f20, f111 } ;; { .mfi STFD [BOFFSET] = f85, SIZE FNMA f87 = f86, f20, f87 } { .mfi STFD [BOFFSET2] = f117, SIZE FNMA f119 = f118, f20, f119 } ;; { .mfi STFD [BOFFSET] = f93, 5 * SIZE FNMA f95 = f94, f20, f95 } { .mfi STFD [BOFFSET2] = f125, 5 * SIZE FNMA f127 = f126, f20, f127 } ;; { .mfi STFD [BOFFSET] = f70, SIZE FMPY f71 = f71, f21 } { .mfi STFD [BOFFSET2] = f102, SIZE FMPY f103 = f103, f21 } ;; { .mfi STFD [BOFFSET] = f78, SIZE FMPY f79 = f79, f21 } { .mfi STFD [BOFFSET2] = f110, SIZE FMPY f111 = f111, f21 } ;; { .mfi STFD [BOFFSET] = f86, SIZE FMPY f87 = f87, f21 } { .mfi STFD [BOFFSET2] = f118, SIZE FMPY f119 = f119, f21 } ;; { .mfi STFD [BOFFSET] = f94, 5 * SIZE FMPY f95 = f95, f21 } { .mfi STFD [BOFFSET2] = f126, 5 * SIZE FMPY f127 = f127, 
f21 } ;; { .mmi STFD [BOFFSET] = f71, SIZE STFD [BOFFSET2] = f103, SIZE } ;; { .mmi STFD [BOFFSET] = f79, SIZE STFD [BOFFSET2] = f111, SIZE } ;; { .mmi STFD [BOFFSET] = f87, SIZE STFD [BOFFSET2] = f119, SIZE adds C9 = 4 * SIZE, C1 } ;; { .mfi STFD [BOFFSET] = f95 adds BOFFSET = - 59 * SIZE, BOFFSET } { .mfi STFD [BOFFSET2] = f127 adds BOFFSET2 = - 59 * SIZE, BOFFSET2 } ;; #endif #ifdef RN LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [AOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [AOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [AOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [AOFFSET], 2 * SIZE ;; { .mfi LDFPD f48, f49 = [AOFFSET], 2 * SIZE FSUB f64 = f32, f64 } { .mfi FSUB f65 = f33, f65 } ;; { .mfi LDFPD f50, f51 = [AOFFSET], 2 * SIZE FSUB f66 = f34, f66 } { .mfi FSUB f67 = f35, f67 } ;; { .mfi LDFPD f52, f53 = [AOFFSET], 2 * SIZE FSUB f68 = f36, f68 } { .mfi FSUB f69 = f37, f69 } ;; { .mfi LDFPD f54, f55 = [AOFFSET], 2 * SIZE FSUB f70 = f38, f70 } { .mfi FSUB f71 = f39, f71 } ;; { .mfi LDFPD f56, f57 = [AOFFSET], 2 * SIZE FSUB f72 = f40, f72 } { .mfi FSUB f73 = f41, f73 } ;; { .mfi LDFPD f58, f59 = [AOFFSET], 2 * SIZE FSUB f74 = f42, f74 } { .mfi FSUB f75 = f43, f75 } ;; { .mfi LDFPD f60, f61 = [AOFFSET], 2 * SIZE FSUB f76 = f44, f76 } { .mfi FSUB f77 = f45, f77 } ;; { .mfi LDFPD f62, f63 = [AOFFSET], 2 * SIZE FSUB f78 = f46, f78 } { .mfi FSUB f79 = f47, f79 } ;; { .mfi LDFPD f32, f33 = [AOFFSET], 2 * SIZE FSUB f80 = f48, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f81 = f49, f81 nop __LINE__ } ;; { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE FSUB f82 = f50, f82 nop __LINE__ } { .mfi nop __LINE__ FSUB f83 = f51, f83 nop __LINE__ } ;; { .mfi LDFPD f36, f37 = [AOFFSET], 2 * SIZE FSUB f84 = f52, f84 nop __LINE__ } { .mfi nop __LINE__ FSUB f85 = f53, f85 nop __LINE__ } ;; { .mfi LDFPD f38, f39 = [AOFFSET], 2 * SIZE FSUB f86 = f54, f86 nop __LINE__ } { .mfi nop __LINE__ FSUB f87 = f55, f87 nop __LINE__ } ;; { .mfi LDFPD f40, f41 = [AOFFSET], 2 * SIZE FSUB f88 = f56, f88 nop __LINE__ } { .mfi nop __LINE__ FSUB f89 = f57, f89 nop __LINE__ } ;; { .mfi LDFPD f42, f43 = [AOFFSET], 2 * SIZE FSUB f90 = f58, f90 nop __LINE__ } { .mfi nop __LINE__ FSUB f91 = f59, f91 nop __LINE__ } ;; { .mfi LDFPD f44, f45 = [AOFFSET], 2 * SIZE FSUB f92 = f60, f92 nop __LINE__ } { .mfi nop __LINE__ FSUB f93 = f61, f93 nop __LINE__ } ;; { .mfi LDFPD f46, f47 = [AOFFSET], 2 * SIZE FSUB f94 = f62, f94 nop __LINE__ } { .mfi nop __LINE__ FSUB f95 = f63, f95 nop __LINE__ } ;; { .mfi LDFPD f48, f49 = [AOFFSET], 2 * SIZE FSUB f96 = f32, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f97 = f33, f97 nop __LINE__ } ;; { .mfi LDFPD f50, f51 = [AOFFSET], 2 * SIZE FSUB f98 = f34, f98 nop __LINE__ } { .mfi nop __LINE__ FSUB f99 = f35, f99 nop __LINE__ } ;; { .mfi LDFPD f52, f53 = [AOFFSET], 2 * SIZE FSUB f100 = f36, f100 nop __LINE__ } { .mfi nop __LINE__ FSUB f101 = f37, f101 nop __LINE__ } ;; { .mfi LDFPD f54, f55 = [AOFFSET], 2 * SIZE FSUB f102 = f38, f102 nop __LINE__ } { .mfi nop __LINE__ FSUB f103 = f39, f103 nop __LINE__ } ;; { .mfi LDFPD f56, f57 = [AOFFSET], 2 * SIZE FSUB f104 = f40, f104 nop __LINE__ } { .mfi nop __LINE__ FSUB f105 = f41, f105 nop __LINE__ } ;; { .mfi LDFPD f58, f59 = [AOFFSET], 2 * SIZE FSUB f106 = f42, f106 nop __LINE__ } { .mfi nop __LINE__ FSUB f107 = f43, f107 nop __LINE__ } ;; { .mfi LDFPD f60, f61 = [AOFFSET], 2 * SIZE FSUB f108 = f44, f108 nop __LINE__ } { .mfi nop __LINE__ FSUB f109 = f45, f109 
nop __LINE__ } ;; { .mfi LDFPD f62, f63 = [AOFFSET] FSUB f110 = f46, f110 adds AOFFSET = -62 * SIZE, AOFFSET } { .mfi nop __LINE__ FSUB f111 = f47, f111 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f112 = f48, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f113 = f49, f113 nop __LINE__ } { .mfi nop __LINE__ FSUB f114 = f50, f114 nop __LINE__ } { .mfi nop __LINE__ FSUB f115 = f51, f115 nop __LINE__ } { .mfi nop __LINE__ FSUB f116 = f52, f116 nop __LINE__ } { .mfi nop __LINE__ FSUB f117 = f53, f117 nop __LINE__ } { .mfi nop __LINE__ FSUB f118 = f54, f118 nop __LINE__ } { .mfi nop __LINE__ FSUB f119 = f55, f119 nop __LINE__ } { .mfi nop __LINE__ FSUB f120 = f56, f120 nop __LINE__ } { .mfi nop __LINE__ FSUB f121 = f57, f121 nop __LINE__ } { .mfi nop __LINE__ FSUB f122 = f58, f122 nop __LINE__ } { .mfi nop __LINE__ FSUB f123 = f59, f123 nop __LINE__ } { .mfi nop __LINE__ FSUB f124 = f60, f124 nop __LINE__ } { .mfi nop __LINE__ FSUB f125 = f61, f125 nop __LINE__ } ;; { .mfi LDFPD f32, f33 = [BOFFSET], 2 * SIZE FSUB f126 = f62, f126 nop __LINE__ } { .mfi nop __LINE__ FSUB f127 = f63, f127 nop __LINE__ } ;; { .mfi LDFPD f34, f35 = [BOFFSET], 2 * SIZE FMPY f64 = f64, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f68 = f68, f32 nop __LINE__ } ;; { .mfi LDFPD f36, f37 = [BOFFSET], 2 * SIZE FMPY f65 = f65, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f69 = f69, f32 nop __LINE__ } ;; { .mfi LDFPD f38, f39 = [BOFFSET] FMPY f66 = f66, f32 adds BOFFSET = 3 * SIZE, BOFFSET } { .mfi nop __LINE__ FMPY f70 = f70, f32 nop __LINE__ } ;; { .mfi LDFD f40 = [BOFFSET], 1 * SIZE FMPY f67 = f67, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f71 = f71, f32 nop __LINE__ } ;; { .mfi LDFPD f41, f42 = [BOFFSET], 2 * SIZE FNMA f72 = f64, f33, f72 nop __LINE__ } { .mfi nop __LINE__ FNMA f76 = f68, f33, f76 nop __LINE__ } ;; { .mfi LDFPD f43, f44 = [BOFFSET], 2 * SIZE FNMA f73 = f65, f33, f73 nop __LINE__ } { .mfi nop __LINE__ FNMA f77 = f69, f33, f77 nop __LINE__ } ;; { .mfi LDFPD f45, f46 = [BOFFSET] FNMA f74 = f66, f33, f74 adds BOFFSET = 4 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f78 = f70, f33, f78 nop __LINE__ } ;; { .mfi LDFPD f47, f48 = [BOFFSET], 2 * SIZE FNMA f75 = f67, f33, f75 nop __LINE__ } { .mfi nop __LINE__ FNMA f79 = f71, f33, f79 nop __LINE__ } ;; { .mfi LDFPD f49, f50 = [BOFFSET], 2 * SIZE FNMA f80 = f64, f34, f80 nop __LINE__ } { .mfi nop __LINE__ FNMA f84 = f68, f34, f84 nop __LINE__ } ;; { .mfi LDFPD f51, f52 = [BOFFSET] FNMA f81 = f65, f34, f81 adds BOFFSET = 5 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f85 = f69, f34, f85 nop __LINE__ } ;; { .mfi LDFD f53 = [BOFFSET], 1 * SIZE FNMA f82 = f66, f34, f82 nop __LINE__ } { .mfi nop __LINE__ FNMA f86 = f70, f34, f86 nop __LINE__ } ;; { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE FNMA f83 = f67, f34, f83 nop __LINE__ } { .mfi nop __LINE__ FNMA f87 = f71, f34, f87 nop __LINE__ } ;; { .mfi LDFPD f56, f57 = [BOFFSET] FNMA f88 = f64, f35, f88 adds BOFFSET = 6 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f92 = f68, f35, f92 nop __LINE__ } ;; { .mfi LDFPD f58, f59 = [BOFFSET], 2 * SIZE FNMA f89 = f65, f35, f89 nop __LINE__ } { .mfi nop __LINE__ FNMA f93 = f69, f35, f93 nop __LINE__ } ;; { .mfi LDFPD f60, f61 = [BOFFSET] FNMA f90 = f66, f35, f90 adds BOFFSET = 7 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f94 = f70, f35, f94 nop __LINE__ } ;; { .mfi LDFD f16 = [BOFFSET], 1 * SIZE FNMA f91 = f67, f35, f91 nop __LINE__ } { .mfi nop __LINE__ FNMA f95 = f71, f35, f95 nop __LINE__ } ;; { .mfi LDFPD f17, f18 = [BOFFSET] FNMA f96 = f64, f36, f96 adds BOFFSET = 8 * SIZE, BOFFSET } { 
.mfi nop __LINE__ FNMA f100 = f68, f36, f100 nop __LINE__ } ;; { .mfi LDFPD f19, f20 = [BOFFSET] FNMA f97 = f65, f36, f97 adds BOFFSET = 9 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f101 = f69, f36, f101 nop __LINE__ } ;; { .mfi LDFD f21 = [BOFFSET] FNMA f98 = f66, f36, f98 adds BOFFSET = -63 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f102 = f70, f36, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f99 = f67, f36, f99 nop __LINE__ } { .mfi nop __LINE__ FNMA f103 = f71, f36, f103 nop __LINE__ } ;; FNMA f104 = f64, f37, f104 FNMA f108 = f68, f37, f108 FNMA f105 = f65, f37, f105 FNMA f109 = f69, f37, f109 FNMA f106 = f66, f37, f106 FNMA f110 = f70, f37, f110 FNMA f107 = f67, f37, f107 FNMA f111 = f71, f37, f111 ;; FNMA f112 = f64, f38, f112 FNMA f116 = f68, f38, f116 FNMA f113 = f65, f38, f113 FNMA f117 = f69, f38, f117 FNMA f114 = f66, f38, f114 FNMA f118 = f70, f38, f118 FNMA f115 = f67, f38, f115 FNMA f119 = f71, f38, f119 ;; FNMA f120 = f64, f39, f120 FNMA f124 = f68, f39, f124 FNMA f121 = f65, f39, f121 FNMA f125 = f69, f39, f125 FNMA f122 = f66, f39, f122 FNMA f126 = f70, f39, f126 FNMA f123 = f67, f39, f123 FNMA f127 = f71, f39, f127 ;; FMPY f72 = f72, f40 FMPY f76 = f76, f40 FMPY f73 = f73, f40 FMPY f77 = f77, f40 FMPY f74 = f74, f40 FMPY f78 = f78, f40 FMPY f75 = f75, f40 FMPY f79 = f79, f40 ;; FNMA f80 = f72, f41, f80 FNMA f84 = f76, f41, f84 FNMA f81 = f73, f41, f81 FNMA f85 = f77, f41, f85 FNMA f82 = f74, f41, f82 FNMA f86 = f78, f41, f86 FNMA f83 = f75, f41, f83 FNMA f87 = f79, f41, f87 ;; FNMA f88 = f72, f42, f88 FNMA f92 = f76, f42, f92 FNMA f89 = f73, f42, f89 FNMA f93 = f77, f42, f93 FNMA f90 = f74, f42, f90 FNMA f94 = f78, f42, f94 FNMA f91 = f75, f42, f91 FNMA f95 = f79, f42, f95 ;; FNMA f96 = f72, f43, f96 FNMA f100 = f76, f43, f100 FNMA f97 = f73, f43, f97 FNMA f101 = f77, f43, f101 FNMA f98 = f74, f43, f98 FNMA f102 = f78, f43, f102 FNMA f99 = f75, f43, f99 FNMA f103 = f79, f43, f103 ;; FNMA f104 = f72, f44, f104 FNMA f108 = f76, f44, f108 FNMA f105 = f73, f44, f105 FNMA f109 = f77, f44, f109 FNMA f106 = f74, f44, f106 FNMA f110 = f78, f44, f110 FNMA f107 = f75, f44, f107 FNMA f111 = f79, f44, f111 ;; FNMA f112 = f72, f45, f112 FNMA f116 = f76, f45, f116 FNMA f113 = f73, f45, f113 FNMA f117 = f77, f45, f117 FNMA f114 = f74, f45, f114 FNMA f118 = f78, f45, f118 FNMA f115 = f75, f45, f115 FNMA f119 = f79, f45, f119 ;; FNMA f120 = f72, f46, f120 FNMA f124 = f76, f46, f124 FNMA f121 = f73, f46, f121 FNMA f125 = f77, f46, f125 FNMA f122 = f74, f46, f122 FNMA f126 = f78, f46, f126 FNMA f123 = f75, f46, f123 FNMA f127 = f79, f46, f127 ;; FMPY f80 = f80, f47 FMPY f84 = f84, f47 FMPY f81 = f81, f47 FMPY f85 = f85, f47 FMPY f82 = f82, f47 FMPY f86 = f86, f47 FMPY f83 = f83, f47 FMPY f87 = f87, f47 ;; FNMA f88 = f80, f48, f88 FNMA f92 = f84, f48, f92 FNMA f89 = f81, f48, f89 FNMA f93 = f85, f48, f93 FNMA f90 = f82, f48, f90 FNMA f94 = f86, f48, f94 FNMA f91 = f83, f48, f91 FNMA f95 = f87, f48, f95 ;; FNMA f96 = f80, f49, f96 FNMA f100 = f84, f49, f100 FNMA f97 = f81, f49, f97 FNMA f101 = f85, f49, f101 FNMA f98 = f82, f49, f98 FNMA f102 = f86, f49, f102 FNMA f99 = f83, f49, f99 FNMA f103 = f87, f49, f103 ;; FNMA f104 = f80, f50, f104 FNMA f108 = f84, f50, f108 FNMA f105 = f81, f50, f105 FNMA f109 = f85, f50, f109 FNMA f106 = f82, f50, f106 FNMA f110 = f86, f50, f110 FNMA f107 = f83, f50, f107 FNMA f111 = f87, f50, f111 ;; FNMA f112 = f80, f51, f112 FNMA f116 = f84, f51, f116 FNMA f113 = f81, f51, f113 FNMA f117 = f85, f51, f117 FNMA f114 = f82, f51, f114 FNMA f118 = f86, f51, f118 
FNMA f115 = f83, f51, f115 FNMA f119 = f87, f51, f119 ;; FNMA f120 = f80, f52, f120 FNMA f124 = f84, f52, f124 FNMA f121 = f81, f52, f121 FNMA f125 = f85, f52, f125 FNMA f122 = f82, f52, f122 FNMA f126 = f86, f52, f126 FNMA f123 = f83, f52, f123 FNMA f127 = f87, f52, f127 ;; FMPY f88 = f88, f53 FMPY f92 = f92, f53 FMPY f89 = f89, f53 FMPY f93 = f93, f53 FMPY f90 = f90, f53 FMPY f94 = f94, f53 FMPY f91 = f91, f53 FMPY f95 = f95, f53 ;; FNMA f96 = f88, f54, f96 FNMA f100 = f92, f54, f100 FNMA f97 = f89, f54, f97 FNMA f101 = f93, f54, f101 FNMA f98 = f90, f54, f98 FNMA f102 = f94, f54, f102 FNMA f99 = f91, f54, f99 FNMA f103 = f95, f54, f103 ;; FNMA f104 = f88, f55, f104 FNMA f108 = f92, f55, f108 FNMA f105 = f89, f55, f105 FNMA f109 = f93, f55, f109 FNMA f106 = f90, f55, f106 FNMA f110 = f94, f55, f110 FNMA f107 = f91, f55, f107 FNMA f111 = f95, f55, f111 ;; FNMA f112 = f88, f56, f112 FNMA f116 = f92, f56, f116 FNMA f113 = f89, f56, f113 FNMA f117 = f93, f56, f117 FNMA f114 = f90, f56, f114 FNMA f118 = f94, f56, f118 FNMA f115 = f91, f56, f115 FNMA f119 = f95, f56, f119 ;; FNMA f120 = f88, f57, f120 FNMA f124 = f92, f57, f124 FNMA f121 = f89, f57, f121 FNMA f125 = f93, f57, f125 FNMA f122 = f90, f57, f122 FNMA f126 = f94, f57, f126 FNMA f123 = f91, f57, f123 FNMA f127 = f95, f57, f127 ;; FMPY f96 = f96, f58 FMPY f100 = f100, f58 FMPY f97 = f97, f58 FMPY f101 = f101, f58 FMPY f98 = f98, f58 FMPY f102 = f102, f58 FMPY f99 = f99, f58 FMPY f103 = f103, f58 ;; FNMA f104 = f96, f59, f104 FNMA f108 = f100, f59, f108 FNMA f105 = f97, f59, f105 FNMA f109 = f101, f59, f109 FNMA f106 = f98, f59, f106 FNMA f110 = f102, f59, f110 FNMA f107 = f99, f59, f107 FNMA f111 = f103, f59, f111 ;; FNMA f112 = f96, f60, f112 FNMA f116 = f100, f60, f116 FNMA f113 = f97, f60, f113 FNMA f117 = f101, f60, f117 FNMA f114 = f98, f60, f114 FNMA f118 = f102, f60, f118 FNMA f115 = f99, f60, f115 FNMA f119 = f103, f60, f119 ;; { .mfi STFD [AOFFSET] = f64, SIZE FNMA f120 = f96, f61, f120 } { .mfi STFD [AOFFSET2] = f68, SIZE FNMA f124 = f100, f61, f124 } ;; { .mfi STFD [AOFFSET] = f65, SIZE FNMA f121 = f97, f61, f121 } { .mfi STFD [AOFFSET2] = f69, SIZE FNMA f125 = f101, f61, f125 } ;; { .mfi STFD [AOFFSET] = f66, SIZE FNMA f122 = f98, f61, f122 } { .mfi STFD [AOFFSET2] = f70, SIZE FNMA f126 = f102, f61, f126 } ;; { .mfi STFD [AOFFSET] = f67, 5 * SIZE FNMA f123 = f99, f61, f123 } { .mfi STFD [AOFFSET2] = f71, 5 * SIZE FNMA f127 = f103, f61, f127 } ;; { .mfi STFD [AOFFSET] = f72, SIZE FMPY f104 = f104, f16 } { .mfi STFD [AOFFSET2] = f76, SIZE FMPY f108 = f108, f16 } ;; { .mfi STFD [AOFFSET] = f73, SIZE FMPY f105 = f105, f16 } { .mfi STFD [AOFFSET2] = f77, SIZE FMPY f109 = f109, f16 } ;; { .mfi STFD [AOFFSET] = f74, SIZE FMPY f106 = f106, f16 } { .mfi STFD [AOFFSET2] = f78, SIZE FMPY f110 = f110, f16 } ;; { .mfi STFD [AOFFSET] = f75, 5 * SIZE FMPY f107 = f107, f16 } { .mfi STFD [AOFFSET2] = f79, 5 * SIZE FMPY f111 = f111, f16 } ;; { .mfi STFD [AOFFSET] = f80, SIZE FNMA f112 = f104, f17, f112 } { .mfi STFD [AOFFSET2] = f84, SIZE FNMA f116 = f108, f17, f116 } ;; { .mfi STFD [AOFFSET] = f81, SIZE FNMA f113 = f105, f17, f113 } { .mfi STFD [AOFFSET2] = f85, SIZE FNMA f117 = f109, f17, f117 } ;; { .mfi STFD [AOFFSET] = f82, SIZE FNMA f114 = f106, f17, f114 } { .mfi STFD [AOFFSET2] = f86, SIZE FNMA f118 = f110, f17, f118 } ;; { .mfi STFD [AOFFSET] = f83, 5 * SIZE FNMA f115 = f107, f17, f115 } { .mfi STFD [AOFFSET2] = f87, 5 * SIZE FNMA f119 = f111, f17, f119 } ;; { .mfi STFD [AOFFSET] = f88, SIZE FNMA f120 = f104, f18, f120 } { .mfi 
STFD [AOFFSET2] = f92, SIZE FNMA f124 = f108, f18, f124 } ;; { .mfi STFD [AOFFSET] = f89, SIZE FNMA f121 = f105, f18, f121 } { .mfi STFD [AOFFSET2] = f93, SIZE FNMA f125 = f109, f18, f125 } ;; { .mfi STFD [AOFFSET] = f90, SIZE FNMA f122 = f106, f18, f122 } { .mfi STFD [AOFFSET2] = f94, SIZE FNMA f126 = f110, f18, f126 } ;; { .mfi STFD [AOFFSET] = f91, 5 * SIZE FNMA f123 = f107, f18, f123 } { .mfi STFD [AOFFSET2] = f95, 5 * SIZE FNMA f127 = f111, f18, f127 } ;; { .mfi STFD [AOFFSET] = f96, SIZE FMPY f112 = f112, f19 } { .mfi STFD [AOFFSET2] = f100, SIZE FMPY f116 = f116, f19 } ;; { .mfi STFD [AOFFSET] = f97, SIZE FMPY f113 = f113, f19 } { .mfi STFD [AOFFSET2] = f101, SIZE FMPY f117 = f117, f19 } ;; { .mfi STFD [AOFFSET] = f98, SIZE FMPY f114 = f114, f19 } { .mfi STFD [AOFFSET2] = f102, SIZE FMPY f118 = f118, f19 } ;; { .mfi STFD [AOFFSET] = f99, 5 * SIZE FMPY f115 = f115, f19 } { .mfi STFD [AOFFSET2] = f103, 5 * SIZE FMPY f119 = f119, f19 } ;; { .mfi STFD [AOFFSET] = f104, SIZE FNMA f120 = f112, f20, f120 } { .mfi STFD [AOFFSET2] = f108, SIZE FNMA f124 = f116, f20, f124 } ;; { .mfi STFD [AOFFSET] = f105, SIZE FNMA f121 = f113, f20, f121 } { .mfi STFD [AOFFSET2] = f109, SIZE FNMA f125 = f117, f20, f125 } ;; { .mfi STFD [AOFFSET] = f106, SIZE FNMA f122 = f114, f20, f122 } { .mfi STFD [AOFFSET2] = f110, SIZE FNMA f126 = f118, f20, f126 } ;; { .mfi STFD [AOFFSET] = f107, 5 * SIZE FNMA f123 = f115, f20, f123 } { .mfi STFD [AOFFSET2] = f111, 5 * SIZE FNMA f127 = f119, f20, f127 } ;; { .mfi STFD [AOFFSET] = f112, SIZE FMPY f120 = f120, f21 } { .mfi STFD [AOFFSET2] = f116, SIZE FMPY f124 = f124, f21 } ;; { .mfi STFD [AOFFSET] = f113, SIZE FMPY f121 = f121, f21 } { .mfi STFD [AOFFSET2] = f117, SIZE FMPY f125 = f125, f21 } ;; { .mfi STFD [AOFFSET] = f114, SIZE FMPY f122 = f122, f21 } { .mfi STFD [AOFFSET2] = f118, SIZE FMPY f126 = f126, f21 } ;; { .mfi STFD [AOFFSET] = f115, 5 * SIZE FMPY f123 = f123, f21 } { .mfi STFD [AOFFSET2] = f119, 5 * SIZE FMPY f127 = f127, f21 } ;; { .mmi STFD [AOFFSET] = f120, SIZE STFD [AOFFSET2] = f124, SIZE } ;; { .mmi STFD [AOFFSET] = f121, SIZE STFD [AOFFSET2] = f125, SIZE } ;; { .mmi STFD [AOFFSET] = f122, SIZE STFD [AOFFSET2] = f126, SIZE adds C9 = 4 * SIZE, C1 } ;; { .mfi STFD [AOFFSET] = f123 adds AOFFSET = - 59 * SIZE, AOFFSET } { .mfi STFD [AOFFSET2] = f127 adds AOFFSET2 = - 59 * SIZE, AOFFSET2 } ;; #endif { .mmf STFD [C1 ] = f64, SIZE STFD [C9 ] = f68, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE STFD [C9 ] = f69, SIZE adds C10 = 4 * SIZE, C2 } ;; { .mmi STFD [C1 ] = f66, SIZE STFD [C9 ] = f70, SIZE nop __LINE__ } ;; { .mmi STFD [C1 ] = f67, 5 * SIZE STFD [C9 ] = f71 adds C11 = 4 * SIZE, C3 } ;; { .mmf STFD [C2 ] = f72, SIZE STFD [C10] = f76, SIZE mov f72 = f0 } ;; { .mmi STFD [C2 ] = f73, SIZE STFD [C10] = f77, SIZE nop __LINE__ } ;; { .mmi STFD [C2 ] = f74, SIZE STFD [C10] = f78, SIZE adds C12 = 4 * SIZE, C4 } ;; { .mmi STFD [C2 ] = f75, 5 * SIZE STFD [C10] = f79 nop __LINE__ } ;; { .mmf STFD [C3 ] = f80, SIZE STFD [C11] = f84, SIZE mov f80 = f0 } ;; { .mmi STFD [C3 ] = f81, SIZE STFD [C11] = f85, SIZE adds C13 = 4 * SIZE, C5 } ;; { .mmi STFD [C3 ] = f82, SIZE STFD [C11] = f86, SIZE nop __LINE__ } ;; { .mmi STFD [C3 ] = f83, 5 * SIZE STFD [C11] = f87 adds C14 = 4 * SIZE, C6 } ;; { .mmf STFD [C4 ] = f88, SIZE STFD [C12] = f92, SIZE mov f88 = f0 } ;; { .mmi STFD [C4 ] = f89, SIZE STFD [C12] = f93, SIZE nop __LINE__ } ;; { .mmi STFD [C4 ] = f90, SIZE STFD [C12] = f94, SIZE adds C16 = 4 * SIZE, C8 } ;; { .mmi STFD [C4 ] = f91, 5 * SIZE STFD [C12] = f95 
cmp.ne p6, p0 = 1, I } ;; { .mmf STFD [C5 ] = f96, SIZE STFD [C13] = f100, SIZE mov f96 = f0 } ;; { .mmi STFD [C5 ] = f97, SIZE STFD [C13] = f101, SIZE adds I = -1, I } ;; { .mmi STFD [C5 ] = f98, SIZE STFD [C13] = f102, SIZE nop __LINE__ } ;; { .mmi STFD [C5 ] = f99, 5 * SIZE STFD [C13] = f103 adds C15 = 4 * SIZE, C7 } ;; { .mmf STFD [C6 ] = f104, SIZE STFD [C14] = f108, SIZE mov f104 = f0 } ;; { .mmi STFD [C6 ] = f105, SIZE STFD [C14] = f109, SIZE nop __LINE__ } ;; { .mmi STFD [C6 ] = f106, SIZE STFD [C14] = f110, SIZE sub L = K, KK } ;; { .mmi STFD [C6 ] = f107, 5 * SIZE STFD [C14] = f111 nop __LINE__ } ;; { .mmf STFD [C7 ] = f112, SIZE STFD [C15] = f116, SIZE mov f112 = f0 } ;; { .mmi STFD [C7 ] = f113, SIZE STFD [C15] = f117, SIZE shladd L = L, BASE_SHIFT, r0 } ;; { .mmi STFD [C7 ] = f114, SIZE STFD [C15] = f118, SIZE shladd AOFFSET = L, 3, AOFFSET } ;; { .mmi STFD [C7 ] = f115, 5 * SIZE STFD [C15] = f119 shladd BOFFSET = L, 3, BOFFSET } ;; { .mmf STFD [C8 ] = f120, SIZE STFD [C16] = f124, SIZE mov f120 = f0 } ;; { .mmi STFD [C8 ] = f121, SIZE STFD [C16] = f125, SIZE #ifdef LT adds KK = 8, KK #else nop __LINE__ #endif } ;; { .mmi STFD [C8 ] = f122, SIZE STFD [C16] = f126, SIZE mov L = KK } ;; { .mmb STFD [C8 ] = f123, 5 * SIZE STFD [C16] = f127 (p6) br.cond.dptk .L011 } ;; .L020: { .mib mov L = KK tbit.z p6, p0 = M, 2 (p6) br.cond.dptk .L030 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B nop __LINE__ } ;; { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE setf.d f73 = r0 mov f65 = f0 } ;; { .mfi setf.d f105 = r0 mov f81 = f0 adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET mov f89 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f113 = f0 tbit.z p12, p0 = L, 0 } { .mfi setf.d f97 = r0 mov f121 = f0 shr L = L, 1 } ;; { .mmf (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE setf.d f66 = r0 mov f67 = f0 } { .mfi setf.d f74 = r0 mov f75 = f0 adds L = -1, L } ;; { .mmf (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE setf.d f82 = r0 mov f83 = f0 } { .mfi setf.d f90 = r0 mov f91 = f0 cmp.eq p6, p0 = -1, L } ;; { .mmf (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE setf.d f98 = r0 mov f99 = f0 } { .mfi setf.d f106 = r0 mov f107 = f0 mov ar.lc = L } ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE setf.d f114 = r0 mov f115 = f0 } { .mfb setf.d f122 = r0 mov f123 = f0 (p6) br.cond.dpnt .L028 } ;; .L022: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 (p5) adds C9 = 2 * SIZE, C1 } { .mfi nop __LINE__ FMA f104 = f32, f53, f104 // A1 * B6 (p5) adds C10 = 2 * SIZE, C2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 (p5) adds C11 = 2 * SIZE, C3 } { .mfi nop __LINE__ FMA f120 = f32, f55, f120 // A1 * B8 (p5) adds C12 = 2 * SIZE, C4 } ;; { .mfi (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 (p5) adds C13 = 2 * SIZE, C5 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 (p5) adds C14 = 2 * SIZE, C6 } ;; { .mfi (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 (p5) adds C15 = 2 * SIZE, C7 } { .mfi nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 
(p5) adds C16 = 2 * SIZE, C8 } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f106 = f34, f53, f106 // A3 * B6 nop __LINE__ } { .mfb nop __LINE__ FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f122 = f34, f55, f122 // A3 * B8 nop __LINE__ } { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } { .mfb nop __LINE__ FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f107 = f35, f53, f107 // A4 * B6 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f123 = f35, f55, f123 // A4 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f121 = f41, f63, f121 // A2 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f98 = f42, f60, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f106 = f42, f61, f106 // A3 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f122 = f42, f63, f122 // A3 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA 
f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f91 = f43, f59, f91 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f107 = f43, f61, f107 // A4 * B6 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f115 = f43, f62, f115 // A4 * B7 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f123 = f43, f63, f123 // A4 * B8 br.cloop.sptk.few .L022 } ;; .L028: #ifdef LT LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [BOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [BOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [BOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [BOFFSET], 2 * SIZE ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE FSUB f64 = f32, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB f72 = f33, f72 nop __LINE__ } ;; { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE FSUB f80 = f34, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f88 = f35, f88 nop __LINE__ } ;; { .mfi LDFPD f52, f53 = [BOFFSET], 2 * SIZE FSUB f96 = f36, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f104 = f37, f104 nop __LINE__ } ;; { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE FSUB f112 = f38, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f120 = f39, f120 nop __LINE__ } ;; { .mfi LDFPD f56, f57 = [BOFFSET], 2 * SIZE FSUB f65 = f40, f65 nop __LINE__ } { .mfi nop __LINE__ FSUB f73 = f41, f73 nop __LINE__ } ;; { .mfi LDFPD f58, f59 = [BOFFSET], 2 * SIZE FSUB f81 = f42, f81 nop __LINE__ } { .mfi nop __LINE__ FSUB f89 = f43, f89 nop __LINE__ } ;; { .mfi LDFPD f60, f61 = [BOFFSET], 2 * SIZE FSUB f97 = f44, f97 nop __LINE__ } { .mfi nop __LINE__ FSUB f105 = f45, f105 nop __LINE__ } ;; { .mfi LDFPD f62, f63 = [BOFFSET] FSUB f113 = f46, f113 adds BOFFSET = -30 * SIZE, BOFFSET } { .mfi nop __LINE__ FSUB f121 = f47, f121 nop __LINE__ } ;; FSUB f66 = f48, f66 FSUB f74 = f49, f74 FSUB f82 = f50, f82 FSUB f90 = f51, f90 FSUB f98 = f52, f98 FSUB f106 = f53, f106 FSUB f114 = f54, f114 FSUB f122 = f55, f122 ;; FSUB f67 = f56, f67 FSUB f75 = f57, f75 FSUB f83 = f58, f83 FSUB f91 = f59, f91 FSUB f99 = f60, f99 FSUB f107 = f61, f107 FSUB f115 = f62, f115 FSUB f123 = f63, f123 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [AOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [AOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [AOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [AOFFSET], 2 * SIZE ;; LDFPD f48, f49 = [AOFFSET], 2 * SIZE ;; LDFPD f50, f51 = [AOFFSET], 2 * SIZE ;; LDFPD f52, f53 = [AOFFSET], 2 * SIZE ;; LDFPD f54, f55 = [AOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [AOFFSET], 2 * SIZE ;; LDFPD f58, f59 = [AOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [AOFFSET], 2 * SIZE ;; LDFPD f62, f63 = [AOFFSET] adds AOFFSET = -30 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f72 = f36, f72 FSUB f73 = f37, f73 FSUB f74 = f38, f74 FSUB f75 = f39, f75 FSUB f80 = f40, f80 FSUB f81 = f41, f81 FSUB f82 = f42, f82 FSUB f83 = f43, f83 FSUB f88 = f44, f88 FSUB f89 = f45, f89 FSUB f90 = f46, f90 FSUB f91 = f47, f91 ;; FSUB f96 = f48, f96 FSUB f97 = f49, f97 FSUB f98 = f50, f98 FSUB f99 = f51, f99 ;; FSUB f104 = f52, f104 FSUB f105 = f53, f105 FSUB f106 = f54, f106 FSUB f107 = 
f55, f107 ;; FSUB f112 = f56, f112 FSUB f113 = f57, f113 FSUB f114 = f58, f114 FSUB f115 = f59, f115 ;; FSUB f120 = f60, f120 FSUB f121 = f61, f121 FSUB f122 = f62, f122 FSUB f123 = f63, f123 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f36 = [AOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f39, f40 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET], -15 * SIZE ;; { .mfi FMPY f64 = f64, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f96 = f96, f32 nop __LINE__ } ;; { .mfi FMPY f72 = f72, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f104 = f104, f32 nop __LINE__ } ;; { .mfi FMPY f80 = f80, f32 } { .mfi nop __LINE__ FMPY f112 = f112, f32 nop __LINE__ } ;; { .mfi FMPY f88 = f88, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f120 = f120, f32 nop __LINE__ } ;; { .mfi FNMA f65 = f64, f33, f65 nop __LINE__ } { .mfi nop __LINE__ FNMA f97 = f96, f33, f97 nop __LINE__ } ;; { .mfi FNMA f73 = f72, f33, f73 nop __LINE__ } { .mfi nop __LINE__ FNMA f105 = f104, f33, f105 nop __LINE__ } ;; { .mfi FNMA f81 = f80, f33, f81 } { .mfi nop __LINE__ FNMA f113 = f112, f33, f113 nop __LINE__ } ;; { .mfi FNMA f89 = f88, f33, f89 nop __LINE__ } { .mfi nop __LINE__ FNMA f121 = f120, f33, f121 nop __LINE__ } ;; { .mfi FNMA f66 = f64, f34, f66 nop __LINE__ } { .mfi nop __LINE__ FNMA f98 = f96, f34, f98 nop __LINE__ } ;; { .mfi FNMA f74 = f72, f34, f74 } { .mfi nop __LINE__ FNMA f106 = f104, f34, f106 nop __LINE__ } ;; { .mfi FNMA f82 = f80, f34, f82 nop __LINE__ } { .mfi nop __LINE__ FNMA f114 = f112, f34, f114 nop __LINE__ } ;; { .mfi FNMA f90 = f88, f34, f90 nop __LINE__ } { .mfi nop __LINE__ FNMA f122 = f120, f34, f122 nop __LINE__ } ;; { .mfi FNMA f67 = f64, f35, f67 } { .mfi nop __LINE__ FNMA f99 = f96, f35, f99 nop __LINE__ } ;; { .mfi FNMA f75 = f72, f35, f75 nop __LINE__ } { .mfi nop __LINE__ FNMA f107 = f104, f35, f107 nop __LINE__ } ;; { .mfi FNMA f83 = f80, f35, f83 } { .mfi nop __LINE__ FNMA f115 = f112, f35, f115 nop __LINE__ } ;; { .mfi FNMA f91 = f88, f35, f91 nop __LINE__ } { .mfi nop __LINE__ FNMA f123 = f120, f35, f123 adds BOFFSET2 = 4 * SIZE, BOFFSET } ;; FMPY f65 = f65, f36 FMPY f97 = f97, f36 FMPY f73 = f73, f36 FMPY f105 = f105, f36 FMPY f81 = f81, f36 FMPY f113 = f113, f36 FMPY f89 = f89, f36 FMPY f121 = f121, f36 ;; FNMA f66 = f65, f37, f66 FNMA f98 = f97, f37, f98 FNMA f74 = f73, f37, f74 FNMA f106 = f105, f37, f106 FNMA f82 = f81, f37, f82 FNMA f114 = f113, f37, f114 FNMA f90 = f89, f37, f90 FNMA f122 = f121, f37, f122 ;; FNMA f67 = f65, f38, f67 FNMA f99 = f97, f38, f99 FNMA f75 = f73, f38, f75 FNMA f107 = f105, f38, f107 FNMA f83 = f81, f38, f83 FNMA f115 = f113, f38, f115 FNMA f91 = f89, f38, f91 FNMA f123 = f121, f38, f123 ;; { .mfi STFD [BOFFSET] = f64, SIZE FMPY f66 = f66, f39 } { .mfi STFD [BOFFSET2] = f96, SIZE FMPY f98 = f98, f39 } ;; { .mfi STFD [BOFFSET] = f72, SIZE FMPY f74 = f74, f39 } { .mfi STFD [BOFFSET2] = f104, SIZE FMPY f106 = f106, f39 } ;; { .mfi STFD [BOFFSET] = f80, SIZE FMPY f82 = f82, f39 } { .mfi STFD [BOFFSET2] = f112, SIZE FMPY f114 = f114, f39 } ;; { .mfi STFD [BOFFSET] = f88, 5 * SIZE FMPY f90 = f90, f39 } { .mfi STFD [BOFFSET2] = f120, 5 * SIZE FMPY f122 = f122, f39 } ;; { .mfi STFD [BOFFSET] = f65, SIZE FNMA f67 = f66, f40, f67 } { .mfi STFD [BOFFSET2] = f97, SIZE FNMA f99 = f98, f40, f99 } ;; { .mfi STFD [BOFFSET] = f73, SIZE FNMA f75 = f74, f40, f75 } { .mfi STFD [BOFFSET2] = f105, SIZE FNMA f107 = f106, f40, f107 } ;; { 
.mfi STFD [BOFFSET] = f81, SIZE FNMA f83 = f82, f40, f83 } { .mfi STFD [BOFFSET2] = f113, SIZE FNMA f115 = f114, f40, f115 } ;; { .mfi STFD [BOFFSET] = f89, 5 * SIZE FNMA f91 = f90, f40, f91 } { .mfi STFD [BOFFSET2] = f121, 5 * SIZE FNMA f123 = f122, f40, f123 } ;; { .mfi STFD [BOFFSET] = f66, SIZE FMPY f67 = f67, f41 } { .mfi STFD [BOFFSET2] = f98, SIZE FMPY f99 = f99, f41 } ;; { .mfi STFD [BOFFSET] = f74, SIZE FMPY f75 = f75, f41 } { .mfi STFD [BOFFSET2] = f106, SIZE FMPY f107 = f107, f41 } ;; { .mfi STFD [BOFFSET] = f82, SIZE FMPY f83 = f83, f41 } { .mfi STFD [BOFFSET2] = f114, SIZE FMPY f115 = f115, f41 } ;; { .mfi STFD [BOFFSET] = f90, 5 * SIZE FMPY f91 = f91, f41 } { .mfi STFD [BOFFSET2] = f122, 5 * SIZE FMPY f123 = f123, f41 } ;; { .mmf STFD [BOFFSET] = f67, SIZE STFD [BOFFSET2] = f99, SIZE } ;; { .mmf STFD [BOFFSET] = f75, SIZE STFD [BOFFSET2] = f107, SIZE } ;; { .mmf STFD [BOFFSET] = f83, SIZE STFD [BOFFSET2] = f115, SIZE } ;; { .mmf STFD [BOFFSET] = f91, -27 * SIZE STFD [BOFFSET2] = f123, -27 * SIZE } ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f40 = [BOFFSET], 1 * SIZE ;; LDFPD f41, f42 = [BOFFSET], 2 * SIZE ;; LDFPD f43, f44 = [BOFFSET], 2 * SIZE ;; LDFPD f45, f46 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f47, f48 = [BOFFSET], 2 * SIZE ;; LDFPD f49, f50 = [BOFFSET], 2 * SIZE ;; LDFPD f51, f52 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f53 = [BOFFSET], 1 * SIZE ;; LDFPD f54, f55 = [BOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [BOFFSET] adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f58, f59 = [BOFFSET], 2 * SIZE adds AOFFSET2 = 4 * SIZE, AOFFSET ;; LDFPD f60, f61 = [BOFFSET] adds BOFFSET = 7 * SIZE, BOFFSET ;; LDFD f16 = [BOFFSET], 1 * SIZE ;; LDFPD f17, f18 = [BOFFSET] adds BOFFSET = 8 * SIZE, BOFFSET ;; LDFPD f19, f20 = [BOFFSET] adds BOFFSET = 9 * SIZE, BOFFSET ;; LDFD f21 = [BOFFSET] adds BOFFSET = -63 * SIZE, BOFFSET ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 FMPY f66 = f66, f32 FMPY f67 = f67, f32 ;; FNMA f72 = f64, f33, f72 FNMA f73 = f65, f33, f73 FNMA f74 = f66, f33, f74 FNMA f75 = f67, f33, f75 ;; FNMA f80 = f64, f34, f80 FNMA f81 = f65, f34, f81 FNMA f82 = f66, f34, f82 FNMA f83 = f67, f34, f83 ;; FNMA f88 = f64, f35, f88 FNMA f89 = f65, f35, f89 FNMA f90 = f66, f35, f90 FNMA f91 = f67, f35, f91 ;; FNMA f96 = f64, f36, f96 FNMA f97 = f65, f36, f97 FNMA f98 = f66, f36, f98 FNMA f99 = f67, f36, f99 ;; FNMA f104 = f64, f37, f104 FNMA f105 = f65, f37, f105 FNMA f106 = f66, f37, f106 FNMA f107 = f67, f37, f107 ;; FNMA f112 = f64, f38, f112 FNMA f113 = f65, f38, f113 FNMA f114 = f66, f38, f114 FNMA f115 = f67, f38, f115 ;; FNMA f120 = f64, f39, f120 FNMA f121 = f65, f39, f121 FNMA f122 = f66, f39, f122 FNMA f123 = f67, f39, f123 ;; FMPY f72 = f72, f40 FMPY f73 = f73, f40 FMPY f74 = f74, f40 FMPY f75 = f75, f40 ;; FNMA f80 = f72, f41, f80 FNMA f81 = f73, f41, f81 FNMA f82 = f74, f41, f82 FNMA f83 = f75, f41, f83 ;; FNMA f88 = f72, f42, f88 FNMA f89 = f73, f42, f89 FNMA f90 = f74, f42, f90 FNMA f91 = f75, f42, f91 ;; FNMA f96 = f72, f43, f96 FNMA f97 = f73, f43, f97 FNMA f98 = f74, f43, f98 FNMA f99 = f75, f43, f99 ;; FNMA f104 = f72, f44, f104 FNMA f105 = f73, f44, f105 FNMA f106 = f74, f44, f106 FNMA f107 = f75, f44, f107 ;; FNMA f112 = f72, f45, f112 FNMA f113 = f73, f45, f113 FNMA f114 = f74, f45, f114 FNMA f115 = f75, f45, f115 ;; FNMA f120 = f72, f46, f120 FNMA f121 = f73, f46, f121 FNMA f122 = 
f74, f46, f122 FNMA f123 = f75, f46, f123 ;; FMPY f80 = f80, f47 FMPY f81 = f81, f47 FMPY f82 = f82, f47 FMPY f83 = f83, f47 ;; FNMA f88 = f80, f48, f88 FNMA f89 = f81, f48, f89 FNMA f90 = f82, f48, f90 FNMA f91 = f83, f48, f91 ;; FNMA f96 = f80, f49, f96 FNMA f97 = f81, f49, f97 FNMA f98 = f82, f49, f98 FNMA f99 = f83, f49, f99 ;; FNMA f104 = f80, f50, f104 FNMA f105 = f81, f50, f105 FNMA f106 = f82, f50, f106 FNMA f107 = f83, f50, f107 ;; FNMA f112 = f80, f51, f112 FNMA f113 = f81, f51, f113 FNMA f114 = f82, f51, f114 FNMA f115 = f83, f51, f115 ;; FNMA f120 = f80, f52, f120 FNMA f121 = f81, f52, f121 FNMA f122 = f82, f52, f122 FNMA f123 = f83, f52, f123 ;; FMPY f88 = f88, f53 FMPY f89 = f89, f53 FMPY f90 = f90, f53 FMPY f91 = f91, f53 ;; FNMA f96 = f88, f54, f96 FNMA f97 = f89, f54, f97 FNMA f98 = f90, f54, f98 FNMA f99 = f91, f54, f99 ;; FNMA f104 = f88, f55, f104 FNMA f105 = f89, f55, f105 FNMA f106 = f90, f55, f106 FNMA f107 = f91, f55, f107 ;; FNMA f112 = f88, f56, f112 FNMA f113 = f89, f56, f113 FNMA f114 = f90, f56, f114 FNMA f115 = f91, f56, f115 ;; FNMA f120 = f88, f57, f120 FNMA f121 = f89, f57, f121 FNMA f122 = f90, f57, f122 FNMA f123 = f91, f57, f123 ;; FMPY f96 = f96, f58 FMPY f97 = f97, f58 FMPY f98 = f98, f58 FMPY f99 = f99, f58 ;; FNMA f104 = f96, f59, f104 FNMA f105 = f97, f59, f105 FNMA f106 = f98, f59, f106 FNMA f107 = f99, f59, f107 ;; FNMA f112 = f96, f60, f112 FNMA f113 = f97, f60, f113 FNMA f114 = f98, f60, f114 FNMA f115 = f99, f60, f115 ;; FNMA f120 = f96, f61, f120 FNMA f121 = f97, f61, f121 FNMA f122 = f98, f61, f122 FNMA f123 = f99, f61, f123 ;; { .mfi STFD [AOFFSET] = f64, SIZE FMPY f104 = f104, f16 } { .mfi STFD [AOFFSET2] = f72, SIZE FMPY f105 = f105, f16 } ;; { .mfi STFD [AOFFSET] = f65, SIZE FMPY f106 = f106, f16 } { .mfi STFD [AOFFSET2] = f73, SIZE FMPY f107 = f107, f16 } ;; { .mfi STFD [AOFFSET] = f66, SIZE FNMA f112 = f104, f17, f112 } { .mfi STFD [AOFFSET2] = f74, SIZE FNMA f113 = f105, f17, f113 } ;; { .mfi STFD [AOFFSET] = f67, 5 * SIZE FNMA f114 = f106, f17, f114 } { .mfi STFD [AOFFSET2] = f75, 5 * SIZE FNMA f115 = f107, f17, f115 } ;; { .mfi STFD [AOFFSET] = f80, SIZE FNMA f120 = f104, f18, f120 } { .mfi STFD [AOFFSET2] = f88, SIZE FNMA f121 = f105, f18, f121 } ;; { .mfi STFD [AOFFSET] = f81, SIZE FNMA f122 = f106, f18, f122 } { .mfi STFD [AOFFSET2] = f89, SIZE FNMA f123 = f107, f18, f123 } ;; { .mfi STFD [AOFFSET] = f82, SIZE FMPY f112 = f112, f19 } { .mfi STFD [AOFFSET2] = f90, SIZE FMPY f113 = f113, f19 } ;; { .mfi STFD [AOFFSET] = f83, 5 * SIZE FMPY f114 = f114, f19 } { .mfi STFD [AOFFSET2] = f91, 5 * SIZE FMPY f115 = f115, f19 } ;; { .mfi STFD [AOFFSET] = f96, SIZE FNMA f120 = f112, f20, f120 } { .mfi STFD [AOFFSET2] = f104, SIZE FNMA f121 = f113, f20, f121 } ;; { .mfi STFD [AOFFSET] = f97, SIZE FNMA f122 = f114, f20, f122 } { .mfi STFD [AOFFSET2] = f105, SIZE FNMA f123 = f115, f20, f123 } ;; { .mfi STFD [AOFFSET] = f98, SIZE FMPY f120 = f120, f21 } { .mfi STFD [AOFFSET2] = f106, SIZE FMPY f121 = f121, f21 } ;; { .mfi STFD [AOFFSET] = f99, 5 * SIZE FMPY f122 = f122, f21 } { .mfi STFD [AOFFSET2] = f107, 5 * SIZE FMPY f123 = f123, f21 } ;; { .mmf STFD [AOFFSET] = f112, SIZE STFD [AOFFSET2] = f120, SIZE } ;; { .mmf STFD [AOFFSET] = f113, SIZE STFD [AOFFSET2] = f121, SIZE } ;; { .mmf STFD [AOFFSET] = f114, SIZE STFD [AOFFSET2] = f122, SIZE } ;; { .mmf STFD [AOFFSET] = f115, -27 * SIZE STFD [AOFFSET2] = f123, - 27 * SIZE } ;; #endif { .mmf STFD [C1 ] = f64, SIZE STFD [C2 ] = f72, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE STFD [C2 ] = 
f73, SIZE nop __LINE__ } ;; { .mmf STFD [C1 ] = f66, SIZE STFD [C2 ] = f74, SIZE mov f72 = f0 } ;; { .mmi STFD [C1 ] = f67, SIZE STFD [C2 ] = f75, SIZE sub L = K, KK } ;; { .mmf STFD [C3 ] = f80, SIZE STFD [C4 ] = f88, SIZE mov f80 = f0 } ;; { .mmi STFD [C3 ] = f81, SIZE STFD [C4 ] = f89, SIZE shladd L = L, BASE_SHIFT, r0 } ;; { .mmf STFD [C3 ] = f82, SIZE STFD [C4 ] = f90, SIZE mov f88 = f0 } ;; { .mmi STFD [C3 ] = f83, SIZE STFD [C4 ] = f91, SIZE shladd AOFFSET = L, 2, AOFFSET } ;; { .mmf STFD [C5 ] = f96, SIZE STFD [C6 ] = f104, SIZE mov f96 = f0 } ;; { .mmi STFD [C5 ] = f97, SIZE STFD [C6 ] = f105, SIZE shladd BOFFSET = L, 3, BOFFSET } ;; { .mmf STFD [C5 ] = f98, SIZE STFD [C6 ] = f106, SIZE mov f104 = f0 } ;; { .mmi STFD [C5 ] = f99, SIZE STFD [C6 ] = f107, SIZE #ifdef LT adds KK = 4, KK #else nop __LINE__ #endif } ;; { .mmf STFD [C7 ] = f112, SIZE STFD [C8 ] = f120, SIZE mov f112 = f0 } ;; { .mmi STFD [C7 ] = f113, SIZE STFD [C8 ] = f121, SIZE mov L = KK } ;; { .mmf STFD [C7 ] = f114, SIZE STFD [C8 ] = f122, SIZE mov f120 = f0 } ;; { .mmi STFD [C7 ] = f115, SIZE STFD [C8 ] = f123, SIZE nop __LINE__ } ;; .align 8 .L030: { .mib mov L = KK tbit.z p6, p0 = M, 1 (p6) br.cond.dptk .L040 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B nop __LINE__ } ;; { .mmi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE setf.d f73 = r0 adds L = 1, L } ;; { .mfi setf.d f105 = r0 mov f81 = f0 tbit.z p12, p0 = L, 0 } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET mov f89 = f0 nop __LINE__ } ;; { .mmf (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE nop __LINE__ mov f65 = f0 } ;; { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f113 = f0 cmp.eq p3, p0 = r0, r0 } { .mfi setf.d f97 = r0 mov f121 = f0 shr L = L, 1 } ;; { .mmi (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE cmp.eq p6, p0 = 0, L adds L = -1, L } ;; { .mib (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L038 } ;; .L032: { .mfb lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 
= f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f113 = f41, f62, f113 // A2 * B7 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f121 = f41, f63, f121 // A2 * B8 br.cloop.sptk.few .L032 } ;; .L038: #ifdef LT LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [BOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [BOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [BOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [BOFFSET] adds BOFFSET = -14 * SIZE, BOFFSET ;; { .mfi FSUB f64 = f32, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB f72 = f33, f72 nop __LINE__ } ;; { .mfi FSUB f80 = f34, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f88 = f35, f88 nop __LINE__ } ;; { .mfi FSUB f96 = f36, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f104 = f37, f104 nop __LINE__ } ;; { .mfi FSUB f112 = f38, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f120 = f39, f120 nop __LINE__ } ;; { .mfi FSUB f65 = f40, f65 nop __LINE__ } { .mfi nop __LINE__ FSUB f73 = f41, f73 nop __LINE__ } ;; { .mfi FSUB f81 = f42, f81 nop __LINE__ } { .mfi nop __LINE__ FSUB f89 = f43, f89 nop __LINE__ } ;; { .mfi FSUB f97 = f44, f97 nop __LINE__ } { .mfi nop __LINE__ FSUB f105 = f45, f105 nop __LINE__ } ;; { .mfi FSUB f113 = f46, f113 } { .mfi nop __LINE__ FSUB f121 = f47, f121 nop __LINE__ } ;; { .mmi LDFPD f32, f33 = [AOFFSET] nop __LINE__ adds AOFFSET = 3 * SIZE, AOFFSET } ;; { .mfi LDFD f34 = [AOFFSET], - 3 * SIZE FMPY f64 = f64, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f96 = f96, f32 nop __LINE__ } ;; { .mfi FMPY f72 = f72, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f104 = f104, f32 nop __LINE__ } ;; { .mfi FMPY f80 = f80, f32 } { .mfi nop __LINE__ FMPY f112 = f112, f32 nop __LINE__ } ;; { .mfi FMPY f88 = f88, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f120 = f120, f32 nop __LINE__ } ;; { .mfi FNMA f65 = f64, f33, f65 nop __LINE__ } { .mfi nop __LINE__ FNMA f97 = f96, f33, f97 nop __LINE__ } ;; { .mfi FNMA f73 = f72, f33, f73 nop __LINE__ } { .mfi nop __LINE__ FNMA f105 = f104, f33, f105 nop __LINE__ } ;; { .mfi FNMA f81 = f80, f33, f81 } { .mfi nop __LINE__ FNMA f113 = f112, f33, f113 nop __LINE__ } ;; { .mfi FNMA f89 = f88, f33, f89 nop __LINE__ } { .mfi nop __LINE__ FNMA f121 = f120, f33, f121 adds BOFFSET2 = 4 * SIZE, BOFFSET } ;; { .mfi STFD [BOFFSET] = f64, SIZE FMPY f65 = f65, f34 } { .mfi STFD [BOFFSET2] = f96, SIZE FMPY f97 = f97, f34 } ;; { .mfi STFD [BOFFSET] = f72, SIZE FMPY f73 = f73, f34 } { .mfi STFD [BOFFSET2] = f104, SIZE FMPY f105 = f105, f34 } ;; { .mfi STFD [BOFFSET] = f80, SIZE FMPY f81 = f81, f34 sub L = K, KK } { .mfi STFD 
[BOFFSET2] = f112, SIZE FMPY f113 = f113, f34 } ;; { .mfi STFD [BOFFSET] = f88, 5 * SIZE FMPY f89 = f89, f34 shladd L = L, BASE_SHIFT, r0 } { .mfi STFD [BOFFSET2] = f120, 5 * SIZE FMPY f121 = f121, f34 } ;; { .mmi STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f97, SIZE } ;; { .mmi STFD [BOFFSET] = f73, SIZE STFD [BOFFSET2] = f105, SIZE } ;; { .mmi STFD [BOFFSET] = f81, SIZE STFD [BOFFSET2] = f113, SIZE } ;; { .mmi STFD [BOFFSET] = f89, -11 * SIZE STFD [BOFFSET2] = f121, -11 * SIZE } #endif #ifdef RN LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [AOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [AOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [AOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [AOFFSET] adds AOFFSET = -14 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f72 = f34, f72 FSUB f73 = f35, f73 FSUB f80 = f36, f80 FSUB f81 = f37, f81 FSUB f88 = f38, f88 FSUB f89 = f39, f89 FSUB f96 = f40, f96 FSUB f97 = f41, f97 FSUB f104 = f42, f104 FSUB f105 = f43, f105 FSUB f112 = f44, f112 FSUB f113 = f45, f113 FSUB f120 = f46, f120 FSUB f121 = f47, f121 ;; LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f40 = [BOFFSET], 1 * SIZE ;; LDFPD f41, f42 = [BOFFSET], 2 * SIZE ;; LDFPD f43, f44 = [BOFFSET], 2 * SIZE ;; LDFPD f45, f46 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f47, f48 = [BOFFSET], 2 * SIZE ;; LDFPD f49, f50 = [BOFFSET], 2 * SIZE ;; LDFPD f51, f52 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f53 = [BOFFSET], 1 * SIZE ;; LDFPD f54, f55 = [BOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [BOFFSET] adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f58, f59 = [BOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [BOFFSET] adds BOFFSET = 7 * SIZE, BOFFSET ;; LDFD f16 = [BOFFSET], 1 * SIZE ;; LDFPD f17, f18 = [BOFFSET] adds BOFFSET = 8 * SIZE, BOFFSET ;; LDFPD f19, f20 = [BOFFSET] adds BOFFSET = 9 * SIZE, BOFFSET ;; LDFD f21 = [BOFFSET] adds BOFFSET = -63 * SIZE, BOFFSET adds AOFFSET2 = 4 * SIZE, AOFFSET ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 ;; FNMA f72 = f64, f33, f72 FNMA f73 = f65, f33, f73 FNMA f80 = f64, f34, f80 FNMA f81 = f65, f34, f81 ;; FNMA f88 = f64, f35, f88 FNMA f89 = f65, f35, f89 FNMA f96 = f64, f36, f96 FNMA f97 = f65, f36, f97 FMPY f72 = f72, f40 FMPY f73 = f73, f40 FNMA f104 = f64, f37, f104 FNMA f105 = f65, f37, f105 FNMA f112 = f64, f38, f112 FNMA f113 = f65, f38, f113 FNMA f120 = f64, f39, f120 FNMA f121 = f65, f39, f121 ;; FNMA f80 = f72, f41, f80 FNMA f81 = f73, f41, f81 FNMA f88 = f72, f42, f88 FNMA f89 = f73, f42, f89 ;; FNMA f96 = f72, f43, f96 FNMA f97 = f73, f43, f97 FNMA f104 = f72, f44, f104 FNMA f105 = f73, f44, f105 FMPY f80 = f80, f47 FMPY f81 = f81, f47 FNMA f112 = f72, f45, f112 FNMA f113 = f73, f45, f113 FNMA f120 = f72, f46, f120 FNMA f121 = f73, f46, f121 ;; FNMA f88 = f80, f48, f88 FNMA f89 = f81, f48, f89 FNMA f96 = f80, f49, f96 FNMA f97 = f81, f49, f97 FNMA f104 = f80, f50, f104 FNMA f105 = f81, f50, f105 FNMA f112 = f80, f51, f112 FNMA f113 = f81, f51, f113 ;; FMPY f88 = f88, f53 FMPY f89 = f89, f53 FNMA f120 = f80, f52, f120 FNMA f121 = f81, f52, f121 ;; FNMA f96 = f88, f54, f96 FNMA f97 = f89, f54, f97 FNMA f104 = f88, f55, f104 FNMA f105 = f89, f55, f105 FNMA f112 = f88, f56, f112 FNMA f113 = f89, f56, f113 FNMA f120 = f88, f57, f120 FNMA f121 = f89, f57, f121 ;; FMPY f96 = f96, f58 FMPY f97 = f97, f58 ;; 
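// RN solve, continued: the packed 8x8 triangular block of B is swept column by
// column. Each FMPY scales the freshly solved column pair (f96/f97, then
// f104/f105, f112/f113, f120/f121) by the stored diagonal factor
// (f58, f16, f19, f21 -- presumably pre-inverted during packing, so a
// multiply stands in for the divide), and the FNMA groups that follow
// subtract that column's contribution from the columns not yet solved
// (off-diagonal weights f59-f61, f17-f18, f20).
// Illustrative scalar sketch (assuming inv_diag[k] holds 1 / b[k][k]):
//   x[k]  = x[k] * inv_diag[k];            // FMPY
//   x[j] -= x[k] * b[k][j]   for j > k;    // FNMA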
FNMA f104 = f96, f59, f104 FNMA f105 = f97, f59, f105 FNMA f112 = f96, f60, f112 FNMA f113 = f97, f60, f113 FNMA f120 = f96, f61, f120 FNMA f121 = f97, f61, f121 ;; FMPY f104 = f104, f16 FMPY f105 = f105, f16 ;; FNMA f112 = f104, f17, f112 FNMA f113 = f105, f17, f113 ;; { .mfi STFD [AOFFSET] = f64, SIZE FNMA f120 = f104, f18, f120 } { .mfi STFD [AOFFSET2] = f80, SIZE FNMA f121 = f105, f18, f121 } ;; { .mfi STFD [AOFFSET] = f65, SIZE FMPY f112 = f112, f19 } { .mfi STFD [AOFFSET2] = f81, SIZE FMPY f113 = f113, f19 } ;; { .mfi STFD [AOFFSET] = f72, SIZE FNMA f120 = f112, f20, f120 sub L = K, KK } { .mfi STFD [AOFFSET2] = f88, SIZE FNMA f121 = f113, f20, f121 } ;; { .mfi STFD [AOFFSET] = f73, 5 * SIZE FMPY f120 = f120, f21 shladd L = L, BASE_SHIFT, r0 } { .mfi STFD [AOFFSET2] = f89, 5 * SIZE FMPY f121 = f121, f21 } ;; { .mmi STFD [AOFFSET] = f96, SIZE STFD [AOFFSET2] = f112, SIZE nop __LINE__ } ;; { .mmi STFD [AOFFSET] = f97, SIZE STFD [AOFFSET2] = f113, SIZE nop __LINE__ } ;; { .mmi STFD [AOFFSET] = f104, SIZE STFD [AOFFSET2] = f120, SIZE nop __LINE__ } ;; { .mmi STFD [AOFFSET] = f105, -11 * SIZE STFD [AOFFSET2] = f121, - 11 * SIZE nop __LINE__ } ;; #endif { .mmf STFD [C1 ] = f64, SIZE STFD [C2 ] = f72, SIZE mov f72 = f0 } ;; { .mmf STFD [C1 ] = f65, SIZE STFD [C2 ] = f73, SIZE mov f64 = f0 } ;; { .mmf STFD [C3 ] = f80, SIZE STFD [C4 ] = f88, SIZE mov f88 = f0 } ;; { .mmf STFD [C3 ] = f81, SIZE STFD [C4 ] = f89, SIZE mov f80 = f0 } ;; { .mmf STFD [C5 ] = f96, SIZE STFD [C6 ] = f104, SIZE mov f96 = f0 } ;; { .mmf STFD [C5 ] = f97, SIZE STFD [C6 ] = f105, SIZE mov f104 = f0 } ;; { .mmf STFD [C7 ] = f112, SIZE STFD [C8 ] = f120, SIZE mov f112 = f0 } ;; { .mmf STFD [C7 ] = f113, SIZE STFD [C8 ] = f121, SIZE mov f120 = f0 } { .mmi shladd AOFFSET = L, 1, AOFFSET shladd BOFFSET = L, 3, BOFFSET #ifdef LT adds KK = 2, KK #else nop __LINE__ #endif } ;; .align 8 .L040: { .mib mov L = KK tbit.z p6, p0 = M, 0 (p6) br.cond.dptk .L049 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B } ;; { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE } ;; { .mmi adds L = 1, L adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mii (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE adds L = -1, L } ;; { .mmi (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE cmp.eq p6, p0 = -1, L } ;; { .mib (p7) LDFD f32 = [AOFFSET], 1 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L048 } ;; .L042: { .mfb lfetch.nt1 [PREB], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfb (p12) cmp.ne p3, p0 = 0, L FMA f72 = f32, f49, f72 // A1 * B2 nop __LINE__ } ;; { .mfi (p3) LDFD f40 = [AOFFSET], 1 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) 
FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfi (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 adds L = -1, L } { .mmb nop __LINE__ nop __LINE__ nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } { .mmb nop __LINE__ nop __LINE__ br.cloop.sptk.few .L042 } ;; .L048: adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #ifdef LT LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; { .mfi FSUB f64 = f32, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB f72 = f33, f72 nop __LINE__ } ;; { .mfi FSUB f80 = f34, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f88 = f35, f88 nop __LINE__ } ;; { .mfi FSUB f96 = f36, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f104 = f37, f104 nop __LINE__ } ;; { .mfi FSUB f112 = f38, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f120 = f39, f120 nop __LINE__ } ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f80 = f34, f80 FSUB f88 = f35, f88 FSUB f96 = f36, f96 FSUB f104 = f37, f104 FSUB f112 = f38, f112 FSUB f120 = f39, f120 ;; #endif #ifdef LT LDFD f32 = [AOFFSET] ;; { .mfi FMPY f64 = f64, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f96 = f96, f32 nop __LINE__ } ;; { .mfi FMPY f72 = f72, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f104 = f104, f32 nop __LINE__ } ;; { .mfi FMPY f80 = f80, f32 } { .mfi nop __LINE__ FMPY f112 = f112, f32 nop __LINE__ } ;; { .mfi FMPY f88 = f88, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f120 = f120, f32 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f64, SIZE } { .mfi STFD [BOFFSET2] = f96, SIZE } ;; { .mfi STFD [BOFFSET] = f72, SIZE } { .mfi STFD [BOFFSET2] = f104, SIZE } ;; { .mfi STFD [BOFFSET] = f80, SIZE } { .mfi STFD [BOFFSET2] = f112, SIZE } ;; { .mfi STFD [BOFFSET] = f88, -3 * SIZE } { .mfi STFD [BOFFSET2] = f120, -3 * SIZE } ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f40 = [BOFFSET], 1 * SIZE ;; LDFPD f41, f42 = [BOFFSET], 2 * SIZE ;; LDFPD f43, f44 = [BOFFSET], 2 * SIZE ;; LDFPD f45, f46 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f47, f48 = [BOFFSET], 2 * SIZE ;; LDFPD f49, f50 = [BOFFSET], 2 * SIZE ;; LDFPD f51, f52 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f53 = [BOFFSET], 1 * SIZE ;; LDFPD f54, f55 = [BOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [BOFFSET] adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f58, f59 = [BOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [BOFFSET] adds BOFFSET = 7 * SIZE, BOFFSET ;; LDFD f16 = [BOFFSET], 1 * SIZE ;; LDFPD f17, f18 = [BOFFSET] adds BOFFSET = 8 * SIZE, BOFFSET ;; LDFPD f19, f20 = [BOFFSET] adds BOFFSET = 9 * SIZE, BOFFSET ;; LDFD f21 = [BOFFSET] adds BOFFSET = -63 * SIZE, BOFFSET ;; FMPY f64 = f64, f32 ;; FNMA f72 = f64, f33, f72 ;; FNMA f80 = f64, f34, f80 ;; FNMA f88 = f64, f35, f88 ;; FNMA f96 = f64, f36, f96 ;; FNMA f104 = f64, f37, f104 ;; FNMA f112 = f64, f38, f112 ;; FNMA f120 = f64, f39, f120 
;; FMPY f72 = f72, f40 ;; FNMA f80 = f72, f41, f80 ;; FNMA f88 = f72, f42, f88 ;; FNMA f96 = f72, f43, f96 ;; FNMA f104 = f72, f44, f104 ;; FNMA f112 = f72, f45, f112 ;; FNMA f120 = f72, f46, f120 ;; FMPY f80 = f80, f47 ;; FNMA f88 = f80, f48, f88 ;; FNMA f96 = f80, f49, f96 ;; FNMA f104 = f80, f50, f104 ;; FNMA f112 = f80, f51, f112 ;; FNMA f120 = f80, f52, f120 ;; FMPY f88 = f88, f53 ;; FNMA f96 = f88, f54, f96 ;; FNMA f104 = f88, f55, f104 ;; FNMA f112 = f88, f56, f112 ;; FNMA f120 = f88, f57, f120 ;; FMPY f96 = f96, f58 ;; FNMA f104 = f96, f59, f104 ;; FNMA f112 = f96, f60, f112 ;; FNMA f120 = f96, f61, f120 ;; FMPY f104 = f104, f16 ;; FNMA f112 = f104, f17, f112 ;; FNMA f120 = f104, f18, f120 ;; FMPY f112 = f112, f19 ;; FNMA f120 = f112, f20, f120 ;; FMPY f120 = f120, f21 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f96, SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f104, SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f88, -3 * SIZE STFD [AOFFSET2] = f120, - 3 * SIZE ;; #endif STFD [C1 ] = f64, SIZE STFD [C2 ] = f72, SIZE STFD [C3 ] = f80, SIZE STFD [C4 ] = f88, SIZE STFD [C5 ] = f96, SIZE STFD [C6 ] = f104, SIZE STFD [C7 ] = f112, SIZE STFD [C8 ] = f120, SIZE ;; mov f64 = f0 mov f72 = f0 mov f80 = f0 mov f88 = f0 mov f96 = f0 mov f104 = f0 mov f112 = f0 mov f120 = f0 ;; sub L = K, KK ;; shladd L = L, BASE_SHIFT, r0 ;; add AOFFSET = L, AOFFSET ;; shladd BOFFSET = L, 3, BOFFSET ;; #ifdef LT adds KK = 1, KK #else nop __LINE__ #endif ;; mov L = KK ;; .align 8 .L049: mov B = BOFFSET #ifdef RN adds KK = 8, KK #endif ;; { .mmi mov AOFFSET = A } ;; { .mmb nop __LINE__ cmp.lt p6, p0 = 0, J (p6) br.cond.dptk .L010 } ;; .align 8 .L050: { .mib setf.d f64 = r0 tbit.z p6, p0 = N, 2 (p6) br.cond.dpnt .L090 } ;; { .mfi setf.d f72 = r0 mov f80 = f0 shr I = M, 3 } { .mfi mov C1 = C // coffset1 = c + 0 * ldc mov f88 = f0 #ifdef LT mov KK = OFFSET #else nop __LINE__ #endif } ;; { .mmf cmp.eq p6, p7 = 0, I mov AORIG = A mov f65 = f0 } { .mmf add C2 = LDC, C // coffset2 = c + 1 * ldc shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc mov f73 = f0 } ;; { .mfi shladd C = LDC, 2, C // coffset += 8 * ldc mov f81 = f0 mov L = KK }{ .mfb shladd C4 = LDC, 1, C2 mov f89 = f0 (p6) br.cond.dpnt .L060 } ;; .align 16 .L052: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B } ;; { .mmi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE nop __LINE__ nop __LINE__ } ;; { .mfi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f74 = f0 nop __LINE__ } ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE setf.d f82 = r0 mov f90 = f0 } ;; { .mmf (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE setf.d f67 = r0 mov f75 = f0 } { .mfi setf.d f83 = r0 mov f91 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mmf (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE } { .mfi adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f68 = r0 mov f76 = f0 } { .mfi setf.d f84 = r0 mov f92 = f0 adds L = 1, L } ;; { .mmf CPREFETCH [PREC], LDC } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } ;; { .mmf CPREFETCH [PREC], LDC setf.d f69 = r0 mov f77 = f0 } { .mfi setf.d f85 = r0 mov f93 = f0 adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET } ;; { .mmf CPREFETCH [PREC] } ;; { .mfi setf.d f70 = r0 mov f78 = f0 tbit.z p12, p0 = L, 0 } { .mfi setf.d f86 = r0 mov f94 = f0 shr L = L, 1 } ;; { .mfi setf.d f71 = r0 adds L = -1, L } ;; { .mfi setf.d f87 = r0 mov f79 = f0 mov ar.lc = L } { .mfb cmp.eq p6, p0 = -1, L mov f95 = f0 (p6) 
br.cond.dpnt .L058 } ;; .align 8 .L053: { .mfb lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 adds C9 = 4 * SIZE, C1 } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 adds C10 = 4 * SIZE, C2 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 adds C11 = 4 * SIZE, C3 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 adds C12 = 4 * SIZE, C4 } { .mfb nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f76 = f36, f49, f76 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f84 = f36, f50, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f92 = f36, f51, f92 // A5 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = f37, f49, f77 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f85 = f37, f50, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f93 = f37, f51, f93 // A6 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f78 = f38, f49, f78 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f86 = f38, f50, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f94 = f38, f51, f94 // A7 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f79 = f39, f49, f79 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f87 = f39, f50, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f95 = f39, f51, f95 // A8 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb nop 
__LINE__ (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f91 = f43, f59, f91 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f76 = f44, f57, f76 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f84 = f44, f58, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f92 = f44, f59, f92 // A5 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f77 = f45, f57, f77 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f85 = f45, f58, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f93 = f45, f59, f93 // A6 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f78 = f46, f57, f78 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f86 = f46, f58, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f94 = f46, f59, f94 // A7 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f79 = f47, f57, f79 // A8 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f87 = f47, f58, f87 // A8 * B3 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f95 = f47, f59, f95 // A8 * B4 br.cloop.sptk.few .L053 } ;; .align 8 .L058: adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #ifdef LT LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [BOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [BOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [BOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [BOFFSET], 2 * SIZE ;; LDFPD f48, f49 = [BOFFSET], 2 * SIZE ;; LDFPD f50, f51 = [BOFFSET], 2 * SIZE ;; LDFPD f52, f53 = [BOFFSET], 2 * SIZE ;; LDFPD f54, f55 = [BOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [BOFFSET], 2 * SIZE ;; LDFPD f58, f59 = [BOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [BOFFSET], 2 * SIZE ;; LDFPD f62, f63 = [BOFFSET] adds BOFFSET = -30 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f80 = f34, f80 FSUB f88 = f35, f88 FSUB f65 = f36, f65 FSUB f73 = f37, f73 FSUB f81 = f38, f81 FSUB f89 = f39, f89 FSUB f66 = f40, f66 FSUB f74 = f41, f74 FSUB f82 = f42, f82 FSUB f90 = f43, f90 FSUB f67 = f44, f67 FSUB f75 = f45, f75 FSUB f83 = f46, f83 FSUB f91 = f47, f91 FSUB f68 = f48, f68 FSUB f76 = f49, f76 FSUB f84 = f50, f84 FSUB f92 = f51, f92 FSUB f69 = f52, f69 FSUB f77 = f53, f77 FSUB f85 = f54, f85 FSUB f93 = f55, f93 FSUB f70 = f56, f70 FSUB f78 = f57, f78 FSUB f86 = f58, f86 FSUB f94 = f59, f94 FSUB f71 = f60, f71 FSUB f79 = f61, f79 FSUB f87 = f62, f87 FSUB f95 = f63, f95 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [AOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [AOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [AOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [AOFFSET], 2 * SIZE ;; LDFPD f48, f49 = [AOFFSET], 2 * SIZE ;; LDFPD f50, f51 = [AOFFSET], 2 * SIZE ;; LDFPD f52, f53 = [AOFFSET], 2 * SIZE ;; LDFPD f54, f55 = 
[AOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [AOFFSET], 2 * SIZE ;; LDFPD f58, f59 = [AOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [AOFFSET], 2 * SIZE ;; LDFPD f62, f63 = [AOFFSET] adds AOFFSET = -30 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f68 = f36, f68 FSUB f69 = f37, f69 FSUB f70 = f38, f70 FSUB f71 = f39, f71 ;; FSUB f72 = f40, f72 FSUB f73 = f41, f73 FSUB f74 = f42, f74 FSUB f75 = f43, f75 FSUB f76 = f44, f76 FSUB f77 = f45, f77 FSUB f78 = f46, f78 FSUB f79 = f47, f79 ;; FSUB f80 = f48, f80 FSUB f81 = f49, f81 FSUB f82 = f50, f82 FSUB f83 = f51, f83 FSUB f84 = f52, f84 FSUB f85 = f53, f85 FSUB f86 = f54, f86 FSUB f87 = f55, f87 FSUB f88 = f56, f88 FSUB f89 = f57, f89 FSUB f90 = f58, f90 FSUB f91 = f59, f91 FSUB f92 = f60, f92 FSUB f93 = f61, f93 FSUB f94 = f62, f94 FSUB f95 = f63, f95 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f40 = [AOFFSET], 1 * SIZE ;; LDFPD f41, f42 = [AOFFSET], 2 * SIZE ;; LDFPD f43, f44 = [AOFFSET], 2 * SIZE ;; LDFPD f45, f46 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f47, f48 = [AOFFSET], 2 * SIZE ;; LDFPD f49, f50 = [AOFFSET], 2 * SIZE ;; LDFPD f51, f52 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f53 = [AOFFSET], 1 * SIZE ;; LDFPD f54, f55 = [AOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [AOFFSET] adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f58, f59 = [AOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [AOFFSET] adds AOFFSET = 7 * SIZE, AOFFSET ;; LDFD f16 = [AOFFSET], 1 * SIZE ;; LDFPD f17, f18 = [AOFFSET] adds AOFFSET = 8 * SIZE, AOFFSET ;; LDFPD f19, f20 = [AOFFSET] adds AOFFSET = 9 * SIZE, AOFFSET ;; LDFD f21 = [AOFFSET] adds AOFFSET = -63 * SIZE, AOFFSET ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 FMPY f80 = f80, f32 FMPY f88 = f88, f32 ;; FNMA f65 = f64, f33, f65 FNMA f73 = f72, f33, f73 FNMA f81 = f80, f33, f81 FNMA f89 = f88, f33, f89 ;; FNMA f66 = f64, f34, f66 FNMA f74 = f72, f34, f74 FNMA f82 = f80, f34, f82 FNMA f90 = f88, f34, f90 ;; FNMA f67 = f64, f35, f67 FNMA f75 = f72, f35, f75 FNMA f83 = f80, f35, f83 FNMA f91 = f88, f35, f91 ;; FNMA f68 = f64, f36, f68 FNMA f76 = f72, f36, f76 FNMA f84 = f80, f36, f84 FNMA f92 = f88, f36, f92 ;; FNMA f69 = f64, f37, f69 FNMA f77 = f72, f37, f77 FNMA f85 = f80, f37, f85 FNMA f93 = f88, f37, f93 ;; FNMA f70 = f64, f38, f70 FNMA f78 = f72, f38, f78 FNMA f86 = f80, f38, f86 FNMA f94 = f88, f38, f94 ;; FNMA f71 = f64, f39, f71 FNMA f79 = f72, f39, f79 FNMA f87 = f80, f39, f87 FNMA f95 = f88, f39, f95 ;; FMPY f65 = f65, f40 FMPY f73 = f73, f40 FMPY f81 = f81, f40 FMPY f89 = f89, f40 ;; FNMA f66 = f65, f41, f66 FNMA f74 = f73, f41, f74 FNMA f82 = f81, f41, f82 FNMA f90 = f89, f41, f90 ;; FNMA f67 = f65, f42, f67 FNMA f75 = f73, f42, f75 FNMA f83 = f81, f42, f83 FNMA f91 = f89, f42, f91 ;; FNMA f68 = f65, f43, f68 FNMA f76 = f73, f43, f76 FNMA f84 = f81, f43, f84 FNMA f92 = f89, f43, f92 ;; FNMA f69 = f65, f44, f69 FNMA f77 = f73, f44, f77 FNMA f85 = f81, f44, f85 FNMA f93 = f89, f44, f93 ;; FNMA f70 = f65, f45, f70 FNMA f78 = f73, f45, f78 FNMA f86 = f81, f45, f86 FNMA f94 = f89, f45, f94 ;; FNMA f71 = f65, f46, f71 FNMA f79 = f73, f46, f79 FNMA f87 = f81, f46, f87 FNMA f95 = f89, f46, f95 ;; FMPY f66 = f66, f47 FMPY f74 = f74, f47 FMPY f82 = f82, f47 FMPY f90 = f90, f47 ;; FNMA f67 = f66, f48, f67 FNMA f75 = f74, f48, f75 FNMA f83 = f82, f48, f83 FNMA f91 = f90, f48, f91 ;; FNMA f68 = f66, f49, f68 
FNMA f76 = f74, f49, f76 FNMA f84 = f82, f49, f84 FNMA f92 = f90, f49, f92 ;; FNMA f69 = f66, f50, f69 FNMA f77 = f74, f50, f77 FNMA f85 = f82, f50, f85 FNMA f93 = f90, f50, f93 ;; FNMA f70 = f66, f51, f70 FNMA f78 = f74, f51, f78 FNMA f86 = f82, f51, f86 FNMA f94 = f90, f51, f94 ;; FNMA f71 = f66, f52, f71 FNMA f79 = f74, f52, f79 FNMA f87 = f82, f52, f87 FNMA f95 = f90, f52, f95 ;; FMPY f67 = f67, f53 FMPY f75 = f75, f53 FMPY f83 = f83, f53 FMPY f91 = f91, f53 ;; FNMA f68 = f67, f54, f68 FNMA f76 = f75, f54, f76 FNMA f84 = f83, f54, f84 FNMA f92 = f91, f54, f92 ;; FNMA f69 = f67, f55, f69 FNMA f77 = f75, f55, f77 FNMA f85 = f83, f55, f85 FNMA f93 = f91, f55, f93 ;; FNMA f70 = f67, f56, f70 FNMA f78 = f75, f56, f78 FNMA f86 = f83, f56, f86 FNMA f94 = f91, f56, f94 ;; FNMA f71 = f67, f57, f71 FNMA f79 = f75, f57, f79 FNMA f87 = f83, f57, f87 FNMA f95 = f91, f57, f95 ;; FMPY f68 = f68, f58 FMPY f76 = f76, f58 FMPY f84 = f84, f58 FMPY f92 = f92, f58 ;; FNMA f69 = f68, f59, f69 FNMA f77 = f76, f59, f77 FNMA f85 = f84, f59, f85 FNMA f93 = f92, f59, f93 ;; FNMA f70 = f68, f60, f70 FNMA f78 = f76, f60, f78 FNMA f86 = f84, f60, f86 FNMA f94 = f92, f60, f94 ;; FNMA f71 = f68, f61, f71 FNMA f79 = f76, f61, f79 FNMA f87 = f84, f61, f87 FNMA f95 = f92, f61, f95 ;; FMPY f69 = f69, f16 FMPY f77 = f77, f16 FMPY f85 = f85, f16 FMPY f93 = f93, f16 ;; FNMA f70 = f69, f17, f70 FNMA f78 = f77, f17, f78 FNMA f86 = f85, f17, f86 FNMA f94 = f93, f17, f94 ;; FNMA f71 = f69, f18, f71 FNMA f79 = f77, f18, f79 FNMA f87 = f85, f18, f87 FNMA f95 = f93, f18, f95 ;; FMPY f70 = f70, f19 FMPY f78 = f78, f19 FMPY f86 = f86, f19 FMPY f94 = f94, f19 ;; FNMA f71 = f70, f20, f71 FNMA f79 = f78, f20, f79 FNMA f87 = f86, f20, f87 FNMA f95 = f94, f20, f95 ;; FMPY f71 = f71, f21 FMPY f79 = f79, f21 FMPY f87 = f87, f21 FMPY f95 = f95, f21 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f65, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f73, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f81, SIZE ;; STFD [BOFFSET] = f88, 5 * SIZE STFD [BOFFSET2] = f89, 5 * SIZE ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f74, SIZE STFD [BOFFSET2] = f75, SIZE ;; STFD [BOFFSET] = f82, SIZE STFD [BOFFSET2] = f83, SIZE ;; STFD [BOFFSET] = f90, 5 * SIZE STFD [BOFFSET2] = f91, 5 * SIZE ;; STFD [BOFFSET] = f68, SIZE STFD [BOFFSET2] = f69, SIZE ;; STFD [BOFFSET] = f76, SIZE STFD [BOFFSET2] = f77, SIZE ;; STFD [BOFFSET] = f84, SIZE STFD [BOFFSET2] = f85, SIZE ;; STFD [BOFFSET] = f92, 5 * SIZE STFD [BOFFSET2] = f93, 5 * SIZE ;; STFD [BOFFSET] = f70, SIZE STFD [BOFFSET2] = f71, SIZE ;; STFD [BOFFSET] = f78, SIZE STFD [BOFFSET2] = f79, SIZE ;; STFD [BOFFSET] = f86, SIZE STFD [BOFFSET2] = f87, SIZE ;; STFD [BOFFSET] = f94 STFD [BOFFSET2] = f95 adds C9 = 4 * SIZE, C1 adds BOFFSET = - 27 * SIZE, BOFFSET adds BOFFSET2 = - 27 * SIZE, BOFFSET2 ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f36 = [BOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f39, f40 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f41 = [BOFFSET], -15 * SIZE ;; FMPY f64 = f64, f32 FMPY f68 = f68, f32 FMPY f65 = f65, f32 FMPY f69 = f69, f32 FMPY f66 = f66, f32 FMPY f70 = f70, f32 FMPY f67 = f67, f32 FMPY f71 = f71, f32 ;; FNMA f72 = f64, f33, f72 FNMA f76 = f68, f33, f76 FNMA f73 = f65, f33, f73 FNMA f77 = f69, f33, f77 FNMA f74 = f66, f33, f74 FNMA f78 = f70, f33, f78 FNMA f75 = f67, f33, f75 FNMA f79 = f71, f33, f79 ;; 
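// RN path, 8x4 tile: column 1 has been scaled by its (presumably pre-inverted) diagonal
// entry and removed from column 2; the FNMA groups below remove it from columns 3 and 4,
// then the same scale/eliminate pattern (f36..f41) finishes columns 2, 3 and 4.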
FNMA f80 = f64, f34, f80 FNMA f84 = f68, f34, f84 FNMA f81 = f65, f34, f81 FNMA f85 = f69, f34, f85 FNMA f82 = f66, f34, f82 FNMA f86 = f70, f34, f86 FNMA f83 = f67, f34, f83 FNMA f87 = f71, f34, f87 ;; FNMA f88 = f64, f35, f88 FNMA f92 = f68, f35, f92 FNMA f89 = f65, f35, f89 FNMA f93 = f69, f35, f93 FNMA f90 = f66, f35, f90 FNMA f94 = f70, f35, f94 FNMA f91 = f67, f35, f91 FNMA f95 = f71, f35, f95 ;; FMPY f72 = f72, f36 FMPY f76 = f76, f36 FMPY f73 = f73, f36 FMPY f77 = f77, f36 FMPY f74 = f74, f36 FMPY f78 = f78, f36 FMPY f75 = f75, f36 FMPY f79 = f79, f36 ;; FNMA f80 = f72, f37, f80 FNMA f84 = f76, f37, f84 FNMA f81 = f73, f37, f81 FNMA f85 = f77, f37, f85 FNMA f82 = f74, f37, f82 FNMA f86 = f78, f37, f86 FNMA f83 = f75, f37, f83 FNMA f87 = f79, f37, f87 ;; FNMA f88 = f72, f38, f88 FNMA f92 = f76, f38, f92 FNMA f89 = f73, f38, f89 FNMA f93 = f77, f38, f93 FNMA f90 = f74, f38, f90 FNMA f94 = f78, f38, f94 FNMA f91 = f75, f38, f91 FNMA f95 = f79, f38, f95 ;; FMPY f80 = f80, f39 FMPY f84 = f84, f39 FMPY f81 = f81, f39 FMPY f85 = f85, f39 FMPY f82 = f82, f39 FMPY f86 = f86, f39 FMPY f83 = f83, f39 FMPY f87 = f87, f39 ;; FNMA f88 = f80, f40, f88 FNMA f92 = f84, f40, f92 FNMA f89 = f81, f40, f89 FNMA f93 = f85, f40, f93 FNMA f90 = f82, f40, f90 FNMA f94 = f86, f40, f94 FNMA f91 = f83, f40, f91 FNMA f95 = f87, f40, f95 ;; FMPY f88 = f88, f41 FMPY f92 = f92, f41 FMPY f89 = f89, f41 FMPY f93 = f93, f41 FMPY f90 = f90, f41 FMPY f94 = f94, f41 FMPY f91 = f91, f41 FMPY f95 = f95, f41 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f68, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f69, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f70, SIZE ;; STFD [AOFFSET] = f67, 5 * SIZE STFD [AOFFSET2] = f71, 5 * SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f76, SIZE ;; STFD [AOFFSET] = f73, SIZE STFD [AOFFSET2] = f77, SIZE ;; STFD [AOFFSET] = f74, SIZE STFD [AOFFSET2] = f78, SIZE ;; STFD [AOFFSET] = f75, 5 * SIZE STFD [AOFFSET2] = f79, 5 * SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f84, SIZE ;; STFD [AOFFSET] = f81, SIZE STFD [AOFFSET2] = f85, SIZE ;; STFD [AOFFSET] = f82, SIZE STFD [AOFFSET2] = f86, SIZE ;; STFD [AOFFSET] = f83, 5 * SIZE STFD [AOFFSET2] = f87, 5 * SIZE ;; STFD [AOFFSET] = f88, SIZE STFD [AOFFSET2] = f92, SIZE ;; STFD [AOFFSET] = f89, SIZE STFD [AOFFSET2] = f93, SIZE ;; STFD [AOFFSET] = f90, SIZE STFD [AOFFSET2] = f94, SIZE ;; STFD [AOFFSET] = f91, -27 * SIZE STFD [AOFFSET2] = f95, -27 * SIZE ;; #endif adds C9 = 4 * SIZE, C1 ;; { .mmf STFD [C1 ] = f64, SIZE STFD [C9 ] = f68, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE STFD [C9 ] = f69, SIZE adds C10 = 4 * SIZE, C2 } ;; { .mmi STFD [C1 ] = f66, SIZE STFD [C9 ] = f70, SIZE } ;; { .mmi STFD [C1 ] = f67, 5 * SIZE STFD [C9 ] = f71 adds C11 = 4 * SIZE, C3 } ;; { .mmf STFD [C2 ] = f72, SIZE STFD [C10] = f76, SIZE mov f72 = f0 } ;; { .mmi STFD [C2 ] = f73, SIZE STFD [C10] = f77, SIZE } ;; { .mmi STFD [C2 ] = f74, SIZE STFD [C10] = f78, SIZE adds C12 = 4 * SIZE, C4 } ;; { .mmi STFD [C2 ] = f75, 5 * SIZE STFD [C10] = f79 } ;; { .mmf STFD [C3 ] = f80, SIZE STFD [C11] = f84, SIZE } ;; { .mmi STFD [C3 ] = f81, SIZE STFD [C11] = f85, SIZE } ;; { .mmi STFD [C3 ] = f82, SIZE STFD [C11] = f86, SIZE } ;; { .mmi STFD [C3 ] = f83, 5 * SIZE STFD [C11] = f87 } ;; { .mmf STFD [C4 ] = f88, SIZE STFD [C12] = f92, SIZE } ;; { .mmi STFD [C4 ] = f89, SIZE STFD [C12] = f93, SIZE } ;; { .mmi STFD [C4 ] = f90, SIZE STFD [C12] = f94, SIZE } ;; { .mmi STFD [C4 ] = f91, 5 * SIZE STFD [C12] = f95 cmp.ne p6, p0 = 1, I } ;; adds I = -1, I 
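// One 8x4 tile finished: C1..C4 (plus the C9..C12 upper halves) now hold the solved block.
// Below, AOFFSET/BOFFSET are stepped past the remaining K-KK panel, KK is bumped in the
// LT case, the accumulators are cleared, and the loop returns to .L052 while I > 0.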
;; { .mmi sub L = K, KK } ;; { .mmi shladd L = L, BASE_SHIFT, r0 } ;; ;; { .mmi shladd AOFFSET = L, 3, AOFFSET } ;; { .mmi shladd BOFFSET = L, 2, BOFFSET } ;; { .mmi #ifdef LT adds KK = 8, KK #else nop __LINE__ #endif } ;; { .mmi mov L = KK } ;; mov f64 = f0 mov f72 = f0 mov f80 = f0 mov f88 = f0 mov f65 = f0 mov f73 = f0 mov f81 = f0 mov f89 = f0 { .mmb (p6) br.cond.dptk .L052 } ;; .align 8 .L060: tbit.z p6, p7 = M, 2 (p6) br.cond.dptk .L070 ;; { .mib mov L = KK } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B } ;; { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f65 = f0 } ;; { .mfi adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mfi adds L = -1, L } ;; { .mfi cmp.eq p6, p0 = -1, L } ;; { .mmf (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE } { .mfi mov ar.lc = L } ;; mov f66 = f0 mov f67 = f0 mov f74 = f0 mov f75 = f0 mov f82 = f0 mov f83 = f0 mov f90 = f0 mov f91 = f0 ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE } { .mfb (p6) br.cond.dpnt .L068 } ;; .align 8 .L062: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 (p5) adds C9 = 2 * SIZE, C1 } { .mfi nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 (p5) adds C10 = 2 * SIZE, C2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 (p5) adds C11 = 2 * SIZE, C3 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 (p5) adds C12 = 2 * SIZE, C4 } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; { .mfb 
nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f83 = f43, f58, f83 // A4 * B3 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f91 = f43, f59, f91 // A4 * B4 br.cloop.sptk.few .L062 } ;; .align 8 .L068: adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #ifdef LT LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [BOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [BOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [BOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [BOFFSET] adds BOFFSET = -14 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f80 = f34, f80 FSUB f88 = f35, f88 ;; FSUB f65 = f36, f65 FSUB f73 = f37, f73 FSUB f81 = f38, f81 FSUB f89 = f39, f89 ;; FSUB f66 = f40, f66 FSUB f74 = f41, f74 FSUB f82 = f42, f82 FSUB f90 = f43, f90 ;; FSUB f67 = f44, f67 FSUB f75 = f45, f75 FSUB f83 = f46, f83 FSUB f91 = f47, f91 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [AOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [AOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [AOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [AOFFSET] adds AOFFSET = -14 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f72 = f36, f72 FSUB f73 = f37, f73 FSUB f74 = f38, f74 FSUB f75 = f39, f75 FSUB f80 = f40, f80 FSUB f81 = f41, f81 FSUB f82 = f42, f82 FSUB f83 = f43, f83 FSUB f88 = f44, f88 FSUB f89 = f45, f89 FSUB f90 = f46, f90 FSUB f91 = f47, f91 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f36 = [AOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f39, f40 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET], -15 * SIZE ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 FMPY f80 = f80, f32 FMPY f88 = f88, f32 ;; FNMA f65 = f64, f33, f65 FNMA f73 = f72, f33, f73 FNMA f81 = f80, f33, f81 FNMA f89 = f88, f33, f89 ;; FNMA f66 = f64, f34, f66 FNMA f74 = f72, f34, f74 FNMA f82 = f80, f34, f82 FNMA f90 = f88, f34, f90 ;; FNMA f67 = f64, f35, f67 FNMA f75 = f72, f35, f75 FNMA f83 = f80, f35, f83 FNMA f91 = f88, f35, f91 ;; FMPY f65 = f65, f36 FMPY f73 = f73, f36 FMPY f81 = f81, f36 FMPY f89 = f89, f36 ;; FNMA f66 = f65, f37, f66 FNMA f74 = f73, f37, f74 FNMA f82 = f81, f37, f82 FNMA f90 = f89, f37, f90 ;; FNMA f67 = f65, f38, f67 FNMA f75 = f73, f38, f75 FNMA f83 = f81, f38, f83 FNMA f91 = f89, f38, f91 ;; FMPY f66 = f66, f39 FMPY f74 = f74, f39 FMPY f82 = f82, f39 FMPY f90 = f90, f39 ;; FNMA f67 = f66, f40, f67 FNMA f75 = f74, f40, f75 FNMA f83 = f82, f40, f83 FNMA f91 = f90, f40, f91 ;; FMPY f67 = f67, f41 FMPY f75 = f75, f41 FMPY f83 = f83, f41 FMPY f91 = f91, f41 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f65, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f73, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f81, SIZE ;; STFD [BOFFSET] = f88, 5 * SIZE STFD [BOFFSET2] = f89, 5 * SIZE ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f74, SIZE STFD [BOFFSET2] = f75, SIZE ;; STFD [BOFFSET] = f82, SIZE STFD [BOFFSET2] = f83, SIZE ;; STFD [BOFFSET] = f90, -11 * SIZE STFD [BOFFSET2] = f91, -11 * SIZE ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET], 
2 * SIZE ;; LDFPD f34, f35 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f36 = [BOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f39, f40 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f41 = [BOFFSET], -15 * SIZE ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 FMPY f66 = f66, f32 FMPY f67 = f67, f32 ;; FNMA f72 = f64, f33, f72 FNMA f73 = f65, f33, f73 FNMA f74 = f66, f33, f74 FNMA f75 = f67, f33, f75 ;; FNMA f80 = f64, f34, f80 FNMA f81 = f65, f34, f81 FNMA f82 = f66, f34, f82 FNMA f83 = f67, f34, f83 ;; FNMA f88 = f64, f35, f88 FNMA f89 = f65, f35, f89 FNMA f90 = f66, f35, f90 FNMA f91 = f67, f35, f91 ;; FMPY f72 = f72, f36 FMPY f73 = f73, f36 FMPY f74 = f74, f36 FMPY f75 = f75, f36 ;; FNMA f80 = f72, f37, f80 FNMA f81 = f73, f37, f81 FNMA f82 = f74, f37, f82 FNMA f83 = f75, f37, f83 ;; FNMA f88 = f72, f38, f88 FNMA f89 = f73, f38, f89 FNMA f90 = f74, f38, f90 FNMA f91 = f75, f38, f91 ;; FMPY f80 = f80, f39 FMPY f81 = f81, f39 FMPY f82 = f82, f39 FMPY f83 = f83, f39 ;; FNMA f88 = f80, f40, f88 FNMA f89 = f81, f40, f89 FNMA f90 = f82, f40, f90 FNMA f91 = f83, f40, f91 ;; FMPY f88 = f88, f41 FMPY f89 = f89, f41 FMPY f90 = f90, f41 FMPY f91 = f91, f41 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f72, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f73, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f74, SIZE ;; STFD [AOFFSET] = f67, 5 * SIZE STFD [AOFFSET2] = f75, 5 * SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f88, SIZE ;; STFD [AOFFSET] = f81, SIZE STFD [AOFFSET2] = f89, SIZE ;; STFD [AOFFSET] = f82, SIZE STFD [AOFFSET2] = f90, SIZE ;; STFD [AOFFSET] = f83, -11 * SIZE STFD [AOFFSET2] = f91, -11 * SIZE ;; #endif { .mmf STFD [C1 ] = f64, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE } ;; { .mmi STFD [C1 ] = f66, SIZE } ;; { .mmi STFD [C1 ] = f67, SIZE } ;; { .mmf STFD [C2 ] = f72, SIZE mov f72 = f0 } ;; { .mmi STFD [C2 ] = f73, SIZE } ;; { .mmi STFD [C2 ] = f74, SIZE } ;; { .mmi STFD [C2 ] = f75, SIZE } ;; { .mmf STFD [C3 ] = f80, SIZE mov f80 = f0 } ;; { .mmi STFD [C3 ] = f81, SIZE } ;; { .mmi STFD [C3 ] = f82, SIZE } ;; { .mmi STFD [C3 ] = f83, SIZE } ;; { .mmf STFD [C4 ] = f88, SIZE mov f88 = f0 } ;; { .mmi STFD [C4 ] = f89, SIZE } ;; { .mmi STFD [C4 ] = f90, SIZE } ;; { .mmi STFD [C4 ] = f91, SIZE nop __LINE__ } ;; mov f65 = f0 ;; mov f73 = f0 ;; { .mmi sub L = K, KK } ;; { .mmf mov f81 = f0 } ;; { .mmi shladd L = L, BASE_SHIFT, r0 } ;; { .mmi shladd AOFFSET = L, 2, AOFFSET } ;; { .mmi shladd BOFFSET = L, 2, BOFFSET } ;; { .mmf mov f89 = f0 } ;; { .mmi #ifdef LT adds KK = 4, KK #else nop __LINE__ #endif } ;; .align 8 .L070: tbit.z p6,p7 = M, 1 (p6) br.cond.dptk .L080 ;; { .mib mov L = KK } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B } ;; { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE setf.d f73 = r0 mov f65 = f0 } ;; { .mfi mov f81 = f0 adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET mov f89 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mmf adds L = -1, L } ;; { .mmf cmp.eq p6, p0 = -1, L } ;; { .mib (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L078 } ;; .align 8 .L072: { .mfb lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA 
f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 } { .mmf nop __LINE__ nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 br.cloop.sptk.few .L072 } ;; .L078: adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #ifdef LT LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f80 = f34, f80 FSUB f88 = f35, f88 FSUB f65 = f36, f65 FSUB f73 = f37, f73 FSUB f81 = f38, f81 FSUB f89 = f39, f89 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f72 = f34, f72 FSUB f73 = f35, f73 FSUB f80 = f36, f80 FSUB f81 = f37, f81 FSUB f88 = f38, f88 FSUB f89 = f39, f89 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f34 = [AOFFSET], - 3 * SIZE ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 FMPY f80 = f80, f32 FMPY f88 = f88, f32 ;; FNMA f65 = f64, f33, f65 FNMA f73 = f72, f33, f73 FNMA f81 = f80, f33, f81 FNMA f89 = f88, f33, f89 ;; FMPY f65 = f65, f34 FMPY f73 = f73, f34 FMPY f81 = f81, f34 FMPY f89 = f89, f34 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f65, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f73, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f81, SIZE ;; STFD [BOFFSET] = f88, -3 * SIZE STFD [BOFFSET2] = f89, -3 * SIZE ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f36 = [BOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f39, f40 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f41 = [BOFFSET], -15 * SIZE ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 ;; FNMA f72 = f64, f33, f72 FNMA f73 = f65, f33, f73 ;; FNMA f80 = f64, f34, f80 FNMA f81 = f65, f34, f81 ;; FNMA f88 = f64, f35, f88 FNMA f89 = f65, f35, f89 ;; FMPY f72 = f72, f36 FMPY f73 = f73, f36 ;; FNMA f80 = f72, f37, f80 FNMA f81 = f73, f37, f81 ;; FNMA f88 = f72, f38, f88 FNMA f89 = f73, f38, f89 ;; FMPY f80 = f80, f39 FMPY f81 = f81, f39 ;; FNMA f88 = f80, f40, f88 FNMA f89 = f81, f40, f89 ;; FMPY f88 = f88, f41 FMPY f89 = f89, f41 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f80, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f81, SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f88, SIZE ;; STFD [AOFFSET] = f73, -3 * SIZE STFD [AOFFSET2] = f89, -3 * SIZE 
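// Solved 2x4 tile written back to the packed buffer; next it is stored to C1..C4
// (two elements per column), the accumulators are cleared, and AOFFSET/BOFFSET are
// advanced past the remaining K-KK panel.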
;; #endif STFD [C1 ] = f64, SIZE mov f64 = f0 ;; STFD [C1 ] = f65, SIZE ;; STFD [C2 ] = f72, SIZE mov f72 = f0 ;; STFD [C2 ] = f73, SIZE ;; STFD [C3 ] = f80, SIZE mov f80 = f0 ;; STFD [C3 ] = f81, SIZE ;; STFD [C4 ] = f88, SIZE mov f88 = f0 ;; STFD [C4 ] = f89, SIZE ;; mov f96 = f0 ;; mov f104 = f0 ;; sub L = K, KK ;; mov f112 = f0 ;; { .mmi shladd L = L, BASE_SHIFT, r0 } ;; { .mmi shladd AOFFSET = L, 1, AOFFSET } ;; { .mmi shladd BOFFSET = L, 2, BOFFSET } ;; { .mmf mov f120 = f0 } ;; { .mmi #ifdef LT adds KK = 2, KK #else nop __LINE__ #endif } ;; .align 8 .L080: tbit.z p6,p7 = M, 0 (p6) br.cond.dptk .L089 { .mib mov L = KK } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B } ;; { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE } ;; { .mmi adds L = 1, L adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mii (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi adds L = -1, L } ;; { .mmi cmp.eq p6, p0 = -1, L } ;; { .mib (p7) LDFD f32 = [AOFFSET], 1 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L088 } ;; .L082: { .mfb cmp.ne p4, p5 = 0, L FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi (p12) cmp.ne p3, p0 = 0, L FMA f72 = f32, f49, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfb (p3) LDFD f40 = [AOFFSET], 1 * SIZE FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mmf (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 } { .mmf nop __LINE__ nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 } ;; { .mib (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE nop __LINE__ nop __LINE__ } { .mmb nop __LINE__ adds L = -1, L br.cloop.sptk.few .L082 } ;; .L088: adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #ifdef LT LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET] adds BOFFSET = -2 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f80 = f34, f80 FSUB f88 = f35, f88 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = -2 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f80 = f34, f80 FSUB f88 = f35, f88 ;; #endif #ifdef LT LDFD f32 = [AOFFSET] ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 FMPY f80 = f80, f32 FMPY f88 = f88, f32 ;; STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f72, SIZE ;; STFD [BOFFSET] = f80, SIZE ;; STFD [BOFFSET] = f88, -3 * SIZE ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f36 = [BOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f39, f40 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f41 = [BOFFSET], -15 * SIZE FMPY f64 = f64, f32 ;; FNMA f72 = f64, f33, f72 ;; FNMA f80 = f64, f34, f80 ;; FNMA f88 = f64, f35, f88 ;; FMPY f72 = f72, f36 ;; FNMA f80 = f72, f37, f80 ;; FNMA f88 = f72, f38, f88 ;; FMPY f80 = f80, f39 ;; FNMA f88 = f80, f40, f88 ;; FMPY f88 = f88, f41 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f72, SIZE ;; STFD [AOFFSET] = f80, SIZE ;; STFD [AOFFSET] = f88, -3 * SIZE ;; #endif STFD [C1 ] = f64, SIZE STFD [C2 ] = f72, SIZE STFD [C3 ] = f80, SIZE STFD [C4 ] = f88, SIZE ;; mov f64 = f0 mov f72 = f0 mov f80 = f0 mov f88 = f0 ;; sub 
L = K, KK ;; shladd L = L, BASE_SHIFT, r0 ;; add AOFFSET = L, AOFFSET ;; shladd BOFFSET = L, 2, BOFFSET ;; #ifdef LT adds KK = 1, KK #else nop __LINE__ #endif ;; mov L = KK ;; .align 8 .L089: mov B = BOFFSET #ifdef RN adds KK = 4, KK #endif ;; mov AOFFSET = A ;; .align 16 .L090: tbit.z p6, p0 = N, 1 (p6) br.cond.dpnt .L130 ;; mov f64 = f0 mov f65 = f0 mov f66 = f0 mov f67 = f0 mov f72 = f0 mov f73 = f0 mov f74 = f0 mov f75 = f0 ;; { .mfi shr I = M, 3 } { .mfi mov C1 = C // coffset1 = c + 0 * ldc #ifdef LT mov KK = OFFSET #else nop __LINE__ #endif } ;; { .mmf cmp.eq p6, p7 = 0, I mov AORIG = A } { .mmf add C2 = LDC, C // coffset2 = c + 1 * ldc } ;; { .mfi shladd C = LDC, 1, C // coffset += 8 * ldc mov f81 = f0 mov L = KK }{ .mfb (p6) br.cond.dpnt .L100 } ;; .align 16 .L092: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B } ;; { .mmi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE nop __LINE__ nop __LINE__ } ;; (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE } ;; { .mmf (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE } { .mfi cmp.eq p3, p0 = r0, r0 } ;; { .mmf (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE } { .mfi adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mmf CPREFETCH [PREC], LDC } { .mfi adds L = 1, L } ;; { .mmf CPREFETCH [PREC] } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } ;; { .mfi adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET } ;; { .mfi tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mfi adds L = -1, L } ;; { .mfi mov ar.lc = L } ;; mov f68 = f0 mov f69 = f0 mov f70 = f0 mov f71 = f0 mov f76 = f0 mov f77 = f0 mov f78 = f0 mov f79 = f0 ;; { .mfb cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L098 } ;; .align 8 .L093: /* 1 */ { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 adds C9 = 4 * SIZE, C1 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 adds C10 = 4 * SIZE, C2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 adds C11 = 4 * SIZE, C3 } { .mfi nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 adds C12 = 4 * SIZE, C4 } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f76 = f36, f49, f76 // A5 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = f37, f49, f77 // A6 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f78 = f38, f49, f78 // A7 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f79 = f39, f49, f79 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f66 = 
f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f76 = f44, f57, f76 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f77 = f45, f57, f77 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f78 = f46, f57, f78 // A7 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f71 = f47, f56, f71 // A8 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f79 = f47, f57, f79 // A8 * B2 br.cloop.sptk.few .L093 } ;; .align 8 .L098: adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #ifdef LT LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [BOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [BOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [BOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [BOFFSET] adds BOFFSET = -14 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f65 = f34, f65 FSUB f73 = f35, f73 FSUB f66 = f36, f66 FSUB f74 = f37, f74 FSUB f67 = f38, f67 FSUB f75 = f39, f75 FSUB f68 = f40, f68 FSUB f76 = f41, f76 FSUB f69 = f42, f69 FSUB f77 = f43, f77 FSUB f70 = f44, f70 FSUB f78 = f45, f78 FSUB f71 = f46, f71 FSUB f79 = f47, f79 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [AOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [AOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [AOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [AOFFSET] adds AOFFSET = -14 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f68 = f36, f68 FSUB f69 = f37, f69 FSUB f70 = f38, f70 FSUB f71 = f39, f71 ;; FSUB f72 = f40, f72 FSUB f73 = f41, f73 FSUB f74 = f42, f74 FSUB f75 = f43, f75 FSUB f76 = f44, f76 FSUB f77 = f45, f77 FSUB f78 = f46, f78 FSUB f79 = f47, f79 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f40 = [AOFFSET], 1 * SIZE ;; LDFPD f41, f42 = [AOFFSET], 2 * SIZE ;; LDFPD f43, f44 = [AOFFSET], 2 * SIZE ;; LDFPD f45, f46 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f47, f48 = [AOFFSET], 2 * SIZE ;; LDFPD f49, f50 = [AOFFSET], 2 * SIZE ;; LDFPD f51, f52 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f53 = [AOFFSET], 1 * SIZE ;; LDFPD f54, f55 = [AOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [AOFFSET] adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f58, f59 = [AOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [AOFFSET] adds AOFFSET = 7 * SIZE, AOFFSET ;; LDFD f16 = [AOFFSET], 1 * SIZE ;; LDFPD f17, f18 = [AOFFSET] adds AOFFSET = 8 * SIZE, AOFFSET ;; LDFPD f19, f20 = [AOFFSET] adds AOFFSET = 9 * SIZE, AOFFSET ;; LDFD f21 = [AOFFSET] adds AOFFSET = -63 * SIZE, AOFFSET ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 ;; FNMA f65 = f64, f33, f65 FNMA f73 = f72, f33, f73 ;; FNMA f66 = f64, f34, f66 FNMA f74 = f72, f34, f74 ;; FNMA f67 = f64, f35, f67 FNMA f75 = f72, f35, f75 ;; FNMA 
f68 = f64, f36, f68 FNMA f76 = f72, f36, f76 ;; FNMA f69 = f64, f37, f69 FNMA f77 = f72, f37, f77 ;; FNMA f70 = f64, f38, f70 FNMA f78 = f72, f38, f78 ;; FNMA f71 = f64, f39, f71 FNMA f79 = f72, f39, f79 ;; FMPY f65 = f65, f40 FMPY f73 = f73, f40 ;; FNMA f66 = f65, f41, f66 FNMA f74 = f73, f41, f74 ;; FNMA f67 = f65, f42, f67 FNMA f75 = f73, f42, f75 ;; FNMA f68 = f65, f43, f68 FNMA f76 = f73, f43, f76 ;; FNMA f69 = f65, f44, f69 FNMA f77 = f73, f44, f77 ;; FNMA f70 = f65, f45, f70 FNMA f78 = f73, f45, f78 ;; FNMA f71 = f65, f46, f71 FNMA f79 = f73, f46, f79 ;; FMPY f66 = f66, f47 FMPY f74 = f74, f47 ;; FNMA f67 = f66, f48, f67 FNMA f75 = f74, f48, f75 ;; FNMA f68 = f66, f49, f68 FNMA f76 = f74, f49, f76 ;; FNMA f69 = f66, f50, f69 FNMA f77 = f74, f50, f77 ;; FNMA f70 = f66, f51, f70 FNMA f78 = f74, f51, f78 ;; FNMA f71 = f66, f52, f71 FNMA f79 = f74, f52, f79 ;; FMPY f67 = f67, f53 FMPY f75 = f75, f53 ;; FNMA f68 = f67, f54, f68 FNMA f76 = f75, f54, f76 ;; FNMA f69 = f67, f55, f69 FNMA f77 = f75, f55, f77 ;; FNMA f70 = f67, f56, f70 FNMA f78 = f75, f56, f78 ;; FNMA f71 = f67, f57, f71 FNMA f79 = f75, f57, f79 ;; FMPY f68 = f68, f58 FMPY f76 = f76, f58 ;; FNMA f69 = f68, f59, f69 FNMA f77 = f76, f59, f77 ;; FNMA f70 = f68, f60, f70 FNMA f78 = f76, f60, f78 ;; FNMA f71 = f68, f61, f71 FNMA f79 = f76, f61, f79 ;; FMPY f69 = f69, f16 FMPY f77 = f77, f16 ;; FNMA f70 = f69, f17, f70 FNMA f78 = f77, f17, f78 ;; FNMA f71 = f69, f18, f71 FNMA f79 = f77, f18, f79 ;; FMPY f70 = f70, f19 FMPY f78 = f78, f19 ;; FNMA f71 = f70, f20, f71 FNMA f79 = f78, f20, f79 ;; FMPY f71 = f71, f21 FMPY f79 = f79, f21 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f66, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f74, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f73, 5 * SIZE STFD [BOFFSET2] = f75, 5 * SIZE ;; STFD [BOFFSET] = f68, SIZE STFD [BOFFSET2] = f70, SIZE ;; STFD [BOFFSET] = f76, SIZE STFD [BOFFSET2] = f78, SIZE ;; STFD [BOFFSET] = f69, SIZE STFD [BOFFSET2] = f71, SIZE ;; STFD [BOFFSET] = f77, -11 * SIZE STFD [BOFFSET2] = f79, -11 * SIZE ;; adds C9 = 4 * SIZE, C1 ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET], -3 * SIZE ;; FMPY f64 = f64, f32 FMPY f68 = f68, f32 FMPY f65 = f65, f32 FMPY f69 = f69, f32 FMPY f66 = f66, f32 FMPY f70 = f70, f32 FMPY f67 = f67, f32 FMPY f71 = f71, f32 ;; FNMA f72 = f64, f33, f72 FNMA f76 = f68, f33, f76 FNMA f73 = f65, f33, f73 FNMA f77 = f69, f33, f77 FNMA f74 = f66, f33, f74 FNMA f78 = f70, f33, f78 FNMA f75 = f67, f33, f75 FNMA f79 = f71, f33, f79 ;; FMPY f72 = f72, f34 FMPY f76 = f76, f34 FMPY f73 = f73, f34 FMPY f77 = f77, f34 FMPY f74 = f74, f34 FMPY f78 = f78, f34 FMPY f75 = f75, f34 FMPY f79 = f79, f34 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f68, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f69, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f70, SIZE ;; STFD [AOFFSET] = f67, 5 * SIZE STFD [AOFFSET2] = f71, 5 * SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f76, SIZE ;; STFD [AOFFSET] = f73, SIZE STFD [AOFFSET2] = f77, SIZE ;; STFD [AOFFSET] = f74, SIZE STFD [AOFFSET2] = f78, SIZE ;; STFD [AOFFSET] = f75, -11 * SIZE STFD [AOFFSET2] = f79, -11 * SIZE ;; #endif adds C9 = 4 * SIZE, C1 ;; { .mmf STFD [C1 ] = f64, SIZE STFD [C9 ] = f68, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE STFD [C9 ] = f69, SIZE adds C10 = 4 * SIZE, C2 } ;; { .mmi STFD [C1 ] = f66, SIZE STFD [C9 ] = f70, SIZE } ;; { .mmi STFD [C1 ] = f67, 5 * SIZE STFD [C9 ] = f71 
adds C11 = 4 * SIZE, C3 } ;; { .mmf STFD [C2 ] = f72, SIZE STFD [C10] = f76, SIZE mov f72 = f0 } ;; { .mmi STFD [C2 ] = f73, SIZE STFD [C10] = f77, SIZE } ;; { .mmi STFD [C2 ] = f74, SIZE STFD [C10] = f78, SIZE adds C12 = 4 * SIZE, C4 } ;; { .mmi STFD [C2 ] = f75, 5 * SIZE STFD [C10] = f79 } ;; { .mmf cmp.ne p6, p0 = 1, I } ;; adds I = -1, I ;; { .mmi sub L = K, KK } ;; { .mmi shladd L = L, BASE_SHIFT, r0 } ;; ;; shladd AOFFSET = L, 3, AOFFSET shladd BOFFSET = L, 1, BOFFSET ;; { .mmi #ifdef LT adds KK = 8, KK #else nop __LINE__ #endif } ;; mov L = KK mov f64 = f0 mov f65 = f0 mov f66 = f0 mov f67 = f0 mov f72 = f0 mov f73 = f0 mov f74 = f0 mov f75 = f0 (p6) br.cond.dptk .L092 ;; .align 8 .L100: { .mib mov L = KK tbit.z p6, p7 = M, 2 (p6) br.cond.dptk .L110 } ;; cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B ;; { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f65 = f0 } ;; adds L = 1, L adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 ;; { .mfi tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mfi adds L = -1, L } ;; { .mfi cmp.eq p6, p0 = -1, L } ;; { .mmf (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE } { .mfi mov ar.lc = L } ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE } { .mfb (p6) br.cond.dpnt .L108 } ;; .L102: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 adds C9 = 2 * SIZE, C1 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 adds C10 = 2 * SIZE, C2 } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 br.cloop.sptk.few .L102 } ;; .align 8 .L108: adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #ifdef LT LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 ;; FSUB f65 = f34, f65 FSUB f73 = f35, f73 ;; FSUB f66 = f36, f66 FSUB f74 = f37, f74 ;; FSUB f67 = f38, f67 FSUB f75 = f39, f75 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f72 = f36, f72 FSUB f73 = f37, f73 FSUB f74 = f38, f74 FSUB f75 = f39, f75 ;; 
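// Residual for the 4x2 tile is now in f64..f67 / f72..f75; the substitution below solves
// it in place (4x4 factor from A under LT, 2x2 factor from B under RN) before the result
// is written back to the packed buffer and stored to C1/C2.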
#endif #ifdef LT LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f36 = [AOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f39, f40 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET], -15 * SIZE ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 ;; FNMA f65 = f64, f33, f65 FNMA f73 = f72, f33, f73 ;; FNMA f66 = f64, f34, f66 FNMA f74 = f72, f34, f74 ;; FNMA f67 = f64, f35, f67 FNMA f75 = f72, f35, f75 ;; FMPY f65 = f65, f36 FMPY f73 = f73, f36 ;; FNMA f66 = f65, f37, f66 FNMA f74 = f73, f37, f74 ;; FNMA f67 = f65, f38, f67 FNMA f75 = f73, f38, f75 ;; FMPY f66 = f66, f39 FMPY f74 = f74, f39 ;; FNMA f67 = f66, f40, f67 FNMA f75 = f74, f40, f75 ;; FMPY f67 = f67, f41 FMPY f75 = f75, f41 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f66, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f74, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f73, -3 * SIZE STFD [BOFFSET2] = f75, -3 * SIZE ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET], -3 * SIZE ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 FMPY f66 = f66, f32 FMPY f67 = f67, f32 ;; FNMA f72 = f64, f33, f72 FNMA f73 = f65, f33, f73 FNMA f74 = f66, f33, f74 FNMA f75 = f67, f33, f75 ;; FMPY f72 = f72, f34 FMPY f73 = f73, f34 FMPY f74 = f74, f34 FMPY f75 = f75, f34 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f72, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f73, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f74, SIZE ;; STFD [AOFFSET] = f67, -3 * SIZE STFD [AOFFSET2] = f75, -3 * SIZE ;; #endif { .mmf STFD [C1 ] = f64, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE } ;; { .mmi STFD [C1 ] = f66, SIZE } ;; { .mmi STFD [C1 ] = f67, SIZE } ;; { .mmf STFD [C2 ] = f72, SIZE mov f72 = f0 } ;; { .mmi STFD [C2 ] = f73, SIZE } ;; { .mmi STFD [C2 ] = f74, SIZE } ;; { .mmi STFD [C2 ] = f75, SIZE } ;; mov f65 = f0 mov f73 = f0 mov f66 = f0 mov f74 = f0 mov f67 = f0 mov f75 = f0 ;; { .mmi sub L = K, KK } ;; { .mmi shladd L = L, BASE_SHIFT, r0 } ;; shladd AOFFSET = L, 2, AOFFSET ;; shladd BOFFSET = L, 1, BOFFSET ;; #ifdef LT adds KK = 4, KK nop __LINE__ #endif ;; .align 8 .L110: { .mib tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L120 } ;; { .mib mov L = KK } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B } ;; { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE } ;; adds L = 1, L ;; { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mfi tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mmf adds L = -1, L } ;; { .mmf cmp.eq p6, p0 = -1, L } ;; { .mib (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L118 } ;; .L112: { .mfi lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mmf (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } { .mmf nop __LINE__ nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 br.cloop.sptk.few .L112 } ;; 
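// .L118: tail of the 2x2 tile - reload the packed 2x2 block, form the residual with FSUB,
// run the 2x2 substitution (LT factor from A, RN factor from B), then write the result
// back to the packed buffer and out to C1/C2.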
.align 8 .L118: adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #ifdef LT LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET] adds BOFFSET = -2 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f65 = f34, f65 FSUB f73 = f35, f73 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = -2 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f72 = f34, f72 FSUB f73 = f35, f73 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f34 = [AOFFSET], - 3 * SIZE ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 ;; FNMA f65 = f64, f33, f65 FNMA f73 = f72, f33, f73 ;; FMPY f65 = f65, f34 FMPY f73 = f73, f34 ;; STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f72, SIZE ;; STFD [BOFFSET] = f65, SIZE ;; STFD [BOFFSET] = f73, -3 * SIZE ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET], -3 * SIZE ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 ;; FNMA f72 = f64, f33, f72 FNMA f73 = f65, f33, f73 ;; FMPY f72 = f72, f34 FMPY f73 = f73, f34 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, SIZE ;; STFD [AOFFSET] = f72, SIZE ;; STFD [AOFFSET] = f73, -3 * SIZE ;; #endif STFD [C1 ] = f64, SIZE mov f64 = f0 ;; STFD [C1 ] = f65, SIZE ;; STFD [C2 ] = f72, SIZE mov f72 = f0 ;; STFD [C2 ] = f73, SIZE ;; mov f65 = f0 mov f73 = f0 ;; sub L = K, KK ;; { .mmi shladd L = L, BASE_SHIFT, r0 } ;; { .mmi shladd AOFFSET = L, 1, AOFFSET } ;; { .mmi shladd BOFFSET = L, 1, BOFFSET } ;; { .mmi #ifdef LT adds KK = 2, KK #else nop __LINE__ #endif } ;; .align 8 .L120: tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L129 ;; { .mib mov L = KK } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B } ;; { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE } ;; { .mmi adds L = 1, L adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mii tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi adds L = -1, L } ;; { .mmi cmp.eq p6, p0 = -1, L } ;; { .mib (p7) LDFD f32 = [AOFFSET], 1 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L128 } ;; .align 8 .L122: { .mfi FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mmi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE (p3) LDFD f40 = [AOFFSET], 1 * SIZE nop __LINE__ } { .mmi nop __LINE__ nop __LINE__ nop __LINE__ } ;; { .mfi (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 adds L = -1, L } { .mfb (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 br.cloop.sptk.few .L122 } ;; .L128: adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #ifdef LT LDFPD f32, f33 = [BOFFSET] ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 ;; #else LDFPD f32, f33 = [AOFFSET] ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 ;; #endif #ifdef LT LDFD f32 = [AOFFSET] ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 ;; STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f72, -SIZE ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET], -3 * SIZE ;; FMPY f64 = f64, f32 ;; FNMA f72 = f64, f33, f72 ;; FMPY f72 = f72, f34 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f72, -SIZE ;; #endif STFD [C1 ] = f64, SIZE STFD [C2 ] = f72, SIZE mov f64 = f0 mov f72 = f0 ;; sub L = K, KK ;; shladd L = L, BASE_SHIFT, r0 ;; add AOFFSET = L, AOFFSET ;; shladd BOFFSET = L, 1, BOFFSET ;; #ifdef LT adds KK = 1, KK #else nop __LINE__ #endif ;; mov L = KK ;; .align 8 .L129: 
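// N&2 pass complete: commit the advanced BOFFSET as the new B, bump KK by 2 in the RN
// case, and rewind AOFFSET to the start of A before the final single-column (N&1) pass
// at .L130.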
mov B = BOFFSET #ifdef RN adds KK = 2, KK #endif ;; mov AOFFSET = A ;; .align 16 .L130: tbit.z p6, p0 = N, 0 (p6) br.cond.dpnt .L999 ;; mov f64 = f0 mov f65 = f0 mov f66 = f0 mov f67 = f0 mov f68 = f0 mov f69 = f0 mov f70 = f0 mov f71 = f0 ;; { .mfi shr I = M, 3 } { .mfi mov C1 = C // coffset1 = c + 0 * ldc #ifdef LT mov KK = OFFSET #else nop __LINE__ #endif } ;; { .mmf cmp.eq p6, p7 = 0, I mov AORIG = A } ;; { .mfi add C = C, LDC // coffset += 8 * ldc mov L = KK }{ .mfb (p6) br.cond.dpnt .L140 } ;; .align 16 .L132: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B } ;; { .mmi (p7) LDFD f48 = [BOFFSET], 1 * SIZE nop __LINE__ nop __LINE__ } ;; (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE } ;; { .mmf (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE } { .mfi cmp.eq p3, p0 = r0, r0 } ;; { .mmf (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE } { .mfi adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mmf CPREFETCH [PREC] } { .mfi adds L = 1, L } ;; { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } ;; { .mfi adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET } ;; { .mfi tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mfi adds L = -1, L } ;; { .mfi mov ar.lc = L } ;; { .mfb cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L138 } ;; .align 16 .L133: { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET FMA f65 = f33, f48, f65 // A2 * B1 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 adds C9 = 4 * SIZE, C1 } { .mmf (p3) LDFD f56 = [BOFFSET], 1 * SIZE nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mmf (p4) LDFD f48 = [BOFFSET], 1 * SIZE nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } ;; { .mfi (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f70 = f46, f56, f70 // A7 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE nop __LINE__ br.cloop.sptk.few .L133 } ;; .L138: adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #ifdef LT LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f68 = f36, f68 FSUB f69 = f37, f69 FSUB f70 = f38, f70 FSUB f71 = f39, f71 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB 
f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f68 = f36, f68 FSUB f69 = f37, f69 FSUB f70 = f38, f70 FSUB f71 = f39, f71 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f40 = [AOFFSET], 1 * SIZE ;; LDFPD f41, f42 = [AOFFSET], 2 * SIZE ;; LDFPD f43, f44 = [AOFFSET], 2 * SIZE ;; LDFPD f45, f46 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f47, f48 = [AOFFSET], 2 * SIZE ;; LDFPD f49, f50 = [AOFFSET], 2 * SIZE ;; LDFPD f51, f52 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f53 = [AOFFSET], 1 * SIZE ;; LDFPD f54, f55 = [AOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [AOFFSET] adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f58, f59 = [AOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [AOFFSET] adds AOFFSET = 7 * SIZE, AOFFSET ;; LDFD f16 = [AOFFSET], 1 * SIZE ;; LDFPD f17, f18 = [AOFFSET] adds AOFFSET = 8 * SIZE, AOFFSET ;; LDFPD f19, f20 = [AOFFSET] adds AOFFSET = 9 * SIZE, AOFFSET ;; LDFD f21 = [AOFFSET] adds AOFFSET = -63 * SIZE, AOFFSET ;; FMPY f64 = f64, f32 ;; FNMA f65 = f64, f33, f65 ;; FNMA f66 = f64, f34, f66 ;; FNMA f67 = f64, f35, f67 ;; FNMA f68 = f64, f36, f68 ;; FNMA f69 = f64, f37, f69 ;; FNMA f70 = f64, f38, f70 ;; FNMA f71 = f64, f39, f71 ;; FMPY f65 = f65, f40 ;; FNMA f66 = f65, f41, f66 ;; FNMA f67 = f65, f42, f67 ;; FNMA f68 = f65, f43, f68 ;; FNMA f69 = f65, f44, f69 ;; FNMA f70 = f65, f45, f70 ;; FNMA f71 = f65, f46, f71 ;; FMPY f66 = f66, f47 ;; FNMA f67 = f66, f48, f67 ;; FNMA f68 = f66, f49, f68 ;; FNMA f69 = f66, f50, f69 ;; FNMA f70 = f66, f51, f70 ;; FNMA f71 = f66, f52, f71 ;; FMPY f67 = f67, f53 ;; FNMA f68 = f67, f54, f68 ;; FNMA f69 = f67, f55, f69 ;; FNMA f70 = f67, f56, f70 ;; FNMA f71 = f67, f57, f71 ;; FMPY f68 = f68, f58 ;; FNMA f69 = f68, f59, f69 ;; FNMA f70 = f68, f60, f70 ;; FNMA f71 = f68, f61, f71 ;; FMPY f69 = f69, f16 ;; FNMA f70 = f69, f17, f70 ;; FNMA f71 = f69, f18, f71 ;; FMPY f70 = f70, f19 ;; FNMA f71 = f70, f20, f71 ;; FMPY f71 = f71, f21 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f68, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f69, SIZE ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f70, SIZE ;; STFD [BOFFSET] = f67, -3 * SIZE STFD [BOFFSET2] = f71, -3 * SIZE ;; adds C9 = 4 * SIZE, C1 ;; #endif #ifdef RN LDFD f32 = [BOFFSET] ;; FMPY f64 = f64, f32 FMPY f68 = f68, f32 FMPY f65 = f65, f32 FMPY f69 = f69, f32 FMPY f66 = f66, f32 FMPY f70 = f70, f32 FMPY f67 = f67, f32 FMPY f71 = f71, f32 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f68, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f69, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f70, SIZE ;; STFD [AOFFSET] = f67, -3 * SIZE STFD [AOFFSET2] = f71, -3 * SIZE ;; #endif adds C9 = 4 * SIZE, C1 ;; { .mmf STFD [C1 ] = f64, SIZE STFD [C9 ] = f68, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE STFD [C9 ] = f69, SIZE } ;; { .mmi STFD [C1 ] = f66, SIZE STFD [C9 ] = f70, SIZE } ;; { .mmi STFD [C1 ] = f67, 5 * SIZE STFD [C9 ] = f71 } ;; { .mmf cmp.ne p6, p0 = 1, I } ;; adds I = -1, I ;; { .mmi sub L = K, KK } ;; { .mmi shladd L = L, BASE_SHIFT, r0 } ;; ;; { .mmi shladd AOFFSET = L, 3, AOFFSET } ;; { .mmi add BOFFSET = L, BOFFSET } ;; { .mmi #ifdef LT adds KK = 8, KK #else nop __LINE__ #endif } ;; { .mmi mov L = KK } ;; mov f64 = f0 mov f65 = f0 mov f66 = f0 mov f67 = f0 mov f68 = f0 mov f69 = f0 mov f70 = f0 mov f71 = f0 (p6) br.cond.dptk .L132 .align 8 .L140: tbit.z p6, p7 = M, 2 (p6) br.cond.dptk .L150 ;; 
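// M&4 remainder against the single B column: accumulate with FMA in .L142, then in .L148
// reload the four packed values, FSUB the accumulators, and solve - a 4x4 substitution
// under LT, a plain FMPY scaling by the lone B factor under RN (presumably stored as its
// reciprocal).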
{ .mib mov L = KK } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B } ;; { .mmf (p7) LDFD f48 = [BOFFSET], 1 * SIZE mov f65 = f0 } ;; { .mfi adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mfi tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mfi adds L = -1, L } ;; { .mfi cmp.eq p6, p0 = -1, L } ;; { .mmf (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE } { .mfi mov ar.lc = L } ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE } { .mfb (p6) br.cond.dpnt .L148 } ;; .L142: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f65 = f33, f48, f65 // A2 * B1 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 (p5) adds C9 = 2 * SIZE, C1 } { .mmf nop __LINE__ (p3) LDFD f56 = [BOFFSET], 1 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 } ;; { .mfi (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 (p5) adds C10 = 2 * SIZE, C2 } { .mfb nop __LINE__ (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mmf (p4) LDFD f48 = [BOFFSET], 1 * SIZE nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 } ;; { .mfi (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE nop __LINE__ adds L = -1, L } { .mfb nop __LINE__ nop.f 0 br.cloop.sptk.few .L142 } ;; .L148: adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #ifdef LT LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET] adds BOFFSET = -2 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = -2 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f36 = [AOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f39, f40 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET], -15 * SIZE ;; FMPY f64 = f64, f32 ;; FNMA f65 = f64, f33, f65 ;; FNMA f66 = f64, f34, f66 ;; FNMA f67 = f64, f35, f67 ;; FMPY f65 = f65, f36 ;; FNMA f66 = f65, f37, f66 ;; FNMA f67 = f65, f38, f67 ;; FMPY f66 = f66, f39 ;; FNMA f67 = f66, f40, f67 ;; FMPY f67 = f67, f41 ;; STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f65, SIZE ;; STFD [BOFFSET] = f66, SIZE ;; STFD [BOFFSET] = f67, -3 * SIZE ;; #endif #ifdef RN LDFD f32 = [BOFFSET] ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 FMPY f66 = f66, f32 FMPY f67 = f67, f32 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, SIZE ;; STFD [AOFFSET] = f66, SIZE ;; STFD [AOFFSET] = f67, -3 * SIZE ;; #endif { .mmf STFD [C1 ] = f64, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE } ;; { .mmi STFD [C1 ] = f66, SIZE } ;; { .mmi STFD [C1 ] = f67, SIZE } ;; { .mmf mov f72 = f0 } ;; mov f65 = f0 mov f73 = f0 mov f66 = f0 mov f74 = f0 mov f67 = f0 mov f75 = f0 ;; { .mmi sub L = K, KK } ;; { .mmi shladd L = L, BASE_SHIFT, r0 } ;; { .mmi shladd AOFFSET = L, 2, AOFFSET } ;; { .mmi add BOFFSET = L, BOFFSET } ;; { .mmi #ifdef LT adds KK = 4, KK #else nop __LINE__ #endif } ;; .align 8 .L150: tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L160 ;; { .mib mov L = KK } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B } ;; { .mmf (p7) LDFD f48 = [BOFFSET], 1 * SIZE } 
;; { .mfi adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mfi tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mmf adds L = -1, L } ;; { .mmf cmp.eq p6, p0 = -1, L } ;; { .mib (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L158 } ;; .L152: { .mfi cmp.ne p4, p5 = 0, L FMA f64 = f32, f48, f64 // A1 * B1 (p12) cmp.ne p3, p0 = 0, L } ;; { .mmf (p3) LDFD f56 = [BOFFSET], 1 * SIZE (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } ;; { .mfi (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 adds L = -1, L } ;; { .mfb (p4) LDFD f48 = [BOFFSET], 1 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 br.cloop.sptk.few .L152 } ;; .L158: adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #ifdef LT LDFPD f32, f33 = [BOFFSET] ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 ;; #else LDFPD f32, f33 = [AOFFSET] ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f34 = [AOFFSET], - 3 * SIZE ;; FMPY f64 = f64, f32 ;; FNMA f65 = f64, f33, f65 ;; FMPY f65 = f65, f34 ;; STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f65, -SIZE ;; #endif #ifdef RN LDFD f32 = [BOFFSET] ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, - SIZE ;; #endif STFD [C1 ] = f64, SIZE ;; STFD [C1 ] = f65, SIZE ;; mov f64 = f0 mov f65 = f0 ;; sub L = K, KK ;; { .mmi shladd L = L, BASE_SHIFT, r0 } ;; { .mmi shladd AOFFSET = L, 1, AOFFSET } ;; { .mmi add BOFFSET = L, BOFFSET } ;; { .mmi #ifdef LT adds KK = 2, KK #else nop __LINE__ #endif } ;; .align 8 .L160: { .mib mov L = KK tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L169 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B } ;; { .mmi (p7) LDFD f48 = [BOFFSET], 1 * SIZE nop __LINE__ adds L = 1, L } ;; { .mii tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi cmp.eq p6, p0 = 0, L adds L = -1, L cmp.eq p3, p0 = r0, r0 } ;; { .mib (p7) LDFD f32 = [AOFFSET], 1 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L168 } ;; .align 8 .L162: { .mmf cmp.ne p4, p5 = 0, L (p12) cmp.ne p3, p0 = 0, L FMA f64 = f32, f48, f64 // A1 * B1 } ;; { .mmi (p3) LDFD f56 = [BOFFSET], 1 * SIZE (p3) LDFD f40 = [AOFFSET], 1 * SIZE nop __LINE__ } ;; { .mmi (p4) LDFD f32 = [AOFFSET], 1 * SIZE nop __LINE__ adds L = -1, L } { .mfb (p4) LDFD f48 = [BOFFSET], 1 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 br.cloop.sptk.few .L162 } ;; .align 8 .L168: #ifdef LT { .mmi LDFD f32 = [BOFFSET] LDFD f33 = [AOFFSET] nop __LINE__ } ;; #else { .mmi LDFD f32 = [AOFFSET] LDFD f33 = [BOFFSET] nop __LINE__ } ;; #endif { .mmf sub L = K, KK nop __LINE__ FSUB f64 = f32, f64 } ;; #ifdef LT adds KK = 1, KK #else nop __LINE__ #endif ;; mov L = KK ;; FMPY f64 = f64, f33 ;; #ifdef LT { .mmf STFD [BOFFSET] = f64 STFD [C1 ] = f64, SIZE mov f64 = f0 } ;; #else { .mmf STFD [AOFFSET] = f64 STFD [C1 ] = f64, SIZE mov f64 = f0 } ;; #endif shladd AOFFSET = L, BASE_SHIFT, AOFFSET shladd BOFFSET = L, BASE_SHIFT, BOFFSET ;; .align 8 .L169: { .mii mov B = BOFFSET #ifdef RN adds KK = 1, KK #else nop __LINE__ #endif mov AOFFSET = A } ;; .align 16 .L999: mov r8 = r0 adds r9 = 1 * 16, SP ;; ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 ;; mov ar.lc = ARLC ;; mov pr = PR, -1 ;; mov ar.pfs = ARPFS ;; br.ret.sptk.many b0 EPILOGUE 
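/* Reference sketch (illustrative only; the function and names below are not
   part of the kernel's interface).  In the LT solve blocks above, each row of
   the C tile is finished with an FMPY against the diagonal entry of the
   packed triangle and then removed from the rows below it with FNMA chains;
   the diagonal entries appear to be pre-inverted by the packing routines,
   which is why the kernel multiplies rather than divides.  Ignoring the
   packed layout and the register blocking, the operation is the forward
   substitution below, written left-looking for a plain row-major lower
   triangle:

       static void trsm_lt_ref(int m, int n,
                               const double *a,  // m*m row-major; a[i*m+i]
                                                 // holds the inverted diagonal
                               double *c, int ldc)
       {
           for (int j = 0; j < n; j++)
               for (int i = 0; i < m; i++) {
                   double t = c[i + j * ldc];
                   for (int k = 0; k < i; k++)         // FNMA chains
                       t -= a[i * m + k] * c[k + j * ldc];
                   c[i + j * ldc] = t * a[i * m + i];  // FMPY by inverted diagonal
               }
       }

   The LN blocks in the next file run the same substitution from the last row
   upward, and the RN/RT blocks apply the corresponding forward and backward
   substitutions across the columns of C against the triangle on the
   right-hand side. */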
OpenBLAS-0.2.20/kernel/ia64/trsm_kernel_RT.S000066400000000000000000010414561313527062700202600ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef DOUBLE #define PREFETCHSIZE (16 * 8) #else #define PREFETCHSIZE (32 * 4) #endif #ifndef LN #define CPREFETCHSIZE 8 #else #define CPREFETCHSIZE -8 #endif #define CPREFETCH lfetch.excl.nt1 #define M r32 #define N r33 #define K r34 #define A r36 #define B r37 #define C r38 #define LDC r39 #define I r15 #define J r16 #define AOFFSET r17 #define BOFFSET r18 #define TEMP r19 #define L r20 #define C1 r21 #define C2 r22 #define C3 r23 #define C4 r24 #define C5 r25 #define C6 r26 #define C7 r27 #define C8 r28 #define C9 loc0 #define C10 loc1 #define C11 loc2 #define C12 loc3 #define C13 loc4 #define C14 loc5 #define C15 loc6 #define C16 loc7 #define PREA r8 #define PREB r9 #define PREC r10 #define SP r12 #define ARLC r29 #define PR r30 #define ARPFS r31 #define ALPHA f8 #define AORIG loc8 #define KK loc9 #define KK8 loc10 #define OFFSET loc11 #define AOFFSET2 loc12 #define BOFFSET2 loc13 PROLOGUE .prologue PROFCODE { .mmi .save ar.pfs, ARPFS alloc ARPFS = ar.pfs, 8, 16, 0, 0 adds r14 = 16, SP mov ARLC = ar.lc } { .mmi adds r8 = -6 * 16, SP adds r9 = -5 * 16, SP adds SP = -6 * 16, SP } ;; { .mmi stf.spill [r8] = f16, 32 stf.spill [r9] = f17, 32 mov PR = pr } ;; { .mmi stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 nop __LINE__ } ;; { .mmi stf.spill [r8] = f20 stf.spill [r9] = f21 shladd LDC = LDC, BASE_SHIFT, r0 } ;; .body { .mmi ld8 OFFSET = [r14] mov AOFFSET = A } ;; #ifdef LN { .mmi setf.sig f32 = M setf.sig f33 = K shladd C = M, BASE_SHIFT, C } ;; {.mmf nop __LINE__ nop __LINE__ xmpy.l f32 = f32, f33 } ;; { .mmi getf.sig r2 = f32 ;; nop __LINE__ shladd A = r2, BASE_SHIFT, A } ;; #endif #ifdef RN sub KK = r0, OFFSET #endif #ifdef RT { .mmi setf.sig f32 = N setf.sig f33 = K nop __LINE__ } ;; { .mmi setf.sig f34 = LDC nop __LINE__ nop __LINE__ } ;; { .mmf nop __LINE__ nop __LINE__ xmpy.l f33 = f32, f33 } { .mmf nop __LINE__ sub KK = N, OFFSET xmpy.l f34 = f32, f34 } ;; { .mmi getf.sig r2 = f33 getf.sig r3 = f34 } ;; shladd B = r2, BASE_SHIFT, B add C = r3, C #endif ;; .L130: tbit.z p6, p0 = N, 0 (p6) br.cond.dpnt .L090 ;; #ifdef RT { .mmi nop __LINE__ shl r2 = K, BASE_SHIFT } ;; { .mmi sub B = B, r2 sub C = C, LDC nop __LINE__ } #endif ;; mov f64 = f0 mov f65 = f0 mov f66 = f0 mov f67 = f0 mov f68 = f0 mov f69 = f0 mov f70 = f0 mov f71 = f0 ;; { .mfi shr I = M, 3 } { .mfi mov C1 = C // coffset1 = c + 0 * ldc #ifdef LN add KK = M, OFFSET #elif defined LT mov KK = OFFSET #else nop __LINE__ #endif } ;; { .mmf cmp.eq p6, p7 = 0, I #if defined(LN) || defined(RT) mov AORIG = A #else mov AOFFSET = A #endif } ;; { .mfi #ifndef RT add C = C, LDC // coffset += 8 * ldc #else nop __LINE__ #endif #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif }{ .mfb (p6) br.cond.dpnt .L140 } ;; .align 16 .L132: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 3 + BASE_SHIFT } ;; #if defined(LT) || defined(RN) { .mmi (p7) LDFD f48 = [BOFFSET], 1 * SIZE nop __LINE__ nop __LINE__ } ;; #else { .mfi shladd BOFFSET = KK, BASE_SHIFT, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFD f48 = [BOFFSET], 1 * SIZE shladd AOFFSET = r3, 3, AORIG } ;; #endif (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE } ;; { .mmf (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE } { .mfi cmp.eq p3, p0 = r0, r0 } ;; { .mmf (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE } { .mfi adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mmf 
CPREFETCH [PREC] } { .mfi adds L = 1, L } ;; { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } ;; { .mfi adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET } ;; { .mfi tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mfi adds L = -1, L } ;; { .mfi mov ar.lc = L } ;; { .mfb cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L138 } ;; .align 16 .L133: { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET FMA f65 = f33, f48, f65 // A2 * B1 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 adds C9 = 4 * SIZE, C1 } { .mmf (p3) LDFD f56 = [BOFFSET], 1 * SIZE nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mmf (p4) LDFD f48 = [BOFFSET], 1 * SIZE nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } ;; { .mfi (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f70 = f46, f56, f70 // A7 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE nop __LINE__ br.cloop.sptk.few .L133 } ;; .L138: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -8, KK #else adds r2 = -1, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 3, AORIG add BOFFSET = r2, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f68 = f36, f68 FSUB f69 = f37, f69 FSUB f70 = f38, f70 FSUB f71 = f39, f71 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f68 = f36, f68 FSUB f69 = f37, f69 FSUB f70 = f38, f70 FSUB f71 = f39, f71 ;; #endif #ifdef LN adds AOFFSET = 62 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f35, f34 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f37, f36 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f39, f38 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f40 = [AOFFSET], -2 * SIZE ;; LDFPD f42, f41 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f44, f43 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f46, f45 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f48, f47 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET 
;; LDFPD f50, f49 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f52, f51 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFD f53 = [AOFFSET], -2 * SIZE ;; LDFPD f55, f54 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f57, f56 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; LDFPD f59, f58 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f61, f60 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; LDFD f16 = [AOFFSET], -2 * SIZE ;; LDFPD f18, f17 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFPD f20, f19 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFD f21 = [AOFFSET] ;; FMPY f71 = f71, f32 ;; FNMA f70 = f71, f33, f70 ;; FNMA f69 = f71, f34, f69 ;; FNMA f68 = f71, f35, f68 ;; FNMA f67 = f71, f36, f67 ;; FNMA f66 = f71, f37, f66 ;; FNMA f65 = f71, f38, f65 ;; FNMA f64 = f71, f39, f64 ;; FMPY f70 = f70, f40 ;; FNMA f69 = f70, f41, f69 ;; FNMA f68 = f70, f42, f68 ;; FNMA f67 = f70, f43, f67 ;; FNMA f66 = f70, f44, f66 ;; FNMA f65 = f70, f45, f65 ;; FNMA f64 = f70, f46, f64 ;; FMPY f69 = f69, f47 ;; FNMA f68 = f69, f48, f68 ;; FNMA f67 = f69, f49, f67 ;; FNMA f66 = f69, f50, f66 ;; FNMA f65 = f69, f51, f65 ;; FNMA f64 = f69, f52, f64 ;; FMPY f68 = f68, f53 ;; FNMA f67 = f68, f54, f67 ;; FNMA f66 = f68, f55, f66 ;; FNMA f65 = f68, f56, f65 ;; FNMA f64 = f68, f57, f64 ;; FMPY f67 = f67, f58 ;; FNMA f66 = f67, f59, f66 ;; FNMA f65 = f67, f60, f65 ;; FNMA f64 = f67, f61, f64 ;; FMPY f66 = f66, f16 ;; FNMA f65 = f66, f17, f65 ;; FNMA f64 = f66, f18, f64 ;; FMPY f65 = f65, f19 ;; FNMA f64 = f65, f20, f64 ;; FMPY f64 = f64, f21 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f68, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f69, SIZE ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f70, SIZE ;; STFD [BOFFSET] = f67, - 3 * SIZE STFD [BOFFSET2] = f71, - 3 * SIZE ;; adds C1 = -8 * SIZE, C1 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f40 = [AOFFSET], 1 * SIZE ;; LDFPD f41, f42 = [AOFFSET], 2 * SIZE ;; LDFPD f43, f44 = [AOFFSET], 2 * SIZE ;; LDFPD f45, f46 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f47, f48 = [AOFFSET], 2 * SIZE ;; LDFPD f49, f50 = [AOFFSET], 2 * SIZE ;; LDFPD f51, f52 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f53 = [AOFFSET], 1 * SIZE ;; LDFPD f54, f55 = [AOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [AOFFSET] adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f58, f59 = [AOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [AOFFSET] adds AOFFSET = 7 * SIZE, AOFFSET ;; LDFD f16 = [AOFFSET], 1 * SIZE ;; LDFPD f17, f18 = [AOFFSET] adds AOFFSET = 8 * SIZE, AOFFSET ;; LDFPD f19, f20 = [AOFFSET] adds AOFFSET = 9 * SIZE, AOFFSET ;; LDFD f21 = [AOFFSET] adds AOFFSET = -63 * SIZE, AOFFSET ;; FMPY f64 = f64, f32 ;; FNMA f65 = f64, f33, f65 ;; FNMA f66 = f64, f34, f66 ;; FNMA f67 = f64, f35, f67 ;; FNMA f68 = f64, f36, f68 ;; FNMA f69 = f64, f37, f69 ;; FNMA f70 = f64, f38, f70 ;; FNMA f71 = f64, f39, f71 ;; FMPY f65 = f65, f40 ;; FNMA f66 = f65, f41, f66 ;; FNMA f67 = f65, f42, f67 ;; FNMA f68 = f65, f43, f68 ;; FNMA f69 = f65, f44, f69 ;; FNMA f70 = f65, f45, f70 ;; FNMA f71 = f65, f46, f71 ;; FMPY f66 = f66, f47 ;; FNMA f67 = f66, f48, f67 ;; FNMA f68 = f66, f49, f68 ;; FNMA f69 = f66, f50, f69 ;; FNMA f70 = f66, f51, f70 ;; FNMA f71 = f66, f52, f71 ;; FMPY f67 = f67, f53 ;; FNMA f68 = f67, f54, f68 ;; FNMA f69 = f67, f55, f69 ;; FNMA f70 = f67, f56, f70 ;; FNMA f71 = f67, f57, f71 ;; FMPY f68 = 
f68, f58 ;; FNMA f69 = f68, f59, f69 ;; FNMA f70 = f68, f60, f70 ;; FNMA f71 = f68, f61, f71 ;; FMPY f69 = f69, f16 ;; FNMA f70 = f69, f17, f70 ;; FNMA f71 = f69, f18, f71 ;; FMPY f70 = f70, f19 ;; FNMA f71 = f70, f20, f71 ;; FMPY f71 = f71, f21 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f68, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f69, SIZE ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f70, SIZE ;; STFD [BOFFSET] = f67, -3 * SIZE STFD [BOFFSET2] = f71, -3 * SIZE ;; adds C9 = 4 * SIZE, C1 ;; #endif #ifdef RN LDFD f32 = [BOFFSET] ;; FMPY f64 = f64, f32 FMPY f68 = f68, f32 FMPY f65 = f65, f32 FMPY f69 = f69, f32 FMPY f66 = f66, f32 FMPY f70 = f70, f32 FMPY f67 = f67, f32 FMPY f71 = f71, f32 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f68, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f69, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f70, SIZE ;; STFD [AOFFSET] = f67, -3 * SIZE STFD [AOFFSET2] = f71, -3 * SIZE ;; #endif #ifdef RT LDFD f32 = [BOFFSET] ;; FMPY f64 = f64, f32 FMPY f68 = f68, f32 FMPY f65 = f65, f32 FMPY f69 = f69, f32 FMPY f66 = f66, f32 FMPY f70 = f70, f32 FMPY f67 = f67, f32 FMPY f71 = f71, f32 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f68, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f69, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f70, SIZE ;; STFD [AOFFSET] = f67, -3 * SIZE STFD [AOFFSET2] = f71, -3 * SIZE ;; #endif adds C9 = 4 * SIZE, C1 ;; { .mmf STFD [C1 ] = f64, SIZE STFD [C9 ] = f68, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE STFD [C9 ] = f69, SIZE } ;; { .mmi STFD [C1 ] = f66, SIZE STFD [C9 ] = f70, SIZE } ;; { .mmi #ifndef LN STFD [C1 ] = f67, 5 * SIZE #else STFD [C1 ] = f67, - 3 * SIZE #endif STFD [C9 ] = f71 } ;; { .mmf cmp.ne p6, p0 = 1, I } ;; adds I = -1, I ;; { .mmi shladd r2 = K, BASE_SHIFT, r0 } ;; { .mmi sub L = K, KK } ;; { .mmi #ifdef RT shladd AORIG = r2, 3, AORIG #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; ;; { .mmi #if defined(LT) || defined(RN) shladd AOFFSET = L, 3, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) add BOFFSET = L, BOFFSET #else nop __LINE__ #endif } ;; { .mmi #ifdef LT adds KK = 8, KK #elif defined LN adds KK = -8, KK #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; mov f64 = f0 mov f65 = f0 mov f66 = f0 mov f67 = f0 mov f68 = f0 mov f69 = f0 mov f70 = f0 mov f71 = f0 (p6) br.cond.dptk .L132 .align 8 .L140: tbit.z p6, p7 = M, 2 (p6) br.cond.dptk .L150 ;; { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 2 + BASE_SHIFT } ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFD f48 = [BOFFSET], 1 * SIZE mov f65 = f0 } ;; #else { .mfi shladd BOFFSET = KK, BASE_SHIFT, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFD f48 = [BOFFSET], 1 * SIZE shladd AOFFSET = r3, 2, AORIG } ;; #endif { .mfi adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mfi tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mfi adds L = -1, L } ;; { .mfi cmp.eq p6, p0 = -1, L } ;; { .mmf (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE } { .mfi mov ar.lc = L } ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE } { .mfb (p6) br.cond.dpnt .L148 } ;; .L142: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop 
__LINE__ FMA f65 = f33, f48, f65 // A2 * B1 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 (p5) adds C9 = 2 * SIZE, C1 } { .mmf nop __LINE__ (p3) LDFD f56 = [BOFFSET], 1 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 } ;; { .mfi (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 (p5) adds C10 = 2 * SIZE, C2 } { .mfb nop __LINE__ (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mmf (p4) LDFD f48 = [BOFFSET], 1 * SIZE nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 } ;; { .mfi (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE nop __LINE__ adds L = -1, L } { .mfb nop __LINE__ nop.f 0 br.cloop.sptk.few .L142 } ;; .L148: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -4, KK #else adds r2 = -1, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 2, AORIG add BOFFSET = r2, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET] adds BOFFSET = -2 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = -2 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 ;; #endif #ifdef LN adds AOFFSET = 14 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f35, f34 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f36 = [AOFFSET], - 2 * SIZE ;; LDFPD f38, f37 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f40, f39 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET] ;; FMPY f67 = f67, f32 ;; FNMA f66 = f67, f33, f66 ;; FNMA f65 = f67, f34, f65 ;; FNMA f64 = f67, f35, f64 ;; FMPY f66 = f66, f36 ;; FNMA f65 = f66, f37, f65 ;; FNMA f64 = f66, f38, f64 ;; FMPY f65 = f65, f39 ;; FNMA f64 = f65, f40, f64 ;; FMPY f64 = f64, f41 ;; STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f65, SIZE ;; STFD [BOFFSET] = f66, SIZE ;; STFD [BOFFSET] = f67, -3 * SIZE ;; adds C1 = -4 * SIZE, C1 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f36 = [AOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f39, f40 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET], -15 * SIZE ;; FMPY f64 = f64, f32 ;; FNMA f65 = f64, f33, f65 ;; FNMA f66 = f64, f34, f66 ;; FNMA f67 = f64, f35, f67 ;; FMPY f65 = f65, f36 ;; FNMA f66 = f65, f37, f66 ;; FNMA f67 = f65, f38, f67 ;; FMPY f66 = f66, f39 ;; FNMA f67 = f66, f40, f67 ;; FMPY f67 = f67, f41 ;; STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f65, SIZE ;; STFD [BOFFSET] = f66, SIZE ;; STFD [BOFFSET] = f67, -3 * SIZE ;; #endif #ifdef RN LDFD f32 = [BOFFSET] ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 FMPY f66 = f66, f32 FMPY f67 = f67, f32 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, SIZE ;; STFD [AOFFSET] = f66, SIZE ;; STFD [AOFFSET] = f67, -3 * SIZE ;; #endif #ifdef RT LDFD f32 = [BOFFSET] ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 FMPY f66 = f66, f32 FMPY f67 = f67, f32 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, SIZE ;; STFD [AOFFSET] = f66, SIZE ;; STFD [AOFFSET] = f67, - 3 * SIZE ;; #endif { .mmf STFD [C1 ] = f64, SIZE mov f64 = f0 } ;; { .mmi STFD 
[C1 ] = f65, SIZE } ;; { .mmi STFD [C1 ] = f66, SIZE } ;; { .mmi #ifndef LN STFD [C1 ] = f67, SIZE #else STFD [C1 ] = f67, - 3 * SIZE #endif } ;; { .mmf mov f72 = f0 } ;; mov f65 = f0 mov f73 = f0 mov f66 = f0 mov f74 = f0 mov f67 = f0 mov f75 = f0 ;; shladd r2 = K, BASE_SHIFT, r0 ;; { .mmi sub L = K, KK } ;; { .mmi #ifdef RT shladd AORIG = r2, 2, AORIG #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd AOFFSET = L, 2, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) add BOFFSET = L, BOFFSET #else nop __LINE__ #endif } ;; { .mmi #ifdef LT adds KK = 4, KK #elif defined LN adds KK = -4, KK #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; .align 8 .L150: tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L160 ;; { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 1 + BASE_SHIFT } ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFD f48 = [BOFFSET], 1 * SIZE } ;; #else { .mfi shladd BOFFSET = KK, BASE_SHIFT, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFD f48 = [BOFFSET], 1 * SIZE shladd AOFFSET = r3, 1, AORIG } ;; #endif { .mfi adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mfi tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mmf adds L = -1, L } ;; { .mmf cmp.eq p6, p0 = -1, L } ;; { .mib (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L158 } ;; .L152: { .mfi cmp.ne p4, p5 = 0, L FMA f64 = f32, f48, f64 // A1 * B1 (p12) cmp.ne p3, p0 = 0, L } ;; { .mmf (p3) LDFD f56 = [BOFFSET], 1 * SIZE (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } ;; { .mfi (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 adds L = -1, L } ;; { .mfb (p4) LDFD f48 = [BOFFSET], 1 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 br.cloop.sptk.few .L152 } ;; .L158: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -2, KK #else adds r2 = -1, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 1, AORIG add BOFFSET = r2, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET] ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 ;; #else LDFPD f32, f33 = [AOFFSET] ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 ;; #endif #ifdef LN adds AOFFSET = 2 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f34 = [AOFFSET] ;; FMPY f65 = f65, f32 ;; FNMA f64 = f65, f33, f64 ;; FMPY f64 = f64, f34 ;; STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f65, - SIZE ;; adds C1 = -2 * SIZE, C1 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f34 = [AOFFSET], - 3 * SIZE ;; FMPY f64 = f64, f32 ;; FNMA f65 = f64, f33, f65 ;; FMPY f65 = f65, f34 ;; STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f65, -SIZE ;; #endif #ifdef RN LDFD f32 = [BOFFSET] ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, - SIZE ;; #endif #ifdef RT LDFD f32 = [BOFFSET] ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, - SIZE ;; #endif STFD [C1 ] = f64, SIZE ;; #ifndef LN STFD [C1 ] = f65, SIZE #else STFD [C1 ] = f65, -SIZE #endif ;; mov f64 = f0 mov f65 = f0 
;; shladd r2 = K, BASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 1, AORIG #else nop __LINE__ #endif ;; { .mmi #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd AOFFSET = L, 1, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) add BOFFSET = L, BOFFSET #else nop __LINE__ #endif } ;; { .mmi #ifdef LT adds KK = 2, KK #elif defined LN adds KK = -2, KK #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; .align 8 .L160: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L169 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 0 + BASE_SHIFT } ;; #if defined(LT) || defined(RN) { .mmi (p7) LDFD f48 = [BOFFSET], 1 * SIZE nop __LINE__ adds L = 1, L } ;; #else { .mmi shladd BOFFSET = KK, BASE_SHIFT, B nop __LINE__ #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mmi (p7) LDFD f48 = [BOFFSET], 1 * SIZE adds L = 1, L add AOFFSET = r3, AORIG } ;; #endif ;; { .mii tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi cmp.eq p6, p0 = 0, L adds L = -1, L cmp.eq p3, p0 = r0, r0 } ;; { .mib (p7) LDFD f32 = [AOFFSET], 1 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L168 } ;; .align 8 .L162: { .mmf cmp.ne p4, p5 = 0, L (p12) cmp.ne p3, p0 = 0, L FMA f64 = f32, f48, f64 // A1 * B1 } ;; { .mmi (p3) LDFD f56 = [BOFFSET], 1 * SIZE (p3) LDFD f40 = [AOFFSET], 1 * SIZE nop __LINE__ } ;; { .mmi (p4) LDFD f32 = [AOFFSET], 1 * SIZE nop __LINE__ adds L = -1, L } { .mfb (p4) LDFD f48 = [BOFFSET], 1 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 br.cloop.sptk.few .L162 } ;; .align 8 .L168: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -1, KK #else adds r2 = -1, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; add AOFFSET = r2, AORIG add BOFFSET = r2, B ;; #endif #if defined(LN) || defined(LT) { .mmi LDFD f32 = [BOFFSET] LDFD f33 = [AOFFSET] #ifdef LN adds C1 = -1 * SIZE, C1 #else nop __LINE__ #endif } ;; #else { .mmi LDFD f32 = [AOFFSET] LDFD f33 = [BOFFSET] nop __LINE__ } ;; #endif { .mmf sub L = K, KK #ifdef RT shladd AORIG = K, BASE_SHIFT, AORIG #else nop __LINE__ #endif FSUB f64 = f32, f64 } ;; #ifdef LT adds KK = 1, KK #elif defined LN adds KK = -1, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; FMPY f64 = f64, f33 ;; #if defined(LN) || defined(LT) { .mmf STFD [BOFFSET] = f64 #ifndef LN STFD [C1 ] = f64, SIZE #else STFD [C1 ] = f64 #endif mov f64 = f0 } ;; #else { .mmf STFD [AOFFSET] = f64 STFD [C1 ] = f64, SIZE mov f64 = f0 } ;; #endif #if defined(LT) || defined(RN) shladd AOFFSET = L, BASE_SHIFT, AOFFSET #else nop __LINE__ #endif #if defined(LT) || defined(RN) shladd BOFFSET = L, BASE_SHIFT, BOFFSET #else nop __LINE__ #endif ;; .align 8 .L169: { .mii #ifdef LN shladd B = K, BASE_SHIFT, B #elif defined(LT) || defined(RN) mov B = BOFFSET #else nop __LINE__ #endif #ifdef RN adds KK = 1, KK #elif defined RT adds KK = -1, KK #else nop __LINE__ #endif mov AOFFSET = A } ;; .align 16 .L090: tbit.z p6, p0 = N, 1 (p6) br.cond.dpnt .L050 ;; #ifdef RT { .mmi shladd r3 = LDC, 1, r0 nop __LINE__ shl r2 = K, 1 + BASE_SHIFT } ;; { .mmi sub B = B, r2 sub C = C, r3 nop __LINE__ } #endif ;; mov f64 = f0 mov f65 = f0 mov f66 = f0 mov f67 = f0 mov f72 = f0 mov f73 = f0 mov f74 = f0 mov f75 = f0 ;; { .mfi shr I = M, 3 } { .mfi mov C1 = C // coffset1 = c + 0 * ldc #ifdef LN add KK = M, OFFSET #elif defined 
LT mov KK = OFFSET #else nop __LINE__ #endif } ;; { .mmf cmp.eq p6, p7 = 0, I #if defined(LN) || defined(RT) mov AORIG = A #else mov AOFFSET = A #endif } { .mmf add C2 = LDC, C // coffset2 = c + 1 * ldc } ;; { .mfi #ifndef RT shladd C = LDC, 1, C // coffset += 8 * ldc #else nop __LINE__ #endif mov f81 = f0 #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif }{ .mfb (p6) br.cond.dpnt .L100 } ;; .align 16 .L092: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 3 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE nop __LINE__ nop __LINE__ } ;; #else { .mfi shladd BOFFSET = r3, 1, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE shladd AOFFSET = r3, 3, AORIG } ;; #endif (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE } ;; { .mmf (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE } { .mfi cmp.eq p3, p0 = r0, r0 } ;; { .mmf (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE } { .mfi adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mmf CPREFETCH [PREC], LDC } { .mfi adds L = 1, L } ;; { .mmf CPREFETCH [PREC] } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } ;; { .mfi adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET } ;; { .mfi tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mfi adds L = -1, L } ;; { .mfi mov ar.lc = L } ;; mov f68 = f0 mov f69 = f0 mov f70 = f0 mov f71 = f0 mov f76 = f0 mov f77 = f0 mov f78 = f0 mov f79 = f0 ;; { .mfb cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L098 } ;; .align 8 .L093: /* 1 */ { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 adds C9 = 4 * SIZE, C1 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 adds C10 = 4 * SIZE, C2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 adds C11 = 4 * SIZE, C3 } { .mfi nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 adds C12 = 4 * SIZE, C4 } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f76 = f36, f49, f76 // A5 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = f37, f49, f77 // A6 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f78 = f38, f49, f78 // A7 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f79 = f39, f49, f79 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } 
{ .mfb nop __LINE__ (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f76 = f44, f57, f76 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f77 = f45, f57, f77 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f78 = f46, f57, f78 // A7 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f71 = f47, f56, f71 // A8 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f79 = f47, f57, f79 // A8 * B2 br.cloop.sptk.few .L093 } ;; .align 8 .L098: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -8, KK #else adds r2 = -2, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 3, AORIG shladd BOFFSET = r2, 1, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [BOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [BOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [BOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [BOFFSET] adds BOFFSET = -14 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f65 = f34, f65 FSUB f73 = f35, f73 FSUB f66 = f36, f66 FSUB f74 = f37, f74 FSUB f67 = f38, f67 FSUB f75 = f39, f75 FSUB f68 = f40, f68 FSUB f76 = f41, f76 FSUB f69 = f42, f69 FSUB f77 = f43, f77 FSUB f70 = f44, f70 FSUB f78 = f45, f78 FSUB f71 = f46, f71 FSUB f79 = f47, f79 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [AOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [AOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [AOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [AOFFSET] adds AOFFSET = -14 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f68 = f36, f68 FSUB f69 = f37, f69 FSUB f70 = f38, f70 FSUB f71 = f39, f71 ;; FSUB f72 = f40, f72 FSUB f73 = f41, f73 FSUB f74 = f42, f74 FSUB f75 = f43, f75 FSUB f76 = f44, f76 FSUB f77 = f45, f77 FSUB f78 = f46, f78 FSUB f79 = f47, f79 ;; #endif #ifdef LN adds AOFFSET = 62 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f35, f34 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f37, f36 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f39, f38 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f40 = [AOFFSET], -2 * SIZE ;; LDFPD f42, f41 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f44, f43 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f46, f45 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f48, f47 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f50, f49 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f52, f51 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFD f53 = [AOFFSET], -2 * SIZE ;; LDFPD f55, f54 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f57, f56 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; LDFPD f59, f58 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f61, f60 = [AOFFSET] adds AOFFSET = - 6 
* SIZE, AOFFSET ;; LDFD f16 = [AOFFSET], -2 * SIZE ;; LDFPD f18, f17 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFPD f20, f19 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFD f21 = [AOFFSET] ;; FMPY f71 = f71, f32 FMPY f79 = f79, f32 ;; FNMA f70 = f71, f33, f70 FNMA f78 = f79, f33, f78 ;; FNMA f69 = f71, f34, f69 FNMA f77 = f79, f34, f77 ;; FNMA f68 = f71, f35, f68 FNMA f76 = f79, f35, f76 ;; FNMA f67 = f71, f36, f67 FNMA f75 = f79, f36, f75 ;; FNMA f66 = f71, f37, f66 FNMA f74 = f79, f37, f74 ;; FNMA f65 = f71, f38, f65 FNMA f73 = f79, f38, f73 ;; FNMA f64 = f71, f39, f64 FNMA f72 = f79, f39, f72 ;; FMPY f70 = f70, f40 FMPY f78 = f78, f40 ;; FNMA f69 = f70, f41, f69 FNMA f77 = f78, f41, f77 ;; FNMA f68 = f70, f42, f68 FNMA f76 = f78, f42, f76 ;; FNMA f67 = f70, f43, f67 FNMA f75 = f78, f43, f75 ;; FNMA f66 = f70, f44, f66 FNMA f74 = f78, f44, f74 ;; FNMA f65 = f70, f45, f65 FNMA f73 = f78, f45, f73 ;; FNMA f64 = f70, f46, f64 FNMA f72 = f78, f46, f72 ;; FMPY f69 = f69, f47 FMPY f77 = f77, f47 ;; FNMA f68 = f69, f48, f68 FNMA f76 = f77, f48, f76 ;; FNMA f67 = f69, f49, f67 FNMA f75 = f77, f49, f75 ;; FNMA f66 = f69, f50, f66 FNMA f74 = f77, f50, f74 ;; FNMA f65 = f69, f51, f65 FNMA f73 = f77, f51, f73 ;; FNMA f64 = f69, f52, f64 FNMA f72 = f77, f52, f72 ;; FMPY f68 = f68, f53 FMPY f76 = f76, f53 ;; FNMA f67 = f68, f54, f67 FNMA f75 = f76, f54, f75 ;; FNMA f66 = f68, f55, f66 FNMA f74 = f76, f55, f74 ;; FNMA f65 = f68, f56, f65 FNMA f73 = f76, f56, f73 ;; FNMA f64 = f68, f57, f64 FNMA f72 = f76, f57, f72 ;; FMPY f67 = f67, f58 FMPY f75 = f75, f58 ;; FNMA f66 = f67, f59, f66 FNMA f74 = f75, f59, f74 ;; FNMA f65 = f67, f60, f65 FNMA f73 = f75, f60, f73 ;; FNMA f64 = f67, f61, f64 FNMA f72 = f75, f61, f72 ;; FMPY f66 = f66, f16 FMPY f74 = f74, f16 ;; FNMA f65 = f66, f17, f65 FNMA f73 = f74, f17, f73 ;; FNMA f64 = f66, f18, f64 FNMA f72 = f74, f18, f72 ;; FMPY f65 = f65, f19 FMPY f73 = f73, f19 ;; FNMA f64 = f65, f20, f64 FNMA f72 = f73, f20, f72 ;; FMPY f64 = f64, f21 FMPY f72 = f72, f21 ;; adds BOFFSET = 8 * SIZE, BOFFSET adds BOFFSET2 = 8 * SIZE, BOFFSET2 ;; STFD [BOFFSET] = f68, SIZE STFD [BOFFSET2] = f70, SIZE ;; STFD [BOFFSET] = f76, SIZE STFD [BOFFSET2] = f78, SIZE ;; STFD [BOFFSET] = f69, SIZE STFD [BOFFSET2] = f71, SIZE ;; STFD [BOFFSET] = f77, - 11 * SIZE STFD [BOFFSET2] = f79, - 11 * SIZE ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f66, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f74, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f73, - 3 * SIZE STFD [BOFFSET2] = f75, - 3 * SIZE ;; adds C1 = -8 * SIZE, C1 adds C2 = -8 * SIZE, C2 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f40 = [AOFFSET], 1 * SIZE ;; LDFPD f41, f42 = [AOFFSET], 2 * SIZE ;; LDFPD f43, f44 = [AOFFSET], 2 * SIZE ;; LDFPD f45, f46 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f47, f48 = [AOFFSET], 2 * SIZE ;; LDFPD f49, f50 = [AOFFSET], 2 * SIZE ;; LDFPD f51, f52 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f53 = [AOFFSET], 1 * SIZE ;; LDFPD f54, f55 = [AOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [AOFFSET] adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f58, f59 = [AOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [AOFFSET] adds AOFFSET = 7 * SIZE, AOFFSET ;; LDFD f16 = [AOFFSET], 1 * SIZE ;; LDFPD f17, f18 = [AOFFSET] adds AOFFSET = 8 * SIZE, AOFFSET ;; LDFPD f19, f20 = [AOFFSET] adds AOFFSET = 9 * 
SIZE, AOFFSET ;; LDFD f21 = [AOFFSET] adds AOFFSET = -63 * SIZE, AOFFSET ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 ;; FNMA f65 = f64, f33, f65 FNMA f73 = f72, f33, f73 ;; FNMA f66 = f64, f34, f66 FNMA f74 = f72, f34, f74 ;; FNMA f67 = f64, f35, f67 FNMA f75 = f72, f35, f75 ;; FNMA f68 = f64, f36, f68 FNMA f76 = f72, f36, f76 ;; FNMA f69 = f64, f37, f69 FNMA f77 = f72, f37, f77 ;; FNMA f70 = f64, f38, f70 FNMA f78 = f72, f38, f78 ;; FNMA f71 = f64, f39, f71 FNMA f79 = f72, f39, f79 ;; FMPY f65 = f65, f40 FMPY f73 = f73, f40 ;; FNMA f66 = f65, f41, f66 FNMA f74 = f73, f41, f74 ;; FNMA f67 = f65, f42, f67 FNMA f75 = f73, f42, f75 ;; FNMA f68 = f65, f43, f68 FNMA f76 = f73, f43, f76 ;; FNMA f69 = f65, f44, f69 FNMA f77 = f73, f44, f77 ;; FNMA f70 = f65, f45, f70 FNMA f78 = f73, f45, f78 ;; FNMA f71 = f65, f46, f71 FNMA f79 = f73, f46, f79 ;; FMPY f66 = f66, f47 FMPY f74 = f74, f47 ;; FNMA f67 = f66, f48, f67 FNMA f75 = f74, f48, f75 ;; FNMA f68 = f66, f49, f68 FNMA f76 = f74, f49, f76 ;; FNMA f69 = f66, f50, f69 FNMA f77 = f74, f50, f77 ;; FNMA f70 = f66, f51, f70 FNMA f78 = f74, f51, f78 ;; FNMA f71 = f66, f52, f71 FNMA f79 = f74, f52, f79 ;; FMPY f67 = f67, f53 FMPY f75 = f75, f53 ;; FNMA f68 = f67, f54, f68 FNMA f76 = f75, f54, f76 ;; FNMA f69 = f67, f55, f69 FNMA f77 = f75, f55, f77 ;; FNMA f70 = f67, f56, f70 FNMA f78 = f75, f56, f78 ;; FNMA f71 = f67, f57, f71 FNMA f79 = f75, f57, f79 ;; FMPY f68 = f68, f58 FMPY f76 = f76, f58 ;; FNMA f69 = f68, f59, f69 FNMA f77 = f76, f59, f77 ;; FNMA f70 = f68, f60, f70 FNMA f78 = f76, f60, f78 ;; FNMA f71 = f68, f61, f71 FNMA f79 = f76, f61, f79 ;; FMPY f69 = f69, f16 FMPY f77 = f77, f16 ;; FNMA f70 = f69, f17, f70 FNMA f78 = f77, f17, f78 ;; FNMA f71 = f69, f18, f71 FNMA f79 = f77, f18, f79 ;; FMPY f70 = f70, f19 FMPY f78 = f78, f19 ;; FNMA f71 = f70, f20, f71 FNMA f79 = f78, f20, f79 ;; FMPY f71 = f71, f21 FMPY f79 = f79, f21 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f66, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f74, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f73, 5 * SIZE STFD [BOFFSET2] = f75, 5 * SIZE ;; STFD [BOFFSET] = f68, SIZE STFD [BOFFSET2] = f70, SIZE ;; STFD [BOFFSET] = f76, SIZE STFD [BOFFSET2] = f78, SIZE ;; STFD [BOFFSET] = f69, SIZE STFD [BOFFSET2] = f71, SIZE ;; STFD [BOFFSET] = f77, -11 * SIZE STFD [BOFFSET2] = f79, -11 * SIZE ;; adds C9 = 4 * SIZE, C1 ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET], -3 * SIZE ;; FMPY f64 = f64, f32 FMPY f68 = f68, f32 FMPY f65 = f65, f32 FMPY f69 = f69, f32 FMPY f66 = f66, f32 FMPY f70 = f70, f32 FMPY f67 = f67, f32 FMPY f71 = f71, f32 ;; FNMA f72 = f64, f33, f72 FNMA f76 = f68, f33, f76 FNMA f73 = f65, f33, f73 FNMA f77 = f69, f33, f77 FNMA f74 = f66, f33, f74 FNMA f78 = f70, f33, f78 FNMA f75 = f67, f33, f75 FNMA f79 = f71, f33, f79 ;; FMPY f72 = f72, f34 FMPY f76 = f76, f34 FMPY f73 = f73, f34 FMPY f77 = f77, f34 FMPY f74 = f74, f34 FMPY f78 = f78, f34 FMPY f75 = f75, f34 FMPY f79 = f79, f34 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f68, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f69, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f70, SIZE ;; STFD [AOFFSET] = f67, 5 * SIZE STFD [AOFFSET2] = f71, 5 * SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f76, SIZE ;; STFD [AOFFSET] = f73, SIZE STFD [AOFFSET2] = f77, SIZE ;; STFD [AOFFSET] = f74, SIZE STFD [AOFFSET2] = f78, SIZE ;; STFD [AOFFSET] = f75, -11 * SIZE STFD [AOFFSET2] = f79, -11 * SIZE ;; #endif 
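// RT solve for the same 8x2 tile, mirroring the RN block above: the second
// set of accumulators (f72..f79, the C2 column) is scaled by the diagonal
// entry of B first, eliminated from the first column with FNMA, and the
// first column is then scaled in turn before both are stored back.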
#ifdef RT adds BOFFSET = 2 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET] ;; FMPY f72 = f72, f32 FMPY f76 = f76, f32 FMPY f73 = f73, f32 FMPY f77 = f77, f32 FMPY f74 = f74, f32 FMPY f78 = f78, f32 FMPY f75 = f75, f32 FMPY f79 = f79, f32 ;; FNMA f64 = f72, f33, f64 FNMA f68 = f76, f33, f68 FNMA f65 = f73, f33, f65 FNMA f69 = f77, f33, f69 FNMA f66 = f74, f33, f66 FNMA f70 = f78, f33, f70 FNMA f67 = f75, f33, f67 FNMA f71 = f79, f33, f71 ;; FMPY f64 = f64, f34 FMPY f68 = f68, f34 FMPY f65 = f65, f34 FMPY f69 = f69, f34 FMPY f66 = f66, f34 FMPY f70 = f70, f34 FMPY f67 = f67, f34 FMPY f71 = f71, f34 ;; adds AOFFSET = 8 * SIZE, AOFFSET adds AOFFSET2 = 8 * SIZE, AOFFSET2 ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f76, SIZE ;; STFD [AOFFSET] = f73, SIZE STFD [AOFFSET2] = f77, SIZE ;; STFD [AOFFSET] = f74, SIZE STFD [AOFFSET2] = f78, SIZE ;; STFD [AOFFSET] = f75, - 11 * SIZE STFD [AOFFSET2] = f79, - 11 * SIZE ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f68, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f69, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f70, SIZE ;; STFD [AOFFSET] = f67, - 3 * SIZE STFD [AOFFSET2] = f71, - 3 * SIZE ;; #endif adds C9 = 4 * SIZE, C1 ;; { .mmf STFD [C1 ] = f64, SIZE STFD [C9 ] = f68, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE STFD [C9 ] = f69, SIZE adds C10 = 4 * SIZE, C2 } ;; { .mmi STFD [C1 ] = f66, SIZE STFD [C9 ] = f70, SIZE } ;; { .mmi #ifndef LN STFD [C1 ] = f67, 5 * SIZE #else STFD [C1 ] = f67, - 3 * SIZE #endif STFD [C9 ] = f71 adds C11 = 4 * SIZE, C3 } ;; { .mmf STFD [C2 ] = f72, SIZE STFD [C10] = f76, SIZE mov f72 = f0 } ;; { .mmi STFD [C2 ] = f73, SIZE STFD [C10] = f77, SIZE } ;; { .mmi STFD [C2 ] = f74, SIZE STFD [C10] = f78, SIZE adds C12 = 4 * SIZE, C4 } ;; { .mmi #ifndef LN STFD [C2 ] = f75, 5 * SIZE #else STFD [C2 ] = f75, - 3 * SIZE #endif STFD [C10] = f79 } ;; { .mmf cmp.ne p6, p0 = 1, I } ;; adds I = -1, I ;; { .mmi shladd r2 = K, BASE_SHIFT, r0 } ;; { .mmi sub L = K, KK } ;; { .mmi #ifdef RT shladd AORIG = r2, 3, AORIG #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; ;; { .mmi #if defined(LT) || defined(RN) shladd AOFFSET = L, 3, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd BOFFSET = L, 1, BOFFSET #else nop __LINE__ #endif } ;; { .mmi #ifdef LT adds KK = 8, KK #elif defined LN adds KK = -8, KK #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; mov f64 = f0 mov f65 = f0 mov f66 = f0 mov f67 = f0 mov f72 = f0 mov f73 = f0 mov f74 = f0 mov f75 = f0 (p6) br.cond.dptk .L092 ;; .align 8 .L100: tbit.z p6, p7 = M, 2 (p6) br.cond.dptk .L110 ;; { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 2 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f65 = f0 } ;; #else { .mfi shladd BOFFSET = r3, 1, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE shladd AOFFSET = r3, 2, AORIG } ;; #endif { .mfi adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mfi tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mfi adds L = -1, L } ;; { .mfi cmp.eq p6, p0 = -1, L } ;; { .mmf (p7) 
LDFPD f32, f33 = [AOFFSET], 2 * SIZE } { .mfi mov ar.lc = L } ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE } { .mfb (p6) br.cond.dpnt .L108 } ;; .L102: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 adds C9 = 2 * SIZE, C1 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 adds C10 = 2 * SIZE, C2 } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 br.cloop.sptk.few .L102 } ;; .align 8 .L108: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -4, KK #else adds r2 = -2, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 1, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 ;; FSUB f65 = f34, f65 FSUB f73 = f35, f73 ;; FSUB f66 = f36, f66 FSUB f74 = f37, f74 ;; FSUB f67 = f38, f67 FSUB f75 = f39, f75 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f72 = f36, f72 FSUB f73 = f37, f73 FSUB f74 = f38, f74 FSUB f75 = f39, f75 ;; #endif #ifdef LN adds AOFFSET = 14 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f35, f34 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f36 = [AOFFSET], - 2 * SIZE ;; LDFPD f38, f37 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f40, f39 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET] ;; FMPY f67 = f67, f32 FMPY f75 = f75, f32 ;; FNMA f66 = f67, f33, f66 FNMA f74 = f75, f33, f74 ;; FNMA f65 = f67, f34, f65 FNMA f73 = f75, f34, f73 ;; FNMA f64 = f67, f35, f64 FNMA f72 = f75, f35, f72 ;; FMPY f66 = f66, f36 FMPY f74 = f74, f36 ;; FNMA f65 = f66, f37, f65 FNMA f73 = f74, f37, f73 ;; FNMA f64 = f66, f38, f64 FNMA f72 = f74, f38, f72 ;; FMPY f65 = f65, f39 FMPY f73 = f73, f39 ;; FNMA f64 = f65, f40, f64 FNMA f72 = f73, f40, f72 ;; FMPY f64 = f64, f41 FMPY f72 = f72, f41 ;; STFD [BOFFSET] = f64, SIZE STFD 
[BOFFSET2] = f66, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f74, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f73, -3 * SIZE STFD [BOFFSET2] = f75, -3 * SIZE ;; adds C1 = -4 * SIZE, C1 adds C2 = -4 * SIZE, C2 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f36 = [AOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f39, f40 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET], -15 * SIZE ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 ;; FNMA f65 = f64, f33, f65 FNMA f73 = f72, f33, f73 ;; FNMA f66 = f64, f34, f66 FNMA f74 = f72, f34, f74 ;; FNMA f67 = f64, f35, f67 FNMA f75 = f72, f35, f75 ;; FMPY f65 = f65, f36 FMPY f73 = f73, f36 ;; FNMA f66 = f65, f37, f66 FNMA f74 = f73, f37, f74 ;; FNMA f67 = f65, f38, f67 FNMA f75 = f73, f38, f75 ;; FMPY f66 = f66, f39 FMPY f74 = f74, f39 ;; FNMA f67 = f66, f40, f67 FNMA f75 = f74, f40, f75 ;; FMPY f67 = f67, f41 FMPY f75 = f75, f41 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f66, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f74, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f73, -3 * SIZE STFD [BOFFSET2] = f75, -3 * SIZE ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET], -3 * SIZE ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 FMPY f66 = f66, f32 FMPY f67 = f67, f32 ;; FNMA f72 = f64, f33, f72 FNMA f73 = f65, f33, f73 FNMA f74 = f66, f33, f74 FNMA f75 = f67, f33, f75 ;; FMPY f72 = f72, f34 FMPY f73 = f73, f34 FMPY f74 = f74, f34 FMPY f75 = f75, f34 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f72, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f73, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f74, SIZE ;; STFD [AOFFSET] = f67, -3 * SIZE STFD [AOFFSET2] = f75, -3 * SIZE ;; #endif #ifdef RT adds BOFFSET = 2 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET] ;; FMPY f72 = f72, f32 FMPY f73 = f73, f32 FMPY f74 = f74, f32 FMPY f75 = f75, f32 ;; FNMA f64 = f72, f33, f64 FNMA f65 = f73, f33, f65 FNMA f66 = f74, f33, f66 FNMA f67 = f75, f33, f67 ;; FMPY f64 = f64, f34 FMPY f65 = f65, f34 FMPY f66 = f66, f34 FMPY f67 = f67, f34 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f72, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f73, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f74, SIZE ;; STFD [AOFFSET] = f67, - 3 * SIZE STFD [AOFFSET2] = f75, - 3 * SIZE ;; #endif { .mmf STFD [C1 ] = f64, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE } ;; { .mmi STFD [C1 ] = f66, SIZE } ;; { .mmi #ifndef LN STFD [C1 ] = f67, SIZE #else STFD [C1 ] = f67, - 3 * SIZE #endif } ;; { .mmf STFD [C2 ] = f72, SIZE mov f72 = f0 } ;; { .mmi STFD [C2 ] = f73, SIZE } ;; { .mmi STFD [C2 ] = f74, SIZE } ;; { .mmi #ifndef LN STFD [C2 ] = f75, SIZE #else STFD [C2 ] = f75, - 3 * SIZE #endif } ;; mov f65 = f0 mov f73 = f0 mov f66 = f0 mov f74 = f0 mov f67 = f0 mov f75 = f0 ;; shladd r2 = K, BASE_SHIFT, r0 ;; { .mmi sub L = K, KK } ;; { .mmi #ifdef RT shladd AORIG = r2, 2, AORIG #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd AOFFSET = L, 2, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd BOFFSET = L, 1, BOFFSET #else nop __LINE__ #endif } ;; { .mmi #ifdef LT adds KK = 4, 
KK #elif defined LN adds KK = -4, KK #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; .align 8 .L110: tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L120 ;; { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 1 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE } ;; #else { .mfi shladd BOFFSET = r3, 1, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE shladd AOFFSET = r3, 1, AORIG } ;; #endif { .mfi adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mfi tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mmf adds L = -1, L } ;; { .mmf cmp.eq p6, p0 = -1, L } ;; { .mib (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L118 } ;; .L112: { .mfi lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mmf (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } { .mmf nop __LINE__ nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 br.cloop.sptk.few .L112 } ;; .align 8 .L118: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -2, KK #else adds r2 = -2, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 1, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET] adds BOFFSET = -2 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f65 = f34, f65 FSUB f73 = f35, f73 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = -2 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f72 = f34, f72 FSUB f73 = f35, f73 ;; #endif #ifdef LN adds AOFFSET = 2 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f34 = [AOFFSET] ;; FMPY f65 = f65, f32 FMPY f73 = f73, f32 ;; FNMA f64 = f65, f33, f64 FNMA f72 = f73, f33, f72 ;; FMPY f64 = f64, f34 FMPY f72 = f72, f34 ;; STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f72, SIZE ;; STFD [BOFFSET] = f65, SIZE ;; STFD [BOFFSET] = f73, - 3 * SIZE ;; adds C1 = -2 * SIZE, C1 adds C2 = -2 * SIZE, C2 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f34 = [AOFFSET], - 3 * SIZE ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 ;; FNMA f65 = f64, f33, f65 FNMA f73 = f72, f33, f73 ;; FMPY f65 = f65, f34 FMPY f73 = f73, f34 ;; STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f72, SIZE ;; STFD [BOFFSET] = f65, SIZE ;; STFD [BOFFSET] = f73, -3 * SIZE ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET], -3 * SIZE ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 ;; FNMA f72 = f64, f33, f72 FNMA f73 = f65, f33, f73 
;; FMPY f72 = f72, f34 FMPY f73 = f73, f34 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, SIZE ;; STFD [AOFFSET] = f72, SIZE ;; STFD [AOFFSET] = f73, -3 * SIZE ;; #endif #ifdef RT adds BOFFSET = 2 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET] ;; FMPY f72 = f72, f32 FMPY f73 = f73, f32 ;; FNMA f64 = f72, f33, f64 FNMA f65 = f73, f33, f65 ;; FMPY f64 = f64, f34 FMPY f65 = f65, f34 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, SIZE ;; STFD [AOFFSET] = f72, SIZE ;; STFD [AOFFSET] = f73, -3 * SIZE ;; #endif STFD [C1 ] = f64, SIZE mov f64 = f0 ;; #ifndef LN STFD [C1 ] = f65, SIZE #else STFD [C1 ] = f65, -SIZE #endif ;; STFD [C2 ] = f72, SIZE mov f72 = f0 ;; #ifndef LN STFD [C2 ] = f73, SIZE #else STFD [C2 ] = f73, -SIZE #endif ;; mov f65 = f0 mov f73 = f0 ;; shladd r2 = K, BASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 1, AORIG #else nop __LINE__ #endif ;; { .mmi #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd AOFFSET = L, 1, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd BOFFSET = L, 1, BOFFSET #else nop __LINE__ #endif } ;; { .mmi #ifdef LT adds KK = 2, KK #elif defined LN adds KK = -2, KK #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; .align 8 .L120: tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L129 ;; { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 0 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE } ;; #else { .mfi shladd BOFFSET = r3, 1, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE add AOFFSET = r3, AORIG } ;; #endif { .mmi adds L = 1, L adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mii tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi adds L = -1, L } ;; { .mmi cmp.eq p6, p0 = -1, L } ;; { .mib (p7) LDFD f32 = [AOFFSET], 1 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L128 } ;; .align 8 .L122: { .mfi FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mmi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE (p3) LDFD f40 = [AOFFSET], 1 * SIZE nop __LINE__ } { .mmi nop __LINE__ nop __LINE__ nop __LINE__ } ;; { .mfi (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 adds L = -1, L } { .mfb (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 br.cloop.sptk.few .L122 } ;; .L128: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -1, KK #else adds r2 = -2, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 1, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET] ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 ;; #else LDFPD f32, f33 = [AOFFSET] ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 ;; #endif #ifdef LN LDFD f32 = [AOFFSET] ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 ;; { .mmi STFD [BOFFSET] = f64, SIZE adds C1 = -1 * SIZE, C1 } ;; { .mmi STFD [BOFFSET] = f72, -SIZE adds C2 = -1 * SIZE, C2 } ;; #endif #ifdef LT LDFD f32 = [AOFFSET] ;; FMPY f64 = f64, f32 FMPY 
f72 = f72, f32 ;; STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f72, -SIZE ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET], -3 * SIZE ;; FMPY f64 = f64, f32 ;; FNMA f72 = f64, f33, f72 ;; FMPY f72 = f72, f34 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f72, -SIZE ;; #endif #ifdef RT adds BOFFSET = 2 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET] ;; FMPY f72 = f72, f32 ;; FNMA f64 = f72, f33, f64 ;; FMPY f64 = f64, f34 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f72, -SIZE ;; #endif #ifndef LN STFD [C1 ] = f64, SIZE #else STFD [C1 ] = f64 #endif #ifndef LN STFD [C2 ] = f72, SIZE #else STFD [C2 ] = f72 #endif mov f64 = f0 mov f72 = f0 ;; shladd r2 = K, BASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT add AORIG = r2, AORIG #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) add AOFFSET = L, AOFFSET #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) shladd BOFFSET = L, 1, BOFFSET #else nop __LINE__ #endif ;; #ifdef LT adds KK = 1, KK #elif defined LN adds KK = -1, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; .align 8 .L129: #ifdef LN shladd KK8 = K, BASE_SHIFT, r0 ;; shladd B = KK8, 1, B #endif #if defined(LT) || defined(RN) mov B = BOFFSET #endif #ifdef RN adds KK = 2, KK #endif #ifdef RT adds KK = -2, KK #endif ;; mov AOFFSET = A ;; .align 16 .L050: { .mib setf.d f64 = r0 tbit.z p6, p0 = N, 2 (p6) br.cond.dpnt .L000 } ;; #ifdef RT { .mmi shladd r3 = LDC, 2, r0 nop __LINE__ shl r2 = K, 2 + BASE_SHIFT } ;; { .mmi sub B = B, r2 sub C = C, r3 nop __LINE__ } #endif ;; { .mfi setf.d f72 = r0 mov f80 = f0 shr I = M, 3 } { .mfi mov C1 = C // coffset1 = c + 0 * ldc mov f88 = f0 #ifdef LN add KK = M, OFFSET #elif defined LT mov KK = OFFSET #else nop __LINE__ #endif } ;; { .mmf cmp.eq p6, p7 = 0, I #if defined(LN) || defined(RT) mov AORIG = A #else mov AOFFSET = A #endif mov f65 = f0 } { .mmf add C2 = LDC, C // coffset2 = c + 1 * ldc shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc mov f73 = f0 } ;; { .mfi #ifndef RT shladd C = LDC, 2, C // coffset += 8 * ldc #else nop __LINE__ #endif mov f81 = f0 #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif }{ .mfb shladd C4 = LDC, 1, C2 mov f89 = f0 (p6) br.cond.dpnt .L060 } ;; .align 16 .L052: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 3 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE nop __LINE__ nop __LINE__ } ;; #else { .mfi shladd BOFFSET = r3, 2, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE shladd AOFFSET = r3, 3, AORIG } ;; #endif { .mfi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f74 = f0 nop __LINE__ } ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE setf.d f82 = r0 mov f90 = f0 } ;; { .mmf (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE setf.d f67 = r0 mov f75 = f0 } { .mfi setf.d f83 = r0 mov f91 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mmf (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE } { .mfi adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f68 = r0 mov f76 = f0 } { .mfi setf.d f84 = r0 mov f92 = f0 adds L = 1, L } ;; { .mmf CPREFETCH [PREC], LDC } { 
.mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } ;; { .mmf CPREFETCH [PREC], LDC setf.d f69 = r0 mov f77 = f0 } { .mfi setf.d f85 = r0 mov f93 = f0 adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET } ;; { .mmf CPREFETCH [PREC] } ;; { .mfi setf.d f70 = r0 mov f78 = f0 tbit.z p12, p0 = L, 0 } { .mfi setf.d f86 = r0 mov f94 = f0 shr L = L, 1 } ;; { .mfi setf.d f71 = r0 adds L = -1, L } ;; { .mfi setf.d f87 = r0 mov f79 = f0 mov ar.lc = L } { .mfb cmp.eq p6, p0 = -1, L mov f95 = f0 (p6) br.cond.dpnt .L058 } ;; .align 8 .L053: { .mfb lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 adds C9 = 4 * SIZE, C1 } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 adds C10 = 4 * SIZE, C2 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 adds C11 = 4 * SIZE, C3 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 adds C12 = 4 * SIZE, C4 } { .mfb nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f76 = f36, f49, f76 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f84 = f36, f50, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f92 = f36, f51, f92 // A5 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = f37, f49, f77 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f85 = f37, f50, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f93 = f37, f51, f93 // A6 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f78 = f38, f49, f78 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f86 = f38, f50, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f94 = f38, f51, f94 // A7 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f79 = f39, f49, f79 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f87 = f39, f50, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f95 = f39, f51, f95 // A8 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * 
SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f91 = f43, f59, f91 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f76 = f44, f57, f76 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f84 = f44, f58, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f92 = f44, f59, f92 // A5 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f77 = f45, f57, f77 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f85 = f45, f58, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f93 = f45, f59, f93 // A6 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f78 = f46, f57, f78 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f86 = f46, f58, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f94 = f46, f59, f94 // A7 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f79 = f47, f57, f79 // A8 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f87 = f47, f58, f87 // A8 * B3 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f95 = f47, f59, f95 // A8 * B4 br.cloop.sptk.few .L053 } ;; .align 8 .L058: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -8, KK #else adds r2 = -4, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 3, AORIG shladd BOFFSET = r2, 2, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [BOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [BOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [BOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [BOFFSET], 2 * SIZE ;; LDFPD f48, f49 = [BOFFSET], 2 * SIZE ;; LDFPD f50, f51 = [BOFFSET], 2 * SIZE ;; LDFPD f52, f53 = [BOFFSET], 2 * SIZE ;; LDFPD f54, f55 = [BOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [BOFFSET], 2 * SIZE ;; LDFPD f58, f59 = [BOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [BOFFSET], 2 * SIZE ;; LDFPD f62, f63 = [BOFFSET] adds BOFFSET = -30 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f80 = f34, f80 FSUB f88 = f35, f88 FSUB f65 = f36, f65 FSUB f73 = f37, f73 FSUB f81 = f38, f81 FSUB f89 = f39, f89 FSUB f66 = f40, f66 FSUB f74 = f41, f74 FSUB f82 = f42, f82 FSUB f90 = f43, f90 FSUB f67 = f44, f67 FSUB f75 = f45, f75 FSUB f83 = f46, f83 FSUB f91 = f47, f91 FSUB f68 = f48, f68 FSUB f76 = f49, f76 FSUB f84 = f50, f84 FSUB f92 = f51, f92 FSUB f69 = 
f52, f69 FSUB f77 = f53, f77 FSUB f85 = f54, f85 FSUB f93 = f55, f93 FSUB f70 = f56, f70 FSUB f78 = f57, f78 FSUB f86 = f58, f86 FSUB f94 = f59, f94 FSUB f71 = f60, f71 FSUB f79 = f61, f79 FSUB f87 = f62, f87 FSUB f95 = f63, f95 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [AOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [AOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [AOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [AOFFSET], 2 * SIZE ;; LDFPD f48, f49 = [AOFFSET], 2 * SIZE ;; LDFPD f50, f51 = [AOFFSET], 2 * SIZE ;; LDFPD f52, f53 = [AOFFSET], 2 * SIZE ;; LDFPD f54, f55 = [AOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [AOFFSET], 2 * SIZE ;; LDFPD f58, f59 = [AOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [AOFFSET], 2 * SIZE ;; LDFPD f62, f63 = [AOFFSET] adds AOFFSET = -30 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f68 = f36, f68 FSUB f69 = f37, f69 FSUB f70 = f38, f70 FSUB f71 = f39, f71 ;; FSUB f72 = f40, f72 FSUB f73 = f41, f73 FSUB f74 = f42, f74 FSUB f75 = f43, f75 FSUB f76 = f44, f76 FSUB f77 = f45, f77 FSUB f78 = f46, f78 FSUB f79 = f47, f79 ;; FSUB f80 = f48, f80 FSUB f81 = f49, f81 FSUB f82 = f50, f82 FSUB f83 = f51, f83 FSUB f84 = f52, f84 FSUB f85 = f53, f85 FSUB f86 = f54, f86 FSUB f87 = f55, f87 FSUB f88 = f56, f88 FSUB f89 = f57, f89 FSUB f90 = f58, f90 FSUB f91 = f59, f91 FSUB f92 = f60, f92 FSUB f93 = f61, f93 FSUB f94 = f62, f94 FSUB f95 = f63, f95 ;; #endif #ifdef LN adds AOFFSET = 62 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f35, f34 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f37, f36 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f39, f38 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f40 = [AOFFSET], -2 * SIZE ;; LDFPD f42, f41 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f44, f43 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f46, f45 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f48, f47 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f50, f49 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f52, f51 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFD f53 = [AOFFSET], -2 * SIZE ;; LDFPD f55, f54 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f57, f56 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; LDFPD f59, f58 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f61, f60 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; LDFD f16 = [AOFFSET], -2 * SIZE ;; LDFPD f18, f17 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFPD f20, f19 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFD f21 = [AOFFSET] ;; FMPY f71 = f71, f32 FMPY f79 = f79, f32 FMPY f87 = f87, f32 FMPY f95 = f95, f32 ;; FNMA f70 = f71, f33, f70 FNMA f78 = f79, f33, f78 FNMA f86 = f87, f33, f86 FNMA f94 = f95, f33, f94 ;; FNMA f69 = f71, f34, f69 FNMA f77 = f79, f34, f77 FNMA f85 = f87, f34, f85 FNMA f93 = f95, f34, f93 ;; FNMA f68 = f71, f35, f68 FNMA f76 = f79, f35, f76 FNMA f84 = f87, f35, f84 FNMA f92 = f95, f35, f92 ;; FNMA f67 = f71, f36, f67 FNMA f75 = f79, f36, f75 FNMA f83 = f87, f36, f83 FNMA f91 = f95, f36, f91 ;; FNMA f66 = f71, f37, f66 FNMA f74 = f79, f37, f74 FNMA f82 = f87, f37, f82 FNMA f90 = f95, f37, f90 ;; FNMA f65 = f71, f38, f65 FNMA f73 = f79, f38, f73 FNMA f81 = f87, f38, f81 FNMA f89 = f95, f38, f89 ;; FNMA f64 = f71, f39, f64 FNMA f72 = f79, f39, f72 FNMA f80 = f87, f39, f80 
FNMA f88 = f95, f39, f88 ;; FMPY f70 = f70, f40 FMPY f78 = f78, f40 FMPY f86 = f86, f40 FMPY f94 = f94, f40 ;; FNMA f69 = f70, f41, f69 FNMA f77 = f78, f41, f77 FNMA f85 = f86, f41, f85 FNMA f93 = f94, f41, f93 ;; FNMA f68 = f70, f42, f68 FNMA f76 = f78, f42, f76 FNMA f84 = f86, f42, f84 FNMA f92 = f94, f42, f92 ;; FNMA f67 = f70, f43, f67 FNMA f75 = f78, f43, f75 FNMA f83 = f86, f43, f83 FNMA f91 = f94, f43, f91 ;; FNMA f66 = f70, f44, f66 FNMA f74 = f78, f44, f74 FNMA f82 = f86, f44, f82 FNMA f90 = f94, f44, f90 ;; FNMA f65 = f70, f45, f65 FNMA f73 = f78, f45, f73 FNMA f81 = f86, f45, f81 FNMA f89 = f94, f45, f89 ;; FNMA f64 = f70, f46, f64 FNMA f72 = f78, f46, f72 FNMA f80 = f86, f46, f80 FNMA f88 = f94, f46, f88 ;; FMPY f69 = f69, f47 FMPY f77 = f77, f47 FMPY f85 = f85, f47 FMPY f93 = f93, f47 ;; FNMA f68 = f69, f48, f68 FNMA f76 = f77, f48, f76 FNMA f84 = f85, f48, f84 FNMA f92 = f93, f48, f92 ;; FNMA f67 = f69, f49, f67 FNMA f75 = f77, f49, f75 FNMA f83 = f85, f49, f83 FNMA f91 = f93, f49, f91 ;; FNMA f66 = f69, f50, f66 FNMA f74 = f77, f50, f74 FNMA f82 = f85, f50, f82 FNMA f90 = f93, f50, f90 ;; FNMA f65 = f69, f51, f65 FNMA f73 = f77, f51, f73 FNMA f81 = f85, f51, f81 FNMA f89 = f93, f51, f89 ;; FNMA f64 = f69, f52, f64 FNMA f72 = f77, f52, f72 FNMA f80 = f85, f52, f80 FNMA f88 = f93, f52, f88 ;; FMPY f68 = f68, f53 FMPY f76 = f76, f53 FMPY f84 = f84, f53 FMPY f92 = f92, f53 ;; FNMA f67 = f68, f54, f67 FNMA f75 = f76, f54, f75 FNMA f83 = f84, f54, f83 FNMA f91 = f92, f54, f91 ;; FNMA f66 = f68, f55, f66 FNMA f74 = f76, f55, f74 FNMA f82 = f84, f55, f82 FNMA f90 = f92, f55, f90 ;; FNMA f65 = f68, f56, f65 FNMA f73 = f76, f56, f73 FNMA f81 = f84, f56, f81 FNMA f89 = f92, f56, f89 ;; FNMA f64 = f68, f57, f64 FNMA f72 = f76, f57, f72 FNMA f80 = f84, f57, f80 FNMA f88 = f92, f57, f88 ;; FMPY f67 = f67, f58 FMPY f75 = f75, f58 FMPY f83 = f83, f58 FMPY f91 = f91, f58 ;; FNMA f66 = f67, f59, f66 FNMA f74 = f75, f59, f74 FNMA f82 = f83, f59, f82 FNMA f90 = f91, f59, f90 ;; FNMA f65 = f67, f60, f65 FNMA f73 = f75, f60, f73 FNMA f81 = f83, f60, f81 FNMA f89 = f91, f60, f89 ;; FNMA f64 = f67, f61, f64 FNMA f72 = f75, f61, f72 FNMA f80 = f83, f61, f80 FNMA f88 = f91, f61, f88 ;; FMPY f66 = f66, f16 FMPY f74 = f74, f16 FMPY f82 = f82, f16 FMPY f90 = f90, f16 ;; FNMA f65 = f66, f17, f65 FNMA f73 = f74, f17, f73 FNMA f81 = f82, f17, f81 FNMA f89 = f90, f17, f89 ;; FNMA f64 = f66, f18, f64 FNMA f72 = f74, f18, f72 FNMA f80 = f82, f18, f80 FNMA f88 = f90, f18, f88 ;; FMPY f65 = f65, f19 FMPY f73 = f73, f19 FMPY f81 = f81, f19 FMPY f89 = f89, f19 ;; FNMA f64 = f65, f20, f64 FNMA f72 = f73, f20, f72 FNMA f80 = f81, f20, f80 FNMA f88 = f89, f20, f88 ;; FMPY f64 = f64, f21 FMPY f72 = f72, f21 FMPY f80 = f80, f21 FMPY f88 = f88, f21 ;; adds BOFFSET = 24 * SIZE, BOFFSET adds BOFFSET2 = 24 * SIZE, BOFFSET2 ;; STFD [BOFFSET] = f70, SIZE STFD [BOFFSET2] = f71, SIZE ;; STFD [BOFFSET] = f78, SIZE STFD [BOFFSET2] = f79, SIZE ;; STFD [BOFFSET] = f86, SIZE STFD [BOFFSET2] = f87, SIZE ;; STFD [BOFFSET] = f94, - 11 * SIZE STFD [BOFFSET2] = f95, - 11 * SIZE ;; STFD [BOFFSET] = f68, SIZE STFD [BOFFSET2] = f69, SIZE ;; STFD [BOFFSET] = f76, SIZE STFD [BOFFSET2] = f77, SIZE ;; STFD [BOFFSET] = f84, SIZE STFD [BOFFSET2] = f85, SIZE ;; STFD [BOFFSET] = f92, - 11 * SIZE STFD [BOFFSET2] = f93, - 11 * SIZE ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f74, SIZE STFD [BOFFSET2] = f75, SIZE ;; STFD [BOFFSET] = f82, SIZE STFD [BOFFSET2] = f83, SIZE ;; STFD [BOFFSET] = f90, - 11 * SIZE STFD 
[BOFFSET2] = f91, - 11 * SIZE ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f65, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f73, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f81, SIZE ;; STFD [BOFFSET] = f88, - 3 * SIZE STFD [BOFFSET2] = f89, - 3 * SIZE ;; adds C1 = -8 * SIZE, C1 adds C2 = -8 * SIZE, C2 adds C3 = -8 * SIZE, C3 adds C4 = -8 * SIZE, C4 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f40 = [AOFFSET], 1 * SIZE ;; LDFPD f41, f42 = [AOFFSET], 2 * SIZE ;; LDFPD f43, f44 = [AOFFSET], 2 * SIZE ;; LDFPD f45, f46 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f47, f48 = [AOFFSET], 2 * SIZE ;; LDFPD f49, f50 = [AOFFSET], 2 * SIZE ;; LDFPD f51, f52 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f53 = [AOFFSET], 1 * SIZE ;; LDFPD f54, f55 = [AOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [AOFFSET] adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f58, f59 = [AOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [AOFFSET] adds AOFFSET = 7 * SIZE, AOFFSET ;; LDFD f16 = [AOFFSET], 1 * SIZE ;; LDFPD f17, f18 = [AOFFSET] adds AOFFSET = 8 * SIZE, AOFFSET ;; LDFPD f19, f20 = [AOFFSET] adds AOFFSET = 9 * SIZE, AOFFSET ;; LDFD f21 = [AOFFSET] adds AOFFSET = -63 * SIZE, AOFFSET ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 FMPY f80 = f80, f32 FMPY f88 = f88, f32 ;; FNMA f65 = f64, f33, f65 FNMA f73 = f72, f33, f73 FNMA f81 = f80, f33, f81 FNMA f89 = f88, f33, f89 ;; FNMA f66 = f64, f34, f66 FNMA f74 = f72, f34, f74 FNMA f82 = f80, f34, f82 FNMA f90 = f88, f34, f90 ;; FNMA f67 = f64, f35, f67 FNMA f75 = f72, f35, f75 FNMA f83 = f80, f35, f83 FNMA f91 = f88, f35, f91 ;; FNMA f68 = f64, f36, f68 FNMA f76 = f72, f36, f76 FNMA f84 = f80, f36, f84 FNMA f92 = f88, f36, f92 ;; FNMA f69 = f64, f37, f69 FNMA f77 = f72, f37, f77 FNMA f85 = f80, f37, f85 FNMA f93 = f88, f37, f93 ;; FNMA f70 = f64, f38, f70 FNMA f78 = f72, f38, f78 FNMA f86 = f80, f38, f86 FNMA f94 = f88, f38, f94 ;; FNMA f71 = f64, f39, f71 FNMA f79 = f72, f39, f79 FNMA f87 = f80, f39, f87 FNMA f95 = f88, f39, f95 ;; FMPY f65 = f65, f40 FMPY f73 = f73, f40 FMPY f81 = f81, f40 FMPY f89 = f89, f40 ;; FNMA f66 = f65, f41, f66 FNMA f74 = f73, f41, f74 FNMA f82 = f81, f41, f82 FNMA f90 = f89, f41, f90 ;; FNMA f67 = f65, f42, f67 FNMA f75 = f73, f42, f75 FNMA f83 = f81, f42, f83 FNMA f91 = f89, f42, f91 ;; FNMA f68 = f65, f43, f68 FNMA f76 = f73, f43, f76 FNMA f84 = f81, f43, f84 FNMA f92 = f89, f43, f92 ;; FNMA f69 = f65, f44, f69 FNMA f77 = f73, f44, f77 FNMA f85 = f81, f44, f85 FNMA f93 = f89, f44, f93 ;; FNMA f70 = f65, f45, f70 FNMA f78 = f73, f45, f78 FNMA f86 = f81, f45, f86 FNMA f94 = f89, f45, f94 ;; FNMA f71 = f65, f46, f71 FNMA f79 = f73, f46, f79 FNMA f87 = f81, f46, f87 FNMA f95 = f89, f46, f95 ;; FMPY f66 = f66, f47 FMPY f74 = f74, f47 FMPY f82 = f82, f47 FMPY f90 = f90, f47 ;; FNMA f67 = f66, f48, f67 FNMA f75 = f74, f48, f75 FNMA f83 = f82, f48, f83 FNMA f91 = f90, f48, f91 ;; FNMA f68 = f66, f49, f68 FNMA f76 = f74, f49, f76 FNMA f84 = f82, f49, f84 FNMA f92 = f90, f49, f92 ;; FNMA f69 = f66, f50, f69 FNMA f77 = f74, f50, f77 FNMA f85 = f82, f50, f85 FNMA f93 = f90, f50, f93 ;; FNMA f70 = f66, f51, f70 FNMA f78 = f74, f51, f78 FNMA f86 = f82, f51, f86 FNMA f94 = f90, f51, f94 ;; FNMA f71 = f66, f52, f71 FNMA f79 = f74, f52, f79 FNMA f87 = f82, f52, f87 FNMA f95 = f90, f52, f95 ;; FMPY f67 = f67, f53 FMPY f75 = f75, f53 FMPY f83 = f83, f53 FMPY f91 = f91, f53 ;; FNMA f68 = 
f67, f54, f68 FNMA f76 = f75, f54, f76 FNMA f84 = f83, f54, f84 FNMA f92 = f91, f54, f92 ;; FNMA f69 = f67, f55, f69 FNMA f77 = f75, f55, f77 FNMA f85 = f83, f55, f85 FNMA f93 = f91, f55, f93 ;; FNMA f70 = f67, f56, f70 FNMA f78 = f75, f56, f78 FNMA f86 = f83, f56, f86 FNMA f94 = f91, f56, f94 ;; FNMA f71 = f67, f57, f71 FNMA f79 = f75, f57, f79 FNMA f87 = f83, f57, f87 FNMA f95 = f91, f57, f95 ;; FMPY f68 = f68, f58 FMPY f76 = f76, f58 FMPY f84 = f84, f58 FMPY f92 = f92, f58 ;; FNMA f69 = f68, f59, f69 FNMA f77 = f76, f59, f77 FNMA f85 = f84, f59, f85 FNMA f93 = f92, f59, f93 ;; FNMA f70 = f68, f60, f70 FNMA f78 = f76, f60, f78 FNMA f86 = f84, f60, f86 FNMA f94 = f92, f60, f94 ;; FNMA f71 = f68, f61, f71 FNMA f79 = f76, f61, f79 FNMA f87 = f84, f61, f87 FNMA f95 = f92, f61, f95 ;; FMPY f69 = f69, f16 FMPY f77 = f77, f16 FMPY f85 = f85, f16 FMPY f93 = f93, f16 ;; FNMA f70 = f69, f17, f70 FNMA f78 = f77, f17, f78 FNMA f86 = f85, f17, f86 FNMA f94 = f93, f17, f94 ;; FNMA f71 = f69, f18, f71 FNMA f79 = f77, f18, f79 FNMA f87 = f85, f18, f87 FNMA f95 = f93, f18, f95 ;; FMPY f70 = f70, f19 FMPY f78 = f78, f19 FMPY f86 = f86, f19 FMPY f94 = f94, f19 ;; FNMA f71 = f70, f20, f71 FNMA f79 = f78, f20, f79 FNMA f87 = f86, f20, f87 FNMA f95 = f94, f20, f95 ;; FMPY f71 = f71, f21 FMPY f79 = f79, f21 FMPY f87 = f87, f21 FMPY f95 = f95, f21 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f65, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f73, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f81, SIZE ;; STFD [BOFFSET] = f88, 5 * SIZE STFD [BOFFSET2] = f89, 5 * SIZE ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f74, SIZE STFD [BOFFSET2] = f75, SIZE ;; STFD [BOFFSET] = f82, SIZE STFD [BOFFSET2] = f83, SIZE ;; STFD [BOFFSET] = f90, 5 * SIZE STFD [BOFFSET2] = f91, 5 * SIZE ;; STFD [BOFFSET] = f68, SIZE STFD [BOFFSET2] = f69, SIZE ;; STFD [BOFFSET] = f76, SIZE STFD [BOFFSET2] = f77, SIZE ;; STFD [BOFFSET] = f84, SIZE STFD [BOFFSET2] = f85, SIZE ;; STFD [BOFFSET] = f92, 5 * SIZE STFD [BOFFSET2] = f93, 5 * SIZE ;; STFD [BOFFSET] = f70, SIZE STFD [BOFFSET2] = f71, SIZE ;; STFD [BOFFSET] = f78, SIZE STFD [BOFFSET2] = f79, SIZE ;; STFD [BOFFSET] = f86, SIZE STFD [BOFFSET2] = f87, SIZE ;; STFD [BOFFSET] = f94 STFD [BOFFSET2] = f95 adds C9 = 4 * SIZE, C1 adds BOFFSET = - 27 * SIZE, BOFFSET adds BOFFSET2 = - 27 * SIZE, BOFFSET2 ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f36 = [BOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f39, f40 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f41 = [BOFFSET], -15 * SIZE ;; FMPY f64 = f64, f32 FMPY f68 = f68, f32 FMPY f65 = f65, f32 FMPY f69 = f69, f32 FMPY f66 = f66, f32 FMPY f70 = f70, f32 FMPY f67 = f67, f32 FMPY f71 = f71, f32 ;; FNMA f72 = f64, f33, f72 FNMA f76 = f68, f33, f76 FNMA f73 = f65, f33, f73 FNMA f77 = f69, f33, f77 FNMA f74 = f66, f33, f74 FNMA f78 = f70, f33, f78 FNMA f75 = f67, f33, f75 FNMA f79 = f71, f33, f79 ;; FNMA f80 = f64, f34, f80 FNMA f84 = f68, f34, f84 FNMA f81 = f65, f34, f81 FNMA f85 = f69, f34, f85 FNMA f82 = f66, f34, f82 FNMA f86 = f70, f34, f86 FNMA f83 = f67, f34, f83 FNMA f87 = f71, f34, f87 ;; FNMA f88 = f64, f35, f88 FNMA f92 = f68, f35, f92 FNMA f89 = f65, f35, f89 FNMA f93 = f69, f35, f93 FNMA f90 = f66, f35, f90 FNMA f94 = f70, f35, f94 FNMA f91 = f67, f35, f91 FNMA f95 = f71, f35, f95 ;; FMPY f72 = f72, f36 FMPY f76 = f76, f36 FMPY f73 = f73, f36 FMPY f77 = f77, 
f36 FMPY f74 = f74, f36 FMPY f78 = f78, f36 FMPY f75 = f75, f36 FMPY f79 = f79, f36 ;; FNMA f80 = f72, f37, f80 FNMA f84 = f76, f37, f84 FNMA f81 = f73, f37, f81 FNMA f85 = f77, f37, f85 FNMA f82 = f74, f37, f82 FNMA f86 = f78, f37, f86 FNMA f83 = f75, f37, f83 FNMA f87 = f79, f37, f87 ;; FNMA f88 = f72, f38, f88 FNMA f92 = f76, f38, f92 FNMA f89 = f73, f38, f89 FNMA f93 = f77, f38, f93 FNMA f90 = f74, f38, f90 FNMA f94 = f78, f38, f94 FNMA f91 = f75, f38, f91 FNMA f95 = f79, f38, f95 ;; FMPY f80 = f80, f39 FMPY f84 = f84, f39 FMPY f81 = f81, f39 FMPY f85 = f85, f39 FMPY f82 = f82, f39 FMPY f86 = f86, f39 FMPY f83 = f83, f39 FMPY f87 = f87, f39 ;; FNMA f88 = f80, f40, f88 FNMA f92 = f84, f40, f92 FNMA f89 = f81, f40, f89 FNMA f93 = f85, f40, f93 FNMA f90 = f82, f40, f90 FNMA f94 = f86, f40, f94 FNMA f91 = f83, f40, f91 FNMA f95 = f87, f40, f95 ;; FMPY f88 = f88, f41 FMPY f92 = f92, f41 FMPY f89 = f89, f41 FMPY f93 = f93, f41 FMPY f90 = f90, f41 FMPY f94 = f94, f41 FMPY f91 = f91, f41 FMPY f95 = f95, f41 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f68, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f69, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f70, SIZE ;; STFD [AOFFSET] = f67, 5 * SIZE STFD [AOFFSET2] = f71, 5 * SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f76, SIZE ;; STFD [AOFFSET] = f73, SIZE STFD [AOFFSET2] = f77, SIZE ;; STFD [AOFFSET] = f74, SIZE STFD [AOFFSET2] = f78, SIZE ;; STFD [AOFFSET] = f75, 5 * SIZE STFD [AOFFSET2] = f79, 5 * SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f84, SIZE ;; STFD [AOFFSET] = f81, SIZE STFD [AOFFSET2] = f85, SIZE ;; STFD [AOFFSET] = f82, SIZE STFD [AOFFSET2] = f86, SIZE ;; STFD [AOFFSET] = f83, 5 * SIZE STFD [AOFFSET2] = f87, 5 * SIZE ;; STFD [AOFFSET] = f88, SIZE STFD [AOFFSET2] = f92, SIZE ;; STFD [AOFFSET] = f89, SIZE STFD [AOFFSET2] = f93, SIZE ;; STFD [AOFFSET] = f90, SIZE STFD [AOFFSET2] = f94, SIZE ;; STFD [AOFFSET] = f91, -27 * SIZE STFD [AOFFSET2] = f95, -27 * SIZE ;; #endif #ifdef RT adds BOFFSET = 14 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f35, f34 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f36 = [BOFFSET], -2 * SIZE ;; LDFPD f38, f37 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f40, f39 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFD f41 = [BOFFSET] ;; FMPY f88 = f88, f32 FMPY f92 = f92, f32 FMPY f89 = f89, f32 FMPY f93 = f93, f32 FMPY f90 = f90, f32 FMPY f94 = f94, f32 FMPY f91 = f91, f32 FMPY f95 = f95, f32 ;; FNMA f80 = f88, f33, f80 FNMA f84 = f92, f33, f84 FNMA f81 = f89, f33, f81 FNMA f85 = f93, f33, f85 FNMA f82 = f90, f33, f82 FNMA f86 = f94, f33, f86 FNMA f83 = f91, f33, f83 FNMA f87 = f95, f33, f87 ;; FNMA f72 = f88, f34, f72 FNMA f76 = f92, f34, f76 FNMA f73 = f89, f34, f73 FNMA f77 = f93, f34, f77 FNMA f74 = f90, f34, f74 FNMA f78 = f94, f34, f78 FNMA f75 = f91, f34, f75 FNMA f79 = f95, f34, f79 ;; FNMA f64 = f88, f35, f64 FNMA f68 = f92, f35, f68 FNMA f65 = f89, f35, f65 FNMA f69 = f93, f35, f69 FNMA f66 = f90, f35, f66 FNMA f70 = f94, f35, f70 FNMA f67 = f91, f35, f67 FNMA f71 = f95, f35, f71 ;; FMPY f80 = f80, f36 FMPY f84 = f84, f36 FMPY f81 = f81, f36 FMPY f85 = f85, f36 FMPY f82 = f82, f36 FMPY f86 = f86, f36 FMPY f83 = f83, f36 FMPY f87 = f87, f36 ;; FNMA f72 = f80, f37, f72 FNMA f76 = f84, f37, f76 FNMA f73 = f81, f37, f73 FNMA f77 = f85, f37, f77 FNMA f74 = f82, f37, f74 FNMA f78 = f86, f37, f78 FNMA f75 = f83, f37, f75 FNMA f79 = f87, f37, f79 ;; FNMA f64 = f80, f38, f64 FNMA f68 = f84, f38, f68 FNMA 
f65 = f81, f38, f65 FNMA f69 = f85, f38, f69 FNMA f66 = f82, f38, f66 FNMA f70 = f86, f38, f70 FNMA f67 = f83, f38, f67 FNMA f71 = f87, f38, f71 ;; FMPY f72 = f72, f39 FMPY f76 = f76, f39 FMPY f73 = f73, f39 FMPY f77 = f77, f39 FMPY f74 = f74, f39 FMPY f78 = f78, f39 FMPY f75 = f75, f39 FMPY f79 = f79, f39 ;; FNMA f64 = f72, f40, f64 FNMA f68 = f76, f40, f68 FNMA f65 = f73, f40, f65 FNMA f69 = f77, f40, f69 FNMA f66 = f74, f40, f66 FNMA f70 = f78, f40, f70 FNMA f67 = f75, f40, f67 FNMA f71 = f79, f40, f71 ;; FMPY f64 = f64, f41 FMPY f68 = f68, f41 FMPY f65 = f65, f41 FMPY f69 = f69, f41 FMPY f66 = f66, f41 FMPY f70 = f70, f41 FMPY f67 = f67, f41 FMPY f71 = f71, f41 ;; adds AOFFSET = 24 * SIZE, AOFFSET adds AOFFSET2 = 24 * SIZE, AOFFSET2 ;; STFD [AOFFSET] = f88, SIZE STFD [AOFFSET2] = f92, SIZE ;; STFD [AOFFSET] = f89, SIZE STFD [AOFFSET2] = f93, SIZE ;; STFD [AOFFSET] = f90, SIZE STFD [AOFFSET2] = f94, SIZE ;; STFD [AOFFSET] = f91, - 11 * SIZE STFD [AOFFSET2] = f95, - 11 * SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f84, SIZE ;; STFD [AOFFSET] = f81, SIZE STFD [AOFFSET2] = f85, SIZE ;; STFD [AOFFSET] = f82, SIZE STFD [AOFFSET2] = f86, SIZE ;; STFD [AOFFSET] = f83, - 11 * SIZE STFD [AOFFSET2] = f87, - 11 * SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f76, SIZE ;; STFD [AOFFSET] = f73, SIZE STFD [AOFFSET2] = f77, SIZE ;; STFD [AOFFSET] = f74, SIZE STFD [AOFFSET2] = f78, SIZE ;; STFD [AOFFSET] = f75, - 11 * SIZE STFD [AOFFSET2] = f79, - 11 * SIZE ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f68, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f69, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f70, SIZE ;; STFD [AOFFSET] = f67, - 3 * SIZE STFD [AOFFSET2] = f71, - 3 * SIZE ;; #endif adds C9 = 4 * SIZE, C1 ;; { .mmf STFD [C1 ] = f64, SIZE STFD [C9 ] = f68, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE STFD [C9 ] = f69, SIZE adds C10 = 4 * SIZE, C2 } ;; { .mmi STFD [C1 ] = f66, SIZE STFD [C9 ] = f70, SIZE } ;; { .mmi #ifndef LN STFD [C1 ] = f67, 5 * SIZE #else STFD [C1 ] = f67, - 3 * SIZE #endif STFD [C9 ] = f71 adds C11 = 4 * SIZE, C3 } ;; { .mmf STFD [C2 ] = f72, SIZE STFD [C10] = f76, SIZE mov f72 = f0 } ;; { .mmi STFD [C2 ] = f73, SIZE STFD [C10] = f77, SIZE } ;; { .mmi STFD [C2 ] = f74, SIZE STFD [C10] = f78, SIZE adds C12 = 4 * SIZE, C4 } ;; { .mmi #ifndef LN STFD [C2 ] = f75, 5 * SIZE #else STFD [C2 ] = f75, - 3 * SIZE #endif STFD [C10] = f79 } ;; { .mmf STFD [C3 ] = f80, SIZE STFD [C11] = f84, SIZE } ;; { .mmi STFD [C3 ] = f81, SIZE STFD [C11] = f85, SIZE } ;; { .mmi STFD [C3 ] = f82, SIZE STFD [C11] = f86, SIZE } ;; { .mmi #ifndef LN STFD [C3 ] = f83, 5 * SIZE #else STFD [C3 ] = f83, - 3 * SIZE #endif STFD [C11] = f87 } ;; { .mmf STFD [C4 ] = f88, SIZE STFD [C12] = f92, SIZE } ;; { .mmi STFD [C4 ] = f89, SIZE STFD [C12] = f93, SIZE } ;; { .mmi STFD [C4 ] = f90, SIZE STFD [C12] = f94, SIZE } ;; { .mmi #ifndef LN STFD [C4 ] = f91, 5 * SIZE #else STFD [C4 ] = f91, - 3 * SIZE #endif STFD [C12] = f95 cmp.ne p6, p0 = 1, I } ;; adds I = -1, I ;; { .mmi shladd r2 = K, BASE_SHIFT, r0 } ;; { .mmi sub L = K, KK } ;; { .mmi #ifdef RT shladd AORIG = r2, 3, AORIG #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; ;; { .mmi #if defined(LT) || defined(RN) shladd AOFFSET = L, 3, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd BOFFSET = L, 2, BOFFSET #else nop __LINE__ #endif } ;; { .mmi #ifdef LT adds KK = 8, KK #elif defined LN adds KK = -8, KK #else 
nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; mov f64 = f0 mov f72 = f0 mov f80 = f0 mov f88 = f0 mov f65 = f0 mov f73 = f0 mov f81 = f0 mov f89 = f0 { .mmb (p6) br.cond.dptk .L052 } ;; .align 8 .L060: tbit.z p6, p7 = M, 2 (p6) br.cond.dptk .L070 ;; { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 2 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f65 = f0 } ;; #else { .mfi shladd BOFFSET = r3, 2, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE shladd AOFFSET = r3, 2, AORIG } ;; #endif { .mfi adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mfi adds L = -1, L } ;; { .mfi cmp.eq p6, p0 = -1, L } ;; { .mmf (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE } { .mfi mov ar.lc = L } ;; mov f66 = f0 mov f67 = f0 mov f74 = f0 mov f75 = f0 mov f82 = f0 mov f83 = f0 mov f90 = f0 mov f91 = f0 ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE } { .mfb (p6) br.cond.dpnt .L068 } ;; .align 8 .L062: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 (p5) adds C9 = 2 * SIZE, C1 } { .mfi nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 (p5) adds C10 = 2 * SIZE, C2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 (p5) adds C11 = 2 * SIZE, C3 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 (p5) adds C12 = 2 * SIZE, C4 } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { 
.mfb nop __LINE__ (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f83 = f43, f58, f83 // A4 * B3 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f91 = f43, f59, f91 // A4 * B4 br.cloop.sptk.few .L062 } ;; .align 8 .L068: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -4, KK #else adds r2 = -4, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 2, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [BOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [BOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [BOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [BOFFSET] adds BOFFSET = -14 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f80 = f34, f80 FSUB f88 = f35, f88 ;; FSUB f65 = f36, f65 FSUB f73 = f37, f73 FSUB f81 = f38, f81 FSUB f89 = f39, f89 ;; FSUB f66 = f40, f66 FSUB f74 = f41, f74 FSUB f82 = f42, f82 FSUB f90 = f43, f90 ;; FSUB f67 = f44, f67 FSUB f75 = f45, f75 FSUB f83 = f46, f83 FSUB f91 = f47, f91 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [AOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [AOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [AOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [AOFFSET] adds AOFFSET = -14 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f72 = f36, f72 FSUB f73 = f37, f73 FSUB f74 = f38, f74 FSUB f75 = f39, f75 FSUB f80 = f40, f80 FSUB f81 = f41, f81 FSUB f82 = f42, f82 FSUB f83 = f43, f83 FSUB f88 = f44, f88 FSUB f89 = f45, f89 FSUB f90 = f46, f90 FSUB f91 = f47, f91 ;; #endif #ifdef LN adds AOFFSET = 14 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f35, f34 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f36 = [AOFFSET], - 2 * SIZE ;; LDFPD f38, f37 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f40, f39 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET] ;; FMPY f67 = f67, f32 FMPY f75 = f75, f32 FMPY f83 = f83, f32 FMPY f91 = f91, f32 ;; FNMA f66 = f67, f33, f66 FNMA f74 = f75, f33, f74 FNMA f82 = f83, f33, f82 FNMA f90 = f91, f33, f90 ;; FNMA f65 = f67, f34, f65 FNMA f73 = f75, f34, f73 FNMA f81 = f83, f34, f81 FNMA f89 = f91, f34, f89 ;; FNMA f64 = f67, f35, f64 FNMA f72 = f75, f35, f72 FNMA f80 = f83, f35, f80 FNMA f88 = f91, f35, f88 ;; FMPY f66 = f66, f36 FMPY f74 = f74, f36 FMPY f82 = f82, f36 FMPY f90 = f90, f36 ;; FNMA f65 = f66, f37, f65 FNMA f73 = f74, f37, f73 FNMA f81 = f82, f37, f81 FNMA f89 = f90, f37, f89 ;; FNMA f64 = f66, f38, f64 FNMA f72 = f74, f38, f72 FNMA f80 = f82, f38, f80 FNMA f88 = f90, f38, f88 ;; FMPY f65 = f65, f39 FMPY f73 = f73, f39 FMPY f81 = f81, f39 FMPY f89 = f89, f39 ;; FNMA f64 = f65, f40, f64 FNMA f72 = f73, f40, f72 FNMA f80 = f81, f40, f80 FNMA f88 = f89, f40, f88 ;; FMPY f64 = f64, f41 FMPY f72 = f72, f41 FMPY f80 = f80, f41 FMPY f88 = f88, f41 ;; adds BOFFSET = 8 * 
SIZE, BOFFSET adds BOFFSET2 = 8 * SIZE, BOFFSET2 ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f74, SIZE STFD [BOFFSET2] = f75, SIZE ;; STFD [BOFFSET] = f82, SIZE STFD [BOFFSET2] = f83, SIZE ;; STFD [BOFFSET] = f90, - 11 * SIZE STFD [BOFFSET2] = f91, - 11 * SIZE ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f65, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f73, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f81, SIZE ;; STFD [BOFFSET] = f88, -3 * SIZE STFD [BOFFSET2] = f89, -3 * SIZE ;; adds C1 = -4 * SIZE, C1 adds C2 = -4 * SIZE, C2 adds C3 = -4 * SIZE, C3 adds C4 = -4 * SIZE, C4 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f36 = [AOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f39, f40 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET], -15 * SIZE ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 FMPY f80 = f80, f32 FMPY f88 = f88, f32 ;; FNMA f65 = f64, f33, f65 FNMA f73 = f72, f33, f73 FNMA f81 = f80, f33, f81 FNMA f89 = f88, f33, f89 ;; FNMA f66 = f64, f34, f66 FNMA f74 = f72, f34, f74 FNMA f82 = f80, f34, f82 FNMA f90 = f88, f34, f90 ;; FNMA f67 = f64, f35, f67 FNMA f75 = f72, f35, f75 FNMA f83 = f80, f35, f83 FNMA f91 = f88, f35, f91 ;; FMPY f65 = f65, f36 FMPY f73 = f73, f36 FMPY f81 = f81, f36 FMPY f89 = f89, f36 ;; FNMA f66 = f65, f37, f66 FNMA f74 = f73, f37, f74 FNMA f82 = f81, f37, f82 FNMA f90 = f89, f37, f90 ;; FNMA f67 = f65, f38, f67 FNMA f75 = f73, f38, f75 FNMA f83 = f81, f38, f83 FNMA f91 = f89, f38, f91 ;; FMPY f66 = f66, f39 FMPY f74 = f74, f39 FMPY f82 = f82, f39 FMPY f90 = f90, f39 ;; FNMA f67 = f66, f40, f67 FNMA f75 = f74, f40, f75 FNMA f83 = f82, f40, f83 FNMA f91 = f90, f40, f91 ;; FMPY f67 = f67, f41 FMPY f75 = f75, f41 FMPY f83 = f83, f41 FMPY f91 = f91, f41 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f65, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f73, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f81, SIZE ;; STFD [BOFFSET] = f88, 5 * SIZE STFD [BOFFSET2] = f89, 5 * SIZE ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f74, SIZE STFD [BOFFSET2] = f75, SIZE ;; STFD [BOFFSET] = f82, SIZE STFD [BOFFSET2] = f83, SIZE ;; STFD [BOFFSET] = f90, -11 * SIZE STFD [BOFFSET2] = f91, -11 * SIZE ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f36 = [BOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f39, f40 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f41 = [BOFFSET], -15 * SIZE ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 FMPY f66 = f66, f32 FMPY f67 = f67, f32 ;; FNMA f72 = f64, f33, f72 FNMA f73 = f65, f33, f73 FNMA f74 = f66, f33, f74 FNMA f75 = f67, f33, f75 ;; FNMA f80 = f64, f34, f80 FNMA f81 = f65, f34, f81 FNMA f82 = f66, f34, f82 FNMA f83 = f67, f34, f83 ;; FNMA f88 = f64, f35, f88 FNMA f89 = f65, f35, f89 FNMA f90 = f66, f35, f90 FNMA f91 = f67, f35, f91 ;; FMPY f72 = f72, f36 FMPY f73 = f73, f36 FMPY f74 = f74, f36 FMPY f75 = f75, f36 ;; FNMA f80 = f72, f37, f80 FNMA f81 = f73, f37, f81 FNMA f82 = f74, f37, f82 FNMA f83 = f75, f37, f83 ;; FNMA f88 = f72, f38, f88 FNMA f89 = f73, f38, f89 FNMA f90 = f74, f38, f90 FNMA f91 = f75, f38, f91 ;; FMPY f80 = f80, f39 FMPY f81 = f81, f39 FMPY f82 = f82, f39 FMPY f83 = f83, f39 ;; FNMA f88 = f80, f40, f88 FNMA f89 = f81, f40, f89 FNMA f90 = f82, f40, f90 FNMA f91 = 
f83, f40, f91 ;; FMPY f88 = f88, f41 FMPY f89 = f89, f41 FMPY f90 = f90, f41 FMPY f91 = f91, f41 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f72, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f73, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f74, SIZE ;; STFD [AOFFSET] = f67, 5 * SIZE STFD [AOFFSET2] = f75, 5 * SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f88, SIZE ;; STFD [AOFFSET] = f81, SIZE STFD [AOFFSET2] = f89, SIZE ;; STFD [AOFFSET] = f82, SIZE STFD [AOFFSET2] = f90, SIZE ;; STFD [AOFFSET] = f83, -11 * SIZE STFD [AOFFSET2] = f91, -11 * SIZE ;; #endif #ifdef RT adds BOFFSET = 14 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f35, f34 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f36 = [BOFFSET], - 2 * SIZE ;; LDFPD f38, f37 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f40, f39 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFD f41 = [BOFFSET] ;; FMPY f88 = f88, f32 FMPY f89 = f89, f32 FMPY f90 = f90, f32 FMPY f91 = f91, f32 ;; FNMA f80 = f88, f33, f80 FNMA f81 = f89, f33, f81 FNMA f82 = f90, f33, f82 FNMA f83 = f91, f33, f83 ;; FNMA f72 = f88, f34, f72 FNMA f73 = f89, f34, f73 FNMA f74 = f90, f34, f74 FNMA f75 = f91, f34, f75 ;; FNMA f64 = f88, f35, f64 FNMA f65 = f89, f35, f65 FNMA f66 = f90, f35, f66 FNMA f67 = f91, f35, f67 ;; FMPY f80 = f80, f36 FMPY f81 = f81, f36 FMPY f82 = f82, f36 FMPY f83 = f83, f36 ;; FNMA f72 = f80, f37, f72 FNMA f73 = f81, f37, f73 FNMA f74 = f82, f37, f74 FNMA f75 = f83, f37, f75 ;; FNMA f64 = f80, f38, f64 FNMA f65 = f81, f38, f65 FNMA f66 = f82, f38, f66 FNMA f67 = f83, f38, f67 ;; FMPY f72 = f72, f39 FMPY f73 = f73, f39 FMPY f74 = f74, f39 FMPY f75 = f75, f39 ;; FNMA f64 = f72, f40, f64 FNMA f65 = f73, f40, f65 FNMA f66 = f74, f40, f66 FNMA f67 = f75, f40, f67 ;; FMPY f64 = f64, f41 FMPY f65 = f65, f41 FMPY f66 = f66, f41 FMPY f67 = f67, f41 ;; adds AOFFSET = 8 * SIZE, AOFFSET adds AOFFSET2 = 8 * SIZE, AOFFSET2 ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f88, SIZE ;; STFD [AOFFSET] = f81, SIZE STFD [AOFFSET2] = f89, SIZE ;; STFD [AOFFSET] = f82, SIZE STFD [AOFFSET2] = f90, SIZE ;; STFD [AOFFSET] = f83, - 11 * SIZE STFD [AOFFSET2] = f91, - 11 * SIZE ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f72, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f73, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f74, SIZE ;; STFD [AOFFSET] = f67, - 3 * SIZE STFD [AOFFSET2] = f75, - 3 * SIZE ;; #endif { .mmf STFD [C1 ] = f64, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE } ;; { .mmi STFD [C1 ] = f66, SIZE } ;; { .mmi #ifndef LN STFD [C1 ] = f67, SIZE #else STFD [C1 ] = f67, - 3 * SIZE #endif } ;; { .mmf STFD [C2 ] = f72, SIZE mov f72 = f0 } ;; { .mmi STFD [C2 ] = f73, SIZE } ;; { .mmi STFD [C2 ] = f74, SIZE } ;; { .mmi #ifndef LN STFD [C2 ] = f75, SIZE #else STFD [C2 ] = f75, - 3 * SIZE #endif } ;; { .mmf STFD [C3 ] = f80, SIZE mov f80 = f0 } ;; { .mmi STFD [C3 ] = f81, SIZE } ;; { .mmi STFD [C3 ] = f82, SIZE } ;; { .mmi #ifndef LN STFD [C3 ] = f83, SIZE #else STFD [C3 ] = f83, - 3 * SIZE #endif } ;; { .mmf STFD [C4 ] = f88, SIZE mov f88 = f0 } ;; { .mmi STFD [C4 ] = f89, SIZE } ;; { .mmi STFD [C4 ] = f90, SIZE } ;; { .mmi #ifndef LN STFD [C4 ] = f91, SIZE #else STFD [C4 ] = f91, - 3 * SIZE #endif nop __LINE__ } ;; mov f65 = f0 ;; mov f73 = f0 ;; shladd r2 = K, BASE_SHIFT, r0 ;; { .mmi sub L = K, KK } ;; { .mmi #ifdef RT shladd AORIG = r2, 2, AORIG #else nop __LINE__ #endif } ;; { .mmf mov f81 = f0 } ;; { .mmi #if defined(LT) || defined(RN) shladd L = 
L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd AOFFSET = L, 2, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd BOFFSET = L, 2, BOFFSET #else nop __LINE__ #endif } ;; { .mmf mov f89 = f0 } ;; { .mmi #ifdef LT adds KK = 4, KK #elif defined LN adds KK = -4, KK #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; .align 8 .L070: tbit.z p6,p0 = M, 1 (p6) br.cond.dptk .L080 ;; { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 1 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE nop __LINE__ mov f65 = f0 } ;; #else { .mfi shladd BOFFSET = r3, 2, B mov f65 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE shladd AOFFSET = r3, 1, AORIG } ;; #endif ;; mov f73 = f0 ;; { .mfi mov f81 = f0 adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET mov f89 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mmf adds L = -1, L } ;; { .mmf cmp.eq p6, p0 = -1, L } ;; { .mib (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L078 } ;; .align 8 .L072: { .mfb lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 } { .mmf nop __LINE__ nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 br.cloop.sptk.few .L072 } ;; .L078: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -2, KK #else adds r2 = -4, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 2, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f80 = f34, f80 FSUB f88 = f35, f88 FSUB f65 = f36, f65 FSUB f73 = f37, f73 FSUB f81 = f38, f81 FSUB f89 = f39, f89 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * 
SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f72 = f34, f72 FSUB f73 = f35, f73 FSUB f80 = f36, f80 FSUB f81 = f37, f81 FSUB f88 = f38, f88 FSUB f89 = f39, f89 ;; #endif #ifdef LN adds AOFFSET = 2 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f34 = [AOFFSET] ;; FMPY f65 = f65, f32 FMPY f73 = f73, f32 FMPY f81 = f81, f32 FMPY f89 = f89, f32 ;; FNMA f64 = f65, f33, f64 FNMA f72 = f73, f33, f72 FNMA f80 = f81, f33, f80 FNMA f88 = f89, f33, f88 ;; FMPY f64 = f64, f34 FMPY f72 = f72, f34 FMPY f80 = f80, f34 FMPY f88 = f88, f34 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f65, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f73, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f81, SIZE ;; STFD [BOFFSET] = f88, - 3 * SIZE STFD [BOFFSET2] = f89, - 3 * SIZE ;; adds C1 = -2 * SIZE, C1 adds C2 = -2 * SIZE, C2 adds C3 = -2 * SIZE, C3 adds C4 = -2 * SIZE, C4 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f34 = [AOFFSET], - 3 * SIZE ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 FMPY f80 = f80, f32 FMPY f88 = f88, f32 ;; FNMA f65 = f64, f33, f65 FNMA f73 = f72, f33, f73 FNMA f81 = f80, f33, f81 FNMA f89 = f88, f33, f89 ;; FMPY f65 = f65, f34 FMPY f73 = f73, f34 FMPY f81 = f81, f34 FMPY f89 = f89, f34 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f65, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f73, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f81, SIZE ;; STFD [BOFFSET] = f88, -3 * SIZE STFD [BOFFSET2] = f89, -3 * SIZE ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f36 = [BOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f39, f40 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f41 = [BOFFSET], -15 * SIZE ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 ;; FNMA f72 = f64, f33, f72 FNMA f73 = f65, f33, f73 ;; FNMA f80 = f64, f34, f80 FNMA f81 = f65, f34, f81 ;; FNMA f88 = f64, f35, f88 FNMA f89 = f65, f35, f89 ;; FMPY f72 = f72, f36 FMPY f73 = f73, f36 ;; FNMA f80 = f72, f37, f80 FNMA f81 = f73, f37, f81 ;; FNMA f88 = f72, f38, f88 FNMA f89 = f73, f38, f89 ;; FMPY f80 = f80, f39 FMPY f81 = f81, f39 ;; FNMA f88 = f80, f40, f88 FNMA f89 = f81, f40, f89 ;; FMPY f88 = f88, f41 FMPY f89 = f89, f41 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f80, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f81, SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f88, SIZE ;; STFD [AOFFSET] = f73, -3 * SIZE STFD [AOFFSET2] = f89, -3 * SIZE ;; #endif #ifdef RT adds BOFFSET = 14 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f35, f34 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f36 = [BOFFSET], - 2 * SIZE ;; LDFPD f38, f37 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f40, f39 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFD f41 = [BOFFSET] ;; FMPY f88 = f88, f32 FMPY f89 = f89, f32 ;; FNMA f80 = f88, f33, f80 FNMA f81 = f89, f33, f81 ;; FNMA f72 = f88, f34, f72 FNMA f73 = f89, f34, f73 ;; FNMA f64 = f88, f35, f64 FNMA f65 = f89, f35, f65 ;; FMPY f80 = f80, f36 FMPY f81 = f81, f36 ;; FNMA f72 = f80, f37, f72 FNMA f73 = f81, f37, f73 ;; FNMA f64 = f80, f38, f64 FNMA f65 = f81, f38, f65 ;; FMPY f72 = f72, f39 FMPY f73 = f73, f39 ;; FNMA f64 = f72, f40, f64 FNMA f65 = f73, 
f40, f65 ;; FMPY f64 = f64, f41 FMPY f65 = f65, f41 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f80, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f81, SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f88, SIZE ;; STFD [AOFFSET] = f73, -3 * SIZE STFD [AOFFSET2] = f89, -3 * SIZE ;; #endif STFD [C1 ] = f64, SIZE mov f64 = f0 ;; #ifndef LN STFD [C1 ] = f65, SIZE #else STFD [C1 ] = f65, -SIZE #endif ;; STFD [C2 ] = f72, SIZE mov f72 = f0 ;; #ifndef LN STFD [C2 ] = f73, SIZE #else STFD [C2 ] = f73, -SIZE #endif ;; STFD [C3 ] = f80, SIZE mov f80 = f0 ;; #ifndef LN STFD [C3 ] = f81, SIZE #else STFD [C3 ] = f81, - SIZE #endif ;; STFD [C4 ] = f88, SIZE mov f88 = f0 ;; #ifndef LN STFD [C4 ] = f89, SIZE #else STFD [C4 ] = f89, -SIZE #endif ;; mov f96 = f0 ;; mov f104 = f0 ;; shladd r2 = K, BASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 1, AORIG #else nop __LINE__ #endif ;; mov f112 = f0 ;; { .mmi #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd AOFFSET = L, 1, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd BOFFSET = L, 2, BOFFSET #else nop __LINE__ #endif } ;; { .mmf mov f120 = f0 } ;; { .mmi #ifdef LT adds KK = 2, KK #elif defined LN adds KK = -2, KK #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; .align 8 .L080: tbit.z p6,p7 = M, 0 (p6) br.cond.dptk .L089 { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 0 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE } ;; #else { .mfi shladd BOFFSET = r3, 2, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE add AOFFSET = r3, AORIG } ;; #endif { .mmi adds L = 1, L adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mii (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi adds L = -1, L } ;; { .mmi cmp.eq p6, p0 = -1, L } ;; { .mib (p7) LDFD f32 = [AOFFSET], 1 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L088 } ;; .L082: { .mfb cmp.ne p4, p5 = 0, L FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi (p12) cmp.ne p3, p0 = 0, L FMA f72 = f32, f49, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfb (p3) LDFD f40 = [AOFFSET], 1 * SIZE FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mmf (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 } { .mmf nop __LINE__ nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 } ;; { .mib (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE nop __LINE__ nop __LINE__ } { .mmb nop __LINE__ adds L = -1, L br.cloop.sptk.few .L082 } ;; .L088: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -1, KK #else adds r2 = -4, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 2, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; 
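	// LN/LT path of the single-row tail: the LDFPD pair reloads the packed
	// values from B, and the FSUB group below forms (b - accumulated a*x) for
	// the four columns before the triangular solve.  The solve multiplies by
	// the stored diagonal entries (FMPY); these are presumably pre-inverted by
	// the packing routines, since no divide appears anywhere in this kernel.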
LDFPD f34, f35 = [BOFFSET] adds BOFFSET = -2 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f80 = f34, f80 FSUB f88 = f35, f88 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = -2 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f80 = f34, f80 FSUB f88 = f35, f88 ;; #endif #ifdef LN LDFD f32 = [AOFFSET] ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 FMPY f80 = f80, f32 FMPY f88 = f88, f32 ;; { .mmi STFD [BOFFSET] = f64, SIZE adds C1 = -1 * SIZE, C1 } ;; { .mmi STFD [BOFFSET] = f72, SIZE adds C2 = -1 * SIZE, C2 } ;; { .mmi STFD [BOFFSET] = f80, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f88, - 3 * SIZE } ;; adds C3 = -1 * SIZE, C3 adds C4 = -1 * SIZE, C4 ;; #endif #ifdef LT LDFD f32 = [AOFFSET] ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 FMPY f80 = f80, f32 FMPY f88 = f88, f32 ;; STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f72, SIZE ;; STFD [BOFFSET] = f80, SIZE ;; STFD [BOFFSET] = f88, -3 * SIZE ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f36 = [BOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f39, f40 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f41 = [BOFFSET], -15 * SIZE FMPY f64 = f64, f32 ;; FNMA f72 = f64, f33, f72 ;; FNMA f80 = f64, f34, f80 ;; FNMA f88 = f64, f35, f88 ;; FMPY f72 = f72, f36 ;; FNMA f80 = f72, f37, f80 ;; FNMA f88 = f72, f38, f88 ;; FMPY f80 = f80, f39 ;; FNMA f88 = f80, f40, f88 ;; FMPY f88 = f88, f41 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f72, SIZE ;; STFD [AOFFSET] = f80, SIZE ;; STFD [AOFFSET] = f88, -3 * SIZE ;; #endif #ifdef RT adds BOFFSET = 14 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f35, f34 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f36 = [BOFFSET], - 2 * SIZE ;; LDFPD f38, f37 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f40, f39 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFD f41 = [BOFFSET] ;; FMPY f88 = f88, f32 ;; FNMA f80 = f88, f33, f80 ;; FNMA f72 = f88, f34, f72 ;; FNMA f64 = f88, f35, f64 ;; FMPY f80 = f80, f36 ;; FNMA f72 = f80, f37, f72 ;; FNMA f64 = f80, f38, f64 ;; FMPY f72 = f72, f39 ;; FNMA f64 = f72, f40, f64 ;; FMPY f64 = f64, f41 ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f72, SIZE ;; STFD [AOFFSET] = f80, SIZE ;; STFD [AOFFSET] = f88, - 3 * SIZE ;; #endif #ifndef LN STFD [C1 ] = f64, SIZE #else STFD [C1 ] = f64 #endif #ifndef LN STFD [C2 ] = f72, SIZE #else STFD [C2 ] = f72 #endif #ifndef LN STFD [C3 ] = f80, SIZE #else STFD [C3 ] = f80 #endif #ifndef LN STFD [C4 ] = f88, SIZE #else STFD [C4 ] = f88 #endif ;; mov f64 = f0 mov f72 = f0 mov f80 = f0 mov f88 = f0 ;; shladd r2 = K, BASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT add AORIG = r2, AORIG #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) add AOFFSET = L, AOFFSET #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) shladd BOFFSET = L, 2, BOFFSET #else nop __LINE__ #endif ;; #ifdef LT adds KK = 1, KK #elif defined LN adds KK = -1, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; .align 8 .L089: #ifdef LN shladd KK8 = K, BASE_SHIFT, r0 ;; shladd B = KK8, 2, B #endif #if defined(LT) || defined(RN) mov B = BOFFSET #endif #ifdef RN adds KK = 4, KK #endif #ifdef RT adds KK = -4, KK #endif ;; mov AOFFSET = A ;; .align 16 .L000: shr 
J = N, 3 ;; cmp.ge p6, p0 = 0, J (p6) br.cond.dpnt .L999 ;; .align 8 .L010: #ifdef RT { .mmi shladd r3 = LDC, 3, r0 nop __LINE__ shl r2 = K, 3 + BASE_SHIFT } ;; { .mmi sub B = B, r2 sub C = C, r3 nop __LINE__ } #endif ;; { .mfi adds J = -1, J mov f64 = f0 shr I = M, 3 } { .mfi mov C1 = C // coffset1 = c + 0 * ldc mov f72 = f0 #ifdef LN add KK = M, OFFSET #elif defined LT mov KK = OFFSET #else nop __LINE__ #endif } ;; { .mmf cmp.eq p6, p7 = 0, I #if defined(LN) || defined(RT) mov AORIG = A #else mov AOFFSET = A #endif mov f80 = f0 } { .mmf add C2 = LDC, C // coffset2 = c + 1 * ldc shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc mov f88 = f0 } ;; { .mmf shladd C5 = LDC, 2, C // coffset5 = c + 4 * ldc #ifndef RT shladd C = LDC, 3, C // coffset += 8 * ldc #else nop __LINE__ #endif mov f96 = f0 } { .mmf shladd C4 = LDC, 1, C2 shladd C6 = LDC, 2, C2 mov f104 = f0 } ;; { .mfi shladd C7 = LDC, 2, C3 mov f112 = f0 #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif }{ .mfb shladd C8 = LDC, 2, C4 mov f120 = f0 (p6) br.cond.dpnt .L020 } ;; .align 16 .L011: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 3 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f65 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f73 = f0 } ;; #else { .mfi shladd BOFFSET = r3, 3, B mov f65 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f73 = f0 shladd AOFFSET = r3, 3, AORIG } ;; #endif { .mfb (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f81 = f0 nop __LINE__ } { .mmf (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE setf.d f119 = r0 mov f89 = f0 } ;; { .mmf (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE setf.d f97 = r0 mov f105 = f0 } { .mfb setf.d f113 = r0 mov f121 = f0 nop __LINE__ } ;; { .mmf (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE setf.d f66 = r0 mov f74 = f0 } { .mfb setf.d f82 = r0 mov f90 = f0 nop __LINE__ } ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE setf.d f98 = r0 mov f106 = f0 } { .mfb setf.d f114 = r0 mov f122 = f0 nop __LINE__ } ;; { .mmf (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE setf.d f67 = r0 mov f75 = f0 } { .mfi setf.d f83 = r0 mov f91 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mmf (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE setf.d f99 = r0 mov f107 = f0 } { .mfi setf.d f115 = r0 mov f123 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f68 = r0 mov f76 = f0 } { .mfi setf.d f84 = r0 mov f92 = f0 adds L = 1, L } ;; { .mmf CPREFETCH [PREC], LDC setf.d f100 = r0 mov f108 = f0 } { .mfi setf.d f116 = r0 mov f124 = f0 adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } ;; { .mmf CPREFETCH [PREC], LDC setf.d f69 = r0 mov f77 = f0 } { .mfi setf.d f85 = r0 mov f93 = f0 adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET } ;; { .mmf CPREFETCH [PREC], LDC setf.d f101 = r0 mov f109 = f0 } { .mfi setf.d f117 = r0 mov f125 = f0 tbit.z p12, p0 = L, 0 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f70 = r0 mov f78 = f0 } { .mfi setf.d f86 = r0 mov f94 = f0 shr L = L, 1 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f102 = r0 mov f110 = f0 } { .mfi setf.d f118 = r0 mov f126 = f0 adds L = -1, L } ;; { .mmf CPREFETCH [PREC], LDC setf.d f71 = r0 mov f79 = f0 } { .mfi setf.d f87 = r0 mov f95 = f0 mov ar.lc = L } ;; { .mmf CPREFETCH [PREC] setf.d f103 = r0 mov f111 = f0 } { .mfb cmp.eq p6, p0 = -1, L mov f127 = f0 (p6) br.cond.dpnt .L018 } ;; .align 16 .L012: /* 1 */ { .mfi lfetch.fault.nt1 
[PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi (p12) cmp.ne p3, p0 = 0, L FMA f72 = f32, f49, f72 // A1 * B2 nop __LINE__ } ;; /* 2 */ { .mfb lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfb cmp.ne p4, p5 = 0, L FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; /* 3 */ { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb adds C9 = 4 * SIZE, C1 FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;; /* 4 */ { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb adds C10 = 4 * SIZE, C2 FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;; /* 5 */ { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb adds C11 = 4 * SIZE, C3 FMA f73 = f33, f49, f73 // A2 * B2 nop __LINE__ } ;; /* 6 */ { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb adds C12 = 4 * SIZE, C4 FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; /* 7 */ { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb adds C13 = 4 * SIZE, C5 FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } ;; /* 8 */ { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb adds C14 = 4 * SIZE, C6 FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;; /* 9 */ { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb adds C15 = 4 * SIZE, C7 FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; /* 10 */ { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb adds C16 = 4 * SIZE, C8 FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; /* 11 */ { .mfb FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f106 = f34, f53, f106 // A3 * B6 nop __LINE__ } ;; /* 12 */ { .mfb FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f122 = f34, f55, f122 // A3 * B8 nop __LINE__ } ;; /* 13 */ { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; /* 14 */ { .mfb FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } ;; /* 15 */ { .mfb FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f107 = f35, f53, f107 // A4 * B6 nop __LINE__ } ;; /* 16 */ { .mfb FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f123 = f35, f55, f123 // A4 * B8 nop __LINE__ } ;; /* 17 */ { .mfb nop __LINE__ FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f76 = f36, f49, f76 // A5 * B2 nop __LINE__ } ;; /* 18 */ { .mfb nop __LINE__ FMA f84 = f36, f50, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f92 = f36, f51, f92 // A5 * B4 nop __LINE__ } ;; /* 19 */ { .mfb nop __LINE__ FMA f100 = f36, f52, f100 // A5 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f108 = f36, f53, f108 // A5 * B6 nop __LINE__ } ;; /* 20 */ { .mfb nop __LINE__ FMA f116 = f36, f54, f116 // A5 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f124 = f36, f55, f124 // A5 * B8 nop __LINE__ } ;; /* 21 */ { .mfb nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = f37, f49, f77 // A6 * B2 nop __LINE__ } ;; /* 22 */ { .mfb nop __LINE__ FMA f85 = f37, f50, f85 // A6 * B3 nop 
__LINE__ } { .mfb nop __LINE__ FMA f93 = f37, f51, f93 // A6 * B4 nop __LINE__ } ;; /* 23 */ { .mfb nop __LINE__ FMA f101 = f37, f52, f101 // A6 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f109 = f37, f53, f109 // A6 * B6 nop __LINE__ } ;; /* 24 */ { .mfb nop __LINE__ FMA f117 = f37, f54, f117 // A6 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f125 = f37, f55, f125 // A6 * B8 nop __LINE__ } ;; /* 25 */ { .mfb nop __LINE__ FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f78 = f38, f49, f78 // A7 * B2 nop __LINE__ } ;; /* 26 */ { .mfb nop __LINE__ FMA f86 = f38, f50, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f94 = f38, f51, f94 // A7 * B4 nop __LINE__ } ;; /* 27 */ { .mfb nop __LINE__ FMA f102 = f38, f52, f102 // A7 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f110 = f38, f53, f110 // A7 * B6 nop __LINE__ } ;; /* 28 */ { .mfb nop __LINE__ FMA f118 = f38, f54, f118 // A7 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f126 = f38, f55, f126 // A7 * B8 nop __LINE__ } ;; /* 29 */ { .mfb nop __LINE__ FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f79 = f39, f49, f79 // A8 * B2 nop __LINE__ } ;; /* 30 */ { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f87 = f39, f50, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f95 = f39, f51, f95 // A8 * B4 nop __LINE__ } ;; /* 31 */ { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f103 = f39, f52, f103 // A8 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f111 = f39, f53, f111 // A8 * B6 nop __LINE__ } ;; /* 32 */ { .mfb nop __LINE__ FMA f119 = f39, f54, f119 // A8 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f127 = f39, f55, f127 // A8 * B8 nop __LINE__ } ;; /* 33 */ { .mfb nop __LINE__ (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; /* 34 */ { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; /* 35 */ { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; /* 36 */ { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; /* 37 */ { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; /* 38 */ { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; /* 39 */ { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; /* 40 */ { .mfb nop __LINE__ (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f121 = f41, f63, f121 // A2 * B8 nop __LINE__ } ;; /* 41 */ { .mfb nop __LINE__ (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; /* 42 */ { .mfb nop __LINE__ (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; /* 43 */ { .mfb nop __LINE__ (p3) FMA f98 = f42, f60, f98 // 
A3 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f106 = f42, f61, f106 // A3 * B6 nop __LINE__ } ;; /* 44 */ { .mfb nop __LINE__ (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f122 = f42, f63, f122 // A3 * B8 nop __LINE__ } ;; /* 45 */ { .mfb nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; /* 46 */ { .mfb nop __LINE__ (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f91 = f43, f59, f91 // A4 * B4 nop __LINE__ } ;; /* 47 */ { .mfb nop __LINE__ (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f107 = f43, f61, f107 // A4 * B6 nop __LINE__ } ;; /* 48 */ { .mfb nop __LINE__ (p3) FMA f115 = f43, f62, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f123 = f43, f63, f123 // A4 * B8 nop __LINE__ } ;; /* 49 */ { .mfb nop __LINE__ (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f76 = f44, f57, f76 // A5 * B2 nop __LINE__ } ;; /* 50 */ { .mfb nop __LINE__ (p3) FMA f84 = f44, f58, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f92 = f44, f59, f92 // A5 * B4 nop __LINE__ } ;; /* 51 */ { .mfb nop __LINE__ (p3) FMA f100 = f44, f60, f100 // A5 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f108 = f44, f61, f108 // A5 * B6 nop __LINE__ } ;; /* 52 */ { .mfb nop __LINE__ (p3) FMA f116 = f44, f62, f116 // A5 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f124 = f44, f63, f124 // A5 * B8 nop __LINE__ } ;; /* 53 */ { .mfb nop __LINE__ (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f77 = f45, f57, f77 // A6 * B2 nop __LINE__ } ;; /* 54 */ { .mfb nop __LINE__ (p3) FMA f85 = f45, f58, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f93 = f45, f59, f93 // A6 * B4 nop __LINE__ } ;; /* 55 */ { .mfb nop __LINE__ (p3) FMA f101 = f45, f60, f101 // A6 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f109 = f45, f61, f109 // A6 * B6 nop __LINE__ } ;; /* 56 */ { .mfb nop __LINE__ (p3) FMA f117 = f45, f62, f117 // A6 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f125 = f45, f63, f125 // A6 * B8 nop __LINE__ } ;; /* 57 */ { .mfb nop __LINE__ (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f78 = f46, f57, f78 // A7 * B2 nop __LINE__ } ;; /* 58 */ { .mfb nop __LINE__ (p3) FMA f86 = f46, f58, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f94 = f46, f59, f94 // A7 * B4 nop __LINE__ } ;; /* 59 */ { .mfb nop __LINE__ (p3) FMA f102 = f46, f60, f102 // A7 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f110 = f46, f61, f110 // A7 * B6 nop __LINE__ } ;; /* 60 */ { .mfb nop __LINE__ (p3) FMA f118 = f46, f62, f118 // A7 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f126 = f46, f63, f126 // A7 * B8 nop __LINE__ } ;; /* 61 */ { .mfb nop __LINE__ (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f79 = f47, f57, f79 // A8 * B2 nop __LINE__ } ;; /* 62 */ { .mfb nop __LINE__ (p3) FMA f87 = f47, f58, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f95 = f47, f59, f95 // A8 * B4 nop __LINE__ } ;; /* 63 */ { .mfb nop __LINE__ (p3) FMA f103 = f47, f60, f103 // A8 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f111 = f47, f61, f111 // A8 * B6 nop __LINE__ } ;; /* 64 */ { .mfi nop __LINE__ (p3) FMA f119 = f47, f62, f119 // A8 * B7 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f127 = f47, f63, f127 // A8 * B8 br.cloop.sptk.few .L012 
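	// End of the unrolled 8x8 inner GEMM loop (.L012): f64-f127 now hold the
	// accumulated 8x8 block of products.  The .L018 code below subtracts this
	// block from the packed operand and applies the LN/LT/RN/RT triangular
	// solve before the results are written back.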
} ;; .L018: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -8, KK #else adds r2 = -8, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 3, AORIG shladd BOFFSET = r2, 3, B ;; #endif #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [BOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [BOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [BOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [BOFFSET], 2 * SIZE ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE FSUB f64 = f32, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB f72 = f33, f72 nop __LINE__ } ;; { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE FSUB f80 = f34, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f88 = f35, f88 nop __LINE__ } ;; { .mfi LDFPD f52, f53 = [BOFFSET], 2 * SIZE FSUB f96 = f36, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f104 = f37, f104 nop __LINE__ } ;; { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE FSUB f112 = f38, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f120 = f39, f120 nop __LINE__ } ;; { .mfi LDFPD f56, f57 = [BOFFSET], 2 * SIZE FSUB f65 = f40, f65 nop __LINE__ } { .mfi nop __LINE__ FSUB f73 = f41, f73 nop __LINE__ } ;; { .mfi LDFPD f58, f59 = [BOFFSET], 2 * SIZE FSUB f81 = f42, f81 nop __LINE__ } { .mfi nop __LINE__ FSUB f89 = f43, f89 nop __LINE__ } ;; { .mfi LDFPD f60, f61 = [BOFFSET], 2 * SIZE FSUB f97 = f44, f97 nop __LINE__ } { .mfi nop __LINE__ FSUB f105 = f45, f105 nop __LINE__ } ;; { .mfi LDFPD f62, f63 = [BOFFSET], 2 * SIZE FSUB f113 = f46, f113 nop __LINE__ } { .mfi nop __LINE__ FSUB f121 = f47, f121 nop __LINE__ } ;; { .mfi LDFPD f32, f33 = [BOFFSET], 2 * SIZE FSUB f66 = f48, f66 nop __LINE__ } { .mfi nop __LINE__ FSUB f74 = f49, f74 nop __LINE__ } ;; { .mfi LDFPD f34, f35 = [BOFFSET], 2 * SIZE FSUB f82 = f50, f82 nop __LINE__ } { .mfi nop __LINE__ FSUB f90 = f51, f90 nop __LINE__ } ;; { .mfi LDFPD f36, f37 = [BOFFSET], 2 * SIZE FSUB f98 = f52, f98 nop __LINE__ } { .mfi nop __LINE__ FSUB f106 = f53, f106 nop __LINE__ } ;; { .mfi LDFPD f38, f39 = [BOFFSET], 2 * SIZE FSUB f114 = f54, f114 nop __LINE__ } { .mfi nop __LINE__ FSUB f122 = f55, f122 nop __LINE__ } ;; { .mfi LDFPD f40, f41 = [BOFFSET], 2 * SIZE FSUB f67 = f56, f67 nop __LINE__ } { .mfi nop __LINE__ FSUB f75 = f57, f75 nop __LINE__ } ;; { .mfi LDFPD f42, f43 = [BOFFSET], 2 * SIZE FSUB f83 = f58, f83 nop __LINE__ } { .mfi nop __LINE__ FSUB f91 = f59, f91 nop __LINE__ } ;; { .mfi LDFPD f44, f45 = [BOFFSET], 2 * SIZE FSUB f99 = f60, f99 nop __LINE__ } { .mfi nop __LINE__ FSUB f107 = f61, f107 nop __LINE__ } ;; { .mfi LDFPD f46, f47 = [BOFFSET], 2 * SIZE FSUB f115 = f62, f115 nop __LINE__ } { .mfi nop __LINE__ FSUB f123 = f63, f123 nop __LINE__ } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE FSUB f68 = f32, f68 nop __LINE__ } { .mfi nop __LINE__ FSUB f76 = f33, f76 nop __LINE__ } ;; { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE FSUB f84 = f34, f84 nop __LINE__ } { .mfi nop __LINE__ FSUB f92 = f35, f92 nop __LINE__ } ;; { .mfi LDFPD f52, f53 = [BOFFSET], 2 * SIZE FSUB f100 = f36, f100 nop __LINE__ } { .mfi nop __LINE__ FSUB f108 = f37, f108 nop __LINE__ } ;; { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE FSUB f116 = f38, f116 nop __LINE__ } { .mfi nop __LINE__ FSUB f124 = f39, f124 nop __LINE__ } ;; { .mfi LDFPD f56, f57 = [BOFFSET], 2 * SIZE FSUB f69 = f40, f69 nop __LINE__ } { .mfi nop __LINE__ FSUB f77 = f41, f77 nop __LINE__ } ;; { .mfi LDFPD f58, f59 = [BOFFSET], 2 * SIZE FSUB f85 = f42, f85 nop __LINE__ } { .mfi nop 
__LINE__ FSUB f93 = f43, f93 nop __LINE__ } ;; { .mfi LDFPD f60, f61 = [BOFFSET], 2 * SIZE FSUB f101 = f44, f101 nop __LINE__ } { .mfi nop __LINE__ FSUB f109 = f45, f109 nop __LINE__ } ;; { .mfi LDFPD f62, f63 = [BOFFSET] FSUB f117 = f46, f117 adds BOFFSET = -62 * SIZE, BOFFSET } { .mfi nop __LINE__ FSUB f125 = f47, f125 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f70 = f48, f70 #ifdef LN adds AOFFSET = 62 * SIZE, AOFFSET #else nop __LINE__ #endif } { .mfi nop __LINE__ FSUB f78 = f49, f78 nop __LINE__ } { .mfi nop __LINE__ FSUB f86 = f50, f86 nop __LINE__ } { .mfi nop __LINE__ FSUB f94 = f51, f94 nop __LINE__ } ;; { .mfi #ifdef LN LDFPD f33, f32 = [AOFFSET] #else LDFPD f32, f33 = [AOFFSET] #endif FSUB f102 = f52, f102 nop __LINE__ } { .mfi nop __LINE__ FSUB f110 = f53, f110 nop __LINE__ } { .mfi nop __LINE__ FSUB f118 = f54, f118 nop __LINE__ } { .mfi nop __LINE__ FSUB f126 = f55, f126 #ifdef LN adds AOFFSET = - 2 * SIZE, AOFFSET #else adds AOFFSET = 2 * SIZE, AOFFSET #endif } ;; { .mfi nop __LINE__ FSUB f71 = f56, f71 nop __LINE__ } { .mfi nop __LINE__ FSUB f79 = f57, f79 nop __LINE__ } { .mfi nop __LINE__ FSUB f87 = f58, f87 nop __LINE__ } { .mfi nop __LINE__ FSUB f95 = f59, f95 nop __LINE__ } { .mfi nop __LINE__ FSUB f103 = f60, f103 nop __LINE__ } { .mfi nop __LINE__ FSUB f111 = f61, f111 nop __LINE__ } { .mfi nop __LINE__ FSUB f119 = f62, f119 nop __LINE__ } { .mfi nop __LINE__ FSUB f127 = f63, f127 nop __LINE__ } ;; #else adds AOFFSET2 = 4 * SIZE, AOFFSET ;; LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [AOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [AOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [AOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [AOFFSET], 2 * SIZE ;; { .mfi LDFPD f48, f49 = [AOFFSET], 2 * SIZE FSUB f64 = f32, f64 } { .mfi FSUB f65 = f33, f65 } ;; { .mfi LDFPD f50, f51 = [AOFFSET], 2 * SIZE FSUB f66 = f34, f66 } { .mfi FSUB f67 = f35, f67 } ;; { .mfi LDFPD f52, f53 = [AOFFSET], 2 * SIZE FSUB f68 = f36, f68 } { .mfi FSUB f69 = f37, f69 } ;; { .mfi LDFPD f54, f55 = [AOFFSET], 2 * SIZE FSUB f70 = f38, f70 } { .mfi FSUB f71 = f39, f71 } ;; { .mfi LDFPD f56, f57 = [AOFFSET], 2 * SIZE FSUB f72 = f40, f72 } { .mfi FSUB f73 = f41, f73 } ;; { .mfi LDFPD f58, f59 = [AOFFSET], 2 * SIZE FSUB f74 = f42, f74 } { .mfi FSUB f75 = f43, f75 } ;; { .mfi LDFPD f60, f61 = [AOFFSET], 2 * SIZE FSUB f76 = f44, f76 } { .mfi FSUB f77 = f45, f77 } ;; { .mfi LDFPD f62, f63 = [AOFFSET], 2 * SIZE FSUB f78 = f46, f78 } { .mfi FSUB f79 = f47, f79 } ;; { .mfi LDFPD f32, f33 = [AOFFSET], 2 * SIZE FSUB f80 = f48, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f81 = f49, f81 nop __LINE__ } ;; { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE FSUB f82 = f50, f82 nop __LINE__ } { .mfi nop __LINE__ FSUB f83 = f51, f83 nop __LINE__ } ;; { .mfi LDFPD f36, f37 = [AOFFSET], 2 * SIZE FSUB f84 = f52, f84 nop __LINE__ } { .mfi nop __LINE__ FSUB f85 = f53, f85 nop __LINE__ } ;; { .mfi LDFPD f38, f39 = [AOFFSET], 2 * SIZE FSUB f86 = f54, f86 nop __LINE__ } { .mfi nop __LINE__ FSUB f87 = f55, f87 nop __LINE__ } ;; { .mfi LDFPD f40, f41 = [AOFFSET], 2 * SIZE FSUB f88 = f56, f88 nop __LINE__ } { .mfi nop __LINE__ FSUB f89 = f57, f89 nop __LINE__ } ;; { .mfi LDFPD f42, f43 = [AOFFSET], 2 * SIZE FSUB f90 = f58, f90 nop __LINE__ } { .mfi nop __LINE__ FSUB f91 = f59, f91 nop __LINE__ } ;; { .mfi LDFPD f44, f45 = [AOFFSET], 2 * SIZE FSUB f92 = f60, f92 nop __LINE__ } { .mfi nop __LINE__ FSUB f93 = f61, f93 nop __LINE__ } ;; { 
.mfi LDFPD f46, f47 = [AOFFSET], 2 * SIZE FSUB f94 = f62, f94 nop __LINE__ } { .mfi nop __LINE__ FSUB f95 = f63, f95 nop __LINE__ } ;; { .mfi LDFPD f48, f49 = [AOFFSET], 2 * SIZE FSUB f96 = f32, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f97 = f33, f97 nop __LINE__ } ;; { .mfi LDFPD f50, f51 = [AOFFSET], 2 * SIZE FSUB f98 = f34, f98 nop __LINE__ } { .mfi nop __LINE__ FSUB f99 = f35, f99 nop __LINE__ } ;; { .mfi LDFPD f52, f53 = [AOFFSET], 2 * SIZE FSUB f100 = f36, f100 nop __LINE__ } { .mfi nop __LINE__ FSUB f101 = f37, f101 nop __LINE__ } ;; { .mfi LDFPD f54, f55 = [AOFFSET], 2 * SIZE FSUB f102 = f38, f102 nop __LINE__ } { .mfi nop __LINE__ FSUB f103 = f39, f103 nop __LINE__ } ;; { .mfi LDFPD f56, f57 = [AOFFSET], 2 * SIZE FSUB f104 = f40, f104 nop __LINE__ } { .mfi nop __LINE__ FSUB f105 = f41, f105 nop __LINE__ } ;; { .mfi LDFPD f58, f59 = [AOFFSET], 2 * SIZE FSUB f106 = f42, f106 nop __LINE__ } { .mfi nop __LINE__ FSUB f107 = f43, f107 nop __LINE__ } ;; { .mfi LDFPD f60, f61 = [AOFFSET], 2 * SIZE FSUB f108 = f44, f108 nop __LINE__ } { .mfi nop __LINE__ FSUB f109 = f45, f109 nop __LINE__ } ;; { .mfi LDFPD f62, f63 = [AOFFSET] FSUB f110 = f46, f110 adds AOFFSET = -62 * SIZE, AOFFSET } { .mfi nop __LINE__ FSUB f111 = f47, f111 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f112 = f48, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f113 = f49, f113 nop __LINE__ } { .mfi nop __LINE__ FSUB f114 = f50, f114 nop __LINE__ } { .mfi nop __LINE__ FSUB f115 = f51, f115 nop __LINE__ } { .mfi nop __LINE__ FSUB f116 = f52, f116 nop __LINE__ } { .mfi nop __LINE__ FSUB f117 = f53, f117 nop __LINE__ } { .mfi nop __LINE__ FSUB f118 = f54, f118 nop __LINE__ } { .mfi nop __LINE__ FSUB f119 = f55, f119 nop __LINE__ } { .mfi nop __LINE__ FSUB f120 = f56, f120 nop __LINE__ } { .mfi nop __LINE__ FSUB f121 = f57, f121 nop __LINE__ } { .mfi nop __LINE__ FSUB f122 = f58, f122 nop __LINE__ } { .mfi nop __LINE__ FSUB f123 = f59, f123 nop __LINE__ } { .mfi nop __LINE__ FSUB f124 = f60, f124 nop __LINE__ } { .mfi nop __LINE__ FSUB f125 = f61, f125 nop __LINE__ } { .mfi nop __LINE__ FSUB f126 = f62, f126 nop __LINE__ } { .mfi nop __LINE__ FSUB f127 = f63, f127 nop __LINE__ } ;; #endif #ifdef LN { .mfi LDFPD f35, f34 = [AOFFSET] FMPY f71 = f71, f32 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f103 = f103, f32 adds BOFFSET2 = 4 * SIZE, BOFFSET } ;; { .mfi LDFPD f37, f36 = [AOFFSET] FMPY f79 = f79, f32 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f111 = f111, f32 nop __LINE__ } ;; { .mfi LDFPD f39, f38 = [AOFFSET] FMPY f87 = f87, f32 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f119 = f119, f32 nop __LINE__ } ;; { .mfi LDFD f40 = [AOFFSET], -2 * SIZE FMPY f95 = f95, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f127 = f127, f32 nop __LINE__ } ;; { .mfi LDFPD f42, f41 = [AOFFSET] FNMA f70 = f71, f33, f70 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f102 = f103, f33, f102 nop __LINE__ } ;; { .mfi LDFPD f44, f43 = [AOFFSET] FNMA f78 = f79, f33, f78 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f110 = f111, f33, f110 nop __LINE__ } ;; { .mfi LDFPD f46, f45 = [AOFFSET] FNMA f86 = f87, f33, f86 adds AOFFSET = - 4 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f118 = f119, f33, f118 nop __LINE__ } ;; { .mfi LDFPD f48, f47 = [AOFFSET] FNMA f94 = f95, f33, f94 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f126 = f127, f33, f126 nop __LINE__ } ;; { .mfi LDFPD f50, f49 = [AOFFSET] FNMA f69 = f71, f34, f69 adds AOFFSET = - 2 * SIZE, AOFFSET } { 
.mfi nop __LINE__ FNMA f101 = f103, f34, f101 nop __LINE__ } ;; { .mfi LDFPD f52, f51 = [AOFFSET] FNMA f77 = f79, f34, f77 adds AOFFSET = - 4 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f109 = f111, f34, f109 nop __LINE__ } ;; { .mfi LDFD f53 = [AOFFSET], -2 * SIZE FNMA f85 = f87, f34, f85 nop __LINE__ } { .mfi nop __LINE__ FNMA f117 = f119, f34, f117 nop __LINE__ } ;; { .mfi LDFPD f55, f54 = [AOFFSET] FNMA f93 = f95, f34, f93 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f125 = f127, f34, f125 nop __LINE__ } ;; { .mfi LDFPD f57, f56 = [AOFFSET] FNMA f68 = f71, f35, f68 adds AOFFSET = - 6 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f100 = f103, f35, f100 nop __LINE__ } ;; { .mfi LDFPD f59, f58 = [AOFFSET] FNMA f76 = f79, f35, f76 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f108 = f111, f35, f108 nop __LINE__ } ;; { .mfi LDFPD f61, f60 = [AOFFSET] FNMA f84 = f87, f35, f84 adds AOFFSET = - 6 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f116 = f119, f35, f116 nop __LINE__ } ;; { .mfi LDFD f16 = [AOFFSET], -2 * SIZE FNMA f92 = f95, f35, f92 nop __LINE__ } { .mfi nop __LINE__ FNMA f124 = f127, f35, f124 nop __LINE__ } ;; { .mfi LDFPD f18, f17 = [AOFFSET] FNMA f67 = f71, f36, f67 adds AOFFSET = - 8 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f99 = f103, f36, f99 nop __LINE__ } ;; { .mfi LDFPD f20, f19 = [AOFFSET] FNMA f75 = f79, f36, f75 adds AOFFSET = - 8 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f107 = f111, f36, f107 nop __LINE__ } ;; { .mfi LDFD f21 = [AOFFSET] FNMA f83 = f87, f36, f83 adds BOFFSET = 56 * SIZE, BOFFSET } { .mfi FNMA f115 = f119, f36, f115 adds BOFFSET2 = 56 * SIZE, BOFFSET2 } ;; FNMA f91 = f95, f36, f91 FNMA f123 = f127, f36, f123 ;; FNMA f66 = f71, f37, f66 FNMA f98 = f103, f37, f98 FNMA f74 = f79, f37, f74 FNMA f106 = f111, f37, f106 FNMA f82 = f87, f37, f82 FNMA f114 = f119, f37, f114 FNMA f90 = f95, f37, f90 FNMA f122 = f127, f37, f122 ;; FNMA f65 = f71, f38, f65 FNMA f97 = f103, f38, f97 FNMA f73 = f79, f38, f73 FNMA f105 = f111, f38, f105 FNMA f81 = f87, f38, f81 FNMA f113 = f119, f38, f113 FNMA f89 = f95, f38, f89 FNMA f121 = f127, f38, f121 ;; FNMA f64 = f71, f39, f64 FNMA f96 = f103, f39, f96 FNMA f72 = f79, f39, f72 FNMA f104 = f111, f39, f104 FNMA f80 = f87, f39, f80 FNMA f112 = f119, f39, f112 FNMA f88 = f95, f39, f88 FNMA f120 = f127, f39, f120 ;; FMPY f70 = f70, f40 FMPY f102 = f102, f40 FMPY f78 = f78, f40 FMPY f110 = f110, f40 FMPY f86 = f86, f40 FMPY f118 = f118, f40 FMPY f94 = f94, f40 FMPY f126 = f126, f40 ;; FNMA f69 = f70, f41, f69 FNMA f101 = f102, f41, f101 FNMA f77 = f78, f41, f77 FNMA f109 = f110, f41, f109 FNMA f85 = f86, f41, f85 FNMA f117 = f118, f41, f117 FNMA f93 = f94, f41, f93 FNMA f125 = f126, f41, f125 ;; FNMA f68 = f70, f42, f68 FNMA f100 = f102, f42, f100 FNMA f76 = f78, f42, f76 FNMA f108 = f110, f42, f108 FNMA f84 = f86, f42, f84 FNMA f116 = f118, f42, f116 FNMA f92 = f94, f42, f92 FNMA f124 = f126, f42, f124 ;; FNMA f67 = f70, f43, f67 FNMA f99 = f102, f43, f99 FNMA f75 = f78, f43, f75 FNMA f107 = f110, f43, f107 FNMA f83 = f86, f43, f83 FNMA f115 = f118, f43, f115 FNMA f91 = f94, f43, f91 FNMA f123 = f126, f43, f123 ;; FNMA f66 = f70, f44, f66 FNMA f98 = f102, f44, f98 FNMA f74 = f78, f44, f74 FNMA f106 = f110, f44, f106 FNMA f82 = f86, f44, f82 FNMA f114 = f118, f44, f114 FNMA f90 = f94, f44, f90 FNMA f122 = f126, f44, f122 ;; FNMA f65 = f70, f45, f65 FNMA f97 = f102, f45, f97 FNMA f73 = f78, f45, f73 FNMA f105 = f110, f45, f105 FNMA f81 = f86, f45, f81 FNMA f113 = f118, f45, f113 FNMA f89 = f94, f45, f89 
FNMA f121 = f126, f45, f121 ;; FNMA f64 = f70, f46, f64 FNMA f96 = f102, f46, f96 FNMA f72 = f78, f46, f72 FNMA f104 = f110, f46, f104 FNMA f80 = f86, f46, f80 FNMA f112 = f118, f46, f112 FNMA f88 = f94, f46, f88 FNMA f120 = f126, f46, f120 ;; FMPY f69 = f69, f47 FMPY f101 = f101, f47 FMPY f77 = f77, f47 FMPY f109 = f109, f47 FMPY f85 = f85, f47 FMPY f117 = f117, f47 FMPY f93 = f93, f47 FMPY f125 = f125, f47 ;; FNMA f68 = f69, f48, f68 FNMA f100 = f101, f48, f100 FNMA f76 = f77, f48, f76 FNMA f108 = f109, f48, f108 FNMA f84 = f85, f48, f84 FNMA f116 = f117, f48, f116 FNMA f92 = f93, f48, f92 FNMA f124 = f125, f48, f124 ;; FNMA f67 = f69, f49, f67 FNMA f99 = f101, f49, f99 FNMA f75 = f77, f49, f75 FNMA f107 = f109, f49, f107 FNMA f83 = f85, f49, f83 FNMA f115 = f117, f49, f115 FNMA f91 = f93, f49, f91 FNMA f123 = f125, f49, f123 ;; FNMA f66 = f69, f50, f66 FNMA f98 = f101, f50, f98 FNMA f74 = f77, f50, f74 FNMA f106 = f109, f50, f106 FNMA f82 = f85, f50, f82 FNMA f114 = f117, f50, f114 FNMA f90 = f93, f50, f90 FNMA f122 = f125, f50, f122 ;; FNMA f65 = f69, f51, f65 FNMA f97 = f101, f51, f97 FNMA f73 = f77, f51, f73 FNMA f105 = f109, f51, f105 FNMA f81 = f85, f51, f81 FNMA f113 = f117, f51, f113 FNMA f89 = f93, f51, f89 FNMA f121 = f125, f51, f121 ;; FNMA f64 = f69, f52, f64 FNMA f96 = f101, f52, f96 FNMA f72 = f77, f52, f72 FNMA f104 = f109, f52, f104 FNMA f80 = f85, f52, f80 FNMA f112 = f117, f52, f112 FNMA f88 = f93, f52, f88 FNMA f120 = f125, f52, f120 ;; FMPY f68 = f68, f53 FMPY f100 = f100, f53 FMPY f76 = f76, f53 FMPY f108 = f108, f53 FMPY f84 = f84, f53 FMPY f116 = f116, f53 FMPY f92 = f92, f53 FMPY f124 = f124, f53 ;; FNMA f67 = f68, f54, f67 FNMA f99 = f100, f54, f99 FNMA f75 = f76, f54, f75 FNMA f107 = f108, f54, f107 FNMA f83 = f84, f54, f83 FNMA f115 = f116, f54, f115 FNMA f91 = f92, f54, f91 FNMA f123 = f124, f54, f123 ;; FNMA f66 = f68, f55, f66 FNMA f98 = f100, f55, f98 FNMA f74 = f76, f55, f74 FNMA f106 = f108, f55, f106 FNMA f82 = f84, f55, f82 FNMA f114 = f116, f55, f114 FNMA f90 = f92, f55, f90 FNMA f122 = f124, f55, f122 ;; FNMA f65 = f68, f56, f65 FNMA f97 = f100, f56, f97 FNMA f73 = f76, f56, f73 FNMA f105 = f108, f56, f105 FNMA f81 = f84, f56, f81 FNMA f113 = f116, f56, f113 FNMA f89 = f92, f56, f89 FNMA f121 = f124, f56, f121 ;; FNMA f64 = f68, f57, f64 FNMA f96 = f100, f57, f96 FNMA f72 = f76, f57, f72 FNMA f104 = f108, f57, f104 FNMA f80 = f84, f57, f80 FNMA f112 = f116, f57, f112 FNMA f88 = f92, f57, f88 FNMA f120 = f124, f57, f120 ;; FMPY f67 = f67, f58 FMPY f99 = f99, f58 FMPY f75 = f75, f58 FMPY f107 = f107, f58 FMPY f83 = f83, f58 FMPY f115 = f115, f58 FMPY f91 = f91, f58 FMPY f123 = f123, f58 ;; FNMA f66 = f67, f59, f66 FNMA f98 = f99, f59, f98 FNMA f74 = f75, f59, f74 FNMA f106 = f107, f59, f106 FNMA f82 = f83, f59, f82 FNMA f114 = f115, f59, f114 FNMA f90 = f91, f59, f90 FNMA f122 = f123, f59, f122 ;; FNMA f65 = f67, f60, f65 FNMA f97 = f99, f60, f97 FNMA f73 = f75, f60, f73 FNMA f105 = f107, f60, f105 FNMA f81 = f83, f60, f81 FNMA f113 = f115, f60, f113 FNMA f89 = f91, f60, f89 FNMA f121 = f123, f60, f121 ;; { .mfi STFD [BOFFSET] = f71, SIZE FNMA f64 = f67, f61, f64 } { .mfi STFD [BOFFSET2] = f103, SIZE FNMA f96 = f99, f61, f96 } ;; { .mfi STFD [BOFFSET] = f79, SIZE FNMA f72 = f75, f61, f72 } { .mfi STFD [BOFFSET2] = f111, SIZE FNMA f104 = f107, f61, f104 } ;; { .mfi STFD [BOFFSET] = f87, SIZE FNMA f80 = f83, f61, f80 } { .mfi STFD [BOFFSET2] = f119, SIZE FNMA f112 = f115, f61, f112 } ;; { .mfi STFD [BOFFSET] = f95, - 11 * SIZE FNMA f88 = f91, f61, f88 } { 
.mfi STFD [BOFFSET2] = f127, - 11 * SIZE FNMA f120 = f123, f61, f120 } ;; { .mfi STFD [BOFFSET] = f70, SIZE FMPY f66 = f66, f16 } { .mfi STFD [BOFFSET2] = f102, SIZE FMPY f98 = f98, f16 } ;; { .mfi STFD [BOFFSET] = f78, SIZE FMPY f74 = f74, f16 } { .mfi STFD [BOFFSET2] = f110, SIZE FMPY f106 = f106, f16 } ;; { .mfi STFD [BOFFSET] = f86, SIZE FMPY f82 = f82, f16 } { .mfi STFD [BOFFSET2] = f118, SIZE FMPY f114 = f114, f16 } ;; { .mfi STFD [BOFFSET] = f94, - 11 * SIZE FMPY f90 = f90, f16 } { .mfi STFD [BOFFSET2] = f126, - 11 * SIZE FMPY f122 = f122, f16 } ;; { .mfi STFD [BOFFSET] = f69, SIZE FNMA f65 = f66, f17, f65 } { .mfi STFD [BOFFSET2] = f101, SIZE FNMA f97 = f98, f17, f97 } ;; { .mfi STFD [BOFFSET] = f77, SIZE FNMA f73 = f74, f17, f73 } { .mfi STFD [BOFFSET2] = f109, SIZE FNMA f105 = f106, f17, f105 } ;; { .mfi STFD [BOFFSET] = f85, SIZE FNMA f81 = f82, f17, f81 } { .mfi STFD [BOFFSET2] = f117, SIZE FNMA f113 = f114, f17, f113 } ;; { .mfi STFD [BOFFSET] = f93, - 11 * SIZE FNMA f89 = f90, f17, f89 } { .mfi STFD [BOFFSET2] = f125, - 11 * SIZE FNMA f121 = f122, f17, f121 } ;; { .mfi STFD [BOFFSET] = f68, SIZE FNMA f64 = f66, f18, f64 } { .mfi STFD [BOFFSET2] = f100, SIZE FNMA f96 = f98, f18, f96 } ;; { .mfi STFD [BOFFSET] = f76, SIZE FNMA f72 = f74, f18, f72 } { .mfi STFD [BOFFSET2] = f108, SIZE FNMA f104 = f106, f18, f104 } ;; { .mfi STFD [BOFFSET] = f84, SIZE FNMA f80 = f82, f18, f80 } { .mfi STFD [BOFFSET2] = f116, SIZE FNMA f112 = f114, f18, f112 } ;; { .mfi STFD [BOFFSET] = f92, - 11 * SIZE FNMA f88 = f90, f18, f88 } { .mfi STFD [BOFFSET2] = f124, - 11 * SIZE FNMA f120 = f122, f18, f120 } ;; { .mfi STFD [BOFFSET] = f67, SIZE FMPY f65 = f65, f19 } { .mfi STFD [BOFFSET2] = f99, SIZE FMPY f97 = f97, f19 } ;; { .mfi STFD [BOFFSET] = f75, SIZE FMPY f73 = f73, f19 } { .mfi STFD [BOFFSET2] = f107, SIZE FMPY f105 = f105, f19 } ;; { .mfi STFD [BOFFSET] = f83, SIZE FMPY f81 = f81, f19 } { .mfi STFD [BOFFSET2] = f115, SIZE FMPY f113 = f113, f19 } ;; { .mfi STFD [BOFFSET] = f91, - 11 * SIZE FMPY f89 = f89, f19 } { .mfi STFD [BOFFSET2] = f123, - 11 * SIZE FMPY f121 = f121, f19 } ;; { .mfi STFD [BOFFSET] = f66, SIZE FNMA f64 = f65, f20, f64 } { .mfi STFD [BOFFSET2] = f98, SIZE FNMA f96 = f97, f20, f96 } ;; { .mfi STFD [BOFFSET] = f74, SIZE FNMA f72 = f73, f20, f72 } { .mfi STFD [BOFFSET2] = f106, SIZE FNMA f104 = f105, f20, f104 } ;; { .mfi STFD [BOFFSET] = f82, SIZE FNMA f80 = f81, f20, f80 } { .mfi STFD [BOFFSET2] = f114, SIZE FNMA f112 = f113, f20, f112 } ;; { .mfi STFD [BOFFSET] = f90, -11 * SIZE FNMA f88 = f89, f20, f88 } { .mfi STFD [BOFFSET2] = f122, -11 * SIZE FNMA f120 = f121, f20, f120 } ;; { .mfi STFD [BOFFSET] = f65, SIZE FMPY f64 = f64, f21 } { .mfi STFD [BOFFSET2] = f97, SIZE FMPY f96 = f96, f21 } ;; { .mfi STFD [BOFFSET] = f73, SIZE FMPY f72 = f72, f21 } { .mfi STFD [BOFFSET2] = f105, SIZE FMPY f104 = f104, f21 } ;; { .mfi STFD [BOFFSET] = f81, SIZE FMPY f80 = f80, f21 } { .mfi STFD [BOFFSET2] = f113, SIZE FMPY f112 = f112, f21 } ;; { .mfi STFD [BOFFSET] = f89, - 11 * SIZE FMPY f88 = f88, f21 } { .mfi STFD [BOFFSET2] = f121, - 11 * SIZE FMPY f120 = f120, f21 } ;; { .mmi STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE adds C1 = -8 * SIZE, C1 } ;; { .mmi STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f104, SIZE adds C2 = -8 * SIZE, C2 } ;; { .mmi STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f88, - 3 * SIZE STFD [BOFFSET2] = f120, - 3 * SIZE adds C9 = 4 * SIZE, C1 } ;; #endif #ifdef LT { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE 
FMPY f64 = f64, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f96 = f96, f32 adds BOFFSET2 = 4 * SIZE, BOFFSET } ;; { .mfi LDFPD f36, f37 = [AOFFSET], 2 * SIZE FMPY f72 = f72, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f104 = f104, f32 nop __LINE__ } ;; { .mfi LDFPD f38, f39 = [AOFFSET] FMPY f80 = f80, f32 adds AOFFSET = 3 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f112 = f112, f32 nop __LINE__ } ;; { .mfi LDFD f40 = [AOFFSET], 1 * SIZE FMPY f88 = f88, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f120 = f120, f32 nop __LINE__ } ;; { .mfi LDFPD f41, f42 = [AOFFSET], 2 * SIZE FNMA f65 = f64, f33, f65 nop __LINE__ } { .mfi nop __LINE__ FNMA f97 = f96, f33, f97 nop __LINE__ } ;; { .mfi LDFPD f43, f44 = [AOFFSET], 2 * SIZE FNMA f73 = f72, f33, f73 nop __LINE__ } { .mfi nop __LINE__ FNMA f105 = f104, f33, f105 nop __LINE__ } ;; { .mfi LDFPD f45, f46 = [AOFFSET] FNMA f81 = f80, f33, f81 adds AOFFSET = 4 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f113 = f112, f33, f113 nop __LINE__ } ;; { .mfi LDFPD f47, f48 = [AOFFSET], 2 * SIZE FNMA f89 = f88, f33, f89 nop __LINE__ } { .mfi nop __LINE__ FNMA f121 = f120, f33, f121 nop __LINE__ } ;; { .mfi LDFPD f49, f50 = [AOFFSET], 2 * SIZE FNMA f66 = f64, f34, f66 nop __LINE__ } { .mfi nop __LINE__ FNMA f98 = f96, f34, f98 nop __LINE__ } ;; { .mfi LDFPD f51, f52 = [AOFFSET] FNMA f74 = f72, f34, f74 adds AOFFSET = 5 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f106 = f104, f34, f106 nop __LINE__ } ;; { .mfi LDFD f53 = [AOFFSET], 1 * SIZE FNMA f82 = f80, f34, f82 nop __LINE__ } { .mfi nop __LINE__ FNMA f114 = f112, f34, f114 nop __LINE__ } ;; { .mfi LDFPD f54, f55 = [AOFFSET], 2 * SIZE FNMA f90 = f88, f34, f90 nop __LINE__ } { .mfi nop __LINE__ FNMA f122 = f120, f34, f122 nop __LINE__ } ;; { .mfi LDFPD f56, f57 = [AOFFSET] FNMA f67 = f64, f35, f67 adds AOFFSET = 6 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f99 = f96, f35, f99 nop __LINE__ } ;; { .mfi LDFPD f58, f59 = [AOFFSET], 2 * SIZE FNMA f75 = f72, f35, f75 nop __LINE__ } { .mfi nop __LINE__ FNMA f107 = f104, f35, f107 nop __LINE__ } ;; { .mfi LDFPD f60, f61 = [AOFFSET] FNMA f83 = f80, f35, f83 adds AOFFSET = 7 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f115 = f112, f35, f115 nop __LINE__ } ;; { .mfi LDFD f16 = [AOFFSET], 1 * SIZE FNMA f91 = f88, f35, f91 nop __LINE__ } { .mfi nop __LINE__ FNMA f123 = f120, f35, f123 nop __LINE__ } ;; { .mfi LDFPD f17, f18 = [AOFFSET] FNMA f68 = f64, f36, f68 adds AOFFSET = 8 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f100 = f96, f36, f100 nop __LINE__ } ;; { .mfi LDFPD f19, f20 = [AOFFSET] FNMA f76 = f72, f36, f76 adds AOFFSET = 9 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f108 = f104, f36, f108 nop __LINE__ } ;; { .mfi LDFD f21 = [AOFFSET] FNMA f84 = f80, f36, f84 adds AOFFSET = -63 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f116 = f112, f36, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f92 = f88, f36, f92 nop __LINE__ } { .mfi nop __LINE__ FNMA f124 = f120, f36, f124 nop __LINE__ } ;; FNMA f69 = f64, f37, f69 FNMA f101 = f96, f37, f101 FNMA f77 = f72, f37, f77 FNMA f109 = f104, f37, f109 FNMA f85 = f80, f37, f85 FNMA f117 = f112, f37, f117 FNMA f93 = f88, f37, f93 FNMA f125 = f120, f37, f125 ;; FNMA f70 = f64, f38, f70 FNMA f102 = f96, f38, f102 FNMA f78 = f72, f38, f78 FNMA f110 = f104, f38, f110 FNMA f86 = f80, f38, f86 FNMA f118 = f112, f38, f118 FNMA f94 = f88, f38, f94 FNMA f126 = f120, f38, f126 ;; FNMA f71 = f64, f39, f71 FNMA f103 = f96, f39, f103 FNMA f79 = f72, f39, f79 FNMA f111 = f104, f39, f111 FNMA f87 = f80, f39, f87 FNMA f119 = f112, f39, f119 FNMA f95 = 
f88, f39, f95 FNMA f127 = f120, f39, f127 ;; FMPY f65 = f65, f40 FMPY f97 = f97, f40 FMPY f73 = f73, f40 FMPY f105 = f105, f40 FMPY f81 = f81, f40 FMPY f113 = f113, f40 FMPY f89 = f89, f40 FMPY f121 = f121, f40 ;; FNMA f66 = f65, f41, f66 FNMA f98 = f97, f41, f98 FNMA f74 = f73, f41, f74 FNMA f106 = f105, f41, f106 FNMA f82 = f81, f41, f82 FNMA f114 = f113, f41, f114 FNMA f90 = f89, f41, f90 FNMA f122 = f121, f41, f122 FNMA f67 = f65, f42, f67 FNMA f99 = f97, f42, f99 FNMA f75 = f73, f42, f75 FNMA f107 = f105, f42, f107 FNMA f83 = f81, f42, f83 FNMA f115 = f113, f42, f115 FNMA f91 = f89, f42, f91 FNMA f123 = f121, f42, f123 ;; FNMA f68 = f65, f43, f68 FNMA f100 = f97, f43, f100 FNMA f76 = f73, f43, f76 FNMA f108 = f105, f43, f108 FNMA f84 = f81, f43, f84 FNMA f116 = f113, f43, f116 FNMA f92 = f89, f43, f92 FNMA f124 = f121, f43, f124 ;; FNMA f69 = f65, f44, f69 FNMA f101 = f97, f44, f101 FNMA f77 = f73, f44, f77 FNMA f109 = f105, f44, f109 FNMA f85 = f81, f44, f85 FNMA f117 = f113, f44, f117 FNMA f93 = f89, f44, f93 FNMA f125 = f121, f44, f125 ;; FNMA f70 = f65, f45, f70 FNMA f102 = f97, f45, f102 FNMA f78 = f73, f45, f78 FNMA f110 = f105, f45, f110 FNMA f86 = f81, f45, f86 FNMA f118 = f113, f45, f118 FNMA f94 = f89, f45, f94 FNMA f126 = f121, f45, f126 ;; FNMA f71 = f65, f46, f71 FNMA f103 = f97, f46, f103 FNMA f79 = f73, f46, f79 FNMA f111 = f105, f46, f111 FNMA f87 = f81, f46, f87 FNMA f119 = f113, f46, f119 FNMA f95 = f89, f46, f95 FNMA f127 = f121, f46, f127 ;; FMPY f66 = f66, f47 FMPY f98 = f98, f47 FMPY f74 = f74, f47 FMPY f106 = f106, f47 FMPY f82 = f82, f47 FMPY f114 = f114, f47 FMPY f90 = f90, f47 FMPY f122 = f122, f47 ;; FNMA f67 = f66, f48, f67 FNMA f99 = f98, f48, f99 FNMA f75 = f74, f48, f75 FNMA f107 = f106, f48, f107 FNMA f83 = f82, f48, f83 FNMA f115 = f114, f48, f115 FNMA f91 = f90, f48, f91 FNMA f123 = f122, f48, f123 FNMA f68 = f66, f49, f68 FNMA f100 = f98, f49, f100 FNMA f76 = f74, f49, f76 FNMA f108 = f106, f49, f108 FNMA f84 = f82, f49, f84 FNMA f116 = f114, f49, f116 FNMA f92 = f90, f49, f92 FNMA f124 = f122, f49, f124 ;; FNMA f69 = f66, f50, f69 FNMA f101 = f98, f50, f101 FNMA f77 = f74, f50, f77 FNMA f109 = f106, f50, f109 FNMA f85 = f82, f50, f85 FNMA f117 = f114, f50, f117 FNMA f93 = f90, f50, f93 FNMA f125 = f122, f50, f125 ;; FNMA f70 = f66, f51, f70 FNMA f102 = f98, f51, f102 FNMA f78 = f74, f51, f78 FNMA f110 = f106, f51, f110 FNMA f86 = f82, f51, f86 FNMA f118 = f114, f51, f118 FNMA f94 = f90, f51, f94 FNMA f126 = f122, f51, f126 ;; FNMA f71 = f66, f52, f71 FNMA f103 = f98, f52, f103 FNMA f79 = f74, f52, f79 FNMA f111 = f106, f52, f111 FNMA f87 = f82, f52, f87 FNMA f119 = f114, f52, f119 FNMA f95 = f90, f52, f95 FNMA f127 = f122, f52, f127 ;; FMPY f67 = f67, f53 FMPY f99 = f99, f53 FMPY f75 = f75, f53 FMPY f107 = f107, f53 FMPY f83 = f83, f53 FMPY f115 = f115, f53 FMPY f91 = f91, f53 FMPY f123 = f123, f53 ;; FNMA f68 = f67, f54, f68 FNMA f100 = f99, f54, f100 FNMA f76 = f75, f54, f76 FNMA f108 = f107, f54, f108 FNMA f84 = f83, f54, f84 FNMA f116 = f115, f54, f116 FNMA f92 = f91, f54, f92 FNMA f124 = f123, f54, f124 ;; FNMA f69 = f67, f55, f69 FNMA f101 = f99, f55, f101 FNMA f77 = f75, f55, f77 FNMA f109 = f107, f55, f109 FNMA f85 = f83, f55, f85 FNMA f117 = f115, f55, f117 FNMA f93 = f91, f55, f93 FNMA f125 = f123, f55, f125 ;; FNMA f70 = f67, f56, f70 FNMA f102 = f99, f56, f102 FNMA f78 = f75, f56, f78 FNMA f110 = f107, f56, f110 FNMA f86 = f83, f56, f86 FNMA f118 = f115, f56, f118 FNMA f94 = f91, f56, f94 FNMA f126 = f123, f56, f126 ;; FNMA f71 = f67, 
f57, f71 FNMA f103 = f99, f57, f103 FNMA f79 = f75, f57, f79 FNMA f111 = f107, f57, f111 FNMA f87 = f83, f57, f87 FNMA f119 = f115, f57, f119 FNMA f95 = f91, f57, f95 FNMA f127 = f123, f57, f127 ;; FMPY f68 = f68, f58 FMPY f100 = f100, f58 FMPY f76 = f76, f58 FMPY f108 = f108, f58 FMPY f84 = f84, f58 FMPY f116 = f116, f58 FMPY f92 = f92, f58 FMPY f124 = f124, f58 ;; FNMA f69 = f68, f59, f69 FNMA f101 = f100, f59, f101 FNMA f77 = f76, f59, f77 FNMA f109 = f108, f59, f109 FNMA f85 = f84, f59, f85 FNMA f117 = f116, f59, f117 FNMA f93 = f92, f59, f93 FNMA f125 = f124, f59, f125 ;; FNMA f70 = f68, f60, f70 FNMA f102 = f100, f60, f102 FNMA f78 = f76, f60, f78 FNMA f110 = f108, f60, f110 FNMA f86 = f84, f60, f86 FNMA f118 = f116, f60, f118 FNMA f94 = f92, f60, f94 FNMA f126 = f124, f60, f126 ;; { .mfi STFD [BOFFSET] = f64, SIZE FNMA f71 = f68, f61, f71 } { .mfi STFD [BOFFSET2] = f96, SIZE FNMA f103 = f100, f61, f103 } ;; { .mfi STFD [BOFFSET] = f72, SIZE FNMA f79 = f76, f61, f79 } { .mfi STFD [BOFFSET2] = f104, SIZE FNMA f111 = f108, f61, f111 } ;; { .mfi STFD [BOFFSET] = f80, SIZE FNMA f87 = f84, f61, f87 } { .mfi STFD [BOFFSET2] = f112, SIZE FNMA f119 = f116, f61, f119 } ;; { .mfi STFD [BOFFSET] = f88, 5 * SIZE FNMA f95 = f92, f61, f95 } { .mfi STFD [BOFFSET2] = f120, 5 * SIZE FNMA f127 = f124, f61, f127 } ;; { .mfi STFD [BOFFSET] = f65, SIZE FMPY f69 = f69, f16 } { .mfi STFD [BOFFSET2] = f97, SIZE FMPY f101 = f101, f16 } ;; { .mfi STFD [BOFFSET] = f73, SIZE FMPY f77 = f77, f16 } { .mfi STFD [BOFFSET2] = f105, SIZE FMPY f109 = f109, f16 } ;; { .mfi STFD [BOFFSET] = f81, SIZE FMPY f85 = f85, f16 } { .mfi STFD [BOFFSET2] = f113, SIZE FMPY f117 = f117, f16 } ;; { .mfi STFD [BOFFSET] = f89, 5 * SIZE FMPY f93 = f93, f16 } { .mfi STFD [BOFFSET2] = f121, 5 * SIZE FMPY f125 = f125, f16 } ;; { .mfi STFD [BOFFSET] = f66, SIZE FNMA f70 = f69, f17, f70 } { .mfi STFD [BOFFSET2] = f98, SIZE FNMA f102 = f101, f17, f102 } ;; { .mfi STFD [BOFFSET] = f74, SIZE FNMA f78 = f77, f17, f78 } { .mfi STFD [BOFFSET2] = f106, SIZE FNMA f110 = f109, f17, f110 } ;; { .mfi STFD [BOFFSET] = f82, SIZE FNMA f86 = f85, f17, f86 } { .mfi STFD [BOFFSET2] = f114, SIZE FNMA f118 = f117, f17, f118 } ;; { .mfi STFD [BOFFSET] = f90, 5 * SIZE FNMA f94 = f93, f17, f94 } { .mfi STFD [BOFFSET2] = f122, 5 * SIZE FNMA f126 = f125, f17, f126 } ;; { .mfi STFD [BOFFSET] = f67, SIZE FNMA f71 = f69, f18, f71 } { .mfi STFD [BOFFSET2] = f99, SIZE FNMA f103 = f101, f18, f103 } ;; { .mfi STFD [BOFFSET] = f75, SIZE FNMA f79 = f77, f18, f79 } { .mfi STFD [BOFFSET2] = f107, SIZE FNMA f111 = f109, f18, f111 } ;; { .mfi STFD [BOFFSET] = f83, SIZE FNMA f87 = f85, f18, f87 } { .mfi STFD [BOFFSET2] = f115, SIZE FNMA f119 = f117, f18, f119 } ;; { .mfi STFD [BOFFSET] = f91, 5 * SIZE FNMA f95 = f93, f18, f95 } { .mfi STFD [BOFFSET2] = f123, 5 * SIZE FNMA f127 = f125, f18, f127 } ;; { .mfi STFD [BOFFSET] = f68, SIZE FMPY f70 = f70, f19 } { .mfi STFD [BOFFSET2] = f100, SIZE FMPY f102 = f102, f19 } ;; { .mfi STFD [BOFFSET] = f76, SIZE FMPY f78 = f78, f19 } { .mfi STFD [BOFFSET2] = f108, SIZE FMPY f110 = f110, f19 } ;; { .mfi STFD [BOFFSET] = f84, SIZE FMPY f86 = f86, f19 } { .mfi STFD [BOFFSET2] = f116, SIZE FMPY f118 = f118, f19 } ;; { .mfi STFD [BOFFSET] = f92, 5 * SIZE FMPY f94 = f94, f19 } { .mfi STFD [BOFFSET2] = f124, 5 * SIZE FMPY f126 = f126, f19 } ;; { .mfi STFD [BOFFSET] = f69, SIZE FNMA f71 = f70, f20, f71 } { .mfi STFD [BOFFSET2] = f101, SIZE FNMA f103 = f102, f20, f103 } ;; { .mfi STFD [BOFFSET] = f77, SIZE FNMA f79 = f78, f20, f79 } { .mfi STFD 
[BOFFSET2] = f109, SIZE FNMA f111 = f110, f20, f111 } ;; { .mfi STFD [BOFFSET] = f85, SIZE FNMA f87 = f86, f20, f87 } { .mfi STFD [BOFFSET2] = f117, SIZE FNMA f119 = f118, f20, f119 } ;; { .mfi STFD [BOFFSET] = f93, 5 * SIZE FNMA f95 = f94, f20, f95 } { .mfi STFD [BOFFSET2] = f125, 5 * SIZE FNMA f127 = f126, f20, f127 } ;; { .mfi STFD [BOFFSET] = f70, SIZE FMPY f71 = f71, f21 } { .mfi STFD [BOFFSET2] = f102, SIZE FMPY f103 = f103, f21 } ;; { .mfi STFD [BOFFSET] = f78, SIZE FMPY f79 = f79, f21 } { .mfi STFD [BOFFSET2] = f110, SIZE FMPY f111 = f111, f21 } ;; { .mfi STFD [BOFFSET] = f86, SIZE FMPY f87 = f87, f21 } { .mfi STFD [BOFFSET2] = f118, SIZE FMPY f119 = f119, f21 } ;; { .mfi STFD [BOFFSET] = f94, 5 * SIZE FMPY f95 = f95, f21 } { .mfi STFD [BOFFSET2] = f126, 5 * SIZE FMPY f127 = f127, f21 } ;; { .mmi STFD [BOFFSET] = f71, SIZE STFD [BOFFSET2] = f103, SIZE } ;; { .mmi STFD [BOFFSET] = f79, SIZE STFD [BOFFSET2] = f111, SIZE } ;; { .mmi STFD [BOFFSET] = f87, SIZE STFD [BOFFSET2] = f119, SIZE adds C9 = 4 * SIZE, C1 } ;; { .mfi STFD [BOFFSET] = f95 adds BOFFSET = - 59 * SIZE, BOFFSET } { .mfi STFD [BOFFSET2] = f127 adds BOFFSET2 = - 59 * SIZE, BOFFSET2 } ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; { .mfi LDFPD f34, f35 = [BOFFSET], 2 * SIZE FMPY f64 = f64, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f68 = f68, f32 nop __LINE__ } ;; { .mfi LDFPD f36, f37 = [BOFFSET], 2 * SIZE FMPY f65 = f65, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f69 = f69, f32 nop __LINE__ } ;; { .mfi LDFPD f38, f39 = [BOFFSET] FMPY f66 = f66, f32 adds BOFFSET = 3 * SIZE, BOFFSET } { .mfi nop __LINE__ FMPY f70 = f70, f32 nop __LINE__ } ;; { .mfi LDFD f40 = [BOFFSET], 1 * SIZE FMPY f67 = f67, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f71 = f71, f32 nop __LINE__ } ;; { .mfi LDFPD f41, f42 = [BOFFSET], 2 * SIZE FNMA f72 = f64, f33, f72 nop __LINE__ } { .mfi nop __LINE__ FNMA f76 = f68, f33, f76 nop __LINE__ } ;; { .mfi LDFPD f43, f44 = [BOFFSET], 2 * SIZE FNMA f73 = f65, f33, f73 nop __LINE__ } { .mfi nop __LINE__ FNMA f77 = f69, f33, f77 nop __LINE__ } ;; { .mfi LDFPD f45, f46 = [BOFFSET] FNMA f74 = f66, f33, f74 adds BOFFSET = 4 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f78 = f70, f33, f78 nop __LINE__ } ;; { .mfi LDFPD f47, f48 = [BOFFSET], 2 * SIZE FNMA f75 = f67, f33, f75 nop __LINE__ } { .mfi nop __LINE__ FNMA f79 = f71, f33, f79 nop __LINE__ } ;; { .mfi LDFPD f49, f50 = [BOFFSET], 2 * SIZE FNMA f80 = f64, f34, f80 nop __LINE__ } { .mfi nop __LINE__ FNMA f84 = f68, f34, f84 nop __LINE__ } ;; { .mfi LDFPD f51, f52 = [BOFFSET] FNMA f81 = f65, f34, f81 adds BOFFSET = 5 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f85 = f69, f34, f85 nop __LINE__ } ;; { .mfi LDFD f53 = [BOFFSET], 1 * SIZE FNMA f82 = f66, f34, f82 nop __LINE__ } { .mfi nop __LINE__ FNMA f86 = f70, f34, f86 nop __LINE__ } ;; { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE FNMA f83 = f67, f34, f83 nop __LINE__ } { .mfi nop __LINE__ FNMA f87 = f71, f34, f87 nop __LINE__ } ;; { .mfi LDFPD f56, f57 = [BOFFSET] FNMA f88 = f64, f35, f88 adds BOFFSET = 6 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f92 = f68, f35, f92 nop __LINE__ } ;; { .mfi LDFPD f58, f59 = [BOFFSET], 2 * SIZE FNMA f89 = f65, f35, f89 nop __LINE__ } { .mfi nop __LINE__ FNMA f93 = f69, f35, f93 nop __LINE__ } ;; { .mfi LDFPD f60, f61 = [BOFFSET] FNMA f90 = f66, f35, f90 adds BOFFSET = 7 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f94 = f70, f35, f94 nop __LINE__ } ;; { .mfi LDFD f16 = [BOFFSET], 1 * SIZE FNMA f91 = f67, f35, f91 nop __LINE__ } { .mfi nop __LINE__ FNMA f95 = f71, f35, 
f95 nop __LINE__ } ;; { .mfi LDFPD f17, f18 = [BOFFSET] FNMA f96 = f64, f36, f96 adds BOFFSET = 8 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f100 = f68, f36, f100 nop __LINE__ } ;; { .mfi LDFPD f19, f20 = [BOFFSET] FNMA f97 = f65, f36, f97 adds BOFFSET = 9 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f101 = f69, f36, f101 nop __LINE__ } ;; { .mfi LDFD f21 = [BOFFSET] FNMA f98 = f66, f36, f98 adds BOFFSET = -63 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f102 = f70, f36, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f99 = f67, f36, f99 nop __LINE__ } { .mfi nop __LINE__ FNMA f103 = f71, f36, f103 nop __LINE__ } ;; FNMA f104 = f64, f37, f104 FNMA f108 = f68, f37, f108 FNMA f105 = f65, f37, f105 FNMA f109 = f69, f37, f109 FNMA f106 = f66, f37, f106 FNMA f110 = f70, f37, f110 FNMA f107 = f67, f37, f107 FNMA f111 = f71, f37, f111 ;; FNMA f112 = f64, f38, f112 FNMA f116 = f68, f38, f116 FNMA f113 = f65, f38, f113 FNMA f117 = f69, f38, f117 FNMA f114 = f66, f38, f114 FNMA f118 = f70, f38, f118 FNMA f115 = f67, f38, f115 FNMA f119 = f71, f38, f119 ;; FNMA f120 = f64, f39, f120 FNMA f124 = f68, f39, f124 FNMA f121 = f65, f39, f121 FNMA f125 = f69, f39, f125 FNMA f122 = f66, f39, f122 FNMA f126 = f70, f39, f126 FNMA f123 = f67, f39, f123 FNMA f127 = f71, f39, f127 ;; FMPY f72 = f72, f40 FMPY f76 = f76, f40 FMPY f73 = f73, f40 FMPY f77 = f77, f40 FMPY f74 = f74, f40 FMPY f78 = f78, f40 FMPY f75 = f75, f40 FMPY f79 = f79, f40 ;; FNMA f80 = f72, f41, f80 FNMA f84 = f76, f41, f84 FNMA f81 = f73, f41, f81 FNMA f85 = f77, f41, f85 FNMA f82 = f74, f41, f82 FNMA f86 = f78, f41, f86 FNMA f83 = f75, f41, f83 FNMA f87 = f79, f41, f87 ;; FNMA f88 = f72, f42, f88 FNMA f92 = f76, f42, f92 FNMA f89 = f73, f42, f89 FNMA f93 = f77, f42, f93 FNMA f90 = f74, f42, f90 FNMA f94 = f78, f42, f94 FNMA f91 = f75, f42, f91 FNMA f95 = f79, f42, f95 ;; FNMA f96 = f72, f43, f96 FNMA f100 = f76, f43, f100 FNMA f97 = f73, f43, f97 FNMA f101 = f77, f43, f101 FNMA f98 = f74, f43, f98 FNMA f102 = f78, f43, f102 FNMA f99 = f75, f43, f99 FNMA f103 = f79, f43, f103 ;; FNMA f104 = f72, f44, f104 FNMA f108 = f76, f44, f108 FNMA f105 = f73, f44, f105 FNMA f109 = f77, f44, f109 FNMA f106 = f74, f44, f106 FNMA f110 = f78, f44, f110 FNMA f107 = f75, f44, f107 FNMA f111 = f79, f44, f111 ;; FNMA f112 = f72, f45, f112 FNMA f116 = f76, f45, f116 FNMA f113 = f73, f45, f113 FNMA f117 = f77, f45, f117 FNMA f114 = f74, f45, f114 FNMA f118 = f78, f45, f118 FNMA f115 = f75, f45, f115 FNMA f119 = f79, f45, f119 ;; FNMA f120 = f72, f46, f120 FNMA f124 = f76, f46, f124 FNMA f121 = f73, f46, f121 FNMA f125 = f77, f46, f125 FNMA f122 = f74, f46, f122 FNMA f126 = f78, f46, f126 FNMA f123 = f75, f46, f123 FNMA f127 = f79, f46, f127 ;; FMPY f80 = f80, f47 FMPY f84 = f84, f47 FMPY f81 = f81, f47 FMPY f85 = f85, f47 FMPY f82 = f82, f47 FMPY f86 = f86, f47 FMPY f83 = f83, f47 FMPY f87 = f87, f47 ;; FNMA f88 = f80, f48, f88 FNMA f92 = f84, f48, f92 FNMA f89 = f81, f48, f89 FNMA f93 = f85, f48, f93 FNMA f90 = f82, f48, f90 FNMA f94 = f86, f48, f94 FNMA f91 = f83, f48, f91 FNMA f95 = f87, f48, f95 ;; FNMA f96 = f80, f49, f96 FNMA f100 = f84, f49, f100 FNMA f97 = f81, f49, f97 FNMA f101 = f85, f49, f101 FNMA f98 = f82, f49, f98 FNMA f102 = f86, f49, f102 FNMA f99 = f83, f49, f99 FNMA f103 = f87, f49, f103 ;; FNMA f104 = f80, f50, f104 FNMA f108 = f84, f50, f108 FNMA f105 = f81, f50, f105 FNMA f109 = f85, f50, f109 FNMA f106 = f82, f50, f106 FNMA f110 = f86, f50, f110 FNMA f107 = f83, f50, f107 FNMA f111 = f87, f50, f111 ;; FNMA f112 = f80, f51, f112 FNMA f116 = f84, 
f51, f116 FNMA f113 = f81, f51, f113 FNMA f117 = f85, f51, f117 FNMA f114 = f82, f51, f114 FNMA f118 = f86, f51, f118 FNMA f115 = f83, f51, f115 FNMA f119 = f87, f51, f119 ;; FNMA f120 = f80, f52, f120 FNMA f124 = f84, f52, f124 FNMA f121 = f81, f52, f121 FNMA f125 = f85, f52, f125 FNMA f122 = f82, f52, f122 FNMA f126 = f86, f52, f126 FNMA f123 = f83, f52, f123 FNMA f127 = f87, f52, f127 ;; FMPY f88 = f88, f53 FMPY f92 = f92, f53 FMPY f89 = f89, f53 FMPY f93 = f93, f53 FMPY f90 = f90, f53 FMPY f94 = f94, f53 FMPY f91 = f91, f53 FMPY f95 = f95, f53 ;; FNMA f96 = f88, f54, f96 FNMA f100 = f92, f54, f100 FNMA f97 = f89, f54, f97 FNMA f101 = f93, f54, f101 FNMA f98 = f90, f54, f98 FNMA f102 = f94, f54, f102 FNMA f99 = f91, f54, f99 FNMA f103 = f95, f54, f103 ;; FNMA f104 = f88, f55, f104 FNMA f108 = f92, f55, f108 FNMA f105 = f89, f55, f105 FNMA f109 = f93, f55, f109 FNMA f106 = f90, f55, f106 FNMA f110 = f94, f55, f110 FNMA f107 = f91, f55, f107 FNMA f111 = f95, f55, f111 ;; FNMA f112 = f88, f56, f112 FNMA f116 = f92, f56, f116 FNMA f113 = f89, f56, f113 FNMA f117 = f93, f56, f117 FNMA f114 = f90, f56, f114 FNMA f118 = f94, f56, f118 FNMA f115 = f91, f56, f115 FNMA f119 = f95, f56, f119 ;; FNMA f120 = f88, f57, f120 FNMA f124 = f92, f57, f124 FNMA f121 = f89, f57, f121 FNMA f125 = f93, f57, f125 FNMA f122 = f90, f57, f122 FNMA f126 = f94, f57, f126 FNMA f123 = f91, f57, f123 FNMA f127 = f95, f57, f127 ;; FMPY f96 = f96, f58 FMPY f100 = f100, f58 FMPY f97 = f97, f58 FMPY f101 = f101, f58 FMPY f98 = f98, f58 FMPY f102 = f102, f58 FMPY f99 = f99, f58 FMPY f103 = f103, f58 ;; FNMA f104 = f96, f59, f104 FNMA f108 = f100, f59, f108 FNMA f105 = f97, f59, f105 FNMA f109 = f101, f59, f109 FNMA f106 = f98, f59, f106 FNMA f110 = f102, f59, f110 FNMA f107 = f99, f59, f107 FNMA f111 = f103, f59, f111 ;; FNMA f112 = f96, f60, f112 FNMA f116 = f100, f60, f116 FNMA f113 = f97, f60, f113 FNMA f117 = f101, f60, f117 FNMA f114 = f98, f60, f114 FNMA f118 = f102, f60, f118 FNMA f115 = f99, f60, f115 FNMA f119 = f103, f60, f119 ;; { .mfi STFD [AOFFSET] = f64, SIZE FNMA f120 = f96, f61, f120 } { .mfi STFD [AOFFSET2] = f68, SIZE FNMA f124 = f100, f61, f124 } ;; { .mfi STFD [AOFFSET] = f65, SIZE FNMA f121 = f97, f61, f121 } { .mfi STFD [AOFFSET2] = f69, SIZE FNMA f125 = f101, f61, f125 } ;; { .mfi STFD [AOFFSET] = f66, SIZE FNMA f122 = f98, f61, f122 } { .mfi STFD [AOFFSET2] = f70, SIZE FNMA f126 = f102, f61, f126 } ;; { .mfi STFD [AOFFSET] = f67, 5 * SIZE FNMA f123 = f99, f61, f123 } { .mfi STFD [AOFFSET2] = f71, 5 * SIZE FNMA f127 = f103, f61, f127 } ;; { .mfi STFD [AOFFSET] = f72, SIZE FMPY f104 = f104, f16 } { .mfi STFD [AOFFSET2] = f76, SIZE FMPY f108 = f108, f16 } ;; { .mfi STFD [AOFFSET] = f73, SIZE FMPY f105 = f105, f16 } { .mfi STFD [AOFFSET2] = f77, SIZE FMPY f109 = f109, f16 } ;; { .mfi STFD [AOFFSET] = f74, SIZE FMPY f106 = f106, f16 } { .mfi STFD [AOFFSET2] = f78, SIZE FMPY f110 = f110, f16 } ;; { .mfi STFD [AOFFSET] = f75, 5 * SIZE FMPY f107 = f107, f16 } { .mfi STFD [AOFFSET2] = f79, 5 * SIZE FMPY f111 = f111, f16 } ;; { .mfi STFD [AOFFSET] = f80, SIZE FNMA f112 = f104, f17, f112 } { .mfi STFD [AOFFSET2] = f84, SIZE FNMA f116 = f108, f17, f116 } ;; { .mfi STFD [AOFFSET] = f81, SIZE FNMA f113 = f105, f17, f113 } { .mfi STFD [AOFFSET2] = f85, SIZE FNMA f117 = f109, f17, f117 } ;; { .mfi STFD [AOFFSET] = f82, SIZE FNMA f114 = f106, f17, f114 } { .mfi STFD [AOFFSET2] = f86, SIZE FNMA f118 = f110, f17, f118 } ;; { .mfi STFD [AOFFSET] = f83, 5 * SIZE FNMA f115 = f107, f17, f115 } { .mfi STFD [AOFFSET2] = 
f87, 5 * SIZE FNMA f119 = f111, f17, f119 } ;; { .mfi STFD [AOFFSET] = f88, SIZE FNMA f120 = f104, f18, f120 } { .mfi STFD [AOFFSET2] = f92, SIZE FNMA f124 = f108, f18, f124 } ;; { .mfi STFD [AOFFSET] = f89, SIZE FNMA f121 = f105, f18, f121 } { .mfi STFD [AOFFSET2] = f93, SIZE FNMA f125 = f109, f18, f125 } ;; { .mfi STFD [AOFFSET] = f90, SIZE FNMA f122 = f106, f18, f122 } { .mfi STFD [AOFFSET2] = f94, SIZE FNMA f126 = f110, f18, f126 } ;; { .mfi STFD [AOFFSET] = f91, 5 * SIZE FNMA f123 = f107, f18, f123 } { .mfi STFD [AOFFSET2] = f95, 5 * SIZE FNMA f127 = f111, f18, f127 } ;; { .mfi STFD [AOFFSET] = f96, SIZE FMPY f112 = f112, f19 } { .mfi STFD [AOFFSET2] = f100, SIZE FMPY f116 = f116, f19 } ;; { .mfi STFD [AOFFSET] = f97, SIZE FMPY f113 = f113, f19 } { .mfi STFD [AOFFSET2] = f101, SIZE FMPY f117 = f117, f19 } ;; { .mfi STFD [AOFFSET] = f98, SIZE FMPY f114 = f114, f19 } { .mfi STFD [AOFFSET2] = f102, SIZE FMPY f118 = f118, f19 } ;; { .mfi STFD [AOFFSET] = f99, 5 * SIZE FMPY f115 = f115, f19 } { .mfi STFD [AOFFSET2] = f103, 5 * SIZE FMPY f119 = f119, f19 } ;; { .mfi STFD [AOFFSET] = f104, SIZE FNMA f120 = f112, f20, f120 } { .mfi STFD [AOFFSET2] = f108, SIZE FNMA f124 = f116, f20, f124 } ;; { .mfi STFD [AOFFSET] = f105, SIZE FNMA f121 = f113, f20, f121 } { .mfi STFD [AOFFSET2] = f109, SIZE FNMA f125 = f117, f20, f125 } ;; { .mfi STFD [AOFFSET] = f106, SIZE FNMA f122 = f114, f20, f122 } { .mfi STFD [AOFFSET2] = f110, SIZE FNMA f126 = f118, f20, f126 } ;; { .mfi STFD [AOFFSET] = f107, 5 * SIZE FNMA f123 = f115, f20, f123 } { .mfi STFD [AOFFSET2] = f111, 5 * SIZE FNMA f127 = f119, f20, f127 } ;; { .mfi STFD [AOFFSET] = f112, SIZE FMPY f120 = f120, f21 } { .mfi STFD [AOFFSET2] = f116, SIZE FMPY f124 = f124, f21 } ;; { .mfi STFD [AOFFSET] = f113, SIZE FMPY f121 = f121, f21 } { .mfi STFD [AOFFSET2] = f117, SIZE FMPY f125 = f125, f21 } ;; { .mfi STFD [AOFFSET] = f114, SIZE FMPY f122 = f122, f21 } { .mfi STFD [AOFFSET2] = f118, SIZE FMPY f126 = f126, f21 } ;; { .mfi STFD [AOFFSET] = f115, 5 * SIZE FMPY f123 = f123, f21 } { .mfi STFD [AOFFSET2] = f119, 5 * SIZE FMPY f127 = f127, f21 } ;; { .mmi STFD [AOFFSET] = f120, SIZE STFD [AOFFSET2] = f124, SIZE } ;; { .mmi STFD [AOFFSET] = f121, SIZE STFD [AOFFSET2] = f125, SIZE } ;; { .mmi STFD [AOFFSET] = f122, SIZE STFD [AOFFSET2] = f126, SIZE adds C9 = 4 * SIZE, C1 } ;; { .mfi STFD [AOFFSET] = f123 adds AOFFSET = - 59 * SIZE, AOFFSET } { .mfi STFD [AOFFSET2] = f127 adds AOFFSET2 = - 59 * SIZE, AOFFSET2 } ;; #endif #ifdef RT adds BOFFSET = 62 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; { .mfi LDFPD f35, f34 = [BOFFSET] FMPY f120 = f120, f32 adds BOFFSET = - 2 * SIZE, BOFFSET } { .mfi nop __LINE__ FMPY f124 = f124, f32 nop __LINE__ } ;; { .mfi LDFPD f37, f36 = [BOFFSET] FMPY f121 = f121, f32 adds BOFFSET = - 2 * SIZE, BOFFSET } { .mfi nop __LINE__ FMPY f125 = f125, f32 nop __LINE__ } ;; { .mfi LDFPD f39, f38 = [BOFFSET] FMPY f122 = f122, f32 adds BOFFSET = - 2 * SIZE, BOFFSET } { .mfi nop __LINE__ FMPY f126 = f126, f32 nop __LINE__ } ;; { .mfi LDFD f40 = [BOFFSET], -2 * SIZE FMPY f123 = f123, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f127 = f127, f32 nop __LINE__ } ;; { .mfi LDFPD f42, f41 = [BOFFSET] FNMA f112 = f120, f33, f112 adds BOFFSET = - 2 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f116 = f124, f33, f116 nop __LINE__ } ;; { .mfi LDFPD f44, f43 = [BOFFSET] FNMA f113 = f121, f33, f113 adds BOFFSET = - 2 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f117 = f125, f33, f117 nop __LINE__ } ;; { .mfi LDFPD f46, f45 = 
[BOFFSET] FNMA f114 = f122, f33, f114 adds BOFFSET = - 4 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f118 = f126, f33, f118 nop __LINE__ } ;; { .mfi LDFPD f48, f47 = [BOFFSET] FNMA f115 = f123, f33, f115 adds BOFFSET = - 2 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f119 = f127, f33, f119 nop __LINE__ } ;; { .mfi LDFPD f50, f49 = [BOFFSET] FNMA f104 = f120, f34, f104 adds BOFFSET = - 2 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f108 = f124, f34, f108 nop __LINE__ } ;; { .mfi LDFPD f52, f51 = [BOFFSET] FNMA f105 = f121, f34, f105 adds BOFFSET = - 4 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f109 = f125, f34, f109 nop __LINE__ } ;; { .mfi LDFD f53 = [BOFFSET], -2 * SIZE FNMA f106 = f122, f34, f106 } { .mfi nop __LINE__ FNMA f110 = f126, f34, f110 nop __LINE__ } ;; { .mfi LDFPD f55, f54 = [BOFFSET] FNMA f107 = f123, f34, f107 adds BOFFSET = - 2 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f111 = f127, f34, f111 nop __LINE__ } ;; { .mfi LDFPD f57, f56 = [BOFFSET] FNMA f96 = f120, f35, f96 adds BOFFSET = - 6 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f100 = f124, f35, f100 nop __LINE__ } ;; { .mfi LDFPD f59, f58 = [BOFFSET] FNMA f97 = f121, f35, f97 adds BOFFSET = - 2 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f101 = f125, f35, f101 nop __LINE__ } ;; { .mfi LDFPD f61, f60 = [BOFFSET] FNMA f98 = f122, f35, f98 adds BOFFSET = - 6 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f102 = f126, f35, f102 nop __LINE__ } ;; { .mfi LDFD f16 = [BOFFSET], -2 * SIZE FNMA f99 = f123, f35, f99 nop __LINE__ } { .mfi nop __LINE__ FNMA f103 = f127, f35, f103 nop __LINE__ } ;; { .mfi LDFPD f18, f17 = [BOFFSET] FNMA f88 = f120, f36, f88 adds BOFFSET = - 8 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f92 = f124, f36, f92 nop __LINE__ } ;; { .mfi LDFPD f20, f19 = [BOFFSET] FNMA f89 = f121, f36, f89 adds BOFFSET = - 8 * SIZE, BOFFSET } { .mfi nop __LINE__ FNMA f93 = f125, f36, f93 nop __LINE__ } ;; { .mfi LDFD f21 = [BOFFSET] FNMA f90 = f122, f36, f90 nop __LINE__ } { .mfi nop __LINE__ FNMA f94 = f126, f36, f94 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f91 = f123, f36, f91 adds AOFFSET = 56 * SIZE, AOFFSET } { .mfi nop __LINE__ FNMA f95 = f127, f36, f95 adds AOFFSET2 = 56 * SIZE, AOFFSET2 } ;; FNMA f80 = f120, f37, f80 FNMA f84 = f124, f37, f84 FNMA f81 = f121, f37, f81 FNMA f85 = f125, f37, f85 FNMA f82 = f122, f37, f82 FNMA f86 = f126, f37, f86 FNMA f83 = f123, f37, f83 FNMA f87 = f127, f37, f87 ;; FNMA f72 = f120, f38, f72 FNMA f76 = f124, f38, f76 FNMA f73 = f121, f38, f73 FNMA f77 = f125, f38, f77 FNMA f74 = f122, f38, f74 FNMA f78 = f126, f38, f78 FNMA f75 = f123, f38, f75 FNMA f79 = f127, f38, f79 ;; FNMA f64 = f120, f39, f64 FNMA f68 = f124, f39, f68 FNMA f65 = f121, f39, f65 FNMA f69 = f125, f39, f69 FNMA f66 = f122, f39, f66 FNMA f70 = f126, f39, f70 FNMA f67 = f123, f39, f67 FNMA f71 = f127, f39, f71 ;; FMPY f112 = f112, f40 FMPY f116 = f116, f40 FMPY f113 = f113, f40 FMPY f117 = f117, f40 FMPY f114 = f114, f40 FMPY f118 = f118, f40 FMPY f115 = f115, f40 FMPY f119 = f119, f40 ;; FNMA f104 = f112, f41, f104 FNMA f108 = f116, f41, f108 FNMA f105 = f113, f41, f105 FNMA f109 = f117, f41, f109 FNMA f106 = f114, f41, f106 FNMA f110 = f118, f41, f110 FNMA f107 = f115, f41, f107 FNMA f111 = f119, f41, f111 ;; FNMA f96 = f112, f42, f96 FNMA f100 = f116, f42, f100 FNMA f97 = f113, f42, f97 FNMA f101 = f117, f42, f101 FNMA f98 = f114, f42, f98 FNMA f102 = f118, f42, f102 FNMA f99 = f115, f42, f99 FNMA f103 = f119, f42, f103 ;; FNMA f88 = f112, f43, f88 FNMA f92 = f116, f43, f92 FNMA f89 = f113, f43, f89 FNMA f93 = f117, f43, 
f93 FNMA f90 = f114, f43, f90 FNMA f94 = f118, f43, f94 FNMA f91 = f115, f43, f91 FNMA f95 = f119, f43, f95 ;; FNMA f80 = f112, f44, f80 FNMA f84 = f116, f44, f84 FNMA f81 = f113, f44, f81 FNMA f85 = f117, f44, f85 FNMA f82 = f114, f44, f82 FNMA f86 = f118, f44, f86 FNMA f83 = f115, f44, f83 FNMA f87 = f119, f44, f87 ;; FNMA f72 = f112, f45, f72 FNMA f76 = f116, f45, f76 FNMA f73 = f113, f45, f73 FNMA f77 = f117, f45, f77 FNMA f74 = f114, f45, f74 FNMA f78 = f118, f45, f78 FNMA f75 = f115, f45, f75 FNMA f79 = f119, f45, f79 ;; FNMA f64 = f112, f46, f64 FNMA f68 = f116, f46, f68 FNMA f65 = f113, f46, f65 FNMA f69 = f117, f46, f69 FNMA f66 = f114, f46, f66 FNMA f70 = f118, f46, f70 FNMA f67 = f115, f46, f67 FNMA f71 = f119, f46, f71 ;; FMPY f104 = f104, f47 FMPY f108 = f108, f47 FMPY f105 = f105, f47 FMPY f109 = f109, f47 FMPY f106 = f106, f47 FMPY f110 = f110, f47 FMPY f107 = f107, f47 FMPY f111 = f111, f47 ;; FNMA f96 = f104, f48, f96 FNMA f100 = f108, f48, f100 FNMA f97 = f105, f48, f97 FNMA f101 = f109, f48, f101 FNMA f98 = f106, f48, f98 FNMA f102 = f110, f48, f102 FNMA f99 = f107, f48, f99 FNMA f103 = f111, f48, f103 ;; FNMA f88 = f104, f49, f88 FNMA f92 = f108, f49, f92 FNMA f89 = f105, f49, f89 FNMA f93 = f109, f49, f93 FNMA f90 = f106, f49, f90 FNMA f94 = f110, f49, f94 FNMA f91 = f107, f49, f91 FNMA f95 = f111, f49, f95 ;; FNMA f80 = f104, f50, f80 FNMA f84 = f108, f50, f84 FNMA f81 = f105, f50, f81 FNMA f85 = f109, f50, f85 FNMA f82 = f106, f50, f82 FNMA f86 = f110, f50, f86 FNMA f83 = f107, f50, f83 FNMA f87 = f111, f50, f87 ;; FNMA f72 = f104, f51, f72 FNMA f76 = f108, f51, f76 FNMA f73 = f105, f51, f73 FNMA f77 = f109, f51, f77 FNMA f74 = f106, f51, f74 FNMA f78 = f110, f51, f78 FNMA f75 = f107, f51, f75 FNMA f79 = f111, f51, f79 ;; FNMA f64 = f104, f52, f64 FNMA f68 = f108, f52, f68 FNMA f65 = f105, f52, f65 FNMA f69 = f109, f52, f69 FNMA f66 = f106, f52, f66 FNMA f70 = f110, f52, f70 FNMA f67 = f107, f52, f67 FNMA f71 = f111, f52, f71 ;; FMPY f96 = f96, f53 FMPY f100 = f100, f53 FMPY f97 = f97, f53 FMPY f101 = f101, f53 FMPY f98 = f98, f53 FMPY f102 = f102, f53 FMPY f99 = f99, f53 FMPY f103 = f103, f53 ;; FNMA f88 = f96, f54, f88 FNMA f92 = f100, f54, f92 FNMA f89 = f97, f54, f89 FNMA f93 = f101, f54, f93 FNMA f90 = f98, f54, f90 FNMA f94 = f102, f54, f94 FNMA f91 = f99, f54, f91 FNMA f95 = f103, f54, f95 ;; FNMA f80 = f96, f55, f80 FNMA f84 = f100, f55, f84 FNMA f81 = f97, f55, f81 FNMA f85 = f101, f55, f85 FNMA f82 = f98, f55, f82 FNMA f86 = f102, f55, f86 FNMA f83 = f99, f55, f83 FNMA f87 = f103, f55, f87 ;; FNMA f72 = f96, f56, f72 FNMA f76 = f100, f56, f76 FNMA f73 = f97, f56, f73 FNMA f77 = f101, f56, f77 FNMA f74 = f98, f56, f74 FNMA f78 = f102, f56, f78 FNMA f75 = f99, f56, f75 FNMA f79 = f103, f56, f79 ;; FNMA f64 = f96, f57, f64 FNMA f68 = f100, f57, f68 FNMA f65 = f97, f57, f65 FNMA f69 = f101, f57, f69 FNMA f66 = f98, f57, f66 FNMA f70 = f102, f57, f70 FNMA f67 = f99, f57, f67 FNMA f71 = f103, f57, f71 ;; FMPY f88 = f88, f58 FMPY f92 = f92, f58 FMPY f89 = f89, f58 FMPY f93 = f93, f58 FMPY f90 = f90, f58 FMPY f94 = f94, f58 FMPY f91 = f91, f58 FMPY f95 = f95, f58 ;; FNMA f80 = f88, f59, f80 FNMA f84 = f92, f59, f84 FNMA f81 = f89, f59, f81 FNMA f85 = f93, f59, f85 FNMA f82 = f90, f59, f82 FNMA f86 = f94, f59, f86 FNMA f83 = f91, f59, f83 FNMA f87 = f95, f59, f87 ;; FNMA f72 = f88, f60, f72 FNMA f76 = f92, f60, f76 FNMA f73 = f89, f60, f73 FNMA f77 = f93, f60, f77 FNMA f74 = f90, f60, f74 FNMA f78 = f94, f60, f78 FNMA f75 = f91, f60, f75 FNMA f79 = f95, f60, f79 ;; 
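// RT path, 8x8 tile: five of the eight unknowns are already solved above; the
// bundles below finish the remaining three (FMPY by the diagonal factors f16,
// f19 and f21) while storing the solved values back into the packed A panel
// through AOFFSET/AOFFSET2 in reverse order, before the block is written to C.
// (The diagonal factors come from the packed triangular panel; the pack
// routines store them pre-inverted, so FMPY stands in for a divide here.)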
{ .mfi STFD [AOFFSET] = f120, SIZE FNMA f64 = f88, f61, f64 } { .mfi STFD [AOFFSET2] = f124, SIZE FNMA f68 = f92, f61, f68 } ;; { .mfi STFD [AOFFSET] = f121, SIZE FNMA f65 = f89, f61, f65 } { .mfi STFD [AOFFSET2] = f125, SIZE FNMA f69 = f93, f61, f69 } ;; { .mfi STFD [AOFFSET] = f122, SIZE FNMA f66 = f90, f61, f66 } { .mfi STFD [AOFFSET2] = f126, SIZE FNMA f70 = f94, f61, f70 } ;; { .mfi STFD [AOFFSET] = f123, - 11 * SIZE FNMA f67 = f91, f61, f67 } { .mfi STFD [AOFFSET2] = f127, - 11 * SIZE FNMA f71 = f95, f61, f71 } ;; { .mfi STFD [AOFFSET] = f112, SIZE FMPY f80 = f80, f16 } { .mfi STFD [AOFFSET2] = f116, SIZE FMPY f84 = f84, f16 } ;; { .mfi STFD [AOFFSET] = f113, SIZE FMPY f81 = f81, f16 } { .mfi STFD [AOFFSET2] = f117, SIZE FMPY f85 = f85, f16 } ;; { .mfi STFD [AOFFSET] = f114, SIZE FMPY f82 = f82, f16 } { .mfi STFD [AOFFSET2] = f118, SIZE FMPY f86 = f86, f16 } ;; { .mfi STFD [AOFFSET] = f115, - 11 * SIZE FMPY f83 = f83, f16 } { .mfi STFD [AOFFSET2] = f119, - 11 * SIZE FMPY f87 = f87, f16 } ;; { .mfi STFD [AOFFSET] = f104, SIZE FNMA f72 = f80, f17, f72 } { .mfi STFD [AOFFSET2] = f108, SIZE FNMA f76 = f84, f17, f76 } ;; { .mfi STFD [AOFFSET] = f105, SIZE FNMA f73 = f81, f17, f73 } { .mfi STFD [AOFFSET2] = f109, SIZE FNMA f77 = f85, f17, f77 } ;; { .mfi STFD [AOFFSET] = f106, SIZE FNMA f74 = f82, f17, f74 } { .mfi STFD [AOFFSET2] = f110, SIZE FNMA f78 = f86, f17, f78 } ;; { .mfi STFD [AOFFSET] = f107, - 11 * SIZE FNMA f75 = f83, f17, f75 } { .mfi STFD [AOFFSET2] = f111, - 11 * SIZE FNMA f79 = f87, f17, f79 } ;; { .mfi STFD [AOFFSET] = f96, SIZE FNMA f64 = f80, f18, f64 } { .mfi STFD [AOFFSET2] = f100, SIZE FNMA f68 = f84, f18, f68 } ;; { .mfi STFD [AOFFSET] = f97, SIZE FNMA f65 = f81, f18, f65 } { .mfi STFD [AOFFSET2] = f101, SIZE FNMA f69 = f85, f18, f69 } ;; { .mfi STFD [AOFFSET] = f98, SIZE FNMA f66 = f82, f18, f66 } { .mfi STFD [AOFFSET2] = f102, SIZE FNMA f70 = f86, f18, f70 } ;; { .mfi STFD [AOFFSET] = f99, - 11 * SIZE FNMA f67 = f83, f18, f67 } { .mfi STFD [AOFFSET2] = f103, - 11 * SIZE FNMA f71 = f87, f18, f71 } ;; { .mfi STFD [AOFFSET] = f88, SIZE FMPY f72 = f72, f19 } { .mfi STFD [AOFFSET2] = f92, SIZE FMPY f76 = f76, f19 } ;; { .mfi STFD [AOFFSET] = f89, SIZE FMPY f73 = f73, f19 } { .mfi STFD [AOFFSET2] = f93, SIZE FMPY f77 = f77, f19 } ;; { .mfi STFD [AOFFSET] = f90, SIZE FMPY f74 = f74, f19 } { .mfi STFD [AOFFSET2] = f94, SIZE FMPY f78 = f78, f19 } ;; { .mfi STFD [AOFFSET] = f91, - 11 * SIZE FMPY f75 = f75, f19 } { .mfi STFD [AOFFSET2] = f95, - 11 * SIZE FMPY f79 = f79, f19 } ;; { .mfi STFD [AOFFSET] = f80, SIZE FNMA f64 = f72, f20, f64 } { .mfi STFD [AOFFSET2] = f84, SIZE FNMA f68 = f76, f20, f68 } ;; { .mfi STFD [AOFFSET] = f81, SIZE FNMA f65 = f73, f20, f65 } { .mfi STFD [AOFFSET2] = f85, SIZE FNMA f69 = f77, f20, f69 } ;; { .mfi STFD [AOFFSET] = f82, SIZE FNMA f66 = f74, f20, f66 } { .mfi STFD [AOFFSET2] = f86, SIZE FNMA f70 = f78, f20, f70 } ;; { .mfi STFD [AOFFSET] = f83, - 11 * SIZE FNMA f67 = f75, f20, f67 } { .mfi STFD [AOFFSET2] = f87, - 11 * SIZE FNMA f71 = f79, f20, f71 } ;; { .mfi STFD [AOFFSET] = f72, SIZE FMPY f64 = f64, f21 } { .mfi STFD [AOFFSET2] = f76, SIZE FMPY f68 = f68, f21 } ;; { .mfi STFD [AOFFSET] = f73, SIZE FMPY f65 = f65, f21 } { .mfi STFD [AOFFSET2] = f77, SIZE FMPY f69 = f69, f21 } ;; { .mfi STFD [AOFFSET] = f74, SIZE FMPY f66 = f66, f21 } { .mfi STFD [AOFFSET2] = f78, SIZE FMPY f70 = f70, f21 } ;; { .mfi STFD [AOFFSET] = f75, - 11 * SIZE FMPY f67 = f67, f21 } { .mfi STFD [AOFFSET2] = f79, - 11 * SIZE FMPY f71 = f71, f21 } ;; { .mmi STFD 
[AOFFSET] = f64, SIZE STFD [AOFFSET2] = f68, SIZE } ;; { .mmi STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f69, SIZE } ;; { .mmi STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f70, SIZE } ;; { .mmi STFD [AOFFSET] = f67, - 3 * SIZE STFD [AOFFSET2] = f71, - 3 * SIZE adds C9 = 4 * SIZE, C1 } ;; #endif { .mmf STFD [C1 ] = f64, SIZE STFD [C9 ] = f68, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE STFD [C9 ] = f69, SIZE adds C10 = 4 * SIZE, C2 } ;; { .mmi STFD [C1 ] = f66, SIZE STFD [C9 ] = f70, SIZE #ifdef LN adds C3 = -8 * SIZE, C3 #else nop __LINE__ #endif } ;; { .mmi #ifndef LN STFD [C1 ] = f67, 5 * SIZE #else STFD [C1 ] = f67, - 3 * SIZE #endif STFD [C9 ] = f71 adds C11 = 4 * SIZE, C3 } ;; { .mmf STFD [C2 ] = f72, SIZE STFD [C10] = f76, SIZE mov f72 = f0 } ;; { .mmi STFD [C2 ] = f73, SIZE STFD [C10] = f77, SIZE #ifdef LN adds C4 = -8 * SIZE, C4 #else nop __LINE__ #endif } ;; { .mmi STFD [C2 ] = f74, SIZE STFD [C10] = f78, SIZE adds C12 = 4 * SIZE, C4 } ;; { .mmi #ifndef LN STFD [C2 ] = f75, 5 * SIZE #else STFD [C2 ] = f75, - 3 * SIZE #endif STFD [C10] = f79 #ifdef LN adds C5 = -8 * SIZE, C5 #else nop __LINE__ #endif } ;; { .mmf STFD [C3 ] = f80, SIZE STFD [C11] = f84, SIZE mov f80 = f0 } ;; { .mmi STFD [C3 ] = f81, SIZE STFD [C11] = f85, SIZE adds C13 = 4 * SIZE, C5 } ;; { .mmi STFD [C3 ] = f82, SIZE STFD [C11] = f86, SIZE #ifdef LN adds C6 = -8 * SIZE, C6 #else nop __LINE__ #endif } ;; { .mmi #ifndef LN STFD [C3 ] = f83, 5 * SIZE #else STFD [C3 ] = f83, - 3 * SIZE #endif STFD [C11] = f87 adds C14 = 4 * SIZE, C6 } ;; { .mmf STFD [C4 ] = f88, SIZE STFD [C12] = f92, SIZE mov f88 = f0 } ;; { .mmi STFD [C4 ] = f89, SIZE STFD [C12] = f93, SIZE #ifdef LN adds C8 = -8 * SIZE, C8 #else nop __LINE__ #endif } ;; { .mmi STFD [C4 ] = f90, SIZE STFD [C12] = f94, SIZE adds C16 = 4 * SIZE, C8 } ;; { .mmi #ifndef LN STFD [C4 ] = f91, 5 * SIZE #else STFD [C4 ] = f91, - 3 * SIZE #endif STFD [C12] = f95 cmp.ne p6, p0 = 1, I } ;; { .mmf STFD [C5 ] = f96, SIZE STFD [C13] = f100, SIZE mov f96 = f0 } ;; { .mmi STFD [C5 ] = f97, SIZE STFD [C13] = f101, SIZE adds I = -1, I } ;; { .mmi STFD [C5 ] = f98, SIZE STFD [C13] = f102, SIZE #ifdef LN adds C7 = -8 * SIZE, C7 #else nop __LINE__ #endif } ;; { .mmi #ifndef LN STFD [C5 ] = f99, 5 * SIZE #else STFD [C5 ] = f99, - 3 * SIZE #endif STFD [C13] = f103 adds C15 = 4 * SIZE, C7 } ;; { .mmf STFD [C6 ] = f104, SIZE STFD [C14] = f108, SIZE mov f104 = f0 } ;; { .mmi STFD [C6 ] = f105, SIZE STFD [C14] = f109, SIZE shladd r2 = K, BASE_SHIFT, r0 } ;; { .mmi STFD [C6 ] = f106, SIZE STFD [C14] = f110, SIZE sub L = K, KK } ;; { .mmi #ifndef LN STFD [C6 ] = f107, 5 * SIZE #else STFD [C6 ] = f107, - 3 * SIZE #endif STFD [C14] = f111 #ifdef RT shladd AORIG = r2, 3, AORIG #else nop __LINE__ #endif } ;; { .mmf STFD [C7 ] = f112, SIZE STFD [C15] = f116, SIZE mov f112 = f0 } ;; { .mmi STFD [C7 ] = f113, SIZE STFD [C15] = f117, SIZE #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; { .mmi STFD [C7 ] = f114, SIZE STFD [C15] = f118, SIZE #if defined(LT) || defined(RN) shladd AOFFSET = L, 3, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #ifndef LN STFD [C7 ] = f115, 5 * SIZE #else STFD [C7 ] = f115, - 3 * SIZE #endif STFD [C15] = f119 #if defined(LT) || defined(RN) shladd BOFFSET = L, 3, BOFFSET #else nop __LINE__ #endif } ;; { .mmf STFD [C8 ] = f120, SIZE STFD [C16] = f124, SIZE mov f120 = f0 } ;; { .mmi STFD [C8 ] = f121, SIZE STFD [C16] = f125, SIZE #ifdef LT adds KK = 8, KK #elif defined LN adds KK = -8, KK #else nop __LINE__ #endif } ;; { .mmi 
STFD [C8 ] = f122, SIZE STFD [C16] = f126, SIZE #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; { .mmb #ifndef LN STFD [C8 ] = f123, 5 * SIZE #else STFD [C8 ] = f123, - 3 * SIZE #endif STFD [C16] = f127 (p6) br.cond.dptk .L011 } ;; .L020: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p0 = M, 2 (p6) br.cond.dptk .L030 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 2 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE setf.d f73 = r0 mov f65 = f0 } ;; #else { .mfi shladd BOFFSET = r3, 3, B mov f65 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f73 = f0 shladd AOFFSET = r3, 2, AORIG } ;; #endif { .mfi setf.d f105 = r0 mov f81 = f0 adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET mov f89 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f113 = f0 tbit.z p12, p0 = L, 0 } { .mfi setf.d f97 = r0 mov f121 = f0 shr L = L, 1 } ;; { .mmf (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE setf.d f66 = r0 mov f67 = f0 } { .mfi setf.d f74 = r0 mov f75 = f0 adds L = -1, L } ;; { .mmf (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE setf.d f82 = r0 mov f83 = f0 } { .mfi setf.d f90 = r0 mov f91 = f0 cmp.eq p6, p0 = -1, L } ;; { .mmf (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE setf.d f98 = r0 mov f99 = f0 } { .mfi setf.d f106 = r0 mov f107 = f0 mov ar.lc = L } ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE setf.d f114 = r0 mov f115 = f0 } { .mfb setf.d f122 = r0 mov f123 = f0 (p6) br.cond.dpnt .L028 } ;; .L022: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 (p5) adds C9 = 2 * SIZE, C1 } { .mfi nop __LINE__ FMA f104 = f32, f53, f104 // A1 * B6 (p5) adds C10 = 2 * SIZE, C2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 (p5) adds C11 = 2 * SIZE, C3 } { .mfi nop __LINE__ FMA f120 = f32, f55, f120 // A1 * B8 (p5) adds C12 = 2 * SIZE, C4 } ;; { .mfi (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 (p5) adds C13 = 2 * SIZE, C5 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 (p5) adds C14 = 2 * SIZE, C6 } ;; { .mfi (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 (p5) adds C15 = 2 * SIZE, C7 } { .mfi nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 (p5) adds C16 = 2 * SIZE, C8 } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = f34, f51, f90 // A3 * B4 
nop __LINE__ } ;; { .mfb nop __LINE__ FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f106 = f34, f53, f106 // A3 * B6 nop __LINE__ } { .mfb nop __LINE__ FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f122 = f34, f55, f122 // A3 * B8 nop __LINE__ } { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } { .mfb nop __LINE__ FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f107 = f35, f53, f107 // A4 * B6 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f123 = f35, f55, f123 // A4 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f121 = f41, f63, f121 // A2 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f98 = f42, f60, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f106 = f42, f61, f106 // A3 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f122 = f42, f63, f122 // A3 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f91 = f43, f59, f91 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f107 = f43, f61, f107 // A4 * B6 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f115 = f43, f62, f115 // A4 * B7 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f123 = f43, f63, f123 // A4 * B8 br.cloop.sptk.few .L022 } ;; .L028: #if defined(LN) || defined(RT) #ifdef LN 
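// .L028: solve stage for the 4x8 remainder tile. Under LN/RT the pointers are
// first rewound to the diagonal block: r2 = (KK-4)*SIZE for LN, (KK-8)*SIZE
// for RT, then AOFFSET = AORIG + 4*r2 and BOFFSET = B + 8*r2.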
adds r2 = -4, KK #else adds r2 = -8, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 3, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [BOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [BOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [BOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [BOFFSET], 2 * SIZE ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE FSUB f64 = f32, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB f72 = f33, f72 nop __LINE__ } ;; { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE FSUB f80 = f34, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f88 = f35, f88 nop __LINE__ } ;; { .mfi LDFPD f52, f53 = [BOFFSET], 2 * SIZE FSUB f96 = f36, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f104 = f37, f104 nop __LINE__ } ;; { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE FSUB f112 = f38, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f120 = f39, f120 nop __LINE__ } ;; { .mfi LDFPD f56, f57 = [BOFFSET], 2 * SIZE FSUB f65 = f40, f65 nop __LINE__ } { .mfi nop __LINE__ FSUB f73 = f41, f73 nop __LINE__ } ;; { .mfi LDFPD f58, f59 = [BOFFSET], 2 * SIZE FSUB f81 = f42, f81 nop __LINE__ } { .mfi nop __LINE__ FSUB f89 = f43, f89 nop __LINE__ } ;; { .mfi LDFPD f60, f61 = [BOFFSET], 2 * SIZE FSUB f97 = f44, f97 nop __LINE__ } { .mfi nop __LINE__ FSUB f105 = f45, f105 nop __LINE__ } ;; { .mfi LDFPD f62, f63 = [BOFFSET] FSUB f113 = f46, f113 adds BOFFSET = -30 * SIZE, BOFFSET } { .mfi nop __LINE__ FSUB f121 = f47, f121 nop __LINE__ } ;; FSUB f66 = f48, f66 FSUB f74 = f49, f74 FSUB f82 = f50, f82 FSUB f90 = f51, f90 FSUB f98 = f52, f98 FSUB f106 = f53, f106 FSUB f114 = f54, f114 FSUB f122 = f55, f122 ;; FSUB f67 = f56, f67 FSUB f75 = f57, f75 FSUB f83 = f58, f83 FSUB f91 = f59, f91 FSUB f99 = f60, f99 FSUB f107 = f61, f107 FSUB f115 = f62, f115 FSUB f123 = f63, f123 ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [AOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [AOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [AOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [AOFFSET], 2 * SIZE ;; LDFPD f48, f49 = [AOFFSET], 2 * SIZE ;; LDFPD f50, f51 = [AOFFSET], 2 * SIZE ;; LDFPD f52, f53 = [AOFFSET], 2 * SIZE ;; LDFPD f54, f55 = [AOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [AOFFSET], 2 * SIZE ;; LDFPD f58, f59 = [AOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [AOFFSET], 2 * SIZE ;; LDFPD f62, f63 = [AOFFSET] adds AOFFSET = -30 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f72 = f36, f72 FSUB f73 = f37, f73 FSUB f74 = f38, f74 FSUB f75 = f39, f75 FSUB f80 = f40, f80 FSUB f81 = f41, f81 FSUB f82 = f42, f82 FSUB f83 = f43, f83 FSUB f88 = f44, f88 FSUB f89 = f45, f89 FSUB f90 = f46, f90 FSUB f91 = f47, f91 ;; FSUB f96 = f48, f96 FSUB f97 = f49, f97 FSUB f98 = f50, f98 FSUB f99 = f51, f99 ;; FSUB f104 = f52, f104 FSUB f105 = f53, f105 FSUB f106 = f54, f106 FSUB f107 = f55, f107 ;; FSUB f112 = f56, f112 FSUB f113 = f57, f113 FSUB f114 = f58, f114 FSUB f115 = f59, f115 ;; FSUB f120 = f60, f120 FSUB f121 = f61, f121 FSUB f122 = f62, f122 FSUB f123 = f63, f123 ;; #endif #ifdef LN adds AOFFSET = 14 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f35, f34 = [AOFFSET] adds AOFFSET = - 2 * SIZE, 
AOFFSET ;; LDFD f36 = [AOFFSET], - 2 * SIZE ;; LDFPD f38, f37 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f40, f39 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET] ;; FMPY f67 = f67, f32 FMPY f99 = f99, f32 FMPY f75 = f75, f32 FMPY f107 = f107, f32 FMPY f83 = f83, f32 FMPY f115 = f115, f32 FMPY f91 = f91, f32 FMPY f123 = f123, f32 ;; FNMA f66 = f67, f33, f66 FNMA f98 = f99, f33, f98 FNMA f74 = f75, f33, f74 FNMA f106 = f107, f33, f106 FNMA f82 = f83, f33, f82 FNMA f114 = f115, f33, f114 FNMA f90 = f91, f33, f90 FNMA f122 = f123, f33, f122 ;; FNMA f65 = f67, f34, f65 FNMA f97 = f99, f34, f97 FNMA f73 = f75, f34, f73 FNMA f105 = f107, f34, f105 FNMA f81 = f83, f34, f81 FNMA f113 = f115, f34, f113 FNMA f89 = f91, f34, f89 FNMA f121 = f123, f34, f121 ;; FNMA f64 = f67, f35, f64 FNMA f96 = f99, f35, f96 FNMA f72 = f75, f35, f72 FNMA f104 = f107, f35, f104 FNMA f80 = f83, f35, f80 FNMA f112 = f115, f35, f112 FNMA f88 = f91, f35, f88 FNMA f120 = f123, f35, f120 ;; FMPY f66 = f66, f36 FMPY f98 = f98, f36 FMPY f74 = f74, f36 FMPY f106 = f106, f36 FMPY f82 = f82, f36 FMPY f114 = f114, f36 FMPY f90 = f90, f36 FMPY f122 = f122, f36 ;; FNMA f65 = f66, f37, f65 FNMA f97 = f98, f37, f97 FNMA f73 = f74, f37, f73 FNMA f105 = f106, f37, f105 FNMA f81 = f82, f37, f81 FNMA f113 = f114, f37, f113 FNMA f89 = f90, f37, f89 FNMA f121 = f122, f37, f121 ;; FNMA f64 = f66, f38, f64 FNMA f96 = f98, f38, f96 FNMA f72 = f74, f38, f72 FNMA f104 = f106, f38, f104 FNMA f80 = f82, f38, f80 FNMA f112 = f114, f38, f112 FNMA f88 = f90, f38, f88 FNMA f120 = f122, f38, f120 ;; adds BOFFSET = 24 * SIZE, BOFFSET adds BOFFSET2 = 24 * SIZE, BOFFSET2 ;; { .mfi STFD [BOFFSET] = f67, SIZE FMPY f65 = f65, f39 } { .mfi STFD [BOFFSET2] = f99, SIZE FMPY f97 = f97, f39 } ;; { .mfi STFD [BOFFSET] = f75, SIZE FMPY f73 = f73, f39 } { .mfi STFD [BOFFSET2] = f107, SIZE FMPY f105 = f105, f39 } ;; { .mfi STFD [BOFFSET] = f83, SIZE FMPY f81 = f81, f39 } { .mfi STFD [BOFFSET2] = f115, SIZE FMPY f113 = f113, f39 } ;; { .mfi STFD [BOFFSET] = f91, - 11 * SIZE FMPY f89 = f89, f39 } { .mfi STFD [BOFFSET2] = f123, - 11 * SIZE FMPY f121 = f121, f39 } ;; { .mfi STFD [BOFFSET] = f66, SIZE FNMA f64 = f65, f40, f64 } { .mfi STFD [BOFFSET2] = f98, SIZE FNMA f96 = f97, f40, f96 } ;; { .mfi STFD [BOFFSET] = f74, SIZE FNMA f72 = f73, f40, f72 } { .mfi STFD [BOFFSET2] = f106, SIZE FNMA f104 = f105, f40, f104 } ;; { .mfi STFD [BOFFSET] = f82, SIZE FNMA f80 = f81, f40, f80 } { .mfi STFD [BOFFSET2] = f114, SIZE FNMA f112 = f113, f40, f112 } ;; { .mfi STFD [BOFFSET] = f90, -11 * SIZE FNMA f88 = f89, f40, f88 } { .mfi STFD [BOFFSET2] = f122, -11 * SIZE FNMA f120 = f121, f40, f120 } ;; { .mfi STFD [BOFFSET] = f65, SIZE FMPY f64 = f64, f41 } { .mfi STFD [BOFFSET2] = f97, SIZE FMPY f96 = f96, f41 } ;; { .mfi STFD [BOFFSET] = f73, SIZE FMPY f72 = f72, f41 } { .mfi STFD [BOFFSET2] = f105, SIZE FMPY f104 = f104, f41 } ;; { .mfi STFD [BOFFSET] = f81, SIZE FMPY f80 = f80, f41 } { .mfi STFD [BOFFSET2] = f113, SIZE FMPY f112 = f112, f41 } ;; { .mfi STFD [BOFFSET] = f89, - 11 * SIZE FMPY f88 = f88, f41 } { .mfi STFD [BOFFSET2] = f121, - 11 * SIZE FMPY f120 = f120, f41 } ;; { .mmi STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE adds C1 = -4 * SIZE, C1 } ;; { .mmi STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f104, SIZE adds C2 = -4 * SIZE, C2 } ;; { .mmi STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f88, - 3 * SIZE STFD [BOFFSET2] = f120, - 3 * SIZE } ;; #endif #ifdef LT LDFPD f32, f33 = 
[AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f36 = [AOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f39, f40 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET], -15 * SIZE ;; { .mfi FMPY f64 = f64, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f96 = f96, f32 nop __LINE__ } ;; { .mfi FMPY f72 = f72, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f104 = f104, f32 nop __LINE__ } ;; { .mfi FMPY f80 = f80, f32 } { .mfi nop __LINE__ FMPY f112 = f112, f32 nop __LINE__ } ;; { .mfi FMPY f88 = f88, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f120 = f120, f32 nop __LINE__ } ;; { .mfi FNMA f65 = f64, f33, f65 nop __LINE__ } { .mfi nop __LINE__ FNMA f97 = f96, f33, f97 nop __LINE__ } ;; { .mfi FNMA f73 = f72, f33, f73 nop __LINE__ } { .mfi nop __LINE__ FNMA f105 = f104, f33, f105 nop __LINE__ } ;; { .mfi FNMA f81 = f80, f33, f81 } { .mfi nop __LINE__ FNMA f113 = f112, f33, f113 nop __LINE__ } ;; { .mfi FNMA f89 = f88, f33, f89 nop __LINE__ } { .mfi nop __LINE__ FNMA f121 = f120, f33, f121 nop __LINE__ } ;; { .mfi FNMA f66 = f64, f34, f66 nop __LINE__ } { .mfi nop __LINE__ FNMA f98 = f96, f34, f98 nop __LINE__ } ;; { .mfi FNMA f74 = f72, f34, f74 } { .mfi nop __LINE__ FNMA f106 = f104, f34, f106 nop __LINE__ } ;; { .mfi FNMA f82 = f80, f34, f82 nop __LINE__ } { .mfi nop __LINE__ FNMA f114 = f112, f34, f114 nop __LINE__ } ;; { .mfi FNMA f90 = f88, f34, f90 nop __LINE__ } { .mfi nop __LINE__ FNMA f122 = f120, f34, f122 nop __LINE__ } ;; { .mfi FNMA f67 = f64, f35, f67 } { .mfi nop __LINE__ FNMA f99 = f96, f35, f99 nop __LINE__ } ;; { .mfi FNMA f75 = f72, f35, f75 nop __LINE__ } { .mfi nop __LINE__ FNMA f107 = f104, f35, f107 nop __LINE__ } ;; { .mfi FNMA f83 = f80, f35, f83 } { .mfi nop __LINE__ FNMA f115 = f112, f35, f115 nop __LINE__ } ;; { .mfi FNMA f91 = f88, f35, f91 nop __LINE__ } { .mfi nop __LINE__ FNMA f123 = f120, f35, f123 nop __LINE__ } ;; FMPY f65 = f65, f36 FMPY f97 = f97, f36 FMPY f73 = f73, f36 FMPY f105 = f105, f36 FMPY f81 = f81, f36 FMPY f113 = f113, f36 FMPY f89 = f89, f36 FMPY f121 = f121, f36 ;; FNMA f66 = f65, f37, f66 FNMA f98 = f97, f37, f98 FNMA f74 = f73, f37, f74 FNMA f106 = f105, f37, f106 FNMA f82 = f81, f37, f82 FNMA f114 = f113, f37, f114 FNMA f90 = f89, f37, f90 FNMA f122 = f121, f37, f122 ;; FNMA f67 = f65, f38, f67 FNMA f99 = f97, f38, f99 FNMA f75 = f73, f38, f75 FNMA f107 = f105, f38, f107 FNMA f83 = f81, f38, f83 FNMA f115 = f113, f38, f115 FNMA f91 = f89, f38, f91 FNMA f123 = f121, f38, f123 ;; FMPY f66 = f66, f39 FMPY f98 = f98, f39 FMPY f74 = f74, f39 FMPY f106 = f106, f39 FMPY f82 = f82, f39 FMPY f114 = f114, f39 FMPY f90 = f90, f39 FMPY f122 = f122, f39 ;; FNMA f67 = f66, f40, f67 FNMA f99 = f98, f40, f99 FNMA f75 = f74, f40, f75 FNMA f107 = f106, f40, f107 FNMA f83 = f82, f40, f83 FNMA f115 = f114, f40, f115 FNMA f91 = f90, f40, f91 FNMA f123 = f122, f40, f123 ;; FMPY f67 = f67, f41 FMPY f99 = f99, f41 FMPY f75 = f75, f41 FMPY f107 = f107, f41 FMPY f83 = f83, f41 FMPY f115 = f115, f41 FMPY f91 = f91, f41 FMPY f123 = f123, f41 ;; { .mfi STFD [BOFFSET] = f64, SIZE } { .mfi STFD [BOFFSET2] = f96, SIZE } ;; { .mfi STFD [BOFFSET] = f72, SIZE } { .mfi STFD [BOFFSET2] = f104, SIZE } ;; { .mfi STFD [BOFFSET] = f80, SIZE } { .mfi STFD [BOFFSET2] = f112, SIZE } ;; { .mfi STFD [BOFFSET] = f88, 5 * SIZE } { .mfi STFD [BOFFSET2] = f120, 5 * SIZE } ;; { .mfi STFD [BOFFSET] = f65, SIZE } { .mfi STFD [BOFFSET2] = f97, SIZE } ;; { .mfi STFD [BOFFSET] = f73, SIZE } { .mfi STFD 
[BOFFSET2] = f105, SIZE } ;; { .mfi STFD [BOFFSET] = f81, SIZE } { .mfi STFD [BOFFSET2] = f113, SIZE } ;; { .mfi STFD [BOFFSET] = f89, 5 * SIZE } { .mfi STFD [BOFFSET2] = f121, 5 * SIZE } ;; { .mfi STFD [BOFFSET] = f66, SIZE } { .mfi STFD [BOFFSET2] = f98, SIZE } ;; { .mfi STFD [BOFFSET] = f74, SIZE } { .mfi STFD [BOFFSET2] = f106, SIZE } ;; { .mfi STFD [BOFFSET] = f82, SIZE } { .mfi STFD [BOFFSET2] = f114, SIZE } ;; { .mfi STFD [BOFFSET] = f90, 5 * SIZE } { .mfi STFD [BOFFSET2] = f122, 5 * SIZE } ;; { .mfi STFD [BOFFSET] = f67, SIZE } { .mfi STFD [BOFFSET2] = f99, SIZE } ;; { .mfi STFD [BOFFSET] = f75, SIZE } { .mfi STFD [BOFFSET2] = f107, SIZE } ;; { .mfi STFD [BOFFSET] = f83, SIZE } { .mfi STFD [BOFFSET2] = f115, SIZE } ;; { .mfi STFD [BOFFSET] = f91, -27 * SIZE } { .mfi STFD [BOFFSET2] = f123, -27 * SIZE } ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f40 = [BOFFSET], 1 * SIZE ;; LDFPD f41, f42 = [BOFFSET], 2 * SIZE ;; LDFPD f43, f44 = [BOFFSET], 2 * SIZE ;; LDFPD f45, f46 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f47, f48 = [BOFFSET], 2 * SIZE ;; LDFPD f49, f50 = [BOFFSET], 2 * SIZE ;; LDFPD f51, f52 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f53 = [BOFFSET], 1 * SIZE ;; LDFPD f54, f55 = [BOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [BOFFSET] adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f58, f59 = [BOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [BOFFSET] adds BOFFSET = 7 * SIZE, BOFFSET ;; LDFD f16 = [BOFFSET], 1 * SIZE ;; LDFPD f17, f18 = [BOFFSET] adds BOFFSET = 8 * SIZE, BOFFSET ;; LDFPD f19, f20 = [BOFFSET] adds BOFFSET = 9 * SIZE, BOFFSET ;; LDFD f21 = [BOFFSET] adds BOFFSET = -63 * SIZE, BOFFSET ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 FMPY f66 = f66, f32 FMPY f67 = f67, f32 ;; FNMA f72 = f64, f33, f72 FNMA f73 = f65, f33, f73 FNMA f74 = f66, f33, f74 FNMA f75 = f67, f33, f75 ;; FNMA f80 = f64, f34, f80 FNMA f81 = f65, f34, f81 FNMA f82 = f66, f34, f82 FNMA f83 = f67, f34, f83 ;; FNMA f88 = f64, f35, f88 FNMA f89 = f65, f35, f89 FNMA f90 = f66, f35, f90 FNMA f91 = f67, f35, f91 ;; FNMA f96 = f64, f36, f96 FNMA f97 = f65, f36, f97 FNMA f98 = f66, f36, f98 FNMA f99 = f67, f36, f99 ;; FNMA f104 = f64, f37, f104 FNMA f105 = f65, f37, f105 FNMA f106 = f66, f37, f106 FNMA f107 = f67, f37, f107 ;; FNMA f112 = f64, f38, f112 FNMA f113 = f65, f38, f113 FNMA f114 = f66, f38, f114 FNMA f115 = f67, f38, f115 ;; FNMA f120 = f64, f39, f120 FNMA f121 = f65, f39, f121 FNMA f122 = f66, f39, f122 FNMA f123 = f67, f39, f123 ;; FMPY f72 = f72, f40 FMPY f73 = f73, f40 FMPY f74 = f74, f40 FMPY f75 = f75, f40 ;; FNMA f80 = f72, f41, f80 FNMA f81 = f73, f41, f81 FNMA f82 = f74, f41, f82 FNMA f83 = f75, f41, f83 ;; FNMA f88 = f72, f42, f88 FNMA f89 = f73, f42, f89 FNMA f90 = f74, f42, f90 FNMA f91 = f75, f42, f91 ;; FNMA f96 = f72, f43, f96 FNMA f97 = f73, f43, f97 FNMA f98 = f74, f43, f98 FNMA f99 = f75, f43, f99 ;; FNMA f104 = f72, f44, f104 FNMA f105 = f73, f44, f105 FNMA f106 = f74, f44, f106 FNMA f107 = f75, f44, f107 ;; FNMA f112 = f72, f45, f112 FNMA f113 = f73, f45, f113 FNMA f114 = f74, f45, f114 FNMA f115 = f75, f45, f115 ;; FNMA f120 = f72, f46, f120 FNMA f121 = f73, f46, f121 FNMA f122 = f74, f46, f122 FNMA f123 = f75, f46, f123 ;; FMPY f80 = f80, f47 FMPY f81 = f81, f47 FMPY f82 = f82, f47 FMPY f83 = f83, f47 ;; FNMA f88 = f80, f48, f88 FNMA f89 = f81, f48, f89 FNMA f90 = f82, f48, f90 FNMA f91 = f83, f48, f91 ;; FNMA f96 = 
f80, f49, f96 FNMA f97 = f81, f49, f97 FNMA f98 = f82, f49, f98 FNMA f99 = f83, f49, f99 ;; FNMA f104 = f80, f50, f104 FNMA f105 = f81, f50, f105 FNMA f106 = f82, f50, f106 FNMA f107 = f83, f50, f107 ;; FNMA f112 = f80, f51, f112 FNMA f113 = f81, f51, f113 FNMA f114 = f82, f51, f114 FNMA f115 = f83, f51, f115 ;; FNMA f120 = f80, f52, f120 FNMA f121 = f81, f52, f121 FNMA f122 = f82, f52, f122 FNMA f123 = f83, f52, f123 ;; FMPY f88 = f88, f53 FMPY f89 = f89, f53 FMPY f90 = f90, f53 FMPY f91 = f91, f53 ;; FNMA f96 = f88, f54, f96 FNMA f97 = f89, f54, f97 FNMA f98 = f90, f54, f98 FNMA f99 = f91, f54, f99 ;; FNMA f104 = f88, f55, f104 FNMA f105 = f89, f55, f105 FNMA f106 = f90, f55, f106 FNMA f107 = f91, f55, f107 ;; FNMA f112 = f88, f56, f112 FNMA f113 = f89, f56, f113 FNMA f114 = f90, f56, f114 FNMA f115 = f91, f56, f115 ;; FNMA f120 = f88, f57, f120 FNMA f121 = f89, f57, f121 FNMA f122 = f90, f57, f122 FNMA f123 = f91, f57, f123 ;; FMPY f96 = f96, f58 FMPY f97 = f97, f58 FMPY f98 = f98, f58 FMPY f99 = f99, f58 ;; FNMA f104 = f96, f59, f104 FNMA f105 = f97, f59, f105 FNMA f106 = f98, f59, f106 FNMA f107 = f99, f59, f107 ;; FNMA f112 = f96, f60, f112 FNMA f113 = f97, f60, f113 FNMA f114 = f98, f60, f114 FNMA f115 = f99, f60, f115 ;; FNMA f120 = f96, f61, f120 FNMA f121 = f97, f61, f121 FNMA f122 = f98, f61, f122 FNMA f123 = f99, f61, f123 ;; FMPY f104 = f104, f16 FMPY f105 = f105, f16 FMPY f106 = f106, f16 FMPY f107 = f107, f16 ;; FNMA f112 = f104, f17, f112 FNMA f113 = f105, f17, f113 FNMA f114 = f106, f17, f114 FNMA f115 = f107, f17, f115 ;; FNMA f120 = f104, f18, f120 FNMA f121 = f105, f18, f121 FNMA f122 = f106, f18, f122 FNMA f123 = f107, f18, f123 ;; FMPY f112 = f112, f19 FMPY f113 = f113, f19 FMPY f114 = f114, f19 FMPY f115 = f115, f19 ;; FNMA f120 = f112, f20, f120 FNMA f121 = f113, f20, f121 FNMA f122 = f114, f20, f122 FNMA f123 = f115, f20, f123 ;; FMPY f120 = f120, f21 FMPY f121 = f121, f21 FMPY f122 = f122, f21 FMPY f123 = f123, f21 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f72, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f73, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f74, SIZE ;; STFD [AOFFSET] = f67, 5 * SIZE STFD [AOFFSET2] = f75, 5 * SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f88, SIZE ;; STFD [AOFFSET] = f81, SIZE STFD [AOFFSET2] = f89, SIZE ;; STFD [AOFFSET] = f82, SIZE STFD [AOFFSET2] = f90, SIZE ;; STFD [AOFFSET] = f83, 5 * SIZE STFD [AOFFSET2] = f91, 5 * SIZE ;; STFD [AOFFSET] = f96, SIZE STFD [AOFFSET2] = f104, SIZE ;; STFD [AOFFSET] = f97, SIZE STFD [AOFFSET2] = f105, SIZE ;; STFD [AOFFSET] = f98, SIZE STFD [AOFFSET2] = f106, SIZE ;; STFD [AOFFSET] = f99, 5 * SIZE STFD [AOFFSET2] = f107, 5 * SIZE ;; STFD [AOFFSET] = f112, SIZE STFD [AOFFSET2] = f120, SIZE ;; STFD [AOFFSET] = f113, SIZE STFD [AOFFSET2] = f121, SIZE ;; STFD [AOFFSET] = f114, SIZE STFD [AOFFSET2] = f122, SIZE ;; STFD [AOFFSET] = f115, -27 * SIZE STFD [AOFFSET2] = f123, - 27 * SIZE ;; #endif #ifdef RT adds BOFFSET = 62 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f35, f34 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f37, f36 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f39, f38 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f40 = [BOFFSET], -2 * SIZE ;; LDFPD f42, f41 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f44, f43 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f46, f45 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f48, f47 = [BOFFSET] adds BOFFSET = - 2 * SIZE, 
BOFFSET ;; LDFPD f50, f49 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f52, f51 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFD f53 = [BOFFSET], -2 * SIZE ;; LDFPD f55, f54 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f57, f56 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFPD f59, f58 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f61, f60 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFD f16 = [BOFFSET], -2 * SIZE ;; LDFPD f18, f17 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFPD f20, f19 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFD f21 = [BOFFSET] ;; FMPY f120 = f120, f32 FMPY f121 = f121, f32 FMPY f122 = f122, f32 FMPY f123 = f123, f32 ;; FNMA f112 = f120, f33, f112 FNMA f113 = f121, f33, f113 FNMA f114 = f122, f33, f114 FNMA f115 = f123, f33, f115 ;; FNMA f104 = f120, f34, f104 FNMA f105 = f121, f34, f105 FNMA f106 = f122, f34, f106 FNMA f107 = f123, f34, f107 ;; FNMA f96 = f120, f35, f96 FNMA f97 = f121, f35, f97 FNMA f98 = f122, f35, f98 FNMA f99 = f123, f35, f99 ;; FNMA f88 = f120, f36, f88 FNMA f89 = f121, f36, f89 FNMA f90 = f122, f36, f90 FNMA f91 = f123, f36, f91 ;; FNMA f80 = f120, f37, f80 FNMA f81 = f121, f37, f81 FNMA f82 = f122, f37, f82 FNMA f83 = f123, f37, f83 ;; FNMA f72 = f120, f38, f72 FNMA f73 = f121, f38, f73 FNMA f74 = f122, f38, f74 FNMA f75 = f123, f38, f75 ;; FNMA f64 = f120, f39, f64 FNMA f65 = f121, f39, f65 FNMA f66 = f122, f39, f66 FNMA f67 = f123, f39, f67 ;; FMPY f112 = f112, f40 FMPY f113 = f113, f40 FMPY f114 = f114, f40 FMPY f115 = f115, f40 ;; FNMA f104 = f112, f41, f104 FNMA f105 = f113, f41, f105 FNMA f106 = f114, f41, f106 FNMA f107 = f115, f41, f107 ;; FNMA f96 = f112, f42, f96 FNMA f97 = f113, f42, f97 FNMA f98 = f114, f42, f98 FNMA f99 = f115, f42, f99 ;; FNMA f88 = f112, f43, f88 FNMA f89 = f113, f43, f89 FNMA f90 = f114, f43, f90 FNMA f91 = f115, f43, f91 ;; FNMA f80 = f112, f44, f80 FNMA f81 = f113, f44, f81 FNMA f82 = f114, f44, f82 FNMA f83 = f115, f44, f83 ;; FNMA f72 = f112, f45, f72 FNMA f73 = f113, f45, f73 FNMA f74 = f114, f45, f74 FNMA f75 = f115, f45, f75 ;; FNMA f64 = f112, f46, f64 FNMA f65 = f113, f46, f65 FNMA f66 = f114, f46, f66 FNMA f67 = f115, f46, f67 ;; FMPY f104 = f104, f47 FMPY f105 = f105, f47 FMPY f106 = f106, f47 FMPY f107 = f107, f47 ;; FNMA f96 = f104, f48, f96 FNMA f97 = f105, f48, f97 FNMA f98 = f106, f48, f98 FNMA f99 = f107, f48, f99 ;; FNMA f88 = f104, f49, f88 FNMA f89 = f105, f49, f89 FNMA f90 = f106, f49, f90 FNMA f91 = f107, f49, f91 ;; FNMA f80 = f104, f50, f80 FNMA f81 = f105, f50, f81 FNMA f82 = f106, f50, f82 FNMA f83 = f107, f50, f83 ;; FNMA f72 = f104, f51, f72 FNMA f73 = f105, f51, f73 FNMA f74 = f106, f51, f74 FNMA f75 = f107, f51, f75 ;; FNMA f64 = f104, f52, f64 FNMA f65 = f105, f52, f65 FNMA f66 = f106, f52, f66 FNMA f67 = f107, f52, f67 ;; FMPY f96 = f96, f53 FMPY f97 = f97, f53 FMPY f98 = f98, f53 FMPY f99 = f99, f53 ;; FNMA f88 = f96, f54, f88 FNMA f89 = f97, f54, f89 FNMA f90 = f98, f54, f90 FNMA f91 = f99, f54, f91 ;; FNMA f80 = f96, f55, f80 FNMA f81 = f97, f55, f81 FNMA f82 = f98, f55, f82 FNMA f83 = f99, f55, f83 ;; FNMA f72 = f96, f56, f72 FNMA f73 = f97, f56, f73 FNMA f74 = f98, f56, f74 FNMA f75 = f99, f56, f75 ;; FNMA f64 = f96, f57, f64 FNMA f65 = f97, f57, f65 FNMA f66 = f98, f57, f66 FNMA f67 = f99, f57, f67 ;; FMPY f88 = f88, f58 FMPY f89 = f89, f58 FMPY f90 = f90, f58 FMPY f91 = f91, f58 ;; FNMA f80 = f88, f59, f80 FNMA f81 = f89, f59, f81 FNMA f82 = f90, f59, f82 FNMA f83 = f91, f59, f83 ;; FNMA f72 = f88, f60, f72 
FNMA f73 = f89, f60, f73 FNMA f74 = f90, f60, f74 FNMA f75 = f91, f60, f75 ;; FNMA f64 = f88, f61, f64 FNMA f65 = f89, f61, f65 FNMA f66 = f90, f61, f66 FNMA f67 = f91, f61, f67 ;; FMPY f80 = f80, f16 FMPY f81 = f81, f16 FMPY f82 = f82, f16 FMPY f83 = f83, f16 ;; FNMA f72 = f80, f17, f72 FNMA f73 = f81, f17, f73 FNMA f74 = f82, f17, f74 FNMA f75 = f83, f17, f75 ;; FNMA f64 = f80, f18, f64 FNMA f65 = f81, f18, f65 FNMA f66 = f82, f18, f66 FNMA f67 = f83, f18, f67 ;; FMPY f72 = f72, f19 FMPY f73 = f73, f19 FMPY f74 = f74, f19 FMPY f75 = f75, f19 ;; FNMA f64 = f72, f20, f64 FNMA f65 = f73, f20, f65 FNMA f66 = f74, f20, f66 FNMA f67 = f75, f20, f67 ;; FMPY f64 = f64, f21 FMPY f65 = f65, f21 FMPY f66 = f66, f21 FMPY f67 = f67, f21 ;; adds AOFFSET = 24 * SIZE, AOFFSET adds AOFFSET2 = 24 * SIZE, AOFFSET2 ;; STFD [AOFFSET] = f112, SIZE STFD [AOFFSET2] = f120, SIZE ;; STFD [AOFFSET] = f113, SIZE STFD [AOFFSET2] = f121, SIZE ;; STFD [AOFFSET] = f114, SIZE STFD [AOFFSET2] = f122, SIZE ;; STFD [AOFFSET] = f115, - 11 * SIZE STFD [AOFFSET2] = f123, - 11 * SIZE ;; STFD [AOFFSET] = f96, SIZE STFD [AOFFSET2] = f104, SIZE ;; STFD [AOFFSET] = f97, SIZE STFD [AOFFSET2] = f105, SIZE ;; STFD [AOFFSET] = f98, SIZE STFD [AOFFSET2] = f106, SIZE ;; STFD [AOFFSET] = f99, - 11 * SIZE STFD [AOFFSET2] = f107, - 11 * SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f88, SIZE ;; STFD [AOFFSET] = f81, SIZE STFD [AOFFSET2] = f89, SIZE ;; STFD [AOFFSET] = f82, SIZE STFD [AOFFSET2] = f90, SIZE ;; STFD [AOFFSET] = f83, - 11 * SIZE STFD [AOFFSET2] = f91, - 11 * SIZE ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f72, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f73, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f74, SIZE ;; STFD [AOFFSET] = f67, - 3 * SIZE STFD [AOFFSET2] = f75, - 3 * SIZE ;; #endif { .mmf STFD [C1 ] = f64, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE } ;; { .mmi STFD [C1 ] = f66, SIZE #ifdef LN adds C3 = -4 * SIZE, C3 #else nop __LINE__ #endif } ;; { .mmi #ifndef LN STFD [C1 ] = f67, SIZE #else STFD [C1 ] = f67, - 3 * SIZE #endif } ;; { .mmf STFD [C2 ] = f72, SIZE mov f72 = f0 } ;; { .mmi STFD [C2 ] = f73, SIZE #ifdef LN adds C4 = -4 * SIZE, C4 #else nop __LINE__ #endif } ;; { .mmi STFD [C2 ] = f74, SIZE } ;; { .mmi #ifndef LN STFD [C2 ] = f75, SIZE #else STFD [C2 ] = f75, - 3 * SIZE #endif #ifdef LN adds C5 = -4 * SIZE, C5 #else nop __LINE__ #endif } ;; { .mmf STFD [C3 ] = f80, SIZE mov f80 = f0 } ;; { .mmi STFD [C3 ] = f81, SIZE } ;; { .mmi STFD [C3 ] = f82, SIZE #ifdef LN adds C6 = -4 * SIZE, C6 #else nop __LINE__ #endif } ;; { .mmi #ifndef LN STFD [C3 ] = f83, SIZE #else STFD [C3 ] = f83, - 3 * SIZE #endif } ;; { .mmf STFD [C4 ] = f88, SIZE mov f88 = f0 } ;; { .mmi STFD [C4 ] = f89, SIZE #ifdef LN adds C8 = -4 * SIZE, C8 #else nop __LINE__ #endif } ;; { .mmi STFD [C4 ] = f90, SIZE } ;; { .mmi #ifndef LN STFD [C4 ] = f91, SIZE #else STFD [C4 ] = f91, - 3 * SIZE #endif nop __LINE__ } ;; { .mmf STFD [C5 ] = f96, SIZE mov f96 = f0 } ;; { .mmi STFD [C5 ] = f97, SIZE nop __LINE__ } ;; { .mmi STFD [C5 ] = f98, SIZE #ifdef LN adds C7 = -4 * SIZE, C7 #else nop __LINE__ #endif } ;; { .mmi #ifndef LN STFD [C5 ] = f99, SIZE #else STFD [C5 ] = f99, - 3 * SIZE #endif } ;; { .mmf STFD [C6 ] = f104, SIZE mov f104 = f0 } ;; { .mmi STFD [C6 ] = f105, SIZE shladd r2 = K, BASE_SHIFT, r0 } ;; { .mmi STFD [C6 ] = f106, SIZE sub L = K, KK } ;; { .mmi #ifndef LN STFD [C6 ] = f107, SIZE #else STFD [C6 ] = f107, - 3 * SIZE #endif #ifdef RT shladd AORIG = r2, 2, AORIG #else nop __LINE__ #endif } ;; { 
.mmf STFD [C7 ] = f112, SIZE mov f112 = f0 } ;; { .mmi STFD [C7 ] = f113, SIZE #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; { .mmi STFD [C7 ] = f114, SIZE #if defined(LT) || defined(RN) shladd AOFFSET = L, 2, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #ifndef LN STFD [C7 ] = f115, SIZE #else STFD [C7 ] = f115, - 3 * SIZE #endif #if defined(LT) || defined(RN) shladd BOFFSET = L, 3, BOFFSET #else nop __LINE__ #endif } ;; { .mmf STFD [C8 ] = f120, SIZE mov f120 = f0 } ;; { .mmi STFD [C8 ] = f121, SIZE #ifdef LT adds KK = 4, KK #elif defined LN adds KK = -4, KK #else nop __LINE__ #endif } ;; { .mmi STFD [C8 ] = f122, SIZE #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; { .mmb #ifndef LN STFD [C8 ] = f123, SIZE #else STFD [C8 ] = f123, - 3 * SIZE #endif } ;; .align 8 .L030: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p0 = M, 1 (p6) br.cond.dptk .L040 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 1 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE setf.d f73 = r0 mov f65 = f0 } ;; #else { .mfi shladd BOFFSET = r3, 3, B mov f65 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f73 = f0 shladd AOFFSET = r3, 1, AORIG } ;; #endif { .mfi setf.d f105 = r0 mov f81 = f0 adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET mov f89 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f113 = f0 tbit.z p12, p0 = L, 0 } { .mfi setf.d f97 = r0 mov f121 = f0 shr L = L, 1 } ;; { .mmf (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE adds L = -1, L } ;; { .mmf (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE cmp.eq p6, p0 = -1, L } ;; { .mib (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L038 } ;; .L032: { .mfb lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop 
__LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f113 = f41, f62, f113 // A2 * B7 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f121 = f41, f63, f121 // A2 * B8 br.cloop.sptk.few .L032 } ;; .L038: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -2, KK #else adds r2 = -8, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 3, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [BOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [BOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [BOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [BOFFSET] adds BOFFSET = -14 * SIZE, BOFFSET ;; { .mfi FSUB f64 = f32, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB f72 = f33, f72 nop __LINE__ } ;; { .mfi FSUB f80 = f34, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f88 = f35, f88 nop __LINE__ } ;; { .mfi FSUB f96 = f36, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f104 = f37, f104 nop __LINE__ } ;; { .mfi FSUB f112 = f38, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f120 = f39, f120 nop __LINE__ } ;; { .mfi FSUB f65 = f40, f65 nop __LINE__ } { .mfi nop __LINE__ FSUB f73 = f41, f73 nop __LINE__ } ;; { .mfi FSUB f81 = f42, f81 nop __LINE__ } { .mfi nop __LINE__ FSUB f89 = f43, f89 nop __LINE__ } ;; { .mfi FSUB f97 = f44, f97 nop __LINE__ } { .mfi nop __LINE__ FSUB f105 = f45, f105 nop __LINE__ } ;; { .mfi FSUB f113 = f46, f113 } { .mfi nop __LINE__ FSUB f121 = f47, f121 nop __LINE__ } ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [AOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [AOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [AOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [AOFFSET] adds AOFFSET = -14 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f72 = f34, f72 FSUB f73 = f35, f73 FSUB f80 = f36, f80 FSUB f81 = f37, f81 FSUB f88 = f38, f88 FSUB f89 = f39, f89 ;; FSUB f96 = f40, f96 FSUB f97 = f41, f97 ;; FSUB f104 = f42, f104 FSUB f105 = f43, f105 ;; FSUB f112 = f44, f112 FSUB f113 = f45, f113 ;; FSUB f120 = f46, f120 FSUB f121 = f47, f121 ;; #endif #ifdef LN adds AOFFSET = 2 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f34 = [AOFFSET] ;; FMPY f65 = f65, f32 FMPY f97 = f97, f32 FMPY f73 = f73, f32 FMPY f105 = f105, f32 FMPY 
f81 = f81, f32 FMPY f113 = f113, f32 FMPY f89 = f89, f32 FMPY f121 = f121, f32 ;; FNMA f64 = f65, f33, f64 FNMA f96 = f97, f33, f96 FNMA f72 = f73, f33, f72 FNMA f104 = f105, f33, f104 FNMA f80 = f81, f33, f80 FNMA f112 = f113, f33, f112 FNMA f88 = f89, f33, f88 FNMA f120 = f121, f33, f120 ;; FMPY f64 = f64, f34 FMPY f96 = f96, f34 FMPY f72 = f72, f34 FMPY f104 = f104, f34 FMPY f80 = f80, f34 FMPY f112 = f112, f34 FMPY f88 = f88, f34 FMPY f120 = f120, f34 ;; adds BOFFSET = 8 * SIZE, BOFFSET adds BOFFSET2 = 8 * SIZE, BOFFSET2 ;; { .mfi STFD [BOFFSET] = f65, SIZE } { .mfi STFD [BOFFSET2] = f97, SIZE } ;; { .mfi STFD [BOFFSET] = f73, SIZE } { .mfi STFD [BOFFSET2] = f105, SIZE } ;; { .mfi STFD [BOFFSET] = f81, SIZE } { .mfi STFD [BOFFSET2] = f113, SIZE } ;; { .mfi STFD [BOFFSET] = f89, - 11 * SIZE } { .mfi STFD [BOFFSET2] = f121, - 11 * SIZE } ;; { .mmi STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE adds C1 = -2 * SIZE, C1 } ;; { .mmi STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f104, SIZE adds C2 = -2 * SIZE, C2 } ;; { .mmi STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f88, - 3 * SIZE STFD [BOFFSET2] = f120, - 3 * SIZE } ;; adds C3 = -2 * SIZE, C3 adds C4 = -2 * SIZE, C4 adds C5 = -2 * SIZE, C5 adds C6 = -2 * SIZE, C6 adds C7 = -2 * SIZE, C7 adds C8 = -2 * SIZE, C8 ;; #endif #ifdef LT LDFPD f32, f33 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f34 = [AOFFSET], - 3 * SIZE ;; { .mfi FMPY f64 = f64, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f96 = f96, f32 nop __LINE__ } ;; { .mfi FMPY f72 = f72, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f104 = f104, f32 nop __LINE__ } ;; { .mfi FMPY f80 = f80, f32 } { .mfi nop __LINE__ FMPY f112 = f112, f32 nop __LINE__ } ;; { .mfi FMPY f88 = f88, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f120 = f120, f32 nop __LINE__ } ;; { .mfi FNMA f65 = f64, f33, f65 nop __LINE__ } { .mfi nop __LINE__ FNMA f97 = f96, f33, f97 nop __LINE__ } ;; { .mfi FNMA f73 = f72, f33, f73 nop __LINE__ } { .mfi nop __LINE__ FNMA f105 = f104, f33, f105 nop __LINE__ } ;; { .mfi FNMA f81 = f80, f33, f81 } { .mfi nop __LINE__ FNMA f113 = f112, f33, f113 nop __LINE__ } ;; { .mfi FNMA f89 = f88, f33, f89 nop __LINE__ } { .mfi nop __LINE__ FNMA f121 = f120, f33, f121 nop __LINE__ } ;; FMPY f65 = f65, f34 FMPY f97 = f97, f34 FMPY f73 = f73, f34 FMPY f105 = f105, f34 FMPY f81 = f81, f34 FMPY f113 = f113, f34 FMPY f89 = f89, f34 FMPY f121 = f121, f34 ;; { .mfi STFD [BOFFSET] = f64, SIZE } { .mfi STFD [BOFFSET2] = f96, SIZE } ;; { .mfi STFD [BOFFSET] = f72, SIZE } { .mfi STFD [BOFFSET2] = f104, SIZE } ;; { .mfi STFD [BOFFSET] = f80, SIZE } { .mfi STFD [BOFFSET2] = f112, SIZE } ;; { .mfi STFD [BOFFSET] = f88, 5 * SIZE } { .mfi STFD [BOFFSET2] = f120, 5 * SIZE } ;; { .mfi STFD [BOFFSET] = f65, SIZE } { .mfi STFD [BOFFSET2] = f97, SIZE } ;; { .mfi STFD [BOFFSET] = f73, SIZE } { .mfi STFD [BOFFSET2] = f105, SIZE } ;; { .mfi STFD [BOFFSET] = f81, SIZE } { .mfi STFD [BOFFSET2] = f113, SIZE } ;; { .mfi STFD [BOFFSET] = f89, -11 * SIZE } { .mfi STFD [BOFFSET2] = f121, -11 * SIZE } #endif #ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f40 = [BOFFSET], 1 * SIZE ;; LDFPD f41, f42 = [BOFFSET], 2 * SIZE ;; LDFPD f43, f44 = [BOFFSET], 2 * SIZE ;; LDFPD f45, f46 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f47, f48 = [BOFFSET], 2 * SIZE ;; LDFPD f49, f50 = [BOFFSET], 2 * SIZE ;; LDFPD 
f51, f52 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f53 = [BOFFSET], 1 * SIZE ;; LDFPD f54, f55 = [BOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [BOFFSET] adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f58, f59 = [BOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [BOFFSET] adds BOFFSET = 7 * SIZE, BOFFSET ;; LDFD f16 = [BOFFSET], 1 * SIZE ;; LDFPD f17, f18 = [BOFFSET] adds BOFFSET = 8 * SIZE, BOFFSET ;; LDFPD f19, f20 = [BOFFSET] adds BOFFSET = 9 * SIZE, BOFFSET ;; LDFD f21 = [BOFFSET] adds BOFFSET = -63 * SIZE, BOFFSET ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 ;; FNMA f72 = f64, f33, f72 FNMA f73 = f65, f33, f73 ;; FNMA f80 = f64, f34, f80 FNMA f81 = f65, f34, f81 ;; FNMA f88 = f64, f35, f88 FNMA f89 = f65, f35, f89 ;; FNMA f96 = f64, f36, f96 FNMA f97 = f65, f36, f97 ;; FNMA f104 = f64, f37, f104 FNMA f105 = f65, f37, f105 ;; FNMA f112 = f64, f38, f112 FNMA f113 = f65, f38, f113 ;; FNMA f120 = f64, f39, f120 FNMA f121 = f65, f39, f121 ;; FMPY f72 = f72, f40 FMPY f73 = f73, f40 ;; FNMA f80 = f72, f41, f80 FNMA f81 = f73, f41, f81 ;; FNMA f88 = f72, f42, f88 FNMA f89 = f73, f42, f89 ;; FNMA f96 = f72, f43, f96 FNMA f97 = f73, f43, f97 ;; FNMA f104 = f72, f44, f104 FNMA f105 = f73, f44, f105 ;; FNMA f112 = f72, f45, f112 FNMA f113 = f73, f45, f113 ;; FNMA f120 = f72, f46, f120 FNMA f121 = f73, f46, f121 ;; FMPY f80 = f80, f47 FMPY f81 = f81, f47 ;; FNMA f88 = f80, f48, f88 FNMA f89 = f81, f48, f89 ;; FNMA f96 = f80, f49, f96 FNMA f97 = f81, f49, f97 ;; FNMA f104 = f80, f50, f104 FNMA f105 = f81, f50, f105 ;; FNMA f112 = f80, f51, f112 FNMA f113 = f81, f51, f113 ;; FNMA f120 = f80, f52, f120 FNMA f121 = f81, f52, f121 ;; FMPY f88 = f88, f53 FMPY f89 = f89, f53 ;; FNMA f96 = f88, f54, f96 FNMA f97 = f89, f54, f97 ;; FNMA f104 = f88, f55, f104 FNMA f105 = f89, f55, f105 ;; FNMA f112 = f88, f56, f112 FNMA f113 = f89, f56, f113 ;; FNMA f120 = f88, f57, f120 FNMA f121 = f89, f57, f121 ;; FMPY f96 = f96, f58 FMPY f97 = f97, f58 ;; FNMA f104 = f96, f59, f104 FNMA f105 = f97, f59, f105 ;; FNMA f112 = f96, f60, f112 FNMA f113 = f97, f60, f113 ;; FNMA f120 = f96, f61, f120 FNMA f121 = f97, f61, f121 ;; FMPY f104 = f104, f16 FMPY f105 = f105, f16 ;; FNMA f112 = f104, f17, f112 FNMA f113 = f105, f17, f113 ;; FNMA f120 = f104, f18, f120 FNMA f121 = f105, f18, f121 ;; FMPY f112 = f112, f19 FMPY f113 = f113, f19 ;; FNMA f120 = f112, f20, f120 FNMA f121 = f113, f20, f121 ;; FMPY f120 = f120, f21 FMPY f121 = f121, f21 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f80, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f81, SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f88, SIZE ;; STFD [AOFFSET] = f73, 5 * SIZE STFD [AOFFSET2] = f89, 5 * SIZE ;; STFD [AOFFSET] = f96, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f97, SIZE STFD [AOFFSET2] = f113, SIZE ;; STFD [AOFFSET] = f104, SIZE STFD [AOFFSET2] = f120, SIZE ;; STFD [AOFFSET] = f105, -11 * SIZE STFD [AOFFSET2] = f121, - 11 * SIZE ;; #endif #ifdef RT adds BOFFSET = 62 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f35, f34 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f37, f36 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f39, f38 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f40 = [BOFFSET], -2 * SIZE ;; LDFPD f42, f41 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f44, f43 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f46, f45 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f48, f47 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f50, f49 = 
[BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f52, f51 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFD f53 = [BOFFSET], -2 * SIZE ;; LDFPD f55, f54 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f57, f56 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFPD f59, f58 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f61, f60 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFD f16 = [BOFFSET], -2 * SIZE ;; LDFPD f18, f17 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFPD f20, f19 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFD f21 = [BOFFSET] ;; FMPY f120 = f120, f32 FMPY f121 = f121, f32 ;; FNMA f112 = f120, f33, f112 FNMA f113 = f121, f33, f113 ;; FNMA f104 = f120, f34, f104 FNMA f105 = f121, f34, f105 ;; FNMA f96 = f120, f35, f96 FNMA f97 = f121, f35, f97 ;; FNMA f88 = f120, f36, f88 FNMA f89 = f121, f36, f89 ;; FNMA f80 = f120, f37, f80 FNMA f81 = f121, f37, f81 ;; FNMA f72 = f120, f38, f72 FNMA f73 = f121, f38, f73 ;; FNMA f64 = f120, f39, f64 FNMA f65 = f121, f39, f65 ;; FMPY f112 = f112, f40 FMPY f113 = f113, f40 ;; FNMA f104 = f112, f41, f104 FNMA f105 = f113, f41, f105 ;; FNMA f96 = f112, f42, f96 FNMA f97 = f113, f42, f97 ;; FNMA f88 = f112, f43, f88 FNMA f89 = f113, f43, f89 ;; FNMA f80 = f112, f44, f80 FNMA f81 = f113, f44, f81 ;; FNMA f72 = f112, f45, f72 FNMA f73 = f113, f45, f73 ;; FNMA f64 = f112, f46, f64 FNMA f65 = f113, f46, f65 ;; FMPY f104 = f104, f47 FMPY f105 = f105, f47 ;; FNMA f96 = f104, f48, f96 FNMA f97 = f105, f48, f97 ;; FNMA f88 = f104, f49, f88 FNMA f89 = f105, f49, f89 ;; FNMA f80 = f104, f50, f80 FNMA f81 = f105, f50, f81 ;; FNMA f72 = f104, f51, f72 FNMA f73 = f105, f51, f73 ;; FNMA f64 = f104, f52, f64 FNMA f65 = f105, f52, f65 ;; FMPY f96 = f96, f53 FMPY f97 = f97, f53 ;; FNMA f88 = f96, f54, f88 FNMA f89 = f97, f54, f89 ;; FNMA f80 = f96, f55, f80 FNMA f81 = f97, f55, f81 ;; FNMA f72 = f96, f56, f72 FNMA f73 = f97, f56, f73 ;; FNMA f64 = f96, f57, f64 FNMA f65 = f97, f57, f65 ;; FMPY f88 = f88, f58 FMPY f89 = f89, f58 ;; FNMA f80 = f88, f59, f80 FNMA f81 = f89, f59, f81 ;; FNMA f72 = f88, f60, f72 FNMA f73 = f89, f60, f73 ;; FNMA f64 = f88, f61, f64 FNMA f65 = f89, f61, f65 ;; FMPY f80 = f80, f16 FMPY f81 = f81, f16 ;; FNMA f72 = f80, f17, f72 FNMA f73 = f81, f17, f73 ;; FNMA f64 = f80, f18, f64 FNMA f65 = f81, f18, f65 ;; FMPY f72 = f72, f19 FMPY f73 = f73, f19 ;; FNMA f64 = f72, f20, f64 FNMA f65 = f73, f20, f65 ;; FMPY f64 = f64, f21 FMPY f65 = f65, f21 ;; adds AOFFSET = 8 * SIZE, AOFFSET adds AOFFSET2 = 8 * SIZE, AOFFSET2 ;; STFD [AOFFSET] = f96, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f97, SIZE STFD [AOFFSET2] = f113, SIZE ;; STFD [AOFFSET] = f104, SIZE STFD [AOFFSET2] = f120, SIZE ;; STFD [AOFFSET] = f105, - 11 * SIZE STFD [AOFFSET2] = f121, - 11 * SIZE ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f80, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f81, SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f88, SIZE ;; STFD [AOFFSET] = f73, - 3 * SIZE STFD [AOFFSET2] = f89, - 3 * SIZE ;; #endif STFD [C1 ] = f64, SIZE mov f64 = f0 ;; #ifndef LN STFD [C1 ] = f65, SIZE #else STFD [C1 ] = f65, -SIZE #endif ;; STFD [C2 ] = f72, SIZE mov f72 = f0 ;; #ifndef LN STFD [C2 ] = f73, SIZE #else STFD [C2 ] = f73, -SIZE #endif ;; STFD [C3 ] = f80, SIZE mov f80 = f0 ;; #ifndef LN STFD [C3 ] = f81, SIZE #else STFD [C3 ] = f81, - SIZE #endif ;; STFD [C4 ] = f88, SIZE mov f88 = f0 ;; #ifndef LN STFD [C4 ] = f89, SIZE #else STFD [C4 ] = f89, -SIZE #endif ;; STFD [C5 ] = f96, SIZE mov f96 = f0 ;; 
#ifndef LN STFD [C5 ] = f97, SIZE #else STFD [C5 ] = f97, -SIZE #endif ;; STFD [C6 ] = f104, SIZE mov f104 = f0 ;; #ifndef LN STFD [C6 ] = f105, SIZE #else STFD [C6 ] = f105, -SIZE #endif ;; shladd r2 = K, BASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 1, AORIG #else nop __LINE__ #endif ;; STFD [C7 ] = f112, SIZE mov f112 = f0 ;; { .mmi #ifndef LN STFD [C7 ] = f113, SIZE #else STFD [C7 ] = f113, -SIZE #endif #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd AOFFSET = L, 1, AOFFSET #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) shladd BOFFSET = L, 3, BOFFSET #else nop __LINE__ #endif } ;; { .mmf STFD [C8 ] = f120, SIZE mov f120 = f0 } ;; { .mmi #ifndef LN STFD [C8 ] = f121, SIZE #else STFD [C8 ] = f121, -SIZE #endif #ifdef LT adds KK = 2, KK #elif defined LN adds KK = -2, KK #else nop __LINE__ #endif } ;; { .mmi #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } ;; .align 8 .L040: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p0 = M, 0 (p6) br.cond.dptk .L049 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 0 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE } ;; #else { .mfi shladd BOFFSET = r3, 3, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE add AOFFSET = r3, AORIG } ;; #endif { .mmi adds L = 1, L adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mii (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE adds L = -1, L } ;; { .mmi (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE cmp.eq p6, p0 = -1, L } ;; { .mib (p7) LDFD f32 = [AOFFSET], 1 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L048 } ;; .L042: { .mfb lfetch.nt1 [PREB], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfb (p12) cmp.ne p3, p0 = 0, L FMA f72 = f32, f49, f72 // A1 * B2 nop __LINE__ } ;; { .mfi (p3) LDFD f40 = [AOFFSET], 1 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfi (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 adds L = -1, L } { .mmb nop __LINE__ nop __LINE__ nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f120 = f40, f63, f120 // A1 
* B8 nop __LINE__ } { .mmb nop __LINE__ nop __LINE__ br.cloop.sptk.few .L042 } ;; .L048: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -1, KK #else adds r2 = -8, KK #endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 3, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;; #if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; { .mfi FSUB f64 = f32, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB f72 = f33, f72 nop __LINE__ } ;; { .mfi FSUB f80 = f34, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f88 = f35, f88 nop __LINE__ } ;; { .mfi FSUB f96 = f36, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f104 = f37, f104 nop __LINE__ } ;; { .mfi FSUB f112 = f38, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f120 = f39, f120 nop __LINE__ } ;; #else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f80 = f34, f80 FSUB f88 = f35, f88 FSUB f96 = f36, f96 FSUB f104 = f37, f104 FSUB f112 = f38, f112 FSUB f120 = f39, f120 ;; #endif #ifdef LN LDFD f32 = [AOFFSET] ;; FMPY f64 = f64, f32 FMPY f96 = f96, f32 FMPY f72 = f72, f32 FMPY f104 = f104, f32 FMPY f80 = f80, f32 FMPY f112 = f112, f32 FMPY f88 = f88, f32 FMPY f120 = f120, f32 ;; { .mmi STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE adds C1 = -1 * SIZE, C1 } ;; { .mmi STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f104, SIZE adds C2 = -1 * SIZE, C2 } ;; { .mmi STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f88, - 3 * SIZE STFD [BOFFSET2] = f120, - 3 * SIZE } ;; adds C3 = -1 * SIZE, C3 adds C4 = -1 * SIZE, C4 adds C5 = -1 * SIZE, C5 adds C6 = -1 * SIZE, C6 adds C7 = -1 * SIZE, C7 adds C8 = -1 * SIZE, C8 ;; #endif #ifdef LT LDFD f32 = [AOFFSET] ;; { .mfi FMPY f64 = f64, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f96 = f96, f32 nop __LINE__ } ;; { .mfi FMPY f72 = f72, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f104 = f104, f32 nop __LINE__ } ;; { .mfi FMPY f80 = f80, f32 } { .mfi nop __LINE__ FMPY f112 = f112, f32 nop __LINE__ } ;; { .mfi FMPY f88 = f88, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f120 = f120, f32 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f64, SIZE } { .mfi STFD [BOFFSET2] = f96, SIZE } ;; { .mfi STFD [BOFFSET] = f72, SIZE } { .mfi STFD [BOFFSET2] = f104, SIZE } ;; { .mfi STFD [BOFFSET] = f80, SIZE } { .mfi STFD [BOFFSET2] = f112, SIZE } ;; { .mfi STFD [BOFFSET] = f88, -3 * SIZE } { .mfi STFD [BOFFSET2] = f120, -3 * SIZE } ;; #endif #ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f40 = [BOFFSET], 1 * SIZE ;; LDFPD f41, f42 = [BOFFSET], 2 * SIZE ;; LDFPD f43, f44 = [BOFFSET], 2 * SIZE ;; LDFPD f45, f46 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f47, f48 = [BOFFSET], 2 * SIZE ;; LDFPD f49, f50 = [BOFFSET], 2 * SIZE ;; LDFPD f51, f52 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f53 = [BOFFSET], 1 * SIZE ;; LDFPD f54, f55 = [BOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [BOFFSET] adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f58, f59 = [BOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [BOFFSET] adds BOFFSET = 7 * SIZE, BOFFSET ;; LDFD f16 = 
[BOFFSET], 1 * SIZE ;; LDFPD f17, f18 = [BOFFSET] adds BOFFSET = 8 * SIZE, BOFFSET ;; LDFPD f19, f20 = [BOFFSET] adds BOFFSET = 9 * SIZE, BOFFSET ;; LDFD f21 = [BOFFSET] adds BOFFSET = -63 * SIZE, BOFFSET ;; FMPY f64 = f64, f32 ;; FNMA f72 = f64, f33, f72 ;; FNMA f80 = f64, f34, f80 ;; FNMA f88 = f64, f35, f88 ;; FNMA f96 = f64, f36, f96 ;; FNMA f104 = f64, f37, f104 ;; FNMA f112 = f64, f38, f112 ;; FNMA f120 = f64, f39, f120 ;; FMPY f72 = f72, f40 ;; FNMA f80 = f72, f41, f80 ;; FNMA f88 = f72, f42, f88 ;; FNMA f96 = f72, f43, f96 ;; FNMA f104 = f72, f44, f104 ;; FNMA f112 = f72, f45, f112 ;; FNMA f120 = f72, f46, f120 ;; FMPY f80 = f80, f47 ;; FNMA f88 = f80, f48, f88 ;; FNMA f96 = f80, f49, f96 ;; FNMA f104 = f80, f50, f104 ;; FNMA f112 = f80, f51, f112 ;; FNMA f120 = f80, f52, f120 ;; FMPY f88 = f88, f53 ;; FNMA f96 = f88, f54, f96 ;; FNMA f104 = f88, f55, f104 ;; FNMA f112 = f88, f56, f112 ;; FNMA f120 = f88, f57, f120 ;; FMPY f96 = f96, f58 ;; FNMA f104 = f96, f59, f104 ;; FNMA f112 = f96, f60, f112 ;; FNMA f120 = f96, f61, f120 ;; FMPY f104 = f104, f16 ;; FNMA f112 = f104, f17, f112 ;; FNMA f120 = f104, f18, f120 ;; FMPY f112 = f112, f19 ;; FNMA f120 = f112, f20, f120 ;; FMPY f120 = f120, f21 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f96, SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f104, SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f88, -3 * SIZE STFD [AOFFSET2] = f120, - 3 * SIZE ;; #endif #ifdef RT adds BOFFSET = 62 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f35, f34 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f37, f36 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f39, f38 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f40 = [BOFFSET], -2 * SIZE ;; LDFPD f42, f41 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f44, f43 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f46, f45 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f48, f47 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f50, f49 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f52, f51 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFD f53 = [BOFFSET], -2 * SIZE ;; LDFPD f55, f54 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f57, f56 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFPD f59, f58 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f61, f60 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFD f16 = [BOFFSET], -2 * SIZE ;; LDFPD f18, f17 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFPD f20, f19 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFD f21 = [BOFFSET] ;; FMPY f120 = f120, f32 ;; FNMA f112 = f120, f33, f112 ;; FNMA f104 = f120, f34, f104 ;; FNMA f96 = f120, f35, f96 ;; FNMA f88 = f120, f36, f88 ;; FNMA f80 = f120, f37, f80 ;; FNMA f72 = f120, f38, f72 ;; FNMA f64 = f120, f39, f64 ;; FMPY f112 = f112, f40 ;; FNMA f104 = f112, f41, f104 ;; FNMA f96 = f112, f42, f96 ;; FNMA f88 = f112, f43, f88 ;; FNMA f80 = f112, f44, f80 ;; FNMA f72 = f112, f45, f72 ;; FNMA f64 = f112, f46, f64 ;; FMPY f104 = f104, f47 ;; FNMA f96 = f104, f48, f96 ;; FNMA f88 = f104, f49, f88 ;; FNMA f80 = f104, f50, f80 ;; FNMA f72 = f104, f51, f72 ;; FNMA f64 = f104, f52, f64 ;; FMPY f96 = f96, f53 ;; FNMA f88 = f96, f54, f88 ;; FNMA f80 = f96, f55, f80 ;; FNMA f72 = f96, f56, f72 ;; FNMA f64 = f96, f57, f64 ;; FMPY f88 = f88, f58 ;; FNMA f80 = f88, f59, f80 ;; FNMA f72 = f88, f60, f72 ;; FNMA f64 = f88, f61, f64 ;; FMPY f80 = f80, f16 
;; FNMA f72 = f80, f17, f72 ;; FNMA f64 = f80, f18, f64 ;; FMPY f72 = f72, f19 ;; FNMA f64 = f72, f20, f64 ;; FMPY f64 = f64, f21 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f96, SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f104, SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f88, - 3 * SIZE STFD [AOFFSET2] = f120, - 3 * SIZE ;; #endif #ifndef LN STFD [C1 ] = f64, SIZE #else STFD [C1 ] = f64 #endif #ifndef LN STFD [C2 ] = f72, SIZE #else STFD [C2 ] = f72 #endif #ifndef LN STFD [C3 ] = f80, SIZE #else STFD [C3 ] = f80 #endif #ifndef LN STFD [C4 ] = f88, SIZE #else STFD [C4 ] = f88 #endif #ifndef LN STFD [C5 ] = f96, SIZE #else STFD [C5 ] = f96 #endif #ifndef LN STFD [C6 ] = f104, SIZE #else STFD [C6 ] = f104 #endif #ifndef LN STFD [C7 ] = f112, SIZE #else STFD [C7 ] = f112 #endif #ifndef LN STFD [C8 ] = f120, SIZE #else STFD [C8 ] = f120 #endif ;; mov f64 = f0 mov f72 = f0 mov f80 = f0 mov f88 = f0 mov f96 = f0 mov f104 = f0 mov f112 = f0 mov f120 = f0 ;; shladd r2 = K, BASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT add AORIG = r2, AORIG #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0 #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) add AOFFSET = L, AOFFSET #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) shladd BOFFSET = L, 3, BOFFSET #else nop __LINE__ #endif ;; #ifdef LT adds KK = 1, KK #elif defined LN adds KK = -1, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; .align 8 .L049: #ifdef LN shladd KK8 = K, BASE_SHIFT, r0 ;; shladd B = KK8, 3, B #endif #if defined(LT) || defined(RN) mov B = BOFFSET #endif #ifdef RN adds KK = 8, KK #endif #ifdef RT adds KK = -8, KK #endif ;; { .mmi mov AOFFSET = A } ;; { .mmb nop __LINE__ cmp.lt p6, p0 = 0, J (p6) br.cond.dptk .L010 } ;; .align 8 .L999: mov r8 = r0 adds r9 = 1 * 16, SP ;; ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 ;; mov ar.lc = ARLC ;; mov pr = PR, -1 ;; mov ar.pfs = ARPFS ;; br.ret.sptk.many b0 EPILOGUE OpenBLAS-0.2.20/kernel/ia64/xcopy.S000066400000000000000000000247211313527062700164630ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r32 #define X1 r33 #define INCX r34 #define Y1 r35 #define INCY r36 #define PREX r2 #define PREY r3 #define I r14 #define J r15 #define X2 r16 #define Y2 r17 #define INCX2 r18 #define INCY2 r19 #define INCX8 r20 #define INCY8 r21 #define PR r30 #define ARLC r31 #define PREFETCH_SIZE (8 * 16) PROLOGUE .prologue PROFCODE { .mmi .save ar.lc, ARLC mov ARLC = ar.lc } { .mib cmp.lt p0, p6 = r0, N shr I = N, 3 (p6) br.ret.sptk.many b0 } ;; shl INCX = INCX, ZBASE_SHIFT shl INCY = INCY, ZBASE_SHIFT ;; .body { .mmi sub r8 = X1, Y1 mov r9 = 0xf0 mov PR = pr } { .mmi shladd INCX2 = INCX, 1, r0 shladd INCY2 = INCY, 1, r0 and J = 15, N } ;; { .mmi shladd INCX8 = INCX, 2, r0 shladd INCY8 = INCY, 2, r0 mov pr.rot = 0 } { .mmi and r8 = r9, r8 cmp.eq p9, p0 = r0, J adds I = -1, I } ;; { .mmi adds X2 = 1 * SIZE, X1 adds Y2 = 1 * SIZE, Y1 mov ar.ec = 4 } { .mmb cmp.gt p6, p0 = 127, r8 cmp.eq p16, p0 = r0, r0 (p6) br.cond.dpnt .L20 } ;; { .mmi adds PREX = (PREFETCH_SIZE + 0) * SIZE, X1 adds PREY = (PREFETCH_SIZE + 2) * SIZE, Y1 mov ar.lc = I } { .mib cmp.eq p8 ,p0 = -1, I tbit.z p0, p12 = N, 2 (p8) br.cond.dpnt .L15 } ;; .align 16 .L12: { .mmi (p19) STFD [Y1] = f35 (p19) STFD [Y2] = f39 (p19) add Y1 = INCY, Y1 } { .mmi (p17) LDFD f81 = [X1], INCX (p17) LDFD f85 = [X2], INCX (p19) add Y2 = INCY, Y2 } ;; { .mmi (p19) STFD [Y1] = f43 (p19) STFD [Y2] = f47 (p19) add Y1 = INCY, Y1 } { .mmi (p17) LDFD f89 = [X1], INCX (p17) LDFD f93 = [X2], INCX (p19) add Y2 = INCY, Y2 } ;; { .mmi (p19) STFD [Y1] = f51 (p19) STFD [Y2] = f55 (p19) add Y1 = INCY, Y1 } { .mmi (p16) LDFD f32 = [X1], INCX (p16) LDFD f36 = [X2], INCX (p19) add Y2 = INCY, Y2 } ;; { .mmi (p19) STFD [Y1] = f59 (p19) STFD [Y2] = f63 (p19) add Y1 = INCY, Y1 } { .mmi lfetch.fault.nt1 [PREX], INCX8 lfetch.fault.excl.nt1 [PREY], INCY8 (p19) add Y2 = INCY, Y2 } ;; { .mmi (p16) LDFD f40 = [X1], INCX (p16) LDFD f44 = [X2], INCX nop __LINE__ } ;; { .mmi (p19) STFD [Y1] = f67 (p19) STFD [Y2] = f71 (p19) add Y1 = INCY, Y1 } { .mmi (p16) LDFD f48 = [X1], INCX (p16) LDFD f52 = [X2], INCX (p19) add Y2 = INCY, Y2 } ;; { .mmi (p19) STFD [Y1] = f75 (p19) STFD [Y2] = f79 (p19) add Y1 = INCY, Y1 } { .mmi (p16) LDFD f56 = [X1], INCX (p16) LDFD f60 = [X2], INCX (p19) add Y2 = INCY, Y2 } ;; { .mmi (p19) STFD [Y1] = f83 (p19) STFD [Y2] = f87 (p19) add Y1 = INCY, Y1 } { .mmi lfetch.fault.nt1 [PREX], INCX8 lfetch.fault.excl.nt1 [PREY], INCY8 (p19) add Y2 = INCY, Y2 } ;; { .mmi (p19) STFD [Y1] = f91 (p19) STFD [Y2] = f95 (p19) add Y1 = INCY, Y1 } { .mmi (p16) LDFD f64 = [X1], INCX (p16) LDFD f68 = [X2], INCX (p19) add Y2 = INCY, Y2 } ;; { 
.mmb (p16) LDFD f72 = [X1], INCX (p16) LDFD f76 = [X2], INCX br.ctop.sptk.few .L12 } ;; .align 32 .L15: { .mmi (p12) LDFD f48 = [X1], INCX (p12) LDFD f49 = [X2], INCX mov ar.lc = ARLC } ;; { .mmi (p12) LDFD f50 = [X1], INCX (p12) LDFD f51 = [X2], INCX mov pr = PR, -65474 } ;; { .mmb (p12) LDFD f52 = [X1], INCX (p12) LDFD f53 = [X2], INCX (p9) br.ret.sptk.many b0 } ;; { .mmi (p12) LDFD f54 = [X1], INCX (p12) LDFD f55 = [X2], INCX tbit.z p0, p13 = N, 1 } ;; { .mmi (p13) LDFD f56 = [X1], INCX (p13) LDFD f57 = [X2], INCX tbit.z p0, p14 = N, 0 } ;; { .mmi (p13) LDFD f58 = [X1], INCX (p13) LDFD f59 = [X2], INCX } ;; { .mmi (p12) STFD [Y1] = f48 (p12) STFD [Y2] = f49 (p12) add Y1 = INCY, Y1 } { .mmi (p14) LDFD f60 = [X1], INCX (p14) LDFD f61 = [X2], INCX (p12) add Y2 = INCY, Y2 } ;; { .mmi (p12) STFD [Y1] = f50 (p12) STFD [Y2] = f51 (p12) add Y1 = INCY, Y1 } { .mmi nop __LINE__ (p12) add Y2 = INCY, Y2 } ;; { .mmi (p12) STFD [Y1] = f52 (p12) STFD [Y2] = f53 (p12) add Y1 = INCY, Y1 } { .mmi nop __LINE__ nop __LINE__ (p12) add Y2 = INCY, Y2 } ;; { .mmi (p12) STFD [Y1] = f54 (p12) STFD [Y2] = f55 (p12) add Y1 = INCY, Y1 } { .mmi nop __LINE__ nop __LINE__ (p12) add Y2 = INCY, Y2 } ;; { .mmi (p13) STFD [Y1] = f56 (p13) STFD [Y2] = f57 (p13) add Y1 = INCY, Y1 } { .mmi nop __LINE__ nop __LINE__ (p13) add Y2 = INCY, Y2 } ;; { .mmi (p13) STFD [Y1] = f58 (p13) STFD [Y2] = f59 (p13) add Y1 = INCY, Y1 } { .mmi nop __LINE__ nop __LINE__ (p13) add Y2 = INCY, Y2 } ;; { .mmb (p14) STFD [Y1] = f60 (p14) STFD [Y2] = f61 br.ret.sptk.many b0 } ;; .align 16 .L20: { .mmi adds PREX = (PREFETCH_SIZE + 0) * SIZE, X1 adds PREY = (PREFETCH_SIZE + 10) * SIZE, Y1 mov ar.lc = I } { .mib cmp.eq p8 ,p0 = -1, I tbit.z p0, p12 = N, 2 (p8) br.cond.dpnt .L25 } ;; .align 16 .L22: { .mmi (p19) STFD [Y1] = f67 (p19) STFD [Y2] = f71 (p19) add Y1 = INCY, Y1 } { .mmi (p17) LDFD f81 = [X1], INCX (p17) LDFD f85 = [X2], INCX (p19) add Y2 = INCY, Y2 } ;; { .mmi (p19) STFD [Y1] = f75 (p19) STFD [Y2] = f79 (p19) add Y1 = INCY, Y1 } { .mmi (p17) LDFD f89 = [X1], INCX (p17) LDFD f93 = [X2], INCX (p19) add Y2 = INCY, Y2 } ;; { .mmi (p19) STFD [Y1] = f83 (p19) STFD [Y2] = f87 (p19) add Y1 = INCY, Y1 } { .mmi (p16) LDFD f32 = [X1], INCX (p16) LDFD f36 = [X2], INCX (p19) add Y2 = INCY, Y2 } ;; { .mmi (p19) STFD [Y1] = f91 (p19) STFD [Y2] = f95 (p19) add Y1 = INCY, Y1 } { .mmi lfetch.fault.nt1 [PREX], INCX8 lfetch.fault.excl.nt1 [PREY], INCY8 (p19) add Y2 = INCY, Y2 } ;; { .mmi (p16) LDFD f40 = [X1], INCX (p16) LDFD f44 = [X2], INCX nop __LINE__ } ;; { .mmi (p18) STFD [Y1] = f34 (p18) STFD [Y2] = f38 (p18) add Y1 = INCY, Y1 } { .mmi (p16) LDFD f48 = [X1], INCX (p16) LDFD f52 = [X2], INCX (p18) add Y2 = INCY, Y2 } ;; { .mmi (p18) STFD [Y1] = f42 (p18) STFD [Y2] = f46 (p18) add Y1 = INCY, Y1 } { .mmi (p16) LDFD f56 = [X1], INCX (p16) LDFD f60 = [X2], INCX (p18) add Y2 = INCY, Y2 } ;; { .mmi (p18) STFD [Y1] = f50 (p18) STFD [Y2] = f54 (p18) add Y1 = INCY, Y1 } { .mmi lfetch.fault.nt1 [PREX], INCX8 lfetch.fault.excl.nt1 [PREY], INCY8 (p18) add Y2 = INCY, Y2 } ;; { .mmi (p18) STFD [Y1] = f58 (p18) STFD [Y2] = f62 (p18) add Y1 = INCY, Y1 } { .mmi (p16) LDFD f64 = [X1], INCX (p16) LDFD f68 = [X2], INCX (p18) add Y2 = INCY, Y2 } ;; { .mmb (p16) LDFD f72 = [X1], INCX (p16) LDFD f76 = [X2], INCX br.ctop.sptk.few .L22 } ;; .align 32 .L25: { .mmi (p12) LDFD f48 = [X1], INCX (p12) LDFD f49 = [X2], INCX mov ar.lc = ARLC } ;; { .mmi (p12) LDFD f50 = [X1], INCX (p12) LDFD f51 = [X2], INCX mov pr = PR, -65474 } ;; { .mmb (p12) LDFD f52 = [X1], INCX (p12) LDFD f53 = 
[X2], INCX (p9) br.ret.sptk.many b0 } ;; { .mmi (p12) LDFD f54 = [X1], INCX (p12) LDFD f55 = [X2], INCX tbit.z p0, p13 = N, 1 } ;; { .mmi (p13) LDFD f56 = [X1], INCX (p13) LDFD f57 = [X2], INCX tbit.z p0, p14 = N, 0 } ;; { .mmi (p13) LDFD f58 = [X1], INCX (p13) LDFD f59 = [X2], INCX } ;; { .mmi (p12) STFD [Y1] = f48 (p12) STFD [Y2] = f49 (p12) add Y1 = INCY, Y1 } { .mmi (p14) LDFD f60 = [X1], INCX (p14) LDFD f61 = [X2], INCX (p12) add Y2 = INCY, Y2 } ;; { .mmi (p12) STFD [Y1] = f50 (p12) STFD [Y2] = f51 (p12) add Y1 = INCY, Y1 } { .mmi nop __LINE__ (p12) add Y2 = INCY, Y2 } ;; { .mmi (p12) STFD [Y1] = f52 (p12) STFD [Y2] = f53 (p12) add Y1 = INCY, Y1 } { .mmi nop __LINE__ nop __LINE__ (p12) add Y2 = INCY, Y2 } ;; { .mmi (p12) STFD [Y1] = f54 (p12) STFD [Y2] = f55 (p12) add Y1 = INCY, Y1 } { .mmi nop __LINE__ nop __LINE__ (p12) add Y2 = INCY, Y2 } ;; { .mmi (p13) STFD [Y1] = f56 (p13) STFD [Y2] = f57 (p13) add Y1 = INCY, Y1 } { .mmi nop __LINE__ nop __LINE__ (p13) add Y2 = INCY, Y2 } ;; { .mmi (p13) STFD [Y1] = f58 (p13) STFD [Y2] = f59 (p13) add Y1 = INCY, Y1 } { .mmi nop __LINE__ nop __LINE__ (p13) add Y2 = INCY, Y2 } ;; { .mmb (p14) STFD [Y1] = f60 (p14) STFD [Y2] = f61 br.ret.sptk.many b0 } ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/xdot.S000066400000000000000000000240641313527062700162770ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCH_SIZE (4 * 24) #ifdef F_INTERFACE #define N r33 #define X1 r34 #define INCX r35 #define Y1 r36 #define INCY r37 #else #define N r32 #define X1 r33 #define INCX r34 #define Y1 r35 #define INCY r36 #endif #define PREX1 r2 #define PREY1 r3 #define I r14 #define J r15 #define Y2 r16 #define X2 r17 #define INCX4 r24 #define INCY4 r25 #define PR r30 #define ARLC r31 PROLOGUE .prologue PROFCODE { .mfi nop __LINE__ mov f8 = f0 .save ar.lc, ARLC mov ARLC = ar.lc } { .mfi mov r26 = 1 mov f9 = f0 nop __LINE__ } ;; .body #ifdef F_INTERFACE LDINT N = [N] LDINT INCX = [INCX] LDINT INCY = [INCY] ;; #ifndef USE64BITINT sxt4 N = N sxt4 INCX = INCX sxt4 INCY = INCY ;; #endif cmp.le p0, p6 = r0, INCX cmp.le p0, p7 = r0, INCY sub r26 = r26, N ;; setf.sig f32 = r26 setf.sig f33 = INCX setf.sig f34 = INCY ;; xmpy.l f33 = f32, f33 xmpy.l f34 = f32, f34 ;; getf.sig r26 = f33 getf.sig r27 = f34 ;; shl r26 = r26, ZBASE_SHIFT shl r27 = r27, ZBASE_SHIFT ;; (p6) add X1 = r26, X1 (p7) add Y1 = r27, Y1 ;; #endif { .mfi adds PREX1 = (PREFETCH_SIZE + 2) * SIZE, X1 mov f10 = f0 mov PR = pr } { .mfb cmp.lt p0, p6 = r0, N mov f11 = f0 (p6) br.cond.spnt .L1000 } ;; { .mii adds PREY1 = (PREFETCH_SIZE + 2) * SIZE, Y1 shl INCX = INCX, ZBASE_SHIFT shl INCY = INCY, ZBASE_SHIFT } ;; { .mfi add X2 = SIZE, X1 mov f12 = f0 mov pr.rot= 0 } { .mfi add Y2 = SIZE, Y1 mov f13 = f0 shr I = N, 3 } ;; { .mfi adds I = -1, I mov f14 = f0 mov ar.ec= 3 } { .mmf shladd INCX4 = INCX, 2, r0 shladd INCY4 = INCY, 2, r0 mov f15 = f0 } ;; { .mmi and J = 7, N cmp.eq p16, p0 = r0, r0 mov ar.lc = I } { .mib cmp.eq p6 ,p0 = -1, I tbit.nz p12, p0 = N, 2 (p6) br.cond.dpnt .L215 } ;; .align 32 .L212: { .mmf (p16) lfetch.nt1 [PREX1], INCX4 (p16) LDFD f80 = [X1], INCX (p18) FMA f8 = f34, f82, f8 } { .mmf (p16) LDFD f83 = [X2], INCX nop __LINE__ (p18) FMA f9 = f37, f82, f9 } ;; { .mmf (p16) LDFD f32 = [Y1], INCY (p16) LDFD f35 = [Y2], INCY (p18) FMA f10 = f34, f85, f10 } { .mmf nop __LINE__ nop __LINE__ (p18) FMA f11 = f37, f85, f11 } ;; { .mmf (p16) LDFD f86 = [X1], INCX (p16) LDFD f89 = [X2], INCX (p18) FMA f12 = f40, f88, f12 } { .mmf nop __LINE__ nop __LINE__ (p18) FMA f13 = f43, f88, f13 } ;; { .mmf (p16) LDFD f38 = [Y1], INCY (p16) LDFD f41 = [Y2], INCY (p18) FMA f14 = f40, f91, f14 } { .mmf nop __LINE__ nop __LINE__ (p18) FMA f15 = f43, f91, f15 } ;; { .mmf (p16) LDFD f92 = [X1], INCX (p16) LDFD f95 = [X2], INCX (p18) FMA f8 = f46, f94, f8 } { .mmf nop __LINE__ nop __LINE__ (p18) FMA f9 = f49, f94, f9 } ;; { .mmf (p16) lfetch.nt1 [PREY1], INCY4 (p16) LDFD f44 = [Y1], INCY (p18) FMA f10 = f46, f97, f10 } { .mmf (p16) LDFD f47 = [Y2], INCY nop __LINE__ (p18) FMA f11 = f49, f97, f11 } ;; { .mmf (p16) LDFD f98 = [X1], INCX (p16) LDFD f101 = [X2], INCX (p18) FMA f12 = f52, f100, f12 } { .mmf nop __LINE__ nop __LINE__ (p18) FMA f13 = f55, f100, f13 } ;; { .mmf (p16) LDFD f50 = [Y1], INCY (p16) LDFD f53 = [Y2], INCY (p18) FMA f14 = f52, f103, f14 } { .mmf nop __LINE__ nop __LINE__ (p18) FMA f15 = f55, f103, f15 } ;; { .mmf (p16) lfetch.nt1 [PREX1], INCX4 (p16) LDFD f104 = [X1], INCX (p18) FMA f8 = f58, f106, f8 } { .mmf (p16) LDFD f107 = [X2], INCX nop __LINE__ (p18) FMA f9 = f61, f106, f9 } ;; { .mmf (p16) LDFD f56 = [Y1], INCY (p16) LDFD f59 = [Y2], INCY (p18) FMA f10 = f58, f109, f10 } { .mmf nop __LINE__ nop __LINE__ (p18) FMA f11 = f61, f109, f11 } ;; { .mmf (p16) LDFD f110 = [X1], INCX (p16) LDFD f113 = [X2], INCX (p18) FMA f12 = 
f64, f112, f12 } { .mmf nop __LINE__ nop __LINE__ (p18) FMA f13 = f67, f112, f13 } ;; { .mmf (p16) LDFD f62 = [Y1], INCY (p16) LDFD f65 = [Y2], INCY (p18) FMA f14 = f64, f115, f14 } { .mmf nop __LINE__ nop __LINE__ (p18) FMA f15 = f67, f115, f15 } ;; { .mmf (p16) lfetch.nt1 [PREY1], INCY4 (p16) LDFD f116 = [X1], INCX (p18) FMA f8 = f70, f118, f8 } { .mmf (p16) LDFD f119 = [X2], INCX nop __LINE__ (p18) FMA f9 = f73, f118, f9 } ;; { .mmf (p16) LDFD f68 = [Y1], INCY (p16) LDFD f71 = [Y2], INCY (p18) FMA f10 = f70, f121, f10 } { .mmf nop __LINE__ nop __LINE__ (p18) FMA f11 = f73, f121, f11 } ;; { .mmf (p16) LDFD f122 = [X1], INCX (p16) LDFD f125 = [X2], INCX (p18) FMA f12 = f76, f124, f12 } { .mmf nop __LINE__ nop __LINE__ (p18) FMA f13 = f79, f124, f13 } ;; { .mmf (p16) LDFD f74 = [Y1], INCY (p16) LDFD f77 = [Y2], INCY (p18) FMA f14 = f76, f127, f14 } { .mfb nop __LINE__ (p18) FMA f15 = f79, f127, f15 br.ctop.sptk.few .L212 } ;; .align 32 .L215: { .mmi (p12) LDFD f48 = [X1], INCX (p12) LDFD f49 = [X2], INCX cmp.eq p7, p0 = r0, J } ;; { .mmb (p12) LDFD f32 = [Y1], INCY (p12) LDFD f33 = [Y2], INCY (p7) br.cond.dptk .L999 } ;; { .mmi (p12) LDFD f50 = [X1], INCX (p12) LDFD f51 = [X2], INCX tbit.nz p13, p0 = N, 1 } ;; { .mmi (p12) LDFD f34 = [Y1], INCY (p12) LDFD f35 = [Y2], INCY nop __LINE__ } ;; { .mmi (p12) LDFD f52 = [X1], INCX (p12) LDFD f53 = [X2], INCX tbit.nz p14, p0 = N, 0 } ;; { .mmi (p12) LDFD f36 = [Y1], INCY (p12) LDFD f37 = [Y2], INCY nop __LINE__ } ;; { .mmf (p12) LDFD f54 = [X1], INCX (p12) LDFD f55 = [X2], INCX (p12) FMA f8 = f32, f48, f8 } { .mmf nop __LINE__ nop __LINE__ (p12) FMA f9 = f33, f48, f9 } ;; { .mmf (p12) LDFD f38 = [Y1], INCY (p12) LDFD f39 = [Y2], INCY (p12) FMA f10 = f32, f49, f10 } { .mmf nop __LINE__ nop __LINE__ (p12) FMA f11 = f33, f49, f11 } ;; { .mmf (p13) LDFD f56 = [X1], INCX (p13) LDFD f57 = [X2], INCX (p12) FMA f12 = f34, f50, f12 } { .mmf nop __LINE__ nop __LINE__ (p12) FMA f13 = f35, f50, f13 } ;; { .mmf (p13) LDFD f40 = [Y1], INCY (p13) LDFD f41 = [Y2], INCY (p12) FMA f14 = f34, f51, f14 } { .mmf nop __LINE__ nop __LINE__ (p12) FMA f15 = f35, f51, f15 } ;; { .mmf (p13) LDFD f58 = [X1], INCX (p13) LDFD f59 = [X2], INCX (p12) FMA f8 = f36, f52, f8 } { .mmf nop __LINE__ nop __LINE__ (p12) FMA f9 = f37, f52, f9 } ;; { .mmf (p13) LDFD f42 = [Y1], INCY (p13) LDFD f43 = [Y2], INCY (p12) FMA f10 = f36, f53, f10 } { .mmf nop __LINE__ nop __LINE__ (p12) FMA f11 = f37, f53, f11 } ;; { .mmf (p14) LDFD f60 = [X1] (p14) LDFD f61 = [X2] (p12) FMA f12 = f38, f54, f12 } { .mmf nop __LINE__ nop __LINE__ (p12) FMA f13 = f39, f54, f13 } ;; { .mmf (p14) LDFD f44 = [Y1] (p14) LDFD f45 = [Y2] (p12) FMA f14 = f38, f55, f14 } { .mmf nop __LINE__ nop __LINE__ (p12) FMA f15 = f39, f55, f15 } ;; (p13) FMA f8 = f40, f56, f8 (p13) FMA f9 = f41, f56, f9 (p13) FMA f10 = f40, f57, f10 (p13) FMA f11 = f41, f57, f11 (p13) FMA f12 = f42, f58, f12 (p13) FMA f13 = f43, f58, f13 (p13) FMA f14 = f42, f59, f14 (p13) FMA f15 = f43, f59, f15 ;; (p14) FMA f8 = f44, f60, f8 (p14) FMA f9 = f45, f60, f9 (p14) FMA f10 = f44, f61, f10 (p14) FMA f11 = f45, f61, f11 ;; .align 32 .L999: FADD f8 = f8, f12 FADD f9 = f9, f13 FADD f10 = f10, f14 FADD f11 = f11, f15 mov ar.lc = ARLC ;; #ifndef CONJ FSUB f8 = f8, f11 FADD f9 = f9, f10 #else FADD f8 = f8, f11 FSUB f9 = f9, f10 #endif ;; .align 32 .L1000: #ifdef F_INTERFACE STFD [r32] = f8, SIZE ;; STFD [r32] = f9, SIZE #endif mov pr = PR, -65474 br.ret.sptk.many b0 EPILOGUE 
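The xdot kernel that ends above keeps eight partial sums (f8..f15: the real*real, imag*real, real*imag and imag*imag products of two interleaved element streams) live across its software-pipelined loop and only folds them together at .L999, which is why the CONJ distinction appears solely in the final FADD/FSUB pair. As a reading aid, a minimal C sketch of the same reduction follows, assuming an interleaved real/imaginary layout and a long double element type; the function name and signature are illustrative only and are not part of OpenBLAS, and negative increments (which the assembly handles under F_INTERFACE by pre-adjusting the start pointers) are omitted.

/* Reference model (not OpenBLAS code) of the complex dot product
 * computed by xdot.S: produces x.y, or conj(x).y when conj_flag is
 * nonzero.  incx/incy are counted in complex elements.             */
static void xdot_ref(long n, const long double *x, long incx,
                     const long double *y, long incy,
                     long double *re, long double *im, int conj_flag)
{
    long double rr = 0.0L, ri = 0.0L;   /* running real/imag sums  */
    for (long i = 0; i < n; i++) {
        long double xr = x[0], xi = x[1];
        long double yr = y[0], yi = y[1];
        if (conj_flag) {                /* CONJ:   conj(x) * y     */
            rr += xr * yr + xi * yi;
            ri += xr * yi - xi * yr;
        } else {                        /* no CONJ:      x * y     */
            rr += xr * yr - xi * yi;
            ri += xr * yi + xi * yr;
        }
        x += 2 * incx;                  /* interleaved re/im pairs */
        y += 2 * incy;
    }
    *re = rr;
    *im = ri;
}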
OpenBLAS-0.2.20/kernel/ia64/zaxpy.S000066400000000000000000000403041313527062700164670ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef XDOUBLE #define PREFETCH_SIZE ( 8 * 16) #elif defined(DOUBLE) #define PREFETCH_SIZE (16 * 16) #else #define PREFETCH_SIZE (32 * 16) #endif #ifndef CONJ #define FMA1 FNMA #define FMA2 FMA #else #define FMA1 FMA #define FMA2 FNMA #endif #define SP r12 #ifdef XDOUBLE #define N r32 #define X1 r14 #define INCX r15 #define Y1 r16 #define INCY r17 #else #define N r32 #define X1 r37 #define INCX r38 #define Y1 r39 #define INCY r36 #endif #define PREX1 r2 #define PREY1 r3 #define I r18 #define J r19 #define Y2 r20 #define X2 r21 #define INCX8 r22 #define INCY8 r23 #define YY1 r24 #define YY2 r25 #define YY3 r26 #define YY4 r27 #define INCX2M1 loc0 #define INCY2M1 loc1 #define INCX4M1 loc2 #define INCY4M1 loc3 #define X3 loc4 #define Y3 loc5 #define X4 loc6 #define Y4 loc7 #define PREX2 loc8 #define PREY2 loc9 #define ARLC r29 #define PR r30 #define ALPHA_R f8 #define ALPHA_I f9 PROLOGUE .prologue PROFCODE { .mmi adds r14 = 16, SP adds r15 = 24, SP adds r16 = 32, SP } { .mmb adds r17 = 40, SP cmp.gt p15, p0 = r0, N (p15) br.ret.sptk.many b0 } ;; #ifdef XDOUBLE { .mmi ld8 X1 = [r14] ld8 INCX = [r15] nop __LINE__ } { .mmi ld8 Y1 = [r16] ld8 INCY = [r17] nop __LINE__ } ;; #else { .mmi ld8 INCY = [r14] nop __LINE__ nop __LINE__ } ;; #endif { .mmi .save ar.pfs, r10 alloc r10 = ar.pfs, 8, 16, 0, 0 and J = 7, N shl INCX = INCX, ZBASE_SHIFT } { .mmi adds PREX1 = (PREFETCH_SIZE + 2) * SIZE, X1 adds PREY1 = (PREFETCH_SIZE + 2) * SIZE, Y1 shl INCY = INCY, ZBASE_SHIFT } ;; { .mmi shladd INCX8 = INCX, 3, r0 shladd INCY8 = INCY, 3, r0 .save ar.lc, ARLC mov ARLC = ar.lc } { .mmi adds INCX2M1 = -SIZE, INCX adds INCY2M1 = -SIZE, INCY shr I = N, 3 } ;; { .mmi add INCX2M1 = INCX2M1, INCX add INCY2M1 = INCY2M1, INCY mov PR = pr } { .mmi add X2 = X1, INCX add Y2 = Y1, INCY nop __LINE__ } ;; { .mmi shladd INCX4M1 = INCX, 1, INCX2M1 shladd INCY4M1 = INCY, 1, INCY2M1 mov pr.rot= 0 } { .mmi shladd X3 = INCX, 1, X1 shladd Y3 = INCY, 1, Y1 } ;; { .mmi shladd X4 = INCX, 1, X2 shladd Y4 = INCY, 1, Y2 adds I = -1, I } { .mmi cmp.eq p16, p0 = r0, r0 and r8 = 127, Y1 and PREX1 = -128, PREX1 } ;; { .mmi mov YY1 = Y1 mov YY2 = Y2 mov ar.ec = 3 } { .mmi mov YY3 = Y3 mov YY4 = Y4 or PREX1 = PREX1, r8 } ;; { .mmi shladd PREX2 = INCX, 2, PREX1 shladd PREY2 = INCY, 2, PREY1 mov ar.lc = I } { .mib cmp.eq p11 ,p0 = -1, I tbit.z p0, p13 = N, 2 (p11) br.cond.dpnt .L25 } ;; .align 32 .L22: #ifdef XDOUBLE { .mmf (p16) LDFD f80 = [Y1], 1 * SIZE (p16) LDFD f83 = [Y2], 1 * SIZE (p18) FMA1 f82 = ALPHA_I, f40, f82 } { .mmf (p16) LDFD f92 = [Y3], 1 * SIZE (p16) LDFD f95 = [Y4], 1 * SIZE (p18) FMA1 f85 = ALPHA_I, f43, f85 } ;; { .mmf (p16) LDFD f86 = [Y1], INCY4M1 (p16) LDFD f89 = [Y2], INCY4M1 (p18) FMA1 f94 = ALPHA_I, f52, f94 } { .mmf (p16) LDFD f98 = [Y3], INCY4M1 (p16) LDFD f101 = [Y4], INCY4M1 (p18) FMA1 f97 = ALPHA_I, f55, f97 } ;; { .mmf (p16) LDFD f32 = [X1], 1 * SIZE (p16) LDFD f35 = [X2], 1 * SIZE (p18) FMA f88 = ALPHA_I, f34, f88 } { .mmf (p16) LDFD f44 = [X3], 1 * SIZE (p16) LDFD f47 = [X4], 1 * SIZE (p18) FMA f91 = ALPHA_I, f37, f91 } ;; { .mmf (p16) LDFD f38 = [X1], INCX4M1 (p16) LDFD f41 = [X2], INCX4M1 (p18) FMA f100 = ALPHA_I, f46, f100 } { .mmf (p16) LDFD f50 = [X3], INCX4M1 (p16) LDFD f53 = [X4], INCX4M1 (p18) FMA f103 = ALPHA_I, f49, f103 } ;; { .mmf (p18) STFD [YY1] = f82, 1 * SIZE (p18) STFD [YY2] = f85, 1 * SIZE (p18) FMA f106 = ALPHA_R, f58, f106 } { .mmf (p19) add YY3 = YY3, INCY4M1 (p19) add YY4 = YY4, 
INCY4M1 (p18) FMA f109 = ALPHA_R, f61, f109 } ;; { .mmf (p18) STFD [YY3] = f94, 1 * SIZE (p18) STFD [YY4] = f97, 1 * SIZE (p18) FMA f118 = ALPHA_R, f70, f118 } { .mmf (p16) lfetch.excl.nt1 [PREY1], INCY8 (p16) lfetch.excl.nt1 [PREY2], INCY8 (p18) FMA f121 = ALPHA_R, f73, f121 } ;; { .mmf (p18) STFD [YY1] = f88 (p18) STFD [YY2] = f91 (p18) FMA2 f112 = ALPHA_R, f64, f112 } { .mmf (p18) add YY1 = YY1, INCY4M1 (p18) add YY2 = YY2, INCY4M1 (p18) FMA2 f115 = ALPHA_R, f67, f115 } ;; { .mmf (p18) STFD [YY3] = f100 (p18) STFD [YY4] = f103 (p18) FMA2 f124 = ALPHA_R, f76, f124 } { .mmf (p18) add YY3 = YY3, INCY4M1 (p18) add YY4 = YY4, INCY4M1 (p18) FMA2 f127 = ALPHA_R, f79, f127 } ;; { .mmf (p16) LDFD f104 = [Y1], 1 * SIZE (p16) LDFD f107 = [Y2], 1 * SIZE (p18) FMA1 f106 = ALPHA_I, f64, f106 } { .mmf (p16) LDFD f116 = [Y3], 1 * SIZE (p16) LDFD f119 = [Y4], 1 * SIZE (p18) FMA1 f109 = ALPHA_I, f67, f109 } ;; { .mmf (p16) LDFD f110 = [Y1], INCY4M1 (p16) LDFD f113 = [Y2], INCY4M1 (p18) FMA1 f118 = ALPHA_I, f76, f118 } { .mmf (p16) LDFD f122 = [Y3], INCY4M1 (p16) LDFD f125 = [Y4], INCY4M1 (p18) FMA1 f121 = ALPHA_I, f79, f121 } ;; { .mmf (p16) LDFD f56 = [X1], 1 * SIZE (p16) LDFD f59 = [X2], 1 * SIZE (p18) FMA f112 = ALPHA_I, f58, f112 } { .mmf (p16) LDFD f68 = [X3], 1 * SIZE (p16) LDFD f71 = [X4], 1 * SIZE (p18) FMA f115 = ALPHA_I, f61, f115 } ;; { .mmf (p16) LDFD f62 = [X1], INCX4M1 (p16) LDFD f65 = [X2], INCX4M1 (p18) FMA f124 = ALPHA_I, f70, f124 } { .mmf (p16) LDFD f74 = [X3], INCX4M1 (p16) LDFD f77 = [X4], INCX4M1 (p18) FMA f127 = ALPHA_I, f73, f127 } ;; { .mmf (p18) STFD [YY1] = f106, 1 * SIZE (p18) STFD [YY2] = f109, 1 * SIZE (p17) FMA f81 = ALPHA_R, f33, f81 } { .mmf nop __LINE__ nop __LINE__ (p17) FMA f84 = ALPHA_R, f36, f84 } ;; { .mmf (p18) STFD [YY3] = f118, 1 * SIZE (p18) STFD [YY4] = f121, 1 * SIZE (p17) FMA f93 = ALPHA_R, f45, f93 } { .mmf (p16) lfetch.nt1 [PREX1], INCX8 (p16) lfetch.nt1 [PREX2], INCX8 (p17) FMA f96 = ALPHA_R, f48, f96 } ;; { .mmf (p18) STFD [YY1] = f112 (p18) STFD [YY2] = f115 (p17) FMA2 f87 = ALPHA_R, f39, f87 } { .mmf (p18) add YY1 = YY1, INCY4M1 (p18) add YY2 = YY2, INCY4M1 (p17) FMA2 f90 = ALPHA_R, f42, f90 } ;; { .mmf (p18) STFD [YY3] = f124 (p18) STFD [YY4] = f127 (p17) FMA2 f99 = ALPHA_R, f51, f99 } { .mfb nop __LINE__ (p17) FMA2 f102 = ALPHA_R, f54, f102 br.ctop.sptk.few .L22 } ;; ;; (p19) add YY3 = YY3, INCY4M1 (p19) add YY4 = YY4, INCY4M1 ;; #else { .mmf (p19) STFD [YY3] = f125 (p19) STFD [YY4] = f32 (p18) FMA2 f100 = ALPHA_R, f52, f100 } { .mmf (p16) lfetch.excl.nt1 [PREY1], INCY8 nop __LINE__ (p18) FMA2 f103 = ALPHA_R, f55, f103 } ;; { .mmf (p16) LDFD f80 = [Y1], 1 * SIZE (p16) LDFD f83 = [Y2], 1 * SIZE (p18) FMA1 f82 = ALPHA_I, f40, f82 } { .mmf (p16) LDFD f92 = [Y3], 1 * SIZE (p16) LDFD f95 = [Y4], 1 * SIZE (p18) FMA1 f85 = ALPHA_I, f43, f85 } ;; { .mmf (p16) LDFD f86 = [Y1], INCY4M1 (p16) LDFD f89 = [Y2], INCY4M1 (p18) FMA1 f94 = ALPHA_I, f52, f94 } { .mmf (p19) add YY3 = YY3, INCY4M1 (p19) add YY4 = YY4, INCY4M1 (p18) FMA1 f97 = ALPHA_I, f55, f97 } ;; { .mmf (p16) LDFD f98 = [Y3], INCY4M1 (p16) LDFD f101 = [Y4], INCY4M1 (p18) FMA f88 = ALPHA_I, f34, f88 } { .mmf (p19) add YY1 = YY1, INCY4M1 (p19) add YY2 = YY2, INCY4M1 (p18) FMA f91 = ALPHA_I, f37, f91 } ;; { .mmf (p16) LDFD f32 = [X1], 1 * SIZE (p16) LDFD f35 = [X2], 1 * SIZE (p18) FMA f100 = ALPHA_I, f46, f100 } { .mmf (p16) LDFD f44 = [X3], 1 * SIZE (p16) LDFD f47 = [X4], 1 * SIZE (p18) FMA f103 = ALPHA_I, f49, f103 } ;; { .mmf (p18) STFD [YY1] = f82, 1 * SIZE (p18) STFD [YY2] = f85, 1 * SIZE (p18) FMA 
f106 = ALPHA_R, f58, f106 } { .mmf (p16) LDFD f38 = [X1], INCX4M1 (p16) LDFD f41 = [X2], INCX4M1 (p18) FMA f109 = ALPHA_R, f61, f109 } ;; { .mmf (p18) STFD [YY3] = f94, 1 * SIZE (p18) STFD [YY4] = f97, 1 * SIZE (p18) FMA f118 = ALPHA_R, f70, f118 } { .mmf (p16) LDFD f50 = [X3], INCX4M1 (p16) LDFD f53 = [X4], INCX4M1 (p18) FMA f121 = ALPHA_R, f73, f121 } ;; { .mmf (p18) STFD [YY1] = f88 (p18) STFD [YY2] = f91 (p18) FMA2 f112 = ALPHA_R, f64, f112 } { .mmf (p16) lfetch.nt1 [PREX1], INCX8 nop __LINE__ (p18) FMA2 f115 = ALPHA_R, f67, f115 } ;; { .mmf (p18) STFD [YY3] = f100 (p18) STFD [YY4] = f103 (p18) FMA2 f124 = ALPHA_R, f76, f124 } { .mmf (p16) LDFD f104 = [Y1], 1 * SIZE (p16) LDFD f107 = [Y2], 1 * SIZE (p18) FMA2 f127 = ALPHA_R, f79, f127 } ;; { .mmf (p16) LDFD f116 = [Y3], 1 * SIZE (p16) LDFD f119 = [Y4], 1 * SIZE (p18) FMA1 f106 = ALPHA_I, f64, f106 } { .mmf (p18) add YY1 = YY1, INCY4M1 (p18) add YY2 = YY2, INCY4M1 (p18) FMA1 f109 = ALPHA_I, f67, f109 } ;; { .mmf (p16) LDFD f110 = [Y1], INCY4M1 (p16) LDFD f113 = [Y2], INCY4M1 (p18) FMA1 f118 = ALPHA_I, f76, f118 } { .mmf (p18) add YY3 = YY3, INCY4M1 (p18) add YY4 = YY4, INCY4M1 (p18) FMA1 f121 = ALPHA_I, f79, f121 } ;; { .mmf (p16) LDFD f122 = [Y3], INCY4M1 (p16) LDFD f125 = [Y4], INCY4M1 (p18) FMA f112 = ALPHA_I, f58, f112 } { .mmf nop __LINE__ nop __LINE__ (p18) FMA f115 = ALPHA_I, f61, f115 } ;; { .mmf (p16) LDFD f56 = [X1], 1 * SIZE (p16) LDFD f59 = [X2], 1 * SIZE (p18) FMA f124 = ALPHA_I, f70, f124 } { .mmf (p16) LDFD f68 = [X3], 1 * SIZE (p16) LDFD f71 = [X4], 1 * SIZE (p18) FMA f127 = ALPHA_I, f73, f127 } ;; { .mmf (p18) STFD [YY1] = f106, 1 * SIZE (p18) STFD [YY2] = f109, 1 * SIZE (p17) FMA f81 = ALPHA_R, f33, f81 } { .mmf (p16) LDFD f62 = [X1], INCX4M1 (p16) LDFD f65 = [X2], INCX4M1 (p17) FMA f84 = ALPHA_R, f36, f84 } ;; { .mmf (p18) STFD [YY3] = f118, 1 * SIZE (p18) STFD [YY4] = f121, 1 * SIZE (p17) FMA f93 = ALPHA_R, f45, f93 } { .mmf (p16) LDFD f74 = [X3], INCX4M1 (p16) LDFD f77 = [X4], INCX4M1 (p17) FMA f96 = ALPHA_R, f48, f96 } ;; { .mmf (p18) STFD [YY1] = f112 (p18) STFD [YY2] = f115 (p17) FMA2 f87 = ALPHA_R, f39, f87 } { .mfb nop __LINE__ (p17) FMA2 f90 = ALPHA_R, f42, f90 br.ctop.sptk.few .L22 } ;; { .mmi (p19) STFD [YY3] = f125 (p19) STFD [YY4] = f32 (p19) add YY1 = YY1, INCY4M1 } { .mmi (p19) add YY2 = YY2, INCY4M1 (p19) add YY3 = YY3, INCY4M1 (p19) add YY4 = YY4, INCY4M1 } ;; #endif .align 32 .L25: { .mmi (p13) LDFD f32 = [X1], 1 * SIZE (p13) LDFD f34 = [X2], 1 * SIZE mov ar.lc = ARLC } { .mmi (p13) LDFD f36 = [X3], 1 * SIZE (p13) LDFD f38 = [X4], 1 * SIZE cmp.eq p12, p0 = r0, J } ;; { .mmi (p13) LDFD f80 = [Y1], 1 * SIZE (p13) LDFD f82 = [Y2], 1 * SIZE mov pr = PR, -65474 } { .mmb (p13) LDFD f84 = [Y3], 1 * SIZE (p13) LDFD f86 = [Y4], 1 * SIZE (p12) br.ret.sptk.many b0 } ;; { .mmi (p13) LDFD f33 = [X1], INCX4M1 (p13) LDFD f35 = [X2], INCX4M1 tbit.z p0, p14 = N, 1 } { .mmi (p13) LDFD f81 = [Y1], INCY4M1 (p13) LDFD f83 = [Y2], INCY4M1 nop __LINE__ } ;; { .mmi (p13) LDFD f37 = [X3], INCX4M1 (p13) LDFD f39 = [X4], INCX4M1 tbit.z p0, p15 = N, 0 } { .mmi (p13) LDFD f85 = [Y3], INCY4M1 (p13) LDFD f87 = [Y4], INCY4M1 nop __LINE__ } ;; { .mmf (p14) LDFD f40 = [X1], 1 * SIZE (p14) LDFD f42 = [X2], 1 * SIZE } ;; { .mmf (p14) LDFD f88 = [Y1], 1 * SIZE (p14) LDFD f90 = [Y2], 1 * SIZE } ;; { .mmf (p14) LDFD f41 = [X1], INCX2M1 (p14) LDFD f43 = [X2], INCX2M1 (p13) FMA f80 = ALPHA_R, f32, f80 } { .mmf nop __LINE__ nop __LINE__ (p13) FMA f82 = ALPHA_R, f34, f82 } ;; { .mmf (p14) LDFD f89 = [Y1], INCY2M1 (p14) LDFD f91 = [Y2], INCY2M1 
(p13) FMA f84 = ALPHA_R, f36, f84 } { .mmf nop __LINE__ nop __LINE__ (p13) FMA f86 = ALPHA_R, f38, f86 } ;; { .mmf (p15) LDFD f44 = [X1], 1 * SIZE (p15) LDFD f92 = [Y1], 1 * SIZE (p13) FMA2 f81 = ALPHA_R, f33, f81 } { .mmf nop __LINE__ nop __LINE__ (p13) FMA2 f83 = ALPHA_R, f35, f83 } ;; { .mmf (p15) LDFD f45 = [X1] (p15) LDFD f93 = [Y1] (p13) FMA2 f85 = ALPHA_R, f37, f85 } { .mmf nop __LINE__ nop __LINE__ (p13) FMA2 f87 = ALPHA_R, f39, f87 } ;; { .mmf nop __LINE__ nop __LINE__ (p13) FMA1 f80 = ALPHA_I, f33, f80 } { .mmf nop __LINE__ nop __LINE__ (p13) FMA1 f82 = ALPHA_I, f35, f82 } { .mmf nop __LINE__ nop __LINE__ (p13) FMA1 f84 = ALPHA_I, f37, f84 } { .mmf nop __LINE__ nop __LINE__ (p13) FMA1 f86 = ALPHA_I, f39, f86 } { .mmf nop __LINE__ nop __LINE__ (p13) FMA f81 = ALPHA_I, f32, f81 } { .mmf nop __LINE__ nop __LINE__ (p13) FMA f83 = ALPHA_I, f34, f83 } { .mmf nop __LINE__ nop __LINE__ (p13) FMA f85 = ALPHA_I, f36, f85 } { .mmf nop __LINE__ nop __LINE__ (p13) FMA f87 = ALPHA_I, f38, f87 } ;; { .mmf (p13) STFD [YY1] = f80, 1 * SIZE (p13) STFD [YY2] = f82, 1 * SIZE (p14) FMA f88 = ALPHA_R, f40, f88 } { .mmf nop __LINE__ nop __LINE__ (p14) FMA f90 = ALPHA_R, f42, f90 } ;; { .mmf (p13) STFD [YY3] = f84, 1 * SIZE (p13) STFD [YY4] = f86, 1 * SIZE (p14) FMA2 f89 = ALPHA_R, f41, f89 } { .mmf nop __LINE__ nop __LINE__ (p14) FMA2 f91 = ALPHA_R, f43, f91 } ;; { .mmf (p13) STFD [YY1] = f81 (p13) STFD [YY2] = f83 (p15) FMA f92 = ALPHA_R, f44, f92 } { .mmf (p13) add YY1 = YY1, INCY4M1 (p13) add YY2 = YY2, INCY4M1 (p15) FMA2 f93 = ALPHA_R, f45, f93 } ;; { .mmf (p13) STFD [YY3] = f85 (p13) STFD [YY4] = f87 (p14) FMA1 f88 = ALPHA_I, f41, f88 } { .mmf (p13) add YY3 = YY3, INCY4M1 (p13) add YY4 = YY4, INCY4M1 (p14) FMA1 f90 = ALPHA_I, f43, f90 } ;; { .mmf nop __LINE__ nop __LINE__ (p14) FMA f89 = ALPHA_I, f40, f89 } { .mmf nop __LINE__ nop __LINE__ (p14) FMA f91 = ALPHA_I, f42, f91 } { .mmf nop __LINE__ nop __LINE__ (p15) FMA1 f92 = ALPHA_I, f45, f92 } { .mmf nop __LINE__ nop __LINE__ (p15) FMA f93 = ALPHA_I, f44, f93 } ;; { .mmi (p14) STFD [YY1] = f88, 1 * SIZE (p14) STFD [YY2] = f90, 1 * SIZE nop __LINE__ } ;; { .mmi (p14) STFD [YY1] = f89 (p14) STFD [YY2] = f91 (p14) add YY1 = YY1, INCY2M1 } ;; { .mmi (p15) STFD [YY1] = f92, 1 * SIZE nop __LINE__ nop __LINE__ } ;; { .mmb (p15) STFD [YY1] = f93 nop __LINE__ br.ret.sptk.many b0 } ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/zcopy.S000066400000000000000000000565351313527062700164750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r32 #define X1 r33 #define INCX r34 #define Y1 r35 #define INCY r36 #define PREA r2 #define PREB r3 #define I r14 #define J r15 #define X2 r16 #define Y2 r17 #define INCXM1 r20 #define INCYM1 r21 #define INCX3M1 r22 #define INCY3M1 r23 #define INCX8 r24 #define INCY8 r25 #define XX r26 #define YY r27 #define XA r28 #define YA r29 #define PR r30 #define ARLC r31 #ifdef DOUBLE #define PREFETCH_SIZE (6 * 32) #else #define PREFETCH_SIZE (8 * 64) #endif PROLOGUE .prologue PROFCODE { .mmi shladd INCX = INCX, ZBASE_SHIFT, r0 shladd INCY = INCY, ZBASE_SHIFT, r0 .save ar.lc, ARLC mov ARLC = ar.lc } { .mib cmp.lt p0, p6 = r0, N sub XA = Y1, X1 (p6) br.ret.sptk.many b0 } ;; .body { .mmi shladd INCX3M1 = INCX, 1, INCX shladd INCY3M1 = INCY, 1, INCY mov PR = pr } { .mmi adds INCXM1 = - SIZE, INCX adds INCYM1 = - SIZE, INCY shr.u XA = XA, BASE_SHIFT } ;; { .mmi #ifdef DOUBLE adds XA = 4, XA #else adds XA = -2, XA #endif and J = 7, N mov pr.rot = 0 } { .mmi adds INCX3M1 = - SIZE, INCX3M1 adds INCY3M1 = - SIZE, INCY3M1 shr I = N, 3 } ;; { .mmi #ifdef DOUBLE and XA = 31, XA #else and XA = 63, XA #endif cmp.eq p9, p0 = r0, J tbit.z p0, p7 = X1, BASE_SHIFT } { .mmi shladd X2 = INCX, 1, X1 shladd Y2 = INCY, 1, Y1 tbit.z p0, p12 = N, 2 } ;; { .mmi cmp.eq p8 ,p0 = r0, I adds I = -1, I #ifdef DOUBLE cmp.le p11, p0 = 15, XA #else cmp.ge p11, p0 = 31, XA #endif } { .mmb shladd INCX8 = INCX, 3, r0 shladd INCY8 = INCY, 3, r0 (p8) br.cond.dpnt .L25 } ;; { .mmi nop.m 0 nop.m 0 mov ar.lc = I } { .mbb (p7) br.cond.dpnt .L100 (p11) br.cond.dpnt .L30 } ;; { .mmi cmp.eq p16, p0 = r0, r0 nop.m 0 mov ar.ec = 5 } { .mmi adds PREA = PREFETCH_SIZE * SIZE + 32, X1 #ifndef DOUBLE adds PREB = PREFETCH_SIZE * SIZE + 0, Y1 #else adds PREB = PREFETCH_SIZE * SIZE - 40, Y1 #endif nop.i 0 } ;; .align 32 .L21: { .mmi (p21) STFD [Y1] = f42 (p21) STFD [Y2] = f62 (p21) add Y1 = INCYM1, Y1 } { .mmi (p16) LDFPD f32, f37 = [X1] (p16) add X1 = X1, INCX (p21) add Y2 = INCYM1, Y2 } ;; { .mmi (p21) STFD [Y1] = f47, 1 * SIZE (p21) STFD [Y2] = f67, 1 * SIZE } { .mmi (p16) lfetch.nt1 [PREA], INCX8 (p16) LDFPD f42, f47 = [X1] (p16) add X1 = X1, INCX } ;; { .mmi (p21) STFD [Y1] = f52 (p21) STFD [Y2] = f72 (p21) add Y1 = INCY3M1, Y1 } { .mmi (p16) LDFPD f52, f57 = [X1] (p16) add X1 = X1, INCX (p21) add Y2 = INCY3M1, Y2 } ;; { .mmi (p21) STFD [Y1] = f77, 1 * SIZE (p21) STFD [Y2] = f97, 1 * SIZE } { .mmi (p16) lfetch.excl.nt1 [PREB], INCY8 (p16) LDFPD f62, f67 = [X1] (p16) add X1 = X1, INCX } ;; { .mmi (p21) STFD [Y1] = f82 (p21) STFD [Y2] = f102 (p21) add Y1 = INCYM1, Y1 } { .mmi (p16) LDFPD f72, f77 = [X1] 
(p16) add X1 = X1, INCX (p21) add Y2 = INCYM1, Y2 } ;; { .mmi (p21) STFD [Y1] = f87, 1 * SIZE (p21) STFD [Y2] = f107, 1 * SIZE } { .mmi (p16) LDFPD f82, f87 = [X1] (p16) add X1 = X1, INCX } ;; { .mmi (p21) STFD [Y1] = f92 (p21) STFD [Y2] = f112 (p21) add Y1 = INCY3M1, Y1 } { .mmi (p16) LDFPD f92, f97 = [X1] (p16) add X1 = X1, INCX (p21) add Y2 = INCY3M1, Y2 } ;; { .mmi (p20) STFD [Y1] = f36, 1 * SIZE (p20) STFD [Y2] = f56, 1 * SIZE (p16) shladd X2 = INCX, 3, X2 } { .mmb (p16) LDFPD f102, f107 = [X1] (p16) add X1 = X1, INCX br.ctop.sptk.few .L21 } ;; { .mmi (p21) STFD [Y1] = f42 (p21) STFD [Y2] = f62 (p21) add Y1 = INCYM1, Y1 } { .mmi (p21) add Y2 = INCYM1, Y2 } ;; { .mmi (p21) STFD [Y1] = f47, 1 * SIZE (p21) STFD [Y2] = f67, 1 * SIZE } ;; { .mmi (p21) STFD [Y1] = f52 (p21) STFD [Y2] = f72 (p21) add Y1 = INCY3M1, Y1 } { .mmi (p21) add Y2 = INCY3M1, Y2 } ;; { .mmi (p21) STFD [Y1] = f77, 1 * SIZE (p21) STFD [Y2] = f97, 1 * SIZE } ;; { .mmi (p21) STFD [Y1] = f82 (p21) STFD [Y2] = f102 (p21) add Y1 = INCYM1, Y1 } { .mmi (p21) add Y2 = INCYM1, Y2 } ;; { .mmi (p21) STFD [Y1] = f87, 1 * SIZE (p21) STFD [Y2] = f107, 1 * SIZE } ;; { .mmi (p21) STFD [Y1] = f92 (p21) STFD [Y2] = f112 (p21) add Y1 = INCY3M1, Y1 } { .mmi (p21) add Y2 = INCY3M1, Y2 } ;; .align 32 .L25: { .mmi mov XX = X1 nop.m 0 mov ar.lc = ARLC } { .mmi (p12) LDFD f48 = [X1], 1 * SIZE (p12) LDFD f52 = [X2], 1 * SIZE tbit.z p0, p13 = N, 1 } ;; { .mmi (p12) LDFD f49 = [X1], INCXM1 (p12) LDFD f53 = [X2], INCXM1 mov pr = PR, -65474 } { .mib nop.m 0 tbit.z p0, p14 = N, 0 (p9) br.ret.sptk.many b0 } ;; { .mmi (p12) LDFD f50 = [X1], 1 * SIZE (p12) LDFD f54 = [X2], 1 * SIZE (p12) shladd XX = INCX, 2, XX;; } ;; { .mmi (p12) LDFD f51 = [X1], INCX3M1 (p12) LDFD f55 = [X2], INCX3M1 (p13) shladd XX = INCX, 1, XX;; } ;; { .mmi (p13) LDFD f56 = [X1], 1 * SIZE (p14) LDFD f60 = [XX], 1 * SIZE } ;; { .mmi (p13) LDFD f57 = [X1], INCXM1 (p14) LDFD f61 = [XX] mov YY = Y1 } ;; { .mmi (p12) STFD [Y1] = f48, 1 * SIZE (p12) STFD [Y2] = f52, 1 * SIZE } { .mmi (p13) LDFD f58 = [X1], 1 * SIZE } ;; { .mmi (p12) STFD [Y1] = f49 (p12) STFD [Y2] = f53 (p12) add Y1 = INCYM1, Y1 } { .mmi (p13) LDFD f59 = [X1] (p12) add Y2 = INCYM1, Y2 } ;; { .mmi (p12) STFD [Y1] = f50, 1 * SIZE (p12) STFD [Y2] = f54, 1 * SIZE (p12) shladd YY = INCY, 2, YY;; } ;; { .mmi (p12) STFD [Y1] = f51 (p12) STFD [Y2] = f55 (p13) shladd YY = INCY, 1, YY } { .mmi (p12) add Y1 = INCY3M1, Y1 (p12) add Y2 = INCY3M1, Y2 nop.i 0 } ;; { .mmi (p13) STFD [Y1] = f56, 1 * SIZE (p14) STFD [YY] = f60, 1 * SIZE } ;; { .mmi (p13) STFD [Y1] = f57 (p14) STFD [YY] = f61 (p13) add Y1 = INCYM1, Y1 } ;; { .mmi (p13) STFD [Y1] = f58, 1 * SIZE nop.m 0 nop.i 0 } ;; { .mib (p13) STFD [Y1] = f59 nop.i 0 br.ret.sptk.many b0 } ;; .align 32 .L30: { .mmi cmp.eq p16, p0 = r0, r0 nop.m 0 mov ar.ec = 5 } { .mmi #ifndef DOUBLE adds PREA = PREFETCH_SIZE * SIZE + 24, X1 adds PREB = PREFETCH_SIZE * SIZE + 40, Y1 #else adds PREA = PREFETCH_SIZE * SIZE - 56, X1 adds PREB = PREFETCH_SIZE * SIZE - 24, Y1 #endif nop.i 0 } ;; .align 32 #ifndef DOUBLE .L31: { .mmi (p20) STFD [Y1] = f91 (p20) STFD [Y2] = f111 (p20) add Y1 = INCY3M1, Y1 } { .mmi (p16) LDFPD f32, f37 = [X1] (p16) add X1 = X1, INCX (p20) add Y2 = INCY3M1, Y2 } ;; { .mmi (p19) STFD [Y1] = f35, 1 * SIZE (p19) STFD [Y2] = f55, 1 * SIZE } { .mmi (p16) lfetch.nt1 [PREA], INCX8 (p16) LDFPD f42, f47 = [X1] (p16) add X1 = X1, INCX } ;; { .mmi (p19) STFD [Y1] = f40 (p19) STFD [Y2] = f60 (p19) add Y1 = INCYM1, Y1 } { .mmi (p16) LDFPD f52, f57 = [X1] (p16) add X1 = X1, INCX (p19) add Y2 = 
INCYM1, Y2 } ;; { .mmi (p19) STFD [Y1] = f45, 1 * SIZE (p19) STFD [Y2] = f65, 1 * SIZE } { .mmi (p16) lfetch.excl.nt1 [PREB], INCY8 (p16) LDFPD f62, f67 = [X1] (p16) add X1 = X1, INCX } ;; { .mmi (p19) STFD [Y1] = f50 (p19) STFD [Y2] = f70 (p19) add Y1 = INCY3M1, Y1 } { .mmi (p16) LDFPD f72, f77 = [X1] (p16) add X1 = X1, INCX (p19) add Y2 = INCY3M1, Y2 } ;; { .mmi (p19) STFD [Y1] = f75, 1 * SIZE (p19) STFD [Y2] = f95, 1 * SIZE } { .mmi (p16) LDFPD f82, f87 = [X1] (p16) add X1 = X1, INCX } ;; { .mmi (p19) STFD [Y1] = f80 (p19) STFD [Y2] = f100 (p19) add Y1 = INCYM1, Y1 } { .mmi (p16) LDFPD f92, f97 = [X1] (p16) add X1 = X1, INCX (p19) add Y2 = INCYM1, Y2 } ;; { .mmi (p19) STFD [Y1] = f85, 1 * SIZE (p19) STFD [Y2] = f105, 1 * SIZE (p16) shladd X2 = INCX, 3, X2 } { .mmb (p16) LDFPD f102, f107 = [X1] (p16) add X1 = X1, INCX br.ctop.sptk.few .L31 } ;; br .L25 .align 32 #else .L31: { .mmi (p20) STFD [Y1] = f41 (p20) STFD [Y2] = f61 (p20) add Y1 = INCYM1, Y1 } { .mmi (p16) LDFPD f32, f37 = [X1] (p16) add X1 = X1, INCX (p20) add Y2 = INCYM1, Y2 } ;; { .mmi (p20) STFD [Y1] = f46, 1 * SIZE (p20) STFD [Y2] = f66, 1 * SIZE } { .mmi (p16) lfetch.nt1 [PREA], INCX8 (p16) LDFPD f42, f47 = [X1] (p16) add X1 = X1, INCX } ;; { .mmi (p20) STFD [Y1] = f51 (p20) STFD [Y2] = f71 (p20) add Y1 = INCY3M1, Y1 } { .mmi (p16) LDFPD f52, f57 = [X1] (p16) add X1 = X1, INCX (p20) add Y2 = INCY3M1, Y2 } ;; { .mmi (p20) STFD [Y1] = f76, 1 * SIZE (p20) STFD [Y2] = f96, 1 * SIZE } { .mmi (p16) lfetch.excl.nt1 [PREB], INCY8 (p16) LDFPD f62, f67 = [X1] (p16) add X1 = X1, INCX } ;; { .mmi (p20) STFD [Y1] = f81 (p20) STFD [Y2] = f101 (p20) add Y1 = INCYM1, Y1 } { .mmi (p16) LDFPD f72, f77 = [X1] (p16) add X1 = X1, INCX (p20) add Y2 = INCYM1, Y2 } ;; { .mmi (p20) STFD [Y1] = f86, 1 * SIZE (p20) STFD [Y2] = f106, 1 * SIZE } { .mmi (p16) LDFPD f82, f87 = [X1] (p16) add X1 = X1, INCX } ;; { .mmi (p20) STFD [Y1] = f91 (p20) STFD [Y2] = f111 (p20) add Y1 = INCY3M1, Y1 } { .mmi (p16) LDFPD f92, f97 = [X1] (p16) add X1 = X1, INCX (p20) add Y2 = INCY3M1, Y2 } ;; { .mmi (p19) STFD [Y1] = f35, 1 * SIZE (p19) STFD [Y2] = f55, 1 * SIZE (p16) shladd X2 = INCX, 3, X2 } { .mmb (p16) LDFPD f102, f107 = [X1] (p16) add X1 = X1, INCX br.ctop.sptk.few .L31 } ;; br .L25 .align 32 #endif .L100: { .mmi mov ar.lc = I } { .mbb cmp.ne p6, p0 = 2 * SIZE, INCX (p6) br.cond.dpnt .L200 (p11) br.cond.dpnt .L130 } ;; { .mmi adds PREA = PREFETCH_SIZE * SIZE + 32, X1 #ifndef DOUBLE adds PREB = PREFETCH_SIZE * SIZE - 32, Y1 #else adds PREB = PREFETCH_SIZE * SIZE + 72, Y1 #endif mov ar.ec = 5 } { .mmi LDFD f32 = [X1], 1 * SIZE cmp.eq p16, p0 = r0, r0 nop.i 0 } ;; .align 32 .L121: { .mmi (p21) STFD [Y1] = f47, 1 * SIZE (p21) STFD [Y2] = f67, 1 * SIZE } { .mmi (p16) lfetch.nt1 [PREA], INCX8 (p16) LDFPD f37, f42 = [X1], 2 * SIZE } ;; { .mmi (p21) STFD [Y1] = f52 (p21) STFD [Y2] = f72 (p21) add Y1 = INCY3M1, Y1 } { .mmi (p16) lfetch.excl.nt1 [PREB], INCY8 (p16) LDFPD f47, f52 = [X1], 2 * SIZE (p21) add Y2 = INCY3M1, Y2 } ;; { .mmi (p21) STFD [Y1] = f77, 1 * SIZE (p21) STFD [Y2] = f97, 1 * SIZE } { .mmi (p16) LDFPD f57, f62 = [X1], 2 * SIZE } ;; { .mmi (p21) STFD [Y1] = f82 (p21) STFD [Y2] = f102 (p21) add Y1 = INCYM1, Y1 } { .mmi (p16) LDFPD f67, f72 = [X1], 2 * SIZE (p21) add Y2 = INCYM1, Y2 } ;; { .mmi (p21) STFD [Y1] = f87, 1 * SIZE (p21) STFD [Y2] = f107, 1 * SIZE } { .mmi (p16) LDFPD f77, f82 = [X1], 2 * SIZE } ;; { .mmi (p21) STFD [Y1] = f92 (p21) STFD [Y2] = f113 (p21) add Y1 = INCY3M1, Y1 } { .mmi (p16) LDFPD f87, f92 = [X1], 2 * SIZE (p21) add Y2 = INCY3M1, Y2 
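/*
 * Reference sketch (C, not part of the build): the software-pipelined
 * LDFPD/STFD loops in this file implement a plain strided complex copy,
 * y[i] = x[i]; the DOUBLE and single-precision paths differ only in load
 * pairing and prefetch distances.  Names below are illustrative only.
 *
 *   static void zcopy_ref(long n, const double *x, long incx,
 *                         double *y, long incy)
 *   {
 *       for (long i = 0; i < n; i++) {
 *           y[2 * i * incy]     = x[2 * i * incx];      // real part
 *           y[2 * i * incy + 1] = x[2 * i * incx + 1];  // imaginary part
 *       }
 *   }
 */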
} ;; { .mmi (p20) STFD [Y1] = f36, 1 * SIZE (p20) STFD [Y2] = f56, 1 * SIZE } { .mmi (p16) LDFPD f97, f102 = [X1], 2 * SIZE (p16) shladd X2 = INCX, 3, X2 } ;; { .mmi (p20) STFD [Y1] = f41 (p20) STFD [Y2] = f61 (p20) add Y1 = INCYM1, Y1 } { .mmb (p16) LDFPD f108, f127 = [X1], 2 * SIZE (p20) add Y2 = INCYM1, Y2 br.ctop.sptk.few .L121 } ;; { .mmi (p21) STFD [Y1] = f47, 1 * SIZE (p21) STFD [Y2] = f67, 1 * SIZE } ;; { .mmi (p21) STFD [Y1] = f52 (p21) STFD [Y2] = f72 (p21) add Y1 = INCY3M1, Y1 } (p21) add Y2 = INCY3M1, Y2 ;; { .mmi (p21) STFD [Y1] = f77, 1 * SIZE (p21) STFD [Y2] = f97, 1 * SIZE } ;; { .mmi (p21) STFD [Y1] = f82 (p21) STFD [Y2] = f102 (p21) add Y1 = INCYM1, Y1 } (p21) add Y2 = INCYM1, Y2 ;; { .mmi (p21) STFD [Y1] = f87, 1 * SIZE (p21) STFD [Y2] = f107, 1 * SIZE } ;; { .mmi (p21) STFD [Y1] = f92 (p21) STFD [Y2] = f113 (p21) add Y1 = INCY3M1, Y1 } (p21) add Y2 = INCY3M1, Y2 adds X1 = -SIZE, X1 ;; .align 32 .L125: { .mmi mov XX = X1 nop.m 0 mov ar.lc = ARLC } { .mmi (p12) LDFD f48 = [X1], 1 * SIZE (p12) LDFD f52 = [X2], 1 * SIZE tbit.z p0, p13 = N, 1 } ;; { .mmi (p12) LDFD f49 = [X1], INCXM1 (p12) LDFD f53 = [X2], INCXM1 mov pr = PR, -65474 } { .mib nop.m 0 tbit.z p0, p14 = N, 0 (p9) br.ret.sptk.many b0 } ;; { .mmi (p12) LDFD f50 = [X1], 1 * SIZE (p12) LDFD f54 = [X2], 1 * SIZE (p12) shladd XX = INCX, 2, XX;; } ;; { .mmi (p12) LDFD f51 = [X1], INCX3M1 (p12) LDFD f55 = [X2], INCX3M1 (p13) shladd XX = INCX, 1, XX;; } ;; { .mmi (p13) LDFD f56 = [X1], 1 * SIZE (p14) LDFD f60 = [XX], 1 * SIZE } ;; { .mmi (p13) LDFD f57 = [X1], INCXM1 (p14) LDFD f61 = [XX] mov YY = Y1 } ;; { .mmi (p12) STFD [Y1] = f48, 1 * SIZE (p12) STFD [Y2] = f52, 1 * SIZE } { .mmi (p13) LDFD f58 = [X1], 1 * SIZE } ;; { .mmi (p12) STFD [Y1] = f49 (p12) STFD [Y2] = f53 (p12) add Y1 = INCYM1, Y1 } { .mmi (p13) LDFD f59 = [X1] (p12) add Y2 = INCYM1, Y2 } ;; { .mmi (p12) STFD [Y1] = f50, 1 * SIZE (p12) STFD [Y2] = f54, 1 * SIZE (p12) shladd YY = INCY, 2, YY;; } ;; { .mmi (p12) STFD [Y1] = f51 (p12) STFD [Y2] = f55 (p13) shladd YY = INCY, 1, YY } { .mmi (p12) add Y1 = INCY3M1, Y1 (p12) add Y2 = INCY3M1, Y2 nop.i 0 } ;; { .mmi (p13) STFD [Y1] = f56, 1 * SIZE (p14) STFD [YY] = f60, 1 * SIZE } ;; { .mmi (p13) STFD [Y1] = f57 (p14) STFD [YY] = f61 (p13) add Y1 = INCYM1, Y1 } ;; { .mmi (p13) STFD [Y1] = f58, 1 * SIZE nop.m 0 nop.i 0 } ;; { .mib (p13) STFD [Y1] = f59 nop.i 0 br.ret.sptk.many b0 } ;; .align 32 .L130: { .mmi adds PREA = PREFETCH_SIZE * SIZE + 32, X1 #ifndef DOUBLE adds PREB = PREFETCH_SIZE * SIZE + 72, Y1 #else adds PREB = PREFETCH_SIZE * SIZE + 56, Y1 #endif mov ar.ec = 5 } { .mmi LDFD f32 = [X1], 1 * SIZE cmp.eq p16, p0 = r0, r0 nop.i 0 } ;; #ifndef DOUBLE .L131: { .mmi (p19) STFD [Y1] = f35, 1 * SIZE (p19) STFD [Y2] = f55, 1 * SIZE nop.i 0 } { .mmi (p16) lfetch.nt1 [PREA], INCX8 (p16) LDFPD f37, f42 = [X1], 2 * SIZE nop.i 0 } ;; { .mmi (p19) STFD [Y1] = f40 (p19) STFD [Y2] = f60 (p19) add Y1 = INCYM1, Y1 } { .mmi (p16) lfetch.excl.nt1 [PREB], INCY8 (p16) LDFPD f47, f52 = [X1], 2 * SIZE (p19) add Y2 = INCYM1, Y2 } ;; { .mmi (p19) STFD [Y1] = f45, 1 * SIZE (p19) STFD [Y2] = f65, 1 * SIZE nop.i 0 } { .mmi (p16) LDFPD f57, f62 = [X1], 2 * SIZE nop.m 0 nop.i 0 } ;; { .mmi (p19) STFD [Y1] = f50 (p19) STFD [Y2] = f70 (p19) add Y1 = INCY3M1, Y1 } { .mmi (p16) LDFPD f67, f72 = [X1], 2 * SIZE (p19) add Y2 = INCY3M1, Y2 nop.i 0 } ;; { .mmi (p19) STFD [Y1] = f75, 1 * SIZE (p19) STFD [Y2] = f95, 1 * SIZE nop.i 0 } { .mmi (p16) LDFPD f77, f82 = [X1], 2 * SIZE nop.m 0 nop.i 0 } ;; { .mmi (p19) STFD [Y1] = f80 (p19) STFD [Y2] = 
f100 (p19) add Y1 = INCYM1, Y1 } { .mmi (p16) LDFPD f87, f92 = [X1], 2 * SIZE (p19) add Y2 = INCYM1, Y2 nop.i 0 } ;; { .mmi (p19) STFD [Y1] = f85, 1 * SIZE (p19) STFD [Y2] = f105, 1 * SIZE nop.i 0 } { .mmi (p16) LDFPD f97, f102 = [X1], 2 * SIZE (p16) shladd X2 = INCX, 3, X2 nop.i 0 } ;; { .mmi (p19) STFD [Y1] = f90 (p19) STFD [Y2] = f111 (p19) add Y1 = INCY3M1, Y1 } { .mmb (p16) LDFPD f108, f127 = [X1], 2 * SIZE (p19) add Y2 = INCY3M1, Y2 br.ctop.sptk.few .L131 } ;; { .mmi adds X1 = -SIZE, X1 nop.m 0 nop.i 0 } ;; .align 32 #else .L131: { .mmi (p20) STFD [Y1] = f46, 1 * SIZE (p20) STFD [Y2] = f66, 1 * SIZE } { .mmi (p16) lfetch.nt1 [PREA], INCX8 (p16) LDFPD f37, f42 = [X1], 2 * SIZE } ;; { .mmi (p20) STFD [Y1] = f51 (p20) STFD [Y2] = f71 (p20) add Y1 = INCY3M1, Y1 } { .mmi (p16) lfetch.excl.nt1 [PREB], INCY8 (p16) LDFPD f47, f52 = [X1], 2 * SIZE (p20) add Y2 = INCY3M1, Y2 } ;; { .mmi (p20) STFD [Y1] = f76, 1 * SIZE (p20) STFD [Y2] = f96, 1 * SIZE } { .mmi (p16) LDFPD f57, f62 = [X1], 2 * SIZE } ;; { .mmi (p20) STFD [Y1] = f81 (p20) STFD [Y2] = f101 (p20) add Y1 = INCYM1, Y1 } { .mmi (p16) LDFPD f67, f72 = [X1], 2 * SIZE (p20) add Y2 = INCYM1, Y2 } ;; { .mmi (p20) STFD [Y1] = f86, 1 * SIZE (p20) STFD [Y2] = f106, 1 * SIZE } { .mmi (p16) LDFPD f77, f82 = [X1], 2 * SIZE } ;; { .mmi (p20) STFD [Y1] = f91 (p20) STFD [Y2] = f112 (p20) add Y1 = INCY3M1, Y1 } { .mmi (p16) LDFPD f87, f92 = [X1], 2 * SIZE (p20) add Y2 = INCY3M1, Y2 } ;; { .mmi (p19) STFD [Y1] = f35, 1 * SIZE (p19) STFD [Y2] = f55, 1 * SIZE } { .mmi (p16) LDFPD f97, f102 = [X1], 2 * SIZE (p16) shladd X2 = INCX, 3, X2 } ;; { .mmi (p19) STFD [Y1] = f40 (p19) STFD [Y2] = f60 (p19) add Y1 = INCYM1, Y1 } { .mmb (p16) LDFPD f108, f127 = [X1], 2 * SIZE (p19) add Y2 = INCYM1, Y2 br.ctop.sptk.few .L131 } ;; { .mmi adds X1 = -SIZE, X1 nop.m 0 nop.i 0 } ;; .align 32 #endif .L135: { .mmi mov XX = X1 nop.m 0 mov ar.lc = ARLC } { .mmi (p12) LDFD f48 = [X1], 1 * SIZE (p12) LDFD f52 = [X2], 1 * SIZE tbit.z p0, p13 = N, 1 } ;; { .mmi (p12) LDFD f49 = [X1], INCXM1 (p12) LDFD f53 = [X2], INCXM1 mov pr = PR, -65474 } { .mib nop.m 0 tbit.z p0, p14 = N, 0 (p9) br.ret.sptk.many b0 } ;; { .mmi (p12) LDFD f50 = [X1], 1 * SIZE (p12) LDFD f54 = [X2], 1 * SIZE (p12) shladd XX = INCX, 2, XX;; } ;; { .mmi (p12) LDFD f51 = [X1], INCX3M1 (p12) LDFD f55 = [X2], INCX3M1 (p13) shladd XX = INCX, 1, XX;; } ;; { .mmi (p13) LDFD f56 = [X1], 1 * SIZE (p14) LDFD f60 = [XX], 1 * SIZE } ;; { .mmi (p13) LDFD f57 = [X1], INCXM1 (p14) LDFD f61 = [XX] mov YY = Y1 } ;; { .mmi (p12) STFD [Y1] = f48, 1 * SIZE (p12) STFD [Y2] = f52, 1 * SIZE } { .mmi (p13) LDFD f58 = [X1], 1 * SIZE } ;; { .mmi (p12) STFD [Y1] = f49 (p12) STFD [Y2] = f53 (p12) add Y1 = INCYM1, Y1 } { .mmi (p13) LDFD f59 = [X1] (p12) add Y2 = INCYM1, Y2 } ;; { .mmi (p12) STFD [Y1] = f50, 1 * SIZE (p12) STFD [Y2] = f54, 1 * SIZE (p12) shladd YY = INCY, 2, YY;; } ;; { .mmi (p12) STFD [Y1] = f51 (p12) STFD [Y2] = f55 (p13) shladd YY = INCY, 1, YY } { .mmi (p12) add Y1 = INCY3M1, Y1 (p12) add Y2 = INCY3M1, Y2 nop.i 0 } ;; { .mmi (p13) STFD [Y1] = f56, 1 * SIZE (p14) STFD [YY] = f60, 1 * SIZE } ;; { .mmi (p13) STFD [Y1] = f57 (p14) STFD [YY] = f61 (p13) add Y1 = INCYM1, Y1 } ;; { .mmi (p13) STFD [Y1] = f58, 1 * SIZE nop.m 0 nop.i 0 } ;; { .mib (p13) STFD [Y1] = f59 nop.i 0 br.ret.sptk.many b0 } ;; /* Unaligned Copy INCX =! 
1 */ .L200: ;; { .mmi adds PREA = PREFETCH_SIZE * SIZE + 32, X1 adds PREB = PREFETCH_SIZE * SIZE + 32, Y1 mov ar.ec = 5 } { .mmi cmp.eq p16, p0 = r0, r0 nop.m 0 nop.i 0 } ;; .align 32 .L221: { .mmi (p20) STFD [Y1] = f91 (p20) STFD [Y2] = f111 (p20) add Y1 = INCY3M1, Y1 } { .mmi (p16) LDFD f32 = [X1], 1 * SIZE (p16) LDFD f52 = [X2], 1 * SIZE (p20) add Y2 = INCY3M1, Y2 } ;; { .mmi (p19) STFD [Y1] = f35, 1 * SIZE (p19) STFD [Y2] = f55, 1 * SIZE } { .mmi (p16) LDFD f37 = [X1], INCXM1 (p16) LDFD f57 = [X2], INCXM1 } ;; { .mmi (p19) STFD [Y1] = f40 (p19) STFD [Y2] = f60 (p19) add Y1 = INCYM1, Y1 } { .mmi (p16) LDFD f42 = [X1], 1 * SIZE (p16) LDFD f62 = [X2], 1 * SIZE (p19) add Y2 = INCYM1, Y2 } ;; { .mmi (p19) STFD [Y1] = f45, 1 * SIZE (p19) STFD [Y2] = f65, 1 * SIZE } { .mmi (p16) LDFD f47 = [X1], INCX3M1 (p16) LDFD f67 = [X2], INCX3M1 } ;; { .mmi (p19) STFD [Y1] = f50 (p19) STFD [Y2] = f70 (p19) add Y1 = INCY3M1, Y1 } { .mmi (p16) LDFD f72 = [X1], 1 * SIZE (p16) LDFD f92 = [X2], 1 * SIZE (p19) add Y2 = INCY3M1, Y2 } ;; { .mmi (p19) STFD [Y1] = f75, 1 * SIZE (p19) STFD [Y2] = f95, 1 * SIZE } { .mmi (p16) LDFD f77 = [X1], INCXM1 (p16) LDFD f97 = [X2], INCXM1 } ;; { .mmi (p19) STFD [Y1] = f80 (p19) STFD [Y2] = f100 (p19) add Y1 = INCYM1, Y1 } { .mmi (p16) LDFD f82 = [X1], 1 * SIZE (p16) LDFD f102 = [X2], 1 * SIZE (p19) add Y2 = INCYM1, Y2 } ;; { .mmi (p19) STFD [Y1] = f85, 1 * SIZE (p19) STFD [Y2] = f105, 1 * SIZE } { .mmb (p16) LDFD f87 = [X1], INCX3M1 (p16) LDFD f107 = [X2], INCX3M1 br.ctop.sptk.few .L221 } ;; .align 32 .L225: { .mmi mov XX = X1 nop.m 0 mov ar.lc = ARLC } { .mmi (p12) LDFD f48 = [X1], 1 * SIZE (p12) LDFD f52 = [X2], 1 * SIZE tbit.z p0, p13 = N, 1 } ;; { .mmi (p12) LDFD f49 = [X1], INCXM1 (p12) LDFD f53 = [X2], INCXM1 mov pr = PR, -65474 } { .mib nop.m 0 tbit.z p0, p14 = N, 0 (p9) br.ret.sptk.many b0 } ;; { .mmi (p12) LDFD f50 = [X1], 1 * SIZE (p12) LDFD f54 = [X2], 1 * SIZE (p12) shladd XX = INCX, 2, XX;; } ;; { .mmi (p12) LDFD f51 = [X1], INCX3M1 (p12) LDFD f55 = [X2], INCX3M1 (p13) shladd XX = INCX, 1, XX;; } ;; { .mmi (p13) LDFD f56 = [X1], 1 * SIZE (p14) LDFD f60 = [XX], 1 * SIZE } ;; { .mmi (p13) LDFD f57 = [X1], INCXM1 (p14) LDFD f61 = [XX] mov YY = Y1 } ;; { .mmi (p12) STFD [Y1] = f48, 1 * SIZE (p12) STFD [Y2] = f52, 1 * SIZE } { .mmi (p13) LDFD f58 = [X1], 1 * SIZE } ;; { .mmi (p12) STFD [Y1] = f49 (p12) STFD [Y2] = f53 (p12) add Y1 = INCYM1, Y1 } { .mmi (p13) LDFD f59 = [X1] (p12) add Y2 = INCYM1, Y2 } ;; { .mmi (p12) STFD [Y1] = f50, 1 * SIZE (p12) STFD [Y2] = f54, 1 * SIZE (p12) shladd YY = INCY, 2, YY;; } ;; { .mmi (p12) STFD [Y1] = f51 (p12) STFD [Y2] = f55 (p13) shladd YY = INCY, 1, YY } { .mmi (p12) add Y1 = INCY3M1, Y1 (p12) add Y2 = INCY3M1, Y2 nop.i 0 } ;; { .mmi (p13) STFD [Y1] = f56, 1 * SIZE (p14) STFD [YY] = f60, 1 * SIZE } ;; { .mmi (p13) STFD [Y1] = f57 (p14) STFD [YY] = f61 (p13) add Y1 = INCYM1, Y1 } ;; { .mmi (p13) STFD [Y1] = f58, 1 * SIZE nop.m 0 nop.i 0 } ;; { .mib (p13) STFD [Y1] = f59 nop.i 0 br.ret.sptk.many b0 } EPILOGUE OpenBLAS-0.2.20/kernel/ia64/zdot.S000066400000000000000000000240731313527062700163010ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef DOUBLE #define PREFETCH_SIZE (13 * 16) #else #define PREFETCH_SIZE ( 9 * 32) #endif #if defined(F_INTERFACE) && defined(RETURN_BY_STACK) #define N r33 #define X1 r34 #define INCX r35 #define Y1 r36 #define INCY r37 #else #define N r32 #define X1 r33 #define INCX r34 #define Y1 r35 #define INCY r36 #endif #define PRE1 r2 #define PRE2 r3 #define I r14 #define J r15 #define Y2 r16 #define X2 r17 #define INCXM1 r18 #define INCYM1 r19 #define INCX16 r20 #define INCY16 r21 #define INCX3M1 r22 #define INCY3M1 r23 #define XX r24 #define YY r25 #define PR r30 #define ARLC r31 #define ALPHA f8 PROLOGUE .prologue PROFCODE { .mfi mov f8 = f0 .save ar.lc, ARLC mov ARLC = ar.lc } { .mfi mov f9 = f0 } ;; .body #ifdef F_INTERFACE LDINT N = [N] LDINT INCX = [INCX] LDINT INCY = [INCY] ;; #ifndef USE64BITINT sxt4 N = N sxt4 INCX = INCX sxt4 INCY = INCY ;; #endif #endif { .mmi shladd INCX = INCX, ZBASE_SHIFT, r0 shladd INCY = INCY, ZBASE_SHIFT, r0 mov PR = pr } { .mib cmp.lt p0, p7 = r0, N mov r26 = 1 (p7) br.cond.spnt .L1000 } ;; #ifdef F_INTERFACE cmp.le p0, p6 = r0, INCX cmp.le p0, p7 = r0, INCY sub r26 = r26, N ;; setf.sig f32 = r26 setf.sig f33 = INCX setf.sig f34 = INCY ;; xmpy.l f33 = f32, f33 xmpy.l f34 = f32, f34 ;; getf.sig r26 = f33 getf.sig r27 = f34 ;; (p6) add X1 = X1, r26 (p7) add Y1 = Y1, r27 ;; #endif { .mfi #ifdef DOUBLE adds PRE1 = (PREFETCH_SIZE + 4) * SIZE, X1 #else adds PRE1 = (PREFETCH_SIZE + 8) * SIZE, X1 #endif mov f10 = f0 mov pr.rot= 0 } { .mfi and J = 7, N mov f11 = f0 shr I = N, 3 } ;; { .mfi #ifdef DOUBLE adds PRE2 = (PREFETCH_SIZE + 6) * SIZE, Y1 #else adds PRE2 = (PREFETCH_SIZE + 12) * SIZE, Y1 #endif mov f12 = f0 mov ar.ec = 3 } { .mmf shladd INCX16 = INCX, 3, r0 shladd INCY16 = INCY, 3, r0 mov f13 = f0 } ;; { .mmf shladd INCX3M1 = INCX, 1, INCX shladd INCY3M1 = INCY, 1, INCY mov f14 = f0 } { .mmf adds INCXM1 = -SIZE, INCX adds INCYM1 = -SIZE, INCY mov f15 = f0 } ;; { .mmi adds INCX3M1 = -SIZE, INCX3M1 
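/*
 * Reference sketch (C, not part of the build): the .L52 loop below keeps
 * four groups of partial sums (f8..f15) and .L999 combines them into the
 * complex result; the CONJ build computes conjg(x) * y instead of x * y.
 * The sketch assumes the usual ZDOTU/ZDOTC semantics; names are
 * illustrative only.
 *
 *   static void zdot_ref(long n, const double *x, long incx,
 *                        const double *y, long incy,
 *                        double *res_r, double *res_i, int conj)
 *   {
 *       double rr = 0.0, ri = 0.0, ir = 0.0, ii = 0.0;
 *       for (long i = 0; i < n; i++) {
 *           double xr = x[2 * i * incx], xi = x[2 * i * incx + 1];
 *           double yr = y[2 * i * incy], yi = y[2 * i * incy + 1];
 *           rr += xr * yr;   // f8  (and f12)
 *           ri += xr * yi;   // f9  (and f13)
 *           ir += xi * yr;   // f10 (and f14)
 *           ii += xi * yi;   // f11 (and f15)
 *       }
 *       if (!conj) { *res_r = rr - ii; *res_i = ri + ir; }  // ZDOTU case
 *       else       { *res_r = rr + ii; *res_i = ri - ir; }  // ZDOTC case
 *   }
 */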
adds INCY3M1 = -SIZE, INCY3M1 tbit.z p0, p12 = N, 2 } { .mmi cmp.eq p8 ,p0 = r0, I cmp.eq p16, p0 = r0, r0 adds I = -1, I } ;; { .mmi shladd X2 = INCX, 1, X1 shladd Y2 = INCY, 1, Y1 mov ar.lc = I } { .mmb mov XX = X1 mov YY = Y1 (p8) br.cond.dpnt .L55 } ;; .align 32 .L52: { .mmf (p16) lfetch.nt1 [PRE1], INCX16 (p16) LDFD f32 = [X1], SIZE (p18) FMA f8 = f34, f82, f8 } { .mmf (p16) LDFD f44 = [X2], SIZE nop.m 0 (p18) FMA f9 = f34, f85, f9 } ;; { .mmf (p16) LDFD f80 = [Y1], SIZE (p16) LDFD f92 = [Y2], SIZE (p18) FMA f10 = f37, f82, f10 } { .mmf nop.m 0 nop.m 0 (p18) FMA f11 = f37, f85, f11 } ;; { .mmf (p16) lfetch.nt1 [PRE2], INCY16 (p16) LDFD f35 = [X1], INCXM1 (p18) FMA f12 = f40, f88, f12 } { .mmf (p16) LDFD f47 = [X2], INCXM1 nop.m 0 (p18) FMA f13 = f40, f91, f13 } ;; { .mmf (p16) LDFD f83 = [Y1], INCYM1 (p16) LDFD f95 = [Y2], INCYM1 (p18) FMA f14 = f43, f88, f14 } { .mmf nop.m 0 nop.m 0 (p18) FMA f15 = f43, f91, f15 } ;; { .mmf (p16) LDFD f38 = [X1], SIZE (p16) LDFD f50 = [X2], SIZE (p18) FMA f8 = f46, f94, f8 } { .mmf nop.m 0 nop.m 0 (p18) FMA f9 = f46, f97, f9 } ;; { .mmf (p16) LDFD f86 = [Y1], SIZE (p16) LDFD f98 = [Y2], SIZE (p18) FMA f10 = f49, f94, f10 } { .mmf nop.m 0 nop.m 0 (p18) FMA f11 = f49, f97, f11 } ;; { .mmf (p16) LDFD f41 = [X1], INCX3M1 (p16) LDFD f53 = [X2], INCX3M1 (p18) FMA f12 = f52, f100, f12 } { .mmf nop.m 0 nop.m 0 (p18) FMA f13 = f52, f103, f13 } ;; { .mmf (p16) LDFD f89 = [Y1], INCY3M1 (p16) LDFD f101 = [Y2], INCY3M1 (p18) FMA f14 = f55, f100, f14 } { .mmf nop.m 0 nop.m 0 (p18) FMA f15 = f55, f103, f15 } ;; { .mmf (p16) LDFD f56 = [X1], SIZE (p16) LDFD f68 = [X2], SIZE (p18) FMA f8 = f58, f106, f8 } { .mmf nop.m 0 nop.m 0 (p18) FMA f9 = f58, f109, f9 } ;; { .mmf (p16) LDFD f104 = [Y1], SIZE (p16) LDFD f116 = [Y2], SIZE (p18) FMA f10 = f61, f106, f10 } { .mmf nop.m 0 nop.m 0 (p18) FMA f11 = f61, f109, f11 } ;; { .mmf (p16) LDFD f59 = [X1], INCXM1 (p16) LDFD f71 = [X2], INCXM1 (p18) FMA f12 = f64, f112, f12 } { .mmf nop.m 0 nop.m 0 (p18) FMA f13 = f64, f115, f13 } ;; { .mmf (p16) LDFD f107 = [Y1], INCYM1 (p16) LDFD f119 = [Y2], INCYM1 (p18) FMA f14 = f67, f112, f14 } { .mmf nop.m 0 nop.m 0 (p18) FMA f15 = f67, f115, f15 } ;; { .mmf (p16) LDFD f62 = [X1], SIZE (p16) LDFD f74 = [X2], SIZE (p18) FMA f8 = f70, f118, f8 } { .mmf nop.m 0 nop.m 0 (p18) FMA f9 = f70, f121, f9 } ;; { .mmf (p16) LDFD f110 = [Y1], SIZE (p16) LDFD f122 = [Y2], SIZE (p18) FMA f10 = f73, f118, f10 } { .mmf nop.m 0 nop.m 0 (p18) FMA f11 = f73, f121, f11 } ;; { .mmf (p16) LDFD f65 = [X1], INCX3M1 (p16) LDFD f77 = [X2], INCX3M1 (p18) FMA f12 = f76, f124, f12 } { .mmf (p16) add XX = INCX16, XX (p16) add YY = INCY16, YY (p18) FMA f13 = f76, f127, f13 } ;; { .mmf (p16) LDFD f113 = [Y1], INCY3M1 (p16) LDFD f125 = [Y2], INCY3M1 (p18) FMA f14 = f79, f124, f14 } { .mfb nop.m 0 (p18) FMA f15 = f79, f127, f15 br.ctop.sptk.few .L52 } ;; .align 32 .L55: (p12) LDFD f32 = [X1], SIZE (p12) LDFD f40 = [X2], SIZE tbit.z p0, p13 = N, 1 (p12) LDFD f34 = [Y1], SIZE (p12) LDFD f42 = [Y2], SIZE tbit.z p0, p14 = N, 0 ;; (p12) LDFD f33 = [X1], INCXM1 (p12) LDFD f41 = [X2], INCXM1 cmp.eq p9, p0 = r0, J (p12) LDFD f35 = [Y1], INCYM1 (p12) LDFD f43 = [Y2], INCYM1 (p9) br.cond.dptk .L999 ;; (p12) LDFD f36 = [X1], SIZE (p12) LDFD f44 = [X2], SIZE (p12) shladd XX = INCX, 2, XX (p12) LDFD f38 = [Y1], SIZE (p12) LDFD f46 = [Y2], SIZE (p12) shladd YY = INCY, 2, YY ;; (p12) LDFD f37 = [X1], INCX3M1 (p12) LDFD f45 = [X2], INCX3M1 (p13) shladd XX = INCX, 1, XX (p12) LDFD f39 = [Y1], INCY3M1 (p12) LDFD f47 = [Y2], INCY3M1 (p13) 
shladd YY = INCY, 1, YY ;; (p13) LDFD f48 = [X1], SIZE (p13) LDFD f50 = [Y1], SIZE (p14) LDFD f56 = [XX], SIZE (p14) LDFD f58 = [YY], SIZE ;; (p13) LDFD f49 = [X1], INCXM1 (p13) LDFD f51 = [Y1], INCYM1 (p14) LDFD f57 = [XX] (p14) LDFD f59 = [YY] ;; (p13) LDFD f52 = [X1], SIZE (p13) LDFD f54 = [Y1], SIZE ;; (p13) LDFD f53 = [X1] (p13) LDFD f55 = [Y1] ;; (p12) FMA f8 = f32, f34, f8 (p12) FMA f9 = f32, f35, f9 (p12) FMA f10 = f33, f34, f10 (p12) FMA f11 = f33, f35, f11 (p12) FMA f12 = f36, f38, f12 (p12) FMA f13 = f36, f39, f13 (p12) FMA f14 = f37, f38, f14 (p12) FMA f15 = f37, f39, f15 ;; (p12) FMA f8 = f40, f42, f8 (p12) FMA f9 = f40, f43, f9 (p12) FMA f10 = f41, f42, f10 (p12) FMA f11 = f41, f43, f11 (p12) FMA f12 = f44, f46, f12 (p12) FMA f13 = f44, f47, f13 (p12) FMA f14 = f45, f46, f14 (p12) FMA f15 = f45, f47, f15 ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f48, f51, f9 (p13) FMA f10 = f49, f50, f10 (p13) FMA f11 = f49, f51, f11 (p13) FMA f12 = f52, f54, f12 (p13) FMA f13 = f52, f55, f13 (p13) FMA f14 = f53, f54, f14 (p13) FMA f15 = f53, f55, f15 ;; (p14) FMA f8 = f56, f58, f8 (p14) FMA f9 = f56, f59, f9 (p14) FMA f10 = f57, f58, f10 (p14) FMA f11 = f57, f59, f11 .align 32 ;; .L999: FADD f8 = f8, f12 FADD f9 = f9, f13 FADD f10 = f10, f14 FADD f11 = f11, f15 mov ar.lc = ARLC ;; #ifndef CONJ FSUB f8 = f8, f11 FADD f9 = f9, f10 #else FADD f8 = f8, f11 FSUB f9 = f9, f10 #endif ;; .align 32 .L1000: #if defined(F_INTERFACE) && defined(RETURN_BY_STACK) STFD [r32] = f8, SIZE ;; STFD [r32] = f9, SIZE #endif mov pr = PR, -65474 br.ret.sptk.many b0 EPILOGUE OpenBLAS-0.2.20/kernel/ia64/zgemm3m_kernel.S000066400000000000000000003263771313527062700202540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef DOUBLE #define PREFETCHSIZE (16 * 8) #else #define PREFETCHSIZE (32 * 8) #endif #define CPREFETCHSIZE 15 #define CPREFETCH lfetch.excl.nt1 #define M r32 #define N r33 #define K r34 #define A r37 #define B r38 #define C r39 #define LDC r35 #define I r15 #define J r16 #define AOFFSET r17 #define BOFFSET r18 #define L r20 #define C1 r21 #define C2 r22 #define C3 r23 #define C4 r24 #define C5 r25 #define C6 r26 #define C7 r27 #define C8 r28 #define C9 loc0 #define C10 loc1 #define C11 loc2 #define C12 loc3 #define C13 loc4 #define C14 loc5 #define C15 loc6 #define C16 loc7 #define PREA r8 #define PREB r9 #define PREC r10 #define SP r12 #define ARLC r29 #define PR r30 #define ARPFS r31 #define ALPHA_R f8 #define ALPHA_I f9 PROLOGUE .prologue PROFCODE { .mmi .save ar.pfs, ARPFS alloc ARPFS = ar.pfs, 8, 16, 0, 0 adds r14 = 16, SP mov ARLC = ar.lc } { .mmi adds r8 = -16 * 16, SP adds r9 = -15 * 16, SP adds SP = -16 * 16, SP } ;; { .mmi stf.spill [r8] = f16, 32 stf.spill [r9] = f17, 32 mov PR = pr } { .mmi ld8 LDC = [r14], 8 nop __LINE__ nop __LINE__ } ;; stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 shr J = N, 3 ;; stf.spill [r8] = f20, 32 stf.spill [r9] = f21, 32 shladd LDC = LDC, ZBASE_SHIFT, r0 ;; stf.spill [r8] = f22, 32 stf.spill [r9] = f23, 32 mov AOFFSET = A ;; stf.spill [r8] = f24, 32 stf.spill [r9] = f25, 32 cmp.ge p6, p0 = 0, J ;; stf.spill [r8] = f26, 32 stf.spill [r9] = f27, 32 ;; stf.spill [r8] = f28, 32 stf.spill [r9] = f29, 32 ;; stf.spill [r8] = f30 stf.spill [r9] = f31 (p6) br.cond.dpnt .L050 .body ;; .align 32 .L010: { .mfi adds J = -1, J mov f64 = f0 shr I = M, 3 } { .mfi mov C1 = C // coffset1 = c + 0 * ldc mov f72 = f0 } ;; { .mmf cmp.eq p6, p7 = 0, I nop __LINE__ mov f80 = f0 } { .mmf add C2 = LDC, C // coffset2 = c + 1 * ldc shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc mov f88 = f0 } ;; { .mmf shladd C5 = LDC, 2, C // coffset5 = c + 4 * ldc shladd C = LDC, 3, C // coffset += 8 * ldc mov f96 = f0 } { .mmf shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc shladd C6 = LDC, 2, C2 // coffset6 = c + 5 * ldc mov f104 = f0 } ;; { .mfi shladd C7 = LDC, 2, C3 // coffset7 = c + 6 * ldc mov f112 = f0 nop __LINE__ } { .mfb sub C8 = C, LDC // coffset8 = c + 7 * ldc mov f120 = f0 (p6) br.cond.dpnt .L020 } ;; .align 16 .L011: { .mfb LDFPD f48, f49 = [B] mov f65 = f0 nop __LINE__ } { .mfb adds BOFFSET = 2 * SIZE, B mov f73 = f0 nop __LINE__ } ;; { .mfb LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f81 = f0 nop __LINE__ } { .mfb LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f89 = f0 nop __LINE__ } ;; { .mmf LDFPD f52, f53 = [BOFFSET], 2 * SIZE setf.d f97 = r0 mov f105 = f0 } { .mfb setf.d f113 = r0 mov f121 = f0 nop __LINE__ } ;; { .mmf LDFPD f54, f55 = [BOFFSET], 2 * SIZE setf.d f66 = r0 mov f74 = f0 } { .mfb setf.d f82 = r0 mov f90 = f0 nop __LINE__ } ;; { .mmf LDFPD f34, f35 = [AOFFSET], 2 * SIZE setf.d f98 = r0 mov f106 = f0 } { .mfb setf.d f114 = r0 mov f122 = f0 nop __LINE__ } ;; { .mmf LDFPD f36, f37 = [AOFFSET], 2 * SIZE setf.d f67 = r0 mov f75 = f0 } { .mfi setf.d f83 = r0 mov f91 = f0 nop __LINE__ } ;; { .mmf LDFPD f38, f39 = [AOFFSET], 2 * SIZE setf.d f99 = r0 mov f107 = f0 } { .mfi setf.d f115 = r0 mov f123 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f68 = r0 mov f76 = f0 } { .mfi setf.d f84 = r0 mov f92 = f0 adds L = 1, K } ;; { .mmf CPREFETCH [PREC], LDC setf.d f100 = r0 mov f108 = f0 } { .mfi setf.d f116 = r0 mov f124 = f0 
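/*
 * Reference sketch (C, not part of the build): the 3M scheme forms a
 * complex matrix product from real matrix products, so the driver passes
 * real-valued packed panels to this kernel and each call folds the real
 * accumulators into the interleaved complex C with a (generally complex)
 * weight.  That is what the ALPHA_R/ALPHA_I FMAs in the store phase
 * (.L013 and the smaller tails) do.  ldc below is counted in doubles
 * (2 per complex element); all names are illustrative only.
 *
 *   static void gemm3m_store_ref(long mr, long nr,
 *                                double alpha_r, double alpha_i,
 *                                const double *acc,  // mr*nr real sums
 *                                double *c, long ldc)
 *   {
 *       for (long j = 0; j < nr; j++)
 *           for (long i = 0; i < mr; i++) {
 *               double t = acc[i + j * mr];
 *               c[2 * i     + j * ldc] += alpha_r * t;  // real part
 *               c[2 * i + 1 + j * ldc] += alpha_i * t;  // imaginary part
 *           }
 *   }
 */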
adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } ;; { .mmf CPREFETCH [PREC], LDC setf.d f69 = r0 mov f77 = f0 } { .mfi setf.d f85 = r0 mov f93 = f0 adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET } ;; { .mmf CPREFETCH [PREC], LDC setf.d f101 = r0 mov f109 = f0 } { .mfi setf.d f117 = r0 mov f125 = f0 tbit.z p12, p0 = L, 0 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f70 = r0 mov f78 = f0 } { .mfi setf.d f86 = r0 mov f94 = f0 shr L = L, 1 } ;; { .mmf CPREFETCH [PREC], LDC setf.d f102 = r0 mov f110 = f0 } { .mfi setf.d f118 = r0 mov f126 = f0 adds L = -1, L } ;; { .mmf CPREFETCH [PREC], LDC setf.d f71 = r0 mov f79 = f0 } { .mfi setf.d f87 = r0 mov f95 = f0 mov ar.lc = L } ;; { .mmf CPREFETCH [PREC] setf.d f103 = r0 mov f111 = f0 } { .mfi setf.d f119 = r0 mov f127 = f0 cmp.eq p3, p0 = r0, r0 } ;; .align 16 .L012: /* 1 */ { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi (p12) cmp.ne p3, p0 = 0, L FMA f72 = f32, f49, f72 // A1 * B2 nop __LINE__ } ;; /* 2 */ { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfi cmp.ne p4, p5 = 0, L FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; /* 3 */ { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfi adds C9 = 4 * SIZE, C1 FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;; /* 4 */ { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfi adds C10 = 4 * SIZE, C2 FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;; /* 5 */ { .mfi (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfi adds C11 = 4 * SIZE, C3 FMA f73 = f33, f49, f73 // A2 * B2 nop __LINE__ } ;; /* 6 */ { .mfi (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfi adds C12 = 4 * SIZE, C4 FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; /* 7 */ { .mfi (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfi adds C13 = 4 * SIZE, C5 FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } ;; /* 8 */ { .mfi (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfi adds C14 = 4 * SIZE, C6 FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;; /* 9 */ { .mfi (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfi adds C15 = 4 * SIZE, C7 FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; /* 10 */ { .mfi (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfi adds C16 = 4 * SIZE, C8 FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; /* 11 */ { .mfi FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfi nop __LINE__ FMA f106 = f34, f53, f106 // A3 * B6 nop __LINE__ } ;; /* 12 */ { .mfi FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfi nop __LINE__ FMA f122 = f34, f55, f122 // A3 * B8 nop __LINE__ } ;; /* 13 */ { .mfi nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 } { .mfi nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; /* 14 */ { .mfi FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfi nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } ;; /* 15 */ { .mfi FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfi nop __LINE__ FMA f107 = f35, f53, f107 // A4 * B6 nop __LINE__ } ;; /* 16 */ { .mfi FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfi nop __LINE__ FMA f123 = f35, f55, f123 // A4 * B8 nop __LINE__ } 
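/*
 * Reference sketch (C, not part of the build): one k-step of the 8x8
 * register-blocked outer product carried out by the FMA chains of .L012
 * (the loop body unrolls two such k-steps, the second one predicated on
 * p3).  acc, a and b stand for f64..f127, f32..f47 and f48..f63; names
 * are illustrative only.
 *
 *   static void gemm_8x8_kstep_ref(double acc[8][8],
 *                                  const double a[8], const double b[8])
 *   {
 *       for (int i = 0; i < 8; i++)
 *           for (int j = 0; j < 8; j++)
 *               acc[i][j] += a[i] * b[j];  // e.g. f64 += f32 * f48 (A1 * B1)
 *   }
 */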
;; /* 17 */ { .mfi nop __LINE__ FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f76 = f36, f49, f76 // A5 * B2 nop __LINE__ } ;; /* 18 */ { .mfi nop __LINE__ FMA f84 = f36, f50, f84 // A5 * B3 nop __LINE__ } { .mfi nop __LINE__ FMA f92 = f36, f51, f92 // A5 * B4 nop __LINE__ } ;; /* 19 */ { .mfi nop __LINE__ FMA f100 = f36, f52, f100 // A5 * B5 nop __LINE__ } { .mfi nop __LINE__ FMA f108 = f36, f53, f108 // A5 * B6 nop __LINE__ } ;; /* 20 */ { .mfi nop __LINE__ FMA f116 = f36, f54, f116 // A5 * B7 nop __LINE__ } { .mfi nop __LINE__ FMA f124 = f36, f55, f124 // A5 * B8 nop __LINE__ } ;; /* 21 */ { .mfi nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f77 = f37, f49, f77 // A6 * B2 nop __LINE__ } ;; /* 22 */ { .mfi nop __LINE__ FMA f85 = f37, f50, f85 // A6 * B3 nop __LINE__ } { .mfi nop __LINE__ FMA f93 = f37, f51, f93 // A6 * B4 nop __LINE__ } ;; /* 23 */ { .mfi nop __LINE__ FMA f101 = f37, f52, f101 // A6 * B5 nop __LINE__ } { .mfi nop __LINE__ FMA f109 = f37, f53, f109 // A6 * B6 nop __LINE__ } ;; /* 24 */ { .mfi nop __LINE__ FMA f117 = f37, f54, f117 // A6 * B7 nop __LINE__ } { .mfi nop __LINE__ FMA f125 = f37, f55, f125 // A6 * B8 nop __LINE__ } ;; /* 25 */ { .mfi nop __LINE__ FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f78 = f38, f49, f78 // A7 * B2 nop __LINE__ } ;; /* 26 */ { .mfi nop __LINE__ FMA f86 = f38, f50, f86 // A7 * B3 nop __LINE__ } { .mfi nop __LINE__ FMA f94 = f38, f51, f94 // A7 * B4 nop __LINE__ } ;; /* 27 */ { .mfi nop __LINE__ FMA f102 = f38, f52, f102 // A7 * B5 nop __LINE__ } { .mfi nop __LINE__ FMA f110 = f38, f53, f110 // A7 * B6 nop __LINE__ } ;; /* 28 */ { .mfi nop __LINE__ FMA f118 = f38, f54, f118 // A7 * B7 nop __LINE__ } { .mfi nop __LINE__ FMA f126 = f38, f55, f126 // A7 * B8 nop __LINE__ } ;; /* 29 */ { .mfi nop __LINE__ FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f79 = f39, f49, f79 // A8 * B2 nop __LINE__ } ;; /* 30 */ { .mfi (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f87 = f39, f50, f87 // A8 * B3 nop __LINE__ } { .mfi nop __LINE__ FMA f95 = f39, f51, f95 // A8 * B4 nop __LINE__ } ;; /* 31 */ { .mfi (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f103 = f39, f52, f103 // A8 * B5 nop __LINE__ } { .mfi nop __LINE__ FMA f111 = f39, f53, f111 // A8 * B6 nop __LINE__ } ;; /* 32 */ { .mfi nop __LINE__ FMA f119 = f39, f54, f119 // A8 * B7 nop __LINE__ } { .mfi nop __LINE__ FMA f127 = f39, f55, f127 // A8 * B8 nop __LINE__ } ;; /* 33 */ { .mfi nop __LINE__ (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; /* 34 */ { .mfi (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfi nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; /* 35 */ { .mfi (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfi nop __LINE__ (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; /* 36 */ { .mfi (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfi nop __LINE__ (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; /* 37 */ { .mfi (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfi nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; /* 38 */ { .mfi (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 
nop __LINE__ } { .mfi nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; /* 39 */ { .mfi (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfi nop __LINE__ (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; /* 40 */ { .mfi (p5) LDFD f6 = [C1 ], SIZE (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfi (p5) LDFD f7 = [C9 ], SIZE (p3) FMA f121 = f41, f63, f121 // A2 * B8 nop __LINE__ } ;; /* 41 */ { .mfi (p5) LDFD f10 = [C1 ], SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfi (p5) LDFD f11 = [C9 ], SIZE (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; /* 42 */ { .mfi (p5) LDFD f12 = [C1 ], SIZE (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfi (p5) LDFD f13 = [C9 ], SIZE (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; /* 43 */ { .mfi (p5) LDFD f14 = [C1 ], 5 * SIZE (p3) FMA f98 = f42, f60, f98 // A3 * B5 nop __LINE__ } { .mfi (p5) LDFD f15 = [C9 ], 5 * SIZE (p3) FMA f106 = f42, f61, f106 // A3 * B6 nop __LINE__ } ;; /* 44 */ { .mfi (p5) LDFD f16 = [C1 ], SIZE (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfi (p5) LDFD f17 = [C9 ], SIZE (p3) FMA f122 = f42, f63, f122 // A3 * B8 nop __LINE__ } ;; /* 45 */ { .mfi (p5) LDFD f18 = [C1 ], SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfi (p5) LDFD f19 = [C9 ], SIZE (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; /* 46 */ { .mfi (p5) LDFD f20 = [C1 ], SIZE (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfi (p5) LDFD f21 = [C9 ], SIZE (p3) FMA f91 = f43, f59, f91 // A4 * B4 nop __LINE__ } ;; /* 47 */ { .mfi (p5) LDFD f22 = [C1 ], - 11 * SIZE (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfi (p5) LDFD f23 = [C9 ], - 11 * SIZE (p3) FMA f107 = f43, f61, f107 // A4 * B6 nop __LINE__ } ;; /* 48 */ { .mfi (p5) LDFD f24 = [C2 ], SIZE (p3) FMA f115 = f43, f62, f115 // A4 * B7 nop __LINE__ } { .mfi (p5) LDFD f25 = [C10], SIZE (p3) FMA f123 = f43, f63, f123 // A4 * B8 nop __LINE__ } ;; /* 49 */ { .mfi (p5) LDFD f26 = [C2 ], SIZE (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfi (p5) LDFD f27 = [C10], SIZE (p3) FMA f76 = f44, f57, f76 // A5 * B2 nop __LINE__ } ;; /* 50 */ { .mfi (p5) LDFD f28 = [C2 ], SIZE (p3) FMA f84 = f44, f58, f84 // A5 * B3 nop __LINE__ } { .mfi (p5) LDFD f29 = [C10], SIZE (p3) FMA f92 = f44, f59, f92 // A5 * B4 nop __LINE__ } ;; /* 51 */ { .mfi (p5) LDFD f30 = [C2 ], 5 * SIZE (p3) FMA f100 = f44, f60, f100 // A5 * B5 nop __LINE__ } { .mfi (p5) LDFD f31 = [C10], 5 * SIZE (p3) FMA f108 = f44, f61, f108 // A5 * B6 nop __LINE__ } ;; /* 52 */ { .mfi (p5) LDFD f32 = [C2 ], SIZE (p3) FMA f116 = f44, f62, f116 // A5 * B7 nop __LINE__ } { .mfi (p5) LDFD f33 = [C10], SIZE (p3) FMA f124 = f44, f63, f124 // A5 * B8 nop __LINE__ } ;; /* 53 */ { .mfi (p5) LDFD f34 = [C2 ], SIZE (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfi (p5) LDFD f35 = [C10], SIZE (p3) FMA f77 = f45, f57, f77 // A6 * B2 nop __LINE__ } ;; /* 54 */ { .mfi (p5) LDFD f36 = [C2 ], SIZE (p3) FMA f85 = f45, f58, f85 // A6 * B3 nop __LINE__ } { .mfi (p5) LDFD f37 = [C10], SIZE (p3) FMA f93 = f45, f59, f93 // A6 * B4 nop __LINE__ } ;; /* 55 */ { .mfi (p5) LDFD f38 = [C2 ], - 11 * SIZE (p3) FMA f101 = f45, f60, f101 // A6 * B5 nop __LINE__ } { .mfi (p5) LDFD f39 = [C10], - 11 * SIZE (p3) FMA f109 = f45, f61, f109 // A6 * B6 nop __LINE__ } ;; /* 56 */ { .mfi (p5) LDFD f48 = [C3 ], SIZE (p3) FMA f117 = f45, f62, f117 // A6 * B7 nop __LINE__ } { .mfi (p5) LDFD f49 = 
[C11], SIZE (p3) FMA f125 = f45, f63, f125 // A6 * B8 nop __LINE__ } ;; /* 57 */ { .mfi (p5) LDFD f50 = [C3 ], SIZE (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfi (p5) LDFD f51 = [C11], SIZE (p3) FMA f78 = f46, f57, f78 // A7 * B2 nop __LINE__ } ;; /* 58 */ { .mfi (p5) LDFD f52 = [C3 ], SIZE (p3) FMA f86 = f46, f58, f86 // A7 * B3 nop __LINE__ } { .mfi (p5) LDFD f53 = [C11], SIZE (p3) FMA f94 = f46, f59, f94 // A7 * B4 nop __LINE__ } ;; /* 59 */ { .mfi (p5) LDFD f54 = [C3 ], 5 * SIZE (p3) FMA f102 = f46, f60, f102 // A7 * B5 nop __LINE__ } { .mfi (p5) LDFD f55 = [C11], 5 * SIZE (p3) FMA f110 = f46, f61, f110 // A7 * B6 nop __LINE__ } ;; /* 60 */ { .mfi (p5) LDFD f40 = [C3 ], SIZE (p3) FMA f118 = f46, f62, f118 // A7 * B7 nop __LINE__ } { .mfi (p5) LDFD f41 = [C11], SIZE (p3) FMA f126 = f46, f63, f126 // A7 * B8 nop __LINE__ } ;; /* 61 */ { .mfi (p5) LDFD f42 = [C3 ], SIZE (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } { .mfi (p5) LDFD f43 = [C11], SIZE (p3) FMA f79 = f47, f57, f79 // A8 * B2 nop __LINE__ } ;; /* 62 */ { .mfi (p5) LDFD f44 = [C3 ], SIZE (p3) FMA f87 = f47, f58, f87 // A8 * B3 nop __LINE__ } { .mfi (p5) LDFD f45 = [C11], SIZE (p3) FMA f95 = f47, f59, f95 // A8 * B4 nop __LINE__ } ;; /* 63 */ { .mfi (p5) LDFD f46 = [C3 ], - 11 * SIZE (p3) FMA f103 = f47, f60, f103 // A8 * B5 nop __LINE__ } { .mfi (p5) LDFD f56 = [C11], - 11 * SIZE (p3) FMA f111 = f47, f61, f111 // A8 * B6 nop __LINE__ } ;; /* 64 */ { .mfi (p5) LDFD f57 = [C4 ], SIZE (p3) FMA f119 = f47, f62, f119 // A8 * B7 adds L = -1, L } { .mfb (p5) LDFD f58 = [C12], SIZE (p3) FMA f127 = f47, f63, f127 // A8 * B8 br.cloop.sptk.few .L012 } ;; .L013: { .mmf (p5) LDFD f59 = [C4 ], SIZE (p5) LDFD f60 = [C12], SIZE FMA f6 = ALPHA_R, f64, f6 } { .mmf cmp.ne p6, p0 = 1, I nop __LINE__ FMA f7 = ALPHA_R, f66, f7 } ;; { .mmf (p5) LDFD f61 = [C4 ], SIZE (p5) LDFD f62 = [C12], SIZE FMA f10 = ALPHA_I, f64, f10 } { .mmf nop __LINE__ nop __LINE__ FMA f11 = ALPHA_I, f66, f11 } ;; { .mmf (p5) LDFD f63 = [C4 ], 5 * SIZE (p5) LDFD f47 = [C12], 5 * SIZE FMA f12 = ALPHA_R, f65, f12 } { .mmf nop __LINE__ nop __LINE__ FMA f13 = ALPHA_R, f67, f13 } ;; { .mfi (p5) LDFD f64 = [C4 ], SIZE FMA f14 = ALPHA_I, f65, f14 nop __LINE__ } { .mfi (p5) LDFD f65 = [C12], SIZE FMA f15 = ALPHA_I, f67, f15 nop __LINE__ } ;; { .mmf STFD [C1 ] = f6, SIZE STFD [C9 ] = f7, SIZE FMA f16 = ALPHA_R, f68, f16 } { .mmf (p5) LDFD f6 = [C4 ], SIZE (p5) LDFD f7 = [C12], SIZE FMA f17 = ALPHA_R, f70, f17 } ;; { .mmf STFD [C1 ] = f10, SIZE STFD [C9 ] = f11, SIZE FMA f18 = ALPHA_I, f68, f18 } { .mmf (p5) LDFD f10 = [C4 ], SIZE (p5) LDFD f11 = [C12], SIZE FMA f19 = ALPHA_I, f70, f19 } ;; { .mmf STFD [C1 ] = f12, SIZE STFD [C9 ] = f13, SIZE FMA f20 = ALPHA_R, f69, f20 } { .mmf (p5) LDFD f12 = [C4 ], - 11 * SIZE (p5) LDFD f13 = [C12], - 11 * SIZE FMA f21 = ALPHA_R, f71, f21 } ;; { .mmf STFD [C1 ] = f14, 5 * SIZE STFD [C9 ] = f15, 5 * SIZE FMA f22 = ALPHA_I, f69, f22 } { .mmf (p5) LDFD f14 = [C5 ], SIZE (p5) LDFD f15 = [C13], SIZE FMA f23 = ALPHA_I, f71, f23 } ;; { .mmf STFD [C1 ] = f16, SIZE STFD [C9 ] = f17, SIZE FMA f24 = ALPHA_R, f72, f24 } { .mmf (p5) LDFD f16 = [C5 ], SIZE (p5) LDFD f17 = [C13], SIZE FMA f25 = ALPHA_R, f74, f25 } ;; { .mmf STFD [C1 ] = f18, SIZE STFD [C9 ] = f19, SIZE FMA f26 = ALPHA_I, f72, f26 } { .mmf (p5) LDFD f18 = [C5 ], SIZE (p5) LDFD f19 = [C13], SIZE FMA f27 = ALPHA_I, f74, f27 } ;; { .mmf STFD [C1 ] = f20, SIZE STFD [C9 ] = f21, SIZE FMA f28 = ALPHA_R, f73, f28 } { .mmf (p5) LDFD f20 = [C5 ], 5 * SIZE (p5) LDFD f21 = [C13], 5 * 
SIZE FMA f29 = ALPHA_R, f75, f29 } ;; { .mmf STFD [C1 ] = f22, 5 * SIZE STFD [C9 ] = f23, 5 * SIZE FMA f30 = ALPHA_I, f73, f30 } { .mmf (p5) LDFD f22 = [C5 ], SIZE (p5) LDFD f23 = [C13], SIZE FMA f31 = ALPHA_I, f75, f31 } ;; { .mmf STFD [C2 ] = f24, SIZE STFD [C10] = f25, SIZE FMA f32 = ALPHA_R, f76, f32 } { .mmf (p5) LDFD f24 = [C5 ], SIZE (p5) LDFD f25 = [C13], SIZE FMA f33 = ALPHA_R, f78, f33 } ;; { .mmf STFD [C2 ] = f26, SIZE STFD [C10] = f27, SIZE FMA f34 = ALPHA_I, f76, f34 } { .mmf (p5) LDFD f26 = [C5 ], SIZE (p5) LDFD f27 = [C13], SIZE FMA f35 = ALPHA_I, f78, f35 } ;; { .mmf STFD [C2 ] = f28, SIZE STFD [C10] = f29, SIZE FMA f36 = ALPHA_R, f77, f36 } { .mmf (p5) LDFD f28 = [C5 ], - 11 * SIZE (p5) LDFD f29 = [C13], - 11 * SIZE FMA f37 = ALPHA_R, f79, f37 } ;; { .mmf STFD [C2 ] = f30, 5 * SIZE STFD [C10] = f31, 5 * SIZE FMA f38 = ALPHA_I, f77, f38 } { .mmf (p5) LDFD f30 = [C6 ], SIZE (p5) LDFD f31 = [C14], SIZE FMA f39 = ALPHA_I, f79, f39 } ;; { .mmf STFD [C2 ] = f32, SIZE STFD [C10] = f33, SIZE FMA f48 = ALPHA_R, f80, f48 } { .mmf (p5) LDFD f32 = [C6 ], SIZE (p5) LDFD f33 = [C14], SIZE FMA f49 = ALPHA_R, f82, f49 } ;; { .mmf STFD [C2 ] = f34, SIZE STFD [C10] = f35, SIZE FMA f50 = ALPHA_I, f80, f50 } { .mmf (p5) LDFD f34 = [C6 ], SIZE (p5) LDFD f35 = [C14], SIZE FMA f51 = ALPHA_I, f82, f51 } ;; { .mmf STFD [C2 ] = f36, SIZE STFD [C10] = f37, SIZE FMA f52 = ALPHA_R, f81, f52 } { .mmf (p5) LDFD f36 = [C6 ], 5 * SIZE (p5) LDFD f37 = [C14], 5 * SIZE FMA f53 = ALPHA_R, f83, f53 } ;; { .mmf STFD [C2 ] = f38, 5 * SIZE STFD [C10] = f39, 5 * SIZE FMA f54 = ALPHA_I, f81, f54 } { .mmf (p5) LDFD f38 = [C6 ], SIZE (p5) LDFD f39 = [C14], SIZE FMA f55 = ALPHA_I, f83, f55 } ;; { .mmf STFD [C3 ] = f48, SIZE STFD [C11] = f49, SIZE FMA f40 = ALPHA_R, f84, f40 } { .mmf (p5) LDFD f48 = [C6 ], SIZE (p5) LDFD f49 = [C14], SIZE FMA f41 = ALPHA_R, f86, f41 } ;; { .mmf STFD [C3 ] = f50, SIZE STFD [C11] = f51, SIZE FMA f42 = ALPHA_I, f84, f42 } { .mmf (p5) LDFD f50 = [C6 ], SIZE (p5) LDFD f51 = [C14], SIZE FMA f43 = ALPHA_I, f86, f43 } ;; { .mmf STFD [C3 ] = f52, SIZE STFD [C11] = f53, SIZE FMA f44 = ALPHA_R, f85, f44 } { .mmf (p5) LDFD f52 = [C6 ], - 11 * SIZE (p5) LDFD f53 = [C14], - 11 * SIZE FMA f45 = ALPHA_R, f87, f45 } ;; { .mmf STFD [C3 ] = f54, 5 * SIZE STFD [C11] = f55, 5 * SIZE FMA f46 = ALPHA_I, f85, f46 } { .mmf (p5) LDFD f54 = [C7 ], SIZE (p5) LDFD f55 = [C15], SIZE FMA f56 = ALPHA_I, f87, f56 } ;; { .mmf STFD [C3 ] = f40, SIZE STFD [C11] = f41, SIZE FMA f57 = ALPHA_R, f88, f57 } { .mmf (p5) LDFD f40 = [C7 ], SIZE (p5) LDFD f41 = [C15], SIZE FMA f58 = ALPHA_R, f90, f58 } ;; { .mmf STFD [C3 ] = f42, SIZE STFD [C11] = f43, SIZE FMA f59 = ALPHA_I, f88, f59 } { .mmf (p5) LDFD f42 = [C7 ], SIZE (p5) LDFD f43 = [C15], SIZE FMA f60 = ALPHA_I, f90, f60 } ;; { .mmf STFD [C3 ] = f44, SIZE STFD [C11] = f45, SIZE FMA f61 = ALPHA_R, f89, f61 } { .mmf (p5) LDFD f44 = [C7 ], 5 * SIZE (p5) LDFD f45 = [C15], 5 * SIZE FMA f62 = ALPHA_R, f91, f62 } ;; { .mmf STFD [C3 ] = f46, 5 * SIZE STFD [C11] = f56, 5 * SIZE FMA f63 = ALPHA_I, f89, f63 } { .mmf (p5) LDFD f46 = [C7 ], SIZE (p5) LDFD f56 = [C15], SIZE FMA f47 = ALPHA_I, f91, f47 } ;; { .mmf STFD [C4 ] = f57, SIZE STFD [C12] = f58, SIZE FMA f64 = ALPHA_R, f92, f64 } { .mmf (p5) LDFD f57 = [C7 ], SIZE (p5) LDFD f58 = [C15], SIZE FMA f65 = ALPHA_R, f94, f65 } ;; { .mmf STFD [C4 ] = f59, SIZE STFD [C12] = f60, SIZE FMA f6 = ALPHA_I, f92, f6 } { .mmf (p5) LDFD f59 = [C7 ], SIZE (p5) LDFD f60 = [C15], SIZE FMA f7 = ALPHA_I, f94, f7 } ;; { .mmf STFD [C4 ] = f61, SIZE STFD 
[C12] = f62, SIZE FMA f10 = ALPHA_R, f93, f10 } { .mmf (p5) LDFD f61 = [C7 ], - 11 * SIZE (p5) LDFD f62 = [C15], - 11 * SIZE FMA f11 = ALPHA_R, f95, f11 } ;; { .mmf STFD [C4 ] = f63, 5 * SIZE STFD [C12] = f47, 5 * SIZE FMA f12 = ALPHA_I, f93, f12 } { .mmf (p5) LDFD f63 = [C8 ], SIZE (p5) LDFD f47 = [C16], SIZE FMA f13 = ALPHA_I, f95, f13 } ;; { .mmf STFD [C4 ] = f64, SIZE STFD [C12] = f65, SIZE FMA f14 = ALPHA_R, f96, f14 } { .mmf (p5) LDFD f64 = [C8 ], SIZE (p5) LDFD f65 = [C16], SIZE FMA f15 = ALPHA_R, f98, f15 } ;; { .mmf STFD [C4 ] = f6, SIZE STFD [C12] = f7, SIZE FMA f16 = ALPHA_I, f96, f16 } { .mmf (p5) LDFD f6 = [C8 ], SIZE (p5) LDFD f7 = [C16], SIZE FMA f17 = ALPHA_I, f98, f17 } ;; { .mmf STFD [C4 ] = f10, SIZE STFD [C12] = f11, SIZE FMA f18 = ALPHA_R, f97, f18 } { .mmf (p5) LDFD f10 = [C8 ], 5 * SIZE (p5) LDFD f11 = [C16], 5 * SIZE FMA f19 = ALPHA_R, f99, f19 } ;; { .mmf STFD [C4 ] = f12, 5 * SIZE STFD [C12] = f13, 5 * SIZE FMA f20 = ALPHA_I, f97, f20 } { .mmf (p5) LDFD f12 = [C8 ], SIZE (p5) LDFD f13 = [C16], SIZE FMA f21 = ALPHA_I, f99, f21 } ;; { .mmf STFD [C5 ] = f14, SIZE STFD [C13] = f15, SIZE FMA f22 = ALPHA_R, f100, f22 } { .mmf (p5) LDFD f14 = [C8 ], SIZE (p5) LDFD f15 = [C16], SIZE FMA f23 = ALPHA_R, f102, f23 } ;; { .mmf STFD [C5 ] = f16, SIZE STFD [C13] = f17, SIZE FMA f24 = ALPHA_I, f100, f24 } { .mmf (p5) LDFD f16 = [C8 ], SIZE (p5) LDFD f17 = [C16], SIZE FMA f25 = ALPHA_I, f102, f25 } ;; { .mmf STFD [C5 ] = f18, SIZE STFD [C13] = f19, SIZE FMA f26 = ALPHA_R, f101, f26 } { .mmf (p5) LDFD f18 = [C8 ], - 11 * SIZE (p5) LDFD f19 = [C16], - 11 * SIZE FMA f27 = ALPHA_R, f103, f27 } ;; { .mmf STFD [C5 ] = f20, 5 * SIZE STFD [C13] = f21, 5 * SIZE FMA f28 = ALPHA_I, f101, f28 } { .mmf nop __LINE__ nop __LINE__ FMA f29 = ALPHA_I, f103, f29 } ;; { .mmf STFD [C5 ] = f22, SIZE STFD [C13] = f23, SIZE FMA f30 = ALPHA_R, f104, f30 } { .mmf nop __LINE__ nop __LINE__ FMA f31 = ALPHA_R, f106, f31 } ;; { .mmf STFD [C5 ] = f24, SIZE STFD [C13] = f25, SIZE FMA f32 = ALPHA_I, f104, f32 } { .mmf nop __LINE__ nop __LINE__ FMA f33 = ALPHA_I, f106, f33 } ;; { .mmf STFD [C5 ] = f26, SIZE STFD [C13] = f27, SIZE FMA f34 = ALPHA_R, f105, f34 } { .mmf nop __LINE__ nop __LINE__ FMA f35 = ALPHA_R, f107, f35 } ;; { .mmf STFD [C5 ] = f28, 5 * SIZE STFD [C13] = f29, 5 * SIZE FMA f36 = ALPHA_I, f105, f36 } { .mmf nop __LINE__ nop __LINE__ FMA f37 = ALPHA_I, f107, f37 } ;; { .mmf STFD [C6 ] = f30, SIZE STFD [C14] = f31, SIZE FMA f38 = ALPHA_R, f108, f38 } { .mmf nop __LINE__ nop __LINE__ FMA f39 = ALPHA_R, f110, f39 } ;; { .mmf STFD [C6 ] = f32, SIZE STFD [C14] = f33, SIZE FMA f48 = ALPHA_I, f108, f48 } { .mmf nop __LINE__ nop __LINE__ FMA f49 = ALPHA_I, f110, f49 } ;; { .mmf STFD [C6 ] = f34, SIZE STFD [C14] = f35, SIZE FMA f50 = ALPHA_R, f109, f50 } { .mmf nop __LINE__ nop __LINE__ FMA f51 = ALPHA_R, f111, f51 } ;; { .mmf STFD [C6 ] = f36, 5 * SIZE STFD [C14] = f37, 5 * SIZE FMA f52 = ALPHA_I, f109, f52 } { .mmf nop __LINE__ nop __LINE__ FMA f53 = ALPHA_I, f111, f53 } ;; { .mmf STFD [C6 ] = f38, SIZE STFD [C14] = f39, SIZE FMA f54 = ALPHA_R, f112, f54 } { .mmf nop __LINE__ nop __LINE__ FMA f55 = ALPHA_R, f114, f55 } ;; { .mmf STFD [C6 ] = f48, SIZE STFD [C14] = f49, SIZE FMA f40 = ALPHA_I, f112, f40 } { .mmf nop __LINE__ nop __LINE__ FMA f41 = ALPHA_I, f114, f41 } ;; { .mmf STFD [C6 ] = f50, SIZE STFD [C14] = f51, SIZE FMA f42 = ALPHA_R, f113, f42 } { .mmf nop __LINE__ nop __LINE__ FMA f43 = ALPHA_R, f115, f43 } ;; { .mmf STFD [C6 ] = f52, 5 * SIZE STFD [C14] = f53, 5 * SIZE FMA f44 = ALPHA_I, f113, 
f44 } { .mmf nop __LINE__ nop __LINE__ FMA f45 = ALPHA_I, f115, f45 } ;; { .mmf STFD [C7 ] = f54, SIZE STFD [C15] = f55, SIZE FMA f46 = ALPHA_R, f116, f46 } { .mmf nop __LINE__ nop __LINE__ FMA f56 = ALPHA_R, f118, f56 } ;; { .mmf STFD [C7 ] = f40, SIZE STFD [C15] = f41, SIZE FMA f57 = ALPHA_I, f116, f57 } { .mmf nop __LINE__ nop __LINE__ FMA f58 = ALPHA_I, f118, f58 } ;; { .mmf STFD [C7 ] = f42, SIZE STFD [C15] = f43, SIZE FMA f59 = ALPHA_R, f117, f59 } { .mmf nop __LINE__ nop __LINE__ FMA f60 = ALPHA_R, f119, f60 } ;; { .mmf STFD [C7 ] = f44, 5 * SIZE STFD [C15] = f45, 5 * SIZE FMA f61 = ALPHA_I, f117, f61 } { .mmf nop __LINE__ nop __LINE__ FMA f62 = ALPHA_I, f119, f62 } ;; { .mmf STFD [C7 ] = f46, SIZE STFD [C15] = f56, SIZE FMA f63 = ALPHA_R, f120, f63 } { .mmf nop __LINE__ nop __LINE__ FMA f47 = ALPHA_R, f122, f47 } ;; { .mmf STFD [C7 ] = f57, SIZE STFD [C15] = f58, SIZE FMA f64 = ALPHA_I, f120, f64 } { .mmf nop __LINE__ nop __LINE__ FMA f65 = ALPHA_I, f122, f65 } ;; { .mmf STFD [C7 ] = f59, SIZE STFD [C15] = f60, SIZE FMA f6 = ALPHA_R, f121, f6 } { .mmf nop __LINE__ nop __LINE__ FMA f7 = ALPHA_R, f123, f7 } ;; { .mmf STFD [C7 ] = f61, 5 * SIZE STFD [C15] = f62, 5 * SIZE FMA f10 = ALPHA_I, f121, f10 } { .mmf nop __LINE__ nop __LINE__ FMA f11 = ALPHA_I, f123, f11 } ;; { .mmf STFD [C8 ] = f63, SIZE STFD [C16] = f47, SIZE FMA f12 = ALPHA_R, f124, f12 } { .mmf nop __LINE__ nop __LINE__ FMA f13 = ALPHA_R, f126, f13 } ;; { .mmf STFD [C8 ] = f64, SIZE STFD [C16] = f65, SIZE FMA f14 = ALPHA_I, f124, f14 } { .mmf nop __LINE__ nop __LINE__ FMA f15 = ALPHA_I, f126, f15 } ;; { .mmf STFD [C8 ] = f6, SIZE STFD [C16] = f7, SIZE FMA f16 = ALPHA_R, f125, f16 } { .mmf nop __LINE__ nop __LINE__ FMA f17 = ALPHA_R, f127, f17 } ;; { .mmf STFD [C8 ] = f10, 5 * SIZE STFD [C16] = f11, 5 * SIZE FMA f18 = ALPHA_I, f125, f18 } { .mmf nop __LINE__ nop __LINE__ FMA f19 = ALPHA_I, f127, f19 } ;; { .mmf STFD [C8 ] = f12, SIZE STFD [C16] = f13, SIZE mov f64 = f0 } { .mmf nop __LINE__ nop __LINE__ mov f72 = f0 } ;; { .mmf STFD [C8 ] = f14, SIZE STFD [C16] = f15, SIZE mov f80 = f0 } { .mmf nop __LINE__ nop __LINE__ mov f88 = f0 } ;; { .mmf STFD [C8 ] = f16, SIZE STFD [C16] = f17, SIZE mov f96 = f0 } { .mmf nop __LINE__ nop __LINE__ mov f104 = f0 } ;; { .mmf STFD [C8 ] = f18, 5 * SIZE STFD [C16] = f19, 5 * SIZE mov f112 = f0 } { .mfb adds I = -1, I mov f120 = f0 (p6) br.cond.dptk .L011 } ;; .L020: { .mfi cmp.eq p3, p0 = r0, r0 mov f89 = f0 tbit.z p6, p7 = M, 2 } { .mfb nop __LINE__ mov f81 = f0 (p6) br.cond.dptk .L030 } ;; { .mfi LDFPD f48, f49 = [B] mov f65 = f0 nop __LINE__ } { .mfi adds BOFFSET = 2 * SIZE, B mov f73 = f0 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET } ;; { .mmf LDFPD f32, f33 = [AOFFSET], 2 * SIZE setf.d f97 = r0 mov f105 = f0 } { .mfi setf.d f113 = r0 mov f121 = f0 adds L = 1, K } ;; { .mmf LDFPD f50, f51 = [BOFFSET], 2 * SIZE setf.d f66 = r0 mov f74 = f0 } { .mfi setf.d f82 = r0 mov f90 = f0 tbit.z p12, p0 = L, 0 } ;; { .mmf LDFPD f52, f53 = [BOFFSET], 2 * SIZE setf.d f98 = r0 mov f106 = f0 } { .mfi setf.d f114 = r0 mov f122 = f0 shr L = L, 1 } ;; { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f75 = f0 adds L = -1, L } { .mmf setf.d f67 = r0 setf.d f83 = r0 mov f91 = f0 } ;; { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f107 = f0 mov ar.lc = L } { .mmf setf.d f99 = r0 setf.d f115 = r0 mov f123 = f0 } ;; .align 32 .L022: { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 
(p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 (p5) adds C9 = 4 * SIZE, C1 } { .mfi nop __LINE__ FMA f104 = f32, f53, f104 // A1 * B6 (p5) adds C10 = 4 * SIZE, C2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 (p5) adds C11 = 4 * SIZE, C3 } { .mfi nop __LINE__ FMA f120 = f32, f55, f120 // A1 * B8 (p5) adds C12 = 4 * SIZE, C4 } ;; { .mfi (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 (p5) adds C13 = 4 * SIZE, C5 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 (p5) adds C14 = 4 * SIZE, C6 } ;; { .mfi (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 (p5) adds C15 = 4 * SIZE, C7 } { .mfi nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 (p5) adds C16 = 4 * SIZE, C8 } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f106 = f34, f53, f106 // A3 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f122 = f34, f55, f122 // A3 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f107 = f35, f53, f107 // A4 * B6 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f123 = f35, f55, f123 // A4 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f6 = [C1 ], SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb (p5) LDFD f7 = [C9 ], SIZE (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfb (p5) LDFD f10 = [C1 ], SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb (p5) LDFD f11 = [C9 ], SIZE (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, 
f58, f81 // A2 * B3 nop __LINE__ } { .mfb (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f12 = [C1 ], SIZE (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb (p5) LDFD f13 = [C9 ], SIZE (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; { .mfb (p5) LDFD f14 = [C1 ], - 3 * SIZE (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfb (p5) LDFD f15 = [C9 ], - 3 * SIZE (p3) FMA f121 = f41, f63, f121 // A2 * B8 nop __LINE__ } ;; { .mfb (p5) LDFD f16 = [C2 ], SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb (p5) LDFD f17 = [C10], SIZE (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p5) LDFD f18 = [C2 ], SIZE (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb (p5) LDFD f19 = [C10], SIZE (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f20 = [C2 ], SIZE (p3) FMA f98 = f42, f60, f98 // A3 * B5 nop __LINE__ } { .mfb (p5) LDFD f21 = [C10], SIZE (p3) FMA f106 = f42, f61, f106 // A3 * B6 nop __LINE__ } ;; { .mfb (p5) LDFD f22 = [C2 ], - 3 * SIZE (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfb (p5) LDFD f23 = [C10], - 3 * SIZE (p3) FMA f122 = f42, f63, f122 // A3 * B8 nop __LINE__ } ;; { .mfb (p5) LDFD f24 = [C3 ], SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb (p5) LDFD f25 = [C11], SIZE (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p5) LDFD f26 = [C3 ], SIZE (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb (p5) LDFD f27 = [C11], SIZE (p3) FMA f91 = f43, f59, f91 // A4 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f28 = [C3 ], SIZE (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfb (p5) LDFD f29 = [C11], SIZE (p3) FMA f107 = f43, f61, f107 // A4 * B6 nop __LINE__ } ;; { .mfi (p5) LDFD f30 = [C3 ], - 3 * SIZE (p3) FMA f115 = f43, f62, f115 // A4 * B7 adds L = -1, L } { .mfb (p5) LDFD f31 = [C11], - 3 * SIZE (p3) FMA f123 = f43, f63, f123 // A4 * B8 br.cloop.sptk.few .L022 } ;; .L028: { .mmf LDFD f68 = [C4 ], SIZE LDFD f69 = [C12], SIZE FMA f6 = ALPHA_R, f64, f6 } { .mmf nop __LINE__ nop __LINE__ FMA f7 = ALPHA_R, f66, f7 } ;; { .mmf LDFD f70 = [C4 ], SIZE LDFD f71 = [C12], SIZE FMA f10 = ALPHA_I, f64, f10 } { .mmf nop __LINE__ nop __LINE__ FMA f11 = ALPHA_I, f66, f11 } ;; { .mmf LDFD f76 = [C4 ], SIZE LDFD f77 = [C12], SIZE FMA f12 = ALPHA_R, f65, f12 } { .mmf nop __LINE__ nop __LINE__ FMA f13 = ALPHA_R, f67, f13 } ;; { .mmf LDFD f78 = [C4 ], -3 * SIZE LDFD f79 = [C12], -3 * SIZE FMA f14 = ALPHA_I, f65, f14 } { .mmf nop __LINE__ nop __LINE__ FMA f15 = ALPHA_I, f67, f15 } ;; { .mmf STFD [C1 ] = f6, SIZE STFD [C9 ] = f7, SIZE FMA f16 = ALPHA_R, f72, f16 } { .mmf LDFD f84 = [C5 ], SIZE LDFD f85 = [C13], SIZE FMA f17 = ALPHA_R, f74, f17 } ;; { .mmf STFD [C1 ] = f10, SIZE STFD [C9 ] = f11, SIZE FMA f18 = ALPHA_I, f72, f18 } { .mmf LDFD f86 = [C5 ], SIZE LDFD f87 = [C13], SIZE FMA f19 = ALPHA_I, f74, f19 } ;; { .mmf STFD [C1 ] = f12, SIZE STFD [C9 ] = f13, SIZE FMA f20 = ALPHA_R, f73, f20 } { .mmf LDFD f92 = [C5 ], SIZE LDFD f93 = [C13], SIZE FMA f21 = ALPHA_R, f75, f21 } ;; { .mmf STFD [C1 ] = f14, 5 * SIZE STFD [C9 ] = f15, 5 * SIZE FMA f22 = ALPHA_I, f73, f22 } { .mmf LDFD f94 = [C5 ], -3 * SIZE LDFD f95 = [C13], -3 * SIZE FMA f23 = ALPHA_I, f75, f23 } ;; { .mmf STFD [C2 ] = f16, SIZE STFD [C10] = f17, SIZE FMA f24 = ALPHA_R, f80, f24 } { .mmf LDFD f100 = [C6 ], SIZE LDFD f101 = [C14], SIZE FMA f25 = ALPHA_R, f82, f25 } ;; { .mmf STFD [C2 ] = f18, SIZE STFD [C10] = f19, SIZE FMA f26 = ALPHA_I, f80, f26 } { 
.mmf LDFD f102 = [C6 ], SIZE LDFD f103 = [C14], SIZE FMA f27 = ALPHA_I, f82, f27 } ;; { .mmf STFD [C2 ] = f20, SIZE STFD [C10] = f21, SIZE FMA f28 = ALPHA_R, f81, f28 } { .mmf LDFD f108 = [C6 ], SIZE LDFD f109 = [C14], SIZE FMA f29 = ALPHA_R, f83, f29 } ;; { .mmf STFD [C2 ] = f22, 5 * SIZE STFD [C10] = f23, 5 * SIZE FMA f30 = ALPHA_I, f81, f30 } { .mmf LDFD f110 = [C6 ], -3 * SIZE LDFD f111 = [C14], -3 * SIZE FMA f31 = ALPHA_I, f83, f31 } ;; { .mmf STFD [C3 ] = f24, SIZE STFD [C11] = f25, SIZE FMA f68 = ALPHA_R, f88, f68 } { .mmf LDFD f116 = [C7 ], SIZE LDFD f117 = [C15], SIZE FMA f69 = ALPHA_R, f90, f69 } ;; { .mmf STFD [C3 ] = f26, SIZE STFD [C11] = f27, SIZE FMA f70 = ALPHA_I, f88, f70 } { .mmf LDFD f118 = [C7 ], SIZE LDFD f119 = [C15], SIZE FMA f71 = ALPHA_I, f90, f71 } ;; { .mmf STFD [C3 ] = f28, SIZE STFD [C11] = f29, SIZE FMA f76 = ALPHA_R, f89, f76 } { .mmf LDFD f124 = [C7 ], SIZE LDFD f125 = [C15], SIZE FMA f77 = ALPHA_R, f91, f77 } ;; { .mmf STFD [C3 ] = f30, 5 * SIZE STFD [C11] = f31, 5 * SIZE FMA f78 = ALPHA_I, f89, f78 } { .mmf LDFD f126 = [C7 ], -3 * SIZE LDFD f127 = [C15], -3 * SIZE FMA f79 = ALPHA_I, f91, f79 } ;; { .mmf STFD [C4 ] = f68, SIZE STFD [C12] = f69, SIZE FMA f84 = ALPHA_R, f96, f84 } { .mmf LDFD f32 = [C8 ], SIZE LDFD f33 = [C16], SIZE FMA f85 = ALPHA_R, f98, f85 } ;; { .mmf STFD [C4 ] = f70, SIZE STFD [C12] = f71, SIZE FMA f86 = ALPHA_I, f96, f86 } { .mmf LDFD f34 = [C8 ], SIZE LDFD f35 = [C16], SIZE FMA f87 = ALPHA_I, f98, f87 } ;; { .mmf STFD [C4 ] = f76, SIZE STFD [C12] = f77, SIZE FMA f92 = ALPHA_R, f97, f92 } { .mmf LDFD f36 = [C8 ], SIZE LDFD f37 = [C16], SIZE FMA f93 = ALPHA_R, f99, f93 } ;; { .mmf STFD [C4 ] = f78, 5 * SIZE STFD [C12] = f79, 5 * SIZE FMA f94 = ALPHA_I, f97, f94 } { .mmf LDFD f38 = [C8 ], -3 * SIZE LDFD f39 = [C16], -3 * SIZE FMA f95 = ALPHA_I, f99, f95 } ;; { .mmf STFD [C5 ] = f84, SIZE STFD [C13] = f85, SIZE FMA f100 = ALPHA_R, f104, f100 } { .mmf nop __LINE__ nop __LINE__ FMA f101 = ALPHA_R, f106, f101 } ;; { .mmf STFD [C5 ] = f86, SIZE STFD [C13] = f87, SIZE FMA f102 = ALPHA_I, f104, f102 } { .mmf nop __LINE__ nop __LINE__ FMA f103 = ALPHA_I, f106, f103 } ;; { .mmf STFD [C5 ] = f92, SIZE STFD [C13] = f93, SIZE FMA f108 = ALPHA_R, f105, f108 } { .mmf nop __LINE__ nop __LINE__ FMA f109 = ALPHA_R, f107, f109 } ;; { .mmf STFD [C5 ] = f94, 5 * SIZE STFD [C13] = f95, 5 * SIZE FMA f110 = ALPHA_I, f105, f110 } { .mmf nop __LINE__ nop __LINE__ FMA f111 = ALPHA_I, f107, f111 } ;; { .mmf STFD [C6 ] = f100, SIZE STFD [C14] = f101, SIZE FMA f116 = ALPHA_R, f112, f116 } { .mmf nop __LINE__ nop __LINE__ FMA f117 = ALPHA_R, f114, f117 } ;; { .mmf STFD [C6 ] = f102, SIZE STFD [C14] = f103, SIZE FMA f118 = ALPHA_I, f112, f118 } { .mmf nop __LINE__ nop __LINE__ FMA f119 = ALPHA_I, f114, f119 } ;; { .mmf STFD [C6 ] = f108, SIZE STFD [C14] = f109, SIZE FMA f124 = ALPHA_R, f113, f124 } { .mmf nop __LINE__ nop __LINE__ FMA f125 = ALPHA_R, f115, f125 } ;; { .mmf STFD [C6 ] = f110, 5 * SIZE STFD [C14] = f111, 5 * SIZE FMA f126 = ALPHA_I, f113, f126 } { .mmf nop __LINE__ nop __LINE__ FMA f127 = ALPHA_I, f115, f127 } ;; { .mmf STFD [C7 ] = f116, SIZE STFD [C15] = f117, SIZE FMA f32 = ALPHA_R, f120, f32 } { .mmf nop __LINE__ nop __LINE__ FMA f33 = ALPHA_R, f122, f33 } ;; { .mmf STFD [C7 ] = f118, SIZE STFD [C15] = f119, SIZE FMA f34 = ALPHA_I, f120, f34 } { .mmf nop __LINE__ nop __LINE__ FMA f35 = ALPHA_I, f122, f35 } ;; { .mmf STFD [C7 ] = f124, SIZE STFD [C15] = f125, SIZE FMA f36 = ALPHA_R, f121, f36 } { .mmf nop __LINE__ nop __LINE__ FMA f37 = ALPHA_R, 
f123, f37 } ;; { .mmf STFD [C7 ] = f126, 5 * SIZE STFD [C15] = f127, 5 * SIZE FMA f38 = ALPHA_I, f121, f38 } { .mmf nop __LINE__ nop __LINE__ FMA f39 = ALPHA_I, f123, f39 } ;; { .mmf STFD [C8 ] = f32, SIZE STFD [C16] = f33, SIZE mov f64 = f0 } { .mmf nop __LINE__ nop __LINE__ mov f72 = f0 } ;; { .mmf STFD [C8 ] = f34, SIZE STFD [C16] = f35, SIZE mov f80 = f0 } { .mmf nop __LINE__ nop __LINE__ mov f88 = f0 } ;; { .mmf STFD [C8 ] = f36, SIZE STFD [C16] = f37, SIZE mov f96 = f0 } { .mmf nop __LINE__ nop __LINE__ mov f104 = f0 } ;; { .mmf STFD [C8 ] = f38, 5 * SIZE STFD [C16] = f39, 5 * SIZE mov f112 = f0 } { .mmf nop __LINE__ nop __LINE__ mov f120 = f0 } ;; .align 32 .L030: { .mib nop __LINE__ tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L040 } ;; { .mfi LDFPD f48, f49 = [B] mov f65 = f0 nop __LINE__ } { .mfi adds BOFFSET = 2 * SIZE, B mov f73 = f0 adds L = 1, K } ;; { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f81 = f0 tbit.z p12, p0 = L, 0 } { .mfi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f89 = f0 shr L = L, 1 } ;; { .mfi LDFPD f52, f53 = [BOFFSET], 2 * SIZE mov f97 = f0 adds L = -1, L } { .mfi nop __LINE__ mov f105 = f0 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET } ;; { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET mov f113 = f0 mov ar.lc = L } { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f121 = f0 cmp.eq p3, p0 = r0, r0 } ;; .align 32 .L032: { .mfb lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfb (p5) LDFD f6 = [C1], SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb (p5) LDFD f12 = [C2], SIZE (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA 
f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } { .mfb (p5) LDFD f7 = [C1], SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb (p5) LDFD f13 = [C2], SIZE (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f10 = [C1], SIZE (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb (p5) LDFD f14 = [C2], SIZE (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; { .mfi (p5) LDFD f11 = [C1], -3 * SIZE (p3) FMA f113 = f41, f62, f113 // A2 * B7 adds L = -1, L } { .mfb (p5) LDFD f15 = [C2], -3 * SIZE (p3) FMA f121 = f41, f63, f121 // A2 * B8 br.cloop.sptk.few .L032 } ;; .L038: { .mmf LDFD f16 = [C3], SIZE LDFD f20 = [C4], SIZE FMA f6 = ALPHA_R, f64, f6 } { .mmf nop __LINE__ nop __LINE__ FMA f12 = ALPHA_R, f72, f12 } ;; { .mmf LDFD f17 = [C3], SIZE LDFD f21 = [C4], SIZE FMA f7 = ALPHA_I, f64, f7 } { .mmf nop __LINE__ nop __LINE__ FMA f13 = ALPHA_I, f72, f13 } ;; { .mmf LDFD f18 = [C3], SIZE LDFD f22 = [C4], SIZE FMA f10 = ALPHA_R, f65, f10 } { .mmf nop __LINE__ nop __LINE__ FMA f14 = ALPHA_R, f73, f14 } ;; { .mmf LDFD f19 = [C3], - 3 * SIZE LDFD f23 = [C4], - 3 * SIZE FMA f11 = ALPHA_I, f65, f11 } { .mmf nop __LINE__ nop __LINE__ FMA f15 = ALPHA_I, f73, f15 } ;; { .mmf STFD [C1] = f6, SIZE STFD [C2] = f12, SIZE FMA f16 = ALPHA_R, f80, f16 } { .mmf LDFD f24 = [C5], SIZE LDFD f28 = [C6], SIZE FMA f20 = ALPHA_R, f88, f20 } ;; { .mmf STFD [C1] = f7, SIZE STFD [C2] = f13, SIZE FMA f17 = ALPHA_I, f80, f17 } { .mmf LDFD f25 = [C5], SIZE LDFD f29 = [C6], SIZE FMA f21 = ALPHA_I, f88, f21 } ;; { .mmf STFD [C1] = f10, SIZE STFD [C2] = f14, SIZE FMA f18 = ALPHA_R, f81, f18 } { .mmf LDFD f26 = [C5], SIZE LDFD f30 = [C6], SIZE FMA f22 = ALPHA_R, f89, f22 } ;; { .mmf STFD [C1] = f11, SIZE STFD [C2] = f15, SIZE FMA f19 = ALPHA_I, f81, f19 } { .mmf LDFD f27 = [C5], - 3 * SIZE LDFD f31 = [C6], - 3 * SIZE FMA f23 = ALPHA_I, f89, f23 } ;; { .mmf STFD [C3] = f16, SIZE STFD [C4] = f20, SIZE FMA f24 = ALPHA_R, f96, f24 } { .mmf LDFD f32 = [C7], SIZE LDFD f36 = [C8], SIZE FMA f28 = ALPHA_R, f104, f28 } ;; { .mmf STFD [C3] = f17, SIZE STFD [C4] = f21, SIZE FMA f25 = ALPHA_I, f96, f25 } { .mmf LDFD f33 = [C7], SIZE LDFD f37 = [C8], SIZE FMA f29 = ALPHA_I, f104, f29 } ;; { .mmf STFD [C3] = f18, SIZE STFD [C4] = f22, SIZE FMA f26 = ALPHA_R, f97, f26 } { .mmf LDFD f34 = [C7], SIZE LDFD f38 = [C8], SIZE FMA f30 = ALPHA_R, f105, f30 } ;; { .mmf STFD [C3] = f19, SIZE STFD [C4] = f23, SIZE FMA f27 = ALPHA_I, f97, f27 } { .mmf LDFD f35 = [C7], - 3 * SIZE LDFD f39 = [C8], - 3 * SIZE FMA f31 = ALPHA_I, f105, f31 } ;; { .mmf STFD [C5] = f24, SIZE STFD [C6] = f28, SIZE FMA f32 = ALPHA_R, f112, f32 } { .mmf nop __LINE__ nop __LINE__ FMA f36 = ALPHA_R, f120, f36 } ;; { .mmf STFD [C5] = f25, SIZE STFD [C6] = f29, SIZE FMA f33 = ALPHA_I, f112, f33 } { .mmf nop __LINE__ nop __LINE__ FMA f37 = ALPHA_I, f120, f37 } ;; { .mmf STFD [C5] = f26, SIZE STFD [C6] = f30, SIZE FMA f34 = ALPHA_R, f113, f34 } { .mmf nop __LINE__ nop __LINE__ FMA f38 = ALPHA_R, f121, f38 } ;; { .mmf STFD [C5] = f27, SIZE STFD [C6] = f31, SIZE FMA f35 = ALPHA_I, f113, f35 } { .mmf nop __LINE__ nop __LINE__ FMA f39 = ALPHA_I, f121, f39 } ;; { .mmf STFD [C7] = f32, SIZE STFD [C8] = f36, SIZE mov f64 = f0 } { .mmf nop __LINE__ nop __LINE__ mov f72 = f0 } ;; { .mmf STFD [C7] = f33, SIZE STFD [C8] = f37, SIZE mov f80 = f0 } { .mmf nop __LINE__ nop __LINE__ mov f88 = f0 } ;; { .mmf STFD [C7] = f34, SIZE STFD [C8] = f38, SIZE mov f96 = f0 } { .mmf nop __LINE__ nop 
__LINE__ mov f104 = f0 } ;; { .mmf STFD [C7] = f35, SIZE STFD [C8] = f39, SIZE mov f112 = f0 } { .mmf nop __LINE__ nop __LINE__ mov f120 = f0 } ;; .align 32 .L040: { .mib nop __LINE__ tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L049 } ;; { .mmi LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B adds L = 1, K } ;; { .mii LDFPD f50, f51 = [BOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi LDFPD f52, f53 = [BOFFSET], 2 * SIZE LDFD f32 = [AOFFSET], 1 * SIZE adds L = -1, L } ;; { .mmi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET cmp.eq p3, p0 = r0, r0 mov ar.lc = L } { .mmi LDFPD f54, f55 = [BOFFSET], 2 * SIZE adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET nop __LINE__ } ;; .align 32 .L042: { .mfb lfetch.nt1 [PREB], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfb (p12) cmp.ne p3, p0 = 0, L FMA f72 = f32, f49, f72 // A1 * B2 nop __LINE__ } ;; { .mfi (p3) LDFD f40 = [AOFFSET], 1 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mmf (p5) LDFD f6 = [C1], SIZE (p5) LDFD f10 = [C2], SIZE FMA f104 = f32, f53, f104 // A1 * B6 } ;; { .mfi (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mmf (p5) LDFD f7 = [C1], -SIZE (p5) LDFD f11 = [C2], -SIZE FMA f120 = f32, f55, f120 // A1 * B8 } ;; { .mmf (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 } { .mmf (p5) LDFD f12 = [C3], SIZE (p5) LDFD f14 = [C4], SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 } ;; { .mfi (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mmf (p5) LDFD f13 = [C3], -SIZE (p5) LDFD f15 = [C4], -SIZE (p3) FMA f88 = f40, f59, f88 // A1 * B4 } ;; { .mfi (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mmf (p5) LDFD f16 = [C5], SIZE (p5) LDFD f18 = [C6], SIZE (p3) FMA f104 = f40, f61, f104 // A1 * B6 } ;; { .mfi (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 adds L = -1, L } { .mmb (p5) LDFD f17 = [C5], -SIZE (p5) LDFD f19 = [C6], -SIZE nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } { .mmb (p5) LDFD f20 = [C7], SIZE (p5) LDFD f22 = [C8], SIZE br.cloop.sptk.few .L042 } ;; { .mmf LDFD f21 = [C7], -SIZE LDFD f23 = [C8], -SIZE FMA f6 = ALPHA_R, f64, f6 } { .mmf nop __LINE__ nop __LINE__ FMA f10 = ALPHA_R, f72, f10 } ;; { .mmf nop __LINE__ nop __LINE__ FMA f7 = ALPHA_I, f64, f7 } { .mmf nop __LINE__ nop __LINE__ FMA f11 = ALPHA_I, f72, f11 } ;; { .mmf nop __LINE__ nop __LINE__ FMA f12 = ALPHA_R, f80, f12 } { .mmf nop __LINE__ nop __LINE__ FMA f14 = ALPHA_R, f88, f14 } ;; { .mmf nop __LINE__ nop __LINE__ FMA f13 = ALPHA_I, f80, f13 } { .mmf nop __LINE__ nop __LINE__ FMA f15 = ALPHA_I, f88, f15 } ;; { .mmf STFD [C1 ] = f6, SIZE STFD [C2 ] = f10, SIZE FMA f16 = ALPHA_R, f96, f16 } { .mmf nop __LINE__ nop __LINE__ FMA f18 = ALPHA_R, f104, f18 } ;; { .mmf STFD [C1 ] = f7, SIZE STFD [C2 ] = f11, SIZE FMA f17 = ALPHA_I, f96, f17 } { .mmf nop __LINE__ nop __LINE__ FMA f19 = ALPHA_I, f104, f19 } ;; { .mmf STFD [C3 ] = f12, SIZE STFD [C4 ] = f14, SIZE FMA f20 = ALPHA_R, f112, f20 } { .mmf nop __LINE__ nop __LINE__ FMA f22 = ALPHA_R, f120, f22 } ;; { .mmf STFD [C3 ] = f13, SIZE STFD [C4 ] = 
f15, SIZE FMA f21 = ALPHA_I, f112, f21 } { .mmf nop __LINE__ nop __LINE__ FMA f23 = ALPHA_I, f120, f23 } ;; { .mmi STFD [C5 ] = f16, SIZE STFD [C6 ] = f18, SIZE nop __LINE__ } ;; { .mmi STFD [C5 ] = f17, SIZE STFD [C6 ] = f19, SIZE nop __LINE__ } ;; { .mmi STFD [C7 ] = f20, SIZE STFD [C8 ] = f22, SIZE nop __LINE__ } ;; { .mmi STFD [C7 ] = f21, SIZE STFD [C8 ] = f23, SIZE nop __LINE__ } ;; .align 32 .L049: { .mmi mov B = BOFFSET mov AOFFSET = A nop __LINE__ } ;; { .mmb nop __LINE__ cmp.lt p6, p0 = 0, J (p6) br.cond.dptk .L010 } ;; .align 32 .L050: { .mfi mov C1 = C mov f64 = f0 tbit.z p6, p0 = N, 2 } { .mfi add C2 = LDC, C mov f72 = f0 shr I = M, 3 } ;; { .mfi shladd C3 = LDC, 1, C mov f80 = f0 nop __LINE__ } { .mfb mov AOFFSET = A mov f88 = f0 (p6) br.cond.dpnt .L090 } ;; { .mfi cmp.eq p6, p7 = 0, I mov f65 = f0 nop __LINE__ } { .mfi shladd C4 = LDC, 1, C2 mov f73 = f0 nop __LINE__ } ;; { .mfi nop __LINE__ mov f81 = f0 nop __LINE__ } { .mfb shladd C = LDC, 2, C mov f89 = f0 (p6) br.cond.dpnt .L060 } ;; .align 32 .L052: { .mfb LDFPD f48, f49 = [B] mov f66 = f0 nop __LINE__ } { .mfb adds BOFFSET = 2 * SIZE, B mov f74 = f0 nop __LINE__ } ;; { .mfi LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f82 = f0 nop __LINE__ } { .mfi setf.d f84 = r0 mov f90 = f0 nop __LINE__ } ;; { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f67 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f75 = f0 adds L = 1, K } ;; { .mfi LDFPD f36, f37 = [AOFFSET], 2 * SIZE mov f83 = f0 tbit.z p12, p0 = L, 0 } { .mfi setf.d f91 = r0 mov f68 = f0 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } ;; { .mfi CPREFETCH [PREC], LDC mov f76 = f0 adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } { .mfi LDFPD f38, f39 = [AOFFSET], 2 * SIZE mov f92 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi CPREFETCH [PREC], LDC mov f69 = f0 shr L = L, 1 } { .mmf setf.d f77 = r0 setf.d f85 = r0 mov f93 = f0 } ;; { .mfi CPREFETCH [PREC], LDC mov f70 = f0 adds L = -1, L } { .mmf setf.d f78 = r0 setf.d f86 = r0 mov f94 = f0 } ;; { .mfi CPREFETCH [PREC] mov f71 = f0 mov ar.lc = L } { .mmf setf.d f79 = r0 setf.d f87 = r0 mov f95 = f0 } ;; .align 32 .L053: { .mfb lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 adds C9 = 4 * SIZE, C1 } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 adds C10 = 4 * SIZE, C2 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 adds C11 = 4 * SIZE, C3 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 adds C12 = 4 * SIZE, C4 } { .mfb nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 
nop __LINE__ } ;; { .mfb nop __LINE__ FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f76 = f36, f49, f76 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f84 = f36, f50, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f92 = f36, f51, f92 // A5 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = f37, f49, f77 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f85 = f37, f50, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f93 = f37, f51, f93 // A6 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f78 = f38, f49, f78 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f86 = f38, f50, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f94 = f38, f51, f94 // A7 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f79 = f39, f49, f79 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f87 = f39, f50, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f95 = f39, f51, f95 // A8 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f6 = [C1 ], SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb (p5) LDFD f7 = [C9 ], SIZE (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p5) LDFD f10 = [C1 ], SIZE (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb (p5) LDFD f11 = [C9 ], SIZE (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f12 = [C1 ], SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb (p5) LDFD f13 = [C9 ], SIZE (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p5) LDFD f14 = [C1 ], 5 * SIZE (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb (p5) LDFD f15 = [C9 ], 5 * SIZE (p3) FMA f91 = f43, f59, f91 // A4 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f16 = [C1 ], SIZE (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb (p5) LDFD f17 = [C9], SIZE (p3) FMA f76 = f44, f57, f76 // A5 * B2 nop __LINE__ } ;; { .mfb (p5) LDFD f18 = [C1 ], SIZE (p3) FMA f84 = f44, f58, f84 // A5 * B3 nop __LINE__ } { .mfb (p5) LDFD f19 = [C9], SIZE (p3) FMA f92 = f44, f59, f92 // A5 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f20 = [C1 ], SIZE (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb (p5) LDFD f21 = [C9], SIZE (p3) FMA f77 = f45, f57, f77 // A6 * B2 nop __LINE__ } ;; { .mfb (p5) LDFD f22 = [C1 ], -11 * SIZE (p3) FMA f85 = f45, f58, f85 // A6 * B3 nop __LINE__ } { .mfb (p5) LDFD f23 = [C9 ], -11 * SIZE (p3) FMA f93 = f45, f59, f93 // A6 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f24 = [C2 ], SIZE (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb (p5) LDFD f25 = [C10], SIZE 
(p3) FMA f78 = f46, f57, f78 // A7 * B2 nop __LINE__ } ;; { .mfb (p5) LDFD f26 = [C2 ], SIZE (p3) FMA f86 = f46, f58, f86 // A7 * B3 nop __LINE__ } { .mfb (p5) LDFD f27 = [C10], SIZE (p3) FMA f94 = f46, f59, f94 // A7 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f28 = [C2 ], SIZE (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } { .mfb (p5) LDFD f29 = [C10], SIZE (p3) FMA f79 = f47, f57, f79 // A8 * B2 nop __LINE__ } ;; { .mfi (p5) LDFD f30 = [C2 ], 5 * SIZE (p3) FMA f87 = f47, f58, f87 // A8 * B3 adds L = -1, L } { .mfb (p5) LDFD f31 = [C10], 5 * SIZE (p3) FMA f95 = f47, f59, f95 // A8 * B4 br.cloop.sptk.few .L053 } ;; .align 32 .L058: { .mmf LDFD f32 = [C2 ], SIZE LDFD f33 = [C10], SIZE FMA f6 = ALPHA_R, f64, f6 } { .mmf nop __LINE__ nop __LINE__ FMA f7 = ALPHA_R, f66, f7 } ;; { .mmf LDFD f34 = [C2 ], SIZE LDFD f35 = [C10], SIZE FMA f10 = ALPHA_I, f64, f10 } { .mmf nop __LINE__ nop __LINE__ FMA f11 = ALPHA_I, f66, f11 } ;; { .mmf LDFD f36 = [C2 ], SIZE LDFD f37 = [C10], SIZE FMA f12 = ALPHA_R, f65, f12 } { .mmf nop __LINE__ nop __LINE__ FMA f13 = ALPHA_R, f67, f13 } ;; { .mmf LDFD f38 = [C2 ], - 11 * SIZE LDFD f39 = [C10], - 11 * SIZE FMA f14 = ALPHA_I, f65, f14 } { .mmf nop __LINE__ nop __LINE__ FMA f15 = ALPHA_I, f67, f15 } ;; { .mmf STFD [C1 ] = f6, SIZE STFD [C9 ] = f7, SIZE FMA f16 = ALPHA_R, f68, f16 } { .mmf LDFD f48 = [C3 ], SIZE LDFD f49 = [C11], SIZE FMA f17 = ALPHA_R, f70, f17 } ;; { .mmf STFD [C1 ] = f10, SIZE STFD [C9 ] = f11, SIZE FMA f18 = ALPHA_I, f68, f18 } { .mmf LDFD f50 = [C3 ], SIZE LDFD f51 = [C11], SIZE FMA f19 = ALPHA_I, f70, f19 } ;; { .mmf STFD [C1 ] = f12, SIZE STFD [C9 ] = f13, SIZE FMA f20 = ALPHA_R, f69, f20 } { .mmf LDFD f52 = [C3 ], SIZE LDFD f53 = [C11], SIZE FMA f21 = ALPHA_R, f71, f21 } ;; { .mmf STFD [C1 ] = f14, 5 * SIZE STFD [C9 ] = f15, 5 * SIZE FMA f22 = ALPHA_I, f69, f22 } { .mmf LDFD f54 = [C3 ], 5 * SIZE LDFD f55 = [C11], 5 * SIZE FMA f23 = ALPHA_I, f71, f23 } ;; { .mmf STFD [C1 ] = f16, SIZE STFD [C9 ] = f17, SIZE FMA f24 = ALPHA_R, f72, f24 } { .mmf LDFD f40 = [C3 ], SIZE LDFD f41 = [C11], SIZE FMA f25 = ALPHA_R, f74, f25 } ;; { .mmf STFD [C1 ] = f18, SIZE STFD [C9 ] = f19, SIZE FMA f26 = ALPHA_I, f72, f26 } { .mmf LDFD f42 = [C3 ], SIZE LDFD f43 = [C11], SIZE FMA f27 = ALPHA_I, f74, f27 } ;; { .mmf STFD [C1 ] = f20, SIZE STFD [C9 ] = f21, SIZE FMA f28 = ALPHA_R, f73, f28 } { .mmf LDFD f44 = [C3 ], SIZE LDFD f45 = [C11], SIZE FMA f29 = ALPHA_R, f75, f29 } ;; { .mmf STFD [C1 ] = f22, 5 * SIZE STFD [C9 ] = f23, 5 * SIZE FMA f30 = ALPHA_I, f73, f30 } { .mmf LDFD f46 = [C3 ], - 11 * SIZE LDFD f56 = [C11], - 11 * SIZE FMA f31 = ALPHA_I, f75, f31 } ;; { .mmf STFD [C2 ] = f24, SIZE STFD [C10] = f25, SIZE FMA f32 = ALPHA_R, f76, f32 } { .mmf LDFD f57 = [C4 ], SIZE LDFD f58 = [C12], SIZE FMA f33 = ALPHA_R, f78, f33 } ;; { .mmf STFD [C2 ] = f26, SIZE STFD [C10] = f27, SIZE FMA f34 = ALPHA_I, f76, f34 } { .mmf LDFD f59 = [C4 ], SIZE LDFD f60 = [C12], SIZE FMA f35 = ALPHA_I, f78, f35 } ;; { .mmf STFD [C2 ] = f28, SIZE STFD [C10] = f29, SIZE FMA f36 = ALPHA_R, f77, f36 } { .mmf LDFD f61 = [C4 ], SIZE LDFD f62 = [C12], SIZE FMA f37 = ALPHA_R, f79, f37 } ;; { .mmf STFD [C2 ] = f30, 5 * SIZE STFD [C10] = f31, 5 * SIZE FMA f38 = ALPHA_I, f77, f38 } { .mmf LDFD f63 = [C4 ], 5 * SIZE LDFD f47 = [C12], 5 * SIZE FMA f39 = ALPHA_I, f79, f39 } ;; { .mmf STFD [C2 ] = f32, SIZE STFD [C10] = f33, SIZE FMA f48 = ALPHA_R, f80, f48 } { .mmf LDFD f64 = [C4 ], SIZE LDFD f65 = [C12], SIZE FMA f49 = ALPHA_R, f82, f49 } ;; { .mmf STFD [C2 ] = f34, SIZE STFD [C10] = f35, SIZE 
FMA f50 = ALPHA_I, f80, f50 } { .mmf LDFD f6 = [C4 ], SIZE LDFD f7 = [C12], SIZE FMA f51 = ALPHA_I, f82, f51 } ;; { .mmf STFD [C2 ] = f36, SIZE STFD [C10] = f37, SIZE FMA f52 = ALPHA_R, f81, f52 } { .mmf LDFD f10 = [C4 ], SIZE LDFD f11 = [C12], SIZE FMA f53 = ALPHA_R, f83, f53 } ;; { .mmf STFD [C2 ] = f38, 5 * SIZE STFD [C10] = f39, 5 * SIZE FMA f54 = ALPHA_I, f81, f54 } { .mmf LDFD f12 = [C4 ], - 11 * SIZE LDFD f13 = [C12], - 11 * SIZE FMA f55 = ALPHA_I, f83, f55 } ;; { .mmf STFD [C3 ] = f48, SIZE STFD [C11] = f49, SIZE FMA f40 = ALPHA_R, f84, f40 } { .mmf nop __LINE__ nop __LINE__ FMA f41 = ALPHA_R, f86, f41 } ;; { .mmf STFD [C3 ] = f50, SIZE STFD [C11] = f51, SIZE FMA f42 = ALPHA_I, f84, f42 } { .mmf nop __LINE__ nop __LINE__ FMA f43 = ALPHA_I, f86, f43 } ;; { .mmf STFD [C3 ] = f52, SIZE STFD [C11] = f53, SIZE FMA f44 = ALPHA_R, f85, f44 } { .mmf nop __LINE__ nop __LINE__ FMA f45 = ALPHA_R, f87, f45 } ;; { .mmf STFD [C3 ] = f54, 5 * SIZE STFD [C11] = f55, 5 * SIZE FMA f46 = ALPHA_I, f85, f46 } { .mmf nop __LINE__ nop __LINE__ FMA f56 = ALPHA_I, f87, f56 } ;; { .mmf STFD [C3 ] = f40, SIZE STFD [C11] = f41, SIZE FMA f57 = ALPHA_R, f88, f57 } { .mmf nop __LINE__ nop __LINE__ FMA f58 = ALPHA_R, f90, f58 } ;; { .mmf STFD [C3 ] = f42, SIZE STFD [C11] = f43, SIZE FMA f59 = ALPHA_I, f88, f59 } { .mmf nop __LINE__ nop __LINE__ FMA f60 = ALPHA_I, f90, f60 } ;; { .mmf STFD [C3 ] = f44, SIZE STFD [C11] = f45, SIZE FMA f61 = ALPHA_R, f89, f61 } { .mmf nop __LINE__ nop __LINE__ FMA f62 = ALPHA_R, f91, f62 } ;; { .mmf STFD [C3 ] = f46, 5 * SIZE STFD [C11] = f56, 5 * SIZE FMA f63 = ALPHA_I, f89, f63 } { .mmf nop __LINE__ nop __LINE__ FMA f47 = ALPHA_I, f91, f47 } ;; { .mmf STFD [C4 ] = f57, SIZE STFD [C12] = f58, SIZE FMA f64 = ALPHA_R, f92, f64 } { .mmf nop __LINE__ nop __LINE__ FMA f65 = ALPHA_R, f94, f65 } ;; { .mmf STFD [C4 ] = f59, SIZE STFD [C12] = f60, SIZE FMA f6 = ALPHA_I, f92, f6 } { .mmf nop __LINE__ nop __LINE__ FMA f7 = ALPHA_I, f94, f7 } ;; { .mmf STFD [C4 ] = f61, SIZE STFD [C12] = f62, SIZE FMA f10 = ALPHA_R, f93, f10 } { .mmf nop __LINE__ nop __LINE__ FMA f11 = ALPHA_R, f95, f11 } ;; { .mmf STFD [C4 ] = f63, 5 * SIZE STFD [C12] = f47, 5 * SIZE FMA f12 = ALPHA_I, f93, f12 } { .mmf nop __LINE__ nop __LINE__ FMA f13 = ALPHA_I, f95, f13 } ;; { .mmf STFD [C4 ] = f64, SIZE STFD [C12] = f65, SIZE mov f64 = f0 } { .mmf cmp.ne p6, p0 = 1, I nop __LINE__ mov f72 = f0 } ;; { .mmf STFD [C4 ] = f6, SIZE STFD [C12] = f7, SIZE mov f80 = f0 } { .mmf nop __LINE__ nop __LINE__ mov f88 = f0 } ;; { .mmf STFD [C4 ] = f10, SIZE STFD [C12] = f11, SIZE mov f65 = f0 } { .mmf nop __LINE__ nop __LINE__ mov f73 = f0 } ;; { .mmf STFD [C4 ] = f12, 5 * SIZE STFD [C12] = f13, 5 * SIZE mov f81 = f0 } { .mfb adds I = -1, I mov f89 = f0 (p6) br.cond.dptk .L052 } ;; .align 32 .L060: { .mfi nop __LINE__ mov f66 = f0 tbit.z p6, p7 = M, 2 } { .mfb nop __LINE__ mov f74 = f0 (p6) br.cond.dptk .L070 } ;; { .mfb LDFPD f48, f49 = [B] mov f82 = f0 nop __LINE__ } { .mfi adds BOFFSET = 2 * SIZE, B mov f90 = f0 adds L = 1, K } ;; { .mii LDFPD f32, f33 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f67 = f0 adds L = -1, L } { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET mov f75 = f0 nop __LINE__ } ;; { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 mov ar.lc = L } { .mfi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET mov f91 = f0 cmp.eq p3, p0 = r0, r0 } ;; .align 32 .L062: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 
= 0, L } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 (p5) adds C9 = 4 * SIZE, C1 } { .mfi nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 (p5) adds C10 = 4 * SIZE, C2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 (p5) adds C11 = 4 * SIZE, C3 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 (p5) adds C12 = 4 * SIZE, C4 } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 } { .mfb (p5) LDFD f6 = [C1 ], SIZE FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb (p5) LDFD f7 = [C9 ], SIZE FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p5) LDFD f10 = [C1 ], SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb (p5) LDFD f11 = [C9 ], SIZE (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f12 = [C1 ], SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p5) LDFD f13 = [C9], SIZE (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p5) LDFD f14 = [C1 ], - 3 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb (p5) LDFD f15 = [C9], - 3 * SIZE (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f16 = [C2 ], SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb (p5) LDFD f17 = [C10], SIZE (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p5) LDFD f18 = [C2 ], SIZE (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb (p5) LDFD f19 = [C10], SIZE (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f20 = [C2 ], SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb (p5) LDFD f21 = [C10], SIZE (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfi (p5) LDFD f22 = [C2 ], -3 * SIZE (p3) FMA f83 = f43, f58, f83 // A4 * B3 adds L = -1, L } { .mfb (p5) LDFD f23 = [C10], -3 * SIZE (p3) FMA f91 = f43, f59, f91 // A4 * B4 br.cloop.sptk.few .L062 } ;; { .mmf LDFD f24 = [C3 ], SIZE LDFD f25 = [C11], SIZE FMA f6 = ALPHA_R, f64, f6 } { .mmf nop __LINE__ nop __LINE__ FMA f7 = ALPHA_R, f66, f7 } ;; { .mmf LDFD f26 = [C3 ], SIZE LDFD f27 = [C11], SIZE FMA f10 = ALPHA_I, f64, f10 } { .mmf nop __LINE__ nop __LINE__ FMA f11 = ALPHA_I, f66, f11 } ;; { .mmf LDFD f28 = [C3 ], SIZE LDFD f29 = [C11], SIZE FMA f12 = ALPHA_R, f65, f12 } { .mmf nop __LINE__ nop __LINE__ FMA f13 = ALPHA_R, f67, f13 } ;; { .mmf LDFD f30 = [C3 ], - 3 * SIZE LDFD f31 = [C11], - 3 * SIZE FMA f14 = ALPHA_I, f65, f14 } { .mmf nop __LINE__ nop __LINE__ FMA f15 = ALPHA_I, f67, f15 } ;; { .mmf STFD [C1 ] = f6, SIZE STFD [C9 ] = f7, SIZE FMA f16 = 
ALPHA_R, f72, f16 } { .mmf LDFD f32 = [C4 ], SIZE LDFD f33 = [C12], SIZE FMA f17 = ALPHA_R, f74, f17 } ;; { .mmf STFD [C1 ] = f10, SIZE STFD [C9 ] = f11, SIZE FMA f18 = ALPHA_I, f72, f18 } { .mmf LDFD f34 = [C4 ], SIZE LDFD f35 = [C12], SIZE FMA f19 = ALPHA_I, f74, f19 } ;; { .mmf STFD [C1 ] = f12, SIZE STFD [C9 ] = f13, SIZE FMA f20 = ALPHA_R, f73, f20 } { .mmf LDFD f36 = [C4 ], SIZE LDFD f37 = [C12], SIZE FMA f21 = ALPHA_R, f75, f21 } ;; { .mmf STFD [C1 ] = f14, 5 * SIZE STFD [C9 ] = f15, 5 * SIZE FMA f22 = ALPHA_I, f73, f22 } { .mmf LDFD f38 = [C4 ], - 3 * SIZE LDFD f39 = [C12], - 3 * SIZE FMA f23 = ALPHA_I, f75, f23 } ;; { .mmf STFD [C2 ] = f16, SIZE STFD [C10] = f17, SIZE FMA f24 = ALPHA_R, f80, f24 } { .mmf nop __LINE__ nop __LINE__ FMA f25 = ALPHA_R, f82, f25 } ;; { .mmf STFD [C2 ] = f18, SIZE STFD [C10] = f19, SIZE FMA f26 = ALPHA_I, f80, f26 } { .mmf nop __LINE__ nop __LINE__ FMA f27 = ALPHA_I, f82, f27 } ;; { .mmf STFD [C2 ] = f20, SIZE STFD [C10] = f21, SIZE FMA f28 = ALPHA_R, f81, f28 } { .mmf nop __LINE__ nop __LINE__ FMA f29 = ALPHA_R, f83, f29 } ;; { .mmf STFD [C2 ] = f22, 5 * SIZE STFD [C10] = f23, 5 * SIZE FMA f30 = ALPHA_I, f81, f30 } { .mmf nop __LINE__ nop __LINE__ FMA f31 = ALPHA_I, f83, f31 } ;; { .mmf STFD [C3 ] = f24, SIZE STFD [C11] = f25, SIZE FMA f32 = ALPHA_R, f88, f32 } { .mmf nop __LINE__ nop __LINE__ FMA f33 = ALPHA_R, f90, f33 } ;; { .mmf STFD [C3 ] = f26, SIZE STFD [C11] = f27, SIZE FMA f34 = ALPHA_I, f88, f34 } { .mmf nop __LINE__ nop __LINE__ FMA f35 = ALPHA_I, f90, f35 } ;; { .mmf STFD [C3 ] = f28, SIZE STFD [C11] = f29, SIZE FMA f36 = ALPHA_R, f89, f36 } { .mmf nop __LINE__ nop __LINE__ FMA f37 = ALPHA_R, f91, f37 } ;; { .mmf STFD [C3 ] = f30, 5 * SIZE STFD [C11] = f31, 5 * SIZE FMA f38 = ALPHA_I, f89, f38 } { .mmf nop __LINE__ nop __LINE__ FMA f39 = ALPHA_I, f91, f39 } ;; { .mmf STFD [C4 ] = f32, SIZE STFD [C12] = f33, SIZE mov f64 = f0 } { .mmf nop __LINE__ nop __LINE__ mov f72 = f0 } ;; { .mmf STFD [C4 ] = f34, SIZE STFD [C12] = f35, SIZE mov f80 = f0 } { .mmf nop __LINE__ nop __LINE__ mov f88 = f0 } ;; { .mmf STFD [C4 ] = f36, SIZE STFD [C12] = f37, SIZE mov f81 = f0 } { .mmf nop __LINE__ nop __LINE__ mov f65 = f0 } ;; { .mmf STFD [C4 ] = f38, 5 * SIZE STFD [C12] = f39, 5 * SIZE mov f89 = f0 } { .mmf nop __LINE__ nop __LINE__ mov f73 = f0 } ;; .align 32 .L070: { .mib nop __LINE__ tbit.z p6,p7 = M, 1 (p6) br.cond.dptk .L080 } ;; { .mmi LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B adds L = 1, K } ;; { .mii cmp.eq p3, p0 = r0, r0 tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE adds L = -1, L adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } ;; { .mmi LDFPD f50, f51 = [BOFFSET], 2 * SIZE adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET mov ar.lc = L } ;; .align 32 .L072: { .mfb lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 } { .mmf (p5) LDFD f6 = [C1 ], SIZE (p5) LDFD f12 = [C2 ], SIZE FMA f89 = f33, f51, f89 // A2 * B4 } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 
* B1 nop __LINE__ } { .mmf (p5) LDFD f7 = [C1 ], SIZE (p5) LDFD f13 = [C2 ], SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mmf (p5) LDFD f10 = [C1 ], SIZE (p5) LDFD f14 = [C2 ], SIZE (p3) FMA f88 = f40, f59, f88 // A1 * B4 } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p5) LDFD f11 = [C1 ], - 3 * SIZE (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 adds L = -1, L } { .mfb (p5) LDFD f15 = [C2 ], - 3 * SIZE (p3) FMA f89 = f41, f59, f89 // A2 * B4 br.cloop.sptk.few .L072 } ;; { .mmf LDFD f16 = [C3], SIZE LDFD f20 = [C4], SIZE FMA f6 = ALPHA_R, f64, f6 } { .mmf nop __LINE__ nop __LINE__ FMA f12 = ALPHA_R, f72, f12 } ;; { .mmf LDFD f17 = [C3], SIZE LDFD f21 = [C4], SIZE FMA f7 = ALPHA_I, f64, f7 } { .mmf nop __LINE__ nop __LINE__ FMA f13 = ALPHA_I, f72, f13 } ;; { .mmf LDFD f18 = [C3], SIZE LDFD f22 = [C4], SIZE FMA f10 = ALPHA_R, f65, f10 } { .mmf nop __LINE__ nop __LINE__ FMA f14 = ALPHA_R, f73, f14 } ;; { .mmf LDFD f19 = [C3], - 3 * SIZE LDFD f23 = [C4], - 3 * SIZE FMA f11 = ALPHA_I, f65, f11 } { .mmf nop __LINE__ nop __LINE__ FMA f15 = ALPHA_I, f73, f15 } ;; { .mmf STFD [C1] = f6, SIZE STFD [C2] = f12, SIZE FMA f16 = ALPHA_R, f80, f16 } { .mmf nop __LINE__ nop __LINE__ FMA f20 = ALPHA_R, f88, f20 } ;; { .mmf STFD [C1] = f7, SIZE STFD [C2] = f13, SIZE FMA f17 = ALPHA_I, f80, f17 } { .mmf nop __LINE__ nop __LINE__ FMA f21 = ALPHA_I, f88, f21 } ;; { .mmf STFD [C1] = f10, SIZE STFD [C2] = f14, SIZE FMA f18 = ALPHA_R, f81, f18 } { .mmf nop __LINE__ nop __LINE__ FMA f22 = ALPHA_R, f89, f22 } ;; { .mmf STFD [C1] = f11, SIZE STFD [C2] = f15, SIZE FMA f19 = ALPHA_I, f81, f19 } { .mmf nop __LINE__ nop __LINE__ FMA f23 = ALPHA_I, f89, f23 } ;; { .mmf STFD [C3] = f16, SIZE STFD [C4] = f20, SIZE mov f64 = f0 } ;; { .mmf STFD [C3] = f17, SIZE STFD [C4] = f21, SIZE mov f72 = f0 } ;; { .mmf STFD [C3] = f18, SIZE STFD [C4] = f22, SIZE mov f80 = f0 } ;; { .mmf STFD [C3] = f19, SIZE STFD [C4] = f23, SIZE mov f88 = f0 } ;; .align 32 .L080: { .mib nop __LINE__ tbit.z p6,p7 = M, 0 (p6) br.cond.dptk .L089 } ;; { .mmi LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B adds L = 1, K } ;; { .mii LDFD f32 = [AOFFSET], 1 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi nop __LINE__ nop __LINE__ adds L = -1, L } ;; { .mmi LDFPD f50, f51 = [BOFFSET], 2 * SIZE cmp.eq p3, p0 = r0, r0 mov ar.lc = L } ;; .align 32 .L082: { .mfb cmp.ne p4, p5 = 0, L FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi (p12) cmp.ne p3, p0 = 0, L FMA f72 = f32, f49, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfb (p3) LDFD f40 = [AOFFSET], 1 * SIZE FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mmf (p5) LDFD f6 = [C1], SIZE (p5) LDFD f10 = [C2], SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 } ;; { .mmf (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 } { .mmf (p5) LDFD f7 = [C1], -SIZE (p5) LDFD f11 = [C2], -SIZE (p3) FMA f88 = f40, f59, f88 // A1 * B4 } ;; { .mib (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE adds L = -1, L br.cloop.sptk.few .L082 } ;; { .mmf LDFD f12 = [C3], SIZE LDFD f14 = [C4], 
SIZE FMA f6 = ALPHA_R, f64, f6 } { .mmf nop __LINE__ nop __LINE__ FMA f10 = ALPHA_R, f72, f10 } ;; { .mmf LDFD f13 = [C3], -SIZE LDFD f15 = [C4], -SIZE FMA f7 = ALPHA_I, f64, f7 } { .mmf nop __LINE__ nop __LINE__ FMA f11 = ALPHA_I, f72, f11 } ;; { .mmf nop __LINE__ nop __LINE__ FMA f12 = ALPHA_R, f80, f12 } { .mmf nop __LINE__ nop __LINE__ FMA f14 = ALPHA_R, f88, f14 } ;; { .mmf nop __LINE__ nop __LINE__ FMA f13 = ALPHA_I, f80, f13 } { .mmf nop __LINE__ nop __LINE__ FMA f15 = ALPHA_I, f88, f15 } ;; { .mmi STFD [C1] = f6, SIZE STFD [C2] = f10, SIZE nop __LINE__ } ;; { .mmi STFD [C1] = f7, SIZE STFD [C2] = f11, SIZE nop __LINE__ } ;; { .mmi STFD [C3] = f12, SIZE STFD [C4] = f14, SIZE nop __LINE__ } ;; { .mmi STFD [C3] = f13, SIZE STFD [C4] = f15, SIZE nop __LINE__ } ;; .align 32 .L089: { .mmi mov B = BOFFSET mov AOFFSET = A nop __LINE__ } ;; .align 16 .L090: { .mfi mov C1 = C mov f64 = f0 tbit.z p6, p0 = N, 1 } { .mfi add C2 = LDC, C mov f72 = f0 shr I = M, 3 } ;; { .mfi setf.d f66 = r0 mov f65 = f0 nop __LINE__ } { .mfb mov AOFFSET = A mov f73 = f0 (p6) br.cond.dpnt .L130 } ;; { .mfi nop __LINE__ mov f67 = f0 shladd C = LDC, 1, C } { .mfb cmp.eq p6, p7 = 0, I mov f74 = f0 (p6) br.cond.dpnt .L100 } ;; .align 32 .L092: { .mfb LDFPD f48, f49 = [B] mov f68 = f0 nop __LINE__ } { .mfb adds BOFFSET = 2 * SIZE, B mov f79 = f0 nop __LINE__ } ;; { .mfi LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f75 = f0 nop __LINE__ } ;; { .mfi adds PREC = CPREFETCHSIZE * SIZE, C1 mov f76 = f0 adds L = 1, K } ;; { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f69 = f0 tbit.z p12, p0 = L, 0 } { .mfi cmp.eq p3, p0 = r0, r0 mov f77 = f0 shr L = L, 1 } ;; { .mfi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET adds L = -1, L } { .mmf LDFPD f36, f37 = [AOFFSET], 2 * SIZE CPREFETCH [PREC], LDC mov f70 = f0 } ;; { .mfi LDFPD f38, f39 = [AOFFSET], 2 * SIZE mov f78 = f0 mov ar.lc = L } { .mfi CPREFETCH [PREC] mov f71 = f0 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } ;; .align 32 .L093: /* 1 */ { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 adds C9 = 4 * SIZE, C1 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 adds C10 = 4 * SIZE, C2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 adds C11 = 4 * SIZE, C3 } { .mfi nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 adds C12 = 4 * SIZE, C4 } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb (p5) LDFD f6 = [C1 ], SIZE FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb (p5) LDFD f7 = [C9 ], SIZE FMA f76 = f36, f49, f76 // A5 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb (p5) LDFD f10 = [C1 ], SIZE FMA f77 = f37, f49, f77 // A6 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb (p5) LDFD f11 = [C9 ], SIZE FMA f78 = f38, f49, f78 // A7 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb (p5) LDFD f12 = [C1 ], SIZE FMA f79 = f39, f49, f79 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, 
f64 // A1 * B1 nop __LINE__ } { .mfb (p5) LDFD f13 = [C9 ], SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p5) LDFD f14 = [C1 ], 5 * SIZE (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb (p5) LDFD f15 = [C9 ], 5 * SIZE (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p5) LDFD f16 = [C1 ], SIZE (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb (p5) LDFD f17 = [C9 ], SIZE (p3) FMA f76 = f44, f57, f76 // A5 * B2 nop __LINE__ } ;; { .mfb (p5) LDFD f18 = [C1 ], SIZE (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb (p5) LDFD f19 = [C9 ], SIZE (p3) FMA f77 = f45, f57, f77 // A6 * B2 nop __LINE__ } ;; { .mfb (p5) LDFD f20 = [C1 ], SIZE (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb (p5) LDFD f21 = [C9 ], SIZE (p3) FMA f78 = f46, f57, f78 // A7 * B2 nop __LINE__ } ;; { .mfi (p5) LDFD f22 = [C1 ], -11 * SIZE (p3) FMA f71 = f47, f56, f71 // A8 * B1 adds L = -1, L } { .mfb (p5) LDFD f23 = [C9 ], -11 * SIZE (p3) FMA f79 = f47, f57, f79 // A8 * B2 br.cloop.sptk.few .L093 } ;; { .mmf LDFD f24 = [C2 ], SIZE LDFD f25 = [C10], SIZE FMA f6 = ALPHA_R, f64, f6 } { .mmf nop __LINE__ nop __LINE__ FMA f7 = ALPHA_R, f66, f7 } ;; { .mmf LDFD f26 = [C2 ], SIZE LDFD f27 = [C10], SIZE FMA f10 = ALPHA_I, f64, f10 } { .mmf nop __LINE__ nop __LINE__ FMA f11 = ALPHA_I, f66, f11 } ;; { .mmf LDFD f28 = [C2 ], SIZE LDFD f29 = [C10], SIZE FMA f12 = ALPHA_R, f65, f12 } { .mmf nop __LINE__ nop __LINE__ FMA f13 = ALPHA_R, f67, f13 } ;; { .mmf LDFD f30 = [C2 ], 5 * SIZE LDFD f31 = [C10], 5 * SIZE FMA f14 = ALPHA_I, f65, f14 } { .mmf nop __LINE__ nop __LINE__ FMA f15 = ALPHA_I, f67, f15 } ;; { .mmf STFD [C1 ] = f6, SIZE STFD [C9 ] = f7, SIZE FMA f16 = ALPHA_R, f68, f16 } { .mmf LDFD f32 = [C2 ], SIZE LDFD f33 = [C10], SIZE FMA f17 = ALPHA_R, f70, f17 } ;; { .mmf STFD [C1 ] = f10, SIZE STFD [C9 ] = f11, SIZE FMA f18 = ALPHA_I, f68, f18 } { .mmf LDFD f34 = [C2 ], SIZE LDFD f35 = [C10], SIZE FMA f19 = ALPHA_I, f70, f19 } ;; { .mmf STFD [C1 ] = f12, SIZE STFD [C9 ] = f13, SIZE FMA f20 = ALPHA_R, f69, f20 } { .mmf LDFD f36 = [C2 ], SIZE LDFD f37 = [C10], SIZE FMA f21 = ALPHA_R, f71, f21 } ;; { .mmf STFD [C1 ] = f14, 5 * SIZE STFD [C9 ] = f15, 5 * SIZE FMA f22 = ALPHA_I, f69, f22 } { .mmf LDFD f38 = [C2 ], - 11 * SIZE LDFD f39 = [C10], - 11 * SIZE FMA f23 = ALPHA_I, f71, f23 } ;; { .mmf STFD [C1 ] = f16, SIZE STFD [C9 ] = f17, SIZE FMA f24 = ALPHA_R, f72, f24 } { .mmf nop __LINE__ nop __LINE__ FMA f25 = ALPHA_R, f74, f25 } ;; { .mmf STFD [C1 ] = f18, SIZE STFD [C9 ] = f19, SIZE FMA f26 = ALPHA_I, f72, f26 } { .mmf nop __LINE__ nop __LINE__ FMA f27 = ALPHA_I, f74, f27 } ;; { .mmf STFD [C1 ] = f20, SIZE STFD [C9 ] = f21, SIZE FMA f28 = ALPHA_R, f73, f28 } { .mmf nop __LINE__ nop __LINE__ FMA f29 = ALPHA_R, f75, f29 } ;; { .mmf STFD [C1 ] = f22, 5 * SIZE STFD [C9 ] = f23, 5 * SIZE FMA f30 = ALPHA_I, f73, f30 } { .mmf nop __LINE__ nop __LINE__ FMA f31 = ALPHA_I, f75, f31 } ;; { .mmf STFD [C2 ] = f24, SIZE STFD [C10] = f25, SIZE FMA f32 = ALPHA_R, f76, f32 } { .mmf nop __LINE__ nop __LINE__ FMA f33 = ALPHA_R, f78, f33 } ;; { .mmf STFD [C2 ] = f26, SIZE STFD [C10] 
= f27, SIZE FMA f34 = ALPHA_I, f76, f34 } { .mmf nop __LINE__ nop __LINE__ FMA f35 = ALPHA_I, f78, f35 } ;; { .mmf STFD [C2 ] = f28, SIZE STFD [C10] = f29, SIZE FMA f36 = ALPHA_R, f77, f36 } { .mmf nop __LINE__ nop __LINE__ FMA f37 = ALPHA_R, f79, f37 } ;; { .mmf STFD [C2 ] = f30, 5 * SIZE STFD [C10] = f31, 5 * SIZE FMA f38 = ALPHA_I, f77, f38 } { .mmf nop __LINE__ nop __LINE__ FMA f39 = ALPHA_I, f79, f39 } ;; { .mmf STFD [C2 ] = f32, SIZE STFD [C10] = f33, SIZE mov f64 = f0 } { .mmf cmp.ne p6, p0 = 1, I nop __LINE__ mov f72 = f0 } ;; { .mmf STFD [C2 ] = f34, SIZE STFD [C10] = f35, SIZE mov f65 = f0 } { .mmf nop __LINE__ nop __LINE__ mov f73 = f0 } ;; { .mmf STFD [C2 ] = f36, SIZE STFD [C10] = f37, SIZE mov f66 = f0 } { .mmf nop __LINE__ nop __LINE__ mov f74 = f0 } ;; { .mmf STFD [C2 ] = f38, 5 * SIZE STFD [C10] = f39, 5 * SIZE mov f67 = f0 } { .mfb adds I = -1, I mov f75 = f0 (p6) br.cond.dptk .L092 } ;; .align 32 .L100: { .mib nop __LINE__ tbit.z p6, p7 = M, 2 (p6) br.cond.dptk .L110 } ;; { .mmf LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B mov f75 = f0 } { .mii nop __LINE__ adds L = 1, K } ;; { .mii adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi LDFPD f32, f33 = [AOFFSET], 2 * SIZE nop __LINE__ adds L = -1, L } ;; { .mmi LDFPD f34, f35 = [AOFFSET], 2 * SIZE cmp.eq p3, p0 = r0, r0 mov ar.lc = L } ;; .align 32 .L102: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 adds C9 = 4 * SIZE, C1 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 adds C10 = 4 * SIZE, C2 } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb (p5) LDFD f6 = [C1 ], SIZE FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb (p5) LDFD f7 = [C9 ], SIZE FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p5) LDFD f10 = [C1 ], SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p5) LDFD f11 = [C9 ], SIZE (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb (p5) LDFD f12 = [C1], SIZE (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 adds L = -1, L } { .mfb (p5) LDFD f13 = [C9], SIZE (p3) FMA f75 = f43, f57, f75 // A4 * B2 br.cloop.sptk.few .L102 } ;; { .mmf LDFD f14 = [C1], - 3 * SIZE LDFD f15 = [C9], - 3 * SIZE FMA f6 = ALPHA_R, f64, f6 } { .mmf nop __LINE__ nop __LINE__ FMA f7 = ALPHA_R, f66, f7 } ;; { .mmf LDFD f16 = [C2 ], SIZE LDFD f17 = [C10], SIZE FMA f10 = ALPHA_I, f64, f10 } { .mmf nop __LINE__ nop __LINE__ FMA f11 = ALPHA_I, f66, f11 } ;; { .mmf LDFD f18 = [C2 ], SIZE LDFD f19 = [C10], SIZE FMA f12 = ALPHA_R, f65, f12 } { .mmf nop __LINE__ nop __LINE__ FMA f13 = ALPHA_R, f67, f13 } ;; { .mmf LDFD f20 = [C2 ], SIZE LDFD f21 = [C10], SIZE FMA f14 = ALPHA_I, f65, f14 } { .mmf nop __LINE__ nop __LINE__ FMA f15 = ALPHA_I, f67, f15 } ;; 
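// Write-back of the scaled C tile: each STFD pair stores an updated
// (real, imaginary) element while the remaining real accumulators are
// folded in as C_re += ALPHA_R * acc and C_im += ALPHA_I * acc.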
{ .mmf STFD [C1 ] = f6, SIZE STFD [C9 ] = f7, SIZE FMA f16 = ALPHA_R, f72, f16 } { .mmf LDFD f22 = [C2 ], - 3 * SIZE LDFD f23 = [C10], - 3 * SIZE FMA f17 = ALPHA_R, f74, f17 } ;; { .mmf STFD [C1 ] = f10, SIZE STFD [C9 ] = f11, SIZE FMA f18 = ALPHA_I, f72, f18 } { .mmf nop __LINE__ nop __LINE__ FMA f19 = ALPHA_I, f74, f19 } ;; { .mmf STFD [C1 ] = f12, SIZE STFD [C9 ] = f13, SIZE FMA f20 = ALPHA_R, f73, f20 } { .mmf nop __LINE__ nop __LINE__ FMA f21 = ALPHA_R, f75, f21 } ;; { .mmf STFD [C1 ] = f14, 5 * SIZE STFD [C9 ] = f15, 5 * SIZE FMA f22 = ALPHA_I, f73, f22 } { .mmf nop __LINE__ nop __LINE__ FMA f23 = ALPHA_I, f75, f23 } ;; { .mmf STFD [C2 ] = f16, SIZE STFD [C10] = f17, SIZE mov f64 = f0 } ;; { .mmf STFD [C2 ] = f18, SIZE STFD [C10] = f19, SIZE mov f65 = f0 } ;; { .mmf STFD [C2 ] = f20, SIZE STFD [C10] = f21, SIZE mov f72 = f0 } ;; { .mmf STFD [C2 ] = f22, 5 * SIZE STFD [C10] = f23, 5 * SIZE mov f73 = f0 } ;; .align 32 .L110: { .mib nop __LINE__ tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L120 } ;; { .mmi LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B adds L = 1, K } ;; { .mii adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi LDFPD f32, f33 = [AOFFSET], 2 * SIZE nop __LINE__ adds L = -1, L } ;; { .mmi cmp.eq p3, p0 = r0, r0 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET mov ar.lc = L } ;; .align 32 .L112: { .mfi lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mmf (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } { .mmf (p5) LDFD f6 = [C1 ], SIZE (p5) LDFD f7 = [C2 ], SIZE FMA f73 = f33, f49, f73 // A2 * B2 } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p5) LDFD f10 = [C1 ], SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 adds L = -1, L } { .mfb (p5) LDFD f11 = [C2 ], SIZE (p3) FMA f73 = f41, f57, f73 // A2 * B2 br.cloop.sptk.few .L112 } ;; { .mmf LDFD f12 = [C1], SIZE LDFD f13 = [C2], SIZE FMA f6 = ALPHA_R, f64, f6 } { .mmf nop __LINE__ nop __LINE__ FMA f7 = ALPHA_R, f72, f7 } ;; { .mmf LDFD f14 = [C1], - 3 * SIZE LDFD f15 = [C2], - 3 * SIZE FMA f10 = ALPHA_I, f64, f10 } { .mmf nop __LINE__ nop __LINE__ FMA f11 = ALPHA_I, f72, f11 } ;; { .mmf nop __LINE__ nop __LINE__ FMA f12 = ALPHA_R, f65, f12 } { .mmf nop __LINE__ nop __LINE__ FMA f13 = ALPHA_R, f73, f13 } ;; { .mmf nop __LINE__ nop __LINE__ FMA f14 = ALPHA_I, f65, f14 } { .mmf nop __LINE__ nop __LINE__ FMA f15 = ALPHA_I, f73, f15 } ;; { .mmf STFD [C1] = f6, SIZE STFD [C2] = f7, SIZE mov f64 = f0 } ;; { .mmf STFD [C1] = f10, SIZE STFD [C2] = f11, SIZE mov f72 = f0 } ;; { .mmf STFD [C1] = f12, SIZE STFD [C2] = f13, SIZE mov f65 = f0 } ;; { .mmf STFD [C1] = f14, SIZE STFD [C2] = f15, SIZE mov f73 = f0 } ;; .align 32 .L120: { .mib nop __LINE__ tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L129 } ;; { .mmi LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B adds L = 1, K } ;; { .mii nop __LINE__ tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi LDFD f32 = [AOFFSET], 1 * SIZE nop __LINE__ adds L = -1, L } ;; { .mmi cmp.eq p3, p0 = r0, r0 nop __LINE__ mov ar.lc = L } ;; .align 32 .L122: { .mfi FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L 
} ;; { .mmi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE (p3) LDFD f40 = [AOFFSET], 1 * SIZE nop __LINE__ } { .mmi (p5) LDFD f6 = [C1], SIZE (p5) LDFD f7 = [C2], SIZE } ;; { .mfi (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 adds L = -1, L } { .mfb (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 br.cloop.sptk.few .L122 } ;; .L128: { .mmf (p5) LDFD f10 = [C1], -SIZE (p5) LDFD f11 = [C2], -SIZE FMA f6 = ALPHA_R, f64, f6 } { .mmf nop __LINE__ nop __LINE__ FMA f7 = ALPHA_R, f72, f7 } ;; { .mmf nop __LINE__ nop __LINE__ FMA f10 = ALPHA_I, f64, f10 } { .mmf nop __LINE__ nop __LINE__ FMA f11 = ALPHA_I, f72, f11 } ;; { .mmi STFD [C1 ] = f6, SIZE STFD [C2 ] = f7, SIZE nop __LINE__ } ;; { .mmi STFD [C1 ] = f10, SIZE STFD [C2 ] = f11, SIZE nop __LINE__ } ;; .align 32 .L129: { .mmi mov B = BOFFSET mov AOFFSET = A nop __LINE__ } ;; .align 16 .L130: { .mfi nop __LINE__ mov f64 = f0 tbit.z p6, p0 = N, 0 } { .mib mov AOFFSET = A shr I = M, 3 (p6) br.cond.dpnt .L999 } ;; { .mfi mov C1 = C mov f65 = f0 nop __LINE__ } ;; { .mfi nop __LINE__ mov f66 = f0 nop __LINE__ } { .mfb cmp.eq p7, p0 = 0, I mov f67 = f0 (p7) br.cond.dpnt .L140 } ;; .align 32 .L132: { .mfb LDFD f48 = [B] mov f68 = f0 nop __LINE__ } { .mfi adds BOFFSET = 1 * SIZE, B mov f69 = f0 nop __LINE__ } ;; { .mfi LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f70 = f0 adds L = 1, K } ;; { .mii LDFPD f34, f35 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mfi LDFPD f36, f37 = [AOFFSET], 2 * SIZE mov f71 = f0 adds L = -1, L } ;; { .mmi LDFPD f38, f39 = [AOFFSET], 2 * SIZE adds PREC = CPREFETCHSIZE * SIZE, C1 cmp.eq p3, p0 = r0, r0 } ;; { .mmi CPREFETCH [PREC] adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET mov ar.lc = L } ;; .align 32 .L133: { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET FMA f65 = f33, f48, f65 // A2 * B1 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 adds C9 = 4 * SIZE, C1 } { .mmf (p3) LDFD f56 = [BOFFSET], 1 * SIZE (p5) LDFD f6 = [C1 ], SIZE FMA f67 = f35, f48, f67 // A4 * B1 } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb (p5) LDFD f7 = [C9 ], SIZE FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb (p5) LDFD f10 = [C1 ], SIZE FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p5) LDFD f11 = [C9 ], SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mmf (p4) LDFD f48 = [BOFFSET], 1 * SIZE (p5) LDFD f12 = [C1 ], SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb (p5) LDFD f13 = [C9 ], SIZE (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } ;; { .mfi (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f70 = f46, f56, f70 // A7 * B1 adds L = -1, L } { .mfb (p5) LDFD f14 = [C1 ], 5 * SIZE (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE nop __LINE__ nop __LINE__ } { .mfb (p5) LDFD f15 = [C9 ], 5 * SIZE nop __LINE__ br.cloop.sptk.few 
.L133 } ;; .L138: { .mmf LDFD f16 = [C1 ], SIZE LDFD f17 = [C9 ], SIZE FMA f6 = ALPHA_R, f64, f6 } { .mmf nop __LINE__ nop __LINE__ FMA f7 = ALPHA_R, f66, f7 } ;; { .mmf LDFD f18 = [C1 ], SIZE LDFD f19 = [C9 ], SIZE FMA f10 = ALPHA_I, f64, f10 } { .mmf nop __LINE__ nop __LINE__ FMA f11 = ALPHA_I, f66, f11 } ;; { .mmf LDFD f20 = [C1 ], SIZE LDFD f21 = [C9 ], SIZE FMA f12 = ALPHA_R, f65, f12 } { .mmf nop __LINE__ nop __LINE__ FMA f13 = ALPHA_R, f67, f13 } ;; { .mmf LDFD f22 = [C1 ], - 11 * SIZE LDFD f23 = [C9 ], - 11 * SIZE FMA f14 = ALPHA_I, f65, f14 } { .mmf nop __LINE__ nop __LINE__ FMA f15 = ALPHA_I, f67, f15 } ;; { .mmf STFD [C1 ] = f6, SIZE STFD [C9 ] = f7, SIZE FMA f16 = ALPHA_R, f68, f16 } { .mmf nop __LINE__ nop __LINE__ FMA f17 = ALPHA_R, f70, f17 } ;; { .mmf STFD [C1 ] = f10, SIZE STFD [C9 ] = f11, SIZE FMA f18 = ALPHA_I, f68, f18 } { .mmf nop __LINE__ nop __LINE__ FMA f19 = ALPHA_I, f70, f19 } ;; { .mmf STFD [C1 ] = f12, SIZE STFD [C9 ] = f13, SIZE FMA f20 = ALPHA_R, f69, f20 } { .mmf cmp.ne p6, p0 = 1, I adds I = -1, I FMA f21 = ALPHA_R, f71, f21 } ;; { .mmf STFD [C1 ] = f14, 5 * SIZE STFD [C9 ] = f15, 5 * SIZE FMA f22 = ALPHA_I, f69, f22 } { .mmf nop __LINE__ nop __LINE__ FMA f23 = ALPHA_I, f71, f23 } ;; { .mmf STFD [C1 ] = f16, SIZE STFD [C9 ] = f17, SIZE mov f64 = f0 } ;; { .mmf STFD [C1 ] = f18, SIZE STFD [C9 ] = f19, SIZE mov f65 = f0 } ;; { .mmf STFD [C1 ] = f20, SIZE STFD [C9 ] = f21, SIZE mov f66 = f0 } ;; { .mmf STFD [C1 ] = f22, 5 * SIZE STFD [C9 ] = f23, 5 * SIZE mov f67 = f0 } { .mmb nop __LINE__ nop __LINE__ (p6) br.cond.dptk .L132 } ;; .align 32 .L140: { .mib nop __LINE__ tbit.z p6, p7 = M, 2 (p6) br.cond.dptk .L150 } ;; { .mmi LDFD f48 = [B] adds BOFFSET = 1 * SIZE, B adds L = 1, K } ;; { .mii (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi LDFPD f34, f35 = [AOFFSET], 2 * SIZE adds L = -1, L nop __LINE__ } ;; { .mmi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 mov ar.lc = L } ;; .align 32 .L142: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f65 = f33, f48, f65 // A2 * B1 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 (p5) adds C9 = 4 * SIZE, C1 } { .mmf (p3) LDFD f56 = [BOFFSET], 1 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 } ;; { .mfi (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 (p5) adds C10 = 2 * SIZE, C2 } { .mmf (p5) LDFD f6 = [C1 ], SIZE (p5) LDFD f7 = [C9 ], SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 } ;; { .mmf (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p4) LDFD f48 = [BOFFSET], 1 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 } { .mmf (p5) LDFD f10 = [C1 ], SIZE (p5) LDFD f11 = [C9 ], SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 } ;; { .mfi (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE nop __LINE__ adds L = -1, L } { .mmb (p5) LDFD f12 = [C1 ], SIZE (p5) LDFD f13 = [C9 ], SIZE br.cloop.sptk.few .L142 } ;; .L148: { .mmf LDFD f14 = [C1 ], - 3 * SIZE LDFD f15 = [C9 ], - 3 * SIZE FMA f6 = ALPHA_R, f64, f6 } { .mmf nop __LINE__ nop __LINE__ FMA f7 = ALPHA_R, f66, f7 } ;; { .mmf nop __LINE__ nop __LINE__ FMA f10 = ALPHA_I, f64, f10 } { .mmf nop __LINE__ nop __LINE__ FMA f11 = ALPHA_I, f66, f11 } ;; { .mmf nop __LINE__ nop __LINE__ FMA f12 = ALPHA_R, f65, f12 } { .mmf nop __LINE__ nop __LINE__ FMA f13 = ALPHA_R, f67, f13 } ;; { .mmf nop __LINE__ nop __LINE__ FMA f14 = ALPHA_I, f65, f14 } { .mmf nop __LINE__ nop __LINE__ FMA 
f15 = ALPHA_I, f67, f15 } ;; { .mmf STFD [C1 ] = f6, SIZE STFD [C9 ] = f7, SIZE mov f64 = f0 } ;; { .mmf STFD [C1 ] = f10, SIZE STFD [C9 ] = f11, SIZE mov f65 = f0 } ;; { .mmf STFD [C1 ] = f12, SIZE STFD [C9 ] = f13, SIZE mov f66 = f0 } ;; { .mmf STFD [C1 ] = f14, 5 * SIZE STFD [C9 ] = f15, 5 * SIZE mov f67 = f0 } ;; .align 32 .L150: { .mib nop __LINE__ tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L160 } ;; { .mmi LDFD f48 = [B] adds BOFFSET = 1 * SIZE, B adds L = 1, K } ;; { .mii cmp.eq p3, p0 = r0, r0 tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mii LDFPD f32, f33 = [AOFFSET], 2 * SIZE adds L = -1, L ;; mov ar.lc = L } ;; .align 32 .L152: { .mfi cmp.ne p4, p5 = 0, L FMA f64 = f32, f48, f64 // A1 * B1 (p12) cmp.ne p3, p0 = 0, L } ;; { .mmf (p3) LDFD f56 = [BOFFSET], 1 * SIZE (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } ;; { .mfi (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 adds L = -1, L } ;; { .mfb (p4) LDFD f48 = [BOFFSET], 1 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 br.cloop.sptk.few .L152 } ;; .L158: LDFD f68 = [C1 ], 1 * SIZE ;; LDFD f69 = [C1 ], 1 * SIZE ;; LDFD f70 = [C1 ], 1 * SIZE ;; LDFD f71 = [C1 ], - 3 * SIZE ;; FMA f68 = ALPHA_R, f64, f68 FMA f69 = ALPHA_I, f64, f69 FMA f70 = ALPHA_R, f65, f70 FMA f71 = ALPHA_I, f65, f71 ;; STFD [C1 ] = f68, SIZE ;; STFD [C1 ] = f69, SIZE ;; STFD [C1 ] = f70, SIZE mov f64 = f0 ;; STFD [C1 ] = f71, SIZE mov f65 = f0 ;; .align 32 .L160: { .mib nop __LINE__ tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L169 } ;; { .mmi LDFD f48 = [B] adds BOFFSET = 1 * SIZE, B adds L = 1, K } ;; { .mii LDFD f32 = [AOFFSET], 1 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mii adds L = -1, L cmp.eq p3, p0 = r0, r0 ;; mov ar.lc = L } ;; .align 32 .L162: { .mmf cmp.ne p4, p5 = 0, L (p12) cmp.ne p3, p0 = 0, L FMA f64 = f32, f48, f64 // A1 * B1 } ;; { .mmi (p3) LDFD f56 = [BOFFSET], 1 * SIZE (p3) LDFD f40 = [AOFFSET], 1 * SIZE nop __LINE__ } ;; { .mmi (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p5) LDFD f68 = [C1], 1 * SIZE adds L = -1, L } ;; { .mmf (p4) LDFD f48 = [BOFFSET], 1 * SIZE (p5) LDFD f69 = [C1], - 1 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 } { .mib nop __LINE__ nop __LINE__ br.cloop.sptk.few .L162 } ;; FMA f68 = ALPHA_R, f64, f68 FMA f69 = ALPHA_I, f64, f69 ;; STFD [C1 ] = f68, SIZE ;; STFD [C1 ] = f69, SIZE ;; .align 32 .L169: { .mmi mov B = BOFFSET mov AOFFSET = A nop __LINE__ } ;; .align 16 .L999: mov r8 = r0 adds r9 = 1 * 16, SP ;; ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 ;; ldf.fill f22 = [SP], 32 ldf.fill f23 = [r9], 32 mov ar.lc = ARLC ;; ldf.fill f24 = [SP], 32 ldf.fill f25 = [r9], 32 mov pr = PR, -1 ;; ldf.fill f26 = [SP], 32 ldf.fill f27 = [r9], 32 mov ar.pfs = ARPFS ;; ldf.fill f28 = [SP], 32 ldf.fill f29 = [r9], 32 ;; ldf.fill f30 = [SP], 32 ldf.fill f31 = [r9] br.ret.sptk.many b0 EPILOGUE OpenBLAS-0.2.20/kernel/ia64/zgemm_beta.S000066400000000000000000000252131313527062700174300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCHSIZE 74 #define CO1 r14 #define CO2 r15 #define CO3 r16 #define DO1 r17 #define DO2 r18 #define DO3 r19 #define I r22 #define I_AND_15 r23 #define PRE1 r24 #define PR r30 #define ARLC r31 #define M r32 #define N r33 #define C r34 #define LDC r35 #define J r36 #define BETA_R f8 #define BETA_I f9 PROLOGUE .prologue PROFCODE { .mmi adds CO1 = 24, r12 adds CO2 = 32, r12 .save ar.lc, ARLC mov ARLC = ar.lc } { .mfb cmp.ge p6, p0 = 0, N fcmp.eq p0, p14 = BETA_R, f0 (p6) br.ret.sptk.many b0 } ;; .body { .mmi ld8 C = [CO1], 8 ld8 LDC = [CO2] mov PR = pr } { .mfi mov J = N fcmp.eq p0, p15 = BETA_I, f0 shr I = M, 3 } ;; { .mmb cmp.ge p6, p0 = 0, M adds I = -1, I (p6) br.ret.sptk.many b0 } ;; { .mbb shladd LDC = LDC, ZBASE_SHIFT, r0 (p14) br.cond.dpnt .L100 (p15) br.cond.dpnt .L100 } ;; .align 32 .L60: { .mmi mov CO1 = C mov CO3 = C add CO2 = 4 * SIZE, C } { .mmi adds PRE1 = PREFETCHSIZE * SIZE, C add C = C, LDC tbit.nz p12, p0 = M, 2 } ;; { .mmi and I_AND_15 = 15, M mov ar.lc = I } { .mib cmp.gt p8, p0 = 0, I (p8) br.cond.dpnt .L80 } ;; .align 32 .L70: { .mmi STFD [CO1] = f0, 1 * SIZE STFD [CO2] = f0, 1 * SIZE } { .mmi lfetch.excl.nt1 [PRE1], 16 * SIZE nop.m 0 } ;; { .mmi STFD [CO1] = f0, 1 * SIZE STFD [CO2] = f0, 1 * SIZE adds CO3 = 16 * SIZE, CO3 } ;; { .mmi STFD [CO1] = f0, 1 * SIZE STFD [CO2] = f0, 1 * SIZE } ;; { .mmi STFD [CO1] = f0, 5 * SIZE STFD [CO2] = f0, 5 * SIZE } ;; { .mmi STFD [CO1] = f0, 1 * SIZE STFD [CO2] = f0, 1 * SIZE } ;; { .mmi STFD [CO1] = f0, 1 * SIZE STFD [CO2] = f0, 1 * SIZE } ;; { .mmi STFD [CO1] = f0, 1 * SIZE STFD [CO2] = f0, 1 * SIZE } ;; { .mmb STFD [CO1] = f0, 5 * SIZE STFD [CO2] = f0, 5 * SIZE br.cloop.sptk.few .L70 } ;; .align 32 .L80: { .mmi (p12) STFD [CO1] = f0, 1 * SIZE (p12) STFD [CO2] = f0, 1 * SIZE tbit.nz p13, p0 = M, 1 } { .mmb cmp.eq p9, p0 = 0, I_AND_15 adds J = -1, J (p9) br.cond.dptk .L99 } ;; { .mmi (p12) STFD [CO1] = f0, 1 * SIZE (p12) STFD [CO2] = f0, 1 * SIZE tbit.nz 
p14, p0 = M, 0 } ;; { .mmi (p12) STFD [CO1] = f0, 1 * SIZE (p12) STFD [CO2] = f0, 1 * SIZE (p12) adds CO3 = 8 * SIZE, CO3 } ;; { .mmi (p12) STFD [CO1] = f0, 5 * SIZE (p12) STFD [CO2] = f0 (p13) adds CO3 = 4 * SIZE, CO3 } ;; { .mmi (p13) STFD [CO1] = f0, 1 * SIZE (p14) STFD [CO3] = f0, 1 * SIZE } ;; { .mmi (p13) STFD [CO1] = f0, 1 * SIZE (p14) STFD [CO3] = f0, 1 * SIZE } ;; { .mmi (p13) STFD [CO1] = f0, 1 * SIZE } ;; { .mmi (p13) STFD [CO1] = f0 } ;; .align 32 .L99: { .mib cmp.lt p6, p0 = 0, J mov ar.lc = ARLC } { .mbb (p6) br.cond.dptk .L60 br.ret.sptk.many b0 } ;; .align 32 .L100: { .mmi mov CO1 = C mov CO3 = C mov pr.rot = 0 } { .mmi adds PRE1 = PREFETCHSIZE * SIZE, C add CO2 = 4 * SIZE, C mov DO1 = C } ;; { .mmi mov ar.ec = 6 } { .mmi adds DO2 = 4 * SIZE, C mov DO3 = C add C = C, LDC } ;; { .mmi and I_AND_15 = 15, M cmp.eq p16, p0 = r0, r0 mov ar.lc = I } { .mib cmp.gt p8, p0 = 0, I tbit.nz p12, p0 = M, 2 (p8) br.cond.dpnt .L180 } ;; .align 32 .L170: { .mmf (p21) STFD [DO1] = f37, 1 * SIZE (p16) lfetch.excl.nt1 [PRE1], 16 * SIZE (p21) FNMA f61 = BETA_I, f67, f61 } { .mmf (p16) LDFD f32 = [CO1], 1 * SIZE (p16) adds CO2 = 16 * SIZE, CO2 (p21) FMPY f12 = BETA_I, f85 } ;; { .mfi (p21) STFD [DO1] = f43, 1 * SIZE (p21) FMA f67 = BETA_R, f67, f10 (p16) adds CO3 = 16 * SIZE, CO3 } { .mfi (p16) LDFD f38 = [CO1], 1 * SIZE (p21) FMPY f85 = BETA_R, f85 (p16) adds DO2 = 16 * SIZE, DO2 } ;; { .mfi (p21) STFD [DO1] = f49, 1 * SIZE (p21) FNMA f73 = BETA_I, f79, f73 (p16) adds DO3 = 16 * SIZE, DO3 } { .mfi (p16) LDFD f44 = [CO1], 1 * SIZE (p21) FMPY f13 = BETA_I, f97 nop.i 0 } ;; (p21) STFD [DO1] = f55, 1 * SIZE (p21) FMA f79 = BETA_R, f79, f11 (p16) LDFD f50 = [CO1], 1 * SIZE (p21) FMPY f97 = BETA_R, f97 ;; (p21) STFD [DO1] = f61, 1 * SIZE (p21) FNMA f85 = BETA_I, f91, f85 (p16) LDFD f56 = [CO1], 1 * SIZE (p21) FMPY f14 = BETA_I, f109 ;; (p21) STFD [DO1] = f67, 1 * SIZE (p21) FMA f91 = BETA_R, f91, f12 (p16) LDFD f62 = [CO1], 1 * SIZE (p21) FMPY f109 = BETA_R, f109 ;; (p21) STFD [DO1] = f73, 1 * SIZE (p21) FNMA f97 = BETA_I, f103, f97 (p16) LDFD f68 = [CO1], 1 * SIZE (p21) FMPY f15 = BETA_I, f121 ;; (p21) STFD [DO1] = f79, 1 * SIZE (p21) FMA f103 = BETA_R, f103, f13 (p16) LDFD f74 = [CO1], 1 * SIZE (p21) FMPY f121 = BETA_R, f121 ;; (p21) STFD [DO1] = f85, 1 * SIZE (p21) FNMA f109 = BETA_I, f115, f109 (p16) LDFD f80 = [CO1], 1 * SIZE (p20) FMPY f6 = BETA_I, f36 ;; (p21) STFD [DO1] = f91, 1 * SIZE (p21) FMA f115 = BETA_R, f115, f14 (p16) LDFD f86 = [CO1], 1 * SIZE (p20) FMPY f36 = BETA_R, f36 ;; (p21) STFD [DO1] = f97, 1 * SIZE (p21) FNMA f121 = BETA_I, f127, f121 (p16) LDFD f92 = [CO1], 1 * SIZE (p20) FMPY f7 = BETA_I, f48 ;; (p21) STFD [DO1] = f103, 1 * SIZE (p21) FMA f127 = BETA_R, f127, f15 (p16) LDFD f98 = [CO1], 1 * SIZE (p20) FMPY f48 = BETA_R, f48 ;; (p21) STFD [DO1] = f109, 1 * SIZE (p20) FNMA f36 = BETA_I, f42, f36 (p16) LDFD f104 = [CO1], 1 * SIZE (p20) FMPY f10 = BETA_I, f60 ;; (p21) STFD [DO1] = f115, 1 * SIZE (p20) FMA f42 = BETA_R, f42, f6 (p16) LDFD f110 = [CO1], 1 * SIZE (p20) FMPY f60 = BETA_R, f60 ;; (p21) STFD [DO1] = f121, 1 * SIZE (p20) FNMA f48 = BETA_I, f54, f48 (p16) LDFD f116 = [CO1], 1 * SIZE (p20) FMPY f11 = BETA_I, f72 ;; (p21) STFD [DO1] = f127, 1 * SIZE (p20) FMA f54 = BETA_R, f54, f7 (p16) LDFD f122 = [CO1], 1 * SIZE (p20) FMPY f72 = BETA_R, f72 br.ctop.sptk.few .L170 ;; .align 32 .L180: { .mmi (p12) LDFD f32 = [CO1], 1 * SIZE (p12) LDFD f36 = [CO2], 1 * SIZE tbit.nz p13, p0 = M, 1 } { .mmb cmp.eq p9, p0 = 0, I_AND_15 adds J = -1, J (p9) br.cond.dptk .L199 } ;; { .mmi 
(p12) LDFD f33 = [CO1], 1 * SIZE (p12) LDFD f37 = [CO2], 1 * SIZE tbit.nz p14, p0 = M, 0 } ;; { .mmi (p12) LDFD f34 = [CO1], 1 * SIZE (p12) LDFD f38 = [CO2], 1 * SIZE (p12) adds CO3 = 8 * SIZE, CO3 } ;; { .mmi (p12) LDFD f35 = [CO1], 5 * SIZE (p12) LDFD f39 = [CO2] (p13) adds CO3 = 4 * SIZE, CO3 } ;; { .mmi (p13) LDFD f40 = [CO1], 1 * SIZE (p14) LDFD f44 = [CO3], 1 * SIZE } ;; { .mmi (p13) LDFD f41 = [CO1], 1 * SIZE (p14) LDFD f45 = [CO3], 1 * SIZE } ;; { .mmf (p13) LDFD f42 = [CO1], 1 * SIZE } ;; { .mmf (p13) LDFD f43 = [CO1] } ;; (p12) FMPY f80 = BETA_I, f32 (p12) FMPY f32 = BETA_R, f32 (p12) FMPY f81 = BETA_I, f34 (p12) FMPY f34 = BETA_R, f34 (p12) FMPY f82 = BETA_I, f36 (p12) FMPY f36 = BETA_R, f36 (p12) FMPY f83 = BETA_I, f38 (p12) FMPY f38 = BETA_R, f38 ;; (p12) FNMA f32 = BETA_I, f33, f32 (p12) FMA f33 = BETA_R, f33, f80 (p12) FNMA f34 = BETA_I, f35, f34 (p12) FMA f35 = BETA_R, f35, f81 (p12) FNMA f36 = BETA_I, f37, f36 (p12) FMA f37 = BETA_R, f37, f82 (p12) FNMA f38 = BETA_I, f39, f38 (p12) FMA f39 = BETA_R, f39, f83 ;; (p13) FMPY f84 = BETA_I, f40 (p13) FMPY f40 = BETA_R, f40 (p13) FMPY f85 = BETA_I, f42 (p13) FMPY f42 = BETA_R, f42 (p14) FMPY f86 = BETA_I, f44 (p14) FMPY f44 = BETA_R, f44 ;; (p13) FNMA f40 = BETA_I, f41, f40 (p13) FMA f41 = BETA_R, f41, f84 (p13) FNMA f42 = BETA_I, f43, f42 (p13) FMA f43 = BETA_R, f43, f85 (p14) FNMA f44 = BETA_I, f45, f44 (p14) FMA f45 = BETA_R, f45, f86 ;; { .mmf (p12) STFD [DO1] = f32, 1 * SIZE (p12) STFD [DO2] = f36, 1 * SIZE } { .mmf (p12) adds DO3 = 8 * SIZE, DO3 } ;; { .mmf (p12) STFD [DO1] = f33, 1 * SIZE (p12) STFD [DO2] = f37, 1 * SIZE } { .mmf (p13) adds DO3 = 4 * SIZE, DO3 } ;; { .mmf (p12) STFD [DO1] = f34, 1 * SIZE (p12) STFD [DO2] = f38, 1 * SIZE } ;; { .mmf (p12) STFD [DO1] = f35, 5 * SIZE (p12) STFD [DO2] = f39 } ;; { .mmi (p13) STFD [DO1] = f40, 1 * SIZE (p14) STFD [DO3] = f44, 1 * SIZE } ;; { .mmi (p13) STFD [DO1] = f41, 1 * SIZE (p14) STFD [DO3] = f45, 1 * SIZE } ;; { .mmi (p13) STFD [DO1] = f42, 1 * SIZE ;; (p13) STFD [DO1] = f43 } ;; .align 32 .L199: { .mib cmp.lt p6, p0 = 0, J mov ar.lc = ARLC (p6) br.cond.dptk .L100 } ;; { .mib mov pr = PR, -1 br.ret.sptk.many b0 } ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/zgemm_kernel.S000066400000000000000000003253701313527062700200040ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef DOUBLE #define PREFETCHSIZE (16 * 8) #else #define PREFETCHSIZE (32 * 8) #endif #define CPREFETCHSIZE 7 #define CPREFETCH lfetch.excl.nt1 #define M r32 #define N r33 #define K r34 #define A r37 #define B r38 #define C r39 #define LDC r35 #define I r15 #define J r16 #define AOFFSET r17 #define BOFFSET r18 #define TEMP r19 #define L r20 #define C1 r21 #define C2 r22 #define C3 r23 #define C4 r24 #define C5 r25 #define C6 r26 #define C7 r27 #define C8 r28 #define PREA r8 #define PREB r9 #define PREC r10 #define SP r12 #define ARLC r29 #define PR r30 #define ARPFS r31 #define ALPHA_R f8 #define ALPHA_I f9 #define AORIG loc0 #define KK loc1 #define KK8 loc2 #define OFFSET loc3 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) #define FCALC_A FSUB #define FCALC_B FADD #define FMA_A FNMA #define FMA_B FMA #else #define FCALC_A FADD #define FCALC_B FSUB #define FMA_A FMA #define FMA_B FNMA #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NC) || defined(TC) || defined(NR) || defined(TR) #define FCALC_C FMA #define FCALC_D FNMA #else #define FCALC_C FNMA #define FCALC_D FMA #endif PROLOGUE .prologue PROFCODE { .mfi #ifdef TRMMKERNEL .save ar.pfs, ARPFS alloc ARPFS = ar.pfs, 8, 8, 0, 0 #else nop __LINE__ #endif mov f64 = f0 adds r14 = 16, SP } { .mfi nop __LINE__ mov f65 = f0 adds r15 = 24, SP } ;; { .mfi ld8 LDC = [r14] mov f81 = f0 mov PR = pr } { .mfi #ifdef TRMMKERNEL ld8 OFFSET = [r15] #else nop __LINE__ #endif mov f96 = f0 shr J = N, 2 } ;; { .mfi shladd LDC = LDC, ZBASE_SHIFT, r0 mov f97 = f0 mov AOFFSET = A } { .mfi nop __LINE__ mov f113 = f0 #if defined(TRMMKERNEL) && !defined(LEFT) sub KK = r0, OFFSET #endif } ;; .body { .mfi nop __LINE__ mov f80 = f0 mov ARLC = ar.lc } { .mfb cmp.ge p6, p0 = 0, J mov f112 = f0 (p6) br.cond.dpnt .L050 } ;; .align 16 .L010: { .mmi mov C1 = C // coffset1 = c + 0 * ldc add C2 = LDC, C // coffset2 = c + 1 * ldc shr I = M, 2 } { .mmi adds J = -1, J #if defined(TRMMKERNEL) && defined(LEFT) mov KK = OFFSET #else nop __LINE__ #endif nop __LINE__ } ;; { .mmi shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc #ifdef TRMMKERNEL shladd KK8 = KK, ZBASE_SHIFT, r0 #else nop __LINE__ #endif } { .mib cmp.eq p6, p7 = 0, I shladd C = LDC, 2, C // coffset += 8 * ldc (p6) br.cond.dpnt .L020 } ;; .align 16 .L011: #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mfb LDFPD f48, f49 = [B] mov f66 = f0 nop 
__LINE__ } { .mfb adds BOFFSET = 2 * SIZE, B mov f67 = f0 nop __LINE__ } ;; #else { .mfi shladd BOFFSET = KK8, 2, B mov f66 = f0 shladd AOFFSET = KK8, 2, AOFFSET } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 nop __LINE__ } ;; #endif { .mfi LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f82 = f0 #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 4, KK #else adds L = 4, KK #endif #endif } { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f98 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } { .mfi LDFPD f52, f53 = [BOFFSET], 2 * SIZE mov f99 = f0 adds C5 = 4 * SIZE, C1 } ;; { .mfi LDFPD f36, f37 = [AOFFSET], 2 * SIZE mov f114 = f0 tbit.z p12, p0 = L, 0 } { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f115 = f0 adds C6 = 4 * SIZE, C2 } ;; { .mfi LDFPD f38, f39 = [AOFFSET], 2 * SIZE mov f68 = f0 shr L = L, 1 } { .mfi setf.d f86 = r0 mov f69 = f0 adds C7 = 4 * SIZE, C3 } ;; { .mfi CPREFETCH [PREC], LDC mov f84 = f0 adds L = -1, L } { .mfi setf.d f87 = r0 mov f85 = f0 adds C8 = 4 * SIZE, C4 } ;; { .mfi CPREFETCH [PREC], LDC mov f100 = f0 mov ar.lc = L } { .mfi setf.d f102 = r0 mov f101 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi CPREFETCH [PREC], LDC mov f116 = f0 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET } { .mfi setf.d f103 = r0 mov f117 = f0 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } ;; { .mfi CPREFETCH [PREC] mov f70 = f0 nop __LINE__ } { .mmf setf.d f118 = r0 setf.d f119 = r0 mov f71 = f0 } ;; .align 16 .L012: /* 1 */ { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfb (p12) cmp.ne p3, p0 = 0, L FMA_B f65 = f32, f49, f65 // A1 * B2 nop __LINE__ } ;; /* 2 */ { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfb cmp.ne p4, p5 = 0, L FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;; /* 3 */ { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb FMA_B f97 = f32, f53, f97 // A1 * B6 nop __LINE__ } ;; /* 4 */ { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb FMA_B f113 = f32, f55, f113 // A1 * B8 nop __LINE__ } ;; /* 5 */ { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; /* 6 */ { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;; /* 7 */ { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb FMA_A f96 = f33, f53, f96 // A2 * B6 nop __LINE__ } ;; /* 8 */ { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb FMA_A f112 = f33, f55, f112 // A2 * B8 nop __LINE__ } ;; /* 9 */ { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb FMA_B f67 = f34, f49, f67 // A3 * B2 nop __LINE__ } ;; /* 10 */ { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb FMA_B f83 = f34, f51, f83 // A3 * B4 nop __LINE__ } ;; /* 11 */ { .mfb FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f99 = f34, f53, f99 // A3 * B6 nop __LINE__ } ;; /* 12 */ { .mfb 
FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f115 = f34, f55, f115 // A3 * B8 nop __LINE__ } ;; /* 13 */ { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 } { .mfb nop __LINE__ FMA_A f66 = f35, f49, f66 // A4 * B2 nop __LINE__ } ;; /* 14 */ { .mfb FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f82 = f35, f51, f82 // A4 * B4 nop __LINE__ } ;; /* 15 */ { .mfb FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f98 = f35, f53, f98 // A4 * B6 nop __LINE__ } ;; /* 16 */ { .mfb FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f114 = f35, f55, f114 // A4 * B8 nop __LINE__ } ;; /* 17 */ { .mfb nop __LINE__ FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f69 = f36, f49, f69 // A5 * B2 nop __LINE__ } ;; /* 18 */ { .mfb nop __LINE__ FMA f84 = f36, f50, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f85 = f36, f51, f85 // A5 * B4 nop __LINE__ } ;; /* 19 */ { .mfb nop __LINE__ FMA f100 = f36, f52, f100 // A5 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f101 = f36, f53, f101 // A5 * B6 nop __LINE__ } ;; /* 20 */ { .mfb nop __LINE__ FMA f116 = f36, f54, f116 // A5 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f117 = f36, f55, f117 // A5 * B8 nop __LINE__ } ;; /* 21 */ { .mfb nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f68 = f37, f49, f68 // A6 * B2 nop __LINE__ } ;; /* 22 */ { .mfb nop __LINE__ FMA f85 = f37, f50, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f84 = f37, f51, f84 // A6 * B4 nop __LINE__ } ;; /* 23 */ { .mfb nop __LINE__ FMA f101 = f37, f52, f101 // A6 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f100 = f37, f53, f100 // A6 * B6 nop __LINE__ } ;; /* 24 */ { .mfb nop __LINE__ FMA f117 = f37, f54, f117 // A6 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f116 = f37, f55, f116 // A6 * B8 nop __LINE__ } ;; /* 25 */ { .mfb nop __LINE__ FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f71 = f38, f49, f71 // A7 * B2 nop __LINE__ } ;; /* 26 */ { .mfb nop __LINE__ FMA f86 = f38, f50, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f87 = f38, f51, f87 // A7 * B4 nop __LINE__ } ;; /* 27 */ { .mfb nop __LINE__ FMA f102 = f38, f52, f102 // A7 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f103 = f38, f53, f103 // A7 * B6 nop __LINE__ } ;; /* 28 */ { .mfb nop __LINE__ FMA f118 = f38, f54, f118 // A7 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f119 = f38, f55, f119 // A7 * B8 nop __LINE__ } ;; /* 29 */ { .mfb nop __LINE__ FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f70 = f39, f49, f70 // A8 * B2 nop __LINE__ } ;; /* 30 */ { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f87 = f39, f50, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f86 = f39, f51, f86 // A8 * B4 nop __LINE__ } ;; /* 31 */ { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f103 = f39, f52, f103 // A8 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f102 = f39, f53, f102 // A8 * B6 nop __LINE__ } ;; /* 32 */ { .mfb nop __LINE__ FMA f119 = f39, f54, f119 // A8 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f118 = f39, f55, f118 // A8 * B8 nop __LINE__ } ;; /* 33 */ { .mfb nop __LINE__ (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; /* 34 */ { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) 
FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;; /* 35 */ { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 nop __LINE__ } ;; /* 36 */ { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 nop __LINE__ } ;; /* 37 */ { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; /* 38 */ { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;; /* 39 */ { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 nop __LINE__ } ;; /* 40 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f72 = [C1], SIZE #else nop __LINE__ #endif (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f76 = [C5], SIZE #else nop __LINE__ #endif (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 nop __LINE__ } ;; /* 41 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f73 = [C1], SIZE #else nop __LINE__ #endif (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f77 = [C5], SIZE #else nop __LINE__ #endif (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 nop __LINE__ } ;; /* 42 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f74 = [C1], SIZE #else nop __LINE__ #endif (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f78 = [C5], SIZE #else nop __LINE__ #endif (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 nop __LINE__ } ;; /* 43 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f75 = [C1], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f98 = f42, f60, f98 // A3 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f79 = [C5], -3 * SIZE #else nop __LINE__ #endif (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 nop __LINE__ } ;; /* 44 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f88 = [C2], SIZE #else nop __LINE__ #endif (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f92 = [C6], SIZE #else nop __LINE__ #endif (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 nop __LINE__ } ;; /* 45 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f89 = [C2], SIZE #else nop __LINE__ #endif (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f93 = [C6], SIZE #else nop __LINE__ #endif (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 nop __LINE__ } ;; /* 46 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f90 = [C2], SIZE #else nop __LINE__ #endif (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f94 = [C6], SIZE #else nop __LINE__ #endif (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 nop __LINE__ } ;; /* 47 */ { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f91 = [C2], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f95 = [C6], -3 * SIZE #else nop __LINE__ #endif (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 nop __LINE__ } ;; /* 48 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f104 = [C3], SIZE #else nop __LINE__ #endif (p3) FMA f115 = f43, f62, f115 // A4 * B7 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f108 = [C7], SIZE #else nop __LINE__ #endif (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 nop __LINE__ } ;; /* 49 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f105 = [C3], SIZE #else nop __LINE__ #endif (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f109 = [C7], SIZE #else nop __LINE__ #endif (p3) FMA_B f69 = f44, f57, f69 // A5 * B2 nop __LINE__ } ;; /* 50 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f106 = [C3], SIZE #else nop __LINE__ #endif (p3) FMA f84 = f44, f58, f84 // A5 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f110 = [C7], SIZE #else nop __LINE__ #endif (p3) FMA_B f85 = f44, f59, f85 // A5 * B4 nop __LINE__ } ;; /* 51 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f107 = [C3], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f100 = f44, f60, f100 // A5 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f111 = [C7], -3 * SIZE #else nop __LINE__ #endif (p3) FMA_B f101 = f44, f61, f101 // A5 * B6 nop __LINE__ } ;; /* 52 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f120 = [C4], SIZE #else nop __LINE__ #endif (p3) FMA f116 = f44, f62, f116 // A5 * B7 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f124 = [C8], SIZE #else nop __LINE__ #endif (p3) FMA_B f117 = f44, f63, f117 // A5 * B8 nop __LINE__ } ;; /* 53 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f121 = [C4], SIZE #else nop __LINE__ #endif (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f125 = [C8], SIZE #else nop __LINE__ #endif (p3) FMA_A f68 = f45, f57, f68 // A6 * B2 nop __LINE__ } ;; /* 54 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f122 = [C4], SIZE #else nop __LINE__ #endif (p3) FMA f85 = f45, f58, f85 // A6 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f126 = [C8], SIZE #else nop __LINE__ #endif (p3) FMA_A f84 = f45, f59, f84 // A6 * B4 nop __LINE__ } ;; /* 55 */ { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f123 = [C4], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f101 = f45, f60, f101 // A6 * B5 nop __LINE__ } { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f127 = [C8], -3 * SIZE #else nop __LINE__ #endif (p3) FMA_A f100 = f45, f61, f100 // A6 * B6 nop __LINE__ } ;; /* 56 */ { .mfb nop __LINE__ (p3) FMA f117 = f45, f62, f117 // A6 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f116 = f45, f63, f116 // A6 * B8 nop __LINE__ } ;; /* 57 */ { .mfb nop __LINE__ (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f71 = f46, f57, f71 // A7 * B2 nop __LINE__ } ;; /* 58 */ { .mfb nop __LINE__ (p3) FMA f86 = f46, f58, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f87 = f46, f59, f87 // A7 * B4 nop __LINE__ } ;; /* 59 */ { .mfb nop __LINE__ (p3) FMA f102 = f46, f60, f102 // A7 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f103 = f46, f61, f103 // A7 * B6 nop __LINE__ } ;; /* 60 */ { .mfb nop __LINE__ (p3) FMA f118 = f46, f62, f118 // A7 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f119 = f46, f63, f119 // A7 * B8 nop __LINE__ } ;; /* 61 */ { .mfb nop __LINE__ (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f70 = f47, f57, f70 // A8 * B2 nop __LINE__ } ;; /* 62 */ { .mfb nop __LINE__ (p3) FMA f87 = f47, f58, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f86 = f47, f59, f86 // A8 * B4 nop __LINE__ } ;; /* 63 */ { .mfb nop __LINE__ (p3) FMA f103 = f47, f60, f103 // A8 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f102 = f47, f61, f102 // A8 * B6 nop __LINE__ } ;; /* 64 */ { .mfi nop __LINE__ (p3) FMA f119 = f47, f62, f119 // A8 * B7 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f118 = f47, f63, f118 // A8 * B8 br.cloop.sptk.few .L012 } ;; #if! defined(TRMMKERNEL) && !defined(BETAZERO) { .mfb nop __LINE__ FMA f72 = ALPHA_R, f64, f72 nop __LINE__ } { .mfb nop __LINE__ FMA f76 = ALPHA_R, f68, f76 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_C f73 = ALPHA_R, f65, f73 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f77 = ALPHA_R, f69, f77 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f74 = ALPHA_R, f66, f74 nop __LINE__ } { .mfb nop __LINE__ FMA f78 = ALPHA_R, f70, f78 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_C f75 = ALPHA_R, f67, f75 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f79 = ALPHA_R, f71, f79 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f72 = ALPHA_I, f65, f72 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f76 = ALPHA_I, f69, f76 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f73 = ALPHA_I, f64, f73 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = ALPHA_I, f68, f77 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f74 = ALPHA_I, f67, f74 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f78 = ALPHA_I, f71, f78 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f75 = ALPHA_I, f66, f75 nop __LINE__ } { .mfb nop __LINE__ FMA f79 = ALPHA_I, f70, f79 nop __LINE__ } ;; { .mfb STFD [C1] = f72, SIZE FMA f88 = ALPHA_R, f80, f88 nop __LINE__ } { .mfb STFD [C5] = f76, SIZE FMA f92 = ALPHA_R, f84, f92 nop __LINE__ } ;; { .mfb STFD [C1] = f73, SIZE FCALC_C f89 = ALPHA_R, f81, f89 nop __LINE__ } { .mfb STFD [C5] = f77, SIZE FCALC_C f93 = ALPHA_R, f85, f93 nop __LINE__ } ;; { .mfb STFD [C1] = f74, SIZE FMA f90 = ALPHA_R, f82, f90 nop __LINE__ } { .mfb STFD [C5] = f78, SIZE FMA f94 = ALPHA_R, f86, f94 nop __LINE__ } ;; { .mfb STFD [C1] = f75, 5 * SIZE FCALC_C f91 = ALPHA_R, f83, f91 nop __LINE__ } { .mfb STFD [C5] = f79, 5 * SIZE FCALC_C f95 = ALPHA_R, f87, f95 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f88 = ALPHA_I, f81, f88 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f92 = ALPHA_I, f85, f92 nop __LINE__ } ;; { .mfb nop __LINE__ 
FMA f89 = ALPHA_I, f80, f89 nop __LINE__ } { .mfb nop __LINE__ FMA f93 = ALPHA_I, f84, f93 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f90 = ALPHA_I, f83, f90 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f94 = ALPHA_I, f87, f94 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f91 = ALPHA_I, f82, f91 nop __LINE__ } { .mfb nop __LINE__ FMA f95 = ALPHA_I, f86, f95 nop __LINE__ } ;; { .mfb STFD [C2] = f88, SIZE FMA f104 = ALPHA_R, f96, f104 nop __LINE__ } { .mfb STFD [C6] = f92, SIZE FMA f108 = ALPHA_R, f100, f108 nop __LINE__ } ;; { .mfb STFD [C2] = f89, SIZE FCALC_C f105 = ALPHA_R, f97, f105 nop __LINE__ } { .mfb STFD [C6] = f93, SIZE FCALC_C f109 = ALPHA_R, f101, f109 nop __LINE__ } ;; { .mfb STFD [C2] = f90, SIZE FMA f106 = ALPHA_R, f98, f106 nop __LINE__ } { .mfb STFD [C6] = f94, SIZE FMA f110 = ALPHA_R, f102, f110 nop __LINE__ } ;; { .mfb STFD [C2] = f91, 5 * SIZE FCALC_C f107 = ALPHA_R, f99, f107 nop __LINE__ } { .mfb STFD [C6] = f95, 5 * SIZE FCALC_C f111 = ALPHA_R, f103, f111 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f104 = ALPHA_I, f97, f104 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f108 = ALPHA_I, f101, f108 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f105 = ALPHA_I, f96, f105 nop __LINE__ } { .mfb nop __LINE__ FMA f109 = ALPHA_I, f100, f109 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f106 = ALPHA_I, f99, f106 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f110 = ALPHA_I, f103, f110 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f107 = ALPHA_I, f98, f107 nop __LINE__ } { .mfb nop __LINE__ FMA f111 = ALPHA_I, f102, f111 nop __LINE__ } ;; { .mfb STFD [C3] = f104, SIZE FMA f120 = ALPHA_R, f112, f120 nop __LINE__ } { .mfb STFD [C7] = f108, SIZE FMA f124 = ALPHA_R, f116, f124 nop __LINE__ } ;; { .mfb STFD [C3] = f105, SIZE FCALC_C f121 = ALPHA_R, f113, f121 nop __LINE__ } { .mfb STFD [C7] = f109, SIZE FCALC_C f125 = ALPHA_R, f117, f125 nop __LINE__ } ;; { .mfb STFD [C3] = f106, SIZE FMA f122 = ALPHA_R, f114, f122 nop __LINE__ } { .mfb STFD [C7] = f110, SIZE FMA f126 = ALPHA_R, f118, f126 nop __LINE__ } ;; { .mfb STFD [C3] = f107, 5 * SIZE FCALC_C f123 = ALPHA_R, f115, f123 nop __LINE__ } { .mfb STFD [C7] = f111, 5 * SIZE FCALC_C f127 = ALPHA_R, f119, f127 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f120 = ALPHA_I, f113, f120 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f124 = ALPHA_I, f117, f124 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f121 = ALPHA_I, f112, f121 nop __LINE__ } { .mfb nop __LINE__ FMA f125 = ALPHA_I, f116, f125 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f122 = ALPHA_I, f115, f122 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f126 = ALPHA_I, f119, f126 nop __LINE__ } ;; { .mfi nop __LINE__ FMA f123 = ALPHA_I, f114, f123 cmp.ne p6, p0 = 1, I } { .mfb nop __LINE__ FMA f127 = ALPHA_I, f118, f127 nop __LINE__ } ;; { .mfi STFD [C4] = f120, SIZE mov f64 = f0 adds I = -1, I } { .mfb STFD [C8] = f124, SIZE mov f65 = f0 nop __LINE__ } ;; { .mfi STFD [C4] = f121, SIZE mov f80 = f0 and TEMP = 3, M } { .mfb STFD [C8] = f125, SIZE mov f81 = f0 nop __LINE__ } ;; { .mfi STFD [C4] = f122, SIZE mov f96 = f0 cmp.ne p8, p9 = r0, TEMP } { .mfb STFD [C8] = f126, SIZE mov f97 = f0 nop __LINE__ } ;; { .mfb STFD [C4] = f123, 5 * SIZE mov f112 = f0 nop __LINE__ } { .mfb STFD [C8] = f127, 5 * SIZE mov f113 = f0 (p6) br.cond.dptk .L011 } ;; #else { .mfb nop __LINE__ FMPY f72 = ALPHA_R, f64 nop __LINE__ } { .mfb nop __LINE__ FMPY f76 = ALPHA_R, f68 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_C f73 = ALPHA_R, f65, f0 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f77 = ALPHA_R, f69, f0 nop __LINE__ } ;; { .mfb nop 
__LINE__ FMPY f74 = ALPHA_R, f66 nop __LINE__ } { .mfb nop __LINE__ FMPY f78 = ALPHA_R, f70 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_C f75 = ALPHA_R, f67, f0 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f79 = ALPHA_R, f71, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f72 = ALPHA_I, f65, f72 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f76 = ALPHA_I, f69, f76 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f73 = ALPHA_I, f64, f73 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = ALPHA_I, f68, f77 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f74 = ALPHA_I, f67, f74 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f78 = ALPHA_I, f71, f78 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f75 = ALPHA_I, f66, f75 nop __LINE__ } { .mfb nop __LINE__ FMA f79 = ALPHA_I, f70, f79 nop __LINE__ } ;; { .mfb STFD [C1] = f72, SIZE FMPY f88 = ALPHA_R, f80 nop __LINE__ } { .mfb STFD [C5] = f76, SIZE FMPY f92 = ALPHA_R, f84 nop __LINE__ } ;; { .mfb STFD [C1] = f73, SIZE FCALC_C f89 = ALPHA_R, f81, f0 nop __LINE__ } { .mfb STFD [C5] = f77, SIZE FCALC_C f93 = ALPHA_R, f85, f0 nop __LINE__ } ;; { .mfb STFD [C1] = f74, SIZE FMPY f90 = ALPHA_R, f82 nop __LINE__ } { .mfb STFD [C5] = f78, SIZE FMPY f94 = ALPHA_R, f86 nop __LINE__ } ;; { .mfb STFD [C1] = f75, 5 * SIZE FCALC_C f91 = ALPHA_R, f83, f0 nop __LINE__ } { .mfb STFD [C5] = f79, 5 * SIZE FCALC_C f95 = ALPHA_R, f87, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f88 = ALPHA_I, f81, f88 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f92 = ALPHA_I, f85, f92 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f89 = ALPHA_I, f80, f89 nop __LINE__ } { .mfb nop __LINE__ FMA f93 = ALPHA_I, f84, f93 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f90 = ALPHA_I, f83, f90 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f94 = ALPHA_I, f87, f94 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f91 = ALPHA_I, f82, f91 nop __LINE__ } { .mfb nop __LINE__ FMA f95 = ALPHA_I, f86, f95 nop __LINE__ } ;; { .mfb STFD [C2] = f88, SIZE FMPY f104 = ALPHA_R, f96 nop __LINE__ } { .mfb STFD [C6] = f92, SIZE FMPY f108 = ALPHA_R, f100 nop __LINE__ } ;; { .mfb STFD [C2] = f89, SIZE FCALC_C f105 = ALPHA_R, f97, f0 nop __LINE__ } { .mfb STFD [C6] = f93, SIZE FCALC_C f109 = ALPHA_R, f101, f0 nop __LINE__ } ;; { .mfb STFD [C2] = f90, SIZE FMPY f106 = ALPHA_R, f98 nop __LINE__ } { .mfb STFD [C6] = f94, SIZE FMPY f110 = ALPHA_R, f102 nop __LINE__ } ;; { .mfb STFD [C2] = f91, 5 * SIZE FCALC_C f107 = ALPHA_R, f99, f0 nop __LINE__ } { .mfb STFD [C6] = f95, 5 * SIZE FCALC_C f111 = ALPHA_R, f103, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f104 = ALPHA_I, f97, f104 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f108 = ALPHA_I, f101, f108 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f105 = ALPHA_I, f96, f105 nop __LINE__ } { .mfb nop __LINE__ FMA f109 = ALPHA_I, f100, f109 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f106 = ALPHA_I, f99, f106 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f110 = ALPHA_I, f103, f110 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f107 = ALPHA_I, f98, f107 nop __LINE__ } { .mfb nop __LINE__ FMA f111 = ALPHA_I, f102, f111 nop __LINE__ } ;; { .mfb STFD [C3] = f104, SIZE FMPY f120 = ALPHA_R, f112 nop __LINE__ } { .mfb STFD [C7] = f108, SIZE FMPY f124 = ALPHA_R, f116 nop __LINE__ } ;; { .mfb STFD [C3] = f105, SIZE FCALC_C f121 = ALPHA_R, f113, f0 nop __LINE__ } { .mfb STFD [C7] = f109, SIZE FCALC_C f125 = ALPHA_R, f117, f0 nop __LINE__ } ;; { .mfb STFD [C3] = f106, SIZE FMPY f122 = ALPHA_R, f114 nop __LINE__ } { .mfb STFD [C7] = f110, SIZE FMPY f126 = ALPHA_R, f118 nop __LINE__ } ;; { .mfb STFD [C3] = f107, 5 * SIZE FCALC_C f123 = 
ALPHA_R, f115, f0 nop __LINE__ } { .mfb STFD [C7] = f111, 5 * SIZE FCALC_C f127 = ALPHA_R, f119, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f120 = ALPHA_I, f113, f120 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f124 = ALPHA_I, f117, f124 nop __LINE__ } ;; { .mfi nop __LINE__ FMA f121 = ALPHA_I, f112, f121 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfb nop __LINE__ FMA f125 = ALPHA_I, f116, f125 nop __LINE__ } ;; { .mfi nop __LINE__ FCALC_D f122 = ALPHA_I, f115, f122 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -4, L #else nop __LINE__ #endif } { .mfi nop __LINE__ FCALC_D f126 = ALPHA_I, f119, f126 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -4, L #else nop __LINE__ #endif } ;; { .mfi nop __LINE__ FMA f123 = ALPHA_I, f114, f123 cmp.ne p6, p0 = 1, I } { .mfi nop __LINE__ FMA f127 = ALPHA_I, f118, f127 adds I = -1, I } ;; { .mfi STFD [C4] = f120, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, ZBASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfi STFD [C8] = f124, SIZE mov f65 = f0 and TEMP = 3, M } ;; { .mfi STFD [C4] = f121, SIZE mov f80 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 2, AOFFSET #else nop __LINE__ #endif } { .mfi STFD [C8] = f125, SIZE mov f81 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 2, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C4] = f122, SIZE mov f96 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 4, KK #else nop __LINE__ #endif } { .mfi STFD [C8] = f126, SIZE mov f97 = f0 cmp.ne p8, p9 = r0, TEMP } ;; { .mfi STFD [C4] = f123, 5 * SIZE mov f112 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, ZBASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C8] = f127, 5 * SIZE mov f113 = f0 (p6) br.cond.dptk .L011 } ;; #endif .L020: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 2, KK #else adds L = 4, KK #endif #endif tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L030 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mfb LDFPD f48, f49 = [B] mov f66 = f0 nop __LINE__ } { .mfi adds BOFFSET = 2 * SIZE, B mov f67 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #else { .mfi shladd BOFFSET = KK8, 2, B mov f66 = f0 shladd AOFFSET = KK8, 1, AOFFSET } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #endif { .mfi LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f82 = f0 tbit.z p12, p0 = L, 0 } { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 shr L = L, 1 } ;; { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f98 = f0 adds L = -1, L } { .mfi LDFPD f52, f53 = [BOFFSET], 2 * SIZE mov f99 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f114 = f0 mov ar.lc = L } { .mfi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET mov f115 = f0 nop __LINE__ } ;; .align 16 .L022: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 
* B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f97 = f32, f53, f97 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f32, f55, f113 // A1 * B8 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f96 = f33, f53, f96 // A2 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f112 = f33, f55, f112 // A2 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f67 = f34, f49, f67 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f83 = f34, f51, f83 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f99 = f34, f53, f99 // A3 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f115 = f34, f55, f115 // A3 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f66 = f35, f49, f66 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f82 = f35, f51, f82 // A4 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f98 = f35, f53, f98 // A4 * B6 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f114 = f35, f55, f114 // A4 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f72 = [C1], SIZE #else nop __LINE__ #endif (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f88 = [C2], SIZE #else nop __LINE__ #endif (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f73 = [C1], SIZE #else nop __LINE__ #endif (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f89 = [C2], SIZE #else nop __LINE__ #endif (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f74 = [C1], SIZE #else nop __LINE__ #endif (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f90 = [C2], SIZE #else nop __LINE__ #endif (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f75 = [C1], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f91 = [C2], -3 * SIZE #else nop __LINE__ #endif (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f104 = [C3], SIZE #else nop __LINE__ #endif (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f120 = [C4], SIZE #else nop __LINE__ #endif (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f105 = [C3], SIZE #else nop __LINE__ #endif (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f121 = [C4], SIZE #else nop __LINE__ #endif (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f106 = [C3], SIZE #else nop __LINE__ #endif (p3) FMA f98 = f42, f60, f98 // A3 * B5 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f122 = [C4], SIZE #else nop __LINE__ #endif (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f107 = [C3], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f123 = [C4], -3 * SIZE #else nop __LINE__ #endif (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f115 = f43, f62, f115 // A4 * B7 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 br.cloop.sptk.few .L022 } ;; #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) { .mfb nop __LINE__ FMA f72 = ALPHA_R, f64, f72 nop __LINE__ } { .mfb nop __LINE__ FMA f88 = ALPHA_R, f80, f88 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_C f73 = ALPHA_R, f65, f73 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f89 = ALPHA_R, f81, f89 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f74 = ALPHA_R, f66, f74 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = ALPHA_R, f82, f90 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_C f75 = ALPHA_R, f67, f75 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f91 = ALPHA_R, f83, f91 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f72 = ALPHA_I, f65, f72 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f88 = ALPHA_I, f81, f88 nop __LINE__ } { .mfb nop __LINE__ FMA f73 = ALPHA_I, f64, f73 nop __LINE__ } { .mfb FMA f89 = ALPHA_I, f80, f89 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f74 = ALPHA_I, f67, f74 nop __LINE__ } { .mfb FCALC_D f90 = ALPHA_I, f83, f90 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = ALPHA_I, f66, f75 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = ALPHA_I, f82, f91 nop __LINE__ } ;; { .mfb STFD [C1] = f72, SIZE FMA f104 = ALPHA_R, f96, f104 nop __LINE__ } { .mfb STFD [C2] = f88, SIZE FMA f120 = ALPHA_R, f112, f120 nop __LINE__ } ;; { .mfb STFD [C1] = f73, SIZE FCALC_C f105 = ALPHA_R, f97, f105 nop __LINE__ } { .mfb STFD [C2] = f89, SIZE FCALC_C f121 = ALPHA_R, f113, f121 nop __LINE__ } ;; { .mfb STFD [C1] = f74, SIZE FMA f106 = ALPHA_R, f98, f106 nop __LINE__ } { .mfb STFD [C2] = f90, SIZE FMA f122 = ALPHA_R, f114, f122 nop __LINE__ } ;; { .mfb STFD [C1] = f75, SIZE FCALC_C f107 = ALPHA_R, f99, f107 nop __LINE__ } { .mfb STFD [C2] = f91, SIZE FCALC_C f123 = ALPHA_R, f115, f123 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f104 = ALPHA_I, f97, f104 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f120 = ALPHA_I, f113, f120 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f105 = ALPHA_I, f96, f105 nop __LINE__ } { .mfb nop __LINE__ FMA f121 = ALPHA_I, f112, f121 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f106 = ALPHA_I, f99, f106 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f122 = ALPHA_I, f115, f122 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f107 = ALPHA_I, f98, f107 nop __LINE__ } { .mfb nop __LINE__ FMA f123 = ALPHA_I, f114, f123 nop __LINE__ } ;; { .mfb STFD [C3] = f104, SIZE mov f64 = f0 nop __LINE__ } { .mfi STFD [C4] = f120, SIZE mov f65 = f0 } ;; { .mfb STFD [C3] = f105, SIZE mov f80 = f0 nop __LINE__ } { .mfi STFD [C4] = f121, SIZE mov f81 = f0 } ;; { .mfb STFD [C3] = f106, SIZE mov f96 = f0 nop __LINE__ } { .mfi STFD [C4] = f122, SIZE mov f97 = f0 } ;; { .mfi STFD [C3] = f107, SIZE mov f112 = f0 } { .mfb STFD [C4] = f123, SIZE mov f113 = f0 nop __LINE__ } ;; #else { .mfb nop __LINE__ FMPY f72 = ALPHA_R, f64 nop __LINE__ } { .mfb nop __LINE__ FMPY f88 = ALPHA_R, f80 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_C f73 = ALPHA_R, f65, f0 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f89 = ALPHA_R, f81, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FMPY f74 = ALPHA_R, f66 nop __LINE__ } { .mfb nop __LINE__ FMPY f90 = ALPHA_R, f82 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_C f75 = ALPHA_R, f67, f0 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f91 = ALPHA_R, f83, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f72 = ALPHA_I, f65, f72 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f88 = ALPHA_I, f81, f88 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f73 = ALPHA_I, f64, f73 nop __LINE__ } { .mfb FMA f89 = ALPHA_I, f80, f89 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f74 = ALPHA_I, f67, f74 nop __LINE__ } { .mfb FCALC_D f90 = ALPHA_I, f83, f90 nop 
__LINE__ } ;; { .mfb nop __LINE__ FMA f75 = ALPHA_I, f66, f75 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = ALPHA_I, f82, f91 nop __LINE__ } ;; { .mfb STFD [C1] = f72, SIZE FMPY f104 = ALPHA_R, f96 nop __LINE__ } { .mfb STFD [C2] = f88, SIZE FMPY f120 = ALPHA_R, f112 nop __LINE__ } ;; { .mfb STFD [C1] = f73, SIZE FCALC_C f105 = ALPHA_R, f97, f0 nop __LINE__ } { .mfb STFD [C2] = f89, SIZE FCALC_C f121 = ALPHA_R, f113, f0 nop __LINE__ } ;; { .mfb STFD [C1] = f74, SIZE FMPY f106 = ALPHA_R, f98 nop __LINE__ } { .mfb STFD [C2] = f90, SIZE FMPY f122 = ALPHA_R, f114 nop __LINE__ } ;; { .mfb STFD [C1] = f75, SIZE FCALC_C f107 = ALPHA_R, f99, f0 nop __LINE__ } { .mfb STFD [C2] = f91, SIZE FCALC_C f123 = ALPHA_R, f115, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f104 = ALPHA_I, f97, f104 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f120 = ALPHA_I, f113, f120 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f105 = ALPHA_I, f96, f105 nop __LINE__ } { .mfb nop __LINE__ FMA f121 = ALPHA_I, f112, f121 nop __LINE__ } ;; { .mfi nop __LINE__ FCALC_D f106 = ALPHA_I, f99, f106 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfb nop __LINE__ FCALC_D f122 = ALPHA_I, f115, f122 nop __LINE__ } ;; { .mfi nop __LINE__ FMA f107 = ALPHA_I, f98, f107 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -2, L #else nop __LINE__ #endif } { .mfi nop __LINE__ FMA f123 = ALPHA_I, f114, f123 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -4, L #else nop __LINE__ #endif } ;; { .mfi STFD [C3] = f104, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, ZBASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfi STFD [C4] = f120, SIZE mov f65 = f0 nop __LINE__ } ;; { .mfi STFD [C3] = f105, SIZE mov f80 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 1, AOFFSET #else nop __LINE__ #endif } { .mfi STFD [C4] = f121, SIZE mov f81 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 2, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C3] = f106, SIZE mov f96 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 2, KK #else nop __LINE__ #endif } { .mfi STFD [C4] = f122, SIZE mov f97 = f0 nop __LINE__ } ;; { .mfi STFD [C3] = f107, SIZE mov f112 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, ZBASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C4] = f123, SIZE mov f113 = f0 nop __LINE__ } ;; #endif .align 16 .L030: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 1, KK #else adds L = 4, KK #endif #endif tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L049 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mfb LDFPD f48, f49 = [B] mov f72 = f0 nop __LINE__ } { .mfi adds BOFFSET = 2 * SIZE, B mov f73 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #else { .mfi shladd BOFFSET = KK8, 2, B mov f72 = f0 add AOFFSET = KK8, AOFFSET } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f73 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #endif { .mmi nop __LINE__ adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET tbit.z p12, p0 = 
L, 0 } ;; { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f88 = f0 shr L = L, 1 } { .mfi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f89 = f0 nop __LINE__ } ;; { .mfi LDFPD f52, f53 = [BOFFSET], 2 * SIZE mov f104 = f0 adds L = -1, L } { .mfb adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET mov f105 = f0 nop __LINE__ } ;; { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f120 = f0 mov ar.lc = L } { .mfi cmp.eq p3, p0 = r0, r0 mov f121 = f0 nop __LINE__ } ;; .align 16 .L032: { .mfb lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f97 = f32, f53, f97 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f32, f55, f113 // A1 * B8 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f96 = f33, f53, f96 // A2 * B6 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f112 = f33, f55, f112 // A2 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f72 = [C1], SIZE #else nop __LINE__ #endif (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f88 = [C2], SIZE #else nop __LINE__ #endif (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f73 = [C1], - SIZE #else nop __LINE__ #endif (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f89 = [C2], - SIZE #else nop __LINE__ #endif (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f104 = [C3], SIZE #else nop __LINE__ #endif (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f120 = [C4], SIZE #else nop __LINE__ #endif (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 nop __LINE__ } ;; { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f105 = [C3], - SIZE #else nop __LINE__ #endif (p3) FMA f113 = f41, f62, f113 // A2 * B7 adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f121 = [C4], - SIZE #else nop __LINE__ #endif (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 br.cloop.sptk.few .L032 } ;; #if! defined(TRMMKERNEL) && !defined(BETAZERO) { .mfb nop __LINE__ FMA f72 = ALPHA_R, f64, f72 nop __LINE__ } { .mfb nop __LINE__ FMA f88 = ALPHA_R, f80, f88 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_C f73 = ALPHA_R, f65, f73 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f89 = ALPHA_R, f81, f89 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f104 = ALPHA_R, f96, f104 nop __LINE__ } { .mfb nop __LINE__ FMA f120 = ALPHA_R, f112, f120 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f105 = ALPHA_R, f97, f105 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f121 = ALPHA_R, f113, f121 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f72 = ALPHA_I, f65, f72 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f88 = ALPHA_I, f81, f88 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f73 = ALPHA_I, f64, f73 nop __LINE__ } { .mfb FMA f89 = ALPHA_I, f80, f89 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f104 = ALPHA_I, f97, f104 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f120 = ALPHA_I, f113, f120 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f105 = ALPHA_I, f96, f105 nop __LINE__ } { .mfb nop __LINE__ FMA f121 = ALPHA_I, f112, f121 nop __LINE__ } ;; { .mfb STFD [C1] = f72, SIZE mov f64 = f0 nop __LINE__ } { .mfb STFD [C2] = f88, SIZE mov f65 = f0 nop __LINE__ } ;; { .mfb STFD [C1] = f73, SIZE mov f80 = f0 nop __LINE__ } { .mfb STFD [C2] = f89, SIZE mov f81 = f0 nop __LINE__ } ;; { .mfb STFD [C3] = f104, SIZE mov f96 = f0 nop __LINE__ } { .mfi STFD [C4] = f120, SIZE mov f97 = f0 nop __LINE__ } ;; { .mfb STFD [C3] = f105, SIZE mov f112 = f0 nop __LINE__ } { .mfi STFD [C4] = f121, SIZE mov f113 = f0 nop __LINE__ } ;; #else { .mfb nop __LINE__ FMA f72 = ALPHA_R, f64, f0 nop __LINE__ } { .mfb nop __LINE__ FMA f88 = ALPHA_R, f80, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_C f73 = ALPHA_R, f65, f0 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f89 = ALPHA_R, f81, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f104 = ALPHA_R, f96, f0 nop __LINE__ } { .mfb nop __LINE__ FMA f120 = ALPHA_R, f112, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_C f105 = ALPHA_R, f97, f0 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f121 = ALPHA_R, f113, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f72 = ALPHA_I, f65, f72 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f88 = ALPHA_I, f81, f88 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f73 = ALPHA_I, f64, f73 nop __LINE__ } { .mfb FMA f89 = ALPHA_I, f80, f89 nop __LINE__ } ;; { .mfi nop __LINE__ FCALC_D f104 = ALPHA_I, f97, f104 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfb nop __LINE__ FCALC_D f120 = ALPHA_I, f113, f120 nop __LINE__ } ;; { .mfi nop __LINE__ FMA f105 = ALPHA_I, f96, f105 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -1, L #else nop __LINE__ #endif } { .mfi nop __LINE__ FMA f121 = ALPHA_I, f112, f121 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -4, L #else nop __LINE__ #endif } ;; { .mfi STFD [C1] = f72, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ 
((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, ZBASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C2] = f88, SIZE mov f65 = f0 nop __LINE__ } ;; { .mfi STFD [C1] = f73, SIZE mov f80 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) add AOFFSET = KK8, AOFFSET #else nop __LINE__ #endif } { .mfi STFD [C2] = f89, SIZE mov f81 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 2, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C3] = f104, SIZE mov f96 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 1, KK #else nop __LINE__ #endif } { .mfi STFD [C4] = f120, SIZE mov f97 = f0 nop __LINE__ } ;; { .mfi STFD [C3] = f105, SIZE mov f112 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, ZBASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfi STFD [C4] = f121, SIZE mov f113 = f0 nop __LINE__ } ;; #endif .align 16 .L049: { .mmi mov B = BOFFSET mov AOFFSET = A #if defined(TRMMKERNEL) && !defined(LEFT) adds KK = 4, KK #else nop __LINE__ #endif } { .mmb nop __LINE__ cmp.lt p6, p0 = 0, J (p6) br.cond.dptk .L010 } ;; .align 16 .L050: { .mmi #if defined(TRMMKERNEL) && defined(LEFT) mov KK = OFFSET #else nop __LINE__ #endif shr I = M, 2 } { .mib mov C1 = C tbit.z p6, p0 = N, 1 (p6) br.cond.dpnt .L090 } ;; { .mmi add C2 = LDC, C #ifdef TRMMKERNEL shladd KK8 = KK, ZBASE_SHIFT, r0 #else nop __LINE__ #endif nop __LINE__ } { .mib cmp.eq p6, p7 = 0, I shladd C = LDC, 1, C (p6) br.cond.dpnt .L060 } ;; .align 16 .L052: #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mfi LDFPD f48, f49 = [B] mov f66 = f0 adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } { .mfb adds BOFFSET = 2 * SIZE, B mov f67 = f0 nop __LINE__ } ;; #else { .mfi shladd BOFFSET = KK8, 1, B mov f66 = f0 shladd AOFFSET = KK8, 2, AOFFSET } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } ;; #endif { .mfi LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f82 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 4, KK #else adds L = 2, KK #endif #endif } ;; { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f98 = f0 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } { .mfi cmp.eq p3, p0 = r0, r0 mov f99 = f0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; { .mfi LDFPD f36, f37 = [AOFFSET], 2 * SIZE mov f114 = f0 tbit.z p12, p0 = L, 0 } { .mfi CPREFETCH [PREC], LDC mov f115 = f0 shr L = L, 1 } ;; { .mmi LDFPD f38, f39 = [AOFFSET], 2 * SIZE adds C5 = 4 * SIZE, C1 adds L = -1, L } ;; { .mmi CPREFETCH [PREC], LDC adds C6 = 4 * SIZE, C2 mov ar.lc = L } ;; .align 16 .L053: { .mfb lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f96 = f34, f48, f96 // A3 * B1 nop __LINE__ } { .mfi FMA_B f97 = f34, f49, f97 // A3 * B2 nop __LINE__ } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f112 = 
f34, f50, f112 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f34, f51, f113 // A3 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f97 = f35, f48, f97 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f96 = f35, f49, f96 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f113 = f35, f50, f113 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f112 = f35, f51, f112 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f66 = f36, f48, f66 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f67 = f36, f49, f67 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f82 = f36, f50, f82 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f83 = f36, f51, f83 // A5 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f98 = f38, f48, f98 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f99 = f38, f49, f99 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f114 = f38, f50, f114 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f115 = f38, f51, f115 // A7 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f67 = f37, f48, f67 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f66 = f37, f49, f66 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f83 = f37, f50, f83 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f82 = f37, f51, f82 // A6 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f99 = f39, f48, f99 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f98 = f39, f49, f98 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f115 = f39, f50, f115 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f114 = f39, f51, f114 // A8 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f96 = f42, f56, f96 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f112 = f42, f58, f112 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f72 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f76 = [C5 ], SIZE #else nop __LINE__ #endif (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f73 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f77 = [C5 ], SIZE #else nop __LINE__ #endif (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;; { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f74 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f97 = f43, f56, f97 // A4 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f78 = [C5 ], SIZE #else nop __LINE__ #endif (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f75 = [C1 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f113 = f43, f58, f113 // A4 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f79 = [C5 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f88 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f66 = f44, f56, f66 // A5 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f92 = [C6 ], SIZE #else nop __LINE__ #endif (p3) FMA_B f67 = f44, f57, f67 // A5 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f89 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f82 = f44, f58, f82 // A5 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f93 = [C6 ], SIZE #else nop __LINE__ #endif (p3) FMA_B f83 = f44, f59, f83 // A5 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f90 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA f98 = f46, f56, f98 // A7 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f94 = [C6 ], SIZE #else nop __LINE__ #endif (p3) FMA_B f99 = f46, f57, f99 // A7 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f91 = [C2 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f114 = f46, f58, f114 // A7 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f95 = [C6 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA_B f115 = f46, f59, f115 // A7 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f67 = f45, f56, f67 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f66 = f45, f57, f66 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f83 = f45, f58, f83 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f82 = f45, f59, f82 // A6 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f99 = f47, f56, f99 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f98 = f47, f57, f98 // A8 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f115 = f47, f58, f115 // A8 * B3 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f114 = f47, f59, f114 // A8 * B4 br.cloop.sptk.few .L053 } ;; #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) { .mfb nop __LINE__ FMA f72 = ALPHA_R, f64, f72 nop __LINE__ } { .mfb nop __LINE__ FMA f76 = ALPHA_R, f66, f76 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_C f73 = ALPHA_R, f65, f73 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f77 = ALPHA_R, f67, f77 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f74 = ALPHA_R, f96, f74 nop __LINE__ } { .mfb nop __LINE__ FMA f78 = ALPHA_R, f98, f78 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_C f75 = ALPHA_R, f97, f75 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f79 = ALPHA_R, f99, f79 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f72 = ALPHA_I, f65, f72 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f76 = ALPHA_I, f67, f76 nop __LINE__ } { .mfb nop __LINE__ FMA f73 = ALPHA_I, f64, f73 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = ALPHA_I, f66, f77 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f74 = ALPHA_I, f97, f74 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f78 = ALPHA_I, f99, f78 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = ALPHA_I, f96, f75 nop __LINE__ } { .mfb nop __LINE__ FMA f79 = ALPHA_I, f98, f79 nop __LINE__ } ;; { .mfb STFD [C1] = f72, SIZE FMA f88 = ALPHA_R, f80, f88 nop __LINE__ } { .mfb STFD [C5] = f76, SIZE FMA f92 = ALPHA_R, f82, f92 nop __LINE__ } ;; { .mfb STFD [C1] = f73, SIZE FCALC_C f89 = ALPHA_R, f81, f89 nop __LINE__ } { .mfb STFD [C5] = f77, SIZE FCALC_C f93 = ALPHA_R, f83, f93 nop __LINE__ } ;; { .mfb STFD [C1] = f74, SIZE FMA f90 = ALPHA_R, f112, f90 nop __LINE__ } { .mfb STFD [C5] = f78, SIZE FMA f94 = ALPHA_R, f114, f94 nop __LINE__ } ;; { .mfb STFD [C1] = f75, 5 * SIZE FCALC_C f91 = ALPHA_R, f113, f91 nop __LINE__ } { .mfb STFD [C5] = f79, 5 * SIZE FCALC_C f95 = ALPHA_R, f115, f95 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f88 = ALPHA_I, f81, f88 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f92 = ALPHA_I, f83, f92 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f89 = ALPHA_I, f80, f89 nop __LINE__ } { .mfb nop __LINE__ FMA f93 = ALPHA_I, f82, f93 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f90 = ALPHA_I, f113, f90 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f94 = ALPHA_I, f115, f94 nop __LINE__ } ;; { .mfi nop __LINE__ FMA f91 = ALPHA_I, f112, f91 cmp.ne p6, p0 = 1, I } { .mfb nop __LINE__ FMA f95 = ALPHA_I, f114, f95 nop __LINE__ } ;; { .mfb STFD [C2] = f88, SIZE mov f64 = f0 nop __LINE__ } { .mfb STFD [C6] = f92, SIZE mov f65 = f0 nop __LINE__ } ;; { .mfi STFD [C2] = f89, SIZE mov f80 = f0 adds I = -1, I } { .mfb STFD [C6] = f93, SIZE mov f81 = f0 nop __LINE__ } ;; { .mfb STFD [C2] = f90, SIZE mov f96 = f0 nop __LINE__ } { .mfb STFD [C6] = f94, SIZE mov f97 = f0 nop __LINE__ } ;; { .mfb STFD [C2] = f91, 5 * SIZE mov f112 = f0 nop __LINE__ } { .mfb STFD [C6] = f95, 5 * SIZE mov f113 = f0 (p6) br.cond.dptk .L052 } ;; #else { .mfb nop __LINE__ FMA f72 = ALPHA_R, f64, f0 nop __LINE__ } { .mfb nop __LINE__ FMA f76 = ALPHA_R, f66, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_C f73 = ALPHA_R, f65, f0 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f77 = ALPHA_R, f67, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f74 = ALPHA_R, f96, f0 nop __LINE__ } { .mfb nop __LINE__ FMA f78 = ALPHA_R, f98, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_C f75 = ALPHA_R, f97, f0 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f79 = ALPHA_R, f99, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f72 = ALPHA_I, f65, f72 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f76 = ALPHA_I, f67, f76 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f73 = ALPHA_I, f64, f73 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = ALPHA_I, f66, f77 nop __LINE__ } ;; { .mfb nop 
__LINE__ FCALC_D f74 = ALPHA_I, f97, f74 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f78 = ALPHA_I, f99, f78 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f75 = ALPHA_I, f96, f75 nop __LINE__ } { .mfb nop __LINE__ FMA f79 = ALPHA_I, f98, f79 nop __LINE__ } ;; { .mfb STFD [C1] = f72, SIZE FMA f88 = ALPHA_R, f80, f0 nop __LINE__ } { .mfb STFD [C5] = f76, SIZE FMA f92 = ALPHA_R, f82, f0 nop __LINE__ } ;; { .mfb STFD [C1] = f73, SIZE FCALC_C f89 = ALPHA_R, f81, f0 nop __LINE__ } { .mfb STFD [C5] = f77, SIZE FCALC_C f93 = ALPHA_R, f83, f0 nop __LINE__ } ;; { .mfb STFD [C1] = f74, SIZE FMA f90 = ALPHA_R, f112, f0 nop __LINE__ } { .mfb STFD [C5] = f78, SIZE FMA f94 = ALPHA_R, f114, f0 nop __LINE__ } ;; { .mfb STFD [C1] = f75, 5 * SIZE FCALC_C f91 = ALPHA_R, f113, f0 nop __LINE__ } { .mfb STFD [C5] = f79, 5 * SIZE FCALC_C f95 = ALPHA_R, f115, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f88 = ALPHA_I, f81, f88 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f92 = ALPHA_I, f83, f92 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f89 = ALPHA_I, f80, f89 nop __LINE__ } { .mfb nop __LINE__ FMA f93 = ALPHA_I, f82, f93 nop __LINE__ } ;; { .mfi nop __LINE__ FCALC_D f90 = ALPHA_I, f113, f90 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfi nop __LINE__ FCALC_D f94 = ALPHA_I, f115, f94 cmp.ne p6, p0 = 1, I } ;; { .mfi nop __LINE__ FMA f91 = ALPHA_I, f112, f91 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -4, L #else nop __LINE__ #endif } { .mfi nop __LINE__ FMA f95 = ALPHA_I, f114, f95 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -2, L #else nop __LINE__ #endif } ;; { .mfi STFD [C2] = f88, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, ZBASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfi STFD [C6] = f92, SIZE mov f65 = f0 adds I = -1, I } ;; { .mfi STFD [C2] = f89, SIZE mov f80 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 2, AOFFSET #else nop __LINE__ #endif } { .mfi STFD [C6] = f93, SIZE mov f81 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 1, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C2] = f90, SIZE mov f96 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 4, KK #else nop __LINE__ #endif } { .mfb STFD [C6] = f94, SIZE mov f97 = f0 nop __LINE__ } ;; { .mfi STFD [C2] = f91, 5 * SIZE mov f112 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, ZBASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C6] = f95, 5 * SIZE mov f113 = f0 (p6) br.cond.dptk .L052 } ;; #endif .align 16 .L060: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 2, KK #else adds L = 2, KK #endif #endif tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L070 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmi LDFPD f48, f49 = [B] adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET nop __LINE__ } { .mmi adds BOFFSET = 2 * SIZE, B cmp.eq p3, p0 = r0, r0 #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #else { .mmi shladd BOFFSET = KK8, 1, B shladd AOFFSET = KK8, 1, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mmi LDFPD f48, f49 = 
[BOFFSET], 2 * SIZE adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #endif { .mmi LDFPD f32, f33 = [AOFFSET], 2 * SIZE adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET tbit.z p12, p0 = L, 0 } { .mmi LDFPD f50, f51 = [BOFFSET], 2 * SIZE shr L = L, 1 } ;; { .mmi LDFPD f34, f35 = [AOFFSET], 2 * SIZE nop __LINE__ adds L = -1, L } ;; { .mmi nop __LINE__ nop __LINE__ mov ar.lc = L } ;; .align 16 .L062: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfb lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f34, f48, f96 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f97 = f34, f49, f97 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f34, f50, f112 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f34, f51, f113 // A3 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f97 = f35, f48, f97 // A4 * B1 } { .mfb FMA_A f96 = f35, f49, f96 // A4 * B2 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f113 = f35, f50, f113 // A4 * B3 nop __LINE__ } { .mfb FMA_A f112 = f35, f51, f112 // A4 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f72 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f96 = f42, f56, f96 // A3 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f88 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f73 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f112 = f42, f58, f112 // A3 * B3 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f89 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f74 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f90 = [C2 ], SIZE #else nop __LINE__ #endif (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f75 = [C1 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f91 = [C2 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f43, f56, f97 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f113 = f43, f58, f113 // A4 * B3 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 br.cloop.sptk.few .L062 } ;; #if! defined(TRMMKERNEL) && !defined(BETAZERO) { .mfb nop __LINE__ FMA f72 = ALPHA_R, f64, f72 nop __LINE__ } { .mfb nop __LINE__ FMA f88 = ALPHA_R, f80, f88 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_C f73 = ALPHA_R, f65, f73 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f89 = ALPHA_R, f81, f89 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f74 = ALPHA_R, f96, f74 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = ALPHA_R, f112, f90 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_C f75 = ALPHA_R, f97, f75 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f91 = ALPHA_R, f113, f91 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f72 = ALPHA_I, f65, f72 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f88 = ALPHA_I, f81, f88 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f73 = ALPHA_I, f64, f73 nop __LINE__ } { .mfb nop __LINE__ FMA f89 = ALPHA_I, f80, f89 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f74 = ALPHA_I, f97, f74 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f90 = ALPHA_I, f113, f90 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f75 = ALPHA_I, f96, f75 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = ALPHA_I, f112, f91 nop __LINE__ } ;; { .mfb STFD [C1] = f72, SIZE mov f64 = f0 nop __LINE__ } { .mfb STFD [C2] = f88, SIZE mov f65 = f0 nop __LINE__ } ;; { .mfb STFD [C1] = f73, SIZE mov f80 = f0 nop __LINE__ } { .mfb STFD [C2] = f89, SIZE mov f81 = f0 nop __LINE__ } ;; { .mfi STFD [C1] = f74, SIZE mov f96 = f0 adds L = 1, K } { .mfb STFD [C2] = f90, SIZE mov f97 = f0 nop __LINE__ } ;; { .mfi STFD [C1] = f75, SIZE mov f112 = f0 shr L = L, 1 } { .mfb STFD [C2] = f91, SIZE mov f113 = f0 nop __LINE__ } ;; #else { .mfb nop __LINE__ FMA f72 = ALPHA_R, f64, f0 nop __LINE__ } { .mfb nop __LINE__ FMA f88 = ALPHA_R, f80, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_C f73 = ALPHA_R, f65, f0 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f89 = ALPHA_R, f81, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f74 = ALPHA_R, f96, f0 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = ALPHA_R, f112, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_C f75 = ALPHA_R, f97, f0 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f91 = ALPHA_R, f113, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f72 = ALPHA_I, f65, f72 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f88 = ALPHA_I, f81, f88 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f73 = ALPHA_I, f64, f73 nop __LINE__ } { .mfb nop __LINE__ FMA f89 = ALPHA_I, f80, f89 nop __LINE__ } ;; { .mfi nop __LINE__ FCALC_D f74 = ALPHA_I, f97, f74 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfb nop __LINE__ FCALC_D f90 = ALPHA_I, f113, f90 nop __LINE__ } ;; { .mfi nop __LINE__ FMA f75 = ALPHA_I, f96, f75 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -2, L #else nop __LINE__ #endif } { .mfi nop __LINE__ FMA f91 = ALPHA_I, f112, f91 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -2, L #else nop __LINE__ #endif } ;; { .mfi STFD [C1] = f72, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && 
defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, ZBASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C2] = f88, SIZE mov f65 = f0 nop __LINE__ } ;; { .mfi STFD [C1] = f73, SIZE mov f80 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 1, AOFFSET #else nop __LINE__ #endif } { .mfi STFD [C2] = f89, SIZE mov f81 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 1, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C1] = f74, SIZE mov f96 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 2, KK #else nop __LINE__ #endif } { .mfb STFD [C2] = f90, SIZE mov f97 = f0 nop __LINE__ } ;; { .mfi STFD [C1] = f75, SIZE mov f112 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, ZBASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C2] = f91, SIZE mov f113 = f0 nop __LINE__ } ;; #endif .align 16 .L070: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 1, KK #else adds L = 2, KK #endif #endif tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L089 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmi LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #else { .mmi shladd BOFFSET = KK8, 1, B add AOFFSET = KK8, AOFFSET nop __LINE__ } ;; { .mmi LDFPD f48, f49 = [BOFFSET], 2 * SIZE nop __LINE__ #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #endif ;; { .mii LDFPD f32, f33 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi LDFPD f50, f51 = [BOFFSET], 2 * SIZE adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET adds L = -1, L } ;; { .mmi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET cmp.eq p3, p0 = r0, r0 mov ar.lc = L } ;; .align 16 .L072: { .mfb lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f96 = f32, f49, f96 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f112 = f32, f51, f112 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } { .mfi nop __LINE__ FMA f97 = f33, f49, f97 // A2 * B2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 } { .mmf nop __LINE__ nop __LINE__ FMA f113 = f33, f51, f113 // A2 * B4 } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ (p3) FMA f96 = f40, f57, f96 // A1 * B2 } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mmf #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f72 = [C1 ], SIZE (p5) LDFD f88 = [C2 ], SIZE #else nop __LINE__ nop __LINE__ #endif (p3) FMA f112 = f40, f59, f112 // A1 * B4 } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f73 = [C1 ], - SIZE #else nop __LINE__ #endif (p3) FMA f97 = f41, f57, f97 // A2 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 adds L = -1, L } { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f89 = [C2 ], - SIZE #else nop __LINE__ #endif (p3) FMA f113 = f41, f59, f113 // A2 * B4 br.cloop.sptk.few .L072 } ;; { .mfb nop __LINE__ FCALC_A f64 = f64, f97 nop __LINE__ } { .mfb nop __LINE__ FCALC_A f80 = f80, f113 nop __LINE__ } { .mfb nop __LINE__ FCALC_B f65 = f65, f96 nop __LINE__ } { .mfb nop __LINE__ FCALC_B f81 = f81, f112 nop __LINE__ } ;; #if! defined(TRMMKERNEL) && !defined(BETAZERO) { .mfb setf.d f96 = r0 FMA f72 = ALPHA_R, f64, f72 nop __LINE__ } { .mfb setf.d f97 = r0 FMA f88 = ALPHA_R, f80, f88 nop __LINE__ } ;; { .mfb setf.d f112 = r0 FCALC_C f73 = ALPHA_R, f65, f73 nop __LINE__ } { .mfb setf.d f113 = r0 FCALC_C f89 = ALPHA_R, f81, f89 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f72 = ALPHA_I, f65, f72 nop __LINE__ } { .mfb setf.d f65 = r0 FCALC_D f88 = ALPHA_I, f81, f88 nop __LINE__ } ;; { .mfb setf.d f81 = r0 FMA f73 = ALPHA_I, f64, f73 nop __LINE__ } { .mfb setf.d f64 = r0 FMA f89 = ALPHA_I, f80, f89 nop __LINE__ } ;; { .mmf STFD [C1] = f72, SIZE STFD [C2] = f88, SIZE mov f80 = f0 } ;; { .mmi STFD [C1] = f73, SIZE STFD [C2] = f89, SIZE mov B = BOFFSET } ;; #else { .mfi setf.d f96 = r0 FMA f72 = ALPHA_R, f64, f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfb setf.d f97 = r0 FMA f88 = ALPHA_R, f80, f0 nop __LINE__ } ;; { .mfi setf.d f112 = r0 FCALC_C f73 = ALPHA_R, f65, f0 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -1, L #else nop __LINE__ #endif } { .mfi setf.d f113 = r0 FCALC_C f89 = ALPHA_R, f81, f0 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -2, L #else nop __LINE__ #endif } ;; { .mfi nop __LINE__ FCALC_D f72 = ALPHA_I, f65, f72 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, ZBASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb setf.d f65 = r0 FCALC_D f88 = ALPHA_I, f81, f88 nop __LINE__ } ;; { .mfi setf.d f81 = r0 FMA f73 = ALPHA_I, f64, f73 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) add AOFFSET = KK8, AOFFSET #else nop __LINE__ #endif } { .mfi setf.d f64 = r0 FMA f89 = ALPHA_I, f80, f89 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 1, BOFFSET #else nop __LINE__ #endif } ;; { .mmf nop __LINE__ nop __LINE__ mov f80 = f0 } ;; { .mmi STFD [C1] = f72, SIZE STFD [C2] = f88, SIZE #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 1, KK #else nop __LINE__ #endif } ;; { .mmi STFD [C1] = f73, SIZE STFD [C2] = f89, SIZE #ifdef TRMMKERNEL shladd KK8 = KK, ZBASE_SHIFT, r0 #else nop __LINE__ #endif } #endif ;; .align 16 .L089: { .mmi mov B = BOFFSET mov AOFFSET = A #if defined(TRMMKERNEL) && !defined(LEFT) adds KK = 2, KK #else nop __LINE__ #endif } ;; .align 16 .L090: { .mfi mov C1 = C mov f64 = f0 tbit.z p6, p0 = N, 0 } { .mfi #if defined(TRMMKERNEL) && defined(LEFT) mov KK = OFFSET #else nop __LINE__ #endif mov f72 = f0 shr I = M, 2 } ;; { .mfi setf.d f66 = r0 mov f65 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, ZBASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb mov AOFFSET = A mov f73 = f0 (p6) br.cond.dpnt .L999 } ;; { .mfi setf.d f74 = r0 mov f67 = f0 nop __LINE__ } { .mfb cmp.eq p6, p7 = 0, I mov f75 = f0 (p6) br.cond.dpnt .L100 } ;; .align 16 .L092: #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || 
(!defined(LEFT) && !defined(TRANSA))) { .mfb LDFPD f48, f49 = [B] nop __LINE__ } { .mfi adds BOFFSET = 2 * SIZE, B #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 4, KK #else adds L = 1, KK #endif #endif } ;; #else { .mfi add BOFFSET = KK8, B shladd AOFFSET = KK8, 2, AOFFSET } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 4, KK #else adds L = 1, KK #endif #endif } ;; #endif { .mfi LDFPD f32, f33 = [AOFFSET], 2 * SIZE #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 } { .mfi adds PREC = CPREFETCHSIZE * SIZE, C1 shr L = L, 1 } ;; { .mfi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET adds L = -1, L } { .mmf LDFPD f36, f37 = [AOFFSET], 2 * SIZE CPREFETCH [PREC] } ;; { .mfi LDFPD f38, f39 = [AOFFSET], 2 * SIZE mov ar.lc = L } { .mmi adds C5 = 4 * SIZE, C1 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET cmp.eq p3, p0 = r0, r0 } ;; .align 16 .L093: /* 1 */ { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f80 = f34, f48, f80 // A3 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA_B f81 = f34, f49, f81 // A3 * B2 nop __LINE__ } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f36, f48, f96 // A5 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA_B f97 = f36, f49, f97 // A5 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f38, f48, f112 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f38, f49, f113 // A7 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f81 = f35, f48, f81 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 = f35, f49, f80 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f97 = f37, f48, f97 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f96 = f37, f49, f96 // A6 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f113 = f39, f48, f113 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f112 = f39, f49, f112 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f42, f56, f80 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f81 = f42, f57, f81 // A3 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f96 = f44, f56, f96 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f44, f57, f97 // A5 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f112 = f46, f56, f112 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f46, f57, f113 // A7 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f72 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f76 = [C5 ], SIZE #else nop __LINE__ #endif (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f73 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f81 = f43, f56, f81 // A4 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f77 = [C5 ], SIZE #else nop __LINE__ #endif (p3) FMA_A f80 = f43, f57, f80 // A4 * B2 nop __LINE__ } ;; { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f74 = [C1 ], SIZE #else nop __LINE__ #endif (p3) FMA f97 = f45, f56, f97 // A6 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f78 = [C5 ], SIZE #else nop __LINE__ #endif (p3) FMA_A f96 = f45, f57, f96 // A6 * B2 nop __LINE__ } ;; { .mfi #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f75 = [C1 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f113 = f47, f56, f113 // A8 * B1 adds L = -1, L } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f79 = [C5 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA_A f112 = f47, f57, f112 // A8 * B2 br.cloop.sptk.few .L093 } ;; #if! defined(TRMMKERNEL) && !defined(BETAZERO) { .mfb nop __LINE__ FMA f72 = ALPHA_R, f64, f72 nop __LINE__ } { .mfb nop __LINE__ FMA f76 = ALPHA_R, f96, f76 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f73 = ALPHA_R, f65, f73 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f77 = ALPHA_R, f97, f77 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = ALPHA_R, f80, f74 nop __LINE__ } { .mfb nop __LINE__ FMA f78 = ALPHA_R, f112, f78 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f75 = ALPHA_R, f81, f75 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f79 = ALPHA_R, f113, f79 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f72 = ALPHA_I, f65, f72 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f76 = ALPHA_I, f97, f76 nop __LINE__ } { .mfb nop __LINE__ FMA f73 = ALPHA_I, f64, f73 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = ALPHA_I, f96, f77 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f74 = ALPHA_I, f81, f74 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f78 = ALPHA_I, f113, f78 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = ALPHA_I, f80, f75 nop __LINE__ } { .mfb nop __LINE__ FMA f79 = ALPHA_I, f112, f79 nop __LINE__ } ;; { .mfi STFD [C1] = f72, SIZE mov f64 = f0 cmp.ne p6, p0 = 1, I } { .mfb STFD [C5] = f76, SIZE mov f65 = f0 nop __LINE__ } ;; { .mfi STFD [C1] = f73, SIZE mov f80 = f0 adds I = -1, I } { .mfb STFD [C5] = f77, SIZE mov f81 = f0 nop __LINE__ } ;; { .mfb STFD [C1] = f74, SIZE mov f96 = f0 nop __LINE__ } { .mfb STFD [C5] = f78, SIZE mov f97 = f0 nop __LINE__ } ;; { .mfi STFD [C1] = f75, 5 * SIZE mov f112 = f0 } { .mfb STFD [C5] = f79, 5 * SIZE mov f113 = f0 (p6) br.cond.dptk .L092 } ;; #else { .mfb nop __LINE__ FMA f6 = ALPHA_R, f64, f0 nop __LINE__ } { .mfb nop __LINE__ FMA f76 = ALPHA_R, f96, f0 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f73 = ALPHA_R, f65, f0 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f77 = ALPHA_R, f97, f0 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = ALPHA_R, f80, f0 nop __LINE__ } { .mfb nop __LINE__ FMA f78 = ALPHA_R, f112, f0 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f75 = ALPHA_R, f81, f0 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f79 = ALPHA_R, f113, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f6 = ALPHA_I, f65, f6 nop __LINE__ } { .mfb nop __LINE__ FCALC_D f76 = ALPHA_I, f97, f76 nop __LINE__ } { .mfb nop __LINE__ FMA f73 = ALPHA_I, f64, f73 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = ALPHA_I, f96, f77 nop __LINE__ } ;; { .mfi nop 
__LINE__ FCALC_D f74 = ALPHA_I, f81, f74 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfb nop __LINE__ FCALC_D f78 = ALPHA_I, f113, f78 nop __LINE__ } ;; { .mfi nop __LINE__ FMA f75 = ALPHA_I, f80, f75 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -4, L #else nop __LINE__ #endif } { .mfi nop __LINE__ FMA f79 = ALPHA_I, f112, f79 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -1, L #else nop __LINE__ #endif } ;; { .mfi STFD [C1] = f6, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, ZBASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfi STFD [C5] = f76, SIZE mov f65 = f0 cmp.ne p6, p0 = 1, I } ;; { .mfi STFD [C1] = f73, SIZE mov f80 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 2, AOFFSET #else nop __LINE__ #endif } { .mfi STFD [C5] = f77, SIZE mov f81 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) add BOFFSET = KK8, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C1] = f74, SIZE mov f96 = f0 #if defined(TRMMKERNEL) && defined(LEFT) adds KK = 4, KK #else nop __LINE__ #endif } { .mfi STFD [C5] = f78, SIZE mov f97 = f0 adds I = -1, I } ;; { .mfi STFD [C1] = f75, 5 * SIZE mov f112 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, ZBASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb STFD [C5] = f79, 5 * SIZE mov f113 = f0 (p6) br.cond.dptk .L092 } ;; #endif .align 16 .L100: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 2, KK #else adds L = 1, KK #endif #endif tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L110 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmi LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #else { .mii add BOFFSET = KK8, B shladd AOFFSET = KK8, 1, AOFFSET nop __LINE__ } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #endif { .mii LDFPD f32, f33 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi LDFPD f34, f35 = [AOFFSET], 2 * SIZE nop __LINE__ adds L = -1, L } ;; { .mmi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 mov ar.lc = L } ;; .align 16 .L102: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET FMA f80 = f32, f49, f80 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfb lfetch.nt1 [PREB], 4 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f72 = [C1 ], SIZE #else nop __LINE__ #endif FMA f81 = f33, f49, f81 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f34, f48, f96 // A3 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f73 = [C1 ], SIZE #else nop __LINE__ #endif FMA f112 = f34, f49, f112 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f97 = f35, f48, f97 // A4 * B1 nop __LINE__ } { .mfb #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f74 = [C1 ], SIZE #else nop __LINE__ #endif FMA f113 = f35, f49, f113 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f75 = [C1 ], -3 * SIZE #else nop __LINE__ #endif (p3) FMA f80 = f40, f57, f80 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f81 = f41, f57, f81 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f42, f56, f96 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f112 = f42, f57, f112 // A3 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f97 = f43, f56, f97 // A4 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f113 = f43, f57, f113 // A4 * B2 br.cloop.sptk.few .L102 } ;; { .mfb nop __LINE__ FCALC_A f64 = f64, f81 nop __LINE__ } { .mfb nop __LINE__ FCALC_B f65 = f65, f80 nop __LINE__ } { .mfb nop __LINE__ FCALC_A f96 = f96, f113 nop __LINE__ } { .mfb nop __LINE__ FCALC_B f97 = f97, f112 nop __LINE__ } ;; #if! defined(TRMMKERNEL) && !defined(BETAZERO) { .mfb nop __LINE__ FMA f72 = ALPHA_R, f64, f72 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f73 = ALPHA_R, f65, f73 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = ALPHA_R, f96, f74 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f75 = ALPHA_R, f97, f75 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f72 = ALPHA_I, f65, f72 nop __LINE__ } { .mfb nop __LINE__ FMA f73 = ALPHA_I, f64, f73 nop __LINE__ } { .mfb setf.d f112 = r0 FCALC_D f74 = ALPHA_I, f97, f74 nop __LINE__ } { .mfb setf.d f113 = r0 FMA f75 = ALPHA_I, f96, f75 nop __LINE__ } ;; { .mmf STFD [C1] = f72, SIZE setf.d f97 = r0 mov f64 = f0 } ;; { .mmf STFD [C1] = f73, SIZE setf.d f96 = r0 mov f80 = f0 } ;; { .mfi STFD [C1] = f74, SIZE mov f65 = f0 adds L = 1, K } ;; { .mfi STFD [C1] = f75, SIZE mov f81 = f0 shr L = L, 1 } ;; #else { .mfb nop __LINE__ FMA f72 = ALPHA_R, f64, f0 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f73 = ALPHA_R, f65, f0 nop __LINE__ } ;; { .mfi setf.d f112 = r0 FMA f74 = ALPHA_R, f96, f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK #else nop __LINE__ #endif } { .mfb setf.d f113 = r0 FCALC_C f75 = ALPHA_R, f97, f0 nop __LINE__ } ;; { .mfi setf.d f97 = r0 FCALC_D f72 = ALPHA_I, f65, f72 #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -2, L #else nop __LINE__ #endif } { .mfi setf.d f96 = r0 FMA f73 = ALPHA_I, f64, f73 #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -1, L #else nop __LINE__ #endif } ;; { .mfi nop __LINE__ FCALC_D f74 = ALPHA_I, f97, f74 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, ZBASE_SHIFT, r0 #else nop __LINE__ #endif } { .mfb nop __LINE__ FMA f75 = ALPHA_I, f96, f75 nop __LINE__ } ;; { .mfi STFD [C1] = f72, SIZE mov f64 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 1, AOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C1] = f73, SIZE mov f80 = f0 #if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) add BOFFSET = KK8, BOFFSET #else nop __LINE__ #endif } ;; { .mfi STFD [C1] = f74, SIZE mov f65 = f0 #if defined(TRMMKERNEL) && 
defined(LEFT) adds KK = 2, KK #else nop __LINE__ #endif } ;; { .mfi STFD [C1] = f75, SIZE mov f81 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, ZBASE_SHIFT, r0 #else nop __LINE__ #endif } ;; #endif .align 16 .L110: { .mib #ifndef TRMMKERNEL nop __LINE__ #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK #elif defined(LEFT) adds L = 1, KK #else adds L = 1, KK #endif #endif tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L119 } ;; #if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmi LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #else { .mii add BOFFSET = KK8, B add AOFFSET = KK8, AOFFSET nop __LINE__ } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE #ifndef TRMMKERNEL adds L = 1, K #else adds L = 1, L #endif } ;; #endif ;; { .mii nop __LINE__ tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi LDFPD f32, f33 = [AOFFSET], 2 * SIZE cmp.eq p3, p0 = r0, r0 adds L = -1, L } ;; { .mmi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET mov ar.lc = L } ;; .align 16 .L112: { .mfi lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f80 = f32, f49, f80 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mmf (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } { .mmf #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f72 = [C1 ], SIZE #else nop __LINE__ #endif nop __LINE__ FMA f81 = f33, f49, f81 // A2 * B2 } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb #if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f73 = [C1 ], -1 * SIZE #else nop __LINE__ #endif (p3) FMA f80 = f40, f57, f80 // A1 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 adds L = -1, L } { .mfb (p3) FMA f81 = f41, f57, f81 // A2 * B2 br.cloop.sptk.few .L112 } ;; { .mfb nop __LINE__ FCALC_A f64 = f64, f81 nop __LINE__ } { .mfb nop __LINE__ FCALC_B f65 = f65, f80 nop __LINE__ } ;; #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) { .mfb nop __LINE__ FMA f72 = ALPHA_R, f64, f72 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f73 = ALPHA_R, f65, f73 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f72 = ALPHA_I, f65, f72 nop __LINE__ } { .mfb nop __LINE__ FMA f73 = ALPHA_I, f64, f73 nop __LINE__ } ;; { .mmf STFD [C1] = f72, SIZE setf.d f64 = r0 mov f80 = f0 } ;; { .mmf STFD [C1] = f73, SIZE setf.d f65 = r0 mov f81 = f0 } ;; #else { .mfb nop __LINE__ FMA f72 = ALPHA_R, f64, f0 nop __LINE__ } { .mfb nop __LINE__ FCALC_C f73 = ALPHA_R, f65, f0 nop __LINE__ } ;; { .mfb nop __LINE__ FCALC_D f72 = ALPHA_I, f65, f72 nop __LINE__ } { .mfb nop __LINE__ FMA f73 = ALPHA_I, f64, f73 nop __LINE__ } ;; { .mmf STFD [C1] = f72, SIZE setf.d f64 = r0 mov f80 = f0 } ;; { .mmf STFD [C1] = f73, SIZE setf.d f65 = r0 mov f81 = f0 } ;; #endif .align 16 .L119: { .mmi mov B = BOFFSET mov AOFFSET = A #if defined(TRMMKERNEL) && !defined(LEFT) adds KK = 1, KK #else nop __LINE__ #endif } ;; .align 16 .L999: { .mii nop __LINE__ mov ar.lc = ARLC mov pr = PR, -1 } { .mib nop __LINE__ #ifdef TRMMKERNEL mov ar.pfs = ARPFS #else nop __LINE__ #endif br.ret.sptk.many b0 } EPILOGUE OpenBLAS-0.2.20/kernel/ia64/zgemm_ncopy.S000066400000000000000000000335351313527062700176530ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
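*/
/*  zgemm_ncopy (ia64): packs a panel of a double-complex matrix    */
/*  into the contiguous work buffer consumed by the ZGEMM/ZTRMM     */
/*  kernels.  Arguments (r32-r36): M, N, A, LDA, B.  The outer      */
/*  loop (.L11/.L12) streams four lda-strided columns of A at a     */
/*  time, four complex elements per pipelined iteration; .L15       */
/*  copies the M % 4 remainder, and .L20/.L30 handle the two- and   */
/*  one-column tails when N is not a multiple of four.              */
/*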
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCHSIZE 64 #define WPREFETCHSIZE 32 #define LD LDF8 #define ST STF8_NTA #define TEMP r2 #define I r14 #define J r15 #define PREB r16 #define PREA r17 #define A1 r18 #define A2 r19 #define A3 r20 #define A4 r21 #define A5 r22 #define A6 r23 #define A7 r24 #define A8 r25 #define B1 r26 #define COUNT r28 #define ARLC r30 #define PR r31 #define M r32 #define N r33 #define A r34 #define LDA r35 #define B r36 PROLOGUE .prologue PROFCODE .body { .mii shladd LDA= LDA, ZBASE_SHIFT, r0 mov PR = pr shr J = N, 2 } ;; { .mii mov COUNT=r0 tbit.nz p10, p0 =M, 1 tbit.nz p11, p0 =M, 0 } ;; { .mmb nop __LINE__ nop __LINE__ nop __LINE__ } { .mib cmp.eq p8,p0 = 0, J mov ARLC = ar.lc (p8) br.cond.dpnt .L20 } ;; .align 32 .L11: { .mmi mov A1 = A add A2 = A, LDA mov pr.rot = 0 } { .mmi shladd A3 = LDA, 1, A adds B1 = 4 * SIZE, B shr I = M, 2 } ;; { .mmi shladd A4 = LDA, 1, A2 cmp.eq p16,p0 = r0, r0 mov ar.ec = 3 } { .mmi cmp.eq p6,p0 = 0,I adds I =-1, I adds J =-1, J } ;; { .mmi shladd A = LDA, 2, A adds A5 = 4 * SIZE, A1 adds A6 = 4 * SIZE, A2 } { .mmi adds A7 = 4 * SIZE, A3 adds A8 = 4 * SIZE, A4 adds PREA = PREFETCHSIZE * SIZE,A1 } ;; { .mmb nop __LINE__ nop __LINE__ nop __LINE__ } { .mib adds PREB = WPREFETCHSIZE * SIZE, B mov ar.lc = I (p6) br.cond.dpnt.few .L15 } ;; .align 32 .L12: { .mmb (p16) lfetch.nt1 [PREA], LDA (p16) lfetch.excl.nt1 [PREB], 16 * SIZE nop __LINE__ } ;; { .mmb (p18) ST [B ] = f34, SIZE (p18) ST [B1] = f82, SIZE nop __LINE__ } { .mmb (p16) LD f32 = [A1], SIZE (p16) LD f35 = [A5], SIZE nop __LINE__ } ;; { .mmb (p18) ST [B ] = f40, SIZE (p18) ST [B1] = f88, SIZE nop __LINE__ } { .mmb (p16) LD f38 = [A1], SIZE (p16) LD f41 = [A5], SIZE nop __LINE__ } ;; { .mmb (p18) ST [B ] = f58, SIZE (p18) ST [B1] = f106, SIZE nop __LINE__ } { .mmb (p16) LD f44 = [A1], SIZE (p16) LD f47 = [A5], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B ] = f64, 5 * SIZE (p18) ST [B1] = f112, 5 * SIZE tbit.z p0,p7 = COUNT,0 } { .mmb (p16) LD f50 = [A1], 5 * SIZE (p16) LD f53 = [A5], 5 * SIZE nop __LINE__ } ;; { .mmb (p18) ST [B ] = f46, SIZE (p18) ST [B1] = f94, SIZE nop __LINE__ } { .mmb (p16) LD f56 = [A2], SIZE (p16) LD f59 = [A6], SIZE nop __LINE__ } ;; { .mmb (p18) ST [B ] = f52, SIZE (p18) ST [B1] = f100, SIZE nop __LINE__ } { .mmb (p16) LD f62 = [A2], SIZE (p16) LD f65 = [A6], SIZE nop __LINE__ } ;; { .mmb (p18) ST [B ] = f70, SIZE (p18) ST [B1] = f118, SIZE nop __LINE__ } { .mmb (p16) LD f68 = [A2], SIZE (p16) LD f71 = [A6], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B ] = f76, 5 * SIZE (p18) ST [B1] = f124, 5 * SIZE shladd TEMP = LDA, 2, r0 } { .mmb (p16) LD f74 = [A2], 5 * SIZE (p16) LD f77 = [A6], 5 * SIZE nop __LINE__ } ;; { .mmb (p16) lfetch.nt1 [PREA], LDA (p16) lfetch.excl.nt1 [PREB], 16 * SIZE nop __LINE__ } ;; { .mmb (p18) ST [B ] = f37, SIZE (p18) ST [B1] = f85, SIZE nop __LINE__ } { .mmb (p16) LD f80 = [A3], SIZE (p16) LD f83 = [A7], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B ] = f43, SIZE (p18) ST [B1] = f91, SIZE adds TEMP = -16 * SIZE, TEMP } { .mmb (p16) LD f86 = [A3], SIZE (p16) LD f89 = [A7], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B ] = f61, SIZE (p18) ST [B1] = f109, SIZE (p7) sub PREA = PREA, TEMP } { .mmb (p16) LD f92 = [A3], SIZE (p16) LD f95 = [A7], SIZE nop __LINE__ } ;; { .mmb (p18) ST [B ] = f67, 5 * SIZE (p18) ST [B1] = f115, 5 * SIZE nop __LINE__ } { .mmb (p16) LD f98 = [A3], 5 * SIZE (p16) LD f101 = [A7], 5 * SIZE nop __LINE__ } ;; { .mmb (p18) ST [B ] = f49, SIZE 
(p18) ST [B1] = f97, SIZE nop __LINE__ } { .mmb (p16) LD f104 = [A4], SIZE (p16) LD f107 = [A8], SIZE nop __LINE__ } ;; { .mmb (p18) ST [B ] = f55, SIZE (p18) ST [B1] = f103, SIZE nop __LINE__ } { .mmb (p16) LD f110 = [A4], SIZE (p16) LD f113 = [A8], SIZE nop __LINE__ } ;; { .mmb (p18) ST [B ] = f73, SIZE (p18) ST [B1] = f121, SIZE nop __LINE__ } { .mmb (p16) LD f116 = [A4], SIZE (p16) LD f119 = [A8], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B ] = f79, 5 * SIZE (p18) ST [B1] = f127, 5 * SIZE (p16) adds COUNT = 1, COUNT } { .mmb (p16) LD f122 = [A4], 5 * SIZE (p16) LD f125 = [A8], 5 * SIZE br.ctop.sptk.few .L12 } ;; .align 32 .L15: { .mmb (p10) LD f32 = [A1], SIZE (p10) LD f40 = [A2], SIZE nop __LINE__ } ;; { .mmb (p10) LD f33 = [A1], SIZE (p10) LD f41 = [A2], SIZE nop __LINE__ } ;; { .mmb (p10) LD f34 = [A1], SIZE (p10) LD f42 = [A2], SIZE nop __LINE__ } ;; { .mmb (p10) LD f35 = [A1], SIZE (p10) LD f43 = [A2], SIZE nop __LINE__ } ;; { .mmb (p10) LD f50 = [A3], SIZE (p10) LD f60 = [A4], SIZE nop __LINE__ } ;; { .mmb (p10) LD f51 = [A3], SIZE (p10) LD f61 = [A4], SIZE nop __LINE__ } ;; { .mmb (p10) LD f52 = [A3], SIZE (p10) LD f62 = [A4], SIZE nop __LINE__ } ;; { .mmb (p10) LD f53 = [A3], SIZE (p10) LD f63 = [A4], SIZE nop __LINE__ } ;; { .mmb (p11) LD f36 = [A1], SIZE (p11) LD f44 = [A2], SIZE nop __LINE__ } ;; { .mmb (p11) LD f37 = [A1] (p11) LD f45 = [A2] nop __LINE__ } ;; { .mmb (p11) LD f54 = [A3], SIZE (p11) LD f64 = [A4], SIZE nop __LINE__ } ;; { .mmb (p11) LD f55 = [A3] (p11) LD f65 = [A4] nop __LINE__ } ;; { .mmb (p10) ST [B ] = f32, SIZE (p10) ST [B1] = f50, SIZE nop __LINE__ } ;; { .mmb (p10) ST [B ] = f33, SIZE (p10) ST [B1] = f51, SIZE nop __LINE__ } ;; { .mmb (p10) ST [B ] = f40, SIZE (p10) ST [B1] = f60, SIZE nop __LINE__ } ;; { .mmb (p10) ST [B ] = f41, 5 * SIZE (p10) ST [B1] = f61, 5 * SIZE nop __LINE__ } ;; { .mmb (p10) ST [B ] = f34, SIZE (p10) ST [B1] = f52, SIZE nop __LINE__ } ;; { .mmb (p10) ST [B ] = f35, SIZE (p10) ST [B1] = f53, SIZE nop __LINE__ } ;; { .mmb (p10) ST [B ] = f42, SIZE (p10) ST [B1] = f62, SIZE nop __LINE__ } ;; { .mmb (p10) ST [B ] = f43, 5 * SIZE (p10) ST [B1] = f63, 5 * SIZE nop __LINE__ } ;; { .mmb (p11) ST [B ] = f36, SIZE (p11) ST [B1] = f54, SIZE nop __LINE__ } ;; { .mmi (p11) ST [B ] = f37, SIZE (p11) ST [B1] = f55, SIZE mov COUNT = r0 } ;; { .mmi (p11) ST [B ] = f44, SIZE (p11) ST [B1] = f64, SIZE cmp.eq p0,p6 = 0,J } ;; { .mmb (p11) ST [B ] = f45, 5 * SIZE (p11) ST [B1] = f65, 5 * SIZE (p6) br.cond.dptk.few .L11 } ;; .align 32 .L20: { .mmi mov A1 = A add A2 = A,LDA mov pr.rot = 0 } { .mmi adds A5 = 4 * SIZE, A adds B1 = 4 * SIZE, B tbit.z p8, p0 = N, 1 } ;; { .mmi cmp.eq p16,p0 = r0,r0 adds PREA = PREFETCHSIZE * SIZE, A mov ar.ec = 3 } ;; { .mib adds PREB = WPREFETCHSIZE * SIZE,B shr I = M, 2 (p8) br.cond.dpnt.few .L30 } ;; { .mmi shladd A = LDA, 1, A cmp.eq p6, p0 = 0, I adds I = -1, I } ;; { .mib adds A6 = 4 * SIZE, A2 mov ar.lc = I (p6) br.cond.dpnt.few .L25 } ;; .align 32 .L21: { .mmb (p16) lfetch.nt1 [PREA],LDA (p16) lfetch.excl.nt1 [PREB ],16 * SIZE nop __LINE__ } { .mmb nop __LINE__ nop __LINE__ nop __LINE__ } ;; { .mmb (p18) ST [B ] = f34, SIZE (p18) ST [B1] = f46, SIZE nop __LINE__ } { .mmb (p16) LD f32 = [A1], SIZE (p16) LD f35 = [A5], SIZE nop __LINE__ } ;; { .mmb (p18) ST [B ] = f40, SIZE (p18) ST [B1] = f52, SIZE nop __LINE__ } { .mmb (p16) LD f38 = [A1], SIZE (p16) LD f41 = [A5], SIZE nop __LINE__ } ;; { .mmb (p18) ST [B ] = f58, SIZE (p18) ST [B1] = f70, SIZE nop __LINE__ } { .mmb (p16) LD f44 = [A1], SIZE (p16) LD f47 = [A5], 
SIZE nop __LINE__ } ;; { .mmi (p18) ST [B ] = f64, 5 * SIZE (p18) ST [B1] = f76, 5 * SIZE tbit.z p0,p7 = COUNT,0 } { .mmb (p16) LD f50 = [A1], 5 * SIZE (p16) LD f53 = [A5], 5 * SIZE nop __LINE__ } ;; { .mmi (p18) ST [B ] = f37, SIZE (p18) ST [B1] = f49, SIZE adds TEMP = -16 * SIZE,TEMP } { .mmb (p16) LD f56 = [A2], SIZE (p16) LD f59 = [A6], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B ] = f43, SIZE (p18) ST [B1] = f55, SIZE (p7) sub PREA = PREA,TEMP } { .mmb (p16) LD f62 = [A2], SIZE (p16) LD f65 = [A6], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B ] = f61, SIZE (p18) ST [B1] = f73, SIZE (p16) adds COUNT = 1,COUNT } { .mmb (p16) LD f68 = [A2], SIZE (p16) LD f71 = [A6], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B ] = f67, 5 * SIZE (p18) ST [B1] = f79, 5 * SIZE shladd TEMP = LDA,2,r0 } { .mmb (p16) LD f74 = [A2], 5 * SIZE (p16) LD f77 = [A6], 5 * SIZE br.ctop.sptk.few .L21 } ;; .align 32 .L25: { .mmb (p10) LD f32 = [A1], SIZE (p10) LD f40 = [A2], SIZE nop __LINE__ } ;; { .mmb (p10) LD f33 = [A1], SIZE (p10) LD f41 = [A2], SIZE nop __LINE__ } ;; { .mmb (p10) LD f34 = [A1], SIZE (p10) LD f42 = [A2], SIZE nop __LINE__ } ;; { .mmb (p10) LD f35 = [A1], SIZE (p10) LD f43 = [A2], SIZE nop __LINE__ } ;; { .mmb (p11) LD f36 = [A1], SIZE (p11) LD f44 = [A2], SIZE nop __LINE__ } ;; { .mmb (p11) LD f37 = [A1] (p11) LD f45 = [A2] nop __LINE__ } ;; { .mmb (p10) ST [B ] = f32, SIZE (p10) ST [B1] = f34, SIZE nop __LINE__ } ;; { .mmb (p10) ST [B ] = f33, SIZE (p10) ST [B1] = f35, SIZE nop __LINE__ } ;; { .mmb (p10) ST [B ] = f40, SIZE (p10) ST [B1] = f42, SIZE nop __LINE__ } ;; { .mmb (p10) ST [B ] = f41, 5 * SIZE (p10) ST [B1] = f43, 5 * SIZE nop __LINE__ } ;; { .mmi (p11) ST [B ] = f36, SIZE ;; (p11) ST [B ] = f37, SIZE nop __LINE__ } ;; { .mmi (p11) ST [B ] = f44, SIZE ;; (p11) ST [B ] = f45, SIZE nop __LINE__ } ;; .align 32 .L30: { .mmi mov A1 = A mov COUNT = r0 mov pr.rot = 0 } { .mmi adds A5 = 4 * SIZE,A adds B1 = 4 * SIZE,B tbit.z p8,p0 = N,0 } ;; { .mmi cmp.eq p16,p0 = r0,r0 nop __LINE__ mov ar.ec = 3 } { .mib nop __LINE__ shr I = M,2 (p8) br.cond.dptk.few .L999 } ;; { .mmi cmp.eq p6 ,p0 = 0, I adds PREA = PREFETCHSIZE * SIZE, A adds I = -1, I } ;; { .mib adds PREB = WPREFETCHSIZE * SIZE, B mov ar.lc = I (p6) br.cond.dpnt.few .L35 } ;; .align 32 .L31: { .mmi (p16) lfetch.nt1 [PREA], LDA (p16) lfetch.excl.nt1 [PREB ], 16 * SIZE tbit.z p0, p7 = COUNT, 0 } { .mmb nop __LINE__ nop __LINE__ nop __LINE__ } ;; { .mmi (p18) ST [B ] = f34, SIZE (p18) ST [B1] = f37, SIZE shladd TEMP = LDA,2,r0 } { .mmb (p16) LD f32 = [A1], SIZE (p16) LD f35 = [A5], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B ] = f40, SIZE (p18) ST [B1] = f43, SIZE adds TEMP = -16 * SIZE,TEMP } { .mmb (p16) LD f38 = [A1], SIZE (p16) LD f41 = [A5], SIZE nop __LINE__ } ;; { .mmb (p18) ST [B ] = f46, SIZE (p18) ST [B1] = f49, SIZE nop __LINE__ } { .mmi (p16) LD f44 = [A1], SIZE (p16) LD f47 = [A5], SIZE (p7) sub PREA = PREA,TEMP } ;; { .mmi (p18) ST [B ] = f52, 5 * SIZE (p18) ST [B1] = f55, 5 * SIZE (p16) adds COUNT = 1,COUNT } { .mmb (p16) LD f50 = [A1], 5 * SIZE (p16) LD f53 = [A5], 5 * SIZE br.ctop.sptk.few .L31 } ;; .align 32 .L35: { .mmi (p10) LD f32 = [A1], SIZE ;; (p10) LD f33 = [A1], SIZE nop __LINE__ } ;; { .mmi (p10) LD f34 = [A1], SIZE ;; (p10) LD f35 = [A1], SIZE nop __LINE__ } ;; { .mmi (p11) LD f36 = [A1], SIZE ;; (p11) LD f37 = [A1] nop __LINE__ } ;; { .mmi (p10) ST [B ] = f32, SIZE ;; (p10) ST [B ] = f33, SIZE nop __LINE__ } ;; { .mmi (p10) ST [B ] = f34, SIZE ;; (p10) ST [B ] = f35, SIZE nop __LINE__ } ;; { .mmi (p11) ST [B ] = f36, SIZE ;; 
(p11) ST [B ] = f37, SIZE nop __LINE__ } ;; .align 32 .L999: mov pr = PR,-1 mov ar.lc = ARLC br.ret.sptk.many b0 ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/zgemm_tcopy.S000066400000000000000000000346701313527062700176620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
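*/
/*  zgemm_tcopy (ia64): the transposed-layout companion of          */
/*  zgemm_ncopy.  It packs a panel of a double-complex matrix       */
/*  (arguments M, N, A, LDA, B in r32-r36) into B in the layout     */
/*  used for the other ZGEMM/ZTRMM operand: each outer pass (.L11)  */
/*  walks four lda-strided columns of A (J = M >> 2 iterations),    */
/*  the inner pipelined loop (.L12) steps along them four complex   */
/*  elements at a time, and the write pointer advances by LDB       */
/*  (= M << (BASE_SHIFT + 3) bytes) between packed blocks.  BO2     */
/*  and BO3 appear to locate the output areas that receive the      */
/*  N % 4 remainder (.L15/.L25/.L35).                               */
/*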
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCHSIZE 24 #define WPREFETCHSIZE 48 #define LD LDF8 #define ST STF8_NTA #define PREA r2 #define PREB r3 #define I r14 #define J r15 #define A1 r16 #define A2 r17 #define A3 r18 #define A4 r19 #define A5 r20 #define A6 r21 #define A7 r22 #define A8 r23 #define B1 r24 #define B2 r25 #define COUNT r26 #define TEMP r27 #define BO2 r28 #define BO3 r29 #define LDB r8 #define ARLC r30 #define PR r31 #define M r32 #define N r33 #define A r34 #define LDA r35 #define B r36 PROLOGUE .prologue PROFCODE .body { .mmi setf.sig f32 = M and r8 = -4, N mov ARLC = ar.lc } ;; { .mmi setf.sig f33 = r8 and r9 = -2, N mov PR = pr } ;; { .mmi setf.sig f34 = r9 shladd LDA = LDA, ZBASE_SHIFT, r0 shl LDB = M, BASE_SHIFT + 3 } ;; { .mfi nop __LINE__ xmpy.l f33 = f32, f33 shr J = M, 2 } { .mfi nop __LINE__ xmpy.l f34 = f32, f34 nop __LINE__ } ;; { .mmb getf.sig BO2 = f33 getf.sig BO3 = f34 nop __LINE__ } ;; { .mmi shladd BO2 = BO2, ZBASE_SHIFT, B shladd BO3 = BO3, ZBASE_SHIFT, B tbit.nz p10, p0 =N, 1 } { .mib cmp.eq p6, p0 = 0, J tbit.nz p11, p0 =N, 0 (p6) br.cond.dpnt .L20 } ;; .align 32 .L11: { .mmi mov A1 = A add A2 = A, LDA mov pr.rot = 0 } { .mmi shladd A3 = LDA, 1, A mov B1 = B shr I = N, 2 } ;; { .mmi shladd A4 = LDA, 1, A2 cmp.eq p16,p0 = r0, r0 mov ar.ec = 3 } { .mmi cmp.eq p6,p0 = 0,I adds I =-1, I adds J =-1, J } ;; { .mmi shladd A = LDA, 2, A adds A5 = 4 * SIZE, A1 adds A6 = 4 * SIZE, A2 } { .mmi adds A7 = 4 * SIZE, A3 adds A8 = 4 * SIZE, A4 adds PREA = PREFETCHSIZE * SIZE,A1 } ;; { .mmb adds B2 = 4 * SIZE, B adds PREB = WPREFETCHSIZE * SIZE, B nop __LINE__ } { .mib adds B = 32 * SIZE, B mov ar.lc = I (p6) br.cond.dpnt.few .L15 } ;; .L12: { .mmb (p16) lfetch.nt1 [PREA], LDA (p16) lfetch.excl.nt1 [PREB], LDB nop __LINE__ } { .mmb nop __LINE__ nop __LINE__ nop __LINE__ } ;; { .mmb (p18) ST [B1] = f34, SIZE (p18) ST [B2] = f37, SIZE nop __LINE__ } { .mmb (p16) LD f32 = [A1], SIZE (p16) LD f35 = [A5], SIZE nop __LINE__ } ;; { .mmb (p18) ST [B1] = f40, SIZE (p18) ST [B2] = f43, SIZE nop __LINE__ } { .mmb (p16) LD f38 = [A1], SIZE (p16) LD f41 = [A5], SIZE nop __LINE__ } ;; { .mmb (p18) ST [B1] = f46, SIZE (p18) ST [B2] = f49, SIZE nop __LINE__ } { .mmb (p16) LD f44 = [A1], SIZE (p16) LD f47 = [A5], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B1] = f52, 5 * SIZE (p18) ST [B2] = f55, 5 * SIZE tbit.z p0,p7 = COUNT,0 } { .mmb (p16) LD f50 = [A1], 5 * SIZE (p16) LD f53 = [A5], 5 * SIZE nop __LINE__ } ;; { .mmb (p18) ST [B1] = f58, SIZE (p18) ST [B2] = f61, SIZE nop __LINE__ } { .mmb (p16) LD f56 = [A2], SIZE (p16) LD f59 = [A6], SIZE nop __LINE__ } ;; { .mmb (p18) ST [B1] = f64, SIZE (p18) ST [B2] = f67, SIZE nop __LINE__ } { .mmb (p16) LD f62 = [A2], SIZE (p16) LD f65 = [A6], SIZE nop __LINE__ } ;; { .mmb (p18) ST [B1] = f70, SIZE (p18) ST [B2] = f73, SIZE nop __LINE__ } { .mmb (p16) LD f68 = [A2], SIZE (p16) LD f71 = [A6], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B1] = f76, 5 * SIZE (p18) ST [B2] = f79, 5 * SIZE shladd TEMP = LDA, 2, r0 } { .mmb (p16) LD f74 = [A2], 5 * SIZE (p16) LD f77 = [A6], 5 * SIZE nop __LINE__ } ;; { .mmb (p18) ST [B1] = f82, SIZE (p18) ST [B2] = f85, SIZE nop __LINE__ } { .mmb (p16) lfetch.nt1 [PREA], LDA (p16) lfetch.excl.nt1 [PREB], LDB nop __LINE__ } ;; { .mmi (p18) ST [B1] = f88, SIZE (p18) ST [B2] = f91, SIZE adds TEMP = -16 * SIZE, TEMP } { .mmb (p16) LD f80 = [A3], SIZE (p16) LD f83 = [A7], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B1] = f94, SIZE (p18) ST [B2] = 
f97, SIZE (p7) sub PREA = PREA, TEMP } { .mmb (p16) LD f86 = [A3], SIZE (p16) LD f89 = [A7], SIZE nop __LINE__ } ;; { .mmb (p18) ST [B1] = f100, 5 * SIZE (p18) ST [B2] = f103, 5 * SIZE nop __LINE__ } { .mmb (p16) LD f92 = [A3], SIZE (p16) LD f95 = [A7], SIZE nop __LINE__ } ;; { .mmb (p18) ST [B1] = f106, SIZE (p18) ST [B2] = f109, SIZE nop __LINE__ } { .mmb (p16) LD f98 = [A3], 5 * SIZE (p16) LD f101 = [A7], 5 * SIZE nop __LINE__ } ;; { .mmb (p18) ST [B1] = f112, SIZE (p18) ST [B2] = f115, SIZE nop __LINE__ } { .mmb (p16) LD f104 = [A4], SIZE (p16) LD f107 = [A8], SIZE nop __LINE__ } ;; { .mmb (p18) ST [B1] = f118, SIZE (p18) ST [B2] = f121, SIZE nop __LINE__ } { .mmb (p16) LD f110 = [A4], SIZE (p16) LD f113 = [A8], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B1] = f124, -27 * SIZE (p18) ST [B2] = f127, -27 * SIZE (p16) adds COUNT = 1, COUNT } { .mmb (p16) LD f116 = [A4], SIZE (p16) LD f119 = [A8], SIZE nop __LINE__ } ;; { .mmb (p18) add B1 = B1, LDB (p18) add B2 = B2, LDB nop __LINE__ } { .mmb (p16) LD f122 = [A4], 5 * SIZE (p16) LD f125 = [A8], 5 * SIZE br.ctop.sptk.few .L12 } ;; .align 32 .L15: { .mmb (p10) LD f32 = [A1], SIZE (p10) LD f40 = [A2], SIZE nop __LINE__ } ;; { .mmb (p10) LD f33 = [A1], SIZE (p10) LD f41 = [A2], SIZE nop __LINE__ } ;; { .mmb (p10) LD f34 = [A1], SIZE (p10) LD f42 = [A2], SIZE nop __LINE__ } ;; { .mmb (p10) LD f35 = [A1], SIZE (p10) LD f43 = [A2], SIZE nop __LINE__ } ;; { .mmb (p10) LD f50 = [A3], SIZE (p10) LD f60 = [A4], SIZE nop __LINE__ } ;; { .mmb (p10) LD f51 = [A3], SIZE (p10) LD f61 = [A4], SIZE nop __LINE__ } ;; { .mmb (p10) LD f52 = [A3], SIZE (p10) LD f62 = [A4], SIZE nop __LINE__ } ;; { .mmb (p10) LD f53 = [A3], SIZE (p10) LD f63 = [A4], SIZE nop __LINE__ } ;; { .mmb (p11) LD f36 = [A1], SIZE (p11) LD f44 = [A2], SIZE nop __LINE__ } ;; { .mmb (p11) LD f37 = [A1] (p11) LD f45 = [A2] nop __LINE__ } ;; { .mmb (p11) LD f54 = [A3], SIZE (p11) LD f64 = [A4], SIZE nop __LINE__ } ;; { .mmi (p11) LD f55 = [A3] (p11) LD f65 = [A4] adds B2 = 4 * SIZE, BO2 } ;; { .mmb (p10) ST [BO2] = f32, SIZE (p10) ST [B2] = f40, SIZE nop __LINE__ } ;; { .mmb (p10) ST [BO2] = f33, SIZE (p10) ST [B2] = f41, SIZE nop __LINE__ } ;; { .mmb (p10) ST [BO2] = f34, SIZE (p10) ST [B2] = f42, SIZE nop __LINE__ } ;; { .mmb (p10) ST [BO2] = f35, 5 * SIZE (p10) ST [B2] = f43, 5 * SIZE nop __LINE__ } ;; { .mmb (p10) ST [BO2] = f50, SIZE (p10) ST [B2] = f60, SIZE nop __LINE__ } ;; { .mmb (p10) ST [BO2] = f51, SIZE (p10) ST [B2] = f61, SIZE nop __LINE__ } ;; { .mmb (p10) ST [BO2] = f52, SIZE (p10) ST [B2] = f62, SIZE nop __LINE__ } ;; { .mmi (p10) ST [BO2] = f53, 5 * SIZE (p10) ST [B2] = f63 adds B2 = 4 * SIZE, BO3 } ;; { .mmb (p11) ST [BO3] = f36, SIZE (p11) ST [B2] = f54, SIZE nop __LINE__ } ;; { .mmi (p11) ST [BO3] = f37, SIZE (p11) ST [B2] = f55, SIZE mov COUNT = r0 } ;; { .mmi (p11) ST [BO3] = f44, SIZE (p11) ST [B2] = f64, SIZE cmp.eq p0,p6 = 0,J } ;; { .mmb (p11) ST [BO3] = f45, 5 * SIZE (p11) ST [B2] = f65, 5 * SIZE (p6) br.cond.dptk.few .L11 } ;; .align 32 .L20: { .mmi mov A1 = A add A2 = A, LDA mov pr.rot = 0 } { .mmi mov B1 = B adds PREA = PREFETCHSIZE * SIZE,A tbit.z p6, p0 = M, 1 } ;; { .mmi cmp.eq p16,p0 = r0, r0 adds B2 = 4 * SIZE, B mov ar.ec = 3 } { .mib adds PREB = WPREFETCHSIZE * SIZE, B shr I = N, 2 (p6) br.cond.dpnt .L30 } ;; { .mmi cmp.eq p6, p0 = 0, I adds I =-1, I nop __LINE__ } { .mmi shladd A = LDA, 1, A adds A5 = 4 * SIZE, A1 adds A6 = 4 * SIZE, A2 } ;; { .mmb nop __LINE__ nop __LINE__ nop __LINE__ } { .mib adds B = 16 * SIZE, B mov ar.lc = I (p6) br.cond.dpnt.few 
.L25 } ;; .L22: { .mmi (p16) lfetch.nt1 [PREA], LDA (p16) lfetch.excl.nt1 [PREB], LDB shladd TEMP = LDA, 1, r0 } ;; { .mmb (p18) ST [B1] = f34, SIZE (p18) ST [B2] = f37, SIZE nop __LINE__ } { .mmb (p16) LD f32 = [A1], SIZE (p16) LD f35 = [A5], SIZE nop __LINE__ } ;; { .mmb (p18) ST [B1] = f40, SIZE (p18) ST [B2] = f43, SIZE nop __LINE__ } { .mmb (p16) LD f38 = [A1], SIZE (p16) LD f41 = [A5], SIZE nop __LINE__ } ;; { .mmb (p18) ST [B1] = f46, SIZE (p18) ST [B2] = f49, SIZE nop __LINE__ } { .mmb (p16) LD f44 = [A1], SIZE (p16) LD f47 = [A5], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B1] = f52, 5 * SIZE (p18) ST [B2] = f55, 5 * SIZE tbit.z p0,p7 = COUNT,0 } { .mmb (p16) LD f50 = [A1], 5 * SIZE (p16) LD f53 = [A5], 5 * SIZE nop __LINE__ } ;; { .mmb (p18) ST [B1] = f58, SIZE (p18) ST [B2] = f61, SIZE nop __LINE__ } { .mmb (p16) LD f56 = [A2], SIZE (p16) LD f59 = [A6], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B1] = f64, SIZE (p18) ST [B2] = f67, SIZE adds TEMP = -16 * SIZE, TEMP } { .mmb (p16) LD f62 = [A2], SIZE (p16) LD f65 = [A6], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B1] = f70, SIZE (p18) ST [B2] = f73, SIZE (p7) sub PREA = PREA, TEMP } { .mmb (p16) LD f68 = [A2], SIZE (p16) LD f71 = [A6], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B1] = f76, -11 * SIZE (p18) ST [B2] = f79, -11 * SIZE (p16) adds COUNT = 1, COUNT } { .mmb (p16) LD f74 = [A2], 5 * SIZE (p16) LD f77 = [A6], 5 * SIZE nop __LINE__ } ;; { .mmb (p18) add B1 = B1, LDB (p18) add B2 = B2, LDB br.ctop.sptk.few .L22 } ;; .align 32 .L25: { .mmb (p10) LD f32 = [A1], SIZE (p10) LD f40 = [A2], SIZE nop __LINE__ } ;; { .mmb (p10) LD f33 = [A1], SIZE (p10) LD f41 = [A2], SIZE nop __LINE__ } ;; { .mmb (p10) LD f34 = [A1], SIZE (p10) LD f42 = [A2], SIZE nop __LINE__ } ;; { .mmb (p10) LD f35 = [A1], SIZE (p10) LD f43 = [A2], SIZE nop __LINE__ } ;; { .mmb (p11) LD f36 = [A1], SIZE (p11) LD f44 = [A2], SIZE nop __LINE__ } ;; { .mmi (p11) LD f37 = [A1] (p11) LD f45 = [A2] adds B2 = 4 * SIZE, BO2 } ;; { .mmb (p10) ST [BO2] = f32, SIZE (p10) ST [B2] = f40, SIZE nop __LINE__ } ;; { .mmb (p10) ST [BO2] = f33, SIZE (p10) ST [B2] = f41, SIZE nop __LINE__ } ;; { .mmb (p10) ST [BO2] = f34, SIZE (p10) ST [B2] = f42, SIZE nop __LINE__ } ;; { .mmb (p10) ST [BO2] = f35, 5 * SIZE (p10) ST [B2] = f43, 5 * SIZE nop __LINE__ } ;; { .mmi (p11) ST [BO3] = f36, SIZE ;; (p11) ST [BO3] = f37, SIZE mov COUNT = r0 } ;; { .mmi (p11) ST [BO3] = f44, SIZE ;; (p11) ST [BO3] = f45, SIZE nop __LINE__ } ;; .align 32 .L30: { .mmi mov A1 = A adds A5 = 4 * SIZE, A mov pr.rot = 0 } { .mmi mov B1 = B adds B2 = 4 * SIZE, B tbit.z p6, p0 = M, 0 } ;; { .mmb nop __LINE__ nop __LINE__ nop __LINE__ } { .mib cmp.eq p16,p0 = r0, r0 shr I = N, 2 (p6) br.cond.dpnt .L999 } ;; { .mmi cmp.eq p6, p0 = 0, I adds I =-1, I mov ar.ec = 3 } ;; { .mib nop __LINE__ mov ar.lc = I (p6) br.cond.dpnt.few .L35 } ;; .align 32 .L32: { .mmb (p18) ST [B1] = f34, SIZE (p18) ST [B2] = f37, SIZE nop __LINE__ } { .mmb (p16) LD f32 = [A1], SIZE (p16) LD f35 = [A5], SIZE nop __LINE__ } ;; { .mmb (p18) ST [B1] = f40, SIZE (p18) ST [B2] = f43, SIZE nop __LINE__ } { .mmb (p16) LD f38 = [A1], SIZE (p16) LD f41 = [A5], SIZE nop __LINE__ } ;; { .mmb (p18) ST [B1] = f46, SIZE (p18) ST [B2] = f49, SIZE nop __LINE__ } { .mmb (p16) LD f44 = [A1], SIZE (p16) LD f47 = [A5], SIZE nop __LINE__ } ;; { .mmi (p18) ST [B1] = f52, -3 * SIZE (p18) ST [B2] = f55, -3 * SIZE nop __LINE__ } { .mmb (p16) LD f50 = [A1], 5 * SIZE (p16) LD f53 = [A5], 5 * SIZE nop __LINE__ } ;; { .mmb nop __LINE__ nop __LINE__ nop __LINE__ } { .mmb (p18) add B1 = 
B1, LDB (p18) add B2 = B2, LDB br.ctop.sptk.few .L32 } ;; .align 32 .L35: { .mmi (p10) LD f32 = [A1], SIZE ;; (p10) LD f33 = [A1], SIZE nop __LINE__ } ;; { .mmi (p10) LD f34 = [A1], SIZE ;; (p10) LD f35 = [A1], SIZE nop __LINE__ } ;; { .mmi (p11) LD f36 = [A1], SIZE ;; (p11) LD f37 = [A1] nop __LINE__ } ;; { .mmi (p10) ST [BO2] = f32, SIZE ;; (p10) ST [BO2] = f33, SIZE nop __LINE__ } ;; { .mmi (p10) ST [BO2] = f34, SIZE ;; (p10) ST [BO2] = f35, SIZE nop __LINE__ } ;; { .mmi (p11) ST [BO3] = f36, SIZE ;; (p11) ST [BO3] = f37, SIZE nop __LINE__ } ;; .align 32 .L999: mov pr = PR, -1 mov ar.lc = ARLC br.ret.sptk.many b0 EPILOGUE OpenBLAS-0.2.20/kernel/ia64/zgemv_n.S000066400000000000000000001320271313527062700167650ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
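*/
/*  zgemv_n (ia64): computes  y := alpha * A * x + y  for double-   */
/*  complex data (ZGEMV, no transpose); the CONJ/XCONJ build macros */
/*  select the conjugation variant through the ADD1..ADD4 macros    */
/*  defined below.  Columns of A are processed eight at a time      */
/*  (.L11/.L12), then four (.L20), two (.L30) and one (.L40).       */
/*  When incy is not one complex element, the result is accumulated */
/*  into the caller-supplied BUFFER (zero-filled at .L02),          */
/*  presumably to be copied back to y at .L990.                     */
/*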
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define SP r12 #define M r32 #define N r33 #define A r37 #define LDA r38 #define X r39 #define INCX r34 #define Y r35 #define INCY r36 #define BUFFER r11 #define I r14 #define J r15 #define AO1 r16 #define AO2 r17 #define AO3 r18 #define AO4 r19 #define AO5 r20 #define AO6 r21 #define AO7 r22 #define AO8 r23 #define YLD1 r24 #define YLD2 r25 #define YST1 r26 #define YST2 r27 #define YY r28 #define XX r9 #define RPRE1 loc0 #define RPRE2 loc1 #define RPRE3 loc2 #define RPRE4 loc3 #define RPRE5 loc4 #define RPRE6 loc5 #define RPRE7 loc6 #define RPRE8 loc7 #define INCXM1 r2 #define INCX3M1 r3 #define AO9 loc8 #define AO10 loc9 #define AO11 loc10 #define AO12 loc11 #define AO13 loc12 #define AO14 loc13 #define AO15 loc14 #define AO16 loc15 #define PREB r8 #define ARLC r29 #define PR r30 #define ARPFS r31 #ifdef DOUBLE #define RPREFETCH (16 * 2 + 8) #else #define RPREFETCH (16 * 2 + 16) #endif #define PREFETCH lfetch.nt1 #define ALPHA_R f6 #define ALPHA_I f7 #if !defined(CONJ) && !defined(XCONJ) #define ADD1 FNMA #define ADD2 FMA #define ADD3 FNMA #define ADD4 FMA #elif defined(CONJ) && !defined(XCONJ) #define ADD1 FNMA #define ADD2 FMA #define ADD3 FMA #define ADD4 FNMA #elif !defined(CONJ) && defined(XCONJ) #define ADD1 FMA #define ADD2 FNMA #define ADD3 FNMA #define ADD4 FMA #else #define ADD1 FMA #define ADD2 FNMA #define ADD3 FMA #define ADD4 FNMA #endif PROLOGUE .prologue PROFCODE { .mmi .save ar.pfs, ARPFS alloc ARPFS = ar.pfs, 8, 16, 0, 0 mov ARLC = ar.lc } ;; mov PR = pr adds r14 = 16, SP adds r15 = 24, SP adds r16 = 32, SP adds r17 = 40, SP ;; adds r8 = -8 * 16, SP adds r9 = -7 * 16, SP adds SP = -8 * 16, SP ;; stf.spill [r8] = f16, 32 stf.spill [r9] = f17, 32 ;; stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 ;; stf.spill [r8] = f20, 32 stf.spill [r9] = f21, 32 ;; stf.spill [r8] = f22 stf.spill [r9] = f23 ;; ld8 INCX = [r14] ld8 Y = [r15] ld8 INCY = [r16] ld8 BUFFER = [r17] .body ;; cmp.ge p7, p0 = 0, M cmp.ge p6, p0 = 0, N mov ALPHA_R = f8 shladd INCX = INCX, ZBASE_SHIFT, r0 shladd LDA = LDA, ZBASE_SHIFT, r0 mov ALPHA_I = f9 ;; shladd INCY = INCY, ZBASE_SHIFT, r0 tbit.nz p8, p0 = A, BASE_SHIFT (p7) br.cond.dpnt .L999 ;; shladd XX = INCX, 1, X adds INCXM1 = -SIZE, INCX (p6) br.cond.dpnt .L999 ;; shladd INCX3M1 = INCX, 1, INCXM1 cmp.eq p10, p11 = 2 * SIZE, INCY mov YY = Y ;; (p11) mov YY = BUFFER mov YST1 = BUFFER shr J = M, 2 ;; { .mib adds YST2 = 4 * SIZE, BUFFER mov ar.lc = J (p10) br.cond.dptk .L10 } ;; .L02: STFD [YST1] = f0, 1 * SIZE STFD [YST2] = f0, 1 * SIZE ;; STFD [YST1] = f0, 1 * SIZE STFD [YST2] = f0, 1 * SIZE ;; STFD [YST1] = f0, 1 * SIZE STFD [YST2] = f0, 1 * SIZE ;; STFD [YST1] = f0, 5 * SIZE STFD [YST2] = f0, 5 * SIZE br.cloop.sptk.few .L02 ;; .L10: { .mmi mov AO1 = A nop __LINE__ shr J = N, 3 } ;; { .mmb add AO2 = LDA, A cmp.eq p6, p0 = r0, J (p6) br.cond.dpnt .L20 } ;; .align 16 .L11: LDFD f32 = [X], SIZE LDFD f36 = [XX], SIZE mov pr.rot= 0 ;; LDFD f33 = [X], INCXM1 LDFD f37 = [XX], INCXM1 mov YLD1 = YY ;; LDFD f34 = [X], SIZE LDFD f38 = [XX], SIZE adds YLD2 = 4 * SIZE, YY ;; LDFD f35 = [X], INCX3M1 LDFD f39 = [XX], INCX3M1 mov YST1 = YY ;; LDFD f40 = [X], SIZE LDFD f44 = [XX], SIZE adds YST2 = 4 * SIZE, YY ;; LDFD f41 = [X], INCXM1 LDFD f45 = [XX], INCXM1 shr I = M, 2 ;; LDFD f42 = [X], SIZE LDFD f46 = [XX], SIZE mov AO1 = A ;; LDFD f43 = [X], INCX3M1 LDFD f47 = [XX], INCX3M1 add AO2 = LDA, A ;; shladd AO3 = LDA, 1, A FMPY f8 = ALPHA_R, f32 mov ar.ec= 2 
shladd AO4 = LDA, 1, AO2 FMPY f9 = ALPHA_I, f32 ;; shladd AO5 = LDA, 1, AO3 FMPY f10 = ALPHA_R, f34 shladd AO6 = LDA, 1, AO4 FMPY f11 = ALPHA_I, f34 ;; FMPY f12 = ALPHA_R, f36 shladd AO7 = LDA, 1, AO5 FMPY f13 = ALPHA_I, f36 shladd AO8 = LDA, 1, AO6 FMPY f14 = ALPHA_R, f38 ;; adds PREB = RPREFETCH * SIZE, YLD1 FMPY f15 = ALPHA_I, f38 adds RPRE1 = RPREFETCH * SIZE, AO1 FMPY f16 = ALPHA_R, f40 adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 FMPY f17 = ALPHA_I, f40 adds RPRE3 = RPREFETCH * SIZE, AO3 FMPY f18 = ALPHA_R, f42 adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 FMPY f19 = ALPHA_I, f42 adds RPRE5 = RPREFETCH * SIZE, AO5 FMPY f20 = ALPHA_R, f44 adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6 FMPY f21 = ALPHA_I, f44 adds RPRE7 = RPREFETCH * SIZE, AO7 FMPY f22 = ALPHA_R, f46 adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8 FMPY f23 = ALPHA_I, f46 ;; ADD1 f8 = ALPHA_I, f33, f8 tbit.nz p14, p0 = M, 1 ADD2 f9 = ALPHA_R, f33, f9 shladd A = LDA, 3, A ADD1 f10 = ALPHA_I, f35, f10 adds AO9 = 4 * SIZE, AO1 ADD2 f11 = ALPHA_R, f35, f11 adds AO10 = 4 * SIZE, AO2 ADD1 f12 = ALPHA_I, f37, f12 adds AO11 = 4 * SIZE, AO3 ADD2 f13 = ALPHA_R, f37, f13 adds AO12 = 4 * SIZE, AO4 ADD1 f14 = ALPHA_I, f39, f14 adds AO13 = 4 * SIZE, AO5 ADD2 f15 = ALPHA_R, f39, f15 adds AO14 = 4 * SIZE, AO6 ADD1 f16 = ALPHA_I, f41, f16 adds AO15 = 4 * SIZE, AO7 ADD2 f17 = ALPHA_R, f41, f17 adds AO16 = 4 * SIZE, AO8 ADD1 f18 = ALPHA_I, f43, f18 cmp.eq p6, p0 = 0, I ADD2 f19 = ALPHA_R, f43, f19 cmp.eq p16, p0 = r0, r0 ADD1 f20 = ALPHA_I, f45, f20 adds I = -1, I ADD2 f21 = ALPHA_R, f45, f21 ;; { .mfi nop __LINE__ ADD1 f22 = ALPHA_I, f47, f22 mov ar.lc = I } { .mfb nop __LINE__ ADD2 f23 = ALPHA_R, f47, f23 (p6) br.cond.dpnt .L15 } ;; .align 16 .L12: { .mfi (p17) LDFD f89 = [AO8], 1 * SIZE (p17) FMA f101 = f8, f33, f101 (p16) tbit.nz.unc p12, p13 = I, 0 } { .mfi (p17) LDFD f93 = [AO16], 1 * SIZE (p17) FMA f113 = f8, f37, f113 } ;; { .mfi (p17) LDFD f90 = [AO8], 1 * SIZE (p17) FMA f104 = f9, f33, f104 (p16) adds I = -1, I } { .mfi (p17) LDFD f94 = [AO16], 1 * SIZE (p17) FMA f116 = f9, f37, f116 } ;; { .mfi (p17) LDFD f91 = [AO8], 1 * SIZE (p17) FMA f107 = f8, f35, f107 } { .mfi (p17) LDFD f95 = [AO16], 1 * SIZE (p17) FMA f119 = f8, f39, f119 } ;; { .mfi (p17) LDFD f92 = [AO8], 5 * SIZE (p17) FMA f110 = f9, f35, f110 } { .mfi (p17) LDFD f96 = [AO16], 5 * SIZE (p17) FMA f122 = f9, f39, f122 } ;; { .mfi (p12) lfetch.excl.nt2 [PREB], 16 * SIZE (p17) ADD3 f101 = f9, f34, f101 } { .mfi (p17) ADD3 f113 = f9, f38, f113 } ;; { .mfi (p16) LDFD f100 = [YLD1], 1 * SIZE (p17) ADD4 f104 = f8, f34, f104 } { .mfi (p16) LDFD f112 = [YLD2], 1 * SIZE (p17) ADD4 f116 = f8, f38, f116 } ;; { .mfi (p16) LDFD f103 = [YLD1], 1 * SIZE (p17) ADD3 f107 = f9, f36, f107 } { .mfi (p16) LDFD f115 = [YLD2], 1 * SIZE (p17) ADD3 f119 = f9, f40, f119 } ;; { .mfi (p12) PREFETCH [RPRE1], 16 * SIZE (p17) ADD4 f110 = f8, f36, f110 } { .mfi (p17) ADD4 f122 = f8, f40, f122 } ;; { .mfi (p16) LDFD f32 = [AO1], 1 * SIZE (p17) FMA f101 = f10, f41, f101 } { .mfi (p16) LDFD f36 = [AO9], 1 * SIZE (p17) FMA f113 = f10, f45, f113 } ;; { .mfi (p16) LDFD f33 = [AO1], 1 * SIZE (p17) FMA f104 = f11, f41, f104 } { .mfi (p16) LDFD f37 = [AO9], 1 * SIZE (p17) FMA f116 = f11, f45, f116 } ;; { .mfi (p16) LDFD f34 = [AO1], 1 * SIZE (p17) FMA f107 = f10, f43, f107 } { .mfi (p16) LDFD f38 = [AO9], 1 * SIZE (p17) FMA f119 = f10, f47, f119 } ;; { .mfi (p16) LDFD f35 = [AO1], 5 * SIZE (p17) FMA f110 = f11, f43, f110 } { .mfi (p16) LDFD f39 = [AO9], 5 * SIZE (p17) FMA f122 = f11, f47, f122 } ;; { .mfi (p17) ADD3 f101 = f11, 
f42, f101 } { .mfi (p17) ADD3 f113 = f11, f46, f113 } ;; { .mfi (p16) LDFD f106 = [YLD1], 1 * SIZE (p17) ADD4 f104 = f10, f42, f104 } { .mfi (p16) LDFD f118 = [YLD2], 1 * SIZE (p17) ADD4 f116 = f10, f46, f116 } ;; { .mfi (p16) LDFD f109 = [YLD1], 5 * SIZE (p17) ADD3 f107 = f11, f44, f107 } { .mfi (p16) LDFD f121 = [YLD2], 5 * SIZE (p17) ADD3 f119 = f11, f48, f119 } ;; { .mfi (p13) PREFETCH [RPRE2], 16 * SIZE (p17) ADD4 f110 = f10, f44, f110 } { .mfi (p17) ADD4 f122 = f10, f48, f122 } ;; { .mfi (p16) LDFD f40 = [AO2], 1 * SIZE (p17) FMA f101 = f12, f49, f101 } { .mfi (p16) LDFD f44 = [AO10], 1 * SIZE (p17) FMA f113 = f12, f53, f113 } ;; { .mfi (p16) LDFD f41 = [AO2], 1 * SIZE (p17) FMA f104 = f13, f49, f104 } { .mfi (p16) LDFD f45 = [AO10], 1 * SIZE (p17) FMA f116 = f13, f53, f116 } ;; { .mfi (p16) LDFD f42 = [AO2], 1 * SIZE (p17) FMA f107 = f12, f51, f107 } { .mfi (p16) LDFD f46 = [AO10], 1 * SIZE (p17) FMA f119 = f12, f55, f119 } ;; { .mfi (p16) LDFD f43 = [AO2], 5 * SIZE (p17) FMA f110 = f13, f51, f110 } { .mfi (p16) LDFD f47 = [AO10], 5 * SIZE (p17) FMA f122 = f13, f55, f122 } ;; { .mfi (p17) ADD3 f101 = f13, f50, f101 } { .mfi (p17) ADD3 f113 = f13, f54, f113 } ;; { .mfi (p17) ADD4 f104 = f12, f50, f104 } { .mfi (p17) ADD4 f116 = f12, f54, f116 } ;; { .mfi (p17) ADD3 f107 = f13, f52, f107 } { .mfi (p17) ADD3 f119 = f13, f56, f119 } ;; { .mfi (p12) PREFETCH [RPRE3], 16 * SIZE (p17) ADD4 f110 = f12, f52, f110 } { .mfi (p17) ADD4 f122 = f12, f56, f122 } ;; { .mfi (p16) LDFD f48 = [AO3], 1 * SIZE (p17) FMA f101 = f14, f57, f101 } { .mfi (p16) LDFD f52 = [AO11], 1 * SIZE (p17) FMA f113 = f14, f61, f113 } ;; { .mfi (p16) LDFD f49 = [AO3], 1 * SIZE (p17) FMA f104 = f15, f57, f104 } { .mfi (p16) LDFD f53 = [AO11], 1 * SIZE (p17) FMA f116 = f15, f61, f116 } ;; { .mfi (p16) LDFD f50 = [AO3], 1 * SIZE (p17) FMA f107 = f14, f59, f107 } { .mfi (p16) LDFD f54 = [AO11], 1 * SIZE (p17) FMA f119 = f14, f63, f119 } ;; { .mfi (p16) LDFD f51 = [AO3], 5 * SIZE (p17) FMA f110 = f15, f59, f110 } { .mfi (p16) LDFD f55 = [AO11], 5 * SIZE (p17) FMA f122 = f15, f63, f122 } ;; { .mfi (p17) ADD3 f101 = f15, f58, f101 } { .mfi (p17) ADD3 f113 = f15, f62, f113 } ;; { .mfi (p17) ADD4 f104 = f14, f58, f104 } { .mfi (p17) ADD4 f116 = f14, f62, f116 } ;; { .mfi (p17) ADD3 f107 = f15, f60, f107 } { .mfi (p17) ADD3 f119 = f15, f64, f119 } ;; { .mfi (p13) PREFETCH [RPRE4], 16 * SIZE (p17) ADD4 f110 = f14, f60, f110 } { .mfi (p17) ADD4 f122 = f14, f64, f122 } ;; { .mfi (p16) LDFD f56 = [AO4], 1 * SIZE (p17) FMA f101 = f16, f65, f101 } { .mfi (p16) LDFD f60 = [AO12], 1 * SIZE (p17) FMA f113 = f16, f69, f113 } ;; { .mfi (p16) LDFD f57 = [AO4], 1 * SIZE (p17) FMA f104 = f17, f65, f104 } { .mfi (p16) LDFD f61 = [AO12], 1 * SIZE (p17) FMA f116 = f17, f69, f116 } ;; { .mmf (p18) STFD [YST1] = f102, 1 * SIZE (p18) STFD [YST2] = f114, 1 * SIZE (p17) FMA f107 = f16, f67, f107 } { .mmf (p16) LDFD f58 = [AO4], 1 * SIZE (p16) LDFD f62 = [AO12], 1 * SIZE (p17) FMA f119 = f16, f71, f119 } ;; { .mmf (p18) STFD [YST1] = f105, 1 * SIZE (p18) STFD [YST2] = f117, 1 * SIZE (p17) FMA f110 = f17, f67, f110 } { .mmf (p16) LDFD f59 = [AO4], 5 * SIZE (p16) LDFD f63 = [AO12], 5 * SIZE (p17) FMA f122 = f17, f71, f122 } ;; { .mfi (p17) ADD3 f101 = f17, f66, f101 } { .mfi (p17) ADD3 f113 = f17, f70, f113 } ;; { .mfi (p17) ADD4 f104 = f16, f66, f104 } { .mfi (p17) ADD4 f116 = f16, f70, f116 } ;; { .mfi (p17) ADD3 f107 = f17, f68, f107 } { .mfi (p17) ADD3 f119 = f17, f72, f119 } ;; { .mfi (p12) PREFETCH [RPRE5], 16 * SIZE (p17) ADD4 f110 = f16, f68, f110 
} { .mfi (p17) ADD4 f122 = f16, f72, f122 } ;; { .mfi (p16) LDFD f64 = [AO5], 1 * SIZE (p17) FMA f101 = f18, f73, f101 } { .mfi (p16) LDFD f68 = [AO13], 1 * SIZE (p17) FMA f113 = f18, f77, f113 } ;; { .mfi (p16) LDFD f65 = [AO5], 1 * SIZE (p17) FMA f104 = f19, f73, f104 } { .mfi (p16) LDFD f69 = [AO13], 1 * SIZE (p17) FMA f116 = f19, f77, f116 } ;; { .mmf (p18) STFD [YST1] = f108, 1 * SIZE (p18) STFD [YST2] = f120, 1 * SIZE (p17) FMA f107 = f18, f75, f107 } { .mmf (p16) LDFD f66 = [AO5], 1 * SIZE (p16) LDFD f70 = [AO13], 1 * SIZE (p17) FMA f119 = f18, f79, f119 } ;; { .mmf (p18) STFD [YST1] = f111, 5 * SIZE (p18) STFD [YST2] = f123, 5 * SIZE (p17) FMA f110 = f19, f75, f110 } { .mmf (p16) LDFD f67 = [AO5], 5 * SIZE (p16) LDFD f71 = [AO13], 5 * SIZE (p17) FMA f122 = f19, f79, f122 } ;; { .mfi (p17) ADD3 f101 = f19, f74, f101 } { .mfi (p17) ADD3 f113 = f19, f78, f113 } ;; { .mfi (p17) ADD4 f104 = f18, f74, f104 } { .mfi (p17) ADD4 f116 = f18, f78, f116 } ;; { .mfi (p17) ADD3 f107 = f19, f76, f107 } { .mfi (p17) ADD3 f119 = f19, f80, f119 } ;; { .mfi (p13) PREFETCH [RPRE6], 16 * SIZE (p17) ADD4 f110 = f18, f76, f110 } { .mfi (p17) ADD4 f122 = f18, f80, f122 } ;; { .mfi (p16) LDFD f72 = [AO6], 1 * SIZE (p17) FMA f101 = f20, f81, f101 } { .mfi (p16) LDFD f76 = [AO14], 1 * SIZE (p17) FMA f113 = f20, f85, f113 } ;; { .mfi (p16) LDFD f73 = [AO6], 1 * SIZE (p17) FMA f104 = f21, f81, f104 } { .mfi (p16) LDFD f77 = [AO14], 1 * SIZE (p17) FMA f116 = f21, f85, f116 } ;; { .mfi (p16) LDFD f74 = [AO6], 1 * SIZE (p17) FMA f107 = f20, f83, f107 } { .mfi (p16) LDFD f78 = [AO14], 1 * SIZE (p17) FMA f119 = f20, f87, f119 } ;; { .mfi (p16) LDFD f75 = [AO6], 5 * SIZE (p17) FMA f110 = f21, f83, f110 } { .mfi (p16) LDFD f79 = [AO14], 5 * SIZE (p17) FMA f122 = f21, f87, f122 } ;; { .mfi (p17) ADD3 f101 = f21, f82, f101 } { .mfi (p17) ADD3 f113 = f21, f86, f113 } ;; { .mfi (p17) ADD4 f104 = f20, f82, f104 } { .mfi (p17) ADD4 f116 = f20, f86, f116 } ;; { .mfi (p17) ADD3 f107 = f21, f84, f107 } { .mfi (p17) ADD3 f119 = f21, f88, f119 } ;; { .mfi (p12) PREFETCH [RPRE7], 16 * SIZE (p17) ADD4 f110 = f20, f84, f110 } { .mfi (p17) ADD4 f122 = f20, f88, f122 } ;; { .mfi (p16) LDFD f80 = [AO7], 1 * SIZE (p17) FMA f101 = f22, f89, f101 } { .mfi (p16) LDFD f84 = [AO15], 1 * SIZE (p17) FMA f113 = f22, f93, f113 } ;; { .mfi (p16) LDFD f81 = [AO7], 1 * SIZE (p17) FMA f104 = f23, f89, f104 } { .mfi (p16) LDFD f85 = [AO15], 1 * SIZE (p17) FMA f116 = f23, f93, f116 } ;; { .mfi (p16) LDFD f82 = [AO7], 1 * SIZE (p17) FMA f107 = f22, f91, f107 } { .mfi (p16) LDFD f86 = [AO15], 1 * SIZE (p17) FMA f119 = f22, f95, f119 } ;; { .mfi (p16) LDFD f83 = [AO7], 5 * SIZE (p17) FMA f110 = f23, f91, f110 } { .mfi (p16) LDFD f87 = [AO15], 5 * SIZE (p17) FMA f122 = f23, f95, f122 } ;; { .mfi (p17) ADD3 f101 = f23, f90, f101 } { .mfi (p17) ADD3 f113 = f23, f94, f113 } ;; { .mfi (p17) ADD4 f104 = f22, f90, f104 } { .mfi (p17) ADD4 f116 = f22, f94, f116 } ;; { .mfi (p17) ADD3 f107 = f23, f92, f107 } { .mfi (p17) ADD3 f119 = f23, f96, f119 } ;; { .mfi (p13) PREFETCH [RPRE8], 16 * SIZE (p17) ADD4 f110 = f22, f92, f110 } { .mfb (p17) ADD4 f122 = f22, f96, f122 br.ctop.sptk.few .L12 } ;; .align 16 .L15: { .mmi (p18) STFD [YST1] = f102, 1 * SIZE (p18) STFD [YST2] = f114, 1 * SIZE tbit.nz p15, p0 = M, 0 } { .mmi (p14) LDFD f32 = [AO1], 1 * SIZE (p14) LDFD f80 = [YLD1], 1 * SIZE cmp.lt p6, p0 = 1, J } ;; { .mmi (p18) STFD [YST1] = f105, 1 * SIZE (p18) STFD [YST2] = f117, 1 * SIZE adds J = -1, J } { (p14) LDFD f33 = [AO1], 1 * SIZE (p14) LDFD f81 = [YLD1], 1 * 
SIZE and I = 3, M } ;; { .mmi (p18) STFD [YST1] = f108, 1 * SIZE (p18) STFD [YST2] = f120, 1 * SIZE (p6) cmp.eq.unc p7, p0 = I, r0 } { .mmi (p14) LDFD f34 = [AO1], 1 * SIZE (p14) LDFD f82 = [YLD1], 1 * SIZE } ;; { .mmi (p18) STFD [YST1] = f111, 5 * SIZE (p18) STFD [YST2] = f123, 5 * SIZE } { .mmb (p14) LDFD f35 = [AO1], 1 * SIZE (p14) LDFD f83 = [YLD1], 1 * SIZE (p7) br.cond.dptk .L11 } ;; (p15) LDFD f36 = [AO1], 1 * SIZE (p15) LDFD f84 = [YLD1], 1 * SIZE ;; (p15) LDFD f37 = [AO1], 1 * SIZE (p15) LDFD f85 = [YLD1], 1 * SIZE ;; (p14) LDFD f38 = [AO2], 1 * SIZE (p14) LDFD f44 = [AO3], 1 * SIZE ;; (p14) LDFD f39 = [AO2], 1 * SIZE (p14) LDFD f45 = [AO3], 1 * SIZE ;; (p14) LDFD f40 = [AO2], 1 * SIZE (p14) LDFD f46 = [AO3], 1 * SIZE ;; (p14) LDFD f41 = [AO2], 1 * SIZE (p14) LDFD f47 = [AO3], 1 * SIZE (p14) FMA f80 = f8, f32, f80 ;; (p15) LDFD f42 = [AO2], 1 * SIZE (p15) LDFD f48 = [AO3], 1 * SIZE (p14) FMA f81 = f9, f32, f81 ;; (p15) LDFD f43 = [AO2], 1 * SIZE (p15) LDFD f49 = [AO3], 1 * SIZE (p14) FMA f82 = f8, f34, f82 ;; (p14) LDFD f50 = [AO4], 1 * SIZE (p14) LDFD f56 = [AO5], 1 * SIZE (p14) FMA f83 = f9, f34, f83 ;; (p14) LDFD f51 = [AO4], 1 * SIZE (p14) LDFD f57 = [AO5], 1 * SIZE (p15) FMA f84 = f8, f36, f84 ;; (p14) LDFD f52 = [AO4], 1 * SIZE (p14) LDFD f58 = [AO5], 1 * SIZE (p15) FMA f85 = f9, f36, f85 ;; (p14) LDFD f53 = [AO4], 1 * SIZE (p14) LDFD f59 = [AO5], 1 * SIZE (p14) ADD3 f80 = f9, f33, f80 ;; (p15) LDFD f54 = [AO4], 1 * SIZE (p15) LDFD f60 = [AO5], 1 * SIZE (p14) ADD4 f81 = f8, f33, f81 ;; (p15) LDFD f55 = [AO4], 1 * SIZE (p15) LDFD f61 = [AO5], 1 * SIZE (p14) ADD3 f82 = f9, f35, f82 ;; (p14) LDFD f62 = [AO6], 1 * SIZE (p14) LDFD f68 = [AO7], 1 * SIZE (p14) ADD4 f83 = f8, f35, f83 ;; (p14) LDFD f63 = [AO6], 1 * SIZE (p14) LDFD f69 = [AO7], 1 * SIZE (p15) ADD3 f84 = f9, f37, f84 ;; (p14) LDFD f64 = [AO6], 1 * SIZE (p14) LDFD f70 = [AO7], 1 * SIZE (p15) ADD4 f85 = f8, f37, f85 ;; (p14) LDFD f65 = [AO6], 1 * SIZE (p14) LDFD f71 = [AO7], 1 * SIZE (p14) FMA f80 = f10, f38, f80 ;; (p15) LDFD f66 = [AO6], 1 * SIZE (p15) LDFD f72 = [AO7], 1 * SIZE (p14) FMA f81 = f11, f38, f81 ;; (p15) LDFD f67 = [AO6], 1 * SIZE (p15) LDFD f73 = [AO7], 1 * SIZE (p14) FMA f82 = f10, f40, f82 ;; (p14) LDFD f74 = [AO8], 1 * SIZE (p14) FMA f83 = f11, f40, f83 ;; (p14) LDFD f75 = [AO8], 1 * SIZE (p15) FMA f84 = f10, f42, f84 ;; (p14) LDFD f76 = [AO8], 1 * SIZE (p15) FMA f85 = f11, f42, f85 ;; (p14) LDFD f77 = [AO8], 1 * SIZE (p14) ADD3 f80 = f11, f39, f80 ;; (p15) LDFD f78 = [AO8], 1 * SIZE (p14) ADD4 f81 = f10, f39, f81 ;; (p15) LDFD f79 = [AO8], 1 * SIZE (p14) ADD3 f82 = f11, f41, f82 (p14) ADD4 f83 = f10, f41, f83 (p15) ADD3 f84 = f11, f43, f84 (p15) ADD4 f85 = f10, f43, f85 ;; (p14) FMA f80 = f12, f44, f80 (p14) FMA f81 = f13, f44, f81 (p14) FMA f82 = f12, f46, f82 (p14) FMA f83 = f13, f46, f83 (p15) FMA f84 = f12, f48, f84 (p15) FMA f85 = f13, f48, f85 ;; (p14) ADD3 f80 = f13, f45, f80 (p14) ADD4 f81 = f12, f45, f81 (p14) ADD3 f82 = f13, f47, f82 (p14) ADD4 f83 = f12, f47, f83 (p15) ADD3 f84 = f13, f49, f84 (p15) ADD4 f85 = f12, f49, f85 ;; (p14) FMA f80 = f14, f50, f80 (p14) FMA f81 = f15, f50, f81 (p14) FMA f82 = f14, f52, f82 (p14) FMA f83 = f15, f52, f83 (p15) FMA f84 = f14, f54, f84 (p15) FMA f85 = f15, f54, f85 ;; (p14) ADD3 f80 = f15, f51, f80 (p14) ADD4 f81 = f14, f51, f81 (p14) ADD3 f82 = f15, f53, f82 (p14) ADD4 f83 = f14, f53, f83 (p15) ADD3 f84 = f15, f55, f84 (p15) ADD4 f85 = f14, f55, f85 ;; (p14) FMA f80 = f16, f56, f80 (p14) FMA f81 = f17, f56, f81 (p14) FMA f82 = f16, f58, f82 (p14) FMA 
f83 = f17, f58, f83 (p15) FMA f84 = f16, f60, f84 (p15) FMA f85 = f17, f60, f85 ;; (p14) ADD3 f80 = f17, f57, f80 (p14) ADD4 f81 = f16, f57, f81 (p14) ADD3 f82 = f17, f59, f82 (p14) ADD4 f83 = f16, f59, f83 (p15) ADD3 f84 = f17, f61, f84 (p15) ADD4 f85 = f16, f61, f85 ;; (p14) FMA f80 = f18, f62, f80 (p14) FMA f81 = f19, f62, f81 (p14) FMA f82 = f18, f64, f82 (p14) FMA f83 = f19, f64, f83 (p15) FMA f84 = f18, f66, f84 (p15) FMA f85 = f19, f66, f85 ;; (p14) ADD3 f80 = f19, f63, f80 (p14) ADD4 f81 = f18, f63, f81 (p14) ADD3 f82 = f19, f65, f82 (p14) ADD4 f83 = f18, f65, f83 (p15) ADD3 f84 = f19, f67, f84 (p15) ADD4 f85 = f18, f67, f85 ;; (p14) FMA f80 = f20, f68, f80 (p14) FMA f81 = f21, f68, f81 (p14) FMA f82 = f20, f70, f82 (p14) FMA f83 = f21, f70, f83 (p15) FMA f84 = f20, f72, f84 (p15) FMA f85 = f21, f72, f85 ;; (p14) ADD3 f80 = f21, f69, f80 (p14) ADD4 f81 = f20, f69, f81 (p14) ADD3 f82 = f21, f71, f82 (p14) ADD4 f83 = f20, f71, f83 (p15) ADD3 f84 = f21, f73, f84 (p15) ADD4 f85 = f20, f73, f85 ;; (p14) FMA f80 = f22, f74, f80 (p14) FMA f81 = f23, f74, f81 (p14) FMA f82 = f22, f76, f82 (p14) FMA f83 = f23, f76, f83 (p15) FMA f84 = f22, f78, f84 (p15) FMA f85 = f23, f78, f85 ;; (p14) ADD3 f80 = f23, f75, f80 (p14) ADD4 f81 = f22, f75, f81 (p14) ADD3 f82 = f23, f77, f82 (p14) ADD4 f83 = f22, f77, f83 (p15) ADD3 f84 = f23, f79, f84 (p15) ADD4 f85 = f22, f79, f85 ;; (p14) STFD [YST1] = f80, 1 * SIZE ;; (p14) STFD [YST1] = f81, 1 * SIZE ;; (p14) STFD [YST1] = f82, 1 * SIZE ;; (p14) STFD [YST1] = f83, 1 * SIZE ;; (p15) STFD [YST1] = f84, 1 * SIZE ;; (p15) STFD [YST1] = f85, 1 * SIZE (p6) br.cond.dptk .L11 ;; .L20: { .mmi mov YLD1 = YY adds YLD2 = 4 * SIZE, YY tbit.z p6, p0 = N, 2 } ;; { .mmb mov YST1 = YY adds YST2 = 4 * SIZE, YY (p6) br.cond.dpnt .L30 } ;; LDFD f32 = [X], SIZE LDFD f36 = [XX], SIZE mov AO1 = A ;; LDFD f33 = [X], INCXM1 LDFD f37 = [XX], INCXM1 add AO2 = LDA, A ;; LDFD f34 = [X], SIZE LDFD f38 = [XX], SIZE shladd AO3 = LDA, 1, A ;; LDFD f35 = [X], INCX3M1 LDFD f39 = [XX], INCX3M1 shladd AO4 = LDA, 1, AO2 ;; shladd A = LDA, 2, A FMPY f8 = ALPHA_R, f32 adds AO9 = 4 * SIZE, AO1 FMPY f9 = ALPHA_I, f32 adds AO10 = 4 * SIZE, AO2 FMPY f10 = ALPHA_R, f34 adds AO11 = 4 * SIZE, AO3 FMPY f11 = ALPHA_I, f34 adds AO12 = 4 * SIZE, AO4 FMPY f12 = ALPHA_R, f36 mov pr.rot= 0 FMPY f13 = ALPHA_I, f36 shr I = M, 2 FMPY f14 = ALPHA_R, f38 tbit.nz p14, p0 = M, 1 FMPY f15 = ALPHA_I, f38 ;; { .mfi cmp.eq p6, p0 = 0, I ADD1 f8 = ALPHA_I, f33, f8 mov ar.ec= 2 } ADD2 f9 = ALPHA_R, f33, f9 adds I = -1, I ADD1 f10 = ALPHA_I, f35, f10 adds PREB = RPREFETCH * SIZE, YLD1 ADD2 f11 = ALPHA_R, f35, f11 adds RPRE1 = RPREFETCH * SIZE, AO1 ADD1 f12 = ALPHA_I, f37, f12 adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 ADD2 f13 = ALPHA_R, f37, f13 adds RPRE3 = RPREFETCH * SIZE, AO3 ADD1 f14 = ALPHA_I, f39, f14 ADD2 f15 = ALPHA_R, f39, f15 ;; { .mib cmp.eq p16, p0 = r0, r0 mov ar.lc = I (p6) br.cond.dpnt .L25 } ;; .align 16 .L22: { .mfi (p17) LDFD f57 = [AO4], 1 * SIZE (p17) FMA f101 = f8, f33, f101 (p16) tbit.nz.unc p12, p13 = I, 0 } { .mfi (p17) LDFD f61 = [AO12], 1 * SIZE (p17) FMA f113 = f8, f37, f113 } ;; { .mfi (p17) LDFD f58 = [AO4], 1 * SIZE (p17) FMA f104 = f9, f33, f104 (p16) adds I = -1, I } { .mfi (p17) LDFD f62 = [AO12], 1 * SIZE (p17) FMA f116 = f9, f37, f116 } ;; { .mfi (p17) LDFD f59 = [AO4], 1 * SIZE (p17) FMA f107 = f8, f35, f107 } { .mfi (p17) LDFD f63 = [AO12], 1 * SIZE (p17) FMA f119 = f8, f39, f119 } ;; { .mfi (p17) LDFD f60 = [AO4], 5 * SIZE (p17) FMA f110 = f9, f35, f110 } { .mfi (p17) LDFD f64 = 
[AO12], 5 * SIZE (p17) FMA f122 = f9, f39, f122 } ;; { .mfi (p12) lfetch.excl.nt2 [PREB], 16 * SIZE (p17) ADD3 f101 = f9, f34, f101 } { .mfi (p17) ADD3 f113 = f9, f38, f113 } ;; { .mfi (p16) LDFD f100 = [YLD1], 1 * SIZE (p17) ADD4 f104 = f8, f34, f104 } { .mfi (p16) LDFD f112 = [YLD2], 1 * SIZE (p17) ADD4 f116 = f8, f38, f116 } ;; { .mfi (p16) LDFD f103 = [YLD1], 1 * SIZE (p17) ADD3 f107 = f9, f36, f107 } { .mfi (p16) LDFD f115 = [YLD2], 1 * SIZE (p17) ADD3 f119 = f9, f40, f119 } ;; { .mfi (p12) PREFETCH [RPRE1], 16 * SIZE (p17) ADD4 f110 = f8, f36, f110 } { .mfi (p17) ADD4 f122 = f8, f40, f122 } ;; { .mfi (p16) LDFD f32 = [AO1], 1 * SIZE (p17) FMA f101 = f10, f41, f101 } { .mfi (p16) LDFD f36 = [AO9], 1 * SIZE (p17) FMA f113 = f10, f45, f113 } ;; { .mfi (p16) LDFD f33 = [AO1], 1 * SIZE (p17) FMA f104 = f11, f41, f104 } { .mfi (p16) LDFD f37 = [AO9], 1 * SIZE (p17) FMA f116 = f11, f45, f116 } ;; { .mfi (p16) LDFD f34 = [AO1], 1 * SIZE (p17) FMA f107 = f10, f43, f107 } { .mfi (p16) LDFD f38 = [AO9], 1 * SIZE (p17) FMA f119 = f10, f47, f119 } ;; { .mfi (p16) LDFD f35 = [AO1], 5 * SIZE (p17) FMA f110 = f11, f43, f110 } { .mfi (p16) LDFD f39 = [AO9], 5 * SIZE (p17) FMA f122 = f11, f47, f122 } ;; { .mfi (p16) LDFD f106 = [YLD1], 1 * SIZE (p17) ADD3 f101 = f11, f42, f101 } { .mfi (p16) LDFD f118 = [YLD2], 1 * SIZE (p17) ADD3 f113 = f11, f46, f113 } ;; { .mfi (p16) LDFD f109 = [YLD1], 5 * SIZE (p17) ADD4 f104 = f10, f42, f104 } { .mfi (p16) LDFD f121 = [YLD2], 5 * SIZE (p17) ADD4 f116 = f10, f46, f116 } ;; { .mfi (p17) ADD3 f107 = f11, f44, f107 } { .mfi (p17) ADD3 f119 = f11, f48, f119 } ;; { .mfi (p13) PREFETCH [RPRE2], 16 * SIZE (p17) ADD4 f110 = f10, f44, f110 } { .mfi (p17) ADD4 f122 = f10, f48, f122 } ;; { .mfi (p16) LDFD f40 = [AO2], 1 * SIZE (p17) FMA f101 = f12, f49, f101 } { .mfi (p16) LDFD f44 = [AO10], 1 * SIZE (p17) FMA f113 = f12, f53, f113 } ;; { .mfi (p16) LDFD f41 = [AO2], 1 * SIZE (p17) FMA f104 = f13, f49, f104 } { .mfi (p16) LDFD f45 = [AO10], 1 * SIZE (p17) FMA f116 = f13, f53, f116 } ;; { .mmf (p18) STFD [YST1] = f102, 1 * SIZE (p18) STFD [YST2] = f114, 1 * SIZE (p17) FMA f107 = f12, f51, f107 } { .mmf (p16) LDFD f42 = [AO2], 1 * SIZE (p16) LDFD f46 = [AO10], 1 * SIZE (p17) FMA f119 = f12, f55, f119 } ;; { .mmf (p18) STFD [YST1] = f105, 1 * SIZE (p18) STFD [YST2] = f117, 1 * SIZE (p17) FMA f110 = f13, f51, f110 } { .mmf (p16) LDFD f43 = [AO2], 5 * SIZE (p16) LDFD f47 = [AO10], 5 * SIZE (p17) FMA f122 = f13, f55, f122 } ;; { .mfi (p17) ADD3 f101 = f13, f50, f101 } { .mfi (p17) ADD3 f113 = f13, f54, f113 } ;; { .mfi (p17) ADD4 f104 = f12, f50, f104 } { .mfi (p17) ADD4 f116 = f12, f54, f116 } ;; { .mfi (p17) ADD3 f107 = f13, f52, f107 } { .mfi (p17) ADD3 f119 = f13, f56, f119 } ;; { .mfi (p12) PREFETCH [RPRE3], 16 * SIZE (p17) ADD4 f110 = f12, f52, f110 } { .mfi (p17) ADD4 f122 = f12, f56, f122 } ;; { .mfi (p16) LDFD f48 = [AO3], 1 * SIZE (p17) FMA f101 = f14, f57, f101 } { .mfi (p16) LDFD f52 = [AO11], 1 * SIZE (p17) FMA f113 = f14, f61, f113 } ;; { .mfi (p16) LDFD f49 = [AO3], 1 * SIZE (p17) FMA f104 = f15, f57, f104 } { .mfi (p16) LDFD f53 = [AO11], 1 * SIZE (p17) FMA f116 = f15, f61, f116 } ;; { .mmf (p18) STFD [YST1] = f108, 1 * SIZE (p18) STFD [YST2] = f120, 1 * SIZE (p17) FMA f107 = f14, f59, f107 } { .mmf (p16) LDFD f50 = [AO3], 1 * SIZE (p16) LDFD f54 = [AO11], 1 * SIZE (p17) FMA f119 = f14, f63, f119 } ;; { .mmf (p18) STFD [YST1] = f111, 5 * SIZE (p18) STFD [YST2] = f123, 5 * SIZE (p17) FMA f110 = f15, f59, f110 } { .mmf (p16) LDFD f51 = [AO3], 5 * SIZE (p16) LDFD f55 
= [AO11], 5 * SIZE (p17) FMA f122 = f15, f63, f122 } ;; { .mfi (p17) ADD3 f101 = f15, f58, f101 } { .mfi (p17) ADD3 f113 = f15, f62, f113 } ;; { .mfi (p17) ADD4 f104 = f14, f58, f104 } { .mfi (p17) ADD4 f116 = f14, f62, f116 } ;; { .mfi (p17) ADD3 f107 = f15, f60, f107 } { .mfi (p17) ADD3 f119 = f15, f64, f119 } ;; { .mfi (p13) PREFETCH [RPRE4], 16 * SIZE (p17) ADD4 f110 = f14, f60, f110 } { .mfb (p17) ADD4 f122 = f14, f64, f122 br.ctop.sptk.few .L22 } ;; .align 16 .L25: { .mmi (p18) STFD [YST1] = f102, 1 * SIZE (p18) STFD [YST2] = f114, 1 * SIZE tbit.nz p15, p0 = M, 0 } { .mmi (p14) LDFD f32 = [AO1], 1 * SIZE (p14) LDFD f80 = [YLD1], 1 * SIZE } ;; { .mmi (p18) STFD [YST1] = f105, 1 * SIZE (p18) STFD [YST2] = f117, 1 * SIZE } { .mmi (p14) LDFD f33 = [AO1], 1 * SIZE (p14) LDFD f81 = [YLD1], 1 * SIZE } ;; { .mmi (p18) STFD [YST1] = f108, 1 * SIZE (p18) STFD [YST2] = f120, 1 * SIZE } { .mmi (p14) LDFD f34 = [AO1], 1 * SIZE (p14) LDFD f82 = [YLD1], 1 * SIZE } ;; { .mmi (p18) STFD [YST1] = f111, 5 * SIZE (p18) STFD [YST2] = f123, 5 * SIZE } { .mmi (p14) LDFD f35 = [AO1], 1 * SIZE (p14) LDFD f83 = [YLD1], 1 * SIZE } ;; (p15) LDFD f36 = [AO1], 1 * SIZE (p15) LDFD f84 = [YLD1], 1 * SIZE ;; (p15) LDFD f37 = [AO1], 1 * SIZE (p15) LDFD f85 = [YLD1], 1 * SIZE ;; (p14) LDFD f38 = [AO2], 1 * SIZE (p14) LDFD f44 = [AO3], 1 * SIZE ;; (p14) LDFD f39 = [AO2], 1 * SIZE (p14) LDFD f45 = [AO3], 1 * SIZE (p14) FMA f80 = f8, f32, f80 ;; (p14) LDFD f40 = [AO2], 1 * SIZE (p14) LDFD f46 = [AO3], 1 * SIZE (p14) FMA f81 = f9, f32, f81 ;; (p14) LDFD f41 = [AO2], 1 * SIZE (p14) LDFD f47 = [AO3], 1 * SIZE (p14) FMA f82 = f8, f34, f82 ;; (p15) LDFD f42 = [AO2], 1 * SIZE (p15) LDFD f48 = [AO3], 1 * SIZE (p14) FMA f83 = f9, f34, f83 ;; (p15) LDFD f43 = [AO2], 1 * SIZE (p15) LDFD f49 = [AO3], 1 * SIZE (p15) FMA f84 = f8, f36, f84 ;; (p14) LDFD f50 = [AO4], 1 * SIZE (p15) FMA f85 = f9, f36, f85 ;; (p14) LDFD f51 = [AO4], 1 * SIZE (p14) ADD3 f80 = f9, f33, f80 ;; (p14) LDFD f52 = [AO4], 1 * SIZE (p14) ADD4 f81 = f8, f33, f81 ;; (p14) LDFD f53 = [AO4], 1 * SIZE (p14) ADD3 f82 = f9, f35, f82 ;; (p15) LDFD f54 = [AO4], 1 * SIZE (p14) ADD4 f83 = f8, f35, f83 ;; (p15) LDFD f55 = [AO4], 1 * SIZE (p15) ADD3 f84 = f9, f37, f84 (p15) ADD4 f85 = f8, f37, f85 ;; (p14) FMA f80 = f10, f38, f80 (p14) FMA f81 = f11, f38, f81 (p14) FMA f82 = f10, f40, f82 (p14) FMA f83 = f11, f40, f83 (p15) FMA f84 = f10, f42, f84 (p15) FMA f85 = f11, f42, f85 ;; (p14) ADD3 f80 = f11, f39, f80 (p14) ADD4 f81 = f10, f39, f81 (p14) ADD3 f82 = f11, f41, f82 (p14) ADD4 f83 = f10, f41, f83 (p15) ADD3 f84 = f11, f43, f84 (p15) ADD4 f85 = f10, f43, f85 ;; (p14) FMA f80 = f12, f44, f80 (p14) FMA f81 = f13, f44, f81 (p14) FMA f82 = f12, f46, f82 (p14) FMA f83 = f13, f46, f83 (p15) FMA f84 = f12, f48, f84 (p15) FMA f85 = f13, f48, f85 ;; (p14) ADD3 f80 = f13, f45, f80 (p14) ADD4 f81 = f12, f45, f81 (p14) ADD3 f82 = f13, f47, f82 (p14) ADD4 f83 = f12, f47, f83 (p15) ADD3 f84 = f13, f49, f84 (p15) ADD4 f85 = f12, f49, f85 ;; (p14) FMA f80 = f14, f50, f80 (p14) FMA f81 = f15, f50, f81 (p14) FMA f82 = f14, f52, f82 (p14) FMA f83 = f15, f52, f83 (p15) FMA f84 = f14, f54, f84 (p15) FMA f85 = f15, f54, f85 ;; (p14) ADD3 f80 = f15, f51, f80 (p14) ADD4 f81 = f14, f51, f81 (p14) ADD3 f82 = f15, f53, f82 (p14) ADD4 f83 = f14, f53, f83 (p15) ADD3 f84 = f15, f55, f84 (p15) ADD4 f85 = f14, f55, f85 ;; (p14) STFD [YST1] = f80, 1 * SIZE ;; (p14) STFD [YST1] = f81, 1 * SIZE ;; (p14) STFD [YST1] = f82, 1 * SIZE ;; (p14) STFD [YST1] = f83, 1 * SIZE ;; (p15) STFD [YST1] = f84, 1 * SIZE 
;; (p15) STFD [YST1] = f85, 1 * SIZE ;; .L30: { .mmi mov YLD1 = YY adds YLD2 = 4 * SIZE, YY tbit.z p6, p0 = N, 1 } ;; { .mmb mov YST1 = YY adds YST2 = 4 * SIZE, YY (p6) br.cond.dpnt .L40 } ;; LDFD f32 = [X], SIZE mov AO1 = A mov pr.rot= 0 ;; LDFD f33 = [X], INCXM1 add AO2 = A, LDA shr I = M, 2 ;; LDFD f34 = [X], SIZE shladd A = LDA, 1, A tbit.nz p14, p0 = M, 1 ;; LDFD f35 = [X], INCXM1 cmp.eq p6, p0 = 0, I ;; FMPY f8 = ALPHA_R, f32 adds AO9 = 4 * SIZE, AO1 FMPY f9 = ALPHA_I, f32 adds AO10 = 4 * SIZE, AO2 FMPY f10 = ALPHA_R, f34 mov ar.ec= 2 FMPY f11 = ALPHA_I, f34 ;; adds PREB = RPREFETCH * SIZE, YLD1 adds I = -1, I ADD1 f8 = ALPHA_I, f33, f8 adds RPRE1 = RPREFETCH * SIZE, AO1 ADD2 f9 = ALPHA_R, f33, f9 adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 ADD1 f10 = ALPHA_I, f35, f10 ADD2 f11 = ALPHA_R, f35, f11 ;; { .mib cmp.eq p16, p0 = r0, r0 mov ar.lc = I (p6) br.cond.dpnt .L35 } ;; .align 16 .L32: { .mfi (p17) LDFD f41 = [AO2], 1 * SIZE (p17) FMA f101 = f8, f33, f101 (p16) tbit.nz.unc p12, p13 = I, 0 } { .mfi (p17) LDFD f45 = [AO10], 1 * SIZE (p17) FMA f113 = f8, f37, f113 } ;; { .mfi (p17) LDFD f42 = [AO2], 1 * SIZE (p17) FMA f104 = f9, f33, f104 (p16) adds I = -1, I } { .mfi (p17) LDFD f46 = [AO10], 1 * SIZE (p17) FMA f116 = f9, f37, f116 } ;; { .mmf (p18) STFD [YST1] = f102, 1 * SIZE (p18) STFD [YST2] = f114, 1 * SIZE (p17) FMA f107 = f8, f35, f107 } { .mmf (p17) LDFD f43 = [AO2], 1 * SIZE (p17) LDFD f47 = [AO10], 1 * SIZE (p17) FMA f119 = f8, f39, f119 } ;; { .mmf (p18) STFD [YST1] = f105, 1 * SIZE (p18) STFD [YST2] = f117, 1 * SIZE (p17) FMA f110 = f9, f35, f110 } { .mmf (p17) LDFD f44 = [AO2], 5 * SIZE (p17) LDFD f48 = [AO10], 5 * SIZE (p17) FMA f122 = f9, f39, f122 } ;; { .mfi (p12) lfetch.excl.nt2 [PREB], 16 * SIZE (p17) ADD3 f101 = f9, f34, f101 } { .mfi (p17) ADD3 f113 = f9, f38, f113 } ;; { .mfi (p16) LDFD f100 = [YLD1], 1 * SIZE (p17) ADD4 f104 = f8, f34, f104 } { .mfi (p16) LDFD f112 = [YLD2], 1 * SIZE (p17) ADD4 f116 = f8, f38, f116 } ;; { .mfi (p16) LDFD f103 = [YLD1], 1 * SIZE (p17) ADD3 f107 = f9, f36, f107 } { .mfi (p16) LDFD f115 = [YLD2], 1 * SIZE (p17) ADD3 f119 = f9, f40, f119 } ;; { .mfi (p12) PREFETCH [RPRE1], 16 * SIZE (p17) ADD4 f110 = f8, f36, f110 } { .mfi (p17) ADD4 f122 = f8, f40, f122 } ;; { .mfi (p16) LDFD f32 = [AO1], 1 * SIZE (p17) FMA f101 = f10, f41, f101 } { .mfi (p16) LDFD f36 = [AO9], 1 * SIZE (p17) FMA f113 = f10, f45, f113 } ;; { .mmf (p18) STFD [YST1] = f108, 1 * SIZE (p18) STFD [YST2] = f120, 1 * SIZE (p17) FMA f104 = f11, f41, f104 } { .mmf (p16) LDFD f33 = [AO1], 1 * SIZE (p16) LDFD f37 = [AO9], 1 * SIZE (p17) FMA f116 = f11, f45, f116 } ;; { .mmf (p18) STFD [YST1] = f111, 5 * SIZE (p18) STFD [YST2] = f123, 5 * SIZE (p17) FMA f107 = f10, f43, f107 } { .mmf (p16) LDFD f34 = [AO1], 1 * SIZE (p16) LDFD f38 = [AO9], 1 * SIZE (p17) FMA f119 = f10, f47, f119 } ;; { .mfi (p16) LDFD f35 = [AO1], 5 * SIZE (p17) FMA f110 = f11, f43, f110 } { .mfi (p16) LDFD f39 = [AO9], 5 * SIZE (p17) FMA f122 = f11, f47, f122 } ;; { .mfi (p16) LDFD f106 = [YLD1], 1 * SIZE (p17) ADD3 f101 = f11, f42, f101 } { .mfi (p16) LDFD f118 = [YLD2], 1 * SIZE (p17) ADD3 f113 = f11, f46, f113 } ;; { .mfi (p16) LDFD f109 = [YLD1], 5 * SIZE (p17) ADD4 f104 = f10, f42, f104 } { .mfi (p16) LDFD f121 = [YLD2], 5 * SIZE (p17) ADD4 f116 = f10, f46, f116 } ;; { .mfi (p17) ADD3 f107 = f11, f44, f107 } { .mfi (p17) ADD3 f119 = f11, f48, f119 } ;; { .mfi (p13) PREFETCH [RPRE2], 16 * SIZE (p17) ADD4 f110 = f10, f44, f110 } { .mfb (p17) ADD4 f122 = f10, f48, f122 br.ctop.sptk.few .L32 } ;; .align 16 
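/* .L35 below: tail of the N&2 (two-column) pass. It drains the final
   pipeline-stage stores of the .L32 software-pipelined loop, then handles
   the leftover M%4 rows (p14 guards the M&2 pair of rows, p15 the last odd
   row) with scalar complex multiply-accumulates of alpha*x into y, before
   falling through to the single-column pass at .L40. */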
.L35: { .mmi (p18) STFD [YST1] = f102, 1 * SIZE (p18) STFD [YST2] = f114, 1 * SIZE tbit.nz p15, p0 = M, 0 } { .mmi (p14) LDFD f32 = [AO1], 1 * SIZE (p14) LDFD f80 = [YLD1], 1 * SIZE } ;; { .mmi (p18) STFD [YST1] = f105, 1 * SIZE (p18) STFD [YST2] = f117, 1 * SIZE } { .mmi (p14) LDFD f33 = [AO1], 1 * SIZE (p14) LDFD f81 = [YLD1], 1 * SIZE } ;; { .mmi (p18) STFD [YST1] = f108, 1 * SIZE (p18) STFD [YST2] = f120, 1 * SIZE } { .mmi (p14) LDFD f34 = [AO1], 1 * SIZE (p14) LDFD f82 = [YLD1], 1 * SIZE } ;; { .mmi (p18) STFD [YST1] = f111, 5 * SIZE (p18) STFD [YST2] = f123, 5 * SIZE } { .mmi (p14) LDFD f35 = [AO1], 1 * SIZE (p14) LDFD f83 = [YLD1], 1 * SIZE } ;; (p15) LDFD f36 = [AO1], 1 * SIZE (p15) LDFD f84 = [YLD1], 1 * SIZE ;; (p15) LDFD f37 = [AO1], 1 * SIZE (p15) LDFD f85 = [YLD1], 1 * SIZE ;; (p14) LDFD f38 = [AO2], 1 * SIZE (p14) FMA f80 = f8, f32, f80 ;; (p14) LDFD f39 = [AO2], 1 * SIZE (p14) FMA f81 = f9, f32, f81 ;; (p14) LDFD f40 = [AO2], 1 * SIZE (p14) FMA f82 = f8, f34, f82 ;; (p14) LDFD f41 = [AO2], 1 * SIZE (p14) FMA f83 = f9, f34, f83 ;; (p15) LDFD f42 = [AO2], 1 * SIZE (p15) FMA f84 = f8, f36, f84 ;; (p15) LDFD f43 = [AO2], 1 * SIZE (p15) FMA f85 = f9, f36, f85 ;; (p14) ADD3 f80 = f9, f33, f80 (p14) ADD4 f81 = f8, f33, f81 (p14) ADD3 f82 = f9, f35, f82 (p14) ADD4 f83 = f8, f35, f83 (p15) ADD3 f84 = f9, f37, f84 (p15) ADD4 f85 = f8, f37, f85 ;; (p14) FMA f80 = f10, f38, f80 (p14) FMA f81 = f11, f38, f81 (p14) FMA f82 = f10, f40, f82 (p14) FMA f83 = f11, f40, f83 (p15) FMA f84 = f10, f42, f84 (p15) FMA f85 = f11, f42, f85 ;; (p14) ADD3 f80 = f11, f39, f80 (p14) ADD4 f81 = f10, f39, f81 (p14) ADD3 f82 = f11, f41, f82 (p14) ADD4 f83 = f10, f41, f83 (p15) ADD3 f84 = f11, f43, f84 (p15) ADD4 f85 = f10, f43, f85 ;; (p14) STFD [YST1] = f80, 1 * SIZE ;; (p14) STFD [YST1] = f81, 1 * SIZE ;; (p14) STFD [YST1] = f82, 1 * SIZE ;; (p14) STFD [YST1] = f83, 1 * SIZE ;; (p15) STFD [YST1] = f84, 1 * SIZE ;; (p15) STFD [YST1] = f85, 1 * SIZE ;; .L40: { .mmi mov YLD1 = YY adds YLD2 = 4 * SIZE, YY tbit.z p6, p0 = N, 0 } { .mmb mov YST1 = YY adds YST2 = 4 * SIZE, YY (p6) br.cond.dpnt .L990 } ;; LDFD f32 = [X], SIZE mov AO1 = A adds AO9 = 4 * SIZE, A ;; LDFD f33 = [X], INCXM1 add A = A, LDA mov pr.rot= 0 ;; { .mfi adds PREB = RPREFETCH * SIZE, YLD1 FMPY f8 = ALPHA_R, f32 mov ar.ec= 2 } { .mfi adds RPRE1 = RPREFETCH * SIZE, AO1 FMPY f9 = ALPHA_I, f32 shr I = M, 2 } ;; { .mmf cmp.eq p6, p0 = 0, I cmp.eq p16, p0 = r0, r0 ADD1 f8 = ALPHA_I, f33, f8 } { .mfi adds I = -1, I ADD2 f9 = ALPHA_R, f33, f9 tbit.nz p14, p0 = M, 1 } ;; { .mib nop __LINE__ mov ar.lc = I (p6) br.cond.dpnt .L45 } ;; .align 16 .L42: { .mmf (p16) LDFD f100 = [YLD1], 1 * SIZE (p16) LDFD f112 = [YLD2], 1 * SIZE (p17) FMA f101 = f8, f33, f101 } { .mmf (p16) LDFD f32 = [AO1], 1 * SIZE (p16) LDFD f44 = [AO9], 1 * SIZE (p17) FMA f113 = f8, f45, f113 } ;; { .mmf (p16) LDFD f103 = [YLD1], 1 * SIZE (p16) LDFD f115 = [YLD2], 1 * SIZE (p17) FMA f104 = f9, f33, f104 } { .mmf (p16) LDFD f35 = [AO1], 1 * SIZE (p16) LDFD f47 = [AO9], 1 * SIZE (p17) FMA f116 = f9, f45, f116 } ;; { .mmf (p16) LDFD f106 = [YLD1], 1 * SIZE (p16) LDFD f118 = [YLD2], 1 * SIZE (p17) FMA f107 = f8, f39, f107 } { .mmf (p16) LDFD f38 = [AO1], 1 * SIZE (p16) LDFD f50 = [AO9], 1 * SIZE (p17) FMA f119 = f8, f51, f119 } ;; { .mmf (p16) LDFD f109 = [YLD1], 5 * SIZE (p16) LDFD f121 = [YLD2], 5 * SIZE (p17) FMA f110 = f9, f39, f110 } { .mmf (p16) LDFD f41 = [AO1], 5 * SIZE (p16) LDFD f53 = [AO9], 5 * SIZE (p17) FMA f122 = f9, f51, f122 } ;; { .mmf (p18) STFD [YST1] = f102, 1 * SIZE (p18) 
STFD [YST2] = f114, 1 * SIZE (p17) ADD3 f101 = f9, f36, f101 } { .mfi (p17) ADD3 f113 = f9, f48, f113 (p16) tbit.nz.unc p12, p13 = I, 0 } ;; { .mmf (p18) STFD [YST1] = f105, 1 * SIZE (p18) STFD [YST2] = f117, 1 * SIZE (p17) ADD4 f104 = f8, f36, f104 } { .mfi (p12) PREFETCH [RPRE1], 16 * SIZE (p17) ADD4 f116 = f8, f48, f116 } ;; { .mmf (p18) STFD [YST1] = f108, 1 * SIZE (p18) STFD [YST2] = f120, 1 * SIZE (p17) ADD3 f107 = f9, f42, f107 } { .mfi (p13) lfetch.excl.nt2 [PREB], 16 * SIZE (p17) ADD3 f119 = f9, f54, f119 } ;; { .mmf (p18) STFD [YST1] = f111, 5 * SIZE (p18) STFD [YST2] = f123, 5 * SIZE (p17) ADD4 f110 = f8, f42, f110 } { .mfb (p17) ADD4 f122 = f8, f54, f122 br.ctop.sptk.few .L42 } ;; .align 16 .L45: { .mmi (p18) STFD [YST1] = f102, 1 * SIZE (p18) STFD [YST2] = f114, 1 * SIZE tbit.nz p15, p0 = M, 0 } { .mmi (p14) LDFD f32 = [AO1], 1 * SIZE (p14) LDFD f80 = [YLD1], 1 * SIZE } ;; { .mmi (p18) STFD [YST1] = f105, 1 * SIZE (p18) STFD [YST2] = f117, 1 * SIZE } { .mmi (p14) LDFD f33 = [AO1], 1 * SIZE (p14) LDFD f81 = [YLD1], 1 * SIZE } ;; { .mmi (p18) STFD [YST1] = f108, 1 * SIZE (p18) STFD [YST2] = f120, 1 * SIZE } { .mmi (p14) LDFD f34 = [AO1], 1 * SIZE (p14) LDFD f82 = [YLD1], 1 * SIZE } ;; { .mmi (p18) STFD [YST1] = f111, 5 * SIZE (p18) STFD [YST2] = f123, 5 * SIZE } { .mmi (p14) LDFD f35 = [AO1], 1 * SIZE (p14) LDFD f83 = [YLD1], 1 * SIZE } ;; (p15) LDFD f36 = [AO1], 1 * SIZE (p15) LDFD f84 = [YLD1], 1 * SIZE ;; (p15) LDFD f37 = [AO1], 1 * SIZE (p15) LDFD f85 = [YLD1], 1 * SIZE ;; (p14) FMA f80 = f8, f32, f80 (p14) FMA f81 = f9, f32, f81 (p14) FMA f82 = f8, f34, f82 (p14) FMA f83 = f9, f34, f83 (p15) FMA f84 = f8, f36, f84 (p15) FMA f85 = f9, f36, f85 ;; (p14) ADD3 f80 = f9, f33, f80 (p14) ADD4 f81 = f8, f33, f81 (p14) ADD3 f82 = f9, f35, f82 (p14) ADD4 f83 = f8, f35, f83 (p15) ADD3 f84 = f9, f37, f84 (p15) ADD4 f85 = f8, f37, f85 ;; (p14) STFD [YST1] = f80, 1 * SIZE ;; (p14) STFD [YST1] = f81, 1 * SIZE ;; (p14) STFD [YST1] = f82, 1 * SIZE ;; (p14) STFD [YST1] = f83, 1 * SIZE ;; (p15) STFD [YST1] = f84, 1 * SIZE ;; (p15) STFD [YST1] = f85, 1 * SIZE ;; .L990: { .mmi mov YST1 = Y mov YST2 = Y mov pr.rot= 0 } { .mib mov YLD1 = YY shr J = M, 2 (p10) br.cond.dptk .L999 } ;; { .mmi cmp.eq p6, p0 = r0, J adds INCY = - SIZE, INCY mov ar.ec = 4 } { .mmi cmp.eq p16, p0 = r0, r0 adds J = -1, J tbit.nz p13, p0 = M, 1 } ;; { .mib nop __LINE__ mov ar.lc = J (p6) br.cond.dpnt .L995 } ;; .L992: { .mfi (p19) STFD [YST2] = f35, 1 * SIZE (p18) FADD f34 = f34, f66 } { .mmi (p16) LDFD f64 = [YLD1], 1 * SIZE (p16) LDFD f32 = [YST1], 1 * SIZE } ;; { .mfi (p19) STFD [YST2] = f39 (p18) FADD f38 = f38, f70 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f68 = [YLD1], 1 * SIZE (p16) LDFD f36 = [YST1], INCY } ;; { .mfi (p19) STFD [YST2] = f43, 1 * SIZE (p18) FADD f42 = f42, f74 } { .mmi (p16) LDFD f72 = [YLD1], 1 * SIZE (p16) LDFD f40 = [YST1], 1 * SIZE } ;; { .mfi (p19) STFD [YST2] = f47 (p18) FADD f50 = f50, f82 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f76 = [YLD1], 1 * SIZE (p16) LDFD f44 = [YST1], INCY } ;; { .mfi (p19) STFD [YST2] = f51, 1 * SIZE (p18) FADD f54 = f54, f86 } { .mmi (p16) LDFD f80 = [YLD1], 1 * SIZE (p16) LDFD f48 = [YST1], 1 * SIZE } ;; { .mfi (p19) STFD [YST2] = f55 (p18) FADD f58 = f58, f90 (p19) add YST2 = YST2, INCY } { .mmi (p16) LDFD f84 = [YLD1], 1 * SIZE (p16) LDFD f52 = [YST1], INCY } ;; { .mfi (p19) STFD [YST2] = f59, 1 * SIZE (p18) FADD f46 = f46, f78 } { .mmi (p16) LDFD f88 = [YLD1], 1 * SIZE (p16) LDFD f56 = [YST1], 1 * SIZE } ;; { .mfi (p19) STFD [YST2] = f63 (p18) 
FADD f62 = f62, f94 (p19) add YST2 = YST2, INCY } { .mmb (p16) LDFD f92 = [YLD1], 1 * SIZE (p16) LDFD f60 = [YST1], INCY br.ctop.sptk.few .L992 } ;; .L995: (p13) LDFD f32 = [YST1], 1 * SIZE (p13) LDFD f40 = [YLD1], 1 * SIZE tbit.nz p14, p0 = M, 0 ;; (p13) LDFD f33 = [YST1], INCY (p13) LDFD f41 = [YLD1], 1 * SIZE ;; (p13) LDFD f34 = [YST1], 1 * SIZE (p13) LDFD f42 = [YLD1], 1 * SIZE ;; (p13) LDFD f35 = [YST1], INCY (p13) LDFD f43 = [YLD1], 1 * SIZE ;; (p14) LDFD f36 = [YST1], 1 * SIZE (p14) LDFD f44 = [YLD1], 1 * SIZE ;; (p14) LDFD f37 = [YST1], INCY (p14) LDFD f45 = [YLD1], 1 * SIZE ;; (p13) FADD f32 = f32, f40 (p13) FADD f33 = f33, f41 (p13) FADD f34 = f34, f42 (p13) FADD f35 = f35, f43 (p14) FADD f36 = f36, f44 (p14) FADD f37 = f37, f45 ;; (p13) STFD [YST2] = f32, 1 * SIZE ;; (p13) STFD [YST2] = f33 (p13) add YST2 = YST2, INCY ;; (p13) STFD [YST2] = f34, 1 * SIZE ;; (p13) STFD [YST2] = f35 (p13) add YST2 = YST2, INCY ;; (p14) STFD [YST2] = f36, 1 * SIZE ;; (p14) STFD [YST2] = f37 ;; .L999: mov r8 = r0 adds r9 = 1 * 16, SP ;; ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 mov ar.lc = ARLC ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 mov pr = PR, -1 ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 mov ar.pfs = ARPFS ;; ldf.fill f22 = [SP], 32 ldf.fill f23 = [r9] br.ret.sptk.many b0 ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/zgemv_t.S000066400000000000000000001055171313527062700167770ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define SP r12 #define M r32 #define N r33 #define A r37 #define LDA r38 #define X r39 #define INCX r34 #define Y r35 #define INCY r36 #define BUFFER r11 #define I r15 #define J r16 #define AO1 r18 #define AO2 r19 #define AO3 r20 #define AO4 r21 #define AO5 r22 #define AO6 r23 #define AO7 r24 #define AO8 r25 #define BO r26 #define INCYM1 r28 #define RPRE1 loc0 #define RPRE2 loc1 #define RPRE3 loc2 #define RPRE4 loc3 #define RPRE5 loc4 #define RPRE6 loc5 #define RPRE7 loc6 #define RPRE8 loc7 #define AO21 loc8 #define AO41 loc9 #define AO61 loc10 #define AO81 loc11 #define CLD1 loc12 #define CLD2 loc13 #define CST1 loc14 #define CST2 loc15 #define PREB r8 #define WPRE r9 #define OFFSET PREB #define INCX3M1 WPRE #define INCY3M1 r10 #define ARLC r29 #define PR r30 #define ARPFS r31 #ifdef DOUBLE #define RPREFETCH (16 * 2 + 8) #else #define RPREFETCH (16 * 2 + 16) #endif #define PREFETCH lfetch.nt1 #define ALPHA_R f6 #define ALPHA_I f7 #if !defined(CONJ) && !defined(XCONJ) #define ADD1 FMA #define ADD2 FMA #define ADD3 FNMA #define ADD4 FMA #elif defined(CONJ) && !defined(XCONJ) #define ADD1 FMA #define ADD2 FMA #define ADD3 FMA #define ADD4 FNMA #elif !defined(CONJ) && defined(XCONJ) #define ADD1 FMA #define ADD2 FNMA #define ADD3 FMA #define ADD4 FMA #else #define ADD1 FMA #define ADD2 FNMA #define ADD3 FNMA #define ADD4 FNMA #endif PROLOGUE PROFCODE .prologue { .mmi .save ar.pfs, ARPFS alloc ARPFS = ar.pfs, 8, 16, 0, 0 adds r14 = 16, SP mov ARLC = ar.lc } { .mmi adds r8 = -8 * 16, SP adds r9 = -7 * 16, SP adds SP = -8 * 16, SP } ;; { .mmi stf.spill [r8] = f16, 32 stf.spill [r9] = f17, 32 mov PR = pr } ;; { .mmi stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 adds r15 = 152, SP } ;; { .mmi stf.spill [r8] = f20, 32 stf.spill [r9] = f21, 32 adds r16 = 160, SP } ;; { .mmi stf.spill [r8] = f22 stf.spill [r9] = f23 adds r17 = 168, SP } .body ;; { .mmf ld8 INCX = [r14] ld8 Y = [r15] mov ALPHA_R = f8 } { .mmf ld8 INCY = [r16] ld8 BUFFER = [r17] mov ALPHA_I = f9 } ;; { .mmi shladd INCX = INCX, ZBASE_SHIFT, r0 shladd LDA = LDA, ZBASE_SHIFT, r0 mov pr.rot= 0 } { .mmi cmp.ge p7, p0 = 0, M cmp.ge p6, p0 = 0, N shladd INCY = INCY, ZBASE_SHIFT, r0 } ;; { .mmi mov AO1 = BUFFER adds OFFSET = -SIZE, INCX shr I = M, 3 } { .mib adds INCYM1 = - SIZE, INCY shladd INCX3M1 = INCX, 1, INCX (p7) br.cond.dpnt .L999 } ;; { .mmi shladd BO = INCX, 1, X adds AO2 = 4 * SIZE, BUFFER mov ar.ec= 5 } { .mmb shladd INCY3M1 = INCY, 1, INCYM1 adds I = -1, I (p6) br.cond.dpnt .L999 } ;; { .mmi adds INCX3M1 = -SIZE, INCX3M1 cmp.eq p16, p0 = r0, r0 tbit.nz p13, p0 = M, 2 } { .mib cmp.gt p6, p0 = 0, I mov ar.lc = I (p6) br.cond.dpnt .L05 } ;; .align 16 .L01: (p20) STFD [AO1] = f36, SIZE (p20) STFD [AO2] = f56, SIZE (p16) LDFD f32 = [X], SIZE (p16) LDFD f52 = [BO], SIZE ;; (p20) STFD [AO1] = f41, SIZE (p20) STFD [AO2] = f61, SIZE (p16) LDFD f37 = [X], OFFSET (p16) LDFD f57 = [BO], OFFSET ;; (p20) STFD [AO1] = f46, SIZE (p20) STFD [AO2] = f66, SIZE (p16) LDFD f42 = [X], SIZE (p16) LDFD f62 = [BO], SIZE ;; (p20) STFD [AO1] = f51, 5 * SIZE (p20) STFD [AO2] = f71, 5 * SIZE (p16) LDFD f47 = [X], INCX3M1 (p16) LDFD f67 = [BO], INCX3M1 ;; (p20) STFD [AO1] = f76, SIZE (p20) STFD [AO2] = f96, SIZE (p16) LDFD f72 = [X], SIZE (p16) LDFD f92 = [BO], SIZE ;; (p20) STFD [AO1] = f81, SIZE (p20) STFD [AO2] = f101, SIZE (p16) LDFD f77 = [X], OFFSET (p16) LDFD f97 = [BO], OFFSET ;; (p20) STFD [AO1] = f86, SIZE (p20) STFD [AO2] = f106, SIZE 
(p16) LDFD f82 = [X], SIZE (p16) LDFD f102 = [BO], SIZE ;; (p20) STFD [AO1] = f91, 5 * SIZE (p20) STFD [AO2] = f111, 5 * SIZE (p16) LDFD f87 = [X], INCX3M1 (p16) LDFD f107 = [BO], INCX3M1 br.ctop.sptk.few .L01 ;; .align 16 .L05: { .mmi (p13) LDFD f32 = [X], SIZE (p13) LDFD f36 = [BO], SIZE tbit.nz p14, p0 = M, 1 } ;; { .mmi (p13) LDFD f33 = [X], OFFSET (p13) LDFD f37 = [BO], OFFSET tbit.nz p15, p0 = M, 0 } ;; { .mmb (p13) LDFD f34 = [X], SIZE (p13) LDFD f38 = [BO], SIZE } ;; { .mmi (p13) LDFD f35 = [X], INCX3M1 (p13) LDFD f39 = [BO], INCX3M1 } ;; { .mmi (p14) LDFD f40 = [X], SIZE } ;; (p14) LDFD f41 = [X], OFFSET (p13) STFD [AO1] = f32, SIZE tbit.nz p8, p0 = A, BASE_SHIFT ;; (p14) LDFD f42 = [X], SIZE (p13) STFD [AO2] = f36, SIZE ;; (p14) LDFD f43 = [X], OFFSET (p13) STFD [AO1] = f33, SIZE ;; (p15) LDFD f44 = [X], SIZE (p13) STFD [AO2] = f37, SIZE ;; (p15) LDFD f45 = [X], OFFSET (p13) STFD [AO1] = f34, SIZE (p13) STFD [AO2] = f38, SIZE ;; (p13) STFD [AO1] = f35, 5 * SIZE (p13) STFD [AO2] = f39, 5 * SIZE ;; (p14) STFD [AO1] = f40, SIZE ;; (p14) STFD [AO1] = f41, SIZE ;; (p14) STFD [AO1] = f42, SIZE ;; (p14) STFD [AO1] = f43, SIZE ;; (p15) STFD [AO1] = f44, SIZE ;; (p15) STFD [AO1] = f45, SIZE (p8) br.cond.dpnt .L100 ;; .align 16 .L10: { .mmi mov CLD1 = Y shladd CLD2 = INCY, 1, Y shr J = N, 3 } ;; { .mmb mov CST1 = Y cmp.eq p6, p0 = r0, J (p6) br.cond.dpnt .L20 } ;; .align 16 .L11: { .mfi mov AO1 = A mov f8 = f0 mov pr.rot= 0 } { .mfi add AO2 = LDA, A mov f10 = f0 mov BO = BUFFER } ;; { .mmf shladd AO3 = LDA, 1, A shladd AO4 = LDA, 1, AO2 mov f12 = f0 } { .mmf adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2 mov f14 = f0 } ;; { .mmf shladd AO5 = LDA, 1, AO3 shladd AO6 = LDA, 1, AO4 mov f16 = f0 } { .mmf adds RPRE3 = (RPREFETCH + 4) * SIZE, AO3 adds RPRE4 = (RPREFETCH + 6) * SIZE, AO4 mov f18 = f0 } ;; { .mmf shladd AO7 = LDA, 1, AO5 shladd AO8 = LDA, 1, AO6 mov f20 = f0 } { .mmf adds RPRE5 = (RPREFETCH + 8) * SIZE, AO5 adds RPRE6 = (RPREFETCH + 10) * SIZE, AO6 mov f22 = f0 } ;; { .mfi shladd A = LDA, 3, A mov f9 = f0 mov ar.ec= 5 } { .mmf adds RPRE7 = (RPREFETCH + 12) * SIZE, AO7 adds RPRE8 = (RPREFETCH + 14) * SIZE, AO8 mov f11 = f0 } ;; { .mmf adds WPRE = 16 * SIZE, CLD1 adds PREB = RPREFETCH * SIZE, BO mov f13 = f0 } { .mmf adds I = -1, M cmp.eq p16, p0 = r0, r0 mov f15 = f0 } ;; { .mfi cmp.eq p12, p0 = r0, r0 mov f17 = f0 mov ar.lc = I } { .mmf nop __LINE__ nop __LINE__ mov f19 = f0 } ;; { .mmf lfetch.excl.nt1 [WPRE] nop __LINE__ mov f21 = f0 } { .mmf mov I = 0 nop __LINE__ mov f23 = f0 } ;; .align 16 .L16: { .mmf (p12) PREFETCH [RPRE1], 16 * SIZE (p16) LDFPD f32, f37 = [AO1], 2 * SIZE (p20) ADD1 f8 = f116, f36, f8 } { .mmf (p16) cmp.eq.unc p13, p0 = 1, I nop __LINE__ (p20) ADD2 f9 = f121, f36, f9 } ;; { .mmf (p13) PREFETCH [PREB], 16 * SIZE (p16) LDFPD f112, f117 = [BO], 2 * SIZE (p20) ADD1 f10 = f116, f46, f10 } { .mmf (p16) cmp.eq.unc p14, p0 = 2, I (p16) cmp.eq.unc p15, p0 = 3, I (p20) ADD2 f11 = f121, f46, f11 } ;; { .mmf (p16) LDFPD f42, f47 = [AO2], 2 * SIZE nop __LINE__ (p20) ADD1 f12 = f116, f56, f12 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD2 f13 = f121, f56, f13 } ;; { .mmf (p13) PREFETCH [RPRE2], 16 * SIZE nop __LINE__ (p20) ADD1 f14 = f116, f66, f14 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD2 f15 = f121, f66, f15 } ;; { .mmf (p16) LDFPD f52, f57 = [AO3], 2 * SIZE nop __LINE__ (p20) ADD3 f8 = f121, f41, f8 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD4 f9 = f116, f41, f9 } ;; { .mmf (p14) PREFETCH [RPRE3], 16 * SIZE nop __LINE__ (p20) ADD3 
f10 = f121, f51, f10 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD4 f11 = f116, f51, f11 } ;; { .mmf (p16) LDFPD f62, f67 = [AO4], 2 * SIZE nop __LINE__ (p20) ADD3 f12 = f121, f61, f12 } { .mmf (p16) cmp.eq.unc p12, p0 = 4, I (p16) cmp.eq.unc p13, p0 = 5, I (p20) ADD4 f13 = f116, f61, f13 } ;; { .mmf (p15) PREFETCH [RPRE4], 16 * SIZE nop __LINE__ (p20) ADD3 f14 = f121, f71, f14 } { .mmf (p16) cmp.eq.unc p14, p0 = 6, I (p16) cmp.eq.unc p15, p0 = 7, I (p20) ADD4 f15 = f116, f71, f15 } ;; { .mmf (p16) LDFPD f72, f77 = [AO5], 2 * SIZE nop __LINE__ (p20) ADD1 f16 = f116, f76, f16 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD2 f17 = f121, f76, f17 } ;; { .mmf (p12) PREFETCH [RPRE5], 16 * SIZE nop __LINE__ (p20) ADD1 f18 = f116, f86, f18 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD2 f19 = f121, f86, f19 } ;; { .mmf (p16) LDFPD f82, f87 = [AO6], 2 * SIZE nop __LINE__ (p20) ADD1 f20 = f116, f96, f20 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD2 f21 = f121, f96, f21 } ;; { .mmf (p13) PREFETCH [RPRE6], 16 * SIZE nop __LINE__ (p20) ADD1 f22 = f116, f106, f22 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD2 f23 = f121, f106, f23 } ;; { .mmf (p16) LDFPD f92, f97 = [AO7], 2 * SIZE nop __LINE__ (p20) ADD3 f16 = f121, f81, f16 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD4 f17 = f116, f81, f17 } ;; { .mmf (p14) PREFETCH [RPRE7], 16 * SIZE nop __LINE__ (p20) ADD3 f18 = f121, f91, f18 } { .mmf nop __LINE__ (p16) adds I = 1, I (p20) ADD4 f19 = f116, f91, f19 } ;; { .mmf (p16) LDFPD f102, f107 = [AO8], 2 * SIZE nop __LINE__ (p20) ADD3 f20 = f121, f101, f20 } { .mmf (p15) mov I = 0 nop __LINE__ (p20) ADD4 f21 = f116, f101, f21 } ;; { .mmf (p15) PREFETCH [RPRE8], 16 * SIZE nop __LINE__ (p20) ADD3 f22 = f121, f111, f22 } { .mfb (p16) cmp.eq.unc p12, p0 = 0, I (p20) ADD4 f23 = f116, f111, f23 br.ctop.sptk.few .L16 } ;; .L18: LDFD f32 = [CLD1], SIZE LDFD f36 = [CLD2], SIZE shladd CST2 = INCY, 1, CST1 ;; LDFD f33 = [CLD1], INCYM1 LDFD f37 = [CLD2], INCYM1 ;; LDFD f34 = [CLD1], SIZE LDFD f38 = [CLD2], SIZE ;; LDFD f35 = [CLD1], INCY3M1 LDFD f39 = [CLD2], INCY3M1 ;; LDFD f40 = [CLD1], SIZE LDFD f44 = [CLD2], SIZE ;; LDFD f41 = [CLD1], INCYM1 LDFD f45 = [CLD2], INCYM1 ;; LDFD f42 = [CLD1], SIZE LDFD f46 = [CLD2], SIZE ;; LDFD f43 = [CLD1], INCY3M1 LDFD f47 = [CLD2], INCY3M1 ;; FMA f32 = ALPHA_R, f8, f32 FMA f36 = ALPHA_R, f12, f36 FMA f33 = ALPHA_I, f8, f33 FMA f37 = ALPHA_I, f12, f37 FMA f34 = ALPHA_R, f10, f34 FMA f38 = ALPHA_R, f14, f38 FMA f35 = ALPHA_I, f10, f35 FMA f39 = ALPHA_I, f14, f39 ;; FNMA f32 = ALPHA_I, f9, f32 FNMA f36 = ALPHA_I, f13, f36 FMA f33 = ALPHA_R, f9, f33 FMA f37 = ALPHA_R, f13, f37 FNMA f34 = ALPHA_I, f11, f34 FNMA f38 = ALPHA_I, f15, f38 FMA f35 = ALPHA_R, f11, f35 FMA f39 = ALPHA_R, f15, f39 ;; FMA f40 = ALPHA_R, f16, f40 FMA f44 = ALPHA_R, f20, f44 FMA f41 = ALPHA_I, f16, f41 FMA f45 = ALPHA_I, f20, f45 FMA f42 = ALPHA_R, f18, f42 FMA f46 = ALPHA_R, f22, f46 FMA f43 = ALPHA_I, f18, f43 FMA f47 = ALPHA_I, f22, f47 ;; { .mmf STFD [CST1] = f32, SIZE STFD [CST2] = f36, SIZE FNMA f40 = ALPHA_I, f17, f40 } { .mmf nop __LINE__ nop __LINE__ FNMA f44 = ALPHA_I, f21, f44 } ;; { .mmf STFD [CST1] = f33 STFD [CST2] = f37 FMA f41 = ALPHA_R, f17, f41 } { .mmf add CST1 = CST1, INCYM1 add CST2 = CST2, INCYM1 FMA f45 = ALPHA_R, f21, f45 } ;; { .mmf STFD [CST1] = f34, SIZE STFD [CST2] = f38, SIZE FNMA f42 = ALPHA_I, f19, f42 } { .mmf nop __LINE__ nop __LINE__ FNMA f46 = ALPHA_I, f23, f46 } ;; { .mmf STFD [CST1] = f35 STFD [CST2] = f39 FMA f43 = ALPHA_R, f19, f43 } { .mmf add CST1 = CST1, INCY3M1 add CST2 = CST2, 
INCY3M1 FMA f47 = ALPHA_R, f23, f47 } ;; { .mmi STFD [CST1] = f40, SIZE STFD [CST2] = f44, SIZE adds J = -1, J } ;; { .mmi STFD [CST1] = f41 STFD [CST2] = f45 add CST1 = CST1, INCYM1 } { .mmi nop __LINE__ nop __LINE__ add CST2 = CST2, INCYM1 } ;; { .mmi STFD [CST1] = f42, SIZE STFD [CST2] = f46, SIZE cmp.lt p6, p0 = 0, J } ;; { .mmi STFD [CST1] = f43 STFD [CST2] = f47 add CST1 = CST1, INCY3M1 } { .mmb add CST2 = CST2, INCY3M1 (p6) br.cond.dptk .L11 } ;; .align 16 .L20: { .mfi mov AO1 = A mov f8 = f0 mov pr.rot= 0 } { .mfi add AO2 = LDA, A mov f10 = f0 tbit.z p6, p0 = N, 2 } ;; { .mmf shladd AO3 = LDA, 1, A shladd AO4 = LDA, 1, AO2 mov f12 = f0 } { .mfb mov BO = BUFFER mov f14 = f0 (p6) br.cond.dpnt .L30 } ;; { .mfi adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 mov f9 = f0 mov ar.ec= 5 } { .mmf adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2 adds I = -1, M mov f11 = f0 } ;; { .mmf adds RPRE3 = (RPREFETCH + 4) * SIZE, AO3 adds RPRE4 = (RPREFETCH + 6) * SIZE, AO4 mov f13 = f0 } { .mmf cmp.eq p16, p0 = r0, r0 shladd A = LDA, 2, A mov f15 = f0 } ;; { .mmi lfetch.excl.nt1 [WPRE] adds PREB = RPREFETCH * SIZE, BO mov ar.lc = I } { .mmi adds WPRE = 16 * SIZE, CLD1 cmp.eq p12, p0 = r0, r0 mov I = 0 } ;; .align 16 .L26: { .mmf (p12) PREFETCH [RPRE1], 16 * SIZE (p16) LDFPD f32, f37 = [AO1], 2 * SIZE (p20) ADD1 f8 = f116, f36, f8 } { .mmf (p16) cmp.eq.unc p13, p0 = 2, I nop __LINE__ (p20) ADD2 f9 = f121, f36, f9 } ;; { .mmf (p12) PREFETCH [PREB], 16 * SIZE (p16) LDFPD f112, f117 = [BO], 2 * SIZE (p20) ADD1 f10 = f116, f46, f10 } { .mmf (p16) cmp.eq.unc p14, p0 = 4, I (p16) cmp.eq.unc p15, p0 = 6, I (p20) ADD2 f11 = f121, f46, f11 } ;; { .mmf (p16) LDFPD f42, f47 = [AO2], 2 * SIZE nop __LINE__ (p20) ADD1 f12 = f116, f56, f12 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD2 f13 = f121, f56, f13 } ;; { .mmf (p13) PREFETCH [RPRE2], 16 * SIZE nop __LINE__ (p20) ADD1 f14 = f116, f66, f14 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD2 f15 = f121, f66, f15 } ;; { .mmf (p16) LDFPD f52, f57 = [AO3], 2 * SIZE nop __LINE__ (p20) ADD3 f8 = f121, f41, f8 } { .mmf (p16) adds I = 1, I nop __LINE__ (p20) ADD4 f9 = f116, f41, f9 } ;; { .mmf (p14) PREFETCH [RPRE3], 16 * SIZE nop __LINE__ (p20) ADD3 f10 = f121, f51, f10 } { .mmf (p16) cmp.eq.unc p15, p0 = 8, I nop __LINE__ (p20) ADD4 f11 = f116, f51, f11 } ;; { .mmf (p16) LDFPD f62, f67 = [AO4], 2 * SIZE nop __LINE__ (p20) ADD3 f12 = f121, f61, f12 } { .mmf (p15) mov I = 0 nop __LINE__ (p20) ADD4 f13 = f116, f61, f13 } ;; { .mmf (p15) PREFETCH [RPRE4], 16 * SIZE nop __LINE__ (p20) ADD3 f14 = f121, f71, f14 } { .mfb (p16) cmp.eq.unc p12, p0 = 0, I (p20) ADD4 f15 = f116, f71, f15 br.ctop.sptk.few .L26 } ;; .L28: LDFD f32 = [CLD1], SIZE LDFD f36 = [CLD2], SIZE shladd CST2 = INCY, 1, CST1 ;; LDFD f33 = [CLD1], INCYM1 LDFD f37 = [CLD2], INCYM1 ;; LDFD f34 = [CLD1], SIZE LDFD f38 = [CLD2], SIZE ;; LDFD f35 = [CLD1], INCY3M1 LDFD f39 = [CLD2], INCY3M1 ;; FMA f32 = ALPHA_R, f8, f32 FMA f36 = ALPHA_R, f12, f36 FMA f33 = ALPHA_I, f8, f33 FMA f37 = ALPHA_I, f12, f37 FMA f34 = ALPHA_R, f10, f34 FMA f38 = ALPHA_R, f14, f38 FMA f35 = ALPHA_I, f10, f35 FMA f39 = ALPHA_I, f14, f39 ;; FNMA f32 = ALPHA_I, f9, f32 FNMA f36 = ALPHA_I, f13, f36 FMA f33 = ALPHA_R, f9, f33 FMA f37 = ALPHA_R, f13, f37 FNMA f34 = ALPHA_I, f11, f34 FNMA f38 = ALPHA_I, f15, f38 FMA f35 = ALPHA_R, f11, f35 FMA f39 = ALPHA_R, f15, f39 ;; STFD [CST1] = f32, SIZE STFD [CST2] = f36, SIZE ;; STFD [CST1] = f33 STFD [CST2] = f37 add CST1 = CST1, INCYM1 add CST2 = CST2, INCYM1 ;; STFD [CST1] = f34, SIZE STFD [CST2] = f38, SIZE ;; STFD 
[CST1] = f35 STFD [CST2] = f39 add CST1 = CST1, INCY3M1 add CST2 = CST2, INCY3M1 ;; .align 16 .L30: { .mfi mov AO1 = A mov f8 = f0 mov pr.rot= 0 } { .mfi add AO2 = LDA, A mov f10 = f0 tbit.z p6, p0 = N, 1 } ;; { .mmf adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2 mov f12 = f0 } { .mfb adds I = -1, M mov f14 = f0 (p6) br.cond.dpnt .L40 } ;; { .mfi mov BO = BUFFER mov f9 = f0 mov ar.ec= 5 } { .mmf cmp.eq p16, p0 = r0, r0 shladd A = LDA, 1, A mov f11 = f0 } ;; { .mfi adds WPRE = 16 * SIZE, CLD1 mov f13 = f0 mov ar.lc = I } { .mmf adds PREB = RPREFETCH * SIZE, BO nop __LINE__ mov f15 = f0 } ;; { .mmi lfetch.excl.nt1 [WPRE] cmp.eq p12, p0 = r0, r0 mov I = 0 } ;; .align 16 .L36: { .mmf (p12) PREFETCH [RPRE1], 16 * SIZE (p16) LDFPD f32, f37 = [AO1], 2 * SIZE (p20) ADD1 f8 = f116, f36, f8 } { .mmf (p16) cmp.eq.unc p13, p0 = 4, I (p16) adds I = 1, I (p20) ADD2 f9 = f121, f36, f9 } ;; { .mmf (p12) PREFETCH [PREB], 16 * SIZE (p16) LDFPD f112, f117 = [BO], 2 * SIZE (p20) ADD1 f10 = f116, f46, f10 } { .mmf (p16) cmp.eq.unc p12, p0 = 8, I (p20) ADD2 f11 = f121, f46, f11 } ;; { .mmf (p13) PREFETCH [RPRE2], 16 * SIZE (p16) LDFPD f42, f47 = [AO2], 2 * SIZE (p20) ADD3 f12 = f121, f41, f12 } { .mmf (p12) mov I = 0 (p20) ADD4 f13 = f116, f41, f13 } ;; { .mmf (p20) ADD3 f14 = f121, f51, f14 } { .mfb nop __LINE__ (p20) ADD4 f15 = f116, f51, f15 br.ctop.sptk.few .L36 } ;; .L38: LDFD f32 = [CLD1], SIZE FADD f8 = f8, f12 shladd CST2 = INCY, 1, CST1 ;; LDFD f33 = [CLD1], INCYM1 FADD f10 = f10, f14 ;; LDFD f34 = [CLD1], SIZE FADD f9 = f9, f13 ;; LDFD f35 = [CLD1], INCYM1 FADD f11 = f11, f15 ;; FMA f32 = ALPHA_R, f8, f32 FMA f33 = ALPHA_I, f8, f33 FMA f34 = ALPHA_R, f10, f34 FMA f35 = ALPHA_I, f10, f35 ;; FNMA f32 = ALPHA_I, f9, f32 FMA f33 = ALPHA_R, f9, f33 FNMA f34 = ALPHA_I, f11, f34 FMA f35 = ALPHA_R, f11, f35 ;; STFD [CST1] = f32, SIZE ;; STFD [CST1] = f33 add CST1 = CST1, INCYM1 ;; STFD [CST1] = f34, SIZE ;; STFD [CST1] = f35 add CST1 = CST1, INCYM1 ;; .align 16 .L40: { .mfi mov AO1 = A mov f8 = f0 mov pr.rot= 0 } { .mfi mov f9 = f0 tbit.z p6, p0 = N, 0 } ;; { .mfi adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 mov f10 = f0 mov ar.ec= 5 } { .mfb adds I = -1, M mov f11 = f0 (p6) br.cond.dpnt .L999 } ;; { .mmi cmp.eq p16, p0 = r0, r0 add A = LDA, A mov ar.lc = I } { .mmi adds WPRE = 16 * SIZE, CLD1 adds PREB = RPREFETCH * SIZE, BO mov BO = BUFFER } ;; { .mmi lfetch.excl.nt1 [WPRE] cmp.eq p12, p0 = r0, r0 mov I = 0 } ;; .align 16 .L46: { .mmf (p12) PREFETCH [RPRE1], 16 * SIZE (p16) LDFPD f32, f37 = [AO1], 2 * SIZE (p20) ADD1 f8 = f116, f36, f8 } { .mmf (p16) cmp.eq.unc p12, p0 = 7, I (p16) adds I = 1, I (p20) ADD2 f9 = f121, f36, f9 } ;; { .mmf (p16) LDFPD f112, f117 = [BO], 2 * SIZE (p20) ADD3 f10 = f121, f41, f10 } { .mfb (p12) mov I = 0 (p20) ADD4 f11 = f116, f41, f11 br.ctop.sptk.few .L46 } ;; .L48: LDFD f32 = [CLD1], SIZE FADD f8 = f8, f10 shladd CST2 = INCY, 1, CST1 ;; LDFD f33 = [CLD1], INCYM1 FADD f9 = f9, f11 ;; FMA f32 = ALPHA_R, f8, f32 FMA f33 = ALPHA_I, f8, f33 ;; FNMA f32 = ALPHA_I, f9, f32 FMA f33 = ALPHA_R, f9, f33 ;; STFD [CST1] = f32, SIZE ;; STFD [CST1] = f33 add CST1 = CST1, INCYM1 br .L999 .align 16 ;; .L100: { .mmi mov CLD1 = Y shladd CLD2 = INCY, 1, Y shr J = N, 3 } ;; { .mmb mov CST1 = Y cmp.eq p6, p0 = r0, J (p6) br.cond.dpnt .L120 } ;; .align 16 .L111: { .mfi mov AO1 = A mov f8 = f0 mov pr.rot= 0 } { .mfi add AO2 = LDA, A mov f10 = f0 mov BO = BUFFER } ;; { .mmf shladd AO3 = LDA, 1, A shladd AO4 = LDA, 1, AO2 mov f12 = f0 } { .mmf adds RPRE1 = (RPREFETCH + 0) 
* SIZE, AO1 adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2 mov f14 = f0 } ;; { .mmf shladd AO5 = LDA, 1, AO3 shladd AO6 = LDA, 1, AO4 mov f16 = f0 } { .mmf adds RPRE3 = (RPREFETCH + 4) * SIZE, AO3 adds RPRE4 = (RPREFETCH + 6) * SIZE, AO4 mov f18 = f0 } ;; { .mmf shladd AO7 = LDA, 1, AO5 shladd AO8 = LDA, 1, AO6 mov f20 = f0 } { .mmf adds RPRE5 = (RPREFETCH + 8) * SIZE, AO5 adds RPRE6 = (RPREFETCH + 10) * SIZE, AO6 mov f22 = f0 } ;; { .mfi shladd A = LDA, 3, A mov f9 = f0 mov ar.ec= 5 } { .mmf adds RPRE7 = (RPREFETCH + 12) * SIZE, AO7 adds RPRE8 = (RPREFETCH + 14) * SIZE, AO8 mov f11 = f0 } ;; { .mmf adds WPRE = 16 * SIZE, CLD1 adds PREB = RPREFETCH * SIZE, BO mov f13 = f0 } { .mmf adds I = -1, M cmp.eq p16, p0 = r0, r0 mov f15 = f0 } ;; { .mfi cmp.eq p12, p0 = r0, r0 mov f17 = f0 mov ar.lc = I } { .mmf nop __LINE__ nop __LINE__ mov f19 = f0 } ;; { .mmf lfetch.excl.nt1 [WPRE] nop __LINE__ mov f21 = f0 } { .mmf mov I = 0 nop __LINE__ mov f23 = f0 } ;; .align 16 .L116: { .mmf (p12) PREFETCH [RPRE1], 16 * SIZE (p16) LDFD f32 = [AO1], 1 * SIZE (p20) ADD1 f8 = f116, f36, f8 } { .mmf (p16) cmp.eq.unc p13, p0 = 1, I (p16) cmp.eq.unc p14, p0 = 2, I (p20) ADD2 f9 = f121, f36, f9 } ;; { .mmf (p13) PREFETCH [PREB], 16 * SIZE (p16) LDFPD f112, f117 = [BO], 2 * SIZE (p20) ADD1 f10 = f116, f46, f10 } { .mmf (p16) LDFD f37 = [AO1], 1 * SIZE (p16) cmp.eq.unc p15, p0 = 3, I (p20) ADD2 f11 = f121, f46, f11 } ;; { .mmf (p13) PREFETCH [RPRE2], 16 * SIZE (p16) LDFD f42 = [AO2], 1 * SIZE (p20) ADD1 f12 = f116, f56, f12 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD2 f13 = f121, f56, f13 } ;; { .mmf (p16) LDFD f47 = [AO2], 1 * SIZE nop __LINE__ (p20) ADD1 f14 = f116, f66, f14 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD2 f15 = f121, f66, f15 } ;; { .mmf (p14) PREFETCH [RPRE3], 16 * SIZE (p16) LDFD f52 = [AO3], 1 * SIZE (p20) ADD3 f8 = f121, f41, f8 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD4 f9 = f116, f41, f9 } ;; { .mmf (p16) LDFD f57 = [AO3], 1 * SIZE nop __LINE__ (p20) ADD3 f10 = f121, f51, f10 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD4 f11 = f116, f51, f11 } ;; { .mmf (p15) PREFETCH [RPRE4], 16 * SIZE (p16) LDFD f62 = [AO4], 1 * SIZE (p20) ADD3 f12 = f121, f61, f12 } { .mmf (p16) cmp.eq.unc p12, p0 = 4, I (p16) cmp.eq.unc p13, p0 = 5, I (p20) ADD4 f13 = f116, f61, f13 } ;; { .mmf (p16) LDFD f67 = [AO4], 1 * SIZE nop __LINE__ (p20) ADD3 f14 = f121, f71, f14 } { .mmf (p16) cmp.eq.unc p14, p0 = 6, I (p16) cmp.eq.unc p15, p0 = 7, I (p20) ADD4 f15 = f116, f71, f15 } ;; { .mmf (p12) PREFETCH [RPRE5], 16 * SIZE (p16) LDFD f72 = [AO5], 1 * SIZE (p20) ADD1 f16 = f116, f76, f16 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD2 f17 = f121, f76, f17 } ;; { .mmf (p16) LDFD f77 = [AO5], 1 * SIZE nop __LINE__ (p20) ADD1 f18 = f116, f86, f18 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD2 f19 = f121, f86, f19 } ;; { .mmf (p13) PREFETCH [RPRE6], 16 * SIZE (p16) LDFD f82 = [AO6], 1 * SIZE (p20) ADD1 f20 = f116, f96, f20 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD2 f21 = f121, f96, f21 } ;; { .mmf (p16) LDFD f87 = [AO6], 1 * SIZE nop __LINE__ (p20) ADD1 f22 = f116, f106, f22 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD2 f23 = f121, f106, f23 } ;; { .mmf (p14) PREFETCH [RPRE7], 16 * SIZE (p16) LDFD f92 = [AO7], 1 * SIZE (p20) ADD3 f16 = f121, f81, f16 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD4 f17 = f116, f81, f17 } ;; { .mmf (p16) LDFD f97 = [AO7], 1 * SIZE nop __LINE__ (p20) ADD3 f18 = f121, f91, f18 } { .mmf nop __LINE__ (p16) adds I = 1, I (p20) ADD4 f19 = f116, f91, f19 } ;; { .mmf (p15) PREFETCH [RPRE8], 16 * SIZE (p16) LDFD 
f102 = [AO8], 1 * SIZE (p20) ADD3 f20 = f121, f101, f20 } { .mmf (p15) mov I = 0 nop __LINE__ (p20) ADD4 f21 = f116, f101, f21 } ;; { .mmf (p16) LDFD f107 = [AO8], 1 * SIZE nop __LINE__ (p20) ADD3 f22 = f121, f111, f22 } { .mfb (p16) cmp.eq.unc p12, p0 = 0, I (p20) ADD4 f23 = f116, f111, f23 br.ctop.sptk.few .L116 } ;; .L118: LDFD f32 = [CLD1], SIZE LDFD f36 = [CLD2], SIZE shladd CST2 = INCY, 1, CST1 ;; LDFD f33 = [CLD1], INCYM1 LDFD f37 = [CLD2], INCYM1 ;; LDFD f34 = [CLD1], SIZE LDFD f38 = [CLD2], SIZE ;; LDFD f35 = [CLD1], INCY3M1 LDFD f39 = [CLD2], INCY3M1 ;; LDFD f40 = [CLD1], SIZE LDFD f44 = [CLD2], SIZE ;; LDFD f41 = [CLD1], INCYM1 LDFD f45 = [CLD2], INCYM1 ;; LDFD f42 = [CLD1], SIZE LDFD f46 = [CLD2], SIZE ;; LDFD f43 = [CLD1], INCY3M1 LDFD f47 = [CLD2], INCY3M1 ;; FMA f32 = ALPHA_R, f8, f32 FMA f36 = ALPHA_R, f12, f36 FMA f33 = ALPHA_I, f8, f33 FMA f37 = ALPHA_I, f12, f37 FMA f34 = ALPHA_R, f10, f34 FMA f38 = ALPHA_R, f14, f38 FMA f35 = ALPHA_I, f10, f35 FMA f39 = ALPHA_I, f14, f39 ;; FNMA f32 = ALPHA_I, f9, f32 FNMA f36 = ALPHA_I, f13, f36 FMA f33 = ALPHA_R, f9, f33 FMA f37 = ALPHA_R, f13, f37 FNMA f34 = ALPHA_I, f11, f34 FNMA f38 = ALPHA_I, f15, f38 FMA f35 = ALPHA_R, f11, f35 FMA f39 = ALPHA_R, f15, f39 ;; FMA f40 = ALPHA_R, f16, f40 FMA f44 = ALPHA_R, f20, f44 FMA f41 = ALPHA_I, f16, f41 FMA f45 = ALPHA_I, f20, f45 FMA f42 = ALPHA_R, f18, f42 FMA f46 = ALPHA_R, f22, f46 FMA f43 = ALPHA_I, f18, f43 FMA f47 = ALPHA_I, f22, f47 ;; { .mmf STFD [CST1] = f32, SIZE STFD [CST2] = f36, SIZE FNMA f40 = ALPHA_I, f17, f40 } { .mmf nop __LINE__ nop __LINE__ FNMA f44 = ALPHA_I, f21, f44 } ;; { .mmf STFD [CST1] = f33 STFD [CST2] = f37 FMA f41 = ALPHA_R, f17, f41 } { .mmf add CST1 = CST1, INCYM1 add CST2 = CST2, INCYM1 FMA f45 = ALPHA_R, f21, f45 } ;; { .mmf STFD [CST1] = f34, SIZE STFD [CST2] = f38, SIZE FNMA f42 = ALPHA_I, f19, f42 } { .mmf nop __LINE__ nop __LINE__ FNMA f46 = ALPHA_I, f23, f46 } ;; { .mmf STFD [CST1] = f35 STFD [CST2] = f39 FMA f43 = ALPHA_R, f19, f43 } { .mmf add CST1 = CST1, INCY3M1 add CST2 = CST2, INCY3M1 FMA f47 = ALPHA_R, f23, f47 } ;; { .mmi STFD [CST1] = f40, SIZE STFD [CST2] = f44, SIZE adds J = -1, J } ;; { .mmi STFD [CST1] = f41 STFD [CST2] = f45 add CST1 = CST1, INCYM1 } { .mmi nop __LINE__ nop __LINE__ add CST2 = CST2, INCYM1 } ;; { .mmi STFD [CST1] = f42, SIZE STFD [CST2] = f46, SIZE cmp.lt p6, p0 = 0, J } ;; { .mmi STFD [CST1] = f43 STFD [CST2] = f47 add CST1 = CST1, INCY3M1 } { .mmb add CST2 = CST2, INCY3M1 (p6) br.cond.dptk .L111 } ;; .align 16 .L120: { .mfi mov AO1 = A mov f8 = f0 mov pr.rot= 0 } { .mfi add AO2 = LDA, A mov f10 = f0 tbit.z p6, p0 = N, 2 } ;; { .mmf shladd AO3 = LDA, 1, A shladd AO4 = LDA, 1, AO2 mov f12 = f0 } { .mfb mov BO = BUFFER mov f14 = f0 (p6) br.cond.dpnt .L130 } ;; { .mfi adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 mov f9 = f0 mov ar.ec= 5 } { .mmf adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2 adds I = -1, M mov f11 = f0 } ;; { .mmf adds RPRE3 = (RPREFETCH + 4) * SIZE, AO3 adds RPRE4 = (RPREFETCH + 6) * SIZE, AO4 mov f13 = f0 } { .mmf cmp.eq p16, p0 = r0, r0 shladd A = LDA, 2, A mov f15 = f0 } ;; { .mmi lfetch.excl.nt1 [WPRE] adds PREB = RPREFETCH * SIZE, BO mov ar.lc = I } { .mmi adds WPRE = 16 * SIZE, CLD1 cmp.eq p12, p0 = r0, r0 mov I = 0 } ;; .align 16 .L126: { .mmf (p12) PREFETCH [RPRE1], 16 * SIZE (p16) LDFD f32 = [AO1], 1 * SIZE (p20) ADD1 f8 = f116, f36, f8 } { .mmf (p16) cmp.eq.unc p13, p0 = 2, I (p16) cmp.eq.unc p14, p0 = 4, I (p20) ADD2 f9 = f121, f36, f9 } ;; { .mmf (p12) PREFETCH [PREB], 16 * SIZE (p16) LDFPD f112, f117 = 
[BO], 2 * SIZE (p20) ADD1 f10 = f116, f46, f10 } { .mmf (p16) LDFD f37 = [AO1], 1 * SIZE (p16) cmp.eq.unc p15, p0 = 6, I (p20) ADD2 f11 = f121, f46, f11 } ;; { .mmf (p16) LDFD f42 = [AO2], 1 * SIZE nop __LINE__ (p20) ADD1 f12 = f116, f56, f12 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD2 f13 = f121, f56, f13 } ;; { .mmf (p13) PREFETCH [RPRE2], 16 * SIZE (p16) LDFD f47 = [AO2], 1 * SIZE (p20) ADD1 f14 = f116, f66, f14 } { .mmf nop __LINE__ nop __LINE__ (p20) ADD2 f15 = f121, f66, f15 } ;; { .mmf (p16) LDFD f52 = [AO3], 1 * SIZE nop __LINE__ (p20) ADD3 f8 = f121, f41, f8 } { .mmf nop __LINE__ (p16) adds I = 1, I (p20) ADD4 f9 = f116, f41, f9 } ;; { .mmf (p14) PREFETCH [RPRE3], 16 * SIZE (p16) LDFD f57 = [AO3], 1 * SIZE (p20) ADD3 f10 = f121, f51, f10 } { .mmf nop __LINE__ (p16) cmp.eq.unc p15, p0 = 8, I (p20) ADD4 f11 = f116, f51, f11 } ;; { .mmf (p16) LDFD f62 = [AO4], 1 * SIZE nop __LINE__ (p20) ADD3 f12 = f121, f61, f12 } { .mmf (p15) mov I = 0 nop __LINE__ (p20) ADD4 f13 = f116, f61, f13 } ;; { .mmf (p15) PREFETCH [RPRE4], 16 * SIZE (p16) LDFD f67 = [AO4], 1 * SIZE (p20) ADD3 f14 = f121, f71, f14 } { .mfb (p16) cmp.eq.unc p12, p0 = 0, I (p20) ADD4 f15 = f116, f71, f15 br.ctop.sptk.few .L126 } ;; .L128: LDFD f32 = [CLD1], SIZE LDFD f36 = [CLD2], SIZE shladd CST2 = INCY, 1, CST1 ;; LDFD f33 = [CLD1], INCYM1 LDFD f37 = [CLD2], INCYM1 ;; LDFD f34 = [CLD1], SIZE LDFD f38 = [CLD2], SIZE ;; LDFD f35 = [CLD1], INCY3M1 LDFD f39 = [CLD2], INCY3M1 ;; FMA f32 = ALPHA_R, f8, f32 FMA f36 = ALPHA_R, f12, f36 FMA f33 = ALPHA_I, f8, f33 FMA f37 = ALPHA_I, f12, f37 FMA f34 = ALPHA_R, f10, f34 FMA f38 = ALPHA_R, f14, f38 FMA f35 = ALPHA_I, f10, f35 FMA f39 = ALPHA_I, f14, f39 ;; FNMA f32 = ALPHA_I, f9, f32 FNMA f36 = ALPHA_I, f13, f36 FMA f33 = ALPHA_R, f9, f33 FMA f37 = ALPHA_R, f13, f37 FNMA f34 = ALPHA_I, f11, f34 FNMA f38 = ALPHA_I, f15, f38 FMA f35 = ALPHA_R, f11, f35 FMA f39 = ALPHA_R, f15, f39 ;; STFD [CST1] = f32, SIZE STFD [CST2] = f36, SIZE ;; STFD [CST1] = f33 STFD [CST2] = f37 add CST1 = CST1, INCYM1 add CST2 = CST2, INCYM1 ;; STFD [CST1] = f34, SIZE STFD [CST2] = f38, SIZE ;; STFD [CST1] = f35 STFD [CST2] = f39 add CST1 = CST1, INCY3M1 add CST2 = CST2, INCY3M1 ;; .align 16 .L130: { .mfi mov AO1 = A mov f8 = f0 mov pr.rot= 0 } { .mfi add AO2 = LDA, A mov f10 = f0 tbit.z p6, p0 = N, 1 } ;; { .mmf adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2 mov f12 = f0 } { .mfb adds I = -1, M mov f14 = f0 (p6) br.cond.dpnt .L140 } ;; { .mfi mov BO = BUFFER mov f9 = f0 mov ar.ec= 5 } { .mmf cmp.eq p16, p0 = r0, r0 shladd A = LDA, 1, A mov f11 = f0 } ;; { .mfi adds WPRE = 16 * SIZE, CLD1 mov f13 = f0 mov ar.lc = I } { .mmf adds PREB = RPREFETCH * SIZE, BO nop __LINE__ mov f15 = f0 } ;; { .mmi lfetch.excl.nt1 [WPRE] cmp.eq p12, p0 = r0, r0 mov I = 0 } ;; .align 16 .L136: { .mmf (p12) PREFETCH [RPRE1], 16 * SIZE (p16) LDFD f32 = [AO1], 1 * SIZE (p20) ADD1 f8 = f116, f36, f8 } { .mmf (p16) cmp.eq.unc p13, p0 = 4, I (p16) adds I = 1, I (p20) ADD2 f9 = f121, f36, f9 } ;; { .mmf (p12) PREFETCH [PREB], 16 * SIZE (p16) LDFPD f112, f117 = [BO], 2 * SIZE (p20) ADD1 f10 = f116, f46, f10 } { .mmf (p16) LDFD f37 = [AO1], 1 * SIZE (p16) cmp.eq.unc p12, p0 = 8, I (p20) ADD2 f11 = f121, f46, f11 } ;; { .mmf (p13) PREFETCH [RPRE2], 16 * SIZE (p16) LDFD f42 = [AO2], 1 * SIZE (p20) ADD3 f12 = f121, f41, f12 } { .mmf (p12) mov I = 0 nop __LINE__ (p20) ADD4 f13 = f116, f41, f13 } ;; { .mmf (p16) LDFD f47 = [AO2], 1 * SIZE nop __LINE__ (p20) ADD3 f14 = f121, f51, f14 } { .mfb nop __LINE__ (p20) 
ADD4 f15 = f116, f51, f15 br.ctop.sptk.few .L136 } ;; .L138: LDFD f32 = [CLD1], SIZE FADD f8 = f8, f12 shladd CST2 = INCY, 1, CST1 ;; LDFD f33 = [CLD1], INCYM1 FADD f10 = f10, f14 ;; LDFD f34 = [CLD1], SIZE FADD f9 = f9, f13 ;; LDFD f35 = [CLD1], INCYM1 FADD f11 = f11, f15 ;; FMA f32 = ALPHA_R, f8, f32 FMA f33 = ALPHA_I, f8, f33 FMA f34 = ALPHA_R, f10, f34 FMA f35 = ALPHA_I, f10, f35 ;; FNMA f32 = ALPHA_I, f9, f32 FMA f33 = ALPHA_R, f9, f33 FNMA f34 = ALPHA_I, f11, f34 FMA f35 = ALPHA_R, f11, f35 ;; STFD [CST1] = f32, SIZE ;; STFD [CST1] = f33 add CST1 = CST1, INCYM1 ;; STFD [CST1] = f34, SIZE ;; STFD [CST1] = f35 add CST1 = CST1, INCYM1 ;; .align 16 .L140: { .mfi mov AO1 = A mov f8 = f0 mov pr.rot= 0 } { .mfi mov f9 = f0 tbit.z p6, p0 = N, 0 } ;; { .mfi adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 mov f10 = f0 mov ar.ec= 5 } { .mfb adds I = -1, M mov f11 = f0 (p6) br.cond.dpnt .L999 } ;; { .mmi cmp.eq p16, p0 = r0, r0 shladd A = LDA, 1, A mov ar.lc = I } { .mmi adds WPRE = 16 * SIZE, CLD1 adds PREB = RPREFETCH * SIZE, BO mov BO = BUFFER } ;; { .mmi lfetch.excl.nt1 [WPRE] cmp.eq p12, p0 = r0, r0 mov I = 0 } ;; .align 16 .L146: { .mmf (p12) PREFETCH [RPRE1], 16 * SIZE (p16) LDFD f32 = [AO1], 1 * SIZE (p20) ADD1 f8 = f116, f36, f8 } { .mmf (p16) cmp.eq.unc p12, p0 = 7, I (p16) adds I = 1, I (p20) ADD2 f9 = f121, f36, f9 } ;; { .mmf (p16) LDFPD f112, f117 = [BO], 2 * SIZE (p16) LDFD f37 = [AO1], 1 * SIZE (p20) ADD3 f10 = f121, f41, f10 } { .mfb (p12) mov I = 0 (p20) ADD4 f11 = f116, f41, f11 br.ctop.sptk.few .L146 } ;; .L148: LDFD f32 = [CLD1], SIZE FADD f8 = f8, f10 shladd CST2 = INCY, 1, CST1 ;; LDFD f33 = [CLD1], INCYM1 FADD f9 = f9, f11 ;; FMA f32 = ALPHA_R, f8, f32 FMA f33 = ALPHA_I, f8, f33 ;; FNMA f32 = ALPHA_I, f9, f32 FMA f33 = ALPHA_R, f9, f33 ;; STFD [CST1] = f32, SIZE ;; STFD [CST1] = f33 add CST1 = CST1, INCYM1 ;; .align 16 .L999: mov r8 = r0 adds r9 = 1 * 16, SP ;; ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 mov ar.lc = ARLC ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 mov pr = PR, -1 ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 mov ar.pfs = ARPFS ;; ldf.fill f22 = [SP], 32 ldf.fill f23 = [r9] br.ret.sptk.many b0 ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/zrot.S000066400000000000000000000365061313527062700163230ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef XDOUBLE #define PREFETCH_SIZE ( 8 * 8 + 4) #elif defined(DOUBLE) #define PREFETCH_SIZE (16 * 8 + 8) #else #define PREFETCH_SIZE (32 * 8 + 16) #endif #define N r32 #define X1 r33 #define INCX r34 #define Y1 r35 #define INCY r36 #define PREX r2 #define PREY r3 #define I r14 #define J r15 #define Y2 r16 #define X2 r17 #define INCX16 r18 #define INCY16 r19 #define PR r30 #define ARLC r31 #define C f8 #define S f9 PROLOGUE .prologue PROFCODE { .mmi adds r29 = 16, r12 add INCX = INCX, INCX .save ar.lc, ARLC mov ARLC = ar.lc } { .mib cmp.lt p0, p6 = r0, N shr I = N, 3 (p6) br.ret.spnt.many b0 } ;; .body { .mmi #ifdef XDOUBLE LDFD S = [r29] #else nop __LINE__ #endif add INCY = INCY, INCY mov PR = pr } { .mmi mov X2 = X1 mov Y2 = Y1 mov pr.rot= 0 } ;; { .mmi shladd INCX = INCX, BASE_SHIFT, r0 shladd INCY = INCY, BASE_SHIFT, r0 mov ar.ec= 3 } { .mmi adds I = -1, I cmp.eq p16, p0 = r0, r0 and J = 7, N } ;; { .mmi #ifndef XDOUBLE shladd INCX16 = INCX, 3, r0 shladd INCY16 = INCY, 3, r0 #else shladd INCX16 = INCX, 2, r0 shladd INCY16 = INCY, 2, r0 #endif nop __LINE__ } { .mmi adds INCX = -SIZE, INCX adds INCY = -SIZE, INCY nop __LINE__ } ;; { .mmi adds PREX = PREFETCH_SIZE * SIZE, X1 adds PREY = PREFETCH_SIZE * SIZE, Y1 mov ar.lc = I } { .mib cmp.eq p6 ,p0 = -1, I tbit.z p0, p12 = N, 2 (p6) br.cond.dpnt .L15 } ;; .align 32 .L12: { .mmf (p19) STFD [Y2] = f15 (p16) lfetch.excl.nt1 [PREX], INCX16 (p18) FMPY f15 = C, f91 } { .mmf (p16) LDFD f32 = [X1], SIZE (p19) add Y2 = Y2, INCY (p18) FNMA f11 = S, f37, f11 } ;; { .mmf (p18) STFD [X2] = f6 (p16) lfetch.excl.nt1 [PREY], INCY16 (p18) FMA f12 = C, f40, f12 } { .mmf (p17) LDFD f114 = [Y1], INCY (p18) adds X2 = SIZE, X2 (p18) FMPY f6 = S, f94 } ;; { .mmf (p18) STFD [Y2] = f7 (p16) LDFD f35 = [X1], INCX (p18) FNMA f13 = S, f40, f13 } { .mmf nop __LINE__ (p18) adds Y2 = SIZE, Y2 (p18) FMPY f7 = C, f94 } ;; { .mmf (p18) STFD [X2] = f10 (p17) LDFD f117 = [Y1], SIZE (p18) FMA f14 = C, f43, f14 } { .mmf (p18) add X2 = X2, INCX nop __LINE__ (p18) FMPY f10 = S, f97 } ;; { .mmf (p18) STFD [Y2] = f11 (p16) LDFD f38 = [X1], SIZE (p18) FNMA f15 = S, f43, f15 } { .mmf (p18) add Y2 = Y2, INCY nop __LINE__ (p18) FMPY f11 = C, f97 } ;; { .mmf (p18) STFD [X2] = f12 (p17) LDFD f120 = [Y1], INCY (p18) FMPY f12 = S, f100 } { .mmf (p18) adds X2 = SIZE, X2 nop __LINE__ (p18) FMA f6 = C, f46, f6 } ;; { .mmf (p18) STFD [Y2] = f13 (p16) LDFD f41 = [X1], INCX (p18) FMPY f13 = C, f100 } { .mmf (p18) adds Y2 = SIZE, Y2 nop __LINE__ (p18) FNMA f7 = S, f46, f7 } ;; { .mmf (p18) STFD [X2] = f14 (p17) LDFD f123 = [Y1], SIZE (p18) FMPY f14 = S, f103 } 
{ .mmf (p18) add X2 = X2, INCX nop __LINE__ (p18) FMA f10 = C, f49, f10 } ;; { .mmf (p18) STFD [Y2] = f15 (p16) LDFD f44 = [X1], SIZE (p18) FMPY f15 = C, f103 } { .mmf (p18) add Y2 = Y2, INCY nop __LINE__ (p18) FNMA f11 = S, f49, f11 } ;; { .mmf (p18) STFD [X2] = f6 (p17) LDFD f126 = [Y1], INCY (p18) FMA f12 = C, f52, f12 } { .mmf (p18) adds X2 = SIZE, X2 nop __LINE__ (p18) FMPY f6 = S, f106 } ;; { .mmf (p18) STFD [Y2] = f7 (p16) LDFD f47 = [X1], INCX (p18) FNMA f13 = S, f52, f13 } { .mmf (p18) adds Y2 = SIZE, Y2 nop __LINE__ (p18) FMPY f7 = C, f106 } ;; { .mmf (p18) STFD [X2] = f10 (p16) LDFD f80 = [Y1], SIZE (p18) FMA f14 = C, f55, f14 } { .mmf (p18) add X2 = X2, INCX nop __LINE__ (p18) FMPY f10 = S, f109 } ;; { .mmf (p18) STFD [Y2] = f11 (p16) LDFD f50 = [X1], SIZE (p18) FNMA f15 = S, f55, f15 } { .mmf (p18) add Y2 = Y2, INCY nop __LINE__ (p18) FMPY f11 = C, f109 } ;; { .mmf (p18) STFD [X2] = f12 (p16) LDFD f83 = [Y1], INCY (p18) FMPY f12 = S, f112 } { .mmf (p18) adds X2 = SIZE, X2 nop __LINE__ (p18) FMA f6 = C, f58, f6 } ;; { .mmf (p18) STFD [Y2] = f13 (p16) LDFD f53 = [X1], INCX (p18) FMPY f13 = C, f112 } { .mmf (p18) adds Y2 = SIZE, Y2 nop __LINE__ (p18) FNMA f7 = S, f58, f7 } ;; { .mmf (p18) STFD [X2] = f14 (p16) LDFD f86 = [Y1], SIZE (p18) FMPY f14 = S, f115 } { .mmf (p18) add X2 = X2, INCX nop __LINE__ (p18) FMA f10 = C, f61, f10 } ;; { .mmf (p18) STFD [Y2] = f15 (p16) LDFD f56 = [X1], SIZE (p18) FMPY f15 = C, f115 } { .mmf (p18) add Y2 = Y2, INCY nop __LINE__ (p18) FNMA f11 = S, f61, f11 } ;; #ifndef XDOUBLE { .mmf (p18) STFD [X2] = f6 (p16) LDFD f89 = [Y1], INCY (p18) FMA f12 = C, f64, f12 } { .mmf (p18) adds X2 = SIZE, X2 nop __LINE__ (p18) FMPY f6 = S, f118 } ;; { .mmf (p18) STFD [Y2] = f7 (p16) LDFD f59 = [X1], INCX (p18) FNMA f13 = S, f64, f13 } { .mmf (p18) adds Y2 = SIZE, Y2 nop __LINE__ (p18) FMPY f7 = C, f118 } ;; #else { .mmf (p18) STFD [X2] = f6 (p16) lfetch.excl.nt1 [PREY], INCY16 (p18) FMA f12 = C, f64, f12 } { .mmf (p16) LDFD f89 = [Y1], INCY (p18) adds X2 = SIZE, X2 (p18) FMPY f6 = S, f118 } ;; { .mmf (p18) STFD [Y2] = f7 (p16) lfetch.excl.nt1 [PREX], INCX16 (p18) FNMA f13 = S, f64, f13 } { .mmf (p16) LDFD f59 = [X1], INCX (p18) adds Y2 = SIZE, Y2 (p18) FMPY f7 = C, f118 } ;; #endif { .mmf (p18) STFD [X2] = f10 (p16) LDFD f92 = [Y1], SIZE (p18) FMA f14 = C, f67, f14 } { .mmf (p18) add X2 = X2, INCX nop __LINE__ (p18) FMPY f10 = S, f121 } ;; { .mmf (p18) STFD [Y2] = f11 (p16) LDFD f62 = [X1], SIZE (p18) FNMA f15 = S, f67, f15 } { .mmf (p18) add Y2 = Y2, INCY nop __LINE__ (p18) FMPY f11 = C, f121 } ;; { .mmf (p18) STFD [X2] = f12 (p16) LDFD f95 = [Y1], INCY (p18) FMPY f12 = S, f124 } { .mmf (p18) adds X2 = SIZE, X2 nop __LINE__ (p18) FMA f6 = C, f70, f6 } ;; { .mmf (p18) STFD [Y2] = f13 (p16) LDFD f65 = [X1], INCX (p18) FMPY f13 = C, f124 } { .mmf (p18) adds Y2 = SIZE, Y2 nop __LINE__ (p18) FNMA f7 = S, f70, f7 } ;; { .mmf (p18) STFD [X2] = f14 (p16) LDFD f98 = [Y1], SIZE (p18) FMPY f14 = S, f127 } { .mmf (p18) add X2 = X2, INCX nop __LINE__ (p18) FMA f10 = C, f73, f10 } ;; { .mmf (p18) STFD [Y2] = f15 (p16) LDFD f68 = [X1], SIZE (p18) FMPY f15 = C, f127 } { .mmf (p18) add Y2 = Y2, INCY nop __LINE__ (p18) FNMA f11 = S, f73, f11 } ;; { .mmf (p18) STFD [X2] = f6 (p16) LDFD f101 = [Y1], INCY (p18) FMA f12 = C, f76, f12 } { .mmf (p18) adds X2 = SIZE, X2 nop __LINE__ (p17) FMPY f6 = S, f81 } ;; { .mmf (p18) STFD [Y2] = f7 (p16) LDFD f71 = [X1], INCX (p18) FNMA f13 = S, f76, f13 } { .mmf (p18) adds Y2 = SIZE, Y2 nop __LINE__ (p17) FMPY f7 = C, f81 } ;; { .mmf (p18) STFD 
[X2] = f10 (p16) LDFD f104 = [Y1], SIZE (p18) FMA f14 = C, f79, f14 } { .mmf (p18) add X2 = X2, INCX nop __LINE__ (p17) FMPY f10 = S, f84 } ;; { .mmf (p18) STFD [Y2] = f11 (p16) LDFD f74 = [X1], SIZE (p18) FNMA f15 = S, f79, f15 } { .mmf (p18) add Y2 = Y2, INCY nop __LINE__ (p17) FMPY f11 = C, f84 } ;; { .mmf (p18) STFD [X2] = f12 (p16) LDFD f107 = [Y1], INCY (p17) FMPY f12 = S, f87 } { .mmf (p18) adds X2 = SIZE, X2 nop __LINE__ (p17) FMA f6 = C, f33, f6 } ;; { .mmf (p18) STFD [Y2] = f13 (p16) LDFD f77 = [X1], INCX (p17) FMPY f13 = C, f87 } { .mmf (p18) adds Y2 = SIZE, Y2 nop __LINE__ (p17) FNMA f7 = S, f33, f7 } ;; { .mmf (p18) STFD [X2] = f14 (p16) LDFD f110 = [Y1], SIZE (p17) FMPY f14 = S, f90 } { .mfb (p18) add X2 = X2, INCX (p17) FMA f10 = C, f36, f10 br.ctop.sptk.few .L12 } ;; { .mmi (p19) STFD [Y2] = f15 (p19) add Y2 = Y2, INCY nop __LINE__ } { .mmi nop __LINE__ nop __LINE__ nop __LINE__ } ;; .align 32 .L15: { .mmi (p12) LDFD f40 = [Y1], SIZE (p12) LDFD f32 = [X1], SIZE mov ar.lc = ARLC } ;; { .mmi (p12) LDFD f41 = [Y1], INCY (p12) LDFD f33 = [X1], INCX mov pr = PR, -65474 } ;; { .mmb (p12) LDFD f42 = [Y1], SIZE cmp.eq p7, p0 = r0, J (p7) br.ret.sptk.many b0 } ;; { .mmf (p12) LDFD f43 = [Y1], INCY nop __LINE__ (p12) FMPY f6 = S, f40 } ;; { .mmf (p12) LDFD f34 = [X1], SIZE nop __LINE__ (p12) FMPY f7 = C, f40 } ;; { .mmf (p12) LDFD f44 = [Y1], SIZE nop __LINE__ (p12) FMPY f10 = S, f41 } ;; { .mmf (p12) LDFD f35 = [X1], INCX nop __LINE__ (p12) FMPY f11 = C, f41 } ;; { .mmf (p12) LDFD f45 = [Y1], INCY nop __LINE__ (p12) FMPY f12 = S, f42 } { .mmf nop __LINE__ nop __LINE__ (p12) FMA f6 = C, f32, f6 } ;; { .mmf (p12) LDFD f36 = [X1], SIZE nop __LINE__ (p12) FMPY f13 = C, f42 } { .mmf nop __LINE__ nop __LINE__ (p12) FNMA f7 = S, f32, f7 } ;; { .mmf (p12) LDFD f46 = [Y1], SIZE nop __LINE__ (p12) FMPY f14 = S, f43 } { .mmf nop __LINE__ nop __LINE__ (p12) FMA f10 = C, f33, f10 } ;; { .mmf (p12) LDFD f37 = [X1], INCX nop __LINE__ (p12) FMPY f15 = C, f43 } { .mmf nop __LINE__ nop __LINE__ (p12) FNMA f11 = S, f33, f11 } ;; { .mmf (p12) STFD [X2] = f6, SIZE (p12) LDFD f47 = [Y1], INCY (p12) FMA f12 = C, f34, f12 } { .mfi nop __LINE__ (p12) FMPY f6 = S, f44 tbit.z p0, p13 = N, 1 } ;; { .mmf (p12) STFD [Y2] = f7, SIZE (p12) LDFD f38 = [X1], SIZE (p12) FNMA f13 = S, f34, f13 } { .mmf nop __LINE__ nop __LINE__ (p12) FMPY f7 = C, f44 } ;; { .mmf (p12) STFD [X2] = f10 (p13) LDFD f52 = [Y1], SIZE (p12) FMA f14 = C, f35, f14 } { .mmf (p12) add X2 = X2, INCX nop __LINE__ (p12) FMPY f10 = S, f45 } ;; { .mmf (p12) STFD [Y2] = f11 (p12) LDFD f39 = [X1], INCX (p12) FNMA f15 = S, f35, f15 } { .mmf (p12) add Y2 = Y2, INCY nop __LINE__ (p12) FMPY f11 = C, f45 } ;; { .mmf (p12) STFD [X2] = f12, SIZE (p13) LDFD f53 = [Y1], INCY (p12) FMPY f12 = S, f46 } { .mmf nop __LINE__ nop __LINE__ (p12) FMA f6 = C, f36, f6 } ;; { .mmf (p12) STFD [Y2] = f13, SIZE (p13) LDFD f48 = [X1], SIZE (p12) FMPY f13 = C, f46 } { .mmf nop __LINE__ nop __LINE__ (p12) FNMA f7 = S, f36, f7 } ;; { .mmf (p12) STFD [X2] = f14 (p13) LDFD f54 = [Y1], SIZE (p12) FMPY f14 = S, f47 } { .mmf (p12) add X2 = X2, INCX nop __LINE__ (p12) FMA f10 = C, f37, f10 } ;; { .mmf (p12) STFD [Y2] = f15 (p13) LDFD f49 = [X1], INCX (p12) FMPY f15 = C, f47 } { .mfi (p12) add Y2 = Y2, INCY (p12) FNMA f11 = S, f37, f11 tbit.z p0, p14 = N, 0 } ;; { .mmf (p12) STFD [X2] = f6, SIZE (p13) LDFD f55 = [Y1], INCY (p12) FMA f12 = C, f38, f12 } { .mmf nop __LINE__ nop __LINE__ (p13) FMPY f6 = S, f52 } ;; { .mmf (p12) STFD [Y2] = f7, SIZE (p13) LDFD f50 = [X1], SIZE (p12) FNMA 
f13 = S, f38, f13 } { .mmf nop __LINE__ nop __LINE__ (p13) FMPY f7 = C, f52 } ;; { .mmf (p12) STFD [X2] = f10 (p14) LDFD f58 = [Y1], SIZE (p12) FMA f14 = C, f39, f14 } { .mmf (p12) add X2 = X2, INCX nop __LINE__ (p13) FMPY f10 = S, f53 } ;; { .mmf (p12) STFD [Y2] = f11 (p13) LDFD f51 = [X1], INCX (p12) FNMA f15 = S, f39, f15 } { .mmf (p12) add Y2 = Y2, INCY nop __LINE__ (p13) FMPY f11 = C, f53 } ;; { .mmf (p12) STFD [X2] = f12, SIZE (p14) LDFD f59 = [Y1], INCY (p13) FMPY f12 = S, f54 } { .mmf nop __LINE__ nop __LINE__ (p13) FMA f6 = C, f48, f6 } ;; { .mmf (p12) STFD [Y2] = f13, SIZE (p14) LDFD f56 = [X1], SIZE (p13) FMPY f13 = C, f54 } { .mmf nop __LINE__ nop __LINE__ (p13) FNMA f7 = S, f48, f7 } ;; { .mmf (p12) STFD [X2] = f14 (p12) add X2 = X2, INCX (p13) FMPY f14 = S, f55 } { .mmf nop __LINE__ nop __LINE__ (p13) FMA f10 = C, f49, f10 } ;; { .mmf (p12) STFD [Y2] = f15 (p14) LDFD f57 = [X1], INCX (p13) FMPY f15 = C, f55 } { .mmf (p12) add Y2 = Y2, INCY nop __LINE__ (p13) FNMA f11 = S, f49, f11 } ;; { .mmf (p13) STFD [X2] = f6, SIZE nop __LINE__ (p13) FMA f12 = C, f50, f12 } { .mmf nop __LINE__ nop __LINE__ (p14) FMPY f6 = S, f58 } ;; { .mmf (p13) STFD [Y2] = f7, SIZE nop __LINE__ (p13) FNMA f13 = S, f50, f13 } { .mmf nop __LINE__ nop __LINE__ (p14) FMPY f7 = C, f58 } ;; { .mmf (p13) STFD [X2] = f10 (p13) add X2 = X2, INCX (p13) FMA f14 = C, f51, f14 } { .mmf nop __LINE__ nop __LINE__ (p14) FMPY f10 = S, f59 } ;; { .mmf (p13) STFD [Y2] = f11 (p13) add Y2 = Y2, INCY (p13) FNMA f15 = S, f51, f15 } { .mmf nop __LINE__ nop __LINE__ (p14) FMPY f11 = C, f59 } ;; { .mmf (p13) STFD [X2] = f12, SIZE nop __LINE__ (p14) FMA f6 = C, f56, f6 } ;; { .mmf (p13) STFD [Y2] = f13, SIZE nop __LINE__ (p14) FNMA f7 = S, f56, f7 } ;; { .mmf (p13) STFD [X2] = f14 (p13) add X2 = X2, INCX (p14) FMA f10 = C, f57, f10 } ;; { .mmf (p13) STFD [Y2] = f15 (p13) add Y2 = Y2, INCY (p14) FNMA f11 = S, f57, f11 } ;; { .mmi (p14) STFD [X2] = f6, SIZE (p14) STFD [Y2] = f7, SIZE nop __LINE__ } ;; { .mmb (p14) STFD [X2] = f10 (p14) STFD [Y2] = f11 br.ret.sptk.many b0 } ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/zscal.S000066400000000000000000000251261313527062700164350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef XDOUBLE #define PREFETCH_SIZE ( 8 * 16) #elif defined(DOUBLE) #define PREFETCH_SIZE (16 * 16) #else #define PREFETCH_SIZE (32 * 16) #endif #define SP r12 #ifdef XDOUBLE #define N r32 #define X1 r14 #define INCX r15 #else #define N r32 #define X1 r37 #define INCX r38 #endif #define X2 r16 #define Y1 r17 #define INCX3 r18 #define PRE r19 #define INCX8 r20 #define I r29 #define J r28 #define PR r30 #define ARLC r31 #define ALPHA_R f8 #define ALPHA_I f9 PROLOGUE .prologue PROFCODE {.mmi adds r22 = 16, SP adds r23 = 24, SP mov PR = pr } { .mib cmp.ge p7, p0 = 0, N shr I = N, 3 (p7) br.ret.sptk.many b0 } ;; #ifdef XDOUBLE { .mmi ld8 X1 = [r22] ld8 INCX = [r23] nop __LINE__ } ;; #endif { .mfi and J = 7, N fcmp.eq p0, p11 = ALPHA_I, f0 .save ar.lc, ARLC mov ARLC = ar.lc } { .mfi adds I = -1, I fcmp.eq p0, p10 = ALPHA_R, f0 shl INCX = INCX, ZBASE_SHIFT } ;; .body { .mmi shladd INCX8 = INCX, 3, r0 shladd X2 = INCX, 1, X1 mov pr.rot= 0 } { .mmi shladd INCX3 = INCX, 1, INCX adds PRE = PREFETCH_SIZE * SIZE, X1 mov Y1 = X1 } ;; { .mmi cmp.gt p8, p0 = 0, I cmp.ge p9, p0 = 0, J mov ar.lc = I } { .mmi adds INCX = -1 * SIZE, INCX adds INCX3 = -1 * SIZE, INCX3 tbit.z p0, p13 = N, 2 } ;; { .bbb (p10) br.cond.dptk .L100 (p11) br.cond.dptk .L100 (p8) br.cond.dpnt .L20 } ;; .align 32 .L10: { .mmb STFD [X1] = f0, 1 * SIZE STFD [X2] = f0, 1 * SIZE nop.b 0 } { .mmb lfetch.excl.nt1 [PRE], INCX8 nop.m 0 } ;; { .mmb STFD [X1] = f0 add X1 = INCX, X1 } { .mmb STFD [X2] = f0 add X2 = INCX, X2 } ;; { .mmb STFD [X1] = f0, 1 * SIZE STFD [X2] = f0, 1 * SIZE nop.b 0 } ;; { .mmb STFD [X1] = f0 add X1 = INCX3, X1 } { .mmb STFD [X2] = f0 add X2 = INCX3, X2 } ;; { .mmb STFD [X1] = f0, 1 * SIZE STFD [X2] = f0, 1 * SIZE nop.b 0 } ;; { .mmb STFD [X1] = f0 add X1 = INCX, X1 } { .mmb STFD [X2] = f0 add X2 = INCX, X2 } ;; { .mmb STFD [X1] = f0, 1 * SIZE STFD [X2] = f0, 1 * SIZE nop.b 0 } ;; { .mmb STFD [X1] = f0 add X1 = INCX3, X1 } { .mmb STFD [X2] = f0 add X2 = INCX3, X2 br.cloop.sptk.few .L10 } ;; .align 32 .L20: { .mmi (p13) STFD [X1] = f0, 1 * SIZE (p13) STFD [X2] = f0, 1 * SIZE mov ar.lc = ARLC } ;; { .mmi (p13) STFD [X1] = f0 (p13) add X1 = INCX, X1 tbit.z p0, p14 = N, 1 } { .mmi (p13) STFD [X2] = f0 (p13) add X2 = INCX, X2 tbit.z p0, p15 = N, 0 } ;; { .mmb (p13) STFD [X1] = f0, 1 * SIZE (p13) STFD [X2] = f0, 1 * SIZE nop.b 0 } { .mib nop.m 0 mov pr = PR, -65474 (p9) br.ret.sptk.many b0 } ;; { .mmb (p13) STFD [X1] = f0 (p13) add X1 = INCX3, X1 } { .mmb (p13) STFD [X2] = f0 (p13) add X2 = INCX3, X2 } ;; (p14) STFD [X1] = f0, 1 * SIZE ;; { .mmb (p14) STFD [X1] = f0 (p14) add X1 = INCX, X1 } 
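/* The predicated STFD-of-f0 stores around this point form the tail of the
   alpha == 0 fast path: when both ALPHA_R and ALPHA_I are zero the vector is
   simply overwritten with zeros, with p13/p14/p15 selecting the remaining
   4/2/1 complex elements.  A minimal C sketch of what this path computes,
   assuming interleaved re/im storage and an increment counted in complex
   elements (the names below are illustrative, not from this source):

       for (i = 0; i < n; i++) {
           x[2 * i * incx + 0] = 0.0;   /* real part      */
           x[2 * i * incx + 1] = 0.0;   /* imaginary part */
       }
*/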
;; (p14) STFD [X1] = f0, 1 * SIZE ;; { .mmb (p14) STFD [X1] = f0 (p14) add X1 = INCX, X1 } ;; (p15) STFD [X1] = f0, 1 * SIZE ;; { .mib (p15) STFD [X1] = f0 mov pr = PR, -65474 br.ret.sptk.many b0 } ;; .align 32 .L100: cmp.eq p16, p0 = r0, r0 mov.i ar.ec = 6 (p8) br.cond.dpnt .L170 ;; .align 32 .L160: { .mmf (p21) STFD [X1] = f6, 1 * SIZE (p16) lfetch.excl.nt1 [PRE], INCX8 (p21) FMS f12 = ALPHA_R, f85, f12 } { .mfb (p16) LDFD f32 = [Y1], 1 * SIZE (p20) FMPY f6 = ALPHA_I, f42 } ;; { .mmf (p21) STFD [X1] = f43 (p21) add X1 = INCX, X1 (p21) FMA f91 = ALPHA_I, f85, f91 } { .mfb (p16) LDFD f38 = [Y1], INCX (p20) FMPY f42 = ALPHA_R, f42 } ;; { .mmf (p21) STFD [X1] = f7, 1 * SIZE (p21) FMS f13 = ALPHA_R, f97, f13 } { .mfb (p16) LDFD f44 = [Y1], 1 * SIZE (p20) FMPY f7 = ALPHA_I, f54 } ;; { .mmf (p21) STFD [X1] = f55 (p21) add X1 = INCX, X1 (p21) FMA f103 = ALPHA_I, f97, f103 } { .mfb (p16) LDFD f50 = [Y1], INCX (p20) FMPY f54 = ALPHA_R, f54 } ;; { .mmf (p21) STFD [X1] = f10, 1 * SIZE (p21) FMS f14 = ALPHA_R, f109, f14 } { .mfb (p16) LDFD f56 = [Y1], 1 * SIZE (p20) FMPY f10 = ALPHA_I, f66 } ;; { .mmf (p21) STFD [X1] = f67 (p21) add X1 = INCX, X1 (p21) FMA f115 = ALPHA_I, f109, f115 } { .mfb (p16) LDFD f62 = [Y1], INCX (p20) FMPY f66 = ALPHA_R, f66 } ;; { .mmf (p21) STFD [X1] = f11, 1 * SIZE (p21) FMS f15 = ALPHA_R, f121, f15 } { .mfb (p16) LDFD f68 = [Y1], 1 * SIZE (p20) FMPY f11 = ALPHA_I, f78 } ;; { .mmf (p21) STFD [X1] = f79 (p21) add X1 = INCX, X1 (p21) FMA f127 = ALPHA_I, f121, f127 } { .mfb (p16) LDFD f74 = [Y1], INCX (p20) FMPY f78 = ALPHA_R, f78 } ;; { .mmf (p21) STFD [X1] = f12, 1 * SIZE (p20) FMS f6 = ALPHA_R, f36, f6 } { .mfb (p16) LDFD f80 = [Y1], 1 * SIZE (p20) FMPY f12 = ALPHA_I, f90 } ;; { .mmf (p21) STFD [X1] = f91 (p21) add X1 = INCX, X1 (p20) FMA f42 = ALPHA_I, f36, f42 } { .mfb (p16) LDFD f86 = [Y1], INCX (p20) FMPY f90 = ALPHA_R, f90 } ;; { .mmf (p21) STFD [X1] = f13, 1 * SIZE (p20) FMS f7 = ALPHA_R, f48, f7 } { .mfb (p16) LDFD f92 = [Y1], 1 * SIZE (p20) FMPY f13 = ALPHA_I, f102 } ;; { .mmf (p21) STFD [X1] = f103 (p21) add X1 = INCX, X1 (p20) FMA f54 = ALPHA_I, f48, f54 } { .mfb (p16) LDFD f98 = [Y1], INCX (p20) FMPY f102 = ALPHA_R, f102 } ;; { .mmf (p21) STFD [X1] = f14, 1 * SIZE (p20) FMS f10 = ALPHA_R, f60, f10 } { .mfb (p16) LDFD f104 = [Y1], 1 * SIZE (p20) FMPY f14 = ALPHA_I, f114 } ;; { .mmf (p21) STFD [X1] = f115 (p21) add X1 = INCX, X1 (p20) FMA f66 = ALPHA_I, f60, f66 } { .mfb (p16) LDFD f110 = [Y1], INCX (p20) FMPY f114 = ALPHA_R, f114 } ;; { .mmf (p21) STFD [X1] = f15, 1 * SIZE (p20) FMS f11 = ALPHA_R, f72, f11 } { .mfb (p16) LDFD f116 = [Y1], 1 * SIZE (p20) FMPY f15 = ALPHA_I, f126 } ;; { .mmf (p21) STFD [X1] = f127 (p21) add X1 = INCX, X1 (p20) FMA f78 = ALPHA_I, f72, f78 } { .mfb (p16) LDFD f122 = [Y1], INCX (p20) FMPY f126 = ALPHA_R, f126 br.ctop.sptk.few .L160 } ;; .align 16 .L170: { .mmi (p13) LDFD f48 = [Y1], 1 * SIZE mov ar.lc = ARLC } ;; { .mib (p13) LDFD f49 = [Y1], INCX mov pr = PR, -65474 (p9) br.ret.sptk.many b0 } ;; (p13) LDFD f50 = [Y1], 1 * SIZE tbit.z p0, p14 = N, 1 ;; (p13) LDFD f51 = [Y1], INCX tbit.z p0, p15 = N, 0 ;; (p13) LDFD f52 = [Y1], 1 * SIZE ;; (p13) LDFD f53 = [Y1], INCX ;; (p13) LDFD f54 = [Y1], 1 * SIZE (p13) FMPY f112 = ALPHA_I, f48 ;; (p13) LDFD f55 = [Y1], INCX (p13) FMPY f111 = ALPHA_I, f49 ;; (p14) LDFD f56 = [Y1], 1 * SIZE (p13) FMPY f114 = ALPHA_I, f50 ;; (p14) LDFD f57 = [Y1], INCX (p13) FMPY f113 = ALPHA_I, f51 ;; (p14) LDFD f58 = [Y1], 1 * SIZE (p13) FMPY f116 = ALPHA_I, f52 ;; (p14) LDFD f59 = [Y1], INCX (p13) FMPY f115 = 
ALPHA_I, f53 ;; (p15) LDFD f60 = [Y1], 1 * SIZE (p13) FMPY f118 = ALPHA_I, f54 ;; (p15) LDFD f61 = [Y1], INCX (p13) FMPY f117 = ALPHA_I, f55 ;; (p14) FMPY f120 = ALPHA_I, f56 (p14) FMPY f119 = ALPHA_I, f57 (p14) FMPY f122 = ALPHA_I, f58 (p14) FMPY f121 = ALPHA_I, f59 (p15) FMPY f124 = ALPHA_I, f60 (p15) FMPY f123 = ALPHA_I, f61 ;; (p13) FMS f48 = ALPHA_R, f48, f111 (p13) FMA f49 = ALPHA_R, f49, f112 (p13) FMS f50 = ALPHA_R, f50, f113 (p13) FMA f51 = ALPHA_R, f51, f114 ;; (p13) STFD [X1] = f48, 1 * SIZE (p13) FMS f52 = ALPHA_R, f52, f115 ;; (p13) STFD [X1] = f49 (p13) add X1 = INCX, X1 (p13) FMA f53 = ALPHA_R, f53, f116 ;; (p13) STFD [X1] = f50, 1 * SIZE (p13) FMS f54 = ALPHA_R, f54, f117 ;; (p13) STFD [X1] = f51 (p13) add X1 = INCX, X1 (p13) FMA f55 = ALPHA_R, f55, f118 ;; (p13) STFD [X1] = f52, 1 * SIZE (p14) FMS f56 = ALPHA_R, f56, f119 ;; (p13) STFD [X1] = f53 (p13) add X1 = INCX, X1 (p14) FMA f57 = ALPHA_R, f57, f120 ;; (p13) STFD [X1] = f54, 1 * SIZE (p14) FMS f58 = ALPHA_R, f58, f121 ;; (p13) STFD [X1] = f55 (p13) add X1 = INCX, X1 (p14) FMA f59 = ALPHA_R, f59, f122 ;; (p14) STFD [X1] = f56, 1 * SIZE (p15) FMS f60 = ALPHA_R, f60, f123 ;; (p14) STFD [X1] = f57 (p14) add X1 = INCX, X1 (p15) FMA f61 = ALPHA_R, f61, f124 ;; (p14) STFD [X1] = f58, 1 * SIZE ;; (p14) STFD [X1] = f59 (p14) add X1 = INCX, X1 ;; (p15) STFD [X1] = f60, 1 * SIZE ;; (p15) STFD [X1] = f61 mov pr = PR, -65474 br.ret.sptk.many b0 EPILOGUE OpenBLAS-0.2.20/kernel/ia64/zswap.S000066400000000000000000000230211313527062700164550ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef XDOUBLE #define PREFETCH_SIZE ( 8 * 16) #elif defined(DOUBLE) #define PREFETCH_SIZE (16 * 16) #else #define PREFETCH_SIZE (32 * 16) #endif #define SP r12 #ifdef XDOUBLE #define N r32 #define X r14 #define INCX r15 #define Y r16 #define INCY r17 #else #define N r32 #define X r37 #define INCX r38 #define Y r39 #define INCY r36 #endif #define PRE1 r2 #define PRE2 r3 #define I r18 #define J r19 #define YY r20 #define XX r21 #define INCXM1 r22 #define INCYM1 r23 #define INCX8 r24 #define INCY8 r25 #define PR r30 #define ARLC r31 PROLOGUE .prologue PROFCODE { .mmi adds r14 = 16, SP adds r15 = 24, SP adds r16 = 32, SP } { .mmb adds r17 = 40, SP cmp.gt p15, p0 = r0, N (p15) br.ret.sptk.many b0 } ;; #ifdef XDOUBLE { .mmi ld8 X = [r14] ld8 INCX = [r15] nop __LINE__ } { .mmi ld8 Y = [r16] ld8 INCY = [r17] nop __LINE__ } ;; #else { .mmi ld8 INCY = [r14] nop __LINE__ nop __LINE__ } ;; #endif { .mii .save ar.lc, ARLC mov ARLC = ar.lc shl INCX = INCX, ZBASE_SHIFT } ;; .body { .mii and J = 7, N mov PR = pr shl INCY = INCY, ZBASE_SHIFT } ;; { .mmi mov XX = X mov YY = Y shr I = N, 3 } ;; { .mmi adds I = -1, I cmp.eq p9, p0 = r0, J mov pr.rot = 0 } ;; { .mmi shladd INCX8 = INCX, 3, r0 shladd INCY8 = INCY, 3, r0 mov ar.ec= 3 } { .mmi adds INCXM1 = -SIZE, INCX adds INCYM1 = -SIZE, INCY cmp.eq p16, p0 = r0, r0 } ;; { .mmi adds PRE1 = PREFETCH_SIZE * SIZE, X adds PRE2 = PREFETCH_SIZE * SIZE, Y mov ar.lc = I } { .mib cmp.eq p8 ,p0 = -1, I tbit.z p0, p12 = J, 2 (p8) br.cond.dpnt .L55 } ;; .align 32 .L52: { .mmi (p18) STFD [XX] = f37, 1 * SIZE (p18) STFD [YY] = f34, 1 * SIZE } { .mmi (p16) LDFD f32 = [X], 1 * SIZE (p16) LDFD f35 = [Y], 1 * SIZE } ;; { .mmi (p18) STFD [XX] = f43 (p18) STFD [YY] = f40 (p18) add XX = XX, INCXM1 } { .mmi (p16) LDFD f38 = [X], INCXM1 (p16) LDFD f41 = [Y], INCYM1 (p18) add YY = YY, INCYM1 } ;; { .mmi (p18) STFD [XX] = f49, 1 * SIZE (p18) STFD [YY] = f46, 1 * SIZE } { .mmi (p16) LDFD f44 = [X], 1 * SIZE (p16) LDFD f47 = [Y], 1 * SIZE } ;; { .mmi (p18) STFD [XX] = f55 (p18) STFD [YY] = f52 (p18) add XX = XX, INCXM1 } { .mmi (p16) LDFD f50 = [X], INCXM1 (p16) LDFD f53 = [Y], INCYM1 (p18) add YY = YY, INCYM1 } ;; { .mmi (p18) STFD [XX] = f61, 1 * SIZE (p18) STFD [YY] = f58, 1 * SIZE } { .mmi (p16) LDFD f56 = [X], 1 * SIZE (p16) LDFD f59 = [Y], 1 * SIZE } ;; { .mmi (p18) STFD [XX] = f67 (p18) STFD [YY] = f64 (p18) add XX = XX, INCXM1 } { .mmi (p16) LDFD f62 = [X], INCXM1 (p16) LDFD f65 = [Y], INCYM1 (p18) add YY = YY, INCYM1 } ;; { .mmi (p18) STFD [XX] = f73, 1 * SIZE (p18) STFD [YY] = f70, 1 * SIZE } { .mmi (p16) LDFD f68 = [X], 1 * SIZE (p16) LDFD f71 = [Y], 1 * SIZE } ;; { .mmi (p18) STFD [XX] = f79 (p18) STFD [YY] = f76 (p18) add XX = XX, INCXM1 } { .mmi (p16) LDFD f74 = [X], INCXM1 (p16) LDFD f77 = [Y], INCYM1 (p18) add YY = YY, INCYM1 } ;; { .mmi (p18) STFD [XX] = f85, 1 * SIZE (p18) STFD [YY] = f82, 1 * SIZE } { .mmi (p16) LDFD f80 = [X], 1 * SIZE (p16) LDFD f83 = [Y], 1 * SIZE } ;; { .mmi (p18) STFD [XX] = f91 (p18) STFD [YY] = f88 (p18) add XX = XX, INCXM1 } { .mmi (p16) LDFD f86 = [X], INCXM1 (p16) LDFD f89 = [Y], INCYM1 (p18) add YY = YY, INCYM1 } ;; { .mmi (p18) STFD [XX] = f97, 1 * SIZE (p18) STFD [YY] = f94, 1 * SIZE } { .mmi (p16) LDFD f92 = [X], 1 * SIZE (p16) LDFD f95 = [Y], 1 * SIZE } ;; { .mmi (p18) STFD [XX] = f103 (p18) STFD [YY] = f100 (p18) add XX = XX, INCXM1 } { .mmi (p16) LDFD f98 = [X], INCXM1 (p16) LDFD f101 = [Y], INCYM1 (p18) add YY = YY, INCYM1 
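/* Software-pipelined swap loop (.L52): with ar.ec = 3 the rotating registers
   keep two pipeline stages in flight, so a value loaded from [X] at stage p16
   (e.g. f32) is stored two stages later at p18 through the trailing pointer
   YY (as f34), while the corresponding value loaded from [Y] is stored back
   through XX -- eight complex elements are exchanged per iteration. */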
} ;; { .mmi (p18) STFD [XX] = f109, 1 * SIZE (p18) STFD [YY] = f106, 1 * SIZE } { .mmi (p16) LDFD f104 = [X], 1 * SIZE (p16) LDFD f107 = [Y], 1 * SIZE } ;; { .mmi (p18) STFD [XX] = f115 (p18) STFD [YY] = f112 (p18) add XX = XX, INCXM1 } { .mmi (p16) LDFD f110 = [X], INCXM1 (p16) LDFD f113 = [Y], INCYM1 (p18) add YY = YY, INCYM1 } ;; { .mmi (p18) STFD [XX] = f121, 1 * SIZE (p18) STFD [YY] = f118, 1 * SIZE } { .mmi (p16) LDFD f116 = [X], 1 * SIZE (p16) LDFD f119 = [Y], 1 * SIZE } ;; { .mmi (p18) STFD [XX] = f127 (p18) STFD [YY] = f124 (p18) add XX = XX, INCXM1 } { .mmi (p16) LDFD f122 = [X], INCXM1 (p16) LDFD f125 = [Y], INCYM1 (p18) add YY = YY, INCYM1 } { .mmb (p16) lfetch.excl.nt1 [PRE1], INCX8 (p16) lfetch.excl.nt1 [PRE2], INCY8 br.ctop.sptk.few .L52 } ;; .align 32 .L55: { .mmi (p12) LDFD f32 = [X], 1 * SIZE (p12) LDFD f80 = [Y], 1 * SIZE mov ar.lc = ARLC } ;; { .mmi (p12) LDFD f33 = [X], INCXM1 (p12) LDFD f81 = [Y], INCYM1 mov pr = PR, -65474 } ;; { .mmb (p12) LDFD f34 = [X], 1 * SIZE (p12) LDFD f82 = [Y], 1 * SIZE (p9) br.ret.sptk.many b0 } ;; { .mmi (p12) LDFD f35 = [X], INCXM1 (p12) LDFD f83 = [Y], INCYM1 tbit.z p0, p13 = N, 1 } ;; { .mmi (p12) LDFD f36 = [X], 1 * SIZE (p12) LDFD f84 = [Y], 1 * SIZE tbit.z p0, p14 = N, 0 } ;; { .mmi (p12) LDFD f37 = [X], INCXM1 (p12) LDFD f85 = [Y], INCYM1 } ;; { .mmi (p12) STFD [XX] = f80, 1 * SIZE (p12) STFD [YY] = f32, 1 * SIZE } { .mmi (p12) LDFD f38 = [X], 1 * SIZE (p12) LDFD f86 = [Y], 1 * SIZE } ;; { .mmi (p12) STFD [XX] = f81 (p12) STFD [YY] = f33 (p12) add XX = XX, INCXM1 } { .mmi (p12) LDFD f39 = [X], INCXM1 (p12) LDFD f87 = [Y], INCYM1 (p12) add YY = YY, INCYM1 } ;; { .mmi (p12) STFD [XX] = f82, 1 * SIZE (p12) STFD [YY] = f34, 1 * SIZE } { .mmi (p13) LDFD f40 = [X], 1 * SIZE (p13) LDFD f88 = [Y], 1 * SIZE } ;; { .mmi (p12) STFD [XX] = f83 (p12) STFD [YY] = f35 (p12) add XX = XX, INCXM1 } { .mmi (p13) LDFD f41 = [X], INCXM1 (p13) LDFD f89 = [Y], INCYM1 (p12) add YY = YY, INCYM1 } ;; { .mmi (p12) STFD [XX] = f84, 1 * SIZE (p12) STFD [YY] = f36, 1 * SIZE } { .mmi (p13) LDFD f42 = [X], 1 * SIZE (p13) LDFD f90 = [Y], 1 * SIZE } ;; { .mmi (p12) STFD [XX] = f85 (p12) STFD [YY] = f37 (p12) add XX = XX, INCXM1 } { .mmi (p13) LDFD f43 = [X], INCXM1 (p13) LDFD f91 = [Y], INCYM1 (p12) add YY = YY, INCYM1 } ;; { .mmi (p12) STFD [XX] = f86, 1 * SIZE (p12) STFD [YY] = f38, 1 * SIZE } { .mmi (p14) LDFD f44 = [X], 1 * SIZE (p14) LDFD f92 = [Y], 1 * SIZE } ;; { .mmi (p12) STFD [XX] = f87 (p12) STFD [YY] = f39 (p12) add XX = XX, INCXM1 } { .mmi (p14) LDFD f45 = [X] (p14) LDFD f93 = [Y] (p12) add YY = YY, INCYM1 } ;; { .mmi (p13) STFD [XX] = f88, 1 * SIZE (p13) STFD [YY] = f40, 1 * SIZE } ;; (p13) STFD [XX] = f89 (p13) add XX = XX, INCXM1 (p13) STFD [YY] = f41 (p13) add YY = YY, INCYM1 ;; (p13) STFD [XX] = f90, 1 * SIZE (p13) STFD [YY] = f42, 1 * SIZE ;; (p13) STFD [XX] = f91 (p13) add XX = XX, INCXM1 (p13) STFD [YY] = f43 (p13) add YY = YY, INCYM1 ;; (p14) STFD [XX] = f92, 1 * SIZE (p14) STFD [YY] = f44, 1 * SIZE ;; (p14) STFD [XX] = f93 (p14) STFD [YY] = f45 br.ret.sptk.many b0 ;; EPILOGUE OpenBLAS-0.2.20/kernel/ia64/ztrsm_kernel_LN.S000066400000000000000000005163761313527062700204450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef DOUBLE #define PREFETCHSIZE (16 * 8) #else #define PREFETCHSIZE (32 * 8) #endif #ifndef LN #define CPREFETCHSIZE 7 #else #define CPREFETCHSIZE -8 #endif #define CPREFETCH lfetch.excl.nt1 #define M r32 #define N r33 #define K r34 #define A r37 #define B r38 #define C r39 #define LDC r35 #define I r15 #define J r16 #define AOFFSET r17 #define BOFFSET r18 #define TEMP r19 #define L r20 #define C1 r21 #define C2 r22 #define C3 r23 #define C4 r24 #define C5 r25 #define C6 r26 #define C7 r27 #define C8 r28 #define PREA r8 #define PREB r9 #define PREC r10 #define SP r12 #define ARLC r29 #define PR r30 #define ARPFS r31 #define ALPHA_R f8 #define ALPHA_I f9 #define AORIG loc0 #define KK loc1 #define KK8 loc2 #define OFFSET loc3 #define AOFFSET2 loc4 #define BOFFSET2 loc5 #ifndef CONJ #define FCALC_A FSUB #define FCALC_B FADD #define FMA_A FNMA #define FMA_B FMA #else #define FCALC_A FADD #define FCALC_B FSUB #define FMA_A FMA #define FMA_B FNMA #endif #ifndef CONJ #define FCALC_C FMA #define FCALC_D FNMA #else #define FCALC_C FNMA #define FCALC_D FMA #endif #ifndef CONJ #define FMA_C FNMA #define FMA_D FMA #define FSUB_A FSUB #else #define FMA_C FMA #define FMA_D FMS #define FSUB_A FADD #endif PROLOGUE .prologue PROFCODE { .mfi .save ar.pfs, ARPFS alloc ARPFS = ar.pfs, 8, 8, 0, 0 mov f64 = f0 adds r14 = 16, SP } { .mfi nop __LINE__ mov f65 = f0 adds r15 = 24, SP } ;; { .mfi ld8 LDC = [r14] mov f81 = f0 mov PR = pr } { .mfi ld8 OFFSET = [r15] mov f96 = f0 shr J = N, 2 } ;; { .mfi shladd LDC = LDC, ZBASE_SHIFT, r0 mov f97 = f0 } { .mfi nop __LINE__ mov f113 = f0 } ;; #ifdef LN { .mmi setf.sig f32 = M setf.sig f33 = K shladd C = M, ZBASE_SHIFT, C } ;; {.mmf nop __LINE__ nop __LINE__ xmpy.l f32 = f32, f33 } ;; { .mmi getf.sig r2 = f32 ;; nop __LINE__ shladd A = r2, ZBASE_SHIFT, A } ;; #endif #ifdef RN sub KK = r0, OFFSET #endif #ifdef RT { .mmi setf.sig f32 = N setf.sig f33 = K nop __LINE__ } ;; { .mmi setf.sig f34 = LDC nop 
__LINE__ nop __LINE__ } ;; { .mmf nop __LINE__ nop __LINE__ xmpy.l f33 = f32, f33 } { .mmf nop __LINE__ sub KK = N, OFFSET xmpy.l f34 = f32, f34 } ;; { .mmi getf.sig r2 = f33 getf.sig r3 = f34 } ;; shladd B = r2, ZBASE_SHIFT, B add C = r3, C #endif ;; .body { .mfi nop __LINE__ mov f80 = f0 mov ARLC = ar.lc } { .mfb cmp.ge p6, p0 = 0, J mov f112 = f0 (p6) br.cond.dpnt .L050 } ;; .align 16 .L010: #ifdef RT { .mmi shladd r3 = LDC, 2, r0 nop __LINE__ shl r2 = K, 2 + ZBASE_SHIFT } ;; { .mmi sub B = B, r2 sub C = C, r3 nop __LINE__ } ;; #endif { .mmi mov C1 = C // coffset1 = c + 0 * ldc add C2 = LDC, C // coffset2 = c + 1 * ldc } { .mmi adds J = -1, J #ifdef LN add KK = M, OFFSET #elif defined LT mov KK = OFFSET #else nop __LINE__ #endif #if defined(LN) || defined(RT) mov AORIG = A #else mov AOFFSET = A #endif } ;; { .mmi shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } { .mib #ifndef RT shladd C = LDC, 2, C // coffset += 8 * ldc #else nop __LINE__ #endif tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L020 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f72 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f73 = f0 } ;; #else { .mfi shladd BOFFSET = r3, 2, B mov f72 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f73 = f0 add AOFFSET = r3, AORIG } ;; #endif ;; adds L = 1, L ;; { .mmi nop __LINE__ adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET tbit.z p12, p0 = L, 0 } ;; { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f88 = f0 shr L = L, 1 } { .mfi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f89 = f0 nop __LINE__ } ;; { .mfi (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE mov f104 = f0 adds L = -1, L } { .mfb adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET mov f105 = f0 nop __LINE__ } ;; { .mfi (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f120 = f0 mov ar.lc = L } { .mfi cmp.eq p3, p0 = r0, r0 mov f121 = f0 nop __LINE__ } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L038 ;; .align 16 .L032: { .mfb lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f97 = f32, f53, f97 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f32, f55, f113 // A1 * B8 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f96 = f33, f53, f96 // A2 * B6 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = 
[BOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f112 = f33, f55, f112 // A2 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f113 = f41, f62, f113 // A2 * B7 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 br.cloop.sptk.few .L032 } ;; .L038: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -1, KK #else adds r2 = -4, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 2, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET], 2 * SIZE ;; LDFPD f104, f105 = [BOFFSET], 2 * SIZE ;; LDFPD f120, f121 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 FSUB f80 = f88, f80 FSUB_A f81 = f89, f81 FSUB f96 = f104, f96 FSUB_A f97 = f105, f97 FSUB f112 = f120, f112 FSUB_A f113 = f121, f113 ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET], 2 * SIZE ;; LDFPD f104, f105 = [AOFFSET], 2 * SIZE ;; LDFPD f120, f121 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 FSUB f80 = f88, f80 FSUB f81 = f89, f81 FSUB f96 = f104, f96 FSUB f97 = f105, f97 FSUB f112 = f120, f112 FSUB f113 = f121, f113 ;; #endif #ifdef LN LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f80 FMPY f35 = f121, f80 FMPY f36 = f120, f96 FMPY f37 = f121, f96 FMPY f38 = f120, f112 FMPY f39 = f121, f112 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f80 = f121, f81, f34 FMA_D f81 = f120, f81, f35 FMA_C f96 = f121, f97, f36 FMA_D f97 = f120, f97, f37 FMA_C f112 = f121, f113, f38 FMA_D f113 = f120, f113, f39 ;; #endif #ifdef LT LDFPD f90, f91 = [AOFFSET] ;; FMPY f32 = f90, f64 FMPY f33 = f91, f64 FMPY f34 = f90, f80 FMPY f35 = f91, f80 FMPY f36 = f90, f96 FMPY f37 = f91, f96 FMPY f38 = f90, f112 FMPY f39 = f91, f112 ;; FMA_C f64 = f91, f65, f32 FMA_D f65 = f90, f65, f33 FMA_C f80 = f91, f81, f34 FMA_D f81 = f90, f81, f35 FMA_C f96 = f91, f97, f36 FMA_D f97 = f90, f97, f37 FMA_C f112 = f91, f113, f38 FMA_D f113 = f90, f113, f39 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [BOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; 
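/* RN branch of the .L038 tail solve (one row of C against four columns):
   the 4x4 complex triangular block of B is loaded row by row, each row one
   complex element shorter than the last (f72-f79, then f90-f95, f108-f111,
   f126-f127), and the four results f64/f65 .. f112/f113 are obtained by
   forward substitution using complex multiply/subtract (FMPY, FMA_C/FMA_D,
   FNMA).  The diagonal entries are used multiplicatively here, which assumes
   they were stored pre-inverted by the packing stage, as is the usual
   OpenBLAS trsm kernel convention (an assumption; not verifiable from this
   file alone). */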
LDFPD f90, f91 = [BOFFSET], 2 * SIZE ;; LDFPD f92, f93 = [BOFFSET], 2 * SIZE ;; LDFPD f94, f95 = [BOFFSET] adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f108, f109 = [BOFFSET], 2 * SIZE ;; LDFPD f110, f111 = [BOFFSET] adds BOFFSET = 8 * SIZE, BOFFSET ;; LDFPD f126, f127 = [BOFFSET] adds BOFFSET = - 30 * SIZE, BOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; FNMA f80 = f74, f64, f80 FMA_A f81 = f75, f64, f81 ;; FMA_B f80 = f75, f65, f80 FNMA f81 = f74, f65, f81 ;; FNMA f96 = f76, f64, f96 FMA_A f97 = f77, f64, f97 ;; FMA_B f96 = f77, f65, f96 FNMA f97 = f76, f65, f97 ;; FNMA f112 = f78, f64, f112 FMA_A f113 = f79, f64, f113 ;; FMA_B f112 = f79, f65, f112 FNMA f113 = f78, f65, f113 ;; FMPY f32 = f90, f80 FMPY f33 = f91, f80 ;; FMA_C f80 = f91, f81, f32 FMA_D f81 = f90, f81, f33 ;; FNMA f96 = f92, f80, f96 FMA_A f97 = f93, f80, f97 ;; FMA_B f96 = f93, f81, f96 FNMA f97 = f92, f81, f97 ;; FNMA f112 = f94, f80, f112 FMA_A f113 = f95, f80, f113 ;; FMA_B f112 = f95, f81, f112 FNMA f113 = f94, f81, f113 ;; FMPY f32 = f108, f96 FMPY f33 = f109, f96 ;; FMA_C f96 = f109, f97, f32 FMA_D f97 = f108, f97, f33 ;; FNMA f112 = f110, f96, f112 FMA_A f113 = f111, f96, f113 ;; FMA_B f112 = f111, f97, f112 FNMA f113 = f110, f97, f113 ;; FMPY f32 = f126, f112 FMPY f33 = f127, f112 ;; FMA_C f112 = f127, f113, f32 FMA_D f113 = f126, f113, f33 ;; #endif #ifdef RT adds BOFFSET = 30 * SIZE, BOFFSET ;; LDFPD f72, f73 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f74, f75 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f76, f77 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f78, f79 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f88, f89 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f92, f93 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFPD f104, f105 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f106, f107 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFPD f120, f121 = [BOFFSET] ;; FMPY f32 = f72, f112 FMPY f33 = f73, f112 ;; FMA_C f112 = f73, f113, f32 FMA_D f113 = f72, f113, f33 ;; FNMA f96 = f74, f112, f96 FMA_A f97 = f75, f112, f97 ;; FMA_B f96 = f75, f113, f96 FNMA f97 = f74, f113, f97 ;; FNMA f80 = f76, f112, f80 FMA_A f81 = f77, f112, f81 ;; FMA_B f80 = f77, f113, f80 FNMA f81 = f76, f113, f81 ;; FNMA f64 = f78, f112, f64 FMA_A f65 = f79, f112, f65 ;; FMA_B f64 = f79, f113, f64 FNMA f65 = f78, f113, f65 ;; FMPY f32 = f88, f96 FMPY f33 = f89, f96 ;; FMA_C f96 = f89, f97, f32 FMA_D f97 = f88, f97, f33 ;; FNMA f80 = f90, f96, f80 FMA_A f81 = f91, f96, f81 ;; FMA_B f80 = f91, f97, f80 FNMA f81 = f90, f97, f81 ;; FNMA f64 = f92, f96, f64 FMA_A f65 = f93, f96, f65 ;; FMA_B f64 = f93, f97, f64 FNMA f65 = f92, f97, f65 ;; FMPY f32 = f104, f80 FMPY f33 = f105, f80 ;; FMA_C f80 = f105, f81, f32 FMA_D f81 = f104, f81, f33 ;; FNMA f64 = f106, f80, f64 FMA_A f65 = f107, f80, f65 ;; FMA_B f64 = f107, f81, f64 FNMA f65 = f106, f81, f65 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 ;; #endif #if defined(LN) || defined(LT) adds BOFFSET2 = 4 * SIZE, BOFFSET ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f97, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE ;; STFD [BOFFSET] = f81, 5 * SIZE STFD [BOFFSET2] = f113, 5 * SIZE ;; adds BOFFSET = - 8 * SIZE, BOFFSET ;; #else adds AOFFSET2 = 4 * SIZE, AOFFSET ;; STFD 
[AOFFSET] = f64, SIZE STFD [AOFFSET2] = f96, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f97, SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f81, 5 * SIZE STFD [AOFFSET2] = f113, 5 * SIZE ;; adds AOFFSET = - 8 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -2 * SIZE, C1 adds C2 = -2 * SIZE, C2 adds C3 = -2 * SIZE, C3 adds C4 = -2 * SIZE, C4 #endif ;; STFD [C1 ] = f64, SIZE ;; STFD [C1 ] = f65, SIZE ;; STFD [C2 ] = f80, SIZE ;; STFD [C2 ] = f81, SIZE ;; STFD [C3 ] = f96, SIZE ;; STFD [C3 ] = f97, SIZE ;; STFD [C4 ] = f112, SIZE ;; STFD [C4 ] = f113, SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;; #ifdef LN adds C1 = -2 * SIZE, C1 adds C2 = -2 * SIZE, C2 adds C3 = -2 * SIZE, C3 adds C4 = -2 * SIZE, C4 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT add AORIG = r2, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; add AOFFSET = L, AOFFSET shladd BOFFSET = L, 2, BOFFSET #endif ;; #ifdef LT adds KK = 1, KK #elif defined LN adds KK = -1, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; .align 16 .L020: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L010x } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 1 + ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f67 = f0 } ;; #else { .mfi shladd BOFFSET = r3, 2, B mov f66 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 shladd AOFFSET = r3, 1, AORIG } ;; #endif ;; adds L = 1, L ;; { .mfi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f82 = f0 tbit.z p12, p0 = L, 0 } { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 shr L = L, 1 } ;; { .mfi (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f98 = f0 adds L = -1, L } { .mfi (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE mov f99 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f114 = f0 mov ar.lc = L } { .mfi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET mov f115 = f0 nop __LINE__ } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L028 ;; .align 16 .L022: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f97 = f32, f53, f97 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f32, f55, f113 // A1 * B8 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 
= f33, f51, f80 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f96 = f33, f53, f96 // A2 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f112 = f33, f55, f112 // A2 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f67 = f34, f49, f67 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f83 = f34, f51, f83 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f99 = f34, f53, f99 // A3 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f115 = f34, f55, f115 // A3 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f66 = f35, f49, f66 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f82 = f35, f51, f82 // A4 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f98 = f35, f53, f98 // A4 * B6 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f114 = f35, f55, f114 // A4 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f98 = f42, f60, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f115 = f42, f63, f115 
// A3 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f115 = f43, f62, f115 // A4 * B7 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 br.cloop.sptk.few .L022 } ;; .L028: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -2, KK #else adds r2 = -4, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 2, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [BOFFSET], 2 * SIZE ;; LDFPD f104, f105 = [BOFFSET], 2 * SIZE ;; LDFPD f106, f107 = [BOFFSET], 2 * SIZE ;; { .mfi LDFPD f120, f121 = [BOFFSET], 2 * SIZE FSUB f64 = f72, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f65 = f73, f65 nop __LINE__ } ;; { .mfi LDFPD f122, f123 = [BOFFSET] FSUB f80 = f74, f80 adds BOFFSET = -14 * SIZE, BOFFSET } { .mfi nop __LINE__ FSUB_A f81 = f75, f81 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f96 = f88, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f97 = f89, f97 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f112 = f90, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f113 = f91, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f66 = f104, f66 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f67 = f105, f67 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f82 = f106, f82 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f83 = f107, f83 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f98 = f120, f98 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f99 = f121, f99 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f114 = f122, f114 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f115 = f123, f115 nop __LINE__ } ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [AOFFSET], 2 * SIZE ;; LDFPD f104, f105 = [AOFFSET], 2 * SIZE ;; LDFPD f106, f107 = [AOFFSET], 2 * SIZE ;; { .mfi LDFPD f120, f121 = [AOFFSET], 2 * SIZE FSUB f64 = f72, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB f65 = f73, f65 nop __LINE__ } ;; { .mfi LDFPD f122, f123 = [AOFFSET] FSUB f66 = f74, f66 adds AOFFSET = -14 * SIZE, AOFFSET } { .mfi nop __LINE__ FSUB f67 = f75, f67 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f80 = f88, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f81 = f89, f81 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f82 = f90, f82 nop __LINE__ } { .mfi nop __LINE__ FSUB f83 = f91, f83 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f96 = f104, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f97 = f105, f97 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f98 = f106, f98 nop __LINE__ } { .mfi nop __LINE__ FSUB f99 = f107, f99 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f112 = f120, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f113 = f121, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f114 = f122, f114 nop __LINE__ } { .mfi nop __LINE__ FSUB f115 = f123, f115 nop __LINE__ } ;; #endif #ifdef LN adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f104, f105 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f106, f107 = [AOFFSET] adds AOFFSET = - 4 * SIZE, 
AOFFSET ;; LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f104, f66 FMPY f33 = f105, f66 FMPY f34 = f104, f82 FMPY f35 = f105, f82 FMPY f36 = f104, f98 FMPY f37 = f105, f98 FMPY f38 = f104, f114 FMPY f39 = f105, f114 ;; FMA_C f66 = f105, f67, f32 FMA_D f67 = f104, f67, f33 FMA_C f82 = f105, f83, f34 FMA_D f83 = f104, f83, f35 FMA_C f98 = f105, f99, f36 FMA_D f99 = f104, f99, f37 FMA_C f114 = f105, f115, f38 FMA_D f115 = f104, f115, f39 ;; FNMA f64 = f106, f66, f64 FMA_A f65 = f107, f66, f65 FNMA f80 = f106, f82, f80 FMA_A f81 = f107, f82, f81 FNMA f96 = f106, f98, f96 FMA_A f97 = f107, f98, f97 FNMA f112 = f106, f114, f112 FMA_A f113 = f107, f114, f113 ;; FMA_B f64 = f107, f67, f64 FNMA f65 = f106, f67, f65 FMA_B f80 = f107, f83, f80 FNMA f81 = f106, f83, f81 FMA_B f96 = f107, f99, f96 FNMA f97 = f106, f99, f97 FMA_B f112 = f107, f115, f112 FNMA f113 = f106, f115, f113 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f80 FMPY f35 = f121, f80 FMPY f36 = f120, f96 FMPY f37 = f121, f96 FMPY f38 = f120, f112 FMPY f39 = f121, f112 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f80 = f121, f81, f34 FMA_D f81 = f120, f81, f35 FMA_C f96 = f121, f97, f36 FMA_D f97 = f120, f97, f37 FMA_C f112 = f121, f113, f38 FMA_D f113 = f120, f113, f39 ;; #endif #ifdef LT LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f80 FMPY f35 = f73, f80 FMPY f36 = f72, f96 FMPY f37 = f73, f96 FMPY f38 = f72, f112 FMPY f39 = f73, f112 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f80 = f73, f81, f34 FMA_D f81 = f72, f81, f35 FMA_C f96 = f73, f97, f36 FMA_D f97 = f72, f97, f37 FMA_C f112 = f73, f113, f38 FMA_D f113 = f72, f113, f39 ;; FNMA f66 = f74, f64, f66 FMA_A f67 = f75, f64, f67 FNMA f82 = f74, f80, f82 FMA_A f83 = f75, f80, f83 FNMA f98 = f74, f96, f98 FMA_A f99 = f75, f96, f99 FNMA f114 = f74, f112, f114 FMA_A f115 = f75, f112, f115 ;; FMA_B f66 = f75, f65, f66 FNMA f67 = f74, f65, f67 FMA_B f82 = f75, f81, f82 FNMA f83 = f74, f81, f83 FMA_B f98 = f75, f97, f98 FNMA f99 = f74, f97, f99 FMA_B f114 = f75, f113, f114 FNMA f115 = f74, f113, f115 ;; FMPY f32 = f90, f66 FMPY f33 = f91, f66 FMPY f34 = f90, f82 FMPY f35 = f91, f82 FMPY f36 = f90, f98 FMPY f37 = f91, f98 FMPY f38 = f90, f114 FMPY f39 = f91, f114 ;; FMA_C f66 = f91, f67, f32 FMA_D f67 = f90, f67, f33 FMA_C f82 = f91, f83, f34 FMA_D f83 = f90, f83, f35 FMA_C f98 = f91, f99, f36 FMA_D f99 = f90, f99, f37 FMA_C f114 = f91, f115, f38 FMA_D f115 = f90, f115, f39 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [BOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET], 2 * SIZE ;; LDFPD f92, f93 = [BOFFSET], 2 * SIZE ;; LDFPD f94, f95 = [BOFFSET] adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f108, f109 = [BOFFSET], 2 * SIZE ;; LDFPD f110, f111 = [BOFFSET] adds BOFFSET = 8 * SIZE, BOFFSET ;; LDFPD f126, f127 = [BOFFSET] adds BOFFSET = - 30 * SIZE, BOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f66 FMPY f35 = f73, f66 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f66 = f73, f67, f34 FMA_D f67 = f72, f67, f35 ;; FNMA f80 = f74, f64, f80 FMA_A f81 = f75, f64, f81 FNMA f82 = f74, f66, f82 FMA_A f83 = f75, f66, f83 ;; FMA_B f80 = f75, f65, f80 FNMA f81 = f74, f65, f81 FMA_B f82 = f75, f67, f82 FNMA f83 = f74, 
f67, f83 ;; FNMA f96 = f76, f64, f96 FMA_A f97 = f77, f64, f97 FNMA f98 = f76, f66, f98 FMA_A f99 = f77, f66, f99 ;; FMA_B f96 = f77, f65, f96 FNMA f97 = f76, f65, f97 FMA_B f98 = f77, f67, f98 FNMA f99 = f76, f67, f99 ;; FNMA f112 = f78, f64, f112 FMA_A f113 = f79, f64, f113 FNMA f114 = f78, f66, f114 FMA_A f115 = f79, f66, f115 ;; FMA_B f112 = f79, f65, f112 FNMA f113 = f78, f65, f113 FMA_B f114 = f79, f67, f114 FNMA f115 = f78, f67, f115 ;; FMPY f32 = f90, f80 FMPY f33 = f91, f80 FMPY f34 = f90, f82 FMPY f35 = f91, f82 ;; FMA_C f80 = f91, f81, f32 FMA_D f81 = f90, f81, f33 FMA_C f82 = f91, f83, f34 FMA_D f83 = f90, f83, f35 ;; FNMA f96 = f92, f80, f96 FMA_A f97 = f93, f80, f97 FNMA f98 = f92, f82, f98 FMA_A f99 = f93, f82, f99 ;; FMA_B f96 = f93, f81, f96 FNMA f97 = f92, f81, f97 FMA_B f98 = f93, f83, f98 FNMA f99 = f92, f83, f99 ;; FNMA f112 = f94, f80, f112 FMA_A f113 = f95, f80, f113 FNMA f114 = f94, f82, f114 FMA_A f115 = f95, f82, f115 ;; FMA_B f112 = f95, f81, f112 FNMA f113 = f94, f81, f113 FMA_B f114 = f95, f83, f114 FNMA f115 = f94, f83, f115 ;; FMPY f32 = f108, f96 FMPY f33 = f109, f96 FMPY f34 = f108, f98 FMPY f35 = f109, f98 ;; FMA_C f96 = f109, f97, f32 FMA_D f97 = f108, f97, f33 FMA_C f98 = f109, f99, f34 FMA_D f99 = f108, f99, f35 ;; FNMA f112 = f110, f96, f112 FMA_A f113 = f111, f96, f113 FNMA f114 = f110, f98, f114 FMA_A f115 = f111, f98, f115 ;; FMA_B f112 = f111, f97, f112 FNMA f113 = f110, f97, f113 FMA_B f114 = f111, f99, f114 FNMA f115 = f110, f99, f115 ;; FMPY f32 = f126, f112 FMPY f33 = f127, f112 FMPY f34 = f126, f114 FMPY f35 = f127, f114 ;; FMA_C f112 = f127, f113, f32 FMA_D f113 = f126, f113, f33 FMA_C f114 = f127, f115, f34 FMA_D f115 = f126, f115, f35 ;; #endif #ifdef RT adds BOFFSET = 30 * SIZE, BOFFSET ;; LDFPD f72, f73 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f74, f75 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f76, f77 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f78, f79 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f88, f89 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f92, f93 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFPD f104, f105 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f106, f107 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFPD f120, f121 = [BOFFSET] ;; FMPY f32 = f72, f112 FMPY f33 = f73, f112 FMPY f34 = f72, f114 FMPY f35 = f73, f114 ;; FMA_C f112 = f73, f113, f32 FMA_D f113 = f72, f113, f33 FMA_C f114 = f73, f115, f34 FMA_D f115 = f72, f115, f35 ;; FNMA f96 = f74, f112, f96 FMA_A f97 = f75, f112, f97 FNMA f98 = f74, f114, f98 FMA_A f99 = f75, f114, f99 ;; FMA_B f96 = f75, f113, f96 FNMA f97 = f74, f113, f97 FMA_B f98 = f75, f115, f98 FNMA f99 = f74, f115, f99 ;; FNMA f80 = f76, f112, f80 FMA_A f81 = f77, f112, f81 FNMA f82 = f76, f114, f82 FMA_A f83 = f77, f114, f83 ;; FMA_B f80 = f77, f113, f80 FNMA f81 = f76, f113, f81 FMA_B f82 = f77, f115, f82 FNMA f83 = f76, f115, f83 ;; FNMA f64 = f78, f112, f64 FMA_A f65 = f79, f112, f65 FNMA f66 = f78, f114, f66 FMA_A f67 = f79, f114, f67 ;; FMA_B f64 = f79, f113, f64 FNMA f65 = f78, f113, f65 FMA_B f66 = f79, f115, f66 FNMA f67 = f78, f115, f67 ;; FMPY f32 = f88, f96 FMPY f33 = f89, f96 FMPY f34 = f88, f98 FMPY f35 = f89, f98 ;; FMA_C f96 = f89, f97, f32 FMA_D f97 = f88, f97, f33 FMA_C f98 = f89, f99, f34 FMA_D f99 = f88, f99, f35 ;; FNMA f80 = f90, f96, f80 FMA_A f81 = f91, f96, f81 FNMA f82 = f90, f98, f82 FMA_A f83 = f91, f98, f83 ;; FMA_B f80 = 
f91, f97, f80 FNMA f81 = f90, f97, f81 FMA_B f82 = f91, f99, f82 FNMA f83 = f90, f99, f83 ;; FNMA f64 = f92, f96, f64 FMA_A f65 = f93, f96, f65 FNMA f66 = f92, f98, f66 FMA_A f67 = f93, f98, f67 ;; FMA_B f64 = f93, f97, f64 FNMA f65 = f92, f97, f65 FMA_B f66 = f93, f99, f66 FNMA f67 = f92, f99, f67 ;; FMPY f32 = f104, f80 FMPY f33 = f105, f80 FMPY f34 = f104, f82 FMPY f35 = f105, f82 ;; FMA_C f80 = f105, f81, f32 FMA_D f81 = f104, f81, f33 FMA_C f82 = f105, f83, f34 FMA_D f83 = f104, f83, f35 ;; FNMA f64 = f106, f80, f64 FMA_A f65 = f107, f80, f65 FNMA f66 = f106, f82, f66 FMA_A f67 = f107, f82, f67 ;; FMA_B f64 = f107, f81, f64 FNMA f65 = f106, f81, f65 FMA_B f66 = f107, f83, f66 FNMA f67 = f106, f83, f67 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f66 FMPY f35 = f121, f66 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f66 = f121, f67, f34 FMA_D f67 = f120, f67, f35 ;; #endif #if defined(LN) || defined(LT) adds BOFFSET2 = 4 * SIZE, BOFFSET ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f97, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE ;; STFD [BOFFSET] = f81, 5 * SIZE STFD [BOFFSET2] = f113, 5 * SIZE ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f98, SIZE ;; STFD [BOFFSET] = f67, SIZE STFD [BOFFSET2] = f99, SIZE ;; STFD [BOFFSET] = f82, SIZE STFD [BOFFSET2] = f114, SIZE ;; STFD [BOFFSET] = f83, 5 * SIZE STFD [BOFFSET2] = f115, 5 * SIZE ;; adds BOFFSET = - 16 * SIZE, BOFFSET ;; #else adds AOFFSET2 = 4 * SIZE, AOFFSET ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f80, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f81, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f82, SIZE ;; STFD [AOFFSET] = f67, 5 * SIZE STFD [AOFFSET2] = f83, 5 * SIZE ;; STFD [AOFFSET] = f96, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f97, SIZE STFD [AOFFSET2] = f113, SIZE ;; STFD [AOFFSET] = f98, SIZE STFD [AOFFSET2] = f114, SIZE ;; STFD [AOFFSET] = f99, 5 * SIZE STFD [AOFFSET2] = f115, 5 * SIZE ;; adds AOFFSET = - 16 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -4 * SIZE, C1 adds C2 = -4 * SIZE, C2 adds C3 = -4 * SIZE, C3 adds C4 = -4 * SIZE, C4 #endif ;; STFD [C1 ] = f64, SIZE ;; STFD [C1 ] = f65, SIZE ;; STFD [C1 ] = f66, SIZE ;; STFD [C1 ] = f67, SIZE ;; STFD [C2 ] = f80, SIZE ;; STFD [C2 ] = f81, SIZE ;; STFD [C2 ] = f82, SIZE ;; STFD [C2 ] = f83, SIZE ;; STFD [C3 ] = f96, SIZE ;; STFD [C3 ] = f97, SIZE ;; STFD [C3 ] = f98, SIZE ;; STFD [C3 ] = f99, SIZE ;; STFD [C4 ] = f112, SIZE ;; STFD [C4 ] = f113, SIZE ;; STFD [C4 ] = f114, SIZE ;; STFD [C4 ] = f115, SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;; #ifdef LN adds C1 = -4 * SIZE, C1 adds C2 = -4 * SIZE, C2 adds C3 = -4 * SIZE, C3 adds C4 = -4 * SIZE, C4 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 1, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; shladd AOFFSET = L, 1, AOFFSET shladd BOFFSET = L, 2, BOFFSET #endif ;; #ifdef LT adds KK = 2, KK #elif defined LN adds KK = -2, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; .align 16 .L010x: #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif shr I = M, 2 ;; cmp.eq p6, p7 = 0, I (p6) br.cond.dpnt .L049 ;; .align 16 .L011: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 2 + ZBASE_SHIFT } { .mfi shladd 
r3 = KK, ZBASE_SHIFT, r0 mov f118 = f0 nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f67 = f0 } ;; #else { .mfi shladd BOFFSET = r3, 2, B mov f66 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 shladd AOFFSET = r3, 2, AORIG } ;; #endif ;; { .mfi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f82 = f0 nop __LINE__ } { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mfi (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f98 = f0 adds L = 1, L } { .mfi (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE mov f99 = f0 adds C5 = 4 * SIZE, C1 } ;; { .mfi (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE mov f114 = f0 tbit.z p12, p0 = L, 0 } { .mfi (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f115 = f0 adds C6 = 4 * SIZE, C2 } ;; { .mfi (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE mov f68 = f0 shr L = L, 1 } { .mfi setf.d f86 = r0 mov f69 = f0 adds C7 = 4 * SIZE, C3 } ;; { .mfi CPREFETCH [PREC], LDC mov f84 = f0 adds L = -1, L } { .mfi setf.d f87 = r0 mov f85 = f0 adds C8 = 4 * SIZE, C4 } ;; { .mfi CPREFETCH [PREC], LDC mov f100 = f0 mov ar.lc = L } { .mfi setf.d f102 = r0 mov f101 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi CPREFETCH [PREC], LDC mov f116 = f0 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET } { .mfi setf.d f103 = r0 mov f117 = f0 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } ;; { .mfi CPREFETCH [PREC] mov f70 = f0 cmp.eq p6, p0 = -1, L } { .mfb setf.d f119 = r0 mov f71 = f0 (p6) br.cond.dpnt .L018 } ;; .align 16 .L012: /* 1 */ { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfb (p12) cmp.ne p3, p0 = 0, L FMA_B f65 = f32, f49, f65 // A1 * B2 nop __LINE__ } ;; /* 2 */ { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfb cmp.ne p4, p5 = 0, L FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;; /* 3 */ { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb FMA_B f97 = f32, f53, f97 // A1 * B6 nop __LINE__ } ;; /* 4 */ { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb FMA_B f113 = f32, f55, f113 // A1 * B8 nop __LINE__ } ;; /* 5 */ { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; /* 6 */ { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;; /* 7 */ { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb FMA_A f96 = f33, f53, f96 // A2 * B6 nop __LINE__ } ;; /* 8 */ { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb FMA_A f112 = f33, f55, f112 // A2 * B8 nop __LINE__ } ;; /* 9 */ { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb FMA_B f67 = f34, f49, f67 // A3 * B2 nop __LINE__ } ;; /* 10 */ { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb FMA_B f83 = f34, f51, f83 // A3 * B4 nop __LINE__ } ;; /* 11 */ { .mfb FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f99 = f34, f53, f99 // A3 * B6 nop __LINE__ } ;; /* 12 */ { 
.mfb FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f115 = f34, f55, f115 // A3 * B8 nop __LINE__ } ;; /* 13 */ { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 } { .mfb nop __LINE__ FMA_A f66 = f35, f49, f66 // A4 * B2 nop __LINE__ } ;; /* 14 */ { .mfb FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f82 = f35, f51, f82 // A4 * B4 nop __LINE__ } ;; /* 15 */ { .mfb FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f98 = f35, f53, f98 // A4 * B6 nop __LINE__ } ;; /* 16 */ { .mfb FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f114 = f35, f55, f114 // A4 * B8 nop __LINE__ } ;; /* 17 */ { .mfb nop __LINE__ FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f69 = f36, f49, f69 // A5 * B2 nop __LINE__ } ;; /* 18 */ { .mfb nop __LINE__ FMA f84 = f36, f50, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f85 = f36, f51, f85 // A5 * B4 nop __LINE__ } ;; /* 19 */ { .mfb nop __LINE__ FMA f100 = f36, f52, f100 // A5 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f101 = f36, f53, f101 // A5 * B6 nop __LINE__ } ;; /* 20 */ { .mfb nop __LINE__ FMA f116 = f36, f54, f116 // A5 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f117 = f36, f55, f117 // A5 * B8 nop __LINE__ } ;; /* 21 */ { .mfb nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f68 = f37, f49, f68 // A6 * B2 nop __LINE__ } ;; /* 22 */ { .mfb nop __LINE__ FMA f85 = f37, f50, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f84 = f37, f51, f84 // A6 * B4 nop __LINE__ } ;; /* 23 */ { .mfb nop __LINE__ FMA f101 = f37, f52, f101 // A6 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f100 = f37, f53, f100 // A6 * B6 nop __LINE__ } ;; /* 24 */ { .mfb nop __LINE__ FMA f117 = f37, f54, f117 // A6 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f116 = f37, f55, f116 // A6 * B8 nop __LINE__ } ;; /* 25 */ { .mfb nop __LINE__ FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f71 = f38, f49, f71 // A7 * B2 nop __LINE__ } ;; /* 26 */ { .mfb nop __LINE__ FMA f86 = f38, f50, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f87 = f38, f51, f87 // A7 * B4 nop __LINE__ } ;; /* 27 */ { .mfb nop __LINE__ FMA f102 = f38, f52, f102 // A7 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f103 = f38, f53, f103 // A7 * B6 nop __LINE__ } ;; /* 28 */ { .mfb nop __LINE__ FMA f118 = f38, f54, f118 // A7 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f119 = f38, f55, f119 // A7 * B8 nop __LINE__ } ;; /* 29 */ { .mfb nop __LINE__ FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f70 = f39, f49, f70 // A8 * B2 nop __LINE__ } ;; /* 30 */ { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f87 = f39, f50, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f86 = f39, f51, f86 // A8 * B4 nop __LINE__ } ;; /* 31 */ { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f103 = f39, f52, f103 // A8 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f102 = f39, f53, f102 // A8 * B6 nop __LINE__ } ;; /* 32 */ { .mfb nop __LINE__ FMA f119 = f39, f54, f119 // A8 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f118 = f39, f55, f118 // A8 * B8 nop __LINE__ } ;; /* 33 */ { .mfb nop __LINE__ (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; /* 34 */ { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ 
(p3) FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;; /* 35 */ { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 nop __LINE__ } ;; /* 36 */ { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 nop __LINE__ } ;; /* 37 */ { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; /* 38 */ { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;; /* 39 */ { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 nop __LINE__ } ;; /* 40 */ { .mfb nop __LINE__ (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 nop __LINE__ } ;; /* 41 */ { .mfb nop __LINE__ (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 nop __LINE__ } ;; /* 42 */ { .mfb nop __LINE__ (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 nop __LINE__ } ;; /* 43 */ { .mfb nop __LINE__ (p3) FMA f98 = f42, f60, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 nop __LINE__ } ;; /* 44 */ { .mfb nop __LINE__ (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 nop __LINE__ } ;; /* 45 */ { .mfb nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 nop __LINE__ } ;; /* 46 */ { .mfb nop __LINE__ (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 nop __LINE__ } ;; /* 47 */ { .mfb nop __LINE__ (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 nop __LINE__ } ;; /* 48 */ { .mfb nop __LINE__ (p3) FMA f115 = f43, f62, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 nop __LINE__ } ;; /* 49 */ { .mfb nop __LINE__ (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f69 = f44, f57, f69 // A5 * B2 nop __LINE__ } ;; /* 50 */ { .mfb nop __LINE__ (p3) FMA f84 = f44, f58, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f85 = f44, f59, f85 // A5 * B4 nop __LINE__ } ;; /* 51 */ { .mfb nop __LINE__ (p3) FMA f100 = f44, f60, f100 // A5 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f101 = f44, f61, f101 // A5 * B6 nop __LINE__ } ;; /* 52 */ { .mfb nop __LINE__ (p3) FMA f116 = f44, f62, f116 // A5 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f117 = f44, f63, f117 // A5 * B8 nop __LINE__ } ;; /* 53 */ { .mfb nop __LINE__ (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f68 = f45, f57, f68 // A6 * B2 nop __LINE__ } ;; /* 54 */ { .mfb nop __LINE__ (p3) FMA f85 = f45, f58, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f84 = f45, f59, f84 // A6 * B4 nop __LINE__ } ;; /* 55 */ { .mfb nop __LINE__ (p3) FMA f101 = 
f45, f60, f101 // A6 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f100 = f45, f61, f100 // A6 * B6 nop __LINE__ } ;; /* 56 */ { .mfb nop __LINE__ (p3) FMA f117 = f45, f62, f117 // A6 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f116 = f45, f63, f116 // A6 * B8 nop __LINE__ } ;; /* 57 */ { .mfb nop __LINE__ (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f71 = f46, f57, f71 // A7 * B2 nop __LINE__ } ;; /* 58 */ { .mfb nop __LINE__ (p3) FMA f86 = f46, f58, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f87 = f46, f59, f87 // A7 * B4 nop __LINE__ } ;; /* 59 */ { .mfb nop __LINE__ (p3) FMA f102 = f46, f60, f102 // A7 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f103 = f46, f61, f103 // A7 * B6 nop __LINE__ } ;; /* 60 */ { .mfb nop __LINE__ (p3) FMA f118 = f46, f62, f118 // A7 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f119 = f46, f63, f119 // A7 * B8 nop __LINE__ } ;; /* 61 */ { .mfb nop __LINE__ (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f70 = f47, f57, f70 // A8 * B2 nop __LINE__ } ;; /* 62 */ { .mfb nop __LINE__ (p3) FMA f87 = f47, f58, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f86 = f47, f59, f86 // A8 * B4 nop __LINE__ } ;; /* 63 */ { .mfb nop __LINE__ (p3) FMA f103 = f47, f60, f103 // A8 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f102 = f47, f61, f102 // A8 * B6 nop __LINE__ } ;; /* 64 */ { .mfi nop __LINE__ (p3) FMA f119 = f47, f62, f119 // A8 * B7 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f118 = f47, f63, f118 // A8 * B8 br.cloop.sptk.few .L012 } ;; .L018: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -4, KK #else adds r2 = -4, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 2, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [BOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [BOFFSET], 2 * SIZE ;; LDFPD f92, f93 = [BOFFSET], 2 * SIZE ;; { .mfi LDFPD f94, f95 = [BOFFSET], 2 * SIZE FSUB f64 = f72, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f65 = f73, f65 nop __LINE__ } ;; { .mfi LDFPD f104, f105 = [BOFFSET], 2 * SIZE FSUB f80 = f74, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f81 = f75, f81 nop __LINE__ } ;; { .mfi LDFPD f106, f107 = [BOFFSET], 2 * SIZE FSUB f96 = f76, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f97 = f77, f97 nop __LINE__ } ;; { .mfi LDFPD f108, f109 = [BOFFSET], 2 * SIZE FSUB f112 = f78, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f113 = f79, f113 nop __LINE__ } ;; { .mfi LDFPD f110, f111 = [BOFFSET], 2 * SIZE FSUB f66 = f88, f66 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f67 = f89, f67 nop __LINE__ } ;; { .mfi LDFPD f120, f121 = [BOFFSET], 2 * SIZE FSUB f82 = f90, f82 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f83 = f91, f83 nop __LINE__ } ;; { .mfi LDFPD f122, f123 = [BOFFSET], 2 * SIZE FSUB f98 = f92, f98 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f99 = f93, f99 nop __LINE__ } ;; { .mfi LDFPD f124, f125 = [BOFFSET], 2 * SIZE FSUB f114 = f94, f114 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f115 = f95, f115 nop __LINE__ } ;; { .mfi LDFPD f126, f127 = [BOFFSET] FSUB f68 = f104, f68 adds BOFFSET = -30 * SIZE, BOFFSET } { .mfi nop __LINE__ FSUB_A f69 = f105, f69 #ifdef LN adds AOFFSET = 30 * SIZE, AOFFSET #else nop __LINE__ #endif } ;; { .mfi LDFPD f72, f73 = [AOFFSET] FSUB f84 = f106, f84 #ifdef LN 
adds AOFFSET = - 2 * SIZE, AOFFSET #else adds AOFFSET = 2 * SIZE, AOFFSET #endif } { .mfi nop __LINE__ FSUB_A f85 = f107, f85 nop __LINE__ } ;; { .mfi LDFPD f74, f75 = [AOFFSET] FSUB f100 = f108, f100 #ifdef LN adds AOFFSET = - 2 * SIZE, AOFFSET #else adds AOFFSET = 2 * SIZE, AOFFSET #endif } { .mfi nop __LINE__ FSUB_A f101 = f109, f101 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f116 = f110, f116 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f117 = f111, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f70 = f120, f70 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f71 = f121, f71 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f86 = f122, f86 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f87 = f123, f87 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f102 = f124, f102 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f103 = f125, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f118 = f126, f118 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f119 = f127, f119 nop __LINE__ } ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [AOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [AOFFSET], 2 * SIZE ;; { .mfi LDFPD f92, f93 = [AOFFSET], 2 * SIZE FSUB f64 = f72, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB f65 = f73, f65 nop __LINE__ } ;; { .mfi LDFPD f94, f95 = [AOFFSET], 2 * SIZE FSUB f66 = f74, f66 nop __LINE__ } { .mfi nop __LINE__ FSUB f67 = f75, f67 nop __LINE__ } ;; { .mfi LDFPD f104, f105 = [AOFFSET], 2 * SIZE FSUB f68 = f76, f68 nop __LINE__ } { .mfi nop __LINE__ FSUB f69 = f77, f69 nop __LINE__ } ;; { .mfi LDFPD f106, f107 = [AOFFSET], 2 * SIZE FSUB f70 = f78, f70 nop __LINE__ } { .mfi nop __LINE__ FSUB f71 = f79, f71 nop __LINE__ } ;; { .mfi LDFPD f108, f109 = [AOFFSET], 2 * SIZE FSUB f80 = f88, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f81 = f89, f81 nop __LINE__ } ;; { .mfi LDFPD f110, f111 = [AOFFSET], 2 * SIZE FSUB f82 = f90, f82 nop __LINE__ } { .mfi nop __LINE__ FSUB f83 = f91, f83 nop __LINE__ } ;; { .mfi LDFPD f120, f121 = [AOFFSET], 2 * SIZE FSUB f84 = f92, f84 nop __LINE__ } { .mfi nop __LINE__ FSUB f85 = f93, f85 nop __LINE__ } ;; { .mfi LDFPD f122, f123 = [AOFFSET], 2 * SIZE FSUB f86 = f94, f86 nop __LINE__ } { .mfi nop __LINE__ FSUB f87 = f95, f87 nop __LINE__ } ;; { .mfi LDFPD f124, f125 = [AOFFSET], 2 * SIZE FSUB f96 = f104, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f97 = f105, f97 nop __LINE__ } ;; { .mfi LDFPD f126, f127 = [AOFFSET] FSUB f98 = f106, f98 adds AOFFSET = -30 * SIZE, AOFFSET } { .mfi nop __LINE__ FSUB f99 = f107, f99 #ifdef RT adds BOFFSET = 30 * SIZE, BOFFSET #else nop __LINE__ #endif } ;; { .mfi LDFPD f72, f73 = [BOFFSET] FSUB f100 = f108, f100 #ifdef RN adds BOFFSET = 2 * SIZE, BOFFSET #else adds BOFFSET = - 2 * SIZE, BOFFSET #endif } { .mfi nop __LINE__ FSUB f101 = f109, f101 nop __LINE__ } ;; { .mfi LDFPD f74, f75 = [BOFFSET] FSUB f102 = f110, f102 #ifdef RN adds BOFFSET = 2 * SIZE, BOFFSET #else adds BOFFSET = - 2 * SIZE, BOFFSET #endif } { .mfi nop __LINE__ FSUB f103 = f111, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f112 = f120, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f113 = f121, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f114 = f122, f114 nop __LINE__ } { .mfi nop __LINE__ FSUB f115 = f123, f115 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f116 = f124, f116 nop __LINE__ } { .mfi nop __LINE__ FSUB f117 = f125, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f118 = f126, f118 nop __LINE__ } { .mfi nop __LINE__ FSUB f119 = f127, f119 nop __LINE__ } 
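// The LDFPD/FSUB block above appears to reload the packed result panel
// (read from the B panel in the LN/LT branch, from the A panel otherwise)
// and subtract the A*B product accumulated in f64..f119, ahead of the
// per-variant (LN/LT/RN/RT) triangular solve that follows.  FSUB, FSUB_A
// and the FMA_* names are presumably sign/conjugation macros defined near
// the top of this file rather than raw IA-64 opcodes.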
;; #endif #ifdef LN { .mfi LDFPD f76, f77 = [AOFFSET] FMPY f32 = f72, f70 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f36 = f72, f102 nop __LINE__ } ;; { .mfi LDFPD f78, f79 = [AOFFSET] FMPY f33 = f73, f70 adds AOFFSET = - 4 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f37 = f73, f102 nop __LINE__ } ;; { .mfi LDFPD f88, f89 = [AOFFSET] FMPY f34 = f72, f86 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f38 = f72, f118 nop __LINE__ } ;; { .mfi LDFPD f90, f91 = [AOFFSET] FMPY f35 = f73, f86 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f39 = f73, f118 nop __LINE__ } ;; { .mfi LDFPD f92, f93 = [AOFFSET] FMA_C f70 = f73, f71, f32 adds AOFFSET = - 6 * SIZE, AOFFSET } { .mfi nop __LINE__ FMA_C f102 = f73, f103, f36 adds C1 = -2 * SIZE, C1 } ;; { .mfi LDFPD f104, f105 = [AOFFSET] FMA_D f71 = f72, f71, f33 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FMA_D f103 = f72, f103, f37 adds C2 = -2 * SIZE, C2 } ;; { .mfi LDFPD f106, f107 = [AOFFSET] FMA_C f86 = f73, f87, f34 adds AOFFSET = - 8 * SIZE, AOFFSET } { .mfi nop __LINE__ FMA_C f118 = f73, f119, f38 adds C3 = -2 * SIZE, C3 } ;; { .mfi LDFPD f120, f121 = [AOFFSET] FMA_D f87 = f72, f87, f35 adds BOFFSET2 = 28 * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_D f119 = f72, f119, f39 adds BOFFSET = 24 * SIZE, BOFFSET } ;; { .mfi STFD [BOFFSET] = f70, SIZE FNMA f68 = f74, f70, f68 adds C4 = -2 * SIZE, C4 } { .mfi STFD [BOFFSET2] = f102, SIZE FNMA f100 = f74, f102, f100 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f71, SIZE FMA_A f69 = f75, f70, f69 nop __LINE__ } { .mfi STFD [BOFFSET2] = f103, SIZE FMA_A f101 = f75, f102, f101 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f86, SIZE FNMA f84 = f74, f86, f84 nop __LINE__ } { .mfi STFD [BOFFSET2] = f118, SIZE FNMA f116 = f74, f118, f116 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f87, -11 * SIZE FMA_A f85 = f75, f86, f85 nop __LINE__ } { .mfi STFD [BOFFSET2] = f119, -11 * SIZE FMA_A f117 = f75, f118, f117 nop __LINE__ } ;; { .mfi STFD [C1 ] = f70, SIZE FMA_B f68 = f75, f71, f68 nop __LINE__ } { .mfi STFD [C3 ] = f102, SIZE FMA_B f100 = f75, f103, f100 nop __LINE__ } ;; { .mfi STFD [C1 ] = f71, -3 * SIZE FNMA f69 = f74, f71, f69 nop __LINE__ } { .mfi STFD [C3 ] = f103, -3 * SIZE FNMA f101 = f74, f103, f101 nop __LINE__ } ;; { .mfi STFD [C2 ] = f86, SIZE FMA_B f84 = f75, f87, f84 nop __LINE__ } { .mfi STFD [C4 ] = f118, SIZE FMA_B f116 = f75, f119, f116 nop __LINE__ } ;; { .mfi STFD [C2 ] = f87, -3 * SIZE FNMA f85 = f74, f87, f85 nop __LINE__ } { .mfi STFD [C4 ] = f119, -3 * SIZE FNMA f117 = f74, f119, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f66 = f76, f70, f66 nop __LINE__ } { .mfi nop __LINE__ FNMA f98 = f76, f102, f98 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f67 = f77, f70, f67 nop __LINE__ } { .mfi nop __LINE__ FMA_A f99 = f77, f102, f99 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f82 = f76, f86, f82 nop __LINE__ } { .mfi nop __LINE__ FNMA f114 = f76, f118, f114 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f83 = f77, f86, f83 nop __LINE__ } { .mfi nop __LINE__ FMA_A f115 = f77, f118, f115 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f66 = f77, f71, f66 nop __LINE__ } { .mfi nop __LINE__ FMA_B f98 = f77, f103, f98 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f67 = f76, f71, f67 nop __LINE__ } { .mfi nop __LINE__ FNMA f99 = f76, f103, f99 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f82 = f77, f87, f82 nop __LINE__ } { .mfi nop __LINE__ FMA_B f114 = f77, f119, f114 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f83 = f76, f87, f83 nop __LINE__ } { .mfi nop 
__LINE__ FNMA f115 = f76, f119, f115 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f64 = f78, f70, f64 nop __LINE__ } { .mfi nop __LINE__ FNMA f96 = f78, f102, f96 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f65 = f79, f70, f65 nop __LINE__ } { .mfi nop __LINE__ FMA_A f97 = f79, f102, f97 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f80 = f78, f86, f80 nop __LINE__ } { .mfi nop __LINE__ FNMA f112 = f78, f118, f112 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f81 = f79, f86, f81 nop __LINE__ } { .mfi nop __LINE__ FMA_A f113 = f79, f118, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f64 = f79, f71, f64 nop __LINE__ } { .mfi nop __LINE__ FMA_B f96 = f79, f103, f96 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f65 = f78, f71, f65 nop __LINE__ } { .mfi nop __LINE__ FNMA f97 = f78, f103, f97 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f80 = f79, f87, f80 nop __LINE__ } { .mfi nop __LINE__ FMA_B f112 = f79, f119, f112 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f81 = f78, f87, f81 nop __LINE__ } { .mfi nop __LINE__ FNMA f113 = f78, f119, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f88, f68 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f88, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f89, f68 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f89, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f88, f84 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f88, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f89, f84 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f89, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f68 = f89, f69, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f100 = f89, f101, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f69 = f88, f69, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f101 = f88, f101, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f84 = f89, f85, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f116 = f89, f117, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f85 = f88, f85, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f117 = f88, f117, f39 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f68, SIZE FNMA f66 = f90, f68, f66 nop __LINE__ } { .mfi STFD [BOFFSET2] = f100, SIZE FNMA f98 = f90, f100, f98 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f69, SIZE FMA_A f67 = f91, f68, f67 nop __LINE__ } { .mfi STFD [BOFFSET2] = f101, SIZE FMA_A f99 = f91, f100, f99 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f84, SIZE FNMA f82 = f90, f84, f82 nop __LINE__ } { .mfi STFD [BOFFSET2] = f116, SIZE FNMA f114 = f90, f116, f114 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f85, -11 * SIZE FMA_A f83 = f91, f84, f83 nop __LINE__ } { .mfi STFD [BOFFSET2] = f117, -11 * SIZE FMA_A f115 = f91, f116, f115 nop __LINE__ } ;; { .mfi STFD [C1 ] = f68, SIZE FMA_B f66 = f91, f69, f66 nop __LINE__ } { .mfi STFD [C3 ] = f100, SIZE FMA_B f98 = f91, f101, f98 nop __LINE__ } ;; { .mfi STFD [C1 ] = f69, -3 * SIZE FNMA f67 = f90, f69, f67 nop __LINE__ } { .mfi STFD [C3 ] = f101, -3 * SIZE FNMA f99 = f90, f101, f99 nop __LINE__ } ;; { .mfi STFD [C2 ] = f84, SIZE FMA_B f82 = f91, f85, f82 nop __LINE__ } { .mfi STFD [C4 ] = f116, SIZE FMA_B f114 = f91, f117, f114 nop __LINE__ } ;; { .mfi STFD [C2 ] = f85, -3 * SIZE FNMA f83 = f90, f85, f83 nop __LINE__ } { .mfi STFD [C4 ] = f117, -3 * SIZE FNMA f115 = f90, f117, f115 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f64 = f92, f68, f64 nop __LINE__ } { .mfi nop __LINE__ FNMA f96 = f92, f100, f96 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f65 = f93, f68, f65 nop __LINE__ } { .mfi nop __LINE__ FMA_A f97 = f93, f100, f97 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f80 = 
f92, f84, f80 nop __LINE__ } { .mfi nop __LINE__ FNMA f112 = f92, f116, f112 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f81 = f93, f84, f81 nop __LINE__ } { .mfi nop __LINE__ FMA_A f113 = f93, f116, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f64 = f93, f69, f64 nop __LINE__ } { .mfi nop __LINE__ FMA_B f96 = f93, f101, f96 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f65 = f92, f69, f65 nop __LINE__ } { .mfi nop __LINE__ FNMA f97 = f92, f101, f97 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f80 = f93, f85, f80 nop __LINE__ } { .mfi nop __LINE__ FMA_B f112 = f93, f117, f112 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f81 = f92, f85, f81 nop __LINE__ } { .mfi nop __LINE__ FNMA f113 = f92, f117, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f104, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f104, f98 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f105, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f105, f98 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f104, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f104, f114 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f105, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f105, f114 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f66 = f105, f67, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f98 = f105, f99, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f67 = f104, f67, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f99 = f104, f99, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f82 = f105, f83, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f114 = f105, f115, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f83 = f104, f83, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f115 = f104, f115, f39 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f66, SIZE FNMA f64 = f106, f66, f64 nop __LINE__ } { .mfi STFD [BOFFSET2] = f98, SIZE FNMA f96 = f106, f98, f96 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f67, SIZE FMA_A f65 = f107, f66, f65 nop __LINE__ } { .mfi STFD [BOFFSET2] = f99, SIZE FMA_A f97 = f107, f98, f97 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f82, SIZE FNMA f80 = f106, f82, f80 nop __LINE__ } { .mfi STFD [BOFFSET2] = f114, SIZE FNMA f112 = f106, f114, f112 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f83, -11 * SIZE FMA_A f81 = f107, f82, f81 nop __LINE__ } { .mfi STFD [BOFFSET2] = f115, -11 * SIZE FMA_A f113 = f107, f114, f113 nop __LINE__ } ;; { .mfi STFD [C1 ] = f66, SIZE FMA_B f64 = f107, f67, f64 nop __LINE__ } { .mfi STFD [C3 ] = f98, SIZE FMA_B f96 = f107, f99, f96 nop __LINE__ } ;; { .mfi STFD [C1 ] = f67, -3 * SIZE FNMA f65 = f106, f67, f65 nop __LINE__ } { .mfi STFD [C3 ] = f99, -3 * SIZE FNMA f97 = f106, f99, f97 nop __LINE__ } ;; { .mfi STFD [C2 ] = f82, SIZE FMA_B f80 = f107, f83, f80 nop __LINE__ } { .mfi STFD [C4 ] = f114, SIZE FMA_B f112 = f107, f115, f112 nop __LINE__ } ;; { .mfi STFD [C2 ] = f83, -3 * SIZE FNMA f81 = f106, f83, f81 nop __LINE__ } { .mfi STFD [C4 ] = f115, -3 * SIZE FNMA f113 = f106, f115, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f120, f64 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f120, f96 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f121, f64 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f121, f96 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f120, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f120, f112 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f121, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f121, f112 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f64 = f121, f65, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f96 = f121, f97, f36 nop __LINE__ } ;; { .mfi nop 
__LINE__ FMA_D f65 = f120, f65, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f97 = f120, f97, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f80 = f121, f81, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f112 = f121, f113, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f81 = f120, f81, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f113 = f120, f113, f39 nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f97, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f81, -3 * SIZE STFD [BOFFSET2] = f113, -3 * SIZE nop __LINE__ } ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 nop __LINE__ } { .mfi STFD [C3 ] = f96, SIZE mov f96 = f0 nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, -1 * SIZE mov f65 = f0 adds KK = -4, KK } { .mfi STFD [C3 ] = f97, -1 * SIZE mov f97 = f0 nop __LINE__ } ;; { .mfi STFD [C2 ] = f80, SIZE mov f80 = f0 cmp.ne p6, p0 = 1, I } { .mfi STFD [C4 ] = f112, SIZE mov f112 = f0 sub L = K, KK } ;; { .mfi STFD [C2 ] = f81, -1 * SIZE mov f81 = f0 adds I = -1, I } { .mfb STFD [C4 ] = f113, -1 * SIZE mov f113 = f0 (p6) br.cond.dptk .L011 } ;; #endif #ifdef LT { .mfi LDFPD f76, f77 = [AOFFSET], 2 * SIZE FMPY f32 = f72, f64 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f72, f96 nop __LINE__ } ;; { .mfi LDFPD f78, f79 = [AOFFSET] FMPY f33 = f73, f64 adds AOFFSET = 4 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f37 = f73, f96 nop __LINE__ } ;; { .mfi LDFPD f90, f91 = [AOFFSET], 2 * SIZE FMPY f34 = f72, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f72, f112 nop __LINE__ } ;; { .mfi LDFPD f92, f93 = [AOFFSET], 2 * SIZE FMPY f35 = f73, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f73, f112 nop __LINE__ } ;; { .mfi LDFPD f94, f95 = [AOFFSET] FMA_C f64 = f73, f65, f32 adds AOFFSET = 6 * SIZE, AOFFSET } { .mfi nop __LINE__ FMA_C f96 = f73, f97, f36 nop __LINE__ } ;; { .mfi LDFPD f108, f109 = [AOFFSET], 2 * SIZE FMA_D f65 = f72, f65, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f97 = f72, f97, f37 nop __LINE__ } ;; { .mfi LDFPD f110, f111 = [AOFFSET] FMA_C f80 = f73, f81, f34 adds AOFFSET = 8 * SIZE, AOFFSET } { .mfi nop __LINE__ FMA_C f112 = f73, f113, f38 nop __LINE__ } ;; { .mfi LDFPD f126, f127 = [AOFFSET] FMA_D f81 = f72, f81, f35 adds AOFFSET = - 30 * SIZE, AOFFSET } { .mfi nop __LINE__ FMA_D f113 = f72, f113, f39 adds BOFFSET2 = 4 * SIZE, BOFFSET } ;; { .mfi STFD [BOFFSET] = f64, SIZE FNMA f66 = f74, f64, f66 nop __LINE__ } { .mfi STFD [BOFFSET2] = f96, SIZE FNMA f98 = f74, f96, f98 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f65, SIZE FMA_A f67 = f75, f64, f67 nop __LINE__ } { .mfi STFD [BOFFSET2] = f97, SIZE FMA_A f99 = f75, f96, f99 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f80, SIZE FNMA f82 = f74, f80, f82 nop __LINE__ } { .mfi STFD [BOFFSET2] = f112, SIZE FNMA f114 = f74, f112, f114 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f81, 5 * SIZE FMA_A f83 = f75, f80, f83 nop __LINE__ } { .mfi STFD [BOFFSET2] = f113, 5 * SIZE FMA_A f115 = f75, f112, f115 nop __LINE__ } ;; { .mfi STFD [C1 ] = f64, SIZE FMA_B f66 = f75, f65, f66 nop __LINE__ } { .mfi STFD [C3 ] = f96, SIZE FMA_B f98 = f75, f97, f98 nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, SIZE FNMA f67 = f74, f65, f67 nop __LINE__ } { .mfi STFD [C3 ] = f97, SIZE FNMA f99 = f74, f97, f99 nop __LINE__ } ;; { .mfi STFD [C2 ] = f80, SIZE FMA_B f82 = f75, f81, f82 nop __LINE__ } { .mfi STFD [C4 ] = f112, SIZE FMA_B f114 = f75, f113, f114 nop __LINE__ } ;; { .mfi STFD [C2 ] = 
f81, SIZE FNMA f83 = f74, f81, f83 nop __LINE__ } { .mfi STFD [C4 ] = f113, SIZE FNMA f115 = f74, f113, f115 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f68 = f76, f64, f68 nop __LINE__ } { .mfi nop __LINE__ FNMA f100 = f76, f96, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f69 = f77, f64, f69 nop __LINE__ } { .mfi nop __LINE__ FMA_A f101 = f77, f96, f101 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f84 = f76, f80, f84 nop __LINE__ } { .mfi nop __LINE__ FNMA f116 = f76, f112, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f85 = f77, f80, f85 nop __LINE__ } { .mfi nop __LINE__ FMA_A f117 = f77, f112, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f68 = f77, f65, f68 nop __LINE__ } { .mfi nop __LINE__ FMA_B f100 = f77, f97, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f69 = f76, f65, f69 nop __LINE__ } { .mfi nop __LINE__ FNMA f101 = f76, f97, f101 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f84 = f77, f81, f84 nop __LINE__ } { .mfi nop __LINE__ FMA_B f116 = f77, f113, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f85 = f76, f81, f85 nop __LINE__ } { .mfi nop __LINE__ FNMA f117 = f76, f113, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f70 = f78, f64, f70 nop __LINE__ } { .mfi nop __LINE__ FNMA f102 = f78, f96, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f71 = f79, f64, f71 nop __LINE__ } { .mfi nop __LINE__ FMA_A f103 = f79, f96, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f86 = f78, f80, f86 nop __LINE__ } { .mfi nop __LINE__ FNMA f118 = f78, f112, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f87 = f79, f80, f87 nop __LINE__ } { .mfi nop __LINE__ FMA_A f119 = f79, f112, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f70 = f79, f65, f70 nop __LINE__ } { .mfi nop __LINE__ FMA_B f102 = f79, f97, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f71 = f78, f65, f71 nop __LINE__ } { .mfi nop __LINE__ FNMA f103 = f78, f97, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f86 = f79, f81, f86 nop __LINE__ } { .mfi nop __LINE__ FMA_B f118 = f79, f113, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f87 = f78, f81, f87 nop __LINE__ } { .mfi nop __LINE__ FNMA f119 = f78, f113, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f90, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f90, f98 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f91, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f91, f98 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f90, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f90, f114 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f91, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f91, f114 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f66 = f91, f67, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f98 = f91, f99, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f67 = f90, f67, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f99 = f90, f99, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f82 = f91, f83, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f114 = f91, f115, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f83 = f90, f83, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f115 = f90, f115, f39 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f66, SIZE FNMA f68 = f92, f66, f68 nop __LINE__ } { .mfi STFD [BOFFSET2] = f98, SIZE FNMA f100 = f92, f98, f100 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f67, SIZE FMA_A f69 = f93, f66, f69 nop __LINE__ } { .mfi STFD [BOFFSET2] = f99, SIZE FMA_A f101 = f93, f98, f101 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f82, SIZE FNMA f84 = f92, f82, f84 nop __LINE__ } { .mfi STFD [BOFFSET2] = f114, SIZE FNMA f116 = f92, f114, 
f116 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f83, 5 * SIZE FMA_A f85 = f93, f82, f85 nop __LINE__ } { .mfi STFD [BOFFSET2] = f115, 5 * SIZE FMA_A f117 = f93, f114, f117 nop __LINE__ } ;; { .mfi STFD [C1 ] = f66, SIZE FMA_B f68 = f93, f67, f68 nop __LINE__ } { .mfi STFD [C3 ] = f98, SIZE FMA_B f100 = f93, f99, f100 nop __LINE__ } ;; { .mfi STFD [C1 ] = f67, SIZE FNMA f69 = f92, f67, f69 nop __LINE__ } { .mfi STFD [C3 ] = f99, SIZE FNMA f101 = f92, f99, f101 nop __LINE__ } ;; { .mfi STFD [C2 ] = f82, SIZE FMA_B f84 = f93, f83, f84 nop __LINE__ } { .mfi STFD [C4 ] = f114, SIZE FMA_B f116 = f93, f115, f116 nop __LINE__ } ;; { .mfi STFD [C2 ] = f83, SIZE FNMA f85 = f92, f83, f85 nop __LINE__ } { .mfi STFD [C4 ] = f115, SIZE FNMA f117 = f92, f115, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f70 = f94, f66, f70 nop __LINE__ } { .mfi nop __LINE__ FNMA f102 = f94, f98, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f71 = f95, f66, f71 nop __LINE__ } { .mfi nop __LINE__ FMA_A f103 = f95, f98, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f86 = f94, f82, f86 nop __LINE__ } { .mfi nop __LINE__ FNMA f118 = f94, f114, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f87 = f95, f82, f87 nop __LINE__ } { .mfi nop __LINE__ FMA_A f119 = f95, f114, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f70 = f95, f67, f70 nop __LINE__ } { .mfi nop __LINE__ FMA_B f102 = f95, f99, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f71 = f94, f67, f71 nop __LINE__ } { .mfi nop __LINE__ FNMA f103 = f94, f99, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f86 = f95, f83, f86 nop __LINE__ } { .mfi nop __LINE__ FMA_B f118 = f95, f115, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f87 = f94, f83, f87 nop __LINE__ } { .mfi nop __LINE__ FNMA f119 = f94, f115, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f108, f68 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f108, f100 nop __LINE__ } { .mfi nop __LINE__ FMPY f33 = f109, f68 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f109, f100 nop __LINE__ } { .mfi nop __LINE__ FMPY f34 = f108, f84 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f108, f116 nop __LINE__ } { .mfi nop __LINE__ FMPY f35 = f109, f84 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f109, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f68 = f109, f69, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f100 = f109, f101, f36 nop __LINE__ } { .mfi nop __LINE__ FMA_D f69 = f108, f69, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f101 = f108, f101, f37 nop __LINE__ } { .mfi nop __LINE__ FMA_C f84 = f109, f85, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f116 = f109, f117, f38 nop __LINE__ } { .mfi nop __LINE__ FMA_D f85 = f108, f85, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f117 = f108, f117, f39 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f68, SIZE FNMA f70 = f110, f68, f70 nop __LINE__ } { .mfi STFD [BOFFSET2] = f100, SIZE FNMA f102 = f110, f100, f102 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f69, SIZE FMA_A f71 = f111, f68, f71 nop __LINE__ } { .mfi STFD [BOFFSET2] = f101, SIZE FMA_A f103 = f111, f100, f103 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f84, SIZE FNMA f86 = f110, f84, f86 nop __LINE__ } { .mfi STFD [BOFFSET2] = f116, SIZE FNMA f118 = f110, f116, f118 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f85, 5 * SIZE FMA_A f87 = f111, f84, f87 nop __LINE__ } { .mfi STFD [BOFFSET2] = f117, 5 * SIZE FMA_A f119 = f111, f116, f119 nop __LINE__ } ;; { .mfi STFD [C1 ] = f68, SIZE FMA_B f70 = f111, f69, f70 nop __LINE__ } { .mfi STFD [C3 ] = f100, SIZE FMA_B f102 = f111, f101, f102 nop __LINE__ } ;; { .mfi 
STFD [C1 ] = f69, SIZE FNMA f71 = f110, f69, f71 nop __LINE__ } { .mfi STFD [C3 ] = f101, SIZE FNMA f103 = f110, f101, f103 nop __LINE__ } ;; { .mfi STFD [C2 ] = f84, SIZE FMA_B f86 = f111, f85, f86 nop __LINE__ } { .mfi STFD [C4 ] = f116, SIZE FMA_B f118 = f111, f117, f118 nop __LINE__ } ;; { .mfi STFD [C2 ] = f85, SIZE FNMA f87 = f110, f85, f87 nop __LINE__ } { .mfi STFD [C4 ] = f117, SIZE FNMA f119 = f110, f117, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f126, f70 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f126, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f127, f70 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f127, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f126, f86 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f126, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f127, f86 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f127, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f70 = f127, f71, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f102 = f127, f103, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f71 = f126, f71, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f103 = f126, f103, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f86 = f127, f87, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f118 = f127, f119, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f87 = f126, f87, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f119 = f126, f119, f39 nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f70, SIZE STFD [BOFFSET2] = f102, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f71, SIZE STFD [BOFFSET2] = f103, SIZE sub r2 = K, KK } ;; { .mmi STFD [BOFFSET] = f86, SIZE STFD [BOFFSET2] = f118, SIZE adds KK = 4, KK } ;; { .mmi STFD [BOFFSET] = f87, -27 * SIZE STFD [BOFFSET2] = f119 shladd r2 = r2, ZBASE_SHIFT, r0 } ;; { .mfi STFD [C1 ] = f70, SIZE mov f64 = f0 shladd AOFFSET = r2, 2, AOFFSET } { .mfi STFD [C3 ] = f102, SIZE mov f65 = f0 shladd BOFFSET = r2, 2, BOFFSET } ;; { .mfi STFD [C1 ] = f71, SIZE mov f80 = f0 mov L = KK } { .mfi STFD [C3 ] = f103, SIZE mov f81 = f0 nop __LINE__ } ;; { .mfi STFD [C2 ] = f86, SIZE mov f96 = f0 cmp.ne p6, p0 = 1, I } { .mfi STFD [C4 ] = f118, SIZE mov f97 = f0 nop __LINE__ } ;; { .mfi STFD [C2 ] = f87, SIZE mov f112 = f0 adds I = -1, I } { .mfb STFD [C4 ] = f119, SIZE mov f113 = f0 (p6) br.cond.dptk .L011 } ;; #endif #ifdef RN { .mfi LDFPD f76, f77 = [BOFFSET], 2 * SIZE FMPY f32 = f72, f64 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f72, f68 nop __LINE__ } ;; { .mfi LDFPD f78, f79 = [BOFFSET] FMPY f33 = f73, f64 adds BOFFSET = 4 * SIZE, BOFFSET } { .mfi nop __LINE__ FMPY f37 = f73, f68 nop __LINE__ } ;; { .mfi LDFPD f90, f91 = [BOFFSET], 2 * SIZE FMPY f34 = f72, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f72, f70 nop __LINE__ } ;; { .mfi LDFPD f92, f93 = [BOFFSET], 2 * SIZE FMPY f35 = f73, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f73, f70 nop __LINE__ } ;; { .mfi LDFPD f94, f95 = [BOFFSET] FMA_C f64 = f73, f65, f32 adds BOFFSET = 6 * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_C f68 = f73, f69, f36 nop __LINE__ } ;; { .mfi LDFPD f108, f109 = [BOFFSET], 2 * SIZE FMA_D f65 = f72, f65, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f69 = f72, f69, f37 nop __LINE__ } ;; { .mfi LDFPD f110, f111 = [BOFFSET] FMA_C f66 = f73, f67, f34 adds BOFFSET = 8 * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_C f70 = f73, f71, f38 nop __LINE__ } ;; { .mfi LDFPD f126, f127 = [BOFFSET] FMA_D f67 = f72, f67, f35 adds BOFFSET = - 30 * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_D f71 = f72, f71, f39 adds AOFFSET2 = 4 * SIZE, AOFFSET } ;; { .mfi STFD 
[AOFFSET] = f64, SIZE FNMA f80 = f74, f64, f80 nop __LINE__ } { .mfi STFD [AOFFSET2] = f68, SIZE FNMA f84 = f74, f68, f84 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f65, SIZE FMA_A f81 = f75, f64, f81 nop __LINE__ } { .mfi STFD [AOFFSET2] = f69, SIZE FMA_A f85 = f75, f68, f85 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f66, SIZE FNMA f82 = f74, f66, f82 nop __LINE__ } { .mfi STFD [AOFFSET2] = f70, SIZE FNMA f86 = f74, f70, f86 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f67, 5 * SIZE FMA_A f83 = f75, f66, f83 nop __LINE__ } { .mfi STFD [AOFFSET2] = f71, 5 * SIZE FMA_A f87 = f75, f70, f87 nop __LINE__ } ;; { .mfi STFD [C1 ] = f64, SIZE FMA_B f80 = f75, f65, f80 nop __LINE__ } { .mfi STFD [C5 ] = f68, SIZE FMA_B f84 = f75, f69, f84 nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, SIZE FNMA f81 = f74, f65, f81 nop __LINE__ } { .mfi STFD [C5 ] = f69, SIZE FNMA f85 = f74, f69, f85 nop __LINE__ } ;; { .mfi STFD [C1 ] = f66, SIZE FMA_B f82 = f75, f67, f82 nop __LINE__ } { .mfi STFD [C5 ] = f70, SIZE FMA_B f86 = f75, f71, f86 nop __LINE__ } ;; { .mfi STFD [C1 ] = f67, 5 * SIZE FNMA f83 = f74, f67, f83 nop __LINE__ } { .mfi STFD [C5 ] = f71, 5 * SIZE FNMA f87 = f74, f71, f87 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f96 = f76, f64, f96 nop __LINE__ } { .mfi nop __LINE__ FNMA f100 = f76, f68, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f97 = f77, f64, f97 nop __LINE__ } { .mfi nop __LINE__ FMA_A f101 = f77, f68, f101 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f98 = f76, f66, f98 nop __LINE__ } { .mfi nop __LINE__ FNMA f102 = f76, f70, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f99 = f77, f66, f99 nop __LINE__ } { .mfi nop __LINE__ FMA_A f103 = f77, f70, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f96 = f77, f65, f96 nop __LINE__ } { .mfi nop __LINE__ FMA_B f100 = f77, f69, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f97 = f76, f65, f97 nop __LINE__ } { .mfi nop __LINE__ FNMA f101 = f76, f69, f101 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f98 = f77, f67, f98 nop __LINE__ } { .mfi nop __LINE__ FMA_B f102 = f77, f71, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f99 = f76, f67, f99 nop __LINE__ } { .mfi nop __LINE__ FNMA f103 = f76, f71, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f112 = f78, f64, f112 nop __LINE__ } { .mfi nop __LINE__ FNMA f116 = f78, f68, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f113 = f79, f64, f113 nop __LINE__ } { .mfi nop __LINE__ FMA_A f117 = f79, f68, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f114 = f78, f66, f114 nop __LINE__ } { .mfi nop __LINE__ FNMA f118 = f78, f70, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f115 = f79, f66, f115 nop __LINE__ } { .mfi nop __LINE__ FMA_A f119 = f79, f70, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f112 = f79, f65, f112 nop __LINE__ } { .mfi nop __LINE__ FMA_B f116 = f79, f69, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f113 = f78, f65, f113 nop __LINE__ } { .mfi nop __LINE__ FNMA f117 = f78, f69, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f114 = f79, f67, f114 nop __LINE__ } { .mfi nop __LINE__ FMA_B f118 = f79, f71, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f115 = f78, f67, f115 nop __LINE__ } { .mfi nop __LINE__ FNMA f119 = f78, f71, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f90, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f90, f84 nop __LINE__ } { .mfi nop __LINE__ FMPY f33 = f91, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f91, f84 nop __LINE__ } { .mfi nop __LINE__ FMPY f34 = f90, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f90, f86 nop __LINE__ } 
{ .mfi nop __LINE__ FMPY f35 = f91, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f91, f86 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f80 = f91, f81, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f84 = f91, f85, f36 nop __LINE__ } { .mfi nop __LINE__ FMA_D f81 = f90, f81, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f85 = f90, f85, f37 nop __LINE__ } { .mfi nop __LINE__ FMA_C f82 = f91, f83, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f86 = f91, f87, f38 nop __LINE__ } { .mfi nop __LINE__ FMA_D f83 = f90, f83, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f87 = f90, f87, f39 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f80, SIZE FNMA f96 = f92, f80, f96 nop __LINE__ } { .mfi STFD [AOFFSET2] = f84, SIZE FNMA f100 = f92, f84, f100 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f81, SIZE FMA_A f97 = f93, f80, f97 nop __LINE__ } { .mfi STFD [AOFFSET2] = f85, SIZE FMA_A f101 = f93, f84, f101 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f82, SIZE FNMA f98 = f92, f82, f98 nop __LINE__ } { .mfi STFD [AOFFSET2] = f86, SIZE FNMA f102 = f92, f86, f102 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f83, 5 * SIZE FMA_A f99 = f93, f82, f99 nop __LINE__ } { .mfi STFD [AOFFSET2] = f87, 5 * SIZE FMA_A f103 = f93, f86, f103 nop __LINE__ } ;; { .mfi STFD [C2 ] = f80, SIZE FMA_B f96 = f93, f81, f96 nop __LINE__ } { .mfi STFD [C6 ] = f84, SIZE FMA_B f100 = f93, f85, f100 nop __LINE__ } ;; { .mfi STFD [C2 ] = f81, SIZE FNMA f97 = f92, f81, f97 nop __LINE__ } { .mfi STFD [C6 ] = f85, SIZE FNMA f101 = f92, f85, f101 nop __LINE__ } ;; { .mfi STFD [C2 ] = f82, SIZE FMA_B f98 = f93, f83, f98 nop __LINE__ } { .mfi STFD [C6 ] = f86, SIZE FMA_B f102 = f93, f87, f102 nop __LINE__ } ;; { .mfi STFD [C2 ] = f83, 5 * SIZE FNMA f99 = f92, f83, f99 nop __LINE__ } { .mfi STFD [C6 ] = f87, 5 * SIZE FNMA f103 = f92, f87, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f112 = f94, f80, f112 nop __LINE__ } { .mfi nop __LINE__ FNMA f116 = f94, f84, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f113 = f95, f80, f113 nop __LINE__ } { .mfi nop __LINE__ FMA_A f117 = f95, f84, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f114 = f94, f82, f114 nop __LINE__ } { .mfi nop __LINE__ FNMA f118 = f94, f86, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f115 = f95, f82, f115 nop __LINE__ } { .mfi nop __LINE__ FMA_A f119 = f95, f86, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f112 = f95, f81, f112 nop __LINE__ } { .mfi nop __LINE__ FMA_B f116 = f95, f85, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f113 = f94, f81, f113 nop __LINE__ } { .mfi nop __LINE__ FNMA f117 = f94, f85, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f114 = f95, f83, f114 nop __LINE__ } { .mfi nop __LINE__ FMA_B f118 = f95, f87, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f115 = f94, f83, f115 nop __LINE__ } { .mfi nop __LINE__ FNMA f119 = f94, f87, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f108, f96 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f108, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f109, f96 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f109, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f108, f98 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f108, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f109, f98 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f109, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f96 = f109, f97, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f100 = f109, f101, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f97 = f108, f97, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f101 = f108, f101, f37 
nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f98 = f109, f99, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f102 = f109, f103, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f99 = f108, f99, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f103 = f108, f103, f39 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f96, SIZE FNMA f112 = f110, f96, f112 nop __LINE__ } { .mfi STFD [AOFFSET2] = f100, SIZE FNMA f116 = f110, f100, f116 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f97, SIZE FMA_A f113 = f111, f96, f113 nop __LINE__ } { .mfi STFD [AOFFSET2] = f101, SIZE FMA_A f117 = f111, f100, f117 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f98, SIZE FNMA f114 = f110, f98, f114 nop __LINE__ } { .mfi STFD [AOFFSET2] = f102, SIZE FNMA f118 = f110, f102, f118 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f99, 5 * SIZE FMA_A f115 = f111, f98, f115 nop __LINE__ } { .mfi STFD [AOFFSET2] = f103, 5 * SIZE FMA_A f119 = f111, f102, f119 nop __LINE__ } ;; { .mfi STFD [C3 ] = f96, SIZE FMA_B f112 = f111, f97, f112 nop __LINE__ } { .mfi STFD [C7 ] = f100, SIZE FMA_B f116 = f111, f101, f116 nop __LINE__ } ;; { .mfi STFD [C3 ] = f97, SIZE FNMA f113 = f110, f97, f113 nop __LINE__ } { .mfi STFD [C7 ] = f101, SIZE FNMA f117 = f110, f101, f117 nop __LINE__ } ;; { .mfi STFD [C3 ] = f98, SIZE FMA_B f114 = f111, f99, f114 nop __LINE__ } { .mfi STFD [C7 ] = f102, SIZE FMA_B f118 = f111, f103, f118 nop __LINE__ } ;; { .mfi STFD [C3 ] = f99, 5 * SIZE FNMA f115 = f110, f99, f115 nop __LINE__ } { .mfi STFD [C7 ] = f103, 5 * SIZE FNMA f119 = f110, f103, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f126, f112 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f126, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f127, f112 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f127, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f126, f114 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f126, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f127, f114 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f127, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f112 = f127, f113, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f116 = f127, f117, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f113 = f126, f113, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f117 = f126, f117, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f114 = f127, f115, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f118 = f127, f119, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f115 = f126, f115, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f119 = f126, f119, f39 nop __LINE__ } ;; { .mmi STFD [AOFFSET] = f112, SIZE STFD [AOFFSET2] = f116, SIZE sub r2 = K, KK } ;; { .mmi STFD [AOFFSET] = f113, SIZE STFD [AOFFSET2] = f117, SIZE mov L = KK } ;; { .mmi STFD [AOFFSET] = f114, SIZE STFD [AOFFSET2] = f118, SIZE shladd r2 = r2, ZBASE_SHIFT, r0 } ;; { .mmi STFD [AOFFSET] = f115, -27 * SIZE STFD [AOFFSET2] = f119 nop __LINE__ } ;; { .mfi STFD [C4 ] = f112, SIZE mov f64 = f0 shladd BOFFSET = r2, 2, BOFFSET } { .mfi STFD [C8 ] = f116, SIZE mov f65 = f0 shladd AOFFSET = r2, 2, AOFFSET } ;; { .mfi STFD [C4 ] = f113, SIZE mov f80 = f0 cmp.ne p6, p0 = 1, I } { .mfi STFD [C8 ] = f117, SIZE mov f81 = f0 nop __LINE__ } ;; { .mfi STFD [C4 ] = f114, SIZE mov f96 = f0 adds I = -1, I } { .mfi STFD [C8 ] = f118, SIZE mov f97 = f0 nop __LINE__ } ;; { .mfi STFD [C4 ] = f115, 5 * SIZE mov f112 = f0 nop __LINE__ } { .mfb STFD [C8 ] = f119, 5 * SIZE mov f113 = f0 (p6) br.cond.dptk .L011 } #endif #ifdef RT { .mfi LDFPD f76, f77 = [BOFFSET] FMPY f32 = f72, f112 adds BOFFSET = - 2 * SIZE, BOFFSET } { .mfi nop 
__LINE__ FMPY f36 = f72, f116 nop __LINE__ } ;; { .mfi LDFPD f78, f79 = [BOFFSET] FMPY f33 = f73, f112 adds BOFFSET = - 4 * SIZE, BOFFSET } { .mfi nop __LINE__ FMPY f37 = f73, f116 nop __LINE__ } ;; { .mfi LDFPD f88, f89 = [BOFFSET] FMPY f34 = f72, f114 adds BOFFSET = - 2 * SIZE, BOFFSET } { .mfi nop __LINE__ FMPY f38 = f72, f118 nop __LINE__ } ;; { .mfi LDFPD f90, f91 = [BOFFSET] FMPY f35 = f73, f114 adds BOFFSET = - 2 * SIZE, BOFFSET } { .mfi nop __LINE__ FMPY f39 = f73, f118 nop __LINE__ } ;; { .mfi LDFPD f92, f93 = [BOFFSET] FMA_C f112 = f73, f113, f32 adds BOFFSET = - 6 * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_C f116 = f73, f117, f36 nop __LINE__ } ;; { .mfi LDFPD f104, f105 = [BOFFSET] FMA_D f113 = f72, f113, f33 adds BOFFSET = - 2 * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_D f117 = f72, f117, f37 nop __LINE__ } ;; { .mfi LDFPD f106, f107 = [BOFFSET] FMA_C f114 = f73, f115, f34 adds BOFFSET = - 8 * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_C f118 = f73, f119, f38 nop __LINE__ } ;; { .mfi LDFPD f120, f121 = [BOFFSET] FMA_D f115 = f72, f115, f35 adds AOFFSET2 = 28 * SIZE, AOFFSET } { .mfi nop __LINE__ FMA_D f119 = f72, f119, f39 adds AOFFSET = 24 * SIZE, AOFFSET } ;; { .mfi STFD [AOFFSET] = f112, SIZE FNMA f96 = f74, f112, f96 nop __LINE__ } { .mfi STFD [AOFFSET2] = f116, SIZE FNMA f100 = f74, f116, f100 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f113, SIZE FMA_A f97 = f75, f112, f97 nop __LINE__ } { .mfi STFD [AOFFSET2] = f117, SIZE FMA_A f101 = f75, f116, f101 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f114, SIZE FNMA f98 = f74, f114, f98 nop __LINE__ } { .mfi STFD [AOFFSET2] = f118, SIZE FNMA f102 = f74, f118, f102 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f115, -11 * SIZE FMA_A f99 = f75, f114, f99 nop __LINE__ } { .mfi STFD [AOFFSET2] = f119, -11 * SIZE FMA_A f103 = f75, f118, f103 nop __LINE__ } ;; { .mfi STFD [C4 ] = f112, SIZE FMA_B f96 = f75, f113, f96 nop __LINE__ } { .mfi STFD [C8 ] = f116, SIZE FMA_B f100 = f75, f117, f100 nop __LINE__ } ;; { .mfi STFD [C4 ] = f113, SIZE FNMA f97 = f74, f113, f97 nop __LINE__ } { .mfi STFD [C8 ] = f117, SIZE FNMA f101 = f74, f117, f101 nop __LINE__ } ;; { .mfi STFD [C4 ] = f114, SIZE FMA_B f98 = f75, f115, f98 nop __LINE__ } { .mfi STFD [C8 ] = f118, SIZE FMA_B f102 = f75, f119, f102 nop __LINE__ } ;; { .mfi STFD [C4 ] = f115, 5 * SIZE FNMA f99 = f74, f115, f99 nop __LINE__ } { .mfi STFD [C8 ] = f119, 5 * SIZE FNMA f103 = f74, f119, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f80 = f76, f112, f80 nop __LINE__ } { .mfi nop __LINE__ FNMA f84 = f76, f116, f84 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f81 = f77, f112, f81 nop __LINE__ } { .mfi nop __LINE__ FMA_A f85 = f77, f116, f85 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f82 = f76, f114, f82 nop __LINE__ } { .mfi nop __LINE__ FNMA f86 = f76, f118, f86 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f83 = f77, f114, f83 nop __LINE__ } { .mfi nop __LINE__ FMA_A f87 = f77, f118, f87 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f80 = f77, f113, f80 nop __LINE__ } { .mfi nop __LINE__ FMA_B f84 = f77, f117, f84 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f81 = f76, f113, f81 nop __LINE__ } { .mfi nop __LINE__ FNMA f85 = f76, f117, f85 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f82 = f77, f115, f82 nop __LINE__ } { .mfi nop __LINE__ FMA_B f86 = f77, f119, f86 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f83 = f76, f115, f83 nop __LINE__ } { .mfi nop __LINE__ FNMA f87 = f76, f119, f87 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f64 = f78, f112, f64 nop __LINE__ } { .mfi nop __LINE__ FNMA f68 = f78, f116, 
f68 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f65 = f79, f112, f65 nop __LINE__ } { .mfi nop __LINE__ FMA_A f69 = f79, f116, f69 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f66 = f78, f114, f66 nop __LINE__ } { .mfi nop __LINE__ FNMA f70 = f78, f118, f70 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f67 = f79, f114, f67 nop __LINE__ } { .mfi nop __LINE__ FMA_A f71 = f79, f118, f71 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f64 = f79, f113, f64 nop __LINE__ } { .mfi nop __LINE__ FMA_B f68 = f79, f117, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f65 = f78, f113, f65 nop __LINE__ } { .mfi nop __LINE__ FNMA f69 = f78, f117, f69 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f66 = f79, f115, f66 nop __LINE__ } { .mfi nop __LINE__ FMA_B f70 = f79, f119, f70 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f67 = f78, f115, f67 nop __LINE__ } { .mfi nop __LINE__ FNMA f71 = f78, f119, f71 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f88, f96 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f88, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f89, f96 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f89, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f88, f98 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f88, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f89, f98 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f89, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f96 = f89, f97, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f100 = f89, f101, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f97 = f88, f97, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f101 = f88, f101, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f98 = f89, f99, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f102 = f89, f103, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f99 = f88, f99, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f103 = f88, f103, f39 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f96, SIZE FNMA f80 = f90, f96, f80 nop __LINE__ } { .mfi STFD [AOFFSET2] = f100, SIZE FNMA f84 = f90, f100, f84 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f97, SIZE FMA_A f81 = f91, f96, f81 nop __LINE__ } { .mfi STFD [AOFFSET2] = f101, SIZE FMA_A f85 = f91, f100, f85 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f98, SIZE FNMA f82 = f90, f98, f82 nop __LINE__ } { .mfi STFD [AOFFSET2] = f102, SIZE FNMA f86 = f90, f102, f86 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f99, -11 * SIZE FMA_A f83 = f91, f98, f83 nop __LINE__ } { .mfi STFD [AOFFSET2] = f103, -11 * SIZE FMA_A f87 = f91, f102, f87 nop __LINE__ } ;; { .mfi STFD [C3 ] = f96, SIZE FMA_B f80 = f91, f97, f80 nop __LINE__ } { .mfi STFD [C7 ] = f100, SIZE FMA_B f84 = f91, f101, f84 nop __LINE__ } ;; { .mfi STFD [C3 ] = f97, SIZE FNMA f81 = f90, f97, f81 nop __LINE__ } { .mfi STFD [C7 ] = f101, SIZE FNMA f85 = f90, f101, f85 nop __LINE__ } ;; { .mfi STFD [C3 ] = f98, SIZE FMA_B f82 = f91, f99, f82 nop __LINE__ } { .mfi STFD [C7 ] = f102, SIZE FMA_B f86 = f91, f103, f86 nop __LINE__ } ;; { .mfi STFD [C3 ] = f99, 5 * SIZE FNMA f83 = f90, f99, f83 nop __LINE__ } { .mfi STFD [C7 ] = f103, 5 * SIZE FNMA f87 = f90, f103, f87 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f64 = f92, f96, f64 nop __LINE__ } { .mfi nop __LINE__ FNMA f68 = f92, f100, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f65 = f93, f96, f65 nop __LINE__ } { .mfi nop __LINE__ FMA_A f69 = f93, f100, f69 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f66 = f92, f98, f66 nop __LINE__ } { .mfi nop __LINE__ FNMA f70 = f92, f102, f70 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f67 = f93, f98, f67 nop __LINE__ } { .mfi nop __LINE__ FMA_A 
f71 = f93, f102, f71 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f64 = f93, f97, f64 nop __LINE__ } { .mfi nop __LINE__ FMA_B f68 = f93, f101, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f65 = f92, f97, f65 nop __LINE__ } { .mfi nop __LINE__ FNMA f69 = f92, f101, f69 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f66 = f93, f99, f66 nop __LINE__ } { .mfi nop __LINE__ FMA_B f70 = f93, f103, f70 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f67 = f92, f99, f67 nop __LINE__ } { .mfi nop __LINE__ FNMA f71 = f92, f103, f71 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f104, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f104, f84 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f105, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f105, f84 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f104, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f104, f86 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f105, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f105, f86 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f80 = f105, f81, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f84 = f105, f85, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f81 = f104, f81, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f85 = f104, f85, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f82 = f105, f83, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f86 = f105, f87, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f83 = f104, f83, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f87 = f104, f87, f39 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f80, SIZE FNMA f64 = f106, f80, f64 nop __LINE__ } { .mfi STFD [AOFFSET2] = f84, SIZE FNMA f68 = f106, f84, f68 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f81, SIZE FMA_A f65 = f107, f80, f65 nop __LINE__ } { .mfi STFD [AOFFSET2] = f85, SIZE FMA_A f69 = f107, f84, f69 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f82, SIZE FNMA f66 = f106, f82, f66 nop __LINE__ } { .mfi STFD [AOFFSET2] = f86, SIZE FNMA f70 = f106, f86, f70 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f83, -11 * SIZE FMA_A f67 = f107, f82, f67 nop __LINE__ } { .mfi STFD [AOFFSET2] = f87, -11 * SIZE FMA_A f71 = f107, f86, f71 nop __LINE__ } ;; { .mfi STFD [C2 ] = f80, SIZE FMA_B f64 = f107, f81, f64 nop __LINE__ } { .mfi STFD [C6 ] = f84, SIZE FMA_B f68 = f107, f85, f68 nop __LINE__ } ;; { .mfi STFD [C2 ] = f81, SIZE FNMA f65 = f106, f81, f65 nop __LINE__ } { .mfi STFD [C6 ] = f85, SIZE FNMA f69 = f106, f85, f69 nop __LINE__ } ;; { .mfi STFD [C2 ] = f82, SIZE FMA_B f66 = f107, f83, f66 nop __LINE__ } { .mfi STFD [C6 ] = f86, SIZE FMA_B f70 = f107, f87, f70 nop __LINE__ } ;; { .mfi STFD [C2 ] = f83, 5 * SIZE FNMA f67 = f106, f83, f67 nop __LINE__ } { .mfi STFD [C6 ] = f87, 5 * SIZE FNMA f71 = f106, f87, f71 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f120, f64 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f120, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f121, f64 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f121, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f120, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f120, f70 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f121, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f121, f70 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f64 = f121, f65, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f68 = f121, f69, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f65 = f120, f65, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f69 = f120, f69, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f66 = f121, f67, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f70 = f121, f71, f38 
nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f67 = f120, f67, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f71 = f120, f71, f39 nop __LINE__ } ;; { .mmi STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f68, SIZE shladd r2 = K, ZBASE_SHIFT, r0 } ;; { .mmi STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f69, SIZE shladd AORIG = r2, 2, AORIG } ;; { .mmi STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f70, SIZE nop __LINE__ } ;; { .mmi STFD [AOFFSET] = f67, -3 * SIZE STFD [AOFFSET2] = f71 nop __LINE__ } ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 cmp.ne p6, p0 = 1, I } { .mfi STFD [C5 ] = f68, SIZE mov f81 = f0 nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, SIZE mov f65 = f0 nop __LINE__ } { .mfi STFD [C5 ] = f69, SIZE mov f96 = f0 nop __LINE__ } ;; { .mfi STFD [C1 ] = f66, SIZE mov f80 = f0 sub L = K, KK } { .mfi STFD [C5 ] = f70, SIZE mov f97 = f0 nop __LINE__ } ;; { .mfi STFD [C1 ] = f67, 5 * SIZE mov f112 = f0 adds I = -1, I } { .mfb STFD [C5 ] = f71, 5 * SIZE mov f113 = f0 (p6) br.cond.dptk .L011 } ;; #endif .L049: #ifdef LN shladd KK8 = K, ZBASE_SHIFT, r0 ;; shladd B = KK8, 2, B #endif #if defined(LT) || defined(RN) mov B = BOFFSET #endif #ifdef RN adds KK = 4, KK #endif #ifdef RT adds KK = -4, KK #endif ;; { .mmb mov AOFFSET = A cmp.lt p6, p0 = 0, J (p6) br.cond.dptk .L010 } ;; .align 16 .L050: { .mib tbit.z p6, p0 = N, 1 (p6) br.cond.dpnt .L090 } ;; #ifdef RT { .mmi shladd r3 = LDC, 1, r0 nop __LINE__ shl r2 = K, 1 + ZBASE_SHIFT } ;; { .mmi sub B = B, r2 sub C = C, r3 nop __LINE__ } ;; #endif mov C1 = C add C2 = LDC, C ;; #ifdef LN add KK = M, OFFSET #elif defined LT mov KK = OFFSET #else nop __LINE__ #endif ;; #if defined(LN) || defined(RT) mov AORIG = A #else mov AOFFSET = A #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; { .mib #ifndef RT shladd C = LDC, 1, C #else nop __LINE__ #endif } ;; .L070: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L060 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE } ;; #else { .mfi shladd BOFFSET = r3, 1, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE add AOFFSET = r3, AORIG } ;; #endif ;; adds L = 1, L ;; { .mii (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET adds L = -1, L } ;; { .mmi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET cmp.eq p3, p0 = r0, r0 mov ar.lc = L } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L078 ;; .align 16 .L072: { .mfb lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f96 = f32, f49, f96 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f112 = f32, f51, f112 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } { .mfi nop __LINE__ FMA f97 = f33, f49, f97 // A2 * B2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 } { .mmf nop __LINE__ nop __LINE__ FMA f113 = f33, f51, f113 // A2 * B4 } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mmf nop 
__LINE__ nop __LINE__ (p3) FMA f96 = f40, f57, f96 // A1 * B2 } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ (p3) FMA f112 = f40, f59, f112 // A1 * B4 } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f97 = f41, f57, f97 // A2 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f113 = f41, f59, f113 // A2 * B4 br.cloop.sptk.few .L072 } ;; { .mfb nop __LINE__ FCALC_A f64 = f64, f97 nop __LINE__ } { .mfb nop __LINE__ FCALC_A f80 = f80, f113 nop __LINE__ } { .mfb nop __LINE__ FCALC_B f65 = f65, f96 nop __LINE__ } { .mfb nop __LINE__ FCALC_B f81 = f81, f112 nop __LINE__ } ;; .L078: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -1, KK #else adds r2 = -2, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 1, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET] adds BOFFSET = -2 * SIZE, BOFFSET ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 FSUB f80 = f74, f80 FSUB_A f81 = f75, f81 ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET] adds AOFFSET = -2 * SIZE, AOFFSET ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 FSUB f80 = f88, f80 FSUB f81 = f89, f81 ;; #endif #ifdef LN LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f80 FMPY f35 = f121, f80 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f80 = f121, f81, f34 FMA_D f81 = f120, f81, f35 ;; #endif #ifdef LT LDFPD f72, f73 = [AOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f80 FMPY f35 = f73, f80 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f80 = f73, f81, f34 FMA_D f81 = f72, f81, f35 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; FNMA f80 = f74, f64, f80 FMA_A f81 = f75, f64, f81 ;; FMA_B f80 = f75, f65, f80 FNMA f81 = f74, f65, f81 ;; FMPY f32 = f90, f80 FMPY f33 = f91, f80 ;; FMA_C f80 = f91, f81, f32 FMA_D f81 = f90, f81, f33 ;; #endif #ifdef RT adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f104, f105 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f106, f107 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f120, f121 = [BOFFSET] ;; FMPY f32 = f104, f80 FMPY f33 = f105, f80 ;; FMA_C f80 = f105, f81, f32 FMA_D f81 = f104, f81, f33 ;; FNMA f64 = f106, f80, f64 FMA_A f65 = f107, f80, f65 ;; FMA_B f64 = f107, f81, f64 FNMA f65 = f106, f81, f65 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 ;; #endif #if defined(LN) || defined(LT) STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f65, SIZE ;; STFD [BOFFSET] = f80, SIZE ;; STFD [BOFFSET] = f81, SIZE ;; adds BOFFSET = - 4 * SIZE, BOFFSET ;; #else STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, SIZE ;; STFD [AOFFSET] = f80, SIZE ;; STFD [AOFFSET] = f81, SIZE ;; adds AOFFSET = - 4 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -2 * SIZE, C1 adds C2 = -2 * SIZE, C2 #endif ;; STFD [C1 ] = f64, SIZE ;; STFD [C1 ] = f65, SIZE ;; STFD [C2 ] = f80, SIZE ;; STFD [C2 ] = f81, SIZE ;; mov f64 = f0 mov f65 = f0 mov 
f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;; #ifdef LN adds C1 = -2 * SIZE, C1 adds C2 = -2 * SIZE, C2 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT add AORIG = r2, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; add AOFFSET = L, AOFFSET shladd BOFFSET = L, 1, BOFFSET #endif ;; #ifdef LT adds KK = 1, KK #elif defined LN adds KK = -1, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; .align 16 .L060: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L051 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 1 + ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE } ;; #else { .mfi shladd BOFFSET = r3, 1, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE shladd AOFFSET = r3, 1, AORIG } ;; #endif ;; adds L = 1, L ;; { .mmi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET tbit.z p12, p0 = L, 0 } { .mmi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE shr L = L, 1 } ;; { .mmi (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE nop __LINE__ adds L = -1, L } ;; { .mmi nop __LINE__ nop __LINE__ mov ar.lc = L } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L068 ;; .align 16 .L062: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfb lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f34, f48, f96 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f97 = f34, f49, f97 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f34, f50, f112 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f34, f51, f113 // A3 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f97 = f35, f48, f97 // A4 * B1 } { .mfb FMA_A f96 = f35, f49, f96 // A4 * B2 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f113 = f35, f50, f113 // A4 * B3 nop __LINE__ } { .mfb FMA_A f112 = f35, f51, f112 // A4 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f96 = f42, f56, f96 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f112 = f42, f58, f112 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 nop 
__LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f43, f56, f97 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f113 = f43, f58, f113 // A4 * B3 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 br.cloop.sptk.few .L062 } ;; .L068: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -2, KK #else adds r2 = -2, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 1, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 FSUB f80 = f74, f80 FSUB_A f81 = f75, f81 FSUB f96 = f88, f96 FSUB_A f97 = f89, f97 FSUB f112 = f90, f112 FSUB_A f113 = f91, f113 ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 FSUB f96 = f74, f96 FSUB f97 = f75, f97 FSUB f80 = f88, f80 FSUB f81 = f89, f81 FSUB f112 = f90, f112 FSUB f113 = f91, f113 ;; #endif #ifdef LN adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f104, f105 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f106, f107 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f104, f96 FMPY f33 = f105, f96 FMPY f34 = f104, f112 FMPY f35 = f105, f112 ;; FMA_C f96 = f105, f97, f32 FMA_D f97 = f104, f97, f33 FMA_C f112 = f105, f113, f34 FMA_D f113 = f104, f113, f35 ;; FNMA f64 = f106, f96, f64 FMA_A f65 = f107, f96, f65 FNMA f80 = f106, f112, f80 FMA_A f81 = f107, f112, f81 ;; FMA_B f64 = f107, f97, f64 FNMA f65 = f106, f97, f65 FMA_B f80 = f107, f113, f80 FNMA f81 = f106, f113, f81 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f80 FMPY f35 = f121, f80 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f80 = f121, f81, f34 FMA_D f81 = f120, f81, f35 ;; #endif #ifdef LT LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f80 FMPY f35 = f73, f80 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f80 = f73, f81, f34 FMA_D f81 = f72, f81, f35 ;; FNMA f96 = f74, f64, f96 FMA_A f97 = f75, f64, f97 FNMA f112 = f74, f80, f112 FMA_A f113 = f75, f80, f113 ;; FMA_B f96 = f75, f65, f96 FNMA f97 = f74, f65, f97 FMA_B f112 = f75, f81, f112 FNMA f113 = f74, f81, f113 ;; FMPY f32 = f90, f96 FMPY f33 = f91, f96 FMPY f34 = f90, f112 FMPY f35 = f91, f112 ;; FMA_C f96 = f91, f97, f32 FMA_D f97 = f90, f97, f33 FMA_C f112 = f91, f113, f34 FMA_D f113 = f90, f113, f35 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f96 FMPY f35 = f73, f96 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C 
f96 = f73, f97, f34 FMA_D f97 = f72, f97, f35 ;; FNMA f80 = f74, f64, f80 FMA_A f81 = f75, f64, f81 FNMA f112 = f74, f96, f112 FMA_A f113 = f75, f96, f113 ;; FMA_B f80 = f75, f65, f80 FNMA f81 = f74, f65, f81 FMA_B f112 = f75, f97, f112 FNMA f113 = f74, f97, f113 ;; FMPY f32 = f90, f80 FMPY f33 = f91, f80 FMPY f34 = f90, f112 FMPY f35 = f91, f112 ;; FMA_C f80 = f91, f81, f32 FMA_D f81 = f90, f81, f33 FMA_C f112 = f91, f113, f34 FMA_D f113 = f90, f113, f35 ;; #endif #ifdef RT adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f104, f105 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f106, f107 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f120, f121 = [BOFFSET] ;; FMPY f32 = f104, f80 FMPY f33 = f105, f80 FMPY f34 = f104, f112 FMPY f35 = f105, f112 ;; FMA_C f80 = f105, f81, f32 FMA_D f81 = f104, f81, f33 FMA_C f112 = f105, f113, f34 FMA_D f113 = f104, f113, f35 ;; FNMA f64 = f106, f80, f64 FMA_A f65 = f107, f80, f65 FNMA f96 = f106, f112, f96 FMA_A f97 = f107, f112, f97 ;; FMA_B f64 = f107, f81, f64 FNMA f65 = f106, f81, f65 FMA_B f96 = f107, f113, f96 FNMA f97 = f106, f113, f97 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f96 FMPY f35 = f121, f96 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f96 = f121, f97, f34 FMA_D f97 = f120, f97, f35 ;; #endif #if defined(LN) || defined(LT) adds BOFFSET2 = 4 * SIZE, BOFFSET ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f97, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE ;; STFD [BOFFSET] = f81, 5 * SIZE STFD [BOFFSET2] = f113, 5 * SIZE ;; adds BOFFSET = - 8 * SIZE, BOFFSET ;; #else adds AOFFSET2 = 4 * SIZE, AOFFSET ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f80, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f81, SIZE ;; STFD [AOFFSET] = f96, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f97, 5 * SIZE STFD [AOFFSET2] = f113, 5 * SIZE ;; adds AOFFSET = - 8 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -4 * SIZE, C1 adds C2 = -4 * SIZE, C2 #endif ;; STFD [C1 ] = f64, SIZE ;; STFD [C1 ] = f65, SIZE ;; STFD [C1 ] = f96, SIZE ;; STFD [C1 ] = f97, SIZE ;; STFD [C2 ] = f80, SIZE ;; STFD [C2 ] = f81, SIZE ;; STFD [C2 ] = f112, SIZE ;; STFD [C2 ] = f113, SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;; #ifdef LN adds C1 = -4 * SIZE, C1 adds C2 = -4 * SIZE, C2 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 1, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; shladd AOFFSET = L, 1, AOFFSET shladd BOFFSET = L, 1, BOFFSET #endif ;; #ifdef LT adds KK = 2, KK #elif defined LN adds KK = -2, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; .align 16 .L051: shr I = M, 2 ;; cmp.eq p6, p7 = 0, I (p6) br.cond.dpnt .L089 ;; .align 16 .L052: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 2 + ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f67 = f0 } ;; #else { .mfi shladd BOFFSET = r3, 1, B mov f66 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 shladd AOFFSET = r3, 2, AORIG } ;; #endif { .mfi (p7) LDFPD f32, f33 = 
[AOFFSET], 2 * SIZE mov f82 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 nop __LINE__ } ;; { .mfi (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f98 = f0 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } { .mfi cmp.eq p3, p0 = r0, r0 mov f99 = f0 adds L = 1, L } ;; { .mfi (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE mov f114 = f0 tbit.z p12, p0 = L, 0 } { .mfi CPREFETCH [PREC], LDC mov f115 = f0 shr L = L, 1 } ;; { .mmi (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE adds C5 = 4 * SIZE, C1 adds L = -1, L } ;; { .mmi CPREFETCH [PREC], LDC adds C6 = 4 * SIZE, C2 mov ar.lc = L } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L058 ;; .align 16 .L053: { .mfb lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f96 = f34, f48, f96 // A3 * B1 nop __LINE__ } { .mfi FMA_B f97 = f34, f49, f97 // A3 * B2 nop __LINE__ } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f112 = f34, f50, f112 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f34, f51, f113 // A3 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f97 = f35, f48, f97 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f96 = f35, f49, f96 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f113 = f35, f50, f113 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f112 = f35, f51, f112 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f66 = f36, f48, f66 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f67 = f36, f49, f67 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f82 = f36, f50, f82 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f83 = f36, f51, f83 // A5 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f98 = f38, f48, f98 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f99 = f38, f49, f99 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f114 = f38, f50, f114 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f115 = f38, f51, f115 // A7 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f67 = f37, f48, f67 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f66 = f37, f49, f66 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f83 = f37, f50, f83 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f82 = f37, f51, f82 // A6 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f99 = f39, f48, f99 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f98 = f39, f49, f98 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f115 = f39, f50, f115 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f114 = f39, f51, f114 // A8 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA 
f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f96 = f42, f56, f96 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f112 = f42, f58, f112 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f43, f56, f97 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f113 = f43, f58, f113 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f66 = f44, f56, f66 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f67 = f44, f57, f67 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f82 = f44, f58, f82 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f83 = f44, f59, f83 // A5 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f98 = f46, f56, f98 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f99 = f46, f57, f99 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f114 = f46, f58, f114 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f115 = f46, f59, f115 // A7 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f67 = f45, f56, f67 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f66 = f45, f57, f66 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f83 = f45, f58, f83 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f82 = f45, f59, f82 // A6 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f99 = f47, f56, f99 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f98 = f47, f57, f98 // A8 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f115 = f47, f58, f115 // A8 * B3 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f114 = f47, f59, f114 // A8 * B4 br.cloop.sptk.few .L053 } ;; .L058: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -4, KK #else adds r2 = -2, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 1, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [BOFFSET], 2 * SIZE ;; LDFPD f104, f105 = [BOFFSET], 2 * SIZE ;; LDFPD f106, f107 = [BOFFSET], 2 * SIZE ;; LDFPD f120, f121 = [BOFFSET], 2 * SIZE ;; LDFPD f122, f123 = [BOFFSET] adds BOFFSET = -14 * SIZE, BOFFSET ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 FSUB f80 = f74, f80 FSUB_A f81 = f75, f81 FSUB f96 = f88, f96 FSUB_A f97 = f89, f97 FSUB f112 = f90, f112 FSUB_A f113 = f91, f113 FSUB f66 = f104, f66 FSUB_A f67 = f105, f67 FSUB f82 = f106, f82 FSUB_A f83 = f107, f83 FSUB f98 = f120, f98 FSUB_A f99 = f121, f99 FSUB f114 = f122, f114 FSUB_A f115 = f123, f115 ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [AOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [AOFFSET], 
2 * SIZE ;; LDFPD f92, f93 = [AOFFSET], 2 * SIZE ;; LDFPD f94, f95 = [AOFFSET] adds AOFFSET = -14 * SIZE, AOFFSET ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 FSUB f96 = f74, f96 FSUB f97 = f75, f97 FSUB f66 = f76, f66 FSUB f67 = f77, f67 FSUB f98 = f78, f98 FSUB f99 = f79, f99 FSUB f80 = f88, f80 FSUB f81 = f89, f81 FSUB f112 = f90, f112 FSUB f113 = f91, f113 FSUB f82 = f92, f82 FSUB f83 = f93, f83 FSUB f114 = f94, f114 FSUB f115 = f95, f115 ;; #endif #ifdef LN adds AOFFSET = 30 * SIZE, AOFFSET ;; LDFPD f72, f73 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f74, f75 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f76, f77 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f78, f79 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f88, f89 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f92, f93 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; LDFPD f104, f105 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f106, f107 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f72, f98 FMPY f33 = f73, f98 FMPY f34 = f72, f114 FMPY f35 = f73, f114 ;; FMA_C f98 = f73, f99, f32 FMA_D f99 = f72, f99, f33 FMA_C f114 = f73, f115, f34 FMA_D f115 = f72, f115, f35 ;; FNMA f66 = f74, f98, f66 FMA_A f67 = f75, f98, f67 FNMA f82 = f74, f114, f82 FMA_A f83 = f75, f114, f83 ;; FMA_B f66 = f75, f99, f66 FNMA f67 = f74, f99, f67 FMA_B f82 = f75, f115, f82 FNMA f83 = f74, f115, f83 ;; FNMA f96 = f76, f98, f96 FMA_A f97 = f77, f98, f97 FNMA f112 = f76, f114, f112 FMA_A f113 = f77, f114, f113 ;; FMA_B f96 = f77, f99, f96 FNMA f97 = f76, f99, f97 FMA_B f112 = f77, f115, f112 FNMA f113 = f76, f115, f113 ;; FNMA f64 = f78, f98, f64 FMA_A f65 = f79, f98, f65 FNMA f80 = f78, f114, f80 FMA_A f81 = f79, f114, f81 ;; FMA_B f64 = f79, f99, f64 FNMA f65 = f78, f99, f65 FMA_B f80 = f79, f115, f80 FNMA f81 = f78, f115, f81 ;; FMPY f32 = f88, f66 FMPY f33 = f89, f66 FMPY f34 = f88, f82 FMPY f35 = f89, f82 ;; FMA_C f66 = f89, f67, f32 FMA_D f67 = f88, f67, f33 FMA_C f82 = f89, f83, f34 FMA_D f83 = f88, f83, f35 ;; FNMA f96 = f90, f66, f96 FMA_A f97 = f91, f66, f97 FNMA f112 = f90, f82, f112 FMA_A f113 = f91, f82, f113 ;; FMA_B f96 = f91, f67, f96 FNMA f97 = f90, f67, f97 FMA_B f112 = f91, f83, f112 FNMA f113 = f90, f83, f113 ;; FNMA f64 = f92, f66, f64 FMA_A f65 = f93, f66, f65 FNMA f80 = f92, f82, f80 FMA_A f81 = f93, f82, f81 ;; FMA_B f64 = f93, f67, f64 FNMA f65 = f92, f67, f65 FMA_B f80 = f93, f83, f80 FNMA f81 = f92, f83, f81 ;; FMPY f32 = f104, f96 FMPY f33 = f105, f96 FMPY f34 = f104, f112 FMPY f35 = f105, f112 ;; FMA_C f96 = f105, f97, f32 FMA_D f97 = f104, f97, f33 FMA_C f112 = f105, f113, f34 FMA_D f113 = f104, f113, f35 ;; FNMA f64 = f106, f96, f64 FMA_A f65 = f107, f96, f65 FNMA f80 = f106, f112, f80 FMA_A f81 = f107, f112, f81 ;; FMA_B f64 = f107, f97, f64 FNMA f65 = f106, f97, f65 FMA_B f80 = f107, f113, f80 FNMA f81 = f106, f113, f81 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f80 FMPY f35 = f121, f80 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f80 = f121, f81, f34 FMA_D f81 = f120, f81, f35 ;; #endif #ifdef LT LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [AOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET], 2 * SIZE ;; LDFPD f92, f93 = [AOFFSET], 2 * SIZE ;; LDFPD f94, f95 = [AOFFSET] adds AOFFSET = 6 * SIZE, AOFFSET ;; 
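/* Note added as a reading aid (not authoritative): in this LT branch the
   LDFPD pairs around this point appear to read the packed complex
   triangular factor for the current diagonal block, with the adds stepping
   AOFFSET over entries that are not referenced.  The substitution that
   follows scales the first unknown by its diagonal entry -- expected here
   in inverted form as prepared by the trsm packing stage, so the
   FMPY plus FMA_C/FMA_D sequence amounts to a complex divide -- and then
   eliminates it from the remaining rows via FNMA/FMA_A and FMA_B/FNMA
   before moving on to the next diagonal entry. */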
LDFPD f108, f109 = [AOFFSET], 2 * SIZE ;; LDFPD f110, f111 = [AOFFSET] adds AOFFSET = 8 * SIZE, AOFFSET ;; LDFPD f126, f127 = [AOFFSET] adds AOFFSET = - 30 * SIZE, AOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f80 FMPY f35 = f73, f80 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f80 = f73, f81, f34 FMA_D f81 = f72, f81, f35 ;; FNMA f96 = f74, f64, f96 FMA_A f97 = f75, f64, f97 FNMA f112 = f74, f80, f112 FMA_A f113 = f75, f80, f113 ;; FMA_B f96 = f75, f65, f96 FNMA f97 = f74, f65, f97 FMA_B f112 = f75, f81, f112 FNMA f113 = f74, f81, f113 ;; FNMA f66 = f76, f64, f66 FMA_A f67 = f77, f64, f67 FNMA f82 = f76, f80, f82 FMA_A f83 = f77, f80, f83 ;; FMA_B f66 = f77, f65, f66 FNMA f67 = f76, f65, f67 FMA_B f82 = f77, f81, f82 FNMA f83 = f76, f81, f83 ;; FNMA f98 = f78, f64, f98 FMA_A f99 = f79, f64, f99 FNMA f114 = f78, f80, f114 FMA_A f115 = f79, f80, f115 ;; FMA_B f98 = f79, f65, f98 FNMA f99 = f78, f65, f99 FMA_B f114 = f79, f81, f114 FNMA f115 = f78, f81, f115 ;; FMPY f32 = f90, f96 FMPY f33 = f91, f96 FMPY f34 = f90, f112 FMPY f35 = f91, f112 ;; FMA_C f96 = f91, f97, f32 FMA_D f97 = f90, f97, f33 FMA_C f112 = f91, f113, f34 FMA_D f113 = f90, f113, f35 ;; FNMA f66 = f92, f96, f66 FMA_A f67 = f93, f96, f67 FNMA f82 = f92, f112, f82 FMA_A f83 = f93, f112, f83 ;; FMA_B f66 = f93, f97, f66 FNMA f67 = f92, f97, f67 FMA_B f82 = f93, f113, f82 FNMA f83 = f92, f113, f83 ;; FNMA f98 = f94, f96, f98 FMA_A f99 = f95, f96, f99 FNMA f114 = f94, f112, f114 FMA_A f115 = f95, f112, f115 ;; FMA_B f98 = f95, f97, f98 FNMA f99 = f94, f97, f99 FMA_B f114 = f95, f113, f114 FNMA f115 = f94, f113, f115 ;; FMPY f32 = f108, f66 FMPY f33 = f109, f66 FMPY f34 = f108, f82 FMPY f35 = f109, f82 ;; FMA_C f66 = f109, f67, f32 FMA_D f67 = f108, f67, f33 FMA_C f82 = f109, f83, f34 FMA_D f83 = f108, f83, f35 ;; FNMA f98 = f110, f66, f98 FMA_A f99 = f111, f66, f99 FNMA f114 = f110, f82, f114 FMA_A f115 = f111, f82, f115 ;; FMA_B f98 = f111, f67, f98 FNMA f99 = f110, f67, f99 FMA_B f114 = f111, f83, f114 FNMA f115 = f110, f83, f115 ;; FMPY f32 = f126, f98 FMPY f33 = f127, f98 FMPY f34 = f126, f114 FMPY f35 = f127, f114 ;; FMA_C f98 = f127, f99, f32 FMA_D f99 = f126, f99, f33 FMA_C f114 = f127, f115, f34 FMA_D f115 = f126, f115, f35 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f96 FMPY f35 = f73, f96 FMPY f36 = f72, f66 FMPY f37 = f73, f66 FMPY f38 = f72, f98 FMPY f39 = f73, f98 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f96 = f73, f97, f34 FMA_D f97 = f72, f97, f35 FMA_C f66 = f73, f67, f36 FMA_D f67 = f72, f67, f37 FMA_C f98 = f73, f99, f38 FMA_D f99 = f72, f99, f39 ;; FNMA f80 = f74, f64, f80 FMA_A f81 = f75, f64, f81 FNMA f112 = f74, f96, f112 FMA_A f113 = f75, f96, f113 FNMA f82 = f74, f66, f82 FMA_A f83 = f75, f66, f83 FNMA f114 = f74, f98, f114 FMA_A f115 = f75, f98, f115 ;; FMA_B f80 = f75, f65, f80 FNMA f81 = f74, f65, f81 FMA_B f112 = f75, f97, f112 FNMA f113 = f74, f97, f113 FMA_B f82 = f75, f67, f82 FNMA f83 = f74, f67, f83 FMA_B f114 = f75, f99, f114 FNMA f115 = f74, f99, f115 ;; FMPY f32 = f90, f80 FMPY f33 = f91, f80 FMPY f34 = f90, f112 FMPY f35 = f91, f112 FMPY f36 = f90, f82 FMPY f37 = f91, f82 FMPY f38 = f90, f114 FMPY f39 = f91, f114 ;; FMA_C f80 = f91, f81, f32 FMA_D f81 = f90, f81, f33 FMA_C f112 = f91, f113, f34 FMA_D f113 = f90, f113, f35 FMA_C f82 = f91, 
f83, f36 FMA_D f83 = f90, f83, f37 FMA_C f114 = f91, f115, f38 FMA_D f115 = f90, f115, f39 ;; #endif #ifdef RT adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f104, f105 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f106, f107 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f120, f121 = [BOFFSET] ;; FMPY f32 = f104, f80 FMPY f33 = f105, f80 FMPY f34 = f104, f112 FMPY f35 = f105, f112 FMPY f36 = f104, f82 FMPY f37 = f105, f82 FMPY f38 = f104, f114 FMPY f39 = f105, f114 ;; FMA_C f80 = f105, f81, f32 FMA_D f81 = f104, f81, f33 FMA_C f112 = f105, f113, f34 FMA_D f113 = f104, f113, f35 FMA_C f82 = f105, f83, f36 FMA_D f83 = f104, f83, f37 FMA_C f114 = f105, f115, f38 FMA_D f115 = f104, f115, f39 ;; FNMA f64 = f106, f80, f64 FMA_A f65 = f107, f80, f65 FNMA f96 = f106, f112, f96 FMA_A f97 = f107, f112, f97 FNMA f66 = f106, f82, f66 FMA_A f67 = f107, f82, f67 FNMA f98 = f106, f114, f98 FMA_A f99 = f107, f114, f99 ;; FMA_B f64 = f107, f81, f64 FNMA f65 = f106, f81, f65 FMA_B f96 = f107, f113, f96 FNMA f97 = f106, f113, f97 FMA_B f66 = f107, f83, f66 FNMA f67 = f106, f83, f67 FMA_B f98 = f107, f115, f98 FNMA f99 = f106, f115, f99 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f96 FMPY f35 = f121, f96 FMPY f36 = f120, f66 FMPY f37 = f121, f66 FMPY f38 = f120, f98 FMPY f39 = f121, f98 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f96 = f121, f97, f34 FMA_D f97 = f120, f97, f35 FMA_C f66 = f121, f67, f36 FMA_D f67 = f120, f67, f37 FMA_C f98 = f121, f99, f38 FMA_D f99 = f120, f99, f39 ;; #endif #if defined(LN) || defined(LT) adds BOFFSET2 = 4 * SIZE, BOFFSET ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f97, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE ;; STFD [BOFFSET] = f81, 5 * SIZE STFD [BOFFSET2] = f113, 5 * SIZE ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f98, SIZE ;; STFD [BOFFSET] = f67, SIZE STFD [BOFFSET2] = f99, SIZE ;; STFD [BOFFSET] = f82, SIZE STFD [BOFFSET2] = f114, SIZE ;; STFD [BOFFSET] = f83, 5 * SIZE STFD [BOFFSET2] = f115, 5 * SIZE ;; adds BOFFSET = - 16 * SIZE, BOFFSET ;; #else adds AOFFSET2 = 4 * SIZE, AOFFSET ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f66, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f67, SIZE ;; STFD [AOFFSET] = f96, SIZE STFD [AOFFSET2] = f98, SIZE ;; STFD [AOFFSET] = f97, 5 * SIZE STFD [AOFFSET2] = f99, 5 * SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f82, SIZE ;; STFD [AOFFSET] = f81, SIZE STFD [AOFFSET2] = f83, SIZE ;; STFD [AOFFSET] = f112, SIZE STFD [AOFFSET2] = f114, SIZE ;; STFD [AOFFSET] = f113, 5 * SIZE STFD [AOFFSET2] = f115, 5 * SIZE ;; adds AOFFSET = - 16 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -8 * SIZE, C1 adds C2 = -8 * SIZE, C2 adds C5 = -8 * SIZE, C5 adds C6 = -8 * SIZE, C6 #endif ;; STFD [C1 ] = f64, SIZE STFD [C5 ] = f66, SIZE ;; STFD [C1 ] = f65, SIZE STFD [C5 ] = f67, SIZE ;; STFD [C1 ] = f96, SIZE STFD [C5 ] = f98, SIZE ;; STFD [C1 ] = f97, 5 * SIZE STFD [C5 ] = f99, 5 * SIZE ;; STFD [C2 ] = f80, SIZE STFD [C6 ] = f82, SIZE ;; STFD [C2 ] = f81, SIZE STFD [C6 ] = f83, SIZE ;; STFD [C2 ] = f112, SIZE STFD [C6 ] = f114, SIZE ;; STFD [C2 ] = f113, 5 * SIZE STFD [C6 ] = f115, 5 * SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;; #ifdef LN adds C1 = -8 * SIZE, C1 adds C2 = -8 * SIZE, C2 adds C5 = -8 * SIZE, C5 adds C6 = -8 * SIZE, C6 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, 
KK ;; #ifdef RT shladd AORIG = r2, 2, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; shladd AOFFSET = L, 2, AOFFSET shladd BOFFSET = L, 1, BOFFSET #endif ;; #ifdef LT adds KK = 4, KK #elif defined LN adds KK = -4, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; (p6) br.cond.dptk .L052 ;; .align 16 .L089: #ifdef LN shladd KK8 = K, ZBASE_SHIFT, r0 ;; shladd B = KK8, 1, B #endif #if defined(LT) || defined(RN) mov B = BOFFSET #endif #ifdef RN adds KK = 2, KK #endif #ifdef RT adds KK = -2, KK #endif ;; { .mmi mov AOFFSET = A nop __LINE__ } ;; .align 16 .L090: tbit.z p6, p0 = N, 0 (p6) br.cond.dpnt .L999 ;; #ifdef RT { .mmi shl r2 = K, ZBASE_SHIFT } ;; { .mmi sub B = B, r2 sub C = C, LDC nop __LINE__ } ;; #endif mov C1 = C #ifdef LN add KK = M, OFFSET #elif defined LT mov KK = OFFSET #else nop __LINE__ #endif ;; #if defined(LN) || defined(RT) mov AORIG = A #else mov AOFFSET = A #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; { .mib #ifndef RT add C = LDC, C #else nop __LINE__ #endif } ;; .L110: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L100 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f67 = f0 } ;; #else { .mfi add BOFFSET = r3, B mov f66 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 add AOFFSET = r3, AORIG } ;; #endif ;; adds L = 1, L ;; { .mii nop __LINE__ tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE cmp.eq p3, p0 = r0, r0 adds L = -1, L } ;; { .mmi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET mov ar.lc = L } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L118 ;; .align 16 .L112: { .mfi lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f80 = f32, f49, f80 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mmf (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } { .mmf nop __LINE__ nop __LINE__ FMA f81 = f33, f49, f81 // A2 * B2 } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f80 = f40, f57, f80 // A1 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 adds L = -1, L } { .mfb (p3) FMA f81 = f41, f57, f81 // A2 * B2 br.cloop.sptk.few .L112 } ;; { .mfb nop __LINE__ FCALC_A f64 = f64, f81 nop __LINE__ } { .mfb nop __LINE__ FCALC_B f65 = f65, f80 nop __LINE__ } ;; .L118: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -1, KK #else adds r2 = -1, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; add AOFFSET = r2, AORIG add BOFFSET = r2, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET] ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 ;; #else LDFPD f72, f73 = [AOFFSET] ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 ;; #endif #ifdef LN LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 ;; #endif #ifdef LT LDFPD 
f72, f73 = [AOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; #endif #ifdef RT LDFPD f72, f73 = [BOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; #endif #if defined(LN) || defined(LT) STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f65, SIZE ;; adds BOFFSET = - 2 * SIZE, BOFFSET ;; #else STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, SIZE ;; adds AOFFSET = - 2 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -2 * SIZE, C1 #endif ;; STFD [C1 ] = f64, SIZE ;; STFD [C1 ] = f65, SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 ;; #ifdef LN adds C1 = -2 * SIZE, C1 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT add AORIG = r2, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; add AOFFSET = L, AOFFSET add BOFFSET = L, BOFFSET #endif ;; #ifdef LT adds KK = 1, KK #elif defined LN adds KK = -1, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; .align 16 .L100: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L091 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 1 + ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f67 = f0 } ;; #else { .mfi add BOFFSET = r3, B mov f66 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 shladd AOFFSET = r3, 1, AORIG } ;; #endif ;; adds L = 1, L ;; { .mii (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE nop __LINE__ adds L = -1, L } ;; { .mmi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 mov ar.lc = L } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L108 ;; .align 16 .L102: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET FMA f80 = f32, f49, f80 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfb lfetch.nt1 [PREB], 4 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f81 = f33, f49, f81 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f34, f48, f96 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f112 = f34, f49, f112 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f97 = f35, f48, f97 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f113 = f35, f49, f113 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f80 = f40, f57, f80 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f81 = f41, f57, f81 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f42, f56, f96 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f112 = f42, f57, f112 // A3 * B2 nop 
__LINE__ } ;; { .mfi (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f97 = f43, f56, f97 // A4 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f113 = f43, f57, f113 // A4 * B2 br.cloop.sptk.few .L102 } ;; { .mfb nop __LINE__ FCALC_A f64 = f64, f81 nop __LINE__ } { .mfb nop __LINE__ FCALC_B f65 = f65, f80 nop __LINE__ } { .mfb nop __LINE__ FCALC_A f96 = f96, f113 nop __LINE__ } { .mfb nop __LINE__ FCALC_B f97 = f97, f112 nop __LINE__ } ;; .L108: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -2, KK #else adds r2 = -1, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; shladd AOFFSET = r2, 1, AORIG add BOFFSET = r2, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET] adds BOFFSET = -2 * SIZE, BOFFSET ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 FSUB f96 = f88, f96 FSUB_A f97 = f89, f97 ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET] adds AOFFSET = -2 * SIZE, AOFFSET ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 FSUB f96 = f88, f96 FSUB f97 = f89, f97 ;; #endif #ifdef LN adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f104, f105 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f106, f107 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f104, f96 FMPY f33 = f105, f96 ;; FMA_C f96 = f105, f97, f32 FMA_D f97 = f104, f97, f33 ;; FNMA f64 = f106, f96, f64 FMA_A f65 = f107, f96, f65 ;; FMA_B f64 = f107, f97, f64 FNMA f65 = f106, f97, f65 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 ;; #endif #ifdef LT LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; FNMA f96 = f74, f64, f96 FMA_A f97 = f75, f64, f97 ;; FMA_B f96 = f75, f65, f96 FNMA f97 = f74, f65, f97 ;; FMPY f32 = f90, f96 FMPY f33 = f91, f96 ;; FMA_C f96 = f91, f97, f32 FMA_D f97 = f90, f97, f33 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f36 = f72, f96 FMPY f37 = f73, f96 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f96 = f73, f97, f36 FMA_D f97 = f72, f97, f37 ;; #endif #ifdef RT LDFPD f72, f73 = [BOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f36 = f72, f96 FMPY f37 = f73, f96 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f96 = f73, f97, f36 FMA_D f97 = f72, f97, f37 ;; #endif #if defined(LN) || defined(LT) STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f65, SIZE ;; STFD [BOFFSET] = f96, SIZE ;; STFD [BOFFSET] = f97, SIZE ;; adds BOFFSET = - 4 * SIZE, BOFFSET ;; #else adds AOFFSET2 = 4 * SIZE, AOFFSET ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, SIZE ;; STFD [AOFFSET] = f96, SIZE ;; STFD [AOFFSET] = f97, SIZE ;; adds AOFFSET = - 4 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -4 * SIZE, C1 adds C5 = -4 * SIZE, C5 #endif ;; STFD [C1 ] = f64, SIZE ;; STFD [C1 ] = f65, SIZE ;; STFD [C1 ] = f96, SIZE ;; STFD [C1 ] = f97, SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;; #ifdef LN adds C1 = -4 * SIZE, C1 adds C5 = -4 * SIZE, C5 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 1, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; shladd AOFFSET = L, 1, AOFFSET add BOFFSET 
= L, BOFFSET #endif ;; #ifdef LT adds KK = 2, KK #elif defined LN adds KK = -2, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif .align 16 .L091: shr I = M, 2 ;; cmp.eq p6, p7 = 0, I (p6) br.cond.dpnt .L119 ;; .align 16 .L092: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 2 + ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f67 = f0 } ;; #else { .mfi add BOFFSET = r3, B mov f66 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 shladd AOFFSET = r3, 2, AORIG } ;; #endif ;; (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE adds L = 1, L ;; { .mfi (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 } { .mfi adds PREC = CPREFETCHSIZE * SIZE, C1 shr L = L, 1 } ;; { .mfi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET adds L = -1, L } { .mmf (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE CPREFETCH [PREC] } ;; { .mfi (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE mov ar.lc = L } { .mmi adds C5 = 4 * SIZE, C1 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET cmp.eq p3, p0 = r0, r0 } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L098 ;; .align 16 .L093: /* 1 */ { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f80 = f34, f48, f80 // A3 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA_B f81 = f34, f49, f81 // A3 * B2 nop __LINE__ } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f36, f48, f96 // A5 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA_B f97 = f36, f49, f97 // A5 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f38, f48, f112 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f38, f49, f113 // A7 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f81 = f35, f48, f81 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 = f35, f49, f80 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f97 = f37, f48, f97 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f96 = f37, f49, f96 // A6 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f113 = f39, f48, f113 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f112 = f39, f49, f112 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f42, f56, f80 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f81 = f42, f57, f81 // A3 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f96 = f44, f56, f96 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f44, f57, f97 // A5 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f112 = f46, f56, f112 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f46, f57, f113 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA 
f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f81 = f43, f56, f81 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f80 = f43, f57, f80 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f45, f56, f97 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f45, f57, f96 // A6 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f113 = f47, f56, f113 // A8 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f112 = f47, f57, f112 // A8 * B2 br.cloop.sptk.few .L093 } ;; .L098: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -4, KK #else adds r2 = -1, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; shladd AOFFSET = r2, 2, AORIG add BOFFSET = r2, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 FSUB f80 = f74, f80 FSUB_A f81 = f75, f81 FSUB f96 = f88, f96 FSUB_A f97 = f89, f97 FSUB f112 = f90, f112 FSUB_A f113 = f91, f113 ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 FSUB f80 = f74, f80 FSUB f81 = f75, f81 FSUB f96 = f88, f96 FSUB f97 = f89, f97 FSUB f112 = f90, f112 FSUB f113 = f91, f113 ;; #endif #ifdef LN adds AOFFSET = 30 * SIZE, AOFFSET ;; LDFPD f72, f73 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f74, f75 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f76, f77 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f78, f79 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f88, f89 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f92, f93 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; LDFPD f104, f105 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f106, f107 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f72, f112 FMPY f33 = f73, f112 ;; FMA_C f112 = f73, f113, f32 FMA_D f113 = f72, f113, f33 ;; FNMA f96 = f74, f112, f96 FMA_A f97 = f75, f112, f97 FNMA f80 = f76, f112, f80 FMA_A f81 = f77, f112, f81 FNMA f64 = f78, f112, f64 FMA_A f65 = f79, f112, f65 ;; FMA_B f96 = f75, f113, f96 FNMA f97 = f74, f113, f97 FMA_B f80 = f77, f113, f80 FNMA f81 = f76, f113, f81 FMA_B f64 = f79, f113, f64 FNMA f65 = f78, f113, f65 ;; FMPY f32 = f88, f96 FMPY f33 = f89, f96 ;; FMA_C f96 = f89, f97, f32 FMA_D f97 = f88, f97, f33 ;; FNMA f80 = f90, f96, f80 FMA_A f81 = f91, f96, f81 FNMA f64 = f92, f96, f64 FMA_A f65 = f93, f96, f65 ;; FMA_B f80 = f91, f97, f80 FNMA f81 = f90, f97, f81 FMA_B f64 = f93, f97, f64 FNMA f65 = f92, f97, f65 ;; FMPY f32 = f104, f80 FMPY f33 = f105, f80 ;; FMA_C f80 = f105, f81, f32 FMA_D f81 = f104, f81, f33 ;; FNMA f64 = f106, f80, f64 FMA_A f65 = f107, f80, f65 ;; FMA_B f64 = f107, f81, f64 FNMA f65 = f106, f81, f65 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 ;; #endif #ifdef LT LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [AOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET], 2 * SIZE ;; LDFPD f92, f93 = [AOFFSET], 2 * SIZE ;; 
LDFPD f94, f95 = [AOFFSET] adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f108, f109 = [AOFFSET], 2 * SIZE ;; LDFPD f110, f111 = [AOFFSET] adds AOFFSET = 8 * SIZE, AOFFSET ;; LDFPD f126, f127 = [AOFFSET] adds AOFFSET = - 30 * SIZE, AOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; FNMA f80 = f74, f64, f80 FMA_A f81 = f75, f64, f81 FNMA f96 = f76, f64, f96 FMA_A f97 = f77, f64, f97 FNMA f112 = f78, f64, f112 FMA_A f113 = f79, f64, f113 ;; FMA_B f80 = f75, f65, f80 FNMA f81 = f74, f65, f81 FMA_B f96 = f77, f65, f96 FNMA f97 = f76, f65, f97 FMA_B f112 = f79, f65, f112 FNMA f113 = f78, f65, f113 ;; FMPY f32 = f90, f80 FMPY f33 = f91, f80 ;; FMA_C f80 = f91, f81, f32 FMA_D f81 = f90, f81, f33 ;; FNMA f96 = f92, f80, f96 FMA_A f97 = f93, f80, f97 FNMA f112 = f94, f80, f112 FMA_A f113 = f95, f80, f113 ;; FMA_B f96 = f93, f81, f96 FNMA f97 = f92, f81, f97 FMA_B f112 = f95, f81, f112 FNMA f113 = f94, f81, f113 ;; FMPY f32 = f108, f96 FMPY f33 = f109, f96 ;; FMA_C f96 = f109, f97, f32 FMA_D f97 = f108, f97, f33 ;; FNMA f112 = f110, f96, f112 FMA_A f113 = f111, f96, f113 ;; FMA_B f112 = f111, f97, f112 FNMA f113 = f110, f97, f113 ;; FMPY f32 = f126, f112 FMPY f33 = f127, f112 ;; FMA_C f112 = f127, f113, f32 FMA_D f113 = f126, f113, f33 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f80 FMPY f35 = f73, f80 FMPY f36 = f72, f96 FMPY f37 = f73, f96 FMPY f38 = f72, f112 FMPY f39 = f73, f112 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f80 = f73, f81, f34 FMA_D f81 = f72, f81, f35 FMA_C f96 = f73, f97, f36 FMA_D f97 = f72, f97, f37 FMA_C f112 = f73, f113, f38 FMA_D f113 = f72, f113, f39 ;; #endif #ifdef RT LDFPD f72, f73 = [BOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f80 FMPY f35 = f73, f80 FMPY f36 = f72, f96 FMPY f37 = f73, f96 FMPY f38 = f72, f112 FMPY f39 = f73, f112 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f80 = f73, f81, f34 FMA_D f81 = f72, f81, f35 FMA_C f96 = f73, f97, f36 FMA_D f97 = f72, f97, f37 FMA_C f112 = f73, f113, f38 FMA_D f113 = f72, f113, f39 ;; #endif #if defined(LN) || defined(LT) adds BOFFSET2 = 4 * SIZE, BOFFSET ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f97, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE ;; STFD [BOFFSET] = f81, 5 * SIZE STFD [BOFFSET2] = f113, 5 * SIZE ;; adds BOFFSET = - 8 * SIZE, BOFFSET ;; #else adds AOFFSET2 = 4 * SIZE, AOFFSET ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f96, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f97, SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f81, 5 * SIZE STFD [AOFFSET2] = f113, 5 * SIZE ;; adds AOFFSET = - 8 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -8 * SIZE, C1 adds C5 = -8 * SIZE, C5 #endif ;; STFD [C1 ] = f64, SIZE STFD [C5 ] = f96, SIZE ;; STFD [C1 ] = f65, SIZE STFD [C5 ] = f97, SIZE ;; STFD [C1 ] = f80, SIZE STFD [C5 ] = f112, SIZE ;; STFD [C1 ] = f81, 5 * SIZE STFD [C5 ] = f113, 5 * SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;; #ifdef LN adds C1 = -8 * SIZE, C1 adds C5 = -8 * SIZE, C5 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 2, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; shladd AOFFSET = L, 2, AOFFSET add BOFFSET = L, BOFFSET 
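/* Note added as a reading aid (not authoritative): end-of-tile bookkeeping
   for this M=4, N=1 block.  L was set to K - KK above, so the shladd/add
   sequence here scales it to complex elements and appears to step AOFFSET
   and BOFFSET past the part of the packed panels that the LT/RN path does
   not touch, leaving them at the start of the next tile.  KK itself is
   adjusted just after the #endif (+4 for LT, -4 for LN). */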
#endif ;; #ifdef LT adds KK = 4, KK #elif defined LN adds KK = -4, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; (p6) br.cond.dptk .L092 ;; .align 16 .L119: #ifdef LN shladd KK8 = K, ZBASE_SHIFT, r0 ;; add B = KK8, B #endif #if defined(LT) || defined(RN) mov B = BOFFSET #endif #ifdef RN adds KK = 1, KK #endif #ifdef RT adds KK = -1, KK #endif ;; { .mmi mov AOFFSET = A nop __LINE__ } ;; .align 16 .L999: { .mii nop __LINE__ mov ar.lc = ARLC mov pr = PR, -1 } { .mib nop __LINE__ #ifdef TRMMKERNEL mov ar.pfs = ARPFS #else nop __LINE__ #endif br.ret.sptk.many b0 } EPILOGUE OpenBLAS-0.2.20/kernel/ia64/ztrsm_kernel_LT.S000066400000000000000000005163621313527062700204460ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef DOUBLE #define PREFETCHSIZE (16 * 8) #else #define PREFETCHSIZE (32 * 8) #endif #ifndef LN #define CPREFETCHSIZE 7 #else #define CPREFETCHSIZE -8 #endif #define CPREFETCH lfetch.excl.nt1 #define M r32 #define N r33 #define K r34 #define A r37 #define B r38 #define C r39 #define LDC r35 #define I r15 #define J r16 #define AOFFSET r17 #define BOFFSET r18 #define TEMP r19 #define L r20 #define C1 r21 #define C2 r22 #define C3 r23 #define C4 r24 #define C5 r25 #define C6 r26 #define C7 r27 #define C8 r28 #define PREA r8 #define PREB r9 #define PREC r10 #define SP r12 #define ARLC r29 #define PR r30 #define ARPFS r31 #define ALPHA_R f8 #define ALPHA_I f9 #define AORIG loc0 #define KK loc1 #define KK8 loc2 #define OFFSET loc3 #define AOFFSET2 loc4 #define BOFFSET2 loc5 #ifndef CONJ #define FCALC_A FSUB #define FCALC_B FADD #define FMA_A FNMA #define FMA_B FMA #else #define FCALC_A FADD #define FCALC_B FSUB #define FMA_A FMA #define FMA_B FNMA #endif #ifndef CONJ #define FCALC_C FMA #define FCALC_D FNMA #else #define FCALC_C FNMA #define FCALC_D FMA #endif #ifndef CONJ #define FMA_C FNMA #define FMA_D FMA #define FSUB_A FSUB #else #define FMA_C FMA #define FMA_D FMS #define FSUB_A FADD #endif PROLOGUE .prologue PROFCODE { .mfi .save ar.pfs, ARPFS alloc ARPFS = ar.pfs, 8, 8, 0, 0 mov f64 = f0 adds r14 = 16, SP } { .mfi nop __LINE__ mov f65 = f0 adds r15 = 24, SP } ;; { .mfi ld8 LDC = [r14] mov f81 = f0 mov PR = pr } { .mfi ld8 OFFSET = [r15] mov f96 = f0 shr J = N, 2 } ;; { .mfi shladd LDC = LDC, ZBASE_SHIFT, r0 mov f97 = f0 } { .mfi nop __LINE__ mov f113 = f0 } ;; #ifdef LN { .mmi setf.sig f32 = M setf.sig f33 = K shladd C = M, ZBASE_SHIFT, C } ;; {.mmf nop __LINE__ nop __LINE__ xmpy.l f32 = f32, f33 } ;; { .mmi getf.sig r2 = f32 ;; nop __LINE__ shladd A = r2, ZBASE_SHIFT, A } ;; #endif #ifdef RN sub KK = r0, OFFSET #endif #ifdef RT { .mmi setf.sig f32 = N setf.sig f33 = K nop __LINE__ } ;; { .mmi setf.sig f34 = LDC nop __LINE__ nop __LINE__ } ;; { .mmf nop __LINE__ nop __LINE__ xmpy.l f33 = f32, f33 } { .mmf nop __LINE__ sub KK = N, OFFSET xmpy.l f34 = f32, f34 } ;; { .mmi getf.sig r2 = f33 getf.sig r3 = f34 } ;; shladd B = r2, ZBASE_SHIFT, B add C = r3, C #endif ;; .body { .mfi nop __LINE__ mov f80 = f0 mov ARLC = ar.lc } { .mfb cmp.ge p6, p0 = 0, J mov f112 = f0 (p6) br.cond.dpnt .L050 } ;; .align 16 .L010: #ifdef RT { .mmi shladd r3 = LDC, 2, r0 nop __LINE__ shl r2 = K, 2 + ZBASE_SHIFT } ;; { .mmi sub B = B, r2 sub C = C, r3 nop __LINE__ } ;; #endif { .mmi mov C1 = C // coffset1 = c + 0 * ldc add C2 = LDC, C // coffset2 = c + 1 * ldc shr I = M, 2 } { .mmi adds J = -1, J #ifdef LN add KK = M, OFFSET #elif defined LT mov KK = OFFSET #else nop __LINE__ #endif #if defined(LN) || defined(RT) mov AORIG = A #else mov AOFFSET = A #endif } ;; ;; { .mmi shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } { .mib cmp.eq p6, p7 = 0, I #ifndef RT shladd C = LDC, 2, C // coffset += 8 * ldc #else nop __LINE__ #endif (p6) br.cond.dpnt .L020 } ;; .align 16 .L011: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 2 + ZBASE_SHIFT } { .mfi shladd r3 = KK, ZBASE_SHIFT, r0 mov f118 = f0 nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f67 = f0 } ;; #else 
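/* Note added as a reading aid (not authoritative): the LN/RT branch below
   rebuilds BOFFSET and AOFFSET from B and AORIG using the current KK
   offset rather than continuing from the previous tile (for LN, AORIG is
   first stepped back by one packed panel).  Throughout the inner loops
   that follow, the CONJ-dependent macros defined above implement complex
   multiply-accumulate: without CONJ, FMA and FMA_A build the real part as
   ar*br - ai*bi while FMA_B and FMA build the imaginary part as
   ar*bi + ai*br; defining CONJ flips the signs of the ai*bi and ar*bi
   terms to apply the conjugation. */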
{ .mfi shladd BOFFSET = r3, 2, B mov f66 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 shladd AOFFSET = r3, 2, AORIG } ;; #endif ;; { .mfi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f82 = f0 nop __LINE__ } { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mfi (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f98 = f0 adds L = 1, L } { .mfi (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE mov f99 = f0 adds C5 = 4 * SIZE, C1 } ;; { .mfi (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE mov f114 = f0 tbit.z p12, p0 = L, 0 } { .mfi (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f115 = f0 adds C6 = 4 * SIZE, C2 } ;; { .mfi (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE mov f68 = f0 shr L = L, 1 } { .mfi setf.d f86 = r0 mov f69 = f0 adds C7 = 4 * SIZE, C3 } ;; { .mfi CPREFETCH [PREC], LDC mov f84 = f0 adds L = -1, L } { .mfi setf.d f87 = r0 mov f85 = f0 adds C8 = 4 * SIZE, C4 } ;; { .mfi CPREFETCH [PREC], LDC mov f100 = f0 mov ar.lc = L } { .mfi setf.d f102 = r0 mov f101 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi CPREFETCH [PREC], LDC mov f116 = f0 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET } { .mfi setf.d f103 = r0 mov f117 = f0 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } ;; { .mfi CPREFETCH [PREC] mov f70 = f0 cmp.eq p6, p0 = -1, L } { .mfb setf.d f119 = r0 mov f71 = f0 (p6) br.cond.dpnt .L018 } ;; .align 16 .L012: /* 1 */ { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfb (p12) cmp.ne p3, p0 = 0, L FMA_B f65 = f32, f49, f65 // A1 * B2 nop __LINE__ } ;; /* 2 */ { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfb cmp.ne p4, p5 = 0, L FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;; /* 3 */ { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb FMA_B f97 = f32, f53, f97 // A1 * B6 nop __LINE__ } ;; /* 4 */ { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb FMA_B f113 = f32, f55, f113 // A1 * B8 nop __LINE__ } ;; /* 5 */ { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; /* 6 */ { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;; /* 7 */ { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb FMA_A f96 = f33, f53, f96 // A2 * B6 nop __LINE__ } ;; /* 8 */ { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb FMA_A f112 = f33, f55, f112 // A2 * B8 nop __LINE__ } ;; /* 9 */ { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb FMA_B f67 = f34, f49, f67 // A3 * B2 nop __LINE__ } ;; /* 10 */ { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb FMA_B f83 = f34, f51, f83 // A3 * B4 nop __LINE__ } ;; /* 11 */ { .mfb FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f99 = f34, f53, f99 // A3 * B6 nop __LINE__ } ;; /* 12 */ { .mfb FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f115 = f34, f55, f115 // A3 * B8 nop __LINE__ } ;; /* 13 */ { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 } { .mfb nop __LINE__ 
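/*  Annotation (not from the original source): the .L012 inner loop below
    accumulates a 4x4 tile of complex C in f64..f71, f80..f87, f96..f103 and
    f112..f119 (the "A1*B1" .. "A8*B8" comments).  Each term is one complex
    multiply-accumulate built from the FMA / FMA_A / FMA_B macros; for the
    non-CONJ case this is equivalent to (a minimal C sketch, not part of the
    build, with illustrative names a_re/a_im, b_re/b_im, c_re/c_im):

        c_re += a_re * b_re;    //  FMA
        c_re -= a_im * b_im;    //  FMA_A  (FNMA when CONJ is not defined)
        c_im += a_re * b_im;    //  FMA_B  (FMA  when CONJ is not defined)
        c_im += a_im * b_re;    //  FMA

    With CONJ defined, FMA_A/FMA_B swap to FMA/FNMA, flipping the sign of the
    a_im*b_im and a_re*b_im contributions.                                  */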
FMA_A f66 = f35, f49, f66 // A4 * B2 nop __LINE__ } ;; /* 14 */ { .mfb FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f82 = f35, f51, f82 // A4 * B4 nop __LINE__ } ;; /* 15 */ { .mfb FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f98 = f35, f53, f98 // A4 * B6 nop __LINE__ } ;; /* 16 */ { .mfb FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f114 = f35, f55, f114 // A4 * B8 nop __LINE__ } ;; /* 17 */ { .mfb nop __LINE__ FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f69 = f36, f49, f69 // A5 * B2 nop __LINE__ } ;; /* 18 */ { .mfb nop __LINE__ FMA f84 = f36, f50, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f85 = f36, f51, f85 // A5 * B4 nop __LINE__ } ;; /* 19 */ { .mfb nop __LINE__ FMA f100 = f36, f52, f100 // A5 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f101 = f36, f53, f101 // A5 * B6 nop __LINE__ } ;; /* 20 */ { .mfb nop __LINE__ FMA f116 = f36, f54, f116 // A5 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f117 = f36, f55, f117 // A5 * B8 nop __LINE__ } ;; /* 21 */ { .mfb nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f68 = f37, f49, f68 // A6 * B2 nop __LINE__ } ;; /* 22 */ { .mfb nop __LINE__ FMA f85 = f37, f50, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f84 = f37, f51, f84 // A6 * B4 nop __LINE__ } ;; /* 23 */ { .mfb nop __LINE__ FMA f101 = f37, f52, f101 // A6 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f100 = f37, f53, f100 // A6 * B6 nop __LINE__ } ;; /* 24 */ { .mfb nop __LINE__ FMA f117 = f37, f54, f117 // A6 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f116 = f37, f55, f116 // A6 * B8 nop __LINE__ } ;; /* 25 */ { .mfb nop __LINE__ FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f71 = f38, f49, f71 // A7 * B2 nop __LINE__ } ;; /* 26 */ { .mfb nop __LINE__ FMA f86 = f38, f50, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f87 = f38, f51, f87 // A7 * B4 nop __LINE__ } ;; /* 27 */ { .mfb nop __LINE__ FMA f102 = f38, f52, f102 // A7 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f103 = f38, f53, f103 // A7 * B6 nop __LINE__ } ;; /* 28 */ { .mfb nop __LINE__ FMA f118 = f38, f54, f118 // A7 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f119 = f38, f55, f119 // A7 * B8 nop __LINE__ } ;; /* 29 */ { .mfb nop __LINE__ FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f70 = f39, f49, f70 // A8 * B2 nop __LINE__ } ;; /* 30 */ { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f87 = f39, f50, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f86 = f39, f51, f86 // A8 * B4 nop __LINE__ } ;; /* 31 */ { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f103 = f39, f52, f103 // A8 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f102 = f39, f53, f102 // A8 * B6 nop __LINE__ } ;; /* 32 */ { .mfb nop __LINE__ FMA f119 = f39, f54, f119 // A8 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f118 = f39, f55, f118 // A8 * B8 nop __LINE__ } ;; /* 33 */ { .mfb nop __LINE__ (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; /* 34 */ { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;; /* 35 */ { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f40, f61, 
f97 // A1 * B6 nop __LINE__ } ;; /* 36 */ { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 nop __LINE__ } ;; /* 37 */ { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; /* 38 */ { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;; /* 39 */ { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 nop __LINE__ } ;; /* 40 */ { .mfb nop __LINE__ (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 nop __LINE__ } ;; /* 41 */ { .mfb nop __LINE__ (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 nop __LINE__ } ;; /* 42 */ { .mfb nop __LINE__ (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 nop __LINE__ } ;; /* 43 */ { .mfb nop __LINE__ (p3) FMA f98 = f42, f60, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 nop __LINE__ } ;; /* 44 */ { .mfb nop __LINE__ (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 nop __LINE__ } ;; /* 45 */ { .mfb nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 nop __LINE__ } ;; /* 46 */ { .mfb nop __LINE__ (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 nop __LINE__ } ;; /* 47 */ { .mfb nop __LINE__ (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 nop __LINE__ } ;; /* 48 */ { .mfb nop __LINE__ (p3) FMA f115 = f43, f62, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 nop __LINE__ } ;; /* 49 */ { .mfb nop __LINE__ (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f69 = f44, f57, f69 // A5 * B2 nop __LINE__ } ;; /* 50 */ { .mfb nop __LINE__ (p3) FMA f84 = f44, f58, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f85 = f44, f59, f85 // A5 * B4 nop __LINE__ } ;; /* 51 */ { .mfb nop __LINE__ (p3) FMA f100 = f44, f60, f100 // A5 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f101 = f44, f61, f101 // A5 * B6 nop __LINE__ } ;; /* 52 */ { .mfb nop __LINE__ (p3) FMA f116 = f44, f62, f116 // A5 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f117 = f44, f63, f117 // A5 * B8 nop __LINE__ } ;; /* 53 */ { .mfb nop __LINE__ (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f68 = f45, f57, f68 // A6 * B2 nop __LINE__ } ;; /* 54 */ { .mfb nop __LINE__ (p3) FMA f85 = f45, f58, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f84 = f45, f59, f84 // A6 * B4 nop __LINE__ } ;; /* 55 */ { .mfb nop __LINE__ (p3) FMA f101 = f45, f60, f101 // A6 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f100 = f45, f61, f100 // A6 * B6 nop __LINE__ } ;; /* 56 */ { .mfb nop __LINE__ (p3) FMA f117 = f45, f62, f117 // A6 * B7 nop __LINE__ } { .mfb nop 
__LINE__ (p3) FMA_A f116 = f45, f63, f116 // A6 * B8 nop __LINE__ } ;; /* 57 */ { .mfb nop __LINE__ (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f71 = f46, f57, f71 // A7 * B2 nop __LINE__ } ;; /* 58 */ { .mfb nop __LINE__ (p3) FMA f86 = f46, f58, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f87 = f46, f59, f87 // A7 * B4 nop __LINE__ } ;; /* 59 */ { .mfb nop __LINE__ (p3) FMA f102 = f46, f60, f102 // A7 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f103 = f46, f61, f103 // A7 * B6 nop __LINE__ } ;; /* 60 */ { .mfb nop __LINE__ (p3) FMA f118 = f46, f62, f118 // A7 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f119 = f46, f63, f119 // A7 * B8 nop __LINE__ } ;; /* 61 */ { .mfb nop __LINE__ (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f70 = f47, f57, f70 // A8 * B2 nop __LINE__ } ;; /* 62 */ { .mfb nop __LINE__ (p3) FMA f87 = f47, f58, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f86 = f47, f59, f86 // A8 * B4 nop __LINE__ } ;; /* 63 */ { .mfb nop __LINE__ (p3) FMA f103 = f47, f60, f103 // A8 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f102 = f47, f61, f102 // A8 * B6 nop __LINE__ } ;; /* 64 */ { .mfi nop __LINE__ (p3) FMA f119 = f47, f62, f119 // A8 * B7 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f118 = f47, f63, f118 // A8 * B8 br.cloop.sptk.few .L012 } ;; .L018: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -4, KK #else adds r2 = -4, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 2, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [BOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [BOFFSET], 2 * SIZE ;; LDFPD f92, f93 = [BOFFSET], 2 * SIZE ;; { .mfi LDFPD f94, f95 = [BOFFSET], 2 * SIZE FSUB f64 = f72, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f65 = f73, f65 nop __LINE__ } ;; { .mfi LDFPD f104, f105 = [BOFFSET], 2 * SIZE FSUB f80 = f74, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f81 = f75, f81 nop __LINE__ } ;; { .mfi LDFPD f106, f107 = [BOFFSET], 2 * SIZE FSUB f96 = f76, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f97 = f77, f97 nop __LINE__ } ;; { .mfi LDFPD f108, f109 = [BOFFSET], 2 * SIZE FSUB f112 = f78, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f113 = f79, f113 nop __LINE__ } ;; { .mfi LDFPD f110, f111 = [BOFFSET], 2 * SIZE FSUB f66 = f88, f66 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f67 = f89, f67 nop __LINE__ } ;; { .mfi LDFPD f120, f121 = [BOFFSET], 2 * SIZE FSUB f82 = f90, f82 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f83 = f91, f83 nop __LINE__ } ;; { .mfi LDFPD f122, f123 = [BOFFSET], 2 * SIZE FSUB f98 = f92, f98 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f99 = f93, f99 nop __LINE__ } ;; { .mfi LDFPD f124, f125 = [BOFFSET], 2 * SIZE FSUB f114 = f94, f114 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f115 = f95, f115 nop __LINE__ } ;; { .mfi LDFPD f126, f127 = [BOFFSET] FSUB f68 = f104, f68 adds BOFFSET = -30 * SIZE, BOFFSET } { .mfi nop __LINE__ FSUB_A f69 = f105, f69 #ifdef LN adds AOFFSET = 30 * SIZE, AOFFSET #else nop __LINE__ #endif } ;; { .mfi LDFPD f72, f73 = [AOFFSET] FSUB f84 = f106, f84 #ifdef LN adds AOFFSET = - 2 * SIZE, AOFFSET #else adds AOFFSET = 2 * SIZE, AOFFSET #endif } { .mfi nop __LINE__ FSUB_A f85 = f107, f85 nop __LINE__ } ;; { .mfi LDFPD f74, f75 = [AOFFSET] FSUB f100 = f108, f100 #ifdef LN adds 
AOFFSET = - 2 * SIZE, AOFFSET #else adds AOFFSET = 2 * SIZE, AOFFSET #endif } { .mfi nop __LINE__ FSUB_A f101 = f109, f101 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f116 = f110, f116 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f117 = f111, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f70 = f120, f70 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f71 = f121, f71 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f86 = f122, f86 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f87 = f123, f87 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f102 = f124, f102 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f103 = f125, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f118 = f126, f118 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f119 = f127, f119 nop __LINE__ } ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [AOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [AOFFSET], 2 * SIZE ;; { .mfi LDFPD f92, f93 = [AOFFSET], 2 * SIZE FSUB f64 = f72, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB f65 = f73, f65 nop __LINE__ } ;; { .mfi LDFPD f94, f95 = [AOFFSET], 2 * SIZE FSUB f66 = f74, f66 nop __LINE__ } { .mfi nop __LINE__ FSUB f67 = f75, f67 nop __LINE__ } ;; { .mfi LDFPD f104, f105 = [AOFFSET], 2 * SIZE FSUB f68 = f76, f68 nop __LINE__ } { .mfi nop __LINE__ FSUB f69 = f77, f69 nop __LINE__ } ;; { .mfi LDFPD f106, f107 = [AOFFSET], 2 * SIZE FSUB f70 = f78, f70 nop __LINE__ } { .mfi nop __LINE__ FSUB f71 = f79, f71 nop __LINE__ } ;; { .mfi LDFPD f108, f109 = [AOFFSET], 2 * SIZE FSUB f80 = f88, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f81 = f89, f81 nop __LINE__ } ;; { .mfi LDFPD f110, f111 = [AOFFSET], 2 * SIZE FSUB f82 = f90, f82 nop __LINE__ } { .mfi nop __LINE__ FSUB f83 = f91, f83 nop __LINE__ } ;; { .mfi LDFPD f120, f121 = [AOFFSET], 2 * SIZE FSUB f84 = f92, f84 nop __LINE__ } { .mfi nop __LINE__ FSUB f85 = f93, f85 nop __LINE__ } ;; { .mfi LDFPD f122, f123 = [AOFFSET], 2 * SIZE FSUB f86 = f94, f86 nop __LINE__ } { .mfi nop __LINE__ FSUB f87 = f95, f87 nop __LINE__ } ;; { .mfi LDFPD f124, f125 = [AOFFSET], 2 * SIZE FSUB f96 = f104, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f97 = f105, f97 nop __LINE__ } ;; { .mfi LDFPD f126, f127 = [AOFFSET] FSUB f98 = f106, f98 adds AOFFSET = -30 * SIZE, AOFFSET } { .mfi nop __LINE__ FSUB f99 = f107, f99 #ifdef RT adds BOFFSET = 30 * SIZE, BOFFSET #else nop __LINE__ #endif } ;; { .mfi LDFPD f72, f73 = [BOFFSET] FSUB f100 = f108, f100 #ifdef RN adds BOFFSET = 2 * SIZE, BOFFSET #else adds BOFFSET = - 2 * SIZE, BOFFSET #endif } { .mfi nop __LINE__ FSUB f101 = f109, f101 nop __LINE__ } ;; { .mfi LDFPD f74, f75 = [BOFFSET] FSUB f102 = f110, f102 #ifdef RN adds BOFFSET = 2 * SIZE, BOFFSET #else adds BOFFSET = - 2 * SIZE, BOFFSET #endif } { .mfi nop __LINE__ FSUB f103 = f111, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f112 = f120, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f113 = f121, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f114 = f122, f114 nop __LINE__ } { .mfi nop __LINE__ FSUB f115 = f123, f115 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f116 = f124, f116 nop __LINE__ } { .mfi nop __LINE__ FSUB f117 = f125, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f118 = f126, f118 nop __LINE__ } { .mfi nop __LINE__ FSUB f119 = f127, f119 nop __LINE__ } ;; #endif #ifdef LN { .mfi LDFPD f76, f77 = [AOFFSET] FMPY f32 = f72, f70 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f36 = f72, f102 nop __LINE__ } ;; { .mfi LDFPD f78, f79 = [AOFFSET] FMPY f33 = 
f73, f70 adds AOFFSET = - 4 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f37 = f73, f102 nop __LINE__ } ;; { .mfi LDFPD f88, f89 = [AOFFSET] FMPY f34 = f72, f86 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f38 = f72, f118 nop __LINE__ } ;; { .mfi LDFPD f90, f91 = [AOFFSET] FMPY f35 = f73, f86 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f39 = f73, f118 nop __LINE__ } ;; { .mfi LDFPD f92, f93 = [AOFFSET] FMA_C f70 = f73, f71, f32 adds AOFFSET = - 6 * SIZE, AOFFSET } { .mfi nop __LINE__ FMA_C f102 = f73, f103, f36 adds C1 = -2 * SIZE, C1 } ;; { .mfi LDFPD f104, f105 = [AOFFSET] FMA_D f71 = f72, f71, f33 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FMA_D f103 = f72, f103, f37 adds C2 = -2 * SIZE, C2 } ;; { .mfi LDFPD f106, f107 = [AOFFSET] FMA_C f86 = f73, f87, f34 adds AOFFSET = - 8 * SIZE, AOFFSET } { .mfi nop __LINE__ FMA_C f118 = f73, f119, f38 adds C3 = -2 * SIZE, C3 } ;; { .mfi LDFPD f120, f121 = [AOFFSET] FMA_D f87 = f72, f87, f35 adds BOFFSET2 = 28 * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_D f119 = f72, f119, f39 adds BOFFSET = 24 * SIZE, BOFFSET } ;; { .mfi STFD [BOFFSET] = f70, SIZE FNMA f68 = f74, f70, f68 adds C4 = -2 * SIZE, C4 } { .mfi STFD [BOFFSET2] = f102, SIZE FNMA f100 = f74, f102, f100 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f71, SIZE FMA_A f69 = f75, f70, f69 nop __LINE__ } { .mfi STFD [BOFFSET2] = f103, SIZE FMA_A f101 = f75, f102, f101 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f86, SIZE FNMA f84 = f74, f86, f84 nop __LINE__ } { .mfi STFD [BOFFSET2] = f118, SIZE FNMA f116 = f74, f118, f116 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f87, -11 * SIZE FMA_A f85 = f75, f86, f85 nop __LINE__ } { .mfi STFD [BOFFSET2] = f119, -11 * SIZE FMA_A f117 = f75, f118, f117 nop __LINE__ } ;; { .mfi STFD [C1 ] = f70, SIZE FMA_B f68 = f75, f71, f68 nop __LINE__ } { .mfi STFD [C3 ] = f102, SIZE FMA_B f100 = f75, f103, f100 nop __LINE__ } ;; { .mfi STFD [C1 ] = f71, -3 * SIZE FNMA f69 = f74, f71, f69 nop __LINE__ } { .mfi STFD [C3 ] = f103, -3 * SIZE FNMA f101 = f74, f103, f101 nop __LINE__ } ;; { .mfi STFD [C2 ] = f86, SIZE FMA_B f84 = f75, f87, f84 nop __LINE__ } { .mfi STFD [C4 ] = f118, SIZE FMA_B f116 = f75, f119, f116 nop __LINE__ } ;; { .mfi STFD [C2 ] = f87, -3 * SIZE FNMA f85 = f74, f87, f85 nop __LINE__ } { .mfi STFD [C4 ] = f119, -3 * SIZE FNMA f117 = f74, f119, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f66 = f76, f70, f66 nop __LINE__ } { .mfi nop __LINE__ FNMA f98 = f76, f102, f98 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f67 = f77, f70, f67 nop __LINE__ } { .mfi nop __LINE__ FMA_A f99 = f77, f102, f99 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f82 = f76, f86, f82 nop __LINE__ } { .mfi nop __LINE__ FNMA f114 = f76, f118, f114 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f83 = f77, f86, f83 nop __LINE__ } { .mfi nop __LINE__ FMA_A f115 = f77, f118, f115 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f66 = f77, f71, f66 nop __LINE__ } { .mfi nop __LINE__ FMA_B f98 = f77, f103, f98 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f67 = f76, f71, f67 nop __LINE__ } { .mfi nop __LINE__ FNMA f99 = f76, f103, f99 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f82 = f77, f87, f82 nop __LINE__ } { .mfi nop __LINE__ FMA_B f114 = f77, f119, f114 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f83 = f76, f87, f83 nop __LINE__ } { .mfi nop __LINE__ FNMA f115 = f76, f119, f115 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f64 = f78, f70, f64 nop __LINE__ } { .mfi nop __LINE__ FNMA f96 = f78, f102, f96 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f65 = f79, f70, 
f65 nop __LINE__ } { .mfi nop __LINE__ FMA_A f97 = f79, f102, f97 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f80 = f78, f86, f80 nop __LINE__ } { .mfi nop __LINE__ FNMA f112 = f78, f118, f112 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f81 = f79, f86, f81 nop __LINE__ } { .mfi nop __LINE__ FMA_A f113 = f79, f118, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f64 = f79, f71, f64 nop __LINE__ } { .mfi nop __LINE__ FMA_B f96 = f79, f103, f96 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f65 = f78, f71, f65 nop __LINE__ } { .mfi nop __LINE__ FNMA f97 = f78, f103, f97 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f80 = f79, f87, f80 nop __LINE__ } { .mfi nop __LINE__ FMA_B f112 = f79, f119, f112 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f81 = f78, f87, f81 nop __LINE__ } { .mfi nop __LINE__ FNMA f113 = f78, f119, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f88, f68 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f88, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f89, f68 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f89, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f88, f84 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f88, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f89, f84 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f89, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f68 = f89, f69, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f100 = f89, f101, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f69 = f88, f69, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f101 = f88, f101, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f84 = f89, f85, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f116 = f89, f117, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f85 = f88, f85, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f117 = f88, f117, f39 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f68, SIZE FNMA f66 = f90, f68, f66 nop __LINE__ } { .mfi STFD [BOFFSET2] = f100, SIZE FNMA f98 = f90, f100, f98 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f69, SIZE FMA_A f67 = f91, f68, f67 nop __LINE__ } { .mfi STFD [BOFFSET2] = f101, SIZE FMA_A f99 = f91, f100, f99 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f84, SIZE FNMA f82 = f90, f84, f82 nop __LINE__ } { .mfi STFD [BOFFSET2] = f116, SIZE FNMA f114 = f90, f116, f114 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f85, -11 * SIZE FMA_A f83 = f91, f84, f83 nop __LINE__ } { .mfi STFD [BOFFSET2] = f117, -11 * SIZE FMA_A f115 = f91, f116, f115 nop __LINE__ } ;; { .mfi STFD [C1 ] = f68, SIZE FMA_B f66 = f91, f69, f66 nop __LINE__ } { .mfi STFD [C3 ] = f100, SIZE FMA_B f98 = f91, f101, f98 nop __LINE__ } ;; { .mfi STFD [C1 ] = f69, -3 * SIZE FNMA f67 = f90, f69, f67 nop __LINE__ } { .mfi STFD [C3 ] = f101, -3 * SIZE FNMA f99 = f90, f101, f99 nop __LINE__ } ;; { .mfi STFD [C2 ] = f84, SIZE FMA_B f82 = f91, f85, f82 nop __LINE__ } { .mfi STFD [C4 ] = f116, SIZE FMA_B f114 = f91, f117, f114 nop __LINE__ } ;; { .mfi STFD [C2 ] = f85, -3 * SIZE FNMA f83 = f90, f85, f83 nop __LINE__ } { .mfi STFD [C4 ] = f117, -3 * SIZE FNMA f115 = f90, f117, f115 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f64 = f92, f68, f64 nop __LINE__ } { .mfi nop __LINE__ FNMA f96 = f92, f100, f96 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f65 = f93, f68, f65 nop __LINE__ } { .mfi nop __LINE__ FMA_A f97 = f93, f100, f97 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f80 = f92, f84, f80 nop __LINE__ } { .mfi nop __LINE__ FNMA f112 = f92, f116, f112 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f81 = f93, f84, f81 nop __LINE__ } { .mfi nop __LINE__ FMA_A f113 = f93, f116, f113 nop __LINE__ } ;; 
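/*  Annotation (not from the original source): pattern of the solve in this
    LN branch.  Each solution element is scaled by its diagonal factor pair
    (f72,f73), (f88,f89), (f104,f105), (f120,f121) with FMPY plus
    FMA_C/FMA_D, i.e. a complex multiply (the packed diagonal is assumed to
    hold the reciprocal, so no divide is needed here), then eliminated from
    the remaining rows with FNMA/FMA_A/FMA_B pairs, and the result is stored
    both into the packed buffer (BOFFSET/BOFFSET2) and into C1..C4.  For the
    non-CONJ case the diagonal step is (a minimal C sketch, not part of the
    build):

        t_re = d_re * x_re;           //  FMPY
        t_im = d_im * x_re;           //  FMPY
        x_re = t_re - d_im * x_im;    //  FMA_C  (FNMA when CONJ is not defined)
        x_im = t_im + d_re * x_im;    //  FMA_D  (FMA  when CONJ is not defined)
*/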
{ .mfi nop __LINE__ FMA_B f64 = f93, f69, f64 nop __LINE__ } { .mfi nop __LINE__ FMA_B f96 = f93, f101, f96 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f65 = f92, f69, f65 nop __LINE__ } { .mfi nop __LINE__ FNMA f97 = f92, f101, f97 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f80 = f93, f85, f80 nop __LINE__ } { .mfi nop __LINE__ FMA_B f112 = f93, f117, f112 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f81 = f92, f85, f81 nop __LINE__ } { .mfi nop __LINE__ FNMA f113 = f92, f117, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f104, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f104, f98 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f105, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f105, f98 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f104, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f104, f114 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f105, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f105, f114 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f66 = f105, f67, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f98 = f105, f99, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f67 = f104, f67, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f99 = f104, f99, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f82 = f105, f83, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f114 = f105, f115, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f83 = f104, f83, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f115 = f104, f115, f39 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f66, SIZE FNMA f64 = f106, f66, f64 nop __LINE__ } { .mfi STFD [BOFFSET2] = f98, SIZE FNMA f96 = f106, f98, f96 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f67, SIZE FMA_A f65 = f107, f66, f65 nop __LINE__ } { .mfi STFD [BOFFSET2] = f99, SIZE FMA_A f97 = f107, f98, f97 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f82, SIZE FNMA f80 = f106, f82, f80 nop __LINE__ } { .mfi STFD [BOFFSET2] = f114, SIZE FNMA f112 = f106, f114, f112 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f83, -11 * SIZE FMA_A f81 = f107, f82, f81 nop __LINE__ } { .mfi STFD [BOFFSET2] = f115, -11 * SIZE FMA_A f113 = f107, f114, f113 nop __LINE__ } ;; { .mfi STFD [C1 ] = f66, SIZE FMA_B f64 = f107, f67, f64 nop __LINE__ } { .mfi STFD [C3 ] = f98, SIZE FMA_B f96 = f107, f99, f96 nop __LINE__ } ;; { .mfi STFD [C1 ] = f67, -3 * SIZE FNMA f65 = f106, f67, f65 nop __LINE__ } { .mfi STFD [C3 ] = f99, -3 * SIZE FNMA f97 = f106, f99, f97 nop __LINE__ } ;; { .mfi STFD [C2 ] = f82, SIZE FMA_B f80 = f107, f83, f80 nop __LINE__ } { .mfi STFD [C4 ] = f114, SIZE FMA_B f112 = f107, f115, f112 nop __LINE__ } ;; { .mfi STFD [C2 ] = f83, -3 * SIZE FNMA f81 = f106, f83, f81 nop __LINE__ } { .mfi STFD [C4 ] = f115, -3 * SIZE FNMA f113 = f106, f115, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f120, f64 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f120, f96 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f121, f64 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f121, f96 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f120, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f120, f112 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f121, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f121, f112 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f64 = f121, f65, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f96 = f121, f97, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f65 = f120, f65, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f97 = f120, f97, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f80 = f121, f81, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f112 = f121, f113, 
f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f81 = f120, f81, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f113 = f120, f113, f39 nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f97, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f81, -3 * SIZE STFD [BOFFSET2] = f113, -3 * SIZE nop __LINE__ } ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 nop __LINE__ } { .mfi STFD [C3 ] = f96, SIZE mov f96 = f0 nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, -1 * SIZE mov f65 = f0 adds KK = -4, KK } { .mfi STFD [C3 ] = f97, -1 * SIZE mov f97 = f0 nop __LINE__ } ;; { .mfi STFD [C2 ] = f80, SIZE mov f80 = f0 cmp.ne p6, p0 = 1, I } { .mfi STFD [C4 ] = f112, SIZE mov f112 = f0 sub L = K, KK } ;; { .mfi STFD [C2 ] = f81, -1 * SIZE mov f81 = f0 adds I = -1, I } { .mfb STFD [C4 ] = f113, -1 * SIZE mov f113 = f0 (p6) br.cond.dptk .L011 } ;; #endif #ifdef LT { .mfi LDFPD f76, f77 = [AOFFSET], 2 * SIZE FMPY f32 = f72, f64 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f72, f96 nop __LINE__ } ;; { .mfi LDFPD f78, f79 = [AOFFSET] FMPY f33 = f73, f64 adds AOFFSET = 4 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f37 = f73, f96 nop __LINE__ } ;; { .mfi LDFPD f90, f91 = [AOFFSET], 2 * SIZE FMPY f34 = f72, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f72, f112 nop __LINE__ } ;; { .mfi LDFPD f92, f93 = [AOFFSET], 2 * SIZE FMPY f35 = f73, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f73, f112 nop __LINE__ } ;; { .mfi LDFPD f94, f95 = [AOFFSET] FMA_C f64 = f73, f65, f32 adds AOFFSET = 6 * SIZE, AOFFSET } { .mfi nop __LINE__ FMA_C f96 = f73, f97, f36 nop __LINE__ } ;; { .mfi LDFPD f108, f109 = [AOFFSET], 2 * SIZE FMA_D f65 = f72, f65, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f97 = f72, f97, f37 nop __LINE__ } ;; { .mfi LDFPD f110, f111 = [AOFFSET] FMA_C f80 = f73, f81, f34 adds AOFFSET = 8 * SIZE, AOFFSET } { .mfi nop __LINE__ FMA_C f112 = f73, f113, f38 nop __LINE__ } ;; { .mfi LDFPD f126, f127 = [AOFFSET] FMA_D f81 = f72, f81, f35 adds AOFFSET = - 30 * SIZE, AOFFSET } { .mfi nop __LINE__ FMA_D f113 = f72, f113, f39 adds BOFFSET2 = 4 * SIZE, BOFFSET } ;; { .mfi STFD [BOFFSET] = f64, SIZE FNMA f66 = f74, f64, f66 nop __LINE__ } { .mfi STFD [BOFFSET2] = f96, SIZE FNMA f98 = f74, f96, f98 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f65, SIZE FMA_A f67 = f75, f64, f67 nop __LINE__ } { .mfi STFD [BOFFSET2] = f97, SIZE FMA_A f99 = f75, f96, f99 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f80, SIZE FNMA f82 = f74, f80, f82 nop __LINE__ } { .mfi STFD [BOFFSET2] = f112, SIZE FNMA f114 = f74, f112, f114 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f81, 5 * SIZE FMA_A f83 = f75, f80, f83 nop __LINE__ } { .mfi STFD [BOFFSET2] = f113, 5 * SIZE FMA_A f115 = f75, f112, f115 nop __LINE__ } ;; { .mfi STFD [C1 ] = f64, SIZE FMA_B f66 = f75, f65, f66 nop __LINE__ } { .mfi STFD [C3 ] = f96, SIZE FMA_B f98 = f75, f97, f98 nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, SIZE FNMA f67 = f74, f65, f67 nop __LINE__ } { .mfi STFD [C3 ] = f97, SIZE FNMA f99 = f74, f97, f99 nop __LINE__ } ;; { .mfi STFD [C2 ] = f80, SIZE FMA_B f82 = f75, f81, f82 nop __LINE__ } { .mfi STFD [C4 ] = f112, SIZE FMA_B f114 = f75, f113, f114 nop __LINE__ } ;; { .mfi STFD [C2 ] = f81, SIZE FNMA f83 = f74, f81, f83 nop __LINE__ } { .mfi STFD [C4 ] = f113, SIZE FNMA f115 = f74, f113, f115 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f68 = f76, f64, f68 nop __LINE__ } { .mfi nop __LINE__ FNMA f100 = f76, 
f96, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f69 = f77, f64, f69 nop __LINE__ } { .mfi nop __LINE__ FMA_A f101 = f77, f96, f101 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f84 = f76, f80, f84 nop __LINE__ } { .mfi nop __LINE__ FNMA f116 = f76, f112, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f85 = f77, f80, f85 nop __LINE__ } { .mfi nop __LINE__ FMA_A f117 = f77, f112, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f68 = f77, f65, f68 nop __LINE__ } { .mfi nop __LINE__ FMA_B f100 = f77, f97, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f69 = f76, f65, f69 nop __LINE__ } { .mfi nop __LINE__ FNMA f101 = f76, f97, f101 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f84 = f77, f81, f84 nop __LINE__ } { .mfi nop __LINE__ FMA_B f116 = f77, f113, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f85 = f76, f81, f85 nop __LINE__ } { .mfi nop __LINE__ FNMA f117 = f76, f113, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f70 = f78, f64, f70 nop __LINE__ } { .mfi nop __LINE__ FNMA f102 = f78, f96, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f71 = f79, f64, f71 nop __LINE__ } { .mfi nop __LINE__ FMA_A f103 = f79, f96, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f86 = f78, f80, f86 nop __LINE__ } { .mfi nop __LINE__ FNMA f118 = f78, f112, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f87 = f79, f80, f87 nop __LINE__ } { .mfi nop __LINE__ FMA_A f119 = f79, f112, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f70 = f79, f65, f70 nop __LINE__ } { .mfi nop __LINE__ FMA_B f102 = f79, f97, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f71 = f78, f65, f71 nop __LINE__ } { .mfi nop __LINE__ FNMA f103 = f78, f97, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f86 = f79, f81, f86 nop __LINE__ } { .mfi nop __LINE__ FMA_B f118 = f79, f113, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f87 = f78, f81, f87 nop __LINE__ } { .mfi nop __LINE__ FNMA f119 = f78, f113, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f90, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f90, f98 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f91, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f91, f98 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f90, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f90, f114 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f91, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f91, f114 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f66 = f91, f67, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f98 = f91, f99, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f67 = f90, f67, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f99 = f90, f99, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f82 = f91, f83, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f114 = f91, f115, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f83 = f90, f83, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f115 = f90, f115, f39 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f66, SIZE FNMA f68 = f92, f66, f68 nop __LINE__ } { .mfi STFD [BOFFSET2] = f98, SIZE FNMA f100 = f92, f98, f100 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f67, SIZE FMA_A f69 = f93, f66, f69 nop __LINE__ } { .mfi STFD [BOFFSET2] = f99, SIZE FMA_A f101 = f93, f98, f101 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f82, SIZE FNMA f84 = f92, f82, f84 nop __LINE__ } { .mfi STFD [BOFFSET2] = f114, SIZE FNMA f116 = f92, f114, f116 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f83, 5 * SIZE FMA_A f85 = f93, f82, f85 nop __LINE__ } { .mfi STFD [BOFFSET2] = f115, 5 * SIZE FMA_A f117 = f93, f114, f117 nop __LINE__ } ;; { .mfi STFD [C1 ] = f66, SIZE FMA_B 
f68 = f93, f67, f68 nop __LINE__ } { .mfi STFD [C3 ] = f98, SIZE FMA_B f100 = f93, f99, f100 nop __LINE__ } ;; { .mfi STFD [C1 ] = f67, SIZE FNMA f69 = f92, f67, f69 nop __LINE__ } { .mfi STFD [C3 ] = f99, SIZE FNMA f101 = f92, f99, f101 nop __LINE__ } ;; { .mfi STFD [C2 ] = f82, SIZE FMA_B f84 = f93, f83, f84 nop __LINE__ } { .mfi STFD [C4 ] = f114, SIZE FMA_B f116 = f93, f115, f116 nop __LINE__ } ;; { .mfi STFD [C2 ] = f83, SIZE FNMA f85 = f92, f83, f85 nop __LINE__ } { .mfi STFD [C4 ] = f115, SIZE FNMA f117 = f92, f115, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f70 = f94, f66, f70 nop __LINE__ } { .mfi nop __LINE__ FNMA f102 = f94, f98, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f71 = f95, f66, f71 nop __LINE__ } { .mfi nop __LINE__ FMA_A f103 = f95, f98, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f86 = f94, f82, f86 nop __LINE__ } { .mfi nop __LINE__ FNMA f118 = f94, f114, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f87 = f95, f82, f87 nop __LINE__ } { .mfi nop __LINE__ FMA_A f119 = f95, f114, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f70 = f95, f67, f70 nop __LINE__ } { .mfi nop __LINE__ FMA_B f102 = f95, f99, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f71 = f94, f67, f71 nop __LINE__ } { .mfi nop __LINE__ FNMA f103 = f94, f99, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f86 = f95, f83, f86 nop __LINE__ } { .mfi nop __LINE__ FMA_B f118 = f95, f115, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f87 = f94, f83, f87 nop __LINE__ } { .mfi nop __LINE__ FNMA f119 = f94, f115, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f108, f68 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f108, f100 nop __LINE__ } { .mfi nop __LINE__ FMPY f33 = f109, f68 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f109, f100 nop __LINE__ } { .mfi nop __LINE__ FMPY f34 = f108, f84 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f108, f116 nop __LINE__ } { .mfi nop __LINE__ FMPY f35 = f109, f84 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f109, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f68 = f109, f69, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f100 = f109, f101, f36 nop __LINE__ } { .mfi nop __LINE__ FMA_D f69 = f108, f69, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f101 = f108, f101, f37 nop __LINE__ } { .mfi nop __LINE__ FMA_C f84 = f109, f85, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f116 = f109, f117, f38 nop __LINE__ } { .mfi nop __LINE__ FMA_D f85 = f108, f85, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f117 = f108, f117, f39 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f68, SIZE FNMA f70 = f110, f68, f70 nop __LINE__ } { .mfi STFD [BOFFSET2] = f100, SIZE FNMA f102 = f110, f100, f102 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f69, SIZE FMA_A f71 = f111, f68, f71 nop __LINE__ } { .mfi STFD [BOFFSET2] = f101, SIZE FMA_A f103 = f111, f100, f103 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f84, SIZE FNMA f86 = f110, f84, f86 nop __LINE__ } { .mfi STFD [BOFFSET2] = f116, SIZE FNMA f118 = f110, f116, f118 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f85, 5 * SIZE FMA_A f87 = f111, f84, f87 nop __LINE__ } { .mfi STFD [BOFFSET2] = f117, 5 * SIZE FMA_A f119 = f111, f116, f119 nop __LINE__ } ;; { .mfi STFD [C1 ] = f68, SIZE FMA_B f70 = f111, f69, f70 nop __LINE__ } { .mfi STFD [C3 ] = f100, SIZE FMA_B f102 = f111, f101, f102 nop __LINE__ } ;; { .mfi STFD [C1 ] = f69, SIZE FNMA f71 = f110, f69, f71 nop __LINE__ } { .mfi STFD [C3 ] = f101, SIZE FNMA f103 = f110, f101, f103 nop __LINE__ } ;; { .mfi STFD [C2 ] = f84, SIZE FMA_B f86 = f111, f85, f86 nop __LINE__ } { .mfi STFD 
[C4 ] = f116, SIZE FMA_B f118 = f111, f117, f118 nop __LINE__ } ;; { .mfi STFD [C2 ] = f85, SIZE FNMA f87 = f110, f85, f87 nop __LINE__ } { .mfi STFD [C4 ] = f117, SIZE FNMA f119 = f110, f117, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f126, f70 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f126, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f127, f70 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f127, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f126, f86 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f126, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f127, f86 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f127, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f70 = f127, f71, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f102 = f127, f103, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f71 = f126, f71, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f103 = f126, f103, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f86 = f127, f87, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f118 = f127, f119, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f87 = f126, f87, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f119 = f126, f119, f39 nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f70, SIZE STFD [BOFFSET2] = f102, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f71, SIZE STFD [BOFFSET2] = f103, SIZE sub r2 = K, KK } ;; { .mmi STFD [BOFFSET] = f86, SIZE STFD [BOFFSET2] = f118, SIZE adds KK = 4, KK } ;; { .mmi STFD [BOFFSET] = f87, -27 * SIZE STFD [BOFFSET2] = f119 shladd r2 = r2, ZBASE_SHIFT, r0 } ;; { .mfi STFD [C1 ] = f70, SIZE mov f64 = f0 shladd AOFFSET = r2, 2, AOFFSET } { .mfi STFD [C3 ] = f102, SIZE mov f65 = f0 shladd BOFFSET = r2, 2, BOFFSET } ;; { .mfi STFD [C1 ] = f71, SIZE mov f80 = f0 mov L = KK } { .mfi STFD [C3 ] = f103, SIZE mov f81 = f0 nop __LINE__ } ;; { .mfi STFD [C2 ] = f86, SIZE mov f96 = f0 cmp.ne p6, p0 = 1, I } { .mfi STFD [C4 ] = f118, SIZE mov f97 = f0 nop __LINE__ } ;; { .mfi STFD [C2 ] = f87, SIZE mov f112 = f0 adds I = -1, I } { .mfb STFD [C4 ] = f119, SIZE mov f113 = f0 (p6) br.cond.dptk .L011 } ;; #endif #ifdef RN { .mfi LDFPD f76, f77 = [BOFFSET], 2 * SIZE FMPY f32 = f72, f64 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f72, f68 nop __LINE__ } ;; { .mfi LDFPD f78, f79 = [BOFFSET] FMPY f33 = f73, f64 adds BOFFSET = 4 * SIZE, BOFFSET } { .mfi nop __LINE__ FMPY f37 = f73, f68 nop __LINE__ } ;; { .mfi LDFPD f90, f91 = [BOFFSET], 2 * SIZE FMPY f34 = f72, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f72, f70 nop __LINE__ } ;; { .mfi LDFPD f92, f93 = [BOFFSET], 2 * SIZE FMPY f35 = f73, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f73, f70 nop __LINE__ } ;; { .mfi LDFPD f94, f95 = [BOFFSET] FMA_C f64 = f73, f65, f32 adds BOFFSET = 6 * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_C f68 = f73, f69, f36 nop __LINE__ } ;; { .mfi LDFPD f108, f109 = [BOFFSET], 2 * SIZE FMA_D f65 = f72, f65, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f69 = f72, f69, f37 nop __LINE__ } ;; { .mfi LDFPD f110, f111 = [BOFFSET] FMA_C f66 = f73, f67, f34 adds BOFFSET = 8 * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_C f70 = f73, f71, f38 nop __LINE__ } ;; { .mfi LDFPD f126, f127 = [BOFFSET] FMA_D f67 = f72, f67, f35 adds BOFFSET = - 30 * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_D f71 = f72, f71, f39 adds AOFFSET2 = 4 * SIZE, AOFFSET } ;; { .mfi STFD [AOFFSET] = f64, SIZE FNMA f80 = f74, f64, f80 nop __LINE__ } { .mfi STFD [AOFFSET2] = f68, SIZE FNMA f84 = f74, f68, f84 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f65, SIZE FMA_A f81 = f75, f64, f81 nop __LINE__ } { .mfi STFD 
[AOFFSET2] = f69, SIZE FMA_A f85 = f75, f68, f85 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f66, SIZE FNMA f82 = f74, f66, f82 nop __LINE__ } { .mfi STFD [AOFFSET2] = f70, SIZE FNMA f86 = f74, f70, f86 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f67, 5 * SIZE FMA_A f83 = f75, f66, f83 nop __LINE__ } { .mfi STFD [AOFFSET2] = f71, 5 * SIZE FMA_A f87 = f75, f70, f87 nop __LINE__ } ;; { .mfi STFD [C1 ] = f64, SIZE FMA_B f80 = f75, f65, f80 nop __LINE__ } { .mfi STFD [C5 ] = f68, SIZE FMA_B f84 = f75, f69, f84 nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, SIZE FNMA f81 = f74, f65, f81 nop __LINE__ } { .mfi STFD [C5 ] = f69, SIZE FNMA f85 = f74, f69, f85 nop __LINE__ } ;; { .mfi STFD [C1 ] = f66, SIZE FMA_B f82 = f75, f67, f82 nop __LINE__ } { .mfi STFD [C5 ] = f70, SIZE FMA_B f86 = f75, f71, f86 nop __LINE__ } ;; { .mfi STFD [C1 ] = f67, 5 * SIZE FNMA f83 = f74, f67, f83 nop __LINE__ } { .mfi STFD [C5 ] = f71, 5 * SIZE FNMA f87 = f74, f71, f87 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f96 = f76, f64, f96 nop __LINE__ } { .mfi nop __LINE__ FNMA f100 = f76, f68, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f97 = f77, f64, f97 nop __LINE__ } { .mfi nop __LINE__ FMA_A f101 = f77, f68, f101 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f98 = f76, f66, f98 nop __LINE__ } { .mfi nop __LINE__ FNMA f102 = f76, f70, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f99 = f77, f66, f99 nop __LINE__ } { .mfi nop __LINE__ FMA_A f103 = f77, f70, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f96 = f77, f65, f96 nop __LINE__ } { .mfi nop __LINE__ FMA_B f100 = f77, f69, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f97 = f76, f65, f97 nop __LINE__ } { .mfi nop __LINE__ FNMA f101 = f76, f69, f101 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f98 = f77, f67, f98 nop __LINE__ } { .mfi nop __LINE__ FMA_B f102 = f77, f71, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f99 = f76, f67, f99 nop __LINE__ } { .mfi nop __LINE__ FNMA f103 = f76, f71, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f112 = f78, f64, f112 nop __LINE__ } { .mfi nop __LINE__ FNMA f116 = f78, f68, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f113 = f79, f64, f113 nop __LINE__ } { .mfi nop __LINE__ FMA_A f117 = f79, f68, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f114 = f78, f66, f114 nop __LINE__ } { .mfi nop __LINE__ FNMA f118 = f78, f70, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f115 = f79, f66, f115 nop __LINE__ } { .mfi nop __LINE__ FMA_A f119 = f79, f70, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f112 = f79, f65, f112 nop __LINE__ } { .mfi nop __LINE__ FMA_B f116 = f79, f69, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f113 = f78, f65, f113 nop __LINE__ } { .mfi nop __LINE__ FNMA f117 = f78, f69, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f114 = f79, f67, f114 nop __LINE__ } { .mfi nop __LINE__ FMA_B f118 = f79, f71, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f115 = f78, f67, f115 nop __LINE__ } { .mfi nop __LINE__ FNMA f119 = f78, f71, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f90, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f90, f84 nop __LINE__ } { .mfi nop __LINE__ FMPY f33 = f91, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f91, f84 nop __LINE__ } { .mfi nop __LINE__ FMPY f34 = f90, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f90, f86 nop __LINE__ } { .mfi nop __LINE__ FMPY f35 = f91, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f91, f86 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f80 = f91, f81, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f84 = f91, f85, f36 nop 
__LINE__ } { .mfi nop __LINE__ FMA_D f81 = f90, f81, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f85 = f90, f85, f37 nop __LINE__ } { .mfi nop __LINE__ FMA_C f82 = f91, f83, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f86 = f91, f87, f38 nop __LINE__ } { .mfi nop __LINE__ FMA_D f83 = f90, f83, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f87 = f90, f87, f39 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f80, SIZE FNMA f96 = f92, f80, f96 nop __LINE__ } { .mfi STFD [AOFFSET2] = f84, SIZE FNMA f100 = f92, f84, f100 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f81, SIZE FMA_A f97 = f93, f80, f97 nop __LINE__ } { .mfi STFD [AOFFSET2] = f85, SIZE FMA_A f101 = f93, f84, f101 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f82, SIZE FNMA f98 = f92, f82, f98 nop __LINE__ } { .mfi STFD [AOFFSET2] = f86, SIZE FNMA f102 = f92, f86, f102 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f83, 5 * SIZE FMA_A f99 = f93, f82, f99 nop __LINE__ } { .mfi STFD [AOFFSET2] = f87, 5 * SIZE FMA_A f103 = f93, f86, f103 nop __LINE__ } ;; { .mfi STFD [C2 ] = f80, SIZE FMA_B f96 = f93, f81, f96 nop __LINE__ } { .mfi STFD [C6 ] = f84, SIZE FMA_B f100 = f93, f85, f100 nop __LINE__ } ;; { .mfi STFD [C2 ] = f81, SIZE FNMA f97 = f92, f81, f97 nop __LINE__ } { .mfi STFD [C6 ] = f85, SIZE FNMA f101 = f92, f85, f101 nop __LINE__ } ;; { .mfi STFD [C2 ] = f82, SIZE FMA_B f98 = f93, f83, f98 nop __LINE__ } { .mfi STFD [C6 ] = f86, SIZE FMA_B f102 = f93, f87, f102 nop __LINE__ } ;; { .mfi STFD [C2 ] = f83, 5 * SIZE FNMA f99 = f92, f83, f99 nop __LINE__ } { .mfi STFD [C6 ] = f87, 5 * SIZE FNMA f103 = f92, f87, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f112 = f94, f80, f112 nop __LINE__ } { .mfi nop __LINE__ FNMA f116 = f94, f84, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f113 = f95, f80, f113 nop __LINE__ } { .mfi nop __LINE__ FMA_A f117 = f95, f84, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f114 = f94, f82, f114 nop __LINE__ } { .mfi nop __LINE__ FNMA f118 = f94, f86, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f115 = f95, f82, f115 nop __LINE__ } { .mfi nop __LINE__ FMA_A f119 = f95, f86, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f112 = f95, f81, f112 nop __LINE__ } { .mfi nop __LINE__ FMA_B f116 = f95, f85, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f113 = f94, f81, f113 nop __LINE__ } { .mfi nop __LINE__ FNMA f117 = f94, f85, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f114 = f95, f83, f114 nop __LINE__ } { .mfi nop __LINE__ FMA_B f118 = f95, f87, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f115 = f94, f83, f115 nop __LINE__ } { .mfi nop __LINE__ FNMA f119 = f94, f87, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f108, f96 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f108, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f109, f96 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f109, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f108, f98 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f108, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f109, f98 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f109, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f96 = f109, f97, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f100 = f109, f101, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f97 = f108, f97, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f101 = f108, f101, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f98 = f109, f99, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f102 = f109, f103, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f99 = f108, f99, f35 nop __LINE__ } { .mfi nop 
__LINE__ FMA_D f103 = f108, f103, f39 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f96, SIZE FNMA f112 = f110, f96, f112 nop __LINE__ } { .mfi STFD [AOFFSET2] = f100, SIZE FNMA f116 = f110, f100, f116 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f97, SIZE FMA_A f113 = f111, f96, f113 nop __LINE__ } { .mfi STFD [AOFFSET2] = f101, SIZE FMA_A f117 = f111, f100, f117 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f98, SIZE FNMA f114 = f110, f98, f114 nop __LINE__ } { .mfi STFD [AOFFSET2] = f102, SIZE FNMA f118 = f110, f102, f118 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f99, 5 * SIZE FMA_A f115 = f111, f98, f115 nop __LINE__ } { .mfi STFD [AOFFSET2] = f103, 5 * SIZE FMA_A f119 = f111, f102, f119 nop __LINE__ } ;; { .mfi STFD [C3 ] = f96, SIZE FMA_B f112 = f111, f97, f112 nop __LINE__ } { .mfi STFD [C7 ] = f100, SIZE FMA_B f116 = f111, f101, f116 nop __LINE__ } ;; { .mfi STFD [C3 ] = f97, SIZE FNMA f113 = f110, f97, f113 nop __LINE__ } { .mfi STFD [C7 ] = f101, SIZE FNMA f117 = f110, f101, f117 nop __LINE__ } ;; { .mfi STFD [C3 ] = f98, SIZE FMA_B f114 = f111, f99, f114 nop __LINE__ } { .mfi STFD [C7 ] = f102, SIZE FMA_B f118 = f111, f103, f118 nop __LINE__ } ;; { .mfi STFD [C3 ] = f99, 5 * SIZE FNMA f115 = f110, f99, f115 nop __LINE__ } { .mfi STFD [C7 ] = f103, 5 * SIZE FNMA f119 = f110, f103, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f126, f112 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f126, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f127, f112 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f127, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f126, f114 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f126, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f127, f114 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f127, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f112 = f127, f113, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f116 = f127, f117, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f113 = f126, f113, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f117 = f126, f117, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f114 = f127, f115, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f118 = f127, f119, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f115 = f126, f115, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f119 = f126, f119, f39 nop __LINE__ } ;; { .mmi STFD [AOFFSET] = f112, SIZE STFD [AOFFSET2] = f116, SIZE sub r2 = K, KK } ;; { .mmi STFD [AOFFSET] = f113, SIZE STFD [AOFFSET2] = f117, SIZE mov L = KK } ;; { .mmi STFD [AOFFSET] = f114, SIZE STFD [AOFFSET2] = f118, SIZE shladd r2 = r2, ZBASE_SHIFT, r0 } ;; { .mmi STFD [AOFFSET] = f115, -27 * SIZE STFD [AOFFSET2] = f119 nop __LINE__ } ;; { .mfi STFD [C4 ] = f112, SIZE mov f64 = f0 shladd BOFFSET = r2, 2, BOFFSET } { .mfi STFD [C8 ] = f116, SIZE mov f65 = f0 shladd AOFFSET = r2, 2, AOFFSET } ;; { .mfi STFD [C4 ] = f113, SIZE mov f80 = f0 cmp.ne p6, p0 = 1, I } { .mfi STFD [C8 ] = f117, SIZE mov f81 = f0 nop __LINE__ } ;; { .mfi STFD [C4 ] = f114, SIZE mov f96 = f0 adds I = -1, I } { .mfi STFD [C8 ] = f118, SIZE mov f97 = f0 nop __LINE__ } ;; { .mfi STFD [C4 ] = f115, 5 * SIZE mov f112 = f0 nop __LINE__ } { .mfb STFD [C8 ] = f119, 5 * SIZE mov f113 = f0 (p6) br.cond.dptk .L011 } #endif #ifdef RT { .mfi LDFPD f76, f77 = [BOFFSET] FMPY f32 = f72, f112 adds BOFFSET = - 2 * SIZE, BOFFSET } { .mfi nop __LINE__ FMPY f36 = f72, f116 nop __LINE__ } ;; { .mfi LDFPD f78, f79 = [BOFFSET] FMPY f33 = f73, f112 adds BOFFSET = - 4 * SIZE, BOFFSET } { .mfi nop __LINE__ FMPY f37 = f73, f116 nop __LINE__ } ;; { .mfi LDFPD f88, f89 
= [BOFFSET] FMPY f34 = f72, f114 adds BOFFSET = - 2 * SIZE, BOFFSET } { .mfi nop __LINE__ FMPY f38 = f72, f118 nop __LINE__ } ;; { .mfi LDFPD f90, f91 = [BOFFSET] FMPY f35 = f73, f114 adds BOFFSET = - 2 * SIZE, BOFFSET } { .mfi nop __LINE__ FMPY f39 = f73, f118 nop __LINE__ } ;; { .mfi LDFPD f92, f93 = [BOFFSET] FMA_C f112 = f73, f113, f32 adds BOFFSET = - 6 * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_C f116 = f73, f117, f36 nop __LINE__ } ;; { .mfi LDFPD f104, f105 = [BOFFSET] FMA_D f113 = f72, f113, f33 adds BOFFSET = - 2 * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_D f117 = f72, f117, f37 nop __LINE__ } ;; { .mfi LDFPD f106, f107 = [BOFFSET] FMA_C f114 = f73, f115, f34 adds BOFFSET = - 8 * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_C f118 = f73, f119, f38 nop __LINE__ } ;; { .mfi LDFPD f120, f121 = [BOFFSET] FMA_D f115 = f72, f115, f35 adds AOFFSET2 = 28 * SIZE, AOFFSET } { .mfi nop __LINE__ FMA_D f119 = f72, f119, f39 adds AOFFSET = 24 * SIZE, AOFFSET } ;; { .mfi STFD [AOFFSET] = f112, SIZE FNMA f96 = f74, f112, f96 nop __LINE__ } { .mfi STFD [AOFFSET2] = f116, SIZE FNMA f100 = f74, f116, f100 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f113, SIZE FMA_A f97 = f75, f112, f97 nop __LINE__ } { .mfi STFD [AOFFSET2] = f117, SIZE FMA_A f101 = f75, f116, f101 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f114, SIZE FNMA f98 = f74, f114, f98 nop __LINE__ } { .mfi STFD [AOFFSET2] = f118, SIZE FNMA f102 = f74, f118, f102 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f115, -11 * SIZE FMA_A f99 = f75, f114, f99 nop __LINE__ } { .mfi STFD [AOFFSET2] = f119, -11 * SIZE FMA_A f103 = f75, f118, f103 nop __LINE__ } ;; { .mfi STFD [C4 ] = f112, SIZE FMA_B f96 = f75, f113, f96 nop __LINE__ } { .mfi STFD [C8 ] = f116, SIZE FMA_B f100 = f75, f117, f100 nop __LINE__ } ;; { .mfi STFD [C4 ] = f113, SIZE FNMA f97 = f74, f113, f97 nop __LINE__ } { .mfi STFD [C8 ] = f117, SIZE FNMA f101 = f74, f117, f101 nop __LINE__ } ;; { .mfi STFD [C4 ] = f114, SIZE FMA_B f98 = f75, f115, f98 nop __LINE__ } { .mfi STFD [C8 ] = f118, SIZE FMA_B f102 = f75, f119, f102 nop __LINE__ } ;; { .mfi STFD [C4 ] = f115, 5 * SIZE FNMA f99 = f74, f115, f99 nop __LINE__ } { .mfi STFD [C8 ] = f119, 5 * SIZE FNMA f103 = f74, f119, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f80 = f76, f112, f80 nop __LINE__ } { .mfi nop __LINE__ FNMA f84 = f76, f116, f84 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f81 = f77, f112, f81 nop __LINE__ } { .mfi nop __LINE__ FMA_A f85 = f77, f116, f85 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f82 = f76, f114, f82 nop __LINE__ } { .mfi nop __LINE__ FNMA f86 = f76, f118, f86 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f83 = f77, f114, f83 nop __LINE__ } { .mfi nop __LINE__ FMA_A f87 = f77, f118, f87 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f80 = f77, f113, f80 nop __LINE__ } { .mfi nop __LINE__ FMA_B f84 = f77, f117, f84 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f81 = f76, f113, f81 nop __LINE__ } { .mfi nop __LINE__ FNMA f85 = f76, f117, f85 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f82 = f77, f115, f82 nop __LINE__ } { .mfi nop __LINE__ FMA_B f86 = f77, f119, f86 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f83 = f76, f115, f83 nop __LINE__ } { .mfi nop __LINE__ FNMA f87 = f76, f119, f87 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f64 = f78, f112, f64 nop __LINE__ } { .mfi nop __LINE__ FNMA f68 = f78, f116, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f65 = f79, f112, f65 nop __LINE__ } { .mfi nop __LINE__ FMA_A f69 = f79, f116, f69 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f66 = f78, f114, f66 nop __LINE__ } { .mfi nop 
__LINE__ FNMA f70 = f78, f118, f70 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f67 = f79, f114, f67 nop __LINE__ } { .mfi nop __LINE__ FMA_A f71 = f79, f118, f71 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f64 = f79, f113, f64 nop __LINE__ } { .mfi nop __LINE__ FMA_B f68 = f79, f117, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f65 = f78, f113, f65 nop __LINE__ } { .mfi nop __LINE__ FNMA f69 = f78, f117, f69 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f66 = f79, f115, f66 nop __LINE__ } { .mfi nop __LINE__ FMA_B f70 = f79, f119, f70 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f67 = f78, f115, f67 nop __LINE__ } { .mfi nop __LINE__ FNMA f71 = f78, f119, f71 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f88, f96 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f88, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f89, f96 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f89, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f88, f98 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f88, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f89, f98 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f89, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f96 = f89, f97, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f100 = f89, f101, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f97 = f88, f97, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f101 = f88, f101, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f98 = f89, f99, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f102 = f89, f103, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f99 = f88, f99, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f103 = f88, f103, f39 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f96, SIZE FNMA f80 = f90, f96, f80 nop __LINE__ } { .mfi STFD [AOFFSET2] = f100, SIZE FNMA f84 = f90, f100, f84 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f97, SIZE FMA_A f81 = f91, f96, f81 nop __LINE__ } { .mfi STFD [AOFFSET2] = f101, SIZE FMA_A f85 = f91, f100, f85 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f98, SIZE FNMA f82 = f90, f98, f82 nop __LINE__ } { .mfi STFD [AOFFSET2] = f102, SIZE FNMA f86 = f90, f102, f86 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f99, -11 * SIZE FMA_A f83 = f91, f98, f83 nop __LINE__ } { .mfi STFD [AOFFSET2] = f103, -11 * SIZE FMA_A f87 = f91, f102, f87 nop __LINE__ } ;; { .mfi STFD [C3 ] = f96, SIZE FMA_B f80 = f91, f97, f80 nop __LINE__ } { .mfi STFD [C7 ] = f100, SIZE FMA_B f84 = f91, f101, f84 nop __LINE__ } ;; { .mfi STFD [C3 ] = f97, SIZE FNMA f81 = f90, f97, f81 nop __LINE__ } { .mfi STFD [C7 ] = f101, SIZE FNMA f85 = f90, f101, f85 nop __LINE__ } ;; { .mfi STFD [C3 ] = f98, SIZE FMA_B f82 = f91, f99, f82 nop __LINE__ } { .mfi STFD [C7 ] = f102, SIZE FMA_B f86 = f91, f103, f86 nop __LINE__ } ;; { .mfi STFD [C3 ] = f99, 5 * SIZE FNMA f83 = f90, f99, f83 nop __LINE__ } { .mfi STFD [C7 ] = f103, 5 * SIZE FNMA f87 = f90, f103, f87 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f64 = f92, f96, f64 nop __LINE__ } { .mfi nop __LINE__ FNMA f68 = f92, f100, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f65 = f93, f96, f65 nop __LINE__ } { .mfi nop __LINE__ FMA_A f69 = f93, f100, f69 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f66 = f92, f98, f66 nop __LINE__ } { .mfi nop __LINE__ FNMA f70 = f92, f102, f70 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f67 = f93, f98, f67 nop __LINE__ } { .mfi nop __LINE__ FMA_A f71 = f93, f102, f71 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f64 = f93, f97, f64 nop __LINE__ } { .mfi nop __LINE__ FMA_B f68 = f93, f101, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f65 = f92, f97, f65 nop 
__LINE__ } { .mfi nop __LINE__ FNMA f69 = f92, f101, f69 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f66 = f93, f99, f66 nop __LINE__ } { .mfi nop __LINE__ FMA_B f70 = f93, f103, f70 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f67 = f92, f99, f67 nop __LINE__ } { .mfi nop __LINE__ FNMA f71 = f92, f103, f71 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f104, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f104, f84 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f105, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f105, f84 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f104, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f104, f86 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f105, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f105, f86 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f80 = f105, f81, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f84 = f105, f85, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f81 = f104, f81, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f85 = f104, f85, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f82 = f105, f83, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f86 = f105, f87, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f83 = f104, f83, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f87 = f104, f87, f39 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f80, SIZE FNMA f64 = f106, f80, f64 nop __LINE__ } { .mfi STFD [AOFFSET2] = f84, SIZE FNMA f68 = f106, f84, f68 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f81, SIZE FMA_A f65 = f107, f80, f65 nop __LINE__ } { .mfi STFD [AOFFSET2] = f85, SIZE FMA_A f69 = f107, f84, f69 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f82, SIZE FNMA f66 = f106, f82, f66 nop __LINE__ } { .mfi STFD [AOFFSET2] = f86, SIZE FNMA f70 = f106, f86, f70 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f83, -11 * SIZE FMA_A f67 = f107, f82, f67 nop __LINE__ } { .mfi STFD [AOFFSET2] = f87, -11 * SIZE FMA_A f71 = f107, f86, f71 nop __LINE__ } ;; { .mfi STFD [C2 ] = f80, SIZE FMA_B f64 = f107, f81, f64 nop __LINE__ } { .mfi STFD [C6 ] = f84, SIZE FMA_B f68 = f107, f85, f68 nop __LINE__ } ;; { .mfi STFD [C2 ] = f81, SIZE FNMA f65 = f106, f81, f65 nop __LINE__ } { .mfi STFD [C6 ] = f85, SIZE FNMA f69 = f106, f85, f69 nop __LINE__ } ;; { .mfi STFD [C2 ] = f82, SIZE FMA_B f66 = f107, f83, f66 nop __LINE__ } { .mfi STFD [C6 ] = f86, SIZE FMA_B f70 = f107, f87, f70 nop __LINE__ } ;; { .mfi STFD [C2 ] = f83, 5 * SIZE FNMA f67 = f106, f83, f67 nop __LINE__ } { .mfi STFD [C6 ] = f87, 5 * SIZE FNMA f71 = f106, f87, f71 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f120, f64 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f120, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f121, f64 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f121, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f120, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f120, f70 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f121, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f121, f70 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f64 = f121, f65, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f68 = f121, f69, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f65 = f120, f65, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f69 = f120, f69, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f66 = f121, f67, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f70 = f121, f71, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f67 = f120, f67, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f71 = f120, f71, f39 nop __LINE__ } ;; { .mmi STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f68, SIZE shladd 
r2 = K, ZBASE_SHIFT, r0 } ;; { .mmi STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f69, SIZE shladd AORIG = r2, 2, AORIG } ;; { .mmi STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f70, SIZE nop __LINE__ } ;; { .mmi STFD [AOFFSET] = f67, -3 * SIZE STFD [AOFFSET2] = f71 nop __LINE__ } ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 cmp.ne p6, p0 = 1, I } { .mfi STFD [C5 ] = f68, SIZE mov f81 = f0 nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, SIZE mov f65 = f0 nop __LINE__ } { .mfi STFD [C5 ] = f69, SIZE mov f96 = f0 nop __LINE__ } ;; { .mfi STFD [C1 ] = f66, SIZE mov f80 = f0 sub L = K, KK } { .mfi STFD [C5 ] = f70, SIZE mov f97 = f0 nop __LINE__ } ;; { .mfi STFD [C1 ] = f67, 5 * SIZE mov f112 = f0 adds I = -1, I } { .mfb STFD [C5 ] = f71, 5 * SIZE mov f113 = f0 (p6) br.cond.dptk .L011 } ;; #endif .L020: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L030 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 1 + ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f67 = f0 } ;; #else { .mfi shladd BOFFSET = r3, 2, B mov f66 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 shladd AOFFSET = r3, 1, AORIG } ;; #endif ;; adds L = 1, L ;; { .mfi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f82 = f0 tbit.z p12, p0 = L, 0 } { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 shr L = L, 1 } ;; { .mfi (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f98 = f0 adds L = -1, L } { .mfi (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE mov f99 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f114 = f0 mov ar.lc = L } { .mfi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET mov f115 = f0 nop __LINE__ } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L028 ;; .align 16 .L022: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f97 = f32, f53, f97 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f32, f55, f113 // A1 * B8 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f96 = f33, f53, f96 // A2 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f112 = f33, f55, f112 // A2 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { 
.mfb nop __LINE__ FMA_B f67 = f34, f49, f67 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f83 = f34, f51, f83 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f99 = f34, f53, f99 // A3 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f115 = f34, f55, f115 // A3 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f66 = f35, f49, f66 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f82 = f35, f51, f82 // A4 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f98 = f35, f53, f98 // A4 * B6 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f114 = f35, f55, f114 // A4 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f98 = f42, f60, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f98 = f43, f61, f98 // A4 
* B6 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f115 = f43, f62, f115 // A4 * B7 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 br.cloop.sptk.few .L022 } ;; .L028: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -2, KK #else adds r2 = -4, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 2, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [BOFFSET], 2 * SIZE ;; LDFPD f104, f105 = [BOFFSET], 2 * SIZE ;; LDFPD f106, f107 = [BOFFSET], 2 * SIZE ;; { .mfi LDFPD f120, f121 = [BOFFSET], 2 * SIZE FSUB f64 = f72, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f65 = f73, f65 nop __LINE__ } ;; { .mfi LDFPD f122, f123 = [BOFFSET] FSUB f80 = f74, f80 adds BOFFSET = -14 * SIZE, BOFFSET } { .mfi nop __LINE__ FSUB_A f81 = f75, f81 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f96 = f88, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f97 = f89, f97 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f112 = f90, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f113 = f91, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f66 = f104, f66 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f67 = f105, f67 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f82 = f106, f82 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f83 = f107, f83 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f98 = f120, f98 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f99 = f121, f99 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f114 = f122, f114 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f115 = f123, f115 nop __LINE__ } ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [AOFFSET], 2 * SIZE ;; LDFPD f104, f105 = [AOFFSET], 2 * SIZE ;; LDFPD f106, f107 = [AOFFSET], 2 * SIZE ;; { .mfi LDFPD f120, f121 = [AOFFSET], 2 * SIZE FSUB f64 = f72, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB f65 = f73, f65 nop __LINE__ } ;; { .mfi LDFPD f122, f123 = [AOFFSET] FSUB f66 = f74, f66 adds AOFFSET = -14 * SIZE, AOFFSET } { .mfi nop __LINE__ FSUB f67 = f75, f67 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f80 = f88, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f81 = f89, f81 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f82 = f90, f82 nop __LINE__ } { .mfi nop __LINE__ FSUB f83 = f91, f83 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f96 = f104, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f97 = f105, f97 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f98 = f106, f98 nop __LINE__ } { .mfi nop __LINE__ FSUB f99 = f107, f99 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f112 = f120, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f113 = f121, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f114 = f122, f114 nop __LINE__ } { .mfi nop __LINE__ FSUB f115 = f123, f115 nop __LINE__ } ;; #endif #ifdef LN adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f104, f105 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f106, f107 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f104, f66 FMPY f33 = f105, f66 FMPY f34 = f104, f82 FMPY f35 = f105, f82 FMPY f36 = f104, f98 FMPY f37 = f105, f98 FMPY f38 = f104, f114 FMPY f39 = f105, f114 ;; FMA_C f66 = f105, f67, f32 FMA_D f67 = f104, f67, f33 FMA_C f82 = f105, f83, f34 FMA_D f83 = f104, f83, f35 FMA_C f98 = f105, f99, f36 FMA_D f99 = f104, f99, f37 FMA_C f114 = f105, f115, f38 FMA_D f115 = f104, f115, f39 ;; FNMA f64 = f106, f66, f64 FMA_A f65 
= f107, f66, f65 FNMA f80 = f106, f82, f80 FMA_A f81 = f107, f82, f81 FNMA f96 = f106, f98, f96 FMA_A f97 = f107, f98, f97 FNMA f112 = f106, f114, f112 FMA_A f113 = f107, f114, f113 ;; FMA_B f64 = f107, f67, f64 FNMA f65 = f106, f67, f65 FMA_B f80 = f107, f83, f80 FNMA f81 = f106, f83, f81 FMA_B f96 = f107, f99, f96 FNMA f97 = f106, f99, f97 FMA_B f112 = f107, f115, f112 FNMA f113 = f106, f115, f113 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f80 FMPY f35 = f121, f80 FMPY f36 = f120, f96 FMPY f37 = f121, f96 FMPY f38 = f120, f112 FMPY f39 = f121, f112 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f80 = f121, f81, f34 FMA_D f81 = f120, f81, f35 FMA_C f96 = f121, f97, f36 FMA_D f97 = f120, f97, f37 FMA_C f112 = f121, f113, f38 FMA_D f113 = f120, f113, f39 ;; #endif #ifdef LT LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f80 FMPY f35 = f73, f80 FMPY f36 = f72, f96 FMPY f37 = f73, f96 FMPY f38 = f72, f112 FMPY f39 = f73, f112 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f80 = f73, f81, f34 FMA_D f81 = f72, f81, f35 FMA_C f96 = f73, f97, f36 FMA_D f97 = f72, f97, f37 FMA_C f112 = f73, f113, f38 FMA_D f113 = f72, f113, f39 ;; FNMA f66 = f74, f64, f66 FMA_A f67 = f75, f64, f67 FNMA f82 = f74, f80, f82 FMA_A f83 = f75, f80, f83 FNMA f98 = f74, f96, f98 FMA_A f99 = f75, f96, f99 FNMA f114 = f74, f112, f114 FMA_A f115 = f75, f112, f115 ;; FMA_B f66 = f75, f65, f66 FNMA f67 = f74, f65, f67 FMA_B f82 = f75, f81, f82 FNMA f83 = f74, f81, f83 FMA_B f98 = f75, f97, f98 FNMA f99 = f74, f97, f99 FMA_B f114 = f75, f113, f114 FNMA f115 = f74, f113, f115 ;; FMPY f32 = f90, f66 FMPY f33 = f91, f66 FMPY f34 = f90, f82 FMPY f35 = f91, f82 FMPY f36 = f90, f98 FMPY f37 = f91, f98 FMPY f38 = f90, f114 FMPY f39 = f91, f114 ;; FMA_C f66 = f91, f67, f32 FMA_D f67 = f90, f67, f33 FMA_C f82 = f91, f83, f34 FMA_D f83 = f90, f83, f35 FMA_C f98 = f91, f99, f36 FMA_D f99 = f90, f99, f37 FMA_C f114 = f91, f115, f38 FMA_D f115 = f90, f115, f39 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [BOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET], 2 * SIZE ;; LDFPD f92, f93 = [BOFFSET], 2 * SIZE ;; LDFPD f94, f95 = [BOFFSET] adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f108, f109 = [BOFFSET], 2 * SIZE ;; LDFPD f110, f111 = [BOFFSET] adds BOFFSET = 8 * SIZE, BOFFSET ;; LDFPD f126, f127 = [BOFFSET] adds BOFFSET = - 30 * SIZE, BOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f66 FMPY f35 = f73, f66 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f66 = f73, f67, f34 FMA_D f67 = f72, f67, f35 ;; FNMA f80 = f74, f64, f80 FMA_A f81 = f75, f64, f81 FNMA f82 = f74, f66, f82 FMA_A f83 = f75, f66, f83 ;; FMA_B f80 = f75, f65, f80 FNMA f81 = f74, f65, f81 FMA_B f82 = f75, f67, f82 FNMA f83 = f74, f67, f83 ;; FNMA f96 = f76, f64, f96 FMA_A f97 = f77, f64, f97 FNMA f98 = f76, f66, f98 FMA_A f99 = f77, f66, f99 ;; FMA_B f96 = f77, f65, f96 FNMA f97 = f76, f65, f97 FMA_B f98 = f77, f67, f98 FNMA f99 = f76, f67, f99 ;; FNMA f112 = f78, f64, f112 FMA_A f113 = f79, f64, f113 FNMA f114 = f78, f66, f114 FMA_A f115 = f79, f66, f115 ;; FMA_B f112 = f79, f65, f112 FNMA f113 = f78, f65, f113 FMA_B f114 = f79, f67, f114 FNMA f115 = f78, f67, f115 ;; FMPY f32 = f90, f80 FMPY 
f33 = f91, f80 FMPY f34 = f90, f82 FMPY f35 = f91, f82 ;; FMA_C f80 = f91, f81, f32 FMA_D f81 = f90, f81, f33 FMA_C f82 = f91, f83, f34 FMA_D f83 = f90, f83, f35 ;; FNMA f96 = f92, f80, f96 FMA_A f97 = f93, f80, f97 FNMA f98 = f92, f82, f98 FMA_A f99 = f93, f82, f99 ;; FMA_B f96 = f93, f81, f96 FNMA f97 = f92, f81, f97 FMA_B f98 = f93, f83, f98 FNMA f99 = f92, f83, f99 ;; FNMA f112 = f94, f80, f112 FMA_A f113 = f95, f80, f113 FNMA f114 = f94, f82, f114 FMA_A f115 = f95, f82, f115 ;; FMA_B f112 = f95, f81, f112 FNMA f113 = f94, f81, f113 FMA_B f114 = f95, f83, f114 FNMA f115 = f94, f83, f115 ;; FMPY f32 = f108, f96 FMPY f33 = f109, f96 FMPY f34 = f108, f98 FMPY f35 = f109, f98 ;; FMA_C f96 = f109, f97, f32 FMA_D f97 = f108, f97, f33 FMA_C f98 = f109, f99, f34 FMA_D f99 = f108, f99, f35 ;; FNMA f112 = f110, f96, f112 FMA_A f113 = f111, f96, f113 FNMA f114 = f110, f98, f114 FMA_A f115 = f111, f98, f115 ;; FMA_B f112 = f111, f97, f112 FNMA f113 = f110, f97, f113 FMA_B f114 = f111, f99, f114 FNMA f115 = f110, f99, f115 ;; FMPY f32 = f126, f112 FMPY f33 = f127, f112 FMPY f34 = f126, f114 FMPY f35 = f127, f114 ;; FMA_C f112 = f127, f113, f32 FMA_D f113 = f126, f113, f33 FMA_C f114 = f127, f115, f34 FMA_D f115 = f126, f115, f35 ;; #endif #ifdef RT adds BOFFSET = 30 * SIZE, BOFFSET ;; LDFPD f72, f73 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f74, f75 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f76, f77 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f78, f79 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f88, f89 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f92, f93 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFPD f104, f105 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f106, f107 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFPD f120, f121 = [BOFFSET] ;; FMPY f32 = f72, f112 FMPY f33 = f73, f112 FMPY f34 = f72, f114 FMPY f35 = f73, f114 ;; FMA_C f112 = f73, f113, f32 FMA_D f113 = f72, f113, f33 FMA_C f114 = f73, f115, f34 FMA_D f115 = f72, f115, f35 ;; FNMA f96 = f74, f112, f96 FMA_A f97 = f75, f112, f97 FNMA f98 = f74, f114, f98 FMA_A f99 = f75, f114, f99 ;; FMA_B f96 = f75, f113, f96 FNMA f97 = f74, f113, f97 FMA_B f98 = f75, f115, f98 FNMA f99 = f74, f115, f99 ;; FNMA f80 = f76, f112, f80 FMA_A f81 = f77, f112, f81 FNMA f82 = f76, f114, f82 FMA_A f83 = f77, f114, f83 ;; FMA_B f80 = f77, f113, f80 FNMA f81 = f76, f113, f81 FMA_B f82 = f77, f115, f82 FNMA f83 = f76, f115, f83 ;; FNMA f64 = f78, f112, f64 FMA_A f65 = f79, f112, f65 FNMA f66 = f78, f114, f66 FMA_A f67 = f79, f114, f67 ;; FMA_B f64 = f79, f113, f64 FNMA f65 = f78, f113, f65 FMA_B f66 = f79, f115, f66 FNMA f67 = f78, f115, f67 ;; FMPY f32 = f88, f96 FMPY f33 = f89, f96 FMPY f34 = f88, f98 FMPY f35 = f89, f98 ;; FMA_C f96 = f89, f97, f32 FMA_D f97 = f88, f97, f33 FMA_C f98 = f89, f99, f34 FMA_D f99 = f88, f99, f35 ;; FNMA f80 = f90, f96, f80 FMA_A f81 = f91, f96, f81 FNMA f82 = f90, f98, f82 FMA_A f83 = f91, f98, f83 ;; FMA_B f80 = f91, f97, f80 FNMA f81 = f90, f97, f81 FMA_B f82 = f91, f99, f82 FNMA f83 = f90, f99, f83 ;; FNMA f64 = f92, f96, f64 FMA_A f65 = f93, f96, f65 FNMA f66 = f92, f98, f66 FMA_A f67 = f93, f98, f67 ;; FMA_B f64 = f93, f97, f64 FNMA f65 = f92, f97, f65 FMA_B f66 = f93, f99, f66 FNMA f67 = f92, f99, f67 ;; FMPY f32 = f104, f80 FMPY f33 = f105, f80 FMPY f34 = f104, f82 FMPY f35 = f105, f82 ;; FMA_C f80 = f105, f81, f32 FMA_D f81 = f104, f81, f33 FMA_C f82 = f105, f83, f34 
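// Added note: the recurring FMPY ;; FMA_C/FMA_D groups in these solve blocks
// rescale an already-solved complex pair by what appears to be a pre-inverted
// diagonal entry (no divide occurs anywhere in the block, only multiplies), and
// the FNMA/FMA_A/FMA_B groups that follow subtract that pair's contribution
// from the pairs still to be solved.  The real/imaginary sign handling lives in
// the FMA_A/FMA_B/FMA_C/FMA_D macros selected at build time; this comment is
// inferred from the surrounding code, not from those macro definitions.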
FMA_D f83 = f104, f83, f35 ;; FNMA f64 = f106, f80, f64 FMA_A f65 = f107, f80, f65 FNMA f66 = f106, f82, f66 FMA_A f67 = f107, f82, f67 ;; FMA_B f64 = f107, f81, f64 FNMA f65 = f106, f81, f65 FMA_B f66 = f107, f83, f66 FNMA f67 = f106, f83, f67 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f66 FMPY f35 = f121, f66 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f66 = f121, f67, f34 FMA_D f67 = f120, f67, f35 ;; #endif #if defined(LN) || defined(LT) adds BOFFSET2 = 4 * SIZE, BOFFSET ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f97, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE ;; STFD [BOFFSET] = f81, 5 * SIZE STFD [BOFFSET2] = f113, 5 * SIZE ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f98, SIZE ;; STFD [BOFFSET] = f67, SIZE STFD [BOFFSET2] = f99, SIZE ;; STFD [BOFFSET] = f82, SIZE STFD [BOFFSET2] = f114, SIZE ;; STFD [BOFFSET] = f83, 5 * SIZE STFD [BOFFSET2] = f115, 5 * SIZE ;; adds BOFFSET = - 16 * SIZE, BOFFSET ;; #else adds AOFFSET2 = 4 * SIZE, AOFFSET ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f80, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f81, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f82, SIZE ;; STFD [AOFFSET] = f67, 5 * SIZE STFD [AOFFSET2] = f83, 5 * SIZE ;; STFD [AOFFSET] = f96, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f97, SIZE STFD [AOFFSET2] = f113, SIZE ;; STFD [AOFFSET] = f98, SIZE STFD [AOFFSET2] = f114, SIZE ;; STFD [AOFFSET] = f99, 5 * SIZE STFD [AOFFSET2] = f115, 5 * SIZE ;; adds AOFFSET = - 16 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -4 * SIZE, C1 adds C2 = -4 * SIZE, C2 adds C3 = -4 * SIZE, C3 adds C4 = -4 * SIZE, C4 #endif ;; STFD [C1 ] = f64, SIZE ;; STFD [C1 ] = f65, SIZE ;; STFD [C1 ] = f66, SIZE ;; STFD [C1 ] = f67, SIZE ;; STFD [C2 ] = f80, SIZE ;; STFD [C2 ] = f81, SIZE ;; STFD [C2 ] = f82, SIZE ;; STFD [C2 ] = f83, SIZE ;; STFD [C3 ] = f96, SIZE ;; STFD [C3 ] = f97, SIZE ;; STFD [C3 ] = f98, SIZE ;; STFD [C3 ] = f99, SIZE ;; STFD [C4 ] = f112, SIZE ;; STFD [C4 ] = f113, SIZE ;; STFD [C4 ] = f114, SIZE ;; STFD [C4 ] = f115, SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;; #ifdef LN adds C1 = -4 * SIZE, C1 adds C2 = -4 * SIZE, C2 adds C3 = -4 * SIZE, C3 adds C4 = -4 * SIZE, C4 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 1, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; shladd AOFFSET = L, 1, AOFFSET shladd BOFFSET = L, 2, BOFFSET #endif ;; #ifdef LT adds KK = 2, KK #elif defined LN adds KK = -2, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; .align 16 .L030: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L049 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f72 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f73 = f0 } ;; #else { .mfi shladd BOFFSET = r3, 2, B mov f72 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f73 = f0 add AOFFSET = r3, AORIG } ;; #endif ;; adds L = 1, L ;; { .mmi nop __LINE__ adds PREB = (PREFETCHSIZE + 0) 
* SIZE, BOFFSET tbit.z p12, p0 = L, 0 } ;; { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f88 = f0 shr L = L, 1 } { .mfi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f89 = f0 nop __LINE__ } ;; { .mfi (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE mov f104 = f0 adds L = -1, L } { .mfb adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET mov f105 = f0 nop __LINE__ } ;; { .mfi (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f120 = f0 mov ar.lc = L } { .mfi cmp.eq p3, p0 = r0, r0 mov f121 = f0 nop __LINE__ } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L038 ;; .align 16 .L032: { .mfb lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f97 = f32, f53, f97 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f32, f55, f113 // A1 * B8 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f96 = f33, f53, f96 // A2 * B6 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f112 = f33, f55, f112 // A2 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f113 = f41, f62, f113 // A2 * B7 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 br.cloop.sptk.few .L032 } ;; .L038: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -1, KK #else adds r2 = -4, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 2, B ;; #endif #if defined(LN) 
|| defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET], 2 * SIZE ;; LDFPD f104, f105 = [BOFFSET], 2 * SIZE ;; LDFPD f120, f121 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 FSUB f80 = f88, f80 FSUB_A f81 = f89, f81 FSUB f96 = f104, f96 FSUB_A f97 = f105, f97 FSUB f112 = f120, f112 FSUB_A f113 = f121, f113 ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET], 2 * SIZE ;; LDFPD f104, f105 = [AOFFSET], 2 * SIZE ;; LDFPD f120, f121 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 FSUB f80 = f88, f80 FSUB f81 = f89, f81 FSUB f96 = f104, f96 FSUB f97 = f105, f97 FSUB f112 = f120, f112 FSUB f113 = f121, f113 ;; #endif #ifdef LN LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f80 FMPY f35 = f121, f80 FMPY f36 = f120, f96 FMPY f37 = f121, f96 FMPY f38 = f120, f112 FMPY f39 = f121, f112 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f80 = f121, f81, f34 FMA_D f81 = f120, f81, f35 FMA_C f96 = f121, f97, f36 FMA_D f97 = f120, f97, f37 FMA_C f112 = f121, f113, f38 FMA_D f113 = f120, f113, f39 ;; #endif #ifdef LT LDFPD f90, f91 = [AOFFSET] ;; FMPY f32 = f90, f64 FMPY f33 = f91, f64 FMPY f34 = f90, f80 FMPY f35 = f91, f80 FMPY f36 = f90, f96 FMPY f37 = f91, f96 FMPY f38 = f90, f112 FMPY f39 = f91, f112 ;; FMA_C f64 = f91, f65, f32 FMA_D f65 = f90, f65, f33 FMA_C f80 = f91, f81, f34 FMA_D f81 = f90, f81, f35 FMA_C f96 = f91, f97, f36 FMA_D f97 = f90, f97, f37 FMA_C f112 = f91, f113, f38 FMA_D f113 = f90, f113, f39 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [BOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET], 2 * SIZE ;; LDFPD f92, f93 = [BOFFSET], 2 * SIZE ;; LDFPD f94, f95 = [BOFFSET] adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f108, f109 = [BOFFSET], 2 * SIZE ;; LDFPD f110, f111 = [BOFFSET] adds BOFFSET = 8 * SIZE, BOFFSET ;; LDFPD f126, f127 = [BOFFSET] adds BOFFSET = - 30 * SIZE, BOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; FNMA f80 = f74, f64, f80 FMA_A f81 = f75, f64, f81 ;; FMA_B f80 = f75, f65, f80 FNMA f81 = f74, f65, f81 ;; FNMA f96 = f76, f64, f96 FMA_A f97 = f77, f64, f97 ;; FMA_B f96 = f77, f65, f96 FNMA f97 = f76, f65, f97 ;; FNMA f112 = f78, f64, f112 FMA_A f113 = f79, f64, f113 ;; FMA_B f112 = f79, f65, f112 FNMA f113 = f78, f65, f113 ;; FMPY f32 = f90, f80 FMPY f33 = f91, f80 ;; FMA_C f80 = f91, f81, f32 FMA_D f81 = f90, f81, f33 ;; FNMA f96 = f92, f80, f96 FMA_A f97 = f93, f80, f97 ;; FMA_B f96 = f93, f81, f96 FNMA f97 = f92, f81, f97 ;; FNMA f112 = f94, f80, f112 FMA_A f113 = f95, f80, f113 ;; FMA_B f112 = f95, f81, f112 FNMA f113 = f94, f81, f113 ;; FMPY f32 = f108, f96 FMPY f33 = f109, f96 ;; FMA_C f96 = f109, f97, f32 FMA_D f97 = f108, f97, f33 ;; FNMA f112 = f110, f96, f112 FMA_A f113 = f111, f96, f113 ;; FMA_B f112 = f111, f97, f112 FNMA f113 = f110, f97, f113 ;; FMPY f32 = f126, f112 FMPY f33 = f127, f112 ;; FMA_C f112 = f127, f113, f32 FMA_D f113 = f126, f113, f33 ;; #endif #ifdef RT adds BOFFSET = 30 * SIZE, BOFFSET ;; LDFPD f72, f73 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f74, f75 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f76, f77 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f78, f79 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f88, f89 = 
[BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f92, f93 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFPD f104, f105 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f106, f107 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFPD f120, f121 = [BOFFSET] ;; FMPY f32 = f72, f112 FMPY f33 = f73, f112 ;; FMA_C f112 = f73, f113, f32 FMA_D f113 = f72, f113, f33 ;; FNMA f96 = f74, f112, f96 FMA_A f97 = f75, f112, f97 ;; FMA_B f96 = f75, f113, f96 FNMA f97 = f74, f113, f97 ;; FNMA f80 = f76, f112, f80 FMA_A f81 = f77, f112, f81 ;; FMA_B f80 = f77, f113, f80 FNMA f81 = f76, f113, f81 ;; FNMA f64 = f78, f112, f64 FMA_A f65 = f79, f112, f65 ;; FMA_B f64 = f79, f113, f64 FNMA f65 = f78, f113, f65 ;; FMPY f32 = f88, f96 FMPY f33 = f89, f96 ;; FMA_C f96 = f89, f97, f32 FMA_D f97 = f88, f97, f33 ;; FNMA f80 = f90, f96, f80 FMA_A f81 = f91, f96, f81 ;; FMA_B f80 = f91, f97, f80 FNMA f81 = f90, f97, f81 ;; FNMA f64 = f92, f96, f64 FMA_A f65 = f93, f96, f65 ;; FMA_B f64 = f93, f97, f64 FNMA f65 = f92, f97, f65 ;; FMPY f32 = f104, f80 FMPY f33 = f105, f80 ;; FMA_C f80 = f105, f81, f32 FMA_D f81 = f104, f81, f33 ;; FNMA f64 = f106, f80, f64 FMA_A f65 = f107, f80, f65 ;; FMA_B f64 = f107, f81, f64 FNMA f65 = f106, f81, f65 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 ;; #endif #if defined(LN) || defined(LT) adds BOFFSET2 = 4 * SIZE, BOFFSET ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f97, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE ;; STFD [BOFFSET] = f81, 5 * SIZE STFD [BOFFSET2] = f113, 5 * SIZE ;; adds BOFFSET = - 8 * SIZE, BOFFSET ;; #else adds AOFFSET2 = 4 * SIZE, AOFFSET ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f96, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f97, SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f81, 5 * SIZE STFD [AOFFSET2] = f113, 5 * SIZE ;; adds AOFFSET = - 8 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -2 * SIZE, C1 adds C2 = -2 * SIZE, C2 adds C3 = -2 * SIZE, C3 adds C4 = -2 * SIZE, C4 #endif ;; STFD [C1 ] = f64, SIZE ;; STFD [C1 ] = f65, SIZE ;; STFD [C2 ] = f80, SIZE ;; STFD [C2 ] = f81, SIZE ;; STFD [C3 ] = f96, SIZE ;; STFD [C3 ] = f97, SIZE ;; STFD [C4 ] = f112, SIZE ;; STFD [C4 ] = f113, SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;; #ifdef LN adds C1 = -2 * SIZE, C1 adds C2 = -2 * SIZE, C2 adds C3 = -2 * SIZE, C3 adds C4 = -2 * SIZE, C4 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT add AORIG = r2, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; add AOFFSET = L, AOFFSET shladd BOFFSET = L, 2, BOFFSET #endif ;; #ifdef LT adds KK = 1, KK #elif defined LN adds KK = -1, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; .align 16 .L049: #ifdef LN shladd KK8 = K, ZBASE_SHIFT, r0 ;; shladd B = KK8, 2, B #endif #if defined(LT) || defined(RN) mov B = BOFFSET #endif #ifdef RN adds KK = 4, KK #endif #ifdef RT adds KK = -4, KK #endif ;; { .mmb mov AOFFSET = A cmp.lt p6, p0 = 0, J (p6) br.cond.dptk .L010 } ;; .align 16 .L050: { .mmi shr I = M, 2 } { .mib tbit.z p6, p0 = N, 1 (p6) br.cond.dpnt .L090 } ;; #ifdef RT { .mmi shladd r3 = LDC, 1, r0 nop __LINE__ shl r2 = K, 1 + ZBASE_SHIFT } ;; { .mmi 
sub B = B, r2 sub C = C, r3 nop __LINE__ } ;; #endif mov C1 = C add C2 = LDC, C ;; #ifdef LN add KK = M, OFFSET #elif defined LT mov KK = OFFSET #else nop __LINE__ #endif ;; #if defined(LN) || defined(RT) mov AORIG = A #else mov AOFFSET = A #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; { .mib cmp.eq p6, p7 = 0, I #ifndef RT shladd C = LDC, 1, C #else nop __LINE__ #endif (p6) br.cond.dpnt .L060 } ;; .align 16 .L052: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 2 + ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f67 = f0 } ;; #else { .mfi shladd BOFFSET = r3, 1, B mov f66 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 shladd AOFFSET = r3, 2, AORIG } ;; #endif { .mfi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f82 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 nop __LINE__ } ;; { .mfi (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f98 = f0 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } { .mfi cmp.eq p3, p0 = r0, r0 mov f99 = f0 adds L = 1, L } ;; { .mfi (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE mov f114 = f0 tbit.z p12, p0 = L, 0 } { .mfi CPREFETCH [PREC], LDC mov f115 = f0 shr L = L, 1 } ;; { .mmi (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE adds C5 = 4 * SIZE, C1 adds L = -1, L } ;; { .mmi CPREFETCH [PREC], LDC adds C6 = 4 * SIZE, C2 mov ar.lc = L } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L058 ;; .align 16 .L053: { .mfb lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f96 = f34, f48, f96 // A3 * B1 nop __LINE__ } { .mfi FMA_B f97 = f34, f49, f97 // A3 * B2 nop __LINE__ } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f112 = f34, f50, f112 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f34, f51, f113 // A3 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f97 = f35, f48, f97 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f96 = f35, f49, f96 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f113 = f35, f50, f113 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f112 = f35, f51, f112 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f66 = f36, f48, f66 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f67 = f36, f49, f67 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f82 = f36, f50, f82 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f83 = f36, f51, f83 // A5 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f98 = f38, f48, f98 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f99 = f38, f49, f99 // A7 * B2 nop __LINE__ } ;; { .mfb nop 
__LINE__ FMA f114 = f38, f50, f114 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f115 = f38, f51, f115 // A7 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f67 = f37, f48, f67 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f66 = f37, f49, f66 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f83 = f37, f50, f83 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f82 = f37, f51, f82 // A6 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f99 = f39, f48, f99 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f98 = f39, f49, f98 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f115 = f39, f50, f115 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f114 = f39, f51, f114 // A8 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f96 = f42, f56, f96 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f112 = f42, f58, f112 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f43, f56, f97 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f113 = f43, f58, f113 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f66 = f44, f56, f66 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f67 = f44, f57, f67 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f82 = f44, f58, f82 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f83 = f44, f59, f83 // A5 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f98 = f46, f56, f98 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f99 = f46, f57, f99 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f114 = f46, f58, f114 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f115 = f46, f59, f115 // A7 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f67 = f45, f56, f67 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f66 = f45, f57, f66 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f83 = f45, f58, f83 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f82 = f45, f59, f82 // A6 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f99 = f47, f56, f99 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f98 = f47, f57, f98 // A8 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f115 = f47, f58, f115 // A8 * B3 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f114 = f47, f59, f114 // A8 * B4 br.cloop.sptk.few .L053 } ;; .L058: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -4, KK #else adds r2 = -2, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; shladd AOFFSET = 
r2, 2, AORIG shladd BOFFSET = r2, 1, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [BOFFSET], 2 * SIZE ;; LDFPD f104, f105 = [BOFFSET], 2 * SIZE ;; LDFPD f106, f107 = [BOFFSET], 2 * SIZE ;; LDFPD f120, f121 = [BOFFSET], 2 * SIZE ;; LDFPD f122, f123 = [BOFFSET] adds BOFFSET = -14 * SIZE, BOFFSET ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 FSUB f80 = f74, f80 FSUB_A f81 = f75, f81 FSUB f96 = f88, f96 FSUB_A f97 = f89, f97 FSUB f112 = f90, f112 FSUB_A f113 = f91, f113 FSUB f66 = f104, f66 FSUB_A f67 = f105, f67 FSUB f82 = f106, f82 FSUB_A f83 = f107, f83 FSUB f98 = f120, f98 FSUB_A f99 = f121, f99 FSUB f114 = f122, f114 FSUB_A f115 = f123, f115 ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [AOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [AOFFSET], 2 * SIZE ;; LDFPD f92, f93 = [AOFFSET], 2 * SIZE ;; LDFPD f94, f95 = [AOFFSET] adds AOFFSET = -14 * SIZE, AOFFSET ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 FSUB f96 = f74, f96 FSUB f97 = f75, f97 FSUB f66 = f76, f66 FSUB f67 = f77, f67 FSUB f98 = f78, f98 FSUB f99 = f79, f99 FSUB f80 = f88, f80 FSUB f81 = f89, f81 FSUB f112 = f90, f112 FSUB f113 = f91, f113 FSUB f82 = f92, f82 FSUB f83 = f93, f83 FSUB f114 = f94, f114 FSUB f115 = f95, f115 ;; #endif #ifdef LN adds AOFFSET = 30 * SIZE, AOFFSET ;; LDFPD f72, f73 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f74, f75 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f76, f77 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f78, f79 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f88, f89 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f92, f93 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; LDFPD f104, f105 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f106, f107 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f72, f98 FMPY f33 = f73, f98 FMPY f34 = f72, f114 FMPY f35 = f73, f114 ;; FMA_C f98 = f73, f99, f32 FMA_D f99 = f72, f99, f33 FMA_C f114 = f73, f115, f34 FMA_D f115 = f72, f115, f35 ;; FNMA f66 = f74, f98, f66 FMA_A f67 = f75, f98, f67 FNMA f82 = f74, f114, f82 FMA_A f83 = f75, f114, f83 ;; FMA_B f66 = f75, f99, f66 FNMA f67 = f74, f99, f67 FMA_B f82 = f75, f115, f82 FNMA f83 = f74, f115, f83 ;; FNMA f96 = f76, f98, f96 FMA_A f97 = f77, f98, f97 FNMA f112 = f76, f114, f112 FMA_A f113 = f77, f114, f113 ;; FMA_B f96 = f77, f99, f96 FNMA f97 = f76, f99, f97 FMA_B f112 = f77, f115, f112 FNMA f113 = f76, f115, f113 ;; FNMA f64 = f78, f98, f64 FMA_A f65 = f79, f98, f65 FNMA f80 = f78, f114, f80 FMA_A f81 = f79, f114, f81 ;; FMA_B f64 = f79, f99, f64 FNMA f65 = f78, f99, f65 FMA_B f80 = f79, f115, f80 FNMA f81 = f78, f115, f81 ;; FMPY f32 = f88, f66 FMPY f33 = f89, f66 FMPY f34 = f88, f82 FMPY f35 = f89, f82 ;; FMA_C f66 = f89, f67, f32 FMA_D f67 = f88, f67, f33 FMA_C f82 = f89, f83, f34 FMA_D f83 = f88, f83, f35 ;; FNMA f96 = f90, f66, f96 FMA_A f97 = f91, f66, f97 FNMA f112 = f90, f82, f112 FMA_A f113 = f91, f82, f113 ;; FMA_B f96 = f91, f67, f96 FNMA f97 = f90, f67, f97 FMA_B f112 = f91, f83, f112 FNMA f113 = f90, f83, f113 ;; FNMA f64 = f92, f66, f64 FMA_A f65 = f93, f66, f65 FNMA f80 = f92, f82, f80 FMA_A f81 = f93, f82, f81 ;; FMA_B f64 = f93, f67, f64 FNMA f65 = f92, f67, f65 
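// Added note: each LN/LT/RN/RT conditional block in this epilogue applies the
// same substitution scheme to the diagonal block loaded just above it (from
// AOFFSET for the left-side variants, from BOFFSET for the right-side ones),
// then writes the solved values back both to the packed buffer and to the C
// pointers.  Added for orientation only; the traversal order is exactly what
// the #ifdef structure here spells out.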
FMA_B f80 = f93, f83, f80 FNMA f81 = f92, f83, f81 ;; FMPY f32 = f104, f96 FMPY f33 = f105, f96 FMPY f34 = f104, f112 FMPY f35 = f105, f112 ;; FMA_C f96 = f105, f97, f32 FMA_D f97 = f104, f97, f33 FMA_C f112 = f105, f113, f34 FMA_D f113 = f104, f113, f35 ;; FNMA f64 = f106, f96, f64 FMA_A f65 = f107, f96, f65 FNMA f80 = f106, f112, f80 FMA_A f81 = f107, f112, f81 ;; FMA_B f64 = f107, f97, f64 FNMA f65 = f106, f97, f65 FMA_B f80 = f107, f113, f80 FNMA f81 = f106, f113, f81 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f80 FMPY f35 = f121, f80 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f80 = f121, f81, f34 FMA_D f81 = f120, f81, f35 ;; #endif #ifdef LT LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [AOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET], 2 * SIZE ;; LDFPD f92, f93 = [AOFFSET], 2 * SIZE ;; LDFPD f94, f95 = [AOFFSET] adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f108, f109 = [AOFFSET], 2 * SIZE ;; LDFPD f110, f111 = [AOFFSET] adds AOFFSET = 8 * SIZE, AOFFSET ;; LDFPD f126, f127 = [AOFFSET] adds AOFFSET = - 30 * SIZE, AOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f80 FMPY f35 = f73, f80 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f80 = f73, f81, f34 FMA_D f81 = f72, f81, f35 ;; FNMA f96 = f74, f64, f96 FMA_A f97 = f75, f64, f97 FNMA f112 = f74, f80, f112 FMA_A f113 = f75, f80, f113 ;; FMA_B f96 = f75, f65, f96 FNMA f97 = f74, f65, f97 FMA_B f112 = f75, f81, f112 FNMA f113 = f74, f81, f113 ;; FNMA f66 = f76, f64, f66 FMA_A f67 = f77, f64, f67 FNMA f82 = f76, f80, f82 FMA_A f83 = f77, f80, f83 ;; FMA_B f66 = f77, f65, f66 FNMA f67 = f76, f65, f67 FMA_B f82 = f77, f81, f82 FNMA f83 = f76, f81, f83 ;; FNMA f98 = f78, f64, f98 FMA_A f99 = f79, f64, f99 FNMA f114 = f78, f80, f114 FMA_A f115 = f79, f80, f115 ;; FMA_B f98 = f79, f65, f98 FNMA f99 = f78, f65, f99 FMA_B f114 = f79, f81, f114 FNMA f115 = f78, f81, f115 ;; FMPY f32 = f90, f96 FMPY f33 = f91, f96 FMPY f34 = f90, f112 FMPY f35 = f91, f112 ;; FMA_C f96 = f91, f97, f32 FMA_D f97 = f90, f97, f33 FMA_C f112 = f91, f113, f34 FMA_D f113 = f90, f113, f35 ;; FNMA f66 = f92, f96, f66 FMA_A f67 = f93, f96, f67 FNMA f82 = f92, f112, f82 FMA_A f83 = f93, f112, f83 ;; FMA_B f66 = f93, f97, f66 FNMA f67 = f92, f97, f67 FMA_B f82 = f93, f113, f82 FNMA f83 = f92, f113, f83 ;; FNMA f98 = f94, f96, f98 FMA_A f99 = f95, f96, f99 FNMA f114 = f94, f112, f114 FMA_A f115 = f95, f112, f115 ;; FMA_B f98 = f95, f97, f98 FNMA f99 = f94, f97, f99 FMA_B f114 = f95, f113, f114 FNMA f115 = f94, f113, f115 ;; FMPY f32 = f108, f66 FMPY f33 = f109, f66 FMPY f34 = f108, f82 FMPY f35 = f109, f82 ;; FMA_C f66 = f109, f67, f32 FMA_D f67 = f108, f67, f33 FMA_C f82 = f109, f83, f34 FMA_D f83 = f108, f83, f35 ;; FNMA f98 = f110, f66, f98 FMA_A f99 = f111, f66, f99 FNMA f114 = f110, f82, f114 FMA_A f115 = f111, f82, f115 ;; FMA_B f98 = f111, f67, f98 FNMA f99 = f110, f67, f99 FMA_B f114 = f111, f83, f114 FNMA f115 = f110, f83, f115 ;; FMPY f32 = f126, f98 FMPY f33 = f127, f98 FMPY f34 = f126, f114 FMPY f35 = f127, f114 ;; FMA_C f98 = f127, f99, f32 FMA_D f99 = f126, f99, f33 FMA_C f114 = f127, f115, f34 FMA_D f115 = f126, f115, f35 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f96 FMPY f35 = f73, 
f96 FMPY f36 = f72, f66 FMPY f37 = f73, f66 FMPY f38 = f72, f98 FMPY f39 = f73, f98 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f96 = f73, f97, f34 FMA_D f97 = f72, f97, f35 FMA_C f66 = f73, f67, f36 FMA_D f67 = f72, f67, f37 FMA_C f98 = f73, f99, f38 FMA_D f99 = f72, f99, f39 ;; FNMA f80 = f74, f64, f80 FMA_A f81 = f75, f64, f81 FNMA f112 = f74, f96, f112 FMA_A f113 = f75, f96, f113 FNMA f82 = f74, f66, f82 FMA_A f83 = f75, f66, f83 FNMA f114 = f74, f98, f114 FMA_A f115 = f75, f98, f115 ;; FMA_B f80 = f75, f65, f80 FNMA f81 = f74, f65, f81 FMA_B f112 = f75, f97, f112 FNMA f113 = f74, f97, f113 FMA_B f82 = f75, f67, f82 FNMA f83 = f74, f67, f83 FMA_B f114 = f75, f99, f114 FNMA f115 = f74, f99, f115 ;; FMPY f32 = f90, f80 FMPY f33 = f91, f80 FMPY f34 = f90, f112 FMPY f35 = f91, f112 FMPY f36 = f90, f82 FMPY f37 = f91, f82 FMPY f38 = f90, f114 FMPY f39 = f91, f114 ;; FMA_C f80 = f91, f81, f32 FMA_D f81 = f90, f81, f33 FMA_C f112 = f91, f113, f34 FMA_D f113 = f90, f113, f35 FMA_C f82 = f91, f83, f36 FMA_D f83 = f90, f83, f37 FMA_C f114 = f91, f115, f38 FMA_D f115 = f90, f115, f39 ;; #endif #ifdef RT adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f104, f105 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f106, f107 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f120, f121 = [BOFFSET] ;; FMPY f32 = f104, f80 FMPY f33 = f105, f80 FMPY f34 = f104, f112 FMPY f35 = f105, f112 FMPY f36 = f104, f82 FMPY f37 = f105, f82 FMPY f38 = f104, f114 FMPY f39 = f105, f114 ;; FMA_C f80 = f105, f81, f32 FMA_D f81 = f104, f81, f33 FMA_C f112 = f105, f113, f34 FMA_D f113 = f104, f113, f35 FMA_C f82 = f105, f83, f36 FMA_D f83 = f104, f83, f37 FMA_C f114 = f105, f115, f38 FMA_D f115 = f104, f115, f39 ;; FNMA f64 = f106, f80, f64 FMA_A f65 = f107, f80, f65 FNMA f96 = f106, f112, f96 FMA_A f97 = f107, f112, f97 FNMA f66 = f106, f82, f66 FMA_A f67 = f107, f82, f67 FNMA f98 = f106, f114, f98 FMA_A f99 = f107, f114, f99 ;; FMA_B f64 = f107, f81, f64 FNMA f65 = f106, f81, f65 FMA_B f96 = f107, f113, f96 FNMA f97 = f106, f113, f97 FMA_B f66 = f107, f83, f66 FNMA f67 = f106, f83, f67 FMA_B f98 = f107, f115, f98 FNMA f99 = f106, f115, f99 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f96 FMPY f35 = f121, f96 FMPY f36 = f120, f66 FMPY f37 = f121, f66 FMPY f38 = f120, f98 FMPY f39 = f121, f98 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f96 = f121, f97, f34 FMA_D f97 = f120, f97, f35 FMA_C f66 = f121, f67, f36 FMA_D f67 = f120, f67, f37 FMA_C f98 = f121, f99, f38 FMA_D f99 = f120, f99, f39 ;; #endif #if defined(LN) || defined(LT) adds BOFFSET2 = 4 * SIZE, BOFFSET ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f97, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE ;; STFD [BOFFSET] = f81, 5 * SIZE STFD [BOFFSET2] = f113, 5 * SIZE ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f98, SIZE ;; STFD [BOFFSET] = f67, SIZE STFD [BOFFSET2] = f99, SIZE ;; STFD [BOFFSET] = f82, SIZE STFD [BOFFSET2] = f114, SIZE ;; STFD [BOFFSET] = f83, 5 * SIZE STFD [BOFFSET2] = f115, 5 * SIZE ;; adds BOFFSET = - 16 * SIZE, BOFFSET ;; #else adds AOFFSET2 = 4 * SIZE, AOFFSET ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f66, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f67, SIZE ;; STFD [AOFFSET] = f96, SIZE STFD [AOFFSET2] = f98, SIZE ;; STFD [AOFFSET] = f97, 5 * SIZE STFD [AOFFSET2] = f99, 5 * SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f82, SIZE ;; STFD [AOFFSET] = f81, SIZE STFD [AOFFSET2] = f83, 
SIZE ;; STFD [AOFFSET] = f112, SIZE STFD [AOFFSET2] = f114, SIZE ;; STFD [AOFFSET] = f113, 5 * SIZE STFD [AOFFSET2] = f115, 5 * SIZE ;; adds AOFFSET = - 16 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -8 * SIZE, C1 adds C2 = -8 * SIZE, C2 adds C5 = -8 * SIZE, C5 adds C6 = -8 * SIZE, C6 #endif ;; STFD [C1 ] = f64, SIZE STFD [C5 ] = f66, SIZE ;; STFD [C1 ] = f65, SIZE STFD [C5 ] = f67, SIZE ;; STFD [C1 ] = f96, SIZE STFD [C5 ] = f98, SIZE ;; STFD [C1 ] = f97, 5 * SIZE STFD [C5 ] = f99, 5 * SIZE ;; STFD [C2 ] = f80, SIZE STFD [C6 ] = f82, SIZE ;; STFD [C2 ] = f81, SIZE STFD [C6 ] = f83, SIZE ;; STFD [C2 ] = f112, SIZE STFD [C6 ] = f114, SIZE ;; STFD [C2 ] = f113, 5 * SIZE STFD [C6 ] = f115, 5 * SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;; #ifdef LN adds C1 = -8 * SIZE, C1 adds C2 = -8 * SIZE, C2 adds C5 = -8 * SIZE, C5 adds C6 = -8 * SIZE, C6 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 2, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; shladd AOFFSET = L, 2, AOFFSET shladd BOFFSET = L, 1, BOFFSET #endif ;; #ifdef LT adds KK = 4, KK #elif defined LN adds KK = -4, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; (p6) br.cond.dptk .L052 ;; .align 16 .L060: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L070 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 1 + ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE } ;; #else { .mfi shladd BOFFSET = r3, 1, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE shladd AOFFSET = r3, 1, AORIG } ;; #endif ;; adds L = 1, L ;; { .mmi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET tbit.z p12, p0 = L, 0 } { .mmi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE shr L = L, 1 } ;; { .mmi (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE nop __LINE__ adds L = -1, L } ;; { .mmi nop __LINE__ nop __LINE__ mov ar.lc = L } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L068 ;; .align 16 .L062: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfb lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f34, f48, f96 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f97 = f34, f49, f97 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f34, f50, f112 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f34, f51, f113 // A3 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f97 = f35, f48, f97 // A4 * B1 } { .mfb FMA_A f96 = f35, f49, f96 // A4 * 
B2 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f113 = f35, f50, f113 // A4 * B3 nop __LINE__ } { .mfb FMA_A f112 = f35, f51, f112 // A4 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f96 = f42, f56, f96 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f112 = f42, f58, f112 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f43, f56, f97 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f113 = f43, f58, f113 // A4 * B3 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 br.cloop.sptk.few .L062 } ;; .L068: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -2, KK #else adds r2 = -2, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 1, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 FSUB f80 = f74, f80 FSUB_A f81 = f75, f81 FSUB f96 = f88, f96 FSUB_A f97 = f89, f97 FSUB f112 = f90, f112 FSUB_A f113 = f91, f113 ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 FSUB f96 = f74, f96 FSUB f97 = f75, f97 FSUB f80 = f88, f80 FSUB f81 = f89, f81 FSUB f112 = f90, f112 FSUB f113 = f91, f113 ;; #endif #ifdef LN adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f104, f105 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f106, f107 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f104, f96 FMPY f33 = f105, f96 FMPY f34 = f104, f112 FMPY f35 = f105, f112 ;; FMA_C f96 = f105, f97, f32 FMA_D f97 = f104, f97, f33 FMA_C f112 = f105, f113, f34 FMA_D f113 = f104, f113, f35 ;; FNMA f64 = f106, f96, f64 FMA_A f65 = f107, f96, f65 FNMA f80 = f106, f112, f80 FMA_A f81 = f107, f112, f81 ;; FMA_B f64 = f107, f97, f64 FNMA f65 = f106, f97, f65 FMA_B f80 = f107, f113, f80 FNMA f81 = f106, f113, f81 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f80 FMPY f35 = f121, f80 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f80 = f121, f81, f34 FMA_D f81 = f120, f81, f35 ;; #endif #ifdef LT LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f80 FMPY f35 = f73, f80 ;; FMA_C f64 = f73, f65, 
f32 FMA_D f65 = f72, f65, f33 FMA_C f80 = f73, f81, f34 FMA_D f81 = f72, f81, f35 ;; FNMA f96 = f74, f64, f96 FMA_A f97 = f75, f64, f97 FNMA f112 = f74, f80, f112 FMA_A f113 = f75, f80, f113 ;; FMA_B f96 = f75, f65, f96 FNMA f97 = f74, f65, f97 FMA_B f112 = f75, f81, f112 FNMA f113 = f74, f81, f113 ;; FMPY f32 = f90, f96 FMPY f33 = f91, f96 FMPY f34 = f90, f112 FMPY f35 = f91, f112 ;; FMA_C f96 = f91, f97, f32 FMA_D f97 = f90, f97, f33 FMA_C f112 = f91, f113, f34 FMA_D f113 = f90, f113, f35 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f96 FMPY f35 = f73, f96 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f96 = f73, f97, f34 FMA_D f97 = f72, f97, f35 ;; FNMA f80 = f74, f64, f80 FMA_A f81 = f75, f64, f81 FNMA f112 = f74, f96, f112 FMA_A f113 = f75, f96, f113 ;; FMA_B f80 = f75, f65, f80 FNMA f81 = f74, f65, f81 FMA_B f112 = f75, f97, f112 FNMA f113 = f74, f97, f113 ;; FMPY f32 = f90, f80 FMPY f33 = f91, f80 FMPY f34 = f90, f112 FMPY f35 = f91, f112 ;; FMA_C f80 = f91, f81, f32 FMA_D f81 = f90, f81, f33 FMA_C f112 = f91, f113, f34 FMA_D f113 = f90, f113, f35 ;; #endif #ifdef RT adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f104, f105 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f106, f107 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f120, f121 = [BOFFSET] ;; FMPY f32 = f104, f80 FMPY f33 = f105, f80 FMPY f34 = f104, f112 FMPY f35 = f105, f112 ;; FMA_C f80 = f105, f81, f32 FMA_D f81 = f104, f81, f33 FMA_C f112 = f105, f113, f34 FMA_D f113 = f104, f113, f35 ;; FNMA f64 = f106, f80, f64 FMA_A f65 = f107, f80, f65 FNMA f96 = f106, f112, f96 FMA_A f97 = f107, f112, f97 ;; FMA_B f64 = f107, f81, f64 FNMA f65 = f106, f81, f65 FMA_B f96 = f107, f113, f96 FNMA f97 = f106, f113, f97 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f96 FMPY f35 = f121, f96 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f96 = f121, f97, f34 FMA_D f97 = f120, f97, f35 ;; #endif #if defined(LN) || defined(LT) adds BOFFSET2 = 4 * SIZE, BOFFSET ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f97, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE ;; STFD [BOFFSET] = f81, 5 * SIZE STFD [BOFFSET2] = f113, 5 * SIZE ;; adds BOFFSET = - 8 * SIZE, BOFFSET ;; #else adds AOFFSET2 = 4 * SIZE, AOFFSET ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f80, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f81, SIZE ;; STFD [AOFFSET] = f96, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f97, 5 * SIZE STFD [AOFFSET2] = f113, 5 * SIZE ;; adds AOFFSET = - 8 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -4 * SIZE, C1 adds C2 = -4 * SIZE, C2 #endif ;; STFD [C1 ] = f64, SIZE ;; STFD [C1 ] = f65, SIZE ;; STFD [C1 ] = f96, SIZE ;; STFD [C1 ] = f97, SIZE ;; STFD [C2 ] = f80, SIZE ;; STFD [C2 ] = f81, SIZE ;; STFD [C2 ] = f112, SIZE ;; STFD [C2 ] = f113, SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;; #ifdef LN adds C1 = -4 * SIZE, C1 adds C2 = -4 * SIZE, C2 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 1, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; shladd AOFFSET = L, 1, AOFFSET shladd BOFFSET = L, 1, BOFFSET #endif ;; #ifdef 
LT adds KK = 2, KK #elif defined LN adds KK = -2, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; .align 16 .L070: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L089 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE } ;; #else { .mfi shladd BOFFSET = r3, 1, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE add AOFFSET = r3, AORIG } ;; #endif ;; adds L = 1, L ;; { .mii (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET adds L = -1, L } ;; { .mmi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET cmp.eq p3, p0 = r0, r0 mov ar.lc = L } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L078 ;; .align 16 .L072: { .mfb lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f96 = f32, f49, f96 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f112 = f32, f51, f112 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } { .mfi nop __LINE__ FMA f97 = f33, f49, f97 // A2 * B2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 } { .mmf nop __LINE__ nop __LINE__ FMA f113 = f33, f51, f113 // A2 * B4 } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ (p3) FMA f96 = f40, f57, f96 // A1 * B2 } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ (p3) FMA f112 = f40, f59, f112 // A1 * B4 } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f97 = f41, f57, f97 // A2 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f113 = f41, f59, f113 // A2 * B4 br.cloop.sptk.few .L072 } ;; { .mfb nop __LINE__ FCALC_A f64 = f64, f97 nop __LINE__ } { .mfb nop __LINE__ FCALC_A f80 = f80, f113 nop __LINE__ } { .mfb nop __LINE__ FCALC_B f65 = f65, f96 nop __LINE__ } { .mfb nop __LINE__ FCALC_B f81 = f81, f112 nop __LINE__ } ;; .L078: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -1, KK #else adds r2 = -2, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 1, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET] adds BOFFSET = -2 * SIZE, BOFFSET ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 FSUB f80 = f74, f80 FSUB_A f81 = f75, f81 ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET] adds AOFFSET = -2 * SIZE, AOFFSET ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 FSUB f80 = f88, f80 FSUB f81 = f89, f81 ;; #endif #ifdef LN LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f80 FMPY f35 = f121, f80 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f80 = f121, 
f81, f34 FMA_D f81 = f120, f81, f35 ;; #endif #ifdef LT LDFPD f72, f73 = [AOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f80 FMPY f35 = f73, f80 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f80 = f73, f81, f34 FMA_D f81 = f72, f81, f35 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; FNMA f80 = f74, f64, f80 FMA_A f81 = f75, f64, f81 ;; FMA_B f80 = f75, f65, f80 FNMA f81 = f74, f65, f81 ;; FMPY f32 = f90, f80 FMPY f33 = f91, f80 ;; FMA_C f80 = f91, f81, f32 FMA_D f81 = f90, f81, f33 ;; #endif #ifdef RT adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f104, f105 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f106, f107 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f120, f121 = [BOFFSET] ;; FMPY f32 = f104, f80 FMPY f33 = f105, f80 ;; FMA_C f80 = f105, f81, f32 FMA_D f81 = f104, f81, f33 ;; FNMA f64 = f106, f80, f64 FMA_A f65 = f107, f80, f65 ;; FMA_B f64 = f107, f81, f64 FNMA f65 = f106, f81, f65 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 ;; #endif #if defined(LN) || defined(LT) STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f65, SIZE ;; STFD [BOFFSET] = f80, SIZE ;; STFD [BOFFSET] = f81, SIZE ;; adds BOFFSET = - 4 * SIZE, BOFFSET ;; #else STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, SIZE ;; STFD [AOFFSET] = f80, SIZE ;; STFD [AOFFSET] = f81, SIZE ;; adds AOFFSET = - 4 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -2 * SIZE, C1 adds C2 = -2 * SIZE, C2 #endif ;; STFD [C1 ] = f64, SIZE ;; STFD [C1 ] = f65, SIZE ;; STFD [C2 ] = f80, SIZE ;; STFD [C2 ] = f81, SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;; #ifdef LN adds C1 = -2 * SIZE, C1 adds C2 = -2 * SIZE, C2 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT add AORIG = r2, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; add AOFFSET = L, AOFFSET shladd BOFFSET = L, 1, BOFFSET #endif ;; #ifdef LT adds KK = 1, KK #elif defined LN adds KK = -1, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; .align 16 .L089: #ifdef LN shladd KK8 = K, ZBASE_SHIFT, r0 ;; shladd B = KK8, 1, B #endif #if defined(LT) || defined(RN) mov B = BOFFSET #endif #ifdef RN adds KK = 2, KK #endif #ifdef RT adds KK = -2, KK #endif ;; { .mmi mov AOFFSET = A nop __LINE__ } ;; .align 16 .L090: shr I = M, 2 tbit.z p6, p0 = N, 0 (p6) br.cond.dpnt .L999 ;; #ifdef RT { .mmi shl r2 = K, ZBASE_SHIFT } ;; { .mmi sub B = B, r2 sub C = C, LDC nop __LINE__ } ;; #endif mov C1 = C #ifdef LN add KK = M, OFFSET #elif defined LT mov KK = OFFSET #else nop __LINE__ #endif ;; #if defined(LN) || defined(RT) mov AORIG = A #else mov AOFFSET = A #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; { .mib cmp.eq p6, p7 = 0, I #ifndef RT add C = LDC, C #else nop __LINE__ #endif (p6) br.cond.dpnt .L100 } ;; .align 16 .L092: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 2 + ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mmf nop __LINE__ nop 
__LINE__ mov f67 = f0 } ;; #else { .mfi add BOFFSET = r3, B mov f66 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 shladd AOFFSET = r3, 2, AORIG } ;; #endif ;; (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE adds L = 1, L ;; { .mfi (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 } { .mfi adds PREC = CPREFETCHSIZE * SIZE, C1 shr L = L, 1 } ;; { .mfi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET adds L = -1, L } { .mmf (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE CPREFETCH [PREC] } ;; { .mfi (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE mov ar.lc = L } { .mmi adds C5 = 4 * SIZE, C1 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET cmp.eq p3, p0 = r0, r0 } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L098 ;; .align 16 .L093: /* 1 */ { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f80 = f34, f48, f80 // A3 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA_B f81 = f34, f49, f81 // A3 * B2 nop __LINE__ } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f36, f48, f96 // A5 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA_B f97 = f36, f49, f97 // A5 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f38, f48, f112 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f38, f49, f113 // A7 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f81 = f35, f48, f81 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 = f35, f49, f80 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f97 = f37, f48, f97 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f96 = f37, f49, f96 // A6 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f113 = f39, f48, f113 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f112 = f39, f49, f112 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f42, f56, f80 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f81 = f42, f57, f81 // A3 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f96 = f44, f56, f96 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f44, f57, f97 // A5 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f112 = f46, f56, f112 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f46, f57, f113 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f81 = f43, f56, f81 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f80 = f43, f57, f80 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f45, f56, f97 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f45, f57, f96 // A6 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f113 = f47, f56, f113 // A8 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f112 = f47, f57, f112 
// A8 * B2 br.cloop.sptk.few .L093 } ;; .L098: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -4, KK #else adds r2 = -1, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; shladd AOFFSET = r2, 2, AORIG add BOFFSET = r2, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 FSUB f80 = f74, f80 FSUB_A f81 = f75, f81 FSUB f96 = f88, f96 FSUB_A f97 = f89, f97 FSUB f112 = f90, f112 FSUB_A f113 = f91, f113 ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 FSUB f80 = f74, f80 FSUB f81 = f75, f81 FSUB f96 = f88, f96 FSUB f97 = f89, f97 FSUB f112 = f90, f112 FSUB f113 = f91, f113 ;; #endif #ifdef LN adds AOFFSET = 30 * SIZE, AOFFSET ;; LDFPD f72, f73 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f74, f75 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f76, f77 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f78, f79 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f88, f89 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f92, f93 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; LDFPD f104, f105 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f106, f107 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f72, f112 FMPY f33 = f73, f112 ;; FMA_C f112 = f73, f113, f32 FMA_D f113 = f72, f113, f33 ;; FNMA f96 = f74, f112, f96 FMA_A f97 = f75, f112, f97 FNMA f80 = f76, f112, f80 FMA_A f81 = f77, f112, f81 FNMA f64 = f78, f112, f64 FMA_A f65 = f79, f112, f65 ;; FMA_B f96 = f75, f113, f96 FNMA f97 = f74, f113, f97 FMA_B f80 = f77, f113, f80 FNMA f81 = f76, f113, f81 FMA_B f64 = f79, f113, f64 FNMA f65 = f78, f113, f65 ;; FMPY f32 = f88, f96 FMPY f33 = f89, f96 ;; FMA_C f96 = f89, f97, f32 FMA_D f97 = f88, f97, f33 ;; FNMA f80 = f90, f96, f80 FMA_A f81 = f91, f96, f81 FNMA f64 = f92, f96, f64 FMA_A f65 = f93, f96, f65 ;; FMA_B f80 = f91, f97, f80 FNMA f81 = f90, f97, f81 FMA_B f64 = f93, f97, f64 FNMA f65 = f92, f97, f65 ;; FMPY f32 = f104, f80 FMPY f33 = f105, f80 ;; FMA_C f80 = f105, f81, f32 FMA_D f81 = f104, f81, f33 ;; FNMA f64 = f106, f80, f64 FMA_A f65 = f107, f80, f65 ;; FMA_B f64 = f107, f81, f64 FNMA f65 = f106, f81, f65 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 ;; #endif #ifdef LT LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [AOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET], 2 * SIZE ;; LDFPD f92, f93 = [AOFFSET], 2 * SIZE ;; LDFPD f94, f95 = [AOFFSET] adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f108, f109 = [AOFFSET], 2 * SIZE ;; LDFPD f110, f111 = [AOFFSET] adds AOFFSET = 8 * SIZE, AOFFSET ;; LDFPD f126, f127 = [AOFFSET] adds AOFFSET = - 30 * SIZE, AOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; FNMA f80 = f74, f64, f80 FMA_A f81 = f75, f64, f81 FNMA f96 = f76, f64, f96 FMA_A f97 = f77, f64, f97 FNMA f112 = f78, f64, f112 FMA_A f113 = f79, f64, f113 ;; FMA_B f80 = f75, f65, f80 FNMA f81 = f74, f65, f81 FMA_B f96 = f77, 
f65, f96 FNMA f97 = f76, f65, f97 FMA_B f112 = f79, f65, f112 FNMA f113 = f78, f65, f113 ;; FMPY f32 = f90, f80 FMPY f33 = f91, f80 ;; FMA_C f80 = f91, f81, f32 FMA_D f81 = f90, f81, f33 ;; FNMA f96 = f92, f80, f96 FMA_A f97 = f93, f80, f97 FNMA f112 = f94, f80, f112 FMA_A f113 = f95, f80, f113 ;; FMA_B f96 = f93, f81, f96 FNMA f97 = f92, f81, f97 FMA_B f112 = f95, f81, f112 FNMA f113 = f94, f81, f113 ;; FMPY f32 = f108, f96 FMPY f33 = f109, f96 ;; FMA_C f96 = f109, f97, f32 FMA_D f97 = f108, f97, f33 ;; FNMA f112 = f110, f96, f112 FMA_A f113 = f111, f96, f113 ;; FMA_B f112 = f111, f97, f112 FNMA f113 = f110, f97, f113 ;; FMPY f32 = f126, f112 FMPY f33 = f127, f112 ;; FMA_C f112 = f127, f113, f32 FMA_D f113 = f126, f113, f33 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f80 FMPY f35 = f73, f80 FMPY f36 = f72, f96 FMPY f37 = f73, f96 FMPY f38 = f72, f112 FMPY f39 = f73, f112 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f80 = f73, f81, f34 FMA_D f81 = f72, f81, f35 FMA_C f96 = f73, f97, f36 FMA_D f97 = f72, f97, f37 FMA_C f112 = f73, f113, f38 FMA_D f113 = f72, f113, f39 ;; #endif #ifdef RT LDFPD f72, f73 = [BOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f80 FMPY f35 = f73, f80 FMPY f36 = f72, f96 FMPY f37 = f73, f96 FMPY f38 = f72, f112 FMPY f39 = f73, f112 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f80 = f73, f81, f34 FMA_D f81 = f72, f81, f35 FMA_C f96 = f73, f97, f36 FMA_D f97 = f72, f97, f37 FMA_C f112 = f73, f113, f38 FMA_D f113 = f72, f113, f39 ;; #endif #if defined(LN) || defined(LT) adds BOFFSET2 = 4 * SIZE, BOFFSET ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f97, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE ;; STFD [BOFFSET] = f81, 5 * SIZE STFD [BOFFSET2] = f113, 5 * SIZE ;; adds BOFFSET = - 8 * SIZE, BOFFSET ;; #else adds AOFFSET2 = 4 * SIZE, AOFFSET ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f96, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f97, SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f81, 5 * SIZE STFD [AOFFSET2] = f113, 5 * SIZE ;; adds AOFFSET = - 8 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -8 * SIZE, C1 adds C5 = -8 * SIZE, C5 #endif ;; STFD [C1 ] = f64, SIZE STFD [C5 ] = f96, SIZE ;; STFD [C1 ] = f65, SIZE STFD [C5 ] = f97, SIZE ;; STFD [C1 ] = f80, SIZE STFD [C5 ] = f112, SIZE ;; STFD [C1 ] = f81, 5 * SIZE STFD [C5 ] = f113, 5 * SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;; #ifdef LN adds C1 = -8 * SIZE, C1 adds C5 = -8 * SIZE, C5 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 2, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; shladd AOFFSET = L, 2, AOFFSET add BOFFSET = L, BOFFSET #endif ;; #ifdef LT adds KK = 4, KK #elif defined LN adds KK = -4, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; (p6) br.cond.dptk .L092 ;; .align 16 .L100: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L110 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 1 + ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = 
[BOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f67 = f0 } ;; #else { .mfi add BOFFSET = r3, B mov f66 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 shladd AOFFSET = r3, 1, AORIG } ;; #endif ;; adds L = 1, L ;; { .mii (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE nop __LINE__ adds L = -1, L } ;; { .mmi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 mov ar.lc = L } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L108 ;; .align 16 .L102: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET FMA f80 = f32, f49, f80 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfb lfetch.nt1 [PREB], 4 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f81 = f33, f49, f81 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f34, f48, f96 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f112 = f34, f49, f112 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f97 = f35, f48, f97 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f113 = f35, f49, f113 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f80 = f40, f57, f80 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f81 = f41, f57, f81 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f42, f56, f96 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f112 = f42, f57, f112 // A3 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f97 = f43, f56, f97 // A4 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f113 = f43, f57, f113 // A4 * B2 br.cloop.sptk.few .L102 } ;; { .mfb nop __LINE__ FCALC_A f64 = f64, f81 nop __LINE__ } { .mfb nop __LINE__ FCALC_B f65 = f65, f80 nop __LINE__ } { .mfb nop __LINE__ FCALC_A f96 = f96, f113 nop __LINE__ } { .mfb nop __LINE__ FCALC_B f97 = f97, f112 nop __LINE__ } ;; .L108: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -2, KK #else adds r2 = -1, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; shladd AOFFSET = r2, 1, AORIG add BOFFSET = r2, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET] adds BOFFSET = -2 * SIZE, BOFFSET ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 FSUB f96 = f88, f96 FSUB_A f97 = f89, f97 ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET] adds AOFFSET = -2 * SIZE, AOFFSET ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 FSUB f96 = f88, f96 FSUB f97 = f89, f97 ;; #endif #ifdef LN adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f104, f105 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f106, f107 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f104, f96 FMPY f33 = f105, f96 ;; FMA_C f96 = f105, f97, f32 FMA_D f97 = f104, f97, f33 ;; FNMA f64 = f106, f96, f64 FMA_A f65 = f107, f96, f65 ;; FMA_B f64 = f107, f97, f64 FNMA f65 = f106, f97, f65 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 ;; #endif #ifdef LT LDFPD f72, f73 = [AOFFSET], 
2 * SIZE ;; LDFPD f74, f75 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; FNMA f96 = f74, f64, f96 FMA_A f97 = f75, f64, f97 ;; FMA_B f96 = f75, f65, f96 FNMA f97 = f74, f65, f97 ;; FMPY f32 = f90, f96 FMPY f33 = f91, f96 ;; FMA_C f96 = f91, f97, f32 FMA_D f97 = f90, f97, f33 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f36 = f72, f96 FMPY f37 = f73, f96 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f96 = f73, f97, f36 FMA_D f97 = f72, f97, f37 ;; #endif #ifdef RT LDFPD f72, f73 = [BOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f36 = f72, f96 FMPY f37 = f73, f96 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f96 = f73, f97, f36 FMA_D f97 = f72, f97, f37 ;; #endif #if defined(LN) || defined(LT) STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f65, SIZE ;; STFD [BOFFSET] = f96, SIZE ;; STFD [BOFFSET] = f97, SIZE ;; adds BOFFSET = - 4 * SIZE, BOFFSET ;; #else adds AOFFSET2 = 4 * SIZE, AOFFSET ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, SIZE ;; STFD [AOFFSET] = f96, SIZE ;; STFD [AOFFSET] = f97, SIZE ;; adds AOFFSET = - 4 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -4 * SIZE, C1 adds C5 = -4 * SIZE, C5 #endif ;; STFD [C1 ] = f64, SIZE ;; STFD [C1 ] = f65, SIZE ;; STFD [C1 ] = f96, SIZE ;; STFD [C1 ] = f97, SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;; #ifdef LN adds C1 = -4 * SIZE, C1 adds C5 = -4 * SIZE, C5 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 1, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; shladd AOFFSET = L, 1, AOFFSET add BOFFSET = L, BOFFSET #endif ;; #ifdef LT adds KK = 2, KK #elif defined LN adds KK = -2, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; .align 16 .L110: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L119 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f67 = f0 } ;; #else { .mfi add BOFFSET = r3, B mov f66 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 add AOFFSET = r3, AORIG } ;; #endif ;; adds L = 1, L ;; { .mii nop __LINE__ tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE cmp.eq p3, p0 = r0, r0 adds L = -1, L } ;; { .mmi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET mov ar.lc = L } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L118 ;; .align 16 .L112: { .mfi lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f80 = f32, f49, f80 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mmf (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } { .mmf nop __LINE__ nop __LINE__ FMA f81 = f33, f49, f81 // A2 * B2 } ;; { .mfb (p4) LDFPD f32, f33 = 
[AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f80 = f40, f57, f80 // A1 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 adds L = -1, L } { .mfb (p3) FMA f81 = f41, f57, f81 // A2 * B2 br.cloop.sptk.few .L112 } ;; { .mfb nop __LINE__ FCALC_A f64 = f64, f81 nop __LINE__ } { .mfb nop __LINE__ FCALC_B f65 = f65, f80 nop __LINE__ } ;; .L118: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -1, KK #else adds r2 = -1, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; add AOFFSET = r2, AORIG add BOFFSET = r2, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET] ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 ;; #else LDFPD f72, f73 = [AOFFSET] ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 ;; #endif #ifdef LN LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 ;; #endif #ifdef LT LDFPD f72, f73 = [AOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; #endif #ifdef RT LDFPD f72, f73 = [BOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; #endif #if defined(LN) || defined(LT) STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f65, SIZE ;; adds BOFFSET = - 2 * SIZE, BOFFSET ;; #else STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, SIZE ;; adds AOFFSET = - 2 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -2 * SIZE, C1 #endif ;; STFD [C1 ] = f64, SIZE ;; STFD [C1 ] = f65, SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 ;; #ifdef LN adds C1 = -2 * SIZE, C1 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT add AORIG = r2, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; add AOFFSET = L, AOFFSET add BOFFSET = L, BOFFSET #endif ;; #ifdef LT adds KK = 1, KK #elif defined LN adds KK = -1, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif .align 16 .L119: #ifdef LN shladd KK8 = K, ZBASE_SHIFT, r0 ;; add B = KK8, B #endif #if defined(LT) || defined(RN) mov B = BOFFSET #endif #ifdef RN adds KK = 1, KK #endif #ifdef RT adds KK = -1, KK #endif ;; { .mmi mov AOFFSET = A nop __LINE__ } ;; .align 16 .L999: { .mii nop __LINE__ mov ar.lc = ARLC mov pr = PR, -1 } { .mib nop __LINE__ #ifdef TRMMKERNEL mov ar.pfs = ARPFS #else nop __LINE__ #endif br.ret.sptk.many b0 } EPILOGUE OpenBLAS-0.2.20/kernel/ia64/ztrsm_kernel_RT.S000066400000000000000000005163651313527062700204570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef DOUBLE #define PREFETCHSIZE (16 * 8) #else #define PREFETCHSIZE (32 * 8) #endif #ifndef LN #define CPREFETCHSIZE 7 #else #define CPREFETCHSIZE -8 #endif #define CPREFETCH lfetch.excl.nt1 #define M r32 #define N r33 #define K r34 #define A r37 #define B r38 #define C r39 #define LDC r35 #define I r15 #define J r16 #define AOFFSET r17 #define BOFFSET r18 #define TEMP r19 #define L r20 #define C1 r21 #define C2 r22 #define C3 r23 #define C4 r24 #define C5 r25 #define C6 r26 #define C7 r27 #define C8 r28 #define PREA r8 #define PREB r9 #define PREC r10 #define SP r12 #define ARLC r29 #define PR r30 #define ARPFS r31 #define ALPHA_R f8 #define ALPHA_I f9 #define AORIG loc0 #define KK loc1 #define KK8 loc2 #define OFFSET loc3 #define AOFFSET2 loc4 #define BOFFSET2 loc5 #ifndef CONJ #define FCALC_A FSUB #define FCALC_B FADD #define FMA_A FNMA #define FMA_B FMA #else #define FCALC_A FADD #define FCALC_B FSUB #define FMA_A FMA #define FMA_B FNMA #endif #ifndef CONJ #define FCALC_C FMA #define FCALC_D FNMA #else #define FCALC_C FNMA #define FCALC_D FMA #endif #ifndef CONJ #define FMA_C FNMA #define FMA_D FMA #define FSUB_A FSUB #else #define FMA_C FMA #define FMA_D FMS #define FSUB_A FADD #endif PROLOGUE .prologue PROFCODE { .mfi .save ar.pfs, ARPFS alloc ARPFS = ar.pfs, 8, 8, 0, 0 mov f64 = f0 adds r14 = 16, SP } { .mfi nop __LINE__ mov f65 = f0 adds r15 = 24, SP } ;; { .mfi ld8 LDC = [r14] mov f81 = f0 mov PR = pr } { .mfi ld8 OFFSET = [r15] mov f96 = f0 } ;; { .mfi shladd LDC = LDC, ZBASE_SHIFT, r0 mov f97 = f0 } { .mfi nop __LINE__ mov f113 = f0 } ;; #ifdef LN { .mmi setf.sig f32 = M setf.sig f33 = K shladd C = M, ZBASE_SHIFT, C } ;; {.mmf nop __LINE__ nop __LINE__ xmpy.l f32 = f32, f33 } ;; { .mmi getf.sig r2 = f32 ;; nop __LINE__ shladd A = r2, ZBASE_SHIFT, A } ;; #endif #ifdef RN sub KK = r0, OFFSET #endif #ifdef RT { .mmi setf.sig f32 = N setf.sig f33 = K nop __LINE__ } ;; { .mmi setf.sig f34 = LDC nop __LINE__ nop __LINE__ } ;; { .mmf nop __LINE__ nop __LINE__ xmpy.l f33 = f32, f33 } { .mmf nop __LINE__ sub KK = N, OFFSET xmpy.l f34 = f32, f34 } ;; { .mmi getf.sig r2 = f33 getf.sig r3 = f34 } ;; shladd B = r2, ZBASE_SHIFT, B add C = r3, C #endif ;; .body { .mfi nop __LINE__ mov f80 = f0 mov ARLC = ar.lc } { .mfb mov f112 = f0 } ;; ;; shr I = M, 2 tbit.z p6, p0 = N, 0 (p6) 
br.cond.dpnt .L050 ;; #ifdef RT { .mmi shl r2 = K, ZBASE_SHIFT } ;; { .mmi sub B = B, r2 sub C = C, LDC nop __LINE__ } ;; #endif mov C1 = C #ifdef LN add KK = M, OFFSET #elif defined LT mov KK = OFFSET #else nop __LINE__ #endif ;; #if defined(LN) || defined(RT) mov AORIG = A #else mov AOFFSET = A #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; { .mib cmp.eq p6, p7 = 0, I #ifndef RT add C = LDC, C #else nop __LINE__ #endif (p6) br.cond.dpnt .L100 } ;; .align 16 .L092: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 2 + ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f67 = f0 } ;; #else { .mfi add BOFFSET = r3, B mov f66 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 shladd AOFFSET = r3, 2, AORIG } ;; #endif ;; (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE adds L = 1, L ;; { .mfi (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 } { .mfi adds PREC = CPREFETCHSIZE * SIZE, C1 shr L = L, 1 } ;; { .mfi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET adds L = -1, L } { .mmf (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE CPREFETCH [PREC] } ;; { .mfi (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE mov ar.lc = L } { .mmi adds C5 = 4 * SIZE, C1 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET cmp.eq p3, p0 = r0, r0 } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L098 ;; .align 16 .L093: /* 1 */ { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f80 = f34, f48, f80 // A3 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA_B f81 = f34, f49, f81 // A3 * B2 nop __LINE__ } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f36, f48, f96 // A5 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA_B f97 = f36, f49, f97 // A5 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f38, f48, f112 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f38, f49, f113 // A7 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f81 = f35, f48, f81 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 = f35, f49, f80 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f97 = f37, f48, f97 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f96 = f37, f49, f96 // A6 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f113 = f39, f48, f113 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f112 = f39, f49, f112 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f42, f56, f80 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f81 = f42, f57, f81 // A3 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f96 = f44, f56, f96 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f44, f57, f97 // A5 * B2 nop 
__LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f112 = f46, f56, f112 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f46, f57, f113 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f81 = f43, f56, f81 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f80 = f43, f57, f80 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f45, f56, f97 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f45, f57, f96 // A6 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f113 = f47, f56, f113 // A8 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f112 = f47, f57, f112 // A8 * B2 br.cloop.sptk.few .L093 } ;; .L098: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -4, KK #else adds r2 = -1, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; shladd AOFFSET = r2, 2, AORIG add BOFFSET = r2, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 FSUB f80 = f74, f80 FSUB_A f81 = f75, f81 FSUB f96 = f88, f96 FSUB_A f97 = f89, f97 FSUB f112 = f90, f112 FSUB_A f113 = f91, f113 ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 FSUB f80 = f74, f80 FSUB f81 = f75, f81 FSUB f96 = f88, f96 FSUB f97 = f89, f97 FSUB f112 = f90, f112 FSUB f113 = f91, f113 ;; #endif #ifdef LN adds AOFFSET = 30 * SIZE, AOFFSET ;; LDFPD f72, f73 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f74, f75 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f76, f77 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f78, f79 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f88, f89 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f92, f93 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; LDFPD f104, f105 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f106, f107 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f72, f112 FMPY f33 = f73, f112 ;; FMA_C f112 = f73, f113, f32 FMA_D f113 = f72, f113, f33 ;; FNMA f96 = f74, f112, f96 FMA_A f97 = f75, f112, f97 FNMA f80 = f76, f112, f80 FMA_A f81 = f77, f112, f81 FNMA f64 = f78, f112, f64 FMA_A f65 = f79, f112, f65 ;; FMA_B f96 = f75, f113, f96 FNMA f97 = f74, f113, f97 FMA_B f80 = f77, f113, f80 FNMA f81 = f76, f113, f81 FMA_B f64 = f79, f113, f64 FNMA f65 = f78, f113, f65 ;; FMPY f32 = f88, f96 FMPY f33 = f89, f96 ;; FMA_C f96 = f89, f97, f32 FMA_D f97 = f88, f97, f33 ;; FNMA f80 = f90, f96, f80 FMA_A f81 = f91, f96, f81 FNMA f64 = f92, f96, f64 FMA_A f65 = f93, f96, f65 ;; FMA_B f80 = f91, f97, f80 FNMA f81 = f90, f97, f81 FMA_B f64 = f93, f97, f64 FNMA f65 = f92, f97, f65 ;; FMPY f32 = f104, f80 FMPY f33 = f105, f80 ;; FMA_C f80 = f105, f81, f32 FMA_D f81 = f104, f81, f33 ;; FNMA f64 = f106, f80, f64 FMA_A f65 = f107, f80, f65 ;; FMA_B f64 = f107, f81, f64 FNMA f65 = f106, f81, f65 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 ;; #endif #ifdef LT LDFPD f72, f73 = [AOFFSET], 2 * 
SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [AOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET], 2 * SIZE ;; LDFPD f92, f93 = [AOFFSET], 2 * SIZE ;; LDFPD f94, f95 = [AOFFSET] adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f108, f109 = [AOFFSET], 2 * SIZE ;; LDFPD f110, f111 = [AOFFSET] adds AOFFSET = 8 * SIZE, AOFFSET ;; LDFPD f126, f127 = [AOFFSET] adds AOFFSET = - 30 * SIZE, AOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; FNMA f80 = f74, f64, f80 FMA_A f81 = f75, f64, f81 FNMA f96 = f76, f64, f96 FMA_A f97 = f77, f64, f97 FNMA f112 = f78, f64, f112 FMA_A f113 = f79, f64, f113 ;; FMA_B f80 = f75, f65, f80 FNMA f81 = f74, f65, f81 FMA_B f96 = f77, f65, f96 FNMA f97 = f76, f65, f97 FMA_B f112 = f79, f65, f112 FNMA f113 = f78, f65, f113 ;; FMPY f32 = f90, f80 FMPY f33 = f91, f80 ;; FMA_C f80 = f91, f81, f32 FMA_D f81 = f90, f81, f33 ;; FNMA f96 = f92, f80, f96 FMA_A f97 = f93, f80, f97 FNMA f112 = f94, f80, f112 FMA_A f113 = f95, f80, f113 ;; FMA_B f96 = f93, f81, f96 FNMA f97 = f92, f81, f97 FMA_B f112 = f95, f81, f112 FNMA f113 = f94, f81, f113 ;; FMPY f32 = f108, f96 FMPY f33 = f109, f96 ;; FMA_C f96 = f109, f97, f32 FMA_D f97 = f108, f97, f33 ;; FNMA f112 = f110, f96, f112 FMA_A f113 = f111, f96, f113 ;; FMA_B f112 = f111, f97, f112 FNMA f113 = f110, f97, f113 ;; FMPY f32 = f126, f112 FMPY f33 = f127, f112 ;; FMA_C f112 = f127, f113, f32 FMA_D f113 = f126, f113, f33 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f80 FMPY f35 = f73, f80 FMPY f36 = f72, f96 FMPY f37 = f73, f96 FMPY f38 = f72, f112 FMPY f39 = f73, f112 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f80 = f73, f81, f34 FMA_D f81 = f72, f81, f35 FMA_C f96 = f73, f97, f36 FMA_D f97 = f72, f97, f37 FMA_C f112 = f73, f113, f38 FMA_D f113 = f72, f113, f39 ;; #endif #ifdef RT LDFPD f72, f73 = [BOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f80 FMPY f35 = f73, f80 FMPY f36 = f72, f96 FMPY f37 = f73, f96 FMPY f38 = f72, f112 FMPY f39 = f73, f112 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f80 = f73, f81, f34 FMA_D f81 = f72, f81, f35 FMA_C f96 = f73, f97, f36 FMA_D f97 = f72, f97, f37 FMA_C f112 = f73, f113, f38 FMA_D f113 = f72, f113, f39 ;; #endif #if defined(LN) || defined(LT) adds BOFFSET2 = 4 * SIZE, BOFFSET ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f97, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE ;; STFD [BOFFSET] = f81, 5 * SIZE STFD [BOFFSET2] = f113, 5 * SIZE ;; adds BOFFSET = - 8 * SIZE, BOFFSET ;; #else adds AOFFSET2 = 4 * SIZE, AOFFSET ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f96, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f97, SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f81, 5 * SIZE STFD [AOFFSET2] = f113, 5 * SIZE ;; adds AOFFSET = - 8 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -8 * SIZE, C1 adds C5 = -8 * SIZE, C5 #endif ;; STFD [C1 ] = f64, SIZE STFD [C5 ] = f96, SIZE ;; STFD [C1 ] = f65, SIZE STFD [C5 ] = f97, SIZE ;; STFD [C1 ] = f80, SIZE STFD [C5 ] = f112, SIZE ;; STFD [C1 ] = f81, 5 * SIZE STFD [C5 ] = f113, 5 * SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;; #ifdef LN adds C1 = -8 * SIZE, C1 adds C5 = -8 * SIZE, C5 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I 
= -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 2, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; shladd AOFFSET = L, 2, AOFFSET add BOFFSET = L, BOFFSET #endif ;; #ifdef LT adds KK = 4, KK #elif defined LN adds KK = -4, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; (p6) br.cond.dptk .L092 ;; .align 16 .L100: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L110 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 1 + ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f67 = f0 } ;; #else { .mfi add BOFFSET = r3, B mov f66 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 shladd AOFFSET = r3, 1, AORIG } ;; #endif ;; adds L = 1, L ;; { .mii (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE nop __LINE__ adds L = -1, L } ;; { .mmi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 mov ar.lc = L } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L108 ;; .align 16 .L102: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET FMA f80 = f32, f49, f80 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfb lfetch.nt1 [PREB], 4 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f81 = f33, f49, f81 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f34, f48, f96 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f112 = f34, f49, f112 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f97 = f35, f48, f97 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f113 = f35, f49, f113 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f80 = f40, f57, f80 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f81 = f41, f57, f81 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f42, f56, f96 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f112 = f42, f57, f112 // A3 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f97 = f43, f56, f97 // A4 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f113 = f43, f57, f113 // A4 * B2 br.cloop.sptk.few .L102 } ;; { .mfb nop __LINE__ FCALC_A f64 = f64, f81 nop __LINE__ } { .mfb nop __LINE__ FCALC_B f65 = f65, f80 nop __LINE__ } { .mfb nop __LINE__ FCALC_A f96 = f96, f113 nop __LINE__ } { .mfb nop __LINE__ FCALC_B f97 = f97, f112 nop __LINE__ } ;; .L108: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -2, KK #else adds r2 = -1, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; shladd AOFFSET = r2, 1, AORIG add BOFFSET = r2, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET] adds BOFFSET = -2 * SIZE, BOFFSET ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 FSUB f96 = f88, f96 FSUB_A f97 
= f89, f97 ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET] adds AOFFSET = -2 * SIZE, AOFFSET ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 FSUB f96 = f88, f96 FSUB f97 = f89, f97 ;; #endif #ifdef LN adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f104, f105 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f106, f107 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f104, f96 FMPY f33 = f105, f96 ;; FMA_C f96 = f105, f97, f32 FMA_D f97 = f104, f97, f33 ;; FNMA f64 = f106, f96, f64 FMA_A f65 = f107, f96, f65 ;; FMA_B f64 = f107, f97, f64 FNMA f65 = f106, f97, f65 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 ;; #endif #ifdef LT LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; FNMA f96 = f74, f64, f96 FMA_A f97 = f75, f64, f97 ;; FMA_B f96 = f75, f65, f96 FNMA f97 = f74, f65, f97 ;; FMPY f32 = f90, f96 FMPY f33 = f91, f96 ;; FMA_C f96 = f91, f97, f32 FMA_D f97 = f90, f97, f33 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f36 = f72, f96 FMPY f37 = f73, f96 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f96 = f73, f97, f36 FMA_D f97 = f72, f97, f37 ;; #endif #ifdef RT LDFPD f72, f73 = [BOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f36 = f72, f96 FMPY f37 = f73, f96 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f96 = f73, f97, f36 FMA_D f97 = f72, f97, f37 ;; #endif #if defined(LN) || defined(LT) STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f65, SIZE ;; STFD [BOFFSET] = f96, SIZE ;; STFD [BOFFSET] = f97, SIZE ;; adds BOFFSET = - 4 * SIZE, BOFFSET ;; #else adds AOFFSET2 = 4 * SIZE, AOFFSET ;; STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, SIZE ;; STFD [AOFFSET] = f96, SIZE ;; STFD [AOFFSET] = f97, SIZE ;; adds AOFFSET = - 4 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -4 * SIZE, C1 adds C5 = -4 * SIZE, C5 #endif ;; STFD [C1 ] = f64, SIZE ;; STFD [C1 ] = f65, SIZE ;; STFD [C1 ] = f96, SIZE ;; STFD [C1 ] = f97, SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;; #ifdef LN adds C1 = -4 * SIZE, C1 adds C5 = -4 * SIZE, C5 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 1, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; shladd AOFFSET = L, 1, AOFFSET add BOFFSET = L, BOFFSET #endif ;; #ifdef LT adds KK = 2, KK #elif defined LN adds KK = -2, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; .align 16 .L110: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L119 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f67 = f0 } ;; #else { .mfi add BOFFSET = r3, B mov f66 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 add AOFFSET = r3, AORIG } ;; #endif ;; 
adds L = 1, L ;; { .mii nop __LINE__ tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE cmp.eq p3, p0 = r0, r0 adds L = -1, L } ;; { .mmi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET mov ar.lc = L } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L118 ;; .align 16 .L112: { .mfi lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f80 = f32, f49, f80 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mmf (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } { .mmf nop __LINE__ nop __LINE__ FMA f81 = f33, f49, f81 // A2 * B2 } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f80 = f40, f57, f80 // A1 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 adds L = -1, L } { .mfb (p3) FMA f81 = f41, f57, f81 // A2 * B2 br.cloop.sptk.few .L112 } ;; { .mfb nop __LINE__ FCALC_A f64 = f64, f81 nop __LINE__ } { .mfb nop __LINE__ FCALC_B f65 = f65, f80 nop __LINE__ } ;; .L118: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -1, KK #else adds r2 = -1, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; add AOFFSET = r2, AORIG add BOFFSET = r2, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET] ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 ;; #else LDFPD f72, f73 = [AOFFSET] ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 ;; #endif #ifdef LN LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 ;; #endif #ifdef LT LDFPD f72, f73 = [AOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; #endif #ifdef RT LDFPD f72, f73 = [BOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; #endif #if defined(LN) || defined(LT) STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f65, SIZE ;; adds BOFFSET = - 2 * SIZE, BOFFSET ;; #else STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, SIZE ;; adds AOFFSET = - 2 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -2 * SIZE, C1 #endif ;; STFD [C1 ] = f64, SIZE ;; STFD [C1 ] = f65, SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 ;; #ifdef LN adds C1 = -2 * SIZE, C1 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT add AORIG = r2, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; add AOFFSET = L, AOFFSET add BOFFSET = L, BOFFSET #endif ;; #ifdef LT adds KK = 1, KK #elif defined LN adds KK = -1, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif .align 16 .L119: #ifdef LN shladd KK8 = K, ZBASE_SHIFT, r0 ;; add B = KK8, B #endif #if defined(LT) || defined(RN) mov B = BOFFSET #endif #ifdef RN adds KK = 1, KK #endif #ifdef RT adds KK = -1, KK #endif ;; { .mmi mov AOFFSET = A nop __LINE__ } ;; .align 16 .L050: { .mmi shr I = M, 2 } { .mib tbit.z p6, p0 = N, 1 (p6) br.cond.dpnt .L010 } ;; #ifdef RT { .mmi shladd r3 = LDC, 1, r0 nop __LINE__ shl r2 = K, 1 + ZBASE_SHIFT } ;; { .mmi sub B = B, r2 sub C = C, r3 nop __LINE__ } ;; #endif mov C1 = C 
add C2 = LDC, C ;; #ifdef LN add KK = M, OFFSET #elif defined LT mov KK = OFFSET #else nop __LINE__ #endif ;; #if defined(LN) || defined(RT) mov AORIG = A #else mov AOFFSET = A #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; { .mib cmp.eq p6, p7 = 0, I #ifndef RT shladd C = LDC, 1, C #else nop __LINE__ #endif (p6) br.cond.dpnt .L060 } ;; .align 16 .L052: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 2 + ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f67 = f0 } ;; #else { .mfi shladd BOFFSET = r3, 1, B mov f66 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 shladd AOFFSET = r3, 2, AORIG } ;; #endif { .mfi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f82 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 nop __LINE__ } ;; { .mfi (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f98 = f0 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } { .mfi cmp.eq p3, p0 = r0, r0 mov f99 = f0 adds L = 1, L } ;; { .mfi (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE mov f114 = f0 tbit.z p12, p0 = L, 0 } { .mfi CPREFETCH [PREC], LDC mov f115 = f0 shr L = L, 1 } ;; { .mmi (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE adds C5 = 4 * SIZE, C1 adds L = -1, L } ;; { .mmi CPREFETCH [PREC], LDC adds C6 = 4 * SIZE, C2 mov ar.lc = L } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L058 ;; .align 16 .L053: { .mfb lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f96 = f34, f48, f96 // A3 * B1 nop __LINE__ } { .mfi FMA_B f97 = f34, f49, f97 // A3 * B2 nop __LINE__ } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f112 = f34, f50, f112 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f34, f51, f113 // A3 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f97 = f35, f48, f97 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f96 = f35, f49, f96 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f113 = f35, f50, f113 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f112 = f35, f51, f112 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f66 = f36, f48, f66 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f67 = f36, f49, f67 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f82 = f36, f50, f82 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f83 = f36, f51, f83 // A5 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f98 = f38, f48, f98 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f99 = f38, f49, f99 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f114 = f38, f50, f114 // A7 * B3 nop __LINE__ } { .mfb 
nop __LINE__ FMA_B f115 = f38, f51, f115 // A7 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f67 = f37, f48, f67 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f66 = f37, f49, f66 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f83 = f37, f50, f83 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f82 = f37, f51, f82 // A6 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f99 = f39, f48, f99 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f98 = f39, f49, f98 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f115 = f39, f50, f115 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f114 = f39, f51, f114 // A8 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f96 = f42, f56, f96 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f112 = f42, f58, f112 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f43, f56, f97 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f113 = f43, f58, f113 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f66 = f44, f56, f66 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f67 = f44, f57, f67 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f82 = f44, f58, f82 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f83 = f44, f59, f83 // A5 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f98 = f46, f56, f98 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f99 = f46, f57, f99 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f114 = f46, f58, f114 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f115 = f46, f59, f115 // A7 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f67 = f45, f56, f67 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f66 = f45, f57, f66 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f83 = f45, f58, f83 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f82 = f45, f59, f82 // A6 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f99 = f47, f56, f99 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f98 = f47, f57, f98 // A8 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f115 = f47, f58, f115 // A8 * B3 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f114 = f47, f59, f114 // A8 * B4 br.cloop.sptk.few .L053 } ;; .L058: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -4, KK #else adds r2 = -2, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 1, B ;; #endif #if defined(LN) || 
defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [BOFFSET], 2 * SIZE ;; LDFPD f104, f105 = [BOFFSET], 2 * SIZE ;; LDFPD f106, f107 = [BOFFSET], 2 * SIZE ;; LDFPD f120, f121 = [BOFFSET], 2 * SIZE ;; LDFPD f122, f123 = [BOFFSET] adds BOFFSET = -14 * SIZE, BOFFSET ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 FSUB f80 = f74, f80 FSUB_A f81 = f75, f81 FSUB f96 = f88, f96 FSUB_A f97 = f89, f97 FSUB f112 = f90, f112 FSUB_A f113 = f91, f113 FSUB f66 = f104, f66 FSUB_A f67 = f105, f67 FSUB f82 = f106, f82 FSUB_A f83 = f107, f83 FSUB f98 = f120, f98 FSUB_A f99 = f121, f99 FSUB f114 = f122, f114 FSUB_A f115 = f123, f115 ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [AOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [AOFFSET], 2 * SIZE ;; LDFPD f92, f93 = [AOFFSET], 2 * SIZE ;; LDFPD f94, f95 = [AOFFSET] adds AOFFSET = -14 * SIZE, AOFFSET ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 FSUB f96 = f74, f96 FSUB f97 = f75, f97 FSUB f66 = f76, f66 FSUB f67 = f77, f67 FSUB f98 = f78, f98 FSUB f99 = f79, f99 FSUB f80 = f88, f80 FSUB f81 = f89, f81 FSUB f112 = f90, f112 FSUB f113 = f91, f113 FSUB f82 = f92, f82 FSUB f83 = f93, f83 FSUB f114 = f94, f114 FSUB f115 = f95, f115 ;; #endif #ifdef LN adds AOFFSET = 30 * SIZE, AOFFSET ;; LDFPD f72, f73 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f74, f75 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f76, f77 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f78, f79 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f88, f89 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f92, f93 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; LDFPD f104, f105 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f106, f107 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f72, f98 FMPY f33 = f73, f98 FMPY f34 = f72, f114 FMPY f35 = f73, f114 ;; FMA_C f98 = f73, f99, f32 FMA_D f99 = f72, f99, f33 FMA_C f114 = f73, f115, f34 FMA_D f115 = f72, f115, f35 ;; FNMA f66 = f74, f98, f66 FMA_A f67 = f75, f98, f67 FNMA f82 = f74, f114, f82 FMA_A f83 = f75, f114, f83 ;; FMA_B f66 = f75, f99, f66 FNMA f67 = f74, f99, f67 FMA_B f82 = f75, f115, f82 FNMA f83 = f74, f115, f83 ;; FNMA f96 = f76, f98, f96 FMA_A f97 = f77, f98, f97 FNMA f112 = f76, f114, f112 FMA_A f113 = f77, f114, f113 ;; FMA_B f96 = f77, f99, f96 FNMA f97 = f76, f99, f97 FMA_B f112 = f77, f115, f112 FNMA f113 = f76, f115, f113 ;; FNMA f64 = f78, f98, f64 FMA_A f65 = f79, f98, f65 FNMA f80 = f78, f114, f80 FMA_A f81 = f79, f114, f81 ;; FMA_B f64 = f79, f99, f64 FNMA f65 = f78, f99, f65 FMA_B f80 = f79, f115, f80 FNMA f81 = f78, f115, f81 ;; FMPY f32 = f88, f66 FMPY f33 = f89, f66 FMPY f34 = f88, f82 FMPY f35 = f89, f82 ;; FMA_C f66 = f89, f67, f32 FMA_D f67 = f88, f67, f33 FMA_C f82 = f89, f83, f34 FMA_D f83 = f88, f83, f35 ;; FNMA f96 = f90, f66, f96 FMA_A f97 = f91, f66, f97 FNMA f112 = f90, f82, f112 FMA_A f113 = f91, f82, f113 ;; FMA_B f96 = f91, f67, f96 FNMA f97 = f90, f67, f97 FMA_B f112 = f91, f83, f112 FNMA f113 = f90, f83, f113 ;; FNMA f64 = f92, f66, f64 FMA_A f65 = f93, f66, f65 FNMA f80 = f92, f82, f80 FMA_A f81 = f93, f82, f81 ;; FMA_B f64 = f93, f67, f64 FNMA f65 = f92, f67, f65 FMA_B f80 = f93, f83, f80 FNMA f81 = f92, f83, f81 ;; FMPY f32 = 
f104, f96 FMPY f33 = f105, f96 FMPY f34 = f104, f112 FMPY f35 = f105, f112 ;; FMA_C f96 = f105, f97, f32 FMA_D f97 = f104, f97, f33 FMA_C f112 = f105, f113, f34 FMA_D f113 = f104, f113, f35 ;; FNMA f64 = f106, f96, f64 FMA_A f65 = f107, f96, f65 FNMA f80 = f106, f112, f80 FMA_A f81 = f107, f112, f81 ;; FMA_B f64 = f107, f97, f64 FNMA f65 = f106, f97, f65 FMA_B f80 = f107, f113, f80 FNMA f81 = f106, f113, f81 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f80 FMPY f35 = f121, f80 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f80 = f121, f81, f34 FMA_D f81 = f120, f81, f35 ;; #endif #ifdef LT LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [AOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET], 2 * SIZE ;; LDFPD f92, f93 = [AOFFSET], 2 * SIZE ;; LDFPD f94, f95 = [AOFFSET] adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f108, f109 = [AOFFSET], 2 * SIZE ;; LDFPD f110, f111 = [AOFFSET] adds AOFFSET = 8 * SIZE, AOFFSET ;; LDFPD f126, f127 = [AOFFSET] adds AOFFSET = - 30 * SIZE, AOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f80 FMPY f35 = f73, f80 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f80 = f73, f81, f34 FMA_D f81 = f72, f81, f35 ;; FNMA f96 = f74, f64, f96 FMA_A f97 = f75, f64, f97 FNMA f112 = f74, f80, f112 FMA_A f113 = f75, f80, f113 ;; FMA_B f96 = f75, f65, f96 FNMA f97 = f74, f65, f97 FMA_B f112 = f75, f81, f112 FNMA f113 = f74, f81, f113 ;; FNMA f66 = f76, f64, f66 FMA_A f67 = f77, f64, f67 FNMA f82 = f76, f80, f82 FMA_A f83 = f77, f80, f83 ;; FMA_B f66 = f77, f65, f66 FNMA f67 = f76, f65, f67 FMA_B f82 = f77, f81, f82 FNMA f83 = f76, f81, f83 ;; FNMA f98 = f78, f64, f98 FMA_A f99 = f79, f64, f99 FNMA f114 = f78, f80, f114 FMA_A f115 = f79, f80, f115 ;; FMA_B f98 = f79, f65, f98 FNMA f99 = f78, f65, f99 FMA_B f114 = f79, f81, f114 FNMA f115 = f78, f81, f115 ;; FMPY f32 = f90, f96 FMPY f33 = f91, f96 FMPY f34 = f90, f112 FMPY f35 = f91, f112 ;; FMA_C f96 = f91, f97, f32 FMA_D f97 = f90, f97, f33 FMA_C f112 = f91, f113, f34 FMA_D f113 = f90, f113, f35 ;; FNMA f66 = f92, f96, f66 FMA_A f67 = f93, f96, f67 FNMA f82 = f92, f112, f82 FMA_A f83 = f93, f112, f83 ;; FMA_B f66 = f93, f97, f66 FNMA f67 = f92, f97, f67 FMA_B f82 = f93, f113, f82 FNMA f83 = f92, f113, f83 ;; FNMA f98 = f94, f96, f98 FMA_A f99 = f95, f96, f99 FNMA f114 = f94, f112, f114 FMA_A f115 = f95, f112, f115 ;; FMA_B f98 = f95, f97, f98 FNMA f99 = f94, f97, f99 FMA_B f114 = f95, f113, f114 FNMA f115 = f94, f113, f115 ;; FMPY f32 = f108, f66 FMPY f33 = f109, f66 FMPY f34 = f108, f82 FMPY f35 = f109, f82 ;; FMA_C f66 = f109, f67, f32 FMA_D f67 = f108, f67, f33 FMA_C f82 = f109, f83, f34 FMA_D f83 = f108, f83, f35 ;; FNMA f98 = f110, f66, f98 FMA_A f99 = f111, f66, f99 FNMA f114 = f110, f82, f114 FMA_A f115 = f111, f82, f115 ;; FMA_B f98 = f111, f67, f98 FNMA f99 = f110, f67, f99 FMA_B f114 = f111, f83, f114 FNMA f115 = f110, f83, f115 ;; FMPY f32 = f126, f98 FMPY f33 = f127, f98 FMPY f34 = f126, f114 FMPY f35 = f127, f114 ;; FMA_C f98 = f127, f99, f32 FMA_D f99 = f126, f99, f33 FMA_C f114 = f127, f115, f34 FMA_D f115 = f126, f115, f35 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f96 FMPY f35 = f73, f96 FMPY f36 = f72, f66 FMPY f37 = f73, f66 FMPY f38 = f72, f98 
FMPY f39 = f73, f98 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f96 = f73, f97, f34 FMA_D f97 = f72, f97, f35 FMA_C f66 = f73, f67, f36 FMA_D f67 = f72, f67, f37 FMA_C f98 = f73, f99, f38 FMA_D f99 = f72, f99, f39 ;; FNMA f80 = f74, f64, f80 FMA_A f81 = f75, f64, f81 FNMA f112 = f74, f96, f112 FMA_A f113 = f75, f96, f113 FNMA f82 = f74, f66, f82 FMA_A f83 = f75, f66, f83 FNMA f114 = f74, f98, f114 FMA_A f115 = f75, f98, f115 ;; FMA_B f80 = f75, f65, f80 FNMA f81 = f74, f65, f81 FMA_B f112 = f75, f97, f112 FNMA f113 = f74, f97, f113 FMA_B f82 = f75, f67, f82 FNMA f83 = f74, f67, f83 FMA_B f114 = f75, f99, f114 FNMA f115 = f74, f99, f115 ;; FMPY f32 = f90, f80 FMPY f33 = f91, f80 FMPY f34 = f90, f112 FMPY f35 = f91, f112 FMPY f36 = f90, f82 FMPY f37 = f91, f82 FMPY f38 = f90, f114 FMPY f39 = f91, f114 ;; FMA_C f80 = f91, f81, f32 FMA_D f81 = f90, f81, f33 FMA_C f112 = f91, f113, f34 FMA_D f113 = f90, f113, f35 FMA_C f82 = f91, f83, f36 FMA_D f83 = f90, f83, f37 FMA_C f114 = f91, f115, f38 FMA_D f115 = f90, f115, f39 ;; #endif #ifdef RT adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f104, f105 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f106, f107 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f120, f121 = [BOFFSET] ;; FMPY f32 = f104, f80 FMPY f33 = f105, f80 FMPY f34 = f104, f112 FMPY f35 = f105, f112 FMPY f36 = f104, f82 FMPY f37 = f105, f82 FMPY f38 = f104, f114 FMPY f39 = f105, f114 ;; FMA_C f80 = f105, f81, f32 FMA_D f81 = f104, f81, f33 FMA_C f112 = f105, f113, f34 FMA_D f113 = f104, f113, f35 FMA_C f82 = f105, f83, f36 FMA_D f83 = f104, f83, f37 FMA_C f114 = f105, f115, f38 FMA_D f115 = f104, f115, f39 ;; FNMA f64 = f106, f80, f64 FMA_A f65 = f107, f80, f65 FNMA f96 = f106, f112, f96 FMA_A f97 = f107, f112, f97 FNMA f66 = f106, f82, f66 FMA_A f67 = f107, f82, f67 FNMA f98 = f106, f114, f98 FMA_A f99 = f107, f114, f99 ;; FMA_B f64 = f107, f81, f64 FNMA f65 = f106, f81, f65 FMA_B f96 = f107, f113, f96 FNMA f97 = f106, f113, f97 FMA_B f66 = f107, f83, f66 FNMA f67 = f106, f83, f67 FMA_B f98 = f107, f115, f98 FNMA f99 = f106, f115, f99 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f96 FMPY f35 = f121, f96 FMPY f36 = f120, f66 FMPY f37 = f121, f66 FMPY f38 = f120, f98 FMPY f39 = f121, f98 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f96 = f121, f97, f34 FMA_D f97 = f120, f97, f35 FMA_C f66 = f121, f67, f36 FMA_D f67 = f120, f67, f37 FMA_C f98 = f121, f99, f38 FMA_D f99 = f120, f99, f39 ;; #endif #if defined(LN) || defined(LT) adds BOFFSET2 = 4 * SIZE, BOFFSET ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f97, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE ;; STFD [BOFFSET] = f81, 5 * SIZE STFD [BOFFSET2] = f113, 5 * SIZE ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f98, SIZE ;; STFD [BOFFSET] = f67, SIZE STFD [BOFFSET2] = f99, SIZE ;; STFD [BOFFSET] = f82, SIZE STFD [BOFFSET2] = f114, SIZE ;; STFD [BOFFSET] = f83, 5 * SIZE STFD [BOFFSET2] = f115, 5 * SIZE ;; adds BOFFSET = - 16 * SIZE, BOFFSET ;; #else adds AOFFSET2 = 4 * SIZE, AOFFSET ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f66, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f67, SIZE ;; STFD [AOFFSET] = f96, SIZE STFD [AOFFSET2] = f98, SIZE ;; STFD [AOFFSET] = f97, 5 * SIZE STFD [AOFFSET2] = f99, 5 * SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f82, SIZE ;; STFD [AOFFSET] = f81, SIZE STFD [AOFFSET2] = f83, SIZE ;; STFD [AOFFSET] = f112, SIZE STFD [AOFFSET2] = f114, SIZE 
;; STFD [AOFFSET] = f113, 5 * SIZE STFD [AOFFSET2] = f115, 5 * SIZE ;; adds AOFFSET = - 16 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -8 * SIZE, C1 adds C2 = -8 * SIZE, C2 adds C5 = -8 * SIZE, C5 adds C6 = -8 * SIZE, C6 #endif ;; STFD [C1 ] = f64, SIZE STFD [C5 ] = f66, SIZE ;; STFD [C1 ] = f65, SIZE STFD [C5 ] = f67, SIZE ;; STFD [C1 ] = f96, SIZE STFD [C5 ] = f98, SIZE ;; STFD [C1 ] = f97, 5 * SIZE STFD [C5 ] = f99, 5 * SIZE ;; STFD [C2 ] = f80, SIZE STFD [C6 ] = f82, SIZE ;; STFD [C2 ] = f81, SIZE STFD [C6 ] = f83, SIZE ;; STFD [C2 ] = f112, SIZE STFD [C6 ] = f114, SIZE ;; STFD [C2 ] = f113, 5 * SIZE STFD [C6 ] = f115, 5 * SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;; #ifdef LN adds C1 = -8 * SIZE, C1 adds C2 = -8 * SIZE, C2 adds C5 = -8 * SIZE, C5 adds C6 = -8 * SIZE, C6 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 2, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; shladd AOFFSET = L, 2, AOFFSET shladd BOFFSET = L, 1, BOFFSET #endif ;; #ifdef LT adds KK = 4, KK #elif defined LN adds KK = -4, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; (p6) br.cond.dptk .L052 ;; .align 16 .L060: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L070 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 1 + ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE } ;; #else { .mfi shladd BOFFSET = r3, 1, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE shladd AOFFSET = r3, 1, AORIG } ;; #endif ;; adds L = 1, L ;; { .mmi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET tbit.z p12, p0 = L, 0 } { .mmi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE shr L = L, 1 } ;; { .mmi (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE nop __LINE__ adds L = -1, L } ;; { .mmi nop __LINE__ nop __LINE__ mov ar.lc = L } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L068 ;; .align 16 .L062: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfb lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f34, f48, f96 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f97 = f34, f49, f97 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f34, f50, f112 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f34, f51, f113 // A3 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f97 = f35, f48, f97 // A4 * B1 } { .mfb FMA_A f96 = f35, f49, f96 // A4 * B2 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE 
FMA f113 = f35, f50, f113 // A4 * B3 nop __LINE__ } { .mfb FMA_A f112 = f35, f51, f112 // A4 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f96 = f42, f56, f96 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f112 = f42, f58, f112 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f43, f56, f97 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f113 = f43, f58, f113 // A4 * B3 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 br.cloop.sptk.few .L062 } ;; .L068: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -2, KK #else adds r2 = -2, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 1, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 FSUB f80 = f74, f80 FSUB_A f81 = f75, f81 FSUB f96 = f88, f96 FSUB_A f97 = f89, f97 FSUB f112 = f90, f112 FSUB_A f113 = f91, f113 ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 FSUB f96 = f74, f96 FSUB f97 = f75, f97 FSUB f80 = f88, f80 FSUB f81 = f89, f81 FSUB f112 = f90, f112 FSUB f113 = f91, f113 ;; #endif #ifdef LN adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f104, f105 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f106, f107 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f104, f96 FMPY f33 = f105, f96 FMPY f34 = f104, f112 FMPY f35 = f105, f112 ;; FMA_C f96 = f105, f97, f32 FMA_D f97 = f104, f97, f33 FMA_C f112 = f105, f113, f34 FMA_D f113 = f104, f113, f35 ;; FNMA f64 = f106, f96, f64 FMA_A f65 = f107, f96, f65 FNMA f80 = f106, f112, f80 FMA_A f81 = f107, f112, f81 ;; FMA_B f64 = f107, f97, f64 FNMA f65 = f106, f97, f65 FMA_B f80 = f107, f113, f80 FNMA f81 = f106, f113, f81 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f80 FMPY f35 = f121, f80 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f80 = f121, f81, f34 FMA_D f81 = f120, f81, f35 ;; #endif #ifdef LT LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f80 FMPY f35 = f73, f80 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f80 = f73, f81, f34 FMA_D f81 
= f72, f81, f35 ;; FNMA f96 = f74, f64, f96 FMA_A f97 = f75, f64, f97 FNMA f112 = f74, f80, f112 FMA_A f113 = f75, f80, f113 ;; FMA_B f96 = f75, f65, f96 FNMA f97 = f74, f65, f97 FMA_B f112 = f75, f81, f112 FNMA f113 = f74, f81, f113 ;; FMPY f32 = f90, f96 FMPY f33 = f91, f96 FMPY f34 = f90, f112 FMPY f35 = f91, f112 ;; FMA_C f96 = f91, f97, f32 FMA_D f97 = f90, f97, f33 FMA_C f112 = f91, f113, f34 FMA_D f113 = f90, f113, f35 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f96 FMPY f35 = f73, f96 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f96 = f73, f97, f34 FMA_D f97 = f72, f97, f35 ;; FNMA f80 = f74, f64, f80 FMA_A f81 = f75, f64, f81 FNMA f112 = f74, f96, f112 FMA_A f113 = f75, f96, f113 ;; FMA_B f80 = f75, f65, f80 FNMA f81 = f74, f65, f81 FMA_B f112 = f75, f97, f112 FNMA f113 = f74, f97, f113 ;; FMPY f32 = f90, f80 FMPY f33 = f91, f80 FMPY f34 = f90, f112 FMPY f35 = f91, f112 ;; FMA_C f80 = f91, f81, f32 FMA_D f81 = f90, f81, f33 FMA_C f112 = f91, f113, f34 FMA_D f113 = f90, f113, f35 ;; #endif #ifdef RT adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f104, f105 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f106, f107 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f120, f121 = [BOFFSET] ;; FMPY f32 = f104, f80 FMPY f33 = f105, f80 FMPY f34 = f104, f112 FMPY f35 = f105, f112 ;; FMA_C f80 = f105, f81, f32 FMA_D f81 = f104, f81, f33 FMA_C f112 = f105, f113, f34 FMA_D f113 = f104, f113, f35 ;; FNMA f64 = f106, f80, f64 FMA_A f65 = f107, f80, f65 FNMA f96 = f106, f112, f96 FMA_A f97 = f107, f112, f97 ;; FMA_B f64 = f107, f81, f64 FNMA f65 = f106, f81, f65 FMA_B f96 = f107, f113, f96 FNMA f97 = f106, f113, f97 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f96 FMPY f35 = f121, f96 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f96 = f121, f97, f34 FMA_D f97 = f120, f97, f35 ;; #endif #if defined(LN) || defined(LT) adds BOFFSET2 = 4 * SIZE, BOFFSET ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f97, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE ;; STFD [BOFFSET] = f81, 5 * SIZE STFD [BOFFSET2] = f113, 5 * SIZE ;; adds BOFFSET = - 8 * SIZE, BOFFSET ;; #else adds AOFFSET2 = 4 * SIZE, AOFFSET ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f80, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f81, SIZE ;; STFD [AOFFSET] = f96, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f97, 5 * SIZE STFD [AOFFSET2] = f113, 5 * SIZE ;; adds AOFFSET = - 8 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -4 * SIZE, C1 adds C2 = -4 * SIZE, C2 #endif ;; STFD [C1 ] = f64, SIZE ;; STFD [C1 ] = f65, SIZE ;; STFD [C1 ] = f96, SIZE ;; STFD [C1 ] = f97, SIZE ;; STFD [C2 ] = f80, SIZE ;; STFD [C2 ] = f81, SIZE ;; STFD [C2 ] = f112, SIZE ;; STFD [C2 ] = f113, SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;; #ifdef LN adds C1 = -4 * SIZE, C1 adds C2 = -4 * SIZE, C2 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 1, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; shladd AOFFSET = L, 1, AOFFSET shladd BOFFSET = L, 1, BOFFSET #endif ;; #ifdef LT adds KK = 2, KK #elif defined LN adds KK = -2, KK #else nop 
__LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; .align 16 .L070: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L089 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE } ;; #else { .mfi shladd BOFFSET = r3, 1, B #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE add AOFFSET = r3, AORIG } ;; #endif ;; adds L = 1, L ;; { .mii (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET adds L = -1, L } ;; { .mmi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET cmp.eq p3, p0 = r0, r0 mov ar.lc = L } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L078 ;; .align 16 .L072: { .mfb lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f96 = f32, f49, f96 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f112 = f32, f51, f112 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 } { .mfi nop __LINE__ FMA f97 = f33, f49, f97 // A2 * B2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 } { .mmf nop __LINE__ nop __LINE__ FMA f113 = f33, f51, f113 // A2 * B4 } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ (p3) FMA f96 = f40, f57, f96 // A1 * B2 } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ (p3) FMA f112 = f40, f59, f112 // A1 * B4 } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f97 = f41, f57, f97 // A2 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f113 = f41, f59, f113 // A2 * B4 br.cloop.sptk.few .L072 } ;; { .mfb nop __LINE__ FCALC_A f64 = f64, f97 nop __LINE__ } { .mfb nop __LINE__ FCALC_A f80 = f80, f113 nop __LINE__ } { .mfb nop __LINE__ FCALC_B f65 = f65, f96 nop __LINE__ } { .mfb nop __LINE__ FCALC_B f81 = f81, f112 nop __LINE__ } ;; .L078: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -1, KK #else adds r2 = -2, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 1, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET] adds BOFFSET = -2 * SIZE, BOFFSET ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 FSUB f80 = f74, f80 FSUB_A f81 = f75, f81 ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET] adds AOFFSET = -2 * SIZE, AOFFSET ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 FSUB f80 = f88, f80 FSUB f81 = f89, f81 ;; #endif #ifdef LN LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f80 FMPY f35 = f121, f80 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f80 = f121, f81, f34 FMA_D f81 = f120, f81, f35 ;; #endif #ifdef LT LDFPD 
f72, f73 = [AOFFSET] ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f80 FMPY f35 = f73, f80 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f80 = f73, f81, f34 FMA_D f81 = f72, f81, f35 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; FNMA f80 = f74, f64, f80 FMA_A f81 = f75, f64, f81 ;; FMA_B f80 = f75, f65, f80 FNMA f81 = f74, f65, f81 ;; FMPY f32 = f90, f80 FMPY f33 = f91, f80 ;; FMA_C f80 = f91, f81, f32 FMA_D f81 = f90, f81, f33 ;; #endif #ifdef RT adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f104, f105 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f106, f107 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f120, f121 = [BOFFSET] ;; FMPY f32 = f104, f80 FMPY f33 = f105, f80 ;; FMA_C f80 = f105, f81, f32 FMA_D f81 = f104, f81, f33 ;; FNMA f64 = f106, f80, f64 FMA_A f65 = f107, f80, f65 ;; FMA_B f64 = f107, f81, f64 FNMA f65 = f106, f81, f65 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 ;; #endif #if defined(LN) || defined(LT) STFD [BOFFSET] = f64, SIZE ;; STFD [BOFFSET] = f65, SIZE ;; STFD [BOFFSET] = f80, SIZE ;; STFD [BOFFSET] = f81, SIZE ;; adds BOFFSET = - 4 * SIZE, BOFFSET ;; #else STFD [AOFFSET] = f64, SIZE ;; STFD [AOFFSET] = f65, SIZE ;; STFD [AOFFSET] = f80, SIZE ;; STFD [AOFFSET] = f81, SIZE ;; adds AOFFSET = - 4 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -2 * SIZE, C1 adds C2 = -2 * SIZE, C2 #endif ;; STFD [C1 ] = f64, SIZE ;; STFD [C1 ] = f65, SIZE ;; STFD [C2 ] = f80, SIZE ;; STFD [C2 ] = f81, SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;; #ifdef LN adds C1 = -2 * SIZE, C1 adds C2 = -2 * SIZE, C2 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT add AORIG = r2, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; add AOFFSET = L, AOFFSET shladd BOFFSET = L, 1, BOFFSET #endif ;; #ifdef LT adds KK = 1, KK #elif defined LN adds KK = -1, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; .align 16 .L089: #ifdef LN shladd KK8 = K, ZBASE_SHIFT, r0 ;; shladd B = KK8, 1, B #endif #if defined(LT) || defined(RN) mov B = BOFFSET #endif #ifdef RN adds KK = 2, KK #endif #ifdef RT adds KK = -2, KK #endif ;; { .mmi mov AOFFSET = A nop __LINE__ } ;; .align 16 .L010: shr J = N, 2 ;; cmp.ge p6, p0 = 0, J (p6) br.cond.dpnt .L999 ;; .L010x: #ifdef RT { .mmi shladd r3 = LDC, 2, r0 nop __LINE__ shl r2 = K, 2 + ZBASE_SHIFT } ;; { .mmi sub B = B, r2 sub C = C, r3 nop __LINE__ } ;; #endif { .mmi mov C1 = C // coffset1 = c + 0 * ldc add C2 = LDC, C // coffset2 = c + 1 * ldc shr I = M, 2 } { .mmi adds J = -1, J #ifdef LN add KK = M, OFFSET #elif defined LT mov KK = OFFSET #else nop __LINE__ #endif #if defined(LN) || defined(RT) mov AORIG = A #else mov AOFFSET = A #endif } ;; ;; { .mmi shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif } { .mib cmp.eq p6, p7 = 0, I #ifndef RT shladd C = LDC, 2, C // coffset += 8 * ldc #else nop __LINE__ #endif (p6) br.cond.dpnt .L020 } ;; .align 16 .L011: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 
= K, 2 + ZBASE_SHIFT } { .mfi shladd r3 = KK, ZBASE_SHIFT, r0 mov f118 = f0 nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f67 = f0 } ;; #else { .mfi shladd BOFFSET = r3, 2, B mov f66 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 shladd AOFFSET = r3, 2, AORIG } ;; #endif ;; { .mfi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f82 = f0 nop __LINE__ } { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mfi (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f98 = f0 adds L = 1, L } { .mfi (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE mov f99 = f0 adds C5 = 4 * SIZE, C1 } ;; { .mfi (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE mov f114 = f0 tbit.z p12, p0 = L, 0 } { .mfi (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f115 = f0 adds C6 = 4 * SIZE, C2 } ;; { .mfi (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE mov f68 = f0 shr L = L, 1 } { .mfi setf.d f86 = r0 mov f69 = f0 adds C7 = 4 * SIZE, C3 } ;; { .mfi CPREFETCH [PREC], LDC mov f84 = f0 adds L = -1, L } { .mfi setf.d f87 = r0 mov f85 = f0 adds C8 = 4 * SIZE, C4 } ;; { .mfi CPREFETCH [PREC], LDC mov f100 = f0 mov ar.lc = L } { .mfi setf.d f102 = r0 mov f101 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi CPREFETCH [PREC], LDC mov f116 = f0 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET } { .mfi setf.d f103 = r0 mov f117 = f0 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } ;; { .mfi CPREFETCH [PREC] mov f70 = f0 cmp.eq p6, p0 = -1, L } { .mfb setf.d f119 = r0 mov f71 = f0 (p6) br.cond.dpnt .L018 } ;; .align 16 .L012: /* 1 */ { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfb (p12) cmp.ne p3, p0 = 0, L FMA_B f65 = f32, f49, f65 // A1 * B2 nop __LINE__ } ;; /* 2 */ { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfb cmp.ne p4, p5 = 0, L FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;; /* 3 */ { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb FMA_B f97 = f32, f53, f97 // A1 * B6 nop __LINE__ } ;; /* 4 */ { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb FMA_B f113 = f32, f55, f113 // A1 * B8 nop __LINE__ } ;; /* 5 */ { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; /* 6 */ { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;; /* 7 */ { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb FMA_A f96 = f33, f53, f96 // A2 * B6 nop __LINE__ } ;; /* 8 */ { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb FMA_A f112 = f33, f55, f112 // A2 * B8 nop __LINE__ } ;; /* 9 */ { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb FMA_B f67 = f34, f49, f67 // A3 * B2 nop __LINE__ } ;; /* 10 */ { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb FMA_B f83 = f34, f51, f83 // A3 * B4 nop __LINE__ } ;; /* 11 */ { .mfb FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f99 = f34, f53, f99 // 
A3 * B6 nop __LINE__ } ;; /* 12 */ { .mfb FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f115 = f34, f55, f115 // A3 * B8 nop __LINE__ } ;; /* 13 */ { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 } { .mfb nop __LINE__ FMA_A f66 = f35, f49, f66 // A4 * B2 nop __LINE__ } ;; /* 14 */ { .mfb FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f82 = f35, f51, f82 // A4 * B4 nop __LINE__ } ;; /* 15 */ { .mfb FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f98 = f35, f53, f98 // A4 * B6 nop __LINE__ } ;; /* 16 */ { .mfb FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f114 = f35, f55, f114 // A4 * B8 nop __LINE__ } ;; /* 17 */ { .mfb nop __LINE__ FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f69 = f36, f49, f69 // A5 * B2 nop __LINE__ } ;; /* 18 */ { .mfb nop __LINE__ FMA f84 = f36, f50, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f85 = f36, f51, f85 // A5 * B4 nop __LINE__ } ;; /* 19 */ { .mfb nop __LINE__ FMA f100 = f36, f52, f100 // A5 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f101 = f36, f53, f101 // A5 * B6 nop __LINE__ } ;; /* 20 */ { .mfb nop __LINE__ FMA f116 = f36, f54, f116 // A5 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f117 = f36, f55, f117 // A5 * B8 nop __LINE__ } ;; /* 21 */ { .mfb nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f68 = f37, f49, f68 // A6 * B2 nop __LINE__ } ;; /* 22 */ { .mfb nop __LINE__ FMA f85 = f37, f50, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f84 = f37, f51, f84 // A6 * B4 nop __LINE__ } ;; /* 23 */ { .mfb nop __LINE__ FMA f101 = f37, f52, f101 // A6 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f100 = f37, f53, f100 // A6 * B6 nop __LINE__ } ;; /* 24 */ { .mfb nop __LINE__ FMA f117 = f37, f54, f117 // A6 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f116 = f37, f55, f116 // A6 * B8 nop __LINE__ } ;; /* 25 */ { .mfb nop __LINE__ FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f71 = f38, f49, f71 // A7 * B2 nop __LINE__ } ;; /* 26 */ { .mfb nop __LINE__ FMA f86 = f38, f50, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f87 = f38, f51, f87 // A7 * B4 nop __LINE__ } ;; /* 27 */ { .mfb nop __LINE__ FMA f102 = f38, f52, f102 // A7 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f103 = f38, f53, f103 // A7 * B6 nop __LINE__ } ;; /* 28 */ { .mfb nop __LINE__ FMA f118 = f38, f54, f118 // A7 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f119 = f38, f55, f119 // A7 * B8 nop __LINE__ } ;; /* 29 */ { .mfb nop __LINE__ FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f70 = f39, f49, f70 // A8 * B2 nop __LINE__ } ;; /* 30 */ { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f87 = f39, f50, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f86 = f39, f51, f86 // A8 * B4 nop __LINE__ } ;; /* 31 */ { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f103 = f39, f52, f103 // A8 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f102 = f39, f53, f102 // A8 * B6 nop __LINE__ } ;; /* 32 */ { .mfb nop __LINE__ FMA f119 = f39, f54, f119 // A8 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f118 = f39, f55, f118 // A8 * B8 nop __LINE__ } ;; /* 33 */ { .mfb nop __LINE__ (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; /* 34 */ { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 
nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;; /* 35 */ { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 nop __LINE__ } ;; /* 36 */ { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 nop __LINE__ } ;; /* 37 */ { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; /* 38 */ { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;; /* 39 */ { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 nop __LINE__ } ;; /* 40 */ { .mfb nop __LINE__ (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 nop __LINE__ } ;; /* 41 */ { .mfb nop __LINE__ (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 nop __LINE__ } ;; /* 42 */ { .mfb nop __LINE__ (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 nop __LINE__ } ;; /* 43 */ { .mfb nop __LINE__ (p3) FMA f98 = f42, f60, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 nop __LINE__ } ;; /* 44 */ { .mfb nop __LINE__ (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 nop __LINE__ } ;; /* 45 */ { .mfb nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 nop __LINE__ } ;; /* 46 */ { .mfb nop __LINE__ (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 nop __LINE__ } ;; /* 47 */ { .mfb nop __LINE__ (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 nop __LINE__ } ;; /* 48 */ { .mfb nop __LINE__ (p3) FMA f115 = f43, f62, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 nop __LINE__ } ;; /* 49 */ { .mfb nop __LINE__ (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f69 = f44, f57, f69 // A5 * B2 nop __LINE__ } ;; /* 50 */ { .mfb nop __LINE__ (p3) FMA f84 = f44, f58, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f85 = f44, f59, f85 // A5 * B4 nop __LINE__ } ;; /* 51 */ { .mfb nop __LINE__ (p3) FMA f100 = f44, f60, f100 // A5 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f101 = f44, f61, f101 // A5 * B6 nop __LINE__ } ;; /* 52 */ { .mfb nop __LINE__ (p3) FMA f116 = f44, f62, f116 // A5 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f117 = f44, f63, f117 // A5 * B8 nop __LINE__ } ;; /* 53 */ { .mfb nop __LINE__ (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f68 = f45, f57, f68 // A6 * B2 nop __LINE__ } ;; /* 54 */ { .mfb nop __LINE__ (p3) FMA f85 = f45, f58, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f84 = f45, f59, f84 // A6 * B4 nop __LINE__ } ;; /* 55 */ { 
.mfb nop __LINE__ (p3) FMA f101 = f45, f60, f101 // A6 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f100 = f45, f61, f100 // A6 * B6 nop __LINE__ } ;; /* 56 */ { .mfb nop __LINE__ (p3) FMA f117 = f45, f62, f117 // A6 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f116 = f45, f63, f116 // A6 * B8 nop __LINE__ } ;; /* 57 */ { .mfb nop __LINE__ (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f71 = f46, f57, f71 // A7 * B2 nop __LINE__ } ;; /* 58 */ { .mfb nop __LINE__ (p3) FMA f86 = f46, f58, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f87 = f46, f59, f87 // A7 * B4 nop __LINE__ } ;; /* 59 */ { .mfb nop __LINE__ (p3) FMA f102 = f46, f60, f102 // A7 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f103 = f46, f61, f103 // A7 * B6 nop __LINE__ } ;; /* 60 */ { .mfb nop __LINE__ (p3) FMA f118 = f46, f62, f118 // A7 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f119 = f46, f63, f119 // A7 * B8 nop __LINE__ } ;; /* 61 */ { .mfb nop __LINE__ (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f70 = f47, f57, f70 // A8 * B2 nop __LINE__ } ;; /* 62 */ { .mfb nop __LINE__ (p3) FMA f87 = f47, f58, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f86 = f47, f59, f86 // A8 * B4 nop __LINE__ } ;; /* 63 */ { .mfb nop __LINE__ (p3) FMA f103 = f47, f60, f103 // A8 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f102 = f47, f61, f102 // A8 * B6 nop __LINE__ } ;; /* 64 */ { .mfi nop __LINE__ (p3) FMA f119 = f47, f62, f119 // A8 * B7 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f118 = f47, f63, f118 // A8 * B8 br.cloop.sptk.few .L012 } ;; .L018: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -4, KK #else adds r2 = -4, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 2, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [BOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [BOFFSET], 2 * SIZE ;; LDFPD f92, f93 = [BOFFSET], 2 * SIZE ;; { .mfi LDFPD f94, f95 = [BOFFSET], 2 * SIZE FSUB f64 = f72, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f65 = f73, f65 nop __LINE__ } ;; { .mfi LDFPD f104, f105 = [BOFFSET], 2 * SIZE FSUB f80 = f74, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f81 = f75, f81 nop __LINE__ } ;; { .mfi LDFPD f106, f107 = [BOFFSET], 2 * SIZE FSUB f96 = f76, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f97 = f77, f97 nop __LINE__ } ;; { .mfi LDFPD f108, f109 = [BOFFSET], 2 * SIZE FSUB f112 = f78, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f113 = f79, f113 nop __LINE__ } ;; { .mfi LDFPD f110, f111 = [BOFFSET], 2 * SIZE FSUB f66 = f88, f66 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f67 = f89, f67 nop __LINE__ } ;; { .mfi LDFPD f120, f121 = [BOFFSET], 2 * SIZE FSUB f82 = f90, f82 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f83 = f91, f83 nop __LINE__ } ;; { .mfi LDFPD f122, f123 = [BOFFSET], 2 * SIZE FSUB f98 = f92, f98 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f99 = f93, f99 nop __LINE__ } ;; { .mfi LDFPD f124, f125 = [BOFFSET], 2 * SIZE FSUB f114 = f94, f114 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f115 = f95, f115 nop __LINE__ } ;; { .mfi LDFPD f126, f127 = [BOFFSET] FSUB f68 = f104, f68 adds BOFFSET = -30 * SIZE, BOFFSET } { .mfi nop __LINE__ FSUB_A f69 = f105, f69 #ifdef LN adds AOFFSET = 30 * SIZE, AOFFSET #else nop __LINE__ #endif } ;; { .mfi LDFPD f72, f73 = 
[AOFFSET] FSUB f84 = f106, f84 #ifdef LN adds AOFFSET = - 2 * SIZE, AOFFSET #else adds AOFFSET = 2 * SIZE, AOFFSET #endif } { .mfi nop __LINE__ FSUB_A f85 = f107, f85 nop __LINE__ } ;; { .mfi LDFPD f74, f75 = [AOFFSET] FSUB f100 = f108, f100 #ifdef LN adds AOFFSET = - 2 * SIZE, AOFFSET #else adds AOFFSET = 2 * SIZE, AOFFSET #endif } { .mfi nop __LINE__ FSUB_A f101 = f109, f101 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f116 = f110, f116 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f117 = f111, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f70 = f120, f70 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f71 = f121, f71 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f86 = f122, f86 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f87 = f123, f87 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f102 = f124, f102 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f103 = f125, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f118 = f126, f118 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f119 = f127, f119 nop __LINE__ } ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [AOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [AOFFSET], 2 * SIZE ;; { .mfi LDFPD f92, f93 = [AOFFSET], 2 * SIZE FSUB f64 = f72, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB f65 = f73, f65 nop __LINE__ } ;; { .mfi LDFPD f94, f95 = [AOFFSET], 2 * SIZE FSUB f66 = f74, f66 nop __LINE__ } { .mfi nop __LINE__ FSUB f67 = f75, f67 nop __LINE__ } ;; { .mfi LDFPD f104, f105 = [AOFFSET], 2 * SIZE FSUB f68 = f76, f68 nop __LINE__ } { .mfi nop __LINE__ FSUB f69 = f77, f69 nop __LINE__ } ;; { .mfi LDFPD f106, f107 = [AOFFSET], 2 * SIZE FSUB f70 = f78, f70 nop __LINE__ } { .mfi nop __LINE__ FSUB f71 = f79, f71 nop __LINE__ } ;; { .mfi LDFPD f108, f109 = [AOFFSET], 2 * SIZE FSUB f80 = f88, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f81 = f89, f81 nop __LINE__ } ;; { .mfi LDFPD f110, f111 = [AOFFSET], 2 * SIZE FSUB f82 = f90, f82 nop __LINE__ } { .mfi nop __LINE__ FSUB f83 = f91, f83 nop __LINE__ } ;; { .mfi LDFPD f120, f121 = [AOFFSET], 2 * SIZE FSUB f84 = f92, f84 nop __LINE__ } { .mfi nop __LINE__ FSUB f85 = f93, f85 nop __LINE__ } ;; { .mfi LDFPD f122, f123 = [AOFFSET], 2 * SIZE FSUB f86 = f94, f86 nop __LINE__ } { .mfi nop __LINE__ FSUB f87 = f95, f87 nop __LINE__ } ;; { .mfi LDFPD f124, f125 = [AOFFSET], 2 * SIZE FSUB f96 = f104, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f97 = f105, f97 nop __LINE__ } ;; { .mfi LDFPD f126, f127 = [AOFFSET] FSUB f98 = f106, f98 adds AOFFSET = -30 * SIZE, AOFFSET } { .mfi nop __LINE__ FSUB f99 = f107, f99 #ifdef RT adds BOFFSET = 30 * SIZE, BOFFSET #else nop __LINE__ #endif } ;; { .mfi LDFPD f72, f73 = [BOFFSET] FSUB f100 = f108, f100 #ifdef RN adds BOFFSET = 2 * SIZE, BOFFSET #else adds BOFFSET = - 2 * SIZE, BOFFSET #endif } { .mfi nop __LINE__ FSUB f101 = f109, f101 nop __LINE__ } ;; { .mfi LDFPD f74, f75 = [BOFFSET] FSUB f102 = f110, f102 #ifdef RN adds BOFFSET = 2 * SIZE, BOFFSET #else adds BOFFSET = - 2 * SIZE, BOFFSET #endif } { .mfi nop __LINE__ FSUB f103 = f111, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f112 = f120, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f113 = f121, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f114 = f122, f114 nop __LINE__ } { .mfi nop __LINE__ FSUB f115 = f123, f115 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f116 = f124, f116 nop __LINE__ } { .mfi nop __LINE__ FSUB f117 = f125, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f118 = f126, f118 nop __LINE__ } { .mfi nop 
__LINE__ FSUB f119 = f127, f119 nop __LINE__ } ;; #endif #ifdef LN { .mfi LDFPD f76, f77 = [AOFFSET] FMPY f32 = f72, f70 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f36 = f72, f102 nop __LINE__ } ;; { .mfi LDFPD f78, f79 = [AOFFSET] FMPY f33 = f73, f70 adds AOFFSET = - 4 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f37 = f73, f102 nop __LINE__ } ;; { .mfi LDFPD f88, f89 = [AOFFSET] FMPY f34 = f72, f86 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f38 = f72, f118 nop __LINE__ } ;; { .mfi LDFPD f90, f91 = [AOFFSET] FMPY f35 = f73, f86 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f39 = f73, f118 nop __LINE__ } ;; { .mfi LDFPD f92, f93 = [AOFFSET] FMA_C f70 = f73, f71, f32 adds AOFFSET = - 6 * SIZE, AOFFSET } { .mfi nop __LINE__ FMA_C f102 = f73, f103, f36 adds C1 = -2 * SIZE, C1 } ;; { .mfi LDFPD f104, f105 = [AOFFSET] FMA_D f71 = f72, f71, f33 adds AOFFSET = - 2 * SIZE, AOFFSET } { .mfi nop __LINE__ FMA_D f103 = f72, f103, f37 adds C2 = -2 * SIZE, C2 } ;; { .mfi LDFPD f106, f107 = [AOFFSET] FMA_C f86 = f73, f87, f34 adds AOFFSET = - 8 * SIZE, AOFFSET } { .mfi nop __LINE__ FMA_C f118 = f73, f119, f38 adds C3 = -2 * SIZE, C3 } ;; { .mfi LDFPD f120, f121 = [AOFFSET] FMA_D f87 = f72, f87, f35 adds BOFFSET2 = 28 * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_D f119 = f72, f119, f39 adds BOFFSET = 24 * SIZE, BOFFSET } ;; { .mfi STFD [BOFFSET] = f70, SIZE FNMA f68 = f74, f70, f68 adds C4 = -2 * SIZE, C4 } { .mfi STFD [BOFFSET2] = f102, SIZE FNMA f100 = f74, f102, f100 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f71, SIZE FMA_A f69 = f75, f70, f69 nop __LINE__ } { .mfi STFD [BOFFSET2] = f103, SIZE FMA_A f101 = f75, f102, f101 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f86, SIZE FNMA f84 = f74, f86, f84 nop __LINE__ } { .mfi STFD [BOFFSET2] = f118, SIZE FNMA f116 = f74, f118, f116 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f87, -11 * SIZE FMA_A f85 = f75, f86, f85 nop __LINE__ } { .mfi STFD [BOFFSET2] = f119, -11 * SIZE FMA_A f117 = f75, f118, f117 nop __LINE__ } ;; { .mfi STFD [C1 ] = f70, SIZE FMA_B f68 = f75, f71, f68 nop __LINE__ } { .mfi STFD [C3 ] = f102, SIZE FMA_B f100 = f75, f103, f100 nop __LINE__ } ;; { .mfi STFD [C1 ] = f71, -3 * SIZE FNMA f69 = f74, f71, f69 nop __LINE__ } { .mfi STFD [C3 ] = f103, -3 * SIZE FNMA f101 = f74, f103, f101 nop __LINE__ } ;; { .mfi STFD [C2 ] = f86, SIZE FMA_B f84 = f75, f87, f84 nop __LINE__ } { .mfi STFD [C4 ] = f118, SIZE FMA_B f116 = f75, f119, f116 nop __LINE__ } ;; { .mfi STFD [C2 ] = f87, -3 * SIZE FNMA f85 = f74, f87, f85 nop __LINE__ } { .mfi STFD [C4 ] = f119, -3 * SIZE FNMA f117 = f74, f119, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f66 = f76, f70, f66 nop __LINE__ } { .mfi nop __LINE__ FNMA f98 = f76, f102, f98 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f67 = f77, f70, f67 nop __LINE__ } { .mfi nop __LINE__ FMA_A f99 = f77, f102, f99 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f82 = f76, f86, f82 nop __LINE__ } { .mfi nop __LINE__ FNMA f114 = f76, f118, f114 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f83 = f77, f86, f83 nop __LINE__ } { .mfi nop __LINE__ FMA_A f115 = f77, f118, f115 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f66 = f77, f71, f66 nop __LINE__ } { .mfi nop __LINE__ FMA_B f98 = f77, f103, f98 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f67 = f76, f71, f67 nop __LINE__ } { .mfi nop __LINE__ FNMA f99 = f76, f103, f99 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f82 = f77, f87, f82 nop __LINE__ } { .mfi nop __LINE__ FMA_B f114 = f77, f119, f114 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f83 = 
f76, f87, f83 nop __LINE__ } { .mfi nop __LINE__ FNMA f115 = f76, f119, f115 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f64 = f78, f70, f64 nop __LINE__ } { .mfi nop __LINE__ FNMA f96 = f78, f102, f96 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f65 = f79, f70, f65 nop __LINE__ } { .mfi nop __LINE__ FMA_A f97 = f79, f102, f97 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f80 = f78, f86, f80 nop __LINE__ } { .mfi nop __LINE__ FNMA f112 = f78, f118, f112 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f81 = f79, f86, f81 nop __LINE__ } { .mfi nop __LINE__ FMA_A f113 = f79, f118, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f64 = f79, f71, f64 nop __LINE__ } { .mfi nop __LINE__ FMA_B f96 = f79, f103, f96 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f65 = f78, f71, f65 nop __LINE__ } { .mfi nop __LINE__ FNMA f97 = f78, f103, f97 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f80 = f79, f87, f80 nop __LINE__ } { .mfi nop __LINE__ FMA_B f112 = f79, f119, f112 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f81 = f78, f87, f81 nop __LINE__ } { .mfi nop __LINE__ FNMA f113 = f78, f119, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f88, f68 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f88, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f89, f68 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f89, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f88, f84 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f88, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f89, f84 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f89, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f68 = f89, f69, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f100 = f89, f101, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f69 = f88, f69, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f101 = f88, f101, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f84 = f89, f85, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f116 = f89, f117, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f85 = f88, f85, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f117 = f88, f117, f39 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f68, SIZE FNMA f66 = f90, f68, f66 nop __LINE__ } { .mfi STFD [BOFFSET2] = f100, SIZE FNMA f98 = f90, f100, f98 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f69, SIZE FMA_A f67 = f91, f68, f67 nop __LINE__ } { .mfi STFD [BOFFSET2] = f101, SIZE FMA_A f99 = f91, f100, f99 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f84, SIZE FNMA f82 = f90, f84, f82 nop __LINE__ } { .mfi STFD [BOFFSET2] = f116, SIZE FNMA f114 = f90, f116, f114 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f85, -11 * SIZE FMA_A f83 = f91, f84, f83 nop __LINE__ } { .mfi STFD [BOFFSET2] = f117, -11 * SIZE FMA_A f115 = f91, f116, f115 nop __LINE__ } ;; { .mfi STFD [C1 ] = f68, SIZE FMA_B f66 = f91, f69, f66 nop __LINE__ } { .mfi STFD [C3 ] = f100, SIZE FMA_B f98 = f91, f101, f98 nop __LINE__ } ;; { .mfi STFD [C1 ] = f69, -3 * SIZE FNMA f67 = f90, f69, f67 nop __LINE__ } { .mfi STFD [C3 ] = f101, -3 * SIZE FNMA f99 = f90, f101, f99 nop __LINE__ } ;; { .mfi STFD [C2 ] = f84, SIZE FMA_B f82 = f91, f85, f82 nop __LINE__ } { .mfi STFD [C4 ] = f116, SIZE FMA_B f114 = f91, f117, f114 nop __LINE__ } ;; { .mfi STFD [C2 ] = f85, -3 * SIZE FNMA f83 = f90, f85, f83 nop __LINE__ } { .mfi STFD [C4 ] = f117, -3 * SIZE FNMA f115 = f90, f117, f115 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f64 = f92, f68, f64 nop __LINE__ } { .mfi nop __LINE__ FNMA f96 = f92, f100, f96 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f65 = f93, f68, f65 nop __LINE__ } { .mfi nop __LINE__ FMA_A f97 = f93, f100, f97 nop 
__LINE__ } ;; { .mfi nop __LINE__ FNMA f80 = f92, f84, f80 nop __LINE__ } { .mfi nop __LINE__ FNMA f112 = f92, f116, f112 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f81 = f93, f84, f81 nop __LINE__ } { .mfi nop __LINE__ FMA_A f113 = f93, f116, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f64 = f93, f69, f64 nop __LINE__ } { .mfi nop __LINE__ FMA_B f96 = f93, f101, f96 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f65 = f92, f69, f65 nop __LINE__ } { .mfi nop __LINE__ FNMA f97 = f92, f101, f97 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f80 = f93, f85, f80 nop __LINE__ } { .mfi nop __LINE__ FMA_B f112 = f93, f117, f112 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f81 = f92, f85, f81 nop __LINE__ } { .mfi nop __LINE__ FNMA f113 = f92, f117, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f104, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f104, f98 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f105, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f105, f98 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f104, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f104, f114 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f105, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f105, f114 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f66 = f105, f67, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f98 = f105, f99, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f67 = f104, f67, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f99 = f104, f99, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f82 = f105, f83, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f114 = f105, f115, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f83 = f104, f83, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f115 = f104, f115, f39 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f66, SIZE FNMA f64 = f106, f66, f64 nop __LINE__ } { .mfi STFD [BOFFSET2] = f98, SIZE FNMA f96 = f106, f98, f96 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f67, SIZE FMA_A f65 = f107, f66, f65 nop __LINE__ } { .mfi STFD [BOFFSET2] = f99, SIZE FMA_A f97 = f107, f98, f97 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f82, SIZE FNMA f80 = f106, f82, f80 nop __LINE__ } { .mfi STFD [BOFFSET2] = f114, SIZE FNMA f112 = f106, f114, f112 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f83, -11 * SIZE FMA_A f81 = f107, f82, f81 nop __LINE__ } { .mfi STFD [BOFFSET2] = f115, -11 * SIZE FMA_A f113 = f107, f114, f113 nop __LINE__ } ;; { .mfi STFD [C1 ] = f66, SIZE FMA_B f64 = f107, f67, f64 nop __LINE__ } { .mfi STFD [C3 ] = f98, SIZE FMA_B f96 = f107, f99, f96 nop __LINE__ } ;; { .mfi STFD [C1 ] = f67, -3 * SIZE FNMA f65 = f106, f67, f65 nop __LINE__ } { .mfi STFD [C3 ] = f99, -3 * SIZE FNMA f97 = f106, f99, f97 nop __LINE__ } ;; { .mfi STFD [C2 ] = f82, SIZE FMA_B f80 = f107, f83, f80 nop __LINE__ } { .mfi STFD [C4 ] = f114, SIZE FMA_B f112 = f107, f115, f112 nop __LINE__ } ;; { .mfi STFD [C2 ] = f83, -3 * SIZE FNMA f81 = f106, f83, f81 nop __LINE__ } { .mfi STFD [C4 ] = f115, -3 * SIZE FNMA f113 = f106, f115, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f120, f64 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f120, f96 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f121, f64 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f121, f96 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f120, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f120, f112 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f121, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f121, f112 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f64 = f121, f65, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f96 = 
f121, f97, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f65 = f120, f65, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f97 = f120, f97, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f80 = f121, f81, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f112 = f121, f113, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f81 = f120, f81, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f113 = f120, f113, f39 nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f97, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f81, -3 * SIZE STFD [BOFFSET2] = f113, -3 * SIZE nop __LINE__ } ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 nop __LINE__ } { .mfi STFD [C3 ] = f96, SIZE mov f96 = f0 nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, -1 * SIZE mov f65 = f0 adds KK = -4, KK } { .mfi STFD [C3 ] = f97, -1 * SIZE mov f97 = f0 nop __LINE__ } ;; { .mfi STFD [C2 ] = f80, SIZE mov f80 = f0 cmp.ne p6, p0 = 1, I } { .mfi STFD [C4 ] = f112, SIZE mov f112 = f0 sub L = K, KK } ;; { .mfi STFD [C2 ] = f81, -1 * SIZE mov f81 = f0 adds I = -1, I } { .mfb STFD [C4 ] = f113, -1 * SIZE mov f113 = f0 (p6) br.cond.dptk .L011 } ;; #endif #ifdef LT { .mfi LDFPD f76, f77 = [AOFFSET], 2 * SIZE FMPY f32 = f72, f64 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f72, f96 nop __LINE__ } ;; { .mfi LDFPD f78, f79 = [AOFFSET] FMPY f33 = f73, f64 adds AOFFSET = 4 * SIZE, AOFFSET } { .mfi nop __LINE__ FMPY f37 = f73, f96 nop __LINE__ } ;; { .mfi LDFPD f90, f91 = [AOFFSET], 2 * SIZE FMPY f34 = f72, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f72, f112 nop __LINE__ } ;; { .mfi LDFPD f92, f93 = [AOFFSET], 2 * SIZE FMPY f35 = f73, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f73, f112 nop __LINE__ } ;; { .mfi LDFPD f94, f95 = [AOFFSET] FMA_C f64 = f73, f65, f32 adds AOFFSET = 6 * SIZE, AOFFSET } { .mfi nop __LINE__ FMA_C f96 = f73, f97, f36 nop __LINE__ } ;; { .mfi LDFPD f108, f109 = [AOFFSET], 2 * SIZE FMA_D f65 = f72, f65, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f97 = f72, f97, f37 nop __LINE__ } ;; { .mfi LDFPD f110, f111 = [AOFFSET] FMA_C f80 = f73, f81, f34 adds AOFFSET = 8 * SIZE, AOFFSET } { .mfi nop __LINE__ FMA_C f112 = f73, f113, f38 nop __LINE__ } ;; { .mfi LDFPD f126, f127 = [AOFFSET] FMA_D f81 = f72, f81, f35 adds AOFFSET = - 30 * SIZE, AOFFSET } { .mfi nop __LINE__ FMA_D f113 = f72, f113, f39 adds BOFFSET2 = 4 * SIZE, BOFFSET } ;; { .mfi STFD [BOFFSET] = f64, SIZE FNMA f66 = f74, f64, f66 nop __LINE__ } { .mfi STFD [BOFFSET2] = f96, SIZE FNMA f98 = f74, f96, f98 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f65, SIZE FMA_A f67 = f75, f64, f67 nop __LINE__ } { .mfi STFD [BOFFSET2] = f97, SIZE FMA_A f99 = f75, f96, f99 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f80, SIZE FNMA f82 = f74, f80, f82 nop __LINE__ } { .mfi STFD [BOFFSET2] = f112, SIZE FNMA f114 = f74, f112, f114 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f81, 5 * SIZE FMA_A f83 = f75, f80, f83 nop __LINE__ } { .mfi STFD [BOFFSET2] = f113, 5 * SIZE FMA_A f115 = f75, f112, f115 nop __LINE__ } ;; { .mfi STFD [C1 ] = f64, SIZE FMA_B f66 = f75, f65, f66 nop __LINE__ } { .mfi STFD [C3 ] = f96, SIZE FMA_B f98 = f75, f97, f98 nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, SIZE FNMA f67 = f74, f65, f67 nop __LINE__ } { .mfi STFD [C3 ] = f97, SIZE FNMA f99 = f74, f97, f99 nop __LINE__ } ;; { .mfi STFD [C2 ] = f80, SIZE FMA_B f82 = f75, f81, f82 nop __LINE__ } { .mfi STFD [C4 ] = f112, SIZE FMA_B f114 = f75, f113, 
f114 nop __LINE__ } ;; { .mfi STFD [C2 ] = f81, SIZE FNMA f83 = f74, f81, f83 nop __LINE__ } { .mfi STFD [C4 ] = f113, SIZE FNMA f115 = f74, f113, f115 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f68 = f76, f64, f68 nop __LINE__ } { .mfi nop __LINE__ FNMA f100 = f76, f96, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f69 = f77, f64, f69 nop __LINE__ } { .mfi nop __LINE__ FMA_A f101 = f77, f96, f101 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f84 = f76, f80, f84 nop __LINE__ } { .mfi nop __LINE__ FNMA f116 = f76, f112, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f85 = f77, f80, f85 nop __LINE__ } { .mfi nop __LINE__ FMA_A f117 = f77, f112, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f68 = f77, f65, f68 nop __LINE__ } { .mfi nop __LINE__ FMA_B f100 = f77, f97, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f69 = f76, f65, f69 nop __LINE__ } { .mfi nop __LINE__ FNMA f101 = f76, f97, f101 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f84 = f77, f81, f84 nop __LINE__ } { .mfi nop __LINE__ FMA_B f116 = f77, f113, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f85 = f76, f81, f85 nop __LINE__ } { .mfi nop __LINE__ FNMA f117 = f76, f113, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f70 = f78, f64, f70 nop __LINE__ } { .mfi nop __LINE__ FNMA f102 = f78, f96, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f71 = f79, f64, f71 nop __LINE__ } { .mfi nop __LINE__ FMA_A f103 = f79, f96, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f86 = f78, f80, f86 nop __LINE__ } { .mfi nop __LINE__ FNMA f118 = f78, f112, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f87 = f79, f80, f87 nop __LINE__ } { .mfi nop __LINE__ FMA_A f119 = f79, f112, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f70 = f79, f65, f70 nop __LINE__ } { .mfi nop __LINE__ FMA_B f102 = f79, f97, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f71 = f78, f65, f71 nop __LINE__ } { .mfi nop __LINE__ FNMA f103 = f78, f97, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f86 = f79, f81, f86 nop __LINE__ } { .mfi nop __LINE__ FMA_B f118 = f79, f113, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f87 = f78, f81, f87 nop __LINE__ } { .mfi nop __LINE__ FNMA f119 = f78, f113, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f90, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f90, f98 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f91, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f91, f98 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f90, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f90, f114 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f91, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f91, f114 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f66 = f91, f67, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f98 = f91, f99, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f67 = f90, f67, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f99 = f90, f99, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f82 = f91, f83, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f114 = f91, f115, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f83 = f90, f83, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f115 = f90, f115, f39 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f66, SIZE FNMA f68 = f92, f66, f68 nop __LINE__ } { .mfi STFD [BOFFSET2] = f98, SIZE FNMA f100 = f92, f98, f100 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f67, SIZE FMA_A f69 = f93, f66, f69 nop __LINE__ } { .mfi STFD [BOFFSET2] = f99, SIZE FMA_A f101 = f93, f98, f101 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f82, SIZE FNMA f84 = f92, f82, f84 nop __LINE__ } { .mfi STFD 
[BOFFSET2] = f114, SIZE FNMA f116 = f92, f114, f116 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f83, 5 * SIZE FMA_A f85 = f93, f82, f85 nop __LINE__ } { .mfi STFD [BOFFSET2] = f115, 5 * SIZE FMA_A f117 = f93, f114, f117 nop __LINE__ } ;; { .mfi STFD [C1 ] = f66, SIZE FMA_B f68 = f93, f67, f68 nop __LINE__ } { .mfi STFD [C3 ] = f98, SIZE FMA_B f100 = f93, f99, f100 nop __LINE__ } ;; { .mfi STFD [C1 ] = f67, SIZE FNMA f69 = f92, f67, f69 nop __LINE__ } { .mfi STFD [C3 ] = f99, SIZE FNMA f101 = f92, f99, f101 nop __LINE__ } ;; { .mfi STFD [C2 ] = f82, SIZE FMA_B f84 = f93, f83, f84 nop __LINE__ } { .mfi STFD [C4 ] = f114, SIZE FMA_B f116 = f93, f115, f116 nop __LINE__ } ;; { .mfi STFD [C2 ] = f83, SIZE FNMA f85 = f92, f83, f85 nop __LINE__ } { .mfi STFD [C4 ] = f115, SIZE FNMA f117 = f92, f115, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f70 = f94, f66, f70 nop __LINE__ } { .mfi nop __LINE__ FNMA f102 = f94, f98, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f71 = f95, f66, f71 nop __LINE__ } { .mfi nop __LINE__ FMA_A f103 = f95, f98, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f86 = f94, f82, f86 nop __LINE__ } { .mfi nop __LINE__ FNMA f118 = f94, f114, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f87 = f95, f82, f87 nop __LINE__ } { .mfi nop __LINE__ FMA_A f119 = f95, f114, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f70 = f95, f67, f70 nop __LINE__ } { .mfi nop __LINE__ FMA_B f102 = f95, f99, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f71 = f94, f67, f71 nop __LINE__ } { .mfi nop __LINE__ FNMA f103 = f94, f99, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f86 = f95, f83, f86 nop __LINE__ } { .mfi nop __LINE__ FMA_B f118 = f95, f115, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f87 = f94, f83, f87 nop __LINE__ } { .mfi nop __LINE__ FNMA f119 = f94, f115, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f108, f68 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f108, f100 nop __LINE__ } { .mfi nop __LINE__ FMPY f33 = f109, f68 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f109, f100 nop __LINE__ } { .mfi nop __LINE__ FMPY f34 = f108, f84 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f108, f116 nop __LINE__ } { .mfi nop __LINE__ FMPY f35 = f109, f84 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f109, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f68 = f109, f69, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f100 = f109, f101, f36 nop __LINE__ } { .mfi nop __LINE__ FMA_D f69 = f108, f69, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f101 = f108, f101, f37 nop __LINE__ } { .mfi nop __LINE__ FMA_C f84 = f109, f85, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f116 = f109, f117, f38 nop __LINE__ } { .mfi nop __LINE__ FMA_D f85 = f108, f85, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f117 = f108, f117, f39 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f68, SIZE FNMA f70 = f110, f68, f70 nop __LINE__ } { .mfi STFD [BOFFSET2] = f100, SIZE FNMA f102 = f110, f100, f102 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f69, SIZE FMA_A f71 = f111, f68, f71 nop __LINE__ } { .mfi STFD [BOFFSET2] = f101, SIZE FMA_A f103 = f111, f100, f103 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f84, SIZE FNMA f86 = f110, f84, f86 nop __LINE__ } { .mfi STFD [BOFFSET2] = f116, SIZE FNMA f118 = f110, f116, f118 nop __LINE__ } ;; { .mfi STFD [BOFFSET] = f85, 5 * SIZE FMA_A f87 = f111, f84, f87 nop __LINE__ } { .mfi STFD [BOFFSET2] = f117, 5 * SIZE FMA_A f119 = f111, f116, f119 nop __LINE__ } ;; { .mfi STFD [C1 ] = f68, SIZE FMA_B f70 = f111, f69, f70 nop __LINE__ } { .mfi STFD [C3 ] = f100, SIZE FMA_B 
f102 = f111, f101, f102 nop __LINE__ } ;; { .mfi STFD [C1 ] = f69, SIZE FNMA f71 = f110, f69, f71 nop __LINE__ } { .mfi STFD [C3 ] = f101, SIZE FNMA f103 = f110, f101, f103 nop __LINE__ } ;; { .mfi STFD [C2 ] = f84, SIZE FMA_B f86 = f111, f85, f86 nop __LINE__ } { .mfi STFD [C4 ] = f116, SIZE FMA_B f118 = f111, f117, f118 nop __LINE__ } ;; { .mfi STFD [C2 ] = f85, SIZE FNMA f87 = f110, f85, f87 nop __LINE__ } { .mfi STFD [C4 ] = f117, SIZE FNMA f119 = f110, f117, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f126, f70 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f126, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f127, f70 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f127, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f126, f86 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f126, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f127, f86 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f127, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f70 = f127, f71, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f102 = f127, f103, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f71 = f126, f71, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f103 = f126, f103, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f86 = f127, f87, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f118 = f127, f119, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f87 = f126, f87, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f119 = f126, f119, f39 nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f70, SIZE STFD [BOFFSET2] = f102, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f71, SIZE STFD [BOFFSET2] = f103, SIZE sub r2 = K, KK } ;; { .mmi STFD [BOFFSET] = f86, SIZE STFD [BOFFSET2] = f118, SIZE adds KK = 4, KK } ;; { .mmi STFD [BOFFSET] = f87, -27 * SIZE STFD [BOFFSET2] = f119 shladd r2 = r2, ZBASE_SHIFT, r0 } ;; { .mfi STFD [C1 ] = f70, SIZE mov f64 = f0 shladd AOFFSET = r2, 2, AOFFSET } { .mfi STFD [C3 ] = f102, SIZE mov f65 = f0 shladd BOFFSET = r2, 2, BOFFSET } ;; { .mfi STFD [C1 ] = f71, SIZE mov f80 = f0 mov L = KK } { .mfi STFD [C3 ] = f103, SIZE mov f81 = f0 nop __LINE__ } ;; { .mfi STFD [C2 ] = f86, SIZE mov f96 = f0 cmp.ne p6, p0 = 1, I } { .mfi STFD [C4 ] = f118, SIZE mov f97 = f0 nop __LINE__ } ;; { .mfi STFD [C2 ] = f87, SIZE mov f112 = f0 adds I = -1, I } { .mfb STFD [C4 ] = f119, SIZE mov f113 = f0 (p6) br.cond.dptk .L011 } ;; #endif #ifdef RN { .mfi LDFPD f76, f77 = [BOFFSET], 2 * SIZE FMPY f32 = f72, f64 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f72, f68 nop __LINE__ } ;; { .mfi LDFPD f78, f79 = [BOFFSET] FMPY f33 = f73, f64 adds BOFFSET = 4 * SIZE, BOFFSET } { .mfi nop __LINE__ FMPY f37 = f73, f68 nop __LINE__ } ;; { .mfi LDFPD f90, f91 = [BOFFSET], 2 * SIZE FMPY f34 = f72, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f72, f70 nop __LINE__ } ;; { .mfi LDFPD f92, f93 = [BOFFSET], 2 * SIZE FMPY f35 = f73, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f73, f70 nop __LINE__ } ;; { .mfi LDFPD f94, f95 = [BOFFSET] FMA_C f64 = f73, f65, f32 adds BOFFSET = 6 * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_C f68 = f73, f69, f36 nop __LINE__ } ;; { .mfi LDFPD f108, f109 = [BOFFSET], 2 * SIZE FMA_D f65 = f72, f65, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f69 = f72, f69, f37 nop __LINE__ } ;; { .mfi LDFPD f110, f111 = [BOFFSET] FMA_C f66 = f73, f67, f34 adds BOFFSET = 8 * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_C f70 = f73, f71, f38 nop __LINE__ } ;; { .mfi LDFPD f126, f127 = [BOFFSET] FMA_D f67 = f72, f67, f35 adds BOFFSET = - 30 * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_D f71 = f72, f71, f39 adds 
AOFFSET2 = 4 * SIZE, AOFFSET } ;; { .mfi STFD [AOFFSET] = f64, SIZE FNMA f80 = f74, f64, f80 nop __LINE__ } { .mfi STFD [AOFFSET2] = f68, SIZE FNMA f84 = f74, f68, f84 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f65, SIZE FMA_A f81 = f75, f64, f81 nop __LINE__ } { .mfi STFD [AOFFSET2] = f69, SIZE FMA_A f85 = f75, f68, f85 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f66, SIZE FNMA f82 = f74, f66, f82 nop __LINE__ } { .mfi STFD [AOFFSET2] = f70, SIZE FNMA f86 = f74, f70, f86 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f67, 5 * SIZE FMA_A f83 = f75, f66, f83 nop __LINE__ } { .mfi STFD [AOFFSET2] = f71, 5 * SIZE FMA_A f87 = f75, f70, f87 nop __LINE__ } ;; { .mfi STFD [C1 ] = f64, SIZE FMA_B f80 = f75, f65, f80 nop __LINE__ } { .mfi STFD [C5 ] = f68, SIZE FMA_B f84 = f75, f69, f84 nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, SIZE FNMA f81 = f74, f65, f81 nop __LINE__ } { .mfi STFD [C5 ] = f69, SIZE FNMA f85 = f74, f69, f85 nop __LINE__ } ;; { .mfi STFD [C1 ] = f66, SIZE FMA_B f82 = f75, f67, f82 nop __LINE__ } { .mfi STFD [C5 ] = f70, SIZE FMA_B f86 = f75, f71, f86 nop __LINE__ } ;; { .mfi STFD [C1 ] = f67, 5 * SIZE FNMA f83 = f74, f67, f83 nop __LINE__ } { .mfi STFD [C5 ] = f71, 5 * SIZE FNMA f87 = f74, f71, f87 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f96 = f76, f64, f96 nop __LINE__ } { .mfi nop __LINE__ FNMA f100 = f76, f68, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f97 = f77, f64, f97 nop __LINE__ } { .mfi nop __LINE__ FMA_A f101 = f77, f68, f101 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f98 = f76, f66, f98 nop __LINE__ } { .mfi nop __LINE__ FNMA f102 = f76, f70, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f99 = f77, f66, f99 nop __LINE__ } { .mfi nop __LINE__ FMA_A f103 = f77, f70, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f96 = f77, f65, f96 nop __LINE__ } { .mfi nop __LINE__ FMA_B f100 = f77, f69, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f97 = f76, f65, f97 nop __LINE__ } { .mfi nop __LINE__ FNMA f101 = f76, f69, f101 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f98 = f77, f67, f98 nop __LINE__ } { .mfi nop __LINE__ FMA_B f102 = f77, f71, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f99 = f76, f67, f99 nop __LINE__ } { .mfi nop __LINE__ FNMA f103 = f76, f71, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f112 = f78, f64, f112 nop __LINE__ } { .mfi nop __LINE__ FNMA f116 = f78, f68, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f113 = f79, f64, f113 nop __LINE__ } { .mfi nop __LINE__ FMA_A f117 = f79, f68, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f114 = f78, f66, f114 nop __LINE__ } { .mfi nop __LINE__ FNMA f118 = f78, f70, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f115 = f79, f66, f115 nop __LINE__ } { .mfi nop __LINE__ FMA_A f119 = f79, f70, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f112 = f79, f65, f112 nop __LINE__ } { .mfi nop __LINE__ FMA_B f116 = f79, f69, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f113 = f78, f65, f113 nop __LINE__ } { .mfi nop __LINE__ FNMA f117 = f78, f69, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f114 = f79, f67, f114 nop __LINE__ } { .mfi nop __LINE__ FMA_B f118 = f79, f71, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f115 = f78, f67, f115 nop __LINE__ } { .mfi nop __LINE__ FNMA f119 = f78, f71, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f90, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f90, f84 nop __LINE__ } { .mfi nop __LINE__ FMPY f33 = f91, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f91, f84 nop __LINE__ } { .mfi nop __LINE__ FMPY f34 = f90, f82 nop __LINE__ } { .mfi 
nop __LINE__ FMPY f38 = f90, f86 nop __LINE__ } { .mfi nop __LINE__ FMPY f35 = f91, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f91, f86 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f80 = f91, f81, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f84 = f91, f85, f36 nop __LINE__ } { .mfi nop __LINE__ FMA_D f81 = f90, f81, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f85 = f90, f85, f37 nop __LINE__ } { .mfi nop __LINE__ FMA_C f82 = f91, f83, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f86 = f91, f87, f38 nop __LINE__ } { .mfi nop __LINE__ FMA_D f83 = f90, f83, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f87 = f90, f87, f39 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f80, SIZE FNMA f96 = f92, f80, f96 nop __LINE__ } { .mfi STFD [AOFFSET2] = f84, SIZE FNMA f100 = f92, f84, f100 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f81, SIZE FMA_A f97 = f93, f80, f97 nop __LINE__ } { .mfi STFD [AOFFSET2] = f85, SIZE FMA_A f101 = f93, f84, f101 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f82, SIZE FNMA f98 = f92, f82, f98 nop __LINE__ } { .mfi STFD [AOFFSET2] = f86, SIZE FNMA f102 = f92, f86, f102 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f83, 5 * SIZE FMA_A f99 = f93, f82, f99 nop __LINE__ } { .mfi STFD [AOFFSET2] = f87, 5 * SIZE FMA_A f103 = f93, f86, f103 nop __LINE__ } ;; { .mfi STFD [C2 ] = f80, SIZE FMA_B f96 = f93, f81, f96 nop __LINE__ } { .mfi STFD [C6 ] = f84, SIZE FMA_B f100 = f93, f85, f100 nop __LINE__ } ;; { .mfi STFD [C2 ] = f81, SIZE FNMA f97 = f92, f81, f97 nop __LINE__ } { .mfi STFD [C6 ] = f85, SIZE FNMA f101 = f92, f85, f101 nop __LINE__ } ;; { .mfi STFD [C2 ] = f82, SIZE FMA_B f98 = f93, f83, f98 nop __LINE__ } { .mfi STFD [C6 ] = f86, SIZE FMA_B f102 = f93, f87, f102 nop __LINE__ } ;; { .mfi STFD [C2 ] = f83, 5 * SIZE FNMA f99 = f92, f83, f99 nop __LINE__ } { .mfi STFD [C6 ] = f87, 5 * SIZE FNMA f103 = f92, f87, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f112 = f94, f80, f112 nop __LINE__ } { .mfi nop __LINE__ FNMA f116 = f94, f84, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f113 = f95, f80, f113 nop __LINE__ } { .mfi nop __LINE__ FMA_A f117 = f95, f84, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f114 = f94, f82, f114 nop __LINE__ } { .mfi nop __LINE__ FNMA f118 = f94, f86, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f115 = f95, f82, f115 nop __LINE__ } { .mfi nop __LINE__ FMA_A f119 = f95, f86, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f112 = f95, f81, f112 nop __LINE__ } { .mfi nop __LINE__ FMA_B f116 = f95, f85, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f113 = f94, f81, f113 nop __LINE__ } { .mfi nop __LINE__ FNMA f117 = f94, f85, f117 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f114 = f95, f83, f114 nop __LINE__ } { .mfi nop __LINE__ FMA_B f118 = f95, f87, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f115 = f94, f83, f115 nop __LINE__ } { .mfi nop __LINE__ FNMA f119 = f94, f87, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f108, f96 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f108, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f109, f96 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f109, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f108, f98 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f108, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f109, f98 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f109, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f96 = f109, f97, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f100 = f109, f101, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f97 = f108, f97, f33 nop __LINE__ } { 
.mfi nop __LINE__ FMA_D f101 = f108, f101, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f98 = f109, f99, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f102 = f109, f103, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f99 = f108, f99, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f103 = f108, f103, f39 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f96, SIZE FNMA f112 = f110, f96, f112 nop __LINE__ } { .mfi STFD [AOFFSET2] = f100, SIZE FNMA f116 = f110, f100, f116 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f97, SIZE FMA_A f113 = f111, f96, f113 nop __LINE__ } { .mfi STFD [AOFFSET2] = f101, SIZE FMA_A f117 = f111, f100, f117 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f98, SIZE FNMA f114 = f110, f98, f114 nop __LINE__ } { .mfi STFD [AOFFSET2] = f102, SIZE FNMA f118 = f110, f102, f118 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f99, 5 * SIZE FMA_A f115 = f111, f98, f115 nop __LINE__ } { .mfi STFD [AOFFSET2] = f103, 5 * SIZE FMA_A f119 = f111, f102, f119 nop __LINE__ } ;; { .mfi STFD [C3 ] = f96, SIZE FMA_B f112 = f111, f97, f112 nop __LINE__ } { .mfi STFD [C7 ] = f100, SIZE FMA_B f116 = f111, f101, f116 nop __LINE__ } ;; { .mfi STFD [C3 ] = f97, SIZE FNMA f113 = f110, f97, f113 nop __LINE__ } { .mfi STFD [C7 ] = f101, SIZE FNMA f117 = f110, f101, f117 nop __LINE__ } ;; { .mfi STFD [C3 ] = f98, SIZE FMA_B f114 = f111, f99, f114 nop __LINE__ } { .mfi STFD [C7 ] = f102, SIZE FMA_B f118 = f111, f103, f118 nop __LINE__ } ;; { .mfi STFD [C3 ] = f99, 5 * SIZE FNMA f115 = f110, f99, f115 nop __LINE__ } { .mfi STFD [C7 ] = f103, 5 * SIZE FNMA f119 = f110, f103, f119 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f126, f112 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f126, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f127, f112 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f127, f116 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f126, f114 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f126, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f127, f114 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f127, f118 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f112 = f127, f113, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f116 = f127, f117, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f113 = f126, f113, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f117 = f126, f117, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f114 = f127, f115, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f118 = f127, f119, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f115 = f126, f115, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f119 = f126, f119, f39 nop __LINE__ } ;; { .mmi STFD [AOFFSET] = f112, SIZE STFD [AOFFSET2] = f116, SIZE sub r2 = K, KK } ;; { .mmi STFD [AOFFSET] = f113, SIZE STFD [AOFFSET2] = f117, SIZE mov L = KK } ;; { .mmi STFD [AOFFSET] = f114, SIZE STFD [AOFFSET2] = f118, SIZE shladd r2 = r2, ZBASE_SHIFT, r0 } ;; { .mmi STFD [AOFFSET] = f115, -27 * SIZE STFD [AOFFSET2] = f119 nop __LINE__ } ;; { .mfi STFD [C4 ] = f112, SIZE mov f64 = f0 shladd BOFFSET = r2, 2, BOFFSET } { .mfi STFD [C8 ] = f116, SIZE mov f65 = f0 shladd AOFFSET = r2, 2, AOFFSET } ;; { .mfi STFD [C4 ] = f113, SIZE mov f80 = f0 cmp.ne p6, p0 = 1, I } { .mfi STFD [C8 ] = f117, SIZE mov f81 = f0 nop __LINE__ } ;; { .mfi STFD [C4 ] = f114, SIZE mov f96 = f0 adds I = -1, I } { .mfi STFD [C8 ] = f118, SIZE mov f97 = f0 nop __LINE__ } ;; { .mfi STFD [C4 ] = f115, 5 * SIZE mov f112 = f0 nop __LINE__ } { .mfb STFD [C8 ] = f119, 5 * SIZE mov f113 = f0 (p6) br.cond.dptk .L011 } #endif #ifdef RT { .mfi LDFPD f76, f77 = [BOFFSET] FMPY f32 = f72, f112 
adds BOFFSET = - 2 * SIZE, BOFFSET } { .mfi nop __LINE__ FMPY f36 = f72, f116 nop __LINE__ } ;; { .mfi LDFPD f78, f79 = [BOFFSET] FMPY f33 = f73, f112 adds BOFFSET = - 4 * SIZE, BOFFSET } { .mfi nop __LINE__ FMPY f37 = f73, f116 nop __LINE__ } ;; { .mfi LDFPD f88, f89 = [BOFFSET] FMPY f34 = f72, f114 adds BOFFSET = - 2 * SIZE, BOFFSET } { .mfi nop __LINE__ FMPY f38 = f72, f118 nop __LINE__ } ;; { .mfi LDFPD f90, f91 = [BOFFSET] FMPY f35 = f73, f114 adds BOFFSET = - 2 * SIZE, BOFFSET } { .mfi nop __LINE__ FMPY f39 = f73, f118 nop __LINE__ } ;; { .mfi LDFPD f92, f93 = [BOFFSET] FMA_C f112 = f73, f113, f32 adds BOFFSET = - 6 * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_C f116 = f73, f117, f36 nop __LINE__ } ;; { .mfi LDFPD f104, f105 = [BOFFSET] FMA_D f113 = f72, f113, f33 adds BOFFSET = - 2 * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_D f117 = f72, f117, f37 nop __LINE__ } ;; { .mfi LDFPD f106, f107 = [BOFFSET] FMA_C f114 = f73, f115, f34 adds BOFFSET = - 8 * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_C f118 = f73, f119, f38 nop __LINE__ } ;; { .mfi LDFPD f120, f121 = [BOFFSET] FMA_D f115 = f72, f115, f35 adds AOFFSET2 = 28 * SIZE, AOFFSET } { .mfi nop __LINE__ FMA_D f119 = f72, f119, f39 adds AOFFSET = 24 * SIZE, AOFFSET } ;; { .mfi STFD [AOFFSET] = f112, SIZE FNMA f96 = f74, f112, f96 nop __LINE__ } { .mfi STFD [AOFFSET2] = f116, SIZE FNMA f100 = f74, f116, f100 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f113, SIZE FMA_A f97 = f75, f112, f97 nop __LINE__ } { .mfi STFD [AOFFSET2] = f117, SIZE FMA_A f101 = f75, f116, f101 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f114, SIZE FNMA f98 = f74, f114, f98 nop __LINE__ } { .mfi STFD [AOFFSET2] = f118, SIZE FNMA f102 = f74, f118, f102 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f115, -11 * SIZE FMA_A f99 = f75, f114, f99 nop __LINE__ } { .mfi STFD [AOFFSET2] = f119, -11 * SIZE FMA_A f103 = f75, f118, f103 nop __LINE__ } ;; { .mfi STFD [C4 ] = f112, SIZE FMA_B f96 = f75, f113, f96 nop __LINE__ } { .mfi STFD [C8 ] = f116, SIZE FMA_B f100 = f75, f117, f100 nop __LINE__ } ;; { .mfi STFD [C4 ] = f113, SIZE FNMA f97 = f74, f113, f97 nop __LINE__ } { .mfi STFD [C8 ] = f117, SIZE FNMA f101 = f74, f117, f101 nop __LINE__ } ;; { .mfi STFD [C4 ] = f114, SIZE FMA_B f98 = f75, f115, f98 nop __LINE__ } { .mfi STFD [C8 ] = f118, SIZE FMA_B f102 = f75, f119, f102 nop __LINE__ } ;; { .mfi STFD [C4 ] = f115, 5 * SIZE FNMA f99 = f74, f115, f99 nop __LINE__ } { .mfi STFD [C8 ] = f119, 5 * SIZE FNMA f103 = f74, f119, f103 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f80 = f76, f112, f80 nop __LINE__ } { .mfi nop __LINE__ FNMA f84 = f76, f116, f84 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f81 = f77, f112, f81 nop __LINE__ } { .mfi nop __LINE__ FMA_A f85 = f77, f116, f85 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f82 = f76, f114, f82 nop __LINE__ } { .mfi nop __LINE__ FNMA f86 = f76, f118, f86 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f83 = f77, f114, f83 nop __LINE__ } { .mfi nop __LINE__ FMA_A f87 = f77, f118, f87 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f80 = f77, f113, f80 nop __LINE__ } { .mfi nop __LINE__ FMA_B f84 = f77, f117, f84 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f81 = f76, f113, f81 nop __LINE__ } { .mfi nop __LINE__ FNMA f85 = f76, f117, f85 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f82 = f77, f115, f82 nop __LINE__ } { .mfi nop __LINE__ FMA_B f86 = f77, f119, f86 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f83 = f76, f115, f83 nop __LINE__ } { .mfi nop __LINE__ FNMA f87 = f76, f119, f87 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f64 = f78, f112, f64 nop 
__LINE__ } { .mfi nop __LINE__ FNMA f68 = f78, f116, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f65 = f79, f112, f65 nop __LINE__ } { .mfi nop __LINE__ FMA_A f69 = f79, f116, f69 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f66 = f78, f114, f66 nop __LINE__ } { .mfi nop __LINE__ FNMA f70 = f78, f118, f70 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f67 = f79, f114, f67 nop __LINE__ } { .mfi nop __LINE__ FMA_A f71 = f79, f118, f71 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f64 = f79, f113, f64 nop __LINE__ } { .mfi nop __LINE__ FMA_B f68 = f79, f117, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f65 = f78, f113, f65 nop __LINE__ } { .mfi nop __LINE__ FNMA f69 = f78, f117, f69 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f66 = f79, f115, f66 nop __LINE__ } { .mfi nop __LINE__ FMA_B f70 = f79, f119, f70 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f67 = f78, f115, f67 nop __LINE__ } { .mfi nop __LINE__ FNMA f71 = f78, f119, f71 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f88, f96 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f88, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f89, f96 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f89, f100 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f88, f98 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f88, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f89, f98 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f89, f102 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f96 = f89, f97, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f100 = f89, f101, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f97 = f88, f97, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f101 = f88, f101, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f98 = f89, f99, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f102 = f89, f103, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f99 = f88, f99, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f103 = f88, f103, f39 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f96, SIZE FNMA f80 = f90, f96, f80 nop __LINE__ } { .mfi STFD [AOFFSET2] = f100, SIZE FNMA f84 = f90, f100, f84 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f97, SIZE FMA_A f81 = f91, f96, f81 nop __LINE__ } { .mfi STFD [AOFFSET2] = f101, SIZE FMA_A f85 = f91, f100, f85 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f98, SIZE FNMA f82 = f90, f98, f82 nop __LINE__ } { .mfi STFD [AOFFSET2] = f102, SIZE FNMA f86 = f90, f102, f86 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f99, -11 * SIZE FMA_A f83 = f91, f98, f83 nop __LINE__ } { .mfi STFD [AOFFSET2] = f103, -11 * SIZE FMA_A f87 = f91, f102, f87 nop __LINE__ } ;; { .mfi STFD [C3 ] = f96, SIZE FMA_B f80 = f91, f97, f80 nop __LINE__ } { .mfi STFD [C7 ] = f100, SIZE FMA_B f84 = f91, f101, f84 nop __LINE__ } ;; { .mfi STFD [C3 ] = f97, SIZE FNMA f81 = f90, f97, f81 nop __LINE__ } { .mfi STFD [C7 ] = f101, SIZE FNMA f85 = f90, f101, f85 nop __LINE__ } ;; { .mfi STFD [C3 ] = f98, SIZE FMA_B f82 = f91, f99, f82 nop __LINE__ } { .mfi STFD [C7 ] = f102, SIZE FMA_B f86 = f91, f103, f86 nop __LINE__ } ;; { .mfi STFD [C3 ] = f99, 5 * SIZE FNMA f83 = f90, f99, f83 nop __LINE__ } { .mfi STFD [C7 ] = f103, 5 * SIZE FNMA f87 = f90, f103, f87 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f64 = f92, f96, f64 nop __LINE__ } { .mfi nop __LINE__ FNMA f68 = f92, f100, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f65 = f93, f96, f65 nop __LINE__ } { .mfi nop __LINE__ FMA_A f69 = f93, f100, f69 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f66 = f92, f98, f66 nop __LINE__ } { .mfi nop __LINE__ FNMA f70 = f92, f102, f70 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_A f67 = 
f93, f98, f67 nop __LINE__ } { .mfi nop __LINE__ FMA_A f71 = f93, f102, f71 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f64 = f93, f97, f64 nop __LINE__ } { .mfi nop __LINE__ FMA_B f68 = f93, f101, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f65 = f92, f97, f65 nop __LINE__ } { .mfi nop __LINE__ FNMA f69 = f92, f101, f69 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_B f66 = f93, f99, f66 nop __LINE__ } { .mfi nop __LINE__ FMA_B f70 = f93, f103, f70 nop __LINE__ } ;; { .mfi nop __LINE__ FNMA f67 = f92, f99, f67 nop __LINE__ } { .mfi nop __LINE__ FNMA f71 = f92, f103, f71 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f104, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f104, f84 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f105, f80 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f105, f84 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f104, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f104, f86 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f105, f82 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f105, f86 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f80 = f105, f81, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f84 = f105, f85, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f81 = f104, f81, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f85 = f104, f85, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f82 = f105, f83, f34 nop __LINE__ } { .mfi nop __LINE__ FMA_C f86 = f105, f87, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f83 = f104, f83, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f87 = f104, f87, f39 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f80, SIZE FNMA f64 = f106, f80, f64 nop __LINE__ } { .mfi STFD [AOFFSET2] = f84, SIZE FNMA f68 = f106, f84, f68 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f81, SIZE FMA_A f65 = f107, f80, f65 nop __LINE__ } { .mfi STFD [AOFFSET2] = f85, SIZE FMA_A f69 = f107, f84, f69 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f82, SIZE FNMA f66 = f106, f82, f66 nop __LINE__ } { .mfi STFD [AOFFSET2] = f86, SIZE FNMA f70 = f106, f86, f70 nop __LINE__ } ;; { .mfi STFD [AOFFSET] = f83, -11 * SIZE FMA_A f67 = f107, f82, f67 nop __LINE__ } { .mfi STFD [AOFFSET2] = f87, -11 * SIZE FMA_A f71 = f107, f86, f71 nop __LINE__ } ;; { .mfi STFD [C2 ] = f80, SIZE FMA_B f64 = f107, f81, f64 nop __LINE__ } { .mfi STFD [C6 ] = f84, SIZE FMA_B f68 = f107, f85, f68 nop __LINE__ } ;; { .mfi STFD [C2 ] = f81, SIZE FNMA f65 = f106, f81, f65 nop __LINE__ } { .mfi STFD [C6 ] = f85, SIZE FNMA f69 = f106, f85, f69 nop __LINE__ } ;; { .mfi STFD [C2 ] = f82, SIZE FMA_B f66 = f107, f83, f66 nop __LINE__ } { .mfi STFD [C6 ] = f86, SIZE FMA_B f70 = f107, f87, f70 nop __LINE__ } ;; { .mfi STFD [C2 ] = f83, 5 * SIZE FNMA f67 = f106, f83, f67 nop __LINE__ } { .mfi STFD [C6 ] = f87, 5 * SIZE FNMA f71 = f106, f87, f71 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f32 = f120, f64 nop __LINE__ } { .mfi nop __LINE__ FMPY f36 = f120, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f33 = f121, f64 nop __LINE__ } { .mfi nop __LINE__ FMPY f37 = f121, f68 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f34 = f120, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f38 = f120, f70 nop __LINE__ } ;; { .mfi nop __LINE__ FMPY f35 = f121, f66 nop __LINE__ } { .mfi nop __LINE__ FMPY f39 = f121, f70 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f64 = f121, f65, f32 nop __LINE__ } { .mfi nop __LINE__ FMA_C f68 = f121, f69, f36 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f65 = f120, f65, f33 nop __LINE__ } { .mfi nop __LINE__ FMA_D f69 = f120, f69, f37 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_C f66 = f121, f67, f34 nop 
__LINE__ } { .mfi nop __LINE__ FMA_C f70 = f121, f71, f38 nop __LINE__ } ;; { .mfi nop __LINE__ FMA_D f67 = f120, f67, f35 nop __LINE__ } { .mfi nop __LINE__ FMA_D f71 = f120, f71, f39 nop __LINE__ } ;; { .mmi STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f68, SIZE shladd r2 = K, ZBASE_SHIFT, r0 } ;; { .mmi STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f69, SIZE shladd AORIG = r2, 2, AORIG } ;; { .mmi STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f70, SIZE nop __LINE__ } ;; { .mmi STFD [AOFFSET] = f67, -3 * SIZE STFD [AOFFSET2] = f71 nop __LINE__ } ;; { .mfi STFD [C1 ] = f64, SIZE mov f64 = f0 cmp.ne p6, p0 = 1, I } { .mfi STFD [C5 ] = f68, SIZE mov f81 = f0 nop __LINE__ } ;; { .mfi STFD [C1 ] = f65, SIZE mov f65 = f0 nop __LINE__ } { .mfi STFD [C5 ] = f69, SIZE mov f96 = f0 nop __LINE__ } ;; { .mfi STFD [C1 ] = f66, SIZE mov f80 = f0 sub L = K, KK } { .mfi STFD [C5 ] = f70, SIZE mov f97 = f0 nop __LINE__ } ;; { .mfi STFD [C1 ] = f67, 5 * SIZE mov f112 = f0 adds I = -1, I } { .mfb STFD [C5 ] = f71, 5 * SIZE mov f113 = f0 (p6) br.cond.dptk .L011 } ;; #endif .L020: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L030 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 1 + ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f67 = f0 } ;; #else { .mfi shladd BOFFSET = r3, 2, B mov f66 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 shladd AOFFSET = r3, 1, AORIG } ;; #endif ;; adds L = 1, L ;; { .mfi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f82 = f0 tbit.z p12, p0 = L, 0 } { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 shr L = L, 1 } ;; { .mfi (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f98 = f0 adds L = -1, L } { .mfi (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE mov f99 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f114 = f0 mov ar.lc = L } { .mfi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET mov f115 = f0 nop __LINE__ } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L028 ;; .align 16 .L022: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f97 = f32, f53, f97 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f32, f55, f113 // A1 * B8 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f96 = f33, f53, f96 // A2 * 
B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f112 = f33, f55, f112 // A2 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f67 = f34, f49, f67 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f83 = f34, f51, f83 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f99 = f34, f53, f99 // A3 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f115 = f34, f55, f115 // A3 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f66 = f35, f49, f66 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f82 = f35, f51, f82 // A4 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f98 = f35, f53, f98 // A4 * B6 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f114 = f35, f55, f114 // A4 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f98 = f42, f60, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 nop __LINE__ } ;; { .mfb nop 
__LINE__ (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f115 = f43, f62, f115 // A4 * B7 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 br.cloop.sptk.few .L022 } ;; .L028: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -2, KK #else adds r2 = -4, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 2, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [BOFFSET], 2 * SIZE ;; LDFPD f104, f105 = [BOFFSET], 2 * SIZE ;; LDFPD f106, f107 = [BOFFSET], 2 * SIZE ;; { .mfi LDFPD f120, f121 = [BOFFSET], 2 * SIZE FSUB f64 = f72, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f65 = f73, f65 nop __LINE__ } ;; { .mfi LDFPD f122, f123 = [BOFFSET] FSUB f80 = f74, f80 adds BOFFSET = -14 * SIZE, BOFFSET } { .mfi nop __LINE__ FSUB_A f81 = f75, f81 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f96 = f88, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f97 = f89, f97 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f112 = f90, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f113 = f91, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f66 = f104, f66 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f67 = f105, f67 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f82 = f106, f82 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f83 = f107, f83 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f98 = f120, f98 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f99 = f121, f99 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f114 = f122, f114 nop __LINE__ } { .mfi nop __LINE__ FSUB_A f115 = f123, f115 nop __LINE__ } ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [AOFFSET], 2 * SIZE ;; LDFPD f104, f105 = [AOFFSET], 2 * SIZE ;; LDFPD f106, f107 = [AOFFSET], 2 * SIZE ;; { .mfi LDFPD f120, f121 = [AOFFSET], 2 * SIZE FSUB f64 = f72, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB f65 = f73, f65 nop __LINE__ } ;; { .mfi LDFPD f122, f123 = [AOFFSET] FSUB f66 = f74, f66 adds AOFFSET = -14 * SIZE, AOFFSET } { .mfi nop __LINE__ FSUB f67 = f75, f67 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f80 = f88, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f81 = f89, f81 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f82 = f90, f82 nop __LINE__ } { .mfi nop __LINE__ FSUB f83 = f91, f83 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f96 = f104, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f97 = f105, f97 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f98 = f106, f98 nop __LINE__ } { .mfi nop __LINE__ FSUB f99 = f107, f99 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f112 = f120, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f113 = f121, f113 nop __LINE__ } ;; { .mfi nop __LINE__ FSUB f114 = f122, f114 nop __LINE__ } { .mfi nop __LINE__ FSUB f115 = f123, f115 nop __LINE__ } ;; #endif #ifdef LN adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f104, f105 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f106, f107 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f104, f66 FMPY f33 = f105, f66 FMPY f34 = f104, f82 FMPY f35 = f105, f82 FMPY f36 = f104, f98 FMPY f37 = f105, f98 FMPY f38 = f104, f114 FMPY f39 
= f105, f114 ;; FMA_C f66 = f105, f67, f32 FMA_D f67 = f104, f67, f33 FMA_C f82 = f105, f83, f34 FMA_D f83 = f104, f83, f35 FMA_C f98 = f105, f99, f36 FMA_D f99 = f104, f99, f37 FMA_C f114 = f105, f115, f38 FMA_D f115 = f104, f115, f39 ;; FNMA f64 = f106, f66, f64 FMA_A f65 = f107, f66, f65 FNMA f80 = f106, f82, f80 FMA_A f81 = f107, f82, f81 FNMA f96 = f106, f98, f96 FMA_A f97 = f107, f98, f97 FNMA f112 = f106, f114, f112 FMA_A f113 = f107, f114, f113 ;; FMA_B f64 = f107, f67, f64 FNMA f65 = f106, f67, f65 FMA_B f80 = f107, f83, f80 FNMA f81 = f106, f83, f81 FMA_B f96 = f107, f99, f96 FNMA f97 = f106, f99, f97 FMA_B f112 = f107, f115, f112 FNMA f113 = f106, f115, f113 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f80 FMPY f35 = f121, f80 FMPY f36 = f120, f96 FMPY f37 = f121, f96 FMPY f38 = f120, f112 FMPY f39 = f121, f112 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f80 = f121, f81, f34 FMA_D f81 = f120, f81, f35 FMA_C f96 = f121, f97, f36 FMA_D f97 = f120, f97, f37 FMA_C f112 = f121, f113, f38 FMA_D f113 = f120, f113, f39 ;; #endif #ifdef LT LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f80 FMPY f35 = f73, f80 FMPY f36 = f72, f96 FMPY f37 = f73, f96 FMPY f38 = f72, f112 FMPY f39 = f73, f112 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f80 = f73, f81, f34 FMA_D f81 = f72, f81, f35 FMA_C f96 = f73, f97, f36 FMA_D f97 = f72, f97, f37 FMA_C f112 = f73, f113, f38 FMA_D f113 = f72, f113, f39 ;; FNMA f66 = f74, f64, f66 FMA_A f67 = f75, f64, f67 FNMA f82 = f74, f80, f82 FMA_A f83 = f75, f80, f83 FNMA f98 = f74, f96, f98 FMA_A f99 = f75, f96, f99 FNMA f114 = f74, f112, f114 FMA_A f115 = f75, f112, f115 ;; FMA_B f66 = f75, f65, f66 FNMA f67 = f74, f65, f67 FMA_B f82 = f75, f81, f82 FNMA f83 = f74, f81, f83 FMA_B f98 = f75, f97, f98 FNMA f99 = f74, f97, f99 FMA_B f114 = f75, f113, f114 FNMA f115 = f74, f113, f115 ;; FMPY f32 = f90, f66 FMPY f33 = f91, f66 FMPY f34 = f90, f82 FMPY f35 = f91, f82 FMPY f36 = f90, f98 FMPY f37 = f91, f98 FMPY f38 = f90, f114 FMPY f39 = f91, f114 ;; FMA_C f66 = f91, f67, f32 FMA_D f67 = f90, f67, f33 FMA_C f82 = f91, f83, f34 FMA_D f83 = f90, f83, f35 FMA_C f98 = f91, f99, f36 FMA_D f99 = f90, f99, f37 FMA_C f114 = f91, f115, f38 FMA_D f115 = f90, f115, f39 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [BOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET], 2 * SIZE ;; LDFPD f92, f93 = [BOFFSET], 2 * SIZE ;; LDFPD f94, f95 = [BOFFSET] adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f108, f109 = [BOFFSET], 2 * SIZE ;; LDFPD f110, f111 = [BOFFSET] adds BOFFSET = 8 * SIZE, BOFFSET ;; LDFPD f126, f127 = [BOFFSET] adds BOFFSET = - 30 * SIZE, BOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f66 FMPY f35 = f73, f66 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f66 = f73, f67, f34 FMA_D f67 = f72, f67, f35 ;; FNMA f80 = f74, f64, f80 FMA_A f81 = f75, f64, f81 FNMA f82 = f74, f66, f82 FMA_A f83 = f75, f66, f83 ;; FMA_B f80 = f75, f65, f80 FNMA f81 = f74, f65, f81 FMA_B f82 = f75, f67, f82 FNMA f83 = f74, f67, f83 ;; FNMA f96 = f76, f64, f96 FMA_A f97 = f77, f64, f97 FNMA f98 = f76, f66, f98 FMA_A f99 = f77, f66, f99 ;; FMA_B f96 = f77, f65, f96 FNMA f97 = f76, f65, f97 FMA_B f98 = f77, f67, f98 FNMA 
f99 = f76, f67, f99 ;; FNMA f112 = f78, f64, f112 FMA_A f113 = f79, f64, f113 FNMA f114 = f78, f66, f114 FMA_A f115 = f79, f66, f115 ;; FMA_B f112 = f79, f65, f112 FNMA f113 = f78, f65, f113 FMA_B f114 = f79, f67, f114 FNMA f115 = f78, f67, f115 ;; FMPY f32 = f90, f80 FMPY f33 = f91, f80 FMPY f34 = f90, f82 FMPY f35 = f91, f82 ;; FMA_C f80 = f91, f81, f32 FMA_D f81 = f90, f81, f33 FMA_C f82 = f91, f83, f34 FMA_D f83 = f90, f83, f35 ;; FNMA f96 = f92, f80, f96 FMA_A f97 = f93, f80, f97 FNMA f98 = f92, f82, f98 FMA_A f99 = f93, f82, f99 ;; FMA_B f96 = f93, f81, f96 FNMA f97 = f92, f81, f97 FMA_B f98 = f93, f83, f98 FNMA f99 = f92, f83, f99 ;; FNMA f112 = f94, f80, f112 FMA_A f113 = f95, f80, f113 FNMA f114 = f94, f82, f114 FMA_A f115 = f95, f82, f115 ;; FMA_B f112 = f95, f81, f112 FNMA f113 = f94, f81, f113 FMA_B f114 = f95, f83, f114 FNMA f115 = f94, f83, f115 ;; FMPY f32 = f108, f96 FMPY f33 = f109, f96 FMPY f34 = f108, f98 FMPY f35 = f109, f98 ;; FMA_C f96 = f109, f97, f32 FMA_D f97 = f108, f97, f33 FMA_C f98 = f109, f99, f34 FMA_D f99 = f108, f99, f35 ;; FNMA f112 = f110, f96, f112 FMA_A f113 = f111, f96, f113 FNMA f114 = f110, f98, f114 FMA_A f115 = f111, f98, f115 ;; FMA_B f112 = f111, f97, f112 FNMA f113 = f110, f97, f113 FMA_B f114 = f111, f99, f114 FNMA f115 = f110, f99, f115 ;; FMPY f32 = f126, f112 FMPY f33 = f127, f112 FMPY f34 = f126, f114 FMPY f35 = f127, f114 ;; FMA_C f112 = f127, f113, f32 FMA_D f113 = f126, f113, f33 FMA_C f114 = f127, f115, f34 FMA_D f115 = f126, f115, f35 ;; #endif #ifdef RT adds BOFFSET = 30 * SIZE, BOFFSET ;; LDFPD f72, f73 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f74, f75 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f76, f77 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f78, f79 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f88, f89 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f92, f93 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFPD f104, f105 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f106, f107 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFPD f120, f121 = [BOFFSET] ;; FMPY f32 = f72, f112 FMPY f33 = f73, f112 FMPY f34 = f72, f114 FMPY f35 = f73, f114 ;; FMA_C f112 = f73, f113, f32 FMA_D f113 = f72, f113, f33 FMA_C f114 = f73, f115, f34 FMA_D f115 = f72, f115, f35 ;; FNMA f96 = f74, f112, f96 FMA_A f97 = f75, f112, f97 FNMA f98 = f74, f114, f98 FMA_A f99 = f75, f114, f99 ;; FMA_B f96 = f75, f113, f96 FNMA f97 = f74, f113, f97 FMA_B f98 = f75, f115, f98 FNMA f99 = f74, f115, f99 ;; FNMA f80 = f76, f112, f80 FMA_A f81 = f77, f112, f81 FNMA f82 = f76, f114, f82 FMA_A f83 = f77, f114, f83 ;; FMA_B f80 = f77, f113, f80 FNMA f81 = f76, f113, f81 FMA_B f82 = f77, f115, f82 FNMA f83 = f76, f115, f83 ;; FNMA f64 = f78, f112, f64 FMA_A f65 = f79, f112, f65 FNMA f66 = f78, f114, f66 FMA_A f67 = f79, f114, f67 ;; FMA_B f64 = f79, f113, f64 FNMA f65 = f78, f113, f65 FMA_B f66 = f79, f115, f66 FNMA f67 = f78, f115, f67 ;; FMPY f32 = f88, f96 FMPY f33 = f89, f96 FMPY f34 = f88, f98 FMPY f35 = f89, f98 ;; FMA_C f96 = f89, f97, f32 FMA_D f97 = f88, f97, f33 FMA_C f98 = f89, f99, f34 FMA_D f99 = f88, f99, f35 ;; FNMA f80 = f90, f96, f80 FMA_A f81 = f91, f96, f81 FNMA f82 = f90, f98, f82 FMA_A f83 = f91, f98, f83 ;; FMA_B f80 = f91, f97, f80 FNMA f81 = f90, f97, f81 FMA_B f82 = f91, f99, f82 FNMA f83 = f90, f99, f83 ;; FNMA f64 = f92, f96, f64 FMA_A f65 = f93, f96, f65 FNMA f66 = f92, f98, f66 FMA_A f67 = f93, f98, f67 ;; 
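// RT back-substitution for this 2x4 remainder tile: columns four (f112-f115)
// and three (f96-f99) have already been scaled by the diagonal entries of the
// packed B block (the packing appears to supply these entries pre-inverted,
// which is consistent with the absence of any divide in this kernel).  The
// surrounding FNMA/FMA_A/FMA_B/FNMA groups subtract their contribution from
// columns two (f80-f83) and one (f64-f67), which are then scaled by the
// diagonal entries held in f104/f105 and f120/f121.  In scalar form this is
// ordinary back-substitution (a sketch with illustrative names, binv(k) being
// the stored reciprocal diagonal):
//   x(3) = c(3) * binv(3)
//   for k = 2, 1, 0:  c(k) -= b(k,j) * x(j) for all j > k;  x(k) = c(k) * binv(k)
// The FMA_A/FMA_B/FMA_C/FMA_D variants carry the sign and conjugation choices
// selected by the macros defined earlier in this file.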
FMA_B f64 = f93, f97, f64 FNMA f65 = f92, f97, f65 FMA_B f66 = f93, f99, f66 FNMA f67 = f92, f99, f67 ;; FMPY f32 = f104, f80 FMPY f33 = f105, f80 FMPY f34 = f104, f82 FMPY f35 = f105, f82 ;; FMA_C f80 = f105, f81, f32 FMA_D f81 = f104, f81, f33 FMA_C f82 = f105, f83, f34 FMA_D f83 = f104, f83, f35 ;; FNMA f64 = f106, f80, f64 FMA_A f65 = f107, f80, f65 FNMA f66 = f106, f82, f66 FMA_A f67 = f107, f82, f67 ;; FMA_B f64 = f107, f81, f64 FNMA f65 = f106, f81, f65 FMA_B f66 = f107, f83, f66 FNMA f67 = f106, f83, f67 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f66 FMPY f35 = f121, f66 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f66 = f121, f67, f34 FMA_D f67 = f120, f67, f35 ;; #endif #if defined(LN) || defined(LT) adds BOFFSET2 = 4 * SIZE, BOFFSET ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f97, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE ;; STFD [BOFFSET] = f81, 5 * SIZE STFD [BOFFSET2] = f113, 5 * SIZE ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f98, SIZE ;; STFD [BOFFSET] = f67, SIZE STFD [BOFFSET2] = f99, SIZE ;; STFD [BOFFSET] = f82, SIZE STFD [BOFFSET2] = f114, SIZE ;; STFD [BOFFSET] = f83, 5 * SIZE STFD [BOFFSET2] = f115, 5 * SIZE ;; adds BOFFSET = - 16 * SIZE, BOFFSET ;; #else adds AOFFSET2 = 4 * SIZE, AOFFSET ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f80, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f81, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f82, SIZE ;; STFD [AOFFSET] = f67, 5 * SIZE STFD [AOFFSET2] = f83, 5 * SIZE ;; STFD [AOFFSET] = f96, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f97, SIZE STFD [AOFFSET2] = f113, SIZE ;; STFD [AOFFSET] = f98, SIZE STFD [AOFFSET2] = f114, SIZE ;; STFD [AOFFSET] = f99, 5 * SIZE STFD [AOFFSET2] = f115, 5 * SIZE ;; adds AOFFSET = - 16 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -4 * SIZE, C1 adds C2 = -4 * SIZE, C2 adds C3 = -4 * SIZE, C3 adds C4 = -4 * SIZE, C4 #endif ;; STFD [C1 ] = f64, SIZE ;; STFD [C1 ] = f65, SIZE ;; STFD [C1 ] = f66, SIZE ;; STFD [C1 ] = f67, SIZE ;; STFD [C2 ] = f80, SIZE ;; STFD [C2 ] = f81, SIZE ;; STFD [C2 ] = f82, SIZE ;; STFD [C2 ] = f83, SIZE ;; STFD [C3 ] = f96, SIZE ;; STFD [C3 ] = f97, SIZE ;; STFD [C3 ] = f98, SIZE ;; STFD [C3 ] = f99, SIZE ;; STFD [C4 ] = f112, SIZE ;; STFD [C4 ] = f113, SIZE ;; STFD [C4 ] = f114, SIZE ;; STFD [C4 ] = f115, SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;; #ifdef LN adds C1 = -4 * SIZE, C1 adds C2 = -4 * SIZE, C2 adds C3 = -4 * SIZE, C3 adds C4 = -4 * SIZE, C4 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT shladd AORIG = r2, 1, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; shladd AOFFSET = L, 1, AOFFSET shladd BOFFSET = L, 2, BOFFSET #endif ;; #ifdef LT adds KK = 2, KK #elif defined LN adds KK = -2, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; .align 16 .L030: { .mib #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L049 } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f72 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f73 = f0 } ;; #else { 
.mfi shladd BOFFSET = r3, 2, B mov f72 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f73 = f0 add AOFFSET = r3, AORIG } ;; #endif ;; adds L = 1, L ;; { .mmi nop __LINE__ adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET tbit.z p12, p0 = L, 0 } ;; { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f88 = f0 shr L = L, 1 } { .mfi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f89 = f0 nop __LINE__ } ;; { .mfi (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE mov f104 = f0 adds L = -1, L } { .mfb adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET mov f105 = f0 nop __LINE__ } ;; { .mfi (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f120 = f0 mov ar.lc = L } { .mfi cmp.eq p3, p0 = r0, r0 mov f121 = f0 nop __LINE__ } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L038 ;; .align 16 .L032: { .mfb lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f97 = f32, f53, f97 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f32, f55, f113 // A1 * B8 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_A f96 = f33, f53, f96 // A2 * B6 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_A f112 = f33, f55, f112 // A2 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f113 = f41, f62, f113 // A2 * B7 adds L = -1, L } { .mfb nop __LINE__ (p3) 
FMA_A f112 = f41, f63, f112 // A2 * B8 br.cloop.sptk.few .L032 } ;; .L038: #if defined(LN) || defined(RT) #ifdef LN adds r2 = -1, KK #else adds r2 = -4, KK #endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 2, B ;; #endif #if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET], 2 * SIZE ;; LDFPD f104, f105 = [BOFFSET], 2 * SIZE ;; LDFPD f120, f121 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 FSUB f80 = f88, f80 FSUB_A f81 = f89, f81 FSUB f96 = f104, f96 FSUB_A f97 = f105, f97 FSUB f112 = f120, f112 FSUB_A f113 = f121, f113 ;; #else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET], 2 * SIZE ;; LDFPD f104, f105 = [AOFFSET], 2 * SIZE ;; LDFPD f120, f121 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 FSUB f80 = f88, f80 FSUB f81 = f89, f81 FSUB f96 = f104, f96 FSUB f97 = f105, f97 FSUB f112 = f120, f112 FSUB f113 = f121, f113 ;; #endif #ifdef LN LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f80 FMPY f35 = f121, f80 FMPY f36 = f120, f96 FMPY f37 = f121, f96 FMPY f38 = f120, f112 FMPY f39 = f121, f112 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f80 = f121, f81, f34 FMA_D f81 = f120, f81, f35 FMA_C f96 = f121, f97, f36 FMA_D f97 = f120, f97, f37 FMA_C f112 = f121, f113, f38 FMA_D f113 = f120, f113, f39 ;; #endif #ifdef LT LDFPD f90, f91 = [AOFFSET] ;; FMPY f32 = f90, f64 FMPY f33 = f91, f64 FMPY f34 = f90, f80 FMPY f35 = f91, f80 FMPY f36 = f90, f96 FMPY f37 = f91, f96 FMPY f38 = f90, f112 FMPY f39 = f91, f112 ;; FMA_C f64 = f91, f65, f32 FMA_D f65 = f90, f65, f33 FMA_C f80 = f91, f81, f34 FMA_D f81 = f90, f81, f35 FMA_C f96 = f91, f97, f36 FMA_D f97 = f90, f97, f37 FMA_C f112 = f91, f113, f38 FMA_D f113 = f90, f113, f39 ;; #endif #ifdef RN LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [BOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET], 2 * SIZE ;; LDFPD f92, f93 = [BOFFSET], 2 * SIZE ;; LDFPD f94, f95 = [BOFFSET] adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f108, f109 = [BOFFSET], 2 * SIZE ;; LDFPD f110, f111 = [BOFFSET] adds BOFFSET = 8 * SIZE, BOFFSET ;; LDFPD f126, f127 = [BOFFSET] adds BOFFSET = - 30 * SIZE, BOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 ;; FNMA f80 = f74, f64, f80 FMA_A f81 = f75, f64, f81 ;; FMA_B f80 = f75, f65, f80 FNMA f81 = f74, f65, f81 ;; FNMA f96 = f76, f64, f96 FMA_A f97 = f77, f64, f97 ;; FMA_B f96 = f77, f65, f96 FNMA f97 = f76, f65, f97 ;; FNMA f112 = f78, f64, f112 FMA_A f113 = f79, f64, f113 ;; FMA_B f112 = f79, f65, f112 FNMA f113 = f78, f65, f113 ;; FMPY f32 = f90, f80 FMPY f33 = f91, f80 ;; FMA_C f80 = f91, f81, f32 FMA_D f81 = f90, f81, f33 ;; FNMA f96 = f92, f80, f96 FMA_A f97 = f93, f80, f97 ;; FMA_B f96 = f93, f81, f96 FNMA f97 = f92, f81, f97 ;; FNMA f112 = f94, f80, f112 FMA_A f113 = f95, f80, f113 ;; FMA_B f112 = f95, f81, f112 FNMA f113 = f94, f81, f113 ;; FMPY f32 = f108, f96 FMPY f33 = f109, f96 ;; FMA_C f96 = f109, f97, f32 FMA_D f97 = f108, f97, f33 ;; FNMA f112 = f110, f96, f112 FMA_A f113 = f111, f96, f113 ;; FMA_B f112 = f111, f97, f112 FNMA f113 = f110, f97, f113 ;; FMPY f32 = f126, f112 FMPY f33 = f127, f112 ;; FMA_C f112 = f127, f113, f32 FMA_D f113 = f126, f113, f33 ;; #endif #ifdef RT adds BOFFSET = 30 * SIZE, BOFFSET ;; 
LDFPD f72, f73 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f74, f75 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f76, f77 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f78, f79 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f88, f89 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f92, f93 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFPD f104, f105 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f106, f107 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFPD f120, f121 = [BOFFSET] ;; FMPY f32 = f72, f112 FMPY f33 = f73, f112 ;; FMA_C f112 = f73, f113, f32 FMA_D f113 = f72, f113, f33 ;; FNMA f96 = f74, f112, f96 FMA_A f97 = f75, f112, f97 ;; FMA_B f96 = f75, f113, f96 FNMA f97 = f74, f113, f97 ;; FNMA f80 = f76, f112, f80 FMA_A f81 = f77, f112, f81 ;; FMA_B f80 = f77, f113, f80 FNMA f81 = f76, f113, f81 ;; FNMA f64 = f78, f112, f64 FMA_A f65 = f79, f112, f65 ;; FMA_B f64 = f79, f113, f64 FNMA f65 = f78, f113, f65 ;; FMPY f32 = f88, f96 FMPY f33 = f89, f96 ;; FMA_C f96 = f89, f97, f32 FMA_D f97 = f88, f97, f33 ;; FNMA f80 = f90, f96, f80 FMA_A f81 = f91, f96, f81 ;; FMA_B f80 = f91, f97, f80 FNMA f81 = f90, f97, f81 ;; FNMA f64 = f92, f96, f64 FMA_A f65 = f93, f96, f65 ;; FMA_B f64 = f93, f97, f64 FNMA f65 = f92, f97, f65 ;; FMPY f32 = f104, f80 FMPY f33 = f105, f80 ;; FMA_C f80 = f105, f81, f32 FMA_D f81 = f104, f81, f33 ;; FNMA f64 = f106, f80, f64 FMA_A f65 = f107, f80, f65 ;; FMA_B f64 = f107, f81, f64 FNMA f65 = f106, f81, f65 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 ;; #endif #if defined(LN) || defined(LT) adds BOFFSET2 = 4 * SIZE, BOFFSET ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f97, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE ;; STFD [BOFFSET] = f81, 5 * SIZE STFD [BOFFSET2] = f113, 5 * SIZE ;; adds BOFFSET = - 8 * SIZE, BOFFSET ;; #else adds AOFFSET2 = 4 * SIZE, AOFFSET ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f96, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f97, SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f81, 5 * SIZE STFD [AOFFSET2] = f113, 5 * SIZE ;; adds AOFFSET = - 8 * SIZE, AOFFSET ;; #endif #ifdef LN adds C1 = -2 * SIZE, C1 adds C2 = -2 * SIZE, C2 adds C3 = -2 * SIZE, C3 adds C4 = -2 * SIZE, C4 #endif ;; STFD [C1 ] = f64, SIZE ;; STFD [C1 ] = f65, SIZE ;; STFD [C2 ] = f80, SIZE ;; STFD [C2 ] = f81, SIZE ;; STFD [C3 ] = f96, SIZE ;; STFD [C3 ] = f97, SIZE ;; STFD [C4 ] = f112, SIZE ;; STFD [C4 ] = f113, SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;; #ifdef LN adds C1 = -2 * SIZE, C1 adds C2 = -2 * SIZE, C2 adds C3 = -2 * SIZE, C3 adds C4 = -2 * SIZE, C4 #endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;; #ifdef RT add AORIG = r2, AORIG #endif ;; #if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; add AOFFSET = L, AOFFSET shladd BOFFSET = L, 2, BOFFSET #endif ;; #ifdef LT adds KK = 1, KK #elif defined LN adds KK = -1, KK #else nop __LINE__ #endif ;; #if defined(LT) || defined(RN) mov L = KK #else sub L = K, KK #endif ;; .align 16 .L049: #ifdef LN shladd KK8 = K, ZBASE_SHIFT, r0 ;; shladd B = KK8, 2, B #endif #if defined(LT) || defined(RN) mov B = BOFFSET #endif #ifdef RN adds KK = 4, KK #endif #ifdef RT adds KK = -4, 
KK #endif ;; { .mmb mov AOFFSET = A cmp.lt p6, p0 = 0, J (p6) br.cond.dptk .L010x } ;; .align 16 .L999: { .mii nop __LINE__ mov ar.lc = ARLC mov pr = PR, -1 } { .mib nop __LINE__ #ifdef TRMMKERNEL mov ar.pfs = ARPFS #else nop __LINE__ #endif br.ret.sptk.many b0 } EPILOGUE OpenBLAS-0.2.20/kernel/mips/000077500000000000000000000000001313527062700153745ustar00rootroot00000000000000OpenBLAS-0.2.20/kernel/mips/KERNEL000066400000000000000000000012431313527062700162770ustar00rootroot00000000000000ifndef SNRM2KERNEL SNRM2KERNEL = nrm2.c endif ifndef DNRM2KERNEL DNRM2KERNEL = nrm2.c endif ifndef CNRM2KERNEL CNRM2KERNEL = znrm2.c endif ifndef ZNRM2KERNEL ZNRM2KERNEL = znrm2.c endif ifndef SCABS_KERNEL SCABS_KERNEL = ../generic/cabs.c endif ifndef DCABS_KERNEL DCABS_KERNEL = ../generic/cabs.c endif ifndef QCABS_KERNEL QCABS_KERNEL = ../generic/cabs.c endif ifndef LSAME_KERNEL LSAME_KERNEL = ../generic/lsame.c endif ifndef SGEMM_BETA SGEMM_BETA = ../generic/gemm_beta.c endif ifndef DGEMM_BETA DGEMM_BETA = ../generic/gemm_beta.c endif ifndef CGEMM_BETA CGEMM_BETA = ../generic/zgemm_beta.c endif ifndef ZGEMM_BETA ZGEMM_BETA = ../generic/zgemm_beta.c endif OpenBLAS-0.2.20/kernel/mips/KERNEL.P5600000066400000000000000000000156521313527062700170210ustar00rootroot00000000000000SAMAXKERNEL = ../mips/amax.c DAMAXKERNEL = ../mips/amax.c CAMAXKERNEL = ../mips/zamax.c ZAMAXKERNEL = ../mips/zamax.c SAMINKERNEL = ../mips/amin.c DAMINKERNEL = ../mips/amin.c CAMINKERNEL = ../mips/zamin.c ZAMINKERNEL = ../mips/zamin.c SMAXKERNEL = ../mips/max.c DMAXKERNEL = ../mips/max.c SMINKERNEL = ../mips/min.c DMINKERNEL = ../mips/min.c ISAMAXKERNEL = ../mips/iamax.c IDAMAXKERNEL = ../mips/iamax.c ICAMAXKERNEL = ../mips/izamax.c IZAMAXKERNEL = ../mips/izamax.c ISAMINKERNEL = ../mips/iamin.c IDAMINKERNEL = ../mips/iamin.c ICAMINKERNEL = ../mips/izamin.c IZAMINKERNEL = ../mips/izamin.c ISMAXKERNEL = ../mips/imax.c IDMAXKERNEL = ../mips/imax.c ISMINKERNEL = ../mips/imin.c IDMINKERNEL = ../mips/imin.c ifdef HAVE_MSA SASUMKERNEL = ../mips/sasum_msa.c DASUMKERNEL = ../mips/dasum_msa.c CASUMKERNEL = ../mips/casum_msa.c ZASUMKERNEL = ../mips/zasum_msa.c else SASUMKERNEL = ../mips/asum.c DASUMKERNEL = ../mips/asum.c CASUMKERNEL = ../mips/asum.c ZASUMKERNEL = ../mips/asum.c endif ifdef HAVE_MSA SAXPYKERNEL = ../mips/saxpy_msa.c DAXPYKERNEL = ../mips/daxpy_msa.c CAXPYKERNEL = ../mips/caxpy_msa.c ZAXPYKERNEL = ../mips/zaxpy_msa.c else SAXPYKERNEL = ../mips/axpy.c DAXPYKERNEL = ../mips/axpy.c CAXPYKERNEL = ../mips/zaxpy.c ZAXPYKERNEL = ../mips/zaxpy.c endif ifdef HAVE_MSA SCOPYKERNEL = ../mips/scopy_msa.c DCOPYKERNEL = ../mips/dcopy_msa.c CCOPYKERNEL = ../mips/ccopy_msa.c ZCOPYKERNEL = ../mips/zcopy_msa.c else SCOPYKERNEL = ../mips/copy.c DCOPYKERNEL = ../mips/copy.c CCOPYKERNEL = ../mips/zcopy.c ZCOPYKERNEL = ../mips/zcopy.c endif ifdef HAVE_MSA SDOTKERNEL = ../mips/sdot_msa.c DDOTKERNEL = ../mips/ddot_msa.c CDOTKERNEL = ../mips/cdot_msa.c ZDOTKERNEL = ../mips/zdot_msa.c else SDOTKERNEL = ../mips/dot.c DDOTKERNEL = ../mips/dot.c CDOTKERNEL = ../mips/zdot.c ZDOTKERNEL = ../mips/zdot.c endif SNRM2KERNEL = ../mips/nrm2.c DNRM2KERNEL = ../mips/nrm2.c CNRM2KERNEL = ../mips/znrm2.c ZNRM2KERNEL = ../mips/znrm2.c ifdef HAVE_MSA SROTKERNEL = ../mips/srot_msa.c DROTKERNEL = ../mips/drot_msa.c CROTKERNEL = ../mips/crot_msa.c ZROTKERNEL = ../mips/zrot_msa.c else SROTKERNEL = ../mips/rot.c DROTKERNEL = ../mips/rot.c CROTKERNEL = ../mips/zrot.c ZROTKERNEL = ../mips/zrot.c endif ifdef HAVE_MSA SSCALKERNEL = ../mips/sscal_msa.c DSCALKERNEL = 
../mips/dscal_msa.c CSCALKERNEL = ../mips/cscal_msa.c ZSCALKERNEL = ../mips/zscal_msa.c else SSCALKERNEL = ../mips/scal.c DSCALKERNEL = ../mips/scal.c CSCALKERNEL = ../mips/zscal.c ZSCALKERNEL = ../mips/zscal.c endif ifdef HAVE_MSA SSWAPKERNEL = ../mips/sswap_msa.c DSWAPKERNEL = ../mips/dswap_msa.c CSWAPKERNEL = ../mips/cswap_msa.c ZSWAPKERNEL = ../mips/zswap_msa.c else SSWAPKERNEL = ../mips/swap.c DSWAPKERNEL = ../mips/swap.c CSWAPKERNEL = ../mips/zswap.c ZSWAPKERNEL = ../mips/zswap.c endif ifdef HAVE_MSA SGEMVNKERNEL = ../mips/sgemv_n_msa.c DGEMVNKERNEL = ../mips/dgemv_n_msa.c CGEMVNKERNEL = ../mips/cgemv_n_msa.c ZGEMVNKERNEL = ../mips/zgemv_n_msa.c else SGEMVNKERNEL = ../mips/gemv_n.c DGEMVNKERNEL = ../mips/gemv_n.c CGEMVNKERNEL = ../mips/zgemv_n.c ZGEMVNKERNEL = ../mips/zgemv_n.c endif ifdef HAVE_MSA SGEMVTKERNEL = ../mips/sgemv_t_msa.c DGEMVTKERNEL = ../mips/dgemv_t_msa.c CGEMVTKERNEL = ../mips/cgemv_t_msa.c ZGEMVTKERNEL = ../mips/zgemv_t_msa.c else SGEMVTKERNEL = ../mips/gemv_t.c DGEMVTKERNEL = ../mips/gemv_t.c CGEMVTKERNEL = ../mips/zgemv_t.c ZGEMVTKERNEL = ../mips/zgemv_t.c endif ifdef HAVE_MSA SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o else SGEMMKERNEL = ../generic/gemmkernel_2x2.c SGEMMONCOPY = ../generic/gemm_ncopy_2.c SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o endif ifdef HAVE_MSA DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c DGEMMINCOPYOBJ = dgemm_incopy.o DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o else DGEMMKERNEL = ../generic/gemmkernel_2x2.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o endif ifdef HAVE_MSA CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c CGEMMINCOPYOBJ = cgemm_incopy.o CGEMMITCOPYOBJ = cgemm_itcopy.o CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o else CGEMMKERNEL = ../generic/zgemmkernel_2x2.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o endif ifdef HAVE_MSA ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o else ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o endif ifdef HAVE_MSA STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c else STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif ifdef HAVE_MSA DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c 
DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c else DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif ifdef HAVE_MSA CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c else CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif ifdef HAVE_MSA ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c else ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endifOpenBLAS-0.2.20/kernel/mips/Makefile000066400000000000000000000000121313527062700170250ustar00rootroot00000000000000clean :: OpenBLAS-0.2.20/kernel/mips/amax.c000066400000000000000000000040061313527062700164660ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" #include <math.h> #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT maxf=0.0; if (n <= 0 || inc_x <= 0) return(maxf); maxf=ABS(x[0]); ix += inc_x; i++; while(i < n) { if( ABS(x[ix]) > maxf ) { maxf = ABS(x[ix]); } ix += inc_x; i++; } return(maxf); } OpenBLAS-0.2.20/kernel/mips/amin.c000066400000000000000000000040061313527062700164640ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include <math.h> #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT minf=0.0; if (n <= 0 || inc_x <= 0) return(minf); minf=ABS(x[0]); ix += inc_x; i++; while(i < n) { if( ABS(x[ix]) < minf ) { minf = ABS(x[ix]); } ix += inc_x; i++; } return(minf); } OpenBLAS-0.2.20/kernel/mips/asum.c000066400000000000000000000036631313527062700165140ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include <math.h> #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; FLOAT sumf = 0.0; if (n <= 0 || inc_x <= 0) return(sumf); n *= inc_x; while(i < n) { sumf += ABS(x[i]); i += inc_x; } return(sumf); } OpenBLAS-0.2.20/kernel/mips/axpby.c000066400000000000000000000045011313527062700166630ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix,iy; if ( n < 0 ) return(0); ix = 0; iy = 0; if ( beta == 0.0 ) { if ( alpha == 0.0 ) { while(i < n) { y[iy] = 0.0 ; iy += inc_y ; i++ ; } } else { while(i < n) { y[iy] = alpha * x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } } } else { if ( alpha == 0.0 ) { while(i < n) { y[iy] = beta * y[iy] ; iy += inc_y ; i++ ; } } else { while(i < n) { y[iy] = alpha * x[ix] + beta * y[iy] ; ix += inc_x ; iy += inc_y ; i++ ; } } } return(0); } OpenBLAS-0.2.20/kernel/mips/axpy.c000066400000000000000000000037561313527062700165340ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; BLASLONG ix,iy; if ( n < 0 ) return(0); if ( da == 0.0 ) return(0); ix = 0; iy = 0; while(i < n) { y[iy] += da * x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } return(0); } OpenBLAS-0.2.20/kernel/mips/casum_msa.c000066400000000000000000000223751313527062700175210ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include #include "macros_msa.h" #define AND_VEC_W(in) ((v4f32) ((v4i32) in & and_vec)) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i, inc_x2; FLOAT sumf = 0.0; v4f32 src0, src1, src2, src3, src4, src5, src6, src7; v4f32 src8, src9, src10, src11, src12, src13, src14, src15; v4f32 sum_abs0 = {0, 0, 0, 0}; v4f32 sum_abs1 = {0, 0, 0, 0}; v4f32 sum_abs2 = {0, 0, 0, 0}; v4f32 sum_abs3 = {0, 0, 0, 0}; v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; if (n <= 0 || inc_x <= 0) return (sumf); if (1 == inc_x) { if (n > 31) { FLOAT *x_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 128 + 32; LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); for (i = (n >> 5) - 1; i--;) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(x_pref, 128); PREF_OFFSET(x_pref, 160); PREF_OFFSET(x_pref, 192); PREF_OFFSET(x_pref, 224); x_pref += 64; LD_SP8_INC(x, 4, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); sum_abs2 += AND_VEC_W(src2); sum_abs3 += AND_VEC_W(src3); sum_abs0 += AND_VEC_W(src4); sum_abs1 += AND_VEC_W(src5); sum_abs2 += AND_VEC_W(src6); sum_abs3 += AND_VEC_W(src7); LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); sum_abs0 += AND_VEC_W(src8); sum_abs1 += AND_VEC_W(src9); sum_abs2 += AND_VEC_W(src10); sum_abs3 += AND_VEC_W(src11); sum_abs0 += AND_VEC_W(src12); sum_abs1 += AND_VEC_W(src13); sum_abs2 += AND_VEC_W(src14); sum_abs3 += AND_VEC_W(src15); } LD_SP8_INC(x, 4, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); sum_abs2 += AND_VEC_W(src2); sum_abs3 += AND_VEC_W(src3); sum_abs0 += AND_VEC_W(src4); sum_abs1 += AND_VEC_W(src5); sum_abs2 += AND_VEC_W(src6); sum_abs3 += AND_VEC_W(src7); sum_abs0 += AND_VEC_W(src8); sum_abs1 += AND_VEC_W(src9); sum_abs2 += AND_VEC_W(src10); sum_abs3 += AND_VEC_W(src11); sum_abs0 += AND_VEC_W(src12); sum_abs1 += AND_VEC_W(src13); sum_abs2 += AND_VEC_W(src14); sum_abs3 += AND_VEC_W(src15); } if (n & 31) { if (n & 16) { LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); sum_abs2 += AND_VEC_W(src2); sum_abs3 += AND_VEC_W(src3); sum_abs0 += AND_VEC_W(src4); 
sum_abs1 += AND_VEC_W(src5); sum_abs2 += AND_VEC_W(src6); sum_abs3 += AND_VEC_W(src7); } if (n & 8) { LD_SP4_INC(x, 4, src0, src1, src2, src3); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); sum_abs2 += AND_VEC_W(src2); sum_abs3 += AND_VEC_W(src3); } if (n & 4) { LD_SP2_INC(x, 4, src0, src1); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); } if (n & 2) { src0 = LD_SP(x); x += 4; sum_abs0 += AND_VEC_W(src0); } if (n & 1) { sumf += fabsf(*x); sumf += fabsf(*(x + 1)); } } sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; sumf += sum_abs0[0]; sumf += sum_abs0[1]; sumf += sum_abs0[2]; sumf += sum_abs0[3]; } else { inc_x2 = 2 * inc_x; if (n > 16) { LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7); for (i = (n >> 4) - 1; i--;) { LD_SP8_INC(x, inc_x2, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); sum_abs2 += AND_VEC_W(src2); sum_abs3 += AND_VEC_W(src3); sum_abs0 += AND_VEC_W(src4); sum_abs1 += AND_VEC_W(src5); sum_abs2 += AND_VEC_W(src6); sum_abs3 += AND_VEC_W(src7); LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7); sum_abs0 += AND_VEC_W(src8); sum_abs1 += AND_VEC_W(src9); sum_abs2 += AND_VEC_W(src10); sum_abs3 += AND_VEC_W(src11); sum_abs0 += AND_VEC_W(src12); sum_abs1 += AND_VEC_W(src13); sum_abs2 += AND_VEC_W(src14); sum_abs3 += AND_VEC_W(src15); } LD_SP8_INC(x, inc_x2, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); sum_abs2 += AND_VEC_W(src2); sum_abs3 += AND_VEC_W(src3); sum_abs0 += AND_VEC_W(src4); sum_abs1 += AND_VEC_W(src5); sum_abs2 += AND_VEC_W(src6); sum_abs3 += AND_VEC_W(src7); sum_abs0 += AND_VEC_W(src8); sum_abs1 += AND_VEC_W(src9); sum_abs2 += AND_VEC_W(src10); sum_abs3 += AND_VEC_W(src11); sum_abs0 += AND_VEC_W(src12); sum_abs1 += AND_VEC_W(src13); sum_abs2 += AND_VEC_W(src14); sum_abs3 += AND_VEC_W(src15); } if (n & 15) { if (n & 8) { LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); sum_abs2 += AND_VEC_W(src2); sum_abs3 += AND_VEC_W(src3); sum_abs0 += AND_VEC_W(src4); sum_abs1 += AND_VEC_W(src5); sum_abs2 += AND_VEC_W(src6); sum_abs3 += AND_VEC_W(src7); } if (n & 4) { LD_SP4_INC(x, inc_x2, src0, src1, src2, src3); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); sum_abs2 += AND_VEC_W(src2); sum_abs3 += AND_VEC_W(src3); } if (n & 2) { LD_SP2_INC(x, inc_x2, src0, src1); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); } if (n & 1) { src0 = LD_SP(x); sum_abs0 += AND_VEC_W(src0); } } sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; sumf = sum_abs0[0] + sum_abs0[1]; } return (sumf); } OpenBLAS-0.2.20/kernel/mips/caxpy_msa.c000066400000000000000000000350661313527062700175360ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" #if !defined(CONJ) #define OP0 += #define OP1 -= #define OP2 += #else #define OP0 -= #define OP1 += #define OP2 -= #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i, inc_x2, inc_y2; FLOAT *py; v4f32 x0, x1, x2, x3, x4, x5, x6, x7; v4f32 y0, y1, y2, y3, y4, y5, y6, y7, dar_vec, dai_vec; v4f32 x0r, x1r, x2r, x3r, x0i, x1i, x2i, x3i; v4f32 y0r, y1r, y2r, y3r, y0i, y1i, y2i, y3i; FLOAT xd0, xd1, xd2, xd3, xd4, xd5, xd6, xd7; FLOAT yd0, yd1, yd2, yd3, yd4, yd5, yd6, yd7; if (n < 0) return(0); if ((da_r == 0.0) && (da_i == 0.0)) return(0); py = y; if ((1 == inc_x) && (1 == inc_y)) { FLOAT *x_pref, *y_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 64; pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } y_pref = y + pref_offset + 64; dar_vec = COPY_FLOAT_TO_VECTOR(da_r); dai_vec = COPY_FLOAT_TO_VECTOR(da_i); for (i = (n >> 4); i--;) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(y_pref, 0); PREF_OFFSET(y_pref, 32); PREF_OFFSET(y_pref, 64); PREF_OFFSET(y_pref, 96); x_pref += 32; y_pref += 32; LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7); LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7); PCKEVOD_W2_SP(x1, x0, x0r, x0i); PCKEVOD_W2_SP(y1, y0, y0r, y0i); PCKEVOD_W2_SP(x3, x2, x1r, x1i); PCKEVOD_W2_SP(y3, y2, y1r, y1i); PCKEVOD_W2_SP(x5, x4, x2r, x2i); PCKEVOD_W2_SP(y5, y4, y2r, y2i); PCKEVOD_W2_SP(x7, x6, x3r, x3i); PCKEVOD_W2_SP(y7, y6, y3r, y3i); FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); y0i OP0 dar_vec * x0i; y1i OP0 dar_vec * x1i; y2i OP0 dar_vec * x2i; y3i OP0 dar_vec * x3i; y0r OP1 dai_vec * x0i; y1r OP1 dai_vec * x1i; y2r OP1 dai_vec * x2i; y3r OP1 dai_vec * x3i; y0i OP2 dai_vec * x0r; y1i OP2 dai_vec * x1r; y2i OP2 dai_vec * x2r; y3i OP2 dai_vec * x3r; ILVRL_W2_SP(y0i, y0r, y0, y1); ILVRL_W2_SP(y1i, y1r, y2, y3); ILVRL_W2_SP(y2i, y2r, y4, y5); ILVRL_W2_SP(y3i, y3r, y6, y7); ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4); } if (n & 15) { if (n & 8) { LD_SP4_INC(x, 4, x0, x1, x2, x3); LD_SP4_INC(py, 4, y0, y1, y2, y3); PCKEVOD_W2_SP(x1, x0, x0r, 
x0i); PCKEVOD_W2_SP(y1, y0, y0r, y0i); PCKEVOD_W2_SP(x3, x2, x1r, x1i); PCKEVOD_W2_SP(y3, y2, y1r, y1i); FMADD2(x0r, x1r, dar_vec, y0r, y1r); y0i OP0 dar_vec * x0i; y1i OP0 dar_vec * x1i; y0r OP1 dai_vec * x0i; y1r OP1 dai_vec * x1i; y0i OP2 dai_vec * x0r; y1i OP2 dai_vec * x1r; ILVRL_W2_SP(y0i, y0r, y0, y1); ILVRL_W2_SP(y1i, y1r, y2, y3); ST_SP4_INC(y0, y1, y2, y3, y, 4); } if (n & 4) { LD_SP2_INC(x, 4, x0, x1); LD_SP2_INC(py, 4, y0, y1); PCKEVOD_W2_SP(x1, x0, x0r, x0i); PCKEVOD_W2_SP(y1, y0, y0r, y0i); y0r += dar_vec * x0r; y0i OP0 dar_vec * x0i; y0r OP1 dai_vec * x0i; y0i OP2 dai_vec * x0r; ILVRL_W2_SP(y0i, y0r, y0, y1); ST_SP2_INC(y0, y1, y, 4); } if (n & 2) { LD_GP4_INC(x, 1, xd0, xd1, xd2, xd3); LD_GP4_INC(py, 1, yd0, yd1, yd2, yd3); FMADD2(xd0, xd2, da_r, yd0, yd2); yd1 OP0 da_r * xd1; yd3 OP0 da_r * xd3; yd0 OP1 da_i * xd1; yd2 OP1 da_i * xd3; yd1 OP2 da_i * xd0; yd3 OP2 da_i * xd2; ST_GP4_INC(yd0, yd1, yd2, yd3, y, 1); } if (n & 1) { LD_GP2_INC(x, 1, xd0, xd1); LD_GP2_INC(py, 1, yd0, yd1); yd0 += da_r * xd0; yd1 OP0 da_r * xd1; yd0 OP1 da_i * xd1; yd1 OP2 da_i * xd0; ST_GP2_INC(yd0, yd1, y, 1); } } } else if (1 == inc_y) { FLOAT *y_pref; BLASLONG pref_offset; v4f32 x8, x9, x10, x11, x12, x13, x14; pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } y_pref = y + pref_offset + 64; inc_x2 = 2 * inc_x; dar_vec = COPY_FLOAT_TO_VECTOR(da_r); dai_vec = COPY_FLOAT_TO_VECTOR(da_i); for (i = (n >> 4); i--;) { PREF_OFFSET(y_pref, 0); PREF_OFFSET(y_pref, 32); PREF_OFFSET(y_pref, 64); PREF_OFFSET(y_pref, 96); y_pref += 32; LD_SP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x14); LD_SP7_INC(x, inc_x2, x8, x9, x10, x11, x12, x13, x7); PCKEV_D2_SP(x1, x0, x3, x2, x0, x1); PCKEV_D2_SP(x5, x4, x14, x6, x2, x3); PCKEV_D2_SP(x9, x8, x11, x10, x4, x5); x6 = (v4f32) __msa_pckev_d((v2i64) x13, (v2i64) x12); x7 = (v4f32) __msa_insert_w((v4i32) x7, 2, *((int *) x)); x7 = (v4f32) __msa_insert_w((v4i32) x7, 3, *((int *) (x + 1))); x += inc_x2; LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7); PCKEVOD_W2_SP(x1, x0, x0r, x0i); PCKEVOD_W2_SP(y1, y0, y0r, y0i); PCKEVOD_W2_SP(x3, x2, x1r, x1i); PCKEVOD_W2_SP(y3, y2, y1r, y1i); PCKEVOD_W2_SP(x5, x4, x2r, x2i); PCKEVOD_W2_SP(y5, y4, y2r, y2i); PCKEVOD_W2_SP(x7, x6, x3r, x3i); PCKEVOD_W2_SP(y7, y6, y3r, y3i); FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); y0i OP0 dar_vec * x0i; y1i OP0 dar_vec * x1i; y2i OP0 dar_vec * x2i; y3i OP0 dar_vec * x3i; y0r OP1 dai_vec * x0i; y1r OP1 dai_vec * x1i; y2r OP1 dai_vec * x2i; y3r OP1 dai_vec * x3i; y0i OP2 dai_vec * x0r; y1i OP2 dai_vec * x1r; y2i OP2 dai_vec * x2r; y3i OP2 dai_vec * x3r; ILVRL_W2_SP(y0i, y0r, y0, y1); ILVRL_W2_SP(y1i, y1r, y2, y3); ILVRL_W2_SP(y2i, y2r, y4, y5); ILVRL_W2_SP(y3i, y3r, y6, y7); ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4); } if (n & 15) { if (n & 8) { LD_SP7_INC(x, inc_x2, x0, x1, x2, x6, x4, x5, x3); PCKEV_D2_SP(x1, x0, x6, x2, x0, x1); x2 = (v4f32) __msa_pckev_d((v2i64) x5, (v2i64) x4); x3 = (v4f32) __msa_insert_w((v4i32) x3, 2, *((int *) x)); x3 = (v4f32) __msa_insert_w((v4i32) x3, 3, *((int *) (x + 1))); x += inc_x2; LD_SP4_INC(py, 4, y0, y1, y2, y3); PCKEVOD_W2_SP(x1, x0, x0r, x0i); PCKEVOD_W2_SP(y1, y0, y0r, y0i); PCKEVOD_W2_SP(x3, x2, x1r, x1i); PCKEVOD_W2_SP(y3, y2, y1r, y1i); FMADD2(x0r, x1r, dar_vec, y0r, y1r); y0i OP0 dar_vec * x0i; y1i OP0 dar_vec * x1i; y0r OP1 dai_vec * x0i; y1r OP1 dai_vec * x1i; y0i OP2 dai_vec * x0r; y1i OP2 dai_vec * x1r; 
ILVRL_W2_SP(y0i, y0r, y0, y1); ILVRL_W2_SP(y1i, y1r, y2, y3); ST_SP4_INC(y0, y1, y2, y3, y, 4); } if (n & 4) { LD_SP3_INC(x, inc_x2, x0, x2, x1); x0 = (v4f32) __msa_pckev_d((v2i64) x2, (v2i64) x0); x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x)); x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) (x + 1))); x += inc_x2; LD_SP2_INC(py, 4, y0, y1); PCKEVOD_W2_SP(x1, x0, x0r, x0i); PCKEVOD_W2_SP(y1, y0, y0r, y0i); y0r += dar_vec * x0r; y0i OP0 dar_vec * x0i; y0r OP1 dai_vec * x0i; y0i OP2 dai_vec * x0r; ILVRL_W2_SP(y0i, y0r, y0, y1); ST_SP2_INC(y0, y1, y, 4); } if (n & 2) { xd0 = x[0]; xd1 = x[1]; x += inc_x2; xd2 = x[0]; xd3 = x[1]; x += inc_x2; LD_GP4_INC(py, 1, yd0, yd1, yd2, yd3); FMADD2(xd0, xd2, da_r, yd0, yd2); yd1 OP0 da_r * xd1; yd3 OP0 da_r * xd3; yd0 OP1 da_i * xd1; yd2 OP1 da_i * xd3; yd1 OP2 da_i * xd0; yd3 OP2 da_i * xd2; ST_GP4_INC(yd0, yd1, yd2, yd3, y, 1); } if (n & 1) { LD_GP2_INC(x, 1, xd0, xd1); LD_GP2_INC(py, 1, yd0, yd1); yd0 += da_r * xd0; yd1 OP0 da_r * xd1; yd0 OP1 da_i * xd1; yd1 OP2 da_i * xd0; ST_GP2_INC(yd0, yd1, y, 1); } } } else { inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; for (i = (n >> 2); i--;) { xd0 = x[0]; xd1 = x[1]; x += inc_x2; xd2 = x[0]; xd3 = x[1]; x += inc_x2; xd4 = x[0]; xd5 = x[1]; x += inc_x2; xd6 = x[0]; xd7 = x[1]; x += inc_x2; yd0 = py[0]; yd1 = py[1]; py += inc_y2; yd2 = py[0]; yd3 = py[1]; py += inc_y2; yd4 = py[0]; yd5 = py[1]; py += inc_y2; yd6 = py[0]; yd7 = py[1]; py += inc_y2; FMADD4(xd0, xd2, xd4, xd6, da_r, yd0, yd2, yd4, yd6); yd1 OP0 da_r * xd1; yd3 OP0 da_r * xd3; yd5 OP0 da_r * xd5; yd7 OP0 da_r * xd7; yd0 OP1 da_i * xd1; yd2 OP1 da_i * xd3; yd4 OP1 da_i * xd5; yd6 OP1 da_i * xd7; yd1 OP2 da_i * xd0; yd3 OP2 da_i * xd2; yd5 OP2 da_i * xd4; yd7 OP2 da_i * xd6; y[0] = yd0; y[1] = yd1; y += inc_y2; y[0] = yd2; y[1] = yd3; y += inc_y2; y[0] = yd4; y[1] = yd5; y += inc_y2; y[0] = yd6; y[1] = yd7; y += inc_y2; } if (n & 3) { if (n & 2) { xd0 = x[0]; xd1 = x[1]; x += inc_x2; xd2 = x[0]; xd3 = x[1]; x += inc_x2; yd0 = py[0]; yd1 = py[1]; py += inc_y2; yd2 = py[0]; yd3 = py[1]; py += inc_y2; FMADD2(xd0, xd2, da_r, yd0, yd2); yd1 OP0 da_r * xd1; yd3 OP0 da_r * xd3; yd0 OP1 da_i * xd1; yd2 OP1 da_i * xd3; yd1 OP2 da_i * xd0; yd3 OP2 da_i * xd2; y[0] = yd0; y[1] = yd1; y += inc_y2; y[0] = yd2; y[1] = yd3; y += inc_y2; } if (n & 1) { xd0 = x[0]; xd1 = x[1]; yd0 = y[0]; yd1 = y[1]; yd0 += da_r * xd0; yd1 OP0 da_r * xd1; yd0 OP1 da_i * xd1; yd1 OP2 da_i * xd0; y[0] = yd0; y[1] = yd1; } } } return (0); } OpenBLAS-0.2.20/kernel/mips/ccopy_msa.c000066400000000000000000000146701313527062700175250ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i, inc_x2, inc_y2; v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; FLOAT f0, f1, f2, f3, f4, f5, f6, f7; if (n < 0) return (0); if ((1 == inc_x) && (1 == inc_y)) { if (n > 31) { FLOAT *x_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 128 + 32; LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7); for (i = (n >> 5) - 1; i--;) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(x_pref, 128); PREF_OFFSET(x_pref, 160); PREF_OFFSET(x_pref, 192); PREF_OFFSET(x_pref, 224); x_pref += 64; x8 = LD_SP(x); x += 4; ST_SP(x0, y); y += 4; x9 = LD_SP(x); x += 4; ST_SP(x1, y); y += 4; x10 = LD_SP(x); x += 4; ST_SP(x2, y); y += 4; x11 = LD_SP(x); x += 4; ST_SP(x3, y); y += 4; x12 = LD_SP(x); x += 4; ST_SP(x4, y); y += 4; x13 = LD_SP(x); x += 4; ST_SP(x5, y); y += 4; x14 = LD_SP(x); x += 4; ST_SP(x6, y); y += 4; x15 = LD_SP(x); x += 4; ST_SP(x7, y); y += 4; x0 = LD_SP(x); x += 4; ST_SP(x8, y); y += 4; x1 = LD_SP(x); x += 4; ST_SP(x9, y); y += 4; x2 = LD_SP(x); x += 4; ST_SP(x10, y); y += 4; x3 = LD_SP(x); x += 4; ST_SP(x11, y); y += 4; x4 = LD_SP(x); x += 4; ST_SP(x12, y); y += 4; x5 = LD_SP(x); x += 4; ST_SP(x13, y); y += 4; x6 = LD_SP(x); x += 4; ST_SP(x14, y); y += 4; x7 = LD_SP(x); x += 4; ST_SP(x15, y); y += 4; } x8 = LD_SP(x); x += 4; x9 = LD_SP(x); x += 4; ST_SP(x0, y); y += 4; x10 = LD_SP(x); x += 4; ST_SP(x1, y); y += 4; x11 = LD_SP(x); x += 4; ST_SP(x2, y); y += 4; x12 = LD_SP(x); x += 4; ST_SP(x3, y); y += 4; x13 = LD_SP(x); x += 4; ST_SP(x4, y); y += 4; x14 = LD_SP(x); x += 4; ST_SP(x5, y); y += 4; x15 = LD_SP(x); x += 4; ST_SP(x6, y); y += 4; ST_SP(x7, y); y += 4; ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 4); } if (n & 31) { if (n & 16) { LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7); ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 4); } if (n & 8) { LD_SP4_INC(x, 4, x0, x1, x2, x3); ST_SP4_INC(x0, x1, x2, x3, y, 4); } if (n & 4) { LD_SP2_INC(x, 4, x0, x1); ST_SP2_INC(x0, x1, y, 4); } if (n & 2) { LD_GP4_INC(x, 1, f0, f1, f2, f3); ST_GP4_INC(f0, f1, f2, f3, y, 1); } if (n & 1) { LD_GP2_INC(x, 1, f0, f1); ST_GP2_INC(f0, f1, y, 1); } } } else { inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; for (i = (n >> 2); i--;) { f0 = *x; f1 = *(x+1); x += inc_x2; f2 = *x; f3 = *(x+1); x += inc_x2; f4 = *x; f5 = *(x+1); x += inc_x2; f6 = *x; f7 = *(x+1); x += inc_x2; *y = f0; *(y+1) = f1; y += inc_y2; *y 
= f2; *(y+1) = f3; y += inc_y2; *y = f4; *(y+1) = f5; y += inc_y2; *y = f6; *(y+1) = f7; y += inc_y2; } if (n & 2) { f0 = *x; f1 = *(x+1); x += inc_x2; f2 = *x; f3 = *(x+1); x += inc_x2; *y = f0; *(y+1) = f1; y += inc_y2; *y = f2; *(y+1) = f3; y += inc_y2; } if (n & 1) { LD_GP2_INC(x, 1, f0, f1); ST_GP2_INC(f0, f1, y, 1); } } return (0); } OpenBLAS-0.2.20/kernel/mips/cdot_msa.c000066400000000000000000000300221313527062700173260ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #include "common.h" #include "macros_msa.h" #if !defined(CONJ) #define OP1 -= #define OP2 += #define OP3 - #define OP4 + #else #define OP1 += #define OP2 -= #define OP3 + #define OP4 - #endif OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i = 0; FLOAT dot[2]; BLASLONG inc_x2, inc_y2; FLOAT x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7; v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7, vx8, vx9, vx10, vx11; v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7, vy8, vy9, vy10, vy11; v4f32 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i; v4f32 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i; v4f32 dot0 = {0, 0, 0, 0}; v4f32 dot1 = {0, 0, 0, 0}; v4f32 dot2 = {0, 0, 0, 0}; v4f32 dot3 = {0, 0, 0, 0}; v4f32 dot4 = {0, 0, 0, 0}; v4f32 dot5 = {0, 0, 0, 0}; v4f32 dot6 = {0, 0, 0, 0}; v4f32 dot7 = {0, 0, 0, 0}; OPENBLAS_COMPLEX_FLOAT result; dot[0] = 0.0; dot[1] = 0.0; CREAL(result) = 0.0; CIMAG(result) = 0.0; if (n < 1) return (result); if ((1 == inc_x) && (1 == inc_y)) { if (n > 15) { FLOAT *x_pref, *y_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 64 + 16; pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } y_pref = y + pref_offset + 64 + 16; LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); for (i = (n >> 4) - 1; i--;) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(y_pref, 0); PREF_OFFSET(y_pref, 32); PREF_OFFSET(y_pref, 64); PREF_OFFSET(y_pref, 96); x_pref += 32; y_pref += 32; vx4 = LD_SP(x); x += 4; vx1r = (v4f32) __msa_pckev_w((v4i32) vx3, (v4i32) vx2); dot0 += (vx0r * vy0r); vx5 = LD_SP(x); x += 4; vx1i = (v4f32) __msa_pckod_w((v4i32) vx3, (v4i32) vx2); dot1 OP2 (vx0i * vy0r); vy4 = LD_SP(y); y += 4; vy1r = (v4f32) __msa_pckev_w((v4i32) vy3, (v4i32) vy2); dot2 += (vx1r * vy1r); vy5 = LD_SP(y); y += 4; vy1i = (v4f32) __msa_pckod_w((v4i32) vy3, (v4i32) vy2); dot3 OP2 (vx1i * vy1r); vx6 = LD_SP(x); x += 4; vx7 = LD_SP(x); x += 4; vy6 = LD_SP(y); y += 4; vy7 = LD_SP(y); y += 4; vx8 = LD_SP(x); x += 4; dot0 OP1 (vx0i * vy0i); vx9 = LD_SP(x); x += 4; vx2r = (v4f32) __msa_pckev_w((v4i32) vx5, (v4i32) vx4); dot1 += (vx0r * vy0i); vy8 = LD_SP(y); y += 4; vx2i = (v4f32) __msa_pckod_w((v4i32) vx5, (v4i32) vx4); dot2 OP1 (vx1i * vy1i); vy9 = LD_SP(y); y += 4; vy2r = (v4f32) __msa_pckev_w((v4i32) vy5, (v4i32) vy4); dot3 += (vx1r * vy1i); vx10 = LD_SP(x); x += 4; vy2i = (v4f32) __msa_pckod_w((v4i32) vy5, (v4i32) vy4); vx11 = LD_SP(x); x += 4; vx3r = (v4f32) __msa_pckev_w((v4i32) vx7, (v4i32) vx6); dot4 += (vx2r * vy2r); vy10 = LD_SP(y); y += 4; vx3i = (v4f32) __msa_pckod_w((v4i32) vx7, (v4i32) vx6); dot5 OP2 (vx2i * vy2r); vy11 = LD_SP(y); y += 4; vy3r = (v4f32) __msa_pckev_w((v4i32) vy7, (v4i32) vy6); vy3i = (v4f32) __msa_pckod_w((v4i32) vy7, (v4i32) vy6); dot6 += (vx3r * vy3r); vx0r = (v4f32) __msa_pckev_w((v4i32) vx9, (v4i32) vx8); dot7 OP2 (vx3i * vy3r); vx0i = (v4f32) __msa_pckod_w((v4i32) vx9, (v4i32) vx8); vy0r = (v4f32) __msa_pckev_w((v4i32) vy9, (v4i32) vy8); vx2 = vx10; vy0i = (v4f32) __msa_pckod_w((v4i32) vy9, 
(v4i32) vy8); vx3 = vx11; dot4 OP1 (vx2i * vy2i); vy2 = vy10; dot5 += (vx2r * vy2i); vy3 = vy11; dot6 OP1 (vx3i * vy3i); dot7 += (vx3r * vy3i); } vx4 = LD_SP(x); x += 4; vx1r = (v4f32) __msa_pckev_w((v4i32) vx3, (v4i32) vx2); dot0 += (vx0r * vy0r); vx5 = LD_SP(x); x += 4; vx1i = (v4f32) __msa_pckod_w((v4i32) vx3, (v4i32) vx2); dot1 OP2 (vx0i * vy0r); vy4 = LD_SP(y); y += 4; vy1r = (v4f32) __msa_pckev_w((v4i32) vy3, (v4i32) vy2); dot2 += (vx1r * vy1r); vy5 = LD_SP(y); y += 4; vy1i = (v4f32) __msa_pckod_w((v4i32) vy3, (v4i32) vy2); dot3 OP2 (vx1i * vy1r); vx6 = LD_SP(x); x += 4; vx7 = LD_SP(x); x += 4; vy6 = LD_SP(y); y += 4; vy7 = LD_SP(y); y += 4; dot0 OP1 (vx0i * vy0i); vx2r = (v4f32) __msa_pckev_w((v4i32) vx5, (v4i32) vx4); dot1 += (vx0r * vy0i); vx2i = (v4f32) __msa_pckod_w((v4i32) vx5, (v4i32) vx4); dot2 OP1 (vx1i * vy1i); vy2r = (v4f32) __msa_pckev_w((v4i32) vy5, (v4i32) vy4); dot3 += (vx1r * vy1i); vy2i = (v4f32) __msa_pckod_w((v4i32) vy5, (v4i32) vy4); vx3r = (v4f32) __msa_pckev_w((v4i32) vx7, (v4i32) vx6); dot4 += (vx2r * vy2r); vx3i = (v4f32) __msa_pckod_w((v4i32) vx7, (v4i32) vx6); dot5 OP2 (vx2i * vy2r); vy3r = (v4f32) __msa_pckev_w((v4i32) vy7, (v4i32) vy6); vy3i = (v4f32) __msa_pckod_w((v4i32) vy7, (v4i32) vy6); dot6 += (vx3r * vy3r); dot7 OP2 (vx3i * vy3r); dot4 OP1 (vx2i * vy2i); dot5 += (vx2r * vy2i); dot6 OP1 (vx3i * vy3i); dot7 += (vx3r * vy3i); } if (n & 15) { if (n & 8) { LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i); PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i); dot0 += (vx0r * vy0r); dot0 OP1 (vx0i * vy0i); dot1 OP2 (vx0i * vy0r); dot1 += (vx0r * vy0i); dot2 += (vx1r * vy1r); dot2 OP1 (vx1i * vy1i); dot3 OP2 (vx1i * vy1r); dot3 += (vx1r * vy1i); } if (n & 4) { LD_SP2_INC(x, 4, vx0, vx1); LD_SP2_INC(y, 4, vy0, vy1); PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); dot0 += (vx0r * vy0r); dot0 OP1 (vx0i * vy0i); dot1 OP2 (vx0i * vy0r); dot1 += (vx0r * vy0i); } if (n & 2) { LD_GP4_INC(x, 1, x0, x1, x2, x3); LD_GP4_INC(y, 1, y0, y1, y2, y3); dot[0] += (x0 * y0 OP3 x1 * y1); dot[1] OP2 (x1 * y0 OP4 x0 * y1); dot[0] += (x2 * y2 OP3 x3 * y3); dot[1] OP2 (x3 * y2 OP4 x2 * y3); } if (n & 1) { LD_GP2_INC(x, 1, x0, x1); LD_GP2_INC(y, 1, y0, y1); dot[0] += (x0 * y0 OP3 x1 * y1); dot[1] OP2 (x1 * y0 OP4 x0 * y1); } } dot0 += dot2 + dot4 + dot6; dot1 += dot3 + dot5 + dot7; dot[0] += (dot0[0] + dot0[1] + dot0[2] + dot0[3]); dot[1] += (dot1[0] + dot1[1] + dot1[2] + dot1[3]); } else { inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; for (i = (n >> 2); i--;) { x0 = *x; x1 = *(x + 1); x += inc_x2; x2 = *x; x3 = *(x + 1); x += inc_x2; x4 = *x; x5 = *(x + 1); x += inc_x2; x6 = *x; x7 = *(x + 1); x += inc_x2; y0 = *y; y1 = *(y + 1); y += inc_y2; y2 = *y; y3 = *(y + 1); y += inc_y2; y4 = *y; y5 = *(y + 1); y += inc_y2; y6 = *y; y7 = *(y + 1); y += inc_y2; dot[0] += (x0 * y0 OP3 x1 * y1); dot[1] OP2 (x1 * y0 OP4 x0 * y1); dot[0] += (x2 * y2 OP3 x3 * y3); dot[1] OP2 (x3 * y2 OP4 x2 * y3); dot[0] += (x4 * y4 OP3 x5 * y5); dot[1] OP2 (x5 * y4 OP4 x4 * y5); dot[0] += (x6 * y6 OP3 x7 * y7); dot[1] OP2 (x7 * y6 OP4 x6 * y7); } if (n & 2) { x0 = *x; x1 = *(x + 1); x += inc_x2; x2 = *x; x3 = *(x + 1); x += inc_x2; y0 = *y; y1 = *(y + 1); y += inc_y2; y2 = *y; y3 = *(y + 1); y += inc_y2; dot[0] += (x0 * y0 OP3 x1 * y1); dot[1] OP2 (x1 * y0 OP4 x0 * y1); dot[0] += (x2 * y2 OP3 x3 * y3); dot[1] OP2 (x3 * y2 OP4 x2 * y3); } if (n & 1) { x0 = *x; x1 = *(x + 1); 
x += inc_x2; y0 = *y; y1 = *(y + 1); y += inc_y2; dot[0] += (x0 * y0 OP3 x1 * y1); dot[1] OP2 (x1 * y0 OP4 x0 * y1); } } CREAL(result) = dot[0]; CIMAG(result) = dot[1]; return (result); } OpenBLAS-0.2.20/kernel/mips/cgemm_kernel_8x4_msa.c000066400000000000000000002275701313527062700215500ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
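   Overview (informal sketch, not normative): the CGEMM_KERNEL_* macros below
   form the complex single-precision GEMM micro-kernel for MIPS MSA, blocked at
   up to 8 rows of packed A by 4 columns of packed B.  The OP0..OP4 parameters
   are token-pasted onto the accumulator updates, so one macro body covers the
   initialization pass and all conjugation variants.  Per complex element the
   update is roughly (a_r, a_i, b_r, b_i are illustrative names for the real
   and imaginary parts of one A and one B value):

       res_r OP0= a_r * b_r;        res_r OP1= a_i * b_i;
       res_i OP2= (OP4 a_r) * b_i;  res_i OP3= a_i * b_r;

   With (OP0, OP1, OP2, OP3, OP4) = (+, -, +, +, <empty>) this is the ordinary
   complex multiply-accumulate
       res_r += a_r * b_r - a_i * b_i;
       res_i += a_r * b_i + a_i * b_r;
   the other sign combinations give the conjugate-A, conjugate-B and
   conjugate-both forms, and the first invocation of each block passes empty
   OP0/OP2 so the accumulators are assigned rather than accumulated.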
*******************************************************************************/ #include "common.h" #include "macros_msa.h" #define CGEMM_KERNEL_8X4_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ LD_SP2_INC(pb0, 4, src_b0, src_b1); \ \ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \ \ /* 0th col */ \ SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \ res0_r OP0## = src_a0r * src_br; \ res0_r OP1## = src_a0i * src_bi; \ res0_i OP2## = (OP4 src_a0r) * src_bi; \ res0_i OP3## = src_a0i * src_br; \ \ res1_r OP0## = src_a1r * src_br; \ res1_r OP1## = src_a1i * src_bi; \ res1_i OP2## = (OP4 src_a1r) * src_bi; \ res1_i OP3## = src_a1i * src_br; \ \ /* 1st col */ \ SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \ res2_r OP0## = src_a0r * src_br; \ res2_r OP1## = src_a0i * src_bi; \ res2_i OP2## = (OP4 src_a0r) * src_bi; \ res2_i OP3## = src_a0i * src_br; \ \ res3_r OP0## = src_a1r * src_br; \ res3_r OP1## = src_a1i * src_bi; \ res3_i OP2## = (OP4 src_a1r) * src_bi; \ res3_i OP3## = src_a1i * src_br; \ \ /* 2nd col */ \ SPLATI_W2_SP(src_b1, 0, src_br, src_bi); \ res4_r OP0## = src_a0r * src_br; \ res4_r OP1## = src_a0i * src_bi; \ res4_i OP2## = (OP4 src_a0r) * src_bi; \ res4_i OP3## = src_a0i * src_br; \ \ res5_r OP0## = src_a1r * src_br; \ res5_r OP1## = src_a1i * src_bi; \ res5_i OP2## = (OP4 src_a1r) * src_bi; \ res5_i OP3## = src_a1i * src_br; \ \ /* 3rd col */ \ SPLATI_W2_SP(src_b1, 2, src_br, src_bi); \ res6_r OP0## = src_a0r * src_br; \ res6_r OP1## = src_a0i * src_bi; \ res6_i OP2## = (OP4 src_a0r) * src_bi; \ res6_i OP3## = src_a0i * src_br; \ \ res7_r OP0## = src_a1r * src_br; \ res7_r OP1## = src_a1i * src_bi; \ res7_i OP2## = (OP4 src_a1r) * src_bi; \ res7_i OP3## = src_a1i * src_br; \ } #define CGEMM_KERNEL_8X2_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ src_b0 = LD_SP(pb0); \ \ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \ \ /* 0th col */ \ SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \ res0_r OP0## = src_a0r * src_br; \ res0_r OP1## = src_a0i * src_bi; \ res0_i OP2## = (OP4 src_a0r) * src_bi; \ res0_i OP3## = src_a0i * src_br; \ \ res1_r OP0## = src_a1r * src_br; \ res1_r OP1## = src_a1i * src_bi; \ res1_i OP2## = (OP4 src_a1r) * src_bi; \ res1_i OP3## = src_a1i * src_br; \ \ /* 1st col */ \ SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \ res2_r OP0## = src_a0r * src_br; \ res2_r OP1## = src_a0i * src_bi; \ res2_i OP2## = (OP4 src_a0r) * src_bi; \ res2_i OP3## = src_a0i * src_br; \ \ res3_r OP0## = src_a1r * src_br; \ res3_r OP1## = src_a1i * src_bi; \ res3_i OP2## = (OP4 src_a1r) * src_bi; \ res3_i OP3## = src_a1i * src_br; \ } #define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ \ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \ \ /* 0th col */ \ res0_r OP0## = src_a0r * src_br; \ res0_r OP1## = src_a0i * src_bi; \ res0_i OP2## = (OP4 src_a0r) * src_bi; \ res0_i OP3## = src_a0i * src_br; \ \ res1_r OP0## = src_a1r * src_br; \ res1_r OP1## = src_a1i * src_bi; \ res1_i OP2## = (OP4 src_a1r) * src_bi; \ res1_i OP3## = src_a1i * src_br; \ } #define CGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_SP2_INC(pa0, 4, src_a0, src_a1); \ LD_SP2_INC(pb0, 4, src_b0, src_b1); \ \ 
PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ \ /* 0th col */ \ SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \ res0_r OP0## = src_a0r * src_br; \ res0_r OP1## = src_a0i * src_bi; \ res0_i OP2## = OP4 src_a0r * src_bi; \ res0_i OP3## = src_a0i * src_br; \ \ /* 1st col */ \ SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \ res2_r OP0## = src_a0r * src_br; \ res2_r OP1## = src_a0i * src_bi; \ res2_i OP2## = OP4 src_a0r * src_bi; \ res2_i OP3## = src_a0i * src_br; \ \ /* 2nd col */ \ SPLATI_W2_SP(src_b1, 0, src_br, src_bi); \ res4_r OP0## = src_a0r * src_br; \ res4_r OP1## = src_a0i * src_bi; \ res4_i OP2## = OP4 src_a0r * src_bi; \ res4_i OP3## = src_a0i * src_br; \ \ /* 3rd col */ \ SPLATI_W2_SP(src_b1, 2, src_br, src_bi); \ res6_r OP0## = src_a0r * src_br; \ res6_r OP1## = src_a0i * src_bi; \ res6_i OP2## = OP4 src_a0r * src_bi; \ res6_i OP3## = src_a0i * src_br; \ } #define CGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_SP2_INC(pa0, 4, src_a0, src_a1); \ src_b0 = LD_SP(pb0); \ \ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ \ /* 0th col */ \ SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \ res0_r OP0## = src_a0r * src_br; \ res0_r OP1## = src_a0i * src_bi; \ res0_i OP2## = OP4 src_a0r * src_bi; \ res0_i OP3## = src_a0i * src_br; \ \ /* 1st col */ \ SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \ res2_r OP0## = src_a0r * src_br; \ res2_r OP1## = src_a0i * src_bi; \ res2_i OP2## = OP4 src_a0r * src_bi; \ res2_i OP3## = src_a0i * src_br; \ } #define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_SP2_INC(pa0, 4, src_a0, src_a1); \ src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ \ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ \ /* 0th col */ \ res0_r OP0## = src_a0r * src_br; \ res0_r OP1## = src_a0i * src_bi; \ res0_i OP2## = OP4 src_a0r * src_bi; \ res0_i OP3## = src_a0i * src_br; \ } #define CGEMM_KERNEL_2X4(OP0, OP1, OP2, OP3, OP4) \ { \ a0_r = pa0[0]; \ a0_i = pa0[1]; \ b0_r = pb0[0]; \ b0_i = pb0[1]; \ \ res0 OP0## = a0_r * b0_r; \ res0 OP1## = a0_i * b0_i; \ res1 OP2## = OP4 a0_r * b0_i; \ res1 OP3## = a0_i * b0_r; \ \ a1_r = pa0[2]; \ a1_i = pa0[3]; \ res2 OP0## = a1_r * b0_r; \ res2 OP1## = a1_i * b0_i; \ res3 OP2## = OP4 a1_r * b0_i; \ res3 OP3## = a1_i * b0_r; \ \ /* 1st col */ \ b1_r = pb0[2]; \ b1_i = pb0[3]; \ res4 OP0## = a0_r * b1_r; \ res4 OP1## = a0_i * b1_i; \ res5 OP2## = OP4 a0_r * b1_i; \ res5 OP3## = a0_i * b1_r; \ \ res6 OP0## = a1_r * b1_r; \ res6 OP1## = a1_i * b1_i; \ res7 OP2## = OP4 a1_r * b1_i; \ res7 OP3## = a1_i * b1_r; \ \ /* 2nd col */ \ b2_r = pb0[4]; \ b2_i = pb0[5]; \ res8 OP0## = a0_r * b2_r; \ res8 OP1## = a0_i * b2_i; \ res9 OP2## = OP4 a0_r * b2_i; \ res9 OP3## = a0_i * b2_r; \ \ res10 OP0## = a1_r * b2_r; \ res10 OP1## = a1_i * b2_i; \ res11 OP2## = OP4 a1_r * b2_i; \ res11 OP3## = a1_i * b2_r; \ \ /* 3rd col */ \ b3_r = pb0[6]; \ b3_i = pb0[7]; \ res12 OP0## = a0_r * b3_r; \ res12 OP1## = a0_i * b3_i; \ res13 OP2## = OP4 a0_r * b3_i; \ res13 OP3## = a0_i * b3_r; \ \ res14 OP0## = a1_r * b3_r; \ res14 OP1## = a1_i * b3_i; \ res15 OP2## = OP4 a1_r * b3_i; \ res15 OP3## = a1_i * b3_r; \ } #define CGEMM_KERNEL_2X2(OP0, OP1, OP2, OP3, OP4) \ { \ /* 0th col */ \ a0_r = pa0[0]; \ a0_i = pa0[1]; \ b0_r = pb0[0]; \ b0_i = pb0[1]; \ \ res0 OP0## = a0_r * b0_r; \ res0 OP1## = a0_i * b0_i; \ res1 OP2## = OP4 a0_r * b0_i; \ res1 OP3## = a0_i * b0_r; \ \ a1_r = pa0[2]; \ a1_i = pa0[3]; \ res2 OP0## = a1_r * b0_r; \ res2 OP1## = a1_i * b0_i; \ res3 OP2## = OP4 a1_r * b0_i; \ res3 OP3## = a1_i * 
b0_r; \ \ /* 1st col */ \ b1_r = pb0[2]; \ b1_i = pb0[3]; \ res4 OP0## = a0_r * b1_r; \ res4 OP1## = a0_i * b1_i; \ res5 OP2## = OP4 a0_r * b1_i; \ res5 OP3## = a0_i * b1_r; \ \ res6 OP0## = a1_r * b1_r; \ res6 OP1## = a1_i * b1_i; \ res7 OP2## = OP4 a1_r * b1_i; \ res7 OP3## = a1_i * b1_r; \ } #define CGEMM_KERNEL_2X1(OP0, OP1, OP2, OP3, OP4) \ { \ /* 0th col */ \ a0_r = pa0[0]; \ a0_i = pa0[1]; \ b0_r = pb0[0]; \ b0_i = pb0[1]; \ \ res0 OP0## = a0_r * b0_r; \ res0 OP1## = a0_i * b0_i; \ res1 OP2## = OP4 a0_r * b0_i; \ res1 OP3## = a0_i * b0_r; \ \ a1_r = pa0[2]; \ a1_i = pa0[3]; \ res2 OP0## = a1_r * b0_r; \ res2 OP1## = a1_i * b0_i; \ res3 OP2## = OP4 a1_r * b0_i; \ res3 OP3## = a1_i * b0_r; \ } #define CGEMM_KERNEL_1X4(OP0, OP1, OP2, OP3, OP4) \ { \ /* 0th col */ \ a0_r = pa0[0]; \ a0_i = pa0[1]; \ b0_r = pb0[0]; \ b0_i = pb0[1]; \ \ res0 OP0## = a0_r * b0_r; \ res0 OP1## = a0_i * b0_i; \ res1 OP2## = OP4 a0_r * b0_i; \ res1 OP3## = a0_i * b0_r; \ \ /* 1st col */ \ b1_r = pb0[2]; \ b1_i = pb0[3]; \ res2 OP0## = a0_r * b1_r; \ res2 OP1## = a0_i * b1_i; \ res3 OP2## = OP4 a0_r * b1_i; \ res3 OP3## = a0_i * b1_r; \ \ /* 2nd col */ \ b2_r = pb0[4]; \ b2_i = pb0[5]; \ res4 OP0## = a0_r * b2_r; \ res4 OP1## = a0_i * b2_i; \ res5 OP2## = OP4 a0_r * b2_i; \ res5 OP3## = a0_i * b2_r; \ \ /* 3rd col */ \ b3_r = pb0[6]; \ b3_i = pb0[7]; \ res6 OP0## = a0_r * b3_r; \ res6 OP1## = a0_i * b3_i; \ res7 OP2## = OP4 a0_r * b3_i; \ res7 OP3## = a0_i * b3_r; \ } #define CGEMM_KERNEL_1X2(OP0, OP1, OP2, OP3, OP4) \ { \ /* 0th col */ \ a0_r = pa0[0]; \ a0_i = pa0[1]; \ b0_r = pb0[0]; \ b0_i = pb0[1]; \ \ res0 OP0## = a0_r * b0_r; \ res0 OP1## = a0_i * b0_i; \ res1 OP2## = OP4 a0_r * b0_i; \ res1 OP3## = a0_i * b0_r; \ \ /* 1st col */ \ b1_r = pb0[2]; \ b1_i = pb0[3]; \ res2 OP0## = a0_r * b1_r; \ res2 OP1## = a0_i * b1_i; \ res3 OP2## = OP4 a0_r * b1_i; \ res3 OP3## = a0_i * b1_r; \ } #define CGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4) \ { \ /* 0th col */ \ a0_r = pa0[0]; \ a0_i = pa0[1]; \ b0_r = pb0[0]; \ b0_i = pb0[1]; \ \ res0 OP0## = a0_r * b0_r; \ res0 OP1## = a0_i * b0_i; \ res1 OP2## = OP4 a0_r * b0_i; \ res1 OP3## = a0_i * b0_r; \ } #define CGEMM_SCALE_8X4_MSA \ { \ LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \ \ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ \ dst0_r += alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i += alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ dst1_r += alpha_r * res1_r; \ dst1_r -= alpha_i * res1_i; \ dst1_i += alpha_r * res1_i; \ dst1_i += alpha_i * res1_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ \ ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ \ LD_SP4(pc1, 4, dst0, dst1, dst2, dst3); \ \ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ \ dst0_r += alpha_r * res2_r; \ dst0_r -= alpha_i * res2_i; \ dst0_i += alpha_r * res2_i; \ dst0_i += alpha_i * res2_r; \ \ dst1_r += alpha_r * res3_r; \ dst1_r -= alpha_i * res3_i; \ dst1_i += alpha_r * res3_i; \ dst1_i += alpha_i * res3_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ \ ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \ \ LD_SP4(pc2, 4, dst0, dst1, dst2, dst3); \ \ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ \ dst0_r += alpha_r * res4_r; \ dst0_r -= alpha_i * res4_i; \ dst0_i += alpha_r * res4_i; \ dst0_i += alpha_i * res4_r; \ \ dst1_r += alpha_r * res5_r; \ dst1_r -= alpha_i * res5_i; \ dst1_i += 
alpha_r * res5_i; \ dst1_i += alpha_i * res5_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ \ ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4); \ \ LD_SP4(pc3, 4, dst0, dst1, dst2, dst3); \ \ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ \ dst0_r += alpha_r * res6_r; \ dst0_r -= alpha_i * res6_i; \ dst0_i += alpha_r * res6_i; \ dst0_i += alpha_i * res6_r; \ \ dst1_r += alpha_r * res7_r; \ dst1_r -= alpha_i * res7_i; \ dst1_i += alpha_r * res7_i; \ dst1_i += alpha_i * res7_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ \ ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4); \ } #define CGEMM_SCALE_8X2_MSA \ { \ LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \ \ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ \ dst0_r += alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i += alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ dst1_r += alpha_r * res1_r; \ dst1_r -= alpha_i * res1_i; \ dst1_i += alpha_r * res1_i; \ dst1_i += alpha_i * res1_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ \ ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ \ LD_SP4(pc1, 4, dst0, dst1, dst2, dst3); \ \ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ \ dst0_r += alpha_r * res2_r; \ dst0_r -= alpha_i * res2_i; \ dst0_i += alpha_r * res2_i; \ dst0_i += alpha_i * res2_r; \ \ dst1_r += alpha_r * res3_r; \ dst1_r -= alpha_i * res3_i; \ dst1_i += alpha_r * res3_i; \ dst1_i += alpha_i * res3_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ \ ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \ } #define CGEMM_SCALE_8X1_MSA \ { \ LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \ \ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \ \ dst0_r += alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i += alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ dst1_r += alpha_r * res1_r; \ dst1_r -= alpha_i * res1_i; \ dst1_i += alpha_r * res1_i; \ dst1_i += alpha_i * res1_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ \ ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ } #define CGEMM_SCALE_4X4_MSA \ { \ LD_SP2(pc0, 4, dst0, dst1); \ \ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ \ dst0_r += alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i += alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ \ ST_SP2_INC(dst0, dst1, pc0, 4); \ \ LD_SP2(pc1, 4, dst0, dst1); \ \ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ \ dst0_r += alpha_r * res2_r; \ dst0_r -= alpha_i * res2_i; \ dst0_i += alpha_r * res2_i; \ dst0_i += alpha_i * res2_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ \ ST_SP2_INC(dst0, dst1, pc1, 4); \ \ LD_SP2(pc2, 4, dst0, dst1); \ \ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ \ dst0_r += alpha_r * res4_r; \ dst0_r -= alpha_i * res4_i; \ dst0_i += alpha_r * res4_i; \ dst0_i += alpha_i * res4_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ \ ST_SP2_INC(dst0, dst1, pc2, 4); \ \ LD_SP2(pc3, 4, dst0, dst1); \ \ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ \ dst0_r += alpha_r * res6_r; \ dst0_r -= alpha_i * res6_i; \ dst0_i += alpha_r * res6_i; \ dst0_i += alpha_i * res6_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ \ ST_SP2_INC(dst0, dst1, pc3, 4); \ } #define CGEMM_SCALE_4X2_MSA \ { \ LD_SP2(pc0, 4, dst0, dst1); \ \ 
PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ \ dst0_r += alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i += alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ \ ST_SP2_INC(dst0, dst1, pc0, 4); \ \ LD_SP2(pc1, 4, dst0, dst1); \ \ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ \ dst0_r += alpha_r * res2_r; \ dst0_r -= alpha_i * res2_i; \ dst0_i += alpha_r * res2_i; \ dst0_i += alpha_i * res2_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ \ ST_SP2_INC(dst0, dst1, pc1, 4); \ } #define CGEMM_SCALE_4X1_MSA \ { \ LD_SP2(pc0, 4, dst0, dst1); \ \ PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \ \ dst0_r += alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i += alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ \ ST_SP2_INC(dst0, dst1, pc0, 4); \ } #define CGEMM_SCALE_2X4 \ { \ /* 0th col */ \ pc0[0] += alphar * res0; \ pc0[0] -= alphai * res1; \ pc0[1] += alphar * res1; \ pc0[1] += alphai * res0; \ pc0[2] += alphar * res2; \ pc0[2] -= alphai * res3; \ pc0[3] += alphar * res3; \ pc0[3] += alphai * res2; \ \ /* 1st col */ \ pc1[0] += alphar * res4; \ pc1[0] -= alphai * res5; \ pc1[1] += alphar * res5; \ pc1[1] += alphai * res4; \ pc1[2] += alphar * res6; \ pc1[2] -= alphai * res7; \ pc1[3] += alphar * res7; \ pc1[3] += alphai * res6; \ \ /* 2nd col */ \ pc2[0] += alphar * res8; \ pc2[0] -= alphai * res9; \ pc2[1] += alphar * res9; \ pc2[1] += alphai * res8; \ pc2[2] += alphar * res10; \ pc2[2] -= alphai * res11; \ pc2[3] += alphar * res11; \ pc2[3] += alphai * res10; \ \ /* 3rd col */ \ pc3[0] += alphar * res12; \ pc3[0] -= alphai * res13; \ pc3[1] += alphar * res13; \ pc3[1] += alphai * res12; \ pc3[2] += alphar * res14; \ pc3[2] -= alphai * res15; \ pc3[3] += alphar * res15; \ pc3[3] += alphai * res14; \ } #define CGEMM_SCALE_2X2 \ { \ /* 0th col */ \ pc0[0] += alphar * res0; \ pc0[0] -= alphai * res1; \ pc0[1] += alphar * res1; \ pc0[1] += alphai * res0; \ pc0[2] += alphar * res2; \ pc0[2] -= alphai * res3; \ pc0[3] += alphar * res3; \ pc0[3] += alphai * res2; \ \ /* 1st col */ \ pc1[0] += alphar * res4; \ pc1[0] -= alphai * res5; \ pc1[1] += alphar * res5; \ pc1[1] += alphai * res4; \ pc1[2] += alphar * res6; \ pc1[2] -= alphai * res7; \ pc1[3] += alphar * res7; \ pc1[3] += alphai * res6; \ } #define CGEMM_SCALE_2X1 \ { \ pc0[0] += alphar * res0; \ pc0[0] -= alphai * res1; \ pc0[1] += alphar * res1; \ pc0[1] += alphai * res0; \ \ pc0[2] += alphar * res2; \ pc0[2] -= alphai * res3; \ pc0[3] += alphar * res3; \ pc0[3] += alphai * res2; \ } #define CGEMM_SCALE_1X4 \ { \ pc0[0] += alphar * res0; \ pc0[0] -= alphai * res1; \ pc0[1] += alphar * res1; \ pc0[1] += alphai * res0; \ \ pc1[0] += alphar * res2; \ pc1[0] -= alphai * res3; \ pc1[1] += alphar * res3; \ pc1[1] += alphai * res2; \ \ pc2[0] += alphar * res4; \ pc2[0] -= alphai * res5; \ pc2[1] += alphar * res5; \ pc2[1] += alphai * res4; \ \ pc3[0] += alphar * res6; \ pc3[0] -= alphai * res7; \ pc3[1] += alphar * res7; \ pc3[1] += alphai * res6; \ } #define CGEMM_SCALE_1X2 \ { \ pc0[0] += alphar * res0; \ pc0[0] -= alphai * res1; \ pc0[1] += alphar * res1; \ pc0[1] += alphai * res0; \ \ pc1[2] += alphar * res2; \ pc1[2] -= alphai * res3; \ pc1[3] += alphar * res3; \ pc1[3] += alphai * res2; \ } #define CGEMM_SCALE_1X1 \ { \ pc0[0] += alphar * res0; \ pc0[0] -= alphai * res1; \ pc0[1] += alphar * res1; \ pc0[1] += alphai * res0; \ } #define CGEMM_TRMM_SCALE_8X4_MSA \ { \ dst0_r = alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i = alpha_r * 
res0_i; \ dst0_i += alpha_i * res0_r; \ \ dst1_r = alpha_r * res1_r; \ dst1_r -= alpha_i * res1_i; \ dst1_i = alpha_r * res1_i; \ dst1_i += alpha_i * res1_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ \ ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ \ dst0_r = alpha_r * res2_r; \ dst0_r -= alpha_i * res2_i; \ dst0_i = alpha_r * res2_i; \ dst0_i += alpha_i * res2_r; \ \ dst1_r = alpha_r * res3_r; \ dst1_r -= alpha_i * res3_i; \ dst1_i = alpha_r * res3_i; \ dst1_i += alpha_i * res3_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ \ ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \ \ dst0_r = alpha_r * res4_r; \ dst0_r -= alpha_i * res4_i; \ dst0_i = alpha_r * res4_i; \ dst0_i += alpha_i * res4_r; \ \ dst1_r = alpha_r * res5_r; \ dst1_r -= alpha_i * res5_i; \ dst1_i = alpha_r * res5_i; \ dst1_i += alpha_i * res5_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ \ ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4); \ \ dst0_r = alpha_r * res6_r; \ dst0_r -= alpha_i * res6_i; \ dst0_i = alpha_r * res6_i; \ dst0_i += alpha_i * res6_r; \ \ dst1_r = alpha_r * res7_r; \ dst1_r -= alpha_i * res7_i; \ dst1_i = alpha_r * res7_i; \ dst1_i += alpha_i * res7_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ \ ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4); \ } #define CGEMM_TRMM_SCALE_8X2_MSA \ { \ dst0_r = alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i = alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ dst1_r = alpha_r * res1_r; \ dst1_r -= alpha_i * res1_i; \ dst1_i = alpha_r * res1_i; \ dst1_i += alpha_i * res1_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ \ ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ \ dst0_r = alpha_r * res2_r; \ dst0_r -= alpha_i * res2_i; \ dst0_i = alpha_r * res2_i; \ dst0_i += alpha_i * res2_r; \ \ dst1_r = alpha_r * res3_r; \ dst1_r -= alpha_i * res3_i; \ dst1_i = alpha_r * res3_i; \ dst1_i += alpha_i * res3_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ \ ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \ } #define CGEMM_TRMM_SCALE_8X1_MSA \ { \ dst0_r = alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i = alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ dst1_r = alpha_r * res1_r; \ dst1_r -= alpha_i * res1_i; \ dst1_i = alpha_r * res1_i; \ dst1_i += alpha_i * res1_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \ \ ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \ } #define CGEMM_TRMM_SCALE_4X4_MSA \ { \ dst0_r = alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i = alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ \ ST_SP2_INC(dst0, dst1, pc0, 4); \ \ dst0_r = alpha_r * res2_r; \ dst0_r -= alpha_i * res2_i; \ dst0_i = alpha_r * res2_i; \ dst0_i += alpha_i * res2_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ \ ST_SP2_INC(dst0, dst1, pc1, 4); \ \ dst0_r = alpha_r * res4_r; \ dst0_r -= alpha_i * res4_i; \ dst0_i = alpha_r * res4_i; \ dst0_i += alpha_i * res4_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ \ ST_SP2_INC(dst0, dst1, pc2, 4); \ \ dst0_r = alpha_r * res6_r; \ dst0_r -= alpha_i * res6_i; \ dst0_i = alpha_r * res6_i; \ dst0_i += alpha_i * res6_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ \ ST_SP2_INC(dst0, dst1, pc3, 4); \ } #define CGEMM_TRMM_SCALE_4X2_MSA \ { \ dst0_r = alpha_r * res0_r; \ dst0_r -= 
alpha_i * res0_i; \ dst0_i = alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ \ ST_SP2_INC(dst0, dst1, pc0, 4); \ \ dst0_r = alpha_r * res2_r; \ dst0_r -= alpha_i * res2_i; \ dst0_i = alpha_r * res2_i; \ dst0_i += alpha_i * res2_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ \ ST_SP2_INC(dst0, dst1, pc1, 4); \ } #define CGEMM_TRMM_SCALE_4X1_MSA \ { \ dst0_r = alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i = alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \ \ ST_SP2_INC(dst0, dst1, pc0, 4); \ } #define CGEMM_TRMM_SCALE_2X4 \ { \ /* 0th col */ \ pc0[0] = alphar * res0; \ pc0[0] -= alphai * res1; \ pc0[1] = alphar * res1; \ pc0[1] += alphai * res0; \ pc0[2] = alphar * res2; \ pc0[2] -= alphai * res3; \ pc0[3] = alphar * res3; \ pc0[3] += alphai * res2; \ \ /* 1st col */ \ pc1[0] = alphar * res4; \ pc1[0] -= alphai * res5; \ pc1[1] = alphar * res5; \ pc1[1] += alphai * res4; \ pc1[2] = alphar * res6; \ pc1[2] -= alphai * res7; \ pc1[3] = alphar * res7; \ pc1[3] += alphai * res6; \ \ /* 2nd col */ \ pc2[0] = alphar * res8; \ pc2[0] -= alphai * res9; \ pc2[1] = alphar * res9; \ pc2[1] += alphai * res8; \ pc2[2] = alphar * res10; \ pc2[2] -= alphai * res11; \ pc2[3] = alphar * res11; \ pc2[3] += alphai * res10; \ \ /* 3rd col */ \ pc3[0] = alphar * res12; \ pc3[0] -= alphai * res13; \ pc3[1] = alphar * res13; \ pc3[1] += alphai * res12; \ pc3[2] = alphar * res14; \ pc3[2] -= alphai * res15; \ pc3[3] = alphar * res15; \ pc3[3] += alphai * res14; \ } #define CGEMM_TRMM_SCALE_2X2 \ { \ /* 0th col */ \ pc0[0] = alphar * res0; \ pc0[0] -= alphai * res1; \ pc0[1] = alphar * res1; \ pc0[1] += alphai * res0; \ pc0[2] = alphar * res2; \ pc0[2] -= alphai * res3; \ pc0[3] = alphar * res3; \ pc0[3] += alphai * res2; \ \ /* 1st col */ \ pc1[0] = alphar * res4; \ pc1[0] -= alphai * res5; \ pc1[1] = alphar * res5; \ pc1[1] += alphai * res4; \ pc1[2] = alphar * res6; \ pc1[2] -= alphai * res7; \ pc1[3] = alphar * res7; \ pc1[3] += alphai * res6; \ } #define CGEMM_TRMM_SCALE_2X1 \ { \ pc0[0] = alphar * res0; \ pc0[0] -= alphai * res1; \ pc0[1] = alphar * res1; \ pc0[1] += alphai * res0; \ \ pc0[2] = alphar * res2; \ pc0[2] -= alphai * res3; \ pc0[3] = alphar * res3; \ pc0[3] += alphai * res2; \ } #define CGEMM_TRMM_SCALE_1X4 \ { \ pc0[0] = alphar * res0; \ pc0[0] -= alphai * res1; \ pc0[1] = alphar * res1; \ pc0[1] += alphai * res0; \ \ pc1[0] = alphar * res2; \ pc1[0] -= alphai * res3; \ pc1[1] = alphar * res3; \ pc1[1] += alphai * res2; \ \ pc2[0] = alphar * res4; \ pc2[0] -= alphai * res5; \ pc2[1] = alphar * res5; \ pc2[1] += alphai * res4; \ \ pc3[0] = alphar * res6; \ pc3[0] -= alphai * res7; \ pc3[1] = alphar * res7; \ pc3[1] += alphai * res6; \ } #define CGEMM_TRMM_SCALE_1X2 \ { \ pc0[0] = alphar * res0; \ pc0[0] -= alphai * res1; \ pc0[1] = alphar * res1; \ pc0[1] += alphai * res0; \ \ pc1[2] = alphar * res2; \ pc1[2] -= alphai * res3; \ pc1[3] = alphar * res3; \ pc1[3] += alphai * res2; \ } #define CGEMM_TRMM_SCALE_1X1 \ { \ pc0[0] = alphar * res0; \ pc0[0] -= alphai * res1; \ pc0[1] = alphar * res1; \ pc0[1] += alphai * res0; \ } int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc #ifdef TRMMKERNEL , BLASLONG offset #endif ) { BLASLONG i, j, l, temp; #if defined(TRMMKERNEL) BLASLONG off; #endif FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0; FLOAT res0, res1, res2, res3, res4, res5, res6, res7; FLOAT res8, res9, res10, res11, res12, res13, 
res14, res15; FLOAT a0_r, a1_r, a0_i, a1_i, b0_i, b1_i, b2_i, b3_i; FLOAT b0_r, b1_r, b2_r, b3_r; v4f32 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1; v4f32 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi; v4f32 dst0, dst1, dst2, dst3, alpha_r, alpha_i; v4f32 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i; v4f32 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i; v4f32 dst0_r, dst0_i, dst1_r, dst1_i; alpha_r = COPY_FLOAT_TO_VECTOR(alphar); alpha_i = COPY_FLOAT_TO_VECTOR(alphai); #if defined(TRMMKERNEL) && !defined(LEFT) off = -offset; #endif for (j = (n >> 2); j--;) { pc0 = C; pc1 = pc0 + 2 * ldc; pc2 = pc1 + 2 * ldc; pc3 = pc2 + 2 * ldc; #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif pa0 = A; for (i = (m >> 3); i--;) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2 * 8; pb0 = B + off * 2 * 4; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 8; // number of values in A #else temp = off + 4; // number of values in B #endif #else pb0 = B; temp = k; #endif #ifdef ENABLE_PREFETCH __asm__ __volatile__( "pref 0, 64(%[pa0]) \n\t" "pref 0, 96(%[pa0]) \n\t" "pref 0, 32(%[pb0]) \n\t" : : [pa0] "r" (pa0), [pb0] "r" (pb0) ); #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_8X4_MSA(, -, , +, +); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_8X4_MSA(, +, , +, -); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_8X4_MSA(, +, , -, +); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_8X4_MSA(, -, , -, -); #endif for (l = (temp - 1); l--;) { #ifdef ENABLE_PREFETCH __asm__ __volatile__( "pref 0, 64(%[pa0]) \n\t" "pref 0, 96(%[pa0]) \n\t" "pref 0, 32(%[pb0]) \n\t" : : [pa0] "r" (pa0), [pb0] "r" (pb0) ); #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_8X4_MSA(+, -, +, +,); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_8X4_MSA(+, +, -, +,); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_8X4_MSA(+, +, +, -,); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_8X4_MSA(+, -, -, -,); #endif } #if defined(TRMMKERNEL) CGEMM_TRMM_SCALE_8X4_MSA #else CGEMM_SCALE_8X4_MSA #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 8; // number of values in A #else temp -= 4; // number of values in B #endif pa0 += temp * 2 * 8; pb0 += temp * 2 * 4; #endif #ifdef LEFT off += 8; // number of values in A #endif #endif } if (m & 4) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2 * 4; pb0 = B + off * 2 * 4; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 4; // number of values in A #else temp = off + 4; // number of values in B #endif #else pb0 = B; temp = k; #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_4X4_MSA(, -, , +, +); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_4X4_MSA(, +, , +, -); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_4X4_MSA(, +, , -, +); #endif #if defined(RR) || defined(RC) 
|| defined(CR) || defined(CC) CGEMM_KERNEL_4X4_MSA(, -, , -, -); #endif for (l = (temp - 1); l--;) { #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_4X4_MSA(+, -, +, +,); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_4X4_MSA(+, +, -, +,); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_4X4_MSA(+, +, +, -,); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_4X4_MSA(+, -, -, -,); #endif } #if defined(TRMMKERNEL) CGEMM_TRMM_SCALE_4X4_MSA #else CGEMM_SCALE_4X4_MSA #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 4; // number of values in A #else temp -= 4; // number of values in B #endif pa0 += temp * 2 * 4; pb0 += temp * 2 * 4; #endif #ifdef LEFT off += 4; // number of values in A #endif #endif } if (m & 2) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2 * 2; pb0 = B + off * 2 * 4; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 2; // number of values in A #else temp = off + 4; // number of values in B #endif #else pb0 = B; temp = k; #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_2X4(, -, , +, +); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_2X4(, +, , +, -); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_2X4(, +, , -, +); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_2X4(, -, , -, -); #endif pa0 += 4; pb0 += 8; for (l = (temp - 1); l--;) { #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_2X4(+, -, +, +,); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_2X4(+, +, -, +,); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_2X4(+, +, +, -,); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_2X4(+, -, -, -,); #endif pa0 += 4; pb0 += 8; } #if defined(TRMMKERNEL) CGEMM_TRMM_SCALE_2X4 #else CGEMM_SCALE_2X4 #endif pc0 += 4; pc1 += 4; pc2 += 4; pc3 += 4; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 2; // number of values in A #else temp -= 4; // number of values in B #endif pa0 += temp * 2 * 2; pb0 += temp * 2 * 4; #endif #ifdef LEFT off += 2; // number of values in A #endif #endif } if (m & 1) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2 * 1; pb0 = B + off * 2 * 4; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 1; // number of values in A #else temp = off + 4; // number of values in B #endif #else pb0 = B; temp = k; #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_1X4(, -, , +, +); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_1X4(, +, , +, -); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_1X4(, +, , -, +); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_1X4(, -, , -, -); #endif pa0 += 2; pb0 += 8; for (l = (temp - 1); l--;) { #if defined(NN) || defined(NT) || 
defined(TN) || defined(TT) CGEMM_KERNEL_1X4(+, -, +, +,); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_1X4(+, +, -, +,); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_1X4(+, +, +, -,); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_1X4(+, -, -, -,); #endif pa0 += 2; pb0 += 8; } #if defined(TRMMKERNEL) CGEMM_TRMM_SCALE_1X4 #else CGEMM_SCALE_1X4 #endif pc0 += 2; pc1 += 2; pc2 += 2; pc3 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 1; // number of values in A #else temp -= 4; // number of values in B #endif pa0 += temp * 2 * 1; pb0 += temp * 2 * 4; #endif #ifdef LEFT off += 1; // number of values in A #endif #endif } #if defined(TRMMKERNEL) && !defined(LEFT) off += 4; // number of values in A #endif B += (k << 3); C += (ldc << 3); } if (n & 2) { pc0 = C; pc1 = pc0 + 2 * ldc; #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif pa0 = A; for (i = (m >> 3); i--;) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2 * 8; pb0 = B + off * 2 * 2; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 8; // number of values in A #else temp = off + 2; // number of values in B #endif #else pb0 = B; temp = k; #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_8X2_MSA(, -, , +, +); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_8X2_MSA(, +, , +, -); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_8X2_MSA(, +, , -, +); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_8X2_MSA(, -, , -, -); #endif pb0 += 4; for (l = (temp - 1); l--;) { #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_8X2_MSA(+, -, +, +,); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_8X2_MSA(+, +, -, +,); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_8X2_MSA(+, +, +, -,); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_8X2_MSA(+, -, -, -,); #endif pb0 += 4; } #if defined(TRMMKERNEL) CGEMM_TRMM_SCALE_8X2_MSA #else CGEMM_SCALE_8X2_MSA #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 8; // number of values in A #else temp -= 2; // number of values in B #endif pa0 += temp * 2 * 8; pb0 += temp * 2 * 2; #endif #ifdef LEFT off += 8; // number of values in A #endif #endif } if (m & 4) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2 * 4; pb0 = B + off * 2 * 2; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 4; // number of values in A #else temp = off + 2; // number of values in B #endif #else pb0 = B; temp = k; #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_4X2_MSA(, -, , +, +); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_4X2_MSA(, +, , +, -); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_4X2_MSA(, +, , -, +); #endif #if defined(RR) || defined(RC) || 
defined(CR) || defined(CC) CGEMM_KERNEL_4X2_MSA(, -, , -, -); #endif pb0 += 4; for (l = (temp - 1); l--;) { #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_4X2_MSA(+, -, +, +,); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_4X2_MSA(+, +, -, +,); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_4X2_MSA(+, +, +, -,); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_4X2_MSA(+, -, -, -,); #endif pb0 += 4; } #if defined(TRMMKERNEL) CGEMM_TRMM_SCALE_4X2_MSA #else CGEMM_SCALE_4X2_MSA #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 4; // number of values in A #else temp -= 2; // number of values in B #endif pa0 += temp * 2 * 4; pb0 += temp * 2 * 2; #endif #ifdef LEFT off += 4; // number of values in A #endif #endif } if (m & 2) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2 * 2; pb0 = B + off * 2 * 2; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 2; // number of values in A #else temp = off + 2; // number of values in B #endif #else pb0 = B; temp = k; #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_2X2(, -, , +, +); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_2X2(, +, , +, -); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_2X2(, +, , -, +); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_2X2(, -, , -, -); #endif pa0 += 4; pb0 += 4; for (l = (temp - 1); l--;) { #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_2X2(+, -, +, +,); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_2X2(+, +, -, +,); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_2X2(+, +, +, -,); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_2X2(+, -, -, -,); #endif pa0 += 4; pb0 += 4; } #if defined(TRMMKERNEL) CGEMM_TRMM_SCALE_2X2 #else CGEMM_SCALE_2X2 #endif pc0 += 4; pc1 += 4; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 2; // number of values in A #else temp -= 2; // number of values in B #endif pa0 += temp * 2 * 2; pb0 += temp * 2 * 2; #endif #ifdef LEFT off += 2; // number of values in A #endif #endif } if (m & 1) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2 * 1; pb0 = B + off * 2 * 2; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 1; // number of values in A #else temp = off + 2; // number of values in B #endif #else pb0 = B; temp = k; #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_1X2(, -, , +, +); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_1X2(, +, , +, -); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_1X2(, +, , -, +); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_1X2(, -, , -, -); #endif pa0 += 2; pb0 += 4; for (l = (temp - 1); l--;) { #if defined(NN) || defined(NT) || 
defined(TN) || defined(TT) CGEMM_KERNEL_1X2(+, -, +, +,); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_1X2(+, +, -, +,); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_1X2(+, +, +, -,); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_1X2(+, -, -, -,); #endif pa0 += 2; pb0 += 4; } #if defined(TRMMKERNEL) CGEMM_TRMM_SCALE_1X2 #else CGEMM_SCALE_1X2 #endif pc0 += 2; pc1 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 1; // number of values in A #else temp -= 2; // number of values in B #endif pa0 += temp * 2 * 1; pb0 += temp * 2 * 2; #endif #ifdef LEFT off += 1; // number of values in A #endif #endif } #if defined(TRMMKERNEL) && !defined(LEFT) off += 2; // number of values in A #endif B += (k << 2); C += (ldc << 2); } if (n & 1) { pc0 = C; #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif pa0 = A; for (i = (m >> 3); i--;) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2 * 8; pb0 = B + off * 2 * 1; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 8; // number of values in A #else temp = off + 1; // number of values in B #endif #else pb0 = B; temp = k; #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_8X1_MSA(, -, , +, +); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_8X1_MSA(, +, , +, -); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_8X1_MSA(, +, , -, +); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_8X1_MSA(, -, , -, -); #endif pb0 += 2; for (l = (temp - 1); l--;) { #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_8X1_MSA(+, -, +, +,); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_8X1_MSA(+, +, -, +,); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_8X1_MSA(+, +, +, -,); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_8X1_MSA(+, -, -, -,); #endif pb0 += 2; } #if defined(TRMMKERNEL) CGEMM_TRMM_SCALE_8X1_MSA #else CGEMM_SCALE_8X1_MSA #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 8; // number of values in A #else temp -= 1; // number of values in B #endif pa0 += temp * 2 * 8; pb0 += temp * 2 * 1; #endif #ifdef LEFT off += 8; // number of values in A #endif #endif } if (m & 4) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2 * 4; pb0 = B + off * 2 * 1; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 4; // number of values in A #else temp = off + 1; // number of values in B #endif #else pb0 = B; temp = k; #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_4X1_MSA(, -, , +, +); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_4X1_MSA(, +, , +, -); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_4X1_MSA(, +, , -, +); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_4X1_MSA(, 
-, , -, -); #endif pb0 += 2; for (l = (temp - 1); l--;) { #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_4X1_MSA(+, -, +, +,); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_4X1_MSA(+, +, -, +,); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_4X1_MSA(+, +, +, -,); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_4X1_MSA(+, -, -, -,); #endif pb0 += 2; } #if defined(TRMMKERNEL) CGEMM_TRMM_SCALE_4X1_MSA #else CGEMM_SCALE_4X1_MSA #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 4; // number of values in A #else temp -= 1; // number of values in B #endif pa0 += temp * 2 * 4; pb0 += temp * 2 * 1; #endif #ifdef LEFT off += 4; // number of values in A #endif #endif } if (m & 2) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2 * 2; pb0 = B + off * 2 * 1; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 2; // number of values in A #else temp = off + 1; // number of values in B #endif #else pb0 = B; temp = k; #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_2X1(, -, , +, +); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_2X1(, +, , +, -); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_2X1(, +, , -, +); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_2X1(, -, , -, -); #endif pa0 += 4; pb0 += 2; for (l = (temp - 1); l--;) { #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_2X1(+, -, +, +,); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_2X1(+, +, -, +,); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_2X1(+, +, +, -,); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_2X1(+, -, -, -,); #endif pa0 += 4; pb0 += 2; } #if defined(TRMMKERNEL) CGEMM_TRMM_SCALE_2X1 #else CGEMM_SCALE_2X1 #endif pc0 += 4; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 2; // number of values in A #else temp -= 1; // number of values in B #endif pa0 += temp * 2 * 2; pb0 += temp * 2 * 1; #endif #ifdef LEFT off += 2; // number of values in A #endif #endif } if (m & 1) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2 * 1; pb0 = B + off * 2 * 1; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 1; // number of values in A #else temp = off + 1; // number of values in B #endif #else pb0 = B; temp = k; #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_1X1(, -, , +, +); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_1X1(, +, , +, -); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_1X1(, +, , -, +); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_1X1(, -, , -, -); #endif pa0 += 2; pb0 += 2; for (l = (temp - 1); l--;) { #if defined(NN) || defined(NT) || defined(TN) || defined(TT) CGEMM_KERNEL_1X1(+, -, +, +,); 
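                /* The four preprocessor groups used around every kernel
                   invocation in this file select one sign pattern each;
                   they appear to correspond to: NN/NT/TN/TT plain product,
                   NR/NC/TR/TC with B conjugated, RN/RT/CN/CT with A
                   conjugated, RR/RC/CR/CC with both conjugated. */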
#endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) CGEMM_KERNEL_1X1(+, +, -, +,); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) CGEMM_KERNEL_1X1(+, +, +, -,); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) CGEMM_KERNEL_1X1(+, -, -, -,); #endif pa0 += 2; pb0 += 2; } #if defined(TRMMKERNEL) CGEMM_TRMM_SCALE_1X1 #else CGEMM_SCALE_1X1 #endif pc0 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 1; // number of values in A #else temp -= 1; // number of values in B #endif pa0 += temp * 2 * 1; pb0 += temp * 2 * 1; #endif #ifdef LEFT off += 1; // number of values in A #endif #endif } #if defined(TRMMKERNEL) && !defined(LEFT) off += 1; // number of values in A #endif B += (k << 1); C += (ldc << 1); } return 0; } OpenBLAS-0.2.20/kernel/mips/cgemm_ncopy_4_msa.c000066400000000000000000000130141313527062700211220ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
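   Overview (informal sketch, not normative): this routine packs the
   column-major complex matrix src (leading dimension lda, counted in complex
   elements) into the contiguous buffer dst, four columns at a time, in the
   order expected by the GEMM kernel: within a four-column panel the entries
   are stored row by row, each row's four column values kept together as
   real/imaginary pairs.  A scalar model of one full panel (names d and s are
   illustrative and do not appear in the code):

       FLOAT *d = dst;
       for (i = 0; i < m; i++)
           for (j = 0; j < 4; j++) {
               FLOAT *s = src + 2 * (i + j * lda);
               d[0] = s[0];
               d[1] = s[1];
               d += 2;
           }

   The vector code below performs the same packing four rows at a time, then
   handles two-row and one-row tails, and finally narrower 2- and 1-column
   panels when n is not a multiple of four.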
*******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) { BLASLONG i, j; FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst; FLOAT ctemp01, ctemp02, ctemp03, ctemp04; FLOAT ctemp05, ctemp06, ctemp07, ctemp08; v4f32 src0, src1, src2, src3, src4, src5, src6, src7; v4f32 dst0, dst1, dst4, dst5; psrc0 = src; pdst = dst; lda *= 2; for (j = (n >> 2); j--;) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc3 = psrc2 + lda; psrc4 = psrc3 + lda; psrc0 += 4 * lda; for (i = (m >> 2); i--;) { LD_SP2_INC(psrc1, 4, src0, src1); LD_SP2_INC(psrc2, 4, src2, src3); LD_SP2_INC(psrc3, 4, src4, src5); LD_SP2_INC(psrc4, 4, src6, src7); ILVRL_D2_SP(src2, src0, dst0, dst4); ILVRL_D2_SP(src6, src4, dst1, dst5); ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); ILVRL_D2_SP(src3, src1, dst0, dst4); ILVRL_D2_SP(src7, src5, dst1, dst5); ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); } if (m & 2) { src0 = LD_SP(psrc1); src2 = LD_SP(psrc2); src4 = LD_SP(psrc3); src6 = LD_SP(psrc4); psrc1 += 4; psrc2 += 4; psrc3 += 4; psrc4 += 4; ILVRL_D2_SP(src2, src0, dst0, dst4); ILVRL_D2_SP(src6, src4, dst1, dst5); ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); } if (m & 1) { ctemp01 = *(psrc1 + 0); ctemp02 = *(psrc1 + 1); ctemp03 = *(psrc2 + 0); ctemp04 = *(psrc2 + 1); ctemp05 = *(psrc3 + 0); ctemp06 = *(psrc3 + 1); ctemp07 = *(psrc4 + 0); ctemp08 = *(psrc4 + 1); psrc1 += 2; psrc2 += 2; psrc3 += 2; psrc4 += 2; *(pdst + 0) = ctemp01; *(pdst + 1) = ctemp02; *(pdst + 2) = ctemp03; *(pdst + 3) = ctemp04; *(pdst + 4) = ctemp05; *(pdst + 5) = ctemp06; *(pdst + 6) = ctemp07; *(pdst + 7) = ctemp08; pdst += 8; } } if (n & 2) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc0 += 2 * lda; for (i = (m >> 2); i--;) { LD_SP2_INC(psrc1, 4, src0, src1); LD_SP2_INC(psrc2, 4, src2, src3); ILVRL_D2_SP(src2, src0, dst0, dst4); ST_SP2_INC(dst0, dst4, pdst, 4); ILVRL_D2_SP(src3, src1, dst0, dst4); ST_SP2_INC(dst0, dst4, pdst, 4); } if (m & 2) { src0 = LD_SP(psrc1); src2 = LD_SP(psrc2); psrc1 += 4; psrc2 += 4; ILVRL_D2_SP(src2, src0, dst0, dst4); ST_SP2_INC(dst0, dst4, pdst, 4); } if (m & 1) { ctemp01 = *(psrc1 + 0); ctemp02 = *(psrc1 + 1); ctemp03 = *(psrc2 + 0); ctemp04 = *(psrc2 + 1); psrc1 += 2; psrc2 += 2; *(pdst + 0) = ctemp01; *(pdst + 1) = ctemp02; *(pdst + 2) = ctemp03; *(pdst + 3) = ctemp04; pdst += 4; } } if (n & 1) { psrc1 = psrc0; for (i = (m >> 2); i--;) { LD_SP2_INC(psrc1, 4, src0, src1); ST_SP2_INC(src0, src1, pdst, 4); } if (m & 2) { src0 = LD_SP(psrc1); psrc1 += 4; ST_SP(src0, pdst); pdst += 4; } if (m & 1) { ctemp01 = *(psrc1 + 0); ctemp02 = *(psrc1 + 1); psrc1 += 2; *(pdst + 0) = ctemp01; *(pdst + 1) = ctemp02; pdst += 2; } } return 0; } OpenBLAS-0.2.20/kernel/mips/cgemm_ncopy_8_msa.c000066400000000000000000000221471313527062700211350ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) { BLASLONG i, j; FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; FLOAT *psrc8, *pdst; FLOAT ctemp01, ctemp02, ctemp03, ctemp04, ctemp05, ctemp06, ctemp07; FLOAT ctemp08, ctemp09, ctemp10, ctemp11, ctemp12, ctemp13, ctemp14; FLOAT ctemp15, ctemp16; v4f32 src0, src1, src2, src3, src4, src5, src6, src7; v4f32 src8, src9, src10, src11, src12, src13, src14, src15; v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; psrc0 = src; pdst = dst; lda *= 2; for (j = (n >> 3); j--;) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc3 = psrc2 + lda; psrc4 = psrc3 + lda; psrc5 = psrc4 + lda; psrc6 = psrc5 + lda; psrc7 = psrc6 + lda; psrc8 = psrc7 + lda; psrc0 += 8 * lda; for (i = (m >> 2); i--;) { LD_SP2_INC(psrc1, 4, src0, src1); LD_SP2_INC(psrc2, 4, src2, src3); LD_SP2_INC(psrc3, 4, src4, src5); LD_SP2_INC(psrc4, 4, src6, src7); LD_SP2_INC(psrc5, 4, src8, src9); LD_SP2_INC(psrc6, 4, src10, src11); LD_SP2_INC(psrc7, 4, src12, src13); LD_SP2_INC(psrc8, 4, src14, src15); ILVRL_D2_SP(src2, src0, dst0, dst4); ILVRL_D2_SP(src6, src4, dst1, dst5); ILVRL_D2_SP(src10, src8, dst2, dst6); ILVRL_D2_SP(src14, src12, dst3, dst7); ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4); ILVRL_D2_SP(src3, src1, dst0, dst4); ILVRL_D2_SP(src7, src5, dst1, dst5); ILVRL_D2_SP(src11, src9, dst2, dst6); ILVRL_D2_SP(src15, src13, dst3, dst7); ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4); } if (m & 2) { src0 = LD_SP(psrc1); src2 = LD_SP(psrc2); src4 = LD_SP(psrc3); src6 = LD_SP(psrc4); src8 = LD_SP(psrc5); src10 = LD_SP(psrc6); src12 = LD_SP(psrc7); src14 = LD_SP(psrc8); psrc1 += 4; psrc2 += 4; psrc3 += 4; psrc4 += 4; psrc5 += 4; psrc6 += 4; psrc7 += 4; psrc8 += 4; ILVRL_D2_SP(src2, src0, dst0, dst4); ILVRL_D2_SP(src6, src4, dst1, dst5); ILVRL_D2_SP(src10, src8, dst2, dst6); ILVRL_D2_SP(src14, src12, dst3, dst7); ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4); } if (m & 1) { ctemp01 = *(psrc1 + 0); ctemp02 = *(psrc1 + 1); ctemp03 = *(psrc2 + 0); ctemp04 = *(psrc2 + 1); ctemp05 = *(psrc3 + 0); ctemp06 = *(psrc3 + 1); ctemp07 = *(psrc4 + 0); ctemp08 = *(psrc4 + 1); ctemp09 = *(psrc5 + 0); ctemp10 = *(psrc5 + 1); ctemp11 = *(psrc6 + 0); ctemp12 = *(psrc6 + 1); ctemp13 = *(psrc7 + 0); ctemp14 = *(psrc7 + 1); ctemp15 = *(psrc8 + 0); ctemp16 = *(psrc8 + 1); psrc1 += 2; psrc2 += 2; psrc3 += 2; psrc4 += 2; psrc5 += 2; psrc6 += 2; psrc7 += 2; 
psrc8 += 2; *(pdst + 0) = ctemp01; *(pdst + 1) = ctemp02; *(pdst + 2) = ctemp03; *(pdst + 3) = ctemp04; *(pdst + 4) = ctemp05; *(pdst + 5) = ctemp06; *(pdst + 6) = ctemp07; *(pdst + 7) = ctemp08; *(pdst + 8) = ctemp09; *(pdst + 9) = ctemp10; *(pdst + 10) = ctemp11; *(pdst + 11) = ctemp12; *(pdst + 12) = ctemp13; *(pdst + 13) = ctemp14; *(pdst + 14) = ctemp15; *(pdst + 15) = ctemp16; pdst += 16; } } if (n & 4) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc3 = psrc2 + lda; psrc4 = psrc3 + lda; psrc0 += 4 * lda; for (i = (m >> 2); i--;) { LD_SP2_INC(psrc1, 4, src0, src1); LD_SP2_INC(psrc2, 4, src2, src3); LD_SP2_INC(psrc3, 4, src4, src5); LD_SP2_INC(psrc4, 4, src6, src7); ILVRL_D2_SP(src2, src0, dst0, dst4); ILVRL_D2_SP(src6, src4, dst1, dst5); ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); ILVRL_D2_SP(src3, src1, dst0, dst4); ILVRL_D2_SP(src7, src5, dst1, dst5); ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); } if (m & 2) { src0 = LD_SP(psrc1); src2 = LD_SP(psrc2); src4 = LD_SP(psrc3); src6 = LD_SP(psrc4); psrc1 += 4; psrc2 += 4; psrc3 += 4; psrc4 += 4; ILVRL_D2_SP(src2, src0, dst0, dst4); ILVRL_D2_SP(src6, src4, dst1, dst5); ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); } if (m & 1) { ctemp01 = *(psrc1 + 0); ctemp02 = *(psrc1 + 1); ctemp03 = *(psrc2 + 0); ctemp04 = *(psrc2 + 1); ctemp05 = *(psrc3 + 0); ctemp06 = *(psrc3 + 1); ctemp07 = *(psrc4 + 0); ctemp08 = *(psrc4 + 1); psrc1 += 2; psrc2 += 2; psrc3 += 2; psrc4 += 2; *(pdst + 0) = ctemp01; *(pdst + 1) = ctemp02; *(pdst + 2) = ctemp03; *(pdst + 3) = ctemp04; *(pdst + 4) = ctemp05; *(pdst + 5) = ctemp06; *(pdst + 6) = ctemp07; *(pdst + 7) = ctemp08; pdst += 8; } } if (n & 2) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc0 += 2 * lda; for (i = (m >> 2); i--;) { LD_SP2_INC(psrc1, 4, src0, src1); LD_SP2_INC(psrc2, 4, src2, src3); ILVRL_D2_SP(src2, src0, dst0, dst4); ST_SP2_INC(dst0, dst4, pdst, 4); ILVRL_D2_SP(src3, src1, dst0, dst4); ST_SP2_INC(dst0, dst4, pdst, 4); } if (m & 2) { src0 = LD_SP(psrc1); src2 = LD_SP(psrc2); psrc1 += 4; psrc2 += 4; ILVRL_D2_SP(src2, src0, dst0, dst4); ST_SP2_INC(dst0, dst4, pdst, 4); } if (m & 1) { ctemp01 = *(psrc1 + 0); ctemp02 = *(psrc1 + 1); ctemp03 = *(psrc2 + 0); ctemp04 = *(psrc2 + 1); psrc1 += 2; psrc2 += 2; *(pdst + 0) = ctemp01; *(pdst + 1) = ctemp02; *(pdst + 2) = ctemp03; *(pdst + 3) = ctemp04; pdst += 4; } } if (n & 1) { psrc1 = psrc0; for (i = (m >> 2); i--;) { LD_SP2_INC(psrc1, 4, src0, src1); ST_SP2_INC(src0, src1, pdst, 4); } if (m & 2) { src0 = LD_SP(psrc1); psrc1 += 4; ST_SP(src0, pdst); pdst += 4; } if (m & 1) { ctemp01 = *(psrc1 + 0); ctemp02 = *(psrc1 + 1); psrc1 += 2; *(pdst + 0) = ctemp01; *(pdst + 1) = ctemp02; pdst += 2; } } return 0; } OpenBLAS-0.2.20/kernel/mips/cgemm_tcopy_4_msa.c000066400000000000000000000071461313527062700211410ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) { BLASLONG i, j; FLOAT *psrc0; FLOAT *psrc1, *psrc2; FLOAT *pdst0; FLOAT ctemp01, ctemp02, ctemp03, ctemp04; v4f32 src0, src1, src2, src3; psrc0 = src; pdst0 = dst; lda *= 2; for (j = (n >> 2); j--;) { psrc1 = psrc0; psrc2 = psrc0 + lda; psrc0 += 8; for (i = (m >> 1); i--;) { LD_SP2(psrc1, 4, src0, src1); LD_SP2(psrc2, 4, src2, src3); ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); psrc1 += 2 * lda; psrc2 += 2 * lda; } if (m & 1) { LD_SP2(psrc1, 4, src0, src1); ST_SP2_INC(src0, src1, pdst0, 4); } } if (n & 2) { psrc1 = psrc0; psrc2 = psrc0 + lda; psrc0 += 4; for (i = (m >> 1); i--;) { src0 = LD_SP(psrc1); src1 = LD_SP(psrc2); ST_SP2_INC(src0, src1, pdst0, 4); psrc1 += 2 * lda; psrc2 += 2 * lda; } if (m & 1) { src0 = LD_SP(psrc1); ST_SP(src0, pdst0); pdst0 += 4; } } if (n & 1) { psrc1 = psrc0; psrc2 = psrc0 + lda; psrc0 += 2; for (i = (m >> 1); i--;) { ctemp01 = *(psrc1 + 0); ctemp02 = *(psrc1 + 1); ctemp03 = *(psrc2 + 0); ctemp04 = *(psrc2 + 1); *(pdst0 + 0) = ctemp01; *(pdst0 + 1) = ctemp02; *(pdst0 + 2) = ctemp03; *(pdst0 + 3) = ctemp04; psrc1 += 2 * lda; psrc2 += 2 * lda; pdst0 += 4; } if (m & 1) { ctemp01 = *(psrc1 + 0); ctemp02 = *(psrc1 + 1); *(pdst0 + 0) = ctemp01; *(pdst0 + 1) = ctemp02; pdst0 += 2; } } return 0; } OpenBLAS-0.2.20/kernel/mips/cgemm_tcopy_8_msa.c000066400000000000000000000142701313527062700211410ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) { BLASLONG i, j; FLOAT *psrc0, *psrc1, *psrc2, *pdst0; FLOAT ctemp01, ctemp02, ctemp03, ctemp04; v4f32 src0, src1, src2, src3, src4, src5, src6, src7; v4f32 src8, src9, src10, src11, src12, src13, src14, src15; psrc0 = src; pdst0 = dst; lda *= 2; for (j = (n >> 3); j--;) { psrc1 = psrc0; psrc2 = psrc0 + lda; psrc0 += 16; for (i = (m >> 2); i--;) { LD_SP4(psrc1, 4, src0, src1, src2, src3); LD_SP4(psrc2, 4, src4, src5, src6, src7); LD_SP4(psrc1 + 2 * lda, 4, src8, src9, src10, src11); LD_SP4(psrc2 + 2 * lda, 4, src12, src13, src14, src15); ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst0, 4); ST_SP8_INC(src8, src9, src10, src11, src12, src13, src14, src15, pdst0, 4); psrc1 += 4 * lda; psrc2 += 4 * lda; } if (m & 2) { LD_SP4(psrc1, 4, src0, src1, src2, src3); LD_SP4(psrc2, 4, src4, src5, src6, src7); ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst0, 4); psrc1 += 2 * lda; psrc2 += 2 * lda; } if (m & 1) { LD_SP4(psrc1, 4, src0, src1, src2, src3); ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); } } if (n & 4) { psrc1 = psrc0; psrc2 = psrc0 + lda; psrc0 += 8; for (i = (m >> 2); i--;) { LD_SP2(psrc1, 4, src0, src1); LD_SP2(psrc2, 4, src2, src3); LD_SP2(psrc1 + 2 * lda, 4, src4, src5); LD_SP2(psrc2 + 2 * lda, 4, src6, src7); ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); ST_SP4_INC(src4, src5, src6, src7, pdst0, 4); psrc1 += 4 * lda; psrc2 += 4 * lda; } if (m & 2) { LD_SP2(psrc1, 4, src0, src1); LD_SP2(psrc2, 4, src2, src3); ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); psrc1 += 2 * lda; psrc2 += 2 * lda; } if (m & 1) { LD_SP2(psrc1, 4, src0, src1); ST_SP2_INC(src0, src1, pdst0, 4); } } if (n & 2) { psrc1 = psrc0; psrc2 = psrc0 + lda; psrc0 += 4; for (i = (m >> 2); i--;) { src0 = LD_SP(psrc1); src1 = LD_SP(psrc2); src2 = LD_SP(psrc1 + 2 * lda); src3 = LD_SP(psrc2 + 2 * lda); ST_SP4_INC(src0, src1, src2, src3, pdst0, 4); psrc1 += 4 * lda; psrc2 += 4 * lda; } if (m & 2) { src0 = LD_SP(psrc1); src1 = LD_SP(psrc2); ST_SP2_INC(src0, src1, pdst0, 4); psrc1 += 2 * lda; psrc2 += 2 * lda; } if (m & 1) { src0 = LD_SP(psrc1); ST_SP(src0, pdst0); pdst0 += 4; } } if (n & 1) { psrc1 = psrc0; psrc2 = psrc0 + lda; psrc0 += 2; for (i = (m >> 2); i--;) { ctemp01 = *(psrc1 + 0); ctemp02 = *(psrc1 + 1); ctemp03 = *(psrc2 + 0); ctemp04 = *(psrc2 + 1); *(pdst0 + 0) = ctemp01; *(pdst0 + 1) = ctemp02; *(pdst0 + 2) = ctemp03; *(pdst0 + 3) = ctemp04; psrc1 += 2 * lda; psrc2 += 2 * lda; pdst0 += 4; ctemp01 = *(psrc1 + 0); ctemp02 = *(psrc1 + 1); ctemp03 = *(psrc2 + 0); ctemp04 = *(psrc2 + 1); *(pdst0 + 0) = ctemp01; *(pdst0 + 1) = ctemp02; *(pdst0 + 2) = 
ctemp03; *(pdst0 + 3) = ctemp04; psrc1 += 2 * lda; psrc2 += 2 * lda; pdst0 += 4; } if (m & 2) { ctemp01 = *(psrc1 + 0); ctemp02 = *(psrc1 + 1); ctemp03 = *(psrc2 + 0); ctemp04 = *(psrc2 + 1); *(pdst0 + 0) = ctemp01; *(pdst0 + 1) = ctemp02; *(pdst0 + 2) = ctemp03; *(pdst0 + 3) = ctemp04; psrc1 += 2 * lda; psrc2 += 2 * lda; pdst0 += 4; } if (m & 1) { ctemp01 = *(psrc1 + 0); ctemp02 = *(psrc1 + 1); *(pdst0 + 0) = ctemp01; *(pdst0 + 1) = ctemp02; pdst0 += 2; } } return 0; } OpenBLAS-0.2.20/kernel/mips/cgemv_n_msa.c000066400000000000000000000655661313527062700200400ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #include "common.h" #include "macros_msa.h" #undef OP0 #undef OP1 #undef OP2 #undef OP3 #undef OP4 #if !defined(XCONJ) #define OP3 -= #define OP4 += #else #define OP3 += #define OP4 -= #endif #if !defined(CONJ) #if !defined(XCONJ) #define OP0 -= #define OP1 += #define OP2 += #else #define OP0 += #define OP1 += #define OP2 -= #endif #else #if !defined(XCONJ) #define OP0 += #define OP1 -= #define OP2 -= #else #define OP0 -= #define OP1 -= #define OP2 += #endif #endif #define CGEMV_N_8x4() \ LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \ LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \ LD_SP4(pa2 + k, 4, t8, t9, t10, t11); \ LD_SP4(pa3 + k, 4, t12, t13, t14, t15); \ \ PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ PCKEVOD_W2_SP(t3, t2, src1r, src1i); \ PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ PCKEVOD_W2_SP(t7, t6, src3r, src3i); \ PCKEVOD_W2_SP(t9, t8, src4r, src4i); \ PCKEVOD_W2_SP(t11, t10, src5r, src5i); \ PCKEVOD_W2_SP(t13, t12, src6r, src6i); \ PCKEVOD_W2_SP(t15, t14, src7r, src7i); \ \ y0r += tp0r * src0r; \ y1r += tp0r * src1r; \ y0r += tp1r * src2r; \ y1r += tp1r * src3r; \ y0r += tp2r * src4r; \ y1r += tp2r * src5r; \ y0r += tp3r * src6r; \ y1r += tp3r * src7r; \ \ y0r OP0 tp0i * src0i; \ y1r OP0 tp0i * src1i; \ y0r OP0 tp1i * src2i; \ y1r OP0 tp1i * src3i; \ y0r OP0 tp2i * src4i; \ y1r OP0 tp2i * src5i; \ y0r OP0 tp3i * src6i; \ y1r OP0 tp3i * src7i; \ \ y0i OP1 tp0r * src0i; \ y1i OP1 tp0r * src1i; \ y0i OP1 tp1r * src2i; \ y1i OP1 tp1r * src3i; \ y0i OP1 tp2r * src4i; \ y1i OP1 tp2r * src5i; \ y0i OP1 tp3r * src6i; \ y1i OP1 tp3r * src7i; \ \ y0i OP2 tp0i * src0r; \ y1i OP2 tp0i * src1r; \ y0i OP2 tp1i * src2r; \ y1i OP2 tp1i * src3r; \ y0i OP2 tp2i * src4r; \ y1i OP2 tp2i * src5r; \ y0i OP2 tp3i * src6r; \ y1i OP2 tp3i * src7r; \ #define CGEMV_N_4x4() \ LD_SP2(pa0 + k, 4, t0, t1); \ LD_SP2(pa1 + k, 4, t4, t5); \ LD_SP2(pa2 + k, 4, t8, t9); \ LD_SP2(pa3 + k, 4, t12, t13); \ \ PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ PCKEVOD_W2_SP(t9, t8, src4r, src4i); \ PCKEVOD_W2_SP(t13, t12, src6r, src6i); \ \ y0r += tp0r * src0r; \ y0r += tp1r * src2r; \ y0r += tp2r * src4r; \ y0r += tp3r * src6r; \ \ y0r OP0 tp0i * src0i; \ y0r OP0 tp1i * src2i; \ y0r OP0 tp2i * src4i; \ y0r OP0 tp3i * src6i; \ \ y0i OP1 tp0r * src0i; \ y0i OP1 tp1r * src2i; \ y0i OP1 tp2r * src4i; \ y0i OP1 tp3r * src6i; \ \ y0i OP2 tp0i * src0r; \ y0i OP2 tp1i * src2r; \ y0i OP2 tp2i * src4r; \ y0i OP2 tp3i * src6r; \ #define CGEMV_N_1x4() \ res0 = y[0 * inc_y2]; \ res1 = y[0 * inc_y2 + 1]; \ \ res0 += temp0_r * pa0[k]; \ res0 OP0 temp0_i * pa0[k + 1]; \ res0 += temp1_r * pa1[k]; \ res0 OP0 temp1_i * pa1[k + 1]; \ res0 += temp2_r * pa2[k]; \ res0 OP0 temp2_i * pa2[k + 1]; \ res0 += temp3_r * pa3[k]; \ res0 OP0 temp3_i * pa3[k + 1]; \ \ res1 OP1 temp0_r * pa0[k + 1]; \ res1 OP2 temp0_i * pa0[k]; \ res1 OP1 temp1_r * pa1[k + 1]; \ res1 OP2 temp1_i * pa1[k]; \ res1 OP1 temp2_r * pa2[k + 1]; \ res1 OP2 temp2_i * pa2[k]; \ res1 OP1 temp3_r * pa3[k + 1]; \ res1 OP2 temp3_i * pa3[k]; \ \ y[0 * inc_y2] = res0; \ y[0 * inc_y2 + 1] = res1; \ #define CGEMV_N_8x2() \ LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \ LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \ \ PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ PCKEVOD_W2_SP(t3, t2, src1r, src1i); \ PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ PCKEVOD_W2_SP(t7, t6, src3r, src3i); \ \ y0r += tp0r * src0r; \ y1r += tp0r * src1r; \ y0r += tp1r * src2r; \ y1r += tp1r * src3r; \ \ y0r OP0 tp0i * src0i; \ y1r OP0 tp0i * src1i; \ y0r OP0 
tp1i * src2i; \ y1r OP0 tp1i * src3i; \ \ y0i OP1 tp0r * src0i; \ y1i OP1 tp0r * src1i; \ y0i OP1 tp1r * src2i; \ y1i OP1 tp1r * src3i; \ \ y0i OP2 tp0i * src0r; \ y1i OP2 tp0i * src1r; \ y0i OP2 tp1i * src2r; \ y1i OP2 tp1i * src3r; \ #define CGEMV_N_4x2() \ LD_SP2(pa0 + k, 4, t0, t1); \ LD_SP2(pa1 + k, 4, t4, t5); \ \ PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ \ y0r += tp0r * src0r; \ y0r += tp1r * src2r; \ \ y0r OP0 tp0i * src0i; \ y0r OP0 tp1i * src2i; \ \ y0i OP1 tp0r * src0i; \ y0i OP1 tp1r * src2i; \ \ y0i OP2 tp0i * src0r; \ y0i OP2 tp1i * src2r; \ #define CGEMV_N_1x2() \ res0 = y[0 * inc_y2]; \ res1 = y[0 * inc_y2 + 1]; \ \ res0 += temp0_r * pa0[k]; \ res0 OP0 temp0_i * pa0[k + 1]; \ res0 += temp1_r * pa1[k]; \ res0 OP0 temp1_i * pa1[k + 1]; \ \ res1 OP1 temp0_r * pa0[k + 1]; \ res1 OP2 temp0_i * pa0[k]; \ res1 OP1 temp1_r * pa1[k + 1]; \ res1 OP2 temp1_i * pa1[k]; \ \ y[0 * inc_y2] = res0; \ y[0 * inc_y2 + 1] = res1; \ #define CGEMV_N_1x1() \ res0 = y[0 * inc_y2]; \ res1 = y[0 * inc_y2 + 1]; \ \ res0 += temp_r * pa0[k]; \ res0 OP0 temp_i * pa0[k + 1]; \ \ res1 OP1 temp_r * pa0[k + 1]; \ res1 OP2 temp_i * pa0[k]; \ \ y[0 * inc_y2] = res0; \ y[0 * inc_y2 + 1] = res1; \ #define CLOAD_X4_SCALE_VECTOR() \ LD_SP2(x, 4, x0, x1); \ \ PCKEVOD_W2_SP(x1, x0, x0r, x0i); \ \ tp4r = alphar * x0r; \ tp4r OP3 alphai * x0i; \ tp4i = alphar * x0i; \ tp4i OP4 alphai * x0r; \ \ SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r); \ SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i); \ #define CLOAD_X4_SCALE_GP() \ x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2))); \ x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \ x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \ x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \ x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); \ x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \ x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \ x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1))); \ \ tp4r = alphar * x0r; \ tp4r OP3 alphai * x0i; \ tp4i = alphar * x0i; \ tp4i OP4 alphai * x0r; \ \ SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r); \ SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i); \ #define CLOAD_X2_SCALE_GP() \ temp0_r = alpha_r * x[0 * inc_x2]; \ temp0_r OP3 alpha_i * x[0 * inc_x2 + 1]; \ temp0_i = alpha_r * x[0 * inc_x2 + 1]; \ temp0_i OP4 alpha_i * x[0 * inc_x2]; \ \ temp1_r = alpha_r * x[1 * inc_x2]; \ temp1_r OP3 alpha_i * x[1 * inc_x2 + 1]; \ temp1_i = alpha_r * x[1 * inc_x2 + 1]; \ temp1_i OP4 alpha_i * x[1 * inc_x2]; \ \ tp0r = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_r); \ tp0i = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_i); \ tp1r = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_r); \ tp1i = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_i); \ #define CLOAD_X1_SCALE_GP() \ temp_r = alpha_r * x[0 * inc_x2]; \ temp_r OP3 alpha_i * x[0 * inc_x2 + 1]; \ temp_i = alpha_r * x[0 * inc_x2 + 1]; \ temp_i OP4 alpha_i * x[0 * inc_x2]; \ #define CLOAD_Y8_VECTOR() \ LD_SP4(y, 4, y0, y1, y2, y3); \ PCKEVOD_W2_SP(y1, y0, y0r, y0i); \ PCKEVOD_W2_SP(y3, y2, y1r, y1i); \ #define CLOAD_Y4_VECTOR() \ LD_SP2(y, 4, y0, y1); \ PCKEVOD_W2_SP(y1, y0, y0r, y0i); \ #define CSTORE_Y8_VECTOR() \ ILVRL_W2_SP(y0i, y0r, y0, y1); \ ILVRL_W2_SP(y1i, y1r, y2, y3); \ ST_SP4(y0, y1, y2, y3, y, 4); \ #define CSTORE_Y4_VECTOR() \ ILVRL_W2_SP(y0i, y0r, y0, y1); \ ST_SP2(y0, y1, y, 4); \ #define 
CLOAD_Y8_GP() \ y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2))); \ y0r = (v4f32) __msa_insert_w((v4i32) y0r, 1, *((int *)(y + 1 * inc_y2))); \ y0r = (v4f32) __msa_insert_w((v4i32) y0r, 2, *((int *)(y + 2 * inc_y2))); \ y0r = (v4f32) __msa_insert_w((v4i32) y0r, 3, *((int *)(y + 3 * inc_y2))); \ y1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2))); \ y1r = (v4f32) __msa_insert_w((v4i32) y1r, 1, *((int *)(y + 5 * inc_y2))); \ y1r = (v4f32) __msa_insert_w((v4i32) y1r, 2, *((int *)(y + 6 * inc_y2))); \ y1r = (v4f32) __msa_insert_w((v4i32) y1r, 3, *((int *)(y + 7 * inc_y2))); \ y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1))); \ y0i = (v4f32) __msa_insert_w((v4i32) y0i, 1, *((int *)(y + 1 * inc_y2 + 1))); \ y0i = (v4f32) __msa_insert_w((v4i32) y0i, 2, *((int *)(y + 2 * inc_y2 + 1))); \ y0i = (v4f32) __msa_insert_w((v4i32) y0i, 3, *((int *)(y + 3 * inc_y2 + 1))); \ y1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2 + 1))); \ y1i = (v4f32) __msa_insert_w((v4i32) y1i, 1, *((int *)(y + 5 * inc_y2 + 1))); \ y1i = (v4f32) __msa_insert_w((v4i32) y1i, 2, *((int *)(y + 6 * inc_y2 + 1))); \ y1i = (v4f32) __msa_insert_w((v4i32) y1i, 3, *((int *)(y + 7 * inc_y2 + 1))); \ #define CLOAD_Y4_GP() \ y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2))); \ y0r = (v4f32) __msa_insert_w((v4i32) y0r, 1, *((int *)(y + 1 * inc_y2))); \ y0r = (v4f32) __msa_insert_w((v4i32) y0r, 2, *((int *)(y + 2 * inc_y2))); \ y0r = (v4f32) __msa_insert_w((v4i32) y0r, 3, *((int *)(y + 3 * inc_y2))); \ y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1))); \ y0i = (v4f32) __msa_insert_w((v4i32) y0i, 1, *((int *)(y + 1 * inc_y2 + 1))); \ y0i = (v4f32) __msa_insert_w((v4i32) y0i, 2, *((int *)(y + 2 * inc_y2 + 1))); \ y0i = (v4f32) __msa_insert_w((v4i32) y0i, 3, *((int *)(y + 3 * inc_y2 + 1))); \ #define CSTORE_Y8_GP() \ *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0); \ *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1); \ *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2); \ *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3); \ *((int *)(y + 4 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 0); \ *((int *)(y + 5 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 1); \ *((int *)(y + 6 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 2); \ *((int *)(y + 7 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 3); \ *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0); \ *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1); \ *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2); \ *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3); \ *((int *)(y + 4 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 0); \ *((int *)(y + 5 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 1); \ *((int *)(y + 6 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 2); \ *((int *)(y + 7 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 3); \ #define CSTORE_Y4_GP() \ *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0); \ *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1); \ *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2); \ *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3); \ *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0); \ *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1); \ *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2); \ *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3); \ #define CGEMV_N_MSA() \ 
for (j = (n >> 2); j--;) \ { \ CLOAD_X4_SCALE(); \ \ k = 0; \ k_pref = pref_offset; \ y = y_org; \ \ for (i = (m >> 3); i--;) \ { \ PREFETCH(pa0 + k_pref + 16 + 0); \ PREFETCH(pa0 + k_pref + 16 + 8); \ PREFETCH(pa1 + k_pref + 16 + 0); \ PREFETCH(pa1 + k_pref + 16 + 8); \ PREFETCH(pa2 + k_pref + 16 + 0); \ PREFETCH(pa2 + k_pref + 16 + 8); \ PREFETCH(pa3 + k_pref + 16 + 0); \ PREFETCH(pa3 + k_pref + 16 + 8); \ \ CLOAD_Y8() \ CGEMV_N_8x4(); \ CSTORE_Y8(); \ \ k += 2 * 8; \ k_pref += 2 * 8; \ y += inc_y2 * 8; \ } \ \ if (m & 4) \ { \ CLOAD_Y4(); \ CGEMV_N_4x4(); \ CSTORE_Y4(); \ \ k += 2 * 4; \ y += inc_y2 * 4; \ } \ \ if (m & 3) \ { \ temp0_r = tp4r[0]; \ temp1_r = tp4r[1]; \ temp2_r = tp4r[2]; \ temp3_r = tp4r[3]; \ \ temp0_i = tp4i[0]; \ temp1_i = tp4i[1]; \ temp2_i = tp4i[2]; \ temp3_i = tp4i[3]; \ \ for (i = (m & 3); i--;) \ { \ CGEMV_N_1x4(); \ \ k += 2; \ y += inc_y2; \ } \ } \ \ pa0 += 4 * lda2; \ pa1 += 4 * lda2; \ pa2 += 4 * lda2; \ pa3 += 4 * lda2; \ \ x += 4 * inc_x2; \ } \ \ if (n & 2) \ { \ CLOAD_X2_SCALE(); \ \ k = 0; \ y = y_org; \ \ for (i = (m >> 3); i--;) \ { \ CLOAD_Y8(); \ CGEMV_N_8x2(); \ CSTORE_Y8(); \ \ k += 2 * 8; \ y += inc_y2 * 8; \ } \ \ if (m & 4) \ { \ CLOAD_Y4(); \ CGEMV_N_4x2(); \ CSTORE_Y4(); \ \ k += 2 * 4; \ y += inc_y2 * 4; \ } \ \ for (i = (m & 3); i--;) \ { \ CGEMV_N_1x2(); \ \ k += 2; \ y += inc_y2; \ } \ \ pa0 += 2 * lda2; \ pa1 += 2 * lda2; \ \ x += 2 * inc_x2; \ } \ \ if (n & 1) \ { \ CLOAD_X1_SCALE(); \ \ k = 0; \ y = y_org; \ \ for (i = m; i--;) \ { \ CGEMV_N_1x1(); \ \ k += 2; \ y += inc_y2; \ } \ \ pa0 += lda2; \ x += inc_x2; \ } \ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *A, BLASLONG lda2, FLOAT *x, BLASLONG inc_x2, FLOAT *y, BLASLONG inc_y2, FLOAT *buffer) { BLASLONG i, j, k, k_pref, pref_offset; FLOAT *y_org = y; FLOAT *pa0, *pa1, *pa2, *pa3; FLOAT temp_r, temp_i, res0, res1, temp0_r; FLOAT temp0_i, temp1_r, temp1_i, temp2_r, temp2_i, temp3_r, temp3_i; v4f32 alphar, alphai; v4f32 x0, x1, y0, y1, y2, y3, x0r, x0i, y0r, y1r, y0i, y1i; v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r; v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i; v4f32 tp0r, tp1r, tp2r, tp3r, tp4r, tp0i, tp1i, tp2i, tp3i, tp4i; lda2 = 2 * lda2; inc_x2 = 2 * inc_x2; inc_y2 = 2 * inc_y2; pref_offset = (uintptr_t)A & (L1_DATA_LINESIZE - 1); pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); pa0 = A; pa1 = A + lda2; pa2 = A + 2 * lda2; pa3 = A + 3 * lda2; alphar = COPY_FLOAT_TO_VECTOR(alpha_r); alphai = COPY_FLOAT_TO_VECTOR(alpha_i); if ((2 == inc_x2) && (2 == inc_y2)) { #define CLOAD_X4_SCALE CLOAD_X4_SCALE_VECTOR #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP #define CLOAD_Y8 CLOAD_Y8_VECTOR #define CLOAD_Y4 CLOAD_Y4_VECTOR #define CSTORE_Y8 CSTORE_Y8_VECTOR #define CSTORE_Y4 CSTORE_Y4_VECTOR CGEMV_N_MSA(); #undef CLOAD_X4_SCALE #undef CLOAD_X2_SCALE #undef CLOAD_X1_SCALE #undef CLOAD_Y8 #undef CLOAD_Y4 #undef CSTORE_Y8 #undef CSTORE_Y4 } else if (2 == inc_x2) { #define CLOAD_X4_SCALE CLOAD_X4_SCALE_VECTOR #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP #define CLOAD_Y8 CLOAD_Y8_GP #define CLOAD_Y4 CLOAD_Y4_GP #define CSTORE_Y8 CSTORE_Y8_GP #define CSTORE_Y4 CSTORE_Y4_GP CGEMV_N_MSA(); #undef CLOAD_X4_SCALE #undef CLOAD_X2_SCALE #undef CLOAD_X1_SCALE #undef CLOAD_Y8 #undef CLOAD_Y4 #undef CSTORE_Y8 #undef CSTORE_Y4 } else if (2 == 
inc_y2) { #define CLOAD_X4_SCALE CLOAD_X4_SCALE_GP #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP #define CLOAD_Y8 CLOAD_Y8_VECTOR #define CLOAD_Y4 CLOAD_Y4_VECTOR #define CSTORE_Y8 CSTORE_Y8_VECTOR #define CSTORE_Y4 CSTORE_Y4_VECTOR CGEMV_N_MSA(); #undef CLOAD_X4_SCALE #undef CLOAD_X2_SCALE #undef CLOAD_X1_SCALE #undef CLOAD_Y8 #undef CLOAD_Y4 #undef CSTORE_Y8 #undef CSTORE_Y4 } else { #define CLOAD_X4_SCALE CLOAD_X4_SCALE_GP #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP #define CLOAD_Y8 CLOAD_Y8_GP #define CLOAD_Y4 CLOAD_Y4_GP #define CSTORE_Y8 CSTORE_Y8_GP #define CSTORE_Y4 CSTORE_Y4_GP CGEMV_N_MSA(); #undef CLOAD_X4_SCALE #undef CLOAD_X2_SCALE #undef CLOAD_X1_SCALE #undef CLOAD_Y8 #undef CLOAD_Y4 #undef CSTORE_Y8 #undef CSTORE_Y4 } return(0); } #undef OP0 #undef OP1 #undef OP2 #undef OP3 #undef OP4 OpenBLAS-0.2.20/kernel/mips/cgemv_t_msa.c000066400000000000000000000652501313527062700200340ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #include "common.h" #include "macros_msa.h" #undef OP0 #undef OP1 #undef OP2 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #define OP0 -= #define OP1 += #define OP2 += #else #define OP0 += #define OP1 += #define OP2 -= #endif #define CGEMV_T_8x4() \ LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \ LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \ LD_SP4(pa2 + k, 4, t8, t9, t10, t11); \ LD_SP4(pa3 + k, 4, t12, t13, t14, t15); \ \ PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ PCKEVOD_W2_SP(t3, t2, src1r, src1i); \ PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ PCKEVOD_W2_SP(t7, t6, src3r, src3i); \ PCKEVOD_W2_SP(t9, t8, src4r, src4i); \ PCKEVOD_W2_SP(t11, t10, src5r, src5i); \ PCKEVOD_W2_SP(t13, t12, src6r, src6i); \ PCKEVOD_W2_SP(t15, t14, src7r, src7i); \ \ tp0r += src0r * x0r; \ tp0r += src1r * x1r; \ tp0r OP0 src0i * x0i; \ tp0r OP0 src1i * x1i; \ \ tp1r += src2r * x0r; \ tp1r += src3r * x1r; \ tp1r OP0 src2i * x0i; \ tp1r OP0 src3i * x1i; \ \ tp2r += src4r * x0r; \ tp2r += src5r * x1r; \ tp2r OP0 src4i * x0i; \ tp2r OP0 src5i * x1i; \ \ tp3r += src6r * x0r; \ tp3r += src7r * x1r; \ tp3r OP0 src6i * x0i; \ tp3r OP0 src7i * x1i; \ \ tp0i OP1 src0r * x0i; \ tp0i OP1 src1r * x1i; \ tp0i OP2 src0i * x0r; \ tp0i OP2 src1i * x1r; \ \ tp1i OP1 src2r * x0i; \ tp1i OP1 src3r * x1i; \ tp1i OP2 src2i * x0r; \ tp1i OP2 src3i * x1r; \ \ tp2i OP1 src4r * x0i; \ tp2i OP1 src5r * x1i; \ tp2i OP2 src4i * x0r; \ tp2i OP2 src5i * x1r; \ \ tp3i OP1 src6r * x0i; \ tp3i OP1 src7r * x1i; \ tp3i OP2 src6i * x0r; \ tp3i OP2 src7i * x1r; \ #define CGEMV_T_8x2() \ LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \ LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \ \ PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ PCKEVOD_W2_SP(t3, t2, src1r, src1i); \ PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ PCKEVOD_W2_SP(t7, t6, src3r, src3i); \ \ tp0r += src0r * x0r; \ tp0r += src1r * x1r; \ tp0r OP0 src0i * x0i; \ tp0r OP0 src1i * x1i; \ \ tp1r += src2r * x0r; \ tp1r += src3r * x1r; \ tp1r OP0 src2i * x0i; \ tp1r OP0 src3i * x1i; \ \ tp0i OP1 src0r * x0i; \ tp0i OP1 src1r * x1i; \ tp0i OP2 src0i * x0r; \ tp0i OP2 src1i * x1r; \ \ tp1i OP1 src2r * x0i; \ tp1i OP1 src3r * x1i; \ tp1i OP2 src2i * x0r; \ tp1i OP2 src3i * x1r; \ #define CGEMV_T_8x1() \ LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \ \ PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ PCKEVOD_W2_SP(t3, t2, src1r, src1i); \ \ tp0r += src0r * x0r; \ tp0r += src1r * x1r; \ tp0r OP0 src0i * x0i; \ tp0r OP0 src1i * x1i; \ \ tp0i OP1 src0r * x0i; \ tp0i OP1 src1r * x1i; \ tp0i OP2 src0i * x0r; \ tp0i OP2 src1i * x1r; \ #define CGEMV_T_4x4() \ LD_SP2(pa0 + k, 4, t0, t1); \ LD_SP2(pa1 + k, 4, t4, t5); \ LD_SP2(pa2 + k, 4, t8, t9); \ LD_SP2(pa3 + k, 4, t12, t13); \ \ PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ PCKEVOD_W2_SP(t9, t8, src4r, src4i); \ PCKEVOD_W2_SP(t13, t12, src6r, src6i); \ \ tp0r += src0r * x0r; \ tp0r OP0 src0i * x0i; \ \ tp1r += src2r * x0r; \ tp1r OP0 src2i * x0i; \ \ tp2r += src4r * x0r; \ tp2r OP0 src4i * x0i; \ \ tp3r += src6r * x0r; \ tp3r OP0 src6i * x0i; \ \ tp0i OP1 src0r * x0i; \ tp0i OP2 src0i * x0r; \ \ tp1i OP1 src2r * x0i; \ tp1i OP2 src2i * x0r; \ \ tp2i OP1 src4r * x0i; \ tp2i OP2 src4i * x0r; \ \ tp3i OP1 src6r * x0i; \ tp3i OP2 src6i * x0r; \ #define CGEMV_T_4x2() \ LD_SP2(pa0 + k, 4, t0, t1); \ LD_SP2(pa1 + k, 4, t4, t5); \ \ PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ PCKEVOD_W2_SP(t5, t4, src2r, src2i); \ \ tp0r += src0r * x0r; \ tp0r OP0 src0i * x0i; \ \ tp1r += src2r * x0r; \ tp1r OP0 
src2i * x0i; \ \ tp0i OP1 src0r * x0i; \ tp0i OP2 src0i * x0r; \ \ tp1i OP1 src2r * x0i; \ tp1i OP2 src2i * x0r; \ #define CGEMV_T_4x1() \ LD_SP2(pa0 + k, 4, t0, t1); \ \ PCKEVOD_W2_SP(t1, t0, src0r, src0i); \ \ tp0r += src0r * x0r; \ tp0r OP0 src0i * x0i; \ \ tp0i OP1 src0r * x0i; \ tp0i OP2 src0i * x0r; \ #define CGEMV_T_1x4() \ temp0r += pa0[k + 0] * x[0 * inc_x2]; \ temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \ temp1r += pa1[k + 0] * x[0 * inc_x2]; \ temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1]; \ temp2r += pa2[k + 0] * x[0 * inc_x2]; \ temp2r OP0 pa2[k + 1] * x[0 * inc_x2 + 1]; \ temp3r += pa3[k + 0] * x[0 * inc_x2]; \ temp3r OP0 pa3[k + 1] * x[0 * inc_x2 + 1]; \ \ temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; \ temp0i OP2 pa0[k + 1] * x[0 * inc_x2]; \ temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1]; \ temp1i OP2 pa1[k + 1] * x[0 * inc_x2]; \ temp2i OP1 pa2[k + 0] * x[0 * inc_x2 + 1]; \ temp2i OP2 pa2[k + 1] * x[0 * inc_x2]; \ temp3i OP1 pa3[k + 0] * x[0 * inc_x2 + 1]; \ temp3i OP2 pa3[k + 1] * x[0 * inc_x2]; \ #define CGEMV_T_1x2() \ temp0r += pa0[k + 0] * x[0 * inc_x2]; \ temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \ temp1r += pa1[k + 0] * x[0 * inc_x2]; \ temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1]; \ \ temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; \ temp0i OP2 pa0[k + 1] * x[0 * inc_x2]; \ temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1]; \ temp1i OP2 pa1[k + 1] * x[0 * inc_x2]; \ #define CGEMV_T_1x1() \ temp0r += pa0[k + 0] * x[0 * inc_x2]; \ temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1]; \ \ temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1]; \ temp0i OP2 pa0[k + 1] * x[0 * inc_x2]; \ #define CSCALE_STORE_Y4_GP() \ res0r = y[0 * inc_y2]; \ res1r = y[1 * inc_y2]; \ res2r = y[2 * inc_y2]; \ res3r = y[3 * inc_y2]; \ \ res0i = y[0 * inc_y2 + 1]; \ res1i = y[1 * inc_y2 + 1]; \ res2i = y[2 * inc_y2 + 1]; \ res3i = y[3 * inc_y2 + 1]; \ \ res0r += alphar * temp0r; \ res0r OP0 alphai * temp0i; \ res1r += alphar * temp1r; \ res1r OP0 alphai * temp1i; \ res2r += alphar * temp2r; \ res2r OP0 alphai * temp2i; \ res3r += alphar * temp3r; \ res3r OP0 alphai * temp3i; \ \ res0i OP1 alphar * temp0i; \ res0i OP2 alphai * temp0r; \ res1i OP1 alphar * temp1i; \ res1i OP2 alphai * temp1r; \ res2i OP1 alphar * temp2i; \ res2i OP2 alphai * temp2r; \ res3i OP1 alphar * temp3i; \ res3i OP2 alphai * temp3r; \ \ y[0 * inc_y2] = res0r; \ y[1 * inc_y2] = res1r; \ y[2 * inc_y2] = res2r; \ y[3 * inc_y2] = res3r; \ \ y[0 * inc_y2 + 1] = res0i; \ y[1 * inc_y2 + 1] = res1i; \ y[2 * inc_y2 + 1] = res2i; \ y[3 * inc_y2 + 1] = res3i; \ #define CSCALE_STORE_Y2_GP() \ res0r = y[0 * inc_y2]; \ res1r = y[1 * inc_y2]; \ \ res0i = y[0 * inc_y2 + 1]; \ res1i = y[1 * inc_y2 + 1]; \ \ res0r += alphar * temp0r; \ res0r OP0 alphai * temp0i; \ res1r += alphar * temp1r; \ res1r OP0 alphai * temp1i; \ \ res0i OP1 alphar * temp0i; \ res0i OP2 alphai * temp0r; \ res1i OP1 alphar * temp1i; \ res1i OP2 alphai * temp1r; \ \ y[0 * inc_y2] = res0r; \ y[1 * inc_y2] = res1r; \ \ y[0 * inc_y2 + 1] = res0i; \ y[1 * inc_y2 + 1] = res1i; \ #define CSCALE_STORE_Y1_GP() \ res0r = y[0 * inc_y2]; \ res0i = y[0 * inc_y2 + 1]; \ \ res0r += alphar * temp0r; \ res0r OP0 alphai * temp0i; \ \ res0i OP1 alphar * temp0i; \ res0i OP2 alphai * temp0r; \ \ y[0 * inc_y2] = res0r; \ y[0 * inc_y2 + 1] = res0i; \ #define CLOAD_X8_VECTOR() \ LD_SP4(x, 4, x0, x1, x2, x3); \ PCKEVOD_W2_SP(x1, x0, x0r, x0i); \ PCKEVOD_W2_SP(x3, x2, x1r, x1i); \ #define CLOAD_X4_VECTOR() \ LD_SP2(x, 4, x0, x1); \ PCKEVOD_W2_SP(x1, x0, x0r, x0i); \ #define CLOAD_X8_GP() \ x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, 
*((int *) (x + 0 * inc_x2))); \ x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \ x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \ x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \ x1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2))); \ x1r = (v4f32) __msa_insert_w((v4i32) x1r, 1, *((int *) (x + 5 * inc_x2))); \ x1r = (v4f32) __msa_insert_w((v4i32) x1r, 2, *((int *) (x + 6 * inc_x2))); \ x1r = (v4f32) __msa_insert_w((v4i32) x1r, 3, *((int *) (x + 7 * inc_x2))); \ x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); \ x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \ x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \ x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1))); \ x1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2 + 1))); \ x1i = (v4f32) __msa_insert_w((v4i32) x1i, 1, *((int *) (x + 5 * inc_x2 + 1))); \ x1i = (v4f32) __msa_insert_w((v4i32) x1i, 2, *((int *) (x + 6 * inc_x2 + 1))); \ x1i = (v4f32) __msa_insert_w((v4i32) x1i, 3, *((int *) (x + 7 * inc_x2 + 1))); \ #define CLOAD_X4_GP() \ x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2))); \ x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \ x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \ x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \ x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); \ x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \ x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \ x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1))); \ #define CGEMV_T_MSA() \ for (j = (n >> 2); j--;) \ { \ tp0r = tp1r = tp2r = tp3r = zero; \ tp0i = tp1i = tp2i = tp3i = zero; \ \ k = 0; \ k_pref = pref_offset; \ x = srcx_org; \ \ for (i = (m >> 3); i--;) \ { \ PREFETCH(pa0 + k_pref + 16 + 0); \ PREFETCH(pa0 + k_pref + 16 + 8); \ PREFETCH(pa1 + k_pref + 16 + 0); \ PREFETCH(pa1 + k_pref + 16 + 8); \ PREFETCH(pa2 + k_pref + 16 + 0); \ PREFETCH(pa2 + k_pref + 16 + 8); \ PREFETCH(pa3 + k_pref + 16 + 0); \ PREFETCH(pa3 + k_pref + 16 + 8); \ \ CLOAD_X8() \ CGEMV_T_8x4(); \ \ k += 2 * 8; \ k_pref += 2 * 8; \ x += inc_x2 * 8; \ } \ \ if (m & 4) \ { \ CLOAD_X4(); \ \ CGEMV_T_4x4(); \ \ k += 2 * 4; \ x += inc_x2 * 4; \ } \ \ TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp2r, tp3r, \ tp0r, tp1r, tp2r, tp3r); \ TRANSPOSE4x4_SP_SP(tp0i, tp1i, tp2i, tp3i, \ tp0i, tp1i, tp2i, tp3i); \ \ tp0r += tp1r; \ tp0r += tp2r; \ tp0r += tp3r; \ tp0i += tp1i; \ tp0i += tp2i; \ tp0i += tp3i; \ \ temp0r = tp0r[0]; \ temp1r = tp0r[1]; \ temp2r = tp0r[2]; \ temp3r = tp0r[3]; \ temp0i = tp0i[0]; \ temp1i = tp0i[1]; \ temp2i = tp0i[2]; \ temp3i = tp0i[3]; \ \ for (i = (m & 3); i--;) \ { \ CGEMV_T_1x4(); \ \ k += 2; \ x += inc_x2; \ } \ \ CSCALE_STORE_Y4_GP(); \ \ pa0 += 4 * lda2; \ pa1 += 4 * lda2; \ pa2 += 4 * lda2; \ pa3 += 4 * lda2; \ y += 4 * inc_y2; \ } \ \ if (n & 2) \ { \ tp0r = tp1r = zero; \ tp0i = tp1i = zero; \ \ k = 0; \ x = srcx_org; \ \ for (i = (m >> 3); i--;) \ { \ CLOAD_X8(); \ \ CGEMV_T_8x2(); \ \ k += 2 * 8; \ x += inc_x2 * 8; \ } \ \ if (m & 4) \ { \ CLOAD_X4(); \ \ CGEMV_T_4x2(); \ \ k += 2 * 4; \ x += inc_x2 * 4; \ } \ \ TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp0i, tp1i, \ tp0r, tp1r, tp0i, tp1i); \ \ tp0r += tp1r; \ tp0r += 
tp0i; \ tp0r += tp1i; \ \ temp0r = tp0r[0]; \ temp1r = tp0r[1]; \ temp0i = tp0r[2]; \ temp1i = tp0r[3]; \ \ for (i = (m & 3); i--;) \ { \ CGEMV_T_1x2(); \ \ k += 2; \ x += inc_x2; \ } \ \ CSCALE_STORE_Y2_GP(); \ \ pa0 += 2 * lda2; \ pa1 += 2 * lda2; \ y += 2 * inc_y2; \ } \ \ if (n & 1) \ { \ tp0r = zero; \ tp0i = zero; \ \ k = 0; \ x = srcx_org; \ \ for (i = (m >> 3); i--;) \ { \ CLOAD_X8(); \ \ CGEMV_T_8x1(); \ \ k += 2 * 8; \ x += inc_x2 * 8; \ } \ \ if (m & 4) \ { \ CLOAD_X4(); \ \ CGEMV_T_4x1(); \ \ k += 2 * 4; \ x += inc_x2 * 4; \ } \ \ ILVRL_W2_SP(tp0i, tp0r, t0, t1); \ \ t0 += t1; \ \ temp0r = t0[0] + t0[2]; \ temp0i = t0[1] + t0[3]; \ \ for (i = (m & 3); i--;) \ { \ CGEMV_T_1x1(); \ \ k += 2; \ x += inc_x2; \ } \ \ CSCALE_STORE_Y1_GP(); \ \ pa0 += lda2; \ y += inc_y2; \ } \ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alphar, FLOAT alphai, FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { BLASLONG i, j, k, k_pref, pref_offset; FLOAT *pa0, *pa1, *pa2, *pa3; FLOAT *srcx_org = x; FLOAT temp0r, temp0i, temp2r, temp2i, temp1r, temp1i, temp3r, temp3i; FLOAT res0r, res0i, res2r, res2i, res1r, res1i, res3r, res3i; BLASLONG inc_x2, inc_y2, lda2; v4f32 zero = {0}; v4f32 x0, x1, x2, x3, x0r, x1r, x0i, x1i; v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r; v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i; v4f32 tp0r, tp1r, tp2r, tp3r, tp0i, tp1i, tp2i, tp3i; lda2 = 2 * lda; pref_offset = (uintptr_t)A & (L1_DATA_LINESIZE - 1); pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); pa0 = A; pa1 = A + lda2; pa2 = A + 2 * lda2; pa3 = A + 3 * lda2; inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; if (2 == inc_x2) { #define CLOAD_X8 CLOAD_X8_VECTOR #define CLOAD_X4 CLOAD_X4_VECTOR CGEMV_T_MSA(); #undef CLOAD_X8 #undef CLOAD_X4 } else { #define CLOAD_X8 CLOAD_X8_GP #define CLOAD_X4 CLOAD_X4_GP CGEMV_T_MSA(); #undef CLOAD_X8 #undef CLOAD_X4 } return(0); } #undef OP0 #undef OP1 #undef OP2 OpenBLAS-0.2.20/kernel/mips/copy.c000066400000000000000000000035611313527062700165170ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix=0,iy=0; if ( n < 0 ) return(0); while(i < n) { y[iy] = x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } return(0); } OpenBLAS-0.2.20/kernel/mips/crot_msa.c000066400000000000000000001045461313527062700173610ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) { BLASLONG i, j; FLOAT *px, *py; FLOAT tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; FLOAT fx0, fx1, fx2, fx3, fy0, fy1, fy2, fy3; BLASLONG inc_x2, inc_y2; v4f32 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7; v4f32 out0, out1, out2, out3, out4, out5, out6, out7; v4f32 out8, out9, out10, out11, out12, out13, out14, out15, c0, s0; if (n <= 0) return (0); px = x; py = y; if ((1 == inc_x) && (1 == inc_y)) { if ((0 == c) && (0 == s)) { v4f32 zero = __msa_cast_to_vector_float(0); zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); /* process 2 elements */ for (j = (n >> 1); j--;) { ST_SP(zero, px); ST_SP(zero, py); px += 4; py += 4; } if (n & 1) { px[0] = 0; px[1] = 0; py[0] = 0; py[1] = 0; } } else if ((1 == c) && (1 == s)) { if (n >> 4) { BLASLONG pref_offsetx, pref_offsety; pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); if (pref_offsetx > 0) { pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; pref_offsetx = pref_offsetx / sizeof(FLOAT); } pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); if (pref_offsety > 0) { pref_offsety = L1_DATA_LINESIZE - pref_offsety; pref_offsety = pref_offsety / sizeof(FLOAT); } x0 = LD_SP(px); px += 4; x1 = LD_SP(px); px += 4; x2 = LD_SP(px); px += 4; x3 = LD_SP(px); px += 4; y0 = LD_SP(py); py += 4; y1 = LD_SP(py); py += 4; y2 = LD_SP(py); py += 4; y3 = LD_SP(py); py += 4; for (j = (n >> 4) - 1; j--;) { PREFETCH(px + pref_offsetx + 32); PREFETCH(px + pref_offsetx + 40); PREFETCH(px + pref_offsetx + 48); PREFETCH(px + pref_offsetx + 56); PREFETCH(py + pref_offsety + 32); PREFETCH(py + pref_offsety + 40); PREFETCH(py + pref_offsety + 48); PREFETCH(py + pref_offsety + 56); out0 = x0 + y0; x4 = LD_SP(px); px += 4; out1 = y0 - x0; x5 = LD_SP(px); px += 4; out2 = x1 + y1; x6 = LD_SP(px); px += 4; out3 = y1 - x1; x7 = LD_SP(px); px += 4; out4 = x2 + y2; y4 = LD_SP(py); py += 4; out5 = y2 - x2; y5 = LD_SP(py); py += 4; out6 = x3 + y3; y6 = LD_SP(py); py += 4; out7 = y3 - x3; y7 = LD_SP(py); py += 4; ST_SP(out0, x); x += 4; out8 = x4 + y4; ST_SP(out1, y); y += 4; out9 = y4 - x4; ST_SP(out2, x); x += 4; out10 = x5 + y5; ST_SP(out3, y); y += 4; out11 = y5 - x5; ST_SP(out4, x); x += 4; out12 = x6 + y6; ST_SP(out5, y); y += 4; out13 = y6 - x6; ST_SP(out6, x); x += 4; out14 = x7 + y7; ST_SP(out7, y); y += 4; out15 = y7 - x7; x0 = LD_SP(px); px += 4; ST_SP(out8, x); x += 4; x1 = LD_SP(px); px += 4; ST_SP(out10, x); x += 4; x2 = LD_SP(px); px += 4; ST_SP(out12, x); x += 4; x3 = LD_SP(px); px += 4; ST_SP(out14, x); x += 4; y0 = LD_SP(py); py += 4; ST_SP(out9, y); y += 4; y1 = LD_SP(py); py += 4; ST_SP(out11, y); y += 4; y2 = LD_SP(py); py += 4; ST_SP(out13, y); y += 4; y3 = LD_SP(py); py += 4; ST_SP(out15, y); y += 4; } x4 = LD_SP(px); px += 4; x5 = LD_SP(px); px += 4; x6 = LD_SP(px); px += 4; x7 = LD_SP(px); px += 4; y4 = LD_SP(py); py += 4; y5 = LD_SP(py); py += 4; y6 = LD_SP(py); py += 4; y7 = LD_SP(py); py += 4; out0 = x0 + y0; out1 = y0 - x0; out2 = x1 + y1; out3 = y1 - x1; out4 = x2 + y2; out5 = y2 - x2; out6 = x3 + y3; out7 = y3 - x3; out8 = x4 + y4; out9 = y4 - x4; out10 = x5 + y5; out11 = y5 - x5; out12 = x6 + y6; out13 = y6 - x6; out14 = x7 + y7; out15 = y7 - x7; ST_SP8_INC(out0, 
out2, out4, out6, out8, out10, out12, out14, x, 4); ST_SP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, 4); } if (n & 8) { LD_SP4_INC(px, 4, x0, x1, x2, x3); LD_SP4_INC(py, 4, y0, y1, y2, y3); out0 = x0 + y0; out1 = y0 - x0; out2 = x1 + y1; out3 = y1 - x1; out4 = x2 + y2; out5 = y2 - x2; out6 = x3 + y3; out7 = y3 - x3; ST_SP4_INC(out0, out2, out4, out6, x, 4); ST_SP4_INC(out1, out3, out5, out7, y, 4); } if (n & 4) { LD_SP2_INC(px, 4, x0, x1); LD_SP2_INC(py, 4, y0, y1); out0 = x0 + y0; out1 = y0 - x0; out2 = x1 + y1; out3 = y1 - x1; ST_SP2_INC(out0, out2, x, 4); ST_SP2_INC(out1, out3, y, 4); } if (n & 2) { x0 = LD_SP(px); y0 = LD_SP(py); px += 4; py += 4; out0 = x0 + y0; out1 = y0 - x0; ST_SP(out0, x); ST_SP(out1, y); x += 4; y += 4; } if (n & 1) { LD_GP2_INC(px, 1, fx0, fx1); LD_GP2_INC(py, 1, fy0, fy1); tp0 = fx0 + fy0; tp1 = fy0 - fx0; tp2 = fx1 + fy1; tp3 = fy1 - fx1; ST_GP2_INC(tp0, tp2, x, 1); ST_GP2_INC(tp1, tp3, y, 1); } } else if (0 == s) { c0 = COPY_FLOAT_TO_VECTOR(c); if (n >> 4) { BLASLONG pref_offsetx, pref_offsety; pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); if (pref_offsetx > 0) { pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; pref_offsetx = pref_offsetx / sizeof(FLOAT); } pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); if (pref_offsety > 0) { pref_offsety = L1_DATA_LINESIZE - pref_offsety; pref_offsety = pref_offsety / sizeof(FLOAT); } LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7); for (j = (n >> 4) - 1; j--;) { PREFETCH(px + pref_offsetx + 32); PREFETCH(px + pref_offsetx + 40); PREFETCH(px + pref_offsetx + 48); PREFETCH(px + pref_offsetx + 56); PREFETCH(py + pref_offsety + 32); PREFETCH(py + pref_offsety + 40); PREFETCH(py + pref_offsety + 48); PREFETCH(py + pref_offsety + 56); y0 = LD_SP(py); py += 4; x0 *= c0; y1 = LD_SP(py); py += 4; x1 *= c0; y2 = LD_SP(py); py += 4; x2 *= c0; y3 = LD_SP(py); py += 4; x3 *= c0; y4 = LD_SP(py); py += 4; x4 *= c0; y5 = LD_SP(py); py += 4; x5 *= c0; y6 = LD_SP(py); py += 4; x6 *= c0; y7 = LD_SP(py); py += 4; x7 *= c0; ST_SP(x0, x); x += 4; y0 *= c0; ST_SP(x1, x); x += 4; y1 *= c0; ST_SP(x2, x); x += 4; y2 *= c0; ST_SP(x3, x); x += 4; y3 *= c0; ST_SP(x4, x); x += 4; y4 *= c0; ST_SP(x5, x); x += 4; y5 *= c0; ST_SP(x6, x); x += 4; y6 *= c0; ST_SP(x7, x); x += 4; y7 *= c0; x0 = LD_SP(px); px += 4; ST_SP(y0, y); y += 4; x1 = LD_SP(px); px += 4; ST_SP(y1, y); y += 4; x2 = LD_SP(px); px += 4; ST_SP(y2, y); y += 4; x3 = LD_SP(px); px += 4; ST_SP(y3, y); y += 4; x4 = LD_SP(px); px += 4; ST_SP(y4, y); y += 4; x5 = LD_SP(px); px += 4; ST_SP(y5, y); y += 4; x6 = LD_SP(px); px += 4; ST_SP(y6, y); y += 4; x7 = LD_SP(px); px += 4; ST_SP(y7, y); y += 4; } LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7); x0 *= c0; y0 *= c0; x1 *= c0; y1 *= c0; x2 *= c0; y2 *= c0; x3 *= c0; y3 *= c0; x4 *= c0; y4 *= c0; x5 *= c0; y5 *= c0; x6 *= c0; y6 *= c0; x7 *= c0; y7 *= c0; ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4); ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4); } if (n & 8) { LD_SP4_INC(px, 4, x0, x1, x2, x3); LD_SP4_INC(py, 4, y0, y1, y2, y3); x0 *= c0; y0 *= c0; x1 *= c0; y1 *= c0; x2 *= c0; y2 *= c0; x3 *= c0; y3 *= c0; ST_SP4_INC(x0, x1, x2, x3, x, 4); ST_SP4_INC(y0, y1, y2, y3, y, 4); } if (n & 4) { LD_SP2_INC(px, 4, x0, x1); LD_SP2_INC(py, 4, y0, y1); x0 *= c0; y0 *= c0; x1 *= c0; y1 *= c0; ST_SP2_INC(x0, x1, x, 4); ST_SP2_INC(y0, y1, y, 4); } if (n & 2) { x0 = LD_SP(px); y0 = LD_SP(py); px += 4; py += 4; x0 *= c0; y0 *= c0; ST_SP(x0, x); ST_SP(y0, y); x += 4; y += 4; } if (n & 1) { LD_GP2_INC(px, 1, fx0, fx1); 
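/* s == 0: the plane rotation degenerates to scaling both vectors by c;
 * this block handles the single complex element left over after the
 * vectorized 16/8/4/2-element paths above.  For reference, the branches of
 * this routine specialize the scalar CSROT update sketched below for
 * particular values of c and s (illustrative only, assuming unit
 * increments; c and s are real and act on both real and imaginary parts):
 *
 *     for (i = 0; i < n; i++) {
 *         FLOAT xr = x[2 * i], xi = x[2 * i + 1];
 *         FLOAT yr = y[2 * i], yi = y[2 * i + 1];
 *         x[2 * i]     = c * xr + s * yr;
 *         x[2 * i + 1] = c * xi + s * yi;
 *         y[2 * i]     = c * yr - s * xr;
 *         y[2 * i + 1] = c * yi - s * xi;
 *     }
 */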
LD_GP2_INC(py, 1, fy0, fy1); tp0 = (c * fx0); tp1 = (c * fy0); tp2 = (c * fx1); tp3 = (c * fy1); ST_GP2_INC(tp0, tp2, x, 1); ST_GP2_INC(tp1, tp3, y, 1); } } else if (0 == c) { s0 = COPY_FLOAT_TO_VECTOR(s); /* process 16 floats */ if (n >> 4) { BLASLONG pref_offsetx, pref_offsety; pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); if (pref_offsetx > 0) { pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; pref_offsetx = pref_offsetx / sizeof(FLOAT); } pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); if (pref_offsety > 0) { pref_offsety = L1_DATA_LINESIZE - pref_offsety; pref_offsety = pref_offsety / sizeof(FLOAT); } LD_SP4_INC(px, 4, x0, x1, x2, x3); LD_SP4_INC(py, 4, y0, y1, y2, y3); for (j = (n >> 4) - 1; j--;) { PREFETCH(px + pref_offsetx + 32); PREFETCH(px + pref_offsetx + 40); PREFETCH(px + pref_offsetx + 48); PREFETCH(px + pref_offsetx + 56); PREFETCH(py + pref_offsety + 32); PREFETCH(py + pref_offsety + 40); PREFETCH(py + pref_offsety + 48); PREFETCH(py + pref_offsety + 56); x4 = LD_SP(px); px += 4; out0 = s0 * y0; x5 = LD_SP(px); px += 4; out2 = s0 * y1; x6 = LD_SP(px); px += 4; out4 = s0 * y2; x7 = LD_SP(px); px += 4; out6 = s0 * y3; y4 = LD_SP(py); py += 4; out1 = -(s0 * x0); y5 = LD_SP(py); py += 4; out3 = -(s0 * x1); y6 = LD_SP(py); py += 4; out5 = -(s0 * x2); y7 = LD_SP(py); py += 4; out7 = -(s0 * x3); ST_SP(out0, x); x += 4; out0 = s0 * y4; ST_SP(out2, x); x += 4; out2 = s0 * y5; ST_SP(out4, x); x += 4; out4 = s0 * y6; ST_SP(out6, x); x += 4; out6 = s0 * y7; ST_SP(out1, y); y += 4; out1 = -(s0 * x4); ST_SP(out3, y); y += 4; out3 = -(s0 * x5); ST_SP(out5, y); y += 4; out5 = -(s0 * x6); ST_SP(out7, y); y += 4; out7 = -(s0 * x7); x0 = LD_SP(px); px += 4; ST_SP(out0, x); x += 4; x1 = LD_SP(px); px += 4; ST_SP(out2, x); x += 4; x2 = LD_SP(px); px += 4; ST_SP(out4, x); x += 4; x3 = LD_SP(px); px += 4; ST_SP(out6, x); x += 4; y0 = LD_SP(py); py += 4; ST_SP(out1, y); y += 4; y1 = LD_SP(py); py += 4; ST_SP(out3, y); y += 4; y2 = LD_SP(py); py += 4; ST_SP(out5, y); y += 4; y3 = LD_SP(py); py += 4; ST_SP(out7, y); y += 4; } out0 = s0 * y0; out2 = s0 * y1; out4 = s0 * y2; out6 = s0 * y3; out1 = -(s0 * x0); out3 = -(s0 * x1); out5 = -(s0 * x2); out7 = -(s0 * x3); ST_SP4_INC(out0, out2, out4, out6, x, 4); ST_SP4_INC(out1, out3, out5, out7, y, 4); LD_SP4_INC(px, 4, x4, x5, x6, x7); LD_SP4_INC(py, 4, y4, y5, y6, y7); out0 = s0 * y4; out2 = s0 * y5; out4 = s0 * y6; out6 = s0 * y7; out1 = -(s0 * x4); out3 = -(s0 * x5); out5 = -(s0 * x6); out7 = -(s0 * x7); ST_SP4_INC(out0, out2, out4, out6, x, 4); ST_SP4_INC(out1, out3, out5, out7, y, 4); } if (n & 8) { LD_SP4_INC(px, 4, x0, x1, x2, x3); LD_SP4_INC(py, 4, y0, y1, y2, y3); out0 = s0 * y0; out1 = - (s0 * x0); out2 = s0 * y1; out3 = - (s0 * x1); out4 = s0 * y2; out5 = - (s0 * x2); out6 = s0 * y3; out7 = - (s0 * x3); ST_SP4_INC(out0, out2, out4, out6, x, 4); ST_SP4_INC(out1, out3, out5, out7, y, 4); } if (n & 4) { LD_SP2_INC(px, 4, x0, x1); LD_SP2_INC(py, 4, y0, y1); out0 = s0 * y0; out1 = - (s0 * x0); out2 = s0 * y1; out3 = - (s0 * x1); ST_SP2_INC(out0, out2, x, 4); ST_SP2_INC(out1, out3, y, 4); } if (n & 2) { x0 = LD_SP(px); px += 4; y0 = LD_SP(py); py += 4; out0 = s0 * y0; out1 = - (s0 * x0); ST_SP(out0, x); x += 4; ST_SP(out1, y); y += 4; } if (n & 1) { LD_GP2_INC(px, 1, fx0, fx1); LD_GP2_INC(py, 1, fy0, fy1); tp0 = s * fy0; tp1 = - (s * fx0); tp2 = s * fy1; tp3 = - (s * fx1); ST_GP2_INC(tp0, tp2, x, 1); ST_GP2_INC(tp1, tp3, y, 1); } } else { c0 = COPY_FLOAT_TO_VECTOR(c); s0 = COPY_FLOAT_TO_VECTOR(s); if (n >> 4) { BLASLONG pref_offsetx, 
pref_offsety; pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); if (pref_offsetx > 0) { pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; pref_offsetx = pref_offsetx / sizeof(FLOAT); } pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); if (pref_offsety > 0) { pref_offsety = L1_DATA_LINESIZE - pref_offsety; pref_offsety = pref_offsety / sizeof(FLOAT); } LD_SP4_INC(px, 4, x0, x1, x2, x3); LD_SP4_INC(py, 4, y0, y1, y2, y3); for (j = (n >> 4) - 1; j--;) { PREFETCH(px + pref_offsetx + 32); PREFETCH(px + pref_offsetx + 40); PREFETCH(px + pref_offsetx + 48); PREFETCH(px + pref_offsetx + 56); PREFETCH(py + pref_offsety + 32); PREFETCH(py + pref_offsety + 40); PREFETCH(py + pref_offsety + 48); PREFETCH(py + pref_offsety + 56); x4 = LD_SP(px); px += 4; out0 = c0 * x0; x5 = LD_SP(px); px += 4; out1 = c0 * y0; x6 = LD_SP(px); px += 4; out2 = c0 * x1; x7 = LD_SP(px); px += 4; out3 = c0 * y1; y4 = LD_SP(py); py += 4; out4 = c0 * x2; y5 = LD_SP(py); py += 4; out5 = c0 * y2; y6 = LD_SP(py); py += 4; out6 = c0 * x3; y7 = LD_SP(py); py += 4; out7 = c0 * y3; out0 += s0 * y0; out1 -= s0 * x0; out2 += s0 * y1; out3 -= s0 * x1; out4 += s0 * y2; out5 -= s0 * x2; out6 += s0 * y3; out7 -= s0 * x3; ST_SP(out0, x); x += 4; out8 = c0 * x4; ST_SP(out2, x); x += 4; out9 = c0 * y4; ST_SP(out4, x); x += 4; out10 = c0 * x5; ST_SP(out6, x); x += 4; out11 = c0 * y5; ST_SP(out1, y); y += 4; out12 = c0 * x6; ST_SP(out3, y); y += 4; out13 = c0 * y6; ST_SP(out5, y); y += 4; out14 = c0 * x7; ST_SP(out7, y); y += 4; out15 = c0 * y7; x0 = LD_SP(px); px += 4; out8 += s0 * y4; x1 = LD_SP(px); px += 4; out9 -= s0 * x4; x2 = LD_SP(px); px += 4; out10 += s0 * y5; x3 = LD_SP(px); px += 4; out11 -= s0 * x5; y0 = LD_SP(py); py += 4; out12 += s0 * y6; y1 = LD_SP(py); py += 4; out13 -= s0 * x6; y2 = LD_SP(py); py += 4; out14 += s0 * y7; y3 = LD_SP(py); py += 4; out15 -= s0 * x7; ST_SP(out8, x); x += 4; ST_SP(out10, x); x += 4; ST_SP(out12, x); x += 4; ST_SP(out14, x); x += 4; ST_SP(out9, y); y += 4; ST_SP(out11, y); y += 4; ST_SP(out13, y); y += 4; ST_SP(out15, y); y += 4; } out0 = c0 * x0; out0 += s0 * y0; out1 = c0 * y0; out1 -= s0 * x0; out2 = c0 * x1; out2 += s0 * y1; out3 = c0 * y1; out3 -= s0 * x1; out4 = c0 * x2; out4 += s0 * y2; out5 = c0 * y2; out5 -= s0 * x2; out6 = c0 * x3; out6 += s0 * y3; out7 = c0 * y3; out7 -= s0 * x3; ST_SP4_INC(out0, out2, out4, out6, x, 4); ST_SP4_INC(out1, out3, out5, out7, y, 4); LD_SP4_INC(px, 4, x4, x5, x6, x7); LD_SP4_INC(py, 4, y4, y5, y6, y7); out8 = c0 * x4; out8 += s0 * y4; out9 = c0 * y4; out9 -= s0 * x4; out10 = c0 * x5; out10 += s0 * y5; out11 = c0 * y5; out11 -= s0 * x5; out12 = c0 * x6; out12 += s0 * y6; out13 = c0 * y6; out13 -= s0 * x6; out14 = c0 * x7; out14 += s0 * y7; out15 = c0 * y7; out15 -= s0 * x7; ST_SP4_INC(out8, out10, out12, out14, x, 4); ST_SP4_INC(out9, out11, out13, out15, y, 4); } if (n & 8) { LD_SP4_INC(px, 4, x0, x1, x2, x3); LD_SP4_INC(py, 4, y0, y1, y2, y3); out0 = (c0 * x0) + (s0 * y0); out1 = (c0 * y0) - (s0 * x0); out2 = (c0 * x1) + (s0 * y1); out3 = (c0 * y1) - (s0 * x1); out4 = (c0 * x2) + (s0 * y2); out5 = (c0 * y2) - (s0 * x2); out6 = (c0 * x3) + (s0 * y3); out7 = (c0 * y3) - (s0 * x3); ST_SP4_INC(out0, out2, out4, out6, x, 4); ST_SP4_INC(out1, out3, out5, out7, y, 4); } if (n & 4) { LD_SP2_INC(px, 4, x0, x1); LD_SP2_INC(py, 4, y0, y1); out0 = (c0 * x0) + (s0 * y0); out1 = (c0 * y0) - (s0 * x0); out2 = (c0 * x1) + (s0 * y1); out3 = (c0 * y1) - (s0 * x1); ST_SP2_INC(out0, out2, x, 4); ST_SP2_INC(out1, out3, y, 4); } if (n & 2) { x0 = LD_SP(px); y0 = LD_SP(py); 
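/* Tail of the general c/s path: the two remaining complex elements (four
 * floats, one MSA vector per input) receive the same update as the main
 * loop,
 *     x' = c * x + s * y,    y' = c * y - s * x,
 * applied lane-wise.  The n >> 4 loop above additionally software-pipelines
 * the work, overlapping the loads for the next iteration with the
 * arithmetic and stores of the current one, and prefetches the upcoming
 * cache lines of both vectors.
 */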
px += 4; py += 4; out0 = (c0 * x0) + (s0 * y0); out1 = (c0 * y0) - (s0 * x0); ST_SP(out0, x); ST_SP(out1, y); x += 4; y += 4; } if (n & 1) { LD_GP2_INC(px, 1, fx0, fx1); LD_GP2_INC(py, 1, fy0, fy1); tp0 = (c * fx0) + (s * fy0); tp1 = (c * fy0) - (s * fx0); tp2 = (c * fx1) + (s * fy1); tp3 = (c * fy1) - (s * fx1); ST_GP2_INC(tp0, tp2, x, 1); ST_GP2_INC(tp1, tp3, y, 1); } } } else { inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; if ((0 == c) && (0 == s)) { for (i = n; i--;) { *x = 0; *(x + 1) = 0; *y = 0; *(y + 1) = 0; x += inc_x2; y += inc_y2; } } else if ((1 == c) && (1 == s)) { if (n >> 1) { fx0 = *px; fx1 = *(px+1); px += inc_x2; fx2 = *px; fx3 = *(px+1); px += inc_x2; fy0 = *py; fy1 = *(py+1); py += inc_y2; fy2 = *py; fy3 = *(py+1); py += inc_y2; for (i = (n >> 1) - 1; i--;) { tp0 = fx0 + fy0; tp1 = fx1 + fy1; tp2 = fy0 - fx0; tp3 = fy1 - fx1; tp4 = fx2 + fy2; tp5 = fx3 + fy3; tp6 = fy2 - fx2; tp7 = fy3 - fx3; fx0 = *px; *x = tp0; fx1 = *(px+1); px += inc_x2; *(x+1) = tp1; x += inc_x2; fx2 = *px; *x = tp4; fx3 = *(px+1); px += inc_x2; *(x+1) = tp5; x += inc_x2; fy0 = *py; *y = tp2; fy1 = *(py+1); py += inc_y2; *(y+1) = tp3; y += inc_y2; fy2 = *py; *y = tp6; fy3 = *(py+1); py += inc_y2; *(y+1) = tp7; y += inc_y2; } tp0 = fx0 + fy0; tp1 = fx1 + fy1; tp2 = fy0 - fx0; tp3 = fy1 - fx1; tp4 = fx2 + fy2; tp5 = fx3 + fy3; tp6 = fy2 - fx2; tp7 = fy3 - fx3; *x = tp0; *(x+1) = tp1; x += inc_x2; *x = tp4; *(x+1) = tp5; x += inc_x2; *y = tp2; *(y+1) = tp3; y += inc_y2; *y = tp6; *(y+1) = tp7; y += inc_y2; } if (n & 1) { fx0 = *px; fx1 = *(px+1); fy0 = *py; fy1 = *(py+1); tp0 = fx0 + fy0; tp1 = fx1 + fy1; tp2 = fy0 - fx0; tp3 = fy1 - fx1; *x = tp0; *(x+1) = tp1; *y = tp2; *(y+1) = tp3; } } else if (0 == s) { if (n >> 1) { fx0 = *px; fx1 = *(px+1); px += inc_x2; fx2 = *px; fx3 = *(px+1); px += inc_x2; fy0 = *py; fy1 = *(py+1); py += inc_y2; fy2 = *py; fy3 = *(py+1); py += inc_y2; for (i = (n >> 1) - 1; i--;) { tp0 = c * fx0; tp1 = c * fx1; tp2 = c * fx2; tp3 = c * fx3; tp4 = c * fy0; tp5 = c * fy1; tp6 = c * fy2; tp7 = c * fy3; fx0 = *px; *x = tp0; fx1 = *(px+1); px += inc_x2; *(x+1) = tp1; x += inc_x2; fx2 = *px; *x = tp2; fx3 = *(px+1); px += inc_x2; *(x+1) = tp3; x += inc_x2; fy0 = *py; *y = tp4; fy1 = *(py+1); py += inc_y2; *(y+1) = tp5; y += inc_y2; fy2 = *py; *y = tp6; fy3 = *(py+1); py += inc_y2; *(y+1) = tp7; y += inc_y2; } tp0 = c * fx0; tp1 = c * fx1; tp2 = c * fx2; tp3 = c * fx3; tp4 = c * fy0; tp5 = c * fy1; tp6 = c * fy2; tp7 = c * fy3; *x = tp0; *(x+1) = tp1; x += inc_x2; *x = tp2; *(x+1) = tp3; x += inc_x2; *y = tp4; *(y+1) = tp5; y += inc_y2; *y = tp6; *(y+1) = tp7; y += inc_y2; } if (n & 1) { fx0 = *px; fx1 = *(px+1); fy0 = *py; fy1 = *(py+1); tp0 = c * fx0; tp1 = c * fx1; tp2 = c * fy0; tp3 = c * fy1; *x = tp0; *(x+1) = tp1; *y = tp2; *(y+1) = tp3; } } else { if (n >> 1) { fx0 = *px; fx1 = *(px+1); px += inc_x2; fx2 = *px; fx3 = *(px+1); px += inc_x2; fy0 = *py; fy1 = *(py+1); py += inc_y2; fy2 = *py; fy3 = *(py+1); py += inc_y2; for (i = (n >> 1) - 1; i--;) { tp0 = c * fx0 + s * fy0; tp1 = c * fx1 + s * fy1; tp2 = c * fy0 - s * fx0; tp3 = c * fy1 - s * fx1; tp4 = c * fx2 + s * fy2; tp5 = c * fx3 + s * fy3; tp6 = c * fy2 - s * fx2; tp7 = c * fy3 - s * fx3; fx0 = *px; *x = tp0; fx1 = *(px+1); px += inc_x2; *(x+1) = tp1; x += inc_x2; fx2 = *px; *x = tp4; fx3 = *(px+1); px += inc_x2; *(x+1) = tp5; x += inc_x2; fy0 = *py; *y = tp2; fy1 = *(py+1); py += inc_y2; *(y+1) = tp3; y += inc_y2; fy2 = *py; *y = tp6; fy3 = *(py+1); py += inc_y2; *(y+1) = tp7; y += inc_y2; } tp0 = c * fx0 + s * fy0; tp1 = 
c * fx1 + s * fy1; tp2 = c * fy0 - s * fx0; tp3 = c * fy1 - s * fx1; tp4 = c * fx2 + s * fy2; tp5 = c * fx3 + s * fy3; tp6 = c * fy2 - s * fx2; tp7 = c * fy3 - s * fx3; *x = tp0; *(x+1) = tp1; x += inc_x2; *x = tp4; *(x+1) = tp5; x += inc_x2; *y = tp2; *(y+1) = tp3; y += inc_y2; *y = tp6; *(y+1) = tp7; y += inc_y2; } if (n & 1) { fx0 = *px; fx1 = *(px+1); fy0 = *py; fy1 = *(py+1); tp0 = c * fx0 + s * fy0; tp1 = c * fx1 + s * fy1; tp2 = c * fy0 - s * fx0; tp3 = c * fy1 - s * fx1; *x = tp0; *(x+1) = tp1; *y = tp2; *(y+1) = tp3; } } } return 0; } OpenBLAS-0.2.20/kernel/mips/cscal_msa.c000066400000000000000000001113571313527062700174750ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #include "common.h" #include "macros_msa.h" /* This will shuffle the elements in 'in' vector as (mask needed :: 10 11 00 01) 0 1 2 3 => 1 0 3 2 */ #define SHF_177 177 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i, inc_x2; FLOAT *px; FLOAT tp0, tp1, tp2, tp3, f0, f1, f2, f3; v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; v4f32 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15; v4f32 da_i_vec, da_i_vec_neg, da_r_vec; px = x; if (1 == inc_x) { if ((0.0 == da_r) && (0.0 == da_i)) { v4f32 zero_v = __msa_cast_to_vector_float(0); zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); for (i = (n >> 5); i--;) { ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, x, 4); ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, x, 4); } if (n & 31) { if (n & 16) { ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, x, 4); } if (n & 8) { ST_SP4_INC(zero_v, zero_v, zero_v, zero_v, x, 4); } if (n & 4) { ST_SP2_INC(zero_v, zero_v, x, 4); } if (n & 2) { ST_SP(zero_v, x); x += 4; } if (n & 1) { *x = 0; x += 1; *x = 0; } } } else if (0.0 == da_r) { da_i_vec = COPY_FLOAT_TO_VECTOR(da_i); da_i_vec_neg = -da_i_vec; da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec); if (n > 31) { FLOAT *x_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 64 + 32; LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7); for (i = (n >> 5)- 1; i--;) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(x_pref, 128); PREF_OFFSET(x_pref, 160); PREF_OFFSET(x_pref, 192); PREF_OFFSET(x_pref, 224); x_pref += 64; x8 = LD_SP(px); px += 4; x0 *= da_i_vec; x9 = LD_SP(px); px += 4; x1 *= da_i_vec; x10 = LD_SP(px); px += 4; x2 *= da_i_vec; x11 = LD_SP(px); px += 4; x3 *= da_i_vec; x12 = LD_SP(px); px += 4; x4 *= da_i_vec; x13 = LD_SP(px); px += 4; x5 *= da_i_vec; x0 = (v4f32) __msa_shf_w((v4i32) x0, SHF_177); x14 = LD_SP(px); px += 4; x6 *= da_i_vec; x1 = (v4f32) __msa_shf_w((v4i32) x1, SHF_177); x15 = LD_SP(px); px += 4; x7 *= da_i_vec; x2 = (v4f32) __msa_shf_w((v4i32) x2, SHF_177); x8 *= da_i_vec; x3 = (v4f32) __msa_shf_w((v4i32) x3, SHF_177); ST_SP(x0, x); x += 4; x9 *= da_i_vec; x4 = (v4f32) __msa_shf_w((v4i32) x4, SHF_177); ST_SP(x1, x); x += 4; x10 *= da_i_vec; x5 = (v4f32) __msa_shf_w((v4i32) x5, SHF_177); ST_SP(x2, x); x += 4; x11 *= da_i_vec; x6 = (v4f32) __msa_shf_w((v4i32) x6, SHF_177); ST_SP(x3, x); x += 4; x12 *= da_i_vec; x7 = (v4f32) __msa_shf_w((v4i32) x7, SHF_177); ST_SP(x4, x); x += 4; x13 *= da_i_vec; x8 = (v4f32) __msa_shf_w((v4i32) x8, SHF_177); ST_SP(x5, x); x += 4; x14 *= da_i_vec; x9 = (v4f32) __msa_shf_w((v4i32) x9, SHF_177); ST_SP(x6, x); x += 4; x15 *= da_i_vec; x10 = (v4f32) __msa_shf_w((v4i32) x10, SHF_177); ST_SP(x7, x); x += 4; x11 = (v4f32) __msa_shf_w((v4i32) x11, SHF_177); ST_SP(x8, x); x += 4; x0 = LD_SP(px); px += 4; x12 = (v4f32) __msa_shf_w((v4i32) x12, SHF_177); ST_SP(x9, x); 
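/* Added commentary (not in the original source): SHF_177 is the __msa_shf_w control byte 0b10110001, so result lanes (0,1,2,3) are taken from source lanes (1,0,3,2), i.e. the real and imaginary parts of each packed complex element are swapped. In this da_r == 0 path the vector was first multiplied by {da_i, -da_i, da_i, -da_i}, so the swap turns {da_i*re, -da_i*im, ...} into {-da_i*im, da_i*re, ...}, which is exactly (i*da_i) * (re + i*im). */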
x += 4; x1 = LD_SP(px); px += 4; x13 = (v4f32) __msa_shf_w((v4i32) x13, SHF_177); ST_SP(x10, x); x += 4; x2 = LD_SP(px); px += 4; x14 = (v4f32) __msa_shf_w((v4i32) x14, SHF_177); ST_SP(x11, x); x += 4; x3 = LD_SP(px); px += 4; x15 = (v4f32) __msa_shf_w((v4i32) x15, SHF_177); ST_SP(x12, x); x += 4; x4 = LD_SP(px); px += 4; ST_SP(x13, x); x += 4; x5 = LD_SP(px); px += 4; ST_SP(x14, x); x += 4; x6 = LD_SP(px); px += 4; ST_SP(x15, x); x += 4; x7 = LD_SP(px); px += 4; } LD_SP8_INC(px, 4, x8, x9, x10, x11, x12, x13, x14, x15); MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, x0, x1, x2, x3); MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, x4, x5, x6, x7); MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec, x8, x9, x10, x11); MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec, x12, x13, x14, x15); SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177); SHF_W4_SP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_177); SHF_W4_SP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_177); SHF_W4_SP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_177); ST_SP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x, 4); } if (n & 31) { if (n & 16) { LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7); MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, x0, x1, x2, x3); MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, x4, x5, x6, x7); SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177); SHF_W4_SP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_177); ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4); } if (n & 8) { LD_SP4_INC(px, 4, x0, x1, x2, x3); MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, x0, x1, x2, x3); SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177); ST_SP4_INC(x0, x1, x2, x3, x, 4); } if (n & 4) { LD_SP2_INC(px, 4, x0, x1); MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1); SHF_W2_SP(x0, x1, x0, x1, SHF_177); ST_SP2_INC(x0, x1, x, 4); } if (n & 2) { LD_GP4_INC(px, 1, f0, f1, f2, f3); MUL4(f0, da_i, f1, -da_i, f2, da_i, f3, -da_i, f0, f1, f2, f3); ST_GP4_INC(f1, f0, f3, f2, x, 1); } if (n & 1) { LD_GP2_INC(px, 1, f0, f1); MUL2(f0, da_i, f1, -da_i, f0, f1); ST_GP2_INC(f1, f0, x, 1); } } } else if (0.0 == da_i) { da_r_vec = COPY_FLOAT_TO_VECTOR(da_r); if (n > 31) { FLOAT *x_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 64 + 32; LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7); for (i = (n >> 5)- 1; i--;) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(x_pref, 128); PREF_OFFSET(x_pref, 160); PREF_OFFSET(x_pref, 192); PREF_OFFSET(x_pref, 224); x_pref += 64; x8 = LD_SP(px); px += 4; x0 *= da_r_vec; x9 = LD_SP(px); px += 4; x1 *= da_r_vec; x10 = LD_SP(px); px += 4; x2 *= da_r_vec; x11 = LD_SP(px); px += 4; x3 *= da_r_vec; x12 = LD_SP(px); px += 4; x4 *= da_r_vec; x13 = LD_SP(px); px += 4; x5 *= da_r_vec; ST_SP(x0, x); x += 4; x14 = LD_SP(px); px += 4; x6 *= da_r_vec; ST_SP(x1, x); x += 4; x15 = LD_SP(px); px += 4; x7 *= da_r_vec; ST_SP(x2, x); x += 4; x8 *= da_r_vec; ST_SP(x3, x); x += 4; x9 *= da_r_vec; ST_SP(x4, x); x += 4; x10 *= da_r_vec; ST_SP(x5, x); x += 4; x11 *= da_r_vec; ST_SP(x6, x); x += 4; x12 *= da_r_vec; ST_SP(x7, x); x += 4; x13 *= da_r_vec; ST_SP(x8, x); x += 4; x0 = LD_SP(px); px += 4; x14 *= da_r_vec; ST_SP(x9, x); x += 4; x1 = LD_SP(px); px += 4; x15 *= da_r_vec; ST_SP(x10, x); x += 4; x2 = 
LD_SP(px); px += 4; ST_SP(x11, x); x += 4; x3 = LD_SP(px); px += 4; ST_SP(x12, x); x += 4; x4 = LD_SP(px); px += 4; ST_SP(x13, x); x += 4; x5 = LD_SP(px); px += 4; ST_SP(x14, x); x += 4; x6 = LD_SP(px); px += 4; ST_SP(x15, x); x += 4; x7 = LD_SP(px); px += 4; } LD_SP8_INC(px, 4, x8, x9, x10, x11, x12, x13, x14, x15); MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, x0, x1, x2, x3); MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec, x4, x5, x6, x7); MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec, x8, x9, x10, x11); MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec, x12, x13, x14, x15); ST_SP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x, 4); } if (n & 31) { if (n & 16) { LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7); MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, x0, x1, x2, x3); MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec, x4, x5, x6, x7); ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4); } if (n & 8) { LD_SP4_INC(px, 4, x0, x1, x2, x3); MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, x0, x1, x2, x3); ST_SP4_INC(x0, x1, x2, x3, x, 4); } if (n & 4) { LD_SP2_INC(px, 4, x0, x1); MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1); ST_SP2_INC(x0, x1, x, 4); } if (n & 2) { LD_GP4_INC(px, 1, f0, f1, f2, f3); MUL4(f0, da_r, f1, da_r, f2, da_r, f3, da_r, f0, f1, f2, f3); ST_GP4_INC(f0, f1, f2, f3, x, 1); } if (n & 1) { LD_GP2_INC(px, 1, f0, f1); MUL2(f0, da_r, f1, da_r, f0, f1); ST_GP2_INC(f0, f1, x, 1); } } } else { FLOAT *x_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 64; da_i_vec = COPY_FLOAT_TO_VECTOR(da_i); da_i_vec_neg = -da_i_vec; da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec); da_r_vec = COPY_FLOAT_TO_VECTOR(da_r); for (i = (n >> 5); i--;) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(x_pref, 128); PREF_OFFSET(x_pref, 160); PREF_OFFSET(x_pref, 192); PREF_OFFSET(x_pref, 224); x_pref += 64; LD_SP16_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15); MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, d0, d1, d2, d3); MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, d4, d5, d6, d7); MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec, d8, d9, d10, d11); MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec, d12, d13, d14, d15); SHF_W4_SP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_177); SHF_W4_SP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_177); SHF_W4_SP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_177); SHF_W4_SP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_177); FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7); FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11); FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15); ST_SP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15, x, 4); } if (n & 31) { if (n & 16) { LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7); MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, d0, d1, d2, d3); MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, d4, d5, d6, d7); SHF_W4_SP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_177); SHF_W4_SP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_177); FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); FMADD4(x4, x5, x6, 
x7, da_r_vec, d4, d5, d6, d7); ST_SP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, 4); } if (n & 8) { LD_SP4_INC(px, 4, x0, x1, x2, x3); MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, d0, d1, d2, d3); SHF_W4_SP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_177); FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); ST_SP4_INC(d0, d1, d2, d3, x, 4); } if (n & 4) { LD_SP2_INC(px, 4, x0, x1); MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1); SHF_W2_SP(d0, d1, d0, d1, SHF_177); FMADD2(x0, x1, da_r_vec, d0, d1); ST_SP2_INC(d0, d1, x, 4); } if (n & 2) { LD_GP4_INC(px, 1, f0, f1, f2, f3); tp0 = da_r * f0; tp0 -= da_i * f1; tp1 = da_r * f1; tp1 += da_i * f0; tp2 = da_r * f2; tp2 -= da_i * f3; tp3 = da_r * f3; tp3 += da_i * f2; ST_GP4_INC(tp0, tp1, tp2, tp3, x, 1); } if (n & 1) { LD_GP2_INC(px, 1, f0, f1); tp0 = da_r * f0; tp0 -= da_i * f1; tp1 = da_r * f1; tp1 += da_i * f0; ST_GP2_INC(tp0, tp1, x, 1); } } } } else { inc_x2 = 2 * inc_x; if ((0.0 == da_r) && (0.0 == da_i)) { for (i = n; i--;) { *x = 0; *(x + 1) = 0; x += inc_x2; } } else if (0.0 == da_r) { da_i_vec = COPY_FLOAT_TO_VECTOR(da_i); da_i_vec_neg = -da_i_vec; da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec); for (i = (n >> 4); i--;) { LD_SP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15); PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3); PCKEV_D4_SP(x9, x8, x11, x10, x13, x12, x15, x14, d4, d5, d6, d7); MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec, d0, d1, d2, d3); MUL4(d4, da_i_vec, d5, da_i_vec, d6, da_i_vec, d7, da_i_vec, d4, d5, d6, d7); *x = d0[1]; *(x + 1) = d0[0]; x += inc_x2; *x = d0[3]; *(x + 1) = d0[2]; x += inc_x2; *x = d1[1]; *(x + 1) = d1[0]; x += inc_x2; *x = d1[3]; *(x + 1) = d1[2]; x += inc_x2; *x = d2[1]; *(x + 1) = d2[0]; x += inc_x2; *x = d2[3]; *(x + 1) = d2[2]; x += inc_x2; *x = d3[1]; *(x + 1) = d3[0]; x += inc_x2; *x = d3[3]; *(x + 1) = d3[2]; x += inc_x2; *x = d4[1]; *(x + 1) = d4[0]; x += inc_x2; *x = d4[3]; *(x + 1) = d4[2]; x += inc_x2; *x = d5[1]; *(x + 1) = d5[0]; x += inc_x2; *x = d5[3]; *(x + 1) = d5[2]; x += inc_x2; *x = d6[1]; *(x + 1) = d6[0]; x += inc_x2; *x = d6[3]; *(x + 1) = d6[2]; x += inc_x2; *x = d7[1]; *(x + 1) = d7[0]; x += inc_x2; *x = d7[3]; *(x + 1) = d7[2]; x += inc_x2; } if (n & 15) { if (n & 8) { LD_SP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3); MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec, d0, d1, d2, d3); *x = d0[1]; *(x + 1) = d0[0]; x += inc_x2; *x = d0[3]; *(x + 1) = d0[2]; x += inc_x2; *x = d1[1]; *(x + 1) = d1[0]; x += inc_x2; *x = d1[3]; *(x + 1) = d1[2]; x += inc_x2; *x = d2[1]; *(x + 1) = d2[0]; x += inc_x2; *x = d2[3]; *(x + 1) = d2[2]; x += inc_x2; *x = d3[1]; *(x + 1) = d3[0]; x += inc_x2; *x = d3[3]; *(x + 1) = d3[2]; x += inc_x2; } if (n & 4) { LD_SP4_INC(px, inc_x2, x0, x1, x2, x3); PCKEV_D2_SP(x1, x0, x3, x2, d0, d1); MUL2(d0, da_i_vec, d1, da_i_vec, d0, d1); *x = d0[1]; *(x + 1) = d0[0]; x += inc_x2; *x = d0[3]; *(x + 1) = d0[2]; x += inc_x2; *x = d1[1]; *(x + 1) = d1[0]; x += inc_x2; *x = d1[3]; *(x + 1) = d1[2]; x += inc_x2; } if (n & 2) { f0 = *px; f1 = *(px + 1); px += inc_x2; f2 = *px; f3 = *(px + 1); px += inc_x2; MUL4(f0, da_i, f1, -da_i, f2, da_i, f3, -da_i, f0, f1, f2, f3); *x = f1; *(x + 1) = f0; x += inc_x2; *x = f3; *(x + 1) = f2; x += inc_x2; } if (n & 1) { f0 = *x; f1 = *(x + 1); MUL2(f0, da_i, f1, -da_i, f0, f1); *x = f1; *(x + 1) = f0; } } } else if (0.0 == da_i) { da_r_vec = 
COPY_FLOAT_TO_VECTOR(da_r); for (i = (n >> 4); i--;) { LD_SP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15); PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3); PCKEV_D4_SP(x9, x8, x11, x10, x13, x12, x15, x14, d4, d5, d6, d7); MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec, d0, d1, d2, d3); MUL4(d4, da_r_vec, d5, da_r_vec, d6, da_r_vec, d7, da_r_vec, d4, d5, d6, d7); *x = d0[0]; *(x + 1) = d0[1]; x += inc_x2; *x = d0[2]; *(x + 1) = d0[3]; x += inc_x2; *x = d1[0]; *(x + 1) = d1[1]; x += inc_x2; *x = d1[2]; *(x + 1) = d1[3]; x += inc_x2; *x = d2[0]; *(x + 1) = d2[1]; x += inc_x2; *x = d2[2]; *(x + 1) = d2[3]; x += inc_x2; *x = d3[0]; *(x + 1) = d3[1]; x += inc_x2; *x = d3[2]; *(x + 1) = d3[3]; x += inc_x2; *x = d4[0]; *(x + 1) = d4[1]; x += inc_x2; *x = d4[2]; *(x + 1) = d4[3]; x += inc_x2; *x = d5[0]; *(x + 1) = d5[1]; x += inc_x2; *x = d5[2]; *(x + 1) = d5[3]; x += inc_x2; *x = d6[0]; *(x + 1) = d6[1]; x += inc_x2; *x = d6[2]; *(x + 1) = d6[3]; x += inc_x2; *x = d7[0]; *(x + 1) = d7[1]; x += inc_x2; *x = d7[2]; *(x + 1) = d7[3]; x += inc_x2; } if (n & 15) { if (n & 8) { LD_SP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3); MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec, d0, d1, d2, d3); *x = d0[0]; *(x + 1) = d0[1]; x += inc_x2; *x = d0[2]; *(x + 1) = d0[3]; x += inc_x2; *x = d1[0]; *(x + 1) = d1[1]; x += inc_x2; *x = d1[2]; *(x + 1) = d1[3]; x += inc_x2; *x = d2[0]; *(x + 1) = d2[1]; x += inc_x2; *x = d2[2]; *(x + 1) = d2[3]; x += inc_x2; *x = d3[0]; *(x + 1) = d3[1]; x += inc_x2; *x = d3[2]; *(x + 1) = d3[3]; x += inc_x2; } if (n & 4) { LD_SP4_INC(px, inc_x2, x0, x1, x2, x3); PCKEV_D2_SP(x1, x0, x3, x2, d0, d1); MUL2(d0, da_r_vec, d1, da_r_vec, d0, d1); *x = d0[0]; *(x + 1) = d0[1]; x += inc_x2; *x = d0[2]; *(x + 1) = d0[3]; x += inc_x2; *x = d1[0]; *(x + 1) = d1[1]; x += inc_x2; *x = d1[2]; *(x + 1) = d1[3]; x += inc_x2; } if (n & 2) { f0 = *px; f1 = *(px + 1); px += inc_x2; f2 = *px; f3 = *(px + 1); px += inc_x2; MUL4(f0, da_r, f1, da_r, f2, da_r, f3, da_r, f0, f1, f2, f3); *x = f0; *(x + 1) = f1; x += inc_x2; *x = f2; *(x + 1) = f3; x += inc_x2; } if (n & 1) { f0 = *x; f1 = *(x + 1); MUL2(f0, da_r, f1, da_r, f0, f1); *x = f0; *(x + 1) = f1; } } } else { da_i_vec = COPY_FLOAT_TO_VECTOR(da_i); da_i_vec_neg = -da_i_vec; da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec); da_r_vec = COPY_FLOAT_TO_VECTOR(da_r); for (i = (n >> 4); i--;) { LD_SP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15); PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3); PCKEV_D4_SP(x9, x8, x11, x10, x13, x12, x15, x14, d4, d5, d6, d7); MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec, x0, x1, x2, x3); MUL4(d4, da_i_vec, d5, da_i_vec, d6, da_i_vec, d7, da_i_vec, x4, x5, x6, x7); MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec, d0, d1, d2, d3); MUL4(d4, da_r_vec, d5, da_r_vec, d6, da_r_vec, d7, da_r_vec, d4, d5, d6, d7); SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177); SHF_W4_SP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_177); ADD4(d0, x0, d1, x1, d2, x2, d3, x3, d0, d1, d2, d3); ADD4(d4, x4, d5, x5, d6, x6, d7, x7, d4, d5, d6, d7); *x = d0[0]; *(x + 1) = d0[1]; x += inc_x2; *x = d0[2]; *(x + 1) = d0[3]; x += inc_x2; *x = d1[0]; *(x + 1) = d1[1]; x += inc_x2; *x = d1[2]; *(x + 1) = d1[3]; x += inc_x2; *x = d2[0]; *(x + 1) = d2[1]; x += inc_x2; *x = d2[2]; *(x + 1) = d2[3]; x += inc_x2; *x = 
d3[0]; *(x + 1) = d3[1]; x += inc_x2; *x = d3[2]; *(x + 1) = d3[3]; x += inc_x2; *x = d4[0]; *(x + 1) = d4[1]; x += inc_x2; *x = d4[2]; *(x + 1) = d4[3]; x += inc_x2; *x = d5[0]; *(x + 1) = d5[1]; x += inc_x2; *x = d5[2]; *(x + 1) = d5[3]; x += inc_x2; *x = d6[0]; *(x + 1) = d6[1]; x += inc_x2; *x = d6[2]; *(x + 1) = d6[3]; x += inc_x2; *x = d7[0]; *(x + 1) = d7[1]; x += inc_x2; *x = d7[2]; *(x + 1) = d7[3]; x += inc_x2; } if (n & 15) { if (n & 8) { LD_SP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3); MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec, x0, x1, x2, x3); MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec, d0, d1, d2, d3); SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177); ADD4(d0, x0, d1, x1, d2, x2, d3, x3, d0, d1, d2, d3); *x = d0[0]; *(x + 1) = d0[1]; x += inc_x2; *x = d0[2]; *(x + 1) = d0[3]; x += inc_x2; *x = d1[0]; *(x + 1) = d1[1]; x += inc_x2; *x = d1[2]; *(x + 1) = d1[3]; x += inc_x2; *x = d2[0]; *(x + 1) = d2[1]; x += inc_x2; *x = d2[2]; *(x + 1) = d2[3]; x += inc_x2; *x = d3[0]; *(x + 1) = d3[1]; x += inc_x2; *x = d3[2]; *(x + 1) = d3[3]; x += inc_x2; } if (n & 4) { LD_SP4_INC(px, inc_x2, x0, x1, x2, x3); PCKEV_D2_SP(x1, x0, x3, x2, d0, d1); MUL2(d0, da_i_vec, d1, da_i_vec, x0, x1); MUL2(d0, da_r_vec, d1, da_r_vec, d0, d1); SHF_W2_SP(x0, x1, x0, x1, SHF_177); ADD2(d0, x0, d1, x1, d0, d1); *x = d0[0]; *(x + 1) = d0[1]; x += inc_x2; *x = d0[2]; *(x + 1) = d0[3]; x += inc_x2; *x = d1[0]; *(x + 1) = d1[1]; x += inc_x2; *x = d1[2]; *(x + 1) = d1[3]; x += inc_x2; } if (n & 2) { f0 = *px;; f1 = *(px + 1); px += inc_x2; f2 = *px; f3 = *(px + 1); px += inc_x2; tp0 = da_r * f0; tp0 -= da_i * f1; tp1 = da_r * f1; tp1 += da_i * f0; tp2 = da_r * f2; tp2 -= da_i * f3; tp3 = da_r * f3; tp3 += da_i * f2; *x = tp0; *(x + 1) = tp1; x += inc_x2; *x = tp2; *(x + 1) = tp3; x += inc_x2; } if (n & 1) { f0 = *px; px += 1; f1 = *px; tp0 = da_r * f0; tp0 -= da_i * f1; tp1 = da_r * f1; tp1 += da_i * f0; *x = tp0; x += 1; *x = tp1; } } } } return (0); } OpenBLAS-0.2.20/kernel/mips/cswap_msa.c000066400000000000000000000231111313527062700175130ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0, pref_offsetx, pref_offsety; FLOAT *px, *py; BLASLONG inc_x2, inc_y2; FLOAT x0, x1, x2, x3, x4, x5, x6, x7; FLOAT y0, y1, y2, y3, y4, y5, y6, y7; v4f32 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7; v4f32 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7; if (n < 0) return (0); pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1); if (pref_offsetx > 0) { pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; pref_offsetx = pref_offsetx / sizeof(FLOAT); } pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1); if (pref_offsety > 0) { pref_offsety = L1_DATA_LINESIZE - pref_offsety; pref_offsety = pref_offsety / sizeof(FLOAT); } px = srcx; py = srcy; inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; if ((1 == inc_x) && (1 == inc_y)) { if (n >> 4) { LD_SP8_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7); for (i = (n >> 4) - 1; i--;) { PREFETCH(px + pref_offsetx + 32); PREFETCH(px + pref_offsetx + 40); PREFETCH(px + pref_offsetx + 48); PREFETCH(px + pref_offsetx + 56); PREFETCH(py + pref_offsety + 32); PREFETCH(py + pref_offsety + 40); PREFETCH(py + pref_offsety + 48); PREFETCH(py + pref_offsety + 56); yv0 = LD_SP(py); py += 4; ST_SP(xv0, srcy); srcy += 4; yv1 = LD_SP(py); py += 4; ST_SP(xv1, srcy); srcy += 4; yv2 = LD_SP(py); py += 4; ST_SP(xv2, srcy); srcy += 4; yv3 = LD_SP(py); py += 4; ST_SP(xv3, srcy); srcy += 4; yv4 = LD_SP(py); py += 4; ST_SP(xv4, srcy); srcy += 4; yv5 = LD_SP(py); py += 4; ST_SP(xv5, srcy); srcy += 4; yv6 = LD_SP(py); py += 4; ST_SP(xv6, srcy); srcy += 4; yv7 = LD_SP(py); py += 4; ST_SP(xv7, srcy); srcy += 4; xv0 = LD_SP(px); px += 4; ST_SP(yv0, srcx); srcx += 4; xv1 = LD_SP(px); px += 4; ST_SP(yv1, srcx); srcx += 4; xv2 = LD_SP(px); px += 4; ST_SP(yv2, srcx); srcx += 4; xv3 = LD_SP(px); px += 4; ST_SP(yv3, srcx); srcx += 4; xv4 = LD_SP(px); px += 4; ST_SP(yv4, srcx); srcx += 4; xv5 = LD_SP(px); px += 4; ST_SP(yv5, srcx); srcx += 4; xv6 = LD_SP(px); px += 4; ST_SP(yv6, srcx); srcx += 4; xv7 = LD_SP(px); px += 4; ST_SP(yv7, srcx); srcx += 4; } LD_SP8_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7); ST_SP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 4); ST_SP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 4); } if (n & 15) { if ((n & 8) && (n & 4) && (n & 2)) { LD_SP7_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6); LD_SP7_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6); ST_SP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 4); ST_SP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 4); } else if ((n & 8) && (n & 4)) { LD_SP6_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5); LD_SP6_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5); ST_SP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 4); ST_SP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 4); } else if ((n & 8) && (n & 2)) { LD_SP5_INC(px, 4, xv0, xv1, xv2, xv3, 
xv4); LD_SP5_INC(py, 4, yv0, yv1, yv2, yv3, yv4); ST_SP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 4); ST_SP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 4); } else if ((n & 4) && (n & 2)) { LD_SP3_INC(px, 4, xv0, xv1, xv2); LD_SP3_INC(py, 4, yv0, yv1, yv2); ST_SP3_INC(xv0, xv1, xv2, srcy, 4); ST_SP3_INC(yv0, yv1, yv2, srcx, 4); } else if (n & 8) { LD_SP4_INC(px, 4, xv0, xv1, xv2, xv3); LD_SP4_INC(py, 4, yv0, yv1, yv2, yv3); ST_SP4_INC(xv0, xv1, xv2, xv3, srcy, 4); ST_SP4_INC(yv0, yv1, yv2, yv3, srcx, 4); } else if (n & 4) { LD_SP2_INC(px, 4, xv0, xv1); LD_SP2_INC(py, 4, yv0, yv1); ST_SP2_INC(xv0, xv1, srcy, 4); ST_SP2_INC(yv0, yv1, srcx, 4); } else if (n & 2) { xv0 = LD_SP(px); yv0 = LD_SP(py); px += 4; py += 4; ST_SP(xv0, srcy); ST_SP(yv0, srcx); srcx += 4; srcy += 4; } if (n & 1) { LD_GP2_INC(px, 1, x0, x1); LD_GP2_INC(py, 1, y0, y1); ST_GP2_INC(x0, x1, srcy, 1); ST_GP2_INC(y0, y1, srcx, 1); } } } else { for (i = (n >> 2); i--;) { x0 = srcx[0 * inc_x2]; x1 = srcx[0 * inc_x2 + 1]; x2 = srcx[1 * inc_x2]; x3 = srcx[1 * inc_x2 + 1]; x4 = srcx[2 * inc_x2]; x5 = srcx[2 * inc_x2 + 1]; x6 = srcx[3 * inc_x2]; x7 = srcx[3 * inc_x2 + 1]; y0 = srcy[0 * inc_y2]; y1 = srcy[0 * inc_y2 + 1]; y2 = srcy[1 * inc_y2]; y3 = srcy[1 * inc_y2 + 1]; y4 = srcy[2 * inc_y2]; y5 = srcy[2 * inc_y2 + 1]; y6 = srcy[3 * inc_y2]; y7 = srcy[3 * inc_y2 + 1]; srcx[0 * inc_x2] = y0; srcx[0 * inc_x2 + 1] = y1; srcx[1 * inc_x2] = y2; srcx[1 * inc_x2 + 1] = y3; srcx[2 * inc_x2] = y4; srcx[2 * inc_x2 + 1] = y5; srcx[3 * inc_x2] = y6; srcx[3 * inc_x2 + 1] = y7; srcy[0 * inc_y2] = x0; srcy[0 * inc_y2 + 1] = x1; srcy[1 * inc_y2] = x2; srcy[1 * inc_y2 + 1] = x3; srcy[2 * inc_y2] = x4; srcy[2 * inc_y2 + 1] = x5; srcy[3 * inc_y2] = x6; srcy[3 * inc_y2 + 1] = x7; srcx += 4 * inc_x2; srcy += 4 * inc_y2; } if (n & 2) { x0 = srcx[0 * inc_x2]; x1 = srcx[0 * inc_x2 + 1]; x2 = srcx[1 * inc_x2]; x3 = srcx[1 * inc_x2 + 1]; y0 = srcy[0 * inc_y2]; y1 = srcy[0 * inc_y2 + 1]; y2 = srcy[1 * inc_y2]; y3 = srcy[1 * inc_y2 + 1]; srcx[0 * inc_x2] = y0; srcx[0 * inc_x2 + 1] = y1; srcx[1 * inc_x2] = y2; srcx[1 * inc_x2 + 1] = y3; srcy[0 * inc_y2] = x0; srcy[0 * inc_y2 + 1] = x1; srcy[1 * inc_y2] = x2; srcy[1 * inc_y2 + 1] = x3; srcx += 2 * inc_x2; srcy += 2 * inc_y2; } if (n & 1) { x0 = srcx[0 * inc_x2]; x1 = srcx[0 * inc_x2 + 1]; y0 = srcy[0 * inc_y2]; y1 = srcy[0 * inc_y2 + 1]; srcx[0 * inc_x2] = y0; srcx[0 * inc_x2 + 1] = y1; srcy[0 * inc_y2] = x0; srcy[0 * inc_y2 + 1] = x1; srcx += inc_x2; srcy += inc_y2; } } return (0); } OpenBLAS-0.2.20/kernel/mips/dasum_msa.c000066400000000000000000000220571313527062700175170ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include <math.h> #include "macros_msa.h" #define AND_VEC_D(in) ((v2f64) ((v2i64) in & and_vec)) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i; FLOAT sumf = 0.0; v2f64 src0, src1, src2, src3, src4, src5, src6, src7; v2f64 src8, src9, src10, src11, src12, src13, src14, src15; v2f64 sum_abs0 = {0, 0}; v2f64 sum_abs1 = {0, 0}; v2f64 sum_abs2 = {0, 0}; v2f64 sum_abs3 = {0, 0}; v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF}; if (n <= 0 || inc_x <= 0) return (sumf); if (1 == inc_x) { if (n > 31) { FLOAT *x_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 64 + 16; LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); for (i = (n >> 5) - 1; i--;) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(x_pref, 128); PREF_OFFSET(x_pref, 160); PREF_OFFSET(x_pref, 192); PREF_OFFSET(x_pref, 224); x_pref += 32; LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); sum_abs2 += AND_VEC_D(src2); sum_abs3 += AND_VEC_D(src3); sum_abs0 += AND_VEC_D(src4); sum_abs1 += AND_VEC_D(src5); sum_abs2 += AND_VEC_D(src6); sum_abs3 += AND_VEC_D(src7); LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); sum_abs0 += AND_VEC_D(src8); sum_abs1 += AND_VEC_D(src9); sum_abs2 += AND_VEC_D(src10); sum_abs3 += AND_VEC_D(src11); sum_abs0 += AND_VEC_D(src12); sum_abs1 += AND_VEC_D(src13); sum_abs2 += AND_VEC_D(src14); sum_abs3 += AND_VEC_D(src15); } LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); sum_abs2 += AND_VEC_D(src2); sum_abs3 += AND_VEC_D(src3); sum_abs0 += AND_VEC_D(src4); sum_abs1 += AND_VEC_D(src5); sum_abs2 += AND_VEC_D(src6); sum_abs3 += AND_VEC_D(src7); sum_abs0 += AND_VEC_D(src8); sum_abs1 += AND_VEC_D(src9); sum_abs2 += AND_VEC_D(src10); sum_abs3 += AND_VEC_D(src11); sum_abs0 += AND_VEC_D(src12); sum_abs1 += AND_VEC_D(src13); sum_abs2 += AND_VEC_D(src14); sum_abs3 += AND_VEC_D(src15); } if (n & 31) { if (n & 16) { LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); sum_abs2 += AND_VEC_D(src2); sum_abs3 += AND_VEC_D(src3); sum_abs0 += AND_VEC_D(src4); sum_abs1 += AND_VEC_D(src5); sum_abs2 += AND_VEC_D(src6); sum_abs3 += AND_VEC_D(src7); } if (n & 8) { LD_DP4_INC(x, 2, src0, src1, src2, src3); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); sum_abs2 += AND_VEC_D(src2);
sum_abs3 += AND_VEC_D(src3); } if (n & 4) { LD_DP2_INC(x, 2, src0, src1); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); } if (n & 2) { src0 = LD_DP(x); x += 2; sum_abs0 += AND_VEC_D(src0); } if (n & 1) { sumf += fabs(*x); } } sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; sumf += sum_abs0[0] + sum_abs0[1]; } else { if (n > 16) { LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); for (i = (n >> 4) - 1; i--;) { LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); sum_abs2 += AND_VEC_D(src2); sum_abs3 += AND_VEC_D(src3); sum_abs0 += AND_VEC_D(src4); sum_abs1 += AND_VEC_D(src5); sum_abs2 += AND_VEC_D(src6); sum_abs3 += AND_VEC_D(src7); LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); sum_abs0 += AND_VEC_D(src8); sum_abs1 += AND_VEC_D(src9); sum_abs2 += AND_VEC_D(src10); sum_abs3 += AND_VEC_D(src11); sum_abs0 += AND_VEC_D(src12); sum_abs1 += AND_VEC_D(src13); sum_abs2 += AND_VEC_D(src14); sum_abs3 += AND_VEC_D(src15); } LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); sum_abs2 += AND_VEC_D(src2); sum_abs3 += AND_VEC_D(src3); sum_abs0 += AND_VEC_D(src4); sum_abs1 += AND_VEC_D(src5); sum_abs2 += AND_VEC_D(src6); sum_abs3 += AND_VEC_D(src7); sum_abs0 += AND_VEC_D(src8); sum_abs1 += AND_VEC_D(src9); sum_abs2 += AND_VEC_D(src10); sum_abs3 += AND_VEC_D(src11); sum_abs0 += AND_VEC_D(src12); sum_abs1 += AND_VEC_D(src13); sum_abs2 += AND_VEC_D(src14); sum_abs3 += AND_VEC_D(src15); } if (n & 15) { if (n & 8) { LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); sum_abs2 += AND_VEC_D(src2); sum_abs3 += AND_VEC_D(src3); sum_abs0 += AND_VEC_D(src4); sum_abs1 += AND_VEC_D(src5); sum_abs2 += AND_VEC_D(src6); sum_abs3 += AND_VEC_D(src7); } if (n & 4) { LD_DP4_INC(x, inc_x, src0, src1, src2, src3); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); sum_abs2 += AND_VEC_D(src2); sum_abs3 += AND_VEC_D(src3); } if (n & 2) { LD_DP2_INC(x, inc_x, src0, src1); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); } if (n & 1) { src0 = LD_DP(x); sum_abs0 += AND_VEC_D(src0); } } sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; sumf = sum_abs0[0]; } return (sumf); } OpenBLAS-0.2.20/kernel/mips/daxpy_msa.c000066400000000000000000000174011313527062700175300ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" #if !defined(CONJ) #define OP0 += #define OP1 -= #define OP2 += #else #define OP0 -= #define OP1 += #define OP2 -= #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i; FLOAT *py; v2f64 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7; v2f64 da_vec, zero_v = {0}; if ((n < 0) || (da == 0.0)) return(0); py = y; if ((1 == inc_x) && (1 == inc_y)) { FLOAT *x_pref, *y_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 32; pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } y_pref = y + pref_offset + 32; da_vec = COPY_DOUBLE_TO_VECTOR(da); for (i = (n >> 4); i--;) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(y_pref, 0); PREF_OFFSET(y_pref, 32); PREF_OFFSET(y_pref, 64); PREF_OFFSET(y_pref, 96); x_pref += 16; y_pref += 16; LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7); FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7); ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2); } if (n & 15) { if (n & 8) { LD_DP4_INC(x, 2, x0, x1, x2, x3); LD_DP4_INC(py, 2, y0, y1, y2, y3); FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); ST_DP4_INC(y0, y1, y2, y3, y, 2); } if (n & 4) { LD_DP2_INC(x, 2, x0, x1); LD_DP2_INC(py, 2, y0, y1); FMADD2(x0, x1, da_vec, y0, y1); ST_DP2_INC(y0, y1, y, 2); } if (n & 2) { x0 = LD_DP(x); x += 2; y0 = LD_DP(py); py += 2; y0 += da_vec * x0; ST_DP(y0, y); y += 2; } if (n & 1) { y[0] += da * x[0]; } } } else if (1 == inc_y) { FLOAT *y_pref; BLASLONG pref_offset; v2f64 x8, x9, x10, x11, x12, x13, x14; pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } y_pref = y + pref_offset + 32; da_vec = COPY_DOUBLE_TO_VECTOR(da); for (i = (n >> 4); i--;) { PREF_OFFSET(y_pref, 0); PREF_OFFSET(y_pref, 32); PREF_OFFSET(y_pref, 64); PREF_OFFSET(y_pref, 96); y_pref += 16; LD_DP8_INC(x, inc_x, x0, x1, x2, x3, x4, x5, x6, x14); LD_DP7_INC(x, inc_x, x8, x9, x10, x11, x12, x13, x7); PCKEV_D2_SD(x1, x0, x3, x2, x0, x1); PCKEV_D2_SD(x5, x4, x14, x6, x2, x3); PCKEV_D2_SD(x9, x8, x11, x10, x4, x5); x6 = (v2f64) __msa_pckev_d((v2i64) x13, (v2i64) x12); x7 = (v2f64) __msa_insert_d((v2i64) x7, 1, *((BLASLONG *) x)); x += inc_x; LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7); FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7); ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, 
y7, y, 2); } if (n & 15) { if (n & 8) { LD_DP7_INC(x, inc_x, x0, x1, x2, x6, x4, x5, x3); PCKEV_D2_SD(x1, x0, x6, x2, x0, x1); x2 = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4); x3 = (v2f64) __msa_insert_d((v2i64) x3, 1, *((BLASLONG *) x)); x += inc_x; LD_DP4_INC(py, 2, y0, y1, y2, y3); FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); ST_DP4_INC(y0, y1, y2, y3, y, 2); } if (n & 4) { LD_DP3_INC(x, inc_x, x0, x2, x1); x0 = (v2f64) __msa_pckev_d((v2i64) x2, (v2i64) x0); x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((BLASLONG *) x)); x += inc_x; LD_DP2_INC(py, 2, y0, y1); FMADD2(x0, x1, da_vec, y0, y1); ST_DP2_INC(y0, y1, y, 2); } if (n & 2) { x0 = (v2f64) __msa_insert_d((v2i64) zero_v, 0, *((BLASLONG *) x)); x += inc_x; x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((BLASLONG *) x)); x += inc_x; y0 = LD_DP(py); py += 2; y0 += da_vec * x0; ST_DP(y0, y); y += 2; } if (n & 1) { y[0] += da * x[0]; } } } else { FLOAT x0, x1, x2, x3, y0, y1, y2, y3; for (i = (n >> 2); i--;) { LD_GP4_INC(x, inc_x, x0, x1, x2, x3); LD_GP4_INC(py, inc_y, y0, y1, y2, y3); FMADD4(x0, x1, x2, x3, da, y0, y1, y2, y3); ST_GP4_INC(y0, y1, y2, y3, y, inc_y); } if (n & 3) { if (n & 2) { LD_GP2_INC(x, inc_x, x0, x1); LD_GP2_INC(py, inc_y, y0, y1); FMADD2(x0, x1, da, y0, y1); ST_GP2_INC(y0, y1, y, inc_y); } if (n & 1) { *y += da * *x; } } } return (0); } OpenBLAS-0.2.20/kernel/mips/dcopy_msa.c000066400000000000000000000136321313527062700175230ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i; v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; FLOAT f0, f1, f2, f3, f4, f5, f6, f7; if (n < 0) return (0); if ((1 == inc_x) && (1 == inc_y)) { if (n > 31) { FLOAT *x_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 64 + 16; LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); for (i = (n >> 5) - 1; i--;) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(x_pref, 128); PREF_OFFSET(x_pref, 160); PREF_OFFSET(x_pref, 192); PREF_OFFSET(x_pref, 224); x_pref += 32; x8 = LD_DP(x); x += 2; ST_DP(x0, y); y += 2; x9 = LD_DP(x); x += 2; ST_DP(x1, y); y += 2; x10 = LD_DP(x); x += 2; ST_DP(x2, y); y += 2; x11 = LD_DP(x); x += 2; ST_DP(x3, y); y += 2; x12 = LD_DP(x); x += 2; ST_DP(x4, y); y += 2; x13 = LD_DP(x); x += 2; ST_DP(x5, y); y += 2; x14 = LD_DP(x); x += 2; ST_DP(x6, y); y += 2; x15 = LD_DP(x); x += 2; ST_DP(x7, y); y += 2; x0 = LD_DP(x); x += 2; ST_DP(x8, y); y += 2; x1 = LD_DP(x); x += 2; ST_DP(x9, y); y += 2; x2 = LD_DP(x); x += 2; ST_DP(x10, y); y += 2; x3 = LD_DP(x); x += 2; ST_DP(x11, y); y += 2; x4 = LD_DP(x); x += 2; ST_DP(x12, y); y += 2; x5 = LD_DP(x); x += 2; ST_DP(x13, y); y += 2; x6 = LD_DP(x); x += 2; ST_DP(x14, y); y += 2; x7 = LD_DP(x); x += 2; ST_DP(x15, y); y += 2; } x8 = LD_DP(x); x += 2; x9 = LD_DP(x); x += 2; ST_DP(x0, y); y += 2; x10 = LD_DP(x); x += 2; ST_DP(x1, y); y += 2; x11 = LD_DP(x); x += 2; ST_DP(x2, y); y += 2; x12 = LD_DP(x); x += 2; ST_DP(x3, y); y += 2; x13 = LD_DP(x); x += 2; ST_DP(x4, y); y += 2; x14 = LD_DP(x); x += 2; ST_DP(x5, y); y += 2; x15 = LD_DP(x); x += 2; ST_DP(x6, y); y += 2; ST_DP(x7, y); y += 2; ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 2); } if (n & 31) { if (n & 16) { LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 2); } if (n & 8) { LD_DP4_INC(x, 2, x0, x1, x2, x3); ST_DP4_INC(x0, x1, x2, x3, y, 2); } if (n & 4) { LD_GP4_INC(x, 1, f0, f1, f2, f3); ST_GP4_INC(f0, f1, f2, f3, y, 1); } if (n & 2) { LD_GP2_INC(x, 1, f0, f1); ST_GP2_INC(f0, f1, y, 1); } if (n & 1) { *y = *x; } } } else { for (i = (n >> 3); i--;) { LD_GP8_INC(x, inc_x, f0, f1, f2, f3, f4, f5, f6, f7); ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, y, inc_y); } if (n & 4) { LD_GP4_INC(x, inc_x, f0, f1, f2, f3); ST_GP4_INC(f0, f1, f2, f3, y, inc_y); } if (n & 2) { LD_GP2_INC(x, inc_x, f0, f1); ST_GP2_INC(f0, f1, y, inc_y); } if (n & 1) { *y = *x; } } return (0); } OpenBLAS-0.2.20/kernel/mips/ddot_msa.c000066400000000000000000000117531313527062700173410ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i = 0; FLOAT dot = 0.0; FLOAT x0, x1, x2, x3, y0, y1, y2, y3; v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; v2f64 dot0 = {0, 0}; v2f64 dot1 = {0, 0}; v2f64 dot2 = {0, 0}; v2f64 dot3 = {0, 0}; if (n < 1) return (dot); if ((1 == inc_x) && (1 == inc_y)) { FLOAT *x_pref, *y_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 32; pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } y_pref = y + pref_offset + 32; for (i = (n >> 4); i--;) { LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(y_pref, 0); PREF_OFFSET(y_pref, 32); PREF_OFFSET(y_pref, 64); PREF_OFFSET(y_pref, 96); x_pref += 16; y_pref += 16; dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); dot2 += (vy2 * vx2); dot3 += (vy3 * vx3); dot0 += (vy4 * vx4); dot1 += (vy5 * vx5); dot2 += (vy6 * vx6); dot3 += (vy7 * vx7); } if (n & 15) { if (n & 8) { LD_DP4_INC(x, 2, vx0, vx1, vx2, vx3); LD_DP4_INC(y, 2, vy0, vy1, vy2, vy3); dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); dot2 += (vy2 * vx2); dot3 += (vy3 * vx3); } if (n & 4) { LD_DP2_INC(x, 2, vx0, vx1); LD_DP2_INC(y, 2, vy0, vy1); dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); } if (n & 2) { vx0 = LD_DP(x); x += 2; vy0 = LD_DP(y); y += 2; dot0 += (vy0 * vx0); } if (n & 1) { x0 = *x; y0 = *y; dot += (y0 * x0); } } dot0 += dot1 + dot2 + dot3; dot += dot0[0]; dot += dot0[1]; } else { for (i = (n >> 2); i--;) { LD_GP4_INC(x, inc_x, x0, x1, x2, x3); LD_GP4_INC(y, inc_y, y0, y1, y2, y3); dot += (y0 * x0); dot += (y1 * x1); dot += (y2 * x2); dot += (y3 * x3); } if (n & 2) { LD_GP2_INC(x, inc_x, x0, x1); LD_GP2_INC(y, inc_y, y0, y1); dot += (y0 * x0); dot += (y1 * x1); } if (n & 1) { x0 = *x; y0 = *y; dot += (y0 * x0); } } return (dot); } 
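/* Added commentary (illustrative sketch, not part of the original kernel): the MSA dot-product kernel above computes the same dot product as the plain scalar loop below, up to floating-point rounding from the different summation order. "ddot_ref" is a hypothetical name used only for this sketch; BLASLONG and FLOAT are the typedefs provided by "common.h". The vector path only differs in that it accumulates four v2f64 partial sums (dot0..dot3) to shorten the floating-point dependency chain and reduces them just before returning.

   FLOAT ddot_ref(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
   {
       BLASLONG i;
       FLOAT dot = 0.0;
       if (n < 1) return dot;
       for (i = 0; i < n; i++)
           dot += y[i * inc_y] * x[i * inc_x];
       return dot;
   }
*/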
OpenBLAS-0.2.20/kernel/mips/dgemm_kernel_8x4_msa.c000066400000000000000000001373731313527062700215520ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" static void __attribute__ ((noinline)) dgemmkernel_8x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc #ifdef TRMMKERNEL , BLASLONG offset #endif ) { BLASLONG i, j, l, temp; #if defined(TRMMKERNEL) BLASLONG off; #endif FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0; v2f64 v_alpha = {alpha, alpha}; v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0, src_b1; v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v2f64 res0, res1, res2, res3, res4, res5, res6, res7; v2f64 res8, res9, res10, res11, res12, res13, res14, res15; #if defined(TRMMKERNEL) && !defined(LEFT) off = -offset; #endif for (j = (n >> 2); j--;) { pc0 = C; pc1 = pc0 + ldc; pc2 = pc1 + ldc; pc3 = pc2 + ldc; #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif pa0 = A; for (i = (m >> 3); i--;) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 8; pb0 = B + off * 4; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 8; // number of values in A #else temp = off + 4; // number of values in B #endif #else // #if !defined(TRMMKERNEL) pb0 = B; temp = k; #endif #ifdef ENABLE_PREFETCH __asm__ __volatile__( "pref 0, 64(%[pa0]) \n\t" "pref 0, 96(%[pa0]) \n\t" "pref 0, 128(%[pa0]) \n\t" "pref 0, 160(%[pa0]) \n\t" "pref 0, 32(%[pb0]) \n\t" "pref 0, 64(%[pb0]) \n\t" "pref 0, 96(%[pb0]) \n\t" : : [pa0] "r" (pa0), [pb0] "r" (pb0) ); #endif LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 = src_a0 * src_b; res1 = src_a1 * src_b; res2 = 
src_a2 * src_b; res3 = src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res4 = src_a0 * src_b; res5 = src_a1 * src_b; res6 = src_a2 * src_b; res7 = src_a3 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); res8 = src_a0 * src_b; res9 = src_a1 * src_b; res10 = src_a2 * src_b; res11 = src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res12 = src_a0 * src_b; res13 = src_a1 * src_b; res14 = src_a2 * src_b; res15 = src_a3 * src_b; for (l = ((temp - 1) >> 1); l--;) { #ifdef ENABLE_PREFETCH __asm__ __volatile__( "pref 0, 128(%[pa0]) \n\t" "pref 0, 160(%[pa0]) \n\t" "pref 0, 192(%[pa0]) \n\t" "pref 0, 224(%[pa0]) \n\t" : : [pa0] "r" (pa0) ); #endif LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; res2 += src_a2 * src_b; res3 += src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res4 += src_a0 * src_b; res5 += src_a1 * src_b; res6 += src_a2 * src_b; res7 += src_a3 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); res8 += src_a0 * src_b; res9 += src_a1 * src_b; res10 += src_a2 * src_b; res11 += src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res12 += src_a0 * src_b; res13 += src_a1 * src_b; res14 += src_a2 * src_b; res15 += src_a3 * src_b; LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); #ifdef ENABLE_PREFETCH __asm__ __volatile__( "pref 0, 64(%[pb0]) \n\t" "pref 0, 96(%[pb0]) \n\t" : : [pb0] "r" (pb0) ); #endif LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; res2 += src_a2 * src_b; res3 += src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res4 += src_a0 * src_b; res5 += src_a1 * src_b; res6 += src_a2 * src_b; res7 += src_a3 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); res8 += src_a0 * src_b; res9 += src_a1 * src_b; res10 += src_a2 * src_b; res11 += src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res12 += src_a0 * src_b; res13 += src_a1 * src_b; res14 += src_a2 * src_b; res15 += src_a3 * src_b; } if ((temp - 1) & 1) { LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; res2 += src_a2 * src_b; res3 += src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res4 += src_a0 * src_b; res5 += src_a1 * src_b; res6 += src_a2 * src_b; res7 += src_a3 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); res8 += src_a0 * src_b; res9 += src_a1 * src_b; res10 += src_a2 * src_b; res11 += src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res12 += src_a0 * src_b; res13 += src_a1 * src_b; res14 += src_a2 * src_b; res15 += src_a3 * src_b; } #ifdef ENABLE_PREFETCH __asm__ __volatile__( "pref 0, 64(%[pc0]) \n\t" "pref 0, 64(%[pc1]) \n\t" "pref 0, 64(%[pc2]) \n\t" "pref 0, 64(%[pc3]) \n\t" : : [pc0] "r" (pc0), [pc1] "r" (pc1), [pc2] "r" (pc2), [pc3] "r" (pc3) ); #endif #if defined(TRMMKERNEL) dst0 = res0 * v_alpha; dst1 = res1 * v_alpha; dst2 = res2 * v_alpha; dst3 = res3 * v_alpha; dst4 = res4 * v_alpha; dst5 = res5 * v_alpha; dst6 = res6 * v_alpha; dst7 = res7 * v_alpha; #else LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); LD_DP4(pc1, 
2, dst4, dst5, dst6, dst7); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; dst4 += res4 * v_alpha; dst5 += res5 * v_alpha; dst6 += res6 * v_alpha; dst7 += res7 * v_alpha; #endif ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); #if defined(TRMMKERNEL) dst0 = res8 * v_alpha; dst1 = res9 * v_alpha; dst2 = res10 * v_alpha; dst3 = res11 * v_alpha; dst4 = res12 * v_alpha; dst5 = res13 * v_alpha; dst6 = res14 * v_alpha; dst7 = res15 * v_alpha; #else LD_DP4(pc2, 2, dst0, dst1, dst2, dst3); LD_DP4(pc3, 2, dst4, dst5, dst6, dst7); dst0 += res8 * v_alpha; dst1 += res9 * v_alpha; dst2 += res10 * v_alpha; dst3 += res11 * v_alpha; dst4 += res12 * v_alpha; dst5 += res13 * v_alpha; dst6 += res14 * v_alpha; dst7 += res15 * v_alpha; #endif ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2); ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2); #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 8; // number of values in A #else temp -= 4; // number of values in B #endif pa0 += temp * 8; pb0 += temp * 4; #endif #ifdef LEFT off += 8; // number of values in A #endif #endif // #if defined(TRMMKERNEL) } #if defined(TRMMKERNEL) && !defined(LEFT) off += 4; // number of values in A #endif B += (k << 2); C += (ldc << 2); } } static void __attribute__ ((noinline)) dgemmkernel_7x4_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc #ifdef TRMMKERNEL , BLASLONG offset #endif ) { BLASLONG j, l, temp; #if defined(TRMMKERNEL) BLASLONG off; #endif FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0; FLOAT tmp0, tmp1, tmp2, tmp3; FLOAT a0, b0, b1, b2, b3; v2f64 v_alpha = {alpha, alpha}; v2f64 src_a0, src_a1, src_b, src_b0, src_b1; v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v2f64 res0, res1, res2, res3, res4, res5, res6, res7; #if defined(TRMMKERNEL) && !defined(LEFT) off = -offset; #endif for (j = (n >> 2); j--;) { #if defined(TRMMKERNEL) pc0 = C; pc1 = pc0 + ldc; pc2 = pc1 + ldc; pc3 = pc2 + ldc; pa0 = A; #if defined(LEFT) off = offset; #endif for (l = (m >> 3); l--;) { #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 8; pb0 = B + off * 4; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 8; // number of values in A #else temp = off + 4; // number of values in B #endif pc0 += 8; pc1 += 8; pc2 += 8; pc3 += 8; pa0 += 8 * temp; pb0 += 4 * temp; #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 8; // number of values in A #else temp -= 4; // number of values in B #endif pa0 += temp * 8; pb0 += temp * 4; #endif #ifdef LEFT off += 8; // number of values in A #endif } #else // #if !defined(TRMMKERNEL) pc0 = C + 8 * (m >> 3); pc1 = pc0 + ldc; pc2 = pc1 + ldc; pc3 = pc2 + ldc; pa0 = A + k * 8 * (m >> 3); #endif if (m & 4) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 4; pb0 = B + off * 4; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 4; // number of values in A #else temp = off + 4; // number of values in B #endif #else // #if !defined(TRMMKERNEL) pb0 = B; temp = k; #endif LD_DP2_INC(pa0, 2, src_a0, src_a1); LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = 
(v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 = src_a0 * src_b; res1 = src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res2 = src_a0 * src_b; res3 = src_a1 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); res4 = src_a0 * src_b; res5 = src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res6 = src_a0 * src_b; res7 = src_a1 * src_b; for (l = ((temp - 1) >> 1); l--;) { LD_DP2_INC(pa0, 2, src_a0, src_a1); LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res2 += src_a0 * src_b; res3 += src_a1 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); res4 += src_a0 * src_b; res5 += src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res6 += src_a0 * src_b; res7 += src_a1 * src_b; LD_DP2_INC(pa0, 2, src_a0, src_a1); LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res2 += src_a0 * src_b; res3 += src_a1 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); res4 += src_a0 * src_b; res5 += src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res6 += src_a0 * src_b; res7 += src_a1 * src_b; } if ((temp - 1) & 1) { LD_DP2_INC(pa0, 2, src_a0, src_a1); LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res2 += src_a0 * src_b; res3 += src_a1 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); res4 += src_a0 * src_b; res5 += src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res6 += src_a0 * src_b; res7 += src_a1 * src_b; } #if defined(TRMMKERNEL) dst0 = res0 * v_alpha; dst1 = res1 * v_alpha; dst2 = res2 * v_alpha; dst3 = res3 * v_alpha; dst4 = res4 * v_alpha; dst5 = res5 * v_alpha; dst6 = res6 * v_alpha; dst7 = res7 * v_alpha; #else LD_DP2(pc0, 2, dst0, dst1); LD_DP2(pc1, 2, dst2, dst3); LD_DP2(pc2, 2, dst4, dst5); LD_DP2(pc3, 2, dst6, dst7); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; dst4 += res4 * v_alpha; dst5 += res5 * v_alpha; dst6 += res6 * v_alpha; dst7 += res7 * v_alpha; #endif ST_DP2_INC(dst0, dst1, pc0, 2); ST_DP2_INC(dst2, dst3, pc1, 2); ST_DP2_INC(dst4, dst5, pc2, 2); ST_DP2_INC(dst6, dst7, pc3, 2); #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 4; // number of values in A #else temp -= 4; // number of values in B #endif pa0 += temp * 4; pb0 += temp * 4; #endif #ifdef LEFT off += 4; // number of values in A #endif #endif // #if defined(TRMMKERNEL) } if (m & 2) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2; pb0 = B + off * 4; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 2; // number of values in A #else temp = off + 4; // number of values in B #endif #else // #if !defined(TRMMKERNEL) pb0 = B; temp = k; #endif src_a0 = LD_DP(pa0); pa0 += 2; LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) 
__msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 = src_a0 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res1 = src_a0 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); res2 = src_a0 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res3 = src_a0 * src_b; for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_DP(pa0); pa0 += 2; LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res1 += src_a0 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); res2 += src_a0 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res3 += src_a0 * src_b; src_a0 = LD_DP(pa0); pa0 += 2; LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res1 += src_a0 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); res2 += src_a0 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res3 += src_a0 * src_b; } if ((temp - 1) & 1) { src_a0 = LD_DP(pa0); pa0 += 2; LD_DP2_INC(pb0, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res1 += src_a0 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); res2 += src_a0 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); res3 += src_a0 * src_b; } #if defined(TRMMKERNEL) dst0 = res0 * v_alpha; dst1 = res1 * v_alpha; dst2 = res2 * v_alpha; dst3 = res3 * v_alpha; #else dst0 = LD_DP(pc0); dst1 = LD_DP(pc1); dst2 = LD_DP(pc2); dst3 = LD_DP(pc3); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; #endif ST_DP(dst0, pc0); ST_DP(dst1, pc1); ST_DP(dst2, pc2); ST_DP(dst3, pc3); pc0 += 2; pc1 += 2; pc2 += 2; pc3 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 2; // number of values in A #else temp -= 4; // number of values in B #endif pa0 += temp * 2; pb0 += temp * 4; #endif #ifdef LEFT off += 2; // number of values in A #endif #endif // #if defined(TRMMKERNEL) } if (m & 1) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 1; pb0 = B + off * 4; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 1; // number of values in A #else temp = off + 4; // number of values in B #endif #else // #if !defined(TRMMKERNEL) pb0 = B; temp = k; #endif a0 = pa0[0]; b0 = pb0[0]; tmp0 = a0 * b0; b1 = pb0[1]; tmp1 = a0 * b1; b2 = pb0[2]; tmp2 = a0 * b2; b3 = pb0[3]; tmp3 = a0 * b3; pa0 += 1; pb0 += 4; for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; b2 = pb0[2]; tmp2 += a0 * b2; b3 = pb0[3]; tmp3 += a0 * b3; pa0 += 1; pb0 += 4; a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; b2 = pb0[2]; tmp2 += a0 * b2; b3 = pb0[3]; tmp3 += a0 * b3; pa0 += 1; pb0 += 4; } if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; b2 = pb0[2]; tmp2 += a0 * b2; b3 = pb0[3]; tmp3 += a0 * b3; pa0 += 1; pb0 += 4; } tmp0 = alpha * tmp0; tmp1 = alpha * tmp1; tmp2 = alpha * tmp2; 
tmp3 = alpha * tmp3; #if defined(TRMMKERNEL) pc0[0] = tmp0; pc1[0] = tmp1; pc2[0] = tmp2; pc3[0] = tmp3; #else pc0[0] += tmp0; pc1[0] += tmp1; pc2[0] += tmp2; pc3[0] += tmp3; #endif pc0 += 1; pc1 += 1; pc2 += 1; pc3 += 1; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 1; // number of values in A #else temp -= 4; // number of values in B #endif pa0 += temp * 1; pb0 += temp * 4; #endif #ifdef LEFT off += 1; // number of values in A #endif #endif // #if defined(TRMMKERNEL) } #if defined(TRMMKERNEL) && !defined(LEFT) off += 4; // number of values in A #endif B += (k << 2); C += (ldc << 2); } } static void __attribute__ ((noinline)) dgemmkernel_8x4_non_core_msa(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc #ifdef TRMMKERNEL , BLASLONG offset #endif ) { BLASLONG i, l, temp; #if defined(TRMMKERNEL) BLASLONG off; #endif FLOAT *pc0, *pc1, *pa0, *pb0; FLOAT tmp0, tmp1; FLOAT a0, b0, b1; v2f64 v_alpha = {alpha, alpha}; v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0; v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v2f64 res0, res1, res2, res3, res4, res5, res6, res7; #if defined(TRMMKERNEL) && !defined(LEFT) off = -offset + (4 * (n >> 2)); #endif if (n & 2) { pc0 = C; pc1 = pc0 + ldc; #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif pa0 = A; for (i = (m >> 3); i--;) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 8; pb0 = B + off * 2; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 8; // number of values in A #else temp = off + 2; // number of values in B #endif #else // #if !defined(TRMMKERNEL) pb0 = B; temp = k; #endif LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b0 = LD_DP(pb0); pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 = src_a0 * src_b; res1 = src_a1 * src_b; res2 = src_a2 * src_b; res3 = src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res4 = src_a0 * src_b; res5 = src_a1 * src_b; res6 = src_a2 * src_b; res7 = src_a3 * src_b; for (l = ((temp - 1) >> 1); l--;) { LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b0 = LD_DP(pb0); pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; res2 += src_a2 * src_b; res3 += src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res4 += src_a0 * src_b; res5 += src_a1 * src_b; res6 += src_a2 * src_b; res7 += src_a3 * src_b; LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b0 = LD_DP(pb0); pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; res2 += src_a2 * src_b; res3 += src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res4 += src_a0 * src_b; res5 += src_a1 * src_b; res6 += src_a2 * src_b; res7 += src_a3 * src_b; } if ((temp - 1) & 1) { LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b0 = LD_DP(pb0); pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; res2 += src_a2 * src_b; res3 += src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res4 += src_a0 * src_b; res5 += src_a1 * src_b; res6 += src_a2 * src_b; res7 += src_a3 * src_b; } #if defined(TRMMKERNEL) dst0 = res0 
* v_alpha; dst1 = res1 * v_alpha; dst2 = res2 * v_alpha; dst3 = res3 * v_alpha; dst4 = res4 * v_alpha; dst5 = res5 * v_alpha; dst6 = res6 * v_alpha; dst7 = res7 * v_alpha; #else LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; dst4 += res4 * v_alpha; dst5 += res5 * v_alpha; dst6 += res6 * v_alpha; dst7 += res7 * v_alpha; #endif ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 8; // number of values in A #else temp -= 2; // number of values in B #endif pa0 += temp * 8; pb0 += temp * 2; #endif #ifdef LEFT off += 8; // number of values in A #endif #endif // #if defined(TRMMKERNEL) } if (m & 4) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 4; pb0 = B + off * 2; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 4; // number of values in A #else temp = off + 2; // number of values in B #endif #else // #if !defined(TRMMKERNEL) pb0 = B; temp = k; #endif LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b0 = LD_DP(pb0); pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 = src_a0 * src_b; res1 = src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res2 = src_a0 * src_b; res3 = src_a1 * src_b; for (l = ((temp - 1) >> 1); l--;) { LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b0 = LD_DP(pb0); pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res2 += src_a0 * src_b; res3 += src_a1 * src_b; LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b0 = LD_DP(pb0); pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res2 += src_a0 * src_b; res3 += src_a1 * src_b; } if ((temp - 1) & 1) { LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b0 = LD_DP(pb0); pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res2 += src_a0 * src_b; res3 += src_a1 * src_b; } #if defined(TRMMKERNEL) dst0 = res0 * v_alpha; dst1 = res1 * v_alpha; dst2 = res2 * v_alpha; dst3 = res3 * v_alpha; #else LD_DP2(pc0, 2, dst0, dst1); LD_DP2(pc1, 2, dst2, dst3); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; #endif ST_DP2_INC(dst0, dst1, pc0, 2); ST_DP2_INC(dst2, dst3, pc1, 2); #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 4; // number of values in A #else temp -= 2; // number of values in B #endif pa0 += temp * 4; pb0 += temp * 2; #endif #ifdef LEFT off += 4; // number of values in A #endif #endif // #if defined(TRMMKERNEL) } if (m & 2) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2; pb0 = B + off * 2; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 2; // number of 
values in A #else temp = off + 2; // number of values in B #endif #else // #if !defined(TRMMKERNEL) pb0 = B; temp = k; #endif src_a0 = LD_DP(pa0); pa0 += 2; src_b0 = LD_DP(pb0); pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 = src_a0 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res1 = src_a0 * src_b; for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_DP(pa0); pa0 += 2; src_b0 = LD_DP(pb0); pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res1 += src_a0 * src_b; src_a0 = LD_DP(pa0); pa0 += 2; src_b0 = LD_DP(pb0); pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res1 += src_a0 * src_b; } if ((temp - 1) & 1) { src_a0 = LD_DP(pa0); pa0 += 2; src_b0 = LD_DP(pb0); pb0 += 2; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); res0 += src_a0 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); res1 += src_a0 * src_b; } #if defined(TRMMKERNEL) dst0 = res0 * v_alpha; dst1 = res1 * v_alpha; #else dst0 = LD_DP(pc0); dst1 = LD_DP(pc1); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; #endif ST_DP(dst0, pc0); ST_DP(dst1, pc1); pc0 += 2; pc1 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 2; // number of values in A #else temp -= 2; // number of values in B #endif pa0 += temp * 2; pb0 += temp * 2; #endif #ifdef LEFT off += 2; // number of values in A #endif #endif // #if defined(TRMMKERNEL) } if (m & 1) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 1; pb0 = B + off * 2; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 1; // number of values in A #else temp = off + 2; // number of values in B #endif #else // #if !defined(TRMMKERNEL) pb0 = B; temp = k; #endif a0 = pa0[0]; b0 = pb0[0]; tmp0 = a0 * b0; b1 = pb0[1]; tmp1 = a0 * b1; pa0 += 1; pb0 += 2; for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; pa0 += 1; pb0 += 2; a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; pa0 += 1; pb0 += 2; } if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; pa0 += 1; pb0 += 2; } tmp0 = alpha * tmp0; tmp1 = alpha * tmp1; #if defined(TRMMKERNEL) pc0[0] = tmp0; pc1[0] = tmp1; #else pc0[0] += tmp0; pc1[0] += tmp1; #endif pc0 += 1; pc1 += 1; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 1; // number of values in A #else temp -= 2; // number of values in B #endif pa0 += temp * 1; pb0 += temp * 2; #endif #ifdef LEFT off += 1; // number of values in A #endif #endif // #if defined(TRMMKERNEL) } #if defined(TRMMKERNEL) && !defined(LEFT) off += 2; // number of values in A #endif B += (k << 1); C += (ldc << 1); } if (n & 1) { pc0 = C; #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif pa0 = A; for (i = (m >> 3); i--;) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 8; pb0 = B + off * 1; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 
temp = k - off; #elif defined(LEFT) temp = off + 8; // number of values in A #else temp = off + 1; // number of values in B #endif #else // #if !defined(TRMMKERNEL) pb0 = B; temp = k; #endif LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b[0] = pb0[0]; src_b[1] = pb0[0]; res0 = src_a0 * src_b; res1 = src_a1 * src_b; res2 = src_a2 * src_b; res3 = src_a3 * src_b; pb0 += 1; for (l = ((temp - 1) >> 1); l--;) { LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b[0] = pb0[0]; src_b[1] = pb0[0]; res0 += src_a0 * src_b; res1 += src_a1 * src_b; res2 += src_a2 * src_b; res3 += src_a3 * src_b; pb0 += 1; LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b[0] = pb0[0]; src_b[1] = pb0[0]; res0 += src_a0 * src_b; res1 += src_a1 * src_b; res2 += src_a2 * src_b; res3 += src_a3 * src_b; pb0 += 1; } if ((temp - 1) & 1) { LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); src_b[0] = pb0[0]; src_b[1] = pb0[0]; res0 += src_a0 * src_b; res1 += src_a1 * src_b; res2 += src_a2 * src_b; res3 += src_a3 * src_b; pb0 += 1; } #if defined(TRMMKERNEL) dst0 = res0 * v_alpha; dst1 = res1 * v_alpha; dst2 = res2 * v_alpha; dst3 = res3 * v_alpha; #else LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; #endif ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 8; // number of values in A #else temp -= 1; // number of values in B #endif pa0 += temp * 8; pb0 += temp * 1; #endif #ifdef LEFT off += 8; // number of values in A #endif #endif // #if defined(TRMMKERNEL) } if (m & 4) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 4; pb0 = B + off * 1; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 4; // number of values in A #else temp = off + 1; // number of values in B #endif #else // #if !defined(TRMMKERNEL) pb0 = B; temp = k; #endif LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b[0] = pb0[0]; src_b[1] = pb0[0]; res0 = src_a0 * src_b; res1 = src_a1 * src_b; pb0 += 1; for (l = ((temp - 1) >> 1); l--;) { LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b[0] = pb0[0]; src_b[1] = pb0[0]; res0 += src_a0 * src_b; res1 += src_a1 * src_b; pb0 += 1; LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b[0] = pb0[0]; src_b[1] = pb0[0]; res0 += src_a0 * src_b; res1 += src_a1 * src_b; pb0 += 1; } if ((temp - 1) & 1) { LD_DP2_INC(pa0, 2, src_a0, src_a1); src_b[0] = pb0[0]; src_b[1] = pb0[0]; res0 += src_a0 * src_b; res1 += src_a1 * src_b; pb0 += 1; } #if defined(TRMMKERNEL) dst0 = res0 * v_alpha; dst1 = res1 * v_alpha; #else LD_DP2(pc0, 2, dst0, dst1); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; #endif ST_DP2_INC(dst0, dst1, pc0, 2); #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 4; // number of values in A #else temp -= 1; // number of values in B #endif pa0 += temp * 4; pb0 += temp * 1; #endif #ifdef LEFT off += 4; // number of values in A #endif #endif // #if defined(TRMMKERNEL) } if (m & 2) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2; pb0 = B + off * 1; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = 
off + 2; // number of values in A #else temp = off + 1; // number of values in B #endif #else // #if !defined(TRMMKERNEL) pb0 = B; temp = k; #endif src_a0 = LD_DP(pa0); src_b[0] = pb0[0]; src_b[1] = pb0[0]; res0 = src_a0 * src_b; pa0 += 2; pb0 += 1; for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_DP(pa0); src_b[0] = pb0[0]; src_b[1] = pb0[0]; res0 += src_a0 * src_b; pa0 += 2; pb0 += 1; src_a0 = LD_DP(pa0); src_b[0] = pb0[0]; src_b[1] = pb0[0]; res0 += src_a0 * src_b; pa0 += 2; pb0 += 1; } if ((temp - 1) & 1) { src_a0 = LD_DP(pa0); src_b[0] = pb0[0]; src_b[1] = pb0[0]; res0 += src_a0 * src_b; pa0 += 2; pb0 += 1; } #if defined(TRMMKERNEL) dst0 = res0 * v_alpha; #else dst0 = LD_DP(pc0); dst0 += res0 * v_alpha; #endif ST_DP(dst0, pc0); pc0 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 2; // number of values in A #else temp -= 1; // number of values in B #endif pa0 += temp * 2; pb0 += temp * 1; #endif #ifdef LEFT off += 2; // number of values in A #endif #endif // #if defined(TRMMKERNEL) } if (m & 1) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 1; pb0 = B + off * 1; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 1; // number of values in A #else temp = off + 1; // number of values in B #endif #else // #if !defined(TRMMKERNEL) pb0 = B; temp = k; #endif a0 = pa0[0]; b0 = pb0[0]; tmp0 = a0 * b0; pa0 += 1; pb0 += 1; for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; pa0 += 1; pb0 += 1; a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; pa0 += 1; pb0 += 1; } if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; pa0 += 1; pb0 += 1; } #if defined(TRMMKERNEL) pc0[0] = alpha * tmp0; #else pc0[0] += alpha * tmp0; #endif } } } int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc #ifdef TRMMKERNEL , BLASLONG offset #endif ) { if (n >> 2) { if (m >> 3) #ifdef TRMMKERNEL dgemmkernel_8x4_core_msa(m, n, k, alpha, A, B, C, ldc, offset); #else dgemmkernel_8x4_core_msa(m, n, k, alpha, A, B, C, ldc); #endif if (m & 7) #ifdef TRMMKERNEL dgemmkernel_7x4_core_msa(m, n, k, alpha, A, B, C, ldc, offset); #else dgemmkernel_7x4_core_msa(m, n, k, alpha, A, B, C, ldc); #endif } if (n & 3) { B = B + (k << 2) * (n >> 2); C = C + (ldc << 2) * (n >> 2); #ifdef TRMMKERNEL dgemmkernel_8x4_non_core_msa(m, n, k, alpha, A, B, C, ldc, offset); #else dgemmkernel_8x4_non_core_msa(m, n, k, alpha, A, B, C, ldc); #endif } return 0; } OpenBLAS-0.2.20/kernel/mips/dgemm_ncopy_4_msa.c000066400000000000000000000072701313527062700211320ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, FLOAT * __restrict dst) { BLASLONG i, j; FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst; v2f64 src0, src1, src2, src3, src4, src5, src6, src7; v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; psrc0 = src; pdst = dst; for (j = (n >> 2); j--;) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc3 = psrc2 + lda; psrc4 = psrc3 + lda; psrc0 += 4 * lda; for (i = (m >> 2); i--;) { LD_DP2_INC(psrc1, 2, src0, src1); LD_DP2_INC(psrc2, 2, src2, src3); LD_DP2_INC(psrc3, 2, src4, src5); LD_DP2_INC(psrc4, 2, src6, src7); ILVRL_D2_DP(src2, src0, dst0, dst4); ILVRL_D2_DP(src6, src4, dst1, dst5); ILVRL_D2_DP(src3, src1, dst2, dst6); ILVRL_D2_DP(src7, src5, dst3, dst7); ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); } for (i = (m & 3); i--;) { *pdst++ = *psrc1++; *pdst++ = *psrc2++; *pdst++ = *psrc3++; *pdst++ = *psrc4++; } } if (n & 2) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc0 += 2 * lda; for (i = (m >> 2); i--;) { LD_DP2_INC(psrc1, 2, src0, src1); LD_DP2_INC(psrc2, 2, src2, src3); ILVRL_D2_DP(src2, src0, dst0, dst4); ILVRL_D2_DP(src3, src1, dst1, dst5); ST_DP4_INC(dst0, dst4, dst1, dst5, pdst, 2); } for (i = (m & 3); i--;) { *pdst++ = *psrc1++; *pdst++ = *psrc2++; } } if (n & 1) { psrc1 = psrc0; for (i = (m >> 2); i--;) { LD_DP2(psrc1, 2, src0, src1); psrc1 += 4; ST_DP2(src0, src1, pdst, 2); pdst += 4; } for (i = (m & 3); i--;) { *pdst++ = *psrc1++; } } return 0; } OpenBLAS-0.2.20/kernel/mips/dgemm_ncopy_8_msa.c000066400000000000000000000140101313527062700211240ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, FLOAT * __restrict dst) { BLASLONG i, j; FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; FLOAT *psrc8, *pdst; v2f64 src0, src1, src2, src3, src4, src5, src6, src7; v2f64 src8, src9, src10, src11, src12, src13, src14, src15; v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; psrc0 = src; pdst = dst; for (j = (n >> 3); j--;) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc3 = psrc2 + lda; psrc4 = psrc3 + lda; psrc5 = psrc4 + lda; psrc6 = psrc5 + lda; psrc7 = psrc6 + lda; psrc8 = psrc7 + lda; psrc0 += 8 * lda; for (i = (m >> 3); i--;) { LD_DP2_INC(psrc1, 2, src0, src1); LD_DP2_INC(psrc2, 2, src2, src3); LD_DP2_INC(psrc3, 2, src4, src5); LD_DP2_INC(psrc4, 2, src6, src7); LD_DP2_INC(psrc5, 2, src8, src9); LD_DP2_INC(psrc6, 2, src10, src11); LD_DP2_INC(psrc7, 2, src12, src13); LD_DP2_INC(psrc8, 2, src14, src15); ILVRL_D2_DP(src2, src0, dst0, dst4); ILVRL_D2_DP(src6, src4, dst1, dst5); ILVRL_D2_DP(src10, src8, dst2, dst6); ILVRL_D2_DP(src14, src12, dst3, dst7); ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); ILVRL_D2_DP(src3, src1, dst0, dst4); ILVRL_D2_DP(src7, src5, dst1, dst5); ILVRL_D2_DP(src11, src9, dst2, dst6); ILVRL_D2_DP(src15, src13, dst3, dst7); ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); LD_DP2_INC(psrc1, 2, src0, src1); LD_DP2_INC(psrc2, 2, src2, src3); LD_DP2_INC(psrc3, 2, src4, src5); LD_DP2_INC(psrc4, 2, src6, src7); LD_DP2_INC(psrc5, 2, src8, src9); LD_DP2_INC(psrc6, 2, src10, src11); LD_DP2_INC(psrc7, 2, src12, src13); LD_DP2_INC(psrc8, 2, src14, src15); ILVRL_D2_DP(src2, src0, dst0, dst4); ILVRL_D2_DP(src6, src4, dst1, dst5); ILVRL_D2_DP(src10, src8, dst2, dst6); ILVRL_D2_DP(src14, src12, dst3, dst7); ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); ILVRL_D2_DP(src3, src1, dst0, dst4); ILVRL_D2_DP(src7, src5, dst1, dst5); ILVRL_D2_DP(src11, src9, dst2, dst6); ILVRL_D2_DP(src15, src13, dst3, dst7); ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); } for (i = (m & 7); i--;) { *pdst++ = *psrc1++; *pdst++ = *psrc2++; *pdst++ = *psrc3++; *pdst++ = *psrc4++; *pdst++ = *psrc5++; *pdst++ = *psrc6++; *pdst++ = *psrc7++; *pdst++ = *psrc8++; } } if (n & 4) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc3 = psrc2 + lda; psrc4 = psrc3 + lda; psrc0 += 4 * lda; for (i = (m >> 2); i--;) { LD_DP2_INC(psrc1, 2, src0, src1); LD_DP2_INC(psrc2, 2, src2, src3); LD_DP2_INC(psrc3, 2, src4, src5); LD_DP2_INC(psrc4, 2, src6, src7); ILVRL_D2_DP(src2, src0, dst0, dst4); ILVRL_D2_DP(src6, src4, dst1, dst5); ILVRL_D2_DP(src3, src1, dst2, dst6); ILVRL_D2_DP(src7, src5, dst3, 
dst7); ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); } for (i = (m & 3); i--;) { *pdst++ = *psrc1++; *pdst++ = *psrc2++; *pdst++ = *psrc3++; *pdst++ = *psrc4++; } } if (n & 2) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc0 += 2 * lda; for (i = (m >> 1); i--;) { src0 = LD_DP(psrc1); src1 = LD_DP(psrc2); psrc1 += 2; psrc2 += 2; ILVRL_D2_DP(src1, src0, dst0, dst1); ST_DP2_INC(dst0, dst1, pdst, 2); } if (m & 1) { *pdst++ = *psrc1++; *pdst++ = *psrc2++; } } if (n & 1) { psrc1 = psrc0; for (i = m; i--;) { *pdst++ = *psrc1++; } } return 0; } OpenBLAS-0.2.20/kernel/mips/dgemm_tcopy_4_msa.c000066400000000000000000000102751313527062700211370ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, FLOAT * __restrict dst) { BLASLONG i, j; FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; FLOAT *pdst0, *pdst1, *pdst2, *pdst3; v2f64 src0, src1, src2, src3, src4, src5, src6, src7; psrc0 = src; pdst0 = dst; pdst2 = dst + m * (n & ~3); pdst3 = dst + m * (n & ~1); for (j = (m >> 2); j--;) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc3 = psrc2 + lda; psrc4 = psrc3 + lda; psrc0 += 4 * lda; pdst1 = pdst0; pdst0 += 16; for (i = (n >> 2); i--;) { LD_DP2_INC(psrc1, 2, src0, src1); LD_DP2_INC(psrc2, 2, src2, src3); LD_DP2_INC(psrc3, 2, src4, src5); LD_DP2_INC(psrc4, 2, src6, src7); ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); pdst1 += m * 4; } if (n & 2) { src0 = LD_DP(psrc1); src1 = LD_DP(psrc2); src2 = LD_DP(psrc3); src3 = LD_DP(psrc4); psrc1 += 2; psrc2 += 2; psrc3 += 2; psrc4 += 2; ST_DP4_INC(src0, src1, src2, src3, pdst2, 2); } if (n & 1) { *pdst3++ = *psrc1++; *pdst3++ = *psrc2++; *pdst3++ = *psrc3++; *pdst3++ = *psrc4++; } } if (m & 2) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc0 += 2 * lda; pdst1 = pdst0; pdst0 += 8; for (i = (n >> 2); i--;) { LD_DP2_INC(psrc1, 2, src0, src1); LD_DP2_INC(psrc2, 2, src2, src3); ST_DP4(src0, src1, src2, src3, pdst1, 2); pdst1 += m * 4; } if (n & 2) { src0 = LD_DP(psrc1); src1 = LD_DP(psrc2); psrc1 += 2; psrc2 += 2; ST_DP2_INC(src0, src1, pdst2, 2); } if (n & 1) { *pdst3++ = *psrc1++; *pdst3++ = *psrc2++; } } if (m & 1) { psrc1 = psrc0; pdst1 = pdst0; for (i = (n >> 2); i--;) { LD_DP2_INC(psrc1, 2, src0, src1); ST_DP2(src0, src1, pdst1, 2); pdst1 += 4 * m; } if (n & 2) { src0 = LD_DP(psrc1); psrc1 += 2; ST_DP(src0, pdst2); } if (n & 1) { *pdst3 = *psrc1; } } return 0; } OpenBLAS-0.2.20/kernel/mips/dgemm_tcopy_8_msa.c000066400000000000000000000177151313527062700211510ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, FLOAT * __restrict dst) { BLASLONG i, j; FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; FLOAT *psrc5, *psrc6, *psrc7, *psrc8; FLOAT *pdst0, *pdst1, *pdst2, *pdst3, *pdst4; v2f64 src0, src1, src2, src3, src4, src5, src6, src7; v2f64 src8, src9, src10, src11, src12, src13, src14, src15; psrc0 = src; pdst0 = dst; pdst2 = dst + m * (n & ~7); pdst3 = dst + m * (n & ~3); pdst4 = dst + m * (n & ~1); for (j = (m >> 3); j--;) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc3 = psrc2 + lda; psrc4 = psrc3 + lda; psrc5 = psrc4 + lda; psrc6 = psrc5 + lda; psrc7 = psrc6 + lda; psrc8 = psrc7 + lda; psrc0 += 8 * lda; pdst1 = pdst0; pdst0 += 64; for (i = (n >> 3); i--;) { LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, pdst1 + 16, 2); LD_DP4_INC(psrc5, 2, src0, src1, src2, src3); LD_DP4_INC(psrc6, 2, src4, src5, src6, src7); LD_DP4_INC(psrc7, 2, src8, src9, src10, src11); LD_DP4_INC(psrc8, 2, src12, src13, src14, src15); ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1 + 32, 2); ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, pdst1 + 48, 2); pdst1 += m * 8; } if (n & 4) { LD_DP2_INC(psrc1, 2, src0, src1); LD_DP2_INC(psrc2, 2, src2, src3); LD_DP2_INC(psrc3, 2, src4, src5); LD_DP2_INC(psrc4, 2, src6, src7); LD_DP2_INC(psrc5, 2, src8, src9); LD_DP2_INC(psrc6, 2, src10, src11); LD_DP2_INC(psrc7, 2, src12, src13); LD_DP2_INC(psrc8, 2, src14, src15); ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); ST_DP8_INC(src8, src9, src10, src11, src12, src13, src14, src15, pdst2, 2); } if (n & 2) { src0 = LD_DP(psrc1); src1 = LD_DP(psrc2); src2 = LD_DP(psrc3); src3 = LD_DP(psrc4); src4 = LD_DP(psrc5); src5 = LD_DP(psrc6); src6 = LD_DP(psrc7); src7 = LD_DP(psrc8); psrc1 += 2; psrc2 += 2; psrc3 += 2; psrc4 += 2; psrc5 += 2; psrc6 += 2; psrc7 += 2; psrc8 += 2; ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst3, 2); } if (n & 1) { *pdst4++ = *psrc1++; *pdst4++ = *psrc2++; *pdst4++ = *psrc3++; *pdst4++ = *psrc4++; *pdst4++ = *psrc5++; *pdst4++ = *psrc6++; *pdst4++ = *psrc7++; *pdst4++ = *psrc8++; } } if (m & 4) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc3 = psrc2 + lda; psrc4 = psrc3 + lda; psrc0 += 4 * lda; pdst1 = pdst0; pdst0 += 32; for (i = (n >> 3); i--;) { LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, pdst1 + 16, 2); pdst1 += 8 * m; } if (n & 4) { LD_DP2_INC(psrc1, 2, src0, src1); LD_DP2_INC(psrc2, 2, src2, src3); LD_DP2_INC(psrc3, 2, src4, src5); LD_DP2_INC(psrc4, 2, src6, src7); ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); } if (n & 2) { src0 = LD_DP(psrc1); src1 = LD_DP(psrc2); src2 = LD_DP(psrc3); src3 = LD_DP(psrc4); psrc1 += 2; psrc2 += 2; psrc3 += 2; psrc4 += 2; ST_DP4_INC(src0, src1, src2, src3, pdst3, 2); } if (n & 1) { *pdst4++ = *psrc1++; *pdst4++ = *psrc2++; *pdst4++ = *psrc3++; *pdst4++ = *psrc4++; } } if (m & 
2) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc0 += 2 * lda; pdst1 = pdst0; pdst0 += 16; for (i = (n >> 3); i--;) { LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); pdst1 += 8 * m; } if (n & 4) { LD_DP2_INC(psrc1, 2, src0, src1); LD_DP2_INC(psrc2, 2, src2, src3); ST_DP4_INC(src0, src1, src2, src3, pdst2, 2); } if (n & 2) { src0 = LD_DP(psrc1); src1 = LD_DP(psrc2); psrc1 += 2; psrc2 += 2; ST_DP2_INC(src0, src1, pdst3, 2); } if (n & 1) { *pdst4++ = *psrc1++; *pdst4++ = *psrc2++; } } if (m & 1) { psrc1 = psrc0; psrc0 += lda; pdst1 = pdst0; pdst0 += 8; for (i = (n >> 3); i--;) { LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); ST_DP4(src0, src1, src2, src3, pdst1, 2); pdst1 += 8 * m; } if (n & 4) { LD_DP2_INC(psrc1, 2, src0, src1); ST_DP2_INC(src0, src1, pdst2, 2); } if (n & 2) { src0 = LD_DP(psrc1); psrc1 += 2; ST_DP(src0, pdst3); pdst3 += 2; } if (n & 1) { *pdst4++ = *psrc1++; } } return 0; } OpenBLAS-0.2.20/kernel/mips/dgemv_n_msa.c000066400000000000000000000574351313527062700200350ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #include "common.h" #include "macros_msa.h" #define DGEMV_N_8x8() \ { \ LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \ LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \ LD_DP4(pa4 + k, 2, t16, t17, t18, t19); \ LD_DP4(pa5 + k, 2, t20, t21, t22, t23); \ LD_DP4(pa6 + k, 2, t24, t25, t26, t27); \ LD_DP4(pa7 + k, 2, t28, t29, t30, t31); \ \ y0 += tp0 * t0; \ y1 += tp0 * t1; \ y2 += tp0 * t2; \ y3 += tp0 * t3; \ \ y0 += tp1 * t4; \ y1 += tp1 * t5; \ y2 += tp1 * t6; \ y3 += tp1 * t7; \ \ y0 += tp2 * t8; \ y1 += tp2 * t9; \ y2 += tp2 * t10; \ y3 += tp2 * t11; \ \ y0 += tp3 * t12; \ y1 += tp3 * t13; \ y2 += tp3 * t14; \ y3 += tp3 * t15; \ \ y0 += tp4 * t16; \ y1 += tp4 * t17; \ y2 += tp4 * t18; \ y3 += tp4 * t19; \ \ y0 += tp5 * t20; \ y1 += tp5 * t21; \ y2 += tp5 * t22; \ y3 += tp5 * t23; \ \ y0 += tp6 * t24; \ y1 += tp6 * t25; \ y2 += tp6 * t26; \ y3 += tp6 * t27; \ \ y0 += tp7 * t28; \ y1 += tp7 * t29; \ y2 += tp7 * t30; \ y3 += tp7 * t31; \ } #define DGEMV_N_4x8() \ { \ LD_DP2(pa0 + k, 2, t0, t1); \ LD_DP2(pa1 + k, 2, t4, t5); \ LD_DP2(pa2 + k, 2, t8, t9); \ LD_DP2(pa3 + k, 2, t12, t13); \ LD_DP2(pa4 + k, 2, t16, t17); \ LD_DP2(pa5 + k, 2, t20, t21); \ LD_DP2(pa6 + k, 2, t24, t25); \ LD_DP2(pa7 + k, 2, t28, t29); \ \ y0 += tp0 * t0; \ y1 += tp0 * t1; \ \ y0 += tp1 * t4; \ y1 += tp1 * t5; \ \ y0 += tp2 * t8; \ y1 += tp2 * t9; \ \ y0 += tp3 * t12; \ y1 += tp3 * t13; \ \ y0 += tp4 * t16; \ y1 += tp4 * t17; \ \ y0 += tp5 * t20; \ y1 += tp5 * t21; \ \ y0 += tp6 * t24; \ y1 += tp6 * t25; \ \ y0 += tp7 * t28; \ y1 += tp7 * t29; \ } #define DGEMV_N_8x4() \ { \ LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \ LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \ \ y0 += tp0 * t0; \ y1 += tp0 * t1; \ y2 += tp0 * t2; \ y3 += tp0 * t3; \ \ y0 += tp1 * t4; \ y1 += tp1 * t5; \ y2 += tp1 * t6; \ y3 += tp1 * t7; \ \ y0 += tp2 * t8; \ y1 += tp2 * t9; \ y2 += tp2 * t10; \ y3 += tp2 * t11; \ \ y0 += tp3 * t12; \ y1 += tp3 * t13; \ y2 += tp3 * t14; \ y3 += tp3 * t15; \ } #define DGEMV_N_4x4() \ { \ LD_DP2(pa0 + k, 2, t0, t1); \ LD_DP2(pa1 + k, 2, t4, t5); \ LD_DP2(pa2 + k, 2, t8, t9); \ LD_DP2(pa3 + k, 2, t12, t13); \ \ y0 += tp0 * t0; \ y1 += tp0 * t1; \ \ y0 += tp1 * t4; \ y1 += tp1 * t5; \ \ y0 += tp2 * t8; \ y1 += tp2 * t9; \ \ y0 += tp3 * t12; \ y1 += tp3 * t13; \ } #define DGEMV_N_8x2() \ { \ LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ \ y0 += tp0 * t0; \ y1 += tp0 * t1; \ y2 += tp0 * t2; \ y3 += tp0 * t3; \ \ y0 += tp1 * t4; \ y1 += tp1 * t5; \ y2 += tp1 * t6; \ y3 += tp1 * t7; \ } #define DGEMV_N_4x2() \ { \ LD_DP2(pa0 + k, 2, t0, t1); \ LD_DP2(pa1 + k, 2, t4, t5); \ \ y0 += tp0 * t0; \ y1 += tp0 * t1; \ \ y0 += tp1 * t4; \ y1 += tp1 * t5; \ } #define DLOAD_X8_SCALE_GP() \ temp0 = alpha * x[0 * inc_x]; \ temp1 = alpha * x[1 * inc_x]; \ temp2 = alpha * x[2 * inc_x]; \ temp3 = alpha * x[3 * inc_x]; \ temp4 = alpha * x[4 * inc_x]; \ temp5 = alpha * x[5 * inc_x]; \ temp6 = alpha * x[6 * inc_x]; \ temp7 = alpha * x[7 * inc_x]; \ \ tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \ tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \ tp2 = COPY_DOUBLE_TO_VECTOR(temp2); \ tp3 = COPY_DOUBLE_TO_VECTOR(temp3); \ tp4 = COPY_DOUBLE_TO_VECTOR(temp4); \ tp5 = COPY_DOUBLE_TO_VECTOR(temp5); \ tp6 = COPY_DOUBLE_TO_VECTOR(temp6); \ tp7 = COPY_DOUBLE_TO_VECTOR(temp7); \ #define DLOAD_X4_SCALE_GP() \ temp0 = alpha * x[0 * 
inc_x]; \ temp1 = alpha * x[1 * inc_x]; \ temp2 = alpha * x[2 * inc_x]; \ temp3 = alpha * x[3 * inc_x]; \ \ tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \ tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \ tp2 = COPY_DOUBLE_TO_VECTOR(temp2); \ tp3 = COPY_DOUBLE_TO_VECTOR(temp3); \ #define DLOAD_X8_SCALE_VECTOR() \ LD_DP4(x, 2, x0, x1, x2, x3); \ \ x0 = x0 * v_alpha; \ x1 = x1 * v_alpha; \ x2 = x2 * v_alpha; \ x3 = x3 * v_alpha; \ \ SPLATI_D2_DP(x0, tp0, tp1); \ SPLATI_D2_DP(x1, tp2, tp3); \ SPLATI_D2_DP(x2, tp4, tp5); \ SPLATI_D2_DP(x3, tp6, tp7); \ #define DLOAD_X4_SCALE_VECTOR() \ LD_DP2(x, 2, x0, x1); \ \ x0 = x0 * v_alpha; \ x1 = x1 * v_alpha; \ \ SPLATI_D2_DP(x0, tp0, tp1); \ SPLATI_D2_DP(x1, tp2, tp3); \ #define DLOAD_Y8_GP() \ y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y))); \ y0 = (v2f64) __msa_insert_d((v2i64) y0, 1, *((long long *)(y + 1 * inc_y))); \ y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y))); \ y1 = (v2f64) __msa_insert_d((v2i64) y1, 1, *((long long *)(y + 3 * inc_y))); \ y2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 4 * inc_y))); \ y2 = (v2f64) __msa_insert_d((v2i64) y2, 1, *((long long *)(y + 5 * inc_y))); \ y3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 6 * inc_y))); \ y3 = (v2f64) __msa_insert_d((v2i64) y3, 1, *((long long *)(y + 7 * inc_y))); \ #define DLOAD_Y4_GP() \ y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y))); \ y0 = (v2f64) __msa_insert_d((v2i64) y0, 1, *((long long *)(y + 1 * inc_y))); \ y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y))); \ y1 = (v2f64) __msa_insert_d((v2i64) y1, 1, *((long long *)(y + 3 * inc_y))); \ #define DLOAD_Y8_VECTOR() LD_DP4(y, 2, y0, y1, y2, y3); #define DLOAD_Y4_VECTOR() LD_DP2(y, 2, y0, y1); #define DSTORE_Y8_GP() \ *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0); \ *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1); \ *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0); \ *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1); \ *((long long *)(y + 4 * inc_y)) = __msa_copy_s_d((v2i64) y2, 0); \ *((long long *)(y + 5 * inc_y)) = __msa_copy_s_d((v2i64) y2, 1); \ *((long long *)(y + 6 * inc_y)) = __msa_copy_s_d((v2i64) y3, 0); \ *((long long *)(y + 7 * inc_y)) = __msa_copy_s_d((v2i64) y3, 1); \ #define DSTORE_Y4_GP() \ *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0); \ *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1); \ *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0); \ *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1); \ #define DSTORE_Y8_VECTOR() ST_DP4(y0, y1, y2, y3, y, 2); #define DSTORE_Y4_VECTOR() ST_DP2(y0, y1, y, 2); #define DGEMV_N_MSA() \ for (j = (n >> 3); j--;) \ { \ DLOAD_X8_SCALE(); \ \ k = 0; \ y = y_org; \ \ for (i = (m >> 3); i--;) \ { \ DLOAD_Y8(); \ DGEMV_N_8x8(); \ DSTORE_Y8(); \ \ y += 8 * inc_y; \ k += 8; \ } \ \ if (m & 4) \ { \ DLOAD_Y4(); \ DGEMV_N_4x8(); \ DSTORE_Y4(); \ \ y += 4 * inc_y; \ k += 4; \ } \ \ if (m & 3) \ { \ temp0 = alpha * x[0 * inc_x]; \ temp1 = alpha * x[1 * inc_x]; \ temp2 = alpha * x[2 * inc_x]; \ temp3 = alpha * x[3 * inc_x]; \ temp4 = alpha * x[4 * inc_x]; \ temp5 = alpha * x[5 * inc_x]; \ temp6 = alpha * x[6 * inc_x]; \ temp7 = alpha * x[7 * inc_x]; \ \ for (i = (m & 3); i--;) \ { \ temp = y[0]; \ temp += temp0 * pa0[k]; \ temp += temp1 * pa1[k]; \ temp += temp2 * pa2[k]; \ temp += temp3 * pa3[k]; \ temp += temp4 * pa4[k]; \ temp += temp5 * pa5[k]; \ 
temp += temp6 * pa6[k]; \ temp += temp7 * pa7[k]; \ y[0] = temp; \ \ y += inc_y; \ k++; \ } \ } \ pa0 += 8 * lda; \ pa1 += 8 * lda; \ pa2 += 8 * lda; \ pa3 += 8 * lda; \ pa4 += 8 * lda; \ pa5 += 8 * lda; \ pa6 += 8 * lda; \ pa7 += 8 * lda; \ \ x += 8 * inc_x; \ } \ \ if (n & 4) \ { \ DLOAD_X4_SCALE(); \ \ k = 0; \ y = y_org; \ \ for (i = (m >> 3); i--;) \ { \ DLOAD_Y8(); \ DGEMV_N_8x4(); \ DSTORE_Y8(); \ \ y += 8 * inc_y; \ k += 8; \ } \ \ if (m & 4) \ { \ DLOAD_Y4(); \ DGEMV_N_4x4(); \ DSTORE_Y4(); \ \ y += 4 * inc_y; \ k += 4; \ } \ \ if (m & 3) \ { \ temp0 = alpha * x[0 * inc_x]; \ temp1 = alpha * x[1 * inc_x]; \ temp2 = alpha * x[2 * inc_x]; \ temp3 = alpha * x[3 * inc_x]; \ \ for (i = (m & 3); i--;) \ { \ temp = y[0]; \ temp += temp0 * pa0[k]; \ temp += temp1 * pa1[k]; \ temp += temp2 * pa2[k]; \ temp += temp3 * pa3[k]; \ y[0] = temp; \ \ y += inc_y; \ k++; \ } \ } \ \ pa0 += 4 * lda; \ pa1 += 4 * lda; \ pa2 += 4 * lda; \ pa3 += 4 * lda; \ \ x += 4 * inc_x; \ } \ \ if (n & 2) \ { \ temp0 = alpha * x[0 * inc_x]; \ temp1 = alpha * x[1 * inc_x]; \ \ tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \ tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \ \ k = 0; \ y = y_org; \ \ for (i = (m >> 3); i--;) \ { \ DLOAD_Y8(); \ DGEMV_N_8x2(); \ DSTORE_Y8(); \ \ y += 8 * inc_y; \ k += 8; \ } \ \ if (m & 4) \ { \ DLOAD_Y4(); \ DGEMV_N_4x2(); \ DSTORE_Y4(); \ \ y += 4 * inc_y; \ k += 4; \ } \ \ if (m & 3) \ { \ temp0 = alpha * x[0 * inc_x]; \ temp1 = alpha * x[1 * inc_x]; \ \ for (i = (m & 3); i--;) \ { \ temp = y[0]; \ temp += temp0 * pa0[k]; \ temp += temp1 * pa1[k]; \ y[0] = temp; \ \ y += inc_y; \ k++; \ } \ } \ \ pa0 += 2 * lda; \ pa1 += 2 * lda; \ \ x += 2 * inc_x; \ } \ \ if (n & 1) \ { \ temp = alpha * x[0]; \ \ k = 0; \ y = y_org; \ \ for (i = m; i--;) \ { \ y[0] += temp * pa0[k]; \ y += inc_y; \ k++; \ } \ } \ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { BLASLONG i, j, k; FLOAT *y_org = y; FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; v2f64 v_alpha; v2f64 x0, x1, x2, x3, y0, y1, y2, y3; v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; v_alpha = COPY_DOUBLE_TO_VECTOR(alpha); pa0 = A; pa1 = A + lda; pa2 = A + 2 * lda; pa3 = A + 3 * lda; pa4 = A + 4 * lda; pa5 = A + 5 * lda; pa6 = A + 6 * lda; pa7 = A + 7 * lda; if ((1 == inc_x) && (1 == inc_y)) { #define DLOAD_X8_SCALE DLOAD_X8_SCALE_VECTOR #define DLOAD_X4_SCALE DLOAD_X4_SCALE_VECTOR #define DLOAD_Y8 DLOAD_Y8_VECTOR #define DLOAD_Y4 DLOAD_Y4_VECTOR #define DSTORE_Y8 DSTORE_Y8_VECTOR #define DSTORE_Y4 DSTORE_Y4_VECTOR DGEMV_N_MSA(); #undef DLOAD_X8_SCALE #undef DLOAD_X4_SCALE #undef DLOAD_Y8 #undef DLOAD_Y4 #undef DSTORE_Y8 #undef DSTORE_Y4 } else if (1 == inc_y) { #define DLOAD_X8_SCALE DLOAD_X8_SCALE_GP #define DLOAD_X4_SCALE DLOAD_X4_SCALE_GP #define DLOAD_Y8 DLOAD_Y8_VECTOR #define DLOAD_Y4 DLOAD_Y4_VECTOR #define DSTORE_Y8 DSTORE_Y8_VECTOR #define DSTORE_Y4 DSTORE_Y4_VECTOR DGEMV_N_MSA(); #undef DLOAD_X8_SCALE #undef DLOAD_X4_SCALE #undef DLOAD_Y8 #undef DLOAD_Y4 #undef DSTORE_Y8 #undef DSTORE_Y4 } else if (1 == inc_x) { #define DLOAD_X8_SCALE DLOAD_X8_SCALE_VECTOR #define DLOAD_X4_SCALE DLOAD_X4_SCALE_VECTOR #define DLOAD_Y8 DLOAD_Y8_GP #define DLOAD_Y4 DLOAD_Y4_GP #define DSTORE_Y8 DSTORE_Y8_GP #define DSTORE_Y4 
DSTORE_Y4_GP DGEMV_N_MSA(); #undef DLOAD_X8_SCALE #undef DLOAD_X4_SCALE #undef DLOAD_Y8 #undef DLOAD_Y4 #undef DSTORE_Y8 #undef DSTORE_Y4 } else { #define DLOAD_X8_SCALE DLOAD_X8_SCALE_GP #define DLOAD_X4_SCALE DLOAD_X4_SCALE_GP #define DLOAD_Y8 DLOAD_Y8_GP #define DLOAD_Y4 DLOAD_Y4_GP #define DSTORE_Y8 DSTORE_Y8_GP #define DSTORE_Y4 DSTORE_Y4_GP DGEMV_N_MSA(); #undef DLOAD_X8_SCALE #undef DLOAD_X4_SCALE #undef DLOAD_Y8 #undef DLOAD_Y4 #undef DSTORE_Y8 #undef DSTORE_Y4 } return(0); } OpenBLAS-0.2.20/kernel/mips/dgemv_t_msa.c000066400000000000000000000562441313527062700200400ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
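Kernel overview: this file implements the transposed double-precision
GEMV update y := y + alpha * A**T * x. The reference (scalar) form of
the computation, matching the cleanup loops further below, is:

    for (j = 0; j < n; j++) {
        temp = 0.0;
        for (i = 0; i < m; i++)
            temp += a[j * lda + i] * x[i * inc_x];
        y[j * inc_y] += alpha * temp;
    }

The MSA code unrolls this loop nest over blocks of eight, four, two and
one column(s) using 2-element v2f64 vectors; the DLOAD_X*_VECTOR macros
are selected for unit-stride x and the DLOAD_X*_GP macros for strided x.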
*******************************************************************************/ #include "common.h" #include "macros_msa.h" #define DGEMV_T_8x8() \ { \ LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \ LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \ LD_DP4(pa4 + k, 2, t16, t17, t18, t19); \ LD_DP4(pa5 + k, 2, t20, t21, t22, t23); \ LD_DP4(pa6 + k, 2, t24, t25, t26, t27); \ LD_DP4(pa7 + k, 2, t28, t29, t30, t31); \ \ tp0 += x0 * t0; \ tp0 += x1 * t1; \ tp0 += x2 * t2; \ tp0 += x3 * t3; \ \ tp1 += x0 * t4; \ tp1 += x1 * t5; \ tp1 += x2 * t6; \ tp1 += x3 * t7; \ \ tp2 += x0 * t8; \ tp2 += x1 * t9; \ tp2 += x2 * t10; \ tp2 += x3 * t11; \ \ tp3 += x0 * t12; \ tp3 += x1 * t13; \ tp3 += x2 * t14; \ tp3 += x3 * t15; \ \ tp4 += x0 * t16; \ tp4 += x1 * t17; \ tp4 += x2 * t18; \ tp4 += x3 * t19; \ \ tp5 += x0 * t20; \ tp5 += x1 * t21; \ tp5 += x2 * t22; \ tp5 += x3 * t23; \ \ tp6 += x0 * t24; \ tp6 += x1 * t25; \ tp6 += x2 * t26; \ tp6 += x3 * t27; \ \ tp7 += x0 * t28; \ tp7 += x1 * t29; \ tp7 += x2 * t30; \ tp7 += x3 * t31; \ } #define DGEMV_T_8x4() \ { \ LD_DP2(pa0 + k, 2, t0, t1); \ LD_DP2(pa1 + k, 2, t4, t5); \ LD_DP2(pa2 + k, 2, t8, t9); \ LD_DP2(pa3 + k, 2, t12, t13); \ LD_DP2(pa4 + k, 2, t16, t17); \ LD_DP2(pa5 + k, 2, t20, t21); \ LD_DP2(pa6 + k, 2, t24, t25); \ LD_DP2(pa7 + k, 2, t28, t29); \ \ tp0 += x0 * t0; \ tp0 += x1 * t1; \ \ tp1 += x0 * t4; \ tp1 += x1 * t5; \ \ tp2 += x0 * t8; \ tp2 += x1 * t9; \ \ tp3 += x0 * t12; \ tp3 += x1 * t13; \ \ tp4 += x0 * t16; \ tp4 += x1 * t17; \ \ tp5 += x0 * t20; \ tp5 += x1 * t21; \ \ tp6 += x0 * t24; \ tp6 += x1 * t25; \ \ tp7 += x0 * t28; \ tp7 += x1 * t29; \ } #define DGEMV_T_8x2() \ { \ t0 = LD_DP(pa0 + k); \ t4 = LD_DP(pa1 + k); \ t8 = LD_DP(pa2 + k); \ t12 = LD_DP(pa3 + k); \ t16 = LD_DP(pa4 + k); \ t20 = LD_DP(pa5 + k); \ t24 = LD_DP(pa6 + k); \ t28 = LD_DP(pa7 + k); \ \ tp0 += x0 * t0; \ tp1 += x0 * t4; \ tp2 += x0 * t8; \ tp3 += x0 * t12; \ tp4 += x0 * t16; \ tp5 += x0 * t20; \ tp6 += x0 * t24; \ tp7 += x0 * t28; \ } #define DGEMV_T_4x8() \ { \ LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \ LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \ \ tp0 += x0 * t0; \ tp0 += x1 * t1; \ tp0 += x2 * t2; \ tp0 += x3 * t3; \ \ tp1 += x0 * t4; \ tp1 += x1 * t5; \ tp1 += x2 * t6; \ tp1 += x3 * t7; \ \ tp2 += x0 * t8; \ tp2 += x1 * t9; \ tp2 += x2 * t10; \ tp2 += x3 * t11; \ \ tp3 += x0 * t12; \ tp3 += x1 * t13; \ tp3 += x2 * t14; \ tp3 += x3 * t15; \ } #define DGEMV_T_4x4() \ { \ LD_DP2(pa0 + k, 2, t0, t1); \ LD_DP2(pa1 + k, 2, t4, t5); \ LD_DP2(pa2 + k, 2, t8, t9); \ LD_DP2(pa3 + k, 2, t12, t13); \ \ tp0 += x0 * t0; \ tp0 += x1 * t1; \ \ tp1 += x0 * t4; \ tp1 += x1 * t5; \ \ tp2 += x0 * t8; \ tp2 += x1 * t9; \ \ tp3 += x0 * t12; \ tp3 += x1 * t13; \ } #define DGEMV_T_4x2() \ { \ t0 = LD_DP(pa0 + k); \ t4 = LD_DP(pa1 + k); \ t8 = LD_DP(pa2 + k); \ t12 = LD_DP(pa3 + k); \ \ tp0 += x0 * t0; \ tp1 += x0 * t4; \ tp2 += x0 * t8; \ tp3 += x0 * t12; \ } #define DGEMV_T_2x8() \ { \ LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \ LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \ \ tp0 += x0 * t0; \ tp0 += x1 * t1; \ tp0 += x2 * t2; \ tp0 += x3 * t3; \ \ tp1 += x0 * t4; \ tp1 += x1 * t5; \ tp1 += x2 * t6; \ tp1 += x3 * t7; \ } #define DGEMV_T_2x4() \ { \ LD_DP2(pa0 + k, 2, t0, t1); \ LD_DP2(pa1 + k, 2, t4, t5); \ \ tp0 += x0 * t0; \ tp0 += x1 * t1; \ \ tp1 += x0 * t4; \ tp1 += x1 * t5; \ } #define DGEMV_T_2x2() \ { \ t0 = LD_DP(pa0 + k); \ t4 = LD_DP(pa1 + k); \ \ tp0 
+= x0 * t0; \ tp1 += x0 * t4; \ } #define DLOAD_X8_GP() \ x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x))); \ x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x))); \ x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x))); \ x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((long long *)(x + 3 * inc_x))); \ x2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 4 * inc_x))); \ x2 = (v2f64) __msa_insert_d((v2i64) x2, 1, *((long long *)(x + 5 * inc_x))); \ x3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 6 * inc_x))); \ x3 = (v2f64) __msa_insert_d((v2i64) x3, 1, *((long long *)(x + 7 * inc_x))); \ #define DLOAD_X4_GP() \ x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x))); \ x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x))); \ x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x))); \ x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((long long *)(x + 3 * inc_x))); \ #define DLOAD_X2_GP() \ x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x))); \ x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x))); \ #define DLOAD_X8_VECTOR() LD_DP4(x, 2, x0, x1, x2, x3); #define DLOAD_X4_VECTOR() LD_DP2(x, 2, x0, x1); #define DLOAD_X2_VECTOR() x0 = LD_DP(x); #define DGEMV_T_MSA() \ for (j = (n >> 3); j--;) \ { \ tp0 = zero; \ tp1 = zero; \ tp2 = zero; \ tp3 = zero; \ tp4 = zero; \ tp5 = zero; \ tp6 = zero; \ tp7 = zero; \ \ k = 0; \ x = srcx_org; \ \ for (i = (m >> 3); i--;) \ { \ DLOAD_X8(); \ DGEMV_T_8x8(); \ \ x += 8 * inc_x; \ k += 8; \ } \ \ if (m & 4) \ { \ DLOAD_X4(); \ DGEMV_T_8x4(); \ \ x += 4 * inc_x; \ k += 4; \ } \ \ if (m & 2) \ { \ DLOAD_X2(); \ DGEMV_T_8x2(); \ \ x += 2 * inc_x; \ k += 2; \ } \ \ ILVRL_D2_DP(tp1, tp0, t0, t4); \ ILVRL_D2_DP(tp3, tp2, t1, t5); \ ILVRL_D2_DP(tp5, tp4, t2, t6); \ ILVRL_D2_DP(tp7, tp6, t3, t7); \ ADD2(t0, t4, t1, t5, t0, t1); \ ADD2(t2, t6, t3, t7, t2, t3); \ \ temp0 = t0[0]; \ temp1 = t0[1]; \ temp2 = t1[0]; \ temp3 = t1[1]; \ temp4 = t2[0]; \ temp5 = t2[1]; \ temp6 = t3[0]; \ temp7 = t3[1]; \ \ if (m & 1) \ { \ temp0 += pa0[k] * x[0]; \ temp1 += pa1[k] * x[0]; \ temp2 += pa2[k] * x[0]; \ temp3 += pa3[k] * x[0]; \ temp4 += pa4[k] * x[0]; \ temp5 += pa5[k] * x[0]; \ temp6 += pa6[k] * x[0]; \ temp7 += pa7[k] * x[0]; \ \ x += inc_x; \ k++; \ } \ \ res0 = y[0 * inc_y]; \ res1 = y[1 * inc_y]; \ res2 = y[2 * inc_y]; \ res3 = y[3 * inc_y]; \ res4 = y[4 * inc_y]; \ res5 = y[5 * inc_y]; \ res6 = y[6 * inc_y]; \ res7 = y[7 * inc_y]; \ \ res0 += alpha * temp0; \ res1 += alpha * temp1; \ res2 += alpha * temp2; \ res3 += alpha * temp3; \ res4 += alpha * temp4; \ res5 += alpha * temp5; \ res6 += alpha * temp6; \ res7 += alpha * temp7; \ \ y[0 * inc_y] = res0; \ y[1 * inc_y] = res1; \ y[2 * inc_y] = res2; \ y[3 * inc_y] = res3; \ y[4 * inc_y] = res4; \ y[5 * inc_y] = res5; \ y[6 * inc_y] = res6; \ y[7 * inc_y] = res7; \ \ y += 8 * inc_y; \ \ pa0 += 8 * lda; \ pa1 += 8 * lda; \ pa2 += 8 * lda; \ pa3 += 8 * lda; \ pa4 += 8 * lda; \ pa5 += 8 * lda; \ pa6 += 8 * lda; \ pa7 += 8 * lda; \ } \ \ if (n & 4) \ { \ tp0 = zero; \ tp1 = zero; \ tp2 = zero; \ tp3 = zero; \ \ k = 0; \ x = srcx_org; \ \ for (i = (m >> 3); i--;) \ { \ DLOAD_X8(); \ DGEMV_T_4x8(); \ \ x += 8 * inc_x; \ k += 8; \ } \ \ if (m & 4) \ { \ DLOAD_X4(); \ DGEMV_T_4x4(); \ \ x += 4 * inc_x; \ k += 4; \ } \ \ if (m & 2) \ { \ DLOAD_X2(); \ DGEMV_T_4x2(); \ \ x += 2 * inc_x; \ k += 2; \ } \ \ ILVRL_D2_DP(tp1, tp0, t0, 
t4); \ ILVRL_D2_DP(tp3, tp2, t1, t5); \ ADD2(t0, t4, t1, t5, t0, t1); \ \ temp0 = t0[0]; \ temp1 = t0[1]; \ temp2 = t1[0]; \ temp3 = t1[1]; \ \ if (m & 1) \ { \ temp0 += pa0[k] * x[0]; \ temp1 += pa1[k] * x[0]; \ temp2 += pa2[k] * x[0]; \ temp3 += pa3[k] * x[0]; \ \ x += inc_x; \ k++; \ } \ \ res0 = y[0 * inc_y]; \ res1 = y[1 * inc_y]; \ res2 = y[2 * inc_y]; \ res3 = y[3 * inc_y]; \ \ res0 += alpha * temp0; \ res1 += alpha * temp1; \ res2 += alpha * temp2; \ res3 += alpha * temp3; \ \ y[0 * inc_y] = res0; \ y[1 * inc_y] = res1; \ y[2 * inc_y] = res2; \ y[3 * inc_y] = res3; \ \ y += 4 * inc_y; \ \ pa0 += 4 * lda; \ pa1 += 4 * lda; \ pa2 += 4 * lda; \ pa3 += 4 * lda; \ } \ \ if (n & 2) \ { \ tp0 = zero; \ tp1 = zero; \ \ k = 0; \ x = srcx_org; \ \ for (i = (m >> 3); i--;) \ { \ DLOAD_X8(); \ DGEMV_T_2x8(); \ \ x += 8 * inc_x; \ k += 8; \ } \ \ if (m & 4) \ { \ DLOAD_X4(); \ DGEMV_T_2x4(); \ \ x += 4 * inc_x; \ k += 4; \ } \ \ if (m & 2) \ { \ DLOAD_X2(); \ DGEMV_T_2x2(); \ \ x += 2 * inc_x; \ k += 2; \ } \ \ ILVRL_D2_DP(tp1, tp0, t0, t4); \ \ t0 += t4; \ \ temp0 = t0[0]; \ temp1 = t0[1]; \ \ if (m & 1) \ { \ temp0 += pa0[k] * x[0]; \ temp1 += pa1[k] * x[0]; \ x += inc_x; \ k++; \ } \ \ res0 = y[0 * inc_y]; \ res1 = y[1 * inc_y]; \ \ res0 += alpha * temp0; \ res1 += alpha * temp1; \ \ y[0 * inc_y] = res0; \ y[1 * inc_y] = res1; \ \ y += 2 * inc_y; \ \ pa0 += 2 * lda; \ pa1 += 2 * lda; \ } \ \ if (n & 1) \ { \ temp0 = 0.0; \ \ k = 0; \ x = srcx_org; \ \ for (i = m; i--;) \ { \ temp0 += pa0[k] * x[0]; \ x += inc_x; \ k++; \ } \ \ y[0] += alpha * temp0; \ y += inc_y; \ pa0 += lda; \ } int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { BLASLONG i, j, k; FLOAT *srcx_org = x; FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; FLOAT temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; FLOAT res0, res1, res2, res3, res4, res5, res6, res7; v2f64 x0, x1, x2, x3; v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; v2f64 zero = {0}; pa0 = A + 0 * lda; pa1 = A + 1 * lda; pa2 = A + 2 * lda; pa3 = A + 3 * lda; pa4 = A + 4 * lda; pa5 = A + 5 * lda; pa6 = A + 6 * lda; pa7 = A + 7 * lda; if (1 == inc_x) { #define DLOAD_X8 DLOAD_X8_VECTOR #define DLOAD_X4 DLOAD_X4_VECTOR #define DLOAD_X2 DLOAD_X2_VECTOR DGEMV_T_MSA(); #undef DLOAD_X8 #undef DLOAD_X4 #undef DLOAD_X2 } else { #define DLOAD_X8 DLOAD_X8_GP #define DLOAD_X4 DLOAD_X4_GP #define DLOAD_X2 DLOAD_X2_GP DGEMV_T_MSA(); #undef DLOAD_X8 #undef DLOAD_X4 #undef DLOAD_X2 } return(0); } OpenBLAS-0.2.20/kernel/mips/dot.c000066400000000000000000000037751313527062700163420ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #if defined(DSDOT) double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #else FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #endif { BLASLONG i=0; BLASLONG ix=0,iy=0; double dot = 0.0 ; if ( n < 0 ) return(dot); while(i < n) { dot += y[iy] * x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } return(dot); } OpenBLAS-0.2.20/kernel/mips/drot_msa.c000066400000000000000000001046211313527062700173540ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
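Kernel overview: this file applies the double-precision Givens plane
rotation to the vector pair (x, y). The reference (scalar) form,
matching the strided fallback code further below, is:

    for (i = 0; i < n; i++) {
        temp         = c * x[i * inc_x] + s * y[i * inc_y];
        y[i * inc_y] = c * y[i * inc_y] - s * x[i * inc_x];
        x[i * inc_x] = temp;
    }

The unit-stride path is unrolled with 2-element v2f64 vectors and
software prefetch, and the special cases c == 0, s == 0, and
c == s == 1 are handled by dedicated simplified loops.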
*******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) { BLASLONG i, j; FLOAT *px, *py; FLOAT tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; FLOAT fx0, fx1, fx2, fx3, fy0, fy1, fy2, fy3; v2f64 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7; v2f64 out0, out1, out2, out3, out4, out5, out6, out7; v2f64 out8, out9, out10, out11, out12, out13, out14, out15, c0, s0; if (n <= 0) return (0); px = x; py = y; if ((1 == inc_x) && (1 == inc_y)) { if ((0 == c) && (0 == s)) { v2f64 zero = {0, 0}; zero = (v2f64) __msa_insert_d((v2i64) zero, 0, 0.0); zero = (v2f64) __msa_insert_d((v2i64) zero, 1, 0.0); /* process 4 floats */ for (j = (n >> 2); j--;) { ST_DP(zero, px); ST_DP(zero, py); px += 2; py += 2; ST_DP(zero, px); ST_DP(zero, py); px += 2; py += 2; } if (n & 2) { ST_DP(zero, px); ST_DP(zero, py); px += 2; py += 2; } if (n & 1) { px[0] = 0; py[0] = 0; } } else if ((1 == c) && (1 == s)) { if (n >> 4) { BLASLONG pref_offsetx, pref_offsety; pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); if (pref_offsetx > 0) { pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; pref_offsetx = pref_offsetx / sizeof(FLOAT); } pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); if (pref_offsety > 0) { pref_offsety = L1_DATA_LINESIZE - pref_offsety; pref_offsety = pref_offsety / sizeof(FLOAT); } x0 = LD_DP(px); px += 2; x1 = LD_DP(px); px += 2; x2 = LD_DP(px); px += 2; x3 = LD_DP(px); px += 2; y0 = LD_DP(py); py += 2; y1 = LD_DP(py); py += 2; y2 = LD_DP(py); py += 2; y3 = LD_DP(py); py += 2; for (j = (n >> 4) - 1; j--;) { PREFETCH(px + pref_offsetx + 16); PREFETCH(px + pref_offsetx + 20); PREFETCH(px + pref_offsetx + 24); PREFETCH(px + pref_offsetx + 28); PREFETCH(py + pref_offsety + 16); PREFETCH(py + pref_offsety + 20); PREFETCH(py + pref_offsety + 24); PREFETCH(py + pref_offsety + 28); out0 = x0 + y0; x4 = LD_DP(px); px += 2; out1 = y0 - x0; x5 = LD_DP(px); px += 2; out2 = x1 + y1; x6 = LD_DP(px); px += 2; out3 = y1 - x1; x7 = LD_DP(px); px += 2; out4 = x2 + y2; y4 = LD_DP(py); py += 2; out5 = y2 - x2; y5 = LD_DP(py); py += 2; out6 = x3 + y3; y6 = LD_DP(py); py += 2; out7 = y3 - x3; y7 = LD_DP(py); py += 2; ST_DP(out0, x); x += 2; out8 = x4 + y4; ST_DP(out1, y); y += 2; out9 = y4 - x4; ST_DP(out2, x); x += 2; out10 = x5 + y5; ST_DP(out3, y); y += 2; out11 = y5 - x5; ST_DP(out4, x); x += 2; out12 = x6 + y6; ST_DP(out5, y); y += 2; out13 = y6 - x6; ST_DP(out6, x); x += 2; out14 = x7 + y7; ST_DP(out7, y); y += 2; out15 = y7 - x7; x0 = LD_DP(px); px += 2; ST_DP(out8, x); x += 2; x1 = LD_DP(px); px += 2; ST_DP(out10, x); x += 2; x2 = LD_DP(px); px += 2; ST_DP(out12, x); x += 2; x3 = LD_DP(px); px += 2; ST_DP(out14, x); x += 2; y0 = LD_DP(py); py += 2; ST_DP(out9, y); y += 2; y1 = LD_DP(py); py += 2; ST_DP(out11, y); y += 2; y2 = LD_DP(py); py += 2; ST_DP(out13, y); y += 2; y3 = LD_DP(py); py += 2; ST_DP(out15, y); y += 2; } x4 = LD_DP(px); px += 2; x5 = LD_DP(px); px += 2; x6 = LD_DP(px); px += 2; x7 = LD_DP(px); px += 2; y4 = LD_DP(py); py += 2; y5 = LD_DP(py); py += 2; y6 = LD_DP(py); py += 2; y7 = LD_DP(py); py += 2; out0 = x0 + y0; out1 = y0 - x0; out2 = x1 + y1; out3 = y1 - x1; out4 = x2 + y2; out5 = y2 - x2; out6 = x3 + y3; out7 = y3 - x3; out8 = x4 + y4; out9 = y4 - x4; out10 = x5 + y5; out11 = y5 - x5; out12 = x6 + y6; out13 = y6 - x6; out14 = x7 + y7; out15 = y7 - x7; ST_DP8_INC(out0, out2, out4, out6, out8, out10, out12, out14, x, 2); 
ST_DP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, 2); } if (n & 8) { LD_DP4_INC(px, 2, x0, x1, x2, x3); LD_DP4_INC(py, 2, y0, y1, y2, y3); out0 = x0 + y0; out1 = y0 - x0; out2 = x1 + y1; out3 = y1 - x1; out4 = x2 + y2; out5 = y2 - x2; out6 = x3 + y3; out7 = y3 - x3; ST_DP4_INC(out0, out2, out4, out6, x, 2); ST_DP4_INC(out1, out3, out5, out7, y, 2); } if (n & 4) { LD_DP2_INC(px, 2, x0, x1); LD_DP2_INC(py, 2, y0, y1); out0 = x0 + y0; out1 = y0 - x0; out2 = x1 + y1; out3 = y1 - x1; ST_DP2_INC(out0, out2, x, 2); ST_DP2_INC(out1, out3, y, 2); } if (n & 2) { x0 = LD_DP(px); y0 = LD_DP(py); px += 2; py += 2; out0 = x0 + y0; out1 = y0 - x0; ST_DP(out0, x); ST_DP(out1, y); x += 2; y += 2; } if (n & 1) { tp0 = *x + *y; *y = *y - *x; *x = tp0; } } else if (0 == s) { c0 = COPY_DOUBLE_TO_VECTOR(c); if (n >> 4) { BLASLONG pref_offsetx, pref_offsety; pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); if (pref_offsetx > 0) { pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; pref_offsetx = pref_offsetx / sizeof(FLOAT); } pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); if (pref_offsety > 0) { pref_offsety = L1_DATA_LINESIZE - pref_offsety; pref_offsety = pref_offsety / sizeof(FLOAT); } LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); for (j = (n >> 4) - 1; j--;) { PREFETCH(px + pref_offsetx + 16); PREFETCH(px + pref_offsetx + 20); PREFETCH(px + pref_offsetx + 24); PREFETCH(px + pref_offsetx + 28); PREFETCH(py + pref_offsety + 16); PREFETCH(py + pref_offsety + 20); PREFETCH(py + pref_offsety + 24); PREFETCH(py + pref_offsety + 28); y0 = LD_DP(py); py += 2; x0 *= c0; y1 = LD_DP(py); py += 2; x1 *= c0; y2 = LD_DP(py); py += 2; x2 *= c0; y3 = LD_DP(py); py += 2; x3 *= c0; y4 = LD_DP(py); py += 2; x4 *= c0; y5 = LD_DP(py); py += 2; x5 *= c0; y6 = LD_DP(py); py += 2; x6 *= c0; y7 = LD_DP(py); py += 2; x7 *= c0; ST_DP(x0, x); x += 2; y0 *= c0; ST_DP(x1, x); x += 2; y1 *= c0; ST_DP(x2, x); x += 2; y2 *= c0; ST_DP(x3, x); x += 2; y3 *= c0; ST_DP(x4, x); x += 2; y4 *= c0; ST_DP(x5, x); x += 2; y5 *= c0; ST_DP(x6, x); x += 2; y6 *= c0; ST_DP(x7, x); x += 2; y7 *= c0; x0 = LD_DP(px); px += 2; ST_DP(y0, y); y += 2; x1 = LD_DP(px); px += 2; ST_DP(y1, y); y += 2; x2 = LD_DP(px); px += 2; ST_DP(y2, y); y += 2; x3 = LD_DP(px); px += 2; ST_DP(y3, y); y += 2; x4 = LD_DP(px); px += 2; ST_DP(y4, y); y += 2; x5 = LD_DP(px); px += 2; ST_DP(y5, y); y += 2; x6 = LD_DP(px); px += 2; ST_DP(y6, y); y += 2; x7 = LD_DP(px); px += 2; ST_DP(y7, y); y += 2; } LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7); x0 *= c0; y0 *= c0; x1 *= c0; y1 *= c0; x2 *= c0; y2 *= c0; x3 *= c0; y3 *= c0; x4 *= c0; y4 *= c0; x5 *= c0; y5 *= c0; x6 *= c0; y6 *= c0; x7 *= c0; y7 *= c0; ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2); ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2); } if (n & 8) { LD_DP4_INC(px, 2, x0, x1, x2, x3); LD_DP4_INC(py, 2, y0, y1, y2, y3); out0 = c0 * x0; out1 = c0 * y0; out2 = c0 * x1; out3 = c0 * y1; out4 = c0 * x2; out5 = c0 * y2; out6 = c0 * x3; out7 = c0 * y3; ST_DP4_INC(out0, out2, out4, out6, x, 2); ST_DP4_INC(out1, out3, out5, out7, y, 2); } if (n & 4) { LD_DP2_INC(px, 2, x0, x1); LD_DP2_INC(py, 2, y0, y1); out0 = c0 * x0; out1 = c0 * y0; out2 = c0 * x1; out3 = c0 * y1; ST_DP2_INC(out0, out2, x, 2); ST_DP2_INC(out1, out3, y, 2); } if (n & 2) { x0 = LD_DP(px); y0 = LD_DP(py); px += 2; py += 2; out0 = c0 * x0; out1 = c0 * y0; ST_DP(out0, x); ST_DP(out1, y); x += 2; y += 2; } if (n & 1) { *x *= c; *y *= c; } } else if (0 == c) { s0 = COPY_DOUBLE_TO_VECTOR(s); /* process 16 floats */ if (n >> 4) { 
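            /* c == 0: the rotation reduces to x[i] = s * y[i] and
               y[i] = -(s * x[i]); this unit-stride path handles 16
               elements of x and y (eight v2f64 vectors each) per
               iteration, with software prefetch of both operands. */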
BLASLONG pref_offsetx, pref_offsety; pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); if (pref_offsetx > 0) { pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; pref_offsetx = pref_offsetx / sizeof(FLOAT); } pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); if (pref_offsety > 0) { pref_offsety = L1_DATA_LINESIZE - pref_offsety; pref_offsety = pref_offsety / sizeof(FLOAT); } LD_DP4_INC(px, 2, x0, x1, x2, x3); LD_DP4_INC(py, 2, y0, y1, y2, y3); for (j = (n >> 4) - 1; j--;) { PREFETCH(px + pref_offsetx + 16); PREFETCH(px + pref_offsetx + 20); PREFETCH(px + pref_offsetx + 24); PREFETCH(px + pref_offsetx + 28); PREFETCH(py + pref_offsety + 16); PREFETCH(py + pref_offsety + 20); PREFETCH(py + pref_offsety + 24); PREFETCH(py + pref_offsety + 28); x4 = LD_DP(px); px += 2; out0 = s0 * y0; x5 = LD_DP(px); px += 2; out2 = s0 * y1; x6 = LD_DP(px); px += 2; out4 = s0 * y2; x7 = LD_DP(px); px += 2; out6 = s0 * y3; y4 = LD_DP(py); py += 2; out1 = -(s0 * x0); y5 = LD_DP(py); py += 2; out3 = -(s0 * x1); y6 = LD_DP(py); py += 2; out5 = -(s0 * x2); y7 = LD_DP(py); py += 2; out7 = -(s0 * x3); ST_DP(out0, x); x += 2; out0 = s0 * y4; ST_DP(out2, x); x += 2; out2 = s0 * y5; ST_DP(out4, x); x += 2; out4 = s0 * y6; ST_DP(out6, x); x += 2; out6 = s0 * y7; ST_DP(out1, y); y += 2; out1 = -(s0 * x4); ST_DP(out3, y); y += 2; out3 = -(s0 * x5); ST_DP(out5, y); y += 2; out5 = -(s0 * x6); ST_DP(out7, y); y += 2; out7 = -(s0 * x7); x0 = LD_DP(px); px += 2; ST_DP(out0, x); x += 2; x1 = LD_DP(px); px += 2; ST_DP(out2, x); x += 2; x2 = LD_DP(px); px += 2; ST_DP(out4, x); x += 2; x3 = LD_DP(px); px += 2; ST_DP(out6, x); x += 2; y0 = LD_DP(py); py += 2; ST_DP(out1, y); y += 2; y1 = LD_DP(py); py += 2; ST_DP(out3, y); y += 2; y2 = LD_DP(py); py += 2; ST_DP(out5, y); y += 2; y3 = LD_DP(py); py += 2; ST_DP(out7, y); y += 2; } out0 = s0 * y0; out2 = s0 * y1; out4 = s0 * y2; out6 = s0 * y3; out1 = -(s0 * x0); out3 = -(s0 * x1); out5 = -(s0 * x2); out7 = -(s0 * x3); ST_DP4_INC(out0, out2, out4, out6, x, 2); ST_DP4_INC(out1, out3, out5, out7, y, 2); LD_DP4_INC(px, 2, x4, x5, x6, x7); LD_DP4_INC(py, 2, y4, y5, y6, y7); out0 = s0 * y4; out2 = s0 * y5; out4 = s0 * y6; out6 = s0 * y7; out1 = -(s0 * x4); out3 = -(s0 * x5); out5 = -(s0 * x6); out7 = -(s0 * x7); ST_DP4_INC(out0, out2, out4, out6, x, 2); ST_DP4_INC(out1, out3, out5, out7, y, 2); } if (n & 8) { LD_DP4_INC(px, 2, x0, x1, x2, x3); LD_DP4_INC(py, 2, y0, y1, y2, y3); out0 = s0 * y0; out1 = - (s0 * x0); out2 = s0 * y1; out3 = - (s0 * x1); out4 = s0 * y2; out5 = - (s0 * x2); out6 = s0 * y3; out7 = - (s0 * x3); ST_DP4_INC(out0, out2, out4, out6, x, 2); ST_DP4_INC(out1, out3, out5, out7, y, 2); } if (n & 4) { LD_DP2_INC(px, 2, x0, x1); LD_DP2_INC(py, 2, y0, y1); out0 = s0 * y0; out1 = - (s0 * x0); out2 = s0 * y1; out3 = - (s0 * x1); ST_DP2_INC(out0, out2, x, 2); ST_DP2_INC(out1, out3, y, 2); } if (n & 2) { x0 = LD_DP(px); px += 2; y0 = LD_DP(py); py += 2; out0 = s0 * y0; out1 = - (s0 * x0); ST_DP(out0, x); x += 2; ST_DP(out1, y); y += 2; } if (n & 1) { LD_GP2_INC(px, 1, fx0, fx1); LD_GP2_INC(py, 1, fy0, fy1); tp0 = s * fy0; tp1 = - (s * fx0); tp2 = s * fy1; tp3 = - (s * fx1); ST_GP2_INC(tp0, tp2, x, 1); ST_GP2_INC(tp1, tp3, y, 1); } } else { c0 = COPY_DOUBLE_TO_VECTOR(c); s0 = COPY_DOUBLE_TO_VECTOR(s); /* process 14 doubles */ if (n >> 4) { BLASLONG pref_offsetx, pref_offsety; pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); if (pref_offsetx > 0) { pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; pref_offsetx = pref_offsetx / sizeof(FLOAT); } pref_offsety = (BLASLONG)py & 
(L1_DATA_LINESIZE - 1); if (pref_offsety > 0) { pref_offsety = L1_DATA_LINESIZE - pref_offsety; pref_offsety = pref_offsety / sizeof(FLOAT); } LD_DP4_INC(px, 2, x0, x1, x2, x3); LD_DP4_INC(py, 2, y0, y1, y2, y3); for (j = (n >> 4) - 1; j--;) { PREFETCH(px + pref_offsetx + 16); PREFETCH(px + pref_offsetx + 20); PREFETCH(px + pref_offsetx + 24); PREFETCH(px + pref_offsetx + 28); PREFETCH(py + pref_offsety + 16); PREFETCH(py + pref_offsety + 20); PREFETCH(py + pref_offsety + 24); PREFETCH(py + pref_offsety + 28); x4 = LD_DP(px); px += 2; out0 = c0 * x0; x5 = LD_DP(px); px += 2; out2 = c0 * x1; x6 = LD_DP(px); px += 2; out4 = c0 * x2; x7 = LD_DP(px); px += 2; out6 = c0 * x3; y4 = LD_DP(py); py += 2; out1 = c0 * y0; y5 = LD_DP(py); py += 2; out3 = c0 * y1; y6 = LD_DP(py); py += 2; out5 = c0 * y2; y7 = LD_DP(py); py += 2; out7 = c0 * y3; out0 += s0 * y0; out2 += s0 * y1; out4 += s0 * y2; out6 += s0 * y3; out1 -= s0 * x0; out3 -= s0 * x1; out5 -= s0 * x2; out7 -= s0 * x3; ST_DP(out0, x); x += 2; out0 = c0 * x4; ST_DP(out2, x); x += 2; out2 = c0 * x5; ST_DP(out4, x); x += 2; out4 = c0 * x6; ST_DP(out6, x); x += 2; out6 = c0 * x7; ST_DP(out1, y); y += 2; out1 = c0 * y4; ST_DP(out3, y); y += 2; out3 = c0 * y5; ST_DP(out5, y); y += 2; out5 = c0 * y6; ST_DP(out7, y); y += 2; out7 = c0 * y7; x0 = LD_DP(px); px += 2; out0 += s0 * y4; x1 = LD_DP(px); px += 2; out2 += s0 * y5; x2 = LD_DP(px); px += 2; out4 += s0 * y6; x3 = LD_DP(px); px += 2; out6 += s0 * y7; y0 = LD_DP(py); py += 2; out1 -= s0 * x4; y1 = LD_DP(py); py += 2; out3 -= s0 * x5; y2 = LD_DP(py); py += 2; out5 -= s0 * x6; y3 = LD_DP(py); py += 2; out7 -= s0 * x7; ST_DP4_INC(out0, out2, out4, out6, x, 2); ST_DP4_INC(out1, out3, out5, out7, y, 2); } out0 = c0 * x0; out0 += s0 * y0; out1 = c0 * y0; out1 -= s0 * x0; out2 = c0 * x1; out2 += s0 * y1; out3 = c0 * y1; out3 -= s0 * x1; out4 = c0 * x2; out4 += s0 * y2; out5 = c0 * y2; out5 -= s0 * x2; out6 = c0 * x3; out6 += s0 * y3; out7 = c0 * y3; out7 -= s0 * x3; ST_DP4_INC(out0, out2, out4, out6, x, 2); ST_DP4_INC(out1, out3, out5, out7, y, 2); LD_DP4_INC(px, 2, x4, x5, x6, x7); LD_DP4_INC(py, 2, y4, y5, y6, y7); out8 = c0 * x4; out8 += s0 * y4; out9 = c0 * y4; out9 -= s0 * x4; out10 = c0 * x5; out10 += s0 * y5; out11 = c0 * y5; out11 -= s0 * x5; out12 = c0 * x6; out12 += s0 * y6; out13 = c0 * y6; out13 -= s0 * x6; out14 = c0 * x7; out14 += s0 * y7; out15 = c0 * y7; out15 -= s0 * x7; ST_DP4_INC(out8, out10, out12, out14, x, 2); ST_DP4_INC(out9, out11, out13, out15, y, 2); } if (n & 8) { LD_DP4_INC(px, 2, x0, x1, x2, x3); LD_DP4_INC(py, 2, y0, y1, y2, y3); out0 = (c0 * x0) + (s0 * y0); out1 = (c0 * y0) - (s0 * x0); out2 = (c0 * x1) + (s0 * y1); out3 = (c0 * y1) - (s0 * x1); out4 = (c0 * x2) + (s0 * y2); out5 = (c0 * y2) - (s0 * x2); out6 = (c0 * x3) + (s0 * y3); out7 = (c0 * y3) - (s0 * x3); ST_DP4_INC(out0, out2, out4, out6, x, 2); ST_DP4_INC(out1, out3, out5, out7, y, 2); } if (n & 4) { LD_DP2_INC(px, 2, x0, x1); LD_DP2_INC(py, 2, y0, y1); out0 = (c0 * x0) + (s0 * y0); out1 = (c0 * y0) - (s0 * x0); out2 = (c0 * x1) + (s0 * y1); out3 = (c0 * y1) - (s0 * x1); ST_DP2_INC(out0, out2, x, 2); ST_DP2_INC(out1, out3, y, 2); } if (n & 2) { x0 = LD_DP(px); y0 = LD_DP(py); px += 2; py += 2; out0 = (c0 * x0) + (s0 * y0); out1 = (c0 * y0) - (s0 * x0); ST_DP(out0, x); ST_DP(out1, y); x += 2; y += 2; } if (n & 1) { tp0 = c * *x + s * *y; *y = c * *y - s * *x; *x = tp0; } } } else { if ((0 == c) && (0 == s)) { for (i = n; i--;) { *x = 0; *y = 0; x += inc_x; y += inc_y; } } else if ((1 == c) && (1 == s)) { if (n >> 
2) { fx0 = *px; px += inc_x; fx1 = *px; px += inc_x; fx2 = *px; px += inc_x; fx3 = *px; px += inc_x; fy0 = *py; py += inc_y; fy1 = *py; py += inc_y; fy2 = *py; py += inc_y; fy3 = *py; py += inc_y; for (i = (n >> 2) -1; i--;) { tp0 = fx0 + fy0; tp1 = fy0 - fx0; tp2 = fx1 + fy1; tp3 = fy1 - fx1; tp4 = fx2 + fy2; tp5 = fy2 - fx2; tp6 = fx3 + fy3; tp7 = fy3 - fx3; fx0 = *px; px += inc_x; *x = tp0; x += inc_x; fx1 = *px; px += inc_x; *x = tp2; x += inc_x; fx2 = *px; px += inc_x; *x = tp4; x += inc_x; fx3 = *px; px += inc_x; *x = tp6; x += inc_x; fy0 = *py; py += inc_y; *y = tp1; y += inc_y; fy1 = *py; py += inc_y; *y = tp3; y += inc_y; fy2 = *py; py += inc_y; *y = tp5; y += inc_y; fy3 = *py; py += inc_y; *y = tp7; y += inc_y; } tp0 = fx0 + fy0; tp1 = fy0 - fx0; tp2 = fx1 + fy1; tp3 = fy1 - fx1; tp4 = fx2 + fy2; tp5 = fy2 - fx2; tp6 = fx3 + fy3; tp7 = fy3 - fx3; *x = tp0; x += inc_x; *x = tp2; x += inc_x; *x = tp4; x += inc_x; *x = tp6; x += inc_x; *y = tp1; y += inc_y; *y = tp3; y += inc_y; *y = tp5; y += inc_y; *y = tp7; y += inc_y; } if (n & 2) { LD_GP2_INC(px, inc_x, fx0, fx1); LD_GP2_INC(py, inc_y, fy0, fy1); tp0 = fx0 + fy0; tp1 = fy0 - fx0; tp2 = fx1 + fy1; tp3 = fy1 - fx1; ST_GP2_INC(tp0, tp2, x, inc_x); ST_GP2_INC(tp1, tp3, y, inc_y); } if (n & 1) { fx0 = *px; fy0 = *py; tp0 = fx0 + fy0; tp1 = fy0 - fx0; *x = tp0; *y = tp1; } } else if (0 == s) { if (n >> 2) { fx0 = *px; px += inc_x; fx1 = *px; px += inc_x; fx2 = *px; px += inc_x; fx3 = *px; px += inc_x; fy0 = *py; py += inc_y; fy1 = *py; py += inc_y; fy2 = *py; py += inc_y; fy3 = *py; py += inc_y; for (i = (n >> 2) - 1; i--;) { tp0 = c * fx0; tp1 = c * fy0; tp2 = c * fx1; tp3 = c * fy1; tp4 = c * fx2; tp5 = c * fy2; tp6 = c * fx3; tp7 = c * fy3; fx0 = *px; px += inc_x; *x = tp0; x += inc_x; fx1 = *px; px += inc_x; *x = tp2; x += inc_x; fx2 = *px; px += inc_x; *x = tp4; x += inc_x; fx3 = *px; px += inc_x; *x = tp6; x += inc_x; fy0 = *py; py += inc_y; *y = tp1; y += inc_y; fy1 = *py; py += inc_y; *y = tp3; y += inc_y; fy2 = *py; py += inc_y; *y = tp5; y += inc_y; fy3 = *py; py += inc_y; *y = tp7; y += inc_y; } tp0 = c * fx0; tp1 = c * fy0; tp2 = c * fx1; tp3 = c * fy1; tp4 = c * fx2; tp5 = c * fy2; tp6 = c * fx3; tp7 = c * fy3; *x = tp0; x += inc_x; *x = tp2; x += inc_x; *x = tp4; x += inc_x; *x = tp6; x += inc_x; *y = tp1; y += inc_y; *y = tp3; y += inc_y; *y = tp5; y += inc_y; *y = tp7; y += inc_y; } if (n & 2) { LD_GP2_INC(px, inc_x, fx0, fx1); LD_GP2_INC(py, inc_y, fy0, fy1); tp0 = c * fx0; tp1 = c * fy0; tp2 = c * fx1; tp3 = c * fy1; ST_GP2_INC(tp0, tp2, x, inc_x); ST_GP2_INC(tp1, tp3, y, inc_y); } if (n & 1) { fx0 = *px; fy0 = *py; tp0 = c * fx0; tp1 = c * fy0; *x = tp0; *y = tp1; } } else { if (n >> 2) { fx0 = *px; px += inc_x; fx1 = *px; px += inc_x; fx2 = *px; px += inc_x; fx3 = *px; px += inc_x; fy0 = *py; py += inc_y; fy1 = *py; py += inc_y; fy2 = *py; py += inc_y; fy3 = *py; py += inc_y; for (i = (n >> 2) - 1; i--;) { tp0 = c * fx0 + s * fy0; tp1 = c * fy0 - s * fx0; tp2 = c * fx1 + s * fy1; tp3 = c * fy1 - s * fx1; tp4 = c * fx2 + s * fy2; tp5 = c * fy2 - s * fx2; tp6 = c * fx3 + s * fy3; tp7 = c * fy3 - s * fx3; fx0 = *px; px += inc_x; *x = tp0; x += inc_x; fx1 = *px; px += inc_x; *x = tp2; x += inc_x; fx2 = *px; px += inc_x; *x = tp4; x += inc_x; fx3 = *px; px += inc_x; *x = tp6; x += inc_x; fy0 = *py; py += inc_y; *y = tp1; y += inc_y; fy1 = *py; py += inc_y; *y = tp3; y += inc_y; fy2 = *py; py += inc_y; *y = tp5; y += inc_y; fy3 = *py; py += inc_y; *y = tp7; y += inc_y; } tp0 = c * fx0 + s * fy0; tp1 = c * fy0 - s * fx0; 
tp2 = c * fx1 + s * fy1; tp3 = c * fy1 - s * fx1; tp4 = c * fx2 + s * fy2; tp5 = c * fy2 - s * fx2; tp6 = c * fx3 + s * fy3; tp7 = c * fy3 - s * fx3; *x = tp0; x += inc_x; *x = tp2; x += inc_x; *x = tp4; x += inc_x; *x = tp6; x += inc_x; *y = tp1; y += inc_y; *y = tp3; y += inc_y; *y = tp5; y += inc_y; *y = tp7; y += inc_y; } if (n & 2) { LD_GP2_INC(px, inc_x, fx0, fx1); LD_GP2_INC(py, inc_y, fy0, fy1); tp0 = (c * fx0) + (s * fy0); tp1 = (c * fy0) - (s * fx0); tp2 = (c * fx1) + (s * fy1); tp3 = (c * fy1) - (s * fx1); ST_GP2_INC(tp0, tp2, x, inc_x); ST_GP2_INC(tp1, tp3, y, inc_y); } if (n & 1) { fx0 = *px; fy0 = *py; tp0 = (c * fx0) + (s * fy0); tp1 = (c * fy0) - (s * fx0); *x = tp0; *y = tp1; } } } return 0; } OpenBLAS-0.2.20/kernel/mips/dscal_msa.c000066400000000000000000000310751313527062700174740ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
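Kernel overview: this file implements double-precision SCAL, i.e. the
in-place update

    for (i = 0; i < n; i++)
        x[i * inc_x] *= da;

with a dedicated path that simply stores zeros when da == 0.0. The
unit-stride case is unrolled over 32 elements per iteration using
2-element v2f64 vectors and software prefetch.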
*******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i; FLOAT *px; FLOAT f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15; v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; v2f64 da_vec; px = x; if (1 == inc_x) { if (0.0 == da) { v2f64 zero_v = __msa_cast_to_vector_double(0); zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); for (i = (n >> 5); i--;) { ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, x, 2); ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, x, 2); } if (n & 31) { if (n & 16) { ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, x, 2); } if (n & 8) { ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, 2); } if (n & 4) { ST_DP2_INC(zero_v, zero_v, x, 2); } if (n & 2) { *x = 0; x += 1; *x = 0; x += 1; } if (n & 1) { *x = 0; } } } else { da_vec = COPY_DOUBLE_TO_VECTOR(da); if (n > 31) { FLOAT *x_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 32 + 16; LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); for (i = 0; i < (n >> 5) - 1; i++) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(x_pref, 128); PREF_OFFSET(x_pref, 160); PREF_OFFSET(x_pref, 192); PREF_OFFSET(x_pref, 224); x_pref += 32; x8 = LD_DP(px); px += 2; x0 *= da_vec; x9 = LD_DP(px); px += 2; x1 *= da_vec; x10 = LD_DP(px); px += 2; x2 *= da_vec; x11 = LD_DP(px); px += 2; x3 *= da_vec; x12 = LD_DP(px); px += 2; x4 *= da_vec; x13 = LD_DP(px); px += 2; x5 *= da_vec; x14 = LD_DP(px); px += 2; x6 *= da_vec; x15 = LD_DP(px); px += 2; x7 *= da_vec; x8 *= da_vec; ST_DP(x0, x); x += 2; x9 *= da_vec; ST_DP(x1, x); x += 2; x10 *= da_vec; ST_DP(x2, x); x += 2; x11 *= da_vec; ST_DP(x3, x); x += 2; x12 *= da_vec; ST_DP(x4, x); x += 2; x13 *= da_vec; ST_DP(x5, x); x += 2; x14 *= da_vec; ST_DP(x6, x); x += 2; x15 *= da_vec; ST_DP(x7, x); x += 2; ST_DP(x8, x); x += 2; x0 = LD_DP(px); px += 2; ST_DP(x9, x); x += 2; x1 = LD_DP(px); px += 2; ST_DP(x10, x); x += 2; x2 = LD_DP(px); px += 2; ST_DP(x11, x); x += 2; x3 = LD_DP(px); px += 2; ST_DP(x12, x); x += 2; x4 = LD_DP(px); px += 2; ST_DP(x13, x); x += 2; x5 = LD_DP(px); px += 2; ST_DP(x14, x); x += 2; x6 = LD_DP(px); px += 2; ST_DP(x15, x); x += 2; x7 = LD_DP(px); px += 2; } x8 = LD_DP(px); px += 2; x0 *= da_vec; x9 = LD_DP(px); px += 2; x1 *= da_vec; x10 = LD_DP(px); px += 2; x2 *= da_vec; x11 = LD_DP(px); px += 2; x3 *= da_vec; x12 = LD_DP(px); px += 2; x4 *= da_vec; x13 = LD_DP(px); px += 2; x5 *= da_vec; x14 = LD_DP(px); px += 2; x6 *= da_vec; x15 = LD_DP(px); px += 2; x7 *= da_vec; x8 *= da_vec; ST_DP(x0, x); x += 2; x9 *= da_vec; ST_DP(x1, x); x += 2; x10 *= da_vec; ST_DP(x2, x); x += 2; x11 *= da_vec; ST_DP(x3, x); x += 2; x12 *= da_vec; ST_DP(x4, x); x += 2; x13 *= da_vec; ST_DP(x5, x); x += 2; x15 *= da_vec; ST_DP(x6, x); x += 2; x14 *= da_vec; ST_DP(x7, x); x += 2; ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, x, 2); } if (n & 31) { if (n & 16) { LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, 
da_vec, x0, x1, x2, x3); MUL4(x4, da_vec, x5, da_vec, x6, da_vec, x7, da_vec, x4, x5, x6, x7); ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2); } if (n & 8) { LD_DP4_INC(px, 2, x0, x1, x2, x3); MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3); ST_DP4_INC(x0, x1, x2, x3, x, 2); } if (n & 4) { LD_DP2_INC(px, 2, x0, x1); MUL2(x0, da_vec, x1, da_vec, x0, x1); ST_DP2_INC(x0, x1, x, 2); } if (n & 2) { LD_GP2_INC(px, 1, f0, f1); MUL2(f0, da, f1, da, f0, f1); ST_GP2_INC(f0, f1, x, 1); } if (n & 1) { *x *= da; } } } } else { if (da == 0.0) { for (i = n; i--;) { *x = 0.0; x += inc_x; } } else { if (n > 15) { LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7); for (i = 0; i < (n >> 4) - 1; i++) { LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15); MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); f4 *= da; f5 *= da; *x = f0; x += inc_x; f6 *= da; *x = f1; x += inc_x; f7 *= da; *x = f2; x += inc_x; f8 *= da; *x = f3; x += inc_x; f9 *= da; *x = f4; x += inc_x; f10 *= da; *x = f5; x += inc_x; f11 *= da; *x = f6; x += inc_x; f12 *= da; *x = f7; x += inc_x; f13 *= da; *x = f8; x += inc_x; f14 *= da; *x = f9; x += inc_x; f15 *= da; *x = f10; x += inc_x; *x = f11; x += inc_x; f0 = *px; px += inc_x; *x = f12; x += inc_x; f1 = *px; px += inc_x; *x = f13; x += inc_x; f2 = *px; px += inc_x; *x = f14; x += inc_x; f3 = *px; px += inc_x; *x = f15; x += inc_x; f4 = *px; px += inc_x; f5 = *px; px += inc_x; f6 = *px; px += inc_x; f7 = *px; px += inc_x; } LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15); MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); f4 *= da; f5 *= da; *x = f0; x += inc_x; f6 *= da; *x = f1; x += inc_x; f7 *= da; *x = f2; x += inc_x; f8 *= da; *x = f3; x += inc_x; f9 *= da; *x = f4; x += inc_x; f10 *= da; *x = f5; x += inc_x; f11 *= da; *x = f6; x += inc_x; f12 *= da; *x = f7; x += inc_x; f13 *= da; *x = f8; x += inc_x; f14 *= da; *x = f9; x += inc_x; f15 *= da; *x = f10; x += inc_x; *x = f11; x += inc_x; *x = f12; x += inc_x; *x = f13; x += inc_x; *x = f14; x += inc_x; *x = f15; x += inc_x; } if (n & 15) { if (n & 8) { LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7); MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); MUL4(f4, da, f5, da, f6, da, f7, da, f4, f5, f6, f7); ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, x, inc_x); } if (n & 4) { LD_GP4_INC(px, inc_x, f0, f1, f2, f3); MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); ST_GP4_INC(f0, f1, f2, f3, x, inc_x); } if (n & 2) { LD_GP2_INC(px, inc_x, f0, f1); MUL2(f0, da, f1, da, f0, f1); ST_GP2_INC(f0, f1, x, inc_x); } if (n & 1) { *x *= da; } } } } return 0; } OpenBLAS-0.2.20/kernel/mips/dswap_msa.c000066400000000000000000000224501313527062700175210ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0, pref_offsetx, pref_offsety; FLOAT *px, *py; FLOAT x0, x1, x2, x3, x4, x5, x6, x7; FLOAT y0, y1, y2, y3, y4, y5, y6, y7; v2f64 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7; v2f64 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7; if (n < 0) return (0); pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1); if (pref_offsetx > 0) { pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; pref_offsetx = pref_offsetx / sizeof(FLOAT); } pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1); if (pref_offsety > 0) { pref_offsety = L1_DATA_LINESIZE - pref_offsety; pref_offsety = pref_offsety / sizeof(FLOAT); } px = srcx; py = srcy; if ((1 == inc_x) && (1 == inc_y)) { if (n >> 4) { LD_DP8_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7); for (i = (n >> 4) - 1; i--;) { PREFETCH(px + pref_offsetx + 16); PREFETCH(px + pref_offsetx + 20); PREFETCH(px + pref_offsetx + 24); PREFETCH(px + pref_offsetx + 28); PREFETCH(py + pref_offsety + 16); PREFETCH(py + pref_offsety + 20); PREFETCH(py + pref_offsety + 24); PREFETCH(py + pref_offsety + 28); yv0 = LD_DP(py); py += 2; ST_DP(xv0, srcy); srcy += 2; yv1 = LD_DP(py); py += 2; ST_DP(xv1, srcy); srcy += 2; yv2 = LD_DP(py); py += 2; ST_DP(xv2, srcy); srcy += 2; yv3 = LD_DP(py); py += 2; ST_DP(xv3, srcy); srcy += 2; yv4 = LD_DP(py); py += 2; ST_DP(xv4, srcy); srcy += 2; yv5 = LD_DP(py); py += 2; ST_DP(xv5, srcy); srcy += 2; yv6 = LD_DP(py); py += 2; ST_DP(xv6, srcy); srcy += 2; yv7 = LD_DP(py); py += 2; ST_DP(xv7, srcy); srcy += 2; xv0 = LD_DP(px); px += 2; ST_DP(yv0, srcx); srcx += 2; xv1 = LD_DP(px); px += 2; ST_DP(yv1, srcx); srcx += 2; xv2 = LD_DP(px); px += 2; ST_DP(yv2, srcx); srcx += 2; xv3 = LD_DP(px); px += 2; ST_DP(yv3, srcx); srcx += 2; xv4 = LD_DP(px); px += 2; ST_DP(yv4, srcx); srcx += 2; xv5 = LD_DP(px); px += 2; ST_DP(yv5, srcx); srcx += 2; xv6 = LD_DP(px); px += 2; ST_DP(yv6, srcx); srcx += 2; xv7 = LD_DP(px); px += 2; ST_DP(yv7, srcx); srcx += 2; } LD_DP8_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7); ST_DP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 2); ST_DP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 2); } if (n & 15) { if ((n & 8) && (n & 4) && (n & 2)) { LD_DP7_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6); LD_DP7_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6); ST_DP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 2); ST_DP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 2); } else if ((n & 8) && (n & 4)) { LD_DP6_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5); LD_DP6_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5); 
ST_DP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 2); ST_DP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 2); } else if ((n & 8) && (n & 2)) { LD_DP5_INC(px, 2, xv0, xv1, xv2, xv3, xv4); LD_DP5_INC(py, 2, yv0, yv1, yv2, yv3, yv4); ST_DP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 2); ST_DP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 2); } else if ((n & 4) && (n & 2)) { LD_DP3_INC(px, 2, xv0, xv1, xv2); LD_DP3_INC(py, 2, yv0, yv1, yv2); ST_DP3_INC(xv0, xv1, xv2, srcy, 2); ST_DP3_INC(yv0, yv1, yv2, srcx, 2); } else if (n & 8) { LD_DP4_INC(px, 2, xv0, xv1, xv2, xv3); LD_DP4_INC(py, 2, yv0, yv1, yv2, yv3); ST_DP4_INC(xv0, xv1, xv2, xv3, srcy, 2); ST_DP4_INC(yv0, yv1, yv2, yv3, srcx, 2); } else if (n & 4) { LD_DP2_INC(px, 2, xv0, xv1); LD_DP2_INC(py, 2, yv0, yv1); ST_DP2_INC(xv0, xv1, srcy, 2); ST_DP2_INC(yv0, yv1, srcx, 2); } else if (n & 2) { xv0 = LD_DP(px); yv0 = LD_DP(py); px += 2; py += 2; ST_DP(xv0, srcy); ST_DP(yv0, srcx); srcx += 2; srcy += 2; } if (n & 1) { x0 = px[0]; y0 = py[0]; srcx[0] = y0; srcy[0] = x0; } } } else { for (i = (n >> 3); i--;) { LD_GP8_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6, x7); LD_GP8_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6, y7); ST_GP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y); ST_GP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x); } if (n & 7) { if ((n & 4) && (n & 2) && (n & 1)) { LD_GP7_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6); LD_GP7_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6); ST_GP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y); ST_GP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x); } else if ((n & 4) && (n & 2)) { LD_GP6_INC(px, inc_x, x0, x1, x2, x3, x4, x5); LD_GP6_INC(py, inc_y, y0, y1, y2, y3, y4, y5); ST_GP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y); ST_GP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x); } else if ((n & 4) && (n & 1)) { LD_GP5_INC(px, inc_x, x0, x1, x2, x3, x4); LD_GP5_INC(py, inc_y, y0, y1, y2, y3, y4); ST_GP5_INC(x0, x1, x2, x3, x4, srcy, inc_y); ST_GP5_INC(y0, y1, y2, y3, y4, srcx, inc_x); } else if ((n & 2) && (n & 1)) { LD_GP3_INC(px, inc_x, x0, x1, x2); LD_GP3_INC(py, inc_y, y0, y1, y2); ST_GP3_INC(x0, x1, x2, srcy, inc_y); ST_GP3_INC(y0, y1, y2, srcx, inc_x); } else if (n & 4) { LD_GP4_INC(px, inc_x, x0, x1, x2, x3); LD_GP4_INC(py, inc_y, y0, y1, y2, y3); ST_GP4_INC(x0, x1, x2, x3, srcy, inc_y); ST_GP4_INC(y0, y1, y2, y3, srcx, inc_x); } else if (n & 2) { LD_GP2_INC(px, inc_x, x0, x1); LD_GP2_INC(py, inc_y, y0, y1); ST_GP2_INC(x0, x1, srcy, inc_y); ST_GP2_INC(y0, y1, srcx, inc_x); } else if (n & 1) { x0 = *srcx; y0 = *srcy; *srcx = y0; *srcy = x0; } } } return (0); } OpenBLAS-0.2.20/kernel/mips/dtrsm_kernel_LN_8x4_msa.c000066400000000000000000001133051313527062700221700ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" static __attribute__ ((noinline)) void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; v2f64 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15; v2f64 src_a0, src_a1, src_a2, src_a3, src_a8, src_a9, src_a16, src_a17; v2f64 src_a18, src_a24, src_a25, src_a26, src_a27, src_a32, src_a33; v2f64 src_a34, src_a35, src_a36, src_a40, src_a41, src_a42, src_a43; v2f64 src_a44, src_a45, src_a48, src_a49, src_a50, src_a51, src_a52; v2f64 src_a53, src_a54, src_a56, src_a57, src_a58, src_a59, src_a60; v2f64 src_a61, src_a62, src_a63; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; PREF_OFFSET(a, -96); PREF_OFFSET(a, -32); PREF_OFFSET(a, -160); PREF_OFFSET(a, -224); PREF_OFFSET(a, -64); PREF_OFFSET(a, -128); PREF_OFFSET(a, -192); PREF_OFFSET(a, -256); PREF_OFFSET(a, -320); PREF_OFFSET(a, -384); PREF_OFFSET(a, -448); PREF_OFFSET(a, -512); LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7); LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11); LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15); if (bk > 0) { BLASLONG i, pref_offset; FLOAT *pba = a, *pbb = b, *pa0_pref; v2f64 src_b, src_b0, src_b1; pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1); if (pref_offset) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } pa0_pref = a + pref_offset; for (i = bk >> 1; i--;) { PREF_OFFSET(pa0_pref, 128); PREF_OFFSET(pa0_pref, 160); PREF_OFFSET(pa0_pref, 192); PREF_OFFSET(pa0_pref, 224); LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2_INC(pbb, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_c2 -= src_a2 * src_b; src_c3 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); src_c8 -= src_a0 * src_b; src_c9 -= src_a1 * src_b; src_c10 -= src_a2 * src_b; src_c11 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); src_c12 -= src_a0 * src_b; src_c13 -= src_a1 * src_b; src_c14 -= src_a2 * src_b; src_c15 -= src_a3 * src_b; LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2_INC(pbb, 2, src_b0, src_b1); src_b = (v2f64) 
__msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_c2 -= src_a2 * src_b; src_c3 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); src_c8 -= src_a0 * src_b; src_c9 -= src_a1 * src_b; src_c10 -= src_a2 * src_b; src_c11 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); src_c12 -= src_a0 * src_b; src_c13 -= src_a1 * src_b; src_c14 -= src_a2 * src_b; src_c15 -= src_a3 * src_b; pa0_pref += 16; } if (bk & 1) { LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2_INC(pbb, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_c2 -= src_a2 * src_b; src_c3 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); src_c8 -= src_a0 * src_b; src_c9 -= src_a1 * src_b; src_c10 -= src_a2 * src_b; src_c11 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); src_c12 -= src_a0 * src_b; src_c13 -= src_a1 * src_b; src_c14 -= src_a2 * src_b; src_c15 -= src_a3 * src_b; } } a -= 64; b -= 32; ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1); ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3); ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5); ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7); ILVRL_D2_DP(src_c12, src_c8, res_c8, res_c9); ILVRL_D2_DP(src_c13, src_c9, res_c10, res_c11); ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); src_a54 = __msa_cast_to_vector_double(*(a + 54)); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); src_a62 = LD_DP(a + 62); src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); src_a60 = LD_DP(a + 60); src_a61 = (v2f64) __msa_splati_d((v2i64) src_a60, 1); src_a60 = (v2f64) __msa_splati_d((v2i64) src_a60, 0); src_a52 = LD_DP(a + 52); src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1); src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); src_a44 = LD_DP(a + 44); src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); src_a36 = __msa_cast_to_vector_double(*(a + 36)); src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); res_c7 *= src_a63; res_c6 -= res_c7 * src_a62; res_c6 *= src_a54; res_c15 *= src_a63; res_c14 -= res_c15 * src_a62; res_c14 *= src_a54; ST_DP(res_c7, b + 28); ST_DP(res_c6, b + 24); ST_DP(res_c15, b + 30); ST_DP(res_c14, b + 26); ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7); ILVRL_D2_DP(res_c15, res_c14, src_c11, src_c15); ST_DP(src_c3, c + 6); ST_DP(src_c7, c_nxt1line + 6); ST_DP(src_c11, c_nxt2line + 6); ST_DP(src_c15, c_nxt3line + 6); res_c5 -= res_c7 * src_a61; res_c5 -= res_c6 * src_a53; res_c5 *= src_a45; res_c4 -= res_c7 * src_a60; res_c4 -= res_c6 * src_a52; res_c4 -= res_c5 * src_a44; res_c4 *= src_a36; res_c13 -= res_c15 * src_a61; res_c13 -= res_c14 * src_a53; res_c13 *= src_a45; res_c12 -= res_c15 * src_a60; res_c12 -= res_c14 * src_a52; res_c12 -= res_c13 * src_a44; res_c12 *= src_a36; src_a56 = LD_DP(a + 56); src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1); src_a56 = (v2f64) __msa_splati_d((v2i64) src_a56, 0); src_a58 = LD_DP(a + 58); src_a59 = 
(v2f64) __msa_splati_d((v2i64) src_a58, 1); src_a58 = (v2f64) __msa_splati_d((v2i64) src_a58, 0); ST_DP(res_c4, b + 16); ST_DP(res_c5, b + 20); ST_DP(res_c12, b + 18); ST_DP(res_c13, b + 22); ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); ST_DP(src_c2, c + 4); ST_DP(src_c6, c_nxt1line + 4); ST_DP(src_c10, c_nxt2line + 4); ST_DP(src_c14, c_nxt3line + 4); src_a50 = LD_DP(a + 50); src_a51 = (v2f64) __msa_splati_d((v2i64) src_a50, 1); src_a50 = (v2f64) __msa_splati_d((v2i64) src_a50, 0); src_a42 = LD_DP(a + 42); src_a43 = (v2f64) __msa_splati_d((v2i64) src_a42, 1); src_a42 = (v2f64) __msa_splati_d((v2i64) src_a42, 0); src_a34 = LD_DP(a + 34); src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); src_a26 = LD_DP(a + 26); src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1); src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0); src_a18 = __msa_cast_to_vector_double(*(a + 18)); src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); res_c3 -= res_c7 * src_a59; res_c2 -= res_c7 * src_a58; res_c1 -= res_c7 * src_a57; res_c0 -= res_c7 * src_a56; res_c11 -= res_c15 * src_a59; res_c10 -= res_c15 * src_a58; res_c9 -= res_c15 * src_a57; res_c8 -= res_c15 * src_a56; res_c3 -= res_c6 * src_a51; res_c3 -= res_c5 * src_a43; res_c3 -= res_c4 * src_a35; res_c3 *= src_a27; res_c2 -= res_c6 * src_a50; res_c2 -= res_c5 * src_a42; res_c2 -= res_c4 * src_a34; res_c2 -= res_c3 * src_a26; res_c2 *= src_a18; res_c11 -= res_c14 * src_a51; res_c11 -= res_c13 * src_a43; res_c11 -= res_c12 * src_a35; res_c11 *= src_a27; res_c10 -= res_c14 * src_a50; res_c10 -= res_c13 * src_a42; res_c10 -= res_c12 * src_a34; res_c10 -= res_c11 * src_a26; res_c10 *= src_a18; src_a48 = LD_DP(a + 48); src_a49 = (v2f64) __msa_splati_d((v2i64) src_a48, 1); src_a48 = (v2f64) __msa_splati_d((v2i64) src_a48, 0); src_a40 = LD_DP(a + 40); src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); src_a40 = (v2f64) __msa_splati_d((v2i64) src_a40, 0); ST_DP(res_c2, b + 8); ST_DP(res_c3, b + 12); ST_DP(res_c10, b + 10); ST_DP(res_c11, b + 14); src_a32 = LD_DP(a + 32); src_a33 = (v2f64) __msa_splati_d((v2i64) src_a32, 1); src_a32 = (v2f64) __msa_splati_d((v2i64) src_a32, 0); src_a24 = LD_DP(a + 24); src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1); src_a24 = (v2f64) __msa_splati_d((v2i64) src_a24, 0); ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5); ILVRL_D2_DP(res_c11, res_c10, src_c9, src_c13); ST_DP(src_c1, c + 2); ST_DP(src_c5, c_nxt1line + 2); ST_DP(src_c9, c_nxt2line + 2); ST_DP(src_c13, c_nxt3line + 2); res_c1 -= res_c6 * src_a49; res_c1 -= res_c5 * src_a41; res_c1 -= res_c4 * src_a33; res_c1 -= res_c3 * src_a25; res_c0 -= res_c6 * src_a48; res_c0 -= res_c5 * src_a40; res_c0 -= res_c4 * src_a32; res_c0 -= res_c3 * src_a24; res_c9 -= res_c14 * src_a49; res_c9 -= res_c13 * src_a41; res_c9 -= res_c12 * src_a33; res_c9 -= res_c11 * src_a25; res_c8 -= res_c14 * src_a48; res_c8 -= res_c13 * src_a40; res_c8 -= res_c12 * src_a32; res_c8 -= res_c11 * src_a24; src_a16 = LD_DP(a + 16); src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1); src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0); src_a8 = LD_DP(a + 8); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); src_a0 = __msa_cast_to_vector_double(*(a + 0)); src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); res_c1 -= res_c2 * src_a17; res_c1 *= src_a9; res_c9 -= res_c10 * src_a17; res_c9 *= src_a9; res_c0 -= res_c2 * src_a16; res_c0 -= res_c1 * src_a8; 
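/* Tail of the bottom-up back-substitution for this 8x4 block: rows 7..1 are
   already solved, so only row 0 of the two column pairs (res_c0 / res_c8)
   still needs its remaining subtractions and the closing scale by a[0].
   Note: the diagonal entries of the packed A are assumed to be stored
   pre-inverted by the TRSM pack routines, so a multiply stands in for the
   division here. */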
res_c0 *= src_a0; res_c8 -= res_c10 * src_a16; res_c8 -= res_c9 * src_a8; res_c8 *= src_a0; ST_DP(res_c0, b + 0); ST_DP(res_c8, b + 2); ST_DP(res_c1, b + 4); ST_DP(res_c9, b + 6); ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4); ILVRL_D2_DP(res_c9, res_c8, src_c8, src_c12); ST_DP(src_c0, c); ST_DP(src_c4, c_nxt1line); ST_DP(src_c8, c_nxt2line); ST_DP(src_c12, c_nxt3line); } static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; v2f64 src_a0, src_a1, src_a2, src_a3, src_a8, src_a9, src_a16, src_a17; v2f64 src_a18, src_a24, src_a25, src_a26, src_a27, src_a32, src_a33; v2f64 src_a34, src_a35, src_a36, src_a40, src_a41, src_a42, src_a43; v2f64 src_a44, src_a45, src_a48, src_a49, src_a50, src_a51, src_a52; v2f64 src_a53, src_a54, src_a56, src_a57, src_a58, src_a59, src_a60; v2f64 src_a61, src_a62, src_a63; LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7); if (bk > 0) { BLASLONG i; FLOAT *pba = a, *pbb = b; v2f64 src_b, src_b0, src_b1; LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3); src_b0 = LD_DP(pbb); for (i = bk - 1; i--;) { pba += 8; pbb += 2; LD_DP4(pba, 2, src_a8, src_a9, src_a16, src_a17); src_b1 = LD_DP(pbb); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_c2 -= src_a2 * src_b; src_c3 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; src_a0 = src_a8; src_a1 = src_a9; src_a2 = src_a16; src_a3 = src_a17; src_b0 = src_b1; } src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_c2 -= src_a2 * src_b; src_c3 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; } ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1); ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3); ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5); ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7); src_a56 = LD_DP(a - 8); src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1); src_a56 = (v2f64) __msa_splati_d((v2i64) src_a56, 0); src_a58 = LD_DP(a - 6); src_a59 = (v2f64) __msa_splati_d((v2i64) src_a58, 1); src_a58 = (v2f64) __msa_splati_d((v2i64) src_a58, 0); src_a60 = LD_DP(a - 4); src_a61 = (v2f64) __msa_splati_d((v2i64) src_a60, 1); src_a60 = (v2f64) __msa_splati_d((v2i64) src_a60, 0); src_a62 = LD_DP(a - 2); src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); res_c7 *= src_a63; res_c6 -= res_c7 * src_a62; res_c5 -= res_c7 * src_a61; res_c4 -= res_c7 * src_a60; res_c3 -= res_c7 * src_a59; res_c2 -= res_c7 * src_a58; res_c1 -= res_c7 * src_a57; res_c0 -= res_c7 * src_a56; src_a48 = LD_DP(a - 16); src_a49 = (v2f64) __msa_splati_d((v2i64) src_a48, 1); src_a48 = (v2f64) __msa_splati_d((v2i64) src_a48, 0); src_a50 = LD_DP(a - 14); src_a51 = (v2f64) __msa_splati_d((v2i64) src_a50, 1); src_a50 = (v2f64) __msa_splati_d((v2i64) src_a50, 0); src_a52 = LD_DP(a - 12); src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1); src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); src_a54 = __msa_cast_to_vector_double(*(a - 10)); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); src_a40 = LD_DP(a - 24); 
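/* Load/broadcast idiom used throughout these solvers: LD_DP fetches two
   consecutive doubles of the packed triangular factor, and __msa_splati_d
   then broadcasts lane 1 and lane 0 into separate vectors so that each
   scalar a(i,j) can scale a whole two-element slice of the solution at once. */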
src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); src_a40 = (v2f64) __msa_splati_d((v2i64) src_a40, 0); src_a42 = LD_DP(a - 22); src_a43 = (v2f64) __msa_splati_d((v2i64) src_a42, 1); src_a42 = (v2f64) __msa_splati_d((v2i64) src_a42, 0); src_a44 = LD_DP(a - 20); src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); res_c6 *= src_a54; res_c5 -= res_c6 * src_a53; res_c4 -= res_c6 * src_a52; res_c3 -= res_c6 * src_a51; res_c2 -= res_c6 * src_a50; res_c1 -= res_c6 * src_a49; res_c0 -= res_c6 * src_a48; res_c5 *= src_a45; res_c4 -= res_c5 * src_a44; res_c3 -= res_c5 * src_a43; res_c2 -= res_c5 * src_a42; res_c1 -= res_c5 * src_a41; res_c0 -= res_c5 * src_a40; ST_DP(res_c7, b - 2); ST_DP(res_c6, b - 4); ST_DP(res_c5, b - 6); src_a32 = LD_DP(a - 32); src_a33 = (v2f64) __msa_splati_d((v2i64) src_a32, 1); src_a32 = (v2f64) __msa_splati_d((v2i64) src_a32, 0); src_a34 = LD_DP(a - 30); src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); src_a36 = __msa_cast_to_vector_double(*(a - 28)); src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); res_c4 *= src_a36; res_c3 -= res_c4 * src_a35; res_c2 -= res_c4 * src_a34; res_c1 -= res_c4 * src_a33; res_c0 -= res_c4 * src_a32; src_a24 = LD_DP(a - 40); src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1); src_a24 = (v2f64) __msa_splati_d((v2i64) src_a24, 0); src_a26 = LD_DP(a - 38); src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1); src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0); src_a16 = LD_DP(a - 48); src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1); src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0); src_a18 = __msa_cast_to_vector_double(*(a - 46)); src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); src_a0 = __msa_cast_to_vector_double(*(a - 64)); src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); src_a8 = LD_DP(a - 56); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); res_c3 *= src_a27; res_c2 -= res_c3 * src_a26; res_c1 -= res_c3 * src_a25; res_c0 -= res_c3 * src_a24; res_c2 *= src_a18; res_c1 -= res_c2 * src_a17; res_c0 -= res_c2 * src_a16; res_c1 *= src_a9; res_c0 -= res_c1 * src_a8; res_c0 *= src_a0; ST_DP(res_c4, b - 8); ST_DP(res_c3, b - 10); ST_DP(res_c2, b - 12); ST_DP(res_c1, b - 14); ST_DP(res_c0, b - 16); ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4); ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5); ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7); ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2); } static void dsolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35; FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53; FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63; FLOAT c0, c1, c2, c3, c4, c5, c6, c7; c0 = *(c + 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); c4 = *(c + 4); c5 = *(c + 5); c6 = *(c + 6); c7 = *(c + 7); if (bk > 0) { BLASLONG i; FLOAT *aa = a, *bb = b; for (i = bk; i--; ) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; c2 -= aa[2] * bb[0]; c3 -= aa[3] * bb[0]; c4 -= aa[4] * bb[0]; c5 -= aa[5] * bb[0]; c6 -= aa[6] * bb[0]; c7 -= aa[7] * bb[0]; aa += 8; bb += 1; } } a -= 64; b -= 8; a0 = *(a + 0); a8 = *(a + 8); a9 = *(a + 9); a16 = *(a + 16); a17 = *(a + 17); a18 = *(a + 18); a24 = *(a + 24); a25 = *(a + 25); a26 = *(a + 26); a27 = *(a + 27); a32 = *(a + 32); a33 = *(a + 
33); a34 = *(a + 34); a35 = *(a + 35); a36 = *(a + 36); a40 = *(a + 40); a41 = *(a + 41); a42 = *(a + 42); a43 = *(a + 43); a44 = *(a + 44); a45 = *(a + 45); a48 = *(a + 48); a49 = *(a + 49); a50 = *(a + 50); a51 = *(a + 51); a52 = *(a + 52); a53 = *(a + 53); a54 = *(a + 54); a56 = *(a + 56); a57 = *(a + 57); a58 = *(a + 58); a59 = *(a + 59); a60 = *(a + 60); a61 = *(a + 61); a62 = *(a + 62); a63 = *(a + 63); c7 *= a63; c6 -= c7 * a62; c6 *= a54; c5 -= c7 * a61; c5 -= c6 * a53; c5 *= a45; c4 -= c7 * a60; c4 -= c6 * a52; c4 -= c5 * a44; c4 *= a36; c3 -= c7 * a59; c3 -= c6 * a51; c3 -= c5 * a43; c3 -= c4 * a35; c3 *= a27; c2 -= c7 * a58; c2 -= c6 * a50; c2 -= c5 * a42; c2 -= c4 * a34; c2 -= c3 * a26; c2 *= a18; c1 -= c7 * a57; c1 -= c6 * a49; c1 -= c5 * a41; c1 -= c4 * a33; c1 -= c3 * a25; c1 -= c2 * a17; c1 *= a9; c0 -= c7 * a56; c0 -= c6 * a48; c0 -= c5 * a40; c0 -= c4 * a32; c0 -= c3 * a24; c0 -= c2 * a16; c0 -= c1 * a8; c0 *= a0; *(b + 7) = c7; *(b + 6) = c6; *(b + 5) = c5; *(b + 4) = c4; *(b + 3) = c3; *(b + 2) = c2; *(b + 1) = c1; *(b + 0) = c0; *(c + 7) = c7; *(c + 6) = c6; *(c + 5) = c5; *(c + 4) = c4; *(c + 3) = c3; *(c + 2) = c2; *(c + 1) = c1; *(c + 0) = c0; } static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; v2f64 src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12, src_a13; v2f64 src_a14, src_a15; LD_DP2(c, 2, src_c0, src_c1); LD_DP2(c + ldc, 2, src_c2, src_c3); LD_DP2(c + 2 * ldc, 2, src_c4, src_c5); LD_DP2(c + 3 * ldc, 2, src_c6, src_c7); if (bk > 0) { BLASLONG i; FLOAT *aa = a, *bb = b; v2f64 src_a0, src_a1, src_b, src_b0, src_b1; for (i = bk; i--;) { LD_DP2(aa, 2, src_a0, src_a1); LD_DP2(bb, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c2 -= src_a0 * src_b; src_c3 -= src_a1 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); src_c6 -= src_a0 * src_b; src_c7 -= src_a1 * src_b; aa += 4; bb += 4; } } a -= 16; b -= 16; ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1); ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3); ILVRL_D2_DP(src_c6, src_c4, res_c4, res_c5); ILVRL_D2_DP(src_c7, src_c5, res_c6, res_c7); src_a14 = LD_DP(a + 14); src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1); src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0); src_a12 = LD_DP(a + 12); src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1); src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0); src_a9 = LD_DP(a + 9); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); src_a8 = __msa_cast_to_vector_double(*(a + 8)); src_a0 = __msa_cast_to_vector_double(*(a + 0)); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0); res_c3 *= src_a15; res_c7 *= src_a15; res_c2 -= res_c3 * src_a14; res_c6 -= res_c7 * src_a14; res_c2 *= src_a10; res_c6 *= src_a10; res_c1 -= res_c3 * src_a13; res_c5 -= res_c7 * src_a13; res_c1 -= res_c2 * src_a9; res_c5 -= res_c6 * src_a9; res_c1 *= src_a5; res_c5 *= src_a5; res_c0 -= res_c3 * src_a12; res_c4 -= 
res_c7 * src_a12; res_c0 -= res_c2 * src_a8; res_c4 -= res_c6 * src_a8; res_c0 -= res_c1 * src_a4; res_c4 -= res_c5 * src_a4; res_c0 *= src_a0; res_c4 *= src_a0; ST_DP(res_c7, b + 14); ST_DP(res_c3, b + 12); ST_DP(res_c6, b + 10); ST_DP(res_c2, b + 8); ST_DP(res_c5, b + 6); ST_DP(res_c1, b + 4); ST_DP(res_c4, b + 2); ST_DP(res_c0, b + 0); ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2); ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3); ILVRL_D2_DP(res_c5, res_c4, src_c4, src_c6); ILVRL_D2_DP(res_c7, res_c6, src_c5, src_c7); ST_DP2(src_c0, src_c1, c, 2); ST_DP2(src_c2, src_c3, c + ldc, 2); ST_DP2(src_c4, src_c5, c + 2 * ldc, 2); ST_DP2(src_c6, src_c7, c + 3 * ldc, 2); } static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3; v2f64 src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12, src_a13; v2f64 src_a14, src_a15; LD_DP2(c, 2, src_c0, src_c1); LD_DP2(c + ldc, 2, src_c2, src_c3); if (bk > 0) { BLASLONG i; FLOAT *aa = a, *bb = b; v2f64 src_a0, src_a1, src_b, src_b0; for (i = bk; i--;) { LD_DP2(aa, 2, src_a0, src_a1); src_b0 = LD_DP(bb); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c2 -= src_a0 * src_b; src_c3 -= src_a1 * src_b; aa += 4; bb += 2; } } a -= 16; b -= 8; ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1); ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3); src_a14 = LD_DP(a + 14); src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1); src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0); src_a12 = LD_DP(a + 12); src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1); src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0); src_a9 = LD_DP(a + 9); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); src_a8 = __msa_cast_to_vector_double(*(a + 8)); src_a0 = __msa_cast_to_vector_double(*(a + 0)); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0); res_c3 *= src_a15; res_c2 -= res_c3 * src_a14; res_c2 *= src_a10; res_c1 -= res_c3 * src_a13; res_c1 -= res_c2 * src_a9; res_c1 *= src_a5; res_c0 -= res_c3 * src_a12; res_c0 -= res_c2 * src_a8; res_c0 -= res_c1 * src_a4; res_c0 *= src_a0; ST_DP(res_c3, b + 6); ST_DP(res_c2, b + 4); ST_DP(res_c1, b + 2); ST_DP(res_c0, b + 0); ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2); ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3); ST_DP2(src_c0, src_c1, c, 2); ST_DP2(src_c2, src_c3, c + ldc, 2); } static void dsolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15, c0, c1, c2, c3; c0 = *(c + 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); if (bk > 0) { BLASLONG i; FLOAT *aa = a, *bb = b; for (i = bk; i--;) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; c2 -= aa[2] * bb[0]; c3 -= aa[3] * bb[0]; aa += 4; bb += 1; } } a -= 16; b -= 4; a0 = *(a + 0); a4 = *(a + 4); a5 = *(a + 5); a8 = *(a + 8); a9 = *(a + 9); a10 = *(a + 10); a12 = *(a + 12); a13 = *(a + 13); a14 = *(a + 14); a15 = *(a + 15); c3 *= a15; c2 -= c3 * a14; c2 *= a10; c1 -= c3 * a13; c1 -= c2 * a9; c1 *= a5; c0 -= c3 * a12; c0 -= c2 * a8; c0 -= c1 * a4; c0 *= a0; *(b + 0) = c0; *(b + 1) = c1; *(b + 2) = c2; *(b + 3) = c3; *(c + 0) = c0; *(c + 1) = c1; *(c + 2) = c2; *(c + 3) = c3; } static void 
dsolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1; FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; c0 = *(c + 0); c1 = *(c + 1); c0_nxt1 = *(c + 0 + ldc); c1_nxt1 = *(c + 1 + ldc); c0_nxt2 = *(c + 0 + 2 * ldc); c1_nxt2 = *(c + 1 + 2 * ldc); c0_nxt3 = *(c + 0 + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); if (bk > 0) { BLASLONG i; FLOAT *aa = a, *bb = b; for (i = bk; i--;) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; c0_nxt1 -= aa[0] * bb[1]; c1_nxt1 -= aa[1] * bb[1]; c0_nxt2 -= aa[0] * bb[2]; c1_nxt2 -= aa[1] * bb[2]; c0_nxt3 -= aa[0] * bb[3]; c1_nxt3 -= aa[1] * bb[3]; aa += 2; bb += 4; } } a -= 4; b -= 8; a0 = *(a + 0); a2 = *(a + 2); a3 = *(a + 3); c1 *= a3; c0 -= c1 * a2; c0 *= a0; c1_nxt1 *= a3; c0_nxt1 -= c1_nxt1 * a2; c0_nxt1 *= a0; c1_nxt2 *= a3; c0_nxt2 -= c1_nxt2 * a2; c0_nxt2 *= a0; c1_nxt3 *= a3; c0_nxt3 -= c1_nxt3 * a2; c0_nxt3 *= a0; *(b + 0) = c0; *(b + 1) = c0_nxt1; *(b + 2) = c0_nxt2; *(b + 3) = c0_nxt3; *(b + 4) = c1; *(b + 5) = c1_nxt1; *(b + 6) = c1_nxt2; *(b + 7) = c1_nxt3; *(c + 0) = c0; *(c + 1) = c1; *(c + 0 + ldc) = c0_nxt1; *(c + 1 + ldc) = c1_nxt1; *(c + 0 + 2 * ldc) = c0_nxt2; *(c + 1 + 2 * ldc) = c1_nxt2; *(c + 0 + 3 * ldc) = c0_nxt3; *(c + 1 + 3 * ldc) = c1_nxt3; } static void dsolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { FLOAT a0, a2, a3, c0, c1, c0_nxt, c1_nxt; c0 = *(c + 0); c1 = *(c + 1); c0_nxt = *(c + 0 + ldc); c1_nxt = *(c + 1 + ldc); if (bk > 0) { BLASLONG i; FLOAT *aa = a, *bb = b; for (i = bk; i--;) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; c0_nxt -= aa[0] * bb[1]; c1_nxt -= aa[1] * bb[1]; aa += 2; bb += 2; } } a -= 4; b -= 4; a0 = *(a + 0); a2 = *(a + 2); a3 = *(a + 3); c1 *= a3; c0 -= c1 * a2; c0 *= a0; c1_nxt *= a3; c0_nxt -= c1_nxt * a2; c0_nxt *= a0; *(b + 0) = c0; *(b + 1) = c0_nxt; *(b + 2) = c1; *(b + 3) = c1_nxt; *(c + 0) = c0; *(c + 1) = c1; *(c + 0 + ldc) = c0_nxt; *(c + 1 + ldc) = c1_nxt; } static void dsolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { FLOAT a0, a2, a3, c0, c1; c0 = *(c + 0); c1 = *(c + 1); if (bk > 0) { BLASLONG i; FLOAT *aa = a, *bb = b; for (i = bk; i--;) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; aa += 2; bb += 1; } } a0 = *(a - 4); a2 = *(a - 2); a3 = *(a - 1); c1 *= a3; c0 -= c1 * a2; c0 *= a0; *(b - 2) = c0; *(b - 1) = c1; *(c + 0) = c0; *(c + 1) = c1; } static void dsolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { FLOAT c0, c1, c2, c3; c0 = *(c + 0); c1 = *(c + 1 * ldc); c2 = *(c + 2 * ldc); c3 = *(c + 3 * ldc); if (bk > 0) { BLASLONG i; FLOAT *aa = a, *bb = b; for (i = bk; i--;) { c0 -= aa[0] * bb[0]; c1 -= aa[0] * bb[1]; c2 -= aa[0] * bb[2]; c3 -= aa[0] * bb[3]; aa += 1; bb += 4; } } c0 *= *(a - 1); c1 *= *(a - 1); c2 *= *(a - 1); c3 *= *(a - 1); *(c + 0 * ldc) = c0; *(c + 1 * ldc) = c1; *(c + 2 * ldc) = c2; *(c + 3 * ldc) = c3; *(b - 4) = c0; *(b - 3) = c1; *(b - 2) = c2; *(b - 1) = c3; } static void dsolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { *c *= *a; *(c + ldc) = *a * *(c + ldc); *b = *c; *(b + 1) = *(c + ldc); } int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset) { BLASLONG kk, i, j; FLOAT *aa, *bb, *cc; for (j = (n >> 2); j--;) { kk = m + offset; if (m & 7) { if (m & 1) { aa = a + (m - 1) * k + kk; bb = b + 4 * kk; cc = c + (m - 1); dsolve_1x4_ln_msa(aa, bb, cc, ldc, (k - kk)); kk -= 1; } if (m & 2) { aa = a + ((m & -2) - 2) * k + 2 * kk; bb = b + 4 * kk; cc = c + ((m & -2) - 2); 
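/* M-dimension fringe handling: this LN variant walks the rows from the
   bottom of the block upwards, so the leftover 1-, 2- and 4-row tiles at the
   high end of m are solved first and kk is decremented before the full
   8-row panels run in the do/while loop below. */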
dsolve_2x4_ln_msa(aa, bb, cc, ldc, (k - kk)); kk -= 2; } if (m & 4) { aa = a + ((m & -4) - 4) * k + 4 * kk; bb = b + 4 * kk; cc = c + ((m & -4) - 4); dsolve_4x4_ln_msa(aa, bb, cc, ldc, (k - kk)); kk -= 4; } } i = (m >> 3); if (i > 0) { aa = a + ((m & -8) - 8) * k; cc = c + ((m & -8) - 8); do { dsolve_8x4_ln_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, (k - kk)); aa -= 8 * k; cc -= 8; kk -= 8; i --; } while (i > 0); } b += 4 * k; c += 4 * ldc; } if (n & 3) { if (n & 2) { kk = m + offset; if (m & 7) { if (m & 1) { aa = a + ((m & -1) - 1) * k; cc = c + ((m & -1) - 1); dsolve_1x2_ln_msa(aa + kk - 1, b + kk * 2 - 2, cc, ldc); kk -= 1; } if (m & 2) { aa = a + ((m & -2) - 2) * k; cc = c + ((m & -2) - 2); dsolve_2x2_ln_msa(aa + kk * 2, b + kk * 2, cc, ldc, (k - kk)); kk -= 2; } if (m & 4) { aa = a + ((m & -4) - 4) * k; cc = c + ((m & -4) - 4); dsolve_4x2_ln_msa(aa + kk * 4, b + kk * 2, cc, ldc, (k - kk)); kk -= 4; } } i = (m >> 3); if (i > 0) { aa = a + ((m & -8) - 8) * k; cc = c + ((m & -8) - 8); do { dsolve_8x2_ln_msa(aa + kk * 8, b + kk * 2, cc, ldc, (k - kk)); aa -= 8 * k; cc -= 8; kk -= 8; i --; } while (i > 0); } b += 2 * k; c += 2 * ldc; } if (n & 1) { kk = m + offset; if (m & 7) { if (m & 1) { kk -= 1; aa = a + ((m & -1) - 1) * k + kk; cc = c + ((m & -1) - 1); *cc *= *aa; *(b + kk) = *cc; } if (m & 2) { aa = a + ((m & -2) - 2) * k + kk * 2; cc = c + ((m & -2) - 2); dsolve_2x1_ln_msa(aa, b + kk, cc, (k - kk)); kk -= 2; } if (m & 4) { aa = a + ((m & -4) - 4) * k; cc = c + ((m & -4) - 4); dsolve_4x1_ln_msa(aa + 4 * kk, b + kk, cc, (k - kk)); kk -= 4; } } i = (m >> 3); if (i > 0) { aa = a + ((m & -8) - 8) * k; cc = c + ((m & -8) - 8); do { dsolve_8x1_ln_msa(aa + 8 * kk, b + kk, cc, (k - kk)); aa -= 8 * k; cc -= 8; kk -= 8; i --; } while (i > 0); } } } return 0; } OpenBLAS-0.2.20/kernel/mips/dtrsm_kernel_LT_8x4_msa.c000066400000000000000000001106261313527062700222010ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #include "common.h" #include "macros_msa.h" static __attribute__ ((noinline)) void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; v2f64 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15; v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; v2f64 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18; v2f64 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28; v2f64 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39; v2f64 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; a += bk * 8; PREF_OFFSET(a, 0); PREF_OFFSET(a, 32); PREF_OFFSET(a, 72); PREF_OFFSET(a, 104); PREF_OFFSET(a, 144); PREF_OFFSET(a, 176); PREF_OFFSET(a, 216); PREF_OFFSET(a, 248); PREF_OFFSET(a, 288); PREF_OFFSET(a, 360); PREF_OFFSET(a, 504); PREF_OFFSET(a, 432); a -= bk * 8; LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7); LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11); LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15); if (bk) { BLASLONG i, pref_offset; FLOAT *pa0_pref; v2f64 src_b, src_b0, src_b1; pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1); if (pref_offset) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } pa0_pref = a + pref_offset; for (i = (bk >> 1); i--;) { PREF_OFFSET(pa0_pref, 128); PREF_OFFSET(pa0_pref, 160); PREF_OFFSET(pa0_pref, 192); PREF_OFFSET(pa0_pref, 224); LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2_INC(b, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_c2 -= src_a2 * src_b; src_c3 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); src_c8 -= src_a0 * src_b; src_c9 -= src_a1 * src_b; src_c10 -= src_a2 * src_b; src_c11 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); src_c12 -= src_a0 * src_b; src_c13 -= src_a1 * src_b; src_c14 -= src_a2 * src_b; src_c15 -= src_a3 * src_b; LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2_INC(b, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_c2 -= src_a2 * src_b; src_c3 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); src_c8 -= src_a0 * src_b; src_c9 -= src_a1 * src_b; src_c10 -= src_a2 * src_b; src_c11 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); src_c12 -= src_a0 * src_b; src_c13 -= src_a1 * src_b; src_c14 -= src_a2 * src_b; src_c15 -= src_a3 * src_b; pa0_pref += 16; } if (bk & 1) { LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2_INC(b, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) 
src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_c2 -= src_a2 * src_b; src_c3 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); src_c8 -= src_a0 * src_b; src_c9 -= src_a1 * src_b; src_c10 -= src_a2 * src_b; src_c11 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); src_c12 -= src_a0 * src_b; src_c13 -= src_a1 * src_b; src_c14 -= src_a2 * src_b; src_c15 -= src_a3 * src_b; } } ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1); ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3); ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5); ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7); ILVRL_D2_DP(src_c12, src_c8, res_c8, res_c9); ILVRL_D2_DP(src_c13, src_c9, res_c10, res_c11); ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); src_a0 = LD_DP(a + 0); src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); src_a2 = LD_DP(a + 2); src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1); src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); res_c0 *= src_a0; res_c1 -= res_c0 * src_a1; res_c2 -= res_c0 * src_a2; res_c3 -= res_c0 * src_a3; res_c4 -= res_c0 * src_a4; res_c5 -= res_c0 * src_a5; res_c6 -= res_c0 * src_a6; res_c7 -= res_c0 * src_a7; res_c8 *= src_a0; res_c9 -= res_c8 * src_a1; res_c10 -= res_c8 * src_a2; res_c11 -= res_c8 * src_a3; res_c12 -= res_c8 * src_a4; res_c13 -= res_c8 * src_a5; res_c14 -= res_c8 * src_a6; res_c15 -= res_c8 * src_a7; src_a9 = __msa_cast_to_vector_double(*(a + 9)); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); src_a12 = LD_DP(a + 12); src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1); src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0); src_a14 = LD_DP(a + 14); src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1); src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0); res_c1 *= src_a9; res_c2 -= res_c1 * src_a10; res_c3 -= res_c1 * src_a11; res_c4 -= res_c1 * src_a12; res_c5 -= res_c1 * src_a13; res_c6 -= res_c1 * src_a14; res_c7 -= res_c1 * src_a15; res_c9 *= src_a9; res_c10 -= res_c9 * src_a10; res_c11 -= res_c9 * src_a11; res_c12 -= res_c9 * src_a12; res_c13 -= res_c9 * src_a13; res_c14 -= res_c9 * src_a14; res_c15 -= res_c9 * src_a15; ST_DP(res_c0, b + 0); ST_DP(res_c8, b + 2); ST_DP(res_c1, b + 4); ST_DP(res_c9, b + 6); ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4); ILVRL_D2_DP(res_c9, res_c8, src_c8, src_c12); ST_DP(src_c0, c); ST_DP(src_c4, c_nxt1line); ST_DP(src_c8, c_nxt2line); ST_DP(src_c12, c_nxt3line); src_a18 = LD_DP(a + 18); src_a19 = (v2f64) __msa_splati_d((v2i64) src_a18, 1); src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); src_a20 = LD_DP(a + 20); src_a21 = (v2f64) __msa_splati_d((v2i64) src_a20, 1); src_a20 = (v2f64) __msa_splati_d((v2i64) src_a20, 0); src_a22 = LD_DP(a + 22); src_a23 = (v2f64) __msa_splati_d((v2i64) src_a22, 1); src_a22 = (v2f64) __msa_splati_d((v2i64) src_a22, 0); res_c2 *= src_a18; res_c3 -= res_c2 * src_a19; res_c4 -= res_c2 * src_a20; 
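/* Forward-substitution order in dsolve_8x4_lt_msa: each row is scaled by its
   (assumed pre-inverted) diagonal entry and immediately folded into every row
   below it.  At this point row 2 (res_c2) has just been scaled by a[18] and
   its contribution is being removed from rows 3..7; the second column pair
   (res_c10 onwards) follows the same pattern right after. */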
res_c5 -= res_c2 * src_a21; res_c6 -= res_c2 * src_a22; res_c7 -= res_c2 * src_a23; res_c10 *= src_a18; res_c11 -= res_c10 * src_a19; res_c12 -= res_c10 * src_a20; res_c13 -= res_c10 * src_a21; res_c14 -= res_c10 * src_a22; res_c15 -= res_c10 * src_a23; src_a27 = __msa_cast_to_vector_double(*(a + 27)); src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); src_a28 = LD_DP(a + 28); src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); src_a30 = LD_DP(a + 30); src_a31 = (v2f64) __msa_splati_d((v2i64) src_a30, 1); src_a30 = (v2f64) __msa_splati_d((v2i64) src_a30, 0); res_c3 *= src_a27; res_c4 -= res_c3 * src_a28; res_c5 -= res_c3 * src_a29; res_c6 -= res_c3 * src_a30; res_c7 -= res_c3 * src_a31; res_c11 *= src_a27; res_c12 -= res_c11 * src_a28; res_c13 -= res_c11 * src_a29; res_c14 -= res_c11 * src_a30; res_c15 -= res_c11 * src_a31; ST_DP(res_c2, b + 8); ST_DP(res_c10, b + 10); ST_DP(res_c3, b + 12); ST_DP(res_c11, b + 14); ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5); ILVRL_D2_DP(res_c11, res_c10, src_c9, src_c13); src_a36 = LD_DP(a + 36); src_a37 = (v2f64) __msa_splati_d((v2i64) src_a36, 1); src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); src_a38 = LD_DP(a + 38); src_a39 = (v2f64) __msa_splati_d((v2i64) src_a38, 1); src_a38 = (v2f64) __msa_splati_d((v2i64) src_a38, 0); res_c4 *= src_a36; res_c5 -= res_c4 * src_a37; res_c6 -= res_c4 * src_a38; res_c7 -= res_c4 * src_a39; res_c12 *= src_a36; res_c13 -= res_c12 * src_a37; res_c14 -= res_c12 * src_a38; res_c15 -= res_c12 * src_a39; src_a45 = __msa_cast_to_vector_double(*(a + 45)); src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); src_a46 = LD_DP(a + 46); src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); res_c5 *= src_a45; res_c6 -= res_c5 * src_a46; res_c7 -= res_c5 * src_a47; res_c13 *= src_a45; res_c14 -= res_c13 * src_a46; res_c15 -= res_c13 * src_a47; ST_DP(src_c1, c + 2); ST_DP(src_c5, c_nxt1line + 2); ST_DP(src_c9, c_nxt2line + 2); ST_DP(src_c13, c_nxt3line + 2); ST_DP(res_c4, b + 16); ST_DP(res_c12, b + 18); ST_DP(res_c5, b + 20); ST_DP(res_c13, b + 22); ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); src_a63 = __msa_cast_to_vector_double(*(a + 63)); src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); src_a54 = LD_DP(a + 54); src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); res_c6 *= src_a54; res_c7 -= res_c6 * src_a55; res_c14 *= src_a54; res_c15 -= res_c14 * src_a55; res_c7 *= src_a63; res_c15 *= src_a63; ST_DP(src_c2, c + 4); ST_DP(src_c6, c_nxt1line + 4); ST_DP(src_c10, c_nxt2line + 4); ST_DP(src_c14, c_nxt3line + 4); ST_DP(res_c6, b + 24); ST_DP(res_c14, b + 26); ST_DP(res_c7, b + 28); ST_DP(res_c15, b + 30); ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7); ILVRL_D2_DP(res_c15, res_c14, src_c11, src_c15); ST_DP(src_c3, c + 6); ST_DP(src_c7, c_nxt1line + 6); ST_DP(src_c11, c_nxt2line + 6); ST_DP(src_c15, c_nxt3line + 6); } static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; v2f64 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18; v2f64 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28; v2f64 src_a29, src_a30, src_a31, src_a36, src_a37, 
src_a38, src_a39; v2f64 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63; LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7); if (bk) { BLASLONG i; v2f64 src_b, src_b0, src_b1; LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); src_b0 = LD_DP(b); a += 8; b += 2; for (i = (bk - 1); i--;) { LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7); src_b1 = LD_DP(b); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_c2 -= src_a2 * src_b; src_c3 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; src_a0 = src_a4; src_a1 = src_a5; src_a2 = src_a6; src_a3 = src_a7; src_b0 = src_b1; a += 8; b += 2; } src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_c2 -= src_a2 * src_b; src_c3 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; } ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1); ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3); ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5); ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7); src_a0 = LD_DP(a + 0); src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); src_a2 = LD_DP(a + 2); src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1); src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); res_c0 *= src_a0; res_c1 -= res_c0 * src_a1; res_c2 -= res_c0 * src_a2; res_c3 -= res_c0 * src_a3; res_c4 -= res_c0 * src_a4; res_c5 -= res_c0 * src_a5; res_c6 -= res_c0 * src_a6; res_c7 -= res_c0 * src_a7; src_a9 = __msa_cast_to_vector_double(*(a + 9)); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); src_a12 = LD_DP(a + 12); src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1); src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0); src_a14 = LD_DP(a + 14); src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1); src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0); res_c1 *= src_a9; res_c2 -= res_c1 * src_a10; res_c3 -= res_c1 * src_a11; res_c4 -= res_c1 * src_a12; res_c5 -= res_c1 * src_a13; res_c6 -= res_c1 * src_a14; res_c7 -= res_c1 * src_a15; src_a18 = LD_DP(a + 18); src_a19 = (v2f64) __msa_splati_d((v2i64) src_a18, 1); src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); src_a20 = LD_DP(a + 20); src_a21 = (v2f64) __msa_splati_d((v2i64) src_a20, 1); src_a20 = (v2f64) __msa_splati_d((v2i64) src_a20, 0); src_a22 = LD_DP(a + 22); src_a23 = (v2f64) __msa_splati_d((v2i64) src_a22, 1); src_a22 = (v2f64) __msa_splati_d((v2i64) src_a22, 0); res_c2 *= src_a18; res_c3 -= res_c2 * src_a19; res_c4 -= res_c2 * src_a20; res_c5 -= res_c2 * src_a21; res_c6 -= res_c2 * src_a22; res_c7 -= res_c2 * src_a23; src_a27 = __msa_cast_to_vector_double(*(a + 27)); src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); src_a28 = LD_DP(a + 28); src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); src_a30 = 
LD_DP(a + 30); src_a31 = (v2f64) __msa_splati_d((v2i64) src_a30, 1); src_a30 = (v2f64) __msa_splati_d((v2i64) src_a30, 0); res_c3 *= src_a27; res_c4 -= res_c3 * src_a28; res_c5 -= res_c3 * src_a29; res_c6 -= res_c3 * src_a30; res_c7 -= res_c3 * src_a31; ST_DP(res_c0, b + 0); ST_DP(res_c1, b + 2); ST_DP(res_c2, b + 4); ST_DP(res_c3, b + 6); ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4); ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5); ST_DP2(src_c0, src_c1, c, 2); ST_DP2(src_c4, src_c5, c + ldc, 2); src_a36 = LD_DP(a + 36); src_a37 = (v2f64) __msa_splati_d((v2i64) src_a36, 1); src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); src_a38 = LD_DP(a + 38); src_a39 = (v2f64) __msa_splati_d((v2i64) src_a38, 1); src_a38 = (v2f64) __msa_splati_d((v2i64) src_a38, 0); res_c4 *= src_a36; res_c5 -= res_c4 * src_a37; res_c6 -= res_c4 * src_a38; res_c7 -= res_c4 * src_a39; src_a45 = __msa_cast_to_vector_double(*(a + 45)); src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); src_a46 = LD_DP(a + 46); src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); res_c5 *= src_a45; res_c6 -= res_c5 * src_a46; res_c7 -= res_c5 * src_a47; src_a63 = __msa_cast_to_vector_double(*(a + 63)); src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); src_a54 = LD_DP(a + 54); src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); res_c6 *= src_a54; res_c7 -= res_c6 * src_a55; res_c7 *= src_a63; ST_DP(res_c4, b + 8); ST_DP(res_c5, b + 10); ST_DP(res_c6, b + 12); ST_DP(res_c7, b + 14); ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7); ST_DP2(src_c2, src_c3, c + 4, 2); ST_DP2(src_c6, src_c7, c + 4 + ldc, 2); } static void dsolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18; FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39; FLOAT a45, a46, a47, a54, a55, a63, c0, c1, c2, c3, c4, c5, c6, c7; c0 = *(c + 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); c4 = *(c + 4); c5 = *(c + 5); c6 = *(c + 6); c7 = *(c + 7); if (bk) { BLASLONG i; for (i = bk; i--; ) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; c2 -= a[2] * b[0]; c3 -= a[3] * b[0]; c4 -= a[4] * b[0]; c5 -= a[5] * b[0]; c6 -= a[6] * b[0]; c7 -= a[7] * b[0]; a += 8; b += 1; } } a0 = *(a + 0); a1 = *(a + 1); a2 = *(a + 2); a3 = *(a + 3); a4 = *(a + 4); a5 = *(a + 5); a6 = *(a + 6); a7 = *(a + 7); a9 = *(a + 9); a10 = *(a + 10); a11 = *(a + 11); a12 = *(a + 12); a13 = *(a + 13); a14 = *(a + 14); a15 = *(a + 15); a18 = *(a + 18); a19 = *(a + 19); a20 = *(a + 20); a21 = *(a + 21); a22 = *(a + 22); a23 = *(a + 23); a27 = *(a + 27); a28 = *(a + 28); a29 = *(a + 29); a30 = *(a + 30); a31 = *(a + 31); a36 = *(a + 36); a37 = *(a + 37); a38 = *(a + 38); a39 = *(a + 39); a45 = *(a + 45); a46 = *(a + 46); a47 = *(a + 47); a54 = *(a + 54); a55 = *(a + 55); a63 = *(a + 63); c0 *= a0; c1 -= c0 * a1; c1 *= a9; c2 -= c0 * a2; c2 -= c1 * a10; c2 *= a18; c3 -= c0 * a3; c3 -= c1 * a11; c3 -= c2 * a19; c3 *= a27; c4 -= c0 * a4; c4 -= c1 * a12; c4 -= c2 * a20; c4 -= c3 * a28; c4 *= a36; c5 -= c0 * a5; c5 -= c1 * a13; c5 -= c2 * a21; c5 -= c3 * a29; c5 -= c4 * a37; c5 *= a45; c6 -= c0 * a6; c6 -= c1 * a14; c6 -= c2 * a22; c6 -= c3 * a30; c6 -= c4 * a38; c6 -= c5 * a46; c6 *= a54; c7 -= c0 * a7; c7 -= c1 * a15; c7 -= c2 * a23; c7 -= c3 * a31; c7 -= c4 * a39; c7 -= c5 * a47; c7 -= c6 * a55; c7 *= a63; *(c + 0) = c0; *(c + 1) = c1; *(c + 2) = c2; *(c + 3) = c3; 
*(c + 4) = c4; *(c + 5) = c5; *(c + 6) = c6; *(c + 7) = c7; *(b + 0) = c0; *(b + 1) = c1; *(b + 2) = c2; *(b + 3) = c3; *(b + 4) = c4; *(b + 5) = c5; *(b + 6) = c6; *(b + 7) = c7; } static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; v2f64 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7; v2f64 src_a10, src_a11, src_a15; LD_DP2(c, 2, src_c0, src_c1); LD_DP2(c + ldc, 2, src_c2, src_c3); LD_DP2(c + 2 * ldc, 2, src_c4, src_c5); LD_DP2(c + 3 * ldc, 2, src_c6, src_c7); if (bk) { BLASLONG i; v2f64 src_a0, src_a1, src_b, src_b0, src_b1; for (i = bk; i--;) { LD_DP2(a, 2, src_a0, src_a1); LD_DP2(b, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c2 -= src_a0 * src_b; src_c3 -= src_a1 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); src_c6 -= src_a0 * src_b; src_c7 -= src_a1 * src_b; a += 4; b += 4; } } ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1); ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3); ILVRL_D2_DP(src_c6, src_c4, res_c4, res_c5); ILVRL_D2_DP(src_c7, src_c5, res_c6, res_c7); src_a0 = LD_DP(a + 0); src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); src_a2 = LD_DP(a + 2); src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1); src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0); res_c0 *= src_a0; res_c1 -= res_c0 * src_a1; res_c2 -= res_c0 * src_a2; res_c3 -= res_c0 * src_a3; res_c4 *= src_a0; res_c5 -= res_c4 * src_a1; res_c6 -= res_c4 * src_a2; res_c7 -= res_c4 * src_a3; src_a5 = __msa_cast_to_vector_double(*(a + 5)); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); res_c1 *= src_a5; res_c2 -= res_c1 * src_a6; res_c3 -= res_c1 * src_a7; res_c5 *= src_a5; res_c6 -= res_c5 * src_a6; res_c7 -= res_c5 * src_a7; src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); src_a15 = __msa_cast_to_vector_double(*(a + 15)); src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); res_c2 *= src_a10; res_c3 -= res_c2 * src_a11; res_c3 *= src_a15; res_c6 *= src_a10; res_c7 -= res_c6 * src_a11; res_c7 *= src_a15; ST_DP(res_c0, b + 0); ST_DP(res_c4, b + 2); ST_DP(res_c1, b + 4); ST_DP(res_c5, b + 6); ST_DP(res_c2, b + 8); ST_DP(res_c6, b + 10); ST_DP(res_c3, b + 12); ST_DP(res_c7, b + 14); ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2); ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3); ILVRL_D2_DP(res_c5, res_c4, src_c4, src_c6); ILVRL_D2_DP(res_c7, res_c6, src_c5, src_c7); ST_DP2(src_c0, src_c1, c, 2); ST_DP2(src_c2, src_c3, c + ldc, 2); ST_DP2(src_c4, src_c5, c + 2 * ldc, 2); ST_DP2(src_c6, src_c7, c + 3 * ldc, 2); } static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3; v2f64 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7; v2f64 src_a10, src_a11, src_a15; LD_DP2(c, 2, src_c0, src_c1); LD_DP2(c + ldc, 2, src_c2, src_c3); if (bk) { BLASLONG i; v2f64 src_a0, src_a1, src_b, src_b0; for (i = bk; i--;) { 
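/* Accumulation phase common to all of these micro-kernels: before the
   triangular solve, subtract the GEMM-style contribution of the bk panels of
   A and the already-solved rows of B produced by earlier iterations. */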
LD_DP2(a, 2, src_a0, src_a1); src_b0 = LD_DP(b); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c2 -= src_a0 * src_b; src_c3 -= src_a1 * src_b; a += 4; b += 2; } } ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1); ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3); src_a0 = LD_DP(a + 0); src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); src_a2 = LD_DP(a + 2); src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1); src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0); res_c0 *= src_a0; res_c1 -= res_c0 * src_a1; res_c2 -= res_c0 * src_a2; res_c3 -= res_c0 * src_a3; src_a5 = __msa_cast_to_vector_double(*(a + 5)); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); res_c1 *= src_a5; res_c2 -= res_c1 * src_a6; res_c3 -= res_c1 * src_a7; src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); src_a15 = __msa_cast_to_vector_double(*(a + 15)); src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); res_c2 *= src_a10; res_c3 -= res_c2 * src_a11; res_c3 *= src_a15; ST_DP(res_c0, b + 0); ST_DP(res_c1, b + 2); ST_DP(res_c2, b + 4); ST_DP(res_c3, b + 6); ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2); ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3); ST_DP2(src_c0, src_c1, c, 2); ST_DP2(src_c2, src_c3, c + ldc, 2); } static void dsolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15, c0, c1, c2, c3; c0 = *(c + 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); if (bk) { BLASLONG i; for (i = bk; i--;) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; c2 -= a[2] * b[0]; c3 -= a[3] * b[0]; a += 4; b += 1; } } a0 = *(a + 0); a1 = *(a + 1); a2 = *(a + 2); a3 = *(a + 3); a5 = *(a + 5); a6 = *(a + 6); a7 = *(a + 7); a10 = *(a + 10); a11 = *(a + 11); a15 = *(a + 15); c0 *= a0; c1 -= c0 * a1; c1 *= a5; c2 -= c0 * a2; c2 -= c1 * a6; c2 *= a10; c3 -= c0 * a3; c3 -= c1 * a7; c3 -= c2 * a11; c3 *= a15; *(b + 0) = c0; *(b + 1) = c1; *(b + 2) = c2; *(b + 3) = c3; *(c + 0) = c0; *(c + 1) = c1; *(c + 2) = c2; *(c + 3) = c3; } static void dsolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1; FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; c0 = *(c + 0); c1 = *(c + 1); c0_nxt1 = *(c + ldc); c1_nxt1 = *(c + 1 + ldc); c0_nxt2 = *(c + 2 * ldc); c1_nxt2 = *(c + 1 + 2 * ldc); c0_nxt3 = *(c + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); if (bk) { BLASLONG i; for (i = bk; i--;) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; c0_nxt1 -= a[0] * b[1]; c1_nxt1 -= a[1] * b[1]; c0_nxt2 -= a[0] * b[2]; c1_nxt2 -= a[1] * b[2]; c0_nxt3 -= a[0] * b[3]; c1_nxt3 -= a[1] * b[3]; a += 2; b += 4; } } a0 = *a; a1 = *(a + 1); a3 = *(a + 3); c0 *= a0; c1 -= c0 * a1; c1 *= a3; c0_nxt1 *= a0; c1_nxt1 -= c0_nxt1 * a1; c1_nxt1 *= a3; c0_nxt2 *= a0; c1_nxt2 -= c0_nxt2 * a1; c1_nxt2 *= a3; c0_nxt3 *= a0; c1_nxt3 -= c0_nxt3 * a1; c1_nxt3 *= a3; *(b + 0) = c0; *(b + 1) = c0_nxt1; *(b + 2) = c0_nxt2; *(b + 3) = c0_nxt3; *(b + 4) = c1; *(b + 5) = c1_nxt1; *(b + 6) = c1_nxt2; *(b + 7) = c1_nxt3; *(c + 0) = c0; *(c + 1) = c1; *(c + 0 + ldc) = c0_nxt1; *(c + 1 + ldc) = c1_nxt1; *(c + 0 + 2 * ldc) = c0_nxt2; *(c + 1 + 2 * ldc) = c1_nxt2; *(c + 0 + 3 * ldc) = c0_nxt3; *(c + 1 + 3 * ldc) = c1_nxt3; } static void 
dsolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { FLOAT a0, a1, a3, c0, c1, c0_nxt, c1_nxt; c0 = *(c + 0); c1 = *(c + 1); c0_nxt = *(c + ldc); c1_nxt = *(c + 1 + ldc); if (bk) { BLASLONG i; for (i = bk; i--;) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; c0_nxt -= a[0] * b[1]; c1_nxt -= a[1] * b[1]; a += 2; b += 2; } } a0 = *a; a1 = *(a + 1); a3 = *(a + 3); c0 *= a0; c1 -= c0 * a1; c1 *= a3; c0_nxt *= a0; c1_nxt -= c0_nxt * a1; c1_nxt *= a3; *(b + 0) = c0; *(b + 1) = c0_nxt; *(b + 2) = c1; *(b + 3) = c1_nxt; *(c + 0) = c0; *(c + 1) = c1; *(c + 0 + ldc) = c0_nxt; *(c + 1 + ldc) = c1_nxt; } static void dsolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { FLOAT a0, a1, a3, c0, c1; c0 = *(c + 0); c1 = *(c + 1); if (bk) { BLASLONG i; for (i = bk; i--;) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; a += 2; b += 1; } } a0 = *(a + 0); a1 = *(a + 1); a3 = *(a + 3); c0 *= a0; c1 -= c0 * a1; c1 *= a3; *(b + 0) = c0; *(b + 1) = c1; *(c + 0) = c0; *(c + 1) = c1; } static void dsolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { FLOAT c0, c1, c2, c3; c0 = *(c + 0); c1 = *(c + 1 * ldc); c2 = *(c + 2 * ldc); c3 = *(c + 3 * ldc); if (bk) { BLASLONG i; for (i = bk; i--;) { c0 -= a[0] * b[0]; c1 -= a[0] * b[1]; c2 -= a[0] * b[2]; c3 -= a[0] * b[3]; a += 1; b += 4; } } c0 *= *a; c1 *= *a; c2 *= *a; c3 *= *a; *(c + 0 * ldc) = c0; *(c + 1 * ldc) = c1; *(c + 2 * ldc) = c2; *(c + 3 * ldc) = c3; *(b + 0) = c0; *(b + 1) = c1; *(b + 2) = c2; *(b + 3) = c3; } static void dsolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { FLOAT c0, c1; c0 = *c; c1 = *(c + ldc); if (bk) { BLASLONG i; for (i = bk; i--;) { c0 -= *a * b[0]; c1 -= *a * b[1]; a += 1; b += 2; } } c0 *= *a; c1 *= *a; *(b + 0) = c0; *(b + 1) = c1; *(c + 0) = c0; *(c + ldc) = c1; } static void dgmm_dsolve_1x1_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { if (bk) { BLASLONG i; for (i = bk; i--;) { *c -= *a * *b; a += 1; b += 1; } } *c *= *a; *b = *c; } int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset) { BLASLONG i, j, kk; FLOAT *aa, *cc; for (j = (n >> 2); j--;) { kk = offset; aa = a; cc = c; for (i = (m >> 3); i--;) { dsolve_8x4_lt_msa(aa, b, cc, ldc, kk); aa += 8 * k; cc += 8; kk += 8; } if (m & 7) { if (m & 4) { dsolve_4x4_lt_msa(aa, b, cc, ldc, kk); aa += 4 * k; cc += 4; kk += 4; } if (m & 2) { dsolve_2x4_lt_msa(aa, b, cc, ldc, kk); aa += 2 * k; cc += 2; kk += 2; } if (m & 1) { dsolve_1x4_lt_msa(aa, b, cc, ldc, kk); aa += k; cc += 1; kk += 1; } } b += 4 * k; c += 4 * ldc; } if (n & 3) { if (n & 2) { kk = offset; aa = a; cc = c; for (i = (m >> 3); i--;) { dsolve_8x2_lt_msa(aa, b, cc, ldc, kk); aa += 8 * k; cc += 8; kk += 8; } if (m & 7) { if (m & 4) { dsolve_4x2_lt_msa(aa, b, cc, ldc, kk); aa += 4 * k; cc += 4; kk += 4; } if (m & 2) { dsolve_2x2_lt_msa(aa, b, cc, ldc, kk); aa += 2 * k; cc += 2; kk += 2; } if (m & 1) { dsolve_1x2_lt_msa(aa, b, cc, ldc, kk); aa += k; cc += 1; kk += 1; } } b += 2 * k; c += 2 * ldc; } if (n & 1) { kk = offset; aa = a; cc = c; for (i = (m >> 3); i--;) { dsolve_8x1_lt_msa(aa, b, cc, kk); aa += 8 * k; cc += 8; kk += 8; } if (m & 7) { if (m & 4) { dsolve_4x1_lt_msa(aa, b, cc, kk); aa += 4 * k; cc += 4; kk += 4; } if (m & 2) { dsolve_2x1_lt_msa(aa, b, cc, kk); aa += 2 * k; cc += 2; kk += 2; } if (m & 1) { dgmm_dsolve_1x1_msa(aa, b, cc, kk); aa += k; cc += 1; kk += 1; } } b += k; c += ldc; } } return 0; } 
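/* A minimal scalar sketch of the recurrence behind the vector solvers above,
   assuming (as the pack step presumably guarantees) that the diagonal entries
   of A are stored inverted.  For the 2x2 case handled by dsolve_2x2_lt_msa(),
   each right-hand side is solved as:

       c0 *= a[0];        // a[0] holds 1/a00
       c1 -= c0 * a[1];   // eliminate the a10 coupling
       c1 *= a[3];        // a[3] holds 1/a11

   The MSA versions interleave two right-hand sides per vector register and
   unroll the same recurrence for the 4x4 and 8x8 triangles. */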
OpenBLAS-0.2.20/kernel/mips/dtrsm_kernel_RN_8x4_msa.c000066400000000000000000000605641313527062700222060ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" static __attribute__ ((noinline)) void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; v2f64 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7; v2f64 src_b10, src_b11, src_b15; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7); LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11); LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15); if (bk) { BLASLONG i, pref_offset; FLOAT *pa0_pref; v2f64 src_a0, src_a1, src_a2, src_a3, src_b; pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1); if (pref_offset) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } pa0_pref = a + pref_offset; for (i = (bk >> 1); i--;) { PREF_OFFSET(pa0_pref, 128); PREF_OFFSET(pa0_pref, 160); PREF_OFFSET(pa0_pref, 192); PREF_OFFSET(pa0_pref, 224); LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2_INC(b, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_c2 -= src_a2 * src_b; src_c3 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); src_c8 -= src_a0 * src_b; src_c9 -= src_a1 * src_b; src_c10 -= src_a2 * src_b; src_c11 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, 
(v2i64) src_b1); src_c12 -= src_a0 * src_b; src_c13 -= src_a1 * src_b; src_c14 -= src_a2 * src_b; src_c15 -= src_a3 * src_b; LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2_INC(b, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_c2 -= src_a2 * src_b; src_c3 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); src_c8 -= src_a0 * src_b; src_c9 -= src_a1 * src_b; src_c10 -= src_a2 * src_b; src_c11 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); src_c12 -= src_a0 * src_b; src_c13 -= src_a1 * src_b; src_c14 -= src_a2 * src_b; src_c15 -= src_a3 * src_b; pa0_pref += 16; } if (bk & 1) { LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2_INC(b, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_c2 -= src_a2 * src_b; src_c3 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); src_c8 -= src_a0 * src_b; src_c9 -= src_a1 * src_b; src_c10 -= src_a2 * src_b; src_c11 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); src_c12 -= src_a0 * src_b; src_c13 -= src_a1 * src_b; src_c14 -= src_a2 * src_b; src_c15 -= src_a3 * src_b; } } src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); src_b5 = __msa_cast_to_vector_double(*(b + 5)); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); src_b6 = LD_DP(b + 6); src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); src_b10 = LD_DP(b + 10); src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); src_b15 = __msa_cast_to_vector_double(*(b + 15)); src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); src_c0 *= src_b0; src_c1 *= src_b0; src_c2 *= src_b0; src_c3 *= src_b0; src_c4 -= src_c0 * src_b1; src_c5 -= src_c1 * src_b1; src_c6 -= src_c2 * src_b1; src_c7 -= src_c3 * src_b1; src_c4 *= src_b5; src_c5 *= src_b5; src_c6 *= src_b5; src_c7 *= src_b5; src_c8 -= src_c0 * src_b2; src_c9 -= src_c1 * src_b2; src_c10 -= src_c2 * src_b2; src_c11 -= src_c3 * src_b2; src_c8 -= src_c4 * src_b6; src_c9 -= src_c5 * src_b6; src_c10 -= src_c6 * src_b6; src_c11 -= src_c7 * src_b6; src_c8 *= src_b10; src_c9 *= src_b10; src_c10 *= src_b10; src_c11 *= src_b10; src_c12 -= src_c0 * src_b3; src_c13 -= src_c1 * src_b3; src_c14 -= src_c2 * src_b3; src_c15 -= src_c3 * src_b3; src_c12 -= src_c4 * src_b7; src_c13 -= src_c5 * src_b7; src_c14 -= src_c6 * src_b7; src_c15 -= src_c7 * src_b7; src_c12 -= src_c8 * src_b11; src_c13 -= src_c9 * src_b11; src_c14 -= src_c10 * src_b11; src_c15 -= src_c11 * src_b11; src_c12 *= src_b15; src_c13 *= src_b15; src_c14 *= src_b15; src_c15 *= src_b15; ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); ST_DP4(src_c4, src_c5, src_c6, src_c7, c_nxt1line, 2); ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 
2); ST_DP4(src_c8, src_c9, src_c10, src_c11, c_nxt2line, 2); ST_DP4(src_c8, src_c9, src_c10, src_c11, a + 16, 2); ST_DP4(src_c12, src_c13, src_c14, src_c15, c_nxt3line, 2); ST_DP4(src_c12, src_c13, src_c14, src_c15, a + 24, 2); } static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v2f64 src_b0, src_b1, src_b3, src_b; LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7); if (bk) { BLASLONG i; v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); src_b0 = LD_DP(b); a += 8; b += 2; for (i = (bk - 1); i--;) { LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7); src_b1 = LD_DP(b); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_c2 -= src_a2 * src_b; src_c3 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; src_a0 = src_a4; src_a1 = src_a5; src_a2 = src_a6; src_a3 = src_a7; src_b0 = src_b1; a += 8; b += 2; } src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_c2 -= src_a2 * src_b; src_c3 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; } src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); src_b3 = __msa_cast_to_vector_double(*(b + 3)); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); src_c0 *= src_b0; src_c1 *= src_b0; src_c2 *= src_b0; src_c3 *= src_b0; src_c4 -= src_c0 * src_b1; src_c5 -= src_c1 * src_b1; src_c6 -= src_c2 * src_b1; src_c7 -= src_c3 * src_b1; src_c4 *= src_b3; src_c5 *= src_b3; src_c6 *= src_b3; src_c7 *= src_b3; ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2); ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); } static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3; v2f64 src_b0; LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); if (bk) { BLASLONG i; v2f64 src_a0, src_a1, src_a2, src_a3, src_b; for (i = bk; i--;) { LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); src_b = LD_DP(b); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b, (v2i64) src_b); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_c2 -= src_a2 * src_b; src_c3 -= src_a3 * src_b; a += 8; b += 1; } } src_b0 = __msa_cast_to_vector_double(*b); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); src_c0 *= src_b0; src_c1 *= src_b0; src_c2 *= src_b0; src_c3 *= src_b0; ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); } static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v2f64 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7; v2f64 src_b10, src_b11, src_b15; LD_DP2(c, 2, src_c0, src_c1); LD_DP2(c + ldc, 2, src_c2, src_c3); LD_DP2(c + 2 * ldc, 2, src_c4, src_c5); LD_DP2(c + 3 * ldc, 2, src_c6, src_c7); if (bk) { BLASLONG i; v2f64 src_a0, src_a1, src_b, src_b0, src_b1; for (i = bk; i--;) { LD_DP2(a, 2, src_a0, src_a1); LD_DP2(b, 2, src_b0, src_b1); 
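/* One k-step of the pre-solve update: src_a0/src_a1 hold four entries of a
   column of the packed A panel, src_b0/src_b1 four entries of a row of the
   packed B panel; each B value is broadcast to both lanes (ilvr_d/ilvl_d of
   a register with itself) and the rank-1 product is subtracted from the
   column accumulators src_c0..src_c7. */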
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c2 -= src_a0 * src_b; src_c3 -= src_a1 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); src_c6 -= src_a0 * src_b; src_c7 -= src_a1 * src_b; a += 4; b += 4; } } src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); src_b5 = __msa_cast_to_vector_double(*(b + 5)); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); src_b6 = LD_DP(b + 6); src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); src_b10 = LD_DP(b + 10); src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); src_b15 = __msa_cast_to_vector_double(*(b + 15)); src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); src_c0 *= src_b0; src_c1 *= src_b0; src_c2 -= src_c0 * src_b1; src_c3 -= src_c1 * src_b1; src_c2 *= src_b5; src_c3 *= src_b5; src_c4 -= src_c0 * src_b2; src_c5 -= src_c1 * src_b2; src_c4 -= src_c2 * src_b6; src_c5 -= src_c3 * src_b6; src_c4 *= src_b10; src_c5 *= src_b10; src_c6 -= src_c0 * src_b3; src_c7 -= src_c1 * src_b3; src_c6 -= src_c2 * src_b7; src_c7 -= src_c3 * src_b7; src_c6 -= src_c4 * src_b11; src_c7 -= src_c5 * src_b11; src_c6 *= src_b15; src_c7 *= src_b15; ST_DP2(src_c0, src_c1, c, 2); ST_DP2(src_c2, src_c3, c + ldc, 2); ST_DP2(src_c4, src_c5, c + 2 * ldc, 2); ST_DP2(src_c6, src_c7, c + 3 * ldc, 2); ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); } static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b3; LD_DP2(c, 2, src_c0, src_c1); LD_DP2(c + ldc, 2, src_c2, src_c3); if (bk) { BLASLONG i; v2f64 src_a0, src_a1, src_b, src_b0; for (i = bk; i--;) { LD_DP2(a, 2, src_a0, src_a1); src_b0 = LD_DP(b); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c2 -= src_a0 * src_b; src_c3 -= src_a1 * src_b; a += 4; b += 2; } } src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); src_b3 = __msa_cast_to_vector_double(*(b + 3)); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); src_c0 *= src_b0; src_c1 *= src_b0; src_c2 -= src_c0 * src_b1; src_c3 -= src_c1 * src_b1; src_c2 *= src_b3; src_c3 *= src_b3; ST_DP2(src_c0, src_c1, c, 2); ST_DP2(src_c2, src_c3, c + ldc, 2); ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); } static void dsolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { FLOAT c0, c1, c2, c3; c0 = *(c + 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); if (bk) { BLASLONG i; for (i = bk; i--;) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; c2 -= a[2] * b[0]; c3 -= a[3] * b[0]; a += 4; b += 1; } } c0 *= *b; c1 *= *b; c2 *= *b; c3 *= *b; *(a + 0) = c0; *(a + 1) = c1; *(a + 2) = c2; *(a + 3) = c3; *(c + 0) = c0; *(c + 1) = c1; *(c + 2) = c2; *(c + 3) = c3; } static void dsolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { FLOAT b0, b1, b2, b3, b5, b6, b7, 
b10, b11, b15; FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3; FLOAT c1, c1_nxt1, c1_nxt2, c1_nxt3; c0 = *(c + 0); c1 = *(c + 1); c0_nxt1 = *(c + 0 + 1 * ldc); c1_nxt1 = *(c + 1 + 1 * ldc); c0_nxt2 = *(c + 0 + 2 * ldc); c1_nxt2 = *(c + 1 + 2 * ldc); c0_nxt3 = *(c + 0 + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); if (bk) { BLASLONG i; for (i = bk; i--;) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; c0_nxt1 -= a[0] * b[1]; c1_nxt1 -= a[1] * b[1]; c0_nxt2 -= a[0] * b[2]; c1_nxt2 -= a[1] * b[2]; c0_nxt3 -= a[0] * b[3]; c1_nxt3 -= a[1] * b[3]; a += 2; b += 4; } } b0 = *(b + 0); b1 = *(b + 1); b2 = *(b + 2); b3 = *(b + 3); b5 = *(b + 5); b6 = *(b + 6); b7 = *(b + 7); b10 = *(b + 10); b11 = *(b + 11); b15 = *(b + 15); c0 *= b0; c1 *= b0; c0_nxt1 -= c0 * b1; c1_nxt1 -= c1 * b1; c0_nxt1 *= b5; c1_nxt1 *= b5; c0_nxt2 -= c0 * b2; c1_nxt2 -= c1 * b2; c0_nxt2 -= c0_nxt1 * b6; c1_nxt2 -= c1_nxt1 * b6; c0_nxt2 *= b10; c1_nxt2 *= b10; c0_nxt3 -= c0 * b3; c1_nxt3 -= c1 * b3; c0_nxt3 -= c0_nxt1 * b7; c1_nxt3 -= c1_nxt1 * b7; c0_nxt3 -= c0_nxt2 * b11; c1_nxt3 -= c1_nxt2 * b11; c0_nxt3 *= b15; c1_nxt3 *= b15; *(a + 0) = c0; *(a + 1) = c1; *(a + 2) = c0_nxt1; *(a + 3) = c1_nxt1; *(a + 4) = c0_nxt2; *(a + 5) = c1_nxt2; *(a + 6) = c0_nxt3; *(a + 7) = c1_nxt3; *(c + 0) = c0; *(c + 1) = c1; *(c + 1 * ldc) = c0_nxt1; *(c + 1 + 1 * ldc) = c1_nxt1; *(c + 2 * ldc) = c0_nxt2; *(c + 1 + 2 * ldc) = c1_nxt2; *(c + 3 * ldc) = c0_nxt3; *(c + 1 + 3 * ldc) = c1_nxt3; } static void dsolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { FLOAT b0, b1, b3, c0, c0_nxt, c1, c1_nxt; c0 = *(c + 0); c1 = *(c + 1); c0_nxt = *(c + 0 + ldc); c1_nxt = *(c + 1 + ldc); if (bk) { BLASLONG i; for (i = bk; i--;) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; c0_nxt -= a[0] * b[1]; c1_nxt -= a[1] * b[1]; a += 2; b += 2; } } b0 = *(b + 0); b1 = *(b + 1); b3 = *(b + 3); c0 *= b0; c1 *= b0; c0_nxt -= c0 * b1; c1_nxt -= c1 * b1; c0_nxt *= b3; c1_nxt *= b3; *(a + 0) = c0; *(a + 1) = c1; *(a + 2) = c0_nxt; *(a + 3) = c1_nxt; *(c + 0) = c0; *(c + 1) = c1; *(c + 0 + ldc) = c0_nxt; *(c + 1 + ldc) = c1_nxt; } static void dsolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { FLOAT b0, c0, c1; c0 = *(c + 0); c1 = *(c + 1); if (bk) { BLASLONG i; for (i = bk; i--;) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; a += 2; b += 1; } } b0 = *b; c0 *= b0; c1 *= b0; *(a + 0) = c0; *(a + 1) = c1; *(c + 0) = c0; *(c + 1) = c1; } static void dsolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1, c2, c3; c0 = *(c + 0); c1 = *(c + 1 * ldc); c2 = *(c + 2 * ldc); c3 = *(c + 3 * ldc); if (bk) { BLASLONG i; for (i = bk; i--;) { c0 -= a[0] * b[0]; c1 -= a[0] * b[1]; c2 -= a[0] * b[2]; c3 -= a[0] * b[3]; a += 1; b += 4; } } b0 = *(b + 0); b1 = *(b + 1); b2 = *(b + 2); b3 = *(b + 3); b5 = *(b + 5); b6 = *(b + 6); b7 = *(b + 7); b10 = *(b + 10); b11 = *(b + 11); b15 = *(b + 15); c0 *= b0; c1 -= c0 * b1; c1 *= b5; c2 -= c0 * b2; c2 -= c1 * b6; c2 *= b10; c3 -= c0 * b3; c3 -= c1 * b7; c3 -= c2 * b11; c3 *= b15; *(a + 0) = c0; *(a + 1) = c1; *(a + 2) = c2; *(a + 3) = c3; *(c + 0) = c0; *(c + 1 * ldc) = c1; *(c + 2 * ldc) = c2; *(c + 3 * ldc) = c3; } static void dsolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { FLOAT b0, b1, b3, c0, c1; c0 = *c; c1 = *(c + ldc); if (bk) { BLASLONG i; for (i = bk; i--;) { c0 -= *a * b[0]; c1 -= *a * b[1]; a += 1; b += 2; } } b0 = *(b + 0); b1 = *(b + 1); b3 = *(b + 3); c0 *= b0; c1 -= c0 * b1; c1 *= b3; *(a + 0) = c0; *(a + 1) = c1; *(c + 0) = c0; 
*(c + ldc) = c1; } static void dgmm_dsolve_1x1_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { if (bk) { BLASLONG i; for (i = bk; i--;) { *c -= *a * *b; a += 1; b += 1; } } *c *= *a; *b = *c; } int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset) { BLASLONG i, j, kk; FLOAT *aa, *cc; kk = -offset; for (j = (n >> 2); j--;) { aa = a; cc = c; for (i = (m >> 3); i--;) { dsolve_8x4_rn_msa(aa, b, cc, ldc, kk); aa += 8 * k; cc += 8; } if (m & 7) { if (m & 4) { dsolve_4x4_rn_msa(aa, b, cc, ldc, kk); aa += 4 * k; cc += 4; } if (m & 2) { dsolve_2x4_rn_msa(aa, b, cc, ldc, kk); aa += 2 * k; cc += 2; } if (m & 1) { dsolve_1x4_rn_msa(aa, b, cc, ldc, kk); aa += k; cc += 1; } } kk += 4; b += 4 * k; c += 4 * ldc; } if (n & 3) { if (n & 2) { aa = a; cc = c; for (i = (m >> 3); i--;) { dsolve_8x2_rn_msa(aa, b, cc, ldc, kk); aa += 8 * k; cc += 8; } if (m & 7) { if (m & 4) { dsolve_4x2_rn_msa(aa, b, cc, ldc, kk); aa += 4 * k; cc += 4; } if (m & 2) { dsolve_2x2_rn_msa(aa, b, cc, ldc, kk); aa += 2 * k; cc += 2; } if (m & 1) { dsolve_1x2_rn_msa(aa, b, cc, ldc, kk); aa += k; cc += 1; } } b += 2 * k; c += 2 * ldc; kk += 2; } if (n & 1) { aa = a; cc = c; for (i = (m >> 3); i--;) { dsolve_8x1_rn_msa(aa, b, cc, kk); aa += 8 * k; cc += 8; } if (m & 7) { if (m & 4) { dsolve_4x1_rn_msa(aa, b, cc, kk); aa += 4 * k; cc += 4; } if (m & 2) { dsolve_2x1_rn_msa(aa, b, cc, kk); aa += 2 * k; cc += 2; } if (m & 1) { dgmm_dsolve_1x1_msa(b, aa, cc, kk); aa += k; cc += 1; } } b += k; c += ldc; kk += 1; } } return 0; } OpenBLAS-0.2.20/kernel/mips/dtrsm_kernel_RT_8x4_msa.c000066400000000000000000000637121313527062700222120ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #include "common.h" #include "macros_msa.h" static __attribute__ ((noinline)) void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; v2f64 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13; v2f64 src_b14, src_b15; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7); LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11); LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15); if (bk > 0) { BLASLONG i, pref_offset; FLOAT *pba = a, *pbb = b, *pa0_pref; v2f64 src_b, src_b0, src_b1, src_a0, src_a1, src_a2, src_a3; pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1); if (pref_offset) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } pa0_pref = a + pref_offset; for (i = (bk >> 1); i--;) { PREF_OFFSET(pa0_pref, 128); PREF_OFFSET(pa0_pref, 160); PREF_OFFSET(pa0_pref, 192); PREF_OFFSET(pa0_pref, 224); LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2_INC(pbb, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_c2 -= src_a2 * src_b; src_c3 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); src_c8 -= src_a0 * src_b; src_c9 -= src_a1 * src_b; src_c10 -= src_a2 * src_b; src_c11 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); src_c12 -= src_a0 * src_b; src_c13 -= src_a1 * src_b; src_c14 -= src_a2 * src_b; src_c15 -= src_a3 * src_b; LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2_INC(pbb, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_c2 -= src_a2 * src_b; src_c3 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); src_c8 -= src_a0 * src_b; src_c9 -= src_a1 * src_b; src_c10 -= src_a2 * src_b; src_c11 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); src_c12 -= src_a0 * src_b; src_c13 -= src_a1 * src_b; src_c14 -= src_a2 * src_b; src_c15 -= src_a3 * src_b; pa0_pref += 16; } if (bk & 1) { LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3); LD_DP2_INC(pbb, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_c2 -= src_a2 * src_b; src_c3 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); src_c8 -= src_a0 * src_b; src_c9 -= src_a1 * src_b; src_c10 -= src_a2 * src_b; src_c11 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); src_c12 -= src_a0 * src_b; src_c13 -= src_a1 * src_b; src_c14 -= src_a2 * src_b; src_c15 -= 
src_a3 * src_b; } } a -= 32; b -= 16; src_b12 = LD_DP(b + 12); src_b13 = (v2f64) __msa_splati_d((v2i64) src_b12, 1); src_b12 = (v2f64) __msa_splati_d((v2i64) src_b12, 0); src_b14 = LD_DP(b + 14); src_b15 = (v2f64) __msa_splati_d((v2i64) src_b14, 1); src_b14 = (v2f64) __msa_splati_d((v2i64) src_b14, 0); src_b8 = LD_DP(b + 8); src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); src_b10 = __msa_cast_to_vector_double(*(b + 10)); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); src_b0 = __msa_cast_to_vector_double(*(b + 0)); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); src_b4 = LD_DP(b + 4); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); src_c12 *= src_b15; src_c13 *= src_b15; src_c14 *= src_b15; src_c15 *= src_b15; src_c8 -= src_c12 * src_b14; src_c9 -= src_c13 * src_b14; src_c10 -= src_c14 * src_b14; src_c11 -= src_c15 * src_b14; src_c8 *= src_b10; src_c9 *= src_b10; src_c10 *= src_b10; src_c11 *= src_b10; src_c4 -= src_c12 * src_b13; src_c5 -= src_c13 * src_b13; src_c6 -= src_c14 * src_b13; src_c7 -= src_c15 * src_b13; src_c4 -= src_c8 * src_b9; src_c5 -= src_c9 * src_b9; src_c6 -= src_c10 * src_b9; src_c7 -= src_c11 * src_b9; src_c4 *= src_b5; src_c5 *= src_b5; src_c6 *= src_b5; src_c7 *= src_b5; src_c0 -= src_c12 * src_b12; src_c1 -= src_c13 * src_b12; src_c2 -= src_c14 * src_b12; src_c3 -= src_c15 * src_b12; src_c0 -= src_c8 * src_b8; src_c1 -= src_c9 * src_b8; src_c2 -= src_c10 * src_b8; src_c3 -= src_c11 * src_b8; src_c0 -= src_c4 * src_b4; src_c1 -= src_c5 * src_b4; src_c2 -= src_c6 * src_b4; src_c3 -= src_c7 * src_b4; src_c0 *= src_b0; src_c1 *= src_b0; src_c2 *= src_b0; src_c3 *= src_b0; ST_DP4(src_c12, src_c13, src_c14, src_c15, c_nxt3line, 2); ST_DP4(src_c12, src_c13, src_c14, src_c15, a + 24, 2); ST_DP4(src_c8, src_c9, src_c10, src_c11, c_nxt2line, 2); ST_DP4(src_c8, src_c9, src_c10, src_c11, a + 16, 2); ST_DP4(src_c4, src_c5, src_c6, src_c7, c_nxt1line, 2); ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); } static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v2f64 src_b0, src_b2, src_b3; LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7); if (bk > 0) { BLASLONG i; FLOAT *pba = a, *pbb = b; v2f64 src_b, src_b1, src_a0, src_a1, src_a2, src_a3; v2f64 src_a4, src_a5, src_a6, src_a7; LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3); src_b0 = LD_DP(pbb); for (i = bk - 1; i--;) { pba += 8; pbb += 2; LD_DP4(pba, 2, src_a4, src_a5, src_a6, src_a7); src_b1 = LD_DP(pbb); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_c2 -= src_a2 * src_b; src_c3 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; src_a0 = src_a4; src_a1 = src_a5; src_a2 = src_a6; src_a3 = src_a7; src_b0 = src_b1; } src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_c2 -= src_a2 * src_b; src_c3 -= src_a3 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_c6 -= src_a2 * src_b; src_c7 -= src_a3 * src_b; } a -= 16; b -= 4; src_b0 
= __msa_cast_to_vector_double(*(b + 0)); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); src_c4 *= src_b3; src_c5 *= src_b3; src_c6 *= src_b3; src_c7 *= src_b3; src_c0 -= src_c4 * src_b2; src_c1 -= src_c5 * src_b2; src_c2 -= src_c6 * src_b2; src_c3 -= src_c7 * src_b2; src_c0 *= src_b0; src_c1 *= src_b0; src_c2 *= src_b0; src_c3 *= src_b0; ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2); ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); } static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3; v2f64 src_b0; LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); if (bk > 0) { BLASLONG i; FLOAT *aa = a, *bb = b; v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; v2f64 src_b1; LD_DP4(aa, 2, src_a0, src_a1, src_a2, src_a3); src_b0 = LD_DP(bb); aa += 8; bb += 1; for (i = (bk - 1); i--;) { LD_DP4(aa, 2, src_a4, src_a5, src_a6, src_a7); src_b1 = LD_DP(bb); src_b0 = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a2 * src_b0; src_c3 -= src_a3 * src_b0; src_a0 = src_a4; src_a1 = src_a5; src_a2 = src_a6; src_a3 = src_a7; src_b0 = src_b1; aa += 8; bb += 1; } src_b0 = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a2 * src_b0; src_c3 -= src_a3 * src_b0; } a -= 8; b -= 1; src_b0 = __msa_cast_to_vector_double(*b); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); src_c0 *= src_b0; src_c1 *= src_b0; src_c2 *= src_b0; src_c3 *= src_b0; ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); } static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v2f64 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13; v2f64 src_b14, src_b15; LD_DP2(c, 2, src_c0, src_c1); LD_DP2(c + ldc, 2, src_c2, src_c3); LD_DP2(c + 2 * ldc, 2, src_c4, src_c5); LD_DP2(c + 3 * ldc, 2, src_c6, src_c7); if (bk > 0) { BLASLONG i; FLOAT *aa = a, *bb = b; v2f64 src_a0, src_a1, src_b, src_b0, src_b1; for (i = bk; i--;) { LD_DP2(aa, 2, src_a0, src_a1); LD_DP2(bb, 2, src_b0, src_b1); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c2 -= src_a0 * src_b; src_c3 -= src_a1 * src_b; src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); src_c4 -= src_a0 * src_b; src_c5 -= src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); src_c6 -= src_a0 * src_b; src_c7 -= src_a1 * src_b; aa += 4; bb += 4; } } a -= 16; b -= 16; src_b12 = LD_DP(b + 12); src_b13 = (v2f64) __msa_splati_d((v2i64) src_b12, 1); src_b12 = (v2f64) __msa_splati_d((v2i64) src_b12, 0); src_b14 = LD_DP(b + 14); src_b15 = (v2f64) __msa_splati_d((v2i64) src_b14, 1); src_b14 = (v2f64) __msa_splati_d((v2i64) src_b14, 0); src_b8 = LD_DP(b + 8); src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); src_b10 = __msa_cast_to_vector_double(*(b + 10)); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); src_b0 = __msa_cast_to_vector_double(*(b + 0)); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); 
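/* b was rewound by 16 above and now points at the packed 4x4 triangular
   block; b0, b5, b10 and b15 are its diagonal entries (used multiplicatively,
   i.e. stored pre-inverted), and the back-substitution below runs from
   column 3 of the C tile down to column 0. */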
src_b4 = LD_DP(b + 4); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); src_c6 *= src_b15; src_c7 *= src_b15; src_c4 -= src_c6 * src_b14; src_c5 -= src_c7 * src_b14; src_c4 *= src_b10; src_c5 *= src_b10; src_c2 -= src_c6 * src_b13; src_c3 -= src_c7 * src_b13; src_c2 -= src_c4 * src_b9; src_c3 -= src_c5 * src_b9; src_c2 *= src_b5; src_c3 *= src_b5; src_c0 -= src_c6 * src_b12; src_c1 -= src_c7 * src_b12; src_c0 -= src_c4 * src_b8; src_c1 -= src_c5 * src_b8; src_c0 -= src_c2 * src_b4; src_c1 -= src_c3 * src_b4; src_c0 *= src_b0; src_c1 *= src_b0; ST_DP2(src_c6, src_c7, c + 3 * ldc, 2); ST_DP2(src_c4, src_c5, c + 2 * ldc, 2); ST_DP2(src_c2, src_c3, c + ldc, 2); ST_DP2(src_c0, src_c1, c, 2); ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); } static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v2f64 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3; LD_DP2(c, 2, src_c0, src_c1); LD_DP2(c + ldc, 2, src_c2, src_c3); if (bk > 0) { BLASLONG i; FLOAT *aa = a, *bb = b; v2f64 src_a0, src_a1, src_b, src_b0; for (i = bk; i--;) { LD_DP2(aa, 2, src_a0, src_a1); src_b0 = LD_DP(bb); src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); src_c0 -= src_a0 * src_b; src_c1 -= src_a1 * src_b; src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); src_c2 -= src_a0 * src_b; src_c3 -= src_a1 * src_b; aa += 4; bb += 2; } } a -= 8; b -= 4; src_b0 = __msa_cast_to_vector_double(*(b + 0)); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); src_c2 *= src_b3; src_c3 *= src_b3; src_c0 -= src_c2 * src_b2; src_c1 -= src_c3 * src_b2; src_c0 *= src_b0; src_c1 *= src_b0; ST_DP2(src_c0, src_c1, c, 2); ST_DP2(src_c2, src_c3, c + ldc, 2); ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); } static void dsolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { FLOAT b0, c0, c1, c2, c3; c0 = *(c + 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); if (bk > 0) { BLASLONG i; FLOAT *aa = a, *bb = b; for (i = bk; i--;) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; c2 -= aa[2] * bb[0]; c3 -= aa[3] * bb[0]; aa += 4; bb += 1; } } a -= 4; b0 = *(b - 1); c0 *= b0; c1 *= b0; c2 *= b0; c3 *= b0; *(a + 0) = c0; *(a + 1) = c1; *(a + 2) = c2; *(a + 3) = c3; *(c + 0) = c0; *(c + 1) = c1; *(c + 2) = c2; *(c + 3) = c3; } static void dsolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15; FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; c0 = *(c + 0); c1 = *(c + 1); c0_nxt1 = *(c + 0 + 1 * ldc); c1_nxt1 = *(c + 1 + 1 * ldc); c0_nxt2 = *(c + 0 + 2 * ldc); c1_nxt2 = *(c + 1 + 2 * ldc); c0_nxt3 = *(c + 0 + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); if (bk > 0) { BLASLONG i; FLOAT *aa = a, *bb = b; for (i = bk; i--;) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; c0_nxt1 -= aa[0] * bb[1]; c1_nxt1 -= aa[1] * bb[1]; c0_nxt2 -= aa[0] * bb[2]; c1_nxt2 -= aa[1] * bb[2]; c0_nxt3 -= aa[0] * bb[3]; c1_nxt3 -= aa[1] * bb[3]; aa += 2; bb += 4; } } a -= 8; b -= 16; b0 = *b; b4 = *(b + 4); b5 = *(b + 5); b8 = *(b + 8); b9 = *(b + 9); b10 = *(b + 10); b12 = *(b + 12); b13 = *(b + 13); b14 = *(b + 14); b15 = *(b + 15); c0_nxt3 *= b15; c1_nxt3 *= b15; c0_nxt2 -= c0_nxt3 * b14; c1_nxt2 -= c1_nxt3 * b14; c0_nxt2 *= b10; c1_nxt2 *= b10; c0_nxt1 -= c0_nxt3 * b13; c1_nxt1 -= c1_nxt3 * b13; c0_nxt1 -= c0_nxt2 * b9; 
c1_nxt1 -= c1_nxt2 * b9; c0_nxt1 *= b5; c1_nxt1 *= b5; c0 -= c0_nxt3 * b12; c1 -= c1_nxt3 * b12; c0 -= c0_nxt2 * b8; c1 -= c1_nxt2 * b8; c0 -= c0_nxt1 * b4; c1 -= c1_nxt1 * b4; c0 *= b0; c1 *= b0; *(a + 0) = c0; *(a + 1) = c1; *(a + 2) = c0_nxt1; *(a + 3) = c1_nxt1; *(a + 4) = c0_nxt2; *(a + 5) = c1_nxt2; *(a + 6) = c0_nxt3; *(a + 7) = c1_nxt3; *(c + 0) = c0; *(c + 1) = c1; *(c + 0 + 1 * ldc) = c0_nxt1; *(c + 1 + 1 * ldc) = c1_nxt1; *(c + 0 + 2 * ldc) = c0_nxt2; *(c + 1 + 2 * ldc) = c1_nxt2; *(c + 0 + 3 * ldc) = c0_nxt3; *(c + 1 + 3 * ldc) = c1_nxt3; } static void dsolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { FLOAT b0, b2, b3, c0, c1, c0_nxt, c1_nxt; c0 = *(c + 0); c1 = *(c + 1); c0_nxt = *(c + 0 + ldc); c1_nxt = *(c + 1 + ldc); if (bk > 0) { BLASLONG i; FLOAT *aa = a, *bb = b; for (i = bk; i--;) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; c0_nxt -= aa[0] * bb[1]; c1_nxt -= aa[1] * bb[1]; aa += 2; bb += 2; } } a -= 4; b -= 4; b3 = *(b + 3); b2 = *(b + 2); b0 = *b; c0_nxt *= b3; c1_nxt *= b3; c0 -= c0_nxt * b2; c0 *= b0; c1 -= c1_nxt * b2; c1 *= b0; *(a + 0) = c0; *(a + 1) = c1; *(a + 2) = c0_nxt; *(a + 3) = c1_nxt; *(c + 0) = c0; *(c + 1) = c1; *(c + 0 + ldc) = c0_nxt; *(c + 1 + ldc) = c1_nxt; } static void dsolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { FLOAT b0, c0, c1; c0 = *(c + 0); c1 = *(c + 1); if (bk > 0) { BLASLONG i; FLOAT *aa = a, *bb = b; for (i = bk; i--;) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; aa += 2; bb += 1; } } b0 = *(b - 1); c0 *= b0; c1 *= b0; *(a - 2) = c0; *(a - 1) = c1; *(c + 0) = c0; *(c + 1) = c1; } static void dsolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15, c0, c1, c2, c3; c0 = *(c + 0); c1 = *(c + 1 * ldc); c2 = *(c + 2 * ldc); c3 = *(c + 3 * ldc); if (bk > 0) { BLASLONG i; FLOAT *aa = a, *bb = b; for (i = bk; i--;) { c0 -= aa[0] * bb[0]; c1 -= aa[0] * bb[1]; c2 -= aa[0] * bb[2]; c3 -= aa[0] * bb[3]; aa += 1; bb += 4; } } a -= 4; b -= 16; b0 = *b; b4 = *(b + 4); b5 = *(b + 5); b8 = *(b + 8); b9 = *(b + 9); b10 = *(b + 10); b12 = *(b + 12); b13 = *(b + 13); b14 = *(b + 14); b15 = *(b + 15); c3 *= b15; c2 -= c3 * b14; c2 *= b10; c1 -= c3 * b13; c1 -= c2 * b9; c1 *= b5; c0 -= c3 * b12; c0 -= c2 * b8; c0 -= c1 * b4; c0 *= b0; *(a + 0) = c0; *(a + 1) = c1; *(a + 2) = c2; *(a + 3) = c3; *(c + 0 * ldc) = c0; *(c + 1 * ldc) = c1; *(c + 2 * ldc) = c2; *(c + 3 * ldc) = c3; } static void dsolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { FLOAT b0, b2, b3, c0, c1; c0 = *(c + 0); c1 = *(c + ldc); if (bk > 0) { BLASLONG i; FLOAT *aa = a, *bb = b; for (i = bk; i--;) { c0 -= *aa * bb[0]; c1 -= *aa * bb[1]; aa += 1; bb += 2; } } a -= 2; b -= 4; b3 = *(b + 3); b2 = *(b + 2); b0 = *b; c1 *= b3; c0 -= c1 * b2; c0 *= b0; *(a + 0) = c0; *(a + 1) = c1; *(c + 0) = c0; *(c + ldc) = c1; } static void dsolve_1x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { if (bk > 0) { BLASLONG i; for (i = 0; i < bk; i++) { *c -= a[i] * b[i]; } } *c *= *(b - 1); *(a - 1) = *c; } int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset) { BLASLONG i, j, kk; FLOAT *aa, *cc, *bb; kk = n - offset; c += n * ldc; b += n * k; if (n & 3) { if (n & 1) { aa = a; c -= ldc; b -= k; bb = b + kk; cc = c; for (i = (m >> 3); i--;) { dsolve_8x1_rt_msa(aa + 8 * kk, bb, cc, (k - kk)); aa += 8 * k; cc += 8; } if (m & 7) { if (m & 4) { dsolve_4x1_rt_msa(aa + 4 * kk, bb, cc, (k - kk)); aa += 4 * k; cc += 
4; } if (m & 2) { dsolve_2x1_rt_msa(aa + 2 * kk, bb, cc, (k - kk)); aa += 2 * k; cc += 2; } if (m & 1) { dsolve_1x1_rt_msa(aa + kk, bb, cc, (k - kk)); aa += k; cc += 1; } } kk -= 1; } if (n & 2) { aa = a; c -= 2 * ldc; b -= 2 * k; bb = b + 2 * kk; cc = c; for (i = (m >> 3); i--;) { dsolve_8x2_rt_msa(aa + 8 * kk, bb, cc, ldc, (k - kk)); aa += 8 * k; cc += 8; } if (m & 7) { if (m & 4) { dsolve_4x2_rt_msa(aa + 4 * kk, bb, cc, ldc, (k - kk)); aa += 4 * k; cc += 4; } if (m & 2) { dsolve_2x2_rt_msa(aa + 2 * kk, bb, cc, ldc, (k - kk)); aa += 2 * k; cc += 2; } if (m & 1) { dsolve_1x2_rt_msa(aa + kk, bb, cc, ldc, (k - kk)); aa += k; cc += 1; } } kk -= 2; } } for (j = (n >> 2); j--;) { aa = a; b -= 4 * k; bb = b + 4 * kk; c -= 4 * ldc; cc = c; for (i = (m >> 3); i--;) { dsolve_8x4_rt_msa(aa + kk * 8, bb, cc, ldc, (k - kk)); aa += 8 * k; cc += 8; } if (m & 7) { if (m & 4) { dsolve_4x4_rt_msa(aa + kk * 4, bb, cc, ldc, (k - kk)); aa += 4 * k; cc += 4; } if (m & 2) { dsolve_2x4_rt_msa(aa + kk * 2, bb, cc, ldc, (k - kk)); aa += 2 * k; cc += 2; } if (m & 1) { dsolve_1x4_rt_msa(aa + kk, bb, cc, ldc, (k - kk)); aa += k; cc += 1; } } kk -= 4; } return 0; } OpenBLAS-0.2.20/kernel/mips/gemv_n.c000066400000000000000000000040651313527062700170200ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { BLASLONG i; BLASLONG ix,iy; BLASLONG j; FLOAT *a_ptr; FLOAT temp; ix = 0; a_ptr = a; for (j=0; j #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT maxf=0.0; BLASLONG max=0; if (n <= 0 || inc_x <= 0) return(max); maxf=ABS(x[0]); ix += inc_x; i++; while(i < n) { if( ABS(x[ix]) > maxf ) { max = i; maxf = ABS(x[ix]); } ix += inc_x; i++; } return(max+1); } OpenBLAS-0.2.20/kernel/mips/iamin.c000066400000000000000000000040531313527062700166370ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT minf=0.0; BLASLONG min=0; if (n <= 0 || inc_x <= 0) return(min); minf=ABS(x[0]); ix += inc_x; i++; while(i < n) { if( ABS(x[ix]) < ABS(minf) ) { min = i; minf = ABS(x[ix]); } ix += inc_x; i++; } return(min+1); } OpenBLAS-0.2.20/kernel/mips/imax.c000066400000000000000000000037171313527062700165060ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT maxf=0.0; BLASLONG max=0; if (n <= 0 || inc_x <= 0) return(max); maxf=x[0]; ix += inc_x; i++; while(i < n) { if( x[ix] > maxf ) { max = i; maxf = x[ix]; } ix += inc_x; i++; } return(max+1); } OpenBLAS-0.2.20/kernel/mips/imin.c000066400000000000000000000037171313527062700165040ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" #include BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT minf=0.0; BLASLONG min=0; if (n <= 0 || inc_x <= 0) return(min); minf=x[0]; ix += inc_x; i++; while(i < n) { if( x[ix] > minf ) { min = i; minf = x[ix]; } ix += inc_x; i++; } return(min+1); } OpenBLAS-0.2.20/kernel/mips/izamax.c000066400000000000000000000041721313527062700170350ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT maxf; BLASLONG max=0; BLASLONG inc_x2; if (n <= 0 || inc_x <= 0) return(max); inc_x2 = 2 * inc_x; maxf = CABS1(x,0); ix += inc_x2; i++; while(i < n) { if( CABS1(x,ix) > maxf ) { max = i; maxf = CABS1(x,ix); } ix += inc_x2; i++; } return(max+1); } OpenBLAS-0.2.20/kernel/mips/izamin.c000066400000000000000000000041721313527062700170330ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT minf; BLASLONG min=0; BLASLONG inc_x2; if (n <= 0 || inc_x <= 0) return(min); inc_x2 = 2 * inc_x; minf = CABS1(x,0); ix += inc_x2; i++; while(i < n) { if( CABS1(x,ix) < minf ) { min = i; minf = CABS1(x,ix); } ix += inc_x2; i++; } return(min+1); } OpenBLAS-0.2.20/kernel/mips/macros_msa.h000066400000000000000000000750321313527062700177000ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #ifndef __MACROS_MSA_H__ #define __MACROS_MSA_H__ #include #include #define ENABLE_PREFETCH #ifdef ENABLE_PREFETCH inline static void prefetch_load_lf(unsigned char *src) { __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r" (src)); } #define PREFETCH(PTR) prefetch_load_lf((unsigned char *)(PTR)); #define STRNG(X) #X #define PREF_OFFSET(src_ptr, offset) \ __asm__ __volatile__("pref 0, " STRNG(offset) "(%[src]) \n\t" : : [src] "r" (src_ptr)); #else #define PREFETCH(PTR) #define PREF_OFFSET(src_ptr, offset) #endif #define LD_W(RTYPE, psrc) *((RTYPE *)(psrc)) #define LD_SP(...) LD_W(v4f32, __VA_ARGS__) #define LD_D(RTYPE, psrc) *((RTYPE *)(psrc)) #define LD_DP(...) LD_D(v2f64, __VA_ARGS__) #define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) #define ST_SP(...) ST_W(v4f32, __VA_ARGS__) #define ST_D(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) #define ST_DP(...) ST_D(v2f64, __VA_ARGS__) #define COPY_FLOAT_TO_VECTOR(a) ( { \ v4f32 out; \ out = __msa_cast_to_vector_float(a); \ out = (v4f32) __msa_splati_w((v4i32) out, 0); \ out; \ } ) #define COPY_DOUBLE_TO_VECTOR(a) ( { \ v2f64 out; \ out = __msa_cast_to_vector_double(a); \ out = (v2f64) __msa_splati_d((v2i64) out, 0); \ out; \ } ) /* Description : Load 2 variables with stride Arguments : Inputs - psrc, stride Outputs - out0, out1 */ #define LD_GP2_INC(psrc, stride, out0, out1) \ { \ out0 = *(psrc); \ (psrc) += stride; \ out1 = *(psrc); \ (psrc) += stride; \ } #define LD_GP3_INC(psrc, stride, out0, \ out1, out2) \ { \ LD_GP2_INC(psrc, stride, out0, out1); \ out2 = *(psrc); \ (psrc) += stride; \ } #define LD_GP4_INC(psrc, stride, out0, \ out1, out2, out3) \ { \ LD_GP2_INC(psrc, stride, out0, out1); \ LD_GP2_INC(psrc, stride, out2, out3); \ } #define LD_GP5_INC(psrc, stride, out0, \ out1, out2, out3, out4) \ { \ LD_GP2_INC(psrc, stride, out0, out1); \ LD_GP2_INC(psrc, stride, out2, out3); \ out4 = *(psrc); \ (psrc) += stride; \ } #define LD_GP6_INC(psrc, stride, out0, \ out1, out2, out3, \ out4, out5) \ { \ LD_GP2_INC(psrc, stride, out0, out1); \ LD_GP2_INC(psrc, stride, out2, out3); \ LD_GP2_INC(psrc, stride, out4, out5); \ } #define LD_GP7_INC(psrc, stride, out0, \ out1, out2, out3, \ out4, out5, out6) \ { \ LD_GP2_INC(psrc, stride, out0, out1); \ LD_GP2_INC(psrc, stride, out2, out3); \ LD_GP2_INC(psrc, stride, out4, out5); \ out6 = *(psrc); \ (psrc) += stride; \ } #define LD_GP8_INC(psrc, stride, out0, out1, out2, \ out3, out4, out5, out6, out7) \ { \ LD_GP4_INC(psrc, stride, out0, out1, out2, out3); \ LD_GP4_INC(psrc, stride, out4, out5, out6, out7); \ } /* Description : Load 2 vectors of single precision floating point elements with stride Arguments : Inputs - psrc, stride Outputs - out0, out1 Return Type - single precision floating point */ #define LD_SP2(psrc, stride, out0, out1) \ { \ out0 = LD_SP((psrc)); \ out1 = LD_SP((psrc) + stride); \ } #define LD_SP4(psrc, stride, out0, out1, out2, out3) \ { \ LD_SP2(psrc, stride, out0, out1) \ LD_SP2(psrc + 2 * stride, stride, out2, out3) \ } #define LD_SP2_INC(psrc, stride, out0, out1) \ { \ out0 = LD_SP((psrc)); \ (psrc) += stride; \ out1 = LD_SP((psrc)); \ (psrc) += stride; \ } #define LD_SP3_INC(psrc, stride, out0, \ out1, out2) \ { \ LD_SP2_INC(psrc, stride, out0, out1); \ out2 = LD_SP((psrc)); \ (psrc) += stride; \ } #define LD_SP4_INC(psrc, stride, out0, \ out1, out2, out3) \ { \ LD_SP2_INC(psrc, stride, out0, out1); \ LD_SP2_INC(psrc, stride, out2, out3); \ } #define LD_SP5_INC(psrc, stride, out0, \ 
out1, out2, out3, out4) \ { \ LD_SP2_INC(psrc, stride, out0, out1); \ LD_SP2_INC(psrc, stride, out2, out3); \ out4 = LD_SP((psrc)); \ (psrc) += stride; \ } #define LD_SP6_INC(psrc, stride, out0, \ out1, out2, out3, \ out4, out5) \ { \ LD_SP2_INC(psrc, stride, out0, out1); \ LD_SP2_INC(psrc, stride, out2, out3); \ LD_SP2_INC(psrc, stride, out4, out5); \ } #define LD_SP7_INC(psrc, stride, out0, \ out1, out2, out3, \ out4, out5, out6) \ { \ LD_SP2_INC(psrc, stride, out0, out1); \ LD_SP2_INC(psrc, stride, out2, out3); \ LD_SP2_INC(psrc, stride, out4, out5); \ out6 = LD_SP((psrc)); \ (psrc) += stride; \ } #define LD_SP8_INC(psrc, stride, out0, out1, out2, \ out3, out4, out5, out6, out7) \ { \ LD_SP4_INC(psrc, stride, out0, out1, out2, out3); \ LD_SP4_INC(psrc, stride, out4, out5, out6, out7); \ } #define LD_SP16_INC(psrc, stride, out0, out1, out2, \ out3, out4, out5, out6, out7, out8, \ out9, out10, out11, out12, out13, \ out14, out15) \ { \ LD_SP8_INC(psrc, stride, out0, out1, out2, \ out3, out4, out5, out6, out7); \ LD_SP8_INC(psrc, stride, out8, out9, out10, \ out11, out12, out13, out14, out15); \ } /* Description : Load 2 vectors of double precision floating point elements with stride Arguments : Inputs - psrc, stride Outputs - out0, out1 Return Type - double precision floating point */ #define LD_DP2(psrc, stride, out0, out1) \ { \ out0 = LD_DP((psrc)); \ out1 = LD_DP((psrc) + stride); \ } #define LD_DP4(psrc, stride, out0, out1, out2, out3) \ { \ LD_DP2(psrc, stride, out0, out1) \ LD_DP2(psrc + 2 * stride, stride, out2, out3) \ } #define LD_DP2_INC(psrc, stride, out0, out1) \ { \ out0 = LD_DP(psrc); \ (psrc) += stride; \ out1 = LD_DP(psrc); \ (psrc) += stride; \ } #define LD_DP3_INC(psrc, stride, out0, \ out1, out2) \ { \ LD_DP2_INC(psrc, stride, out0, out1); \ out2 = LD_DP((psrc)); \ (psrc) += stride; \ } #define LD_DP4_INC(psrc, stride, out0, \ out1, out2, out3) \ { \ LD_DP2_INC(psrc, stride, out0, out1); \ LD_DP2_INC(psrc, stride, out2, out3); \ } #define LD_DP5_INC(psrc, stride, out0, \ out1, out2, out3, out4) \ { \ LD_DP2_INC(psrc, stride, out0, out1); \ LD_DP2_INC(psrc, stride, out2, out3); \ out4 = LD_DP((psrc)); \ (psrc) += stride; \ } #define LD_DP6_INC(psrc, stride, out0, \ out1, out2, out3, \ out4, out5) \ { \ LD_DP2_INC(psrc, stride, out0, out1); \ LD_DP2_INC(psrc, stride, out2, out3); \ LD_DP2_INC(psrc, stride, out4, out5); \ } #define LD_DP7_INC(psrc, stride, out0, \ out1, out2, out3, \ out4, out5, out6) \ { \ LD_DP2_INC(psrc, stride, out0, out1); \ LD_DP2_INC(psrc, stride, out2, out3); \ LD_DP2_INC(psrc, stride, out4, out5); \ out6 = LD_DP((psrc)); \ (psrc) += stride; \ } #define LD_DP8_INC(psrc, stride, out0, out1, out2, \ out3, out4, out5, out6, out7) \ { \ LD_DP4_INC(psrc, stride, out0, out1, out2, out3); \ LD_DP4_INC(psrc, stride, out4, out5, out6, out7); \ } #define LD_DP16_INC(psrc, stride, out0, out1, out2, \ out3, out4, out5, out6, out7, out8, \ out9, out10, out11, out12, out13, \ out14, out15) \ { \ LD_DP8_INC(psrc, stride, out0, out1, out2, \ out3, out4, out5, out6, out7); \ LD_DP8_INC(psrc, stride, out8, out9, out10, \ out11, out12, out13, out14, out15); \ } /* Description : Store GP variable with stride Arguments : Inputs - in0, in1, pdst, stride Details : Store 4 single precision floating point elements from 'in0' to (pdst) Store 4 single precision floating point elements from 'in1' to (pdst + stride) */ #define ST_GP2_INC(in0, in1, \ pdst, stride) \ { \ *(pdst) = in0; \ (pdst) += stride; \ *(pdst) = in1; \ (pdst) += stride; \ } #define ST_GP3_INC(in0, in1, 
in2, \ pdst, stride) \ { \ ST_GP2_INC(in0, in1, pdst, stride); \ *(pdst) = in2; \ (pdst) += stride; \ } #define ST_GP4_INC(in0, in1, in2, in3, \ pdst, stride) \ { \ ST_GP2_INC(in0, in1, pdst, stride); \ ST_GP2_INC(in2, in3, pdst, stride); \ } #define ST_GP5_INC(in0, in1, in2, in3, \ in4, pdst, stride) \ { \ ST_GP2_INC(in0, in1, pdst, stride); \ ST_GP2_INC(in2, in3, pdst, stride); \ *(pdst) = in4; \ (pdst) += stride; \ } #define ST_GP6_INC(in0, in1, in2, in3, \ in4, in5, pdst, stride) \ { \ ST_GP2_INC(in0, in1, pdst, stride); \ ST_GP2_INC(in2, in3, pdst, stride); \ ST_GP2_INC(in4, in5, pdst, stride); \ } #define ST_GP7_INC(in0, in1, in2, in3, in4, \ in5, in6, pdst, stride) \ { \ ST_GP2_INC(in0, in1, pdst, stride); \ ST_GP2_INC(in2, in3, pdst, stride); \ ST_GP2_INC(in4, in5, pdst, stride); \ *(pdst) = in6; \ (pdst) += stride; \ } #define ST_GP8_INC(in0, in1, in2, in3, in4, in5, \ in6, in7, pdst, stride) \ { \ ST_GP4_INC(in0, in1, in2, in3, pdst, stride); \ ST_GP4_INC(in4, in5, in6, in7, pdst, stride); \ } /* Description : Store vectors of single precision floating point elements with stride Arguments : Inputs - in0, in1, pdst, stride Details : Store 4 single precision floating point elements from 'in0' to (pdst) Store 4 single precision floating point elements from 'in1' to (pdst + stride) */ #define ST_SP2(in0, in1, pdst, stride) \ { \ ST_SP(in0, (pdst)); \ ST_SP(in1, (pdst) + stride); \ } #define ST_SP4(in0, in1, in2, in3, pdst, stride) \ { \ ST_SP2(in0, in1, (pdst), stride); \ ST_SP2(in2, in3, (pdst + 2 * stride), stride); \ } #define ST_SP8(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ { \ ST_SP4(in0, in1, in2, in3, (pdst), stride); \ ST_SP4(in4, in5, in6, in7, (pdst + 4 * stride), stride); \ } #define ST_SP2_INC(in0, in1, pdst, stride) \ { \ ST_SP(in0, (pdst)); \ (pdst) += stride; \ ST_SP(in1, (pdst)); \ (pdst) += stride; \ } #define ST_SP3_INC(in0, in1, in2, \ pdst, stride) \ { \ ST_SP2_INC(in0, in1, pdst, stride); \ ST_SP(in2, (pdst)); \ (pdst) += stride; \ } #define ST_SP4_INC(in0, in1, in2, in3, \ pdst, stride) \ { \ ST_SP2_INC(in0, in1, pdst, stride); \ ST_SP2_INC(in2, in3, pdst, stride); \ } #define ST_SP5_INC(in0, in1, in2, in3, \ in4, pdst, stride) \ { \ ST_SP2_INC(in0, in1, pdst, stride); \ ST_SP2_INC(in2, in3, pdst, stride); \ ST_SP(in4, (pdst)); \ (pdst) += stride; \ } #define ST_SP6_INC(in0, in1, in2, in3, \ in4, in5, pdst, stride) \ { \ ST_SP2_INC(in0, in1, pdst, stride); \ ST_SP2_INC(in2, in3, pdst, stride); \ ST_SP2_INC(in4, in5, pdst, stride); \ } #define ST_SP7_INC(in0, in1, in2, in3, in4, \ in5, in6, pdst, stride) \ { \ ST_SP2_INC(in0, in1, pdst, stride); \ ST_SP2_INC(in2, in3, pdst, stride); \ ST_SP2_INC(in4, in5, pdst, stride); \ ST_SP(in6, (pdst)); \ (pdst) += stride; \ } #define ST_SP8_INC(in0, in1, in2, in3, in4, in5, \ in6, in7, pdst, stride) \ { \ ST_SP4_INC(in0, in1, in2, in3, pdst, stride); \ ST_SP4_INC(in4, in5, in6, in7, pdst, stride); \ } #define ST_SP16_INC(in0, in1, in2, in3, in4, in5, in6, \ in7, in8, in9, in10, in11, in12, \ in13, in14, in15, pdst, stride) \ { \ ST_SP8_INC(in0, in1, in2, in3, in4, in5, in6, \ in7, pdst, stride); \ ST_SP8_INC(in8, in9, in10, in11, in12, in13, in14, \ in15, pdst, stride); \ } /* Description : Store vectors of double precision floating point elements with stride Arguments : Inputs - in0, in1, pdst, stride Details : Store 2 double precision floating point elements from 'in0' to (pdst) Store 2 double precision floating point elements from 'in1' to (pdst + stride) */ #define ST_DP2(in0, in1, pdst, stride) \ { \ 
ST_DP(in0, (pdst)); \ ST_DP(in1, (pdst) + stride); \ } #define ST_DP4(in0, in1, in2, in3, pdst, stride) \ { \ ST_DP2(in0, in1, (pdst), stride); \ ST_DP2(in2, in3, (pdst) + 2 * stride, stride); \ } #define ST_DP8(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ { \ ST_DP4(in0, in1, in2, in3, (pdst), stride); \ ST_DP4(in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ } #define ST_DP2_INC(in0, in1, pdst, stride) \ { \ ST_DP(in0, (pdst)); \ (pdst) += stride; \ ST_DP(in1, (pdst)); \ (pdst) += stride; \ } #define ST_DP3_INC(in0, in1, in2, \ pdst, stride) \ { \ ST_DP2_INC(in0, in1, pdst, stride); \ ST_DP(in2, (pdst)); \ (pdst) += stride; \ } #define ST_DP4_INC(in0, in1, in2, in3, \ pdst, stride) \ { \ ST_DP2_INC(in0, in1, pdst, stride); \ ST_DP2_INC(in2, in3, pdst, stride); \ } #define ST_DP5_INC(in0, in1, in2, in3, \ in4, pdst, stride) \ { \ ST_DP2_INC(in0, in1, pdst, stride); \ ST_DP2_INC(in2, in3, pdst, stride); \ ST_DP(in4, (pdst)); \ (pdst) += stride; \ } #define ST_DP6_INC(in0, in1, in2, in3, \ in4, in5, pdst, stride) \ { \ ST_DP2_INC(in0, in1, pdst, stride); \ ST_DP2_INC(in2, in3, pdst, stride); \ ST_DP2_INC(in4, in5, pdst, stride); \ } #define ST_DP7_INC(in0, in1, in2, in3, in4, \ in5, in6, pdst, stride) \ { \ ST_DP2_INC(in0, in1, pdst, stride); \ ST_DP2_INC(in2, in3, pdst, stride); \ ST_DP2_INC(in4, in5, pdst, stride); \ ST_DP(in6, (pdst)); \ (pdst) += stride; \ } #define ST_DP8_INC(in0, in1, in2, in3, in4, in5, \ in6, in7, pdst, stride) \ { \ ST_DP4_INC(in0, in1, in2, in3, pdst, stride); \ ST_DP4_INC(in4, in5, in6, in7, pdst, stride); \ } #define ST_DP16_INC(in0, in1, in2, in3, in4, in5, in6, \ in7, in8, in9, in10, in11, in12, \ in13, in14, in15, pdst, stride) \ { \ ST_DP8_INC(in0, in1, in2, in3, in4, in5, in6, \ in7, pdst, stride); \ ST_DP8_INC(in8, in9, in10, in11, in12, in13, in14, \ in15, pdst, stride); \ } /* Description : shuffle elements in vector as shf_val Arguments : Inputs - in0, in1 Outputs - out0, out1 Return Type - as per RTYPE */ #define SHF_W2(RTYPE, in0, in1, out0, out1, shf_val) \ { \ out0 = (RTYPE) __msa_shf_w((v4i32) in0, shf_val); \ out1 = (RTYPE) __msa_shf_w((v4i32) in1, shf_val); \ } #define SHF_W2_SP(...) SHF_W2(v4f32, __VA_ARGS__) #define SHF_W2_DP(...) SHF_W2(v2f64, __VA_ARGS__) #define SHF_W3(RTYPE, in0, in1, in2, out0, out1, out2, \ shf_val) \ { \ out0 = (RTYPE) __msa_shf_w((v4i32) in0, shf_val); \ out1 = (RTYPE) __msa_shf_w((v4i32) in1, shf_val); \ out2 = (RTYPE) __msa_shf_w((v4i32) in2, shf_val); \ } #define SHF_W3_SP(...) SHF_W3(v4f32, __VA_ARGS__) #define SHF_W4(RTYPE, in0, in1, in2, in3, \ out0, out1, out2, out3, shf_val) \ { \ SHF_W2(RTYPE, in0, in1, out0, out1, shf_val); \ SHF_W2(RTYPE, in2, in3, out2, out3, shf_val); \ } #define SHF_W4_SP(...) SHF_W4(v4f32, __VA_ARGS__) #define SHF_W4_DP(...) SHF_W4(v2f64, __VA_ARGS__) /* Description : Interleave both left and right half of input vectors Arguments : Inputs - in0, in1 Outputs - out0, out1 Return Type - as per RTYPE Details : Right half of byte elements from 'in0' and 'in1' are interleaved and written to 'out0' */ #define ILVRL_W2(RTYPE, in0, in1, out0, out1) \ { \ out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \ out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \ } #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) #define ILVRL_W2_SP(...) ILVRL_W2(v4f32, __VA_ARGS__) #define ILVRL_D2(RTYPE, in0, in1, out0, out1) \ { \ out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \ out1 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1); \ } #define ILVRL_D2_SP(...) 
ILVRL_D2(v4f32, __VA_ARGS__) #define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__) /* Description : Indexed word element values are replicated to all elements in output vector Arguments : Inputs - in, stidx Outputs - out0, out1 Return Type - as per RTYPE Details : 'stidx' element value from 'in' vector is replicated to all elements in 'out0' vector 'stidx + 1' element value from 'in' vector is replicated to all elements in 'out1' vector Valid index range for word operation is 0-3 */ #define SPLATI_W2(RTYPE, in, stidx, out0, out1) \ { \ out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \ out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \ } #define SPLATI_W2_SP(...) SPLATI_W2(v4f32, __VA_ARGS__) #define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \ { \ SPLATI_W2(RTYPE, in, 0, out0, out1); \ SPLATI_W2(RTYPE, in, 2, out2, out3); \ } #define SPLATI_W4_SP(...) SPLATI_W4(v4f32, __VA_ARGS__) #define SPLATI_D2(RTYPE, in, out0, out1) \ { \ out0 = (RTYPE) __msa_splati_d((v2i64) in, 0); \ out1 = (RTYPE) __msa_splati_d((v2i64) in, 1); \ } #define SPLATI_D2_DP(...) SPLATI_D2(v2f64, __VA_ARGS__) /* Description : Pack even double word elements of vector pairs Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE Details : Even double word elements of 'in0' are copied to the left half of 'out0' & even double word elements of 'in1' are copied to the right half of 'out0'. */ #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ { \ out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \ out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \ } #define PCKEV_D2_SP(...) PCKEV_D2(v4f32, __VA_ARGS__) #define PCKEV_D2_SD(...) PCKEV_D2(v2f64, __VA_ARGS__) #define PCKEV_D3(RTYPE, in0, in1, in2, in3, in4, in5, \ out0, out1, out2) \ { \ out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \ out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \ out2 = (RTYPE) __msa_pckev_d((v2i64) in4, (v2i64) in5); \ } #define PCKEV_D3_SP(...) PCKEV_D3(v4f32, __VA_ARGS__) #define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3) \ { \ PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ } #define PCKEV_D4_SP(...) PCKEV_D4(v4f32, __VA_ARGS__) /* Description : pack both even and odd half of input vectors Arguments : Inputs - in0, in1 Outputs - out0, out1 Return Type - as per RTYPE Details : Even double word elements of 'in0' and 'in1' are copied to the 'out0' & odd double word elements of 'in0' and 'in1' are copied to the 'out1'. */ #define PCKEVOD_W2(RTYPE, in0, in1, out0, out1) \ { \ out0 = (RTYPE) __msa_pckev_w((v4i32) in0, (v4i32) in1); \ out1 = (RTYPE) __msa_pckod_w((v4i32) in0, (v4i32) in1); \ } #define PCKEVOD_W2_SP(...) PCKEVOD_W2(v4f32, __VA_ARGS__) #define PCKEVOD_D2(RTYPE, in0, in1, out0, out1) \ { \ out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \ out1 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1); \ } #define PCKEVOD_D2_DP(...) 
PCKEVOD_D2(v2f64, __VA_ARGS__) /* Description : Multiplication of pairs of vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Details : Each element from 'in0' is multiplied with elements from 'in1' and the result is written to 'out0' */ #define MUL2(in0, in1, in2, in3, out0, out1) \ { \ out0 = in0 * in1; \ out1 = in2 * in3; \ } #define MUL3(in0, in1, in2, in3, in4, in5, \ out0, out1, out2) \ { \ out0 = in0 * in1; \ out1 = in2 * in3; \ out2 = in4 * in5; \ } #define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3) \ { \ MUL2(in0, in1, in2, in3, out0, out1); \ MUL2(in4, in5, in6, in7, out2, out3); \ } /* Description : Multiplication of pairs of vectors and added in output Arguments : Inputs - in0, in1, vec, out0, out1 Outputs - out0, out1 Details : Each element from 'in0' is multiplied with elements from 'vec' and the result is added to 'out0' */ #define FMADD2(in0, in1, vec, inout0, inout1) \ { \ inout0 += in0 * vec; \ inout1 += in1 * vec; \ } #define FMADD3(in0, in1, in2, vec, \ inout0, inout1, inout2) \ { \ inout0 += in0 * vec; \ inout1 += in1 * vec; \ inout2 += in2 * vec; \ } #define FMADD4(in0, in1, in2, in3, vec, \ inout0, inout1, inout2, inout3) \ { \ FMADD2(in0, in1, vec, inout0, inout1); \ FMADD2(in2, in3, vec, inout2, inout3); \ } /* Description : Addition of 2 pairs of variables Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Details : Each element in 'in0' is added to 'in1' and result is written to 'out0'. */ #define ADD2(in0, in1, in2, in3, out0, out1) \ { \ out0 = in0 + in1; \ out1 = in2 + in3; \ } #define ADD3(in0, in1, in2, in3, in4, in5, \ out0, out1, out2) \ { \ out0 = in0 + in1; \ out1 = in2 + in3; \ out2 = in4 + in5; \ } #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3) \ { \ ADD2(in0, in1, in2, in3, out0, out1); \ ADD2(in4, in5, in6, in7, out2, out3); \ } /* Description : Transpose 4x4 block with word elements in vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1, out2, out3 Return Type - as per RTYPE */ #define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, \ out0, out1, out2, out3) \ { \ v4i32 s0_m, s1_m, s2_m, s3_m; \ \ ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ ILVRL_D2(RTYPE, s2_m, s0_m, out0, out1); \ ILVRL_D2(RTYPE, s3_m, s1_m, out2, out3); \ } #define TRANSPOSE4x4_SP_SP(...) TRANSPOSE4x4_W(v4f32, __VA_ARGS__) #endif /* __MACROS_MSA_H__ */ OpenBLAS-0.2.20/kernel/mips/max.c000066400000000000000000000043251313527062700163310ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
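The LD_GPn_INC / ST_GPn_INC macros defined in macros_msa.h above read or write one scalar per access and advance the pointer by `stride` after each access, with the larger variants built by composing the 2-element one. The following is a minimal stand-alone sketch of that pattern in plain C; the LD2_INC/ST2_INC names are illustrative stand-ins, not the macros from the header.

/* Sketch of the strided load/store macro pattern from macros_msa.h
   (plain C, no MSA intrinsics). */
#include <stdio.h>

#define LD2_INC(psrc, stride, out0, out1) \
{                                         \
    out0 = *(psrc); (psrc) += (stride);   \
    out1 = *(psrc); (psrc) += (stride);   \
}

#define ST2_INC(in0, in1, pdst, stride)   \
{                                         \
    *(pdst) = (in0); (pdst) += (stride);  \
    *(pdst) = (in1); (pdst) += (stride);  \
}

int main(void)
{
    float src[8] = {0, 1, 2, 3, 4, 5, 6, 7}, dst[8] = {0};
    float *ps = src, *pd = dst;
    float a, b;

    LD2_INC(ps, 2, a, b);    /* reads src[0] and src[2], leaves ps at src[4] */
    ST2_INC(a, b, pd, 1);    /* writes them contiguously to dst[0], dst[1]  */
    printf("%g %g\n", dst[0], dst[1]);   /* prints: 0 2 */
    return 0;
}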
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/09/14 Saar * BLASTEST float : NoTest * BLASTEST double : NoTest * CTEST : NoTest * TEST : NoTest * **************************************************************************************/ #include "common.h" #include FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT maxf=0.0; if (n <= 0 || inc_x <= 0) return(maxf); maxf=x[0]; ix += inc_x; i++; while(i < n) { if( x[ix] > maxf ) { maxf = x[ix]; } ix += inc_x; i++; } return(maxf); } OpenBLAS-0.2.20/kernel/mips/min.c000066400000000000000000000043251313527062700163270ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
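TRANSPOSE4x4_W in macros_msa.h above builds a 4x4 transpose from two word interleaves (ILVRL_W2) followed by two doubleword interleaves (ILVRL_D2). The plain-C model below is a sketch of that data movement only; the ilvr/ilvl helpers imitate the __msa_ilvr_*/__msa_ilvl_* semantics (right = low elements, left = high elements) and are not the real intrinsics.

/* Plain-C model of the interleave-based 4x4 transpose. */
#include <stdio.h>

typedef struct { float e[4]; } vec4;

static vec4 ilvr_w(vec4 a, vec4 b) { return (vec4){{ b.e[0], a.e[0], b.e[1], a.e[1] }}; }
static vec4 ilvl_w(vec4 a, vec4 b) { return (vec4){{ b.e[2], a.e[2], b.e[3], a.e[3] }}; }
static vec4 ilvr_d(vec4 a, vec4 b) { return (vec4){{ b.e[0], b.e[1], a.e[0], a.e[1] }}; }
static vec4 ilvl_d(vec4 a, vec4 b) { return (vec4){{ b.e[2], b.e[3], a.e[2], a.e[3] }}; }

int main(void)
{
    vec4 r0 = {{ 0,  1,  2,  3}}, r1 = {{ 4,  5,  6,  7}};
    vec4 r2 = {{ 8,  9, 10, 11}}, r3 = {{12, 13, 14, 15}};

    /* Word interleaves pair up elements of adjacent rows... */
    vec4 s0 = ilvr_w(r1, r0), s1 = ilvl_w(r1, r0);
    vec4 s2 = ilvr_w(r3, r2), s3 = ilvl_w(r3, r2);
    /* ...and doubleword interleaves assemble whole columns. */
    vec4 c0 = ilvr_d(s2, s0), c3 = ilvl_d(s3, s1);

    printf("%g %g %g %g\n", c0.e[0], c0.e[1], c0.e[2], c0.e[3]); /* 0 4 8 12 */
    printf("%g %g %g %g\n", c3.e[0], c3.e[1], c3.e[2], c3.e[3]); /* 3 7 11 15 */
    return 0;
}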
*****************************************************************************/ /************************************************************************************** * 2013/09/14 Saar * BLASTEST float : NoTest * BLASTEST double : NoTest * CTEST : NoTest * TEST : NoTest * **************************************************************************************/ #include "common.h" #include FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT minf=0.0; if (n <= 0 || inc_x <= 0) return(minf); minf=x[0]; ix += inc_x; i++; while(i < n) { if( x[ix] < minf ) { minf = x[ix]; } ix += inc_x; i++; } return(minf); } OpenBLAS-0.2.20/kernel/mips/nrm2.c000066400000000000000000000050141313527062700164160ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2013/09/13 Saar * BLASTEST float : OK * BLASTEST double : OK * CTEST : OK * TEST : OK * **************************************************************************************/ #include "common.h" #include #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; FLOAT scale = 0.0; FLOAT ssq = 1.0; FLOAT absxi = 0.0; if (n <= 0 || inc_x <= 0) return(0.0); if ( n == 1 ) return( ABS(x[0]) ); n *= inc_x; while(i < n) { if ( x[i] != 0.0 ) { absxi = ABS( x[i] ); if ( scale < absxi ) { ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi ); scale = absxi ; } else { ssq += ( absxi/scale ) * ( absxi/scale ); } } i += inc_x; } scale = scale * sqrt( ssq ); return(scale); } OpenBLAS-0.2.20/kernel/mips/omatcopy_cn.c000066400000000000000000000044601313527062700200570ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. 
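The nrm2 kernel above avoids overflow and underflow by maintaining a running scale and a scaled sum of squares, returning scale*sqrt(ssq) instead of forming sum(x[i]^2) directly. A minimal sketch of that recurrence for a unit-stride vector follows (the kernel itself also handles inc_x > 1); nrm2_ref is an illustrative name, not part of the library.

#include <math.h>
#include <stdio.h>

/* Scaled sum-of-squares: sqrt(sum x[i]^2) without the overflowing intermediate. */
static float nrm2_ref(int n, const float *x)
{
    float scale = 0.0f, ssq = 1.0f;
    for (int i = 0; i < n; i++) {
        if (x[i] != 0.0f) {
            float ax = fabsf(x[i]);
            if (scale < ax) {
                /* rescale the accumulated sum to the new, larger scale */
                ssq = 1.0f + ssq * (scale / ax) * (scale / ax);
                scale = ax;
            } else {
                ssq += (ax / scale) * (ax / scale);
            }
        }
    }
    return scale * sqrtf(ssq);
}

int main(void)
{
    float x[3] = {3e30f, 4e30f, 0.0f};   /* the squares would overflow float */
    printf("%g\n", nrm2_ref(3, x));      /* prints 5e+30 */
    return 0;
}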
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) { BLASLONG i,j; FLOAT *aptr,*bptr; if ( rows <= 0 ) return(0); if ( cols <= 0 ) return(0); aptr = a; bptr = b; if ( alpha == 0.0 ) { for ( i=0; i #include "macros_msa.h" #define AND_VEC_W(in) ((v4f32) ((v4i32) in & and_vec)) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; FLOAT data0, data1, sumf = 0.0; v4f32 src0, src1, src2, src3, src4, src5, src6, src7; v4f32 src8, src9, src10, src11, src12, src13, src14, src15; v4f32 sum_abs0 = {0, 0, 0, 0}; v4f32 sum_abs1 = {0, 0, 0, 0}; v4f32 sum_abs2 = {0, 0, 0, 0}; v4f32 sum_abs3 = {0, 0, 0, 0}; v4f32 zero_v = {0, 0, 0, 0}; v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; if (n <= 0 || inc_x <= 0) return (sumf); if (1 == inc_x) { if (n > 63) { FLOAT *x_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 128 + 32; LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); for (i = 0; i < (n >> 6) - 1; i++) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(x_pref, 128); PREF_OFFSET(x_pref, 160); PREF_OFFSET(x_pref, 192); PREF_OFFSET(x_pref, 224); x_pref += 64; LD_SP8_INC(x, 4, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); sum_abs2 += AND_VEC_W(src2); sum_abs3 += AND_VEC_W(src3); sum_abs0 += AND_VEC_W(src4); sum_abs1 += AND_VEC_W(src5); sum_abs2 += AND_VEC_W(src6); sum_abs3 += AND_VEC_W(src7); LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); sum_abs0 += AND_VEC_W(src8); sum_abs1 += AND_VEC_W(src9); sum_abs2 += AND_VEC_W(src10); sum_abs3 += AND_VEC_W(src11); sum_abs0 += AND_VEC_W(src12); sum_abs1 += AND_VEC_W(src13); sum_abs2 += AND_VEC_W(src14); sum_abs3 += AND_VEC_W(src15); } LD_SP8_INC(x, 4, src8, src9, 
src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); sum_abs2 += AND_VEC_W(src2); sum_abs3 += AND_VEC_W(src3); sum_abs0 += AND_VEC_W(src4); sum_abs1 += AND_VEC_W(src5); sum_abs2 += AND_VEC_W(src6); sum_abs3 += AND_VEC_W(src7); sum_abs0 += AND_VEC_W(src8); sum_abs1 += AND_VEC_W(src9); sum_abs2 += AND_VEC_W(src10); sum_abs3 += AND_VEC_W(src11); sum_abs0 += AND_VEC_W(src12); sum_abs1 += AND_VEC_W(src13); sum_abs2 += AND_VEC_W(src14); sum_abs3 += AND_VEC_W(src15); } if (n & 63) { if (n & 32) { LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); sum_abs2 += AND_VEC_W(src2); sum_abs3 += AND_VEC_W(src3); sum_abs0 += AND_VEC_W(src4); sum_abs1 += AND_VEC_W(src5); sum_abs2 += AND_VEC_W(src6); sum_abs3 += AND_VEC_W(src7); } if (n & 16) { LD_SP4_INC(x, 4, src0, src1, src2, src3); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); sum_abs2 += AND_VEC_W(src2); sum_abs3 += AND_VEC_W(src3); } if (n & 8) { LD_SP2_INC(x, 4, src0, src1); sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); } if (n & 4) { src0 = LD_SP(x); x += 4; sum_abs0 += AND_VEC_W(src0); } if (n & 2) { sumf += fabsf(*x); sumf += fabsf(*(x + 1)); x += 2; } if (n & 1) { sumf += fabsf(*x); } } sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; sumf += sum_abs0[0]; sumf += sum_abs0[1]; sumf += sum_abs0[2]; sumf += sum_abs0[3]; } else { for (i = (n >> 4); i--;) { src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); x += inc_x; src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x)); x += inc_x; src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x)); x += inc_x; src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x)); x += inc_x; src1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); x += inc_x; src1 = (v4f32) __msa_insert_w((v4i32) src1, 1, *((int *) x)); x += inc_x; src1 = (v4f32) __msa_insert_w((v4i32) src1, 2, *((int *) x)); x += inc_x; src1 = (v4f32) __msa_insert_w((v4i32) src1, 3, *((int *) x)); x += inc_x; src2 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); x += inc_x; src2 = (v4f32) __msa_insert_w((v4i32) src2, 1, *((int *) x)); x += inc_x; src2 = (v4f32) __msa_insert_w((v4i32) src2, 2, *((int *) x)); x += inc_x; src2 = (v4f32) __msa_insert_w((v4i32) src2, 3, *((int *) x)); x += inc_x; src3 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); x += inc_x; src3 = (v4f32) __msa_insert_w((v4i32) src3, 1, *((int *) x)); x += inc_x; src3 = (v4f32) __msa_insert_w((v4i32) src3, 2, *((int *) x)); x += inc_x; src3 = (v4f32) __msa_insert_w((v4i32) src3, 3, *((int *) x)); x += inc_x; sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); sum_abs2 += AND_VEC_W(src2); sum_abs3 += AND_VEC_W(src3); } if (n & 15) { if (n & 8) { src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); x += inc_x; src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x)); x += inc_x; src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x)); x += inc_x; src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x)); x += inc_x; src1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); x += inc_x; src1 = (v4f32) __msa_insert_w((v4i32) src1, 1, *((int *) x)); x += inc_x; src1 = (v4f32) __msa_insert_w((v4i32) src1, 2, *((int *) x)); x += inc_x; src1 = (v4f32) __msa_insert_w((v4i32) src1, 3, *((int *) x)); x += inc_x; sum_abs0 += AND_VEC_W(src0); sum_abs1 += AND_VEC_W(src1); } if (n & 4) { src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); x += inc_x; src0 = (v4f32) 
__msa_insert_w((v4i32) src0, 1, *((int *) x)); x += inc_x; src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x)); x += inc_x; src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x)); x += inc_x; sum_abs0 += AND_VEC_W(src0); } if (n & 2) { data0 = fabsf(*x); x += inc_x; data1 = fabsf(*x); x += inc_x; sumf += data0; sumf += data1; } if (n & 1) { sumf += fabsf(*x); } } sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; sumf += sum_abs0[0]; sumf += sum_abs0[1]; sumf += sum_abs0[2]; sumf += sum_abs0[3]; } return (sumf); } OpenBLAS-0.2.20/kernel/mips/saxpy_msa.c000066400000000000000000000214041313527062700175450ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
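The sasum kernel above takes absolute values four floats at a time by AND-ing the bit patterns with 0x7FFFFFFF (the AND_VEC_W macro), which clears the IEEE-754 sign bit, and spreads the additions over four independent accumulators (sum_abs0..sum_abs3) before a final horizontal reduction; the sdot kernel further down uses the same multi-accumulator structure. A scalar sketch of the sign-bit trick, assuming 32-bit IEEE-754 floats (abs_by_mask is an illustrative name):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* fabsf via bit manipulation: clear bit 31 of the IEEE-754 representation. */
static float abs_by_mask(float f)
{
    uint32_t u;
    memcpy(&u, &f, sizeof u);   /* well-defined type pun */
    u &= 0x7FFFFFFFu;           /* drop the sign bit */
    memcpy(&f, &u, sizeof f);
    return f;
}

int main(void)
{
    printf("%g %g\n", abs_by_mask(-2.5f), abs_by_mask(2.5f));   /* 2.5 2.5 */
    return 0;
}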
*******************************************************************************/ #include "common.h" #include "macros_msa.h" #if !defined(CONJ) #define OP0 += #define OP1 -= #define OP2 += #else #define OP0 -= #define OP1 += #define OP2 -= #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i; FLOAT *py; v4f32 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7; v4f32 da_vec, zero_v = {0}; if ((n < 0) || (da == 0.0)) return(0); py = y; if ((1 == inc_x) && (1 == inc_y)) { FLOAT *x_pref, *y_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 64; pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } y_pref = y + pref_offset + 64; da_vec = COPY_FLOAT_TO_VECTOR(da); for (i = (n >> 5); i--;) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(y_pref, 0); PREF_OFFSET(y_pref, 32); PREF_OFFSET(y_pref, 64); PREF_OFFSET(y_pref, 96); x_pref += 32; y_pref += 32; LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7); LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7); FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7); ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4); } if (n & 31) { if (n & 16) { LD_SP4_INC(x, 4, x0, x1, x2, x3); LD_SP4_INC(py, 4, y0, y1, y2, y3); FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); ST_SP4_INC(y0, y1, y2, y3, y, 4); } if (n & 8) { LD_SP2_INC(x, 4, x0, x1); LD_SP2_INC(py, 4, y0, y1); FMADD2(x0, x1, da_vec, y0, y1); ST_SP2_INC(y0, y1, y, 4); } if (n & 4) { x0 = LD_SP(x); x += 4; y0 = LD_SP(py); py += 4; y0 += da_vec * x0; ST_SP(y0, y); y += 4; } if (n & 2) { FMADD2(x[0], x[1], da, y[0], y[1]); x += 2; y += 2; } if (n & 1) { y[0] += da * x[0]; } } } else if (1 == inc_y) { da_vec = COPY_FLOAT_TO_VECTOR(da); for (i = (n >> 4); i--;) { x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); x += inc_x; x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x)); x += inc_x; x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x)); x += inc_x; x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x)); x += inc_x; x1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); x += inc_x; x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *) x)); x += inc_x; x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x)); x += inc_x; x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) x)); x += inc_x; x2 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); x += inc_x; x2 = (v4f32) __msa_insert_w((v4i32) x2, 1, *((int *) x)); x += inc_x; x2 = (v4f32) __msa_insert_w((v4i32) x2, 2, *((int *) x)); x += inc_x; x2 = (v4f32) __msa_insert_w((v4i32) x2, 3, *((int *) x)); x += inc_x; x3 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); x += inc_x; x3 = (v4f32) __msa_insert_w((v4i32) x3, 1, *((int *) x)); x += inc_x; x3 = (v4f32) __msa_insert_w((v4i32) x3, 2, *((int *) x)); x += inc_x; x3 = (v4f32) __msa_insert_w((v4i32) x3, 3, *((int *) x)); x += inc_x; LD_SP4_INC(py, 4, y0, y1, y2, y3); FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); ST_SP4_INC(y0, y1, y2, y3, y, 4); } if (n & 15) { if (n & 8) { x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); x += inc_x; x0 = (v4f32) 
__msa_insert_w((v4i32) x0, 1, *((int *) x)); x += inc_x; x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x)); x += inc_x; x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x)); x += inc_x; x1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); x += inc_x; x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *) x)); x += inc_x; x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x)); x += inc_x; x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) x)); x += inc_x; LD_SP2_INC(py, 4, y0, y1); FMADD2(x0, x1, da_vec, y0, y1); ST_SP2_INC(y0, y1, y, 4); } if (n & 4) { x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); x += inc_x; x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x)); x += inc_x; x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x)); x += inc_x; x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x)); x += inc_x; y0 = LD_SP(py); py += 4; y0 += da_vec * x0; ST_SP(y0, y); y += 4; } if (n & 2) { FMADD2(x[0], x[inc_x], da, y[0], y[1]); x += 2 * inc_x; y += 2; } if (n & 1) { y[0] += da * x[0]; } } } else { FLOAT x0, x1, x2, x3, y0, y1, y2, y3; for (i = (n >> 2); i--;) { LD_GP4_INC(x, inc_x, x0, x1, x2, x3); LD_GP4_INC(py, inc_y, y0, y1, y2, y3); FMADD4(x0, x1, x2, x3, da, y0, y1, y2, y3); ST_GP4_INC(y0, y1, y2, y3, y, inc_y); } if (n & 3) { if (n & 2) { LD_GP2_INC(x, inc_x, x0, x1); LD_GP2_INC(py, inc_y, y0, y1); FMADD2(x0, x1, da, y0, y1); ST_GP2_INC(y0, y1, y, inc_y); } if (n & 1) { *y += da * *x; } } } return (0); } OpenBLAS-0.2.20/kernel/mips/scal.c000066400000000000000000000036451313527062700164720ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
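Several of these MSA kernels (sasum, saxpy, scopy, sdot) round their prefetch pointer up to the next L1 cache-line boundary before issuing PREF_OFFSET a fixed distance ahead of the data being consumed. The sketch below shows that address arithmetic only; LINESIZE and align_up_to_line are illustrative stand-ins (OpenBLAS defines L1_DATA_LINESIZE per target), and the offset of 64 floats mirrors the kernels' look-ahead choice.

#include <stdint.h>
#include <stdio.h>

#define LINESIZE 64   /* stand-in for L1_DATA_LINESIZE */

/* Return a pointer >= p on the next cache-line boundary (p if already aligned). */
static float *align_up_to_line(float *p)
{
    uintptr_t off = (uintptr_t)p & (LINESIZE - 1);   /* bytes into the line */
    if (off > 0)
        p += (LINESIZE - off) / sizeof(float);       /* advance to next line */
    return p;
}

int main(void)
{
    float buf[256];
    float *pref = align_up_to_line(buf + 3) + 64;    /* prefetch ~64 floats ahead */
    printf("%p -> %p\n", (void *)(buf + 3), (void *)pref);
    return 0;
}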
*****************************************************************************/ #include "common.h" int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0,j=0; while(j < n) { if ( da == 0.0 ) x[i]=0.0; else x[i] = da * x[i] ; i += inc_x ; j++; } return 0; } OpenBLAS-0.2.20/kernel/mips/scopy_msa.c000066400000000000000000000140441313527062700175400ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
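The scopy kernel that follows software-pipelines its main loop: one batch of vectors is loaded before the loop, each iteration then loads the next batch while storing the previous one, and the last batch is stored after the loop. A scalar sketch of that load-ahead structure (copy_pipelined is an illustrative name; a block size of 2 and n a positive multiple of 2 are assumed only to keep the sketch short):

#include <stdio.h>

/* Copy n elements with a one-block load-ahead, mirroring the kernel's shape. */
static void copy_pipelined(int n, const float *x, float *y)
{
    float a = x[0], b = x[1];            /* prologue: first block loaded    */
    x += 2;
    for (int i = n / 2 - 1; i--; ) {
        float c = x[0], d = x[1];        /* load the next block ...         */
        y[0] = a; y[1] = b;              /* ... while storing the previous  */
        a = c; b = d;
        x += 2; y += 2;
    }
    y[0] = a; y[1] = b;                  /* epilogue: store the last block  */
}

int main(void)
{
    float src[6] = {1, 2, 3, 4, 5, 6}, dst[6];
    copy_pipelined(6, src, dst);
    printf("%g %g\n", dst[0], dst[5]);   /* 1 6 */
    return 0;
}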
*******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i; v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; FLOAT f0, f1, f2, f3, f4, f5, f6, f7; if (n < 0) return (0); if ((1 == inc_x) && (1 == inc_y)) { if (n > 63) { FLOAT *x_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 128 + 32; LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7); for (i = (n >> 6) - 1; i--;) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(x_pref, 128); PREF_OFFSET(x_pref, 160); PREF_OFFSET(x_pref, 192); PREF_OFFSET(x_pref, 224); x_pref += 64; x8 = LD_SP(x); x += 4; ST_SP(x0, y); y += 4; x9 = LD_SP(x); x += 4; ST_SP(x1, y); y += 4; x10 = LD_SP(x); x += 4; ST_SP(x2, y); y += 4; x11 = LD_SP(x); x += 4; ST_SP(x3, y); y += 4; x12 = LD_SP(x); x += 4; ST_SP(x4, y); y += 4; x13 = LD_SP(x); x += 4; ST_SP(x5, y); y += 4; x14 = LD_SP(x); x += 4; ST_SP(x6, y); y += 4; x15 = LD_SP(x); x += 4; ST_SP(x7, y); y += 4; x0 = LD_SP(x); x += 4; ST_SP(x8, y); y += 4; x1 = LD_SP(x); x += 4; ST_SP(x9, y); y += 4; x2 = LD_SP(x); x += 4; ST_SP(x10, y); y += 4; x3 = LD_SP(x); x += 4; ST_SP(x11, y); y += 4; x4 = LD_SP(x); x += 4; ST_SP(x12, y); y += 4; x5 = LD_SP(x); x += 4; ST_SP(x13, y); y += 4; x6 = LD_SP(x); x += 4; ST_SP(x14, y); y += 4; x7 = LD_SP(x); x += 4; ST_SP(x15, y); y += 4; } x8 = LD_SP(x); x += 4; x9 = LD_SP(x); x += 4; ST_SP(x0, y); y += 4; x10 = LD_SP(x); x += 4; ST_SP(x1, y); y += 4; x11 = LD_SP(x); x += 4; ST_SP(x2, y); y += 4; x12 = LD_SP(x); x += 4; ST_SP(x3, y); y += 4; x13 = LD_SP(x); x += 4; ST_SP(x4, y); y += 4; x14 = LD_SP(x); x += 4; ST_SP(x5, y); y += 4; x15 = LD_SP(x); x += 4; ST_SP(x6, y); y += 4; ST_SP(x7, y); y += 4; ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 4); } if (n & 63) { if (n & 32) { LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7); ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 4); } if (n & 16) { LD_SP4_INC(x, 4, x0, x1, x2, x3); ST_SP4_INC(x0, x1, x2, x3, y, 4); } if (n & 8) { LD_SP2_INC(x, 4, x0, x1); ST_SP2_INC(x0, x1, y, 4); } if (n & 4) { LD_GP4_INC(x, 1, f0, f1, f2, f3); ST_GP4_INC(f0, f1, f2, f3, y, 1); } if (n & 2) { LD_GP2_INC(x, 1, f0, f1); ST_GP2_INC(f0, f1, y, 1); } if (n & 1) { *y = *x; } } } else { for (i = (n >> 3); i--;) { LD_GP8_INC(x, inc_x, f0, f1, f2, f3, f4, f5, f6, f7); ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, y, inc_y); } if (n & 4) { LD_GP4_INC(x, inc_x, f0, f1, f2, f3); ST_GP4_INC(f0, f1, f2, f3, y, inc_y); } if (n & 2) { LD_GP2_INC(x, inc_x, f0, f1); ST_GP2_INC(f0, f1, y, inc_y); } if (n & 1) { *y = *x; } } return (0); } OpenBLAS-0.2.20/kernel/mips/sdot_msa.c000066400000000000000000000125571313527062700173630ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" #if defined(DSDOT) double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #else FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #endif { BLASLONG i = 0; double dot = 0.0; FLOAT x0, x1, x2, x3, y0, y1, y2, y3; v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; v4f32 dot0 = {0, 0, 0, 0}; v4f32 dot1 = {0, 0, 0, 0}; v4f32 dot2 = {0, 0, 0, 0}; v4f32 dot3 = {0, 0, 0, 0}; if (n < 1) return (dot); if ((1 == inc_x) && (1 == inc_y)) { FLOAT *x_pref, *y_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 64; pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } y_pref = y + pref_offset + 64; for (i = (n >> 5); i--;) { LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(y_pref, 0); PREF_OFFSET(y_pref, 32); PREF_OFFSET(y_pref, 64); PREF_OFFSET(y_pref, 96); x_pref += 32; y_pref += 32; dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); dot2 += (vy2 * vx2); dot3 += (vy3 * vx3); dot0 += (vy4 * vx4); dot1 += (vy5 * vx5); dot2 += (vy6 * vx6); dot3 += (vy7 * vx7); } if (n & 31) { if (n & 16) { LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); dot2 += (vy2 * vx2); dot3 += (vy3 * vx3); } if (n & 8) { LD_SP2_INC(x, 4, vx0, vx1); LD_SP2_INC(y, 4, vy0, vy1); dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); } if (n & 4) { vx0 = LD_SP(x); x += 4; vy0 = LD_SP(y); y += 4; dot0 += (vy0 * vx0); } if (n & 2) { LD_GP2_INC(x, 1, x0, x1); LD_GP2_INC(y, 1, y0, y1); dot += (y0 * x0); dot += (y1 * x1); } if (n & 1) { x0 = *x; y0 = *y; dot += (y0 * x0); } } dot0 += dot1 + dot2 + dot3; dot += dot0[0]; dot += dot0[1]; dot += dot0[2]; dot += dot0[3]; } else { for (i = (n >> 2); i--;) { LD_GP4_INC(x, inc_x, x0, x1, x2, x3); LD_GP4_INC(y, inc_y, y0, y1, y2, y3); dot += (y0 * x0); dot += (y1 * 
x1); dot += (y2 * x2); dot += (y3 * x3); } if (n & 2) { LD_GP2_INC(x, inc_x, x0, x1); LD_GP2_INC(y, inc_y, y0, y1); dot += (y0 * x0); dot += (y1 * x1); } if (n & 1) { x0 = *x; y0 = *y; dot += (y0 * x0); } } return (dot); } OpenBLAS-0.2.20/kernel/mips/sgemm_kernel_8x8_msa.c000066400000000000000000001735601313527062700215730ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
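The sgemm kernel that follows keeps an 8x8 block of C entirely in vector registers: for each k it loads eight A values (two v4f32 vectors), broadcasts each of eight B values across a vector with __msa_shf_w, accumulates sixteen vector products, and applies alpha only when the block is written back. The scalar 2x2 sketch below illustrates that register-blocking idea under the usual assumption that A and B arrive packed from the level-3 driver; kernel_2x2 is an illustrative name, not the library's micro-kernel.

#include <stdio.h>

/* 2x2 register-blocked micro-kernel sketch: A packed as 2-row column panels,
   B packed as 2-column row panels, C column-major with leading dimension ldc.
   c00..c11 model the vector accumulators held in registers. */
static void kernel_2x2(int k, float alpha, const float *a, const float *b,
                       float *c, int ldc)
{
    float c00 = 0, c10 = 0, c01 = 0, c11 = 0;
    for (int l = 0; l < k; l++) {
        float a0 = a[0], a1 = a[1];          /* two rows of A                */
        float b0 = b[0], b1 = b[1];          /* "broadcast" two B values     */
        c00 += a0 * b0; c10 += a1 * b0;      /* rank-1 update of the block   */
        c01 += a0 * b1; c11 += a1 * b1;
        a += 2; b += 2;
    }
    c[0]       += alpha * c00; c[1]       += alpha * c10;
    c[ldc]     += alpha * c01; c[ldc + 1] += alpha * c11;
}

int main(void)
{
    float a[4] = {1, 3, 2, 4};   /* A = [1 2; 3 4], packed by k-columns      */
    float b[4] = {5, 6, 7, 8};   /* B = [5 6; 7 8], packed by k-rows         */
    float c[4] = {0, 0, 0, 0};   /* 2x2 C, ldc = 2                           */
    kernel_2x2(2, 1.0f, a, b, c, 2);
    printf("%g %g %g %g\n", c[0], c[1], c[2], c[3]);   /* 19 43 22 50 */
    return 0;
}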
*******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc #ifdef TRMMKERNEL , BLASLONG offset #endif ) { BLASLONG i, j, l, temp; #if defined(TRMMKERNEL) BLASLONG off; #endif FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; FLOAT *pa0, *pb0; FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; FLOAT tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; FLOAT a0, a1, b0, b1, b2, b3, b4, b5, b6, b7; v4f32 v_alpha = {alpha, alpha, alpha, alpha}; v4f32 src_a0, src_a1, src_b, src_b0, src_b1; v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v4f32 res0, res1, res2, res3, res4, res5, res6, res7; v4f32 res8, res9, res10, res11, res12, res13, res14, res15; #if defined(TRMMKERNEL) && !defined(LEFT) off = -offset; #endif for (j = (n >> 3); j--;) { pc0 = C; pc1 = pc0 + ldc; pc2 = pc1 + ldc; pc3 = pc2 + ldc; pc4 = pc3 + ldc; pc5 = pc4 + ldc; pc6 = pc5 + ldc; pc7 = pc6 + ldc; #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif pa0 = A; for (i = (m >> 3); i--;) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 8; pb0 = B + off * 8; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 8; // number of values in A #else temp = off + 8; // number of values in B #endif #else pb0 = B; temp = k; #endif #ifdef ENABLE_PREFETCH __asm__ __volatile__( "pref 0, 32(%[pa0]) \n\t" "pref 0, 32(%[pb0]) \n\t" : : [pa0] "r" (pa0), [pb0] "r" (pb0) ); #endif LD_SP2_INC(pa0, 4, src_a0, src_a1); LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; res1 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 = src_a0 * src_b; res3 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res4 = src_a0 * src_b; res5 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res6 = src_a0 * src_b; res7 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); res8 = src_a0 * src_b; res9 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); res10 = src_a0 * src_b; res11 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); res12 = src_a0 * src_b; res13 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); res14 = src_a0 * src_b; res15 = src_a1 * src_b; for (l = ((temp - 1) >> 1); l--;) { #ifdef ENABLE_PREFETCH __asm__ __volatile__( "pref 0, 64(%[pa0]) \n\t" "pref 0, 96(%[pa0]) \n\t" "pref 0, 64(%[pb0]) \n\t" "pref 0, 96(%[pb0]) \n\t" : : [pa0] "r" (pa0), [pb0] "r" (pb0) ); #endif LD_SP2_INC(pa0, 4, src_a0, src_a1); LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 += src_a0 * src_b; res3 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res4 += src_a0 * src_b; res5 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res6 += src_a0 * src_b; res7 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); res8 += src_a0 * src_b; res9 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); res10 += src_a0 * src_b; res11 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); res12 += src_a0 * src_b; res13 += src_a1 * src_b; src_b = 
(v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); res14 += src_a0 * src_b; res15 += src_a1 * src_b; LD_SP2_INC(pa0, 4, src_a0, src_a1); LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 += src_a0 * src_b; res3 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res4 += src_a0 * src_b; res5 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res6 += src_a0 * src_b; res7 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); res8 += src_a0 * src_b; res9 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); res10 += src_a0 * src_b; res11 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); res12 += src_a0 * src_b; res13 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); res14 += src_a0 * src_b; res15 += src_a1 * src_b; } if ((temp - 1) & 1) { LD_SP2_INC(pa0, 4, src_a0, src_a1); LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 += src_a0 * src_b; res3 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res4 += src_a0 * src_b; res5 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res6 += src_a0 * src_b; res7 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); res8 += src_a0 * src_b; res9 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); res10 += src_a0 * src_b; res11 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); res12 += src_a0 * src_b; res13 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); res14 += src_a0 * src_b; res15 += src_a1 * src_b; } #if defined(TRMMKERNEL) dst0 = res0 * v_alpha; dst1 = res1 * v_alpha; dst2 = res2 * v_alpha; dst3 = res3 * v_alpha; dst4 = res4 * v_alpha; dst5 = res5 * v_alpha; dst6 = res6 * v_alpha; dst7 = res7 * v_alpha; #else LD_SP2(pc0, 4, dst0, dst1); LD_SP2(pc1, 4, dst2, dst3); LD_SP2(pc2, 4, dst4, dst5); LD_SP2(pc3, 4, dst6, dst7); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; dst4 += res4 * v_alpha; dst5 += res5 * v_alpha; dst6 += res6 * v_alpha; dst7 += res7 * v_alpha; #endif ST_SP2_INC(dst0, dst1, pc0, 4); ST_SP2_INC(dst2, dst3, pc1, 4); ST_SP2_INC(dst4, dst5, pc2, 4); ST_SP2_INC(dst6, dst7, pc3, 4); #if defined(TRMMKERNEL) dst0 = res8 * v_alpha; dst1 = res9 * v_alpha; dst2 = res10 * v_alpha; dst3 = res11 * v_alpha; dst4 = res12 * v_alpha; dst5 = res13 * v_alpha; dst6 = res14 * v_alpha; dst7 = res15 * v_alpha; #else LD_SP2(pc4, 4, dst0, dst1); LD_SP2(pc5, 4, dst2, dst3); LD_SP2(pc6, 4, dst4, dst5); LD_SP2(pc7, 4, dst6, dst7); dst0 += res8 * v_alpha; dst1 += res9 * v_alpha; dst2 += res10 * v_alpha; dst3 += res11 * v_alpha; dst4 += res12 * v_alpha; dst5 += res13 * v_alpha; dst6 += res14 * v_alpha; dst7 += res15 * v_alpha; #endif ST_SP2_INC(dst0, dst1, pc4, 4); ST_SP2_INC(dst2, dst3, pc5, 4); ST_SP2_INC(dst4, dst5, pc6, 4); ST_SP2_INC(dst6, dst7, pc7, 4); #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 8; // number of values in A #else temp -= 8; // number of values in B #endif pa0 += temp * 8; pb0 += temp * 8; #endif #ifdef LEFT off += 8; // number of values in A #endif #endif } if (m & 4) { #if defined(TRMMKERNEL) #if (defined(LEFT) && 
defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 4; pb0 = B + off * 8; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 4; // number of values in A #else temp = off + 8; // number of values in B #endif #else pb0 = B; temp = k; #endif src_a0 = LD_SP(pa0); LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 = src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res2 = src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res3 = src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); res4 = src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); res5 = src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); res6 = src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); res7 = src_a0 * src_b; pa0 += 4; for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_SP(pa0); LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res2 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res3 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); res4 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); res5 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); res6 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); res7 += src_a0 * src_b; pa0 += 4; src_a0 = LD_SP(pa0); LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res2 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res3 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); res4 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); res5 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); res6 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); res7 += src_a0 * src_b; pa0 += 4; } if ((temp - 1) & 1) { src_a0 = LD_SP(pa0); LD_SP2_INC(pb0, 4, src_b0, src_b1); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res2 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res3 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0); res4 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55); res5 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA); res6 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF); res7 += src_a0 * src_b; pa0 += 4; } #if defined(TRMMKERNEL) dst0 = res0 * v_alpha; dst1 = res1 * v_alpha; dst2 = res2 * v_alpha; dst3 = res3 * v_alpha; #else dst0 = LD_SP(pc0); dst1 = LD_SP(pc1); dst2 = LD_SP(pc2); dst3 = LD_SP(pc3); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; #endif ST_SP(dst0, pc0); ST_SP(dst1, pc1); ST_SP(dst2, pc2); ST_SP(dst3, pc3); #if defined(TRMMKERNEL) dst0 = res4 * v_alpha; dst1 = res5 * v_alpha; dst2 = res6 * v_alpha; dst3 = res7 * v_alpha; #else dst0 
= LD_SP(pc4); dst1 = LD_SP(pc5); dst2 = LD_SP(pc6); dst3 = LD_SP(pc7); dst0 += res4 * v_alpha; dst1 += res5 * v_alpha; dst2 += res6 * v_alpha; dst3 += res7 * v_alpha; #endif ST_SP(dst0, pc4); ST_SP(dst1, pc5); ST_SP(dst2, pc6); ST_SP(dst3, pc7); pc0 += 4; pc1 += 4; pc2 += 4; pc3 += 4; pc4 += 4; pc5 += 4; pc6 += 4; pc7 += 4; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 4; // number of values in A #else temp -= 8; // number of values in B #endif pa0 += temp * 4; pb0 += temp * 8; #endif #ifdef LEFT off += 4; // number of values in A #endif #endif } if (m & 2) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2; pb0 = B + off * 8; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 2; // number of values in A #else temp = off + 8; // number of values in B #endif #else pb0 = B; temp = k; #endif a0 = pa0[0]; b0 = pb0[0]; tmp0 = a0 * b0; a1 = pa0[1]; tmp1 = a1 * b0; b1 = pb0[1]; tmp2 = a0 * b1; tmp3 = a1 * b1; b2 = pb0[2]; tmp4 = a0 * b2; tmp5 = a1 * b2; b3 = pb0[3]; tmp6 = a0 * b3; tmp7 = a1 * b3; b4 = pb0[4]; tmp8 = a0 * b4; tmp9 = a1 * b4; b5 = pb0[5]; tmp10 = a0 * b5; tmp11 = a1 * b5; b6 = pb0[6]; tmp12 = a0 * b6; tmp13 = a1 * b6; b7 = pb0[7]; tmp14 = a0 * b7; tmp15 = a1 * b7; pa0 += 2; pb0 += 8; for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; b1 = pb0[1]; tmp2 += a0 * b1; tmp3 += a1 * b1; b2 = pb0[2]; tmp4 += a0 * b2; tmp5 += a1 * b2; b3 = pb0[3]; tmp6 += a0 * b3; tmp7 += a1 * b3; b4 = pb0[4]; tmp8 += a0 * b4; tmp9 += a1 * b4; b5 = pb0[5]; tmp10 += a0 * b5; tmp11 += a1 * b5; b6 = pb0[6]; tmp12 += a0 * b6; tmp13 += a1 * b6; b7 = pb0[7]; tmp14 += a0 * b7; tmp15 += a1 * b7; pa0 += 2; pb0 += 8; a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; b1 = pb0[1]; tmp2 += a0 * b1; tmp3 += a1 * b1; b2 = pb0[2]; tmp4 += a0 * b2; tmp5 += a1 * b2; b3 = pb0[3]; tmp6 += a0 * b3; tmp7 += a1 * b3; b4 = pb0[4]; tmp8 += a0 * b4; tmp9 += a1 * b4; b5 = pb0[5]; tmp10 += a0 * b5; tmp11 += a1 * b5; b6 = pb0[6]; tmp12 += a0 * b6; tmp13 += a1 * b6; b7 = pb0[7]; tmp14 += a0 * b7; tmp15 += a1 * b7; pa0 += 2; pb0 += 8; } if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; b1 = pb0[1]; tmp2 += a0 * b1; tmp3 += a1 * b1; b2 = pb0[2]; tmp4 += a0 * b2; tmp5 += a1 * b2; b3 = pb0[3]; tmp6 += a0 * b3; tmp7 += a1 * b3; b4 = pb0[4]; tmp8 += a0 * b4; tmp9 += a1 * b4; b5 = pb0[5]; tmp10 += a0 * b5; tmp11 += a1 * b5; b6 = pb0[6]; tmp12 += a0 * b6; tmp13 += a1 * b6; b7 = pb0[7]; tmp14 += a0 * b7; tmp15 += a1 * b7; pa0 += 2; pb0 += 8; } tmp0 = alpha * tmp0; tmp2 = alpha * tmp2; tmp4 = alpha * tmp4; tmp6 = alpha * tmp6; tmp8 = alpha * tmp8; tmp10 = alpha * tmp10; tmp12 = alpha * tmp12; tmp14 = alpha * tmp14; #if defined(TRMMKERNEL) pc0[0] = tmp0; pc1[0] = tmp2; pc2[0] = tmp4; pc3[0] = tmp6; pc4[0] = tmp8; pc5[0] = tmp10; pc6[0] = tmp12; pc7[0] = tmp14; #else pc0[0] += tmp0; pc1[0] += tmp2; pc2[0] += tmp4; pc3[0] += tmp6; pc4[0] += tmp8; pc5[0] += tmp10; pc6[0] += tmp12; pc7[0] += tmp14; #endif tmp1 = alpha * tmp1; tmp3 = alpha * tmp3; tmp5 = alpha * tmp5; tmp7 = alpha * tmp7; tmp9 = alpha * tmp9; tmp11 = alpha * tmp11; tmp13 = alpha * tmp13; tmp15 = alpha * tmp15; #if defined(TRMMKERNEL) pc0[1] = tmp1; pc1[1] = tmp3; pc2[1] = tmp5; pc3[1] = tmp7; pc4[1] = 
tmp9; pc5[1] = tmp11; pc6[1] = tmp13; pc7[1] = tmp15; #else pc0[1] += tmp1; pc1[1] += tmp3; pc2[1] += tmp5; pc3[1] += tmp7; pc4[1] += tmp9; pc5[1] += tmp11; pc6[1] += tmp13; pc7[1] += tmp15; #endif pc0 += 2; pc1 += 2; pc2 += 2; pc3 += 2; pc4 += 2; pc5 += 2; pc6 += 2; pc7 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 2; // number of values in A #else temp -= 8; // number of values in B #endif pa0 += temp * 2; pb0 += temp * 8; #endif #ifdef LEFT off += 2; // number of values in A #endif #endif } if (m & 1) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 1; pb0 = B + off * 8; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 1; // number of values in A #else temp = off + 8; // number of values in B #endif #else pb0 = B; temp = k; #endif a0 = pa0[0]; b0 = pb0[0]; tmp0 = a0 * b0; b1 = pb0[1]; tmp1 = a0 * b1; b2 = pb0[2]; tmp2 = a0 * b2; b3 = pb0[3]; tmp3 = a0 * b3; b4 = pb0[4]; tmp4 = a0 * b4; b5 = pb0[5]; tmp5 = a0 * b5; b6 = pb0[6]; tmp6 = a0 * b6; b7 = pb0[7]; tmp7 = a0 * b7; pa0 += 1; pb0 += 8; for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; b2 = pb0[2]; tmp2 += a0 * b2; b3 = pb0[3]; tmp3 += a0 * b3; b4 = pb0[4]; tmp4 += a0 * b4; b5 = pb0[5]; tmp5 += a0 * b5; b6 = pb0[6]; tmp6 += a0 * b6; b7 = pb0[7]; tmp7 += a0 * b7; pa0 += 1; pb0 += 8; a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; b2 = pb0[2]; tmp2 += a0 * b2; b3 = pb0[3]; tmp3 += a0 * b3; b4 = pb0[4]; tmp4 += a0 * b4; b5 = pb0[5]; tmp5 += a0 * b5; b6 = pb0[6]; tmp6 += a0 * b6; b7 = pb0[7]; tmp7 += a0 * b7; pa0 += 1; pb0 += 8; } if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; b2 = pb0[2]; tmp2 += a0 * b2; b3 = pb0[3]; tmp3 += a0 * b3; b4 = pb0[4]; tmp4 += a0 * b4; b5 = pb0[5]; tmp5 += a0 * b5; b6 = pb0[6]; tmp6 += a0 * b6; b7 = pb0[7]; tmp7 += a0 * b7; pa0 += 1; pb0 += 8; } tmp0 = alpha * tmp0; tmp1 = alpha * tmp1; tmp2 = alpha * tmp2; tmp3 = alpha * tmp3; tmp4 = alpha * tmp4; tmp5 = alpha * tmp5; tmp6 = alpha * tmp6; tmp7 = alpha * tmp7; #if defined(TRMMKERNEL) pc0[0] = tmp0; pc1[0] = tmp1; pc2[0] = tmp2; pc3[0] = tmp3; pc4[0] = tmp4; pc5[0] = tmp5; pc6[0] = tmp6; pc7[0] = tmp7; #else pc0[0] += tmp0; pc1[0] += tmp1; pc2[0] += tmp2; pc3[0] += tmp3; pc4[0] += tmp4; pc5[0] += tmp5; pc6[0] += tmp6; pc7[0] += tmp7; #endif pc0 += 1; pc1 += 1; pc2 += 1; pc3 += 1; pc4 += 1; pc5 += 1; pc6 += 1; pc7 += 1; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 1; // number of values in A #else temp -= 8; // number of values in B #endif pa0 += temp * 1; pb0 += temp * 8; #endif #ifdef LEFT off += 1; // number of values in A #endif #endif } #if defined(TRMMKERNEL) && !defined(LEFT) off += 8; // number of values in A #endif B += (k << 3); C += (ldc << 3); } if (n & 4) { pc0 = C; pc1 = pc0 + ldc; pc2 = pc1 + ldc; pc3 = pc2 + ldc; #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif pa0 = A; for (i = (m >> 3); i--;) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 8; pb0 = B + off * 4; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - 
off; #elif defined(LEFT) temp = off + 8; // number of values in A #else temp = off + 4; // number of values in B #endif #else pb0 = B; temp = k; #endif LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; res1 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 = src_a0 * src_b; res3 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res4 = src_a0 * src_b; res5 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res6 = src_a0 * src_b; res7 = src_a1 * src_b; pb0 += 4; for (l = ((temp - 1) >> 1); l--;) { LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 += src_a0 * src_b; res3 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res4 += src_a0 * src_b; res5 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res6 += src_a0 * src_b; res7 += src_a1 * src_b; pb0 += 4; LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 += src_a0 * src_b; res3 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res4 += src_a0 * src_b; res5 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res6 += src_a0 * src_b; res7 += src_a1 * src_b; pb0 += 4; } if ((temp - 1) & 1) { LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 += src_a0 * src_b; res3 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res4 += src_a0 * src_b; res5 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res6 += src_a0 * src_b; res7 += src_a1 * src_b; pb0 += 4; } #if defined(TRMMKERNEL) dst0 = res0 * v_alpha; dst1 = res1 * v_alpha; dst2 = res2 * v_alpha; dst3 = res3 * v_alpha; dst4 = res4 * v_alpha; dst5 = res5 * v_alpha; dst6 = res6 * v_alpha; dst7 = res7 * v_alpha; #else LD_SP2(pc0, 4, dst0, dst1); LD_SP2(pc1, 4, dst2, dst3); LD_SP2(pc2, 4, dst4, dst5); LD_SP2(pc3, 4, dst6, dst7); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; dst4 += res4 * v_alpha; dst5 += res5 * v_alpha; dst6 += res6 * v_alpha; dst7 += res7 * v_alpha; #endif ST_SP2_INC(dst0, dst1, pc0, 4); ST_SP2_INC(dst2, dst3, pc1, 4); ST_SP2_INC(dst4, dst5, pc2, 4); ST_SP2_INC(dst6, dst7, pc3, 4); #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 8; // number of values in A #else temp -= 4; // number of values in B #endif pa0 += temp * 8; pb0 += temp * 4; #endif #ifdef LEFT off += 8; // number of values in A #endif #endif } if (m & 4) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 4; pb0 = B + off * 4; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 4; // number of values in A #else temp = off + 4; // number of values in B #endif #else pb0 = B; temp = k; #endif src_a0 = LD_SP(pa0); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; src_b = 
(v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 = src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res2 = src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res3 = src_a0 * src_b; pa0 += 4; pb0 += 4; for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_SP(pa0); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res2 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res3 += src_a0 * src_b; pa0 += 4; pb0 += 4; src_a0 = LD_SP(pa0); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res2 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res3 += src_a0 * src_b; pa0 += 4; pb0 += 4; } if ((temp - 1) & 1) { src_a0 = LD_SP(pa0); src_b0 = LD_SP(pb0); src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA); res2 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF); res3 += src_a0 * src_b; pa0 += 4; pb0 += 4; } #if defined(TRMMKERNEL) dst0 = res0 * v_alpha; dst1 = res1 * v_alpha; dst2 = res2 * v_alpha; dst3 = res3 * v_alpha; #else dst0 = LD_SP(pc0); dst1 = LD_SP(pc1); dst2 = LD_SP(pc2); dst3 = LD_SP(pc3); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; #endif ST_SP(dst0, pc0); ST_SP(dst1, pc1); ST_SP(dst2, pc2); ST_SP(dst3, pc3); pc0 += 4; pc1 += 4; pc2 += 4; pc3 += 4; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 4; // number of values in A #else temp -= 4; // number of values in B #endif pa0 += temp * 4; pb0 += temp * 4; #endif #ifdef LEFT off += 4; // number of values in A #endif #endif } if (m & 2) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2; pb0 = B + off * 4; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 2; // number of values in A #else temp = off + 4; // number of values in B #endif #else pb0 = B; temp = k; #endif a0 = pa0[0]; b0 = pb0[0]; tmp0 = a0 * b0; a1 = pa0[1]; tmp1 = a1 * b0; b1 = pb0[1]; tmp2 = a0 * b1; tmp3 = a1 * b1; b2 = pb0[2]; tmp4 = a0 * b2; tmp5 = a1 * b2; b3 = pb0[3]; tmp6 = a0 * b3; tmp7 = a1 * b3; pa0 += 2; pb0 += 4; for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; b1 = pb0[1]; tmp2 += a0 * b1; tmp3 += a1 * b1; b2 = pb0[2]; tmp4 += a0 * b2; tmp5 += a1 * b2; b3 = pb0[3]; tmp6 += a0 * b3; tmp7 += a1 * b3; pa0 += 2; pb0 += 4; a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; b1 = pb0[1]; tmp2 += a0 * b1; tmp3 += a1 * b1; b2 = pb0[2]; tmp4 += a0 * b2; tmp5 += a1 * b2; b3 = pb0[3]; tmp6 += a0 * b3; tmp7 += a1 * b3; pa0 += 2; pb0 += 4; } if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; b1 = pb0[1]; tmp2 += a0 * b1; tmp3 += a1 * b1; b2 = pb0[2]; tmp4 += a0 * b2; tmp5 += a1 * b2; b3 = pb0[3]; tmp6 += a0 * b3; tmp7 += a1 * b3; pa0 += 2; pb0 += 4; } tmp0 = alpha * tmp0; tmp2 = alpha * tmp2; 
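/* 2x4 tail write-back: all eight scalar accumulators (tmp0..tmp7) are scaled
   by alpha here and in the statements that follow; the TRMM build then stores
   them to C while the plain GEMM build adds them onto the existing C values. */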
tmp4 = alpha * tmp4; tmp6 = alpha * tmp6; #if defined(TRMMKERNEL) pc0[0] = tmp0; pc1[0] = tmp2; pc2[0] = tmp4; pc3[0] = tmp6; #else pc0[0] += tmp0; pc1[0] += tmp2; pc2[0] += tmp4; pc3[0] += tmp6; #endif tmp1 = alpha * tmp1; tmp3 = alpha * tmp3; tmp5 = alpha * tmp5; tmp7 = alpha * tmp7; #if defined(TRMMKERNEL) pc0[1] = tmp1; pc1[1] = tmp3; pc2[1] = tmp5; pc3[1] = tmp7; #else pc0[1] += tmp1; pc1[1] += tmp3; pc2[1] += tmp5; pc3[1] += tmp7; #endif pc0 += 2; pc1 += 2; pc2 += 2; pc3 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 2; // number of values in A #else temp -= 4; // number of values in B #endif pa0 += temp * 2; pb0 += temp * 4; #endif #ifdef LEFT off += 2; // number of values in A #endif #endif } if (m & 1) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 1; pb0 = B + off * 4; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 1; // number of values in A #else temp = off + 4; // number of values in B #endif #else pb0 = B; temp = k; #endif a0 = pa0[0]; b0 = pb0[0]; tmp0 = a0 * b0; b1 = pb0[1]; tmp1 = a0 * b1; b2 = pb0[2]; tmp2 = a0 * b2; b3 = pb0[3]; tmp3 = a0 * b3; pa0 += 1; pb0 += 4; for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; b2 = pb0[2]; tmp2 += a0 * b2; b3 = pb0[3]; tmp3 += a0 * b3; pa0 += 1; pb0 += 4; a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; b2 = pb0[2]; tmp2 += a0 * b2; b3 = pb0[3]; tmp3 += a0 * b3; pa0 += 1; pb0 += 4; } if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; b2 = pb0[2]; tmp2 += a0 * b2; b3 = pb0[3]; tmp3 += a0 * b3; pa0 += 1; pb0 += 4; } tmp0 = alpha * tmp0; tmp1 = alpha * tmp1; tmp2 = alpha * tmp2; tmp3 = alpha * tmp3; #if defined(TRMMKERNEL) pc0[0] = tmp0; pc1[0] = tmp1; pc2[0] = tmp2; pc3[0] = tmp3; #else pc0[0] += tmp0; pc1[0] += tmp1; pc2[0] += tmp2; pc3[0] += tmp3; #endif pc0 += 1; pc1 += 1; pc2 += 1; pc3 += 1; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 1; // number of values in A #else temp -= 4; // number of values in B #endif pa0 += temp * 1; pb0 += temp * 4; #endif #ifdef LEFT off += 1; // number of values in A #endif #endif } #if defined(TRMMKERNEL) && !defined(LEFT) off += 4; // number of values in A #endif B += (k << 2); C += (ldc << 2); } if (n & 2) { pc0 = C; pc1 = pc0 + ldc; #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif pa0 = A; for (i = (m >> 3); i--;) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 8; pb0 = B + off * 2; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 8; // number of values in A #else temp = off + 2; // number of values in B #endif #else pb0 = B; temp = k; #endif LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; res1 = src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 = src_a0 * src_b; res3 = src_a1 * src_b; pb0 += 2; for (l = ((temp - 1) >> 1); l--;) { LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; src_b = 
(v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 += src_a0 * src_b; res3 += src_a1 * src_b; pb0 += 2; LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 += src_a0 * src_b; res3 += src_a1 * src_b; pb0 += 2; } if ((temp - 1) & 1) { LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res2 += src_a0 * src_b; res3 += src_a1 * src_b; pb0 += 2; } #if defined(TRMMKERNEL) dst0 = res0 * v_alpha; dst1 = res1 * v_alpha; dst2 = res2 * v_alpha; dst3 = res3 * v_alpha; #else LD_SP2(pc0, 4, dst0, dst1); LD_SP2(pc1, 4, dst2, dst3); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; dst2 += res2 * v_alpha; dst3 += res3 * v_alpha; #endif ST_SP2_INC(dst0, dst1, pc0, 4); ST_SP2_INC(dst2, dst3, pc1, 4); #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 8; // number of values in A #else temp -= 2; // number of values in B #endif pa0 += temp * 8; pb0 += temp * 2; #endif #ifdef LEFT off += 8; // number of values in A #endif #endif } if (m & 4) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 4; pb0 = B + off * 2; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 4; // number of values in A #else temp = off + 2; // number of values in B #endif #else pb0 = B; temp = k; #endif src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 = src_a0 * src_b; pa0 += 4; pb0 += 2; for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 += src_a0 * src_b; pa0 += 4; pb0 += 2; src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 += src_a0 * src_b; pa0 += 4; pb0 += 2; } if ((temp - 1) & 1) { src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; src_b0[1] = pb0[1]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55); res1 += src_a0 * src_b; pa0 += 4; pb0 += 2; } #if defined(TRMMKERNEL) dst0 = res0 * v_alpha; dst1 = res1 * v_alpha; #else dst0 = LD_SP(pc0); dst1 = LD_SP(pc1); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; #endif ST_SP(dst0, pc0); ST_SP(dst1, pc1); pc0 += 4; pc1 += 4; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 4; // number of values in A #else temp -= 2; // number of values in B #endif pa0 += temp * 4; pb0 += temp * 2; #endif #ifdef LEFT off += 4; // number of values in A #endif #endif } if (m & 2) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2; pb0 = B + off * 2; #endif 
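/* TRMM setup for the 2x2 tail block: depending on LEFT/TRANSA the packed A/B
   pointers above either stay at the start of their panels or skip the first
   `off` k-steps, and the conditional below picks the matching inner-loop
   length `temp` (k - off, or off plus the block dimension). */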
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 2; // number of values in A #else temp = off + 2; // number of values in B #endif #else pb0 = B; temp = k; #endif a0 = pa0[0]; b0 = pb0[0]; tmp0 = a0 * b0; a1 = pa0[1]; tmp1 = a1 * b0; b1 = pb0[1]; tmp2 = a0 * b1; tmp3 = a1 * b1; pa0 += 2; pb0 += 2; for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; b1 = pb0[1]; tmp2 += a0 * b1; tmp3 += a1 * b1; pa0 += 2; pb0 += 2; a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; b1 = pb0[1]; tmp2 += a0 * b1; tmp3 += a1 * b1; pa0 += 2; pb0 += 2; } if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; b1 = pb0[1]; tmp2 += a0 * b1; tmp3 += a1 * b1; pa0 += 2; pb0 += 2; } tmp0 = alpha * tmp0; tmp1 = alpha * tmp1; tmp2 = alpha * tmp2; tmp3 = alpha * tmp3; #if defined(TRMMKERNEL) pc0[0] = tmp0; pc1[0] = tmp2; pc0[1] = tmp1; pc1[1] = tmp3; #else pc0[0] += tmp0; pc1[0] += tmp2; pc0[1] += tmp1; pc1[1] += tmp3; #endif pc0 += 2; pc1 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 2; // number of values in A #else temp -= 2; // number of values in B #endif pa0 += temp * 2; pb0 += temp * 2; #endif #ifdef LEFT off += 2; // number of values in A #endif #endif } if (m & 1) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 1; pb0 = B + off * 2; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 1; // number of values in A #else temp = off + 2; // number of values in B #endif #else pb0 = B; temp = k; #endif a0 = pa0[0]; b0 = pb0[0]; tmp0 = a0 * b0; b1 = pb0[1]; tmp1 = a0 * b1; pa0 += 1; pb0 += 2; for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; pa0 += 1; pb0 += 2; a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; pa0 += 1; pb0 += 2; } if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; b1 = pb0[1]; tmp1 += a0 * b1; pa0 += 1; pb0 += 2; } tmp0 = alpha * tmp0; tmp1 = alpha * tmp1; #if defined(TRMMKERNEL) pc0[0] = tmp0; pc1[0] = tmp1; #else pc0[0] += tmp0; pc1[0] += tmp1; #endif pc0 += 1; pc1 += 1; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 1; // number of values in A #else temp -= 2; // number of values in B #endif pa0 += temp * 1; pb0 += temp * 2; #endif #ifdef LEFT off += 1; // number of values in A #endif #endif } #if defined(TRMMKERNEL) && !defined(LEFT) off += 2; // number of values in A #endif B += (k << 1); C += (ldc << 1); } if (n & 1) { pc0 = C; #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif pa0 = A; for (i = (m >> 3); i--;) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 8; pb0 = B + off * 1; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 8; // number of values in A #else temp = off + 1; // number of values in B #endif #else pb0 = B; temp = k; #endif LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; res1 = src_a1 
* src_b; pb0 += 1; for (l = ((temp - 1) >> 1); l--;) { LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; pb0 += 1; LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; pb0 += 1; } if ((temp - 1) & 1) { LD_SP2_INC(pa0, 4, src_a0, src_a1); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; res1 += src_a1 * src_b; pb0 += 1; } #if defined(TRMMKERNEL) dst0 = res0 * v_alpha; dst1 = res1 * v_alpha; #else LD_SP2(pc0, 4, dst0, dst1); dst0 += res0 * v_alpha; dst1 += res1 * v_alpha; #endif ST_SP2_INC(dst0, dst1, pc0, 4); #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 8; // number of values in A #else temp -= 1; // number of values in B #endif pa0 += temp * 8; pb0 += temp * 1; #endif #ifdef LEFT off += 8; // number of values in A #endif #endif } if (m & 4) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 4; pb0 = B + off * 1; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 4; // number of values in A #else temp = off + 1; // number of values in B #endif #else pb0 = B; temp = k; #endif src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 = src_a0 * src_b; pa0 += 4; pb0 += 1; for (l = ((temp - 1) >> 1); l--;) { src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; pa0 += 4; pb0 += 1; src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; pa0 += 4; pb0 += 1; } if ((temp - 1) & 1) { src_a0 = LD_SP(pa0); src_b0[0] = pb0[0]; src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0); res0 += src_a0 * src_b; pa0 += 4; pb0 += 1; } #if defined(TRMMKERNEL) dst0 = res0 * v_alpha; #else dst0 = LD_SP(pc0); dst0 += res0 * v_alpha; #endif ST_SP(dst0, pc0); pc0 += 4; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 4; // number of values in A #else temp -= 1; // number of values in B #endif pa0 += temp * 4; pb0 += temp * 1; #endif #ifdef LEFT off += 4; // number of values in A #endif #endif } if (m & 2) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2; pb0 = B + off * 1; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 2; // number of values in A #else temp = off + 1; // number of values in B #endif #else pb0 = B; temp = k; #endif a0 = pa0[0]; b0 = pb0[0]; tmp0 = a0 * b0; a1 = pa0[1]; tmp1 = a1 * b0; pa0 += 2; pb0 += 1; for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; pa0 += 2; pb0 += 1; a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; pa0 += 2; pb0 += 1; } if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; a1 = pa0[1]; tmp1 += a1 * b0; pa0 += 2; pb0 += 1; } tmp0 = alpha * tmp0; tmp1 = alpha * tmp1; #if defined(TRMMKERNEL) pc0[0] = tmp0; pc0[1] = tmp1; #else pc0[0] += tmp0; pc0[1] += tmp1; #endif pc0 += 2; #if 
defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 2; // number of values in A #else temp -= 1; // number of values in B #endif pa0 += temp * 2; pb0 += temp * 1; #endif #ifdef LEFT off += 2; // number of values in A #endif #endif } if (m & 1) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 1; pb0 = B + off * 1; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 1; // number of values in A #else temp = off + 1; // number of values in B #endif #else pb0 = B; temp = k; #endif a0 = pa0[0]; b0 = pb0[0]; tmp0 = a0 * b0; pa0 += 1; pb0 += 1; for (l = ((temp - 1) >> 1); l--;) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; pa0 += 1; pb0 += 1; a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; pa0 += 1; pb0 += 1; } if ((temp - 1) & 1) { a0 = pa0[0]; b0 = pb0[0]; tmp0 += a0 * b0; pa0 += 1; pb0 += 1; } #if defined(TRMMKERNEL) pc0[0] = alpha * tmp0; #else pc0[0] += alpha * tmp0; #endif } } return 0; } OpenBLAS-0.2.20/kernel/mips/sgemm_ncopy_8_msa.c000066400000000000000000000123161313527062700211520ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
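A note on the packing performed below (a minimal scalar sketch; the helper
name sgemm_ncopy_8_ref and the plain index loops are illustrative only and are
not part of this file): the routine packs an m x n column-major block of `src`
(column stride lda) into contiguous panels of 8, 4, 2 and finally 1 column(s),
emitting one source row of a panel at a time.

    static void sgemm_ncopy_8_ref(long m, long n, const float *src, long lda,
                                  float *dst)
    {
        long i, j, jj;
        for (j = 0; j + 8 <= n; j += 8)      // full panels of 8 columns
            for (i = 0; i < m; i++)          // one source row of the panel at a time
                for (jj = 0; jj < 8; jj++)
                    *dst++ = src[i + (j + jj) * lda];
        // the n & 4, n & 2 and n & 1 tails below pack the leftover columns
        // with the same row-by-row interleaving
    }

The MSA code below produces the same layout using vector loads and 4x4
transposes instead of scalar gathers.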
*******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) { BLASLONG i, j; FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; FLOAT *psrc8, *pdst; v4f32 src0, src1, src2, src3, src4, src5, src6, src7; v4f32 src8, src9, src10, src11, src12, src13, src14, src15; v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v4f32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; psrc0 = src; pdst = dst; for (j = (n >> 3); j--;) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc3 = psrc2 + lda; psrc4 = psrc3 + lda; psrc5 = psrc4 + lda; psrc6 = psrc5 + lda; psrc7 = psrc6 + lda; psrc8 = psrc7 + lda; psrc0 += 8 * lda; for (i = (m >> 3); i--;) { LD_SP2_INC(psrc1, 4, src0, src1); LD_SP2_INC(psrc2, 4, src2, src3); LD_SP2_INC(psrc3, 4, src4, src5); LD_SP2_INC(psrc4, 4, src6, src7); LD_SP2_INC(psrc5, 4, src8, src9); LD_SP2_INC(psrc6, 4, src10, src11); LD_SP2_INC(psrc7, 4, src12, src13); LD_SP2_INC(psrc8, 4, src14, src15); TRANSPOSE4x4_SP_SP(src0, src2, src4, src6, dst0, dst2, dst4, dst6); TRANSPOSE4x4_SP_SP(src8, src10, src12, src14, dst1, dst3, dst5, dst7); TRANSPOSE4x4_SP_SP(src1, src3, src5, src7, dst8, dst10, dst12, dst14); TRANSPOSE4x4_SP_SP(src9, src11, src13, src15, dst9, dst11, dst13, dst15); ST_SP2_INC(dst0, dst1, pdst, 4); ST_SP2_INC(dst2, dst3, pdst, 4); ST_SP2_INC(dst4, dst5, pdst, 4); ST_SP2_INC(dst6, dst7, pdst, 4); ST_SP2_INC(dst8, dst9, pdst, 4); ST_SP2_INC(dst10, dst11, pdst, 4); ST_SP2_INC(dst12, dst13, pdst, 4); ST_SP2_INC(dst14, dst15, pdst, 4); } for (i = (m & 7); i--;) { *pdst++ = *psrc1++; *pdst++ = *psrc2++; *pdst++ = *psrc3++; *pdst++ = *psrc4++; *pdst++ = *psrc5++; *pdst++ = *psrc6++; *pdst++ = *psrc7++; *pdst++ = *psrc8++; } } if (n & 4) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc3 = psrc2 + lda; psrc4 = psrc3 + lda; psrc0 += 4 * lda; for (i = (m >> 2); i--;) { src0 = LD_SP(psrc1); src1 = LD_SP(psrc2); src2 = LD_SP(psrc3); src3 = LD_SP(psrc4); psrc1 += 4; psrc2 += 4; psrc3 += 4; psrc4 += 4; TRANSPOSE4x4_SP_SP(src0, src1, src2, src3, dst0, dst1, dst2, dst3); ST_SP2_INC(dst0, dst1, pdst, 4); ST_SP2_INC(dst2, dst3, pdst, 4); } for (i = (m & 3); i--;) { *pdst++ = *psrc1++; *pdst++ = *psrc2++; *pdst++ = *psrc3++; *pdst++ = *psrc4++; } } if (n & 2) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc0 += 2 * lda; for (i = (m >> 1); i--;) { *pdst++ = *psrc1++; *pdst++ = *psrc2++; *pdst++ = *psrc1++; *pdst++ = *psrc2++; } if (m & 1) { *pdst++ = *psrc1++; *pdst++ = *psrc2++; } } if (n & 1) { psrc1 = psrc0; for (i = m; i--;) { *pdst++ = *psrc1++; } } return 0; } OpenBLAS-0.2.20/kernel/mips/sgemm_tcopy_8_msa.c000066400000000000000000000164461313527062700211700ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) { BLASLONG i, j; FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; FLOAT *psrc8, *pdst0, *pdst1, *pdst2, *pdst3, *pdst4; v4f32 src0, src1, src2, src3, src4, src5, src6, src7; v4f32 src8, src9, src10, src11, src12, src13, src14, src15; psrc0 = src; pdst0 = dst; pdst2 = dst + m * (n & ~7); pdst3 = dst + m * (n & ~3); pdst4 = dst + m * (n & ~1); for (j = (m >> 3); j--;) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc3 = psrc2 + lda; psrc4 = psrc3 + lda; psrc5 = psrc4 + lda; psrc6 = psrc5 + lda; psrc7 = psrc6 + lda; psrc8 = psrc7 + lda; psrc0 += 8 * lda; pdst1 = pdst0; pdst0 += 64; for (i = (n >> 3); i--;) { LD_SP2_INC(psrc1, 4, src0, src1); LD_SP2_INC(psrc2, 4, src2, src3); LD_SP2_INC(psrc3, 4, src4, src5); LD_SP2_INC(psrc4, 4, src6, src7); LD_SP2_INC(psrc5, 4, src8, src9); LD_SP2_INC(psrc6, 4, src10, src11); LD_SP2_INC(psrc7, 4, src12, src13); LD_SP2_INC(psrc8, 4, src14, src15); ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 4); ST_SP8(src8, src9, src10, src11, src12, src13, src14, src15, pdst1 + 32, 4); pdst1 += m * 8; } if (n & 4) { src0 = LD_SP(psrc1); src1 = LD_SP(psrc2); src2 = LD_SP(psrc3); src3 = LD_SP(psrc4); src4 = LD_SP(psrc5); src5 = LD_SP(psrc6); src6 = LD_SP(psrc7); src7 = LD_SP(psrc8); psrc1 += 4; psrc2 += 4; psrc3 += 4; psrc4 += 4; psrc5 += 4; psrc6 += 4; psrc7 += 4; psrc8 += 4; ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 4); } if (n & 2) { *pdst3++ = *psrc1++; *pdst3++ = *psrc1++; *pdst3++ = *psrc2++; *pdst3++ = *psrc2++; *pdst3++ = *psrc3++; *pdst3++ = *psrc3++; *pdst3++ = *psrc4++; *pdst3++ = *psrc4++; *pdst3++ = *psrc5++; *pdst3++ = *psrc5++; *pdst3++ = *psrc6++; *pdst3++ = *psrc6++; *pdst3++ = *psrc7++; *pdst3++ = *psrc7++; *pdst3++ = *psrc8++; *pdst3++ = *psrc8++; } if (n & 1) { *pdst4++ = *psrc1++; *pdst4++ = *psrc2++; *pdst4++ = *psrc3++; *pdst4++ = *psrc4++; *pdst4++ = *psrc5++; *pdst4++ = *psrc6++; *pdst4++ = *psrc7++; *pdst4++ = *psrc8++; } } if (m & 4) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc3 = psrc2 + lda; psrc4 = psrc3 + lda; psrc0 += 4 * lda; pdst1 = pdst0; pdst0 += 32; for (i = (n >> 3); i--;) { LD_SP2_INC(psrc1, 4, src0, src1); LD_SP2_INC(psrc2, 4, src2, src3); LD_SP2_INC(psrc3, 4, src4, src5); LD_SP2_INC(psrc4, 4, src6, src7); ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 4); pdst1 += 8 * m; } if (n & 4) { src0 = LD_SP(psrc1); src1 = LD_SP(psrc2); src2 = LD_SP(psrc3); 
src3 = LD_SP(psrc4); psrc1 += 4; psrc2 += 4; psrc3 += 4; psrc4 += 4; ST_SP4_INC(src0, src1, src2, src3, pdst2, 4); } if (n & 2) { *pdst3++ = *psrc1++; *pdst3++ = *psrc1++; *pdst3++ = *psrc2++; *pdst3++ = *psrc2++; *pdst3++ = *psrc3++; *pdst3++ = *psrc3++; *pdst3++ = *psrc4++; *pdst3++ = *psrc4++; } if (n & 1) { *pdst4++ = *psrc1++; *pdst4++ = *psrc2++; *pdst4++ = *psrc3++; *pdst4++ = *psrc4++; } } if (m & 2) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc0 += 2 * lda; pdst1 = pdst0; pdst0 += 16; for (i = (n >> 3); i--;) { LD_SP2_INC(psrc1, 4, src0, src1); LD_SP2_INC(psrc2, 4, src2, src3); ST_SP4(src0, src1, src2, src3, pdst1, 4); pdst1 += 8 * m; } if (n & 4) { src0 = LD_SP(psrc1); src1 = LD_SP(psrc2); psrc1 += 4; psrc2 += 4; ST_SP2_INC(src0, src1, pdst2, 4); } if (n & 2) { *pdst3++ = *psrc1++; *pdst3++ = *psrc1++; *pdst3++ = *psrc2++; *pdst3++ = *psrc2++; } if (n & 1) { *pdst4++ = *psrc1++; *pdst4++ = *psrc2++; } } if (m & 1) { psrc1 = psrc0; psrc0 += lda; pdst1 = pdst0; pdst0 += 8; for (i = (n >> 3); i--;) { LD_SP2_INC(psrc1, 4, src0, src1); ST_SP2(src0, src1, pdst1, 4); pdst1 += 8 * m; } if (n & 4) { src0 = LD_SP(psrc1); psrc1 += 4; ST_SP(src0, pdst2); pdst2 += 4; } if (n & 2) { *pdst3++ = *psrc1++; *pdst3++ = *psrc1++; } if (n & 1) { *pdst4++ = *psrc1++; } } return 0; } OpenBLAS-0.2.20/kernel/mips/sgemv_n_msa.c000066400000000000000000000501141313527062700200370ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
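Operation implemented below: y := alpha * A * x + y for a column-major m x n
matrix A (column stride lda) and strided vectors x and y. A minimal scalar
sketch of the same update (the name sgemv_n_ref is illustrative; the real
CNAME below additionally receives unused dummy1/buffer arguments):

    static void sgemv_n_ref(long m, long n, float alpha, const float *a,
                            long lda, const float *x, long inc_x,
                            float *y, long inc_y)
    {
        long i, j;
        for (j = 0; j < n; j++) {
            float t = alpha * x[j * inc_x];          // scaled x element for column j
            for (i = 0; i < m; i++)
                y[i * inc_y] += t * a[i + j * lda];  // accumulate column j of A
        }
    }

The macros below unroll this update by up to 8 columns and 8 rows and choose
vector loads or element-wise (GP) loads according to inc_x and inc_y.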
*******************************************************************************/ #include "common.h" #include "macros_msa.h" #define SGEMV_N_8x8() \ { \ LD_SP2(pa0 + k, 4, t0, t1); \ LD_SP2(pa1 + k, 4, t2, t3); \ LD_SP2(pa2 + k, 4, t4, t5); \ LD_SP2(pa3 + k, 4, t6, t7); \ LD_SP2(pa4 + k, 4, t8, t9); \ LD_SP2(pa5 + k, 4, t10, t11); \ LD_SP2(pa6 + k, 4, t12, t13); \ LD_SP2(pa7 + k, 4, t14, t15); \ \ y0 += tp0 * t0; \ y1 += tp0 * t1; \ \ y0 += tp1 * t2; \ y1 += tp1 * t3; \ \ y0 += tp2 * t4; \ y1 += tp2 * t5; \ \ y0 += tp3 * t6; \ y1 += tp3 * t7; \ \ y0 += tp4 * t8; \ y1 += tp4 * t9; \ \ y0 += tp5 * t10; \ y1 += tp5 * t11; \ \ y0 += tp6 * t12; \ y1 += tp6 * t13; \ \ y0 += tp7 * t14; \ y1 += tp7 * t15; \ } #define SGEMV_N_4x8() \ { \ t0 = LD_SP(pa0 + k); \ t2 = LD_SP(pa1 + k); \ t4 = LD_SP(pa2 + k); \ t6 = LD_SP(pa3 + k); \ t8 = LD_SP(pa4 + k); \ t10 = LD_SP(pa5 + k); \ t12 = LD_SP(pa6 + k); \ t14 = LD_SP(pa7 + k); \ \ y0 += tp0 * t0; \ y0 += tp1 * t2; \ y0 += tp2 * t4; \ y0 += tp3 * t6; \ y0 += tp4 * t8; \ y0 += tp5 * t10; \ y0 += tp6 * t12; \ y0 += tp7 * t14; \ } #define SGEMV_N_8x4() \ { \ LD_SP2(pa0 + k, 4, t0, t1); \ LD_SP2(pa1 + k, 4, t2, t3); \ LD_SP2(pa2 + k, 4, t4, t5); \ LD_SP2(pa3 + k, 4, t6, t7); \ \ y0 += tp0 * t0; \ y1 += tp0 * t1; \ \ y0 += tp1 * t2; \ y1 += tp1 * t3; \ \ y0 += tp2 * t4; \ y1 += tp2 * t5; \ \ y0 += tp3 * t6; \ y1 += tp3 * t7; \ } #define SGEMV_N_4x4() \ { \ t0 = LD_SP(pa0 + k); \ t2 = LD_SP(pa1 + k); \ t4 = LD_SP(pa2 + k); \ t6 = LD_SP(pa3 + k); \ \ y0 += tp0 * t0; \ y0 += tp1 * t2; \ y0 += tp2 * t4; \ y0 += tp3 * t6; \ } #define SGEMV_N_8x2() \ { \ LD_SP2(pa0 + k, 4, t0, t1); \ LD_SP2(pa1 + k, 4, t2, t3); \ \ y0 += tp0 * t0; \ y1 += tp0 * t1; \ \ y0 += tp1 * t2; \ y1 += tp1 * t3; \ } #define SGEMV_N_4x2() \ { \ t0 = LD_SP(pa0 + k); \ t2 = LD_SP(pa1 + k); \ \ y0 += tp0 * t0; \ y0 += tp1 * t2; \ } #define SLOAD_X8_SCALE_GP() \ temp0 = alpha * x[0 * inc_x]; \ temp1 = alpha * x[1 * inc_x]; \ temp2 = alpha * x[2 * inc_x]; \ temp3 = alpha * x[3 * inc_x]; \ temp4 = alpha * x[4 * inc_x]; \ temp5 = alpha * x[5 * inc_x]; \ temp6 = alpha * x[6 * inc_x]; \ temp7 = alpha * x[7 * inc_x]; \ \ tp0 = COPY_FLOAT_TO_VECTOR(temp0); \ tp1 = COPY_FLOAT_TO_VECTOR(temp1); \ tp2 = COPY_FLOAT_TO_VECTOR(temp2); \ tp3 = COPY_FLOAT_TO_VECTOR(temp3); \ tp4 = COPY_FLOAT_TO_VECTOR(temp4); \ tp5 = COPY_FLOAT_TO_VECTOR(temp5); \ tp6 = COPY_FLOAT_TO_VECTOR(temp6); \ tp7 = COPY_FLOAT_TO_VECTOR(temp7); \ #define SLOAD_X4_SCALE_GP() \ temp0 = alpha * x[0 * inc_x]; \ temp1 = alpha * x[1 * inc_x]; \ temp2 = alpha * x[2 * inc_x]; \ temp3 = alpha * x[3 * inc_x]; \ \ tp0 = COPY_FLOAT_TO_VECTOR(temp0); \ tp1 = COPY_FLOAT_TO_VECTOR(temp1); \ tp2 = COPY_FLOAT_TO_VECTOR(temp2); \ tp3 = COPY_FLOAT_TO_VECTOR(temp3); \ #define SLOAD_X8_SCALE_VECTOR() \ LD_SP2(x, 4, x0, x1); \ \ x0 = x0 * v_alpha; \ x1 = x1 * v_alpha; \ \ SPLATI_W4_SP(x0, tp0, tp1, tp2, tp3); \ SPLATI_W4_SP(x1, tp4, tp5, tp6, tp7); \ #define SLOAD_X4_SCALE_VECTOR() \ x0 = LD_SP(x); \ x0 = x0 * v_alpha; \ SPLATI_W4_SP(x0, tp0, tp1, tp2, tp3); \ #define SLOAD_Y8_GP() \ y0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 0 * inc_y))); \ y0 = (v4f32) __msa_insert_w((v4i32) y0, 1, *((int *)(y + 1 * inc_y))); \ y0 = (v4f32) __msa_insert_w((v4i32) y0, 2, *((int *)(y + 2 * inc_y))); \ y0 = (v4f32) __msa_insert_w((v4i32) y0, 3, *((int *)(y + 3 * inc_y))); \ y1 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 4 * inc_y))); \ y1 = (v4f32) __msa_insert_w((v4i32) y1, 1, *((int *)(y + 5 * inc_y))); \ y1 = (v4f32) __msa_insert_w((v4i32) y1, 2, 
*((int *)(y + 6 * inc_y))); \ y1 = (v4f32) __msa_insert_w((v4i32) y1, 3, *((int *)(y + 7 * inc_y))); \ #define SLOAD_Y4_GP() \ y0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 0 * inc_y))); \ y0 = (v4f32) __msa_insert_w((v4i32) y0, 1, *((int *)(y + 1 * inc_y))); \ y0 = (v4f32) __msa_insert_w((v4i32) y0, 2, *((int *)(y + 2 * inc_y))); \ y0 = (v4f32) __msa_insert_w((v4i32) y0, 3, *((int *)(y + 3 * inc_y))); \ #define SLOAD_Y8_VECTOR() LD_SP2(y, 4, y0, y1); #define SLOAD_Y4_VECTOR() y0 = LD_SP(y); #define SSTORE_Y8_GP() \ *((int *)(y + 0 * inc_y)) = __msa_copy_s_w((v4i32) y0, 0); \ *((int *)(y + 1 * inc_y)) = __msa_copy_s_w((v4i32) y0, 1); \ *((int *)(y + 2 * inc_y)) = __msa_copy_s_w((v4i32) y0, 2); \ *((int *)(y + 3 * inc_y)) = __msa_copy_s_w((v4i32) y0, 3); \ *((int *)(y + 4 * inc_y)) = __msa_copy_s_w((v4i32) y1, 0); \ *((int *)(y + 5 * inc_y)) = __msa_copy_s_w((v4i32) y1, 1); \ *((int *)(y + 6 * inc_y)) = __msa_copy_s_w((v4i32) y1, 2); \ *((int *)(y + 7 * inc_y)) = __msa_copy_s_w((v4i32) y1, 3); \ #define SSTORE_Y4_GP() \ *((int *)(y + 0 * inc_y)) = __msa_copy_s_w((v4i32) y0, 0); \ *((int *)(y + 1 * inc_y)) = __msa_copy_s_w((v4i32) y0, 1); \ *((int *)(y + 2 * inc_y)) = __msa_copy_s_w((v4i32) y0, 2); \ *((int *)(y + 3 * inc_y)) = __msa_copy_s_w((v4i32) y0, 3); \ #define SSTORE_Y8_VECTOR() ST_SP2(y0, y1, y, 4); #define SSTORE_Y4_VECTOR() ST_SP(y0, y); #define SGEMV_N_MSA() \ for (j = (n >> 3); j--;) \ { \ SLOAD_X8_SCALE(); \ \ k = 0; \ y = y_org; \ \ for (i = (m >> 3); i--;) \ { \ SLOAD_Y8(); \ SGEMV_N_8x8(); \ SSTORE_Y8(); \ \ y += 8 * inc_y; \ k += 8; \ } \ \ if (m & 4) \ { \ SLOAD_Y4(); \ SGEMV_N_4x8(); \ SSTORE_Y4(); \ \ y += 4 * inc_y; \ k += 4; \ } \ \ if (m & 3) \ { \ temp0 = alpha * x[0 * inc_x]; \ temp1 = alpha * x[1 * inc_x]; \ temp2 = alpha * x[2 * inc_x]; \ temp3 = alpha * x[3 * inc_x]; \ temp4 = alpha * x[4 * inc_x]; \ temp5 = alpha * x[5 * inc_x]; \ temp6 = alpha * x[6 * inc_x]; \ temp7 = alpha * x[7 * inc_x]; \ \ for (i = (m & 3); i--;) \ { \ temp = y[0]; \ temp += temp0 * pa0[k]; \ temp += temp1 * pa1[k]; \ temp += temp2 * pa2[k]; \ temp += temp3 * pa3[k]; \ temp += temp4 * pa4[k]; \ temp += temp5 * pa5[k]; \ temp += temp6 * pa6[k]; \ temp += temp7 * pa7[k]; \ y[0] = temp; \ \ y += inc_y; \ k++; \ } \ } \ pa0 += 8 * lda; \ pa1 += 8 * lda; \ pa2 += 8 * lda; \ pa3 += 8 * lda; \ pa4 += 8 * lda; \ pa5 += 8 * lda; \ pa6 += 8 * lda; \ pa7 += 8 * lda; \ \ x += 8 * inc_x; \ } \ \ if (n & 4) \ { \ SLOAD_X4_SCALE(); \ \ k = 0; \ y = y_org; \ \ for (i = (m >> 3); i--;) \ { \ SLOAD_Y8(); \ SGEMV_N_8x4(); \ SSTORE_Y8(); \ \ y += 8 * inc_y; \ k += 8; \ } \ \ if (m & 4) \ { \ SLOAD_Y4(); \ SGEMV_N_4x4(); \ SSTORE_Y4(); \ \ y += 4 * inc_y; \ k += 4; \ } \ \ if (m & 3) \ { \ temp0 = alpha * x[0 * inc_x]; \ temp1 = alpha * x[1 * inc_x]; \ temp2 = alpha * x[2 * inc_x]; \ temp3 = alpha * x[3 * inc_x]; \ \ for (i = (m & 3); i--;) \ { \ temp = y[0]; \ temp += temp0 * pa0[k]; \ temp += temp1 * pa1[k]; \ temp += temp2 * pa2[k]; \ temp += temp3 * pa3[k]; \ y[0] = temp; \ \ y += inc_y; \ k++; \ } \ } \ \ pa0 += 4 * lda; \ pa1 += 4 * lda; \ pa2 += 4 * lda; \ pa3 += 4 * lda; \ \ x += 4 * inc_x; \ } \ \ if (n & 2) \ { \ temp0 = alpha * x[0 * inc_x]; \ temp1 = alpha * x[1 * inc_x]; \ \ tp0 = COPY_FLOAT_TO_VECTOR(temp0); \ tp1 = COPY_FLOAT_TO_VECTOR(temp1); \ \ k = 0; \ y = y_org; \ \ for (i = (m >> 3); i--;) \ { \ SLOAD_Y8(); \ SGEMV_N_8x2(); \ SSTORE_Y8(); \ \ y += 8 * inc_y; \ k += 8; \ } \ \ if (m & 4) \ { \ SLOAD_Y4(); \ SGEMV_N_4x2(); \ SSTORE_Y4(); \ \ y += 4 * inc_y; \ k += 4; \ } \ \ if 
(m & 3) \ { \ temp0 = alpha * x[0 * inc_x]; \ temp1 = alpha * x[1 * inc_x]; \ \ for (i = (m & 3); i--;) \ { \ temp = y[0]; \ temp += temp0 * pa0[k]; \ temp += temp1 * pa1[k]; \ y[0] = temp; \ \ y += inc_y; \ k++; \ } \ } \ \ pa0 += 2 * lda; \ pa1 += 2 * lda; \ \ x += 2 * inc_x; \ } \ \ if (n & 1) \ { \ temp = alpha * x[0]; \ \ k = 0; \ y = y_org; \ \ for (i = m; i--;) \ { \ y[0] += temp * pa0[k]; \ \ y += inc_y; \ k++; \ } \ } \ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { BLASLONG i, j, k; FLOAT *y_org = y; FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; v4f32 v_alpha, x0, x1, y0, y1; v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; v4f32 tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; v_alpha = COPY_FLOAT_TO_VECTOR(alpha); pa0 = A; pa1 = A + lda; pa2 = A + 2 * lda; pa3 = A + 3 * lda; pa4 = A + 4 * lda; pa5 = A + 5 * lda; pa6 = A + 6 * lda; pa7 = A + 7 * lda; if ((1 == inc_x) && (1 == inc_y)) { #define SLOAD_X8_SCALE SLOAD_X8_SCALE_VECTOR #define SLOAD_X4_SCALE SLOAD_X4_SCALE_VECTOR #define SLOAD_Y8 SLOAD_Y8_VECTOR #define SLOAD_Y4 SLOAD_Y4_VECTOR #define SSTORE_Y8 SSTORE_Y8_VECTOR #define SSTORE_Y4 SSTORE_Y4_VECTOR SGEMV_N_MSA(); #undef SLOAD_X8_SCALE #undef SLOAD_X4_SCALE #undef SLOAD_Y8 #undef SLOAD_Y4 #undef SSTORE_Y8 #undef SSTORE_Y4 } else if (1 == inc_y) { #define SLOAD_X8_SCALE SLOAD_X8_SCALE_GP #define SLOAD_X4_SCALE SLOAD_X4_SCALE_GP #define SLOAD_Y8 SLOAD_Y8_VECTOR #define SLOAD_Y4 SLOAD_Y4_VECTOR #define SSTORE_Y8 SSTORE_Y8_VECTOR #define SSTORE_Y4 SSTORE_Y4_VECTOR SGEMV_N_MSA(); #undef SLOAD_X8_SCALE #undef SLOAD_X4_SCALE #undef SLOAD_Y8 #undef SLOAD_Y4 #undef SSTORE_Y8 #undef SSTORE_Y4 } else if (1 == inc_x) { #define SLOAD_X8_SCALE SLOAD_X8_SCALE_VECTOR #define SLOAD_X4_SCALE SLOAD_X4_SCALE_VECTOR #define SLOAD_Y8 SLOAD_Y8_GP #define SLOAD_Y4 SLOAD_Y4_GP #define SSTORE_Y8 SSTORE_Y8_GP #define SSTORE_Y4 SSTORE_Y4_GP SGEMV_N_MSA(); #undef SLOAD_X8_SCALE #undef SLOAD_X4_SCALE #undef SLOAD_Y8 #undef SLOAD_Y4 #undef SSTORE_Y8 #undef SSTORE_Y4 } else { #define SLOAD_X8_SCALE SLOAD_X8_SCALE_GP #define SLOAD_X4_SCALE SLOAD_X4_SCALE_GP #define SLOAD_Y8 SLOAD_Y8_GP #define SLOAD_Y4 SLOAD_Y4_GP #define SSTORE_Y8 SSTORE_Y8_GP #define SSTORE_Y4 SSTORE_Y4_GP SGEMV_N_MSA(); #undef SLOAD_X8_SCALE #undef SLOAD_X4_SCALE #undef SLOAD_Y8 #undef SLOAD_Y4 #undef SSTORE_Y8 #undef SSTORE_Y4 } return(0); } OpenBLAS-0.2.20/kernel/mips/sgemv_t_msa.c000066400000000000000000000476041313527062700200570ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
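Operation implemented below: y := alpha * A^T * x + y, i.e. one dot product of
x with each column of the column-major matrix A. A minimal scalar sketch (the
name sgemv_t_ref is illustrative; the real CNAME below additionally receives
unused dummy1/buffer arguments):

    static void sgemv_t_ref(long m, long n, float alpha, const float *a,
                            long lda, const float *x, long inc_x,
                            float *y, long inc_y)
    {
        long i, j;
        for (j = 0; j < n; j++) {            // one dot product per column of A
            float t = 0.0f;
            for (i = 0; i < m; i++)
                t += a[i + j * lda] * x[i * inc_x];
            y[j * inc_y] += alpha * t;
        }
    }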
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" #define SGEMV_T_8x8() \ { \ LD_SP2(pa0 + k, 4, t0, t1); \ LD_SP2(pa1 + k, 4, t2, t3); \ LD_SP2(pa2 + k, 4, t4, t5); \ LD_SP2(pa3 + k, 4, t6, t7); \ LD_SP2(pa4 + k, 4, t8, t9); \ LD_SP2(pa5 + k, 4, t10, t11); \ LD_SP2(pa6 + k, 4, t12, t13); \ LD_SP2(pa7 + k, 4, t14, t15); \ \ tp0 += x0 * t0; \ tp0 += x1 * t1; \ \ tp1 += x0 * t2; \ tp1 += x1 * t3; \ \ tp2 += x0 * t4; \ tp2 += x1 * t5; \ \ tp3 += x0 * t6; \ tp3 += x1 * t7; \ \ tp4 += x0 * t8; \ tp4 += x1 * t9; \ \ tp5 += x0 * t10; \ tp5 += x1 * t11; \ \ tp6 += x0 * t12; \ tp6 += x1 * t13; \ \ tp7 += x0 * t14; \ tp7 += x1 * t15; \ } #define SGEMV_T_8x4() \ { \ t0 = LD_SP(pa0 + k); \ t2 = LD_SP(pa1 + k); \ t4 = LD_SP(pa2 + k); \ t6 = LD_SP(pa3 + k); \ t8 = LD_SP(pa4 + k); \ t10 = LD_SP(pa5 + k); \ t12 = LD_SP(pa6 + k); \ t14 = LD_SP(pa7 + k); \ \ tp0 += x0 * t0; \ tp1 += x0 * t2; \ tp2 += x0 * t4; \ tp3 += x0 * t6; \ tp4 += x0 * t8; \ tp5 += x0 * t10; \ tp6 += x0 * t12; \ tp7 += x0 * t14; \ } #define SGEMV_T_4x8() \ { \ LD_SP2(pa0 + k, 4, t0, t1); \ LD_SP2(pa1 + k, 4, t2, t3); \ LD_SP2(pa2 + k, 4, t4, t5); \ LD_SP2(pa3 + k, 4, t6, t7); \ \ tp0 += x0 * t0; \ tp0 += x1 * t1; \ \ tp1 += x0 * t2; \ tp1 += x1 * t3; \ \ tp2 += x0 * t4; \ tp2 += x1 * t5; \ \ tp3 += x0 * t6; \ tp3 += x1 * t7; \ } #define SGEMV_T_4x4() \ { \ t0 = LD_SP(pa0 + k); \ t2 = LD_SP(pa1 + k); \ t4 = LD_SP(pa2 + k); \ t6 = LD_SP(pa3 + k); \ \ tp0 += x0 * t0; \ tp1 += x0 * t2; \ tp2 += x0 * t4; \ tp3 += x0 * t6; \ } #define SGEMV_T_2x8() \ { \ LD_SP2(pa0 + k, 4, t0, t1); \ LD_SP2(pa1 + k, 4, t2, t3); \ \ tp0 += x0 * t0; \ tp0 += x1 * t1; \ \ tp1 += x0 * t2; \ tp1 += x1 * t3; \ } #define SGEMV_T_2x4() \ { \ t0 = LD_SP(pa0 + k); \ t2 = LD_SP(pa1 + k); \ \ tp0 += x0 * t0; \ tp1 += x0 * t2; \ } #define SLOAD_X8_GP() \ x0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 0 * inc_x))); \ x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *)(x + 1 * inc_x))); \ x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *)(x + 2 * inc_x))); \ x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *)(x + 3 * inc_x))); \ x1 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 4 * inc_x))); \ x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *)(x + 5 * inc_x))); \ x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *)(x + 6 * inc_x))); \ x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *)(x + 7 * inc_x))); \ #define SLOAD_X4_GP() \ x0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 0 * inc_x))); \ x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *)(x + 1 * inc_x))); \ x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *)(x + 2 * inc_x))); \ x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *)(x + 3 * inc_x))); \ #define 
SLOAD_X8_VECTOR() LD_SP2(x, 4, x0, x1); #define SLOAD_X4_VECTOR() x0 = LD_SP(x); #define SGEMV_T_MSA() \ for (j = (n >> 3); j--;) \ { \ tp0 = zero; \ tp1 = zero; \ tp2 = zero; \ tp3 = zero; \ tp4 = zero; \ tp5 = zero; \ tp6 = zero; \ tp7 = zero; \ \ k = 0; \ x = srcx_org; \ \ for (i = (m >> 3); i--;) \ { \ SLOAD_X8(); \ SGEMV_T_8x8(); \ \ x += 8 * inc_x; \ k += 8; \ } \ \ if (m & 4) \ { \ SLOAD_X4(); \ SGEMV_T_8x4(); \ \ x += 4 * inc_x; \ k += 4; \ } \ \ TRANSPOSE4x4_SP_SP(tp0, tp1, tp2, tp3, \ tp0, tp1, tp2, tp3); \ TRANSPOSE4x4_SP_SP(tp4, tp5, tp6, tp7, \ tp4, tp5, tp6, tp7); \ tp0 += tp1; \ tp0 += tp2; \ tp0 += tp3; \ tp4 += tp5; \ tp4 += tp6; \ tp4 += tp7; \ \ temp0 = tp0[0]; \ temp1 = tp0[1]; \ temp2 = tp0[2]; \ temp3 = tp0[3]; \ temp4 = tp4[0]; \ temp5 = tp4[1]; \ temp6 = tp4[2]; \ temp7 = tp4[3]; \ \ for (i = (m & 3); i--;) \ { \ temp0 += pa0[k] * x[0]; \ temp1 += pa1[k] * x[0]; \ temp2 += pa2[k] * x[0]; \ temp3 += pa3[k] * x[0]; \ temp4 += pa4[k] * x[0]; \ temp5 += pa5[k] * x[0]; \ temp6 += pa6[k] * x[0]; \ temp7 += pa7[k] * x[0]; \ \ x += inc_x; \ k++; \ } \ \ res0 = y[0 * inc_y]; \ res1 = y[1 * inc_y]; \ res2 = y[2 * inc_y]; \ res3 = y[3 * inc_y]; \ res4 = y[4 * inc_y]; \ res5 = y[5 * inc_y]; \ res6 = y[6 * inc_y]; \ res7 = y[7 * inc_y]; \ \ res0 += alpha * temp0; \ res1 += alpha * temp1; \ res2 += alpha * temp2; \ res3 += alpha * temp3; \ res4 += alpha * temp4; \ res5 += alpha * temp5; \ res6 += alpha * temp6; \ res7 += alpha * temp7; \ \ y[0 * inc_y] = res0; \ y[1 * inc_y] = res1; \ y[2 * inc_y] = res2; \ y[3 * inc_y] = res3; \ y[4 * inc_y] = res4; \ y[5 * inc_y] = res5; \ y[6 * inc_y] = res6; \ y[7 * inc_y] = res7; \ \ y += 8 * inc_y; \ \ pa0 += 8 * lda; \ pa1 += 8 * lda; \ pa2 += 8 * lda; \ pa3 += 8 * lda; \ pa4 += 8 * lda; \ pa5 += 8 * lda; \ pa6 += 8 * lda; \ pa7 += 8 * lda; \ } \ \ if (n & 4) \ { \ tp0 = zero; \ tp1 = zero; \ tp2 = zero; \ tp3 = zero; \ \ k = 0; \ x = srcx_org; \ \ for (i = (m >> 3); i--;) \ { \ SLOAD_X8(); \ SGEMV_T_4x8(); \ \ x += 8 * inc_x; \ k += 8; \ } \ \ if (m & 4) \ { \ SLOAD_X4(); \ SGEMV_T_4x4(); \ \ x += 4 * inc_x; \ k += 4; \ } \ \ TRANSPOSE4x4_SP_SP(tp0, tp1, tp2, tp3, \ tp0, tp1, tp2, tp3); \ tp0 += tp1; \ tp0 += tp2; \ tp0 += tp3; \ \ temp0 = tp0[0]; \ temp1 = tp0[1]; \ temp2 = tp0[2]; \ temp3 = tp0[3]; \ \ for (i = (m & 3); i--;) \ { \ temp0 += pa0[k] * x[0]; \ temp1 += pa1[k] * x[0]; \ temp2 += pa2[k] * x[0]; \ temp3 += pa3[k] * x[0]; \ \ x += inc_x; \ k++; \ } \ \ res0 = y[0 * inc_y]; \ res1 = y[1 * inc_y]; \ res2 = y[2 * inc_y]; \ res3 = y[3 * inc_y]; \ \ res0 += alpha * temp0; \ res1 += alpha * temp1; \ res2 += alpha * temp2; \ res3 += alpha * temp3; \ \ y[0 * inc_y] = res0; \ y[1 * inc_y] = res1; \ y[2 * inc_y] = res2; \ y[3 * inc_y] = res3; \ \ y += 4 * inc_y; \ \ pa0 += 4 * lda; \ pa1 += 4 * lda; \ pa2 += 4 * lda; \ pa3 += 4 * lda; \ } \ \ if (n & 2) \ { \ tp0 = zero; \ tp1 = zero; \ \ k = 0; \ x = srcx_org; \ \ for (i = (m >> 3); i--;) \ { \ SLOAD_X8(); \ SGEMV_T_2x8(); \ \ x += 8 * inc_x; \ k += 8; \ } \ \ if (m & 4) \ { \ SLOAD_X4(); \ SGEMV_T_2x4(); \ \ x += 4 * inc_x; \ k += 4; \ } \ \ ILVRL_W2_SP(tp1, tp0, tp2, tp3); \ \ tp2 += tp3; \ \ temp0 = tp2[0] + tp2[2]; \ temp1 = tp2[1] + tp2[3]; \ \ for (i = (m & 3); i--;) \ { \ temp0 += pa0[k] * x[0]; \ temp1 += pa1[k] * x[0]; \ \ x += inc_x; \ k++; \ } \ \ res0 = y[0 * inc_y]; \ res1 = y[1 * inc_y]; \ \ res0 += alpha * temp0; \ res1 += alpha * temp1; \ \ y[0 * inc_y] = res0; \ y[1 * inc_y] = res1; \ \ y += 2 * inc_y; \ \ pa0 += 2 * lda; \ pa1 += 2 * lda; \ } \ \ if (n & 1) \ { \ 
temp0 = 0.0; \ \ k = 0; \ x = srcx_org; \ \ for (i = m; i--;) \ { \ temp0 += pa0[k] * x[0]; \ \ x += inc_x; \ k++; \ } \ \ y[0] += alpha * temp0; \ y += inc_y; \ pa0 += lda; \ } int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { BLASLONG i, j, k; FLOAT *srcx_org = x; FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; FLOAT temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; FLOAT res0, res1, res2, res3, res4, res5, res6, res7; v4f32 x0, x1; v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; v4f32 tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; v4f32 zero = {0}; pa0 = A + 0 * lda; pa1 = A + 1 * lda; pa2 = A + 2 * lda; pa3 = A + 3 * lda; pa4 = A + 4 * lda; pa5 = A + 5 * lda; pa6 = A + 6 * lda; pa7 = A + 7 * lda; if (1 == inc_x) { #define SLOAD_X8 SLOAD_X8_VECTOR #define SLOAD_X4 SLOAD_X4_VECTOR SGEMV_T_MSA(); #undef SLOAD_X8 #undef SLOAD_X4 } else { #define SLOAD_X8 SLOAD_X8_GP #define SLOAD_X4 SLOAD_X4_GP SGEMV_T_MSA(); #undef SLOAD_X8 #undef SLOAD_X4 } return(0); } OpenBLAS-0.2.20/kernel/mips/srot_msa.c000066400000000000000000001077131313527062700174000ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) { BLASLONG i, j; FLOAT *px, *py; FLOAT tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; FLOAT fx0, fx1, fx2, fx3, fy0, fy1, fy2, fy3; v4f32 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7; v4f32 out0, out1, out2, out3, out4, out5, out6, out7; v4f32 out8, out9, out10, out11, out12, out13, out14, out15, c0, s0; if (n <= 0) return (0); px = x; py = y; if ((1 == inc_x) && (1 == inc_y)) { if ((0 == c) && (0 == s)) { v4f32 zero = __msa_cast_to_vector_float(0); zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); /* process 4 floats */ for (j = (n >> 2); j--;) { ST_SP(zero, px); ST_SP(zero, py); px += 4; py += 4; } if (n & 2) { px[0] = 0; py[0] = 0; px[1] = 0; py[1] = 0; px += 2; py += 2; } if (n & 1) { px[0] = 0; py[0] = 0; } } else if ((1 == c) && (1 == s)) { if (n >> 5) { BLASLONG pref_offsetx, pref_offsety; pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); if (pref_offsetx > 0) { pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; pref_offsetx = pref_offsetx / sizeof(FLOAT); } pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); if (pref_offsety > 0) { pref_offsety = L1_DATA_LINESIZE - pref_offsety; pref_offsety = pref_offsety / sizeof(FLOAT); } x0 = LD_SP(px); px += 4; x1 = LD_SP(px); px += 4; x2 = LD_SP(px); px += 4; x3 = LD_SP(px); px += 4; y0 = LD_SP(py); py += 4; y1 = LD_SP(py); py += 4; y2 = LD_SP(py); py += 4; y3 = LD_SP(py); py += 4; for (j = (n >> 5) - 1; j--;) { PREFETCH(px + pref_offsetx + 32); PREFETCH(px + pref_offsetx + 40); PREFETCH(px + pref_offsetx + 48); PREFETCH(px + pref_offsetx + 56); PREFETCH(py + pref_offsety + 32); PREFETCH(py + pref_offsety + 40); PREFETCH(py + pref_offsety + 48); PREFETCH(py + pref_offsety + 56); out0 = x0 + y0; x4 = LD_SP(px); px += 4; out1 = y0 - x0; x5 = LD_SP(px); px += 4; out2 = x1 + y1; x6 = LD_SP(px); px += 4; out3 = y1 - x1; x7 = LD_SP(px); px += 4; out4 = x2 + y2; y4 = LD_SP(py); py += 4; out5 = y2 - x2; y5 = LD_SP(py); py += 4; out6 = x3 + y3; y6 = LD_SP(py); py += 4; out7 = y3 - x3; y7 = LD_SP(py); py += 4; ST_SP(out0, x); x += 4; out8 = x4 + y4; ST_SP(out1, y); y += 4; out9 = y4 - x4; ST_SP(out2, x); x += 4; out10 = x5 + y5; ST_SP(out3, y); y += 4; out11 = y5 - x5; ST_SP(out4, x); x += 4; out12 = x6 + y6; ST_SP(out5, y); y += 4; out13 = y6 - x6; ST_SP(out6, x); x += 4; out14 = x7 + y7; ST_SP(out7, y); y += 4; out15 = y7 - x7; x0 = LD_SP(px); px += 4; ST_SP(out8, x); x += 4; x1 = LD_SP(px); px += 4; ST_SP(out10, x); x += 4; x2 = LD_SP(px); px += 4; ST_SP(out12, x); x += 4; x3 = LD_SP(px); px += 4; ST_SP(out14, x); x += 4; y0 = LD_SP(py); py += 4; ST_SP(out9, y); y += 4; y1 = LD_SP(py); py += 4; ST_SP(out11, y); y += 4; y2 = LD_SP(py); py += 4; ST_SP(out13, y); y += 4; y3 = LD_SP(py); py += 4; ST_SP(out15, y); y += 4; } x4 = LD_SP(px); px += 4; x5 = LD_SP(px); px += 4; x6 = LD_SP(px); px += 4; x7 = LD_SP(px); px += 4; y4 = LD_SP(py); py += 4; y5 = LD_SP(py); py += 4; y6 = LD_SP(py); py += 4; y7 = LD_SP(py); py += 4; out0 = x0 + y0; out1 = y0 - x0; out2 = x1 + y1; out3 = y1 - x1; out4 = x2 + y2; out5 = y2 - x2; out6 = x3 + y3; out7 = y3 - x3; out8 = x4 + y4; out9 = y4 - x4; out10 = x5 + y5; out11 = y5 - x5; out12 = x6 + y6; out13 = y6 - x6; out14 = x7 + y7; out15 = 
y7 - x7; ST_SP8_INC(out0, out2, out4, out6, out8, out10, out12, out14, x, 4); ST_SP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, 4); } if (n & 16) { LD_SP4_INC(px, 4, x0, x1, x2, x3); LD_SP4_INC(py, 4, y0, y1, y2, y3); out0 = x0 + y0; out1 = y0 - x0; out2 = x1 + y1; out3 = y1 - x1; out4 = x2 + y2; out5 = y2 - x2; out6 = x3 + y3; out7 = y3 - x3; ST_SP4_INC(out0, out2, out4, out6, x, 4); ST_SP4_INC(out1, out3, out5, out7, y, 4); } if (n & 8) { LD_SP2_INC(px, 4, x0, x1); LD_SP2_INC(py, 4, y0, y1); out0 = x0 + y0; out1 = y0 - x0; out2 = x1 + y1; out3 = y1 - x1; ST_SP2_INC(out0, out2, x, 4); ST_SP2_INC(out1, out3, y, 4); } if (n & 4) { x0 = LD_SP(px); y0 = LD_SP(py); px += 4; py += 4; out0 = x0 + y0; out1 = y0 - x0; ST_SP(out0, x); ST_SP(out1, y); x += 4; y += 4; } if (n & 2) { LD_GP2_INC(px, 1, fx0, fx1); LD_GP2_INC(py, 1, fy0, fy1); tp0 = fx0 + fy0; tp1 = fy0 - fx0; tp2 = fx1 + fy1; tp3 = fy1 - fx1; ST_GP2_INC(tp0, tp2, x, 1); ST_GP2_INC(tp1, tp3, y, 1); } if (n & 1) { fx0 = *px; fy0 = *py; tp0 = fx0 + fy0; tp1 = fy0 - fx0; *x = tp0; *y = tp1; } } else if (0 == s) { c0 = COPY_FLOAT_TO_VECTOR(c); if (n >> 5) { BLASLONG pref_offsetx, pref_offsety; pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); if (pref_offsetx > 0) { pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; pref_offsetx = pref_offsetx / sizeof(FLOAT); } pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); if (pref_offsety > 0) { pref_offsety = L1_DATA_LINESIZE - pref_offsety; pref_offsety = pref_offsety / sizeof(FLOAT); } LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7); for (j = (n >> 5) - 1; j--;) { PREFETCH(px + pref_offsetx + 32); PREFETCH(px + pref_offsetx + 40); PREFETCH(px + pref_offsetx + 48); PREFETCH(px + pref_offsetx + 56); PREFETCH(py + pref_offsety + 32); PREFETCH(py + pref_offsety + 40); PREFETCH(py + pref_offsety + 48); PREFETCH(py + pref_offsety + 56); y0 = LD_SP(py); py += 4; x0 *= c0; y1 = LD_SP(py); py += 4; x1 *= c0; y2 = LD_SP(py); py += 4; x2 *= c0; y3 = LD_SP(py); py += 4; x3 *= c0; y4 = LD_SP(py); py += 4; x4 *= c0; y5 = LD_SP(py); py += 4; x5 *= c0; y6 = LD_SP(py); py += 4; x6 *= c0; y7 = LD_SP(py); py += 4; x7 *= c0; ST_SP(x0, x); x += 4; y0 *= c0; ST_SP(x1, x); x += 4; y1 *= c0; ST_SP(x2, x); x += 4; y2 *= c0; ST_SP(x3, x); x += 4; y3 *= c0; ST_SP(x4, x); x += 4; y4 *= c0; ST_SP(x5, x); x += 4; y5 *= c0; ST_SP(x6, x); x += 4; y6 *= c0; ST_SP(x7, x); x += 4; y7 *= c0; x0 = LD_SP(px); px += 4; ST_SP(y0, y); y += 4; x1 = LD_SP(px); px += 4; ST_SP(y1, y); y += 4; x2 = LD_SP(px); px += 4; ST_SP(y2, y); y += 4; x3 = LD_SP(px); px += 4; ST_SP(y3, y); y += 4; x4 = LD_SP(px); px += 4; ST_SP(y4, y); y += 4; x5 = LD_SP(px); px += 4; ST_SP(y5, y); y += 4; x6 = LD_SP(px); px += 4; ST_SP(y6, y); y += 4; x7 = LD_SP(px); px += 4; ST_SP(y7, y); y += 4; } LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7); x0 *= c0; y0 *= c0; x1 *= c0; y1 *= c0; x2 *= c0; y2 *= c0; x3 *= c0; y3 *= c0; x4 *= c0; y4 *= c0; x5 *= c0; y5 *= c0; x6 *= c0; y6 *= c0; x7 *= c0; y7 *= c0; ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4); ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4); } if (n & 16) { LD_SP4_INC(px, 4, x0, x1, x2, x3); LD_SP4_INC(py, 4, y0, y1, y2, y3); x0 *= c0; y0 *= c0; x1 *= c0; y1 *= c0; x2 *= c0; y2 *= c0; x3 *= c0; y3 *= c0; ST_SP4_INC(x0, x1, x2, x3, x, 4); ST_SP4_INC(y0, y1, y2, y3, y, 4); } if (n & 8) { LD_SP2_INC(px, 4, x0, x1); LD_SP2_INC(py, 4, y0, y1); x0 *= c0; y0 *= c0; x1 *= c0; y1 *= c0; ST_SP2_INC(x0, x1, x, 4); ST_SP2_INC(y0, y1, y, 4); } if (n & 4) { x0 = LD_SP(px); y0 = LD_SP(py); px += 4; py 
+= 4; x0 *= c0; y0 *= c0; ST_SP(x0, x); ST_SP(y0, y); x += 4; y += 4; } if (n & 2) { LD_GP2_INC(px, 1, fx0, fx1); LD_GP2_INC(py, 1, fy0, fy1); tp0 = (c * fx0); tp1 = (c * fy0); tp2 = (c * fx1); tp3 = (c * fy1); ST_GP2_INC(tp0, tp2, x, 1); ST_GP2_INC(tp1, tp3, y, 1); } if (n & 1) { fx0 = *px; fy0 = *py; tp0 = (c * fx0); tp1 = (c * fy0); *x = tp0; *y = tp1; } } else if (0 == c) { s0 = COPY_FLOAT_TO_VECTOR(s); /* process 16 floats */ if (n >> 5) { BLASLONG pref_offsetx, pref_offsety; pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); if (pref_offsetx > 0) { pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; pref_offsetx = pref_offsetx / sizeof(FLOAT); } pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); if (pref_offsety > 0) { pref_offsety = L1_DATA_LINESIZE - pref_offsety; pref_offsety = pref_offsety / sizeof(FLOAT); } LD_SP4_INC(px, 4, x0, x1, x2, x3); LD_SP4_INC(py, 4, y0, y1, y2, y3); for (j = (n >> 5) - 1; j--;) { PREFETCH(px + pref_offsetx + 32); PREFETCH(px + pref_offsetx + 40); PREFETCH(px + pref_offsetx + 48); PREFETCH(px + pref_offsetx + 56); PREFETCH(py + pref_offsety + 32); PREFETCH(py + pref_offsety + 40); PREFETCH(py + pref_offsety + 48); PREFETCH(py + pref_offsety + 56); x4 = LD_SP(px); px += 4; out0 = s0 * y0; x5 = LD_SP(px); px += 4; out2 = s0 * y1; x6 = LD_SP(px); px += 4; out4 = s0 * y2; x7 = LD_SP(px); px += 4; out6 = s0 * y3; y4 = LD_SP(py); py += 4; out1 = -(s0 * x0); y5 = LD_SP(py); py += 4; out3 = -(s0 * x1); y6 = LD_SP(py); py += 4; out5 = -(s0 * x2); y7 = LD_SP(py); py += 4; out7 = -(s0 * x3); ST_SP(out0, x); x += 4; out0 = s0 * y4; ST_SP(out2, x); x += 4; out2 = s0 * y5; ST_SP(out4, x); x += 4; out4 = s0 * y6; ST_SP(out6, x); x += 4; out6 = s0 * y7; ST_SP(out1, y); y += 4; out1 = -(s0 * x4); ST_SP(out3, y); y += 4; out3 = -(s0 * x5); ST_SP(out5, y); y += 4; out5 = -(s0 * x6); ST_SP(out7, y); y += 4; out7 = -(s0 * x7); x0 = LD_SP(px); px += 4; ST_SP(out0, x); x += 4; x1 = LD_SP(px); px += 4; ST_SP(out2, x); x += 4; x2 = LD_SP(px); px += 4; ST_SP(out4, x); x += 4; x3 = LD_SP(px); px += 4; ST_SP(out6, x); x += 4; y0 = LD_SP(py); py += 4; ST_SP(out1, y); y += 4; y1 = LD_SP(py); py += 4; ST_SP(out3, y); y += 4; y2 = LD_SP(py); py += 4; ST_SP(out5, y); y += 4; y3 = LD_SP(py); py += 4; ST_SP(out7, y); y += 4; } out0 = s0 * y0; out2 = s0 * y1; out4 = s0 * y2; out6 = s0 * y3; out1 = -(s0 * x0); out3 = -(s0 * x1); out5 = -(s0 * x2); out7 = -(s0 * x3); ST_SP4_INC(out0, out2, out4, out6, x, 4); ST_SP4_INC(out1, out3, out5, out7, y, 4); LD_SP4_INC(px, 4, x4, x5, x6, x7); LD_SP4_INC(py, 4, y4, y5, y6, y7); out0 = s0 * y4; out2 = s0 * y5; out4 = s0 * y6; out6 = s0 * y7; out1 = -(s0 * x4); out3 = -(s0 * x5); out5 = -(s0 * x6); out7 = -(s0 * x7); ST_SP4_INC(out0, out2, out4, out6, x, 4); ST_SP4_INC(out1, out3, out5, out7, y, 4); } if (n & 16) { LD_SP4_INC(px, 4, x0, x1, x2, x3); LD_SP4_INC(py, 4, y0, y1, y2, y3); out0 = s0 * y0; out1 = - (s0 * x0); out2 = s0 * y1; out3 = - (s0 * x1); out4 = s0 * y2; out5 = - (s0 * x2); out6 = s0 * y3; out7 = - (s0 * x3); ST_SP4_INC(out0, out2, out4, out6, x, 4); ST_SP4_INC(out1, out3, out5, out7, y, 4); } if (n & 8) { LD_SP2_INC(px, 4, x0, x1); LD_SP2_INC(py, 4, y0, y1); out0 = s0 * y0; out1 = - (s0 * x0); out2 = s0 * y1; out3 = - (s0 * x1); ST_SP2_INC(out0, out2, x, 4); ST_SP2_INC(out1, out3, y, 4); } if (n & 4) { x0 = LD_SP(px); px += 4; y0 = LD_SP(py); py += 4; out0 = s0 * y0; out1 = - (s0 * x0); ST_SP(out0, x); x += 4; ST_SP(out1, y); y += 4; } if (n & 2) { LD_GP2_INC(px, 1, fx0, fx1); LD_GP2_INC(py, 1, fy0, fy1); tp0 = s * fy0; tp1 = - (s * 
fx0); tp2 = s * fy1; tp3 = - (s * fx1); ST_GP2_INC(tp0, tp2, x, 1); ST_GP2_INC(tp1, tp3, y, 1); } if (n & 1) { fx0 = *px; fy0 = *py; tp0 = s * fy0; tp1 = - (s * fx0); *x = tp0; *y = tp1; } } else { c0 = COPY_FLOAT_TO_VECTOR(c); s0 = COPY_FLOAT_TO_VECTOR(s); /* process 16 floats */ if (n >> 5) { BLASLONG pref_offsetx, pref_offsety; pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); if (pref_offsetx > 0) { pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; pref_offsetx = pref_offsetx / sizeof(FLOAT); } pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); if (pref_offsety > 0) { pref_offsety = L1_DATA_LINESIZE - pref_offsety; pref_offsety = pref_offsety / sizeof(FLOAT); } LD_SP4_INC(px, 4, x0, x1, x2, x3); LD_SP4_INC(py, 4, y0, y1, y2, y3); for (j = (n >> 5) - 1; j--;) { PREFETCH(px + pref_offsetx + 32); PREFETCH(px + pref_offsetx + 40); PREFETCH(px + pref_offsetx + 48); PREFETCH(px + pref_offsetx + 56); PREFETCH(py + pref_offsety + 32); PREFETCH(py + pref_offsety + 40); PREFETCH(py + pref_offsety + 48); PREFETCH(py + pref_offsety + 56); x4 = LD_SP(px); px += 4; out0 = c0 * x0; x5 = LD_SP(px); px += 4; out2 = c0 * x1; x6 = LD_SP(px); px += 4; out4 = c0 * x2; x7 = LD_SP(px); px += 4; out6 = c0 * x3; y4 = LD_SP(py); py += 4; out1 = c0 * y0; y5 = LD_SP(py); py += 4; out3 = c0 * y1; y6 = LD_SP(py); py += 4; out5 = c0 * y2; y7 = LD_SP(py); py += 4; out7 = c0 * y3; out0 += s0 * y0; out2 += s0 * y1; out4 += s0 * y2; out6 += s0 * y3; out1 -= s0 * x0; out3 -= s0 * x1; out5 -= s0 * x2; out7 -= s0 * x3; ST_SP(out0, x); x += 4; out0 = c0 * x4; ST_SP(out2, x); x += 4; out2 = c0 * x5; ST_SP(out4, x); x += 4; out4 = c0 * x6; ST_SP(out6, x); x += 4; out6 = c0 * x7; ST_SP(out1, y); y += 4; out1 = c0 * y4; ST_SP(out3, y); y += 4; out3 = c0 * y5; ST_SP(out5, y); y += 4; out5 = c0 * y6; ST_SP(out7, y); y += 4; out7 = c0 * y7; x0 = LD_SP(px); px += 4; out0 += s0 * y4; x1 = LD_SP(px); px += 4; out2 += s0 * y5; x2 = LD_SP(px); px += 4; out4 += s0 * y6; x3 = LD_SP(px); px += 4; out6 += s0 * y7; y0 = LD_SP(py); py += 4; out1 -= s0 * x4; y1 = LD_SP(py); py += 4; out3 -= s0 * x5; y2 = LD_SP(py); py += 4; out5 -= s0 * x6; y3 = LD_SP(py); py += 4; out7 -= s0 * x7; ST_SP4_INC(out0, out2, out4, out6, x, 4); ST_SP4_INC(out1, out3, out5, out7, y, 4); } out0 = c0 * x0; out2 = c0 * x1; out4 = c0 * x2; out6 = c0 * x3; out1 = c0 * y0; out3 = c0 * y1; out5 = c0 * y2; out7 = c0 * y3; out0 += s0 * y0; out2 += s0 * y1; out4 += s0 * y2; out6 += s0 * y3; out1 -= s0 * x0; out3 -= s0 * x1; out5 -= s0 * x2; out7 -= s0 * x3; ST_SP4_INC(out0, out2, out4, out6, x, 4); ST_SP4_INC(out1, out3, out5, out7, y, 4); LD_SP4_INC(px, 4, x4, x5, x6, x7); LD_SP4_INC(py, 4, y4, y5, y6, y7); out0 = c0 * x4; out2 = c0 * x5; out4 = c0 * x6; out6 = c0 * x7; out1 = c0 * y4; out3 = c0 * y5; out5 = c0 * y6; out7 = c0 * y7; out0 += s0 * y4; out2 += s0 * y5; out4 += s0 * y6; out6 += s0 * y7; out1 -= s0 * x4; out3 -= s0 * x5; out5 -= s0 * x6; out7 -= s0 * x7; ST_SP4_INC(out0, out2, out4, out6, x, 4); ST_SP4_INC(out1, out3, out5, out7, y, 4); } if (n & 16) { LD_SP4_INC(px, 4, x0, x1, x2, x3); LD_SP4_INC(py, 4, y0, y1, y2, y3); out0 = (c0 * x0) + (s0 * y0); out1 = (c0 * y0) - (s0 * x0); out2 = (c0 * x1) + (s0 * y1); out3 = (c0 * y1) - (s0 * x1); out4 = (c0 * x2) + (s0 * y2); out5 = (c0 * y2) - (s0 * x2); out6 = (c0 * x3) + (s0 * y3); out7 = (c0 * y3) - (s0 * x3); ST_SP4_INC(out0, out2, out4, out6, x, 4); ST_SP4_INC(out1, out3, out5, out7, y, 4); } if (n & 8) { LD_SP2_INC(px, 4, x0, x1); LD_SP2_INC(py, 4, y0, y1); out0 = (c0 * x0) + (s0 * y0); out1 = (c0 * y0) - 
(s0 * x0); out2 = (c0 * x1) + (s0 * y1); out3 = (c0 * y1) - (s0 * x1); ST_SP2_INC(out0, out2, x, 4); ST_SP2_INC(out1, out3, y, 4); } if (n & 4) { x0 = LD_SP(px); y0 = LD_SP(py); px += 4; py += 4; out0 = (c0 * x0) + (s0 * y0); out1 = (c0 * y0) - (s0 * x0); ST_SP(out0, x); ST_SP(out1, y); x += 4; y += 4; } if (n & 2) { LD_GP2_INC(px, 1, fx0, fx1); LD_GP2_INC(py, 1, fy0, fy1); tp0 = (c * fx0) + (s * fy0); tp1 = (c * fy0) - (s * fx0); tp2 = (c * fx1) + (s * fy1); tp3 = (c * fy1) - (s * fx1); ST_GP2_INC(tp0, tp2, x, 1); ST_GP2_INC(tp1, tp3, y, 1); } if (n & 1) { fx0 = *px; fy0 = *py; tp0 = (c * fx0) + (s * fy0); tp1 = (c * fy0) - (s * fx0); *x = tp0; *y = tp1; } } } else { if ((0 == c) && (0 == s)) { for (i = n; i--;) { *x = 0; *y = 0; x += inc_x; y += inc_y; } } else if ((1 == c) && (1 == s)) { if (n >> 2) { fx0 = *px; px += inc_x; fx1 = *px; px += inc_x; fx2 = *px; px += inc_x; fx3 = *px; px += inc_x; fy0 = *py; py += inc_y; fy1 = *py; py += inc_y; fy2 = *py; py += inc_y; fy3 = *py; py += inc_y; for (i = (n >> 2) -1; i--;) { tp0 = fx0 + fy0; tp1 = fy0 - fx0; tp2 = fx1 + fy1; tp3 = fy1 - fx1; tp4 = fx2 + fy2; tp5 = fy2 - fx2; tp6 = fx3 + fy3; tp7 = fy3 - fx3; fx0 = *px; px += inc_x; *x = tp0; x += inc_x; fx1 = *px; px += inc_x; *x = tp2; x += inc_x; fx2 = *px; px += inc_x; *x = tp4; x += inc_x; fx3 = *px; px += inc_x; *x = tp6; x += inc_x; fy0 = *py; py += inc_y; *y = tp1; y += inc_y; fy1 = *py; py += inc_y; *y = tp3; y += inc_y; fy2 = *py; py += inc_y; *y = tp5; y += inc_y; fy3 = *py; py += inc_y; *y = tp7; y += inc_y; } tp0 = fx0 + fy0; tp1 = fy0 - fx0; tp2 = fx1 + fy1; tp3 = fy1 - fx1; tp4 = fx2 + fy2; tp5 = fy2 - fx2; tp6 = fx3 + fy3; tp7 = fy3 - fx3; *x = tp0; x += inc_x; *x = tp2; x += inc_x; *x = tp4; x += inc_x; *x = tp6; x += inc_x; *y = tp1; y += inc_y; *y = tp3; y += inc_y; *y = tp5; y += inc_y; *y = tp7; y += inc_y; } if (n & 2) { LD_GP2_INC(px, inc_x, fx0, fx1); LD_GP2_INC(py, inc_y, fy0, fy1); tp0 = fx0 + fy0; tp1 = fy0 - fx0; tp2 = fx1 + fy1; tp3 = fy1 - fx1; ST_GP2_INC(tp0, tp2, x, inc_x); ST_GP2_INC(tp1, tp3, y, inc_y); } if (n & 1) { fx0 = *px; fy0 = *py; tp0 = fx0 + fy0; tp1 = fy0 - fx0; *x = tp0; *y = tp1; } } else if (0 == s) { if (n >> 2) { fx0 = *px; px += inc_x; fx1 = *px; px += inc_x; fx2 = *px; px += inc_x; fx3 = *px; px += inc_x; fy0 = *py; py += inc_y; fy1 = *py; py += inc_y; fy2 = *py; py += inc_y; fy3 = *py; py += inc_y; for (i = (n >> 2) - 1; i--;) { tp0 = c * fx0; tp1 = c * fy0; tp2 = c * fx1; tp3 = c * fy1; tp4 = c * fx2; tp5 = c * fy2; tp6 = c * fx3; tp7 = c * fy3; fx0 = *px; px += inc_x; *x = tp0; x += inc_x; fx1 = *px; px += inc_x; *x = tp2; x += inc_x; fx2 = *px; px += inc_x; *x = tp4; x += inc_x; fx3 = *px; px += inc_x; *x = tp6; x += inc_x; fy0 = *py; py += inc_y; *y = tp1; y += inc_y; fy1 = *py; py += inc_y; *y = tp3; y += inc_y; fy2 = *py; py += inc_y; *y = tp5; y += inc_y; fy3 = *py; py += inc_y; *y = tp7; y += inc_y; } tp0 = c * fx0; tp1 = c * fy0; tp2 = c * fx1; tp3 = c * fy1; tp4 = c * fx2; tp5 = c * fy2; tp6 = c * fx3; tp7 = c * fy3; *x = tp0; x += inc_x; *x = tp2; x += inc_x; *x = tp4; x += inc_x; *x = tp6; x += inc_x; *y = tp1; y += inc_y; *y = tp3; y += inc_y; *y = tp5; y += inc_y; *y = tp7; y += inc_y; } if (n & 2) { LD_GP2_INC(px, inc_x, fx0, fx1); LD_GP2_INC(py, inc_y, fy0, fy1); tp0 = c * fx0; tp1 = c * fy0; tp2 = c * fx1; tp3 = c * fy1; ST_GP2_INC(tp0, tp2, x, inc_x); ST_GP2_INC(tp1, tp3, y, inc_y); } if (n & 1) { fx0 = *px; fy0 = *py; tp0 = c * fx0; tp1 = c * fy0; *x = tp0; *y = tp1; } } else { if (n >> 2) { fx0 = *px; px += inc_x; fx1 = 
*px; px += inc_x; fx2 = *px; px += inc_x; fx3 = *px; px += inc_x; fy0 = *py; py += inc_y; fy1 = *py; py += inc_y; fy2 = *py; py += inc_y; fy3 = *py; py += inc_y; for (i = (n >> 2) - 1; i--;) { tp0 = c * fx0 + s * fy0; tp1 = c * fy0 - s * fx0; tp2 = c * fx1 + s * fy1; tp3 = c * fy1 - s * fx1; tp4 = c * fx2 + s * fy2; tp5 = c * fy2 - s * fx2; tp6 = c * fx3 + s * fy3; tp7 = c * fy3 - s * fx3; fx0 = *px; px += inc_x; *x = tp0; x += inc_x; fx1 = *px; px += inc_x; *x = tp2; x += inc_x; fx2 = *px; px += inc_x; *x = tp4; x += inc_x; fx3 = *px; px += inc_x; *x = tp6; x += inc_x; fy0 = *py; py += inc_y; *y = tp1; y += inc_y; fy1 = *py; py += inc_y; *y = tp3; y += inc_y; fy2 = *py; py += inc_y; *y = tp5; y += inc_y; fy3 = *py; py += inc_y; *y = tp7; y += inc_y; } tp0 = c * fx0 + s * fy0; tp1 = c * fy0 - s * fx0; tp2 = c * fx1 + s * fy1; tp3 = c * fy1 - s * fx1; tp4 = c * fx2 + s * fy2; tp5 = c * fy2 - s * fx2; tp6 = c * fx3 + s * fy3; tp7 = c * fy3 - s * fx3; *x = tp0; x += inc_x; *x = tp2; x += inc_x; *x = tp4; x += inc_x; *x = tp6; x += inc_x; *y = tp1; y += inc_y; *y = tp3; y += inc_y; *y = tp5; y += inc_y; *y = tp7; y += inc_y; } if (n & 2) { LD_GP2_INC(px, inc_x, fx0, fx1); LD_GP2_INC(py, inc_y, fy0, fy1); tp0 = c * fx0 + s * fy0; tp1 = c * fy0 - s * fx0; tp2 = c * fx1 + s * fy1; tp3 = c * fy1 - s * fx1; ST_GP2_INC(tp0, tp2, x, inc_x); ST_GP2_INC(tp1, tp3, y, inc_y); } if (n & 1) { fx0 = *px; fy0 = *py; tp0 = c * fx0 + s * fy0; tp1 = c * fy0 - s * fx0; *x = tp0; *y = tp1; } } } return 0; } OpenBLAS-0.2.20/kernel/mips/sscal_msa.c000066400000000000000000000322141313527062700175070ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i; FLOAT *px; FLOAT f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15; v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; v4f32 da_vec; px = x; if (1 == inc_x) { if (0.0 == da) { v4f32 zero_v = __msa_cast_to_vector_float(0); zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); for (i = (n >> 6); i--;) { ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, x, 4); ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, x, 4); } if (n & 63) { if (n & 32) { ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, x, 4); } if (n & 16) { ST_SP4_INC(zero_v, zero_v, zero_v, zero_v, x, 4); } if (n & 8) { ST_SP2_INC(zero_v, zero_v, x, 4); } if (n & 4) { *x = 0; x += 1; *x = 0; x += 1; *x = 0; x += 1; *x = 0; x += 1; } if (n & 2) { *x = 0; x += 1; *x = 0; x += 1; } if (n & 1) { *x = 0; } } } else { da_vec = COPY_FLOAT_TO_VECTOR(da); if (n > 63) { FLOAT *x_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 64 + 32; LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7); for (i = 0; i < (n >> 6) - 1; i++) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(x_pref, 128); PREF_OFFSET(x_pref, 160); PREF_OFFSET(x_pref, 192); PREF_OFFSET(x_pref, 224); x_pref += 64; x8 = LD_SP(px); px += 4; x0 *= da_vec; x9 = LD_SP(px); px += 4; x1 *= da_vec; x10 = LD_SP(px); px += 4; x2 *= da_vec; x11 = LD_SP(px); px += 4; x3 *= da_vec; x12 = LD_SP(px); px += 4; x4 *= da_vec; x13 = LD_SP(px); px += 4; x5 *= da_vec; x14 = LD_SP(px); px += 4; x6 *= da_vec; x15 = LD_SP(px); px += 4; x7 *= da_vec; x8 *= da_vec; ST_SP(x0, x); x += 4; x9 *= da_vec; ST_SP(x1, x); x += 4; x10 *= da_vec; ST_SP(x2, x); x += 4; x11 *= da_vec; ST_SP(x3, x); x += 4; x12 *= da_vec; ST_SP(x4, x); x += 4; x13 *= da_vec; ST_SP(x5, x); x += 4; x14 *= da_vec; ST_SP(x6, x); x += 4; x15 *= da_vec; ST_SP(x7, x); x += 4; ST_SP(x8, x); x += 4; x0 = LD_SP(px); px += 4; ST_SP(x9, x); x += 4; x1 = LD_SP(px); px += 4; ST_SP(x10, x); x += 4; x2 = LD_SP(px); px += 4; ST_SP(x11, x); x += 4; x3 = LD_SP(px); px += 4; ST_SP(x12, x); x += 4; x4 = LD_SP(px); px += 4; ST_SP(x13, x); x += 4; x5 = LD_SP(px); px += 4; ST_SP(x14, x); x += 4; x6 = LD_SP(px); px += 4; ST_SP(x15, x); x += 4; x7 = LD_SP(px); px += 4; } x8 = LD_SP(px); px += 4; x0 *= da_vec; x9 = LD_SP(px); px += 4; x1 *= da_vec; x10 = LD_SP(px); px += 4; x2 *= da_vec; x11 = LD_SP(px); px += 4; x3 *= da_vec; x12 = LD_SP(px); px += 4; x4 *= da_vec; x13 = LD_SP(px); px += 4; x5 *= da_vec; x14 = LD_SP(px); px += 4; x6 *= da_vec; x15 = LD_SP(px); px += 4; x7 *= da_vec; x8 *= da_vec; ST_SP(x0, x); x += 4; x9 *= da_vec; ST_SP(x1, x); x += 4; x10 *= da_vec; ST_SP(x2, x); x += 4; x11 *= da_vec; ST_SP(x3, x); x += 4; x12 *= da_vec; ST_SP(x4, x); x += 4; x13 *= da_vec; ST_SP(x5, x); x += 4; x15 *= da_vec; ST_SP(x6, x); x += 4; x14 *= da_vec; ST_SP(x7, x); 
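/* note: epilogue of the unrolled scaling loop -- the trailing vectors x8..x15 were already scaled above and are stored below via ST_SP8_INC */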
x += 4; ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, x, 4); } if (n & 63) { if (n & 32) { LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7); MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3); MUL4(x4, da_vec, x5, da_vec, x6, da_vec, x7, da_vec, x4, x5, x6, x7); ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4); } if (n & 16) { LD_SP4_INC(px, 4, x0, x1, x2, x3); MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3); ST_SP4_INC(x0, x1, x2, x3, x, 4); } if (n & 8) { LD_SP2_INC(px, 4, x0, x1); MUL2(x0, da_vec, x1, da_vec, x0, x1); ST_SP2_INC(x0, x1, x, 4); } if (n & 4) { LD_GP4_INC(px, 1, f0, f1, f2, f3); MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); ST_GP4_INC(f0, f1, f2, f3, x, 1); } if (n & 2) { LD_GP2_INC(px, 1, f0, f1); MUL2(f0, da, f1, da, f0, f1); ST_GP2_INC(f0, f1, x, 1); } if (n & 1) { *x *= da; } } } } else { if (0.0 == da) { for (i = n; i--;) { *x = 0; x += inc_x; } } else { if (n > 15) { LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7); for (i = 0; i < (n >> 4) - 1; i++) { LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15); MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); f4 *= da; f5 *= da; *x = f0; x += inc_x; f6 *= da; *x = f1; x += inc_x; f7 *= da; *x = f2; x += inc_x; f8 *= da; *x = f3; x += inc_x; f9 *= da; *x = f4; x += inc_x; f10 *= da; *x = f5; x += inc_x; f11 *= da; *x = f6; x += inc_x; f12 *= da; *x = f7; x += inc_x; f13 *= da; *x = f8; x += inc_x; f14 *= da; *x = f9; x += inc_x; f15 *= da; *x = f10; x += inc_x; *x = f11; x += inc_x; f0 = *px; px += inc_x; *x = f12; x += inc_x; f1 = *px; px += inc_x; *x = f13; x += inc_x; f2 = *px; px += inc_x; *x = f14; x += inc_x; f3 = *px; px += inc_x; *x = f15; x += inc_x; f4 = *px; px += inc_x; f5 = *px; px += inc_x; f6 = *px; px += inc_x; f7 = *px; px += inc_x; } LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15); MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); f4 *= da; f5 *= da; *x = f0; x += inc_x; f6 *= da; *x = f1; x += inc_x; f7 *= da; *x = f2; x += inc_x; f8 *= da; *x = f3; x += inc_x; f9 *= da; *x = f4; x += inc_x; f10 *= da; *x = f5; x += inc_x; f11 *= da; *x = f6; x += inc_x; f12 *= da; *x = f7; x += inc_x; f13 *= da; *x = f8; x += inc_x; f14 *= da; *x = f9; x += inc_x; f15 *= da; *x = f10; x += inc_x; *x = f11; x += inc_x; *x = f12; x += inc_x; *x = f13; x += inc_x; *x = f14; x += inc_x; *x = f15; x += inc_x; } if (n & 15) { if (n & 8) { LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7); MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); MUL4(f4, da, f5, da, f6, da, f7, da, f4, f5, f6, f7); ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, x, inc_x); } if (n & 4) { LD_GP4_INC(px, inc_x, f0, f1, f2, f3); MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); ST_GP4_INC(f0, f1, f2, f3, x, inc_x); } if (n & 2) { LD_GP2_INC(px, inc_x, f0, f1); MUL2(f0, da, f1, da, f0, f1); ST_GP2_INC(f0, f1, x, inc_x); } if (n & 1) { *x *= da; } } } } return 0; } OpenBLAS-0.2.20/kernel/mips/sswap_msa.c000066400000000000000000000234311313527062700175400ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0, pref_offsetx, pref_offsety; FLOAT *px, *py; FLOAT x0, x1, x2, x3, x4, x5, x6, x7; FLOAT y0, y1, y2, y3, y4, y5, y6, y7; v4f32 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7; v4f32 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7; if (n < 0) return (0); pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1); if (pref_offsetx > 0) { pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; pref_offsetx = pref_offsetx / sizeof(FLOAT); } pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1); if (pref_offsety > 0) { pref_offsety = L1_DATA_LINESIZE - pref_offsety; pref_offsety = pref_offsety / sizeof(FLOAT); } px = srcx; py = srcy; if ((1 == inc_x) && (1 == inc_y)) { if (n >> 5) { LD_SP8_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7); for (i = (n >> 5) - 1; i--;) { PREFETCH(px + pref_offsetx + 32); PREFETCH(px + pref_offsetx + 40); PREFETCH(px + pref_offsetx + 48); PREFETCH(px + pref_offsetx + 56); PREFETCH(py + pref_offsety + 32); PREFETCH(py + pref_offsety + 40); PREFETCH(py + pref_offsety + 48); PREFETCH(py + pref_offsety + 56); yv0 = LD_SP(py); py += 4; ST_SP(xv0, srcy); srcy += 4; yv1 = LD_SP(py); py += 4; ST_SP(xv1, srcy); srcy += 4; yv2 = LD_SP(py); py += 4; ST_SP(xv2, srcy); srcy += 4; yv3 = LD_SP(py); py += 4; ST_SP(xv3, srcy); srcy += 4; yv4 = LD_SP(py); py += 4; ST_SP(xv4, srcy); srcy += 4; yv5 = LD_SP(py); py += 4; ST_SP(xv5, srcy); srcy += 4; yv6 = LD_SP(py); py += 4; ST_SP(xv6, srcy); srcy += 4; yv7 = LD_SP(py); py += 4; ST_SP(xv7, srcy); srcy += 4; xv0 = LD_SP(px); px += 4; ST_SP(yv0, srcx); srcx += 4; xv1 = LD_SP(px); px += 4; ST_SP(yv1, srcx); srcx += 4; xv2 = LD_SP(px); px += 4; ST_SP(yv2, srcx); srcx += 4; xv3 = LD_SP(px); px += 4; ST_SP(yv3, srcx); srcx += 4; xv4 = LD_SP(px); px += 4; ST_SP(yv4, srcx); srcx += 4; xv5 = LD_SP(px); px += 4; ST_SP(yv5, srcx); srcx += 4; xv6 = LD_SP(px); px += 4; ST_SP(yv6, srcx); srcx += 4; xv7 = LD_SP(px); px += 4; ST_SP(yv7, srcx); srcx += 4; } LD_SP8_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7); ST_SP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 4); ST_SP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 4); } if (n & 31) { if 
((n & 16) && (n & 8) && (n & 4)) { LD_SP7_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6); LD_SP7_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6); ST_SP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 4); ST_SP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 4); } else if ((n & 16) && (n & 8)) { LD_SP6_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5); LD_SP6_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5); ST_SP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 4); ST_SP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 4); } else if ((n & 16) && (n & 4)) { LD_SP5_INC(px, 4, xv0, xv1, xv2, xv3, xv4); LD_SP5_INC(py, 4, yv0, yv1, yv2, yv3, yv4); ST_SP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 4); ST_SP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 4); } else if ((n & 8) && (n & 4)) { LD_SP3_INC(px, 4, xv0, xv1, xv2); LD_SP3_INC(py, 4, yv0, yv1, yv2); ST_SP3_INC(xv0, xv1, xv2, srcy, 4); ST_SP3_INC(yv0, yv1, yv2, srcx, 4); } else if (n & 16) { LD_SP4_INC(px, 4, xv0, xv1, xv2, xv3); LD_SP4_INC(py, 4, yv0, yv1, yv2, yv3); ST_SP4_INC(xv0, xv1, xv2, xv3, srcy, 4); ST_SP4_INC(yv0, yv1, yv2, yv3, srcx, 4); } else if (n & 8) { LD_SP2_INC(px, 4, xv0, xv1); LD_SP2_INC(py, 4, yv0, yv1); ST_SP2_INC(xv0, xv1, srcy, 4); ST_SP2_INC(yv0, yv1, srcx, 4); } else if (n & 4) { xv0 = LD_SP(px); yv0 = LD_SP(py); px += 4; py += 4; ST_SP(xv0, srcy); ST_SP(yv0, srcx); srcx += 4; srcy += 4; } if ((n & 2) && (n & 1)) { LD_GP3_INC(px, 1, x0, x1, x3); LD_GP3_INC(py, 1, y0, y1, y3); ST_GP3_INC(x0, x1, x3, srcy, 1); ST_GP3_INC(y0, y1, y3, srcx, 1); } else if (n & 2) { LD_GP2_INC(px, 1, x0, x1); LD_GP2_INC(py, 1, y0, y1); ST_GP2_INC(x0, x1, srcy, 1); ST_GP2_INC(y0, y1, srcx, 1); } else if (n & 1) { x0 = px[0]; y0 = py[0]; srcx[0] = y0; srcy[0] = x0; } } } else { for (i = (n >> 3); i--;) { LD_GP8_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6, x7); LD_GP8_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6, y7); ST_GP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y); ST_GP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x); } if (n & 7) { if ((n & 4) && (n & 2) && (n & 1)) { LD_GP7_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6); LD_GP7_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6); ST_GP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y); ST_GP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x); } else if ((n & 4) && (n & 2)) { LD_GP6_INC(px, inc_x, x0, x1, x2, x3, x4, x5); LD_GP6_INC(py, inc_y, y0, y1, y2, y3, y4, y5); ST_GP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y); ST_GP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x); } else if ((n & 4) && (n & 1)) { LD_GP5_INC(px, inc_x, x0, x1, x2, x3, x4); LD_GP5_INC(py, inc_y, y0, y1, y2, y3, y4); ST_GP5_INC(x0, x1, x2, x3, x4, srcy, inc_y); ST_GP5_INC(y0, y1, y2, y3, y4, srcx, inc_x); } else if ((n & 2) && (n & 1)) { LD_GP3_INC(px, inc_x, x0, x1, x2); LD_GP3_INC(py, inc_y, y0, y1, y2); ST_GP3_INC(x0, x1, x2, srcy, inc_y); ST_GP3_INC(y0, y1, y2, srcx, inc_x); } else if (n & 4) { LD_GP4_INC(px, inc_x, x0, x1, x2, x3); LD_GP4_INC(py, inc_y, y0, y1, y2, y3); ST_GP4_INC(x0, x1, x2, x3, srcy, inc_y); ST_GP4_INC(y0, y1, y2, y3, srcx, inc_x); } else if (n & 2) { LD_GP2_INC(px, inc_x, x0, x1); LD_GP2_INC(py, inc_y, y0, y1); ST_GP2_INC(x0, x1, srcy, inc_y); ST_GP2_INC(y0, y1, srcx, inc_x); } else if (n & 1) { x0 = *srcx; y0 = *srcy; *srcx = y0; *srcy = x0; } } } return (0); } OpenBLAS-0.2.20/kernel/mips/strsm_kernel_LN_8x8_msa.c000066400000000000000000001335351313527062700222220ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; v4f32 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15; v4f32 src_a, src_a0, src_a8, src_a9, src_a16, src_a17, src_a18, src_a24; v4f32 src_a25, src_a26, src_a27, src_a32, src_a33, src_a34, src_a35, src_a36; v4f32 src_a40, src_a41, src_a42, src_a43, src_a44, src_a45; v4f32 src_a48, src_a49, src_a50, src_a51, src_a52, src_a53, src_a54; v4f32 src_a56, src_a57, src_a58, src_a59, src_a60, src_a61, src_a62, src_a63; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; FLOAT *c_nxt4line = c + 4 * ldc; FLOAT *c_nxt5line = c + 5 * ldc; FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; LD_SP2(c, 4, src_c0, src_c1); LD_SP2(c_nxt1line, 4, src_c2, src_c3); LD_SP2(c_nxt2line, 4, src_c4, src_c5); LD_SP2(c_nxt3line, 4, src_c6, src_c7); LD_SP2(c_nxt4line, 4, src_c8, src_c9); LD_SP2(c_nxt5line, 4, src_c10, src_c11); LD_SP2(c_nxt6line, 4, src_c12, src_c13); LD_SP2(c_nxt7line, 4, src_c14, src_c15); if (bk > 0) { BLASLONG k, pref_offset; FLOAT *aa = a, *bb = b, *pa0_pref; v4f32 src_a1, src_b0, src_b1, src_b2, src_b3, src_bb0, src_bb1; pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1); if (pref_offset) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } pa0_pref = a + pref_offset; for (k = 0; k < (bk >> 1); k++) { PREF_OFFSET(pa0_pref, 64); PREF_OFFSET(pa0_pref, 96); LD_SP2_INC(aa, 4, src_a0, src_a1); LD_SP2_INC(bb, 4, src_bb0, src_bb1); SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * 
src_b3; src_c7 -= src_a1 * src_b3; SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3); src_c8 -= src_a0 * src_b0; src_c9 -= src_a1 * src_b0; src_c10 -= src_a0 * src_b1; src_c11 -= src_a1 * src_b1; src_c12 -= src_a0 * src_b2; src_c13 -= src_a1 * src_b2; src_c14 -= src_a0 * src_b3; src_c15 -= src_a1 * src_b3; LD_SP2_INC(aa, 4, src_a0, src_a1); LD_SP2_INC(bb, 4, src_bb0, src_bb1); SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * src_b3; src_c7 -= src_a1 * src_b3; SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3); src_c8 -= src_a0 * src_b0; src_c9 -= src_a1 * src_b0; src_c10 -= src_a0 * src_b1; src_c11 -= src_a1 * src_b1; src_c12 -= src_a0 * src_b2; src_c13 -= src_a1 * src_b2; src_c14 -= src_a0 * src_b3; src_c15 -= src_a1 * src_b3; pa0_pref += 16; } if (bk & 1) { LD_SP2_INC(aa, 4, src_a0, src_a1); LD_SP2_INC(bb, 4, src_bb0, src_bb1); SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * src_b3; src_c7 -= src_a1 * src_b3; SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3); src_c8 -= src_a0 * src_b0; src_c9 -= src_a1 * src_b0; src_c10 -= src_a0 * src_b1; src_c11 -= src_a1 * src_b1; src_c12 -= src_a0 * src_b2; src_c13 -= src_a1 * src_b2; src_c14 -= src_a0 * src_b3; src_c15 -= src_a1 * src_b3; } } a -= 64; b -= 64; TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7, res_c4, res_c5, res_c6, res_c7); TRANSPOSE4x4_SP_SP(src_c9, src_c11, src_c13, src_c15, res_c12, res_c13, res_c14, res_c15); TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6, res_c0, res_c1, res_c2, res_c3); TRANSPOSE4x4_SP_SP(src_c8, src_c10, src_c12, src_c14, res_c8, res_c9, res_c10, res_c11); src_a = LD_SP(a + 60); SPLATI_W4_SP(src_a, src_a60, src_a61, src_a62, src_a63); src_a = LD_SP(a + 56); SPLATI_W4_SP(src_a, src_a56, src_a57, src_a58, src_a59); res_c7 *= src_a63; res_c15 *= src_a63; res_c6 -= res_c7 * src_a62; res_c14 -= res_c15 * src_a62; res_c5 -= res_c7 * src_a61; res_c13 -= res_c15 * src_a61; res_c4 -= res_c7 * src_a60; res_c12 -= res_c15 * src_a60; res_c3 -= res_c7 * src_a59; res_c11 -= res_c15 * src_a59; res_c2 -= res_c7 * src_a58; res_c10 -= res_c15 * src_a58; res_c1 -= res_c7 * src_a57; res_c9 -= res_c15 * src_a57; res_c0 -= res_c7 * src_a56; res_c8 -= res_c15 * src_a56; src_a = LD_SP(a + 48); SPLATI_W4_SP(src_a, src_a48, src_a49, src_a50, src_a51); src_a52 = LD_SP(a + 52); src_a54 = (v4f32) __msa_splati_w((v4i32) src_a52, 2); src_a53 = (v4f32) __msa_splati_w((v4i32) src_a52, 1); src_a52 = (v4f32) __msa_splati_w((v4i32) src_a52, 0); res_c6 *= src_a54; res_c14 *= src_a54; res_c5 -= res_c6 * src_a53; res_c13 -= res_c14 * src_a53; res_c4 -= res_c6 * src_a52; res_c12 -= res_c14 * src_a52; res_c3 -= res_c6 * src_a51; res_c11 -= res_c14 * src_a51; res_c2 -= res_c6 * src_a50; res_c10 -= res_c14 * src_a50; res_c1 -= res_c6 * src_a49; res_c9 -= res_c14 * src_a49; res_c0 -= res_c6 * src_a48; res_c8 -= res_c14 * src_a48; src_a = LD_SP(a + 40); SPLATI_W4_SP(src_a, src_a40, src_a41, src_a42, src_a43); src_a44 = LD_SP(a + 44); src_a45 = (v4f32) __msa_splati_w((v4i32) src_a44, 1); src_a44 = (v4f32) __msa_splati_w((v4i32) src_a44, 0); res_c5 *= src_a45; res_c13 *= src_a45; res_c4 -= res_c5 * src_a44; res_c12 -= res_c13 * src_a44; res_c3 -= res_c5 * src_a43; res_c11 
-= res_c13 * src_a43; res_c2 -= res_c5 * src_a42; res_c10 -= res_c13 * src_a42; res_c1 -= res_c5 * src_a41; res_c9 -= res_c13 * src_a41; res_c0 -= res_c5 * src_a40; res_c8 -= res_c13 * src_a40; src_a = LD_SP(a + 32); SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35); src_a36 = COPY_FLOAT_TO_VECTOR(*(a + 36)); res_c4 *= src_a36; res_c12 *= src_a36; res_c3 -= res_c4 * src_a35; res_c11 -= res_c12 * src_a35; res_c2 -= res_c4 * src_a34; res_c10 -= res_c12 * src_a34; res_c1 -= res_c4 * src_a33; res_c9 -= res_c12 * src_a33; res_c0 -= res_c4 * src_a32; res_c8 -= res_c12 * src_a32; ST_SP4(res_c4, res_c12, res_c5, res_c13, b + 32, 4); ST_SP4(res_c6, res_c14, res_c7, res_c15, b + 48, 4); TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, src_c1, src_c3, src_c5, src_c7); TRANSPOSE4x4_SP_SP(res_c12, res_c13, res_c14, res_c15, src_c9, src_c11, src_c13, src_c15); ST_SP(src_c1, c + 4); ST_SP(src_c3, c_nxt1line + 4); ST_SP(src_c5, c_nxt2line + 4); ST_SP(src_c7, c_nxt3line + 4); ST_SP(src_c9, c_nxt4line + 4); ST_SP(src_c11, c_nxt5line + 4); ST_SP(src_c13, c_nxt6line + 4); ST_SP(src_c15, c_nxt7line + 4); src_a = LD_SP(a + 24); SPLATI_W4_SP(src_a, src_a24, src_a25, src_a26, src_a27); res_c3 *= src_a27; res_c11 *= src_a27; res_c2 -= res_c3 * src_a26; res_c10 -= res_c11 * src_a26; res_c1 -= res_c3 * src_a25; res_c9 -= res_c11 * src_a25; res_c0 -= res_c3 * src_a24; res_c8 -= res_c11 * src_a24; src_a16 = LD_SP(a + 16); src_a18 = (v4f32) __msa_splati_w((v4i32) src_a16, 2); src_a17 = (v4f32) __msa_splati_w((v4i32) src_a16, 1); src_a16 = (v4f32) __msa_splati_w((v4i32) src_a16, 0); res_c2 *= src_a18; res_c10 *= src_a18; res_c1 -= res_c2 * src_a17; res_c9 -= res_c10 * src_a17; res_c0 -= res_c2 * src_a16; res_c8 -= res_c10 * src_a16; src_a9 = COPY_FLOAT_TO_VECTOR(*(a + 9)); src_a8 = COPY_FLOAT_TO_VECTOR(*(a + 8)); src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0)); res_c1 *= src_a9; res_c9 *= src_a9; res_c0 -= res_c1 * src_a8; res_c8 -= res_c9 * src_a8; res_c0 *= src_a0; res_c8 *= src_a0; ST_SP4(res_c0, res_c8, res_c1, res_c9, b, 4); ST_SP4(res_c2, res_c10, res_c3, res_c11, b + 16, 4); TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, src_c0, src_c2, src_c4, src_c6); TRANSPOSE4x4_SP_SP(res_c8, res_c9, res_c10, res_c11, src_c8, src_c10, src_c12, src_c14); ST_SP(src_c0, c); ST_SP(src_c2, c_nxt1line); ST_SP(src_c4, c_nxt2line); ST_SP(src_c6, c_nxt3line); ST_SP(src_c8, c_nxt4line); ST_SP(src_c10, c_nxt5line); ST_SP(src_c12, c_nxt6line); ST_SP(src_c14, c_nxt7line); } static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; v4f32 src_a, src_a0, src_a8, src_a9, src_a16, src_a17, src_a18, src_a24; v4f32 src_a25, src_a26, src_a27, src_a32, src_a33, src_a34, src_a35; v4f32 src_a36, src_a40, src_a41, src_a42, src_a43, src_a44, src_a45; v4f32 src_a48, src_a49, src_a50, src_a51, src_a52, src_a53, src_a54; v4f32 src_a56, src_a57, src_a58, src_a59, src_a60, src_a61, src_a62, src_a63; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; LD_SP2(c, 4, src_c0, src_c1); LD_SP2(c_nxt1line, 4, src_c2, src_c3); LD_SP2(c_nxt2line, 4, src_c4, src_c5); LD_SP2(c_nxt3line, 4, src_c6, src_c7); for (k = 0; k < (bk >> 1); k++) { LD_SP2(aa, 4, src_a0, src_a1); src_b = LD_SP(bb + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * 
src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * src_b3; src_c7 -= src_a1 * src_b3; aa += 8; bb += 4; LD_SP2(aa, 4, src_a0, src_a1); src_b = LD_SP(bb + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * src_b3; src_c7 -= src_a1 * src_b3; aa += 8; bb += 4; } if ((bk & 1) && (bk > 0)) { LD_SP2(aa, 4, src_a0, src_a1); src_b = LD_SP(bb + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * src_b3; src_c7 -= src_a1 * src_b3; } a -= 64; b -= 32; TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6, res_c0, res_c1, res_c2, res_c3); TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7, res_c4, res_c5, res_c6, res_c7); src_a = LD_SP(a + 60); SPLATI_W4_SP(src_a, src_a60, src_a61, src_a62, src_a63); src_a = LD_SP(a + 56); SPLATI_W4_SP(src_a, src_a56, src_a57, src_a58, src_a59); src_a = LD_SP(a + 48); SPLATI_W4_SP(src_a, src_a48, src_a49, src_a50, src_a51); src_a52 = LD_SP(a + 52); src_a54 = (v4f32) __msa_splati_w((v4i32) src_a52, 2); src_a53 = (v4f32) __msa_splati_w((v4i32) src_a52, 1); src_a52 = (v4f32) __msa_splati_w((v4i32) src_a52, 0); res_c7 *= src_a63; res_c6 -= res_c7 * src_a62; res_c5 -= res_c7 * src_a61; res_c4 -= res_c7 * src_a60; res_c3 -= res_c7 * src_a59; res_c2 -= res_c7 * src_a58; res_c1 -= res_c7 * src_a57; res_c0 -= res_c7 * src_a56; res_c6 *= src_a54; res_c5 -= res_c6 * src_a53; res_c4 -= res_c6 * src_a52; res_c3 -= res_c6 * src_a51; res_c2 -= res_c6 * src_a50; res_c1 -= res_c6 * src_a49; res_c0 -= res_c6 * src_a48; src_a = LD_SP(a + 40); SPLATI_W4_SP(src_a, src_a40, src_a41, src_a42, src_a43); src_a44 = LD_SP(a + 44); src_a45 = (v4f32) __msa_splati_w((v4i32) src_a44, 1); src_a44 = (v4f32) __msa_splati_w((v4i32) src_a44, 0); res_c5 *= src_a45; res_c4 -= res_c5 * src_a44; res_c3 -= res_c5 * src_a43; res_c2 -= res_c5 * src_a42; res_c1 -= res_c5 * src_a41; res_c0 -= res_c5 * src_a40; src_a = LD_SP(a + 32); SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35); src_a36 = COPY_FLOAT_TO_VECTOR(*(a + 36)); res_c4 *= src_a36; res_c3 -= res_c4 * src_a35; res_c2 -= res_c4 * src_a34; res_c1 -= res_c4 * src_a33; res_c0 -= res_c4 * src_a32; src_a = LD_SP(a + 24); SPLATI_W4_SP(src_a, src_a24, src_a25, src_a26, src_a27); res_c3 *= src_a27; res_c2 -= res_c3 * src_a26; res_c1 -= res_c3 * src_a25; res_c0 -= res_c3 * src_a24; src_a16 = LD_SP(a + 16); src_a18 = (v4f32) __msa_splati_w((v4i32) src_a16, 2); src_a17 = (v4f32) __msa_splati_w((v4i32) src_a16, 1); src_a16 = (v4f32) __msa_splati_w((v4i32) src_a16, 0); res_c2 *= src_a18; res_c1 -= res_c2 * src_a17; res_c0 -= res_c2 * src_a16; src_a9 = COPY_FLOAT_TO_VECTOR(*(a + 9)); src_a8 = COPY_FLOAT_TO_VECTOR(*(a + 8)); src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0)); res_c1 *= src_a9; res_c0 -= res_c1 * src_a8; res_c0 *= src_a0; ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4); ST_SP4(res_c4, res_c5, res_c6, res_c7, b + 16, 4); TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, src_c0, src_c2, src_c4, src_c6); TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, src_c1, src_c3, src_c5, src_c7); ST_SP2(src_c0, src_c1, c, 4); ST_SP2(src_c2, src_c3, c_nxt1line, 4); ST_SP2(src_c4, src_c5, c_nxt2line, 
4); ST_SP2(src_c6, src_c7, c_nxt3line, 4); } static void ssolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35; FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53; FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63; FLOAT c0, c1, c2, c3, c4, c5, c6, c7; FLOAT c0_nxt, c1_nxt, c2_nxt, c3_nxt, c4_nxt, c5_nxt, c6_nxt, c7_nxt; c0 = *(c + 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); c4 = *(c + 4); c5 = *(c + 5); c6 = *(c + 6); c7 = *(c + 7); c0_nxt = *(c + 0 + ldc); c1_nxt = *(c + 1 + ldc); c2_nxt = *(c + 2 + ldc); c3_nxt = *(c + 3 + ldc); c4_nxt = *(c + 4 + ldc); c5_nxt = *(c + 5 + ldc); c6_nxt = *(c + 6 + ldc); c7_nxt = *(c + 7 + ldc); for (k = 0; k < bk; k++) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; c2 -= aa[2] * bb[0]; c3 -= aa[3] * bb[0]; c4 -= aa[4] * bb[0]; c5 -= aa[5] * bb[0]; c6 -= aa[6] * bb[0]; c7 -= aa[7] * bb[0]; c0_nxt -= aa[0] * bb[1]; c1_nxt -= aa[1] * bb[1]; c2_nxt -= aa[2] * bb[1]; c3_nxt -= aa[3] * bb[1]; c4_nxt -= aa[4] * bb[1]; c5_nxt -= aa[5] * bb[1]; c6_nxt -= aa[6] * bb[1]; c7_nxt -= aa[7] * bb[1]; aa += 8; bb += 2; } a -= 64; b -= 16; a0 = *(a + 0); a8 = *(a + 8); a9 = *(a + 9); a16 = *(a + 16); a17 = *(a + 17); a18 = *(a + 18); a24 = *(a + 24); a25 = *(a + 25); a26 = *(a + 26); a27 = *(a + 27); a32 = *(a + 32); a33 = *(a + 33); a34 = *(a + 34); a35 = *(a + 35); a36 = *(a + 36); a40 = *(a + 40); a41 = *(a + 41); a42 = *(a + 42); a43 = *(a + 43); a44 = *(a + 44); a45 = *(a + 45); a48 = *(a + 48); a49 = *(a + 49); a50 = *(a + 50); a51 = *(a + 51); a52 = *(a + 52); a53 = *(a + 53); a54 = *(a + 54); a56 = *(a + 56); a57 = *(a + 57); a58 = *(a + 58); a59 = *(a + 59); a60 = *(a + 60); a61 = *(a + 61); a62 = *(a + 62); a63 = *(a + 63); c7 *= a63; c7_nxt *= a63; c6 -= c7 * a62; c6_nxt -= c7_nxt * a62; c5 -= c7 * a61; c5_nxt -= c7_nxt * a61; c4 -= c7 * a60; c4_nxt -= c7_nxt * a60; c3 -= c7 * a59; c3_nxt -= c7_nxt * a59; c2 -= c7 * a58; c2_nxt -= c7_nxt * a58; c1 -= c7 * a57; c1_nxt -= c7_nxt * a57; c0 -= c7 * a56; c0_nxt -= c7_nxt * a56; c6 *= a54; c6_nxt *= a54; c5 -= c6 * a53; c5_nxt -= c6_nxt * a53; c4 -= c6 * a52; c4_nxt -= c6_nxt * a52; c3 -= c6 * a51; c3_nxt -= c6_nxt * a51; c2 -= c6 * a50; c2_nxt -= c6_nxt * a50; c1 -= c6 * a49; c1_nxt -= c6_nxt * a49; c0 -= c6 * a48; c0_nxt -= c6_nxt * a48; c5 *= a45; c5_nxt *= a45; c4 -= c5 * a44; c4_nxt -= c5_nxt * a44; c3 -= c5 * a43; c3_nxt -= c5_nxt * a43; c2 -= c5 * a42; c2_nxt -= c5_nxt * a42; c1 -= c5 * a41; c1_nxt -= c5_nxt * a41; c0 -= c5 * a40; c0_nxt -= c5_nxt * a40; c4 *= a36; c4_nxt *= a36; c3 -= c4 * a35; c3_nxt -= c4_nxt * a35; c2 -= c4 * a34; c2_nxt -= c4_nxt * a34; c1 -= c4 * a33; c1_nxt -= c4_nxt * a33; c0 -= c4 * a32; c0_nxt -= c4_nxt * a32; c3 *= a27; c3_nxt *= a27; c2 -= c3 * a26; c2_nxt -= c3_nxt * a26; c1 -= c3 * a25; c1_nxt -= c3_nxt * a25; c0 -= c3 * a24; c0_nxt -= c3_nxt * a24; c2 *= a18; c2_nxt *= a18; c1 -= c2 * a17; c1_nxt -= c2_nxt * a17; c0 -= c2 * a16; c0_nxt -= c2_nxt * a16; c1 *= a9; c1_nxt *= a9; c0 -= c1 * a8; c0_nxt -= c1_nxt * a8; c0 *= a0; c0_nxt *= a0; *(b + 0) = c0; *(b + 1) = c0_nxt; *(b + 2) = c1; *(b + 3) = c1_nxt; *(b + 4) = c2; *(b + 5) = c2_nxt; *(b + 6) = c3; *(b + 7) = c3_nxt; *(b + 8) = c4; *(b + 9) = c4_nxt; *(b + 10) = c5; *(b + 11) = c5_nxt; *(b + 12) = c6; *(b + 13) = c6_nxt; *(b + 14) = c7; *(b + 15) = c7_nxt; *(c + 0) = c0; *(c + 1) = c1; *(c + 2) = c2; *(c + 3) = c3; *(c + 4) = c4; *(c + 5) = c5; *(c + 6) = c6; *(c + 7) = c7; 
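/* the second result column (c*_nxt) is written to the next column of C, i.e. at offset ldc */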
*(c + 0 + ldc) = c0_nxt; *(c + 1 + ldc) = c1_nxt; *(c + 2 + ldc) = c2_nxt; *(c + 3 + ldc) = c3_nxt; *(c + 4 + ldc) = c4_nxt; *(c + 5 + ldc) = c5_nxt; *(c + 6 + ldc) = c6_nxt; *(c + 7 + ldc) = c7_nxt; } static void ssolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, a27, a32, a33, a34, a35; FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53; FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63; FLOAT c0, c1, c2, c3, c4, c5, c6, c7; c0 = *(c + 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); c4 = *(c + 4); c5 = *(c + 5); c6 = *(c + 6); c7 = *(c + 7); for (k = 0; k < bk; k++) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; c2 -= aa[2] * bb[0]; c3 -= aa[3] * bb[0]; c4 -= aa[4] * bb[0]; c5 -= aa[5] * bb[0]; c6 -= aa[6] * bb[0]; c7 -= aa[7] * bb[0]; aa += 8; bb += 1; } a -= 64; b -= 8; a0 = *(a + 0); a8 = *(a + 8); a9 = *(a + 9); a16 = *(a + 16); a17 = *(a + 17); a18 = *(a + 18); a24 = *(a + 24); a25 = *(a + 25); a26 = *(a + 26); a27 = *(a + 27); a32 = *(a + 32); a33 = *(a + 33); a34 = *(a + 34); a35 = *(a + 35); a36 = *(a + 36); a40 = *(a + 40); a41 = *(a + 41); a42 = *(a + 42); a43 = *(a + 43); a44 = *(a + 44); a45 = *(a + 45); a48 = *(a + 48); a49 = *(a + 49); a50 = *(a + 50); a51 = *(a + 51); a52 = *(a + 52); a53 = *(a + 53); a54 = *(a + 54); a56 = *(a + 56); a57 = *(a + 57); a58 = *(a + 58); a59 = *(a + 59); a60 = *(a + 60); a61 = *(a + 61); a62 = *(a + 62); a63 = *(a + 63); c7 *= a63; c6 -= c7 * a62; c6 *= a54; c5 -= c7 * a61; c5 -= c6 * a53; c5 *= a45; c4 -= c7 * a60; c4 -= c6 * a52; c4 -= c5 * a44; c4 *= a36; c3 -= c7 * a59; c3 -= c6 * a51; c3 -= c5 * a43; c3 -= c4 * a35; c3 *= a27; c2 -= c7 * a58; c2 -= c6 * a50; c2 -= c5 * a42; c2 -= c4 * a34; c2 -= c3 * a26; c2 *= a18; c1 -= c7 * a57; c1 -= c6 * a49; c1 -= c5 * a41; c1 -= c4 * a33; c1 -= c3 * a25; c1 -= c2 * a17; c1 *= a9; c0 -= c7 * a56; c0 -= c6 * a48; c0 -= c5 * a40; c0 -= c4 * a32; c0 -= c3 * a24; c0 -= c2 * a16; c0 -= c1 * a8; c0 *= a0; *(b + 0) = c0; *(b + 1) = c1; *(b + 2) = c2; *(b + 3) = c3; *(b + 4) = c4; *(b + 5) = c5; *(b + 6) = c6; *(b + 7) = c7; *(c + 0) = c0; *(c + 1) = c1; *(c + 2) = c2; *(c + 3) = c3; *(c + 4) = c4; *(c + 5) = c5; *(c + 6) = c6; *(c + 7) = c7; } static void ssolve_4x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; v4f32 src_b, src_b0, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; v4f32 src_a, src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12; v4f32 src_a13, src_a14, src_a15; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; FLOAT *c_nxt4line = c + 4 * ldc; FLOAT *c_nxt5line = c + 5 * ldc; FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; src_c0 = LD_SP(c); src_c1 = LD_SP(c_nxt1line); src_c2 = LD_SP(c_nxt2line); src_c3 = LD_SP(c_nxt3line); src_c4 = LD_SP(c_nxt4line); src_c5 = LD_SP(c_nxt5line); src_c6 = LD_SP(c_nxt6line); src_c7 = LD_SP(c_nxt7line); for (k = 0; k < bk; k++) { src_a0 = LD_SP(aa); src_b = LD_SP(bb); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a0 * src_b1; src_c2 -= src_a0 * src_b2; src_c3 -= src_a0 * src_b3; src_b = LD_SP(bb + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c4 -= src_a0 * src_b0; src_c5 -= src_a0 * src_b1; src_c6 -= src_a0 * src_b2; src_c7 -= src_a0 * src_b3; aa += 4; 
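/* per k iteration the packed A pointer advances by one 4-element column; packed B advances by one 8-element row just below */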
bb += 8; } a -= 16; b -= 32; TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3); TRANSPOSE4x4_SP_SP(src_c4, src_c5, src_c6, src_c7, res_c4, res_c5, res_c6, res_c7); src_a = LD_SP(a + 12); SPLATI_W4_SP(src_a, src_a12, src_a13, src_a14, src_a15); src_a8 = LD_SP(a + 8); src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2); src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1); src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); src_a5 = COPY_FLOAT_TO_VECTOR(*(a + 5)); src_a4 = COPY_FLOAT_TO_VECTOR(*(a + 4)); src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0)); res_c3 *= src_a15; res_c7 *= src_a15; res_c2 -= res_c3 * src_a14; res_c6 -= res_c7 * src_a14; res_c1 -= res_c3 * src_a13; res_c5 -= res_c7 * src_a13; res_c0 -= res_c3 * src_a12; res_c4 -= res_c7 * src_a12; res_c2 *= src_a10; res_c6 *= src_a10; res_c1 -= res_c2 * src_a9; res_c5 -= res_c6 * src_a9; res_c0 -= res_c2 * src_a8; res_c4 -= res_c6 * src_a8; res_c1 *= src_a5; res_c5 *= src_a5; res_c0 -= res_c1 * src_a4; res_c4 -= res_c5 * src_a4; res_c0 *= src_a0; res_c4 *= src_a0; ST_SP4(res_c0, res_c4, res_c1, res_c5, b, 4); ST_SP4(res_c2, res_c6, res_c3, res_c7, b + 16, 4); TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, src_c0, src_c1, src_c2, src_c3); TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, src_c4, src_c5, src_c6, src_c7); ST_SP(src_c0, c); ST_SP(src_c1, c_nxt1line); ST_SP(src_c2, c_nxt2line); ST_SP(src_c3, c_nxt3line); ST_SP(src_c4, c_nxt4line); ST_SP(src_c5, c_nxt5line); ST_SP(src_c6, c_nxt6line); ST_SP(src_c7, c_nxt7line); } static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; v4f32 src_b, src_b0, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3; v4f32 src_a, src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12; v4f32 src_a13, src_a14, src_a15; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; src_c0 = LD_SP(c); src_c1 = LD_SP(c_nxt1line); src_c2 = LD_SP(c_nxt2line); src_c3 = LD_SP(c_nxt3line); for (k = 0; k < (bk >> 1); k++) { src_a0 = LD_SP(aa); src_b = LD_SP(bb); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a0 * src_b1; src_c2 -= src_a0 * src_b2; src_c3 -= src_a0 * src_b3; aa += 4; bb += 4; src_a0 = LD_SP(aa); src_b = LD_SP(bb); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a0 * src_b1; src_c2 -= src_a0 * src_b2; src_c3 -= src_a0 * src_b3; aa += 4; bb += 4; } if ((bk & 1) && (bk > 0)) { src_a0 = LD_SP(aa); src_b = LD_SP(bb); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a0 * src_b1; src_c2 -= src_a0 * src_b2; src_c3 -= src_a0 * src_b3; } a -= 16; b -= 16; TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3); src_a = LD_SP(a + 12); SPLATI_W4_SP(src_a, src_a12, src_a13, src_a14, src_a15); src_a8 = LD_SP(a + 8); src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2); src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1); src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0); src_a5 = COPY_FLOAT_TO_VECTOR(*(a + 5)); src_a4 = COPY_FLOAT_TO_VECTOR(*(a + 4)); src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0)); res_c3 *= src_a15; res_c2 -= res_c3 * src_a14; res_c1 -= res_c3 * src_a13; res_c0 -= res_c3 * src_a12; res_c2 *= src_a10; res_c1 -= res_c2 * src_a9; res_c0 -= res_c2 * src_a8; res_c1 *= src_a5; res_c0 -= res_c1 * src_a4; res_c0 *= src_a0; ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4); 
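/* The 4x4 solve just above walks the diagonal block from its last entry
   (src_a15) back to the first (src_a0); the diagonal factors are applied as
   multiplies, so they are presumably stored pre-inverted by the TRSM packing
   step. The solved values have been written back to the packed B buffer with
   ST_SP4; the transpose below converts them back to the memory layout of C
   (one vector per ldc line) before they are stored. */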
TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, src_c0, src_c1, src_c2, src_c3); ST_SP(src_c0, c); ST_SP(src_c1, c_nxt1line); ST_SP(src_c2, c_nxt2line); ST_SP(src_c3, c_nxt3line); } static void ssolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15; FLOAT c0, c1, c2, c3, c0_nxt, c1_nxt, c2_nxt, c3_nxt; c0 = *(c + 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); c0_nxt = *(c + 0 + ldc); c1_nxt = *(c + 1 + ldc); c2_nxt = *(c + 2 + ldc); c3_nxt = *(c + 3 + ldc); for (k = 0; k < bk; k++) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; c2 -= aa[2] * bb[0]; c3 -= aa[3] * bb[0]; c0_nxt -= aa[0] * bb[1]; c1_nxt -= aa[1] * bb[1]; c2_nxt -= aa[2] * bb[1]; c3_nxt -= aa[3] * bb[1]; aa += 4; bb += 2; } a -= 16; b -= 8; a0 = *(a + 0); a4 = *(a + 4); a5 = *(a + 5); a8 = *(a + 8); a9 = *(a + 9); a10 = *(a + 10); a12 = *(a + 12); a13 = *(a + 13); a14 = *(a + 14); a15 = *(a + 15); c3 *= a15; c3_nxt *= a15; c2 -= c3 * a14; c2_nxt -= c3_nxt * a14; c2 *= a10; c2_nxt *= a10; c1 -= c3 * a13; c1_nxt -= c3_nxt * a13; c1 -= c2 * a9; c1_nxt -= c2_nxt * a9; c1 *= a5; c1_nxt *= a5; c0 -= c3 * a12; c0_nxt -= c3_nxt * a12; c0 -= c2 * a8; c0_nxt -= c2_nxt * a8; c0 -= c1 * a4; c0_nxt -= c1_nxt * a4; c0 *= a0; c0_nxt *= a0; *(b + 0) = c0; *(b + 1) = c0_nxt; *(b + 2) = c1; *(b + 3) = c1_nxt; *(b + 4) = c2; *(b + 5) = c2_nxt; *(b + 6) = c3; *(b + 7) = c3_nxt; *(c + 0) = c0; *(c + 1) = c1; *(c + 2) = c2; *(c + 3) = c3; *(c + 0 + ldc) = c0_nxt; *(c + 1 + ldc) = c1_nxt; *(c + 2 + ldc) = c2_nxt; *(c + 3 + ldc) = c3_nxt; } static void ssolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15, c0, c1, c2, c3; c0 = *(c + 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); for (k = 0; k < bk; k++) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; c2 -= aa[2] * bb[0]; c3 -= aa[3] * bb[0]; aa += 4; bb += 1; } a -= 16; b -= 4; a0 = *(a + 0); a4 = *(a + 4); a5 = *(a + 5); a8 = *(a + 8); a9 = *(a + 9); a10 = *(a + 10); a12 = *(a + 12); a13 = *(a + 13); a14 = *(a + 14); a15 = *(a + 15); c3 *= a15; c2 -= c3 * a14; c2 *= a10; c1 -= c3 * a13; c1 -= c2 * a9; c1 *= a5; c0 -= c3 * a12; c0 -= c2 * a8; c0 -= c1 * a4; c0 *= a0; *(b + 0) = c0; *(b + 1) = c1; *(b + 2) = c2; *(b + 3) = c3; *(c + 0) = c0; *(c + 1) = c1; *(c + 2) = c2; *(c + 3) = c3; } static void ssolve_2x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3; FLOAT c1_nxt3, c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6; FLOAT c0_nxt7, c1_nxt7; c0 = *(c + 0); c1 = *(c + 1); c0_nxt1 = *(c + 0 + 1 * ldc); c1_nxt1 = *(c + 1 + 1 * ldc); c0_nxt2 = *(c + 0 + 2 * ldc); c1_nxt2 = *(c + 1 + 2 * ldc); c0_nxt3 = *(c + 0 + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); c0_nxt4 = *(c + 0 + 4 * ldc); c1_nxt4 = *(c + 1 + 4 * ldc); c0_nxt5 = *(c + 0 + 5 * ldc); c1_nxt5 = *(c + 1 + 5 * ldc); c0_nxt6 = *(c + 0 + 6 * ldc); c1_nxt6 = *(c + 1 + 6 * ldc); c0_nxt7 = *(c + 0 + 7 * ldc); c1_nxt7 = *(c + 1 + 7 * ldc); for (k = 0; k < bk; k++) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; c0_nxt1 -= aa[0] * bb[1]; c1_nxt1 -= aa[1] * bb[1]; c0_nxt2 -= aa[0] * bb[2]; c1_nxt2 -= aa[1] * bb[2]; c0_nxt3 -= aa[0] * bb[3]; c1_nxt3 -= aa[1] * bb[3]; c0_nxt4 -= aa[0] * bb[4]; c1_nxt4 -= aa[1] * bb[4]; c0_nxt5 -= aa[0] * bb[5]; c1_nxt5 -= aa[1] * bb[5]; c0_nxt6 -= aa[0] * bb[6]; c1_nxt6 -= aa[1] * bb[6]; c0_nxt7 -= aa[0] 
* bb[7]; c1_nxt7 -= aa[1] * bb[7]; aa += 2; bb += 8; } a -= 4; b -= 16; a0 = *(a + 0); a2 = *(a + 2); a3 = *(a + 3); c1 *= a3; c1_nxt1 *= a3; c1_nxt2 *= a3; c1_nxt3 *= a3; c1_nxt4 *= a3; c1_nxt5 *= a3; c1_nxt6 *= a3; c1_nxt7 *= a3; c0 -= c1 * a2; c0_nxt1 -= c1_nxt1 * a2; c0_nxt2 -= c1_nxt2 * a2; c0_nxt3 -= c1_nxt3 * a2; c0_nxt4 -= c1_nxt4 * a2; c0_nxt5 -= c1_nxt5 * a2; c0_nxt6 -= c1_nxt6 * a2; c0_nxt7 -= c1_nxt7 * a2; c0 *= a0; c0_nxt1 *= a0; c0_nxt2 *= a0; c0_nxt3 *= a0; c0_nxt4 *= a0; c0_nxt5 *= a0; c0_nxt6 *= a0; c0_nxt7 *= a0; *(b + 0) = c0; *(b + 1) = c0_nxt1; *(b + 2) = c0_nxt2; *(b + 3) = c0_nxt3; *(b + 4) = c0_nxt4; *(b + 5) = c0_nxt5; *(b + 6) = c0_nxt6; *(b + 7) = c0_nxt7; *(b + 8) = c1; *(b + 9) = c1_nxt1; *(b + 10) = c1_nxt2; *(b + 11) = c1_nxt3; *(b + 12) = c1_nxt4; *(b + 13) = c1_nxt5; *(b + 14) = c1_nxt6; *(b + 15) = c1_nxt7; *(c + 0) = c0; *(c + 1) = c1; *(c + 0 + 1 * ldc) = c0_nxt1; *(c + 1 + 1 * ldc) = c1_nxt1; *(c + 0 + 2 * ldc) = c0_nxt2; *(c + 1 + 2 * ldc) = c1_nxt2; *(c + 0 + 3 * ldc) = c0_nxt3; *(c + 1 + 3 * ldc) = c1_nxt3; *(c + 0 + 4 * ldc) = c0_nxt4; *(c + 1 + 4 * ldc) = c1_nxt4; *(c + 0 + 5 * ldc) = c0_nxt5; *(c + 1 + 5 * ldc) = c1_nxt5; *(c + 0 + 6 * ldc) = c0_nxt6; *(c + 1 + 6 * ldc) = c1_nxt6; *(c + 0 + 7 * ldc) = c0_nxt7; *(c + 1 + 7 * ldc) = c1_nxt7; } static void ssolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1; FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; c0 = *(c + 0); c1 = *(c + 1); c0_nxt1 = *(c + 0 + ldc); c1_nxt1 = *(c + 1 + ldc); c0_nxt2 = *(c + 0 + 2 * ldc); c1_nxt2 = *(c + 1 + 2 * ldc); c0_nxt3 = *(c + 0 + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); for (k = 0; k < bk; k++) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; c0_nxt1 -= aa[0] * bb[1]; c1_nxt1 -= aa[1] * bb[1]; c0_nxt2 -= aa[0] * bb[2]; c1_nxt2 -= aa[1] * bb[2]; c0_nxt3 -= aa[0] * bb[3]; c1_nxt3 -= aa[1] * bb[3]; aa += 2; bb += 4; } a -= 4; b -= 8; a0 = *(a + 0); a2 = *(a + 2); a3 = *(a + 3); c1 *= a3; c1_nxt1 *= a3; c1_nxt2 *= a3; c1_nxt3 *= a3; c0 -= c1 * a2; c0_nxt1 -= c1_nxt1 * a2; c0_nxt2 -= c1_nxt2 * a2; c0_nxt3 -= c1_nxt3 * a2; c0 *= a0; c0_nxt1 *= a0; c0_nxt2 *= a0; c0_nxt3 *= a0; *(b + 0) = c0; *(b + 1) = c0_nxt1; *(b + 2) = c0_nxt2; *(b + 3) = c0_nxt3; *(b + 4) = c1; *(b + 5) = c1_nxt1; *(b + 6) = c1_nxt2; *(b + 7) = c1_nxt3; *(c + 0) = c0; *(c + 1) = c1; *(c + 0 + ldc) = c0_nxt1; *(c + 1 + ldc) = c1_nxt1; *(c + 0 + 2 * ldc) = c0_nxt2; *(c + 1 + 2 * ldc) = c1_nxt2; *(c + 0 + 3 * ldc) = c0_nxt3; *(c + 1 + 3 * ldc) = c1_nxt3; } static void ssolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; FLOAT a0, a2, a3, c0, c1, c0_nxt, c1_nxt; c0 = *(c + 0); c1 = *(c + 1); c0_nxt = *(c + 0 + ldc); c1_nxt = *(c + 1 + ldc); for (k = 0; k < bk; k++) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; c0_nxt -= aa[0] * bb[1]; c1_nxt -= aa[1] * bb[1]; aa += 2; bb += 2; } a -= 4; b -= 4; a0 = *(a + 0); a2 = *(a + 2); a3 = *(a + 3); c1 *= a3; c1_nxt *= a3; c0 -= c1 * a2; c0_nxt -= c1_nxt * a2; c0 *= a0; c0_nxt *= a0; *(b + 0) = c0; *(b + 1) = c0_nxt; *(b + 2) = c1; *(b + 3) = c1_nxt; *(c + 0) = c0; *(c + 1) = c1; *(c + 0 + ldc) = c0_nxt; *(c + 1 + ldc) = c1_nxt; } static void ssolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; FLOAT a0, a2, a3, c0, c1; c0 = *(c + 0); c1 = *(c + 1); for (k = 0; k < bk; k++) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; aa += 2; bb += 1; } a -= 4; b -= 2; a0 = *(a + 0); a2 = 
*(a + 2); a3 = *(a + 3); c1 *= a3; c0 -= c1 * a2; c0 *= a0; *(b + 0) = c0; *(b + 1) = c1; *(c + 0) = c0; *(c + 1) = c1; } static void ssolve_1x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; FLOAT a0, c0, c1, c2, c3, c4, c5, c6, c7; c0 = *(c + 0); c1 = *(c + 1 * ldc); c2 = *(c + 2 * ldc); c3 = *(c + 3 * ldc); c4 = *(c + 4 * ldc); c5 = *(c + 5 * ldc); c6 = *(c + 6 * ldc); c7 = *(c + 7 * ldc); for (k = 0; k < bk; k++) { c0 -= aa[0] * bb[0]; c1 -= aa[0] * bb[1]; c2 -= aa[0] * bb[2]; c3 -= aa[0] * bb[3]; c4 -= aa[0] * bb[4]; c5 -= aa[0] * bb[5]; c6 -= aa[0] * bb[6]; c7 -= aa[0] * bb[7]; aa += 1; bb += 8; } a0 = *(a - 1); c0 *= a0; c1 *= a0; c2 *= a0; c3 *= a0; c4 *= a0; c5 *= a0; c6 *= a0; c7 *= a0; *(b - 8) = c0; *(b - 7) = c1; *(b - 6) = c2; *(b - 5) = c3; *(b - 4) = c4; *(b - 3) = c5; *(b - 2) = c6; *(b - 1) = c7; *(c + 0 * ldc) = c0; *(c + 1 * ldc) = c1; *(c + 2 * ldc) = c2; *(c + 3 * ldc) = c3; *(c + 4 * ldc) = c4; *(c + 5 * ldc) = c5; *(c + 6 * ldc) = c6; *(c + 7 * ldc) = c7; } static void ssolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; FLOAT a0, c0, c1, c2, c3; c0 = *(c + 0 * ldc); c1 = *(c + 1 * ldc); c2 = *(c + 2 * ldc); c3 = *(c + 3 * ldc); for (k = 0; k < bk; k++) { c0 -= aa[0] * bb[0]; c1 -= aa[0] * bb[1]; c2 -= aa[0] * bb[2]; c3 -= aa[0] * bb[3]; aa += 1; bb += 4; } a0 = *(a - 1); c0 *= a0; c1 *= a0; c2 *= a0; c3 *= a0; *(b - 4) = c0; *(b - 3) = c1; *(b - 2) = c2; *(b - 1) = c3; *(c + 0 * ldc) = c0; *(c + 1 * ldc) = c1; *(c + 2 * ldc) = c2; *(c + 3 * ldc) = c3; } static void ssolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; FLOAT a0, c0, c1; c0 = *c; c1 = *(c + ldc); for (k = 0; k < bk; k++) { c0 -= aa[0] * bb[0]; c1 -= aa[0] * bb[1]; aa += 1; bb += 2; } a0 = *(a - 1); c0 *= a0; c1 *= a0; *(b - 2) = c0; *(b - 1) = c1; *(c + 0 * ldc) = c0; *(c + 1 * ldc) = c1; } static void ssolve_1x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { BLASLONG k; for (k = 0; k < bk; k++) { *c -= a[k] * b[k]; } *c *= *(a - 1); *(b - 1) = *c; } int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset) { FLOAT *aa, *cc; BLASLONG i, j, kk; for (j = (n >> 3); j--;) { kk = m + offset; if (m & 7) { if (m & 1) { aa = a + (m - 1) * k + kk; cc = c + (m - 1); ssolve_1x8_ln_msa(aa, b + 8 * kk, cc, ldc, (k - kk)); kk -= 1; } if (m & 2) { aa = a + ((m & ~1) - 2) * k + 2 * kk; cc = c + ((m & ~1) - 2); ssolve_2x8_ln_msa(aa, b + 8 * kk, cc, ldc, (k - kk)); kk -= 2; } if (m & 4) { aa = a + ((m & ~3) - 4) * k + 4 * kk; cc = c + ((m & ~3) - 4); ssolve_4x8_ln_msa(aa, b + 8 * kk, cc, ldc, (k - kk)); kk -= 4; } } i = (m >> 3); if (i > 0) { aa = a + ((m & ~7) - 8) * k; cc = c + ((m & ~7) - 8); do { ssolve_8x8_ln_msa(aa + 8 * kk, b + 8 * kk, cc, ldc, (k - kk)); aa -= 8 * k; cc -= 8; kk -= 8; i --; } while (i > 0); } b += 8 * k; c += 8 * ldc; } if (n & 7) { if (n & 4) { kk = m + offset; if (m & 7) { if (m & 1) { aa = a + (m - 1) * k + kk; cc = c + (m - 1); ssolve_1x4_ln_msa(aa, b + 4 * kk, cc, ldc, (k - kk)); kk -= 1; } if (m & 2) { aa = a + ((m & ~1) - 2) * k + 2 * kk; cc = c + ((m & ~1) - 2); ssolve_2x4_ln_msa(aa, b + 4 * kk, cc, ldc, (k - kk)); kk -= 2; } if (m & 4) { aa = a + ((m & ~3) - 4) * k + 4 * kk; cc = c + ((m & ~3) - 4); ssolve_4x4_ln_msa(aa, b + 4 * kk, cc, ldc, (k - kk)); kk -= 4; } } i = (m >> 3); if (i > 0) { aa = a + ((m & ~7) - 8) * k; cc = c + ((m & ~7) - 8); do 
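/* remaining full 8-row panels, processed from the bottom of the matrix
   upward: each pass updates and solves one 8x4 block, then steps aa back by
   one packed panel (8 * k), cc back by 8 rows and kk back by 8 */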
{ ssolve_8x4_ln_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, (k - kk)); aa -= 8 * k; cc -= 8; kk -= 8; i --; } while (i > 0); } b += 4 * k; c += 4 * ldc; } if (n & 2) { kk = m + offset; if (m & 7) { if (m & 1) { aa = a + (m - 1) * k + kk; cc = c + (m - 1); ssolve_1x2_ln_msa(aa, b + 2 * kk, cc, ldc, (k - kk)); kk -= 1; } if (m & 2) { aa = a + ((m & ~1) - 2) * k + 2 * kk; cc = c + ((m & ~1) - 2); ssolve_2x2_ln_msa(aa, b + 2 * kk, cc, ldc, (k - kk)); kk -= 2; } if (m & 4) { aa = a + ((m & ~3) - 4) * k + 4 * kk; cc = c + ((m & ~3) - 4); ssolve_4x2_ln_msa(aa, b + 2 * kk, cc, ldc, (k - kk)); kk -= 4; } } i = (m >> 3); if (i > 0) { aa = a + ((m & ~7) - 8) * k; cc = c + ((m & ~7) - 8); do { ssolve_8x2_ln_msa(aa + 8 * kk, b + 2 * kk, cc, ldc, (k - kk)); aa -= 8 * k; cc -= 8; kk -= 8; i --; } while (i > 0); } b += 2 * k; c += 2 * ldc; } if (n & 1) { kk = m + offset; if (m & 7) { if (m & 1) { aa = a + (m - 1) * k + kk; cc = c + (m - 1); ssolve_1x1_ln_msa(aa, b + kk, cc, (k - kk)); kk -= 1; } if (m & 2) { aa = a + ((m & ~1) - 2) * k + 2 * kk; cc = c + ((m & ~1) - 2); ssolve_2x1_ln_msa(aa, b + kk, cc, (k - kk)); kk -= 2; } if (m & 4) { aa = a + ((m & ~3) - 4) * k + 4 * kk; cc = c + ((m & ~3) - 4); ssolve_4x1_ln_msa(aa, b + kk, cc, (k - kk)); kk -= 4; } } i = (m >> 3); if (i > 0) { aa = a + ((m & ~7) - 8) * k; cc = c + ((m & ~7) - 8); do { ssolve_8x1_ln_msa(aa + 8 * kk, b + kk, cc, (k - kk)); aa -= 8 * k; cc -= 8; kk -= 8; i --; } while (i > 0); } b += k; c += ldc; } } return 0; } OpenBLAS-0.2.20/kernel/mips/strsm_kernel_LT_8x8_msa.c000066400000000000000000001270011313527062700222170ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
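Note: the code below is the single-precision TRSM kernel for the "LT" case
(in OpenBLAS's kernel naming, presumably the left-side, transposed-A variant)
written with MIPS MSA intrinsics and 8x8 register blocking. It mirrors the LN
kernel above, with block solvers from ssolve_8x8_lt_msa down to
ssolve_1x1_lt_msa plus a CNAME driver, but the driver walks the panels
forward (kk starts at offset and grows) instead of backward.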
*******************************************************************************/ #include "common.h" #include "macros_msa.h" static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; v4f32 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15; v4f32 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; v4f32 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18; v4f32 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28; v4f32 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39; v4f32 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63, src_a; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; FLOAT *c_nxt4line = c + 4 * ldc; FLOAT *c_nxt5line = c + 5 * ldc; FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; LD_SP2(c, 4, src_c0, src_c1); LD_SP2(c_nxt1line, 4, src_c2, src_c3); LD_SP2(c_nxt2line, 4, src_c4, src_c5); LD_SP2(c_nxt3line, 4, src_c6, src_c7); LD_SP2(c_nxt4line, 4, src_c8, src_c9); LD_SP2(c_nxt5line, 4, src_c10, src_c11); LD_SP2(c_nxt6line, 4, src_c12, src_c13); LD_SP2(c_nxt7line, 4, src_c14, src_c15); if (bk > 0) { BLASLONG k, pref_offset; FLOAT *pa0_pref; v4f32 src_b0, src_b1, src_b2, src_b3, src_bb0, src_bb1; pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1); if (pref_offset) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } pa0_pref = a + pref_offset; for (k = 0; k < (bk >> 1); k++) { PREF_OFFSET(pa0_pref, 64); PREF_OFFSET(pa0_pref, 96); LD_SP2_INC(a, 4, src_a0, src_a1); LD_SP2_INC(b, 4, src_bb0, src_bb1); SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * src_b3; src_c7 -= src_a1 * src_b3; SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3); src_c8 -= src_a0 * src_b0; src_c9 -= src_a1 * src_b0; src_c10 -= src_a0 * src_b1; src_c11 -= src_a1 * src_b1; src_c12 -= src_a0 * src_b2; src_c13 -= src_a1 * src_b2; src_c14 -= src_a0 * src_b3; src_c15 -= src_a1 * src_b3; LD_SP2_INC(a, 4, src_a0, src_a1); LD_SP2_INC(b, 4, src_bb0, src_bb1); SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * src_b3; src_c7 -= src_a1 * src_b3; SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3); src_c8 -= src_a0 * src_b0; src_c9 -= src_a1 * src_b0; src_c10 -= src_a0 * src_b1; src_c11 -= src_a1 * src_b1; src_c12 -= src_a0 * src_b2; src_c13 -= src_a1 * src_b2; src_c14 -= src_a0 * src_b3; src_c15 -= src_a1 * src_b3; pa0_pref += 16; } if (bk & 1) { LD_SP2_INC(a, 4, src_a0, src_a1); LD_SP2_INC(b, 4, src_bb0, src_bb1); SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * src_b3; src_c7 -= src_a1 * src_b3; SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3); src_c8 -= src_a0 * src_b0; src_c9 -= src_a1 * src_b0; src_c10 -= src_a0 * src_b1; src_c11 -= src_a1 * 
src_b1; src_c12 -= src_a0 * src_b2; src_c13 -= src_a1 * src_b2; src_c14 -= src_a0 * src_b3; src_c15 -= src_a1 * src_b3; } } TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6, res_c0, res_c1, res_c2, res_c3); TRANSPOSE4x4_SP_SP(src_c8, src_c10, src_c12, src_c14, res_c8, res_c9, res_c10, res_c11); TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7, res_c4, res_c5, res_c6, res_c7); TRANSPOSE4x4_SP_SP(src_c9, src_c11, src_c13, src_c15, res_c12, res_c13, res_c14, res_c15); src_a = LD_SP(a + 0); SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3); src_a = LD_SP(a + 4); SPLATI_W4_SP(src_a, src_a4, src_a5, src_a6, src_a7); res_c0 *= src_a0; res_c8 *= src_a0; res_c1 -= res_c0 * src_a1; res_c9 -= res_c8 * src_a1; res_c2 -= res_c0 * src_a2; res_c10 -= res_c8 * src_a2; res_c3 -= res_c0 * src_a3; res_c11 -= res_c8 * src_a3; res_c4 -= res_c0 * src_a4; res_c12 -= res_c8 * src_a4; res_c5 -= res_c0 * src_a5; res_c13 -= res_c8 * src_a5; res_c6 -= res_c0 * src_a6; res_c14 -= res_c8 * src_a6; res_c7 -= res_c0 * src_a7; res_c15 -= res_c8 * src_a7; src_a = LD_SP(a + 9); SPLATI_W4_SP(src_a, src_a9, src_a10, src_a11, src_a12); src_a13 = LD_SP(a + 13); src_a15 = (v4f32) __msa_splati_w((v4i32) src_a13, 2); src_a14 = (v4f32) __msa_splati_w((v4i32) src_a13, 1); src_a13 = (v4f32) __msa_splati_w((v4i32) src_a13, 0); res_c1 *= src_a9; res_c9 *= src_a9; res_c2 -= res_c1 * src_a10; res_c10 -= res_c9 * src_a10; res_c3 -= res_c1 * src_a11; res_c11 -= res_c9 * src_a11; res_c4 -= res_c1 * src_a12; res_c12 -= res_c9 * src_a12; res_c5 -= res_c1 * src_a13; res_c13 -= res_c9 * src_a13; res_c6 -= res_c1 * src_a14; res_c14 -= res_c9 * src_a14; res_c7 -= res_c1 * src_a15; res_c15 -= res_c9 * src_a15; src_a = LD_SP(a + 18); SPLATI_W4_SP(src_a, src_a18, src_a19, src_a20, src_a21); src_a22 = LD_SP(a + 22); src_a23 = (v4f32) __msa_splati_w((v4i32) src_a22, 1); src_a22 = (v4f32) __msa_splati_w((v4i32) src_a22, 0); res_c2 *= src_a18; res_c10 *= src_a18; res_c3 -= res_c2 * src_a19; res_c11 -= res_c10 * src_a19; res_c4 -= res_c2 * src_a20; res_c12 -= res_c10 * src_a20; res_c5 -= res_c2 * src_a21; res_c13 -= res_c10 * src_a21; res_c6 -= res_c2 * src_a22; res_c14 -= res_c10 * src_a22; res_c7 -= res_c2 * src_a23; res_c15 -= res_c10 * src_a23; src_a = LD_SP(a + 27); SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30); src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31)); res_c3 *= src_a27; res_c11 *= src_a27; res_c4 -= res_c3 * src_a28; res_c12 -= res_c11 * src_a28; res_c5 -= res_c3 * src_a29; res_c13 -= res_c11 * src_a29; res_c6 -= res_c3 * src_a30; res_c14 -= res_c11 * src_a30; res_c7 -= res_c3 * src_a31; res_c15 -= res_c11 * src_a31; ST_SP4(res_c0, res_c8, res_c1, res_c9, b, 4); ST_SP4(res_c2, res_c10, res_c3, res_c11, b + 16, 4); TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, src_c0, src_c2, src_c4, src_c6); TRANSPOSE4x4_SP_SP(res_c8, res_c9, res_c10, res_c11, src_c8, src_c10, src_c12, src_c14); ST_SP(src_c0, c); ST_SP(src_c2, c_nxt1line); ST_SP(src_c4, c_nxt2line); ST_SP(src_c6, c_nxt3line); ST_SP(src_c8, c_nxt4line); ST_SP(src_c10, c_nxt5line); ST_SP(src_c12, c_nxt6line); ST_SP(src_c14, c_nxt7line); src_a = LD_SP(a + 36); SPLATI_W4_SP(src_a, src_a36, src_a37, src_a38, src_a39); res_c4 *= src_a36; res_c12 *= src_a36; res_c5 -= res_c4 * src_a37; res_c13 -= res_c12 * src_a37; res_c6 -= res_c4 * src_a38; res_c14 -= res_c12 * src_a38; res_c7 -= res_c4 * src_a39; res_c15 -= res_c12 * src_a39; src_a45 = LD_SP(a + 45); src_a47 = (v4f32) __msa_splati_w((v4i32) src_a45, 2); src_a46 = (v4f32) __msa_splati_w((v4i32) src_a45, 1); src_a45 = (v4f32) 
__msa_splati_w((v4i32) src_a45, 0); res_c5 *= src_a45; res_c13 *= src_a45; res_c6 -= res_c5 * src_a46; res_c14 -= res_c13 * src_a46; res_c7 -= res_c5 * src_a47; res_c15 -= res_c13 * src_a47; src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54)); src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55)); src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63)); res_c6 *= src_a54; res_c14 *= src_a54; res_c7 -= res_c6 * src_a55; res_c15 -= res_c14 * src_a55; res_c7 *= src_a63; res_c15 *= src_a63; ST_SP4(res_c4, res_c12, res_c5, res_c13, b + 32, 4); ST_SP4(res_c6, res_c14, res_c7, res_c15, b + 48, 4); TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, src_c1, src_c3, src_c5, src_c7); TRANSPOSE4x4_SP_SP(res_c12, res_c13, res_c14, res_c15, src_c9, src_c11, src_c13, src_c15); ST_SP(src_c1, c + 4); ST_SP(src_c3, c_nxt1line + 4); ST_SP(src_c5, c_nxt2line + 4); ST_SP(src_c7, c_nxt3line + 4); ST_SP(src_c9, c_nxt4line + 4); ST_SP(src_c11, c_nxt5line + 4); ST_SP(src_c13, c_nxt6line + 4); ST_SP(src_c15, c_nxt7line + 4); } static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; v4f32 src_b, src_b0, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; v4f32 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; v4f32 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18; v4f32 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28; v4f32 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39; v4f32 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63, src_a; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; LD_SP2(c, 4, src_c0, src_c1); LD_SP2(c_nxt1line, 4, src_c2, src_c3); LD_SP2(c_nxt2line, 4, src_c4, src_c5); LD_SP2(c_nxt3line, 4, src_c6, src_c7); for (k = 0; k < bk; k++) { LD_SP2(a, 4, src_a0, src_a1); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * src_b3; src_c7 -= src_a1 * src_b3; a += 8; b += 4; } TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6, res_c0, res_c1, res_c2, res_c3); TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7, res_c4, res_c5, res_c6, res_c7); src_a = LD_SP(a + 0); SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3); src_a = LD_SP(a + 4); SPLATI_W4_SP(src_a, src_a4, src_a5, src_a6, src_a7); res_c0 *= src_a0; res_c1 -= res_c0 * src_a1; res_c2 -= res_c0 * src_a2; res_c3 -= res_c0 * src_a3; res_c4 -= res_c0 * src_a4; res_c5 -= res_c0 * src_a5; res_c6 -= res_c0 * src_a6; res_c7 -= res_c0 * src_a7; src_a = LD_SP(a + 9); SPLATI_W4_SP(src_a, src_a9, src_a10, src_a11, src_a12); src_a13 = LD_SP(a + 13); src_a15 = (v4f32) __msa_splati_w((v4i32) src_a13, 2); src_a14 = (v4f32) __msa_splati_w((v4i32) src_a13, 1); src_a13 = (v4f32) __msa_splati_w((v4i32) src_a13, 0); res_c1 *= src_a9; res_c2 -= res_c1 * src_a10; res_c3 -= res_c1 * src_a11; res_c4 -= res_c1 * src_a12; res_c5 -= res_c1 * src_a13; res_c6 -= res_c1 * src_a14; res_c7 -= res_c1 * src_a15; src_a = LD_SP(a + 18); SPLATI_W4_SP(src_a, src_a18, src_a19, src_a20, src_a21); src_a22 = LD_SP(a + 22); src_a23 = (v4f32) __msa_splati_w((v4i32) src_a22, 1); src_a22 = (v4f32) __msa_splati_w((v4i32) src_a22, 0); res_c2 *= src_a18; res_c3 -= res_c2 * src_a19; res_c4 -= res_c2 * src_a20; res_c5 -= res_c2 * src_a21; res_c6 -= res_c2 * src_a22; res_c7 
-= res_c2 * src_a23; src_a = LD_SP(a + 27); SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30); src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31)); res_c3 *= src_a27; res_c4 -= res_c3 * src_a28; res_c5 -= res_c3 * src_a29; res_c6 -= res_c3 * src_a30; res_c7 -= res_c3 * src_a31; src_a = LD_SP(a + 36); SPLATI_W4_SP(src_a, src_a36, src_a37, src_a38, src_a39); res_c4 *= src_a36; res_c5 -= res_c4 * src_a37; res_c6 -= res_c4 * src_a38; res_c7 -= res_c4 * src_a39; src_a45 = LD_SP(a + 45); src_a47 = (v4f32) __msa_splati_w((v4i32) src_a45, 2); src_a46 = (v4f32) __msa_splati_w((v4i32) src_a45, 1); src_a45 = (v4f32) __msa_splati_w((v4i32) src_a45, 0); res_c5 *= src_a45; res_c6 -= res_c5 * src_a46; res_c7 -= res_c5 * src_a47; src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54)); src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55)); src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63)); res_c6 *= src_a54; res_c7 -= res_c6 * src_a55; res_c7 *= src_a63; ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4); b += 16; ST_SP4(res_c4, res_c5, res_c6, res_c7, b, 4); TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, src_c0, src_c2, src_c4, src_c6); TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, src_c1, src_c3, src_c5, src_c7); ST_SP2(src_c0, src_c1, c, 4); ST_SP2(src_c2, src_c3, c_nxt1line, 4); ST_SP2(src_c4, src_c5, c_nxt2line, 4); ST_SP2(src_c6, src_c7, c_nxt3line, 4); } static void ssolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18; FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39; FLOAT a45, a46, a47, a54, a55, a63; FLOAT c0, c1, c2, c3, c4, c5, c6, c7; FLOAT c0_nxt, c1_nxt, c2_nxt, c3_nxt, c4_nxt, c5_nxt, c6_nxt, c7_nxt; c0 = *(c + 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); c4 = *(c + 4); c5 = *(c + 5); c6 = *(c + 6); c7 = *(c + 7); c0_nxt = *(c + 0 + ldc); c1_nxt = *(c + 1 + ldc); c2_nxt = *(c + 2 + ldc); c3_nxt = *(c + 3 + ldc); c4_nxt = *(c + 4 + ldc); c5_nxt = *(c + 5 + ldc); c6_nxt = *(c + 6 + ldc); c7_nxt = *(c + 7 + ldc); for (k = 0; k < bk; k++) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; c2 -= a[2] * b[0]; c3 -= a[3] * b[0]; c4 -= a[4] * b[0]; c5 -= a[5] * b[0]; c6 -= a[6] * b[0]; c7 -= a[7] * b[0]; c0_nxt -= a[0] * b[1]; c1_nxt -= a[1] * b[1]; c2_nxt -= a[2] * b[1]; c3_nxt -= a[3] * b[1]; c4_nxt -= a[4] * b[1]; c5_nxt -= a[5] * b[1]; c6_nxt -= a[6] * b[1]; c7_nxt -= a[7] * b[1]; a += 8; b += 2; } a0 = *(a + 0); a1 = *(a + 1); a2 = *(a + 2); a3 = *(a + 3); a4 = *(a + 4); a5 = *(a + 5); a6 = *(a + 6); a7 = *(a + 7); a9 = *(a + 9); a10 = *(a + 10); a11 = *(a + 11); a12 = *(a + 12); a13 = *(a + 13); a14 = *(a + 14); a15 = *(a + 15); a18 = *(a + 18); a19 = *(a + 19); a20 = *(a + 20); a21 = *(a + 21); a22 = *(a + 22); a23 = *(a + 23); a27 = *(a + 27); a28 = *(a + 28); a29 = *(a + 29); a30 = *(a + 30); a31 = *(a + 31); a36 = *(a + 36); a37 = *(a + 37); a38 = *(a + 38); a39 = *(a + 39); a45 = *(a + 45); a46 = *(a + 46); a47 = *(a + 47); a54 = *(a + 54); a55 = *(a + 55); a63 = *(a + 63); c0 *= a0; c0_nxt *= a0; c1 -= c0 * a1; c1_nxt -= c0_nxt * a1; c1 *= a9; c1_nxt *= a9; c2 -= c0 * a2; c2_nxt -= c0_nxt * a2; c2 -= c1 * a10; c2_nxt -= c1_nxt * a10; c2 *= a18; c2_nxt *= a18; c3 -= c0 * a3; c3_nxt -= c0_nxt * a3; c3 -= c1 * a11; c3_nxt -= c1_nxt * a11; c3 -= c2 * a19; c3_nxt -= c2_nxt * a19; c3 *= a27; c3_nxt *= a27; c4 -= c0 * a4; c4_nxt -= c0_nxt * a4; c4 -= c1 * a12; c4_nxt -= c1_nxt * a12; c4 -= c2 * a20; c4_nxt -= c2_nxt * a20; c4 -= c3 * a28; c4_nxt -= c3_nxt * a28; c4 *= a36; c4_nxt *= a36; c5 -= c0 * 
a5; c5_nxt -= c0_nxt * a5; c5 -= c1 * a13; c5_nxt -= c1_nxt * a13; c5 -= c2 * a21; c5_nxt -= c2_nxt * a21; c5 -= c3 * a29; c5_nxt -= c3_nxt * a29; c5 -= c4 * a37; c5_nxt -= c4_nxt * a37; c5 *= a45; c5_nxt *= a45; c6 -= c0 * a6; c6_nxt -= c0_nxt * a6; c6 -= c1 * a14; c6_nxt -= c1_nxt * a14; c6 -= c2 * a22; c6_nxt -= c2_nxt * a22; c6 -= c3 * a30; c6_nxt -= c3_nxt * a30; c6 -= c4 * a38; c6_nxt -= c4_nxt * a38; c6 -= c5 * a46; c6_nxt -= c5_nxt * a46; c6 *= a54; c6_nxt *= a54; c7 -= c0 * a7; c7_nxt -= c0_nxt * a7; c7 -= c1 * a15; c7_nxt -= c1_nxt * a15; c7 -= c2 * a23; c7_nxt -= c2_nxt * a23; c7 -= c3 * a31; c7_nxt -= c3_nxt * a31; c7 -= c4 * a39; c7_nxt -= c4_nxt * a39; c7 -= c5 * a47; c7_nxt -= c5_nxt * a47; c7 -= c6 * a55; c7_nxt -= c6_nxt * a55; c7 *= a63; c7_nxt *= a63; *(c + 0) = c0; *(c + 1) = c1; *(c + 2) = c2; *(c + 3) = c3; *(c + 4) = c4; *(c + 5) = c5; *(c + 6) = c6; *(c + 7) = c7; *(c + 0 + ldc) = c0_nxt; *(c + 1 + ldc) = c1_nxt; *(c + 2 + ldc) = c2_nxt; *(c + 3 + ldc) = c3_nxt; *(c + 4 + ldc) = c4_nxt; *(c + 5 + ldc) = c5_nxt; *(c + 6 + ldc) = c6_nxt; *(c + 7 + ldc) = c7_nxt; *(b + 0) = c0; *(b + 1) = c0_nxt; *(b + 2) = c1; *(b + 3) = c1_nxt; *(b + 4) = c2; *(b + 5) = c2_nxt; *(b + 6) = c3; *(b + 7) = c3_nxt; *(b + 8) = c4; *(b + 9) = c4_nxt; *(b + 10) = c5; *(b + 11) = c5_nxt; *(b + 12) = c6; *(b + 13) = c6_nxt; *(b + 14) = c7; *(b + 15) = c7_nxt; } static void ssolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { BLASLONG k; FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18; FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39; FLOAT a45, a46, a47, a54, a55, a63, c0, c1, c2, c3, c4, c5, c6, c7; c0 = *(c + 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); c4 = *(c + 4); c5 = *(c + 5); c6 = *(c + 6); c7 = *(c + 7); for (k = 0; k < bk; k++) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; c2 -= a[2] * b[0]; c3 -= a[3] * b[0]; c4 -= a[4] * b[0]; c5 -= a[5] * b[0]; c6 -= a[6] * b[0]; c7 -= a[7] * b[0]; a += 8; b += 1; } a0 = *(a + 0); a1 = *(a + 1); a2 = *(a + 2); a3 = *(a + 3); a4 = *(a + 4); a5 = *(a + 5); a6 = *(a + 6); a7 = *(a + 7); a9 = *(a + 9); a10 = *(a + 10); a11 = *(a + 11); a12 = *(a + 12); a13 = *(a + 13); a14 = *(a + 14); a15 = *(a + 15); a18 = *(a + 18); a19 = *(a + 19); a20 = *(a + 20); a21 = *(a + 21); a22 = *(a + 22); a23 = *(a + 23); a27 = *(a + 27); a28 = *(a + 28); a29 = *(a + 29); a30 = *(a + 30); a31 = *(a + 31); a36 = *(a + 36); a37 = *(a + 37); a38 = *(a + 38); a39 = *(a + 39); a45 = *(a + 45); a46 = *(a + 46); a47 = *(a + 47); a54 = *(a + 54); a55 = *(a + 55); a63 = *(a + 63); c0 *= a0; c1 -= c0 * a1; c1 *= a9; c2 -= c0 * a2; c2 -= c1 * a10; c2 *= a18; c3 -= c0 * a3; c3 -= c1 * a11; c3 -= c2 * a19; c3 *= a27; c4 -= c0 * a4; c4 -= c1 * a12; c4 -= c2 * a20; c4 -= c3 * a28; c4 *= a36; c5 -= c0 * a5; c5 -= c1 * a13; c5 -= c2 * a21; c5 -= c3 * a29; c5 -= c4 * a37; c5 *= a45; c6 -= c0 * a6; c6 -= c1 * a14; c6 -= c2 * a22; c6 -= c3 * a30; c6 -= c4 * a38; c6 -= c5 * a46; c6 *= a54; c7 -= c0 * a7; c7 -= c1 * a15; c7 -= c2 * a23; c7 -= c3 * a31; c7 -= c4 * a39; c7 -= c5 * a47; c7 -= c6 * a55; c7 *= a63; *(c + 0) = c0; *(c + 1) = c1; *(c + 2) = c2; *(c + 3) = c3; *(c + 4) = c4; *(c + 5) = c5; *(c + 6) = c6; *(c + 7) = c7; *(b + 0) = c0; *(b + 1) = c1; *(b + 2) = c2; *(b + 3) = c3; *(b + 4) = c4; *(b + 5) = c5; *(b + 6) = c6; *(b + 7) = c7; } static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; v4f32 src_b, src_b0, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, 
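/* src_c0..src_c7: one 4-float accumulator per ldc line of the 4x8 C block */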
src_c4, src_c5, src_c6, src_c7; v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; v4f32 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7; v4f32 src_a10, src_a11, src_a15, src_a; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; FLOAT *c_nxt4line = c + 4 * ldc; FLOAT *c_nxt5line = c + 5 * ldc; FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; src_c0 = LD_SP(c); src_c1 = LD_SP(c_nxt1line); src_c2 = LD_SP(c_nxt2line); src_c3 = LD_SP(c_nxt3line); src_c4 = LD_SP(c_nxt4line); src_c5 = LD_SP(c_nxt5line); src_c6 = LD_SP(c_nxt6line); src_c7 = LD_SP(c_nxt7line); for (k = 0; k < (bk >> 1); k++) { src_a0 = LD_SP(a); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a0 * src_b1; src_c2 -= src_a0 * src_b2; src_c3 -= src_a0 * src_b3; src_b = LD_SP(b + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c4 -= src_a0 * src_b0; src_c5 -= src_a0 * src_b1; src_c6 -= src_a0 * src_b2; src_c7 -= src_a0 * src_b3; a += 4; b += 8; src_a0 = LD_SP(a); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a0 * src_b1; src_c2 -= src_a0 * src_b2; src_c3 -= src_a0 * src_b3; src_b = LD_SP(b + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c4 -= src_a0 * src_b0; src_c5 -= src_a0 * src_b1; src_c6 -= src_a0 * src_b2; src_c7 -= src_a0 * src_b3; a += 4; b += 8; } if ((bk & 1) && (bk > 0)) { src_a0 = LD_SP(a); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a0 * src_b1; src_c2 -= src_a0 * src_b2; src_c3 -= src_a0 * src_b3; src_b = LD_SP(b + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c4 -= src_a0 * src_b0; src_c5 -= src_a0 * src_b1; src_c6 -= src_a0 * src_b2; src_c7 -= src_a0 * src_b3; a += 4; b += 8; } TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3); TRANSPOSE4x4_SP_SP(src_c4, src_c5, src_c6, src_c7, res_c4, res_c5, res_c6, res_c7); src_a = LD_SP(a + 0); SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3); src_a5 = LD_SP(a + 5); src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2); src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1); src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10)); src_a11 = COPY_FLOAT_TO_VECTOR(*(a + 11)); src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15)); res_c0 *= src_a0; res_c4 *= src_a0; res_c1 -= res_c0 * src_a1; res_c5 -= res_c4 * src_a1; res_c2 -= res_c0 * src_a2; res_c6 -= res_c4 * src_a2; res_c3 -= res_c0 * src_a3; res_c7 -= res_c4 * src_a3; res_c1 *= src_a5; res_c5 *= src_a5; res_c2 -= res_c1 * src_a6; res_c6 -= res_c5 * src_a6; res_c3 -= res_c1 * src_a7; res_c7 -= res_c5 * src_a7; res_c2 *= src_a10; res_c6 *= src_a10; res_c3 -= res_c2 * src_a11; res_c7 -= res_c6 * src_a11; res_c3 *= src_a15; res_c7 *= src_a15; ST_SP4(res_c0, res_c4, res_c1, res_c5, b, 4); ST_SP4(res_c2, res_c6, res_c3, res_c7, b + 16, 4); TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, src_c0, src_c1, src_c2, src_c3); TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7, src_c4, src_c5, src_c6, src_c7); ST_SP(src_c0, c); ST_SP(src_c1, c_nxt1line); ST_SP(src_c2, c_nxt2line); ST_SP(src_c3, c_nxt3line); ST_SP(src_c4, c_nxt4line); ST_SP(src_c5, c_nxt5line); ST_SP(src_c6, c_nxt6line); ST_SP(src_c7, c_nxt7line); } static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; v4f32 src_b, src_b0, src_b1, src_b2, 
src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3; v4f32 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7; v4f32 src_a10, src_a11, src_a15, src_a; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; src_c0 = LD_SP(c); src_c1 = LD_SP(c_nxt1line); src_c2 = LD_SP(c_nxt2line); src_c3 = LD_SP(c_nxt3line); for (k = 0; k < (bk >> 1); k++) { src_a0 = LD_SP(a); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a0 * src_b1; src_c2 -= src_a0 * src_b2; src_c3 -= src_a0 * src_b3; a += 4; b += 4; src_a0 = LD_SP(a); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a0 * src_b1; src_c2 -= src_a0 * src_b2; src_c3 -= src_a0 * src_b3; a += 4; b += 4; } if ((bk & 1) && (bk > 0)) { src_a0 = LD_SP(a); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a0 * src_b1; src_c2 -= src_a0 * src_b2; src_c3 -= src_a0 * src_b3; a += 4; b += 4; } TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3); src_a = LD_SP(a + 0); SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3); src_a5 = LD_SP(a + 5); src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2); src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1); src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0); src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10)); src_a11 = COPY_FLOAT_TO_VECTOR(*(a + 11)); src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15)); res_c0 *= src_a0; res_c1 -= res_c0 * src_a1; res_c2 -= res_c0 * src_a2; res_c3 -= res_c0 * src_a3; res_c1 *= src_a5; res_c2 -= res_c1 * src_a6; res_c3 -= res_c1 * src_a7; res_c2 *= src_a10; res_c3 -= res_c2 * src_a11; res_c3 *= src_a15; ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4); TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3, src_c0, src_c1, src_c2, src_c3); ST_SP(src_c0, c); ST_SP(src_c1, c_nxt1line); ST_SP(src_c2, c_nxt2line); ST_SP(src_c3, c_nxt3line); } static void ssolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT c0, c1, c2, c3, c0_nxt, c1_nxt, c2_nxt, c3_nxt; FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15; c0 = *(c + 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); c0_nxt = *(c + 0 + ldc); c1_nxt = *(c + 1 + ldc); c2_nxt = *(c + 2 + ldc); c3_nxt = *(c + 3 + ldc); for (k = 0; k < bk; k++) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; c2 -= a[2] * b[0]; c3 -= a[3] * b[0]; c0_nxt -= a[0] * b[1]; c1_nxt -= a[1] * b[1]; c2_nxt -= a[2] * b[1]; c3_nxt -= a[3] * b[1]; a += 4; b += 2; } a0 = *(a + 0); a1 = *(a + 1); a2 = *(a + 2); a3 = *(a + 3); a5 = *(a + 5); a6 = *(a + 6); a7 = *(a + 7); a10 = *(a + 10); a11 = *(a + 11); a15 = *(a + 15); c0 *= a0; c0_nxt *= a0; c1 -= c0 * a1; c1_nxt -= c0_nxt * a1; c1 *= a5; c1_nxt *= a5; c2 -= c0 * a2; c2_nxt -= c0_nxt * a2; c2 -= c1 * a6; c2_nxt -= c1_nxt * a6; c2 *= a10; c2_nxt *= a10; c3 -= c0 * a3; c3_nxt -= c0_nxt * a3; c3 -= c1 * a7; c3_nxt -= c1_nxt * a7; c3 -= c2 * a11; c3_nxt -= c2_nxt * a11; c3 *= a15; c3_nxt *= a15; *(b + 0) = c0; *(b + 1) = c0_nxt; *(b + 2) = c1; *(b + 3) = c1_nxt; *(b + 4) = c2; *(b + 5) = c2_nxt; *(b + 6) = c3; *(b + 7) = c3_nxt; *(c + 0) = c0; *(c + 1) = c1; *(c + 2) = c2; *(c + 3) = c3; *(c + 0 + ldc) = c0_nxt; *(c + 1 + ldc) = c1_nxt; *(c + 2 + ldc) = c2_nxt; *(c + 3 + ldc) = c3_nxt; } static void ssolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { BLASLONG k; FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15, c0, c1, c2, c3; c0 = *(c 
+ 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); for (k = 0; k < bk; k++) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; c2 -= a[2] * b[0]; c3 -= a[3] * b[0]; a += 4; b += 1; } a0 = *(a + 0); a1 = *(a + 1); a2 = *(a + 2); a3 = *(a + 3); a5 = *(a + 5); a6 = *(a + 6); a7 = *(a + 7); a10 = *(a + 10); a11 = *(a + 11); a15 = *(a + 15); c0 *= a0; c1 -= c0 * a1; c1 *= a5; c2 -= c0 * a2; c2 -= c1 * a6; c2 *= a10; c3 -= c0 * a3; c3 -= c1 * a7; c3 -= c2 * a11; c3 *= a15; *(b + 0) = c0; *(b + 1) = c1; *(b + 2) = c2; *(b + 3) = c3; *(c + 0) = c0; *(c + 1) = c1; *(c + 2) = c2; *(c + 3) = c3; } static void ssolve_2x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2; FLOAT c0_nxt3, c1_nxt3, c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5; FLOAT c0_nxt6, c1_nxt6, c0_nxt7, c1_nxt7; c0 = *(c + 0); c1 = *(c + 1); c0_nxt1 = *(c + ldc); c1_nxt1 = *(c + 1 + ldc); c0_nxt2 = *(c + 2 * ldc); c1_nxt2 = *(c + 1 + 2 * ldc); c0_nxt3 = *(c + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); c0_nxt4 = *(c + 4 * ldc); c1_nxt4 = *(c + 1 + 4 * ldc); c0_nxt5 = *(c + 5 * ldc); c1_nxt5 = *(c + 1 + 5 * ldc); c0_nxt6 = *(c + 6 * ldc); c1_nxt6 = *(c + 1 + 6 * ldc); c0_nxt7 = *(c + 7 * ldc); c1_nxt7 = *(c + 1 + 7 * ldc); for (k = 0; k < bk; k++) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; c0_nxt1 -= a[0] * b[1]; c1_nxt1 -= a[1] * b[1]; c0_nxt2 -= a[0] * b[2]; c1_nxt2 -= a[1] * b[2]; c0_nxt3 -= a[0] * b[3]; c1_nxt3 -= a[1] * b[3]; c0_nxt4 -= a[0] * b[4]; c1_nxt4 -= a[1] * b[4]; c0_nxt5 -= a[0] * b[5]; c1_nxt5 -= a[1] * b[5]; c0_nxt6 -= a[0] * b[6]; c1_nxt6 -= a[1] * b[6]; c0_nxt7 -= a[0] * b[7]; c1_nxt7 -= a[1] * b[7]; a += 2; b += 8; } a0 = *a; a1 = *(a + 1); a3 = *(a + 3); c0 = c0 * a0; c1 = (c1 - c0 * a1) * a3; c0_nxt1 = c0_nxt1 * a0; c1_nxt1 = (c1_nxt1 - c0_nxt1 * a1) * a3; c0_nxt2 = c0_nxt2 * a0; c1_nxt2 = (c1_nxt2 - c0_nxt2 * a1) * a3; c0_nxt3 = c0_nxt3 * a0; c1_nxt3 = (c1_nxt3 - c0_nxt3 * a1) * a3; c0_nxt4 = c0_nxt4 * a0; c1_nxt4 = (c1_nxt4 - c0_nxt4 * a1) * a3; c0_nxt5 = c0_nxt5 * a0; c1_nxt5 = (c1_nxt5 - c0_nxt5 * a1) * a3; c0_nxt6 = c0_nxt6 * a0; c1_nxt6 = (c1_nxt6 - c0_nxt6 * a1) * a3; c0_nxt7 = c0_nxt7 * a0; c1_nxt7 = (c1_nxt7 - c0_nxt7 * a1) * a3; *(b + 0) = c0; *(b + 1) = c0_nxt1; *(b + 2) = c0_nxt2; *(b + 3) = c0_nxt3; *(b + 4) = c0_nxt4; *(b + 5) = c0_nxt5; *(b + 6) = c0_nxt6; *(b + 7) = c0_nxt7; *(b + 8) = c1; *(b + 9) = c1_nxt1; *(b + 10) = c1_nxt2; *(b + 11) = c1_nxt3; *(b + 12) = c1_nxt4; *(b + 13) = c1_nxt5; *(b + 14) = c1_nxt6; *(b + 15) = c1_nxt7; *(c + 0) = c0; *(c + 1) = c1; *(c + 0 + ldc) = c0_nxt1; *(c + 1 + ldc) = c1_nxt1; *(c + 0 + 2 * ldc) = c0_nxt2; *(c + 1 + 2 * ldc) = c1_nxt2; *(c + 0 + 3 * ldc) = c0_nxt3; *(c + 1 + 3 * ldc) = c1_nxt3; *(c + 0 + 4 * ldc) = c0_nxt4; *(c + 1 + 4 * ldc) = c1_nxt4; *(c + 0 + 5 * ldc) = c0_nxt5; *(c + 1 + 5 * ldc) = c1_nxt5; *(c + 0 + 6 * ldc) = c0_nxt6; *(c + 1 + 6 * ldc) = c1_nxt6; *(c + 0 + 7 * ldc) = c0_nxt7; *(c + 1 + 7 * ldc) = c1_nxt7; } static void ssolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1; FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; c0 = *(c + 0); c1 = *(c + 1); c0_nxt1 = *(c + ldc); c1_nxt1 = *(c + 1 + ldc); c0_nxt2 = *(c + 2 * ldc); c1_nxt2 = *(c + 1 + 2 * ldc); c0_nxt3 = *(c + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); for (k = 0; k < bk; k++) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; c0_nxt1 -= a[0] * b[1]; c1_nxt1 -= a[1] * b[1]; c0_nxt2 -= a[0] * b[2]; c1_nxt2 -= a[1] * b[2]; c0_nxt3 -= a[0] * b[3]; c1_nxt3 -= a[1] * 
b[3]; a += 2; b += 4; } a0 = *a; a1 = *(a + 1); a3 = *(a + 3); c0 *= a0; c0_nxt1 *= a0; c0_nxt2 *= a0; c0_nxt3 *= a0; c1 -= c0 * a1; c1_nxt1 -= c0_nxt1 * a1; c1_nxt2 -= c0_nxt2 * a1; c1_nxt3 -= c0_nxt3 * a1; c1 *= a3; c1_nxt1 *= a3; c1_nxt2 *= a3; c1_nxt3 *= a3; *(b + 0) = c0; *(b + 1) = c0_nxt1; *(b + 2) = c0_nxt2; *(b + 3) = c0_nxt3; *(b + 4) = c1; *(b + 5) = c1_nxt1; *(b + 6) = c1_nxt2; *(b + 7) = c1_nxt3; *(c + 0) = c0; *(c + 1) = c1; *(c + 0 + ldc) = c0_nxt1; *(c + 1 + ldc) = c1_nxt1; *(c + 0 + 2 * ldc) = c0_nxt2; *(c + 1 + 2 * ldc) = c1_nxt2; *(c + 0 + 3 * ldc) = c0_nxt3; *(c + 1 + 3 * ldc) = c1_nxt3; } static void ssolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT a0, a1, a3, c0, c1, c0_nxt, c1_nxt; c0 = *(c + 0); c1 = *(c + 1); c0_nxt = *(c + ldc); c1_nxt = *(c + 1 + ldc); for (k = 0; k < bk; k++) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; c0_nxt -= a[0] * b[1]; c1_nxt -= a[1] * b[1]; a += 2; b += 2; } a0 = *a; a1 = *(a + 1); a3 = *(a + 3); c0 *= a0; c0_nxt *= a0; c1 -= c0 * a1; c1_nxt -= c0_nxt * a1; c1 *= a3; c1_nxt *= a3; *(b + 0) = c0; *(b + 1) = c0_nxt; *(b + 2) = c1; *(b + 3) = c1_nxt; *(c + 0) = c0; *(c + 1) = c1; *(c + 0 + ldc) = c0_nxt; *(c + 1 + ldc) = c1_nxt; } static void ssolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { BLASLONG k; FLOAT c0, c1; c0 = *(c + 0); c1 = *(c + 1); for (k = 0; k < bk; k++) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; a += 2; b += 1; } c0 *= *(a + 0); c1 -= c0 * *(a + 1); c1 *= *(a + 3); *(b + 0) = c0; *(b + 1) = c1; *(c + 0) = c0; *(c + 1) = c1; } static void ssolve_1x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT c0, c1, c2, c3, c4, c5, c6, c7; c0 = *(c + 0); c1 = *(c + 1 * ldc); c2 = *(c + 2 * ldc); c3 = *(c + 3 * ldc); c4 = *(c + 4 * ldc); c5 = *(c + 5 * ldc); c6 = *(c + 6 * ldc); c7 = *(c + 7 * ldc); for (k = 0; k < bk; k++) { c0 -= a[0] * b[0]; c1 -= a[0] * b[1]; c2 -= a[0] * b[2]; c3 -= a[0] * b[3]; c4 -= a[0] * b[4]; c5 -= a[0] * b[5]; c6 -= a[0] * b[6]; c7 -= a[0] * b[7]; a += 1; b += 8; } c0 *= *a; c1 *= *a; c2 *= *a; c3 *= *a; c4 *= *a; c5 *= *a; c6 *= *a; c7 *= *a; *(b + 0) = c0; *(b + 1) = c1; *(b + 2) = c2; *(b + 3) = c3; *(b + 4) = c4; *(b + 5) = c5; *(b + 6) = c6; *(b + 7) = c7; *(c + 0) = c0; *(c + 1 * ldc) = c1; *(c + 2 * ldc) = c2; *(c + 3 * ldc) = c3; *(c + 4 * ldc) = c4; *(c + 5 * ldc) = c5; *(c + 6 * ldc) = c6; *(c + 7 * ldc) = c7; } static void ssolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT c0, c1, c2, c3; c0 = *(c + 0 * ldc); c1 = *(c + 1 * ldc); c2 = *(c + 2 * ldc); c3 = *(c + 3 * ldc); for (k = 0; k < bk; k++) { c0 -= a[0] * b[0]; c1 -= a[0] * b[1]; c2 -= a[0] * b[2]; c3 -= a[0] * b[3]; a += 1; b += 4; } c0 *= *a; c1 *= *a; c2 *= *a; c3 *= *a; *c = c0; *(c + ldc) = c1; *(c + 2 * ldc) = c2; *(c + 3 * ldc) = c3; *b = *c; *(b + 1) = *(c + ldc); *(b + 2) = *(c + 2 * ldc); *(b + 3) = *(c + 3 * ldc); } static void ssolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT c0, c1; c0 = *c; c1 = *(c + ldc); for (k = 0; k < bk; k++) { c0 -= a[0] * b[0]; c1 -= a[0] * b[1]; a += 1; b += 2; } *c = c0 * *a; *(c + ldc) = c1 * *a; *b = *c; *(b + 1) = *(c + ldc); } static void ssolve_1x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { BLASLONG k; for (k = 0; k < bk; k++) { *c -= a[0] * b[0]; a++; b++; } *c *= *a; *b = *c; } int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset) { FLOAT *aa, *cc; 
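/* Driver for the LT kernel: tiles the m x n problem into 8x8 blocks with
   4/2/1-wide edge cases, handling eight columns of B per outer iteration.
   kk starts at 'offset', is passed to every block solver as its bk count,
   and grows by the block height (8, 4, 2 or 1) after each call, so each
   solver's update loop covers only the part of the panel handled so far. */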
BLASLONG i, j, kk; for (j = (n >> 3); j--;) { kk = offset; aa = a; cc = c; for (i = (m >> 3); i--;) { ssolve_8x8_lt_msa(aa, b, cc, ldc, kk); aa += 8 * k; cc += 8; kk += 8; } if (m & 7) { if (m & 4) { ssolve_4x8_lt_msa(aa, b, cc, ldc, kk); aa += 4 * k; cc += 4; kk += 4; } if (m & 2) { ssolve_2x8_lt_msa(aa, b, cc, ldc, kk); aa += 2 * k; cc += 2; kk += 2; } if (m & 1) { ssolve_1x8_lt_msa(aa, b, cc, ldc, kk); aa += k; cc += 1; kk += 1; } } b += 8 * k; c += 8 * ldc; } if (n & 7) { if (n & 4) { kk = offset; aa = a; cc = c; for (i = (m >> 3); i--;) { ssolve_8x4_lt_msa(aa, b, cc, ldc, kk); aa += 8 * k; cc += 8; kk += 8; } if (m & 7) { if (m & 4) { ssolve_4x4_lt_msa(aa, b, cc, ldc, kk); aa += 4 * k; cc += 4; kk += 4; } if (m & 2) { ssolve_2x4_lt_msa(aa, b, cc, ldc, kk); aa += 2 * k; cc += 2; kk += 2; } if (m & 1) { ssolve_1x4_lt_msa(aa, b, cc, ldc, kk); aa += k; cc += 1; kk += 1; } } b += 4 * k; c += 4 * ldc; } if (n & 2) { kk = offset; aa = a; cc = c; for (i = (m >> 3); i--;) { ssolve_8x2_lt_msa(aa, b, cc, ldc, kk); aa += 8 * k; cc += 8; kk += 8; } if (m & 7) { if (m & 4) { ssolve_4x2_lt_msa(aa, b, cc, ldc, kk); aa += 4 * k; cc += 4; kk += 4; } if (m & 2) { ssolve_2x2_lt_msa(aa, b, cc, ldc, kk); aa += 2 * k; cc += 2; kk += 2; } if (m & 1) { ssolve_1x2_lt_msa(aa, b, cc, ldc, kk); aa += k; cc += 1; kk += 1; } } b += 2 * k; c += 2 * ldc; } if (n & 1) { kk = offset; aa = a; cc = c; for (i = (m >> 3); i--;) { ssolve_8x1_lt_msa(aa, b, cc, kk); aa += 8 * k; cc += 8; kk += 8; } if (m & 7) { if (m & 4) { ssolve_4x1_lt_msa(aa, b, cc, kk); aa += 4 * k; cc += 4; kk += 4; } if (m & 2) { ssolve_2x1_lt_msa(aa, b, cc, kk); aa += 2 * k; cc += 2; kk += 2; } if (m & 1) { ssolve_1x1_lt_msa(aa, b, cc, kk); aa += k; cc += 1; kk += 1; } } b += k; c += ldc; } } return 0; } OpenBLAS-0.2.20/kernel/mips/strsm_kernel_RN_8x8_msa.c000066400000000000000000001250351313527062700222240ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
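Note: the code below is the "RN" variant of the same single-precision MSA
TRSM kernel (presumably the right-side, non-transposed case in OpenBLAS's
kernel naming). Relative to the LT kernel the roles of the packed panels are
swapped: the triangular solve is driven by the 8x8 diagonal block of the
packed B panel (entries b0, b9, b18, ... applied as multiplies, which
suggests pre-inverted diagonals), and the solved rows are written back into
the packed A buffer as well as into C.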
*******************************************************************************/ #include "common.h" #include "macros_msa.h" static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7; v4f32 src_b9, src_b10, src_b11, src_b12, src_b13, src_b14, src_b15, src_b18; v4f32 src_b19, src_b20, src_b21, src_b22, src_b23, src_b27, src_b28; v4f32 src_b29, src_b30, src_b31, src_b36, src_b37, src_b38, src_b39; v4f32 src_b45, src_b46, src_b47, src_b54, src_b55, src_b63, src_b; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; FLOAT *c_nxt4line = c + 4 * ldc; FLOAT *c_nxt5line = c + 5 * ldc; FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; LD_SP2(c, 4, src_c0, src_c1); LD_SP2(c_nxt1line, 4, src_c2, src_c3); LD_SP2(c_nxt2line, 4, src_c4, src_c5); LD_SP2(c_nxt3line, 4, src_c6, src_c7); LD_SP2(c_nxt4line, 4, src_c8, src_c9); LD_SP2(c_nxt5line, 4, src_c10, src_c11); LD_SP2(c_nxt6line, 4, src_c12, src_c13); LD_SP2(c_nxt7line, 4, src_c14, src_c15); if (bk > 0) { BLASLONG k, pref_offset; FLOAT *pa0_pref; v4f32 src_a0, src_a1, src_bb0, src_bb1; pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1); if (pref_offset) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } pa0_pref = a + pref_offset; for (k = 0; k < (bk >> 1); k++) { PREF_OFFSET(pa0_pref, 64); PREF_OFFSET(pa0_pref, 96); LD_SP2_INC(a, 4, src_a0, src_a1); LD_SP2_INC(b, 4, src_bb0, src_bb1); SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * src_b3; src_c7 -= src_a1 * src_b3; SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3); src_c8 -= src_a0 * src_b0; src_c9 -= src_a1 * src_b0; src_c10 -= src_a0 * src_b1; src_c11 -= src_a1 * src_b1; src_c12 -= src_a0 * src_b2; src_c13 -= src_a1 * src_b2; src_c14 -= src_a0 * src_b3; src_c15 -= src_a1 * src_b3; LD_SP2_INC(a, 4, src_a0, src_a1); LD_SP2_INC(b, 4, src_bb0, src_bb1); SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * src_b3; src_c7 -= src_a1 * src_b3; SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3); src_c8 -= src_a0 * src_b0; src_c9 -= src_a1 * src_b0; src_c10 -= src_a0 * src_b1; src_c11 -= src_a1 * src_b1; src_c12 -= src_a0 * src_b2; src_c13 -= src_a1 * src_b2; src_c14 -= src_a0 * src_b3; src_c15 -= src_a1 * src_b3; pa0_pref += 16; } if (bk & 1) { LD_SP2_INC(a, 4, src_a0, src_a1); LD_SP2_INC(b, 4, src_bb0, src_bb1); SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * src_b3; src_c7 -= src_a1 * src_b3; SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3); src_c8 -= src_a0 * src_b0; src_c9 -= src_a1 * src_b0; src_c10 -= src_a0 * src_b1; src_c11 -= src_a1 * src_b1; src_c12 -= src_a0 * src_b2; src_c13 -= src_a1 * src_b2; src_c14 -= src_a0 * src_b3; src_c15 -= src_a1 * src_b3; } } src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, 
src_b0, src_b1, src_b2, src_b3); src_b = LD_SP(b + 4); SPLATI_W4_SP(src_b, src_b4, src_b5, src_b6, src_b7); src_b = LD_SP(b + 9); SPLATI_W4_SP(src_b, src_b9, src_b10, src_b11, src_b12); src_b13 = LD_SP(b + 13); src_b15 = (v4f32) __msa_splati_w((v4i32) src_b13, 2); src_b14 = (v4f32) __msa_splati_w((v4i32) src_b13, 1); src_b13 = (v4f32) __msa_splati_w((v4i32) src_b13, 0); src_c0 *= src_b0; src_c1 *= src_b0; src_c2 -= src_c0 * src_b1; src_c3 -= src_c1 * src_b1; src_c4 -= src_c0 * src_b2; src_c5 -= src_c1 * src_b2; src_c6 -= src_c0 * src_b3; src_c7 -= src_c1 * src_b3; src_c8 -= src_c0 * src_b4; src_c9 -= src_c1 * src_b4; src_c10 -= src_c0 * src_b5; src_c11 -= src_c1 * src_b5; src_c12 -= src_c0 * src_b6; src_c13 -= src_c1 * src_b6; src_c14 -= src_c0 * src_b7; src_c15 -= src_c1 * src_b7; ST_SP2(src_c0, src_c1, a, 4); ST_SP2(src_c0, src_c1, c, 4); src_c2 *= src_b9; src_c3 *= src_b9; src_c4 -= src_c2 * src_b10; src_c5 -= src_c3 * src_b10; src_c6 -= src_c2 * src_b11; src_c7 -= src_c3 * src_b11; src_c8 -= src_c2 * src_b12; src_c9 -= src_c3 * src_b12; src_c10 -= src_c2 * src_b13; src_c11 -= src_c3 * src_b13; src_c12 -= src_c2 * src_b14; src_c13 -= src_c3 * src_b14; src_c14 -= src_c2 * src_b15; src_c15 -= src_c3 * src_b15; ST_SP2(src_c2, src_c3, a + 8, 4); ST_SP2(src_c2, src_c3, c_nxt1line, 4); src_b = LD_SP(b + 18); SPLATI_W4_SP(src_b, src_b18, src_b19, src_b20, src_b21); src_b22 = LD_SP(b + 22); src_b23 = (v4f32) __msa_splati_w((v4i32) src_b22, 1); src_b22 = (v4f32) __msa_splati_w((v4i32) src_b22, 0); src_b = LD_SP(b + 27); SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30); src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31)); src_c4 *= src_b18; src_c5 *= src_b18; src_c6 -= src_c4 * src_b19; src_c7 -= src_c5 * src_b19; src_c8 -= src_c4 * src_b20; src_c9 -= src_c5 * src_b20; src_c10 -= src_c4 * src_b21; src_c11 -= src_c5 * src_b21; src_c12 -= src_c4 * src_b22; src_c13 -= src_c5 * src_b22; src_c14 -= src_c4 * src_b23; src_c15 -= src_c5 * src_b23; ST_SP2(src_c4, src_c5, a + 16, 4); ST_SP2(src_c4, src_c5, c_nxt2line, 4); src_c6 *= src_b27; src_c7 *= src_b27; src_c8 -= src_c6 * src_b28; src_c9 -= src_c7 * src_b28; src_c10 -= src_c6 * src_b29; src_c11 -= src_c7 * src_b29; src_c12 -= src_c6 * src_b30; src_c13 -= src_c7 * src_b30; src_c14 -= src_c6 * src_b31; src_c15 -= src_c7 * src_b31; ST_SP2(src_c6, src_c7, a + 24, 4); ST_SP2(src_c6, src_c7, c_nxt3line, 4); src_b = LD_SP(b + 36); SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39); src_b45 = LD_SP(b + 45); src_b47 = (v4f32) __msa_splati_w((v4i32) src_b45, 2); src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1); src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0); src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54)); src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55)); src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63)); src_c8 *= src_b36; src_c9 *= src_b36; src_c10 -= src_c8 * src_b37; src_c11 -= src_c9 * src_b37; src_c12 -= src_c8 * src_b38; src_c13 -= src_c9 * src_b38; src_c14 -= src_c8 * src_b39; src_c15 -= src_c9 * src_b39; ST_SP2(src_c8, src_c9, a + 32, 4); ST_SP2(src_c8, src_c9, c_nxt4line, 4); src_c10 *= src_b45; src_c11 *= src_b45; src_c12 -= src_c10 * src_b46; src_c13 -= src_c11 * src_b46; src_c14 -= src_c10 * src_b47; src_c15 -= src_c11 * src_b47; ST_SP2(src_c10, src_c11, a + 40, 4); ST_SP2(src_c10, src_c11, c_nxt5line, 4); src_c12 *= src_b54; src_c13 *= src_b54; src_c14 -= src_c12 * src_b55; src_c15 -= src_c13 * src_b55; ST_SP2(src_c12, src_c13, a + 48, 4); ST_SP2(src_c12, src_c13, c_nxt6line, 4); src_c14 *= src_b63; src_c15 *= src_b63; ST_SP2(src_c14, src_c15, a + 56, 4); 
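/* last row of the 8x8 solve: src_c14/src_c15 have been scaled by the final
   diagonal entry b63 (applied as a multiply, so presumably pre-inverted),
   written back to the packed A buffer just above, and are stored to the
   eighth ldc line of C below */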
ST_SP2(src_c14, src_c15, c_nxt7line, 4); } static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7; v4f32 src_b10, src_b11, src_b15, src_b, src_a0, src_a1; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; LD_SP2(c, 4, src_c0, src_c1); LD_SP2(c_nxt1line, 4, src_c2, src_c3); LD_SP2(c_nxt2line, 4, src_c4, src_c5); LD_SP2(c_nxt3line, 4, src_c6, src_c7); for (k = 0; k < (bk >> 1); k++) { LD_SP2(a, 4, src_a0, src_a1); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * src_b3; src_c7 -= src_a1 * src_b3; a += 8; b += 4; LD_SP2(a, 4, src_a0, src_a1); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * src_b3; src_c7 -= src_a1 * src_b3; a += 8; b += 4; } if ((bk & 1) && (bk > 0)) { LD_SP2(a, 4, src_a0, src_a1); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * src_b3; src_c7 -= src_a1 * src_b3; a += 8; b += 4; } src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_b5 = LD_SP(b + 5); src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2); src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1); src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10)); src_b11 = COPY_FLOAT_TO_VECTOR(*(b + 11)); src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 *= src_b0; src_c2 -= src_c0 * src_b1; src_c3 -= src_c1 * src_b1; src_c4 -= src_c0 * src_b2; src_c5 -= src_c1 * src_b2; src_c6 -= src_c0 * src_b3; src_c7 -= src_c1 * src_b3; src_c2 *= src_b5; src_c3 *= src_b5; src_c4 -= src_c2 * src_b6; src_c5 -= src_c3 * src_b6; src_c6 -= src_c2 * src_b7; src_c7 -= src_c3 * src_b7; src_c4 *= src_b10; src_c5 *= src_b10; src_c6 -= src_c4 * src_b11; src_c7 -= src_c5 * src_b11; src_c6 *= src_b15; src_c7 *= src_b15; ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4); ST_SP2(src_c0, src_c1, c, 4); ST_SP2(src_c2, src_c3, c_nxt1line, 4); ST_SP2(src_c4, src_c5, c_nxt2line, 4); ST_SP2(src_c6, src_c7, c_nxt3line, 4); } static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; v4f32 src_a0, src_a1; v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b3; FLOAT *c_nxt1line = c + ldc; LD_SP2(c, 4, src_c0, src_c1); LD_SP2(c_nxt1line, 4, src_c2, src_c3); for (k = 0; k < (bk >> 1); k++) { LD_SP2(a, 4, src_a0, src_a1); src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; a += 8; b += 2; LD_SP2(a, 4, src_a0, src_a1); src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; a += 8; b += 2; } if ((bk & 
1) && (bk > 0)) { LD_SP2(a, 4, src_a0, src_a1); src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; a += 8; b += 2; } src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 *= src_b0; src_c2 -= src_c0 * src_b1; src_c3 -= src_c1 * src_b1; src_c2 *= src_b3; src_c3 *= src_b3; ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); ST_SP2(src_c0, src_c1, c, 4); ST_SP2(src_c2, src_c3, c_nxt1line, 4); } static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; v4f32 src_a0, src_a1, src_c0, src_c1, src_b0; LD_SP2(c, 4, src_c0, src_c1); for (k = 0; k < (bk >> 2); k++) { LD_SP2(a, 4, src_a0, src_a1); src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; a += 8; b += 1; LD_SP2(a, 4, src_a0, src_a1); src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; a += 8; b += 1; LD_SP2(a, 4, src_a0, src_a1); src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; a += 8; b += 1; LD_SP2(a, 4, src_a0, src_a1); src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; a += 8; b += 1; } if ((bk & 3) && (bk > 0)) { if (bk & 2) { LD_SP2(a, 4, src_a0, src_a1); src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; a += 8; b += 1; LD_SP2(a, 4, src_a0, src_a1); src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; a += 8; b += 1; } if (bk & 1) { LD_SP2(a, 4, src_a0, src_a1); src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; a += 8; b += 1; } } src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 *= src_b0; src_c1 *= src_b0; ST_SP2(src_c0, src_c1, a, 4); ST_SP2(src_c0, src_c1, c, 4); } static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7; v4f32 src_b9, src_b10, src_b11, src_b12, src_b13, src_b14, src_b15, src_b18; v4f32 src_b19, src_b20, src_b21, src_b22, src_b23, src_b27, src_b28; v4f32 src_b29, src_b30, src_b31, src_b36, src_b37, src_b38, src_b39; v4f32 src_b45, src_b46, src_b47, src_b54, src_b55, src_b63, src_b, src_a0; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; FLOAT *c_nxt4line = c + 4 * ldc; FLOAT *c_nxt5line = c + 5 * ldc; FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; src_c0 = LD_SP(c); src_c1 = LD_SP(c_nxt1line); src_c2 = LD_SP(c_nxt2line); src_c3 = LD_SP(c_nxt3line); src_c4 = LD_SP(c_nxt4line); src_c5 = LD_SP(c_nxt5line); src_c6 = LD_SP(c_nxt6line); src_c7 = LD_SP(c_nxt7line); for (k = 0; k < bk; k++) { src_a0 = LD_SP(a); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a0 * src_b1; src_c2 -= src_a0 * src_b2; src_c3 -= src_a0 * src_b3; src_b = LD_SP(b + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c4 -= src_a0 * src_b0; src_c5 -= src_a0 * src_b1; src_c6 -= src_a0 * src_b2; src_c7 -= src_a0 * src_b3; a += 4; b += 8; } src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_b = LD_SP(b + 4); SPLATI_W4_SP(src_b, src_b4, 
src_b5, src_b6, src_b7); src_b = LD_SP(b + 9); SPLATI_W4_SP(src_b, src_b9, src_b10, src_b11, src_b12); src_b13 = LD_SP(b + 13); src_b15 = (v4f32) __msa_splati_w((v4i32) src_b13, 2); src_b14 = (v4f32) __msa_splati_w((v4i32) src_b13, 1); src_b13 = (v4f32) __msa_splati_w((v4i32) src_b13, 0); src_b = LD_SP(b + 18); SPLATI_W4_SP(src_b, src_b18, src_b19, src_b20, src_b21); src_b22 = LD_SP(b + 22); src_b23 = (v4f32) __msa_splati_w((v4i32) src_b22, 1); src_b22 = (v4f32) __msa_splati_w((v4i32) src_b22, 0); src_b = LD_SP(b + 27); SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30); src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31)); src_b = LD_SP(b + 36); SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39); src_b45 = LD_SP(b + 45); src_b47 = (v4f32) __msa_splati_w((v4i32) src_b45, 2); src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1); src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0); src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54)); src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55)); src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63)); src_c0 *= src_b0; src_c1 -= src_c0 * src_b1; src_c2 -= src_c0 * src_b2; src_c3 -= src_c0 * src_b3; src_c4 -= src_c0 * src_b4; src_c5 -= src_c0 * src_b5; src_c6 -= src_c0 * src_b6; src_c7 -= src_c0 * src_b7; src_c1 *= src_b9; src_c2 -= src_c1 * src_b10; src_c3 -= src_c1 * src_b11; src_c4 -= src_c1 * src_b12; src_c5 -= src_c1 * src_b13; src_c6 -= src_c1 * src_b14; src_c7 -= src_c1 * src_b15; src_c2 *= src_b18; src_c3 -= src_c2 * src_b19; src_c4 -= src_c2 * src_b20; src_c5 -= src_c2 * src_b21; src_c6 -= src_c2 * src_b22; src_c7 -= src_c2 * src_b23; src_c3 *= src_b27; src_c4 -= src_c3 * src_b28; src_c5 -= src_c3 * src_b29; src_c6 -= src_c3 * src_b30; src_c7 -= src_c3 * src_b31; src_c4 *= src_b36; src_c5 -= src_c4 * src_b37; src_c6 -= src_c4 * src_b38; src_c7 -= src_c4 * src_b39; src_c5 *= src_b45; src_c6 -= src_c5 * src_b46; src_c7 -= src_c5 * src_b47; src_c6 *= src_b54; src_c7 -= src_c6 * src_b55; src_c7 *= src_b63; ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4); ST_SP(src_c0, c); ST_SP(src_c1, c_nxt1line); ST_SP(src_c2, c_nxt2line); ST_SP(src_c3, c_nxt3line); ST_SP(src_c4, c_nxt4line); ST_SP(src_c5, c_nxt5line); ST_SP(src_c6, c_nxt6line); ST_SP(src_c7, c_nxt7line); } static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b2, src_b3; v4f32 src_b5, src_b6, src_b7, src_b10, src_b11, src_b15, src_b, src_a0; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; src_c0 = LD_SP(c); src_c1 = LD_SP(c_nxt1line); src_c2 = LD_SP(c_nxt2line); src_c3 = LD_SP(c_nxt3line); for (k = 0; k < (bk >> 1); k++) { src_a0 = LD_SP(a); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a0 * src_b1; src_c2 -= src_a0 * src_b2; src_c3 -= src_a0 * src_b3; a += 4; b += 4; src_a0 = LD_SP(a); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a0 * src_b1; src_c2 -= src_a0 * src_b2; src_c3 -= src_a0 * src_b3; a += 4; b += 4; } if ((bk & 1) && (bk > 0)) { src_a0 = LD_SP(a); src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a0 * src_b1; src_c2 -= src_a0 * src_b2; src_c3 -= src_a0 * src_b3; a += 4; b += 4; } src_b = LD_SP(b + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_b5 = LD_SP(b + 5); src_b7 = (v4f32) __msa_splati_w((v4i32) 
src_b5, 2); src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1); src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0); src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10)); src_b11 = COPY_FLOAT_TO_VECTOR(*(b + 11)); src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 -= src_c0 * src_b1; src_c2 -= src_c0 * src_b2; src_c3 -= src_c0 * src_b3; src_c1 *= src_b5; src_c2 -= src_c1 * src_b6; src_c3 -= src_c1 * src_b7; src_c2 *= src_b10; src_c3 -= src_c2 * src_b11; src_c3 *= src_b15; ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); ST_SP(src_c0, c); ST_SP(src_c1, c_nxt1line); ST_SP(src_c2, c_nxt2line); ST_SP(src_c3, c_nxt3line); } static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; v4f32 src_a, src_c0, src_c1, src_b0, src_b1, src_b3; FLOAT *c_nxt1line = c + ldc; src_c0 = LD_SP(c); src_c1 = LD_SP(c_nxt1line); for (k = 0; k < (bk >> 2); k++) { src_a = LD_SP(a); src_b0 = LD_SP(b); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); src_c0 -= src_a * src_b0; src_c1 -= src_a * src_b1; a += 4; b += 2; src_a = LD_SP(a); src_b0 = LD_SP(b); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); src_c0 -= src_a * src_b0; src_c1 -= src_a * src_b1; a += 4; b += 2; src_a = LD_SP(a); src_b0 = LD_SP(b); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); src_c0 -= src_a * src_b0; src_c1 -= src_a * src_b1; a += 4; b += 2; src_a = LD_SP(a); src_b0 = LD_SP(b); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); src_c0 -= src_a * src_b0; src_c1 -= src_a * src_b1; a += 4; b += 2; } if ((bk & 3) && (bk > 0)) { if (bk & 2) { src_a = LD_SP(a); src_b0 = LD_SP(b); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); src_c0 -= src_a * src_b0; src_c1 -= src_a * src_b1; a += 4; b += 2; src_a = LD_SP(a); src_b0 = LD_SP(b); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); src_c0 -= src_a * src_b0; src_c1 -= src_a * src_b1; a += 4; b += 2; } if (bk & 1) { src_a = LD_SP(a); src_b0 = LD_SP(b); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); src_c0 -= src_a * src_b0; src_c1 -= src_a * src_b1; a += 4; b += 2; } } src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1)); src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 -= src_c0 * src_b1; src_c1 *= src_b3; ST_SP2(src_c0, src_c1, a, 4); ST_SP(src_c0, c); ST_SP(src_c1, c_nxt1line); } static void ssolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT b0, c0, c1, c2, c3; c0 = *(c + 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); for (k = 0; k < bk; k++) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; c2 -= a[2] * b[0]; c3 -= a[3] * b[0]; a += 4; b += 1; } b0 = *(b + 0); c0 *= b0; c1 *= b0; c2 *= b0; c3 *= b0; *(a + 0) = c0; *(a + 1) = c1; *(a + 2) = c2; *(a + 3) = c3; *(c + 0) = c0; *(c + 1) = c1; *(c + 2) = c2; *(c + 3) = c3; } static void ssolve_2x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT b0, b1, b2, b3, b4, b5, b6, b7, b9, b10, b11, b12, b13, b14, b15; FLOAT b18, b19, b20, b21, b22, b23, b27, b28, b29, b30, b31; FLOAT b36, b37, b38, b39, b45, b46, b47, b54, b55, b63; FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; FLOAT c0_nxt4, c1_nxt4, 
c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6; FLOAT c0_nxt7, c1_nxt7; c0 = *(c + 0); c1 = *(c + 1); c0_nxt1 = *(c + 0 + 1 * ldc); c1_nxt1 = *(c + 1 + 1 * ldc); c0_nxt2 = *(c + 0 + 2 * ldc); c1_nxt2 = *(c + 1 + 2 * ldc); c0_nxt3 = *(c + 0 + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); c0_nxt4 = *(c + 0 + 4 * ldc); c1_nxt4 = *(c + 1 + 4 * ldc); c0_nxt5 = *(c + 0 + 5 * ldc); c1_nxt5 = *(c + 1 + 5 * ldc); c0_nxt6 = *(c + 0 + 6 * ldc); c1_nxt6 = *(c + 1 + 6 * ldc); c0_nxt7 = *(c + 0 + 7 * ldc); c1_nxt7 = *(c + 1 + 7 * ldc); for (k = 0; k < bk; k++) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; c0_nxt1 -= a[0] * b[1]; c1_nxt1 -= a[1] * b[1]; c0_nxt2 -= a[0] * b[2]; c1_nxt2 -= a[1] * b[2]; c0_nxt3 -= a[0] * b[3]; c1_nxt3 -= a[1] * b[3]; c0_nxt4 -= a[0] * b[4]; c1_nxt4 -= a[1] * b[4]; c0_nxt5 -= a[0] * b[5]; c1_nxt5 -= a[1] * b[5]; c0_nxt6 -= a[0] * b[6]; c1_nxt6 -= a[1] * b[6]; c0_nxt7 -= a[0] * b[7]; c1_nxt7 -= a[1] * b[7]; a += 2; b += 8; } b0 = *(b + 0); b1 = *(b + 1); b2 = *(b + 2); b3 = *(b + 3); b4 = *(b + 4); b5 = *(b + 5); b6 = *(b + 6); b7 = *(b + 7); b9 = *(b + 9); b10 = *(b + 10); b11 = *(b + 11); b12 = *(b + 12); b13 = *(b + 13); b14 = *(b + 14); b15 = *(b + 15); b18 = *(b + 18); b19 = *(b + 19); b20 = *(b + 20); b21 = *(b + 21); b22 = *(b + 22); b23 = *(b + 23); b27 = *(b + 27); b28 = *(b + 28); b29 = *(b + 29); b30 = *(b + 30); b31 = *(b + 31); b36 = *(b + 36); b37 = *(b + 37); b38 = *(b + 38); b39 = *(b + 39); b45 = *(b + 45); b46 = *(b + 46); b47 = *(b + 47); b54 = *(b + 54); b55 = *(b + 55); b63 = *(b + 63); c0 *= b0; c1 *= b0; c0_nxt1 -= c0 * b1; c1_nxt1 -= c1 * b1; c0_nxt2 -= c0 * b2; c1_nxt2 -= c1 * b2; c0_nxt3 -= c0 * b3; c1_nxt3 -= c1 * b3; c0_nxt4 -= c0 * b4; c1_nxt4 -= c1 * b4; c0_nxt5 -= c0 * b5; c1_nxt5 -= c1 * b5; c0_nxt6 -= c0 * b6; c1_nxt6 -= c1 * b6; c0_nxt7 -= c0 * b7; c1_nxt7 -= c1 * b7; c0_nxt1 *= b9; c1_nxt1 *= b9; c0_nxt2 -= c0_nxt1 * b10; c1_nxt2 -= c1_nxt1 * b10; c0_nxt3 -= c0_nxt1 * b11; c1_nxt3 -= c1_nxt1 * b11; c0_nxt4 -= c0_nxt1 * b12; c1_nxt4 -= c1_nxt1 * b12; c0_nxt5 -= c0_nxt1 * b13; c1_nxt5 -= c1_nxt1 * b13; c0_nxt6 -= c0_nxt1 * b14; c1_nxt6 -= c1_nxt1 * b14; c0_nxt7 -= c0_nxt1 * b15; c1_nxt7 -= c1_nxt1 * b15; c0_nxt2 *= b18; c1_nxt2 *= b18; c0_nxt3 -= c0_nxt2 * b19; c1_nxt3 -= c1_nxt2 * b19; c0_nxt4 -= c0_nxt2 * b20; c1_nxt4 -= c1_nxt2 * b20; c0_nxt5 -= c0_nxt2 * b21; c1_nxt5 -= c1_nxt2 * b21; c0_nxt6 -= c0_nxt2 * b22; c1_nxt6 -= c1_nxt2 * b22; c0_nxt7 -= c0_nxt2 * b23; c1_nxt7 -= c1_nxt2 * b23; c0_nxt3 *= b27; c1_nxt3 *= b27; c0_nxt4 -= c0_nxt3 * b28; c1_nxt4 -= c1_nxt3 * b28; c0_nxt5 -= c0_nxt3 * b29; c1_nxt5 -= c1_nxt3 * b29; c0_nxt6 -= c0_nxt3 * b30; c1_nxt6 -= c1_nxt3 * b30; c0_nxt7 -= c0_nxt3 * b31; c1_nxt7 -= c1_nxt3 * b31; c0_nxt4 *= b36; c1_nxt4 *= b36; c0_nxt5 -= c0_nxt4 * b37; c1_nxt5 -= c1_nxt4 * b37; c0_nxt6 -= c0_nxt4 * b38; c1_nxt6 -= c1_nxt4 * b38; c0_nxt7 -= c0_nxt4 * b39; c1_nxt7 -= c1_nxt4 * b39; c0_nxt5 *= b45; c1_nxt5 *= b45; c0_nxt6 -= c0_nxt5 * b46; c1_nxt6 -= c1_nxt5 * b46; c0_nxt7 -= c0_nxt5 * b47; c1_nxt7 -= c1_nxt5 * b47; c0_nxt6 *= b54; c1_nxt6 *= b54; c0_nxt7 -= c0_nxt6 * b55; c1_nxt7 -= c1_nxt6 * b55; c0_nxt7 *= b63; c1_nxt7 *= b63; *(a + 0) = c0; *(a + 1) = c1; *(a + 2) = c0_nxt1; *(a + 3) = c1_nxt1; *(a + 4) = c0_nxt2; *(a + 5) = c1_nxt2; *(a + 6) = c0_nxt3; *(a + 7) = c1_nxt3; *(a + 8) = c0_nxt4; *(a + 9) = c1_nxt4; *(a + 10) = c0_nxt5; *(a + 11) = c1_nxt5; *(a + 12) = c0_nxt6; *(a + 13) = c1_nxt6; *(a + 14) = c0_nxt7; *(a + 15) = c1_nxt7; *(c + 0) = c0; *(c + 1) = c1; *(c + 0 + 1 * ldc) = c0_nxt1; *(c + 1 + 1 * ldc) = c1_nxt1; *(c + 0 + 2 
* ldc) = c0_nxt2; *(c + 1 + 2 * ldc) = c1_nxt2; *(c + 0 + 3 * ldc) = c0_nxt3; *(c + 1 + 3 * ldc) = c1_nxt3; *(c + 0 + 4 * ldc) = c0_nxt4; *(c + 1 + 4 * ldc) = c1_nxt4; *(c + 0 + 5 * ldc) = c0_nxt5; *(c + 1 + 5 * ldc) = c1_nxt5; *(c + 0 + 6 * ldc) = c0_nxt6; *(c + 1 + 6 * ldc) = c1_nxt6; *(c + 0 + 7 * ldc) = c0_nxt7; *(c + 1 + 7 * ldc) = c1_nxt7; } static void ssolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1; FLOAT c0_nxt1, c0_nxt2, c0_nxt3, c1_nxt1, c1_nxt2, c1_nxt3; c0 = *(c + 0); c1 = *(c + 1); c0_nxt1 = *(c + 0 + 1 * ldc); c1_nxt1 = *(c + 1 + 1 * ldc); c0_nxt2 = *(c + 0 + 2 * ldc); c1_nxt2 = *(c + 1 + 2 * ldc); c0_nxt3 = *(c + 0 + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); for (k = 0; k < bk; k++) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; c0_nxt1 -= a[0] * b[1]; c1_nxt1 -= a[1] * b[1]; c0_nxt2 -= a[0] * b[2]; c1_nxt2 -= a[1] * b[2]; c0_nxt3 -= a[0] * b[3]; c1_nxt3 -= a[1] * b[3]; a += 2; b += 4; } b0 = *(b + 0); b1 = *(b + 1); b2 = *(b + 2); b3 = *(b + 3); b5 = *(b + 5); b6 = *(b + 6); b7 = *(b + 7); b10 = *(b + 10); b11 = *(b + 11); b15 = *(b + 15); c0 *= b0; c1 *= b0; c0_nxt1 -= c0 * b1; c1_nxt1 -= c1 * b1; c0_nxt1 *= b5; c1_nxt1 *= b5; c0_nxt2 -= c0 * b2; c1_nxt2 -= c1 * b2; c0_nxt2 -= c0_nxt1 * b6; c1_nxt2 -= c1_nxt1 * b6; c0_nxt2 *= b10; c1_nxt2 *= b10; c0_nxt3 -= c0 * b3; c1_nxt3 -= c1 * b3; c0_nxt3 -= c0_nxt1 * b7; c1_nxt3 -= c1_nxt1 * b7; c0_nxt3 -= c0_nxt2 * b11; c1_nxt3 -= c1_nxt2 * b11; c0_nxt3 *= b15; c1_nxt3 *= b15; *(a + 0) = c0; *(a + 1) = c1; *(a + 2) = c0_nxt1; *(a + 3) = c1_nxt1; *(a + 4) = c0_nxt2; *(a + 5) = c1_nxt2; *(a + 6) = c0_nxt3; *(a + 7) = c1_nxt3; *(c + 0) = c0; *(c + 1) = c1; *(c + 1 * ldc) = c0_nxt1; *(c + 1 + 1 * ldc) = c1_nxt1; *(c + 2 * ldc) = c0_nxt2; *(c + 1 + 2 * ldc) = c1_nxt2; *(c + 3 * ldc) = c0_nxt3; *(c + 1 + 3 * ldc) = c1_nxt3; } static void ssolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT b0, b1, b3, c0, c0_nxt, c1, c1_nxt; c0 = *(c + 0); c1 = *(c + 1); c0_nxt = *(c + 0 + ldc); c1_nxt = *(c + 1 + ldc); for (k = 0; k < bk; k++) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; c0_nxt -= a[0] * b[1]; c1_nxt -= a[1] * b[1]; a += 2; b += 2; } b0 = *(b + 0); b1 = *(b + 1); b3 = *(b + 3); c0 *= b0; c1 *= b0; c0_nxt -= c0 * b1; c1_nxt -= c1 * b1; c0_nxt *= b3; c1_nxt *= b3; *(a + 0) = c0; *(a + 1) = c1; *(a + 2) = c0_nxt; *(a + 3) = c1_nxt; *(c + 0) = c0; *(c + 1) = c1; *(c + ldc) = c0_nxt; *(c + 1 + ldc) = c1_nxt; } static void ssolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT b0, c0, c1; c0 = *(c + 0); c1 = *(c + 1); for (k = 0; k < bk; k++) { c0 -= a[0] * b[0]; c1 -= a[1] * b[0]; a += 2; b += 1; } b0 = *(b + 0); c0 *= b0; c1 *= b0; *(a + 0) = c0; *(a + 1) = c1; *(c + 0) = c0; *(c + 1) = c1; } static void ssolve_1x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT b0, b1, b2, b3, b4, b5, b6, b7, b9, b10, b11, b12, b13, b14, b15; FLOAT b18, b19, b20, b21, b22, b23, b27, b28, b29, b30, b31, b36, b37, b38; FLOAT b39, b45, b46, b47, b54, b55, b63, c0, c1, c2, c3, c4, c5, c6, c7; c0 = *(c + 0); c1 = *(c + 1 * ldc); c2 = *(c + 2 * ldc); c3 = *(c + 3 * ldc); c4 = *(c + 4 * ldc); c5 = *(c + 5 * ldc); c6 = *(c + 6 * ldc); c7 = *(c + 7 * ldc); for (k = 0; k < bk; k++) { c0 -= a[0] * b[0]; c1 -= a[0] * b[1]; c2 -= a[0] * b[2]; c3 -= a[0] * b[3]; c4 -= a[0] * b[4]; c5 -= a[0] * b[5]; c6 -= a[0] * b[6]; c7 -= a[0] * b[7]; a += 1; b += 8; } b0 = *(b + 0); b1 
= *(b + 1); b2 = *(b + 2); b3 = *(b + 3); b4 = *(b + 4); b5 = *(b + 5); b6 = *(b + 6); b7 = *(b + 7); b9 = *(b + 9); b10 = *(b + 10); b11 = *(b + 11); b12 = *(b + 12); b13 = *(b + 13); b14 = *(b + 14); b15 = *(b + 15); b18 = *(b + 18); b19 = *(b + 19); b20 = *(b + 20); b21 = *(b + 21); b22 = *(b + 22); b23 = *(b + 23); b27 = *(b + 27); b28 = *(b + 28); b29 = *(b + 29); b30 = *(b + 30); b31 = *(b + 31); b36 = *(b + 36); b37 = *(b + 37); b38 = *(b + 38); b39 = *(b + 39); b45 = *(b + 45); b46 = *(b + 46); b47 = *(b + 47); b54 = *(b + 54); b55 = *(b + 55); b63 = *(b + 63); c0 *= b0; c1 -= c0 * b1; c1 *= b9; c2 -= c0 * b2; c2 -= c1 * b10; c2 *= b18; c3 -= c0 * b3; c3 -= c1 * b11; c3 -= c2 * b19; c3 *= b27; c4 -= c0 * b4; c4 -= c1 * b12; c4 -= c2 * b20; c4 -= c3 * b28; c4 *= b36; c5 -= c0 * b5; c5 -= c1 * b13; c5 -= c2 * b21; c5 -= c3 * b29; c5 -= c4 * b37; c5 *= b45; c6 -= c0 * b6; c6 -= c1 * b14; c6 -= c2 * b22; c6 -= c3 * b30; c6 -= c4 * b38; c6 -= c5 * b46; c6 *= b54; c7 -= c0 * b7; c7 -= c1 * b15; c7 -= c2 * b23; c7 -= c3 * b31; c7 -= c4 * b39; c7 -= c5 * b47; c7 -= c6 * b55; c7 *= b63; *(a + 0) = c0; *(a + 1) = c1; *(a + 2) = c2; *(a + 3) = c3; *(a + 4) = c4; *(a + 5) = c5; *(a + 6) = c6; *(a + 7) = c7; *(c + 0) = c0; *(c + 1 * ldc) = c1; *(c + 2 * ldc) = c2; *(c + 3 * ldc) = c3; *(c + 4 * ldc) = c4; *(c + 5 * ldc) = c5; *(c + 6 * ldc) = c6; *(c + 7 * ldc) = c7; } static void ssolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1, c2, c3; c0 = *(c + 0); c1 = *(c + 1 * ldc); c2 = *(c + 2 * ldc); c3 = *(c + 3 * ldc); for (k = 0; k < bk; k++) { c0 -= a[0] * b[0]; c1 -= a[0] * b[1]; c2 -= a[0] * b[2]; c3 -= a[0] * b[3]; a += 1; b += 4; } b0 = *(b + 0); b1 = *(b + 1); b2 = *(b + 2); b3 = *(b + 3); b5 = *(b + 5); b6 = *(b + 6); b7 = *(b + 7); b10 = *(b + 10); b11 = *(b + 11); b15 = *(b + 15); c0 *= b0; c1 -= c0 * b1; c1 *= b5; c2 -= c0 * b2; c2 -= c1 * b6; c2 *= b10; c3 -= c0 * b3; c3 -= c1 * b7; c3 -= c2 * b11; c3 *= b15; *(a + 0) = c0; *(a + 1) = c1; *(a + 2) = c2; *(a + 3) = c3; *(c + 0) = c0; *(c + 1 * ldc) = c1; *(c + 2 * ldc) = c2; *(c + 3 * ldc) = c3; } static void ssolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT b0, b1, b3, c0, c1; c0 = *c; c1 = *(c + ldc); for (k = 0; k < bk; k++) { c0 -= a[0] * b[0]; c1 -= a[0] * b[1]; a += 1; b += 2; } b0 = *(b + 0); b1 = *(b + 1); b3 = *(b + 3); c0 *= b0; c1 -= c0 * b1; c1 *= b3; *(a + 0) = c0; *(a + 1) = c1; *(c + 0) = c0; *(c + ldc) = c1; } static void ssolve_1x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { BLASLONG k; for (k = 0; k < bk; k++) { *c -= a[0] * b[0]; a++; b++; } *c *= *b; *a = *c; } int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset) { FLOAT *aa, *cc; BLASLONG i, j, kk; kk = -offset; for (j = (n >> 3); j--;) { aa = a; cc = c; for (i = (m >> 3); i--;) { ssolve_8x8_rn_msa(aa, b, cc, ldc, kk); aa += 8 * k; cc += 8; } if (m & 7) { if (m & 4) { ssolve_4x8_rn_msa(aa, b, cc, ldc, kk); aa += 4 * k; cc += 4; } if (m & 2) { ssolve_2x8_rn_msa(aa, b, cc, ldc, kk); aa += 2 * k; cc += 2; } if (m & 1) { ssolve_1x8_rn_msa(aa, b, cc, ldc, kk); aa += k; cc += 1; } } kk += 8; b += 8 * k; c += 8 * ldc; } if (n & 7) { if (n & 4) { aa = a; cc = c; for (i = (m >> 3); i--;) { ssolve_8x4_rn_msa(aa, b, cc, ldc, kk); aa += 8 * k; cc += 8; } if (m & 7) { if (m & 4) { ssolve_4x4_rn_msa(aa, b, cc, ldc, kk); aa += 4 * k; cc += 4; } if (m & 2) { 
ssolve_2x4_rn_msa(aa, b, cc, ldc, kk); aa += 2 * k; cc += 2; } if (m & 1) { ssolve_1x4_rn_msa(aa, b, cc, ldc, kk); aa += k; cc += 1; } } b += 4 * k; c += 4 * ldc; kk += 4; } if (n & 2) { aa = a; cc = c; for (i = (m >> 3); i--;) { ssolve_8x2_rn_msa(aa, b, cc, ldc, kk); aa += 8 * k; cc += 8; } if (m & 7) { if (m & 4) { ssolve_4x2_rn_msa(aa, b, cc, ldc, kk); aa += 4 * k; cc += 4; } if (m & 2) { ssolve_2x2_rn_msa(aa, b, cc, ldc, kk); aa += 2 * k; cc += 2; } if (m & 1) { ssolve_1x2_rn_msa(aa, b, cc, ldc, kk); aa += k; cc += 1; } } b += 2 * k; c += 2 * ldc; kk += 2; } if (n & 1) { aa = a; cc = c; for (i = (m >> 3); i--;) { ssolve_8x1_rn_msa(aa, b, cc, ldc, kk); aa += 8 * k; cc += 8; } if (m & 7) { if (m & 4) { ssolve_4x1_rn_msa(aa, b, cc, ldc, kk); aa += 4 * k; cc += 4; } if (m & 2) { ssolve_2x1_rn_msa(aa, b, cc, ldc, kk); aa += 2 * k; cc += 2; } if (m & 1) { ssolve_1x1_rn_msa(aa, b, cc, kk); aa += k; cc += 1; } } b += k; c += ldc; kk += 1; } } return 0; } OpenBLAS-0.2.20/kernel/mips/strsm_kernel_RT_8x8_msa.c000066400000000000000000001272571313527062700222420ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #include "common.h" #include "macros_msa.h" static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24; v4f32 src_b25, src_b26, src_b27, src_b32, src_b33, src_b34, src_b35; v4f32 src_b36, src_b40, src_b41, src_b42, src_b43, src_b44, src_b45; v4f32 src_b48, src_b49, src_b50, src_b51, src_b52, src_b53, src_b54; v4f32 src_b56, src_b57, src_b58, src_b59, src_b60, src_b61, src_b62, src_b63; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; FLOAT *c_nxt4line = c + 4 * ldc; FLOAT *c_nxt5line = c + 5 * ldc; FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; LD_SP2(c, 4, src_c0, src_c1); LD_SP2(c_nxt1line, 4, src_c2, src_c3); LD_SP2(c_nxt2line, 4, src_c4, src_c5); LD_SP2(c_nxt3line, 4, src_c6, src_c7); LD_SP2(c_nxt4line, 4, src_c8, src_c9); LD_SP2(c_nxt5line, 4, src_c10, src_c11); LD_SP2(c_nxt6line, 4, src_c12, src_c13); LD_SP2(c_nxt7line, 4, src_c14, src_c15); if (bk > 0) { BLASLONG k, pref_offset; FLOAT *aa = a, *bb = b, *pa0_pref; v4f32 src_a0, src_a1, src_b1, src_b2, src_b3, src_bb0, src_bb1; pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1); if (pref_offset) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } pa0_pref = a + pref_offset; for (k = 0; k < (bk >> 1); k++) { PREF_OFFSET(pa0_pref, 64); PREF_OFFSET(pa0_pref, 96); LD_SP2_INC(aa, 4, src_a0, src_a1); LD_SP2_INC(bb, 4, src_bb0, src_bb1); SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * src_b3; src_c7 -= src_a1 * src_b3; SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3); src_c8 -= src_a0 * src_b0; src_c9 -= src_a1 * src_b0; src_c10 -= src_a0 * src_b1; src_c11 -= src_a1 * src_b1; src_c12 -= src_a0 * src_b2; src_c13 -= src_a1 * src_b2; src_c14 -= src_a0 * src_b3; src_c15 -= src_a1 * src_b3; LD_SP2_INC(aa, 4, src_a0, src_a1); LD_SP2_INC(bb, 4, src_bb0, src_bb1); SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * src_b3; src_c7 -= src_a1 * src_b3; SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3); src_c8 -= src_a0 * src_b0; src_c9 -= src_a1 * src_b0; src_c10 -= src_a0 * src_b1; src_c11 -= src_a1 * src_b1; src_c12 -= src_a0 * src_b2; src_c13 -= src_a1 * src_b2; src_c14 -= src_a0 * src_b3; src_c15 -= src_a1 * src_b3; pa0_pref += 16; } if (bk & 1) { LD_SP2_INC(aa, 4, src_a0, src_a1); LD_SP2_INC(bb, 4, src_bb0, src_bb1); SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * src_b3; src_c7 -= src_a1 * src_b3; SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3); src_c8 -= src_a0 * src_b0; src_c9 -= src_a1 * src_b0; src_c10 -= src_a0 * src_b1; src_c11 -= src_a1 * src_b1; src_c12 -= src_a0 * src_b2; src_c13 -= src_a1 * src_b2; src_c14 -= src_a0 * src_b3; src_c15 -= src_a1 * 
src_b3; } } b -= 64; src_b = LD_SP(b + 60); SPLATI_W4_SP(src_b, src_b60, src_b61, src_b62, src_b63); src_b = LD_SP(b + 56); SPLATI_W4_SP(src_b, src_b56, src_b57, src_b58, src_b59); src_c15 *= src_b63; src_c14 *= src_b63; src_c13 -= src_c15 * src_b62; src_c12 -= src_c14 * src_b62; src_c11 -= src_c15 * src_b61; src_c10 -= src_c14 * src_b61; src_c9 -= src_c15 * src_b60; src_c8 -= src_c14 * src_b60; src_c7 -= src_c15 * src_b59; src_c6 -= src_c14 * src_b59; src_c5 -= src_c15 * src_b58; src_c4 -= src_c14 * src_b58; src_c3 -= src_c15 * src_b57; src_c2 -= src_c14 * src_b57; src_c1 -= src_c15 * src_b56; src_c0 -= src_c14 * src_b56; src_b = LD_SP(b + 48); SPLATI_W4_SP(src_b, src_b48, src_b49, src_b50, src_b51); src_b52 = LD_SP(b + 52); src_b54 = (v4f32) __msa_splati_w((v4i32) src_b52, 2); src_b53 = (v4f32) __msa_splati_w((v4i32) src_b52, 1); src_b52 = (v4f32) __msa_splati_w((v4i32) src_b52, 0); src_c12 *= src_b54; src_c13 *= src_b54; src_c10 -= src_c12 * src_b53; src_c11 -= src_c13 * src_b53; src_c8 -= src_c12 * src_b52; src_c9 -= src_c13 * src_b52; src_c6 -= src_c12 * src_b51; src_c7 -= src_c13 * src_b51; src_c4 -= src_c12 * src_b50; src_c5 -= src_c13 * src_b50; src_c2 -= src_c12 * src_b49; src_c3 -= src_c13 * src_b49; src_c0 -= src_c12 * src_b48; src_c1 -= src_c13 * src_b48; ST_SP4(src_c12, src_c13, src_c14, src_c15, a - 16, 4); ST_SP2(src_c12, src_c13, c_nxt6line, 4); ST_SP2(src_c14, src_c15, c_nxt7line, 4); src_b = LD_SP(b + 40); SPLATI_W4_SP(src_b, src_b40, src_b41, src_b42, src_b43); src_b44 = LD_SP(b + 44); src_b45 = (v4f32) __msa_splati_w((v4i32) src_b44, 1); src_b44 = (v4f32) __msa_splati_w((v4i32) src_b44, 0); src_c10 *= src_b45; src_c11 *= src_b45; src_c8 -= src_c10 * src_b44; src_c9 -= src_c11 * src_b44; src_c6 -= src_c10 * src_b43; src_c7 -= src_c11 * src_b43; src_c4 -= src_c10 * src_b42; src_c5 -= src_c11 * src_b42; src_c2 -= src_c10 * src_b41; src_c3 -= src_c11 * src_b41; src_c0 -= src_c10 * src_b40; src_c1 -= src_c11 * src_b40; src_b = LD_SP(b + 32); SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35); src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36)); src_c8 *= src_b36; src_c9 *= src_b36; src_c6 -= src_c8 * src_b35; src_c7 -= src_c9 * src_b35; src_c4 -= src_c8 * src_b34; src_c5 -= src_c9 * src_b34; src_c2 -= src_c8 * src_b33; src_c3 -= src_c9 * src_b33; src_c0 -= src_c8 * src_b32; src_c1 -= src_c9 * src_b32; ST_SP4(src_c8, src_c9, src_c10, src_c11, a - 32, 4); ST_SP2(src_c8, src_c9, c_nxt4line, 4); ST_SP2(src_c10, src_c11, c_nxt5line, 4); src_b = LD_SP(b + 24); SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27); src_c6 *= src_b27; src_c7 *= src_b27; src_c4 -= src_c6 * src_b26; src_c5 -= src_c7 * src_b26; src_c2 -= src_c6 * src_b25; src_c3 -= src_c7 * src_b25; src_c0 -= src_c6 * src_b24; src_c1 -= src_c7 * src_b24; src_b16 = LD_SP(b + 16); src_b18 = (v4f32) __msa_splati_w((v4i32) src_b16, 2); src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1); src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0); src_c4 *= src_b18; src_c5 *= src_b18; src_c2 -= src_c4 * src_b17; src_c3 -= src_c5 * src_b17; src_c0 -= src_c4 * src_b16; src_c1 -= src_c5 * src_b16; ST_SP4(src_c4, src_c5, src_c6, src_c7, a - 48, 4); ST_SP2(src_c4, src_c5, c_nxt2line, 4); ST_SP2(src_c6, src_c7, c_nxt3line, 4); src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9)); src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8)); src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c2 *= src_b9; src_c3 *= src_b9; src_c0 -= src_c2 * src_b8; src_c1 -= src_c3 * src_b8; src_c0 *= src_b0; src_c1 *= src_b0; ST_SP4(src_c0, src_c1, src_c2, src_c3, a - 64, 4); ST_SP2(src_c0, 
src_c1, c, 4); ST_SP2(src_c2, src_c3, c_nxt1line, 4); } static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; v4f32 src_a0, src_a1, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_b, src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12; v4f32 src_b13, src_b14, src_b15; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; LD_SP2(c, 4, src_c0, src_c1); LD_SP2(c_nxt1line, 4, src_c2, src_c3); LD_SP2(c_nxt2line, 4, src_c4, src_c5); LD_SP2(c_nxt3line, 4, src_c6, src_c7); for (k = 0; k < (bk >> 1); k++) { LD_SP2(aa, 4, src_a0, src_a1); src_b = LD_SP(bb + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * src_b3; src_c7 -= src_a1 * src_b3; aa += 8; bb += 4; LD_SP2(aa, 4, src_a0, src_a1); src_b = LD_SP(bb + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * src_b3; src_c7 -= src_a1 * src_b3; aa += 8; bb += 4; } if ((bk & 1) && (bk > 0)) { LD_SP2(aa, 4, src_a0, src_a1); src_b = LD_SP(bb + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; src_c4 -= src_a0 * src_b2; src_c5 -= src_a1 * src_b2; src_c6 -= src_a0 * src_b3; src_c7 -= src_a1 * src_b3; } a -= 32; b -= 16; src_b = LD_SP(b + 12); SPLATI_W4_SP(src_b, src_b12, src_b13, src_b14, src_b15); src_b8 = LD_SP(b + 8); src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2); src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1); src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5)); src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4)); src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c7 *= src_b15; src_c6 *= src_b15; src_c5 -= src_c7 * src_b14; src_c4 -= src_c6 * src_b14; src_c3 -= src_c7 * src_b13; src_c2 -= src_c6 * src_b13; src_c1 -= src_c7 * src_b12; src_c0 -= src_c6 * src_b12; src_c5 *= src_b10; src_c4 *= src_b10; src_c3 -= src_c5 * src_b9; src_c2 -= src_c4 * src_b9; src_c1 -= src_c5 * src_b8; src_c0 -= src_c4 * src_b8; src_c3 *= src_b5; src_c2 *= src_b5; src_c1 -= src_c3 * src_b4; src_c0 -= src_c2 * src_b4; src_c1 *= src_b0; src_c0 *= src_b0; ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4); ST_SP2(src_c0, src_c1, c, 4); ST_SP2(src_c2, src_c3, c_nxt1line, 4); ST_SP2(src_c4, src_c5, c_nxt2line, 4); ST_SP2(src_c6, src_c7, c_nxt3line, 4); } static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; v4f32 src_a0, src_a1, src_b1; v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3; FLOAT *c_nxt1line = c + ldc; LD_SP2(c, 4, src_c0, src_c1); LD_SP2(c_nxt1line, 4, src_c2, src_c3); for (k = 0; k < (bk >> 1); k++) { LD_SP2(aa, 4, src_a0, src_a1); src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; aa += 8; bb += 2; LD_SP2(aa, 4, src_a0, src_a1); src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1)); src_c0 -= 
src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; aa += 8; bb += 2; } if ((bk & 1) && (bk > 0)) { LD_SP2(aa, 4, src_a0, src_a1); src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; src_c2 -= src_a0 * src_b1; src_c3 -= src_a1 * src_b1; } a -= 16; b -= 4; src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2)); src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3)); src_c2 *= src_b3; src_c3 *= src_b3; src_c0 -= src_c2 * src_b2; src_c1 -= src_c3 * src_b2; src_c0 *= src_b0; src_c1 *= src_b0; ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); ST_SP2(src_c0, src_c1, c, 4); ST_SP2(src_c2, src_c3, c_nxt1line, 4); } static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; v4f32 src_a0, src_a1, src_c0, src_c1, src_b0; LD_SP2(c, 4, src_c0, src_c1); for (k = 0; k < (bk >> 2); k++) { LD_SP2(aa, 4, src_a0, src_a1); src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; aa += 8; bb += 1; LD_SP2(aa, 4, src_a0, src_a1); src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; aa += 8; bb += 1; LD_SP2(aa, 4, src_a0, src_a1); src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; aa += 8; bb += 1; LD_SP2(aa, 4, src_a0, src_a1); src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; aa += 8; bb += 1; } if ((bk & 3) && (bk > 0)) { if (bk & 2) { LD_SP2(aa, 4, src_a0, src_a1); src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; aa += 8; bb += 1; LD_SP2(aa, 4, src_a0, src_a1); src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; aa += 8; bb += 1; } if (bk & 1) { LD_SP2(aa, 4, src_a0, src_a1); src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0)); src_c0 -= src_a0 * src_b0; src_c1 -= src_a1 * src_b0; } } a -= 8; b -= 1; src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c0 *= src_b0; src_c1 *= src_b0; ST_SP2(src_c0, src_c1, a, 4); ST_SP2(src_c0, src_c1, c, 4); } static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; v4f32 src_a0, src_b1, src_b2, src_b3; v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24; v4f32 src_b25, src_b26, src_b27, src_b32, src_b33, src_b34, src_b35; v4f32 src_b36, src_b40, src_b41, src_b42, src_b43, src_b44, src_b45; v4f32 src_b48, src_b49, src_b50, src_b51, src_b52, src_b53, src_b54; v4f32 src_b56, src_b57, src_b58, src_b59, src_b60, src_b61, src_b62, src_b63; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; FLOAT *c_nxt4line = c + 4 * ldc; FLOAT *c_nxt5line = c + 5 * ldc; FLOAT *c_nxt6line = c + 6 * ldc; FLOAT *c_nxt7line = c + 7 * ldc; src_c0 = LD_SP(c); src_c1 = LD_SP(c_nxt1line); src_c2 = LD_SP(c_nxt2line); src_c3 = LD_SP(c_nxt3line); src_c4 = LD_SP(c_nxt4line); src_c5 = LD_SP(c_nxt5line); src_c6 = LD_SP(c_nxt6line); src_c7 = LD_SP(c_nxt7line); for (k = 0; k < bk; k++) { src_a0 = LD_SP(aa); src_b = LD_SP(bb + 0); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a0 * src_b0; src_c1 -= src_a0 * src_b1; src_c2 -= src_a0 * src_b2; src_c3 -= src_a0 * src_b3; src_b = LD_SP(bb + 4); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c4 -= src_a0 * 
src_b0; src_c5 -= src_a0 * src_b1; src_c6 -= src_a0 * src_b2; src_c7 -= src_a0 * src_b3; aa += 4; bb += 8; } a -= 32; b -= 64; src_b = LD_SP(b + 60); SPLATI_W4_SP(src_b, src_b60, src_b61, src_b62, src_b63); src_b = LD_SP(b + 56); SPLATI_W4_SP(src_b, src_b56, src_b57, src_b58, src_b59); src_b = LD_SP(b + 48); SPLATI_W4_SP(src_b, src_b48, src_b49, src_b50, src_b51); src_b52 = LD_SP(b + 52); src_b54 = (v4f32) __msa_splati_w((v4i32) src_b52, 2); src_b53 = (v4f32) __msa_splati_w((v4i32) src_b52, 1); src_b52 = (v4f32) __msa_splati_w((v4i32) src_b52, 0); src_b = LD_SP(b + 40); SPLATI_W4_SP(src_b, src_b40, src_b41, src_b42, src_b43); src_b44 = LD_SP(b + 44); src_b45 = (v4f32) __msa_splati_w((v4i32) src_b44, 1); src_b44 = (v4f32) __msa_splati_w((v4i32) src_b44, 0); src_b = LD_SP(b + 32); SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35); src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36)); src_b = LD_SP(b + 24); SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27); src_b16 = LD_SP(b + 16); src_b18 = (v4f32) __msa_splati_w((v4i32) src_b16, 2); src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1); src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0); src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9)); src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8)); src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c7 *= src_b63; src_c6 -= src_c7 * src_b62; src_c5 -= src_c7 * src_b61; src_c4 -= src_c7 * src_b60; src_c3 -= src_c7 * src_b59; src_c2 -= src_c7 * src_b58; src_c1 -= src_c7 * src_b57; src_c0 -= src_c7 * src_b56; src_c6 *= src_b54; src_c5 -= src_c6 * src_b53; src_c4 -= src_c6 * src_b52; src_c3 -= src_c6 * src_b51; src_c2 -= src_c6 * src_b50; src_c1 -= src_c6 * src_b49; src_c0 -= src_c6 * src_b48; src_c5 *= src_b45; src_c4 -= src_c5 * src_b44; src_c3 -= src_c5 * src_b43; src_c2 -= src_c5 * src_b42; src_c1 -= src_c5 * src_b41; src_c0 -= src_c5 * src_b40; src_c4 *= src_b36; src_c3 -= src_c4 * src_b35; src_c2 -= src_c4 * src_b34; src_c1 -= src_c4 * src_b33; src_c0 -= src_c4 * src_b32; src_c3 *= src_b27; src_c2 -= src_c3 * src_b26; src_c1 -= src_c3 * src_b25; src_c0 -= src_c3 * src_b24; src_c2 *= src_b18; src_c1 -= src_c2 * src_b17; src_c0 -= src_c2 * src_b16; src_c1 *= src_b9; src_c0 -= src_c1 * src_b8; src_c0 *= src_b0; ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4); ST_SP(src_c0, c); ST_SP(src_c1, c_nxt1line); ST_SP(src_c2, c_nxt2line); ST_SP(src_c3, c_nxt3line); ST_SP(src_c4, c_nxt4line); ST_SP(src_c5, c_nxt5line); ST_SP(src_c6, c_nxt6line); ST_SP(src_c7, c_nxt7line); } static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; v4f32 src_c0, src_c1, src_c2, src_c3, src_b; v4f32 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13; v4f32 src_b14, src_b15, src_a, src_b1, src_b2, src_b3; FLOAT *c_nxt1line = c + ldc; FLOAT *c_nxt2line = c + 2 * ldc; FLOAT *c_nxt3line = c + 3 * ldc; src_c0 = LD_SP(c); src_c1 = LD_SP(c_nxt1line); src_c2 = LD_SP(c_nxt2line); src_c3 = LD_SP(c_nxt3line); for (k = 0; k < (bk >> 1); k++) { src_a = LD_SP(aa); src_b = LD_SP(bb); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a * src_b0; src_c1 -= src_a * src_b1; src_c2 -= src_a * src_b2; src_c3 -= src_a * src_b3; aa += 4; bb += 4; src_a = LD_SP(aa); src_b = LD_SP(bb); SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a * src_b0; src_c1 -= src_a * src_b1; src_c2 -= src_a * src_b2; src_c3 -= src_a * src_b3; aa += 4; bb += 4; } if ((bk & 1) && (bk > 0)) { src_a = LD_SP(aa); src_b = LD_SP(bb); 
SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3); src_c0 -= src_a * src_b0; src_c1 -= src_a * src_b1; src_c2 -= src_a * src_b2; src_c3 -= src_a * src_b3; } a -= 16; b -= 16; src_b = LD_SP(b + 12); SPLATI_W4_SP(src_b, src_b12, src_b13, src_b14, src_b15); src_b8 = LD_SP(b + 8); src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2); src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1); src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0); src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5)); src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4)); src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c3 *= src_b15; src_c2 -= src_c3 * src_b14; src_c1 -= src_c3 * src_b13; src_c0 -= src_c3 * src_b12; src_c2 *= src_b10; src_c1 -= src_c2 * src_b9; src_c0 -= src_c2 * src_b8; src_c1 *= src_b5; src_c0 -= src_c1 * src_b4; src_c0 *= src_b0; ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4); ST_SP(src_c0, c); ST_SP(src_c1, c_nxt1line); ST_SP(src_c2, c_nxt2line); ST_SP(src_c3, c_nxt3line); } static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; v4f32 src_a, src_b1, src_c0, src_c1, src_b0, src_b2, src_b3; FLOAT *c_nxt1line = c + ldc; src_c0 = LD_SP(c); src_c1 = LD_SP(c_nxt1line); for (k = 0; k < (bk >> 2); k++) { src_a = LD_SP(aa); src_b0 = LD_SP(bb); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); src_c0 -= src_a * src_b0; src_c1 -= src_a * src_b1; aa += 4; bb += 2; src_a = LD_SP(aa); src_b0 = LD_SP(bb); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); src_c0 -= src_a * src_b0; src_c1 -= src_a * src_b1; aa += 4; bb += 2; src_a = LD_SP(aa); src_b0 = LD_SP(bb); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); src_c0 -= src_a * src_b0; src_c1 -= src_a * src_b1; aa += 4; bb += 2; src_a = LD_SP(aa); src_b0 = LD_SP(bb); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); src_c0 -= src_a * src_b0; src_c1 -= src_a * src_b1; aa += 4; bb += 2; } if ((bk & 3) && (bk > 0)) { if (bk & 2) { src_a = LD_SP(aa); src_b0 = LD_SP(bb); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); src_c0 -= src_a * src_b0; src_c1 -= src_a * src_b1; aa += 4; bb += 2; src_a = LD_SP(aa); src_b0 = LD_SP(bb); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); src_c0 -= src_a * src_b0; src_c1 -= src_a * src_b1; aa += 4; bb += 2; } if (bk & 1) { src_a = LD_SP(aa); src_b0 = LD_SP(bb); src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1); src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0); src_c0 -= src_a * src_b0; src_c1 -= src_a * src_b1; } } a -= 8; b -= 4; src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3)); src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2)); src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0)); src_c1 *= src_b3; src_c0 -= src_c1 * src_b2; src_c0 *= src_b0; ST_SP2(src_c0, src_c1, a, 4); ST_SP(src_c0, c); ST_SP(src_c1, c_nxt1line); } static void ssolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; FLOAT b0, c0, c1, c2, c3; c0 = *(c + 0); c1 = *(c + 1); c2 = *(c + 2); c3 = *(c + 3); for (k = 0; k < bk; k++) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; c2 -= aa[2] * bb[0]; c3 -= aa[3] * bb[0]; aa += 4; bb += 1; } a -= 4; b -= 1; b0 = *b; c0 *= b0; c1 *= b0; c2 *= b0; c3 *= b0; *(a + 0) = c0; *(a + 1) = c1; *(a + 2) = c2; *(a + 3) = c3; *(c + 0) = c0; *(c + 1) = c1; *(c + 2) = 
c2; *(c + 3) = c3; } static void ssolve_2x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; FLOAT b0, b8, b9, b16, b17, b18, b24, b25, b26, b27, b32, b33, b34, b35; FLOAT b36, b40, b41, b42, b43, b44, b45, b48, b49, b50, b51, b52, b53, b54; FLOAT b56, b57, b58, b59, b60, b61, b62, b63, c0_nxt7, c1_nxt7; FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; FLOAT c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6; c0 = *(c + 0); c1 = *(c + 1); c0_nxt1 = *(c + 0 + 1 * ldc); c1_nxt1 = *(c + 1 + 1 * ldc); c0_nxt2 = *(c + 0 + 2 * ldc); c1_nxt2 = *(c + 1 + 2 * ldc); c0_nxt3 = *(c + 0 + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); c0_nxt4 = *(c + 0 + 4 * ldc); c1_nxt4 = *(c + 1 + 4 * ldc); c0_nxt5 = *(c + 0 + 5 * ldc); c1_nxt5 = *(c + 1 + 5 * ldc); c0_nxt6 = *(c + 0 + 6 * ldc); c1_nxt6 = *(c + 1 + 6 * ldc); c0_nxt7 = *(c + 0 + 7 * ldc); c1_nxt7 = *(c + 1 + 7 * ldc); for (k = 0; k < bk; k++) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; c0_nxt1 -= aa[0] * bb[1]; c1_nxt1 -= aa[1] * bb[1]; c0_nxt2 -= aa[0] * bb[2]; c1_nxt2 -= aa[1] * bb[2]; c0_nxt3 -= aa[0] * bb[3]; c1_nxt3 -= aa[1] * bb[3]; c0_nxt4 -= aa[0] * bb[4]; c1_nxt4 -= aa[1] * bb[4]; c0_nxt5 -= aa[0] * bb[5]; c1_nxt5 -= aa[1] * bb[5]; c0_nxt6 -= aa[0] * bb[6]; c1_nxt6 -= aa[1] * bb[6]; c0_nxt7 -= aa[0] * bb[7]; c1_nxt7 -= aa[1] * bb[7]; aa += 2; bb += 8; } a -= 16; b -= 64; b0 = *(b + 0); b8 = *(b + 8); b9 = *(b + 9); b16 = *(b + 16); b17 = *(b + 17); b18 = *(b + 18); b24 = *(b + 24); b25 = *(b + 25); b26 = *(b + 26); b27 = *(b + 27); b32 = *(b + 32); b33 = *(b + 33); b34 = *(b + 34); b35 = *(b + 35); b36 = *(b + 36); b40 = *(b + 40); b41 = *(b + 41); b42 = *(b + 42); b43 = *(b + 43); b44 = *(b + 44); b45 = *(b + 45); b48 = *(b + 48); b49 = *(b + 49); b50 = *(b + 50); b51 = *(b + 51); b52 = *(b + 52); b53 = *(b + 53); b54 = *(b + 54); b56 = *(b + 56); b57 = *(b + 57); b58 = *(b + 58); b59 = *(b + 59); b60 = *(b + 60); b61 = *(b + 61); b62 = *(b + 62); b63 = *(b + 63); c0_nxt7 *= b63; c1_nxt7 *= b63; c0_nxt6 -= c0_nxt7 * b62; c1_nxt6 -= c1_nxt7 * b62; c0_nxt6 *= b54; c1_nxt6 *= b54; c0_nxt5 -= c0_nxt7 * b61; c1_nxt5 -= c1_nxt7 * b61; c0_nxt5 -= c0_nxt6 * b53; c1_nxt5 -= c1_nxt6 * b53; c0_nxt5 *= b45; c1_nxt5 *= b45; c0_nxt4 -= c0_nxt7 * b60; c1_nxt4 -= c1_nxt7 * b60; c0_nxt4 -= c0_nxt6 * b52; c1_nxt4 -= c1_nxt6 * b52; c0_nxt4 -= c0_nxt5 * b44; c1_nxt4 -= c1_nxt5 * b44; c0_nxt4 *= b36; c1_nxt4 *= b36; c0_nxt3 -= c0_nxt7 * b59; c1_nxt3 -= c1_nxt7 * b59; c0_nxt3 -= c0_nxt6 * b51; c1_nxt3 -= c1_nxt6 * b51; c0_nxt3 -= c0_nxt5 * b43; c1_nxt3 -= c1_nxt5 * b43; c0_nxt3 -= c0_nxt4 * b35; c1_nxt3 -= c1_nxt4 * b35; c0_nxt3 *= b27; c1_nxt3 *= b27; c0_nxt2 -= c0_nxt7 * b58; c1_nxt2 -= c1_nxt7 * b58; c0_nxt2 -= c0_nxt6 * b50; c1_nxt2 -= c1_nxt6 * b50; c0_nxt2 -= c0_nxt5 * b42; c1_nxt2 -= c1_nxt5 * b42; c0_nxt2 -= c0_nxt4 * b34; c1_nxt2 -= c1_nxt4 * b34; c0_nxt2 -= c0_nxt3 * b26; c1_nxt2 -= c1_nxt3 * b26; c0_nxt2 *= b18; c1_nxt2 *= b18; c0_nxt1 -= c0_nxt7 * b57; c1_nxt1 -= c1_nxt7 * b57; c0_nxt1 -= c0_nxt6 * b49; c1_nxt1 -= c1_nxt6 * b49; c0_nxt1 -= c0_nxt5 * b41; c1_nxt1 -= c1_nxt5 * b41; c0_nxt1 -= c0_nxt4 * b33; c1_nxt1 -= c1_nxt4 * b33; c0_nxt1 -= c0_nxt3 * b25; c1_nxt1 -= c1_nxt3 * b25; c0_nxt1 -= c0_nxt2 * b17; c1_nxt1 -= c1_nxt2 * b17; c0_nxt1 *= b9; c1_nxt1 *= b9; c0 -= c0_nxt7 * b56; c1 -= c1_nxt7 * b56; c0 -= c0_nxt6 * b48; c1 -= c1_nxt6 * b48; c0 -= c0_nxt5 * b40; c1 -= c1_nxt5 * b40; c0 -= c0_nxt4 * b32; c1 -= c1_nxt4 * b32; c0 -= c0_nxt3 * b24; c1 -= c1_nxt3 * b24; c0 -= c0_nxt2 * 
b16; c1 -= c1_nxt2 * b16; c0 -= c0_nxt1 * b8; c1 -= c1_nxt1 * b8; c0 *= b0; c1 *= b0; *(a + 0) = c0; *(a + 1) = c1; *(a + 2) = c0_nxt1; *(a + 3) = c1_nxt1; *(a + 4) = c0_nxt2; *(a + 5) = c1_nxt2; *(a + 6) = c0_nxt3; *(a + 7) = c1_nxt3; *(a + 8) = c0_nxt4; *(a + 9) = c1_nxt4; *(a + 10) = c0_nxt5; *(a + 11) = c1_nxt5; *(a + 12) = c0_nxt6; *(a + 13) = c1_nxt6; *(a + 14) = c0_nxt7; *(a + 15) = c1_nxt7; *(c + 0) = c0; *(c + 1) = c1; *(c + 0 + 1 * ldc) = c0_nxt1; *(c + 1 + 1 * ldc) = c1_nxt1; *(c + 0 + 2 * ldc) = c0_nxt2; *(c + 1 + 2 * ldc) = c1_nxt2; *(c + 0 + 3 * ldc) = c0_nxt3; *(c + 1 + 3 * ldc) = c1_nxt3; *(c + 0 + 4 * ldc) = c0_nxt4; *(c + 1 + 4 * ldc) = c1_nxt4; *(c + 0 + 5 * ldc) = c0_nxt5; *(c + 1 + 5 * ldc) = c1_nxt5; *(c + 0 + 6 * ldc) = c0_nxt6; *(c + 1 + 6 * ldc) = c1_nxt6; *(c + 0 + 7 * ldc) = c0_nxt7; *(c + 1 + 7 * ldc) = c1_nxt7; } static void ssolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15; FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; c0 = *(c + 0); c1 = *(c + 1); c0_nxt1 = *(c + 0 + 1 * ldc); c1_nxt1 = *(c + 1 + 1 * ldc); c0_nxt2 = *(c + 0 + 2 * ldc); c1_nxt2 = *(c + 1 + 2 * ldc); c0_nxt3 = *(c + 0 + 3 * ldc); c1_nxt3 = *(c + 1 + 3 * ldc); for (k = 0; k < bk; k++) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; c0_nxt1 -= aa[0] * bb[1]; c1_nxt1 -= aa[1] * bb[1]; c0_nxt2 -= aa[0] * bb[2]; c1_nxt2 -= aa[1] * bb[2]; c0_nxt3 -= aa[0] * bb[3]; c1_nxt3 -= aa[1] * bb[3]; aa += 2; bb += 4; } a -= 8; b -= 16; b0 = *b; b4 = *(b + 4); b5 = *(b + 5); b8 = *(b + 8); b9 = *(b + 9); b10 = *(b + 10); b12 = *(b + 12); b13 = *(b + 13); b14 = *(b + 14); b15 = *(b + 15); c0_nxt3 *= b15; c1_nxt3 *= b15; c0_nxt2 = (c0_nxt2 - c0_nxt3 * b14) * b10; c1_nxt2 = (c1_nxt2 - c1_nxt3 * b14) * b10; c0_nxt1 = ((c0_nxt1 - c0_nxt3 * b13) - c0_nxt2 * b9) * b5; c1_nxt1 = ((c1_nxt1 - c1_nxt3 * b13) - c1_nxt2 * b9) * b5; c0 = (((c0 - c0_nxt3 * b12) - c0_nxt2 * b8) - c0_nxt1 * b4) * b0; c1 = (((c1 - c1_nxt3 * b12) - c1_nxt2 * b8) - c1_nxt1 * b4) * b0; *(a + 0) = c0; *(a + 1) = c1; *(a + 2) = c0_nxt1; *(a + 3) = c1_nxt1; *(a + 4) = c0_nxt2; *(a + 5) = c1_nxt2; *(a + 6) = c0_nxt3; *(a + 7) = c1_nxt3; *(c + 0) = c0; *(c + 1) = c1; *(c + 0 + 1 * ldc) = c0_nxt1; *(c + 1 + 1 * ldc) = c1_nxt1; *(c + 0 + 2 * ldc) = c0_nxt2; *(c + 1 + 2 * ldc) = c1_nxt2; *(c + 0 + 3 * ldc) = c0_nxt3; *(c + 1 + 3 * ldc) = c1_nxt3; } static void ssolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; FLOAT b0, b2, b3, c0, c1, c0_nxt, c1_nxt; c0 = *(c + 0); c1 = *(c + 1); c0_nxt = *(c + 0 + ldc); c1_nxt = *(c + 1 + ldc); for (k = 0; k < bk; k++) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; c0_nxt -= aa[0] * bb[1]; c1_nxt -= aa[1] * bb[1]; aa += 2; bb += 2; } a -= 4; b -= 4; b3 = *(b + 3); b2 = *(b + 2); b0 = *b; c0_nxt *= b3; c1_nxt *= b3; c0 -= c0_nxt * b2; c1 -= c1_nxt * b2; c0 *= b0; c1 *= b0; *(a + 0) = c0; *(a + 1) = c1; *(a + 2) = c0_nxt; *(a + 3) = c1_nxt; *(c + 0) = c0; *(c + 1) = c1; *(c + 0 + ldc) = c0_nxt; *(c + 1 + ldc) = c1_nxt; } static void ssolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; FLOAT b0, c0, c1; c0 = *(c + 0); c1 = *(c + 1); for (k = 0; k < bk; k++) { c0 -= aa[0] * bb[0]; c1 -= aa[1] * bb[0]; aa += 2; bb += 1; } a -= 2; b -= 1; b0 = *b; c0 *= b0; c1 *= b0; *(a + 0) = c0; *(a + 1) = c1; *(c + 0) = c0; *(c + 1) = c1; } static void ssolve_1x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG 
ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; FLOAT b0, b8, b9, b16, b17, b18, b24, b25, b26, b27, b32, b33, b34, b35; FLOAT b36, b40, b41, b42, b43, b44, b45, b48, b49, b50, b51, b52, b53, b54; FLOAT b56, b57, b58, b59, b60, b61, b62, b63; FLOAT c0, c1, c2, c3, c4, c5, c6, c7; c0 = *(c + 0); c1 = *(c + 1 * ldc); c2 = *(c + 2 * ldc); c3 = *(c + 3 * ldc); c4 = *(c + 4 * ldc); c5 = *(c + 5 * ldc); c6 = *(c + 6 * ldc); c7 = *(c + 7 * ldc); for (k = 0; k < bk; k++) { c0 -= aa[0] * bb[0]; c1 -= aa[0] * bb[1]; c2 -= aa[0] * bb[2]; c3 -= aa[0] * bb[3]; c4 -= aa[0] * bb[4]; c5 -= aa[0] * bb[5]; c6 -= aa[0] * bb[6]; c7 -= aa[0] * bb[7]; aa += 1; bb += 8; } a -= 8; b -= 64; b0 = *(b + 0); b8 = *(b + 8); b9 = *(b + 9); b16 = *(b + 16); b17 = *(b + 17); b18 = *(b + 18); b24 = *(b + 24); b25 = *(b + 25); b26 = *(b + 26); b27 = *(b + 27); b32 = *(b + 32); b33 = *(b + 33); b34 = *(b + 34); b35 = *(b + 35); b36 = *(b + 36); b40 = *(b + 40); b41 = *(b + 41); b42 = *(b + 42); b43 = *(b + 43); b44 = *(b + 44); b45 = *(b + 45); b48 = *(b + 48); b49 = *(b + 49); b50 = *(b + 50); b51 = *(b + 51); b52 = *(b + 52); b53 = *(b + 53); b54 = *(b + 54); b56 = *(b + 56); b57 = *(b + 57); b58 = *(b + 58); b59 = *(b + 59); b60 = *(b + 60); b61 = *(b + 61); b62 = *(b + 62); b63 = *(b + 63); c7 *= b63; c6 -= c7 * b62; c6 *= b54; c5 -= c7 * b61; c5 -= c6 * b53; c5 *= b45; c4 -= c7 * b60; c4 -= c6 * b52; c4 -= c5 * b44; c4 *= b36; c3 -= c7 * b59; c3 -= c6 * b51; c3 -= c5 * b43; c3 -= c4 * b35; c3 *= b27; c2 -= c7 * b58; c2 -= c6 * b50; c2 -= c5 * b42; c2 -= c4 * b34; c2 -= c3 * b26; c2 *= b18; c1 -= c7 * b57; c1 -= c6 * b49; c1 -= c5 * b41; c1 -= c4 * b33; c1 -= c3 * b25; c1 -= c2 * b17; c1 *= b9; c0 -= c7 * b56; c0 -= c6 * b48; c0 -= c5 * b40; c0 -= c4 * b32; c0 -= c3 * b24; c0 -= c2 * b16; c0 -= c1 * b8; c0 *= b0; *(a + 0) = c0; *(a + 1) = c1; *(a + 2) = c2; *(a + 3) = c3; *(a + 4) = c4; *(a + 5) = c5; *(a + 6) = c6; *(a + 7) = c7; *(c + 0) = c0; *(c + 1 * ldc) = c1; *(c + 2 * ldc) = c2; *(c + 3 * ldc) = c3; *(c + 4 * ldc) = c4; *(c + 5 * ldc) = c5; *(c + 6 * ldc) = c6; *(c + 7 * ldc) = c7; } static void ssolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15; FLOAT c0, c1, c2, c3; c0 = *(c + 0); c1 = *(c + 1 * ldc); c2 = *(c + 2 * ldc); c3 = *(c + 3 * ldc); for (k = 0; k < bk; k++) { c0 -= aa[0] * bb[0]; c1 -= aa[0] * bb[1]; c2 -= aa[0] * bb[2]; c3 -= aa[0] * bb[3]; aa += 1; bb += 4; } a -= 4; b -= 16; b0 = *b; b4 = *(b + 4); b5 = *(b + 5); b8 = *(b + 8); b9 = *(b + 9); b10 = *(b + 10); b12 = *(b + 12); b13 = *(b + 13); b14 = *(b + 14); b15 = *(b + 15); c3 *= b15; c2 = (c2 - c3 * b14) * b10; c1 = ((c1 - c3 * b13) - c2 * b9) * b5; c0 = (((c0 - c3 * b12) - c2 * b8) - c1 * b4) * b0; *(a + 0) = c0; *(a + 1) = c1; *(a + 2) = c2; *(a + 3) = c3; *(c) = c0; *(c + 1 * ldc) = c1; *(c + 2 * ldc) = c2; *(c + 3 * ldc) = c3; } static void ssolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) { BLASLONG k; FLOAT *aa = a, *bb = b; FLOAT b0, b2, b3, c0, c1; c0 = *(c + 0); c1 = *(c + ldc); for (k = 0; k < bk; k++) { c0 -= aa[0] * bb[0]; c1 -= aa[0] * bb[1]; aa += 1; bb += 2; } a -= 2; b -= 4; b3 = *(b + 3); b2 = *(b + 2); b0 = *b; c1 *= b3; c0 -= c1 * b2; c0 *= b0; *(a + 0) = c0; *(a + 1) = c1; *(c + 0) = c0; *(c + ldc) = c1; } static void ssolve_1x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) { BLASLONG k; for (k = 0; k < bk; k++) { *c -= a[k] * b[k]; } *c *= *(a - 1); *(b - 1) = *c; } int 
CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset) { FLOAT *aa, *cc; BLASLONG i, j, kk; kk = n - offset; c += n * ldc; b += n * k; if (n & 7) { if (n & 1) { aa = a; b -= k; c -= ldc; cc = c; for (i = (m >> 3); i--;) { ssolve_8x1_rt_msa(aa + 8 * kk, b + kk, cc, (k - kk)); aa += 8 * k; cc += 8; } if (m & 7) { if (m & 4) { ssolve_4x1_rt_msa(aa + 4 * kk, b + kk, cc, (k - kk)); aa += 4 * k; cc += 4; } if (m & 2) { ssolve_2x1_rt_msa(aa + 2 * kk, b + kk, cc, (k - kk)); aa += 2 * k; cc += 2; } if (m & 1) { ssolve_1x1_rt_msa(b + kk, aa + kk, cc, (k - kk)); aa += k; cc += 1; } } kk -= 1; } if (n & 2) { aa = a; b -= 2 * k; c -= 2 * ldc; cc = c; for (i = (m >> 3); i--;) { ssolve_8x2_rt_msa(aa + 8 * kk, b + 2 * kk, cc, ldc, (k - kk)); aa += 8 * k; cc += 8; } if (m & 7) { if (m & 4) { ssolve_4x2_rt_msa(aa + 4 * kk, b + 2 * kk, cc, ldc, (k - kk)); aa += 4 * k; cc += 4; } if (m & 2) { ssolve_2x2_rt_msa(aa + 2 * kk, b + 2 * kk, cc, ldc, (k - kk)); aa += 2 * k; cc += 2; } if (m & 1) { ssolve_1x2_rt_msa(aa + kk, b + 2 * kk, cc, ldc, (k - kk)); aa += k; cc += 1; } } kk -= 2; } if (n & 4) { aa = a; b -= 4 * k; c -= 4 * ldc; cc = c; for (i = (m >> 3); i--;) { ssolve_8x4_rt_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, (k - kk)); aa += 8 * k; cc += 8; } if (m & 7) { if (m & 4) { ssolve_4x4_rt_msa(aa + 4 * kk, b + 4 * kk, cc, ldc, (k - kk)); aa += 4 * k; cc += 4; } if (m & 2) { ssolve_2x4_rt_msa(aa + 2 * kk, b + 4 * kk, cc, ldc, (k - kk)); aa += 2 * k; cc += 2; } if (m & 1) { ssolve_1x4_rt_msa(aa + kk, b + 4 * kk, cc, ldc, (k - kk)); aa += k; cc += 1; } } kk -= 4; } } for (j = (n >> 3); j--;) { aa = a; b -= 8 * k; c -= 8 * ldc; cc = c; for (i = (m >> 3); i--;) { ssolve_8x8_rt_msa(aa + 8 * kk, b + 8 * kk, cc, ldc, (k - kk)); aa += 8 * k; cc += 8; } if (m & 7) { if (m & 4) { ssolve_4x8_rt_msa(aa + 4 * kk, b + 8 * kk, cc, ldc, (k - kk)); aa += 4 * k; cc += 4; } if (m & 2) { ssolve_2x8_rt_msa(aa + 2 * kk, b + 8 * kk, cc, ldc, (k - kk)); aa += 2 * k; cc += 2; } if (m & 1) { ssolve_1x8_rt_msa(aa + kk, b + 8 * kk, cc, ldc, (k - kk)); aa += k; cc += 1; } } kk -= 8; } return 0; } OpenBLAS-0.2.20/kernel/mips/swap.c000066400000000000000000000040041313527062700165100ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
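/*
 * Illustration only, not part of the OpenBLAS build: the ssolve_*_rt solve
 * tails above multiply by stored factors (b0, b3, b15, ...) rather than
 * dividing, which is consistent with the triangular factor being packed with
 * its diagonal entries already inverted. A minimal standalone sketch of that
 * multiply-by-reciprocal back-substitution on a plain 2x2 lower-triangular
 * system, under that assumption:
 */
#include <stdio.h>

int main(void)
{
    double l10    = 3.0;          /* strictly lower entry L(1,0)            */
    double inv_d0 = 1.0 / 2.0;    /* reciprocal of the diagonal L(0,0) = 2  */
    double inv_d1 = 1.0 / 4.0;    /* reciprocal of the diagonal L(1,1) = 4  */
    double r0 = 4.0, r1 = 14.0;   /* right-hand side                        */

    /* forward substitution using only multiplications, as in the kernels */
    double x0 = r0 * inv_d0;                  /* x0 = r0 / L(0,0)      -> 2 */
    double x1 = (r1 - l10 * x0) * inv_d1;     /* x1 = (r1 - L(1,0)x0)/L(1,1) -> 2 */

    printf("x = (%g, %g)\n", x0, x1);         /* expected: (2, 2)           */
    return 0;
}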
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; BLASLONG ix=0,iy=0; FLOAT temp; if ( n < 0 ) return(0); while(i < n) { temp = x[ix] ; x[ix] = y[iy] ; y[iy] = temp ; ix += inc_x ; iy += inc_y ; i++ ; } return(0); } OpenBLAS-0.2.20/kernel/mips/symv_L.c000066400000000000000000000044361313527062700170200ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
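/*
 * Illustration only: the generic swap kernel above is reached through the
 * BLAS ?swap interface. A minimal usage sketch for the double-precision
 * variant, assuming an OpenBLAS build with the CBLAS interface enabled and
 * its conventional <cblas.h> header (link with -lopenblas):
 */
#include <stdio.h>
#include <cblas.h>   /* assumption: CBLAS header installed by OpenBLAS */

int main(void)
{
    /* two double-precision vectors of length 3, unit stride */
    double x[3] = { 1.0,  2.0,  3.0};
    double y[3] = {-1.0, -2.0, -3.0};

    /* exchanges x and y element-wise, exactly what the kernel's loop does */
    cblas_dswap(3, x, 1, y, 1);

    printf("x = %g %g %g\n", x[0], x[1], x[2]);   /* -1 -2 -3 */
    printf("y = %g %g %g\n", y[0], y[1], y[2]);   /*  1  2  3 */
    return 0;
}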
*****************************************************************************/ #include "common.h" int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { BLASLONG i; BLASLONG ix,iy; BLASLONG jx,jy; BLASLONG j; FLOAT temp1; FLOAT temp2; #if 0 if ( m != offset ) printf("Symv_L: m=%d offset=%d\n",m,offset); #endif jx = 0; jy = 0; for (j=0; j #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT maxf; BLASLONG inc_x2; if (n <= 0 || inc_x <= 0) return(0.0); inc_x2 = 2 * inc_x; maxf = CABS1(x,0); ix += inc_x2; i++; while(i < n) { if( CABS1(x,ix) > maxf ) { maxf = CABS1(x,ix); } ix += inc_x2; i++; } return(maxf); } OpenBLAS-0.2.20/kernel/mips/zamin.c000066400000000000000000000041311313527062700166550ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; FLOAT minf; BLASLONG inc_x2; if (n <= 0 || inc_x <= 0) return(0.0); inc_x2 = 2 * inc_x; minf = CABS1(x,0); ix += inc_x2; i++; while(i < n) { if( CABS1(x,ix) < minf ) { minf = CABS1(x,ix); } ix += inc_x2; i++; } return(minf); } OpenBLAS-0.2.20/kernel/mips/zasum.c000066400000000000000000000040101313527062700166720ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; FLOAT sumf = 0.0; BLASLONG inc_x2; if (n <= 0 || inc_x <= 0) return(sumf); inc_x2 = 2 * inc_x; n *= inc_x2; while(i < n) { sumf += CABS1(x,i); i += inc_x2; } return(sumf); } OpenBLAS-0.2.20/kernel/mips/zasum_msa.c000066400000000000000000000217601313527062700175450ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
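/*
 * Illustration only: the scalar zasum kernel above returns the sum of
 * |Re| + |Im| over the vector (the 1-norm convention BLAS ?asum uses for
 * complex data), not the sum of complex moduli. A small standalone check,
 * assuming <cblas.h> from an OpenBLAS build with CBLAS enabled:
 */
#include <stdio.h>
#include <math.h>
#include <cblas.h>   /* assumption: CBLAS header installed by OpenBLAS */

int main(void)
{
    /* two complex numbers, interleaved (re, im): 3+4i and -1+2i */
    double x[4] = {3.0, 4.0, -1.0, 2.0};

    /* kernel formula: |3| + |4| + |-1| + |2| = 10 */
    double by_hand = fabs(x[0]) + fabs(x[1]) + fabs(x[2]) + fabs(x[3]);

    /* BLAS dzasum over 2 complex elements with unit stride */
    double by_blas = cblas_dzasum(2, x, 1);

    printf("by hand = %g, dzasum = %g\n", by_hand, by_blas);  /* both 10 */
    return 0;
}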
*******************************************************************************/ #include "common.h" #include #include "macros_msa.h" #define AND_VEC_D(in) ((v2f64) ((v2i64) in & and_vec)) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i; FLOAT sumf = 0.0; v2f64 src0, src1, src2, src3, src4, src5, src6, src7; v2f64 src8, src9, src10, src11, src12, src13, src14, src15; v2f64 sum_abs0 = {0, 0}; v2f64 sum_abs1 = {0, 0}; v2f64 sum_abs2 = {0, 0}; v2f64 sum_abs3 = {0, 0}; v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF}; if (n <= 0 || inc_x <= 0) return (sumf); if (1 == inc_x) { if (n > 16) { FLOAT *x_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 64 + 16; LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); for (i = (n >> 4) - 1; i--;) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(x_pref, 128); PREF_OFFSET(x_pref, 160); PREF_OFFSET(x_pref, 192); PREF_OFFSET(x_pref, 224); x_pref += 32; LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); sum_abs2 += AND_VEC_D(src2); sum_abs3 += AND_VEC_D(src3); sum_abs0 += AND_VEC_D(src4); sum_abs1 += AND_VEC_D(src5); sum_abs2 += AND_VEC_D(src6); sum_abs3 += AND_VEC_D(src7); LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); sum_abs0 += AND_VEC_D(src8); sum_abs1 += AND_VEC_D(src9); sum_abs2 += AND_VEC_D(src10); sum_abs3 += AND_VEC_D(src11); sum_abs0 += AND_VEC_D(src12); sum_abs1 += AND_VEC_D(src13); sum_abs2 += AND_VEC_D(src14); sum_abs3 += AND_VEC_D(src15); } LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); sum_abs2 += AND_VEC_D(src2); sum_abs3 += AND_VEC_D(src3); sum_abs0 += AND_VEC_D(src4); sum_abs1 += AND_VEC_D(src5); sum_abs2 += AND_VEC_D(src6); sum_abs3 += AND_VEC_D(src7); sum_abs0 += AND_VEC_D(src8); sum_abs1 += AND_VEC_D(src9); sum_abs2 += AND_VEC_D(src10); sum_abs3 += AND_VEC_D(src11); sum_abs0 += AND_VEC_D(src12); sum_abs1 += AND_VEC_D(src13); sum_abs2 += AND_VEC_D(src14); sum_abs3 += AND_VEC_D(src15); } if (n & 15) { if (n & 8) { LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); sum_abs2 += AND_VEC_D(src2); sum_abs3 += AND_VEC_D(src3); sum_abs0 += AND_VEC_D(src4); sum_abs1 += AND_VEC_D(src5); sum_abs2 += AND_VEC_D(src6); sum_abs3 += AND_VEC_D(src7); } if (n & 4) { LD_DP4_INC(x, 2, src0, src1, src2, src3); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); sum_abs2 += AND_VEC_D(src2); sum_abs3 += AND_VEC_D(src3); } if (n & 2) { LD_DP2_INC(x, 2, src0, src1); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); } if (n & 1) { src0 = LD_DP(x); sum_abs0 += AND_VEC_D(src0); } } sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; sumf = sum_abs0[0] + sum_abs0[1]; } else { inc_x *= 2; if (n > 16) { LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); for (i = (n >> 4) - 1; i--;) { LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); sum_abs2 += AND_VEC_D(src2); sum_abs3 += AND_VEC_D(src3); sum_abs0 += AND_VEC_D(src4); sum_abs1 += AND_VEC_D(src5); sum_abs2 += AND_VEC_D(src6); sum_abs3 += AND_VEC_D(src7); LD_DP8_INC(x, inc_x, src0, src1, src2, 
src3, src4, src5, src6, src7); sum_abs0 += AND_VEC_D(src8); sum_abs1 += AND_VEC_D(src9); sum_abs2 += AND_VEC_D(src10); sum_abs3 += AND_VEC_D(src11); sum_abs0 += AND_VEC_D(src12); sum_abs1 += AND_VEC_D(src13); sum_abs2 += AND_VEC_D(src14); sum_abs3 += AND_VEC_D(src15); } LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); sum_abs2 += AND_VEC_D(src2); sum_abs3 += AND_VEC_D(src3); sum_abs0 += AND_VEC_D(src4); sum_abs1 += AND_VEC_D(src5); sum_abs2 += AND_VEC_D(src6); sum_abs3 += AND_VEC_D(src7); sum_abs0 += AND_VEC_D(src8); sum_abs1 += AND_VEC_D(src9); sum_abs2 += AND_VEC_D(src10); sum_abs3 += AND_VEC_D(src11); sum_abs0 += AND_VEC_D(src12); sum_abs1 += AND_VEC_D(src13); sum_abs2 += AND_VEC_D(src14); sum_abs3 += AND_VEC_D(src15); } if (n & 15) { if (n & 8) { LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); sum_abs2 += AND_VEC_D(src2); sum_abs3 += AND_VEC_D(src3); sum_abs0 += AND_VEC_D(src4); sum_abs1 += AND_VEC_D(src5); sum_abs2 += AND_VEC_D(src6); sum_abs3 += AND_VEC_D(src7); } if (n & 4) { LD_DP4_INC(x, inc_x, src0, src1, src2, src3); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); sum_abs2 += AND_VEC_D(src2); sum_abs3 += AND_VEC_D(src3); } if (n & 2) { LD_DP2_INC(x, inc_x, src0, src1); sum_abs0 += AND_VEC_D(src0); sum_abs1 += AND_VEC_D(src1); } if (n & 1) { src0 = LD_DP(x); sum_abs0 += AND_VEC_D(src0); } } sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; sumf = sum_abs0[0] + sum_abs0[1]; } return (sumf); } OpenBLAS-0.2.20/kernel/mips/zaxpby.c000066400000000000000000000056021313527062700170600ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
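/*
 * Illustration only: the MSA kernel above (see AND_VEC_D) takes absolute
 * values by AND-ing each 64-bit lane with 0x7FFFFFFFFFFFFFFF, which clears
 * the IEEE-754 sign bit instead of calling fabs. A scalar sketch of the same
 * trick, assuming IEEE-754 doubles and using memcpy to avoid aliasing
 * problems; the real kernel applies it to whole v2f64 vectors.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Clear the sign bit of an IEEE-754 double via its bit pattern. */
static double abs_by_mask(double v)
{
    uint64_t bits;
    memcpy(&bits, &v, sizeof bits);       /* reinterpret without UB          */
    bits &= 0x7FFFFFFFFFFFFFFFULL;        /* same mask as and_vec above      */
    memcpy(&v, &bits, sizeof v);
    return v;
}

int main(void)
{
    printf("%g %g %g\n", abs_by_mask(-2.5), abs_by_mask(0.0), abs_by_mask(7.0));
    /* prints: 2.5 0 7 */
    return 0;
}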
*****************************************************************************/ #include "common.h" int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix,iy; FLOAT temp; BLASLONG inc_x2, inc_y2; if ( n <= 0 ) return(0); ix = 0; iy = 0; inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; if ( beta_r == 0.0 && beta_i == 0.0) { if ( alpha_r == 0.0 && alpha_i == 0.0 ) { while(i < n) { y[iy] = 0.0 ; y[iy+1] = 0.0 ; iy += inc_y2 ; i++ ; } } else { while(i < n) { y[iy] = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) ; y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) ; ix += inc_x2 ; iy += inc_y2 ; i++ ; } } } else { if ( alpha_r == 0.0 && alpha_i == 0.0 ) { while(i < n) { temp = ( beta_r * y[iy] - beta_i * y[iy+1] ) ; y[iy+1] = ( beta_r * y[iy+1] + beta_i * y[iy] ) ; y[iy] = temp; iy += inc_y2 ; i++ ; } } else { while(i < n) { temp = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) + ( beta_r * y[iy] - beta_i * y[iy+1] ) ; y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) + ( beta_r * y[iy+1] + beta_i * y[iy] ) ; y[iy] = temp; ix += inc_x2 ; iy += inc_y2 ; i++ ; } } } return(0); } OpenBLAS-0.2.20/kernel/mips/zaxpy.c000066400000000000000000000044611313527062700167200ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
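/*
 * Illustration only: the zaxpby kernel above computes y := alpha*x + beta*y
 * with complex scalars, expanding each product into real and imaginary
 * parts; note that its beta == 0 branches overwrite y without ever reading
 * it, so stale NaN/Inf values in y do not propagate. A minimal standalone
 * check of the expansion against C99 complex arithmetic:
 */
#include <stdio.h>
#include <complex.h>

int main(void)
{
    double complex alpha = 2.0 + 1.0 * I;
    double complex x     = 3.0 - 2.0 * I;
    double complex beta  = 0.5 + 0.0 * I;
    double complex y     = 1.0 + 4.0 * I;

    /* the kernel's real/imaginary expansion of alpha*x + beta*y */
    double ar = creal(alpha), ai = cimag(alpha);
    double br = creal(beta),  bi = cimag(beta);
    double xr = creal(x),     xi = cimag(x);
    double yr = creal(y),     yi = cimag(y);

    double out_r = (ar * xr - ai * xi) + (br * yr - bi * yi);
    double out_i = (ar * xi + ai * xr) + (br * yi + bi * yr);

    double complex ref = alpha * x + beta * y;   /* library complex multiply */

    printf("expanded:  %g%+gi\n", out_r, out_i);          /* 8.5+1i */
    printf("complex.h: %g%+gi\n", creal(ref), cimag(ref)); /* 8.5+1i */
    return 0;
}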
*****************************************************************************/ #include "common.h" int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; BLASLONG ix,iy; BLASLONG inc_x2; BLASLONG inc_y2; if ( n < 0 ) return(0); if ( da_r == 0.0 && da_i == 0.0 ) return(0); ix = 0; iy = 0; inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; while(i < n) { #if !defined(CONJ) y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; #else y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; #endif ix += inc_x2 ; iy += inc_y2 ; i++ ; } return(0); } OpenBLAS-0.2.20/kernel/mips/zaxpy_msa.c000066400000000000000000000377421313527062700175700ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
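/*
 * Illustration only: the zaxpy kernel above accumulates y += alpha*x over
 * interleaved (re, im) data; with CONJ defined, the sign changes make it
 * accumulate alpha * conj(x) instead. A quick standalone check of that
 * identity using C99 complex arithmetic:
 */
#include <stdio.h>
#include <complex.h>

int main(void)
{
    double da_r = 2.0, da_i = -1.0;          /* alpha = 2 - i                */
    double xr = 3.0, xi = 4.0;               /* x = 3 + 4i                   */
    double yr = 0.0, yi = 0.0;               /* accumulator, starts at 0     */

    /* CONJ branch of the kernel above */
    yr += (da_r * xr + da_i * xi);
    yi -= (da_r * xi - da_i * xr);

    /* same thing written as complex arithmetic: y += alpha * conj(x) */
    double complex ref = (da_r + da_i * I) * conj(xr + xi * I);

    printf("kernel: %g%+gi   complex.h: %g%+gi\n",
           yr, yi, creal(ref), cimag(ref));  /* both 2-11i */
    return 0;
}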
*******************************************************************************/ #include "common.h" #include "macros_msa.h" #if !defined(CONJ) #define OP0 += #define OP1 -= #define OP2 += #else #define OP0 -= #define OP1 += #define OP2 -= #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i, inc_x2, inc_y2; FLOAT *py; v2f64 x0, x1, x2, x3, x4, x5, x6, x7; v2f64 y0, y1, y2, y3, y4, y5, y6, y7, dar_vec, dai_vec; v2f64 x0r, x1r, x2r, x3r, x0i, x1i, x2i, x3i; v2f64 y0r, y1r, y2r, y3r, y0i, y1i, y2i, y3i; FLOAT xd0, xd1, yd0, yd1; if (n < 0) return(0); if ((da_r == 0.0) && (da_i == 0.0)) return(0); py = y; dar_vec = COPY_DOUBLE_TO_VECTOR(da_r); dai_vec = COPY_DOUBLE_TO_VECTOR(da_i); if ((1 == inc_x) && (1 == inc_y)) { FLOAT *x_pref, *y_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 32; pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } y_pref = y + pref_offset + 32; for (i = (n >> 3); i--;) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(y_pref, 0); PREF_OFFSET(y_pref, 32); PREF_OFFSET(y_pref, 64); PREF_OFFSET(y_pref, 96); x_pref += 16; y_pref += 16; LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7); PCKEVOD_D2_DP(x1, x0, x0r, x0i); PCKEVOD_D2_DP(y1, y0, y0r, y0i); PCKEVOD_D2_DP(x3, x2, x1r, x1i); PCKEVOD_D2_DP(y3, y2, y1r, y1i); PCKEVOD_D2_DP(x5, x4, x2r, x2i); PCKEVOD_D2_DP(y5, y4, y2r, y2i); PCKEVOD_D2_DP(x7, x6, x3r, x3i); PCKEVOD_D2_DP(y7, y6, y3r, y3i); FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); y0i OP0 dar_vec * x0i; y1i OP0 dar_vec * x1i; y2i OP0 dar_vec * x2i; y3i OP0 dar_vec * x3i; y0r OP1 dai_vec * x0i; y1r OP1 dai_vec * x1i; y2r OP1 dai_vec * x2i; y3r OP1 dai_vec * x3i; y0i OP2 dai_vec * x0r; y1i OP2 dai_vec * x1r; y2i OP2 dai_vec * x2r; y3i OP2 dai_vec * x3r; ILVRL_D2_DP(y0i, y0r, y0, y1); ILVRL_D2_DP(y1i, y1r, y2, y3); ILVRL_D2_DP(y2i, y2r, y4, y5); ILVRL_D2_DP(y3i, y3r, y6, y7); ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2); } if (n & 7) { if (n & 4) { LD_DP4_INC(x, 2, x0, x1, x2, x3); LD_DP4_INC(py, 2, y0, y1, y2, y3); PCKEVOD_D2_DP(x1, x0, x0r, x0i); PCKEVOD_D2_DP(y1, y0, y0r, y0i); PCKEVOD_D2_DP(x3, x2, x1r, x1i); PCKEVOD_D2_DP(y3, y2, y1r, y1i); FMADD2(x0r, x1r, dar_vec, y0r, y1r); y0i OP0 dar_vec * x0i; y1i OP0 dar_vec * x1i; y0r OP1 dai_vec * x0i; y1r OP1 dai_vec * x1i; y0i OP2 dai_vec * x0r; y1i OP2 dai_vec * x1r; ILVRL_D2_DP(y0i, y0r, y0, y1); ILVRL_D2_DP(y1i, y1r, y2, y3); ST_DP4_INC(y0, y1, y2, y3, y, 2); } if (n & 2) { LD_DP2_INC(x, 2, x0, x1); LD_DP2_INC(py, 2, y0, y1); PCKEVOD_D2_DP(x1, x0, x0r, x0i); PCKEVOD_D2_DP(y1, y0, y0r, y0i); y0r += dar_vec * x0r; y0i OP0 dar_vec * x0i; y0r OP1 dai_vec * x0i; y0i OP2 dai_vec * x0r; ILVRL_D2_DP(y0i, y0r, y0, y1); ST_DP2_INC(y0, y1, y, 2); } if (n & 1) { LD_GP2_INC(x, 1, xd0, xd1); LD_GP2_INC(py, 1, yd0, yd1); yd0 += da_r * xd0; yd1 OP0 da_r * xd1; yd0 OP1 da_i * xd1; yd1 OP2 da_i * xd0; ST_GP2_INC(yd0, yd1, y, 1); } } } else if (1 == inc_y) { FLOAT *y_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - 
pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } y_pref = y + pref_offset + 32; inc_x2 = 2 * inc_x; for (i = (n >> 3); i--;) { PREF_OFFSET(y_pref, 0); PREF_OFFSET(y_pref, 32); PREF_OFFSET(y_pref, 64); PREF_OFFSET(y_pref, 96); y_pref += 16; LD_DP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7); PCKEVOD_D2_DP(x1, x0, x0r, x0i); PCKEVOD_D2_DP(y1, y0, y0r, y0i); PCKEVOD_D2_DP(x3, x2, x1r, x1i); PCKEVOD_D2_DP(y3, y2, y1r, y1i); PCKEVOD_D2_DP(x5, x4, x2r, x2i); PCKEVOD_D2_DP(y5, y4, y2r, y2i); PCKEVOD_D2_DP(x7, x6, x3r, x3i); PCKEVOD_D2_DP(y7, y6, y3r, y3i); FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); y0i OP0 dar_vec * x0i; y1i OP0 dar_vec * x1i; y2i OP0 dar_vec * x2i; y3i OP0 dar_vec * x3i; y0r OP1 dai_vec * x0i; y1r OP1 dai_vec * x1i; y2r OP1 dai_vec * x2i; y3r OP1 dai_vec * x3i; y0i OP2 dai_vec * x0r; y1i OP2 dai_vec * x1r; y2i OP2 dai_vec * x2r; y3i OP2 dai_vec * x3r; ILVRL_D2_DP(y0i, y0r, y0, y1); ILVRL_D2_DP(y1i, y1r, y2, y3); ILVRL_D2_DP(y2i, y2r, y4, y5); ILVRL_D2_DP(y3i, y3r, y6, y7); ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2); } if (n & 7) { if (n & 4) { LD_DP4_INC(x, inc_x2, x0, x1, x2, x3); LD_DP4_INC(py, 2, y0, y1, y2, y3); PCKEVOD_D2_DP(x1, x0, x0r, x0i); PCKEVOD_D2_DP(y1, y0, y0r, y0i); PCKEVOD_D2_DP(x3, x2, x1r, x1i); PCKEVOD_D2_DP(y3, y2, y1r, y1i); FMADD2(x0r, x1r, dar_vec, y0r, y1r); y0i OP0 dar_vec * x0i; y1i OP0 dar_vec * x1i; y0r OP1 dai_vec * x0i; y1r OP1 dai_vec * x1i; y0i OP2 dai_vec * x0r; y1i OP2 dai_vec * x1r; ILVRL_D2_DP(y0i, y0r, y0, y1); ILVRL_D2_DP(y1i, y1r, y2, y3); ST_DP4_INC(y0, y1, y2, y3, y, 2); } if (n & 2) { LD_DP2_INC(x, inc_x2, x0, x1); LD_DP2_INC(py, 2, y0, y1); PCKEVOD_D2_DP(x1, x0, x0r, x0i); PCKEVOD_D2_DP(y1, y0, y0r, y0i); y0r += dar_vec * x0r; y0i OP0 dar_vec * x0i; y0r OP1 dai_vec * x0i; y0i OP2 dai_vec * x0r; ILVRL_D2_DP(y0i, y0r, y0, y1); ST_DP2_INC(y0, y1, y, 2); } if (n & 1) { LD_GP2_INC(x, 1, xd0, xd1); LD_GP2_INC(py, 1, yd0, yd1); yd0 += da_r * xd0; yd1 OP0 da_r * xd1; yd0 OP1 da_i * xd1; yd1 OP2 da_i * xd0; ST_GP2_INC(yd0, yd1, y, 1); } } } else if (1 == inc_x) { FLOAT *x_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 32; inc_y2 = 2 * inc_y; for (i = (n >> 3); i--;) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); x_pref += 16; LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7); PCKEVOD_D2_DP(x1, x0, x0r, x0i); PCKEVOD_D2_DP(y1, y0, y0r, y0i); PCKEVOD_D2_DP(x3, x2, x1r, x1i); PCKEVOD_D2_DP(y3, y2, y1r, y1i); PCKEVOD_D2_DP(x5, x4, x2r, x2i); PCKEVOD_D2_DP(y5, y4, y2r, y2i); PCKEVOD_D2_DP(x7, x6, x3r, x3i); PCKEVOD_D2_DP(y7, y6, y3r, y3i); FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); y0i OP0 dar_vec * x0i; y1i OP0 dar_vec * x1i; y2i OP0 dar_vec * x2i; y3i OP0 dar_vec * x3i; y0r OP1 dai_vec * x0i; y1r OP1 dai_vec * x1i; y2r OP1 dai_vec * x2i; y3r OP1 dai_vec * x3i; y0i OP2 dai_vec * x0r; y1i OP2 dai_vec * x1r; y2i OP2 dai_vec * x2r; y3i OP2 dai_vec * x3r; ILVRL_D2_DP(y0i, y0r, y0, y1); ILVRL_D2_DP(y1i, y1r, y2, y3); ILVRL_D2_DP(y2i, y2r, y4, y5); ILVRL_D2_DP(y3i, y3r, y6, y7); ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2); } if (n & 7) { if (n & 4) { LD_DP4_INC(x, 2, x0, x1, x2, x3); LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); PCKEVOD_D2_DP(x1, x0, x0r, x0i); PCKEVOD_D2_DP(y1, 
y0, y0r, y0i); PCKEVOD_D2_DP(x3, x2, x1r, x1i); PCKEVOD_D2_DP(y3, y2, y1r, y1i); FMADD2(x0r, x1r, dar_vec, y0r, y1r); y0i OP0 dar_vec * x0i; y1i OP0 dar_vec * x1i; y0r OP1 dai_vec * x0i; y1r OP1 dai_vec * x1i; y0i OP2 dai_vec * x0r; y1i OP2 dai_vec * x1r; ILVRL_D2_DP(y0i, y0r, y0, y1); ILVRL_D2_DP(y1i, y1r, y2, y3); ST_DP4_INC(y0, y1, y2, y3, y, inc_y2); } if (n & 2) { LD_DP2_INC(x, 2, x0, x1); LD_DP2_INC(py, inc_y2, y0, y1); PCKEVOD_D2_DP(x1, x0, x0r, x0i); PCKEVOD_D2_DP(y1, y0, y0r, y0i); y0r += dar_vec * x0r; y0i OP0 dar_vec * x0i; y0r OP1 dai_vec * x0i; y0i OP2 dai_vec * x0r; ILVRL_D2_DP(y0i, y0r, y0, y1); ST_DP2_INC(y0, y1, y, inc_y2); } if (n & 1) { LD_GP2_INC(x, 1, xd0, xd1); LD_GP2_INC(py, 1, yd0, yd1); yd0 += da_r * xd0; yd1 OP0 da_r * xd1; yd0 OP1 da_i * xd1; yd1 OP2 da_i * xd0; ST_GP2_INC(yd0, yd1, y, 1); } } } else { inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; for (i = (n >> 3); i--;) { LD_DP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7); PCKEVOD_D2_DP(x1, x0, x0r, x0i); PCKEVOD_D2_DP(y1, y0, y0r, y0i); PCKEVOD_D2_DP(x3, x2, x1r, x1i); PCKEVOD_D2_DP(y3, y2, y1r, y1i); PCKEVOD_D2_DP(x5, x4, x2r, x2i); PCKEVOD_D2_DP(y5, y4, y2r, y2i); PCKEVOD_D2_DP(x7, x6, x3r, x3i); PCKEVOD_D2_DP(y7, y6, y3r, y3i); FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); y0i OP0 dar_vec * x0i; y1i OP0 dar_vec * x1i; y2i OP0 dar_vec * x2i; y3i OP0 dar_vec * x3i; y0r OP1 dai_vec * x0i; y1r OP1 dai_vec * x1i; y2r OP1 dai_vec * x2i; y3r OP1 dai_vec * x3i; y0i OP2 dai_vec * x0r; y1i OP2 dai_vec * x1r; y2i OP2 dai_vec * x2r; y3i OP2 dai_vec * x3r; ILVRL_D2_DP(y0i, y0r, y0, y1); ILVRL_D2_DP(y1i, y1r, y2, y3); ILVRL_D2_DP(y2i, y2r, y4, y5); ILVRL_D2_DP(y3i, y3r, y6, y7); ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2); } if (n & 7) { if (n & 4) { LD_DP4_INC(x, inc_x2, x0, x1, x2, x3); LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); PCKEVOD_D2_DP(x1, x0, x0r, x0i); PCKEVOD_D2_DP(y1, y0, y0r, y0i); PCKEVOD_D2_DP(x3, x2, x1r, x1i); PCKEVOD_D2_DP(y3, y2, y1r, y1i); FMADD2(x0r, x1r, dar_vec, y0r, y1r); y0i OP0 dar_vec * x0i; y1i OP0 dar_vec * x1i; y0r OP1 dai_vec * x0i; y1r OP1 dai_vec * x1i; y0i OP2 dai_vec * x0r; y1i OP2 dai_vec * x1r; ILVRL_D2_DP(y0i, y0r, y0, y1); ILVRL_D2_DP(y1i, y1r, y2, y3); ST_DP4_INC(y0, y1, y2, y3, y, inc_y2); } if (n & 2) { LD_DP2_INC(x, inc_x2, x0, x1); LD_DP2_INC(py, inc_y2, y0, y1); PCKEVOD_D2_DP(x1, x0, x0r, x0i); PCKEVOD_D2_DP(y1, y0, y0r, y0i); y0r += dar_vec * x0r; y0i OP0 dar_vec * x0i; y0r OP1 dai_vec * x0i; y0i OP2 dai_vec * x0r; ILVRL_D2_DP(y0i, y0r, y0, y1); ST_DP2_INC(y0, y1, y, inc_y2); } if (n & 1) { LD_GP2_INC(x, 1, xd0, xd1); LD_GP2_INC(py, 1, yd0, yd1); yd0 += da_r * xd0; yd1 OP0 da_r * xd1; yd0 OP1 da_i * xd1; yd1 OP2 da_i * xd0; ST_GP2_INC(yd0, yd1, y, 1); } } } return (0); } OpenBLAS-0.2.20/kernel/mips/zcopy.c000066400000000000000000000037301313527062700167070ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix=0,iy=0; BLASLONG inc_x2; BLASLONG inc_y2; if ( n < 0 ) return(0); inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; while(i < n) { y[iy] = x[ix] ; y[iy+1] = x[ix+1] ; ix += inc_x2; iy += inc_y2; i++ ; } return(0); } OpenBLAS-0.2.20/kernel/mips/zcopy_msa.c000066400000000000000000000164001313527062700175450ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
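/*
 * Illustration only: the generic zcopy kernel above copies interleaved
 * (re, im) pairs with arbitrary strides. A minimal usage sketch through the
 * CBLAS interface, assuming <cblas.h> from an OpenBLAS build with CBLAS
 * enabled (link with -lopenblas):
 */
#include <stdio.h>
#include <cblas.h>   /* assumption: CBLAS header installed by OpenBLAS */

int main(void)
{
    /* three double-precision complex values, interleaved (re, im) */
    double x[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
    double y[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};

    /* copy 3 complex elements from x to y, both with unit stride */
    cblas_zcopy(3, x, 1, y, 1);

    printf("y = %g%+gi %g%+gi %g%+gi\n",
           y[0], y[1], y[2], y[3], y[4], y[5]);
    return 0;
}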
*******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i; v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; FLOAT f0, f1; if (n < 0) return (0); if ((1 == inc_x) && (1 == inc_y)) { if (n > 15) { FLOAT *x_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 64 + 16; LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); for (i = (n >> 4) - 1; i--;) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(x_pref, 128); PREF_OFFSET(x_pref, 160); PREF_OFFSET(x_pref, 192); PREF_OFFSET(x_pref, 224); x_pref += 32; x8 = LD_DP(x); x += 2; ST_DP(x0, y); y += 2; x9 = LD_DP(x); x += 2; ST_DP(x1, y); y += 2; x10 = LD_DP(x); x += 2; ST_DP(x2, y); y += 2; x11 = LD_DP(x); x += 2; ST_DP(x3, y); y += 2; x12 = LD_DP(x); x += 2; ST_DP(x4, y); y += 2; x13 = LD_DP(x); x += 2; ST_DP(x5, y); y += 2; x14 = LD_DP(x); x += 2; ST_DP(x6, y); y += 2; x15 = LD_DP(x); x += 2; ST_DP(x7, y); y += 2; x0 = LD_DP(x); x += 2; ST_DP(x8, y); y += 2; x1 = LD_DP(x); x += 2; ST_DP(x9, y); y += 2; x2 = LD_DP(x); x += 2; ST_DP(x10, y); y += 2; x3 = LD_DP(x); x += 2; ST_DP(x11, y); y += 2; x4 = LD_DP(x); x += 2; ST_DP(x12, y); y += 2; x5 = LD_DP(x); x += 2; ST_DP(x13, y); y += 2; x6 = LD_DP(x); x += 2; ST_DP(x14, y); y += 2; x7 = LD_DP(x); x += 2; ST_DP(x15, y); y += 2; } x8 = LD_DP(x); x += 2; x9 = LD_DP(x); x += 2; ST_DP(x0, y); y += 2; x10 = LD_DP(x); x += 2; ST_DP(x1, y); y += 2; x11 = LD_DP(x); x += 2; ST_DP(x2, y); y += 2; x12 = LD_DP(x); x += 2; ST_DP(x3, y); y += 2; x13 = LD_DP(x); x += 2; ST_DP(x4, y); y += 2; x14 = LD_DP(x); x += 2; ST_DP(x5, y); y += 2; x15 = LD_DP(x); x += 2; ST_DP(x6, y); y += 2; ST_DP(x7, y); y += 2; ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 2); } if (n & 15) { if (n & 8) { LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 2); } if (n & 4) { LD_DP4_INC(x, 2, x0, x1, x2, x3); ST_DP4_INC(x0, x1, x2, x3, y, 2); } if (n & 2) { LD_DP2_INC(x, 2, x0, x1); ST_DP2_INC(x0, x1, y, 2); } if (n & 1) { LD_GP2_INC(x, 1, f0, f1); ST_GP2_INC(f0, f1, y, 1); } } } else { inc_x *= 2; inc_y *= 2; for (i = (n >> 4); i--;) { x0 = LD_DP(x); x += inc_x; x1 = LD_DP(x); x += inc_x; x2 = LD_DP(x); x += inc_x; x3 = LD_DP(x); x += inc_x; x4 = LD_DP(x); x += inc_x; x5 = LD_DP(x); x += inc_x; x6 = LD_DP(x); x += inc_x; x7 = LD_DP(x); x += inc_x; x8 = LD_DP(x); x += inc_x; ST_DP(x0, y); y += inc_y; x9 = LD_DP(x); x += inc_x; ST_DP(x1, y); y += inc_y; x10 = LD_DP(x); x += inc_x; ST_DP(x2, y); y += inc_y; x11 = LD_DP(x); x += inc_x; ST_DP(x3, y); y += inc_y; x12 = LD_DP(x); x += inc_x; ST_DP(x4, y); y += inc_y; x13 = LD_DP(x); x += inc_x; ST_DP(x5, y); y += inc_y; x14 = LD_DP(x); x += inc_x; ST_DP(x6, y); y += inc_y; x15 = LD_DP(x); x += inc_x; ST_DP(x7, y); y += inc_y; ST_DP(x8, y); y += inc_y; ST_DP(x9, y); y += inc_y; ST_DP(x10, y); y += inc_y; ST_DP(x11, y); y += inc_y; ST_DP(x12, y); y += inc_y; ST_DP(x13, y); y += inc_y; ST_DP(x14, y); y += inc_y; ST_DP(x15, y); y += inc_y; } if (n & 15) { if (n & 8) { LD_DP8_INC(x, inc_x, x0, x1, x2, x3, x4, x5, x6, x7); ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, inc_y); } if (n & 4) { LD_DP4_INC(x, inc_x, x0, x1, x2, x3); ST_DP4_INC(x0, x1, x2, 
x3, y, inc_y); } if (n & 2) { LD_DP2_INC(x, inc_x, x0, x1); ST_DP2_INC(x0, x1, y, inc_y); } if (n & 1) { LD_GP2_INC(x, 1, f0, f1); ST_GP2_INC(f0, f1, y, 1); } } } return (0); } OpenBLAS-0.2.20/kernel/mips/zdot.c000066400000000000000000000045571313527062700165330ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix=0,iy=0; FLOAT dot[2]; OPENBLAS_COMPLEX_FLOAT result; BLASLONG inc_x2; BLASLONG inc_y2; dot[0]=0.0; dot[1]=0.0; CREAL(result) = 0.0 ; CIMAG(result) = 0.0 ; if ( n < 1 ) return(result); inc_x2 = 2 * inc_x ; inc_y2 = 2 * inc_y ; while(i < n) { #if !defined(CONJ) dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ; dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ; #else dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ; dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ; #endif ix += inc_x2 ; iy += inc_y2 ; i++ ; } CREAL(result) = dot[0]; CIMAG(result) = dot[1]; return(result); } OpenBLAS-0.2.20/kernel/mips/zdot_msa.c000066400000000000000000000336251313527062700173710ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" #if !defined(CONJ) #define OP1 -= #define OP2 += #define OP3 - #define OP4 + #else #define OP1 += #define OP2 -= #define OP3 + #define OP4 - #endif OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i = 0; FLOAT dot[2]; BLASLONG inc_x2, inc_y2; v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7, vx8, vx9, vx10, vx11; v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7, vy8, vy9, vy10, vy11; v2f64 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i; v2f64 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i; v2f64 dot0 = {0, 0}; v2f64 dot1 = {0, 0}; v2f64 dot2 = {0, 0}; v2f64 dot3 = {0, 0}; v2f64 dot4 = {0, 0}; v2f64 dot5 = {0, 0}; v2f64 dot6 = {0, 0}; v2f64 dot7 = {0, 0}; v2f64 zero = {0, 0}; OPENBLAS_COMPLEX_FLOAT result; dot[0] = 0.0; dot[1] = 0.0; CREAL(result) = 0.0; CIMAG(result) = 0.0; if (n < 1) return (result); inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; if ((1 == inc_x) && (1 == inc_y)) { if (n > 7) { FLOAT *x_pref, *y_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 32 + 8; pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } y_pref = y + pref_offset + 32 + 8; LD_DP4_INC(x, 2, vx0, vx1, vx2, vx3); LD_DP4_INC(y, 2, vy0, vy1, vy2, vy3); PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); for (i = (n >> 3) - 1; i--;) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(y_pref, 0); PREF_OFFSET(y_pref, 32); PREF_OFFSET(y_pref, 64); PREF_OFFSET(y_pref, 96); x_pref += 16; y_pref += 16; vx4 = LD_DP(x); x += 2; vx1r = (v2f64) __msa_pckev_d((v2i64) vx3, (v2i64) vx2); dot0 += (vx0r * vy0r); vx5 = LD_DP(x); x += 2; vx1i = (v2f64) __msa_pckod_d((v2i64) vx3, (v2i64) vx2); dot1 OP2 (vx0i * vy0r); vy4 = LD_DP(y); y += 2; vy1r = (v2f64) __msa_pckev_d((v2i64) vy3, (v2i64) vy2); dot2 += (vx1r * vy1r); vy5 = LD_DP(y); y += 2; vy1i = (v2f64) __msa_pckod_d((v2i64) vy3, (v2i64) vy2); dot3 OP2 (vx1i * vy1r); vx6 = LD_DP(x); x += 2; vx7 = LD_DP(x); x += 2; vy6 = LD_DP(y); y += 2; vy7 = LD_DP(y); y += 2; vx8 = LD_DP(x); x += 2; dot0 OP1 (vx0i * vy0i); vx9 = LD_DP(x); x += 2; vx2r = (v2f64) __msa_pckev_d((v2i64) vx5, (v2i64) vx4); dot1 += (vx0r * vy0i); vy8 = LD_DP(y); y += 
2; vx2i = (v2f64) __msa_pckod_d((v2i64) vx5, (v2i64) vx4); dot2 OP1 (vx1i * vy1i); vy9 = LD_DP(y); y += 2; vy2r = (v2f64) __msa_pckev_d((v2i64) vy5, (v2i64) vy4); dot3 += (vx1r * vy1i); vx10 = LD_DP(x); x += 2; vy2i = (v2f64) __msa_pckod_d((v2i64) vy5, (v2i64) vy4); vx11 = LD_DP(x); x += 2; vx3r = (v2f64) __msa_pckev_d((v2i64) vx7, (v2i64) vx6); dot4 += (vx2r * vy2r); vy10 = LD_DP(y); y += 2; vx3i = (v2f64) __msa_pckod_d((v2i64) vx7, (v2i64) vx6); dot5 OP2 (vx2i * vy2r); vy11 = LD_DP(y); y += 2; vy3r = (v2f64) __msa_pckev_d((v2i64) vy7, (v2i64) vy6); vy3i = (v2f64) __msa_pckod_d((v2i64) vy7, (v2i64) vy6); dot6 += (vx3r * vy3r); vx0r = (v2f64) __msa_pckev_d((v2i64) vx9, (v2i64) vx8); dot7 OP2 (vx3i * vy3r); vx0i = (v2f64) __msa_pckod_d((v2i64) vx9, (v2i64) vx8); vy0r = (v2f64) __msa_pckev_d((v2i64) vy9, (v2i64) vy8); vx2 = vx10; vy0i = (v2f64) __msa_pckod_d((v2i64) vy9, (v2i64) vy8); vx3 = vx11; dot4 OP1 (vx2i * vy2i); vy2 = vy10; dot5 += (vx2r * vy2i); vy3 = vy11; dot6 OP1 (vx3i * vy3i); dot7 += (vx3r * vy3i); } vx4 = LD_DP(x); x += 2; vx1r = (v2f64) __msa_pckev_d((v2i64) vx3, (v2i64) vx2); dot0 += (vx0r * vy0r); vx5 = LD_DP(x); x += 2; vx1i = (v2f64) __msa_pckod_d((v2i64) vx3, (v2i64) vx2); dot1 OP2 (vx0i * vy0r); vy4 = LD_DP(y); y += 2; vy1r = (v2f64) __msa_pckev_d((v2i64) vy3, (v2i64) vy2); dot2 += (vx1r * vy1r); vy5 = LD_DP(y); y += 2; vy1i = (v2f64) __msa_pckod_d((v2i64) vy3, (v2i64) vy2); dot3 OP2 (vx1i * vy1r); vx6 = LD_DP(x); x += 2; vx7 = LD_DP(x); x += 2; vy6 = LD_DP(y); y += 2; vy7 = LD_DP(y); y += 2; dot0 OP1 (vx0i * vy0i); vx2r = (v2f64) __msa_pckev_d((v2i64) vx5, (v2i64) vx4); dot1 += (vx0r * vy0i); vx2i = (v2f64) __msa_pckod_d((v2i64) vx5, (v2i64) vx4); dot2 OP1 (vx1i * vy1i); vy2r = (v2f64) __msa_pckev_d((v2i64) vy5, (v2i64) vy4); dot3 += (vx1r * vy1i); vy2i = (v2f64) __msa_pckod_d((v2i64) vy5, (v2i64) vy4); vx3r = (v2f64) __msa_pckev_d((v2i64) vx7, (v2i64) vx6); dot4 += (vx2r * vy2r); vx3i = (v2f64) __msa_pckod_d((v2i64) vx7, (v2i64) vx6); dot5 OP2 (vx2i * vy2r); vy3r = (v2f64) __msa_pckev_d((v2i64) vy7, (v2i64) vy6); vy3i = (v2f64) __msa_pckod_d((v2i64) vy7, (v2i64) vy6); dot6 += (vx3r * vy3r); dot7 OP2 (vx3i * vy3r); dot4 OP1 (vx2i * vy2i); dot5 += (vx2r * vy2i); dot6 OP1 (vx3i * vy3i); dot7 += (vx3r * vy3i); } } else if (n > 7) { LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3); LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3); PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); for (i = (n >> 3) - 1; i--;) { vx4 = LD_DP(x); x += inc_x2; vx1r = (v2f64) __msa_pckev_d((v2i64) vx3, (v2i64) vx2); dot0 += (vx0r * vy0r); vx5 = LD_DP(x); x += inc_x2; vx1i = (v2f64) __msa_pckod_d((v2i64) vx3, (v2i64) vx2); dot1 OP2 (vx0i * vy0r); vy4 = LD_DP(y); y += inc_y2; vy1r = (v2f64) __msa_pckev_d((v2i64) vy3, (v2i64) vy2); dot2 += (vx1r * vy1r); vy5 = LD_DP(y); y += inc_y2; vy1i = (v2f64) __msa_pckod_d((v2i64) vy3, (v2i64) vy2); dot3 OP2 (vx1i * vy1r); vx6 = LD_DP(x); x += inc_x2; vx7 = LD_DP(x); x += inc_x2; vy6 = LD_DP(y); y += inc_y2; vy7 = LD_DP(y); y += inc_y2; vx8 = LD_DP(x); x += inc_x2; dot0 OP1 (vx0i * vy0i); vx9 = LD_DP(x); x += inc_x2; vx2r = (v2f64) __msa_pckev_d((v2i64) vx5, (v2i64) vx4); dot1 += (vx0r * vy0i); vy8 = LD_DP(y); y += inc_y2; vx2i = (v2f64) __msa_pckod_d((v2i64) vx5, (v2i64) vx4); dot2 OP1 (vx1i * vy1i); vy9 = LD_DP(y); y += inc_y2; vy2r = (v2f64) __msa_pckev_d((v2i64) vy5, (v2i64) vy4); dot3 += (vx1r * vy1i); vx10 = LD_DP(x); x += inc_x2; vy2i = (v2f64) __msa_pckod_d((v2i64) vy5, (v2i64) vy4); vx11 = LD_DP(x); x += inc_x2; vx3r = (v2f64) 
__msa_pckev_d((v2i64) vx7, (v2i64) vx6); dot4 += (vx2r * vy2r); vy10 = LD_DP(y); y += inc_y2; vx3i = (v2f64) __msa_pckod_d((v2i64) vx7, (v2i64) vx6); dot5 OP2 (vx2i * vy2r); vy11 = LD_DP(y); y += inc_y2; vy3r = (v2f64) __msa_pckev_d((v2i64) vy7, (v2i64) vy6); vy3i = (v2f64) __msa_pckod_d((v2i64) vy7, (v2i64) vy6); dot6 += (vx3r * vy3r); vx0r = (v2f64) __msa_pckev_d((v2i64) vx9, (v2i64) vx8); dot7 OP2 (vx3i * vy3r); vx0i = (v2f64) __msa_pckod_d((v2i64) vx9, (v2i64) vx8); vy0r = (v2f64) __msa_pckev_d((v2i64) vy9, (v2i64) vy8); vx2 = vx10; vy0i = (v2f64) __msa_pckod_d((v2i64) vy9, (v2i64) vy8); vx3 = vx11; dot4 OP1 (vx2i * vy2i); vy2 = vy10; dot5 += (vx2r * vy2i); vy3 = vy11; dot6 OP1 (vx3i * vy3i); dot7 += (vx3r * vy3i); } vx4 = LD_DP(x); x += inc_x2; vx1r = (v2f64) __msa_pckev_d((v2i64) vx3, (v2i64) vx2); dot0 += (vx0r * vy0r); vx5 = LD_DP(x); x += inc_x2; vx1i = (v2f64) __msa_pckod_d((v2i64) vx3, (v2i64) vx2); dot1 OP2 (vx0i * vy0r); vy4 = LD_DP(y); y += inc_y2; vy1r = (v2f64) __msa_pckev_d((v2i64) vy3, (v2i64) vy2); dot2 += (vx1r * vy1r); vy5 = LD_DP(y); y += inc_y2; vy1i = (v2f64) __msa_pckod_d((v2i64) vy3, (v2i64) vy2); dot3 OP2 (vx1i * vy1r); vx6 = LD_DP(x); x += inc_x2; vx7 = LD_DP(x); x += inc_x2; vy6 = LD_DP(y); y += inc_y2; vy7 = LD_DP(y); y += inc_y2; dot0 OP1 (vx0i * vy0i); vx2r = (v2f64) __msa_pckev_d((v2i64) vx5, (v2i64) vx4); dot1 += (vx0r * vy0i); vx2i = (v2f64) __msa_pckod_d((v2i64) vx5, (v2i64) vx4); dot2 OP1 (vx1i * vy1i); vy2r = (v2f64) __msa_pckev_d((v2i64) vy5, (v2i64) vy4); dot3 += (vx1r * vy1i); vy2i = (v2f64) __msa_pckod_d((v2i64) vy5, (v2i64) vy4); vx3r = (v2f64) __msa_pckev_d((v2i64) vx7, (v2i64) vx6); dot4 += (vx2r * vy2r); vx3i = (v2f64) __msa_pckod_d((v2i64) vx7, (v2i64) vx6); dot5 OP2 (vx2i * vy2r); vy3r = (v2f64) __msa_pckev_d((v2i64) vy7, (v2i64) vy6); vy3i = (v2f64) __msa_pckod_d((v2i64) vy7, (v2i64) vy6); dot6 += (vx3r * vy3r); dot7 OP2 (vx3i * vy3r); dot4 OP1 (vx2i * vy2i); dot5 += (vx2r * vy2i); dot6 OP1 (vx3i * vy3i); dot7 += (vx3r * vy3i); } if (n & 7) { if (n & 4) { LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3); LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3); PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i); PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i); dot0 += (vx0r * vy0r); dot0 OP1 (vx0i * vy0i); dot1 OP2 (vx0i * vy0r); dot1 += (vx0r * vy0i); dot2 += (vx1r * vy1r); dot2 OP1 (vx1i * vy1i); dot3 OP2 (vx1i * vy1r); dot3 += (vx1r * vy1i); } if (n & 2) { LD_DP2_INC(x, inc_x2, vx0, vx1); LD_DP2_INC(y, inc_y2, vy0, vy1); PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); dot0 += (vx0r * vy0r); dot0 OP1 (vx0i * vy0i); dot1 OP2 (vx0i * vy0r); dot1 += (vx0r * vy0i); } if (n & 1) { vx0 = LD_DP(x); vy0 = LD_DP(y); PCKEVOD_D2_DP(zero, vx0, vx0r, vx0i); PCKEVOD_D2_DP(zero, vy0, vy0r, vy0i); dot0 += (vx0r * vy0r); dot0 OP1 (vx0i * vy0i); dot1 OP2 (vx0i * vy0r); dot1 += (vx0r * vy0i); } } dot0 += dot2 + dot4 + dot6; dot1 += dot3 + dot5 + dot7; dot[0] += (dot0[0] + dot0[1]); dot[1] += (dot1[0] + dot1[1]); CREAL(result) = dot[0]; CIMAG(result) = dot[1]; return (result); } OpenBLAS-0.2.20/kernel/mips/zgemm_kernel_4x4_msa.c000066400000000000000000001656411313527062700215730ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" #define ZGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \ LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \ \ PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \ \ /* 0th col */ \ SPLATI_D2_DP(src_b0, src_br, src_bi); \ res0_r OP0## = src_a0r * src_br; \ res0_r OP1## = src_a0i * src_bi; \ res0_i OP2## = OP4 src_a0r * src_bi; \ res0_i OP3## = src_a0i * src_br; \ \ res1_r OP0## = src_a1r * src_br; \ res1_r OP1## = src_a1i * src_bi; \ res1_i OP2## = OP4 src_a1r * src_bi; \ res1_i OP3## = src_a1i * src_br; \ \ /* 1st col */ \ SPLATI_D2_DP(src_b1, src_br, src_bi); \ res2_r OP0## = src_a0r * src_br; \ res2_r OP1## = src_a0i * src_bi; \ res2_i OP2## = OP4 src_a0r * src_bi; \ res2_i OP3## = src_a0i * src_br; \ \ res3_r OP0## = src_a1r * src_br; \ res3_r OP1## = src_a1i * src_bi; \ res3_i OP2## = OP4 src_a1r * src_bi; \ res3_i OP3## = src_a1i * src_br; \ \ /* 2nd col */ \ SPLATI_D2_DP(src_b2, src_br, src_bi); \ res4_r OP0## = src_a0r * src_br; \ res4_r OP1## = src_a0i * src_bi; \ res4_i OP2## = OP4 src_a0r * src_bi; \ res4_i OP3## = src_a0i * src_br; \ \ res5_r OP0## = src_a1r * src_br; \ res5_r OP1## = src_a1i * src_bi; \ res5_i OP2## = OP4 src_a1r * src_bi; \ res5_i OP3## = src_a1i * src_br; \ \ /* 3rd col */ \ SPLATI_D2_DP(src_b3, src_br, src_bi); \ res6_r OP0## = src_a0r * src_br; \ res6_r OP1## = src_a0i * src_bi; \ res6_i OP2## = OP4 src_a0r * src_bi; \ res6_i OP3## = src_a0i * src_br; \ \ res7_r OP0## = src_a1r * src_br; \ res7_r OP1## = src_a1i * src_bi; \ res7_i OP2## = OP4 src_a1r * src_bi; \ res7_i OP3## = src_a1i * src_br; \ } #define ZGEMM_KERNEL_2X4_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_DP2_INC(pa0, 2, src_a0, src_a1); \ LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \ \ PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ \ /* 0th col */ \ SPLATI_D2_DP(src_b0, src_br, src_bi); \ res0_r OP0## = src_a0r * src_br; \ res0_r OP1## = src_a0i * src_bi; \ res0_i OP2## = OP4 src_a0r * src_bi; \ res0_i OP3## = src_a0i * src_br; \ \ /* 1st col */ \ SPLATI_D2_DP(src_b1, src_br, src_bi); \ res2_r OP0## = 
src_a0r * src_br; \ res2_r OP1## = src_a0i * src_bi; \ res2_i OP2## = OP4 src_a0r * src_bi; \ res2_i OP3## = src_a0i * src_br; \ \ /* 2nd col */ \ SPLATI_D2_DP(src_b2, src_br, src_bi); \ res4_r OP0## = src_a0r * src_br; \ res4_r OP1## = src_a0i * src_bi; \ res4_i OP2## = OP4 src_a0r * src_bi; \ res4_i OP3## = src_a0i * src_br; \ \ /* 3rd col */ \ SPLATI_D2_DP(src_b3, src_br, src_bi); \ res6_r OP0## = src_a0r * src_br; \ res6_r OP1## = src_a0i * src_bi; \ res6_i OP2## = OP4 src_a0r * src_bi; \ res6_i OP3## = src_a0i * src_br; \ } #define ZGEMM_KERNEL_1X4_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ src_a0 = LD_DP(pa0); \ LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \ \ PCKEVOD_D2_DP(src_a0, src_a0, src_a0r, src_a0i); \ \ /* 0th and 1st col */ \ PCKEVOD_D2_DP(src_b1, src_b0, src_br, src_bi); \ res0_r OP0## = src_a0r * src_br; \ res0_r OP1## = src_a0i * src_bi; \ res0_i OP2## = OP4 src_a0r * src_bi; \ res0_i OP3## = src_a0i * src_br; \ \ /* 2nd and 3rd col */ \ PCKEVOD_D2_DP(src_b3, src_b2, src_br, src_bi); \ res1_r OP0## = src_a0r * src_br; \ res1_r OP1## = src_a0i * src_bi; \ res1_i OP2## = OP4 src_a0r * src_bi; \ res1_i OP3## = src_a0i * src_br; \ } #define ZGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \ LD_DP2_INC(pb0, 2, src_b0, src_b1); \ \ PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \ \ /* 0th col */ \ SPLATI_D2_DP(src_b0, src_br, src_bi); \ res0_r OP0## = src_a0r * src_br; \ res0_r OP1## = src_a0i * src_bi; \ res0_i OP2## = OP4 src_a0r * src_bi; \ res0_i OP3## = src_a0i * src_br; \ \ res1_r OP0## = src_a1r * src_br; \ res1_r OP1## = src_a1i * src_bi; \ res1_i OP2## = OP4 src_a1r * src_bi; \ res1_i OP3## = src_a1i * src_br; \ \ /* 1st col */ \ SPLATI_D2_DP(src_b1, src_br, src_bi); \ res2_r OP0## = src_a0r * src_br; \ res2_r OP1## = src_a0i * src_bi; \ res2_i OP2## = OP4 src_a0r * src_bi; \ res2_i OP3## = src_a0i * src_br; \ \ res3_r OP0## = src_a1r * src_br; \ res3_r OP1## = src_a1i * src_bi; \ res3_i OP2## = OP4 src_a1r * src_bi; \ res3_i OP3## = src_a1i * src_br; \ } #define ZGEMM_KERNEL_2X2_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_DP2_INC(pa0, 2, src_a0, src_a1); \ LD_DP2_INC(pb0, 2, src_b0, src_b1); \ \ PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ \ /* 0th col */ \ SPLATI_D2_DP(src_b0, src_br, src_bi); \ res0_r OP0## = src_a0r * src_br; \ res0_r OP1## = src_a0i * src_bi; \ res0_i OP2## = OP4 src_a0r * src_bi; \ res0_i OP3## = src_a0i * src_br; \ \ /* 1st col */ \ SPLATI_D2_DP(src_b1, src_br, src_bi); \ res2_r OP0## = src_a0r * src_br; \ res2_r OP1## = src_a0i * src_bi; \ res2_i OP2## = OP4 src_a0r * src_bi; \ res2_i OP3## = src_a0i * src_br; \ } #define ZGEMM_KERNEL_1X2_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ src_a0 = LD_DP(pa0); \ LD_DP2_INC(pb0, 2, src_b0, src_b1); \ \ PCKEVOD_D2_DP(src_a0, src_a0, src_a0r, src_a0i); \ \ /* 0th and 1st col */ \ PCKEVOD_D2_DP(src_b1, src_b0, src_br, src_bi); \ res0_r OP0## = src_a0r * src_br; \ res0_r OP1## = src_a0i * src_bi; \ res0_i OP2## = OP4 src_a0r * src_bi; \ res0_i OP3## = src_a0i * src_br; \ } #define ZGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \ src_b0 = LD_DP(pb0); \ \ PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \ \ /* 0th col */ \ SPLATI_D2_DP(src_b0, src_br, src_bi); \ res0_r OP0## = src_a0r * src_br; \ res0_r OP1## = src_a0i * src_bi; \ res0_i OP2## = OP4 src_a0r * src_bi; \ res0_i OP3## = src_a0i * src_br; \ \ 
res1_r OP0## = src_a1r * src_br; \ res1_r OP1## = src_a1i * src_bi; \ res1_i OP2## = OP4 src_a1r * src_bi; \ res1_i OP3## = src_a1i * src_br; \ } #define ZGEMM_KERNEL_2X1_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_DP2_INC(pa0, 2, src_a0, src_a1); \ src_b0 = LD_DP(pb0); \ \ PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \ \ /* 0th col */ \ SPLATI_D2_DP(src_b0, src_br, src_bi); \ res0_r OP0## = src_a0r * src_br; \ res0_r OP1## = src_a0i * src_bi; \ res0_i OP2## = OP4 src_a0r * src_bi; \ res0_i OP3## = src_a0i * src_br; \ } #define ZGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4) \ { \ /* 0th col */ \ a0_r = pa0[0]; \ a0_i = pa0[1]; \ b0_r = pb0[0]; \ b0_i = pb0[1]; \ \ res0 OP0## = a0_r * b0_r; \ res0 OP1## = a0_i * b0_i; \ res1 OP2## = OP4 a0_r * b0_i; \ res1 OP3## = a0_i * b0_r; \ } #define ZGEMM_SCALE_4X4_MSA \ { \ LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \ \ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \ \ dst0_r += alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i += alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ dst1_r += alpha_r * res1_r; \ dst1_r -= alpha_i * res1_i; \ dst1_i += alpha_r * res1_i; \ dst1_i += alpha_i * res1_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ \ LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); \ \ PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \ PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \ \ dst0_r += alpha_r * res2_r; \ dst0_r -= alpha_i * res2_i; \ dst0_i += alpha_r * res2_i; \ dst0_i += alpha_i * res2_r; \ \ dst1_r += alpha_r * res3_r; \ dst1_r -= alpha_i * res3_i; \ dst1_i += alpha_r * res3_i; \ dst1_i += alpha_i * res3_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ \ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \ \ LD_DP4(pc2, 2, dst0, dst1, dst2, dst3); \ \ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \ \ dst0_r += alpha_r * res4_r; \ dst0_r -= alpha_i * res4_i; \ dst0_i += alpha_r * res4_i; \ dst0_i += alpha_i * res4_r; \ \ dst1_r += alpha_r * res5_r; \ dst1_r -= alpha_i * res5_i; \ dst1_i += alpha_r * res5_i; \ dst1_i += alpha_i * res5_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ \ LD_DP4(pc3, 2, dst4, dst5, dst6, dst7); \ \ PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \ PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \ \ dst0_r += alpha_r * res6_r; \ dst0_r -= alpha_i * res6_i; \ dst0_i += alpha_r * res6_i; \ dst0_i += alpha_i * res6_r; \ \ dst1_r += alpha_r * res7_r; \ dst1_r -= alpha_i * res7_i; \ dst1_i += alpha_r * res7_i; \ dst1_i += alpha_i * res7_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ \ ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2); \ ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2); \ } #define ZGEMM_SCALE_2X4_MSA \ { \ LD_DP2(pc0, 2, dst0, dst1); \ \ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ \ dst0_r += alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i += alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ \ LD_DP2(pc1, 2, dst2, dst3); \ \ PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \ \ dst0_r += alpha_r * res2_r; \ dst0_r -= alpha_i * res2_i; \ dst0_i += alpha_r * res2_i; \ dst0_i += alpha_i * res2_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ \ ST_DP2_INC(dst0, dst1, pc0, 2); \ ST_DP2_INC(dst2, dst3, pc1, 2); \ \ LD_DP2(pc2, 2, dst0, dst1); \ \ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ \ 
dst0_r += alpha_r * res4_r; \ dst0_r -= alpha_i * res4_i; \ dst0_i += alpha_r * res4_i; \ dst0_i += alpha_i * res4_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ \ LD_DP2(pc3, 2, dst2, dst3); \ \ PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \ \ dst0_r += alpha_r * res6_r; \ dst0_r -= alpha_i * res6_i; \ dst0_i += alpha_r * res6_i; \ dst0_i += alpha_i * res6_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ \ ST_DP2_INC(dst0, dst1, pc2, 2); \ ST_DP2_INC(dst2, dst3, pc3, 2); \ } #define ZGEMM_SCALE_1X4_MSA \ { \ dst0 = LD_DP(pc0); \ dst1 = LD_DP(pc1); \ \ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ \ dst0_r += alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i += alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ \ dst2 = LD_DP(pc2); \ dst3 = LD_DP(pc3); \ \ PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \ \ dst0_r += alpha_r * res1_r; \ dst0_r -= alpha_i * res1_i; \ dst0_i += alpha_r * res1_i; \ dst0_i += alpha_i * res1_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ \ ST_DP(dst0, pc0); \ ST_DP(dst1, pc1); \ ST_DP(dst2, pc2); \ ST_DP(dst3, pc3); \ } #define ZGEMM_SCALE_4X2_MSA \ { \ LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \ \ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \ \ dst0_r += alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i += alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ dst1_r += alpha_r * res1_r; \ dst1_r -= alpha_i * res1_i; \ dst1_i += alpha_r * res1_i; \ dst1_i += alpha_i * res1_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ \ LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); \ \ PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \ PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \ \ dst0_r += alpha_r * res2_r; \ dst0_r -= alpha_i * res2_i; \ dst0_i += alpha_r * res2_i; \ dst0_i += alpha_i * res2_r; \ \ dst1_r += alpha_r * res3_r; \ dst1_r -= alpha_i * res3_i; \ dst1_i += alpha_r * res3_i; \ dst1_i += alpha_i * res3_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ \ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \ } #define ZGEMM_SCALE_2X2_MSA \ { \ LD_DP2(pc0, 2, dst0, dst1); \ \ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ \ dst0_r += alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i += alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ \ ST_DP2_INC(dst0, dst1, pc0, 2); \ \ LD_DP2(pc1, 2, dst2, dst3); \ \ PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \ \ dst0_r += alpha_r * res2_r; \ dst0_r -= alpha_i * res2_i; \ dst0_i += alpha_r * res2_i; \ dst0_i += alpha_i * res2_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ \ ST_DP2_INC(dst2, dst3, pc1, 2); \ } #define ZGEMM_SCALE_1X2_MSA \ { \ dst0 = LD_DP(pc0); \ dst1 = LD_DP(pc1); \ \ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ \ dst0_r += alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i += alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ \ ST_DP(dst0, pc0); \ ST_DP(dst1, pc1); \ } #define ZGEMM_SCALE_4X1_MSA \ { \ LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \ \ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \ \ dst0_r += alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i += alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ dst1_r += alpha_r * res1_r; \ dst1_r -= alpha_i * res1_i; \ dst1_i += alpha_r * res1_i; \ dst1_i += alpha_i * res1_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, 
dst0, dst1); \ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ \ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ } #define ZGEMM_SCALE_2X1_MSA \ { \ LD_DP2(pc0, 2, dst0, dst1); \ \ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \ \ dst0_r += alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i += alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ \ ST_DP2_INC(dst0, dst1, pc0, 2); \ } #define ZGEMM_SCALE_1X1 \ { \ pc0[0] += alphar * res0; \ pc0[0] -= alphai * res1; \ pc0[1] += alphar * res1; \ pc0[1] += alphai * res0; \ } #define ZGEMM_TRMM_SCALE_4X4_MSA \ { \ dst0_r = alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i = alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ dst1_r = alpha_r * res1_r; \ dst1_r -= alpha_i * res1_i; \ dst1_i = alpha_r * res1_i; \ dst1_i += alpha_i * res1_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ \ dst0_r = alpha_r * res2_r; \ dst0_r -= alpha_i * res2_i; \ dst0_i = alpha_r * res2_i; \ dst0_i += alpha_i * res2_r; \ \ dst1_r = alpha_r * res3_r; \ dst1_r -= alpha_i * res3_i; \ dst1_i = alpha_r * res3_i; \ dst1_i += alpha_i * res3_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ \ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \ \ dst0_r = alpha_r * res4_r; \ dst0_r -= alpha_i * res4_i; \ dst0_i = alpha_r * res4_i; \ dst0_i += alpha_i * res4_r; \ \ dst1_r = alpha_r * res5_r; \ dst1_r -= alpha_i * res5_i; \ dst1_i = alpha_r * res5_i; \ dst1_i += alpha_i * res5_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ \ dst0_r = alpha_r * res6_r; \ dst0_r -= alpha_i * res6_i; \ dst0_i = alpha_r * res6_i; \ dst0_i += alpha_i * res6_r; \ \ dst1_r = alpha_r * res7_r; \ dst1_r -= alpha_i * res7_i; \ dst1_i = alpha_r * res7_i; \ dst1_i += alpha_i * res7_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ \ ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2); \ ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2); \ } #define ZGEMM_TRMM_SCALE_2X4_MSA \ { \ dst0_r = alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i = alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ \ dst0_r = alpha_r * res2_r; \ dst0_r -= alpha_i * res2_i; \ dst0_i = alpha_r * res2_i; \ dst0_i += alpha_i * res2_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ \ ST_DP2_INC(dst0, dst1, pc0, 2); \ ST_DP2_INC(dst2, dst3, pc1, 2); \ \ dst0_r = alpha_r * res4_r; \ dst0_r -= alpha_i * res4_i; \ dst0_i = alpha_r * res4_i; \ dst0_i += alpha_i * res4_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ \ dst0_r = alpha_r * res6_r; \ dst0_r -= alpha_i * res6_i; \ dst0_i = alpha_r * res6_i; \ dst0_i += alpha_i * res6_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ \ ST_DP2_INC(dst0, dst1, pc2, 2); \ ST_DP2_INC(dst2, dst3, pc3, 2); \ } #define ZGEMM_TRMM_SCALE_1X4_MSA \ { \ dst0_r = alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i = alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ \ dst0_r = alpha_r * res1_r; \ dst0_r -= alpha_i * res1_i; \ dst0_i = alpha_r * res1_i; \ dst0_i += alpha_i * res1_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ \ ST_DP(dst0, pc0); \ ST_DP(dst1, pc1); \ ST_DP(dst2, pc2); \ ST_DP(dst3, pc3); \ } #define ZGEMM_TRMM_SCALE_4X2_MSA \ { \ dst0_r = alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i = alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ 
dst1_r = alpha_r * res1_r; \ dst1_r -= alpha_i * res1_i; \ dst1_i = alpha_r * res1_i; \ dst1_i += alpha_i * res1_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ \ dst0_r = alpha_r * res2_r; \ dst0_r -= alpha_i * res2_i; \ dst0_i = alpha_r * res2_i; \ dst0_i += alpha_i * res2_r; \ \ dst1_r = alpha_r * res3_r; \ dst1_r -= alpha_i * res3_i; \ dst1_i = alpha_r * res3_i; \ dst1_i += alpha_i * res3_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \ ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \ \ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \ } #define ZGEMM_TRMM_SCALE_2X2_MSA \ { \ dst0_r = alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i = alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ \ ST_DP2_INC(dst0, dst1, pc0, 2); \ \ dst0_r = alpha_r * res2_r; \ dst0_r -= alpha_i * res2_i; \ dst0_i = alpha_r * res2_i; \ dst0_i += alpha_i * res2_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \ \ ST_DP2_INC(dst2, dst3, pc1, 2); \ } #define ZGEMM_TRMM_SCALE_1X2_MSA \ { \ dst0_r = alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i = alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ \ ST_DP(dst0, pc0); \ ST_DP(dst1, pc1); \ } #define ZGEMM_TRMM_SCALE_4X1_MSA \ { \ dst0_r = alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i = alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ dst1_r = alpha_r * res1_r; \ dst1_r -= alpha_i * res1_i; \ dst1_i = alpha_r * res1_i; \ dst1_i += alpha_i * res1_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \ \ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \ } #define ZGEMM_TRMM_SCALE_2X1_MSA \ { \ dst0_r = alpha_r * res0_r; \ dst0_r -= alpha_i * res0_i; \ dst0_i = alpha_r * res0_i; \ dst0_i += alpha_i * res0_r; \ \ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \ \ ST_DP2_INC(dst0, dst1, pc0, 2); \ } #define ZGEMM_TRMM_SCALE_1X1 \ { \ pc0[0] = alphar * res0; \ pc0[0] -= alphai * res1; \ pc0[1] = alphar * res1; \ pc0[1] += alphai * res0; \ } int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc #ifdef TRMMKERNEL , BLASLONG offset #endif ) { BLASLONG i, j, l, temp; #if defined(TRMMKERNEL) BLASLONG off; #endif FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0; FLOAT res0, res1, a0_r, a0_i, b0_r, b0_i; v2f64 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1, src_b2, src_b3; v2f64 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi; v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v2f64 dst0_r, dst0_i, dst1_r, dst1_i, alpha_r, alpha_i; v2f64 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i; v2f64 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i; alpha_r = COPY_DOUBLE_TO_VECTOR(alphar); alpha_i = COPY_DOUBLE_TO_VECTOR(alphai); #if defined(TRMMKERNEL) && !defined(LEFT) off = -offset; #endif for (j = (n >> 2); j--;) { pc0 = C; pc1 = pc0 + 2 * ldc; pc2 = pc1 + 2 * ldc; pc3 = pc2 + 2 * ldc; pa0 = A; #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif for (i = (m >> 2); i--;) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2 * 4; pb0 = B + off * 2 * 4; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 4; // number of values in A #else temp = off + 4; // number of values in B #endif #else pb0 = B; temp = 
k; #endif #ifdef ENABLE_PREFETCH __asm__ __volatile__( "pref 0, 64(%[pa0]) \n\t" "pref 0, 96(%[pa0]) \n\t" "pref 0, 64(%[pb0]) \n\t" "pref 0, 96(%[pb0]) \n\t" : : [pa0] "r" (pa0), [pb0] "r" (pb0) ); #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) ZGEMM_KERNEL_4X4_MSA(, -, , +, +); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) ZGEMM_KERNEL_4X4_MSA(, +, , +, -); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ZGEMM_KERNEL_4X4_MSA(, +, , -, +); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) ZGEMM_KERNEL_4X4_MSA(, -, , -, -); #endif for (l = (temp - 1); l--;) { #ifdef ENABLE_PREFETCH __asm__ __volatile__( "pref 0, 64(%[pa0]) \n\t" "pref 0, 96(%[pa0]) \n\t" "pref 0, 64(%[pb0]) \n\t" "pref 0, 96(%[pb0]) \n\t" : : [pa0] "r" (pa0), [pb0] "r" (pb0) ); #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) ZGEMM_KERNEL_4X4_MSA(+, -, +, +,); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) ZGEMM_KERNEL_4X4_MSA(+, +, -, +,); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ZGEMM_KERNEL_4X4_MSA(+, +, +, -,); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) ZGEMM_KERNEL_4X4_MSA(+, -, -, -,); #endif } #if defined(TRMMKERNEL) ZGEMM_TRMM_SCALE_4X4_MSA #else ZGEMM_SCALE_4X4_MSA #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 4; // number of values in A #else temp -= 4; // number of values in B #endif pa0 += temp * 2 * 4; pb0 += temp * 2 * 4; #endif #ifdef LEFT off += 4; // number of values in A #endif #endif } if (m & 2) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2 * 2; pb0 = B + off * 2 * 4; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 2; // number of values in A #else temp = off + 4; // number of values in B #endif #else pb0 = B; temp = k; #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) ZGEMM_KERNEL_2X4_MSA(, -, , +, +); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) ZGEMM_KERNEL_2X4_MSA(, +, , +, -); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ZGEMM_KERNEL_2X4_MSA(, +, , -, +); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) ZGEMM_KERNEL_2X4_MSA(, -, , -, -); #endif for (l = (temp - 1); l--;) { #if defined(NN) || defined(NT) || defined(TN) || defined(TT) ZGEMM_KERNEL_2X4_MSA(+, -, +, +,); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) ZGEMM_KERNEL_2X4_MSA(+, +, -, +,); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ZGEMM_KERNEL_2X4_MSA(+, +, +, -,); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) ZGEMM_KERNEL_2X4_MSA(+, -, -, -,); #endif } #if defined(TRMMKERNEL) ZGEMM_TRMM_SCALE_2X4_MSA #else ZGEMM_SCALE_2X4_MSA #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 2; // number of values in A #else temp -= 4; // number of values in B #endif pa0 += temp * 2 * 2; pb0 += temp * 2 * 4; #endif #ifdef LEFT off += 2; // number of values in A #endif #endif } if (m & 1) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2 * 1; pb0 = B + off * 2 * 4; #endif #if (defined(LEFT) 
&& !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 1; // number of values in A #else temp = off + 4; // number of values in B #endif #else pb0 = B; temp = k; #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) ZGEMM_KERNEL_1X4_MSA(, -, , +, +); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) ZGEMM_KERNEL_1X4_MSA(, +, , +, -); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ZGEMM_KERNEL_1X4_MSA(, +, , -, +); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) ZGEMM_KERNEL_1X4_MSA(, -, , -, -); #endif pa0 += 2; for (l = (temp - 1); l--;) { #if defined(NN) || defined(NT) || defined(TN) || defined(TT) ZGEMM_KERNEL_1X4_MSA(+, -, +, +,); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) ZGEMM_KERNEL_1X4_MSA(+, +, -, +,); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ZGEMM_KERNEL_1X4_MSA(+, +, +, -,); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) ZGEMM_KERNEL_1X4_MSA(+, -, -, -,); #endif pa0 += 2; } #if defined(TRMMKERNEL) ZGEMM_TRMM_SCALE_1X4_MSA #else ZGEMM_SCALE_1X4_MSA #endif pc0 += 2; pc1 += 2; pc2 += 2; pc3 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 1; // number of values in A #else temp -= 4; // number of values in B #endif pa0 += temp * 2 * 1; pb0 += temp * 2 * 4; #endif #ifdef LEFT off += 1; // number of values in A #endif #endif } #if defined(TRMMKERNEL) && !defined(LEFT) off += 4; // number of values in A #endif B += (k << 3); C += (ldc << 3); } if (n & 2) { pc0 = C; pc1 = pc0 + 2 * ldc; pa0 = A; #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif for (i = (m >> 2); i--;) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2 * 4; pb0 = B + off * 2 * 2; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 4; // number of values in A #else temp = off + 2; // number of values in B #endif #else pb0 = B; temp = k; #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) ZGEMM_KERNEL_4X2_MSA(, -, , +, +); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) ZGEMM_KERNEL_4X2_MSA(, +, , +, -); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ZGEMM_KERNEL_4X2_MSA(, +, , -, +); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) ZGEMM_KERNEL_4X2_MSA(, -, , -, -); #endif for (l = (temp - 1); l--;) { #if defined(NN) || defined(NT) || defined(TN) || defined(TT) ZGEMM_KERNEL_4X2_MSA(+, -, +, +,); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) ZGEMM_KERNEL_4X2_MSA(+, +, -, +,); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ZGEMM_KERNEL_4X2_MSA(+, +, +, -,); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) ZGEMM_KERNEL_4X2_MSA(+, -, -, -,); #endif } #if defined(TRMMKERNEL) ZGEMM_TRMM_SCALE_4X2_MSA #else ZGEMM_SCALE_4X2_MSA #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 4; // number of values in A #else temp -= 2; // number of values in B #endif pa0 += temp * 2 * 4; pb0 += temp * 2 * 2; #endif #ifdef LEFT off += 4; // number of values in A #endif #endif } if (m & 2) { #if defined(TRMMKERNEL) #if (defined(LEFT) && 
defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2 * 2; pb0 = B + off * 2 * 2; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 2; // number of values in A #else temp = off + 2; // number of values in B #endif #else pb0 = B; temp = k; #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) ZGEMM_KERNEL_2X2_MSA(, -, , +, +); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) ZGEMM_KERNEL_2X2_MSA(, +, , +, -); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ZGEMM_KERNEL_2X2_MSA(, +, , -, +); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) ZGEMM_KERNEL_2X2_MSA(, -, , -, -); #endif for (l = (temp - 1); l--;) { #if defined(NN) || defined(NT) || defined(TN) || defined(TT) ZGEMM_KERNEL_2X2_MSA(+, -, +, +,); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) ZGEMM_KERNEL_2X2_MSA(+, +, -, +,); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ZGEMM_KERNEL_2X2_MSA(+, +, +, -,); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) ZGEMM_KERNEL_2X2_MSA(+, -, -, -,); #endif } #if defined(TRMMKERNEL) ZGEMM_TRMM_SCALE_2X2_MSA #else ZGEMM_SCALE_2X2_MSA #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 2; // number of values in A #else temp -= 2; // number of values in B #endif pa0 += temp * 2 * 2; pb0 += temp * 2 * 2; #endif #ifdef LEFT off += 2; // number of values in A #endif #endif } if (m & 1) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2 * 1; pb0 = B + off * 2 * 2; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 1; // number of values in A #else temp = off + 2; // number of values in B #endif #else pb0 = B; temp = k; #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) ZGEMM_KERNEL_1X2_MSA(, -, , +, +); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) ZGEMM_KERNEL_1X2_MSA(, +, , +, -); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ZGEMM_KERNEL_1X2_MSA(, +, , -, +); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) ZGEMM_KERNEL_1X2_MSA(, -, , -, -); #endif pa0 += 2; for (l = (temp - 1); l--;) { #if defined(NN) || defined(NT) || defined(TN) || defined(TT) ZGEMM_KERNEL_1X2_MSA(+, -, +, +,); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) ZGEMM_KERNEL_1X2_MSA(+, +, -, +,); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ZGEMM_KERNEL_1X2_MSA(+, +, +, -,); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) ZGEMM_KERNEL_1X2_MSA(+, -, -, -,); #endif pa0 += 2; } #if defined(TRMMKERNEL) ZGEMM_TRMM_SCALE_1X2_MSA #else ZGEMM_SCALE_1X2_MSA #endif pc0 += 2; pc1 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 1; // number of values in A #else temp -= 2; // number of values in B #endif pa0 += temp * 2 * 1; pb0 += temp * 2 * 2; #endif #ifdef LEFT off += 1; // number of values in A #endif #endif } #if defined(TRMMKERNEL) && !defined(LEFT) off += 2; // number of values in A #endif B += (k << 2); C += (ldc << 2); } if (n & 1) { pc0 = C; pa0 = A; #if defined(TRMMKERNEL) && defined(LEFT) off = 
offset; #endif for (i = (m >> 2); i--;) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2 * 4; pb0 = B + off * 2 * 1; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 4; // number of values in A #else temp = off + 1; // number of values in B #endif #else pb0 = B; temp = k; #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) ZGEMM_KERNEL_4X1_MSA(, -, , +, +); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) ZGEMM_KERNEL_4X1_MSA(, +, , +, -); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ZGEMM_KERNEL_4X1_MSA(, +, , -, +); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) ZGEMM_KERNEL_4X1_MSA(, -, , -, -); #endif pb0 += 2; for (l = (temp - 1); l--;) { #if defined(NN) || defined(NT) || defined(TN) || defined(TT) ZGEMM_KERNEL_4X1_MSA(+, -, +, +,); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) ZGEMM_KERNEL_4X1_MSA(+, +, -, +,); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ZGEMM_KERNEL_4X1_MSA(+, +, +, -,); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) ZGEMM_KERNEL_4X1_MSA(+, -, -, -,); #endif pb0 += 2; } #if defined(TRMMKERNEL) ZGEMM_TRMM_SCALE_4X1_MSA #else ZGEMM_SCALE_4X1_MSA #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 4; // number of values in A #else temp -= 1; // number of values in B #endif pa0 += temp * 2 * 4; pb0 += temp * 2 * 1; #endif #ifdef LEFT off += 4; // number of values in A #endif #endif } if (m & 2) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) pb0 = B; #else pa0 += off * 2 * 2; pb0 = B + off * 2 * 1; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 2; // number of values in A #else temp = off + 1; // number of values in B #endif #else pb0 = B; temp = k; #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) ZGEMM_KERNEL_2X1_MSA(, -, , +, +); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) ZGEMM_KERNEL_2X1_MSA(, +, , +, -); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ZGEMM_KERNEL_2X1_MSA(, +, , -, +); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) ZGEMM_KERNEL_2X1_MSA(, -, , -, -); #endif pb0 += 2; for (l = (temp - 1); l--;) { #if defined(NN) || defined(NT) || defined(TN) || defined(TT) ZGEMM_KERNEL_2X1_MSA(+, -, +, +,); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) ZGEMM_KERNEL_2X1_MSA(+, +, -, +,); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ZGEMM_KERNEL_2X1_MSA(+, +, +, -,); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) ZGEMM_KERNEL_2X1_MSA(+, -, -, -,); #endif pb0 += 2; } #if defined(TRMMKERNEL) ZGEMM_TRMM_SCALE_2X1_MSA #else ZGEMM_SCALE_2X1_MSA #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 2; // number of values in A #else temp -= 1; // number of values in B #endif pa0 += temp * 2 * 2; pb0 += temp * 2 * 1; #endif #ifdef LEFT off += 2; // number of values in A #endif #endif } if (m & 1) { #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) pb0 = B; #else pa0 += off * 2 * 1; pb0 = B + off * 2 * 1; #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) temp = k - off; #elif defined(LEFT) temp = off + 1; // number of values in A #else temp = off + 1; // number of values in B #endif #else pb0 = B; temp = k; #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) ZGEMM_KERNEL_1X1(, -, , +, +); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) ZGEMM_KERNEL_1X1(, +, , +, -); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ZGEMM_KERNEL_1X1(, +, , -, +); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) ZGEMM_KERNEL_1X1(, -, , -, -); #endif pa0 += 2; pb0 += 2; for (l = (temp - 1); l--;) { #if defined(NN) || defined(NT) || defined(TN) || defined(TT) ZGEMM_KERNEL_1X1(+, -, +, +,); #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) ZGEMM_KERNEL_1X1(+, +, -, +,); #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ZGEMM_KERNEL_1X1(+, +, +, -,); #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) ZGEMM_KERNEL_1X1(+, -, -, -,); #endif pa0 += 2; pb0 += 2; } #if defined(TRMMKERNEL) ZGEMM_TRMM_SCALE_1X1 #else ZGEMM_SCALE_1X1 #endif pc0 += 2; #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = k - off; #ifdef LEFT temp -= 1; // number of values in A #else temp -= 1; // number of values in B #endif pa0 += temp * 2 * 1; pb0 += temp * 2 * 1; #endif #ifdef LEFT off += 1; // number of values in A #endif #endif } #if defined(TRMMKERNEL) && !defined(LEFT) off += 1; // number of values in A #endif B += (k << 1); C += (ldc << 1); } return 0; } OpenBLAS-0.2.20/kernel/mips/zgemm_ncopy_4_msa.c000066400000000000000000000106751313527062700211630ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) { BLASLONG i, j; FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst; v2f64 src0, src1, src2, src3, src4, src5, src6, src7; v2f64 src8, src9, src10, src11, src12, src13, src14, src15; psrc0 = src; pdst = dst; lda *= 2; for (j = (n >> 2); j--;) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc3 = psrc2 + lda; psrc4 = psrc3 + lda; psrc0 += 4 * lda; for (i = (m >> 2); i--;) { LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); ST_DP8_INC(src0, src4, src8, src12, src1, src5, src9, src13, pdst, 2); ST_DP8_INC(src2, src6, src10, src14, src3, src7, src11, src15, pdst, 2); } if (m & 2) { LD_DP2_INC(psrc1, 2, src0, src1); LD_DP2_INC(psrc2, 2, src4, src5); LD_DP2_INC(psrc3, 2, src8, src9); LD_DP2_INC(psrc4, 2, src12, src13); ST_DP8_INC(src0, src4, src8, src12, src1, src5, src9, src13, pdst, 2); } if (m & 1) { src0 = LD_DP(psrc1); src4 = LD_DP(psrc2); src8 = LD_DP(psrc3); src12 = LD_DP(psrc4); psrc1 += 2; psrc2 += 2; psrc3 += 2; psrc4 += 2; ST_DP4_INC(src0, src4, src8, src12, pdst, 2); } } if (n & 2) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc0 += 2 * lda; for (i = (m >> 2); i--;) { LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); ST_DP8_INC(src0, src4, src1, src5, src2, src6, src3, src7, pdst, 2); } if (m & 2) { LD_DP2_INC(psrc1, 2, src0, src1); LD_DP2_INC(psrc2, 2, src4, src5); ST_DP4_INC(src0, src4, src1, src5, pdst, 2); } if (m & 1) { src0 = LD_DP(psrc1); src4 = LD_DP(psrc2); psrc1 += 2; psrc2 += 2; ST_DP2_INC(src0, src4, pdst, 2); } } if (n & 1) { psrc1 = psrc0; for (i = (m >> 2); i--;) { LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); ST_DP4_INC(src0, src1, src2, src3, pdst, 2); } if (m & 2) { LD_DP2_INC(psrc1, 2, src0, src1); ST_DP2_INC(src0, src1, pdst, 2); } if (m & 1) { src0 = LD_DP(psrc1); ST_DP(src0, pdst); } } return 0; } OpenBLAS-0.2.20/kernel/mips/zgemm_tcopy_4_msa.c000066400000000000000000000113421313527062700211610ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) { BLASLONG i, j; FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; FLOAT *pdst0, *pdst1, *pdst2, *pdst3; v2f64 src0, src1, src2, src3, src4, src5, src6, src7; v2f64 src8, src9, src10, src11, src12, src13, src14, src15; psrc0 = src; pdst0 = dst; lda *= 2; pdst2 = dst + 2 * m * (n & ~3); pdst3 = dst + 2 * m * (n & ~1); for (j = (m >> 2); j--;) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc3 = psrc2 + lda; psrc4 = psrc3 + lda; psrc0 += 4 * lda; pdst1 = pdst0; pdst0 += 32; for (i = (n >> 2); i--;) { LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); LD_DP4_INC(psrc3, 2, src8, src9, src10, src11); LD_DP4_INC(psrc4, 2, src12, src13, src14, src15); ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, pdst1 + 16, 2); pdst1 += m * 8; } if (n & 2) { LD_DP2_INC(psrc1, 2, src0, src1); LD_DP2_INC(psrc2, 2, src2, src3); LD_DP2_INC(psrc3, 2, src4, src5); LD_DP2_INC(psrc4, 2, src6, src7); ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); } if (n & 1) { src0 = LD_DP(psrc1); src1 = LD_DP(psrc2); src2 = LD_DP(psrc3); src3 = LD_DP(psrc4); psrc1 += 2; psrc2 += 2; psrc3 += 2; psrc4 += 2; ST_DP4_INC(src0, src1, src2, src3, pdst3, 2); } } if (m & 2) { psrc1 = psrc0; psrc2 = psrc1 + lda; psrc0 += 2 * lda; pdst1 = pdst0; pdst0 += 16; for (i = (n >> 2); i--;) { LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); LD_DP4_INC(psrc2, 2, src4, src5, src6, src7); ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); pdst1 += m * 8; } if (n & 2) { LD_DP2_INC(psrc1, 2, src0, src1); LD_DP2_INC(psrc2, 2, src2, src3); ST_DP4_INC(src0, src1, src2, src3, pdst2, 2); } if (n & 1) { src0 = LD_DP(psrc1); src1 = LD_DP(psrc2); ST_DP2_INC(src0, src1, pdst3, 2); psrc1 += 2; psrc2 += 2; } } if (m & 1) { psrc1 = psrc0; pdst1 = pdst0; for (i = (n >> 2); i--;) { LD_DP4_INC(psrc1, 2, src0, src1, src2, src3); ST_DP4(src0, src1, src2, src3, pdst1, 2); pdst1 += m * 8; } if (n & 2) { LD_DP2_INC(psrc1, 2, src0, src1); ST_DP2_INC(src0, src1, pdst2, 2); } if (n & 1) { src0 = LD_DP(psrc1); ST_DP(src0, pdst3); } } return 0; } OpenBLAS-0.2.20/kernel/mips/zgemv_n.c000066400000000000000000000076451313527062700172210ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { BLASLONG i; BLASLONG ix,iy; BLASLONG j; FLOAT *a_ptr; FLOAT temp_r,temp_i; BLASLONG inc_x2,inc_y2; BLASLONG lda2; BLASLONG i2; lda2 = 2*lda; ix = 0; a_ptr = a; if ( inc_x == 1 && inc_y == 1 ) { for (j=0; j> 2); j--;) \ { \ ZLOAD_X4_SCALE() \ \ k = 0; \ k_pref = pref_offset; \ y = y_org; \ \ for (i = (m >> 2); i--;) \ { \ PREFETCH(pa0 + k_pref + 8 + 0); \ PREFETCH(pa0 + k_pref + 8 + 4); \ PREFETCH(pa1 + k_pref + 8 + 0); \ PREFETCH(pa1 + k_pref + 8 + 4); \ PREFETCH(pa2 + k_pref + 8 + 0); \ PREFETCH(pa2 + k_pref + 8 + 4); \ PREFETCH(pa3 + k_pref + 8 + 0); \ PREFETCH(pa3 + k_pref + 8 + 4); \ \ ZLOAD_Y4() \ ZGEMV_N_4x4() \ ZSTORE_Y4() \ \ k += 2 * 4; \ k_pref += 2 * 4; \ y += inc_y2 * 4; \ } \ \ if (m & 2) \ { \ ZLOAD_Y2() \ ZGEMV_N_2x4() \ ZSTORE_Y2() \ \ k += 2 * 2; \ y += inc_y2 * 2; \ } \ \ if (m & 1) \ { \ temp0_r = tp4r[0]; \ temp1_r = tp4r[1]; \ temp2_r = tp5r[0]; \ temp3_r = tp5r[1]; \ \ temp0_i = tp4i[0]; \ temp1_i = tp4i[1]; \ temp2_i = tp5i[0]; \ temp3_i = tp5i[1]; \ \ ZGEMV_N_1x4() \ k += 2; \ y += inc_y2; \ } \ \ pa0 += 4 * lda2; \ pa1 += 4 * lda2; \ pa2 += 4 * lda2; \ pa3 += 4 * lda2; \ \ x += 4 * inc_x2; \ } \ \ if (n & 2) \ { \ ZLOAD_X2_SCALE() \ \ k = 0; \ y = y_org; \ \ for (i = (m >> 2); i--;) \ { \ ZLOAD_Y4() \ ZGEMV_N_4x2() \ ZSTORE_Y4() \ \ k += 2 * 4; \ y += inc_y2 * 4; \ } \ \ if (m & 2) \ { \ ZLOAD_Y2() \ ZGEMV_N_2x2() \ ZSTORE_Y2() \ \ k += 2 * 2; \ y += inc_y2 * 2; \ } \ \ if (m & 1) \ { \ temp0_r = tp4r[0]; \ temp1_r = tp4r[1]; \ \ temp0_i = tp4i[0]; \ temp1_i = tp4i[1]; \ \ ZGEMV_N_1x2() \ \ k += 2; \ y += inc_y2; \ } \ \ pa0 += 2 * lda2; \ pa1 += 2 * lda2; \ \ x += 2 * inc_x2; \ } \ \ if (n & 1) \ { \ ZLOAD_X1_SCALE() \ \ k = 0; \ y = y_org; \ \ for (i = (m >> 2); i--;) \ { \ ZLOAD_Y4() \ ZGEMV_N_4x1() \ ZSTORE_Y4() \ \ k += 2 * 4; \ y += inc_y2 * 4; \ } \ \ if (m & 2) \ { \ ZLOAD_Y2() \ ZGEMV_N_2x1() \ ZSTORE_Y2() \ \ k += 2 * 2; \ y += inc_y2 * 2; \ } \ \ if (m & 1) \ { \ ZGEMV_N_1x1() \ \ k += 2; \ y += inc_y2; \ } \ \ pa0 += lda2; \ x += inc_x2; \ } \ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *A, BLASLONG lda2, FLOAT *x, 
BLASLONG inc_x2, FLOAT *y, BLASLONG inc_y2, FLOAT *buffer) { BLASLONG i, j, k, k_pref, pref_offset; FLOAT *y_org = y; FLOAT *pa0, *pa1, *pa2, *pa3; FLOAT temp0_r, temp1_r, temp2_r, temp3_r, temp0_i, temp1_i, temp2_i; FLOAT temp3_i, res0, res1; v2f64 alphar, alphai; v2f64 x0, x1, x2, x3, y0, y1, y2, y3; v2f64 x0r, x1r, x0i, x1i, y0r, y1r, y0i, y1i; v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; v2f64 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r; v2f64 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i; v2f64 tp0r, tp1r, tp2r, tp3r, tp4r, tp5r, tp0i, tp1i, tp2i, tp3i, tp4i, tp5i; lda2 = 2 * lda2; inc_x2 = 2 * inc_x2; inc_y2 = 2 * inc_y2; pref_offset = (uintptr_t)A & (L1_DATA_LINESIZE - 1); pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); pa0 = A; pa1 = A + lda2; pa2 = A + 2 * lda2; pa3 = A + 3 * lda2; alphar = COPY_DOUBLE_TO_VECTOR(alpha_r); alphai = COPY_DOUBLE_TO_VECTOR(alpha_i); if ((2 == inc_x2) && (2 == inc_y2)) { #define ZLOAD_X4_SCALE ZLOAD_X4_SCALE_VECTOR #define ZLOAD_X2_SCALE ZLOAD_X2_SCALE_VECTOR #define ZLOAD_X1_SCALE ZLOAD_X1_SCALE_GP #define ZLOAD_Y4 ZLOAD_Y4_VECTOR #define ZLOAD_Y2 ZLOAD_Y2_VECTOR #define ZSTORE_Y4 ZSTORE_Y4_VECTOR #define ZSTORE_Y2 ZSTORE_Y2_VECTOR ZGEMV_N_MSA(); #undef ZLOAD_X4_SCALE #undef ZLOAD_X2_SCALE #undef ZLOAD_X1_SCALE #undef ZLOAD_Y4 #undef ZLOAD_Y2 #undef ZSTORE_Y4 #undef ZSTORE_Y2 } else if (2 == inc_x2) { #define ZLOAD_X4_SCALE ZLOAD_X4_SCALE_VECTOR #define ZLOAD_X2_SCALE ZLOAD_X2_SCALE_VECTOR #define ZLOAD_X1_SCALE ZLOAD_X1_SCALE_GP #define ZLOAD_Y4 ZLOAD_Y4_GP #define ZLOAD_Y2 ZLOAD_Y2_GP #define ZSTORE_Y4 ZSTORE_Y4_GP #define ZSTORE_Y2 ZSTORE_Y2_GP ZGEMV_N_MSA(); #undef ZLOAD_X4_SCALE #undef ZLOAD_X2_SCALE #undef ZLOAD_X1_SCALE #undef ZLOAD_Y4 #undef ZLOAD_Y2 #undef ZSTORE_Y4 #undef ZSTORE_Y2 } else if (2 == inc_y2) { #define ZLOAD_X4_SCALE ZLOAD_X4_SCALE_GP #define ZLOAD_X2_SCALE ZLOAD_X2_SCALE_GP #define ZLOAD_X1_SCALE ZLOAD_X1_SCALE_GP #define ZLOAD_Y4 ZLOAD_Y4_VECTOR #define ZLOAD_Y2 ZLOAD_Y2_VECTOR #define ZSTORE_Y4 ZSTORE_Y4_VECTOR #define ZSTORE_Y2 ZSTORE_Y2_VECTOR ZGEMV_N_MSA(); #undef ZLOAD_X4_SCALE #undef ZLOAD_X2_SCALE #undef ZLOAD_X1_SCALE #undef ZLOAD_Y4 #undef ZLOAD_Y2 #undef ZSTORE_Y4 #undef ZSTORE_Y2 } else { #define ZLOAD_X4_SCALE ZLOAD_X4_SCALE_GP #define ZLOAD_X2_SCALE ZLOAD_X2_SCALE_GP #define ZLOAD_X1_SCALE ZLOAD_X1_SCALE_GP #define ZLOAD_Y4 ZLOAD_Y4_GP #define ZLOAD_Y2 ZLOAD_Y2_GP #define ZSTORE_Y4 ZSTORE_Y4_GP #define ZSTORE_Y2 ZSTORE_Y2_GP ZGEMV_N_MSA(); #undef ZLOAD_X4_SCALE #undef ZLOAD_X2_SCALE #undef ZLOAD_X1_SCALE #undef ZLOAD_Y4 #undef ZLOAD_Y2 #undef ZSTORE_Y4 #undef ZSTORE_Y2 } return(0); } #undef OP0 #undef OP1 #undef OP2 #undef OP3 #undef OP4 OpenBLAS-0.2.20/kernel/mips/zgemv_t.c000066400000000000000000000070141313527062700172150ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { BLASLONG i; BLASLONG ix,iy; BLASLONG j; FLOAT *a_ptr; FLOAT temp_r,temp_i; BLASLONG inc_x2,inc_y2; BLASLONG lda2; BLASLONG i2; lda2 = 2*lda; iy = 0; a_ptr = a; if ( inc_x == 1 && inc_y == 1 ) { for (j=0; j> 4) \ { \ x0 = LD_DP(x); \ x1 = LD_DP(x + 1 * inc_x2); \ t0 = LD_DP(pa0); \ t1 = LD_DP(pa0 + 2); \ \ x4 = LD_DP(x + 4 * inc_x2); \ x5 = LD_DP(x + 5 * inc_x2); \ t4 = LD_DP(pa0 + 8); \ t5 = LD_DP(pa0 + 10); \ \ for (i = (m >> 4) - 1; i--;) \ { \ pa0_pref = pa0 + pref_offset; \ \ PREFETCH(pa0_pref + 36); \ PREFETCH(pa0_pref + 44); \ PREFETCH(pa0_pref + 48); \ PREFETCH(pa0_pref + 52); \ PREFETCH(pa0_pref + 56); \ PREFETCH(pa0_pref + 60); \ PREFETCH(pa0_pref + 64); \ PREFETCH(pa0_pref + 72); \ \ x0r = (v2f64) __msa_pckev_d((v2i64) x1, (v2i64) x0); \ x0i = (v2f64) __msa_pckod_d((v2i64) x1, (v2i64) x0); \ src0r = (v2f64) __msa_pckev_d((v2i64) t1, (v2i64) t0); \ src0i = (v2f64) __msa_pckod_d((v2i64) t1, (v2i64) t0); \ \ tp0r += src0r * x0r; \ x2 = LD_DP(x + 2 * inc_x2); \ x2r = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4); \ \ tp0i OP1 src0r * x0i; \ x3 = LD_DP(x + 3 * inc_x2); \ x2i = (v2f64) __msa_pckod_d((v2i64) x5, (v2i64) x4); \ \ tp1r OP0 src0i * x0i; \ t2 = LD_DP(pa0 + 4); \ src2r = (v2f64) __msa_pckev_d((v2i64) t5, (v2i64) t4); \ \ tp1i OP2 src0i * x0r; \ t3 = LD_DP(pa0 + 6); \ src2i = (v2f64) __msa_pckod_d((v2i64) t5, (v2i64) t4); \ \ tp2r += src2r * x2r; \ x6 = LD_DP(x + 6 * inc_x2); \ \ tp2i OP1 src2r * x2i; \ x7 = LD_DP(x + 7 * inc_x2); \ \ tp3r OP0 src2i * x2i; \ t6 = LD_DP(pa0 + 12); \ \ tp3i OP2 src2i * x2r; \ t7 = LD_DP(pa0 + 14); \ \ x1r = (v2f64) __msa_pckev_d((v2i64) x3, (v2i64) x2); \ x1i = (v2f64) __msa_pckod_d((v2i64) x3, (v2i64) x2); \ src1r = (v2f64) __msa_pckev_d((v2i64) t3, (v2i64) t2); \ src1i = (v2f64) __msa_pckod_d((v2i64) t3, (v2i64) t2); \ \ tp0r += src1r * x1r; \ x0 = LD_DP(x + 8 * inc_x2); \ x3r = (v2f64) __msa_pckev_d((v2i64) x7, (v2i64) x6); \ \ tp0i OP1 src1r * x1i; \ x1 = LD_DP(x + 9 * inc_x2); \ x3i = (v2f64) __msa_pckod_d((v2i64) x7, (v2i64) x6); \ \ tp1r OP0 src1i * x1i; \ t0 = LD_DP(pa0 + 16); \ src3r = (v2f64) __msa_pckev_d((v2i64) t7, (v2i64) t6); \ \ tp1i OP2 src1i * x1r; \ t1 = LD_DP(pa0 + 18); \ src3i = (v2f64) __msa_pckod_d((v2i64) t7, (v2i64) t6); \ \ tp2r += src3r * x3r; \ x4 = LD_DP(x + 12 * inc_x2); \ \ tp2i OP1 src3r * x3i; \ x5 = LD_DP(x + 13 * 
inc_x2); \ \ tp3r OP0 src3i * x3i; \ t4 = LD_DP(pa0 + 24); \ \ tp3i OP2 src3i * x3r; \ t5 = LD_DP(pa0 + 26); \ \ x0r = (v2f64) __msa_pckev_d((v2i64) x1, (v2i64) x0); \ x0i = (v2f64) __msa_pckod_d((v2i64) x1, (v2i64) x0); \ src0r = (v2f64) __msa_pckev_d((v2i64) t1, (v2i64) t0); \ src0i = (v2f64) __msa_pckod_d((v2i64) t1, (v2i64) t0); \ \ tp0r += src0r * x0r; \ x2 = LD_DP(x + 10 * inc_x2); \ x2r = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4); \ \ tp0i OP1 src0r * x0i; \ x3 = LD_DP(x + 11 * inc_x2); \ x2i = (v2f64) __msa_pckod_d((v2i64) x5, (v2i64) x4); \ \ tp1r OP0 src0i * x0i; \ t2 = LD_DP(pa0 + 20); \ src2r = (v2f64) __msa_pckev_d((v2i64) t5, (v2i64) t4); \ \ tp1i OP2 src0i * x0r; \ t3 = LD_DP(pa0 + 22); \ src2i = (v2f64) __msa_pckod_d((v2i64) t5, (v2i64) t4); \ \ tp2r += src2r * x2r; \ x6 = LD_DP(x + 14 * inc_x2); \ \ tp2i OP1 src2r * x2i; \ x7 = LD_DP(x + 15 * inc_x2); \ \ tp3r OP0 src2i * x2i; \ t6 = LD_DP(pa0 + 28); \ \ tp3i OP2 src2i * x2r; \ t7 = LD_DP(pa0 + 30); \ \ x1r = (v2f64) __msa_pckev_d((v2i64) x3, (v2i64) x2); \ x1i = (v2f64) __msa_pckod_d((v2i64) x3, (v2i64) x2); \ src1r = (v2f64) __msa_pckev_d((v2i64) t3, (v2i64) t2); \ src1i = (v2f64) __msa_pckod_d((v2i64) t3, (v2i64) t2); \ \ tp0r += src1r * x1r; \ x0 = LD_DP(x + inc_x2 * 16); \ x3r = (v2f64) __msa_pckev_d((v2i64) x7, (v2i64) x6); \ \ tp0i OP1 src1r * x1i; \ x1 = LD_DP(x + inc_x2 * 16 + 1 * inc_x2); \ x3i = (v2f64) __msa_pckod_d((v2i64) x7, (v2i64) x6); \ \ tp1r OP0 src1i * x1i; \ t0 = LD_DP(pa0 + 2 * 16); \ src3r = (v2f64) __msa_pckev_d((v2i64) t7, (v2i64) t6); \ \ tp1i OP2 src1i * x1r; \ t1 = LD_DP(pa0 + 2 * 16 + 2); \ src3i = (v2f64) __msa_pckod_d((v2i64) t7, (v2i64) t6); \ \ tp2r += src3r * x3r; \ x4 = LD_DP(x + inc_x2 * 16 + 4 * inc_x2); \ \ tp2i OP1 src3r * x3i; \ x5 = LD_DP(x + inc_x2 * 16 + 5 * inc_x2); \ \ tp3r OP0 src3i * x3i; \ t4 = LD_DP(pa0 + 2 * 16 + 8); \ \ tp3i OP2 src3i * x3r; \ t5 = LD_DP(pa0 + 2 * 16 + 10); \ \ pa0 += 2 * 16; \ x += inc_x2 * 16; \ } \ \ x0r = (v2f64) __msa_pckev_d((v2i64) x1, (v2i64) x0); \ x0i = (v2f64) __msa_pckod_d((v2i64) x1, (v2i64) x0); \ src0r = (v2f64) __msa_pckev_d((v2i64) t1, (v2i64) t0); \ src0i = (v2f64) __msa_pckod_d((v2i64) t1, (v2i64) t0); \ \ tp0r += src0r * x0r; \ x2 = LD_DP(x + 2 * inc_x2); \ x2r = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4); \ \ tp0i OP1 src0r * x0i; \ x3 = LD_DP(x + 3 * inc_x2); \ x2i = (v2f64) __msa_pckod_d((v2i64) x5, (v2i64) x4); \ \ tp1r OP0 src0i * x0i; \ t2 = LD_DP(pa0 + 4); \ src2r = (v2f64) __msa_pckev_d((v2i64) t5, (v2i64) t4); \ \ tp1i OP2 src0i * x0r; \ t3 = LD_DP(pa0 + 6); \ src2i = (v2f64) __msa_pckod_d((v2i64) t5, (v2i64) t4); \ \ tp2r += src2r * x2r; \ x6 = LD_DP(x + 6 * inc_x2); \ \ tp2i OP1 src2r * x2i; \ x7 = LD_DP(x + 7 * inc_x2); \ \ tp3r OP0 src2i * x2i; \ t6 = LD_DP(pa0 + 12); \ \ tp3i OP2 src2i * x2r; \ t7 = LD_DP(pa0 + 14); \ \ x1r = (v2f64) __msa_pckev_d((v2i64) x3, (v2i64) x2); \ x1i = (v2f64) __msa_pckod_d((v2i64) x3, (v2i64) x2); \ src1r = (v2f64) __msa_pckev_d((v2i64) t3, (v2i64) t2); \ src1i = (v2f64) __msa_pckod_d((v2i64) t3, (v2i64) t2); \ \ tp0r += src1r * x1r; \ x0 = LD_DP(x + 8 * inc_x2); \ x3r = (v2f64) __msa_pckev_d((v2i64) x7, (v2i64) x6); \ \ tp0i OP1 src1r * x1i; \ x1 = LD_DP(x + 9 * inc_x2); \ x3i = (v2f64) __msa_pckod_d((v2i64) x7, (v2i64) x6); \ \ tp1r OP0 src1i * x1i; \ t0 = LD_DP(pa0 + 16); \ src3r = (v2f64) __msa_pckev_d((v2i64) t7, (v2i64) t6); \ \ tp1i OP2 src1i * x1r; \ t1 = LD_DP(pa0 + 18); \ src3i = (v2f64) __msa_pckod_d((v2i64) t7, (v2i64) t6); \ \ tp2r += src3r * x3r; \ x4 = LD_DP(x + 12 * 
inc_x2); \ \ tp2i OP1 src3r * x3i; \ x5 = LD_DP(x + 13 * inc_x2); \ \ tp3r OP0 src3i * x3i; \ t4 = LD_DP(pa0 + 24); \ \ tp3i OP2 src3i * x3r; \ t5 = LD_DP(pa0 + 26); \ \ x0r = (v2f64) __msa_pckev_d((v2i64) x1, (v2i64) x0); \ x0i = (v2f64) __msa_pckod_d((v2i64) x1, (v2i64) x0); \ src0r = (v2f64) __msa_pckev_d((v2i64) t1, (v2i64) t0); \ src0i = (v2f64) __msa_pckod_d((v2i64) t1, (v2i64) t0); \ \ tp0r += src0r * x0r; \ x2 = LD_DP(x + 10 * inc_x2); \ x2r = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4); \ \ tp0i OP1 src0r * x0i; \ x3 = LD_DP(x + 11 * inc_x2); \ x2i = (v2f64) __msa_pckod_d((v2i64) x5, (v2i64) x4); \ \ tp1r OP0 src0i * x0i; \ t2 = LD_DP(pa0 + 20); \ src2r = (v2f64) __msa_pckev_d((v2i64) t5, (v2i64) t4); \ \ tp1i OP2 src0i * x0r; \ t3 = LD_DP(pa0 + 22); \ src2i = (v2f64) __msa_pckod_d((v2i64) t5, (v2i64) t4); \ \ tp2r += src2r * x2r; \ x6 = LD_DP(x + 14 * inc_x2); \ \ tp2i OP1 src2r * x2i; \ x7 = LD_DP(x + 15 * inc_x2); \ \ tp3r OP0 src2i * x2i; \ t6 = LD_DP(pa0 + 28); \ \ tp3i OP2 src2i * x2r; \ t7 = LD_DP(pa0 + 30); \ \ x1r = (v2f64) __msa_pckev_d((v2i64) x3, (v2i64) x2); \ x1i = (v2f64) __msa_pckod_d((v2i64) x3, (v2i64) x2); \ src1r = (v2f64) __msa_pckev_d((v2i64) t3, (v2i64) t2); \ src1i = (v2f64) __msa_pckod_d((v2i64) t3, (v2i64) t2); \ \ tp0r += src1r * x1r; \ x3r = (v2f64) __msa_pckev_d((v2i64) x7, (v2i64) x6); \ \ tp0i OP1 src1r * x1i; \ x3i = (v2f64) __msa_pckod_d((v2i64) x7, (v2i64) x6); \ \ tp1r OP0 src1i * x1i; \ src3r = (v2f64) __msa_pckev_d((v2i64) t7, (v2i64) t6); \ \ tp1i OP2 src1i * x1r; \ src3i = (v2f64) __msa_pckod_d((v2i64) t7, (v2i64) t6); \ \ tp2r += src3r * x3r; \ tp2i OP1 src3r * x3i; \ tp3r OP0 src3i * x3i; \ tp3i OP2 src3i * x3r; \ \ pa0 += 2 * 16; \ x += inc_x2 * 16; \ \ tp0r += tp1r + tp2r + tp3r; \ tp0i += tp1i + tp2i + tp3i; \ } \ \ if (m & 8) \ { \ ZLOAD_X8(); \ ZGEMV_T_8x1(); \ \ pa0 += 2 * 8; \ x += inc_x2 * 8; \ } \ \ if (m & 4) \ { \ ZLOAD_X4(); \ ZGEMV_T_4x1(); \ \ pa0 += 2 * 4; \ x += inc_x2 * 4; \ } \ \ if (m & 2) \ { \ ZLOAD_X2(); \ ZGEMV_T_2x1(); \ \ pa0 += 2 * 2; \ x += inc_x2 * 2; \ } \ \ temp0r = tp0r[0] + tp0r[1]; \ temp0i = tp0i[0] + tp0i[1]; \ \ if (m & 1) \ { \ ZGEMV_T_1x1(); \ \ pa0 += 2; \ x += inc_x2; \ } \ \ ZSCALE_STORE_Y1_GP(); \ \ A += lda2; \ y += inc_y2; \ } \ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alphar, FLOAT alphai, FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { BLASLONG i, j, pref_offset; BLASLONG inc_x2, inc_y2, lda2; FLOAT *pa0, *pa0_pref; FLOAT *srcx_org = x; FLOAT temp0r, temp0i; FLOAT res0r, res0i; v2f64 zero = {0}; v2f64 x0, x1, x2, x3, x0r, x1r, x0i, x1i; v2f64 x4, x5, x6, x7, x2r, x3r, x2i, x3i; v2f64 t0, t1, t2, t3, t4, t5, t6, t7; v2f64 src0r, src1r, src2r, src3r; v2f64 src0i, src1i, src2i, src3i; v2f64 tp0r, tp1r, tp2r, tp3r, tp0i, tp1i, tp2i, tp3i; lda2 = 2 * lda; inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; pref_offset = (uintptr_t)A & L1_DATA_LINESIZE; pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); if (2 == inc_x2) { #define ZLOAD_X8 ZLOAD_X8_VECTOR #define ZLOAD_X4 ZLOAD_X4_VECTOR #define ZLOAD_X2 ZLOAD_X2_VECTOR ZGEMV_T_MSA(); #undef ZLOAD_X8 #undef ZLOAD_X4 #undef ZLOAD_X2 } else { #define ZLOAD_X8 ZLOAD_X8_GP #define ZLOAD_X4 ZLOAD_X4_GP #define ZLOAD_X2 ZLOAD_X2_GP ZGEMV_T_MSA(); #undef ZLOAD_X8 #undef ZLOAD_X4 #undef ZLOAD_X2 } return(0); } #undef OP0 #undef OP1 #undef OP2 
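The ZGEMV_T_MSA kernel above accumulates, for each column of A, a complex dot product whose sign pattern (plain versus conjugated) is selected through the OP0/OP1/OP2 macro arguments, and then scales the accumulated sum by the complex alpha before storing into y. The following scalar sketch shows the non-conjugated ("T") case that the vector code computes; it is illustrative only and not part of the OpenBLAS sources — the function name, the plain double types and the interleaved column-major layout are assumptions made for clarity.

#include <stddef.h>

/* Illustrative scalar reference (hypothetical, not OpenBLAS code):
 * y[j] += alpha * sum_i A[i + j*lda] * x[i] for complex doubles stored as
 * interleaved (re, im) pairs in column-major order. The MSA kernel above
 * computes the same per-column sums several elements at a time and flips
 * the signs of the imaginary partial products (selected through the
 * OP0/OP1/OP2 macro arguments) to obtain the conjugated variants. */
static void zgemv_t_ref(size_t m, size_t n,
                        double alpha_r, double alpha_i,
                        const double *a, size_t lda,  /* lda >= m */
                        const double *x, double *y)
{
    for (size_t j = 0; j < n; j++) {
        const double *col = a + 2 * j * lda;
        double acc_r = 0.0, acc_i = 0.0;

        for (size_t i = 0; i < m; i++) {
            double ar = col[2 * i], ai = col[2 * i + 1];
            double xr = x[2 * i],   xi = x[2 * i + 1];
            acc_r += ar * xr - ai * xi;   /* real part of a*x */
            acc_i += ar * xi + ai * xr;   /* imag part of a*x */
        }

        /* y[j] += alpha * acc (complex multiply-accumulate), mirroring the
         * alpha_r/alpha_i scaling done in ZSCALE_STORE_Y1_GP above. */
        y[2 * j]     += alpha_r * acc_r - alpha_i * acc_i;
        y[2 * j + 1] += alpha_r * acc_i + alpha_i * acc_r;
    }
}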
OpenBLAS-0.2.20/kernel/mips/znrm2.c000066400000000000000000000047331313527062700166170ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; FLOAT scale = 0.0; FLOAT ssq = 1.0; BLASLONG inc_x2; FLOAT temp; if (n <= 0 || inc_x <= 0) return(0.0); inc_x2 = 2 * inc_x; n *= inc_x2; while(i < n) { if ( x[i] != 0.0 ) { temp = ABS( x[i] ); if ( scale < temp ) { ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); scale = temp ; } else { ssq += ( temp / scale ) * ( temp / scale ); } } if ( x[i+1] != 0.0 ) { temp = ABS( x[i+1] ); if ( scale < temp ) { ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); scale = temp ; } else { ssq += ( temp / scale ) * ( temp / scale ); } } i += inc_x2; } scale = scale * sqrt( ssq ); return(scale); } OpenBLAS-0.2.20/kernel/mips/zomatcopy_cn.c000066400000000000000000000042201313527062700202430ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) { BLASLONG i,j,ia; FLOAT *aptr,*bptr; if ( rows <= 0 ) return(0); if ( cols <= 0 ) return(0); aptr = a; bptr = b; lda *= 2; ldb *= 2; for ( i=0; i> 1); j--;) \ { \ ST_DP2_INC(zero, zero, px, inc_x2); \ ST_DP2_INC(zero, zero, py, inc_y2); \ } \ \ if (n & 1) \ { \ ST_DP(zero, px); \ ST_DP(zero, py); \ } \ } \ else if ((1 == c) && (1 == s)) \ { \ /* process 8 elements */ \ if (n >> 3) \ { \ BLASLONG pref_offsetx, pref_offsety; \ \ pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); \ if (pref_offsetx > 0) \ { \ pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; \ pref_offsetx = pref_offsetx / sizeof(FLOAT); \ } \ \ pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); \ if (pref_offsety > 0) \ { \ pref_offsety = L1_DATA_LINESIZE - pref_offsety; \ pref_offsety = pref_offsety / sizeof(FLOAT); \ } \ \ x0 = LD_DP(px); px += inc_x2; \ x1 = LD_DP(px); px += inc_x2; \ x2 = LD_DP(px); px += inc_x2; \ x3 = LD_DP(px); px += inc_x2; \ y0 = LD_DP(py); py += inc_y2; \ y1 = LD_DP(py); py += inc_y2; \ y2 = LD_DP(py); py += inc_y2; \ y3 = LD_DP(py); py += inc_y2; \ \ for (j = (n >> 3) - 1; j--;) \ { \ PREFETCH(px + pref_offsetx + 16); \ PREFETCH(px + pref_offsetx + 20); \ PREFETCH(px + pref_offsetx + 24); \ PREFETCH(px + pref_offsetx + 28); \ PREFETCH(py + pref_offsety + 16); \ PREFETCH(py + pref_offsety + 20); \ PREFETCH(py + pref_offsety + 24); \ PREFETCH(py + pref_offsety + 28); \ \ out0 = x0 + y0; \ x4 = LD_DP(px); px += inc_x2; \ out1 = y0 - x0; \ x5 = LD_DP(px); px += inc_x2; \ out2 = x1 + y1; \ x6 = LD_DP(px); px += inc_x2; \ out3 = y1 - x1; \ x7 = LD_DP(px); px += inc_x2; \ out4 = x2 + y2; \ y4 = LD_DP(py); py += inc_y2; \ out5 = y2 - x2; \ y5 = LD_DP(py); py += inc_y2; \ out6 = x3 + y3; \ y6 = LD_DP(py); py += inc_y2; \ out7 = y3 - x3; \ y7 = LD_DP(py); py += inc_y2; \ \ ST_DP(out0, x); x += inc_x2; \ out8 = x4 + y4; \ ST_DP(out1, y); y += inc_y2; \ out9 = y4 - x4; \ ST_DP(out2, x); x += inc_x2; \ out10 = x5 + y5; \ ST_DP(out3, y); y += inc_y2; \ out11 = y5 - x5; \ ST_DP(out4, x); x += inc_x2; \ out12 = x6 + y6; \ ST_DP(out5, y); y += inc_y2; \ out13 = y6 - x6; \ ST_DP(out6, x); x += inc_x2; \ out14 = x7 + y7; \ ST_DP(out7, y); y += inc_y2; \ out15 = y7 - x7; \ \ x0 = LD_DP(px); px += inc_x2; \ ST_DP(out8, x); x += inc_x2; \ x1 = LD_DP(px); px += inc_x2; \ ST_DP(out10, x); x += inc_x2; \ x2 = LD_DP(px); px += inc_x2; \ ST_DP(out12, x); x += inc_x2; \ x3 = LD_DP(px); px += inc_x2; \ ST_DP(out14, x); x += inc_x2; \ \ y0 = LD_DP(py); py += inc_y2; \ ST_DP(out9, y); y += inc_y2; \ y1 = LD_DP(py); py += inc_y2; \ ST_DP(out11, y); y += inc_y2; \ y2 = 
LD_DP(py); py += inc_y2; \ ST_DP(out13, y); y += inc_y2; \ y3 = LD_DP(py); py += inc_y2; \ ST_DP(out15, y); y += inc_y2; \ } \ \ x4 = LD_DP(px); px += inc_x2; \ x5 = LD_DP(px); px += inc_x2; \ x6 = LD_DP(px); px += inc_x2; \ x7 = LD_DP(px); px += inc_x2; \ y4 = LD_DP(py); py += inc_y2; \ y5 = LD_DP(py); py += inc_y2; \ y6 = LD_DP(py); py += inc_y2; \ y7 = LD_DP(py); py += inc_y2; \ \ out0 = x0 + y0; \ out1 = y0 - x0; \ out2 = x1 + y1; \ out3 = y1 - x1; \ out4 = x2 + y2; \ out5 = y2 - x2; \ out6 = x3 + y3; \ out7 = y3 - x3; \ out8 = x4 + y4; \ out9 = y4 - x4; \ out10 = x5 + y5; \ out11 = y5 - x5; \ out12 = x6 + y6; \ out13 = y6 - x6; \ out14 = x7 + y7; \ out15 = y7 - x7; \ \ ST_DP8_INC(out0, out2, out4, out6, out8, out10, out12, out14, x, inc_x2); \ ST_DP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, inc_y2); \ } \ if (n & 4) \ { \ LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \ LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \ \ out0 = x0 + y0; \ out1 = y0 - x0; \ out2 = x1 + y1; \ out3 = y1 - x1; \ out4 = x2 + y2; \ out5 = y2 - x2; \ out6 = x3 + y3; \ out7 = y3 - x3; \ \ ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \ ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \ } \ if (n & 2) \ { \ LD_DP2_INC(px, inc_x2, x0, x1); \ LD_DP2_INC(py, inc_y2, y0, y1); \ \ out0 = x0 + y0; \ out1 = y0 - x0; \ out2 = x1 + y1; \ out3 = y1 - x1; \ \ ST_DP2_INC(out0, out2, x, inc_x2); \ ST_DP2_INC(out1, out3, y, inc_y2); \ } \ if (n & 1) \ { \ x0 = LD_DP(px); \ y0 = LD_DP(py); \ \ out0 = x0 + y0; \ out1 = y0 - x0; \ \ ST_DP(out0, px); \ ST_DP(out1, py); \ } \ } \ else if (0 == s) \ { \ c0 = COPY_DOUBLE_TO_VECTOR(c); \ \ if (n >> 3) \ { \ BLASLONG pref_offsetx, pref_offsety; \ \ pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); \ if (pref_offsetx > 0) \ { \ pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; \ pref_offsetx = pref_offsetx / sizeof(FLOAT); \ } \ \ pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); \ if (pref_offsety > 0) \ { \ pref_offsety = L1_DATA_LINESIZE - pref_offsety; \ pref_offsety = pref_offsety / sizeof(FLOAT); \ } \ \ LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); \ \ for (j = (n >> 3) - 1; j--;) \ { \ PREFETCH(px + pref_offsetx + 16); \ PREFETCH(px + pref_offsetx + 20); \ PREFETCH(px + pref_offsetx + 24); \ PREFETCH(px + pref_offsetx + 28); \ PREFETCH(py + pref_offsety + 16); \ PREFETCH(py + pref_offsety + 20); \ PREFETCH(py + pref_offsety + 24); \ PREFETCH(py + pref_offsety + 28); \ \ y0 = LD_DP(py); py += inc_y2; \ x0 *= c0; \ y1 = LD_DP(py); py += inc_y2; \ x1 *= c0; \ y2 = LD_DP(py); py += inc_y2; \ x2 *= c0; \ y3 = LD_DP(py); py += inc_y2; \ x3 *= c0; \ y4 = LD_DP(py); py += inc_y2; \ x4 *= c0; \ y5 = LD_DP(py); py += inc_y2; \ x5 *= c0; \ y6 = LD_DP(py); py += inc_y2; \ x6 *= c0; \ y7 = LD_DP(py); py += inc_y2; \ x7 *= c0; \ \ ST_DP(x0, x); x += inc_x2; \ y0 *= c0; \ ST_DP(x1, x); x += inc_x2; \ y1 *= c0; \ ST_DP(x2, x); x += inc_x2; \ y2 *= c0; \ ST_DP(x3, x); x += inc_x2; \ y3 *= c0; \ ST_DP(x4, x); x += inc_x2; \ y4 *= c0; \ ST_DP(x5, x); x += inc_x2; \ y5 *= c0; \ ST_DP(x6, x); x += inc_x2; \ y6 *= c0; \ ST_DP(x7, x); x += inc_x2; \ y7 *= c0; \ \ x0 = LD_DP(px); px += inc_x2; \ ST_DP(y0, y); y += inc_y2; \ x1 = LD_DP(px); px += inc_x2; \ ST_DP(y1, y); y += inc_y2; \ x2 = LD_DP(px); px += inc_x2; \ ST_DP(y2, y); y += inc_y2; \ x3 = LD_DP(px); px += inc_x2; \ ST_DP(y3, y); y += inc_y2; \ x4 = LD_DP(px); px += inc_x2; \ ST_DP(y4, y); y += inc_y2; \ x5 = LD_DP(px); px += inc_x2; \ ST_DP(y5, y); y += inc_y2; \ x6 = LD_DP(px); px += inc_x2; \ ST_DP(y6, y); y += inc_y2; \ 
x7 = LD_DP(px); px += inc_x2; \ ST_DP(y7, y); y += inc_y2; \ } \ \ LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7); \ \ x0 *= c0; \ y0 *= c0; \ x1 *= c0; \ y1 *= c0; \ x2 *= c0; \ y2 *= c0; \ x3 *= c0; \ y3 *= c0; \ x4 *= c0; \ y4 *= c0; \ x5 *= c0; \ y5 *= c0; \ x6 *= c0; \ y6 *= c0; \ x7 *= c0; \ y7 *= c0; \ \ ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2); \ ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2); \ } \ \ if (n & 4) \ { \ LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \ LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \ \ out0 = c0 * x0; \ out1 = c0 * y0; \ out2 = c0 * x1; \ out3 = c0 * y1; \ out4 = c0 * x2; \ out5 = c0 * y2; \ out6 = c0 * x3; \ out7 = c0 * y3; \ \ ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \ ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \ } \ if (n & 2) \ { \ LD_DP2_INC(px, inc_x2, x0, x1); \ LD_DP2_INC(py, inc_y2, y0, y1); \ \ out0 = c0 * x0; \ out1 = c0 * y0; \ out2 = c0 * x1; \ out3 = c0 * y1; \ \ ST_DP2_INC(out0, out2, x, inc_x2); \ ST_DP2_INC(out1, out3, y, inc_y2); \ } \ if (n & 1) \ { \ x0 = LD_DP(px); \ y0 = LD_DP(py); \ \ out0 = c0 * x0; \ out1 = c0 * y0; \ \ ST_DP(out0, px); \ ST_DP(out1, py); \ } \ } \ else if (0 == c) \ { \ s0 = COPY_DOUBLE_TO_VECTOR(s); \ \ /* process 16 floats */ \ if (n >> 3) \ { \ BLASLONG pref_offsetx, pref_offsety; \ \ pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); \ if (pref_offsetx > 0) \ { \ pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; \ pref_offsetx = pref_offsetx / sizeof(FLOAT); \ } \ \ pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); \ if (pref_offsety > 0) \ { \ pref_offsety = L1_DATA_LINESIZE - pref_offsety; \ pref_offsety = pref_offsety / sizeof(FLOAT); \ } \ \ LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \ LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \ \ for (j = (n >> 3) - 1; j--;) \ { \ PREFETCH(px + pref_offsetx + 16); \ PREFETCH(px + pref_offsetx + 20); \ PREFETCH(px + pref_offsetx + 24); \ PREFETCH(px + pref_offsetx + 28); \ PREFETCH(py + pref_offsety + 16); \ PREFETCH(py + pref_offsety + 20); \ PREFETCH(py + pref_offsety + 24); \ PREFETCH(py + pref_offsety + 28); \ \ x4 = LD_DP(px); px += inc_x2; \ out0 = s0 * y0; \ x5 = LD_DP(px); px += inc_x2; \ out2 = s0 * y1; \ x6 = LD_DP(px); px += inc_x2; \ out4 = s0 * y2; \ x7 = LD_DP(px); px += inc_x2; \ out6 = s0 * y3; \ y4 = LD_DP(py); py += inc_y2; \ out1 = -(s0 * x0); \ y5 = LD_DP(py); py += inc_y2; \ out3 = -(s0 * x1); \ y6 = LD_DP(py); py += inc_y2; \ out5 = -(s0 * x2); \ y7 = LD_DP(py); py += inc_y2; \ out7 = -(s0 * x3); \ \ ST_DP(out0, x); x += inc_y2; \ out0 = s0 * y4; \ ST_DP(out2, x); x += inc_y2; \ out2 = s0 * y5; \ ST_DP(out4, x); x += inc_y2; \ out4 = s0 * y6; \ ST_DP(out6, x); x += inc_y2; \ out6 = s0 * y7; \ ST_DP(out1, y); y += inc_y2; \ out1 = -(s0 * x4); \ ST_DP(out3, y); y += inc_y2; \ out3 = -(s0 * x5); \ ST_DP(out5, y); y += inc_y2; \ out5 = -(s0 * x6); \ ST_DP(out7, y); y += inc_y2; \ out7 = -(s0 * x7); \ \ x0 = LD_DP(px); px += inc_x2; \ ST_DP(out0, x); x += inc_y2; \ x1 = LD_DP(px); px += inc_x2; \ ST_DP(out2, x); x += inc_y2; \ x2 = LD_DP(px); px += inc_x2; \ ST_DP(out4, x); x += inc_y2; \ x3 = LD_DP(px); px += inc_x2; \ ST_DP(out6, x); x += inc_y2; \ y0 = LD_DP(py); py += inc_y2; \ ST_DP(out1, y); y += inc_y2; \ y1 = LD_DP(py); py += inc_y2; \ ST_DP(out3, y); y += inc_y2; \ y2 = LD_DP(py); py += inc_y2; \ ST_DP(out5, y); y += inc_y2; \ y3 = LD_DP(py); py += inc_y2; \ ST_DP(out7, y); y += inc_y2; \ } \ \ out0 = s0 * y0; \ out2 = s0 * y1; \ out4 = s0 * y2; \ out6 = s0 * y3; \ out1 = -(s0 * x0); \ out3 = -(s0 * x1); \ out5 = -(s0 
* x2); \ out7 = -(s0 * x3); \ \ ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \ ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \ \ LD_DP4_INC(px, inc_x2, x4, x5, x6, x7); \ LD_DP4_INC(py, inc_y2, y4, y5, y6, y7); \ \ out0 = s0 * y4; \ out2 = s0 * y5; \ out4 = s0 * y6; \ out6 = s0 * y7; \ out1 = -(s0 * x4); \ out3 = -(s0 * x5); \ out5 = -(s0 * x6); \ out7 = -(s0 * x7); \ \ ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \ ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \ } \ if (n & 4) \ { \ LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \ LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \ \ out0 = s0 * y0; \ out1 = - (s0 * x0); \ out2 = s0 * y1; \ out3 = - (s0 * x1); \ out4 = s0 * y2; \ out5 = - (s0 * x2); \ out6 = s0 * y3; \ out7 = - (s0 * x3); \ \ ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \ ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \ } \ if (n & 2) \ { \ LD_DP2_INC(px, inc_x2, x0, x1); \ LD_DP2_INC(py, inc_y2, y0, y1); \ \ out0 = s0 * y0; \ out1 = - (s0 * x0); \ out2 = s0 * y1; \ out3 = - (s0 * x1); \ \ ST_DP2_INC(out0, out2, x, inc_x2); \ ST_DP2_INC(out1, out3, y, inc_y2); \ } \ if (n & 1) \ { \ x0 = LD_DP(px); px += inc_x2; \ y0 = LD_DP(py); py += inc_y2; \ \ out0 = s0 * y0; \ out1 = - (s0 * x0); \ \ ST_DP(out0, x); x += inc_x2; \ ST_DP(out1, y); y += inc_y2; \ } \ } \ else \ { \ c0 = COPY_DOUBLE_TO_VECTOR(c); \ s0 = COPY_DOUBLE_TO_VECTOR(s); \ \ if (n >> 3) \ { \ BLASLONG pref_offsetx, pref_offsety; \ \ pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); \ if (pref_offsetx > 0) \ { \ pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; \ pref_offsetx = pref_offsetx / sizeof(FLOAT); \ } \ \ pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); \ if (pref_offsety > 0) \ { \ pref_offsety = L1_DATA_LINESIZE - pref_offsety; \ pref_offsety = pref_offsety / sizeof(FLOAT); \ } \ \ LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \ LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \ \ for (j = (n >> 3) - 1; j--;) \ { \ PREFETCH(px + pref_offsetx + 16); \ PREFETCH(px + pref_offsetx + 20); \ PREFETCH(px + pref_offsetx + 24); \ PREFETCH(px + pref_offsetx + 28); \ PREFETCH(py + pref_offsety + 16); \ PREFETCH(py + pref_offsety + 20); \ PREFETCH(py + pref_offsety + 24); \ PREFETCH(py + pref_offsety + 28); \ \ x4 = LD_DP(px); px += inc_x2; \ out0 = c0 * x0; \ x5 = LD_DP(px); px += inc_x2; \ out2 = c0 * x1; \ x6 = LD_DP(px); px += inc_x2; \ out4 = c0 * x2; \ x7 = LD_DP(px); px += inc_x2; \ out6 = c0 * x3; \ y4 = LD_DP(py); py += inc_y2; \ out1 = c0 * y0; \ y5 = LD_DP(py); py += inc_y2; \ out3 = c0 * y1; \ y6 = LD_DP(py); py += inc_y2; \ out5 = c0 * y2; \ y7 = LD_DP(py); py += inc_y2; \ out7 = c0 * y3; \ \ out0 += s0 * y0; \ out2 += s0 * y1; \ out4 += s0 * y2; \ out6 += s0 * y3; \ out1 -= s0 * x0; \ out3 -= s0 * x1; \ out5 -= s0 * x2; \ out7 -= s0 * x3; \ \ ST_DP(out0, x); x += inc_x2; \ out0 = c0 * x4; \ ST_DP(out2, x); x += inc_x2; \ out2 = c0 * x5; \ ST_DP(out4, x); x += inc_x2; \ out4 = c0 * x6; \ ST_DP(out6, x); x += inc_x2; \ out6 = c0 * x7; \ ST_DP(out1, y); y += inc_y2; \ out1 = c0 * y4; \ ST_DP(out3, y); y += inc_y2; \ out3 = c0 * y5; \ ST_DP(out5, y); y += inc_y2; \ out5 = c0 * y6; \ ST_DP(out7, y); y += inc_y2; \ out7 = c0 * y7; \ \ x0 = LD_DP(px); px += inc_x2; \ out0 += s0 * y4; \ x1 = LD_DP(px); px += inc_x2; \ out2 += s0 * y5; \ x2 = LD_DP(px); px += inc_x2; \ out4 += s0 * y6; \ x3 = LD_DP(px); px += inc_x2; \ out6 += s0 * y7; \ y0 = LD_DP(py); py += inc_y2; \ out1 -= s0 * x4; \ y1 = LD_DP(py); py += inc_y2; \ out3 -= s0 * x5; \ y2 = LD_DP(py); py += inc_y2; \ out5 -= s0 * x6; \ y3 = LD_DP(py); py += inc_y2; \ out7 
-= s0 * x7; \ \ ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \ ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \ } \ \ out0 = c0 * x0; \ out0 += s0 * y0; \ out1 = c0 * y0; \ out1 -= s0 * x0; \ out2 = c0 * x1; \ out2 += s0 * y1; \ out3 = c0 * y1; \ out3 -= s0 * x1; \ out4 = c0 * x2; \ out4 += s0 * y2; \ out5 = c0 * y2; \ out5 -= s0 * x2; \ out6 = c0 * x3; \ out6 += s0 * y3; \ out7 = c0 * y3; \ out7 -= s0 * x3; \ \ ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \ ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \ \ LD_DP4_INC(px, inc_x2, x4, x5, x6, x7); \ LD_DP4_INC(py, inc_y2, y4, y5, y6, y7); \ \ out8 = c0 * x4; \ out8 += s0 * y4; \ out9 = c0 * y4; \ out9 -= s0 * x4; \ out10 = c0 * x5; \ out10 += s0 * y5; \ out11 = c0 * y5; \ out11 -= s0 * x5; \ out12 = c0 * x6; \ out12 += s0 * y6; \ out13 = c0 * y6; \ out13 -= s0 * x6; \ out14 = c0 * x7; \ out14 += s0 * y7; \ out15 = c0 * y7; \ out15 -= s0 * x7; \ \ ST_DP4_INC(out8, out10, out12, out14, x, inc_x2); \ ST_DP4_INC(out9, out11, out13, out15, y, inc_y2); \ } \ if (n & 4) \ { \ LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \ LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \ \ out0 = (c0 * x0) + (s0 * y0); \ out1 = (c0 * y0) - (s0 * x0); \ out2 = (c0 * x1) + (s0 * y1); \ out3 = (c0 * y1) - (s0 * x1); \ out4 = (c0 * x2) + (s0 * y2); \ out5 = (c0 * y2) - (s0 * x2); \ out6 = (c0 * x3) + (s0 * y3); \ out7 = (c0 * y3) - (s0 * x3); \ \ ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \ ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \ } \ if (n & 2) \ { \ LD_DP2_INC(px, inc_x2, x0, x1); \ LD_DP2_INC(py, inc_y2, y0, y1); \ \ out0 = (c0 * x0) + (s0 * y0); \ out1 = (c0 * y0) - (s0 * x0); \ out2 = (c0 * x1) + (s0 * y1); \ out3 = (c0 * y1) - (s0 * x1); \ \ ST_DP2_INC(out0, out2, x, inc_x2); \ ST_DP2_INC(out1, out3, y, inc_y2); \ } \ if (n & 1) \ { \ x0 = LD_DP(px); \ y0 = LD_DP(py); \ \ out0 = (c0 * x0) + (s0 * y0); \ out1 = (c0 * y0) - (s0 * x0); \ \ ST_DP(out0, px); \ ST_DP(out1, py); \ } \ } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) { BLASLONG j; FLOAT *px, *py; v2f64 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7; v2f64 out0, out1, out2, out3, out4, out5, out6, out7, c0, s0; v2f64 out8, out9, out10, out11, out12, out13, out14, out15; px = x; py = y; if ((1 == inc_x) && (1 == inc_y)) { PROCESS_ZROT(2, 2); } else { inc_x *= 2; inc_y *= 2; PROCESS_ZROT(inc_x, inc_y); } return 0; } OpenBLAS-0.2.20/kernel/mips/zscal.c000066400000000000000000000045301313527062700166560ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; BLASLONG inc_x2; BLASLONG ip = 0; FLOAT temp; inc_x2 = 2 * inc_x; for ( i=0; i 2 3 0 1 */ #define SHF_78 78 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i, inc_x2; FLOAT *px; FLOAT tp0, tp1, f0, f1; v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; v2f64 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15; v2f64 da_i_vec, da_i_vec_neg, da_r_vec; px = x; if (1 == inc_x) { if ((0.0 == da_r) && (0.0 == da_i)) { v2f64 zero_v = __msa_cast_to_vector_double(0); zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); for (i = (n >> 4); i--;) { ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, x, 2); ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, x, 2); } if (n & 15) { if (n & 8) { ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, x, 2); } if (n & 4) { ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, 2); } if (n & 2) { ST_DP2_INC(zero_v, zero_v, x, 2); } if (n & 1) { ST_DP(zero_v, x); } } } else if (0.0 == da_r) { da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i); da_i_vec_neg = -da_i_vec; da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec); if (n > 15) { FLOAT *x_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 32 + 16; LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); for (i = (n >> 4)- 1; i--;) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(x_pref, 128); PREF_OFFSET(x_pref, 160); PREF_OFFSET(x_pref, 192); PREF_OFFSET(x_pref, 224); x_pref += 32; x8 = LD_DP(px); px += 2; x0 *= da_i_vec; x9 = LD_DP(px); px += 2; x1 *= da_i_vec; x10 = LD_DP(px); px += 2; x2 *= da_i_vec; x11 = LD_DP(px); px += 2; x3 *= da_i_vec; x12 = LD_DP(px); px += 2; x4 *= da_i_vec; x13 = LD_DP(px); px += 2; x5 *= da_i_vec; x0 = (v2f64) __msa_shf_w((v4i32) x0, SHF_78); x14 = LD_DP(px); px += 2; x6 *= da_i_vec; x1 = (v2f64) __msa_shf_w((v4i32) x1, SHF_78); x15 = LD_DP(px); px += 2; x7 *= da_i_vec; x2 = (v2f64) __msa_shf_w((v4i32) x2, SHF_78); x8 *= da_i_vec; x3 = (v2f64) __msa_shf_w((v4i32) x3, SHF_78); ST_DP(x0, x); x += 2; x9 *= da_i_vec; x4 = (v2f64) __msa_shf_w((v4i32) 
x4, SHF_78); ST_DP(x1, x); x += 2; x10 *= da_i_vec; x5 = (v2f64) __msa_shf_w((v4i32) x5, SHF_78); ST_DP(x2, x); x += 2; x11 *= da_i_vec; x6 = (v2f64) __msa_shf_w((v4i32) x6, SHF_78); ST_DP(x3, x); x += 2; x12 *= da_i_vec; x7 = (v2f64) __msa_shf_w((v4i32) x7, SHF_78); ST_DP(x4, x); x += 2; x13 *= da_i_vec; x8 = (v2f64) __msa_shf_w((v4i32) x8, SHF_78); ST_DP(x5, x); x += 2; x14 *= da_i_vec; x9 = (v2f64) __msa_shf_w((v4i32) x9, SHF_78); ST_DP(x6, x); x += 2; x15 *= da_i_vec; x10 = (v2f64) __msa_shf_w((v4i32) x10, SHF_78); ST_DP(x7, x); x += 2; x11 = (v2f64) __msa_shf_w((v4i32) x11, SHF_78); ST_DP(x8, x); x += 2; x0 = LD_DP(px); px += 2; x12 = (v2f64) __msa_shf_w((v4i32) x12, SHF_78); ST_DP(x9, x); x += 2; x1 = LD_DP(px); px += 2; x13 = (v2f64) __msa_shf_w((v4i32) x13, SHF_78); ST_DP(x10, x); x += 2; x2 = LD_DP(px); px += 2; x14 = (v2f64) __msa_shf_w((v4i32) x14, SHF_78); ST_DP(x11, x); x += 2; x3 = LD_DP(px); px += 2; x15 = (v2f64) __msa_shf_w((v4i32) x15, SHF_78); ST_DP(x12, x); x += 2; x4 = LD_DP(px); px += 2; ST_DP(x13, x); x += 2; x5 = LD_DP(px); px += 2; ST_DP(x14, x); x += 2; x6 = LD_DP(px); px += 2; ST_DP(x15, x); x += 2; x7 = LD_DP(px); px += 2; } LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15); MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, x0, x1, x2, x3); MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, x4, x5, x6, x7); MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec, x8, x9, x10, x11); MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec, x12, x13, x14, x15); SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78); SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78); SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78); SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78); ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x, 2); } if (n & 15) { if (n & 8) { LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, x0, x1, x2, x3); MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, x4, x5, x6, x7); SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78); SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78); ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2); } if (n & 4) { LD_DP4_INC(px, 2, x0, x1, x2, x3); MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, x0, x1, x2, x3); SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78); ST_DP4_INC(x0, x1, x2, x3, x, 2); } if (n & 2) { LD_DP2_INC(px, 2, x0, x1); MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1); SHF_W2_DP(x0, x1, x0, x1, SHF_78); ST_DP2_INC(x0, x1, x, 2); } if (n & 1) { LD_GP2_INC(px, 1, f0, f1); MUL2(f0, da_i, f1, -da_i, f0, f1); ST_GP2_INC(f1, f0, x, 1); } } } else if (0.0 == da_i) { da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r); if (n > 15) { FLOAT *x_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 32 + 16; LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); for (i = (n >> 4)- 1; i--;) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(x_pref, 128); PREF_OFFSET(x_pref, 160); PREF_OFFSET(x_pref, 192); PREF_OFFSET(x_pref, 224); x_pref += 32; x8 = LD_DP(px); px += 2; x0 *= da_r_vec; x9 = LD_DP(px); px += 2; x1 *= da_r_vec; x10 = LD_DP(px); px += 2; x2 *= da_r_vec; x11 = LD_DP(px); px += 2; x3 *= da_r_vec; x12 = LD_DP(px); px += 2; x4 *= da_r_vec; x13 = LD_DP(px); px 
+= 2; x5 *= da_r_vec; ST_DP(x0, x); x += 2; x14 = LD_DP(px); px += 2; x6 *= da_r_vec; ST_DP(x1, x); x += 2; x15 = LD_DP(px); px += 2; x7 *= da_r_vec; ST_DP(x2, x); x += 2; x8 *= da_r_vec; ST_DP(x3, x); x += 2; x9 *= da_r_vec; ST_DP(x4, x); x += 2; x10 *= da_r_vec; ST_DP(x5, x); x += 2; x11 *= da_r_vec; ST_DP(x6, x); x += 2; x12 *= da_r_vec; ST_DP(x7, x); x += 2; x13 *= da_r_vec; ST_DP(x8, x); x += 2; x0 = LD_DP(px); px += 2; x14 *= da_r_vec; ST_DP(x9, x); x += 2; x1 = LD_DP(px); px += 2; x15 *= da_r_vec; ST_DP(x10, x); x += 2; x2 = LD_DP(px); px += 2; ST_DP(x11, x); x += 2; x3 = LD_DP(px); px += 2; ST_DP(x12, x); x += 2; x4 = LD_DP(px); px += 2; ST_DP(x13, x); x += 2; x5 = LD_DP(px); px += 2; ST_DP(x14, x); x += 2; x6 = LD_DP(px); px += 2; ST_DP(x15, x); x += 2; x7 = LD_DP(px); px += 2; } LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15); MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, x0, x1, x2, x3); MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec, x4, x5, x6, x7); MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec, x8, x9, x10, x11); MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec, x12, x13, x14, x15); ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x, 2); } if (n & 15) { if (n & 8) { LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, x0, x1, x2, x3); MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec, x4, x5, x6, x7); ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2); } if (n & 4) { LD_DP4_INC(px, 2, x0, x1, x2, x3); MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, x0, x1, x2, x3); ST_DP4_INC(x0, x1, x2, x3, x, 2); } if (n & 2) { LD_DP2_INC(px, 2, x0, x1); MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1); ST_DP2_INC(x0, x1, x, 2); } if (n & 1) { LD_GP2_INC(px, 1, f0, f1); MUL2(f0, da_r, f1, da_r, f0, f1); ST_GP2_INC(f0, f1, x, 1); } } } else { FLOAT *x_pref; BLASLONG pref_offset; pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); if (pref_offset > 0) { pref_offset = L1_DATA_LINESIZE - pref_offset; pref_offset = pref_offset / sizeof(FLOAT); } x_pref = x + pref_offset + 32; da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i); da_i_vec_neg = -da_i_vec; da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec); da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r); for (i = (n >> 4); i--;) { PREF_OFFSET(x_pref, 0); PREF_OFFSET(x_pref, 32); PREF_OFFSET(x_pref, 64); PREF_OFFSET(x_pref, 96); PREF_OFFSET(x_pref, 128); PREF_OFFSET(x_pref, 160); PREF_OFFSET(x_pref, 192); PREF_OFFSET(x_pref, 224); x_pref += 32; LD_DP16_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15); MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, d0, d1, d2, d3); MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, d4, d5, d6, d7); MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec, d8, d9, d10, d11); MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec, d12, d13, d14, d15); SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78); SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78); SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78); SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78); FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7); FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11); FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15); ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15, x, 2); } if (n & 15) { if (n 
& 8) { LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, d0, d1, d2, d3); MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, d4, d5, d6, d7); SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78); SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78); FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7); ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, 2); } if (n & 4) { LD_DP4_INC(px, 2, x0, x1, x2, x3); MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, d0, d1, d2, d3); SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78); FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); ST_DP4_INC(d0, d1, d2, d3, x, 2); } if (n & 2) { LD_DP2_INC(px, 2, x0, x1); MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1); SHF_W2_DP(d0, d1, d0, d1, SHF_78); FMADD2(x0, x1, da_r_vec, d0, d1); ST_DP2_INC(d0, d1, x, 2); } if (n & 1) { LD_GP2_INC(px, 1, f0, f1); tp0 = da_r * f0; tp0 -= da_i * f1; tp1 = da_r * f1; tp1 += da_i * f0; ST_GP2_INC(tp0, tp1, x, 1); } } } } else { inc_x2 = 2 * inc_x; if ((0.0 == da_r) && (0.0 == da_i)) { v2f64 zero_v = __msa_cast_to_vector_double(0); zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); for (i = (n >> 4); i--;) { ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, x, inc_x2); ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, x, inc_x2); } if (n & 15) { if (n & 8) { ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, x, inc_x2); } if (n & 4) { ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, inc_x2); } if (n & 2) { ST_DP2_INC(zero_v, zero_v, x, inc_x2); } if (n & 1) { ST_DP(zero_v, x); } } } else if (0.0 == da_r) { da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i); da_i_vec_neg = -da_i_vec; da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec); for (i = (n >> 4); i--;) { LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15); MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, x0, x1, x2, x3); MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, x4, x5, x6, x7); MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec, x8, x9, x10, x11); MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec, x12, x13, x14, x15); SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78); SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78); SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78); SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78); ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x, inc_x2); } if (n & 15) { if (n & 8) { LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, x0, x1, x2, x3); MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, x4, x5, x6, x7); SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78); SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78); ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2); } if (n & 4) { LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, x0, x1, x2, x3); SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78); ST_DP4_INC(x0, x1, x2, x3, x, inc_x2); } if (n & 2) { LD_DP2_INC(px, inc_x2, x0, x1); MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1); SHF_W2_DP(x0, x1, x0, x1, SHF_78); ST_DP2_INC(x0, x1, x, inc_x2); } if (n & 1) { LD_GP2_INC(px, 1, f0, f1); MUL2(f0, da_i, f1, -da_i, f0, f1); ST_GP2_INC(f1, f0, 
x, 1); } } } else if (0.0 == da_i) { da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r); for (i = (n >> 4); i--;) { LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15); MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, x0, x1, x2, x3); MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec, x4, x5, x6, x7); MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec, x8, x9, x10, x11); MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec, x12, x13, x14, x15); ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x, inc_x2); } if (n & 15) { if (n & 8) { LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, x0, x1, x2, x3); MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec, x4, x5, x6, x7); ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2); } if (n & 4) { LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, x0, x1, x2, x3); ST_DP4_INC(x0, x1, x2, x3, x, inc_x2); } if (n & 2) { LD_DP2_INC(px, inc_x2, x0, x1); MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1); ST_DP2_INC(x0, x1, x, inc_x2); } if (n & 1) { LD_GP2_INC(px, 1, f0, f1); MUL2(f0, da_r, f1, da_r, f0, f1); ST_GP2_INC(f0, f1, x, 1); } } } else { da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i); da_i_vec_neg = -da_i_vec; da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec); da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r); for (i = (n >> 4); i--;) { LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15); MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, d0, d1, d2, d3); MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, d4, d5, d6, d7); MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec, d8, d9, d10, d11); MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec, d12, d13, d14, d15); SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78); SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78); SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78); SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78); FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7); FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11); FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15); ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15, x, inc_x2); } if (n & 15) { if (n & 8) { LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, d0, d1, d2, d3); MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, d4, d5, d6, d7); SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78); SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78); FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7); ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, inc_x2); } if (n & 4) { LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, d0, d1, d2, d3); SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78); FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); ST_DP4_INC(d0, d1, d2, d3, x, inc_x2); } if (n & 2) { LD_DP2_INC(px, inc_x2, x0, x1); MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1); SHF_W2_DP(d0, d1, d0, d1, SHF_78); FMADD2(x0, x1, da_r_vec, d0, d1); ST_DP2_INC(d0, d1, x, inc_x2); } if (n & 1) { LD_GP2_INC(px, 1, f0, f1); tp0 = da_r * f0; tp0 -= da_i * f1; tp1 = da_r * f1; tp1 += da_i * f0; 
ST_GP2_INC(tp0, tp1, x, 1); } } } } return (0); } OpenBLAS-0.2.20/kernel/mips/zswap.c000066400000000000000000000042731313527062700167120ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; BLASLONG ix=0,iy=0; FLOAT temp[2]; BLASLONG inc_x2; BLASLONG inc_y2; if ( n < 0 ) return(0); inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; while(i < n) { temp[0] = x[ix] ; temp[1] = x[ix+1] ; x[ix] = y[iy] ; x[ix+1] = y[iy+1] ; y[iy] = temp[0] ; y[iy+1] = temp[1] ; ix += inc_x2 ; iy += inc_y2 ; i++ ; } return(0); } OpenBLAS-0.2.20/kernel/mips/zswap_msa.c000066400000000000000000000215671313527062700175570ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include "common.h" #include "macros_msa.h" int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i, inc_x2, inc_y2, pref_offsetx, pref_offsety; FLOAT *px, *py; v2f64 x0, x1, x2, x3, x4, x5, x6, x7; v2f64 y0, y1, y2, y3, y4, y5, y6, y7; if (n < 0) return (0); pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1); if (pref_offsetx > 0) { pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; pref_offsetx = pref_offsetx / sizeof(FLOAT); } pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1); if (pref_offsety > 0) { pref_offsety = L1_DATA_LINESIZE - pref_offsety; pref_offsety = pref_offsety / sizeof(FLOAT); } inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; px = srcx; py = srcy; if ((1 == inc_x) && (1 == inc_y)) { if (n >> 3) { LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); for (i = (n >> 3) - 1; i--;) { PREFETCH(px + pref_offsetx + 16); PREFETCH(px + pref_offsetx + 20); PREFETCH(px + pref_offsetx + 24); PREFETCH(px + pref_offsetx + 28); PREFETCH(py + pref_offsety + 16); PREFETCH(py + pref_offsety + 20); PREFETCH(py + pref_offsety + 24); PREFETCH(py + pref_offsety + 28); y0 = LD_DP(py); py += 2; ST_DP(x0, srcy); srcy += 2; y1 = LD_DP(py); py += 2; ST_DP(x1, srcy); srcy += 2; y2 = LD_DP(py); py += 2; ST_DP(x2, srcy); srcy += 2; y3 = LD_DP(py); py += 2; ST_DP(x3, srcy); srcy += 2; y4 = LD_DP(py); py += 2; ST_DP(x4, srcy); srcy += 2; y5 = LD_DP(py); py += 2; ST_DP(x5, srcy); srcy += 2; y6 = LD_DP(py); py += 2; ST_DP(x6, srcy); srcy += 2; y7 = LD_DP(py); py += 2; ST_DP(x7, srcy); srcy += 2; x0 = LD_DP(px); px += 2; ST_DP(y0, srcx); srcx += 2; x1 = LD_DP(px); px += 2; ST_DP(y1, srcx); srcx += 2; x2 = LD_DP(px); px += 2; ST_DP(y2, srcx); srcx += 2; x3 = LD_DP(px); px += 2; ST_DP(y3, srcx); srcx += 2; x4 = LD_DP(px); px += 2; ST_DP(y4, srcx); srcx += 2; x5 = LD_DP(px); px += 2; ST_DP(y5, srcx); srcx += 2; x6 = LD_DP(px); px += 2; ST_DP(y6, srcx); srcx += 2; x7 = LD_DP(px); px += 2; ST_DP(y7, srcx); srcx += 2; } LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7); ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, 2); ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, 2); } if (n & 7) { if ((n & 4) && (n & 2) && (n & 1)) { LD_DP7_INC(px, 2, x0, x1, x2, x3, x4, x5, x6); LD_DP7_INC(py, 2, y0, y1, y2, y3, y4, y5, y6); ST_DP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, 2); ST_DP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, 2); } else if ((n & 4) && (n & 2)) { LD_DP6_INC(px, 2, x0, x1, x2, x3, x4, x5); LD_DP6_INC(py, 2, y0, y1, y2, y3, y4, y5); ST_DP6_INC(x0, x1, x2, x3, x4, x5, srcy, 2); ST_DP6_INC(y0, y1, y2, y3, y4, y5, srcx, 2); } else if ((n & 4) && (n & 1)) { LD_DP5_INC(px, 2, x0, x1, x2, x3, x4); LD_DP5_INC(py, 2, y0, y1, y2, y3, y4); ST_DP5_INC(x0, x1, x2, x3, x4, srcy, 2); ST_DP5_INC(y0, y1, y2, y3, y4, srcx, 2); } else if ((n & 2) && (n & 1)) { LD_DP3_INC(px, 2, x0, x1, x2); LD_DP3_INC(py, 2, y0, y1, y2); 
ST_DP3_INC(x0, x1, x2, srcy, 2); ST_DP3_INC(y0, y1, y2, srcx, 2); } else if (n & 4) { LD_DP4_INC(px, 2, x0, x1, x2, x3); LD_DP4_INC(py, 2, y0, y1, y2, y3); ST_DP4_INC(x0, x1, x2, x3, srcy, 2); ST_DP4_INC(y0, y1, y2, y3, srcx, 2); } else if (n & 2) { LD_DP2_INC(px, 2, x0, x1); LD_DP2_INC(py, 2, y0, y1); ST_DP2_INC(x0, x1, srcy, 2); ST_DP2_INC(y0, y1, srcx, 2); } else if (n & 1) { x0 = LD_DP(px); y0 = LD_DP(py); ST_DP(y0, srcx); ST_DP(x0, srcy); } } } else { for (i = (n >> 3); i--;) { LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7); ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y2); ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x2); } if (n & 7) { if ((n & 4) && (n & 2) && (n & 1)) { LD_DP7_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6); LD_DP7_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6); ST_DP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y2); ST_DP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x2); } else if ((n & 4) && (n & 2)) { LD_DP6_INC(px, inc_x2, x0, x1, x2, x3, x4, x5); LD_DP6_INC(py, inc_y2, y0, y1, y2, y3, y4, y5); ST_DP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y2); ST_DP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x2); } else if ((n & 4) && (n & 1)) { LD_DP5_INC(px, inc_x2, x0, x1, x2, x3, x4); LD_DP5_INC(py, inc_y2, y0, y1, y2, y3, y4); ST_DP5_INC(x0, x1, x2, x3, x4, srcy, inc_y2); ST_DP5_INC(y0, y1, y2, y3, y4, srcx, inc_x2); } else if ((n & 2) && (n & 1)) { LD_DP3_INC(px, inc_x2, x0, x1, x2); LD_DP3_INC(py, inc_y2, y0, y1, y2); ST_DP3_INC(x0, x1, x2, srcy, inc_y2); ST_DP3_INC(y0, y1, y2, srcx, inc_x2); } else if (n & 4) { LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); ST_DP4_INC(x0, x1, x2, x3, srcy, inc_y2); ST_DP4_INC(y0, y1, y2, y3, srcx, inc_x2); } else if (n & 2) { LD_DP2_INC(px, inc_x2, x0, x1); LD_DP2_INC(py, inc_y2, y0, y1); ST_DP2_INC(x0, x1, srcy, inc_y2); ST_DP2_INC(y0, y1, srcx, inc_x2); } else if (n & 1) { x0 = LD_DP(px); y0 = LD_DP(py); ST_DP(y0, srcx); ST_DP(x0, srcy); } } } return (0); } OpenBLAS-0.2.20/kernel/mips64/000077500000000000000000000000001313527062700155465ustar00rootroot00000000000000OpenBLAS-0.2.20/kernel/mips64/KERNEL000066400000000000000000000062501313527062700164540ustar00rootroot00000000000000ifndef SNRM2KERNEL SNRM2KERNEL = snrm2.S endif ifndef DNRM2KERNEL DNRM2KERNEL = dnrm2.S endif ifndef CNRM2KERNEL CNRM2KERNEL = cnrm2.S endif ifndef ZNRM2KERNEL ZNRM2KERNEL = znrm2.S endif ifndef SCABS_KERNEL SCABS_KERNEL = ../generic/cabs.c endif ifndef DCABS_KERNEL DCABS_KERNEL = ../generic/cabs.c endif ifndef QCABS_KERNEL QCABS_KERNEL = ../generic/cabs.c endif ifndef LSAME_KERNEL LSAME_KERNEL = ../generic/lsame.c endif ifndef SGEMMKERNEL SGEMMKERNEL = gemm_kernel.S SGEMMINCOPY = ../generic/gemm_ncopy_2.c SGEMMITCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = ../generic/gemm_tcopy_8.c SGEMMINCOPYOBJ = sgemm_incopy.o SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o endif ifndef DGEMMKERNEL DGEMMKERNEL = gemm_kernel.S DGEMMINCOPY = ../generic/gemm_ncopy_2.c DGEMMITCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPY = ../generic/gemm_ncopy_8.c DGEMMOTCOPY = ../generic/gemm_tcopy_8.c DGEMMINCOPYOBJ = dgemm_incopy.o DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o endif ifndef CGEMMKERNEL CGEMMKERNEL = zgemm_kernel.S CGEMMINCOPY = ../generic/zgemm_ncopy_1.c CGEMMITCOPY = ../generic/zgemm_tcopy_1.c CGEMMONCOPY = ../generic/zgemm_ncopy_4.c 
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMINCOPYOBJ = cgemm_incopy.o CGEMMITCOPYOBJ = cgemm_itcopy.o CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o endif ifndef ZGEMMKERNEL ZGEMMKERNEL = zgemm_kernel.S ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c ZGEMMINCOPYOBJ = zgemm_incopy.o ZGEMMITCOPYOBJ = zgemm_itcopy.o ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o endif ifndef SGEMM_BETA SGEMM_BETA = ../generic/gemm_beta.c endif ifndef DGEMM_BETA DGEMM_BETA = ../generic/gemm_beta.c endif ifndef CGEMM_BETA CGEMM_BETA = ../generic/zgemm_beta.c endif ifndef ZGEMM_BETA ZGEMM_BETA = ../generic/zgemm_beta.c endif ifndef STRSMKERNEL_LN STRSMKERNEL_LN = trsm_kernel_LN.S endif ifndef STRSMKERNEL_LT STRSMKERNEL_LT = trsm_kernel_LT.S endif ifndef STRSMKERNEL_RN STRSMKERNEL_RN = trsm_kernel_LT.S endif ifndef STRSMKERNEL_RT STRSMKERNEL_RT = trsm_kernel_RT.S endif ifndef DTRSMKERNEL_LN DTRSMKERNEL_LN = trsm_kernel_LN.S endif ifndef DTRSMKERNEL_LT DTRSMKERNEL_LT = trsm_kernel_LT.S endif ifndef DTRSMKERNEL_RN DTRSMKERNEL_RN = trsm_kernel_LT.S endif ifndef DTRSMKERNEL_RT DTRSMKERNEL_RT = trsm_kernel_RT.S endif ifndef CTRSMKERNEL_LN CTRSMKERNEL_LN = ztrsm_kernel_LT.S endif ifndef CTRSMKERNEL_LT CTRSMKERNEL_LT = ztrsm_kernel_LT.S endif ifndef CTRSMKERNEL_RN CTRSMKERNEL_RN = ztrsm_kernel_LT.S endif ifndef CTRSMKERNEL_RT CTRSMKERNEL_RT = ztrsm_kernel_RT.S endif ifndef ZTRSMKERNEL_LN ZTRSMKERNEL_LN = ztrsm_kernel_LT.S endif ifndef ZTRSMKERNEL_LT ZTRSMKERNEL_LT = ztrsm_kernel_LT.S endif ifndef ZTRSMKERNEL_RN ZTRSMKERNEL_RN = ztrsm_kernel_LT.S endif ifndef ZTRSMKERNEL_RT ZTRSMKERNEL_RT = ztrsm_kernel_RT.S endif CGEMM3MKERNEL = zgemm3m_kernel.S ZGEMM3MKERNEL = zgemm3m_kernel.S OpenBLAS-0.2.20/kernel/mips64/KERNEL.I6400000066400000000000000000000000521313527062700171470ustar00rootroot00000000000000include $(KERNELDIR)/../mips/KERNEL.P5600 OpenBLAS-0.2.20/kernel/mips64/KERNEL.LOONGSON3A000066400000000000000000000042231313527062700200730ustar00rootroot00000000000000SAXPYKERNEL=axpy_loongson3a.S DAXPYKERNEL=daxpy_loongson3a_simd.S SGEMVNKERNEL = gemv_n_loongson3a.c SGEMVTKERNEL = gemv_t_loongson3a.c DGEMVNKERNEL = gemv_n_loongson3a.c DGEMVTKERNEL = gemv_t_loongson3a.c CGEMVNKERNEL = zgemv_n_loongson3a.c CGEMVTKERNEL = zgemv_t_loongson3a.c ZGEMVNKERNEL = zgemv_n_loongson3a.c ZGEMVTKERNEL = zgemv_t_loongson3a.c SGEMMKERNEL = sgemm_kernel_8x4_ps.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = sgemm_incopy.o SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = cgemm_incopy.o CGEMMITCOPYOBJ = cgemm_itcopy.o CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o STRSMKERNEL_LN = 
../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c OpenBLAS-0.2.20/kernel/mips64/KERNEL.LOONGSON3B000066400000000000000000000040061313527062700200730ustar00rootroot00000000000000SAXPYKERNEL=axpy_loongson3a.S DAXPYKERNEL=daxpy_loongson3a_simd.S SGEMVNKERNEL = gemv_n_loongson3a.c SGEMVTKERNEL = gemv_t_loongson3a.c DGEMVNKERNEL = gemv_n_loongson3a.c DGEMVTKERNEL = gemv_t_loongson3a.c CGEMVNKERNEL = zgemv_n_loongson3a.c CGEMVTKERNEL = zgemv_t_loongson3a.c ZGEMVNKERNEL = zgemv_n_loongson3a.c ZGEMVTKERNEL = zgemv_t_loongson3a.c STRMMKERNEL = ../generic/trmmkernel_2x2.c DTRMMKERNEL = ../generic/trmmkernel_2x2.c CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c SGEMMKERNEL = ../generic/gemmkernel_2x2.c SGEMMONCOPY = ../generic/gemm_ncopy_2.c SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = ../generic/gemmkernel_2x2.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = ../generic/zgemmkernel_2x2.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c OpenBLAS-0.2.20/kernel/mips64/KERNEL.P6600000066400000000000000000000000521313527062700171600ustar00rootroot00000000000000include $(KERNELDIR)/../mips/KERNEL.P5600 OpenBLAS-0.2.20/kernel/mips64/Makefile000066400000000000000000000000121313527062700171770ustar00rootroot00000000000000clean :: OpenBLAS-0.2.20/kernel/mips64/amax.S000066400000000000000000000123351313527062700166240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define I $2 #define TEMP $3 #define a1 $f4 #define a2 $f5 #define a3 $f6 #define a4 $f7 #define a5 $f8 #define a6 $f9 #define a7 $f10 #define a8 $f11 #define t1 $f12 #define t2 $f13 #define t3 $f14 #define t4 $f15 #define s1 $f0 #define s2 $f1 #define s3 $f2 #define s4 $f3 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif blez N, .L999 MTC $0, s1 blez INCX, .L999 dsll INCX, INCX, BASE_SHIFT LD a1, 0 * SIZE(X) daddiu N, N, -1 daddu X, X, INCX FABS s1, a1 blez N, .L999 FABS s2, a1 FABS s3, a1 dsra I, N, 3 blez I, .L15 FABS s4, a1 LD a1, 0 * SIZE(X) daddu X, X, INCX LD a2, 0 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) daddu X, X, INCX LD a4, 0 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) daddu X, X, INCX LD a6, 0 * SIZE(X) daddu X, X, INCX LD a7, 0 * SIZE(X) daddu X, X, INCX LD a8, 0 * SIZE(X) daddiu I, I, -1 blez I, .L13 daddu X, X, INCX .align 3 .L12: FABS t1, a1 LD a1, 0 * SIZE(X) FABS t2, a2 daddu X, X, INCX FABS t3, a3 LD a2, 0 * SIZE(X) FABS t4, a4 daddu X, X, INCX CMPLT $fcc0, s1, t1 LD a3, 0 * SIZE(X) CMPLT $fcc1, s2, t2 daddu X, X, INCX CMPLT $fcc2, s3, t3 LD a4, 0 * SIZE(X) CMPLT $fcc3, s4, t4 daddu X, X, INCX CMOVT s1, t1, $fcc0 CMOVT s2, t2, $fcc1 CMOVT s3, t3, $fcc2 CMOVT s4, t4, $fcc3 FABS t1, a5 LD a5, 0 * SIZE(X) FABS t2, a6 daddu X, X, INCX FABS t3, a7 LD a6, 0 * SIZE(X) FABS t4, a8 daddu X, X, INCX CMPLT $fcc0, s1, t1 LD a7, 0 * SIZE(X) CMPLT $fcc1, s2, t2 daddu X, X, INCX CMPLT $fcc2, s3, t3 LD a8, 0 * SIZE(X) CMPLT $fcc3, s4, t4 daddu X, X, INCX CMOVT s1, t1, $fcc0 daddiu I, I, -1 CMOVT s2, t2, $fcc1 CMOVT s3, t3, $fcc2 bgtz I, .L12 CMOVT s4, t4, $fcc3 .align 3 .L13: FABS t1, a1 FABS t2, a2 FABS t3, a3 FABS t4, a4 CMPLT $fcc0, s1, t1 CMPLT $fcc1, s2, t2 CMPLT 
$fcc2, s3, t3 CMPLT $fcc3, s4, t4 CMOVT s1, t1, $fcc0 CMOVT s2, t2, $fcc1 CMOVT s3, t3, $fcc2 CMOVT s4, t4, $fcc3 FABS t1, a5 FABS t2, a6 FABS t3, a7 FABS t4, a8 CMPLT $fcc0, s1, t1 CMPLT $fcc1, s2, t2 CMPLT $fcc2, s3, t3 CMPLT $fcc3, s4, t4 CMOVT s1, t1, $fcc0 CMOVT s2, t2, $fcc1 CMOVT s3, t3, $fcc2 CMOVT s4, t4, $fcc3 .align 3 .L15: andi I, N, 7 blez I, .L998 NOP .align 3 .L16: LD a1, 0 * SIZE(X) daddiu I, I, -1 FABS t1, a1 CMPLT $fcc0, s1, t1 CMOVT s1, t1, $fcc0 bgtz I, .L16 daddu X, X, INCX .align 3 .L998: CMPLT $fcc0, s1, s2 CMPLT $fcc1, s3, s4 CMOVT s1, s2, $fcc0 CMOVT s3, s4, $fcc1 CMPLT $fcc0, s1, s3 CMOVT s1, s3, $fcc0 .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/amin.S000066400000000000000000000123351313527062700166220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define I $2 #define TEMP $3 #define a1 $f4 #define a2 $f5 #define a3 $f6 #define a4 $f7 #define a5 $f8 #define a6 $f9 #define a7 $f10 #define a8 $f11 #define t1 $f12 #define t2 $f13 #define t3 $f14 #define t4 $f15 #define s1 $f0 #define s2 $f1 #define s3 $f2 #define s4 $f3 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif blez N, .L999 MTC $0, s1 blez INCX, .L999 dsll INCX, INCX, BASE_SHIFT LD a1, 0 * SIZE(X) daddiu N, N, -1 daddu X, X, INCX FABS s1, a1 blez N, .L999 FABS s2, a1 FABS s3, a1 dsra I, N, 3 blez I, .L15 FABS s4, a1 LD a1, 0 * SIZE(X) daddu X, X, INCX LD a2, 0 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) daddu X, X, INCX LD a4, 0 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) daddu X, X, INCX LD a6, 0 * SIZE(X) daddu X, X, INCX LD a7, 0 * SIZE(X) daddu X, X, INCX LD a8, 0 * SIZE(X) daddiu I, I, -1 blez I, .L13 daddu X, X, INCX .align 3 .L12: FABS t1, a1 LD a1, 0 * SIZE(X) FABS t2, a2 daddu X, X, INCX FABS t3, a3 LD a2, 0 * SIZE(X) FABS t4, a4 daddu X, X, INCX CMPLT $fcc0, t1, s1 LD a3, 0 * SIZE(X) CMPLT $fcc1, t2, s2 daddu X, X, INCX CMPLT $fcc2, t3, s3 LD a4, 0 * SIZE(X) CMPLT $fcc3, t4, s4 daddu X, X, INCX CMOVT s1, t1, $fcc0 CMOVT s2, t2, $fcc1 CMOVT s3, t3, $fcc2 CMOVT s4, t4, $fcc3 FABS t1, a5 LD a5, 0 * SIZE(X) FABS t2, a6 daddu X, X, INCX FABS t3, a7 LD a6, 0 * SIZE(X) FABS t4, a8 daddu X, X, INCX CMPLT $fcc0, t1, s1 LD a7, 0 * SIZE(X) CMPLT $fcc1, t2, s2 daddu X, X, INCX CMPLT $fcc2, t3, s3 LD a8, 0 * SIZE(X) CMPLT $fcc3, t4, s4 daddu X, X, INCX CMOVT s1, t1, $fcc0 daddiu I, I, -1 CMOVT s2, t2, $fcc1 CMOVT s3, t3, $fcc2 bgtz I, .L12 CMOVT s4, t4, $fcc3 .align 3 .L13: FABS t1, a1 FABS t2, a2 FABS t3, a3 FABS t4, a4 CMPLT $fcc0, t1, s1 CMPLT $fcc1, t2, s2 CMPLT $fcc2, t3, s3 CMPLT $fcc3, t4, s4 CMOVT s1, t1, $fcc0 CMOVT s2, t2, $fcc1 CMOVT s3, t3, $fcc2 CMOVT s4, t4, $fcc3 FABS t1, a5 FABS t2, a6 FABS t3, a7 FABS t4, a8 CMPLT $fcc0, t1, s1 CMPLT $fcc1, t2, s2 CMPLT $fcc2, t3, s3 CMPLT $fcc3, t4, s4 CMOVT s1, t1, $fcc0 CMOVT s2, t2, $fcc1 CMOVT s3, t3, $fcc2 CMOVT s4, t4, $fcc3 .align 3 .L15: andi I, N, 7 blez I, .L998 NOP .align 3 .L16: LD a1, 0 * SIZE(X) daddiu I, I, -1 FABS t1, a1 CMPLT $fcc0, t1, s1 CMOVT s1, t1, $fcc0 bgtz I, .L16 daddu X, X, INCX .align 3 .L998: CMPLT $fcc0, s2, s1 CMPLT $fcc1, s4, s3 CMOVT s1, s2, $fcc0 CMOVT s3, s4, $fcc1 CMPLT $fcc0, s3, s1 CMOVT s1, s3, $fcc0 .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/asum.S000066400000000000000000000133631313527062700166450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define I $2 #define TEMP $3 #define a1 $f2 #define a2 $f3 #define a3 $f4 #define a4 $f5 #define a5 $f6 #define a6 $f7 #define a7 $f8 #define a8 $f9 #define t1 $f10 #define t2 $f11 #define t3 $f12 #define t4 $f13 #define s1 $f0 #define s2 $f1 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif MTC $0, s1 MTC $0, s2 dsll INCX, INCX, BASE_SHIFT blez N, .L999 li TEMP, SIZE bne INCX, TEMP, .L20 dsra I, N, 3 blez I, .L15 NOP LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) LD a3, 2 * SIZE(X) LD a4, 3 * SIZE(X) LD a5, 4 * SIZE(X) FABS t1, a1 LD a6, 5 * SIZE(X) FABS t2, a2 LD a7, 6 * SIZE(X) FABS t3, a3 FABS t4, a4 daddiu I, I, -1 blez I, .L13 LD a8, 7 * SIZE(X) .align 3 .L12: ADD s1, s1, t1 LD a1, 8 * SIZE(X) FABS t1, a5 daddiu I, I, -1 ADD s2, s2, t2 LD a2, 9 * SIZE(X) FABS t2, a6 NOP ADD s1, s1, t3 LD a3, 10 * SIZE(X) FABS t3, a7 NOP ADD s2, s2, t4 LD a4, 11 * SIZE(X) FABS t4, a8 daddiu X, X, 8 * SIZE ADD s1, s1, t1 LD a5, 4 * SIZE(X) FABS t1, a1 NOP ADD s2, s2, t2 LD a6, 5 * SIZE(X) FABS t2, a2 NOP ADD s1, s1, t3 LD a7, 6 * SIZE(X) FABS t3, a3 NOP ADD s2, s2, t4 LD a8, 7 * SIZE(X) bgtz I, .L12 FABS t4, a4 .align 3 .L13: ADD s1, s1, t1 daddiu X, X, 8 * SIZE FABS t1, a5 NOP ADD s2, s2, t2 FABS t2, a6 ADD s1, s1, t3 FABS t3, a7 ADD s2, s2, t4 FABS t4, a8 ADD s1, s1, t1 ADD s2, s2, t2 ADD s1, s1, t3 ADD s2, s2, t4 .align 3 .L15: andi I, N, 7 blez I, .L999 NOP .align 3 .L16: LD a1, 0 * SIZE(X) daddiu I, I, -1 FABS t1, a1 ADD s1, s1, t1 bgtz I, .L16 daddiu X, X, SIZE j .L999 NOP .align 3 .L20: blez I, .L25 NOP LD a1, 0 * SIZE(X) daddu X, X, INCX LD a2, 0 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) daddu X, X, INCX LD a4, 0 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) daddu X, X, INCX LD a6, 0 * SIZE(X) daddu X, X, INCX FABS t1, a1 LD a7, 0 * SIZE(X) FABS t2, a2 daddu X, X, INCX FABS t3, a3 LD a8, 0 * SIZE(X) FABS t4, a4 daddiu I, I, -1 blez I, .L24 daddu X, X, INCX .align 3 .L23: ADD s1, s1, t1 LD a1, 0 * SIZE(X) FABS t1, a5 daddu X, X, INCX ADD s2, s2, t2 LD a2, 0 * SIZE(X) FABS t2, a6 daddu X, X, INCX ADD s1, s1, t3 LD a3, 0 * SIZE(X) FABS t3, a7 daddu X, X, INCX ADD s2, s2, t4 LD a4, 0 * SIZE(X) FABS t4, a8 daddu X, X, INCX ADD s1, s1, t1 LD a5, 0 * SIZE(X) FABS t1, a1 daddu X, X, INCX ADD s2, s2, t2 LD a6, 0 * SIZE(X) FABS t2, a2 daddu X, X, 
INCX ADD s1, s1, t3 LD a7, 0 * SIZE(X) FABS t3, a3 daddu X, X, INCX ADD s2, s2, t4 LD a8, 0 * SIZE(X) FABS t4, a4 daddiu I, I, -1 bgtz I, .L23 daddu X, X, INCX .align 3 .L24: ADD s1, s1, t1 FABS t1, a5 ADD s2, s2, t2 FABS t2, a6 ADD s1, s1, t3 FABS t3, a7 ADD s2, s2, t4 FABS t4, a8 ADD s1, s1, t1 ADD s2, s2, t2 ADD s1, s1, t3 ADD s2, s2, t4 .align 3 .L25: andi I, N, 7 blez I, .L999 NOP .align 3 .L26: LD a1, 0 * SIZE(X) daddiu I, I, -1 FABS t1, a1 daddu X, X, INCX bgtz I, .L26 ADD s1, s1, t1 .align 3 .L999: j $31 ADD s1, s1, s2 EPILOGUE OpenBLAS-0.2.20/kernel/mips64/axpy.S000066400000000000000000000202641313527062700166570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $8 #define INCX $9 #define Y $10 #define INCY $11 #define I $2 #define TEMP $3 #define YY $5 #define ALPHA $f15 #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define a5 $f4 #define a6 $f5 #define a7 $f6 #define a8 $f7 #define b1 $f8 #define b2 $f9 #define b3 $f10 #define b4 $f11 #define b5 $f12 #define b6 $f13 #define b7 $f14 #define b8 $f17 #define t1 $f18 #define t2 $f19 #define t3 $f20 #define t4 $f21 PROLOGUE #ifndef __64BIT__ daddiu $sp, $sp, -16 sdc1 $f20, 0($sp) sdc1 $f21, 8($sp) #endif li TEMP, SIZE blez N, .L999 dsll INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 dsll INCY, INCY, BASE_SHIFT bne INCY, TEMP, .L20 dsra I, N, 3 blez I, .L15 daddiu I, I, -1 LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) LD a2, 1 * SIZE(X) LD b2, 1 * SIZE(Y) LD a3, 2 * SIZE(X) LD b3, 2 * SIZE(Y) LD a4, 3 * SIZE(X) LD b4, 3 * SIZE(Y) LD a5, 4 * SIZE(X) LD b5, 4 * SIZE(Y) LD a6, 5 * SIZE(X) LD b6, 5 * SIZE(Y) LD a7, 6 * SIZE(X) LD b7, 6 * SIZE(Y) LD a8, 7 * SIZE(X) LD b8, 7 * SIZE(Y) blez I, .L13 NOP .align 3 .L12: MADD t1, b1, ALPHA, a1 LD a1, 8 * SIZE(X) LD b1, 8 * SIZE(Y) MADD t2, b2, ALPHA, a2 LD a2, 9 * SIZE(X) LD b2, 9 * SIZE(Y) MADD t3, b3, ALPHA, a3 LD a3, 10 * SIZE(X) LD b3, 10 * SIZE(Y) MADD t4, b4, ALPHA, a4 LD a4, 11 * SIZE(X) LD b4, 11 * SIZE(Y) ST t1, 0 * SIZE(Y) ST t2, 1 * SIZE(Y) ST t3, 2 * SIZE(Y) ST t4, 3 * SIZE(Y) MADD t1, b5, ALPHA, a5 LD a5, 12 * SIZE(X) LD b5, 12 * SIZE(Y) MADD t2, b6, ALPHA, a6 LD a6, 13 * SIZE(X) LD b6, 13 * SIZE(Y) MADD t3, b7, ALPHA, a7 LD a7, 14 * SIZE(X) LD b7, 14 * SIZE(Y) MADD t4, b8, ALPHA, a8 LD a8, 15 * SIZE(X) LD b8, 15 * SIZE(Y) ST t1, 4 * SIZE(Y) ST t2, 5 * SIZE(Y) ST t3, 6 * SIZE(Y) ST t4, 7 * SIZE(Y) daddiu I, I, -1 daddiu Y, Y, 8 * SIZE bgtz I, .L12 daddiu X, X, 8 * SIZE .align 3 .L13: MADD t1, b1, ALPHA, a1 MADD t2, b2, ALPHA, a2 MADD t3, b3, ALPHA, a3 MADD t4, b4, ALPHA, a4 ST t1, 0 * SIZE(Y) MADD t1, b5, ALPHA, a5 ST t2, 1 * SIZE(Y) MADD t2, b6, ALPHA, a6 ST t3, 2 * SIZE(Y) MADD t3, b7, ALPHA, a7 ST t4, 3 * SIZE(Y) MADD t4, b8, ALPHA, a8 ST t1, 4 * SIZE(Y) ST t2, 5 * SIZE(Y) ST t3, 6 * SIZE(Y) ST t4, 7 * SIZE(Y) daddiu X, X, 8 * SIZE daddiu Y, Y, 8 * SIZE .align 3 .L15: andi I, N, 7 blez I, .L999 NOP .align 3 .L16: LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) daddiu X, X, SIZE daddiu Y, Y, SIZE MADD t1, b1, ALPHA, a1 daddiu I, I, -1 bgtz I, .L16 ST t1, -1 * SIZE(Y) #ifndef __64BIT__ ldc1 $f20, 0($sp) ldc1 $f21, 8($sp) daddiu $sp, $sp, 16 #endif j $31 NOP .align 3 .L20: beqz INCY, .L27 dsra I, N, 3 move YY, Y blez I, .L25 daddiu I, I, -1 LD a1, 0 * SIZE(X) daddu X, X, INCX LD b1, 0 * SIZE(Y) daddu Y, Y, INCY LD a2, 0 * SIZE(X) daddu X, X, INCX LD b2, 0 * SIZE(Y) daddu Y, Y, INCY LD a3, 0 * SIZE(X) daddu X, X, INCX LD b3, 0 * SIZE(Y) daddu Y, Y, INCY LD a4, 0 * SIZE(X) daddu X, X, INCX LD b4, 0 * SIZE(Y) daddu Y, Y, INCY LD a5, 0 * SIZE(X) daddu X, X, INCX LD b5, 0 * SIZE(Y) daddu Y, Y, INCY LD a6, 0 * SIZE(X) daddu X, X, INCX LD b6, 0 * SIZE(Y) daddu Y, Y, INCY LD a7, 0 * SIZE(X) daddu X, X, INCX LD b7, 0 * SIZE(Y) daddu Y, Y, INCY LD a8, 0 * SIZE(X) daddu X, X, INCX LD b8, 0 * SIZE(Y) daddu Y, Y, INCY blez I, .L23 NOP .align 3 .L22: MADD t1, b1, ALPHA, a1 LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY MADD t2, b2, ALPHA, a2 LD a2, 0 * SIZE(X) LD b2, 0 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY MADD t3, b3, ALPHA, a3 LD a3, 0 * SIZE(X) LD b3, 0 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY MADD 
t4, b4, ALPHA, a4 LD a4, 0 * SIZE(X) LD b4, 0 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY ST t1, 0 * SIZE(YY) daddu YY, YY, INCY MADD t1, b5, ALPHA, a5 LD a5, 0 * SIZE(X) LD b5, 0 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY ST t2, 0 * SIZE(YY) daddu YY, YY, INCY MADD t2, b6, ALPHA, a6 LD a6, 0 * SIZE(X) LD b6, 0 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY ST t3, 0 * SIZE(YY) daddu YY, YY, INCY MADD t3, b7, ALPHA, a7 LD a7, 0 * SIZE(X) LD b7, 0 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY ST t4, 0 * SIZE(YY) daddu YY, YY, INCY MADD t4, b8, ALPHA, a8 LD a8, 0 * SIZE(X) daddu X, X, INCX LD b8, 0 * SIZE(Y) daddu Y, Y, INCY ST t1, 0 * SIZE(YY) daddu YY, YY, INCY ST t2, 0 * SIZE(YY) daddu YY, YY, INCY ST t3, 0 * SIZE(YY) daddu YY, YY, INCY ST t4, 0 * SIZE(YY) daddiu I, I, -1 bgtz I, .L22 daddu YY, YY, INCY .align 3 .L23: MADD t1, b1, ALPHA, a1 MADD t2, b2, ALPHA, a2 MADD t3, b3, ALPHA, a3 MADD t4, b4, ALPHA, a4 ST t1, 0 * SIZE(YY) daddu YY, YY, INCY MADD t1, b5, ALPHA, a5 ST t2, 0 * SIZE(YY) daddu YY, YY, INCY MADD t2, b6, ALPHA, a6 ST t3, 0 * SIZE(YY) daddu YY, YY, INCY MADD t3, b7, ALPHA, a7 ST t4, 0 * SIZE(YY) daddu YY, YY, INCY MADD t4, b8, ALPHA, a8 ST t1, 0 * SIZE(YY) daddu YY, YY, INCY ST t2, 0 * SIZE(YY) daddu YY, YY, INCY ST t3, 0 * SIZE(YY) daddu YY, YY, INCY ST t4, 0 * SIZE(YY) daddu YY, YY, INCY .align 3 .L25: andi I, N, 7 blez I, .L999 NOP .align 3 .L26: LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) MADD t1, b1, ALPHA, a1 daddu X, X, INCX ST t1, 0 * SIZE(Y) daddiu I, I, -1 bgtz I, .L26 daddu Y, Y, INCY .align 3 .L999: #ifndef __64BIT__ ldc1 $f20, 0($sp) ldc1 $f21, 8($sp) daddiu $sp, $sp, 16 #endif j $31 NOP .align 3 .L27: LD b1, 0 * SIZE(Y) .L28: daddiu N, N, -1 LD a1, 0 * SIZE(X) daddu X, X, INCX bgtz N, .L28 MADD b1, b1, ALPHA, a1 j .L999 ST b1, 0 * SIZE(Y) EPILOGUE OpenBLAS-0.2.20/kernel/mips64/axpy_loongson3a.S000066400000000000000000000235551313527062700210270ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCH_DISTANCE 48 #define N $4 #define X $8 #define INCX $9 #define Y $10 #define INCY $11 #define I $2 #define TEMP $3 #define YY $5 #define ALPHA $f15 #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define a5 $f4 #define a6 $f5 #define a7 $f6 #define a8 $f7 #define b1 $f8 #define b2 $f9 #define b3 $f10 #define b4 $f11 #define b5 $f12 #define b6 $f13 #define b7 $f14 #define b8 $f17 #define t1 $f18 #define t2 $f19 #define t3 $f20 #define t4 $f21 PROLOGUE #ifndef __64BIT__ daddiu $sp, $sp, -16 sdc1 $f20, 0($sp) sdc1 $f21, 8($sp) #endif li TEMP, SIZE blez N, .L999 dsll INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 dsll INCY, INCY, BASE_SHIFT bne INCY, TEMP, .L20 dsra I, N, 3 blez I, .L15 daddiu I, I, -1 LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) LD a3, 2 * SIZE(X) LD a4, 3 * SIZE(X) LD a5, 4 * SIZE(X) LD a6, 5 * SIZE(X) LD a7, 6 * SIZE(X) LD a8, 7 * SIZE(X) LD b1, 0 * SIZE(Y) LD b2, 1 * SIZE(Y) LD b3, 2 * SIZE(Y) LD b4, 3 * SIZE(Y) LD b5, 4 * SIZE(Y) LD b6, 5 * SIZE(Y) LD b7, 6 * SIZE(Y) LD b8, 7 * SIZE(Y) blez I, .L13 NOP .align 5 .L12: PREFETCHD(PREFETCH_DISTANCE*SIZE(X)) PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X)) MADD t1, b1, ALPHA, a1 MADD t2, b2, ALPHA, a2 LD b1, 8 * SIZE(Y) LD b2, 9 * SIZE(Y) MADD t3, b3, ALPHA, a3 MADD t4, b4, ALPHA, a4 LD b3, 10 * SIZE(Y) LD b4, 11 * SIZE(Y) LD a1, 8 * SIZE(X) LD a2, 9 * SIZE(X) LD a3, 10 * SIZE(X) LD a4, 11 * SIZE(X) ST t1, 0 * SIZE(Y) ST t2, 1 * SIZE(Y) ST t3, 2 * SIZE(Y) ST t4, 3 * SIZE(Y) PREFETCHD(PREFETCH_DISTANCE*SIZE(Y)) PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y)) MADD t1, b5, ALPHA, a5 MADD t2, b6, ALPHA, a6 LD b5, 12 * SIZE(Y) LD b6, 13 * SIZE(Y) MADD t3, b7, ALPHA, a7 MADD t4, b8, ALPHA, a8 LD b7, 14 * SIZE(Y) LD b8, 15 * SIZE(Y) LD a5, 12 * SIZE(X) LD a6, 13 * SIZE(X) LD a7, 14 * SIZE(X) LD a8, 15 * SIZE(X) ST t1, 4 * SIZE(Y) ST t2, 5 * SIZE(Y) ST t3, 6 * SIZE(Y) ST t4, 7 * SIZE(Y) daddiu I, I, -1 daddiu Y, Y, 8 * SIZE bgtz I, .L12 daddiu X, X, 8 * SIZE .align 5 .L13: MADD t1, b1, ALPHA, a1 MADD t2, b2, ALPHA, a2 MADD t3, b3, ALPHA, a3 MADD t4, b4, ALPHA, a4 ST t1, 0 * SIZE(Y) MADD t1, b5, ALPHA, a5 ST t2, 1 * SIZE(Y) MADD t2, b6, ALPHA, a6 ST t3, 2 * SIZE(Y) MADD t3, b7, ALPHA, a7 ST t4, 3 * SIZE(Y) MADD t4, b8, ALPHA, a8 ST t1, 4 * SIZE(Y) ST t2, 5 * SIZE(Y) ST t3, 6 * SIZE(Y) ST t4, 7 * SIZE(Y) daddiu X, X, 8 * SIZE daddiu Y, Y, 8 * SIZE .align 5 .L15: andi I, N, 7 blez I, .L999 NOP .align 3 .L16: LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) daddiu X, X, SIZE daddiu Y, Y, SIZE MADD t1, b1, ALPHA, a1 daddiu I, I, -1 bgtz I, .L16 ST t1, -1 * SIZE(Y) #ifndef __64BIT__ ldc1 $f20, 0($sp) ldc1 $f21, 8($sp) daddiu $sp, $sp, 16 #endif j $31 NOP .align 5 .L20: dsra I, N, 3 move YY, Y blez I, .L25 daddiu I, I, -1 LD a1, 0 * SIZE(X) daddu X, X, INCX LD b1, 0 * SIZE(Y) daddu Y, Y, INCY LD a2, 0 * SIZE(X) daddu X, X, INCX LD b2, 0 * SIZE(Y) daddu Y, Y, INCY LD a3, 0 * SIZE(X) daddu X, X, INCX LD b3, 0 * SIZE(Y) daddu Y, Y, INCY LD a4, 0 * SIZE(X) daddu X, X, INCX LD b4, 0 * SIZE(Y) daddu Y, Y, INCY LD a5, 0 * SIZE(X) daddu X, X, INCX LD b5, 0 * SIZE(Y) daddu Y, Y, INCY LD a6, 0 * SIZE(X) daddu X, X, INCX LD b6, 0 * SIZE(Y) daddu Y, Y, INCY LD a7, 0 * SIZE(X) daddu X, X, INCX LD b7, 0 * SIZE(Y) daddu Y, Y, INCY LD a8, 0 * SIZE(X) daddu X, X, INCX LD b8, 0 * SIZE(Y) daddu Y, Y, INCY blez I, .L23 NOP .align 5 .L22: MADD t1, b1, ALPHA, a1 LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY MADD t2, b2, ALPHA, a2 LD 
a2, 0 * SIZE(X) LD b2, 0 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY MADD t3, b3, ALPHA, a3 LD a3, 0 * SIZE(X) LD b3, 0 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY MADD t4, b4, ALPHA, a4 LD a4, 0 * SIZE(X) LD b4, 0 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY ST t1, 0 * SIZE(YY) daddu YY, YY, INCY MADD t1, b5, ALPHA, a5 LD a5, 0 * SIZE(X) LD b5, 0 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY ST t2, 0 * SIZE(YY) daddu YY, YY, INCY MADD t2, b6, ALPHA, a6 LD a6, 0 * SIZE(X) LD b6, 0 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY ST t3, 0 * SIZE(YY) daddu YY, YY, INCY MADD t3, b7, ALPHA, a7 LD a7, 0 * SIZE(X) LD b7, 0 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY ST t4, 0 * SIZE(YY) daddu YY, YY, INCY MADD t4, b8, ALPHA, a8 LD a8, 0 * SIZE(X) daddu X, X, INCX LD b8, 0 * SIZE(Y) daddu Y, Y, INCY ST t1, 0 * SIZE(YY) daddu YY, YY, INCY ST t2, 0 * SIZE(YY) daddu YY, YY, INCY ST t3, 0 * SIZE(YY) daddu YY, YY, INCY ST t4, 0 * SIZE(YY) daddiu I, I, -1 bgtz I, .L22 daddu YY, YY, INCY .align 5 .L23: MADD t1, b1, ALPHA, a1 MADD t2, b2, ALPHA, a2 MADD t3, b3, ALPHA, a3 MADD t4, b4, ALPHA, a4 ST t1, 0 * SIZE(YY) daddu YY, YY, INCY MADD t1, b5, ALPHA, a5 ST t2, 0 * SIZE(YY) daddu YY, YY, INCY MADD t2, b6, ALPHA, a6 ST t3, 0 * SIZE(YY) daddu YY, YY, INCY MADD t3, b7, ALPHA, a7 ST t4, 0 * SIZE(YY) daddu YY, YY, INCY MADD t4, b8, ALPHA, a8 ST t1, 0 * SIZE(YY) daddu YY, YY, INCY ST t2, 0 * SIZE(YY) daddu YY, YY, INCY ST t3, 0 * SIZE(YY) daddu YY, YY, INCY ST t4, 0 * SIZE(YY) daddu YY, YY, INCY .align 5 .L25: andi I, N, 7 blez I, .L999 NOP .align 3 .L26: LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) MADD t1, b1, ALPHA, a1 daddu X, X, INCX ST t1, 0 * SIZE(Y) daddiu I, I, -1 bgtz I, .L26 daddu Y, Y, INCY .align 5 .L999: #ifndef __64BIT__ ldc1 $f20, 0($sp) ldc1 $f21, 8($sp) daddiu $sp, $sp, 16 #endif j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/cgemm_kernel_loongson3a_2x2.S000066400000000000000000000642401313527062700231650ustar00rootroot00000000000000#define ASSEMBLER #include "common.h" #define FETCH ld #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define STACKSIZE 160 #define M $4 #define N $5 #define K $6 #define A $9 #define B $10 #define C $11 #define LDC $8 #define AO $12 #define BO $13 #define R12 12 #define R13 13 #define I $2 #define J $3 #define L $7 #define CO1 $14 #define CO2 $15 #define PREA $16 #define PREB $17 #if defined(TRMMKERNEL) #define OFFSET $18 #define KK $19 #define TEMP $20 #endif #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define b1 $f4 #define b2 $f5 #define b3 $f6 #define b4 $f7 #define a5 $f8 #define a6 $f9 #define a7 $f10 #define a8 $f11 #define b5 $f12 #define b6 $f13 #define b7 $f15 #define b8 $f16 #define c11 $f14 #define c12 $f17 #define c13 $f18 #define c14 $f19 #define c21 $f20 #define c22 $f21 #define c23 $f22 #define c24 $f23 #define c31 $f24 #define c32 $f25 #define c33 $f26 #define c34 $f27 #define c41 $f28 #define c42 $f29 #define c43 $f30 #define c44 $f31 #define F0 0 #define F1 1 #define F2 2 #define F3 3 #define F4 4 #define F5 5 #define F6 6 #define F7 7 #define F8 8 #define F9 9 #define F10 10 #define F11 11 #define F12 12 #define F13 13 #define F14 14 #define F15 15 #define F16 16 #define F17 17 #define F18 18 #define F19 19 #define F20 20 #define F21 21 #define F22 22 #define F23 23 #define F24 24 #define F25 25 #define F26 26 #define F27 27 #define F28 28 #define F29 29 #define F30 30 #define F31 31 #define ALPHA_R 
$f15 #define ALPHA_I $f16 ################################# ## MADD1 a*c ## MADD2 b*c ## MADD3 a*d ## MADD4 d*b ################################## #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define MADD1 MADD #define MADD2 MADD #define MADD3 MADD #define MADD4 NMSUB #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) #define MADD1 MADD #define MADD2 MADD #define MADD3 NMSUB #define MADD4 MADD #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) #define MADD1 MADD #define MADD2 NMSUB #define MADD3 MADD #define MADD4 MADD #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) #define MADD1 MADD #define MADD2 NMSUB #define MADD3 NMSUB #define MADD4 NMSUB #endif PROLOGUE LDARG LDC, 0($sp) daddiu $sp, $sp, -STACKSIZE SDARG $16, 0($sp) SDARG $17, 8($sp) sdc1 $f24, 16($sp) sdc1 $f25, 24($sp) sdc1 $f26, 32($sp) sdc1 $f27, 40($sp) sdc1 $f28, 48($sp) sdc1 $f29, 56($sp) #if defined(TRMMKERNEL) SDARG $18, 64($sp) SDARG $19, 72($sp) SDARG $20, 80($sp) LDARG OFFSET, STACKSIZE + 8($sp) #endif #ifndef __64BIT__ sdc1 $f20, 88($sp) sdc1 $f21, 96($sp) sdc1 $f22,104($sp) sdc1 $f23,112($sp) #endif dsra J, N, 1 # J=N/2 ST ALPHA_R, 128($sp) # store alpha_r & alpha_i #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE blez J, .L20 ST ALPHA_I, 136($sp) .align 5 .L10: #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif daddiu J, J, -1 dsra I, M, 1 # I=M/2 dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 dsll PREA, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 move CO1, C # Fix pointer Cx daddu CO2, C, LDC move AO, A # Reset AO blez I, .L30 daddu PREA, PREA, A # PREA=A+panel size .L11: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll TEMP, KK, 1 + ZBASE_SHIFT daddu AO, AO, TEMP daddu BO, B, TEMP #endif MTC $0, c11 # Clear results regs LD a1, 0 * SIZE(AO) MOV c12, c11 LD a2, 1 * SIZE(AO) MOV c13, c11 LD b1, 0 * SIZE(BO) MOV c14, c11 LD b2, 1 * SIZE(BO) MOV c21, c11 LD a3, 2 * SIZE(AO) MOV c22, c11 LD a4, 3 * SIZE(AO) MOV c23, c11 LD b3, 2 * SIZE(BO) MOV c24, c11 LD b4, 3 * SIZE(BO) FETCH $0, 0 * SIZE(CO2) MOV c31, c11 MOV c32, c11 FETCH $0, 0 * SIZE(CO1) MOV c33, c11 MOV c34, c11 FETCH $0, 4 * SIZE(CO2) MOV c41, c11 MOV c42, c11 FETCH $0, 4 * SIZE(CO1) MOV c43, c11 MOV c44, c11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 2 #else daddiu TEMP, KK, 2 #endif dsra L, TEMP, 2 daddu PREB, PREB, B # PREA=A+panel size blez L, .L15 NOP #else dsra L, K, 2 # Unroll K 4 times move BO, B MTC $0, c11 # Clear results regs LD a1, 0 * SIZE(AO) MOV c12, c11 LD a2, 1 * SIZE(AO) MOV c13, c11 LD b1, 0 * SIZE(BO) MOV c14, c11 LD b2, 1 * SIZE(BO) MOV c21, c11 LD a3, 2 * SIZE(AO) MOV c22, c11 LD a4, 3 * SIZE(AO) MOV c23, c11 LD b3, 2 * SIZE(BO) MOV c24, c11 LD b4, 3 * SIZE(BO) MOV c31, c11 MOV c32, c11 FETCH $0, 0 * SIZE(CO2) MOV c33, c11 MOV c34, c11 FETCH $0, 0 * SIZE(CO1) MOV c41, c11 MOV c42, c11 FETCH $0, 4 * SIZE(CO2) MOV c43, c11 NOP FETCH $0, 4 * SIZE(CO1) daddu PREB, PREB, B # PREA=A+panel size blez L, .L15 MOV c44, c11 #endif .align 5 .L12: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 LD b7, 6 * SIZE(BO) LD 
b8, 7 * SIZE(BO) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 FETCH $0, 4 * SIZE(PREA) FETCH $0, 4 * SIZE(PREB) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 MADD1 c41, c41, a3, b3 # A2xB2 MADD3 c43, c43, a3, b4 MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) MADD2 c12, c12, a6, b5 # bxc MADD4 c14, c14, a6, b6 # bxd LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) MADD1 c21, c21, a7, b5 # A2xB1 MADD3 c23, c23, a7, b6 LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MADD2 c22, c22, a8, b5 MADD4 c24, c24, a8, b6 FETCH $0, 8 * SIZE(PREA) FETCH $0, 8 * SIZE(PREB) MADD1 c31, c31, a5, b7 # A1xB2 MADD3 c33, c33, a5, b8 MADD2 c32, c32, a6, b7 MADD4 c34, c34, a6, b8 MADD1 c41, c41, a7, b7 # A2xB2 MADD3 c43, c43, a7, b8 MADD2 c42, c42, a8, b7 MADD4 c44, c44, a8, b8 LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd LD a7, 14 * SIZE(AO) LD a8, 15 * SIZE(AO) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 FETCH $0, 12 * SIZE(PREA) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 daddiu L, L, -1 FETCH $0, 12 * SIZE(PREB) MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 daddiu AO, AO, 16 * SIZE daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx MADD1 c41, c41, a3, b3 # A2xB2 MADD3 c43, c43, a3, b4 daddu PREA, PREA, 16 * SIZE MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 daddu PREB, PREB, 16 * SIZE LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD2 c12, c12, a6, b5 # bxc MADD4 c14, c14, a6, b6 # bxd LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MADD1 c21, c21, a7, b5 # A2xB1 MADD3 c23, c23, a7, b6 LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD2 c22, c22, a8, b5 MADD4 c24, c24, a8, b6 FETCH $0, 0 * SIZE(PREA) FETCH $0, 0 * SIZE(PREB) MADD1 c31, c31, a5, b7 # A1xB2 MADD3 c33, c33, a5, b8 MADD2 c32, c32, a6, b7 MADD4 c34, c34, a6, b8 MADD1 c41, c41, a7, b7 # A2xB2 MADD3 c43, c43, a7, b8 MADD2 c42, c42, a8, b7 bgtz L, .L12 MADD4 c44, c44, a8, b8 .align 5 .L15: #ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) #else andi L, TEMP, 3 LD ALPHA_R, 128($sp) #endif blez L, .L18 LD ALPHA_I, 136($sp) .align 5 .L16: daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd daddiu PREA, PREA, 4 * SIZE daddiu PREB, PREB, 4 * SIZE MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 FETCH $0, 0 * SIZE(PREA) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 daddiu L, L, -1 MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 FETCH $0, 0 * SIZE(PREB) MADD1 c41, c41, a3, b3 # A2xB2 MADD3 c43, c43, a3, b4 MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) bgtz L, .L16 NOP .L18: #ifndef TRMMKERNEL ADD c11, c14, c11 LD a1, 0 * SIZE(CO1) ADD c12, c13, c12 LD a2, 1 * SIZE(CO1) ADD c21, c24, c21 LD b1, 2 * SIZE(CO1) ADD c22, c23, c22 LD b2, 3 * SIZE(CO1) ADD c31, c34, c31 LD a3, 0 * 
SIZE(CO2) ADD c32, c33, c32 LD a4, 1 * SIZE(CO2) ADD c41, c44, c41 LD b3, 2 * SIZE(CO2) ADD c42, c43, c42 LD b4, 3 * SIZE(CO2) daddiu I, I, -1 MADD a1, a1, ALPHA_R, c11 MADD a2, a2, ALPHA_R, c12 MADD b1, b1, ALPHA_R, c21 MADD b2, b2, ALPHA_R, c22 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB b1, b1, ALPHA_I, c22 MADD b2, b2, ALPHA_I, c21 MADD a3, a3, ALPHA_R, c31 MADD a4, a4, ALPHA_R, c32 ST a1, 0 * SIZE(CO1) MADD b3, b3, ALPHA_R, c41 MADD b4, b4, ALPHA_R, c42 ST a2, 1 * SIZE(CO1) NMSUB a3, a3, ALPHA_I, c32 MADD a4, a4, ALPHA_I, c31 ST b1, 2 * SIZE(CO1) NMSUB b3, b3, ALPHA_I, c42 MADD b4, b4, ALPHA_I, c41 ST b2, 3 * SIZE(CO1) ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) ST b3, 2 * SIZE(CO2) ST b4, 3 * SIZE(CO2) #else ADD c11, c14, c11 ADD c12, c13, c12 ADD c21, c24, c21 ADD c22, c23, c22 ADD c31, c34, c31 ADD c32, c33, c32 ADD c41, c44, c41 ADD c42, c43, c42 daddiu I, I, -1 MUL a1, ALPHA_R, c11 MUL a2, ALPHA_R, c12 MUL b1, ALPHA_R, c21 MUL b2, ALPHA_R, c22 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB b1, b1, ALPHA_I, c22 MADD b2, b2, ALPHA_I, c21 MUL a3, ALPHA_R, c31 MUL a4, ALPHA_R, c32 MUL b3, ALPHA_R, c41 MUL b4, ALPHA_R, c42 NMSUB a3, a3, ALPHA_I, c32 MADD a4, a4, ALPHA_I, c31 NMSUB b3, b3, ALPHA_I, c42 MADD b4, b4, ALPHA_I, c41 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) ST b1, 2 * SIZE(CO1) ST b2, 3 * SIZE(CO1) ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) ST b3, 2 * SIZE(CO2) ST b4, 3 * SIZE(CO2) #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -2 #endif dsll TEMP, TEMP, 1 + ZBASE_SHIFT daddu AO, AO, TEMP daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 daddiu CO1,CO1, 4 * SIZE bgtz I, .L11 daddiu CO2,CO2, 4 * SIZE .align 5 .L30: andi I, M, 1 daddu C, C, LDC # Change C to next panel daddu PREB, PREB, B # PREA=A+panel size blez I, .L19 daddu C, C, LDC # Change C to next panel #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, ZBASE_SHIFT # MR=1 dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 daddu AO, AO, L daddu BO, B, TEMP #endif LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MTC $0, c11 # Clear results regs MOV c12, c11 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MOV c13, c11 MOV c14, c11 LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MOV c31, c11 MOV c32, c11 FETCH $0, 0 * SIZE(PREB) MOV c33, c11 MOV c34, c11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 0 * SIZE(CO2) FETCH $0, 4 * SIZE(CO1) FETCH $0, 4 * SIZE(CO2) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 # MR=1 #else daddiu TEMP, KK, 2 # NR=2 #endif dsra L, TEMP, 2 blez L, .L35 NOP #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) dsra L, K, 2 # Unroll K 4 times move BO, B LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MTC $0, c11 # Clear results regs MOV c12, c11 LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MOV c13, c11 MOV c14, c11 FETCH $0, 0 * SIZE(PREB) MOV c31, c11 MOV c32, c11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 0 * SIZE(CO2) FETCH $0, 4 * SIZE(CO1) FETCH $0, 4 * SIZE(CO2) MOV c33, c11 blez L, .L35 MOV c34, c11 #endif .align 5 .L32: LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, 
a1, b4 FETCH $0, 4 * SIZE(PREB) MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 NOP LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) MADD1 c11, c11, a3, b5 # axc A1xB1 MADD3 c13, c13, a3, b6 # axd LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) MADD2 c12, c12, a4, b5 # bxc MADD4 c14, c14, a4, b6 # bxd LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MADD1 c31, c31, a3, b7 # A1xB2 MADD3 c33, c33, a3, b8 FETCH $0, 8 * SIZE(PREB) MADD2 c32, c32, a4, b7 MADD4 c34, c34, a4, b8 daddiu L, L, -1 LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) MADD1 c11, c11, a5, b1 # axc A1xB1 MADD3 c13, c13, a5, b2 # axd LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) MADD2 c12, c12, a6, b1 # bxc MADD4 c14, c14, a6, b2 # bxd LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MADD1 c31, c31, a5, b3 # A1xB2 MADD3 c33, c33, a5, b4 daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx FETCH $0, 12 * SIZE(PREB) MADD2 c32, c32, a6, b3 MADD4 c34, c34, a6, b4 NOP LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MADD1 c11, c11, a7, b5 # axc A1xB1 MADD3 c13, c13, a7, b6 # axd LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD2 c12, c12, a8, b5 # bxc MADD4 c14, c14, a8, b6 # bxd LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD1 c31, c31, a7, b7 # A1xB2 NOP MADD3 c33, c33, a7, b8 daddiu PREB, PREB, 16 * SIZE FETCH $0, 0 * SIZE(PREB) MADD2 c32, c32, a8, b7 bgtz L, .L32 MADD4 c34, c34, a8, b8 .L35: #ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) #else andi L, TEMP, 3 LD ALPHA_R, 128($sp) #endif blez L, .L38 LD ALPHA_I, 136($sp) .align 5 .L36: daddiu L, L, -1 MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 daddiu PREB, PREB, 4 * SIZE MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) bgtz L, .L36 NOP .L38: #ifndef TRMMKERNEL ADD c11, c14, c11 LD a1, 0 * SIZE(CO1) ADD c12, c13, c12 LD a2, 1 * SIZE(CO1) ADD c31, c34, c31 LD a3, 0 * SIZE(CO2) ADD c32, c33, c32 LD a4, 1 * SIZE(CO2) MADD a1, a1, ALPHA_R, c11 MADD a2, a2, ALPHA_R, c12 MADD a3, a3, ALPHA_R, c31 MADD a4, a4, ALPHA_R, c32 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB a3, a3, ALPHA_I, c32 MADD a4, a4, ALPHA_I, c31 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE #else ADD c11, c14, c11 ADD c12, c13, c12 ADD c31, c34, c31 ADD c32, c33, c32 MUL a1, ALPHA_R, c11 MUL a2, ALPHA_R, c12 MUL a3, ALPHA_R, c31 MUL a4, ALPHA_R, c32 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB a3, a3, ALPHA_I, c32 MADD a4, a4, ALPHA_I, c31 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -2 #endif dsll L, TEMP, ZBASE_SHIFT dsll TEMP, TEMP, 1 + ZBASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif #endif .align 5 .L19: #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK, 2 #endif bgtz J, .L10 move B, BO .align 5 .L20: andi J, N, 1 blez J, .L999 dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 dsra I, M, 1 # I=M/2 move CO1, C #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif move AO, A # Reset AO blez I, .L29 daddu PREA, PREA, 
A .L21: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 1 + ZBASE_SHIFT dsll TEMP, KK, ZBASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MTC $0, c11 # Clear results regs MOV c12, c11 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MOV c13, c11 MOV c14, c11 LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MOV c21, c11 MOV c22, c11 FETCH $0, 0 * SIZE(PREA) MOV c23, c11 MOV c24, c11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 4 * SIZE(CO1) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 2 # define Mr=2 #else daddiu TEMP, KK, 1 # define NR=1 #endif dsra L, TEMP, 2 blez L, .L25 NOP #else dsra L, K, 2 # Unroll K 4 times move BO, B LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MTC $0, c11 # Clear results regs MOV c12, c11 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MOV c13, c11 MOV c14, c11 LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MOV c21, c11 MOV c22, c11 FETCH $0, 0 * SIZE(PREA) MOV c23, c11 MOV c24, c11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 4 * SIZE(CO1) blez L, .L25 NOP #endif .align 5 .L22: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 FETCH $0, 4 * SIZE(PREA) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) MADD1 c11, c11, a5, b3 # axc A1xB1 MADD3 c13, c13, a5, b4 # axd LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) MADD2 c12, c12, a6, b3 # bxc MADD4 c14, c14, a6, b4 # bxd LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) MADD1 c21, c21, a7, b3 # A2xB1 MADD3 c23, c23, a7, b4 FETCH $0, 8 * SIZE(PREA) MADD2 c22, c22, a8, b3 MADD4 c24, c24, a8, b4 daddiu L, L, -1 LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) MADD1 c11, c11, a1, b5 # axc A1xB1 MADD3 c13, c13, a1, b6 # axd LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD2 c12, c12, a2, b5 # bxc MADD4 c14, c14, a2, b6 # bxd LD a7, 14 * SIZE(AO) LD a8, 15 * SIZE(AO) MADD1 c21, c21, a3, b5 # A2xB1 MADD3 c23, c23, a3, b6 daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx FETCH $0, 12 * SIZE(PREA) MADD2 c22, c22, a4, b5 MADD4 c24, c24, a4, b6 daddiu PREA, PREA, 16 * SIZE LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MADD1 c11, c11, a5, b7 # axc A1xB1 MADD3 c13, c13, a5, b8 # axd LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD2 c12, c12, a6, b7 # bxc MADD4 c14, c14, a6, b8 # bxd LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MADD1 c21, c21, a7, b7 # A2xB1 MADD3 c23, c23, a7, b8 FETCH $0, 0 * SIZE(PREA) MADD2 c22, c22, a8, b7 bgtz L, .L22 MADD4 c24, c24, a8, b8 .L25: #ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) #else andi L, TEMP, 3 LD ALPHA_R, 128($sp) #endif blez L, .L28 LD ALPHA_I, 136($sp) .align 3 .L26: daddiu L, L, -1 MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 daddiu PREA, PREA, 4 * SIZE # 2mr*1kr*cmpx MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 # gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) bgtz L, 
.L26 FETCH $0, 0 * SIZE(PREA) .L28: #ifndef TRMMKERNEL ADD c11, c14, c11 LD a1, 0 * SIZE(CO1) ADD c12, c13, c12 LD a2, 1 * SIZE(CO1) ADD c21, c24, c21 LD b1, 2 * SIZE(CO1) ADD c22, c23, c22 LD b2, 3 * SIZE(CO1) daddiu I, I, -1 MADD a1, a1, ALPHA_R, c11 MADD a2, a2, ALPHA_R, c12 MADD b1, b1, ALPHA_R, c21 MADD b2, b2, ALPHA_R, c22 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB b1, b1, ALPHA_I, c22 MADD b2, b2, ALPHA_I, c21 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) ST b1, 2 * SIZE(CO1) ST b2, 3 * SIZE(CO1) #else ADD c11, c14, c11 ADD c12, c13, c12 ADD c21, c24, c21 ADD c22, c23, c22 daddiu I, I, -1 MUL a1, ALPHA_R, c11 MUL a2, ALPHA_R, c12 MUL b1, ALPHA_R, c21 MUL b2, ALPHA_R, c22 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB b1, b1, ALPHA_I, c22 MADD b2, b2, ALPHA_I, c21 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) ST b1, 2 * SIZE(CO1) ST b2, 3 * SIZE(CO1) #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -1 #endif dsll L, TEMP, 1 + ZBASE_SHIFT dsll TEMP, TEMP, ZBASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif daddiu CO1,CO1, 4 * SIZE bgtz I, .L21 NOP .L29: andi I, M, 1 blez I, .L999 NOP #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll TEMP, KK, ZBASE_SHIFT daddu AO, AO, TEMP daddu BO, B, TEMP #endif # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MTC $0, c11 # Clear results regs MOV c12, c11 # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MOV c13, c11 MOV c14, c11 FETCH $0, 0 * SIZE(PREA) FETCH $0, 4 * SIZE(PREA) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 1 #endif dsra L, TEMP, 2 blez L, .L45 NOP #else dsra L, K, 2 # Unroll K 4 times move BO, B # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MTC $0, c11 # Clear results regs MOV c12, c11 # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MOV c13, c11 MOV c14, c11 FETCH $0, 0 * SIZE(PREA) FETCH $0, 4 * SIZE(PREA) blez L, .L45 NOP #endif .align 3 .L42: # gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd # gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd # gsLQC1(R12, F9, F8, 2) # Unroll K=1 LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) MADD1 c11, c11, a3, b3 # axc A1xB1 MADD3 c13, c13, a3, b4 # axd # gsLQC1(R13, F13, F12, 2) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) MADD2 c12, c12, a4, b3 # bxc MADD4 c14, c14, a4, b4 # bxd # gsLQC1(R12, F11, F10, 3) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd daddiu L, L, -1 # gsLQC1(R13, F16, F15, 3) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD2 c12, c12, a6, b5 # bxc MADD4 c14, c14, a6, b6 # bxd daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MADD1 c11, c11, a7, b7 # axc A1xB1 MADD3 c13, c13, a7, b8 # axd # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD2 c12, c12, a8, b7 # bxc MADD4 c14, c14, a8, b8 # bxd bgtz L, .L42 NOP .align 5 .L45: #ifndef TRMMKERNEL andi L, K, 
3 LD ALPHA_R, 128($sp) #else andi L, TEMP, 3 LD ALPHA_R, 128($sp) #endif blez L, .L48 LD ALPHA_I, 136($sp) .L46: daddiu L, L, -1 daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) bgtz L, .L46 NOP .L48: #ifndef TRMMKERNEL ADD c11, c14, c11 ADD c12, c13, c12 LD a1, 0 * SIZE(CO1) LD a2, 1 * SIZE(CO1) MADD a1, a1, ALPHA_R, c11 MADD a2, a2, ALPHA_R, c12 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) #else ADD c11, c14, c11 ADD c12, c13, c12 MUL a1, ALPHA_R, c11 MUL a2, ALPHA_R, c12 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -1 #endif dsll TEMP, TEMP, ZBASE_SHIFT daddu AO, AO, TEMP daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif daddiu CO1,CO1, 2 * SIZE #endif .align 5 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) ldc1 $f24, 16($sp) ldc1 $f25, 24($sp) ldc1 $f26, 32($sp) ldc1 $f27, 40($sp) ldc1 $f28, 48($sp) ldc1 $f29, 56($sp) #if defined(TRMMKERNEL) LDARG $18, 64($sp) LDARG $19, 72($sp) LDARG $20, 80($sp) #endif #ifndef __64BIT__ ldc1 $f20, 88($sp) ldc1 $f21, 96($sp) ldc1 $f22,104($sp) ldc1 $f23,112($sp) #endif j $31 daddiu $sp, $sp, STACKSIZE EPILOGUE OpenBLAS-0.2.20/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S000066400000000000000000002233501313527062700236700ustar00rootroot00000000000000##define REALNAME gemm #define ASSEMBLER #include "common.h" #define FETCH ld #define STACKSIZE 192 #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) ##### Parameter registers #### #define M $4 #define N $5 #define K $6 #define A $9 #define B $10 #define C $11 #define LDC $8 #### Pointer A, B, C #### #define AO $12 #define BO $13 #define CO1 $14 #define CO2 $15 #define PREA $18 #define PREB $19 #### Used registers #### #define A1 $f0 #define A2 $f1 #define A3 $f2 #define A4 $f3 #define A5 $f4 #define A6 $f5 #define A7 $f6 #define A8 $f7 #define B1 $f8 #define B2 $f9 #define B3 $f10 #define B4 $f11 #define B5 $f12 #define B6 $f13 #define B7 $f14 #define B8 $f15 #define C11 $f16 #define C12 $f17 #define C21 $f18 #define C22 $f19 #define C31 $f20 #define C32 $f21 #define C41 $f22 #define C42 $f23 #define C13 $f24 #define C14 $f25 #define C23 $f26 #define C24 $f27 #define C33 $f28 #define C34 $f29 #define C43 $f30 #define C44 $f31 #define I $2 #define J $3 #define L $7 #### Alpha register #### #define ALPHA $f15 #define F31 31 #define F30 30 #define F29 29 #define F28 28 #define F27 27 #define F26 26 #define F25 25 #define F24 24 #define F23 23 #define F22 22 #define F21 21 #define F20 20 #define F19 19 #define F18 18 #define F17 17 #define F16 16 #define F15 15 #define F14 14 #define F13 13 #define F12 12 #define F11 11 #define F10 10 #define F9 9 #define F8 8 #define F7 7 #define F6 6 #define F5 5 #define F4 4 #define F3 3 #define F2 2 #define F1 1 #define F0 0 #define R12 12 #define R13 13 #define R14 14 #define R15 15 #define R16 16 #define R17 17 #if defined(TRMMKERNEL) 
#define OFFSET $23 #define KK $24 #define TEMP $25 #endif PROLOGUE LDARG LDC, 0($sp) daddiu $sp,$sp,-STACKSIZE sd $16, 0($sp) sd $17, 8($sp) sd $18, 16($sp) sd $19, 24($sp) sd $20, 32($sp) sd $21, 40($sp) sd $22, 48($sp) ST $f24, 56($sp) ST $f25, 64($sp) ST $f26, 72($sp) ST $f27, 80($sp) ST $f28, 88($sp) #if defined(TRMMKERNEL) sd $23, 96($sp) sd $24, 104($sp) sd $25, 112($sp) LDARG OFFSET, STACKSIZE+8($sp) #endif #ifndef __64BIT__ ST $f20,120($sp) ST $f21,128($sp) ST $f22,136($sp) ST $f23,144($sp) #endif .align 4 .L2: dsra J, N, 1 # NR=2 ST $f15, 152($sp) #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif dsll LDC, LDC, ZBASE_SHIFT# LDC*SIZE blez J, .L1 ST $f16, 160($sp) .L24: #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif dsra I, M, 2 # MR=8 move AO, A # Reset A dsll PREA, K, 1 + ZBASE_SHIFT move CO1, C daddu CO2, C, LDC daddu PREA, AO, PREA blez I, .L22 daddu C, CO2, LDC .align 4 .L241: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 2 + ZBASE_SHIFT dsll TEMP, KK, 1 + ZBASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 dsll PREB, K, ZBASE_SHIFT MOV C21, C11 MOV C22, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 MOV C31, C11 MOV C32, C11 gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C41, C11 MOV C42, C11 gsLQC1(R12, F3, F2, 1) # A3 A4 MOV C13, C11 MOV C14, C11 MOV C23, C11 MOV C24, C11 MOV C33, C11 MOV C34, C11 MOV C43, C11 MOV C44, C11 PLU B3, B1, B1 PLU B4, B2, B2 daddu PREB, BO, PREB FETCH $0, 0 * SIZE(CO1) FETCH $0, 8 * SIZE(CO1) FETCH $0, 0 * SIZE(CO2) FETCH $0, 8 * SIZE(CO2) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 4 #else daddiu TEMP, KK, 2 #endif dsra L, TEMP, 2 blez L, .L242 NOP #else move BO, B # Reset B dsra L, K, 2 # UnRoll K=64 MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 dsll PREB, K, ZBASE_SHIFT MOV C21, C11 MOV C22, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 MOV C31, C11 MOV C32, C11 gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C41, C11 MOV C42, C11 gsLQC1(R12, F3, F2, 1) # A3 A4 MOV C13, C11 MOV C14, C11 FETCH $0, 0 * SIZE(CO1) MOV C23, C11 MOV C24, C11 FETCH $0, 0 * SIZE(CO2) MOV C33, C11 MOV C34, C11 MOV C43, C11 MOV C44, C11 daddu PREB, BO, PREB PLU B3, B1, B1 PLU B4, B2, B2 FETCH $0, 8 * SIZE(CO1) blez L, .L242 FETCH $0, 8 * SIZE(CO2) #endif .L2410: daddiu L, L, -1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 FETCH $0, 0 * SIZE(PREB) MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREA) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 MADPS C24, C24, A2, B4 PLU B7, B5, B5 PLU B8, B6, B6 daddu PREB, PREB, 8 * SIZE MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 FETCH $0, 8 * SIZE(PREA) MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 MADPS C24, C24, A6, B8 PLU B3, B1, B1 PLU B4, B2, B2 MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 gsLQC1(R13, F13, 
F12, 3) # B3 B4 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 daddiu BO, BO, 4 * 4 * SIZE # 4KR*4NR FETCH $0, 16 * SIZE(PREA) MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 daddiu AO, AO, 8 * 4 * SIZE # 4KR*8MR MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 MADPS C24, C24, A2, B4 PLU B7, B5, B5 PLU B8, B6, B6 MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 FETCH $0, 24 * SIZE(PREA) MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddu PREA, PREA, 32 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 MADPS C24, C24, A6, B8 PLU B3, B1, B1 PLU B4, B2, B2 MADPS C34, C34, A7, B8 bgtz L, .L2410 MADPS C44, C44, A8, B8 .align 4 .L242: #ifndef TRMMKERNEL andi L, K, 2 #else andi L, TEMP, 2 #endif blez L, .L247 NOP gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 daddiu BO, BO, 2 * 4 * SIZE # 4KR*4NR MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 daddiu AO, AO, 4 * 4 * SIZE MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 MADPS C24, C24, A2, B4 PLU B7, B5, B5 PLU B8, B6, B6 MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 MADPS C24, C24, A6, B8 PLU B3, B1, B1 PLU B4, B2, B2 MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 .align 4 .L247: #ifndef TRMMKERNEL andi L, K, 1 #else andi L, TEMP, 1 #endif blez L, .L240 NOP MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 daddiu BO, BO, 1 * 4 * SIZE # 4KR*4NR MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 daddiu AO, AO, 2 * 4 * SIZE MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 MADPS C24, C24, A2, B4 MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 .align 4 .L240: # Write Back #ifndef TRMMKERNEL daddiu I, I, -1 CVTU A1, C11 CVTU A2, C21 CVTU A3, C31 CVTU A4, C41 CVTU A5, C13 CVTU A6, C23 CVTU A7, C33 CVTU A8, C43 CVTU B1, C12 CVTU B2, C22 CVTU B3, C32 CVTU B4, C42 CVTU B5, C14 CVTU B6, C24 CVTU B7, C34 CVTU B8, C44 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* (a + bi) * (c + di) */ SUB C11, C11, A1 # ac'+'bd SUB C21, C21, A2 # LD A1, 0 * SIZE(A) # load alpha_r SUB C31, C31, A3 LD A1, 152($sp) # load alpha_r SUB C41, C41, A4 LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_i ADD C13, A5, C13 # ad'+'cb ADD C23, A6, C23 ADD C33, A7, C33 ADD C43, A8, C43 SUB C12, C12, B1 SUB C22, C22, B2 SUB C32, C32, B3 SUB C42, C42, B4 ADD C14, B5, C14 ADD C24, 
B6, C24 ADD C34, B7, C34 ADD C44, B8, C44 LD B1, 0 * SIZE(CO1) LD B3, 2 * SIZE(CO1) LD B5, 4 * SIZE(CO1) LD B7, 6 * SIZE(CO1) LD B2, 1 * SIZE(CO1) LD B4, 3 * SIZE(CO1) LD B6, 5 * SIZE(CO1) LD B8, 7 * SIZE(CO1) MADD B1, B1, C11, A1 # A1 = alpha_r MADD B3, B3, C21, A1 MADD B5, B5, C31, A1 MADD B7, B7, C41, A1 MADD B2, B2, C13, A1 MADD B4, B4, C23, A1 MADD B6, B6, C33, A1 MADD B8, B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 MADD B6, B6, C31, A2 MADD B8, B8, C41, A2 LD C13, 0 * SIZE(CO2) LD C23, 2 * SIZE(CO2) LD C33, 4 * SIZE(CO2) LD C43, 6 * SIZE(CO2) LD C11, 1 * SIZE(CO2) LD C21, 3 * SIZE(CO2) LD C31, 5 * SIZE(CO2) LD C41, 7 * SIZE(CO2) MADD C13, C13, C12, A1 MADD C23, C23, C22, A1 MADD C33, C33, C32, A1 ST B1, 0 * SIZE(CO1) MADD C43, C43, C42, A1 ST B3, 2 * SIZE(CO1) MADD C11, C11, C14, A1 ST B5, 4 * SIZE(CO1) MADD C21, C21, C24, A1 ST B7, 6 * SIZE(CO1) MADD C31, C31, C34, A1 ST B2, 1 * SIZE(CO1) MADD C41, C41, C44, A1 ST B4, 3 * SIZE(CO1) NMSUB C13, C13, C14, A2 ST B6, 5 * SIZE(CO1) NMSUB C23, C23, C24, A2 ST B8, 7 * SIZE(CO1) NMSUB C33, C33, C34, A2 NMSUB C43, C43, C44, A2 MADD C11, C11, C12, A2 MADD C21, C21, C22, A2 MADD C31, C31, C32, A2 MADD C41, C41, C42, A2 ST C13, 0 * SIZE(CO2) ST C23, 2 * SIZE(CO2) ST C33, 4 * SIZE(CO2) ST C43, 6 * SIZE(CO2) ST C11, 1 * SIZE(CO2) ST C21, 3 * SIZE(CO2) ST C31, 5 * SIZE(CO2) ST C41, 7 * SIZE(CO2) #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) /* (a + bi) * (c - di) */ ADD C11, A1, C11 # ac'+'bd ADD C21, A2, C21 # LD A1, 0 * SIZE(A) # load alpha_r ADD C31, A3, C31 LD A1, 152($sp) # load alpha_r ADD C41, A4, C41 LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_r SUB C13, A5, C13 # ad'+'cb SUB C23, A6, C23 SUB C33, A7, C33 SUB C43, A8, C43 ADD C12, B1, C12 ADD C22, B2, C22 ADD C32, B3, C32 ADD C42, B4, C42 SUB C14, B5, C14 SUB C24, B6, C24 SUB C34, B7, C34 SUB C44, B8, C44 LD B1, 0 * SIZE(CO1) LD B3, 2 * SIZE(CO1) LD B5, 4 * SIZE(CO1) LD B7, 6 * SIZE(CO1) LD B2, 1 * SIZE(CO1) LD B4, 3 * SIZE(CO1) LD B6, 5 * SIZE(CO1) LD B8, 7 * SIZE(CO1) MADD B1, B1, C11, A1 # A1 = alpha_r MADD B3, B3, C21, A1 MADD B5, B5, C31, A1 MADD B7, B7, C41, A1 MADD B2, B2, C13, A1 MADD B4, B4, C23, A1 MADD B6, B6, C33, A1 MADD B8, B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 MADD B6, B6, C31, A2 MADD B8, B8, C41, A2 LD C13, 0 * SIZE(CO2) LD C23, 2 * SIZE(CO2) LD C33, 4 * SIZE(CO2) LD C43, 6 * SIZE(CO2) LD C11, 1 * SIZE(CO2) LD C21, 3 * SIZE(CO2) LD C31, 5 * SIZE(CO2) LD C41, 7 * SIZE(CO2) MADD C13, C13, C12, A1 MADD C23, C23, C22, A1 MADD C33, C33, C32, A1 ST B1, 0 * SIZE(CO1) MADD C43, C43, C42, A1 ST B3, 2 * SIZE(CO1) MADD C11, C11, C14, A1 ST B5, 4 * SIZE(CO1) MADD C21, C21, C24, A1 ST B7, 6 * SIZE(CO1) MADD C31, C31, C34, A1 ST B2, 1 * SIZE(CO1) MADD C41, C41, C44, A1 ST B4, 3 * SIZE(CO1) NMSUB C13, C13, C14, A2 ST B6, 5 * SIZE(CO1) NMSUB C23, C23, C24, A2 ST B8, 7 * SIZE(CO1) NMSUB C33, C33, C34, A2 NMSUB C43, C43, C44, A2 MADD C11, C11, C12, A2 MADD C21, C21, C22, A2 MADD C31, C31, C32, A2 MADD C41, C41, C42, A2 ST C13, 0 * SIZE(CO2) ST C23, 2 * SIZE(CO2) ST C33, 4 * SIZE(CO2) ST C43, 6 * SIZE(CO2) ST C11, 1 * SIZE(CO2) ST C21, 3 * SIZE(CO2) ST C31, 5 * SIZE(CO2) ST C41, 7 * SIZE(CO2) #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) /* (a - bi) * (c + di) */ ADD C11, A1, C11 # ac'+'bd ADD C21, 
A2, C21 # LD A1, 0 * SIZE(A) # load alpha_r ADD C31, A3, C31 LD A1, 152($sp) # load alpha_r # LD A2, 0 * SIZE(A) # load alpha_r ADD C41, A4, C41 LD A2, 160($sp) # load alpha_i SUB C13, C13, A5 # ad'+'cb SUB C23, C23, A6 SUB C33, C33, A7 SUB C43, C43, A8 ADD C12, B1, C12 ADD C22, B2, C22 ADD C32, B3, C32 ADD C42, B4, C42 SUB C14, C14, B5 SUB C24, C24, B6 SUB C34, C34, B7 SUB C44, C44, B8 LD B1, 0 * SIZE(CO1) LD B3, 2 * SIZE(CO1) LD B5, 4 * SIZE(CO1) LD B7, 6 * SIZE(CO1) LD B2, 1 * SIZE(CO1) LD B4, 3 * SIZE(CO1) LD B6, 5 * SIZE(CO1) LD B8, 7 * SIZE(CO1) MADD B1, B1, C11, A1 # A1 = alpha_r MADD B3, B3, C21, A1 MADD B5, B5, C31, A1 MADD B7, B7, C41, A1 MADD B2, B2, C13, A1 MADD B4, B4, C23, A1 MADD B6, B6, C33, A1 MADD B8, B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 MADD B6, B6, C31, A2 MADD B8, B8, C41, A2 LD C13, 0 * SIZE(CO2) LD C23, 2 * SIZE(CO2) LD C33, 4 * SIZE(CO2) LD C43, 6 * SIZE(CO2) LD C11, 1 * SIZE(CO2) LD C21, 3 * SIZE(CO2) LD C31, 5 * SIZE(CO2) LD C41, 7 * SIZE(CO2) MADD C13, C13, C12, A1 MADD C23, C23, C22, A1 MADD C33, C33, C32, A1 ST B1, 0 * SIZE(CO1) MADD C43, C43, C42, A1 ST B3, 2 * SIZE(CO1) MADD C11, C11, C14, A1 ST B5, 4 * SIZE(CO1) MADD C21, C21, C24, A1 ST B7, 6 * SIZE(CO1) MADD C31, C31, C34, A1 ST B2, 1 * SIZE(CO1) MADD C41, C41, C44, A1 ST B4, 3 * SIZE(CO1) NMSUB C13, C13, C14, A2 ST B6, 5 * SIZE(CO1) NMSUB C23, C23, C24, A2 ST B8, 7 * SIZE(CO1) NMSUB C33, C33, C34, A2 NMSUB C43, C43, C44, A2 MADD C11, C11, C12, A2 MADD C21, C21, C22, A2 MADD C31, C31, C32, A2 MADD C41, C41, C42, A2 ST C13, 0 * SIZE(CO2) ST C23, 2 * SIZE(CO2) ST C33, 4 * SIZE(CO2) ST C43, 6 * SIZE(CO2) ST C11, 1 * SIZE(CO2) ST C21, 3 * SIZE(CO2) ST C31, 5 * SIZE(CO2) ST C41, 7 * SIZE(CO2) #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* (a - bi) * (c - di) */ SUB C11, C11, A1 # ac'+'bd SUB C21, C21, A2 SUB C31, C31, A3 LD A1, 152($sp) # load alpha_r # LD A1, 0 * SIZE(A) # load alpha_r SUB C41, C41, A4 LD A2, 160($sp) # LD A2, 0 * SIZE(A) # load alpha_i ADD C13, A5, C13 # ad'+'cb ADD C23, A6, C23 ADD C33, A7, C33 ADD C43, A8, C43 SUB C12, C12, B1 SUB C22, C22, B2 SUB C32, C32, B3 SUB C42, C42, B4 ADD C14, B5, C14 ADD C24, B6, C24 ADD C34, B7, C34 ADD C44, B8, C44 NEG C13, C13 NEG C23, C23 NEG C33, C33 NEG C43, C43 NEG C14, C14 NEG C24, C24 NEG C34, C34 NEG C44, C44 LD B1, 0 * SIZE(CO1) LD B3, 2 * SIZE(CO1) LD B5, 4 * SIZE(CO1) LD B7, 6 * SIZE(CO1) LD B2, 1 * SIZE(CO1) LD B4, 3 * SIZE(CO1) LD B6, 5 * SIZE(CO1) LD B8, 7 * SIZE(CO1) MADD B1, B1, C11, A1 # A1 = alpha_r MADD B3, B3, C21, A1 MADD B5, B5, C31, A1 MADD B7, B7, C41, A1 MADD B2, B2, C13, A1 MADD B4, B4, C23, A1 MADD B6, B6, C33, A1 MADD B8, B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 MADD B6, B6, C31, A2 MADD B8, B8, C41, A2 LD C13, 0 * SIZE(CO2) LD C43, 6 * SIZE(CO2) LD C23, 2 * SIZE(CO2) LD C33, 4 * SIZE(CO2) LD C11, 1 * SIZE(CO2) LD C21, 3 * SIZE(CO2) LD C31, 5 * SIZE(CO2) LD C41, 7 * SIZE(CO2) MADD C13, C13, C12, A1 ST B1, 0 * SIZE(CO1) MADD C23, C23, C22, A1 ST B3, 2 * SIZE(CO1) MADD C33, C33, C32, A1 ST B5, 4 * SIZE(CO1) MADD C43, C43, C42, A1 ST B7, 6 * SIZE(CO1) MADD C11, C11, C14, A1 ST B2, 1 * SIZE(CO1) MADD C21, C21, C24, A1 ST B4, 3 * SIZE(CO1) MADD C31, C31, C34, A1 ST B6, 5 * SIZE(CO1) MADD C41, C41, C44, A1 ST B8, 7 * SIZE(CO1) NMSUB C13, C13, C14, A2 NMSUB C23, C23, C24, A2 NMSUB C33, 
C33, C34, A2 NMSUB C43, C43, C44, A2 MADD C11, C11, C12, A2 MADD C21, C21, C22, A2 MADD C31, C31, C32, A2 MADD C41, C41, C42, A2 ST C13, 0 * SIZE(CO2) ST C23, 2 * SIZE(CO2) ST C33, 4 * SIZE(CO2) ST C43, 6 * SIZE(CO2) ST C11, 1 * SIZE(CO2) ST C21, 3 * SIZE(CO2) ST C31, 5 * SIZE(CO2) ST C41, 7 * SIZE(CO2) #endif #else daddiu I, I, -1 CVTU A1, C11 CVTU A2, C21 CVTU A3, C31 CVTU A4, C41 CVTU A5, C13 CVTU A6, C23 CVTU A7, C33 CVTU A8, C43 CVTU B1, C12 CVTU B2, C22 CVTU B3, C32 CVTU B4, C42 CVTU B5, C14 CVTU B6, C24 CVTU B7, C34 CVTU B8, C44 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* (a + bi) * (c + di) */ SUB C11, C11, A1 # ac'+'bd SUB C21, C21, A2 SUB C31, C31, A3 LD A1, 152($sp) # load alpha_r SUB C41, C41, A4 # LD A1, 0 * SIZE(A) # load alpha_r LD A2, 160($sp) # load alpha_i ADD C13, A5, C13 # ad'+'cb ADD C23, A6, C23 # LD A2, 0 * SIZE(A) # load alpha_i ADD C33, A7, C33 ADD C43, A8, C43 SUB C12, C12, B1 SUB C22, C22, B2 SUB C32, C32, B3 SUB C42, C42, B4 ADD C14, B5, C14 ADD C24, B6, C24 ADD C34, B7, C34 ADD C44, B8, C44 MUL B1, C11, A1 # A1 = alpha_r MUL B3, C21, A1 MUL B5, C31, A1 MUL B7, C41, A1 MUL B2, C13, A1 MUL B4, C23, A1 MUL B6, C33, A1 MUL B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 MADD B6, B6, C31, A2 MADD B8, B8, C41, A2 ST B1, 0 * SIZE(CO1) MUL C13, C12, A1 MUL C23, C22, A1 ST B3, 2 * SIZE(CO1) MUL C33, C32, A1 MUL C43, C42, A1 ST B5, 4 * SIZE(CO1) MUL C11, C14, A1 MUL C21, C24, A1 ST B7, 6 * SIZE(CO1) MUL C31, C34, A1 MUL C41, C44, A1 ST B2, 1 * SIZE(CO1) NMSUB C13, C13, C14, A2 NMSUB C23, C23, C24, A2 ST B4, 3 * SIZE(CO1) NMSUB C33, C33, C34, A2 NMSUB C43, C43, C44, A2 ST B6, 5 * SIZE(CO1) MADD C11, C11, C12, A2 MADD C21, C21, C22, A2 ST B8, 7 * SIZE(CO1) MADD C31, C31, C32, A2 MADD C41, C41, C42, A2 ST C13, 0 * SIZE(CO2) ST C23, 2 * SIZE(CO2) ST C33, 4 * SIZE(CO2) ST C43, 6 * SIZE(CO2) ST C11, 1 * SIZE(CO2) ST C21, 3 * SIZE(CO2) ST C31, 5 * SIZE(CO2) ST C41, 7 * SIZE(CO2) #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) /* (a + bi) * (c - di) */ ADD C11, A1, C11 # ac'+'bd ADD C21, A2, C21 # LD A1, 0 * SIZE(A) # load alpha_r ADD C31, A3, C31 LD A1, 152($sp) # load alpha_r ADD C41, A4, C41 LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_r SUB C13, A5, C13 # ad'+'cb SUB C23, A6, C23 SUB C33, A7, C33 SUB C43, A8, C43 ADD C12, B1, C12 ADD C22, B2, C22 ADD C32, B3, C32 ADD C42, B4, C42 SUB C14, B5, C14 SUB C24, B6, C24 SUB C34, B7, C34 SUB C44, B8, C44 MUL B1, C11, A1 # A1 = alpha_r MUL B3, C21, A1 MUL B5, C31, A1 MUL B7, C41, A1 MUL B2, C13, A1 MUL B4, C23, A1 MUL B6, C33, A1 MUL B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 MADD B6, B6, C31, A2 MADD B8, B8, C41, A2 MUL C13, C12, A1 MUL C23, C22, A1 ST B1, 0 * SIZE(CO1) MUL C33, C32, A1 MUL C43, C42, A1 ST B3, 2 * SIZE(CO1) MUL C11, C14, A1 MUL C21, C24, A1 ST B5, 4 * SIZE(CO1) MUL C31, C34, A1 MUL C41, C44, A1 ST B7, 6 * SIZE(CO1) NMSUB C13, C13, C14, A2 NMSUB C23, C23, C24, A2 ST B2, 1 * SIZE(CO1) NMSUB C33, C33, C34, A2 NMSUB C43, C43, C44, A2 ST B4, 3 * SIZE(CO1) MADD C11, C11, C12, A2 MADD C21, C21, C22, A2 ST B6, 5 * SIZE(CO1) MADD C31, C31, C32, A2 MADD C41, C41, C42, A2 ST B8, 7 * SIZE(CO1) ST C13, 0 * SIZE(CO2) ST C23, 2 * SIZE(CO2) ST C33, 4 * SIZE(CO2) ST C43, 6 * SIZE(CO2) ST C11, 1 * SIZE(CO2) ST C21, 3 * SIZE(CO2) ST C31, 5 * SIZE(CO2) ST 
C41, 7 * SIZE(CO2) #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) /* (a - bi) * (c + di) */ ADD C11, A1, C11 # ac'+'bd ADD C21, A2, C21 # LD A1, 0 * SIZE(A) # load alpha_r ADD C31, A3, C31 LD A1, 152($sp) # load alpha_r # LD A2, 0 * SIZE(A) # load alpha_r ADD C41, A4, C41 LD A2, 160($sp) # load alpha_i SUB C13, C13, A5 # ad'+'cb SUB C23, C23, A6 SUB C33, C33, A7 SUB C43, C43, A8 ADD C12, B1, C12 ADD C22, B2, C22 ADD C32, B3, C32 ADD C42, B4, C42 SUB C14, C14, B5 SUB C24, C24, B6 SUB C34, C34, B7 SUB C44, C44, B8 MUL B1, C11, A1 # A1 = alpha_r MUL B3, C21, A1 MUL B5, C31, A1 MUL B7, C41, A1 MUL B2, C13, A1 MUL B4, C23, A1 MUL B6, C33, A1 MUL B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 MADD B6, B6, C31, A2 MADD B8, B8, C41, A2 MUL C13, C12, A1 MUL C23, C22, A1 ST B1, 0 * SIZE(CO1) MUL C33, C32, A1 MUL C43, C42, A1 ST B3, 2 * SIZE(CO1) MUL C11, C14, A1 MUL C21, C24, A1 ST B5, 4 * SIZE(CO1) MUL C31, C34, A1 MUL C41, C44, A1 ST B7, 6 * SIZE(CO1) NMSUB C13, C13, C14, A2 NMSUB C23, C23, C24, A2 ST B2, 1 * SIZE(CO1) NMSUB C33, C33, C34, A2 NMSUB C43, C43, C44, A2 ST B4, 3 * SIZE(CO1) MADD C11, C11, C12, A2 MADD C21, C21, C22, A2 ST B6, 5 * SIZE(CO1) MADD C31, C31, C32, A2 MADD C41, C41, C42, A2 ST B8, 7 * SIZE(CO1) ST C13, 0 * SIZE(CO2) ST C23, 2 * SIZE(CO2) ST C33, 4 * SIZE(CO2) ST C43, 6 * SIZE(CO2) ST C11, 1 * SIZE(CO2) ST C21, 3 * SIZE(CO2) ST C31, 5 * SIZE(CO2) ST C41, 7 * SIZE(CO2) #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* (a - bi) * (c - di) */ SUB C11, C11, A1 # ac'+'bd SUB C21, C21, A2 SUB C31, C31, A3 LD A1, 152($sp) # load alpha_r # LD A1, 0 * SIZE(A) # load alpha_r SUB C41, C41, A4 LD A2, 160($sp) # LD A2, 0 * SIZE(A) # load alpha_i ADD C13, A5, C13 # ad'+'cb ADD C23, A6, C23 ADD C33, A7, C33 ADD C43, A8, C43 SUB C12, C12, B1 SUB C22, C22, B2 SUB C32, C32, B3 SUB C42, C42, B4 ADD C14, B5, C14 ADD C24, B6, C24 ADD C34, B7, C34 ADD C44, B8, C44 NEG C13, C13 NEG C23, C23 NEG C33, C33 NEG C43, C43 NEG C14, C14 NEG C24, C24 NEG C34, C34 NEG C44, C44 MUL B1, C11, A1 # A1 = alpha_r MUL B3, C21, A1 MUL B5, C31, A1 MUL B7, C41, A1 MUL B2, C13, A1 MUL B4, C23, A1 MUL B6, C33, A1 MUL B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 MADD B6, B6, C31, A2 MADD B8, B8, C41, A2 ST B1, 0 * SIZE(CO1) MUL C13, C12, A1 MUL C23, C22, A1 ST B3, 2 * SIZE(CO1) MUL C33, C32, A1 MUL C43, C42, A1 ST B5, 4 * SIZE(CO1) MUL C11, C14, A1 MUL C21, C24, A1 ST B7, 6 * SIZE(CO1) MUL C31, C34, A1 MUL C41, C44, A1 ST B2, 1 * SIZE(CO1) NMSUB C13, C13, C14, A2 NMSUB C23, C23, C24, A2 ST B4, 3 * SIZE(CO1) NMSUB C33, C33, C34, A2 NMSUB C43, C43, C44, A2 ST B6, 5 * SIZE(CO1) MADD C11, C11, C12, A2 MADD C21, C21, C22, A2 ST B8, 7 * SIZE(CO1) MADD C31, C31, C32, A2 MADD C41, C41, C42, A2 ST C13, 0 * SIZE(CO2) ST C23, 2 * SIZE(CO2) ST C33, 4 * SIZE(CO2) ST C43, 6 * SIZE(CO2) ST C11, 1 * SIZE(CO2) ST C21, 3 * SIZE(CO2) ST C31, 5 * SIZE(CO2) ST C41, 7 * SIZE(CO2) #endif #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -4 #else daddiu TEMP, TEMP, -2 #endif dsll L, TEMP, 2 + ZBASE_SHIFT dsll TEMP, TEMP, 1 + ZBASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 4 #endif #endif daddiu CO1, CO1, 8 * SIZE bgtz I, .L241 daddiu CO2, CO2, 8 * SIZE .align 4 .L22: andi 
I, M, 2 # MR=4 blez I, .L21 NOP .align 4 .L221: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 daddu AO, AO, TEMP daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 MOV C21, C11 MOV C22, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C13, C11 MOV C14, C11 MOV C23, C11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 8 * SIZE(CO1) MOV C24, C11 FETCH $0, 0 * SIZE(CO2) FETCH $0, 8 * SIZE(CO2) PLU B3, B1, B1 PLU B4, B2, B2 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 2 # MR=2 #else daddiu TEMP, KK, 2 # NR=2 #endif dsra L, TEMP, 2 blez L, .L222 NOP #else move BO, B # Reset B dsra L, K, 2 # UnRoll K=64 MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 MOV C21, C11 MOV C22, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C13, C11 MOV C14, C11 MOV C23, C11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 8 * SIZE(CO1) MOV C24, C11 FETCH $0, 0 * SIZE(CO2) FETCH $0, 8 * SIZE(CO2) PLU B3, B1, B1 blez L, .L222 PLU B4, B2, B2 #endif .L2210: daddiu L, L, -1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C14, C14, A1, B4 MADPS C24, C24, A2, B4 gsLQC1(R12, F5, F4, 2) # A5 A6 PLU B7, B5, B5 PLU B8, B6, B6 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C11, C11, A3, B5 MADPS C21, C21, A4, B5 MADPS C12, C12, A3, B6 MADPS C22, C22, A4, B6 MADPS C13, C13, A3, B7 MADPS C23, C23, A4, B7 MADPS C14, C14, A3, B8 MADPS C24, C24, A4, B8 gsLQC1(R12, F7, F6, 3) # A7 A8 PLU B3, B1, B1 PLU B4, B2, B2 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C11, C11, A5, B1 MADPS C21, C21, A6, B1 MADPS C12, C12, A5, B2 MADPS C22, C22, A6, B2 daddiu BO, BO, 4 * 4 * SIZE # 4KR*4NR daddiu AO, AO, 4 * 4 * SIZE # 4KR*8MR MADPS C13, C13, A5, B3 MADPS C23, C23, A6, B3 MADPS C14, C14, A5, B4 MADPS C24, C24, A6, B4 gsLQC1(R12, F1, F0, 0) # A1 A2 PLU B7, B5, B5 PLU B8, B6, B6 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C11, C11, A7, B5 MADPS C21, C21, A8, B5 MADPS C12, C12, A7, B6 MADPS C22, C22, A8, B6 MADPS C13, C13, A7, B7 MADPS C23, C23, A8, B7 MADPS C14, C14, A7, B8 MADPS C24, C24, A8, B8 PLU B3, B1, B1 bgtz L, .L2210 PLU B4, B2, B2 .align 4 .L222: #ifndef TRMMKERNEL andi L, K, 2 #else andi L, TEMP, 2 #endif blez L, .L227 NOP gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C14, C14, A1, B4 MADPS C24, C24, A2, B4 PLU B7, B5, B5 PLU B8, B6, B6 daddiu BO, BO, 2 * 4 * SIZE daddiu AO, AO, 2 * 4 * SIZE MADPS C11, C11, A3, B5 MADPS C21, C21, A4, B5 gsLQC1(R13, F9, F8, 0) # A1 A2 MADPS C12, C12, A3, B6 MADPS C22, C22, A4, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C13, C13, A3, B7 MADPS C23, C23, A4, B7 MADPS C14, C14, A3, B8 MADPS C24, C24, A4, B8 PLU B3, B1, B1 PLU B4, B2, B2 .align 4 .L227: #ifndef TRMMKERNEL andi L, K, 1 #else andi L, TEMP, 1 #endif blez L, .L220 NOP MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 daddiu BO, BO, 4 * SIZE daddiu AO, AO, 4 * SIZE MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C14, C14, A1, B4 MADPS C24, C24, A2, B4 .align 4 .L220: # Write Back #ifndef TRMMKERNEL daddiu I, I, -1 CVTU A1, C11 CVTU A2, C21 CVTU A3, C13 CVTU A4, C23 CVTU A5, 
C12 CVTU A6, C22 CVTU A7, C14 CVTU A8, C24 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* (a + bi) * (c + di) */ SUB C11, C11, A1 # ac'+'bd SUB C21, C21, A2 ADD C13, A3, C13 # ad'+'cb ADD C23, A4, C23 # LD A1, 0 * SIZE(A) # load alpha_r LD A1, 152($sp) # load alpha_r LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_i SUB C12, C12, A5 SUB C22, C22, A6 ADD C14, A7, C14 ADD C24, A8, C24 LD B1, 0 * SIZE(CO1) LD B3, 2 * SIZE(CO1) LD B2, 1 * SIZE(CO1) LD B4, 3 * SIZE(CO1) MADD B1, B1, C11, A1 # A1 = alpha_r MADD B3, B3, C21, A1 MADD B2, B2, C13, A1 MADD B4, B4, C23, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 LD B5, 0 * SIZE(CO2) LD B7, 2 * SIZE(CO2) LD B6, 1 * SIZE(CO2) LD B8, 3 * SIZE(CO2) MADD B5, B5, C12, A1 MADD B7, B7, C22, A1 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) MADD B6, B6, C14, A1 MADD B8, B8, C24, A1 ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) NMSUB B5, B5, C14, A2 NMSUB B7, B7, C24, A2 MADD B6, B6, C12, A2 MADD B8, B8, C22, A2 ST B5, 0 * SIZE(CO2) ST B7, 2 * SIZE(CO2) ST B6, 1 * SIZE(CO2) ST B8, 3 * SIZE(CO2) #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) /* (a + bi) * (c - di) */ ADD C11, A1, C11 # ac'+'bd ADD C21, A2, C21 SUB C13, A3, C13 # ad'+'cb SUB C23, A4, C23 # LD A1, 0 * SIZE(A) # load alpha_r LD A1, 152($sp) # load alpha_r LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_r ADD C12, A5, C12 ADD C22, A6, C22 SUB C14, A7, C14 SUB C24, A8, C24 LD B1, 0 * SIZE(CO1) LD B3, 2 * SIZE(CO1) LD B2, 1 * SIZE(CO1) LD B4, 3 * SIZE(CO1) MADD B1, B1, C11, A1 # A1 = alpha_r MADD B3, B3, C21, A1 MADD B2, B2, C13, A1 MADD B4, B4, C23, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 LD B5, 0 * SIZE(CO2) LD B7, 2 * SIZE(CO2) LD B6, 1 * SIZE(CO2) LD B8, 3 * SIZE(CO2) MADD B5, B5, C12, A1 MADD B7, B7, C22, A1 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) MADD B6, B6, C14, A1 MADD B8, B8, C24, A1 ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) NMSUB B5, B5, C14, A2 NMSUB B7, B7, C24, A2 MADD B6, B6, C12, A2 MADD B8, B8, C22, A2 ST B5, 0 * SIZE(CO2) ST B7, 2 * SIZE(CO2) ST B6, 1 * SIZE(CO2) ST B8, 3 * SIZE(CO2) #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) /* (a - bi) * (c + di) */ ADD C11, A1, C11 # ac'+'bd ADD C21, A2, C21 SUB C13, C13, A3 # ad'+'cb SUB C23, C23, A4 # LD A1, 0 * SIZE(A) # load alpha_r LD A1, 152($sp) # load alpha_r # LD A2, 0 * SIZE(A) # load alpha_r LD A2, 160($sp) # load alpha_i ADD C12, A5, C12 ADD C22, A6, C22 SUB C14, C14, A7 SUB C24, C24, A8 LD B1, 0 * SIZE(CO1) LD B3, 2 * SIZE(CO1) LD B2, 1 * SIZE(CO1) LD B4, 3 * SIZE(CO1) MADD B1, B1, C11, A1 # A1 = alpha_r MADD B3, B3, C21, A1 MADD B2, B2, C13, A1 MADD B4, B4, C23, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 LD B5, 0 * SIZE(CO2) LD B7, 2 * SIZE(CO2) LD B6, 1 * SIZE(CO2) LD B8, 3 * SIZE(CO2) MADD B5, B5, C12, A1 MADD B7, B7, C22, A1 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) MADD B6, B6, C14, A1 MADD B8, B8, C24, A1 ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) NMSUB B5, B5, C14, A2 NMSUB B7, B7, C24, A2 MADD B6, B6, C12, A2 MADD B8, B8, C22, A2 ST B5, 0 * SIZE(CO2) ST B7, 2 * SIZE(CO2) ST B6, 1 * SIZE(CO2) ST B8, 3 * SIZE(CO2) #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* (a - bi) * (c - di) */ SUB C11, C11, A1 # ac'+'bd SUB C21, C21, A2 ADD C13, A3, C13 # ad'+'cb ADD C23, A4, C23 LD A1, 152($sp) # load alpha_r # LD A1, 0 * SIZE(A) # load alpha_r 
LD A2, 160($sp) # LD A2, 0 * SIZE(A) # load alpha_i SUB C12, C12, A5 SUB C22, C22, A6 ADD C14, A7, C14 ADD C24, A8, C24 NEG C13, C13 NEG C23, C23 NEG C14, C14 NEG C24, C24 LD B1, 0 * SIZE(CO1) LD B3, 2 * SIZE(CO1) LD B2, 1 * SIZE(CO1) LD B4, 3 * SIZE(CO1) MADD B1, B1, C11, A1 # A1 = alpha_r MADD B3, B3, C21, A1 MADD B2, B2, C13, A1 MADD B4, B4, C23, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 LD B5, 0 * SIZE(CO2) LD B7, 2 * SIZE(CO2) LD B6, 1 * SIZE(CO2) LD B8, 3 * SIZE(CO2) MADD B5, B5, C12, A1 MADD B7, B7, C22, A1 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) MADD B6, B6, C14, A1 MADD B8, B8, C24, A1 ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) NMSUB B5, B5, C14, A2 NMSUB B7, B7, C24, A2 MADD B6, B6, C12, A2 MADD B8, B8, C22, A2 ST B5, 0 * SIZE(CO2) ST B7, 2 * SIZE(CO2) ST B6, 1 * SIZE(CO2) ST B8, 3 * SIZE(CO2) #endif #else daddiu I, I, -1 CVTU A1, C11 CVTU A2, C21 CVTU A3, C13 CVTU A4, C23 CVTU A5, C12 CVTU A6, C22 CVTU A7, C14 CVTU A8, C24 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* (a + bi) * (c + di) */ SUB C11, C11, A1 # ac'+'bd SUB C21, C21, A2 ADD C13, A3, C13 # ad'+'cb ADD C23, A4, C23 # LD A1, 0 * SIZE(A) # load alpha_r LD A1, 152($sp) # load alpha_r LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_i SUB C12, C12, A5 SUB C22, C22, A6 ADD C14, A7, C14 ADD C24, A8, C24 MUL B1, C11, A1 # A1 = alpha_r MUL B3, C21, A1 MUL B2, C13, A1 MUL B4, C23, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 MUL B5, C12, A1 MUL B7, C22, A1 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) MUL B6, C14, A1 MUL B8, C24, A1 ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) NMSUB B5, B5, C14, A2 NMSUB B7, B7, C24, A2 MADD B6, B6, C12, A2 MADD B8, B8, C22, A2 ST B5, 0 * SIZE(CO2) ST B7, 2 * SIZE(CO2) ST B6, 1 * SIZE(CO2) ST B8, 3 * SIZE(CO2) #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) /* (a + bi) * (c - di) */ ADD C11, A1, C11 # ac'+'bd ADD C21, A2, C21 SUB C13, A3, C13 # ad'+'cb SUB C23, A4, C23 # LD A1, 0 * SIZE(A) # load alpha_r LD A1, 152($sp) # load alpha_r LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_r ADD C12, A5, C12 ADD C22, A6, C22 SUB C14, A7, C14 SUB C24, A8, C24 MUL B1, C11, A1 # A1 = alpha_r MUL B3, C21, A1 MUL B2, C13, A1 MUL B4, C23, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 MUL B5, C12, A1 MUL B7, C22, A1 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) MUL B6, C14, A1 MUL B8, C24, A1 ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) NMSUB B5, B5, C14, A2 NMSUB B7, B7, C24, A2 MADD B6, B6, C12, A2 MADD B8, B8, C22, A2 ST B5, 0 * SIZE(CO2) ST B7, 2 * SIZE(CO2) ST B6, 1 * SIZE(CO2) ST B8, 3 * SIZE(CO2) #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) /* (a - bi) * (c + di) */ ADD C11, A1, C11 # ac'+'bd ADD C21, A2, C21 SUB C13, C13, A3 # ad'+'cb SUB C23, C23, A4 # LD A1, 0 * SIZE(A) # load alpha_r LD A1, 152($sp) # load alpha_r # LD A2, 0 * SIZE(A) # load alpha_r LD A2, 160($sp) # load alpha_i ADD C12, A5, C12 ADD C22, A6, C22 SUB C14, C14, A7 SUB C24, C24, A8 MUL B1, C11, A1 # A1 = alpha_r MUL B3, C21, A1 MUL B2, C13, A1 MUL B4, C23, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 MUL B5, C12, A1 MUL B7, C22, A1 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) MUL B6, C14, A1 MUL B8, C24, A1 ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) NMSUB B5, B5, C14, A2 NMSUB B7, B7, C24, A2 MADD B6, B6, C12, A2 MADD B8, 
B8, C22, A2 ST B5, 0 * SIZE(CO2) ST B7, 2 * SIZE(CO2) ST B6, 1 * SIZE(CO2) ST B8, 3 * SIZE(CO2) #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* (a - bi) * (c - di) */ SUB C11, C11, A1 # ac'+'bd SUB C21, C21, A2 ADD C13, A3, C13 # ad'+'cb ADD C23, A4, C23 LD A1, 152($sp) # load alpha_r # LD A1, 0 * SIZE(A) # load alpha_r LD A2, 160($sp) # LD A2, 0 * SIZE(A) # load alpha_i SUB C12, C12, A5 SUB C22, C22, A6 ADD C14, A7, C14 ADD C24, A8, C24 NEG C13, C13 NEG C23, C23 NEG C14, C14 NEG C24, C24 MUL B1, C11, A1 # A1 = alpha_r MUL B3, C21, A1 MUL B2, C13, A1 MUL B4, C23, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 MUL B5, C12, A1 MUL B7, C22, A1 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) MUL B6, C14, A1 MUL B8, C24, A1 ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) NMSUB B5, B5, C14, A2 NMSUB B7, B7, C24, A2 MADD B6, B6, C12, A2 MADD B8, B8, C22, A2 ST B5, 0 * SIZE(CO2) ST B7, 2 * SIZE(CO2) ST B6, 1 * SIZE(CO2) ST B8, 3 * SIZE(CO2) #endif #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -2 #endif dsll TEMP, TEMP, 1 + ZBASE_SHIFT daddu AO, AO, TEMP daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif daddiu CO1, CO1, 4 * SIZE daddiu CO2, CO2, 4 * SIZE .align 4 .L21: andi I, M, 1 blez I, .L20 NOP .align 4 .L211: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, ZBASE_SHIFT # MR=1 dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 daddu AO, AO, L daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C13, C11 MOV C14, C11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 0 * SIZE(CO2) PLU B3, B1, B1 PLU B4, B2, B2 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 # MR=1 #else daddiu TEMP, KK, 2 # NR=2 #endif dsra L, TEMP, 2 blez L, .L212 NOP #else move BO, B # Reset B dsra L, K, 2 # UnRoll K=64 MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C13, C11 MOV C14, C11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 0 * SIZE(CO2) PLU B3, B1, B1 blez L, .L212 PLU B4, B2, B2 #endif .L2110: daddiu L, L, -1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C11, C11, A1, B1 MADPS C12, C12, A1, B2 MADPS C13, C13, A1, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 PLU B8, B6, B6 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C11, C11, A2, B5 MADPS C12, C12, A2, B6 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C13, C13, A2, B7 MADPS C14, C14, A2, B8 PLU B3, B1, B1 PLU B4, B2, B2 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C11, C11, A3, B1 MADPS C12, C12, A3, B2 daddiu BO, BO, 4 * 4 * SIZE # 4KR*4NR daddiu AO, AO, 2 * 4 * SIZE # 4KR*8MR MADPS C13, C13, A3, B3 MADPS C14, C14, A3, B4 PLU B7, B5, B5 PLU B8, B6, B6 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C11, C11, A4, B5 MADPS C12, C12, A4, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C13, C13, A4, B7 MADPS C14, C14, A4, B8 PLU B3, B1, B1 bgtz L, .L2110 PLU B4, B2, B2 .align 4 .L212: #ifndef TRMMKERNEL andi L, K, 2 #else andi L, TEMP, 2 #endif blez L, .L217 NOP gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C11, C11, A1, B1 MADPS C12, C12, A1, B2 MADPS C13, C13, A1, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 PLU B8, B6, B6 daddiu BO, BO, 2 * 4 * SIZE MADPS C11, C11, A2, B5 MADPS C12, C12, A2, B6 daddiu AO, AO, 4 * SIZE MADPS C13, C13, A2, B7 MADPS C14, 
C14, A2, B8 gsLQC1(R12, F1, F0, 0) # A5 A6 gsLQC1(R13, F9, F8, 0) # B1 B2 PLU B3, B1, B1 PLU B4, B2, B2 .align 4 .L217: #ifndef TRMMKERNEL andi L, K, 1 #else andi L, TEMP, 1 #endif blez L, .L210 NOP MADPS C11, C11, A1, B1 daddiu BO, BO, 4 * SIZE MADPS C12, C12, A1, B2 daddiu AO, AO, 2 * SIZE MADPS C13, C13, A1, B3 MADPS C14, C14, A1, B4 .align 4 .L210: # Write Back #ifndef TRMMKERNEL daddiu I, I, -1 CVTU A1, C11 CVTU A3, C13 CVTU A5, C12 CVTU A7, C14 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* (a + bi) * (c + di) */ SUB C11, C11, A1 # ac'+'bd ADD C13, A3, C13 # ad'+'cb # LD A1, 0 * SIZE(A) # load alpha_r LD A4, 152($sp) # load alpha_r LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_i SUB C12, C12, A5 ADD C14, A7, C14 LD B1, 0 * SIZE(CO1) LD B2, 1 * SIZE(CO1) MADD B1, B1, C11, A4 # A1 = alpha_r MADD B2, B2, C13, A4 NMSUB B1, B1, C13, A2 # A2 = alpha_i MADD B2, B2, C11, A2 LD B5, 0 * SIZE(CO2) LD B6, 1 * SIZE(CO2) MADD B5, B5, C12, A4 ST B1, 0 * SIZE(CO1) MADD B6, B6, C14, A4 ST B2, 1 * SIZE(CO1) NMSUB B5, B5, C14, A2 MADD B6, B6, C12, A2 ST B5, 0 * SIZE(CO2) ST B6, 1 * SIZE(CO2) #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) /* (a + bi) * (c - di) */ ADD C11, A1, C11 # ac'+'bd SUB C13, A3, C13 # ad'+'cb # LD A1, 0 * SIZE(A) # load alpha_r LD A4, 152($sp) # load alpha_r LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_r ADD C12, A5, C12 SUB C14, A7, C14 LD B1, 0 * SIZE(CO1) LD B2, 1 * SIZE(CO1) MADD B1, B1, C11, A4 # A1 = alpha_r MADD B2, B2, C13, A4 NMSUB B1, B1, C13, A2 # A2 = alpha_i MADD B2, B2, C11, A2 LD B5, 0 * SIZE(CO2) LD B6, 1 * SIZE(CO2) MADD B5, B5, C12, A4 ST B1, 0 * SIZE(CO1) MADD B6, B6, C14, A4 ST B2, 1 * SIZE(CO1) NMSUB B5, B5, C14, A2 MADD B6, B6, C12, A2 ST B5, 0 * SIZE(CO2) ST B6, 1 * SIZE(CO2) #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) /* (a - bi) * (c + di) */ ADD C11, A1, C11 # ac'+'bd SUB C13, C13, A3 # ad'+'cb # LD A1, 0 * SIZE(A) # load alpha_r LD A4, 152($sp) # load alpha_r # LD A2, 0 * SIZE(A) # load alpha_r LD A2, 160($sp) # load alpha_i ADD C12, A5, C12 SUB C14, C14, A7 LD B1, 0 * SIZE(CO1) LD B2, 1 * SIZE(CO1) MADD B1, B1, C11, A4 # A1 = alpha_r MADD B2, B2, C13, A4 NMSUB B1, B1, C13, A2 # A2 = alpha_i MADD B2, B2, C11, A2 LD B5, 0 * SIZE(CO2) LD B6, 1 * SIZE(CO2) MADD B5, B5, C12, A4 ST B1, 0 * SIZE(CO1) MADD B6, B6, C14, A4 ST B2, 1 * SIZE(CO1) NMSUB B5, B5, C14, A2 MADD B6, B6, C12, A2 ST B5, 0 * SIZE(CO2) ST B6, 1 * SIZE(CO2) #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* (a - bi) * (c - di) */ SUB C11, C11, A1 # ac'+'bd ADD C13, A3, C13 # ad'+'cb LD A4, 152($sp) # load alpha_r # LD A1, 0 * SIZE(A) # load alpha_r LD A2, 160($sp) # LD A2, 0 * SIZE(A) # load alpha_i SUB C12, C12, A5 ADD C14, A7, C14 NEG C13, C13 LD B1, 0 * SIZE(CO1) LD B2, 1 * SIZE(CO1) NEG C14, C14 MADD B1, B1, C11, A4 # A1 = alpha_r MADD B2, B2, C13, A4 NMSUB B1, B1, C13, A2 # A2 = alpha_i MADD B2, B2, C11, A2 LD B5, 0 * SIZE(CO2) LD B6, 1 * SIZE(CO2) MADD B5, B5, C12, A4 ST B1, 0 * SIZE(CO1) MADD B6, B6, C14, A4 ST B2, 1 * SIZE(CO1) NMSUB B5, B5, C14, A2 MADD B6, B6, C12, A2 ST B5, 0 * SIZE(CO2) ST B6, 1 * SIZE(CO2) #endif #else daddiu I, I, -1 CVTU A1, C11 CVTU A3, C13 CVTU A5, C12 CVTU A7, C14 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* (a + bi) * (c + di) */ SUB C11, C11, A1 # ac'+'bd ADD C13, A3, C13 # ad'+'cb # LD A1, 0 * SIZE(A) # load alpha_r LD A4, 152($sp) # load alpha_r LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_i SUB 
C12, C12, A5 ADD C14, A7, C14 MUL B1, C11, A4 # A1 = alpha_r MUL B2, C13, A4 NMSUB B1, B1, C13, A2 # A2 = alpha_i MADD B2, B2, C11, A2 MUL B5, C12, A4 ST B1, 0 * SIZE(CO1) MUL B6, C14, A4 ST B2, 1 * SIZE(CO1) NMSUB B5, B5, C14, A2 MADD B6, B6, C12, A2 ST B5, 0 * SIZE(CO2) ST B6, 1 * SIZE(CO2) #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) /* (a + bi) * (c - di) */ ADD C11, A1, C11 # ac'+'bd SUB C13, A3, C13 # ad'+'cb # LD A1, 0 * SIZE(A) # load alpha_r LD A4, 152($sp) # load alpha_r LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_r ADD C12, A5, C12 SUB C14, A7, C14 MUL B1, C11, A4 # A1 = alpha_r MUL B2, C13, A4 NMSUB B1, B1, C13, A2 # A2 = alpha_i MADD B2, B2, C11, A2 MUL B5, C12, A4 ST B1, 0 * SIZE(CO1) MUL B6, C14, A4 ST B2, 1 * SIZE(CO1) NMSUB B5, B5, C14, A2 MADD B6, B6, C12, A2 ST B5, 0 * SIZE(CO2) ST B6, 1 * SIZE(CO2) #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) /* (a - bi) * (c + di) */ ADD C11, A1, C11 # ac'+'bd SUB C13, C13, A3 # ad'+'cb # LD A1, 0 * SIZE(A) # load alpha_r LD A4, 152($sp) # load alpha_r # LD A2, 0 * SIZE(A) # load alpha_r LD A2, 160($sp) # load alpha_i ADD C12, A5, C12 SUB C14, C14, A7 MUL B1, C11, A4 # A1 = alpha_r MUL B2, C13, A4 NMSUB B1, B1, C13, A2 # A2 = alpha_i MADD B2, B2, C11, A2 MUL B5, C12, A4 ST B1, 0 * SIZE(CO1) MUL B6, C14, A4 ST B2, 1 * SIZE(CO1) NMSUB B5, B5, C14, A2 MADD B6, B6, C12, A2 ST B5, 0 * SIZE(CO2) ST B6, 1 * SIZE(CO2) #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* (a - bi) * (c - di) */ SUB C11, C11, A1 # ac'+'bd ADD C13, A3, C13 # ad'+'cb LD A4, 152($sp) # load alpha_r # LD A1, 0 * SIZE(A) # load alpha_r LD A2, 160($sp) # LD A2, 0 * SIZE(A) # load alpha_i SUB C12, C12, A5 ADD C14, A7, C14 NEG C13, C13 NEG C14, C14 MUL B1, C11, A4 # A1 = alpha_r MUL B2, C13, A4 NMSUB B1, B1, C13, A2 # A2 = alpha_i MADD B2, B2, C11, A2 MUL B5, C12, A4 ST B1, 0 * SIZE(CO1) MUL B6, C14, A4 ST B2, 1 * SIZE(CO1) NMSUB B5, B5, C14, A2 MADD B6, B6, C12, A2 ST B5, 0 * SIZE(CO2) ST B6, 1 * SIZE(CO2) #endif #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -2 #endif dsll L, TEMP, ZBASE_SHIFT dsll TEMP, TEMP, 1 + ZBASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif #endif daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE .align 4 .L20: daddiu J, J, -1 move B, BO #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK, 2 #endif bgtz J, .L24 NOP .align 4 .L1: andi J, N, 1 blez J, .L999 NOP .L14: dsra I, M, 2 # MR=8 move AO, A # Reset A #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif move CO1, C blez I, .L12 daddu C, CO1, LDC .align 4 .L141: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 2 + ZBASE_SHIFT dsll TEMP, KK, ZBASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C21, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C31, C11 MOV C41, C11 gsLQC1(R12, F3, F2, 1) # A3 A4 MOV C13, C11 MOV C23, C11 FETCH $0, 0 * SIZE(CO1) MOV C33, C11 MOV C43, C11 FETCH $0, 8 * SIZE(CO1) PLU B3, B1, B1 PLU B4, B2, B2 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 4 # define Mr=4 #else daddiu TEMP, KK, 1 # define NR=1 #endif dsra L, TEMP, 2 blez L, .L142 NOP #else move BO, B # Reset B dsra L, K, 2 # UnRoll 
K=64 MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C21, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C31, C11 MOV C41, C11 gsLQC1(R12, F3, F2, 1) # A3 A4 MOV C13, C11 MOV C23, C11 FETCH $0, 0 * SIZE(CO1) MOV C33, C11 MOV C43, C11 FETCH $0, 8 * SIZE(CO1) PLU B3, B1, B1 blez L, .L142 PLU B4, B2, B2 #endif .L1410: daddiu L, L, -1 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R12, F5, F4, 2) # A5 A6 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 gsLQC1(R13, F13, F12, 1) # B3 B4 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C11, C11, A5, B2 MADPS C21, C21, A6, B2 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C31, C31, A7, B2 MADPS C41, C41, A8, B2 daddiu BO, BO, 2 * 4 * SIZE # 4KR*4NR MADPS C13, C13, A5, B4 MADPS C23, C23, A6, B4 MADPS C33, C33, A7, B4 MADPS C43, C43, A8, B4 PLU B7, B5, B5 PLU B8, B6, B6 MADPS C11, C11, A1, B5 MADPS C21, C21, A2, B5 gsLQC1(R12, F5, F4, 6) # A5 A6 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C31, C31, A3, B5 MADPS C41, C41, A4, B5 daddiu AO, AO, 8 * 4 * SIZE # 4KR*8MR MADPS C13, C13, A1, B7 MADPS C23, C23, A2, B7 MADPS C33, C33, A3, B7 MADPS C43, C43, A4, B7 gsLQC1(R13, F9, F8, 0) # B1 B2 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C11, C11, A5, B6 MADPS C21, C21, A6, B6 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C31, C31, A7, B6 MADPS C41, C41, A8, B6 MADPS C13, C13, A5, B8 MADPS C23, C23, A6, B8 MADPS C33, C33, A7, B8 MADPS C43, C43, A8, B8 PLU B3, B1, B1 bgtz L, .L1410 PLU B4, B2, B2 .align 4 .L142: #ifndef TRMMKERNEL andi L, K, 2 #else andi L, TEMP, 2 #endif blez L, .L147 NOP MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R12, F5, F4, 2) # A5 A6 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 daddiu AO, AO, 4 * 4 * SIZE # 4KR*8MR MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 gsLQC1(R13, F13, F8, 1) # B3 B4 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C11, C11, A5, B2 MADPS C21, C21, A6, B2 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C31, C31, A7, B2 MADPS C41, C41, A8, B2 daddiu BO, BO, 4 * SIZE # 4KR*4NR MADPS C13, C13, A5, B4 MADPS C23, C23, A6, B4 MADPS C33, C33, A7, B4 MADPS C43, C43, A8, B4 PLU B3, B1, B1 .align 4 .L147: #ifndef TRMMKERNEL andi L, K, 1 #else andi L, TEMP, 1 #endif blez L, .L140 NOP MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 daddiu BO, BO, 2 * SIZE MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 daddiu AO, AO, 2 * 4 * SIZE MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 .align 4 .L140: # Write Back #ifndef TRMMKERNEL daddiu I, I, -1 CVTU A1, C11 CVTU A2, C21 CVTU A3, C31 CVTU A4, C41 CVTU A5, C13 CVTU A6, C23 CVTU A7, C33 CVTU A8, C43 CVTU B1, C12 CVTU B2, C22 CVTU B3, C32 CVTU B4, C42 CVTU B5, C14 CVTU B6, C24 CVTU B7, C34 CVTU B8, C44 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* (a + bi) * (c + di) */ SUB C11, C11, A1 # ac'+'bd SUB C21, C21, A2 # LD A1, 0 * SIZE(A) # load alpha_r SUB C31, C31, A3 LD A1, 152($sp) # load alpha_r SUB C41, C41, A4 LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_i ADD C13, A5, C13 # ad'+'cb ADD C23, A6, C23 ADD C33, A7, C33 ADD C43, A8, C43 LD B1, 0 * SIZE(CO1) LD B3, 2 * SIZE(CO1) LD B5, 4 * SIZE(CO1) LD B7, 6 * SIZE(CO1) LD B2, 1 * SIZE(CO1) LD B4, 3 * SIZE(CO1) LD B6, 5 * SIZE(CO1) LD B8, 7 * SIZE(CO1) MADD B1, B1, C11, A1 # A1 = alpha_r MADD B3, B3, C21, A1 MADD B5, B5, C31, A1 MADD B7, B7, C41, A1 MADD B2, B2, C13, A1 MADD B4, B4, C23, A1 MADD B6, 
B6, C33, A1 MADD B8, B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 MADD B6, B6, C31, A2 MADD B8, B8, C41, A2 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) ST B5, 4 * SIZE(CO1) ST B7, 6 * SIZE(CO1) ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) ST B6, 5 * SIZE(CO1) ST B8, 7 * SIZE(CO1) #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) /* (a + bi) * (c - di) */ ADD C11, A1, C11 # ac'+'bd ADD C21, A2, C21 # LD A1, 0 * SIZE(A) # load alpha_r ADD C31, A3, C31 LD A1, 152($sp) # load alpha_r ADD C41, A4, C41 LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_r SUB C13, A5, C13 # ad'+'cb SUB C23, A6, C23 SUB C33, A7, C33 SUB C43, A8, C43 LD B1, 0 * SIZE(CO1) LD B3, 2 * SIZE(CO1) LD B5, 4 * SIZE(CO1) LD B7, 6 * SIZE(CO1) LD B2, 1 * SIZE(CO1) LD B4, 3 * SIZE(CO1) LD B6, 5 * SIZE(CO1) LD B8, 7 * SIZE(CO1) MADD B1, B1, C11, A1 # A1 = alpha_r MADD B3, B3, C21, A1 MADD B5, B5, C31, A1 MADD B7, B7, C41, A1 MADD B2, B2, C13, A1 MADD B4, B4, C23, A1 MADD B6, B6, C33, A1 MADD B8, B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 MADD B6, B6, C31, A2 MADD B8, B8, C41, A2 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) ST B5, 4 * SIZE(CO1) ST B7, 6 * SIZE(CO1) ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) ST B6, 5 * SIZE(CO1) ST B8, 7 * SIZE(CO1) #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) /* (a - bi) * (c + di) */ ADD C11, A1, C11 # ac'+'bd ADD C21, A2, C21 # LD A1, 0 * SIZE(A) # load alpha_r ADD C31, A3, C31 LD A1, 152($sp) # load alpha_r # LD A2, 0 * SIZE(A) # load alpha_r ADD C41, A4, C41 LD A2, 160($sp) # load alpha_i SUB C13, C13, A5 # ad'+'cb SUB C23, C23, A6 SUB C33, C33, A7 SUB C43, C43, A8 LD B1, 0 * SIZE(CO1) LD B3, 2 * SIZE(CO1) LD B5, 4 * SIZE(CO1) LD B7, 6 * SIZE(CO1) LD B2, 1 * SIZE(CO1) LD B4, 3 * SIZE(CO1) LD B6, 5 * SIZE(CO1) LD B8, 7 * SIZE(CO1) MADD B1, B1, C11, A1 # A1 = alpha_r MADD B3, B3, C21, A1 MADD B5, B5, C31, A1 MADD B7, B7, C41, A1 MADD B2, B2, C13, A1 MADD B4, B4, C23, A1 MADD B6, B6, C33, A1 MADD B8, B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 MADD B6, B6, C31, A2 MADD B8, B8, C41, A2 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) ST B5, 4 * SIZE(CO1) ST B7, 6 * SIZE(CO1) ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) ST B6, 5 * SIZE(CO1) ST B8, 7 * SIZE(CO1) #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* (a - bi) * (c - di) */ SUB C11, C11, A1 # AC'+'BD SUB C21, C21, A2 SUB C31, C31, A3 LD A1, 152($sp) # LOAD ALPHA_R # LD A1, 0 * SIZE(A) # LOAD ALPHA_R SUB C41, C41, A4 LD A2, 160($sp) # LD A2, 0 * SIZE(A) # LOAD ALPHA_I ADD C13, A5, C13 # AD'+'CB ADD C23, A6, C23 ADD C33, A7, C33 ADD C43, A8, C43 NEG C13, C13 # AD'+'CB NEG C23, C23 NEG C33, C33 NEG C43, C43 LD B1, 0 * SIZE(CO1) LD B3, 2 * SIZE(CO1) LD B5, 4 * SIZE(CO1) LD B7, 6 * SIZE(CO1) LD B2, 1 * SIZE(CO1) LD B4, 3 * SIZE(CO1) LD B6, 5 * SIZE(CO1) LD B8, 7 * SIZE(CO1) MADD B1, B1, C11, A1 # A1 = ALPHA_R MADD B3, B3, C21, A1 MADD B5, B5, C31, A1 MADD B7, B7, C41, A1 MADD B2, B2, C13, A1 MADD B4, B4, C23, A1 MADD B6, B6, C33, A1 MADD B8, B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = ALPHA_I NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 MADD B6, B6, C31, A2 MADD B8, B8, C41, A2 ST B1, 0 * 
SIZE(CO1) ST B3, 2 * SIZE(CO1) ST B5, 4 * SIZE(CO1) ST B7, 6 * SIZE(CO1) ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) ST B6, 5 * SIZE(CO1) ST B8, 7 * SIZE(CO1) #endif #else daddiu I, I, -1 CVTU A1, C11 CVTU A2, C21 CVTU A3, C31 CVTU A4, C41 CVTU A5, C13 CVTU A6, C23 CVTU A7, C33 CVTU A8, C43 CVTU B1, C12 CVTU B2, C22 CVTU B3, C32 CVTU B4, C42 CVTU B5, C14 CVTU B6, C24 CVTU B7, C34 CVTU B8, C44 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* (a + bi) * (c + di) */ SUB C11, C11, A1 # ac'+'bd SUB C21, C21, A2 # LD A1, 0 * SIZE(A) # load alpha_r SUB C31, C31, A3 LD A1, 152($sp) # load alpha_r SUB C41, C41, A4 LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_i ADD C13, A5, C13 # ad'+'cb ADD C23, A6, C23 ADD C33, A7, C33 ADD C43, A8, C43 MUL B1, C11, A1 # A1 = alpha_r MUL B3, C21, A1 MUL B5, C31, A1 MUL B7, C41, A1 MUL B2, C13, A1 MUL B4, C23, A1 MUL B6, C33, A1 MUL B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 MADD B6, B6, C31, A2 MADD B8, B8, C41, A2 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) ST B5, 4 * SIZE(CO1) ST B7, 6 * SIZE(CO1) ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) ST B6, 5 * SIZE(CO1) ST B8, 7 * SIZE(CO1) #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) /* (a + bi) * (c - di) */ ADD C11, A1, C11 # ac'+'bd ADD C21, A2, C21 # LD A1, 0 * SIZE(A) # load alpha_r ADD C31, A3, C31 LD A1, 152($sp) # load alpha_r ADD C41, A4, C41 LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_r SUB C13, A5, C13 # ad'+'cb SUB C23, A6, C23 SUB C33, A7, C33 SUB C43, A8, C43 MUL B1, C11, A1 # A1 = alpha_r MUL B3, C21, A1 MUL B5, C31, A1 MUL B7, C41, A1 MUL B2, C13, A1 MUL B4, C23, A1 MUL B6, C33, A1 MUL B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 MADD B6, B6, C31, A2 MADD B8, B8, C41, A2 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) ST B5, 4 * SIZE(CO1) ST B7, 6 * SIZE(CO1) ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) ST B6, 5 * SIZE(CO1) ST B8, 7 * SIZE(CO1) #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) /* (a - bi) * (c + di) */ ADD C11, A1, C11 # ac'+'bd ADD C21, A2, C21 # LD A1, 0 * SIZE(A) # load alpha_r ADD C31, A3, C31 LD A1, 152($sp) # load alpha_r # LD A2, 0 * SIZE(A) # load alpha_r ADD C41, A4, C41 LD A2, 160($sp) # load alpha_i SUB C13, C13, A5 # ad'+'cb SUB C23, C23, A6 SUB C33, C33, A7 SUB C43, C43, A8 MUL B1, C11, A1 # A1 = alpha_r MUL B3, C21, A1 MUL B5, C31, A1 MUL B7, C41, A1 MUL B2, C13, A1 MUL B4, C23, A1 MUL B6, C33, A1 MUL B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 MADD B6, B6, C31, A2 MADD B8, B8, C41, A2 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) ST B5, 4 * SIZE(CO1) ST B7, 6 * SIZE(CO1) ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) ST B6, 5 * SIZE(CO1) ST B8, 7 * SIZE(CO1) #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* (a - bi) * (c - di) */ SUB C11, C11, A1 # AC'+'BD SUB C21, C21, A2 SUB C31, C31, A3 LD A1, 152($sp) # LOAD ALPHA_R # LD A1, 0 * SIZE(A) # LOAD ALPHA_R SUB C41, C41, A4 LD A2, 160($sp) # LD A2, 0 * SIZE(A) # LOAD ALPHA_I ADD C13, A5, C13 # AD'+'CB ADD C23, A6, C23 ADD C33, A7, C33 ADD C43, A8, C43 NEG C13, C13 # AD'+'CB NEG C23, C23 NEG C33, C33 NEG C43, C43 MUL B1, C11, A1 # A1 = ALPHA_R MUL B3, C21, A1 MUL B5, C31, A1 MUL B7, C41, A1 MUL B2, 
C13, A1 MUL B4, C23, A1 MUL B6, C33, A1 MUL B8, C43, A1 NMSUB B1, B1, C13, A2 # A2 = ALPHA_I NMSUB B3, B3, C23, A2 NMSUB B5, B5, C33, A2 NMSUB B7, B7, C43, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 MADD B6, B6, C31, A2 MADD B8, B8, C41, A2 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) ST B5, 4 * SIZE(CO1) ST B7, 6 * SIZE(CO1) ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) ST B6, 5 * SIZE(CO1) ST B8, 7 * SIZE(CO1) #endif #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -4 #else daddiu TEMP, TEMP, -1 #endif dsll L, TEMP, 2 + ZBASE_SHIFT dsll TEMP, TEMP, ZBASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 4 #endif #endif bgtz I, .L141 daddiu CO1, CO1, 8 * SIZE .align 4 .L12: andi I, M, 2 # MR=4 blez I, .L11 NOP .align 4 .L121: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 1 + ZBASE_SHIFT dsll TEMP, KK, ZBASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C21, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C13, C11 MOV C23, C11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 8 * SIZE(CO1) PLU B3, B1, B1 PLU B4, B2, B2 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 2 #else daddiu TEMP, KK, 1 #endif dsra L, TEMP, 2 blez L, .L122 NOP #else move BO, B # Reset B dsra L, K, 2 # UnRoll K=64 MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C21, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C13, C11 MOV C23, C11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 8 * SIZE(CO1) PLU B3, B1, B1 blez L, .L122 PLU B4, B2, B2 #endif .L1210: daddiu L, L, -1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 gsLQC1(R12, F5, F4, 2) # A5 A6 PLU B7, B5, B5 PLU B8, B6, B6 daddiu BO, BO, 2 * 4 * SIZE # 4KR*4NR MADPS C11, C11, A3, B2 MADPS C21, C21, A4, B2 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C13, C13, A3, B4 MADPS C23, C23, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 daddiu AO, AO, 4 * 4 * SIZE # 4KR*8MR gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C11, C11, A7, B6 MADPS C21, C21, A8, B6 MADPS C13, C13, A7, B8 MADPS C23, C23, A8, B8 PLU B3, B1, B1 bgtz L, .L1210 PLU B4, B2, B2 .align 4 .L122: #ifndef TRMMKERNEL andi L, K, 2 #else andi L, TEMP, 2 #endif blez L, .L127 NOP MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 PLU B7, B5, B5 daddiu BO, BO, 1 * 4 * SIZE daddiu AO, AO, 2 * 4 * SIZE MADPS C11, C11, A3, B2 MADPS C21, C21, A4, B2 MADPS C13, C13, A3, B4 MADPS C23, C23, A4, B4 gsLQC1(R13, F9, F8, 0) gsLQC1(R12, F1, F0, 0) PLU B3, B1, B1 .align 4 .L127: #ifndef TRMMKERNEL andi L, K, 1 #else andi L, TEMP, 1 #endif blez L, .L120 NOP MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 daddiu BO, BO, 2 * SIZE daddiu AO, AO, 4 * SIZE MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 .align 4 .L120: # Write Back #ifndef TRMMKERNEL daddiu I, I, -1 CVTU A1, C11 CVTU A2, C21 CVTU A3, C13 CVTU A4, C23 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* (a + bi) * (c + di) */ SUB C11, C11, A1 # ac'+'bd SUB C21, C21, A2 ADD C13, A3, C13 # ad'+'cb ADD C23, A4, C23 # LD A1, 0 * SIZE(A) # load alpha_r LD A1, 152($sp) # load alpha_r LD A2, 160($sp) # load 
alpha_i # LD A2, 0 * SIZE(A) # load alpha_i LD B1, 0 * SIZE(CO1) LD B3, 2 * SIZE(CO1) LD B2, 1 * SIZE(CO1) LD B4, 3 * SIZE(CO1) MADD B1, B1, C11, A1 # A1 = alpha_r MADD B3, B3, C21, A1 MADD B2, B2, C13, A1 MADD B4, B4, C23, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) /* (a + bi) * (c - di) */ ADD C11, A1, C11 # ac'+'bd ADD C21, A2, C21 SUB C13, A3, C13 # ad'+'cb SUB C23, A4, C23 # LD A1, 0 * SIZE(A) # load alpha_r LD A1, 152($sp) # load alpha_r LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_r LD B1, 0 * SIZE(CO1) LD B3, 2 * SIZE(CO1) LD B2, 1 * SIZE(CO1) LD B4, 3 * SIZE(CO1) MADD B1, B1, C11, A1 # A1 = alpha_r MADD B3, B3, C21, A1 MADD B2, B2, C13, A1 MADD B4, B4, C23, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) /* (a - bi) * (c + di) */ ADD C11, A1, C11 # ac'+'bd ADD C21, A2, C21 SUB C13, C13, A3 # ad'+'cb SUB C23, C23, A4 # LD A1, 0 * SIZE(A) # load alpha_r LD A1, 152($sp) # load alpha_r # LD A2, 0 * SIZE(A) # load alpha_r LD A2, 160($sp) # load alpha_i LD B1, 0 * SIZE(CO1) LD B3, 2 * SIZE(CO1) LD B2, 1 * SIZE(CO1) LD B4, 3 * SIZE(CO1) MADD B1, B1, C11, A1 # A1 = alpha_r MADD B3, B3, C21, A1 MADD B2, B2, C13, A1 MADD B4, B4, C23, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* (a - bi) * (c - di) */ SUB C11, C11, A1 # ac'+'bd SUB C21, C21, A2 ADD C13, A3, C13 # ad'+'cb ADD C23, A4, C23 LD A1, 152($sp) # load alpha_r # LD A1, 0 * SIZE(A) # load alpha_r LD A2, 160($sp) # LD A2, 0 * SIZE(A) # load alpha_i NEG C13, C13 # ad'+'cb NEG C23, C23 LD B1, 0 * SIZE(CO1) LD B3, 2 * SIZE(CO1) LD B2, 1 * SIZE(CO1) LD B4, 3 * SIZE(CO1) MADD B1, B1, C11, A1 # A1 = alpha_r MADD B3, B3, C21, A1 MADD B2, B2, C13, A1 MADD B4, B4, C23, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) #endif #else daddiu I, I, -1 CVTU A1, C11 CVTU A2, C21 CVTU A3, C13 CVTU A4, C23 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* (a + bi) * (c + di) */ SUB C11, C11, A1 # ac'+'bd SUB C21, C21, A2 ADD C13, A3, C13 # ad'+'cb ADD C23, A4, C23 # LD A1, 0 * SIZE(A) # load alpha_r LD A1, 152($sp) # load alpha_r LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_i MUL B1, C11, A1 # A1 = alpha_r MUL B3, C21, A1 MUL B2, C13, A1 MUL B4, C23, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) /* (a + bi) * (c - di) */ ADD C11, A1, C11 # ac'+'bd ADD C21, A2, C21 SUB C13, A3, C13 # ad'+'cb SUB C23, A4, C23 # LD A1, 0 * SIZE(A) # load alpha_r LD A1, 152($sp) # load alpha_r LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_r MUL B1, C11, A1 # A1 = alpha_r MUL B3, C21, A1 MUL B2, C13, A1 MUL B4, C23, A1 NMSUB B1, B1, C13, 
A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) /* (a - bi) * (c + di) */ ADD C11, A1, C11 # ac'+'bd ADD C21, A2, C21 SUB C13, C13, A3 # ad'+'cb SUB C23, C23, A4 # LD A1, 0 * SIZE(A) # load alpha_r LD A1, 152($sp) # load alpha_r # LD A2, 0 * SIZE(A) # load alpha_r LD A2, 160($sp) # load alpha_i MUL B1, C11, A1 # A1 = alpha_r MUL B3, C21, A1 MUL B2, C13, A1 MUL B4, C23, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* (a - bi) * (c - di) */ SUB C11, C11, A1 # ac'+'bd SUB C21, C21, A2 ADD C13, A3, C13 # ad'+'cb ADD C23, A4, C23 LD A1, 152($sp) # load alpha_r # LD A1, 0 * SIZE(A) # load alpha_r LD A2, 160($sp) # LD A2, 0 * SIZE(A) # load alpha_i NEG C13, C13 # ad'+'cb NEG C23, C23 MUL B1, C11, A1 # A1 = alpha_r MUL B3, C21, A1 MUL B2, C13, A1 MUL B4, C23, A1 NMSUB B1, B1, C13, A2 # A2 = alpha_i NMSUB B3, B3, C23, A2 MADD B2, B2, C11, A2 MADD B4, B4, C21, A2 ST B1, 0 * SIZE(CO1) ST B3, 2 * SIZE(CO1) ST B2, 1 * SIZE(CO1) ST B4, 3 * SIZE(CO1) #endif #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -1 #endif dsll L, TEMP, 1 + ZBASE_SHIFT dsll TEMP, TEMP, ZBASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif daddiu CO1, CO1, 4 * SIZE daddiu CO2, CO2, 4 * SIZE .align 4 .L11: andi I, M, 1 blez I, .L10 NOP .align 4 .L111: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll TEMP, KK, ZBASE_SHIFT daddu AO, AO, TEMP daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS gsLQC1(R13, F9, F8, 0) # B1 B2 gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C13, C11 FETCH $0, 0 * SIZE(CO1) PLU B3, B1, B1 PLU B4, B2, B2 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 1 #endif dsra L, TEMP, 2 blez L, .L112 NOP #else move BO, B # Reset B dsra L, K, 2 # UnRoll K=64 MTC $0, C11 # CLEAR REAULTS REGISTERS gsLQC1(R13, F9, F8, 0) # B1 B2 gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C13, C11 FETCH $0, 0 * SIZE(CO1) PLU B3, B1, B1 blez L, .L112 PLU B4, B2, B2 #endif .L1110: daddiu L, L, -1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C11, C11, A1, B1 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C13, C13, A1, B3 daddiu BO, BO, 2 * 4 * SIZE # 4KR*4NR PLU B7, B5, B5 PLU B8, B6, B6 daddiu AO, AO, 2 * 4 * SIZE # 4KR*8MR MADPS C11, C11, A2, B2 MADPS C13, C13, A2, B4 MADPS C11, C11, A3, B5 MADPS C13, C13, A3, B7 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C11, C11, A4, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C13, C13, A4, B8 PLU B3, B1, B1 bgtz L, .L1110 PLU B4, B2, B2 .align 4 .L112: #ifndef TRMMKERNEL andi L, K, 2 #else andi L, TEMP, 2 #endif blez L, .L117 NOP MADPS C11, C11, A1, B1 MADPS C13, C13, A1, B3 daddiu BO, BO, 4 * SIZE daddiu AO, AO, 4 * SIZE MADPS C11, C11, A2, B2 MADPS C13, C13, A2, B4 gsLQC1(R13, F9, F8, 0) gsLQC1(R12, F1, F0, 0) PLU B3, B1, B1 .align 4 .L117: #ifndef TRMMKERNEL andi L, K, 1 #else andi L, TEMP, 1 #endif blez L, .L110 NOP daddiu BO, BO, 2 * SIZE daddiu AO, AO, 2 * SIZE MADPS C11, C11, A1, B1 MADPS C13, C13, A1, B3 
.align 4 .L110: # Write Back #ifndef TRMMKERNEL daddiu I, I, -1 CVTU A1, C11 CVTU A3, C13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* (a + bi) * (c + di) */ SUB C11, C11, A1 # ac'+'bd ADD C13, A3, C13 # ad'+'cb # LD A1, 0 * SIZE(A) # load alpha_r LD A4, 152($sp) # load alpha_r LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_i LD B1, 0 * SIZE(CO1) LD B2, 1 * SIZE(CO1) MADD B1, B1, C11, A4 # A1 = alpha_r MADD B2, B2, C13, A4 NMSUB B1, B1, C13, A2 # A2 = alpha_i MADD B2, B2, C11, A2 ST B1, 0 * SIZE(CO1) ST B2, 1 * SIZE(CO1) #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) /* (a + bi) * (c - di) */ ADD C11, A1, C11 # ac'+'bd SUB C13, A3, C13 # ad'+'cb LD A4, 152($sp) # load alpha_r LD A2, 160($sp) # load alpha_i LD B1, 0 * SIZE(CO1) LD B2, 1 * SIZE(CO1) MADD B1, B1, C11, A4 # A1 = alpha_r MADD B2, B2, C13, A4 NMSUB B1, B1, C13, A2 # A2 = alpha_i MADD B2, B2, C11, A2 ST B1, 0 * SIZE(CO1) ST B2, 1 * SIZE(CO1) #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) /* (a - bi) * (c + di) */ ADD C11, A1, C11 # ac'+'bd SUB C13, C13, A3 # ad'+'cb LD A4, 152($sp) # load alpha_r LD A2, 160($sp) # load alpha_i LD B1, 0 * SIZE(CO1) LD B2, 1 * SIZE(CO1) MADD B1, B1, C11, A4 # A1 = alpha_r MADD B2, B2, C13, A4 NMSUB B1, B1, C13, A2 # A2 = alpha_i MADD B2, B2, C11, A2 ST B1, 0 * SIZE(CO1) ST B2, 1 * SIZE(CO1) #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* (a - bi) * (c - di) */ SUB C11, C11, A1 # ac'+'bd ADD C13, A3, C13 # ad'+'cb NEG C13, C13 LD A4, 152($sp) # load alpha_r LD A2, 160($sp) LD B1, 0 * SIZE(CO1) LD B2, 1 * SIZE(CO1) MADD B1, B1, C11, A4 # A1 = alpha_r MADD B2, B2, C13, A4 NMSUB B1, B1, C13, A2 # A2 = alpha_i MADD B2, B2, C11, A2 ST B1, 0 * SIZE(CO1) ST B2, 1 * SIZE(CO1) #endif #else daddiu I, I, -1 CVTU A1, C11 CVTU A3, C13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* (a + bi) * (c + di) */ SUB C11, C11, A1 # ac'+'bd ADD C13, A3, C13 # ad'+'cb # LD A1, 0 * SIZE(A) # load alpha_r LD A4, 152($sp) # load alpha_r LD A2, 160($sp) # load alpha_i # LD A2, 0 * SIZE(A) # load alpha_i MUL B1, C11, A4 # A1 = alpha_r MUL B2, C13, A4 NMSUB B1, B1, C13, A2 # A2 = alpha_i MADD B2, B2, C11, A2 ST B1, 0 * SIZE(CO1) ST B2, 1 * SIZE(CO1) #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) /* (a + bi) * (c - di) */ ADD C11, A1, C11 # ac'+'bd SUB C13, A3, C13 # ad'+'cb LD A4, 152($sp) # load alpha_r LD A2, 160($sp) # load alpha_i MUL B1, C11, A4 # A1 = alpha_r MUL B2, C13, A4 NMSUB B1, B1, C13, A2 # A2 = alpha_i MADD B2, B2, C11, A2 ST B1, 0 * SIZE(CO1) ST B2, 1 * SIZE(CO1) #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) /* (a - bi) * (c + di) */ ADD C11, A1, C11 # ac'+'bd SUB C13, C13, A3 # ad'+'cb LD A4, 152($sp) # load alpha_r LD A2, 160($sp) # load alpha_i MUL B1, C11, A4 # A1 = alpha_r MUL B2, C13, A4 NMSUB B1, B1, C13, A2 # A2 = alpha_i MADD B2, B2, C11, A2 ST B1, 0 * SIZE(CO1) ST B2, 1 * SIZE(CO1) #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* (a - bi) * (c - di) */ SUB C11, C11, A1 # ac'+'bd ADD C13, A3, C13 # ad'+'cb NEG C13, C13 LD A4, 152($sp) # load alpha_r LD A2, 160($sp) MUL B1, C11, A4 # A1 = alpha_r MUL B2, C13, A4 NMSUB B1, B1, C13, A2 # A2 = alpha_i MADD B2, B2, C11, A2 ST B1, 0 * SIZE(CO1) ST B2, 1 * SIZE(CO1) #endif #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -1 #endif dsll TEMP, TEMP, ZBASE_SHIFT daddu AO, AO, 
TEMP daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif #endif daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE .align 4 .L10: move B, BO #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK, 1 #endif .L999: ld $16, 0($sp) ld $17, 8($sp) ld $18, 16($sp) ld $19, 24($sp) ld $20, 32($sp) ld $21, 40($sp) ld $22, 48($sp) LD $f24, 56($sp) LD $f25, 64($sp) LD $f26, 72($sp) LD $f27, 80($sp) LD $f28, 88($sp) #if defined(TRMMKERNEL) ld $23, 96($sp) ld $24, 104($sp) ld $25, 112($sp) #endif #ifndef __64BIT__ LD $f20,120($sp) LD $f21,128($sp) LD $f22,136($sp) LD $f23,144($sp) #endif daddiu $sp,$sp,STACKSIZE j $31 nop EPILOGUE OpenBLAS-0.2.20/kernel/mips64/cgemm_kernel_loongson3b_2x2.S000066400000000000000000000642401313527062700231660ustar00rootroot00000000000000#define ASSEMBLER #include "common.h" #define FETCH ld #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define STACKSIZE 160 #define M $4 #define N $5 #define K $6 #define A $9 #define B $10 #define C $11 #define LDC $8 #define AO $12 #define BO $13 #define R12 12 #define R13 13 #define I $2 #define J $3 #define L $7 #define CO1 $14 #define CO2 $15 #define PREA $16 #define PREB $17 #if defined(TRMMKERNEL) #define OFFSET $18 #define KK $19 #define TEMP $20 #endif #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define b1 $f4 #define b2 $f5 #define b3 $f6 #define b4 $f7 #define a5 $f8 #define a6 $f9 #define a7 $f10 #define a8 $f11 #define b5 $f12 #define b6 $f13 #define b7 $f15 #define b8 $f16 #define c11 $f14 #define c12 $f17 #define c13 $f18 #define c14 $f19 #define c21 $f20 #define c22 $f21 #define c23 $f22 #define c24 $f23 #define c31 $f24 #define c32 $f25 #define c33 $f26 #define c34 $f27 #define c41 $f28 #define c42 $f29 #define c43 $f30 #define c44 $f31 #define F0 0 #define F1 1 #define F2 2 #define F3 3 #define F4 4 #define F5 5 #define F6 6 #define F7 7 #define F8 8 #define F9 9 #define F10 10 #define F11 11 #define F12 12 #define F13 13 #define F14 14 #define F15 15 #define F16 16 #define F17 17 #define F18 18 #define F19 19 #define F20 20 #define F21 21 #define F22 22 #define F23 23 #define F24 24 #define F25 25 #define F26 26 #define F27 27 #define F28 28 #define F29 29 #define F30 30 #define F31 31 #define ALPHA_R $f15 #define ALPHA_I $f16 ################################# ## MADD1 a*c ## MADD2 b*c ## MADD3 a*d ## MADD4 d*b ################################## #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define MADD1 MADD #define MADD2 MADD #define MADD3 MADD #define MADD4 NMSUB #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) #define MADD1 MADD #define MADD2 MADD #define MADD3 NMSUB #define MADD4 MADD #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) #define MADD1 MADD #define MADD2 NMSUB #define MADD3 MADD #define MADD4 MADD #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) #define MADD1 MADD #define MADD2 NMSUB #define MADD3 NMSUB #define MADD4 NMSUB #endif PROLOGUE LDARG LDC, 0($sp) daddiu $sp, $sp, -STACKSIZE SDARG $16, 0($sp) SDARG $17, 8($sp) sdc1 $f24, 16($sp) sdc1 $f25, 24($sp) sdc1 $f26, 32($sp) sdc1 $f27, 40($sp) sdc1 $f28, 48($sp) sdc1 $f29, 56($sp) #if defined(TRMMKERNEL) SDARG $18, 64($sp) SDARG $19, 72($sp) SDARG $20, 80($sp) LDARG OFFSET, STACKSIZE + 8($sp) #endif #ifndef __64BIT__ sdc1 $f20, 88($sp) sdc1 $f21, 96($sp) sdc1 $f22,104($sp) sdc1 $f23,112($sp) #endif dsra J, N, 1 # J=N/2 
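# Added note (not in the original source): a rough scalar sketch of what one
# complex micro-tile update computes, assuming column-major C; c11..c14 are the
# four partial products accumulated through the MADD1..MADD4 macros above, and
# alpha_r / alpha_i are the values spilled to 128($sp) / 136($sp) just below:
#
#   ab_r = c11 + c14        # the a_r*b_r and a_i*b_i terms (signs from MADD1/MADD4)
#   ab_i = c12 + c13        # the a_i*b_r and a_r*b_i terms (signs from MADD2/MADD3)
#   C.r  = C.r + alpha_r*ab_r - alpha_i*ab_i    # non-TRMM path; the TRMM path
#   C.i  = C.i + alpha_r*ab_i + alpha_i*ab_r    # overwrites C instead of accumulating
#
# J below iterates over 2-column panels of B (N/2); I over 2-row panels of A (M/2).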
ST ALPHA_R, 128($sp) # store alpha_r & alpha_i #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE blez J, .L20 ST ALPHA_I, 136($sp) .align 5 .L10: #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif daddiu J, J, -1 dsra I, M, 1 # I=M/2 dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 dsll PREA, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 move CO1, C # Fix pointer Cx daddu CO2, C, LDC move AO, A # Reset AO blez I, .L30 daddu PREA, PREA, A # PREA=A+panel size .L11: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll TEMP, KK, 1 + ZBASE_SHIFT daddu AO, AO, TEMP daddu BO, B, TEMP #endif MTC $0, c11 # Clear results regs LD a1, 0 * SIZE(AO) MOV c12, c11 LD a2, 1 * SIZE(AO) MOV c13, c11 LD b1, 0 * SIZE(BO) MOV c14, c11 LD b2, 1 * SIZE(BO) MOV c21, c11 LD a3, 2 * SIZE(AO) MOV c22, c11 LD a4, 3 * SIZE(AO) MOV c23, c11 LD b3, 2 * SIZE(BO) MOV c24, c11 LD b4, 3 * SIZE(BO) FETCH $0, 0 * SIZE(CO2) MOV c31, c11 MOV c32, c11 FETCH $0, 0 * SIZE(CO1) MOV c33, c11 MOV c34, c11 FETCH $0, 4 * SIZE(CO2) MOV c41, c11 MOV c42, c11 FETCH $0, 4 * SIZE(CO1) MOV c43, c11 MOV c44, c11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 2 #else daddiu TEMP, KK, 2 #endif dsra L, TEMP, 2 daddu PREB, PREB, B # PREA=A+panel size blez L, .L15 NOP #else dsra L, K, 2 # Unroll K 4 times move BO, B MTC $0, c11 # Clear results regs LD a1, 0 * SIZE(AO) MOV c12, c11 LD a2, 1 * SIZE(AO) MOV c13, c11 LD b1, 0 * SIZE(BO) MOV c14, c11 LD b2, 1 * SIZE(BO) MOV c21, c11 LD a3, 2 * SIZE(AO) MOV c22, c11 LD a4, 3 * SIZE(AO) MOV c23, c11 LD b3, 2 * SIZE(BO) MOV c24, c11 LD b4, 3 * SIZE(BO) MOV c31, c11 MOV c32, c11 FETCH $0, 0 * SIZE(CO2) MOV c33, c11 MOV c34, c11 FETCH $0, 0 * SIZE(CO1) MOV c41, c11 MOV c42, c11 FETCH $0, 4 * SIZE(CO2) MOV c43, c11 NOP FETCH $0, 4 * SIZE(CO1) daddu PREB, PREB, B # PREA=A+panel size blez L, .L15 MOV c44, c11 #endif .align 5 .L12: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 FETCH $0, 4 * SIZE(PREA) FETCH $0, 4 * SIZE(PREB) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 MADD1 c41, c41, a3, b3 # A2xB2 MADD3 c43, c43, a3, b4 MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) MADD2 c12, c12, a6, b5 # bxc MADD4 c14, c14, a6, b6 # bxd LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) MADD1 c21, c21, a7, b5 # A2xB1 MADD3 c23, c23, a7, b6 LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MADD2 c22, c22, a8, b5 MADD4 c24, c24, a8, b6 FETCH $0, 8 * SIZE(PREA) FETCH $0, 8 * SIZE(PREB) MADD1 c31, c31, a5, b7 # A1xB2 MADD3 c33, c33, a5, b8 MADD2 c32, c32, a6, b7 MADD4 c34, c34, a6, b8 MADD1 c41, c41, a7, b7 # A2xB2 MADD3 c43, c43, a7, b8 MADD2 c42, c42, a8, b7 MADD4 c44, c44, a8, b8 LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd LD a7, 14 * SIZE(AO) 
LD a8, 15 * SIZE(AO) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 FETCH $0, 12 * SIZE(PREA) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 daddiu L, L, -1 FETCH $0, 12 * SIZE(PREB) MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 daddiu AO, AO, 16 * SIZE daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx MADD1 c41, c41, a3, b3 # A2xB2 MADD3 c43, c43, a3, b4 daddu PREA, PREA, 16 * SIZE MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 daddu PREB, PREB, 16 * SIZE LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD2 c12, c12, a6, b5 # bxc MADD4 c14, c14, a6, b6 # bxd LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MADD1 c21, c21, a7, b5 # A2xB1 MADD3 c23, c23, a7, b6 LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD2 c22, c22, a8, b5 MADD4 c24, c24, a8, b6 FETCH $0, 0 * SIZE(PREA) FETCH $0, 0 * SIZE(PREB) MADD1 c31, c31, a5, b7 # A1xB2 MADD3 c33, c33, a5, b8 MADD2 c32, c32, a6, b7 MADD4 c34, c34, a6, b8 MADD1 c41, c41, a7, b7 # A2xB2 MADD3 c43, c43, a7, b8 MADD2 c42, c42, a8, b7 bgtz L, .L12 MADD4 c44, c44, a8, b8 .align 5 .L15: #ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) #else andi L, TEMP, 3 LD ALPHA_R, 128($sp) #endif blez L, .L18 LD ALPHA_I, 136($sp) .align 5 .L16: daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd daddiu PREA, PREA, 4 * SIZE daddiu PREB, PREB, 4 * SIZE MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 FETCH $0, 0 * SIZE(PREA) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 daddiu L, L, -1 MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 FETCH $0, 0 * SIZE(PREB) MADD1 c41, c41, a3, b3 # A2xB2 MADD3 c43, c43, a3, b4 MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) bgtz L, .L16 NOP .L18: #ifndef TRMMKERNEL ADD c11, c14, c11 LD a1, 0 * SIZE(CO1) ADD c12, c13, c12 LD a2, 1 * SIZE(CO1) ADD c21, c24, c21 LD b1, 2 * SIZE(CO1) ADD c22, c23, c22 LD b2, 3 * SIZE(CO1) ADD c31, c34, c31 LD a3, 0 * SIZE(CO2) ADD c32, c33, c32 LD a4, 1 * SIZE(CO2) ADD c41, c44, c41 LD b3, 2 * SIZE(CO2) ADD c42, c43, c42 LD b4, 3 * SIZE(CO2) daddiu I, I, -1 MADD a1, a1, ALPHA_R, c11 MADD a2, a2, ALPHA_R, c12 MADD b1, b1, ALPHA_R, c21 MADD b2, b2, ALPHA_R, c22 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB b1, b1, ALPHA_I, c22 MADD b2, b2, ALPHA_I, c21 MADD a3, a3, ALPHA_R, c31 MADD a4, a4, ALPHA_R, c32 ST a1, 0 * SIZE(CO1) MADD b3, b3, ALPHA_R, c41 MADD b4, b4, ALPHA_R, c42 ST a2, 1 * SIZE(CO1) NMSUB a3, a3, ALPHA_I, c32 MADD a4, a4, ALPHA_I, c31 ST b1, 2 * SIZE(CO1) NMSUB b3, b3, ALPHA_I, c42 MADD b4, b4, ALPHA_I, c41 ST b2, 3 * SIZE(CO1) ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) ST b3, 2 * SIZE(CO2) ST b4, 3 * SIZE(CO2) #else ADD c11, c14, c11 ADD c12, c13, c12 ADD c21, c24, c21 ADD c22, c23, c22 ADD c31, c34, c31 ADD c32, c33, c32 ADD c41, c44, c41 ADD c42, c43, c42 daddiu I, I, -1 MUL a1, ALPHA_R, c11 MUL a2, ALPHA_R, c12 MUL b1, ALPHA_R, c21 MUL b2, ALPHA_R, c22 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB b1, b1, ALPHA_I, c22 MADD b2, b2, ALPHA_I, c21 MUL a3, ALPHA_R, c31 MUL a4, ALPHA_R, c32 MUL b3, ALPHA_R, c41 MUL b4, ALPHA_R, c42 NMSUB a3, 
a3, ALPHA_I, c32 MADD a4, a4, ALPHA_I, c31 NMSUB b3, b3, ALPHA_I, c42 MADD b4, b4, ALPHA_I, c41 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) ST b1, 2 * SIZE(CO1) ST b2, 3 * SIZE(CO1) ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) ST b3, 2 * SIZE(CO2) ST b4, 3 * SIZE(CO2) #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -2 #endif dsll TEMP, TEMP, 1 + ZBASE_SHIFT daddu AO, AO, TEMP daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 daddiu CO1,CO1, 4 * SIZE bgtz I, .L11 daddiu CO2,CO2, 4 * SIZE .align 5 .L30: andi I, M, 1 daddu C, C, LDC # Change C to next panel daddu PREB, PREB, B # PREA=A+panel size blez I, .L19 daddu C, C, LDC # Change C to next panel #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, ZBASE_SHIFT # MR=1 dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 daddu AO, AO, L daddu BO, B, TEMP #endif LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MTC $0, c11 # Clear results regs MOV c12, c11 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MOV c13, c11 MOV c14, c11 LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MOV c31, c11 MOV c32, c11 FETCH $0, 0 * SIZE(PREB) MOV c33, c11 MOV c34, c11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 0 * SIZE(CO2) FETCH $0, 4 * SIZE(CO1) FETCH $0, 4 * SIZE(CO2) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 # MR=1 #else daddiu TEMP, KK, 2 # NR=2 #endif dsra L, TEMP, 2 blez L, .L35 NOP #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) dsra L, K, 2 # Unroll K 4 times move BO, B LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MTC $0, c11 # Clear results regs MOV c12, c11 LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MOV c13, c11 MOV c14, c11 FETCH $0, 0 * SIZE(PREB) MOV c31, c11 MOV c32, c11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 0 * SIZE(CO2) FETCH $0, 4 * SIZE(CO1) FETCH $0, 4 * SIZE(CO2) MOV c33, c11 blez L, .L35 MOV c34, c11 #endif .align 5 .L32: LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 FETCH $0, 4 * SIZE(PREB) MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 NOP LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) MADD1 c11, c11, a3, b5 # axc A1xB1 MADD3 c13, c13, a3, b6 # axd LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) MADD2 c12, c12, a4, b5 # bxc MADD4 c14, c14, a4, b6 # bxd LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MADD1 c31, c31, a3, b7 # A1xB2 MADD3 c33, c33, a3, b8 FETCH $0, 8 * SIZE(PREB) MADD2 c32, c32, a4, b7 MADD4 c34, c34, a4, b8 daddiu L, L, -1 LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) MADD1 c11, c11, a5, b1 # axc A1xB1 MADD3 c13, c13, a5, b2 # axd LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) MADD2 c12, c12, a6, b1 # bxc MADD4 c14, c14, a6, b2 # bxd LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MADD1 c31, c31, a5, b3 # A1xB2 MADD3 c33, c33, a5, b4 daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx FETCH $0, 12 * SIZE(PREB) MADD2 c32, c32, a6, b3 MADD4 c34, c34, a6, b4 NOP LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MADD1 c11, c11, a7, b5 # axc A1xB1 MADD3 c13, c13, a7, b6 # axd LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD2 c12, c12, a8, b5 # bxc MADD4 c14, c14, a8, b6 # bxd LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD1 c31, c31, 
a7, b7 # A1xB2 NOP MADD3 c33, c33, a7, b8 daddiu PREB, PREB, 16 * SIZE FETCH $0, 0 * SIZE(PREB) MADD2 c32, c32, a8, b7 bgtz L, .L32 MADD4 c34, c34, a8, b8 .L35: #ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) #else andi L, TEMP, 3 LD ALPHA_R, 128($sp) #endif blez L, .L38 LD ALPHA_I, 136($sp) .align 5 .L36: daddiu L, L, -1 MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 daddiu PREB, PREB, 4 * SIZE MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) bgtz L, .L36 NOP .L38: #ifndef TRMMKERNEL ADD c11, c14, c11 LD a1, 0 * SIZE(CO1) ADD c12, c13, c12 LD a2, 1 * SIZE(CO1) ADD c31, c34, c31 LD a3, 0 * SIZE(CO2) ADD c32, c33, c32 LD a4, 1 * SIZE(CO2) MADD a1, a1, ALPHA_R, c11 MADD a2, a2, ALPHA_R, c12 MADD a3, a3, ALPHA_R, c31 MADD a4, a4, ALPHA_R, c32 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB a3, a3, ALPHA_I, c32 MADD a4, a4, ALPHA_I, c31 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE #else ADD c11, c14, c11 ADD c12, c13, c12 ADD c31, c34, c31 ADD c32, c33, c32 MUL a1, ALPHA_R, c11 MUL a2, ALPHA_R, c12 MUL a3, ALPHA_R, c31 MUL a4, ALPHA_R, c32 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB a3, a3, ALPHA_I, c32 MADD a4, a4, ALPHA_I, c31 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -2 #endif dsll L, TEMP, ZBASE_SHIFT dsll TEMP, TEMP, 1 + ZBASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif #endif .align 5 .L19: #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK, 2 #endif bgtz J, .L10 move B, BO .align 5 .L20: andi J, N, 1 blez J, .L999 dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 dsra I, M, 1 # I=M/2 move CO1, C #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif move AO, A # Reset AO blez I, .L29 daddu PREA, PREA, A .L21: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 1 + ZBASE_SHIFT dsll TEMP, KK, ZBASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MTC $0, c11 # Clear results regs MOV c12, c11 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MOV c13, c11 MOV c14, c11 LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MOV c21, c11 MOV c22, c11 FETCH $0, 0 * SIZE(PREA) MOV c23, c11 MOV c24, c11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 4 * SIZE(CO1) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 2 # define Mr=2 #else daddiu TEMP, KK, 1 # define NR=1 #endif dsra L, TEMP, 2 blez L, .L25 NOP #else dsra L, K, 2 # Unroll K 4 times move BO, B LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MTC $0, c11 # Clear results regs MOV c12, c11 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MOV c13, c11 MOV c14, c11 LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MOV c21, c11 MOV c22, c11 FETCH $0, 0 * SIZE(PREA) MOV c23, c11 MOV c24, c11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 4 * SIZE(CO1) blez L, .L25 NOP #endif .align 5 .L22: LD 
a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 FETCH $0, 4 * SIZE(PREA) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) MADD1 c11, c11, a5, b3 # axc A1xB1 MADD3 c13, c13, a5, b4 # axd LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) MADD2 c12, c12, a6, b3 # bxc MADD4 c14, c14, a6, b4 # bxd LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) MADD1 c21, c21, a7, b3 # A2xB1 MADD3 c23, c23, a7, b4 FETCH $0, 8 * SIZE(PREA) MADD2 c22, c22, a8, b3 MADD4 c24, c24, a8, b4 daddiu L, L, -1 LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) MADD1 c11, c11, a1, b5 # axc A1xB1 MADD3 c13, c13, a1, b6 # axd LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD2 c12, c12, a2, b5 # bxc MADD4 c14, c14, a2, b6 # bxd LD a7, 14 * SIZE(AO) LD a8, 15 * SIZE(AO) MADD1 c21, c21, a3, b5 # A2xB1 MADD3 c23, c23, a3, b6 daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx FETCH $0, 12 * SIZE(PREA) MADD2 c22, c22, a4, b5 MADD4 c24, c24, a4, b6 daddiu PREA, PREA, 16 * SIZE LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MADD1 c11, c11, a5, b7 # axc A1xB1 MADD3 c13, c13, a5, b8 # axd LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD2 c12, c12, a6, b7 # bxc MADD4 c14, c14, a6, b8 # bxd LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MADD1 c21, c21, a7, b7 # A2xB1 MADD3 c23, c23, a7, b8 FETCH $0, 0 * SIZE(PREA) MADD2 c22, c22, a8, b7 bgtz L, .L22 MADD4 c24, c24, a8, b8 .L25: #ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) #else andi L, TEMP, 3 LD ALPHA_R, 128($sp) #endif blez L, .L28 LD ALPHA_I, 136($sp) .align 3 .L26: daddiu L, L, -1 MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 daddiu PREA, PREA, 4 * SIZE # 2mr*1kr*cmpx MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 # gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) bgtz L, .L26 FETCH $0, 0 * SIZE(PREA) .L28: #ifndef TRMMKERNEL ADD c11, c14, c11 LD a1, 0 * SIZE(CO1) ADD c12, c13, c12 LD a2, 1 * SIZE(CO1) ADD c21, c24, c21 LD b1, 2 * SIZE(CO1) ADD c22, c23, c22 LD b2, 3 * SIZE(CO1) daddiu I, I, -1 MADD a1, a1, ALPHA_R, c11 MADD a2, a2, ALPHA_R, c12 MADD b1, b1, ALPHA_R, c21 MADD b2, b2, ALPHA_R, c22 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB b1, b1, ALPHA_I, c22 MADD b2, b2, ALPHA_I, c21 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) ST b1, 2 * SIZE(CO1) ST b2, 3 * SIZE(CO1) #else ADD c11, c14, c11 ADD c12, c13, c12 ADD c21, c24, c21 ADD c22, c23, c22 daddiu I, I, -1 MUL a1, ALPHA_R, c11 MUL a2, ALPHA_R, c12 MUL b1, ALPHA_R, c21 MUL b2, ALPHA_R, c22 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB b1, b1, ALPHA_I, c22 MADD b2, b2, ALPHA_I, c21 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) ST b1, 2 * SIZE(CO1) ST b2, 3 * SIZE(CO1) #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -1 #endif dsll L, TEMP, 1 + ZBASE_SHIFT dsll TEMP, TEMP, ZBASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu 
KK, KK, 2 #endif #endif daddiu CO1,CO1, 4 * SIZE bgtz I, .L21 NOP .L29: andi I, M, 1 blez I, .L999 NOP #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll TEMP, KK, ZBASE_SHIFT daddu AO, AO, TEMP daddu BO, B, TEMP #endif # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MTC $0, c11 # Clear results regs MOV c12, c11 # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MOV c13, c11 MOV c14, c11 FETCH $0, 0 * SIZE(PREA) FETCH $0, 4 * SIZE(PREA) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 1 #endif dsra L, TEMP, 2 blez L, .L45 NOP #else dsra L, K, 2 # Unroll K 4 times move BO, B # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MTC $0, c11 # Clear results regs MOV c12, c11 # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MOV c13, c11 MOV c14, c11 FETCH $0, 0 * SIZE(PREA) FETCH $0, 4 * SIZE(PREA) blez L, .L45 NOP #endif .align 3 .L42: # gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd # gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd # gsLQC1(R12, F9, F8, 2) # Unroll K=1 LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) MADD1 c11, c11, a3, b3 # axc A1xB1 MADD3 c13, c13, a3, b4 # axd # gsLQC1(R13, F13, F12, 2) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) MADD2 c12, c12, a4, b3 # bxc MADD4 c14, c14, a4, b4 # bxd # gsLQC1(R12, F11, F10, 3) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd daddiu L, L, -1 # gsLQC1(R13, F16, F15, 3) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD2 c12, c12, a6, b5 # bxc MADD4 c14, c14, a6, b6 # bxd daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MADD1 c11, c11, a7, b7 # axc A1xB1 MADD3 c13, c13, a7, b8 # axd # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD2 c12, c12, a8, b7 # bxc MADD4 c14, c14, a8, b8 # bxd bgtz L, .L42 NOP .align 5 .L45: #ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) #else andi L, TEMP, 3 LD ALPHA_R, 128($sp) #endif blez L, .L48 LD ALPHA_I, 136($sp) .L46: daddiu L, L, -1 daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) bgtz L, .L46 NOP .L48: #ifndef TRMMKERNEL ADD c11, c14, c11 ADD c12, c13, c12 LD a1, 0 * SIZE(CO1) LD a2, 1 * SIZE(CO1) MADD a1, a1, ALPHA_R, c11 MADD a2, a2, ALPHA_R, c12 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) #else ADD c11, c14, c11 ADD c12, c13, c12 MUL a1, ALPHA_R, c11 MUL a2, ALPHA_R, c12 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -1 #endif dsll TEMP, TEMP, ZBASE_SHIFT daddu AO, AO, 
TEMP daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif daddiu CO1,CO1, 2 * SIZE #endif .align 5 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) ldc1 $f24, 16($sp) ldc1 $f25, 24($sp) ldc1 $f26, 32($sp) ldc1 $f27, 40($sp) ldc1 $f28, 48($sp) ldc1 $f29, 56($sp) #if defined(TRMMKERNEL) LDARG $18, 64($sp) LDARG $19, 72($sp) LDARG $20, 80($sp) #endif #ifndef __64BIT__ ldc1 $f20, 88($sp) ldc1 $f21, 96($sp) ldc1 $f22,104($sp) ldc1 $f23,112($sp) #endif j $31 daddiu $sp, $sp, STACKSIZE EPILOGUE OpenBLAS-0.2.20/kernel/mips64/cnrm2.S000066400000000000000000000112641313527062700167170ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define I $2 #define TEMP $3 #define a1 $f6 #define a2 $f7 #define a3 $f8 #define a4 $f9 #define a5 $f10 #define a6 $f11 #define a7 $f12 #define a8 $f13 #define s1 $f0 #define s2 $f1 #define t1 $f2 #define t2 $f3 #define t3 $f4 #define t4 $f5 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif dmtc1 $0, s1 li TEMP, 2 * SIZE blez N, .L999 mov.d s2, s1 blez INCX, .L999 dsll INCX, INCX, ZBASE_SHIFT dsra I, N, 2 blez I, .L25 NOP LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) LD a4, 1 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) LD a6, 1 * SIZE(X) daddu X, X, INCX cvt.d.s t1, a1 LD a7, 0 * SIZE(X) cvt.d.s t2, a2 LD a8, 1 * SIZE(X) cvt.d.s t3, a3 daddiu I, I, -1 cvt.d.s t4, a4 blez I, .L24 daddu X, X, INCX .align 3 .L23: madd.d s1, s1, t1, t1 LD a1, 0 * SIZE(X) cvt.d.s t1, a5 NOP madd.d s2, s2, t2, t2 LD a2, 1 * SIZE(X) cvt.d.s t2, a6 daddu X, X, INCX madd.d s1, s1, t3, t3 LD a3, 0 * SIZE(X) cvt.d.s t3, a7 NOP madd.d s2, s2, t4, t4 LD a4, 1 * SIZE(X) cvt.d.s t4, a8 daddu X, X, INCX madd.d s1, s1, t1, t1 LD a5, 0 * SIZE(X) cvt.d.s t1, a1 daddiu I, I, -1 madd.d s2, s2, t2, t2 LD a6, 1 * SIZE(X) cvt.d.s t2, a2 daddu X, X, INCX madd.d s1, s1, t3, t3 LD a7, 0 * SIZE(X) cvt.d.s t3, a3 LD a8, 1 * SIZE(X) madd.d s2, s2, t4, t4 daddu X, X, INCX bgtz I, .L23 cvt.d.s t4, a4 .align 3 .L24: madd.d s1, s1, t1, t1 cvt.d.s t1, a5 madd.d s2, s2, t2, t2 cvt.d.s t2, a6 madd.d s1, s1, t3, t3 cvt.d.s t3, a7 madd.d s2, s2, t4, t4 cvt.d.s t4, a8 madd.d s1, s1, t1, t1 madd.d s2, s2, t2, t2 madd.d s1, s1, t3, t3 madd.d s2, s2, t4, t4 .align 3 .L25: andi I, N, 3 blez I, .L999 NOP .align 3 .L26: LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) daddiu I, I, -1 cvt.d.s t1, a1 cvt.d.s t2, a2 madd.d s1, s1, t1, t1 daddu X, X, INCX bgtz I, .L26 madd.d s2, s2, t2, t2 .align 3 .L999: add.d s1, s1, s2 sqrt.d s1, s1 j $31 cvt.s.d s1, s1 EPILOGUE OpenBLAS-0.2.20/kernel/mips64/copy.S000066400000000000000000000134141313527062700166470ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define Y $7 #define INCY $8 #define I $2 #define TEMP $3 #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define a5 $f4 #define a6 $f5 #define a7 $f6 #define a8 $f7 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) LDINT INCY, 0(INCY) #endif li TEMP, SIZE NOP blez N, .L999 dsll INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 dsll INCY, INCY, BASE_SHIFT bne INCY, TEMP, .L20 dsra I, N, 3 blez I, .L15 daddiu I, I, -1 LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) LD a3, 2 * SIZE(X) LD a4, 3 * SIZE(X) LD a5, 4 * SIZE(X) LD a6, 5 * SIZE(X) LD a7, 6 * SIZE(X) LD a8, 7 * SIZE(X) blez I, .L13 NOP .align 3 .L12: ST a1, 0 * SIZE(Y) LD a1, 8 * SIZE(X) ST a2, 1 * SIZE(Y) LD a2, 9 * SIZE(X) ST a3, 2 * SIZE(Y) LD a3, 10 * SIZE(X) ST a4, 3 * SIZE(Y) LD a4, 11 * SIZE(X) ST a5, 4 * SIZE(Y) LD a5, 12 * SIZE(X) ST a6, 5 * SIZE(Y) LD a6, 13 * SIZE(X) ST a7, 6 * SIZE(Y) LD a7, 14 * SIZE(X) ST a8, 7 * SIZE(Y) LD a8, 15 * SIZE(X) daddiu I, I, -1 daddiu X, X, 8 * SIZE bgtz I, .L12 daddiu Y, Y, 8 * SIZE .align 3 .L13: ST a1, 0 * SIZE(Y) ST a2, 1 * SIZE(Y) ST a3, 2 * SIZE(Y) ST a4, 3 * SIZE(Y) ST a5, 4 * SIZE(Y) ST a6, 5 * SIZE(Y) ST a7, 6 * SIZE(Y) ST a8, 7 * SIZE(Y) daddiu X, X, 8 * SIZE daddiu Y, Y, 8 * SIZE .align 3 .L15: andi I, N, 7 blez I, .L999 NOP .align 3 .L16: LD a1, 0 * SIZE(X) daddiu X, X, SIZE daddiu I, I, -1 daddiu Y, Y, SIZE bgtz I, .L16 ST a1, -1 * SIZE(Y) j .L999 NOP .align 3 .L20: dsra I, N, 3 blez I, .L25 daddiu I, I, -1 LD a1, 0 * SIZE(X) daddu X, X, INCX LD a2, 0 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) daddu X, X, INCX LD a4, 0 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) daddu X, X, INCX LD a6, 0 * SIZE(X) daddu X, X, INCX LD a7, 0 * SIZE(X) daddu X, X, INCX LD a8, 0 * SIZE(X) daddu X, X, INCX blez I, .L23 NOP .align 3 .L22: ST a1, 0 * SIZE(Y) daddu Y, Y, INCY LD a1, 0 * SIZE(X) daddu X, X, INCX ST a2, 0 * SIZE(Y) daddu Y, Y, INCY LD a2, 0 * SIZE(X) daddu X, X, INCX ST a3, 0 * SIZE(Y) daddu Y, Y, INCY LD a3, 0 * SIZE(X) daddu X, X, INCX ST a4, 0 * SIZE(Y) daddu Y, Y, INCY LD a4, 0 * SIZE(X) daddu X, X, INCX ST a5, 0 * SIZE(Y) daddu Y, Y, INCY LD a5, 0 * SIZE(X) daddu X, X, INCX ST a6, 0 * SIZE(Y) daddu Y, Y, INCY LD a6, 0 * SIZE(X) daddu X, X, INCX ST a7, 0 * SIZE(Y) daddu Y, Y, INCY LD a7, 0 * SIZE(X) daddu X, X, INCX ST a8, 0 * SIZE(Y) daddu Y, Y, INCY LD a8, 0 * SIZE(X) daddiu I, I, -1 bgtz I, .L22 daddu X, X, INCX .align 3 .L23: ST a1, 0 * SIZE(Y) daddu Y, Y, INCY ST a2, 0 * SIZE(Y) daddu Y, Y, INCY ST a3, 0 * SIZE(Y) daddu Y, Y, INCY ST a4, 0 * SIZE(Y) daddu Y, Y, INCY ST a5, 0 * SIZE(Y) daddu 
Y, Y, INCY ST a6, 0 * SIZE(Y) daddu Y, Y, INCY ST a7, 0 * SIZE(Y) daddu Y, Y, INCY ST a8, 0 * SIZE(Y) daddu Y, Y, INCY .align 3 .L25: andi I, N, 7 blez I, .L999 NOP .align 3 .L26: LD a1, 0 * SIZE(X) daddu X, X, INCX daddiu I, I, -1 ST a1, 0 * SIZE(Y) bgtz I, .L26 daddu Y, Y, INCY .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/daxpy_loongson3a_simd.S000066400000000000000000000364571313527062700222140ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCH_DISTANCE 2016 #define N $4 #define X $8 #define INCX $9 #define Y $10 #define INCY $11 #define I $2 #define TEMP $3 #define YY $5 #define ALPHA $f15 #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define a5 $f4 #define a6 $f5 #define a7 $f6 #define a8 $f7 #define a9 $f8 #define a10 $f9 #define a11 $f10 #define a12 $f11 #define a13 $f12 #define a14 $f13 #define a15 $f14 #define a16 $f17 #define t1 $f18 #define t2 $f19 #define t3 $f20 #define t4 $f21 #define b1 $f22 #define b2 $f23 #define b3 $f24 #define b4 $f25 #define b5 $f26 #define b6 $f27 #define b7 $f28 #define b8 $f29 #define A1 0 #define A2 1 #define A3 2 #define A4 3 #define A5 4 #define A6 5 #define A7 6 #define A8 7 #define A9 8 #define A10 9 #define A11 10 #define A12 11 #define A13 12 #define A14 13 #define A15 14 #define A16 17 #define T1 18 #define T2 19 #define T3 20 #define T4 21 #define B1 22 #define B2 23 #define B3 24 #define B4 25 #define B5 26 #define B6 27 #define B7 28 #define B8 29 #define X_BASE 8 #define Y_BASE 10 #define gsLQC1_(base,fq,ft,offset) .word (0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsLQC1(base,fq,ft,offset) gsLQC1_((base), (fq), (ft), (offset)) #define gsSQC1_(base,fq,ft,offset) .word (0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) gsSQC1_((base), (fq), (ft), (offset)) PROLOGUE #ifndef __64BIT__ daddiu $sp, $sp, -40 sdc1 $f20, 0($sp) sdc1 $f22, 8($sp) sdc1 $f24, 16($sp) sdc1 $f26, 24($sp) sdc1 $f28, 32($sp) #else daddiu $sp, $sp, -48 sdc1 $f24, 0($sp) sdc1 $f25, 8($sp) sdc1 $f26, 16($sp) sdc1 $f27, 24($sp) sdc1 $f28, 32($sp) sdc1 $f29, 40($sp) #endif li TEMP, SIZE blez N, .L999 dsll INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 dsll INCY, INCY, BASE_SHIFT bne INCY, TEMP, .L20 //Dose the address of Y algin 16 bytes? andi TEMP, Y, 8 beq TEMP, $0, .L10 //Y unalgin. Compute this unalgined element. LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) daddiu X, X, SIZE daddiu Y, Y, SIZE MADD t1, b1, ALPHA, a1 daddiu N, N, -1 ST t1, -1 * SIZE(Y) blez N, .L999 .align 5 .L10: dsra I, N, 4 blez I, .L15 daddiu I, I, -1 //Y algin. We need test X address //Dose the address of X algin 16 bytes? 
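// Added note (not in the original source): at this point Y has already been
// forced to 16-byte alignment (one leading element was handled above when Y
// started on an odd double), and I = N/16 counts the unrolled blocks.  The
// test below applies the same check to X:
//   - X aligned   -> fall through to .L11 and use the gsLQC1 128-bit quad
//                    loads on both streams
//   - X unaligned -> branch to .L30, which peels one scalar element off X so
//                    the remaining quad loads from X stay 16-byte aligned
// Scalar reference for this INCX == INCY == 1 path:
//   for (i = 0; i < n; i++) y[i] += alpha * x[i];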
andi TEMP, X, 8 bne TEMP, $0, .L30 /// .align 5 .L11: //X & Y algin gsLQC1(X_BASE,A2,A1,0) gsLQC1(X_BASE,A4,A3,1) gsLQC1(X_BASE,A6,A5,2) gsLQC1(X_BASE,A8,A7,3) gsLQC1(X_BASE,A10,A9,4) gsLQC1(X_BASE,A12,A11,5) gsLQC1(X_BASE,A14,A13,6) gsLQC1(X_BASE,A16,A15,7) gsLQC1(Y_BASE,B2,B1,0) gsLQC1(Y_BASE,B4,B3,1) gsLQC1(Y_BASE,B6,B5,2) gsLQC1(Y_BASE,B8,B7,3) blez I, .L13 NOP .align 5 .L12: MADD t1, b1, ALPHA, a1 MADD t2, b2, ALPHA, a2 gsSQC1(Y_BASE, T2, T1, 0) gsLQC1(Y_BASE,B2,B1,4) MADD t3, b3, ALPHA, a3 MADD t4, b4, ALPHA, a4 gsSQC1(Y_BASE, T4, T3, 1) gsLQC1(Y_BASE,B4,B3,5) PREFETCHD(PREFETCH_DISTANCE*SIZE(Y)) PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y)) MADD t1, b5, ALPHA, a5 MADD t2, b6, ALPHA, a6 gsSQC1(Y_BASE, T2, T1, 2) gsLQC1(Y_BASE,B6,B5,6) MADD t3, b7, ALPHA, a7 MADD t4, b8, ALPHA, a8 gsSQC1(Y_BASE, T4, T3, 3) gsLQC1(Y_BASE,B8,B7, 7) PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y)) PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y)) MADD t1, b1, ALPHA, a9 MADD t2, b2, ALPHA, a10 gsSQC1(Y_BASE, T2, T1, 4) gsLQC1(Y_BASE,B2,B1,8) MADD t3, b3, ALPHA, a11 MADD t4, b4, ALPHA, a12 gsSQC1(Y_BASE, T4, T3, 5) gsLQC1(Y_BASE,B4,B3,9) PREFETCHD(PREFETCH_DISTANCE*SIZE(X)) PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X)) MADD t1, b5, ALPHA, a13 MADD t2, b6, ALPHA, a14 gsSQC1(Y_BASE, T2, T1, 6) gsLQC1(Y_BASE,B6,B5,10) MADD t3, b7, ALPHA, a15 MADD t4, b8, ALPHA, a16 gsSQC1(Y_BASE, T4, T3, 7) gsLQC1(Y_BASE,B8,B7,11) PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X)) PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X)) gsLQC1(X_BASE,A2,A1,8) gsLQC1(X_BASE,A4,A3,9) gsLQC1(X_BASE,A6,A5,10) gsLQC1(X_BASE,A8,A7,11) gsLQC1(X_BASE,A10,A9,12) gsLQC1(X_BASE,A12,A11,13) gsLQC1(X_BASE,A14,A13,14) gsLQC1(X_BASE,A16,A15,15) daddiu I, I, -1 daddiu Y, Y, 16 * SIZE daddiu X, X, 16 * SIZE bgtz I, .L12 .align 5 .L13: MADD t1, b1, ALPHA, a1 MADD t2, b2, ALPHA, a2 gsSQC1(Y_BASE, T2, T1, 0) gsLQC1(Y_BASE,B2,B1,4) MADD t3, b3, ALPHA, a3 MADD t4, b4, ALPHA, a4 gsSQC1(Y_BASE, T4, T3, 1) gsLQC1(Y_BASE,B4,B3,5) MADD t1, b5, ALPHA, a5 MADD t2, b6, ALPHA, a6 gsSQC1(Y_BASE, T2, T1, 2) gsLQC1(Y_BASE,B6,B5,6) MADD t3, b7, ALPHA, a7 MADD t4, b8, ALPHA, a8 gsSQC1(Y_BASE, T4, T3, 3) gsLQC1(Y_BASE,B8,B7,7) MADD t1, b1, ALPHA, a9 MADD t2, b2, ALPHA, a10 gsSQC1(Y_BASE, T2, T1, 4) MADD t3, b3, ALPHA, a11 MADD t4, b4, ALPHA, a12 gsSQC1(Y_BASE, T4, T3, 5) MADD t1, b5, ALPHA, a13 MADD t2, b6, ALPHA, a14 gsSQC1(Y_BASE, T2, T1, 6) MADD t3, b7, ALPHA, a15 MADD t4, b8, ALPHA, a16 gsSQC1(Y_BASE, T4, T3, 7) daddiu X, X, 16 * SIZE daddiu Y, Y, 16 * SIZE .align 5 .L15: andi I, N, 15 blez I, .L999 NOP .align 5 .L16: LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) daddiu X, X, SIZE daddiu Y, Y, SIZE MADD t1, b1, ALPHA, a1 daddiu I, I, -1 bgtz I, .L16 ST t1, -1 * SIZE(Y) #ifndef __64BIT__ ldc1 $f20, 0($sp) ldc1 $f22, 8($sp) ldc1 $f24, 16($sp) ldc1 $f26, 24($sp) ldc1 $f28, 32($sp) daddiu $sp, $sp, 40 #else ldc1 $f24, 0($sp) ldc1 $f25, 8($sp) ldc1 $f26, 16($sp) ldc1 $f27, 24($sp) ldc1 $f28, 32($sp) ldc1 $f29, 40($sp) daddiu $sp, $sp, 48 #endif j $31 NOP .align 5 .L30: //Y align, X unalign, INCX==INCY==1 //unloop 16 LD a1, 0 * SIZE(X) daddiu X, X, SIZE gsLQC1(X_BASE,A3,A2,0) gsLQC1(X_BASE,A5,A4,1) gsLQC1(X_BASE,A7,A6,2) gsLQC1(X_BASE,A9,A8,3) gsLQC1(X_BASE,A11,A10,4) gsLQC1(X_BASE,A13,A12,5) gsLQC1(X_BASE,A15,A14,6) LD a16, 14 * SIZE(X) gsLQC1(Y_BASE,B2,B1,0) gsLQC1(Y_BASE,B4,B3,1) gsLQC1(Y_BASE,B6,B5,2) gsLQC1(Y_BASE,B8,B7,3) blez I, .L32 NOP .align 5 .L31: MADD t1, b1, ALPHA, a1 MADD t2, b2, ALPHA, a2 gsSQC1(Y_BASE, T2, T1, 0) gsLQC1(Y_BASE,B2,B1,4) MADD t3, b3, ALPHA, a3 MADD t4, b4, ALPHA, a4 gsSQC1(Y_BASE, T4, T3, 
1) gsLQC1(Y_BASE,B4,B3,5) PREFETCHD(PREFETCH_DISTANCE*SIZE(Y)) PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y)) MADD t1, b5, ALPHA, a5 MADD t2, b6, ALPHA, a6 gsSQC1(Y_BASE, T2, T1, 2) gsLQC1(Y_BASE,B6,B5,6) MADD t3, b7, ALPHA, a7 MADD t4, b8, ALPHA, a8 gsSQC1(Y_BASE, T4, T3, 3) gsLQC1(Y_BASE,B8,B7,7) PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y)) PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y)) MADD t1, b1, ALPHA, a9 MADD t2, b2, ALPHA, a10 gsSQC1(Y_BASE, T2, T1, 4) gsLQC1(Y_BASE,B2,B1,8) MADD t3, b3, ALPHA, a11 MADD t4, b4, ALPHA, a12 gsSQC1(Y_BASE, T4, T3, 5) gsLQC1(Y_BASE,B4,B3,9) PREFETCHD(PREFETCH_DISTANCE*SIZE(X)) PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X)) MADD t1, b5, ALPHA, a13 MADD t2, b6, ALPHA, a14 gsSQC1(Y_BASE, T2, T1, 6) gsLQC1(Y_BASE,B6,B5,10) MADD t3, b7, ALPHA, a15 MADD t4, b8, ALPHA, a16 gsSQC1(Y_BASE, T4, T3, 7) gsLQC1(Y_BASE,B8,B7,11) PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X)) PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X)) LD a1, 15 * SIZE(X) gsLQC1(X_BASE,A3,A2,8) gsLQC1(X_BASE,A5,A4,9) gsLQC1(X_BASE,A7,A6,10) gsLQC1(X_BASE,A9,A8,11) gsLQC1(X_BASE,A11,A10,12) gsLQC1(X_BASE,A13,A12,13) gsLQC1(X_BASE,A15,A14,14) LD a16, 30 * SIZE(X) daddiu I, I, -1 daddiu Y, Y, 16 * SIZE daddiu X, X, 16 * SIZE bgtz I, .L31 .align 5 //Loop end: .L32: MADD t1, b1, ALPHA, a1 MADD t2, b2, ALPHA, a2 gsSQC1(Y_BASE, T2, T1, 0) gsLQC1(Y_BASE,B2,B1,4) MADD t3, b3, ALPHA, a3 MADD t4, b4, ALPHA, a4 gsSQC1(Y_BASE, T4, T3, 1) gsLQC1(Y_BASE,B4,B3,5) MADD t1, b5, ALPHA, a5 MADD t2, b6, ALPHA, a6 gsSQC1(Y_BASE, T2, T1, 2) gsLQC1(Y_BASE,B6,B5,6) MADD t3, b7, ALPHA, a7 MADD t4, b8, ALPHA, a8 gsSQC1(Y_BASE, T4, T3, 3) gsLQC1(Y_BASE,B8,B7,7) MADD t1, b1, ALPHA, a9 MADD t2, b2, ALPHA, a10 gsSQC1(Y_BASE, T2, T1, 4) MADD t3, b3, ALPHA, a11 MADD t4, b4, ALPHA, a12 gsSQC1(Y_BASE, T4, T3, 5) MADD t1, b5, ALPHA, a13 MADD t2, b6, ALPHA, a14 gsSQC1(Y_BASE, T2, T1, 6) MADD t3, b7, ALPHA, a15 MADD t4, b8, ALPHA, a16 gsSQC1(Y_BASE, T4, T3, 7) daddiu X, X, 15 * SIZE daddiu Y, Y, 16 * SIZE //jump back to the remain process. 
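// (Added note: X advances by only 15*SIZE here because one X element was
// peeled off before this unaligned loop; .L15 then finishes the last N % 16
// elements with scalar code.)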
b .L15 .align 5 //INCX!=1 or INCY != 1 .L20: dsra I, N, 3 move YY, Y blez I, .L25 daddiu I, I, -1 LD a1, 0 * SIZE(X) daddu X, X, INCX LD b1, 0 * SIZE(Y) daddu Y, Y, INCY LD a2, 0 * SIZE(X) daddu X, X, INCX LD b2, 0 * SIZE(Y) daddu Y, Y, INCY LD a3, 0 * SIZE(X) daddu X, X, INCX LD b3, 0 * SIZE(Y) daddu Y, Y, INCY LD a4, 0 * SIZE(X) daddu X, X, INCX LD b4, 0 * SIZE(Y) daddu Y, Y, INCY LD a5, 0 * SIZE(X) daddu X, X, INCX LD b5, 0 * SIZE(Y) daddu Y, Y, INCY LD a6, 0 * SIZE(X) daddu X, X, INCX LD b6, 0 * SIZE(Y) daddu Y, Y, INCY LD a7, 0 * SIZE(X) daddu X, X, INCX LD b7, 0 * SIZE(Y) daddu Y, Y, INCY LD a8, 0 * SIZE(X) daddu X, X, INCX LD b8, 0 * SIZE(Y) daddu Y, Y, INCY blez I, .L23 NOP .align 5 .L22: MADD t1, b1, ALPHA, a1 LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY MADD t2, b2, ALPHA, a2 LD a2, 0 * SIZE(X) LD b2, 0 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY MADD t3, b3, ALPHA, a3 LD a3, 0 * SIZE(X) LD b3, 0 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY MADD t4, b4, ALPHA, a4 LD a4, 0 * SIZE(X) LD b4, 0 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY ST t1, 0 * SIZE(YY) daddu YY, YY, INCY MADD t1, b5, ALPHA, a5 LD a5, 0 * SIZE(X) LD b5, 0 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY ST t2, 0 * SIZE(YY) daddu YY, YY, INCY MADD t2, b6, ALPHA, a6 LD a6, 0 * SIZE(X) LD b6, 0 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY ST t3, 0 * SIZE(YY) daddu YY, YY, INCY MADD t3, b7, ALPHA, a7 LD a7, 0 * SIZE(X) LD b7, 0 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY ST t4, 0 * SIZE(YY) daddu YY, YY, INCY MADD t4, b8, ALPHA, a8 LD a8, 0 * SIZE(X) daddu X, X, INCX LD b8, 0 * SIZE(Y) daddu Y, Y, INCY ST t1, 0 * SIZE(YY) daddu YY, YY, INCY ST t2, 0 * SIZE(YY) daddu YY, YY, INCY ST t3, 0 * SIZE(YY) daddu YY, YY, INCY ST t4, 0 * SIZE(YY) daddiu I, I, -1 bgtz I, .L22 daddu YY, YY, INCY .align 5 .L23: MADD t1, b1, ALPHA, a1 MADD t2, b2, ALPHA, a2 MADD t3, b3, ALPHA, a3 MADD t4, b4, ALPHA, a4 ST t1, 0 * SIZE(YY) daddu YY, YY, INCY MADD t1, b5, ALPHA, a5 ST t2, 0 * SIZE(YY) daddu YY, YY, INCY MADD t2, b6, ALPHA, a6 ST t3, 0 * SIZE(YY) daddu YY, YY, INCY MADD t3, b7, ALPHA, a7 ST t4, 0 * SIZE(YY) daddu YY, YY, INCY MADD t4, b8, ALPHA, a8 ST t1, 0 * SIZE(YY) daddu YY, YY, INCY ST t2, 0 * SIZE(YY) daddu YY, YY, INCY ST t3, 0 * SIZE(YY) daddu YY, YY, INCY ST t4, 0 * SIZE(YY) daddu YY, YY, INCY .align 5 .L25: andi I, N, 7 blez I, .L999 NOP .align 5 .L26: LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) MADD t1, b1, ALPHA, a1 daddu X, X, INCX ST t1, 0 * SIZE(Y) daddiu I, I, -1 bgtz I, .L26 daddu Y, Y, INCY .align 5 .L999: #ifndef __64BIT__ ldc1 $f20, 0($sp) ldc1 $f22, 8($sp) ldc1 $f24, 16($sp) ldc1 $f26, 24($sp) ldc1 $f28, 32($sp) daddiu $sp, $sp, 40 #else ldc1 $f24, 0($sp) ldc1 $f25, 8($sp) ldc1 $f26, 16($sp) ldc1 $f27, 24($sp) ldc1 $f28, 32($sp) ldc1 $f29, 40($sp) daddiu $sp, $sp, 48 #endif j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/dgemm_kernel_loongson3a_4x4.S000066400000000000000000001145761313527062700232020ustar00rootroot00000000000000#define REALNAME ASMNAME #define ASSEMBLER #include "common.h" #define FETCH ld #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define M $4 #define N $5 #define K $6 #define A $8 #define B $9 #define C $10 #define LDC $11 #define AO $12 #define BO $13 #define CO1 $14 #define CO2 $15 #define CO3 $16 #define CO4 $17 #define KCO $18 #define MCO $19 #define NCO $20 #define SPANB $21 #define PREB $23 #define PREA $24 #define SPANA $25 #define 
ALPHA $f15 #if defined(TRMMKERNEL) #define OFFSET $2 #define KK $3 #define TEMP $7 #endif #define R8 8 #define R9 9 #define R14 14 #define R15 15 #define R16 16 #define R17 17 #define t11 $f30 #define t21 $f31 #define t31 $f28 #define t41 $f29 #define t12 $f26 #define t22 $f27 #define t32 $f24 #define t42 $f25 #define t13 $f22 #define t23 $f23 #define t33 $f20 #define t43 $f21 #define t14 $f18 #define t24 $f19 #define t34 $f16 #define t44 $f17 #define c11 $f0 #define c21 $f1 #define c31 $f2 #define c41 $f3 #define c12 $f4 #define c22 $f5 #define c32 $f6 #define c42 $f7 #define c13 $f8 #define c23 $f9 #define c33 $f10 #define c43 $f11 #define c14 $f12 #define c24 $f13 #define c34 $f14 #define c44 $f0 #define a0 $f0 #define a1 $f1 #define a2 $f2 #define a3 $f3 #define a4 $f4 #define a5 $f5 #define a6 $f6 #define a7 $f7 #define b0 $f8 #define b1 $f9 #define b2 $f10 #define b3 $f11 #define b4 $f12 #define b5 $f13 #define b6 $f14 #define b7 $f15 #define F31 31 #define F30 30 #define F29 29 #define F28 28 #define F27 27 #define F26 26 #define F25 25 #define F24 24 #define F23 23 #define F22 22 #define F21 21 #define F20 20 #define F19 19 #define F18 18 #define F17 17 #define F16 16 #define F15 15 #define F14 14 #define F13 13 #define F12 12 #define F11 11 #define F10 10 #define F9 9 #define F8 8 #define F7 7 #define F6 6 #define F5 5 #define F4 4 #define F3 3 #define F2 2 #define F1 1 #define F0 0 PROLOGUE daddiu $sp, $sp, -160 sd $16, 0($sp) sd $17, 8($sp) sd $18, 16($sp) sd $19, 24($sp) sd $20, 32($sp) sd $21, 40($sp) sd $22, 48($sp) ST $f24, 56($sp) ST $f25, 64($sp) ST $f26, 72($sp) ST $f27, 80($sp) ST $f28, 88($sp) sd $23, 96($sp) sd $24, 104($sp) sd $25, 112($sp) ST $f20,120($sp) ST $f21,128($sp) ST $f22,136($sp) ST $f23,144($sp) .align 5 .L0_N4: # Loop N ST ALPHA,152($sp) # Backup ALPHA move MCO,M # Backup M move NCO,N # Backup N move KCO,K # Backup K move AO,A # Backup A_addr dsra N,NCO,2 # N=NCO/2 dsll LDC,LDC,BASE_SHIFT # LDC*8Byte dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5 #if defined(TRMMKERNEL) LDARG OFFSET,160($sp) # OFFSET is relate to the data part #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg KK,OFFSET #endif move BO,B # Backup B_addr beq N,$0,.L0_N2 # N=0,NCO<4 dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte .L0_N4_Lb: # mr=4,nr=4 move CO1,C dsra M,MCO,2 # M=MCO/2 move A,AO # Reset A daddu CO2,C,LDC daddu PREB,BO,SPANB # PreB point next panelB daddu CO3,CO2,LDC daddu PREA,AO,SPANA daddu CO4,CO3,LDC #if defined(TRMMKERNEL) && defined(LEFT) move KK,OFFSET #endif beqz M,.L14_M2 daddu C,CO4,LDC # move C to next panel Cj .L10: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # (SIDE=L and UPLO=L) or (SIZE=R and UPLO=U) #else dsll K,KK,2 + BASE_SHIFT # KK is the length that needs to span to the data part dsll TEMP,KK,2 + BASE_SHIFT daddu A,A,K # move A B to data part daddu B,BO,TEMP #endif MTC $0,t11 MOV t21,t11 gsLQC1(R8,F1,F0,0) # a0,a1 MOV t31,t11 MOV t41,t11 gsLQC1(R9,F9,F8,0) # b0,b1 MOV t12,t11 MOV t22,t11 gsLQC1(R8,F3,F2,1) # a2,a3 MOV t32,t11 MOV t42,t11 gsLQC1(R9,F11,F10,1) # b2,b3 MOV t13,t11 MOV t23,t11 MOV t33,t11 MOV t43,t11 MOV t14,t11 MOV t24,t11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK # temp is the length of the data part #elif defined(LEFT) daddiu TEMP, KK, 4 # S=L,U=L #else daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part #endif dsra K,TEMP,2 # K=KCO/2 MOV t34,t11 beqz K,.L15 MOV t44,t11 
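# Added note (not in the original source): the TRMMKERNEL branch above offsets
# A and B past the triangular part using KK/OFFSET and leaves in TEMP the number
# of k iterations this 4x4 tile actually needs; the #else branch below is the
# plain GEMM setup, which uses the full KCO.  In both cases K = length/4, since
# the .L11..L14 body is unrolled four k steps deep.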
#else move B,BO # Reset B MTC $0,t11 # GEMM part NR=4,MR=4 gsLQC1(R8,F1,F0,0) # a0,a1 MOV t21,t11 MOV t31,t11 gsLQC1(R9,F9,F8,0) # b0,b1 MOV t41,t11 MOV t12,t11 gsLQC1(R8,F3,F2,1) # a2,a3 MOV t22,t11 MOV t32,t11 gsLQC1(R9,F11,F10,1) # b2,b3 MOV t42,t11 dsra K,KCO,2 # K=KCO/2 MOV t13,t11 MOV t23,t11 MOV t33,t11 MOV t43,t11 MOV t14,t11 MOV t24,t11 MOV t34,t11 beqz K,.L15 MOV t44,t11 # clear 16 results registers #endif .align 5 .L11: # kr=4 gsLQC1(R8,F5,F4,2) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 gsLQC1(R9,F13,F12,2) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 gsLQC1(R8,F7,F6,3) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 gsLQC1(R9,F15,F14,3) MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 FETCH $0,(PREB) MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 FETCH $0,(PREA) MADD t33,t33,a2,b2 MADD t43,t43,a3,b2 MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 .L12: gsLQC1(R8,F1,F0,4) MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 gsLQC1(R9,F9,F8,4) MADD t12,t12,a4,b5 MADD t22,t22,a5,b5 gsLQC1(R8,F3,F2,5) MADD t31,t31,a6,b4 MADD t41,t41,a7,b4 gsLQC1(R9,F11,F10,5) MADD t32,t32,a6,b5 MADD t42,t42,a7,b5 FETCH $0,4*SIZE(PREB) MADD t13,t13,a4,b6 MADD t23,t23,a5,b6 MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 FETCH $0,4*SIZE(PREA) MADD t33,t33,a6,b6 MADD t43,t43,a7,b6 MADD t34,t34,a6,b7 MADD t44,t44,a7,b7 .L13: gsLQC1(R8,F5,F4,6) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 gsLQC1(R9,F13,F12,6) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 gsLQC1(R8,F7,F6,7) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 gsLQC1(R9,F15,F14,7) MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 daddu A,A,16*SIZE # 4mr*4kr FETCH $0,8*SIZE(PREB) MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 daddu B,B,16*SIZE # 4nr*4kr MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 FETCH $0,8*SIZE(PREA) MADD t33,t33,a2,b2 MADD t43,t43,a3,b2 MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 .L14: gsLQC1(R8,F1,F0,0) MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 gsLQC1(R9,F9,F8,0) MADD t12,t12,a4,b5 MADD t22,t22,a5,b5 gsLQC1(R8,F3,F2,1) MADD t31,t31,a6,b4 MADD t41,t41,a7,b4 daddiu K,K,-1 gsLQC1(R9,F11,F10,1) MADD t32,t32,a6,b5 MADD t42,t42,a7,b5 FETCH $0,12*SIZE(PREB) MADD t13,t13,a4,b6 MADD t23,t23,a5,b6 FETCH $0,12*SIZE(PREA) MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 MADD t33,t33,a6,b6 MADD t43,t43,a7,b6 daddu PREB,PREB,16*SIZE MADD t34,t34,a6,b7 MADD t44,t44,a7,b7 bnez K,.L11 daddu PREA,PREA,16*SIZE .L15: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP, 2 #endif beqz K,.L18 nop .L16: gsLQC1(R8,F5,F4,2) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 gsLQC1(R9,F13,F12,2) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 gsLQC1(R8,F7,F6,3) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 gsLQC1(R9,F15,F14,3) MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 daddu A,A,8*SIZE # 4mr*2kr FETCH $0,0(PREB) MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 daddu B,B,8*SIZE # 4nr*2kr FETCH $0,0(PREA) MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 MADD t33,t33,a2,b2 MADD t43,t43,a3,b2 MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 .L17: gsLQC1(R8,F1,F0,0) MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 gsLQC1(R9,F9,F8,0) MADD t12,t12,a4,b5 MADD t22,t22,a5,b5 gsLQC1(R8,F3,F2,1) MADD t31,t31,a6,b4 MADD t41,t41,a7,b4 gsLQC1(R9,F11,F10,1) MADD t32,t32,a6,b5 MADD t42,t42,a7,b5 FETCH $0,4*SIZE(PREB) MADD t13,t13,a4,b6 MADD t23,t23,a5,b6 FETCH $0,4*SIZE(PREA) MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 daddu PREB,PREB,8*SIZE MADD t33,t33,a6,b6 MADD t43,t43,a7,b6 daddu PREA,PREA,8*SIZE MADD t34,t34,a6,b7 MADD t44,t44,a7,b7 .L18: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L19 LD ALPHA,152($sp) # Get ALPHA FETCH $0,0(PREB) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,4*SIZE # 4mr*kr MADD t12,t12,a0,b1 MADD 
t22,t22,a1,b1 daddu B,B,4*SIZE # 4nr*kr FETCH $0,0(PREA) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 daddu PREB,PREB,4*SIZE MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 daddu PREA,PREA,4*SIZE MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 MADD t33,t33,a2,b2 MADD t43,t43,a3,b2 MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 .L19: # Write Back to C #ifndef TRMMKERNEL LD c11,0(CO1) # GEMM write part LD c21,1*SIZE(CO1) # get 16 C LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) LD c12,0(CO2) MADD t11,c11,t11,ALPHA LD c22,1*SIZE(CO2) MADD t21,c21,t21,ALPHA LD c32,2*SIZE(CO2) MADD t31,c31,t31,ALPHA LD c42,3*SIZE(CO2) MADD t41,c41,t41,ALPHA LD c13,0(CO3) MADD t12,c12,t12,ALPHA LD c23,1*SIZE(CO3) MADD t22,c22,t22,ALPHA LD c33,2*SIZE(CO3) MADD t32,c32,t32,ALPHA LD c43,3*SIZE(CO3) MADD t42,c42,t42,ALPHA LD c14,0(CO4) MADD t13,c13,t13,ALPHA LD c24,1*SIZE(CO4) MADD t23,c23,t23,ALPHA LD c34,2*SIZE(CO4) MADD t33,c33,t33,ALPHA LD c44,3*SIZE(CO4) MADD t43,c43,t43,ALPHA ST t11,0(CO1) MADD t14,c14,t14,ALPHA ST t21,1*SIZE(CO1) MADD t24,c24,t24,ALPHA ST t31,2*SIZE(CO1) MADD t34,c34,t34,ALPHA ST t41,3*SIZE(CO1) MADD t44,c44,t44,ALPHA daddiu M,M,-1 # M-- ST t12,0(CO2) ST t22,1*SIZE(CO2) ST t32,2*SIZE(CO2) ST t42,3*SIZE(CO2) ST t13,0(CO3) ST t23,1*SIZE(CO3) ST t33,2*SIZE(CO3) ST t43,3*SIZE(CO3) FETCH $0,4*SIZE(CO1) FETCH $0,4*SIZE(CO2) FETCH $0,4*SIZE(CO3) FETCH $0,4*SIZE(CO4) FETCH $0,8*SIZE(CO1) FETCH $0,8*SIZE(CO2) FETCH $0,8*SIZE(CO3) FETCH $0,8*SIZE(CO4) ST t14,0(CO4) daddu CO1,CO1,4*SIZE # COi += 4 ST t24,1*SIZE(CO4) daddu CO2,CO2,4*SIZE ST t34,2*SIZE(CO4) daddu CO3,CO3,4*SIZE ST t44,3*SIZE(CO4) daddu PREB,BO,SPANB bnez M,.L10 daddu CO4,CO4,4*SIZE #else MUL t11, ALPHA, t11 # TRMM write back part MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 MUL t41, ALPHA, t41 ST t11, 0 * SIZE(CO1) MUL t12, ALPHA, t12 ST t21, 1 * SIZE(CO1) MUL t22, ALPHA, t22 ST t31, 2 * SIZE(CO1) MUL t32, ALPHA, t32 ST t41, 3 * SIZE(CO1) MUL t42, ALPHA, t42 ST t12, 0 * SIZE(CO2) MUL t13, ALPHA, t13 ST t22, 1 * SIZE(CO2) MUL t23, ALPHA, t23 ST t32, 2 * SIZE(CO2) MUL t33, ALPHA, t33 ST t42, 3 * SIZE(CO2) MUL t43, ALPHA, t43 ST t13, 0 * SIZE(CO3) MUL t14, ALPHA, t14 ST t23, 1 * SIZE(CO3) MUL t24, ALPHA, t24 ST t33, 2 * SIZE(CO3) MUL t34, ALPHA, t34 ST t43, 3 * SIZE(CO3) MUL t44, ALPHA, t44 ST t14, 0 * SIZE(CO4) daddiu M,M,-1 # M-- ST t24, 1 * SIZE(CO4) ST t34, 2 * SIZE(CO4) ST t44, 3 * SIZE(CO4) daddiu CO1,CO1, 4 * SIZE daddiu CO2,CO2, 4 * SIZE daddiu CO3,CO3, 4 * SIZE daddiu CO4,CO4, 4 * SIZE FETCH $0,4*SIZE(CO1) FETCH $0,4*SIZE(CO2) FETCH $0,4*SIZE(CO3) FETCH $0,4*SIZE(CO4) FETCH $0,0(CO1) FETCH $0,0(CO2) FETCH $0,0(CO3) FETCH $0,0(CO4) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP,KCO,KK #ifdef LEFT daddiu TEMP,TEMP, -4 #else daddiu TEMP,TEMP, -4 #endif dsll K,TEMP,2 + BASE_SHIFT dsll TEMP,TEMP,2 + BASE_SHIFT daddu A,A,K # mov A to the end of panel Ai daddu B,B,TEMP # mov B to the end of panel Bj #endif #ifdef LEFT daddiu KK, KK,4 #endif bnez M,.L10 nop #endif .align 3 .L14_M2: andi M, MCO, 2 # nr=4,mr=2 beqz M,.L14_M1 nop .L20: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else dsll K,KK,1 + BASE_SHIFT # mr=2 dsll TEMP,KK,2 + BASE_SHIFT # nr=4 daddu A,A,K daddu B,BO,TEMP #endif MTC $0,t11 MOV t21,t11 gsLQC1(R8,F1,F0,0) # a0,a1 MOV t12,t11 MOV t22,t11 gsLQC1(R9,F9,F8,0) # b0,b1 MOV t13,t11 MOV t23,t11 gsLQC1(R9,F11,F10,1) # b2,b3 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK 
#elif defined(LEFT) daddiu TEMP,KK,2 # left part,controlled by mr, mr=2 #else daddiu TEMP,KK,4 # right part,controlled by nr,nr=4 #endif dsra K,TEMP,2 MOV t14,t11 beqz K,.L25 MOV t24,t11 # clear 2*4=8 results registers #else move B,BO # Reset B MTC $0,t11 gsLQC1(R8,F1,F0,0) MOV t21,t11 MOV t12,t11 gsLQC1(R9,F9,F8,0) MOV t22,t11 dsra K,KCO,2 gsLQC1(R9,F11,F10,1) MOV t13,t11 MOV t23,t11 MOV t14,t11 beqz K,.L25 MOV t24,t11 #endif .L21: # nr=4,mr=2,kr=4 gsLQC1(R8,F5,F4,1) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 gsLQC1(R9,F13,F12,2) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 gsLQC1(R9,F15,F14,3) MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 gsLQC1(R8,F3,F2,2) MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 gsLQC1(R9,F9,F8,4) MADD t12,t12,a4,b5 MADD t22,t22,a5,b5 gsLQC1(R9,F11,F10,5) MADD t13,t13,a4,b6 MADD t23,t23,a5,b6 MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 daddiu K,K,-1 gsLQC1(R8,F7,F6,3) MADD t11,t11,a2,b0 MADD t21,t21,a3,b0 gsLQC1(R9,F13,F12,6) MADD t12,t12,a2,b1 MADD t22,t22,a3,b1 gsLQC1(R9,F15,F14,7) MADD t13,t13,a2,b2 MADD t23,t23,a3,b2 daddu A,A,8*SIZE # 2mr*4kr MADD t14,t14,a2,b3 MADD t24,t24,a3,b3 daddu B,B,16*SIZE # 4nr*4kr gsLQC1(R8,F1,F0,0) MADD t11,t11,a6,b4 MADD t21,t21,a7,b4 gsLQC1(R9,F9,F8,0) MADD t12,t12,a6,b5 MADD t22,t22,a7,b5 gsLQC1(R9,F11,F10,1) MADD t13,t13,a6,b6 MADD t23,t23,a7,b6 MADD t14,t14,a6,b7 bnez K,.L21 MADD t24,t24,a7,b7 .L25: #ifndef TRMMKERNEL andi K,KCO,2 # kr=2 #else andi K,TEMP,2 #endif beqz K,.L28 nop .L26: gsLQC1(R8,F5,F4,1) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 gsLQC1(R9,F13,F12,2) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 gsLQC1(R9,F15,F14,3) MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 daddu A,A,4*SIZE # 2mr*2kr MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 daddu B,B,8*SIZE # 4nr*2kr .L27: gsLQC1(R8,F1,F0,0) MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 gsLQC1(R9,F9,F8,0) MADD t12,t12,a4,b5 MADD t22,t22,a5,b5 gsLQC1(R9,F11,F10,1) MADD t13,t13,a4,b6 MADD t23,t23,a5,b6 MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 .L28: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L29 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # 2mr*kr daddu B,B,4*SIZE # 4nr*kr MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 .L29: # Write Back to C #ifndef TRMMKERNEL LD c11,0(CO1) # GEMM write back part LD c21,1*SIZE(CO1) LD c12,0(CO2) LD c22,1*SIZE(CO2) LD c13,0(CO3) MADD t11,c11,t11,ALPHA LD c23,1*SIZE(CO3) MADD t21,c21,t21,ALPHA LD c14,0(CO4) MADD t12,c12,t12,ALPHA LD c24,1*SIZE(CO4) MADD t22,c22,t22,ALPHA ST t11,0(CO1) MADD t13,c13,t13,ALPHA ST t21,1*SIZE(CO1) MADD t23,c23,t23,ALPHA ST t12,0(CO2) MADD t14,c14,t14,ALPHA ST t22,1*SIZE(CO2) MADD t24,c24,t24,ALPHA ST t13,0(CO3) daddu CO1,CO1,2*SIZE # COi += 2 ST t23,1*SIZE(CO3) daddu CO2,CO2,2*SIZE ST t14,0(CO4) daddu CO3,CO3,2*SIZE ST t24,1*SIZE(CO4) daddu CO4,CO4,2*SIZE FETCH $0,0(CO1) FETCH $0,0(CO2) FETCH $0,0(CO3) FETCH $0,0(CO4) #else MUL t11, ALPHA, t11 # TRMM write back part MUL t21, ALPHA, t21 ST t11, 0 * SIZE(CO1) MUL t12, ALPHA, t12 ST t21, 1 * SIZE(CO1) MUL t22, ALPHA, t22 ST t12, 0 * SIZE(CO2) MUL t13, ALPHA, t13 ST t22, 1 * SIZE(CO2) MUL t23, ALPHA, t23 ST t13, 0 * SIZE(CO3) MUL t14, ALPHA, t14 ST t23, 1 * SIZE(CO3) MUL t24, ALPHA, t24 ST t14, 0 * SIZE(CO4) ST t24, 1 * SIZE(CO4) daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE daddiu CO3,CO3, 2 * SIZE daddiu CO4,CO4, 2 * SIZE FETCH $0,0(CO1) FETCH $0,0(CO2) FETCH $0,0(CO3) FETCH $0,0(CO4) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) dsubu TEMP,KCO,KK #ifdef LEFT daddiu TEMP,TEMP,-2 #else daddiu TEMP,TEMP,-4 #endif dsll K,TEMP,1 + BASE_SHIFT dsll TEMP,TEMP,2 + BASE_SHIFT daddu A,A,K # move A to next panel Ai daddu B,B,TEMP # move B to next panel Bj #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif .align 3 .L14_M1: andi M,MCO,1 # mr=1 beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj nop .L30: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else dsll K,KK, 0 + BASE_SHIFT dsll TEMP,KK,2 + BASE_SHIFT daddu A,A,K daddu B,BO,TEMP #endif MTC $0,t11 MOV t12,t11 LD a0, 0 * SIZE(A) # a0 MOV t13,t11 gsLQC1(R9,F9,F8,0) # b0,b1 MOV t14,t11 # clear result registers gsLQC1(R9,F11,F10,1) # b2,b3 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 4 #endif dsra K,TEMP, 2 nop beqz K,.L35 nop #else move B,BO # Reset B, GEMM part dsra K,KCO,2 # K=KCO/2 LD a0, 0 * SIZE(A) # a0 MTC $0,t11 MOV t12,t11 gsLQC1(R9,F9,F8,0) # b0,b1 MOV t13,t11 MOV t14,t11 gsLQC1(R9,F11,F10,1) # b2,b3 beqz K,.L35 nop #endif .L31: # nr=4,mr=1,kr=4 LD a1, 1*SIZE(A) # load a1 MADD t11,t11,a0,b0 gsLQC1(R9,F13,F12,2) # b4,b5 MADD t12,t12,a0,b1 gsLQC1(R9,F15,F14,3) # b6,b7 MADD t13,t13,a0,b2 MADD t14,t14,a0,b3 LD a2, 2*SIZE(A) # a2 MADD t11,t11,a1,b4 gsLQC1(R9,F9,F8,4) MADD t12,t12,a1,b5 gsLQC1(R9,F11,F10,5) MADD t13,t13,a1,b6 MADD t14,t14,a1,b7 daddiu K,K,-1 LD a3, 3*SIZE(A) # a3 MADD t11,t11,a2,b0 gsLQC1(R9,F13,F12,6) MADD t12,t12,a2,b1 daddu A,A,4*SIZE # 1mr*4kr gsLQC1(R9,F15,F14,7) MADD t13,t13,a2,b2 MADD t14,t14,a2,b3 daddu B,B,16*SIZE # 4nr*4kr LD a0, 0*SIZE(A) # a0 MADD t11,t11,a3,b4 gsLQC1(R9,F9,F8,0) MADD t12,t12,a3,b5 gsLQC1(R9,F11,F10,1) MADD t13,t13,a3,b6 bnez K,.L31 MADD t14,t14,a3,b7 .L35: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L38 nop .L36: LD a1,1*SIZE(A) # load a1 MADD t11,t11,a0,b0 gsLQC1(R9,F13,F12,2) MADD t12,t12,a0,b1 daddu A,A,2*SIZE # mr*2kr gsLQC1(R9,F15,F14,3) MADD t13,t13,a0,b2 MADD t14,t14,a0,b3 daddu B,B,8*SIZE # 4nr*2kr .L37: LD a0,0(A) MADD t11,t11,a1,b4 gsLQC1(R9,F9,F8,0) MADD t12,t12,a1,b5 gsLQC1(R9,F11,F10,1) MADD t13,t13,a1,b6 MADD t14,t14,a1,b7 .L38: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L39 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t12,t12,a0,b1 daddu A,A,1*SIZE daddu B,B,4*SIZE MADD t13,t13,a0,b2 MADD t14,t14,a0,b3 .L39: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) LD c12,0(CO2) LD c13,0(CO3) LD c14,0(CO4) MADD t11,c11,t11,ALPHA MADD t12,c12,t12,ALPHA MADD t13,c13,t13,ALPHA MADD t14,c14,t14,ALPHA ST t11,0(CO1) ST t12,0(CO2) ST t13,0(CO3) ST t14,0(CO4) #else MUL t11, ALPHA, t11 MUL t12, ALPHA, t12 MUL t13, ALPHA, t13 MUL t14, ALPHA, t14 ST t11, 0 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t13, 0 * SIZE(CO3) ST t14, 0 * SIZE(CO4) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -4 #endif dsll K,TEMP, 0 + BASE_SHIFT dsll TEMP,TEMP, 2 + BASE_SHIFT daddu A,A,K daddu B,B,TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif #endif .align 3 .L0_N4_Loop: # mc finished daddiu N,N,-1 # N-- #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK,4 #endif bnez N,.L0_N4_Lb move BO,B # Set BO point to next panel Bj .align 5 .L0_N2: andi N,NCO,2 # nr = 2 beqz N,.L0_N1 nop .L0_N2_Lb: move CO1,C daddu CO2,C,LDC dsra M,MCO,2 move A,AO # Reset A daddu PREA,AO,SPANA daddu 
C,CO2,LDC #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif beqz M,.L12_M2 nop .L40: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else dsll K,KK, 2 + BASE_SHIFT dsll TEMP, KK,1 + BASE_SHIFT daddu A,A,K daddu B,BO,TEMP #endif MTC $0,t11 MOV t21,t11 gsLQC1(R8,F1,F0,0) # a0,a1 MOV t31,t11 MOV t41,t11 gsLQC1(R9,F9,F8,0) # b0,b1 MOV t12,t11 MOV t22,t11 gsLQC1(R8,F3,F2,1) # a2,a3 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK #elif defined(LEFT) daddiu TEMP, KK, 4 #else daddiu TEMP, KK, 2 #endif dsra K,TEMP,2 MOV t32,t11 beqz K,.L45 MOV t42,t11 #else move B,BO # Reset B MTC $0,t11 # gemm part gsLQC1(R8,F1,F0,0) # a0,a1 MOV t21,t11 MOV t31,t11 gsLQC1(R9,F9,F8,0) # b0,b1 MOV t41,t11 dsra K,KCO,2 # K=KCO/2 gsLQC1(R8,F3,F2,1) # a2,a3 MOV t12,t11 MOV t22,t11 MOV t32,t11 beqz K,.L45 MOV t42,t11 #endif .L41: # nr=2,mr=kr=4 gsLQC1(R8,F5,F4,2) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 gsLQC1(R9,F13,F12,1) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 gsLQC1(R8,F7,F6,3) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 FETCH $0,(PREA) MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 .L42: gsLQC1(R8,F1,F0,4) MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 gsLQC1(R9,F11,F10,2) MADD t12,t12,a4,b5 MADD t22,t22,a5,b5 gsLQC1(R8,F3,F2,5) MADD t31,t31,a6,b4 MADD t41,t41,a7,b4 FETCH $0,4*SIZE(PREA) MADD t32,t32,a6,b5 MADD t42,t42,a7,b5 .L43: gsLQC1(R8,F5,F4,6) MADD t11,t11,a0,b2 MADD t21,t21,a1,b2 gsLQC1(R9,F15,F14,3) MADD t12,t12,a0,b3 MADD t22,t22,a1,b3 gsLQC1(R8,F7,F6,7) MADD t31,t31,a2,b2 MADD t41,t41,a3,b2 daddu B,B,8*SIZE # 2nr*4kr FETCH $0,8*SIZE(PREA) MADD t32,t32,a2,b3 MADD t42,t42,a3,b3 daddu A,A,16*SIZE # 4mr*4kr .L44: gsLQC1(R8,F1,F0,0) MADD t11,t11,a4,b6 MADD t21,t21,a5,b6 daddiu K,K,-1 gsLQC1(R9,F9,F8,0) MADD t12,t12,a4,b7 MADD t22,t22,a5,b7 daddu PREA,PREA,16*SIZE gsLQC1(R8,F3,F2,1) MADD t31,t31,a6,b6 MADD t41,t41,a7,b6 FETCH $0,-4*SIZE(PREA) MADD t32,t32,a6,b7 bnez K,.L41 MADD t42,t42,a7,b7 .L45: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L48 nop .L46: gsLQC1(R8,F5,F4,2) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 gsLQC1(R9,F13,F12,1) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 gsLQC1(R8,F7,F6,3) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32 FETCH $0,0(PREA) MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE .L47: gsLQC1(R8,F1,F0,0) MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 gsLQC1(R9,F9,F8,0) MADD t12,t12,a4,b5 MADD t22,t22,a5,b5 gsLQC1(R8,F3,F2,1) MADD t31,t31,a6,b4 MADD t41,t41,a7,b4 FETCH $0,4*SIZE(PREA) MADD t32,t32,a6,b5 MADD t42,t42,a7,b5 daddu PREA,PREA,8*SIZE .L48: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L49 LD ALPHA,152($sp) # Get ALPHA FETCH $0,0(PREA) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 daddu B,B,2*SIZE daddu PREA,PREA,4*SIZE MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 .L49: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # gemm write back part Fetch 16 C LD c21,1*SIZE(CO1) LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) LD c12,0(CO2) MADD t11,c11,t11,ALPHA LD c22,1*SIZE(CO2) MADD t21,c21,t21,ALPHA LD c32,2*SIZE(CO2) MADD t31,c31,t31,ALPHA LD c42,3*SIZE(CO2) MADD t41,c41,t41,ALPHA ST t11,0(CO1) MADD t12,c12,t12,ALPHA ST t21,1*SIZE(CO1) MADD t22,c22,t22,ALPHA ST t31,2*SIZE(CO1) MADD t32,c32,t32,ALPHA ST t41,3*SIZE(CO1) MADD t42,c42,t42,ALPHA daddiu M,M,-1 ST t12,0(CO2) ST 
t22,1*SIZE(CO2) ST t32,2*SIZE(CO2) ST t42,3*SIZE(CO2) FETCH $0,4*SIZE(CO1) FETCH $0,4*SIZE(CO2) FETCH $0,8*SIZE(CO1) FETCH $0,8*SIZE(CO2) daddu CO1,CO1,4*SIZE bnez M,.L40 daddu CO2,CO2,4*SIZE #else MUL t11, ALPHA, t11 MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 MUL t41, ALPHA, t41 MUL t12, ALPHA, t12 ST t11, 0 * SIZE(CO1) MUL t22, ALPHA, t22 ST t21, 1 * SIZE(CO1) MUL t32, ALPHA, t32 ST t31, 2 * SIZE(CO1) MUL t42, ALPHA, t42 ST t41, 3 * SIZE(CO1) ST t12, 0 * SIZE(CO2) daddiu M,M,-1 ST t22, 1 * SIZE(CO2) ST t32, 2 * SIZE(CO2) ST t42, 3 * SIZE(CO2) daddiu CO1,CO1, 4*SIZE daddiu CO2,CO2, 4*SIZE FETCH $0,0(CO1) FETCH $0,0(CO2) FETCH $0,4(CO1) FETCH $0,4(CO2) #if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -4 #else daddiu TEMP, TEMP, -2 #endif dsll K,TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu A,A,K daddu B,B,TEMP #endif #ifdef LEFT daddiu KK, KK, 4 #endif bnez M,.L40 nop #endif .align 3 .L12_M2: andi M,MCO,2 # mr = 2 beqz M,.L12_M1 nop .L50: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO #else dsll K, KK, 1 + BASE_SHIFT #mr=2 dsll TEMP, KK, 1 + BASE_SHIFT #nr=2 daddu A, A, K daddu B, BO, TEMP #endif MTC $0,t11 gsLQC1(R8,F1,F0,0) #a0,a1 MOV t21,t11 gsLQC1(R9,F9,F8,0) #b0,b1 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 2 #else daddiu TEMP, KK, 2 #endif dsra K,TEMP,2 MOV t12,t11 beqz K,.L55 MOV t22,t11 #else move B,BO dsra K,KCO,2 # K=KCO/2 gsLQC1(R8,F1,F0,0) #a0,a1 MTC $0,t11 MOV t21,t11 gsLQC1(R9,F9,F8,0) #b0,b1 MOV t12,t11 beqz K,.L55 MOV t22,t11 #endif .L51: # nr=2 mr=2,kr=4 gsLQC1(R8,F5,F4,1) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 gsLQC1(R9,F13,F12,1) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 gsLQC1(R8,F3,F2,2) MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 gsLQC1(R9,F11,F10,2) MADD t12,t12,a4,b5 MADD t22,t22,a5,b5 daddiu K,K,-1 gsLQC1(R8,F7,F6,3) MADD t11,t11,a2,b2 MADD t21,t21,a3,b2 daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE gsLQC1(R9,F15,F14,3) MADD t12,t12,a2,b3 MADD t22,t22,a3,b3 daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=16*SIZE gsLQC1(R8,F1,F0,0) MADD t11,t11,a6,b6 MADD t21,t21,a7,b6 gsLQC1(R9,F9,F8,0) MADD t12,t12,a6,b7 bnez K,.L51 MADD t22,t22,a7,b7 .L55: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L58 nop .L56: gsLQC1(R8,F5,F4,1) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 gsLQC1(R9,F13,F12,1) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 daddu B,B,4*SIZE # 2nr*2kr .L57: gsLQC1(R8,F1,F0,0) MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 gsLQC1(R9,F9,F8,0) MADD t12,t12,a4,b5 MADD t22,t22,a5,b5 .L58: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP, 1 #endif beqz K,.L59 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 daddu B,B,2*SIZE # 2nr*kr MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 .L59: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # write gemm part back Fetch 16 C LD c21,1*SIZE(CO1) LD c12,0(CO2) LD c22,1*SIZE(CO2) MADD t11,c11,t11,ALPHA MADD t21,c21,t21,ALPHA MADD t12,c12,t12,ALPHA MADD t22,c22,t22,ALPHA ST t11,0(CO1) ST t21,1*SIZE(CO1) ST t12,0(CO2) ST t22,1*SIZE(CO2) daddu CO1,CO1,2*SIZE daddu CO2,CO2,2*SIZE FETCH $0,0(CO1) FETCH $0,0(CO2) #else daddiu M, M, -1 daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE MUL t11, ALPHA, t11 MUL t21, ALPHA, t21 MUL t12, ALPHA, t12 MUL t22, ALPHA, t22 ST t11, -2 * SIZE(CO1) ST t21, -1 * 
SIZE(CO1) ST t12, -2 * SIZE(CO2) ST t22, -1 * SIZE(CO2) FETCH $0,0(CO1) FETCH $0,0(CO2) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -2 #endif dsll K, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu A, A, K daddu B, B, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif .align 3 .L12_M1: andi M,MCO,1 # mr = 1 beqz M,.L0_N2_Loop nop .L60: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else dsll K, KK, 0 + BASE_SHIFT dsll TEMP, KK, 1 + BASE_SHIFT daddu A, A, K daddu B, BO, TEMP #endif MTC $0,t11 LD a0, 0*SIZE(A) # a0 MOV t21,t11 gsLQC1(R9,F9,F8,0) # b0,b1 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 2 #endif dsra K,TEMP,2 MOV t12,t11 beqz K,.L65 MOV t22,t11 #else dsra K,KCO,2 move B,BO # Reset B LD a0,0*SIZE(A) MTC $0,t11 MOV t21,t11 gsLQC1(R9,F9,F8,0) MOV t12,t11 beqz K,.L65 MOV t22,t11 #endif .L61: # nr=2,mr=1,kr=4 LD a4, 1*SIZE(A) # a2 MADD t11,t11,a0,b0 gsLQC1(R9,F13,F12,1) MADD t12,t12,a0,b1 LD a2, 2*SIZE(A) # a3 MADD t11,t11,a4,b4 gsLQC1(R9,F11,F10,2) MADD t12,t12,a4,b5 LD a6, 3*SIZE(A) # a4 MADD t11,t11,a2,b2 daddiu K,K,-1 gsLQC1(R9,F15,F14,3) MADD t12,t12,a2,b3 daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 LD a0, 0*SIZE(A) MADD t11,t11,a6,b6 daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE gsLQC1(R9,F9,F8,0) # a0 bnez K,.L61 MADD t12,t12,a6,b7 .L65: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L68 nop .L66: LD a4, 1*SIZE(A) # a1 MADD t11,t11,a0,b0 daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 gsLQC1(R9,F13,F12,1) MADD t12,t12,a0,b1 daddu B,B,4*SIZE .L67: LD a0,0(A) # a0 MADD t11,t11,a4,b4 gsLQC1(R9,F9,F8,0) MADD t12,t12,a4,b5 .L68: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L69 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t12,t12,a0,b1 daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16 daddu B,B,2*SIZE .L69: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C LD c12,0(CO2) MADD t11,c11,t11,ALPHA MADD t12,c12,t12,ALPHA ST t11,0(CO1) ST t12,0(CO2) daddu CO1,CO1,1*SIZE daddu CO2,CO2,1*SIZE #else MUL t11, ALPHA, t11 MUL t12, ALPHA, t12 ST t11, 0 * SIZE(CO1) ST t12, 0 * SIZE(CO2) daddu CO1,CO1,1*SIZE daddu CO2,CO2,1*SIZE #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -2 #endif dsll K, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu A, A, K daddu B, B, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif #endif .L0_N2_Loop: #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK, 2 #endif move BO, B .align 5 .L0_N1: andi N,NCO,1 # nr = 1 beqz N,.L999 nop move CO1,C dsra M,MCO,2 move A,AO # Reset A daddu PREA,AO,SPANA #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif beqz M,.L11_M2 daddu C,CO1,LDC .L70: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO # Reset B #else dsll K, KK, 2 + BASE_SHIFT dsll TEMP, KK, 0 + BASE_SHIFT daddu A, A, K daddu B, BO, TEMP #endif MTC $0,t11 LD b0, 0*SIZE(B) MOV t21,t11 gsLQC1(R8,F1,F0,0) #a0,a1 MOV t31,t11 gsLQC1(R8,F3,F2,1) #a2,a3 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 4 #else daddiu TEMP, KK, 1 
#endif dsra K,TEMP,2 MOV t41,t11 beqz K,.L75 nop #else move B, BO # Reset B dsra K,KCO,2 LD b0, 0*SIZE(B) MTC $0,t11 MOV t21,t11 gsLQC1(R8,F1,F0,0) #a0,a1 MOV t31,t11 MOV t41,t11 gsLQC1(R8,F3,F2,1) #a2,a3 beqz K,.L75 nop #endif .L71: # nr=1,mr=kr=4 LD b4, 1*SIZE(B) # b1 MADD t11,t11,a0,b0 gsLQC1(R8,F5,F4,2) MADD t21,t21,a1,b0 gsLQC1(R8,F7,F6,3) FETCH $0,(PREA) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 .L72: LD b2, 2*SIZE(B) # b2 MADD t11,t11,a4,b4 gsLQC1(R8,F1,F0,4) MADD t21,t21,a5,b4 gsLQC1(R8,F3,F2,5) FETCH $0,4*SIZE(PREA) MADD t31,t31,a6,b4 MADD t41,t41,a7,b4 .L73: LD b6, 3*SIZE(B) MADD t11,t11,a0,b2 daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 gsLQC1(R8,F5,F4,6) MADD t21,t21,a1,b2 FETCH $0,8*SIZE(PREA) gsLQC1(R8,F7,F6,7) MADD t31,t31,a2,b2 MADD t41,t41,a3,b2 daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE .L74: LD b0, 0*SIZE(B) MADD t11,t11,a4,b6 daddu PREA,PREA,16*SIZE gsLQC1(R8,F1,F0,0) MADD t21,t21,a5,b6 daddiu K,K,-1 FETCH $0,-32(PREA) gsLQC1(R8,F3,F2,1) MADD t31,t31,a6,b6 bnez K,.L71 MADD t41,t41,a7,b6 .L75: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L78 nop .L76: LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32 gsLQC1(R8,F5,F4,2) MADD t21,t21,a1,b0 FETCH $0,0(PREA) gsLQC1(R8,F7,F6,3) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE .L77: LD b0,0(B) MADD t11,t11,a4,b4 gsLQC1(R8,F1,F0,0) MADD t21,t21,a5,b4 FETCH $0,4*SIZE(PREA) gsLQC1(R8,F3,F2,1) MADD t31,t31,a6,b4 MADD t41,t41,a7,b4 daddu PREA,PREA,8*SIZE .L78: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L79 LD ALPHA,152($sp) # Get ALPHA FETCH $0,0(PREA) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 daddu B,B,1*SIZE daddu PREA,PREA,4*SIZE .L79: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C LD c21,1*SIZE(CO1) LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) MADD t11,c11,t11,ALPHA MADD t21,c21,t21,ALPHA MADD t31,c31,t31,ALPHA MADD t41,c41,t41,ALPHA ST t11,0(CO1) ST t21,1*SIZE(CO1) ST t31,2*SIZE(CO1) ST t41,3*SIZE(CO1) daddiu M,M,-1 # M-- FETCH $0,4*SIZE(CO1) FETCH $0,8*SIZE(CO1) bnez M,.L70 # M!=0 daddu CO1,CO1,4*SIZE # COx += 4*8Byte #else daddiu M,M,-1 # M-- MUL t11, ALPHA, t11 MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 MUL t41, ALPHA, t41 ST t11,0(CO1) ST t21,1*SIZE(CO1) ST t31,2*SIZE(CO1) ST t41,3*SIZE(CO1) FETCH $0,4*SIZE(CO1) FETCH $0,8*SIZE(CO1) daddu CO1,CO1,4*SIZE #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -4 #else daddiu TEMP, TEMP, -1 #endif dsll K, TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, 0 + BASE_SHIFT daddu A, A,K daddu B, B, TEMP #endif #ifdef LEFT daddiu KK, KK, 4 #endif bnez M,.L70 nop #endif .align 3 .L11_M2: andi M,MCO,2 # mr = 2 beqz M,.L11_M1 nop .L80: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO #else dsll K, KK, 1 + BASE_SHIFT dsll TEMP, KK, 0 + BASE_SHIFT daddu A, A, K daddu B, BO, TEMP #endif LD b0, 0*SIZE(B) MTC $0,t11 gsLQC1(R8,F1,F0,0) #a0,a1 MOV t21,t11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 2 #else daddiu TEMP, KK, 1 #endif dsra K,TEMP,2 # K=KCO/2 beqz K,.L85 nop #else move B, BO dsra K,KCO,2 LD b0, 0*SIZE(B) MTC $0,t11 MOV t21,t11 gsLQC1(R8,F1,F0,0) #a0,a1 beqz K,.L85 nop #endif .L81: # nr=1,mr=2,kr=4 LD b4, 1*SIZE(B) gsLQC1(R8,F5,F4,1) MADD t11,t11,a0,b0 MADD 
t21,t21,a1,b0 LD b2, 2*SIZE(B) gsLQC1(R8,F3,F2,2) MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 LD b6, 3*SIZE(B) gsLQC1(R8,F7,F6,3) MADD t11,t11,a2,b2 MADD t21,t21,a3,b2 daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 LD b0, 0*SIZE(B) gsLQC1(R8,F1,F0,0) MADD t11,t11,a6,b6 MADD t21,t21,a7,b6 daddiu K,K,-1 bnez K,.L81 nop .L85: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L88 nop .L86: gsLQC1(R8,F5,F4,1) LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 gsLQC1(R8,F1,F0,0) LD b0,0(B) MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 .L88: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L89 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 daddu B,B,1*SIZE .L89: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C LD c21,1*SIZE(CO1) MADD t11,c11,t11,ALPHA MADD t21,c21,t21,ALPHA ST t11,0(CO1) ST t21,1*SIZE(CO1) FETCH $0,2*SIZE(CO1) daddu CO1,CO1,2*SIZE # COx += 2*8Byte #else daddu CO1,CO1,2*SIZE # COx += 2*8Byte MUL t11, ALPHA, t11 MUL t21, ALPHA, t21 FETCH $0,0(CO1) ST t11, -2 * SIZE(CO1) ST t21, -1 * SIZE(CO1) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -1 #endif dsll K, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 0 + BASE_SHIFT daddu A, A, K daddu B, B, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif .align 3 .L11_M1: andi M,MCO,1 # mr = 1 beqz M,.L999 nop .L90: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO #else dsll K, KK, 0 + BASE_SHIFT dsll TEMP, KK, 0 + BASE_SHIFT daddu A, A, K daddu B, BO, TEMP #endif LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) MTC $0,t11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 1 #endif dsra K, TEMP, 2 beqz K,.L95 nop #else move B, BO LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) dsra K,KCO,2 beqz K,.L95 MTC $0,t11 #endif .L91: # nr=mr=1,kr=4 LD a4, 1*SIZE(A) LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 LD a2, 2*SIZE(A) LD b2, 2*SIZE(B) MADD t11,t11,a4,b4 LD a6, 3*SIZE(A) LD b6, 3*SIZE(B) MADD t11,t11,a2,b2 daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) MADD t11,t11,a6,b6 daddiu K,K,-1 bnez K,.L91 nop .L95: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L98 nop .L96: LD a4, 1*SIZE(A) LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=32 LD b0,0(B) LD a0,0(A) MADD t11,t11,a4,b4 .L98: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L99 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 .L99: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C MADD t11,c11,t11,ALPHA ST t11,0(CO1) #else MUL t11, ALPHA, t11 ST t11, 0 * SIZE(CO1) #endif .L999: # End ld $16, 0($sp) ld $17, 8($sp) ld $18, 16($sp) ld $19, 24($sp) ld $20, 32($sp) ld $21, 40($sp) ld $22, 48($sp) LD $f24, 56($sp) LD $f25, 64($sp) LD $f26, 72($sp) LD $f27, 80($sp) LD $f28, 88($sp) ld $23, 96($sp) ld $24, 104($sp) ld $25, 112($sp) LD $f20,120($sp) LD $f21,128($sp) LD $f22,136($sp) LD $f23,144($sp) j $31 daddiu $sp, $sp, 160 EPILOGUE 
OpenBLAS-0.2.20/kernel/mips64/dgemm_kernel_loongson3b_4x4.S000066400000000000000000001176461313527062700232040ustar00rootroot00000000000000#define REALNAME ASMNAME #define ASSEMBLER #include "common.h" #define FETCH ld #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define M $4 #define N $5 #define K $6 #define A $8 #define B $9 #define C $10 #define LDC $11 #define AO $12 #define BO $13 #define CO1 $14 #define CO2 $15 #define CO3 $16 #define CO4 $17 #define KCO $18 #define MCO $19 #define NCO $20 #define SPANB $21 #define PREB $23 #define PREA $24 #define SPANA $25 #define ALPHA $f15 #if defined(TRMMKERNEL) #define OFFSET $2 #define KK $3 #define TEMP $7 #endif #define R8 8 #define R9 9 #define R14 14 #define R15 15 #define R16 16 #define R17 17 #define t11 $f30 #define t21 $f31 #define t31 $f28 #define t41 $f29 #define t12 $f26 #define t22 $f27 #define t32 $f24 #define t42 $f25 #define t13 $f22 #define t23 $f23 #define t33 $f20 #define t43 $f21 #define t14 $f18 #define t24 $f19 #define t34 $f16 #define t44 $f17 #define c11 $f0 #define c21 $f1 #define c31 $f2 #define c41 $f3 #define c12 $f4 #define c22 $f5 #define c32 $f6 #define c42 $f7 #define c13 $f8 #define c23 $f9 #define c33 $f10 #define c43 $f11 #define c14 $f12 #define c24 $f13 #define c34 $f14 #define c44 $f0 #define a0 $f0 #define a1 $f1 #define a2 $f2 #define a3 $f3 #define a4 $f4 #define a5 $f5 #define a6 $f6 #define a7 $f7 #define b0 $f8 #define b1 $f9 #define b2 $f10 #define b3 $f11 #define b4 $f12 #define b5 $f13 #define b6 $f14 #define b7 $f15 #define F31 31 #define F30 30 #define F29 29 #define F28 28 #define F27 27 #define F26 26 #define F25 25 #define F24 24 #define F23 23 #define F22 22 #define F21 21 #define F20 20 #define F19 19 #define F18 18 #define F17 17 #define F16 16 #define F15 15 #define F14 14 #define F13 13 #define F12 12 #define F11 11 #define F10 10 #define F9 9 #define F8 8 #define F7 7 #define F6 6 #define F5 5 #define F4 4 #define F3 3 #define F2 2 #define F1 1 #define F0 0 PROLOGUE daddiu $sp, $sp, -160 sd $16, 0($sp) sd $17, 8($sp) sd $18, 16($sp) sd $19, 24($sp) sd $20, 32($sp) sd $21, 40($sp) sd $22, 48($sp) ST $f24, 56($sp) ST $f25, 64($sp) ST $f26, 72($sp) ST $f27, 80($sp) ST $f28, 88($sp) sd $23, 96($sp) sd $24, 104($sp) sd $25, 112($sp) ST $f20,120($sp) ST $f21,128($sp) ST $f22,136($sp) ST $f23,144($sp) .align 5 .L0_N4: # Loop N ST ALPHA,152($sp) # Backup ALPHA move MCO,M # Backup M move NCO,N # Backup N move KCO,K # Backup K move AO,A # Backup A_addr dsra N,NCO,2 # N=NCO/2 dsll LDC,LDC,BASE_SHIFT # LDC*8Byte dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5 #if defined(TRMMKERNEL) LDARG OFFSET,160($sp) # OFFSET is relate to the data part #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg KK,OFFSET #endif move BO,B # Backup B_addr beq N,$0,.L0_N2 # N=0,NCO<4 dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte .L0_N4_Lb: # mr=4,nr=4 move CO1,C dsra M,MCO,2 # M=MCO/2 move A,AO # Reset A daddu CO2,C,LDC daddu PREB,BO,SPANB # PreB point next panelB daddu CO3,CO2,LDC daddu PREA,AO,SPANA daddu CO4,CO3,LDC #if defined(TRMMKERNEL) && defined(LEFT) move KK,OFFSET #endif beqz M,.L14_M2 daddu C,CO4,LDC # move C to next panel Cj .L10: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # (SIDE=L and UPLO=L) or (SIZE=R and UPLO=U) #else dsll K,KK,2 + BASE_SHIFT # KK is the length that needs to span to 
the data part dsll TEMP,KK,2 + BASE_SHIFT daddu A,A,K # move A B to data part daddu B,BO,TEMP #endif MTC $0,t11 # GEMM part NR=4,MR=4 LD a0,0(A) MOV t21,t11 MOV t31,t11 LD a1,1*SIZE(A) MOV t41,t11 MOV t12,t11 LD b0,0(B) MOV t22,t11 MOV t32,t11 LD b1,1*SIZE(B) MOV t42,t11 LD a2,2*SIZE(A) MOV t13,t11 MOV t23,t11 LD b2,2*SIZE(B) MOV t33,t11 MOV t43,t11 LD a3,3*SIZE(A) MOV t14,t11 MOV t24,t11 LD b3,3*SIZE(B) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK # temp is the length of the data part #elif defined(LEFT) daddiu TEMP, KK, 4 # S=L,U=L #else daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part #endif dsra K,TEMP,2 # K=KCO/2 MOV t34,t11 beqz K,.L15 MOV t44,t11 #else move B,BO # Reset B MTC $0,t11 # GEMM part NR=4,MR=4 LD a0,0(A) MOV t21,t11 MOV t31,t11 LD a1,1*SIZE(A) MOV t41,t11 MOV t12,t11 LD b0,0(B) MOV t22,t11 MOV t32,t11 LD b1,1*SIZE(B) MOV t42,t11 dsra K,KCO,2 # K=KCO/2 LD a2,2*SIZE(A) MOV t13,t11 MOV t23,t11 LD b2,2*SIZE(B) MOV t33,t11 MOV t43,t11 LD a3,3*SIZE(A) MOV t14,t11 MOV t24,t11 LD b3,3*SIZE(B) MOV t34,t11 beqz K,.L15 MOV t44,t11 # clear 16 results registers #endif .align 5 .L11: # kr=4 MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 LD a4,4*SIZE(A) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 LD a5,5*SIZE(A) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 LD b4,4*SIZE(B) MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 LD b5,5*SIZE(B) FETCH $0,(PREB) MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 LD a6,6*SIZE(A) MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 LD b6,6*SIZE(B) FETCH $0,(PREA) MADD t33,t33,a2,b2 MADD t43,t43,a3,b2 LD a7,7*SIZE(A) MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 LD b7,7*SIZE(B) .L12: MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 LD a0,8*SIZE(A) MADD t12,t12,a4,b5 MADD t22,t22,a5,b5 LD a1,9*SIZE(A) MADD t31,t31,a6,b4 MADD t41,t41,a7,b4 LD b0,8*SIZE(B) MADD t32,t32,a6,b5 MADD t42,t42,a7,b5 LD b1,9*SIZE(B) FETCH $0,4*SIZE(PREB) MADD t13,t13,a4,b6 MADD t23,t23,a5,b6 LD a2,10*SIZE(A) MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 LD b2,10*SIZE(B) FETCH $0,4*SIZE(PREA) MADD t33,t33,a6,b6 MADD t43,t43,a7,b6 LD a3,11*SIZE(A) MADD t34,t34,a6,b7 MADD t44,t44,a7,b7 LD b3,11*SIZE(B) .L13: MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 LD a4,12*SIZE(A) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 LD a5,13*SIZE(A) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 LD b4,12*SIZE(B) FETCH $0,8*SIZE(PREA) MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 LD b5,13*SIZE(B) FETCH $0,8*SIZE(PREB) MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 LD a6,14*SIZE(A) MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 daddu A,A,16*SIZE # 4mr*4kr LD b6,14*SIZE(B) MADD t33,t33,a2,b2 MADD t43,t43,a3,b2 daddu B,B,16*SIZE # 4nr*4kr LD a7,-1*SIZE(A) MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 LD b7,-1*SIZE(B) .L14: MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 LD a0,0(A) MADD t12,t12,a4,b5 MADD t22,t22,a5,b5 LD a1,1*SIZE(A) MADD t31,t31,a6,b4 MADD t41,t41,a7,b4 daddiu K,K,-1 LD b0,0(B) MADD t32,t32,a6,b5 MADD t42,t42,a7,b5 daddu PREA,PREA,16*SIZE LD b1,1*SIZE(B) FETCH $0,12*SIZE(PREB) MADD t13,t13,a4,b6 MADD t23,t23,a5,b6 LD a2,2*SIZE(A) FETCH $0,-4*SIZE(PREA) MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 LD b2,2*SIZE(B) MADD t33,t33,a6,b6 MADD t43,t43,a7,b6 daddu PREB,PREB,16*SIZE LD a3,3*SIZE(A) MADD t34,t34,a6,b7 MADD t44,t44,a7,b7 bnez K,.L11 LD b3,3*SIZE(B) .L15: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP, 2 #endif beqz K,.L18 nop .L16: MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 LD a4,4*SIZE(A) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 LD a5,5*SIZE(A) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 LD b4,4*SIZE(B) FETCH $0,0(PREA) MADD t32,t32,a2,b1 MADD 
t42,t42,a3,b1 LD b5,5*SIZE(B) FETCH $0,0(PREB) MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 LD a6,6*SIZE(A) MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 daddu A,A,8*SIZE # 4mr*2kr LD b6,6*SIZE(B) MADD t33,t33,a2,b2 MADD t43,t43,a3,b2 daddu B,B,8*SIZE # 4nr*2kr LD a7,-1*SIZE(A) MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 LD b7,-1*SIZE(B) .L17: MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 LD a0,0*SIZE(A) MADD t12,t12,a4,b5 MADD t22,t22,a5,b5 LD a1,1*SIZE(A) MADD t31,t31,a6,b4 MADD t41,t41,a7,b4 LD b0,0*SIZE(B) MADD t32,t32,a6,b5 MADD t42,t42,a7,b5 LD b1,1*SIZE(B) FETCH $0,4*SIZE(PREB) MADD t13,t13,a4,b6 MADD t23,t23,a5,b6 LD a2,2*SIZE(A) FETCH $0,4*SIZE(PREA) MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 LD b2,2*SIZE(B) MADD t33,t33,a6,b6 MADD t43,t43,a7,b6 daddu PREA,PREA,8*SIZE LD a3,3*SIZE(A) MADD t34,t34,a6,b7 MADD t44,t44,a7,b7 daddu PREB,PREB,8*SIZE LD b3,3*SIZE(B) .L18: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L19 LD ALPHA,152($sp) # Get ALPHA FETCH $0,0(PREB) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,4*SIZE # 4mr*kr MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 daddu B,B,4*SIZE # 4nr*kr FETCH $0,0(PREA) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 daddu PREB,PREB,4*SIZE MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 daddu PREA,PREA,4*SIZE MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 MADD t33,t33,a2,b2 MADD t43,t43,a3,b2 MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 .L19: # Write Back to C #ifndef TRMMKERNEL LD c11,0(CO1) # GEMM write part LD c21,1*SIZE(CO1) # get 16 C LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) LD c12,0(CO2) MADD t11,c11,t11,ALPHA LD c22,1*SIZE(CO2) MADD t21,c21,t21,ALPHA LD c32,2*SIZE(CO2) MADD t31,c31,t31,ALPHA LD c42,3*SIZE(CO2) MADD t41,c41,t41,ALPHA LD c13,0(CO3) MADD t12,c12,t12,ALPHA LD c23,1*SIZE(CO3) MADD t22,c22,t22,ALPHA LD c33,2*SIZE(CO3) MADD t32,c32,t32,ALPHA LD c43,3*SIZE(CO3) MADD t42,c42,t42,ALPHA LD c14,0(CO4) MADD t13,c13,t13,ALPHA LD c24,1*SIZE(CO4) MADD t23,c23,t23,ALPHA LD c34,2*SIZE(CO4) MADD t33,c33,t33,ALPHA LD c44,3*SIZE(CO4) MADD t43,c43,t43,ALPHA ST t11,0(CO1) MADD t14,c14,t14,ALPHA ST t21,1*SIZE(CO1) MADD t24,c24,t24,ALPHA ST t31,2*SIZE(CO1) MADD t34,c34,t34,ALPHA ST t41,3*SIZE(CO1) MADD t44,c44,t44,ALPHA daddiu M,M,-1 # M-- ST t12,0(CO2) ST t22,1*SIZE(CO2) ST t32,2*SIZE(CO2) ST t42,3*SIZE(CO2) ST t13,0(CO3) ST t23,1*SIZE(CO3) ST t33,2*SIZE(CO3) ST t43,3*SIZE(CO3) FETCH $0,4*SIZE(CO1) FETCH $0,4*SIZE(CO2) FETCH $0,4*SIZE(CO3) FETCH $0,4*SIZE(CO4) FETCH $0,8*SIZE(CO1) FETCH $0,8*SIZE(CO2) FETCH $0,8*SIZE(CO3) FETCH $0,8*SIZE(CO4) ST t14,0(CO4) daddu CO1,CO1,4*SIZE # COi += 4 ST t24,1*SIZE(CO4) daddu CO2,CO2,4*SIZE ST t34,2*SIZE(CO4) daddu CO3,CO3,4*SIZE ST t44,3*SIZE(CO4) daddu PREB,BO,SPANB bnez M,.L10 daddu CO4,CO4,4*SIZE #else MUL t11, ALPHA, t11 # TRMM write back part MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 MUL t41, ALPHA, t41 ST t11, 0 * SIZE(CO1) MUL t12, ALPHA, t12 ST t21, 1 * SIZE(CO1) MUL t22, ALPHA, t22 ST t31, 2 * SIZE(CO1) MUL t32, ALPHA, t32 ST t41, 3 * SIZE(CO1) MUL t42, ALPHA, t42 ST t12, 0 * SIZE(CO2) MUL t13, ALPHA, t13 ST t22, 1 * SIZE(CO2) MUL t23, ALPHA, t23 ST t32, 2 * SIZE(CO2) MUL t33, ALPHA, t33 ST t42, 3 * SIZE(CO2) MUL t43, ALPHA, t43 ST t13, 0 * SIZE(CO3) MUL t14, ALPHA, t14 ST t23, 1 * SIZE(CO3) MUL t24, ALPHA, t24 ST t33, 2 * SIZE(CO3) MUL t34, ALPHA, t34 ST t43, 3 * SIZE(CO3) MUL t44, ALPHA, t44 ST t14, 0 * SIZE(CO4) daddiu M,M,-1 # M-- ST t24, 1 * SIZE(CO4) ST t34, 2 * SIZE(CO4) ST t44, 3 * SIZE(CO4) daddiu CO1,CO1, 4 * SIZE daddiu CO2,CO2, 4 * SIZE daddiu CO3,CO3, 4 * SIZE daddiu CO4,CO4, 4 * SIZE FETCH 
$0,4*SIZE(CO1) FETCH $0,4*SIZE(CO2) FETCH $0,4*SIZE(CO3) FETCH $0,4*SIZE(CO4) FETCH $0,0(CO1) FETCH $0,0(CO2) FETCH $0,0(CO3) FETCH $0,0(CO4) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP,KCO,KK #ifdef LEFT daddiu TEMP,TEMP, -4 #else daddiu TEMP,TEMP, -4 #endif dsll K,TEMP,2 + BASE_SHIFT dsll TEMP,TEMP,2 + BASE_SHIFT daddu A,A,K # mov A to the end of panel Ai daddu B,B,TEMP # mov B to the end of panel Bj #endif #ifdef LEFT daddiu KK, KK,4 #endif bnez M,.L10 nop #endif .align 3 .L14_M2: andi M, MCO, 2 # nr=4,mr=2 beqz M,.L14_M1 nop .L20: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else dsll K,KK,1 + BASE_SHIFT # mr=2 dsll TEMP,KK,2 + BASE_SHIFT # nr=4 daddu A,A,K daddu B,BO,TEMP #endif LD a0,0*SIZE(A) MTC $0,t11 LD a1,1*SIZE(A) MOV t21,t11 LD b0,0*SIZE(B) MOV t12,t11 LD b1,1*SIZE(B) MOV t22,t11 LD b2,2*SIZE(B) MOV t13,t11 MOV t23,t11 LD b3,3*SIZE(B) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK #elif defined(LEFT) daddiu TEMP,KK,2 # left part,controlled by mr, mr=2 #else daddiu TEMP,KK,4 # right part,controlled by nr,nr=4 #endif dsra K,TEMP,2 MOV t14,t11 beqz K,.L25 MOV t24,t11 # clear 2*4=8 results registers #else move B,BO # Reset B LD a0,0*SIZE(A) MTC $0,t11 LD a1,1*SIZE(A) MOV t21,t11 LD b0,0*SIZE(B) MOV t12,t11 LD b1,1*SIZE(B) MOV t22,t11 dsra K,KCO,2 LD b2,2*SIZE(B) MOV t13,t11 MOV t23,t11 LD b3,3*SIZE(B) MOV t14,t11 beqz K,.L25 MOV t24,t11 #endif .L21: # nr=4,mr=2,kr=4 MADD t11,t11,a0,b0 LD a4,2*SIZE(A) MADD t21,t21,a1,b0 LD a5,3*SIZE(A) MADD t12,t12,a0,b1 LD b4,4*SIZE(B) MADD t22,t22,a1,b1 LD b5,5*SIZE(B) MADD t13,t13,a0,b2 LD b6,6*SIZE(B) MADD t23,t23,a1,b2 LD b7,7*SIZE(B) MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 MADD t11,t11,a4,b4 LD a2,4*SIZE(A) MADD t21,t21,a5,b4 LD a3,5*SIZE(A) MADD t12,t12,a4,b5 LD b0,8*SIZE(B) MADD t22,t22,a5,b5 LD b1,9*SIZE(B) MADD t13,t13,a4,b6 LD b2,10*SIZE(B) MADD t23,t23,a5,b6 LD b3,11*SIZE(B) MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 daddiu K,K,-1 MADD t11,t11,a2,b0 LD a6,6*SIZE(A) MADD t21,t21,a3,b0 LD a7,7*SIZE(A) MADD t12,t12,a2,b1 LD b4,12*SIZE(B) MADD t22,t22,a3,b1 LD b5,13*SIZE(B) MADD t13,t13,a2,b2 LD b6,14*SIZE(B) MADD t23,t23,a3,b2 LD b7,15*SIZE(B) MADD t14,t14,a2,b3 MADD t24,t24,a3,b3 daddu A,A,8*SIZE # 2mr*4kr daddu B,B,16*SIZE # 4nr*4kr MADD t11,t11,a6,b4 LD a0,0*SIZE(A) MADD t21,t21,a7,b4 LD a1,1*SIZE(A) MADD t12,t12,a6,b5 LD b0,0*SIZE(B) MADD t22,t22,a7,b5 LD b1,1*SIZE(B) MADD t13,t13,a6,b6 LD b2,2*SIZE(B) MADD t23,t23,a7,b6 LD b3,3*SIZE(B) MADD t14,t14,a6,b7 bnez K,.L21 MADD t24,t24,a7,b7 .L25: #ifndef TRMMKERNEL andi K,KCO,2 # kr=2 #else andi K,TEMP,2 #endif beqz K,.L28 nop .L26: MADD t11,t11,a0,b0 LD a4,2*SIZE(A) MADD t21,t21,a1,b0 LD a5,3*SIZE(A) MADD t12,t12,a0,b1 LD b4,4*SIZE(B) MADD t22,t22,a1,b1 LD b5,5*SIZE(B) MADD t13,t13,a0,b2 LD b6,6*SIZE(B) MADD t23,t23,a1,b2 LD b7,7*SIZE(B) MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 daddu A,A,4*SIZE # 2mr*2kr daddu B,B,8*SIZE # 4nr*2kr .L27: MADD t11,t11,a4,b4 LD a0,0*SIZE(A) MADD t21,t21,a5,b4 LD a1,1*SIZE(A) MADD t12,t12,a4,b5 LD b0,0*SIZE(B) MADD t22,t22,a5,b5 LD b1,1*SIZE(B) MADD t13,t13,a4,b6 LD b2,2*SIZE(B) MADD t23,t23,a5,b6 LD b3,3*SIZE(B) MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 .L28: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L29 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # 2mr*kr daddu B,B,4*SIZE # 4nr*kr MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 MADD 
t13,t13,a0,b2 MADD t23,t23,a1,b2 MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 .L29: # Write Back to C #ifndef TRMMKERNEL LD c11,0(CO1) # GEMM write back part LD c21,1*SIZE(CO1) LD c12,0(CO2) LD c22,1*SIZE(CO2) LD c13,0(CO3) MADD t11,c11,t11,ALPHA LD c23,1*SIZE(CO3) MADD t21,c21,t21,ALPHA LD c14,0(CO4) MADD t12,c12,t12,ALPHA LD c24,1*SIZE(CO4) MADD t22,c22,t22,ALPHA ST t11,0(CO1) MADD t13,c13,t13,ALPHA ST t21,1*SIZE(CO1) MADD t23,c23,t23,ALPHA ST t12,0(CO2) MADD t14,c14,t14,ALPHA ST t22,1*SIZE(CO2) MADD t24,c24,t24,ALPHA ST t13,0(CO3) daddu CO1,CO1,2*SIZE # COi += 2 ST t23,1*SIZE(CO3) daddu CO2,CO2,2*SIZE ST t14,0(CO4) daddu CO3,CO3,2*SIZE ST t24,1*SIZE(CO4) daddu CO4,CO4,2*SIZE FETCH $0,0(CO1) FETCH $0,0(CO2) FETCH $0,0(CO3) FETCH $0,0(CO4) #else MUL t11, ALPHA, t11 # TRMM write back part MUL t21, ALPHA, t21 ST t11, 0 * SIZE(CO1) MUL t12, ALPHA, t12 ST t21, 1 * SIZE(CO1) MUL t22, ALPHA, t22 ST t12, 0 * SIZE(CO2) MUL t13, ALPHA, t13 ST t22, 1 * SIZE(CO2) MUL t23, ALPHA, t23 ST t13, 0 * SIZE(CO3) MUL t14, ALPHA, t14 ST t23, 1 * SIZE(CO3) MUL t24, ALPHA, t24 ST t14, 0 * SIZE(CO4) ST t24, 1 * SIZE(CO4) daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE daddiu CO3,CO3, 2 * SIZE daddiu CO4,CO4, 2 * SIZE FETCH $0,0(CO1) FETCH $0,0(CO2) FETCH $0,0(CO3) FETCH $0,0(CO4) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP,KCO,KK #ifdef LEFT daddiu TEMP,TEMP,-2 #else daddiu TEMP,TEMP,-4 #endif dsll K,TEMP,1 + BASE_SHIFT dsll TEMP,TEMP,2 + BASE_SHIFT daddu A,A,K # move A to next panel Ai daddu B,B,TEMP # move B to next panel Bj #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif .align 3 .L14_M1: andi M,MCO,1 # mr=1 beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj nop .L30: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else dsll K,KK, BASE_SHIFT dsll TEMP,KK,2 + BASE_SHIFT daddu A,A,K daddu B,BO,TEMP #endif LD a0, 0 * SIZE(A) # a0 MTC $0,t11 LD b0,0*SIZE(B) MOV t12,t11 LD b1,1*SIZE(B) MOV t13,t11 LD b2,2*SIZE(B) MOV t14,t11 LD b3,3*SIZE(B) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 4 #endif dsra K,TEMP, 2 nop beqz K,.L35 nop #else move B,BO # Reset B, GEMM part dsra K,KCO,2 # K=KCO/2 LD a0, 0 * SIZE(A) # a0 MTC $0,t11 LD b0,0*SIZE(B) MOV t12,t11 LD b1,1*SIZE(B) MOV t13,t11 LD b2,2*SIZE(B) MOV t14,t11 beqz K,.L35 LD b3,3*SIZE(B) #endif .L31: # nr=4,mr=1,kr=4 LD a1, 1*SIZE(A) # load a1 MADD t11,t11,a0,b0 LD b4,4*SIZE(B) LD b5,5*SIZE(B) MADD t12,t12,a0,b1 LD b6,6*SIZE(B) LD b7,7*SIZE(B) MADD t13,t13,a0,b2 MADD t14,t14,a0,b3 LD a2, 2*SIZE(A) # a2 MADD t11,t11,a1,b4 LD b0,8*SIZE(B) LD b1,9*SIZE(B) MADD t12,t12,a1,b5 LD b2,10*SIZE(B) LD b3,11*SIZE(B) MADD t13,t13,a1,b6 MADD t14,t14,a1,b7 LD a3, 3*SIZE(A) # a3 MADD t11,t11,a2,b0 daddiu K,K,-1 LD b4,12*SIZE(B) LD b5,13*SIZE(B) MADD t12,t12,a2,b1 daddu A,A,4*SIZE # 1mr*4kr LD b6,14*SIZE(B) LD b7,15*SIZE(B) MADD t13,t13,a2,b2 MADD t14,t14,a2,b3 LD a0, 0*SIZE(A) # a0 daddu B,B,16*SIZE # 4nr*4kr MADD t11,t11,a3,b4 LD b0,0*SIZE(B) MADD t12,t12,a3,b5 LD b1,1*SIZE(B) MADD t13,t13,a3,b6 LD b2,2*SIZE(B) MADD t14,t14,a3,b7 bnez K,.L31 LD b3,3*SIZE(B) .L35: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L38 nop .L36: LD a1,1*SIZE(A) # load a1 MADD t11,t11,a0,b0 LD b4,4*SIZE(B) LD b5,5*SIZE(B) MADD t12,t12,a0,b1 daddu A,A,2*SIZE # mr*2kr LD b6,6*SIZE(B) MADD t13,t13,a0,b2 LD b7,7*SIZE(B) MADD t14,t14,a0,b3 daddu B,B,8*SIZE 
# 4nr*2kr .L37: LD a0,0(A) MADD t11,t11,a1,b4 LD b0,0*SIZE(B) LD b1,1*SIZE(B) MADD t12,t12,a1,b5 LD b2,2*SIZE(B) LD b3,3*SIZE(B) MADD t13,t13,a1,b6 MADD t14,t14,a1,b7 .L38: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L39 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t12,t12,a0,b1 daddu A,A,1*SIZE daddu B,B,4*SIZE MADD t13,t13,a0,b2 MADD t14,t14,a0,b3 .L39: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) LD c12,0(CO2) LD c13,0(CO3) LD c14,0(CO4) MADD t11,c11,t11,ALPHA MADD t12,c12,t12,ALPHA MADD t13,c13,t13,ALPHA MADD t14,c14,t14,ALPHA ST t11,0(CO1) ST t12,0(CO2) ST t13,0(CO3) ST t14,0(CO4) #else MUL t11, ALPHA, t11 MUL t12, ALPHA, t12 MUL t13, ALPHA, t13 MUL t14, ALPHA, t14 ST t11, 0 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t13, 0 * SIZE(CO3) ST t14, 0 * SIZE(CO4) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -4 #endif dsll K,TEMP, BASE_SHIFT dsll TEMP,TEMP, 2 + BASE_SHIFT daddu A,A,K daddu B,B,TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif #endif .align 3 .L0_N4_Loop: # mc finished daddiu N,N,-1 # N-- #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK,4 #endif bnez N,.L0_N4_Lb move BO,B # Set BO point to next panel Bj .align 5 .L0_N2: andi N,NCO,2 # nr = 2 beqz N,.L0_N1 nop .L0_N2_Lb: move CO1,C daddu CO2,C,LDC dsra M,MCO,2 move A,AO # Reset A daddu PREA,AO,SPANA daddu C,CO2,LDC #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif beqz M,.L12_M2 nop .L40: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else dsll K,KK, 2 + BASE_SHIFT dsll TEMP, KK,1 + BASE_SHIFT daddu A,A,K daddu B,BO,TEMP #endif LD a0,0*SIZE(A) MTC $0,t11 # gemm part LD a1,1*SIZE(A) MOV t21,t11 LD b0,0*SIZE(B) MOV t31,t11 LD b1,1*SIZE(B) MOV t41,t11 LD a2,2*SIZE(A) LD a3,3*SIZE(A) MOV t12,t11 MOV t22,t11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK #elif defined(LEFT) daddiu TEMP, KK, 4 #else daddiu TEMP, KK, 2 #endif dsra K,TEMP,2 MOV t32,t11 beqz K,.L45 MOV t42,t11 #else move B,BO # Reset B LD a0,0*SIZE(A) MTC $0,t11 # gemm part LD a1,1*SIZE(A) MOV t21,t11 LD b0,0*SIZE(B) MOV t31,t11 LD b1,1*SIZE(B) MOV t41,t11 LD a2,2*SIZE(A) dsra K,KCO,2 # K=KCO/2 LD a3,3*SIZE(A) MOV t12,t11 MOV t22,t11 MOV t32,t11 beqz K,.L45 MOV t42,t11 #endif .L41: # nr=2,mr=kr=4 MADD t11,t11,a0,b0 LD a4,4*SIZE(A) MADD t21,t21,a1,b0 LD a5,5*SIZE(A) MADD t12,t12,a0,b1 LD b4,2*SIZE(B) MADD t22,t22,a1,b1 LD b5,3*SIZE(B) MADD t31,t31,a2,b0 LD a6,6*SIZE(A) MADD t41,t41,a3,b0 LD a7,7*SIZE(A) FETCH $0,(PREA) MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 .L42: MADD t11,t11,a4,b4 LD a0,8*SIZE(A) MADD t21,t21,a5,b4 LD a1,9*SIZE(A) MADD t12,t12,a4,b5 LD b2,4*SIZE(B) MADD t22,t22,a5,b5 LD b3,5*SIZE(B) MADD t31,t31,a6,b4 LD a2,10*SIZE(A) MADD t41,t41,a7,b4 LD a3,11*SIZE(A) FETCH $0,4*SIZE(PREA) MADD t32,t32,a6,b5 MADD t42,t42,a7,b5 .L43: MADD t11,t11,a0,b2 LD a4,12*SIZE(A) MADD t21,t21,a1,b2 LD a5,13*SIZE(A) MADD t12,t12,a0,b3 LD b6,6*SIZE(B) MADD t22,t22,a1,b3 LD b7,7*SIZE(B) MADD t31,t31,a2,b2 LD a6,14*SIZE(A) MADD t41,t41,a3,b2 LD a7,15*SIZE(A) FETCH $0,8*SIZE(PREA) MADD t32,t32,a2,b3 MADD t42,t42,a3,b3 daddu A,A,16*SIZE # 4mr*4kr daddu B,B,8*SIZE # 2nr*4kr .L44: MADD t11,t11,a4,b6 LD a0,0*SIZE(A) MADD t21,t21,a5,b6 LD a1,1*SIZE(A) MADD t12,t12,a4,b7 LD b0,0*SIZE(B) MADD t22,t22,a5,b7 LD b1,1*SIZE(B) daddiu K,K,-1 daddu PREA,PREA,16*SIZE MADD t31,t31,a6,b6 LD a2,2*SIZE(A) MADD 
t41,t41,a7,b6 LD a3,3*SIZE(A) FETCH $0,-4*SIZE(PREA) MADD t32,t32,a6,b7 bnez K,.L41 MADD t42,t42,a7,b7 .L45: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L48 nop .L46: MADD t11,t11,a0,b0 LD a4,4*SIZE(A) MADD t21,t21,a1,b0 LD a5,5*SIZE(A) MADD t12,t12,a0,b1 LD b4,2*SIZE(B) MADD t22,t22,a1,b1 LD b5,3*SIZE(B) MADD t31,t31,a2,b0 LD a6,6*SIZE(A) MADD t41,t41,a3,b0 LD a7,7*SIZE(A) FETCH $0,0(PREA) MADD t32,t32,a2,b1 daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32 MADD t42,t42,a3,b1 daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE .L47: MADD t11,t11,a4,b4 LD a0,0*SIZE(A) MADD t21,t21,a5,b4 LD a1,1*SIZE(A) MADD t12,t12,a4,b5 LD b0,0*SIZE(B) MADD t22,t22,a5,b5 LD b1,1*SIZE(B) MADD t31,t31,a6,b4 LD a2,2*SIZE(A) MADD t41,t41,a7,b4 LD a3,3*SIZE(A) FETCH $0,4*SIZE(PREA) MADD t32,t32,a6,b5 MADD t42,t42,a7,b5 daddu PREA,PREA,8*SIZE .L48: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L49 LD ALPHA,152($sp) # Get ALPHA FETCH $0,0(PREA) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 daddu B,B,2*SIZE daddu PREA,PREA,4*SIZE MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 .L49: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # gemm write back part Fetch 16 C LD c21,1*SIZE(CO1) LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) LD c12,0(CO2) MADD t11,c11,t11,ALPHA LD c22,1*SIZE(CO2) MADD t21,c21,t21,ALPHA LD c32,2*SIZE(CO2) MADD t31,c31,t31,ALPHA LD c42,3*SIZE(CO2) MADD t41,c41,t41,ALPHA ST t11,0(CO1) MADD t12,c12,t12,ALPHA ST t21,1*SIZE(CO1) MADD t22,c22,t22,ALPHA ST t31,2*SIZE(CO1) MADD t32,c32,t32,ALPHA ST t41,3*SIZE(CO1) MADD t42,c42,t42,ALPHA daddiu M,M,-1 ST t12,0(CO2) ST t22,1*SIZE(CO2) ST t32,2*SIZE(CO2) ST t42,3*SIZE(CO2) FETCH $0,4*SIZE(CO1) FETCH $0,4*SIZE(CO2) FETCH $0,8*SIZE(CO1) FETCH $0,8*SIZE(CO2) daddu CO1,CO1,4*SIZE bnez M,.L40 daddu CO2,CO2,4*SIZE #else MUL t11, ALPHA, t11 MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 MUL t41, ALPHA, t41 MUL t12, ALPHA, t12 ST t11, 0 * SIZE(CO1) MUL t22, ALPHA, t22 ST t21, 1 * SIZE(CO1) MUL t32, ALPHA, t32 ST t31, 2 * SIZE(CO1) MUL t42, ALPHA, t42 ST t41, 3 * SIZE(CO1) ST t12, 0 * SIZE(CO2) daddiu M,M,-1 ST t22, 1 * SIZE(CO2) ST t32, 2 * SIZE(CO2) ST t42, 3 * SIZE(CO2) daddiu CO1,CO1, 4*SIZE daddiu CO2,CO2, 4*SIZE FETCH $0,0(CO1) FETCH $0,0(CO2) FETCH $0,4(CO1) FETCH $0,4(CO2) #if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -4 #else daddiu TEMP, TEMP, -2 #endif dsll K,TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu A,A,K daddu B,B,TEMP #endif #ifdef LEFT daddiu KK, KK, 4 #endif bnez M,.L40 nop #endif .align 3 .L12_M2: andi M,MCO,2 # mr = 2 beqz M,.L12_M1 nop .L50: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO #else dsll K, KK, 1 + BASE_SHIFT #mr=2 dsll TEMP, KK, 1 + BASE_SHIFT #nr=2 daddu A, A, K daddu B, BO, TEMP #endif LD a0,0*SIZE(A) LD a1,1*SIZE(A) MTC $0,t11 LD b0,0*SIZE(B) MOV t21,t11 LD b1,1*SIZE(B) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 2 #else daddiu TEMP, KK, 2 #endif dsra K,TEMP,2 MOV t12,t11 beqz K,.L55 MOV t22,t11 #else move B,BO LD a0,0*SIZE(A) dsra K,KCO,2 # K=KCO/2 LD a1,1*SIZE(A) MTC $0,t11 LD b0,0*SIZE(B) MOV t21,t11 LD b1,1*SIZE(B) MOV t12,t11 beqz K,.L55 MOV t22,t11 #endif .L51: # nr=2 mr=2,kr=4 MADD t11,t11,a0,b0 LD a4,2*SIZE(A) MADD t21,t21,a1,b0 LD 
b4,2*SIZE(B) MADD t12,t12,a0,b1 LD a5,3*SIZE(A) MADD t22,t22,a1,b1 LD b5,3*SIZE(B) MADD t11,t11,a4,b4 LD a2,4*SIZE(A) MADD t21,t21,a5,b4 LD b2,4*SIZE(B) MADD t12,t12,a4,b5 LD a3,5*SIZE(A) MADD t22,t22,a5,b5 daddiu K,K,-1 LD b3,5*SIZE(B) MADD t11,t11,a2,b2 LD a6,6*SIZE(A) MADD t21,t21,a3,b2 daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE LD b6,6*SIZE(B) MADD t12,t12,a2,b3 daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=16*SIZE LD a7,-1*SIZE(A) MADD t22,t22,a3,b3 LD b7,-1*SIZE(B) MADD t11,t11,a6,b6 LD a0,0*SIZE(A) MADD t21,t21,a7,b6 LD b0,0*SIZE(B) MADD t12,t12,a6,b7 LD a1,1*SIZE(A) MADD t22,t22,a7,b7 bnez K,.L51 LD b1,1*SIZE(B) .L55: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L58 nop .L56: MADD t11,t11,a0,b0 LD a4,2*SIZE(A) MADD t21,t21,a1,b0 daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 LD b4,2*SIZE(B) MADD t12,t12,a0,b1 daddu B,B,4*SIZE # 2nr*2kr LD a5,-1*SIZE(A) MADD t22,t22,a1,b1 LD b5,-1*SIZE(B) .L57: MADD t11,t11,a4,b4 LD a0,0*SIZE(A) MADD t21,t21,a5,b4 LD b0,0*SIZE(B) MADD t12,t12,a4,b5 LD a1,1*SIZE(A) MADD t22,t22,a5,b5 LD b1,1*SIZE(B) .L58: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP, 1 #endif beqz K,.L59 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 daddu B,B,2*SIZE # 2nr*kr MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 .L59: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # write gemm part back Fetch 16 C LD c21,1*SIZE(CO1) LD c12,0(CO2) LD c22,1*SIZE(CO2) MADD t11,c11,t11,ALPHA MADD t21,c21,t21,ALPHA MADD t12,c12,t12,ALPHA MADD t22,c22,t22,ALPHA ST t11,0(CO1) ST t21,1*SIZE(CO1) ST t12,0(CO2) ST t22,1*SIZE(CO2) daddu CO1,CO1,2*SIZE daddu CO2,CO2,2*SIZE FETCH $0,0(CO1) FETCH $0,0(CO2) #else daddiu M, M, -1 daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE MUL t11, ALPHA, t11 MUL t21, ALPHA, t21 MUL t12, ALPHA, t12 MUL t22, ALPHA, t22 ST t11, -2 * SIZE(CO1) ST t21, -1 * SIZE(CO1) ST t12, -2 * SIZE(CO2) ST t22, -1 * SIZE(CO2) FETCH $0,0(CO1) FETCH $0,0(CO2) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -2 #endif dsll K, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu A, A, K daddu B, B, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif .align 3 .L12_M1: andi M,MCO,1 # mr = 1 beqz M,.L0_N2_Loop nop .L60: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else dsll K, KK, 0 + BASE_SHIFT dsll TEMP, KK, 1 + BASE_SHIFT daddu A, A, K daddu B, BO, TEMP #endif LD a0,0*SIZE(A) MTC $0,t11 MOV t21,t11 LD b0,0*SIZE(B) MOV t12,t11 LD b1,1*SIZE(B) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 2 #endif dsra K,TEMP,2 MOV t22,t11 beqz K,.L65 nop #else dsra K,KCO,2 move B,BO # Reset B LD a0,0*SIZE(A) MTC $0,t11 MOV t21,t11 LD b0,0*SIZE(B) MOV t12,t11 LD b1,1*SIZE(B) beqz K,.L65 MOV t22,t11 #endif .L61: # nr=2,mr=1,kr=4 LD a4, 1*SIZE(A) # a2 LD b4, 2*SIZE(B) MADD t11,t11,a0,b0 LD b5,3*SIZE(B) MADD t12,t12,a0,b1 LD a2, 2*SIZE(A) # a3 LD b2,4*SIZE(B) MADD t11,t11,a4,b4 LD b3,5*SIZE(B) MADD t12,t12,a4,b5 LD a6, 3*SIZE(A) # a4 daddiu K,K,-1 LD b6,6*SIZE(B) MADD t11,t11,a2,b2 LD b7,7*SIZE(B) MADD t12,t12,a2,b3 daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 LD a0, 0*SIZE(A) daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE LD b0,0*SIZE(B) MADD t11,t11,a6,b6 LD b1,1*SIZE(B) bnez K,.L61 MADD t12,t12,a6,b7 .L65: # kr=2 
#ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L68 nop .L66: LD a4, 1*SIZE(A) # a1 MADD t11,t11,a0,b0 LD b4,2*SIZE(B) daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 LD b5,3*SIZE(B) MADD t12,t12,a0,b1 daddu B,B,4*SIZE .L67: LD a0,0(A) # a0 LD b0,0*SIZE(B) MADD t11,t11,a4,b4 LD b1,1*SIZE(B) MADD t12,t12,a4,b5 .L68: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L69 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t12,t12,a0,b1 daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16 daddu B,B,2*SIZE .L69: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C LD c12,0(CO2) MADD t11,c11,t11,ALPHA MADD t12,c12,t12,ALPHA ST t11,0(CO1) ST t12,0(CO2) daddu CO1,CO1,1*SIZE daddu CO2,CO2,1*SIZE #else MUL t11, ALPHA, t11 MUL t12, ALPHA, t12 ST t11, 0 * SIZE(CO1) ST t12, 0 * SIZE(CO2) daddu CO1,CO1,1*SIZE daddu CO2,CO2,1*SIZE #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -2 #endif dsll K, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu A, A, K daddu B, B, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif #endif .L0_N2_Loop: #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK, 2 #endif move BO, B .align 5 .L0_N1: andi N,NCO,1 # nr = 1 beqz N,.L999 nop move CO1,C dsra M,MCO,2 move A,AO # Reset A daddu PREA,AO,SPANA #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif beqz M,.L11_M2 daddu C,CO1,LDC .L70: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO # Reset B #else dsll K, KK, 2 + BASE_SHIFT dsll TEMP, KK, 0 + BASE_SHIFT daddu A, A, K daddu B, BO, TEMP #endif LD b0, 0*SIZE(B) MTC $0,t11 LD a0,0*SIZE(A) MOV t21,t11 LD a1,1*SIZE(A) MOV t31,t11 LD a2,2*SIZE(A) MOV t41,t11 LD a3,3*SIZE(A) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 4 #else daddiu TEMP, KK, 1 #endif dsra K,TEMP,2 beqz K,.L75 nop #else move B, BO # Reset B dsra K,KCO,2 LD b0, 0*SIZE(B) MTC $0,t11 LD a0,0*SIZE(A) MOV t21,t11 LD a1,1*SIZE(A) MOV t31,t11 LD a2,2*SIZE(A) MOV t41,t11 beqz K,.L75 LD a3,3*SIZE(A) #endif .L71: # nr=1,mr=kr=4 LD b4, 1*SIZE(B) # b1 MADD t11,t11,a0,b0 LD a4, 4*SIZE(A) MADD t21,t21,a1,b0 LD a5, 5*SIZE(A) FETCH $0,(PREA) LD a6,6*SIZE(A) MADD t31,t31,a2,b0 LD a7,7*SIZE(A) MADD t41,t41,a3,b0 .L72: LD b2, 2*SIZE(B) # b2 MADD t11,t11,a4,b4 LD a0,8*SIZE(A) MADD t21,t21,a5,b4 LD a1,9*SIZE(A) FETCH $0,4*SIZE(PREA) LD a2,10*SIZE(A) MADD t31,t31,a6,b4 LD a3,11*SIZE(A) MADD t41,t41,a7,b4 .L73: LD b6, 3*SIZE(B) MADD t11,t11,a0,b2 LD a4,12*SIZE(A) daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 LD a5,13*SIZE(A) MADD t21,t21,a1,b2 LD a6,14*SIZE(A) FETCH $0,8*SIZE(PREA) MADD t31,t31,a2,b2 LD a7,15*SIZE(A) MADD t41,t41,a3,b2 daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE .L74: LD b0, 0*SIZE(B) MADD t11,t11,a4,b6 LD a0,0*SIZE(A) daddu PREA,PREA,16*SIZE LD a1,1*SIZE(A) MADD t21,t21,a5,b6 LD a2,2*SIZE(A) daddiu K,K,-1 MADD t31,t31,a6,b6 LD a3,3*SIZE(A) MADD t41,t41,a7,b6 bnez K,.L71 FETCH $0,-32(PREA) .L75: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L78 nop .L76: LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 LD a4,4*SIZE(A) daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32 LD a5,5*SIZE(A) MADD t21,t21,a1,b0 FETCH $0,0(PREA) LD a6,6*SIZE(A) MADD t31,t31,a2,b0 LD a7,7*SIZE(A) MADD t41,t41,a3,b0 daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE .L77: LD b0,0(B) MADD t11,t11,a4,b4 LD a0,0*SIZE(A) MADD 
t21,t21,a5,b4 FETCH $0,4*SIZE(PREA) LD a1,1*SIZE(A) MADD t31,t31,a6,b4 LD a2,2*SIZE(A) MADD t41,t41,a7,b4 LD a3,3*SIZE(A) daddu PREA,PREA,8*SIZE .L78: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L79 LD ALPHA,152($sp) # Get ALPHA FETCH $0,0(PREA) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 daddu B,B,1*SIZE daddu PREA,PREA,4*SIZE .L79: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C LD c21,1*SIZE(CO1) LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) MADD t11,c11,t11,ALPHA MADD t21,c21,t21,ALPHA MADD t31,c31,t31,ALPHA MADD t41,c41,t41,ALPHA ST t11,0(CO1) ST t21,1*SIZE(CO1) ST t31,2*SIZE(CO1) ST t41,3*SIZE(CO1) daddiu M,M,-1 # M-- FETCH $0,4*SIZE(CO1) FETCH $0,8*SIZE(CO1) bnez M,.L70 # M!=0 daddu CO1,CO1,4*SIZE # COx += 4*8Byte #else daddiu M,M,-1 # M-- MUL t11, ALPHA, t11 MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 MUL t41, ALPHA, t41 ST t11,0(CO1) ST t21,1*SIZE(CO1) ST t31,2*SIZE(CO1) ST t41,3*SIZE(CO1) FETCH $0,4*SIZE(CO1) FETCH $0,8*SIZE(CO1) daddu CO1,CO1,4*SIZE #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -4 #else daddiu TEMP, TEMP, -1 #endif dsll K, TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, 0 + BASE_SHIFT daddu A, A,K daddu B, B, TEMP #endif #ifdef LEFT daddiu KK, KK, 4 #endif bnez M,.L70 nop #endif .align 3 .L11_M2: andi M,MCO,2 # mr = 2 beqz M,.L11_M1 nop .L80: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO #else dsll K, KK, 1 + BASE_SHIFT dsll TEMP, KK, 0 + BASE_SHIFT daddu A, A, K daddu B, BO, TEMP #endif LD b0, 0*SIZE(B) MTC $0,t11 MOV t21,t11 LD a0,0*SIZE(A) LD a1,1*SIZE(A) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 2 #else daddiu TEMP, KK, 1 #endif dsra K,TEMP,2 # K=KCO/2 beqz K,.L85 nop #else move B, BO dsra K,KCO,2 LD b0, 0*SIZE(B) MTC $0,t11 MOV t21,t11 LD a0,0*SIZE(A) beqz K,.L85 LD a1,1*SIZE(A) #endif .L81: # nr=1,mr=2,kr=4 LD b4, 1*SIZE(B) LD a4,2*SIZE(A) MADD t11,t11,a0,b0 LD a5,3*SIZE(A) MADD t21,t21,a1,b0 LD b2, 2*SIZE(B) LD a2,4*SIZE(A) MADD t11,t11,a4,b4 LD a3,5*SIZE(A) MADD t21,t21,a5,b4 LD b6, 3*SIZE(B) LD a6,6*SIZE(A) MADD t11,t11,a2,b2 LD a7,7*SIZE(A) MADD t21,t21,a3,b2 daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 LD b0, 0*SIZE(B) daddiu K,K,-1 LD a0,0*SIZE(A) MADD t11,t11,a6,b6 LD a1,1*SIZE(A) bnez K,.L81 MADD t21,t21,a7,b6 .L85: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L88 nop .L86: LD b4, 1*SIZE(B) LD a4,2*SIZE(A) MADD t11,t11,a0,b0 LD a5,3*SIZE(A) MADD t21,t21,a1,b0 daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 LD b0,0(B) LD a0,0*SIZE(A) MADD t11,t11,a4,b4 LD a1,1*SIZE(A) MADD t21,t21,a5,b4 .L88: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L89 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 daddu B,B,1*SIZE .L89: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C LD c21,1*SIZE(CO1) MADD t11,c11,t11,ALPHA MADD t21,c21,t21,ALPHA ST t11,0(CO1) ST t21,1*SIZE(CO1) FETCH $0,2*SIZE(CO1) daddu CO1,CO1,2*SIZE # COx += 2*8Byte #else daddu CO1,CO1,2*SIZE # COx += 2*8Byte MUL t11, ALPHA, t11 MUL t21, ALPHA, t21 FETCH $0,0(CO1) ST t11, -2 * SIZE(CO1) ST t21, -1 * SIZE(CO1) #if ( defined(LEFT) && defined(TRANSA)) || 
(!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -1 #endif dsll K, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 0 + BASE_SHIFT daddu A, A, K daddu B, B, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif .align 3 .L11_M1: andi M,MCO,1 # mr = 1 beqz M,.L999 nop .L90: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO #else dsll K, KK, 0 + BASE_SHIFT dsll TEMP, KK, 0 + BASE_SHIFT daddu A, A, K daddu B, BO, TEMP #endif LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) MTC $0,t11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 1 #endif dsra K, TEMP, 2 beqz K,.L95 nop #else move B, BO LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) dsra K,KCO,2 beqz K,.L95 MTC $0,t11 #endif .L91: # nr=mr=1,kr=4 LD a4, 1*SIZE(A) LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 LD a2, 2*SIZE(A) LD b2, 2*SIZE(B) MADD t11,t11,a4,b4 LD a6, 3*SIZE(A) LD b6, 3*SIZE(B) MADD t11,t11,a2,b2 daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) MADD t11,t11,a6,b6 daddiu K,K,-1 bnez K,.L91 nop .L95: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L98 nop .L96: LD a4, 1*SIZE(A) LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=32 LD b0,0(B) LD a0,0(A) MADD t11,t11,a4,b4 .L98: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L99 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 .L99: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C MADD t11,c11,t11,ALPHA ST t11,0(CO1) #else MUL t11, ALPHA, t11 ST t11, 0 * SIZE(CO1) #endif .L999: # End ld $16, 0($sp) ld $17, 8($sp) ld $18, 16($sp) ld $19, 24($sp) ld $20, 32($sp) ld $21, 40($sp) ld $22, 48($sp) LD $f24, 56($sp) LD $f25, 64($sp) LD $f26, 72($sp) LD $f27, 80($sp) LD $f28, 88($sp) ld $23, 96($sp) ld $24, 104($sp) ld $25, 112($sp) LD $f20,120($sp) LD $f21,128($sp) LD $f22,136($sp) LD $f23,144($sp) j $31 daddiu $sp, $sp, 160 EPILOGUE OpenBLAS-0.2.20/kernel/mips64/dnrm2.S000066400000000000000000000163441313527062700167240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define XX $7 #define I $2 #define TEMP $3 #define a1 $f4 #define a2 $f5 #define a3 $f6 #define a4 $f7 #define a5 $f8 #define a6 $f9 #define a7 $f10 #define a8 $f11 #define t1 $f12 #define t2 $f13 #define t3 $f14 #define t4 $f15 #define s1 $f0 #define s2 $f1 #define s3 $f2 #define s4 $f3 #define ALPHA $f16 #define max $f17 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif blez N, .L999 MTC $0, s1 blez INCX, .L999 dsll INCX, INCX, BASE_SHIFT move XX, X NOP LD a1, 0 * SIZE(X) daddiu N, N, -1 daddu X, X, INCX FABS s1, a1 blez N, .L999 FABS s2, a1 FABS s3, a1 dsra I, N, 3 blez I, .L15 FABS s4, a1 LD a1, 0 * SIZE(X) daddu X, X, INCX LD a2, 0 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) daddu X, X, INCX LD a4, 0 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) daddu X, X, INCX LD a6, 0 * SIZE(X) daddu X, X, INCX LD a7, 0 * SIZE(X) daddu X, X, INCX LD a8, 0 * SIZE(X) daddiu I, I, -1 blez I, .L13 daddu X, X, INCX .align 3 .L12: FABS t1, a1 LD a1, 0 * SIZE(X) FABS t2, a2 daddu X, X, INCX FABS t3, a3 LD a2, 0 * SIZE(X) FABS t4, a4 daddu X, X, INCX CMPLT $fcc0, s1, t1 LD a3, 0 * SIZE(X) CMPLT $fcc1, s2, t2 daddu X, X, INCX CMPLT $fcc2, s3, t3 LD a4, 0 * SIZE(X) CMPLT $fcc3, s4, t4 daddu X, X, INCX CMOVT s1, t1, $fcc0 CMOVT s2, t2, $fcc1 CMOVT s3, t3, $fcc2 CMOVT s4, t4, $fcc3 FABS t1, a5 LD a5, 0 * SIZE(X) FABS t2, a6 daddu X, X, INCX FABS t3, a7 LD a6, 0 * SIZE(X) FABS t4, a8 daddu X, X, INCX CMPLT $fcc0, s1, t1 LD a7, 0 * SIZE(X) CMPLT $fcc1, s2, t2 daddu X, X, INCX CMPLT $fcc2, s3, t3 LD a8, 0 * SIZE(X) CMPLT $fcc3, s4, t4 daddu X, X, INCX CMOVT s1, t1, $fcc0 daddiu I, I, -1 CMOVT s2, t2, $fcc1 CMOVT s3, t3, $fcc2 bgtz I, .L12 CMOVT s4, t4, $fcc3 .align 3 .L13: FABS t1, a1 FABS t2, a2 FABS t3, a3 FABS t4, a4 CMPLT $fcc0, s1, t1 CMPLT $fcc1, s2, t2 CMPLT $fcc2, s3, t3 CMPLT $fcc3, s4, t4 CMOVT s1, t1, $fcc0 CMOVT s2, t2, $fcc1 CMOVT s3, t3, $fcc2 CMOVT s4, t4, $fcc3 FABS t1, a5 FABS t2, a6 FABS t3, a7 FABS t4, a8 CMPLT $fcc0, s1, t1 CMPLT $fcc1, s2, t2 CMPLT $fcc2, s3, t3 CMPLT $fcc3, s4, t4 CMOVT s1, t1, $fcc0 CMOVT s2, t2, $fcc1 CMOVT s3, t3, $fcc2 CMOVT s4, t4, $fcc3 .align 3 .L15: andi I, N, 7 blez I, .L100 NOP .align 3 .L16: LD a1, 0 * SIZE(X) daddiu I, I, -1 FABS t1, a1 CMPLT $fcc0, s1, t1 CMOVT s1, t1, $fcc0 bgtz I, .L16 daddu X, X, INCX .align 3 .L100: CMPLT $fcc0, s1, s2 CMPLT $fcc1, s3, s4 CMOVT s1, s2, $fcc0 CMOVT s3, s4, $fcc1 CMPLT $fcc0, s1, s3 CMOVT s1, s3, $fcc0 daddiu N, N, 1 lui TEMP, 0x3f80 dmtc1 $0, a1 mtc1 TEMP, ALPHA CMPEQ $fcc0, s1, a1 bc1t $fcc0, .L999 cvt.d.s ALPHA, ALPHA div.d 
ALPHA, ALPHA, s1 MOV max, s1 MOV s1, a1 MOV s2, a1 MOV s3, a1 MOV s4, a1 dsra I, N, 3 blez I, .L105 NOP LD a1, 0 * SIZE(XX) daddu XX, XX, INCX LD a2, 0 * SIZE(XX) daddu XX, XX, INCX LD a3, 0 * SIZE(XX) daddu XX, XX, INCX LD a4, 0 * SIZE(XX) daddu XX, XX, INCX LD a5, 0 * SIZE(XX) daddu XX, XX, INCX LD a6, 0 * SIZE(XX) daddu XX, XX, INCX LD a7, 0 * SIZE(XX) daddu XX, XX, INCX LD a8, 0 * SIZE(XX) daddiu I, I, -1 blez I, .L104 daddu XX, XX, INCX .align 3 .L103: MUL t1, ALPHA, a1 LD a1, 0 * SIZE(XX) MUL t2, ALPHA, a2 daddu XX, XX, INCX MUL t3, ALPHA, a3 LD a2, 0 * SIZE(XX) MUL t4, ALPHA, a4 daddu XX, XX, INCX MADD s1, s1, t1, t1 LD a3, 0 * SIZE(XX) MADD s2, s2, t2, t2 daddu XX, XX, INCX MADD s3, s3, t3, t3 LD a4, 0 * SIZE(XX) MADD s4, s4, t4, t4 daddu XX, XX, INCX MUL t1, ALPHA, a5 LD a5, 0 * SIZE(XX) MUL t2, ALPHA, a6 daddu XX, XX, INCX MUL t3, ALPHA, a7 LD a6, 0 * SIZE(XX) MUL t4, ALPHA, a8 daddu XX, XX, INCX MADD s1, s1, t1, t1 LD a7, 0 * SIZE(XX) MADD s2, s2, t2, t2 daddu XX, XX, INCX MADD s3, s3, t3, t3 LD a8, 0 * SIZE(XX) MADD s4, s4, t4, t4 daddiu I, I, -1 bgtz I, .L103 daddu XX, XX, INCX .align 3 .L104: MUL t1, ALPHA, a1 MUL t2, ALPHA, a2 MUL t3, ALPHA, a3 MUL t4, ALPHA, a4 MADD s1, s1, t1, t1 MADD s2, s2, t2, t2 MADD s3, s3, t3, t3 MADD s4, s4, t4, t4 MUL t1, ALPHA, a5 MUL t2, ALPHA, a6 MUL t3, ALPHA, a7 MUL t4, ALPHA, a8 MADD s1, s1, t1, t1 MADD s2, s2, t2, t2 MADD s3, s3, t3, t3 MADD s4, s4, t4, t4 .align 3 .L105: andi I, N, 7 blez I, .L998 NOP .align 3 .L106: LD a1, 0 * SIZE(XX) daddiu I, I, -1 MUL t1, ALPHA, a1 daddu XX, XX, INCX bgtz I, .L106 MADD s1, s1, t1, t1 .align 3 .L998: ADD s1, s1, s2 ADD s3, s3, s4 ADD s1, s1, s3 sqrt.d s1, s1 j $31 MUL s1, max, s1 .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/dot.S000066400000000000000000000137211313527062700164640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define Y $7 #define INCY $8 #define I $2 #define TEMP $3 #define a1 $f2 #define a2 $f3 #define a3 $f4 #define a4 $f5 #define b1 $f6 #define b2 $f7 #define b3 $f8 #define b4 $f9 #define s1 $f0 #define s2 $f1 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) LDINT INCY, 0(INCY) #endif MTC $0, s1 MTC $0, s2 dsll INCX, INCX, BASE_SHIFT li TEMP, SIZE blez N, .L999 dsll INCY, INCY, BASE_SHIFT bne INCX, TEMP, .L20 dsra I, N, 3 bne INCY, TEMP, .L20 NOP blez I, .L15 NOP LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) LD a2, 1 * SIZE(X) LD b2, 1 * SIZE(Y) LD a3, 2 * SIZE(X) LD b3, 2 * SIZE(Y) LD a4, 3 * SIZE(X) daddiu I, I, -1 blez I, .L13 LD b4, 3 * SIZE(Y) .align 3 .L12: MADD s1, s1, a1, b1 LD a1, 4 * SIZE(X) LD b1, 4 * SIZE(Y) MADD s2, s2, a2, b2 LD a2, 5 * SIZE(X) LD b2, 5 * SIZE(Y) MADD s1, s1, a3, b3 LD a3, 6 * SIZE(X) LD b3, 6 * SIZE(Y) MADD s2, s2, a4, b4 LD a4, 7 * SIZE(X) LD b4, 7 * SIZE(Y) MADD s1, s1, a1, b1 LD a1, 8 * SIZE(X) LD b1, 8 * SIZE(Y) MADD s2, s2, a2, b2 LD a2, 9 * SIZE(X) LD b2, 9 * SIZE(Y) MADD s1, s1, a3, b3 LD a3, 10 * SIZE(X) LD b3, 10 * SIZE(Y) MADD s2, s2, a4, b4 LD a4, 11 * SIZE(X) LD b4, 11 * SIZE(Y) daddiu I, I, -1 daddiu X, X, 8 * SIZE bgtz I, .L12 daddiu Y, Y, 8 * SIZE .align 3 .L13: MADD s1, s1, a1, b1 LD a1, 4 * SIZE(X) LD b1, 4 * SIZE(Y) MADD s2, s2, a2, b2 LD a2, 5 * SIZE(X) LD b2, 5 * SIZE(Y) MADD s1, s1, a3, b3 LD a3, 6 * SIZE(X) LD b3, 6 * SIZE(Y) MADD s2, s2, a4, b4 LD a4, 7 * SIZE(X) LD b4, 7 * SIZE(Y) MADD s1, s1, a1, b1 daddiu X, X, 8 * SIZE MADD s2, s2, a2, b2 daddiu Y, Y, 8 * SIZE MADD s1, s1, a3, b3 MADD s2, s2, a4, b4 .align 3 .L15: andi I, N, 7 blez I, .L999 NOP .align 3 .L16: LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) MADD s1, s1, a1, b1 daddiu I, I, -1 daddiu X, X, SIZE daddiu Y, Y, SIZE bgtz I, .L16 NOP j .L999 NOP .align 3 .L20: #ifdef F_INTERFACE bgez INCX, .L21 daddiu TEMP, N, -1 mult TEMP, INCX mflo TEMP dsub X, X, TEMP .align 3 .L21: bgez INCY, .L22 daddiu TEMP, N, -1 mult TEMP, INCY mflo TEMP dsub Y, Y, TEMP .align 3 .L22: #endif blez I, .L25 NOP .align 3 .L23: LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY MADD s1, s1, a1, b1 LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY MADD s2, s2, a1, b1 LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY MADD s1, s1, a1, b1 LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY MADD s2, s2, a1, b1 LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY MADD s1, s1, a1, b1 LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY MADD s2, s2, a1, b1 LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY MADD s1, s1, a1, b1 LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY daddiu I, I, -1 bgtz I, .L23 MADD s2, s2, a1, b1 .align 3 .L25: andi I, N, 7 blez I, .L999 NOP .align 3 .L26: LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY daddiu I, I, -1 bgtz I, .L26 MADD s1, s1, a1, b1 .align 3 .L999: ADD s1, s1, s2 #ifdef DSDOT cvt.d.s s1, s1 #endif j $31 NOP EPILOGUE 
OpenBLAS-0.2.20/kernel/mips64/gemm_beta.S000066400000000000000000000111071313527062700176120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M $4 #define N $5 #define C $6 #define LDC $7 #define I $2 #define J $3 #define CO1 $8 #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define b1 $f4 #define b2 $f5 #define b3 $f6 #define b4 $f7 #define FZERO $f8 #define ALPHA $f15 PROLOGUE LDARG C, 0($sp) MTC $0, FZERO LDARG LDC, 8($sp) dsll LDC, LDC, BASE_SHIFT move J, N blez J, .L999 nop .align 3 .L10: move CO1, C dsra I, M, 3 blez I, .L15 daddu C, C, LDC LD a1, 0 * SIZE(CO1) LD a2, 1 * SIZE(CO1) LD a3, 2 * SIZE(CO1) LD a4, 3 * SIZE(CO1) MUL b1, ALPHA, a1 LD a1, 4 * SIZE(CO1) MUL b2, ALPHA, a2 daddiu I, I, -1 blez I, .L13 LD a2, 5 * SIZE(CO1) .align 3 .L12: MUL b3, ALPHA, a3 LD a3, 6 * SIZE(CO1) ST b1, 0 * SIZE(CO1) MUL b4, ALPHA, a4 LD a4, 7 * SIZE(CO1) ST b2, 1 * SIZE(CO1) MUL b1, ALPHA, a1 LD a1, 8 * SIZE(CO1) ST b3, 2 * SIZE(CO1) MUL b2, ALPHA, a2 LD a2, 9 * SIZE(CO1) ST b4, 3 * SIZE(CO1) MUL b3, ALPHA, a3 LD a3, 10 * SIZE(CO1) ST b1, 4 * SIZE(CO1) MUL b4, ALPHA, a4 LD a4, 11 * SIZE(CO1) ST b2, 5 * SIZE(CO1) MUL b1, ALPHA, a1 LD a1, 12 * SIZE(CO1) ST b3, 6 * SIZE(CO1) MUL b2, ALPHA, a2 LD a2, 13 * SIZE(CO1) ST b4, 7 * SIZE(CO1) daddiu I, I, -1 bgtz I, .L12 daddiu CO1, CO1, 8 * SIZE .align 3 .L13: MUL b3, ALPHA, a3 LD a3, 6 * SIZE(CO1) ST b1, 0 * SIZE(CO1) MUL b4, ALPHA, a4 LD a4, 7 * SIZE(CO1) ST b2, 1 * SIZE(CO1) MUL b1, ALPHA, a1 ST b3, 2 * SIZE(CO1) MUL b2, ALPHA, a2 ST b4, 3 * SIZE(CO1) MUL b3, ALPHA, a3 ST b1, 4 * SIZE(CO1) MUL b4, ALPHA, a4 ST b2, 5 * SIZE(CO1) ST b3, 6 * SIZE(CO1) ST b4, 7 * SIZE(CO1) daddiu CO1, CO1, 8 * SIZE .align 3 .L15: andi I, M, 7 daddiu J, J, -1 blez I, .L18 NOP .align 3 .L16: LD a1, 0 * SIZE(CO1) daddiu I, I, -1 MUL b1, ALPHA, a1 daddiu CO1, CO1, 1 * SIZE bgtz I, .L16 ST b1, -1 * SIZE(CO1) .align 3 .L18: bgtz J, .L10 NOP .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/gemm_kernel.S000066400000000000000000001154121313527062700201630ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M $4 #define N $5 #define K $6 #define A $8 #define B $9 #define C $10 #define LDC $11 #define AO $12 #define BO $13 #define I $2 #define J $3 #define L $7 #define PREFETCHSIZE (4 * 10) #define CO1 $14 #define CO2 $15 #define CO3 $16 #define CO4 $17 #define CO5 $18 #define CO6 $19 #define CO7 $20 #define CO8 $21 #define BB $22 #if defined(TRMMKERNEL) #define OFFSET $23 #define KK $24 #define TEMP $25 #endif #define a1 $f0 #define a2 $f1 #define a3 $f27 #define a4 $f28 #define b1 $f2 #define b2 $f3 #define b3 $f4 #define b4 $f5 #define b5 $f6 #define b6 $f7 #define b7 $f8 #define b8 $f9 #define a5 b8 #define c11 $f10 #define c12 $f11 #define c21 $f12 #define c22 $f13 #define c31 $f14 #define c32 $f16 #define c41 $f17 #define c42 $f18 #define c51 $f19 #define c52 $f20 #define c61 $f21 #define c62 $f22 #define c71 $f23 #define c72 $f24 #define c81 $f25 #define c82 $f26 #define ALPHA $f15 PROLOGUE daddiu $sp, $sp, -160 SDARG $16, 0($sp) SDARG $17, 8($sp) SDARG $18, 16($sp) SDARG $19, 24($sp) SDARG $20, 32($sp) SDARG $21, 40($sp) SDARG $22, 48($sp) sdc1 $f24, 56($sp) sdc1 $f25, 64($sp) sdc1 $f26, 72($sp) sdc1 $f27, 80($sp) sdc1 $f28, 88($sp) #if defined(TRMMKERNEL) SDARG $23, 96($sp) SDARG $24, 104($sp) SDARG $25, 112($sp) LDARG OFFSET, 160($sp) #endif #ifndef __64BIT__ sdc1 $f20,120($sp) sdc1 $f21,128($sp) sdc1 $f22,136($sp) sdc1 $f23,144($sp) #endif dsll LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif dsra J, N, 3 blez J, .L30 nop .L10: move CO1, C MTC $0, c11 daddu CO2, C, LDC move AO, A daddu CO3, CO2, LDC daddiu J, J, -1 daddu CO4, CO3, LDC MOV c21, c11 daddu CO5, CO4, LDC MOV c31, c11 daddu CO6, CO5, LDC MOV c41, c11 daddu CO7, CO6, LDC MOV c51, c11 daddu CO8, CO7, LDC dsra I, M, 1 daddu C, CO8, LDC dsll BB, K, 2 + BASE_SHIFT daddu BB, B, BB #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif blez I, .L20 MOV c61, c11 .L11: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 3 + BASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif LD a1, 0 * SIZE(AO) MOV c71, c11 LD b1, 0 * SIZE(BO) MOV c81, c11 LD a3, 4 * SIZE(AO) MOV c12, c11 LD b2, 1 * SIZE(BO) MOV c22, c11 MOV c32, c11 LD b3, 2 * SIZE(BO) MOV c42, c11 LD b4, 3 * SIZE(BO) MOV c52, c11 LD b5, 4 * SIZE(BO) MOV c62, c11 LD b6, 8 * SIZE(BO) MOV c72, c11 LD b7, 12 * SIZE(BO) MOV c82, c11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 2 #else daddiu TEMP, KK, 
8 #endif dsra L, TEMP, 2 blez L, .L15 NOP #else LD a1, 0 * SIZE(AO) MOV c71, c11 LD b1, 0 * SIZE(B) MOV c81, c11 pref 1, 3 * SIZE(CO1) pref 1, 3 * SIZE(CO2) LD a3, 4 * SIZE(AO) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 dsra L, K, 2 MOV c32, c11 LD b3, 2 * SIZE(B) MOV c42, c11 LD b4, 3 * SIZE(B) MOV c52, c11 LD b5, 4 * SIZE(B) MOV c62, c11 LD b6, 8 * SIZE(B) MOV c72, c11 LD b7, 12 * SIZE(B) MOV c82, c11 blez L, .L15 move BO, B #endif MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 daddiu L, L, -1 MADD c31, c31, a1, b3 blez L, .L13 MADD c41, c41, a1, b4 pref 1, 2 * SIZE(CO3) .align 3 .L12: MADD c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD c51, c51, a1, b5 LD a4, 2 * SIZE(AO) MADD c61, c61, a1, b2 NOP MADD c71, c71, a1, b3 NOP MADD c81, c81, a1, b4 LD a1, 8 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 20 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 9 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 10 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 11 * SIZE(BO) MADD c11, c11, a4, b6 LD a2, 3 * SIZE(AO) MADD c21, c21, a4, b2 NOP MADD c31, c31, a4, b3 NOP MADD c41, c41, a4, b4 NOP MADD c12, c12, a2, b6 LD b6, 24 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD c51, c51, a4, b7 NOP MADD c61, c61, a4, b2 NOP MADD c71, c71, a4, b3 NOP MADD c81, c81, a4, b4 NOP MADD c52, c52, a2, b7 LD b7, 28 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 17 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 18 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 19 * SIZE(BO) MADD c11, c11, a3, b1 LD a2, 5 * SIZE(AO) MADD c21, c21, a3, b2 NOP MADD c31, c31, a3, b3 NOP MADD c41, c41, a3, b4 NOP MADD c12, c12, a2, b1 LD b1, 32 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 21 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 22 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 23 * SIZE(BO) MADD c51, c51, a3, b5 LD a4, 6 * SIZE(AO) MADD c61, c61, a3, b2 NOP MADD c71, c71, a3, b3 NOP MADD c81, c81, a3, b4 LD a3, 12 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 36 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 25 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 26 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 27 * SIZE(BO) MADD c11, c11, a4, b6 LD a2, 7 * SIZE(AO) MADD c21, c21, a4, b2 NOP MADD c31, c31, a4, b3 NOP MADD c41, c41, a4, b4 daddiu L, L, -1 MADD c12, c12, a2, b6 LD b6, 40 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 29 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 30 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 31 * SIZE(BO) MADD c51, c51, a4, b7 daddiu BO, BO, 32 * SIZE MADD c61, c61, a4, b2 daddiu AO, AO, 8 * SIZE MADD c71, c71, a4, b3 NOP MADD c81, c81, a4, b4 NOP MADD c52, c52, a2, b7 LD b7, 12 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 3 * SIZE(BO) MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 NOP MADD c31, c31, a1, b3 bgtz L, .L12 MADD c41, c41, a1, b4 NOP .align 3 .L13: MADD c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD c51, c51, a1, b5 NOP MADD c61, c61, a1, b2 LD a4, 2 * SIZE(AO) MADD c71, c71, a1, b3 NOP MADD c81, c81, a1, b4 LD a1, 8 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 20 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 9 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 10 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 11 * SIZE(BO) MADD c11, c11, a4, b6 LD a2, 3 * SIZE(AO) MADD c21, c21, 
a4, b2 NOP MADD c31, c31, a4, b3 pref 1, 3 * SIZE(CO4) MADD c41, c41, a4, b4 NOP MADD c12, c12, a2, b6 LD b6, 24 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD c51, c51, a4, b7 pref 1, 3 * SIZE(CO5) MADD c61, c61, a4, b2 NOP MADD c71, c71, a4, b3 pref 1, 3 * SIZE(CO6) MADD c81, c81, a4, b4 NOP MADD c52, c52, a2, b7 LD b7, 28 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 17 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 18 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 19 * SIZE(BO) MADD c11, c11, a3, b1 LD a2, 5 * SIZE(AO) MADD c21, c21, a3, b2 NOP MADD c31, c31, a3, b3 pref 1, 3 * SIZE(CO7) MADD c41, c41, a3, b4 NOP MADD c12, c12, a2, b1 LD b1, 32 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 21 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 22 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 23 * SIZE(BO) MADD c51, c51, a3, b5 NOP MADD c61, c61, a3, b2 LD a4, 6 * SIZE(AO) MADD c71, c71, a3, b3 NOP MADD c81, c81, a3, b4 NOP MADD c52, c52, a2, b5 LD b5, 36 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 25 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 26 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 27 * SIZE(BO) MADD c11, c11, a4, b6 LD a2, 7 * SIZE(AO) MADD c21, c21, a4, b2 NOP MADD c31, c31, a4, b3 NOP MADD c41, c41, a4, b4 NOP MADD c12, c12, a2, b6 LD b6, 40 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 29 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 30 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 31 * SIZE(BO) MADD c51, c51, a4, b7 daddiu BO, BO, 32 * SIZE MADD c61, c61, a4, b2 daddiu AO, AO, 8 * SIZE MADD c71, c71, a4, b3 NOP MADD c81, c81, a4, b4 NOP MADD c52, c52, a2, b7 LD b7, 12 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 3 * SIZE(BO) .align 3 .L15: #ifndef TRMMKERNEL andi L, K, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L18 pref 1, 3 * SIZE(CO8) .align 3 .L16: MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 NOP MADD c31, c31, a1, b3 NOP MADD c41, c41, a1, b4 NOP MADD c12, c12, a2, b1 LD b1, 8 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD c51, c51, a1, b5 daddiu L, L, -1 MADD c61, c61, a1, b2 daddiu AO, AO, 2 * SIZE MADD c71, c71, a1, b3 daddiu BO, BO, 8 * SIZE MADD c81, c81, a1, b4 LD a1, 0 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 4 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD c82, c82, a2, b4 bgtz L, .L16 LD b4, 3 * SIZE(BO) .L18: #ifndef TRMMKERNEL LD $f0, 0 * SIZE(CO1) daddiu CO3,CO3, 2 * SIZE LD $f1, 1 * SIZE(CO1) daddiu CO1,CO1, 2 * SIZE LD $f2, 0 * SIZE(CO2) daddiu CO4,CO4, 2 * SIZE LD $f3, 1 * SIZE(CO2) daddiu CO2,CO2, 2 * SIZE LD $f4, -2 * SIZE(CO3) daddiu CO5,CO5, 2 * SIZE LD $f5, -1 * SIZE(CO3) daddiu CO6,CO6, 2 * SIZE LD $f6, -2 * SIZE(CO4) daddiu CO7,CO7, 2 * SIZE LD $f7, -1 * SIZE(CO4) daddiu I, I, -1 MADD c11, $f0, ALPHA, c11 LD $f0,-2 * SIZE(CO5) MADD c12, $f1, ALPHA, c12 LD $f1,-1 * SIZE(CO5) MADD c21, $f2, ALPHA, c21 LD $f2,-2 * SIZE(CO6) MADD c22, $f3, ALPHA, c22 LD $f3,-1 * SIZE(CO6) MADD c31, $f4, ALPHA, c31 LD $f4,-2 * SIZE(CO7) MADD c32, $f5, ALPHA, c32 LD $f5,-1 * SIZE(CO7) MADD c41, $f6, ALPHA, c41 LD $f6, 0 * SIZE(CO8) MADD c42, $f7, ALPHA, c42 LD $f7, 1 * SIZE(CO8) pref 0, 0 * SIZE(BB) pref 0, 8 * SIZE(BB) ST c11, -2 * SIZE(CO1) MTC $0, c11 ST c12, -1 * SIZE(CO1) daddiu CO8,CO8, 2 * SIZE ST c21, -2 * SIZE(CO2) MOV c21, c11 ST c22, -1 * SIZE(CO2) daddiu BB, BB, 16 * SIZE MADD c51, $f0, ALPHA, 
c51 ST c31, -2 * SIZE(CO3) MADD c52, $f1, ALPHA, c52 ST c32, -1 * SIZE(CO3) MADD c61, $f2, ALPHA, c61 ST c41, -2 * SIZE(CO4) MADD c62, $f3, ALPHA, c62 ST c42, -1 * SIZE(CO4) MADD c71, $f4, ALPHA, c71 ST c51, -2 * SIZE(CO5) MADD c72, $f5, ALPHA, c72 ST c52, -1 * SIZE(CO5) MADD c81, $f6, ALPHA, c81 ST c61, -2 * SIZE(CO6) MADD c82, $f7, ALPHA, c82 ST c62, -1 * SIZE(CO6) ST c71, -2 * SIZE(CO7) MOV c31, c11 ST c72, -1 * SIZE(CO7) MOV c41, c11 ST c81, -2 * SIZE(CO8) MOV c51, c11 ST c82, -1 * SIZE(CO8) bgtz I, .L11 MOV c61, c11 #else daddiu CO4,CO4, 2 * SIZE daddiu CO5,CO5, 2 * SIZE daddiu CO6,CO6, 2 * SIZE daddiu CO7,CO7, 2 * SIZE pref 0, 0 * SIZE(BB) pref 0, 8 * SIZE(BB) MUL c11, ALPHA, c11 daddiu CO1,CO1, 2 * SIZE MUL c12, ALPHA, c12 MTC $0, a1 MUL c21, ALPHA, c21 daddiu CO2,CO2, 2 * SIZE MUL c22, ALPHA, c22 daddiu CO3,CO3, 2 * SIZE ST c11, -2 * SIZE(CO1) MUL c31, ALPHA, c31 ST c12, -1 * SIZE(CO1) MUL c32, ALPHA, c32 ST c21, -2 * SIZE(CO2) MUL c41, ALPHA, c41 ST c22, -1 * SIZE(CO2) MUL c42, ALPHA, c42 ST c31, -2 * SIZE(CO3) MUL c51, ALPHA, c51 ST c32, -1 * SIZE(CO3) MUL c52, ALPHA, c52 ST c41, -2 * SIZE(CO4) MUL c61, ALPHA, c61 ST c42, -1 * SIZE(CO4) MUL c62, ALPHA, c62 ST c51, -2 * SIZE(CO5) MUL c71, ALPHA, c71 ST c52, -1 * SIZE(CO5) MUL c72, ALPHA, c72 ST c61, -2 * SIZE(CO6) MUL c81, ALPHA, c81 ST c62, -1 * SIZE(CO6) MUL c82, ALPHA, c82 ST c71, -2 * SIZE(CO7) MOV c11, a1 ST c72, -1 * SIZE(CO7) MOV c21, a1 daddiu CO8,CO8, 2 * SIZE daddiu BB, BB, 16 * SIZE ST c81, -2 * SIZE(CO8) MOV c31, a1 ST c82, -1 * SIZE(CO8) MOV c41, a1 daddiu I, I, -1 MOV c51, a1 #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -8 #endif dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 3 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif bgtz I, .L11 MOV c61, a1 #endif .align 3 .L20: andi I, M, 1 MOV c61, c11 blez I, .L29 MOV c71, c11 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 0 + BASE_SHIFT dsll TEMP, KK, 3 + BASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 8 #endif dsra L, TEMP, 2 blez L, .L25 MOV c81, c11 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) LD b5, 4 * SIZE(B) LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) dsra L, K, 2 MOV c81, c11 blez L, .L25 move BO, B #endif .align 3 .L22: MADD c11, c11, a1, b1 LD b1, 16 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a1, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a1, b4 LD b4, 7 * SIZE(BO) MADD c51, c51, a1, b5 LD b5, 20 * SIZE(BO) MADD c61, c61, a1, b2 LD b2, 9 * SIZE(BO) MADD c71, c71, a1, b3 LD b3, 10 * SIZE(BO) MADD c81, c81, a1, b4 LD b4, 11 * SIZE(BO) LD a1, 4 * SIZE(AO) daddiu L, L, -1 MADD c11, c11, a2, b6 LD b6, 24 * SIZE(BO) MADD c21, c21, a2, b2 LD b2, 13 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 14 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 15 * SIZE(BO) MADD c51, c51, a2, b7 LD b7, 28 * SIZE(BO) MADD c61, c61, a2, b2 LD b2, 17 * 
SIZE(BO) MADD c71, c71, a2, b3 LD b3, 18 * SIZE(BO) MADD c81, c81, a2, b4 LD b4, 19 * SIZE(BO) LD a2, 5 * SIZE(AO) daddiu AO, AO, 4 * SIZE MADD c11, c11, a3, b1 LD b1, 32 * SIZE(BO) MADD c21, c21, a3, b2 LD b2, 21 * SIZE(BO) MADD c31, c31, a3, b3 LD b3, 22 * SIZE(BO) MADD c41, c41, a3, b4 LD b4, 23 * SIZE(BO) MADD c51, c51, a3, b5 LD b5, 36 * SIZE(BO) MADD c61, c61, a3, b2 LD b2, 25 * SIZE(BO) MADD c71, c71, a3, b3 LD b3, 26 * SIZE(BO) MADD c81, c81, a3, b4 LD b4, 27 * SIZE(BO) LD a3, 2 * SIZE(AO) daddiu BO, BO, 32 * SIZE MADD c11, c11, a4, b6 LD b6, 8 * SIZE(BO) MADD c21, c21, a4, b2 LD b2, -3 * SIZE(BO) MADD c31, c31, a4, b3 LD b3, -2 * SIZE(BO) MADD c41, c41, a4, b4 LD b4, -1 * SIZE(BO) MADD c51, c51, a4, b7 LD b7, 12 * SIZE(BO) MADD c61, c61, a4, b2 LD b2, 1 * SIZE(BO) MADD c71, c71, a4, b3 LD b3, 2 * SIZE(BO) MADD c81, c81, a4, b4 LD b4, 3 * SIZE(BO) bgtz L, .L22 LD a4, 3 * SIZE(AO) .align 3 .L25: #ifndef TRMMKERNEL andi L, K, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L28 NOP .align 3 .L26: MADD c11, c11, a1, b1 LD b1, 8 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a1, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a1, b4 LD b4, 7 * SIZE(BO) daddiu L, L, -1 MOV a2, a2 daddiu AO, AO, 1 * SIZE daddiu BO, BO, 8 * SIZE MADD c51, c51, a1, b5 LD b5, 4 * SIZE(BO) MADD c61, c61, a1, b2 LD b2, 1 * SIZE(BO) MADD c71, c71, a1, b3 LD b3, 2 * SIZE(BO) MADD c81, c81, a1, b4 LD a1, 0 * SIZE(AO) bgtz L, .L26 LD b4, 3 * SIZE(BO) .L28: #ifndef TRMMKERNEL LD $f0, 0 * SIZE(CO1) LD $f1, 0 * SIZE(CO2) LD $f2, 0 * SIZE(CO3) LD $f3, 0 * SIZE(CO4) MADD c11, $f0, ALPHA, c11 LD $f4, 0 * SIZE(CO5) MADD c21, $f1, ALPHA, c21 LD $f5, 0 * SIZE(CO6) MADD c31, $f2, ALPHA, c31 LD $f6, 0 * SIZE(CO7) MADD c41, $f3, ALPHA, c41 LD $f7, 0 * SIZE(CO8) MADD c51, $f4, ALPHA, c51 ST c11, 0 * SIZE(CO1) MADD c61, $f5, ALPHA, c61 ST c21, 0 * SIZE(CO2) MADD c71, $f6, ALPHA, c71 ST c31, 0 * SIZE(CO3) MADD c81, $f7, ALPHA, c81 ST c41, 0 * SIZE(CO4) ST c51, 0 * SIZE(CO5) ST c61, 0 * SIZE(CO6) ST c71, 0 * SIZE(CO7) ST c81, 0 * SIZE(CO8) #else MUL c11, ALPHA, c11 MUL c21, ALPHA, c21 MUL c31, ALPHA, c31 MUL c41, ALPHA, c41 ST c11, 0 * SIZE(CO1) MUL c51, ALPHA, c51 ST c21, 0 * SIZE(CO2) MUL c61, ALPHA, c61 ST c31, 0 * SIZE(CO3) MUL c71, ALPHA, c71 ST c41, 0 * SIZE(CO4) MUL c81, ALPHA, c81 ST c51, 0 * SIZE(CO5) ST c61, 0 * SIZE(CO6) ST c71, 0 * SIZE(CO7) ST c81, 0 * SIZE(CO8) #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -8 #endif dsll L, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 3 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif #endif .align 3 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK, 8 #endif bgtz J, .L10 move B, BO .align 3 .L30: andi J, N, 4 blez J, .L50 move AO, A move CO1, C MTC $0, c11 daddu CO2, C, LDC daddu CO3, CO2, LDC daddu CO4, CO3, LDC MOV c21, c11 daddu C, CO4, LDC MOV c31, c11 #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif dsra I, M, 1 blez I, .L40 MOV c41, c11 .L31: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 2 + BASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif LD a1, 0 * SIZE(AO) LD a3, 4 * SIZE(AO) LD b1, 0 * SIZE(BO) MOV c12, c11 LD b2, 1 * SIZE(BO) MOV c22, c11 LD b3, 2 * SIZE(BO) MOV c32, c11 LD b4, 3 * SIZE(BO) MOV c42, c11 LD b5, 4 * SIZE(BO) LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) #if 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 2 #else daddiu TEMP, KK, 4 #endif dsra L, TEMP, 2 blez L, .L35 NOP #else LD a1, 0 * SIZE(AO) LD a3, 4 * SIZE(AO) LD b1, 0 * SIZE(B) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 LD b3, 2 * SIZE(B) MOV c32, c11 LD b4, 3 * SIZE(B) MOV c42, c11 LD b5, 4 * SIZE(B) dsra L, K, 2 LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) blez L, .L35 move BO, B #endif .align 3 .L32: MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 daddiu L, L, -1 MADD c31, c31, a1, b3 NOP MADD c41, c41, a1, b4 LD a1, 2 * SIZE(AO) MADD c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD c11, c11, a1, b5 LD a2, 3 * SIZE(AO) MADD c21, c21, a1, b2 NOP MADD c31, c31, a1, b3 NOP MADD c41, c41, a1, b4 LD a1, 8 * SIZE(AO) MADD c12, c12, a2, b5 LD b5, 20 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 9 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 10 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 11 * SIZE(BO) MADD c11, c11, a3, b6 LD a2, 5 * SIZE(AO) MADD c21, c21, a3, b2 NOP MADD c31, c31, a3, b3 NOP MADD c41, c41, a3, b4 LD a3, 6 * SIZE(AO) MADD c12, c12, a2, b6 LD b6, 24 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD c11, c11, a3, b7 LD a2, 7 * SIZE(AO) MADD c21, c21, a3, b2 daddiu AO, AO, 8 * SIZE MADD c31, c31, a3, b3 daddiu BO, BO, 16 * SIZE MADD c41, c41, a3, b4 LD a3, 4 * SIZE(AO) MADD c12, c12, a2, b7 LD b7, 12 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 1 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 2 * SIZE(BO) MADD c42, c42, a2, b4 NOP bgtz L, .L32 LD b4, 3 * SIZE(BO) .align 3 .L35: #ifndef TRMMKERNEL andi L, K, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L38 NOP .align 3 .L36: MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 daddiu L, L, -1 MADD c31, c31, a1, b3 daddiu AO, AO, 2 * SIZE MADD c41, c41, a1, b4 LD a1, 0 * SIZE(AO) MADD c12, c12, a2, b1 LD b1, 4 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) bgtz L, .L36 daddiu BO, BO, 4 * SIZE .L38: #ifndef TRMMKERNEL LD $f0, 0 * SIZE(CO1) daddiu CO3,CO3, 2 * SIZE LD $f1, 1 * SIZE(CO1) daddiu CO1,CO1, 2 * SIZE LD $f2, 0 * SIZE(CO2) daddiu CO4,CO4, 2 * SIZE LD $f3, 1 * SIZE(CO2) daddiu CO2,CO2, 2 * SIZE LD $f4, -2 * SIZE(CO3) MADD c11, $f0, ALPHA, c11 LD $f5, -1 * SIZE(CO3) MADD c12, $f1, ALPHA, c12 LD $f6, -2 * SIZE(CO4) MADD c21, $f2, ALPHA, c21 LD $f7, -1 * SIZE(CO4) MADD c22, $f3, ALPHA, c22 MADD c31, $f4, ALPHA, c31 ST c11, -2 * SIZE(CO1) MADD c32, $f5, ALPHA, c32 ST c12, -1 * SIZE(CO1) MADD c41, $f6, ALPHA, c41 ST c21, -2 * SIZE(CO2) MADD c42, $f7, ALPHA, c42 ST c22, -1 * SIZE(CO2) ST c31, -2 * SIZE(CO3) MTC $0, c11 ST c32, -1 * SIZE(CO3) daddiu I, I, -1 ST c41, -2 * SIZE(CO4) MOV c21, c11 ST c42, -1 * SIZE(CO4) MOV c31, c11 #else MUL c11, ALPHA, c11 daddiu CO3,CO3, 2 * SIZE MUL c12, ALPHA, c12 daddiu CO1,CO1, 2 * SIZE MUL c21, ALPHA, c21 daddiu CO4,CO4, 2 * SIZE MUL c22, ALPHA, c22 daddiu CO2,CO2, 2 * SIZE ST c11, -2 * SIZE(CO1) MUL c31, ALPHA, c31 ST c12, -1 * SIZE(CO1) MUL c32, ALPHA, c32 ST c21, -2 * SIZE(CO2) MUL c41, ALPHA, c41 ST c22, -1 * SIZE(CO2) MUL c42, ALPHA, c42 ST c31, -2 * SIZE(CO3) MTC $0, c11 ST c32, -1 * SIZE(CO3) daddiu I, I, -1 ST c41, -2 * SIZE(CO4) MOV c21, c11 ST c42, -1 * SIZE(CO4) MOV c31, c11 #if ( defined(LEFT) && 
defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -4 #endif dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif bgtz I, .L31 MOV c41, c11 .align 3 .L40: andi I, M, 1 blez I, .L49 MOV c61, c11 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 0 + BASE_SHIFT dsll TEMP, KK, 2 + BASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif LD a1, 0 * SIZE(AO) MOV c71, c11 LD a2, 1 * SIZE(AO) MOV c81, c11 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 4 #endif dsra L, TEMP, 2 blez L, .L45 NOP #else LD a1, 0 * SIZE(AO) MOV c71, c11 LD a2, 1 * SIZE(AO) MOV c81, c11 LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) LD b5, 4 * SIZE(B) LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) dsra L, K, 2 blez L, .L45 move BO, B #endif .align 3 .L42: MADD c11, c11, a1, b1 LD b1, 16 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a1, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a1, b4 LD b4, 7 * SIZE(BO) LD a1, 4 * SIZE(AO) daddiu L, L, -1 MADD c11, c11, a2, b5 LD b5, 20 * SIZE(BO) MADD c21, c21, a2, b2 LD b2, 9 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 10 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 11 * SIZE(BO) LD a2, 2 * SIZE(AO) daddiu AO, AO, 4 * SIZE MADD c11, c11, a2, b6 LD b6, 24 * SIZE(BO) MADD c21, c21, a2, b2 LD b2, 13 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 14 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 15 * SIZE(BO) LD a2, -1 * SIZE(AO) daddiu BO, BO, 16 * SIZE MADD c11, c11, a2, b7 LD b7, 12 * SIZE(BO) MADD c21, c21, a2, b2 LD b2, 1 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 2 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 3 * SIZE(BO) bgtz L, .L42 LD a2, 1 * SIZE(AO) .align 3 .L45: #ifndef TRMMKERNEL andi L, K, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L48 NOP .align 3 .L46: MADD c11, c11, a1, b1 LD b1, 4 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a1, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a1, b4 LD a1, 1 * SIZE(AO) LD b4, 7 * SIZE(BO) daddiu L, L, -1 daddiu AO, AO, 1 * SIZE MOV a2, a2 bgtz L, .L46 daddiu BO, BO, 4 * SIZE .L48: #ifndef TRMMKERNEL LD $f0, 0 * SIZE(CO1) LD $f1, 0 * SIZE(CO2) LD $f2, 0 * SIZE(CO3) LD $f3, 0 * SIZE(CO4) MADD c11, $f0, ALPHA, c11 MADD c21, $f1, ALPHA, c21 MADD c31, $f2, ALPHA, c31 MADD c41, $f3, ALPHA, c41 ST c11, 0 * SIZE(CO1) ST c21, 0 * SIZE(CO2) ST c31, 0 * SIZE(CO3) ST c41, 0 * SIZE(CO4) #else MUL c11, ALPHA, c11 MUL c21, ALPHA, c21 MUL c31, ALPHA, c31 MUL c41, ALPHA, c41 ST c11, 0 * SIZE(CO1) ST c21, 0 * SIZE(CO2) ST c31, 0 * SIZE(CO3) ST c41, 0 * SIZE(CO4) #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -4 #endif dsll L, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif #endif .align 3 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK, 4 #endif move B, BO .align 3 .L50: andi J, N, 2 blez J, .L70 move AO, A move CO1, C daddu CO2, C, LDC #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif dsra I, M, 1 blez I, .L60 daddu 
C, CO2, LDC .L51: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 1 + BASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a5, 4 * SIZE(AO) LD b1, 0 * SIZE(BO) MOV c12, c11 LD b2, 1 * SIZE(BO) MOV c22, c11 LD b3, 2 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 2 #else daddiu TEMP, KK, 2 #endif dsra L, TEMP, 2 blez L, .L55 NOP #else LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a5, 4 * SIZE(AO) LD b1, 0 * SIZE(B) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 LD b3, 2 * SIZE(B) LD b5, 4 * SIZE(B) dsra L, K, 2 LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) blez L, .L55 move BO, B #endif .align 3 .L52: MADD c11, c11, a1, b1 LD a3, 2 * SIZE(AO) MADD c21, c21, a1, b2 LD b4, 3 * SIZE(BO) MADD c12, c12, a2, b1 LD a4, 3 * SIZE(AO) MADD c22, c22, a2, b2 LD b1, 8 * SIZE(BO) MADD c11, c11, a3, b3 LD a1, 8 * SIZE(AO) MADD c21, c21, a3, b4 LD b2, 5 * SIZE(BO) MADD c12, c12, a4, b3 LD a2, 5 * SIZE(AO) MADD c22, c22, a4, b4 LD b3, 6 * SIZE(BO) MADD c11, c11, a5, b5 LD a3, 6 * SIZE(AO) MADD c21, c21, a5, b2 LD b4, 7 * SIZE(BO) MADD c12, c12, a2, b5 LD a4, 7 * SIZE(AO) MADD c22, c22, a2, b2 LD b5, 12 * SIZE(BO) MADD c11, c11, a3, b3 LD a5, 12 * SIZE(AO) MADD c21, c21, a3, b4 LD b2, 9 * SIZE(BO) MADD c12, c12, a4, b3 LD a2, 9 * SIZE(AO) MADD c22, c22, a4, b4 LD b3, 10 * SIZE(BO) daddiu AO, AO, 8 * SIZE daddiu L, L, -1 bgtz L, .L52 daddiu BO, BO, 8 * SIZE .align 3 .L55: #ifndef TRMMKERNEL andi L, K, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L58 NOP .align 3 .L56: MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 LD a1, 2 * SIZE(AO) MADD c12, c12, a2, b1 LD b1, 2 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 3 * SIZE(BO) daddiu L, L, -1 daddiu AO, AO, 2 * SIZE bgtz L, .L56 daddiu BO, BO, 2 * SIZE .L58: #ifndef TRMMKERNEL LD $f0, 0 * SIZE(CO1) daddiu I, I, -1 LD $f1, 1 * SIZE(CO1) daddiu CO1,CO1, 2 * SIZE LD $f2, 0 * SIZE(CO2) NOP LD $f3, 1 * SIZE(CO2) daddiu CO2,CO2, 2 * SIZE MADD c11, $f0, ALPHA, c11 MADD c12, $f1, ALPHA, c12 MADD c21, $f2, ALPHA, c21 MADD c22, $f3, ALPHA, c22 ST c11, -2 * SIZE(CO1) ST c12, -1 * SIZE(CO1) ST c21, -2 * SIZE(CO2) NOP bgtz I, .L51 ST c22, -1 * SIZE(CO2) #else daddiu I, I, -1 daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE MUL c11, ALPHA, c11 MUL c12, ALPHA, c12 MUL c21, ALPHA, c21 MUL c22, ALPHA, c22 ST c11, -2 * SIZE(CO1) ST c12, -1 * SIZE(CO1) ST c21, -2 * SIZE(CO2) ST c22, -1 * SIZE(CO2) #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -2 #endif dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif bgtz I, .L51 NOP #endif .align 3 .L60: andi I, M, 1 blez I, .L69 NOP #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 0 + BASE_SHIFT dsll TEMP, KK, 1 + BASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a3, 2 * SIZE(AO) MOV c31, c11 LD a4, 3 * SIZE(AO) MOV c41, c11 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 8 * SIZE(BO) 
LD b7, 12 * SIZE(BO) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 2 #endif dsra L, TEMP, 2 blez L, .L65 NOP #else dsra L, K, 2 LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a3, 2 * SIZE(AO) MOV c31, c11 LD a4, 3 * SIZE(AO) MOV c41, c11 LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) LD b5, 4 * SIZE(B) LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) blez L, .L65 move BO, B #endif .align 3 .L62: MADD c11, c11, a1, b1 LD b1, 4 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 7 * SIZE(BO) LD a1, 4 * SIZE(AO) LD a2, 5 * SIZE(AO) MADD c11, c11, a3, b1 LD b1, 8 * SIZE(BO) MADD c21, c21, a3, b2 LD b2, 9 * SIZE(BO) MADD c31, c31, a4, b3 LD b3, 10 * SIZE(BO) MADD c41, c41, a4, b4 LD b4, 11 * SIZE(BO) LD a3, 6 * SIZE(AO) LD a4, 7 * SIZE(AO) daddiu L, L, -1 daddiu AO, AO, 4 * SIZE bgtz L, .L62 daddiu BO, BO, 8 * SIZE .align 3 .L65: #ifndef TRMMKERNEL andi L, K, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L68 NOP .align 3 .L66: MADD c11, c11, a1, b1 LD b1, 2 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 3 * SIZE(BO) LD a1, 1 * SIZE(AO) daddiu L, L, -1 daddiu AO, AO, 1 * SIZE bgtz L, .L66 daddiu BO, BO, 2 * SIZE .L68: #ifndef TRMMKERNEL LD $f0, 0 * SIZE(CO1) LD $f1, 0 * SIZE(CO2) ADD c11, c11, c31 ADD c21, c21, c41 MADD c11, $f0, ALPHA, c11 MADD c21, $f1, ALPHA, c21 ST c11, 0 * SIZE(CO1) ST c21, 0 * SIZE(CO2) #else ADD c11, c11, c31 ADD c21, c21, c41 MUL c11, ALPHA, c11 MUL c21, ALPHA, c21 ST c11, 0 * SIZE(CO1) ST c21, 0 * SIZE(CO2) #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -2 #endif dsll L, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif #endif .align 3 .L69: #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK, 2 #endif move B, BO .align 3 .L70: andi J, N, 1 blez J, .L999 move AO, A move CO1, C #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif dsra I, M, 1 blez I, .L80 daddu C, CO1, LDC .L71: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 0 + BASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a5, 4 * SIZE(AO) LD b1, 0 * SIZE(BO) MOV c12, c11 LD b2, 1 * SIZE(BO) MOV c22, c11 LD b3, 2 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 2 #else daddiu TEMP, KK, 1 #endif dsra L, TEMP, 2 blez L, .L75 NOP #else LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a5, 4 * SIZE(AO) LD b1, 0 * SIZE(B) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 LD b3, 2 * SIZE(B) LD b5, 4 * SIZE(B) dsra L, K, 2 LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) blez L, .L75 move BO, B #endif .align 3 .L72: LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 LD a1, 2 * SIZE(AO) LD a2, 3 * SIZE(AO) LD b1, 1 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 LD a1, 4 * SIZE(AO) LD a2, 5 * SIZE(AO) LD b1, 2 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 LD a1, 6 * SIZE(AO) LD a2, 7 * SIZE(AO) LD b1, 
3 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 daddiu L, L, -1 daddiu AO, AO, 8 * SIZE bgtz L, .L72 daddiu BO, BO, 4 * SIZE .align 3 .L75: #ifndef TRMMKERNEL andi L, K, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L78 NOP .align 3 .L76: LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 daddiu L, L, -1 daddiu AO, AO, 2 * SIZE bgtz L, .L76 daddiu BO, BO, 1 * SIZE .L78: #ifndef TRMMKERNEL LD $f0, 0 * SIZE(CO1) daddiu I, I, -1 LD $f1, 1 * SIZE(CO1) daddiu CO1,CO1, 2 * SIZE ADD c11, c11, c21 ADD c12, c12, c22 MADD c11, $f0, ALPHA, c11 MADD c12, $f1, ALPHA, c12 ST c11, -2 * SIZE(CO1) bgtz I, .L71 ST c12, -1 * SIZE(CO1) #else ADD c11, c11, c21 daddiu I, I, -1 ADD c12, c12, c22 daddiu CO1,CO1, 2 * SIZE MUL c11, ALPHA, c11 MUL c12, ALPHA, c12 ST c11, -2 * SIZE(CO1) ST c12, -1 * SIZE(CO1) #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -1 #endif dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 0 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif bgtz I, .L71 NOP #endif .align 3 .L80: andi I, M, 1 blez I, .L89 NOP #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 0 + BASE_SHIFT dsll TEMP, KK, 0 + BASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 1 #endif dsra L, TEMP, 2 blez L, .L85 NOP #else LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) LD b5, 4 * SIZE(B) LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) dsra L, K, 2 blez L, .L85 move BO, B #endif .align 3 .L82: LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD c11, c11, a1, b1 LD a1, 1 * SIZE(AO) LD b1, 1 * SIZE(BO) MADD c21, c21, a1, b1 LD a1, 2 * SIZE(AO) LD b1, 2 * SIZE(BO) MADD c11, c11, a1, b1 LD a1, 3 * SIZE(AO) LD b1, 3 * SIZE(BO) MADD c21, c21, a1, b1 daddiu L, L, -1 daddiu AO, AO, 4 * SIZE bgtz L, .L82 daddiu BO, BO, 4 * SIZE .align 3 .L85: #ifndef TRMMKERNEL andi L, K, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L88 NOP .align 3 .L86: LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD c11, c11, a1, b1 daddiu L, L, -1 daddiu AO, AO, 1 * SIZE bgtz L, .L86 daddiu BO, BO, 1 * SIZE .L88: #ifndef TRMMKERNEL LD $f0, 0 * SIZE(CO1) ADD c11, c11, c21 MADD c11, $f0, ALPHA, c11 ST c11, 0 * SIZE(CO1) #else ADD c11, c11, c21 MUL c11, ALPHA, c11 ST c11, 0 * SIZE(CO1) #endif .align 3 .L89: #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK, 1 #endif move B, BO .align 3 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) LDARG $18, 16($sp) LDARG $19, 24($sp) LDARG $20, 32($sp) LDARG $21, 40($sp) LDARG $22, 48($sp) ldc1 $f24, 56($sp) ldc1 $f25, 64($sp) ldc1 $f26, 72($sp) ldc1 $f27, 80($sp) ldc1 $f28, 88($sp) #if defined(TRMMKERNEL) LDARG $23, 96($sp) LDARG $24, 104($sp) LDARG $25, 112($sp) #endif #ifndef __64BIT__ ldc1 $f20,120($sp) ldc1 $f21,128($sp) ldc1 $f22,136($sp) ldc1 $f23,144($sp) #endif j $31 daddiu $sp, $sp, 160 EPILOGUE 
OpenBLAS-0.2.20/kernel/mips64/gemv_n.S000066400000000000000000000271621313527062700171550ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M $4 #define N $5 #define A $8 #define LDA $9 #define X $10 #define INCX $11 #define Y $2 #define INCY $6 #define BUFFER $7 #define YORIG $3 #define XX $12 #define YY $13 #define I $14 #define J $15 #define AO1 $16 #define AO2 $17 #define ALPHA $f15 #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define a5 $f4 #define a6 $f5 #define a7 $f6 #define a8 $f7 #define x1 $f8 #define x2 $f9 #define y1 $f10 #define y2 $f11 #define y3 $f12 #define y4 $f13 #define y5 $f14 #define y6 $f16 #define y7 $f17 #define y8 $f18 #define t1 $f19 #define t2 $f20 #define t3 $f21 #define t4 $f22 PROLOGUE LDARG Y, 0($sp) LDARG INCY, 8($sp) LDARG BUFFER, 16($sp) #ifdef __64BIT__ daddiu $sp, $sp, -16 #else daddiu $sp, $sp, -48 #endif SDARG $16, 0($sp) SDARG $17, 8($sp) dsll LDA, LDA, BASE_SHIFT #ifndef __64BIT__ sdc1 $f20, 16($sp) sdc1 $f21, 24($sp) sdc1 $f22, 32($sp) #endif blez M, .L999 dsll INCX, INCX, BASE_SHIFT blez N, .L999 dsll INCY, INCY, BASE_SHIFT li YORIG, SIZE beq INCY, YORIG, .L10 move YORIG, Y dsra I, M, 2 move YORIG, BUFFER move XX, Y blez I, .L05 move YY, BUFFER .align 3 .L02: LD a1, 0 * SIZE(XX) daddu XX, XX, INCY LD a2, 0 * SIZE(XX) daddu XX, XX, INCY LD a3, 0 * SIZE(XX) daddu XX, XX, INCY LD a4, 0 * SIZE(XX) daddu XX, XX, INCY ST a1, 0 * SIZE(YY) ST a2, 1 * SIZE(YY) ST a3, 2 * SIZE(YY) ST a4, 3 * SIZE(YY) daddiu I, I, -1 bgtz I, .L02 daddiu YY, YY, 4 * SIZE .align 3 .L05: andi I, M, 3 blez I, .L10 NOP .align 3 .L06: LD a1, 0 * SIZE(XX) daddu XX, XX, INCY ST a1, 0 * SIZE(YY) daddiu I, I, -1 bgtz I, .L06 daddiu YY, YY, 1 * SIZE .align 3 .L10: dsra J, N, 1 blez J, .L20 NOP .align 3 .L11: LD x1, 0 * SIZE(X) daddu X, X, INCX LD x2, 0 * SIZE(X) daddu X, X, INCX move AO1, A daddu AO2, A, LDA daddu A, AO2, LDA move YY, YORIG MUL x1, ALPHA, x1 dsra I, M, 3 blez I, .L15 MUL x2, ALPHA, x2 LD a1, 0 * SIZE(AO1) LD y1, 0 * SIZE(YY) LD a2, 1 * SIZE(AO1) LD y2, 1 * SIZE(YY) LD a3, 2 * SIZE(AO1) LD y3, 2 * SIZE(YY) LD a4, 3 * SIZE(AO1) LD y4, 3 * SIZE(YY) LD a5, 0 * SIZE(AO2) LD y5, 4 * SIZE(YY) LD a6, 1 * SIZE(AO2) LD y6, 5 * SIZE(YY) LD a7, 2 * SIZE(AO2) LD y7, 6 * SIZE(YY) LD a8, 3 * SIZE(AO2) daddiu I, I, -1 blez I, .L13 LD y8, 7 * SIZE(YY) .align 3 .L12: MADD t1, y1, x1, a1 LD a1, 4 * SIZE(AO1) MADD t2, y2, x1, a2 LD a2, 5 * SIZE(AO1) LD y1, 8 * SIZE(YY) LD y2, 9 * SIZE(YY) MADD t3, y3, x1, a3 LD a3, 6 * SIZE(AO1) MADD t4, y4, x1, a4 LD a4, 7 * SIZE(AO1) LD y3, 10 * SIZE(YY) LD y4, 11 * SIZE(YY) MADD t1, t1, x2, a5 LD a5, 4 * SIZE(AO2) MADD t2, t2, x2, a6 LD a6, 5 * SIZE(AO2) MADD t3, t3, x2, a7 LD a7, 6 * SIZE(AO2) MADD t4, t4, x2, a8 LD a8, 7 * SIZE(AO2) ST t1, 0 * SIZE(YY) ST t2, 1 * SIZE(YY) ST t3, 2 * SIZE(YY) ST t4, 3 * SIZE(YY) MADD t1, y5, x1, a1 LD a1, 8 * SIZE(AO1) MADD t2, y6, x1, a2 LD a2, 9 * SIZE(AO1) LD y5, 12 * SIZE(YY) LD y6, 13 * SIZE(YY) MADD t3, y7, x1, a3 LD a3, 10 * SIZE(AO1) MADD t4, y8, x1, a4 LD a4, 11 * SIZE(AO1) LD y7, 14 * SIZE(YY) LD y8, 15 * SIZE(YY) MADD t1, t1, x2, a5 LD a5, 8 * SIZE(AO2) MADD t2, t2, x2, a6 LD a6, 9 * SIZE(AO2) MADD t3, t3, x2, a7 LD a7, 10 * SIZE(AO2) MADD t4, t4, x2, a8 LD a8, 11 * SIZE(AO2) ST t1, 4 * SIZE(YY) ST t2, 5 * SIZE(YY) ST t3, 6 * SIZE(YY) ST t4, 7 * SIZE(YY) daddiu I, I, -1 daddiu YY, YY, 8 * SIZE daddiu AO1, AO1, 8 * SIZE bgtz I, .L12 daddiu AO2, AO2, 8 * SIZE .align 3 .L13: MADD t1, y1, x1, a1 LD a1, 4 * SIZE(AO1) MADD t2, y2, x1, a2 LD a2, 5 * SIZE(AO1) MADD t3, y3, x1, a3 LD a3, 6 * SIZE(AO1) MADD t4, y4, x1, 
a4 LD a4, 7 * SIZE(AO1) MADD t1, t1, x2, a5 LD a5, 4 * SIZE(AO2) MADD t2, t2, x2, a6 LD a6, 5 * SIZE(AO2) MADD t3, t3, x2, a7 LD a7, 6 * SIZE(AO2) MADD t4, t4, x2, a8 LD a8, 7 * SIZE(AO2) ST t1, 0 * SIZE(YY) MADD t1, y5, x1, a1 ST t2, 1 * SIZE(YY) MADD t2, y6, x1, a2 ST t3, 2 * SIZE(YY) MADD t3, y7, x1, a3 ST t4, 3 * SIZE(YY) MADD t4, y8, x1, a4 MADD t1, t1, x2, a5 daddiu AO1, AO1, 8 * SIZE MADD t2, t2, x2, a6 daddiu AO2, AO2, 8 * SIZE MADD t3, t3, x2, a7 daddiu YY, YY, 8 * SIZE MADD t4, t4, x2, a8 NOP ST t1, -4 * SIZE(YY) ST t2, -3 * SIZE(YY) ST t3, -2 * SIZE(YY) ST t4, -1 * SIZE(YY) .align 3 .L15: andi I, M, 4 NOP blez I, .L16 NOP LD a1, 0 * SIZE(AO1) LD y1, 0 * SIZE(YY) LD a2, 1 * SIZE(AO1) LD y2, 1 * SIZE(YY) LD a3, 2 * SIZE(AO1) LD y3, 2 * SIZE(YY) LD a4, 3 * SIZE(AO1) LD y4, 3 * SIZE(YY) LD a5, 0 * SIZE(AO2) MADD y1, y1, x1, a1 LD a6, 1 * SIZE(AO2) MADD y2, y2, x1, a2 LD a7, 2 * SIZE(AO2) MADD y3, y3, x1, a3 LD a8, 3 * SIZE(AO2) MADD y4, y4, x1, a4 MADD y1, y1, x2, a5 daddiu YY, YY, 4 * SIZE MADD y2, y2, x2, a6 daddiu AO1, AO1, 4 * SIZE MADD y3, y3, x2, a7 daddiu AO2, AO2, 4 * SIZE MADD y4, y4, x2, a8 ST y1, -4 * SIZE(YY) ST y2, -3 * SIZE(YY) ST y3, -2 * SIZE(YY) ST y4, -1 * SIZE(YY) .align 3 .L16: andi I, M, 2 NOP blez I, .L17 NOP LD a1, 0 * SIZE(AO1) LD y1, 0 * SIZE(YY) LD a2, 1 * SIZE(AO1) LD y2, 1 * SIZE(YY) LD a5, 0 * SIZE(AO2) LD a6, 1 * SIZE(AO2) MADD y1, y1, x1, a1 NOP MADD y2, y2, x1, a2 daddiu YY, YY, 2 * SIZE MADD y1, y1, x2, a5 daddiu AO1, AO1, 2 * SIZE MADD y2, y2, x2, a6 daddiu AO2, AO2, 2 * SIZE ST y1, -2 * SIZE(YY) ST y2, -1 * SIZE(YY) .align 3 .L17: andi I, M, 1 NOP blez I, .L19 NOP LD y1, 0 * SIZE(YY) LD a1, 0 * SIZE(AO1) LD a5, 0 * SIZE(AO2) MADD y1, y1, x1, a1 MADD y1, y1, x2, a5 ST y1, 0 * SIZE(YY) .align 3 .L19: daddiu J, J, -1 bgtz J, .L11 NOP .align 3 .L20: andi J, N, 1 blez J, .L900 NOP .align 3 .L21: LD x1, 0 * SIZE(X) daddu X, X, INCX move YY, YORIG move AO1, A dsra I, M, 3 blez I, .L25 MUL x1, ALPHA, x1 LD a1, 0 * SIZE(AO1) LD y1, 0 * SIZE(YY) LD a2, 1 * SIZE(AO1) LD y2, 1 * SIZE(YY) LD a3, 2 * SIZE(AO1) LD y3, 2 * SIZE(YY) LD a4, 3 * SIZE(AO1) LD y4, 3 * SIZE(YY) LD y5, 4 * SIZE(YY) LD y6, 5 * SIZE(YY) LD y7, 6 * SIZE(YY) daddiu I, I, -1 blez I, .L23 LD y8, 7 * SIZE(YY) .align 3 .L22: MADD t1, y1, x1, a1 LD a1, 4 * SIZE(AO1) MADD t2, y2, x1, a2 LD a2, 5 * SIZE(AO1) LD y1, 8 * SIZE(YY) LD y2, 9 * SIZE(YY) MADD t3, y3, x1, a3 LD a3, 6 * SIZE(AO1) MADD t4, y4, x1, a4 LD a4, 7 * SIZE(AO1) LD y3, 10 * SIZE(YY) LD y4, 11 * SIZE(YY) ST t1, 0 * SIZE(YY) ST t2, 1 * SIZE(YY) ST t3, 2 * SIZE(YY) ST t4, 3 * SIZE(YY) MADD t1, y5, x1, a1 LD a1, 8 * SIZE(AO1) MADD t2, y6, x1, a2 LD a2, 9 * SIZE(AO1) LD y5, 12 * SIZE(YY) LD y6, 13 * SIZE(YY) MADD t3, y7, x1, a3 LD a3, 10 * SIZE(AO1) MADD t4, y8, x1, a4 LD a4, 11 * SIZE(AO1) LD y7, 14 * SIZE(YY) LD y8, 15 * SIZE(YY) ST t1, 4 * SIZE(YY) ST t2, 5 * SIZE(YY) ST t3, 6 * SIZE(YY) ST t4, 7 * SIZE(YY) daddiu I, I, -1 daddiu YY, YY, 8 * SIZE bgtz I, .L22 daddiu AO1, AO1, 8 * SIZE .align 3 .L23: MADD t1, y1, x1, a1 LD a1, 4 * SIZE(AO1) MADD t2, y2, x1, a2 LD a2, 5 * SIZE(AO1) MADD t3, y3, x1, a3 LD a3, 6 * SIZE(AO1) MADD t4, y4, x1, a4 LD a4, 7 * SIZE(AO1) ST t1, 0 * SIZE(YY) MADD t1, y5, x1, a1 ST t2, 1 * SIZE(YY) MADD t2, y6, x1, a2 ST t3, 2 * SIZE(YY) MADD t3, y7, x1, a3 ST t4, 3 * SIZE(YY) MADD t4, y8, x1, a4 ST t1, 4 * SIZE(YY) ST t2, 5 * SIZE(YY) ST t3, 6 * SIZE(YY) ST t4, 7 * SIZE(YY) daddiu AO1, AO1, 8 * SIZE daddiu YY, YY, 8 * SIZE .align 3 .L25: andi I, M, 4 NOP blez I, .L26 NOP LD a1, 0 * SIZE(AO1) LD y1, 0 * SIZE(YY) 
LD a2, 1 * SIZE(AO1) LD y2, 1 * SIZE(YY) LD a3, 2 * SIZE(AO1) LD y3, 2 * SIZE(YY) LD a4, 3 * SIZE(AO1) LD y4, 3 * SIZE(YY) MADD y1, y1, x1, a1 MADD y2, y2, x1, a2 MADD y3, y3, x1, a3 daddiu YY, YY, 4 * SIZE MADD y4, y4, x1, a4 daddiu AO1, AO1, 4 * SIZE ST y1, -4 * SIZE(YY) ST y2, -3 * SIZE(YY) ST y3, -2 * SIZE(YY) ST y4, -1 * SIZE(YY) .align 3 .L26: andi I, M, 2 NOP blez I, .L27 NOP LD a1, 0 * SIZE(AO1) LD y1, 0 * SIZE(YY) LD a2, 1 * SIZE(AO1) LD y2, 1 * SIZE(YY) MADD y1, y1, x1, a1 daddiu YY, YY, 2 * SIZE MADD y2, y2, x1, a2 daddiu AO1, AO1, 2 * SIZE ST y1, -2 * SIZE(YY) ST y2, -1 * SIZE(YY) .align 3 .L27: andi I, M, 1 NOP blez I, .L900 NOP LD y1, 0 * SIZE(YY) LD a1, 0 * SIZE(AO1) MADD y1, y1, x1, a1 ST y1, 0 * SIZE(YY) .align 3 .L900: li YORIG, SIZE beq INCY, YORIG, .L999 dsra I, M, 2 blez I, .L905 move XX, BUFFER .align 3 .L902: LD a1, 0 * SIZE(XX) LD a2, 1 * SIZE(XX) LD a3, 2 * SIZE(XX) LD a4, 3 * SIZE(XX) ST a1, 0 * SIZE(Y) daddu Y, Y, INCY ST a2, 0 * SIZE(Y) daddu Y, Y, INCY ST a3, 0 * SIZE(Y) daddu Y, Y, INCY ST a4, 0 * SIZE(Y) daddu Y, Y, INCY daddiu I, I, -1 bgtz I, .L902 daddiu XX, XX, 4 * SIZE .align 3 .L905: andi I, M, 3 blez I, .L999 NOP .align 3 .L906: LD a1, 0 * SIZE(XX) daddiu XX, XX, 1 * SIZE ST a1, 0 * SIZE(Y) daddiu I, I, -1 bgtz I, .L906 daddu Y, Y, INCY .align 3 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) #ifndef __64BIT__ ldc1 $f20, 16($sp) ldc1 $f21, 24($sp) ldc1 $f22, 32($sp) #endif j $31 #ifdef __64BIT__ daddiu $sp, $sp, 16 #else daddiu $sp, $sp, 48 #endif EPILOGUE OpenBLAS-0.2.20/kernel/mips64/gemv_n_loongson3a.c000066400000000000000000000052031313527062700213270ustar00rootroot00000000000000#include "common.h" //These are auto-tuning codes on Loongson-3A platform. //#define prefetch(x) __builtin_prefetch(x) //#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) #define spec_loop_alpha1 do {Y[i] += A[LDA * j + i] * X[k]; i++;} while(0) #define spec_loop do {Y[i] += ALPHA * A[LDA * j + i] * X[k]; i++;} while(0) #define norm_loop_alpha1 do {Y[h] += A[LDA * j + i] * X[k]; i++; h += INCY;} while(0) #define norm_loop do {Y[h] += ALPHA * A[LDA * j + i] * X[k]; i++; h += INCY;} while(0) int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { BLASLONG kx=0, ky=0; if(!ALPHA) return 0; //if(INCX < 0) // kx = (1-N) * INCX; // INCX = -INCX; //if(INCY < 0) // ky = (1-M) * INCY; // INCY = -INCY; BLASLONG fahead = 30; BLASLONG spec_unroll = 4; BLASLONG tMQ = M - M % spec_unroll; BLASLONG j = 0, k = 0; if(ALPHA == 1) { if(INCY == 1) { for(k=kx; likely(j < N); j++, k += INCX) { BLASLONG i = 0; for(; likely(i < tMQ);) { prefetch(A[LDA * j + i + fahead]); prefetch(Y[i + fahead]); /*loop_mark*/ spec_loop_alpha1; /*loop_mark*/ spec_loop_alpha1; /*loop_mark*/ spec_loop_alpha1; /*loop_mark*/ spec_loop_alpha1; } for(; likely(i < M);) { spec_loop_alpha1; } } } else { for(k=kx; likely(j < N); j++, k += INCX) { BLASLONG i = 0, h = ky; for(; likely(i < tMQ);) { prefetch(A[LDA * j + i + fahead]); prefetch(Y[h + fahead]); /*loop_mark*/ norm_loop_alpha1; /*loop_mark*/ norm_loop_alpha1; /*loop_mark*/ norm_loop_alpha1; /*loop_mark*/ norm_loop_alpha1; } for(; likely(i < M);) { norm_loop_alpha1; } } } } else { if(INCY == 1) { for(k=kx; likely(j < N); j++, k += INCX) { BLASLONG i = 0; for(; likely(i < tMQ);) { prefetch(A[LDA * j + i 
+ fahead]); prefetch(Y[i + fahead]); /*loop_mark*/ spec_loop; /*loop_mark*/ spec_loop; /*loop_mark*/ spec_loop; /*loop_mark*/ spec_loop; } for(; likely(i < M);) { spec_loop; } } } else { for(k=kx; likely(j < N); j++, k += INCX) { BLASLONG i = 0, h = ky; for(; likely(i < tMQ);) { prefetch(A[LDA * j + i + fahead]); prefetch(Y[h + fahead]); /*loop_mark*/ norm_loop; /*loop_mark*/ norm_loop; /*loop_mark*/ norm_loop; /*loop_mark*/ norm_loop; } for(; likely(i < M);) { norm_loop; } } } } return 0; } OpenBLAS-0.2.20/kernel/mips64/gemv_t.S000066400000000000000000000233561313527062700171640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M $4 #define N $5 #define A $8 #define LDA $9 #define X $10 #define INCX $11 #define Y $2 #define INCY $6 #define BUFFER $7 #define XORIG $3 #define XX $12 #define YY $13 #define I $14 #define J $15 #define AO1 $16 #define AO2 $17 #define ALPHA $f15 #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define a5 $f4 #define a6 $f5 #define a7 $f6 #define a8 $f7 #define y1 $f8 #define y2 $f9 #define y3 $f10 #define y4 $f11 #define x1 $f12 #define x2 $f13 #define x3 $f14 #define x4 $f16 #define x5 $f17 #define x6 $f18 #define x7 $f19 #define x8 $f20 PROLOGUE LDARG Y, 0($sp) LDARG INCY, 8($sp) LDARG BUFFER, 16($sp) #ifdef __64BIT__ daddiu $sp, $sp, -16 #else daddiu $sp, $sp, -32 #endif MTC $0, y1 SDARG $16, 0($sp) SDARG $17, 8($sp) dsll LDA, LDA, BASE_SHIFT #ifndef __64BIT__ sdc1 $f20, 16($sp) #endif blez M, .L999 dsll INCX, INCX, BASE_SHIFT blez N, .L999 dsll INCY, INCY, BASE_SHIFT li XORIG, SIZE beq INCX, XORIG, .L10 move XORIG, X dsra I, M, 2 move XORIG, BUFFER blez I, .L05 move YY, BUFFER .align 3 .L02: LD a1, 0 * SIZE(X) daddu X, X, INCX LD a2, 0 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) daddu X, X, INCX LD a4, 0 * SIZE(X) daddu X, X, INCX ST a1, 0 * SIZE(YY) ST a2, 1 * SIZE(YY) ST a3, 2 * SIZE(YY) ST a4, 3 * SIZE(YY) daddiu I, I, -1 bgtz I, .L02 daddiu YY, YY, 4 * SIZE .align 3 .L05: andi I, M, 3 blez I, .L10 NOP .align 3 .L06: LD a1, 0 * SIZE(X) daddu X, X, INCX ST a1, 0 * SIZE(YY) daddiu I, I, -1 bgtz I, .L06 daddiu YY, YY, 1 * SIZE .align 3 .L10: dsra J, N, 1 blez J, .L20 move YY, Y .align 3 .L11: move AO1, A MOV y2, y1 daddu AO2, A, LDA MOV y3, y1 daddu A, AO2, LDA MOV y4, y1 dsra I, M, 3 blez I, .L15 move XX, XORIG LD a1, 0 * SIZE(AO1) LD x1, 0 * SIZE(XX) LD a2, 0 * SIZE(AO2) LD x2, 1 * SIZE(XX) LD a3, 1 * SIZE(AO1) LD x3, 2 * SIZE(XX) LD a4, 1 * SIZE(AO2) LD x4, 3 * SIZE(XX) LD a5, 2 * SIZE(AO1) LD x5, 4 * SIZE(XX) LD a6, 2 * SIZE(AO2) LD x6, 5 * SIZE(XX) LD a7, 3 * SIZE(AO1) LD x7, 6 * SIZE(XX) LD a8, 3 * SIZE(AO2) daddiu I, I, -1 blez I, .L13 LD x8, 7 * SIZE(XX) .align 3 .L12: MADD y1, y1, x1, a1 LD a1, 4 * SIZE(AO1) MADD y2, y2, x1, a2 LD a2, 4 * SIZE(AO2) MADD y3, y3, x2, a3 LD a3, 5 * SIZE(AO1) MADD y4, y4, x2, a4 LD a4, 5 * SIZE(AO2) LD x1, 8 * SIZE(XX) LD x2, 9 * SIZE(XX) MADD y1, y1, x3, a5 LD a5, 6 * SIZE(AO1) MADD y2, y2, x3, a6 LD a6, 6 * SIZE(AO2) MADD y3, y3, x4, a7 LD a7, 7 * SIZE(AO1) MADD y4, y4, x4, a8 LD a8, 7 * SIZE(AO2) LD x3, 10 * SIZE(XX) LD x4, 11 * SIZE(XX) MADD y1, y1, x5, a1 LD a1, 8 * SIZE(AO1) MADD y2, y2, x5, a2 LD a2, 8 * SIZE(AO2) MADD y3, y3, x6, a3 LD a3, 9 * SIZE(AO1) MADD y4, y4, x6, a4 LD a4, 9 * SIZE(AO2) LD x5, 12 * SIZE(XX) LD x6, 13 * SIZE(XX) MADD y1, y1, x7, a5 LD a5,10 * SIZE(AO1) MADD y2, y2, x7, a6 LD a6,10 * SIZE(AO2) MADD y3, y3, x8, a7 LD a7,11 * SIZE(AO1) MADD y4, y4, x8, a8 LD a8,11 * SIZE(AO2) LD x7, 14 * SIZE(XX) LD x8, 15 * SIZE(XX) daddiu I, I, -1 daddiu XX, XX, 8 * SIZE daddiu AO1, AO1, 8 * SIZE bgtz I, .L12 daddiu AO2, AO2, 8 * SIZE .align 3 .L13: MADD y1, y1, x1, a1 LD a1, 4 * SIZE(AO1) MADD y2, y2, x1, a2 LD a2, 4 * SIZE(AO2) MADD y3, y3, x2, a3 LD a3, 5 * SIZE(AO1) MADD y4, y4, x2, a4 LD a4, 5 * SIZE(AO2) MADD y1, y1, x3, a5 LD a5, 6 * SIZE(AO1) MADD y2, y2, x3, a6 LD a6, 6 * SIZE(AO2) MADD y3, y3, x4, a7 LD a7, 7 * SIZE(AO1) MADD y4, y4, x4, a8 LD a8, 7 * SIZE(AO2) MADD y1, y1, x5, a1 MADD y2, y2, x5, a2 MADD y3, y3, x6, a3 MADD y4, y4, x6, a4 MADD y1, y1, x7, a5 daddiu XX, XX, 8 * SIZE MADD 
y2, y2, x7, a6 daddiu AO1, AO1, 8 * SIZE MADD y3, y3, x8, a7 daddiu AO2, AO2, 8 * SIZE MADD y4, y4, x8, a8 NOP .align 3 .L15: andi I, M, 4 NOP blez I, .L17 NOP LD a1, 0 * SIZE(AO1) LD x1, 0 * SIZE(XX) LD a2, 0 * SIZE(AO2) LD a3, 1 * SIZE(AO1) LD x2, 1 * SIZE(XX) LD a4, 1 * SIZE(AO2) LD a5, 2 * SIZE(AO1) LD x3, 2 * SIZE(XX) MADD y1, y1, x1, a1 LD a6, 2 * SIZE(AO2) MADD y2, y2, x1, a2 LD a7, 3 * SIZE(AO1) MADD y3, y3, x2, a3 LD x4, 3 * SIZE(XX) MADD y4, y4, x2, a4 LD a8, 3 * SIZE(AO2) MADD y1, y1, x3, a5 MADD y2, y2, x3, a6 daddiu XX, XX, 4 * SIZE MADD y3, y3, x4, a7 daddiu AO1, AO1, 4 * SIZE MADD y4, y4, x4, a8 daddiu AO2, AO2, 4 * SIZE .align 3 .L17: andi I, M, 3 ADD y1, y1, y3 blez I, .L19 ADD y2, y2, y4 .align 3 .L18: LD x1, 0 * SIZE(XX) LD a1, 0 * SIZE(AO1) LD a2, 0 * SIZE(AO2) daddiu I, I, -1 daddiu XX, XX, 1 * SIZE daddiu AO1, AO1, 1 * SIZE daddiu AO2, AO2, 1 * SIZE MADD y1, y1, x1, a1 bgtz I, .L18 MADD y2, y2, x1, a2 .align 3 .L19: LD a1, 0 * SIZE(Y) daddu Y, Y, INCY LD a2, 0 * SIZE(Y) daddu Y, Y, INCY MADD a1, a1, ALPHA, y1 daddiu J, J, -1 MADD a2, a2, ALPHA, y2 MTC $0, y1 ST a1, 0 * SIZE(YY) daddu YY, YY, INCY ST a2, 0 * SIZE(YY) bgtz J, .L11 daddu YY, YY, INCY .align 3 .L20: andi J, N, 1 MOV y3, y1 blez J, .L999 move AO1, A dsra I, M, 3 NOP blez I, .L25 move XX, XORIG LD a1, 0 * SIZE(AO1) LD x1, 0 * SIZE(XX) LD a3, 1 * SIZE(AO1) LD x2, 1 * SIZE(XX) LD a5, 2 * SIZE(AO1) LD x3, 2 * SIZE(XX) LD a7, 3 * SIZE(AO1) LD x4, 3 * SIZE(XX) LD x5, 4 * SIZE(XX) LD x6, 5 * SIZE(XX) LD x7, 6 * SIZE(XX) daddiu I, I, -1 blez I, .L23 LD x8, 7 * SIZE(XX) .align 3 .L22: MADD y1, y1, x1, a1 LD a1, 4 * SIZE(AO1) MADD y3, y3, x2, a3 LD a3, 5 * SIZE(AO1) LD x1, 8 * SIZE(XX) LD x2, 9 * SIZE(XX) MADD y1, y1, x3, a5 LD a5, 6 * SIZE(AO1) MADD y3, y3, x4, a7 LD a7, 7 * SIZE(AO1) LD x3, 10 * SIZE(XX) LD x4, 11 * SIZE(XX) MADD y1, y1, x5, a1 LD a1, 8 * SIZE(AO1) MADD y3, y3, x6, a3 LD a3, 9 * SIZE(AO1) LD x5, 12 * SIZE(XX) LD x6, 13 * SIZE(XX) MADD y1, y1, x7, a5 LD a5, 10 * SIZE(AO1) MADD y3, y3, x8, a7 LD a7, 11 * SIZE(AO1) LD x7, 14 * SIZE(XX) LD x8, 15 * SIZE(XX) daddiu I, I, -1 daddiu XX, XX, 8 * SIZE bgtz I, .L22 daddiu AO1, AO1, 8 * SIZE .align 3 .L23: MADD y1, y1, x1, a1 LD a1, 4 * SIZE(AO1) MADD y3, y3, x2, a3 LD a3, 5 * SIZE(AO1) MADD y1, y1, x3, a5 LD a5, 6 * SIZE(AO1) MADD y3, y3, x4, a7 LD a7, 7 * SIZE(AO1) MADD y1, y1, x5, a1 MADD y3, y3, x6, a3 MADD y1, y1, x7, a5 MADD y3, y3, x8, a7 daddiu XX, XX, 8 * SIZE daddiu AO1, AO1, 8 * SIZE .align 3 .L25: andi I, M, 4 NOP blez I, .L27 NOP LD a1, 0 * SIZE(AO1) LD x1, 0 * SIZE(XX) LD a3, 1 * SIZE(AO1) LD x2, 1 * SIZE(XX) LD a5, 2 * SIZE(AO1) LD x3, 2 * SIZE(XX) MADD y1, y1, x1, a1 LD a7, 3 * SIZE(AO1) MADD y3, y3, x2, a3 LD x4, 3 * SIZE(XX) MADD y1, y1, x3, a5 daddiu XX, XX, 4 * SIZE MADD y3, y3, x4, a7 daddiu AO1, AO1, 4 * SIZE .align 3 .L27: andi I, M, 3 ADD y1, y1, y3 blez I, .L29 NOP .align 3 .L28: LD x1, 0 * SIZE(XX) LD a1, 0 * SIZE(AO1) daddiu I, I, -1 daddiu XX, XX, 1 * SIZE daddiu AO1, AO1, 1 * SIZE bgtz I, .L28 MADD y1, y1, x1, a1 .align 3 .L29: LD a1, 0 * SIZE(Y) daddu Y, Y, INCY MADD a1, a1, ALPHA, y1 NOP ST a1, 0 * SIZE(YY) daddu YY, YY, INCY .align 3 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) #ifndef __64BIT__ ldc1 $f20, 16($sp) #endif j $31 #ifdef __64BIT__ daddiu $sp, $sp, 16 #else daddiu $sp, $sp, 32 #endif EPILOGUE OpenBLAS-0.2.20/kernel/mips64/gemv_t_loongson3a.c000066400000000000000000000046471313527062700213500ustar00rootroot00000000000000#include "common.h" //These are auto-tuning codes on Loongson-3A platform. 
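//
// What the kernel computes: for each column j of A (column-major, leading
// dimension LDA) it accumulates ALPHA times the dot product of that column
// with X into one element of Y, stepping through Y by INCY, i.e. the
// transposed matrix-vector update y += alpha * A^T * x.  The tuning knobs
// below are a software prefetch issued a fixed distance (fahead elements)
// ahead of the current column of A and of X, and a small unroll
// (spec_unroll) of the inner dot-product loop.  The spec_* macros are the
// INCX == 1 fast path; the norm_* macros keep a separate cursor h for
// strided X.  A plain-C sketch of what the unrolled loops compute, with the
// prefetching and unrolling stripped out:
//
//   for (BLASLONG j = 0, k = 0; j < N; j++, k += INCY)
//     for (BLASLONG i = 0, h = 0; i < M; i++, h += INCX)
//       Y[k] += ALPHA * A[LDA * j + i] * X[h];
//
//   (ALPHA == 1 gets its own branch so the multiply disappears, and the
//   whole call returns immediately when ALPHA is zero.)
//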
//#define prefetch(x) __builtin_prefetch(x) //#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) #define spec_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[i]; i++;} while(0) #define spec_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[i]; i++;} while(0) #define norm_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[h]; i++; h += INCX;} while(0) #define norm_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[h]; i++; h += INCX;} while(0) int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { if(!ALPHA) return 0; // if(INCX < 0) // INCX = -INCX; // if(INCY < 0) // INCY = -INCY; BLASLONG fahead = 30; BLASLONG spec_unroll = 3; BLASLONG tMQ = M - M % spec_unroll; BLASLONG j = 0, k = 0; if(ALPHA == 1) { if(INCX == 1) { for(; likely(j < N); j++, k += INCY) { BLASLONG i = 0; for(; likely(i < tMQ);) { prefetch(A[LDA * j + i + fahead]); prefetch(X[i + fahead]); /*loop_mark*/ spec_loop_alpha1; /*loop_mark*/ spec_loop_alpha1; /*loop_mark*/ spec_loop_alpha1; } for(; likely(i < M);) { spec_loop_alpha1; } } } else { for(; likely(j < N); j++, k += INCY) { BLASLONG i = 0, h = 0; for(; likely(i < tMQ);) { prefetch(A[LDA * j + i + fahead]); prefetch(X[h + fahead]); /*loop_mark*/ norm_loop_alpha1; /*loop_mark*/ norm_loop_alpha1; /*loop_mark*/ norm_loop_alpha1; } for(; likely(i < M);) { norm_loop_alpha1; } } } } else { if(INCX == 1) { for(; likely(j < N); j++, k += INCY) { BLASLONG i = 0; for(; likely(i < tMQ);) { prefetch(A[LDA * j + i + fahead]); prefetch(X[i + fahead]); /*loop_mark*/ spec_loop; /*loop_mark*/ spec_loop; /*loop_mark*/ spec_loop; } for(; likely(i < M);) { spec_loop; } } } else { for(; likely(j < N); j++, k += INCY) { BLASLONG i = 0, h = 0; for(; likely(i < tMQ);) { prefetch(A[LDA * j + i + fahead]); prefetch(X[h + fahead]); /*loop_mark*/ norm_loop; /*loop_mark*/ norm_loop; /*loop_mark*/ norm_loop; } for(; likely(i < M);) { norm_loop; } } } } return 0; } OpenBLAS-0.2.20/kernel/mips64/iamax.S000066400000000000000000000136571313527062700170050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define I $3 #define TEMP $7 #define a1 $f4 #define a2 $f5 #define a3 $f6 #define a4 $f7 #define a5 $f8 #define a6 $f9 #define a7 $f10 #define a8 $f11 #define t1 $f12 #define t2 $f13 #define t3 $f14 #define t4 $f15 #define s1 $f0 #define s2 $f1 #define s3 $f2 #define s4 $f3 #define x1 $2 #define x2 $8 #define x3 $9 #define x4 $10 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif blez N, .L999 li x1, 0 blez INCX, .L999 dsll INCX, INCX, BASE_SHIFT LD a1, 0 * SIZE(X) daddiu N, N, -1 blez N, .L999 li x1, 1 FABS s1, a1 daddu X, X, INCX FABS s2, a1 li x2, 1 FABS s3, a1 dsra I, N, 3 FABS s4, a1 li x3, 1 li TEMP, 2 blez I, .L15 li x4, 1 LD a1, 0 * SIZE(X) daddu X, X, INCX LD a2, 0 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) daddu X, X, INCX LD a4, 0 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) daddu X, X, INCX LD a6, 0 * SIZE(X) daddu X, X, INCX LD a7, 0 * SIZE(X) daddu X, X, INCX LD a8, 0 * SIZE(X) daddiu I, I, -1 blez I, .L13 daddu X, X, INCX .align 3 .L12: FABS t1, a1 LD a1, 0 * SIZE(X) FABS t2, a2 daddu X, X, INCX FABS t3, a3 LD a2, 0 * SIZE(X) FABS t4, a4 daddu X, X, INCX CMPLT $fcc0, s1, t1 LD a3, 0 * SIZE(X) CMPLT $fcc1, s2, t2 daddu X, X, INCX CMPLT $fcc2, s3, t3 LD a4, 0 * SIZE(X) CMPLT $fcc3, s4, t4 daddu X, X, INCX CMOVT s1, t1, $fcc0 movt x1, TEMP, $fcc0 CMOVT s2, t2, $fcc1 movt x2, TEMP, $fcc1 CMOVT s3, t3, $fcc2 movt x3, TEMP, $fcc2 CMOVT s4, t4, $fcc3 movt x4, TEMP, $fcc3 daddiu TEMP, TEMP, 4 daddiu I, I, -1 FABS t1, a5 LD a5, 0 * SIZE(X) FABS t2, a6 daddu X, X, INCX FABS t3, a7 LD a6, 0 * SIZE(X) FABS t4, a8 daddu X, X, INCX CMPLT $fcc0, s1, t1 LD a7, 0 * SIZE(X) CMPLT $fcc1, s2, t2 daddu X, X, INCX CMPLT $fcc2, s3, t3 LD a8, 0 * SIZE(X) CMPLT $fcc3, s4, t4 daddu X, X, INCX CMOVT s1, t1, $fcc0 movt x1, TEMP, $fcc0 CMOVT s2, t2, $fcc1 movt x2, TEMP, $fcc1 CMOVT s3, t3, $fcc2 movt x3, TEMP, $fcc2 CMOVT s4, t4, $fcc3 movt x4, TEMP, $fcc3 bgtz I, .L12 daddiu TEMP, TEMP, 4 .align 3 .L13: FABS t1, a1 FABS t2, a2 FABS t3, a3 FABS t4, a4 CMPLT $fcc0, s1, t1 CMPLT $fcc1, s2, t2 CMPLT $fcc2, s3, t3 CMPLT $fcc3, s4, t4 CMOVT s1, t1, $fcc0 movt x1, TEMP, $fcc0 CMOVT s2, t2, $fcc1 movt x2, TEMP, $fcc1 CMOVT s3, t3, $fcc2 movt x3, TEMP, $fcc2 CMOVT s4, t4, $fcc3 movt x4, TEMP, $fcc3 FABS t1, a5 daddiu TEMP, TEMP, 4 FABS t2, a6 NOP FABS t3, a7 FABS t4, a8 CMPLT $fcc0, s1, t1 CMPLT $fcc1, s2, t2 CMPLT $fcc2, s3, t3 CMPLT $fcc3, s4, t4 CMOVT s1, t1, $fcc0 movt x1, TEMP, $fcc0 CMOVT s2, t2, $fcc1 movt x2, TEMP, $fcc1 CMOVT s3, t3, $fcc2 movt x3, TEMP, $fcc2 CMOVT s4, t4, $fcc3 movt x4, TEMP, $fcc3 
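/* Each lane records the group's base index TEMP when it finds a new
   maximum; lanes 2..4 actually examined the elements at TEMP+1, TEMP+2 and
   TEMP+3, so their indices x2..x4 get those offsets added below before the
   four partial results are merged at .L998. */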
daddiu TEMP, TEMP, 4 daddiu x2, x2, 1 daddiu x3, x3, 2 daddiu x4, x4, 3 .align 3 .L15: andi I, N, 7 blez I, .L998 NOP .align 3 .L16: LD a1, 0 * SIZE(X) daddu X, X, INCX FABS t1, a1 daddiu I, I, -1 CMPLT $fcc0, s1, t1 NOP CMOVT s1, t1, $fcc0 movt x1, TEMP, $fcc0 bgtz I, .L16 daddiu TEMP, TEMP, 1 .align 3 .L998: CMPLT $fcc0, s1, s2 CMPLT $fcc1, s3, s4 CMOVT s1, s2, $fcc0 movt x1, x2, $fcc0 CMOVT s3, s4, $fcc1 movt x3, x4, $fcc1 CMPLT $fcc0, s1, s3 CMOVT s1, s3, $fcc0 movt x1, x3, $fcc0 .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/iamin.S000066400000000000000000000136571313527062700170030ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define I $3 #define TEMP $7 #define a1 $f4 #define a2 $f5 #define a3 $f6 #define a4 $f7 #define a5 $f8 #define a6 $f9 #define a7 $f10 #define a8 $f11 #define t1 $f12 #define t2 $f13 #define t3 $f14 #define t4 $f15 #define s1 $f0 #define s2 $f1 #define s3 $f2 #define s4 $f3 #define x1 $2 #define x2 $8 #define x3 $9 #define x4 $10 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif blez N, .L999 li x1, 0 blez INCX, .L999 dsll INCX, INCX, BASE_SHIFT LD a1, 0 * SIZE(X) daddiu N, N, -1 blez N, .L999 li x1, 1 FABS s1, a1 daddu X, X, INCX FABS s2, a1 li x2, 1 FABS s3, a1 dsra I, N, 3 FABS s4, a1 li x3, 1 li TEMP, 2 blez I, .L15 li x4, 1 LD a1, 0 * SIZE(X) daddu X, X, INCX LD a2, 0 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) daddu X, X, INCX LD a4, 0 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) daddu X, X, INCX LD a6, 0 * SIZE(X) daddu X, X, INCX LD a7, 0 * SIZE(X) daddu X, X, INCX LD a8, 0 * SIZE(X) daddiu I, I, -1 blez I, .L13 daddu X, X, INCX .align 3 .L12: FABS t1, a1 LD a1, 0 * SIZE(X) FABS t2, a2 daddu X, X, INCX FABS t3, a3 LD a2, 0 * SIZE(X) FABS t4, a4 daddu X, X, INCX CMPLT $fcc0, t1, s1 LD a3, 0 * SIZE(X) CMPLT $fcc1, t2, s2 daddu X, X, INCX CMPLT $fcc2, t3, s3 LD a4, 0 * SIZE(X) CMPLT $fcc3, t4, s4 daddu X, X, INCX CMOVT s1, t1, $fcc0 movt x1, TEMP, $fcc0 CMOVT s2, t2, $fcc1 movt x2, TEMP, $fcc1 CMOVT s3, t3, $fcc2 movt x3, TEMP, $fcc2 CMOVT s4, t4, $fcc3 movt x4, TEMP, $fcc3 daddiu TEMP, TEMP, 4 daddiu I, I, -1 FABS t1, a5 LD a5, 0 * SIZE(X) FABS t2, a6 daddu X, X, INCX FABS t3, a7 LD a6, 0 * SIZE(X) FABS t4, a8 daddu X, X, INCX CMPLT $fcc0, t1, s1 LD a7, 0 * SIZE(X) CMPLT $fcc1, t2, s2 daddu X, X, INCX CMPLT $fcc2, t3, s3 LD a8, 0 * SIZE(X) CMPLT $fcc3, t4, s4 daddu X, X, INCX CMOVT s1, t1, $fcc0 movt x1, TEMP, $fcc0 CMOVT s2, t2, $fcc1 movt x2, TEMP, $fcc1 CMOVT s3, t3, $fcc2 movt x3, TEMP, $fcc2 CMOVT s4, t4, $fcc3 movt x4, TEMP, $fcc3 bgtz I, .L12 daddiu TEMP, TEMP, 4 .align 3 .L13: FABS t1, a1 FABS t2, a2 FABS t3, a3 FABS t4, a4 CMPLT $fcc0, t1, s1 CMPLT $fcc1, t2, s2 CMPLT $fcc2, t3, s3 CMPLT $fcc3, t4, s4 CMOVT s1, t1, $fcc0 movt x1, TEMP, $fcc0 CMOVT s2, t2, $fcc1 movt x2, TEMP, $fcc1 CMOVT s3, t3, $fcc2 movt x3, TEMP, $fcc2 CMOVT s4, t4, $fcc3 movt x4, TEMP, $fcc3 FABS t1, a5 daddiu TEMP, TEMP, 4 FABS t2, a6 NOP FABS t3, a7 FABS t4, a8 CMPLT $fcc0, t1, s1 CMPLT $fcc1, t2, s2 CMPLT $fcc2, t3, s3 CMPLT $fcc3, t4, s4 CMOVT s1, t1, $fcc0 movt x1, TEMP, $fcc0 CMOVT s2, t2, $fcc1 movt x2, TEMP, $fcc1 CMOVT s3, t3, $fcc2 movt x3, TEMP, $fcc2 CMOVT s4, t4, $fcc3 movt x4, TEMP, $fcc3 daddiu TEMP, TEMP, 4 daddiu x2, x2, 1 daddiu x3, x3, 2 daddiu x4, x4, 3 .align 3 .L15: andi I, N, 7 blez I, .L998 NOP .align 3 .L16: LD a1, 0 * SIZE(X) daddu X, X, INCX FABS t1, a1 daddiu I, I, -1 CMPLT $fcc0, t1, s1 NOP CMOVT s1, t1, $fcc0 movt x1, TEMP, $fcc0 bgtz I, .L16 daddiu TEMP, TEMP, 1 .align 3 .L998: CMPLT $fcc0, s2, s1 CMPLT $fcc1, s4, s3 CMOVT s1, s2, $fcc0 movt x1, x2, $fcc0 CMOVT s3, s4, $fcc1 movt x3, x4, $fcc1 CMPLT $fcc0, s3, s1 CMOVT s1, s3, $fcc0 movt x1, x3, $fcc0 .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/imax.S000066400000000000000000000132721313527062700166350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define I $3 #define TEMP $7 #define a1 $f4 #define a2 $f5 #define a3 $f6 #define a4 $f7 #define a5 $f8 #define a6 $f9 #define a7 $f10 #define a8 $f11 #define t1 $f12 #define t2 $f13 #define t3 $f14 #define t4 $f15 #define s1 $f0 #define s2 $f1 #define s3 $f2 #define s4 $f3 #define x1 $2 #define x2 $8 #define x3 $9 #define x4 $10 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif blez N, .L999 li x1, 0 blez INCX, .L999 dsll INCX, INCX, BASE_SHIFT LD s1, 0 * SIZE(X) daddiu N, N, -1 blez N, .L999 li x1, 1 daddu X, X, INCX MOV s2, s1 li x2, 1 MOV s3, s1 dsra I, N, 3 MOV s4, s1 li x3, 1 li TEMP, 2 blez I, .L15 li x4, 1 LD a1, 0 * SIZE(X) daddu X, X, INCX LD a2, 0 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) daddu X, X, INCX LD a4, 0 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) daddu X, X, INCX LD a6, 0 * SIZE(X) daddiu I, I, -1 blez I, .L13 daddu X, X, INCX .align 3 .L12: CMPLT $fcc0, s1, a1 LD a7, 0 * SIZE(X) CMPLT $fcc1, s2, a2 daddu X, X, INCX CMPLT $fcc2, s3, a3 LD a8, 0 * SIZE(X) CMPLT $fcc3, s4, a4 daddu X, X, INCX CMOVT s1, a1, $fcc0 movt x1, TEMP, $fcc0 CMOVT s2, a2, $fcc1 movt x2, TEMP, $fcc1 LD a1, 0 * SIZE(X) daddu X, X, INCX LD a2, 0 * SIZE(X) daddu X, X, INCX CMOVT s3, a3, $fcc2 movt x3, TEMP, $fcc2 CMOVT s4, a4, $fcc3 movt x4, TEMP, $fcc3 daddiu TEMP, TEMP, 4 daddiu I, I, -1 CMPLT $fcc0, s1, a5 LD a3, 0 * SIZE(X) CMPLT $fcc1, s2, a6 daddu X, X, INCX CMPLT $fcc2, s3, a7 LD a4, 0 * SIZE(X) CMPLT $fcc3, s4, a8 daddu X, X, INCX CMOVT s1, a5, $fcc0 movt x1, TEMP, $fcc0 CMOVT s2, a6, $fcc1 movt x2, TEMP, $fcc1 CMOVT s3, a7, $fcc2 movt x3, TEMP, $fcc2 CMOVT s4, a8, $fcc3 movt x4, TEMP, $fcc3 LD a5, 0 * SIZE(X) daddu X, X, INCX LD a6, 0 * SIZE(X) daddu X, X, INCX bgtz I, .L12 
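/* The daddiu below sits in the delay slot of the bgtz above, so it runs on
   every pass through the loop, advancing the candidate element index TEMP
   past the four elements just compared. */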
daddiu TEMP, TEMP, 4 .align 3 .L13: CMPLT $fcc0, s1, a1 LD a7, 0 * SIZE(X) CMPLT $fcc1, s2, a2 daddu X, X, INCX CMPLT $fcc2, s3, a3 LD a8, 0 * SIZE(X) CMPLT $fcc3, s4, a4 daddu X, X, INCX CMOVT s1, a1, $fcc0 movt x1, TEMP, $fcc0 CMOVT s2, a2, $fcc1 movt x2, TEMP, $fcc1 CMOVT s3, a3, $fcc2 movt x3, TEMP, $fcc2 CMOVT s4, a4, $fcc3 movt x4, TEMP, $fcc3 CMPLT $fcc0, s1, a5 daddiu TEMP, TEMP, 4 CMPLT $fcc1, s2, a6 NOP CMPLT $fcc2, s3, a7 CMPLT $fcc3, s4, a8 CMOVT s1, a5, $fcc0 movt x1, TEMP, $fcc0 CMOVT s2, a6, $fcc1 movt x2, TEMP, $fcc1 CMOVT s3, a7, $fcc2 movt x3, TEMP, $fcc2 CMOVT s4, a8, $fcc3 movt x4, TEMP, $fcc3 daddiu TEMP, TEMP, 4 daddiu x2, x2, 1 daddiu x3, x3, 2 daddiu x4, x4, 3 .align 3 .L15: andi I, N, 7 blez I, .L998 NOP .align 3 .L16: LD a1, 0 * SIZE(X) daddu X, X, INCX daddiu I, I, -1 CMPLT $fcc0, s1, a1 NOP CMOVT s1, a1, $fcc0 movt x1, TEMP, $fcc0 bgtz I, .L16 daddiu TEMP, TEMP, 1 .align 3 .L998: CMPLT $fcc0, s1, s2 CMPLT $fcc1, s3, s4 CMOVT s1, s2, $fcc0 movt x1, x2, $fcc0 CMOVT s3, s4, $fcc1 movt x3, x4, $fcc1 CMPLT $fcc0, s1, s3 CMOVT s1, s3, $fcc0 movt x1, x3, $fcc0 .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/imin.S000066400000000000000000000132721313527062700166330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define I $3 #define TEMP $7 #define a1 $f4 #define a2 $f5 #define a3 $f6 #define a4 $f7 #define a5 $f8 #define a6 $f9 #define a7 $f10 #define a8 $f11 #define t1 $f12 #define t2 $f13 #define t3 $f14 #define t4 $f15 #define s1 $f0 #define s2 $f1 #define s3 $f2 #define s4 $f3 #define x1 $2 #define x2 $8 #define x3 $9 #define x4 $10 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif blez N, .L999 li x1, 0 blez INCX, .L999 dsll INCX, INCX, BASE_SHIFT LD s1, 0 * SIZE(X) daddiu N, N, -1 blez N, .L999 li x1, 1 daddu X, X, INCX MOV s2, s1 li x2, 1 MOV s3, s1 dsra I, N, 3 MOV s4, s1 li x3, 1 li TEMP, 2 blez I, .L15 li x4, 1 LD a1, 0 * SIZE(X) daddu X, X, INCX LD a2, 0 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) daddu X, X, INCX LD a4, 0 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) daddu X, X, INCX LD a6, 0 * SIZE(X) daddiu I, I, -1 blez I, .L13 daddu X, X, INCX .align 3 .L12: CMPLT $fcc0, a1, s1 LD a7, 0 * SIZE(X) CMPLT $fcc1, a2, s2 daddu X, X, INCX CMPLT $fcc2, a3, s3 LD a8, 0 * SIZE(X) CMPLT $fcc3, a4, s4 daddu X, X, INCX CMOVT s1, a1, $fcc0 movt x1, TEMP, $fcc0 CMOVT s2, a2, $fcc1 movt x2, TEMP, $fcc1 LD a1, 0 * SIZE(X) daddu X, X, INCX LD a2, 0 * SIZE(X) daddu X, X, INCX CMOVT s3, a3, $fcc2 movt x3, TEMP, $fcc2 CMOVT s4, a4, $fcc3 movt x4, TEMP, $fcc3 daddiu TEMP, TEMP, 4 daddiu I, I, -1 CMPLT $fcc0, a5, s1 LD a3, 0 * SIZE(X) CMPLT $fcc1, a6, s2 daddu X, X, INCX CMPLT $fcc2, a7, s3 LD a4, 0 * SIZE(X) CMPLT $fcc3, a8, s4 daddu X, X, INCX CMOVT s1, a5, $fcc0 movt x1, TEMP, $fcc0 CMOVT s2, a6, $fcc1 movt x2, TEMP, $fcc1 CMOVT s3, a7, $fcc2 movt x3, TEMP, $fcc2 CMOVT s4, a8, $fcc3 movt x4, TEMP, $fcc3 LD a5, 0 * SIZE(X) daddu X, X, INCX LD a6, 0 * SIZE(X) daddu X, X, INCX bgtz I, .L12 daddiu TEMP, TEMP, 4 .align 3 .L13: CMPLT $fcc0, a1, s1 LD a7, 0 * SIZE(X) CMPLT $fcc1, a2, s2 daddu X, X, INCX CMPLT $fcc2, a3, s3 LD a8, 0 * SIZE(X) CMPLT $fcc3, a4, s4 daddu X, X, INCX CMOVT s1, a1, $fcc0 movt x1, TEMP, $fcc0 CMOVT s2, a2, $fcc1 movt x2, TEMP, $fcc1 CMOVT s3, a3, $fcc2 movt x3, TEMP, $fcc2 CMOVT s4, a4, $fcc3 movt x4, TEMP, $fcc3 CMPLT $fcc0, a5, s1 daddiu TEMP, TEMP, 4 CMPLT $fcc1, a6, s2 NOP CMPLT $fcc2, a7, s3 CMPLT $fcc3, a8, s4 CMOVT s1, a5, $fcc0 movt x1, TEMP, $fcc0 CMOVT s2, a6, $fcc1 movt x2, TEMP, $fcc1 CMOVT s3, a7, $fcc2 movt x3, TEMP, $fcc2 CMOVT s4, a8, $fcc3 movt x4, TEMP, $fcc3 daddiu TEMP, TEMP, 4 daddiu x2, x2, 1 daddiu x3, x3, 2 daddiu x4, x4, 3 .align 3 .L15: andi I, N, 7 blez I, .L998 NOP .align 3 .L16: LD a1, 0 * SIZE(X) daddu X, X, INCX daddiu I, I, -1 CMPLT $fcc0, a1, s1 NOP CMOVT s1, a1, $fcc0 movt x1, TEMP, $fcc0 bgtz I, .L16 daddiu TEMP, TEMP, 1 .align 3 .L998: CMPLT $fcc0, s2, s1 CMPLT $fcc1, s4, s3 CMOVT s1, s2, $fcc0 movt x1, x2, $fcc0 CMOVT s3, s4, $fcc1 movt x3, x4, $fcc1 CMPLT $fcc0, s3, s1 CMOVT s1, s3, $fcc0 movt x1, x3, $fcc0 .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/izamax.S000066400000000000000000000130451313527062700171660ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define I $3 #define TEMP $7 #define a1 $f4 #define a2 $f5 #define a3 $f6 #define a4 $f7 #define a5 $f8 #define a6 $f9 #define a7 $f10 #define a8 $f11 #define t1 $f12 #define t2 $f13 #define t3 $f14 #define t4 $f15 #define t5 $f16 #define t6 $f17 #define t7 $f18 #define t8 $f19 #define s1 $f0 #define s2 $f1 #define s3 $f2 #define s4 $f3 #define x1 $2 #define x2 $8 #define x3 $9 #define x4 $10 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif blez N, .L999 li x1, 0 blez INCX, .L999 dsll INCX, INCX, ZBASE_SHIFT LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) FABS t1, a1 FABS t2, a2 ADD s1, t1, t2 ADD s2, t1, t2 ADD s3, t1, t2 ADD s4, t1, t2 daddiu N, N, -1 blez N, .L999 li x1, 1 daddu X, X, INCX li x2, 1 dsra I, N, 2 li x3, 1 li TEMP, 2 blez I, .L15 li x4, 1 LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) LD a4, 1 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) LD a6, 1 * SIZE(X) daddu X, X, INCX LD a7, 0 * SIZE(X) LD a8, 1 * SIZE(X) daddiu I, I, -1 blez I, .L13 daddu X, X, INCX .align 3 .L12: FABS t1, a1 LD a1, 0 * SIZE(X) FABS t2, a2 LD a2, 1 * SIZE(X) FABS t3, a3 daddu X, X, INCX FABS t4, a4 NOP FABS t5, a5 LD a3, 0 * SIZE(X) FABS t6, a6 LD a4, 1 * SIZE(X) FABS t7, a7 daddu X, X, INCX FABS t8, a8 NOP ADD t1, t1, t2 LD a5, 0 * SIZE(X) ADD t3, t3, t4 LD a6, 1 * SIZE(X) ADD t5, t5, t6 daddu X, X, INCX ADD t7, t7, t8 NOP CMPLT $fcc0, s1, t1 LD a7, 0 * SIZE(X) CMPLT $fcc1, s2, t3 LD a8, 1 * SIZE(X) CMPLT $fcc2, s3, t5 daddu X, X, INCX CMPLT $fcc3, s4, t7 daddiu I, I, -1 CMOVT s1, t1, $fcc0 movt x1, TEMP, $fcc0 CMOVT s2, t3, $fcc1 movt x2, TEMP, $fcc1 CMOVT s3, t5, $fcc2 movt x3, TEMP, $fcc2 CMOVT s4, t7, $fcc3 movt x4, TEMP, $fcc3 bgtz I, .L12 daddiu TEMP, TEMP, 4 .align 3 .L13: FABS t1, a1 FABS t2, a2 FABS t3, a3 FABS t4, a4 FABS t5, a5 FABS t6, a6 FABS t7, a7 FABS t8, a8 ADD t1, t1, t2 ADD t3, t3, t4 ADD t5, t5, t6 ADD t7, t7, t8 CMPLT $fcc0, s1, t1 
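/* t1, t3, t5 and t7 now hold |Re| + |Im| of the last four complex elements
   from the unrolled loop; each CMPLT/CMOVT pair keeps the larger of it and
   the running maximum s1..s4, with movt capturing the position from TEMP. */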
CMPLT $fcc1, s2, t3 CMPLT $fcc2, s3, t5 CMPLT $fcc3, s4, t7 CMOVT s1, t1, $fcc0 movt x1, TEMP, $fcc0 CMOVT s2, t3, $fcc1 movt x2, TEMP, $fcc1 CMOVT s3, t5, $fcc2 movt x3, TEMP, $fcc2 CMOVT s4, t7, $fcc3 movt x4, TEMP, $fcc3 daddiu TEMP, TEMP, 4 daddiu x2, x2, 1 daddiu x3, x3, 2 daddiu x4, x4, 3 .align 3 .L15: andi I, N, 3 blez I, .L998 NOP .align 3 .L16: LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) daddu X, X, INCX FABS t1, a1 FABS t2, a2 ADD t1, t1, t2 daddiu I, I, -1 CMPLT $fcc0, s1, t1 NOP CMOVT s1, t1, $fcc0 movt x1, TEMP, $fcc0 bgtz I, .L16 daddiu TEMP, TEMP, 1 .align 3 .L998: CMPLT $fcc0, s1, s2 CMPLT $fcc1, s3, s4 CMOVT s1, s2, $fcc0 movt x1, x2, $fcc0 CMOVT s3, s4, $fcc1 movt x3, x4, $fcc1 CMPLT $fcc0, s1, s3 CMOVT s1, s3, $fcc0 movt x1, x3, $fcc0 .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/izamin.S000066400000000000000000000130451313527062700171640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define I $3 #define TEMP $7 #define a1 $f4 #define a2 $f5 #define a3 $f6 #define a4 $f7 #define a5 $f8 #define a6 $f9 #define a7 $f10 #define a8 $f11 #define t1 $f12 #define t2 $f13 #define t3 $f14 #define t4 $f15 #define t5 $f16 #define t6 $f17 #define t7 $f18 #define t8 $f19 #define s1 $f0 #define s2 $f1 #define s3 $f2 #define s4 $f3 #define x1 $2 #define x2 $8 #define x3 $9 #define x4 $10 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif blez N, .L999 li x1, 0 blez INCX, .L999 dsll INCX, INCX, ZBASE_SHIFT LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) FABS t1, a1 FABS t2, a2 ADD s1, t1, t2 ADD s2, t1, t2 ADD s3, t1, t2 ADD s4, t1, t2 daddiu N, N, -1 blez N, .L999 li x1, 1 daddu X, X, INCX li x2, 1 dsra I, N, 2 li x3, 1 li TEMP, 2 blez I, .L15 li x4, 1 LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) LD a4, 1 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) LD a6, 1 * SIZE(X) daddu X, X, INCX LD a7, 0 * SIZE(X) LD a8, 1 * SIZE(X) daddiu I, I, -1 blez I, .L13 daddu X, X, INCX .align 3 .L12: FABS t1, a1 LD a1, 0 * SIZE(X) FABS t2, a2 LD a2, 1 * SIZE(X) FABS t3, a3 daddu X, X, INCX FABS t4, a4 NOP FABS t5, a5 LD a3, 0 * SIZE(X) FABS t6, a6 LD a4, 1 * SIZE(X) FABS t7, a7 daddu X, X, INCX FABS t8, a8 NOP ADD t1, t1, t2 LD a5, 0 * SIZE(X) ADD t3, t3, t4 LD a6, 1 * SIZE(X) ADD t5, t5, t6 daddu X, X, INCX ADD t7, t7, t8 NOP CMPLT $fcc0, t1, s1 LD a7, 0 * SIZE(X) CMPLT $fcc1, t3, s2 LD a8, 1 * SIZE(X) CMPLT $fcc2, t5, s3 daddu X, X, INCX CMPLT $fcc3, t7, s4 daddiu I, I, -1 CMOVT s1, t1, $fcc0 movt x1, TEMP, $fcc0 CMOVT s2, t3, $fcc1 movt x2, TEMP, $fcc1 CMOVT s3, t5, $fcc2 movt x3, TEMP, $fcc2 CMOVT s4, t7, $fcc3 movt x4, TEMP, $fcc3 bgtz I, .L12 daddiu TEMP, TEMP, 4 .align 3 .L13: FABS t1, a1 FABS t2, a2 FABS t3, a3 FABS t4, a4 FABS t5, a5 FABS t6, a6 FABS t7, a7 FABS t8, a8 ADD t1, t1, t2 ADD t3, t3, t4 ADD t5, t5, t6 ADD t7, t7, t8 CMPLT $fcc0, t1, s1 CMPLT $fcc1, t3, s2 CMPLT $fcc2, t5, s3 CMPLT $fcc3, t7, s4 CMOVT s1, t1, $fcc0 movt x1, TEMP, $fcc0 CMOVT s2, t3, $fcc1 movt x2, TEMP, $fcc1 CMOVT s3, t5, $fcc2 movt x3, TEMP, $fcc2 CMOVT s4, t7, $fcc3 movt x4, TEMP, $fcc3 daddiu TEMP, TEMP, 4 daddiu x2, x2, 1 daddiu x3, x3, 2 daddiu x4, x4, 3 .align 3 .L15: andi I, N, 3 blez I, .L998 NOP .align 3 .L16: LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) daddu X, X, INCX FABS t1, a1 FABS t2, a2 ADD t1, t1, t2 daddiu I, I, -1 CMPLT $fcc0, t1, s1 NOP CMOVT s1, t1, $fcc0 movt x1, TEMP, $fcc0 bgtz I, .L16 daddiu TEMP, TEMP, 1 .align 3 .L998: CMPLT $fcc0, s2, s1 CMPLT $fcc1, s4, s3 CMOVT s1, s2, $fcc0 movt x1, x2, $fcc0 CMOVT s3, s4, $fcc1 movt x3, x4, $fcc1 CMPLT $fcc0, s3, s1 CMOVT s1, s3, $fcc0 movt x1, x3, $fcc0 .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/max.S000066400000000000000000000116561313527062700164700ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define I $2 #define TEMP $3 #define a1 $f4 #define a2 $f5 #define a3 $f6 #define a4 $f7 #define a5 $f8 #define a6 $f9 #define a7 $f10 #define a8 $f11 #define s1 $f0 #define s2 $f1 #define s3 $f2 #define s4 $f3 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif blez N, .L999 MTC $0, s1 blez INCX, .L999 dsll INCX, INCX, BASE_SHIFT LD s1, 0 * SIZE(X) daddiu N, N, -1 daddu X, X, INCX NOP blez N, .L999 MOV s2, s1 MOV s3, s1 dsra I, N, 3 blez I, .L15 MOV s4, s1 LD a1, 0 * SIZE(X) daddu X, X, INCX LD a2, 0 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) daddu X, X, INCX LD a4, 0 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) daddu X, X, INCX LD a6, 0 * SIZE(X) daddiu I, I, -1 blez I, .L13 daddu X, X, INCX .align 3 .L12: CMPLT $fcc0, s1, a1 LD a7, 0 * SIZE(X) CMPLT $fcc1, s2, a2 daddu X, X, INCX CMPLT $fcc2, s3, a3 LD a8, 0 * SIZE(X) CMPLT $fcc3, s4, a4 daddu X, X, INCX CMOVT s1, a1, $fcc0 LD a1, 0 * SIZE(X) CMOVT s2, a2, $fcc1 daddu X, X, INCX CMOVT s3, a3, $fcc2 LD a2, 0 * SIZE(X) CMOVT s4, a4, $fcc3 daddu X, X, INCX CMPLT $fcc0, s1, a5 LD a3, 0 * SIZE(X) CMPLT $fcc1, s2, a6 daddu X, X, INCX CMPLT $fcc2, s3, a7 LD a4, 0 * SIZE(X) CMPLT $fcc3, s4, a8 daddu X, X, INCX CMOVT s1, a5, $fcc0 LD a5, 0 * SIZE(X) CMOVT s2, a6, $fcc1 daddu X, X, INCX CMOVT s3, a7, $fcc2 LD a6, 0 * SIZE(X) CMOVT s4, a8, $fcc3 daddiu I, I, -1 bgtz I, .L12 daddu X, X, INCX .align 3 .L13: CMPLT $fcc0, s1, a1 LD a7, 0 * SIZE(X) CMPLT $fcc1, s2, a2 daddu X, X, INCX CMPLT $fcc2, s3, a3 LD a8, 0 * SIZE(X) CMPLT $fcc3, s4, a4 daddu X, X, INCX CMOVT s1, a1, $fcc0 CMOVT s2, a2, $fcc1 CMOVT s3, a3, $fcc2 CMOVT s4, a4, $fcc3 CMPLT $fcc0, s1, a5 CMPLT $fcc1, s2, a6 CMPLT $fcc2, s3, a7 CMPLT $fcc3, s4, a8 CMOVT s1, a5, $fcc0 CMOVT s2, a6, $fcc1 CMOVT s3, a7, $fcc2 CMOVT s4, a8, $fcc3 .align 3 .L15: andi I, N, 7 blez I, .L998 NOP .align 3 .L16: LD a1, 0 * SIZE(X) daddiu I, I, -1 CMPLT $fcc0, s1, a1 CMOVT s1, a1, $fcc0 bgtz I, .L16 daddu X, X, INCX .align 3 .L998: CMPLT $fcc0, s1, s2 CMPLT $fcc1, s3, s4 CMOVT s1, s2, $fcc0 CMOVT s3, s4, $fcc1 CMPLT $fcc0, s1, s3 
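/* Final merge of the four partial maxima: s1/s2 and s3/s4 were reduced
   above, and this last compare/select leaves the overall maximum in s1,
   the return value. */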
CMOVT s1, s3, $fcc0 .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/min.S000066400000000000000000000116561313527062700164660ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define I $2 #define TEMP $3 #define a1 $f4 #define a2 $f5 #define a3 $f6 #define a4 $f7 #define a5 $f8 #define a6 $f9 #define a7 $f10 #define a8 $f11 #define s1 $f0 #define s2 $f1 #define s3 $f2 #define s4 $f3 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif blez N, .L999 MTC $0, s1 blez INCX, .L999 dsll INCX, INCX, BASE_SHIFT LD s1, 0 * SIZE(X) daddiu N, N, -1 daddu X, X, INCX NOP blez N, .L999 MOV s2, s1 MOV s3, s1 dsra I, N, 3 blez I, .L15 MOV s4, s1 LD a1, 0 * SIZE(X) daddu X, X, INCX LD a2, 0 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) daddu X, X, INCX LD a4, 0 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) daddu X, X, INCX LD a6, 0 * SIZE(X) daddiu I, I, -1 blez I, .L13 daddu X, X, INCX .align 3 .L12: CMPLT $fcc0, a1, s1 LD a7, 0 * SIZE(X) CMPLT $fcc1, a2, s2 daddu X, X, INCX CMPLT $fcc2, a3, s3 LD a8, 0 * SIZE(X) CMPLT $fcc3, a4, s4 daddu X, X, INCX CMOVT s1, a1, $fcc0 LD a1, 0 * SIZE(X) CMOVT s2, a2, $fcc1 daddu X, X, INCX CMOVT s3, a3, $fcc2 LD a2, 0 * SIZE(X) CMOVT s4, a4, $fcc3 daddu X, X, INCX CMPLT $fcc0, a5, s1 LD a3, 0 * SIZE(X) CMPLT $fcc1, a6, s2 daddu X, X, INCX CMPLT $fcc2, a7, s3 LD a4, 0 * SIZE(X) CMPLT $fcc3, a8, s4 daddu X, X, INCX CMOVT s1, a5, $fcc0 LD a5, 0 * SIZE(X) CMOVT s2, a6, $fcc1 daddu X, X, INCX CMOVT s3, a7, $fcc2 LD a6, 0 * SIZE(X) CMOVT s4, a8, $fcc3 daddiu I, I, -1 bgtz I, .L12 daddu X, X, INCX .align 3 .L13: CMPLT $fcc0, a1, s1 LD a7, 0 * SIZE(X) CMPLT $fcc1, a2, s2 daddu X, X, INCX CMPLT $fcc2, a3, s3 LD a8, 0 * SIZE(X) CMPLT $fcc3, a4, s4 daddu X, X, INCX CMOVT s1, a1, $fcc0 CMOVT s2, a2, $fcc1 CMOVT s3, a3, $fcc2 CMOVT s4, a4, $fcc3 CMPLT $fcc0, a5, s1 CMPLT $fcc1, a6, s2 CMPLT $fcc2, a7, s3 CMPLT $fcc3, a8, s4 CMOVT s1, a5, $fcc0 CMOVT s2, a6, $fcc1 CMOVT s3, a7, $fcc2 CMOVT s4, a8, $fcc3 .align 3 .L15: andi I, N, 7 blez I, .L998 NOP .align 3 .L16: LD a1, 0 * SIZE(X) daddiu I, I, -1 CMPLT $fcc0, a1, s1 CMOVT s1, a1, $fcc0 bgtz I, .L16 daddu X, X, INCX .align 3 .L998: CMPLT $fcc0, s2, s1 CMPLT $fcc1, s4, s3 CMOVT s1, s2, $fcc0 CMOVT s3, s4, $fcc1 CMPLT $fcc0, s3, s1 CMOVT s1, s3, $fcc0 .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/rot.S000066400000000000000000000162341313527062700165040ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define Y $7 #define INCY $8 #define XX $9 #define YY $10 #define C $f17 #define S $f18 #define I $2 #define TEMP $3 #define a1 $f4 #define a2 $f5 #define a3 $f6 #define a4 $f7 #define b1 $f8 #define b2 $f9 #define b3 $f10 #define b4 $f11 #define t1 $f0 #define t2 $f1 #define t3 $f2 #define t4 $f3 PROLOGUE dsll INCX, INCX, BASE_SHIFT li TEMP, SIZE blez N, .L999 dsll INCY, INCY, BASE_SHIFT bne INCX, TEMP, .L20 dsra I, N, 2 bne INCY, TEMP, .L20 NOP blez I, .L15 daddiu I, I, -1 LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) LD a2, 1 * SIZE(X) LD b2, 1 * SIZE(Y) LD a3, 2 * SIZE(X) LD b3, 2 * SIZE(Y) MUL t1, S, b1 LD a4, 3 * SIZE(X) MUL t2, C, b1 LD b4, 3 * SIZE(Y) MUL t3, S, b2 blez I, .L13 MUL t4, C, b2 .align 3 .L12: MADD t1, t1, C, a1 LD b1, 4 * SIZE(Y) NMSUB t2, t2, S, a1 LD a1, 4 * SIZE(X) MADD t3, t3, C, a2 LD b2, 5 * SIZE(Y) NMSUB t4, t4, S, a2 LD a2, 5 * SIZE(X) ST t1, 0 * SIZE(X) MUL t1, S, b3 ST t2, 0 * SIZE(Y) MUL t2, C, b3 ST t3, 1 * SIZE(X) MUL t3, S, b4 ST t4, 1 * SIZE(Y) MUL t4, C, b4 MADD t1, t1, C, a3 LD b3, 6 * SIZE(Y) NMSUB t2, t2, S, a3 LD a3, 6 * SIZE(X) MADD t3, t3, C, a4 LD b4, 7 * SIZE(Y) NMSUB t4, t4, S, a4 LD a4, 7 * SIZE(X) ST t1, 2 * SIZE(X) MUL t1, S, b1 ST t2, 2 * SIZE(Y) MUL t2, C, b1 ST t3, 3 * SIZE(X) MUL t3, S, b2 ST t4, 3 * SIZE(Y) MUL t4, C, b2 daddiu I, I, -1 daddiu X, X, 4 * SIZE bgtz I, .L12 daddiu Y, Y, 4 * SIZE .align 3 .L13: MADD t1, t1, C, a1 NMSUB t2, t2, S, a1 MADD t3, t3, C, a2 NMSUB t4, t4, S, a2 ST t1, 0 * SIZE(X) MUL t1, S, b3 ST t2, 0 * SIZE(Y) MUL t2, C, b3 ST t3, 1 * SIZE(X) MUL t3, S, b4 ST t4, 1 * SIZE(Y) MUL t4, C, b4 MADD t1, t1, C, a3 NMSUB t2, t2, S, a3 MADD t3, t3, C, a4 daddiu X, X, 4 * SIZE NMSUB t4, t4, S, a4 daddiu Y, Y, 4 * SIZE ST t1, -2 * SIZE(X) ST t2, -2 * SIZE(Y) ST t3, -1 * SIZE(X) ST t4, -1 * SIZE(Y) .align 3 .L15: andi I, N, 3 blez I, .L999 NOP .align 3 .L16: LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) MUL t1, S, b1 MUL t2, C, b1 MADD t1, t1, C, a1 NMSUB t2, t2, S, a1 ST t1, 0 * SIZE(X) ST t2, 0 * SIZE(Y) daddiu I, I, -1 daddiu X, X, SIZE daddiu Y, Y, SIZE bgtz I, .L16 NOP j .L999 NOP .align 3 .L20: move XX, X move YY, Y blez I, .L25 daddiu I, I, -1 LD a1, 0 * SIZE(X) dadd X, X, INCX LD b1, 0 * SIZE(Y) dadd Y, Y, INCY LD a2, 0 * SIZE(X) dadd X, X, INCX LD b2, 0 * SIZE(Y) dadd Y, Y, INCY LD a3, 0 * SIZE(X) dadd X, X, INCX LD b3, 0 * SIZE(Y) dadd Y, Y, INCY MUL t1, S, b1 LD a4, 0 * SIZE(X) dadd X, X, INCX MUL t2, C, b1 LD b4, 0 * SIZE(Y) dadd Y, Y, INCY MUL t3, S, b2 blez I, .L23 MUL t4, C, b2 .align 3 .L22: MADD t1, t1, C, a1 LD b1, 0 * SIZE(Y) dadd Y, Y, INCY 
NMSUB t2, t2, S, a1 LD a1, 0 * SIZE(X) dadd X, X, INCX MADD t3, t3, C, a2 LD b2, 0 * SIZE(Y) dadd Y, Y, INCY NMSUB t4, t4, S, a2 LD a2, 0 * SIZE(X) dadd X, X, INCX ST t1, 0 * SIZE(XX) dadd XX, XX, INCX MUL t1, S, b3 ST t2, 0 * SIZE(YY) dadd YY, YY, INCY MUL t2, C, b3 ST t3, 0 * SIZE(XX) dadd XX, XX, INCX MUL t3, S, b4 ST t4, 0 * SIZE(YY) dadd YY, YY, INCY MUL t4, C, b4 MADD t1, t1, C, a3 LD b3, 0 * SIZE(Y) dadd Y, Y, INCY NMSUB t2, t2, S, a3 LD a3, 0 * SIZE(X) dadd X, X, INCX MADD t3, t3, C, a4 LD b4, 0 * SIZE(Y) dadd Y, Y, INCY NMSUB t4, t4, S, a4 LD a4, 0 * SIZE(X) dadd X, X, INCX ST t1, 0 * SIZE(XX) dadd XX, XX, INCX MUL t1, S, b1 ST t2, 0 * SIZE(YY) dadd YY, YY, INCY MUL t2, C, b1 ST t3, 0 * SIZE(XX) dadd XX, XX, INCX MUL t3, S, b2 ST t4, 0 * SIZE(YY) MUL t4, C, b2 daddiu I, I, -1 bgtz I, .L22 dadd YY, YY, INCY .align 3 .L23: MADD t1, t1, C, a1 NMSUB t2, t2, S, a1 MADD t3, t3, C, a2 NMSUB t4, t4, S, a2 ST t1, 0 * SIZE(XX) dadd XX, XX, INCX MUL t1, S, b3 ST t2, 0 * SIZE(YY) dadd YY, YY, INCY MUL t2, C, b3 ST t3, 0 * SIZE(XX) dadd XX, XX, INCX MUL t3, S, b4 ST t4, 0 * SIZE(YY) dadd YY, YY, INCY MUL t4, C, b4 MADD t1, t1, C, a3 NMSUB t2, t2, S, a3 MADD t3, t3, C, a4 NMSUB t4, t4, S, a4 ST t1, 0 * SIZE(XX) dadd XX, XX, INCX ST t2, 0 * SIZE(YY) dadd YY, YY, INCY ST t3, 0 * SIZE(XX) dadd XX, XX, INCX ST t4, 0 * SIZE(YY) dadd YY, YY, INCY .align 3 .L25: andi I, N, 3 blez I, .L999 NOP .align 3 .L26: LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) MUL t1, S, b1 MUL t2, C, b1 MADD t1, t1, C, a1 daddiu I, I, -1 NMSUB t2, t2, S, a1 ST t1, 0 * SIZE(X) ST t2, 0 * SIZE(Y) dadd X, X, INCX bgtz I, .L26 dadd Y, Y, INCY .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/scal.S000066400000000000000000000166031313527062700166220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $8 #define INCX $9 #define I $2 #define TEMP $3 #define XX $5 #define ALPHA $f15 #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define a5 $f4 #define a6 $f5 #define a7 $f6 #define a8 $f7 #define t1 $f8 #define t2 $f9 #define t3 $f10 #define t4 $f11 PROLOGUE li TEMP, SIZE MTC $0, a1 blez N, .L999 dsll INCX, INCX, BASE_SHIFT CMPEQ $fcc0, ALPHA, a1 NOP bc1f $fcc0, .L50 NOP bne INCX, TEMP, .L20 dsra I, N, 3 blez I, .L15 NOP .align 3 .L12: ST a1, 0 * SIZE(X) ST a1, 1 * SIZE(X) ST a1, 2 * SIZE(X) ST a1, 3 * SIZE(X) ST a1, 4 * SIZE(X) ST a1, 5 * SIZE(X) ST a1, 6 * SIZE(X) ST a1, 7 * SIZE(X) addiu I, I, -1 bgtz I, .L12 daddiu X, X, 8 * SIZE .align 3 .L15: andi I, N, 7 blez I, .L999 NOP .align 3 .L16: ST a1, 0 * SIZE(X) daddiu I, I, -1 bgtz I, .L16 daddiu X, X, SIZE j $31 NOP .align 3 .L20: dsra I, N, 3 blez I, .L25 NOP .align 3 .L22: ST a1, 0 * SIZE(X) daddu X, X, INCX ST a1, 0 * SIZE(X) daddu X, X, INCX ST a1, 0 * SIZE(X) daddu X, X, INCX ST a1, 0 * SIZE(X) daddu X, X, INCX ST a1, 0 * SIZE(X) daddu X, X, INCX ST a1, 0 * SIZE(X) daddu X, X, INCX ST a1, 0 * SIZE(X) daddu X, X, INCX ST a1, 0 * SIZE(X) daddiu I, I, -1 bgtz I, .L22 daddu X, X, INCX .align 3 .L25: andi I, N, 7 blez I, .L999 NOP .align 3 .L26: daddiu I, I, -1 ST a1, 0 * SIZE(X) bgtz I, .L26 daddu X, X, INCX j $31 NOP .align 3 .L50: bne INCX, TEMP, .L60 dsra I, N, 3 blez I, .L55 daddiu I, I, -1 LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) LD a3, 2 * SIZE(X) LD a4, 3 * SIZE(X) LD a5, 4 * SIZE(X) LD a6, 5 * SIZE(X) LD a7, 6 * SIZE(X) LD a8, 7 * SIZE(X) blez I, .L53 NOP .align 3 .L52: MUL t1, ALPHA, a1 LD a1, 8 * SIZE(X) MUL t2, ALPHA, a2 LD a2, 9 * SIZE(X) MUL t3, ALPHA, a3 LD a3, 10 * SIZE(X) MUL t4, ALPHA, a4 LD a4, 11 * SIZE(X) ST t1, 0 * SIZE(X) MUL t1, ALPHA, a5 LD a5, 12 * SIZE(X) ST t2, 1 * SIZE(X) MUL t2, ALPHA, a6 LD a6, 13 * SIZE(X) ST t3, 2 * SIZE(X) MUL t3, ALPHA, a7 LD a7, 14 * SIZE(X) ST t4, 3 * SIZE(X) MUL t4, ALPHA, a8 LD a8, 15 * SIZE(X) daddiu I, I, -1 ST t1, 4 * SIZE(X) ST t2, 5 * SIZE(X) ST t3, 6 * SIZE(X) ST t4, 7 * SIZE(X) bgtz I, .L52 daddiu X, X, 8 * SIZE .align 3 .L53: MUL t1, ALPHA, a1 MUL t2, ALPHA, a2 MUL t3, ALPHA, a3 MUL t4, ALPHA, a4 ST t1, 0 * SIZE(X) MUL t1, ALPHA, a5 ST t2, 1 * SIZE(X) MUL t2, ALPHA, a6 ST t3, 2 * SIZE(X) MUL t3, ALPHA, a7 ST t4, 3 * SIZE(X) MUL t4, ALPHA, a8 ST t1, 4 * SIZE(X) ST t2, 5 * SIZE(X) ST t3, 6 * SIZE(X) ST t4, 7 * SIZE(X) daddiu X, X, 8 * SIZE .align 3 .L55: andi I, N, 7 blez I, .L999 NOP .align 3 .L56: LD a1, 0 * SIZE(X) MUL t1, ALPHA, a1 daddiu X, X, SIZE daddiu I, I, -1 bgtz I, .L56 ST t1, -1 * SIZE(X) j $31 NOP .align 3 .L60: dsra I, N, 3 move XX, X blez I, .L65 daddiu I, I, -1 LD a1, 0 * SIZE(X) daddu X, X, INCX LD a2, 0 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) daddu X, X, INCX LD a4, 0 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) daddu X, X, INCX LD a6, 0 * SIZE(X) daddu X, X, INCX LD a7, 0 * SIZE(X) daddu X, X, INCX LD a8, 0 * SIZE(X) daddu X, X, INCX blez I, .L63 NOP .align 3 .L62: MUL t1, ALPHA, a1 LD a1, 0 * SIZE(X) daddu X, X, INCX MUL t2, ALPHA, a2 LD a2, 0 * SIZE(X) daddu X, X, INCX MUL t3, ALPHA, a3 LD a3, 0 * SIZE(X) daddu X, X, INCX MUL t4, ALPHA, a4 LD a4, 0 * 
SIZE(X) daddu X, X, INCX ST t1, 0 * SIZE(XX) daddu XX, XX, INCX ST t2, 0 * SIZE(XX) daddu XX, XX, INCX ST t3, 0 * SIZE(XX) daddu XX, XX, INCX ST t4, 0 * SIZE(XX) daddu XX, XX, INCX MUL t1, ALPHA, a5 LD a5, 0 * SIZE(X) daddu X, X, INCX MUL t2, ALPHA, a6 LD a6, 0 * SIZE(X) daddu X, X, INCX MUL t3, ALPHA, a7 LD a7, 0 * SIZE(X) daddu X, X, INCX MUL t4, ALPHA, a8 LD a8, 0 * SIZE(X) daddu X, X, INCX ST t1, 0 * SIZE(XX) daddu XX, XX, INCX ST t2, 0 * SIZE(XX) daddu XX, XX, INCX ST t3, 0 * SIZE(XX) daddu XX, XX, INCX ST t4, 0 * SIZE(XX) daddiu I, I, -1 bgtz I, .L62 daddu XX, XX, INCX .align 3 .L63: MUL t1, ALPHA, a1 MUL t2, ALPHA, a2 MUL t3, ALPHA, a3 MUL t4, ALPHA, a4 ST t1, 0 * SIZE(XX) daddu XX, XX, INCX ST t2, 0 * SIZE(XX) daddu XX, XX, INCX ST t3, 0 * SIZE(XX) daddu XX, XX, INCX ST t4, 0 * SIZE(XX) daddu XX, XX, INCX MUL t1, ALPHA, a5 MUL t2, ALPHA, a6 MUL t3, ALPHA, a7 MUL t4, ALPHA, a8 ST t1, 0 * SIZE(XX) daddu XX, XX, INCX ST t2, 0 * SIZE(XX) daddu XX, XX, INCX ST t3, 0 * SIZE(XX) daddu XX, XX, INCX ST t4, 0 * SIZE(XX) daddu XX, XX, INCX .align 3 .L65: andi I, N, 7 blez I, .L999 NOP .align 3 .L66: LD a1, 0 * SIZE(X) MUL t1, ALPHA, a1 daddiu I, I, -1 ST t1, 0 * SIZE(X) bgtz I, .L66 daddu X, X, INCX .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/sgemm_kernel_8x4_ps.S000066400000000000000000004131501313527062700215530ustar00rootroot00000000000000#define REALNAME ASMNAME #define ASSEMBLER #include "common.h" #define FETCH ld #define STACKSIZE 160 #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) ##### Parameter registers #### #define M $4 #define N $5 #define K $6 #define A $8 #define B $9 #define C $10 #define LDC $11 #### Pointer A, B, C #### #define AO $12 #define BO $13 #define CO1 $14 #define CO2 $15 #define CO3 $16 #define CO4 $17 #define PREA $18 #define PREB $19 #### Used registers #### #define A1 $f0 #define A2 $f1 #define A3 $f2 #define A4 $f3 #define A5 $f4 #define A6 $f5 #define A7 $f6 #define A8 $f7 #define B1 $f8 #define B2 $f9 #define B3 $f10 #define B4 $f11 #define B5 $f12 #define B6 $f13 #define B7 $f14 #define B8 $f15 #define C11 $f16 #define C12 $f17 #define C21 $f18 #define C22 $f19 #define C31 $f20 #define C32 $f21 #define C41 $f22 #define C42 $f23 #define C13 $f24 #define C14 $f25 #define C23 $f26 #define C24 $f27 #define C33 $f28 #define C34 $f29 #define C43 $f30 #define C44 $f31 #define I $2 #define J $3 #define L $7 #### Alpha register #### #define ALPHA $f15 #define F31 31 #define F30 30 #define F29 29 #define F28 28 #define F27 27 #define F26 26 #define F25 25 #define F24 24 #define F23 23 #define F22 22 #define F21 21 #define F20 20 #define F19 19 #define F18 18 #define F17 17 #define F16 16 #define F15 15 #define F14 14 #define F13 13 #define F12 12 #define F11 11 #define F10 10 #define F9 9 #define F8 8 #define F7 7 #define F6 6 #define F5 5 #define F4 4 #define F3 3 #define F2 2 #define F1 1 #define F0 0 #define R12 12 #define R13 13 #define R14 14 #define R15 15 #define R16 16 #define R17 17 #if defined(TRMMKERNEL) #define OFFSET $23 #define KK $24 #define TEMP $25 #endif # .text # .align 2 ## .globl gemm # .set nomips16 # .ent gemm # .type gemm, @function #gemm: # .frame $sp,STACKSIZE,$31 # vars= 48, regs= 1/0, args= 0, gp= 0 # .mask 0x40000000,-8 # .fmask 0x00000000,0 # .set noreorder # .set nomacro PROLOGUE daddiu $sp,$sp,-STACKSIZE sd $16, 0($sp) sd $17, 8($sp) sd $18, 16($sp) sd $19, 24($sp) sd $20, 
32($sp) sd $21, 40($sp) sd $22, 48($sp) ST $f24, 56($sp) ST $f25, 64($sp) ST $f26, 72($sp) ST $f27, 80($sp) ST $f28, 88($sp) #if defined(TRMMKERNEL) sd $23, 96($sp) sd $24, 104($sp) sd $25, 112($sp) LDARG OFFSET, 160($sp) #endif #ifndef __64BIT__ ST $f20,120($sp) ST $f21,128($sp) ST $f22,136($sp) ST $f23,144($sp) #endif .align 4 .L4: dsra J, N, 2 # NR=4 dsll LDC, LDC, BASE_SHIFT# LDC*SIZE #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif blez J, .L2 ST ALPHA, 152($sp) .L48: dsra I, M, 3 # MR=8 dsll PREA, K, BASE_SHIFT move AO, A # Reset A move CO1, C daddu CO2, C, LDC daddu CO3, CO2, LDC daddu CO4, CO3, LDC daddu PREA, A, PREA #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif blez I, .L44 daddu C, CO4, LDC .align 4 .L481: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) ||\ (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 3 + BASE_SHIFT # kk*8mr*datasize dsll TEMP, KK, 2 + BASE_SHIFT daddu AO, AO, L # AO point to the data addr daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 dsll PREB, K, BASE_SHIFT MOV C21, C11 MOV C22, C11 MOV C31, C11 MOV C32, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 MOV C41, C11 MOV C42, C11 gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C13, C11 MOV C14, C11 gsLQC1(R12, F3, F2, 1) # A3 A4 MOV C23, C11 FETCH $0, 0 * SIZE(CO1) MOV C24, C11 FETCH $0, 4 * SIZE(CO1) MOV C33, C11 FETCH $0, 0 * SIZE(CO2) MOV C34, C11 FETCH $0, 4 * SIZE(CO2) daddu PREB, B, PREB MOV C43, C11 FETCH $0, 0 * SIZE(CO3) MOV C44, C11 FETCH $0, 4 * SIZE(CO3) PLU B3, B1, B1 FETCH $0, 0 * SIZE(CO4) PLU B4, B2, B2 FETCH $0, 4 * SIZE(CO4) #if (defined(LEFT) && !defined(TRANSA)) ||\ (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK # TEMP is the length of the data part #elif defined(LEFT) daddiu TEMP, KK, 8 #else daddiu TEMP, KK, 4 #endif dsra L, TEMP, 6 blez L, .L482 NOP #else # GEMM PART move BO, B # Reset B dsra L, K, 6 # UnRoll K=64 MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 dsll PREB, K, BASE_SHIFT MOV C21, C11 MOV C22, C11 MOV C31, C11 MOV C32, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 MOV C41, C11 MOV C42, C11 gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C13, C11 MOV C14, C11 gsLQC1(R12, F3, F2, 1) # A3 A4 MOV C23, C11 FETCH $0, 0 * SIZE(CO1) MOV C24, C11 FETCH $0, 4 * SIZE(CO1) MOV C33, C11 FETCH $0, 0 * SIZE(CO2) MOV C34, C11 FETCH $0, 4 * SIZE(CO2) daddu PREB, B, PREB MOV C43, C11 FETCH $0, 0 * SIZE(CO3) MOV C44, C11 FETCH $0, 4 * SIZE(CO3) PLU B3, B1, B1 FETCH $0, 0 * SIZE(CO4) PLU B4, B2, B2 blez L, .L482 FETCH $0, 4 * SIZE(CO4) #endif .L4810: daddiu L, L, -1 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU 
B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, 
PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, 
A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS 
C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, 
F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS 
C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, 
A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, 
F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, 
B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, 
C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, 
B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 bgtz L, .L4810 MADPS C44, C44, A8, B8 .align 4 .L482: #ifndef TRMMKERNEL andi L, K, 32 #else andi L, TEMP, 32 #endif blez L, .L483 NOP MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, 
AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 
FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * 
SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 
MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, 
A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 .align 4 .L483: #ifndef TRMMKERNEL andi L, K, 16 #else andi L, TEMP, 16 #endif 
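# .L483 handles the K & 16 remainder: when the bit is set, the same 8x4 MADPS/PLU body used in the
# main unrolled loop is repeated (without the loop-back branch) until 16 iterations of K are
# consumed, advancing AO/BO and the PREA/PREB prefetch pointers exactly as the main loop does.
# The labels that follow (.L484, .L485, .L486, .L487) peel off K & 8, 4, 2 and 1 in the same way
# before falling through to the .L480 write-back, which scales by ALPHA and stores the C tile.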
blez L, .L484 NOP MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 
MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 
PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 .align 4 .L484: #ifndef TRMMKERNEL andi L, K, 8 #else andi L, TEMP, 8 #endif blez L, .L485 NOP MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 
MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, 
C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 .align 4 .L485: #ifndef TRMMKERNEL andi L, K, 4 #else andi L, TEMP, 4 #endif blez L, .L486 NOP MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 4) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 5) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 6) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 7) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 8 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 32 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 16 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 20 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 12 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 24 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 28 * SIZE(PREA) daddiu PREA, PREA, 32 * SIZE MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 .align 4 .L486: #ifndef TRMMKERNEL andi L, K, 2 #else andi L, TEMP, 2 #endif blez L, .L487 NOP MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 FETCH $0, 0 * SIZE(PREB) MADPS C13, C13, A1, B3 daddiu BO, BO, 8 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 16 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 PLU B7, B5, B5 FETCH $0, 0 * SIZE(PREA) MADPS C24, C24, A2, B4 PLU B8, B6, B6 FETCH $0, 4 * SIZE(PREA) MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 MADPS C11, 
C11, A5, B5 MADPS C21, C21, A6, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C12, C12, A5, B6 MADPS C22, C22, A6, B6 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C31, C31, A7, B5 MADPS C41, C41, A8, B5 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C32, C32, A7, B6 MADPS C42, C42, A8, B6 FETCH $0, 4 * SIZE(PREB) MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 daddiu PREB, PREB, 8 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 MADPS C14, C14, A5, B8 PLU B3, B1, B1 FETCH $0, 8 * SIZE(PREA) MADPS C24, C24, A6, B8 PLU B4, B2, B2 FETCH $0, 12 * SIZE(PREA) MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 daddiu PREA, PREA, 16 * SIZE .align 4 .L487: #ifndef TRMMKERNEL andi L, K, 1 #else andi L, TEMP, 1 #endif blez L, .L480 LD ALPHA, 152($sp) MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 MADPS C12, C12, A1, B2 MADPS C22, C22, A2, B2 MADPS C31, C31, A3, B1 MADPS C41, C41, A4, B1 MADPS C32, C32, A3, B2 MADPS C42, C42, A4, B2 MADPS C13, C13, A1, B3 daddiu BO, BO, 4 * SIZE # 4KR*4NR MADPS C23, C23, A2, B3 daddiu AO, AO, 8 * SIZE # 4KR*8MR MADPS C33, C33, A3, B3 MADPS C43, C43, A4, B3 MADPS C14, C14, A1, B4 MADPS C24, C24, A2, B4 MADPS C34, C34, A3, B4 MADPS C44, C44, A4, B4 .align 4 .L480: # Write Back #ifndef TRMMKERNEL daddiu I, I, -1 CVTU A1, C13 # A1=C13.upper=c12 CVTU A2, C11 # A2=C11.upper=c22 CVTU A3, C23 # A3=C23.upper=c14 LD B1, 1 * SIZE(CO1) CVTU A4, C21 # A4=C21.upper=c24 LD B2, 1 * SIZE(CO2) CVTU A5, C33 # A5=C33.upper=c16 LD B3, 3 * SIZE(CO1) CVTU A6, C31 # A6=C31.upper=c26 LD B4, 3 * SIZE(CO2) CVTU A7, C43 # A7=C43.upper=c18 LD B5, 5 * SIZE(CO1) CVTU A8, C41 # A8=C41.upper=c28 LD B6, 5 * SIZE(CO2) MADD A1, B1, A1, ALPHA # c12 LD B7, 7 * SIZE(CO1) MADD A2, B2, A2, ALPHA # c22 LD B1, 7 * SIZE(CO2) MADD A3, B3, A3, ALPHA # c14 LD B2, 0 * SIZE(CO1) MADD A4, B4, A4, ALPHA # c24 LD B3, 0 * SIZE(CO2) MADD A5, B5, A5, ALPHA # c16 LD B4, 2 * SIZE(CO1) MADD A6, B6, A6, ALPHA # c26 LD B5, 2 * SIZE(CO2) MADD A7, B7, A7, ALPHA # c18 LD B6, 4 * SIZE(CO1) MADD A8, B1, A8, ALPHA # c28 ST A1, 1 * SIZE(CO1) MADD C11, B2, C11, ALPHA # c12 LD B7, 4 * SIZE(CO2) MADD C13, B3, C13, ALPHA # c22 ST A2, 1 * SIZE(CO2) MADD C21, B4, C21, ALPHA # c14 LD A1, 6 * SIZE(CO1) MADD C23, B5, C23, ALPHA # c24 ST A3, 3 * SIZE(CO1) MADD C31, B6, C31, ALPHA # c16 LD A2, 6 * SIZE(CO2) MADD C33, B7, C33, ALPHA # c26 ST A4, 3 * SIZE(CO2) ST A5, 5 * SIZE(CO1) ST A6, 5 * SIZE(CO2) ST A7, 7 * SIZE(CO1) ST A8, 7 * SIZE(CO2) MADD C41, A1, C41, ALPHA # c18 ST C11, 0 * SIZE(CO1) MADD C43, A2, C43, ALPHA # c28 ST C13, 0 * SIZE(CO2) ST C21, 2 * SIZE(CO1) ST C23, 2 * SIZE(CO2) ST C31, 4 * SIZE(CO1) ST C33, 4 * SIZE(CO2) ST C41, 6 * SIZE(CO1) CVTU A1, C14 # B1=C12.upper=c42 ST C43, 6 * SIZE(CO2) CVTU A2, C12 # B2=C14.upper=c32 LD B1, 1 * SIZE(CO3) CVTU A3, C24 # B3=C22.upper=c44 LD B2, 1 * SIZE(CO4) CVTU A4, C22 # B4=C24.upper=c34 LD B3, 3 * SIZE(CO3) CVTU A5, C34 # B5=C32.upper=c46 LD B4, 3 * SIZE(CO4) CVTU A6, C32 # B6=C24.upper=c36 LD B5, 5 * SIZE(CO3) CVTU A7, C44 # B7=C42.upper=c48 LD B6, 5 * SIZE(CO4) CVTU A8, C42 # A1=C44.upper=c38 LD B7, 7 * SIZE(CO3) MADD A1, B1, A1, ALPHA # c31 LD C11, 7 * SIZE(CO4) MADD A2, B2, A2, ALPHA LD C13, 0 * SIZE(CO3) MADD A3, B3, A3, ALPHA LD C21, 0 * SIZE(CO4) MADD A4, B4, A4, ALPHA LD C23, 2 * SIZE(CO3) MADD A5, B5, A5, ALPHA LD C31, 2 * SIZE(CO4) MADD A6, B6, A6, ALPHA LD C33, 4 * SIZE(CO3) MADD A7, B7, A7, ALPHA LD C41, 4 * SIZE(CO4) MADD A8, C11, A8, ALPHA ST A1, 1 * SIZE(CO3) MADD C12, C13, C12, ALPHA LD C43, 6 * SIZE(CO3) MADD C14, C21, C14, ALPHA ST A2, 1 * SIZE(CO4) MADD C22, C23, C22, ALPHA LD B1, 6 * SIZE(CO4) MADD 
C24, C31, C24, ALPHA ST A3, 3 * SIZE(CO3) MADD C32, C33, C32, ALPHA ST A4, 3 * SIZE(CO4) MADD C34, C41, C34, ALPHA ST A5, 5 * SIZE(CO3) MADD C42, C43, C42, ALPHA ST A6, 5 * SIZE(CO4) ST A7, 7 * SIZE(CO3) NOP MADD C44, B1, C44, ALPHA ST A8, 7 * SIZE(CO4) ST C12, 0 * SIZE(CO3) ST C14, 0 * SIZE(CO4) ST C22, 2 * SIZE(CO3) ST C24, 2 * SIZE(CO4) ST C32, 4 * SIZE(CO3) ST C34, 4 * SIZE(CO4) ST C42, 6 * SIZE(CO3) ST C44, 6 * SIZE(CO4) daddiu CO1, CO1, 8 * SIZE daddiu CO2, CO2, 8 * SIZE daddiu CO3, CO3, 8 * SIZE bgtz I, .L481 daddiu CO4, CO4, 8 * SIZE #else daddiu I, I, -1 CVTU A1, C13 # A1=C13.upper=c12 CVTU A2, C11 # A2=C11.upper=c22 CVTU A3, C23 # A3=C23.upper=c14 CVTU A4, C21 # A4=C21.upper=c24 CVTU A5, C33 # A5=C33.upper=c16 CVTU A6, C31 # A6=C31.upper=c26 CVTU A7, C43 # A7=C43.upper=c18 CVTU A8, C41 # A8=C41.upper=c28 MUL A1, A1, ALPHA # c12 MUL A2, A2, ALPHA # c22 MUL A3, A3, ALPHA # c14 MUL A4, A4, ALPHA # c24 MUL A5, A5, ALPHA # c16 MUL A6, A6, ALPHA # c26 MUL A7, A7, ALPHA # c18 MUL A8, A8, ALPHA # c28 MUL C11, C11, ALPHA # c12 ST A1, 1 * SIZE(CO1) MUL C13, C13, ALPHA # c22 ST A2, 1 * SIZE(CO2) MUL C21, C21, ALPHA # c14 ST A3, 3 * SIZE(CO1) MUL C23, C23, ALPHA # c24 ST A4, 3 * SIZE(CO2) MUL C31, C31, ALPHA # c16 ST A5, 5 * SIZE(CO1) MUL C33, C33, ALPHA # c26 ST A6, 5 * SIZE(CO2) MUL C41, C41, ALPHA # c18 ST A7, 7 * SIZE(CO1) MUL C43, C43, ALPHA # c28 ST A8, 7 * SIZE(CO2) CVTU A1, C14 # B1=C12.upper=c42 ST C11, 0 * SIZE(CO1) CVTU A2, C12 # B2=C14.upper=c32 ST C13, 0 * SIZE(CO2) CVTU A3, C24 # B3=C22.upper=c44 ST C21, 2 * SIZE(CO1) CVTU A4, C22 # B4=C24.upper=c34 ST C23, 2 * SIZE(CO2) CVTU A5, C34 # B5=C32.upper=c46 ST C31, 4 * SIZE(CO1) CVTU A6, C32 # B6=C24.upper=c36 ST C33, 4 * SIZE(CO2) CVTU A7, C44 # B7=C42.upper=c48 ST C41, 6 * SIZE(CO1) CVTU A8, C42 # A1=C44.upper=c38 ST C43, 6 * SIZE(CO2) MUL A1, A1, ALPHA # c31 MUL A2, A2, ALPHA MUL A3, A3, ALPHA MUL A4, A4, ALPHA MUL A5, A5, ALPHA MUL A6, A6, ALPHA MUL A7, A7, ALPHA MUL A8, A8, ALPHA MUL C12, C12, ALPHA ST A1, 1 * SIZE(CO3) MUL C14, C14, ALPHA ST A2, 1 * SIZE(CO4) MUL C22, C22, ALPHA ST A3, 3 * SIZE(CO3) MUL C24, C24, ALPHA ST A4, 3 * SIZE(CO4) MUL C32, C32, ALPHA ST A5, 5 * SIZE(CO3) MUL C34, C34, ALPHA ST A6, 5 * SIZE(CO4) MUL C42, C42, ALPHA ST A7, 7 * SIZE(CO3) MUL C44, C44, ALPHA ST A8, 7 * SIZE(CO4) ST C12, 0 * SIZE(CO3) ST C14, 0 * SIZE(CO4) ST C22, 2 * SIZE(CO3) ST C24, 2 * SIZE(CO4) ST C32, 4 * SIZE(CO3) ST C34, 4 * SIZE(CO4) ST C42, 6 * SIZE(CO3) ST C44, 6 * SIZE(CO4) daddiu CO1, CO1, 8 * SIZE daddiu CO2, CO2, 8 * SIZE daddiu CO3, CO3, 8 * SIZE daddiu CO4, CO4, 8 * SIZE #if ( defined(LEFT) && defined(TRANSA)) ||\ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -8 #else daddiu TEMP, TEMP, -4 #endif dsll L, TEMP, 3 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 8 #endif bgtz I, .L481 NOP #endif .align 4 .L44: andi I, M, 4 # MR=4 blez I, .L42 NOP .align 4 .L441: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) ||\ (!defined(LEFT) && !defined(TRANSA)) move BO, B # Reset B #else dsll L, KK, 2 + BASE_SHIFT dsll TEMP, KK, 2 + BASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 dsll PREB, K, BASE_SHIFT MOV C21, C11 MOV C22, C11 MOV C31, C11 MOV C32, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 MOV C41, C11 MOV C42, C11 gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C13, C11 MOV C14, C11 MOV C23, C11 FETCH $0, 0 * SIZE(CO1) MOV C24, C11 MOV C33, C11 FETCH $0, 0 * SIZE(CO2) MOV C34, C11 
daddu PREB, B, PREB MOV C43, C11 FETCH $0, 0 * SIZE(CO3) MOV C44, C11 PLU B3, B1, B1 FETCH $0, 0 * SIZE(CO4) PLU B4, B2, B2 #if (defined(LEFT) && !defined(TRANSA)) ||\ (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddu TEMP, KK, 4 #else daddu TEMP, KK, 4 #endif dsra L, TEMP, 2 blez L, .L442 NOP #else move BO, B # Reset B dsra L, K, 2 # UnRoll K=4 MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 dsll PREB, K, BASE_SHIFT MOV C21, C11 MOV C22, C11 MOV C31, C11 MOV C32, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 MOV C41, C11 MOV C42, C11 gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C13, C11 MOV C14, C11 MOV C23, C11 FETCH $0, 0 * SIZE(CO1) MOV C24, C11 MOV C33, C11 FETCH $0, 0 * SIZE(CO2) MOV C34, C11 daddu PREB, B, PREB MOV C43, C11 FETCH $0, 0 * SIZE(CO3) MOV C44, C11 PLU B3, B1, B1 FETCH $0, 0 * SIZE(CO4) blez L, .L442 PLU B4, B2, B2 #endif .L4410: # daddiu L, L, -1 MADPS C11, C11, A1, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C21, C21, A2, B1 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C12, C12, A1, B2 FETCH $0, 0 * SIZE(PREB) MADPS C22, C22, A2, B2 FETCH $0, 0 * SIZE(PREA) MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C14, C14, A1, B4 MADPS C24, C24, A2, B4 PLU B7, B5, B5 PLU B8, B6, B6 MADPS C11, C11, A3, B5 gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C21, C21, A4, B5 gsLQC1(R12, F5, F4, 2) # A5 A6 MADPS C12, C12, A3, B6 FETCH $0, 4 * SIZE(PREB) MADPS C22, C22, A4, B6 FETCH $0, 4 * SIZE(PREA) MADPS C13, C13, A3, B7 MADPS C23, C23, A4, B7 MADPS C14, C14, A3, B8 MADPS C24, C24, A4, B8 PLU B3, B1, B1 PLU B4, B2, B2 MADPS C11, C11, A5, B1 gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C21, C21, A6, B1 gsLQC1(R12, F7, F6, 3) # A7 A8 MADPS C12, C12, A5, B2 FETCH $0, 8 * SIZE(PREB) daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C22, C22, A6, B2 FETCH $0, 8 * SIZE(PREA) daddiu AO, AO, 16 * SIZE # 4KR*4MR MADPS C13, C13, A5, B3 MADPS C23, C23, A6, B3 MADPS C14, C14, A5, B4 MADPS C24, C24, A6, B4 PLU B7, B5, B5 PLU B8, B6, B6 MADPS C11, C11, A7, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C21, C21, A8, B5 gsLQC1(R12, F1, F0, 0) # A1 A2 MADPS C12, C12, A7, B6 FETCH $0, 12 * SIZE(PREB) MADPS C22, C22, A8, B6 FETCH $0, 12 * SIZE(PREA) MADPS C13, C13, A7, B7 daddiu PREA, PREA, 16 * SIZE MADPS C23, C23, A8, B7 daddiu PREB, PREB, 16 * SIZE MADPS C14, C14, A7, B8 MADPS C24, C24, A8, B8 PLU B3, B1, B1 bgtz L, .L4410 PLU B4, B2, B2 .align 4 .L442: #ifndef TRMMKERNEL andi L, K, 2 #else andi L, TEMP, 2 #endif blez L, .L443 NOP MADPS C11, C11, A1, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C21, C21, A2, B1 gsLQC1(R12, F3, F2, 1) # A3 A4 MADPS C12, C12, A1, B2 FETCH $0, 0 * SIZE(PREB) daddiu BO, BO, 8 * SIZE # 2KR*4NR MADPS C22, C22, A2, B2 FETCH $0, 0 * SIZE(PREA) daddiu AO, AO, 8 * SIZE # 2KR*4MR MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C14, C14, A1, B4 MADPS C24, C24, A2, B4 PLU B7, B5, B5 PLU B8, B6, B6 MADPS C11, C11, A3, B5 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C21, C21, A4, B5 gsLQC1(R12, F1, F0, 0) # A5 A6 MADPS C12, C12, A3, B6 FETCH $0, 4 * SIZE(PREB) MADPS C22, C22, A4, B6 FETCH $0, 4 * SIZE(PREA) MADPS C13, C13, A3, B7 daddiu PREB, PREB, 8 MADPS C23, C23, A4, B7 daddiu PREA, PREA, 8 MADPS C14, C14, A3, B8 MADPS C24, C24, A4, B8 PLU B3, B1, B1 PLU B4, B2, B2 .align 4 .L443: #ifndef TRMMKERNEL andi L, K, 1 #else andi L, TEMP, 1 #endif blez L, .L440 LD ALPHA, 152($sp) MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 MADPS C12, C12, A1, B2 daddiu BO, BO, 4 * SIZE # 1KR*4NR MADPS C22, C22, A2, B2 daddiu AO, AO, 4 * SIZE # 1KR*4MR MADPS C13, C13, A1, B3 MADPS C23, C23, A2, B3 MADPS C14, C14, A1, B4 MADPS C24, 
C24, A2, B4 .align 4 .L440: #ifndef TRMMKERNEL CVTU A1, C13 # A1=C13.upper=c12 LD B1, 1 * SIZE(CO1) CVTU A2, C11 # A2=C11.upper=c22 LD B2, 1 * SIZE(CO2) CVTU A3, C23 # A3=C23.upper=c14 LD B3, 3 * SIZE(CO1) CVTU A4, C21 # A4=C21.upper=c24 LD B4, 3 * SIZE(CO2) MADD A1, B1, A1, ALPHA # c12 LD B5, 0 * SIZE(CO1) MADD A2, B2, A2, ALPHA # c22 LD B6, 0 * SIZE(CO2) MADD A3, B3, A3, ALPHA # c14 LD B7, 2 * SIZE(CO1) MADD A4, B4, A4, ALPHA # c24 LD B1, 2 * SIZE(CO2) MADD C11, B5, C11, ALPHA # c12 ST A1, 1 * SIZE(CO1) MADD C13, B6, C13, ALPHA # c22 ST A2, 1 * SIZE(CO2) MADD C21, B7, C21, ALPHA # c14 ST A3, 3 * SIZE(CO1) MADD C23, B1, C23, ALPHA # c24 ST A4, 3 * SIZE(CO2) ST C11, 0 * SIZE(CO1) ST C13, 0 * SIZE(CO2) ST C21, 2 * SIZE(CO1) ST C23, 2 * SIZE(CO2) CVTU A1, C14 # B1=C12.upper=c42 LD B1, 1 * SIZE(CO3) CVTU A2, C12 # B2=C14.upper=c32 LD B2, 1 * SIZE(CO4) CVTU A3, C24 # B3=C22.upper=c44 LD B3, 3 * SIZE(CO3) CVTU A4, C22 # B4=C24.upper=c34 LD B4, 3 * SIZE(CO4) MADD A1, B1, A1, ALPHA # c31 LD A5, 0 * SIZE(CO3) MADD A2, B2, A2, ALPHA LD A6, 0 * SIZE(CO4) MADD A3, B3, A3, ALPHA LD A7, 2 * SIZE(CO3) MADD A4, B4, A4, ALPHA LD A8, 2 * SIZE(CO4) MADD C12, A5, C12, ALPHA ST A1, 1 * SIZE(CO3) MADD C14, A6, C14, ALPHA ST A2, 1 * SIZE(CO4) MADD C22, A7, C22, ALPHA ST A3, 3 * SIZE(CO3) MADD C24, A8, C24, ALPHA ST A4, 3 * SIZE(CO4) ST C12, 0 * SIZE(CO3) ST C14, 0 * SIZE(CO4) ST C22, 2 * SIZE(CO3) ST C24, 2 * SIZE(CO4) daddiu CO1, CO1, 4 * SIZE daddiu CO2, CO2, 4 * SIZE daddiu CO3, CO3, 4 * SIZE daddiu CO4, CO4, 4 * SIZE #else CVTU A1, C13 # A1=C13.upper=c12 CVTU A2, C11 # A2=C11.upper=c22 CVTU A3, C23 # A3=C23.upper=c14 CVTU A4, C21 # A4=C21.upper=c24 MUL A1, A1, ALPHA # c12 MUL A2, A2, ALPHA # c22 MUL A3, A3, ALPHA # c14 MUL A4, A4, ALPHA # c24 MUL C11, C11, ALPHA # c12 ST A1, 1 * SIZE(CO1) MUL C13, C13, ALPHA # c22 ST A2, 1 * SIZE(CO2) MUL C21, C21, ALPHA # c14 ST A3, 3 * SIZE(CO1) MUL C23, C23, ALPHA # c24 ST A4, 3 * SIZE(CO2) CVTU A5, C14 # B1=C12.upper=c42 ST C11, 0 * SIZE(CO1) CVTU A6, C12 # B2=C14.upper=c32 ST C13, 0 * SIZE(CO2) CVTU A7, C24 # B3=C22.upper=c44 ST C21, 2 * SIZE(CO1) CVTU A8, C22 # B4=C24.upper=c34 ST C23, 2 * SIZE(CO2) MUL A5, A5, ALPHA # c31 MUL A6, A6, ALPHA MUL A7, A7, ALPHA MUL A8, A8, ALPHA MUL C12, C12, ALPHA ST A5, 1 * SIZE(CO3) MUL C14, C14, ALPHA ST A6, 1 * SIZE(CO4) MUL C22, C22, ALPHA ST A7, 3 * SIZE(CO3) MUL C24, C24, ALPHA ST A8, 3 * SIZE(CO4) ST C12, 0 * SIZE(CO3) ST C14, 0 * SIZE(CO4) ST C22, 2 * SIZE(CO3) ST C24, 2 * SIZE(CO4) daddiu CO1, CO1, 4 * SIZE daddiu CO2, CO2, 4 * SIZE daddiu CO3, CO3, 4 * SIZE daddiu CO4, CO4, 4 * SIZE #if ( defined(LEFT) && defined(TRANSA))||\ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -4 #else daddiu TEMP, TEMP, -4 #endif dsll L, TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 4 #endif #endif .align 4 .L42: andi I, M, 2 blez I, .L41 NOP .align 4 .L421: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) ||\ (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 2 + BASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 MOV C21, C11 MOV C22, C11 MOV C31, C11 MOV C32, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 MOV C41, C11 MOV C42, C11 gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C13, C11 MOV C14, C11 MOV C23, C11 FETCH $0, 0 * SIZE(CO1) MOV C24, C11 MOV C33, C11 FETCH $0, 0 * SIZE(CO2) MOV C34, C11 MOV C43, C11 FETCH $0, 0 * SIZE(CO3) MOV C44, C11 
PLU B3, B1, B1 FETCH $0, 0 * SIZE(CO4) PLU B4, B2, B2 #if (defined(LEFT) && !defined(TRANSA)) ||\ (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 2 #else daddiu TEMP, KK, 4 #endif dsra L, TEMP, 2 blez L, .L422 NOP #else move BO, B # Reset B dsra L, K, 2 # UnRoll K=4 MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 MOV C21, C11 MOV C22, C11 MOV C31, C11 MOV C32, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 MOV C41, C11 MOV C42, C11 gsLQC1(R12, F1, F0, 0) # A1 A2 MOV C13, C11 MOV C14, C11 MOV C23, C11 FETCH $0, 0 * SIZE(CO1) MOV C24, C11 MOV C33, C11 FETCH $0, 0 * SIZE(CO2) MOV C34, C11 MOV C43, C11 FETCH $0, 0 * SIZE(CO3) MOV C44, C11 PLU B3, B1, B1 FETCH $0, 0 * SIZE(CO4) blez L, .L422 PLU B4, B2, B2 #endif .L4210: daddiu L, L, -1 MADPS C11, C11, A1, B1 MADPS C12, C12, A1, B2 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C13, C13, A1, B3 MADPS C14, C14, A1, B4 gsLQC1(R12, F3, F2, 1) # B1 B2 PLU B7, B5, B5 PLU B8, B6, B6 MADPS C11, C11, A2, B5 MADPS C12, C12, A2, B6 daddiu AO, AO, 8 * SIZE # 4KR*2MR gsLQC1(R13, F9, F8, 2) # B1 B2 MADPS C13, C13, A2, B7 MADPS C14, C14, A2, B8 PLU B3, B1, B1 PLU B4, B2, B2 MADPS C11, C11, A3, B1 gsLQC1(R12, F1, F0, 0) # B3 B4 MADPS C12, C12, A3, B2 gsLQC1(R13, F13, F12, 3) # B3 B4 daddiu BO, BO, 16 * SIZE # 4KR*4NR MADPS C13, C13, A3, B3 MADPS C14, C14, A3, B4 PLU B7, B5, B5 PLU B8, B6, B6 MADPS C11, C11, A4, B5 MADPS C12, C12, A4, B6 gsLQC1(R13, F9, F8, 0) # B3 B4 MADPS C13, C13, A4, B7 MADPS C14, C14, A4, B8 PLU B3, B1, B1 bgtz L, .L4210 PLU B4, B2, B2 .align 4 .L422: #ifndef TRMMKERNEL andi L, K, 2 #else andi L, TEMP, 2 #endif blez L, .L423 NOP daddiu AO, AO, 4 * SIZE # 2KR*2MR MADPS C11, C11, A1, B1 MADPS C12, C12, A1, B2 gsLQC1(R13, F13, F12, 1) # B3 B4 MADPS C13, C13, A1, B3 MADPS C14, C14, A1, B4 daddiu BO, BO, 8 * SIZE # 2KR*2MR PLU B7, B5, B5 PLU B8, B6, B6 MADPS C11, C11, A2, B5 MADPS C12, C12, A2, B6 gsLQC1(R13, F9, F8, 0) # B1 B2 MADPS C13, C13, A2, B7 MADPS C14, C14, A2, B8 gsLQC1(R12, F1, F0, 0) PLU B3, B1, B1 PLU B4, B2, B2 .L423: #ifndef TRMMKERNEL andi L, K, 1 #else andi L, TEMP, 1 #endif blez L, .L420 LD ALPHA, 152($sp) MADPS C11, C11, A1, B1 MADPS C12, C12, A1, B2 daddiu BO, BO, 4 * SIZE # 2KR*4NR daddiu AO, AO, 2 * SIZE # 2KR*4MR MADPS C13, C13, A1, B3 MADPS C14, C14, A1, B4 .align 4 .L420: #ifndef TRMMKERNEL CVTU A1, C13 # A1=C13.upper=c12 LD B1, 1 * SIZE(CO1) CVTU A2, C11 # A2=C11.upper=c22 LD B2, 1 * SIZE(CO2) MADD A1, B1, A1, ALPHA # c12 LD B5, 0 * SIZE(CO1) MADD A2, B2, A2, ALPHA # c22 LD B6, 0 * SIZE(CO2) MADD C11, B5, C11, ALPHA # c12 ST A1, 1 * SIZE(CO1) MADD C13, B6, C13, ALPHA # c22 ST A2, 1 * SIZE(CO2) ST C11, 0 * SIZE(CO1) ST C13, 0 * SIZE(CO2) CVTU A1, C14 # B1=C12.upper=c42 LD B1, 1 * SIZE(CO3) CVTU A2, C12 # B2=C14.upper=c32 LD B2, 1 * SIZE(CO4) MADD A1, B1, A1, ALPHA # c31 LD A5, 0 * SIZE(CO3) MADD A2, B2, A2, ALPHA LD A6, 0 * SIZE(CO4) MADD C12, A5, C12, ALPHA ST A1, 1 * SIZE(CO3) MADD C14, A6, C14, ALPHA ST A2, 1 * SIZE(CO4) ST C12, 0 * SIZE(CO3) ST C14, 0 * SIZE(CO4) daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE daddiu CO3, CO3, 2 * SIZE daddiu CO4, CO4, 2 * SIZE #else CVTU A1, C13 # A1=C13.upper=c12 CVTU A2, C11 # A2=C11.upper=c22 MUL A1, A1, ALPHA # c12 MUL A2, A2, ALPHA # c22 MUL C11, C11, ALPHA # c12 MUL C13, C13, ALPHA # c22 CVTU A3, C14 # B1=C12.upper=c42 CVTU A4, C12 # B2=C14.upper=c32 MUL A3, A3, ALPHA # c31 ST A1, 1 * SIZE(CO1) MUL A4, A4, ALPHA ST A2, 1 * SIZE(CO2) MUL C12, C12, ALPHA ST C11, 0 * SIZE(CO1) MUL C14, C14, ALPHA ST C13, 0 * SIZE(CO2) ST A3, 1 * SIZE(CO3) ST A4, 1 
* SIZE(CO4) ST C12, 0 * SIZE(CO3) ST C14, 0 * SIZE(CO4) daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE daddiu CO3, CO3, 2 * SIZE daddiu CO4, CO4, 2 * SIZE #if ( defined(LEFT) && defined(TRANSA))||\ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -4 #endif dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif .align 4 .L41: andi I, M, 1 blez I, .L40 NOP .align 4 .L411: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) ||\ (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, BASE_SHIFT dsll TEMP, KK, 2 + BASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 LD B1, 0 * SIZE(BO) MOV C21, C11 MOV C22, C11 LD A1, 0 * SIZE(AO) MOV C31, C11 MOV C32, C11 LD B2, 1 * SIZE(BO) MOV C41, C11 MOV C42, C11 LD B3, 2 * SIZE(BO) MOV C13, C11 MOV C14, C11 LD B4, 3 * SIZE(BO) MOV C23, C11 MOV C24, C11 MOV C33, C11 MOV C34, C11 MOV C43, C11 MOV C44, C11 #if (defined(LEFT) && !defined(TRANSA))||\ (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 4 #endif dsra L, TEMP, 2 blez L, .L412 #else move BO, B # Reset B dsra L, K, 2 # UnRoll K=4 MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 LD B1, 0 * SIZE(BO) MOV C21, C11 MOV C22, C11 LD A1, 0 * SIZE(AO) MOV C31, C11 MOV C32, C11 LD B2, 1 * SIZE(BO) MOV C41, C11 MOV C42, C11 LD B3, 2 * SIZE(BO) MOV C13, C11 MOV C14, C11 LD B4, 3 * SIZE(BO) MOV C23, C11 MOV C24, C11 MOV C33, C11 MOV C34, C11 MOV C43, C11 blez L, .L412 MOV C44, C11 #endif .L4110: daddiu L, L, -1 LD A2, 1 * SIZE(AO) MADD C11, C11, A1, B1 LD B5, 4 * SIZE(BO) MADD C12, C12, A1, B2 LD B6, 5 * SIZE(BO) MADD C13, C13, A1, B3 LD B7, 6 * SIZE(BO) MADD C14, C14, A1, B4 LD B8, 7 * SIZE(BO) LD A3, 2 * SIZE(AO) NOP MADD C11, C11, A2, B5 LD B1, 8 * SIZE(BO) MADD C12, C12, A2, B6 LD B2, 9 * SIZE(BO) MADD C13, C13, A2, B7 LD B3, 10 * SIZE(BO) MADD C14, C14, A2, B8 LD B4, 11 * SIZE(BO) LD A4, 3 * SIZE(AO) daddiu AO, AO, 4 * SIZE MADD C11, C11, A3, B1 LD B5, 12 * SIZE(BO) MADD C12, C12, A3, B2 LD B6, 13 * SIZE(BO) MADD C13, C13, A3, B3 LD B7, 14 * SIZE(BO) MADD C14, C14, A3, B4 LD B8, 15 * SIZE(BO) LD A1, 0 * SIZE(AO) daddiu BO, BO, 16 * SIZE MADD C11, C11, A4, B5 LD B1, 0 * SIZE(BO) MADD C12, C12, A4, B6 LD B2, 1 * SIZE(BO) MADD C13, C13, A4, B7 LD B3, 2 * SIZE(BO) MADD C14, C14, A4, B8 bgtz L, .L4110 LD B4, 3 * SIZE(BO) .L412: #ifndef TRMMKERNEL andi L, K, 2 #else andi L, TEMP, 2 #endif blez L, .L413 NOP LD A2, 1 * SIZE(AO) daddiu AO, AO, 2 * SIZE MADD C11, C11, A1, B1 LD B5, 4 * SIZE(BO) MADD C12, C12, A1, B2 LD B6, 5 * SIZE(BO) MADD C13, C13, A1, B3 LD B7, 6 * SIZE(BO) MADD C14, C14, A1, B4 LD B8, 7 * SIZE(BO) LD A1, 0 * SIZE(AO) daddiu BO, BO, 8 * SIZE MADD C11, C11, A2, B5 LD B1, 0 * SIZE(BO) MADD C12, C12, A2, B6 LD B2, 1 * SIZE(BO) MADD C13, C13, A2, B7 LD B3, 2 * SIZE(BO) MADD C14, C14, A2, B8 LD B4, 3 * SIZE(BO) .L413: #ifndef TRMMKERNEL andi L, K, 1 #else andi L, TEMP, 1 #endif blez L, .L410 LD ALPHA, 152($sp) MADD C11, C11, A1, B1 MADD C12, C12, A1, B2 daddiu AO, AO, 1 * SIZE MADD C13, C13, A1, B3 MADD C14, C14, A1, B4 daddiu BO, BO, 4 * SIZE .align 4 .L410: #ifndef TRMMKERNEL LD A5, 0 * SIZE(CO1) LD A6, 0 * SIZE(CO2) LD A7, 0 * SIZE(CO3) LD A8, 0 * SIZE(CO4) MADD A5, A5, C11, ALPHA MADD A6, A6, C12, ALPHA MADD A7, A7, C13, ALPHA MADD A8, A8, C14, ALPHA ST A5, 0 * SIZE(CO1) ST A6, 0 * SIZE(CO2) ST A7, 0 * 
SIZE(CO3) ST A8, 0 * SIZE(CO4) daddiu CO1, CO1, 1 * SIZE daddiu CO2, CO2, 1 * SIZE daddiu CO3, CO3, 1 * SIZE daddiu CO4, CO4, 1 * SIZE #else MUL A5, C11, ALPHA MUL A6, C12, ALPHA MUL A7, C13, ALPHA MUL A8, C14, ALPHA ST A5, 0 * SIZE(CO1) ST A6, 0 * SIZE(CO2) ST A7, 0 * SIZE(CO3) ST A8, 0 * SIZE(CO4) daddiu CO1, CO1, 1 * SIZE daddiu CO2, CO2, 1 * SIZE daddiu CO3, CO3, 1 * SIZE daddiu CO4, CO4, 1 * SIZE #if ( defined(LEFT) && defined(TRANSA))||\ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -4 #endif dsll L, TEMP, BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif #endif .align 4 .L40: #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK, 4 #endif daddiu J, J, -1 move B, BO bgtz J, .L48 NOP .align 4 .L2: # Nr=2 andi J, N, 2 blez J, .L1 NOP .L28: dsra I, M, 3 # MR=8 move AO, A # Reset A move CO1, C #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif daddu CO2, C, LDC blez I, .L24 daddu C, CO2, LDC .align 4 .L281: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 3 + BASE_SHIFT dsll TEMP, KK, 1 + BASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS LD A1, 0 * SIZE(AO) MOV C12, C11 LD A2, 1 * SIZE(AO) MOV C21, C11 LD A3, 2 * SIZE(AO) MOV C22, C11 LD A4, 3 * SIZE(AO) MOV C31, C11 LD A5, 4 * SIZE(AO) MOV C32, C11 LD A6, 5 * SIZE(AO) MOV C41, C11 LD B1, 0 * SIZE(BO) MOV C42, C11 LD B2, 1 * SIZE(BO) MOV C13, C11 LD A7, 6 * SIZE(AO) MOV C14, C11 LD A8, 7 * SIZE(AO) MOV C23, C11 MOV C24, C11 MOV C33, C11 MOV C34, C11 MOV C43, C11 MOV C44, C11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 8 #else daddiu TEMP, KK, 2 #endif dsra L, TEMP, 1 blez L, .L282 NOP #else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 MTC $0, C11 # CLEAR REAULTS REGISTERS LD A1, 0 * SIZE(AO) MOV C12, C11 LD A2, 1 * SIZE(AO) MOV C21, C11 LD A3, 2 * SIZE(AO) MOV C22, C11 LD A4, 3 * SIZE(AO) MOV C31, C11 LD A5, 4 * SIZE(AO) MOV C32, C11 LD A6, 5 * SIZE(AO) MOV C41, C11 LD B1, 0 * SIZE(BO) MOV C42, C11 LD B2, 1 * SIZE(BO) MOV C13, C11 LD A7, 6 * SIZE(AO) MOV C14, C11 LD A8, 7 * SIZE(AO) MOV C23, C11 MOV C24, C11 MOV C33, C11 MOV C34, C11 MOV C43, C11 blez L, .L282 MOV C44, C11 #endif .align 4 .L2810: daddiu L, L, -1 MADD C11, C11, A1, B1 LD B5, 8 * SIZE(AO) MADD C21, C21, A2, B1 LD B6, 9 * SIZE(AO) MADD C31, C31, A3, B1 LD B7, 10 * SIZE(AO) MADD C41, C41, A4, B1 LD B8, 11 * SIZE(AO) MADD C12, C12, A1, B2 MADD C22, C22, A2, B2 LD B3, 2 * SIZE(BO) MADD C32, C32, A3, B2 MADD C42, C42, A4, B2 LD B4, 3 * SIZE(BO) daddiu BO, BO, 4 * SIZE MADD C13, C13, A5, B1 MADD C23, C23, A6, B1 LD A1, 12 * SIZE(AO) MADD C33, C33, A7, B1 MADD C43, C43, A8, B1 LD A2, 13 * SIZE(AO) MADD C14, C14, A5, B2 MADD C24, C24, A6, B2 LD A3, 14 * SIZE(AO) MADD C34, C34, A7, B2 MADD C44, C44, A8, B2 LD A4, 15 * SIZE(AO) daddiu AO, AO, 16 * SIZE MADD C11, C11, B5, B3 LD A5, 4 * SIZE(AO) MADD C21, C21, B6, B3 LD A6, 5 * SIZE(AO) MADD C13, C13, A1, B3 MADD C23, C23, A2, B3 LD A7, 6 * SIZE(AO) MADD C33, C33, A3, B3 MADD C43, C43, A4, B3 LD A8, 7 * SIZE(AO) MADD C14, C14, A1, B4 MADD C24, C24, A2, B4 LD B1, 0 * SIZE(BO) MADD C34, C34, A3, B4 MADD C44, C44, A4, B4 LD B2, 1 * SIZE(BO) MADD C31, C31, B7, B3 MADD C41, C41, B8, B3 LD A1, 0 * SIZE(AO) MADD C12, C12, B5, B4 LD A2, 1 * SIZE(AO) MADD C22, C22, B6, B4 LD A3, 2 
* SIZE(AO) LD A4, 3 * SIZE(AO) MADD C32, C32, B7, B4 bgtz L, .L2810 MADD C42, C42, B8, B4 .align 4 .L282: #ifndef TRMMKERNEL andi L, K, 1 #else andi L, TEMP, 1 #endif blez L, .L280 LD ALPHA, 152($sp) MADD C13, C13, A5, B1 MADD C23, C23, A6, B1 MADD C33, C33, A7, B1 MADD C43, C43, A8, B1 MADD C14, C14, A5, B2 MADD C24, C24, A6, B2 MADD C34, C34, A7, B2 MADD C44, C44, A8, B2 daddiu AO, AO, 8 * SIZE MADD C11, C11, A1, B1 MADD C21, C21, A2, B1 MADD C31, C31, A3, B1 MADD C41, C41, A4, B1 MADD C12, C12, A1, B2 MADD C22, C22, A2, B2 MADD C32, C32, A3, B2 MADD C42, C42, A4, B2 daddiu BO, BO, 2 * SIZE .align 4 .L280: # Write Back #ifndef TRMMKERNEL daddiu I, I, -1 LD A1, 0 * SIZE(CO1) LD A2, 1 * SIZE(CO1) LD A3, 2 * SIZE(CO1) LD A4, 3 * SIZE(CO1) LD A5, 4 * SIZE(CO1) LD A6, 5 * SIZE(CO1) LD A7, 6 * SIZE(CO1) LD A8, 7 * SIZE(CO1) MADD A1, A1, C11, ALPHA LD B1, 0 * SIZE(CO2) MADD A2, A2, C21, ALPHA LD B2, 1 * SIZE(CO2) MADD A3, A3, C31, ALPHA LD B3, 2 * SIZE(CO2) MADD A4, A4, C41, ALPHA LD B4, 3 * SIZE(CO2) MADD A5, A5, C13, ALPHA LD B5, 4 * SIZE(CO2) MADD A6, A6, C23, ALPHA LD B6, 5 * SIZE(CO2) MADD A7, A7, C33, ALPHA LD B7, 6 * SIZE(CO2) MADD A8, A8, C43, ALPHA LD C11, 7 * SIZE(CO2) MADD B1, B1, C12, ALPHA ST A1, 0 * SIZE(CO1) MADD B2, B2, C22, ALPHA ST A2, 1 * SIZE(CO1) MADD B3, B3, C32, ALPHA ST A3, 2 * SIZE(CO1) MADD B4, B4, C42, ALPHA ST A4, 3 * SIZE(CO1) MADD B5, B5, C14, ALPHA ST A5, 4 * SIZE(CO1) MADD B6, B6, C24, ALPHA ST A6, 5 * SIZE(CO1) MADD B7, B7, C34, ALPHA ST A7, 6 * SIZE(CO1) MADD C11, C11, C44, ALPHA ST A8, 7 * SIZE(CO1) ST B1, 0 * SIZE(CO2) ST B2, 1 * SIZE(CO2) ST B3, 2 * SIZE(CO2) ST B4, 3 * SIZE(CO2) ST B5, 4 * SIZE(CO2) ST B6, 5 * SIZE(CO2) ST B7, 6 * SIZE(CO2) ST C11, 7 * SIZE(CO2) daddiu CO1, CO1, 8 * SIZE bgtz I, .L281 daddiu CO2, CO2, 8 * SIZE #else daddiu I, I, -1 MUL A1, C11, ALPHA MUL A2, C21, ALPHA MUL A3, C31, ALPHA MUL A4, C41, ALPHA MUL A5, C13, ALPHA MUL A6, C23, ALPHA MUL A7, C33, ALPHA MUL A8, C43, ALPHA MUL B1, C12, ALPHA ST A1, 0 * SIZE(CO1) MUL B2, C22, ALPHA ST A2, 1 * SIZE(CO1) MUL B3, C32, ALPHA ST A3, 2 * SIZE(CO1) MUL B4, C42, ALPHA ST A4, 3 * SIZE(CO1) MUL B5, C14, ALPHA ST A5, 4 * SIZE(CO1) MUL B6, C24, ALPHA ST A6, 5 * SIZE(CO1) MUL B7, C34, ALPHA ST A7, 6 * SIZE(CO1) MUL C11, C44, ALPHA ST A8, 7 * SIZE(CO1) ST B1, 0 * SIZE(CO2) ST B2, 1 * SIZE(CO2) ST B3, 2 * SIZE(CO2) ST B4, 3 * SIZE(CO2) ST B5, 4 * SIZE(CO2) ST B6, 5 * SIZE(CO2) ST B7, 6 * SIZE(CO2) ST C11, 7 * SIZE(CO2) #if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -8 #else daddiu TEMP, TEMP, -2 #endif dsll L, TEMP, 3 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 8 #endif daddiu CO1, CO1, 8 * SIZE bgtz I, .L281 daddiu CO2, CO2, 8 * SIZE #endif .align 4 .L24: andi I, M, 4 # MR=4 blez I, .L22 NOP .align 4 .L241: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 2 + BASE_SHIFT dsll TEMP, KK, 1 + BASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 MOV C22, C11 LD A2, 1 * SIZE(AO) MOV C31, C11 MOV C32, C11 LD A3, 2 * SIZE(AO) MOV C41, C11 MOV C42, C11 LD A4, 3 * SIZE(AO) MOV C13, C11 MOV C14, C11 LD B1, 0 * SIZE(BO) MOV C23, C11 MOV C24, C11 LD B2, 1 * SIZE(BO) MOV C33, C11 MOV C34, C11 MOV C43, C11 MOV C44, C11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 
dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 4 #else daddiu TEMP, KK, 2 #endif dsra L, TEMP, 1 blez L, .L242 NOP #else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 MOV C22, C11 LD A2, 1 * SIZE(AO) MOV C31, C11 MOV C32, C11 LD A3, 2 * SIZE(AO) MOV C41, C11 MOV C42, C11 LD A4, 3 * SIZE(AO) MOV C13, C11 MOV C14, C11 LD B1, 0 * SIZE(BO) MOV C23, C11 MOV C24, C11 LD B2, 1 * SIZE(BO) MOV C33, C11 MOV C34, C11 MOV C43, C11 blez L, .L242 MOV C44, C11 #endif .align 4 .L2410: daddiu L, L, -1 MADD C11, C11, A1, B1 LD A5, 4 * SIZE(AO) MADD C21, C21, A2, B1 LD B3, 2 * SIZE(BO) MADD C31, C31, A3, B1 LD B4, 3 * SIZE(BO) MADD C41, C41, A4, B1 LD A6, 5 * SIZE(AO) daddiu BO, BO, 4 * SIZE MADD C12, C12, A1, B2 LD A7, 6 * SIZE(AO) MADD C22, C22, A2, B2 LD A8, 7 * SIZE(AO) daddiu AO, AO, 8 * SIZE MADD C32, C32, A3, B2 MADD C42, C42, A4, B2 MADD C11, C11, A5, B3 LD A1, 0 * SIZE(AO) MADD C21, C21, A6, B3 LD B1, 0 * SIZE(BO) MADD C31, C31, A7, B3 LD B2, 1 * SIZE(BO) MADD C41, C41, A8, B3 LD A2, 1 * SIZE(AO) MADD C12, C12, A5, B4 LD A3, 2 * SIZE(AO) MADD C22, C22, A6, B4 LD A4, 3 * SIZE(AO) MADD C32, C32, A7, B4 bgtz L, .L2410 MADD C42, C42, A8, B4 .align 4 .L242: #ifndef TRMMKERNEL andi L, K, 1 #else andi L, TEMP, 1 #endif blez L, .L240 LD ALPHA, 152($sp) MADD C11, C11, A1, B1 MADD C21, C21, A2, B1 MADD C31, C31, A3, B1 MADD C41, C41, A4, B1 MADD C12, C12, A1, B2 MADD C22, C22, A2, B2 MADD C32, C32, A3, B2 MADD C42, C42, A4, B2 daddiu AO, AO, 4 * SIZE daddiu BO, BO, 2 * SIZE .align 4 .L240: # Write Back #ifndef TRMMKERNEL LD A1, 0 * SIZE(CO1) LD A2, 1 * SIZE(CO1) LD A3, 2 * SIZE(CO1) LD A4, 3 * SIZE(CO1) MADD A1, A1, C11, ALPHA LD B1, 0 * SIZE(CO2) MADD A2, A2, C21, ALPHA LD B2, 1 * SIZE(CO2) MADD A3, A3, C31, ALPHA LD B3, 2 * SIZE(CO2) MADD A4, A4, C41, ALPHA LD B4, 3 * SIZE(CO2) MADD B1, B1, C12, ALPHA ST A1, 0 * SIZE(CO1) MADD B2, B2, C22, ALPHA ST A2, 1 * SIZE(CO1) MADD B3, B3, C32, ALPHA ST A3, 2 * SIZE(CO1) MADD B4, B4, C42, ALPHA ST A4, 3 * SIZE(CO1) ST B1, 0 * SIZE(CO2) ST B2, 1 * SIZE(CO2) ST B3, 2 * SIZE(CO2) ST B4, 3 * SIZE(CO2) daddiu CO1, CO1, 4 * SIZE daddiu CO2, CO2, 4 * SIZE #else MUL A1, C11, ALPHA MUL A2, C21, ALPHA MUL A3, C31, ALPHA MUL A4, C41, ALPHA MUL B1, C12, ALPHA ST A1, 0 * SIZE(CO1) MUL B2, C22, ALPHA ST A2, 1 * SIZE(CO1) MUL B3, C32, ALPHA ST A3, 2 * SIZE(CO1) MUL B4, C42, ALPHA ST A4, 3 * SIZE(CO1) ST B1, 0 * SIZE(CO2) ST B2, 1 * SIZE(CO2) ST B3, 2 * SIZE(CO2) ST B4, 3 * SIZE(CO2) daddiu CO1, CO1, 4 * SIZE daddiu CO2, CO2, 4 * SIZE #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -4 #else daddiu TEMP, TEMP, -2 #endif dsll L, TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 4 #endif #endif .align 4 .L22: andi I, M, 2 blez I, .L21 NOP .align 4 .L221: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 1 + BASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 MOV C22, C11 LD A2, 1 * SIZE(AO) MOV C31, C11 MOV C32, C11 LD B1, 0 * SIZE(BO) MOV C41, C11 MOV C42, C11 LD B2, 1 * SIZE(BO) MOV C43, C11 MOV C44, C11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 2 #else daddiu 
TEMP, KK, 2 #endif dsra L, TEMP, 1 blez L, .L222 NOP #else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 MOV C22, C11 LD A2, 1 * SIZE(AO) MOV C31, C11 MOV C32, C11 LD B1, 0 * SIZE(BO) MOV C41, C11 MOV C42, C11 LD B2, 1 * SIZE(BO) MOV C43, C11 blez L, .L222 MOV C44, C11 #endif .align 4 .L2210: daddiu L, L, -1 MADD C11, C11, A1, B1 LD A3, 2 * SIZE(AO) MADD C21, C21, A2, B1 LD B3, 2 * SIZE(BO) MADD C12, C12, A1, B2 LD A4, 3 * SIZE(AO) daddiu AO, AO, 4 * SIZE MADD C22, C22, A2, B2 LD B4, 3 * SIZE(BO) daddiu BO, BO, 4 * SIZE MADD C11, C11, A3, B3 LD A1, 0 * SIZE(AO) MADD C21, C21, A4, B3 LD B1, 0 * SIZE(BO) MADD C12, C12, A3, B4 LD B2, 1 * SIZE(BO) MADD C22, C22, A4, B4 bgtz L, .L2210 LD A2, 1 * SIZE(AO) .align 4 .L222: #ifndef TRMMKERNEL andi L, K, 1 #else andi L, TEMP, 1 #endif blez L, .L220 LD ALPHA, 152($sp) MADD C11, C11, A1, B1 MADD C21, C21, A2, B1 MADD C12, C12, A1, B2 MADD C22, C22, A2, B2 daddiu AO, AO, 2 * SIZE daddiu BO, BO, 2 * SIZE .align 4 .L220: # Write Back #ifndef TRMMKERNEL LD A1, 0 * SIZE(CO1) LD A2, 1 * SIZE(CO1) MADD A1, A1, C11, ALPHA LD B1, 0 * SIZE(CO2) MADD A2, A2, C21, ALPHA LD B2, 1 * SIZE(CO2) MADD B1, B1, C12, ALPHA ST A1, 0 * SIZE(CO1) MADD B2, B2, C22, ALPHA ST A2, 1 * SIZE(CO1) ST B1, 0 * SIZE(CO2) ST B2, 1 * SIZE(CO2) daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE #else MUL A1, C11, ALPHA MUL A2, C21, ALPHA MUL B1, C12, ALPHA MUL B2, C22, ALPHA ST A1, 0 * SIZE(CO1) ST A2, 1 * SIZE(CO1) ST B1, 0 * SIZE(CO2) ST B2, 1 * SIZE(CO2) daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -2 #endif dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddu KK, KK, 2 #endif #endif .align 4 .L21: andi I, M, 1 blez I, .L20 NOP .align 4 .L211: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B # Reset B #else dsll L, KK, BASE_SHIFT dsll TEMP, KK, 1 + BASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 MOV C22, C11 MOV C31, C11 MOV C32, C11 LD B1, 0 * SIZE(BO) MOV C41, C11 MOV C42, C11 LD B2, 1 * SIZE(BO) MOV C43, C11 MOV C44, C11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 2 #endif dsra L, TEMP, 1 blez L, .L212 NOP #else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 MOV C22, C11 MOV C31, C11 MOV C32, C11 LD B1, 0 * SIZE(BO) MOV C41, C11 MOV C42, C11 LD B2, 1 * SIZE(BO) MOV C43, C11 blez L, .L212 MOV C44, C11 #endif .align 4 .L2110: daddiu L, L, -1 MADD C11, C11, A1, B1 LD A2, 1 * SIZE(AO) MADD C12, C12, A1, B2 LD B3, 2 * SIZE(BO) LD B4, 3 * SIZE(BO) daddiu AO, AO, 2 * SIZE daddiu BO, BO, 4 * SIZE MADD C11, C11, A2, B3 LD A1, 0 * SIZE(AO) MADD C12, C12, A2, B4 LD B1, 0 * SIZE(BO) bgtz L, .L2110 LD B2, 1 * SIZE(BO) .align 4 .L212: #ifndef TRMMKERNEL andi L, K, 1 #else andi L, TEMP, 1 #endif blez L, .L210 LD ALPHA, 152($sp) MADD C11, C11, A1, B1 MADD C12, C12, A1, B2 daddiu AO, AO, 1 * SIZE daddiu BO, BO, 2 * SIZE .align 4 .L210: # Write Back #ifndef TRMMKERNEL LD A1, 0 * SIZE(CO1) MADD A1, A1, C11, ALPHA LD B1, 0 * SIZE(CO2) MADD B1, 
B1, C12, ALPHA ST A1, 0 * SIZE(CO1) ST B1, 0 * SIZE(CO2) daddiu CO1, CO1, 1 * SIZE daddiu CO2, CO2, 1 * SIZE #else MUL A1, C11, ALPHA MUL B1, C12, ALPHA ST A1, 0 * SIZE(CO1) ST B1, 0 * SIZE(CO2) daddiu CO1, CO1, 1 * SIZE daddiu CO2, CO2, 1 * SIZE #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -2 #endif dsll L, TEMP, BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif #endif .align 4 .L20: #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK, 2 #endif move B, BO .align 4 .L1: andi J, N, 1 blez J, .L999 NOP .L18: dsra I, M, 3 # MR=8 move AO, A # Reset A #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif blez I, .L14 NOP .align 4 .L181: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B # Reset B #else dsll L, KK, 3 + BASE_SHIFT dsll TEMP, KK, BASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS LD A1, 0 * SIZE(AO) MOV C12, C11 LD A2, 1 * SIZE(AO) MOV C21, C11 LD A3, 2 * SIZE(AO) MOV C22, C11 LD A4, 3 * SIZE(AO) MOV C31, C11 LD A5, 4 * SIZE(AO) MOV C32, C11 LD A6, 5 * SIZE(AO) MOV C41, C11 LD B1, 0 * SIZE(BO) MOV C42, C11 LD A7, 6 * SIZE(AO) MOV C13, C11 LD A8, 7 * SIZE(AO) MOV C14, C11 MOV C23, C11 MOV C24, C11 MOV C33, C11 MOV C34, C11 MOV C43, C11 MOV C44, C11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 8 #else daddiu TEMP, KK, 1 #endif dsra L, TEMP, 1 blez L, .L182 NOP #else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 MTC $0, C11 # CLEAR REAULTS REGISTERS LD A1, 0 * SIZE(AO) MOV C12, C11 LD A2, 1 * SIZE(AO) MOV C21, C11 LD A3, 2 * SIZE(AO) MOV C22, C11 LD A4, 3 * SIZE(AO) MOV C31, C11 LD A5, 4 * SIZE(AO) MOV C32, C11 LD A6, 5 * SIZE(AO) MOV C41, C11 LD B1, 0 * SIZE(BO) MOV C42, C11 LD A7, 6 * SIZE(AO) MOV C13, C11 LD A8, 7 * SIZE(AO) MOV C14, C11 MOV C23, C11 MOV C24, C11 MOV C33, C11 MOV C34, C11 MOV C43, C11 blez L, .L182 MOV C44, C11 #endif .align 4 .L1810: daddiu L, L, -1 MADD C11, C11, A1, B1 LD B5, 8 * SIZE(AO) MADD C21, C21, A2, B1 LD B6, 9 * SIZE(AO) MADD C31, C31, A3, B1 LD B7, 10 * SIZE(AO) MADD C41, C41, A4, B1 LD B8, 11 * SIZE(AO) MADD C13, C13, A5, B1 LD B2, 1 * SIZE(BO) daddiu BO, BO, 2 * SIZE MADD C23, C23, A6, B1 LD A1, 12 * SIZE(AO) MADD C33, C33, A7, B1 LD A2, 13 * SIZE(AO) MADD C43, C43, A8, B1 LD A3, 14 * SIZE(AO) LD A4, 15 * SIZE(AO) daddiu AO, AO, 16 * SIZE MADD C11, C11, B5, B2 LD A5, 4 * SIZE(AO) MADD C21, C21, B6, B2 LD A6, 5 * SIZE(AO) MADD C13, C13, A1, B2 LD A7, 6 * SIZE(AO) MADD C23, C23, A2, B2 LD A8, 7 * SIZE(AO) MADD C33, C33, A3, B2 LD B1, 0 * SIZE(BO) MADD C43, C43, A4, B2 LD A1, 0 * SIZE(AO) MADD C31, C31, B7, B2 LD A2, 1 * SIZE(AO) MADD C41, C41, B8, B2 LD A3, 2 * SIZE(AO) bgtz L, .L1810 LD A4, 3 * SIZE(AO) .align 4 .L182: #ifndef TRMMKERNEL andi L, K, 1 #else andi L, TEMP, 1 #endif blez L, .L180 LD ALPHA, 152($sp) MADD C13, C13, A5, B1 MADD C23, C23, A6, B1 MADD C33, C33, A7, B1 MADD C43, C43, A8, B1 daddiu AO, AO, 8 * SIZE MADD C11, C11, A1, B1 MADD C21, C21, A2, B1 MADD C31, C31, A3, B1 MADD C41, C41, A4, B1 daddiu BO, BO, 1 * SIZE .align 4 .L180: # Write Back #ifndef TRMMKERNEL daddiu I, I, -1 LD A1, 0 * SIZE(C) LD A2, 1 * SIZE(C) LD A3, 2 * SIZE(C) LD A4, 3 * SIZE(C) LD A5, 4 * SIZE(C) LD A6, 5 * SIZE(C) LD A7, 6 * SIZE(C) LD A8, 7 * SIZE(C) MADD A1, A1, C11, ALPHA 
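# Note on the MADD macro as used throughout this file (a reading of the code,
# not a new definition): MADD d, a, b, c accumulates d = a + b * c.  The GEMM
# write-back "MADD A1, A1, C11, ALPHA" above therefore computes
# C[0] = C[0] + alpha * acc, while the TRMMKERNEL branch further down uses
# plain MUL to store alpha * acc without reading C first.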
MADD A2, A2, C21, ALPHA MADD A3, A3, C31, ALPHA MADD A4, A4, C41, ALPHA MADD A5, A5, C13, ALPHA MADD A6, A6, C23, ALPHA MADD A7, A7, C33, ALPHA MADD A8, A8, C43, ALPHA ST A1, 0 * SIZE(C) ST A2, 1 * SIZE(C) ST A3, 2 * SIZE(C) ST A4, 3 * SIZE(C) ST A5, 4 * SIZE(C) ST A6, 5 * SIZE(C) ST A7, 6 * SIZE(C) ST A8, 7 * SIZE(C) daddiu C, C, 8 * SIZE bgtz I, .L181 NOP #else daddiu I, I, -1 MUL A1, C11, ALPHA MUL A2, C21, ALPHA MUL A3, C31, ALPHA MUL A4, C41, ALPHA MUL A5, C13, ALPHA MUL A6, C23, ALPHA MUL A7, C33, ALPHA MUL A8, C43, ALPHA ST A1, 0 * SIZE(C) ST A2, 1 * SIZE(C) ST A3, 2 * SIZE(C) ST A4, 3 * SIZE(C) ST A5, 4 * SIZE(C) ST A6, 5 * SIZE(C) ST A7, 6 * SIZE(C) ST A8, 7 * SIZE(C) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -8 #else daddiu TEMP, TEMP, -1 #endif dsll L, TEMP, 3 + BASE_SHIFT dsll TEMP, TEMP, BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 8 #endif daddiu C, C, 8 * SIZE bgtz I, .L181 NOP #endif .align 4 .L14: andi I, M, 4 # MR=4 blez I, .L12 NOP .align 4 .L141: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 2 + BASE_SHIFT dsll TEMP, KK, BASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 MOV C22, C11 LD A2, 1 * SIZE(AO) MOV C31, C11 MOV C32, C11 LD A3, 2 * SIZE(AO) MOV C41, C11 MOV C42, C11 LD A4, 3 * SIZE(AO) MOV C13, C11 MOV C14, C11 LD B1, 0 * SIZE(BO) MOV C23, C11 MOV C24, C11 MOV C33, C11 MOV C34, C11 MOV C43, C11 MOV C44, C11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 4 #else daddiu TEMP, KK, 1 #endif dsra L, TEMP, 1 blez L, .L142 NOP #else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 MOV C22, C11 LD A2, 1 * SIZE(AO) MOV C31, C11 MOV C32, C11 LD A3, 2 * SIZE(AO) MOV C41, C11 MOV C42, C11 LD A4, 3 * SIZE(AO) MOV C13, C11 MOV C14, C11 LD B1, 0 * SIZE(BO) MOV C23, C11 MOV C24, C11 MOV C33, C11 MOV C34, C11 MOV C43, C11 blez L, .L142 MOV C44, C11 #endif .align 4 .L1410: daddiu L, L, -1 MADD C11, C11, A1, B1 LD A5, 4 * SIZE(AO) MADD C21, C21, A2, B1 LD B3, 1 * SIZE(BO) MADD C31, C31, A3, B1 LD A6, 5 * SIZE(AO) daddiu BO, BO, 2 * SIZE MADD C41, C41, A4, B1 LD A7, 6 * SIZE(AO) LD A8, 7 * SIZE(AO) daddiu AO, AO, 8 * SIZE MADD C11, C11, A5, B3 LD A1, 0 * SIZE(AO) MADD C21, C21, A6, B3 LD B1, 0 * SIZE(BO) MADD C31, C31, A7, B3 LD A2, 1 * SIZE(AO) MADD C41, C41, A8, B3 LD A3, 2 * SIZE(AO) bgtz L, .L1410 LD A4, 3 * SIZE(AO) .align 4 .L142: #ifndef TRMMKERNEL andi L, K, 1 #else andi L, TEMP, 1 #endif blez L, .L140 LD ALPHA, 152($sp) MADD C11, C11, A1, B1 MADD C21, C21, A2, B1 MADD C31, C31, A3, B1 MADD C41, C41, A4, B1 daddiu AO, AO, 4 * SIZE daddiu BO, BO, 1 * SIZE .align 4 .L140: # Write Back #ifndef TRMMKERNEL LD A1, 0 * SIZE(C) LD A2, 1 * SIZE(C) LD A3, 2 * SIZE(C) LD A4, 3 * SIZE(C) MADD A1, A1, C11, ALPHA MADD A2, A2, C21, ALPHA MADD A3, A3, C31, ALPHA MADD A4, A4, C41, ALPHA ST A1, 0 * SIZE(C) ST A2, 1 * SIZE(C) ST A3, 2 * SIZE(C) ST A4, 3 * SIZE(C) daddiu C, C, 4 * SIZE #else MUL A1, C11, ALPHA MUL A2, C21, ALPHA MUL A3, C31, ALPHA MUL A4, C41, ALPHA ST A1, 0 * SIZE(C) ST A2, 1 * SIZE(C) ST A3, 2 * SIZE(C) ST A4, 3 * SIZE(C) daddiu C, C, 4 * SIZE #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
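# Annotation for the TRMM pointer fix-up that follows (a reading of the shift
# arithmetic, with BASE_SHIFT = log2(SIZE)): TEMP ends up as the part of K
# that this MR=4 x NR=1 tile did not run over, and the two dsll/daddu pairs
# advance AO by TEMP*4 elements and BO by TEMP*1 element, moving both packed
# panels to the end of panel Ai / Bj, matching the "mov A to the end of
# panel" comments in the companion sgemm kernel.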
dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -4 #else daddiu TEMP, TEMP, -1 #endif dsll L, TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 4 #endif #endif .align 4 .L12: andi I, M, 2 blez I, .L11 NOP .align 4 .L121: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) ||\ (!defined(LEFT) && !defined(TRANSA)) move BO, B # Reset B #else dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, BASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 MOV C22, C11 LD A2, 1 * SIZE(AO) MOV C31, C11 MOV C32, C11 LD B1, 0 * SIZE(BO) MOV C41, C11 MOV C42, C11 MOV C43, C11 MOV C44, C11 #if (defined(LEFT) && !defined(TRANSA)) ||\ (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 2 #else daddiu TEMP, KK, 1 #endif dsra L, TEMP, 1 blez L, .L122 NOP #else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 MOV C22, C11 LD A2, 1 * SIZE(AO) MOV C31, C11 MOV C32, C11 LD B1, 0 * SIZE(BO) MOV C41, C11 MOV C42, C11 MOV C43, C11 blez L, .L122 MOV C44, C11 #endif .align 4 .L1210: daddiu L, L, -1 MADD C11, C11, A1, B1 LD B3, 1 * SIZE(BO) MADD C21, C21, A2, B1 daddiu BO, BO, 2 * SIZE LD A3, 2 * SIZE(AO) LD A4, 3 * SIZE(AO) daddiu AO, AO, 4 * SIZE MADD C11, C11, A3, B3 LD B1, 0 * SIZE(BO) MADD C21, C21, A4, B3 LD A1, 0 * SIZE(AO) bgtz L, .L1210 LD A2, 1 * SIZE(AO) .align 4 .L122: #ifndef TRMMKERNEL andi L, K, 1 #else andi L, TEMP, 1 #endif blez L, .L120 LD ALPHA, 152($sp) MADD C11, C11, A1, B1 MADD C21, C21, A2, B1 daddiu AO, AO, 2 * SIZE daddiu BO, BO, 1 * SIZE .align 4 .L120: # Write Back #ifndef TRMMKERNEL LD A1, 0 * SIZE(C) LD A2, 1 * SIZE(C) MADD A1, A1, C11, ALPHA MADD A2, A2, C21, ALPHA ST A1, 0 * SIZE(C) ST A2, 1 * SIZE(C) daddiu C, C, 2 * SIZE #else MUL A1, C11, ALPHA MUL A2, C21, ALPHA ST A1, 0 * SIZE(C) ST A2, 1 * SIZE(C) daddiu C, C, 2 * SIZE #if ( defined(LEFT) && defined(TRANSA))||\ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -1 #endif dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif .align 4 .L11: andi I, M, 1 blez I, .L10 NOP .align 4 .L111: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA))||\ (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, BASE_SHIFT daddu AO, AO, L daddu BO, B, L #endif MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 MOV C22, C11 LD B1, 0 * SIZE(BO) MOV C31, C11 MOV C32, C11 #if (defined(LEFT) && !defined(TRANSA))||\ (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 1 #endif dsra L, TEMP, 1 blez L, .L112 NOP #else move BO, B # Reset B dsra L, K, 1 # UnRoll K=4 MTC $0, C11 # CLEAR REAULTS REGISTERS MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 MOV C22, C11 LD B1, 0 * SIZE(BO) MOV C31, C11 blez L, .L112 MOV C32, C11 #endif .align 4 .L1110: daddiu L, L, -1 MADD C11, C11, A1, B1 LD A2, 1 * SIZE(AO) LD B2, 1 * SIZE(BO) daddiu AO, AO, 2 * SIZE daddiu BO, BO, 2 * SIZE MADD C11, C11, A2, B2 LD A1, 0 * SIZE(AO) LD B1, 0 * SIZE(BO) bgtz L, .L1110 NOP .align 4 .L112: #ifndef TRMMKERNEL andi L, K, 1 #else andi L, TEMP, 1 #endif blez L, .L110 LD ALPHA, 152($sp) MADD C11, C11, A1, B1 daddiu AO, AO, 1 * SIZE daddiu BO, BO, 1 * SIZE .align 4 
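# Reference sketch of the write-back convention shared by every tile size in
# this kernel (illustration only; the loop and names are hypothetical and not
# part of the build):
#
#     /* GEMM path: C is loaded, then updated in place          */
#     /*   for (i = 0; i < mr*nr; i++) c[i] += alpha * acc[i];  */
#     /* TRMMKERNEL path: C is simply overwritten               */
#     /*   for (i = 0; i < mr*nr; i++) c[i]  = alpha * acc[i];  */
#
# The 1x1 tile at .L110 below is the smallest instance of this pattern.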
.L110: # Write Back #ifndef TRMMKERNEL LD A1, 0 * SIZE(C) MADD A1, A1, C11, ALPHA ST A1, 0 * SIZE(C) daddiu C, C, 1 * SIZE #else MUL A1, C11, ALPHA ST A1, 0 * SIZE(C) daddiu C, C, 1 * SIZE #endif .align 4 .L10: move B, BO NOP .L999: ld $16, 0($sp) ld $17, 8($sp) ld $18, 16($sp) ld $19, 24($sp) ld $20, 32($sp) ld $21, 40($sp) ld $22, 48($sp) LD $f24, 56($sp) LD $f25, 64($sp) LD $f26, 72($sp) LD $f27, 80($sp) LD $f28, 88($sp) #if defined(TRMMKERNEL) ld $23, 96($sp) ld $24, 104($sp) ld $25, 112($sp) #endif #ifndef __64BIT__ LD $f20,120($sp) LD $f21,128($sp) LD $f22,136($sp) LD $f23,144($sp) #endif daddiu $sp,$sp,STACKSIZE j $31 nop EPILOGUE # .set macro # .set reorder # .end gemm # .size gemm, .-gemm # .ident "GCC: (Debian 4.4.6-6) 4.4.6" OpenBLAS-0.2.20/kernel/mips64/sgemm_kernel_loongson3a_4x4.S000066400000000000000000001176461313527062700232220ustar00rootroot00000000000000#define REALNAME ASMNAME #define ASSEMBLER #include "common.h" #define FETCH ld #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define M $4 #define N $5 #define K $6 #define A $8 #define B $9 #define C $10 #define LDC $11 #define AO $12 #define BO $13 #define CO1 $14 #define CO2 $15 #define CO3 $16 #define CO4 $17 #define KCO $18 #define MCO $19 #define NCO $20 #define SPANB $21 #define PREB $23 #define PREA $24 #define SPANA $25 #define ALPHA $f15 #if defined(TRMMKERNEL) #define OFFSET $2 #define KK $3 #define TEMP $7 #endif #define R8 8 #define R9 9 #define R14 14 #define R15 15 #define R16 16 #define R17 17 #define t11 $f30 #define t21 $f31 #define t31 $f28 #define t41 $f29 #define t12 $f26 #define t22 $f27 #define t32 $f24 #define t42 $f25 #define t13 $f22 #define t23 $f23 #define t33 $f20 #define t43 $f21 #define t14 $f18 #define t24 $f19 #define t34 $f16 #define t44 $f17 #define c11 $f0 #define c21 $f1 #define c31 $f2 #define c41 $f3 #define c12 $f4 #define c22 $f5 #define c32 $f6 #define c42 $f7 #define c13 $f8 #define c23 $f9 #define c33 $f10 #define c43 $f11 #define c14 $f12 #define c24 $f13 #define c34 $f14 #define c44 $f0 #define a0 $f0 #define a1 $f1 #define a2 $f2 #define a3 $f3 #define a4 $f4 #define a5 $f5 #define a6 $f6 #define a7 $f7 #define b0 $f8 #define b1 $f9 #define b2 $f10 #define b3 $f11 #define b4 $f12 #define b5 $f13 #define b6 $f14 #define b7 $f15 #define F31 31 #define F30 30 #define F29 29 #define F28 28 #define F27 27 #define F26 26 #define F25 25 #define F24 24 #define F23 23 #define F22 22 #define F21 21 #define F20 20 #define F19 19 #define F18 18 #define F17 17 #define F16 16 #define F15 15 #define F14 14 #define F13 13 #define F12 12 #define F11 11 #define F10 10 #define F9 9 #define F8 8 #define F7 7 #define F6 6 #define F5 5 #define F4 4 #define F3 3 #define F2 2 #define F1 1 #define F0 0 PROLOGUE daddiu $sp, $sp, -160 sd $16, 0($sp) sd $17, 8($sp) sd $18, 16($sp) sd $19, 24($sp) sd $20, 32($sp) sd $21, 40($sp) sd $22, 48($sp) ST $f24, 56($sp) ST $f25, 64($sp) ST $f26, 72($sp) ST $f27, 80($sp) ST $f28, 88($sp) sd $23, 96($sp) sd $24, 104($sp) sd $25, 112($sp) ST $f20,120($sp) ST $f21,128($sp) ST $f22,136($sp) ST $f23,144($sp) .align 5 .L0_N4: # Loop N ST ALPHA,152($sp) # Backup ALPHA move MCO,M # Backup M move NCO,N # Backup N move KCO,K # Backup K move AO,A # Backup A_addr dsra N,NCO,2 # N=NCO/2 dsll LDC,LDC,BASE_SHIFT # LDC*8Byte dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5 #if defined(TRMMKERNEL) LDARG OFFSET,160($sp) # OFFSET is 
relate to the data part #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg KK,OFFSET #endif move BO,B # Backup B_addr beq N,$0,.L0_N2 # N=0,NCO<4 dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte .L0_N4_Lb: # mr=4,nr=4 move CO1,C dsra M,MCO,2 # M=MCO/2 move A,AO # Reset A daddu CO2,C,LDC daddu PREB,BO,SPANB # PreB point next panelB daddu CO3,CO2,LDC daddu PREA,AO,SPANA daddu CO4,CO3,LDC #if defined(TRMMKERNEL) && defined(LEFT) move KK,OFFSET #endif beqz M,.L14_M2 daddu C,CO4,LDC # move C to next panel Cj .L10: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # (SIDE=L and UPLO=L) or (SIZE=R and UPLO=U) #else dsll K,KK,2 + BASE_SHIFT # KK is the length that needs to span to the data part dsll TEMP,KK,2 + BASE_SHIFT daddu A,A,K # move A B to data part daddu B,BO,TEMP #endif MTC $0,t11 # GEMM part NR=4,MR=4 LD a0,0(A) MOV t21,t11 MOV t31,t11 LD a1,1*SIZE(A) MOV t41,t11 MOV t12,t11 LD b0,0(B) MOV t22,t11 MOV t32,t11 LD b1,1*SIZE(B) MOV t42,t11 LD a2,2*SIZE(A) MOV t13,t11 MOV t23,t11 LD b2,2*SIZE(B) MOV t33,t11 MOV t43,t11 LD a3,3*SIZE(A) MOV t14,t11 MOV t24,t11 LD b3,3*SIZE(B) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK # temp is the length of the data part #elif defined(LEFT) daddiu TEMP, KK, 4 # S=L,U=L #else daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part #endif dsra K,TEMP,2 # K=KCO/2 MOV t34,t11 beqz K,.L15 MOV t44,t11 #else move B,BO # Reset B MTC $0,t11 # GEMM part NR=4,MR=4 LD a0,0(A) MOV t21,t11 MOV t31,t11 LD a1,1*SIZE(A) MOV t41,t11 MOV t12,t11 LD b0,0(B) MOV t22,t11 MOV t32,t11 LD b1,1*SIZE(B) MOV t42,t11 dsra K,KCO,2 # K=KCO/2 LD a2,2*SIZE(A) MOV t13,t11 MOV t23,t11 LD b2,2*SIZE(B) MOV t33,t11 MOV t43,t11 LD a3,3*SIZE(A) MOV t14,t11 MOV t24,t11 LD b3,3*SIZE(B) MOV t34,t11 beqz K,.L15 MOV t44,t11 # clear 16 results registers #endif .align 5 .L11: # kr=4 MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 LD a4,4*SIZE(A) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 LD a5,5*SIZE(A) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 LD b4,4*SIZE(B) MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 LD b5,5*SIZE(B) FETCH $0,(PREB) MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 LD a6,6*SIZE(A) MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 LD b6,6*SIZE(B) FETCH $0,(PREA) MADD t33,t33,a2,b2 MADD t43,t43,a3,b2 LD a7,7*SIZE(A) MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 LD b7,7*SIZE(B) .L12: MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 LD a0,8*SIZE(A) MADD t12,t12,a4,b5 MADD t22,t22,a5,b5 LD a1,9*SIZE(A) MADD t31,t31,a6,b4 MADD t41,t41,a7,b4 LD b0,8*SIZE(B) MADD t32,t32,a6,b5 MADD t42,t42,a7,b5 LD b1,9*SIZE(B) FETCH $0,4*SIZE(PREB) MADD t13,t13,a4,b6 MADD t23,t23,a5,b6 LD a2,10*SIZE(A) MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 LD b2,10*SIZE(B) FETCH $0,4*SIZE(PREA) MADD t33,t33,a6,b6 MADD t43,t43,a7,b6 LD a3,11*SIZE(A) MADD t34,t34,a6,b7 MADD t44,t44,a7,b7 LD b3,11*SIZE(B) .L13: MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 LD a4,12*SIZE(A) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 LD a5,13*SIZE(A) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 LD b4,12*SIZE(B) FETCH $0,8*SIZE(PREA) MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 LD b5,13*SIZE(B) FETCH $0,8*SIZE(PREB) MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 LD a6,14*SIZE(A) MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 daddu A,A,16*SIZE # 4mr*4kr LD b6,14*SIZE(B) MADD t33,t33,a2,b2 MADD t43,t43,a3,b2 daddu B,B,16*SIZE # 4nr*4kr LD a7,-1*SIZE(A) MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 LD b7,-1*SIZE(B) .L14: MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 LD a0,0(A) MADD t12,t12,a4,b5 MADD t22,t22,a5,b5 LD 
a1,1*SIZE(A) MADD t31,t31,a6,b4 MADD t41,t41,a7,b4 daddiu K,K,-1 LD b0,0(B) MADD t32,t32,a6,b5 MADD t42,t42,a7,b5 daddu PREA,PREA,16*SIZE LD b1,1*SIZE(B) FETCH $0,12*SIZE(PREB) MADD t13,t13,a4,b6 MADD t23,t23,a5,b6 LD a2,2*SIZE(A) FETCH $0,-4*SIZE(PREA) MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 LD b2,2*SIZE(B) MADD t33,t33,a6,b6 MADD t43,t43,a7,b6 daddu PREB,PREB,16*SIZE LD a3,3*SIZE(A) MADD t34,t34,a6,b7 MADD t44,t44,a7,b7 bnez K,.L11 LD b3,3*SIZE(B) .L15: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP, 2 #endif beqz K,.L18 nop .L16: MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 LD a4,4*SIZE(A) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 LD a5,5*SIZE(A) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 LD b4,4*SIZE(B) FETCH $0,0(PREA) MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 LD b5,5*SIZE(B) FETCH $0,0(PREB) MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 LD a6,6*SIZE(A) MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 daddu A,A,8*SIZE # 4mr*2kr LD b6,6*SIZE(B) MADD t33,t33,a2,b2 MADD t43,t43,a3,b2 daddu B,B,8*SIZE # 4nr*2kr LD a7,-1*SIZE(A) MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 LD b7,-1*SIZE(B) .L17: MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 LD a0,0*SIZE(A) MADD t12,t12,a4,b5 MADD t22,t22,a5,b5 LD a1,1*SIZE(A) MADD t31,t31,a6,b4 MADD t41,t41,a7,b4 LD b0,0*SIZE(B) MADD t32,t32,a6,b5 MADD t42,t42,a7,b5 LD b1,1*SIZE(B) FETCH $0,4*SIZE(PREB) MADD t13,t13,a4,b6 MADD t23,t23,a5,b6 LD a2,2*SIZE(A) FETCH $0,4*SIZE(PREA) MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 LD b2,2*SIZE(B) MADD t33,t33,a6,b6 MADD t43,t43,a7,b6 daddu PREA,PREA,8*SIZE LD a3,3*SIZE(A) MADD t34,t34,a6,b7 MADD t44,t44,a7,b7 daddu PREB,PREB,8*SIZE LD b3,3*SIZE(B) .L18: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L19 LD ALPHA,152($sp) # Get ALPHA FETCH $0,0(PREB) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,4*SIZE # 4mr*kr MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 daddu B,B,4*SIZE # 4nr*kr FETCH $0,0(PREA) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 daddu PREB,PREB,4*SIZE MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 daddu PREA,PREA,4*SIZE MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 MADD t33,t33,a2,b2 MADD t43,t43,a3,b2 MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 .L19: # Write Back to C #ifndef TRMMKERNEL LD c11,0(CO1) # GEMM write part LD c21,1*SIZE(CO1) # get 16 C LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) LD c12,0(CO2) MADD t11,c11,t11,ALPHA LD c22,1*SIZE(CO2) MADD t21,c21,t21,ALPHA LD c32,2*SIZE(CO2) MADD t31,c31,t31,ALPHA LD c42,3*SIZE(CO2) MADD t41,c41,t41,ALPHA LD c13,0(CO3) MADD t12,c12,t12,ALPHA LD c23,1*SIZE(CO3) MADD t22,c22,t22,ALPHA LD c33,2*SIZE(CO3) MADD t32,c32,t32,ALPHA LD c43,3*SIZE(CO3) MADD t42,c42,t42,ALPHA LD c14,0(CO4) MADD t13,c13,t13,ALPHA LD c24,1*SIZE(CO4) MADD t23,c23,t23,ALPHA LD c34,2*SIZE(CO4) MADD t33,c33,t33,ALPHA LD c44,3*SIZE(CO4) MADD t43,c43,t43,ALPHA ST t11,0(CO1) MADD t14,c14,t14,ALPHA ST t21,1*SIZE(CO1) MADD t24,c24,t24,ALPHA ST t31,2*SIZE(CO1) MADD t34,c34,t34,ALPHA ST t41,3*SIZE(CO1) MADD t44,c44,t44,ALPHA daddiu M,M,-1 # M-- ST t12,0(CO2) ST t22,1*SIZE(CO2) ST t32,2*SIZE(CO2) ST t42,3*SIZE(CO2) ST t13,0(CO3) ST t23,1*SIZE(CO3) ST t33,2*SIZE(CO3) ST t43,3*SIZE(CO3) FETCH $0,4*SIZE(CO1) FETCH $0,4*SIZE(CO2) FETCH $0,4*SIZE(CO3) FETCH $0,4*SIZE(CO4) FETCH $0,8*SIZE(CO1) FETCH $0,8*SIZE(CO2) FETCH $0,8*SIZE(CO3) FETCH $0,8*SIZE(CO4) ST t14,0(CO4) daddu CO1,CO1,4*SIZE # COi += 4 ST t24,1*SIZE(CO4) daddu CO2,CO2,4*SIZE ST t34,2*SIZE(CO4) daddu CO3,CO3,4*SIZE ST t44,3*SIZE(CO4) daddu PREB,BO,SPANB bnez M,.L10 daddu CO4,CO4,4*SIZE #else MUL t11, ALPHA, t11 # TRMM write back part MUL t21, ALPHA, t21 MUL t31, ALPHA, 
t31 MUL t41, ALPHA, t41 ST t11, 0 * SIZE(CO1) MUL t12, ALPHA, t12 ST t21, 1 * SIZE(CO1) MUL t22, ALPHA, t22 ST t31, 2 * SIZE(CO1) MUL t32, ALPHA, t32 ST t41, 3 * SIZE(CO1) MUL t42, ALPHA, t42 ST t12, 0 * SIZE(CO2) MUL t13, ALPHA, t13 ST t22, 1 * SIZE(CO2) MUL t23, ALPHA, t23 ST t32, 2 * SIZE(CO2) MUL t33, ALPHA, t33 ST t42, 3 * SIZE(CO2) MUL t43, ALPHA, t43 ST t13, 0 * SIZE(CO3) MUL t14, ALPHA, t14 ST t23, 1 * SIZE(CO3) MUL t24, ALPHA, t24 ST t33, 2 * SIZE(CO3) MUL t34, ALPHA, t34 ST t43, 3 * SIZE(CO3) MUL t44, ALPHA, t44 ST t14, 0 * SIZE(CO4) daddiu M,M,-1 # M-- ST t24, 1 * SIZE(CO4) ST t34, 2 * SIZE(CO4) ST t44, 3 * SIZE(CO4) daddiu CO1,CO1, 4 * SIZE daddiu CO2,CO2, 4 * SIZE daddiu CO3,CO3, 4 * SIZE daddiu CO4,CO4, 4 * SIZE FETCH $0,4*SIZE(CO1) FETCH $0,4*SIZE(CO2) FETCH $0,4*SIZE(CO3) FETCH $0,4*SIZE(CO4) FETCH $0,0(CO1) FETCH $0,0(CO2) FETCH $0,0(CO3) FETCH $0,0(CO4) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP,KCO,KK #ifdef LEFT daddiu TEMP,TEMP, -4 #else daddiu TEMP,TEMP, -4 #endif dsll K,TEMP,2 + BASE_SHIFT dsll TEMP,TEMP,2 + BASE_SHIFT daddu A,A,K # mov A to the end of panel Ai daddu B,B,TEMP # mov B to the end of panel Bj #endif #ifdef LEFT daddiu KK, KK,4 #endif bnez M,.L10 nop #endif .align 3 .L14_M2: andi M, MCO, 2 # nr=4,mr=2 beqz M,.L14_M1 nop .L20: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else dsll K,KK,1 + BASE_SHIFT # mr=2 dsll TEMP,KK,2 + BASE_SHIFT # nr=4 daddu A,A,K daddu B,BO,TEMP #endif LD a0,0*SIZE(A) MTC $0,t11 LD a1,1*SIZE(A) MOV t21,t11 LD b0,0*SIZE(B) MOV t12,t11 LD b1,1*SIZE(B) MOV t22,t11 LD b2,2*SIZE(B) MOV t13,t11 MOV t23,t11 LD b3,3*SIZE(B) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK #elif defined(LEFT) daddiu TEMP,KK,2 # left part,controlled by mr, mr=2 #else daddiu TEMP,KK,4 # right part,controlled by nr,nr=4 #endif dsra K,TEMP,2 MOV t14,t11 beqz K,.L25 MOV t24,t11 # clear 2*4=8 results registers #else move B,BO # Reset B LD a0,0*SIZE(A) MTC $0,t11 LD a1,1*SIZE(A) MOV t21,t11 LD b0,0*SIZE(B) MOV t12,t11 LD b1,1*SIZE(B) MOV t22,t11 dsra K,KCO,2 LD b2,2*SIZE(B) MOV t13,t11 MOV t23,t11 LD b3,3*SIZE(B) MOV t14,t11 beqz K,.L25 MOV t24,t11 #endif .L21: # nr=4,mr=2,kr=4 MADD t11,t11,a0,b0 LD a4,2*SIZE(A) MADD t21,t21,a1,b0 LD a5,3*SIZE(A) MADD t12,t12,a0,b1 LD b4,4*SIZE(B) MADD t22,t22,a1,b1 LD b5,5*SIZE(B) MADD t13,t13,a0,b2 LD b6,6*SIZE(B) MADD t23,t23,a1,b2 LD b7,7*SIZE(B) MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 MADD t11,t11,a4,b4 LD a2,4*SIZE(A) MADD t21,t21,a5,b4 LD a3,5*SIZE(A) MADD t12,t12,a4,b5 LD b0,8*SIZE(B) MADD t22,t22,a5,b5 LD b1,9*SIZE(B) MADD t13,t13,a4,b6 LD b2,10*SIZE(B) MADD t23,t23,a5,b6 LD b3,11*SIZE(B) MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 daddiu K,K,-1 MADD t11,t11,a2,b0 LD a6,6*SIZE(A) MADD t21,t21,a3,b0 LD a7,7*SIZE(A) MADD t12,t12,a2,b1 LD b4,12*SIZE(B) MADD t22,t22,a3,b1 LD b5,13*SIZE(B) MADD t13,t13,a2,b2 LD b6,14*SIZE(B) MADD t23,t23,a3,b2 LD b7,15*SIZE(B) MADD t14,t14,a2,b3 MADD t24,t24,a3,b3 daddu A,A,8*SIZE # 2mr*4kr daddu B,B,16*SIZE # 4nr*4kr MADD t11,t11,a6,b4 LD a0,0*SIZE(A) MADD t21,t21,a7,b4 LD a1,1*SIZE(A) MADD t12,t12,a6,b5 LD b0,0*SIZE(B) MADD t22,t22,a7,b5 LD b1,1*SIZE(B) MADD t13,t13,a6,b6 LD b2,2*SIZE(B) MADD t23,t23,a7,b6 LD b3,3*SIZE(B) MADD t14,t14,a6,b7 bnez K,.L21 MADD t24,t24,a7,b7 .L25: #ifndef TRMMKERNEL andi K,KCO,2 # kr=2 #else andi K,TEMP,2 #endif beqz K,.L28 nop .L26: MADD t11,t11,a0,b0 LD a4,2*SIZE(A) MADD t21,t21,a1,b0 LD 
a5,3*SIZE(A) MADD t12,t12,a0,b1 LD b4,4*SIZE(B) MADD t22,t22,a1,b1 LD b5,5*SIZE(B) MADD t13,t13,a0,b2 LD b6,6*SIZE(B) MADD t23,t23,a1,b2 LD b7,7*SIZE(B) MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 daddu A,A,4*SIZE # 2mr*2kr daddu B,B,8*SIZE # 4nr*2kr .L27: MADD t11,t11,a4,b4 LD a0,0*SIZE(A) MADD t21,t21,a5,b4 LD a1,1*SIZE(A) MADD t12,t12,a4,b5 LD b0,0*SIZE(B) MADD t22,t22,a5,b5 LD b1,1*SIZE(B) MADD t13,t13,a4,b6 LD b2,2*SIZE(B) MADD t23,t23,a5,b6 LD b3,3*SIZE(B) MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 .L28: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L29 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # 2mr*kr daddu B,B,4*SIZE # 4nr*kr MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 .L29: # Write Back to C #ifndef TRMMKERNEL LD c11,0(CO1) # GEMM write back part LD c21,1*SIZE(CO1) LD c12,0(CO2) LD c22,1*SIZE(CO2) LD c13,0(CO3) MADD t11,c11,t11,ALPHA LD c23,1*SIZE(CO3) MADD t21,c21,t21,ALPHA LD c14,0(CO4) MADD t12,c12,t12,ALPHA LD c24,1*SIZE(CO4) MADD t22,c22,t22,ALPHA ST t11,0(CO1) MADD t13,c13,t13,ALPHA ST t21,1*SIZE(CO1) MADD t23,c23,t23,ALPHA ST t12,0(CO2) MADD t14,c14,t14,ALPHA ST t22,1*SIZE(CO2) MADD t24,c24,t24,ALPHA ST t13,0(CO3) daddu CO1,CO1,2*SIZE # COi += 2 ST t23,1*SIZE(CO3) daddu CO2,CO2,2*SIZE ST t14,0(CO4) daddu CO3,CO3,2*SIZE ST t24,1*SIZE(CO4) daddu CO4,CO4,2*SIZE FETCH $0,0(CO1) FETCH $0,0(CO2) FETCH $0,0(CO3) FETCH $0,0(CO4) #else MUL t11, ALPHA, t11 # TRMM write back part MUL t21, ALPHA, t21 ST t11, 0 * SIZE(CO1) MUL t12, ALPHA, t12 ST t21, 1 * SIZE(CO1) MUL t22, ALPHA, t22 ST t12, 0 * SIZE(CO2) MUL t13, ALPHA, t13 ST t22, 1 * SIZE(CO2) MUL t23, ALPHA, t23 ST t13, 0 * SIZE(CO3) MUL t14, ALPHA, t14 ST t23, 1 * SIZE(CO3) MUL t24, ALPHA, t24 ST t14, 0 * SIZE(CO4) ST t24, 1 * SIZE(CO4) daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE daddiu CO3,CO3, 2 * SIZE daddiu CO4,CO4, 2 * SIZE FETCH $0,0(CO1) FETCH $0,0(CO2) FETCH $0,0(CO3) FETCH $0,0(CO4) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP,KCO,KK #ifdef LEFT daddiu TEMP,TEMP,-2 #else daddiu TEMP,TEMP,-4 #endif dsll K,TEMP,1 + BASE_SHIFT dsll TEMP,TEMP,2 + BASE_SHIFT daddu A,A,K # move A to next panel Ai daddu B,B,TEMP # move B to next panel Bj #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif .align 3 .L14_M1: andi M,MCO,1 # mr=1 beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj nop .L30: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else dsll K,KK, BASE_SHIFT dsll TEMP,KK,2 + BASE_SHIFT daddu A,A,K daddu B,BO,TEMP #endif LD a0, 0 * SIZE(A) # a0 MTC $0,t11 LD b0,0*SIZE(B) MOV t12,t11 LD b1,1*SIZE(B) MOV t13,t11 LD b2,2*SIZE(B) MOV t14,t11 LD b3,3*SIZE(B) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 4 #endif dsra K,TEMP, 2 nop beqz K,.L35 nop #else move B,BO # Reset B, GEMM part dsra K,KCO,2 # K=KCO/2 LD a0, 0 * SIZE(A) # a0 MTC $0,t11 LD b0,0*SIZE(B) MOV t12,t11 LD b1,1*SIZE(B) MOV t13,t11 LD b2,2*SIZE(B) MOV t14,t11 beqz K,.L35 LD b3,3*SIZE(B) #endif .L31: # nr=4,mr=1,kr=4 LD a1, 1*SIZE(A) # load a1 MADD t11,t11,a0,b0 LD b4,4*SIZE(B) LD b5,5*SIZE(B) MADD t12,t12,a0,b1 LD b6,6*SIZE(B) LD b7,7*SIZE(B) MADD t13,t13,a0,b2 MADD t14,t14,a0,b3 LD a2, 2*SIZE(A) # a2 MADD t11,t11,a1,b4 LD b0,8*SIZE(B) LD b1,9*SIZE(B) MADD t12,t12,a1,b5 LD b2,10*SIZE(B) LD b3,11*SIZE(B) 
MADD t13,t13,a1,b6 MADD t14,t14,a1,b7 LD a3, 3*SIZE(A) # a3 MADD t11,t11,a2,b0 daddiu K,K,-1 LD b4,12*SIZE(B) LD b5,13*SIZE(B) MADD t12,t12,a2,b1 daddu A,A,4*SIZE # 1mr*4kr LD b6,14*SIZE(B) LD b7,15*SIZE(B) MADD t13,t13,a2,b2 MADD t14,t14,a2,b3 LD a0, 0*SIZE(A) # a0 daddu B,B,16*SIZE # 4nr*4kr MADD t11,t11,a3,b4 LD b0,0*SIZE(B) MADD t12,t12,a3,b5 LD b1,1*SIZE(B) MADD t13,t13,a3,b6 LD b2,2*SIZE(B) MADD t14,t14,a3,b7 bnez K,.L31 LD b3,3*SIZE(B) .L35: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L38 nop .L36: LD a1,1*SIZE(A) # load a1 MADD t11,t11,a0,b0 LD b4,4*SIZE(B) LD b5,5*SIZE(B) MADD t12,t12,a0,b1 daddu A,A,2*SIZE # mr*2kr LD b6,6*SIZE(B) MADD t13,t13,a0,b2 LD b7,7*SIZE(B) MADD t14,t14,a0,b3 daddu B,B,8*SIZE # 4nr*2kr .L37: LD a0,0(A) MADD t11,t11,a1,b4 LD b0,0*SIZE(B) LD b1,1*SIZE(B) MADD t12,t12,a1,b5 LD b2,2*SIZE(B) LD b3,3*SIZE(B) MADD t13,t13,a1,b6 MADD t14,t14,a1,b7 .L38: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L39 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t12,t12,a0,b1 daddu A,A,1*SIZE daddu B,B,4*SIZE MADD t13,t13,a0,b2 MADD t14,t14,a0,b3 .L39: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) LD c12,0(CO2) LD c13,0(CO3) LD c14,0(CO4) MADD t11,c11,t11,ALPHA MADD t12,c12,t12,ALPHA MADD t13,c13,t13,ALPHA MADD t14,c14,t14,ALPHA ST t11,0(CO1) ST t12,0(CO2) ST t13,0(CO3) ST t14,0(CO4) #else MUL t11, ALPHA, t11 MUL t12, ALPHA, t12 MUL t13, ALPHA, t13 MUL t14, ALPHA, t14 ST t11, 0 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t13, 0 * SIZE(CO3) ST t14, 0 * SIZE(CO4) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -4 #endif dsll K,TEMP, BASE_SHIFT dsll TEMP,TEMP, 2 + BASE_SHIFT daddu A,A,K daddu B,B,TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif #endif .align 3 .L0_N4_Loop: # mc finished daddiu N,N,-1 # N-- #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK,4 #endif bnez N,.L0_N4_Lb move BO,B # Set BO point to next panel Bj .align 5 .L0_N2: andi N,NCO,2 # nr = 2 beqz N,.L0_N1 nop .L0_N2_Lb: move CO1,C daddu CO2,C,LDC dsra M,MCO,2 move A,AO # Reset A daddu PREA,AO,SPANA daddu C,CO2,LDC #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif beqz M,.L12_M2 nop .L40: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else dsll K,KK, 2 + BASE_SHIFT dsll TEMP, KK,1 + BASE_SHIFT daddu A,A,K daddu B,BO,TEMP #endif LD a0,0*SIZE(A) MTC $0,t11 # gemm part LD a1,1*SIZE(A) MOV t21,t11 LD b0,0*SIZE(B) MOV t31,t11 LD b1,1*SIZE(B) MOV t41,t11 LD a2,2*SIZE(A) LD a3,3*SIZE(A) MOV t12,t11 MOV t22,t11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK #elif defined(LEFT) daddiu TEMP, KK, 4 #else daddiu TEMP, KK, 2 #endif dsra K,TEMP,2 MOV t32,t11 beqz K,.L45 MOV t42,t11 #else move B,BO # Reset B LD a0,0*SIZE(A) MTC $0,t11 # gemm part LD a1,1*SIZE(A) MOV t21,t11 LD b0,0*SIZE(B) MOV t31,t11 LD b1,1*SIZE(B) MOV t41,t11 LD a2,2*SIZE(A) dsra K,KCO,2 # K=KCO/2 LD a3,3*SIZE(A) MOV t12,t11 MOV t22,t11 MOV t32,t11 beqz K,.L45 MOV t42,t11 #endif .L41: # nr=2,mr=kr=4 MADD t11,t11,a0,b0 LD a4,4*SIZE(A) MADD t21,t21,a1,b0 LD a5,5*SIZE(A) MADD t12,t12,a0,b1 LD b4,2*SIZE(B) MADD t22,t22,a1,b1 LD b5,3*SIZE(B) MADD t31,t31,a2,b0 LD a6,6*SIZE(A) MADD t41,t41,a3,b0 LD a7,7*SIZE(A) FETCH $0,(PREA) MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 .L42: MADD t11,t11,a4,b4 LD a0,8*SIZE(A) MADD t21,t21,a5,b4 LD a1,9*SIZE(A) MADD 
t12,t12,a4,b5 LD b2,4*SIZE(B) MADD t22,t22,a5,b5 LD b3,5*SIZE(B) MADD t31,t31,a6,b4 LD a2,10*SIZE(A) MADD t41,t41,a7,b4 LD a3,11*SIZE(A) FETCH $0,4*SIZE(PREA) MADD t32,t32,a6,b5 MADD t42,t42,a7,b5 .L43: MADD t11,t11,a0,b2 LD a4,12*SIZE(A) MADD t21,t21,a1,b2 LD a5,13*SIZE(A) MADD t12,t12,a0,b3 LD b6,6*SIZE(B) MADD t22,t22,a1,b3 LD b7,7*SIZE(B) MADD t31,t31,a2,b2 LD a6,14*SIZE(A) MADD t41,t41,a3,b2 LD a7,15*SIZE(A) FETCH $0,8*SIZE(PREA) MADD t32,t32,a2,b3 MADD t42,t42,a3,b3 daddu A,A,16*SIZE # 4mr*4kr daddu B,B,8*SIZE # 2nr*4kr .L44: MADD t11,t11,a4,b6 LD a0,0*SIZE(A) MADD t21,t21,a5,b6 LD a1,1*SIZE(A) MADD t12,t12,a4,b7 LD b0,0*SIZE(B) MADD t22,t22,a5,b7 LD b1,1*SIZE(B) daddiu K,K,-1 daddu PREA,PREA,16*SIZE MADD t31,t31,a6,b6 LD a2,2*SIZE(A) MADD t41,t41,a7,b6 LD a3,3*SIZE(A) FETCH $0,-4*SIZE(PREA) MADD t32,t32,a6,b7 bnez K,.L41 MADD t42,t42,a7,b7 .L45: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L48 nop .L46: MADD t11,t11,a0,b0 LD a4,4*SIZE(A) MADD t21,t21,a1,b0 LD a5,5*SIZE(A) MADD t12,t12,a0,b1 LD b4,2*SIZE(B) MADD t22,t22,a1,b1 LD b5,3*SIZE(B) MADD t31,t31,a2,b0 LD a6,6*SIZE(A) MADD t41,t41,a3,b0 LD a7,7*SIZE(A) FETCH $0,0(PREA) MADD t32,t32,a2,b1 daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32 MADD t42,t42,a3,b1 daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE .L47: MADD t11,t11,a4,b4 LD a0,0*SIZE(A) MADD t21,t21,a5,b4 LD a1,1*SIZE(A) MADD t12,t12,a4,b5 LD b0,0*SIZE(B) MADD t22,t22,a5,b5 LD b1,1*SIZE(B) MADD t31,t31,a6,b4 LD a2,2*SIZE(A) MADD t41,t41,a7,b4 LD a3,3*SIZE(A) FETCH $0,4*SIZE(PREA) MADD t32,t32,a6,b5 MADD t42,t42,a7,b5 daddu PREA,PREA,8*SIZE .L48: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L49 LD ALPHA,152($sp) # Get ALPHA FETCH $0,0(PREA) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 daddu B,B,2*SIZE daddu PREA,PREA,4*SIZE MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 .L49: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # gemm write back part Fetch 16 C LD c21,1*SIZE(CO1) LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) LD c12,0(CO2) MADD t11,c11,t11,ALPHA LD c22,1*SIZE(CO2) MADD t21,c21,t21,ALPHA LD c32,2*SIZE(CO2) MADD t31,c31,t31,ALPHA LD c42,3*SIZE(CO2) MADD t41,c41,t41,ALPHA ST t11,0(CO1) MADD t12,c12,t12,ALPHA ST t21,1*SIZE(CO1) MADD t22,c22,t22,ALPHA ST t31,2*SIZE(CO1) MADD t32,c32,t32,ALPHA ST t41,3*SIZE(CO1) MADD t42,c42,t42,ALPHA daddiu M,M,-1 ST t12,0(CO2) ST t22,1*SIZE(CO2) ST t32,2*SIZE(CO2) ST t42,3*SIZE(CO2) FETCH $0,4*SIZE(CO1) FETCH $0,4*SIZE(CO2) FETCH $0,8*SIZE(CO1) FETCH $0,8*SIZE(CO2) daddu CO1,CO1,4*SIZE bnez M,.L40 daddu CO2,CO2,4*SIZE #else MUL t11, ALPHA, t11 MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 MUL t41, ALPHA, t41 MUL t12, ALPHA, t12 ST t11, 0 * SIZE(CO1) MUL t22, ALPHA, t22 ST t21, 1 * SIZE(CO1) MUL t32, ALPHA, t32 ST t31, 2 * SIZE(CO1) MUL t42, ALPHA, t42 ST t41, 3 * SIZE(CO1) ST t12, 0 * SIZE(CO2) daddiu M,M,-1 ST t22, 1 * SIZE(CO2) ST t32, 2 * SIZE(CO2) ST t42, 3 * SIZE(CO2) daddiu CO1,CO1, 4*SIZE daddiu CO2,CO2, 4*SIZE FETCH $0,0(CO1) FETCH $0,0(CO2) FETCH $0,4(CO1) FETCH $0,4(CO2) #if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -4 #else daddiu TEMP, TEMP, -2 #endif dsll K,TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu A,A,K daddu B,B,TEMP #endif #ifdef LEFT daddiu KK, KK, 4 #endif bnez M,.L40 nop #endif .align 3 .L12_M2: andi M,MCO,2 # mr = 2 beqz M,.L12_M1 nop .L50: #if defined(TRMMKERNEL) #if 
(defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO #else dsll K, KK, 1 + BASE_SHIFT #mr=2 dsll TEMP, KK, 1 + BASE_SHIFT #nr=2 daddu A, A, K daddu B, BO, TEMP #endif LD a0,0*SIZE(A) LD a1,1*SIZE(A) MTC $0,t11 LD b0,0*SIZE(B) MOV t21,t11 LD b1,1*SIZE(B) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 2 #else daddiu TEMP, KK, 2 #endif dsra K,TEMP,2 MOV t12,t11 beqz K,.L55 MOV t22,t11 #else move B,BO LD a0,0*SIZE(A) dsra K,KCO,2 # K=KCO/2 LD a1,1*SIZE(A) MTC $0,t11 LD b0,0*SIZE(B) MOV t21,t11 LD b1,1*SIZE(B) MOV t12,t11 beqz K,.L55 MOV t22,t11 #endif .L51: # nr=2 mr=2,kr=4 MADD t11,t11,a0,b0 LD a4,2*SIZE(A) MADD t21,t21,a1,b0 LD b4,2*SIZE(B) MADD t12,t12,a0,b1 LD a5,3*SIZE(A) MADD t22,t22,a1,b1 LD b5,3*SIZE(B) MADD t11,t11,a4,b4 LD a2,4*SIZE(A) MADD t21,t21,a5,b4 LD b2,4*SIZE(B) MADD t12,t12,a4,b5 LD a3,5*SIZE(A) MADD t22,t22,a5,b5 daddiu K,K,-1 LD b3,5*SIZE(B) MADD t11,t11,a2,b2 LD a6,6*SIZE(A) MADD t21,t21,a3,b2 daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE LD b6,6*SIZE(B) MADD t12,t12,a2,b3 daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=16*SIZE LD a7,-1*SIZE(A) MADD t22,t22,a3,b3 LD b7,-1*SIZE(B) MADD t11,t11,a6,b6 LD a0,0*SIZE(A) MADD t21,t21,a7,b6 LD b0,0*SIZE(B) MADD t12,t12,a6,b7 LD a1,1*SIZE(A) MADD t22,t22,a7,b7 bnez K,.L51 LD b1,1*SIZE(B) .L55: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L58 nop .L56: MADD t11,t11,a0,b0 LD a4,2*SIZE(A) MADD t21,t21,a1,b0 daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 LD b4,2*SIZE(B) MADD t12,t12,a0,b1 daddu B,B,4*SIZE # 2nr*2kr LD a5,-1*SIZE(A) MADD t22,t22,a1,b1 LD b5,-1*SIZE(B) .L57: MADD t11,t11,a4,b4 LD a0,0*SIZE(A) MADD t21,t21,a5,b4 LD b0,0*SIZE(B) MADD t12,t12,a4,b5 LD a1,1*SIZE(A) MADD t22,t22,a5,b5 LD b1,1*SIZE(B) .L58: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP, 1 #endif beqz K,.L59 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 daddu B,B,2*SIZE # 2nr*kr MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 .L59: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # write gemm part back Fetch 16 C LD c21,1*SIZE(CO1) LD c12,0(CO2) LD c22,1*SIZE(CO2) MADD t11,c11,t11,ALPHA MADD t21,c21,t21,ALPHA MADD t12,c12,t12,ALPHA MADD t22,c22,t22,ALPHA ST t11,0(CO1) ST t21,1*SIZE(CO1) ST t12,0(CO2) ST t22,1*SIZE(CO2) daddu CO1,CO1,2*SIZE daddu CO2,CO2,2*SIZE FETCH $0,0(CO1) FETCH $0,0(CO2) #else daddiu M, M, -1 daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE MUL t11, ALPHA, t11 MUL t21, ALPHA, t21 MUL t12, ALPHA, t12 MUL t22, ALPHA, t22 ST t11, -2 * SIZE(CO1) ST t21, -1 * SIZE(CO1) ST t12, -2 * SIZE(CO2) ST t22, -1 * SIZE(CO2) FETCH $0,0(CO1) FETCH $0,0(CO2) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -2 #endif dsll K, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu A, A, K daddu B, B, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif .align 3 .L12_M1: andi M,MCO,1 # mr = 1 beqz M,.L0_N2_Loop nop .L60: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else dsll K, KK, 0 + BASE_SHIFT dsll TEMP, KK, 1 + BASE_SHIFT daddu A, A, K daddu B, BO, TEMP #endif LD a0,0*SIZE(A) MTC $0,t11 MOV t21,t11 LD b0,0*SIZE(B) MOV t12,t11 LD b1,1*SIZE(B) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu 
TEMP, KK, 1 #else daddiu TEMP, KK, 2 #endif dsra K,TEMP,2 MOV t22,t11 beqz K,.L65 nop #else dsra K,KCO,2 move B,BO # Reset B LD a0,0*SIZE(A) MTC $0,t11 MOV t21,t11 LD b0,0*SIZE(B) MOV t12,t11 LD b1,1*SIZE(B) beqz K,.L65 MOV t22,t11 #endif .L61: # nr=2,mr=1,kr=4 LD a4, 1*SIZE(A) # a2 LD b4, 2*SIZE(B) MADD t11,t11,a0,b0 LD b5,3*SIZE(B) MADD t12,t12,a0,b1 LD a2, 2*SIZE(A) # a3 LD b2,4*SIZE(B) MADD t11,t11,a4,b4 LD b3,5*SIZE(B) MADD t12,t12,a4,b5 LD a6, 3*SIZE(A) # a4 daddiu K,K,-1 LD b6,6*SIZE(B) MADD t11,t11,a2,b2 LD b7,7*SIZE(B) MADD t12,t12,a2,b3 daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 LD a0, 0*SIZE(A) daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE LD b0,0*SIZE(B) MADD t11,t11,a6,b6 LD b1,1*SIZE(B) bnez K,.L61 MADD t12,t12,a6,b7 .L65: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L68 nop .L66: LD a4, 1*SIZE(A) # a1 MADD t11,t11,a0,b0 LD b4,2*SIZE(B) daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 LD b5,3*SIZE(B) MADD t12,t12,a0,b1 daddu B,B,4*SIZE .L67: LD a0,0(A) # a0 LD b0,0*SIZE(B) MADD t11,t11,a4,b4 LD b1,1*SIZE(B) MADD t12,t12,a4,b5 .L68: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L69 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t12,t12,a0,b1 daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16 daddu B,B,2*SIZE .L69: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C LD c12,0(CO2) MADD t11,c11,t11,ALPHA MADD t12,c12,t12,ALPHA ST t11,0(CO1) ST t12,0(CO2) daddu CO1,CO1,1*SIZE daddu CO2,CO2,1*SIZE #else MUL t11, ALPHA, t11 MUL t12, ALPHA, t12 ST t11, 0 * SIZE(CO1) ST t12, 0 * SIZE(CO2) daddu CO1,CO1,1*SIZE daddu CO2,CO2,1*SIZE #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -2 #endif dsll K, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu A, A, K daddu B, B, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif #endif .L0_N2_Loop: #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK, 2 #endif move BO, B .align 5 .L0_N1: andi N,NCO,1 # nr = 1 beqz N,.L999 nop move CO1,C dsra M,MCO,2 move A,AO # Reset A daddu PREA,AO,SPANA #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif beqz M,.L11_M2 daddu C,CO1,LDC .L70: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO # Reset B #else dsll K, KK, 2 + BASE_SHIFT dsll TEMP, KK, 0 + BASE_SHIFT daddu A, A, K daddu B, BO, TEMP #endif LD b0, 0*SIZE(B) MTC $0,t11 LD a0,0*SIZE(A) MOV t21,t11 LD a1,1*SIZE(A) MOV t31,t11 LD a2,2*SIZE(A) MOV t41,t11 LD a3,3*SIZE(A) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 4 #else daddiu TEMP, KK, 1 #endif dsra K,TEMP,2 beqz K,.L75 nop #else move B, BO # Reset B dsra K,KCO,2 LD b0, 0*SIZE(B) MTC $0,t11 LD a0,0*SIZE(A) MOV t21,t11 LD a1,1*SIZE(A) MOV t31,t11 LD a2,2*SIZE(A) MOV t41,t11 beqz K,.L75 LD a3,3*SIZE(A) #endif .L71: # nr=1,mr=kr=4 LD b4, 1*SIZE(B) # b1 MADD t11,t11,a0,b0 LD a4, 4*SIZE(A) MADD t21,t21,a1,b0 LD a5, 5*SIZE(A) FETCH $0,(PREA) LD a6,6*SIZE(A) MADD t31,t31,a2,b0 LD a7,7*SIZE(A) MADD t41,t41,a3,b0 .L72: LD b2, 2*SIZE(B) # b2 MADD t11,t11,a4,b4 LD a0,8*SIZE(A) MADD t21,t21,a5,b4 LD a1,9*SIZE(A) FETCH $0,4*SIZE(PREA) LD a2,10*SIZE(A) MADD t31,t31,a6,b4 LD a3,11*SIZE(A) MADD t41,t41,a7,b4 .L73: LD b6, 3*SIZE(B) MADD t11,t11,a0,b2 LD a4,12*SIZE(A) daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 LD a5,13*SIZE(A) MADD t21,t21,a1,b2 LD a6,14*SIZE(A) FETCH $0,8*SIZE(PREA) 
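# Annotation on the FETCH pattern in these unrolled loops (an interpretation
# of the code, not a documented contract): FETCH is #defined to "ld" at the
# top of this file, and a load into $0 is discarded, so "FETCH $0, off(PREA)"
# effectively acts as a software prefetch.  PREA and PREB start SPANA/SPANB
# bytes ahead of the packed A and B panels and are advanced in step with A
# and B, pulling upcoming panel data into the cache before it is multiplied.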
MADD t31,t31,a2,b2 LD a7,15*SIZE(A) MADD t41,t41,a3,b2 daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE .L74: LD b0, 0*SIZE(B) MADD t11,t11,a4,b6 LD a0,0*SIZE(A) daddu PREA,PREA,16*SIZE LD a1,1*SIZE(A) MADD t21,t21,a5,b6 LD a2,2*SIZE(A) daddiu K,K,-1 MADD t31,t31,a6,b6 LD a3,3*SIZE(A) MADD t41,t41,a7,b6 bnez K,.L71 FETCH $0,-32(PREA) .L75: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L78 nop .L76: LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 LD a4,4*SIZE(A) daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32 LD a5,5*SIZE(A) MADD t21,t21,a1,b0 FETCH $0,0(PREA) LD a6,6*SIZE(A) MADD t31,t31,a2,b0 LD a7,7*SIZE(A) MADD t41,t41,a3,b0 daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE .L77: LD b0,0(B) MADD t11,t11,a4,b4 LD a0,0*SIZE(A) MADD t21,t21,a5,b4 FETCH $0,4*SIZE(PREA) LD a1,1*SIZE(A) MADD t31,t31,a6,b4 LD a2,2*SIZE(A) MADD t41,t41,a7,b4 LD a3,3*SIZE(A) daddu PREA,PREA,8*SIZE .L78: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L79 LD ALPHA,152($sp) # Get ALPHA FETCH $0,0(PREA) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 daddu B,B,1*SIZE daddu PREA,PREA,4*SIZE .L79: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C LD c21,1*SIZE(CO1) LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) MADD t11,c11,t11,ALPHA MADD t21,c21,t21,ALPHA MADD t31,c31,t31,ALPHA MADD t41,c41,t41,ALPHA ST t11,0(CO1) ST t21,1*SIZE(CO1) ST t31,2*SIZE(CO1) ST t41,3*SIZE(CO1) daddiu M,M,-1 # M-- FETCH $0,4*SIZE(CO1) FETCH $0,8*SIZE(CO1) bnez M,.L70 # M!=0 daddu CO1,CO1,4*SIZE # COx += 4*8Byte #else daddiu M,M,-1 # M-- MUL t11, ALPHA, t11 MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 MUL t41, ALPHA, t41 ST t11,0(CO1) ST t21,1*SIZE(CO1) ST t31,2*SIZE(CO1) ST t41,3*SIZE(CO1) FETCH $0,4*SIZE(CO1) FETCH $0,8*SIZE(CO1) daddu CO1,CO1,4*SIZE #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -4 #else daddiu TEMP, TEMP, -1 #endif dsll K, TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, 0 + BASE_SHIFT daddu A, A,K daddu B, B, TEMP #endif #ifdef LEFT daddiu KK, KK, 4 #endif bnez M,.L70 nop #endif .align 3 .L11_M2: andi M,MCO,2 # mr = 2 beqz M,.L11_M1 nop .L80: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO #else dsll K, KK, 1 + BASE_SHIFT dsll TEMP, KK, 0 + BASE_SHIFT daddu A, A, K daddu B, BO, TEMP #endif LD b0, 0*SIZE(B) MTC $0,t11 MOV t21,t11 LD a0,0*SIZE(A) LD a1,1*SIZE(A) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 2 #else daddiu TEMP, KK, 1 #endif dsra K,TEMP,2 # K=KCO/2 beqz K,.L85 nop #else move B, BO dsra K,KCO,2 LD b0, 0*SIZE(B) MTC $0,t11 MOV t21,t11 LD a0,0*SIZE(A) beqz K,.L85 LD a1,1*SIZE(A) #endif .L81: # nr=1,mr=2,kr=4 LD b4, 1*SIZE(B) LD a4,2*SIZE(A) MADD t11,t11,a0,b0 LD a5,3*SIZE(A) MADD t21,t21,a1,b0 LD b2, 2*SIZE(B) LD a2,4*SIZE(A) MADD t11,t11,a4,b4 LD a3,5*SIZE(A) MADD t21,t21,a5,b4 LD b6, 3*SIZE(B) LD a6,6*SIZE(A) MADD t11,t11,a2,b2 LD a7,7*SIZE(A) MADD t21,t21,a3,b2 daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 LD b0, 0*SIZE(B) daddiu K,K,-1 LD a0,0*SIZE(A) MADD t11,t11,a6,b6 LD a1,1*SIZE(A) bnez K,.L81 MADD t21,t21,a7,b6 .L85: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L88 nop .L86: LD b4, 1*SIZE(B) LD a4,2*SIZE(A) MADD t11,t11,a0,b0 LD a5,3*SIZE(A) MADD t21,t21,a1,b0 daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 
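# Note on the "*8Byte" figures in the comments around here: they appear to be
# carried over from the double-precision variant of this kernel.  In this
# single-precision file SIZE is 4 bytes, so the increment above is
# 2(mr)*2(kr)*SIZE = 16 bytes.  The code itself is written in SIZE units and
# is unaffected; only the byte totals quoted in the comments are off.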
daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 LD b0,0(B) LD a0,0*SIZE(A) MADD t11,t11,a4,b4 LD a1,1*SIZE(A) MADD t21,t21,a5,b4 .L88: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L89 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 daddu B,B,1*SIZE .L89: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C LD c21,1*SIZE(CO1) MADD t11,c11,t11,ALPHA MADD t21,c21,t21,ALPHA ST t11,0(CO1) ST t21,1*SIZE(CO1) FETCH $0,2*SIZE(CO1) daddu CO1,CO1,2*SIZE # COx += 2*8Byte #else daddu CO1,CO1,2*SIZE # COx += 2*8Byte MUL t11, ALPHA, t11 MUL t21, ALPHA, t21 FETCH $0,0(CO1) ST t11, -2 * SIZE(CO1) ST t21, -1 * SIZE(CO1) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -1 #endif dsll K, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 0 + BASE_SHIFT daddu A, A, K daddu B, B, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif .align 3 .L11_M1: andi M,MCO,1 # mr = 1 beqz M,.L999 nop .L90: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO #else dsll K, KK, 0 + BASE_SHIFT dsll TEMP, KK, 0 + BASE_SHIFT daddu A, A, K daddu B, BO, TEMP #endif LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) MTC $0,t11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 1 #endif dsra K, TEMP, 2 beqz K,.L95 nop #else move B, BO LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) dsra K,KCO,2 beqz K,.L95 MTC $0,t11 #endif .L91: # nr=mr=1,kr=4 LD a4, 1*SIZE(A) LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 LD a2, 2*SIZE(A) LD b2, 2*SIZE(B) MADD t11,t11,a4,b4 LD a6, 3*SIZE(A) LD b6, 3*SIZE(B) MADD t11,t11,a2,b2 daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) MADD t11,t11,a6,b6 daddiu K,K,-1 bnez K,.L91 nop .L95: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L98 nop .L96: LD a4, 1*SIZE(A) LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=32 LD b0,0(B) LD a0,0(A) MADD t11,t11,a4,b4 .L98: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L99 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 .L99: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C MADD t11,c11,t11,ALPHA ST t11,0(CO1) #else MUL t11, ALPHA, t11 ST t11, 0 * SIZE(CO1) #endif .L999: # End ld $16, 0($sp) ld $17, 8($sp) ld $18, 16($sp) ld $19, 24($sp) ld $20, 32($sp) ld $21, 40($sp) ld $22, 48($sp) LD $f24, 56($sp) LD $f25, 64($sp) LD $f26, 72($sp) LD $f27, 80($sp) LD $f28, 88($sp) ld $23, 96($sp) ld $24, 104($sp) ld $25, 112($sp) LD $f20,120($sp) LD $f21,128($sp) LD $f22,136($sp) LD $f23,144($sp) j $31 daddiu $sp, $sp, 160 EPILOGUE OpenBLAS-0.2.20/kernel/mips64/sgemm_kernel_loongson3b_4x4.S000066400000000000000000001176461313527062700232230ustar00rootroot00000000000000#define REALNAME ASMNAME #define ASSEMBLER #include "common.h" #define FETCH ld #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define M $4 #define N $5 #define K $6 #define A $8 #define B $9 #define C $10 #define LDC $11 #define AO $12 #define BO $13 #define CO1 $14 #define CO2 $15 #define CO3 $16 #define CO4 $17 #define KCO $18 #define MCO $19 #define NCO $20 
#define SPANB $21 #define PREB $23 #define PREA $24 #define SPANA $25 #define ALPHA $f15 #if defined(TRMMKERNEL) #define OFFSET $2 #define KK $3 #define TEMP $7 #endif #define R8 8 #define R9 9 #define R14 14 #define R15 15 #define R16 16 #define R17 17 #define t11 $f30 #define t21 $f31 #define t31 $f28 #define t41 $f29 #define t12 $f26 #define t22 $f27 #define t32 $f24 #define t42 $f25 #define t13 $f22 #define t23 $f23 #define t33 $f20 #define t43 $f21 #define t14 $f18 #define t24 $f19 #define t34 $f16 #define t44 $f17 #define c11 $f0 #define c21 $f1 #define c31 $f2 #define c41 $f3 #define c12 $f4 #define c22 $f5 #define c32 $f6 #define c42 $f7 #define c13 $f8 #define c23 $f9 #define c33 $f10 #define c43 $f11 #define c14 $f12 #define c24 $f13 #define c34 $f14 #define c44 $f0 #define a0 $f0 #define a1 $f1 #define a2 $f2 #define a3 $f3 #define a4 $f4 #define a5 $f5 #define a6 $f6 #define a7 $f7 #define b0 $f8 #define b1 $f9 #define b2 $f10 #define b3 $f11 #define b4 $f12 #define b5 $f13 #define b6 $f14 #define b7 $f15 #define F31 31 #define F30 30 #define F29 29 #define F28 28 #define F27 27 #define F26 26 #define F25 25 #define F24 24 #define F23 23 #define F22 22 #define F21 21 #define F20 20 #define F19 19 #define F18 18 #define F17 17 #define F16 16 #define F15 15 #define F14 14 #define F13 13 #define F12 12 #define F11 11 #define F10 10 #define F9 9 #define F8 8 #define F7 7 #define F6 6 #define F5 5 #define F4 4 #define F3 3 #define F2 2 #define F1 1 #define F0 0 PROLOGUE daddiu $sp, $sp, -160 sd $16, 0($sp) sd $17, 8($sp) sd $18, 16($sp) sd $19, 24($sp) sd $20, 32($sp) sd $21, 40($sp) sd $22, 48($sp) ST $f24, 56($sp) ST $f25, 64($sp) ST $f26, 72($sp) ST $f27, 80($sp) ST $f28, 88($sp) sd $23, 96($sp) sd $24, 104($sp) sd $25, 112($sp) ST $f20,120($sp) ST $f21,128($sp) ST $f22,136($sp) ST $f23,144($sp) .align 5 .L0_N4: # Loop N ST ALPHA,152($sp) # Backup ALPHA move MCO,M # Backup M move NCO,N # Backup N move KCO,K # Backup K move AO,A # Backup A_addr dsra N,NCO,2 # N=NCO/2 dsll LDC,LDC,BASE_SHIFT # LDC*8Byte dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5 #if defined(TRMMKERNEL) LDARG OFFSET,160($sp) # OFFSET is relate to the data part #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg KK,OFFSET #endif move BO,B # Backup B_addr beq N,$0,.L0_N2 # N=0,NCO<4 dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte .L0_N4_Lb: # mr=4,nr=4 move CO1,C dsra M,MCO,2 # M=MCO/2 move A,AO # Reset A daddu CO2,C,LDC daddu PREB,BO,SPANB # PreB point next panelB daddu CO3,CO2,LDC daddu PREA,AO,SPANA daddu CO4,CO3,LDC #if defined(TRMMKERNEL) && defined(LEFT) move KK,OFFSET #endif beqz M,.L14_M2 daddu C,CO4,LDC # move C to next panel Cj .L10: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # (SIDE=L and UPLO=L) or (SIZE=R and UPLO=U) #else dsll K,KK,2 + BASE_SHIFT # KK is the length that needs to span to the data part dsll TEMP,KK,2 + BASE_SHIFT daddu A,A,K # move A B to data part daddu B,BO,TEMP #endif MTC $0,t11 # GEMM part NR=4,MR=4 LD a0,0(A) MOV t21,t11 MOV t31,t11 LD a1,1*SIZE(A) MOV t41,t11 MOV t12,t11 LD b0,0(B) MOV t22,t11 MOV t32,t11 LD b1,1*SIZE(B) MOV t42,t11 LD a2,2*SIZE(A) MOV t13,t11 MOV t23,t11 LD b2,2*SIZE(B) MOV t33,t11 MOV t43,t11 LD a3,3*SIZE(A) MOV t14,t11 MOV t24,t11 LD b3,3*SIZE(B) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK # temp is the length of the data part #elif defined(LEFT) daddiu TEMP, KK, 4 # S=L,U=L #else daddiu TEMP, KK, 4 # S=R,U=U,for this two 
situation KK is the length of the data part #endif dsra K,TEMP,2 # K=KCO/2 MOV t34,t11 beqz K,.L15 MOV t44,t11 #else move B,BO # Reset B MTC $0,t11 # GEMM part NR=4,MR=4 LD a0,0(A) MOV t21,t11 MOV t31,t11 LD a1,1*SIZE(A) MOV t41,t11 MOV t12,t11 LD b0,0(B) MOV t22,t11 MOV t32,t11 LD b1,1*SIZE(B) MOV t42,t11 dsra K,KCO,2 # K=KCO/2 LD a2,2*SIZE(A) MOV t13,t11 MOV t23,t11 LD b2,2*SIZE(B) MOV t33,t11 MOV t43,t11 LD a3,3*SIZE(A) MOV t14,t11 MOV t24,t11 LD b3,3*SIZE(B) MOV t34,t11 beqz K,.L15 MOV t44,t11 # clear 16 results registers #endif .align 5 .L11: # kr=4 MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 LD a4,4*SIZE(A) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 LD a5,5*SIZE(A) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 LD b4,4*SIZE(B) MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 LD b5,5*SIZE(B) FETCH $0,(PREB) MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 LD a6,6*SIZE(A) MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 LD b6,6*SIZE(B) FETCH $0,(PREA) MADD t33,t33,a2,b2 MADD t43,t43,a3,b2 LD a7,7*SIZE(A) MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 LD b7,7*SIZE(B) .L12: MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 LD a0,8*SIZE(A) MADD t12,t12,a4,b5 MADD t22,t22,a5,b5 LD a1,9*SIZE(A) MADD t31,t31,a6,b4 MADD t41,t41,a7,b4 LD b0,8*SIZE(B) MADD t32,t32,a6,b5 MADD t42,t42,a7,b5 LD b1,9*SIZE(B) FETCH $0,4*SIZE(PREB) MADD t13,t13,a4,b6 MADD t23,t23,a5,b6 LD a2,10*SIZE(A) MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 LD b2,10*SIZE(B) FETCH $0,4*SIZE(PREA) MADD t33,t33,a6,b6 MADD t43,t43,a7,b6 LD a3,11*SIZE(A) MADD t34,t34,a6,b7 MADD t44,t44,a7,b7 LD b3,11*SIZE(B) .L13: MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 LD a4,12*SIZE(A) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 LD a5,13*SIZE(A) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 LD b4,12*SIZE(B) FETCH $0,8*SIZE(PREA) MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 LD b5,13*SIZE(B) FETCH $0,8*SIZE(PREB) MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 LD a6,14*SIZE(A) MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 daddu A,A,16*SIZE # 4mr*4kr LD b6,14*SIZE(B) MADD t33,t33,a2,b2 MADD t43,t43,a3,b2 daddu B,B,16*SIZE # 4nr*4kr LD a7,-1*SIZE(A) MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 LD b7,-1*SIZE(B) .L14: MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 LD a0,0(A) MADD t12,t12,a4,b5 MADD t22,t22,a5,b5 LD a1,1*SIZE(A) MADD t31,t31,a6,b4 MADD t41,t41,a7,b4 daddiu K,K,-1 LD b0,0(B) MADD t32,t32,a6,b5 MADD t42,t42,a7,b5 daddu PREA,PREA,16*SIZE LD b1,1*SIZE(B) FETCH $0,12*SIZE(PREB) MADD t13,t13,a4,b6 MADD t23,t23,a5,b6 LD a2,2*SIZE(A) FETCH $0,-4*SIZE(PREA) MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 LD b2,2*SIZE(B) MADD t33,t33,a6,b6 MADD t43,t43,a7,b6 daddu PREB,PREB,16*SIZE LD a3,3*SIZE(A) MADD t34,t34,a6,b7 MADD t44,t44,a7,b7 bnez K,.L11 LD b3,3*SIZE(B) .L15: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP, 2 #endif beqz K,.L18 nop .L16: MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 LD a4,4*SIZE(A) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 LD a5,5*SIZE(A) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 LD b4,4*SIZE(B) FETCH $0,0(PREA) MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 LD b5,5*SIZE(B) FETCH $0,0(PREB) MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 LD a6,6*SIZE(A) MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 daddu A,A,8*SIZE # 4mr*2kr LD b6,6*SIZE(B) MADD t33,t33,a2,b2 MADD t43,t43,a3,b2 daddu B,B,8*SIZE # 4nr*2kr LD a7,-1*SIZE(A) MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 LD b7,-1*SIZE(B) .L17: MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 LD a0,0*SIZE(A) MADD t12,t12,a4,b5 MADD t22,t22,a5,b5 LD a1,1*SIZE(A) MADD t31,t31,a6,b4 MADD t41,t41,a7,b4 LD b0,0*SIZE(B) MADD t32,t32,a6,b5 MADD t42,t42,a7,b5 LD b1,1*SIZE(B) FETCH $0,4*SIZE(PREB) MADD t13,t13,a4,b6 MADD t23,t23,a5,b6 LD a2,2*SIZE(A) FETCH $0,4*SIZE(PREA) MADD 
t14,t14,a4,b7 MADD t24,t24,a5,b7 LD b2,2*SIZE(B) MADD t33,t33,a6,b6 MADD t43,t43,a7,b6 daddu PREA,PREA,8*SIZE LD a3,3*SIZE(A) MADD t34,t34,a6,b7 MADD t44,t44,a7,b7 daddu PREB,PREB,8*SIZE LD b3,3*SIZE(B) .L18: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L19 LD ALPHA,152($sp) # Get ALPHA FETCH $0,0(PREB) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,4*SIZE # 4mr*kr MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 daddu B,B,4*SIZE # 4nr*kr FETCH $0,0(PREA) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 daddu PREB,PREB,4*SIZE MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 daddu PREA,PREA,4*SIZE MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 MADD t33,t33,a2,b2 MADD t43,t43,a3,b2 MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 .L19: # Write Back to C #ifndef TRMMKERNEL LD c11,0(CO1) # GEMM write part LD c21,1*SIZE(CO1) # get 16 C LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) LD c12,0(CO2) MADD t11,c11,t11,ALPHA LD c22,1*SIZE(CO2) MADD t21,c21,t21,ALPHA LD c32,2*SIZE(CO2) MADD t31,c31,t31,ALPHA LD c42,3*SIZE(CO2) MADD t41,c41,t41,ALPHA LD c13,0(CO3) MADD t12,c12,t12,ALPHA LD c23,1*SIZE(CO3) MADD t22,c22,t22,ALPHA LD c33,2*SIZE(CO3) MADD t32,c32,t32,ALPHA LD c43,3*SIZE(CO3) MADD t42,c42,t42,ALPHA LD c14,0(CO4) MADD t13,c13,t13,ALPHA LD c24,1*SIZE(CO4) MADD t23,c23,t23,ALPHA LD c34,2*SIZE(CO4) MADD t33,c33,t33,ALPHA LD c44,3*SIZE(CO4) MADD t43,c43,t43,ALPHA ST t11,0(CO1) MADD t14,c14,t14,ALPHA ST t21,1*SIZE(CO1) MADD t24,c24,t24,ALPHA ST t31,2*SIZE(CO1) MADD t34,c34,t34,ALPHA ST t41,3*SIZE(CO1) MADD t44,c44,t44,ALPHA daddiu M,M,-1 # M-- ST t12,0(CO2) ST t22,1*SIZE(CO2) ST t32,2*SIZE(CO2) ST t42,3*SIZE(CO2) ST t13,0(CO3) ST t23,1*SIZE(CO3) ST t33,2*SIZE(CO3) ST t43,3*SIZE(CO3) FETCH $0,4*SIZE(CO1) FETCH $0,4*SIZE(CO2) FETCH $0,4*SIZE(CO3) FETCH $0,4*SIZE(CO4) FETCH $0,8*SIZE(CO1) FETCH $0,8*SIZE(CO2) FETCH $0,8*SIZE(CO3) FETCH $0,8*SIZE(CO4) ST t14,0(CO4) daddu CO1,CO1,4*SIZE # COi += 4 ST t24,1*SIZE(CO4) daddu CO2,CO2,4*SIZE ST t34,2*SIZE(CO4) daddu CO3,CO3,4*SIZE ST t44,3*SIZE(CO4) daddu PREB,BO,SPANB bnez M,.L10 daddu CO4,CO4,4*SIZE #else MUL t11, ALPHA, t11 # TRMM write back part MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 MUL t41, ALPHA, t41 ST t11, 0 * SIZE(CO1) MUL t12, ALPHA, t12 ST t21, 1 * SIZE(CO1) MUL t22, ALPHA, t22 ST t31, 2 * SIZE(CO1) MUL t32, ALPHA, t32 ST t41, 3 * SIZE(CO1) MUL t42, ALPHA, t42 ST t12, 0 * SIZE(CO2) MUL t13, ALPHA, t13 ST t22, 1 * SIZE(CO2) MUL t23, ALPHA, t23 ST t32, 2 * SIZE(CO2) MUL t33, ALPHA, t33 ST t42, 3 * SIZE(CO2) MUL t43, ALPHA, t43 ST t13, 0 * SIZE(CO3) MUL t14, ALPHA, t14 ST t23, 1 * SIZE(CO3) MUL t24, ALPHA, t24 ST t33, 2 * SIZE(CO3) MUL t34, ALPHA, t34 ST t43, 3 * SIZE(CO3) MUL t44, ALPHA, t44 ST t14, 0 * SIZE(CO4) daddiu M,M,-1 # M-- ST t24, 1 * SIZE(CO4) ST t34, 2 * SIZE(CO4) ST t44, 3 * SIZE(CO4) daddiu CO1,CO1, 4 * SIZE daddiu CO2,CO2, 4 * SIZE daddiu CO3,CO3, 4 * SIZE daddiu CO4,CO4, 4 * SIZE FETCH $0,4*SIZE(CO1) FETCH $0,4*SIZE(CO2) FETCH $0,4*SIZE(CO3) FETCH $0,4*SIZE(CO4) FETCH $0,0(CO1) FETCH $0,0(CO2) FETCH $0,0(CO3) FETCH $0,0(CO4) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP,KCO,KK #ifdef LEFT daddiu TEMP,TEMP, -4 #else daddiu TEMP,TEMP, -4 #endif dsll K,TEMP,2 + BASE_SHIFT dsll TEMP,TEMP,2 + BASE_SHIFT daddu A,A,K # mov A to the end of panel Ai daddu B,B,TEMP # mov B to the end of panel Bj #endif #ifdef LEFT daddiu KK, KK,4 #endif bnez M,.L10 nop #endif .align 3 .L14_M2: andi M, MCO, 2 # nr=4,mr=2 beqz M,.L14_M1 nop .L20: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) 
|| (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else dsll K,KK,1 + BASE_SHIFT # mr=2 dsll TEMP,KK,2 + BASE_SHIFT # nr=4 daddu A,A,K daddu B,BO,TEMP #endif LD a0,0*SIZE(A) MTC $0,t11 LD a1,1*SIZE(A) MOV t21,t11 LD b0,0*SIZE(B) MOV t12,t11 LD b1,1*SIZE(B) MOV t22,t11 LD b2,2*SIZE(B) MOV t13,t11 MOV t23,t11 LD b3,3*SIZE(B) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK #elif defined(LEFT) daddiu TEMP,KK,2 # left part,controlled by mr, mr=2 #else daddiu TEMP,KK,4 # right part,controlled by nr,nr=4 #endif dsra K,TEMP,2 MOV t14,t11 beqz K,.L25 MOV t24,t11 # clear 2*4=8 results registers #else move B,BO # Reset B LD a0,0*SIZE(A) MTC $0,t11 LD a1,1*SIZE(A) MOV t21,t11 LD b0,0*SIZE(B) MOV t12,t11 LD b1,1*SIZE(B) MOV t22,t11 dsra K,KCO,2 LD b2,2*SIZE(B) MOV t13,t11 MOV t23,t11 LD b3,3*SIZE(B) MOV t14,t11 beqz K,.L25 MOV t24,t11 #endif .L21: # nr=4,mr=2,kr=4 MADD t11,t11,a0,b0 LD a4,2*SIZE(A) MADD t21,t21,a1,b0 LD a5,3*SIZE(A) MADD t12,t12,a0,b1 LD b4,4*SIZE(B) MADD t22,t22,a1,b1 LD b5,5*SIZE(B) MADD t13,t13,a0,b2 LD b6,6*SIZE(B) MADD t23,t23,a1,b2 LD b7,7*SIZE(B) MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 MADD t11,t11,a4,b4 LD a2,4*SIZE(A) MADD t21,t21,a5,b4 LD a3,5*SIZE(A) MADD t12,t12,a4,b5 LD b0,8*SIZE(B) MADD t22,t22,a5,b5 LD b1,9*SIZE(B) MADD t13,t13,a4,b6 LD b2,10*SIZE(B) MADD t23,t23,a5,b6 LD b3,11*SIZE(B) MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 daddiu K,K,-1 MADD t11,t11,a2,b0 LD a6,6*SIZE(A) MADD t21,t21,a3,b0 LD a7,7*SIZE(A) MADD t12,t12,a2,b1 LD b4,12*SIZE(B) MADD t22,t22,a3,b1 LD b5,13*SIZE(B) MADD t13,t13,a2,b2 LD b6,14*SIZE(B) MADD t23,t23,a3,b2 LD b7,15*SIZE(B) MADD t14,t14,a2,b3 MADD t24,t24,a3,b3 daddu A,A,8*SIZE # 2mr*4kr daddu B,B,16*SIZE # 4nr*4kr MADD t11,t11,a6,b4 LD a0,0*SIZE(A) MADD t21,t21,a7,b4 LD a1,1*SIZE(A) MADD t12,t12,a6,b5 LD b0,0*SIZE(B) MADD t22,t22,a7,b5 LD b1,1*SIZE(B) MADD t13,t13,a6,b6 LD b2,2*SIZE(B) MADD t23,t23,a7,b6 LD b3,3*SIZE(B) MADD t14,t14,a6,b7 bnez K,.L21 MADD t24,t24,a7,b7 .L25: #ifndef TRMMKERNEL andi K,KCO,2 # kr=2 #else andi K,TEMP,2 #endif beqz K,.L28 nop .L26: MADD t11,t11,a0,b0 LD a4,2*SIZE(A) MADD t21,t21,a1,b0 LD a5,3*SIZE(A) MADD t12,t12,a0,b1 LD b4,4*SIZE(B) MADD t22,t22,a1,b1 LD b5,5*SIZE(B) MADD t13,t13,a0,b2 LD b6,6*SIZE(B) MADD t23,t23,a1,b2 LD b7,7*SIZE(B) MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 daddu A,A,4*SIZE # 2mr*2kr daddu B,B,8*SIZE # 4nr*2kr .L27: MADD t11,t11,a4,b4 LD a0,0*SIZE(A) MADD t21,t21,a5,b4 LD a1,1*SIZE(A) MADD t12,t12,a4,b5 LD b0,0*SIZE(B) MADD t22,t22,a5,b5 LD b1,1*SIZE(B) MADD t13,t13,a4,b6 LD b2,2*SIZE(B) MADD t23,t23,a5,b6 LD b3,3*SIZE(B) MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 .L28: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L29 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # 2mr*kr daddu B,B,4*SIZE # 4nr*kr MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 .L29: # Write Back to C #ifndef TRMMKERNEL LD c11,0(CO1) # GEMM write back part LD c21,1*SIZE(CO1) LD c12,0(CO2) LD c22,1*SIZE(CO2) LD c13,0(CO3) MADD t11,c11,t11,ALPHA LD c23,1*SIZE(CO3) MADD t21,c21,t21,ALPHA LD c14,0(CO4) MADD t12,c12,t12,ALPHA LD c24,1*SIZE(CO4) MADD t22,c22,t22,ALPHA ST t11,0(CO1) MADD t13,c13,t13,ALPHA ST t21,1*SIZE(CO1) MADD t23,c23,t23,ALPHA ST t12,0(CO2) MADD t14,c14,t14,ALPHA ST t22,1*SIZE(CO2) MADD t24,c24,t24,ALPHA ST t13,0(CO3) daddu CO1,CO1,2*SIZE # COi += 2 ST t23,1*SIZE(CO3) daddu CO2,CO2,2*SIZE ST t14,0(CO4) daddu CO3,CO3,2*SIZE ST 
t24,1*SIZE(CO4) daddu CO4,CO4,2*SIZE FETCH $0,0(CO1) FETCH $0,0(CO2) FETCH $0,0(CO3) FETCH $0,0(CO4) #else MUL t11, ALPHA, t11 # TRMM write back part MUL t21, ALPHA, t21 ST t11, 0 * SIZE(CO1) MUL t12, ALPHA, t12 ST t21, 1 * SIZE(CO1) MUL t22, ALPHA, t22 ST t12, 0 * SIZE(CO2) MUL t13, ALPHA, t13 ST t22, 1 * SIZE(CO2) MUL t23, ALPHA, t23 ST t13, 0 * SIZE(CO3) MUL t14, ALPHA, t14 ST t23, 1 * SIZE(CO3) MUL t24, ALPHA, t24 ST t14, 0 * SIZE(CO4) ST t24, 1 * SIZE(CO4) daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE daddiu CO3,CO3, 2 * SIZE daddiu CO4,CO4, 2 * SIZE FETCH $0,0(CO1) FETCH $0,0(CO2) FETCH $0,0(CO3) FETCH $0,0(CO4) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP,KCO,KK #ifdef LEFT daddiu TEMP,TEMP,-2 #else daddiu TEMP,TEMP,-4 #endif dsll K,TEMP,1 + BASE_SHIFT dsll TEMP,TEMP,2 + BASE_SHIFT daddu A,A,K # move A to next panel Ai daddu B,B,TEMP # move B to next panel Bj #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif .align 3 .L14_M1: andi M,MCO,1 # mr=1 beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj nop .L30: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else dsll K,KK, BASE_SHIFT dsll TEMP,KK,2 + BASE_SHIFT daddu A,A,K daddu B,BO,TEMP #endif LD a0, 0 * SIZE(A) # a0 MTC $0,t11 LD b0,0*SIZE(B) MOV t12,t11 LD b1,1*SIZE(B) MOV t13,t11 LD b2,2*SIZE(B) MOV t14,t11 LD b3,3*SIZE(B) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 4 #endif dsra K,TEMP, 2 nop beqz K,.L35 nop #else move B,BO # Reset B, GEMM part dsra K,KCO,2 # K=KCO/2 LD a0, 0 * SIZE(A) # a0 MTC $0,t11 LD b0,0*SIZE(B) MOV t12,t11 LD b1,1*SIZE(B) MOV t13,t11 LD b2,2*SIZE(B) MOV t14,t11 beqz K,.L35 LD b3,3*SIZE(B) #endif .L31: # nr=4,mr=1,kr=4 LD a1, 1*SIZE(A) # load a1 MADD t11,t11,a0,b0 LD b4,4*SIZE(B) LD b5,5*SIZE(B) MADD t12,t12,a0,b1 LD b6,6*SIZE(B) LD b7,7*SIZE(B) MADD t13,t13,a0,b2 MADD t14,t14,a0,b3 LD a2, 2*SIZE(A) # a2 MADD t11,t11,a1,b4 LD b0,8*SIZE(B) LD b1,9*SIZE(B) MADD t12,t12,a1,b5 LD b2,10*SIZE(B) LD b3,11*SIZE(B) MADD t13,t13,a1,b6 MADD t14,t14,a1,b7 LD a3, 3*SIZE(A) # a3 MADD t11,t11,a2,b0 daddiu K,K,-1 LD b4,12*SIZE(B) LD b5,13*SIZE(B) MADD t12,t12,a2,b1 daddu A,A,4*SIZE # 1mr*4kr LD b6,14*SIZE(B) LD b7,15*SIZE(B) MADD t13,t13,a2,b2 MADD t14,t14,a2,b3 LD a0, 0*SIZE(A) # a0 daddu B,B,16*SIZE # 4nr*4kr MADD t11,t11,a3,b4 LD b0,0*SIZE(B) MADD t12,t12,a3,b5 LD b1,1*SIZE(B) MADD t13,t13,a3,b6 LD b2,2*SIZE(B) MADD t14,t14,a3,b7 bnez K,.L31 LD b3,3*SIZE(B) .L35: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L38 nop .L36: LD a1,1*SIZE(A) # load a1 MADD t11,t11,a0,b0 LD b4,4*SIZE(B) LD b5,5*SIZE(B) MADD t12,t12,a0,b1 daddu A,A,2*SIZE # mr*2kr LD b6,6*SIZE(B) MADD t13,t13,a0,b2 LD b7,7*SIZE(B) MADD t14,t14,a0,b3 daddu B,B,8*SIZE # 4nr*2kr .L37: LD a0,0(A) MADD t11,t11,a1,b4 LD b0,0*SIZE(B) LD b1,1*SIZE(B) MADD t12,t12,a1,b5 LD b2,2*SIZE(B) LD b3,3*SIZE(B) MADD t13,t13,a1,b6 MADD t14,t14,a1,b7 .L38: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L39 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t12,t12,a0,b1 daddu A,A,1*SIZE daddu B,B,4*SIZE MADD t13,t13,a0,b2 MADD t14,t14,a0,b3 .L39: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) LD c12,0(CO2) LD c13,0(CO3) LD c14,0(CO4) MADD t11,c11,t11,ALPHA MADD t12,c12,t12,ALPHA MADD t13,c13,t13,ALPHA MADD t14,c14,t14,ALPHA ST t11,0(CO1) ST t12,0(CO2) ST t13,0(CO3) ST t14,0(CO4) #else MUL 
t11, ALPHA, t11 MUL t12, ALPHA, t12 MUL t13, ALPHA, t13 MUL t14, ALPHA, t14 ST t11, 0 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t13, 0 * SIZE(CO3) ST t14, 0 * SIZE(CO4) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -4 #endif dsll K,TEMP, BASE_SHIFT dsll TEMP,TEMP, 2 + BASE_SHIFT daddu A,A,K daddu B,B,TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif #endif .align 3 .L0_N4_Loop: # mc finished daddiu N,N,-1 # N-- #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK,4 #endif bnez N,.L0_N4_Lb move BO,B # Set BO point to next panel Bj .align 5 .L0_N2: andi N,NCO,2 # nr = 2 beqz N,.L0_N1 nop .L0_N2_Lb: move CO1,C daddu CO2,C,LDC dsra M,MCO,2 move A,AO # Reset A daddu PREA,AO,SPANA daddu C,CO2,LDC #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif beqz M,.L12_M2 nop .L40: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else dsll K,KK, 2 + BASE_SHIFT dsll TEMP, KK,1 + BASE_SHIFT daddu A,A,K daddu B,BO,TEMP #endif LD a0,0*SIZE(A) MTC $0,t11 # gemm part LD a1,1*SIZE(A) MOV t21,t11 LD b0,0*SIZE(B) MOV t31,t11 LD b1,1*SIZE(B) MOV t41,t11 LD a2,2*SIZE(A) LD a3,3*SIZE(A) MOV t12,t11 MOV t22,t11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK #elif defined(LEFT) daddiu TEMP, KK, 4 #else daddiu TEMP, KK, 2 #endif dsra K,TEMP,2 MOV t32,t11 beqz K,.L45 MOV t42,t11 #else move B,BO # Reset B LD a0,0*SIZE(A) MTC $0,t11 # gemm part LD a1,1*SIZE(A) MOV t21,t11 LD b0,0*SIZE(B) MOV t31,t11 LD b1,1*SIZE(B) MOV t41,t11 LD a2,2*SIZE(A) dsra K,KCO,2 # K=KCO/2 LD a3,3*SIZE(A) MOV t12,t11 MOV t22,t11 MOV t32,t11 beqz K,.L45 MOV t42,t11 #endif .L41: # nr=2,mr=kr=4 MADD t11,t11,a0,b0 LD a4,4*SIZE(A) MADD t21,t21,a1,b0 LD a5,5*SIZE(A) MADD t12,t12,a0,b1 LD b4,2*SIZE(B) MADD t22,t22,a1,b1 LD b5,3*SIZE(B) MADD t31,t31,a2,b0 LD a6,6*SIZE(A) MADD t41,t41,a3,b0 LD a7,7*SIZE(A) FETCH $0,(PREA) MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 .L42: MADD t11,t11,a4,b4 LD a0,8*SIZE(A) MADD t21,t21,a5,b4 LD a1,9*SIZE(A) MADD t12,t12,a4,b5 LD b2,4*SIZE(B) MADD t22,t22,a5,b5 LD b3,5*SIZE(B) MADD t31,t31,a6,b4 LD a2,10*SIZE(A) MADD t41,t41,a7,b4 LD a3,11*SIZE(A) FETCH $0,4*SIZE(PREA) MADD t32,t32,a6,b5 MADD t42,t42,a7,b5 .L43: MADD t11,t11,a0,b2 LD a4,12*SIZE(A) MADD t21,t21,a1,b2 LD a5,13*SIZE(A) MADD t12,t12,a0,b3 LD b6,6*SIZE(B) MADD t22,t22,a1,b3 LD b7,7*SIZE(B) MADD t31,t31,a2,b2 LD a6,14*SIZE(A) MADD t41,t41,a3,b2 LD a7,15*SIZE(A) FETCH $0,8*SIZE(PREA) MADD t32,t32,a2,b3 MADD t42,t42,a3,b3 daddu A,A,16*SIZE # 4mr*4kr daddu B,B,8*SIZE # 2nr*4kr .L44: MADD t11,t11,a4,b6 LD a0,0*SIZE(A) MADD t21,t21,a5,b6 LD a1,1*SIZE(A) MADD t12,t12,a4,b7 LD b0,0*SIZE(B) MADD t22,t22,a5,b7 LD b1,1*SIZE(B) daddiu K,K,-1 daddu PREA,PREA,16*SIZE MADD t31,t31,a6,b6 LD a2,2*SIZE(A) MADD t41,t41,a7,b6 LD a3,3*SIZE(A) FETCH $0,-4*SIZE(PREA) MADD t32,t32,a6,b7 bnez K,.L41 MADD t42,t42,a7,b7 .L45: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L48 nop .L46: MADD t11,t11,a0,b0 LD a4,4*SIZE(A) MADD t21,t21,a1,b0 LD a5,5*SIZE(A) MADD t12,t12,a0,b1 LD b4,2*SIZE(B) MADD t22,t22,a1,b1 LD b5,3*SIZE(B) MADD t31,t31,a2,b0 LD a6,6*SIZE(A) MADD t41,t41,a3,b0 LD a7,7*SIZE(A) FETCH $0,0(PREA) MADD t32,t32,a2,b1 daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32 MADD t42,t42,a3,b1 daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE .L47: MADD t11,t11,a4,b4 LD a0,0*SIZE(A) MADD t21,t21,a5,b4 LD a1,1*SIZE(A) MADD t12,t12,a4,b5 
LD b0,0*SIZE(B) MADD t22,t22,a5,b5 LD b1,1*SIZE(B) MADD t31,t31,a6,b4 LD a2,2*SIZE(A) MADD t41,t41,a7,b4 LD a3,3*SIZE(A) FETCH $0,4*SIZE(PREA) MADD t32,t32,a6,b5 MADD t42,t42,a7,b5 daddu PREA,PREA,8*SIZE .L48: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L49 LD ALPHA,152($sp) # Get ALPHA FETCH $0,0(PREA) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 daddu B,B,2*SIZE daddu PREA,PREA,4*SIZE MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 .L49: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # gemm write back part Fetch 16 C LD c21,1*SIZE(CO1) LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) LD c12,0(CO2) MADD t11,c11,t11,ALPHA LD c22,1*SIZE(CO2) MADD t21,c21,t21,ALPHA LD c32,2*SIZE(CO2) MADD t31,c31,t31,ALPHA LD c42,3*SIZE(CO2) MADD t41,c41,t41,ALPHA ST t11,0(CO1) MADD t12,c12,t12,ALPHA ST t21,1*SIZE(CO1) MADD t22,c22,t22,ALPHA ST t31,2*SIZE(CO1) MADD t32,c32,t32,ALPHA ST t41,3*SIZE(CO1) MADD t42,c42,t42,ALPHA daddiu M,M,-1 ST t12,0(CO2) ST t22,1*SIZE(CO2) ST t32,2*SIZE(CO2) ST t42,3*SIZE(CO2) FETCH $0,4*SIZE(CO1) FETCH $0,4*SIZE(CO2) FETCH $0,8*SIZE(CO1) FETCH $0,8*SIZE(CO2) daddu CO1,CO1,4*SIZE bnez M,.L40 daddu CO2,CO2,4*SIZE #else MUL t11, ALPHA, t11 MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 MUL t41, ALPHA, t41 MUL t12, ALPHA, t12 ST t11, 0 * SIZE(CO1) MUL t22, ALPHA, t22 ST t21, 1 * SIZE(CO1) MUL t32, ALPHA, t32 ST t31, 2 * SIZE(CO1) MUL t42, ALPHA, t42 ST t41, 3 * SIZE(CO1) ST t12, 0 * SIZE(CO2) daddiu M,M,-1 ST t22, 1 * SIZE(CO2) ST t32, 2 * SIZE(CO2) ST t42, 3 * SIZE(CO2) daddiu CO1,CO1, 4*SIZE daddiu CO2,CO2, 4*SIZE FETCH $0,0(CO1) FETCH $0,0(CO2) FETCH $0,4(CO1) FETCH $0,4(CO2) #if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -4 #else daddiu TEMP, TEMP, -2 #endif dsll K,TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu A,A,K daddu B,B,TEMP #endif #ifdef LEFT daddiu KK, KK, 4 #endif bnez M,.L40 nop #endif .align 3 .L12_M2: andi M,MCO,2 # mr = 2 beqz M,.L12_M1 nop .L50: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO #else dsll K, KK, 1 + BASE_SHIFT #mr=2 dsll TEMP, KK, 1 + BASE_SHIFT #nr=2 daddu A, A, K daddu B, BO, TEMP #endif LD a0,0*SIZE(A) LD a1,1*SIZE(A) MTC $0,t11 LD b0,0*SIZE(B) MOV t21,t11 LD b1,1*SIZE(B) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 2 #else daddiu TEMP, KK, 2 #endif dsra K,TEMP,2 MOV t12,t11 beqz K,.L55 MOV t22,t11 #else move B,BO LD a0,0*SIZE(A) dsra K,KCO,2 # K=KCO/2 LD a1,1*SIZE(A) MTC $0,t11 LD b0,0*SIZE(B) MOV t21,t11 LD b1,1*SIZE(B) MOV t12,t11 beqz K,.L55 MOV t22,t11 #endif .L51: # nr=2 mr=2,kr=4 MADD t11,t11,a0,b0 LD a4,2*SIZE(A) MADD t21,t21,a1,b0 LD b4,2*SIZE(B) MADD t12,t12,a0,b1 LD a5,3*SIZE(A) MADD t22,t22,a1,b1 LD b5,3*SIZE(B) MADD t11,t11,a4,b4 LD a2,4*SIZE(A) MADD t21,t21,a5,b4 LD b2,4*SIZE(B) MADD t12,t12,a4,b5 LD a3,5*SIZE(A) MADD t22,t22,a5,b5 daddiu K,K,-1 LD b3,5*SIZE(B) MADD t11,t11,a2,b2 LD a6,6*SIZE(A) MADD t21,t21,a3,b2 daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE LD b6,6*SIZE(B) MADD t12,t12,a2,b3 daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=16*SIZE LD a7,-1*SIZE(A) MADD t22,t22,a3,b3 LD b7,-1*SIZE(B) MADD t11,t11,a6,b6 LD a0,0*SIZE(A) MADD t21,t21,a7,b6 LD b0,0*SIZE(B) MADD t12,t12,a6,b7 LD a1,1*SIZE(A) MADD t22,t22,a7,b7 bnez K,.L51 LD b1,1*SIZE(B) .L55: # kr=2 #ifndef TRMMKERNEL 
andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L58 nop .L56: MADD t11,t11,a0,b0 LD a4,2*SIZE(A) MADD t21,t21,a1,b0 daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 LD b4,2*SIZE(B) MADD t12,t12,a0,b1 daddu B,B,4*SIZE # 2nr*2kr LD a5,-1*SIZE(A) MADD t22,t22,a1,b1 LD b5,-1*SIZE(B) .L57: MADD t11,t11,a4,b4 LD a0,0*SIZE(A) MADD t21,t21,a5,b4 LD b0,0*SIZE(B) MADD t12,t12,a4,b5 LD a1,1*SIZE(A) MADD t22,t22,a5,b5 LD b1,1*SIZE(B) .L58: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP, 1 #endif beqz K,.L59 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 daddu B,B,2*SIZE # 2nr*kr MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 .L59: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # write gemm part back Fetch 16 C LD c21,1*SIZE(CO1) LD c12,0(CO2) LD c22,1*SIZE(CO2) MADD t11,c11,t11,ALPHA MADD t21,c21,t21,ALPHA MADD t12,c12,t12,ALPHA MADD t22,c22,t22,ALPHA ST t11,0(CO1) ST t21,1*SIZE(CO1) ST t12,0(CO2) ST t22,1*SIZE(CO2) daddu CO1,CO1,2*SIZE daddu CO2,CO2,2*SIZE FETCH $0,0(CO1) FETCH $0,0(CO2) #else daddiu M, M, -1 daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE MUL t11, ALPHA, t11 MUL t21, ALPHA, t21 MUL t12, ALPHA, t12 MUL t22, ALPHA, t22 ST t11, -2 * SIZE(CO1) ST t21, -1 * SIZE(CO1) ST t12, -2 * SIZE(CO2) ST t22, -1 * SIZE(CO2) FETCH $0,0(CO1) FETCH $0,0(CO2) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -2 #endif dsll K, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu A, A, K daddu B, B, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif .align 3 .L12_M1: andi M,MCO,1 # mr = 1 beqz M,.L0_N2_Loop nop .L60: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else dsll K, KK, 0 + BASE_SHIFT dsll TEMP, KK, 1 + BASE_SHIFT daddu A, A, K daddu B, BO, TEMP #endif LD a0,0*SIZE(A) MTC $0,t11 MOV t21,t11 LD b0,0*SIZE(B) MOV t12,t11 LD b1,1*SIZE(B) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 2 #endif dsra K,TEMP,2 MOV t22,t11 beqz K,.L65 nop #else dsra K,KCO,2 move B,BO # Reset B LD a0,0*SIZE(A) MTC $0,t11 MOV t21,t11 LD b0,0*SIZE(B) MOV t12,t11 LD b1,1*SIZE(B) beqz K,.L65 MOV t22,t11 #endif .L61: # nr=2,mr=1,kr=4 LD a4, 1*SIZE(A) # a2 LD b4, 2*SIZE(B) MADD t11,t11,a0,b0 LD b5,3*SIZE(B) MADD t12,t12,a0,b1 LD a2, 2*SIZE(A) # a3 LD b2,4*SIZE(B) MADD t11,t11,a4,b4 LD b3,5*SIZE(B) MADD t12,t12,a4,b5 LD a6, 3*SIZE(A) # a4 daddiu K,K,-1 LD b6,6*SIZE(B) MADD t11,t11,a2,b2 LD b7,7*SIZE(B) MADD t12,t12,a2,b3 daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 LD a0, 0*SIZE(A) daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE LD b0,0*SIZE(B) MADD t11,t11,a6,b6 LD b1,1*SIZE(B) bnez K,.L61 MADD t12,t12,a6,b7 .L65: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L68 nop .L66: LD a4, 1*SIZE(A) # a1 MADD t11,t11,a0,b0 LD b4,2*SIZE(B) daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 LD b5,3*SIZE(B) MADD t12,t12,a0,b1 daddu B,B,4*SIZE .L67: LD a0,0(A) # a0 LD b0,0*SIZE(B) MADD t11,t11,a4,b4 LD b1,1*SIZE(B) MADD t12,t12,a4,b5 .L68: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L69 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t12,t12,a0,b1 daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16 daddu B,B,2*SIZE .L69: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C LD c12,0(CO2) MADD t11,c11,t11,ALPHA MADD 
t12,c12,t12,ALPHA ST t11,0(CO1) ST t12,0(CO2) daddu CO1,CO1,1*SIZE daddu CO2,CO2,1*SIZE #else MUL t11, ALPHA, t11 MUL t12, ALPHA, t12 ST t11, 0 * SIZE(CO1) ST t12, 0 * SIZE(CO2) daddu CO1,CO1,1*SIZE daddu CO2,CO2,1*SIZE #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -2 #endif dsll K, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu A, A, K daddu B, B, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif #endif .L0_N2_Loop: #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK, 2 #endif move BO, B .align 5 .L0_N1: andi N,NCO,1 # nr = 1 beqz N,.L999 nop move CO1,C dsra M,MCO,2 move A,AO # Reset A daddu PREA,AO,SPANA #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif beqz M,.L11_M2 daddu C,CO1,LDC .L70: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO # Reset B #else dsll K, KK, 2 + BASE_SHIFT dsll TEMP, KK, 0 + BASE_SHIFT daddu A, A, K daddu B, BO, TEMP #endif LD b0, 0*SIZE(B) MTC $0,t11 LD a0,0*SIZE(A) MOV t21,t11 LD a1,1*SIZE(A) MOV t31,t11 LD a2,2*SIZE(A) MOV t41,t11 LD a3,3*SIZE(A) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 4 #else daddiu TEMP, KK, 1 #endif dsra K,TEMP,2 beqz K,.L75 nop #else move B, BO # Reset B dsra K,KCO,2 LD b0, 0*SIZE(B) MTC $0,t11 LD a0,0*SIZE(A) MOV t21,t11 LD a1,1*SIZE(A) MOV t31,t11 LD a2,2*SIZE(A) MOV t41,t11 beqz K,.L75 LD a3,3*SIZE(A) #endif .L71: # nr=1,mr=kr=4 LD b4, 1*SIZE(B) # b1 MADD t11,t11,a0,b0 LD a4, 4*SIZE(A) MADD t21,t21,a1,b0 LD a5, 5*SIZE(A) FETCH $0,(PREA) LD a6,6*SIZE(A) MADD t31,t31,a2,b0 LD a7,7*SIZE(A) MADD t41,t41,a3,b0 .L72: LD b2, 2*SIZE(B) # b2 MADD t11,t11,a4,b4 LD a0,8*SIZE(A) MADD t21,t21,a5,b4 LD a1,9*SIZE(A) FETCH $0,4*SIZE(PREA) LD a2,10*SIZE(A) MADD t31,t31,a6,b4 LD a3,11*SIZE(A) MADD t41,t41,a7,b4 .L73: LD b6, 3*SIZE(B) MADD t11,t11,a0,b2 LD a4,12*SIZE(A) daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 LD a5,13*SIZE(A) MADD t21,t21,a1,b2 LD a6,14*SIZE(A) FETCH $0,8*SIZE(PREA) MADD t31,t31,a2,b2 LD a7,15*SIZE(A) MADD t41,t41,a3,b2 daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE .L74: LD b0, 0*SIZE(B) MADD t11,t11,a4,b6 LD a0,0*SIZE(A) daddu PREA,PREA,16*SIZE LD a1,1*SIZE(A) MADD t21,t21,a5,b6 LD a2,2*SIZE(A) daddiu K,K,-1 MADD t31,t31,a6,b6 LD a3,3*SIZE(A) MADD t41,t41,a7,b6 bnez K,.L71 FETCH $0,-32(PREA) .L75: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L78 nop .L76: LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 LD a4,4*SIZE(A) daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32 LD a5,5*SIZE(A) MADD t21,t21,a1,b0 FETCH $0,0(PREA) LD a6,6*SIZE(A) MADD t31,t31,a2,b0 LD a7,7*SIZE(A) MADD t41,t41,a3,b0 daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE .L77: LD b0,0(B) MADD t11,t11,a4,b4 LD a0,0*SIZE(A) MADD t21,t21,a5,b4 FETCH $0,4*SIZE(PREA) LD a1,1*SIZE(A) MADD t31,t31,a6,b4 LD a2,2*SIZE(A) MADD t41,t41,a7,b4 LD a3,3*SIZE(A) daddu PREA,PREA,8*SIZE .L78: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L79 LD ALPHA,152($sp) # Get ALPHA FETCH $0,0(PREA) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 daddu B,B,1*SIZE daddu PREA,PREA,4*SIZE .L79: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C LD c21,1*SIZE(CO1) LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) MADD t11,c11,t11,ALPHA MADD t21,c21,t21,ALPHA MADD t31,c31,t31,ALPHA MADD t41,c41,t41,ALPHA 
ST t11,0(CO1) ST t21,1*SIZE(CO1) ST t31,2*SIZE(CO1) ST t41,3*SIZE(CO1) daddiu M,M,-1 # M-- FETCH $0,4*SIZE(CO1) FETCH $0,8*SIZE(CO1) bnez M,.L70 # M!=0 daddu CO1,CO1,4*SIZE # COx += 4*8Byte #else daddiu M,M,-1 # M-- MUL t11, ALPHA, t11 MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 MUL t41, ALPHA, t41 ST t11,0(CO1) ST t21,1*SIZE(CO1) ST t31,2*SIZE(CO1) ST t41,3*SIZE(CO1) FETCH $0,4*SIZE(CO1) FETCH $0,8*SIZE(CO1) daddu CO1,CO1,4*SIZE #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -4 #else daddiu TEMP, TEMP, -1 #endif dsll K, TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, 0 + BASE_SHIFT daddu A, A,K daddu B, B, TEMP #endif #ifdef LEFT daddiu KK, KK, 4 #endif bnez M,.L70 nop #endif .align 3 .L11_M2: andi M,MCO,2 # mr = 2 beqz M,.L11_M1 nop .L80: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO #else dsll K, KK, 1 + BASE_SHIFT dsll TEMP, KK, 0 + BASE_SHIFT daddu A, A, K daddu B, BO, TEMP #endif LD b0, 0*SIZE(B) MTC $0,t11 MOV t21,t11 LD a0,0*SIZE(A) LD a1,1*SIZE(A) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 2 #else daddiu TEMP, KK, 1 #endif dsra K,TEMP,2 # K=KCO/2 beqz K,.L85 nop #else move B, BO dsra K,KCO,2 LD b0, 0*SIZE(B) MTC $0,t11 MOV t21,t11 LD a0,0*SIZE(A) beqz K,.L85 LD a1,1*SIZE(A) #endif .L81: # nr=1,mr=2,kr=4 LD b4, 1*SIZE(B) LD a4,2*SIZE(A) MADD t11,t11,a0,b0 LD a5,3*SIZE(A) MADD t21,t21,a1,b0 LD b2, 2*SIZE(B) LD a2,4*SIZE(A) MADD t11,t11,a4,b4 LD a3,5*SIZE(A) MADD t21,t21,a5,b4 LD b6, 3*SIZE(B) LD a6,6*SIZE(A) MADD t11,t11,a2,b2 LD a7,7*SIZE(A) MADD t21,t21,a3,b2 daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 LD b0, 0*SIZE(B) daddiu K,K,-1 LD a0,0*SIZE(A) MADD t11,t11,a6,b6 LD a1,1*SIZE(A) bnez K,.L81 MADD t21,t21,a7,b6 .L85: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L88 nop .L86: LD b4, 1*SIZE(B) LD a4,2*SIZE(A) MADD t11,t11,a0,b0 LD a5,3*SIZE(A) MADD t21,t21,a1,b0 daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 LD b0,0(B) LD a0,0*SIZE(A) MADD t11,t11,a4,b4 LD a1,1*SIZE(A) MADD t21,t21,a5,b4 .L88: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L89 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 daddu B,B,1*SIZE .L89: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C LD c21,1*SIZE(CO1) MADD t11,c11,t11,ALPHA MADD t21,c21,t21,ALPHA ST t11,0(CO1) ST t21,1*SIZE(CO1) FETCH $0,2*SIZE(CO1) daddu CO1,CO1,2*SIZE # COx += 2*8Byte #else daddu CO1,CO1,2*SIZE # COx += 2*8Byte MUL t11, ALPHA, t11 MUL t21, ALPHA, t21 FETCH $0,0(CO1) ST t11, -2 * SIZE(CO1) ST t21, -1 * SIZE(CO1) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -1 #endif dsll K, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 0 + BASE_SHIFT daddu A, A, K daddu B, B, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif .align 3 .L11_M1: andi M,MCO,1 # mr = 1 beqz M,.L999 nop .L90: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO #else dsll K, KK, 0 + BASE_SHIFT dsll TEMP, KK, 0 + BASE_SHIFT daddu A, A, K daddu B, BO, TEMP #endif LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) MTC $0,t11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 1 #endif dsra K, TEMP, 2 beqz K,.L95 nop #else move B, BO LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) dsra K,KCO,2 beqz K,.L95 MTC $0,t11 #endif .L91: # nr=mr=1,kr=4 LD a4, 1*SIZE(A) LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 LD a2, 2*SIZE(A) LD b2, 2*SIZE(B) MADD t11,t11,a4,b4 LD a6, 3*SIZE(A) LD b6, 3*SIZE(B) MADD t11,t11,a2,b2 daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) MADD t11,t11,a6,b6 daddiu K,K,-1 bnez K,.L91 nop .L95: # kr=2 #ifndef TRMMKERNEL andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L98 nop .L96: LD a4, 1*SIZE(A) LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=32 LD b0,0(B) LD a0,0(A) MADD t11,t11,a4,b4 .L98: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif beqz K,.L99 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 .L99: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C MADD t11,c11,t11,ALPHA ST t11,0(CO1) #else MUL t11, ALPHA, t11 ST t11, 0 * SIZE(CO1) #endif .L999: # End ld $16, 0($sp) ld $17, 8($sp) ld $18, 16($sp) ld $19, 24($sp) ld $20, 32($sp) ld $21, 40($sp) ld $22, 48($sp) LD $f24, 56($sp) LD $f25, 64($sp) LD $f26, 72($sp) LD $f27, 80($sp) LD $f28, 88($sp) ld $23, 96($sp) ld $24, 104($sp) ld $25, 112($sp) LD $f20,120($sp) LD $f21,128($sp) LD $f22,136($sp) LD $f23,144($sp) j $31 daddiu $sp, $sp, 160 EPILOGUE OpenBLAS-0.2.20/kernel/mips64/snrm2.S000066400000000000000000000141731313527062700167410ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define I $2 #define TEMP $3 #define a1 $f6 #define a2 $f7 #define a3 $f8 #define a4 $f9 #define a5 $f10 #define a6 $f11 #define a7 $f12 #define a8 $f13 #define s1 $f0 #define s2 $f1 #define t1 $f2 #define t2 $f3 #define t3 $f4 #define t4 $f5 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif dmtc1 $0, s1 li TEMP, SIZE blez N, .L999 mov.d s2, s1 blez INCX, .L999 dsll INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 dsra I, N, 3 blez I, .L15 NOP LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) LD a3, 2 * SIZE(X) LD a4, 3 * SIZE(X) LD a5, 4 * SIZE(X) daddiu I, I, -1 cvt.d.s t1, a1 LD a6, 5 * SIZE(X) cvt.d.s t2, a2 LD a7, 6 * SIZE(X) cvt.d.s t3, a3 LD a8, 7 * SIZE(X) blez I, .L13 cvt.d.s t4, a4 .align 3 .L12: madd.d s1, s1, t1, t1 LD a1, 8 * SIZE(X) cvt.d.s t1, a5 NOP madd.d s2, s2, t2, t2 LD a2, 9 * SIZE(X) cvt.d.s t2, a6 NOP madd.d s1, s1, t3, t3 LD a3, 10 * SIZE(X) cvt.d.s t3, a7 NOP madd.d s2, s2, t4, t4 LD a4, 11 * SIZE(X) cvt.d.s t4, a8 NOP madd.d s1, s1, t1, t1 LD a5, 12 * SIZE(X) cvt.d.s t1, a1 NOP madd.d s2, s2, t2, t2 LD a6, 13 * SIZE(X) cvt.d.s t2, a2 daddiu I, I, -1 madd.d s1, s1, t3, t3 LD a7, 14 * SIZE(X) cvt.d.s t3, a3 daddiu X, X, 8 * SIZE madd.d s2, s2, t4, t4 LD a8, 7 * SIZE(X) bgtz I, .L12 cvt.d.s t4, a4 .align 3 .L13: madd.d s1, s1, t1, t1 cvt.d.s t1, a5 madd.d s2, s2, t2, t2 cvt.d.s t2, a6 madd.d s1, s1, t3, t3 cvt.d.s t3, a7 madd.d s2, s2, t4, t4 cvt.d.s t4, a8 madd.d s1, s1, t1, t1 madd.d s2, s2, t2, t2 madd.d s1, s1, t3, t3 madd.d s2, s2, t4, t4 daddiu X, X, 8 * SIZE .align 3 .L15: andi I, N, 7 blez I, .L999 NOP .align 3 .L16: LD a1, 0 * SIZE(X) daddiu I, I, -1 cvt.d.s t1, a1 madd.d s1, s1, t1, t1 bgtz I, .L16 daddiu X, X, SIZE j .L999 NOP .align 3 .L20: blez I, .L25 NOP LD a1, 0 * SIZE(X) daddu X, X, INCX LD a2, 0 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) daddu X, X, INCX LD a4, 0 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) daddu X, X, INCX LD a6, 0 * SIZE(X) daddu X, X, INCX LD a7, 0 * SIZE(X) daddu X, X, INCX LD a8, 0 * SIZE(X) daddiu I, I, -1 cvt.d.s t1, a1 cvt.d.s t2, a2 cvt.d.s t3, a3 cvt.d.s t4, a4 blez I, .L24 daddu X, X, INCX .align 3 .L23: madd.d s1, s1, t1, t1 LD a1, 0 * SIZE(X) cvt.d.s t1, a5 daddu X, X, INCX madd.d s2, s2, t2, t2 LD a2, 0 * SIZE(X) cvt.d.s t2, a6 daddu X, X, INCX madd.d s1, s1, t3, t3 LD a3, 0 * SIZE(X) cvt.d.s t3, a7 daddu X, X, INCX madd.d s2, s2, t4, t4 LD a4, 0 * SIZE(X) cvt.d.s t4, a8 daddu X, X, INCX madd.d s1, s1, t1, t1 LD a5, 0 * SIZE(X) cvt.d.s t1, a1 daddu X, X, INCX madd.d s2, s2, t2, t2 LD a6, 0 * SIZE(X) cvt.d.s t2, a2 daddu X, X, INCX madd.d s1, s1, t3, t3 LD a7, 0 * SIZE(X) cvt.d.s t3, a3 daddu X, X, INCX madd.d s2, s2, t4, t4 LD a8, 0 * SIZE(X) cvt.d.s t4, a4 daddiu I, I, -1 bgtz I, .L23 daddu X, X, INCX .align 3 .L24: madd.d s1, s1, t1, t1 cvt.d.s t1, a5 madd.d s2, s2, t2, t2 cvt.d.s t2, a6 madd.d s1, s1, t3, t3 cvt.d.s t3, a7 madd.d s2, s2, t4, t4 cvt.d.s t4, a8 madd.d s1, s1, t1, t1 madd.d s2, s2, t2, t2 madd.d s1, s1, t3, t3 madd.d s2, s2, t4, t4 .align 3 .L25: andi I, N, 7 blez I, .L999 NOP .align 3 .L26: LD a1, 0 * SIZE(X) daddiu I, I, -1 cvt.d.s t1, a1 daddu X, X, INCX bgtz I, .L26 madd.d s1, s1, t1, t1 .align 3 .L999: add.d s1, s1, s2 sqrt.d s1, s1 j $31 cvt.s.d s1, s1 EPILOGUE 
OpenBLAS-0.2.20/kernel/mips64/swap.S000066400000000000000000000176431313527062700166570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $8 #define INCX $9 #define Y $10 #define INCY $11 #define I $2 #define TEMP $3 #define XX $5 #define YY $6 #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define a5 $f4 #define a6 $f5 #define a7 $f6 #define a8 $f7 #define b1 $f8 #define b2 $f9 #define b3 $f10 #define b4 $f11 #define b5 $f12 #define b6 $f13 #define b7 $f14 #define b8 $f15 PROLOGUE li TEMP, SIZE NOP blez N, .L999 dsll INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 dsll INCY, INCY, BASE_SHIFT bne INCY, TEMP, .L20 dsra I, N, 3 blez I, .L15 daddiu I, I, -1 LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) LD a2, 1 * SIZE(X) LD b2, 1 * SIZE(Y) LD a3, 2 * SIZE(X) LD b3, 2 * SIZE(Y) LD a4, 3 * SIZE(X) LD b4, 3 * SIZE(Y) LD a5, 4 * SIZE(X) LD b5, 4 * SIZE(Y) LD a6, 5 * SIZE(X) LD b6, 5 * SIZE(Y) LD a7, 6 * SIZE(X) LD b7, 6 * SIZE(Y) LD a8, 7 * SIZE(X) LD b8, 7 * SIZE(Y) blez I, .L13 NOP .align 3 .L12: ST a1, 0 * SIZE(Y) LD a1, 8 * SIZE(X) ST b1, 0 * SIZE(X) LD b1, 8 * SIZE(Y) ST a2, 1 * SIZE(Y) LD a2, 9 * SIZE(X) ST b2, 1 * SIZE(X) LD b2, 9 * SIZE(Y) ST a3, 2 * SIZE(Y) LD a3, 10 * SIZE(X) ST b3, 2 * SIZE(X) LD b3, 10 * SIZE(Y) ST a4, 3 * SIZE(Y) LD a4, 11 * SIZE(X) ST b4, 3 * SIZE(X) LD b4, 11 * SIZE(Y) ST a5, 4 * SIZE(Y) LD a5, 12 * SIZE(X) ST b5, 4 * SIZE(X) LD b5, 12 * SIZE(Y) ST a6, 5 * SIZE(Y) LD a6, 13 * SIZE(X) ST b6, 5 * SIZE(X) LD b6, 13 * SIZE(Y) ST a7, 6 * SIZE(Y) LD a7, 14 * SIZE(X) ST b7, 6 * SIZE(X) LD b7, 14 * SIZE(Y) ST a8, 7 * SIZE(Y) LD a8, 15 * SIZE(X) ST b8, 7 * SIZE(X) LD b8, 15 * SIZE(Y) daddiu I, I, -1 daddiu X, X, 8 * SIZE bgtz I, .L12 daddiu Y, Y, 8 * SIZE .align 3 .L13: ST a1, 0 * SIZE(Y) ST b1, 0 * SIZE(X) ST a2, 1 * SIZE(Y) ST b2, 1 * SIZE(X) ST a3, 2 * SIZE(Y) ST b3, 2 * SIZE(X) ST a4, 3 * SIZE(Y) ST b4, 3 * SIZE(X) ST a5, 4 * SIZE(Y) ST b5, 4 * SIZE(X) ST a6, 5 * SIZE(Y) ST b6, 5 * SIZE(X) ST a7, 6 * SIZE(Y) ST b7, 6 * SIZE(X) ST a8, 7 * SIZE(Y) ST b8, 7 * SIZE(X) daddiu X, X, 8 * SIZE daddiu Y, Y, 8 * SIZE .align 3 .L15: andi I, N, 7 blez I, .L999 NOP .align 3 .L16: LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) daddiu X, X, SIZE daddiu I, I, -1 daddiu Y, Y, SIZE ST b1, -1 * SIZE(X) bgtz I, .L16 ST a1, -1 * SIZE(Y) j .L999 NOP .align 3 .L20: dsra I, N, 3 move XX, X move YY, Y blez I, .L25 daddiu I, I, -1 LD a1, 0 * SIZE(X) daddu X, X, INCX LD b1, 0 * SIZE(Y) daddu Y, Y, INCY LD a2, 0 * SIZE(X) daddu X, X, INCX LD b2, 0 * SIZE(Y) daddu Y, Y, INCY LD a3, 0 * SIZE(X) daddu X, X, INCX LD b3, 0 * SIZE(Y) daddu Y, Y, INCY LD a4, 0 * SIZE(X) daddu X, X, INCX LD b4, 0 * SIZE(Y) daddu Y, Y, INCY LD a5, 0 * SIZE(X) daddu X, X, INCX LD b5, 0 * SIZE(Y) daddu Y, Y, INCY LD a6, 0 * SIZE(X) daddu X, X, INCX LD b6, 0 * SIZE(Y) daddu Y, Y, INCY LD a7, 0 * SIZE(X) daddu X, X, INCX LD b7, 0 * SIZE(Y) daddu Y, Y, INCY LD a8, 0 * SIZE(X) daddu X, X, INCX LD b8, 0 * SIZE(Y) daddu Y, Y, INCY blez I, .L23 NOP .align 3 .L22: ST a1, 0 * SIZE(YY) daddu YY, YY, INCY LD a1, 0 * SIZE(X) daddu X, X, INCX ST b1, 0 * SIZE(XX) daddu XX, XX, INCX LD b1, 0 * SIZE(Y) daddu Y, Y, INCY ST a2, 0 * SIZE(YY) daddu YY, YY, INCY LD a2, 0 * SIZE(X) daddu X, X, INCX ST b2, 0 * SIZE(XX) daddu XX, XX, INCX LD b2, 0 * SIZE(Y) daddu Y, Y, INCY ST a3, 0 * SIZE(YY) daddu YY, YY, INCY LD a3, 0 * SIZE(X) daddu X, X, INCX ST b3, 0 * SIZE(XX) daddu XX, XX, INCX LD b3, 0 * SIZE(Y) daddu Y, Y, INCY ST a4, 0 * SIZE(YY) daddu YY, YY, INCY LD a4, 0 * SIZE(X) daddu X, X, INCX ST b4, 0 * SIZE(XX) daddu XX, XX, INCX LD b4, 0 * SIZE(Y) daddu 
Y, Y, INCY ST a5, 0 * SIZE(YY) daddu YY, YY, INCY LD a5, 0 * SIZE(X) daddu X, X, INCX ST b5, 0 * SIZE(XX) daddu XX, XX, INCX LD b5, 0 * SIZE(Y) daddu Y, Y, INCY ST a6, 0 * SIZE(YY) daddu YY, YY, INCY LD a6, 0 * SIZE(X) daddu X, X, INCX ST b6, 0 * SIZE(XX) daddu XX, XX, INCX LD b6, 0 * SIZE(Y) daddu Y, Y, INCY ST a7, 0 * SIZE(YY) daddu YY, YY, INCY LD a7, 0 * SIZE(X) daddu X, X, INCX ST b7, 0 * SIZE(XX) daddu XX, XX, INCX LD b7, 0 * SIZE(Y) daddu Y, Y, INCY ST a8, 0 * SIZE(YY) daddu YY, YY, INCY LD a8, 0 * SIZE(X) daddu X, X, INCX ST b8, 0 * SIZE(XX) daddu XX, XX, INCX LD b8, 0 * SIZE(Y) daddiu I, I, -1 bgtz I, .L22 daddu Y, Y, INCY .align 3 .L23: ST a1, 0 * SIZE(YY) daddu YY, YY, INCY ST b1, 0 * SIZE(XX) daddu XX, XX, INCX ST a2, 0 * SIZE(YY) daddu YY, YY, INCY ST b2, 0 * SIZE(XX) daddu XX, XX, INCX ST a3, 0 * SIZE(YY) daddu YY, YY, INCY ST b3, 0 * SIZE(XX) daddu XX, XX, INCX ST a4, 0 * SIZE(YY) daddu YY, YY, INCY ST b4, 0 * SIZE(XX) daddu XX, XX, INCX ST a5, 0 * SIZE(YY) daddu YY, YY, INCY ST b5, 0 * SIZE(XX) daddu XX, XX, INCX ST a6, 0 * SIZE(YY) daddu YY, YY, INCY ST b6, 0 * SIZE(XX) daddu XX, XX, INCX ST a7, 0 * SIZE(YY) daddu YY, YY, INCY ST b7, 0 * SIZE(XX) daddu XX, XX, INCX ST a8, 0 * SIZE(YY) daddu YY, YY, INCY ST b8, 0 * SIZE(XX) daddu XX, XX, INCX .align 3 .L25: andi I, N, 7 blez I, .L999 NOP .align 3 .L26: LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) daddiu I, I, -1 ST a1, 0 * SIZE(Y) ST b1, 0 * SIZE(X) daddu X, X, INCX bgtz I, .L26 daddu Y, Y, INCY .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/symv_L.S000066400000000000000000000314421313527062700171470ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M $4 #define A $6 #define LDA $7 #define X $8 #define INCX $9 #define Y $10 #define INCY $11 #define BUFFER $5 #define XX $12 #define YY $13 #define I $14 #define IS $15 #define AO1 $16 #define AO2 $17 #define Y1 $18 #define TEMP $19 #define II INCX #define ALPHA $f13 #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define a5 $f4 #define a6 $f5 #define a7 $f6 #define a8 $f7 #define alpha1 $f8 #define alpha2 $f9 #define x1 $f10 #define x2 $f11 #define x3 $f12 #define x4 $f14 #define xsum1 $f15 #define xsum2 $f16 #define ysum1 $f17 #define ysum2 $f18 #define ysum3 $f19 #define ysum4 $f20 PROLOGUE LDARG BUFFER, 0($sp) daddiu $sp, $sp, -32 SDARG $16, 0($sp) dsll LDA, LDA, BASE_SHIFT SDARG $17, 8($sp) dsll INCX, INCX, BASE_SHIFT SDARG $18, 16($sp) dsll INCY, INCY, BASE_SHIFT SDARG $19, 24($sp) nop blez M, .L999 li IS, SIZE beq IS, INCX, .L05 move Y1, Y dsra I, M, 2 move XX, X blez I, .L02 move X, BUFFER .align 3 .L01: LD a1, 0 * SIZE(XX) daddu XX, XX, INCX LD a2, 0 * SIZE(XX) daddu XX, XX, INCX LD a3, 0 * SIZE(XX) daddu XX, XX, INCX LD a4, 0 * SIZE(XX) daddu XX, XX, INCX ST a1, 0 * SIZE(BUFFER) ST a2, 1 * SIZE(BUFFER) ST a3, 2 * SIZE(BUFFER) ST a4, 3 * SIZE(BUFFER) daddiu I, I, -1 bgtz I, .L01 daddiu BUFFER, BUFFER, 4 * SIZE .align 3 .L02: andi I, M, 3 blez I, .L05 NOP .align 3 .L03: LD a1, 0 * SIZE(XX) daddu XX, XX, INCX ST a1, 0 * SIZE(BUFFER) daddiu I, I, -1 bgtz I, .L03 daddiu BUFFER, BUFFER, 1 * SIZE .align 3 .L05: beq IS, INCY, .L10 daddiu BUFFER, BUFFER, 255 li TEMP, -256 and BUFFER, BUFFER, TEMP dsra I, M, 2 move Y1, BUFFER blez I, .L07 move YY, Y .align 3 .L06: LD a1, 0 * SIZE(YY) daddu YY, YY, INCY LD a2, 0 * SIZE(YY) daddu YY, YY, INCY LD a3, 0 * SIZE(YY) daddu YY, YY, INCY LD a4, 0 * SIZE(YY) daddu YY, YY, INCY ST a1, 0 * SIZE(BUFFER) ST a2, 1 * SIZE(BUFFER) ST a3, 2 * SIZE(BUFFER) ST a4, 3 * SIZE(BUFFER) daddiu I, I, -1 bgtz I, .L06 daddiu BUFFER, BUFFER, 4 * SIZE .align 3 .L07: andi I, M, 3 blez I, .L10 NOP .align 3 .L08: LD a1, 0 * SIZE(YY) daddu YY, YY, INCY ST a1, 0 * SIZE(BUFFER) daddiu I, I, -1 bgtz I, .L08 daddiu BUFFER, BUFFER, 1 * SIZE .align 3 .L10: slti TEMP, M, 2 nop bgtz TEMP, .L20 li IS, 0 .align 3 .L11: dsll TEMP, IS, BASE_SHIFT nop daddu XX, X, TEMP daddu YY, Y1, TEMP LD alpha1, 0 * SIZE(XX) move AO1, A LD alpha2, 1 * SIZE(XX) daddiu XX, XX, 2 * SIZE LD a1, 0 * SIZE(AO1) daddu AO2, A, LDA LD a2, 1 * SIZE(AO1) daddiu AO1, AO1, 2 * SIZE LD a3, 0 * SIZE(AO2) daddu A, AO2, LDA LD a4, 1 * SIZE(AO2) daddiu AO2, AO2, 2 * SIZE MUL xsum1, alpha1, a1 daddiu A, A, 2 * SIZE MUL xsum2, alpha1, a2 dsubu II, M, IS MADD xsum1, xsum1, alpha2, a2 MADD xsum2, xsum2, alpha2, a4 daddiu II, II, - 2 MUL alpha1, ALPHA, alpha1 daddiu YY, YY, 2 * SIZE MUL alpha2, ALPHA, alpha2 dsra I, II, 3 blez I, .L15 daddiu I, I, -1 LD x1, 0 * SIZE(XX) LD x2, 1 * SIZE(XX) LD x3, 2 * SIZE(XX) LD a1, 0 * SIZE(AO1) LD a2, 1 * SIZE(AO1) LD a5, 2 * SIZE(AO1) LD a6, 3 * SIZE(AO1) LD a3, 0 * SIZE(AO2) LD a4, 1 * SIZE(AO2) LD a7, 2 * SIZE(AO2) LD a8, 3 * SIZE(AO2) LD ysum1, 0 * SIZE(YY) LD ysum2, 1 * SIZE(YY) blez I, .L13 LD ysum3, 2 * SIZE(YY) .align 3 .L12: MADD ysum1, ysum1, alpha1, a1 LD ysum4, 3 * SIZE(YY) MADD ysum2, ysum2, alpha1, a2 LD x4, 3 * SIZE(XX) MADD xsum1, xsum1, x1, a1 LD a1, 4 * SIZE(AO1) MADD xsum2, xsum2, x1, a3 LD x1, 4 * SIZE(XX) MADD ysum1, ysum1, alpha2, a3 LD a3, 4 * SIZE(AO2) MADD ysum2, ysum2, alpha2, a4 daddiu I, I, -1 MADD xsum1, xsum1, x2, a2 LD a2, 5 * 
SIZE(AO1) MADD xsum2, xsum2, x2, a4 LD a4, 5 * SIZE(AO2) ST ysum1, 0 * SIZE(YY) LD ysum1, 4 * SIZE(YY) ST ysum2, 1 * SIZE(YY) LD ysum2, 5 * SIZE(YY) MADD ysum3, ysum3, alpha1, a5 nop MADD ysum4, ysum4, alpha1, a6 LD x2, 5 * SIZE(XX) MADD xsum1, xsum1, x3, a5 LD a5, 6 * SIZE(AO1) MADD xsum2, xsum2, x3, a7 LD x3, 6 * SIZE(XX) MADD ysum3, ysum3, alpha2, a7 LD a7, 6 * SIZE(AO2) MADD ysum4, ysum4, alpha2, a8 daddiu XX, XX, 8 * SIZE MADD xsum1, xsum1, x4, a6 LD a6, 7 * SIZE(AO1) MADD xsum2, xsum2, x4, a8 LD a8, 7 * SIZE(AO2) ST ysum3, 2 * SIZE(YY) LD ysum3, 6 * SIZE(YY) ST ysum4, 3 * SIZE(YY) LD ysum4, 7 * SIZE(YY) MADD ysum1, ysum1, alpha1, a1 daddiu AO2, AO2, 8 * SIZE MADD ysum2, ysum2, alpha1, a2 LD x4,-1 * SIZE(XX) MADD xsum1, xsum1, x1, a1 LD a1, 8 * SIZE(AO1) MADD xsum2, xsum2, x1, a3 LD x1, 0 * SIZE(XX) MADD ysum1, ysum1, alpha2, a3 LD a3, 0 * SIZE(AO2) MADD ysum2, ysum2, alpha2, a4 nop MADD xsum1, xsum1, x2, a2 LD a2, 9 * SIZE(AO1) MADD xsum2, xsum2, x2, a4 LD a4, 1 * SIZE(AO2) ST ysum1, 4 * SIZE(YY) LD ysum1, 8 * SIZE(YY) ST ysum2, 5 * SIZE(YY) LD ysum2, 9 * SIZE(YY) MADD ysum3, ysum3, alpha1, a5 daddiu AO1, AO1, 8 * SIZE MADD ysum4, ysum4, alpha1, a6 LD x2, 1 * SIZE(XX) MADD xsum1, xsum1, x3, a5 LD a5, 2 * SIZE(AO1) MADD xsum2, xsum2, x3, a7 LD x3, 2 * SIZE(XX) MADD ysum3, ysum3, alpha2, a7 LD a7, 2 * SIZE(AO2) MADD ysum4, ysum4, alpha2, a8 daddiu YY, YY, 8 * SIZE MADD xsum1, xsum1, x4, a6 LD a6, 3 * SIZE(AO1) MADD xsum2, xsum2, x4, a8 LD a8, 3 * SIZE(AO2) ST ysum3,-2 * SIZE(YY) LD ysum3, 2 * SIZE(YY) bgtz I, .L12 ST ysum4,-1 * SIZE(YY) .align 3 .L13: MADD ysum1, ysum1, alpha1, a1 LD ysum4, 3 * SIZE(YY) MADD ysum2, ysum2, alpha1, a2 LD x4, 3 * SIZE(XX) MADD xsum1, xsum1, x1, a1 LD a1, 4 * SIZE(AO1) MADD xsum2, xsum2, x1, a3 LD x1, 4 * SIZE(XX) MADD ysum1, ysum1, alpha2, a3 LD a3, 4 * SIZE(AO2) MADD ysum2, ysum2, alpha2, a4 MADD xsum1, xsum1, x2, a2 LD a2, 5 * SIZE(AO1) MADD xsum2, xsum2, x2, a4 LD a4, 5 * SIZE(AO2) LD x2, 5 * SIZE(XX) ST ysum1, 0 * SIZE(YY) ST ysum2, 1 * SIZE(YY) LD ysum1, 4 * SIZE(YY) LD ysum2, 5 * SIZE(YY) MADD ysum3, ysum3, alpha1, a5 MADD ysum4, ysum4, alpha1, a6 MADD xsum1, xsum1, x3, a5 LD a5, 6 * SIZE(AO1) MADD xsum2, xsum2, x3, a7 LD x3, 6 * SIZE(XX) MADD ysum3, ysum3, alpha2, a7 LD a7, 6 * SIZE(AO2) MADD ysum4, ysum4, alpha2, a8 MADD xsum1, xsum1, x4, a6 LD a6, 7 * SIZE(AO1) MADD xsum2, xsum2, x4, a8 LD a8, 7 * SIZE(AO2) LD x4, 7 * SIZE(XX) ST ysum3, 2 * SIZE(YY) ST ysum4, 3 * SIZE(YY) LD ysum3, 6 * SIZE(YY) LD ysum4, 7 * SIZE(YY) MADD ysum1, ysum1, alpha1, a1 MADD ysum2, ysum2, alpha1, a2 MADD xsum1, xsum1, x1, a1 MADD xsum2, xsum2, x1, a3 MADD ysum1, ysum1, alpha2, a3 MADD ysum2, ysum2, alpha2, a4 MADD xsum1, xsum1, x2, a2 MADD xsum2, xsum2, x2, a4 MADD ysum3, ysum3, alpha1, a5 MADD ysum4, ysum4, alpha1, a6 MADD xsum1, xsum1, x3, a5 MADD xsum2, xsum2, x3, a7 MADD ysum3, ysum3, alpha2, a7 daddiu XX, XX, 8 * SIZE MADD ysum4, ysum4, alpha2, a8 daddiu AO1, AO1, 8 * SIZE MADD xsum1, xsum1, x4, a6 daddiu AO2, AO2, 8 * SIZE MADD xsum2, xsum2, x4, a8 ST ysum1, 4 * SIZE(YY) ST ysum2, 5 * SIZE(YY) ST ysum3, 6 * SIZE(YY) ST ysum4, 7 * SIZE(YY) daddiu YY, YY, 8 * SIZE .align 3 .L15: andi I, II, 4 NOP blez I, .L16 NOP LD x1, 0 * SIZE(XX) LD x2, 1 * SIZE(XX) LD x3, 2 * SIZE(XX) LD x4, 3 * SIZE(XX) daddiu XX, XX, 4 * SIZE LD a1, 0 * SIZE(AO1) LD a2, 1 * SIZE(AO1) LD a5, 2 * SIZE(AO1) LD a6, 3 * SIZE(AO1) daddiu AO1, AO1, 4 * SIZE LD a3, 0 * SIZE(AO2) LD a4, 1 * SIZE(AO2) LD a7, 2 * SIZE(AO2) LD a8, 3 * SIZE(AO2) daddiu AO2, AO2, 4 * SIZE LD ysum1, 0 * SIZE(YY) LD ysum2, 1 
* SIZE(YY) LD ysum3, 2 * SIZE(YY) LD ysum4, 3 * SIZE(YY) MADD ysum1, ysum1, alpha1, a1 MADD ysum2, ysum2, alpha1, a2 MADD xsum1, xsum1, x1, a1 MADD xsum2, xsum2, x1, a3 MADD ysum1, ysum1, alpha2, a3 MADD ysum2, ysum2, alpha2, a4 MADD xsum1, xsum1, x2, a2 MADD xsum2, xsum2, x2, a4 MADD ysum3, ysum3, alpha1, a5 MADD ysum4, ysum4, alpha1, a6 MADD xsum1, xsum1, x3, a5 MADD xsum2, xsum2, x3, a7 MADD ysum3, ysum3, alpha2, a7 MADD ysum4, ysum4, alpha2, a8 MADD xsum1, xsum1, x4, a6 MADD xsum2, xsum2, x4, a8 ST ysum1, 0 * SIZE(YY) ST ysum2, 1 * SIZE(YY) ST ysum3, 2 * SIZE(YY) ST ysum4, 3 * SIZE(YY) daddiu YY, YY, 4 * SIZE .align 3 .L16: andi I, II, 2 NOP blez I, .L17 NOP LD x1, 0 * SIZE(XX) LD x2, 1 * SIZE(XX) daddiu XX, XX, 2 * SIZE LD a1, 0 * SIZE(AO1) LD a2, 1 * SIZE(AO1) daddiu AO1, AO1, 2 * SIZE LD a3, 0 * SIZE(AO2) LD a4, 1 * SIZE(AO2) daddiu AO2, AO2, 2 * SIZE LD ysum1, 0 * SIZE(YY) LD ysum2, 1 * SIZE(YY) MADD ysum1, ysum1, alpha1, a1 MADD ysum2, ysum2, alpha1, a2 MADD xsum1, xsum1, x1, a1 MADD xsum2, xsum2, x1, a3 MADD ysum1, ysum1, alpha2, a3 MADD ysum2, ysum2, alpha2, a4 MADD xsum1, xsum1, x2, a2 MADD xsum2, xsum2, x2, a4 ST ysum1, 0 * SIZE(YY) ST ysum2, 1 * SIZE(YY) daddiu YY, YY, 2 * SIZE .align 3 .L17: andi I, M, 1 NOP blez I, .L19 NOP LD x1, 0 * SIZE(XX) daddiu XX, XX, 1 * SIZE LD a1, 0 * SIZE(AO1) daddiu AO1, AO1, 1 * SIZE LD a3, 0 * SIZE(AO2) daddiu AO2, AO2, 1 * SIZE LD ysum1, 0 * SIZE(YY) MADD ysum1, ysum1, alpha1, a1 MADD xsum1, xsum1, x1, a1 MADD ysum1, ysum1, alpha2, a3 MADD xsum2, xsum2, x1, a3 ST ysum1, 0 * SIZE(YY) .align 3 .L19: dsll TEMP, IS, BASE_SHIFT daddu TEMP, Y1, TEMP LD ysum1, 0 * SIZE(TEMP) LD ysum2, 1 * SIZE(TEMP) MADD ysum1, ysum1, ALPHA, xsum1 MADD ysum2, ysum2, ALPHA, xsum2 ST ysum1, 0 * SIZE(TEMP) ST ysum2, 1 * SIZE(TEMP) daddiu TEMP, IS, 4 slt TEMP, M, TEMP beqz TEMP, .L11 daddiu IS, IS, 2 .align 3 .L20: andi I, M, 1 dsll TEMP, IS, BASE_SHIFT blez I, .L900 daddu XX, X, TEMP daddu YY, Y1, TEMP LD x1, 0 * SIZE(XX) LD ysum1, 0 * SIZE(YY) LD a1, 0 * SIZE(A) MUL xsum1, a1, x1 MADD ysum1, ysum1, ALPHA, xsum1 ST ysum1, 0 * SIZE(YY) .align 3 .L900: li IS, SIZE beq INCY, IS, .L999 NOP dsra I, M, 2 blez I, .L905 NOP .align 3 .L902: LD a1, 0 * SIZE(Y1) LD a2, 1 * SIZE(Y1) LD a3, 2 * SIZE(Y1) LD a4, 3 * SIZE(Y1) ST a1, 0 * SIZE(Y) daddu Y, Y, INCY ST a2, 0 * SIZE(Y) daddu Y, Y, INCY ST a3, 0 * SIZE(Y) daddu Y, Y, INCY ST a4, 0 * SIZE(Y) daddu Y, Y, INCY daddiu I, I, -1 bgtz I, .L902 daddiu Y1, Y1, 4 * SIZE .align 3 .L905: andi I, M, 3 blez I, .L999 NOP .align 3 .L906: LD a1, 0 * SIZE(Y1) daddiu Y1, Y1, 1 * SIZE ST a1, 0 * SIZE(Y) daddiu I, I, -1 bgtz I, .L906 daddu Y, Y, INCY .align 3 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) LDARG $18, 16($sp) LDARG $19, 24($sp) j $31 daddiu $sp, $sp, 32 EPILOGUE OpenBLAS-0.2.20/kernel/mips64/symv_U.S000066400000000000000000000353241313527062700171630ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M $4 #define A $6 #define LDA $7 #define X $8 #define INCX $9 #define Y $10 #define INCY $11 #define BUFFER $5 #define XX $12 #define YY $13 #define I $14 #define IS $15 #define AO1 $16 #define AO2 $17 #define Y1 $18 #define TEMP $19 #define ALPHA $f13 #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define a5 $f4 #define a6 $f5 #define a7 $f6 #define a8 $f7 #define alpha1 $f8 #define alpha2 $f9 #define x1 $f10 #define x2 $f11 #define x3 $f12 #define x4 $f14 #define xsum1 $f15 #define xsum2 $f16 #define ysum1 $f17 #define ysum2 $f18 #define ysum3 $f19 #define ysum4 $f20 PROLOGUE LDARG BUFFER, 0($sp) daddiu $sp, $sp, -32 SDARG $16, 0($sp) dsll LDA, LDA, BASE_SHIFT SDARG $17, 8($sp) dsll INCX, INCX, BASE_SHIFT SDARG $18, 16($sp) dsll INCY, INCY, BASE_SHIFT SDARG $19, 24($sp) nop blez M, .L999 li IS, SIZE beq IS, INCX, .L05 move Y1, Y dsra I, M, 2 move XX, X blez I, .L02 move X, BUFFER .align 3 .L01: LD a1, 0 * SIZE(XX) daddu XX, XX, INCX LD a2, 0 * SIZE(XX) daddu XX, XX, INCX LD a3, 0 * SIZE(XX) daddu XX, XX, INCX LD a4, 0 * SIZE(XX) daddu XX, XX, INCX ST a1, 0 * SIZE(BUFFER) ST a2, 1 * SIZE(BUFFER) ST a3, 2 * SIZE(BUFFER) ST a4, 3 * SIZE(BUFFER) daddiu I, I, -1 bgtz I, .L01 daddiu BUFFER, BUFFER, 4 * SIZE .align 3 .L02: andi I, M, 3 blez I, .L05 NOP .align 3 .L03: LD a1, 0 * SIZE(XX) daddu XX, XX, INCX ST a1, 0 * SIZE(BUFFER) daddiu I, I, -1 bgtz I, .L03 daddiu BUFFER, BUFFER, 1 * SIZE .align 3 .L05: beq IS, INCY, .L10 daddiu BUFFER, BUFFER, 255 li TEMP, -256 and BUFFER, BUFFER, TEMP dsra I, M, 2 move Y1, BUFFER blez I, .L07 move YY, Y .align 3 .L06: LD a1, 0 * SIZE(YY) daddu YY, YY, INCY LD a2, 0 * SIZE(YY) daddu YY, YY, INCY LD a3, 0 * SIZE(YY) daddu YY, YY, INCY LD a4, 0 * SIZE(YY) daddu YY, YY, INCY ST a1, 0 * SIZE(BUFFER) ST a2, 1 * SIZE(BUFFER) ST a3, 2 * SIZE(BUFFER) ST a4, 3 * SIZE(BUFFER) daddiu I, I, -1 bgtz I, .L06 daddiu BUFFER, BUFFER, 4 * SIZE .align 3 .L07: andi I, M, 3 blez I, .L10 NOP .align 3 .L08: LD a1, 0 * SIZE(YY) daddu YY, YY, INCY ST a1, 0 * SIZE(BUFFER) daddiu I, I, -1 bgtz I, .L08 daddiu BUFFER, BUFFER, 1 * SIZE .align 3 .L10: 
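/* --- Annotation added for readability; not part of the original OpenBLAS
   source.  A reading of the main loop that follows in this upper-triangle
   symv kernel: .L11 processes columns IS and IS+1 together, with
   alpha1 = ALPHA*x[IS] and alpha2 = ALPHA*x[IS+1].  The unrolled body
   updates y[i] += alpha1*A[i][IS] + alpha2*A[i][IS+1] for i < IS while
   accumulating xsum1 += x[i]*A[i][IS] and xsum2 += x[i]*A[i][IS+1];
   .L19 then folds in the 2x2 diagonal block and adds the scaled dot
   products into y[IS] and y[IS+1], and .L20 handles a leftover odd column.
   A rough reference sketch in C of the same update, assuming unit strides
   (the kernel copies X and Y through BUFFER when INCX or INCY is not 1);
   the function name and signature are illustrative only, not OpenBLAS API:

     static void symv_u_ref(int m, double alpha, const double *a, int lda,
                            const double *x, double *y)
     {
       int is, i;
       for (is = 0; is + 1 < m; is += 2) {
         double alpha1 = alpha * x[is], alpha2 = alpha * x[is + 1];
         double xsum1 = 0.0, xsum2 = 0.0;
         for (i = 0; i < is; i++) {
           y[i]  += alpha1 * a[i + is * lda] + alpha2 * a[i + (is + 1) * lda];
           xsum1 += x[i] * a[i + is * lda];
           xsum2 += x[i] * a[i + (is + 1) * lda];
         }
         y[is]     += alpha * xsum1 + alpha1 * a[is + is * lda]
                    + alpha2 * a[is + (is + 1) * lda];
         y[is + 1] += alpha * xsum2 + alpha1 * a[is + (is + 1) * lda]
                    + alpha2 * a[is + 1 + (is + 1) * lda];
       }
       if (m & 1) {
         double alpha1 = alpha * x[m - 1], xsum = 0.0;
         for (i = 0; i < m - 1; i++) {
           y[i] += alpha1 * a[i + (m - 1) * lda];
           xsum += x[i] * a[i + (m - 1) * lda];
         }
         y[m - 1] += alpha * xsum + alpha1 * a[(m - 1) + (m - 1) * lda];
       }
     }
   --- end of added annotation --- */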
slti TEMP, M, 2 nop bgtz TEMP, .L20 li IS, 0 .align 3 .L11: dsll TEMP, IS, BASE_SHIFT daddu TEMP, X, TEMP LD alpha1, 0 * SIZE(TEMP) LD alpha2, 1 * SIZE(TEMP) move AO1, A dsra I, IS, 3 daddu AO2, A, LDA daddu A, AO2, LDA MTC $0, xsum1 MTC $0, xsum2 move XX, X MUL alpha1, ALPHA, alpha1 move YY, Y1 MUL alpha2, ALPHA, alpha2 blez I, .L15 daddiu I, I, -1 LD x1, 0 * SIZE(XX) LD x2, 1 * SIZE(XX) LD x3, 2 * SIZE(XX) LD a1, 0 * SIZE(AO1) LD a2, 1 * SIZE(AO1) LD a5, 2 * SIZE(AO1) LD a6, 3 * SIZE(AO1) LD a3, 0 * SIZE(AO2) LD a4, 1 * SIZE(AO2) LD a7, 2 * SIZE(AO2) LD a8, 3 * SIZE(AO2) LD ysum1, 0 * SIZE(YY) LD ysum2, 1 * SIZE(YY) blez I, .L13 LD ysum3, 2 * SIZE(YY) .align 3 .L12: MADD ysum1, ysum1, alpha1, a1 LD ysum4, 3 * SIZE(YY) MADD ysum2, ysum2, alpha1, a2 LD x4, 3 * SIZE(XX) MADD xsum1, xsum1, x1, a1 LD a1, 4 * SIZE(AO1) MADD xsum2, xsum2, x1, a3 LD x1, 4 * SIZE(XX) MADD ysum1, ysum1, alpha2, a3 LD a3, 4 * SIZE(AO2) MADD ysum2, ysum2, alpha2, a4 daddiu I, I, -1 MADD xsum1, xsum1, x2, a2 LD a2, 5 * SIZE(AO1) MADD xsum2, xsum2, x2, a4 LD a4, 5 * SIZE(AO2) ST ysum1, 0 * SIZE(YY) LD ysum1, 4 * SIZE(YY) ST ysum2, 1 * SIZE(YY) LD ysum2, 5 * SIZE(YY) MADD ysum3, ysum3, alpha1, a5 nop MADD ysum4, ysum4, alpha1, a6 LD x2, 5 * SIZE(XX) MADD xsum1, xsum1, x3, a5 LD a5, 6 * SIZE(AO1) MADD xsum2, xsum2, x3, a7 LD x3, 6 * SIZE(XX) MADD ysum3, ysum3, alpha2, a7 LD a7, 6 * SIZE(AO2) MADD ysum4, ysum4, alpha2, a8 daddiu XX, XX, 8 * SIZE MADD xsum1, xsum1, x4, a6 LD a6, 7 * SIZE(AO1) MADD xsum2, xsum2, x4, a8 LD a8, 7 * SIZE(AO2) ST ysum3, 2 * SIZE(YY) LD ysum3, 6 * SIZE(YY) ST ysum4, 3 * SIZE(YY) LD ysum4, 7 * SIZE(YY) MADD ysum1, ysum1, alpha1, a1 daddiu AO2, AO2, 8 * SIZE MADD ysum2, ysum2, alpha1, a2 LD x4,-1 * SIZE(XX) MADD xsum1, xsum1, x1, a1 LD a1, 8 * SIZE(AO1) MADD xsum2, xsum2, x1, a3 LD x1, 0 * SIZE(XX) MADD ysum1, ysum1, alpha2, a3 LD a3, 0 * SIZE(AO2) MADD ysum2, ysum2, alpha2, a4 nop MADD xsum1, xsum1, x2, a2 LD a2, 9 * SIZE(AO1) MADD xsum2, xsum2, x2, a4 LD a4, 1 * SIZE(AO2) ST ysum1, 4 * SIZE(YY) LD ysum1, 8 * SIZE(YY) ST ysum2, 5 * SIZE(YY) LD ysum2, 9 * SIZE(YY) MADD ysum3, ysum3, alpha1, a5 daddiu AO1, AO1, 8 * SIZE MADD ysum4, ysum4, alpha1, a6 LD x2, 1 * SIZE(XX) MADD xsum1, xsum1, x3, a5 LD a5, 2 * SIZE(AO1) MADD xsum2, xsum2, x3, a7 LD x3, 2 * SIZE(XX) MADD ysum3, ysum3, alpha2, a7 LD a7, 2 * SIZE(AO2) MADD ysum4, ysum4, alpha2, a8 daddiu YY, YY, 8 * SIZE MADD xsum1, xsum1, x4, a6 LD a6, 3 * SIZE(AO1) MADD xsum2, xsum2, x4, a8 LD a8, 3 * SIZE(AO2) ST ysum3,-2 * SIZE(YY) LD ysum3, 2 * SIZE(YY) bgtz I, .L12 ST ysum4,-1 * SIZE(YY) .align 3 .L13: MADD ysum1, ysum1, alpha1, a1 LD ysum4, 3 * SIZE(YY) MADD ysum2, ysum2, alpha1, a2 LD x4, 3 * SIZE(XX) MADD xsum1, xsum1, x1, a1 LD a1, 4 * SIZE(AO1) MADD xsum2, xsum2, x1, a3 LD x1, 4 * SIZE(XX) MADD ysum1, ysum1, alpha2, a3 LD a3, 4 * SIZE(AO2) MADD ysum2, ysum2, alpha2, a4 MADD xsum1, xsum1, x2, a2 LD a2, 5 * SIZE(AO1) MADD xsum2, xsum2, x2, a4 LD a4, 5 * SIZE(AO2) LD x2, 5 * SIZE(XX) ST ysum1, 0 * SIZE(YY) ST ysum2, 1 * SIZE(YY) LD ysum1, 4 * SIZE(YY) LD ysum2, 5 * SIZE(YY) MADD ysum3, ysum3, alpha1, a5 MADD ysum4, ysum4, alpha1, a6 MADD xsum1, xsum1, x3, a5 LD a5, 6 * SIZE(AO1) MADD xsum2, xsum2, x3, a7 LD x3, 6 * SIZE(XX) MADD ysum3, ysum3, alpha2, a7 LD a7, 6 * SIZE(AO2) MADD ysum4, ysum4, alpha2, a8 MADD xsum1, xsum1, x4, a6 LD a6, 7 * SIZE(AO1) MADD xsum2, xsum2, x4, a8 LD a8, 7 * SIZE(AO2) LD x4, 7 * SIZE(XX) ST ysum3, 2 * SIZE(YY) ST ysum4, 3 * SIZE(YY) LD ysum3, 6 * SIZE(YY) LD ysum4, 7 * SIZE(YY) MADD ysum1, ysum1, alpha1, a1 MADD ysum2, 
ysum2, alpha1, a2 MADD xsum1, xsum1, x1, a1 MADD xsum2, xsum2, x1, a3 MADD ysum1, ysum1, alpha2, a3 MADD ysum2, ysum2, alpha2, a4 MADD xsum1, xsum1, x2, a2 MADD xsum2, xsum2, x2, a4 MADD ysum3, ysum3, alpha1, a5 MADD ysum4, ysum4, alpha1, a6 MADD xsum1, xsum1, x3, a5 MADD xsum2, xsum2, x3, a7 MADD ysum3, ysum3, alpha2, a7 daddiu XX, XX, 8 * SIZE MADD ysum4, ysum4, alpha2, a8 daddiu AO1, AO1, 8 * SIZE MADD xsum1, xsum1, x4, a6 daddiu AO2, AO2, 8 * SIZE MADD xsum2, xsum2, x4, a8 ST ysum1, 4 * SIZE(YY) ST ysum2, 5 * SIZE(YY) ST ysum3, 6 * SIZE(YY) ST ysum4, 7 * SIZE(YY) daddiu YY, YY, 8 * SIZE .align 3 .L15: andi I, IS, 4 NOP blez I, .L16 NOP LD x1, 0 * SIZE(XX) LD x2, 1 * SIZE(XX) LD x3, 2 * SIZE(XX) LD x4, 3 * SIZE(XX) daddiu XX, XX, 4 * SIZE LD a1, 0 * SIZE(AO1) LD a2, 1 * SIZE(AO1) LD a5, 2 * SIZE(AO1) LD a6, 3 * SIZE(AO1) daddiu AO1, AO1, 4 * SIZE LD a3, 0 * SIZE(AO2) LD a4, 1 * SIZE(AO2) LD a7, 2 * SIZE(AO2) LD a8, 3 * SIZE(AO2) daddiu AO2, AO2, 4 * SIZE LD ysum1, 0 * SIZE(YY) LD ysum2, 1 * SIZE(YY) LD ysum3, 2 * SIZE(YY) LD ysum4, 3 * SIZE(YY) MADD ysum1, ysum1, alpha1, a1 MADD ysum2, ysum2, alpha1, a2 MADD xsum1, xsum1, x1, a1 MADD xsum2, xsum2, x1, a3 MADD ysum1, ysum1, alpha2, a3 MADD ysum2, ysum2, alpha2, a4 MADD xsum1, xsum1, x2, a2 MADD xsum2, xsum2, x2, a4 MADD ysum3, ysum3, alpha1, a5 MADD ysum4, ysum4, alpha1, a6 MADD xsum1, xsum1, x3, a5 MADD xsum2, xsum2, x3, a7 MADD ysum3, ysum3, alpha2, a7 MADD ysum4, ysum4, alpha2, a8 MADD xsum1, xsum1, x4, a6 MADD xsum2, xsum2, x4, a8 ST ysum1, 0 * SIZE(YY) ST ysum2, 1 * SIZE(YY) ST ysum3, 2 * SIZE(YY) ST ysum4, 3 * SIZE(YY) daddiu YY, YY, 4 * SIZE .align 3 .L16: andi I, IS, 2 NOP blez I, .L19 NOP LD x1, 0 * SIZE(XX) LD x2, 1 * SIZE(XX) daddiu XX, XX, 2 * SIZE LD a1, 0 * SIZE(AO1) LD a2, 1 * SIZE(AO1) daddiu AO1, AO1, 2 * SIZE LD a3, 0 * SIZE(AO2) LD a4, 1 * SIZE(AO2) daddiu AO2, AO2, 2 * SIZE LD ysum1, 0 * SIZE(YY) LD ysum2, 1 * SIZE(YY) MADD ysum1, ysum1, alpha1, a1 MADD ysum2, ysum2, alpha1, a2 MADD xsum1, xsum1, x1, a1 MADD xsum2, xsum2, x1, a3 MADD ysum1, ysum1, alpha2, a3 MADD ysum2, ysum2, alpha2, a4 MADD xsum1, xsum1, x2, a2 MADD xsum2, xsum2, x2, a4 ST ysum1, 0 * SIZE(YY) ST ysum2, 1 * SIZE(YY) .align 3 .L19: dsll TEMP, IS, BASE_SHIFT daddu TEMP, Y1, TEMP LD ysum1, 0 * SIZE(TEMP) LD ysum2, 1 * SIZE(TEMP) LD a1, 0 * SIZE(AO1) LD a2, 1 * SIZE(AO1) LD a3, 0 * SIZE(AO2) LD a4, 1 * SIZE(AO2) MUL xsum1, ALPHA, xsum1 MUL xsum2, ALPHA, xsum2 MADD xsum1, xsum1, alpha1, a1 MADD xsum2, xsum2, alpha1, a3 MADD xsum1, xsum1, alpha2, a3 MADD xsum2, xsum2, alpha2, a4 ADD ysum1, ysum1, xsum1 ADD ysum2, ysum2, xsum2 ST ysum1, 0 * SIZE(TEMP) ST ysum2, 1 * SIZE(TEMP) daddiu TEMP, IS, 4 slt TEMP, M, TEMP beqz TEMP, .L11 daddiu IS, IS, 2 .align 3 .L20: andi TEMP, M, 1 nop blez TEMP, .L900 nop .align 3 dsll TEMP, IS, BASE_SHIFT daddu TEMP, X, TEMP LD alpha1, 0 * SIZE(TEMP) move AO1, A dsra I, IS, 2 daddu A, AO1, LDA MTC $0, xsum1 MTC $0, xsum2 move XX, X MUL alpha1, ALPHA, alpha1 move YY, Y1 blez I, .L25 daddiu I, I, -1 LD x1, 0 * SIZE(XX) LD x2, 1 * SIZE(XX) LD x3, 2 * SIZE(XX) LD x4, 3 * SIZE(XX) LD a1, 0 * SIZE(AO1) LD a2, 1 * SIZE(AO1) LD a3, 2 * SIZE(AO1) LD a4, 3 * SIZE(AO1) LD ysum1, 0 * SIZE(YY) LD ysum2, 1 * SIZE(YY) LD ysum3, 2 * SIZE(YY) blez I, .L23 LD ysum4, 3 * SIZE(YY) .align 3 .L22: MADD ysum1, ysum1, alpha1, a1 daddiu I, I, -1 MADD xsum1, xsum1, x1, a1 LD a1, 4 * SIZE(AO1) MADD ysum2, ysum2, alpha1, a2 LD x1, 4 * SIZE(XX) MADD xsum2, xsum2, x2, a2 LD a2, 5 * SIZE(AO1) ST ysum1, 0 * SIZE(YY) LD ysum1, 4 * SIZE(YY) ST ysum2, 1 * 
SIZE(YY) LD ysum2, 5 * SIZE(YY) daddiu AO1, AO1, 4 * SIZE nop MADD ysum3, ysum3, alpha1, a3 LD x2, 5 * SIZE(XX) MADD xsum1, xsum1, x3, a3 LD a3, 2 * SIZE(AO1) MADD ysum4, ysum4, alpha1, a4 LD x3, 6 * SIZE(XX) MADD xsum2, xsum2, x4, a4 LD a4, 3 * SIZE(AO1) ST ysum3, 2 * SIZE(YY) LD ysum3, 6 * SIZE(YY) ST ysum4, 3 * SIZE(YY) LD ysum4, 7 * SIZE(YY) daddiu XX, XX, 4 * SIZE daddiu YY, YY, 4 * SIZE bgtz I, .L22 LD x4, 3 * SIZE(XX) .align 3 .L23: MADD ysum1, ysum1, alpha1, a1 daddiu AO1, AO1, 4 * SIZE MADD xsum1, xsum1, x1, a1 daddiu XX, XX, 4 * SIZE MADD ysum2, ysum2, alpha1, a2 daddiu YY, YY, 4 * SIZE MADD xsum2, xsum2, x2, a2 nop MADD ysum3, ysum3, alpha1, a3 ST ysum1,-4 * SIZE(YY) MADD xsum1, xsum1, x3, a3 ST ysum2,-3 * SIZE(YY) MADD ysum4, ysum4, alpha1, a4 ST ysum3,-2 * SIZE(YY) MADD xsum2, xsum2, x4, a4 ST ysum4,-1 * SIZE(YY) .align 3 .L25: andi I, IS, 2 NOP blez I, .L26 NOP LD x1, 0 * SIZE(XX) LD x2, 1 * SIZE(XX) daddiu XX, XX, 2 * SIZE LD a1, 0 * SIZE(AO1) LD a2, 1 * SIZE(AO1) daddiu AO1, AO1, 2 * SIZE LD ysum1, 0 * SIZE(YY) LD ysum2, 1 * SIZE(YY) MADD ysum1, ysum1, alpha1, a1 MADD xsum1, xsum1, x1, a1 MADD ysum2, ysum2, alpha1, a2 MADD xsum2, xsum2, x2, a2 ST ysum1, 0 * SIZE(YY) ST ysum2, 1 * SIZE(YY) daddiu YY, YY, 2 * SIZE .align 3 .L26: andi I, IS, 1 NOP blez I, .L29 NOP LD x1, 0 * SIZE(XX) daddiu XX, XX, 1 * SIZE LD a1, 0 * SIZE(AO1) daddiu AO1, AO1, 1* SIZE LD ysum1, 0 * SIZE(YY) MADD ysum1, ysum1, alpha1, a1 MADD xsum1, xsum1, x1, a1 ST ysum1, 0 * SIZE(YY) .align 3 .L29: dsll TEMP, IS, BASE_SHIFT daddu TEMP, Y1, TEMP LD ysum1, 0 * SIZE(TEMP) LD a1, 0 * SIZE(AO1) ADD xsum1, xsum1, xsum2 MUL xsum1, ALPHA, xsum1 MADD xsum1, xsum1, alpha1, a1 ADD ysum1, ysum1, xsum1 ST ysum1, 0 * SIZE(TEMP) .align 3 .L900: li IS, SIZE beq INCY, IS, .L999 NOP dsra I, M, 2 blez I, .L905 NOP .align 3 .L902: LD a1, 0 * SIZE(Y1) LD a2, 1 * SIZE(Y1) LD a3, 2 * SIZE(Y1) LD a4, 3 * SIZE(Y1) ST a1, 0 * SIZE(Y) daddu Y, Y, INCY ST a2, 0 * SIZE(Y) daddu Y, Y, INCY ST a3, 0 * SIZE(Y) daddu Y, Y, INCY ST a4, 0 * SIZE(Y) daddu Y, Y, INCY daddiu I, I, -1 bgtz I, .L902 daddiu Y1, Y1, 4 * SIZE .align 3 .L905: andi I, M, 3 blez I, .L999 NOP .align 3 .L906: LD a1, 0 * SIZE(Y1) daddiu Y1, Y1, 1 * SIZE ST a1, 0 * SIZE(Y) daddiu I, I, -1 bgtz I, .L906 daddu Y, Y, INCY .align 3 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) LDARG $18, 16($sp) LDARG $19, 24($sp) j $31 daddiu $sp, $sp, 32 EPILOGUE OpenBLAS-0.2.20/kernel/mips64/trsm_kernel_LN.S000066400000000000000000001625731313527062700206260ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M $4 #define N $5 #define K $6 #define A $8 #define B $9 #define C $10 #define LDC $11 #define AO $12 #define BO $13 #define I $2 #define J $3 #define L $7 #define CO1 $14 #define CO2 $15 #define CO3 $16 #define CO4 $17 #define CO5 $18 #define CO6 $19 #define CO7 $20 #define CO8 $21 #define OFFSET $22 #define KK $23 #define TEMP $24 #define AORIG $25 #define a1 $f0 #define a2 $f1 #define a3 $f27 #define a4 $f28 #define b1 $f2 #define b2 $f3 #define b3 $f4 #define b4 $f5 #define b5 $f6 #define b6 $f7 #define b7 $f8 #define b8 $f9 #define a5 b8 #define c11 $f10 #define c12 $f11 #define c21 $f12 #define c22 $f13 #define c31 $f14 #define c32 $f16 #define c41 $f17 #define c42 $f18 #define c51 $f19 #define c52 $f20 #define c61 $f21 #define c62 $f22 #define c71 $f23 #define c72 $f24 #define c81 $f25 #define c82 $f26 #define ALPHA $f15 PROLOGUE daddiu $sp, $sp, -144 SDARG $16, 0($sp) SDARG $17, 8($sp) SDARG $18, 16($sp) SDARG $19, 24($sp) SDARG $20, 32($sp) SDARG $21, 40($sp) sdc1 $f24, 48($sp) sdc1 $f25, 56($sp) sdc1 $f26, 64($sp) sdc1 $f27, 72($sp) sdc1 $f28, 80($sp) SDARG $22, 88($sp) SDARG $23, 96($sp) SDARG $24, 104($sp) SDARG $25, 112($sp) #ifndef __64BIT__ sdc1 $f20,112($sp) sdc1 $f21,120($sp) sdc1 $f22,128($sp) sdc1 $f23,136($sp) #endif LDARG OFFSET, 144($sp) dsll LDC, LDC, BASE_SHIFT #ifdef LN mult M, K mflo TEMP dsll TEMP, TEMP, BASE_SHIFT daddu A, A, TEMP dsll TEMP, M, BASE_SHIFT daddu C, C, TEMP #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mult N, K mflo TEMP dsll TEMP, TEMP, BASE_SHIFT daddu B, B, TEMP mult N, LDC mflo TEMP daddu C, C, TEMP dsubu KK, N, OFFSET #endif dsra J, N, 3 blez J, .L30 nop .L10: #ifdef RT dsll TEMP, K, 3 + BASE_SHIFT dsubu B, B, TEMP dsll TEMP, LDC, 3 dsubu C, C, TEMP #endif move CO1, C MTC $0, c11 daddu CO2, C, LDC daddu CO3, CO2, LDC daddiu J, J, -1 daddu CO4, CO3, LDC MOV c21, c11 daddu CO5, CO4, LDC MOV c31, c11 daddu CO6, CO5, LDC MOV c41, c11 daddu CO7, CO6, LDC MOV c51, c11 daddu CO8, CO7, LDC #ifdef LN daddu KK, M, OFFSET #endif #ifdef LT move KK, OFFSET #endif #if defined(LN) || defined(RT) move AORIG, A #else move AO, A #endif #ifndef RT daddu C, CO8, LDC #endif andi I, M, 1 MOV c61, c11 blez I, .L20 MOV c71, c11 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) LD b5, 4 * SIZE(B) LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) dsra L, KK, 2 MOV c81, c11 blez L, .L25 move BO, B #else #ifdef LN dsll TEMP, K, 0 + BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, 0 + BASE_SHIFT 
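/* Annotation added for readability; not in the original OpenBLAS source.
   A reading of the surrounding setup (LN/RT path of the single-row case in
   the 8-column block): L = KK * SIZE steps KK k-iterations into the 1-wide
   packed A panel and TEMP = KK * 8 * SIZE steps KK k-iterations into the
   8-wide packed B panel, giving AO = AORIG + L and BO = B + TEMP; the
   dsubu that follows leaves TEMP = K - KK as the remaining depth for the
   GEMM-style update loop at .L22. */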
dsll TEMP, KK, 3 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) dsra L, TEMP, 2 MOV c81, c11 blez L, .L25 NOP #endif .align 3 .L22: MADD c11, c11, a1, b1 LD b1, 16 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a1, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a1, b4 LD b4, 7 * SIZE(BO) MADD c51, c51, a1, b5 LD b5, 20 * SIZE(BO) MADD c61, c61, a1, b2 LD b2, 9 * SIZE(BO) MADD c71, c71, a1, b3 LD b3, 10 * SIZE(BO) MADD c81, c81, a1, b4 LD b4, 11 * SIZE(BO) LD a1, 4 * SIZE(AO) daddiu L, L, -1 MADD c11, c11, a2, b6 LD b6, 24 * SIZE(BO) MADD c21, c21, a2, b2 LD b2, 13 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 14 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 15 * SIZE(BO) MADD c51, c51, a2, b7 LD b7, 28 * SIZE(BO) MADD c61, c61, a2, b2 LD b2, 17 * SIZE(BO) MADD c71, c71, a2, b3 LD b3, 18 * SIZE(BO) MADD c81, c81, a2, b4 LD b4, 19 * SIZE(BO) LD a2, 5 * SIZE(AO) daddiu AO, AO, 4 * SIZE MADD c11, c11, a3, b1 LD b1, 32 * SIZE(BO) MADD c21, c21, a3, b2 LD b2, 21 * SIZE(BO) MADD c31, c31, a3, b3 LD b3, 22 * SIZE(BO) MADD c41, c41, a3, b4 LD b4, 23 * SIZE(BO) MADD c51, c51, a3, b5 LD b5, 36 * SIZE(BO) MADD c61, c61, a3, b2 LD b2, 25 * SIZE(BO) MADD c71, c71, a3, b3 LD b3, 26 * SIZE(BO) MADD c81, c81, a3, b4 LD b4, 27 * SIZE(BO) LD a3, 2 * SIZE(AO) daddiu BO, BO, 32 * SIZE MADD c11, c11, a4, b6 LD b6, 8 * SIZE(BO) MADD c21, c21, a4, b2 LD b2, -3 * SIZE(BO) MADD c31, c31, a4, b3 LD b3, -2 * SIZE(BO) MADD c41, c41, a4, b4 LD b4, -1 * SIZE(BO) MADD c51, c51, a4, b7 LD b7, 12 * SIZE(BO) MADD c61, c61, a4, b2 LD b2, 1 * SIZE(BO) MADD c71, c71, a4, b3 LD b3, 2 * SIZE(BO) MADD c81, c81, a4, b4 LD b4, 3 * SIZE(BO) bgtz L, .L22 LD a4, 3 * SIZE(AO) .align 3 .L25: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L28 NOP .align 3 .L26: MADD c11, c11, a1, b1 LD b1, 8 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a1, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a1, b4 LD b4, 7 * SIZE(BO) daddiu L, L, -1 MOV a2, a2 daddiu AO, AO, 1 * SIZE daddiu BO, BO, 8 * SIZE MADD c51, c51, a1, b5 LD b5, 4 * SIZE(BO) MADD c61, c61, a1, b2 LD b2, 1 * SIZE(BO) MADD c71, c71, a1, b3 LD b3, 2 * SIZE(BO) MADD c81, c81, a1, b4 LD a1, 0 * SIZE(AO) bgtz L, .L26 LD b4, 3 * SIZE(BO) .L28: #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -1 #else daddiu TEMP, KK, -8 #endif dsll L, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 3 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) SUB c11, b1, c11 SUB c21, b2, c21 SUB c31, b3, c31 SUB c41, b4, c41 SUB c51, b5, c51 SUB c61, b6, c61 SUB c71, b7, c71 SUB c81, b8, c81 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) LD b5, 4 * SIZE(AO) LD b6, 5 * SIZE(AO) LD b7, 6 * SIZE(AO) LD b8, 7 * SIZE(AO) SUB c11, b1, c11 SUB c21, b2, c21 SUB c31, b3, c31 SUB c41, b4, c41 SUB c51, b5, c51 SUB c61, b6, c61 SUB c71, b7, c71 SUB c81, b8, c81 #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(AO) MUL c11, b1, c11 MUL c21, b1, c21 MUL c31, b1, c31 MUL c41, b1, c41 MUL c51, b1, c51 MUL c61, b1, c61 MUL c71, b1, c71 MUL c81, b1, c81 #endif #ifdef RN LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) 
LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MUL c11, b1, c11 NMSUB c21, c21, b2, c11 NMSUB c31, c31, b3, c11 NMSUB c41, c41, b4, c11 NMSUB c51, c51, b5, c11 NMSUB c61, c61, b6, c11 NMSUB c71, c71, b7, c11 NMSUB c81, c81, b8, c11 LD b2, 9 * SIZE(BO) LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MUL c21, b2, c21 NMSUB c31, c31, b3, c21 NMSUB c41, c41, b4, c21 NMSUB c51, c51, b5, c21 NMSUB c61, c61, b6, c21 NMSUB c71, c71, b7, c21 NMSUB c81, c81, b8, c21 LD b3, 18 * SIZE(BO) LD b4, 19 * SIZE(BO) LD b5, 20 * SIZE(BO) LD b6, 21 * SIZE(BO) LD b7, 22 * SIZE(BO) LD b8, 23 * SIZE(BO) MUL c31, b3, c31 NMSUB c41, c41, b4, c31 NMSUB c51, c51, b5, c31 NMSUB c61, c61, b6, c31 NMSUB c71, c71, b7, c31 NMSUB c81, c81, b8, c31 LD b4, 27 * SIZE(BO) LD b5, 28 * SIZE(BO) LD b6, 29 * SIZE(BO) LD b7, 30 * SIZE(BO) LD b8, 31 * SIZE(BO) MUL c41, b4, c41 NMSUB c51, c51, b5, c41 NMSUB c61, c61, b6, c41 NMSUB c71, c71, b7, c41 NMSUB c81, c81, b8, c41 LD b5, 36 * SIZE(BO) LD b6, 37 * SIZE(BO) LD b7, 38 * SIZE(BO) LD b8, 39 * SIZE(BO) MUL c51, b5, c51 NMSUB c61, c61, b6, c51 NMSUB c71, c71, b7, c51 NMSUB c81, c81, b8, c51 LD b6, 45 * SIZE(BO) LD b7, 46 * SIZE(BO) LD b8, 47 * SIZE(BO) MUL c61, b6, c61 NMSUB c71, c71, b7, c61 NMSUB c81, c81, b8, c61 LD b7, 54 * SIZE(BO) LD b8, 55 * SIZE(BO) MUL c71, b7, c71 NMSUB c81, c81, b8, c71 LD b8, 63 * SIZE(BO) MUL c81, b8, c81 #endif #ifdef RT LD b1, 63 * SIZE(BO) LD b2, 62 * SIZE(BO) LD b3, 61 * SIZE(BO) LD b4, 60 * SIZE(BO) LD b5, 59 * SIZE(BO) LD b6, 58 * SIZE(BO) LD b7, 57 * SIZE(BO) LD b8, 56 * SIZE(BO) MUL c81, b1, c81 NMSUB c71, c71, b2, c81 NMSUB c61, c61, b3, c81 NMSUB c51, c51, b4, c81 NMSUB c41, c41, b5, c81 NMSUB c31, c31, b6, c81 NMSUB c21, c21, b7, c81 NMSUB c11, c11, b8, c81 LD b2, 54 * SIZE(BO) LD b3, 53 * SIZE(BO) LD b4, 52 * SIZE(BO) LD b5, 51 * SIZE(BO) LD b6, 50 * SIZE(BO) LD b7, 49 * SIZE(BO) LD b8, 48 * SIZE(BO) MUL c71, b2, c71 NMSUB c61, c61, b3, c71 NMSUB c51, c51, b4, c71 NMSUB c41, c41, b5, c71 NMSUB c31, c31, b6, c71 NMSUB c21, c21, b7, c71 NMSUB c11, c11, b8, c71 LD b3, 45 * SIZE(BO) LD b4, 44 * SIZE(BO) LD b5, 43 * SIZE(BO) LD b6, 42 * SIZE(BO) LD b7, 41 * SIZE(BO) LD b8, 40 * SIZE(BO) MUL c61, b3, c61 NMSUB c51, c51, b4, c61 NMSUB c41, c41, b5, c61 NMSUB c31, c31, b6, c61 NMSUB c21, c21, b7, c61 NMSUB c11, c11, b8, c61 LD b4, 36 * SIZE(BO) LD b5, 35 * SIZE(BO) LD b6, 34 * SIZE(BO) LD b7, 33 * SIZE(BO) LD b8, 32 * SIZE(BO) MUL c51, b4, c51 NMSUB c41, c41, b5, c51 NMSUB c31, c31, b6, c51 NMSUB c21, c21, b7, c51 NMSUB c11, c11, b8, c51 LD b5, 27 * SIZE(BO) LD b6, 26 * SIZE(BO) LD b7, 25 * SIZE(BO) LD b8, 24 * SIZE(BO) MUL c41, b5, c41 NMSUB c31, c31, b6, c41 NMSUB c21, c21, b7, c41 NMSUB c11, c11, b8, c41 LD b6, 18 * SIZE(BO) LD b7, 17 * SIZE(BO) LD b8, 16 * SIZE(BO) MUL c31, b6, c31 NMSUB c21, c21, b7, c31 NMSUB c11, c11, b8, c31 LD b7, 9 * SIZE(BO) LD b8, 8 * SIZE(BO) MUL c21, b7, c21 NMSUB c11, c11, b8, c21 LD b8, 0 * SIZE(BO) MUL c11, b8, c11 #endif #ifdef LN daddiu CO1, CO1, -1 * SIZE daddiu CO2, CO2, -1 * SIZE daddiu CO3, CO3, -1 * SIZE daddiu CO4, CO4, -1 * SIZE daddiu CO5, CO5, -1 * SIZE daddiu CO6, CO6, -1 * SIZE daddiu CO7, CO7, -1 * SIZE daddiu CO8, CO8, -1 * SIZE #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c21, 1 * SIZE(BO) ST c31, 2 * SIZE(BO) ST c41, 3 * SIZE(BO) ST c51, 4 * SIZE(BO) ST c61, 5 * SIZE(BO) ST c71, 6 * SIZE(BO) ST c81, 7 * SIZE(BO) #else ST c11, 0 * 
SIZE(AO) ST c21, 1 * SIZE(AO) ST c31, 2 * SIZE(AO) ST c41, 3 * SIZE(AO) ST c51, 4 * SIZE(AO) ST c61, 5 * SIZE(AO) ST c71, 6 * SIZE(AO) ST c81, 7 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) ST c21, 0 * SIZE(CO2) ST c31, 0 * SIZE(CO3) ST c41, 0 * SIZE(CO4) ST c51, 0 * SIZE(CO5) ST c61, 0 * SIZE(CO6) ST c71, 0 * SIZE(CO7) ST c81, 0 * SIZE(CO8) MTC $0, c11 #ifndef LN daddiu CO1, CO1, 1 * SIZE daddiu CO2, CO2, 1 * SIZE daddiu CO3, CO3, 1 * SIZE daddiu CO4, CO4, 1 * SIZE daddiu CO5, CO5, 1 * SIZE daddiu CO6, CO6, 1 * SIZE daddiu CO7, CO7, 1 * SIZE daddiu CO8, CO8, 1 * SIZE #endif MOV c21, c11 #ifdef RT dsll TEMP, K, BASE_SHIFT daddu AORIG, AORIG, TEMP #endif MOV c31, c11 #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 3 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif MOV c41, c11 #ifdef LT daddiu KK, KK, 1 #endif #ifdef LN daddiu KK, KK, -1 #endif .align 3 .L20: dsra I, M, 1 MOV c51, c11 blez I, .L29 MOV c61, c11 .L11: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) MOV c71, c11 LD b1, 0 * SIZE(B) MOV c81, c11 LD a3, 4 * SIZE(AO) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 dsra L, KK, 2 MOV c32, c11 LD b3, 2 * SIZE(B) MOV c42, c11 LD b4, 3 * SIZE(B) MOV c52, c11 LD b5, 4 * SIZE(B) MOV c62, c11 LD b6, 8 * SIZE(B) MOV c72, c11 LD b7, 12 * SIZE(B) MOV c82, c11 blez L, .L15 move BO, B #else #ifdef LN dsll TEMP, K, 1 + BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 3 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) MOV c71, c11 LD b1, 0 * SIZE(BO) MOV c81, c11 LD a3, 4 * SIZE(AO) MOV c12, c11 LD b2, 1 * SIZE(BO) MOV c22, c11 MOV c32, c11 LD b3, 2 * SIZE(BO) MOV c42, c11 LD b4, 3 * SIZE(BO) MOV c52, c11 LD b5, 4 * SIZE(BO) MOV c62, c11 LD b6, 8 * SIZE(BO) MOV c72, c11 LD b7, 12 * SIZE(BO) MOV c82, c11 dsra L, TEMP, 2 blez L, .L15 NOP #endif MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 daddiu L, L, -1 MADD c31, c31, a1, b3 blez L, .L13 MADD c41, c41, a1, b4 NOP .align 3 .L12: MADD c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD c51, c51, a1, b5 NOP MADD c61, c61, a1, b2 LD a4, 2 * SIZE(AO) MADD c71, c71, a1, b3 NOP MADD c81, c81, a1, b4 LD a1, 8 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 20 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 9 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 10 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 11 * SIZE(BO) MADD c11, c11, a4, b6 LD a2, 3 * SIZE(AO) MADD c21, c21, a4, b2 NOP MADD c31, c31, a4, b3 NOP MADD c41, c41, a4, b4 NOP MADD c12, c12, a2, b6 LD b6, 24 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD c51, c51, a4, b7 NOP MADD c61, c61, a4, b2 NOP MADD c71, c71, a4, b3 NOP MADD c81, c81, a4, b4 NOP MADD c52, c52, a2, b7 LD b7, 28 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 17 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 18 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 19 * SIZE(BO) MADD c11, c11, a3, b1 LD a2, 5 * SIZE(AO) MADD c21, c21, a3, b2 NOP MADD c31, c31, a3, b3 NOP MADD c41, c41, a3, b4 NOP MADD c12, c12, a2, b1 LD b1, 32 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 21 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 22 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 23 * SIZE(BO) MADD c51, c51, a3, b5 NOP MADD c61, c61, a3, b2 LD a4, 6 * SIZE(AO) MADD c71, c71, a3, b3 NOP MADD c81, c81, a3, b4 LD a3, 12 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 36 * 
SIZE(BO) MADD c62, c62, a2, b2 LD b2, 25 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 26 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 27 * SIZE(BO) MADD c11, c11, a4, b6 LD a2, 7 * SIZE(AO) MADD c21, c21, a4, b2 NOP MADD c31, c31, a4, b3 NOP MADD c41, c41, a4, b4 daddiu L, L, -1 MADD c12, c12, a2, b6 LD b6, 40 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 29 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 30 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 31 * SIZE(BO) MADD c51, c51, a4, b7 daddiu BO, BO, 32 * SIZE MADD c61, c61, a4, b2 daddiu AO, AO, 8 * SIZE MADD c71, c71, a4, b3 NOP MADD c81, c81, a4, b4 NOP MADD c52, c52, a2, b7 LD b7, 12 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 3 * SIZE(BO) MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 NOP MADD c31, c31, a1, b3 bgtz L, .L12 MADD c41, c41, a1, b4 NOP .align 3 .L13: MADD c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD c51, c51, a1, b5 NOP MADD c61, c61, a1, b2 LD a4, 2 * SIZE(AO) MADD c71, c71, a1, b3 NOP MADD c81, c81, a1, b4 LD a1, 8 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 20 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 9 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 10 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 11 * SIZE(BO) MADD c11, c11, a4, b6 LD a2, 3 * SIZE(AO) MADD c21, c21, a4, b2 NOP MADD c31, c31, a4, b3 NOP MADD c41, c41, a4, b4 NOP MADD c12, c12, a2, b6 LD b6, 24 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD c51, c51, a4, b7 NOP MADD c61, c61, a4, b2 NOP MADD c71, c71, a4, b3 NOP MADD c81, c81, a4, b4 NOP MADD c52, c52, a2, b7 LD b7, 28 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 17 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 18 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 19 * SIZE(BO) MADD c11, c11, a3, b1 LD a2, 5 * SIZE(AO) MADD c21, c21, a3, b2 NOP MADD c31, c31, a3, b3 NOP MADD c41, c41, a3, b4 NOP MADD c12, c12, a2, b1 LD b1, 32 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 21 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 22 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 23 * SIZE(BO) MADD c51, c51, a3, b5 NOP MADD c61, c61, a3, b2 LD a4, 6 * SIZE(AO) MADD c71, c71, a3, b3 NOP MADD c81, c81, a3, b4 LD a3, 12 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 36 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 25 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 26 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 27 * SIZE(BO) MADD c11, c11, a4, b6 LD a2, 7 * SIZE(AO) MADD c21, c21, a4, b2 NOP MADD c31, c31, a4, b3 NOP MADD c41, c41, a4, b4 NOP MADD c12, c12, a2, b6 LD b6, 40 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 29 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 30 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 31 * SIZE(BO) MADD c51, c51, a4, b7 daddiu BO, BO, 32 * SIZE MADD c61, c61, a4, b2 daddiu AO, AO, 8 * SIZE MADD c71, c71, a4, b3 NOP MADD c81, c81, a4, b4 NOP MADD c52, c52, a2, b7 LD b7, 12 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 3 * SIZE(BO) .align 3 .L15: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif blez L, .L18 NOP .align 3 .L16: MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 NOP MADD c31, c31, a1, b3 NOP MADD c41, c41, a1, b4 NOP MADD c12, c12, a2, b1 LD b1, 8 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * 
SIZE(BO) MADD c51, c51, a1, b5 daddiu L, L, -1 MADD c61, c61, a1, b2 daddiu AO, AO, 2 * SIZE MADD c71, c71, a1, b3 daddiu BO, BO, 8 * SIZE MADD c81, c81, a1, b4 LD a1, 0 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 4 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD c82, c82, a2, b4 bgtz L, .L16 LD b4, 3 * SIZE(BO) .L18: #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -2 #else daddiu TEMP, KK, -8 #endif dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 3 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) SUB c11, b1, c11 LD b5, 4 * SIZE(BO) SUB c21, b2, c21 LD b6, 5 * SIZE(BO) SUB c31, b3, c31 LD b7, 6 * SIZE(BO) SUB c41, b4, c41 LD b8, 7 * SIZE(BO) SUB c51, b5, c51 LD b1, 8 * SIZE(BO) SUB c61, b6, c61 LD b2, 9 * SIZE(BO) SUB c71, b7, c71 LD b3, 10 * SIZE(BO) SUB c81, b8, c81 LD b4, 11 * SIZE(BO) SUB c12, b1, c12 LD b5, 12 * SIZE(BO) SUB c22, b2, c22 LD b6, 13 * SIZE(BO) SUB c32, b3, c32 LD b7, 14 * SIZE(BO) SUB c42, b4, c42 LD b8, 15 * SIZE(BO) SUB c52, b5, c52 #ifdef LN LD b1, 3 * SIZE(AO) #else LD b1, 0 * SIZE(AO) #endif SUB c62, b6, c62 SUB c72, b7, c72 SUB c82, b8, c82 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) SUB c11, b1, c11 LD b5, 4 * SIZE(AO) SUB c12, b2, c12 LD b6, 5 * SIZE(AO) SUB c21, b3, c21 LD b7, 6 * SIZE(AO) SUB c22, b4, c22 LD b8, 7 * SIZE(AO) SUB c31, b5, c31 LD b1, 8 * SIZE(AO) SUB c32, b6, c32 LD b2, 9 * SIZE(AO) SUB c41, b7, c41 LD b3, 10 * SIZE(AO) SUB c42, b8, c42 LD b4, 11 * SIZE(AO) LD b5, 12 * SIZE(AO) SUB c51, b1, c51 LD b6, 13 * SIZE(AO) SUB c52, b2, c52 LD b7, 14 * SIZE(AO) SUB c61, b3, c61 LD b8, 15 * SIZE(AO) SUB c62, b4, c62 SUB c71, b5, c71 SUB c72, b6, c72 SUB c81, b7, c81 SUB c82, b8, c82 #endif #ifdef LN MUL c12, b1, c12 LD b2, 2 * SIZE(AO) MUL c22, b1, c22 MUL c32, b1, c32 MUL c42, b1, c42 MUL c52, b1, c52 MUL c62, b1, c62 MUL c72, b1, c72 MUL c82, b1, c82 NMSUB c11, c11, b2, c12 LD b3, 0 * SIZE(AO) NMSUB c21, c21, b2, c22 NMSUB c31, c31, b2, c32 NMSUB c41, c41, b2, c42 NMSUB c51, c51, b2, c52 NMSUB c61, c61, b2, c62 NMSUB c71, c71, b2, c72 NMSUB c81, c81, b2, c82 MUL c11, b3, c11 daddiu CO1, CO1, -2 * SIZE MUL c21, b3, c21 daddiu CO2, CO2, -2 * SIZE MUL c31, b3, c31 daddiu CO3, CO3, -2 * SIZE MUL c41, b3, c41 daddiu CO4, CO4, -2 * SIZE MUL c51, b3, c51 daddiu CO5, CO5, -2 * SIZE MUL c61, b3, c61 daddiu CO6, CO6, -2 * SIZE MUL c71, b3, c71 daddiu CO7, CO7, -2 * SIZE MUL c81, b3, c81 daddiu CO8, CO8, -2 * SIZE #endif #ifdef LT MUL c11, b1, c11 LD b2, 1 * SIZE(AO) MUL c21, b1, c21 MUL c31, b1, c31 MUL c41, b1, c41 MUL c51, b1, c51 MUL c61, b1, c61 MUL c71, b1, c71 MUL c81, b1, c81 NMSUB c12, c12, b2, c11 LD b3, 3 * SIZE(AO) NMSUB c22, c22, b2, c21 NMSUB c32, c32, b2, c31 NMSUB c42, c42, b2, c41 NMSUB c52, c52, b2, c51 NMSUB c62, c62, b2, c61 NMSUB c72, c72, b2, c71 NMSUB c82, c82, b2, c81 MUL c12, b3, c12 MUL c22, b3, c22 MUL c32, b3, c32 MUL c42, b3, c42 MUL c52, b3, c52 MUL c62, b3, c62 MUL c72, b3, c72 MUL c82, b3, c82 #endif #ifdef RN LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MUL c11, b1, c11 MUL c12, b1, c12 LD b5, 4 * SIZE(BO) NMSUB c21, c21, b2, c11 NMSUB c22, c22, b2, c12 LD b6, 5 * SIZE(BO) NMSUB c31, c31, b3, c11 NMSUB c32, c32, b3, c12 LD b7, 6 * SIZE(BO) NMSUB c41, c41, b4, c11 NMSUB c42, c42, b4, c12 LD b8, 7 * SIZE(BO) NMSUB c51, c51, b5, c11 NMSUB c52, c52, b5, c12 LD b2, 9 * SIZE(BO) NMSUB c61, 
c61, b6, c11 NMSUB c62, c62, b6, c12 LD b3, 10 * SIZE(BO) NMSUB c71, c71, b7, c11 NMSUB c72, c72, b7, c12 LD b4, 11 * SIZE(BO) NMSUB c81, c81, b8, c11 NMSUB c82, c82, b8, c12 LD b5, 12 * SIZE(BO) MUL c21, b2, c21 MUL c22, b2, c22 LD b6, 13 * SIZE(BO) NMSUB c31, c31, b3, c21 NMSUB c32, c32, b3, c22 LD b7, 14 * SIZE(BO) NMSUB c41, c41, b4, c21 NMSUB c42, c42, b4, c22 LD b8, 15 * SIZE(BO) NMSUB c51, c51, b5, c21 NMSUB c52, c52, b5, c22 LD b3, 18 * SIZE(BO) NMSUB c61, c61, b6, c21 NMSUB c62, c62, b6, c22 LD b4, 19 * SIZE(BO) NMSUB c71, c71, b7, c21 NMSUB c72, c72, b7, c22 LD b5, 20 * SIZE(BO) NMSUB c81, c81, b8, c21 NMSUB c82, c82, b8, c22 LD b6, 21 * SIZE(BO) MUL c31, b3, c31 MUL c32, b3, c32 LD b7, 22 * SIZE(BO) NMSUB c41, c41, b4, c31 NMSUB c42, c42, b4, c32 LD b8, 23 * SIZE(BO) NMSUB c51, c51, b5, c31 NMSUB c52, c52, b5, c32 LD b4, 27 * SIZE(BO) NMSUB c61, c61, b6, c31 NMSUB c62, c62, b6, c32 LD b5, 28 * SIZE(BO) NMSUB c71, c71, b7, c31 NMSUB c72, c72, b7, c32 LD b6, 29 * SIZE(BO) NMSUB c81, c81, b8, c31 NMSUB c82, c82, b8, c32 LD b7, 30 * SIZE(BO) MUL c41, b4, c41 MUL c42, b4, c42 LD b8, 31 * SIZE(BO) NMSUB c51, c51, b5, c41 NMSUB c52, c52, b5, c42 LD b5, 36 * SIZE(BO) NMSUB c61, c61, b6, c41 NMSUB c62, c62, b6, c42 LD b6, 37 * SIZE(BO) NMSUB c71, c71, b7, c41 NMSUB c72, c72, b7, c42 LD b7, 38 * SIZE(BO) NMSUB c81, c81, b8, c41 NMSUB c82, c82, b8, c42 LD b8, 39 * SIZE(BO) MUL c51, b5, c51 MUL c52, b5, c52 NMSUB c61, c61, b6, c51 NMSUB c62, c62, b6, c52 LD b6, 45 * SIZE(BO) NMSUB c71, c71, b7, c51 NMSUB c72, c72, b7, c52 LD b7, 46 * SIZE(BO) NMSUB c81, c81, b8, c51 NMSUB c82, c82, b8, c52 LD b8, 47 * SIZE(BO) MUL c61, b6, c61 MUL c62, b6, c62 NMSUB c71, c71, b7, c61 NMSUB c72, c72, b7, c62 LD b7, 54 * SIZE(BO) NMSUB c81, c81, b8, c61 NMSUB c82, c82, b8, c62 LD b8, 55 * SIZE(BO) MUL c71, b7, c71 MUL c72, b7, c72 NMSUB c81, c81, b8, c71 NMSUB c82, c82, b8, c72 LD b8, 63 * SIZE(BO) MUL c81, b8, c81 MUL c82, b8, c82 #endif #ifdef RT LD b1, 63 * SIZE(BO) LD b2, 62 * SIZE(BO) LD b3, 61 * SIZE(BO) LD b4, 60 * SIZE(BO) MUL c81, b1, c81 MUL c82, b1, c82 LD b5, 59 * SIZE(BO) NMSUB c71, c71, b2, c81 NMSUB c72, c72, b2, c82 LD b6, 58 * SIZE(BO) NMSUB c61, c61, b3, c81 NMSUB c62, c62, b3, c82 LD b7, 57 * SIZE(BO) NMSUB c51, c51, b4, c81 NMSUB c52, c52, b4, c82 LD b8, 56 * SIZE(BO) NMSUB c41, c41, b5, c81 NMSUB c42, c42, b5, c82 LD b2, 54 * SIZE(BO) NMSUB c31, c31, b6, c81 NMSUB c32, c32, b6, c82 LD b3, 53 * SIZE(BO) NMSUB c21, c21, b7, c81 NMSUB c22, c22, b7, c82 LD b4, 52 * SIZE(BO) NMSUB c11, c11, b8, c81 NMSUB c12, c12, b8, c82 LD b5, 51 * SIZE(BO) MUL c71, b2, c71 MUL c72, b2, c72 LD b6, 50 * SIZE(BO) NMSUB c61, c61, b3, c71 NMSUB c62, c62, b3, c72 LD b7, 49 * SIZE(BO) NMSUB c51, c51, b4, c71 NMSUB c52, c52, b4, c72 LD b8, 48 * SIZE(BO) NMSUB c41, c41, b5, c71 NMSUB c42, c42, b5, c72 LD b3, 45 * SIZE(BO) NMSUB c31, c31, b6, c71 NMSUB c32, c32, b6, c72 LD b4, 44 * SIZE(BO) NMSUB c21, c21, b7, c71 NMSUB c22, c22, b7, c72 LD b5, 43 * SIZE(BO) NMSUB c11, c11, b8, c71 NMSUB c12, c12, b8, c72 LD b6, 42 * SIZE(BO) MUL c61, b3, c61 MUL c62, b3, c62 LD b7, 41 * SIZE(BO) NMSUB c51, c51, b4, c61 NMSUB c52, c52, b4, c62 LD b8, 40 * SIZE(BO) NMSUB c41, c41, b5, c61 NMSUB c42, c42, b5, c62 LD b4, 36 * SIZE(BO) NMSUB c31, c31, b6, c61 NMSUB c32, c32, b6, c62 LD b5, 35 * SIZE(BO) NMSUB c21, c21, b7, c61 NMSUB c22, c22, b7, c62 LD b6, 34 * SIZE(BO) NMSUB c11, c11, b8, c61 NMSUB c12, c12, b8, c62 LD b7, 33 * SIZE(BO) MUL c51, b4, c51 MUL c52, b4, c52 LD b8, 32 * SIZE(BO) NMSUB c41, c41, b5, c51 NMSUB c42, c42, b5, 
c52 LD b5, 27 * SIZE(BO) NMSUB c31, c31, b6, c51 NMSUB c32, c32, b6, c52 LD b6, 26 * SIZE(BO) NMSUB c21, c21, b7, c51 NMSUB c22, c22, b7, c52 LD b7, 25 * SIZE(BO) NMSUB c11, c11, b8, c51 NMSUB c12, c12, b8, c52 LD b8, 24 * SIZE(BO) MUL c41, b5, c41 MUL c42, b5, c42 NMSUB c31, c31, b6, c41 NMSUB c32, c32, b6, c42 LD b6, 18 * SIZE(BO) NMSUB c21, c21, b7, c41 NMSUB c22, c22, b7, c42 LD b7, 17 * SIZE(BO) NMSUB c11, c11, b8, c41 NMSUB c12, c12, b8, c42 LD b8, 16 * SIZE(BO) MUL c31, b6, c31 MUL c32, b6, c32 NMSUB c21, c21, b7, c31 NMSUB c22, c22, b7, c32 LD b7, 9 * SIZE(BO) NMSUB c11, c11, b8, c31 NMSUB c12, c12, b8, c32 LD b8, 8 * SIZE(BO) MUL c21, b7, c21 MUL c22, b7, c22 NMSUB c11, c11, b8, c21 NMSUB c12, c12, b8, c22 LD b8, 0 * SIZE(BO) MUL c11, b8, c11 MUL c12, b8, c12 #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c21, 1 * SIZE(BO) ST c31, 2 * SIZE(BO) ST c41, 3 * SIZE(BO) ST c51, 4 * SIZE(BO) ST c61, 5 * SIZE(BO) ST c71, 6 * SIZE(BO) ST c81, 7 * SIZE(BO) ST c12, 8 * SIZE(BO) ST c22, 9 * SIZE(BO) ST c32, 10 * SIZE(BO) ST c42, 11 * SIZE(BO) ST c52, 12 * SIZE(BO) ST c62, 13 * SIZE(BO) ST c72, 14 * SIZE(BO) ST c82, 15 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c12, 1 * SIZE(AO) ST c21, 2 * SIZE(AO) ST c22, 3 * SIZE(AO) ST c31, 4 * SIZE(AO) ST c32, 5 * SIZE(AO) ST c41, 6 * SIZE(AO) ST c42, 7 * SIZE(AO) ST c51, 8 * SIZE(AO) ST c52, 9 * SIZE(AO) ST c61, 10 * SIZE(AO) ST c62, 11 * SIZE(AO) ST c71, 12 * SIZE(AO) ST c72, 13 * SIZE(AO) ST c81, 14 * SIZE(AO) ST c82, 15 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) ST c12, 1 * SIZE(CO1) ST c21, 0 * SIZE(CO2) ST c22, 1 * SIZE(CO2) ST c31, 0 * SIZE(CO3) ST c32, 1 * SIZE(CO3) ST c41, 0 * SIZE(CO4) ST c42, 1 * SIZE(CO4) ST c51, 0 * SIZE(CO5) ST c52, 1 * SIZE(CO5) ST c61, 0 * SIZE(CO6) ST c62, 1 * SIZE(CO6) ST c71, 0 * SIZE(CO7) ST c72, 1 * SIZE(CO7) ST c81, 0 * SIZE(CO8) ST c82, 1 * SIZE(CO8) MTC $0, a1 #ifndef LN daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE daddiu CO3, CO3, 2 * SIZE daddiu CO4, CO4, 2 * SIZE daddiu CO5, CO5, 2 * SIZE daddiu CO6, CO6, 2 * SIZE daddiu CO7, CO7, 2 * SIZE daddiu CO8, CO8, 2 * SIZE #endif MOV c11, a1 MOV c21, a1 #ifdef RT dsll TEMP, K, 1 + BASE_SHIFT daddu AORIG, AORIG, TEMP #endif MOV c31, a1 MOV c41, a1 #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 3 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 2 #endif #ifdef LN daddiu KK, KK, -2 #endif daddiu I, I, -1 MOV c51, a1 bgtz I, .L11 MOV c61, a1 .align 3 .L29: #ifdef LN dsll TEMP, K, 3 + BASE_SHIFT daddu B, B, TEMP #endif #if defined(LT) || defined(RN) move B, BO #endif #ifdef RN daddiu KK, KK, 8 #endif #ifdef RT daddiu KK, KK, -8 #endif bgtz J, .L10 NOP .align 3 .L30: andi J, N, 4 blez J, .L50 move AO, A #ifdef RT dsll TEMP, K, 2 + BASE_SHIFT dsubu B, B, TEMP dsll TEMP, LDC, 2 dsubu C, C, TEMP #endif move CO1, C MTC $0, c11 daddu CO2, C, LDC daddu CO3, CO2, LDC MOV c21, c11 daddu CO4, CO3, LDC MOV c31, c11 #ifdef LN daddu KK, M, OFFSET #endif #ifdef LT move KK, OFFSET #endif #if defined(LN) || defined(RT) move AORIG, A #else move AO, A #endif #ifndef RT daddu C, CO4, LDC #endif andi I, M, 1 blez I, .L40 MOV c41, c11 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) MOV c71, c11 LD a2, 1 * SIZE(AO) MOV c81, c11 LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) LD b5, 4 * SIZE(B) LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) dsra L, KK, 2 blez L, .L45 move BO, B #else #ifdef LN dsll TEMP, K, BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, 0 + BASE_SHIFT dsll 
TEMP, KK, 2 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) MOV c71, c11 LD a2, 1 * SIZE(AO) MOV c81, c11 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) dsra L, TEMP, 2 blez L, .L45 NOP #endif .align 3 .L42: MADD c11, c11, a1, b1 LD b1, 16 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a1, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a1, b4 LD b4, 7 * SIZE(BO) LD a1, 4 * SIZE(AO) daddiu L, L, -1 MADD c11, c11, a2, b5 LD b5, 20 * SIZE(BO) MADD c21, c21, a2, b2 LD b2, 9 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 10 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 11 * SIZE(BO) LD a2, 2 * SIZE(AO) daddiu AO, AO, 4 * SIZE MADD c11, c11, a2, b6 LD b6, 24 * SIZE(BO) MADD c21, c21, a2, b2 LD b2, 13 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 14 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 15 * SIZE(BO) LD a2, -1 * SIZE(AO) daddiu BO, BO, 16 * SIZE MADD c11, c11, a2, b7 LD b7, 12 * SIZE(BO) MADD c21, c21, a2, b2 LD b2, 1 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 2 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 3 * SIZE(BO) bgtz L, .L42 LD a2, 1 * SIZE(AO) .align 3 .L45: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L48 NOP .align 3 .L46: MADD c11, c11, a1, b1 LD b1, 4 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a1, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a1, b4 LD a1, 1 * SIZE(AO) LD b4, 7 * SIZE(BO) daddiu L, L, -1 daddiu AO, AO, 1 * SIZE MOV a2, a2 bgtz L, .L46 daddiu BO, BO, 4 * SIZE .L48: #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -1 #else daddiu TEMP, KK, -4 #endif dsll L, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) SUB c11, b1, c11 SUB c21, b2, c21 SUB c31, b3, c31 SUB c41, b4, c41 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) SUB c11, b1, c11 SUB c21, b2, c21 SUB c31, b3, c31 SUB c41, b4, c41 #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(AO) MUL c11, b1, c11 MUL c21, b1, c21 MUL c31, b1, c31 MUL c41, b1, c41 #endif #ifdef RN LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MUL c11, b1, c11 NMSUB c21, c21, b2, c11 NMSUB c31, c31, b3, c11 NMSUB c41, c41, b4, c11 LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) MUL c21, b2, c21 NMSUB c31, c31, b3, c21 NMSUB c41, c41, b4, c21 LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MUL c31, b3, c31 NMSUB c41, c41, b4, c31 LD b4, 15 * SIZE(BO) MUL c41, b4, c41 #endif #ifdef RT LD b5, 15 * SIZE(BO) LD b6, 14 * SIZE(BO) LD b7, 13 * SIZE(BO) LD b8, 12 * SIZE(BO) MUL c41, b5, c41 NMSUB c31, c31, b6, c41 NMSUB c21, c21, b7, c41 NMSUB c11, c11, b8, c41 LD b6, 10 * SIZE(BO) LD b7, 9 * SIZE(BO) LD b8, 8 * SIZE(BO) MUL c31, b6, c31 NMSUB c21, c21, b7, c31 NMSUB c11, c11, b8, c31 LD b7, 5 * SIZE(BO) LD b8, 4 * SIZE(BO) MUL c21, b7, c21 NMSUB c11, c11, b8, c21 LD b8, 0 * SIZE(BO) MUL c11, b8, c11 #endif #ifdef LN daddiu CO1, CO1, -1 * SIZE daddiu CO2, CO2, -1 * SIZE daddiu CO3, CO3, -1 * SIZE daddiu CO4, CO4, -1 * SIZE #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c21, 1 * SIZE(BO) ST c31, 2 * SIZE(BO) ST c41, 3 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c21, 1 * SIZE(AO) ST c31, 2 * SIZE(AO) ST c41, 3 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) ST c21, 0 * SIZE(CO2) ST c31, 0 * SIZE(CO3) ST c41, 0 * SIZE(CO4) MTC $0, 
c11 #ifndef LN daddiu CO1, CO1, 1 * SIZE daddiu CO2, CO2, 1 * SIZE daddiu CO3, CO3, 1 * SIZE daddiu CO4, CO4, 1 * SIZE #endif MOV c21, c11 #ifdef RT dsll TEMP, K, BASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif MOV c31, c11 #ifdef LT daddiu KK, KK, 1 #endif #ifdef LN daddiu KK, KK, -1 #endif .align 3 .L40: dsra I, M, 1 MOV c61, c11 blez I, .L49 MOV c41, c11 .L31: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) LD a3, 4 * SIZE(AO) LD b1, 0 * SIZE(B) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 LD b3, 2 * SIZE(B) MOV c32, c11 LD b4, 3 * SIZE(B) MOV c42, c11 LD b5, 4 * SIZE(B) dsra L, KK, 2 LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) blez L, .L35 move BO, B #else #ifdef LN dsll TEMP, K, 1 + BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 2 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) LD a3, 4 * SIZE(AO) LD b1, 0 * SIZE(BO) MOV c12, c11 LD b2, 1 * SIZE(BO) MOV c22, c11 LD b3, 2 * SIZE(BO) MOV c32, c11 LD b4, 3 * SIZE(BO) MOV c42, c11 LD b5, 4 * SIZE(BO) dsra L, TEMP, 2 LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) blez L, .L35 NOP #endif .align 3 .L32: MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 daddiu L, L, -1 MADD c31, c31, a1, b3 NOP MADD c41, c41, a1, b4 LD a1, 2 * SIZE(AO) MADD c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD c11, c11, a1, b5 LD a2, 3 * SIZE(AO) MADD c21, c21, a1, b2 NOP MADD c31, c31, a1, b3 NOP MADD c41, c41, a1, b4 LD a1, 8 * SIZE(AO) MADD c12, c12, a2, b5 LD b5, 20 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 9 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 10 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 11 * SIZE(BO) MADD c11, c11, a3, b6 LD a2, 5 * SIZE(AO) MADD c21, c21, a3, b2 NOP MADD c31, c31, a3, b3 NOP MADD c41, c41, a3, b4 LD a3, 6 * SIZE(AO) MADD c12, c12, a2, b6 LD b6, 24 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD c11, c11, a3, b7 LD a2, 7 * SIZE(AO) MADD c21, c21, a3, b2 daddiu AO, AO, 8 * SIZE MADD c31, c31, a3, b3 daddiu BO, BO, 16 * SIZE MADD c41, c41, a3, b4 LD a3, 4 * SIZE(AO) MADD c12, c12, a2, b7 LD b7, 12 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 1 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 2 * SIZE(BO) MADD c42, c42, a2, b4 NOP bgtz L, .L32 LD b4, 3 * SIZE(BO) .align 3 .L35: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L38 NOP .align 3 .L36: MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 daddiu L, L, -1 MADD c31, c31, a1, b3 daddiu AO, AO, 2 * SIZE MADD c41, c41, a1, b4 LD a1, 0 * SIZE(AO) MADD c12, c12, a2, b1 LD b1, 4 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) bgtz L, .L36 daddiu BO, BO, 4 * SIZE .L38: #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -2 #else daddiu TEMP, KK, -4 #endif dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) SUB c11, b1, c11 SUB c21, b2, c21 SUB c31, b3, c31 SUB c41, b4, c41 SUB c12, b5, 
c12 SUB c22, b6, c22 SUB c32, b7, c32 SUB c42, b8, c42 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) LD b5, 4 * SIZE(AO) LD b6, 5 * SIZE(AO) LD b7, 6 * SIZE(AO) LD b8, 7 * SIZE(AO) SUB c11, b1, c11 SUB c12, b2, c12 SUB c21, b3, c21 SUB c22, b4, c22 SUB c31, b5, c31 SUB c32, b6, c32 SUB c41, b7, c41 SUB c42, b8, c42 #endif #ifdef LN LD b1, 3 * SIZE(AO) LD b2, 2 * SIZE(AO) LD b3, 0 * SIZE(AO) MUL c12, b1, c12 MUL c22, b1, c22 MUL c32, b1, c32 MUL c42, b1, c42 NMSUB c11, c11, b2, c12 NMSUB c21, c21, b2, c22 NMSUB c31, c31, b2, c32 NMSUB c41, c41, b2, c42 MUL c11, b3, c11 MUL c21, b3, c21 MUL c31, b3, c31 MUL c41, b3, c41 #endif #ifdef LT LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 3 * SIZE(AO) MUL c11, b1, c11 MUL c21, b1, c21 MUL c31, b1, c31 MUL c41, b1, c41 NMSUB c12, c12, b2, c11 NMSUB c22, c22, b2, c21 NMSUB c32, c32, b2, c31 NMSUB c42, c42, b2, c41 MUL c12, b3, c12 MUL c22, b3, c22 MUL c32, b3, c32 MUL c42, b3, c42 #endif #ifdef RN LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MUL c11, b1, c11 MUL c12, b1, c12 NMSUB c21, c21, b2, c11 NMSUB c22, c22, b2, c12 NMSUB c31, c31, b3, c11 NMSUB c32, c32, b3, c12 NMSUB c41, c41, b4, c11 NMSUB c42, c42, b4, c12 LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) MUL c21, b2, c21 MUL c22, b2, c22 NMSUB c31, c31, b3, c21 NMSUB c32, c32, b3, c22 NMSUB c41, c41, b4, c21 NMSUB c42, c42, b4, c22 LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MUL c31, b3, c31 MUL c32, b3, c32 NMSUB c41, c41, b4, c31 NMSUB c42, c42, b4, c32 LD b4, 15 * SIZE(BO) MUL c41, b4, c41 MUL c42, b4, c42 #endif #ifdef RT LD b5, 15 * SIZE(BO) LD b6, 14 * SIZE(BO) LD b7, 13 * SIZE(BO) LD b8, 12 * SIZE(BO) MUL c41, b5, c41 MUL c42, b5, c42 NMSUB c31, c31, b6, c41 NMSUB c32, c32, b6, c42 NMSUB c21, c21, b7, c41 NMSUB c22, c22, b7, c42 NMSUB c11, c11, b8, c41 NMSUB c12, c12, b8, c42 LD b6, 10 * SIZE(BO) LD b7, 9 * SIZE(BO) LD b8, 8 * SIZE(BO) MUL c31, b6, c31 MUL c32, b6, c32 NMSUB c21, c21, b7, c31 NMSUB c22, c22, b7, c32 NMSUB c11, c11, b8, c31 NMSUB c12, c12, b8, c32 LD b7, 5 * SIZE(BO) LD b8, 4 * SIZE(BO) MUL c21, b7, c21 MUL c22, b7, c22 NMSUB c11, c11, b8, c21 NMSUB c12, c12, b8, c22 LD b8, 0 * SIZE(BO) MUL c11, b8, c11 MUL c12, b8, c12 #endif #ifdef LN daddiu CO1, CO1, -2 * SIZE daddiu CO2, CO2, -2 * SIZE daddiu CO3, CO3, -2 * SIZE daddiu CO4, CO4, -2 * SIZE #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c21, 1 * SIZE(BO) ST c31, 2 * SIZE(BO) ST c41, 3 * SIZE(BO) ST c12, 4 * SIZE(BO) ST c22, 5 * SIZE(BO) ST c32, 6 * SIZE(BO) ST c42, 7 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c12, 1 * SIZE(AO) ST c21, 2 * SIZE(AO) ST c22, 3 * SIZE(AO) ST c31, 4 * SIZE(AO) ST c32, 5 * SIZE(AO) ST c41, 6 * SIZE(AO) ST c42, 7 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) ST c12, 1 * SIZE(CO1) ST c21, 0 * SIZE(CO2) ST c22, 1 * SIZE(CO2) ST c31, 0 * SIZE(CO3) ST c32, 1 * SIZE(CO3) ST c41, 0 * SIZE(CO4) ST c42, 1 * SIZE(CO4) #ifndef LN daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE daddiu CO3, CO3, 2 * SIZE daddiu CO4, CO4, 2 * SIZE #endif #ifdef RT dsll TEMP, K, 1 + BASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 2 #endif #ifdef LN daddiu KK, KK, -2 #endif MTC $0, a1 MOV c11, a1 MOV c21, a1 MOV c31, a1 daddiu I, I, -1 bgtz I, .L31 MOV c41, c11 .align 3 .L49: #ifdef LN dsll TEMP, K, 2 + BASE_SHIFT daddu B, B, TEMP #endif #if defined(LT) || 
defined(RN) move B, BO #endif #ifdef RN daddiu KK, KK, 4 #endif #ifdef RT daddiu KK, KK, -4 #endif .align 3 .L50: andi J, N, 2 blez J, .L70 #ifdef RT dsll TEMP, K, 1 + BASE_SHIFT dsubu B, B, TEMP dsll TEMP, LDC, 1 dsubu C, C, TEMP #endif move AO, A move CO1, C daddu CO2, C, LDC #ifdef LN daddu KK, M, OFFSET #endif #ifdef LT move KK, OFFSET #endif #if defined(LN) || defined(RT) move AORIG, A #else move AO, A #endif #ifndef RT daddu C, CO2, LDC #endif andi I, M, 1 blez I, .L60 NOP #if defined(LT) || defined(RN) dsra L, KK, 2 LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a3, 2 * SIZE(AO) MOV c31, c11 LD a4, 3 * SIZE(AO) MOV c41, c11 LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) LD b5, 4 * SIZE(B) LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) blez L, .L65 move BO, B #else #ifdef LN dsll TEMP, K, BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, 0 + BASE_SHIFT dsll TEMP, KK, 1 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK dsra L, TEMP, 2 LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a3, 2 * SIZE(AO) MOV c31, c11 LD a4, 3 * SIZE(AO) MOV c41, c11 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) blez L, .L65 NOP #endif .align 3 .L62: MADD c11, c11, a1, b1 LD b1, 4 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 7 * SIZE(BO) LD a1, 4 * SIZE(AO) LD a2, 5 * SIZE(AO) MADD c11, c11, a3, b1 LD b1, 8 * SIZE(BO) MADD c21, c21, a3, b2 LD b2, 9 * SIZE(BO) MADD c31, c31, a4, b3 LD b3, 10 * SIZE(BO) MADD c41, c41, a4, b4 LD b4, 11 * SIZE(BO) LD a3, 6 * SIZE(AO) LD a4, 7 * SIZE(AO) daddiu L, L, -1 daddiu AO, AO, 4 * SIZE bgtz L, .L62 daddiu BO, BO, 8 * SIZE .align 3 .L65: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L68 NOP .align 3 .L66: MADD c11, c11, a1, b1 LD b1, 2 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 3 * SIZE(BO) LD a1, 1 * SIZE(AO) daddiu L, L, -1 daddiu AO, AO, 1 * SIZE bgtz L, .L66 daddiu BO, BO, 2 * SIZE .L68: ADD c11, c11, c31 ADD c21, c21, c41 #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -1 #else daddiu TEMP, KK, -2 #endif dsll L, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) SUB c11, b1, c11 SUB c21, b2, c21 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) SUB c11, b1, c11 SUB c21, b2, c21 #endif #if defined(LN) || defined(LT) LD b3, 0 * SIZE(AO) MUL c11, b3, c11 MUL c21, b3, c21 #endif #ifdef RN LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 3 * SIZE(BO) MUL c11, b1, c11 NMSUB c21, c21, b2, c11 MUL c21, b3, c21 #endif #ifdef RT LD b1, 3 * SIZE(BO) LD b2, 2 * SIZE(BO) LD b3, 0 * SIZE(BO) MUL c21, b1, c21 NMSUB c11, c11, b2, c21 MUL c11, b3, c11 #endif #ifdef LN daddiu CO1, CO1, -1 * SIZE daddiu CO2, CO2, -1 * SIZE #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c21, 1 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c21, 1 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) ST c21, 0 * SIZE(CO2) #ifndef LN daddiu CO1, CO1, 1 * SIZE daddiu CO2, CO2, 1 * SIZE #endif #ifdef RT dsll TEMP, K, 0 + BASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 1 #endif #ifdef LN daddiu KK, KK, -1 #endif .align 3 .L60: dsra I, 
M, 1 blez I, .L69 NOP .L51: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a5, 4 * SIZE(AO) LD b1, 0 * SIZE(B) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 LD b3, 2 * SIZE(B) LD b5, 4 * SIZE(B) dsra L, KK, 2 LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) blez L, .L55 move BO, B #else #ifdef LN dsll TEMP, K, 1 + BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 1 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a5, 4 * SIZE(AO) LD b1, 0 * SIZE(BO) MOV c12, c11 LD b2, 1 * SIZE(BO) MOV c22, c11 LD b3, 2 * SIZE(BO) LD b5, 4 * SIZE(BO) dsra L, TEMP, 2 LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) blez L, .L55 NOP #endif .align 3 .L52: MADD c11, c11, a1, b1 LD a3, 2 * SIZE(AO) MADD c21, c21, a1, b2 LD b4, 3 * SIZE(BO) MADD c12, c12, a2, b1 LD a4, 3 * SIZE(AO) MADD c22, c22, a2, b2 LD b1, 8 * SIZE(BO) MADD c11, c11, a3, b3 LD a1, 8 * SIZE(AO) MADD c21, c21, a3, b4 LD b2, 5 * SIZE(BO) MADD c12, c12, a4, b3 LD a2, 5 * SIZE(AO) MADD c22, c22, a4, b4 LD b3, 6 * SIZE(BO) MADD c11, c11, a5, b5 LD a3, 6 * SIZE(AO) MADD c21, c21, a5, b2 LD b4, 7 * SIZE(BO) MADD c12, c12, a2, b5 LD a4, 7 * SIZE(AO) MADD c22, c22, a2, b2 LD b5, 12 * SIZE(BO) MADD c11, c11, a3, b3 LD a5, 12 * SIZE(AO) MADD c21, c21, a3, b4 LD b2, 9 * SIZE(BO) MADD c12, c12, a4, b3 LD a2, 9 * SIZE(AO) MADD c22, c22, a4, b4 LD b3, 10 * SIZE(BO) daddiu AO, AO, 8 * SIZE daddiu L, L, -1 bgtz L, .L52 daddiu BO, BO, 8 * SIZE .align 3 .L55: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L58 NOP .align 3 .L56: MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 LD a1, 2 * SIZE(AO) MADD c12, c12, a2, b1 LD b1, 2 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 3 * SIZE(BO) daddiu L, L, -1 daddiu AO, AO, 2 * SIZE bgtz L, .L56 daddiu BO, BO, 2 * SIZE .L58: #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -2 #else daddiu TEMP, KK, -2 #endif dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) SUB c11, b1, c11 SUB c21, b2, c21 SUB c12, b3, c12 SUB c22, b4, c22 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) SUB c11, b1, c11 SUB c12, b2, c12 SUB c21, b3, c21 SUB c22, b4, c22 #endif #ifdef LN LD b1, 3 * SIZE(AO) LD b2, 2 * SIZE(AO) LD b3, 0 * SIZE(AO) MUL c12, b1, c12 MUL c22, b1, c22 NMSUB c11, c11, b2, c12 NMSUB c21, c21, b2, c22 MUL c11, b3, c11 MUL c21, b3, c21 #endif #ifdef LT LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 3 * SIZE(AO) MUL c11, b1, c11 MUL c21, b1, c21 NMSUB c12, c12, b2, c11 NMSUB c22, c22, b2, c21 MUL c12, b3, c12 MUL c22, b3, c22 #endif #ifdef RN LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 3 * SIZE(BO) MUL c11, b1, c11 MUL c12, b1, c12 NMSUB c21, c21, b2, c11 NMSUB c22, c22, b2, c12 MUL c21, b3, c21 MUL c22, b3, c22 #endif #ifdef RT LD b1, 3 * SIZE(BO) LD b2, 2 * SIZE(BO) LD b3, 0 * SIZE(BO) MUL c21, b1, c21 MUL c22, b1, c22 NMSUB c11, c11, b2, c21 NMSUB c12, c12, b2, c22 MUL c11, b3, c11 MUL c12, b3, c12 #endif #ifdef LN daddiu CO1, CO1, -2 * SIZE daddiu CO2, CO2, -2 * SIZE #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c21, 1 * SIZE(BO) ST c12, 2 * SIZE(BO) ST c22, 3 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c12, 1 * SIZE(AO) ST c21, 2 * SIZE(AO) ST c22, 3 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) ST c12, 1 * 
SIZE(CO1) ST c21, 0 * SIZE(CO2) ST c22, 1 * SIZE(CO2) #ifndef LN daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE #endif #ifdef RT dsll TEMP, K, 1 + BASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AO, TEMP daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 2 #endif #ifdef LN daddiu KK, KK, -2 #endif MTC $0, a1 MOV c11, a1 MOV c21, a1 MOV c31, a1 daddiu I, I, -1 bgtz I, .L51 MOV c41, c11 .align 3 .L69: #ifdef LN dsll TEMP, K, 1 + BASE_SHIFT daddu B, B, TEMP #endif #if defined(LT) || defined(RN) move B, BO #endif #ifdef RN daddiu KK, KK, 2 #endif #ifdef RT daddiu KK, KK, -2 #endif .align 3 .L70: andi J, N, 1 blez J, .L999 NOP #ifdef RT dsll TEMP, K, BASE_SHIFT dsubu B, B, TEMP dsubu C, C, LDC #endif move AO, A move CO1, C #ifdef LN daddu KK, M, OFFSET #endif #ifdef LT move KK, OFFSET #endif #if defined(LN) || defined(RT) move AORIG, A #else move AO, A #endif #ifndef RT daddu C, CO1, LDC #endif andi I, M, 1 blez I, .L80 NOP #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) LD b5, 4 * SIZE(B) LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) dsra L, KK, 2 blez L, .L85 move BO, B #else #ifdef LN dsll TEMP, K, BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll TEMP, KK, BASE_SHIFT daddu AO, AORIG, TEMP daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) dsra L, TEMP, 2 blez L, .L85 NOP #endif .align 3 .L82: LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD c11, c11, a1, b1 LD a1, 1 * SIZE(AO) LD b1, 1 * SIZE(BO) MADD c21, c21, a1, b1 LD a1, 2 * SIZE(AO) LD b1, 2 * SIZE(BO) MADD c11, c11, a1, b1 LD a1, 3 * SIZE(AO) LD b1, 3 * SIZE(BO) MADD c21, c21, a1, b1 daddiu L, L, -1 daddiu AO, AO, 4 * SIZE bgtz L, .L82 daddiu BO, BO, 4 * SIZE .align 3 .L85: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L88 NOP .align 3 .L86: LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD c11, c11, a1, b1 daddiu L, L, -1 daddiu AO, AO, 1 * SIZE bgtz L, .L86 daddiu BO, BO, 1 * SIZE .L88: ADD c11, c11, c21 #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -1 #else daddiu TEMP, KK, -1 #endif dsll TEMP, TEMP, 0 + BASE_SHIFT daddu AO, AORIG, TEMP daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) SUB c11, b1, c11 #else LD b1, 0 * SIZE(AO) SUB c11, b1, c11 #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(AO) MUL c11, b1, c11 #endif #if defined(RN) || defined(RT) LD b1, 0 * SIZE(BO) MUL c11, b1, c11 #endif #ifdef LN daddiu CO1, CO1, -1 * SIZE #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) #else ST c11, 0 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) #ifndef LN daddiu CO1, CO1, 1 * SIZE #endif #ifdef RT dsll TEMP, K, BASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll TEMP, TEMP, 0 + BASE_SHIFT daddu AO, AO, TEMP daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 1 #endif #ifdef LN daddiu KK, KK, -1 #endif .align 3 .L80: dsra I, M, 1 blez I, .L89 NOP .L71: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a5, 4 * SIZE(AO) LD b1, 0 * SIZE(B) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 LD b3, 2 * SIZE(B) LD 
b5, 4 * SIZE(B) dsra L, KK, 2 LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) blez L, .L75 move BO, B #else #ifdef LN dsll TEMP, K, 1 + BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 0 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a5, 4 * SIZE(AO) LD b1, 0 * SIZE(BO) MOV c12, c11 LD b2, 1 * SIZE(BO) MOV c22, c11 LD b3, 2 * SIZE(BO) LD b5, 4 * SIZE(BO) dsra L, TEMP, 2 LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) blez L, .L75 NOP #endif .align 3 .L72: LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 LD a1, 2 * SIZE(AO) LD a2, 3 * SIZE(AO) LD b1, 1 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 LD a1, 4 * SIZE(AO) LD a2, 5 * SIZE(AO) LD b1, 2 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 LD a1, 6 * SIZE(AO) LD a2, 7 * SIZE(AO) LD b1, 3 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 daddiu L, L, -1 daddiu AO, AO, 8 * SIZE bgtz L, .L72 daddiu BO, BO, 4 * SIZE .align 3 .L75: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L78 NOP .align 3 .L76: LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 daddiu L, L, -1 daddiu AO, AO, 2 * SIZE bgtz L, .L76 daddiu BO, BO, 1 * SIZE .L78: ADD c11, c11, c21 ADD c12, c12, c22 #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -2 #else daddiu TEMP, KK, -1 #endif dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 0 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) SUB c11, b1, c11 SUB c12, b2, c12 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) SUB c11, b1, c11 SUB c12, b2, c12 #endif #ifdef LN LD b1, 3 * SIZE(AO) LD b2, 2 * SIZE(AO) LD b3, 0 * SIZE(AO) MUL c12, b1, c12 NMSUB c11, c11, b2, c12 MUL c11, b3, c11 #endif #ifdef LT LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 3 * SIZE(AO) MUL c11, b1, c11 NMSUB c12, c12, b2, c11 MUL c12, b3, c12 #endif #if defined(RN) || defined(RT) LD b1, 0 * SIZE(BO) MUL c11, b1, c11 MUL c12, b1, c12 #endif #ifdef LN daddiu CO1, CO1, -2 * SIZE #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c12, 1 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c12, 1 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) ST c12, 1 * SIZE(CO1) #ifndef LN daddiu CO1, CO1, 2 * SIZE #endif #ifdef RT dsll TEMP, K, 1 + BASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 0 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 2 #endif #ifdef LN daddiu KK, KK, -2 #endif daddiu I, I, -1 bgtz I, .L71 NOP .align 3 .L89: #ifdef LN dsll TEMP, K, BASE_SHIFT daddu B, B, TEMP #endif #if defined(LT) || defined(RN) move B, BO #endif #ifdef RN daddiu KK, KK, 1 #endif #ifdef RT daddiu KK, KK, -1 #endif .align 3 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) LDARG $18, 16($sp) LDARG $19, 24($sp) LDARG $20, 32($sp) LDARG $21, 40($sp) ldc1 $f24, 48($sp) ldc1 $f25, 56($sp) ldc1 $f26, 64($sp) ldc1 $f27, 72($sp) ldc1 $f28, 80($sp) LDARG $22, 88($sp) LDARG $23, 96($sp) LDARG $24, 104($sp) LDARG $25, 112($sp) #ifndef __64BIT__ ldc1 $f20,112($sp) ldc1 $f21,120($sp) ldc1 $f22,128($sp) ldc1 $f23,136($sp) #endif j $31 daddiu $sp, $sp, 144 EPILOGUE OpenBLAS-0.2.20/kernel/mips64/trsm_kernel_LN_loongson3a.S000066400000000000000000001034701313527062700227570ustar00rootroot00000000000000#define REALNAME ASMNAME #define 
ASSEMBLER #include "common.h" #define M $4 #define N $5 #define K $6 #define A $8 #define B $9 #define C $10 #define LDC $11 #define AO $12 #define BO $13 #define I $2 #define J $3 #define L $7 #define CO1 $14 #define CO2 $15 #define CO3 $16 #define CO4 $17 #define OFFSET $22 #define KK $23 #define TEMP $24 #define AORIG $25 #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define a5 $f4 #define a6 $f5 #define a7 $f6 #define a8 $f7 #define b1 $f8 #define b2 $f9 #define b3 $f10 #define b4 $f11 #define b5 $f12 #define b6 $f13 #define b7 $f14 #define b8 $f15 #define t11 $f16 #define t21 $f17 #define t31 $f18 #define t41 $f19 #define t12 $f20 #define t22 $f21 #define t32 $f22 #define t42 $f23 #define t13 $f24 #define t23 $f25 #define t33 $f26 #define t43 $f27 #define t14 $f28 #define t24 $f29 #define t34 $f30 #define t44 $f31 #define ALPHA $f15 PROLOGUE daddiu $sp, $sp, -144 SDARG $16, 0($sp) SDARG $17, 8($sp) SDARG $18, 16($sp) SDARG $19, 24($sp) SDARG $20, 32($sp) SDARG $21, 40($sp) sdc1 $f24, 48($sp) sdc1 $f25, 56($sp) sdc1 $f26, 64($sp) sdc1 $f27, 72($sp) sdc1 $f28, 80($sp) SDARG $22, 88($sp) SDARG $23, 96($sp) SDARG $24, 104($sp) SDARG $25, 112($sp) #ifndef __64BIT__ sdc1 $f20,112($sp) sdc1 $f21,120($sp) sdc1 $f22,128($sp) sdc1 $f23,136($sp) #endif # LN compute from bottom to top LDARG OFFSET, 144($sp) dsll LDC, LDC, BASE_SHIFT # ldc mult M, K mflo TEMP # TEMP=MC*KC dsll TEMP, TEMP, BASE_SHIFT daddu A, A, TEMP # A move to the end of sa dsll TEMP, M, BASE_SHIFT daddu C, C, TEMP # C+=MC dsra J, N, 2 # j = nc/4 blez J, .L30 nop .L10: # nr=4 daddiu J, J, -1 move CO1, C daddu CO2, C, LDC daddu CO3, CO2, LDC daddu CO4, CO3, LDC MTC $0, t11 # clear result registers MOV t21, t11 MOV t31, t11 MOV t41, t11 MOV t12, t11 MOV t22, t11 MOV t32, t11 MOV t42, t11 daddu KK, M, OFFSET # kc - kk is the length of the rectangular data part of panel Ai move AORIG, A # reset A daddu C, CO4, LDC # fixed pointer C, the write back address andi I, M, 1 # mr=2,nr=4 blez I, .L50 nop dsll TEMP, K, BASE_SHIFT # mr=1 dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of Ai dsll L, KK, BASE_SHIFT # mr=1 dsll TEMP, KK, 2 + BASE_SHIFT # nr=4 daddu AO, AORIG, L # AO point to the rectangular data part daddu BO, B, TEMP dsubu TEMP, K, KK MOV t13, t11 # mr=2 MOV t23, t11 MOV t33, t11 MOV t43, t11 MOV t14, t11 MOV t24, t11 MOV t34, t11 MOV t44, t11 LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai LD b1, 0 * SIZE(BO) # get 4b LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) dsra L, TEMP, 2 blez L, .L55 nop .align 3 .L52: LD a5, 1 * SIZE(AO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a1, b1 # 1st compute MADD t12, t12, a1, b2 MADD t13, t13, a1, b3 MADD t14, t14, a1, b4 LD a3, 2 * SIZE(AO) LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MADD t11, t11, a5, b5 # 2ed compute MADD t12, t12, a5, b6 MADD t13, t13, a5, b7 MADD t14, t14, a5, b8 LD a7, 3 * SIZE(AO) LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MADD t11, t11, a3, b1 # 3rd compute MADD t12, t12, a3, b2 MADD t13, t13, a3, b3 MADD t14, t14, a3, b4 daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD t11, t11, a7, b5 # 4th compute MADD t12, t12, a7, b6 MADD t13, t13, a7, b7 MADD t14, t14, a7, b8 daddiu L, L, -1 bgtz L, .L52 nop .align 3 .L55: andi L, TEMP, 3 
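/* Editor's note, not in the original source; summary inferred from the surrounding
   code and its "# AO += 1mr*4kr / # BO += 4nr*4kr" comments: the .L52 loop above is
   the k-loop for the mr=1, nr=4 tile unrolled by four, the .L56 loop that follows
   consumes the remaining (k mod 3+1, i.e. k mod 4) iterations one at a time, and
   .L58 then performs the triangular solve for this tile. */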
blez L, .L58 nop .align 3 .L56: MADD t11, t11, a1, b1 # 3rd compute MADD t12, t12, a1, b2 MADD t13, t13, a1, b3 MADD t14, t14, a1, b4 daddiu AO, AO, 1 * SIZE # AO += 1mr daddiu BO, BO, 4 * SIZE # BO += 4nr LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) daddiu L, L, -1 bgtz L, .L56 nop .L58: # deal with the triangular part daddiu TEMP, KK, -1 dsll L, TEMP, BASE_SHIFT # mr=1 dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AORIG, L # Ao point to the triangular data part daddu BO, B, TEMP LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) SUB t11, b1, t11 SUB t12, b2, t12 SUB t13, b3, t13 SUB t14, b4, t14 LD b3, 0 * SIZE(AO) MUL t11, b3, t11 MUL t12, b3, t12 MUL t13, b3, t13 MUL t14, b3, t14 daddiu CO1, CO1, -1 * SIZE daddiu CO2, CO2, -1 * SIZE daddiu CO3, CO3, -1 * SIZE daddiu CO4, CO4, -1 * SIZE ST t11, 0 * SIZE(BO) ST t12, 1 * SIZE(BO) ST t13, 2 * SIZE(BO) ST t14, 3 * SIZE(BO) ST t11, 0 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t13, 0 * SIZE(CO3) ST t14, 0 * SIZE(CO4) daddiu KK, KK, -1 # the length of rectangular data part increases by 1 MTC $0, t11 # clear result registers MOV t21, t11 MOV t31, t11 MOV t41, t11 MOV t12, t11 MOV t22, t11 MOV t32, t11 MOV t42, t11 .L50: andi I, M, 2 # mr=2,nr=4 blez I, .L20 nop dsll TEMP, K, 1 + BASE_SHIFT dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of Ai dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 2 + BASE_SHIFT daddu AO, AORIG, L # AO point to the rectangular data part daddu BO, B, TEMP dsubu TEMP, K, KK MOV t13, t11 # mr=2 MOV t23, t11 MOV t33, t11 MOV t43, t11 MOV t14, t11 MOV t24, t11 MOV t34, t11 MOV t44, t11 LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai LD a2, 1 * SIZE(AO) # mr*KK with nr*KK LD b1, 0 * SIZE(BO) # get 4b LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) dsra L, TEMP, 2 blez L, .L25 nop .align 3 .L22: LD a5, 2 * SIZE(AO) LD a6, 3 * SIZE(AO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a1, b1 # 1st compute MADD t21, t21, a2, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t13, t13, a1, b3 MADD t23, t23, a2, b3 MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 LD a3, 4 * SIZE(AO) LD a4, 5 * SIZE(AO) LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MADD t11, t11, a5, b5 # 2ed compute MADD t21, t21, a6, b5 MADD t12, t12, a5, b6 MADD t22, t22, a6, b6 MADD t13, t13, a5, b7 MADD t23, t23, a6, b7 MADD t14, t14, a5, b8 MADD t24, t24, a6, b8 LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MADD t11, t11, a3, b1 # 3rd compute MADD t21, t21, a4, b1 MADD t12, t12, a3, b2 MADD t22, t22, a4, b2 MADD t13, t13, a3, b3 MADD t23, t23, a4, b3 MADD t14, t14, a3, b4 MADD t24, t24, a4, b4 daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD t11, t11, a7, b5 # 4th compute MADD t21, t21, a8, b5 MADD t12, t12, a7, b6 MADD t22, t22, a8, b6 MADD t13, t13, a7, b7 MADD t23, t23, a8, b7 MADD t14, t14, a7, b8 MADD t24, t24, a8, b8 daddiu L, L, -1 bgtz L, .L22 nop .align 3 .L25: andi L, TEMP, 3 blez L, .L28 nop .align 3 .L26: MADD t11, t11, a1, b1 # 3rd compute MADD t21, t21, a2, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t13, t13, a1, b3 MADD t23, t23, a2, b3 MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 daddiu AO, AO, 2 
* SIZE # AO += 2mr daddiu BO, BO, 4 * SIZE # BO += 4nr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) daddiu L, L, -1 bgtz L, .L26 nop .L28: # deal with the triangular part daddiu TEMP, KK, -2 dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AORIG, L # Ao point to the triangular data part daddu BO, B, TEMP LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) SUB t11, b1, t11 SUB t12, b2, t12 SUB t13, b3, t13 SUB t14, b4, t14 SUB t21, b5, t21 SUB t22, b6, t22 SUB t23, b7, t23 SUB t24, b8, t24 LD b1, 3 * SIZE(AO) # computes the triangular_part LD b2, 2 * SIZE(AO) MUL t21, b1, t21 MUL t22, b1, t22 MUL t23, b1, t23 MUL t24, b1, t24 NMSUB t11, t11, b2, t21 NMSUB t12, t12, b2, t22 NMSUB t13, t13, b2, t23 NMSUB t14, t14, b2, t24 LD b3, 0 * SIZE(AO) MUL t11, b3, t11 MUL t12, b3, t12 MUL t13, b3, t13 MUL t14, b3, t14 daddiu CO1, CO1, -2 * SIZE daddiu CO2, CO2, -2 * SIZE daddiu CO3, CO3, -2 * SIZE daddiu CO4, CO4, -2 * SIZE ST t11, 0 * SIZE(BO) ST t12, 1 * SIZE(BO) ST t13, 2 * SIZE(BO) ST t14, 3 * SIZE(BO) ST t21, 4 * SIZE(BO) ST t22, 5 * SIZE(BO) ST t23, 6 * SIZE(BO) ST t24, 7 * SIZE(BO) ST t11, 0 * SIZE(CO1) ST t21, 1 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) ST t13, 0 * SIZE(CO3) ST t23, 1 * SIZE(CO3) ST t14, 0 * SIZE(CO4) ST t24, 1 * SIZE(CO4) daddiu KK, KK, -2 # the length of rectangular data part increases by 2 MTC $0, t11 # clear result registers MOV t21, t11 MOV t31, t11 MOV t41, t11 MOV t12, t11 MOV t22, t11 MOV t32, t11 MOV t42, t11 .L20: dsra I, M, 2 # I=MC/4 blez I, .L29 nop .L11: # mr=4 dsll TEMP, K, 2 + BASE_SHIFT # TEMP=KC*MR*data_Byte dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of panel Ai dsll L, KK, 2 + BASE_SHIFT # KC-KK is the length of the rectangular data part of Ai dsll TEMP, KK, 2 + BASE_SHIFT # KK*NR*data_Byte daddu AO, AORIG, L # AO point to the rectangular data part daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai LD a2, 1 * SIZE(AO) # mr*KK with nr*KK LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) # get 4a LD b1, 0 * SIZE(BO) # get 4b LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MOV t13, t11 # clear result registers MOV t23, t11 MOV t33, t11 MOV t43, t11 MOV t14, t11 MOV t24, t11 MOV t34, t11 MOV t44, t11 dsra L, TEMP, 2 # L=(KC-offset)/4 blez L, .L15 nop .align 3 .L12: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a1, b1 # 1st compute MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 MADD t13, t13, a1, b3 MADD t23, t23, a2, b3 MADD t33, t33, a3, b3 MADD t43, t43, a4, b3 MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 MADD t34, t34, a3, b4 MADD t44, t44, a4, b4 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MADD t11, t11, a5, b5 # 2ed compute MADD t21, t21, a6, b5 MADD t31, t31, a7, b5 MADD t41, t41, a8, b5 MADD t12, t12, a5, b6 MADD t22, t22, a6, b6 MADD t32, t32, a7, b6 MADD t42, t42, a8, b6 MADD t13, t13, a5, b7 MADD t23, t23, a6, b7 MADD t33, t33, a7, b7 MADD t43, t43, a8, b7 MADD t14, t14, a5, b8 MADD t24, t24, a6, b8 MADD 
t34, t34, a7, b8 MADD t44, t44, a8, b8 LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) LD a7, 14 * SIZE(AO) LD a8, 15 * SIZE(AO) LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MADD t11, t11, a1, b1 # 3rd compute MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 MADD t13, t13, a1, b3 MADD t23, t23, a2, b3 MADD t33, t33, a3, b3 MADD t43, t43, a4, b3 MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 MADD t34, t34, a3, b4 MADD t44, t44, a4, b4 daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD t11, t11, a5, b5 # 4th compute MADD t21, t21, a6, b5 MADD t31, t31, a7, b5 MADD t41, t41, a8, b5 MADD t12, t12, a5, b6 MADD t22, t22, a6, b6 MADD t32, t32, a7, b6 MADD t42, t42, a8, b6 MADD t13, t13, a5, b7 MADD t23, t23, a6, b7 MADD t33, t33, a7, b7 MADD t43, t43, a8, b7 MADD t14, t14, a5, b8 MADD t24, t24, a6, b8 MADD t34, t34, a7, b8 MADD t44, t44, a8, b8 daddiu L, L, -1 bgtz L, .L12 nop .align 3 .L15: andi L, TEMP, 3 blez L, .L18 nop .align 3 .L16: MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 MADD t13, t13, a1, b3 MADD t23, t23, a2, b3 MADD t33, t33, a3, b3 MADD t43, t43, a4, b3 MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 MADD t34, t34, a3, b4 MADD t44, t44, a4, b4 daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 4 * SIZE # BO += 4nr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) daddiu L, L, -1 bgtz L, .L16 nop .L18: # deal with the triangular data part of panel Ai daddiu TEMP, KK, -4 # dsll L, TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AORIG, L # AO point to the triangular data part daddu BO, B, TEMP LD b1, 0 * SIZE(BO) # triangular_part*X + rectangular_part = B LD b2, 1 * SIZE(BO) # triangular_part*X = B - rectangular_part LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) SUB t11, b1, t11 SUB t12, b2, t12 SUB t13, b3, t13 SUB t14, b4, t14 LD b5, 4 * SIZE(BO) # sb store in row major LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) SUB t21, b5, t21 SUB t22, b6, t22 SUB t23, b7, t23 SUB t24, b8, t24 LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) SUB t31, b1, t31 SUB t32, b2, t32 SUB t33, b3, t33 SUB t34, b4, t34 LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) SUB t41, b5, t41 SUB t42, b6, t42 SUB t43, b7, t43 SUB t44, b8, t44 LD b1, 15 * SIZE(AO) LD b2, 14 * SIZE(AO) LD b4, 13 * SIZE(AO) LD b7, 12 * SIZE(AO) MUL t41, b1, t41 MUL t42, b1, t42 MUL t43, b1, t43 MUL t44, b1, t44 NMSUB t31, t31, b2, t41 NMSUB t32, t32, b2, t42 NMSUB t33, t33, b2, t43 NMSUB t34, t34, b2, t44 NMSUB t21, t21, b4, t41 NMSUB t22, t22, b4, t42 NMSUB t23, t23, b4, t43 NMSUB t24, t24, b4, t44 NMSUB t11, t11, b7, t41 NMSUB t12, t12, b7, t42 NMSUB t13, t13, b7, t43 NMSUB t14, t14, b7, t44 LD b3, 10 * SIZE(AO) LD b5, 9 * SIZE(AO) LD b8, 8 * SIZE(AO) MUL t31, b3, t31 MUL t32, b3, t32 MUL t33, b3, t33 MUL t34, b3, t34 NMSUB t21, t21, b5, t31 NMSUB t22, t22, b5, t32 NMSUB t23, t23, b5, t33 NMSUB t24, t24, b5, t34 NMSUB t11, t11, b8, t31 NMSUB t12, t12, b8, t32 NMSUB t13, 
t13, b8, t33 NMSUB t14, t14, b8, t34 LD b6, 5 * SIZE(AO) LD b1, 4 * SIZE(AO) MUL t21, b6, t21 MUL t22, b6, t22 MUL t23, b6, t23 MUL t24, b6, t24 NMSUB t11, t11, b1, t21 NMSUB t12, t12, b1, t22 NMSUB t13, t13, b1, t23 NMSUB t14, t14, b1, t24 LD b2, 0 * SIZE(AO) MUL t11, b2, t11 MUL t12, b2, t12 MUL t13, b2, t13 MUL t14, b2, t14 daddiu CO1, CO1, -4 * SIZE # modify daddiu CO2, CO2, -4 * SIZE daddiu CO3, CO3, -4 * SIZE daddiu CO4, CO4, -4 * SIZE ST t11, 0 * SIZE(BO) # update packed B ST t12, 1 * SIZE(BO) ST t13, 2 * SIZE(BO) ST t14, 3 * SIZE(BO) ST t21, 4 * SIZE(BO) ST t22, 5 * SIZE(BO) ST t23, 6 * SIZE(BO) ST t24, 7 * SIZE(BO) ST t31, 8 * SIZE(BO) ST t32, 9 * SIZE(BO) ST t33, 10 * SIZE(BO) ST t34, 11 * SIZE(BO) ST t41, 12 * SIZE(BO) ST t42, 13 * SIZE(BO) ST t43, 14 * SIZE(BO) ST t44, 15 * SIZE(BO) ST t11, 0 * SIZE(CO1) # write back ST t21, 1 * SIZE(CO1) ST t31, 2 * SIZE(CO1) ST t41, 3 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) ST t32, 2 * SIZE(CO2) ST t42, 3 * SIZE(CO2) ST t13, 0 * SIZE(CO3) ST t23, 1 * SIZE(CO3) ST t33, 2 * SIZE(CO3) ST t43, 3 * SIZE(CO3) ST t14, 0 * SIZE(CO4) ST t24, 1 * SIZE(CO4) ST t34, 2 * SIZE(CO4) ST t44, 3 * SIZE(CO4) daddiu KK, KK, -4 # KC-KK is the length of the rectangular data part, LN compute from bottom to top so KK-=4 daddiu I, I, -1 MTC $0, a1 MOV t11, a1 MOV t21, a1 MOV t31, a1 MOV t41, a1 MOV t12, a1 MOV t22, a1 MOV t32, a1 MOV t42, a1 bgtz I, .L11 nop .align 3 .L29: dsll TEMP, K, 2 + BASE_SHIFT daddu B, B, TEMP # B point to next Bj bgtz J, .L10 nop .align 3 .L30: andi J, N, 2 # nr=2 blez J, .L70 nop move CO1, C daddu CO2, C, LDC MTC $0, t11 # clear result regusters MOV t21, t11 MOV t31, t11 MOV t41, t11 daddu KK, M, OFFSET move AORIG, A # reset A daddu C, CO2, LDC # fixed andi I, M, 1 # mr=1 blez I, .L60 nop dsll TEMP, K, BASE_SHIFT dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of everypanel of Ai dsll L, KK, BASE_SHIFT # mr=1 dsll TEMP, KK, 1 + BASE_SHIFT # nr=2 daddu AO, AORIG, L # AO point to rectangular data part daddu BO, B, TEMP dsubu TEMP, K, KK MOV t12, t11 # clear result registers MOV t22, t11 MOV t32, t11 MOV t42, t11 LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) dsra L, TEMP, 2 blez L, .L65 nop .align 3 .L62: LD a5, 1 * SIZE(AO) LD b5, 2 * SIZE(BO) LD b6, 3 * SIZE(BO) MADD t11, t11, a1, b1 # 1st compute MADD t12, t12, a1, b2 LD a3, 2 * SIZE(AO) LD b3, 4 * SIZE(BO) LD b4, 5 * SIZE(BO) MADD t11, t11, a5, b5 # 2ed compute MADD t12, t12, a5, b6 LD a7, 3 * SIZE(AO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a3, b3 # 3rd compute MADD t12, t12, a3, b4 daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD t11, t11, a7, b7 # 4th compute MADD t12, t12, a7, b8 daddiu L, L, -1 bgtz L, .L62 nop .align 3 .L65: andi L, TEMP, 3 blez L, .L68 nop .align 3 .L66: MADD t11, t11, a1, b1 # 3rd compute MADD t21, t21, a2, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 daddiu AO, AO, 1 * SIZE # AO += mr daddiu BO, BO, 2 * SIZE # BO += 2nr LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) daddiu L, L, -1 bgtz L, .L66 nop .L68: daddiu TEMP, KK, -1 # mr=1 dsll L, TEMP, BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AORIG, L # Ao point to the triangular data part daddu BO, B, TEMP LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) SUB t11, b1, t11 SUB t12, b2, t12 LD b3, 0 * SIZE(AO) MUL t11, b3, t11 MUL t12, b3, t12 daddiu CO1, CO1, -1 * SIZE daddiu CO2, CO2, -1 * SIZE ST t11, 0 * SIZE(BO) ST t12, 1 * SIZE(BO) 
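/* Editor's note (hedged, inferred from the code): the solved 1x2 tile is written
   back twice, to the packed B buffer via BO above and to the two C columns via
   CO1/CO2 below, matching the "# update packed B" / "# write back" pattern of the
   mr=4 path; KK is then decremented because the LN variant walks the panel from
   bottom to top. */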
ST t11, 0 * SIZE(CO1) ST t12, 0 * SIZE(CO2) daddiu KK, KK, -1 MTC $0, t11 # clear result regusters MOV t21, t11 MOV t31, t11 MOV t41, t11 .L60: andi I, M, 2 blez I, .L40 nop dsll TEMP, K, 1 + BASE_SHIFT dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of everypanel of Ai dsll L, KK, 1 + BASE_SHIFT # mr=2 dsll TEMP, KK, 1 + BASE_SHIFT # nr=2 daddu AO, AORIG, L # AO point to rectangular data part daddu BO, B, TEMP dsubu TEMP, K, KK MOV t12, t11 # clear result registers MOV t22, t11 MOV t32, t11 MOV t42, t11 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) dsra L, TEMP, 2 blez L, .L45 nop .align 3 .L42: LD a5, 2 * SIZE(AO) LD a6, 3 * SIZE(AO) LD b5, 2 * SIZE(BO) LD b6, 3 * SIZE(BO) MADD t11, t11, a1, b1 # 1st compute MADD t21, t21, a2, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 LD a3, 4 * SIZE(AO) LD a4, 5 * SIZE(AO) LD b3, 4 * SIZE(BO) LD b4, 5 * SIZE(BO) MADD t11, t11, a5, b5 # 2ed compute MADD t21, t21, a6, b5 MADD t12, t12, a5, b6 MADD t22, t22, a6, b6 LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a3, b3 # 3rd compute MADD t21, t21, a4, b3 MADD t12, t12, a3, b4 MADD t22, t22, a4, b4 daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD t11, t11, a7, b7 # 4th compute MADD t21, t21, a8, b7 MADD t12, t12, a7, b8 MADD t22, t22, a8, b8 daddiu L, L, -1 bgtz L, .L42 nop .align 3 .L45: andi L, TEMP, 3 blez L, .L48 nop .align 3 .L46: MADD t11, t11, a1, b1 # 3rd compute MADD t21, t21, a2, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 2 * SIZE # BO += 2nr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) daddiu L, L, -1 bgtz L, .L46 nop .L48: daddiu TEMP, KK, -2 dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AORIG, L # Ao point to the triangular data part daddu BO, B, TEMP LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) SUB t11, b1, t11 SUB t12, b2, t12 SUB t21, b3, t21 SUB t22, b4, t22 LD b1, 3 * SIZE(AO) # computes the triangular_part LD b2, 2 * SIZE(AO) MUL t21, b1, t21 MUL t22, b1, t22 NMSUB t11, t11, b2, t21 NMSUB t12, t12, b2, t22 LD b3, 0 * SIZE(AO) MUL t11, b3, t11 MUL t12, b3, t12 daddiu CO1, CO1, -2 * SIZE daddiu CO2, CO2, -2 * SIZE ST t11, 0 * SIZE(BO) ST t12, 1 * SIZE(BO) ST t21, 2 * SIZE(BO) ST t22, 3 * SIZE(BO) ST t11, 0 * SIZE(CO1) ST t21, 1 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) daddiu KK, KK, -2 MTC $0, t11 # clear result regusters MOV t21, t11 MOV t31, t11 MOV t41, t11 .L40: dsra I, M, 2 # I = mc/4 blez I, .L49 nop .L31: dsll TEMP, K, 2 + BASE_SHIFT dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of panel Ai dsll L, KK, 2 + BASE_SHIFT # mr=4 dsll TEMP, KK, 1 + BASE_SHIFT # nr=2 daddu AO, AORIG, L # AO point to the rectangular data part daddu BO, B, TEMP dsubu TEMP, K, KK MOV t12, t11 MOV t22, t11 MOV t32, t11 MOV t42, t11 LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai LD a2, 1 * SIZE(AO) # mr*KK with nr*KK LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) # get 4a LD b1, 0 * SIZE(BO) # get 4b LD b2, 1 * SIZE(BO) dsra L, TEMP, 2 blez L, .L35 nop .align 3 .L32: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b5, 2 * SIZE(BO) LD b6, 3 * SIZE(BO) MADD t11, t11, a1, b1 # 1st compute MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 
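/* Editor's note, not in the original source: within each "compute" group of the
   .L32 loop, the four MADDs above accumulate the b1 column into t11..t41 and the
   four MADDs below accumulate the b2 column into t12..t42, i.e. one rank-1 update
   of the 4x2 accumulator tile per k step, with four such steps unrolled per pass. */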
MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) LD b3, 4 * SIZE(BO) LD b4, 5 * SIZE(BO) MADD t11, t11, a5, b5 # 2ed compute MADD t21, t21, a6, b5 MADD t31, t31, a7, b5 MADD t41, t41, a8, b5 MADD t12, t12, a5, b6 MADD t22, t22, a6, b6 MADD t32, t32, a7, b6 MADD t42, t42, a8, b6 LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) LD a7, 14 * SIZE(AO) LD a8, 15 * SIZE(AO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a1, b3 # 3rd compute MADD t21, t21, a2, b3 MADD t31, t31, a3, b3 MADD t41, t41, a4, b3 MADD t12, t12, a1, b4 MADD t22, t22, a2, b4 MADD t32, t32, a3, b4 MADD t42, t42, a4, b4 daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD t11, t11, a5, b7 # 4th compute MADD t21, t21, a6, b7 MADD t31, t31, a7, b7 MADD t41, t41, a8, b7 MADD t12, t12, a5, b8 MADD t22, t22, a6, b8 MADD t32, t32, a7, b8 MADD t42, t42, a8, b8 daddiu L, L, -1 bgtz L, .L32 nop .align 3 .L35: andi L, TEMP, 3 blez L, .L38 nop .align 3 .L36: MADD t11, t11, a1, b1 # 3rd compute MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 2 * SIZE # BO += 2nr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) daddiu L, L, -1 bgtz L, .L36 nop .L38: # daddiu TEMP, KK, -4 dsll L, TEMP, 2 + BASE_SHIFT # mr=4 dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2 daddu AO, AORIG, L # AO point to the triangular data part daddu BO, B, TEMP LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) SUB t11, b1, t11 SUB t12, b2, t12 SUB t21, b3, t21 SUB t22, b4, t22 SUB t31, b5, t31 SUB t32, b6, t32 SUB t41, b7, t41 SUB t42, b8, t42 LD b1, 15 * SIZE(AO) LD b2, 14 * SIZE(AO) LD b4, 13 * SIZE(AO) LD b7, 12 * SIZE(AO) MUL t41, b1, t41 MUL t42, b1, t42 NMSUB t31, t31, b2, t41 NMSUB t32, t32, b2, t42 NMSUB t21, t21, b4, t41 NMSUB t22, t22, b4, t42 NMSUB t11, t11, b7, t41 NMSUB t12, t12, b7, t42 LD b3, 10 * SIZE(AO) LD b5, 9 * SIZE(AO) LD b8, 8 * SIZE(AO) MUL t31, b3, t31 MUL t32, b3, t32 NMSUB t21, t21, b5, t31 NMSUB t22, t22, b5, t32 NMSUB t11, t11, b8, t31 NMSUB t12, t12, b8, t32 LD b6, 5 * SIZE(AO) LD b1, 4 * SIZE(AO) MUL t21, b6, t21 MUL t22, b6, t22 NMSUB t11, t11, b1, t21 NMSUB t12, t12, b1, t22 LD b2, 0 * SIZE(AO) MUL t11, b2, t11 MUL t12, b2, t12 daddiu CO1, CO1, -4 * SIZE daddiu CO2, CO2, -4 * SIZE ST t11, 0 * SIZE(BO) ST t12, 1 * SIZE(BO) ST t21, 2 * SIZE(BO) ST t22, 3 * SIZE(BO) ST t31, 4 * SIZE(BO) ST t32, 5 * SIZE(BO) ST t41, 6 * SIZE(BO) ST t42, 7 * SIZE(BO) ST t11, 0 * SIZE(CO1) ST t21, 1 * SIZE(CO1) ST t31, 2 * SIZE(CO1) ST t41, 3 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) ST t32, 2 * SIZE(CO2) ST t42, 3 * SIZE(CO2) daddiu KK, KK, -4 MTC $0, t11 MOV t21, t11 MOV t31, t11 MOV t41, t11 daddiu I, I, -1 bgtz I, .L31 nop .align 3 .L49: dsll TEMP, K, 1 + BASE_SHIFT # nr=2 daddu B, B, TEMP .align 3 .L70: andi J, N, 1 # nr=1 blez J, .L999 # END nop move CO1, C daddu KK, M, OFFSET move AORIG, A # reset A andi I, M, 1 # mr=1 blez I, .L90 NOP MTC $0, t11 dsll TEMP, K, BASE_SHIFT # mr=1 dsubu AORIG, AORIG, TEMP dsll L, KK, BASE_SHIFT daddu 
AO, AORIG, L # AO point to the rectangular data part daddu BO, B, L dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) dsra L, TEMP, 2 blez L, .L95 nop .align 3 .L92: LD a5, 1 * SIZE(AO) LD b5, 1 * SIZE(BO) MADD t11, t11, a1, b1 # 1st compute LD a3, 2 * SIZE(AO) LD b3, 2 * SIZE(BO) MADD t11, t11, a5, b5 # 2ed compute LD a7, 3 * SIZE(AO) LD b7, 3 * SIZE(BO) MADD t11, t11, a3, b3 # 3rd compute daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) MADD t11, t11, a7, b7 # 4th compute daddiu L, L, -1 bgtz L, .L92 nop .align 3 .L95: andi L, TEMP, 3 blez L, .L98 nop .align 3 .L96: MADD t11, t11, a1, b1 # 3rd compute daddiu AO, AO, 1 * SIZE # AO += 1mr daddiu BO, BO, 1 * SIZE # BO += 1nr LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) daddiu L, L, -1 bgtz L, .L96 nop .L98: daddiu TEMP, KK, -1 # mr=2 dsll TEMP, TEMP, BASE_SHIFT daddu AO, AORIG, TEMP # AO point to the triangular data part daddu BO, B, TEMP LD b1, 0 * SIZE(BO) SUB t11, b1, t11 LD b3, 0 * SIZE(AO) MUL t11, b3, t11 daddiu CO1, CO1, -1 * SIZE ST t11, 0 * SIZE(BO) ST t11, 0 * SIZE(CO1) daddiu KK, KK, -1 .L90: andi I, M, 2 blez I, .L80 NOP MTC $0, t11 MOV t21, t11 # clear result registers dsll TEMP, K, 1+BASE_SHIFT # mr=2 dsubu AORIG, AORIG, TEMP dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 0 + BASE_SHIFT daddu AO, AORIG, L # AO point to the rectangular data part daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) dsra L, TEMP, 2 blez L, .L85 nop .align 3 .L82: LD a5, 2 * SIZE(AO) LD a6, 3 * SIZE(AO) LD b5, 1 * SIZE(BO) MADD t11, t11, a1, b1 # 1st compute MADD t21, t21, a2, b1 LD a3, 4 * SIZE(AO) LD a4, 5 * SIZE(AO) LD b3, 2 * SIZE(BO) MADD t11, t11, a5, b5 # 2ed compute MADD t21, t21, a6, b5 LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b7, 3 * SIZE(BO) MADD t11, t11, a3, b3 # 3rd compute MADD t21, t21, a4, b3 daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD t11, t11, a7, b7 # 4th compute MADD t21, t21, a8, b7 daddiu L, L, -1 bgtz L, .L82 nop .align 3 .L85: andi L, TEMP, 3 blez L, .L88 nop .align 3 .L86: MADD t11, t11, a1, b1 # 3rd compute MADD t21, t21, a2, b1 daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 1 * SIZE # BO += 1nr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) daddiu L, L, -1 bgtz L, .L86 nop .L88: daddiu TEMP, KK, -2 # mr=2 dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 0 + BASE_SHIFT daddu AO, AORIG, L # AO point to the triangular data part daddu BO, B, TEMP LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) SUB t11, b1, t11 SUB t21, b2, t21 LD b1, 3 * SIZE(AO) # computes the triangular_part LD b2, 2 * SIZE(AO) MUL t21, b1, t21 NMSUB t11, t11, b2, t21 LD b3, 0 * SIZE(AO) MUL t11, b3, t11 daddiu CO1, CO1, -2 * SIZE ST t11, 0 * SIZE(BO) ST t21, 1 * SIZE(BO) ST t11, 0 * SIZE(CO1) ST t21, 1 * SIZE(CO1) daddiu KK, KK, -2 .align 3 .L80: dsra I, M, 2 blez I, .L89 nop .L71: dsll TEMP, K, 2 + BASE_SHIFT # mr=4 dsubu AORIG, AORIG, TEMP dsll L, KK, 2 + BASE_SHIFT # mr=4 dsll TEMP, KK, 0 + BASE_SHIFT # nr=1 daddu AO, AORIG, L # AO point to the rectangular daddu BO, B, TEMP dsubu TEMP, K, KK MTC $0, t11 # clear result regusters MOV t21, t11 MOV t31, t11 MOV t41, t11 LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai LD a2, 1 * SIZE(AO) # mr*KK with nr*KK LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) # get 4a LD b1, 0 * SIZE(BO) # get 4b dsra L, TEMP, 2 blez L, .L75 nop # reset B .align 3 
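/* Editor's note (hedged summary inferred from the surrounding code): .L72 below is
   the main k-loop for the mr=4, nr=1 tile, unrolled by four ("1st".."4th compute");
   .L76 handles the k mod 4 remainder, and .L78 performs the back-substitution with
   the 4x4 triangular block of A, reading its entries from offset 15 down to 0. */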
.L72: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b5, 1 * SIZE(BO) MADD t11, t11, a1, b1 # 1st compute MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) LD b3, 2 * SIZE(BO) MADD t11, t11, a5, b5 # 2ed compute MADD t21, t21, a6, b5 MADD t31, t31, a7, b5 MADD t41, t41, a8, b5 LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) LD a7, 14 * SIZE(AO) LD a8, 15 * SIZE(AO) LD b7, 3 * SIZE(BO) MADD t11, t11, a1, b3 # 3rd compute MADD t21, t21, a2, b3 MADD t31, t31, a3, b3 MADD t41, t41, a4, b3 daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD t11, t11, a5, b7 # 4th compute MADD t21, t21, a6, b7 MADD t31, t31, a7, b7 MADD t41, t41, a8, b7 daddiu L, L, -1 bgtz L, .L72 nop .align 3 .L75: andi L, TEMP, 3 blez L, .L78 nop .align 3 .L76: MADD t11, t11, a1, b1 # 3rd compute MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 1 * SIZE # BO += 1nr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) daddiu L, L, -1 bgtz L, .L76 nop .L78: daddiu TEMP, KK, -4 # mr=4 dsll L, TEMP, 2 + BASE_SHIFT # mr=4 dsll TEMP, TEMP, 0 + BASE_SHIFT # nr=1 daddu AO, AORIG, L # AO point to the triangular daddu BO, B, TEMP LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) SUB t11, b1, t11 SUB t21, b2, t21 SUB t31, b3, t31 SUB t41, b4, t41 LD b1, 15 * SIZE(AO) LD b2, 14 * SIZE(AO) LD b4, 13 * SIZE(AO) LD b7, 12 * SIZE(AO) MUL t41, b1, t41 NMSUB t31, t31, b2, t41 NMSUB t21, t21, b4, t41 NMSUB t11, t11, b7, t41 LD b3, 10 * SIZE(AO) LD b5, 9 * SIZE(AO) LD b8, 8 * SIZE(AO) MUL t31, b3, t31 NMSUB t21, t21, b5, t31 NMSUB t11, t11, b8, t31 LD b6, 5 * SIZE(AO) LD b1, 4 * SIZE(AO) MUL t21, b6, t21 NMSUB t11, t11, b1, t21 LD b2, 0 * SIZE(AO) MUL t11, b2, t11 daddiu CO1, CO1, -4 * SIZE ST t11, 0 * SIZE(BO) ST t21, 1 * SIZE(BO) ST t31, 2 * SIZE(BO) ST t41, 3 * SIZE(BO) ST t11, 0 * SIZE(CO1) ST t21, 1 * SIZE(CO1) ST t31, 2 * SIZE(CO1) ST t41, 3 * SIZE(CO1) daddiu KK, KK, -4 daddiu I, I, -1 bgtz I, .L71 nop .align 3 .L89: dsll TEMP, K, BASE_SHIFT # nr=1 daddu B, B, TEMP .align 3 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) LDARG $18, 16($sp) LDARG $19, 24($sp) LDARG $20, 32($sp) LDARG $21, 40($sp) ldc1 $f24, 48($sp) ldc1 $f25, 56($sp) ldc1 $f26, 64($sp) ldc1 $f27, 72($sp) ldc1 $f28, 80($sp) LDARG $22, 88($sp) LDARG $23, 96($sp) LDARG $24, 104($sp) LDARG $25, 112($sp) #ifndef __64BIT__ ldc1 $f20,112($sp) ldc1 $f21,120($sp) ldc1 $f22,128($sp) ldc1 $f23,136($sp) #endif j $31 daddiu $sp, $sp, 144 EPILOGUE OpenBLAS-0.2.20/kernel/mips64/trsm_kernel_LT.S000066400000000000000000001623651313527062700206330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M $4 #define N $5 #define K $6 #define A $8 #define B $9 #define C $10 #define LDC $11 #define AO $12 #define BO $13 #define I $2 #define J $3 #define L $7 #define CO1 $14 #define CO2 $15 #define CO3 $16 #define CO4 $17 #define CO5 $18 #define CO6 $19 #define CO7 $20 #define CO8 $21 #define OFFSET $22 #define KK $23 #define TEMP $24 #define AORIG $25 #define a1 $f0 #define a2 $f1 #define a3 $f27 #define a4 $f28 #define b1 $f2 #define b2 $f3 #define b3 $f4 #define b4 $f5 #define b5 $f6 #define b6 $f7 #define b7 $f8 #define b8 $f9 #define a5 b8 #define c11 $f10 #define c12 $f11 #define c21 $f12 #define c22 $f13 #define c31 $f14 #define c32 $f16 #define c41 $f17 #define c42 $f18 #define c51 $f19 #define c52 $f20 #define c61 $f21 #define c62 $f22 #define c71 $f23 #define c72 $f24 #define c81 $f25 #define c82 $f26 #define ALPHA $f15 PROLOGUE daddiu $sp, $sp, -144 SDARG $16, 0($sp) SDARG $17, 8($sp) SDARG $18, 16($sp) SDARG $19, 24($sp) SDARG $20, 32($sp) SDARG $21, 40($sp) sdc1 $f24, 48($sp) sdc1 $f25, 56($sp) sdc1 $f26, 64($sp) sdc1 $f27, 72($sp) sdc1 $f28, 80($sp) SDARG $22, 88($sp) SDARG $23, 96($sp) SDARG $24, 104($sp) SDARG $25, 112($sp) #ifndef __64BIT__ sdc1 $f20,112($sp) sdc1 $f21,120($sp) sdc1 $f22,128($sp) sdc1 $f23,136($sp) #endif LDARG OFFSET, 144($sp) dsll LDC, LDC, BASE_SHIFT #ifdef LN mult M, K mflo TEMP dsll TEMP, TEMP, BASE_SHIFT daddu A, A, TEMP dsll TEMP, M, BASE_SHIFT daddu C, C, TEMP #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mult N, K mflo TEMP dsll TEMP, TEMP, BASE_SHIFT daddu B, B, TEMP mult N, LDC mflo TEMP daddu C, C, TEMP dsubu KK, N, OFFSET #endif dsra J, N, 3 blez J, .L30 nop .L10: #ifdef RT dsll TEMP, K, 3 + BASE_SHIFT dsubu B, B, TEMP dsll TEMP, LDC, 3 dsubu C, C, TEMP #endif move CO1, C MTC $0, c11 daddu CO2, C, LDC daddu CO3, CO2, LDC daddiu J, J, -1 daddu CO4, CO3, LDC MOV c21, c11 daddu CO5, CO4, LDC MOV c31, c11 daddu CO6, CO5, LDC MOV c41, c11 daddu CO7, CO6, LDC MOV c51, c11 daddu CO8, CO7, LDC dsra I, M, 1 #ifdef LN daddu KK, M, OFFSET #endif #ifdef LT move KK, OFFSET #endif #if defined(LN) || defined(RT) move AORIG, A 
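/* Editor's note, not part of the original source: in the LN/RT variants AORIG keeps
   the base of the current A panel and AO is recomputed per tile from KK further
   below, whereas the LT/RN variants simply advance AO forward from A. */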
#else move AO, A #endif #ifndef RT daddu C, CO8, LDC #endif blez I, .L20 MOV c61, c11 .L11: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) MOV c71, c11 LD b1, 0 * SIZE(B) MOV c81, c11 LD a3, 4 * SIZE(AO) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 dsra L, KK, 2 MOV c32, c11 LD b3, 2 * SIZE(B) MOV c42, c11 LD b4, 3 * SIZE(B) MOV c52, c11 LD b5, 4 * SIZE(B) MOV c62, c11 LD b6, 8 * SIZE(B) MOV c72, c11 LD b7, 12 * SIZE(B) MOV c82, c11 blez L, .L15 move BO, B #else #ifdef LN dsll TEMP, K, 1 + BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 3 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) MOV c71, c11 LD b1, 0 * SIZE(BO) MOV c81, c11 LD a3, 4 * SIZE(AO) MOV c12, c11 LD b2, 1 * SIZE(BO) MOV c22, c11 dsra L, TEMP, 2 MOV c32, c11 LD b3, 2 * SIZE(BO) MOV c42, c11 LD b4, 3 * SIZE(BO) MOV c52, c11 LD b5, 4 * SIZE(BO) MOV c62, c11 LD b6, 8 * SIZE(BO) MOV c72, c11 LD b7, 12 * SIZE(BO) MOV c82, c11 blez L, .L15 NOP #endif MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 daddiu L, L, -1 MADD c31, c31, a1, b3 blez L, .L13 MADD c41, c41, a1, b4 NOP .align 3 .L12: MADD c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD c51, c51, a1, b5 NOP MADD c61, c61, a1, b2 LD a4, 2 * SIZE(AO) MADD c71, c71, a1, b3 NOP MADD c81, c81, a1, b4 LD a1, 8 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 20 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 9 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 10 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 11 * SIZE(BO) MADD c11, c11, a4, b6 LD a2, 3 * SIZE(AO) MADD c21, c21, a4, b2 NOP MADD c31, c31, a4, b3 NOP MADD c41, c41, a4, b4 NOP MADD c12, c12, a2, b6 LD b6, 24 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD c51, c51, a4, b7 NOP MADD c61, c61, a4, b2 NOP MADD c71, c71, a4, b3 NOP MADD c81, c81, a4, b4 NOP MADD c52, c52, a2, b7 LD b7, 28 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 17 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 18 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 19 * SIZE(BO) MADD c11, c11, a3, b1 LD a2, 5 * SIZE(AO) MADD c21, c21, a3, b2 NOP MADD c31, c31, a3, b3 NOP MADD c41, c41, a3, b4 NOP MADD c12, c12, a2, b1 LD b1, 32 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 21 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 22 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 23 * SIZE(BO) MADD c51, c51, a3, b5 NOP MADD c61, c61, a3, b2 LD a4, 6 * SIZE(AO) MADD c71, c71, a3, b3 NOP MADD c81, c81, a3, b4 LD a3, 12 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 36 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 25 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 26 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 27 * SIZE(BO) MADD c11, c11, a4, b6 LD a2, 7 * SIZE(AO) MADD c21, c21, a4, b2 NOP MADD c31, c31, a4, b3 NOP MADD c41, c41, a4, b4 daddiu L, L, -1 MADD c12, c12, a2, b6 LD b6, 40 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 29 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 30 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 31 * SIZE(BO) MADD c51, c51, a4, b7 daddiu BO, BO, 32 * SIZE MADD c61, c61, a4, b2 daddiu AO, AO, 8 * SIZE MADD c71, c71, a4, b3 NOP MADD c81, c81, a4, b4 NOP MADD c52, c52, a2, b7 LD b7, 12 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 3 * SIZE(BO) MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 NOP MADD c31, c31, a1, b3 bgtz L, .L12 MADD c41, c41, a1, b4 NOP 
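/* Editor's note (hedged interpretation): .L13 below appears to be the drain of the
   software-pipelined .L12 loop; it repeats the loop body one final time without the
   back-branch so that the MADD/LD group issued before the loop exit is completed. */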
.align 3 .L13: MADD c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD c51, c51, a1, b5 NOP MADD c61, c61, a1, b2 LD a4, 2 * SIZE(AO) MADD c71, c71, a1, b3 NOP MADD c81, c81, a1, b4 LD a1, 8 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 20 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 9 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 10 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 11 * SIZE(BO) MADD c11, c11, a4, b6 LD a2, 3 * SIZE(AO) MADD c21, c21, a4, b2 NOP MADD c31, c31, a4, b3 NOP MADD c41, c41, a4, b4 NOP MADD c12, c12, a2, b6 LD b6, 24 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD c51, c51, a4, b7 NOP MADD c61, c61, a4, b2 NOP MADD c71, c71, a4, b3 NOP MADD c81, c81, a4, b4 NOP MADD c52, c52, a2, b7 LD b7, 28 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 17 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 18 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 19 * SIZE(BO) MADD c11, c11, a3, b1 LD a2, 5 * SIZE(AO) MADD c21, c21, a3, b2 NOP MADD c31, c31, a3, b3 NOP MADD c41, c41, a3, b4 NOP MADD c12, c12, a2, b1 LD b1, 32 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 21 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 22 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 23 * SIZE(BO) MADD c51, c51, a3, b5 NOP MADD c61, c61, a3, b2 LD a4, 6 * SIZE(AO) MADD c71, c71, a3, b3 NOP MADD c81, c81, a3, b4 LD a3, 12 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 36 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 25 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 26 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 27 * SIZE(BO) MADD c11, c11, a4, b6 LD a2, 7 * SIZE(AO) MADD c21, c21, a4, b2 NOP MADD c31, c31, a4, b3 NOP MADD c41, c41, a4, b4 NOP MADD c12, c12, a2, b6 LD b6, 40 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 29 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 30 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 31 * SIZE(BO) MADD c51, c51, a4, b7 daddiu BO, BO, 32 * SIZE MADD c61, c61, a4, b2 daddiu AO, AO, 8 * SIZE MADD c71, c71, a4, b3 NOP MADD c81, c81, a4, b4 NOP MADD c52, c52, a2, b7 LD b7, 12 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 3 * SIZE(BO) .align 3 .L15: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif blez L, .L18 NOP .align 3 .L16: MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 NOP MADD c31, c31, a1, b3 NOP MADD c41, c41, a1, b4 NOP MADD c12, c12, a2, b1 LD b1, 8 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD c51, c51, a1, b5 daddiu L, L, -1 MADD c61, c61, a1, b2 daddiu AO, AO, 2 * SIZE MADD c71, c71, a1, b3 daddiu BO, BO, 8 * SIZE MADD c81, c81, a1, b4 LD a1, 0 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 4 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD c82, c82, a2, b4 bgtz L, .L16 LD b4, 3 * SIZE(BO) .L18: #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -2 #else daddiu TEMP, KK, -8 #endif dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 3 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) SUB c11, b1, c11 LD b5, 4 * SIZE(BO) SUB c21, b2, c21 LD b6, 5 * SIZE(BO) SUB c31, b3, c31 LD b7, 6 * SIZE(BO) SUB c41, b4, c41 LD b8, 7 * SIZE(BO) SUB c51, b5, c51 LD b1, 8 * SIZE(BO) SUB c61, b6, c61 LD b2, 9 
* SIZE(BO) SUB c71, b7, c71 LD b3, 10 * SIZE(BO) SUB c81, b8, c81 LD b4, 11 * SIZE(BO) SUB c12, b1, c12 LD b5, 12 * SIZE(BO) SUB c22, b2, c22 LD b6, 13 * SIZE(BO) SUB c32, b3, c32 LD b7, 14 * SIZE(BO) SUB c42, b4, c42 LD b8, 15 * SIZE(BO) SUB c52, b5, c52 #ifdef LN LD b1, 3 * SIZE(AO) #else LD b1, 0 * SIZE(AO) #endif SUB c62, b6, c62 SUB c72, b7, c72 SUB c82, b8, c82 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) SUB c11, b1, c11 LD b5, 4 * SIZE(AO) SUB c12, b2, c12 LD b6, 5 * SIZE(AO) SUB c21, b3, c21 LD b7, 6 * SIZE(AO) SUB c22, b4, c22 LD b8, 7 * SIZE(AO) SUB c31, b5, c31 LD b1, 8 * SIZE(AO) SUB c32, b6, c32 LD b2, 9 * SIZE(AO) SUB c41, b7, c41 LD b3, 10 * SIZE(AO) SUB c42, b8, c42 LD b4, 11 * SIZE(AO) LD b5, 12 * SIZE(AO) SUB c51, b1, c51 LD b6, 13 * SIZE(AO) SUB c52, b2, c52 LD b7, 14 * SIZE(AO) SUB c61, b3, c61 LD b8, 15 * SIZE(AO) SUB c62, b4, c62 SUB c71, b5, c71 SUB c72, b6, c72 SUB c81, b7, c81 SUB c82, b8, c82 #endif #ifdef LN MUL c12, b1, c12 LD b2, 2 * SIZE(AO) MUL c22, b1, c22 MUL c32, b1, c32 MUL c42, b1, c42 MUL c52, b1, c52 MUL c62, b1, c62 MUL c72, b1, c72 MUL c82, b1, c82 NMSUB c11, c11, b2, c12 LD b3, 0 * SIZE(AO) NMSUB c21, c21, b2, c22 NMSUB c31, c31, b2, c32 NMSUB c41, c41, b2, c42 NMSUB c51, c51, b2, c52 NMSUB c61, c61, b2, c62 NMSUB c71, c71, b2, c72 NMSUB c81, c81, b2, c82 MUL c11, b3, c11 daddiu CO1, CO1, -2 * SIZE MUL c21, b3, c21 daddiu CO2, CO2, -2 * SIZE MUL c31, b3, c31 daddiu CO3, CO3, -2 * SIZE MUL c41, b3, c41 daddiu CO4, CO4, -2 * SIZE MUL c51, b3, c51 daddiu CO5, CO5, -2 * SIZE MUL c61, b3, c61 daddiu CO6, CO6, -2 * SIZE MUL c71, b3, c71 daddiu CO7, CO7, -2 * SIZE MUL c81, b3, c81 daddiu CO8, CO8, -2 * SIZE #endif #ifdef LT MUL c11, b1, c11 LD b2, 1 * SIZE(AO) MUL c21, b1, c21 MUL c31, b1, c31 MUL c41, b1, c41 MUL c51, b1, c51 MUL c61, b1, c61 MUL c71, b1, c71 MUL c81, b1, c81 NMSUB c12, c12, b2, c11 LD b3, 3 * SIZE(AO) NMSUB c22, c22, b2, c21 NMSUB c32, c32, b2, c31 NMSUB c42, c42, b2, c41 NMSUB c52, c52, b2, c51 NMSUB c62, c62, b2, c61 NMSUB c72, c72, b2, c71 NMSUB c82, c82, b2, c81 MUL c12, b3, c12 MUL c22, b3, c22 MUL c32, b3, c32 MUL c42, b3, c42 MUL c52, b3, c52 MUL c62, b3, c62 MUL c72, b3, c72 MUL c82, b3, c82 #endif #ifdef RN LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MUL c11, b1, c11 MUL c12, b1, c12 LD b5, 4 * SIZE(BO) NMSUB c21, c21, b2, c11 NMSUB c22, c22, b2, c12 LD b6, 5 * SIZE(BO) NMSUB c31, c31, b3, c11 NMSUB c32, c32, b3, c12 LD b7, 6 * SIZE(BO) NMSUB c41, c41, b4, c11 NMSUB c42, c42, b4, c12 LD b8, 7 * SIZE(BO) NMSUB c51, c51, b5, c11 NMSUB c52, c52, b5, c12 LD b2, 9 * SIZE(BO) NMSUB c61, c61, b6, c11 NMSUB c62, c62, b6, c12 LD b3, 10 * SIZE(BO) NMSUB c71, c71, b7, c11 NMSUB c72, c72, b7, c12 LD b4, 11 * SIZE(BO) NMSUB c81, c81, b8, c11 NMSUB c82, c82, b8, c12 LD b5, 12 * SIZE(BO) MUL c21, b2, c21 MUL c22, b2, c22 LD b6, 13 * SIZE(BO) NMSUB c31, c31, b3, c21 NMSUB c32, c32, b3, c22 LD b7, 14 * SIZE(BO) NMSUB c41, c41, b4, c21 NMSUB c42, c42, b4, c22 LD b8, 15 * SIZE(BO) NMSUB c51, c51, b5, c21 NMSUB c52, c52, b5, c22 LD b3, 18 * SIZE(BO) NMSUB c61, c61, b6, c21 NMSUB c62, c62, b6, c22 LD b4, 19 * SIZE(BO) NMSUB c71, c71, b7, c21 NMSUB c72, c72, b7, c22 LD b5, 20 * SIZE(BO) NMSUB c81, c81, b8, c21 NMSUB c82, c82, b8, c22 LD b6, 21 * SIZE(BO) MUL c31, b3, c31 MUL c32, b3, c32 LD b7, 22 * SIZE(BO) NMSUB c41, c41, b4, c31 NMSUB c42, c42, b4, c32 LD b8, 23 * SIZE(BO) NMSUB c51, c51, b5, c31 NMSUB c52, c52, b5, c32 LD b4, 27 * SIZE(BO) NMSUB c61, c61, b6, c31 NMSUB 
c62, c62, b6, c32 LD b5, 28 * SIZE(BO) NMSUB c71, c71, b7, c31 NMSUB c72, c72, b7, c32 LD b6, 29 * SIZE(BO) NMSUB c81, c81, b8, c31 NMSUB c82, c82, b8, c32 LD b7, 30 * SIZE(BO) MUL c41, b4, c41 MUL c42, b4, c42 LD b8, 31 * SIZE(BO) NMSUB c51, c51, b5, c41 NMSUB c52, c52, b5, c42 LD b5, 36 * SIZE(BO) NMSUB c61, c61, b6, c41 NMSUB c62, c62, b6, c42 LD b6, 37 * SIZE(BO) NMSUB c71, c71, b7, c41 NMSUB c72, c72, b7, c42 LD b7, 38 * SIZE(BO) NMSUB c81, c81, b8, c41 NMSUB c82, c82, b8, c42 LD b8, 39 * SIZE(BO) MUL c51, b5, c51 MUL c52, b5, c52 NMSUB c61, c61, b6, c51 NMSUB c62, c62, b6, c52 LD b6, 45 * SIZE(BO) NMSUB c71, c71, b7, c51 NMSUB c72, c72, b7, c52 LD b7, 46 * SIZE(BO) NMSUB c81, c81, b8, c51 NMSUB c82, c82, b8, c52 LD b8, 47 * SIZE(BO) MUL c61, b6, c61 MUL c62, b6, c62 NMSUB c71, c71, b7, c61 NMSUB c72, c72, b7, c62 LD b7, 54 * SIZE(BO) NMSUB c81, c81, b8, c61 NMSUB c82, c82, b8, c62 LD b8, 55 * SIZE(BO) MUL c71, b7, c71 MUL c72, b7, c72 NMSUB c81, c81, b8, c71 NMSUB c82, c82, b8, c72 LD b8, 63 * SIZE(BO) MUL c81, b8, c81 MUL c82, b8, c82 #endif #ifdef RT LD b1, 63 * SIZE(BO) LD b2, 62 * SIZE(BO) LD b3, 61 * SIZE(BO) LD b4, 60 * SIZE(BO) MUL c81, b1, c81 MUL c82, b1, c82 LD b5, 59 * SIZE(BO) NMSUB c71, c71, b2, c81 NMSUB c72, c72, b2, c82 LD b6, 58 * SIZE(BO) NMSUB c61, c61, b3, c81 NMSUB c62, c62, b3, c82 LD b7, 57 * SIZE(BO) NMSUB c51, c51, b4, c81 NMSUB c52, c52, b4, c82 LD b8, 56 * SIZE(BO) NMSUB c41, c41, b5, c81 NMSUB c42, c42, b5, c82 LD b2, 54 * SIZE(BO) NMSUB c31, c31, b6, c81 NMSUB c32, c32, b6, c82 LD b3, 53 * SIZE(BO) NMSUB c21, c21, b7, c81 NMSUB c22, c22, b7, c82 LD b4, 52 * SIZE(BO) NMSUB c11, c11, b8, c81 NMSUB c12, c12, b8, c82 LD b5, 51 * SIZE(BO) MUL c71, b2, c71 MUL c72, b2, c72 LD b6, 50 * SIZE(BO) NMSUB c61, c61, b3, c71 NMSUB c62, c62, b3, c72 LD b7, 49 * SIZE(BO) NMSUB c51, c51, b4, c71 NMSUB c52, c52, b4, c72 LD b8, 48 * SIZE(BO) NMSUB c41, c41, b5, c71 NMSUB c42, c42, b5, c72 LD b3, 45 * SIZE(BO) NMSUB c31, c31, b6, c71 NMSUB c32, c32, b6, c72 LD b4, 44 * SIZE(BO) NMSUB c21, c21, b7, c71 NMSUB c22, c22, b7, c72 LD b5, 43 * SIZE(BO) NMSUB c11, c11, b8, c71 NMSUB c12, c12, b8, c72 LD b6, 42 * SIZE(BO) MUL c61, b3, c61 MUL c62, b3, c62 LD b7, 41 * SIZE(BO) NMSUB c51, c51, b4, c61 NMSUB c52, c52, b4, c62 LD b8, 40 * SIZE(BO) NMSUB c41, c41, b5, c61 NMSUB c42, c42, b5, c62 LD b4, 36 * SIZE(BO) NMSUB c31, c31, b6, c61 NMSUB c32, c32, b6, c62 LD b5, 35 * SIZE(BO) NMSUB c21, c21, b7, c61 NMSUB c22, c22, b7, c62 LD b6, 34 * SIZE(BO) NMSUB c11, c11, b8, c61 NMSUB c12, c12, b8, c62 LD b7, 33 * SIZE(BO) MUL c51, b4, c51 MUL c52, b4, c52 LD b8, 32 * SIZE(BO) NMSUB c41, c41, b5, c51 NMSUB c42, c42, b5, c52 LD b5, 27 * SIZE(BO) NMSUB c31, c31, b6, c51 NMSUB c32, c32, b6, c52 LD b6, 26 * SIZE(BO) NMSUB c21, c21, b7, c51 NMSUB c22, c22, b7, c52 LD b7, 25 * SIZE(BO) NMSUB c11, c11, b8, c51 NMSUB c12, c12, b8, c52 LD b8, 24 * SIZE(BO) MUL c41, b5, c41 MUL c42, b5, c42 NMSUB c31, c31, b6, c41 NMSUB c32, c32, b6, c42 LD b6, 18 * SIZE(BO) NMSUB c21, c21, b7, c41 NMSUB c22, c22, b7, c42 LD b7, 17 * SIZE(BO) NMSUB c11, c11, b8, c41 NMSUB c12, c12, b8, c42 LD b8, 16 * SIZE(BO) MUL c31, b6, c31 MUL c32, b6, c32 NMSUB c21, c21, b7, c31 NMSUB c22, c22, b7, c32 LD b7, 9 * SIZE(BO) NMSUB c11, c11, b8, c31 NMSUB c12, c12, b8, c32 LD b8, 8 * SIZE(BO) MUL c21, b7, c21 MUL c22, b7, c22 NMSUB c11, c11, b8, c21 NMSUB c12, c12, b8, c22 LD b8, 0 * SIZE(BO) MUL c11, b8, c11 MUL c12, b8, c12 #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c21, 1 * SIZE(BO) ST c31, 2 * SIZE(BO) ST c41, 3 
* SIZE(BO) ST c51, 4 * SIZE(BO) ST c61, 5 * SIZE(BO) ST c71, 6 * SIZE(BO) ST c81, 7 * SIZE(BO) ST c12, 8 * SIZE(BO) ST c22, 9 * SIZE(BO) ST c32, 10 * SIZE(BO) ST c42, 11 * SIZE(BO) ST c52, 12 * SIZE(BO) ST c62, 13 * SIZE(BO) ST c72, 14 * SIZE(BO) ST c82, 15 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c12, 1 * SIZE(AO) ST c21, 2 * SIZE(AO) ST c22, 3 * SIZE(AO) ST c31, 4 * SIZE(AO) ST c32, 5 * SIZE(AO) ST c41, 6 * SIZE(AO) ST c42, 7 * SIZE(AO) ST c51, 8 * SIZE(AO) ST c52, 9 * SIZE(AO) ST c61, 10 * SIZE(AO) ST c62, 11 * SIZE(AO) ST c71, 12 * SIZE(AO) ST c72, 13 * SIZE(AO) ST c81, 14 * SIZE(AO) ST c82, 15 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) ST c12, 1 * SIZE(CO1) ST c21, 0 * SIZE(CO2) ST c22, 1 * SIZE(CO2) ST c31, 0 * SIZE(CO3) ST c32, 1 * SIZE(CO3) ST c41, 0 * SIZE(CO4) ST c42, 1 * SIZE(CO4) ST c51, 0 * SIZE(CO5) ST c52, 1 * SIZE(CO5) ST c61, 0 * SIZE(CO6) ST c62, 1 * SIZE(CO6) ST c71, 0 * SIZE(CO7) ST c72, 1 * SIZE(CO7) ST c81, 0 * SIZE(CO8) ST c82, 1 * SIZE(CO8) MTC $0, a1 #ifndef LN daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE daddiu CO3, CO3, 2 * SIZE daddiu CO4, CO4, 2 * SIZE daddiu CO5, CO5, 2 * SIZE daddiu CO6, CO6, 2 * SIZE daddiu CO7, CO7, 2 * SIZE daddiu CO8, CO8, 2 * SIZE #endif MOV c11, a1 MOV c21, a1 #ifdef RT dsll TEMP, K, 1 + BASE_SHIFT daddu AORIG, AORIG, TEMP #endif MOV c31, a1 MOV c41, a1 #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 3 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 2 #endif #ifdef LN daddiu KK, KK, -2 #endif daddiu I, I, -1 MOV c51, a1 bgtz I, .L11 MOV c61, a1 .align 3 .L20: andi I, M, 1 MOV c61, c11 blez I, .L29 MOV c71, c11 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) LD b5, 4 * SIZE(B) LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) dsra L, KK, 2 MOV c81, c11 blez L, .L25 move BO, B #else #ifdef LN dsll TEMP, K, 0 + BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, 0 + BASE_SHIFT dsll TEMP, KK, 3 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) dsra L, TEMP, 2 MOV c81, c11 blez L, .L25 NOP #endif .align 3 .L22: MADD c11, c11, a1, b1 LD b1, 16 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a1, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a1, b4 LD b4, 7 * SIZE(BO) MADD c51, c51, a1, b5 LD b5, 20 * SIZE(BO) MADD c61, c61, a1, b2 LD b2, 9 * SIZE(BO) MADD c71, c71, a1, b3 LD b3, 10 * SIZE(BO) MADD c81, c81, a1, b4 LD b4, 11 * SIZE(BO) LD a1, 4 * SIZE(AO) daddiu L, L, -1 MADD c11, c11, a2, b6 LD b6, 24 * SIZE(BO) MADD c21, c21, a2, b2 LD b2, 13 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 14 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 15 * SIZE(BO) MADD c51, c51, a2, b7 LD b7, 28 * SIZE(BO) MADD c61, c61, a2, b2 LD b2, 17 * SIZE(BO) MADD c71, c71, a2, b3 LD b3, 18 * SIZE(BO) MADD c81, c81, a2, b4 LD b4, 19 * SIZE(BO) LD a2, 5 * SIZE(AO) daddiu AO, AO, 4 * SIZE MADD c11, c11, a3, b1 LD b1, 32 * SIZE(BO) MADD c21, c21, a3, b2 LD b2, 21 * SIZE(BO) MADD c31, c31, a3, b3 LD b3, 22 * SIZE(BO) MADD c41, c41, a3, b4 LD b4, 23 * SIZE(BO) MADD c51, c51, a3, b5 LD b5, 36 * SIZE(BO) MADD c61, c61, a3, b2 LD b2, 25 * SIZE(BO) MADD c71, c71, a3, b3 LD b3, 26 * SIZE(BO) MADD c81, c81, a3, b4 LD b4, 27 * SIZE(BO) 
LD a3, 2 * SIZE(AO) daddiu BO, BO, 32 * SIZE MADD c11, c11, a4, b6 LD b6, 8 * SIZE(BO) MADD c21, c21, a4, b2 LD b2, -3 * SIZE(BO) MADD c31, c31, a4, b3 LD b3, -2 * SIZE(BO) MADD c41, c41, a4, b4 LD b4, -1 * SIZE(BO) MADD c51, c51, a4, b7 LD b7, 12 * SIZE(BO) MADD c61, c61, a4, b2 LD b2, 1 * SIZE(BO) MADD c71, c71, a4, b3 LD b3, 2 * SIZE(BO) MADD c81, c81, a4, b4 LD b4, 3 * SIZE(BO) bgtz L, .L22 LD a4, 3 * SIZE(AO) .align 3 .L25: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L28 NOP .align 3 .L26: MADD c11, c11, a1, b1 LD b1, 8 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a1, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a1, b4 LD b4, 7 * SIZE(BO) daddiu L, L, -1 MOV a2, a2 daddiu AO, AO, 1 * SIZE daddiu BO, BO, 8 * SIZE MADD c51, c51, a1, b5 LD b5, 4 * SIZE(BO) MADD c61, c61, a1, b2 LD b2, 1 * SIZE(BO) MADD c71, c71, a1, b3 LD b3, 2 * SIZE(BO) MADD c81, c81, a1, b4 LD a1, 0 * SIZE(AO) bgtz L, .L26 LD b4, 3 * SIZE(BO) .L28: #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -1 #else daddiu TEMP, KK, -8 #endif dsll L, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 3 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) SUB c11, b1, c11 SUB c21, b2, c21 SUB c31, b3, c31 SUB c41, b4, c41 SUB c51, b5, c51 SUB c61, b6, c61 SUB c71, b7, c71 SUB c81, b8, c81 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) LD b5, 4 * SIZE(AO) LD b6, 5 * SIZE(AO) LD b7, 6 * SIZE(AO) LD b8, 7 * SIZE(AO) SUB c11, b1, c11 SUB c21, b2, c21 SUB c31, b3, c31 SUB c41, b4, c41 SUB c51, b5, c51 SUB c61, b6, c61 SUB c71, b7, c71 SUB c81, b8, c81 #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(AO) MUL c11, b1, c11 MUL c21, b1, c21 MUL c31, b1, c31 MUL c41, b1, c41 MUL c51, b1, c51 MUL c61, b1, c61 MUL c71, b1, c71 MUL c81, b1, c81 #endif #ifdef RN LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MUL c11, b1, c11 NMSUB c21, c21, b2, c11 NMSUB c31, c31, b3, c11 NMSUB c41, c41, b4, c11 NMSUB c51, c51, b5, c11 NMSUB c61, c61, b6, c11 NMSUB c71, c71, b7, c11 NMSUB c81, c81, b8, c11 LD b2, 9 * SIZE(BO) LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MUL c21, b2, c21 NMSUB c31, c31, b3, c21 NMSUB c41, c41, b4, c21 NMSUB c51, c51, b5, c21 NMSUB c61, c61, b6, c21 NMSUB c71, c71, b7, c21 NMSUB c81, c81, b8, c21 LD b3, 18 * SIZE(BO) LD b4, 19 * SIZE(BO) LD b5, 20 * SIZE(BO) LD b6, 21 * SIZE(BO) LD b7, 22 * SIZE(BO) LD b8, 23 * SIZE(BO) MUL c31, b3, c31 NMSUB c41, c41, b4, c31 NMSUB c51, c51, b5, c31 NMSUB c61, c61, b6, c31 NMSUB c71, c71, b7, c31 NMSUB c81, c81, b8, c31 LD b4, 27 * SIZE(BO) LD b5, 28 * SIZE(BO) LD b6, 29 * SIZE(BO) LD b7, 30 * SIZE(BO) LD b8, 31 * SIZE(BO) MUL c41, b4, c41 NMSUB c51, c51, b5, c41 NMSUB c61, c61, b6, c41 NMSUB c71, c71, b7, c41 NMSUB c81, c81, b8, c41 LD b5, 36 * SIZE(BO) LD b6, 37 * SIZE(BO) LD b7, 38 * SIZE(BO) LD b8, 39 * SIZE(BO) MUL c51, b5, c51 NMSUB c61, c61, b6, c51 NMSUB c71, c71, b7, c51 NMSUB c81, c81, b8, c51 LD b6, 45 * SIZE(BO) LD b7, 46 * SIZE(BO) LD b8, 47 * SIZE(BO) MUL c61, b6, c61 NMSUB c71, c71, b7, c61 NMSUB c81, c81, b8, c61 LD b7, 54 * SIZE(BO) LD b8, 55 * SIZE(BO) MUL c71, b7, c71 NMSUB c81, c81, b8, c71 LD 
b8, 63 * SIZE(BO) MUL c81, b8, c81 #endif #ifdef RT LD b1, 63 * SIZE(BO) LD b2, 62 * SIZE(BO) LD b3, 61 * SIZE(BO) LD b4, 60 * SIZE(BO) LD b5, 59 * SIZE(BO) LD b6, 58 * SIZE(BO) LD b7, 57 * SIZE(BO) LD b8, 56 * SIZE(BO) MUL c81, b1, c81 NMSUB c71, c71, b2, c81 NMSUB c61, c61, b3, c81 NMSUB c51, c51, b4, c81 NMSUB c41, c41, b5, c81 NMSUB c31, c31, b6, c81 NMSUB c21, c21, b7, c81 NMSUB c11, c11, b8, c81 LD b2, 54 * SIZE(BO) LD b3, 53 * SIZE(BO) LD b4, 52 * SIZE(BO) LD b5, 51 * SIZE(BO) LD b6, 50 * SIZE(BO) LD b7, 49 * SIZE(BO) LD b8, 48 * SIZE(BO) MUL c71, b2, c71 NMSUB c61, c61, b3, c71 NMSUB c51, c51, b4, c71 NMSUB c41, c41, b5, c71 NMSUB c31, c31, b6, c71 NMSUB c21, c21, b7, c71 NMSUB c11, c11, b8, c71 LD b3, 45 * SIZE(BO) LD b4, 44 * SIZE(BO) LD b5, 43 * SIZE(BO) LD b6, 42 * SIZE(BO) LD b7, 41 * SIZE(BO) LD b8, 40 * SIZE(BO) MUL c61, b3, c61 NMSUB c51, c51, b4, c61 NMSUB c41, c41, b5, c61 NMSUB c31, c31, b6, c61 NMSUB c21, c21, b7, c61 NMSUB c11, c11, b8, c61 LD b4, 36 * SIZE(BO) LD b5, 35 * SIZE(BO) LD b6, 34 * SIZE(BO) LD b7, 33 * SIZE(BO) LD b8, 32 * SIZE(BO) MUL c51, b4, c51 NMSUB c41, c41, b5, c51 NMSUB c31, c31, b6, c51 NMSUB c21, c21, b7, c51 NMSUB c11, c11, b8, c51 LD b5, 27 * SIZE(BO) LD b6, 26 * SIZE(BO) LD b7, 25 * SIZE(BO) LD b8, 24 * SIZE(BO) MUL c41, b5, c41 NMSUB c31, c31, b6, c41 NMSUB c21, c21, b7, c41 NMSUB c11, c11, b8, c41 LD b6, 18 * SIZE(BO) LD b7, 17 * SIZE(BO) LD b8, 16 * SIZE(BO) MUL c31, b6, c31 NMSUB c21, c21, b7, c31 NMSUB c11, c11, b8, c31 LD b7, 9 * SIZE(BO) LD b8, 8 * SIZE(BO) MUL c21, b7, c21 NMSUB c11, c11, b8, c21 LD b8, 0 * SIZE(BO) MUL c11, b8, c11 #endif #ifdef LN daddiu CO1, CO1, -1 * SIZE daddiu CO2, CO2, -1 * SIZE daddiu CO3, CO3, -1 * SIZE daddiu CO4, CO4, -1 * SIZE daddiu CO5, CO5, -1 * SIZE daddiu CO6, CO6, -1 * SIZE daddiu CO7, CO7, -1 * SIZE daddiu CO8, CO8, -1 * SIZE #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c21, 1 * SIZE(BO) ST c31, 2 * SIZE(BO) ST c41, 3 * SIZE(BO) ST c51, 4 * SIZE(BO) ST c61, 5 * SIZE(BO) ST c71, 6 * SIZE(BO) ST c81, 7 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c21, 1 * SIZE(AO) ST c31, 2 * SIZE(AO) ST c41, 3 * SIZE(AO) ST c51, 4 * SIZE(AO) ST c61, 5 * SIZE(AO) ST c71, 6 * SIZE(AO) ST c81, 7 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) ST c21, 0 * SIZE(CO2) ST c31, 0 * SIZE(CO3) ST c41, 0 * SIZE(CO4) ST c51, 0 * SIZE(CO5) ST c61, 0 * SIZE(CO6) ST c71, 0 * SIZE(CO7) ST c81, 0 * SIZE(CO8) #ifndef LN daddiu CO1, CO1, 1 * SIZE daddiu CO2, CO2, 1 * SIZE daddiu CO3, CO3, 1 * SIZE daddiu CO4, CO4, 1 * SIZE daddiu CO5, CO5, 1 * SIZE daddiu CO6, CO6, 1 * SIZE daddiu CO7, CO7, 1 * SIZE daddiu CO8, CO8, 1 * SIZE #endif #ifdef RT dsll TEMP, K, BASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 3 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 1 #endif #ifdef LN daddiu KK, KK, -1 #endif .align 3 .L29: #ifdef LN dsll TEMP, K, 3 + BASE_SHIFT daddu B, B, TEMP #endif #if defined(LT) || defined(RN) move B, BO #endif #ifdef RN daddiu KK, KK, 8 #endif #ifdef RT daddiu KK, KK, -8 #endif bgtz J, .L10 NOP .align 3 .L30: andi J, N, 4 blez J, .L50 move AO, A #ifdef RT dsll TEMP, K, 2 + BASE_SHIFT dsubu B, B, TEMP dsll TEMP, LDC, 2 dsubu C, C, TEMP #endif move CO1, C MTC $0, c11 daddu CO2, C, LDC daddu CO3, CO2, LDC daddu CO4, CO3, LDC MOV c21, c11 dsra I, M, 1 MOV c31, c11 #ifdef LN daddu KK, M, OFFSET #endif #ifdef LT move KK, OFFSET #endif #if defined(LN) || defined(RT) move AORIG, A #else move AO, A #endif #ifndef 
RT daddu C, CO4, LDC #endif blez I, .L40 MOV c41, c11 .L31: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) LD a3, 4 * SIZE(AO) LD b1, 0 * SIZE(B) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 LD b3, 2 * SIZE(B) MOV c32, c11 LD b4, 3 * SIZE(B) MOV c42, c11 LD b5, 4 * SIZE(B) dsra L, KK, 2 LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) blez L, .L35 move BO, B #else #ifdef LN dsll TEMP, K, 1 + BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 2 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) LD a3, 4 * SIZE(AO) LD b1, 0 * SIZE(BO) MOV c12, c11 LD b2, 1 * SIZE(BO) MOV c22, c11 LD b3, 2 * SIZE(BO) MOV c32, c11 LD b4, 3 * SIZE(BO) MOV c42, c11 LD b5, 4 * SIZE(BO) dsra L, TEMP, 2 LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) blez L, .L35 NOP #endif .align 3 .L32: MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 daddiu L, L, -1 MADD c31, c31, a1, b3 NOP MADD c41, c41, a1, b4 LD a1, 2 * SIZE(AO) MADD c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD c11, c11, a1, b5 LD a2, 3 * SIZE(AO) MADD c21, c21, a1, b2 NOP MADD c31, c31, a1, b3 NOP MADD c41, c41, a1, b4 LD a1, 8 * SIZE(AO) MADD c12, c12, a2, b5 LD b5, 20 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 9 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 10 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 11 * SIZE(BO) MADD c11, c11, a3, b6 LD a2, 5 * SIZE(AO) MADD c21, c21, a3, b2 NOP MADD c31, c31, a3, b3 NOP MADD c41, c41, a3, b4 LD a3, 6 * SIZE(AO) MADD c12, c12, a2, b6 LD b6, 24 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD c11, c11, a3, b7 LD a2, 7 * SIZE(AO) MADD c21, c21, a3, b2 daddiu AO, AO, 8 * SIZE MADD c31, c31, a3, b3 daddiu BO, BO, 16 * SIZE MADD c41, c41, a3, b4 LD a3, 4 * SIZE(AO) MADD c12, c12, a2, b7 LD b7, 12 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 1 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 2 * SIZE(BO) MADD c42, c42, a2, b4 NOP bgtz L, .L32 LD b4, 3 * SIZE(BO) .align 3 .L35: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L38 NOP .align 3 .L36: MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 daddiu L, L, -1 MADD c31, c31, a1, b3 daddiu AO, AO, 2 * SIZE MADD c41, c41, a1, b4 LD a1, 0 * SIZE(AO) MADD c12, c12, a2, b1 LD b1, 4 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) bgtz L, .L36 daddiu BO, BO, 4 * SIZE .L38: #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -2 #else daddiu TEMP, KK, -4 #endif dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) SUB c11, b1, c11 SUB c21, b2, c21 SUB c31, b3, c31 SUB c41, b4, c41 SUB c12, b5, c12 SUB c22, b6, c22 SUB c32, b7, c32 SUB c42, b8, c42 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) LD b5, 4 * SIZE(AO) LD b6, 5 * SIZE(AO) LD b7, 6 * SIZE(AO) LD b8, 7 * SIZE(AO) SUB c11, b1, c11 SUB c12, b2, c12 SUB c21, b3, c21 SUB c22, b4, c22 SUB c31, b5, c31 SUB c32, b6, c32 SUB c41, b7, c41 SUB c42, b8, c42 #endif #ifdef LN LD b1, 3 * SIZE(AO) LD b2, 2 * SIZE(AO) LD b3, 0 * SIZE(AO) MUL c12, b1, c12 
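	# (LN path, 2x2 diagonal block of packed A, continued) row 1 of the tile is
	# scaled by the stored diagonal element b1 (the pack routines are assumed to
	# store the diagonal pre-inverted, so MUL stands in for a divide), eliminated
	# from row 0 through the off-diagonal element b2, and row 0 is then scaled by b3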
MUL c22, b1, c22 MUL c32, b1, c32 MUL c42, b1, c42 NMSUB c11, c11, b2, c12 NMSUB c21, c21, b2, c22 NMSUB c31, c31, b2, c32 NMSUB c41, c41, b2, c42 MUL c11, b3, c11 MUL c21, b3, c21 MUL c31, b3, c31 MUL c41, b3, c41 #endif #ifdef LT LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 3 * SIZE(AO) MUL c11, b1, c11 MUL c21, b1, c21 MUL c31, b1, c31 MUL c41, b1, c41 NMSUB c12, c12, b2, c11 NMSUB c22, c22, b2, c21 NMSUB c32, c32, b2, c31 NMSUB c42, c42, b2, c41 MUL c12, b3, c12 MUL c22, b3, c22 MUL c32, b3, c32 MUL c42, b3, c42 #endif #ifdef RN LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MUL c11, b1, c11 MUL c12, b1, c12 NMSUB c21, c21, b2, c11 NMSUB c22, c22, b2, c12 NMSUB c31, c31, b3, c11 NMSUB c32, c32, b3, c12 NMSUB c41, c41, b4, c11 NMSUB c42, c42, b4, c12 LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) MUL c21, b2, c21 MUL c22, b2, c22 NMSUB c31, c31, b3, c21 NMSUB c32, c32, b3, c22 NMSUB c41, c41, b4, c21 NMSUB c42, c42, b4, c22 LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MUL c31, b3, c31 MUL c32, b3, c32 NMSUB c41, c41, b4, c31 NMSUB c42, c42, b4, c32 LD b4, 15 * SIZE(BO) MUL c41, b4, c41 MUL c42, b4, c42 #endif #ifdef RT LD b5, 15 * SIZE(BO) LD b6, 14 * SIZE(BO) LD b7, 13 * SIZE(BO) LD b8, 12 * SIZE(BO) MUL c41, b5, c41 MUL c42, b5, c42 NMSUB c31, c31, b6, c41 NMSUB c32, c32, b6, c42 NMSUB c21, c21, b7, c41 NMSUB c22, c22, b7, c42 NMSUB c11, c11, b8, c41 NMSUB c12, c12, b8, c42 LD b6, 10 * SIZE(BO) LD b7, 9 * SIZE(BO) LD b8, 8 * SIZE(BO) MUL c31, b6, c31 MUL c32, b6, c32 NMSUB c21, c21, b7, c31 NMSUB c22, c22, b7, c32 NMSUB c11, c11, b8, c31 NMSUB c12, c12, b8, c32 LD b7, 5 * SIZE(BO) LD b8, 4 * SIZE(BO) MUL c21, b7, c21 MUL c22, b7, c22 NMSUB c11, c11, b8, c21 NMSUB c12, c12, b8, c22 LD b8, 0 * SIZE(BO) MUL c11, b8, c11 MUL c12, b8, c12 #endif #ifdef LN daddiu CO1, CO1, -2 * SIZE daddiu CO2, CO2, -2 * SIZE daddiu CO3, CO3, -2 * SIZE daddiu CO4, CO4, -2 * SIZE #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c21, 1 * SIZE(BO) ST c31, 2 * SIZE(BO) ST c41, 3 * SIZE(BO) ST c12, 4 * SIZE(BO) ST c22, 5 * SIZE(BO) ST c32, 6 * SIZE(BO) ST c42, 7 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c12, 1 * SIZE(AO) ST c21, 2 * SIZE(AO) ST c22, 3 * SIZE(AO) ST c31, 4 * SIZE(AO) ST c32, 5 * SIZE(AO) ST c41, 6 * SIZE(AO) ST c42, 7 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) ST c12, 1 * SIZE(CO1) ST c21, 0 * SIZE(CO2) ST c22, 1 * SIZE(CO2) ST c31, 0 * SIZE(CO3) ST c32, 1 * SIZE(CO3) ST c41, 0 * SIZE(CO4) ST c42, 1 * SIZE(CO4) #ifndef LN daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE daddiu CO3, CO3, 2 * SIZE daddiu CO4, CO4, 2 * SIZE #endif #ifdef RT dsll TEMP, K, 1 + BASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 2 #endif #ifdef LN daddiu KK, KK, -2 #endif MTC $0, a1 MOV c11, a1 MOV c21, a1 MOV c31, a1 daddiu I, I, -1 bgtz I, .L31 MOV c41, c11 .align 3 .L40: andi I, M, 1 blez I, .L49 MOV c61, c11 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) MOV c71, c11 LD a2, 1 * SIZE(AO) MOV c81, c11 LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) LD b5, 4 * SIZE(B) LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) dsra L, KK, 2 blez L, .L45 move BO, B #else #ifdef LN dsll TEMP, K, BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, 0 + BASE_SHIFT dsll TEMP, KK, 2 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) MOV c71, c11 LD a2, 1 
* SIZE(AO) MOV c81, c11 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) dsra L, TEMP, 2 blez L, .L45 NOP #endif .align 3 .L42: MADD c11, c11, a1, b1 LD b1, 16 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a1, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a1, b4 LD b4, 7 * SIZE(BO) LD a1, 4 * SIZE(AO) daddiu L, L, -1 MADD c11, c11, a2, b5 LD b5, 20 * SIZE(BO) MADD c21, c21, a2, b2 LD b2, 9 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 10 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 11 * SIZE(BO) LD a2, 2 * SIZE(AO) daddiu AO, AO, 4 * SIZE MADD c11, c11, a2, b6 LD b6, 24 * SIZE(BO) MADD c21, c21, a2, b2 LD b2, 13 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 14 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 15 * SIZE(BO) LD a2, -1 * SIZE(AO) daddiu BO, BO, 16 * SIZE MADD c11, c11, a2, b7 LD b7, 12 * SIZE(BO) MADD c21, c21, a2, b2 LD b2, 1 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 2 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 3 * SIZE(BO) bgtz L, .L42 LD a2, 1 * SIZE(AO) .align 3 .L45: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L48 NOP .align 3 .L46: MADD c11, c11, a1, b1 LD b1, 4 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a1, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a1, b4 LD a1, 1 * SIZE(AO) LD b4, 7 * SIZE(BO) daddiu L, L, -1 daddiu AO, AO, 1 * SIZE MOV a2, a2 bgtz L, .L46 daddiu BO, BO, 4 * SIZE .L48: #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -1 #else daddiu TEMP, KK, -4 #endif dsll L, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) SUB c11, b1, c11 SUB c21, b2, c21 SUB c31, b3, c31 SUB c41, b4, c41 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) SUB c11, b1, c11 SUB c21, b2, c21 SUB c31, b3, c31 SUB c41, b4, c41 #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(AO) MUL c11, b1, c11 MUL c21, b1, c21 MUL c31, b1, c31 MUL c41, b1, c41 #endif #ifdef RN LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MUL c11, b1, c11 NMSUB c21, c21, b2, c11 NMSUB c31, c31, b3, c11 NMSUB c41, c41, b4, c11 LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) MUL c21, b2, c21 NMSUB c31, c31, b3, c21 NMSUB c41, c41, b4, c21 LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MUL c31, b3, c31 NMSUB c41, c41, b4, c31 LD b4, 15 * SIZE(BO) MUL c41, b4, c41 #endif #ifdef RT LD b5, 15 * SIZE(BO) LD b6, 14 * SIZE(BO) LD b7, 13 * SIZE(BO) LD b8, 12 * SIZE(BO) MUL c41, b5, c41 NMSUB c31, c31, b6, c41 NMSUB c21, c21, b7, c41 NMSUB c11, c11, b8, c41 LD b6, 10 * SIZE(BO) LD b7, 9 * SIZE(BO) LD b8, 8 * SIZE(BO) MUL c31, b6, c31 NMSUB c21, c21, b7, c31 NMSUB c11, c11, b8, c31 LD b7, 5 * SIZE(BO) LD b8, 4 * SIZE(BO) MUL c21, b7, c21 NMSUB c11, c11, b8, c21 LD b8, 0 * SIZE(BO) MUL c11, b8, c11 #endif #ifdef LN daddiu CO1, CO1, -1 * SIZE daddiu CO2, CO2, -1 * SIZE daddiu CO3, CO3, -1 * SIZE daddiu CO4, CO4, -1 * SIZE #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c21, 1 * SIZE(BO) ST c31, 2 * SIZE(BO) ST c41, 3 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c21, 1 * SIZE(AO) ST c31, 2 * SIZE(AO) ST c41, 3 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) ST c21, 0 * SIZE(CO2) ST c31, 0 * SIZE(CO3) ST c41, 0 * SIZE(CO4) #ifndef LN daddiu CO1, CO1, 1 * SIZE daddiu CO2, CO2, 1 * SIZE daddiu CO3, CO3, 1 * SIZE daddiu CO4, CO4, 1 * SIZE #endif #ifdef RT 
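	# RT path: AORIG advances by K elements, i.e. past this 1 x K panel of packed A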
dsll TEMP, K, BASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 1 #endif #ifdef LN daddiu KK, KK, -1 #endif .align 3 .L49: #ifdef LN dsll TEMP, K, 2 + BASE_SHIFT daddu B, B, TEMP #endif #if defined(LT) || defined(RN) move B, BO #endif #ifdef RN daddiu KK, KK, 4 #endif #ifdef RT daddiu KK, KK, -4 #endif .align 3 .L50: andi J, N, 2 blez J, .L70 #ifdef RT dsll TEMP, K, 1 + BASE_SHIFT dsubu B, B, TEMP dsll TEMP, LDC, 1 dsubu C, C, TEMP #endif move AO, A move CO1, C daddu CO2, C, LDC #ifdef LN daddu KK, M, OFFSET #endif #ifdef LT move KK, OFFSET #endif #if defined(LN) || defined(RT) move AORIG, A #else move AO, A #endif #ifndef RT daddu C, CO2, LDC #endif dsra I, M, 1 blez I, .L60 NOP .L51: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a5, 4 * SIZE(AO) LD b1, 0 * SIZE(B) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 LD b3, 2 * SIZE(B) LD b5, 4 * SIZE(B) dsra L, KK, 2 LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) blez L, .L55 move BO, B #else #ifdef LN dsll TEMP, K, 1 + BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 1 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a5, 4 * SIZE(AO) LD b1, 0 * SIZE(BO) MOV c12, c11 LD b2, 1 * SIZE(BO) MOV c22, c11 LD b3, 2 * SIZE(BO) LD b5, 4 * SIZE(BO) dsra L, TEMP, 2 LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) blez L, .L55 NOP #endif .align 3 .L52: MADD c11, c11, a1, b1 LD a3, 2 * SIZE(AO) MADD c21, c21, a1, b2 LD b4, 3 * SIZE(BO) MADD c12, c12, a2, b1 LD a4, 3 * SIZE(AO) MADD c22, c22, a2, b2 LD b1, 8 * SIZE(BO) MADD c11, c11, a3, b3 LD a1, 8 * SIZE(AO) MADD c21, c21, a3, b4 LD b2, 5 * SIZE(BO) MADD c12, c12, a4, b3 LD a2, 5 * SIZE(AO) MADD c22, c22, a4, b4 LD b3, 6 * SIZE(BO) MADD c11, c11, a5, b5 LD a3, 6 * SIZE(AO) MADD c21, c21, a5, b2 LD b4, 7 * SIZE(BO) MADD c12, c12, a2, b5 LD a4, 7 * SIZE(AO) MADD c22, c22, a2, b2 LD b5, 12 * SIZE(BO) MADD c11, c11, a3, b3 LD a5, 12 * SIZE(AO) MADD c21, c21, a3, b4 LD b2, 9 * SIZE(BO) MADD c12, c12, a4, b3 LD a2, 9 * SIZE(AO) MADD c22, c22, a4, b4 LD b3, 10 * SIZE(BO) daddiu AO, AO, 8 * SIZE daddiu L, L, -1 bgtz L, .L52 daddiu BO, BO, 8 * SIZE .align 3 .L55: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L58 NOP .align 3 .L56: MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 LD a1, 2 * SIZE(AO) MADD c12, c12, a2, b1 LD b1, 2 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 3 * SIZE(BO) daddiu L, L, -1 daddiu AO, AO, 2 * SIZE bgtz L, .L56 daddiu BO, BO, 2 * SIZE .L58: #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -2 #else daddiu TEMP, KK, -2 #endif dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) SUB c11, b1, c11 SUB c21, b2, c21 SUB c12, b3, c12 SUB c22, b4, c22 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) SUB c11, b1, c11 SUB c12, b2, c12 SUB c21, b3, c21 SUB c22, b4, c22 #endif #ifdef LN LD b1, 3 * SIZE(AO) LD b2, 2 * SIZE(AO) LD b3, 0 * SIZE(AO) MUL c12, b1, c12 MUL c22, b1, c22 NMSUB c11, c11, b2, c12 NMSUB c21, c21, b2, c22 MUL c11, b3, c11 MUL c21, b3, c21 #endif #ifdef LT LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 3 
* SIZE(AO) MUL c11, b1, c11 MUL c21, b1, c21 NMSUB c12, c12, b2, c11 NMSUB c22, c22, b2, c21 MUL c12, b3, c12 MUL c22, b3, c22 #endif #ifdef RN LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 3 * SIZE(BO) MUL c11, b1, c11 MUL c12, b1, c12 NMSUB c21, c21, b2, c11 NMSUB c22, c22, b2, c12 MUL c21, b3, c21 MUL c22, b3, c22 #endif #ifdef RT LD b1, 3 * SIZE(BO) LD b2, 2 * SIZE(BO) LD b3, 0 * SIZE(BO) MUL c21, b1, c21 MUL c22, b1, c22 NMSUB c11, c11, b2, c21 NMSUB c12, c12, b2, c22 MUL c11, b3, c11 MUL c12, b3, c12 #endif #ifdef LN daddiu CO1, CO1, -2 * SIZE daddiu CO2, CO2, -2 * SIZE #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c21, 1 * SIZE(BO) ST c12, 2 * SIZE(BO) ST c22, 3 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c12, 1 * SIZE(AO) ST c21, 2 * SIZE(AO) ST c22, 3 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) ST c12, 1 * SIZE(CO1) ST c21, 0 * SIZE(CO2) ST c22, 1 * SIZE(CO2) #ifndef LN daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE #endif #ifdef RT dsll TEMP, K, 1 + BASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AO, TEMP daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 2 #endif #ifdef LN daddiu KK, KK, -2 #endif MTC $0, a1 MOV c11, a1 MOV c21, a1 MOV c31, a1 daddiu I, I, -1 bgtz I, .L51 MOV c41, c11 .align 3 .L60: andi I, M, 1 blez I, .L69 NOP #if defined(LT) || defined(RN) dsra L, KK, 2 LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a3, 2 * SIZE(AO) MOV c31, c11 LD a4, 3 * SIZE(AO) MOV c41, c11 LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) LD b5, 4 * SIZE(B) LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) blez L, .L65 move BO, B #else #ifdef LN dsll TEMP, K, BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, 0 + BASE_SHIFT dsll TEMP, KK, 1 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK dsra L, TEMP, 2 LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a3, 2 * SIZE(AO) MOV c31, c11 LD a4, 3 * SIZE(AO) MOV c41, c11 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) blez L, .L65 NOP #endif .align 3 .L62: MADD c11, c11, a1, b1 LD b1, 4 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 7 * SIZE(BO) LD a1, 4 * SIZE(AO) LD a2, 5 * SIZE(AO) MADD c11, c11, a3, b1 LD b1, 8 * SIZE(BO) MADD c21, c21, a3, b2 LD b2, 9 * SIZE(BO) MADD c31, c31, a4, b3 LD b3, 10 * SIZE(BO) MADD c41, c41, a4, b4 LD b4, 11 * SIZE(BO) LD a3, 6 * SIZE(AO) LD a4, 7 * SIZE(AO) daddiu L, L, -1 daddiu AO, AO, 4 * SIZE bgtz L, .L62 daddiu BO, BO, 8 * SIZE .align 3 .L65: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L68 NOP .align 3 .L66: MADD c11, c11, a1, b1 LD b1, 2 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 3 * SIZE(BO) LD a1, 1 * SIZE(AO) daddiu L, L, -1 daddiu AO, AO, 1 * SIZE bgtz L, .L66 daddiu BO, BO, 2 * SIZE .L68: ADD c11, c11, c31 ADD c21, c21, c41 #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -1 #else daddiu TEMP, KK, -2 #endif dsll L, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) SUB c11, b1, c11 SUB c21, b2, c21 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) SUB c11, b1, c11 SUB c21, b2, c21 #endif #if defined(LN) || defined(LT) LD b3, 0 * SIZE(AO) MUL c11, b3, c11 MUL c21, b3, c21 #endif #ifdef RN LD b1, 0 * 
SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 3 * SIZE(BO) MUL c11, b1, c11 NMSUB c21, c21, b2, c11 MUL c21, b3, c21 #endif #ifdef RT LD b1, 3 * SIZE(BO) LD b2, 2 * SIZE(BO) LD b3, 0 * SIZE(BO) MUL c21, b1, c21 NMSUB c11, c11, b2, c21 MUL c11, b3, c11 #endif #ifdef LN daddiu CO1, CO1, -1 * SIZE daddiu CO2, CO2, -1 * SIZE #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c21, 1 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c21, 1 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) ST c21, 0 * SIZE(CO2) #ifndef LN daddiu CO1, CO1, 1 * SIZE daddiu CO2, CO2, 1 * SIZE #endif #ifdef RT dsll TEMP, K, 0 + BASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 1 #endif #ifdef LN daddiu KK, KK, -1 #endif .align 3 .L69: #ifdef LN dsll TEMP, K, 1 + BASE_SHIFT daddu B, B, TEMP #endif #if defined(LT) || defined(RN) move B, BO #endif #ifdef RN daddiu KK, KK, 2 #endif #ifdef RT daddiu KK, KK, -2 #endif .align 3 .L70: andi J, N, 1 blez J, .L999 NOP #ifdef RT dsll TEMP, K, BASE_SHIFT dsubu B, B, TEMP dsubu C, C, LDC #endif move AO, A move CO1, C #ifdef LN daddu KK, M, OFFSET #endif #ifdef LT move KK, OFFSET #endif #if defined(LN) || defined(RT) move AORIG, A #else move AO, A #endif #ifndef RT daddu C, CO1, LDC #endif dsra I, M, 1 blez I, .L80 NOP .L71: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a5, 4 * SIZE(AO) LD b1, 0 * SIZE(B) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 LD b3, 2 * SIZE(B) LD b5, 4 * SIZE(B) dsra L, KK, 2 LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) blez L, .L75 move BO, B #else #ifdef LN dsll TEMP, K, 1 + BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 0 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a5, 4 * SIZE(AO) LD b1, 0 * SIZE(BO) MOV c12, c11 LD b2, 1 * SIZE(BO) MOV c22, c11 LD b3, 2 * SIZE(BO) LD b5, 4 * SIZE(BO) dsra L, TEMP, 2 LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) blez L, .L75 NOP #endif .align 3 .L72: LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 LD a1, 2 * SIZE(AO) LD a2, 3 * SIZE(AO) LD b1, 1 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 LD a1, 4 * SIZE(AO) LD a2, 5 * SIZE(AO) LD b1, 2 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 LD a1, 6 * SIZE(AO) LD a2, 7 * SIZE(AO) LD b1, 3 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 daddiu L, L, -1 daddiu AO, AO, 8 * SIZE bgtz L, .L72 daddiu BO, BO, 4 * SIZE .align 3 .L75: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L78 NOP .align 3 .L76: LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 daddiu L, L, -1 daddiu AO, AO, 2 * SIZE bgtz L, .L76 daddiu BO, BO, 1 * SIZE .L78: ADD c11, c11, c21 ADD c12, c12, c22 #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -2 #else daddiu TEMP, KK, -1 #endif dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 0 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) SUB c11, b1, c11 SUB c12, b2, c12 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) SUB c11, b1, c11 SUB c12, b2, c12 #endif #ifdef LN LD b1, 3 * SIZE(AO) LD b2, 2 * SIZE(AO) LD b3, 0 * SIZE(AO) MUL c12, b1, c12 NMSUB c11, c11, b2, c12 MUL c11, b3, c11 #endif #ifdef LT 
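	# LT path: the 2x1 tile is solved top-down. As a scalar sketch (assuming the
	# pack routine stored the diagonal entries of A pre-inverted, and writing
	# a_off for the packed off-diagonal entry at AO[1]):
	#     c11 = c11 * inv_a00
	#     c12 = (c12 - a_off * c11) * inv_a11
	# which is what the MUL / NMSUB / MUL sequence below computes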
LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 3 * SIZE(AO) MUL c11, b1, c11 NMSUB c12, c12, b2, c11 MUL c12, b3, c12 #endif #if defined(RN) || defined(RT) LD b1, 0 * SIZE(BO) MUL c11, b1, c11 MUL c12, b1, c12 #endif #ifdef LN daddiu CO1, CO1, -2 * SIZE #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c12, 1 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c12, 1 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) ST c12, 1 * SIZE(CO1) #ifndef LN daddiu CO1, CO1, 2 * SIZE #endif #ifdef RT dsll TEMP, K, 1 + BASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 0 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 2 #endif #ifdef LN daddiu KK, KK, -2 #endif daddiu I, I, -1 bgtz I, .L71 NOP .align 3 .L80: andi I, M, 1 blez I, .L89 NOP #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) LD b5, 4 * SIZE(B) LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) dsra L, KK, 2 blez L, .L85 move BO, B #else #ifdef LN dsll TEMP, K, BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll TEMP, KK, BASE_SHIFT daddu AO, AORIG, TEMP daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) dsra L, TEMP, 2 blez L, .L85 NOP #endif .align 3 .L82: LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD c11, c11, a1, b1 LD a1, 1 * SIZE(AO) LD b1, 1 * SIZE(BO) MADD c21, c21, a1, b1 LD a1, 2 * SIZE(AO) LD b1, 2 * SIZE(BO) MADD c11, c11, a1, b1 LD a1, 3 * SIZE(AO) LD b1, 3 * SIZE(BO) MADD c21, c21, a1, b1 daddiu L, L, -1 daddiu AO, AO, 4 * SIZE bgtz L, .L82 daddiu BO, BO, 4 * SIZE .align 3 .L85: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L88 NOP .align 3 .L86: LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD c11, c11, a1, b1 daddiu L, L, -1 daddiu AO, AO, 1 * SIZE bgtz L, .L86 daddiu BO, BO, 1 * SIZE .L88: ADD c11, c11, c21 #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -1 #else daddiu TEMP, KK, -1 #endif dsll TEMP, TEMP, 0 + BASE_SHIFT daddu AO, AORIG, TEMP daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) SUB c11, b1, c11 #else LD b1, 0 * SIZE(AO) SUB c11, b1, c11 #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(AO) MUL c11, b1, c11 #endif #if defined(RN) || defined(RT) LD b1, 0 * SIZE(BO) MUL c11, b1, c11 #endif #ifdef LN daddiu CO1, CO1, -1 * SIZE #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) #else ST c11, 0 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) #ifndef LN daddiu CO1, CO1, 1 * SIZE #endif #ifdef RT dsll TEMP, K, BASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll TEMP, TEMP, 0 + BASE_SHIFT daddu AO, AO, TEMP daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 1 #endif #ifdef LN daddiu KK, KK, -1 #endif .align 3 .L89: #ifdef LN dsll TEMP, K, BASE_SHIFT daddu B, B, TEMP #endif #if defined(LT) || defined(RN) move B, BO #endif #ifdef RN daddiu KK, KK, 1 #endif #ifdef RT daddiu KK, KK, -1 #endif .align 3 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) LDARG $18, 16($sp) LDARG $19, 24($sp) LDARG $20, 32($sp) LDARG $21, 40($sp) ldc1 $f24, 48($sp) ldc1 $f25, 56($sp) ldc1 $f26, 64($sp) ldc1 $f27, 72($sp) ldc1 $f28, 80($sp) LDARG $22, 
88($sp) LDARG $23, 96($sp) LDARG $24, 104($sp) LDARG $25, 112($sp) #ifndef __64BIT__ ldc1 $f20,112($sp) ldc1 $f21,120($sp) ldc1 $f22,128($sp) ldc1 $f23,136($sp) #endif j $31 daddiu $sp, $sp, 144 EPILOGUE OpenBLAS-0.2.20/kernel/mips64/trsm_kernel_LT_loongson3a.S000066400000000000000000000752271313527062700227750ustar00rootroot00000000000000#define REALNAME ASMNAME #define ASSEMBLER #include "common.h" #define M $4 #define N $5 #define K $6 #define A $8 #define B $9 #define C $10 #define LDC $11 #define AO $12 #define BO $13 #define I $2 #define J $3 #define L $7 #define CO1 $14 #define CO2 $15 #define CO3 $16 #define CO4 $17 #define OFFSET $22 #define KK $23 #define TEMP $24 #define AORIG $25 #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define a5 $f4 #define a6 $f5 #define a7 $f6 #define a8 $f7 #define b1 $f8 #define b2 $f9 #define b3 $f10 #define b4 $f11 #define b5 $f12 #define b6 $f13 #define b7 $f14 #define b8 $f15 #define t11 $f16 #define t21 $f17 #define t31 $f18 #define t41 $f19 #define t12 $f20 #define t22 $f21 #define t32 $f22 #define t42 $f23 #define t13 $f24 #define t23 $f25 #define t33 $f26 #define t43 $f27 #define t14 $f28 #define t24 $f29 #define t34 $f30 #define t44 $f31 #define ALPHA $f15 PROLOGUE daddiu $sp, $sp, -144 SDARG $16, 0($sp) SDARG $17, 8($sp) SDARG $18, 16($sp) SDARG $19, 24($sp) SDARG $20, 32($sp) SDARG $21, 40($sp) sdc1 $f24, 48($sp) sdc1 $f25, 56($sp) sdc1 $f26, 64($sp) sdc1 $f27, 72($sp) sdc1 $f28, 80($sp) SDARG $22, 88($sp) SDARG $23, 96($sp) SDARG $24, 104($sp) SDARG $25, 112($sp) #ifndef __64BIT__ sdc1 $f20,112($sp) sdc1 $f21,120($sp) sdc1 $f22,128($sp) sdc1 $f23,136($sp) #endif # LT compute from left to right, top to bottom LDARG OFFSET, 144($sp) dsll LDC, LDC, BASE_SHIFT # ldc dsra J, N, 2 # j = nc/4 blez J, .L30 nop .L10: # nr=4 daddiu J, J, -1 move CO1, C daddu CO2, C, LDC daddu CO3, CO2, LDC daddu CO4, CO3, LDC MTC $0, t11 # clear result registers MOV t21, t11 MOV t31, t11 MOV t41, t11 MOV t12, t11 MOV t22, t11 MOV t32, t11 MOV t42, t11 dsra I, M, 2 # i = mc/4 move KK, OFFSET # kk is the length of the rectangular data part of panel Ai move AO, A # reset A daddu C, CO4, LDC # fixed pointer C, the write back address blez I, .L20 nop .L11: # mr=4 LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai LD a2, 1 * SIZE(AO) # mr*KK with nr*KK LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) # get 4a LD b1, 0 * SIZE(B) # get 4b LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) MOV t13, t11 # clear result registers MOV t23, t11 MOV t33, t11 MOV t43, t11 MOV t14, t11 MOV t24, t11 MOV t34, t11 MOV t44, t11 dsra L, KK, 2 # L = kk/4 blez L, .L15 move BO, B # .align 3 .L12: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a1, b1 # 1st compute MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 MADD t13, t13, a1, b3 MADD t23, t23, a2, b3 MADD t33, t33, a3, b3 MADD t43, t43, a4, b3 MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 MADD t34, t34, a3, b4 MADD t44, t44, a4, b4 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MADD t11, t11, a5, b5 # 2ed compute MADD t21, t21, a6, b5 MADD t31, t31, a7, b5 MADD t41, t41, a8, b5 MADD t12, t12, a5, b6 MADD t22, t22, a6, b6 MADD t32, t32, a7, b6 MADD t42, t42, a8, 
b6 MADD t13, t13, a5, b7 MADD t23, t23, a6, b7 MADD t33, t33, a7, b7 MADD t43, t43, a8, b7 MADD t14, t14, a5, b8 MADD t24, t24, a6, b8 MADD t34, t34, a7, b8 MADD t44, t44, a8, b8 LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) LD a7, 14 * SIZE(AO) LD a8, 15 * SIZE(AO) LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MADD t11, t11, a1, b1 # 3rd compute MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 MADD t13, t13, a1, b3 MADD t23, t23, a2, b3 MADD t33, t33, a3, b3 MADD t43, t43, a4, b3 MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 MADD t34, t34, a3, b4 MADD t44, t44, a4, b4 daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD t11, t11, a5, b5 # 4th compute MADD t21, t21, a6, b5 MADD t31, t31, a7, b5 MADD t41, t41, a8, b5 MADD t12, t12, a5, b6 MADD t22, t22, a6, b6 MADD t32, t32, a7, b6 MADD t42, t42, a8, b6 MADD t13, t13, a5, b7 MADD t23, t23, a6, b7 MADD t33, t33, a7, b7 MADD t43, t43, a8, b7 MADD t14, t14, a5, b8 MADD t24, t24, a6, b8 MADD t34, t34, a7, b8 MADD t44, t44, a8, b8 daddiu L, L, -1 bgtz L, .L12 nop .align 3 .L15: andi L, KK, 3 # the remainder part: KK-KK/4 blez L, .L18 nop .align 3 .L16: MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 MADD t13, t13, a1, b3 MADD t23, t23, a2, b3 MADD t33, t33, a3, b3 MADD t43, t43, a4, b3 MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 MADD t34, t34, a3, b4 MADD t44, t44, a4, b4 daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 4 * SIZE # BO += 4nr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) daddiu L, L, -1 bgtz L, .L16 nop .L18: # deal with the triangular data part of panel Ai LD b1, 0 * SIZE(BO) # triangular_part*X + rectangular_part = B LD b2, 1 * SIZE(BO) # triangular_part*X = B - rectangular_part LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) SUB t11, b1, t11 SUB t12, b2, t12 SUB t13, b3, t13 SUB t14, b4, t14 LD b5, 4 * SIZE(BO) # sb store in row major LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) SUB t21, b5, t21 SUB t22, b6, t22 SUB t23, b7, t23 SUB t24, b8, t24 LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) SUB t31, b1, t31 SUB t32, b2, t32 SUB t33, b3, t33 SUB t34, b4, t34 LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) SUB t41, b5, t41 SUB t42, b6, t42 SUB t43, b7, t43 SUB t44, b8, t44 LD a1, 0 * SIZE(AO) # sa stores in col major LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MUL t11, a1, t11 MUL t12, a1, t12 MUL t13, a1, t13 MUL t14, a1, t14 NMSUB t21, t21, a2, t11 NMSUB t22, t22, a2, t12 NMSUB t23, t23, a2, t13 NMSUB t24, t24, a2, t14 NMSUB t31, t31, a3, t11 NMSUB t32, t32, a3, t12 NMSUB t33, t33, a3, t13 NMSUB t34, t34, a3, t14 NMSUB t41, t41, a4, t11 NMSUB t42, t42, a4, t12 NMSUB t43, t43, a4, t13 NMSUB t44, t44, a4, t14 LD a5, 5 * SIZE(AO) LD a6, 6 * SIZE(AO) LD a7, 7 * SIZE(AO) MUL t21, a5, t21 MUL t22, a5, t22 MUL t23, a5, t23 MUL t24, a5, t24 NMSUB t31, t31, a6, t21 NMSUB t32, t32, a6, t22 NMSUB t33, t33, a6, t23 NMSUB t34, t34, a6, t24 NMSUB t41, t41, a7, t21 NMSUB 
t42, t42, a7, t22 NMSUB t43, t43, a7, t23 NMSUB t44, t44, a7, t24 LD a8, 10 * SIZE(AO) LD a1, 11 * SIZE(AO) MUL t31, a8, t31 MUL t32, a8, t32 MUL t33, a8, t33 MUL t34, a8, t34 NMSUB t41, t41, a1, t31 NMSUB t42, t42, a1, t32 NMSUB t43, t43, a1, t33 NMSUB t44, t44, a1, t34 LD a2, 15 * SIZE(AO) MUL t41, a2, t41 MUL t42, a2, t42 MUL t43, a2, t43 MUL t44, a2, t44 ST t11, 0 * SIZE(BO) # update packed B ST t12, 1 * SIZE(BO) ST t13, 2 * SIZE(BO) ST t14, 3 * SIZE(BO) ST t21, 4 * SIZE(BO) ST t22, 5 * SIZE(BO) ST t23, 6 * SIZE(BO) ST t24, 7 * SIZE(BO) ST t31, 8 * SIZE(BO) ST t32, 9 * SIZE(BO) ST t33, 10 * SIZE(BO) ST t34, 11 * SIZE(BO) ST t41, 12 * SIZE(BO) ST t42, 13 * SIZE(BO) ST t43, 14 * SIZE(BO) ST t44, 15 * SIZE(BO) ST t11, 0 * SIZE(CO1) # write back ST t21, 1 * SIZE(CO1) ST t31, 2 * SIZE(CO1) ST t41, 3 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) ST t32, 2 * SIZE(CO2) ST t42, 3 * SIZE(CO2) ST t13, 0 * SIZE(CO3) ST t23, 1 * SIZE(CO3) ST t33, 2 * SIZE(CO3) ST t43, 3 * SIZE(CO3) ST t14, 0 * SIZE(CO4) ST t24, 1 * SIZE(CO4) ST t34, 2 * SIZE(CO4) ST t44, 3 * SIZE(CO4) daddiu CO1, CO1, 4 * SIZE # fixed pointers daddiu CO2, CO2, 4 * SIZE daddiu CO3, CO3, 4 * SIZE daddiu CO4, CO4, 4 * SIZE dsubu TEMP, K, KK dsll L, TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AO, L # mov AO to the end of panel Ai daddu BO, BO, TEMP # mov BO to the end of panel Bj daddiu KK, KK, 4 # the length of rectangular data part increases by 4 daddiu I, I, -1 MTC $0, a1 MOV t11, a1 MOV t21, a1 MOV t31, a1 MOV t41, a1 MOV t12, a1 MOV t22, a1 MOV t32, a1 MOV t42, a1 bgtz I, .L11 nop .align 3 .L20: andi I, M, 2 # mr=2,nr=4 blez I, .L50 nop MOV t13, t11 MOV t23, t11 MOV t33, t11 MOV t43, t11 MOV t14, t11 MOV t24, t11 MOV t34, t11 MOV t44, t11 LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai LD a2, 1 * SIZE(AO) # mr*KK with nr*KK LD b1, 0 * SIZE(B) # get 4b LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) dsra L, KK, 2 blez L, .L25 move BO, B .align 3 .L22: LD a5, 2 * SIZE(AO) LD a6, 3 * SIZE(AO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a1, b1 # 1st compute MADD t21, t21, a2, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t13, t13, a1, b3 MADD t23, t23, a2, b3 MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 LD a3, 4 * SIZE(AO) LD a4, 5 * SIZE(AO) LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MADD t11, t11, a5, b5 # 2ed compute MADD t21, t21, a6, b5 MADD t12, t12, a5, b6 MADD t22, t22, a6, b6 MADD t13, t13, a5, b7 MADD t23, t23, a6, b7 MADD t14, t14, a5, b8 MADD t24, t24, a6, b8 LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MADD t11, t11, a3, b1 # 3rd compute MADD t21, t21, a4, b1 MADD t12, t12, a3, b2 MADD t22, t22, a4, b2 MADD t13, t13, a3, b3 MADD t23, t23, a4, b3 MADD t14, t14, a3, b4 MADD t24, t24, a4, b4 daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD t11, t11, a7, b5 # 4th compute MADD t21, t21, a8, b5 MADD t12, t12, a7, b6 MADD t22, t22, a8, b6 MADD t13, t13, a7, b7 MADD t23, t23, a8, b7 MADD t14, t14, a7, b8 MADD t24, t24, a8, b8 daddiu L, L, -1 bgtz L, .L22 nop .align 3 .L25: andi L, KK, 3 blez L, .L28 nop .align 3 .L26: MADD t11, t11, a1, b1 # 3rd compute MADD t21, t21, a2, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t13, t13, a1, 
b3 MADD t23, t23, a2, b3 MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 4 * SIZE # BO += 4nr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) daddiu L, L, -1 bgtz L, .L26 nop .L28: # deal with the triangular part LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) SUB t11, b1, t11 SUB t12, b2, t12 SUB t13, b3, t13 SUB t14, b4, t14 SUB t21, b5, t21 SUB t22, b6, t22 SUB t23, b7, t23 SUB t24, b8, t24 LD b1, 0 * SIZE(AO) # computes the triangular_part LD b2, 1 * SIZE(AO) MUL t11, b1, t11 MUL t12, b1, t12 MUL t13, b1, t13 MUL t14, b1, t14 NMSUB t21, t21, b2, t11 NMSUB t22, t22, b2, t12 NMSUB t23, t23, b2, t13 NMSUB t24, t24, b2, t14 LD b3, 3 * SIZE(AO) MUL t21, b3, t21 MUL t22, b3, t22 MUL t23, b3, t23 MUL t24, b3, t24 ST t11, 0 * SIZE(BO) ST t12, 1 * SIZE(BO) ST t13, 2 * SIZE(BO) ST t14, 3 * SIZE(BO) ST t21, 4 * SIZE(BO) ST t22, 5 * SIZE(BO) ST t23, 6 * SIZE(BO) ST t24, 7 * SIZE(BO) ST t11, 0 * SIZE(CO1) ST t21, 1 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) ST t13, 0 * SIZE(CO3) ST t23, 1 * SIZE(CO3) ST t14, 0 * SIZE(CO4) ST t24, 1 * SIZE(CO4) daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE daddiu CO3, CO3, 2 * SIZE daddiu CO4, CO4, 2 * SIZE dsubu TEMP, K, KK dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AO, L # mov AO to the end of Ai daddu BO, BO, TEMP # mov BO to the end of Bj daddiu KK, KK, 2 # the length of rectangular data part increases by 2 MTC $0, a1 MOV t11, a1 MOV t21, a1 MOV t31, a1 MOV t41, a1 MOV t12, a1 MOV t22, a1 MOV t32, a1 MOV t42, a1 .align 3 .L50: andi I, M, 1 # mr=1,nr=4 blez I, .L29 nop MOV t13, t11 MOV t23, t11 MOV t33, t11 MOV t43, t11 MOV t14, t11 MOV t24, t11 MOV t34, t11 MOV t44, t11 LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai LD b1, 0 * SIZE(B) # get 4b LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) dsra L, KK, 2 blez L, .L55 move BO, B .align 3 .L52: LD a5, 1 * SIZE(AO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a1, b1 # 1st compute MADD t12, t12, a1, b2 MADD t13, t13, a1, b3 MADD t14, t14, a1, b4 LD a3, 2 * SIZE(AO) LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MADD t11, t11, a5, b5 # 2ed compute MADD t12, t12, a5, b6 MADD t13, t13, a5, b7 MADD t14, t14, a5, b8 LD a7, 3 * SIZE(AO) LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MADD t11, t11, a3, b1 # 3rd compute MADD t12, t12, a3, b2 MADD t13, t13, a3, b3 MADD t14, t14, a3, b4 daddiu AO, AO, 4 * SIZE # AO += mr*4kr daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD t11, t11, a7, b5 # 4th compute MADD t12, t12, a7, b6 MADD t13, t13, a7, b7 MADD t14, t14, a7, b8 daddiu L, L, -1 bgtz L, .L52 nop .align 3 .L55: andi L, KK, 3 blez L, .L58 nop .align 3 .L56: MADD t11, t11, a1, b1 # 3rd compute MADD t12, t12, a1, b2 MADD t13, t13, a1, b3 MADD t14, t14, a1, b4 daddiu AO, AO, 1 * SIZE # AO += 2mr daddiu BO, BO, 4 * SIZE # BO += 4nr LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) daddiu L, L, -1 bgtz L, .L56 nop .L58: # deal with the triangular part LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) SUB 
t11, b1, t11 SUB t12, b2, t12 SUB t13, b3, t13 SUB t14, b4, t14 LD b1, 0 * SIZE(AO) # computes the triangular_part MUL t11, b1, t11 MUL t12, b1, t12 MUL t13, b1, t13 MUL t14, b1, t14 ST t11, 0 * SIZE(BO) ST t12, 1 * SIZE(BO) ST t13, 2 * SIZE(BO) ST t14, 3 * SIZE(BO) ST t11, 0 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t13, 0 * SIZE(CO3) ST t14, 0 * SIZE(CO4) daddiu CO1, CO1, 1 * SIZE daddiu CO2, CO2, 1 * SIZE daddiu CO3, CO3, 1 * SIZE daddiu CO4, CO4, 1 * SIZE dsubu TEMP, K, KK dsll L, TEMP, BASE_SHIFT # mr=1 dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AO, L # mov AO to the end of Ai daddu BO, BO, TEMP # mov BO to the end of Bj daddiu KK, KK, 1 # the length of rectangular data part increases by 2 .align 3 .L29: move B, BO # fixed panel Bj bgtz J, .L10 nop .align 3 .L30: andi J, N, 2 # nr=2 blez J, .L70 nop move CO1, C daddu CO2, C, LDC MTC $0, t11 # clear result regusters MOV t21, t11 MOV t31, t11 MOV t41, t11 move KK, OFFSET move AO, A # reset A daddu C, CO2, LDC # fixed dsra I, M, 2 # I = mc/4 blez I, .L40 nop .L31: MOV t12, t11 MOV t22, t11 MOV t32, t11 MOV t42, t11 LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai LD a2, 1 * SIZE(AO) # mr*KK with nr*KK LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) # get 4a LD b1, 0 * SIZE(B) # get 4b LD b2, 1 * SIZE(B) dsra L, KK, 2 # L=kk/4 blez L, .L35 move BO, B # reset B .align 3 .L32: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b5, 2 * SIZE(BO) LD b6, 3 * SIZE(BO) MADD t11, t11, a1, b1 # 1st compute MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) LD b3, 4 * SIZE(BO) LD b4, 5 * SIZE(BO) MADD t11, t11, a5, b5 # 2ed compute MADD t21, t21, a6, b5 MADD t31, t31, a7, b5 MADD t41, t41, a8, b5 MADD t12, t12, a5, b6 MADD t22, t22, a6, b6 MADD t32, t32, a7, b6 MADD t42, t42, a8, b6 LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) LD a7, 14 * SIZE(AO) LD a8, 15 * SIZE(AO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a1, b3 # 3rd compute MADD t21, t21, a2, b3 MADD t31, t31, a3, b3 MADD t41, t41, a4, b3 MADD t12, t12, a1, b4 MADD t22, t22, a2, b4 MADD t32, t32, a3, b4 MADD t42, t42, a4, b4 daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD t11, t11, a5, b7 # 4th compute MADD t21, t21, a6, b7 MADD t31, t31, a7, b7 MADD t41, t41, a8, b7 MADD t12, t12, a5, b8 MADD t22, t22, a6, b8 MADD t32, t32, a7, b8 MADD t42, t42, a8, b8 daddiu L, L, -1 bgtz L, .L32 nop .align 3 .L35: andi L, KK, 3 blez L, .L38 nop .align 3 .L36: MADD t11, t11, a1, b1 # 3rd compute MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 2 * SIZE # BO += 2nr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) daddiu L, L, -1 bgtz L, .L36 nop .L38: # LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) SUB t11, b1, t11 SUB t12, b2, t12 SUB t21, b3, t21 SUB t22, b4, t22 SUB t31, b5, t31 SUB t32, b6, t32 SUB t41, b7, t41 SUB t42, b8, t42 LD a1, 0 * SIZE(AO) # sa stores in col 
major LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MUL t11, a1, t11 MUL t12, a1, t12 NMSUB t21, t21, a2, t11 NMSUB t22, t22, a2, t12 NMSUB t31, t31, a3, t11 NMSUB t32, t32, a3, t12 NMSUB t41, t41, a4, t11 NMSUB t42, t42, a4, t12 LD a5, 5 * SIZE(AO) LD a6, 6 * SIZE(AO) LD a7, 7 * SIZE(AO) MUL t21, a5, t21 MUL t22, a5, t22 NMSUB t31, t31, a6, t21 NMSUB t32, t32, a6, t22 NMSUB t41, t41, a7, t21 NMSUB t42, t42, a7, t22 LD a8, 10 * SIZE(AO) LD a1, 11 * SIZE(AO) MUL t31, a8, t31 MUL t32, a8, t32 NMSUB t41, t41, a1, t31 NMSUB t42, t42, a1, t32 LD a2, 15 * SIZE(AO) MUL t41, a2, t41 MUL t42, a2, t42 ST t11, 0 * SIZE(BO) ST t12, 1 * SIZE(BO) ST t21, 2 * SIZE(BO) ST t22, 3 * SIZE(BO) ST t31, 4 * SIZE(BO) ST t32, 5 * SIZE(BO) ST t41, 6 * SIZE(BO) ST t42, 7 * SIZE(BO) ST t11, 0 * SIZE(CO1) ST t21, 1 * SIZE(CO1) ST t31, 2 * SIZE(CO1) ST t41, 3 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) ST t32, 2 * SIZE(CO2) ST t42, 3 * SIZE(CO2) daddiu CO1, CO1, 4 * SIZE daddiu CO2, CO2, 4 * SIZE dsubu TEMP, K, KK dsll L, TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AO, L # move AO to the end of Ai daddu BO, BO, TEMP daddiu KK, KK, 4 # MTC $0, a1 MOV t11, a1 MOV t21, a1 MOV t31, a1 MOV t41, a1 daddiu I, I, -1 bgtz I, .L31 nop .align 3 .L40: andi I, M, 2 blez I, .L60 nop MOV t12, t11 # clear result registers MOV t22, t21 MOV t32, t31 MOV t42, t41 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) dsra L, KK, 2 blez L, .L45 move BO, B # reset B .align 3 .L42: LD a5, 2 * SIZE(AO) LD a6, 3 * SIZE(AO) LD b5, 2 * SIZE(BO) LD b6, 3 * SIZE(BO) MADD t11, t11, a1, b1 # 1st compute MADD t21, t21, a2, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 LD a3, 4 * SIZE(AO) LD a4, 5 * SIZE(AO) LD b3, 4 * SIZE(BO) LD b4, 5 * SIZE(BO) MADD t11, t11, a5, b5 # 2ed compute MADD t21, t21, a6, b5 MADD t12, t12, a5, b6 MADD t22, t22, a6, b6 LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a3, b3 # 3rd compute MADD t21, t21, a4, b3 MADD t12, t12, a3, b4 MADD t22, t22, a4, b4 daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD t11, t11, a7, b7 # 4th compute MADD t21, t21, a8, b7 MADD t12, t12, a7, b8 MADD t22, t22, a8, b8 daddiu L, L, -1 bgtz L, .L42 nop .align 3 .L45: andi L, KK, 3 blez L, .L48 nop .align 3 .L46: MADD t11, t11, a1, b1 # 3rd compute MADD t21, t21, a2, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 2 * SIZE # BO += 2nr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) daddiu L, L, -1 bgtz L, .L46 nop .L48: LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) SUB t11, b1, t11 SUB t12, b2, t12 SUB t21, b3, t21 SUB t22, b4, t22 LD b1, 0 * SIZE(AO) # computes the triangular_part LD b2, 1 * SIZE(AO) MUL t11, b1, t11 MUL t12, b1, t12 NMSUB t21, t21, b2, t11 NMSUB t22, t22, b2, t12 LD b3, 3 * SIZE(AO) MUL t21, b3, t21 MUL t22, b3, t22 ST t11, 0 * SIZE(BO) ST t12, 1 * SIZE(BO) ST t21, 2 * SIZE(BO) ST t22, 3 * SIZE(BO) ST t11, 0 * SIZE(CO1) ST t21, 1 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE dsubu TEMP, K, KK dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP daddiu KK, KK, 2 MTC $0, a1 MOV t11, a1 MOV t21, a1 MOV t31, a1 MOV t41, a1 .align 3 .L60: andi I, M, 1 # mr=1 blez I, .L49 nop 
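	# mr=1, nr=2 tail case: one row of sa against two columns of sb, following the
	# same pattern as the larger tiles -- a 4-way unrolled k-loop (.L62), a
	# remainder loop (.L66), then the 1x1 triangular update in .L68 and write-back
	# to BO, CO1 and CO2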
MOV t12, t11 # clear result registers MOV t22, t21 MOV t32, t31 MOV t42, t41 LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) dsra L, KK, 2 blez L, .L65 move BO, B # reset B .align 3 .L62: LD a5, 1 * SIZE(AO) LD b5, 2 * SIZE(BO) LD b6, 3 * SIZE(BO) MADD t11, t11, a1, b1 # 1st compute MADD t12, t12, a1, b2 LD a3, 2 * SIZE(AO) LD b3, 4 * SIZE(BO) LD b4, 5 * SIZE(BO) MADD t11, t11, a5, b5 # 2ed compute MADD t12, t12, a5, b6 LD a7, 3 * SIZE(AO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a3, b3 # 3rd compute MADD t12, t12, a3, b4 daddiu AO, AO, 4 * SIZE # AO += mr*4kr daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD t11, t11, a7, b7 # 4th compute MADD t12, t12, a7, b8 daddiu L, L, -1 bgtz L, .L62 nop .align 3 .L65: andi L, KK, 3 blez L, .L68 nop .align 3 .L66: MADD t11, t11, a1, b1 # 3rd compute MADD t12, t12, a1, b2 daddiu AO, AO, 1 * SIZE # AO += 1mr daddiu BO, BO, 2 * SIZE # BO += 2nr LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) daddiu L, L, -1 bgtz L, .L66 nop .L68: LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) SUB t11, b1, t11 SUB t12, b2, t12 LD b1, 0 * SIZE(AO) # computes the triangular_part MUL t11, b1, t11 MUL t12, b1, t12 ST t11, 0 * SIZE(BO) ST t12, 1 * SIZE(BO) ST t11, 0 * SIZE(CO1) ST t12, 0 * SIZE(CO2) daddiu CO1, CO1, 1 * SIZE daddiu CO2, CO2, 1 * SIZE dsubu TEMP, K, KK dsll L, TEMP, BASE_SHIFT # mr=1 dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP daddiu KK, KK, 1 .align 3 .L49: move B, BO .align 3 .L70: andi J, N, 1 # nr=1 blez J, .L999 # END nop move CO1, C move KK, OFFSET move AO, A dsra I, M, 2 blez I, .L80 nop .L71: MTC $0, t11 # clear result regusters MOV t21, t11 MOV t31, t11 MOV t41, t11 LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai LD a2, 1 * SIZE(AO) # mr*KK with nr*KK LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) # get 4a LD b1, 0 * SIZE(B) # get 4b dsra L, KK, 2 blez L, .L75 move BO, B # reset B .align 3 .L72: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b5, 1 * SIZE(BO) MADD t11, t11, a1, b1 # 1st compute MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) LD b3, 2 * SIZE(BO) MADD t11, t11, a5, b5 # 2ed compute MADD t21, t21, a6, b5 MADD t31, t31, a7, b5 MADD t41, t41, a8, b5 LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) LD a7, 14 * SIZE(AO) LD a8, 15 * SIZE(AO) LD b7, 3 * SIZE(BO) MADD t11, t11, a1, b3 # 3rd compute MADD t21, t21, a2, b3 MADD t31, t31, a3, b3 MADD t41, t41, a4, b3 daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD t11, t11, a5, b7 # 4th compute MADD t21, t21, a6, b7 MADD t31, t31, a7, b7 MADD t41, t41, a8, b7 daddiu L, L, -1 bgtz L, .L72 nop .align 3 .L75: andi L, KK, 3 blez L, .L78 nop .align 3 .L76: MADD t11, t11, a1, b1 # 3rd compute MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 1 * SIZE # BO += 1nr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) daddiu L, L, -1 bgtz L, .L76 nop .L78: LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) SUB t11, b1, t11 SUB t21, b2, t21 SUB t31, b3, t31 SUB t41, b4, t41 LD a1, 0 * SIZE(AO) # sa stores in col major LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) 
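	# 4x1 forward substitution against the packed 4x4 triangular block of sa
	# (stored in column major, diagonal assumed pre-inverted by the pack routine):
	# scale t11 by sa[0] and eliminate it from t21/t31/t41 with sa[1..3], then
	# repeat with sa[5..7], sa[10..11] and finally sa[15] for the remaining rows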
LD a4, 3 * SIZE(AO) MUL t11, a1, t11 NMSUB t21, t21, a2, t11 NMSUB t31, t31, a3, t11 NMSUB t41, t41, a4, t11 LD a5, 5 * SIZE(AO) LD a6, 6 * SIZE(AO) LD a7, 7 * SIZE(AO) MUL t21, a5, t21 NMSUB t31, t31, a6, t21 NMSUB t41, t41, a7, t21 LD a8, 10 * SIZE(AO) LD a1, 11 * SIZE(AO) MUL t31, a8, t31 NMSUB t41, t41, a1, t31 LD a2, 15 * SIZE(AO) MUL t41, a2, t41 ST t11, 0 * SIZE(BO) ST t21, 1 * SIZE(BO) ST t31, 2 * SIZE(BO) ST t41, 3 * SIZE(BO) ST t11, 0 * SIZE(CO1) ST t21, 1 * SIZE(CO1) ST t31, 2 * SIZE(CO1) ST t41, 3 * SIZE(CO1) daddiu CO1, CO1, 4 * SIZE dsubu TEMP, K, KK dsll L, TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, 0 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP daddiu KK, KK, 4 daddiu I, I, -1 bgtz I, .L71 nop .align 3 .L80: andi I, M, 2 blez I, .L90 NOP MTC $0, t11 MOV t21, t11 # clear result registers LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(B) dsra L, KK, 2 blez L, .L85 move BO, B .align 3 .L82: LD a5, 2 * SIZE(AO) LD a6, 3 * SIZE(AO) LD b5, 1 * SIZE(BO) MADD t11, t11, a1, b1 # 1st compute MADD t21, t21, a2, b1 LD a3, 4 * SIZE(AO) LD a4, 5 * SIZE(AO) LD b3, 2 * SIZE(BO) MADD t11, t11, a5, b5 # 2ed compute MADD t21, t21, a6, b5 LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b7, 3 * SIZE(BO) MADD t11, t11, a3, b3 # 3rd compute MADD t21, t21, a4, b3 daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD t11, t11, a7, b7 # 4th compute MADD t21, t21, a8, b7 daddiu L, L, -1 bgtz L, .L82 nop .align 3 .L85: andi L, KK, 3 blez L, .L88 nop .align 3 .L86: MADD t11, t11, a1, b1 # 3rd compute MADD t21, t21, a2, b1 daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 1 * SIZE # BO += 1nr LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) daddiu L, L, -1 bgtz L, .L86 nop .L88: LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) SUB t11, b1, t11 SUB t21, b2, t21 LD b1, 0 * SIZE(AO) # computes the triangular_part LD b2, 1 * SIZE(AO) MUL t11, b1, t11 NMSUB t21, t21, b2, t11 LD b3, 3 * SIZE(AO) MUL t21, b3, t21 ST t11, 0 * SIZE(BO) ST t21, 1 * SIZE(BO) ST t11, 0 * SIZE(CO1) ST t21, 1 * SIZE(CO1) daddiu CO1, CO1, 2 * SIZE dsubu TEMP, K, KK dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 0 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP daddiu KK, KK, 2 .align 3 .L90: andi I, M, 1 # mr=1 blez I, .L89 NOP MTC $0, t11 LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(B) dsra L, KK, 2 blez L, .L95 move BO, B .align 3 .L92: LD a5, 1 * SIZE(AO) LD b5, 1 * SIZE(BO) MADD t11, t11, a1, b1 # 1st compute LD a3, 2 * SIZE(AO) LD b3, 2 * SIZE(BO) MADD t11, t11, a5, b5 # 2ed compute LD a7, 3 * SIZE(AO) LD b7, 3 * SIZE(BO) MADD t11, t11, a3, b3 # 3rd compute daddiu AO, AO, 4 * SIZE # AO += 2mr*4kr daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) MADD t11, t11, a7, b7 # 4th compute daddiu L, L, -1 bgtz L, .L92 nop .align 3 .L95: andi L, KK, 3 blez L, .L98 nop .align 3 .L96: MADD t11, t11, a1, b1 # 3rd compute daddiu AO, AO, 1 * SIZE # AO += 2mr daddiu BO, BO, 1 * SIZE # BO += 1nr LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) daddiu L, L, -1 bgtz L, .L96 nop .L98: LD b1, 0 * SIZE(BO) SUB t11, b1, t11 LD b1, 0 * SIZE(AO) # computes the triangular_part MUL t11, b1, t11 ST t11, 0 * SIZE(BO) ST t11, 0 * SIZE(CO1) daddiu CO1, CO1, 1 * SIZE dsubu TEMP, K, KK dsll L, TEMP, BASE_SHIFT dsll TEMP, TEMP, BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP daddiu KK, KK, 1 .align 3 .L89: move B, BO .align 3 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) LDARG $18, 16($sp) LDARG $19, 24($sp) LDARG $20, 32($sp) LDARG 
$21, 40($sp) ldc1 $f24, 48($sp) ldc1 $f25, 56($sp) ldc1 $f26, 64($sp) ldc1 $f27, 72($sp) ldc1 $f28, 80($sp) LDARG $22, 88($sp) LDARG $23, 96($sp) LDARG $24, 104($sp) LDARG $25, 112($sp) #ifndef __64BIT__ ldc1 $f20,112($sp) ldc1 $f21,120($sp) ldc1 $f22,128($sp) ldc1 $f23,136($sp) #endif j $31 daddiu $sp, $sp, 144 EPILOGUE OpenBLAS-0.2.20/kernel/mips64/trsm_kernel_RN_loongson3a.S000066400000000000000000001041131313527062700227600ustar00rootroot00000000000000#define REALNAME ASMNAME #define ASSEMBLER #include "common.h" #define M $4 #define N $5 #define K $6 #define A $8 #define B $9 #define C $10 #define LDC $11 #define AO $12 #define BO $13 #define I $2 #define J $3 #define L $7 #define CO1 $14 #define CO2 $15 #define CO3 $16 #define CO4 $17 #define OFFSET $22 #define KK $23 #define TEMP $24 #define AORIG $25 #define a1 $f0 #define a2 $f1 #define a3 $f26 #define a4 $f27 #define a5 $f28 #define a6 $f29 #define a7 $f30 #define a8 $f31 #define b1 $f2 #define b2 $f3 #define b3 $f4 #define b4 $f5 #define b5 $f6 #define b6 $f7 #define b7 $f8 #define b8 $f9 #define t11 $f10 #define t21 $f11 #define t31 $f12 #define t41 $f13 #define t12 $f14 #define t22 $f15 #define t32 $f16 #define t42 $f17 #define t13 $f18 #define t23 $f19 #define t33 $f20 #define t43 $f21 #define t14 $f22 #define t24 $f23 #define t34 $f24 #define t44 $f25 PROLOGUE daddiu $sp, $sp, -144 SDARG $16, 0($sp) SDARG $17, 8($sp) SDARG $18, 16($sp) SDARG $19, 24($sp) SDARG $20, 32($sp) SDARG $21, 40($sp) sdc1 $f24, 48($sp) sdc1 $f25, 56($sp) sdc1 $f26, 64($sp) sdc1 $f27, 72($sp) sdc1 $f28, 80($sp) SDARG $22, 88($sp) SDARG $23, 96($sp) SDARG $24, 104($sp) SDARG $25, 112($sp) #ifndef __64BIT__ sdc1 $f20,112($sp) sdc1 $f21,120($sp) sdc1 $f22,128($sp) sdc1 $f23,136($sp) #endif # RN compute from top to bottom left to right .align 3 LDARG OFFSET, 144($sp) # get the last parameter dsll LDC, LDC, BASE_SHIFT # LDC * data_Byte neg KK, OFFSET # for RN OFFSET always 0 dsra J, N, 2 # J = NC/4 blez J, .L30 NOP .L10: daddiu J, J, -1 move CO1, C daddu CO2, C, LDC daddu CO3, CO2, LDC daddu CO4, CO3, LDC move AO, A # A is the retangular matrix and B is the trigular matrix daddu C, CO4, LDC # Fixed pointer C dsra I, M, 2 # I=MC/4 blez I, .L20 NOP .align 3 .L11: MTC $0, t11 # clear results registers MOV t21, t11 MOV t31, t11 MOV t41, t11 MOV t12, t11 MOV t22, t11 MOV t32, t11 MOV t42, t11 MOV t13, t11 MOV t23, t11 MOV t33, t11 MOV t43, t11 MOV t14, t11 MOV t24, t11 MOV t34, t11 MOV t44, t11 LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa LD a2, 1 * SIZE(AO) # get 4 a LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj LD b2, 1 * SIZE(B) # get 4 b LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj blez L, .L15 move BO, B # reset B .L12: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 MADD t13, t13, a1, b3 MADD t23, t23, a2, b3 MADD t33, t33, a3, b3 MADD t43, t43, a4, b3 MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 MADD t34, t34, a3, b4 MADD t44, t44, a4, b4 # fisrt LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MADD t11, t11, 
a5, b5 MADD t21, t21, a6, b5 MADD t31, t31, a7, b5 MADD t41, t41, a8, b5 MADD t12, t12, a5, b6 MADD t22, t22, a6, b6 MADD t32, t32, a7, b6 MADD t42, t42, a8, b6 MADD t13, t13, a5, b7 MADD t23, t23, a6, b7 MADD t33, t33, a7, b7 MADD t43, t43, a8, b7 MADD t14, t14, a5, b8 MADD t24, t24, a6, b8 MADD t34, t34, a7, b8 MADD t44, t44, a8, b8 # second LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) LD a7, 14 * SIZE(AO) LD a8, 15 * SIZE(AO) LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 MADD t13, t13, a1, b3 MADD t23, t23, a2, b3 MADD t33, t33, a3, b3 MADD t43, t43, a4, b3 MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 MADD t34, t34, a3, b4 MADD t44, t44, a4, b4 # third daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 16 * SIZE # BP += 4nr*4kr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD t11, t11, a5, b5 MADD t21, t21, a6, b5 MADD t31, t31, a7, b5 MADD t41, t41, a8, b5 MADD t12, t12, a5, b6 MADD t22, t22, a6, b6 MADD t32, t32, a7, b6 MADD t42, t42, a8, b6 MADD t13, t13, a5, b7 MADD t23, t23, a6, b7 MADD t33, t33, a7, b7 MADD t43, t43, a8, b7 MADD t14, t14, a5, b8 MADD t24, t24, a6, b8 MADD t34, t34, a7, b8 MADD t44, t44, a8, b8 # fouth daddiu L, L, -1 bgtz L, .L12 NOP .L15: andi L, KK, 3 # deal with kc remainder part blez L, .L18 NOP .align 3 .L16: MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 MADD t13, t13, a1, b3 MADD t23, t23, a2, b3 MADD t33, t33, a3, b3 MADD t43, t43, a4, b3 MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 MADD t34, t34, a3, b4 MADD t44, t44, a4, b4 daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 4 * SIZE # BP += 4nr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) daddiu L, L, -1 bgtz L, .L16 NOP .align 3 .L18: # .L18 always deal with the trigular data part LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix LD b2, 1 * SIZE(AO) # Fixed results LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) # sa stored as col major SUB t11, b1, t11 SUB t21, b2, t21 SUB t31, b3, t31 SUB t41, b4, t41 LD b5, 4 * SIZE(AO) LD b6, 5 * SIZE(AO) LD b7, 6 * SIZE(AO) LD b8, 7 * SIZE(AO) SUB t12, b5, t12 SUB t22, b6, t22 SUB t32, b7, t32 SUB t42, b8, t42 LD b1, 8 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 10 * SIZE(AO) LD b4, 11 * SIZE(AO) SUB t13, b1, t13 SUB t23, b2, t23 SUB t33, b3, t33 SUB t43, b4, t43 LD b5, 12 * SIZE(AO) LD b6, 13 * SIZE(AO) LD b7, 14 * SIZE(AO) LD b8, 15 * SIZE(AO) SUB t14, b5, t14 SUB t24, b6, t24 SUB t34, b7, t34 SUB t44, b8, t44 LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MUL t11, b1, t11 MUL t21, b1, t21 MUL t31, b1, t31 MUL t41, b1, t41 NMSUB t12, t12, b2, t11 NMSUB t22, t22, b2, t21 NMSUB t32, t32, b2, t31 NMSUB t42, t42, b2, t41 NMSUB t13, t13, b3, t11 NMSUB t23, t23, b3, t21 NMSUB t33, t33, b3, t31 NMSUB t43, t43, b3, t41 NMSUB t14, t14, b4, t11 NMSUB t24, t24, b4, t21 NMSUB t34, t34, b4, t31 NMSUB t44, t44, b4, t41 LD b5, 5 * SIZE(BO) LD b6, 6 * SIZE(BO) LD b7, 7 * SIZE(BO) MUL t12, b5, t12 MUL t22, b5, t22 MUL t32, b5, t32 
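# Note: .L18 solves the 4x4 tile against the upper-triangular block of B
# packed at BO.  Illustrative sketch only (not part of the build), assuming
# the packed diagonal holds reciprocals and U[j][k] denotes the packed block:
#   for (j = 0; j < 4; j++) {          /* x[j] = j-th solved column       */
#       x[j] *= U[j][j];               /* diagonal already inverted       */
#       for (k = j + 1; k < 4; k++)
#           x[k] -= U[j][k] * x[j];    /* eliminate from later columns    */
#   }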
MUL t42, b5, t42 NMSUB t13, t13, b6, t12 NMSUB t23, t23, b6, t22 NMSUB t33, t33, b6, t32 NMSUB t43, t43, b6, t42 NMSUB t14, t14, b7, t12 NMSUB t24, t24, b7, t22 NMSUB t34, t34, b7, t32 NMSUB t44, t44, b7, t42 LD b8, 10 * SIZE(BO) LD b1, 11 * SIZE(BO) MUL t13, b8, t13 MUL t23, b8, t23 MUL t33, b8, t33 MUL t43, b8, t43 NMSUB t14, t14, b1, t13 NMSUB t24, t24, b1, t23 NMSUB t34, t34, b1, t33 NMSUB t44, t44, b1, t43 LD b2, 15 * SIZE(BO) MUL t14, b2, t14 MUL t24, b2, t24 MUL t34, b2, t34 MUL t44, b2, t44 ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute ST t21, 1 * SIZE(AO) ST t31, 2 * SIZE(AO) ST t41, 3 * SIZE(AO) ST t12, 4 * SIZE(AO) ST t22, 5 * SIZE(AO) ST t32, 6 * SIZE(AO) ST t42, 7 * SIZE(AO) ST t13, 8 * SIZE(AO) ST t23, 9 * SIZE(AO) ST t33, 10 * SIZE(AO) ST t43, 11 * SIZE(AO) ST t14, 12 * SIZE(AO) ST t24, 13 * SIZE(AO) ST t34, 14 * SIZE(AO) ST t44, 15 * SIZE(AO) ST t11, 0 * SIZE(CO1) # write back results ST t21, 1 * SIZE(CO1) ST t31, 2 * SIZE(CO1) ST t41, 3 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) ST t32, 2 * SIZE(CO2) ST t42, 3 * SIZE(CO2) ST t13, 0 * SIZE(CO3) ST t23, 1 * SIZE(CO3) ST t33, 2 * SIZE(CO3) ST t43, 3 * SIZE(CO3) ST t14, 0 * SIZE(CO4) ST t24, 1 * SIZE(CO4) ST t34, 2 * SIZE(CO4) ST t44, 3 * SIZE(CO4) daddiu CO1, CO1, 4 * SIZE # fixed address daddiu CO2, CO2, 4 * SIZE daddiu CO3, CO3, 4 * SIZE daddiu CO4, CO4, 4 * SIZE dsubu TEMP, K, KK # temp = kc - retangular data length of every panel dsll L, TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel daddu BO, BO, TEMP # move BO to the end of this panel daddiu I, I, -1 bgtz I, .L11 NOP .align 3 .L20: andi I, M, 2 # mr=2 blez I, .L50 nop MTC $0, t11 # clear results registers MOV t21, t11 MOV t31, t11 MOV t41, t11 MOV t12, t11 MOV t22, t11 MOV t32, t11 MOV t42, t11 MOV t13, t11 MOV t23, t11 MOV t33, t11 MOV t43, t11 MOV t14, t11 MOV t24, t11 MOV t34, t11 MOV t44, t11 LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa LD a2, 1 * SIZE(AO) # get 4 a LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj LD b2, 1 * SIZE(B) # get 4 b LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj blez L, .L25 move BO, B # reset B .L22: LD a5, 2 * SIZE(AO) LD a6, 3 * SIZE(AO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t13, t13, a1, b3 MADD t23, t23, a2, b3 MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 LD a3, 4 * SIZE(AO) LD a4, 5 * SIZE(AO) LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MADD t11, t11, a5, b5 MADD t21, t21, a6, b5 MADD t12, t12, a5, b6 MADD t22, t22, a6, b6 MADD t13, t13, a5, b7 MADD t23, t23, a6, b7 MADD t14, t14, a5, b8 MADD t24, t24, a6, b8 LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MADD t11, t11, a3, b1 MADD t21, t21, a4, b1 MADD t12, t12, a3, b2 MADD t22, t22, a4, b2 MADD t13, t13, a3, b3 MADD t23, t23, a4, b3 MADD t14, t14, a3, b4 MADD t24, t24, a4, b4 daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 16 * SIZE # BP += 4nr*4kr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD t11, t11, a7, b5 MADD t21, t21, a8, b5 MADD t12, t12, a7, b6 MADD t22, t22, a8, b6 MADD t13, t13, a7, b7 MADD t23, t23, a8, b7 MADD t14, 
t14, a7, b8 MADD t24, t24, a8, b8 daddiu L, L, -1 bgtz L, .L22 NOP .L25: andi L, KK, 3 # deal with kc remainder part blez L, .L28 NOP .align 3 .L26: MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t13, t13, a1, b3 MADD t23, t23, a2, b3 MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 4 * SIZE # BP += 4nr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) daddiu L, L, -1 bgtz L, .L26 NOP .align 3 .L28: # .L18 always deal with the trigular data part LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix LD b2, 1 * SIZE(AO) # Fixed results SUB t11, b1, t11 SUB t21, b2, t21 LD b5, 2 * SIZE(AO) LD b6, 3 * SIZE(AO) SUB t12, b5, t12 SUB t22, b6, t22 LD b3, 4 * SIZE(AO) LD b4, 5 * SIZE(AO) SUB t13, b3, t13 SUB t23, b4, t23 LD b7, 6 * SIZE(AO) LD b8, 7 * SIZE(AO) SUB t14, b7, t14 SUB t24, b8, t24 LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MUL t11, b1, t11 MUL t21, b1, t21 NMSUB t12, t12, b2, t11 NMSUB t22, t22, b2, t21 NMSUB t13, t13, b3, t11 NMSUB t23, t23, b3, t21 NMSUB t14, t14, b4, t11 NMSUB t24, t24, b4, t21 LD b5, 5 * SIZE(BO) LD b6, 6 * SIZE(BO) LD b7, 7 * SIZE(BO) MUL t12, b5, t12 MUL t22, b5, t22 NMSUB t13, t13, b6, t12 NMSUB t23, t23, b6, t22 NMSUB t14, t14, b7, t12 NMSUB t24, t24, b7, t22 LD b8, 10 * SIZE(BO) LD b1, 11 * SIZE(BO) MUL t13, b8, t13 MUL t23, b8, t23 NMSUB t14, t14, b1, t13 NMSUB t24, t24, b1, t23 LD b2, 15 * SIZE(BO) MUL t14, b2, t14 MUL t24, b2, t24 ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute ST t21, 1 * SIZE(AO) ST t12, 2 * SIZE(AO) ST t22, 3 * SIZE(AO) ST t13, 4 * SIZE(AO) ST t23, 5 * SIZE(AO) ST t14, 6 * SIZE(AO) ST t24, 7 * SIZE(AO) ST t11, 0 * SIZE(CO1) # write back results ST t21, 1 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) ST t13, 0 * SIZE(CO3) ST t23, 1 * SIZE(CO3) ST t14, 0 * SIZE(CO4) ST t24, 1 * SIZE(CO4) daddiu CO1, CO1, 2 * SIZE # fixed address daddiu CO2, CO2, 2 * SIZE # mr=2 daddiu CO3, CO3, 2 * SIZE daddiu CO4, CO4, 2 * SIZE dsubu TEMP, K, KK # temp = kc - retangular data length of every panel dsll L, TEMP, 1 + BASE_SHIFT # mr=2 dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AO, L # move AO to the end of this panel. 
also the beginning of next panel daddu BO, BO, TEMP # move BO to the end of this panel .align 3 .L50: andi I, M, 1 # mr=1 blez I, .L29 nop MTC $0, t11 # clear results registers MOV t21, t11 MOV t31, t11 MOV t41, t11 MOV t12, t11 MOV t22, t11 MOV t32, t11 MOV t42, t11 MOV t13, t11 MOV t23, t11 MOV t33, t11 MOV t43, t11 MOV t14, t11 MOV t24, t11 MOV t34, t11 MOV t44, t11 LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj LD b2, 1 * SIZE(B) # get 4 b LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj blez L, .L55 move BO, B # reset B .L52: LD a5, 1 * SIZE(AO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a1, b1 MADD t12, t12, a1, b2 MADD t13, t13, a1, b3 MADD t14, t14, a1, b4 LD a3, 2 * SIZE(AO) LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MADD t11, t11, a5, b5 MADD t12, t12, a5, b6 MADD t13, t13, a5, b7 MADD t14, t14, a5, b8 LD a7, 3 * SIZE(AO) LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MADD t11, t11, a3, b1 MADD t12, t12, a3, b2 MADD t13, t13, a3, b3 MADD t14, t14, a3, b4 daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr daddiu BO, BO, 16 * SIZE # BP += 4nr*4kr LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD t11, t11, a7, b5 MADD t12, t12, a7, b6 MADD t13, t13, a7, b7 MADD t14, t14, a7, b8 daddiu L, L, -1 bgtz L, .L52 NOP .L55: andi L, KK, 3 # deal with kc remainder part blez L, .L58 NOP .align 3 .L56: MADD t11, t11, a1, b1 MADD t12, t12, a1, b2 MADD t13, t13, a1, b3 MADD t14, t14, a1, b4 daddiu AO, AO, 1 * SIZE # AO += 1mr daddiu BO, BO, 4 * SIZE # BP += 4nr LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) daddiu L, L, -1 bgtz L, .L56 NOP .align 3 .L58: # .L18 always deal with the trigular data part LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix LD b5, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b7, 3 * SIZE(AO) SUB t11, b1, t11 SUB t12, b5, t12 SUB t13, b3, t13 SUB t14, b7, t14 LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MUL t11, b1, t11 NMSUB t12, t12, b2, t11 NMSUB t13, t13, b3, t11 NMSUB t14, t14, b4, t11 LD b5, 5 * SIZE(BO) LD b6, 6 * SIZE(BO) LD b7, 7 * SIZE(BO) MUL t12, b5, t12 NMSUB t13, t13, b6, t12 NMSUB t14, t14, b7, t12 LD b8, 10 * SIZE(BO) LD b1, 11 * SIZE(BO) MUL t13, b8, t13 NMSUB t14, t14, b1, t13 LD b2, 15 * SIZE(BO) MUL t14, b2, t14 ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute ST t12, 1 * SIZE(AO) ST t13, 2 * SIZE(AO) ST t14, 3 * SIZE(AO) ST t11, 0 * SIZE(CO1) # write back results ST t12, 0 * SIZE(CO2) ST t13, 0 * SIZE(CO3) ST t14, 0 * SIZE(CO4) daddiu CO1, CO1, 1 * SIZE # fixed address daddiu CO2, CO2, 1 * SIZE # mr=2 daddiu CO3, CO3, 1 * SIZE daddiu CO4, CO4, 1 * SIZE dsubu TEMP, K, KK # temp = kc - retangular data length of every panel dsll L, TEMP, BASE_SHIFT # mr=2 dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AO, L # move AO to the end of this panel. 
also the beginning of next panel daddu BO, BO, TEMP # move BO to the end of this panel .align 3 .L29: move B, BO # change to next panel of Bj daddiu KK, KK, 4 # rectangular data length increase by 4 bgtz J, .L10 NOP .align 3 .L30: andi J, N, 2 blez J, .L70 nop move CO1, C daddu CO2, C, LDC move AO, A # A is the retangular matrix and B is the trigular matrix daddu C, CO2, LDC # Fixed pointer C dsra I, M, 2 # I=MC/4 blez I, .L40 NOP .align 3 .L31: MTC $0, t11 # clear results registers MOV t21, t11 MOV t31, t11 MOV t41, t11 MOV t12, t11 MOV t22, t11 MOV t32, t11 MOV t42, t11 LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa LD a2, 1 * SIZE(AO) # get 4 a LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj LD b2, 1 * SIZE(B) # get 4 b dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj blez L, .L35 move BO, B # reset B .L32: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b5, 2 * SIZE(BO) LD b6, 3 * SIZE(BO) MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) LD b3, 4 * SIZE(BO) LD b4, 5 * SIZE(BO) MADD t11, t11, a5, b5 MADD t21, t21, a6, b5 MADD t31, t31, a7, b5 MADD t41, t41, a8, b5 MADD t12, t12, a5, b6 MADD t22, t22, a6, b6 MADD t32, t32, a7, b6 MADD t42, t42, a8, b6 LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) LD a7, 14 * SIZE(AO) LD a8, 15 * SIZE(AO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a1, b3 MADD t21, t21, a2, b3 MADD t31, t31, a3, b3 MADD t41, t41, a4, b3 MADD t12, t12, a1, b4 MADD t22, t22, a2, b4 MADD t32, t32, a3, b4 MADD t42, t42, a4, b4 daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 8 * SIZE # BP += 2nr*4kr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD t11, t11, a5, b7 MADD t21, t21, a6, b7 MADD t31, t31, a7, b7 MADD t41, t41, a8, b7 MADD t12, t12, a5, b8 MADD t22, t22, a6, b8 MADD t32, t32, a7, b8 MADD t42, t42, a8, b8 daddiu L, L, -1 bgtz L, .L32 NOP .L35: andi L, KK, 3 # deal with kc remainder part blez L, .L38 NOP .align 3 .L36: MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 2 * SIZE # BP += 2nr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) daddiu L, L, -1 bgtz L, .L36 NOP .align 3 .L38: # .L38 always deal with the trigular data part LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix LD b2, 1 * SIZE(AO) # Fixed results LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) # sa stored as col major SUB t11, b1, t11 SUB t21, b2, t21 SUB t31, b3, t31 SUB t41, b4, t41 LD b5, 4 * SIZE(AO) LD b6, 5 * SIZE(AO) LD b7, 6 * SIZE(AO) LD b8, 7 * SIZE(AO) SUB t12, b5, t12 SUB t22, b6, t22 SUB t32, b7, t32 SUB t42, b8, t42 LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj LD b2, 1 * SIZE(BO) MUL t11, b1, t11 MUL t21, b1, t21 MUL t31, b1, t31 MUL t41, b1, t41 NMSUB t12, t12, b2, t11 NMSUB t22, t22, b2, t21 NMSUB t32, t32, b2, t31 NMSUB t42, t42, b2, t41 LD b5, 3 * SIZE(BO) MUL t12, b5, t12 MUL t22, b5, t22 MUL t32, b5, t32 MUL t42, b5, t42 ST t11, 0 * SIZE(AO) # update packed blockA for 
follow-up compute ST t21, 1 * SIZE(AO) ST t31, 2 * SIZE(AO) ST t41, 3 * SIZE(AO) ST t12, 4 * SIZE(AO) ST t22, 5 * SIZE(AO) ST t32, 6 * SIZE(AO) ST t42, 7 * SIZE(AO) ST t11, 0 * SIZE(CO1) # write back results ST t21, 1 * SIZE(CO1) ST t31, 2 * SIZE(CO1) ST t41, 3 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) ST t32, 2 * SIZE(CO2) ST t42, 3 * SIZE(CO2) daddiu CO1, CO1, 4 * SIZE # fixed address daddiu CO2, CO2, 4 * SIZE dsubu TEMP, K, KK # temp = kc - retangular data length of every panel dsll L, TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2 daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel daddu BO, BO, TEMP # move BO to the end of this panel daddiu I, I, -1 bgtz I, .L31 NOP .align 3 .L40: andi I, M,2 blez I,.L60 nop MTC $0, t11 # clear results registers MOV t21, t11 MOV t12, t11 MOV t22, t11 LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa LD a2, 1 * SIZE(AO) # get 4 a LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj LD b2, 1 * SIZE(B) # get 4 b dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj blez L, .L45 move BO, B # reset B .L42: LD a5, 2 * SIZE(AO) LD a6, 3 * SIZE(AO) LD b5, 2 * SIZE(BO) LD b6, 3 * SIZE(BO) MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 LD a3, 4 * SIZE(AO) LD a4, 5 * SIZE(AO) LD b3, 4 * SIZE(BO) LD b4, 5 * SIZE(BO) MADD t11, t11, a5, b5 MADD t21, t21, a6, b5 MADD t12, t12, a5, b6 MADD t22, t22, a6, b6 LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a3, b3 MADD t21, t21, a4, b3 MADD t12, t12, a3, b4 MADD t22, t22, a4, b4 daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 8 * SIZE # BP += 2nr*4kr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD t11, t11, a7, b7 MADD t21, t21, a8, b7 MADD t12, t12, a7, b8 MADD t22, t22, a8, b8 daddiu L, L, -1 bgtz L, .L42 NOP .L45: andi L, KK, 3 # deal with kc remainder part blez L, .L48 NOP .align 3 .L46: MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 2 * SIZE # BP += 2nr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) daddiu L, L, -1 bgtz L, .L46 NOP .align 3 .L48: # .L48 always deal with the trigular data part LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix LD b2, 1 * SIZE(AO) # Fixed results SUB t11, b1, t11 SUB t21, b2, t21 LD b5, 2 * SIZE(AO) LD b6, 3 * SIZE(AO) SUB t12, b5, t12 SUB t22, b6, t22 LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj LD b2, 1 * SIZE(BO) MUL t11, b1, t11 MUL t21, b1, t21 NMSUB t12, t12, b2, t11 NMSUB t22, t22, b2, t21 LD b5, 3 * SIZE(BO) MUL t12, b5, t12 MUL t22, b5, t22 ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute ST t21, 1 * SIZE(AO) ST t12, 2 * SIZE(AO) ST t22, 3 * SIZE(AO) ST t11, 0 * SIZE(CO1) # write back results ST t21, 1 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) daddiu CO1, CO1, 2 * SIZE # fixed address daddiu CO2, CO2, 2 * SIZE dsubu TEMP, K, KK # temp = kc - retangular data length of every panel dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2 daddu AO, AO, L # move AO to the end of this panel. 
also the beginning of next panel daddu BO, BO, TEMP # move BO to the end of this panel .align 3 .L60: andi I,M,1 # nr=2 mr=1 blez I,.L39 nop MTC $0, t11 # clear results registers MOV t12, t11 LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj LD b2, 1 * SIZE(B) # get 4 b dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj blez L, .L65 move BO, B # reset B .L62: LD a5, 1 * SIZE(AO) LD b5, 2 * SIZE(BO) LD b6, 3 * SIZE(BO) MADD t11, t11, a1, b1 MADD t12, t12, a1, b2 LD a3, 2 * SIZE(AO) LD b3, 4 * SIZE(BO) LD b4, 5 * SIZE(BO) MADD t11, t11, a5, b5 MADD t12, t12, a5, b6 LD a7, 3 * SIZE(AO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a3, b3 MADD t12, t12, a3, b4 daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr daddiu BO, BO, 8 * SIZE # BP += 2nr*4kr LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD t11, t11, a7, b7 MADD t12, t12, a7, b8 daddiu L, L, -1 bgtz L, .L62 NOP .L65: andi L, KK, 3 # deal with kc remainder part blez L, .L68 NOP .align 3 .L66: MADD t11, t11, a1, b1 MADD t12, t12, a1, b2 daddiu AO, AO, 1 * SIZE # AO += mr daddiu BO, BO, 2 * SIZE # BP += 2nr LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) daddiu L, L, -1 bgtz L, .L66 NOP .align 3 .L68: # .L48 always deal with the trigular data part LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix LD b5, 1 * SIZE(AO) # Fixed results SUB t11, b1, t11 SUB t12, b5, t12 LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj LD b2, 1 * SIZE(BO) MUL t11, b1, t11 NMSUB t12, t12, b2, t11 LD b5, 3 * SIZE(BO) MUL t12, b5, t12 ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute ST t12, 1 * SIZE(AO) ST t11, 0 * SIZE(CO1) # write back results ST t12, 0 * SIZE(CO2) daddiu CO1, CO1, 1 * SIZE # fixed address daddiu CO2, CO2, 1 * SIZE dsubu TEMP, K, KK # temp = kc - retangular data length of every panel dsll L, TEMP, BASE_SHIFT # mr=1 dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2 daddu AO, AO, L # move AO to the end of this panel. 
also the beginning of next panel daddu BO, BO, TEMP # move BO to the end of this panel .align 3 .L39: move B, BO # change to next panel of Bj daddiu KK, KK, 2 # rectangular data length increase by 4 .align 3 .L70: andi J, N, 1 # nr=1 blez J, .L999 NOP move CO1, C move AO, A daddu C, CO1, LDC dsra I, M, 2 # I=MC/4 blez I, .L80 NOP .align 3 .L71: MTC $0, t11 # clear results registers MOV t21, t11 MOV t31, t11 MOV t41, t11 LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa LD a2, 1 * SIZE(AO) # get 4 a LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj blez L, .L75 move BO, B # reset B .L72: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b5, 1 * SIZE(BO) MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) LD b3, 2 * SIZE(BO) MADD t11, t11, a5, b5 MADD t21, t21, a6, b5 MADD t31, t31, a7, b5 MADD t41, t41, a8, b5 LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) LD a7, 14 * SIZE(AO) LD a8, 15 * SIZE(AO) LD b7, 3 * SIZE(BO) MADD t11, t11, a1, b3 MADD t21, t21, a2, b3 MADD t31, t31, a3, b3 MADD t41, t41, a4, b3 daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 4 * SIZE # BP += 1nr*4kr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD t11, t11, a5, b7 MADD t21, t21, a6, b7 MADD t31, t31, a7, b7 MADD t41, t41, a8, b7 daddiu L, L, -1 bgtz L, .L72 NOP .L75: andi L, KK, 3 # deal with kc remainder part blez L, .L78 NOP .align 3 .L76: MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 1 * SIZE # BP += 1nr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) daddiu L, L, -1 bgtz L, .L76 NOP .align 3 .L78: # .L78 always deal with the trigular data part LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix LD b2, 1 * SIZE(AO) # Fixed results LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) # sa stored as col major SUB t11, b1, t11 SUB t21, b2, t21 SUB t31, b3, t31 SUB t41, b4, t41 LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj MUL t11, b1, t11 MUL t21, b1, t21 MUL t31, b1, t31 MUL t41, b1, t41 ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute ST t21, 1 * SIZE(AO) ST t31, 2 * SIZE(AO) ST t41, 3 * SIZE(AO) ST t11, 0 * SIZE(CO1) # write back results ST t21, 1 * SIZE(CO1) ST t31, 2 * SIZE(CO1) ST t41, 3 * SIZE(CO1) daddiu CO1, CO1, 4 * SIZE # fixed address dsubu TEMP, K, KK # temp = kc - retangular data length of every panel dsll L, TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, BASE_SHIFT # nr=1 daddu AO, AO, L # move AO to the end of this panel. 
also the beginning of next panel daddu BO, BO, TEMP # move BO to the end of this panel daddiu I, I, -1 bgtz I, .L71 NOP .align 3 .L80: andi I, M, 2 # mr=2 blez I, .L90 nop MTC $0, t11 # clear results registers MOV t21, t11 LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa LD a2, 1 * SIZE(AO) # get 4 a LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj blez L, .L85 move BO, B # reset B .L82: LD a5, 2 * SIZE(AO) LD a6, 3 * SIZE(AO) LD b5, 1 * SIZE(BO) MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 LD a3, 4 * SIZE(AO) LD a4, 5 * SIZE(AO) LD b3, 2 * SIZE(BO) MADD t11, t11, a5, b5 MADD t21, t21, a6, b5 LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b7, 3 * SIZE(BO) MADD t11, t11, a3, b3 MADD t21, t21, a4, b3 daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 4 * SIZE # BP += 1nr*4kr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD t11, t11, a7, b7 MADD t21, t21, a8, b7 daddiu L, L, -1 bgtz L, .L82 NOP .L85: andi L, KK, 3 # deal with kc remainder part blez L, .L88 NOP .align 3 .L86: MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 1 * SIZE # BP += 1nr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) daddiu L, L, -1 bgtz L, .L86 NOP .align 3 .L88: # .L88 always deal with the trigular data part LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix LD b2, 1 * SIZE(AO) # Fixed results SUB t11, b1, t11 SUB t21, b2, t21 LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj MUL t11, b1, t11 MUL t21, b1, t21 ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute ST t21, 1 * SIZE(AO) ST t11, 0 * SIZE(CO1) # write back results ST t21, 1 * SIZE(CO1) daddiu CO1, CO1, 2 * SIZE # fixed address dsubu TEMP, K, KK # temp = kc - retangular data length of every panel dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, BASE_SHIFT # nr=1 daddu AO, AO, L # move AO to the end of this panel. 
also the beginning of next panel daddu BO, BO, TEMP # move BO to the end of this panel .align 3 .L90: andi I, M, 1 # mr=1 blez I, .L79 nop MTC $0, t11 # clear results registers LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj blez L, .L95 move BO, B # reset B .L92: LD a5, 1 * SIZE(AO) LD b5, 1 * SIZE(BO) MADD t11, t11, a1, b1 LD a3, 2 * SIZE(AO) LD b3, 2 * SIZE(BO) MADD t11, t11, a5, b5 LD a7, 3 * SIZE(AO) LD b7, 3 * SIZE(BO) MADD t11, t11, a3, b3 daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr daddiu BO, BO, 4 * SIZE # BP += 1nr*4kr LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD t11, t11, a7, b7 daddiu L, L, -1 bgtz L, .L92 NOP .L95: andi L, KK, 3 # deal with kc remainder part blez L, .L98 NOP .align 3 .L96: MADD t11, t11, a1, b1 daddiu AO, AO, 1 * SIZE # AO += 2mr daddiu BO, BO, 1 * SIZE # BP += 1nr LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) daddiu L, L, -1 bgtz L, .L96 NOP .align 3 .L98: # .L98 always deal with the trigular data part LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix SUB t11, b1, t11 LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj MUL t11, b1, t11 ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute ST t11, 0 * SIZE(CO1) # write back results daddiu CO1, CO1, 1 * SIZE # fixed address dsubu TEMP, K, KK # temp = kc - retangular data length of every panel dsll L, TEMP, BASE_SHIFT dsll TEMP, TEMP, BASE_SHIFT # nr=1 daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel daddu BO, BO, TEMP # move BO to the end of this panel .align 3 .L79: move B, BO daddiu KK, KK, 1 .align 3 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) LDARG $18, 16($sp) LDARG $19, 24($sp) LDARG $20, 32($sp) LDARG $21, 40($sp) ldc1 $f24, 48($sp) ldc1 $f25, 56($sp) ldc1 $f26, 64($sp) ldc1 $f27, 72($sp) ldc1 $f28, 80($sp) LDARG $22, 88($sp) LDARG $23, 96($sp) LDARG $24, 104($sp) LDARG $25, 112($sp) #ifndef __64BIT__ ldc1 $f20,112($sp) ldc1 $f21,120($sp) ldc1 $f22,128($sp) ldc1 $f23,136($sp) #endif j $31 daddiu $sp, $sp, 144 EPILOGUE OpenBLAS-0.2.20/kernel/mips64/trsm_kernel_RT.S000066400000000000000000001623731313527062700206400ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M $4 #define N $5 #define K $6 #define A $8 #define B $9 #define C $10 #define LDC $11 #define AO $12 #define BO $13 #define I $2 #define J $3 #define L $7 #define CO1 $14 #define CO2 $15 #define CO3 $16 #define CO4 $17 #define CO5 $18 #define CO6 $19 #define CO7 $20 #define CO8 $21 #define OFFSET $22 #define KK $23 #define TEMP $24 #define AORIG $25 #define a1 $f0 #define a2 $f1 #define a3 $f27 #define a4 $f28 #define b1 $f2 #define b2 $f3 #define b3 $f4 #define b4 $f5 #define b5 $f6 #define b6 $f7 #define b7 $f8 #define b8 $f9 #define a5 b8 #define c11 $f10 #define c12 $f11 #define c21 $f12 #define c22 $f13 #define c31 $f14 #define c32 $f16 #define c41 $f17 #define c42 $f18 #define c51 $f19 #define c52 $f20 #define c61 $f21 #define c62 $f22 #define c71 $f23 #define c72 $f24 #define c81 $f25 #define c82 $f26 #define ALPHA $f15 PROLOGUE daddiu $sp, $sp, -144 SDARG $16, 0($sp) SDARG $17, 8($sp) SDARG $18, 16($sp) SDARG $19, 24($sp) SDARG $20, 32($sp) SDARG $21, 40($sp) sdc1 $f24, 48($sp) sdc1 $f25, 56($sp) sdc1 $f26, 64($sp) sdc1 $f27, 72($sp) sdc1 $f28, 80($sp) SDARG $22, 88($sp) SDARG $23, 96($sp) SDARG $24, 104($sp) SDARG $25, 112($sp) #ifndef __64BIT__ sdc1 $f20,112($sp) sdc1 $f21,120($sp) sdc1 $f22,128($sp) sdc1 $f23,136($sp) #endif LDARG OFFSET, 144($sp) dsll LDC, LDC, BASE_SHIFT #ifdef LN mult M, K mflo TEMP dsll TEMP, TEMP, BASE_SHIFT daddu A, A, TEMP dsll TEMP, M, BASE_SHIFT daddu C, C, TEMP #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mult N, K mflo TEMP dsll TEMP, TEMP, BASE_SHIFT daddu B, B, TEMP mult N, LDC mflo TEMP daddu C, C, TEMP dsubu KK, N, OFFSET #endif andi J, N, 1 blez J, .L30 NOP #ifdef RT dsll TEMP, K, BASE_SHIFT dsubu B, B, TEMP dsubu C, C, LDC #endif move AO, A move CO1, C #ifdef LN daddu KK, M, OFFSET #endif #ifdef LT move KK, OFFSET #endif #if defined(LN) || defined(RT) move AORIG, A #else move AO, A #endif #ifndef RT daddu C, CO1, LDC #endif dsra I, M, 1 blez I, .L80 NOP .L71: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a5, 4 * SIZE(AO) LD b1, 0 * SIZE(B) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 LD b3, 2 * SIZE(B) LD b5, 4 * SIZE(B) dsra L, KK, 2 LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) blez L, .L75 move BO, B #else #ifdef LN dsll TEMP, K, 1 + BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 0 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a5, 4 * SIZE(AO) LD b1, 0 * SIZE(BO) MOV c12, c11 LD b2, 1 * SIZE(BO) MOV c22, c11 LD b3, 2 * 
SIZE(BO) LD b5, 4 * SIZE(BO) dsra L, TEMP, 2 LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) blez L, .L75 NOP #endif .align 3 .L72: LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 LD a1, 2 * SIZE(AO) LD a2, 3 * SIZE(AO) LD b1, 1 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 LD a1, 4 * SIZE(AO) LD a2, 5 * SIZE(AO) LD b1, 2 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 LD a1, 6 * SIZE(AO) LD a2, 7 * SIZE(AO) LD b1, 3 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 daddiu L, L, -1 daddiu AO, AO, 8 * SIZE bgtz L, .L72 daddiu BO, BO, 4 * SIZE .align 3 .L75: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L78 NOP .align 3 .L76: LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 daddiu L, L, -1 daddiu AO, AO, 2 * SIZE bgtz L, .L76 daddiu BO, BO, 1 * SIZE .L78: ADD c11, c11, c21 ADD c12, c12, c22 #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -2 #else daddiu TEMP, KK, -1 #endif dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 0 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) SUB c11, b1, c11 SUB c12, b2, c12 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) SUB c11, b1, c11 SUB c12, b2, c12 #endif #ifdef LN LD b1, 3 * SIZE(AO) LD b2, 2 * SIZE(AO) LD b3, 0 * SIZE(AO) MUL c12, b1, c12 NMSUB c11, c11, b2, c12 MUL c11, b3, c11 #endif #ifdef LT LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 3 * SIZE(AO) MUL c11, b1, c11 NMSUB c12, c12, b2, c11 MUL c12, b3, c12 #endif #if defined(RN) || defined(RT) LD b1, 0 * SIZE(BO) MUL c11, b1, c11 MUL c12, b1, c12 #endif #ifdef LN daddiu CO1, CO1, -2 * SIZE #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c12, 1 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c12, 1 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) ST c12, 1 * SIZE(CO1) #ifndef LN daddiu CO1, CO1, 2 * SIZE #endif #ifdef RT dsll TEMP, K, 1 + BASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 0 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 2 #endif #ifdef LN daddiu KK, KK, -2 #endif daddiu I, I, -1 bgtz I, .L71 NOP .align 3 .L80: andi I, M, 1 blez I, .L89 NOP #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) MOV c21, c11 LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) LD b5, 4 * SIZE(B) LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) dsra L, KK, 2 blez L, .L85 move BO, B #else #ifdef LN dsll TEMP, K, BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll TEMP, KK, BASE_SHIFT daddu AO, AORIG, TEMP daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MOV c21, c11 LD b5, 4 * SIZE(BO) LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) dsra L, TEMP, 2 blez L, .L85 NOP #endif .align 3 .L82: LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD c11, c11, a1, b1 LD a1, 1 * SIZE(AO) LD b1, 1 * SIZE(BO) MADD c21, c21, a1, b1 LD a1, 2 * SIZE(AO) LD b1, 2 * SIZE(BO) MADD c11, c11, a1, b1 LD a1, 3 * SIZE(AO) LD b1, 3 * SIZE(BO) MADD c21, c21, a1, b1 daddiu L, L, -1 daddiu AO, AO, 4 * SIZE bgtz L, .L82 daddiu BO, BO, 4 * SIZE .align 3 .L85: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L88 NOP 
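# Note: .L86 below is the scalar remainder loop of the 1x1 tile, covering the
# leftover (KK or TEMP) & 3 iterations that the 4-way unrolled .L82 loop
# does not handle.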
.align 3 .L86: LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD c11, c11, a1, b1 daddiu L, L, -1 daddiu AO, AO, 1 * SIZE bgtz L, .L86 daddiu BO, BO, 1 * SIZE .L88: ADD c11, c11, c21 #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -1 #else daddiu TEMP, KK, -1 #endif dsll TEMP, TEMP, 0 + BASE_SHIFT daddu AO, AORIG, TEMP daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) SUB c11, b1, c11 #else LD b1, 0 * SIZE(AO) SUB c11, b1, c11 #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(AO) MUL c11, b1, c11 #endif #if defined(RN) || defined(RT) LD b1, 0 * SIZE(BO) MUL c11, b1, c11 #endif #ifdef LN daddiu CO1, CO1, -1 * SIZE #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) #else ST c11, 0 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) #ifndef LN daddiu CO1, CO1, 1 * SIZE #endif #ifdef RT dsll TEMP, K, BASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll TEMP, TEMP, 0 + BASE_SHIFT daddu AO, AO, TEMP daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 1 #endif #ifdef LN daddiu KK, KK, -1 #endif .align 3 .L89: #ifdef LN dsll TEMP, K, BASE_SHIFT daddu B, B, TEMP #endif #if defined(LT) || defined(RN) move B, BO #endif #ifdef RN daddiu KK, KK, 1 #endif #ifdef RT daddiu KK, KK, -1 #endif .align 3 .L30: andi J, N, 2 blez J, .L50 NOP #ifdef RT dsll TEMP, K, 1 + BASE_SHIFT dsubu B, B, TEMP dsll TEMP, LDC, 1 dsubu C, C, TEMP #endif move AO, A move CO1, C daddu CO2, C, LDC #ifdef LN daddu KK, M, OFFSET #endif #ifdef LT move KK, OFFSET #endif #if defined(LN) || defined(RT) move AORIG, A #else move AO, A #endif #ifndef RT daddu C, CO2, LDC #endif dsra I, M, 1 blez I, .L60 NOP .L51: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a5, 4 * SIZE(AO) LD b1, 0 * SIZE(B) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 LD b3, 2 * SIZE(B) LD b5, 4 * SIZE(B) dsra L, KK, 2 LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) blez L, .L55 move BO, B #else #ifdef LN dsll TEMP, K, 1 + BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 1 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a5, 4 * SIZE(AO) LD b1, 0 * SIZE(BO) MOV c12, c11 LD b2, 1 * SIZE(BO) MOV c22, c11 LD b3, 2 * SIZE(BO) LD b5, 4 * SIZE(BO) dsra L, TEMP, 2 LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) blez L, .L55 NOP #endif .align 3 .L52: MADD c11, c11, a1, b1 LD a3, 2 * SIZE(AO) MADD c21, c21, a1, b2 LD b4, 3 * SIZE(BO) MADD c12, c12, a2, b1 LD a4, 3 * SIZE(AO) MADD c22, c22, a2, b2 LD b1, 8 * SIZE(BO) MADD c11, c11, a3, b3 LD a1, 8 * SIZE(AO) MADD c21, c21, a3, b4 LD b2, 5 * SIZE(BO) MADD c12, c12, a4, b3 LD a2, 5 * SIZE(AO) MADD c22, c22, a4, b4 LD b3, 6 * SIZE(BO) MADD c11, c11, a5, b5 LD a3, 6 * SIZE(AO) MADD c21, c21, a5, b2 LD b4, 7 * SIZE(BO) MADD c12, c12, a2, b5 LD a4, 7 * SIZE(AO) MADD c22, c22, a2, b2 LD b5, 12 * SIZE(BO) MADD c11, c11, a3, b3 LD a5, 12 * SIZE(AO) MADD c21, c21, a3, b4 LD b2, 9 * SIZE(BO) MADD c12, c12, a4, b3 LD a2, 9 * SIZE(AO) MADD c22, c22, a4, b4 LD b3, 10 * SIZE(BO) daddiu AO, AO, 8 * SIZE daddiu L, L, -1 bgtz L, .L52 daddiu BO, BO, 8 * SIZE .align 3 .L55: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L58 NOP .align 3 .L56: MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 LD a1, 2 * SIZE(AO) MADD c12, c12, a2, b1 LD b1, 2 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 3 * SIZE(BO) daddiu L, L, -1 daddiu AO, AO, 2 * SIZE bgtz L, .L56 daddiu BO, BO, 2 * 
SIZE .L58: #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -2 #else daddiu TEMP, KK, -2 #endif dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) SUB c11, b1, c11 SUB c21, b2, c21 SUB c12, b3, c12 SUB c22, b4, c22 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) SUB c11, b1, c11 SUB c12, b2, c12 SUB c21, b3, c21 SUB c22, b4, c22 #endif #ifdef LN LD b1, 3 * SIZE(AO) LD b2, 2 * SIZE(AO) LD b3, 0 * SIZE(AO) MUL c12, b1, c12 MUL c22, b1, c22 NMSUB c11, c11, b2, c12 NMSUB c21, c21, b2, c22 MUL c11, b3, c11 MUL c21, b3, c21 #endif #ifdef LT LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 3 * SIZE(AO) MUL c11, b1, c11 MUL c21, b1, c21 NMSUB c12, c12, b2, c11 NMSUB c22, c22, b2, c21 MUL c12, b3, c12 MUL c22, b3, c22 #endif #ifdef RN LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 3 * SIZE(BO) MUL c11, b1, c11 MUL c12, b1, c12 NMSUB c21, c21, b2, c11 NMSUB c22, c22, b2, c12 MUL c21, b3, c21 MUL c22, b3, c22 #endif #ifdef RT LD b1, 3 * SIZE(BO) LD b2, 2 * SIZE(BO) LD b3, 0 * SIZE(BO) MUL c21, b1, c21 MUL c22, b1, c22 NMSUB c11, c11, b2, c21 NMSUB c12, c12, b2, c22 MUL c11, b3, c11 MUL c12, b3, c12 #endif #ifdef LN daddiu CO1, CO1, -2 * SIZE daddiu CO2, CO2, -2 * SIZE #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c21, 1 * SIZE(BO) ST c12, 2 * SIZE(BO) ST c22, 3 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c12, 1 * SIZE(AO) ST c21, 2 * SIZE(AO) ST c22, 3 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) ST c12, 1 * SIZE(CO1) ST c21, 0 * SIZE(CO2) ST c22, 1 * SIZE(CO2) #ifndef LN daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE #endif #ifdef RT dsll TEMP, K, 1 + BASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AO, TEMP daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 2 #endif #ifdef LN daddiu KK, KK, -2 #endif MTC $0, a1 MOV c11, a1 MOV c21, a1 MOV c31, a1 daddiu I, I, -1 bgtz I, .L51 MOV c41, c11 .align 3 .L60: andi I, M, 1 blez I, .L69 NOP #if defined(LT) || defined(RN) dsra L, KK, 2 LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a3, 2 * SIZE(AO) MOV c31, c11 LD a4, 3 * SIZE(AO) MOV c41, c11 LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) LD b5, 4 * SIZE(B) LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) blez L, .L65 move BO, B #else #ifdef LN dsll TEMP, K, BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, 0 + BASE_SHIFT dsll TEMP, KK, 1 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK dsra L, TEMP, 2 LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a3, 2 * SIZE(AO) MOV c31, c11 LD a4, 3 * SIZE(AO) MOV c41, c11 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) blez L, .L65 NOP #endif .align 3 .L62: MADD c11, c11, a1, b1 LD b1, 4 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 7 * SIZE(BO) LD a1, 4 * SIZE(AO) LD a2, 5 * SIZE(AO) MADD c11, c11, a3, b1 LD b1, 8 * SIZE(BO) MADD c21, c21, a3, b2 LD b2, 9 * SIZE(BO) MADD c31, c31, a4, b3 LD b3, 10 * SIZE(BO) MADD c41, c41, a4, b4 LD b4, 11 * SIZE(BO) LD a3, 6 * SIZE(AO) LD a4, 7 * SIZE(AO) daddiu L, L, -1 daddiu AO, AO, 4 * SIZE bgtz L, .L62 daddiu BO, BO, 8 * SIZE .align 3 .L65: #if defined(LT) || defined(RN) andi 
L, KK, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L68 NOP .align 3 .L66: MADD c11, c11, a1, b1 LD b1, 2 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 3 * SIZE(BO) LD a1, 1 * SIZE(AO) daddiu L, L, -1 daddiu AO, AO, 1 * SIZE bgtz L, .L66 daddiu BO, BO, 2 * SIZE .L68: ADD c11, c11, c31 ADD c21, c21, c41 #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -1 #else daddiu TEMP, KK, -2 #endif dsll L, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) SUB c11, b1, c11 SUB c21, b2, c21 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) SUB c11, b1, c11 SUB c21, b2, c21 #endif #if defined(LN) || defined(LT) LD b3, 0 * SIZE(AO) MUL c11, b3, c11 MUL c21, b3, c21 #endif #ifdef RN LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 3 * SIZE(BO) MUL c11, b1, c11 NMSUB c21, c21, b2, c11 MUL c21, b3, c21 #endif #ifdef RT LD b1, 3 * SIZE(BO) LD b2, 2 * SIZE(BO) LD b3, 0 * SIZE(BO) MUL c21, b1, c21 NMSUB c11, c11, b2, c21 MUL c11, b3, c11 #endif #ifdef LN daddiu CO1, CO1, -1 * SIZE daddiu CO2, CO2, -1 * SIZE #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c21, 1 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c21, 1 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) ST c21, 0 * SIZE(CO2) #ifndef LN daddiu CO1, CO1, 1 * SIZE daddiu CO2, CO2, 1 * SIZE #endif #ifdef RT dsll TEMP, K, 0 + BASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 1 #endif #ifdef LN daddiu KK, KK, -1 #endif .align 3 .L69: #ifdef LN dsll TEMP, K, 1 + BASE_SHIFT daddu B, B, TEMP #endif #if defined(LT) || defined(RN) move B, BO #endif #ifdef RN daddiu KK, KK, 2 #endif #ifdef RT daddiu KK, KK, -2 #endif .align 3 .L50: andi J, N, 4 blez J, .L70 move AO, A #ifdef RT dsll TEMP, K, 2 + BASE_SHIFT dsubu B, B, TEMP dsll TEMP, LDC, 2 dsubu C, C, TEMP #endif move CO1, C MTC $0, c11 daddu CO2, C, LDC daddu CO3, CO2, LDC daddu CO4, CO3, LDC MOV c21, c11 dsra I, M, 1 MOV c31, c11 #ifdef LN daddu KK, M, OFFSET #endif #ifdef LT move KK, OFFSET #endif #if defined(LN) || defined(RT) move AORIG, A #else move AO, A #endif #ifndef RT daddu C, CO4, LDC #endif blez I, .L40 MOV c41, c11 .L31: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) LD a3, 4 * SIZE(AO) LD b1, 0 * SIZE(B) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 LD b3, 2 * SIZE(B) MOV c32, c11 LD b4, 3 * SIZE(B) MOV c42, c11 LD b5, 4 * SIZE(B) dsra L, KK, 2 LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) blez L, .L35 move BO, B #else #ifdef LN dsll TEMP, K, 1 + BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 2 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) LD a3, 4 * SIZE(AO) LD b1, 0 * SIZE(BO) MOV c12, c11 LD b2, 1 * SIZE(BO) MOV c22, c11 LD b3, 2 * SIZE(BO) MOV c32, c11 LD b4, 3 * SIZE(BO) MOV c42, c11 LD b5, 4 * SIZE(BO) dsra L, TEMP, 2 LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) blez L, .L35 NOP #endif .align 3 .L32: MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 daddiu L, L, -1 MADD c31, c31, a1, b3 NOP MADD c41, c41, a1, b4 LD a1, 2 * SIZE(AO) MADD c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD c11, c11, a1, b5 LD a2, 3 * SIZE(AO) MADD c21, c21, a1, b2 NOP MADD c31, c31, a1, b3 NOP MADD c41, c41, a1, b4 LD a1, 8 * SIZE(AO) 
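# Note: .L32 is the 4-way K-unrolled inner loop of the 2x4 tile; loads for the
# next iteration are interleaved with the MADDs to hide load latency.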
MADD c12, c12, a2, b5 LD b5, 20 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 9 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 10 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 11 * SIZE(BO) MADD c11, c11, a3, b6 LD a2, 5 * SIZE(AO) MADD c21, c21, a3, b2 NOP MADD c31, c31, a3, b3 NOP MADD c41, c41, a3, b4 LD a3, 6 * SIZE(AO) MADD c12, c12, a2, b6 LD b6, 24 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD c11, c11, a3, b7 LD a2, 7 * SIZE(AO) MADD c21, c21, a3, b2 daddiu AO, AO, 8 * SIZE MADD c31, c31, a3, b3 daddiu BO, BO, 16 * SIZE MADD c41, c41, a3, b4 LD a3, 4 * SIZE(AO) MADD c12, c12, a2, b7 LD b7, 12 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 1 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 2 * SIZE(BO) MADD c42, c42, a2, b4 NOP bgtz L, .L32 LD b4, 3 * SIZE(BO) .align 3 .L35: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L38 NOP .align 3 .L36: MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 daddiu L, L, -1 MADD c31, c31, a1, b3 daddiu AO, AO, 2 * SIZE MADD c41, c41, a1, b4 LD a1, 0 * SIZE(AO) MADD c12, c12, a2, b1 LD b1, 4 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) bgtz L, .L36 daddiu BO, BO, 4 * SIZE .L38: #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -2 #else daddiu TEMP, KK, -4 #endif dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) SUB c11, b1, c11 SUB c21, b2, c21 SUB c31, b3, c31 SUB c41, b4, c41 SUB c12, b5, c12 SUB c22, b6, c22 SUB c32, b7, c32 SUB c42, b8, c42 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) LD b5, 4 * SIZE(AO) LD b6, 5 * SIZE(AO) LD b7, 6 * SIZE(AO) LD b8, 7 * SIZE(AO) SUB c11, b1, c11 SUB c12, b2, c12 SUB c21, b3, c21 SUB c22, b4, c22 SUB c31, b5, c31 SUB c32, b6, c32 SUB c41, b7, c41 SUB c42, b8, c42 #endif #ifdef LN LD b1, 3 * SIZE(AO) LD b2, 2 * SIZE(AO) LD b3, 0 * SIZE(AO) MUL c12, b1, c12 MUL c22, b1, c22 MUL c32, b1, c32 MUL c42, b1, c42 NMSUB c11, c11, b2, c12 NMSUB c21, c21, b2, c22 NMSUB c31, c31, b2, c32 NMSUB c41, c41, b2, c42 MUL c11, b3, c11 MUL c21, b3, c21 MUL c31, b3, c31 MUL c41, b3, c41 #endif #ifdef LT LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 3 * SIZE(AO) MUL c11, b1, c11 MUL c21, b1, c21 MUL c31, b1, c31 MUL c41, b1, c41 NMSUB c12, c12, b2, c11 NMSUB c22, c22, b2, c21 NMSUB c32, c32, b2, c31 NMSUB c42, c42, b2, c41 MUL c12, b3, c12 MUL c22, b3, c22 MUL c32, b3, c32 MUL c42, b3, c42 #endif #ifdef RN LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MUL c11, b1, c11 MUL c12, b1, c12 NMSUB c21, c21, b2, c11 NMSUB c22, c22, b2, c12 NMSUB c31, c31, b3, c11 NMSUB c32, c32, b3, c12 NMSUB c41, c41, b4, c11 NMSUB c42, c42, b4, c12 LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) MUL c21, b2, c21 MUL c22, b2, c22 NMSUB c31, c31, b3, c21 NMSUB c32, c32, b3, c22 NMSUB c41, c41, b4, c21 NMSUB c42, c42, b4, c22 LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MUL c31, b3, c31 MUL c32, b3, c32 NMSUB c41, c41, b4, c31 NMSUB c42, c42, b4, c32 LD b4, 15 * SIZE(BO) MUL c41, b4, c41 MUL c42, b4, c42 #endif #ifdef RT LD b5, 15 * SIZE(BO) LD b6, 14 * SIZE(BO) LD b7, 13 * SIZE(BO) LD b8, 12 * SIZE(BO) MUL c41, b5, c41 
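# Note: this is the RT branch of the .L38 solve -- backward substitution that
# starts from the last row of the packed 4x4 triangular block of B
# (offsets 15..12) and works back to column 0; the diagonal again appears to
# be pre-inverted.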
MUL c42, b5, c42 NMSUB c31, c31, b6, c41 NMSUB c32, c32, b6, c42 NMSUB c21, c21, b7, c41 NMSUB c22, c22, b7, c42 NMSUB c11, c11, b8, c41 NMSUB c12, c12, b8, c42 LD b6, 10 * SIZE(BO) LD b7, 9 * SIZE(BO) LD b8, 8 * SIZE(BO) MUL c31, b6, c31 MUL c32, b6, c32 NMSUB c21, c21, b7, c31 NMSUB c22, c22, b7, c32 NMSUB c11, c11, b8, c31 NMSUB c12, c12, b8, c32 LD b7, 5 * SIZE(BO) LD b8, 4 * SIZE(BO) MUL c21, b7, c21 MUL c22, b7, c22 NMSUB c11, c11, b8, c21 NMSUB c12, c12, b8, c22 LD b8, 0 * SIZE(BO) MUL c11, b8, c11 MUL c12, b8, c12 #endif #ifdef LN daddiu CO1, CO1, -2 * SIZE daddiu CO2, CO2, -2 * SIZE daddiu CO3, CO3, -2 * SIZE daddiu CO4, CO4, -2 * SIZE #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c21, 1 * SIZE(BO) ST c31, 2 * SIZE(BO) ST c41, 3 * SIZE(BO) ST c12, 4 * SIZE(BO) ST c22, 5 * SIZE(BO) ST c32, 6 * SIZE(BO) ST c42, 7 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c12, 1 * SIZE(AO) ST c21, 2 * SIZE(AO) ST c22, 3 * SIZE(AO) ST c31, 4 * SIZE(AO) ST c32, 5 * SIZE(AO) ST c41, 6 * SIZE(AO) ST c42, 7 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) ST c12, 1 * SIZE(CO1) ST c21, 0 * SIZE(CO2) ST c22, 1 * SIZE(CO2) ST c31, 0 * SIZE(CO3) ST c32, 1 * SIZE(CO3) ST c41, 0 * SIZE(CO4) ST c42, 1 * SIZE(CO4) #ifndef LN daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE daddiu CO3, CO3, 2 * SIZE daddiu CO4, CO4, 2 * SIZE #endif #ifdef RT dsll TEMP, K, 1 + BASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 2 #endif #ifdef LN daddiu KK, KK, -2 #endif MTC $0, a1 MOV c11, a1 MOV c21, a1 MOV c31, a1 daddiu I, I, -1 bgtz I, .L31 MOV c41, c11 .align 3 .L40: andi I, M, 1 blez I, .L49 MOV c61, c11 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) MOV c71, c11 LD a2, 1 * SIZE(AO) MOV c81, c11 LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) LD b5, 4 * SIZE(B) LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) dsra L, KK, 2 blez L, .L45 move BO, B #else #ifdef LN dsll TEMP, K, BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, 0 + BASE_SHIFT dsll TEMP, KK, 2 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) MOV c71, c11 LD a2, 1 * SIZE(AO) MOV c81, c11 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) dsra L, TEMP, 2 blez L, .L45 NOP #endif .align 3 .L42: MADD c11, c11, a1, b1 LD b1, 16 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a1, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a1, b4 LD b4, 7 * SIZE(BO) LD a1, 4 * SIZE(AO) daddiu L, L, -1 MADD c11, c11, a2, b5 LD b5, 20 * SIZE(BO) MADD c21, c21, a2, b2 LD b2, 9 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 10 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 11 * SIZE(BO) LD a2, 2 * SIZE(AO) daddiu AO, AO, 4 * SIZE MADD c11, c11, a2, b6 LD b6, 24 * SIZE(BO) MADD c21, c21, a2, b2 LD b2, 13 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 14 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 15 * SIZE(BO) LD a2, -1 * SIZE(AO) daddiu BO, BO, 16 * SIZE MADD c11, c11, a2, b7 LD b7, 12 * SIZE(BO) MADD c21, c21, a2, b2 LD b2, 1 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 2 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 3 * SIZE(BO) bgtz L, .L42 LD a2, 1 * SIZE(AO) .align 3 .L45: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L48 NOP .align 3 .L46: MADD c11, c11, a1, b1 LD b1, 4 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, 
a1, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a1, b4 LD a1, 1 * SIZE(AO) LD b4, 7 * SIZE(BO) daddiu L, L, -1 daddiu AO, AO, 1 * SIZE MOV a2, a2 bgtz L, .L46 daddiu BO, BO, 4 * SIZE .L48: #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -1 #else daddiu TEMP, KK, -4 #endif dsll L, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) SUB c11, b1, c11 SUB c21, b2, c21 SUB c31, b3, c31 SUB c41, b4, c41 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) SUB c11, b1, c11 SUB c21, b2, c21 SUB c31, b3, c31 SUB c41, b4, c41 #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(AO) MUL c11, b1, c11 MUL c21, b1, c21 MUL c31, b1, c31 MUL c41, b1, c41 #endif #ifdef RN LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MUL c11, b1, c11 NMSUB c21, c21, b2, c11 NMSUB c31, c31, b3, c11 NMSUB c41, c41, b4, c11 LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) MUL c21, b2, c21 NMSUB c31, c31, b3, c21 NMSUB c41, c41, b4, c21 LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MUL c31, b3, c31 NMSUB c41, c41, b4, c31 LD b4, 15 * SIZE(BO) MUL c41, b4, c41 #endif #ifdef RT LD b5, 15 * SIZE(BO) LD b6, 14 * SIZE(BO) LD b7, 13 * SIZE(BO) LD b8, 12 * SIZE(BO) MUL c41, b5, c41 NMSUB c31, c31, b6, c41 NMSUB c21, c21, b7, c41 NMSUB c11, c11, b8, c41 LD b6, 10 * SIZE(BO) LD b7, 9 * SIZE(BO) LD b8, 8 * SIZE(BO) MUL c31, b6, c31 NMSUB c21, c21, b7, c31 NMSUB c11, c11, b8, c31 LD b7, 5 * SIZE(BO) LD b8, 4 * SIZE(BO) MUL c21, b7, c21 NMSUB c11, c11, b8, c21 LD b8, 0 * SIZE(BO) MUL c11, b8, c11 #endif #ifdef LN daddiu CO1, CO1, -1 * SIZE daddiu CO2, CO2, -1 * SIZE daddiu CO3, CO3, -1 * SIZE daddiu CO4, CO4, -1 * SIZE #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c21, 1 * SIZE(BO) ST c31, 2 * SIZE(BO) ST c41, 3 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c21, 1 * SIZE(AO) ST c31, 2 * SIZE(AO) ST c41, 3 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) ST c21, 0 * SIZE(CO2) ST c31, 0 * SIZE(CO3) ST c41, 0 * SIZE(CO4) #ifndef LN daddiu CO1, CO1, 1 * SIZE daddiu CO2, CO2, 1 * SIZE daddiu CO3, CO3, 1 * SIZE daddiu CO4, CO4, 1 * SIZE #endif #ifdef RT dsll TEMP, K, BASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 1 #endif #ifdef LN daddiu KK, KK, -1 #endif .align 3 .L49: #ifdef LN dsll TEMP, K, 2 + BASE_SHIFT daddu B, B, TEMP #endif #if defined(LT) || defined(RN) move B, BO #endif #ifdef RN daddiu KK, KK, 4 #endif #ifdef RT daddiu KK, KK, -4 #endif .align 3 .L70: dsra J, N, 3 blez J, .L999 nop .L10: #ifdef RT dsll TEMP, K, 3 + BASE_SHIFT dsubu B, B, TEMP dsll TEMP, LDC, 3 dsubu C, C, TEMP #endif move CO1, C MTC $0, c11 daddu CO2, C, LDC daddu CO3, CO2, LDC daddiu J, J, -1 daddu CO4, CO3, LDC MOV c21, c11 daddu CO5, CO4, LDC MOV c31, c11 daddu CO6, CO5, LDC MOV c41, c11 daddu CO7, CO6, LDC MOV c51, c11 daddu CO8, CO7, LDC dsra I, M, 1 #ifdef LN daddu KK, M, OFFSET #endif #ifdef LT move KK, OFFSET #endif #if defined(LN) || defined(RT) move AORIG, A #else move AO, A #endif #ifndef RT daddu C, CO8, LDC #endif blez I, .L20 MOV c61, c11 .L11: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) MOV c71, c11 LD b1, 0 * SIZE(B) MOV c81, c11 LD a3, 4 * SIZE(AO) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 dsra L, KK, 2 MOV c32, c11 LD b3, 2 * SIZE(B) 
MOV c42, c11 LD b4, 3 * SIZE(B) MOV c52, c11 LD b5, 4 * SIZE(B) MOV c62, c11 LD b6, 8 * SIZE(B) MOV c72, c11 LD b7, 12 * SIZE(B) MOV c82, c11 blez L, .L15 move BO, B #else #ifdef LN dsll TEMP, K, 1 + BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 3 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) MOV c71, c11 LD b1, 0 * SIZE(BO) MOV c81, c11 LD a3, 4 * SIZE(AO) MOV c12, c11 LD b2, 1 * SIZE(BO) MOV c22, c11 MOV c32, c11 LD b3, 2 * SIZE(BO) MOV c42, c11 LD b4, 3 * SIZE(BO) MOV c52, c11 LD b5, 4 * SIZE(BO) MOV c62, c11 LD b6, 8 * SIZE(BO) MOV c72, c11 LD b7, 12 * SIZE(BO) MOV c82, c11 dsra L, TEMP, 2 blez L, .L15 NOP #endif MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 daddiu L, L, -1 MADD c31, c31, a1, b3 blez L, .L13 MADD c41, c41, a1, b4 NOP .align 3 .L12: MADD c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD c51, c51, a1, b5 NOP MADD c61, c61, a1, b2 LD a4, 2 * SIZE(AO) MADD c71, c71, a1, b3 NOP MADD c81, c81, a1, b4 LD a1, 8 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 20 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 9 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 10 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 11 * SIZE(BO) MADD c11, c11, a4, b6 LD a2, 3 * SIZE(AO) MADD c21, c21, a4, b2 NOP MADD c31, c31, a4, b3 NOP MADD c41, c41, a4, b4 NOP MADD c12, c12, a2, b6 LD b6, 24 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD c51, c51, a4, b7 NOP MADD c61, c61, a4, b2 NOP MADD c71, c71, a4, b3 NOP MADD c81, c81, a4, b4 NOP MADD c52, c52, a2, b7 LD b7, 28 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 17 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 18 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 19 * SIZE(BO) MADD c11, c11, a3, b1 LD a2, 5 * SIZE(AO) MADD c21, c21, a3, b2 NOP MADD c31, c31, a3, b3 NOP MADD c41, c41, a3, b4 NOP MADD c12, c12, a2, b1 LD b1, 32 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 21 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 22 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 23 * SIZE(BO) MADD c51, c51, a3, b5 NOP MADD c61, c61, a3, b2 LD a4, 6 * SIZE(AO) MADD c71, c71, a3, b3 NOP MADD c81, c81, a3, b4 LD a3, 12 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 36 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 25 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 26 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 27 * SIZE(BO) MADD c11, c11, a4, b6 LD a2, 7 * SIZE(AO) MADD c21, c21, a4, b2 NOP MADD c31, c31, a4, b3 NOP MADD c41, c41, a4, b4 daddiu L, L, -1 MADD c12, c12, a2, b6 LD b6, 40 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 29 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 30 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 31 * SIZE(BO) MADD c51, c51, a4, b7 daddiu BO, BO, 32 * SIZE MADD c61, c61, a4, b2 daddiu AO, AO, 8 * SIZE MADD c71, c71, a4, b3 NOP MADD c81, c81, a4, b4 NOP MADD c52, c52, a2, b7 LD b7, 12 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 3 * SIZE(BO) MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 NOP MADD c31, c31, a1, b3 bgtz L, .L12 MADD c41, c41, a1, b4 NOP .align 3 .L13: MADD c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD c51, c51, a1, b5 NOP MADD c61, c61, a1, b2 LD a4, 2 * SIZE(AO) MADD c71, c71, a1, b3 NOP MADD c81, c81, a1, b4 
LD a1, 8 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 20 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 9 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 10 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 11 * SIZE(BO) MADD c11, c11, a4, b6 LD a2, 3 * SIZE(AO) MADD c21, c21, a4, b2 NOP MADD c31, c31, a4, b3 NOP MADD c41, c41, a4, b4 NOP MADD c12, c12, a2, b6 LD b6, 24 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD c51, c51, a4, b7 NOP MADD c61, c61, a4, b2 NOP MADD c71, c71, a4, b3 NOP MADD c81, c81, a4, b4 NOP MADD c52, c52, a2, b7 LD b7, 28 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 17 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 18 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 19 * SIZE(BO) MADD c11, c11, a3, b1 LD a2, 5 * SIZE(AO) MADD c21, c21, a3, b2 NOP MADD c31, c31, a3, b3 NOP MADD c41, c41, a3, b4 NOP MADD c12, c12, a2, b1 LD b1, 32 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 21 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 22 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 23 * SIZE(BO) MADD c51, c51, a3, b5 NOP MADD c61, c61, a3, b2 LD a4, 6 * SIZE(AO) MADD c71, c71, a3, b3 NOP MADD c81, c81, a3, b4 LD a3, 12 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 36 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 25 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 26 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 27 * SIZE(BO) MADD c11, c11, a4, b6 LD a2, 7 * SIZE(AO) MADD c21, c21, a4, b2 NOP MADD c31, c31, a4, b3 NOP MADD c41, c41, a4, b4 NOP MADD c12, c12, a2, b6 LD b6, 40 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 29 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 30 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 31 * SIZE(BO) MADD c51, c51, a4, b7 daddiu BO, BO, 32 * SIZE MADD c61, c61, a4, b2 daddiu AO, AO, 8 * SIZE MADD c71, c71, a4, b3 NOP MADD c81, c81, a4, b4 NOP MADD c52, c52, a2, b7 LD b7, 12 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 3 * SIZE(BO) .align 3 .L15: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif blez L, .L18 NOP .align 3 .L16: MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 NOP MADD c31, c31, a1, b3 NOP MADD c41, c41, a1, b4 NOP MADD c12, c12, a2, b1 LD b1, 8 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD c51, c51, a1, b5 daddiu L, L, -1 MADD c61, c61, a1, b2 daddiu AO, AO, 2 * SIZE MADD c71, c71, a1, b3 daddiu BO, BO, 8 * SIZE MADD c81, c81, a1, b4 LD a1, 0 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 4 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD c82, c82, a2, b4 bgtz L, .L16 LD b4, 3 * SIZE(BO) .L18: #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -2 #else daddiu TEMP, KK, -8 #endif dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 3 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) SUB c11, b1, c11 LD b5, 4 * SIZE(BO) SUB c21, b2, c21 LD b6, 5 * SIZE(BO) SUB c31, b3, c31 LD b7, 6 * SIZE(BO) SUB c41, b4, c41 LD b8, 7 * SIZE(BO) SUB c51, b5, c51 LD b1, 8 * SIZE(BO) SUB c61, b6, c61 LD b2, 9 * SIZE(BO) SUB c71, b7, c71 LD b3, 10 * SIZE(BO) SUB c81, b8, c81 LD b4, 11 * SIZE(BO) SUB c12, b1, c12 LD b5, 12 * SIZE(BO) SUB c22, b2, c22 LD b6, 13 * SIZE(BO) SUB c32, b3, c32 LD b7, 14 * SIZE(BO) SUB c42, b4, c42 LD b8, 15 * SIZE(BO) SUB c52, b5, c52 #ifdef LN LD b1, 3 * SIZE(AO) #else LD b1, 
0 * SIZE(AO) #endif SUB c62, b6, c62 SUB c72, b7, c72 SUB c82, b8, c82 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) SUB c11, b1, c11 LD b5, 4 * SIZE(AO) SUB c12, b2, c12 LD b6, 5 * SIZE(AO) SUB c21, b3, c21 LD b7, 6 * SIZE(AO) SUB c22, b4, c22 LD b8, 7 * SIZE(AO) SUB c31, b5, c31 LD b1, 8 * SIZE(AO) SUB c32, b6, c32 LD b2, 9 * SIZE(AO) SUB c41, b7, c41 LD b3, 10 * SIZE(AO) SUB c42, b8, c42 LD b4, 11 * SIZE(AO) LD b5, 12 * SIZE(AO) SUB c51, b1, c51 LD b6, 13 * SIZE(AO) SUB c52, b2, c52 LD b7, 14 * SIZE(AO) SUB c61, b3, c61 LD b8, 15 * SIZE(AO) SUB c62, b4, c62 SUB c71, b5, c71 SUB c72, b6, c72 SUB c81, b7, c81 SUB c82, b8, c82 #endif #ifdef LN MUL c12, b1, c12 LD b2, 2 * SIZE(AO) MUL c22, b1, c22 MUL c32, b1, c32 MUL c42, b1, c42 MUL c52, b1, c52 MUL c62, b1, c62 MUL c72, b1, c72 MUL c82, b1, c82 NMSUB c11, c11, b2, c12 LD b3, 0 * SIZE(AO) NMSUB c21, c21, b2, c22 NMSUB c31, c31, b2, c32 NMSUB c41, c41, b2, c42 NMSUB c51, c51, b2, c52 NMSUB c61, c61, b2, c62 NMSUB c71, c71, b2, c72 NMSUB c81, c81, b2, c82 MUL c11, b3, c11 daddiu CO1, CO1, -2 * SIZE MUL c21, b3, c21 daddiu CO2, CO2, -2 * SIZE MUL c31, b3, c31 daddiu CO3, CO3, -2 * SIZE MUL c41, b3, c41 daddiu CO4, CO4, -2 * SIZE MUL c51, b3, c51 daddiu CO5, CO5, -2 * SIZE MUL c61, b3, c61 daddiu CO6, CO6, -2 * SIZE MUL c71, b3, c71 daddiu CO7, CO7, -2 * SIZE MUL c81, b3, c81 daddiu CO8, CO8, -2 * SIZE #endif #ifdef LT MUL c11, b1, c11 LD b2, 1 * SIZE(AO) MUL c21, b1, c21 MUL c31, b1, c31 MUL c41, b1, c41 MUL c51, b1, c51 MUL c61, b1, c61 MUL c71, b1, c71 MUL c81, b1, c81 NMSUB c12, c12, b2, c11 LD b3, 3 * SIZE(AO) NMSUB c22, c22, b2, c21 NMSUB c32, c32, b2, c31 NMSUB c42, c42, b2, c41 NMSUB c52, c52, b2, c51 NMSUB c62, c62, b2, c61 NMSUB c72, c72, b2, c71 NMSUB c82, c82, b2, c81 MUL c12, b3, c12 MUL c22, b3, c22 MUL c32, b3, c32 MUL c42, b3, c42 MUL c52, b3, c52 MUL c62, b3, c62 MUL c72, b3, c72 MUL c82, b3, c82 #endif #ifdef RN LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MUL c11, b1, c11 MUL c12, b1, c12 LD b5, 4 * SIZE(BO) NMSUB c21, c21, b2, c11 NMSUB c22, c22, b2, c12 LD b6, 5 * SIZE(BO) NMSUB c31, c31, b3, c11 NMSUB c32, c32, b3, c12 LD b7, 6 * SIZE(BO) NMSUB c41, c41, b4, c11 NMSUB c42, c42, b4, c12 LD b8, 7 * SIZE(BO) NMSUB c51, c51, b5, c11 NMSUB c52, c52, b5, c12 LD b2, 9 * SIZE(BO) NMSUB c61, c61, b6, c11 NMSUB c62, c62, b6, c12 LD b3, 10 * SIZE(BO) NMSUB c71, c71, b7, c11 NMSUB c72, c72, b7, c12 LD b4, 11 * SIZE(BO) NMSUB c81, c81, b8, c11 NMSUB c82, c82, b8, c12 LD b5, 12 * SIZE(BO) MUL c21, b2, c21 MUL c22, b2, c22 LD b6, 13 * SIZE(BO) NMSUB c31, c31, b3, c21 NMSUB c32, c32, b3, c22 LD b7, 14 * SIZE(BO) NMSUB c41, c41, b4, c21 NMSUB c42, c42, b4, c22 LD b8, 15 * SIZE(BO) NMSUB c51, c51, b5, c21 NMSUB c52, c52, b5, c22 LD b3, 18 * SIZE(BO) NMSUB c61, c61, b6, c21 NMSUB c62, c62, b6, c22 LD b4, 19 * SIZE(BO) NMSUB c71, c71, b7, c21 NMSUB c72, c72, b7, c22 LD b5, 20 * SIZE(BO) NMSUB c81, c81, b8, c21 NMSUB c82, c82, b8, c22 LD b6, 21 * SIZE(BO) MUL c31, b3, c31 MUL c32, b3, c32 LD b7, 22 * SIZE(BO) NMSUB c41, c41, b4, c31 NMSUB c42, c42, b4, c32 LD b8, 23 * SIZE(BO) NMSUB c51, c51, b5, c31 NMSUB c52, c52, b5, c32 LD b4, 27 * SIZE(BO) NMSUB c61, c61, b6, c31 NMSUB c62, c62, b6, c32 LD b5, 28 * SIZE(BO) NMSUB c71, c71, b7, c31 NMSUB c72, c72, b7, c32 LD b6, 29 * SIZE(BO) NMSUB c81, c81, b8, c31 NMSUB c82, c82, b8, c32 LD b7, 30 * SIZE(BO) MUL c41, b4, c41 MUL c42, b4, c42 LD b8, 31 * SIZE(BO) NMSUB c51, c51, b5, c41 NMSUB c52, c52, b5, c42 LD b5, 36 * 
SIZE(BO) NMSUB c61, c61, b6, c41 NMSUB c62, c62, b6, c42 LD b6, 37 * SIZE(BO) NMSUB c71, c71, b7, c41 NMSUB c72, c72, b7, c42 LD b7, 38 * SIZE(BO) NMSUB c81, c81, b8, c41 NMSUB c82, c82, b8, c42 LD b8, 39 * SIZE(BO) MUL c51, b5, c51 MUL c52, b5, c52 NMSUB c61, c61, b6, c51 NMSUB c62, c62, b6, c52 LD b6, 45 * SIZE(BO) NMSUB c71, c71, b7, c51 NMSUB c72, c72, b7, c52 LD b7, 46 * SIZE(BO) NMSUB c81, c81, b8, c51 NMSUB c82, c82, b8, c52 LD b8, 47 * SIZE(BO) MUL c61, b6, c61 MUL c62, b6, c62 NMSUB c71, c71, b7, c61 NMSUB c72, c72, b7, c62 LD b7, 54 * SIZE(BO) NMSUB c81, c81, b8, c61 NMSUB c82, c82, b8, c62 LD b8, 55 * SIZE(BO) MUL c71, b7, c71 MUL c72, b7, c72 NMSUB c81, c81, b8, c71 NMSUB c82, c82, b8, c72 LD b8, 63 * SIZE(BO) MUL c81, b8, c81 MUL c82, b8, c82 #endif #ifdef RT LD b1, 63 * SIZE(BO) LD b2, 62 * SIZE(BO) LD b3, 61 * SIZE(BO) LD b4, 60 * SIZE(BO) MUL c81, b1, c81 MUL c82, b1, c82 LD b5, 59 * SIZE(BO) NMSUB c71, c71, b2, c81 NMSUB c72, c72, b2, c82 LD b6, 58 * SIZE(BO) NMSUB c61, c61, b3, c81 NMSUB c62, c62, b3, c82 LD b7, 57 * SIZE(BO) NMSUB c51, c51, b4, c81 NMSUB c52, c52, b4, c82 LD b8, 56 * SIZE(BO) NMSUB c41, c41, b5, c81 NMSUB c42, c42, b5, c82 LD b2, 54 * SIZE(BO) NMSUB c31, c31, b6, c81 NMSUB c32, c32, b6, c82 LD b3, 53 * SIZE(BO) NMSUB c21, c21, b7, c81 NMSUB c22, c22, b7, c82 LD b4, 52 * SIZE(BO) NMSUB c11, c11, b8, c81 NMSUB c12, c12, b8, c82 LD b5, 51 * SIZE(BO) MUL c71, b2, c71 MUL c72, b2, c72 LD b6, 50 * SIZE(BO) NMSUB c61, c61, b3, c71 NMSUB c62, c62, b3, c72 LD b7, 49 * SIZE(BO) NMSUB c51, c51, b4, c71 NMSUB c52, c52, b4, c72 LD b8, 48 * SIZE(BO) NMSUB c41, c41, b5, c71 NMSUB c42, c42, b5, c72 LD b3, 45 * SIZE(BO) NMSUB c31, c31, b6, c71 NMSUB c32, c32, b6, c72 LD b4, 44 * SIZE(BO) NMSUB c21, c21, b7, c71 NMSUB c22, c22, b7, c72 LD b5, 43 * SIZE(BO) NMSUB c11, c11, b8, c71 NMSUB c12, c12, b8, c72 LD b6, 42 * SIZE(BO) MUL c61, b3, c61 MUL c62, b3, c62 LD b7, 41 * SIZE(BO) NMSUB c51, c51, b4, c61 NMSUB c52, c52, b4, c62 LD b8, 40 * SIZE(BO) NMSUB c41, c41, b5, c61 NMSUB c42, c42, b5, c62 LD b4, 36 * SIZE(BO) NMSUB c31, c31, b6, c61 NMSUB c32, c32, b6, c62 LD b5, 35 * SIZE(BO) NMSUB c21, c21, b7, c61 NMSUB c22, c22, b7, c62 LD b6, 34 * SIZE(BO) NMSUB c11, c11, b8, c61 NMSUB c12, c12, b8, c62 LD b7, 33 * SIZE(BO) MUL c51, b4, c51 MUL c52, b4, c52 LD b8, 32 * SIZE(BO) NMSUB c41, c41, b5, c51 NMSUB c42, c42, b5, c52 LD b5, 27 * SIZE(BO) NMSUB c31, c31, b6, c51 NMSUB c32, c32, b6, c52 LD b6, 26 * SIZE(BO) NMSUB c21, c21, b7, c51 NMSUB c22, c22, b7, c52 LD b7, 25 * SIZE(BO) NMSUB c11, c11, b8, c51 NMSUB c12, c12, b8, c52 LD b8, 24 * SIZE(BO) MUL c41, b5, c41 MUL c42, b5, c42 NMSUB c31, c31, b6, c41 NMSUB c32, c32, b6, c42 LD b6, 18 * SIZE(BO) NMSUB c21, c21, b7, c41 NMSUB c22, c22, b7, c42 LD b7, 17 * SIZE(BO) NMSUB c11, c11, b8, c41 NMSUB c12, c12, b8, c42 LD b8, 16 * SIZE(BO) MUL c31, b6, c31 MUL c32, b6, c32 NMSUB c21, c21, b7, c31 NMSUB c22, c22, b7, c32 LD b7, 9 * SIZE(BO) NMSUB c11, c11, b8, c31 NMSUB c12, c12, b8, c32 LD b8, 8 * SIZE(BO) MUL c21, b7, c21 MUL c22, b7, c22 NMSUB c11, c11, b8, c21 NMSUB c12, c12, b8, c22 LD b8, 0 * SIZE(BO) MUL c11, b8, c11 MUL c12, b8, c12 #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c21, 1 * SIZE(BO) ST c31, 2 * SIZE(BO) ST c41, 3 * SIZE(BO) ST c51, 4 * SIZE(BO) ST c61, 5 * SIZE(BO) ST c71, 6 * SIZE(BO) ST c81, 7 * SIZE(BO) ST c12, 8 * SIZE(BO) ST c22, 9 * SIZE(BO) ST c32, 10 * SIZE(BO) ST c42, 11 * SIZE(BO) ST c52, 12 * SIZE(BO) ST c62, 13 * SIZE(BO) ST c72, 14 * SIZE(BO) ST c82, 15 * SIZE(BO) #else ST c11, 0 * 
SIZE(AO) ST c12, 1 * SIZE(AO) ST c21, 2 * SIZE(AO) ST c22, 3 * SIZE(AO) ST c31, 4 * SIZE(AO) ST c32, 5 * SIZE(AO) ST c41, 6 * SIZE(AO) ST c42, 7 * SIZE(AO) ST c51, 8 * SIZE(AO) ST c52, 9 * SIZE(AO) ST c61, 10 * SIZE(AO) ST c62, 11 * SIZE(AO) ST c71, 12 * SIZE(AO) ST c72, 13 * SIZE(AO) ST c81, 14 * SIZE(AO) ST c82, 15 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) ST c12, 1 * SIZE(CO1) ST c21, 0 * SIZE(CO2) ST c22, 1 * SIZE(CO2) ST c31, 0 * SIZE(CO3) ST c32, 1 * SIZE(CO3) ST c41, 0 * SIZE(CO4) ST c42, 1 * SIZE(CO4) ST c51, 0 * SIZE(CO5) ST c52, 1 * SIZE(CO5) ST c61, 0 * SIZE(CO6) ST c62, 1 * SIZE(CO6) ST c71, 0 * SIZE(CO7) ST c72, 1 * SIZE(CO7) ST c81, 0 * SIZE(CO8) ST c82, 1 * SIZE(CO8) MTC $0, a1 #ifndef LN daddiu CO1, CO1, 2 * SIZE daddiu CO2, CO2, 2 * SIZE daddiu CO3, CO3, 2 * SIZE daddiu CO4, CO4, 2 * SIZE daddiu CO5, CO5, 2 * SIZE daddiu CO6, CO6, 2 * SIZE daddiu CO7, CO7, 2 * SIZE daddiu CO8, CO8, 2 * SIZE #endif MOV c11, a1 MOV c21, a1 #ifdef RT dsll TEMP, K, 1 + BASE_SHIFT daddu AORIG, AORIG, TEMP #endif MOV c31, a1 MOV c41, a1 #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 3 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 2 #endif #ifdef LN daddiu KK, KK, -2 #endif daddiu I, I, -1 MOV c51, a1 bgtz I, .L11 MOV c61, a1 .align 3 .L20: andi I, M, 1 MOV c61, c11 blez I, .L29 MOV c71, c11 #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) LD b5, 4 * SIZE(B) LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) dsra L, KK, 2 MOV c81, c11 blez L, .L25 move BO, B #else #ifdef LN dsll TEMP, K, 0 + BASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, 0 + BASE_SHIFT dsll TEMP, KK, 3 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 8 * SIZE(BO) LD b7, 12 * SIZE(BO) dsra L, TEMP, 2 MOV c81, c11 blez L, .L25 NOP #endif .align 3 .L22: MADD c11, c11, a1, b1 LD b1, 16 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a1, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a1, b4 LD b4, 7 * SIZE(BO) MADD c51, c51, a1, b5 LD b5, 20 * SIZE(BO) MADD c61, c61, a1, b2 LD b2, 9 * SIZE(BO) MADD c71, c71, a1, b3 LD b3, 10 * SIZE(BO) MADD c81, c81, a1, b4 LD b4, 11 * SIZE(BO) LD a1, 4 * SIZE(AO) daddiu L, L, -1 MADD c11, c11, a2, b6 LD b6, 24 * SIZE(BO) MADD c21, c21, a2, b2 LD b2, 13 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 14 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 15 * SIZE(BO) MADD c51, c51, a2, b7 LD b7, 28 * SIZE(BO) MADD c61, c61, a2, b2 LD b2, 17 * SIZE(BO) MADD c71, c71, a2, b3 LD b3, 18 * SIZE(BO) MADD c81, c81, a2, b4 LD b4, 19 * SIZE(BO) LD a2, 5 * SIZE(AO) daddiu AO, AO, 4 * SIZE MADD c11, c11, a3, b1 LD b1, 32 * SIZE(BO) MADD c21, c21, a3, b2 LD b2, 21 * SIZE(BO) MADD c31, c31, a3, b3 LD b3, 22 * SIZE(BO) MADD c41, c41, a3, b4 LD b4, 23 * SIZE(BO) MADD c51, c51, a3, b5 LD b5, 36 * SIZE(BO) MADD c61, c61, a3, b2 LD b2, 25 * SIZE(BO) MADD c71, c71, a3, b3 LD b3, 26 * SIZE(BO) MADD c81, c81, a3, b4 LD b4, 27 * SIZE(BO) LD a3, 2 * SIZE(AO) daddiu BO, BO, 32 * SIZE MADD c11, c11, a4, b6 LD b6, 8 * SIZE(BO) MADD c21, c21, a4, b2 LD b2, -3 * SIZE(BO) MADD c31, c31, a4, b3 LD b3, -2 * SIZE(BO) MADD c41, c41, a4, b4 LD b4, -1 * SIZE(BO) MADD c51, c51, a4, b7 LD b7, 12 * SIZE(BO) MADD c61, c61, a4, b2 LD 
b2, 1 * SIZE(BO) MADD c71, c71, a4, b3 LD b3, 2 * SIZE(BO) MADD c81, c81, a4, b4 LD b4, 3 * SIZE(BO) bgtz L, .L22 LD a4, 3 * SIZE(AO) .align 3 .L25: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L28 NOP .align 3 .L26: MADD c11, c11, a1, b1 LD b1, 8 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a1, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a1, b4 LD b4, 7 * SIZE(BO) daddiu L, L, -1 MOV a2, a2 daddiu AO, AO, 1 * SIZE daddiu BO, BO, 8 * SIZE MADD c51, c51, a1, b5 LD b5, 4 * SIZE(BO) MADD c61, c61, a1, b2 LD b2, 1 * SIZE(BO) MADD c71, c71, a1, b3 LD b3, 2 * SIZE(BO) MADD c81, c81, a1, b4 LD a1, 0 * SIZE(AO) bgtz L, .L26 LD b4, 3 * SIZE(BO) .L28: #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -1 #else daddiu TEMP, KK, -8 #endif dsll L, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 3 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) SUB c11, b1, c11 SUB c21, b2, c21 SUB c31, b3, c31 SUB c41, b4, c41 SUB c51, b5, c51 SUB c61, b6, c61 SUB c71, b7, c71 SUB c81, b8, c81 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) LD b5, 4 * SIZE(AO) LD b6, 5 * SIZE(AO) LD b7, 6 * SIZE(AO) LD b8, 7 * SIZE(AO) SUB c11, b1, c11 SUB c21, b2, c21 SUB c31, b3, c31 SUB c41, b4, c41 SUB c51, b5, c51 SUB c61, b6, c61 SUB c71, b7, c71 SUB c81, b8, c81 #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(AO) MUL c11, b1, c11 MUL c21, b1, c21 MUL c31, b1, c31 MUL c41, b1, c41 MUL c51, b1, c51 MUL c61, b1, c61 MUL c71, b1, c71 MUL c81, b1, c81 #endif #ifdef RN LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MUL c11, b1, c11 NMSUB c21, c21, b2, c11 NMSUB c31, c31, b3, c11 NMSUB c41, c41, b4, c11 NMSUB c51, c51, b5, c11 NMSUB c61, c61, b6, c11 NMSUB c71, c71, b7, c11 NMSUB c81, c81, b8, c11 LD b2, 9 * SIZE(BO) LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MUL c21, b2, c21 NMSUB c31, c31, b3, c21 NMSUB c41, c41, b4, c21 NMSUB c51, c51, b5, c21 NMSUB c61, c61, b6, c21 NMSUB c71, c71, b7, c21 NMSUB c81, c81, b8, c21 LD b3, 18 * SIZE(BO) LD b4, 19 * SIZE(BO) LD b5, 20 * SIZE(BO) LD b6, 21 * SIZE(BO) LD b7, 22 * SIZE(BO) LD b8, 23 * SIZE(BO) MUL c31, b3, c31 NMSUB c41, c41, b4, c31 NMSUB c51, c51, b5, c31 NMSUB c61, c61, b6, c31 NMSUB c71, c71, b7, c31 NMSUB c81, c81, b8, c31 LD b4, 27 * SIZE(BO) LD b5, 28 * SIZE(BO) LD b6, 29 * SIZE(BO) LD b7, 30 * SIZE(BO) LD b8, 31 * SIZE(BO) MUL c41, b4, c41 NMSUB c51, c51, b5, c41 NMSUB c61, c61, b6, c41 NMSUB c71, c71, b7, c41 NMSUB c81, c81, b8, c41 LD b5, 36 * SIZE(BO) LD b6, 37 * SIZE(BO) LD b7, 38 * SIZE(BO) LD b8, 39 * SIZE(BO) MUL c51, b5, c51 NMSUB c61, c61, b6, c51 NMSUB c71, c71, b7, c51 NMSUB c81, c81, b8, c51 LD b6, 45 * SIZE(BO) LD b7, 46 * SIZE(BO) LD b8, 47 * SIZE(BO) MUL c61, b6, c61 NMSUB c71, c71, b7, c61 NMSUB c81, c81, b8, c61 LD b7, 54 * SIZE(BO) LD b8, 55 * SIZE(BO) MUL c71, b7, c71 NMSUB c81, c81, b8, c71 LD b8, 63 * SIZE(BO) MUL c81, b8, c81 #endif #ifdef RT LD b1, 63 * SIZE(BO) LD b2, 62 * SIZE(BO) LD b3, 61 * SIZE(BO) LD b4, 60 * SIZE(BO) LD b5, 59 * SIZE(BO) LD b6, 58 * SIZE(BO) LD b7, 57 * SIZE(BO) LD b8, 56 * SIZE(BO) MUL c81, b1, c81 NMSUB c71, c71, b2, c81 NMSUB c61, c61, b3, c81 
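
A note on the #ifdef RN and #ifdef RT blocks in this solve section: they implement forward and backward substitution against the packed 8x8 triangular panel of B. Each unknown is scaled with a MUL against the panel's diagonal entry (the use of MUL rather than a divide suggests the diagonals are stored pre-inverted; that is an inference, not something stated in this file), and NMSUB then removes its contribution from the remaining accumulators. RN walks the panel forward from offset 0, 9, 18, ...; RT starts at offset 63 and runs the same elimination in reverse. A minimal C sketch of the RN direction for one accumulator column, assuming an 8-column row-major panel and an illustrative helper name, is:

/* trsm_rn_8: hypothetical scalar model of the RN substitution above.
 * c[0..7] is one column of accumulators; b is the packed 8x8 panel with
 * (assumed) pre-inverted diagonal entries at b[j*8+j]. */
static void trsm_rn_8(double *c, const double *b)
{
    for (int j = 0; j < 8; j++) {
        c[j] *= b[j * 8 + j];              /* MUL: scale by the diagonal entry      */
        for (int i = j + 1; i < 8; i++)
            c[i] -= b[j * 8 + i] * c[j];   /* NMSUB: eliminate c[j] from the rest   */
    }
}

Storing inverted diagonals in the packed panel would let the kernel use fused multiply-adds throughout and avoid a divide in the inner solve, which is consistent with what the instruction sequence shows.
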
NMSUB c51, c51, b4, c81 NMSUB c41, c41, b5, c81 NMSUB c31, c31, b6, c81 NMSUB c21, c21, b7, c81 NMSUB c11, c11, b8, c81 LD b2, 54 * SIZE(BO) LD b3, 53 * SIZE(BO) LD b4, 52 * SIZE(BO) LD b5, 51 * SIZE(BO) LD b6, 50 * SIZE(BO) LD b7, 49 * SIZE(BO) LD b8, 48 * SIZE(BO) MUL c71, b2, c71 NMSUB c61, c61, b3, c71 NMSUB c51, c51, b4, c71 NMSUB c41, c41, b5, c71 NMSUB c31, c31, b6, c71 NMSUB c21, c21, b7, c71 NMSUB c11, c11, b8, c71 LD b3, 45 * SIZE(BO) LD b4, 44 * SIZE(BO) LD b5, 43 * SIZE(BO) LD b6, 42 * SIZE(BO) LD b7, 41 * SIZE(BO) LD b8, 40 * SIZE(BO) MUL c61, b3, c61 NMSUB c51, c51, b4, c61 NMSUB c41, c41, b5, c61 NMSUB c31, c31, b6, c61 NMSUB c21, c21, b7, c61 NMSUB c11, c11, b8, c61 LD b4, 36 * SIZE(BO) LD b5, 35 * SIZE(BO) LD b6, 34 * SIZE(BO) LD b7, 33 * SIZE(BO) LD b8, 32 * SIZE(BO) MUL c51, b4, c51 NMSUB c41, c41, b5, c51 NMSUB c31, c31, b6, c51 NMSUB c21, c21, b7, c51 NMSUB c11, c11, b8, c51 LD b5, 27 * SIZE(BO) LD b6, 26 * SIZE(BO) LD b7, 25 * SIZE(BO) LD b8, 24 * SIZE(BO) MUL c41, b5, c41 NMSUB c31, c31, b6, c41 NMSUB c21, c21, b7, c41 NMSUB c11, c11, b8, c41 LD b6, 18 * SIZE(BO) LD b7, 17 * SIZE(BO) LD b8, 16 * SIZE(BO) MUL c31, b6, c31 NMSUB c21, c21, b7, c31 NMSUB c11, c11, b8, c31 LD b7, 9 * SIZE(BO) LD b8, 8 * SIZE(BO) MUL c21, b7, c21 NMSUB c11, c11, b8, c21 LD b8, 0 * SIZE(BO) MUL c11, b8, c11 #endif #ifdef LN daddiu CO1, CO1, -1 * SIZE daddiu CO2, CO2, -1 * SIZE daddiu CO3, CO3, -1 * SIZE daddiu CO4, CO4, -1 * SIZE daddiu CO5, CO5, -1 * SIZE daddiu CO6, CO6, -1 * SIZE daddiu CO7, CO7, -1 * SIZE daddiu CO8, CO8, -1 * SIZE #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c21, 1 * SIZE(BO) ST c31, 2 * SIZE(BO) ST c41, 3 * SIZE(BO) ST c51, 4 * SIZE(BO) ST c61, 5 * SIZE(BO) ST c71, 6 * SIZE(BO) ST c81, 7 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c21, 1 * SIZE(AO) ST c31, 2 * SIZE(AO) ST c41, 3 * SIZE(AO) ST c51, 4 * SIZE(AO) ST c61, 5 * SIZE(AO) ST c71, 6 * SIZE(AO) ST c81, 7 * SIZE(AO) #endif ST c11, 0 * SIZE(CO1) ST c21, 0 * SIZE(CO2) ST c31, 0 * SIZE(CO3) ST c41, 0 * SIZE(CO4) ST c51, 0 * SIZE(CO5) ST c61, 0 * SIZE(CO6) ST c71, 0 * SIZE(CO7) ST c81, 0 * SIZE(CO8) #ifndef LN daddiu CO1, CO1, 1 * SIZE daddiu CO2, CO2, 1 * SIZE daddiu CO3, CO3, 1 * SIZE daddiu CO4, CO4, 1 * SIZE daddiu CO5, CO5, 1 * SIZE daddiu CO6, CO6, 1 * SIZE daddiu CO7, CO7, 1 * SIZE daddiu CO8, CO8, 1 * SIZE #endif #ifdef RT dsll TEMP, K, BASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, 0 + BASE_SHIFT dsll TEMP, TEMP, 3 + BASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 1 #endif #ifdef LN daddiu KK, KK, -1 #endif .align 3 .L29: #ifdef LN dsll TEMP, K, 3 + BASE_SHIFT daddu B, B, TEMP #endif #if defined(LT) || defined(RN) move B, BO #endif #ifdef RN daddiu KK, KK, 8 #endif #ifdef RT daddiu KK, KK, -8 #endif bgtz J, .L10 NOP .align 3 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) LDARG $18, 16($sp) LDARG $19, 24($sp) LDARG $20, 32($sp) LDARG $21, 40($sp) ldc1 $f24, 48($sp) ldc1 $f25, 56($sp) ldc1 $f26, 64($sp) ldc1 $f27, 72($sp) ldc1 $f28, 80($sp) LDARG $22, 88($sp) LDARG $23, 96($sp) LDARG $24, 104($sp) LDARG $25, 112($sp) #ifndef __64BIT__ ldc1 $f20,112($sp) ldc1 $f21,120($sp) ldc1 $f22,128($sp) ldc1 $f23,136($sp) #endif j $31 daddiu $sp, $sp, 144 EPILOGUE OpenBLAS-0.2.20/kernel/mips64/trsm_kernel_RT_loongson3a.S000066400000000000000000001016551313527062700227760ustar00rootroot00000000000000#define REALNAME ASMNAME #define ASSEMBLER #include "common.h" #define M $4 #define N $5 #define K $6 #define A $8 #define B $9 
#define C $10 #define LDC $11 #define AO $12 #define BO $13 #define I $2 #define J $3 #define L $7 #define CO1 $14 #define CO2 $15 #define CO3 $16 #define CO4 $17 #define OFFSET $22 #define KK $23 #define TEMP $24 #define AORIG $25 #define a1 $f0 #define a2 $f1 #define a3 $f26 #define a4 $f27 #define a5 $f28 #define a6 $f29 #define a7 $f30 #define a8 $f31 #define b1 $f2 #define b2 $f3 #define b3 $f4 #define b4 $f5 #define b5 $f6 #define b6 $f7 #define b7 $f8 #define b8 $f9 #define t11 $f10 #define t21 $f11 #define t31 $f12 #define t41 $f13 #define t12 $f14 #define t22 $f15 #define t32 $f16 #define t42 $f17 #define t13 $f18 #define t23 $f19 #define t33 $f20 #define t43 $f21 #define t14 $f22 #define t24 $f23 #define t34 $f24 #define t44 $f25 PROLOGUE daddiu $sp, $sp, -144 SDARG $16, 0($sp) SDARG $17, 8($sp) SDARG $18, 16($sp) SDARG $19, 24($sp) SDARG $20, 32($sp) SDARG $21, 40($sp) sdc1 $f24, 48($sp) sdc1 $f25, 56($sp) sdc1 $f26, 64($sp) sdc1 $f27, 72($sp) sdc1 $f28, 80($sp) SDARG $22, 88($sp) SDARG $23, 96($sp) SDARG $24, 104($sp) SDARG $25, 112($sp) #ifndef __64BIT__ sdc1 $f20,112($sp) sdc1 $f21,120($sp) sdc1 $f22,128($sp) sdc1 $f23,136($sp) #endif .align 3 # RT compute from right to left LDARG OFFSET, 144($sp) # get the last parameter dsll LDC, LDC, BASE_SHIFT # LDC * data_Byte mult N, K mflo TEMP dsll TEMP, TEMP, BASE_SHIFT # B Representative triangle matrix!!! daddu B, B, TEMP # B point to the end of sb # Be carefull B has no effeck of mc!! mult N, LDC mflo TEMP daddu C, C, TEMP # C point to the last colum of blockB dsubu KK, K, OFFSET # KC-KK is the length of rectangular data part of Bj andi J, N, 1 blez J, .L30 nop dsll TEMP, K, BASE_SHIFT dsubu B, B, TEMP # move B to the beginning address of Bj dsubu C, C, LDC move CO1, C move AORIG, A dsra I, M, 2 blez I, .L80 NOP .L31: # mr=4,nr=1 dsll L, KK, 2 + BASE_SHIFT # mr=4 dsll TEMP, KK, BASE_SHIFT # nr=1 daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the retangular data part,also reset BO dsubu TEMP, K, KK # temp = the length of rectangular data part MTC $0, t11 # clear 4 results registers MOV t21, t11 MOV t31, t11 MOV t41, t11 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) dsra L, TEMP, 2 # L=(KC-offset)/4 blez L, .L35 NOP .align 3 .L32: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b5, 1 * SIZE(BO) MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) LD b3, 2 * SIZE(BO) MADD t11, t11, a5, b5 MADD t21, t21, a6, b5 MADD t31, t31, a7, b5 MADD t41, t41, a8, b5 LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) LD a7, 14 * SIZE(AO) LD a8, 15 * SIZE(AO) LD b7, 3 * SIZE(BO) MADD t11, t11, a1, b3 MADD t21, t21, a2, b3 MADD t31, t31, a3, b3 MADD t41, t41, a4, b3 daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD t11, t11, a5, b7 MADD t21, t21, a6, b7 MADD t31, t31, a7, b7 MADD t41, t41, a8, b7 daddiu L, L, -1 bgtz L, .L32 NOP .align 3 .L35: andi L, TEMP, 3 blez L, .L38 NOP .align 3 .L36: MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 1 * SIZE # BO += 2nr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) daddiu L, L, -1 bgtz L, .L36 NOP .align .L38: daddiu TEMP, KK, -1 # deal 
with the triangular data part dsll L, TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, BASE_SHIFT # nr=1 daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the trigular data part LD b1, 0 * SIZE(AO) # fixed results LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) SUB t11, b1, t11 SUB t21, b2, t21 SUB t31, b3, t31 SUB t41, b4, t41 LD b2, 0 * SIZE(BO) MUL t11, b2, t11 MUL t21, b2, t21 MUL t31, b2, t31 MUL t41, b2, t41 ST t11, 0 * SIZE(AO) # updata packed A ST t21, 1 * SIZE(AO) ST t31, 2 * SIZE(AO) ST t41, 3 * SIZE(AO) ST t11, 0 * SIZE(CO1) # write back ST t21, 1 * SIZE(CO1) ST t31, 2 * SIZE(CO1) ST t41, 3 * SIZE(CO1) daddiu CO1, CO1, 4 * SIZE # fixed pointer dsll TEMP, K, 2 + BASE_SHIFT daddu AORIG, AORIG, TEMP # move to next panel Ai daddiu I, I, -1 bgtz I, .L31 NOP .align 3 .L80: andi I, M, 2 blez I, .L90 nop dsll L, KK, 1 + BASE_SHIFT # mr=2 dsll TEMP, KK, BASE_SHIFT # nr=1 daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the retangular data part,also reset BO dsubu TEMP, K, KK # temp = the length of rectangular data part MTC $0, t11 # clear 4 results registers MOV t21, t11 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) dsra L, TEMP, 2 # L=(KC-offset)/4 blez L, .L85 NOP .align 3 .L82: LD a5, 2 * SIZE(AO) LD a6, 3 * SIZE(AO) LD b5, 1 * SIZE(BO) MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 LD a3, 4 * SIZE(AO) LD a4, 5 * SIZE(AO) LD b3, 2 * SIZE(BO) MADD t11, t11, a5, b5 MADD t21, t21, a6, b5 LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b7, 3 * SIZE(BO) MADD t11, t11, a3, b3 MADD t21, t21, a4, b3 daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD t11, t11, a7, b7 MADD t21, t21, a8, b7 daddiu L, L, -1 bgtz L, .L82 NOP .align 3 .L85: andi L, TEMP, 3 blez L, .L88 NOP .align 3 .L86: MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 1 * SIZE # BO += 1nr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) daddiu L, L, -1 bgtz L, .L86 NOP .align .L88: daddiu TEMP, KK, -1 # deal with the triangular data part dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, BASE_SHIFT # nr=1 daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the trigular data part LD b1, 0 * SIZE(AO) # fixed results LD b2, 1 * SIZE(AO) SUB t11, b1, t11 SUB t21, b2, t21 LD b2, 0 * SIZE(BO) MUL t11, b2, t11 MUL t21, b2, t21 ST t11, 0 * SIZE(AO) # updata packed A ST t21, 1 * SIZE(AO) ST t11, 0 * SIZE(CO1) # write back ST t21, 1 * SIZE(CO1) daddiu CO1, CO1, 2 * SIZE # fixed pointer dsll TEMP, K, 1 + BASE_SHIFT daddu AORIG, AORIG, TEMP # move to next panel Ai .align 3 .L90: andi I, M, 1 blez I, .L39 nop dsll L, KK, BASE_SHIFT # mr=1 dsll TEMP, KK, BASE_SHIFT # nr=1 daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the retangular data part,also reset BO dsubu TEMP, K, KK # temp = the length of rectangular data part MTC $0, t11 # clear 4 results registers LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) dsra L, TEMP, 2 # L=(KC-offset)/4 blez L, .L95 NOP .align 3 .L92: LD a5, 1 * SIZE(AO) LD b5, 1 * SIZE(BO) MADD t11, t11, a1, b1 LD a3, 2 * SIZE(AO) LD b3, 2 * SIZE(BO) MADD t11, t11, a5, b5 LD a7, 3 * SIZE(AO) LD b7, 3 * SIZE(BO) MADD t11, t11, a3, b3 daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD t11, t11, a7, b7 daddiu L, L, -1 bgtz L, .L92 NOP .align 3 .L95: andi L, TEMP, 3 blez L, .L98 NOP .align 3 .L96: MADD t11, t11, a1, b1 daddiu AO, AO, 1 * SIZE # AO += 2mr daddiu BO, BO, 1 * SIZE # BO += 1nr LD a1, 0 * 
SIZE(AO) LD b1, 0 * SIZE(BO) daddiu L, L, -1 bgtz L, .L96 NOP .align .L98: daddiu TEMP, KK, -1 # deal with the triangular data part dsll L, TEMP, BASE_SHIFT dsll TEMP, TEMP, BASE_SHIFT # nr=1 daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the trigular data part LD b1, 0 * SIZE(AO) # fixed results SUB t11, b1, t11 LD b2, 0 * SIZE(BO) MUL t11, b2, t11 ST t11, 0 * SIZE(AO) # updata packed A ST t11, 0 * SIZE(CO1) # write back daddiu CO1, CO1, 1 * SIZE # fixed pointer dsll TEMP, K, BASE_SHIFT daddu AORIG, AORIG, TEMP # move to next panel Ai .L39: daddiu KK, KK, -1 # rectangular data length increased by 1 .align 3 .L30: # nr=2 andi J, N, 2 blez J, .L50 nop dsll TEMP, K, 1 + BASE_SHIFT # Kc*2nr move B to the beginning address of Bj dsubu B, B, TEMP dsll TEMP, LDC, 1 # C dsubu C, C, TEMP move CO1, C daddu CO2, C, LDC move AORIG, A dsra I, M, 2 blez I, .L60 NOP .L51: # mr=4,nr=2 dsll L, KK, 2 + BASE_SHIFT # mr=4 dsll TEMP, KK, 1 + BASE_SHIFT # nr=2 daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the retangular data part,also reset BO dsubu TEMP, K, KK # temp = the length of rectangular data part MTC $0, t11 # clear 8 results registers MOV t21, t11 MOV t31, t11 MOV t41, t11 MOV t12, t11 MOV t22, t11 MOV t32, t11 MOV t42, t11 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) dsra L, TEMP, 2 # L=(KC-offset)/4 blez L, .L55 NOP .align 3 .L52: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b5, 2 * SIZE(BO) LD b6, 3 * SIZE(BO) MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) LD b3, 4 * SIZE(BO) LD b4, 5 * SIZE(BO) MADD t11, t11, a5, b5 MADD t21, t21, a6, b5 MADD t31, t31, a7, b5 MADD t41, t41, a8, b5 MADD t12, t12, a5, b6 MADD t22, t22, a6, b6 MADD t32, t32, a7, b6 MADD t42, t42, a8, b6 LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) LD a7, 14 * SIZE(AO) LD a8, 15 * SIZE(AO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a1, b3 MADD t21, t21, a2, b3 MADD t31, t31, a3, b3 MADD t41, t41, a4, b3 MADD t12, t12, a1, b4 MADD t22, t22, a2, b4 MADD t32, t32, a3, b4 MADD t42, t42, a4, b4 daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD t11, t11, a5, b7 MADD t21, t21, a6, b7 MADD t31, t31, a7, b7 MADD t41, t41, a8, b7 MADD t12, t12, a5, b8 MADD t22, t22, a6, b8 MADD t32, t32, a7, b8 MADD t42, t42, a8, b8 daddiu L, L, -1 bgtz L, .L52 NOP .align 3 .L55: andi L, TEMP, 3 blez L, .L58 NOP .align 3 .L56: MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 2 * SIZE # BO += 2nr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) daddiu L, L, -1 bgtz L, .L56 NOP .align .L58: daddiu TEMP, KK, -2 # deal with the triangular data part dsll L, TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2 daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the trigular data part LD b1, 0 * SIZE(AO) # fixed results LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) SUB t11, b1, t11 SUB t21, b2, t21 SUB t31, b3, t31 
SUB t41, b4, t41 LD b5, 4 * SIZE(AO) LD b6, 5 * SIZE(AO) LD b7, 6 * SIZE(AO) LD b8, 7 * SIZE(AO) SUB t12, b5, t12 SUB t22, b6, t22 SUB t32, b7, t32 SUB t42, b8, t42 LD b8, 3 * SIZE(BO) LD b1, 2 * SIZE(BO) MUL t12, b8, t12 MUL t22, b8, t22 MUL t32, b8, t32 MUL t42, b8, t42 NMSUB t11, t11, b1, t12 NMSUB t21, t21, b1, t22 NMSUB t31, t31, b1, t32 NMSUB t41, t41, b1, t42 LD b2, 0 * SIZE(BO) MUL t11, b2, t11 MUL t21, b2, t21 MUL t31, b2, t31 MUL t41, b2, t41 ST t11, 0 * SIZE(AO) # updata packed A ST t21, 1 * SIZE(AO) ST t31, 2 * SIZE(AO) ST t41, 3 * SIZE(AO) ST t12, 4 * SIZE(AO) ST t22, 5 * SIZE(AO) ST t32, 6 * SIZE(AO) ST t42, 7 * SIZE(AO) ST t11, 0 * SIZE(CO1) # write back ST t21, 1 * SIZE(CO1) ST t31, 2 * SIZE(CO1) ST t41, 3 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) ST t32, 2 * SIZE(CO2) ST t42, 3 * SIZE(CO2) daddiu CO1, CO1, 4 * SIZE # fixed pointer daddiu CO2, CO2, 4 * SIZE dsll TEMP, K, 2 + BASE_SHIFT daddu AORIG, AORIG, TEMP # move to next panel Ai daddiu I, I, -1 bgtz I, .L51 NOP .align 3 .L60: andi I, M, 2 # mr=2 blez I, .L70 nop dsll L, KK, 1 + BASE_SHIFT # mr=2 dsll TEMP, KK, 1 + BASE_SHIFT # nr=2 daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the retangular data part,also reset BO dsubu TEMP, K, KK # temp = the length of rectangular data part MTC $0, t11 # clear 8 results registers MOV t21, t11 MOV t12, t11 MOV t22, t11 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) dsra L, TEMP, 2 # L=(KC-offset)/4 blez L, .L65 NOP .align 3 .L62: LD a5, 2 * SIZE(AO) LD a6, 3 * SIZE(AO) LD b5, 2 * SIZE(BO) LD b6, 3 * SIZE(BO) MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 LD a3, 4 * SIZE(AO) LD a4, 5 * SIZE(AO) LD b3, 4 * SIZE(BO) LD b4, 5 * SIZE(BO) MADD t11, t11, a5, b5 MADD t21, t21, a6, b5 MADD t12, t12, a5, b6 MADD t22, t22, a6, b6 LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a3, b3 MADD t21, t21, a4, b3 MADD t12, t12, a3, b4 MADD t22, t22, a4, b4 daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD t11, t11, a7, b7 MADD t21, t21, a8, b7 MADD t12, t12, a7, b8 MADD t22, t22, a8, b8 daddiu L, L, -1 bgtz L, .L62 NOP .align 3 .L65: andi L, TEMP, 3 blez L, .L68 NOP .align 3 .L66: MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 2 * SIZE # BO += 2nr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) daddiu L, L, -1 bgtz L, .L66 NOP .align .L68: daddiu TEMP, KK, -2 # deal with the triangular data part dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2 daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the trigular data part LD b1, 0 * SIZE(AO) # fixed results LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) SUB t11, b1, t11 SUB t21, b2, t21 SUB t12, b3, t12 SUB t22, b4, t22 LD b8, 3 * SIZE(BO) LD b7, 2 * SIZE(BO) MUL t12, b8, t12 MUL t22, b8, t22 NMSUB t11, t11, b7, t12 NMSUB t21, t21, b7, t22 LD b6, 0 * SIZE(BO) MUL t11, b6, t11 MUL t21, b6, t21 ST t11, 0 * SIZE(AO) # updata packed A ST t21, 1 * SIZE(AO) ST t12, 2 * SIZE(AO) ST t22, 3 * SIZE(AO) ST t11, 0 * SIZE(CO1) # write back ST t21, 1 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) daddiu CO1, CO1, 2 * SIZE # fixed pointer daddiu CO2, CO2, 2 * SIZE dsll TEMP, K, 1 + BASE_SHIFT # mr=2 daddu AORIG, AORIG, TEMP # move to next panel Ai .align 3 
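
The overall shape of trsm_kernel_RT_loongson3a.S, as its own comments describe it: B and C are first advanced to the end of their packed panels ("RT compute from right to left"), and the column blocks of N are then processed right to left, first the N%1 and N%2 remainders and finally blocks of four. Within each block, KK = K - OFFSET splits the work into a rectangular part (plain MADD accumulation of length K - KK) and a triangular part solved by the MUL/NMSUB substitution, after which KK is adjusted by the block width. A minimal C sketch of the simplest path, the mr=1/nr=1 case handled at .L90-.L98 earlier in this file (function and parameter names are illustrative, and the diagonal is assumed to be stored pre-inverted since the code multiplies rather than divides):

#include <stddef.h>

/* trsm_rt_1x1: hypothetical scalar model of the .L90-.L98 path. */
static void trsm_rt_1x1(size_t rect_len, const double *ao, const double *bo,
                        double *packed_a, double *c, double inv_diag)
{
    double t = 0.0;
    for (size_t k = 0; k < rect_len; k++)    /* rectangular part (.L92/.L96) */
        t += ao[k] * bo[k];
    t = (*packed_a - t) * inv_diag;          /* SUB then MUL (.L98)          */
    *packed_a = t;                           /* update the packed A panel    */
    *c = t;                                  /* write back to C              */
}

The wider mr/nr cases follow the same pattern, only with more accumulators and with the substitution coupling them through the off-diagonal NMSUB terms.
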
.L70: andi I, M, 1 # mr=1 blez I, .L59 nop dsll L, KK, BASE_SHIFT # mr=1 dsll TEMP, KK, 1 + BASE_SHIFT # nr=2 daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the retangular data part,also reset BO dsubu TEMP, K, KK # temp = the length of rectangular data part MTC $0, t11 # clear 8 results registers MOV t12, t11 LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) dsra L, TEMP, 2 # L=(KC-offset)/4 blez L, .L75 NOP .align 3 .L72: LD a5, 1 * SIZE(AO) LD b5, 2 * SIZE(BO) LD b6, 3 * SIZE(BO) MADD t11, t11, a1, b1 MADD t12, t12, a1, b2 LD a3, 2 * SIZE(AO) LD b3, 4 * SIZE(BO) LD b4, 5 * SIZE(BO) MADD t11, t11, a5, b5 MADD t12, t12, a5, b6 LD a7, 3 * SIZE(AO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a3, b3 MADD t12, t12, a3, b4 daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD t11, t11, a7, b7 MADD t12, t12, a7, b8 daddiu L, L, -1 bgtz L, .L72 NOP .align 3 .L75: andi L, TEMP, 3 blez L, .L78 NOP .align 3 .L76: MADD t11, t11, a1, b1 MADD t12, t12, a1, b2 daddiu AO, AO, 1 * SIZE # AO += 1mr daddiu BO, BO, 2 * SIZE # BO += 2nr LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) daddiu L, L, -1 bgtz L, .L76 NOP .align .L78: daddiu TEMP, KK, -2 # deal with the triangular data part dsll L, TEMP, BASE_SHIFT dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2 daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the trigular data part LD b1, 0 * SIZE(AO) # fixed results LD b2, 1 * SIZE(AO) SUB t11, b1, t11 SUB t12, b2, t12 LD b8, 3 * SIZE(BO) LD b7, 2 * SIZE(BO) MUL t12, b8, t12 NMSUB t11, t11, b7, t12 LD b6, 0 * SIZE(BO) MUL t11, b6, t11 ST t11, 0 * SIZE(AO) # updata packed A ST t12, 1 * SIZE(AO) ST t11, 0 * SIZE(CO1) # write back ST t12, 0 * SIZE(CO2) daddiu CO1, CO1, 1 * SIZE # fixed pointer daddiu CO2, CO2, 1 * SIZE dsll TEMP, K, BASE_SHIFT # mr=2 daddu AORIG, AORIG, TEMP # move to next panel Ai .L59: daddiu KK, KK, -2 # rectangular data length increased by 2 .align 3 .L50: dsra J, N, 2 # J = NC/4 blez J, .L999 NOP .L10: dsll TEMP, K, 2 + BASE_SHIFT dsubu B, B, TEMP # move B to the beginning address of Bj dsll TEMP, LDC, 2 dsubu C, C, TEMP # move C to the beginning address of Cj daddiu J, J, -1 move CO1, C daddu CO2, C, LDC daddu CO3, CO2, LDC daddu CO4, CO3, LDC move AORIG, A # reset A dsra I, M, 2 # I=MC/4 blez I, .L20 NOP .align 3 .L11: dsll L, KK, 2 + BASE_SHIFT # mr=4 dsll TEMP, KK, 2 + BASE_SHIFT # nr=4 daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the retangular data part,also reset BO dsubu TEMP, K, KK # temp = the length of rectangular data part MTC $0, t11 # clear 16 results registers MOV t21, t11 MOV t31, t11 MOV t41, t11 MOV t12, t11 MOV t22, t11 MOV t32, t11 MOV t42, t11 MOV t13, t11 MOV t23, t11 MOV t33, t11 MOV t43, t11 MOV t14, t11 MOV t24, t11 MOV t34, t11 MOV t44, t11 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) dsra L, TEMP, 2 # L=(KC-offset)/4 blez L, .L15 NOP .align 3 .L12: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 MADD t13, t13, a1, b3 MADD t23, t23, a2, b3 MADD t33, t33, a3, b3 MADD t43, t43, a4, b3 MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 MADD t34, t34, a3, b4 
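
The .L11/.L12 loop that begins above is the mr=4/nr=4 micro-kernel of this file: sixteen accumulators t11..t44 are kept in floating-point registers, the K loop is unrolled by four, and the loads for the next iteration group are issued in batches between the MADD groups, double-buffering the a/b registers so that memory latency overlaps the arithmetic. Stripping the unrolling and scheduling, the accumulation it performs is roughly the following C (the helper name and the t[4][4] layout are illustrative only):

#include <stddef.h>

/* gemm_acc_4x4: hypothetical model of the register-blocked accumulation
 * in .L12; ao and bo are the packed 4-wide A and B panels. */
static void gemm_acc_4x4(size_t kc, const double *ao, const double *bo,
                         double t[4][4])
{
    for (size_t k = 0; k < kc; k++)
        for (int j = 0; j < 4; j++)        /* column from the packed B panel */
            for (int i = 0; i < 4; i++)    /* row from the packed A panel    */
                t[i][j] += ao[4 * k + i] * bo[4 * k + j];
}

After the rectangular part is accumulated this way, the .L18 block subtracts the result from the stored values and runs the triangular substitution, exactly as in the narrower cases.
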
MADD t44, t44, a4, b4 # fisrt LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MADD t11, t11, a5, b5 MADD t21, t21, a6, b5 MADD t31, t31, a7, b5 MADD t41, t41, a8, b5 MADD t12, t12, a5, b6 MADD t22, t22, a6, b6 MADD t32, t32, a7, b6 MADD t42, t42, a8, b6 MADD t13, t13, a5, b7 MADD t23, t23, a6, b7 MADD t33, t33, a7, b7 MADD t43, t43, a8, b7 MADD t14, t14, a5, b8 MADD t24, t24, a6, b8 MADD t34, t34, a7, b8 MADD t44, t44, a8, b8 # second LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) LD a7, 14 * SIZE(AO) LD a8, 15 * SIZE(AO) LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 MADD t13, t13, a1, b3 MADD t23, t23, a2, b3 MADD t33, t33, a3, b3 MADD t43, t43, a4, b3 MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 MADD t34, t34, a3, b4 MADD t44, t44, a4, b4 # third daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD t11, t11, a5, b5 MADD t21, t21, a6, b5 MADD t31, t31, a7, b5 MADD t41, t41, a8, b5 MADD t12, t12, a5, b6 MADD t22, t22, a6, b6 MADD t32, t32, a7, b6 MADD t42, t42, a8, b6 MADD t13, t13, a5, b7 MADD t23, t23, a6, b7 MADD t33, t33, a7, b7 MADD t43, t43, a8, b7 MADD t14, t14, a5, b8 MADD t24, t24, a6, b8 MADD t34, t34, a7, b8 MADD t44, t44, a8, b8 # fouth daddiu L, L, -1 bgtz L, .L12 NOP .align 3 .L15: andi L, TEMP, 3 blez L, .L18 NOP .align 3 .L16: MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 MADD t13, t13, a1, b3 MADD t23, t23, a2, b3 MADD t33, t33, a3, b3 MADD t43, t43, a4, b3 MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 MADD t34, t34, a3, b4 MADD t44, t44, a4, b4 # third daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 4 * SIZE # BO += 4nr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) daddiu L, L, -1 bgtz L, .L16 NOP .align .L18: daddiu TEMP, KK, -4 # deal with the triangular data part dsll L, TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the trigular data part LD b1, 0 * SIZE(AO) # fixed results LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) SUB t11, b1, t11 SUB t21, b2, t21 SUB t31, b3, t31 SUB t41, b4, t41 LD b5, 4 * SIZE(AO) LD b6, 5 * SIZE(AO) LD b7, 6 * SIZE(AO) LD b8, 7 * SIZE(AO) SUB t12, b5, t12 SUB t22, b6, t22 SUB t32, b7, t32 SUB t42, b8, t42 LD b1, 8 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 10 * SIZE(AO) LD b4, 11 * SIZE(AO) SUB t13, b1, t13 SUB t23, b2, t23 SUB t33, b3, t33 SUB t43, b4, t43 LD b5, 12 * SIZE(AO) LD b6, 13 * SIZE(AO) LD b7, 14 * SIZE(AO) LD b8, 15 * SIZE(AO) SUB t14, b5, t14 SUB t24, b6, t24 SUB t34, b7, t34 SUB t44, b8, t44 LD b1, 15 * SIZE(BO) LD b2, 14 * SIZE(BO) LD b3, 13 * SIZE(BO) LD b4, 12 * SIZE(BO) MUL t14, b1, t14 MUL t24, b1, t24 MUL t34, b1, t34 MUL t44, b1, t44 NMSUB t13, t13, b2, t14 NMSUB t23, t23, b2, t24 NMSUB t33, t33, b2, t34 NMSUB t43, t43, b2, t44 NMSUB t12, t12, b3, t14 NMSUB t22, t22, b3, t24 NMSUB t32, t32, b3, t34 NMSUB 
t42, t42, b3, t44 NMSUB t11, t11, b4, t14 NMSUB t21, t21, b4, t24 NMSUB t31, t31, b4, t34 NMSUB t41, t41, b4, t44 LD b5, 10 * SIZE(BO) LD b6, 9 * SIZE(BO) LD b7, 8 * SIZE(BO) MUL t13, b5, t13 MUL t23, b5, t23 MUL t33, b5, t33 MUL t43, b5, t43 NMSUB t12, t12, b6, t13 NMSUB t22, t22, b6, t23 NMSUB t32, t32, b6, t33 NMSUB t42, t42, b6, t43 NMSUB t11, t11, b7, t13 NMSUB t21, t21, b7, t23 NMSUB t31, t31, b7, t33 NMSUB t41, t41, b7, t43 LD b8, 5 * SIZE(BO) LD b1, 4 * SIZE(BO) MUL t12, b8, t12 MUL t22, b8, t22 MUL t32, b8, t32 MUL t42, b8, t42 NMSUB t11, t11, b1, t12 NMSUB t21, t21, b1, t22 NMSUB t31, t31, b1, t32 NMSUB t41, t41, b1, t42 LD b2, 0 * SIZE(BO) MUL t11, b2, t11 MUL t21, b2, t21 MUL t31, b2, t31 MUL t41, b2, t41 ST t11, 0 * SIZE(AO) # updata packed A ST t21, 1 * SIZE(AO) ST t31, 2 * SIZE(AO) ST t41, 3 * SIZE(AO) ST t12, 4 * SIZE(AO) ST t22, 5 * SIZE(AO) ST t32, 6 * SIZE(AO) ST t42, 7 * SIZE(AO) ST t13, 8 * SIZE(AO) ST t23, 9 * SIZE(AO) ST t33, 10 * SIZE(AO) ST t43, 11 * SIZE(AO) ST t14, 12 * SIZE(AO) ST t24, 13 * SIZE(AO) ST t34, 14 * SIZE(AO) ST t44, 15 * SIZE(AO) ST t11, 0 * SIZE(CO1) # write back ST t21, 1 * SIZE(CO1) ST t31, 2 * SIZE(CO1) ST t41, 3 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) ST t32, 2 * SIZE(CO2) ST t42, 3 * SIZE(CO2) ST t13, 0 * SIZE(CO3) ST t23, 1 * SIZE(CO3) ST t33, 2 * SIZE(CO3) ST t43, 3 * SIZE(CO3) ST t14, 0 * SIZE(CO4) ST t24, 1 * SIZE(CO4) ST t34, 2 * SIZE(CO4) ST t44, 3 * SIZE(CO4) daddiu CO1, CO1, 4 * SIZE # fixed pointer daddiu CO2, CO2, 4 * SIZE daddiu CO3, CO3, 4 * SIZE daddiu CO4, CO4, 4 * SIZE dsll TEMP, K, 2 + BASE_SHIFT daddu AORIG, AORIG, TEMP # move to next panel Ai daddiu I, I, -1 bgtz I, .L11 NOP .align 3 .L20: andi I, M, 2 # mr=2 blez I, .L40 NOP dsll L, KK, 1 + BASE_SHIFT # mr=2 dsll TEMP, KK, 2 + BASE_SHIFT # nr=4 daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the retangular data part,also reset BO dsubu TEMP, K, KK # temp = the length of rectangular data part MTC $0, t11 # clear 8 results registers MOV t21, t11 MOV t12, t11 MOV t22, t11 MOV t13, t11 MOV t23, t11 MOV t14, t11 MOV t24, t11 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) dsra L, TEMP, 2 # L=(KC-offset)/4 blez L, .L25 NOP .align 3 .L22: LD a5, 2 * SIZE(AO) LD a6, 3 * SIZE(AO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t13, t13, a1, b3 MADD t23, t23, a2, b3 MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 LD a3, 4 * SIZE(AO) LD a4, 5 * SIZE(AO) LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MADD t11, t11, a5, b5 MADD t21, t21, a6, b5 MADD t12, t12, a5, b6 MADD t22, t22, a6, b6 MADD t13, t13, a5, b7 MADD t23, t23, a6, b7 MADD t14, t14, a5, b8 MADD t24, t24, a6, b8 LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MADD t11, t11, a3, b1 MADD t21, t21, a4, b1 MADD t12, t12, a3, b2 MADD t22, t22, a4, b2 MADD t13, t13, a3, b3 MADD t23, t23, a4, b3 MADD t14, t14, a3, b4 MADD t24, t24, a4, b4 daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD t11, t11, a7, b5 MADD t21, t21, a8, b5 MADD t12, t12, a7, b6 MADD t22, t22, a8, b6 MADD t13, t13, a7, b7 MADD t23, t23, a8, b7 MADD t14, t14, a7, b8 MADD t24, t24, a8, b8 daddiu 
L, L, -1 bgtz L, .L22 NOP .align 3 .L25: andi L, TEMP, 3 blez L, .L28 NOP .align 3 .L26: MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t13, t13, a1, b3 MADD t23, t23, a2, b3 MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 4 * SIZE # BO += 4nr LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) daddiu L, L, -1 bgtz L, .L26 NOP .align .L28: daddiu TEMP, KK, -4 # deal with the triangular data part dsll L, TEMP, 1 + BASE_SHIFT # mr=2 dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the trigular data part LD b1, 0 * SIZE(AO) # fixed results LD b2, 1 * SIZE(AO) SUB t11, b1, t11 SUB t21, b2, t21 LD b5, 2 * SIZE(AO) LD b6, 3 * SIZE(AO) SUB t12, b5, t12 SUB t22, b6, t22 LD b3, 4 * SIZE(AO) LD b4, 5 * SIZE(AO) SUB t13, b3, t13 SUB t23, b4, t23 LD b7, 6 * SIZE(AO) LD b8, 7 * SIZE(AO) SUB t14, b7, t14 SUB t24, b8, t24 LD b1, 15 * SIZE(BO) LD b2, 14 * SIZE(BO) LD b3, 13 * SIZE(BO) LD b4, 12 * SIZE(BO) MUL t14, b1, t14 MUL t24, b1, t24 NMSUB t13, t13, b2, t14 NMSUB t23, t23, b2, t24 NMSUB t12, t12, b3, t14 NMSUB t22, t22, b3, t24 NMSUB t11, t11, b4, t14 NMSUB t21, t21, b4, t24 LD b5, 10 * SIZE(BO) LD b6, 9 * SIZE(BO) LD b7, 8 * SIZE(BO) MUL t13, b5, t13 MUL t23, b5, t23 NMSUB t12, t12, b6, t13 NMSUB t22, t22, b6, t23 NMSUB t11, t11, b7, t13 NMSUB t21, t21, b7, t23 LD b8, 5 * SIZE(BO) LD b1, 4 * SIZE(BO) MUL t12, b8, t12 MUL t22, b8, t22 NMSUB t11, t11, b1, t12 NMSUB t21, t21, b1, t22 LD b2, 0 * SIZE(BO) MUL t11, b2, t11 MUL t21, b2, t21 ST t11, 0 * SIZE(AO) # updata packed A ST t21, 1 * SIZE(AO) ST t12, 2 * SIZE(AO) ST t22, 3 * SIZE(AO) ST t13, 4 * SIZE(AO) ST t23, 5 * SIZE(AO) ST t14, 6 * SIZE(AO) ST t24, 7 * SIZE(AO) ST t11, 0 * SIZE(CO1) # write back ST t21, 1 * SIZE(CO1) ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) ST t13, 0 * SIZE(CO3) ST t23, 1 * SIZE(CO3) ST t14, 0 * SIZE(CO4) ST t24, 1 * SIZE(CO4) daddiu CO1, CO1, 2 * SIZE # fixed pointer daddiu CO2, CO2, 2 * SIZE daddiu CO3, CO3, 2 * SIZE daddiu CO4, CO4, 2 * SIZE dsll TEMP, K, 1 + BASE_SHIFT # mr=2 daddu AORIG, AORIG, TEMP # move to next panel Ai .align 3 .L40: andi I, M, 1 blez I, .L29 NOP dsll L, KK, BASE_SHIFT # mr=1 dsll TEMP, KK, 2 + BASE_SHIFT # nr=4 daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the retangular data part,also reset BO dsubu TEMP, K, KK # temp = the length of rectangular data part MTC $0, t11 # clear 4 results registers MOV t12, t11 MOV t13, t11 MOV t14, t11 LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) dsra L, TEMP, 2 # L=(KC-offset)/4 blez L, .L45 NOP .align 3 .L42: LD a5, 1 * SIZE(AO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD t11, t11, a1, b1 MADD t12, t12, a1, b2 MADD t13, t13, a1, b3 MADD t14, t14, a1, b4 LD a3, 2 * SIZE(AO) LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MADD t11, t11, a5, b5 MADD t12, t12, a5, b6 MADD t13, t13, a5, b7 MADD t14, t14, a5, b8 LD a7, 3 * SIZE(AO) LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MADD t11, t11, a3, b1 MADD t12, t12, a3, b2 MADD t13, t13, a3, b3 MADD t14, t14, a3, b4 daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD t11, t11, a7, b5 MADD t12, t12, a7, b6 MADD t13, t13, a7, b7 MADD 
t14, t14, a7, b8 daddiu L, L, -1 bgtz L, .L42 NOP .align 3 .L45: andi L, TEMP, 3 blez L, .L48 NOP .align 3 .L46: MADD t11, t11, a1, b1 MADD t12, t12, a1, b2 MADD t13, t13, a1, b3 MADD t14, t14, a1, b4 daddiu AO, AO, 1 * SIZE # AO += 2mr daddiu BO, BO, 4 * SIZE # BO += 4nr LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) daddiu L, L, -1 bgtz L, .L46 NOP .align .L48: daddiu TEMP, KK, -4 # deal with the triangular data part dsll L, TEMP, BASE_SHIFT # mr=1 dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the trigular data part LD b1, 0 * SIZE(AO) # fixed results LD b5, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b7, 3 * SIZE(AO) SUB t11, b1, t11 SUB t12, b5, t12 SUB t13, b3, t13 SUB t14, b7, t14 LD b1, 15 * SIZE(BO) LD b2, 14 * SIZE(BO) LD b3, 13 * SIZE(BO) LD b4, 12 * SIZE(BO) MUL t14, b1, t14 NMSUB t13, t13, b2, t14 NMSUB t12, t12, b3, t14 NMSUB t11, t11, b4, t14 LD b5, 10 * SIZE(BO) LD b6, 9 * SIZE(BO) LD b7, 8 * SIZE(BO) MUL t13, b5, t13 NMSUB t12, t12, b6, t13 NMSUB t11, t11, b7, t13 LD b8, 5 * SIZE(BO) LD b1, 4 * SIZE(BO) MUL t12, b8, t12 NMSUB t11, t11, b1, t12 LD b2, 0 * SIZE(BO) MUL t11, b2, t11 ST t11, 0 * SIZE(AO) # updata packed A ST t12, 1 * SIZE(AO) ST t13, 2 * SIZE(AO) ST t14, 3 * SIZE(AO) ST t11, 0 * SIZE(CO1) # write back ST t12, 0 * SIZE(CO2) ST t13, 0 * SIZE(CO3) ST t14, 0 * SIZE(CO4) daddiu CO1, CO1, 1 * SIZE # fixed pointer daddiu CO2, CO2, 1 * SIZE daddiu CO3, CO3, 1 * SIZE daddiu CO4, CO4, 1 * SIZE dsll TEMP, K, BASE_SHIFT # mr=2 daddu AORIG, AORIG, TEMP # move to next panel Ai .L29: daddiu KK, KK, -4 # rectangular data part increased by 4 bgtz J, .L10 NOP .align 3 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) LDARG $18, 16($sp) LDARG $19, 24($sp) LDARG $20, 32($sp) LDARG $21, 40($sp) ldc1 $f24, 48($sp) ldc1 $f25, 56($sp) ldc1 $f26, 64($sp) ldc1 $f27, 72($sp) ldc1 $f28, 80($sp) LDARG $22, 88($sp) LDARG $23, 96($sp) LDARG $24, 104($sp) LDARG $25, 112($sp) #ifndef __64BIT__ ldc1 $f20,112($sp) ldc1 $f21,120($sp) ldc1 $f22,128($sp) ldc1 $f23,136($sp) #endif j $31 daddiu $sp, $sp, 144 EPILOGUE OpenBLAS-0.2.20/kernel/mips64/zamax.S000066400000000000000000000121211313527062700170070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define I $2 #define TEMP $3 #define a1 $f4 #define a2 $f5 #define a3 $f6 #define a4 $f7 #define a5 $f8 #define a6 $f9 #define a7 $f10 #define a8 $f11 #define t1 $f12 #define t2 $f13 #define t3 $f14 #define t4 $f15 #define t5 $f16 #define t6 $f17 #define t7 $f18 #define t8 $f19 #define s1 $f0 #define s2 $f1 #define s3 $f2 #define s4 $f3 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif blez N, .L999 MTC $0, s1 blez INCX, .L999 dsll INCX, INCX, ZBASE_SHIFT LD a1, 0 * SIZE(X) daddiu N, N, -1 LD a2, 1 * SIZE(X) daddu X, X, INCX FABS t1, a1 FABS t2, a2 blez N, .L999 ADD s1, t1, t2 NOP ADD s2, t1, t2 dsra I, N, 2 ADD s3, t1, t2 blez I, .L15 ADD s4, t1, t2 LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) LD a4, 1 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) LD a6, 1 * SIZE(X) daddu X, X, INCX LD a7, 0 * SIZE(X) LD a8, 1 * SIZE(X) daddiu I, I, -1 blez I, .L13 daddu X, X, INCX .align 3 .L12: FABS t1, a1 LD a1, 0 * SIZE(X) FABS t2, a2 LD a2, 1 * SIZE(X) FABS t3, a3 daddu X, X, INCX FABS t4, a4 NOP FABS t5, a5 LD a3, 0 * SIZE(X) FABS t6, a6 LD a4, 1 * SIZE(X) FABS t7, a7 daddu X, X, INCX FABS t8, a8 NOP ADD t1, t1, t2 LD a5, 0 * SIZE(X) ADD t3, t3, t4 LD a6, 1 * SIZE(X) ADD t5, t5, t6 daddu X, X, INCX ADD t7, t7, t8 NOP CMPLT $fcc0, s1, t1 LD a7, 0 * SIZE(X) CMPLT $fcc1, s2, t3 LD a8, 1 * SIZE(X) CMPLT $fcc2, s3, t5 daddu X, X, INCX CMPLT $fcc3, s4, t7 NOP CMOVT s1, t1, $fcc0 daddiu I, I, -1 CMOVT s2, t3, $fcc1 NOP CMOVT s3, t5, $fcc2 bgtz I, .L12 CMOVT s4, t7, $fcc3 NOP .align 3 .L13: FABS t1, a1 FABS t2, a2 FABS t3, a3 FABS t4, a4 FABS t5, a5 FABS t6, a6 FABS t7, a7 FABS t8, a8 ADD t1, t1, t2 ADD t3, t3, t4 ADD t5, t5, t6 ADD t7, t7, t8 CMPLT $fcc0, s1, t1 CMPLT $fcc1, s2, t3 CMPLT $fcc2, s3, t5 CMPLT $fcc3, s4, t7 CMOVT s1, t1, $fcc0 CMOVT s2, t3, $fcc1 CMOVT s3, t5, $fcc2 CMOVT s4, t7, $fcc3 .align 3 .L15: andi I, N, 3 blez I, .L998 NOP .align 3 .L16: LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) daddiu I, I, -1 FABS t1, a1 FABS t2, a2 ADD t1, t1, t2 CMPLT $fcc0, s1, t1 CMOVT s1, t1, $fcc0 bgtz I, .L16 daddu X, X, INCX .align 3 .L998: CMPLT $fcc0, s1, s2 CMPLT $fcc1, s3, s4 CMOVT s1, s2, $fcc0 CMOVT s3, s4, $fcc1 CMPLT $fcc0, s1, s3 CMOVT s1, s3, $fcc0 .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/zamin.S000066400000000000000000000121211313527062700170050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define I $2 #define TEMP $3 #define a1 $f4 #define a2 $f5 #define a3 $f6 #define a4 $f7 #define a5 $f8 #define a6 $f9 #define a7 $f10 #define a8 $f11 #define t1 $f12 #define t2 $f13 #define t3 $f14 #define t4 $f15 #define t5 $f16 #define t6 $f17 #define t7 $f18 #define t8 $f19 #define s1 $f0 #define s2 $f1 #define s3 $f2 #define s4 $f3 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif blez N, .L999 MTC $0, s1 blez INCX, .L999 dsll INCX, INCX, ZBASE_SHIFT LD a1, 0 * SIZE(X) daddiu N, N, -1 LD a2, 1 * SIZE(X) daddu X, X, INCX FABS t1, a1 FABS t2, a2 blez N, .L999 ADD s1, t1, t2 NOP ADD s2, t1, t2 dsra I, N, 2 ADD s3, t1, t2 blez I, .L15 ADD s4, t1, t2 LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) LD a4, 1 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) LD a6, 1 * SIZE(X) daddu X, X, INCX LD a7, 0 * SIZE(X) LD a8, 1 * SIZE(X) daddiu I, I, -1 blez I, .L13 daddu X, X, INCX .align 3 .L12: FABS t1, a1 LD a1, 0 * SIZE(X) FABS t2, a2 LD a2, 1 * SIZE(X) FABS t3, a3 daddu X, X, INCX FABS t4, a4 NOP FABS t5, a5 LD a3, 0 * SIZE(X) FABS t6, a6 LD a4, 1 * SIZE(X) FABS t7, a7 daddu X, X, INCX FABS t8, a8 NOP ADD t1, t1, t2 LD a5, 0 * SIZE(X) ADD t3, t3, t4 LD a6, 1 * SIZE(X) ADD t5, t5, t6 daddu X, X, INCX ADD t7, t7, t8 NOP CMPLT $fcc0, t1, s1 LD a7, 0 * SIZE(X) CMPLT $fcc1, t3, s2 LD a8, 1 * SIZE(X) CMPLT $fcc2, t5, s3 daddu X, X, INCX CMPLT $fcc3, t7, s4 NOP CMOVT s1, t1, $fcc0 daddiu I, I, -1 CMOVT s2, t3, $fcc1 NOP CMOVT s3, t5, $fcc2 bgtz I, .L12 CMOVT s4, t7, $fcc3 NOP .align 3 .L13: FABS t1, a1 FABS t2, a2 FABS t3, a3 FABS t4, a4 FABS t5, a5 FABS t6, a6 FABS t7, a7 FABS t8, a8 ADD t1, t1, t2 ADD t3, t3, t4 ADD t5, t5, t6 ADD t7, t7, t8 CMPLT $fcc0, t1, s1 CMPLT 
$fcc1, t3, s2 CMPLT $fcc2, t5, s3 CMPLT $fcc3, t7, s4 CMOVT s1, t1, $fcc0 CMOVT s2, t3, $fcc1 CMOVT s3, t5, $fcc2 CMOVT s4, t7, $fcc3 .align 3 .L15: andi I, N, 3 blez I, .L998 NOP .align 3 .L16: LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) daddiu I, I, -1 FABS t1, a1 FABS t2, a2 ADD t1, t1, t2 CMPLT $fcc0, t1, s1 CMOVT s1, t1, $fcc0 bgtz I, .L16 daddu X, X, INCX .align 3 .L998: CMPLT $fcc0, s2, s1 CMPLT $fcc1, s4, s3 CMOVT s1, s2, $fcc0 CMOVT s3, s4, $fcc1 CMPLT $fcc0, s3, s1 CMOVT s1, s3, $fcc0 .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/zasum.S000066400000000000000000000106611313527062700170350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define I $2 #define TEMP $3 #define a1 $f2 #define a2 $f3 #define a3 $f4 #define a4 $f5 #define a5 $f6 #define a6 $f7 #define a7 $f8 #define a8 $f9 #define t1 $f10 #define t2 $f11 #define t3 $f12 #define t4 $f13 #define s1 $f0 #define s2 $f1 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif MTC $0, s1 MTC $0, s2 dsll INCX, INCX, ZBASE_SHIFT blez N, .L999 dsra I, N, 2 blez I, .L25 NOP LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) LD a4, 1 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) LD a6, 1 * SIZE(X) daddu X, X, INCX FABS t1, a1 FABS t2, a2 LD a7, 0 * SIZE(X) LD a8, 1 * SIZE(X) FABS t3, a3 FABS t4, a4 daddiu I, I, -1 blez I, .L24 daddu X, X, INCX .align 3 .L23: ADD s1, s1, t1 LD a1, 0 * SIZE(X) FABS t1, a5 daddiu I, I, -1 ADD s2, s2, t2 LD a2, 1 * SIZE(X) FABS t2, a6 daddu X, X, INCX ADD s1, s1, t3 LD a3, 0 * SIZE(X) FABS t3, a7 NOP ADD s2, s2, t4 LD a4, 1 * SIZE(X) FABS t4, a8 daddu X, X, INCX ADD s1, s1, t1 LD a5, 0 * SIZE(X) FABS t1, a1 NOP ADD s2, s2, t2 LD a6, 1 * SIZE(X) FABS t2, a2 daddu X, X, INCX ADD s1, s1, t3 LD a7, 0 * SIZE(X) FABS t3, a3 LD a8, 1 * SIZE(X) ADD s2, s2, t4 daddu X, X, INCX bgtz I, .L23 FABS t4, a4 .align 3 .L24: ADD s1, s1, t1 FABS t1, a5 ADD s2, s2, t2 FABS t2, a6 ADD s1, s1, t3 FABS t3, a7 ADD s2, s2, t4 FABS t4, a8 ADD s1, s1, t1 ADD s2, s2, t2 ADD s1, s1, t3 ADD s2, s2, t4 .align 3 .L25: andi I, N, 3 blez I, .L999 NOP .align 3 .L26: LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) FABS t1, a1 daddiu I, I, -1 FABS t2, a2 daddu X, X, INCX ADD s1, s1, t1 bgtz I, .L26 ADD s2, s2, t2 .align 3 .L999: j $31 ADD s1, s1, s2 EPILOGUE OpenBLAS-0.2.20/kernel/mips64/zaxpy.S000066400000000000000000000216411313527062700170510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $9 #define INCX $10 #define Y $11 #define INCY $8 #define I $2 #define TEMP $3 #define YY $5 #define ALPHA_R $f15 #define ALPHA_I $f16 #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define a5 $f4 #define a6 $f5 #define a7 $f6 #define a8 $f7 #define b1 $f8 #define b2 $f9 #define b3 $f10 #define b4 $f11 #define b5 $f12 #define b6 $f13 #define b7 $f14 #define b8 $f17 #define t1 $f18 #define t2 $f19 #define t3 $f20 #define t4 $f21 #ifndef CONJ #define MADD1 NMSUB #define MADD2 MADD #else #define MADD1 MADD #define MADD2 NMSUB #endif PROLOGUE LDARG INCY, 0($sp) li TEMP, 2 * SIZE #ifndef __64BIT__ daddiu $sp, $sp, -16 sdc1 $f20, 0($sp) sdc1 $f21, 8($sp) #endif blez N, .L999 dsll INCX, INCX, ZBASE_SHIFT bne INCX, TEMP, .L20 dsll INCY, INCY, ZBASE_SHIFT bne INCY, TEMP, .L20 dsra I, N, 2 blez I, .L15 daddiu I, I, -1 LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) LD a2, 1 * SIZE(X) LD b2, 1 * SIZE(Y) LD a3, 2 * SIZE(X) LD b3, 2 * SIZE(Y) LD a4, 3 * SIZE(X) LD b4, 3 * SIZE(Y) LD a5, 4 * SIZE(X) LD b5, 4 * SIZE(Y) LD a6, 5 * SIZE(X) LD b6, 5 * SIZE(Y) LD a7, 6 * SIZE(X) LD b7, 6 * SIZE(Y) LD a8, 7 * SIZE(X) LD b8, 7 * SIZE(Y) blez I, .L13 NOP .align 3 .L12: MADD t1, b1, ALPHA_R, a1 LD b1, 8 * SIZE(Y) MADD t2, b2, ALPHA_I, a1 LD a1, 8 * SIZE(X) MADD t3, b3, ALPHA_R, a3 LD b3, 10 * SIZE(Y) MADD t4, b4, ALPHA_I, a3 LD a3, 10 * SIZE(X) MADD1 t1, t1, ALPHA_I, a2 LD b2, 9 * SIZE(Y) MADD2 t2, t2, ALPHA_R, a2 LD a2, 9 * SIZE(X) MADD1 t3, t3, ALPHA_I, a4 LD b4, 11 * SIZE(Y) MADD2 t4, t4, ALPHA_R, a4 LD a4, 11 * SIZE(X) ST t1, 0 * SIZE(Y) ST t2, 1 * SIZE(Y) ST t3, 2 * SIZE(Y) ST t4, 3 * SIZE(Y) MADD t1, b5, ALPHA_R, a5 LD b5, 12 * SIZE(Y) MADD t2, b6, ALPHA_I, a5 LD a5, 12 * SIZE(X) MADD t3, b7, ALPHA_R, a7 LD b7, 14 * SIZE(Y) MADD t4, b8, ALPHA_I, a7 LD a7, 14 * SIZE(X) MADD1 t1, t1, ALPHA_I, a6 LD b6, 13 * SIZE(Y) MADD2 t2, t2, ALPHA_R, a6 LD a6, 13 * SIZE(X) MADD1 t3, t3, ALPHA_I, a8 LD b8, 15 * SIZE(Y) MADD2 t4, t4, ALPHA_R, a8 LD a8, 15 * SIZE(X) ST t1, 4 * SIZE(Y) ST t2, 5 * SIZE(Y) ST t3, 6 * SIZE(Y) ST t4, 7 * SIZE(Y) daddiu I, I, -1 daddiu Y, Y, 8 * SIZE bgtz I, .L12 daddiu X, X, 8 * SIZE .align 3 .L13: MADD t1, b1, ALPHA_R, a1 MADD t2, b2, ALPHA_I, a1 MADD t3, b3, ALPHA_R, a3 MADD t4, b4, ALPHA_I, a3 MADD1 t1, t1, ALPHA_I, a2 MADD2 t2, t2, ALPHA_R, a2 MADD1 t3, t3, ALPHA_I, a4 MADD2 t4, t4, ALPHA_R, a4 ST t1, 0 * SIZE(Y) MADD t1, b5, ALPHA_R, a5 ST t2, 1 * SIZE(Y) MADD t2, b6, ALPHA_I, a5 ST t3, 2 * SIZE(Y) MADD t3, b7, ALPHA_R, a7 ST t4, 3 * SIZE(Y) MADD t4, b8, ALPHA_I, a7 MADD1 t1, t1, ALPHA_I, a6 MADD2 t2, t2, ALPHA_R, a6 MADD1 t3, t3, ALPHA_I, a8 MADD2 t4, t4, ALPHA_R, a8 ST t1, 4 * SIZE(Y) ST t2, 5 * SIZE(Y) ST t3, 6 * SIZE(Y) ST t4, 7 * SIZE(Y) daddiu X, X, 8 * SIZE daddiu Y, Y, 8 * SIZE .align 3 .L15: andi I, N, 3 blez I, .L999 NOP .align 3 .L16: LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) LD b1, 0 * SIZE(Y) LD b2, 1 * SIZE(Y) MADD t1, b1, ALPHA_R, a1 daddiu X, X, 2 * SIZE MADD t2, b2, ALPHA_I, a1 MADD1 t1, t1, ALPHA_I, a2 daddiu I, I, -1 MADD2 t2, t2, ALPHA_R, a2 daddiu Y, Y, 2 * SIZE ST t1, -2 * SIZE(Y) bgtz I, .L16 ST t2, -1 * SIZE(Y) #ifndef __64BIT__ ldc1 $f20, 0($sp) ldc1 $f21, 8($sp) daddiu $sp, $sp, 
16 #endif j $31 NOP .align 3 .L20: dsra I, N, 2 move YY, Y blez I, .L25 daddiu I, I, -1 LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) LD a2, 1 * SIZE(X) LD b2, 1 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY LD a3, 0 * SIZE(X) LD b3, 0 * SIZE(Y) LD a4, 1 * SIZE(X) LD b4, 1 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY LD a5, 0 * SIZE(X) LD b5, 0 * SIZE(Y) LD a6, 1 * SIZE(X) LD b6, 1 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY LD a7, 0 * SIZE(X) blez I, .L23 LD b7, 0 * SIZE(Y) .align 3 .L22: MADD t1, b1, ALPHA_R, a1 LD b8, 1 * SIZE(Y) daddu Y, Y, INCY MADD t2, b2, ALPHA_I, a1 LD a8, 1 * SIZE(X) daddu X, X, INCX MADD t3, b3, ALPHA_R, a3 LD b1, 0 * SIZE(Y) MADD t4, b4, ALPHA_I, a3 LD a1, 0 * SIZE(X) MADD1 t1, t1, ALPHA_I, a2 LD b2, 1 * SIZE(Y) daddu Y, Y, INCY MADD2 t2, t2, ALPHA_R, a2 LD a2, 1 * SIZE(X) daddu X, X, INCX MADD1 t3, t3, ALPHA_I, a4 LD a3, 0 * SIZE(X) MADD2 t4, t4, ALPHA_R, a4 LD b3, 0 * SIZE(Y) ST t1, 0 * SIZE(YY) ST t2, 1 * SIZE(YY) daddu YY, YY, INCY ST t3, 0 * SIZE(YY) ST t4, 1 * SIZE(YY) daddu YY, YY, INCY MADD t1, b5, ALPHA_R, a5 LD a4, 1 * SIZE(X) daddu X, X, INCX MADD t2, b6, ALPHA_I, a5 LD b4, 1 * SIZE(Y) daddu Y, Y, INCY MADD t3, b7, ALPHA_R, a7 LD b5, 0 * SIZE(Y) MADD t4, b8, ALPHA_I, a7 LD a5, 0 * SIZE(X) MADD1 t1, t1, ALPHA_I, a6 LD b6, 1 * SIZE(Y) daddu Y, Y, INCY MADD2 t2, t2, ALPHA_R, a6 LD a6, 1 * SIZE(X) daddu X, X, INCX MADD1 t3, t3, ALPHA_I, a8 LD b7, 0 * SIZE(Y) MADD2 t4, t4, ALPHA_R, a8 LD a7, 0 * SIZE(X) ST t1, 0 * SIZE(YY) ST t2, 1 * SIZE(YY) daddu YY, YY, INCY ST t3, 0 * SIZE(YY) ST t4, 1 * SIZE(YY) daddu YY, YY, INCY daddiu I, I, -1 bgtz I, .L22 NOP .align 3 .L23: MADD t1, b1, ALPHA_R, a1 LD a8, 1 * SIZE(X) MADD t2, b2, ALPHA_I, a1 LD b8, 1 * SIZE(Y) MADD t3, b3, ALPHA_R, a3 daddu X, X, INCX MADD t4, b4, ALPHA_I, a3 daddu Y, Y, INCY MADD1 t1, t1, ALPHA_I, a2 MADD2 t2, t2, ALPHA_R, a2 MADD1 t3, t3, ALPHA_I, a4 MADD2 t4, t4, ALPHA_R, a4 ST t1, 0 * SIZE(YY) MADD t1, b5, ALPHA_R, a5 ST t2, 1 * SIZE(YY) MADD t2, b6, ALPHA_I, a5 daddu YY, YY, INCY ST t3, 0 * SIZE(YY) MADD t3, b7, ALPHA_R, a7 ST t4, 1 * SIZE(YY) MADD t4, b8, ALPHA_I, a7 daddu YY, YY, INCY MADD1 t1, t1, ALPHA_I, a6 MADD2 t2, t2, ALPHA_R, a6 MADD1 t3, t3, ALPHA_I, a8 MADD2 t4, t4, ALPHA_R, a8 ST t1, 0 * SIZE(YY) ST t2, 1 * SIZE(YY) daddu YY, YY, INCY ST t3, 0 * SIZE(YY) ST t4, 1 * SIZE(YY) daddu YY, YY, INCY .align 3 .L25: andi I, N, 3 blez I, .L999 NOP .align 3 .L26: LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) LD b1, 0 * SIZE(Y) LD b2, 1 * SIZE(Y) MADD t1, b1, ALPHA_R, a1 MADD t2, b2, ALPHA_I, a1 daddu X, X, INCX MADD1 t1, t1, ALPHA_I, a2 MADD2 t2, t2, ALPHA_R, a2 daddiu I, I, -1 ST t1, 0 * SIZE(Y) ST t2, 1 * SIZE(Y) bgtz I, .L26 daddu Y, Y, INCY .align 3 .L999: #ifndef __64BIT__ ldc1 $f20, 0($sp) ldc1 $f21, 8($sp) daddiu $sp, $sp, 16 #endif j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/zcopy.S000066400000000000000000000131101313527062700170320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
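/* .L22 (zcopy, strided path): copies four complex elements per iteration; stores of the values loaded on the previous pass are interleaved with the loads for the next pass and the INCX/INCY pointer updates (software pipelining), with the loop-closing branch filled by the final pointer bump. */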
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define Y $7 #define INCY $8 #define I $2 #define TEMP $3 #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define a5 $f4 #define a6 $f5 #define a7 $f6 #define a8 $f7 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) LDINT INCY, 0(INCY) #endif li TEMP, 2 * SIZE NOP blez N, .L999 dsll INCX, INCX, ZBASE_SHIFT bne INCX, TEMP, .L20 dsll INCY, INCY, ZBASE_SHIFT bne INCY, TEMP, .L20 dsra I, N, 2 blez I, .L15 daddiu I, I, -1 LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) LD a3, 2 * SIZE(X) LD a4, 3 * SIZE(X) LD a5, 4 * SIZE(X) LD a6, 5 * SIZE(X) LD a7, 6 * SIZE(X) LD a8, 7 * SIZE(X) blez I, .L13 NOP .align 3 .L12: ST a1, 0 * SIZE(Y) LD a1, 8 * SIZE(X) ST a2, 1 * SIZE(Y) LD a2, 9 * SIZE(X) ST a3, 2 * SIZE(Y) LD a3, 10 * SIZE(X) ST a4, 3 * SIZE(Y) LD a4, 11 * SIZE(X) ST a5, 4 * SIZE(Y) LD a5, 12 * SIZE(X) ST a6, 5 * SIZE(Y) LD a6, 13 * SIZE(X) ST a7, 6 * SIZE(Y) LD a7, 14 * SIZE(X) ST a8, 7 * SIZE(Y) LD a8, 15 * SIZE(X) daddiu I, I, -1 daddiu X, X, 8 * SIZE bgtz I, .L12 daddiu Y, Y, 8 * SIZE .align 3 .L13: ST a1, 0 * SIZE(Y) ST a2, 1 * SIZE(Y) ST a3, 2 * SIZE(Y) ST a4, 3 * SIZE(Y) ST a5, 4 * SIZE(Y) ST a6, 5 * SIZE(Y) ST a7, 6 * SIZE(Y) ST a8, 7 * SIZE(Y) daddiu X, X, 8 * SIZE daddiu Y, Y, 8 * SIZE .align 3 .L15: andi I, N, 3 blez I, .L999 NOP .align 3 .L16: LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) daddiu X, X, 2 * SIZE daddiu Y, Y, 2 * SIZE ST a1, -2 * SIZE(Y) daddiu I, I, -1 bgtz I, .L16 ST a2, -1 * SIZE(Y) j $31 NOP .align 3 .L20: dsra I, N, 2 blez I, .L25 daddiu I, I, -1 LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) LD a4, 1 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) LD a6, 1 * SIZE(X) daddu X, X, INCX LD a7, 0 * SIZE(X) LD a8, 1 * SIZE(X) blez I, .L23 daddu X, X, INCX .align 3 .L22: ST a1, 0 * SIZE(Y) LD a1, 0 * SIZE(X) ST a2, 1 * SIZE(Y) daddu Y, Y, INCY LD a2, 1 * SIZE(X) daddu X, X, INCX ST a3, 0 * SIZE(Y) LD a3, 0 * SIZE(X) ST a4, 1 * SIZE(Y) daddu Y, Y, INCY LD a4, 1 * SIZE(X) daddu X, X, INCX ST a5, 0 * SIZE(Y) LD a5, 0 * SIZE(X) ST a6, 1 * SIZE(Y) daddu Y, Y, INCY LD a6, 1 * SIZE(X) 
daddu X, X, INCX ST a7, 0 * SIZE(Y) LD a7, 0 * SIZE(X) ST a8, 1 * SIZE(Y) daddu Y, Y, INCY LD a8, 1 * SIZE(X) daddiu I, I, -1 bgtz I, .L22 daddu X, X, INCX .align 3 .L23: ST a1, 0 * SIZE(Y) ST a2, 1 * SIZE(Y) daddu Y, Y, INCY ST a3, 0 * SIZE(Y) ST a4, 1 * SIZE(Y) daddu Y, Y, INCY ST a5, 0 * SIZE(Y) ST a6, 1 * SIZE(Y) daddu Y, Y, INCY ST a7, 0 * SIZE(Y) ST a8, 1 * SIZE(Y) daddu Y, Y, INCY .align 3 .L25: andi I, N, 3 blez I, .L999 NOP .align 3 .L26: LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) daddu X, X, INCX daddiu I, I, -1 ST a1, 0 * SIZE(Y) ST a2, 1 * SIZE(Y) bgtz I, .L26 daddu Y, Y, INCY .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/zdot.S000066400000000000000000000173061313527062700166610ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define Y $7 #define INCY $8 #define I $2 #define TEMP $3 #define a1 $f4 #define a2 $f5 #define a3 $f6 #define a4 $f7 #define b1 $f8 #define b2 $f9 #define b3 $f10 #define b4 $f11 #define s1 $f0 #define s2 $f1 #define s3 $f2 #define s4 $f3 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) LDINT INCY, 0(INCY) #endif MTC $0, s1 MOV s2, s1 MOV s3, s2 MOV s4, s3 dsll INCX, INCX, ZBASE_SHIFT li TEMP, 2 * SIZE blez N, .L999 dsll INCY, INCY, ZBASE_SHIFT bne INCX, TEMP, .L20 dsra I, N, 2 bne INCY, TEMP, .L20 NOP blez I, .L15 NOP LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) LD b1, 0 * SIZE(Y) daddiu I, I, -1 blez I, .L14 LD b2, 1 * SIZE(Y) .align 3 .L13: MADD s1, s1, a1, b1 LD a3, 2 * SIZE(X) MADD s2, s2, a2, b1 LD a4, 3 * SIZE(X) MADD s3, s3, a1, b2 LD b3, 2 * SIZE(Y) MADD s4, s4, a2, b2 LD b4, 3 * SIZE(Y) MADD s1, s1, a3, b3 LD a1, 4 * SIZE(X) MADD s2, s2, a4, b3 LD a2, 5 * SIZE(X) MADD s3, s3, a3, b4 LD b1, 4 * SIZE(Y) MADD s4, s4, a4, b4 LD b2, 5 * SIZE(Y) MADD s1, s1, a1, b1 LD a3, 6 * SIZE(X) MADD s2, s2, a2, b1 LD a4, 7 * SIZE(X) MADD s3, s3, a1, b2 LD b3, 6 * SIZE(Y) MADD s4, s4, a2, b2 LD b4, 7 * SIZE(Y) MADD s1, s1, a3, b3 LD a1, 8 * SIZE(X) MADD s2, s2, a4, b3 LD a2, 9 * SIZE(X) MADD s3, s3, a3, b4 LD b1, 8 * SIZE(Y) MADD s4, s4, a4, b4 LD b2, 9 * SIZE(Y) daddiu I, I, -1 daddiu X, X, 8 * SIZE bgtz I, .L13 daddiu Y, Y, 8 * SIZE .align 3 .L14: MADD s1, s1, a1, b1 LD a3, 2 * SIZE(X) MADD s2, s2, a2, b1 LD a4, 3 * SIZE(X) MADD s3, s3, a1, b2 LD b3, 2 * SIZE(Y) MADD s4, s4, a2, b2 LD b4, 3 * SIZE(Y) MADD s1, s1, a3, b3 LD a1, 4 * SIZE(X) MADD s2, s2, a4, b3 LD a2, 5 * SIZE(X) MADD s3, s3, a3, b4 LD b1, 4 * SIZE(Y) MADD s4, s4, a4, b4 LD b2, 5 * SIZE(Y) MADD s1, s1, a1, b1 LD a3, 6 * SIZE(X) MADD s2, s2, a2, b1 LD a4, 7 * SIZE(X) MADD s3, s3, a1, b2 LD b3, 6 * SIZE(Y) MADD s4, s4, a2, b2 LD b4, 7 * SIZE(Y) MADD s1, s1, a3, b3 daddiu X, X, 8 * SIZE MADD s2, s2, a4, b3 daddiu Y, Y, 8 * SIZE MADD s3, s3, a3, b4 MADD s4, s4, a4, b4 .align 3 .L15: andi I, N, 3 blez I, .L999 NOP LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) LD b1, 0 * SIZE(Y) daddiu I, I, -1 blez I, .L17 LD b2, 1 * SIZE(Y) .align 3 .L16: MADD s1, s1, a1, b1 daddiu I, I, -1 MADD s2, s2, a2, b1 LD b1, 2 * SIZE(Y) MADD s3, s3, a1, b2 LD a1, 2 * SIZE(X) MADD s4, s4, a2, b2 LD a2, 3 * SIZE(X) LD b2, 3 * SIZE(Y) daddiu X, X, 2 * SIZE bgtz I, .L16 daddiu Y, Y, 2 * SIZE .align 3 .L17: MADD s1, s1, a1, b1 MADD s2, s2, a2, b1 NOP MADD s3, s3, a1, b2 j .L999 MADD s4, s4, a2, b2 .align 3 .L20: #ifdef F_INTERFACE bgez INCX, .L21 daddiu TEMP, N, -1 mult TEMP, INCX mflo TEMP dsub X, X, TEMP .align 3 .L21: bgez INCY, .L22 daddiu TEMP, N, -1 mult TEMP, INCY mflo TEMP dsub Y, Y, TEMP .align 3 .L22: #endif blez I, .L25 NOP LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) LD b1, 0 * SIZE(Y) LD b2, 1 * SIZE(Y) dadd X, X, INCX daddiu I, I, -1 blez I, .L24 dadd Y, Y, INCY .align 3 .L23: MADD s1, s1, a1, b1 LD a3, 0 * SIZE(X) MADD s2, s2, a2, b1 LD a4, 1 * SIZE(X) MADD s3, s3, a1, b2 LD b3, 0 * SIZE(Y) MADD s4, s4, a2, b2 LD b4, 1 * SIZE(Y) dadd X, X, INCX dadd Y, Y, INCY MADD s1, s1, a3, b3 LD a1, 0 * SIZE(X) MADD s2, s2, a4, b3 LD a2, 1 * SIZE(X) MADD s3, s3, a3, b4 LD b1, 0 * SIZE(Y) MADD s4, s4, a4, b4 LD b2, 1 * SIZE(Y) dadd X, X, INCX dadd Y, Y, INCY MADD s1, s1, a1, b1 LD a3, 0 * SIZE(X) MADD s2, s2, a2, b1 LD a4, 1 * SIZE(X) MADD s3, s3, a1, b2 LD b3, 0 * SIZE(Y) MADD s4, s4, a2, b2 LD b4, 1 * SIZE(Y) dadd X, 
X, INCX dadd Y, Y, INCY MADD s1, s1, a3, b3 LD a1, 0 * SIZE(X) MADD s2, s2, a4, b3 LD a2, 1 * SIZE(X) MADD s3, s3, a3, b4 LD b1, 0 * SIZE(Y) MADD s4, s4, a4, b4 LD b2, 1 * SIZE(Y) dadd X, X, INCX daddiu I, I, -1 bgtz I, .L23 dadd Y, Y, INCY .align 3 .L24: MADD s1, s1, a1, b1 LD a3, 0 * SIZE(X) MADD s2, s2, a2, b1 LD a4, 1 * SIZE(X) MADD s3, s3, a1, b2 LD b3, 0 * SIZE(Y) MADD s4, s4, a2, b2 LD b4, 1 * SIZE(Y) dadd X, X, INCX dadd Y, Y, INCY MADD s1, s1, a3, b3 LD a1, 0 * SIZE(X) MADD s2, s2, a4, b3 LD a2, 1 * SIZE(X) MADD s3, s3, a3, b4 LD b1, 0 * SIZE(Y) MADD s4, s4, a4, b4 LD b2, 1 * SIZE(Y) dadd X, X, INCX dadd Y, Y, INCY MADD s1, s1, a1, b1 LD a3, 0 * SIZE(X) MADD s2, s2, a2, b1 LD a4, 1 * SIZE(X) MADD s3, s3, a1, b2 LD b3, 0 * SIZE(Y) MADD s4, s4, a2, b2 LD b4, 1 * SIZE(Y) MADD s1, s1, a3, b3 dadd X, X, INCX MADD s2, s2, a4, b3 dadd Y, Y, INCY MADD s3, s3, a3, b4 MADD s4, s4, a4, b4 .align 3 .L25: andi I, N, 3 blez I, .L999 NOP .align 3 .L26: LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) LD b1, 0 * SIZE(Y) LD b2, 1 * SIZE(Y) MADD s1, s1, a1, b1 MADD s2, s2, a2, b1 MADD s3, s3, a1, b2 MADD s4, s4, a2, b2 dadd X, X, INCX dadd Y, Y, INCY daddiu I, I, -1 bgtz I, .L26 NOP .align 3 .L999: NOP #ifndef CONJ SUB s1, s1, s4 #else ADD s1, s1, s4 #endif j $31 #ifndef CONJ ADD s3, s3, s2 #else SUB s3, s3, s2 #endif EPILOGUE OpenBLAS-0.2.20/kernel/mips64/zgemm3m_kernel.S000066400000000000000000000731641313527062700206240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M $4 #define N $5 #define K $6 #define A $9 #define B $10 #define C $11 #define LDC $8 #define AO $12 #define BO $13 #define I $2 #define J $3 #define L $7 #define CO1 $14 #define CO2 $15 #define CO3 $16 #define CO4 $17 #define CO5 $18 #define CO6 $19 #define CO7 $20 #define CO8 $21 #if defined(TRMMKERNEL) #define OFFSET $22 #define KK $23 #define TEMP $24 #endif #define a1 $f0 #define a2 $f1 #define a3 $f28 #define a4 $f29 #define b1 $f2 #define b2 $f3 #define b3 $f4 #define b4 $f5 #define b5 $f6 #define b6 $f7 #define b7 $f8 #define b8 $f9 #define a5 b8 #define c11 $f10 #define c12 $f11 #define c21 $f12 #define c22 $f13 #define c31 $f14 #define c32 $f17 #define c41 $f18 #define c42 $f19 #define c51 $f20 #define c52 $f21 #define c61 $f22 #define c62 $f23 #define c71 $f24 #define c72 $f25 #define c81 $f26 #define c82 $f27 #define ALPHA_R $f15 #define ALPHA_I $f16 PROLOGUE daddiu $sp, $sp, -128 SDARG $16, 0($sp) SDARG $17, 8($sp) SDARG $18, 16($sp) SDARG $19, 24($sp) SDARG $20, 32($sp) SDARG $21, 40($sp) sdc1 $f24, 48($sp) sdc1 $f25, 56($sp) sdc1 $f26, 64($sp) sdc1 $f27, 72($sp) sdc1 $f28, 80($sp) sdc1 $f29, 88($sp) LDARG LDC, 128($sp) dsll LDC, LDC, ZBASE_SHIFT dsra J, N, 3 blez J, .L30 nop .L10: move CO1, C MTC $0, c11 daddu CO2, C, LDC move AO, A daddu CO3, CO2, LDC daddiu J, J, -1 daddu CO4, CO3, LDC MOV c21, c11 daddu CO5, CO4, LDC MOV c31, c11 daddu CO6, CO5, LDC MOV c41, c11 daddu CO7, CO6, LDC MOV c51, c11 daddu CO8, CO7, LDC dsra I, M, 1 daddu C, CO8, LDC blez I, .L20 MOV c61, c11 .L11: LD a1, 0 * SIZE(AO) MOV c71, c11 LD b1, 0 * SIZE(B) MOV c81, c11 LD a3, 4 * SIZE(AO) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 dsra L, K, 2 MOV c32, c11 LD b3, 2 * SIZE(B) MOV c42, c11 LD b4, 3 * SIZE(B) MOV c52, c11 LD b5, 4 * SIZE(B) MOV c62, c11 LD b6, 8 * SIZE(B) MOV c72, c11 LD b7, 12 * SIZE(B) MOV c82, c11 blez L, .L15 move BO, B MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 daddiu L, L, -1 MADD c31, c31, a1, b3 blez L, .L13 MADD c41, c41, a1, b4 NOP .align 3 .L12: MADD c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD c51, c51, a1, b5 LD a4, 2 * SIZE(AO) MADD c61, c61, a1, b2 NOP MADD c71, c71, a1, b3 NOP MADD c81, c81, a1, b4 LD a1, 8 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 20 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 9 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 10 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 11 * SIZE(BO) MADD c11, c11, a4, b6 LD a2, 3 * SIZE(AO) MADD c21, c21, a4, b2 NOP MADD c31, c31, a4, b3 NOP MADD c41, c41, a4, b4 NOP MADD c12, c12, a2, b6 LD b6, 24 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD c51, c51, a4, b7 NOP MADD c61, c61, a4, b2 NOP MADD c71, c71, a4, b3 NOP MADD c81, c81, a4, b4 NOP MADD c52, c52, a2, b7 LD b7, 28 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 17 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 18 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 19 * SIZE(BO) MADD c11, c11, a3, b1 LD a2, 5 * SIZE(AO) MADD c21, c21, a3, b2 NOP MADD c31, c31, a3, b3 NOP MADD c41, c41, a3, b4 NOP MADD c12, c12, a2, b1 LD b1, 32 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 21 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 22 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 23 * SIZE(BO) MADD c51, c51, a3, b5 LD a4, 6 * SIZE(AO) MADD c61, c61, a3, b2 NOP MADD c71, 
c71, a3, b3 NOP MADD c81, c81, a3, b4 LD a3, 12 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 36 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 25 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 26 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 27 * SIZE(BO) MADD c11, c11, a4, b6 LD a2, 7 * SIZE(AO) MADD c21, c21, a4, b2 NOP MADD c31, c31, a4, b3 NOP MADD c41, c41, a4, b4 daddiu L, L, -1 MADD c12, c12, a2, b6 LD b6, 40 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 29 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 30 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 31 * SIZE(BO) MADD c51, c51, a4, b7 daddiu BO, BO, 32 * SIZE MADD c61, c61, a4, b2 daddiu AO, AO, 8 * SIZE MADD c71, c71, a4, b3 NOP MADD c81, c81, a4, b4 NOP MADD c52, c52, a2, b7 LD b7, 12 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 3 * SIZE(BO) MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 NOP MADD c31, c31, a1, b3 bgtz L, .L12 MADD c41, c41, a1, b4 NOP .align 3 .L13: MADD c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD c51, c51, a1, b5 NOP MADD c61, c61, a1, b2 LD a4, 2 * SIZE(AO) MADD c71, c71, a1, b3 NOP MADD c81, c81, a1, b4 LD a1, 8 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 20 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 9 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 10 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 11 * SIZE(BO) MADD c11, c11, a4, b6 LD a2, 3 * SIZE(AO) MADD c21, c21, a4, b2 NOP MADD c31, c31, a4, b3 NOP MADD c41, c41, a4, b4 NOP MADD c12, c12, a2, b6 LD b6, 24 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD c51, c51, a4, b7 NOP MADD c61, c61, a4, b2 NOP MADD c71, c71, a4, b3 NOP MADD c81, c81, a4, b4 NOP MADD c52, c52, a2, b7 LD b7, 28 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 17 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 18 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 19 * SIZE(BO) MADD c11, c11, a3, b1 LD a2, 5 * SIZE(AO) MADD c21, c21, a3, b2 NOP MADD c31, c31, a3, b3 NOP MADD c41, c41, a3, b4 NOP MADD c12, c12, a2, b1 LD b1, 32 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 21 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 22 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 23 * SIZE(BO) MADD c51, c51, a3, b5 NOP MADD c61, c61, a3, b2 LD a4, 6 * SIZE(AO) MADD c71, c71, a3, b3 NOP MADD c81, c81, a3, b4 LD a3, 12 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 36 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 25 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 26 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 27 * SIZE(BO) MADD c11, c11, a4, b6 LD a2, 7 * SIZE(AO) MADD c21, c21, a4, b2 NOP MADD c31, c31, a4, b3 NOP MADD c41, c41, a4, b4 NOP MADD c12, c12, a2, b6 LD b6, 40 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 29 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 30 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 31 * SIZE(BO) MADD c51, c51, a4, b7 daddiu BO, BO, 32 * SIZE MADD c61, c61, a4, b2 daddiu AO, AO, 8 * SIZE MADD c71, c71, a4, b3 NOP MADD c81, c81, a4, b4 NOP MADD c52, c52, a2, b7 LD b7, 12 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD c82, c82, a2, b4 LD b4, 3 * SIZE(BO) .align 3 .L15: andi L, K, 3 NOP blez L, .L18 NOP .align 3 .L16: MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 NOP MADD c31, c31, a1, b3 NOP MADD c41, c41, a1, b4 NOP MADD c12, c12, a2, b1 LD b1, 8 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) 
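/* .L16 (zgemm3m, M=2/N=8 block): K mod 4 remainder loop; each pass performs one rank-1 update of the sixteen accumulators c11..c82 using two A values (a1, a2) and eight B values, before the alpha_r/alpha_i scaled update of C at .L18. */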
MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD c51, c51, a1, b5 daddiu L, L, -1 MADD c61, c61, a1, b2 daddiu AO, AO, 2 * SIZE MADD c71, c71, a1, b3 daddiu BO, BO, 8 * SIZE MADD c81, c81, a1, b4 LD a1, 0 * SIZE(AO) MADD c52, c52, a2, b5 LD b5, 4 * SIZE(BO) MADD c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD c82, c82, a2, b4 bgtz L, .L16 LD b4, 3 * SIZE(BO) .L18: LD $f0, 0 * SIZE(CO1) LD $f1, 1 * SIZE(CO1) LD $f2, 2 * SIZE(CO1) LD $f3, 3 * SIZE(CO1) LD $f4, 0 * SIZE(CO2) MADD $f0, $f0, ALPHA_R, c11 LD $f5, 1 * SIZE(CO2) MADD $f1, $f1, ALPHA_I, c11 LD $f6, 2 * SIZE(CO2) MADD $f2, $f2, ALPHA_R, c12 LD $f7, 3 * SIZE(CO2) MADD $f3, $f3, ALPHA_I, c12 MADD $f4, $f4, ALPHA_R, c21 ST $f0, 0 * SIZE(CO1) MADD $f5, $f5, ALPHA_I, c21 ST $f1, 1 * SIZE(CO1) MADD $f6, $f6, ALPHA_R, c22 ST $f2, 2 * SIZE(CO1) MADD $f7, $f7, ALPHA_I, c22 ST $f3, 3 * SIZE(CO1) LD $f0, 0 * SIZE(CO3) LD $f1, 1 * SIZE(CO3) LD $f2, 2 * SIZE(CO3) LD $f3, 3 * SIZE(CO3) ST $f4, 0 * SIZE(CO2) ST $f5, 1 * SIZE(CO2) ST $f6, 2 * SIZE(CO2) ST $f7, 3 * SIZE(CO2) LD $f4, 0 * SIZE(CO4) LD $f5, 1 * SIZE(CO4) LD $f6, 2 * SIZE(CO4) LD $f7, 3 * SIZE(CO4) MADD $f0, $f0, ALPHA_R, c31 MADD $f1, $f1, ALPHA_I, c31 MADD $f2, $f2, ALPHA_R, c32 MADD $f3, $f3, ALPHA_I, c32 MADD $f4, $f4, ALPHA_R, c41 ST $f0, 0 * SIZE(CO3) MADD $f5, $f5, ALPHA_I, c41 ST $f1, 1 * SIZE(CO3) MADD $f6, $f6, ALPHA_R, c42 ST $f2, 2 * SIZE(CO3) MADD $f7, $f7, ALPHA_I, c42 ST $f3, 3 * SIZE(CO3) LD $f0, 0 * SIZE(CO5) LD $f1, 1 * SIZE(CO5) LD $f2, 2 * SIZE(CO5) LD $f3, 3 * SIZE(CO5) ST $f4, 0 * SIZE(CO4) ST $f5, 1 * SIZE(CO4) ST $f6, 2 * SIZE(CO4) ST $f7, 3 * SIZE(CO4) LD $f4, 0 * SIZE(CO6) LD $f5, 1 * SIZE(CO6) LD $f6, 2 * SIZE(CO6) LD $f7, 3 * SIZE(CO6) MADD $f0, $f0, ALPHA_R, c51 daddiu CO1,CO1, 4 * SIZE MADD $f1, $f1, ALPHA_I, c51 daddiu CO2,CO2, 4 * SIZE MADD $f2, $f2, ALPHA_R, c52 daddiu CO3,CO3, 4 * SIZE MADD $f3, $f3, ALPHA_I, c52 daddiu CO4,CO4, 4 * SIZE MADD $f4, $f4, ALPHA_R, c61 ST $f0, 0 * SIZE(CO5) MADD $f5, $f5, ALPHA_I, c61 ST $f1, 1 * SIZE(CO5) MADD $f6, $f6, ALPHA_R, c62 ST $f2, 2 * SIZE(CO5) MADD $f7, $f7, ALPHA_I, c62 ST $f3, 3 * SIZE(CO5) LD $f0, 0 * SIZE(CO7) LD $f1, 1 * SIZE(CO7) LD $f2, 2 * SIZE(CO7) LD $f3, 3 * SIZE(CO7) ST $f4, 0 * SIZE(CO6) ST $f5, 1 * SIZE(CO6) ST $f6, 2 * SIZE(CO6) ST $f7, 3 * SIZE(CO6) LD $f4, 0 * SIZE(CO8) daddiu I, I, -1 LD $f5, 1 * SIZE(CO8) MTC $0, c11 LD $f6, 2 * SIZE(CO8) LD $f7, 3 * SIZE(CO8) MADD $f0, $f0, ALPHA_R, c71 daddiu CO5,CO5, 4 * SIZE MADD $f1, $f1, ALPHA_I, c71 daddiu CO6,CO6, 4 * SIZE MADD $f2, $f2, ALPHA_R, c72 daddiu CO7,CO7, 4 * SIZE MADD $f3, $f3, ALPHA_I, c72 daddiu CO8,CO8, 4 * SIZE MADD $f4, $f4, ALPHA_R, c81 ST $f0, -4 * SIZE(CO7) MADD $f5, $f5, ALPHA_I, c81 ST $f1, -3 * SIZE(CO7) MADD $f6, $f6, ALPHA_R, c82 ST $f2, -2 * SIZE(CO7) MADD $f7, $f7, ALPHA_I, c82 ST $f3, -1 * SIZE(CO7) ST $f4, -4 * SIZE(CO8) MOV c21, c11 ST $f5, -3 * SIZE(CO8) MOV c31, c11 ST $f6, -2 * SIZE(CO8) MOV c41, c11 ST $f7, -1 * SIZE(CO8) MOV c51, c11 bgtz I, .L11 MOV c61, c11 .align 3 .L20: andi I, M, 1 MOV c61, c11 blez I, .L29 MOV c71, c11 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) LD b5, 4 * SIZE(B) LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) dsra L, K, 2 MOV c81, c11 blez L, .L25 move BO, B .align 3 .L22: MADD c11, c11, a1, b1 LD b1, 16 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a1, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a1, b4 LD b4, 7 * SIZE(BO) MADD c51, c51, a1, 
b5 LD b5, 20 * SIZE(BO) MADD c61, c61, a1, b2 LD b2, 9 * SIZE(BO) MADD c71, c71, a1, b3 LD b3, 10 * SIZE(BO) MADD c81, c81, a1, b4 LD b4, 11 * SIZE(BO) LD a1, 4 * SIZE(AO) daddiu L, L, -1 MADD c11, c11, a2, b6 LD b6, 24 * SIZE(BO) MADD c21, c21, a2, b2 LD b2, 13 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 14 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 15 * SIZE(BO) MADD c51, c51, a2, b7 LD b7, 28 * SIZE(BO) MADD c61, c61, a2, b2 LD b2, 17 * SIZE(BO) MADD c71, c71, a2, b3 LD b3, 18 * SIZE(BO) MADD c81, c81, a2, b4 LD b4, 19 * SIZE(BO) LD a2, 5 * SIZE(AO) daddiu AO, AO, 4 * SIZE MADD c11, c11, a3, b1 LD b1, 32 * SIZE(BO) MADD c21, c21, a3, b2 LD b2, 21 * SIZE(BO) MADD c31, c31, a3, b3 LD b3, 22 * SIZE(BO) MADD c41, c41, a3, b4 LD b4, 23 * SIZE(BO) MADD c51, c51, a3, b5 LD b5, 36 * SIZE(BO) MADD c61, c61, a3, b2 LD b2, 25 * SIZE(BO) MADD c71, c71, a3, b3 LD b3, 26 * SIZE(BO) MADD c81, c81, a3, b4 LD b4, 27 * SIZE(BO) LD a3, 2 * SIZE(AO) daddiu BO, BO, 32 * SIZE MADD c11, c11, a4, b6 LD b6, 8 * SIZE(BO) MADD c21, c21, a4, b2 LD b2, -3 * SIZE(BO) MADD c31, c31, a4, b3 LD b3, -2 * SIZE(BO) MADD c41, c41, a4, b4 LD b4, -1 * SIZE(BO) MADD c51, c51, a4, b7 LD b7, 12 * SIZE(BO) MADD c61, c61, a4, b2 LD b2, 1 * SIZE(BO) MADD c71, c71, a4, b3 LD b3, 2 * SIZE(BO) MADD c81, c81, a4, b4 LD b4, 3 * SIZE(BO) bgtz L, .L22 LD a4, 3 * SIZE(AO) .align 3 .L25: andi L, K, 3 NOP blez L, .L28 NOP .align 3 .L26: MADD c11, c11, a1, b1 LD b1, 8 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a1, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a1, b4 LD b4, 7 * SIZE(BO) daddiu L, L, -1 MOV a2, a2 daddiu AO, AO, 1 * SIZE daddiu BO, BO, 8 * SIZE MADD c51, c51, a1, b5 LD b5, 4 * SIZE(BO) MADD c61, c61, a1, b2 LD b2, 1 * SIZE(BO) MADD c71, c71, a1, b3 LD b3, 2 * SIZE(BO) MADD c81, c81, a1, b4 LD a1, 0 * SIZE(AO) bgtz L, .L26 LD b4, 3 * SIZE(BO) .L28: LD $f0, 0 * SIZE(CO1) LD $f1, 1 * SIZE(CO1) LD $f2, 0 * SIZE(CO2) LD $f3, 1 * SIZE(CO2) LD $f4, 0 * SIZE(CO3) MADD $f0, $f0, ALPHA_R, c11 LD $f5, 1 * SIZE(CO3) MADD $f1, $f1, ALPHA_I, c11 LD $f6, 0 * SIZE(CO4) MADD $f2, $f2, ALPHA_R, c21 LD $f7, 1 * SIZE(CO4) MADD $f3, $f3, ALPHA_I, c21 MADD $f4, $f4, ALPHA_R, c31 ST $f0, 0 * SIZE(CO1) MADD $f5, $f5, ALPHA_I, c31 ST $f1, 1 * SIZE(CO1) MADD $f6, $f6, ALPHA_R, c41 ST $f2, 0 * SIZE(CO2) MADD $f7, $f7, ALPHA_I, c41 ST $f3, 1 * SIZE(CO2) LD $f0, 0 * SIZE(CO5) LD $f1, 1 * SIZE(CO5) LD $f2, 0 * SIZE(CO6) LD $f3, 1 * SIZE(CO6) ST $f4, 0 * SIZE(CO3) ST $f5, 1 * SIZE(CO3) ST $f6, 0 * SIZE(CO4) ST $f7, 1 * SIZE(CO4) LD $f4, 0 * SIZE(CO7) MADD $f0, $f0, ALPHA_R, c51 LD $f5, 1 * SIZE(CO7) MADD $f1, $f1, ALPHA_I, c51 LD $f6, 0 * SIZE(CO8) MADD $f2, $f2, ALPHA_R, c61 LD $f7, 1 * SIZE(CO8) MADD $f3, $f3, ALPHA_I, c61 MADD $f4, $f4, ALPHA_R, c71 ST $f0, 0 * SIZE(CO5) MADD $f5, $f5, ALPHA_I, c71 ST $f1, 1 * SIZE(CO5) MADD $f6, $f6, ALPHA_R, c81 ST $f2, 0 * SIZE(CO6) MADD $f7, $f7, ALPHA_I, c81 ST $f3, 1 * SIZE(CO6) ST $f4, 0 * SIZE(CO7) ST $f5, 1 * SIZE(CO7) ST $f6, 0 * SIZE(CO8) ST $f7, 1 * SIZE(CO8) .align 3 .L29: bgtz J, .L10 move B, BO .align 3 .L30: andi J, N, 4 blez J, .L50 move AO, A move CO1, C MTC $0, c11 daddu CO2, C, LDC daddu CO3, CO2, LDC daddu CO4, CO3, LDC MOV c21, c11 daddu C, CO4, LDC MOV c31, c11 dsra I, M, 1 blez I, .L40 MOV c41, c11 .L31: LD a1, 0 * SIZE(AO) LD a3, 4 * SIZE(AO) LD b1, 0 * SIZE(B) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 LD b3, 2 * SIZE(B) MOV c32, c11 LD b4, 3 * SIZE(B) MOV c42, c11 LD b5, 4 * SIZE(B) dsra L, K, 2 LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) blez L, .L35 move BO, B .align 3 .L32: MADD c11, 
c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 daddiu L, L, -1 MADD c31, c31, a1, b3 NOP MADD c41, c41, a1, b4 LD a1, 2 * SIZE(AO) MADD c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD c11, c11, a1, b5 LD a2, 3 * SIZE(AO) MADD c21, c21, a1, b2 NOP MADD c31, c31, a1, b3 NOP MADD c41, c41, a1, b4 LD a1, 8 * SIZE(AO) MADD c12, c12, a2, b5 LD b5, 20 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 9 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 10 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 11 * SIZE(BO) MADD c11, c11, a3, b6 LD a2, 5 * SIZE(AO) MADD c21, c21, a3, b2 NOP MADD c31, c31, a3, b3 NOP MADD c41, c41, a3, b4 LD a3, 6 * SIZE(AO) MADD c12, c12, a2, b6 LD b6, 24 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD c11, c11, a3, b7 LD a2, 7 * SIZE(AO) MADD c21, c21, a3, b2 daddiu AO, AO, 8 * SIZE MADD c31, c31, a3, b3 daddiu BO, BO, 16 * SIZE MADD c41, c41, a3, b4 LD a3, 4 * SIZE(AO) MADD c12, c12, a2, b7 LD b7, 12 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 1 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 2 * SIZE(BO) MADD c42, c42, a2, b4 NOP bgtz L, .L32 LD b4, 3 * SIZE(BO) .align 3 .L35: andi L, K, 3 NOP blez L, .L38 NOP .align 3 .L36: MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 daddiu L, L, -1 MADD c31, c31, a1, b3 daddiu AO, AO, 2 * SIZE MADD c41, c41, a1, b4 LD a1, 0 * SIZE(AO) MADD c12, c12, a2, b1 LD b1, 4 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD c42, c42, a2, b4 LD b4, 7 * SIZE(BO) bgtz L, .L36 daddiu BO, BO, 4 * SIZE .L38: LD $f0, 0 * SIZE(CO1) LD $f1, 1 * SIZE(CO1) LD $f2, 2 * SIZE(CO1) LD $f3, 3 * SIZE(CO1) LD $f4, 0 * SIZE(CO2) LD $f5, 1 * SIZE(CO2) LD $f6, 2 * SIZE(CO2) LD $f7, 3 * SIZE(CO2) MADD $f0, $f0, ALPHA_R, c11 MADD $f1, $f1, ALPHA_I, c11 MADD $f2, $f2, ALPHA_R, c12 MADD $f3, $f3, ALPHA_I, c12 MADD $f4, $f4, ALPHA_R, c21 ST $f0, 0 * SIZE(CO1) MADD $f5, $f5, ALPHA_I, c21 ST $f1, 1 * SIZE(CO1) MADD $f6, $f6, ALPHA_R, c22 ST $f2, 2 * SIZE(CO1) MADD $f7, $f7, ALPHA_I, c22 ST $f3, 3 * SIZE(CO1) LD $f0, 0 * SIZE(CO3) LD $f1, 1 * SIZE(CO3) LD $f2, 2 * SIZE(CO3) LD $f3, 3 * SIZE(CO3) ST $f4, 0 * SIZE(CO2) MADD $f0, $f0, ALPHA_R, c31 ST $f5, 1 * SIZE(CO2) MADD $f1, $f1, ALPHA_I, c31 ST $f6, 2 * SIZE(CO2) MADD $f2, $f2, ALPHA_R, c32 ST $f7, 3 * SIZE(CO2) MADD $f3, $f3, ALPHA_I, c32 LD $f4, 0 * SIZE(CO4) LD $f5, 1 * SIZE(CO4) LD $f6, 2 * SIZE(CO4) LD $f7, 3 * SIZE(CO4) MADD $f4, $f4, ALPHA_R, c41 daddiu CO1,CO1, 4 * SIZE MADD $f5, $f5, ALPHA_I, c41 daddiu CO2,CO2, 4 * SIZE MADD $f6, $f6, ALPHA_R, c42 daddiu CO3,CO3, 4 * SIZE MADD $f7, $f7, ALPHA_I, c42 daddiu CO4,CO4, 4 * SIZE ST $f0, -4 * SIZE(CO3) daddiu I, I, -1 ST $f1, -3 * SIZE(CO3) ST $f2, -2 * SIZE(CO3) ST $f3, -1 * SIZE(CO3) ST $f4, -4 * SIZE(CO4) MTC $0, c11 ST $f5, -3 * SIZE(CO4) MOV c21, c11 ST $f6, -2 * SIZE(CO4) MOV c31, c11 ST $f7, -1 * SIZE(CO4) bgtz I, .L31 MOV c41, c11 .align 3 .L40: andi I, M, 1 blez I, .L49 MOV c61, c11 LD a1, 0 * SIZE(AO) MOV c71, c11 LD a2, 1 * SIZE(AO) MOV c81, c11 LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) LD b5, 4 * SIZE(B) LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) dsra L, K, 2 blez L, .L45 move BO, B .align 3 .L42: MADD c11, c11, a1, b1 LD b1, 16 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a1, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a1, b4 LD b4, 7 * SIZE(BO) LD a1, 4 * 
SIZE(AO) daddiu L, L, -1 MADD c11, c11, a2, b5 LD b5, 20 * SIZE(BO) MADD c21, c21, a2, b2 LD b2, 9 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 10 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 11 * SIZE(BO) LD a2, 2 * SIZE(AO) daddiu AO, AO, 4 * SIZE MADD c11, c11, a2, b6 LD b6, 24 * SIZE(BO) MADD c21, c21, a2, b2 LD b2, 13 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 14 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 15 * SIZE(BO) LD a2, -1 * SIZE(AO) daddiu BO, BO, 16 * SIZE MADD c11, c11, a2, b7 LD b7, 12 * SIZE(BO) MADD c21, c21, a2, b2 LD b2, 1 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 2 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 3 * SIZE(BO) bgtz L, .L42 LD a2, 1 * SIZE(AO) .align 3 .L45: andi L, K, 3 NOP blez L, .L48 NOP .align 3 .L46: MADD c11, c11, a1, b1 LD b1, 4 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a1, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a1, b4 LD a1, 1 * SIZE(AO) LD b4, 7 * SIZE(BO) daddiu L, L, -1 daddiu AO, AO, 1 * SIZE MOV a2, a2 bgtz L, .L46 daddiu BO, BO, 4 * SIZE .L48: LD $f0, 0 * SIZE(CO1) LD $f1, 1 * SIZE(CO1) LD $f2, 0 * SIZE(CO2) LD $f3, 1 * SIZE(CO2) LD $f4, 0 * SIZE(CO3) MADD $f0, $f0, ALPHA_R, c11 LD $f5, 1 * SIZE(CO3) MADD $f1, $f1, ALPHA_I, c11 LD $f6, 0 * SIZE(CO4) MADD $f2, $f2, ALPHA_R, c21 LD $f7, 1 * SIZE(CO4) MADD $f3, $f3, ALPHA_I, c21 MADD $f4, $f4, ALPHA_R, c31 ST $f0, 0 * SIZE(CO1) MADD $f5, $f5, ALPHA_I, c31 ST $f1, 1 * SIZE(CO1) MADD $f6, $f6, ALPHA_R, c41 ST $f2, 0 * SIZE(CO2) MADD $f7, $f7, ALPHA_I, c41 ST $f3, 1 * SIZE(CO2) ST $f4, 0 * SIZE(CO3) ST $f5, 1 * SIZE(CO3) ST $f6, 0 * SIZE(CO4) ST $f7, 1 * SIZE(CO4) .align 3 .L49: move B, BO .align 3 .L50: andi J, N, 2 blez J, .L70 move AO, A move CO1, C daddu CO2, C, LDC dsra I, M, 1 blez I, .L60 daddu C, CO2, LDC .L51: LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a5, 4 * SIZE(AO) LD b1, 0 * SIZE(B) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 LD b3, 2 * SIZE(B) LD b5, 4 * SIZE(B) dsra L, K, 2 LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) blez L, .L55 move BO, B .align 3 .L52: MADD c11, c11, a1, b1 LD a3, 2 * SIZE(AO) MADD c21, c21, a1, b2 LD b4, 3 * SIZE(BO) MADD c12, c12, a2, b1 LD a4, 3 * SIZE(AO) MADD c22, c22, a2, b2 LD b1, 8 * SIZE(BO) MADD c11, c11, a3, b3 LD a1, 8 * SIZE(AO) MADD c21, c21, a3, b4 LD b2, 5 * SIZE(BO) MADD c12, c12, a4, b3 LD a2, 5 * SIZE(AO) MADD c22, c22, a4, b4 LD b3, 6 * SIZE(BO) MADD c11, c11, a5, b5 LD a3, 6 * SIZE(AO) MADD c21, c21, a5, b2 LD b4, 7 * SIZE(BO) MADD c12, c12, a2, b5 LD a4, 7 * SIZE(AO) MADD c22, c22, a2, b2 LD b5, 12 * SIZE(BO) MADD c11, c11, a3, b3 LD a5, 12 * SIZE(AO) MADD c21, c21, a3, b4 LD b2, 9 * SIZE(BO) MADD c12, c12, a4, b3 LD a2, 9 * SIZE(AO) MADD c22, c22, a4, b4 LD b3, 10 * SIZE(BO) daddiu AO, AO, 8 * SIZE daddiu L, L, -1 bgtz L, .L52 daddiu BO, BO, 8 * SIZE .align 3 .L55: andi L, K, 3 NOP blez L, .L58 NOP .align 3 .L56: MADD c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD c21, c21, a1, b2 LD a1, 2 * SIZE(AO) MADD c12, c12, a2, b1 LD b1, 2 * SIZE(BO) MADD c22, c22, a2, b2 LD b2, 3 * SIZE(BO) daddiu L, L, -1 daddiu AO, AO, 2 * SIZE bgtz L, .L56 daddiu BO, BO, 2 * SIZE .L58: LD $f0, 0 * SIZE(CO1) LD $f1, 1 * SIZE(CO1) LD $f2, 2 * SIZE(CO1) LD $f3, 3 * SIZE(CO1) LD $f4, 0 * SIZE(CO2) LD $f5, 1 * SIZE(CO2) LD $f6, 2 * SIZE(CO2) LD $f7, 3 * SIZE(CO2) MADD $f0, $f0, ALPHA_R, c11 daddiu I, I, -1 MADD $f1, $f1, ALPHA_I, c11 daddiu CO1,CO1, 4 * SIZE MADD $f2, $f2, ALPHA_R, c12 daddiu CO2,CO2, 4 * SIZE MADD $f3, $f3, ALPHA_I, c12 MADD $f4, $f4, ALPHA_R, c21 MADD $f5, $f5, ALPHA_I, c21 MADD $f6, $f6, ALPHA_R, c22 MADD $f7, $f7, ALPHA_I, c22 ST 
$f0, -4 * SIZE(CO1) ST $f1, -3 * SIZE(CO1) ST $f2, -2 * SIZE(CO1) ST $f3, -1 * SIZE(CO1) ST $f4, -4 * SIZE(CO2) ST $f5, -3 * SIZE(CO2) ST $f6, -2 * SIZE(CO2) bgtz I, .L51 ST $f7, -1 * SIZE(CO2) .align 3 .L60: andi I, M, 1 blez I, .L69 NOP dsra L, K, 2 LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a3, 2 * SIZE(AO) MOV c31, c11 LD a4, 3 * SIZE(AO) MOV c41, c11 LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) LD b5, 4 * SIZE(B) LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) blez L, .L65 move BO, B .align 3 .L62: MADD c11, c11, a1, b1 LD b1, 4 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 5 * SIZE(BO) MADD c31, c31, a2, b3 LD b3, 6 * SIZE(BO) MADD c41, c41, a2, b4 LD b4, 7 * SIZE(BO) LD a1, 4 * SIZE(AO) LD a2, 5 * SIZE(AO) MADD c11, c11, a3, b1 LD b1, 8 * SIZE(BO) MADD c21, c21, a3, b2 LD b2, 9 * SIZE(BO) MADD c31, c31, a4, b3 LD b3, 10 * SIZE(BO) MADD c41, c41, a4, b4 LD b4, 11 * SIZE(BO) LD a3, 6 * SIZE(AO) LD a4, 7 * SIZE(AO) daddiu L, L, -1 daddiu AO, AO, 4 * SIZE bgtz L, .L62 daddiu BO, BO, 8 * SIZE .align 3 .L65: andi L, K, 3 NOP blez L, .L68 NOP .align 3 .L66: MADD c11, c11, a1, b1 LD b1, 2 * SIZE(BO) MADD c21, c21, a1, b2 LD b2, 3 * SIZE(BO) LD a1, 1 * SIZE(AO) daddiu L, L, -1 daddiu AO, AO, 1 * SIZE bgtz L, .L66 daddiu BO, BO, 2 * SIZE .L68: LD $f0, 0 * SIZE(CO1) LD $f1, 1 * SIZE(CO1) LD $f2, 0 * SIZE(CO2) LD $f3, 1 * SIZE(CO2) ADD c11, c11, c31 ADD c21, c21, c41 MADD $f0, $f0, ALPHA_R, c11 MADD $f1, $f1, ALPHA_I, c11 MADD $f2, $f2, ALPHA_R, c21 MADD $f3, $f3, ALPHA_I, c21 ST $f0, 0 * SIZE(CO1) ST $f1, 1 * SIZE(CO1) ST $f2, 0 * SIZE(CO2) ST $f3, 1 * SIZE(CO2) .align 3 .L69: move B, BO .align 3 .L70: andi J, N, 1 blez J, .L999 move AO, A move CO1, C dsra I, M, 1 blez I, .L80 daddu C, CO1, LDC .L71: LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a5, 4 * SIZE(AO) LD b1, 0 * SIZE(B) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 LD b3, 2 * SIZE(B) LD b5, 4 * SIZE(B) dsra L, K, 2 LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) blez L, .L75 move BO, B .align 3 .L72: LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 LD a1, 2 * SIZE(AO) LD a2, 3 * SIZE(AO) LD b1, 1 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 LD a1, 4 * SIZE(AO) LD a2, 5 * SIZE(AO) LD b1, 2 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 LD a1, 6 * SIZE(AO) LD a2, 7 * SIZE(AO) LD b1, 3 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 daddiu L, L, -1 daddiu AO, AO, 8 * SIZE bgtz L, .L72 daddiu BO, BO, 4 * SIZE .align 3 .L75: andi L, K, 3 NOP blez L, .L78 NOP .align 3 .L76: LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD c11, c11, a1, b1 MADD c12, c12, a2, b1 daddiu L, L, -1 daddiu AO, AO, 2 * SIZE bgtz L, .L76 daddiu BO, BO, 1 * SIZE .L78: LD $f0, 0 * SIZE(CO1) LD $f1, 1 * SIZE(CO1) LD $f2, 2 * SIZE(CO1) LD $f3, 3 * SIZE(CO1) ADD c11, c11, c21 daddiu I, I, -1 ADD c12, c12, c22 daddiu CO1,CO1, 4 * SIZE MADD $f0, $f0, ALPHA_R, c11 MADD $f1, $f1, ALPHA_I, c11 MADD $f2, $f2, ALPHA_R, c12 MADD $f3, $f3, ALPHA_I, c12 ST $f0, -4 * SIZE(CO1) ST $f1, -3 * SIZE(CO1) ST $f2, -2 * SIZE(CO1) bgtz I, .L71 ST $f3, -1 * SIZE(CO1) .align 3 .L80: andi I, M, 1 blez I, .L89 NOP LD a1, 0 * SIZE(AO) MTC $0, c11 LD a2, 1 * SIZE(AO) MOV c21, c11 LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b1, 0 * SIZE(B) LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) LD b5, 4 * SIZE(B) LD b6, 8 * SIZE(B) LD b7, 12 * SIZE(B) dsra L, K, 2 blez L, .L85 move BO, B .align 3 .L82: LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD 
c11, c11, a1, b1 LD a1, 1 * SIZE(AO) LD b1, 1 * SIZE(BO) MADD c21, c21, a1, b1 LD a1, 2 * SIZE(AO) LD b1, 2 * SIZE(BO) MADD c11, c11, a1, b1 LD a1, 3 * SIZE(AO) LD b1, 3 * SIZE(BO) MADD c21, c21, a1, b1 daddiu L, L, -1 daddiu AO, AO, 4 * SIZE bgtz L, .L82 daddiu BO, BO, 4 * SIZE .align 3 .L85: andi L, K, 3 NOP blez L, .L88 NOP .align 3 .L86: LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) MADD c11, c11, a1, b1 daddiu L, L, -1 daddiu AO, AO, 1 * SIZE bgtz L, .L86 daddiu BO, BO, 1 * SIZE .L88: LD $f0, 0 * SIZE(CO1) LD $f1, 1 * SIZE(CO1) ADD c11, c11, c21 MADD $f0, $f0, ALPHA_R, c11 MADD $f1, $f1, ALPHA_I, c11 ST $f0, 0 * SIZE(CO1) ST $f1, 1 * SIZE(CO1) .align 3 .L89: move B, BO .align 3 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) LDARG $18, 16($sp) LDARG $19, 24($sp) LDARG $20, 32($sp) LDARG $21, 40($sp) ldc1 $f24, 48($sp) ldc1 $f25, 56($sp) ldc1 $f26, 64($sp) ldc1 $f27, 72($sp) ldc1 $f28, 80($sp) ldc1 $f29, 88($sp) j $31 daddiu $sp, $sp, 128 EPILOGUE OpenBLAS-0.2.20/kernel/mips64/zgemm_kernel.S000066400000000000000000000556741313527062700203720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M $4 #define N $5 #define K $6 #define A $9 #define B $10 #define C $11 #define LDC $8 #define AO $12 #define BO $13 #define I $2 #define J $3 #define L $7 #define CO1 $14 #define CO2 $15 #define CO3 $16 #define CO4 $17 #if defined(TRMMKERNEL) #define OFFSET $18 #define KK $19 #define TEMP $20 #endif #define a1 $f0 #define a2 $f1 #define a3 $f28 #define a4 $f29 #define b1 $f2 #define b2 $f3 #define b3 $f4 #define b4 $f5 #define b5 $f6 #define b6 $f7 #define b7 $f8 #define b8 $f9 #define a5 b8 #define c11 $f10 #define c12 $f11 #define c21 $f12 #define c22 $f13 #define c31 $f14 #define c32 $f17 #define c41 $f18 #define c42 $f19 #define c51 $f20 #define c52 $f21 #define c61 $f22 #define c62 $f23 #define c71 $f24 #define c72 $f25 #define c81 $f26 #define c82 $f27 #define ALPHA_R $f15 #define ALPHA_I $f16 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define MADD1 MADD #define MADD2 MADD #define MADD3 MADD #define MADD4 NMSUB #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) #define MADD1 MADD #define MADD2 MADD #define MADD3 NMSUB #define MADD4 MADD #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) #define MADD1 MADD #define MADD2 NMSUB #define MADD3 MADD #define MADD4 MADD #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) #define MADD1 MADD #define MADD2 NMSUB #define MADD3 NMSUB #define MADD4 NMSUB #endif PROLOGUE LDARG LDC, 0($sp) daddiu $sp, $sp, -128 SDARG $16, 0($sp) SDARG $17, 8($sp) sdc1 $f24, 16($sp) sdc1 $f25, 24($sp) sdc1 $f26, 32($sp) sdc1 $f27, 40($sp) sdc1 $f28, 48($sp) sdc1 $f29, 56($sp) #if defined(TRMMKERNEL) SDARG $18, 64($sp) SDARG $19, 72($sp) SDARG $20, 80($sp) LDARG OFFSET, 128 + 8($sp) #endif #ifndef __64BIT__ sdc1 $f20, 88($sp) sdc1 $f21, 96($sp) sdc1 $f22,104($sp) sdc1 $f23,112($sp) #endif dsll LDC, LDC, ZBASE_SHIFT #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif dsra J, N, 2 blez J, .L20 nop .L10: move CO1, C MTC $0, c11 daddu CO2, C, LDC move AO, A daddu CO3, CO2, LDC daddiu J, J, -1 daddu CO4, CO3, LDC MOV c21, c11 MOV c31, c11 #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif MOV c41, c11 MOV c51, c11 move I, M daddu C, CO4, LDC blez I, .L19 MOV c61, c11 .L11: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, ZBASE_SHIFT dsll TEMP, KK, 2 + ZBASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif LD a1, 0 * SIZE(AO) MOV c71, c11 LD b1, 0 * SIZE(BO) MOV c81, c11 LD a3, 4 * SIZE(AO) MOV c12, c11 LD b2, 1 * SIZE(BO) MOV c22, c11 MOV c32, c11 LD b3, 2 * SIZE(BO) MOV c42, c11 LD b4, 3 * SIZE(BO) MOV c52, c11 LD b5, 4 * SIZE(BO) MOV c62, c11 LD b6, 8 * SIZE(BO) MOV c72, c11 LD b7, 12 * SIZE(BO) MOV c82, c11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 4 #endif dsra L, TEMP, 2 blez L, .L15 NOP #else LD a1, 0 * SIZE(AO) MOV c71, c11 LD b1, 0 * SIZE(B) MOV c81, c11 LD a3, 4 * SIZE(AO) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 dsra L, K, 2 MOV c32, c11 LD b3, 2 * SIZE(B) MOV c42, c11 LD b4, 3 * SIZE(B) MOV c52, c11 LD b5, 4 * SIZE(B) MOV c62, c11 LD b6, 8 * SIZE(B) MOV c72, c11 LD b7, 12 * SIZE(B) MOV c82, c11 blez L, .L15 move BO, B #endif MADD1 c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD3 c21, c21, a1, b2 daddiu L, L, -1 MADD1 c31, c31, a1, b3 NOP blez L, .L13 MADD3 
c41, c41, a1, b4 .align 3 .L12: MADD2 c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD1 c51, c51, a1, b5 NOP MADD3 c61, c61, a1, b2 LD a4, 2 * SIZE(AO) MADD1 c71, c71, a1, b3 NOP MADD3 c81, c81, a1, b4 LD a1, 8 * SIZE(AO) MADD2 c52, c52, a2, b5 LD b5, 20 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 9 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 10 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 11 * SIZE(BO) MADD1 c11, c11, a4, b6 LD a2, 3 * SIZE(AO) MADD3 c21, c21, a4, b2 NOP MADD1 c31, c31, a4, b3 NOP MADD3 c41, c41, a4, b4 NOP MADD2 c12, c12, a2, b6 LD b6, 24 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD1 c51, c51, a4, b7 NOP MADD3 c61, c61, a4, b2 NOP MADD1 c71, c71, a4, b3 NOP MADD3 c81, c81, a4, b4 NOP MADD2 c52, c52, a2, b7 LD b7, 28 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 17 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 18 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 19 * SIZE(BO) MADD1 c11, c11, a3, b1 LD a2, 5 * SIZE(AO) MADD3 c21, c21, a3, b2 NOP MADD1 c31, c31, a3, b3 NOP MADD3 c41, c41, a3, b4 NOP MADD2 c12, c12, a2, b1 LD b1, 32 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 21 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 22 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 23 * SIZE(BO) MADD1 c51, c51, a3, b5 NOP MADD3 c61, c61, a3, b2 LD a4, 6 * SIZE(AO) MADD1 c71, c71, a3, b3 NOP MADD3 c81, c81, a3, b4 LD a3, 12 * SIZE(AO) MADD2 c52, c52, a2, b5 LD b5, 36 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 25 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 26 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 27 * SIZE(BO) MADD1 c11, c11, a4, b6 LD a2, 7 * SIZE(AO) MADD3 c21, c21, a4, b2 NOP MADD1 c31, c31, a4, b3 NOP MADD3 c41, c41, a4, b4 daddiu L, L, -1 MADD2 c12, c12, a2, b6 LD b6, 40 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 29 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 30 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 31 * SIZE(BO) MADD1 c51, c51, a4, b7 daddiu BO, BO, 32 * SIZE MADD3 c61, c61, a4, b2 daddiu AO, AO, 8 * SIZE MADD1 c71, c71, a4, b3 NOP MADD3 c81, c81, a4, b4 NOP MADD2 c52, c52, a2, b7 LD b7, 12 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 3 * SIZE(BO) MADD1 c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD3 c21, c21, a1, b2 NOP MADD1 c31, c31, a1, b3 NOP bgtz L, .L12 MADD3 c41, c41, a1, b4 .align 3 .L13: MADD2 c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD1 c51, c51, a1, b5 NOP MADD3 c61, c61, a1, b2 LD a4, 2 * SIZE(AO) MADD1 c71, c71, a1, b3 NOP MADD3 c81, c81, a1, b4 LD a1, 8 * SIZE(AO) MADD2 c52, c52, a2, b5 LD b5, 20 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 9 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 10 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 11 * SIZE(BO) MADD1 c11, c11, a4, b6 LD a2, 3 * SIZE(AO) MADD3 c21, c21, a4, b2 NOP MADD1 c31, c31, a4, b3 NOP MADD3 c41, c41, a4, b4 NOP MADD2 c12, c12, a2, b6 LD b6, 24 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD1 c51, c51, a4, b7 NOP MADD3 c61, c61, a4, b2 NOP MADD1 c71, c71, a4, b3 NOP MADD3 c81, c81, a4, b4 NOP MADD2 c52, c52, a2, b7 LD b7, 28 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 17 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 18 * SIZE(BO) MADD4 c82, c82, a2, b4 
LD b4, 19 * SIZE(BO) MADD1 c11, c11, a3, b1 LD a2, 5 * SIZE(AO) MADD3 c21, c21, a3, b2 NOP MADD1 c31, c31, a3, b3 NOP MADD3 c41, c41, a3, b4 NOP MADD2 c12, c12, a2, b1 LD b1, 32 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 21 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 22 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 23 * SIZE(BO) MADD1 c51, c51, a3, b5 NOP MADD3 c61, c61, a3, b2 LD a4, 6 * SIZE(AO) MADD1 c71, c71, a3, b3 NOP MADD3 c81, c81, a3, b4 LD a3, 12 * SIZE(AO) MADD2 c52, c52, a2, b5 LD b5, 36 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 25 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 26 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 27 * SIZE(BO) MADD1 c11, c11, a4, b6 LD a2, 7 * SIZE(AO) MADD3 c21, c21, a4, b2 NOP MADD1 c31, c31, a4, b3 NOP MADD3 c41, c41, a4, b4 NOP MADD2 c12, c12, a2, b6 LD b6, 40 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 29 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 30 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 31 * SIZE(BO) MADD1 c51, c51, a4, b7 daddiu BO, BO, 32 * SIZE MADD3 c61, c61, a4, b2 daddiu AO, AO, 8 * SIZE MADD1 c71, c71, a4, b3 NOP MADD3 c81, c81, a4, b4 NOP MADD2 c52, c52, a2, b7 LD b7, 12 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 3 * SIZE(BO) .align 3 .L15: #ifndef TRMMKERNEL andi L, K, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L18 NOP .align 3 .L16: MADD1 c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD3 c21, c21, a1, b2 NOP MADD1 c31, c31, a1, b3 NOP MADD3 c41, c41, a1, b4 NOP MADD2 c12, c12, a2, b1 LD b1, 8 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD1 c51, c51, a1, b5 daddiu L, L, -1 MADD3 c61, c61, a1, b2 daddiu AO, AO, 2 * SIZE MADD1 c71, c71, a1, b3 daddiu BO, BO, 8 * SIZE MADD3 c81, c81, a1, b4 LD a1, 0 * SIZE(AO) MADD2 c52, c52, a2, b5 LD b5, 4 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD4 c82, c82, a2, b4 bgtz L, .L16 LD b4, 3 * SIZE(BO) .L18: #ifndef TRMMKERNEL LD b1, 0 * SIZE(CO1) ADD c11, c11, c22 LD b2, 1 * SIZE(CO1) ADD c12, c12, c21 LD b3, 0 * SIZE(CO2) ADD c31, c31, c42 LD b4, 1 * SIZE(CO2) ADD c32, c32, c41 LD b5, 0 * SIZE(CO3) ADD c51, c51, c62 LD b6, 1 * SIZE(CO3) ADD c52, c52, c61 LD b7, 0 * SIZE(CO4) ADD c71, c71, c82 LD b8, 1 * SIZE(CO4) ADD c72, c72, c81 MADD b1, b1, ALPHA_R, c11 daddiu CO1,CO1, 2 * SIZE MADD b2, b2, ALPHA_R, c12 daddiu CO2,CO2, 2 * SIZE MADD b3, b3, ALPHA_R, c31 daddiu CO3,CO3, 2 * SIZE MADD b4, b4, ALPHA_R, c32 daddiu CO4,CO4, 2 * SIZE MADD b5, b5, ALPHA_R, c51 daddiu I, I, -1 MADD b6, b6, ALPHA_R, c52 NOP MADD b7, b7, ALPHA_R, c71 NOP MADD b8, b8, ALPHA_R, c72 NOP NMSUB b1, b1, ALPHA_I, c12 NOP MADD b2, b2, ALPHA_I, c11 MTC $0, c11 NMSUB b3, b3, ALPHA_I, c32 NOP MADD b4, b4, ALPHA_I, c31 NOP ST b1, -2 * SIZE(CO1) NMSUB b5, b5, ALPHA_I, c52 ST b2, -1 * SIZE(CO1) MADD b6, b6, ALPHA_I, c51 ST b3, -2 * SIZE(CO2) NMSUB b7, b7, ALPHA_I, c72 ST b4, -1 * SIZE(CO2) MADD b8, b8, ALPHA_I, c71 ST b5, -2 * SIZE(CO3) MOV c21, c11 ST b6, -1 * SIZE(CO3) MOV c31, c11 ST b7, -2 * SIZE(CO4) MOV c41, c11 ST b8, -1 * SIZE(CO4) MOV c51, c11 #else ADD c11, c11, c22 daddiu CO1,CO1, 2 * SIZE ADD c12, c12, c21 daddiu CO2,CO2, 2 * SIZE ADD c31, c31, c42 daddiu CO3,CO3, 2 * SIZE ADD c32, c32, c41 daddiu CO4,CO4, 2 * SIZE ADD c51, c51, c62 daddiu I, I, -1 ADD c52, c52, c61 ADD c71, c71, c82 ADD c72, c72, c81 MUL b1, ALPHA_R, c11 MUL b2, ALPHA_R, c12 MUL b3, ALPHA_R, c31 MUL b4, ALPHA_R, c32 MUL b5, ALPHA_R, c51 MUL b6, ALPHA_R, 
c52 MUL b7, ALPHA_R, c71 MUL b8, ALPHA_R, c72 NMSUB b1, b1, ALPHA_I, c12 NOP MADD b2, b2, ALPHA_I, c11 MTC $0, c11 NMSUB b3, b3, ALPHA_I, c32 NOP MADD b4, b4, ALPHA_I, c31 NOP ST b1, -2 * SIZE(CO1) NMSUB b5, b5, ALPHA_I, c52 ST b2, -1 * SIZE(CO1) MADD b6, b6, ALPHA_I, c51 ST b3, -2 * SIZE(CO2) NMSUB b7, b7, ALPHA_I, c72 ST b4, -1 * SIZE(CO2) MADD b8, b8, ALPHA_I, c71 ST b5, -2 * SIZE(CO3) MOV c21, c11 ST b6, -1 * SIZE(CO3) MOV c31, c11 ST b7, -2 * SIZE(CO4) MOV c41, c11 ST b8, -1 * SIZE(CO4) MOV c51, c11 #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -4 #endif dsll L, TEMP, ZBASE_SHIFT dsll TEMP, TEMP, 2 + ZBASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif #endif bgtz I, .L11 MOV c61, c11 .align 3 .L19: #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK, 4 #endif bgtz J, .L10 move B, BO .align 3 .L20: andi J, N, 2 MTC $0, c11 blez J, .L30 move CO1, C daddu CO2, C, LDC daddu C, CO2, LDC #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif move I, M blez I, .L29 move AO, A .align 3 .L21: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, ZBASE_SHIFT dsll TEMP, KK, 1 + ZBASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif LD a1, 0 * SIZE(AO) MOV c21, c11 LD b1, 0 * SIZE(BO) MOV c31, c11 LD a3, 4 * SIZE(AO) MOV c41, c11 LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) MOV c12, c11 LD b4, 3 * SIZE(BO) MOV c22, c11 LD b5, 4 * SIZE(BO) MOV c32, c11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 2 #endif dsra L, TEMP, 2 blez L, .L25 MOV c42, c11 #else LD a1, 0 * SIZE(AO) MOV c21, c11 LD b1, 0 * SIZE(B) MOV c31, c11 LD a3, 4 * SIZE(AO) MOV c41, c11 LD b2, 1 * SIZE(B) dsra L, K, 2 LD b3, 2 * SIZE(B) MOV c12, c11 LD b4, 3 * SIZE(B) MOV c22, c11 LD b5, 4 * SIZE(B) MOV c32, c11 NOP MOV c42, c11 blez L, .L25 move BO, B #endif .align 3 .L22: MADD1 c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD3 c21, c21, a1, b2 daddiu L, L, -1 MADD1 c31, c31, a1, b3 NOP MADD3 c41, c41, a1, b4 LD a1, 2 * SIZE(AO) MADD2 c12, c12, a2, b1 LD b1, 8 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD1 c11, c11, a1, b5 LD a2, 3 * SIZE(AO) MADD3 c21, c21, a1, b2 NOP MADD1 c31, c31, a1, b3 NOP MADD3 c41, c41, a1, b4 LD a1, 8 * SIZE(AO) MADD2 c12, c12, a2, b5 LD b5, 12 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 9 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 10 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 11 * SIZE(BO) MADD1 c11, c11, a3, b1 LD a2, 5 * SIZE(AO) MADD3 c21, c21, a3, b2 NOP MADD1 c31, c31, a3, b3 NOP MADD3 c41, c41, a3, b4 LD a3, 6 * SIZE(AO) MADD2 c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD1 c11, c11, a3, b5 LD a2, 7 * SIZE(AO) MADD3 c21, c21, a3, b2 daddiu AO, AO, 8 * SIZE MADD1 c31, c31, a3, b3 NOP MADD3 c41, c41, a3, b4 LD a3, 4 * SIZE(AO) MADD2 c12, c12, a2, b5 LD b5, 20 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 17 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 18 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 19 * SIZE(BO) bgtz L, .L22 daddiu BO, BO, 16 * SIZE .align 3 .L25: #ifndef TRMMKERNEL andi L, K, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L28 NOP .align 3 .L26: MADD1 
c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD3 c21, c21, a1, b2 daddiu L, L, -1 MADD1 c31, c31, a1, b3 daddiu BO, BO, 4 * SIZE MADD3 c41, c41, a1, b4 LD a1, 2 * SIZE(AO) MADD2 c12, c12, a2, b1 LD b1, 0 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 1 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 2 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 3 * SIZE(BO) bgtz L, .L26 daddiu AO, AO, 2 * SIZE .L28: #ifndef TRMMKERNEL LD b1, 0 * SIZE(CO1) ADD c11, c11, c22 LD b2, 1 * SIZE(CO1) ADD c12, c12, c21 LD b3, 0 * SIZE(CO2) ADD c31, c31, c42 LD b4, 1 * SIZE(CO2) ADD c32, c32, c41 MADD b1, b1, ALPHA_R, c11 daddiu CO1,CO1, 2 * SIZE MADD b2, b2, ALPHA_R, c12 daddiu CO2,CO2, 2 * SIZE MADD b3, b3, ALPHA_R, c31 daddiu I, I, -1 MADD b4, b4, ALPHA_R, c32 NMSUB b1, b1, ALPHA_I, c12 NOP MADD b2, b2, ALPHA_I, c11 MTC $0, c11 NMSUB b3, b3, ALPHA_I, c32 NOP MADD b4, b4, ALPHA_I, c31 NOP ST b1, -2 * SIZE(CO1) ST b2, -1 * SIZE(CO1) ST b3, -2 * SIZE(CO2) #else ADD c11, c11, c22 ADD c12, c12, c21 ADD c31, c31, c42 ADD c32, c32, c41 MUL b1, ALPHA_R, c11 daddiu CO1,CO1, 2 * SIZE MUL b2, ALPHA_R, c12 daddiu CO2,CO2, 2 * SIZE MUL b3, ALPHA_R, c31 daddiu I, I, -1 MUL b4, ALPHA_R, c32 NMSUB b1, b1, ALPHA_I, c12 NOP MADD b2, b2, ALPHA_I, c11 MTC $0, c11 NMSUB b3, b3, ALPHA_I, c32 NOP MADD b4, b4, ALPHA_I, c31 NOP ST b1, -2 * SIZE(CO1) ST b2, -1 * SIZE(CO1) ST b3, -2 * SIZE(CO2) #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -2 #endif dsll L, TEMP, ZBASE_SHIFT dsll TEMP, TEMP, 1 + ZBASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif #endif bgtz I, .L21 ST b4, -1 * SIZE(CO2) .align 3 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK, 2 #endif move B, BO .align 3 .L30: andi J, N, 1 MTC $0, c11 blez J, .L999 move CO1, C #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif move I, M daddu C, CO1, LDC blez I, .L39 move AO, A .align 3 .L31: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll TEMP, KK, ZBASE_SHIFT daddu AO, AO, TEMP daddu BO, B, TEMP #endif LD a1, 0 * SIZE(AO) MOV c21, c11 LD b1, 0 * SIZE(BO) MOV c31, c11 LD a2, 1 * SIZE(AO) MOV c41, c11 LD b2, 1 * SIZE(BO) MOV c12, c11 NOP MOV c22, c11 LD a3, 4 * SIZE(AO) MOV c32, c11 LD b3, 4 * SIZE(BO) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 1 #endif dsra L, TEMP, 2 blez L, .L35 MOV c42, c11 #else LD a1, 0 * SIZE(AO) MOV c21, c11 LD b1, 0 * SIZE(B) MOV c31, c11 LD a2, 1 * SIZE(AO) MOV c41, c11 LD b2, 1 * SIZE(B) MOV c12, c11 dsra L, K, 2 MOV c22, c11 LD a3, 4 * SIZE(AO) MOV c32, c11 LD b3, 4 * SIZE(B) NOP MOV c42, c11 blez L, .L35 move BO, B #endif .align 3 .L32: MADD1 c11, c11, a1, b1 LD b4, 3 * SIZE(BO) MADD3 c21, c21, a1, b2 LD a1, 2 * SIZE(AO) MADD2 c12, c12, a2, b1 LD b1, 2 * SIZE(BO) MADD4 c22, c22, a2, b2 LD a2, 3 * SIZE(AO) MADD1 c11, c11, a1, b1 LD b2, 5 * SIZE(BO) MADD3 c21, c21, a1, b4 LD a1, 8 * SIZE(AO) MADD2 c12, c12, a2, b1 LD b1, 8 * SIZE(BO) MADD4 c22, c22, a2, b4 LD a2, 5 * SIZE(AO) MADD1 c11, c11, a3, b3 LD b4, 7 * SIZE(BO) MADD3 c21, c21, a3, b2 LD a3, 6 * SIZE(AO) MADD2 c12, c12, a2, b3 LD b3, 6 * SIZE(BO) MADD4 c22, c22, a2, b2 LD a2, 7 * SIZE(AO) MADD1 c11, c11, a3, b3 LD b2, 9 * SIZE(BO) MADD3 c21, c21, a3, b4 LD a3, 12 * SIZE(AO) MADD2 c12, c12, a2, b3 LD b3, 12 * SIZE(BO) MADD4 c22, c22, a2, b4 LD a2, 9 * SIZE(AO) 
daddiu AO, AO, 8 * SIZE daddiu L, L, -1 bgtz L, .L32 daddiu BO, BO, 8 * SIZE .align 3 .L35: #ifndef TRMMKERNEL andi L, K, 3 #else andi L, TEMP, 3 #endif NOP blez L, .L38 NOP .align 3 .L36: MADD1 c11, c11, a1, b1 daddiu L, L, -1 MADD3 c21, c21, a1, b2 LD a1, 2 * SIZE(AO) MADD2 c12, c12, a2, b1 LD b1, 2 * SIZE(BO) MADD4 c22, c22, a2, b2 LD a2, 3 * SIZE(AO) LD b2, 3 * SIZE(BO) daddiu BO, BO, 2 * SIZE bgtz L, .L36 daddiu AO, AO, 2 * SIZE .L38: #ifndef TRMMKERNEL LD b1, 0 * SIZE(CO1) ADD c11, c11, c22 LD b2, 1 * SIZE(CO1) ADD c12, c12, c21 MADD b1, b1, ALPHA_R, c11 daddiu CO1,CO1, 2 * SIZE MADD b2, b2, ALPHA_R, c12 daddiu I, I, -1 NMSUB b1, b1, ALPHA_I, c12 NOP MADD b2, b2, ALPHA_I, c11 MTC $0, c11 ST b1, -2 * SIZE(CO1) NOP bgtz I, .L31 ST b2, -1 * SIZE(CO1) #else ADD c11, c11, c22 ADD c12, c12, c21 MUL b1, ALPHA_R, c11 daddiu CO1,CO1, 2 * SIZE MUL b2, ALPHA_R, c12 daddiu I, I, -1 NMSUB b1, b1, ALPHA_I, c12 NOP MADD b2, b2, ALPHA_I, c11 MTC $0, c11 #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -1 #endif dsll TEMP, TEMP, ZBASE_SHIFT daddu AO, AO, TEMP daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif ST b1, -2 * SIZE(CO1) NOP bgtz I, .L31 ST b2, -1 * SIZE(CO1) #endif .align 3 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK, 1 #endif move B, BO .align 3 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) ldc1 $f24, 16($sp) ldc1 $f25, 24($sp) ldc1 $f26, 32($sp) ldc1 $f27, 40($sp) ldc1 $f28, 48($sp) ldc1 $f29, 56($sp) #if defined(TRMMKERNEL) LDARG $18, 64($sp) LDARG $19, 72($sp) LDARG $20, 80($sp) #endif #ifndef __64BIT__ ldc1 $f20, 88($sp) ldc1 $f21, 96($sp) ldc1 $f22,104($sp) ldc1 $f23,112($sp) #endif j $31 daddiu $sp, $sp, 128 EPILOGUE OpenBLAS-0.2.20/kernel/mips64/zgemm_kernel_loongson3a_2x2.S000066400000000000000000000622311313527062700232120ustar00rootroot00000000000000#define ASSEMBLER #include "common.h" #define FETCH ld #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define STACKSIZE 160 #define M $4 #define N $5 #define K $6 #define A $9 #define B $10 #define C $11 #define LDC $8 #define AO $12 #define BO $13 #define R12 12 #define R13 13 #define I $2 #define J $3 #define L $7 #define CO1 $14 #define CO2 $15 #define PREA $16 #define PREB $17 #if defined(TRMMKERNEL) #define OFFSET $18 #define KK $19 #define TEMP $20 #endif #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define b1 $f4 #define b2 $f5 #define b3 $f6 #define b4 $f7 #define a5 $f8 #define a6 $f9 #define a7 $f10 #define a8 $f11 #define b5 $f12 #define b6 $f13 #define b7 $f15 #define b8 $f16 #define c11 $f14 #define c12 $f17 #define c13 $f18 #define c14 $f19 #define c21 $f20 #define c22 $f21 #define c23 $f22 #define c24 $f23 #define c31 $f24 #define c32 $f25 #define c33 $f26 #define c34 $f27 #define c41 $f28 #define c42 $f29 #define c43 $f30 #define c44 $f31 #define F0 0 #define F1 1 #define F2 2 #define F3 3 #define F4 4 #define F5 5 #define F6 6 #define F7 7 #define F8 8 #define F9 9 #define F10 10 #define F11 11 #define F12 12 #define F13 13 #define F14 14 #define F15 15 #define F16 16 #define F17 17 #define F18 18 #define F19 19 #define F20 20 #define F21 21 #define F22 22 #define F23 23 #define F24 24 #define F25 25 #define F26 26 #define F27 27 #define F28 28 #define F29 29 #define F30 30 #define F31 31 #define ALPHA_R $f15 #define ALPHA_I $f16 
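#################################
## Added note (descriptive comment only, not part of the original
## kernel): the summary below and the #if blocks that follow choose how
## complex multiplication is split across four accumulators. For an A
## element a + b*i and a B element c + d*i, the loop body keeps the
## partial products separate:
##   c11 += a*c (MADD1)    c12 += b*c (MADD2)
##   c13 += a*d (MADD3)    c14 += b*d (MADD4)
## and only combines them once per micro-tile, e.g. at .L18/.L38/.L48:
##   ADD c11, c14, c11   # real part
##   ADD c12, c13, c12   # imaginary part
## Selecting MADD or NMSUB for MADD2/MADD3/MADD4 gives each conjugation
## case (NN, NR, RN, RR and their transposed variants) the right signs;
## for the plain NN case the result is real = a*c - b*d and
## imag = b*c + a*d, which is why MADD4 becomes NMSUB there.
## The gsLQC1/gsSQC1 macros above appear to hand-encode (via .word) a
## Loongson extension that moves a real/imaginary pair of doubles in a
## single 128-bit access, as the "R:a1 I:a2" comments at each call site
## suggest.
#################################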
################################# ## MADD1 a*c ## MADD2 b*c ## MADD3 a*d ## MADD4 d*b ################################## #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define MADD1 MADD #define MADD2 MADD #define MADD3 MADD #define MADD4 NMSUB #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) #define MADD1 MADD #define MADD2 MADD #define MADD3 NMSUB #define MADD4 MADD #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) #define MADD1 MADD #define MADD2 NMSUB #define MADD3 MADD #define MADD4 MADD #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) #define MADD1 MADD #define MADD2 NMSUB #define MADD3 NMSUB #define MADD4 NMSUB #endif PROLOGUE LDARG LDC, 0($sp) daddiu $sp, $sp, -STACKSIZE SDARG $16, 0($sp) SDARG $17, 8($sp) sdc1 $f24, 16($sp) sdc1 $f25, 24($sp) sdc1 $f26, 32($sp) sdc1 $f27, 40($sp) sdc1 $f28, 48($sp) sdc1 $f29, 56($sp) #if defined(TRMMKERNEL) SDARG $18, 64($sp) SDARG $19, 72($sp) SDARG $20, 80($sp) LDARG OFFSET, STACKSIZE + 8($sp) #endif #ifndef __64BIT__ sdc1 $f20, 88($sp) sdc1 $f21, 96($sp) sdc1 $f22,104($sp) sdc1 $f23,112($sp) #endif dsra J, N, 1 # J=N/2 ST ALPHA_R, 128($sp) # store alpha_r & alpha_i #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif dsll LDC, LDC, ZBASE_SHIFT # LDC*SIZE*COMPSIZE blez J, .L20 ST ALPHA_I, 136($sp) .align 5 .L10: #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif daddiu J, J, -1 dsra I, M, 1 # I=M/2 dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 dsll PREA, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 move CO1, C # Fix pointer Cx daddu CO2, C, LDC move AO, A # Reset AO blez I, .L30 daddu PREA, PREA, A # PREA=A+panel size .L11: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 1 + ZBASE_SHIFT # MR=NR=2 dsll TEMP, KK, 1 + ZBASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif MTC $0, c11 # Clear results regs MOV c12, c11 gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 MOV c13, c11 MOV c14, c11 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MOV c21, c11 MOV c22, c11 gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 MOV c23, c11 MOV c24, c11 gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 FETCH $0, 0 * SIZE(CO2) MOV c31, c11 MOV c32, c11 FETCH $0, 0 * SIZE(CO1) MOV c33, c11 MOV c34, c11 FETCH $0, 4 * SIZE(CO2) MOV c41, c11 MOV c42, c11 FETCH $0, 4 * SIZE(CO1) MOV c43, c11 MOV c44, c11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 2 #else daddiu TEMP, KK, 2 #endif dsra L, TEMP, 2 daddu PREB, PREB, B # PREA=A+panel size blez L, .L15 NOP #else dsra L, K, 2 # Unroll K 4 times move BO, B MTC $0, c11 # Clear results regs MOV c12, c11 gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 MOV c13, c11 MOV c14, c11 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MOV c21, c11 MOV c22, c11 gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 MOV c23, c11 MOV c24, c11 gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 FETCH $0, 0 * SIZE(CO2) MOV c31, c11 MOV c32, c11 FETCH $0, 0 * SIZE(CO1) MOV c33, c11 MOV c34, c11 FETCH $0, 4 * SIZE(CO2) MOV c41, c11 MOV c42, c11 FETCH $0, 4 * SIZE(CO1) MOV c43, c11 daddu PREB, PREB, B # PREA=A+panel size blez L, .L15 MOV c44, c11 #endif .align 5 .L12: gsLQC1(R12, F9, F8, 2) # Unroll K=1 gsLQC1(R13, F13, F12, 2) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd gsLQC1(R12, F11, F10, 3) gsLQC1(R13, F16, F15, 3) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 FETCH $0, 4 * 
SIZE(PREA) FETCH $0, 4 * SIZE(PREB) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 MADD1 c41, c41, a3, b3 # A2xB2 MADD3 c43, c43, a3, b4 MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 gsLQC1(R12, F1, F0, 4) # unroll k=2 gsLQC1(R13, F5, F4, 4) MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd MADD2 c12, c12, a6, b5 # bxc MADD4 c14, c14, a6, b6 # bxd gsLQC1(R12, F3, F2, 5) gsLQC1(R13, F7, F6, 5) MADD1 c21, c21, a7, b5 # A2xB1 MADD3 c23, c23, a7, b6 MADD2 c22, c22, a8, b5 MADD4 c24, c24, a8, b6 FETCH $0, 8 * SIZE(PREA) FETCH $0, 8 * SIZE(PREB) MADD1 c31, c31, a5, b7 # A1xB2 MADD3 c33, c33, a5, b8 MADD2 c32, c32, a6, b7 MADD4 c34, c34, a6, b8 MADD1 c41, c41, a7, b7 # A2xB2 MADD3 c43, c43, a7, b8 MADD2 c42, c42, a8, b7 MADD4 c44, c44, a8, b8 gsLQC1(R12, F9, F8, 6) # Unroll K=3 gsLQC1(R13, F13, F12, 6) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd gsLQC1(R13, F16, F15, 7) gsLQC1(R12, F11, F10, 7) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 FETCH $0, 12 * SIZE(PREA) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 daddiu L, L, -1 FETCH $0, 12 * SIZE(PREB) MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 MADD1 c41, c41, a3, b3 # A2xB2 MADD3 c43, c43, a3, b4 daddu PREA, PREA, 16 * SIZE daddu PREB, PREB, 16 * SIZE MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MADD2 c12, c12, a6, b5 # bxc MADD4 c14, c14, a6, b6 # bxd MADD1 c21, c21, a7, b5 # A2xB1 MADD3 c23, c23, a7, b6 MADD2 c22, c22, a8, b5 MADD4 c24, c24, a8, b6 FETCH $0, 0 * SIZE(PREA) FETCH $0, 0 * SIZE(PREB) MADD1 c31, c31, a5, b7 # A1xB2 MADD3 c33, c33, a5, b8 MADD2 c32, c32, a6, b7 MADD4 c34, c34, a6, b8 MADD1 c41, c41, a7, b7 # A2xB2 MADD3 c43, c43, a7, b8 MADD2 c42, c42, a8, b7 bgtz L, .L12 MADD4 c44, c44, a8, b8 .align 5 .L15: #ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) #else andi L, TEMP, 3 LD ALPHA_R, 128($sp) #endif blez L, .L18 LD ALPHA_I, 136($sp) .align 5 .L16: daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd daddiu PREA, PREA, 4 * SIZE daddiu PREB, PREB, 4 * SIZE MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 FETCH $0, 0 * SIZE(PREA) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 daddiu L, L, -1 MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 FETCH $0, 0 * SIZE(PREB) MADD1 c41, c41, a3, b3 # A2xB2 MADD3 c43, c43, a3, b4 MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 bgtz L, .L16 NOP .L18: #ifndef TRMMKERNEL ADD c11, c14, c11 LD a1, 0 * SIZE(CO1) ADD c12, c13, c12 LD a2, 1 * SIZE(CO1) ADD c21, c24, c21 LD b1, 2 * SIZE(CO1) ADD c22, c23, c22 LD b2, 3 * SIZE(CO1) ADD c31, c34, c31 LD a3, 0 * SIZE(CO2) ADD c32, c33, c32 LD a4, 1 * SIZE(CO2) ADD c41, c44, c41 LD b3, 2 * SIZE(CO2) ADD c42, c43, c42 LD b4, 3 * SIZE(CO2) daddiu I, I, -1 MADD a1, a1, ALPHA_R, c11 MADD a2, a2, ALPHA_R, c12 MADD 
b1, b1, ALPHA_R, c21 MADD b2, b2, ALPHA_R, c22 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB b1, b1, ALPHA_I, c22 MADD b2, b2, ALPHA_I, c21 MADD a3, a3, ALPHA_R, c31 MADD a4, a4, ALPHA_R, c32 ST a1, 0 * SIZE(CO1) MADD b3, b3, ALPHA_R, c41 MADD b4, b4, ALPHA_R, c42 ST a2, 1 * SIZE(CO1) NMSUB a3, a3, ALPHA_I, c32 MADD a4, a4, ALPHA_I, c31 ST b1, 2 * SIZE(CO1) NMSUB b3, b3, ALPHA_I, c42 MADD b4, b4, ALPHA_I, c41 ST b2, 3 * SIZE(CO1) ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) ST b3, 2 * SIZE(CO2) ST b4, 3 * SIZE(CO2) #else ADD c11, c14, c11 ADD c12, c13, c12 ADD c21, c24, c21 ADD c22, c23, c22 ADD c31, c34, c31 ADD c32, c33, c32 ADD c41, c44, c41 ADD c42, c43, c42 daddiu I, I, -1 MUL a1, ALPHA_R, c11 MUL a2, ALPHA_R, c12 MUL b1, ALPHA_R, c21 MUL b2, ALPHA_R, c22 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB b1, b1, ALPHA_I, c22 MADD b2, b2, ALPHA_I, c21 MUL a3, ALPHA_R, c31 MUL a4, ALPHA_R, c32 MUL b3, ALPHA_R, c41 MUL b4, ALPHA_R, c42 NMSUB a3, a3, ALPHA_I, c32 MADD a4, a4, ALPHA_I, c31 NMSUB b3, b3, ALPHA_I, c42 MADD b4, b4, ALPHA_I, c41 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) ST b1, 2 * SIZE(CO1) ST b2, 3 * SIZE(CO1) ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) ST b3, 2 * SIZE(CO2) ST b4, 3 * SIZE(CO2) #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -2 #endif dsll L, TEMP, 1 + ZBASE_SHIFT dsll TEMP, TEMP, 1 + ZBASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 daddiu CO1,CO1, 4 * SIZE bgtz I, .L11 daddiu CO2,CO2, 4 * SIZE .align 5 .L30: andi I, M, 1 daddu C, C, LDC # Change C to next panel daddu PREB, PREB, B # PREA=A+panel size blez I, .L19 daddu C, C, LDC # Change C to next panel #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, ZBASE_SHIFT # MR=1 dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 daddu AO, AO, L daddu BO, B, TEMP #endif gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 MTC $0, c11 # Clear results regs MOV c12, c11 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MOV c13, c11 MOV c14, c11 gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MOV c31, c11 MOV c32, c11 FETCH $0, 0 * SIZE(PREB) MOV c33, c11 MOV c34, c11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 0 * SIZE(CO2) FETCH $0, 4 * SIZE(CO1) FETCH $0, 4 * SIZE(CO2) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 # MR=1 #else daddiu TEMP, KK, 2 # NR=2 #endif dsra L, TEMP, 2 blez L, .L35 NOP #else gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 dsra L, K, 2 # Unroll K 4 times move BO, B gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MTC $0, c11 # Clear results regs MOV c12, c11 gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MOV c13, c11 MOV c14, c11 FETCH $0, 0 * SIZE(PREB) MOV c31, c11 MOV c32, c11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 0 * SIZE(CO2) FETCH $0, 4 * SIZE(CO1) FETCH $0, 4 * SIZE(CO2) MOV c33, c11 blez L, .L35 MOV c34, c11 #endif .align 5 .L32: gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 gsLQC1(R13, F13, F12, 2) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd gsLQC1(R13, F16, F15, 3) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd NOP MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 FETCH $0, 4 * SIZE(PREB) MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 NOP gsLQC1(R12, F9, F8, 2) # Unroll K=1 gsLQC1(R13, F5, F4, 4) MADD1 c11, c11, a3, b5 # axc A1xB1 MADD3 c13, c13, a3, b6 # axd gsLQC1(R13, F7, F6, 5) MADD2 
c12, c12, a4, b5 # bxc MADD4 c14, c14, a4, b6 # bxd NOP MADD1 c31, c31, a3, b7 # A1xB2 MADD3 c33, c33, a3, b8 FETCH $0, 8 * SIZE(PREB) MADD2 c32, c32, a4, b7 MADD4 c34, c34, a4, b8 daddiu L, L, -1 gsLQC1(R12, F11, F10, 3) gsLQC1(R13, F13, F12, 6) MADD1 c11, c11, a5, b1 # axc A1xB1 MADD3 c13, c13, a5, b2 # axd gsLQC1(R13, F16, F15, 7) MADD2 c12, c12, a6, b1 # bxc MADD4 c14, c14, a6, b2 # bxd daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx MADD1 c31, c31, a5, b3 # A1xB2 MADD3 c33, c33, a5, b4 FETCH $0, 12 * SIZE(PREB) MADD2 c32, c32, a6, b3 MADD4 c34, c34, a6, b4 daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MADD1 c11, c11, a7, b5 # axc A1xB1 MADD3 c13, c13, a7, b6 # axd gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MADD2 c12, c12, a8, b5 # bxc MADD4 c14, c14, a8, b6 # bxd daddiu PREB, PREB, 16 * SIZE MADD1 c31, c31, a7, b7 # A1xB2 MADD3 c33, c33, a7, b8 FETCH $0, 0 * SIZE(PREB) MADD2 c32, c32, a8, b7 bgtz L, .L32 MADD4 c34, c34, a8, b8 .L35: #ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) #else andi L, TEMP, 3 LD ALPHA_R, 128($sp) #endif blez L, .L38 LD ALPHA_I, 136($sp) .align 5 .L36: daddiu L, L, -1 MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 daddiu PREB, PREB, 4 * SIZE MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 NOP bgtz L, .L36 gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 .L38: #ifndef TRMMKERNEL ADD c11, c14, c11 LD a1, 0 * SIZE(CO1) ADD c12, c13, c12 LD a2, 1 * SIZE(CO1) ADD c31, c34, c31 LD a3, 0 * SIZE(CO2) ADD c32, c33, c32 LD a4, 1 * SIZE(CO2) MADD a1, a1, ALPHA_R, c11 MADD a2, a2, ALPHA_R, c12 MADD a3, a3, ALPHA_R, c31 MADD a4, a4, ALPHA_R, c32 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB a3, a3, ALPHA_I, c32 MADD a4, a4, ALPHA_I, c31 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE #else ADD c11, c14, c11 ADD c12, c13, c12 ADD c31, c34, c31 ADD c32, c33, c32 MUL a1, ALPHA_R, c11 MUL a2, ALPHA_R, c12 MUL a3, ALPHA_R, c31 MUL a4, ALPHA_R, c32 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB a3, a3, ALPHA_I, c32 MADD a4, a4, ALPHA_I, c31 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -2 #endif dsll L, TEMP, ZBASE_SHIFT dsll TEMP, TEMP, 1 + ZBASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif #endif .align 5 .L19: #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK, 2 #endif bgtz J, .L10 move B, BO .align 5 .L20: andi J, N, 1 blez J, .L999 dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 dsra I, M, 1 # I=M/2 move CO1, C #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif move AO, A # Reset AO blez I, .L29 daddu PREA, PREA, A .L21: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 1 + ZBASE_SHIFT dsll TEMP, KK, ZBASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 MTC $0, c11 # Clear results regs MOV c12, c11 gsLQC1(R13, F5, F4, 0) # R:b1 
I:b2 MOV c13, c11 MOV c14, c11 gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 MOV c21, c11 MOV c22, c11 FETCH $0, 0 * SIZE(PREA) MOV c23, c11 MOV c24, c11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 4 * SIZE(CO1) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 2 # define Mr=2 #else daddiu TEMP, KK, 1 # define NR=1 #endif dsra L, TEMP, 2 blez L, .L25 NOP #else dsra L, K, 2 # Unroll K 4 times move BO, B gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 MTC $0, c11 # Clear results regs MOV c12, c11 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MOV c13, c11 MOV c14, c11 gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 MOV c21, c11 MOV c22, c11 FETCH $0, 0 * SIZE(PREA) MOV c23, c11 MOV c24, c11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 4 * SIZE(CO1) blez L, .L25 NOP #endif .align 5 .L22: gsLQC1(R12, F9, F8, 2) # Unroll K=1 MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd gsLQC1(R12, F11, F10, 3) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 FETCH $0, 4 * SIZE(PREA) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 gsLQC1(R12, F1, F0, 4) # Unroll K=2 MADD1 c11, c11, a5, b3 # axc A1xB1 MADD3 c13, c13, a5, b4 # axd gsLQC1(R13, F13, F12, 2) MADD2 c12, c12, a6, b3 # bxc MADD4 c14, c14, a6, b4 # bxd gsLQC1(R12, F3, F2, 5) MADD1 c21, c21, a7, b3 # A2xB1 MADD3 c23, c23, a7, b4 FETCH $0, 8 * SIZE(PREA) MADD2 c22, c22, a8, b3 MADD4 c24, c24, a8, b4 daddiu L, L, -1 gsLQC1(R12, F9, F8, 6) # Unroll K=3 MADD1 c11, c11, a1, b5 # axc A1xB1 MADD3 c13, c13, a1, b6 # axd gsLQC1(R13, F16, F15, 3) MADD2 c12, c12, a2, b5 # bxc MADD4 c14, c14, a2, b6 # bxd gsLQC1(R12, F11, F10, 7) MADD1 c21, c21, a3, b5 # A2xB1 MADD3 c23, c23, a3, b6 daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx FETCH $0, 12 * SIZE(PREA) MADD2 c22, c22, a4, b5 MADD4 c24, c24, a4, b6 daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 MADD1 c11, c11, a5, b7 # axc A1xB1 MADD3 c13, c13, a5, b8 # axd daddiu PREA, PREA, 16 * SIZE gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MADD2 c12, c12, a6, b7 # bxc MADD4 c14, c14, a6, b8 # bxd gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 MADD1 c21, c21, a7, b7 # A2xB1 MADD3 c23, c23, a7, b8 FETCH $0, 0 * SIZE(PREA) MADD2 c22, c22, a8, b7 bgtz L, .L22 MADD4 c24, c24, a8, b8 .L25: #ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) #else andi L, TEMP, 3 LD ALPHA_R, 128($sp) #endif blez L, .L28 LD ALPHA_I, 136($sp) .align 3 .L26: daddiu L, L, -1 MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 daddiu PREA, PREA, 4 * SIZE # 2mr*1kr*cmpx MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 bgtz L, .L26 FETCH $0, 0 * SIZE(PREA) .L28: #ifndef TRMMKERNEL ADD c11, c14, c11 LD a1, 0 * SIZE(CO1) ADD c12, c13, c12 LD a2, 1 * SIZE(CO1) ADD c21, c24, c21 LD b1, 2 * SIZE(CO1) ADD c22, c23, c22 LD b2, 3 * SIZE(CO1) daddiu I, I, -1 MADD a1, a1, ALPHA_R, c11 MADD a2, a2, ALPHA_R, c12 MADD b1, b1, ALPHA_R, c21 MADD b2, b2, ALPHA_R, c22 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB b1, b1, ALPHA_I, c22 MADD b2, b2, ALPHA_I, c21 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) ST b1, 2 * SIZE(CO1) ST b2, 3 * SIZE(CO1) #else ADD c11, c14, c11 ADD c12, c13, c12 ADD c21, c24, c21 ADD c22, c23, 
c22 daddiu I, I, -1 MUL a1, ALPHA_R, c11 MUL a2, ALPHA_R, c12 MUL b1, ALPHA_R, c21 MUL b2, ALPHA_R, c22 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB b1, b1, ALPHA_I, c22 MADD b2, b2, ALPHA_I, c21 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) ST b1, 2 * SIZE(CO1) ST b2, 3 * SIZE(CO1) #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -1 #endif dsll L, TEMP, 1 + ZBASE_SHIFT dsll TEMP, TEMP, ZBASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif daddiu CO1,CO1, 4 * SIZE bgtz I, .L21 NOP .L29: andi I, M, 1 blez I, .L999 NOP #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll TEMP, KK, ZBASE_SHIFT daddu AO, AO, TEMP daddu BO, B, TEMP #endif gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 MTC $0, c11 # Clear results regs MOV c12, c11 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MOV c13, c11 MOV c14, c11 FETCH $0, 0 * SIZE(PREA) FETCH $0, 4 * SIZE(PREA) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 1 #endif dsra L, TEMP, 2 blez L, .L45 NOP #else dsra L, K, 2 # Unroll K 4 times move BO, B gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 MTC $0, c11 # Clear results regs MOV c12, c11 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MOV c13, c11 MOV c14, c11 FETCH $0, 0 * SIZE(PREA) FETCH $0, 4 * SIZE(PREA) blez L, .L45 NOP #endif .align 3 .L42: gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd gsLQC1(R12, F9, F8, 2) # Unroll K=1 MADD1 c11, c11, a3, b3 # axc A1xB1 MADD3 c13, c13, a3, b4 # axd gsLQC1(R13, F13, F12, 2) MADD2 c12, c12, a4, b3 # bxc MADD4 c14, c14, a4, b4 # bxd daddiu L, L, -1 gsLQC1(R12, F11, F10, 3) MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx gsLQC1(R13, F16, F15, 3) MADD2 c12, c12, a6, b5 # bxc MADD4 c14, c14, a6, b6 # bxd daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 MADD1 c11, c11, a7, b7 # axc A1xB1 MADD3 c13, c13, a7, b8 # axd gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MADD2 c12, c12, a8, b7 # bxc bgtz L, .L42 MADD4 c14, c14, a8, b8 # bxd .align 5 .L45: #ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) #else andi L, TEMP, 3 LD ALPHA_R, 128($sp) #endif blez L, .L48 LD ALPHA_I, 136($sp) .L46: daddiu L, L, -1 daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 bgtz L, .L46 NOP .L48: #ifndef TRMMKERNEL ADD c11, c14, c11 ADD c12, c13, c12 LD a1, 0 * SIZE(CO1) LD a2, 1 * SIZE(CO1) MADD a1, a1, ALPHA_R, c11 MADD a2, a2, ALPHA_R, c12 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) #else ADD c11, c14, c11 ADD c12, c13, c12 MUL a1, ALPHA_R, c11 MUL a2, ALPHA_R, c12 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -1 #endif dsll TEMP, TEMP, ZBASE_SHIFT daddu AO, AO, TEMP daddu 
BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif daddiu CO1,CO1, 2 * SIZE #endif .align 5 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) ldc1 $f24, 16($sp) ldc1 $f25, 24($sp) ldc1 $f26, 32($sp) ldc1 $f27, 40($sp) ldc1 $f28, 48($sp) ldc1 $f29, 56($sp) #if defined(TRMMKERNEL) LDARG $18, 64($sp) LDARG $19, 72($sp) LDARG $20, 80($sp) #endif #ifndef __64BIT__ ldc1 $f20, 88($sp) ldc1 $f21, 96($sp) ldc1 $f22,104($sp) ldc1 $f23,112($sp) #endif j $31 daddiu $sp, $sp, STACKSIZE EPILOGUE OpenBLAS-0.2.20/kernel/mips64/zgemm_kernel_loongson3b_2x2.S000066400000000000000000000642401313527062700232150ustar00rootroot00000000000000#define ASSEMBLER #include "common.h" #define FETCH ld #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define STACKSIZE 160 #define M $4 #define N $5 #define K $6 #define A $9 #define B $10 #define C $11 #define LDC $8 #define AO $12 #define BO $13 #define R12 12 #define R13 13 #define I $2 #define J $3 #define L $7 #define CO1 $14 #define CO2 $15 #define PREA $16 #define PREB $17 #if defined(TRMMKERNEL) #define OFFSET $18 #define KK $19 #define TEMP $20 #endif #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define b1 $f4 #define b2 $f5 #define b3 $f6 #define b4 $f7 #define a5 $f8 #define a6 $f9 #define a7 $f10 #define a8 $f11 #define b5 $f12 #define b6 $f13 #define b7 $f15 #define b8 $f16 #define c11 $f14 #define c12 $f17 #define c13 $f18 #define c14 $f19 #define c21 $f20 #define c22 $f21 #define c23 $f22 #define c24 $f23 #define c31 $f24 #define c32 $f25 #define c33 $f26 #define c34 $f27 #define c41 $f28 #define c42 $f29 #define c43 $f30 #define c44 $f31 #define F0 0 #define F1 1 #define F2 2 #define F3 3 #define F4 4 #define F5 5 #define F6 6 #define F7 7 #define F8 8 #define F9 9 #define F10 10 #define F11 11 #define F12 12 #define F13 13 #define F14 14 #define F15 15 #define F16 16 #define F17 17 #define F18 18 #define F19 19 #define F20 20 #define F21 21 #define F22 22 #define F23 23 #define F24 24 #define F25 25 #define F26 26 #define F27 27 #define F28 28 #define F29 29 #define F30 30 #define F31 31 #define ALPHA_R $f15 #define ALPHA_I $f16 ################################# ## MADD1 a*c ## MADD2 b*c ## MADD3 a*d ## MADD4 d*b ################################## #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define MADD1 MADD #define MADD2 MADD #define MADD3 MADD #define MADD4 NMSUB #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) #define MADD1 MADD #define MADD2 MADD #define MADD3 NMSUB #define MADD4 MADD #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) #define MADD1 MADD #define MADD2 NMSUB #define MADD3 MADD #define MADD4 MADD #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) #define MADD1 MADD #define MADD2 NMSUB #define MADD3 NMSUB #define MADD4 NMSUB #endif PROLOGUE LDARG LDC, 0($sp) daddiu $sp, $sp, -STACKSIZE SDARG $16, 0($sp) SDARG $17, 8($sp) sdc1 $f24, 16($sp) sdc1 $f25, 24($sp) sdc1 $f26, 32($sp) sdc1 $f27, 40($sp) sdc1 $f28, 48($sp) sdc1 $f29, 56($sp) #if defined(TRMMKERNEL) SDARG $18, 64($sp) SDARG $19, 72($sp) SDARG $20, 80($sp) LDARG OFFSET, STACKSIZE + 8($sp) #endif #ifndef __64BIT__ sdc1 $f20, 88($sp) sdc1 $f21, 96($sp) sdc1 $f22,104($sp) sdc1 $f23,112($sp) #endif dsra J, N, 1 # J=N/2 ST ALPHA_R, 128($sp) # store alpha_r & alpha_i #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif dsll LDC, LDC, ZBASE_SHIFT # 
LDC*SIZE*COMPSIZE blez J, .L20 ST ALPHA_I, 136($sp) .align 5 .L10: #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif daddiu J, J, -1 dsra I, M, 1 # I=M/2 dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 dsll PREA, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 move CO1, C # Fix pointer Cx daddu CO2, C, LDC move AO, A # Reset AO blez I, .L30 daddu PREA, PREA, A # PREA=A+panel size .L11: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll TEMP, KK, 1 + ZBASE_SHIFT daddu AO, AO, TEMP daddu BO, B, TEMP #endif MTC $0, c11 # Clear results regs LD a1, 0 * SIZE(AO) MOV c12, c11 LD a2, 1 * SIZE(AO) MOV c13, c11 LD b1, 0 * SIZE(BO) MOV c14, c11 LD b2, 1 * SIZE(BO) MOV c21, c11 LD a3, 2 * SIZE(AO) MOV c22, c11 LD a4, 3 * SIZE(AO) MOV c23, c11 LD b3, 2 * SIZE(BO) MOV c24, c11 LD b4, 3 * SIZE(BO) FETCH $0, 0 * SIZE(CO2) MOV c31, c11 MOV c32, c11 FETCH $0, 0 * SIZE(CO1) MOV c33, c11 MOV c34, c11 FETCH $0, 4 * SIZE(CO2) MOV c41, c11 MOV c42, c11 FETCH $0, 4 * SIZE(CO1) MOV c43, c11 MOV c44, c11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 2 #else daddiu TEMP, KK, 2 #endif dsra L, TEMP, 2 daddu PREB, PREB, B # PREA=A+panel size blez L, .L15 NOP #else dsra L, K, 2 # Unroll K 4 times move BO, B MTC $0, c11 # Clear results regs LD a1, 0 * SIZE(AO) MOV c12, c11 LD a2, 1 * SIZE(AO) MOV c13, c11 LD b1, 0 * SIZE(BO) MOV c14, c11 LD b2, 1 * SIZE(BO) MOV c21, c11 LD a3, 2 * SIZE(AO) MOV c22, c11 LD a4, 3 * SIZE(AO) MOV c23, c11 LD b3, 2 * SIZE(BO) MOV c24, c11 LD b4, 3 * SIZE(BO) MOV c31, c11 MOV c32, c11 FETCH $0, 0 * SIZE(CO2) MOV c33, c11 MOV c34, c11 FETCH $0, 0 * SIZE(CO1) MOV c41, c11 MOV c42, c11 FETCH $0, 4 * SIZE(CO2) MOV c43, c11 NOP FETCH $0, 4 * SIZE(CO1) daddu PREB, PREB, B # PREA=A+panel size blez L, .L15 MOV c44, c11 #endif .align 5 .L12: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 FETCH $0, 4 * SIZE(PREA) FETCH $0, 4 * SIZE(PREB) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 MADD1 c41, c41, a3, b3 # A2xB2 MADD3 c43, c43, a3, b4 MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) MADD2 c12, c12, a6, b5 # bxc MADD4 c14, c14, a6, b6 # bxd LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) MADD1 c21, c21, a7, b5 # A2xB1 MADD3 c23, c23, a7, b6 LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MADD2 c22, c22, a8, b5 MADD4 c24, c24, a8, b6 FETCH $0, 8 * SIZE(PREA) FETCH $0, 8 * SIZE(PREB) MADD1 c31, c31, a5, b7 # A1xB2 MADD3 c33, c33, a5, b8 MADD2 c32, c32, a6, b7 MADD4 c34, c34, a6, b8 MADD1 c41, c41, a7, b7 # A2xB2 MADD3 c43, c43, a7, b8 MADD2 c42, c42, a8, b7 MADD4 c44, c44, a8, b8 LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd LD a7, 14 * SIZE(AO) LD a8, 15 * SIZE(AO) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MADD2 c22, c22, a4, b1 
MADD4 c24, c24, a4, b2 FETCH $0, 12 * SIZE(PREA) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 daddiu L, L, -1 FETCH $0, 12 * SIZE(PREB) MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 daddiu AO, AO, 16 * SIZE daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx MADD1 c41, c41, a3, b3 # A2xB2 MADD3 c43, c43, a3, b4 daddu PREA, PREA, 16 * SIZE MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 daddu PREB, PREB, 16 * SIZE LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD2 c12, c12, a6, b5 # bxc MADD4 c14, c14, a6, b6 # bxd LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MADD1 c21, c21, a7, b5 # A2xB1 MADD3 c23, c23, a7, b6 LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD2 c22, c22, a8, b5 MADD4 c24, c24, a8, b6 FETCH $0, 0 * SIZE(PREA) FETCH $0, 0 * SIZE(PREB) MADD1 c31, c31, a5, b7 # A1xB2 MADD3 c33, c33, a5, b8 MADD2 c32, c32, a6, b7 MADD4 c34, c34, a6, b8 MADD1 c41, c41, a7, b7 # A2xB2 MADD3 c43, c43, a7, b8 MADD2 c42, c42, a8, b7 bgtz L, .L12 MADD4 c44, c44, a8, b8 .align 5 .L15: #ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) #else andi L, TEMP, 3 LD ALPHA_R, 128($sp) #endif blez L, .L18 LD ALPHA_I, 136($sp) .align 5 .L16: daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd daddiu PREA, PREA, 4 * SIZE daddiu PREB, PREB, 4 * SIZE MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 FETCH $0, 0 * SIZE(PREA) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 daddiu L, L, -1 MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 FETCH $0, 0 * SIZE(PREB) MADD1 c41, c41, a3, b3 # A2xB2 MADD3 c43, c43, a3, b4 MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) bgtz L, .L16 NOP .L18: #ifndef TRMMKERNEL ADD c11, c14, c11 LD a1, 0 * SIZE(CO1) ADD c12, c13, c12 LD a2, 1 * SIZE(CO1) ADD c21, c24, c21 LD b1, 2 * SIZE(CO1) ADD c22, c23, c22 LD b2, 3 * SIZE(CO1) ADD c31, c34, c31 LD a3, 0 * SIZE(CO2) ADD c32, c33, c32 LD a4, 1 * SIZE(CO2) ADD c41, c44, c41 LD b3, 2 * SIZE(CO2) ADD c42, c43, c42 LD b4, 3 * SIZE(CO2) daddiu I, I, -1 MADD a1, a1, ALPHA_R, c11 MADD a2, a2, ALPHA_R, c12 MADD b1, b1, ALPHA_R, c21 MADD b2, b2, ALPHA_R, c22 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB b1, b1, ALPHA_I, c22 MADD b2, b2, ALPHA_I, c21 MADD a3, a3, ALPHA_R, c31 MADD a4, a4, ALPHA_R, c32 ST a1, 0 * SIZE(CO1) MADD b3, b3, ALPHA_R, c41 MADD b4, b4, ALPHA_R, c42 ST a2, 1 * SIZE(CO1) NMSUB a3, a3, ALPHA_I, c32 MADD a4, a4, ALPHA_I, c31 ST b1, 2 * SIZE(CO1) NMSUB b3, b3, ALPHA_I, c42 MADD b4, b4, ALPHA_I, c41 ST b2, 3 * SIZE(CO1) ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) ST b3, 2 * SIZE(CO2) ST b4, 3 * SIZE(CO2) #else ADD c11, c14, c11 ADD c12, c13, c12 ADD c21, c24, c21 ADD c22, c23, c22 ADD c31, c34, c31 ADD c32, c33, c32 ADD c41, c44, c41 ADD c42, c43, c42 daddiu I, I, -1 MUL a1, ALPHA_R, c11 MUL a2, ALPHA_R, c12 MUL b1, ALPHA_R, c21 MUL b2, ALPHA_R, c22 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB b1, b1, ALPHA_I, c22 MADD b2, b2, ALPHA_I, c21 MUL a3, ALPHA_R, c31 MUL a4, ALPHA_R, c32 MUL b3, ALPHA_R, c41 MUL b4, ALPHA_R, c42 NMSUB a3, a3, ALPHA_I, c32 MADD a4, a4, ALPHA_I, c31 NMSUB b3, b3, ALPHA_I, c42 MADD b4, b4, ALPHA_I, c41 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) ST 
b1, 2 * SIZE(CO1) ST b2, 3 * SIZE(CO1) ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) ST b3, 2 * SIZE(CO2) ST b4, 3 * SIZE(CO2) #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -2 #endif dsll TEMP, TEMP, 1 + ZBASE_SHIFT daddu AO, AO, TEMP daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 daddiu CO1,CO1, 4 * SIZE bgtz I, .L11 daddiu CO2,CO2, 4 * SIZE .align 5 .L30: andi I, M, 1 daddu C, C, LDC # Change C to next panel daddu PREB, PREB, B # PREA=A+panel size blez I, .L19 daddu C, C, LDC # Change C to next panel #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, ZBASE_SHIFT # MR=1 dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2 daddu AO, AO, L daddu BO, B, TEMP #endif LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MTC $0, c11 # Clear results regs MOV c12, c11 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MOV c13, c11 MOV c14, c11 LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MOV c31, c11 MOV c32, c11 FETCH $0, 0 * SIZE(PREB) MOV c33, c11 MOV c34, c11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 0 * SIZE(CO2) FETCH $0, 4 * SIZE(CO1) FETCH $0, 4 * SIZE(CO2) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 # MR=1 #else daddiu TEMP, KK, 2 # NR=2 #endif dsra L, TEMP, 2 blez L, .L35 NOP #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) dsra L, K, 2 # Unroll K 4 times move BO, B LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MTC $0, c11 # Clear results regs MOV c12, c11 LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MOV c13, c11 MOV c14, c11 FETCH $0, 0 * SIZE(PREB) MOV c31, c11 MOV c32, c11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 0 * SIZE(CO2) FETCH $0, 4 * SIZE(CO1) FETCH $0, 4 * SIZE(CO2) MOV c33, c11 blez L, .L35 MOV c34, c11 #endif .align 5 .L32: LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 FETCH $0, 4 * SIZE(PREB) MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 NOP LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) MADD1 c11, c11, a3, b5 # axc A1xB1 MADD3 c13, c13, a3, b6 # axd LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) MADD2 c12, c12, a4, b5 # bxc MADD4 c14, c14, a4, b6 # bxd LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MADD1 c31, c31, a3, b7 # A1xB2 MADD3 c33, c33, a3, b8 FETCH $0, 8 * SIZE(PREB) MADD2 c32, c32, a4, b7 MADD4 c34, c34, a4, b8 daddiu L, L, -1 LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) MADD1 c11, c11, a5, b1 # axc A1xB1 MADD3 c13, c13, a5, b2 # axd LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) MADD2 c12, c12, a6, b1 # bxc MADD4 c14, c14, a6, b2 # bxd LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MADD1 c31, c31, a5, b3 # A1xB2 MADD3 c33, c33, a5, b4 daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx daddiu BO, BO, 16 * SIZE # 2nr*4kr*cmpx FETCH $0, 12 * SIZE(PREB) MADD2 c32, c32, a6, b3 MADD4 c34, c34, a6, b4 NOP LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MADD1 c11, c11, a7, b5 # axc A1xB1 MADD3 c13, c13, a7, b6 # axd LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD2 c12, c12, a8, b5 # bxc MADD4 c14, c14, a8, b6 # bxd LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD1 c31, c31, a7, b7 # A1xB2 NOP MADD3 c33, c33, a7, b8 daddiu PREB, PREB, 16 * SIZE FETCH $0, 0 * SIZE(PREB) MADD2 c32, c32, a8, b7 bgtz L, .L32 MADD4 c34, 
c34, a8, b8 .L35: #ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) #else andi L, TEMP, 3 LD ALPHA_R, 128($sp) #endif blez L, .L38 LD ALPHA_I, 136($sp) .align 5 .L36: daddiu L, L, -1 MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 daddiu PREB, PREB, 4 * SIZE MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) bgtz L, .L36 NOP .L38: #ifndef TRMMKERNEL ADD c11, c14, c11 LD a1, 0 * SIZE(CO1) ADD c12, c13, c12 LD a2, 1 * SIZE(CO1) ADD c31, c34, c31 LD a3, 0 * SIZE(CO2) ADD c32, c33, c32 LD a4, 1 * SIZE(CO2) MADD a1, a1, ALPHA_R, c11 MADD a2, a2, ALPHA_R, c12 MADD a3, a3, ALPHA_R, c31 MADD a4, a4, ALPHA_R, c32 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB a3, a3, ALPHA_I, c32 MADD a4, a4, ALPHA_I, c31 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE #else ADD c11, c14, c11 ADD c12, c13, c12 ADD c31, c34, c31 ADD c32, c33, c32 MUL a1, ALPHA_R, c11 MUL a2, ALPHA_R, c12 MUL a3, ALPHA_R, c31 MUL a4, ALPHA_R, c32 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB a3, a3, ALPHA_I, c32 MADD a4, a4, ALPHA_I, c31 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -2 #endif dsll L, TEMP, ZBASE_SHIFT dsll TEMP, TEMP, 1 + ZBASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif #endif .align 5 .L19: #if defined(TRMMKERNEL) && !defined(LEFT) daddiu KK, KK, 2 #endif bgtz J, .L10 move B, BO .align 5 .L20: andi J, N, 1 blez J, .L999 dsll PREA, K, 1+ZBASE_SHIFT # PREA=K*2*2^4 dsra I, M, 1 # I=M/2 move CO1, C #if defined(TRMMKERNEL) && defined(LEFT) move KK, OFFSET #endif move AO, A # Reset AO blez I, .L29 daddu PREA, PREA, A .L21: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll L, KK, 1 + ZBASE_SHIFT dsll TEMP, KK, ZBASE_SHIFT daddu AO, AO, L daddu BO, B, TEMP #endif LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MTC $0, c11 # Clear results regs MOV c12, c11 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MOV c13, c11 MOV c14, c11 LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MOV c21, c11 MOV c22, c11 FETCH $0, 0 * SIZE(PREA) MOV c23, c11 MOV c24, c11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 4 * SIZE(CO1) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 2 # define Mr=2 #else daddiu TEMP, KK, 1 # define NR=1 #endif dsra L, TEMP, 2 blez L, .L25 NOP #else dsra L, K, 2 # Unroll K 4 times move BO, B LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MTC $0, c11 # Clear results regs MOV c12, c11 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MOV c13, c11 MOV c14, c11 LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MOV c21, c11 MOV c22, c11 FETCH $0, 0 * SIZE(PREA) MOV c23, c11 MOV c24, c11 FETCH $0, 0 * SIZE(CO1) FETCH $0, 4 * SIZE(CO1) blez L, .L25 NOP #endif .align 5 .L22: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) 
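# Note added for clarity (not in the original source): MADD1-MADD4 are the four
# partial products of one complex fused multiply-add, with the conjugation signs
# folded into the macro definitions earlier in this file.  The accumulators
# c11/c12/c13/c14 keep the Re*Re, Im*Re, Re*Im and Im*Im sums separately and are
# only combined after the K loop (ADD c11, c14, c11 / ADD c12, c13, c12).  Loads
# for the next unrolled K step are interleaved with the arithmetic to hide load
# latency.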
MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 FETCH $0, 4 * SIZE(PREA) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) MADD1 c11, c11, a5, b3 # axc A1xB1 MADD3 c13, c13, a5, b4 # axd LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) MADD2 c12, c12, a6, b3 # bxc MADD4 c14, c14, a6, b4 # bxd LD a3, 10 * SIZE(AO) LD a4, 11 * SIZE(AO) MADD1 c21, c21, a7, b3 # A2xB1 MADD3 c23, c23, a7, b4 FETCH $0, 8 * SIZE(PREA) MADD2 c22, c22, a8, b3 MADD4 c24, c24, a8, b4 daddiu L, L, -1 LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) MADD1 c11, c11, a1, b5 # axc A1xB1 MADD3 c13, c13, a1, b6 # axd LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD2 c12, c12, a2, b5 # bxc MADD4 c14, c14, a2, b6 # bxd LD a7, 14 * SIZE(AO) LD a8, 15 * SIZE(AO) MADD1 c21, c21, a3, b5 # A2xB1 MADD3 c23, c23, a3, b6 daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx FETCH $0, 12 * SIZE(PREA) MADD2 c22, c22, a4, b5 MADD4 c24, c24, a4, b6 daddiu PREA, PREA, 16 * SIZE LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MADD1 c11, c11, a5, b7 # axc A1xB1 MADD3 c13, c13, a5, b8 # axd LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD2 c12, c12, a6, b7 # bxc MADD4 c14, c14, a6, b8 # bxd LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MADD1 c21, c21, a7, b7 # A2xB1 MADD3 c23, c23, a7, b8 FETCH $0, 0 * SIZE(PREA) MADD2 c22, c22, a8, b7 bgtz L, .L22 MADD4 c24, c24, a8, b8 .L25: #ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) #else andi L, TEMP, 3 LD ALPHA_R, 128($sp) #endif blez L, .L28 LD ALPHA_I, 136($sp) .align 3 .L26: daddiu L, L, -1 MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 daddiu PREA, PREA, 4 * SIZE # 2mr*1kr*cmpx MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 # gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) bgtz L, .L26 FETCH $0, 0 * SIZE(PREA) .L28: #ifndef TRMMKERNEL ADD c11, c14, c11 LD a1, 0 * SIZE(CO1) ADD c12, c13, c12 LD a2, 1 * SIZE(CO1) ADD c21, c24, c21 LD b1, 2 * SIZE(CO1) ADD c22, c23, c22 LD b2, 3 * SIZE(CO1) daddiu I, I, -1 MADD a1, a1, ALPHA_R, c11 MADD a2, a2, ALPHA_R, c12 MADD b1, b1, ALPHA_R, c21 MADD b2, b2, ALPHA_R, c22 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB b1, b1, ALPHA_I, c22 MADD b2, b2, ALPHA_I, c21 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) ST b1, 2 * SIZE(CO1) ST b2, 3 * SIZE(CO1) #else ADD c11, c14, c11 ADD c12, c13, c12 ADD c21, c24, c21 ADD c22, c23, c22 daddiu I, I, -1 MUL a1, ALPHA_R, c11 MUL a2, ALPHA_R, c12 MUL b1, ALPHA_R, c21 MUL b2, ALPHA_R, c22 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 NMSUB b1, b1, ALPHA_I, c22 MADD b2, b2, ALPHA_I, c21 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) ST b1, 2 * SIZE(CO1) ST b2, 3 * SIZE(CO1) #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -2 #else daddiu TEMP, TEMP, -1 #endif dsll L, TEMP, 1 + ZBASE_SHIFT dsll TEMP, TEMP, ZBASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 2 #endif #endif daddiu CO1,CO1, 4 * SIZE bgtz I, .L21 NOP .L29: andi I, M, 1 blez I, .L999 NOP #if defined(TRMMKERNEL) #if 
(defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move BO, B #else dsll TEMP, KK, ZBASE_SHIFT daddu AO, AO, TEMP daddu BO, B, TEMP #endif # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MTC $0, c11 # Clear results regs MOV c12, c11 # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MOV c13, c11 MOV c14, c11 FETCH $0, 0 * SIZE(PREA) FETCH $0, 4 * SIZE(PREA) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 #else daddiu TEMP, KK, 1 #endif dsra L, TEMP, 2 blez L, .L45 NOP #else dsra L, K, 2 # Unroll K 4 times move BO, B # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MTC $0, c11 # Clear results regs MOV c12, c11 # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MOV c13, c11 MOV c14, c11 FETCH $0, 0 * SIZE(PREA) FETCH $0, 4 * SIZE(PREA) blez L, .L45 NOP #endif .align 3 .L42: # gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd # gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd # gsLQC1(R12, F9, F8, 2) # Unroll K=1 LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) MADD1 c11, c11, a3, b3 # axc A1xB1 MADD3 c13, c13, a3, b4 # axd # gsLQC1(R13, F13, F12, 2) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) MADD2 c12, c12, a4, b3 # bxc MADD4 c14, c14, a4, b4 # bxd # gsLQC1(R12, F11, F10, 3) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd daddiu L, L, -1 # gsLQC1(R13, F16, F15, 3) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD2 c12, c12, a6, b5 # bxc MADD4 c14, c14, a6, b6 # bxd daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) MADD1 c11, c11, a7, b7 # axc A1xB1 MADD3 c13, c13, a7, b8 # axd # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MADD2 c12, c12, a8, b7 # bxc MADD4 c14, c14, a8, b8 # bxd bgtz L, .L42 NOP .align 5 .L45: #ifndef TRMMKERNEL andi L, K, 3 LD ALPHA_R, 128($sp) #else andi L, TEMP, 3 LD ALPHA_R, 128($sp) #endif blez L, .L48 LD ALPHA_I, 136($sp) .L46: daddiu L, L, -1 daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) bgtz L, .L46 NOP .L48: #ifndef TRMMKERNEL ADD c11, c14, c11 ADD c12, c13, c12 LD a1, 0 * SIZE(CO1) LD a2, 1 * SIZE(CO1) MADD a1, a1, ALPHA_R, c11 MADD a2, a2, ALPHA_R, c12 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) #else ADD c11, c14, c11 ADD c12, c13, c12 MUL a1, ALPHA_R, c11 MUL a2, ALPHA_R, c12 NMSUB a1, a1, ALPHA_I, c12 MADD a2, a2, ALPHA_I, c11 ST a1, 0 * SIZE(CO1) ST a2, 1 * SIZE(CO1) #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, K, KK #ifdef LEFT daddiu TEMP, TEMP, -1 #else daddiu TEMP, TEMP, -1 #endif dsll TEMP, TEMP, ZBASE_SHIFT daddu AO, AO, TEMP daddu BO, BO, TEMP #endif #ifdef LEFT daddiu KK, KK, 1 #endif daddiu CO1,CO1, 2 * SIZE #endif .align 5 .L999: LDARG $16, 0($sp) 
LDARG $17, 8($sp) ldc1 $f24, 16($sp) ldc1 $f25, 24($sp) ldc1 $f26, 32($sp) ldc1 $f27, 40($sp) ldc1 $f28, 48($sp) ldc1 $f29, 56($sp) #if defined(TRMMKERNEL) LDARG $18, 64($sp) LDARG $19, 72($sp) LDARG $20, 80($sp) #endif #ifndef __64BIT__ ldc1 $f20, 88($sp) ldc1 $f21, 96($sp) ldc1 $f22,104($sp) ldc1 $f23,112($sp) #endif j $31 daddiu $sp, $sp, STACKSIZE EPILOGUE OpenBLAS-0.2.20/kernel/mips64/zgemv_n.S000066400000000000000000000344361313527062700173510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M $4 #define N $5 #define A $9 #define LDA $10 #define X $11 #define INCX $2 #define Y $6 #define INCY $7 #define BUFFER $8 #define YORIG $3 #define XX $12 #define YY $13 #define I $14 #define J $15 #define AO1 $16 #define AO2 $17 #define ALPHA_R $f15 #define ALPHA_I $f16 #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define a5 $f4 #define a6 $f5 #define a7 $f6 #define a8 $f7 #define x1 $f8 #define x2 $f9 #define x3 $f10 #define x4 $f11 #define y1 $f12 #define y2 $f13 #define y3 $f14 #define y4 $f17 #define t1 $f18 #define t2 $f19 #define t3 $f20 #define t4 $f21 #define t5 $f22 #define t6 $f23 #define t7 $f24 #define t8 $f25 #if !defined(CONJ) && !defined(XCONJ) #define MADD1 MADD #define MADD2 MADD #define MADD3 NMSUB #define MADD4 MADD #endif #if defined(CONJ) && !defined(XCONJ) #define MADD1 MADD #define MADD2 MADD #define MADD3 MADD #define MADD4 NMSUB #endif #if !defined(CONJ) && defined(XCONJ) #define MADD1 MADD #define MADD2 NMSUB #define MADD3 MADD #define MADD4 MADD #endif #if defined(CONJ) && defined(XCONJ) #define MADD1 MADD #define MADD2 NMSUB #define MADD3 NMSUB #define MADD4 NMSUB #endif PROLOGUE LDARG INCX, 0($sp) LDARG Y, 8($sp) LDARG INCY, 16($sp) LDARG BUFFER, 24($sp) #ifndef __64BIT__ daddiu $sp, $sp, -64 #else daddiu $sp, $sp, -32 #endif SDARG $16, 0($sp) SDARG $17, 8($sp) sdc1 $f24, 16($sp) sdc1 $f25, 24($sp) #ifndef __64BIT__ sdc1 $f20, 32($sp) sdc1 $f21, 40($sp) sdc1 $f22, 48($sp) sdc1 $f23, 56($sp) #endif dsll LDA, LDA, ZBASE_SHIFT blez M, .L999 dsll INCX, INCX, ZBASE_SHIFT blez N, .L999 dsll INCY, INCY, ZBASE_SHIFT li YORIG, 2 * SIZE beq INCY, YORIG, .L10 move YORIG, Y dsra I, M, 2 move YORIG, BUFFER move XX, Y blez I, .L05 move YY, BUFFER .align 3 .L02: LD a1, 0 * SIZE(XX) LD a2, 1 * SIZE(XX) daddu XX, XX, INCY LD a3, 0 * SIZE(XX) LD a4, 1 * SIZE(XX) daddu XX, XX, INCY LD a5, 0 * SIZE(XX) LD a6, 1 * SIZE(XX) daddu XX, XX, INCY LD a7, 0 * SIZE(XX) LD a8, 1 * SIZE(XX) daddu XX, XX, INCY daddiu I, I, -1 daddiu YY, YY, 8 * SIZE ST a1, -8 * SIZE(YY) ST a2, -7 * SIZE(YY) ST a3, -6 * SIZE(YY) ST a4, -5 * SIZE(YY) ST a5, -4 * SIZE(YY) ST a6, -3 * SIZE(YY) ST a7, -2 * SIZE(YY) bgtz I, .L02 ST a8, -1 * SIZE(YY) .align 3 .L05: andi I, M, 3 blez I, .L10 NOP .align 3 .L06: LD a1, 0 * SIZE(XX) LD a2, 1 * SIZE(XX) daddu XX, XX, INCY daddiu I, I, -1 ST a1, 0 * SIZE(YY) ST a2, 1 * SIZE(YY) bgtz I, .L06 daddiu YY, YY, 2 * SIZE .align 3 .L10: dsra J, N, 1 blez J, .L20 NOP .align 3 .L11: LD x1, 0 * SIZE(X) LD x2, 1 * SIZE(X) daddu X, X, INCX LD x3, 0 * SIZE(X) LD x4, 1 * SIZE(X) daddu X, X, INCX MUL a1, ALPHA_R, x1 move AO1, A MUL a2, ALPHA_I, x1 daddu AO2, A, LDA MUL a3, ALPHA_R, x3 daddu A, AO2, LDA MUL a4, ALPHA_I, x3 #ifndef XCONJ NMSUB x1, a1, ALPHA_I, x2 MADD x2, a2, ALPHA_R, x2 NMSUB x3, a3, ALPHA_I, x4 MADD x4, a4, ALPHA_R, x4 #else MADD x1, a1, ALPHA_I, x2 MSUB x2, a2, ALPHA_R, x2 MADD x3, a3, ALPHA_I, x4 MSUB x4, a4, ALPHA_R, x4 #endif dsra I, M, 2 blez I, .L15 move YY, YORIG LD y1, 0 * SIZE(YY) LD a1, 0 * SIZE(AO1) LD y2, 1 * SIZE(YY) LD a3, 2 * SIZE(AO1) LD y3, 2 * SIZE(YY) LD a2, 1 * SIZE(AO1) LD y4, 3 * SIZE(YY) LD a4, 3 * SIZE(AO1) LD a5, 0 * SIZE(AO2) LD a6, 1 * SIZE(AO2) LD a7, 2 * SIZE(AO2) LD a8, 3 * SIZE(AO2) MADD1 t1, y1, x1, a1 LD y1, 4 * SIZE(YY) MADD2 t2, y2, x2, a1 LD a1, 4 * SIZE(AO1) MADD1 t3, y3, x1, a3 LD y2, 5 * SIZE(YY) MADD2 t4, y4, x2, a3 LD a3, 6 * SIZE(AO1) MADD3 t1, t1, x2, a2 LD y3, 6 * SIZE(YY) MADD4 t2, t2, x1, 
a2 LD a2, 5 * SIZE(AO1) MADD3 t3, t3, x2, a4 LD y4, 7 * SIZE(YY) MADD4 t4, t4, x1, a4 LD a4, 7 * SIZE(AO1) MADD1 t1, t1, x3, a5 NOP MADD2 t2, t2, x4, a5 LD a5, 4 * SIZE(AO2) MADD1 t3, t3, x3, a7 NOP MADD2 t4, t4, x4, a7 LD a7, 6 * SIZE(AO2) MADD3 t1, t1, x4, a6 NOP MADD4 t2, t2, x3, a6 LD a6, 5 * SIZE(AO2) MADD3 t3, t3, x4, a8 daddiu I, I, -1 MADD4 t4, t4, x3, a8 blez I, .L13 LD a8, 7 * SIZE(AO2) .align 3 .L12: MADD1 t5, y1, x1, a1 LD y1, 8 * SIZE(YY) MADD2 t6, y2, x2, a1 LD a1, 8 * SIZE(AO1) MADD1 t7, y3, x1, a3 LD y2, 9 * SIZE(YY) MADD2 t8, y4, x2, a3 LD a3, 10 * SIZE(AO1) MADD3 t5, t5, x2, a2 LD y3, 10 * SIZE(YY) MADD4 t6, t6, x1, a2 LD a2, 9 * SIZE(AO1) MADD3 t7, t7, x2, a4 LD y4, 11 * SIZE(YY) MADD4 t8, t8, x1, a4 LD a4, 11 * SIZE(AO1) MADD1 t5, t5, x3, a5 ST t1, 0 * SIZE(YY) MADD2 t6, t6, x4, a5 LD a5, 8 * SIZE(AO2) MADD1 t7, t7, x3, a7 ST t2, 1 * SIZE(YY) MADD2 t8, t8, x4, a7 LD a7, 10 * SIZE(AO2) MADD3 t5, t5, x4, a6 ST t3, 2 * SIZE(YY) MADD4 t6, t6, x3, a6 LD a6, 9 * SIZE(AO2) MADD3 t7, t7, x4, a8 ST t4, 3 * SIZE(YY) MADD4 t8, t8, x3, a8 LD a8, 11 * SIZE(AO2) MADD1 t1, y1, x1, a1 LD y1, 12 * SIZE(YY) MADD2 t2, y2, x2, a1 LD a1, 12 * SIZE(AO1) MADD1 t3, y3, x1, a3 LD y2, 13 * SIZE(YY) MADD2 t4, y4, x2, a3 LD a3, 14 * SIZE(AO1) MADD3 t1, t1, x2, a2 LD y3, 14 * SIZE(YY) MADD4 t2, t2, x1, a2 LD a2, 13 * SIZE(AO1) MADD3 t3, t3, x2, a4 LD y4, 15 * SIZE(YY) MADD4 t4, t4, x1, a4 LD a4, 15 * SIZE(AO1) MADD1 t1, t1, x3, a5 ST t5, 4 * SIZE(YY) MADD2 t2, t2, x4, a5 LD a5, 12 * SIZE(AO2) MADD1 t3, t3, x3, a7 ST t6, 5 * SIZE(YY) MADD2 t4, t4, x4, a7 LD a7, 14 * SIZE(AO2) MADD3 t1, t1, x4, a6 ST t7, 6 * SIZE(YY) MADD4 t2, t2, x3, a6 LD a6, 13 * SIZE(AO2) MADD3 t3, t3, x4, a8 ST t8, 7 * SIZE(YY) MADD4 t4, t4, x3, a8 LD a8, 15 * SIZE(AO2) daddiu I, I, -1 daddiu YY, YY, 8 * SIZE daddiu AO1, AO1, 8 * SIZE bgtz I, .L12 daddiu AO2, AO2, 8 * SIZE .align 3 .L13: ST t1, 0 * SIZE(YY) MADD1 t1, y1, x1, a1 ST t2, 1 * SIZE(YY) MADD2 t2, y2, x2, a1 ST t3, 2 * SIZE(YY) MADD1 t3, y3, x1, a3 ST t4, 3 * SIZE(YY) MADD2 t4, y4, x2, a3 MADD3 t1, t1, x2, a2 MADD4 t2, t2, x1, a2 MADD3 t3, t3, x2, a4 MADD4 t4, t4, x1, a4 MADD1 t1, t1, x3, a5 MADD2 t2, t2, x4, a5 MADD1 t3, t3, x3, a7 MADD2 t4, t4, x4, a7 MADD3 t1, t1, x4, a6 daddiu AO1, AO1, 8 * SIZE MADD4 t2, t2, x3, a6 daddiu AO2, AO2, 8 * SIZE MADD3 t3, t3, x4, a8 daddiu YY, YY, 8 * SIZE MADD4 t4, t4, x3, a8 NOP ST t1, -4 * SIZE(YY) ST t2, -3 * SIZE(YY) ST t3, -2 * SIZE(YY) ST t4, -1 * SIZE(YY) .align 3 .L15: andi I, M, 2 NOP blez I, .L16 NOP LD a1, 0 * SIZE(AO1) LD y1, 0 * SIZE(YY) LD a2, 1 * SIZE(AO1) LD y2, 1 * SIZE(YY) LD a3, 2 * SIZE(AO1) LD y3, 2 * SIZE(YY) LD a4, 3 * SIZE(AO1) LD y4, 3 * SIZE(YY) MADD1 t1, y1, x1, a1 LD a5, 0 * SIZE(AO2) MADD2 t2, y2, x2, a1 LD a6, 1 * SIZE(AO2) MADD1 t3, y3, x1, a3 LD a7, 2 * SIZE(AO2) MADD2 t4, y4, x2, a3 LD a8, 3 * SIZE(AO2) MADD3 t1, t1, x2, a2 MADD4 t2, t2, x1, a2 MADD3 t3, t3, x2, a4 MADD4 t4, t4, x1, a4 MADD1 t1, t1, x3, a5 MADD2 t2, t2, x4, a5 MADD1 t3, t3, x3, a7 MADD2 t4, t4, x4, a7 MADD3 t1, t1, x4, a6 daddiu YY, YY, 4 * SIZE MADD4 t2, t2, x3, a6 daddiu AO1, AO1, 4 * SIZE MADD3 t3, t3, x4, a8 daddiu AO2, AO2, 4 * SIZE MADD4 t4, t4, x3, a8 NOP ST t1, -4 * SIZE(YY) ST t2, -3 * SIZE(YY) ST t3, -2 * SIZE(YY) ST t4, -1 * SIZE(YY) .align 3 .L16: andi I, M, 1 NOP blez I, .L19 NOP LD y1, 0 * SIZE(YY) LD y2, 1 * SIZE(YY) LD a1, 0 * SIZE(AO1) LD a2, 1 * SIZE(AO1) MADD1 t1, y1, x1, a1 LD a5, 0 * SIZE(AO2) MADD2 t2, y2, x2, a1 LD a6, 1 * SIZE(AO2) MADD3 t1, t1, x2, a2 MADD4 t2, t2, x1, a2 MADD1 t1, t1, x3, a5 MADD2 t2, t2, x4, 
a5 MADD3 t1, t1, x4, a6 MADD4 t2, t2, x3, a6 ST t1, 0 * SIZE(YY) ST t2, 1 * SIZE(YY) .align 3 .L19: daddiu J, J, -1 bgtz J, .L11 NOP .align 3 .L20: andi J, N, 1 blez J, .L900 NOP LD x1, 0 * SIZE(X) LD x2, 1 * SIZE(X) daddu X, X, INCX MUL a1, ALPHA_R, x1 move AO1, A MUL a2, ALPHA_I, x1 #ifndef XCONJ NMSUB x1, a1, ALPHA_I, x2 MADD x2, a2, ALPHA_R, x2 #else MADD x1, a1, ALPHA_I, x2 MSUB x2, a2, ALPHA_R, x2 #endif dsra I, M, 2 blez I, .L25 move YY, YORIG LD y1, 0 * SIZE(YY) LD a1, 0 * SIZE(AO1) LD y2, 1 * SIZE(YY) LD a3, 2 * SIZE(AO1) LD y3, 2 * SIZE(YY) LD a2, 1 * SIZE(AO1) LD y4, 3 * SIZE(YY) LD a4, 3 * SIZE(AO1) MADD1 t1, y1, x1, a1 LD y1, 4 * SIZE(YY) MADD2 t2, y2, x2, a1 LD a1, 4 * SIZE(AO1) MADD1 t3, y3, x1, a3 LD y2, 5 * SIZE(YY) MADD2 t4, y4, x2, a3 LD a3, 6 * SIZE(AO1) MADD3 t1, t1, x2, a2 LD y3, 6 * SIZE(YY) MADD4 t2, t2, x1, a2 LD a2, 5 * SIZE(AO1) MADD3 t3, t3, x2, a4 LD y4, 7 * SIZE(YY) MADD4 t4, t4, x1, a4 daddiu I, I, -1 blez I, .L23 LD a4, 7 * SIZE(AO1) .align 3 .L22: MADD1 t5, y1, x1, a1 LD y1, 8 * SIZE(YY) MADD2 t6, y2, x2, a1 LD a1, 8 * SIZE(AO1) MADD1 t7, y3, x1, a3 LD y2, 9 * SIZE(YY) MADD2 t8, y4, x2, a3 LD a3, 10 * SIZE(AO1) MADD3 t5, t5, x2, a2 LD y3, 10 * SIZE(YY) MADD4 t6, t6, x1, a2 LD a2, 9 * SIZE(AO1) MADD3 t7, t7, x2, a4 LD y4, 11 * SIZE(YY) MADD4 t8, t8, x1, a4 LD a4, 11 * SIZE(AO1) ST t1, 0 * SIZE(YY) ST t2, 1 * SIZE(YY) ST t3, 2 * SIZE(YY) ST t4, 3 * SIZE(YY) MADD1 t1, y1, x1, a1 LD y1, 12 * SIZE(YY) MADD2 t2, y2, x2, a1 LD a1, 12 * SIZE(AO1) MADD1 t3, y3, x1, a3 LD y2, 13 * SIZE(YY) MADD2 t4, y4, x2, a3 LD a3, 14 * SIZE(AO1) MADD3 t1, t1, x2, a2 LD y3, 14 * SIZE(YY) MADD4 t2, t2, x1, a2 LD a2, 13 * SIZE(AO1) MADD3 t3, t3, x2, a4 LD y4, 15 * SIZE(YY) MADD4 t4, t4, x1, a4 LD a4, 15 * SIZE(AO1) ST t5, 4 * SIZE(YY) ST t6, 5 * SIZE(YY) ST t7, 6 * SIZE(YY) ST t8, 7 * SIZE(YY) daddiu I, I, -1 daddiu YY, YY, 8 * SIZE bgtz I, .L22 daddiu AO1, AO1, 8 * SIZE .align 3 .L23: ST t1, 0 * SIZE(YY) MADD1 t1, y1, x1, a1 ST t2, 1 * SIZE(YY) MADD2 t2, y2, x2, a1 ST t3, 2 * SIZE(YY) MADD1 t3, y3, x1, a3 ST t4, 3 * SIZE(YY) MADD2 t4, y4, x2, a3 MADD3 t1, t1, x2, a2 daddiu AO1, AO1, 8 * SIZE MADD4 t2, t2, x1, a2 daddiu YY, YY, 8 * SIZE MADD3 t3, t3, x2, a4 MADD4 t4, t4, x1, a4 ST t1, -4 * SIZE(YY) ST t2, -3 * SIZE(YY) ST t3, -2 * SIZE(YY) ST t4, -1 * SIZE(YY) .align 3 .L25: andi I, M, 2 NOP blez I, .L26 NOP LD a1, 0 * SIZE(AO1) LD y1, 0 * SIZE(YY) LD a2, 1 * SIZE(AO1) LD y2, 1 * SIZE(YY) LD a3, 2 * SIZE(AO1) LD y3, 2 * SIZE(YY) LD a4, 3 * SIZE(AO1) LD y4, 3 * SIZE(YY) MADD1 t1, y1, x1, a1 MADD2 t2, y2, x2, a1 MADD1 t3, y3, x1, a3 MADD2 t4, y4, x2, a3 MADD3 t1, t1, x2, a2 daddiu YY, YY, 4 * SIZE MADD4 t2, t2, x1, a2 daddiu AO1, AO1, 4 * SIZE MADD3 t3, t3, x2, a4 MADD4 t4, t4, x1, a4 ST t1, -4 * SIZE(YY) ST t2, -3 * SIZE(YY) ST t3, -2 * SIZE(YY) ST t4, -1 * SIZE(YY) .align 3 .L26: andi I, M, 1 NOP blez I, .L900 NOP LD y1, 0 * SIZE(YY) LD y2, 1 * SIZE(YY) LD a1, 0 * SIZE(AO1) LD a2, 1 * SIZE(AO1) MADD1 t1, y1, x1, a1 MADD2 t2, y2, x2, a1 MADD3 t1, t1, x2, a2 MADD4 t2, t2, x1, a2 ST t1, 0 * SIZE(YY) ST t2, 1 * SIZE(YY) .align 3 .L900: li YORIG, 2 * SIZE beq INCY, YORIG, .L999 dsra I, M, 2 blez I, .L905 move XX, BUFFER .align 3 .L902: LD a1, 0 * SIZE(XX) LD a2, 1 * SIZE(XX) LD a3, 2 * SIZE(XX) LD a4, 3 * SIZE(XX) LD a5, 4 * SIZE(XX) LD a6, 5 * SIZE(XX) LD a7, 6 * SIZE(XX) LD a8, 7 * SIZE(XX) daddiu I, I, -1 ST a1, 0 * SIZE(Y) ST a2, 1 * SIZE(Y) daddu Y, Y, INCY ST a3, 0 * SIZE(Y) ST a4, 1 * SIZE(Y) daddu Y, Y, INCY ST a5, 0 * SIZE(Y) ST a6, 1 * SIZE(Y) daddu Y, Y, INCY ST a7, 0 * SIZE(Y) 
ST a8, 1 * SIZE(Y) daddu Y, Y, INCY bgtz I, .L902 daddiu XX, XX, 8 * SIZE .align 3 .L905: andi I, M, 3 blez I, .L999 NOP .align 3 .L906: LD a1, 0 * SIZE(XX) LD a2, 1 * SIZE(XX) daddiu XX, XX, 2 * SIZE daddiu I, I, -1 ST a1, 0 * SIZE(Y) ST a2, 1 * SIZE(Y) bgtz I, .L906 daddu Y, Y, INCY .align 3 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) ldc1 $f24, 16($sp) ldc1 $f25, 24($sp) #ifndef __64BIT__ ldc1 $f20, 32($sp) ldc1 $f21, 40($sp) ldc1 $f22, 48($sp) ldc1 $f23, 56($sp) #endif j $31 #ifdef __64BIT__ daddiu $sp, $sp, 32 #else daddiu $sp, $sp, 64 #endif EPILOGUE OpenBLAS-0.2.20/kernel/mips64/zgemv_n_loongson3a.c000066400000000000000000000142701313527062700215250ustar00rootroot00000000000000#include "common.h" //typedef int BLASLONG; //typedef double FLOAT; #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) #if !defined(CONJ) && !defined(XCONJ) #define spec_loop_alpha1 spec_loop_alpha1_0 #define spec_loop spec_loop_0 #define norm_loop_alpha1 norm_loop_alpha1_0 #define norm_loop norm_loop_0 #endif #if defined(CONJ) && !defined(XCONJ) #define spec_loop_alpha1 spec_loop_alpha1_1 #define spec_loop spec_loop_1 #define norm_loop_alpha1 norm_loop_alpha1_1 #define norm_loop norm_loop_1 #endif #if !defined(CONJ) && defined(XCONJ) #define spec_loop_alpha1 spec_loop_alpha1_2 #define spec_loop spec_loop_2 #define norm_loop_alpha1 norm_loop_alpha1_2 #define norm_loop norm_loop_2 #endif #if defined(CONJ) && defined(XCONJ) #define spec_loop_alpha1 spec_loop_alpha1_3 #define spec_loop spec_loop_3 #define norm_loop_alpha1 norm_loop_alpha1_3 #define norm_loop norm_loop_3 #endif #define spec_loop_alpha1_0 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) #define spec_loop_alpha1_1 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) #define spec_loop_alpha1_2 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) #define spec_loop_alpha1_3 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0) #define spec_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) #define spec_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) #define spec_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) #define spec_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) #define norm_loop_alpha1_0 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) 
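/* Note added for clarity (not part of the original kernel): each *_loop_* macro
 * above and below expands one step of the complex column update
 *     Y[i] += ALPHA * op(A[i][j]) * op(X[j]),
 * where CONJ conjugates the matrix element and XCONJ conjugates the X element;
 * the spec_* variants assume INCY == 1, while the norm_* variants step Y with a
 * strided index (iii).  A minimal scalar reference for the plain, alpha == 1
 * case is sketched below; it is illustrative only (the helper name
 * zgemv_n_ref_update is hypothetical) and is disabled so it cannot affect the
 * build. */
#if 0
static void zgemv_n_ref_update(FLOAT ar, FLOAT ai,   /* A[jj + ii], A[jj + ii + 1] */
                               FLOAT xr, FLOAT xi,   /* X[k],       X[k + 1]       */
                               FLOAT *yr, FLOAT *yi) /* Y[ii],      Y[ii + 1]      */
{
    /* (yr + i*yi) += (ar + i*ai) * (xr + i*xi), i.e. spec_loop_alpha1_0 */
    *yr += ar * xr - ai * xi;
    *yi += ar * xi + ai * xr;
}
#endif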
#define norm_loop_alpha1_1 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) #define norm_loop_alpha1_2 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) #define norm_loop_alpha1_3 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0) #define norm_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) #define norm_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) #define norm_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) #define norm_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0) int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { if(!rALPHA && iALPHA) return 0; BLASLONG fahead = 60; BLASLONG spec_unroll = 2; BLASLONG tMQ = M - M % spec_unroll; BLASLONG j = 0, k = 0, jj = 0; if(rALPHA == 1 && iALPHA == 0) { if(INCY == 1) { for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { BLASLONG i = 0, ii = 0; for(; likely(i < tMQ); i += spec_unroll) { prefetch(A[jj + ii + fahead]); prefetch(Y[ii + fahead]); /*loop_mark*/ spec_loop_alpha1; /*loop_mark*/ spec_loop_alpha1; } for(; likely(i < M); i++) { spec_loop_alpha1; } } } else { for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { BLASLONG i = 0, ii = 0, iii = 0; for(; likely(i < tMQ); i += spec_unroll) { prefetch(A[jj + ii + fahead]); prefetch(Y[iii + fahead]); /*loop_mark*/ norm_loop_alpha1; /*loop_mark*/ norm_loop_alpha1; } for(; likely(i < M); i++) { norm_loop_alpha1; } } } } else { FLOAT rTmp, iTmp; if(INCY == 1) { for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { BLASLONG i = 0, ii = 0; for(; likely(i < tMQ); i += spec_unroll) { prefetch(A[jj + ii + fahead]); prefetch(Y[ii + fahead]); /*loop_mark*/ spec_loop; /*loop_mark*/ spec_loop; } for(; likely(i < M); i++) { spec_loop; } } } else { for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) { BLASLONG i = 0, ii = 0, iii = 0; for(; likely(i < tMQ); i += spec_unroll) { prefetch(A[jj + ii + fahead]); prefetch(Y[iii + fahead]); /*loop_mark*/ norm_loop; /*loop_mark*/ norm_loop; } for(; likely(i < M); i++) { norm_loop; } } } } return 0; } OpenBLAS-0.2.20/kernel/mips64/zgemv_t.S000066400000000000000000000306221313527062700173500ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M $4 #define N $5 #define A $9 #define LDA $10 #define X $11 #define INCX $2 #define Y $6 #define INCY $7 #define BUFFER $8 #define XORIG $3 #define XX $12 #define YY $13 #define I $14 #define J $15 #define AO1 $16 #define AO2 $17 #define ALPHA_R $f15 #define ALPHA_I $f16 #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define a5 $f4 #define a6 $f5 #define a7 $f6 #define a8 $f7 #define y1 $f8 #define y2 $f9 #define y3 $f10 #define y4 $f11 #define x1 $f12 #define x2 $f13 #define x3 $f14 #define x4 $f17 #define x5 $f18 #define x6 $f19 #define x7 $f20 #define x8 $f21 #if !defined(CONJ) && !defined(XCONJ) #define MADD1 MADD #define MADD2 MADD #define MADD3 NMSUB #define MADD4 MADD #endif #if defined(CONJ) && !defined(XCONJ) #define MADD1 MADD #define MADD2 MADD #define MADD3 MADD #define MADD4 NMSUB #endif #if !defined(CONJ) && defined(XCONJ) #define MADD1 MADD #define MADD2 NMSUB #define MADD3 MADD #define MADD4 MADD #endif #if defined(CONJ) && defined(XCONJ) #define MADD1 MADD #define MADD2 NMSUB #define MADD3 NMSUB #define MADD4 NMSUB #endif PROLOGUE LDARG INCX, 0($sp) LDARG Y, 8($sp) LDARG INCY, 16($sp) LDARG BUFFER, 24($sp) #ifdef __64BIT__ daddiu $sp, $sp, -16 #else daddiu $sp, $sp, -32 #endif MTC $0, y1 SDARG $16, 0($sp) SDARG $17, 8($sp) dsll LDA, LDA, ZBASE_SHIFT #ifndef __64BIT__ sdc1 $f20, 16($sp) sdc1 $f21, 24($sp) #endif blez M, .L999 dsll INCX, INCX, ZBASE_SHIFT blez N, .L999 dsll INCY, INCY, ZBASE_SHIFT li XORIG, 2 * SIZE beq INCX, XORIG, .L10 move XORIG, X dsra I, M, 2 move XORIG, BUFFER blez I, .L05 move YY, BUFFER .align 3 .L02: LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) LD a4, 1 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) LD a6, 1 * SIZE(X) daddu X, X, INCX LD a7, 0 * SIZE(X) 
LD a8, 1 * SIZE(X) daddu X, X, INCX daddiu I, I, -1 daddiu YY, YY, 8 * SIZE ST a1, -8 * SIZE(YY) ST a2, -7 * SIZE(YY) ST a3, -6 * SIZE(YY) ST a4, -5 * SIZE(YY) ST a5, -4 * SIZE(YY) ST a6, -3 * SIZE(YY) ST a7, -2 * SIZE(YY) bgtz I, .L02 ST a8, -1 * SIZE(YY) .align 3 .L05: andi I, M, 3 blez I, .L10 NOP .align 3 .L06: LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) daddu X, X, INCX ST a1, 0 * SIZE(YY) ST a2, 1 * SIZE(YY) daddiu I, I, -1 bgtz I, .L06 daddiu YY, YY, 2 * SIZE .align 3 .L10: dsra J, N, 1 blez J, .L20 move YY, Y .align 3 .L11: move AO1, A MOV y2, y1 daddu AO2, A, LDA MOV y3, y1 daddu A, AO2, LDA MOV y4, y1 dsra I, M, 2 blez I, .L15 move XX, XORIG LD x1, 0 * SIZE(XX) LD x2, 1 * SIZE(XX) LD x4, 3 * SIZE(XX) LD a1, 0 * SIZE(AO1) LD a3, 0 * SIZE(AO2) LD a2, 1 * SIZE(AO1) LD a4, 1 * SIZE(AO2) LD a5, 2 * SIZE(AO1) LD a7, 2 * SIZE(AO2) LD a6, 3 * SIZE(AO1) LD a8, 3 * SIZE(AO2) daddiu I, I, -1 blez I, .L13 NOP .align 3 .L12: MADD1 y1, y1, x1, a1 LD x3, 2 * SIZE(XX) MADD2 y2, y2, x2, a1 LD a1, 4 * SIZE(AO1) MADD1 y3, y3, x1, a3 NOP MADD2 y4, y4, x2, a3 LD a3, 4 * SIZE(AO2) MADD3 y1, y1, x2, a2 NOP MADD4 y2, y2, x1, a2 LD a2, 5 * SIZE(AO1) MADD3 y3, y3, x2, a4 LD x2, 5 * SIZE(XX) MADD4 y4, y4, x1, a4 LD a4, 5 * SIZE(AO2) MADD1 y1, y1, x3, a5 LD x1, 4 * SIZE(XX) MADD2 y2, y2, x4, a5 LD a5, 6 * SIZE(AO1) MADD1 y3, y3, x3, a7 MADD2 y4, y4, x4, a7 LD a7, 6 * SIZE(AO2) MADD3 y1, y1, x4, a6 daddiu I, I, -1 MADD4 y2, y2, x3, a6 LD a6, 7 * SIZE(AO1) MADD3 y3, y3, x4, a8 LD x4, 7 * SIZE(XX) MADD4 y4, y4, x3, a8 LD a8, 7 * SIZE(AO2) MADD1 y1, y1, x1, a1 LD x3, 6 * SIZE(XX) MADD2 y2, y2, x2, a1 LD a1, 8 * SIZE(AO1) MADD1 y3, y3, x1, a3 MADD2 y4, y4, x2, a3 LD a3, 8 * SIZE(AO2) MADD3 y1, y1, x2, a2 MADD4 y2, y2, x1, a2 LD a2, 9 * SIZE(AO1) MADD3 y3, y3, x2, a4 LD x2, 9 * SIZE(XX) MADD4 y4, y4, x1, a4 LD a4, 9 * SIZE(AO2) MADD1 y1, y1, x3, a5 LD x1, 8 * SIZE(XX) MADD2 y2, y2, x4, a5 LD a5, 10 * SIZE(AO1) MADD1 y3, y3, x3, a7 daddiu XX, XX, 8 * SIZE MADD2 y4, y4, x4, a7 LD a7, 10 * SIZE(AO2) MADD3 y1, y1, x4, a6 daddiu AO2, AO2, 8 * SIZE MADD4 y2, y2, x3, a6 LD a6, 11 * SIZE(AO1) MADD3 y3, y3, x4, a8 LD x4, 3 * SIZE(XX) MADD4 y4, y4, x3, a8 LD a8, 3 * SIZE(AO2) bgtz I, .L12 daddiu AO1, AO1, 8 * SIZE .align 3 .L13: MADD1 y1, y1, x1, a1 LD x3, 2 * SIZE(XX) MADD2 y2, y2, x2, a1 LD a1, 4 * SIZE(AO1) MADD1 y3, y3, x1, a3 NOP MADD2 y4, y4, x2, a3 LD a3, 4 * SIZE(AO2) MADD3 y1, y1, x2, a2 NOP MADD4 y2, y2, x1, a2 LD a2, 5 * SIZE(AO1) MADD3 y3, y3, x2, a4 LD x2, 5 * SIZE(XX) MADD4 y4, y4, x1, a4 LD a4, 5 * SIZE(AO2) MADD1 y1, y1, x3, a5 LD x1, 4 * SIZE(XX) MADD2 y2, y2, x4, a5 LD a5, 6 * SIZE(AO1) MADD1 y3, y3, x3, a7 MADD2 y4, y4, x4, a7 LD a7, 6 * SIZE(AO2) MADD3 y1, y1, x4, a6 NOP MADD4 y2, y2, x3, a6 LD a6, 7 * SIZE(AO1) MADD3 y3, y3, x4, a8 LD x4, 7 * SIZE(XX) MADD4 y4, y4, x3, a8 LD a8, 7 * SIZE(AO2) MADD1 y1, y1, x1, a1 LD x3, 6 * SIZE(XX) MADD2 y2, y2, x2, a1 NOP MADD1 y3, y3, x1, a3 MADD2 y4, y4, x2, a3 MADD3 y1, y1, x2, a2 MADD4 y2, y2, x1, a2 MADD3 y3, y3, x2, a4 MADD4 y4, y4, x1, a4 MADD1 y1, y1, x3, a5 MADD2 y2, y2, x4, a5 MADD1 y3, y3, x3, a7 MADD2 y4, y4, x4, a7 MADD3 y1, y1, x4, a6 daddiu XX, XX, 8 * SIZE MADD4 y2, y2, x3, a6 daddiu AO1, AO1, 8 * SIZE MADD3 y3, y3, x4, a8 daddiu AO2, AO2, 8 * SIZE MADD4 y4, y4, x3, a8 NOP .align 3 .L15: andi I, M, 2 NOP blez I, .L17 NOP LD x1, 0 * SIZE(XX) LD x2, 1 * SIZE(XX) LD x3, 2 * SIZE(XX) LD x4, 3 * SIZE(XX) LD a1, 0 * SIZE(AO1) LD a3, 0 * SIZE(AO2) LD a2, 1 * SIZE(AO1) LD a4, 1 * SIZE(AO2) LD a5, 2 * SIZE(AO1) LD a7, 2 * SIZE(AO2) LD a6, 3 * SIZE(AO1) LD a8, 3 * 
SIZE(AO2) MADD1 y1, y1, x1, a1 MADD2 y2, y2, x2, a1 MADD1 y3, y3, x1, a3 MADD2 y4, y4, x2, a3 MADD3 y1, y1, x2, a2 MADD4 y2, y2, x1, a2 MADD3 y3, y3, x2, a4 MADD4 y4, y4, x1, a4 MADD1 y1, y1, x3, a5 MADD2 y2, y2, x4, a5 MADD1 y3, y3, x3, a7 MADD2 y4, y4, x4, a7 MADD3 y1, y1, x4, a6 daddiu XX, XX, 4 * SIZE MADD4 y2, y2, x3, a6 daddiu AO1, AO1, 4 * SIZE MADD3 y3, y3, x4, a8 daddiu AO2, AO2, 4 * SIZE MADD4 y4, y4, x3, a8 NOP .align 3 .L17: andi I, M, 1 blez I, .L19 .align 3 .L18: LD x1, 0 * SIZE(XX) LD x2, 1 * SIZE(XX) LD a1, 0 * SIZE(AO1) LD a3, 0 * SIZE(AO2) MADD1 y1, y1, x1, a1 LD a2, 1 * SIZE(AO1) MADD2 y2, y2, x2, a1 LD a4, 1 * SIZE(AO2) MADD1 y3, y3, x1, a3 MADD2 y4, y4, x2, a3 MADD3 y1, y1, x2, a2 MADD4 y2, y2, x1, a2 MADD3 y3, y3, x2, a4 MADD4 y4, y4, x1, a4 .align 3 .L19: LD a1, 0 * SIZE(Y) LD a2, 1 * SIZE(Y) daddu Y, Y, INCY LD a3, 0 * SIZE(Y) LD a4, 1 * SIZE(Y) daddu Y, Y, INCY MADD a1, a1, ALPHA_R, y1 MADD a2, a2, ALPHA_I, y1 MADD a3, a3, ALPHA_R, y3 MADD a4, a4, ALPHA_I, y3 NMSUB a1, a1, ALPHA_I, y2 MADD a2, a2, ALPHA_R, y2 NMSUB a3, a3, ALPHA_I, y4 MTC $0, y1 MADD a4, a4, ALPHA_R, y4 daddiu J, J, -1 ST a1, 0 * SIZE(YY) ST a2, 1 * SIZE(YY) daddu YY, YY, INCY ST a3, 0 * SIZE(YY) ST a4, 1 * SIZE(YY) bgtz J, .L11 daddu YY, YY, INCY .align 3 .L20: andi J, N, 1 MOV y2, y1 blez J, .L999 dsra I, M, 2 MOV y3, y1 move AO1, A MOV y4, y1 blez I, .L25 move XX, XORIG LD a1, 0 * SIZE(AO1) LD x1, 0 * SIZE(XX) LD a2, 1 * SIZE(AO1) LD x2, 1 * SIZE(XX) LD a5, 2 * SIZE(AO1) LD x4, 3 * SIZE(XX) daddiu I, I, -1 blez I, .L23 LD a6, 3 * SIZE(AO1) .align 3 .L22: MADD1 y1, y1, x1, a1 LD x3, 2 * SIZE(XX) MADD2 y2, y2, x2, a1 LD a1, 4 * SIZE(AO1) MADD3 y3, y3, x2, a2 LD x2, 5 * SIZE(XX) MADD4 y4, y4, x1, a2 LD a2, 5 * SIZE(AO1) MADD1 y1, y1, x3, a5 LD x1, 4 * SIZE(XX) MADD2 y2, y2, x4, a5 LD a5, 6 * SIZE(AO1) MADD3 y3, y3, x4, a6 LD x4, 7 * SIZE(XX) MADD4 y4, y4, x3, a6 LD a6, 7 * SIZE(AO1) MADD1 y1, y1, x1, a1 LD x3, 6 * SIZE(XX) MADD2 y2, y2, x2, a1 LD a1, 8 * SIZE(AO1) MADD3 y3, y3, x2, a2 LD x2, 9 * SIZE(XX) MADD4 y4, y4, x1, a2 LD a2, 9 * SIZE(AO1) MADD1 y1, y1, x3, a5 LD x1, 8 * SIZE(XX) MADD2 y2, y2, x4, a5 LD a5, 10 * SIZE(AO1) MADD3 y3, y3, x4, a6 LD x4, 11 * SIZE(XX) MADD4 y4, y4, x3, a6 LD a6, 11 * SIZE(AO1) daddiu I, I, -1 daddiu XX, XX, 8 * SIZE bgtz I, .L22 daddiu AO1, AO1, 8 * SIZE .align 3 .L23: MADD1 y1, y1, x1, a1 LD x3, 2 * SIZE(XX) MADD2 y2, y2, x2, a1 LD a1, 4 * SIZE(AO1) MADD3 y3, y3, x2, a2 LD x2, 5 * SIZE(XX) MADD4 y4, y4, x1, a2 LD a2, 5 * SIZE(AO1) MADD1 y1, y1, x3, a5 LD x1, 4 * SIZE(XX) MADD2 y2, y2, x4, a5 LD a5, 6 * SIZE(AO1) MADD3 y3, y3, x4, a6 LD x4, 7 * SIZE(XX) MADD4 y4, y4, x3, a6 LD a6, 7 * SIZE(AO1) MADD1 y1, y1, x1, a1 LD x3, 6 * SIZE(XX) MADD2 y2, y2, x2, a1 NOP MADD3 y3, y3, x2, a2 MADD4 y4, y4, x1, a2 MADD1 y1, y1, x3, a5 MADD2 y2, y2, x4, a5 MADD3 y3, y3, x4, a6 daddiu XX, XX, 8 * SIZE MADD4 y4, y4, x3, a6 daddiu AO1, AO1, 8 * SIZE NOP .align 3 .L25: andi I, M, 2 NOP blez I, .L27 NOP LD a1, 0 * SIZE(AO1) LD x1, 0 * SIZE(XX) LD a2, 1 * SIZE(AO1) LD x2, 1 * SIZE(XX) LD a5, 2 * SIZE(AO1) MADD1 y1, y1, x1, a1 LD x3, 2 * SIZE(XX) MADD2 y2, y2, x2, a1 LD a6, 3 * SIZE(AO1) MADD3 y3, y3, x2, a2 LD x4, 3 * SIZE(XX) MADD4 y4, y4, x1, a2 MADD1 y1, y1, x3, a5 MADD2 y2, y2, x4, a5 MADD3 y3, y3, x4, a6 daddiu XX, XX, 4 * SIZE MADD4 y4, y4, x3, a6 daddiu AO1, AO1, 4 * SIZE .align 3 .L27: andi I, M, 1 blez I, .L29 .align 3 .L28: LD a1, 0 * SIZE(AO1) LD x1, 0 * SIZE(XX) LD a2, 1 * SIZE(AO1) LD x2, 1 * SIZE(XX) MADD1 y1, y1, x1, a1 MADD2 y2, y2, x2, a1 MADD3 y3, y3, x2, a2 MADD4 
y4, y4, x1, a2 .align 3 .L29: LD a1, 0 * SIZE(Y) LD a2, 1 * SIZE(Y) ADD y1, y1, y3 ADD y2, y2, y4 MADD a1, a1, ALPHA_R, y1 MADD a2, a2, ALPHA_I, y1 NMSUB a1, a1, ALPHA_I, y2 MADD a2, a2, ALPHA_R, y2 ST a1, 0 * SIZE(YY) ST a2, 1 * SIZE(YY) .align 3 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) #ifndef __64BIT__ ldc1 $f20, 16($sp) ldc1 $f21, 24($sp) #endif j $31 #ifdef __64BIT__ daddiu $sp, $sp, 16 #else daddiu $sp, $sp, 32 #endif EPILOGUE OpenBLAS-0.2.20/kernel/mips64/zgemv_t_loongson3a.c000066400000000000000000000142441313527062700215340ustar00rootroot00000000000000#include "common.h" #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) #if !defined(CONJ) && !defined(XCONJ) #define spec_loop_alpha1 spec_loop_alpha1_0 #define spec_loop spec_loop_0 #define norm_loop_alpha1 norm_loop_alpha1_0 #define norm_loop norm_loop_0 #endif #if defined(CONJ) && !defined(XCONJ) #define spec_loop_alpha1 spec_loop_alpha1_1 #define spec_loop spec_loop_1 #define norm_loop_alpha1 norm_loop_alpha1_1 #define norm_loop norm_loop_1 #endif #if !defined(CONJ) && defined(XCONJ) #define spec_loop_alpha1 spec_loop_alpha1_2 #define spec_loop spec_loop_2 #define norm_loop_alpha1 norm_loop_alpha1_2 #define norm_loop norm_loop_2 #endif #if defined(CONJ) && defined(XCONJ) #define spec_loop_alpha1 spec_loop_alpha1_3 #define spec_loop spec_loop_3 #define norm_loop_alpha1 norm_loop_alpha1_3 #define norm_loop norm_loop_3 #endif #define spec_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) #define spec_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) #define spec_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) #define spec_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0) #define spec_loop_0 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) #define spec_loop_1 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) #define spec_loop_2 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) #define spec_loop_3 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0) #define norm_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) #define norm_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 
2; iii += INCX * 2;} while(0) #define norm_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) #define norm_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0) #define norm_loop_0 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) #define norm_loop_1 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) #define norm_loop_2 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) #define norm_loop_3 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0) int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { if(!rALPHA && iALPHA) return 0; BLASLONG fahead = 30; BLASLONG spec_unroll = 2; BLASLONG tMQ = M - M % spec_unroll; BLASLONG j = 0, k = 0, jj = 0; if(rALPHA == 1 && iALPHA == 0) { if(INCX == 1) { for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { BLASLONG i = 0, ii = 0; for(; likely(i < tMQ); i += spec_unroll) { prefetch(A[jj + ii + fahead]); prefetch(X[ii + fahead]); /*loop_mark*/ spec_loop_alpha1; /*loop_mark*/ spec_loop_alpha1; } for(; likely(i < M); i++) { spec_loop_alpha1; } } } else { for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { BLASLONG i = 0, ii = 0, iii = 0; for(; likely(i < tMQ); i += spec_unroll) { prefetch(A[jj + ii + fahead]); prefetch(X[iii + fahead]); /*loop_mark*/ norm_loop_alpha1; /*loop_mark*/ norm_loop_alpha1; } for(; likely(i < M); i++) { norm_loop_alpha1; } } } } else { FLOAT rTmp, iTmp; if(INCX == 1) { for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { BLASLONG i = 0, ii = 0; for(; likely(i < tMQ); i += spec_unroll) { prefetch(A[jj + ii + fahead]); prefetch(X[ii + fahead]); /*loop_mark*/ spec_loop; /*loop_mark*/ spec_loop; } for(; likely(i < M); i++) { spec_loop; } } } else { for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) { BLASLONG i = 0, ii = 0, iii = 0; for(; likely(i < tMQ); i += spec_unroll) { prefetch(A[jj + ii + fahead]); prefetch(X[iii + fahead]); /*loop_mark*/ norm_loop; /*loop_mark*/ norm_loop; } for(; likely(i < M); i++) { norm_loop; } } } } return 0; } OpenBLAS-0.2.20/kernel/mips64/znrm2.S000066400000000000000000000157501313527062700167520ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define XX $7 #define I $2 #define TEMP $3 #define a1 $f4 #define a2 $f5 #define a3 $f6 #define a4 $f7 #define a5 $f8 #define a6 $f9 #define a7 $f10 #define a8 $f11 #define t1 $f12 #define t2 $f13 #define t3 $f14 #define t4 $f15 #define s1 $f0 #define s2 $f1 #define s3 $f2 #define s4 $f3 #define ALPHA $f16 #define max $f17 PROLOGUE #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif blez N, .L999 MTC $0, s1 blez INCX, .L999 dsll INCX, INCX, ZBASE_SHIFT move XX, X MOV s2, s1 dsra I, N, 2 MOV s3, s1 blez I, .L15 MOV s4, s1 LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) LD a4, 1 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) LD a6, 1 * SIZE(X) daddu X, X, INCX LD a7, 0 * SIZE(X) LD a8, 1 * SIZE(X) daddiu I, I, -1 blez I, .L13 daddu X, X, INCX .align 3 .L12: FABS t1, a1 LD a1, 0 * SIZE(X) FABS t2, a2 NOP FABS t3, a3 LD a2, 1 * SIZE(X) FABS t4, a4 daddu X, X, INCX CMPLT $fcc0, s1, t1 LD a3, 0 * SIZE(X) CMPLT $fcc1, s2, t2 NOP CMPLT $fcc2, s3, t3 LD a4, 1 * SIZE(X) CMPLT $fcc3, s4, t4 daddu X, X, INCX CMOVT s1, t1, $fcc0 CMOVT s2, t2, $fcc1 CMOVT s3, t3, $fcc2 CMOVT s4, t4, $fcc3 FABS t1, a5 LD a5, 0 * SIZE(X) FABS t2, a6 NOP FABS t3, a7 LD a6, 1 * SIZE(X) FABS t4, a8 daddu X, X, INCX CMPLT $fcc0, s1, t1 LD a7, 0 * SIZE(X) CMPLT $fcc1, s2, t2 NOP CMPLT $fcc2, s3, t3 LD a8, 1 * SIZE(X) CMPLT $fcc3, s4, t4 daddu X, X, INCX CMOVT s1, t1, $fcc0 daddiu I, I, -1 CMOVT s2, t2, $fcc1 CMOVT s3, t3, $fcc2 bgtz I, .L12 CMOVT s4, t4, $fcc3 .align 3 .L13: FABS t1, a1 FABS t2, a2 FABS t3, a3 FABS t4, a4 CMPLT $fcc0, s1, t1 CMPLT $fcc1, s2, t2 CMPLT $fcc2, s3, t3 CMPLT $fcc3, s4, t4 CMOVT s1, t1, $fcc0 CMOVT s2, t2, $fcc1 CMOVT s3, t3, $fcc2 CMOVT s4, t4, $fcc3 FABS t1, a5 FABS t2, a6 FABS t3, a7 FABS t4, a8 CMPLT $fcc0, s1, t1 CMPLT $fcc1, s2, t2 CMPLT $fcc2, s3, t3 CMPLT $fcc3, s4, t4 CMOVT s1, t1, $fcc0 CMOVT s2, t2, $fcc1 CMOVT s3, t3, $fcc2 CMOVT s4, t4, $fcc3 
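# Note added for clarity (not in the original source): at this point s1-s4 hold
# partial maxima of |Re x_i| and |Im x_i| from the unrolled scan; .L15/.L16
# finish the remaining elements and .L100 reduces them to a single maximum.
# The second pass then rescales every element by 1/max before accumulating
# squares, so the result is computed as
#     max * sqrt( sum_i ((Re x_i / max)^2 + (Im x_i / max)^2) ),
# the usual overflow/underflow-safe two-pass nrm2.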
.align 3 .L15: andi I, N, 3 blez I, .L100 NOP .align 3 .L16: LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) daddiu I, I, -1 FABS t1, a1 FABS t2, a2 CMPLT $fcc0, s1, t1 CMPLT $fcc1, s2, t2 CMOVT s1, t1, $fcc0 CMOVT s2, t2, $fcc1 bgtz I, .L16 daddu X, X, INCX .align 3 .L100: CMPLT $fcc0, s1, s2 CMPLT $fcc1, s3, s4 CMOVT s1, s2, $fcc0 CMOVT s3, s4, $fcc1 CMPLT $fcc0, s1, s3 CMOVT s1, s3, $fcc0 lui TEMP, 0x3f80 dmtc1 $0, a1 mtc1 TEMP, ALPHA CMPEQ $fcc0, s1, a1 bc1t $fcc0, .L999 cvt.d.s ALPHA, ALPHA div.d ALPHA, ALPHA, s1 MOV max, s1 MOV s1, a1 MOV s2, a1 MOV s3, a1 MOV s4, a1 dsra I, N, 2 blez I, .L105 NOP LD a1, 0 * SIZE(XX) LD a2, 1 * SIZE(XX) daddu XX, XX, INCX LD a3, 0 * SIZE(XX) LD a4, 1 * SIZE(XX) daddu XX, XX, INCX LD a5, 0 * SIZE(XX) LD a6, 1 * SIZE(XX) daddu XX, XX, INCX LD a7, 0 * SIZE(XX) LD a8, 1 * SIZE(XX) daddiu I, I, -1 blez I, .L104 daddu XX, XX, INCX .align 3 .L103: MUL t1, ALPHA, a1 LD a1, 0 * SIZE(XX) MUL t2, ALPHA, a2 daddiu I, I, -1 MUL t3, ALPHA, a3 LD a2, 1 * SIZE(XX) MUL t4, ALPHA, a4 daddu XX, XX, INCX MADD s1, s1, t1, t1 LD a3, 0 * SIZE(XX) MADD s2, s2, t2, t2 NOP MADD s3, s3, t3, t3 LD a4, 1 * SIZE(XX) MADD s4, s4, t4, t4 daddu XX, XX, INCX MUL t1, ALPHA, a5 LD a5, 0 * SIZE(XX) MUL t2, ALPHA, a6 NOP MUL t3, ALPHA, a7 LD a6, 1 * SIZE(XX) MUL t4, ALPHA, a8 daddu XX, XX, INCX MADD s1, s1, t1, t1 LD a7, 0 * SIZE(XX) MADD s2, s2, t2, t2 LD a8, 1 * SIZE(XX) MADD s3, s3, t3, t3 daddu XX, XX, INCX bgtz I, .L103 MADD s4, s4, t4, t4 .align 3 .L104: MUL t1, ALPHA, a1 MUL t2, ALPHA, a2 MUL t3, ALPHA, a3 MUL t4, ALPHA, a4 MADD s1, s1, t1, t1 MADD s2, s2, t2, t2 MADD s3, s3, t3, t3 MADD s4, s4, t4, t4 MUL t1, ALPHA, a5 MUL t2, ALPHA, a6 MUL t3, ALPHA, a7 MUL t4, ALPHA, a8 MADD s1, s1, t1, t1 MADD s2, s2, t2, t2 MADD s3, s3, t3, t3 MADD s4, s4, t4, t4 .align 3 .L105: andi I, N, 3 blez I, .L998 NOP .align 3 .L106: LD a1, 0 * SIZE(XX) LD a2, 1 * SIZE(XX) daddiu I, I, -1 MUL t1, ALPHA, a1 MUL t2, ALPHA, a2 MADD s1, s1, t1, t1 daddu XX, XX, INCX bgtz I, .L106 MADD s2, s2, t2, t2 .align 3 .L998: ADD s1, s1, s2 ADD s3, s3, s4 ADD s1, s1, s3 sqrt.d s1, s1 j $31 MUL s1, max, s1 .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/zrot.S000066400000000000000000000160231313527062700166720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $5 #define INCX $6 #define Y $7 #define INCY $8 #define XX $9 #define YY $10 #define C $f17 #define S $f18 #define I $2 #define TEMP $3 #define a1 $f4 #define a2 $f5 #define a3 $f6 #define a4 $f7 #define b1 $f8 #define b2 $f9 #define b3 $f10 #define b4 $f11 #define t1 $f0 #define t2 $f1 #define t3 $f2 #define t4 $f3 PROLOGUE dsll INCX, INCX, ZBASE_SHIFT li TEMP, 2 * SIZE blez N, .L999 dsll INCY, INCY, ZBASE_SHIFT bne INCX, TEMP, .L20 dsra I, N, 1 bne INCY, TEMP, .L20 NOP blez I, .L15 daddiu I, I, -1 LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) LD a2, 1 * SIZE(X) LD b2, 1 * SIZE(Y) LD a3, 2 * SIZE(X) LD b3, 2 * SIZE(Y) MUL t1, S, b1 LD a4, 3 * SIZE(X) MUL t2, C, b1 LD b4, 3 * SIZE(Y) MUL t3, S, b2 blez I, .L13 MUL t4, C, b2 .align 3 .L12: MADD t1, t1, C, a1 LD b1, 4 * SIZE(Y) NMSUB t2, t2, S, a1 LD a1, 4 * SIZE(X) MADD t3, t3, C, a2 LD b2, 5 * SIZE(Y) NMSUB t4, t4, S, a2 LD a2, 5 * SIZE(X) ST t1, 0 * SIZE(X) MUL t1, S, b3 ST t2, 0 * SIZE(Y) MUL t2, C, b3 ST t3, 1 * SIZE(X) MUL t3, S, b4 ST t4, 1 * SIZE(Y) MUL t4, C, b4 MADD t1, t1, C, a3 LD b3, 6 * SIZE(Y) NMSUB t2, t2, S, a3 LD a3, 6 * SIZE(X) MADD t3, t3, C, a4 LD b4, 7 * SIZE(Y) NMSUB t4, t4, S, a4 LD a4, 7 * SIZE(X) ST t1, 2 * SIZE(X) MUL t1, S, b1 ST t2, 2 * SIZE(Y) MUL t2, C, b1 ST t3, 3 * SIZE(X) MUL t3, S, b2 ST t4, 3 * SIZE(Y) MUL t4, C, b2 daddiu I, I, -1 daddiu X, X, 4 * SIZE bgtz I, .L12 daddiu Y, Y, 4 * SIZE .align 3 .L13: MADD t1, t1, C, a1 NMSUB t2, t2, S, a1 MADD t3, t3, C, a2 NMSUB t4, t4, S, a2 ST t1, 0 * SIZE(X) MUL t1, S, b3 ST t2, 0 * SIZE(Y) MUL t2, C, b3 ST t3, 1 * SIZE(X) MUL t3, S, b4 ST t4, 1 * SIZE(Y) MUL t4, C, b4 MADD t1, t1, C, a3 NMSUB t2, t2, S, a3 MADD t3, t3, C, a4 daddiu X, X, 4 * SIZE NMSUB t4, t4, S, a4 daddiu Y, Y, 4 * SIZE ST t1, -2 * SIZE(X) ST t2, -2 * SIZE(Y) ST t3, -1 * SIZE(X) ST t4, -1 * SIZE(Y) .align 3 .L15: andi I, N, 1 blez I, .L999 NOP LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) LD b1, 0 * SIZE(Y) LD b2, 1 * SIZE(Y) MUL t1, S, b1 MUL t2, C, b1 MUL t3, S, b2 MUL t4, C, b2 MADD t1, t1, C, a1 NMSUB t2, t2, S, a1 MADD t3, t3, C, a2 NMSUB t4, t4, S, a2 ST t1, 0 * SIZE(X) ST t2, 0 * SIZE(Y) ST t3, 1 * SIZE(X) j .L999 ST t4, 1 * SIZE(Y) .align 3 .L20: move XX, X move YY, Y blez I, .L25 daddiu I, I, -1 LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) LD a2, 1 * SIZE(X) dadd X, X, INCX LD b2, 1 * SIZE(Y) dadd Y, Y, INCY LD a3, 0 * SIZE(X) LD b3, 0 * SIZE(Y) LD a4, 1 * SIZE(X) dadd X, X, INCX MUL t1, S, b1 LD b4, 1 * SIZE(Y) MUL t2, C, b1 dadd Y, Y, INCY MUL t3, S, b2 blez I, .L23 MUL t4, C, b2 .align 3 .L22: MADD t1, t1, C, a1 LD b1, 0 * SIZE(Y) NMSUB t2, t2, S, a1 LD 
a1, 0 * SIZE(X) MADD t3, t3, C, a2 LD b2, 1 * SIZE(Y) dadd Y, Y, INCY NMSUB t4, t4, S, a2 LD a2, 1 * SIZE(X) dadd X, X, INCX ST t1, 0 * SIZE(XX) MUL t1, S, b3 ST t2, 0 * SIZE(YY) MUL t2, C, b3 ST t3, 1 * SIZE(XX) dadd XX, XX, INCX MUL t3, S, b4 ST t4, 1 * SIZE(YY) dadd YY, YY, INCY MUL t4, C, b4 MADD t1, t1, C, a3 LD b3, 0 * SIZE(Y) NMSUB t2, t2, S, a3 LD a3, 0 * SIZE(X) MADD t3, t3, C, a4 LD b4, 1 * SIZE(Y) dadd Y, Y, INCY NMSUB t4, t4, S, a4 LD a4, 1 * SIZE(X) dadd X, X, INCX ST t1, 0 * SIZE(XX) MUL t1, S, b1 ST t2, 0 * SIZE(YY) MUL t2, C, b1 ST t3, 1 * SIZE(XX) dadd XX, XX, INCX MUL t3, S, b2 ST t4, 1 * SIZE(YY) MUL t4, C, b2 daddiu I, I, -1 bgtz I, .L22 dadd YY, YY, INCY .align 3 .L23: MADD t1, t1, C, a1 NMSUB t2, t2, S, a1 MADD t3, t3, C, a2 NMSUB t4, t4, S, a2 ST t1, 0 * SIZE(XX) MUL t1, S, b3 ST t2, 0 * SIZE(YY) MUL t2, C, b3 ST t3, 1 * SIZE(XX) dadd XX, XX, INCX MUL t3, S, b4 ST t4, 1 * SIZE(YY) dadd YY, YY, INCY MUL t4, C, b4 MADD t1, t1, C, a3 NMSUB t2, t2, S, a3 MADD t3, t3, C, a4 NMSUB t4, t4, S, a4 ST t1, 0 * SIZE(XX) ST t2, 0 * SIZE(YY) ST t3, 1 * SIZE(XX) dadd XX, XX, INCX ST t4, 1 * SIZE(YY) dadd YY, YY, INCY .align 3 .L25: andi I, N, 1 blez I, .L999 NOP .align 3 .L26: LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) LD b1, 0 * SIZE(Y) LD b2, 1 * SIZE(Y) MUL t1, S, b1 MUL t2, C, b1 MUL t3, S, b2 MUL t4, C, b2 MADD t1, t1, C, a1 NMSUB t2, t2, S, a1 MADD t3, t3, C, a2 NMSUB t4, t4, S, a2 ST t1, 0 * SIZE(X) ST t2, 0 * SIZE(Y) ST t3, 1 * SIZE(X) ST t4, 1 * SIZE(Y) .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/zscal.S000066400000000000000000000203761313527062700170160ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
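The zscal kernel that follows scales a complex vector in place by alpha = ALPHA_R + i*ALPHA_I, with a separate fast path that simply stores zeros when alpha is exactly zero (the CMPEQ/MTC $0 sequence at the top of the routine). The C routine here is only a reference sketch of that per-element arithmetic, not part of the OpenBLAS sources; the name zscal_ref and its signature are illustrative, and a positive stride given in complex elements is assumed.

#include <stddef.h>

/* Reference semantics of the zscal kernel: x[k] := alpha * x[k] for a
   complex vector stored as interleaved (real, imag) doubles. */
static void zscal_ref(size_t n, double ar, double ai, double *x, size_t incx)
{
    for (size_t k = 0; k < n; k++) {
        double xr = x[2 * k * incx];      /* real part      */
        double xi = x[2 * k * incx + 1];  /* imaginary part */
        x[2 * k * incx]     = ar * xr - ai * xi;  /* matches MUL/NMSUB */
        x[2 * k * incx + 1] = ai * xr + ar * xi;  /* matches MUL/MADD  */
    }
}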
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $9 #define INCX $10 #define I $2 #define TEMP $3 #define XX $5 #define ALPHA_R $f15 #define ALPHA_I $f16 #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define a5 $f4 #define a6 $f5 #define a7 $f6 #define a8 $f7 #define t1 $f8 #define t2 $f9 #define t3 $f10 #define t4 $f11 PROLOGUE li TEMP, 2 * SIZE MTC $0, a1 blez N, .L999 dsll INCX, INCX, ZBASE_SHIFT CMPEQ $fcc0, ALPHA_R, a1 CMPEQ $fcc1, ALPHA_I, a1 bc1f $fcc0, .L50 NOP bc1f $fcc1, .L50 NOP bne INCX, TEMP, .L20 dsra I, N, 2 blez I, .L15 NOP .align 3 .L12: ST a1, 0 * SIZE(X) ST a1, 1 * SIZE(X) ST a1, 2 * SIZE(X) ST a1, 3 * SIZE(X) ST a1, 4 * SIZE(X) ST a1, 5 * SIZE(X) ST a1, 6 * SIZE(X) ST a1, 7 * SIZE(X) addiu I, I, -1 bgtz I, .L12 daddiu X, X, 8 * SIZE .align 3 .L15: andi I, N, 3 blez I, .L999 NOP .align 3 .L16: ST a1, 0 * SIZE(X) ST a1, 1 * SIZE(X) daddiu I, I, -1 bgtz I, .L16 daddiu X, X, 2 * SIZE j $31 NOP .align 3 .L20: dsra I, N, 2 blez I, .L25 NOP .align 3 .L22: ST a1, 0 * SIZE(X) ST a1, 1 * SIZE(X) daddu X, X, INCX ST a1, 0 * SIZE(X) ST a1, 1 * SIZE(X) daddu X, X, INCX ST a1, 0 * SIZE(X) ST a1, 1 * SIZE(X) daddu X, X, INCX ST a1, 0 * SIZE(X) ST a1, 1 * SIZE(X) daddiu I, I, -1 bgtz I, .L22 daddu X, X, INCX .align 3 .L25: andi I, N, 3 blez I, .L999 NOP .align 3 .L26: ST a1, 0 * SIZE(X) daddiu I, I, -1 ST a1, 1 * SIZE(X) bgtz I, .L26 daddu X, X, INCX j $31 NOP .align 3 .L50: bne INCX, TEMP, .L60 dsra I, N, 2 blez I, .L55 daddiu I, I, -1 LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) LD a3, 2 * SIZE(X) LD a4, 3 * SIZE(X) LD a5, 4 * SIZE(X) LD a6, 5 * SIZE(X) MUL t1, ALPHA_R, a1 LD a7, 6 * SIZE(X) MUL t2, ALPHA_I, a1 LD a8, 7 * SIZE(X) MUL t3, ALPHA_R, a3 blez I, .L53 MUL t4, ALPHA_I, a3 .align 3 .L52: NMSUB t1, t1, ALPHA_I, a2 LD a1, 8 * SIZE(X) MADD t2, t2, ALPHA_R, a2 LD a2, 9 * SIZE(X) NMSUB t3, t3, ALPHA_I, a4 LD a3, 10 * SIZE(X) MADD t4, t4, ALPHA_R, a4 LD a4, 11 * SIZE(X) ST t1, 0 * SIZE(X) MUL t1, ALPHA_R, a5 ST t2, 1 * SIZE(X) MUL t2, ALPHA_I, a5 ST t3, 2 * SIZE(X) MUL t3, ALPHA_R, a7 ST t4, 3 * SIZE(X) MUL t4, ALPHA_I, a7 NMSUB t1, t1, ALPHA_I, a6 LD a5, 12 * SIZE(X) MADD t2, t2, ALPHA_R, a6 LD a6, 13 * SIZE(X) NMSUB t3, t3, ALPHA_I, a8 LD a7, 14 * SIZE(X) MADD t4, t4, ALPHA_R, a8 LD a8, 15 * SIZE(X) ST t1, 4 * SIZE(X) MUL t1, ALPHA_R, a1 ST t2, 5 * SIZE(X) MUL t2, ALPHA_I, a1 ST t3, 6 * SIZE(X) MUL t3, ALPHA_R, a3 ST t4, 7 * SIZE(X) MUL t4, ALPHA_I, a3 daddiu I, I, -1 bgtz I, .L52 daddiu X, X, 8 * SIZE .align 3 .L53: NMSUB t1, t1, ALPHA_I, a2 MADD t2, t2, ALPHA_R, a2 NMSUB t3, t3, ALPHA_I, a4 MADD t4, t4, ALPHA_R, a4 ST t1, 0 * SIZE(X) MUL t1, ALPHA_R, a5 ST t2, 1 * SIZE(X) MUL t2, ALPHA_I, a5 ST t3, 2 * SIZE(X) MUL t3, ALPHA_R, a7 ST t4, 3 * SIZE(X) MUL t4, ALPHA_I, a7 NMSUB t1, t1, ALPHA_I, a6 MADD t2, t2, ALPHA_R, a6 NMSUB t3, t3, ALPHA_I, a8 MADD t4, t4, ALPHA_R, a8 ST t1, 4 * SIZE(X) ST t2, 5 * SIZE(X) ST t3, 6 * SIZE(X) ST t4, 7 * SIZE(X) daddiu X, X, 8 * SIZE .align 3 .L55: andi I, N, 3 blez I, .L999 NOP .align 3 .L56: LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) MUL t1, ALPHA_R, a1 MUL t2, ALPHA_I, a1 NMSUB t1, t1, ALPHA_I, a2 MADD t2, t2, ALPHA_R, a2 daddiu X, X, 2 * SIZE daddiu I, I, -1 ST t1, -2 * SIZE(X) bgtz I, .L56 ST t2, -1 * SIZE(X) j $31 NOP .align 3 .L60: dsra I, N, 2 move XX, X blez I, .L65 daddiu I, I, -1 LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) daddu X, X, INCX LD a3, 0 * SIZE(X) LD a4, 1 * SIZE(X) daddu X, X, INCX LD a5, 0 * SIZE(X) LD a6, 1 * SIZE(X) daddu X, X, INCX MUL 
t1, ALPHA_R, a1 LD a7, 0 * SIZE(X) MUL t2, ALPHA_I, a1 LD a8, 1 * SIZE(X) MUL t3, ALPHA_R, a3 daddu X, X, INCX blez I, .L63 MUL t4, ALPHA_I, a3 .align 3 .L62: NMSUB t1, t1, ALPHA_I, a2 LD a1, 0 * SIZE(X) MADD t2, t2, ALPHA_R, a2 LD a2, 1 * SIZE(X) daddu X, X, INCX NMSUB t3, t3, ALPHA_I, a4 LD a3, 0 * SIZE(X) MADD t4, t4, ALPHA_R, a4 LD a4, 1 * SIZE(X) daddu X, X, INCX ST t1, 0 * SIZE(XX) MUL t1, ALPHA_R, a5 ST t2, 1 * SIZE(XX) MUL t2, ALPHA_I, a5 daddu XX, XX, INCX ST t3, 0 * SIZE(XX) MUL t3, ALPHA_R, a7 ST t4, 1 * SIZE(XX) MUL t4, ALPHA_I, a7 daddu XX, XX, INCX NMSUB t1, t1, ALPHA_I, a6 LD a5, 0 * SIZE(X) MADD t2, t2, ALPHA_R, a6 LD a6, 1 * SIZE(X) daddu X, X, INCX NMSUB t3, t3, ALPHA_I, a8 LD a7, 0 * SIZE(X) MADD t4, t4, ALPHA_R, a8 LD a8, 1 * SIZE(X) daddu X, X, INCX ST t1, 0 * SIZE(XX) MUL t1, ALPHA_R, a1 ST t2, 1 * SIZE(XX) MUL t2, ALPHA_I, a1 daddu XX, XX, INCX ST t3, 0 * SIZE(XX) MUL t3, ALPHA_R, a3 ST t4, 1 * SIZE(XX) MUL t4, ALPHA_I, a3 daddiu I, I, -1 bgtz I, .L62 daddu XX, XX, INCX .align 3 .L63: NMSUB t1, t1, ALPHA_I, a2 MADD t2, t2, ALPHA_R, a2 NMSUB t3, t3, ALPHA_I, a4 MADD t4, t4, ALPHA_R, a4 ST t1, 0 * SIZE(XX) MUL t1, ALPHA_R, a5 ST t2, 1 * SIZE(XX) MUL t2, ALPHA_I, a5 daddu XX, XX, INCX ST t3, 0 * SIZE(XX) MUL t3, ALPHA_R, a7 ST t4, 1 * SIZE(XX) MUL t4, ALPHA_I, a7 daddu XX, XX, INCX NMSUB t1, t1, ALPHA_I, a6 MADD t2, t2, ALPHA_R, a6 NMSUB t3, t3, ALPHA_I, a8 MADD t4, t4, ALPHA_R, a8 ST t1, 0 * SIZE(XX) ST t2, 1 * SIZE(XX) daddu XX, XX, INCX ST t3, 0 * SIZE(XX) ST t4, 1 * SIZE(XX) daddu XX, XX, INCX .align 3 .L65: andi I, N, 3 blez I, .L999 NOP .align 3 .L66: LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) MUL t1, ALPHA_R, a1 MUL t2, ALPHA_I, a1 NMSUB t1, t1, ALPHA_I, a2 MADD t2, t2, ALPHA_R, a2 daddiu I, I, -1 ST t1, 0 * SIZE(X) ST t2, 1 * SIZE(X) bgtz I, .L66 daddu X, X, INCX .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/zswap.S000066400000000000000000000170011313527062700170350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
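The zswap kernel that follows exchanges two complex vectors, with an unrolled unit-stride path and a strided path that writes back through the XX/YY shadow pointers. As a plain statement of the intended effect (the software-pipelined loads and stores make it easy to mis-read), here is a short reference sketch; zswap_ref is an illustrative name and positive strides in complex elements are assumed.

/* Reference semantics of the zswap kernel: exchange x[k] and y[k] for
   n complex elements stored as interleaved (real, imag) doubles. */
static void zswap_ref(long n, double *x, long incx, double *y, long incy)
{
    for (long k = 0; k < n; k++) {
        double *px = x + 2 * k * incx;
        double *py = y + 2 * k * incy;
        double tr = px[0], ti = px[1];   /* save x[k]        */
        px[0] = py[0];  px[1] = py[1];   /* x[k] = y[k]      */
        py[0] = tr;     py[1] = ti;      /* y[k] = old x[k]  */
    }
}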
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N $4 #define X $9 #define INCX $10 #define Y $11 #define INCY $8 #define I $2 #define TEMP $3 #define XX $5 #define YY $6 #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define a5 $f4 #define a6 $f5 #define a7 $f6 #define a8 $f7 #define b1 $f8 #define b2 $f9 #define b3 $f10 #define b4 $f11 #define b5 $f12 #define b6 $f13 #define b7 $f14 #define b8 $f15 PROLOGUE LDARG INCY, 0($sp) li TEMP, 2 * SIZE blez N, .L999 dsll INCX, INCX, ZBASE_SHIFT bne INCX, TEMP, .L20 dsll INCY, INCY, ZBASE_SHIFT bne INCY, TEMP, .L20 dsra I, N, 2 blez I, .L15 daddiu I, I, -1 LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) LD a2, 1 * SIZE(X) LD b2, 1 * SIZE(Y) LD a3, 2 * SIZE(X) LD b3, 2 * SIZE(Y) LD a4, 3 * SIZE(X) LD b4, 3 * SIZE(Y) LD a5, 4 * SIZE(X) LD b5, 4 * SIZE(Y) LD a6, 5 * SIZE(X) LD b6, 5 * SIZE(Y) LD a7, 6 * SIZE(X) LD b7, 6 * SIZE(Y) LD a8, 7 * SIZE(X) LD b8, 7 * SIZE(Y) blez I, .L13 NOP .align 3 .L12: ST a1, 0 * SIZE(Y) LD a1, 8 * SIZE(X) ST b1, 0 * SIZE(X) LD b1, 8 * SIZE(Y) ST a2, 1 * SIZE(Y) LD a2, 9 * SIZE(X) ST b2, 1 * SIZE(X) LD b2, 9 * SIZE(Y) ST a3, 2 * SIZE(Y) LD a3, 10 * SIZE(X) ST b3, 2 * SIZE(X) LD b3, 10 * SIZE(Y) ST a4, 3 * SIZE(Y) LD a4, 11 * SIZE(X) ST b4, 3 * SIZE(X) LD b4, 11 * SIZE(Y) ST a5, 4 * SIZE(Y) LD a5, 12 * SIZE(X) ST b5, 4 * SIZE(X) LD b5, 12 * SIZE(Y) ST a6, 5 * SIZE(Y) LD a6, 13 * SIZE(X) ST b6, 5 * SIZE(X) LD b6, 13 * SIZE(Y) ST a7, 6 * SIZE(Y) LD a7, 14 * SIZE(X) ST b7, 6 * SIZE(X) LD b7, 14 * SIZE(Y) ST a8, 7 * SIZE(Y) LD a8, 15 * SIZE(X) ST b8, 7 * SIZE(X) LD b8, 15 * SIZE(Y) daddiu I, I, -1 daddiu X, X, 8 * SIZE bgtz I, .L12 daddiu Y, Y, 8 * SIZE .align 3 .L13: ST a1, 0 * SIZE(Y) ST b1, 0 * SIZE(X) ST a2, 1 * SIZE(Y) ST b2, 1 * SIZE(X) ST a3, 2 * SIZE(Y) ST b3, 2 * SIZE(X) ST a4, 3 * SIZE(Y) ST b4, 3 * SIZE(X) ST a5, 4 * SIZE(Y) ST b5, 4 * SIZE(X) ST a6, 5 * SIZE(Y) ST b6, 5 * SIZE(X) ST a7, 6 * SIZE(Y) ST b7, 6 * SIZE(X) ST a8, 7 * SIZE(Y) ST b8, 7 * SIZE(X) daddiu X, X, 8 * SIZE daddiu Y, Y, 8 * SIZE .align 3 .L15: andi I, N, 3 blez I, .L999 NOP .align 3 .L16: LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) LD a2, 1 * SIZE(X) LD b2, 1 * SIZE(Y) daddiu X, X, 2 * SIZE daddiu I, I, -1 daddiu Y, Y, 2 * SIZE ST b1, -2 * SIZE(X) ST b2, -1 * SIZE(X) ST a1, -2 * SIZE(Y) bgtz I, .L16 ST a2, -1 * SIZE(Y) j .L999 NOP .align 3 .L20: dsra I, N, 2 move XX, X move YY, Y blez I, .L25 daddiu I, I, -1 LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) LD a2, 1 * SIZE(X) LD b2, 1 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY LD a3, 0 * SIZE(X) LD b3, 0 * SIZE(Y) LD a4, 1 * SIZE(X) LD b4, 1 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY LD a5, 0 * SIZE(X) LD b5, 0 * SIZE(Y) LD a6, 1 * SIZE(X) LD b6, 1 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY LD a7, 0 * SIZE(X) LD b7, 0 * SIZE(Y) LD a8, 1 * SIZE(X) LD b8, 1 * SIZE(Y) daddu X, X, INCX daddu Y, Y, INCY blez I, .L23 NOP .align 3 .L22: ST a1, 0 * SIZE(YY) LD a1, 0 * SIZE(X) ST b1, 0 * SIZE(XX) LD b1, 0 * SIZE(Y) ST a2, 1 * SIZE(YY) daddu YY, YY, INCY LD a2, 1 * SIZE(X) daddu X, X, INCX ST b2, 1 * SIZE(XX) daddu XX, XX, INCX LD b2, 1 * SIZE(Y) daddu Y, Y, INCY ST a3, 0 * SIZE(YY) LD a3, 0 * SIZE(X) ST b3, 0 * SIZE(XX) LD b3, 0 * SIZE(Y) ST a4, 1 * SIZE(YY) daddu YY, YY, INCY LD a4, 1 * 
SIZE(X) daddu X, X, INCX ST b4, 1 * SIZE(XX) daddu XX, XX, INCX LD b4, 1 * SIZE(Y) daddu Y, Y, INCY ST a5, 0 * SIZE(YY) LD a5, 0 * SIZE(X) ST b5, 0 * SIZE(XX) LD b5, 0 * SIZE(Y) ST a6, 1 * SIZE(YY) daddu YY, YY, INCY LD a6, 1 * SIZE(X) daddu X, X, INCX ST b6, 1 * SIZE(XX) daddu XX, XX, INCX LD b6, 1 * SIZE(Y) daddu Y, Y, INCY ST a7, 0 * SIZE(YY) LD a7, 0 * SIZE(X) ST b7, 0 * SIZE(XX) LD b7, 0 * SIZE(Y) ST a8, 1 * SIZE(YY) daddu YY, YY, INCY LD a8, 1 * SIZE(X) daddu X, X, INCX ST b8, 1 * SIZE(XX) daddu XX, XX, INCX LD b8, 1 * SIZE(Y) daddiu I, I, -1 bgtz I, .L22 daddu Y, Y, INCY .align 3 .L23: ST a1, 0 * SIZE(YY) ST b1, 0 * SIZE(XX) ST a2, 1 * SIZE(YY) ST b2, 1 * SIZE(XX) daddu YY, YY, INCY daddu XX, XX, INCX ST a3, 0 * SIZE(YY) ST b3, 0 * SIZE(XX) ST a4, 1 * SIZE(YY) ST b4, 1 * SIZE(XX) daddu YY, YY, INCY daddu XX, XX, INCX ST a5, 0 * SIZE(YY) ST b5, 0 * SIZE(XX) ST a6, 1 * SIZE(YY) ST b6, 1 * SIZE(XX) daddu YY, YY, INCY daddu XX, XX, INCX ST a7, 0 * SIZE(YY) ST b7, 0 * SIZE(XX) ST a8, 1 * SIZE(YY) ST b8, 1 * SIZE(XX) daddu YY, YY, INCY daddu XX, XX, INCX .align 3 .L25: andi I, N, 3 blez I, .L999 NOP .align 3 .L26: LD a1, 0 * SIZE(X) LD a2, 1 * SIZE(X) LD b1, 0 * SIZE(Y) LD b2, 1 * SIZE(Y) daddiu I, I, -1 ST a1, 0 * SIZE(Y) ST a2, 1 * SIZE(Y) daddu Y, Y, INCY ST b1, 0 * SIZE(X) ST b2, 1 * SIZE(X) bgtz I, .L26 daddu X, X, INCX .align 3 .L999: j $31 NOP EPILOGUE OpenBLAS-0.2.20/kernel/mips64/zsymv_L.S000066400000000000000000000332551313527062700173450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
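The zsymv_L kernel that follows handles the complex symmetric matrix-vector update for the lower-triangle storage case; the same source also builds the Hermitian (zhemv) variant when HEMV is defined, which swaps the ADD1/ADD2 macros. The kernel receives alpha split into ALPHA_R/ALPHA_I and has no beta argument, so the sketch below shows only the accumulation y += alpha*A*x; where beta is applied is left to the calling driver, which is an assumption of this sketch. The unit-stride, single-complex-alpha signature and the name zsymv_lower_ref are illustrative.

#include <complex.h>

/* Reference accumulation for a complex *symmetric* matrix A of order m,
   lower triangle stored column-major with leading dimension lda. */
static void zsymv_lower_ref(long m, double complex alpha,
                            const double complex *a, long lda,
                            const double complex *x, double complex *y)
{
    for (long j = 0; j < m; j++) {
        /* diagonal term */
        y[j] += alpha * a[j + j * lda] * x[j];
        for (long i = j + 1; i < m; i++) {
            double complex aij = a[i + j * lda]; /* stored lower element   */
            y[i] += alpha * aij * x[j];          /* column update          */
            y[j] += alpha * aij * x[i];          /* symmetric counterpart  */
        }
    }
}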
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M $4 #define A $7 #define LDA $8 #define X $9 #define INCX $10 #define Y $11 #define INCY $5 #define BUFFER $6 #define XX $12 #define YY $13 #define I $14 #define IS $15 #define AO1 $16 #define AO2 $17 #define Y1 $18 #define TEMP $19 #define II INCX #define ALPHA_R $f13 #define ALPHA_I $f14 #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define a5 $f4 #define a6 $f5 #define a7 $f6 #define a8 $f7 #define alpha1 $f8 #define alpha2 $f9 #define alpha3 $f10 #define alpha4 $f11 #define x1 $f12 #define x2 $f15 #define x3 $f16 #define x4 $f17 #define xsum1 $f18 #define xsum2 $f19 #define xsum3 $f20 #define xsum4 $f21 #define ysum1 $f22 #define ysum2 $f23 #define ysum3 $f24 #define ysum4 $f25 #ifndef HEMV #define ADD1 NMSUB #define ADD2 MADD #else #define ADD1 MADD #define ADD2 NMSUB #endif PROLOGUE LDARG INCY, 0($sp) LDARG BUFFER, 8($sp) #ifdef __64BIT__ daddiu $sp, $sp, -64 #else daddiu $sp, $sp, -80 #endif SDARG $16, 0($sp) dsll LDA, LDA, ZBASE_SHIFT SDARG $17, 8($sp) dsll INCX, INCX, ZBASE_SHIFT SDARG $18, 16($sp) dsll INCY, INCY, ZBASE_SHIFT SDARG $19, 24($sp) nop sdc1 $f24, 32($sp) sdc1 $f25, 40($sp) #ifndef __64BIT__ sdc1 $f20, 48($sp) sdc1 $f21, 56($sp) sdc1 $f22, 64($sp) sdc1 $f23, 72($sp) #endif blez M, .L999 li IS, 2 * SIZE beq IS, INCX, .L05 move Y1, Y dsra I, M, 2 move XX, X blez I, .L02 move X, BUFFER .align 3 .L01: LD a1, 0 * SIZE(XX) LD a2, 1 * SIZE(XX) daddu XX, XX, INCX LD a3, 0 * SIZE(XX) LD a4, 1 * SIZE(XX) daddu XX, XX, INCX LD a5, 0 * SIZE(XX) LD a6, 1 * SIZE(XX) daddu XX, XX, INCX LD a7, 0 * SIZE(XX) LD a8, 1 * SIZE(XX) daddu XX, XX, INCX ST a1, 0 * SIZE(BUFFER) ST a2, 1 * SIZE(BUFFER) ST a3, 2 * SIZE(BUFFER) ST a4, 3 * SIZE(BUFFER) ST a5, 4 * SIZE(BUFFER) ST a6, 5 * SIZE(BUFFER) ST a7, 6 * SIZE(BUFFER) ST a8, 7 * SIZE(BUFFER) daddiu I, I, -1 bgtz I, .L01 daddiu BUFFER, BUFFER, 8 * SIZE .align 3 .L02: andi I, M, 3 blez I, .L05 NOP .align 3 .L03: LD a1, 0 * SIZE(XX) LD a2, 1 * SIZE(XX) daddu XX, XX, INCX ST a1, 0 * SIZE(BUFFER) ST a2, 1 * SIZE(BUFFER) daddiu I, I, -1 bgtz I, .L03 daddiu BUFFER, BUFFER, 2 * SIZE .align 3 .L05: beq IS, INCY, .L10 daddiu BUFFER, BUFFER, 255 li TEMP, -256 and BUFFER, BUFFER, TEMP dsra I, M, 2 move Y1, BUFFER blez I, .L07 move YY, Y .align 3 .L06: LD a1, 0 * SIZE(YY) LD a2, 1 * SIZE(YY) daddu YY, YY, INCY LD a3, 0 * SIZE(YY) LD a4, 1 * SIZE(YY) daddu YY, YY, INCY LD a5, 0 * SIZE(YY) LD a6, 1 * SIZE(YY) daddu YY, YY, INCY LD a7, 0 * SIZE(YY) LD a8, 1 * SIZE(YY) daddu YY, YY, INCY ST a1, 0 * SIZE(BUFFER) ST a2, 1 * SIZE(BUFFER) ST a3, 2 * SIZE(BUFFER) ST a4, 3 * SIZE(BUFFER) ST a5, 4 * SIZE(BUFFER) ST a6, 5 * SIZE(BUFFER) ST a7, 6 * SIZE(BUFFER) ST a8, 7 * SIZE(BUFFER) daddiu I, I, -1 bgtz I, .L06 daddiu BUFFER, BUFFER, 8 * SIZE .align 3 .L07: andi I, M, 3 blez I, .L10 NOP .align 3 .L08: LD a1, 0 * SIZE(YY) LD a2, 1 * SIZE(YY) daddu YY, YY, INCY ST a1, 0 * SIZE(BUFFER) ST a2, 1 * SIZE(BUFFER) daddiu I, I, -1 bgtz I, .L08 daddiu BUFFER, BUFFER, 2 * SIZE .align 3 .L10: slti TEMP, M, 2 nop bgtz TEMP, .L20 li IS, 0 .align 3 .L11: dsll TEMP, IS, ZBASE_SHIFT nop daddu XX, X, TEMP daddu YY, Y1, TEMP LD alpha1, 0 * SIZE(XX) LD alpha2, 1 * SIZE(XX) LD alpha3, 2 * SIZE(XX) LD alpha4, 3 * SIZE(XX) move AO1, A daddu AO2, A, LDA LD a1, 0 * SIZE(AO1) LD a2, 1 * SIZE(AO1) LD a3, 2 * SIZE(AO1) LD a4, 3 * SIZE(AO1) LD a5, 0 * SIZE(AO2) LD a6, 1 * SIZE(AO2) LD a7, 2 * SIZE(AO2) LD a8, 3 * SIZE(AO2) MUL xsum1, alpha1, a1 daddiu XX, XX, 4 
* SIZE MUL xsum2, alpha2, a1 daddiu YY, YY, 4 * SIZE MUL xsum3, alpha1, a3 daddu A, AO2, LDA MUL xsum4, alpha2, a3 daddiu A, A, 4 * SIZE #ifndef HEMV NMSUB xsum1, xsum1, alpha2, a2 MADD xsum2, xsum2, alpha1, a2 #endif NMSUB xsum3, xsum3, alpha2, a4 daddiu AO1, AO1, 4 * SIZE MADD xsum4, xsum4, alpha1, a4 daddiu AO2, AO2, 4 * SIZE MADD xsum1, xsum1, alpha3, a3 MADD xsum2, xsum2, alpha4, a3 MADD xsum3, xsum3, alpha3, a7 MADD xsum4, xsum4, alpha4, a7 ADD1 xsum1, xsum1, alpha4, a4 ADD2 xsum2, xsum2, alpha3, a4 #ifndef HEMV ADD1 xsum3, xsum3, alpha4, a8 ADD2 xsum4, xsum4, alpha3, a8 #endif MOV x1, alpha1 dsubu II, M, IS MOV x2, alpha2 daddiu II, II, - 2 MOV x3, alpha3 dsra I, II, 1 MOV x4, alpha4 nop MUL alpha1, ALPHA_R, alpha1 MUL alpha2, ALPHA_R, alpha2 MUL alpha3, ALPHA_R, alpha3 MUL alpha4, ALPHA_R, alpha4 NMSUB alpha1, alpha1, ALPHA_I, x2 MADD alpha2, alpha2, ALPHA_I, x1 NMSUB alpha3, alpha3, ALPHA_I, x4 MADD alpha4, alpha4, ALPHA_I, x3 blez I, .L15 daddiu I, I, -1 LD x1, 0 * SIZE(XX) LD x2, 1 * SIZE(XX) LD x4, 3 * SIZE(XX) LD a1, 0 * SIZE(AO1) LD a2, 1 * SIZE(AO1) LD a3, 2 * SIZE(AO1) LD a4, 3 * SIZE(AO1) LD a5, 0 * SIZE(AO2) LD a6, 1 * SIZE(AO2) LD a7, 2 * SIZE(AO2) LD a8, 3 * SIZE(AO2) LD ysum1, 0 * SIZE(YY) blez I, .L13 LD ysum2, 1 * SIZE(YY) .align 3 .L12: MADD ysum1, ysum1, alpha1, a1 LD ysum3, 2 * SIZE(YY) MADD ysum2, ysum2, alpha2, a1 LD ysum4, 3 * SIZE(YY) MADD xsum1, xsum1, x1, a1 LD a8, 3 * SIZE(AO2) MADD xsum2, xsum2, x2, a1 LD a1, 4 * SIZE(AO1) MADD ysum3, ysum3, alpha1, a3 LD x3, 2 * SIZE(XX) MADD ysum4, ysum4, alpha2, a3 daddiu I, I, -1 MADD xsum3, xsum3, x1, a5 MADD xsum4, xsum4, x2, a5 NMSUB ysum1, ysum1, alpha2, a2 MADD ysum2, ysum2, alpha1, a2 ADD1 xsum1, xsum1, x2, a2 daddiu AO2, AO2, 4 * SIZE ADD2 xsum2, xsum2, x1, a2 LD a2, 5 * SIZE(AO1) NMSUB ysum3, ysum3, alpha2, a4 MADD ysum4, ysum4, alpha1, a4 ADD1 xsum3, xsum3, x2, a6 LD x2, 5 * SIZE(XX) ADD2 xsum4, xsum4, x1, a6 LD x1, 4 * SIZE(XX) MADD ysum1, ysum1, alpha3, a5 MADD ysum2, ysum2, alpha4, a5 MADD xsum1, xsum1, x3, a3 LD a5, 0 * SIZE(AO2) MADD xsum2, xsum2, x4, a3 LD a3, 6 * SIZE(AO1) MADD ysum3, ysum3, alpha3, a7 MADD ysum4, ysum4, alpha4, a7 MADD xsum3, xsum3, x3, a7 daddiu AO1, AO1, 4 * SIZE MADD xsum4, xsum4, x4, a7 LD a7, 2 * SIZE(AO2) NMSUB ysum1, ysum1, alpha4, a6 daddiu XX, XX, 4 * SIZE MADD ysum2, ysum2, alpha3, a6 LD a6, 1 * SIZE(AO2) ADD1 xsum1, xsum1, x4, a4 daddiu YY, YY, 4 * SIZE ADD2 xsum2, xsum2, x3, a4 LD a4, 3 * SIZE(AO1) NMSUB ysum3, ysum3, alpha4, a8 ST ysum1,-4 * SIZE(YY) MADD ysum4, ysum4, alpha3, a8 ST ysum2,-3 * SIZE(YY) LD ysum1, 0 * SIZE(YY) LD ysum2, 1 * SIZE(YY) ADD1 xsum3, xsum3, x4, a8 LD x4, 3 * SIZE(XX) ADD2 xsum4, xsum4, x3, a8 ST ysum3,-2 * SIZE(YY) bgtz I, .L12 ST ysum4,-1 * SIZE(YY) .align 3 .L13: MADD ysum1, ysum1, alpha1, a1 LD ysum3, 2 * SIZE(YY) MADD ysum2, ysum2, alpha2, a1 LD ysum4, 3 * SIZE(YY) MADD xsum1, xsum1, x1, a1 LD a8, 3 * SIZE(AO2) MADD xsum2, xsum2, x2, a1 LD x3, 2 * SIZE(XX) MADD ysum3, ysum3, alpha1, a3 MADD ysum4, ysum4, alpha2, a3 MADD xsum3, xsum3, x1, a5 MADD xsum4, xsum4, x2, a5 NMSUB ysum1, ysum1, alpha2, a2 MADD ysum2, ysum2, alpha1, a2 ADD1 xsum1, xsum1, x2, a2 ADD2 xsum2, xsum2, x1, a2 NMSUB ysum3, ysum3, alpha2, a4 MADD ysum4, ysum4, alpha1, a4 ADD1 xsum3, xsum3, x2, a6 ADD2 xsum4, xsum4, x1, a6 MADD ysum1, ysum1, alpha3, a5 MADD ysum2, ysum2, alpha4, a5 MADD xsum1, xsum1, x3, a3 MADD xsum2, xsum2, x4, a3 MADD ysum3, ysum3, alpha3, a7 MADD ysum4, ysum4, alpha4, a7 MADD xsum3, xsum3, x3, a7 MADD xsum4, xsum4, x4, a7 NMSUB ysum1, ysum1, alpha4, a6 MADD 
ysum2, ysum2, alpha3, a6 ADD1 xsum1, xsum1, x4, a4 ADD2 xsum2, xsum2, x3, a4 NMSUB ysum3, ysum3, alpha4, a8 daddiu XX, XX, 4 * SIZE MADD ysum4, ysum4, alpha3, a8 daddiu YY, YY, 4 * SIZE ADD1 xsum3, xsum3, x4, a8 daddiu AO1, AO1, 4 * SIZE ADD2 xsum4, xsum4, x3, a8 daddiu AO2, AO2, 4 * SIZE ST ysum1, -4 * SIZE(YY) ST ysum2, -3 * SIZE(YY) ST ysum3, -2 * SIZE(YY) ST ysum4, -1 * SIZE(YY) .align 3 .L15: andi I, M, 1 NOP blez I, .L16 NOP LD x1, 0 * SIZE(XX) LD x2, 1 * SIZE(XX) LD a1, 0 * SIZE(AO1) LD a2, 1 * SIZE(AO1) LD a3, 2 * SIZE(AO1) LD a4, 3 * SIZE(AO1) LD a5, 0 * SIZE(AO2) LD a6, 1 * SIZE(AO2) LD a7, 2 * SIZE(AO2) LD a8, 3 * SIZE(AO2) LD ysum1, 0 * SIZE(YY) LD ysum2, 1 * SIZE(YY) MADD ysum1, ysum1, alpha1, a1 MADD ysum2, ysum2, alpha2, a1 MADD xsum1, xsum1, x1, a1 MADD xsum2, xsum2, x2, a1 MADD xsum3, xsum3, x1, a5 MADD xsum4, xsum4, x2, a5 NMSUB ysum1, ysum1, alpha2, a2 MADD ysum2, ysum2, alpha1, a2 ADD1 xsum1, xsum1, x2, a2 ADD2 xsum2, xsum2, x1, a2 ADD1 xsum3, xsum3, x2, a6 ADD2 xsum4, xsum4, x1, a6 MADD ysum1, ysum1, alpha3, a5 MADD ysum2, ysum2, alpha4, a5 NMSUB ysum1, ysum1, alpha4, a6 MADD ysum2, ysum2, alpha3, a6 daddiu XX, XX, 2 * SIZE daddiu YY, YY, 2 * SIZE daddiu AO1, AO1, 2 * SIZE daddiu AO2, AO2, 2 * SIZE ST ysum1, -2 * SIZE(YY) ST ysum2, -1 * SIZE(YY) .align 3 .L16: dsll TEMP, IS, ZBASE_SHIFT daddu TEMP, Y1, TEMP LD ysum1, 0 * SIZE(TEMP) LD ysum2, 1 * SIZE(TEMP) LD ysum3, 2 * SIZE(TEMP) LD ysum4, 3 * SIZE(TEMP) MADD ysum1, ysum1, ALPHA_R, xsum1 MADD ysum2, ysum2, ALPHA_I, xsum1 MADD ysum3, ysum3, ALPHA_R, xsum3 MADD ysum4, ysum4, ALPHA_I, xsum3 NMSUB ysum1, ysum1, ALPHA_I, xsum2 MADD ysum2, ysum2, ALPHA_R, xsum2 NMSUB ysum3, ysum3, ALPHA_I, xsum4 MADD ysum4, ysum4, ALPHA_R, xsum4 ST ysum1, 0 * SIZE(TEMP) ST ysum2, 1 * SIZE(TEMP) ST ysum3, 2 * SIZE(TEMP) ST ysum4, 3 * SIZE(TEMP) daddiu TEMP, IS, 4 slt TEMP, M, TEMP beqz TEMP, .L11 daddiu IS, IS, 2 .align 3 .L20: andi TEMP, M, 1 nop blez TEMP, .L900 nop dsll TEMP, IS, ZBASE_SHIFT nop daddu XX, X, TEMP daddu YY, Y1, TEMP LD alpha1, 0 * SIZE(XX) LD alpha2, 1 * SIZE(XX) LD a1, 0 * SIZE(A) LD a2, 1 * SIZE(A) MUL xsum1, alpha1, a1 LD ysum1, 0 * SIZE(YY) MUL xsum2, alpha2, a1 LD ysum2, 1 * SIZE(YY) #ifndef HEMV NMSUB xsum1, xsum1, alpha2, a2 MADD xsum2, xsum2, alpha1, a2 #endif MOV x1, alpha1 MOV x2, alpha2 MUL alpha1, ALPHA_R, alpha1 MUL alpha2, ALPHA_R, alpha2 NMSUB alpha1, alpha1, ALPHA_I, x2 MADD alpha2, alpha2, ALPHA_I, x1 MADD ysum1, ysum1, ALPHA_R, xsum1 MADD ysum2, ysum2, ALPHA_I, xsum1 NMSUB ysum1, ysum1, ALPHA_I, xsum2 MADD ysum2, ysum2, ALPHA_R, xsum2 ST ysum1, 0 * SIZE(YY) ST ysum2, 1 * SIZE(YY) .align 3 .L900: li IS, 2 * SIZE NOP beq INCY, IS, .L999 dsra I, M, 2 blez I, .L905 NOP .align 3 .L902: LD a1, 0 * SIZE(Y1) LD a2, 1 * SIZE(Y1) LD a3, 2 * SIZE(Y1) LD a4, 3 * SIZE(Y1) LD a5, 4 * SIZE(Y1) LD a6, 5 * SIZE(Y1) LD a7, 6 * SIZE(Y1) LD a8, 7 * SIZE(Y1) ST a1, 0 * SIZE(Y) ST a2, 1 * SIZE(Y) daddu Y, Y, INCY ST a3, 0 * SIZE(Y) ST a4, 1 * SIZE(Y) daddu Y, Y, INCY ST a5, 0 * SIZE(Y) ST a6, 1 * SIZE(Y) daddu Y, Y, INCY ST a7, 0 * SIZE(Y) ST a8, 1 * SIZE(Y) daddu Y, Y, INCY daddiu I, I, -1 bgtz I, .L902 daddiu Y1, Y1, 8 * SIZE .align 3 .L905: andi I, M, 3 blez I, .L999 NOP .align 3 .L906: LD a1, 0 * SIZE(Y1) LD a2, 1 * SIZE(Y1) daddiu Y1, Y1, 2 * SIZE ST a1, 0 * SIZE(Y) ST a2, 1 * SIZE(Y) daddiu I, I, -1 bgtz I, .L906 daddu Y, Y, INCY .align 3 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) LDARG $18, 16($sp) LDARG $19, 24($sp) ldc1 $f24, 32($sp) ldc1 $f25, 40($sp) #ifndef __64BIT__ ldc1 $f20, 48($sp) ldc1 $f21, 56($sp) ldc1 $f22, 
64($sp) ldc1 $f23, 72($sp) #endif j $31 #ifdef __64BIT__ daddiu $sp, $sp, 64 #else daddiu $sp, $sp, 80 #endif EPILOGUE OpenBLAS-0.2.20/kernel/mips64/zsymv_U.S000066400000000000000000000337231313527062700173560ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
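The zsymv_U kernel that follows mirrors zsymv_L for the upper-triangle storage case, including the preliminary step both kernels share: when INCX or INCY is not one complex element, x and y are first repacked into the contiguous BUFFER, and the y copy is rounded up to a 256-byte boundary (the li TEMP, -256 / and BUFFER, BUFFER, TEMP sequence). A minimal C sketch of that packing step follows; the function names are illustrative and not part of the OpenBLAS sources.

#include <stdint.h>

/* Copy n strided complex elements into a contiguous work buffer and
   return the first free slot after the copy. */
static double *pack_complex(const double *src, long n, long inc_complex,
                            double *buffer)
{
    for (long k = 0; k < n; k++) {
        buffer[2 * k]     = src[0];
        buffer[2 * k + 1] = src[1];
        src += 2 * inc_complex;          /* advance by the stride */
    }
    return buffer + 2 * n;
}

/* Round a buffer pointer up to a 256-byte boundary, as done before the
   y copy in these kernels. */
static double *align_256(double *p)
{
    return (double *)(((uintptr_t)p + 255) & ~(uintptr_t)255);
}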
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M $4 #define A $7 #define LDA $8 #define X $9 #define INCX $10 #define Y $11 #define INCY $5 #define BUFFER $6 #define XX $12 #define YY $13 #define I $14 #define IS $15 #define AO1 $16 #define AO2 $17 #define Y1 $18 #define TEMP $19 #define ALPHA_R $f13 #define ALPHA_I $f14 #define a1 $f0 #define a2 $f1 #define a3 $f2 #define a4 $f3 #define a5 $f4 #define a6 $f5 #define a7 $f6 #define a8 $f7 #define alpha1 $f8 #define alpha2 $f9 #define alpha3 $f10 #define alpha4 $f11 #define x1 $f12 #define x2 $f15 #define x3 $f16 #define x4 $f17 #define xsum1 $f18 #define xsum2 $f19 #define xsum3 $f20 #define xsum4 $f21 #define ysum1 $f22 #define ysum2 $f23 #define ysum3 $f24 #define ysum4 $f25 #ifndef HEMV #define ADD1 NMSUB #define ADD2 MADD #else #define ADD1 MADD #define ADD2 NMSUB #endif PROLOGUE LDARG INCY, 0($sp) LDARG BUFFER, 8($sp) #ifdef __64BIT__ daddiu $sp, $sp, -64 #else daddiu $sp, $sp, -80 #endif SDARG $16, 0($sp) dsll LDA, LDA, ZBASE_SHIFT SDARG $17, 8($sp) dsll INCX, INCX, ZBASE_SHIFT SDARG $18, 16($sp) dsll INCY, INCY, ZBASE_SHIFT SDARG $19, 24($sp) nop sdc1 $f24, 32($sp) sdc1 $f25, 40($sp) #ifndef __64BIT__ sdc1 $f20, 48($sp) sdc1 $f21, 56($sp) sdc1 $f22, 64($sp) sdc1 $f23, 72($sp) #endif blez M, .L999 li IS, 2 * SIZE beq IS, INCX, .L05 move Y1, Y dsra I, M, 2 move XX, X blez I, .L02 move X, BUFFER .align 3 .L01: LD a1, 0 * SIZE(XX) LD a2, 1 * SIZE(XX) daddu XX, XX, INCX LD a3, 0 * SIZE(XX) LD a4, 1 * SIZE(XX) daddu XX, XX, INCX LD a5, 0 * SIZE(XX) LD a6, 1 * SIZE(XX) daddu XX, XX, INCX LD a7, 0 * SIZE(XX) LD a8, 1 * SIZE(XX) daddu XX, XX, INCX ST a1, 0 * SIZE(BUFFER) ST a2, 1 * SIZE(BUFFER) ST a3, 2 * SIZE(BUFFER) ST a4, 3 * SIZE(BUFFER) ST a5, 4 * SIZE(BUFFER) ST a6, 5 * SIZE(BUFFER) ST a7, 6 * SIZE(BUFFER) ST a8, 7 * SIZE(BUFFER) daddiu I, I, -1 bgtz I, .L01 daddiu BUFFER, BUFFER, 8 * SIZE .align 3 .L02: andi I, M, 3 blez I, .L05 NOP .align 3 .L03: LD a1, 0 * SIZE(XX) LD a2, 1 * SIZE(XX) daddu XX, XX, INCX ST a1, 0 * SIZE(BUFFER) ST a2, 1 * SIZE(BUFFER) daddiu I, I, -1 bgtz I, .L03 daddiu BUFFER, BUFFER, 2 * SIZE .align 3 .L05: beq IS, INCY, .L10 daddiu BUFFER, BUFFER, 255 li TEMP, -256 and BUFFER, BUFFER, TEMP dsra I, M, 2 move Y1, BUFFER blez I, .L07 move YY, Y .align 3 .L06: LD a1, 0 * SIZE(YY) LD a2, 1 * SIZE(YY) daddu YY, YY, INCY LD a3, 0 * SIZE(YY) LD a4, 1 * SIZE(YY) daddu YY, YY, INCY LD a5, 0 * SIZE(YY) LD a6, 1 * SIZE(YY) daddu YY, YY, INCY LD a7, 0 * SIZE(YY) LD a8, 1 * SIZE(YY) daddu YY, YY, INCY ST a1, 0 * SIZE(BUFFER) ST a2, 1 * SIZE(BUFFER) ST a3, 2 * SIZE(BUFFER) ST a4, 3 * SIZE(BUFFER) ST a5, 4 * SIZE(BUFFER) ST a6, 5 * SIZE(BUFFER) ST a7, 6 * SIZE(BUFFER) ST a8, 7 * SIZE(BUFFER) daddiu I, I, -1 bgtz I, .L06 daddiu BUFFER, BUFFER, 8 * SIZE .align 3 .L07: andi I, M, 3 blez I, .L10 NOP .align 3 .L08: LD a1, 0 * SIZE(YY) LD a2, 1 * SIZE(YY) daddu YY, YY, INCY ST a1, 0 * SIZE(BUFFER) ST a2, 1 * SIZE(BUFFER) daddiu I, I, -1 bgtz I, .L08 daddiu BUFFER, BUFFER, 2 * SIZE .align 3 .L10: slti TEMP, M, 2 nop bgtz TEMP, .L20 li IS, 0 .align 3 .L11: dsll TEMP, IS, ZBASE_SHIFT daddu TEMP, X, TEMP LD x1, 0 * SIZE(TEMP) LD x2, 1 * SIZE(TEMP) LD x3, 2 * SIZE(TEMP) LD x4, 3 * SIZE(TEMP) MTC $0, xsum1 MTC $0, xsum2 MTC $0, xsum3 MTC $0, xsum4 MUL alpha1, ALPHA_R, x1 move AO1, A MUL alpha2, ALPHA_I, x1 dsra I, IS, 1 MUL alpha3, ALPHA_R, x3 daddu AO2, A, LDA MUL alpha4, ALPHA_I, x3 daddu A, AO2, LDA NMSUB alpha1, alpha1, ALPHA_I, x2 move XX, X MADD alpha2, alpha2, 
ALPHA_R, x2 move YY, Y1 NMSUB alpha3, alpha3, ALPHA_I, x4 MADD alpha4, alpha4, ALPHA_R, x4 blez I, .L15 daddiu I, I, -1 LD x1, 0 * SIZE(XX) LD x2, 1 * SIZE(XX) LD x4, 3 * SIZE(XX) LD a1, 0 * SIZE(AO1) LD a2, 1 * SIZE(AO1) LD a3, 2 * SIZE(AO1) LD a4, 3 * SIZE(AO1) LD a5, 0 * SIZE(AO2) LD a6, 1 * SIZE(AO2) LD a7, 2 * SIZE(AO2) LD a8, 3 * SIZE(AO2) LD ysum1, 0 * SIZE(YY) blez I, .L13 LD ysum2, 1 * SIZE(YY) .align 3 .L12: MADD ysum1, ysum1, alpha1, a1 LD ysum3, 2 * SIZE(YY) MADD ysum2, ysum2, alpha2, a1 LD ysum4, 3 * SIZE(YY) MADD xsum1, xsum1, x1, a1 LD a8, 3 * SIZE(AO2) MADD xsum2, xsum2, x2, a1 LD a1, 4 * SIZE(AO1) MADD ysum3, ysum3, alpha1, a3 LD x3, 2 * SIZE(XX) MADD ysum4, ysum4, alpha2, a3 daddiu I, I, -1 MADD xsum3, xsum3, x1, a5 MADD xsum4, xsum4, x2, a5 NMSUB ysum1, ysum1, alpha2, a2 MADD ysum2, ysum2, alpha1, a2 ADD1 xsum1, xsum1, x2, a2 daddiu AO2, AO2, 4 * SIZE ADD2 xsum2, xsum2, x1, a2 LD a2, 5 * SIZE(AO1) NMSUB ysum3, ysum3, alpha2, a4 MADD ysum4, ysum4, alpha1, a4 ADD1 xsum3, xsum3, x2, a6 LD x2, 5 * SIZE(XX) ADD2 xsum4, xsum4, x1, a6 LD x1, 4 * SIZE(XX) MADD ysum1, ysum1, alpha3, a5 MADD ysum2, ysum2, alpha4, a5 MADD xsum1, xsum1, x3, a3 LD a5, 0 * SIZE(AO2) MADD xsum2, xsum2, x4, a3 LD a3, 6 * SIZE(AO1) MADD ysum3, ysum3, alpha3, a7 MADD ysum4, ysum4, alpha4, a7 MADD xsum3, xsum3, x3, a7 daddiu AO1, AO1, 4 * SIZE MADD xsum4, xsum4, x4, a7 LD a7, 2 * SIZE(AO2) NMSUB ysum1, ysum1, alpha4, a6 daddiu XX, XX, 4 * SIZE MADD ysum2, ysum2, alpha3, a6 LD a6, 1 * SIZE(AO2) ADD1 xsum1, xsum1, x4, a4 daddiu YY, YY, 4 * SIZE ADD2 xsum2, xsum2, x3, a4 LD a4, 3 * SIZE(AO1) NMSUB ysum3, ysum3, alpha4, a8 ST ysum1,-4 * SIZE(YY) MADD ysum4, ysum4, alpha3, a8 ST ysum2,-3 * SIZE(YY) LD ysum1, 0 * SIZE(YY) LD ysum2, 1 * SIZE(YY) ADD1 xsum3, xsum3, x4, a8 LD x4, 3 * SIZE(XX) ADD2 xsum4, xsum4, x3, a8 ST ysum3,-2 * SIZE(YY) bgtz I, .L12 ST ysum4,-1 * SIZE(YY) .align 3 .L13: MADD ysum1, ysum1, alpha1, a1 LD ysum3, 2 * SIZE(YY) MADD ysum2, ysum2, alpha2, a1 LD ysum4, 3 * SIZE(YY) MADD xsum1, xsum1, x1, a1 LD a8, 3 * SIZE(AO2) MADD xsum2, xsum2, x2, a1 LD x3, 2 * SIZE(XX) MADD ysum3, ysum3, alpha1, a3 MADD ysum4, ysum4, alpha2, a3 MADD xsum3, xsum3, x1, a5 MADD xsum4, xsum4, x2, a5 NMSUB ysum1, ysum1, alpha2, a2 MADD ysum2, ysum2, alpha1, a2 ADD1 xsum1, xsum1, x2, a2 ADD2 xsum2, xsum2, x1, a2 NMSUB ysum3, ysum3, alpha2, a4 MADD ysum4, ysum4, alpha1, a4 ADD1 xsum3, xsum3, x2, a6 ADD2 xsum4, xsum4, x1, a6 MADD ysum1, ysum1, alpha3, a5 MADD ysum2, ysum2, alpha4, a5 MADD xsum1, xsum1, x3, a3 MADD xsum2, xsum2, x4, a3 MADD ysum3, ysum3, alpha3, a7 MADD ysum4, ysum4, alpha4, a7 MADD xsum3, xsum3, x3, a7 MADD xsum4, xsum4, x4, a7 NMSUB ysum1, ysum1, alpha4, a6 MADD ysum2, ysum2, alpha3, a6 ADD1 xsum1, xsum1, x4, a4 ADD2 xsum2, xsum2, x3, a4 NMSUB ysum3, ysum3, alpha4, a8 daddiu XX, XX, 4 * SIZE MADD ysum4, ysum4, alpha3, a8 daddiu YY, YY, 4 * SIZE ADD1 xsum3, xsum3, x4, a8 daddiu AO1, AO1, 4 * SIZE ADD2 xsum4, xsum4, x3, a8 daddiu AO2, AO2, 4 * SIZE ST ysum1, -4 * SIZE(YY) ST ysum2, -3 * SIZE(YY) ST ysum3, -2 * SIZE(YY) ST ysum4, -1 * SIZE(YY) .align 3 .L15: dsll TEMP, IS, ZBASE_SHIFT daddu TEMP, Y1, TEMP LD ysum1, 0 * SIZE(TEMP) LD ysum2, 1 * SIZE(TEMP) LD ysum3, 2 * SIZE(TEMP) LD ysum4, 3 * SIZE(TEMP) LD a1, 0 * SIZE(AO1) LD a2, 1 * SIZE(AO1) LD a3, 2 * SIZE(AO1) LD a4, 3 * SIZE(AO1) LD a5, 0 * SIZE(AO2) LD a6, 1 * SIZE(AO2) LD a7, 2 * SIZE(AO2) LD a8, 3 * SIZE(AO2) MOV x1, xsum1 MOV x2, xsum2 MOV x3, xsum3 MOV x4, xsum4 MUL xsum1, ALPHA_R, xsum1 MUL xsum2, ALPHA_R, xsum2 MUL xsum3, ALPHA_R, xsum3 MUL 
xsum4, ALPHA_R, xsum4 NMSUB xsum1, xsum1, ALPHA_I, x2 MADD xsum2, xsum2, ALPHA_I, x1 NMSUB xsum3, xsum3, ALPHA_I, x4 MADD xsum4, xsum4, ALPHA_I, x3 MADD xsum1, xsum1, alpha1, a1 MADD xsum2, xsum2, alpha2, a1 MADD xsum3, xsum3, alpha1, a5 MADD xsum4, xsum4, alpha2, a5 #ifndef HEMV ADD1 xsum1, xsum1, alpha2, a2 ADD2 xsum2, xsum2, alpha1, a2 #endif ADD1 xsum3, xsum3, alpha2, a6 ADD2 xsum4, xsum4, alpha1, a6 MADD xsum1, xsum1, alpha3, a5 MADD xsum2, xsum2, alpha4, a5 MADD xsum3, xsum3, alpha3, a7 MADD xsum4, xsum4, alpha4, a7 NMSUB xsum1, xsum1, alpha4, a6 MADD xsum2, xsum2, alpha3, a6 #ifndef HEMV ADD1 xsum3, xsum3, alpha4, a8 ADD2 xsum4, xsum4, alpha3, a8 #endif ADD ysum1, ysum1, xsum1 ADD ysum2, ysum2, xsum2 ADD ysum3, ysum3, xsum3 ADD ysum4, ysum4, xsum4 ST ysum1, 0 * SIZE(TEMP) ST ysum2, 1 * SIZE(TEMP) ST ysum3, 2 * SIZE(TEMP) ST ysum4, 3 * SIZE(TEMP) daddiu TEMP, IS, 4 slt TEMP, M, TEMP beqz TEMP, .L11 daddiu IS, IS, 2 .align 3 .L20: andi TEMP, M, 1 nop blez TEMP, .L900 nop dsll TEMP, IS, ZBASE_SHIFT daddu TEMP, X, TEMP LD x1, 0 * SIZE(TEMP) LD x2, 1 * SIZE(TEMP) MTC $0, xsum1 MTC $0, xsum2 MUL alpha1, ALPHA_R, x1 move AO1, A MUL alpha2, ALPHA_I, x1 move I, IS daddu A, AO1, LDA NMSUB alpha1, alpha1, ALPHA_I, x2 move XX, X MADD alpha2, alpha2, ALPHA_R, x2 move YY, Y1 blez I, .L25 daddiu I, I, -1 LD x1, 0 * SIZE(XX) LD x2, 1 * SIZE(XX) LD a1, 0 * SIZE(AO1) LD a2, 1 * SIZE(AO1) LD ysum1, 0 * SIZE(YY) blez I, .L23 LD ysum2, 1 * SIZE(YY) .align 3 .L22: MADD ysum1, ysum1, alpha1, a1 daddiu XX, XX, 2 * SIZE MADD ysum2, ysum2, alpha2, a1 daddiu YY, YY, 2 * SIZE MADD xsum1, xsum1, x1, a1 daddiu AO1, AO1, 2 * SIZE MADD xsum2, xsum2, x2, a1 daddiu I, I, -1 NMSUB ysum1, ysum1, alpha2, a2 MADD ysum2, ysum2, alpha1, a2 ADD1 xsum1, xsum1, x2, a2 LD x2, 1 * SIZE(XX) ADD2 xsum2, xsum2, x1, a2 LD x1, 0 * SIZE(XX) LD a1, 0 * SIZE(AO1) LD a2, 1 * SIZE(AO1) ST ysum1, -2 * SIZE(YY) LD ysum1, 0 * SIZE(YY) ST ysum2, -1 * SIZE(YY) bgtz I, .L22 LD ysum2, 1 * SIZE(YY) .align 3 .L23: MADD ysum1, ysum1, alpha1, a1 MADD ysum2, ysum2, alpha2, a1 MADD xsum1, xsum1, x1, a1 MADD xsum2, xsum2, x2, a1 NMSUB ysum1, ysum1, alpha2, a2 daddiu XX, XX, 2 * SIZE MADD ysum2, ysum2, alpha1, a2 daddiu YY, YY, 2 * SIZE ADD1 xsum1, xsum1, x2, a2 daddiu AO1, AO1, 2 * SIZE ADD2 xsum2, xsum2, x1, a2 nop ST ysum1, -2 * SIZE(YY) ST ysum2, -1 * SIZE(YY) .align 3 .L25: dsll TEMP, IS, ZBASE_SHIFT daddu TEMP, Y1, TEMP LD ysum1, 0 * SIZE(TEMP) LD ysum2, 1 * SIZE(TEMP) LD a1, 0 * SIZE(AO1) LD a2, 1 * SIZE(AO1) MOV x1, xsum1 MOV x2, xsum2 MUL xsum1, ALPHA_R, xsum1 MUL xsum2, ALPHA_R, xsum2 NMSUB xsum1, xsum1, ALPHA_I, x2 MADD xsum2, xsum2, ALPHA_I, x1 MADD xsum1, xsum1, alpha1, a1 MADD xsum2, xsum2, alpha2, a1 #ifndef HEMV NMSUB xsum1, xsum1, alpha2, a2 MADD xsum2, xsum2, alpha1, a2 #endif ADD ysum1, ysum1, xsum1 ADD ysum2, ysum2, xsum2 ST ysum1, 0 * SIZE(TEMP) ST ysum2, 1 * SIZE(TEMP) .align 3 .L900: li IS, 2 * SIZE beq INCY, IS, .L999 NOP dsra I, M, 2 blez I, .L905 NOP .align 3 .L902: LD a1, 0 * SIZE(Y1) LD a2, 1 * SIZE(Y1) LD a3, 2 * SIZE(Y1) LD a4, 3 * SIZE(Y1) LD a5, 4 * SIZE(Y1) LD a6, 5 * SIZE(Y1) LD a7, 6 * SIZE(Y1) LD a8, 7 * SIZE(Y1) ST a1, 0 * SIZE(Y) ST a2, 1 * SIZE(Y) daddu Y, Y, INCY ST a3, 0 * SIZE(Y) ST a4, 1 * SIZE(Y) daddu Y, Y, INCY ST a5, 0 * SIZE(Y) ST a6, 1 * SIZE(Y) daddu Y, Y, INCY ST a7, 0 * SIZE(Y) ST a8, 1 * SIZE(Y) daddu Y, Y, INCY daddiu I, I, -1 bgtz I, .L902 daddiu Y1, Y1, 8 * SIZE .align 3 .L905: andi I, M, 3 blez I, .L999 NOP .align 3 .L906: LD a1, 0 * SIZE(Y1) LD a2, 1 * SIZE(Y1) daddiu Y1, Y1, 2 * SIZE ST a1, 0 
* SIZE(Y) ST a2, 1 * SIZE(Y) daddiu I, I, -1 bgtz I, .L906 daddu Y, Y, INCY .align 3 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) LDARG $18, 16($sp) LDARG $19, 24($sp) ldc1 $f24, 32($sp) ldc1 $f25, 40($sp) #ifndef __64BIT__ ldc1 $f20, 48($sp) ldc1 $f21, 56($sp) ldc1 $f22, 64($sp) ldc1 $f23, 72($sp) #endif j $31 #ifdef __64BIT__ daddiu $sp, $sp, 64 #else daddiu $sp, $sp, 80 #endif EPILOGUE OpenBLAS-0.2.20/kernel/mips64/ztrsm_kernel_LT.S000066400000000000000000000675151313527062700210260ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
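The ztrsm_kernel_LT source that follows is a blocked triangular-solve kernel specialized at compile time by the LN/LT/RN/RT and CONJ switches visible in its prologue; it combines GEMM-like MADD1..MADD4 accumulation over the packed panels with substitution-style updates against the diagonal block, using OFFSET/KK for the panel bookkeeping. For orientation only, here is the scalar operation such a left-side solve reduces to (forward substitution on a lower-triangular, non-unit matrix); the kernel's packed data layout, conjugation cases, and edge handling are deliberately not modeled, and the name ztrsv_lower_ref is illustrative.

#include <complex.h>

/* Plain complex forward substitution: solve A * b = b0 in place, where A is
   lower triangular with a non-unit diagonal, column-major, leading dim lda. */
static void ztrsv_lower_ref(long n, const double complex *a, long lda,
                            double complex *b)
{
    for (long i = 0; i < n; i++) {
        double complex s = b[i];
        for (long j = 0; j < i; j++)
            s -= a[i + j * lda] * b[j];   /* subtract already-solved terms  */
        b[i] = s / a[i + i * lda];        /* divide by the diagonal element */
    }
}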
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M $4 #define N $5 #define K $6 #define A $9 #define B $10 #define C $11 #define LDC $8 #define AO $12 #define BO $13 #define I $2 #define J $3 #define L $7 #define CO1 $14 #define CO2 $15 #define CO3 $16 #define CO4 $17 #define OFFSET $18 #define KK $19 #define TEMP $20 #define AORIG $21 #define a1 $f0 #define a2 $f1 #define a3 $f26 #define a4 $f27 #define b1 $f2 #define b2 $f3 #define b3 $f4 #define b4 $f5 #define b5 $f6 #define b6 $f7 #define b7 $f8 #define b8 $f9 #define a5 b8 #define c11 $f10 #define c12 $f11 #define c21 $f12 #define c22 $f13 #define c31 $f14 #define c32 $f15 #define c41 $f16 #define c42 $f17 #define c51 $f18 #define c52 $f19 #define c61 $f20 #define c62 $f21 #define c71 $f22 #define c72 $f23 #define c81 $f24 #define c82 $f25 #ifndef CONJ #define MADD1 MADD #define MADD2 MADD #define MADD3 MADD #define MADD4 NMSUB #define MADD5 MSUB #define MADD6 MADD #define MADD7 NMSUB #define MADD8 MADD #else #if defined(LN) || defined(LT) #define MADD1 MADD #define MADD2 NMSUB #define MADD3 MADD #define MADD4 MADD #else #define MADD1 MADD #define MADD2 MADD #define MADD3 NMSUB #define MADD4 MADD #endif #define MADD5 MADD #define MADD6 MSUB #define MADD7 MADD #define MADD8 NMSUB #endif PROLOGUE daddiu $sp, $sp, -128 SDARG $16, 0($sp) SDARG $17, 8($sp) SDARG $18, 16($sp) SDARG $19, 24($sp) SDARG $20, 32($sp) SDARG $21, 40($sp) sdc1 $f24, 48($sp) sdc1 $f25, 56($sp) sdc1 $f26, 64($sp) sdc1 $f27, 72($sp) #ifndef __64BIT__ sdc1 $f20, 88($sp) sdc1 $f21, 96($sp) sdc1 $f22,104($sp) sdc1 $f23,112($sp) #endif LDARG LDC, 128 + 0($sp) LDARG OFFSET, 128 + 8($sp) dsll LDC, LDC, ZBASE_SHIFT #ifdef LN mult M, K mflo TEMP dsll TEMP, TEMP, ZBASE_SHIFT daddu A, A, TEMP dsll TEMP, M, ZBASE_SHIFT daddu C, C, TEMP #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mult N, K mflo TEMP dsll TEMP, TEMP, ZBASE_SHIFT daddu B, B, TEMP mult N, LDC mflo TEMP daddu C, C, TEMP dsubu KK, N, OFFSET #endif dsra J, N, 2 blez J, .L20 nop .L10: #ifdef RT dsll TEMP, K, 2 + ZBASE_SHIFT dsubu B, B, TEMP dsll TEMP, LDC, 2 dsubu C, C, TEMP #endif move CO1, C MTC $0, c11 daddu CO2, C, LDC daddu CO3, CO2, LDC daddiu J, J, -1 daddu CO4, CO3, LDC MOV c21, c11 MOV c31, c11 MOV c41, c11 MOV c51, c11 move I, M #ifdef LN daddu KK, M, OFFSET #endif #ifdef LT move KK, OFFSET #endif #if defined(LN) || defined(RT) move AORIG, A #else move AO, A #endif #ifndef RT daddu C, CO4, LDC #endif blez I, .L19 MOV c61, c11 .align 3 .L11: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) MOV c71, c11 LD b1, 0 * SIZE(B) MOV c81, c11 LD a3, 4 * SIZE(AO) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 dsra L, KK, 2 MOV c32, c11 LD b3, 2 * SIZE(B) MOV c42, c11 LD b4, 3 * SIZE(B) MOV c52, c11 LD b5, 4 * SIZE(B) MOV c62, c11 LD b6, 8 * SIZE(B) MOV c72, c11 LD b7, 12 * SIZE(B) MOV c82, c11 blez L, .L15 move BO, B #else #ifdef LN dsll TEMP, K, ZBASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, ZBASE_SHIFT dsll TEMP, KK, 2 + ZBASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) MOV c71, c11 LD b1, 0 * SIZE(BO) MOV c81, c11 LD a3, 4 * SIZE(AO) MOV c12, c11 LD b2, 1 * SIZE(BO) MOV c22, c11 dsra L, TEMP, 2 MOV c32, c11 LD b3, 2 * SIZE(BO) MOV c42, c11 LD b4, 3 * SIZE(BO) MOV c52, c11 LD b5, 4 * SIZE(BO) MOV c62, c11 LD b6, 8 * SIZE(BO) MOV c72, c11 LD b7, 12 * SIZE(BO) MOV c82, c11 blez L, .L15 NOP #endif MADD1 c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD3 c21, c21, a1, b2 daddiu L, L, -1 MADD1 c31, c31, a1, b3 
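/* Accumulation scheme in the unrolled loop that follows: each cN1/cN2 pair
   holds the two partial products of one complex multiply.  With the macro
   choices above for the non-CONJ case (MADD4 = NMSUB), c11 += a_r*b_r,
   c22 -= a_i*b_i, c12 += a_i*b_r and c21 += a_r*b_i; the pairs are folded
   at .L18 (c11+c22 gives the real part, c12+c21 the imaginary part), and
   the c31/c42, c51/c62 and c71/c82 pairs follow the same pattern for the
   other three columns of the micro-tile. */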
NOP blez L, .L13 MADD3 c41, c41, a1, b4 .align 3 .L12: MADD2 c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD1 c51, c51, a1, b5 NOP MADD3 c61, c61, a1, b2 LD a4, 2 * SIZE(AO) MADD1 c71, c71, a1, b3 NOP MADD3 c81, c81, a1, b4 LD a1, 8 * SIZE(AO) MADD2 c52, c52, a2, b5 LD b5, 20 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 9 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 10 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 11 * SIZE(BO) MADD1 c11, c11, a4, b6 LD a2, 3 * SIZE(AO) MADD3 c21, c21, a4, b2 NOP MADD1 c31, c31, a4, b3 NOP MADD3 c41, c41, a4, b4 NOP MADD2 c12, c12, a2, b6 LD b6, 24 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD1 c51, c51, a4, b7 NOP MADD3 c61, c61, a4, b2 NOP MADD1 c71, c71, a4, b3 NOP MADD3 c81, c81, a4, b4 NOP MADD2 c52, c52, a2, b7 LD b7, 28 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 17 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 18 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 19 * SIZE(BO) MADD1 c11, c11, a3, b1 LD a2, 5 * SIZE(AO) MADD3 c21, c21, a3, b2 NOP MADD1 c31, c31, a3, b3 NOP MADD3 c41, c41, a3, b4 NOP MADD2 c12, c12, a2, b1 LD b1, 32 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 21 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 22 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 23 * SIZE(BO) MADD1 c51, c51, a3, b5 NOP MADD3 c61, c61, a3, b2 LD a4, 6 * SIZE(AO) MADD1 c71, c71, a3, b3 NOP MADD3 c81, c81, a3, b4 LD a3, 12 * SIZE(AO) MADD2 c52, c52, a2, b5 LD b5, 36 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 25 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 26 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 27 * SIZE(BO) MADD1 c11, c11, a4, b6 LD a2, 7 * SIZE(AO) MADD3 c21, c21, a4, b2 NOP MADD1 c31, c31, a4, b3 NOP MADD3 c41, c41, a4, b4 daddiu L, L, -1 MADD2 c12, c12, a2, b6 LD b6, 40 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 29 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 30 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 31 * SIZE(BO) MADD1 c51, c51, a4, b7 daddiu BO, BO, 32 * SIZE MADD3 c61, c61, a4, b2 daddiu AO, AO, 8 * SIZE MADD1 c71, c71, a4, b3 NOP MADD3 c81, c81, a4, b4 NOP MADD2 c52, c52, a2, b7 LD b7, 12 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 3 * SIZE(BO) MADD1 c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD3 c21, c21, a1, b2 NOP MADD1 c31, c31, a1, b3 NOP bgtz L, .L12 MADD3 c41, c41, a1, b4 .align 3 .L13: MADD2 c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD1 c51, c51, a1, b5 NOP MADD3 c61, c61, a1, b2 LD a4, 2 * SIZE(AO) MADD1 c71, c71, a1, b3 NOP MADD3 c81, c81, a1, b4 LD a1, 8 * SIZE(AO) MADD2 c52, c52, a2, b5 LD b5, 20 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 9 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 10 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 11 * SIZE(BO) MADD1 c11, c11, a4, b6 LD a2, 3 * SIZE(AO) MADD3 c21, c21, a4, b2 NOP MADD1 c31, c31, a4, b3 NOP MADD3 c41, c41, a4, b4 NOP MADD2 c12, c12, a2, b6 LD b6, 24 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD1 c51, c51, a4, b7 NOP MADD3 c61, c61, a4, b2 NOP MADD1 c71, c71, a4, b3 NOP MADD3 c81, c81, a4, b4 NOP MADD2 c52, c52, a2, b7 LD b7, 28 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 17 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 18 * SIZE(BO) 
MADD4 c82, c82, a2, b4 LD b4, 19 * SIZE(BO) MADD1 c11, c11, a3, b1 LD a2, 5 * SIZE(AO) MADD3 c21, c21, a3, b2 NOP MADD1 c31, c31, a3, b3 NOP MADD3 c41, c41, a3, b4 NOP MADD2 c12, c12, a2, b1 LD b1, 32 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 21 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 22 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 23 * SIZE(BO) MADD1 c51, c51, a3, b5 NOP MADD3 c61, c61, a3, b2 LD a4, 6 * SIZE(AO) MADD1 c71, c71, a3, b3 NOP MADD3 c81, c81, a3, b4 LD a3, 12 * SIZE(AO) MADD2 c52, c52, a2, b5 LD b5, 36 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 25 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 26 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 27 * SIZE(BO) MADD1 c11, c11, a4, b6 LD a2, 7 * SIZE(AO) MADD3 c21, c21, a4, b2 NOP MADD1 c31, c31, a4, b3 NOP MADD3 c41, c41, a4, b4 NOP MADD2 c12, c12, a2, b6 LD b6, 40 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 29 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 30 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 31 * SIZE(BO) MADD1 c51, c51, a4, b7 daddiu BO, BO, 32 * SIZE MADD3 c61, c61, a4, b2 daddiu AO, AO, 8 * SIZE MADD1 c71, c71, a4, b3 NOP MADD3 c81, c81, a4, b4 NOP MADD2 c52, c52, a2, b7 LD b7, 12 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 3 * SIZE(BO) .align 3 .L15: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif blez L, .L18 NOP .align 3 .L16: MADD1 c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD3 c21, c21, a1, b2 NOP MADD1 c31, c31, a1, b3 NOP MADD3 c41, c41, a1, b4 NOP MADD2 c12, c12, a2, b1 LD b1, 8 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD1 c51, c51, a1, b5 daddiu L, L, -1 MADD3 c61, c61, a1, b2 daddiu AO, AO, 2 * SIZE MADD1 c71, c71, a1, b3 daddiu BO, BO, 8 * SIZE MADD3 c81, c81, a1, b4 LD a1, 0 * SIZE(AO) MADD2 c52, c52, a2, b5 LD b5, 4 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD4 c82, c82, a2, b4 bgtz L, .L16 LD b4, 3 * SIZE(BO) .L18: ADD c11, c11, c22 ADD c12, c12, c21 ADD c31, c31, c42 ADD c32, c32, c41 ADD c51, c51, c62 ADD c52, c52, c61 ADD c71, c71, c82 ADD c72, c72, c81 #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -1 #else daddiu TEMP, KK, -4 #endif dsll L, TEMP, ZBASE_SHIFT dsll TEMP, TEMP, 2 + ZBASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) SUB c11, b1, c11 SUB c12, b2, c12 SUB c31, b3, c31 SUB c32, b4, c32 SUB c51, b5, c51 SUB c52, b6, c52 SUB c71, b7, c71 SUB c72, b8, c72 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) LD b5, 4 * SIZE(AO) LD b6, 5 * SIZE(AO) LD b7, 6 * SIZE(AO) LD b8, 7 * SIZE(AO) SUB c11, b1, c11 SUB c12, b2, c12 SUB c31, b3, c31 SUB c32, b4, c32 SUB c51, b5, c51 SUB c52, b6, c52 SUB c71, b7, c71 SUB c72, b8, c72 #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) MUL a1, b2, c12 MUL a2, b2, c11 MUL a3, b2, c32 MUL a4, b2, c31 MADD5 c11, a1, b1, c11 MADD6 c12, a2, b1, c12 MADD5 c31, a3, b1, c31 MADD6 c32, a4, b1, c32 MUL a1, b2, c52 MUL a2, b2, c51 MUL a3, b2, c72 MUL a4, b2, c71 MADD5 c51, a1, b1, c51 MADD6 c52, a2, b1, c52 MADD5 c71, a3, b1, c71 MADD6 c72, a4, b1, c72 #endif #ifdef RN LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 5 * 
SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MUL a1, b2, c12 MUL a2, b2, c11 MADD5 c11, a1, b1, c11 MADD6 c12, a2, b1, c12 NMSUB c31, c31, b3, c11 MADD7 c32, c32, b4, c11 NMSUB c51, c51, b5, c11 MADD7 c52, c52, b6, c11 NMSUB c71, c71, b7, c11 MADD7 c72, c72, b8, c11 MADD8 c31, c31, b4, c12 NMSUB c32, c32, b3, c12 MADD8 c51, c51, b6, c12 NMSUB c52, c52, b5, c12 MADD8 c71, c71, b8, c12 NMSUB c72, c72, b7, c12 LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MUL a1, b4, c32 MUL a2, b4, c31 MADD5 c31, a1, b3, c31 MADD6 c32, a2, b3, c32 NMSUB c51, c51, b5, c31 MADD7 c52, c52, b6, c31 NMSUB c71, c71, b7, c31 MADD7 c72, c72, b8, c31 MADD8 c51, c51, b6, c32 NMSUB c52, c52, b5, c32 MADD8 c71, c71, b8, c32 NMSUB c72, c72, b7, c32 LD b5, 20 * SIZE(BO) LD b6, 21 * SIZE(BO) LD b7, 22 * SIZE(BO) LD b8, 23 * SIZE(BO) MUL a1, b6, c52 MUL a2, b6, c51 MADD5 c51, a1, b5, c51 MADD6 c52, a2, b5, c52 NMSUB c71, c71, b7, c51 MADD7 c72, c72, b8, c51 MADD8 c71, c71, b8, c52 NMSUB c72, c72, b7, c52 LD b7, 30 * SIZE(BO) LD b8, 31 * SIZE(BO) MUL a1, b8, c72 MUL a2, b8, c71 MADD5 c71, a1, b7, c71 MADD6 c72, a2, b7, c72 #endif #ifdef RT LD b1, 30 * SIZE(BO) LD b2, 31 * SIZE(BO) LD b3, 28 * SIZE(BO) LD b4, 29 * SIZE(BO) LD b5, 26 * SIZE(BO) LD b6, 27 * SIZE(BO) LD b7, 24 * SIZE(BO) LD b8, 25 * SIZE(BO) MUL a1, b2, c72 MUL a2, b2, c71 MADD5 c71, a1, b1, c71 MADD6 c72, a2, b1, c72 NMSUB c51, c51, b3, c71 MADD7 c52, c52, b4, c71 NMSUB c31, c31, b5, c71 MADD7 c32, c32, b6, c71 NMSUB c11, c11, b7, c71 MADD7 c12, c12, b8, c71 MADD8 c51, c51, b4, c72 NMSUB c52, c52, b3, c72 MADD8 c31, c31, b6, c72 NMSUB c32, c32, b5, c72 MADD8 c11, c11, b8, c72 NMSUB c12, c12, b7, c72 LD b3, 20 * SIZE(BO) LD b4, 21 * SIZE(BO) LD b5, 18 * SIZE(BO) LD b6, 19 * SIZE(BO) LD b7, 16 * SIZE(BO) LD b8, 17 * SIZE(BO) MUL a1, b4, c52 MUL a2, b4, c51 MADD5 c51, a1, b3, c51 MADD6 c52, a2, b3, c52 NMSUB c31, c31, b5, c51 MADD7 c32, c32, b6, c51 NMSUB c11, c11, b7, c51 MADD7 c12, c12, b8, c51 MADD8 c31, c31, b6, c52 NMSUB c32, c32, b5, c52 MADD8 c11, c11, b8, c52 NMSUB c12, c12, b7, c52 LD b5, 10 * SIZE(BO) LD b6, 11 * SIZE(BO) LD b7, 8 * SIZE(BO) LD b8, 9 * SIZE(BO) MUL a1, b6, c32 MUL a2, b6, c31 MADD5 c31, a1, b5, c31 MADD6 c32, a2, b5, c32 NMSUB c11, c11, b7, c31 MADD7 c12, c12, b8, c31 MADD8 c11, c11, b8, c32 NMSUB c12, c12, b7, c32 LD b7, 0 * SIZE(BO) LD b8, 1 * SIZE(BO) MUL a1, b8, c12 MUL a2, b8, c11 MADD5 c11, a1, b7, c11 MADD6 c12, a2, b7, c12 #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c12, 1 * SIZE(BO) ST c31, 2 * SIZE(BO) ST c32, 3 * SIZE(BO) ST c51, 4 * SIZE(BO) ST c52, 5 * SIZE(BO) ST c71, 6 * SIZE(BO) ST c72, 7 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c12, 1 * SIZE(AO) ST c31, 2 * SIZE(AO) ST c32, 3 * SIZE(AO) ST c51, 4 * SIZE(AO) ST c52, 5 * SIZE(AO) ST c71, 6 * SIZE(AO) ST c72, 7 * SIZE(AO) #endif #ifdef LN daddiu CO1,CO1, -2 * SIZE daddiu CO2,CO2, -2 * SIZE daddiu CO3,CO3, -2 * SIZE daddiu CO4,CO4, -2 * SIZE #endif ST c11, 0 * SIZE(CO1) ST c12, 1 * SIZE(CO1) ST c31, 0 * SIZE(CO2) ST c32, 1 * SIZE(CO2) ST c51, 0 * SIZE(CO3) ST c52, 1 * SIZE(CO3) ST c71, 0 * SIZE(CO4) ST c72, 1 * SIZE(CO4) #ifndef LN daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE daddiu CO3,CO3, 2 * SIZE daddiu CO4,CO4, 2 * SIZE #endif #ifdef RT dsll TEMP, K, ZBASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, ZBASE_SHIFT dsll TEMP, TEMP, 2 + ZBASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 
1 #endif #ifdef LN daddiu KK, KK, -1 #endif MTC $0, c11 daddiu I, I, -1 MOV c21, c11 MOV c31, c11 MOV c41, c11 MOV c51, c11 bgtz I, .L11 MOV c61, c11 .align 3 .L19: #ifdef LN dsll TEMP, K, 2 + ZBASE_SHIFT daddu B, B, TEMP #endif #if defined(LT) || defined(RN) move B, BO #endif #ifdef RN daddiu KK, KK, 4 #endif #ifdef RT daddiu KK, KK, -4 #endif bgtz J, .L10 NOP .align 3 .L20: andi J, N, 2 blez J, .L30 NOP #ifdef RT dsll TEMP, K, 1 + ZBASE_SHIFT dsubu B, B, TEMP dsll TEMP, LDC, 1 dsubu C, C, TEMP #endif MTC $0, c11 move CO1, C daddu CO2, C, LDC #ifdef LN daddu KK, M, OFFSET #endif #ifdef LT move KK, OFFSET #endif #if defined(LN) || defined(RT) move AORIG, A #else move AO, A #endif #ifndef RT daddu C, CO2, LDC #endif move I, M blez I, .L29 NOP .align 3 .L21: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) MOV c21, c11 LD b1, 0 * SIZE(B) MOV c31, c11 LD a3, 4 * SIZE(AO) MOV c41, c11 LD b2, 1 * SIZE(B) dsra L, KK, 2 LD b3, 2 * SIZE(B) MOV c12, c11 LD b4, 3 * SIZE(B) MOV c22, c11 LD b5, 4 * SIZE(B) MOV c32, c11 NOP MOV c42, c11 blez L, .L25 move BO, B #else #ifdef LN dsll TEMP, K, ZBASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, ZBASE_SHIFT dsll TEMP, KK, 1 + ZBASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) MOV c21, c11 LD b1, 0 * SIZE(BO) MOV c31, c11 LD a3, 4 * SIZE(AO) MOV c41, c11 LD b2, 1 * SIZE(BO) dsra L, TEMP, 2 LD b3, 2 * SIZE(BO) MOV c12, c11 LD b4, 3 * SIZE(BO) MOV c22, c11 LD b5, 4 * SIZE(BO) MOV c32, c11 blez L, .L25 MOV c42, c11 #endif .align 3 .L22: MADD1 c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD3 c21, c21, a1, b2 daddiu L, L, -1 MADD1 c31, c31, a1, b3 NOP MADD3 c41, c41, a1, b4 LD a1, 2 * SIZE(AO) MADD2 c12, c12, a2, b1 LD b1, 8 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD1 c11, c11, a1, b5 LD a2, 3 * SIZE(AO) MADD3 c21, c21, a1, b2 NOP MADD1 c31, c31, a1, b3 NOP MADD3 c41, c41, a1, b4 LD a1, 8 * SIZE(AO) MADD2 c12, c12, a2, b5 LD b5, 12 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 9 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 10 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 11 * SIZE(BO) MADD1 c11, c11, a3, b1 LD a2, 5 * SIZE(AO) MADD3 c21, c21, a3, b2 NOP MADD1 c31, c31, a3, b3 NOP MADD3 c41, c41, a3, b4 LD a3, 6 * SIZE(AO) MADD2 c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD1 c11, c11, a3, b5 LD a2, 7 * SIZE(AO) MADD3 c21, c21, a3, b2 daddiu AO, AO, 8 * SIZE MADD1 c31, c31, a3, b3 NOP MADD3 c41, c41, a3, b4 LD a3, 4 * SIZE(AO) MADD2 c12, c12, a2, b5 LD b5, 20 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 17 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 18 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 19 * SIZE(BO) bgtz L, .L22 daddiu BO, BO, 16 * SIZE .align 3 .L25: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif blez L, .L28 NOP .align 3 .L26: MADD1 c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD3 c21, c21, a1, b2 daddiu L, L, -1 MADD1 c31, c31, a1, b3 daddiu BO, BO, 4 * SIZE MADD3 c41, c41, a1, b4 LD a1, 2 * SIZE(AO) MADD2 c12, c12, a2, b1 LD b1, 0 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 1 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 2 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 3 * SIZE(BO) bgtz L, .L26 daddiu AO, AO, 2 * SIZE .L28: ADD c11, c11, c22 ADD c12, c12, c21 ADD c31, c31, c42 ADD c32, c32, c41 #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -1 #else daddiu TEMP, KK, -2 #endif dsll L, TEMP, 
ZBASE_SHIFT dsll TEMP, TEMP, 1 + ZBASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) SUB c11, b1, c11 SUB c12, b2, c12 SUB c31, b3, c31 SUB c32, b4, c32 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) SUB c11, b1, c11 SUB c12, b2, c12 SUB c31, b3, c31 SUB c32, b4, c32 #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) MUL a1, b2, c12 MUL a2, b2, c11 MUL a3, b2, c32 MUL a4, b2, c31 MADD5 c11, a1, b1, c11 MADD6 c12, a2, b1, c12 MADD5 c31, a3, b1, c31 MADD6 c32, a4, b1, c32 #endif #ifdef RN LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MUL a1, b2, c12 MUL a2, b2, c11 MADD5 c11, a1, b1, c11 MADD6 c12, a2, b1, c12 NMSUB c31, c31, b3, c11 MADD7 c32, c32, b4, c11 MADD8 c31, c31, b4, c12 NMSUB c32, c32, b3, c12 LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) MUL a1, b4, c32 MUL a2, b4, c31 MADD5 c31, a1, b3, c31 MADD6 c32, a2, b3, c32 #endif #ifdef RT LD b5, 6 * SIZE(BO) LD b6, 7 * SIZE(BO) LD b7, 4 * SIZE(BO) LD b8, 5 * SIZE(BO) MUL a1, b6, c32 MUL a2, b6, c31 MADD5 c31, a1, b5, c31 MADD6 c32, a2, b5, c32 NMSUB c11, c11, b7, c31 MADD7 c12, c12, b8, c31 MADD8 c11, c11, b8, c32 NMSUB c12, c12, b7, c32 LD b7, 0 * SIZE(BO) LD b8, 1 * SIZE(BO) MUL a1, b8, c12 MUL a2, b8, c11 MADD5 c11, a1, b7, c11 MADD6 c12, a2, b7, c12 #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c12, 1 * SIZE(BO) ST c31, 2 * SIZE(BO) ST c32, 3 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c12, 1 * SIZE(AO) ST c31, 2 * SIZE(AO) ST c32, 3 * SIZE(AO) #endif #ifdef LN daddiu CO1,CO1, -2 * SIZE daddiu CO2,CO2, -2 * SIZE #endif ST c11, 0 * SIZE(CO1) ST c12, 1 * SIZE(CO1) ST c31, 0 * SIZE(CO2) ST c32, 1 * SIZE(CO2) #ifndef LN daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE #endif MTC $0, c11 #ifdef RT dsll TEMP, K, ZBASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, ZBASE_SHIFT dsll TEMP, TEMP, 1 + ZBASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 1 #endif #ifdef LN daddiu KK, KK, -1 #endif daddiu I, I, -1 bgtz I, .L21 NOP .align 3 .L29: #ifdef LN dsll TEMP, K, 1 + ZBASE_SHIFT daddu B, B, TEMP #endif #if defined(LT) || defined(RN) move B, BO #endif #ifdef RN daddiu KK, KK, 2 #endif #ifdef RT daddiu KK, KK, -2 #endif .align 3 .L30: andi J, N, 1 blez J, .L999 NOP #ifdef RT dsll TEMP, K, ZBASE_SHIFT dsubu B, B, TEMP dsubu C, C, LDC #endif MTC $0, c11 move CO1, C #ifdef LN daddu KK, M, OFFSET #endif #ifdef LT move KK, OFFSET #endif #if defined(LN) || defined(RT) move AORIG, A #else move AO, A #endif #ifndef RT daddu C, CO1, LDC #endif move I, M blez I, .L39 NOP .align 3 .L31: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) MOV c21, c11 LD b1, 0 * SIZE(B) MOV c31, c11 LD a2, 1 * SIZE(AO) MOV c41, c11 LD b2, 1 * SIZE(B) MOV c12, c11 dsra L, KK, 2 MOV c22, c11 LD a3, 4 * SIZE(AO) MOV c32, c11 LD b3, 4 * SIZE(B) NOP MOV c42, c11 blez L, .L35 move BO, B #else #ifdef LN dsll TEMP, K, ZBASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll TEMP, KK, ZBASE_SHIFT daddu AO, AORIG, TEMP daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) MOV c21, c11 LD b1, 0 * SIZE(BO) MOV c31, c11 LD a2, 1 * SIZE(AO) MOV c41, c11 LD b2, 1 * SIZE(BO) MOV c12, c11 dsra L, TEMP, 2 MOV c22, c11 LD a3, 4 * SIZE(AO) MOV c32, c11 LD b3, 4 * SIZE(BO) blez L, .L35 MOV c42, c11 #endif .align 3 .L32: MADD1 c11, c11, a1, b1 LD b4, 3 * SIZE(BO) MADD3 c21, c21, a1, b2 LD a1, 2 * 
SIZE(AO) MADD2 c12, c12, a2, b1 LD b1, 2 * SIZE(BO) MADD4 c22, c22, a2, b2 LD a2, 3 * SIZE(AO) MADD1 c11, c11, a1, b1 LD b2, 5 * SIZE(BO) MADD3 c21, c21, a1, b4 LD a1, 8 * SIZE(AO) MADD2 c12, c12, a2, b1 LD b1, 8 * SIZE(BO) MADD4 c22, c22, a2, b4 LD a2, 5 * SIZE(AO) MADD1 c11, c11, a3, b3 LD b4, 7 * SIZE(BO) MADD3 c21, c21, a3, b2 LD a3, 6 * SIZE(AO) MADD2 c12, c12, a2, b3 LD b3, 6 * SIZE(BO) MADD4 c22, c22, a2, b2 LD a2, 7 * SIZE(AO) MADD1 c11, c11, a3, b3 LD b2, 9 * SIZE(BO) MADD3 c21, c21, a3, b4 LD a3, 12 * SIZE(AO) MADD2 c12, c12, a2, b3 LD b3, 12 * SIZE(BO) MADD4 c22, c22, a2, b4 LD a2, 9 * SIZE(AO) daddiu AO, AO, 8 * SIZE daddiu L, L, -1 bgtz L, .L32 daddiu BO, BO, 8 * SIZE .align 3 .L35: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif blez L, .L38 NOP .align 3 .L36: MADD1 c11, c11, a1, b1 daddiu L, L, -1 MADD3 c21, c21, a1, b2 LD a1, 2 * SIZE(AO) MADD2 c12, c12, a2, b1 LD b1, 2 * SIZE(BO) MADD4 c22, c22, a2, b2 LD a2, 3 * SIZE(AO) LD b2, 3 * SIZE(BO) daddiu BO, BO, 2 * SIZE bgtz L, .L36 daddiu AO, AO, 2 * SIZE .L38: ADD c11, c11, c22 ADD c12, c12, c21 #if defined(LN) || defined(RT) daddiu TEMP, KK, -1 dsll TEMP, TEMP, ZBASE_SHIFT daddu AO, AORIG, TEMP daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) SUB c11, b1, c11 SUB c12, b2, c12 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) SUB c11, b1, c11 SUB c12, b2, c12 #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) MUL a1, b2, c12 MUL a2, b2, c11 MADD5 c11, a1, b1, c11 MADD6 c12, a2, b1, c12 #endif #if defined(RN) || defined(RT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MUL a1, b2, c12 MUL a2, b2, c11 MADD5 c11, a1, b1, c11 MADD6 c12, a2, b1, c12 #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c12, 1 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c12, 1 * SIZE(AO) #endif #ifdef LN daddiu CO1,CO1, -2 * SIZE #endif ST c11, 0 * SIZE(CO1) ST c12, 1 * SIZE(CO1) #ifndef LN daddiu CO1,CO1, 2 * SIZE #endif MTC $0, c11 #ifdef RT dsll TEMP, K, ZBASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll TEMP, TEMP, ZBASE_SHIFT daddu AO, AO, TEMP daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 1 #endif #ifdef LN daddiu KK, KK, -1 #endif daddiu I, I, -1 bgtz I, .L31 NOP .align 3 .L39: #ifdef LN dsll TEMP, K, ZBASE_SHIFT daddu B, B, TEMP #endif #if defined(LT) || defined(RN) move B, BO #endif #ifdef RN daddiu KK, KK, 1 #endif #ifdef RT daddiu KK, KK, -1 #endif .align 3 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) LDARG $18, 16($sp) LDARG $19, 24($sp) LDARG $20, 32($sp) LDARG $21, 40($sp) ldc1 $f24, 48($sp) ldc1 $f25, 56($sp) ldc1 $f26, 64($sp) ldc1 $f27, 72($sp) #ifndef __64BIT__ ldc1 $f20, 88($sp) ldc1 $f21, 96($sp) ldc1 $f22,104($sp) ldc1 $f23,112($sp) #endif j $31 daddiu $sp, $sp, 128 EPILOGUE OpenBLAS-0.2.20/kernel/mips64/ztrsm_kernel_RT.S000066400000000000000000000675141313527062700210330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M $4 #define N $5 #define K $6 #define A $9 #define B $10 #define C $11 #define LDC $8 #define AO $12 #define BO $13 #define I $2 #define J $3 #define L $7 #define CO1 $14 #define CO2 $15 #define CO3 $16 #define CO4 $17 #define OFFSET $18 #define KK $19 #define TEMP $20 #define AORIG $21 #define a1 $f0 #define a2 $f1 #define a3 $f26 #define a4 $f27 #define b1 $f2 #define b2 $f3 #define b3 $f4 #define b4 $f5 #define b5 $f6 #define b6 $f7 #define b7 $f8 #define b8 $f9 #define a5 b8 #define c11 $f10 #define c12 $f11 #define c21 $f12 #define c22 $f13 #define c31 $f14 #define c32 $f15 #define c41 $f16 #define c42 $f17 #define c51 $f18 #define c52 $f19 #define c61 $f20 #define c62 $f21 #define c71 $f22 #define c72 $f23 #define c81 $f24 #define c82 $f25 #ifndef CONJ #define MADD1 MADD #define MADD2 MADD #define MADD3 MADD #define MADD4 NMSUB #define MADD5 MSUB #define MADD6 MADD #define MADD7 NMSUB #define MADD8 MADD #else #if defined(LN) || defined(LT) #define MADD1 MADD #define MADD2 NMSUB #define MADD3 MADD #define MADD4 MADD #else #define MADD1 MADD #define MADD2 MADD #define MADD3 NMSUB #define MADD4 MADD #endif #define MADD5 MADD #define MADD6 MSUB #define MADD7 MADD #define MADD8 NMSUB #endif PROLOGUE daddiu $sp, $sp, -128 SDARG $16, 0($sp) SDARG $17, 8($sp) SDARG $18, 16($sp) SDARG $19, 24($sp) SDARG $20, 32($sp) SDARG $21, 40($sp) sdc1 $f24, 48($sp) sdc1 $f25, 56($sp) sdc1 $f26, 64($sp) sdc1 $f27, 72($sp) #ifndef __64BIT__ sdc1 $f20, 88($sp) sdc1 $f21, 96($sp) sdc1 $f22,104($sp) sdc1 $f23,112($sp) #endif LDARG LDC, 128 + 0($sp) LDARG OFFSET, 128 + 8($sp) dsll LDC, LDC, ZBASE_SHIFT #ifdef LN mult M, K mflo TEMP dsll TEMP, TEMP, ZBASE_SHIFT daddu A, A, TEMP dsll TEMP, M, ZBASE_SHIFT daddu C, C, TEMP #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mult N, K mflo TEMP dsll TEMP, TEMP, ZBASE_SHIFT daddu B, B, TEMP mult N, LDC mflo TEMP daddu C, C, TEMP dsubu KK, N, OFFSET #endif andi J, N, 1 blez J, .L20 NOP #ifdef RT dsll TEMP, K, ZBASE_SHIFT dsubu B, B, TEMP dsubu C, C, LDC #endif MTC $0, c11 move CO1, C #ifdef LN daddu KK, M, OFFSET #endif #ifdef LT move KK, 
OFFSET #endif #if defined(LN) || defined(RT) move AORIG, A #else move AO, A #endif #ifndef RT daddu C, CO1, LDC #endif move I, M blez I, .L39 NOP .align 3 .L31: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) MOV c21, c11 LD b1, 0 * SIZE(B) MOV c31, c11 LD a2, 1 * SIZE(AO) MOV c41, c11 LD b2, 1 * SIZE(B) MOV c12, c11 dsra L, KK, 2 MOV c22, c11 LD a3, 4 * SIZE(AO) MOV c32, c11 LD b3, 4 * SIZE(B) NOP MOV c42, c11 blez L, .L35 move BO, B #else #ifdef LN dsll TEMP, K, ZBASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll TEMP, KK, ZBASE_SHIFT daddu AO, AORIG, TEMP daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) MOV c21, c11 LD b1, 0 * SIZE(BO) MOV c31, c11 LD a2, 1 * SIZE(AO) MOV c41, c11 LD b2, 1 * SIZE(BO) MOV c12, c11 dsra L, TEMP, 2 MOV c22, c11 LD a3, 4 * SIZE(AO) MOV c32, c11 LD b3, 4 * SIZE(BO) blez L, .L35 MOV c42, c11 #endif .align 3 .L32: MADD1 c11, c11, a1, b1 LD b4, 3 * SIZE(BO) MADD3 c21, c21, a1, b2 LD a1, 2 * SIZE(AO) MADD2 c12, c12, a2, b1 LD b1, 2 * SIZE(BO) MADD4 c22, c22, a2, b2 LD a2, 3 * SIZE(AO) MADD1 c11, c11, a1, b1 LD b2, 5 * SIZE(BO) MADD3 c21, c21, a1, b4 LD a1, 8 * SIZE(AO) MADD2 c12, c12, a2, b1 LD b1, 8 * SIZE(BO) MADD4 c22, c22, a2, b4 LD a2, 5 * SIZE(AO) MADD1 c11, c11, a3, b3 LD b4, 7 * SIZE(BO) MADD3 c21, c21, a3, b2 LD a3, 6 * SIZE(AO) MADD2 c12, c12, a2, b3 LD b3, 6 * SIZE(BO) MADD4 c22, c22, a2, b2 LD a2, 7 * SIZE(AO) MADD1 c11, c11, a3, b3 LD b2, 9 * SIZE(BO) MADD3 c21, c21, a3, b4 LD a3, 12 * SIZE(AO) MADD2 c12, c12, a2, b3 LD b3, 12 * SIZE(BO) MADD4 c22, c22, a2, b4 LD a2, 9 * SIZE(AO) daddiu AO, AO, 8 * SIZE daddiu L, L, -1 bgtz L, .L32 daddiu BO, BO, 8 * SIZE .align 3 .L35: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif blez L, .L38 NOP .align 3 .L36: MADD1 c11, c11, a1, b1 daddiu L, L, -1 MADD3 c21, c21, a1, b2 LD a1, 2 * SIZE(AO) MADD2 c12, c12, a2, b1 LD b1, 2 * SIZE(BO) MADD4 c22, c22, a2, b2 LD a2, 3 * SIZE(AO) LD b2, 3 * SIZE(BO) daddiu BO, BO, 2 * SIZE bgtz L, .L36 daddiu AO, AO, 2 * SIZE .L38: ADD c11, c11, c22 ADD c12, c12, c21 #if defined(LN) || defined(RT) daddiu TEMP, KK, -1 dsll TEMP, TEMP, ZBASE_SHIFT daddu AO, AORIG, TEMP daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) SUB c11, b1, c11 SUB c12, b2, c12 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) SUB c11, b1, c11 SUB c12, b2, c12 #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) MUL a1, b2, c12 MUL a2, b2, c11 MADD5 c11, a1, b1, c11 MADD6 c12, a2, b1, c12 #endif #if defined(RN) || defined(RT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MUL a1, b2, c12 MUL a2, b2, c11 MADD5 c11, a1, b1, c11 MADD6 c12, a2, b1, c12 #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c12, 1 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c12, 1 * SIZE(AO) #endif #ifdef LN daddiu CO1,CO1, -2 * SIZE #endif ST c11, 0 * SIZE(CO1) ST c12, 1 * SIZE(CO1) #ifndef LN daddiu CO1,CO1, 2 * SIZE #endif MTC $0, c11 #ifdef RT dsll TEMP, K, ZBASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll TEMP, TEMP, ZBASE_SHIFT daddu AO, AO, TEMP daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 1 #endif #ifdef LN daddiu KK, KK, -1 #endif daddiu I, I, -1 bgtz I, .L31 NOP .align 3 .L39: #ifdef LN dsll TEMP, K, ZBASE_SHIFT daddu B, B, TEMP #endif #if defined(LT) || defined(RN) move B, BO #endif #ifdef RN daddiu KK, KK, 1 #endif #ifdef RT daddiu KK, KK, -1 #endif .align 3 .L20: andi J, N, 2 blez J, .L30 NOP #ifdef RT dsll TEMP, K, 1 + ZBASE_SHIFT dsubu B, B, TEMP dsll TEMP, LDC, 1 dsubu 
C, C, TEMP #endif MTC $0, c11 move CO1, C daddu CO2, C, LDC #ifdef LN daddu KK, M, OFFSET #endif #ifdef LT move KK, OFFSET #endif #if defined(LN) || defined(RT) move AORIG, A #else move AO, A #endif #ifndef RT daddu C, CO2, LDC #endif move I, M blez I, .L29 NOP .align 3 .L21: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) MOV c21, c11 LD b1, 0 * SIZE(B) MOV c31, c11 LD a3, 4 * SIZE(AO) MOV c41, c11 LD b2, 1 * SIZE(B) dsra L, KK, 2 LD b3, 2 * SIZE(B) MOV c12, c11 LD b4, 3 * SIZE(B) MOV c22, c11 LD b5, 4 * SIZE(B) MOV c32, c11 NOP MOV c42, c11 blez L, .L25 move BO, B #else #ifdef LN dsll TEMP, K, ZBASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, ZBASE_SHIFT dsll TEMP, KK, 1 + ZBASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) MOV c21, c11 LD b1, 0 * SIZE(BO) MOV c31, c11 LD a3, 4 * SIZE(AO) MOV c41, c11 LD b2, 1 * SIZE(BO) dsra L, TEMP, 2 LD b3, 2 * SIZE(BO) MOV c12, c11 LD b4, 3 * SIZE(BO) MOV c22, c11 LD b5, 4 * SIZE(BO) MOV c32, c11 blez L, .L25 MOV c42, c11 #endif .align 3 .L22: MADD1 c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD3 c21, c21, a1, b2 daddiu L, L, -1 MADD1 c31, c31, a1, b3 NOP MADD3 c41, c41, a1, b4 LD a1, 2 * SIZE(AO) MADD2 c12, c12, a2, b1 LD b1, 8 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD1 c11, c11, a1, b5 LD a2, 3 * SIZE(AO) MADD3 c21, c21, a1, b2 NOP MADD1 c31, c31, a1, b3 NOP MADD3 c41, c41, a1, b4 LD a1, 8 * SIZE(AO) MADD2 c12, c12, a2, b5 LD b5, 12 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 9 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 10 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 11 * SIZE(BO) MADD1 c11, c11, a3, b1 LD a2, 5 * SIZE(AO) MADD3 c21, c21, a3, b2 NOP MADD1 c31, c31, a3, b3 NOP MADD3 c41, c41, a3, b4 LD a3, 6 * SIZE(AO) MADD2 c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD1 c11, c11, a3, b5 LD a2, 7 * SIZE(AO) MADD3 c21, c21, a3, b2 daddiu AO, AO, 8 * SIZE MADD1 c31, c31, a3, b3 NOP MADD3 c41, c41, a3, b4 LD a3, 4 * SIZE(AO) MADD2 c12, c12, a2, b5 LD b5, 20 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 17 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 18 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 19 * SIZE(BO) bgtz L, .L22 daddiu BO, BO, 16 * SIZE .align 3 .L25: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif blez L, .L28 NOP .align 3 .L26: MADD1 c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD3 c21, c21, a1, b2 daddiu L, L, -1 MADD1 c31, c31, a1, b3 daddiu BO, BO, 4 * SIZE MADD3 c41, c41, a1, b4 LD a1, 2 * SIZE(AO) MADD2 c12, c12, a2, b1 LD b1, 0 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 1 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 2 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 3 * SIZE(BO) bgtz L, .L26 daddiu AO, AO, 2 * SIZE .L28: ADD c11, c11, c22 ADD c12, c12, c21 ADD c31, c31, c42 ADD c32, c32, c41 #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -1 #else daddiu TEMP, KK, -2 #endif dsll L, TEMP, ZBASE_SHIFT dsll TEMP, TEMP, 1 + ZBASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) SUB c11, b1, c11 SUB c12, b2, c12 SUB c31, b3, c31 SUB c32, b4, c32 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) SUB c11, b1, c11 SUB c12, b2, c12 SUB c31, b3, c31 SUB c32, b4, c32 #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(AO) LD b2, 1 * 
SIZE(AO) MUL a1, b2, c12 MUL a2, b2, c11 MUL a3, b2, c32 MUL a4, b2, c31 MADD5 c11, a1, b1, c11 MADD6 c12, a2, b1, c12 MADD5 c31, a3, b1, c31 MADD6 c32, a4, b1, c32 #endif #ifdef RN LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MUL a1, b2, c12 MUL a2, b2, c11 MADD5 c11, a1, b1, c11 MADD6 c12, a2, b1, c12 NMSUB c31, c31, b3, c11 MADD7 c32, c32, b4, c11 MADD8 c31, c31, b4, c12 NMSUB c32, c32, b3, c12 LD b3, 6 * SIZE(BO) LD b4, 7 * SIZE(BO) MUL a1, b4, c32 MUL a2, b4, c31 MADD5 c31, a1, b3, c31 MADD6 c32, a2, b3, c32 #endif #ifdef RT LD b5, 6 * SIZE(BO) LD b6, 7 * SIZE(BO) LD b7, 4 * SIZE(BO) LD b8, 5 * SIZE(BO) MUL a1, b6, c32 MUL a2, b6, c31 MADD5 c31, a1, b5, c31 MADD6 c32, a2, b5, c32 NMSUB c11, c11, b7, c31 MADD7 c12, c12, b8, c31 MADD8 c11, c11, b8, c32 NMSUB c12, c12, b7, c32 LD b7, 0 * SIZE(BO) LD b8, 1 * SIZE(BO) MUL a1, b8, c12 MUL a2, b8, c11 MADD5 c11, a1, b7, c11 MADD6 c12, a2, b7, c12 #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c12, 1 * SIZE(BO) ST c31, 2 * SIZE(BO) ST c32, 3 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c12, 1 * SIZE(AO) ST c31, 2 * SIZE(AO) ST c32, 3 * SIZE(AO) #endif #ifdef LN daddiu CO1,CO1, -2 * SIZE daddiu CO2,CO2, -2 * SIZE #endif ST c11, 0 * SIZE(CO1) ST c12, 1 * SIZE(CO1) ST c31, 0 * SIZE(CO2) ST c32, 1 * SIZE(CO2) #ifndef LN daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE #endif MTC $0, c11 #ifdef RT dsll TEMP, K, ZBASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, ZBASE_SHIFT dsll TEMP, TEMP, 1 + ZBASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 1 #endif #ifdef LN daddiu KK, KK, -1 #endif daddiu I, I, -1 bgtz I, .L21 NOP .align 3 .L29: #ifdef LN dsll TEMP, K, 1 + ZBASE_SHIFT daddu B, B, TEMP #endif #if defined(LT) || defined(RN) move B, BO #endif #ifdef RN daddiu KK, KK, 2 #endif #ifdef RT daddiu KK, KK, -2 #endif .align 3 .L30: dsra J, N, 2 blez J, .L999 nop .L10: #ifdef RT dsll TEMP, K, 2 + ZBASE_SHIFT dsubu B, B, TEMP dsll TEMP, LDC, 2 dsubu C, C, TEMP #endif move CO1, C MTC $0, c11 daddu CO2, C, LDC daddu CO3, CO2, LDC daddiu J, J, -1 daddu CO4, CO3, LDC MOV c21, c11 MOV c31, c11 MOV c41, c11 MOV c51, c11 move I, M #ifdef LN daddu KK, M, OFFSET #endif #ifdef LT move KK, OFFSET #endif #if defined(LN) || defined(RT) move AORIG, A #else move AO, A #endif #ifndef RT daddu C, CO4, LDC #endif blez I, .L19 MOV c61, c11 .align 3 .L11: #if defined(LT) || defined(RN) LD a1, 0 * SIZE(AO) MOV c71, c11 LD b1, 0 * SIZE(B) MOV c81, c11 LD a3, 4 * SIZE(AO) MOV c12, c11 LD b2, 1 * SIZE(B) MOV c22, c11 dsra L, KK, 2 MOV c32, c11 LD b3, 2 * SIZE(B) MOV c42, c11 LD b4, 3 * SIZE(B) MOV c52, c11 LD b5, 4 * SIZE(B) MOV c62, c11 LD b6, 8 * SIZE(B) MOV c72, c11 LD b7, 12 * SIZE(B) MOV c82, c11 blez L, .L15 move BO, B #else #ifdef LN dsll TEMP, K, ZBASE_SHIFT dsubu AORIG, AORIG, TEMP #endif dsll L, KK, ZBASE_SHIFT dsll TEMP, KK, 2 + ZBASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) MOV c71, c11 LD b1, 0 * SIZE(BO) MOV c81, c11 LD a3, 4 * SIZE(AO) MOV c12, c11 LD b2, 1 * SIZE(BO) MOV c22, c11 dsra L, TEMP, 2 MOV c32, c11 LD b3, 2 * SIZE(BO) MOV c42, c11 LD b4, 3 * SIZE(BO) MOV c52, c11 LD b5, 4 * SIZE(BO) MOV c62, c11 LD b6, 8 * SIZE(BO) MOV c72, c11 LD b7, 12 * SIZE(BO) MOV c82, c11 blez L, .L15 NOP #endif MADD1 c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD3 c21, c21, a1, b2 daddiu L, L, -1 MADD1 c31, c31, a1, b3 NOP blez L, .L13 MADD3 c41, c41, a1, b4 .align 3 .L12: MADD2 c12, c12, a2, b1 LD b1, 16 * SIZE(BO) 
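/* Sketch of the unrolled loop at .L12 (assuming the usual packed complex     */
/* layout of AO and BO in these kernels): each pass consumes four K steps of  */
/* a 1 x 4 complex tile, one complex element of A against four complex        */
/* elements of B per step. The MADD1..MADD4 forms accumulate the four partial */
/* products of every complex multiply into the register pairs c11/c22,        */
/* c12/c21, c31/c42, c32/c41, c51/c62, c52/c61, c71/c82 and c72/c81, with the */
/* next A/B loads interleaved between the multiply-adds to hide latency; the  */
/* partial sums are folded together by the ADDs at .L18 further down.         */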
MADD4 c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD1 c51, c51, a1, b5 NOP MADD3 c61, c61, a1, b2 LD a4, 2 * SIZE(AO) MADD1 c71, c71, a1, b3 NOP MADD3 c81, c81, a1, b4 LD a1, 8 * SIZE(AO) MADD2 c52, c52, a2, b5 LD b5, 20 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 9 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 10 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 11 * SIZE(BO) MADD1 c11, c11, a4, b6 LD a2, 3 * SIZE(AO) MADD3 c21, c21, a4, b2 NOP MADD1 c31, c31, a4, b3 NOP MADD3 c41, c41, a4, b4 NOP MADD2 c12, c12, a2, b6 LD b6, 24 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD1 c51, c51, a4, b7 NOP MADD3 c61, c61, a4, b2 NOP MADD1 c71, c71, a4, b3 NOP MADD3 c81, c81, a4, b4 NOP MADD2 c52, c52, a2, b7 LD b7, 28 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 17 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 18 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 19 * SIZE(BO) MADD1 c11, c11, a3, b1 LD a2, 5 * SIZE(AO) MADD3 c21, c21, a3, b2 NOP MADD1 c31, c31, a3, b3 NOP MADD3 c41, c41, a3, b4 NOP MADD2 c12, c12, a2, b1 LD b1, 32 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 21 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 22 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 23 * SIZE(BO) MADD1 c51, c51, a3, b5 NOP MADD3 c61, c61, a3, b2 LD a4, 6 * SIZE(AO) MADD1 c71, c71, a3, b3 NOP MADD3 c81, c81, a3, b4 LD a3, 12 * SIZE(AO) MADD2 c52, c52, a2, b5 LD b5, 36 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 25 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 26 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 27 * SIZE(BO) MADD1 c11, c11, a4, b6 LD a2, 7 * SIZE(AO) MADD3 c21, c21, a4, b2 NOP MADD1 c31, c31, a4, b3 NOP MADD3 c41, c41, a4, b4 daddiu L, L, -1 MADD2 c12, c12, a2, b6 LD b6, 40 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 29 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 30 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 31 * SIZE(BO) MADD1 c51, c51, a4, b7 daddiu BO, BO, 32 * SIZE MADD3 c61, c61, a4, b2 daddiu AO, AO, 8 * SIZE MADD1 c71, c71, a4, b3 NOP MADD3 c81, c81, a4, b4 NOP MADD2 c52, c52, a2, b7 LD b7, 12 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 3 * SIZE(BO) MADD1 c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD3 c21, c21, a1, b2 NOP MADD1 c31, c31, a1, b3 NOP bgtz L, .L12 MADD3 c41, c41, a1, b4 .align 3 .L13: MADD2 c12, c12, a2, b1 LD b1, 16 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD1 c51, c51, a1, b5 NOP MADD3 c61, c61, a1, b2 LD a4, 2 * SIZE(AO) MADD1 c71, c71, a1, b3 NOP MADD3 c81, c81, a1, b4 LD a1, 8 * SIZE(AO) MADD2 c52, c52, a2, b5 LD b5, 20 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 9 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 10 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 11 * SIZE(BO) MADD1 c11, c11, a4, b6 LD a2, 3 * SIZE(AO) MADD3 c21, c21, a4, b2 NOP MADD1 c31, c31, a4, b3 NOP MADD3 c41, c41, a4, b4 NOP MADD2 c12, c12, a2, b6 LD b6, 24 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 13 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 14 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 15 * SIZE(BO) MADD1 c51, c51, a4, b7 NOP MADD3 c61, c61, a4, b2 NOP MADD1 c71, c71, a4, b3 NOP MADD3 c81, c81, a4, b4 NOP MADD2 c52, c52, a2, b7 LD b7, 28 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 17 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 18 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 19 * SIZE(BO) MADD1 c11, c11, a3, b1 LD a2, 5 * SIZE(AO) MADD3 c21, 
c21, a3, b2 NOP MADD1 c31, c31, a3, b3 NOP MADD3 c41, c41, a3, b4 NOP MADD2 c12, c12, a2, b1 LD b1, 32 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 21 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 22 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 23 * SIZE(BO) MADD1 c51, c51, a3, b5 NOP MADD3 c61, c61, a3, b2 LD a4, 6 * SIZE(AO) MADD1 c71, c71, a3, b3 NOP MADD3 c81, c81, a3, b4 LD a3, 12 * SIZE(AO) MADD2 c52, c52, a2, b5 LD b5, 36 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 25 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 26 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 27 * SIZE(BO) MADD1 c11, c11, a4, b6 LD a2, 7 * SIZE(AO) MADD3 c21, c21, a4, b2 NOP MADD1 c31, c31, a4, b3 NOP MADD3 c41, c41, a4, b4 NOP MADD2 c12, c12, a2, b6 LD b6, 40 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 29 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 30 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 31 * SIZE(BO) MADD1 c51, c51, a4, b7 daddiu BO, BO, 32 * SIZE MADD3 c61, c61, a4, b2 daddiu AO, AO, 8 * SIZE MADD1 c71, c71, a4, b3 NOP MADD3 c81, c81, a4, b4 NOP MADD2 c52, c52, a2, b7 LD b7, 12 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD4 c82, c82, a2, b4 LD b4, 3 * SIZE(BO) .align 3 .L15: #if defined(LT) || defined(RN) andi L, KK, 3 #else andi L, TEMP, 3 #endif blez L, .L18 NOP .align 3 .L16: MADD1 c11, c11, a1, b1 LD a2, 1 * SIZE(AO) MADD3 c21, c21, a1, b2 NOP MADD1 c31, c31, a1, b3 NOP MADD3 c41, c41, a1, b4 NOP MADD2 c12, c12, a2, b1 LD b1, 8 * SIZE(BO) MADD4 c22, c22, a2, b2 LD b2, 5 * SIZE(BO) MADD2 c32, c32, a2, b3 LD b3, 6 * SIZE(BO) MADD4 c42, c42, a2, b4 LD b4, 7 * SIZE(BO) MADD1 c51, c51, a1, b5 daddiu L, L, -1 MADD3 c61, c61, a1, b2 daddiu AO, AO, 2 * SIZE MADD1 c71, c71, a1, b3 daddiu BO, BO, 8 * SIZE MADD3 c81, c81, a1, b4 LD a1, 0 * SIZE(AO) MADD2 c52, c52, a2, b5 LD b5, 4 * SIZE(BO) MADD4 c62, c62, a2, b2 LD b2, 1 * SIZE(BO) MADD2 c72, c72, a2, b3 LD b3, 2 * SIZE(BO) MADD4 c82, c82, a2, b4 bgtz L, .L16 LD b4, 3 * SIZE(BO) .L18: ADD c11, c11, c22 ADD c12, c12, c21 ADD c31, c31, c42 ADD c32, c32, c41 ADD c51, c51, c62 ADD c52, c52, c61 ADD c71, c71, c82 ADD c72, c72, c81 #if defined(LN) || defined(RT) #ifdef LN daddiu TEMP, KK, -1 #else daddiu TEMP, KK, -4 #endif dsll L, TEMP, ZBASE_SHIFT dsll TEMP, TEMP, 2 + ZBASE_SHIFT daddu AO, AORIG, L daddu BO, B, TEMP #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) SUB c11, b1, c11 SUB c12, b2, c12 SUB c31, b3, c31 SUB c32, b4, c32 SUB c51, b5, c51 SUB c52, b6, c52 SUB c71, b7, c71 SUB c72, b8, c72 #else LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) LD b5, 4 * SIZE(AO) LD b6, 5 * SIZE(AO) LD b7, 6 * SIZE(AO) LD b8, 7 * SIZE(AO) SUB c11, b1, c11 SUB c12, b2, c12 SUB c31, b3, c31 SUB c32, b4, c32 SUB c51, b5, c51 SUB c52, b6, c52 SUB c71, b7, c71 SUB c72, b8, c72 #endif #if defined(LN) || defined(LT) LD b1, 0 * SIZE(AO) LD b2, 1 * SIZE(AO) MUL a1, b2, c12 MUL a2, b2, c11 MUL a3, b2, c32 MUL a4, b2, c31 MADD5 c11, a1, b1, c11 MADD6 c12, a2, b1, c12 MADD5 c31, a3, b1, c31 MADD6 c32, a4, b1, c32 MUL a1, b2, c52 MUL a2, b2, c51 MUL a3, b2, c72 MUL a4, b2, c71 MADD5 c51, a1, b1, c51 MADD6 c52, a2, b1, c52 MADD5 c71, a3, b1, c71 MADD6 c72, a4, b1, c72 #endif #ifdef RN LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MUL a1, b2, c12 MUL a2, b2, c11 MADD5 c11, a1, 
b1, c11 MADD6 c12, a2, b1, c12 NMSUB c31, c31, b3, c11 MADD7 c32, c32, b4, c11 NMSUB c51, c51, b5, c11 MADD7 c52, c52, b6, c11 NMSUB c71, c71, b7, c11 MADD7 c72, c72, b8, c11 MADD8 c31, c31, b4, c12 NMSUB c32, c32, b3, c12 MADD8 c51, c51, b6, c12 NMSUB c52, c52, b5, c12 MADD8 c71, c71, b8, c12 NMSUB c72, c72, b7, c12 LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) LD b7, 14 * SIZE(BO) LD b8, 15 * SIZE(BO) MUL a1, b4, c32 MUL a2, b4, c31 MADD5 c31, a1, b3, c31 MADD6 c32, a2, b3, c32 NMSUB c51, c51, b5, c31 MADD7 c52, c52, b6, c31 NMSUB c71, c71, b7, c31 MADD7 c72, c72, b8, c31 MADD8 c51, c51, b6, c32 NMSUB c52, c52, b5, c32 MADD8 c71, c71, b8, c32 NMSUB c72, c72, b7, c32 LD b5, 20 * SIZE(BO) LD b6, 21 * SIZE(BO) LD b7, 22 * SIZE(BO) LD b8, 23 * SIZE(BO) MUL a1, b6, c52 MUL a2, b6, c51 MADD5 c51, a1, b5, c51 MADD6 c52, a2, b5, c52 NMSUB c71, c71, b7, c51 MADD7 c72, c72, b8, c51 MADD8 c71, c71, b8, c52 NMSUB c72, c72, b7, c52 LD b7, 30 * SIZE(BO) LD b8, 31 * SIZE(BO) MUL a1, b8, c72 MUL a2, b8, c71 MADD5 c71, a1, b7, c71 MADD6 c72, a2, b7, c72 #endif #ifdef RT LD b1, 30 * SIZE(BO) LD b2, 31 * SIZE(BO) LD b3, 28 * SIZE(BO) LD b4, 29 * SIZE(BO) LD b5, 26 * SIZE(BO) LD b6, 27 * SIZE(BO) LD b7, 24 * SIZE(BO) LD b8, 25 * SIZE(BO) MUL a1, b2, c72 MUL a2, b2, c71 MADD5 c71, a1, b1, c71 MADD6 c72, a2, b1, c72 NMSUB c51, c51, b3, c71 MADD7 c52, c52, b4, c71 NMSUB c31, c31, b5, c71 MADD7 c32, c32, b6, c71 NMSUB c11, c11, b7, c71 MADD7 c12, c12, b8, c71 MADD8 c51, c51, b4, c72 NMSUB c52, c52, b3, c72 MADD8 c31, c31, b6, c72 NMSUB c32, c32, b5, c72 MADD8 c11, c11, b8, c72 NMSUB c12, c12, b7, c72 LD b3, 20 * SIZE(BO) LD b4, 21 * SIZE(BO) LD b5, 18 * SIZE(BO) LD b6, 19 * SIZE(BO) LD b7, 16 * SIZE(BO) LD b8, 17 * SIZE(BO) MUL a1, b4, c52 MUL a2, b4, c51 MADD5 c51, a1, b3, c51 MADD6 c52, a2, b3, c52 NMSUB c31, c31, b5, c51 MADD7 c32, c32, b6, c51 NMSUB c11, c11, b7, c51 MADD7 c12, c12, b8, c51 MADD8 c31, c31, b6, c52 NMSUB c32, c32, b5, c52 MADD8 c11, c11, b8, c52 NMSUB c12, c12, b7, c52 LD b5, 10 * SIZE(BO) LD b6, 11 * SIZE(BO) LD b7, 8 * SIZE(BO) LD b8, 9 * SIZE(BO) MUL a1, b6, c32 MUL a2, b6, c31 MADD5 c31, a1, b5, c31 MADD6 c32, a2, b5, c32 NMSUB c11, c11, b7, c31 MADD7 c12, c12, b8, c31 MADD8 c11, c11, b8, c32 NMSUB c12, c12, b7, c32 LD b7, 0 * SIZE(BO) LD b8, 1 * SIZE(BO) MUL a1, b8, c12 MUL a2, b8, c11 MADD5 c11, a1, b7, c11 MADD6 c12, a2, b7, c12 #endif #if defined(LN) || defined(LT) ST c11, 0 * SIZE(BO) ST c12, 1 * SIZE(BO) ST c31, 2 * SIZE(BO) ST c32, 3 * SIZE(BO) ST c51, 4 * SIZE(BO) ST c52, 5 * SIZE(BO) ST c71, 6 * SIZE(BO) ST c72, 7 * SIZE(BO) #else ST c11, 0 * SIZE(AO) ST c12, 1 * SIZE(AO) ST c31, 2 * SIZE(AO) ST c32, 3 * SIZE(AO) ST c51, 4 * SIZE(AO) ST c52, 5 * SIZE(AO) ST c71, 6 * SIZE(AO) ST c72, 7 * SIZE(AO) #endif #ifdef LN daddiu CO1,CO1, -2 * SIZE daddiu CO2,CO2, -2 * SIZE daddiu CO3,CO3, -2 * SIZE daddiu CO4,CO4, -2 * SIZE #endif ST c11, 0 * SIZE(CO1) ST c12, 1 * SIZE(CO1) ST c31, 0 * SIZE(CO2) ST c32, 1 * SIZE(CO2) ST c51, 0 * SIZE(CO3) ST c52, 1 * SIZE(CO3) ST c71, 0 * SIZE(CO4) ST c72, 1 * SIZE(CO4) #ifndef LN daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE daddiu CO3,CO3, 2 * SIZE daddiu CO4,CO4, 2 * SIZE #endif #ifdef RT dsll TEMP, K, ZBASE_SHIFT daddu AORIG, AORIG, TEMP #endif #if defined(LT) || defined(RN) dsubu TEMP, K, KK dsll L, TEMP, ZBASE_SHIFT dsll TEMP, TEMP, 2 + ZBASE_SHIFT daddu AO, AO, L daddu BO, BO, TEMP #endif #ifdef LT daddiu KK, KK, 1 #endif #ifdef LN daddiu KK, KK, -1 #endif MTC $0, c11 daddiu I, I, -1 MOV c21, c11 MOV c31, 
c11 MOV c41, c11 MOV c51, c11 bgtz I, .L11 MOV c61, c11 .align 3 .L19: #ifdef LN dsll TEMP, K, 2 + ZBASE_SHIFT daddu B, B, TEMP #endif #if defined(LT) || defined(RN) move B, BO #endif #ifdef RN daddiu KK, KK, 4 #endif #ifdef RT daddiu KK, KK, -4 #endif bgtz J, .L10 NOP .align 3 .L999: LDARG $16, 0($sp) LDARG $17, 8($sp) LDARG $18, 16($sp) LDARG $19, 24($sp) LDARG $20, 32($sp) LDARG $21, 40($sp) ldc1 $f24, 48($sp) ldc1 $f25, 56($sp) ldc1 $f26, 64($sp) ldc1 $f27, 72($sp) #ifndef __64BIT__ ldc1 $f20, 88($sp) ldc1 $f21, 96($sp) ldc1 $f22,104($sp) ldc1 $f23,112($sp) #endif j $31 daddiu $sp, $sp, 128 EPILOGUE OpenBLAS-0.2.20/kernel/power/000077500000000000000000000000001313527062700155605ustar00rootroot00000000000000OpenBLAS-0.2.20/kernel/power/KERNEL000066400000000000000000000015231313527062700164640ustar00rootroot00000000000000ifndef STRSMKERNEL_LN STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c endif ifndef STRSMKERNEL_LT STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c endif ifndef STRSMKERNEL_RN STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c endif ifndef STRSMKERNEL_RT STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif ifndef CTRSMKERNEL_LN CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c endif ifndef CTRSMKERNEL_LT CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c endif ifndef CTRSMKERNEL_RN CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c endif ifndef CTRSMKERNEL_RT CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif ifndef SGEMM_BETA SGEMM_BETA = gemm_beta.S endif ifndef DGEMM_BETA DGEMM_BETA = gemm_beta.S endif ifndef CGEMM_BETA CGEMM_BETA = zgemm_beta.S endif ifndef ZGEMM_BETA ZGEMM_BETA = zgemm_beta.S endif ifndef DSDOTKERNEL DSDOTKERNEL = ../generic/dot.c endif OpenBLAS-0.2.20/kernel/power/KERNEL.CELL000066400000000000000000000046541313527062700172120ustar00rootroot00000000000000SAMAXKERNEL = amax_cell.S DAMAXKERNEL = amax_cell.S CAMAXKERNEL = zamax_cell.S ZAMAXKERNEL = zamax_cell.S SAMINKERNEL = amin_cell.S DAMINKERNEL = amin_cell.S CAMINKERNEL = zamin_cell.S ZAMINKERNEL = zamin_cell.S SASUMKERNEL = asum_cell.S DASUMKERNEL = asum_cell.S CASUMKERNEL = zasum_cell.S ZASUMKERNEL = zasum_cell.S SDOTKERNEL = dot_cell.S DDOTKERNEL = dot_cell.S CDOTKERNEL = zdot_cell.S ZDOTKERNEL = zdot_cell.S SGEMMKERNEL = gemm_kernel_altivec_cell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_cell.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = gemm_ncopy_4.S DGEMMOTCOPY = gemm_tcopy_4.S DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_altivec_cell.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_cell.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) 
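# Note (a sketch of the kernel-selection fallback, not CELL-specific tuning
# advice): the S and C TRSM assignments below are commented out, so for this
# target they stay undefined and the ifndef guards in kernel/power/KERNEL
# supply the generic C implementations instead, e.g.
#
#   ifndef STRSMKERNEL_LN
#   STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
#   endif
#
# Only the D and Z TRSM variants use the Cell-tuned assembly kernels here.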
#STRSMKERNEL_LN = trsm_kernel_LN.S #STRSMKERNEL_LT = trsm_kernel_LT.S #STRSMKERNEL_RN = trsm_kernel_LT.S #STRSMKERNEL_RT = trsm_kernel_RT.S DTRSMKERNEL_LN = trsm_kernel_cell_LN.S DTRSMKERNEL_LT = trsm_kernel_cell_LT.S DTRSMKERNEL_RN = trsm_kernel_cell_LT.S DTRSMKERNEL_RT = trsm_kernel_cell_RT.S #CTRSMKERNEL_LN = ztrsm_kernel_LN.S #CTRSMKERNEL_LT = ztrsm_kernel_LT.S #CTRSMKERNEL_RN = ztrsm_kernel_LT.S #CTRSMKERNEL_RT = ztrsm_kernel_RT.S ZTRSMKERNEL_LN = ztrsm_kernel_cell_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_cell_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_cell_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_cell_RT.S OpenBLAS-0.2.20/kernel/power/KERNEL.POWER3000066400000000000000000000000441313527062700174370ustar00rootroot00000000000000include $(KERNELDIR)/KERNEL.POWER5 OpenBLAS-0.2.20/kernel/power/KERNEL.POWER4000066400000000000000000000000431313527062700174370ustar00rootroot00000000000000include $(KERNELDIR)/KERNEL.POWER5 OpenBLAS-0.2.20/kernel/power/KERNEL.POWER5000066400000000000000000000033121313527062700174420ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel.S SGEMMINCOPY = SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel.S CGEMMINCOPY = CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = CGEMMITCOPYOBJ = CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN.S STRSMKERNEL_LT = trsm_kernel_LT.S STRSMKERNEL_RN = trsm_kernel_LT.S STRSMKERNEL_RT = trsm_kernel_RT.S DTRSMKERNEL_LN = trsm_kernel_LN.S DTRSMKERNEL_LT = trsm_kernel_LT.S DTRSMKERNEL_RN = trsm_kernel_LT.S DTRSMKERNEL_RT = trsm_kernel_RT.S CTRSMKERNEL_LN = ztrsm_kernel_LN.S CTRSMKERNEL_LT = ztrsm_kernel_LT.S CTRSMKERNEL_RN = ztrsm_kernel_LT.S CTRSMKERNEL_RT = ztrsm_kernel_RT.S ZTRSMKERNEL_LN = ztrsm_kernel_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_RT.S OpenBLAS-0.2.20/kernel/power/KERNEL.POWER6000066400000000000000000000040421313527062700174440ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_power6.S SGEMMINCOPY = SGEMMITCOPY = SGEMMONCOPY = gemm_ncopy_4.S SGEMMOTCOPY = gemm_tcopy_4.S SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_power6.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = gemm_ncopy_4.S DGEMMOTCOPY = gemm_tcopy_4.S DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_power6.S CGEMMINCOPY = ../generic/zgemm_ncopy_2.c CGEMMITCOPY = ../generic/zgemm_tcopy_2.c CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMINCOPYOBJ = 
cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_power6.S ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_power6_LN.S STRSMKERNEL_LT = trsm_kernel_power6_LT.S STRSMKERNEL_RN = trsm_kernel_power6_LT.S STRSMKERNEL_RT = trsm_kernel_power6_RT.S DTRSMKERNEL_LN = trsm_kernel_power6_LN.S DTRSMKERNEL_LT = trsm_kernel_power6_LT.S DTRSMKERNEL_RN = trsm_kernel_power6_LT.S DTRSMKERNEL_RT = trsm_kernel_power6_RT.S CTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S CTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S CTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S CTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S ZTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S OpenBLAS-0.2.20/kernel/power/KERNEL.POWER8000066400000000000000000000121471313527062700174530ustar00rootroot00000000000000#SGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c STRMMKERNEL = strmm_kernel_16x8_power8.S DTRMMKERNEL = dtrmm_kernel_16x4_power8.S CTRMMKERNEL = ctrmm_kernel_8x4_power8.S ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S SGEMMKERNEL = sgemm_kernel_16x8_power8.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S SGEMMINCOPYOBJ = sgemm_incopy.o SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = dgemm_kernel_16x4_power8.S DGEMMINCOPY = ../generic/gemm_ncopy_16.c DGEMMITCOPY = dgemm_tcopy_16_power8.S DGEMMONCOPY = dgemm_ncopy_4_power8.S DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMINCOPYOBJ = dgemm_incopy.o DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = cgemm_tcopy_8_power8.S CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o CGEMMINCOPYOBJ = cgemm_incopy.o CGEMMITCOPYOBJ = cgemm_itcopy.o ZGEMMKERNEL = zgemm_kernel_8x2_power8.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c ZGEMMITCOPY = zgemm_tcopy_8_power8.S ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o ZGEMMINCOPYOBJ = zgemm_incopy.o ZGEMMITCOPYOBJ = zgemm_itcopy.o STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ZTRSMKERNEL_LN = 
../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #Todo: CGEMM3MKERNEL should be 4x4 blocksizes. #CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S #ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S #Pure C for other kernels #SAMAXKERNEL = ../arm/amax.c #DAMAXKERNEL = ../arm/amax.c #CAMAXKERNEL = ../arm/zamax.c #ZAMAXKERNEL = ../arm/zamax.c # #SAMINKERNEL = ../arm/amin.c #DAMINKERNEL = ../arm/amin.c #CAMINKERNEL = ../arm/zamin.c #ZAMINKERNEL = ../arm/zamin.c # #SMAXKERNEL = ../arm/max.c #DMAXKERNEL = ../arm/max.c # #SMINKERNEL = ../arm/min.c #DMINKERNEL = ../arm/min.c # #ISAMAXKERNEL = ../arm/iamax.c #IDAMAXKERNEL = ../arm/iamax.c #ICAMAXKERNEL = ../arm/izamax.c #IZAMAXKERNEL = ../arm/izamax.c # #ISAMINKERNEL = ../arm/iamin.c #IDAMINKERNEL = ../arm/iamin.c #ICAMINKERNEL = ../arm/izamin.c #IZAMINKERNEL = ../arm/izamin.c # #ISMAXKERNEL = ../arm/imax.c #IDMAXKERNEL = ../arm/imax.c # #ISMINKERNEL = ../arm/imin.c #IDMINKERNEL = ../arm/imin.c # SASUMKERNEL = sasum.c DASUMKERNEL = dasum.c CASUMKERNEL = casum.c ZASUMKERNEL = zasum.c # #SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = daxpy.c #CAXPYKERNEL = ../arm/zaxpy.c ZAXPYKERNEL = zaxpy.c # SCOPYKERNEL = scopy.c DCOPYKERNEL = dcopy.c CCOPYKERNEL = ccopy.c ZCOPYKERNEL = zcopy.c # SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c #CDOTKERNEL = ../arm/zdot.c ZDOTKERNEL = zdot.c # SNRM2KERNEL = ../arm/nrm2.c DNRM2KERNEL = ../arm/nrm2.c CNRM2KERNEL = ../arm/znrm2.c ZNRM2KERNEL = ../arm/znrm2.c # SROTKERNEL = srot.c DROTKERNEL = drot.c #CROTKERNEL = ../arm/zrot.c #ZROTKERNEL = ../arm/zrot.c # SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c CSCALKERNEL = zscal.c ZSCALKERNEL = zscal.c # SSWAPKERNEL = sswap.c DSWAPKERNEL = dswap.c CSWAPKERNEL = cswap.c ZSWAPKERNEL = zswap.c # #SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = dgemv_n.c #CGEMVNKERNEL = ../arm/zgemv_n.c #ZGEMVNKERNEL = ../arm/zgemv_n.c # #SGEMVTKERNEL = ../arm/gemv_t.c #DGEMVTKERNEL = ../arm/gemv_t.c #CGEMVTKERNEL = ../arm/zgemv_t.c #ZGEMVTKERNEL = zgemv_t_4.c #SSYMV_U_KERNEL = ../generic/symv_k.c #SSYMV_L_KERNEL = ../generic/symv_k.c #DSYMV_U_KERNEL = ../generic/symv_k.c #DSYMV_L_KERNEL = ../generic/symv_k.c #QSYMV_U_KERNEL = ../generic/symv_k.c #QSYMV_L_KERNEL = ../generic/symv_k.c #CSYMV_U_KERNEL = ../generic/zsymv_k.c #CSYMV_L_KERNEL = ../generic/zsymv_k.c #ZSYMV_U_KERNEL = ../generic/zsymv_k.c #ZSYMV_L_KERNEL = ../generic/zsymv_k.c #XSYMV_U_KERNEL = ../generic/zsymv_k.c #XSYMV_L_KERNEL = ../generic/zsymv_k.c #ZHEMV_U_KERNEL = ../generic/zhemv_k.c #ZHEMV_L_KERNEL = ../generic/zhemv_k.c LSAME_KERNEL = ../generic/lsame.c SCABS_KERNEL = ../generic/cabs.c DCABS_KERNEL = ../generic/cabs.c QCABS_KERNEL = ../generic/cabs.c #Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c OpenBLAS-0.2.20/kernel/power/KERNEL.PPC440000066400000000000000000000063141313527062700173400ustar00rootroot00000000000000SAMAXKERNEL = amax_ppc440.S DAMAXKERNEL = amax_ppc440.S CAMAXKERNEL = zamax_ppc440.S ZAMAXKERNEL = zamax_ppc440.S SAMINKERNEL = amin_ppc440.S DAMINKERNEL = amin_ppc440.S CAMINKERNEL = zamin_ppc440.S ZAMINKERNEL = zamin_ppc440.S SASUMKERNEL = asum_ppc440.S DASUMKERNEL = asum_ppc440.S CASUMKERNEL = zasum_ppc440.S ZASUMKERNEL = zasum_ppc440.S SAXPYKERNEL = axpy_ppc440.S DAXPYKERNEL = axpy_ppc440.S CAXPYKERNEL = zaxpy_ppc440.S ZAXPYKERNEL = zaxpy_ppc440.S SDOTKERNEL = dot_ppc440.S DDOTKERNEL = dot_ppc440.S CDOTKERNEL = zdot_ppc440.S ZDOTKERNEL = zdot_ppc440.S ISAMAXKERNEL = 
iamax_ppc440.S IDAMAXKERNEL = iamax_ppc440.S ICAMAXKERNEL = izamax_ppc440.S IZAMAXKERNEL = izamax_ppc440.S ISAMINKERNEL = iamin_ppc440.S IDAMINKERNEL = iamin_ppc440.S ICAMINKERNEL = izamin_ppc440.S IZAMINKERNEL = izamin_ppc440.S ISMAXKERNEL = imax_ppc440.S IDMAXKERNEL = imax_ppc440.S ISMINKERNEL = imin_ppc440.S IDMINKERNEL = imin_ppc440.S SMAXKERNEL = max_ppc440.S DMAXKERNEL = max_ppc440.S SMINKERNEL = min_ppc440.S DMINKERNEL = min_ppc440.S SNRM2KERNEL = snrm2_ppc440.S DNRM2KERNEL = dnrm2_ppc440.S CNRM2KERNEL = cnrm2_ppc440.S ZNRM2KERNEL = znrm2_ppc440.S SROTKERNEL = rot_ppc440.S DROTKERNEL = rot_ppc440.S CROTKERNEL = zrot_ppc440.S ZROTKERNEL = zrot_ppc440.S SSCALKERNEL = scal_ppc440.S DSCALKERNEL = scal_ppc440.S CSCALKERNEL = zscal_ppc440.S ZSCALKERNEL = zscal_ppc440.S SGEMMKERNEL = gemm_kernel_ppc440.S SGEMMINCOPY = SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_ppc440.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_ppc440.S CGEMMINCOPY = CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = CGEMMITCOPYOBJ = CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_ppc440.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_ppc440_LN.S STRSMKERNEL_LT = trsm_kernel_ppc440_LT.S STRSMKERNEL_RN = trsm_kernel_ppc440_LT.S STRSMKERNEL_RT = trsm_kernel_ppc440_RT.S DTRSMKERNEL_LN = trsm_kernel_ppc440_LN.S DTRSMKERNEL_LT = trsm_kernel_ppc440_LT.S DTRSMKERNEL_RN = trsm_kernel_ppc440_LT.S DTRSMKERNEL_RT = trsm_kernel_ppc440_RT.S CTRSMKERNEL_LN = ztrsm_kernel_ppc440_LN.S CTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S CTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S CTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S ZTRSMKERNEL_LN = ztrsm_kernel_ppc440_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S OpenBLAS-0.2.20/kernel/power/KERNEL.PPC440FP2000066400000000000000000000075421313527062700176540ustar00rootroot00000000000000SAMAXKERNEL = amax_hummer.S DAMAXKERNEL = amax_hummer.S CAMAXKERNEL = zamax_hummer.S ZAMAXKERNEL = zamax_hummer.S SAMINKERNEL = amin_hummer.S DAMINKERNEL = amin_hummer.S CAMINKERNEL = zamin_hummer.S ZAMINKERNEL = zamin_hummer.S SASUMKERNEL = asum_hummer.S DASUMKERNEL = asum_hummer.S CASUMKERNEL = zasum_hummer.S ZASUMKERNEL = zasum_hummer.S SAXPYKERNEL = axpy_hummer.S DAXPYKERNEL = axpy_hummer.S CAXPYKERNEL = zaxpy_hummer.S ZAXPYKERNEL = zaxpy_hummer.S SCOPYKERNEL = copy_hummer.S DCOPYKERNEL = copy_hummer.S CCOPYKERNEL = zcopy_hummer.S ZCOPYKERNEL = zcopy_hummer.S SDOTKERNEL = dot_hummer.S DDOTKERNEL = dot_hummer.S CDOTKERNEL = zdot_hummer.S ZDOTKERNEL = zdot_hummer.S ISAMAXKERNEL = iamax_hummer.S IDAMAXKERNEL = iamax_hummer.S ICAMAXKERNEL = izamax_hummer.S IZAMAXKERNEL = izamax_hummer.S ISAMINKERNEL = iamin_hummer.S IDAMINKERNEL = iamin_hummer.S ICAMINKERNEL 
= izamin_hummer.S IZAMINKERNEL = izamin_hummer.S ISMAXKERNEL = imax_hummer.S IDMAXKERNEL = imax_hummer.S ISMINKERNEL = imin_hummer.S IDMINKERNEL = imin_hummer.S SMAXKERNEL = max_hummer.S DMAXKERNEL = max_hummer.S SMINKERNEL = min_hummer.S DMINKERNEL = min_hummer.S SNRM2KERNEL = snrm2_hummer.S DNRM2KERNEL = dnrm2_hummer.S CNRM2KERNEL = cnrm2_hummer.S ZNRM2KERNEL = znrm2_hummer.S SROTKERNEL = rot_ppc440.S DROTKERNEL = rot_ppc440.S CROTKERNEL = zrot_ppc440.S ZROTKERNEL = zrot_ppc440.S SSCALKERNEL = scal_hummer.S DSCALKERNEL = scal_hummer.S CSCALKERNEL = zscal_hummer.S ZSCALKERNEL = zscal_hummer.S SSWAPKERNEL = swap_hummer.S DSWAPKERNEL = swap_hummer.S CSWAPKERNEL = zswap_hummer.S ZSWAPKERNEL = zswap_hummer.S SGEMMKERNEL = gemm_kernel_hummer.S SGEMMINCOPY = gemm_ncopy_hummer_8.S SGEMMITCOPY = gemm_tcopy_hummer_8.S SGEMMONCOPY = gemm_ncopy_hummer_4.S SGEMMOTCOPY = gemm_tcopy_hummer_4.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_hummer.S DGEMMINCOPY = gemm_ncopy_hummer_8.S DGEMMITCOPY = gemm_tcopy_hummer_8.S DGEMMONCOPY = gemm_ncopy_hummer_4.S DGEMMOTCOPY = gemm_tcopy_hummer_4.S DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_hummer.S CGEMMINCOPY = zgemm_ncopy_hummer_4.S CGEMMITCOPY = zgemm_tcopy_hummer_4.S CGEMMONCOPY = zgemm_ncopy_hummer_2.S CGEMMOTCOPY = zgemm_tcopy_hummer_2.S CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_hummer.S ZGEMMINCOPY = zgemm_ncopy_hummer_4.S ZGEMMITCOPY = zgemm_tcopy_hummer_4.S ZGEMMONCOPY = zgemm_ncopy_hummer_2.S ZGEMMOTCOPY = zgemm_tcopy_hummer_2.S ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_hummer_LN.S STRSMKERNEL_LT = trsm_kernel_hummer_LT.S STRSMKERNEL_RN = trsm_kernel_hummer_LT.S STRSMKERNEL_RT = trsm_kernel_hummer_RT.S DTRSMKERNEL_LN = trsm_kernel_hummer_LN.S DTRSMKERNEL_LT = trsm_kernel_hummer_LT.S DTRSMKERNEL_RN = trsm_kernel_hummer_LT.S DTRSMKERNEL_RT = trsm_kernel_hummer_RT.S CTRSMKERNEL_LN = ztrsm_kernel_hummer_LN.S CTRSMKERNEL_LT = ztrsm_kernel_hummer_LT.S CTRSMKERNEL_RN = ztrsm_kernel_hummer_LT.S CTRSMKERNEL_RT = ztrsm_kernel_hummer_RT.S ZTRSMKERNEL_LN = ztrsm_kernel_hummer_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_hummer_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_hummer_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_hummer_RT.S OpenBLAS-0.2.20/kernel/power/KERNEL.PPC970000066400000000000000000000037041313527062700173500ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_altivec.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = gemm_ncopy_4.S DGEMMOTCOPY = gemm_tcopy_4.S DGEMMINCOPYOBJ 
= DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_altivec.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) #STRSMKERNEL_LN = trsm_kernel_LN.S #STRSMKERNEL_LT = trsm_kernel_LT.S #STRSMKERNEL_RN = trsm_kernel_LT.S #STRSMKERNEL_RT = trsm_kernel_RT.S DTRSMKERNEL_LN = trsm_kernel_LN.S DTRSMKERNEL_LT = trsm_kernel_LT.S DTRSMKERNEL_RN = trsm_kernel_LT.S DTRSMKERNEL_RT = trsm_kernel_RT.S #CTRSMKERNEL_LN = ztrsm_kernel_LN.S #CTRSMKERNEL_LT = ztrsm_kernel_LT.S #CTRSMKERNEL_RN = ztrsm_kernel_LT.S #CTRSMKERNEL_RT = ztrsm_kernel_RT.S ZTRSMKERNEL_LN = ztrsm_kernel_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_RT.S OpenBLAS-0.2.20/kernel/power/KERNEL.PPCG4000066400000000000000000000066401313527062700173050ustar00rootroot00000000000000SAMAXKERNEL = amax_ppc440.S DAMAXKERNEL = amax_ppc440.S CAMAXKERNEL = zamax_ppc440.S ZAMAXKERNEL = zamax_ppc440.S SAMINKERNEL = amin_ppc440.S DAMINKERNEL = amin_ppc440.S CAMINKERNEL = zamin_ppc440.S ZAMINKERNEL = zamin_ppc440.S SASUMKERNEL = asum_ppc440.S DASUMKERNEL = asum_ppc440.S CASUMKERNEL = zasum_ppc440.S ZASUMKERNEL = zasum_ppc440.S SAXPYKERNEL = axpy_ppc440.S DAXPYKERNEL = axpy_ppc440.S CAXPYKERNEL = zaxpy_ppc440.S ZAXPYKERNEL = zaxpy_ppc440.S SDOTKERNEL = dot_ppc440.S DDOTKERNEL = dot_ppc440.S CDOTKERNEL = zdot_ppc440.S ZDOTKERNEL = zdot_ppc440.S ISAMAXKERNEL = iamax_ppc440.S IDAMAXKERNEL = iamax_ppc440.S ICAMAXKERNEL = izamax_ppc440.S IZAMAXKERNEL = izamax_ppc440.S ISAMINKERNEL = iamin_ppc440.S IDAMINKERNEL = iamin_ppc440.S ICAMINKERNEL = izamin_ppc440.S IZAMINKERNEL = izamin_ppc440.S ISMAXKERNEL = imax_ppc440.S IDMAXKERNEL = imax_ppc440.S ISMINKERNEL = imin_ppc440.S IDMINKERNEL = imin_ppc440.S SMAXKERNEL = max_ppc440.S DMAXKERNEL = max_ppc440.S SMINKERNEL = min_ppc440.S DMINKERNEL = min_ppc440.S SNRM2KERNEL = snrm2_ppc440.S DNRM2KERNEL = dnrm2_ppc440.S CNRM2KERNEL = cnrm2_ppc440.S ZNRM2KERNEL = znrm2_ppc440.S SROTKERNEL = rot_ppc440.S DROTKERNEL = rot_ppc440.S CROTKERNEL = zrot_ppc440.S ZROTKERNEL = zrot_ppc440.S SSCALKERNEL = scal_ppc440.S DSCALKERNEL = scal_ppc440.S CSCALKERNEL = zscal_ppc440.S ZSCALKERNEL = zscal_ppc440.S SGEMMKERNEL = gemm_kernel_altivec_g4.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = gemm_ncopy_4.S SGEMMOTCOPY = gemm_tcopy_4.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_g4.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = gemm_ncopy_4.S DGEMMOTCOPY = gemm_tcopy_4.S DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_altivec_g4.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY 
= ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_g4.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) #STRSMKERNEL_LN = trsm_kernel_ppc440_LN.S #STRSMKERNEL_LT = trsm_kernel_ppc440_LT.S #STRSMKERNEL_RN = trsm_kernel_ppc440_LT.S #STRSMKERNEL_RT = trsm_kernel_ppc440_RT.S DTRSMKERNEL_LN = trsm_kernel_ppc440_LN.S DTRSMKERNEL_LT = trsm_kernel_ppc440_LT.S DTRSMKERNEL_RN = trsm_kernel_ppc440_LT.S DTRSMKERNEL_RT = trsm_kernel_ppc440_RT.S #CTRSMKERNEL_LN = ztrsm_kernel_ppc440_LN.S #CTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S #CTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S #CTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S ZTRSMKERNEL_LN = ztrsm_kernel_ppc440_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S OpenBLAS-0.2.20/kernel/power/Makefile000066400000000000000000000000111313527062700172100ustar00rootroot00000000000000clean :: OpenBLAS-0.2.20/kernel/power/amax.S000066400000000000000000000243631313527062700166420ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PREA r8 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT li PREA, L1_PREFETCHSIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFD f1, 0 * SIZE(X) add X, X, INCX fabs f0, f1 fabs f2, f1 fabs f3, f1 fabs f4, f1 fabs f5, f1 fabs f6, f1 fabs f7, f1 fabs f1, f1 subi N, N, 1 cmpwi cr0, INCX, SIZE bne- cr0, LL(100) srawi. r0, N, 4 mtspr CTR, r0 beq- cr0, LL(50) LFD f24, 0 * SIZE(X) LFD f25, 1 * SIZE(X) LFD f26, 2 * SIZE(X) LFD f27, 3 * SIZE(X) LFD f28, 4 * SIZE(X) LFD f29, 5 * SIZE(X) LFD f30, 6 * SIZE(X) LFD f31, 7 * SIZE(X) fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 8 * SIZE(X) LFD f25, 9 * SIZE(X) LFD f26, 10 * SIZE(X) LFD f27, 11 * SIZE(X) fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 12 * SIZE(X) LFD f29, 13 * SIZE(X) LFD f30, 14 * SIZE(X) LFD f31, 15 * SIZE(X) bdz LL(20) .align 4 LL(10): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 LFD f24, 16 * SIZE(X) LFD f25, 17 * SIZE(X) LFD f26, 18 * SIZE(X) LFD f27, 19 * SIZE(X) fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 LFD f28, 20 * SIZE(X) LFD f29, 21 * SIZE(X) LFD f30, 22 * SIZE(X) LFD f31, 23 * SIZE(X) fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 LFD f24, 24 * SIZE(X) LFD f25, 25 * SIZE(X) LFD f26, 26 * SIZE(X) LFD f27, 27 * SIZE(X) fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 LFD f28, 28 * SIZE(X) LFD f29, 29 * SIZE(X) LFD f30, 30 * SIZE(X) LFD f31, 31 * SIZE(X) #ifndef POWER6 L1_PREFETCH X, PREA #endif addi X, X, 16 * SIZE #ifdef POWER6 L1_PREFETCH X, PREA #endif bdnz LL(10) .align 4 LL(20): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 fsel f6, f22, f6, f14 fsel 
f7, f23, f7, f15 addi X, X, 16 * SIZE .align 4 LL(50): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) addi X, X, 1 * SIZE fabs f8, f8 fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCX srawi. r0, N, 4 mtspr CTR, r0 beq- LL(150) LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX bdz LL(120) .align 4 LL(110): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX bdnz LL(110) .align 4 LL(120): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 fsel f6, f22, f6, f14 fsel f7, f23, f7, f15 .align 4 LL(150): andi. 
r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX fabs f8, f8 fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f0, f1 fsel f2, f9, f2, f3 fsel f4, f10, f4, f5 fsel f6, f11, f6, f7 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f0, f2 fsel f4, f9, f4, f6 fsub f8, f0, f4 fsel f1, f8, f0, f4 .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/amax_cell.S000066400000000000000000000313251313527062700176350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PREA r8 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT li PREA, 10 * 16 * SIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFD f1, 0 * SIZE(X) add X, X, INCX fabs f0, f1 fabs f2, f1 fabs f3, f1 fabs f4, f1 fabs f5, f1 fabs f6, f1 fabs f7, f1 fabs f1, f1 subi N, N, 1 cmpwi cr0, INCX, SIZE bne- cr0, LL(20) srawi. r0, N, 4 mtspr CTR, r0 beq- cr0, LL(15) LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) bdz LL(13) .align 4 LL(12): fabs f8, f8 LFD f10, 2 * SIZE(X) fabs f9, f9 LFD f11, 3 * SIZE(X) fabs f10, f10 LFD f12, 4 * SIZE(X) fabs f11, f11 LFD f13, 5 * SIZE(X) fabs f12, f12 LFD f14, 6 * SIZE(X) fabs f13, f13 LFD f15, 7 * SIZE(X) fabs f14, f14 dcbt X, PREA fabs f15, f15 nop fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 fsel f6, f22, f6, f14 LFD f8, 8 * SIZE(X) fsel f7, f23, f7, f15 LFD f9, 9 * SIZE(X) fabs f8, f8 LFD f10, 10 * SIZE(X) fabs f9, f9 LFD f11, 11 * SIZE(X) fabs f10, f10 LFD f12, 12 * SIZE(X) fabs f11, f11 LFD f13, 13 * SIZE(X) fabs f12, f12 LFD f14, 14 * SIZE(X) fabs f13, f13 LFD f15, 15 * SIZE(X) fabs f14, f14 addi X, X, 16 * SIZE fabs f15, f15 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 LFD f8, 0 * SIZE(X) fsel f6, f22, f6, f14 LFD f9, 1 * SIZE(X) fsel f7, f23, f7, f15 bdnz LL(12) .align 4 LL(13): fabs f8, f8 LFD f10, 2 * SIZE(X) fabs f9, f9 LFD f11, 3 * SIZE(X) fabs f10, f10 LFD f12, 4 * SIZE(X) fabs f11, f11 LFD f13, 5 * SIZE(X) fabs f12, f12 LFD f14, 6 * SIZE(X) fabs f13, f13 LFD f15, 7 * SIZE(X) fabs f14, f14 fabs f15, f15 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 fsel f6, f22, f6, f14 LFD f8, 8 * SIZE(X) fsel f7, f23, f7, f15 LFD f9, 9 * SIZE(X) fabs f8, f8 LFD f10, 10 * SIZE(X) fabs f9, f9 LFD f11, 11 * SIZE(X) fabs f10, f10 LFD f12, 12 * SIZE(X) fabs f11, f11 LFD f13, 13 * SIZE(X) fabs f12, f12 LFD f14, 14 * SIZE(X) fabs f13, f13 LFD f15, 15 * SIZE(X) fabs f14, f14 addi X, X, 16 * SIZE fabs f15, f15 nop fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 fsel f6, f22, f6, f14 fsel f7, f23, f7, f15 .align 4 LL(15): andi. r0, N, 15 beq LL(999) andi. 
r0, N, 8 beq LL(16) LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) fabs f8, f8 LFD f10, 2 * SIZE(X) fabs f9, f9 LFD f11, 3 * SIZE(X) fabs f10, f10 LFD f12, 4 * SIZE(X) fabs f11, f11 LFD f13, 5 * SIZE(X) fabs f12, f12 LFD f14, 6 * SIZE(X) fabs f13, f13 LFD f15, 7 * SIZE(X) fabs f14, f14 fabs f15, f15 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 fsel f6, f22, f6, f14 nop fsel f7, f23, f7, f15 addi X, X, 8 * SIZE .align 4 LL(16): andi. r0, N, 4 beq LL(17) LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) fabs f8, f8 LFD f10, 2 * SIZE(X) fabs f9, f9 LFD f11, 3 * SIZE(X) fabs f10, f10 fabs f11, f11 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 nop fsel f3, f19, f3, f11 addi X, X, 4 * SIZE .align 4 LL(17): andi. r0, N, 2 beq LL(18) LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) fabs f8, f8 fabs f9, f9 fsub f16, f0, f8 fsub f17, f1, f9 fsel f0, f16, f0, f8 nop fsel f1, f17, f1, f9 addi X, X, 2 * SIZE .align 4 LL(18): andi. r0, N, 1 beq LL(999) LFD f8, 0 * SIZE(X) fabs f8, f8 fsub f16, f0, f8 fsel f0, f16, f0, f8 b LL(999) .align 4 LL(20): sub X, X, INCX srawi. r0, N, 4 mtspr CTR, r0 beq- cr0, LL(25) LFDUX f8, X, INCX LFDUX f9, X, INCX bdz LL(23) .align 4 LL(22): fabs f8, f8 LFDUX f10, X, INCX fabs f9, f9 LFDUX f11, X, INCX fabs f10, f10 LFDUX f12, X, INCX fabs f11, f11 LFDUX f13, X, INCX fabs f12, f12 LFDUX f14, X, INCX fabs f13, f13 LFDUX f15, X, INCX fabs f14, f14 fabs f15, f15 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 fsel f6, f22, f6, f14 LFDUX f8, X, INCX fsel f7, f23, f7, f15 LFDUX f9, X, INCX fabs f8, f8 LFDUX f10, X, INCX fabs f9, f9 LFDUX f11, X, INCX fabs f10, f10 LFDUX f12, X, INCX fabs f11, f11 LFDUX f13, X, INCX fabs f12, f12 LFDUX f14, X, INCX fabs f13, f13 LFDUX f15, X, INCX fabs f14, f14 fabs f15, f15 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 LFDUX f8, X, INCX fsel f6, f22, f6, f14 LFDUX f9, X, INCX fsel f7, f23, f7, f15 bdnz LL(22) .align 4 LL(23): fabs f8, f8 LFDUX f10, X, INCX fabs f9, f9 LFDUX f11, X, INCX fabs f10, f10 LFDUX f12, X, INCX fabs f11, f11 LFDUX f13, X, INCX fabs f12, f12 LFDUX f14, X, INCX fabs f13, f13 LFDUX f15, X, INCX fabs f14, f14 fabs f15, f15 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 fsel f6, f22, f6, f14 LFDUX f8, X, INCX fsel f7, f23, f7, f15 LFDUX f9, X, INCX fabs f8, f8 LFDUX f10, X, INCX fabs f9, f9 LFDUX f11, X, INCX fabs f10, f10 LFDUX f12, X, INCX fabs f11, f11 LFDUX f13, X, INCX fabs f12, f12 LFDUX f14, X, INCX fabs f13, f13 LFDUX f15, X, INCX fabs f14, f14 fabs f15, f15 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, 
f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 fsel f6, f22, f6, f14 fsel f7, f23, f7, f15 .align 4 LL(25): andi. r0, N, 15 beq LL(999) andi. r0, N, 8 beq LL(26) LFDUX f8, X, INCX LFDUX f9, X, INCX fabs f8, f8 LFDUX f10, X, INCX fabs f9, f9 LFDUX f11, X, INCX fabs f10, f10 LFDUX f12, X, INCX fabs f11, f11 LFDUX f13, X, INCX fabs f12, f12 LFDUX f14, X, INCX fabs f13, f13 LFDUX f15, X, INCX fabs f14, f14 fabs f15, f15 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 fsel f6, f22, f6, f14 fsel f7, f23, f7, f15 .align 4 LL(26): andi. r0, N, 4 beq LL(27) LFDUX f8, X, INCX LFDUX f9, X, INCX fabs f8, f8 LFDUX f10, X, INCX fabs f9, f9 LFDUX f11, X, INCX fabs f10, f10 fabs f11, f11 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 .align 4 LL(27): andi. r0, N, 2 beq LL(28) LFDUX f8, X, INCX LFDUX f9, X, INCX fabs f8, f8 fabs f9, f9 fsub f16, f0, f8 fsub f17, f1, f9 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 .align 4 LL(28): andi. r0, N, 1 beq LL(999) LFDUX f8, X, INCX fabs f8, f8 fsub f16, f0, f8 fsel f0, f16, f0, f8 .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f0, f1 fsel f2, f9, f2, f3 fsel f4, f10, f4, f5 fsel f6, f11, f6, f7 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f0, f2 fsel f4, f9, f4, f6 fsub f8, f0, f4 fsel f1, f8, f0, f4 .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/amax_hummer.S000066400000000000000000000233051313527062700202120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define INCX2 r6 #define X2 r7 #define C1 f1 #define C2 f0 #define C3 f2 #define C4 f3 #define A1 f4 #define A2 f5 #define A3 f6 #define A4 f7 #define A5 f8 #define A6 f9 #define A7 f10 #define A8 f11 #define F1 f12 #define F2 f13 #define F3 f14 #define F4 f15 #define F5 f16 #define F6 f17 #define F7 f18 #define F8 f19 #define T1 f20 #define T2 f21 #define T3 f22 #define T4 f23 #define T5 f24 #define T6 f25 #define T7 f26 #define T8 f27 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 stfpdux f19, SP, r10 stfpdux f20, SP, r10 stfpdux f21, SP, r10 stfpdux f22, SP, r10 stfpdux f23, SP, r10 stfpdux f24, SP, r10 stfpdux f25, SP, r10 stfpdux f26, SP, r10 stfpdux f27, SP, r10 li r10, 0 stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif lfpdx C1, SP, r10 # Zero clear slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, INCX, 0 ble LL(999) LFD C1, 0 * SIZE(X) add X, X, INCX addi N, N, -1 cmpwi cr0, N, 0 fabs C1, C1 ble LL(999) fsmfp C1, C1 fpmr C2, C1 fpmr C3, C1 fpmr C4, C1 cmpwi cr0, INCX, SIZE bne LL(100) andi. r0, X, 2 * SIZE - 1 beq LL(05) LFD C2, 0 * SIZE(X) add X, X, INCX addi N, N, -1 cmpwi cr0, N, 0 fabs C2, C2 ble LL(998) .align 4 LL(05): sub X, X, INCX2 srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 fpabs T1, A1 LFPDUX A6, X, INCX2 fpabs T2, A2 LFPDUX A7, X, INCX2 fpabs T3, A3 LFPDUX A8, X, INCX2 fpabs T4, A4 bdz LL(13) .align 4 LL(12): fpsub F1, C1, T1 LFPDUX A1, X, INCX2 fpsub F2, C2, T2 LFPDUX A2, X, INCX2 fpsub F3, C3, T3 LFPDUX A3, X, INCX2 fpsub F4, C4, T4 LFPDUX A4, X, INCX2 fpabs T5, A5 fpabs T6, A6 fpabs T7, A7 fpabs T8, A8 fpsel C1, F1, C1, T1 LFPDUX A5, X, INCX2 fpsel C2, F2, C2, T2 LFPDUX A6, X, INCX2 fpsel C3, F3, C3, T3 LFPDUX A7, X, INCX2 fpsel C4, F4, C4, T4 LFPDUX A8, X, INCX2 fpsub F5, C1, T5 fpsub F6, C2, T6 fpsub F7, C3, T7 fpsub F8, C4, T8 fpabs T1, A1 fpabs T2, A2 fpabs T3, A3 fpabs T4, A4 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 bdnz LL(12) .align 4 LL(13): fpabs T5, A5 fpabs T6, A6 fpabs T7, A7 fpabs T8, A8 fpsub F1, C1, T1 fpsub F2, C2, T2 fpsub F3, C3, T3 fpsub F4, C4, T4 fpsel C1, F1, C1, T1 fpsel C2, F2, C2, T2 fpsel C3, F3, C3, T3 fpsel C4, F4, C4, T4 fpsub F5, C1, T5 fpsub F6, C2, T6 fpsub F7, C3, T7 fpsub F8, C4, T8 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 .align 4 LL(15): andi. r0, N, 15 beq LL(998) andi. r0, N, 8 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 fpabs A1, A1 fpabs A2, A2 fpabs A3, A3 fpabs A4, A4 fpsub F1, C1, A1 fpsub F2, C2, A2 fpsub F3, C3, A3 fpsub F4, C4, A4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 .align 4 LL(16): andi. r0, N, 4 beq LL(17) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 fpabs A1, A1 fpabs A2, A2 fpsub F1, C1, A1 fpsub F2, C2, A2 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 .align 4 LL(17): andi. r0, N, 2 beq LL(18) LFPDUX A1, X, INCX2 fpabs A1, A1 fpsub F1, C1, A1 fpsel C1, F1, C1, A1 .align 4 LL(18): andi. r0, N, 1 beq LL(998) LFDUX A1, X, INCX2 fabs A1, A1 fsub F1, C1, A1 fsel C1, F1, C1, A1 b LL(998) .align 4 LL(100): sub X, X, INCX srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(105) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFSDUX A1, X, INCX LFSDUX A2, X, INCX LFSDUX A3, X, INCX LFSDUX A4, X, INCX LFDUX A5, X, INCX LFDUX A6, X, INCX LFDUX A7, X, INCX LFDUX A8, X, INCX LFSDUX A5, X, INCX fpabs T1, A1 LFSDUX A6, X, INCX fpabs T2, A2 LFSDUX A7, X, INCX fpabs T3, A3 LFSDUX A8, X, INCX fpabs T4, A4 bdz LL(103) .align 4 LL(102): fpsub F1, C1, T1 LFDUX A1, X, INCX fpsub F2, C2, T2 LFDUX A2, X, INCX fpsub F3, C3, T3 LFDUX A3, X, INCX fpsub F4, C4, T4 LFDUX A4, X, INCX fpabs T5, A5 LFSDUX A1, X, INCX fpabs T6, A6 LFSDUX A2, X, INCX fpabs T7, A7 LFSDUX A3, X, INCX fpabs T8, A8 LFSDUX A4, X, INCX fpsel C1, F1, C1, T1 LFDUX A5, X, INCX fpsel C2, F2, C2, T2 LFDUX A6, X, INCX fpsel C3, F3, C3, T3 LFDUX A7, X, INCX fpsel C4, F4, C4, T4 LFDUX A8, X, INCX fpsub F5, C1, T5 LFSDUX A5, X, INCX fpsub F6, C2, T6 LFSDUX A6, X, INCX fpsub F7, C3, T7 LFSDUX A7, X, INCX fpsub F8, C4, T8 LFSDUX A8, X, INCX fpabs T1, A1 fpabs T2, A2 fpabs T3, A3 fpabs T4, A4 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 bdnz LL(102) .align 4 LL(103): fpabs T5, A5 fpabs T6, A6 fpabs T7, A7 fpabs T8, A8 fpsub F1, C1, T1 fpsub F2, C2, T2 fpsub F3, C3, T3 fpsub F4, C4, T4 fpsel C1, F1, C1, T1 fpsel C2, F2, C2, T2 fpsel C3, F3, C3, T3 fpsel C4, F4, C4, T4 fpsub F5, C1, T5 fpsub F6, C2, T6 fpsub F7, C3, T7 fpsub F8, C4, T8 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 .align 4 LL(105): andi. r0, N, 15 beq LL(998) andi. r0, N, 8 beq LL(106) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFSDUX A1, X, INCX LFSDUX A2, X, INCX LFSDUX A3, X, INCX LFSDUX A4, X, INCX fpabs A1, A1 fpabs A2, A2 fpabs A3, A3 fpabs A4, A4 fpsub F1, C1, A1 fpsub F2, C2, A2 fpsub F3, C3, A3 fpsub F4, C4, A4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 .align 4 LL(106): andi. r0, N, 4 beq LL(107) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX fabs A1, A1 fabs A2, A2 fabs A3, A3 fabs A4, A4 fsub F1, C1, A1 fsub F2, C2, A2 fsub F3, C3, A3 fsub F4, C4, A4 fsel C1, F1, C1, A1 fsel C2, F2, C2, A2 fsel C3, F3, C3, A3 fsel C4, F4, C4, A4 .align 4 LL(107): andi. r0, N, 2 beq LL(108) LFDUX A1, X, INCX LFDUX A2, X, INCX fabs A1, A1 fabs A2, A2 fsub F1, C1, A1 fsub F2, C2, A2 fsel C1, F1, C1, A1 fsel C2, F2, C2, A2 .align 4 LL(108): andi. r0, N, 1 beq LL(998) LFDUX A1, X, INCX fabs A1, A1 fsub F1, C1, A1 fsel C1, F1, C1, A1 .align 4 LL(998): fpsub F1, C1, C2 fpsub F2, C3, C4 fpsel C1, F1, C1, C2 fpsel C3, F2, C3, C4 fpsub F1, C1, C3 fpsel C1, F1, C1, C3 fsmtp C2, C1 fsub F1, C1, C2 fsel C1, F1, C1, C2 .align 4 LL(999): li r10, 16 lfpdux f27, SP, r10 lfpdux f26, SP, r10 lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/amax_ppc440.S000066400000000000000000000162311313527062700177270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PREX r8 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT sub X, X, INCX cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFDUX f1, X, INCX fabs f0, f1 li PREX, 3 * 16 * SIZE fabs f2, f1 fabs f3, f1 fabs f4, f1 fabs f5, f1 fabs f6, f1 fabs f7, f1 fabs f1, f1 subi N, N, 1 srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(150) LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX fabs f8, f24 LFDUX f24, X, INCX fabs f9, f25 LFDUX f25, X, INCX fabs f10, f26 LFDUX f26, X, INCX fabs f11, f27 LFDUX f27, X, INCX fabs f12, f28 LFDUX f28, X, INCX fabs f13, f29 LFDUX f29, X, INCX fabs f14, f30 LFDUX f30, X, INCX fabs f15, f31 LFDUX f31, X, INCX bdz LL(120) .align 4 LL(110): fsub f16, f0, f8 #ifdef PPCG4 dcbt X, PREX #endif fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 LFDUX f24, X, INCX fsel f1, f17, f1, f9 fabs f9, f25 LFDUX f25, X, INCX fsel f2, f18, f2, f10 fabs f10, f26 LFDUX f26, X, INCX fsel f3, f19, f3, f11 fabs f11, f27 LFDUX f27, X, INCX fsel f4, f20, f4, f12 #ifdef PPCG4 dcbt X, PREX #endif fabs f12, f28 LFDUX f28, X, INCX fsel f5, f21, f5, f13 fabs f13, f29 LFDUX f29, X, INCX fsel f6, f22, f6, f14 fabs f14, f30 LFDUX f30, X, INCX fsel f7, f23, f7, f15 fabs f15, f31 LFDUX f31, X, INCX fsub f16, f0, f8 #ifdef PPCG4 dcbt X, PREX #endif fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 LFDUX f24, X, INCX fsel f1, f17, f1, f9 fabs f9, f25 LFDUX f25, X, INCX fsel f2, f18, f2, f10 fabs f10, f26 LFDUX f26, X, INCX fsel f3, f19, f3, f11 fabs f11, f27 LFDUX f27, X, INCX fsel f4, f20, f4, f12 #ifdef PPCG4 dcbt X, PREX #endif fabs f12, f28 LFDUX f28, X, INCX fsel f5, f21, f5, f13 fabs f13, f29 LFDUX f29, X, INCX fsel f6, f22, f6, f14 fabs f14, f30 LFDUX f30, X, INCX fsel f7, f23, f7, f15 fabs f15, f31 LFDUX f31, X, INCX bdnz LL(110) .align 4 LL(120): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 fsel f6, f22, f6, f14 fsel f7, f23, f7, f15 .align 4 LL(150): andi. 
r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX fabs f8, f8 fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f0, f1 fsel f2, f9, f2, f3 fsel f4, f10, f4, f5 fsel f6, f11, f6, f7 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f0, f2 fsel f4, f9, f4, f6 fsub f8, f0, f4 fsel f1, f8, f0, f4 .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/amin.S000066400000000000000000000243031313527062700166320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PREA r8 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT li PREA, L1_PREFETCHSIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFD f1, 0 * SIZE(X) add X, X, INCX fabs f0, f1 fabs f2, f1 fabs f3, f1 fabs f4, f1 fabs f5, f1 fabs f6, f1 fabs f7, f1 fabs f1, f1 subi N, N, 1 cmpwi cr0, INCX, SIZE bne- cr0, LL(100) srawi. r0, N, 4 mtspr CTR, r0 beq- cr0, LL(50) LFD f24, 0 * SIZE(X) LFD f25, 1 * SIZE(X) LFD f26, 2 * SIZE(X) LFD f27, 3 * SIZE(X) LFD f28, 4 * SIZE(X) LFD f29, 5 * SIZE(X) LFD f30, 6 * SIZE(X) LFD f31, 7 * SIZE(X) fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 8 * SIZE(X) LFD f25, 9 * SIZE(X) LFD f26, 10 * SIZE(X) LFD f27, 11 * SIZE(X) fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 12 * SIZE(X) LFD f29, 13 * SIZE(X) LFD f30, 14 * SIZE(X) LFD f31, 15 * SIZE(X) bdz LL(20) .align 4 LL(10): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fabs f8, f24 fsel f1, f17, f9, f1 fabs f9, f25 fsel f2, f18, f10, f2 fabs f10, f26 fsel f3, f19, f11, f3 fabs f11, f27 LFD f24, 16 * SIZE(X) LFD f25, 17 * SIZE(X) LFD f26, 18 * SIZE(X) LFD f27, 19 * SIZE(X) fsel f4, f20, f12, f4 fabs f12, f28 fsel f5, f21, f13, f5 fabs f13, f29 fsel f6, f22, f14, f6 fabs f14, f30 fsel f7, f23, f15, f7 fabs f15, f31 LFD f28, 20 * SIZE(X) LFD f29, 21 * SIZE(X) LFD f30, 22 * SIZE(X) LFD f31, 23 * SIZE(X) fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fabs f8, f24 fsel f1, f17, f9, f1 fabs f9, f25 fsel f2, f18, f10, f2 fabs f10, f26 fsel f3, f19, f11, f3 fabs f11, f27 LFD f24, 24 * SIZE(X) LFD f25, 25 * SIZE(X) LFD f26, 26 * SIZE(X) LFD f27, 27 * SIZE(X) fsel f4, f20, f12, f4 fabs f12, f28 fsel f5, f21, f13, f5 fabs f13, f29 fsel f6, f22, f14, f6 fabs f14, f30 fsel f7, f23, f15, f7 fabs f15, f31 LFD f28, 28 * SIZE(X) LFD f29, 29 * SIZE(X) LFD f30, 30 * SIZE(X) LFD f31, 31 * SIZE(X) #ifndef POWER6 L1_PREFETCH X, PREA #endif addi X, X, 16 * SIZE #ifdef POWER6 L1_PREFETCH X, PREA #endif bdnz LL(10) .align 4 LL(20): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fabs f8, f24 fsel f1, f17, f9, f1 fabs f9, f25 fsel f2, f18, f10, f2 fabs f10, f26 fsel f3, f19, f11, f3 fabs f11, f27 fsel f4, f20, f12, f4 fabs f12, f28 fsel f5, f21, f13, f5 fabs f13, f29 fsel f6, f22, f14, f6 fabs f14, f30 fsel f7, f23, f15, f7 fabs f15, f31 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fsel f1, f17, f9, f1 fsel f2, f18, f10, f2 fsel f3, f19, f11, f3 fsel f4, f20, f12, f4 fsel f5, f21, f13, f5 fsel f6, f22, f14, f6 fsel 
f7, f23, f15, f7 addi X, X, 16 * SIZE .align 4 LL(50): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) addi X, X, 1 * SIZE fabs f8, f8 fsub f16, f1, f8 fsel f1, f16, f8, f1 bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCX srawi. r0, N, 4 mtspr CTR, r0 beq- LL(150) LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX bdz LL(120) .align 4 LL(110): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fabs f8, f24 fsel f1, f17, f9, f1 fabs f9, f25 fsel f2, f18, f10, f2 fabs f10, f26 fsel f3, f19, f11, f3 fabs f11, f27 LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX fsel f4, f20, f12, f4 fabs f12, f28 fsel f5, f21, f13, f5 fabs f13, f29 fsel f6, f22, f14, f6 fabs f14, f30 fsel f7, f23, f15, f7 fabs f15, f31 LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fabs f8, f24 fsel f1, f17, f9, f1 fabs f9, f25 fsel f2, f18, f10, f2 fabs f10, f26 fsel f3, f19, f11, f3 fabs f11, f27 LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX fsel f4, f20, f12, f4 fabs f12, f28 fsel f5, f21, f13, f5 fabs f13, f29 fsel f6, f22, f14, f6 fabs f14, f30 fsel f7, f23, f15, f7 fabs f15, f31 LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX bdnz LL(110) .align 4 LL(120): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fabs f8, f24 fsel f1, f17, f9, f1 fabs f9, f25 fsel f2, f18, f10, f2 fabs f10, f26 fsel f3, f19, f11, f3 fabs f11, f27 fsel f4, f20, f12, f4 fabs f12, f28 fsel f5, f21, f13, f5 fabs f13, f29 fsel f6, f22, f14, f6 fabs f14, f30 fsel f7, f23, f15, f7 fabs f15, f31 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fsel f1, f17, f9, f1 fsel f2, f18, f10, f2 fsel f3, f19, f11, f3 fsel f4, f20, f12, f4 fsel f5, f21, f13, f5 fsel f6, f22, f14, f6 fsel f7, f23, f15, f7 .align 4 LL(150): andi. 
r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX fabs f8, f8 fsub f16, f1, f8 fsel f1, f16, f8, f1 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f1, f0 fsel f2, f9, f3, f2 fsel f4, f10, f5, f4 fsel f6, f11, f7, f6 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f2, f0 fsel f4, f9, f6, f4 fsub f8, f0, f4 fsel f1, f8, f4, f0 .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/amin_cell.S000066400000000000000000000311741313527062700176350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PREA r8 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT li PREA, 10 * 16 * SIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFD f1, 0 * SIZE(X) add X, X, INCX fabs f0, f1 fabs f2, f1 fabs f3, f1 fabs f4, f1 fabs f5, f1 fabs f6, f1 fabs f7, f1 fabs f1, f1 subi N, N, 1 cmpwi cr0, INCX, SIZE bne- cr0, LL(20) srawi. r0, N, 4 mtspr CTR, r0 beq- cr0, LL(15) LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) bdz LL(13) .align 4 LL(12): fabs f8, f8 LFD f10, 2 * SIZE(X) fabs f9, f9 LFD f11, 3 * SIZE(X) fabs f10, f10 LFD f12, 4 * SIZE(X) fabs f11, f11 LFD f13, 5 * SIZE(X) fabs f12, f12 LFD f14, 6 * SIZE(X) fabs f13, f13 LFD f15, 7 * SIZE(X) fabs f14, f14 dcbt X, PREA fabs f15, f15 nop fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fsel f1, f17, f9, f1 fsel f2, f18, f10, f2 fsel f3, f19, f11, f3 fsel f4, f20, f12, f4 fsel f5, f21, f13, f5 fsel f6, f22, f14, f6 LFD f8, 8 * SIZE(X) fsel f7, f23, f15, f7 LFD f9, 9 * SIZE(X) fabs f8, f8 LFD f10, 10 * SIZE(X) fabs f9, f9 LFD f11, 11 * SIZE(X) fabs f10, f10 LFD f12, 12 * SIZE(X) fabs f11, f11 LFD f13, 13 * SIZE(X) fabs f12, f12 LFD f14, 14 * SIZE(X) fabs f13, f13 LFD f15, 15 * SIZE(X) fabs f14, f14 addi X, X, 16 * SIZE fabs f15, f15 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fsel f1, f17, f9, f1 fsel f2, f18, f10, f2 fsel f3, f19, f11, f3 fsel f4, f20, f12, f4 fsel f5, f21, f13, f5 LFD f8, 0 * SIZE(X) fsel f6, f22, f14, f6 LFD f9, 1 * SIZE(X) fsel f7, f23, f15, f7 bdnz LL(12) .align 4 LL(13): fabs f8, f8 LFD f10, 2 * SIZE(X) fabs f9, f9 LFD f11, 3 * SIZE(X) fabs f10, f10 LFD f12, 4 * SIZE(X) fabs f11, f11 LFD f13, 5 * SIZE(X) fabs f12, f12 LFD f14, 6 * SIZE(X) fabs f13, f13 LFD f15, 7 * SIZE(X) fabs f14, f14 fabs f15, f15 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fsel f1, f17, f9, f1 fsel f2, f18, f10, f2 fsel f3, f19, f11, f3 fsel f4, f20, f12, f4 fsel f5, f21, f13, f5 fsel f6, f22, f14, f6 LFD f8, 8 * SIZE(X) fsel f7, f23, f15, f7 LFD f9, 9 * SIZE(X) fabs f8, f8 LFD f10, 10 * SIZE(X) fabs f9, f9 LFD f11, 11 * SIZE(X) fabs f10, f10 LFD f12, 12 * SIZE(X) fabs f11, f11 LFD f13, 13 * SIZE(X) fabs f12, f12 LFD f14, 14 * SIZE(X) fabs f13, f13 LFD f15, 15 * SIZE(X) fabs f14, f14 addi X, X, 16 * SIZE fabs f15, f15 nop fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fsel f1, f17, f9, f1 fsel f2, f18, f10, f2 fsel f3, f19, f11, f3 fsel f4, f20, f12, f4 fsel f5, f21, f13, f5 fsel f6, f22, f14, f6 fsel f7, f23, f15, f7 .align 4 LL(15): andi. r0, N, 15 beq LL(999) andi. 
r0, N, 8 beq LL(16) LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) fabs f8, f8 LFD f10, 2 * SIZE(X) fabs f9, f9 LFD f11, 3 * SIZE(X) fabs f10, f10 LFD f12, 4 * SIZE(X) fabs f11, f11 LFD f13, 5 * SIZE(X) fabs f12, f12 LFD f14, 6 * SIZE(X) fabs f13, f13 LFD f15, 7 * SIZE(X) fabs f14, f14 fabs f15, f15 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fsel f1, f17, f9, f1 fsel f2, f18, f10, f2 fsel f3, f19, f11, f3 fsel f4, f20, f12, f4 fsel f5, f21, f13, f5 fsel f6, f22, f14, f6 nop fsel f7, f23, f15, f7 addi X, X, 8 * SIZE .align 4 LL(16): andi. r0, N, 4 beq LL(17) LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) fabs f8, f8 LFD f10, 2 * SIZE(X) fabs f9, f9 LFD f11, 3 * SIZE(X) fabs f10, f10 fabs f11, f11 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsel f0, f16, f8, f0 fsel f1, f17, f9, f1 fsel f2, f18, f10, f2 nop fsel f3, f19, f11, f3 addi X, X, 4 * SIZE .align 4 LL(17): andi. r0, N, 2 beq LL(18) LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) fabs f8, f8 fabs f9, f9 fsub f16, f0, f8 fsub f17, f1, f9 fsel f0, f16, f8, f0 nop fsel f1, f17, f9, f1 addi X, X, 2 * SIZE .align 4 LL(18): andi. r0, N, 1 beq LL(999) LFD f8, 0 * SIZE(X) fabs f8, f8 fsub f16, f0, f8 fsel f0, f16, f8, f0 b LL(999) .align 4 LL(20): sub X, X, INCX srawi. r0, N, 4 mtspr CTR, r0 beq- cr0, LL(25) LFDUX f8, X, INCX LFDUX f9, X, INCX bdz LL(23) .align 4 LL(22): fabs f8, f8 LFDUX f10, X, INCX fabs f9, f9 LFDUX f11, X, INCX fabs f10, f10 LFDUX f12, X, INCX fabs f11, f11 LFDUX f13, X, INCX fabs f12, f12 LFDUX f14, X, INCX fabs f13, f13 LFDUX f15, X, INCX fabs f14, f14 fabs f15, f15 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fsel f1, f17, f9, f1 fsel f2, f18, f10, f2 fsel f3, f19, f11, f3 fsel f4, f20, f12, f4 fsel f5, f21, f13, f5 fsel f6, f22, f14, f6 LFDUX f8, X, INCX fsel f7, f23, f15, f7 LFDUX f9, X, INCX fabs f8, f8 LFDUX f10, X, INCX fabs f9, f9 LFDUX f11, X, INCX fabs f10, f10 LFDUX f12, X, INCX fabs f11, f11 LFDUX f13, X, INCX fabs f12, f12 LFDUX f14, X, INCX fabs f13, f13 LFDUX f15, X, INCX fabs f14, f14 fabs f15, f15 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fsel f1, f17, f9, f1 fsel f2, f18, f10, f2 fsel f3, f19, f11, f3 fsel f4, f20, f12, f4 fsel f5, f21, f13, f5 LFDUX f8, X, INCX fsel f6, f22, f14, f6 LFDUX f9, X, INCX fsel f7, f23, f15, f7 bdnz LL(22) .align 4 LL(23): fabs f8, f8 LFDUX f10, X, INCX fabs f9, f9 LFDUX f11, X, INCX fabs f10, f10 LFDUX f12, X, INCX fabs f11, f11 LFDUX f13, X, INCX fabs f12, f12 LFDUX f14, X, INCX fabs f13, f13 LFDUX f15, X, INCX fabs f14, f14 fabs f15, f15 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fsel f1, f17, f9, f1 fsel f2, f18, f10, f2 fsel f3, f19, f11, f3 fsel f4, f20, f12, f4 fsel f5, f21, f13, f5 fsel f6, f22, f14, f6 LFDUX f8, X, INCX fsel f7, f23, f15, f7 LFDUX f9, X, INCX fabs f8, f8 LFDUX f10, X, INCX fabs f9, f9 LFDUX f11, X, INCX fabs f10, f10 LFDUX f12, X, INCX fabs f11, f11 LFDUX f13, X, INCX fabs f12, f12 LFDUX f14, X, INCX fabs f13, f13 LFDUX f15, X, INCX fabs f14, f14 fabs f15, f15 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, 
f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fsel f1, f17, f9, f1 fsel f2, f18, f10, f2 fsel f3, f19, f11, f3 fsel f4, f20, f12, f4 fsel f5, f21, f13, f5 fsel f6, f22, f14, f6 fsel f7, f23, f15, f7 .align 4 LL(25): andi. r0, N, 15 beq LL(999) andi. r0, N, 8 beq LL(26) LFDUX f8, X, INCX LFDUX f9, X, INCX fabs f8, f8 LFDUX f10, X, INCX fabs f9, f9 LFDUX f11, X, INCX fabs f10, f10 LFDUX f12, X, INCX fabs f11, f11 LFDUX f13, X, INCX fabs f12, f12 LFDUX f14, X, INCX fabs f13, f13 LFDUX f15, X, INCX fabs f14, f14 fabs f15, f15 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fsel f1, f17, f9, f1 fsel f2, f18, f10, f2 fsel f3, f19, f11, f3 fsel f4, f20, f12, f4 fsel f5, f21, f13, f5 fsel f6, f22, f14, f6 fsel f7, f23, f15, f7 .align 4 LL(26): andi. r0, N, 4 beq LL(27) LFDUX f8, X, INCX LFDUX f9, X, INCX fabs f8, f8 LFDUX f10, X, INCX fabs f9, f9 LFDUX f11, X, INCX fabs f10, f10 fabs f11, f11 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsel f0, f16, f8, f0 fsel f1, f17, f9, f1 fsel f2, f18, f10, f2 fsel f3, f19, f11, f3 .align 4 LL(27): andi. r0, N, 2 beq LL(28) LFDUX f8, X, INCX LFDUX f9, X, INCX fabs f8, f8 fabs f9, f9 fsub f16, f0, f8 fsub f17, f1, f9 fsel f0, f16, f8, f0 fsel f1, f17, f9, f1 .align 4 LL(28): andi. r0, N, 1 beq LL(999) LFDUX f8, X, INCX fabs f8, f8 fsub f16, f0, f8 fsel f0, f16, f8, f0 .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f1, f0 fsel f2, f9, f3, f2 fsel f4, f10, f5, f4 fsel f6, f11, f7, f6 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f2, f0 fsel f4, f9, f6, f4 fsub f8, f0, f4 fsel f1, f8, f4, f0 .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/amin_hummer.S000066400000000000000000000233001313527062700202030ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define INCX2 r6 #define X2 r7 #define C1 f1 #define C2 f0 #define C3 f2 #define C4 f3 #define A1 f4 #define A2 f5 #define A3 f6 #define A4 f7 #define A5 f8 #define A6 f9 #define A7 f10 #define A8 f11 #define F1 f12 #define F2 f13 #define F3 f14 #define F4 f15 #define F5 f16 #define F6 f17 #define F7 f18 #define F8 f19 #define T1 f20 #define T2 f21 #define T3 f22 #define T4 f23 #define T5 f24 #define T6 f25 #define T7 f26 #define T8 f27 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 stfpdux f19, SP, r10 stfpdux f20, SP, r10 stfpdux f21, SP, r10 stfpdux f22, SP, r10 stfpdux f23, SP, r10 stfpdux f24, SP, r10 stfpdux f25, SP, r10 stfpdux f26, SP, r10 stfpdux f27, SP, r10 li r10, 0 stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif lfpdx C1, SP, r10 # Zero clear slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, INCX, 0 ble LL(999) LFD C1, 0 * SIZE(X) add X, X, INCX addi N, N, -1 cmpwi cr0, N, 0 fabs C1, C1 ble LL(999) fsmfp C1, C1 fpmr C2, C1 fpmr C3, C1 fpmr C4, C1 cmpwi cr0, INCX, SIZE bne LL(100) andi. r0, X, 2 * SIZE - 1 beq LL(05) LFD C2, 0 * SIZE(X) add X, X, INCX addi N, N, -1 cmpwi cr0, N, 0 fabs C2, C2 ble LL(998) .align 4 LL(05): sub X, X, INCX2 srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 fpabs T1, A1 LFPDUX A6, X, INCX2 fpabs T2, A2 LFPDUX A7, X, INCX2 fpabs T3, A3 LFPDUX A8, X, INCX2 fpabs T4, A4 bdz LL(13) .align 4 LL(12): fpsub F1, T1, C1 LFPDUX A1, X, INCX2 fpsub F2, T2, C2 LFPDUX A2, X, INCX2 fpsub F3, T3, C3 LFPDUX A3, X, INCX2 fpsub F4, T4, C4 LFPDUX A4, X, INCX2 fpabs T5, A5 fpabs T6, A6 fpabs T7, A7 fpabs T8, A8 fpsel C1, F1, C1, T1 LFPDUX A5, X, INCX2 fpsel C2, F2, C2, T2 LFPDUX A6, X, INCX2 fpsel C3, F3, C3, T3 LFPDUX A7, X, INCX2 fpsel C4, F4, C4, T4 LFPDUX A8, X, INCX2 fpsub F5, T5, C1 fpsub F6, T6, C2 fpsub F7, T7, C3 fpsub F8, T8, C4 fpabs T1, A1 fpabs T2, A2 fpabs T3, A3 fpabs T4, A4 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 bdnz LL(12) .align 4 LL(13): fpabs T5, A5 fpabs T6, A6 fpabs T7, A7 fpabs T8, A8 fpsub F1, T1, C1 fpsub F2, T2, C2 fpsub F3, T3, C3 fpsub F4, T4, C4 fpsel C1, F1, C1, T1 fpsel C2, F2, C2, T2 fpsel C3, F3, C3, T3 fpsel C4, F4, C4, T4 fpsub F5, T5, C1 fpsub F6, T6, C2 fpsub F7, T7, C3 fpsub F8, T8, C4 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 .align 4 LL(15): andi. r0, N, 15 beq LL(998) andi. r0, N, 8 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 fpabs A1, A1 fpabs A2, A2 fpabs A3, A3 fpabs A4, A4 fpsub F1, A1, C1 fpsub F2, A2, C2 fpsub F3, A3, C3 fpsub F4, A4, C4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 .align 4 LL(16): andi. r0, N, 4 beq LL(17) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 fpabs A1, A1 fpabs A2, A2 fpsub F1, A1, C1 fpsub F2, A2, C2 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 .align 4 LL(17): andi. r0, N, 2 beq LL(18) LFPDUX A1, X, INCX2 fpabs A1, A1 fpsub F1, A1, C1 fpsel C1, F1, C1, A1 .align 4 LL(18): andi. r0, N, 1 beq LL(998) LFDUX A1, X, INCX2 fabs A1, A1 fsub F1, A1, C1 fsel C1, F1, C1, A1 b LL(998) .align 4 LL(100): sub X, X, INCX srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(105) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFSDUX A1, X, INCX LFSDUX A2, X, INCX LFSDUX A3, X, INCX LFSDUX A4, X, INCX LFDUX A5, X, INCX LFDUX A6, X, INCX LFDUX A7, X, INCX LFDUX A8, X, INCX LFSDUX A5, X, INCX fpabs T1, A1 LFSDUX A6, X, INCX fpabs T2, A2 LFSDUX A7, X, INCX fpabs T3, A3 LFSDUX A8, X, INCX fpabs T4, A4 bdz LL(103) .align 4 LL(102): fpsub F1, T1, C1 LFDUX A1, X, INCX fpsub F2, T2, C2 LFDUX A2, X, INCX fpsub F3, T3, C3 LFDUX A3, X, INCX fpsub F4, T4, C4 LFDUX A4, X, INCX fpabs T5, A5 LFSDUX A1, X, INCX fpabs T6, A6 LFSDUX A2, X, INCX fpabs T7, A7 LFSDUX A3, X, INCX fpabs T8, A8 LFSDUX A4, X, INCX fpsel C1, F1, C1, T1 LFDUX A5, X, INCX fpsel C2, F2, C2, T2 LFDUX A6, X, INCX fpsel C3, F3, C3, T3 LFDUX A7, X, INCX fpsel C4, F4, C4, T4 LFDUX A8, X, INCX fpsub F5, T5, C1 LFSDUX A5, X, INCX fpsub F6, T6, C2 LFSDUX A6, X, INCX fpsub F7, T7, C3 LFSDUX A7, X, INCX fpsub F8, T8, C4 LFSDUX A8, X, INCX fpabs T1, A1 fpabs T2, A2 fpabs T3, A3 fpabs T4, A4 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 bdnz LL(102) .align 4 LL(103): fpabs T5, A5 fpabs T6, A6 fpabs T7, A7 fpabs T8, A8 fpsub F1, T1, C1 fpsub F2, T2, C2 fpsub F3, T3, C3 fpsub F4, T4, C4 fpsel C1, F1, C1, T1 fpsel C2, F2, C2, T2 fpsel C3, F3, C3, T3 fpsel C4, F4, C4, T4 fpsub F5, T5, C1 fpsub F6, T6, C2 fpsub F7, T7, C3 fpsub F8, T8, C4 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 .align 4 LL(105): andi. r0, N, 15 beq LL(998) andi. r0, N, 8 beq LL(106) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFSDUX A1, X, INCX LFSDUX A2, X, INCX LFSDUX A3, X, INCX LFSDUX A4, X, INCX fpabs A1, A1 fpabs A2, A2 fpabs A3, A3 fpabs A4, A4 fpsub F1, A1, C1 fpsub F2, A2, C2 fpsub F3, A3, C3 fpsub F4, A4, C4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 .align 4 LL(106): andi. r0, N, 4 beq LL(107) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX fabs A1, A1 fabs A2, A2 fabs A3, A3 fabs A4, A4 fsub F1, A1, C1 fsub F2, A2, C2 fsub F3, A3, C3 fsub F4, A4, C4 fsel C1, F1, C1, A1 fsel C2, F2, C2, A2 fsel C3, F3, C3, A3 fsel C4, F4, C4, A4 .align 4 LL(107): andi. r0, N, 2 beq LL(108) LFDUX A1, X, INCX LFDUX A2, X, INCX fabs A1, A1 fabs A2, A2 fsub F1, A1, C1 fsub F2, A2, C2 fsel C1, F1, C1, A1 fsel C2, F2, C2, A2 .align 4 LL(108): andi. r0, N, 1 beq LL(998) LFDUX A1, X, INCX fabs A1, A1 fsub F1, A1, C1 fsel C1, F1, C1, A1 .align 4 LL(998): fpsub F1, C2, C1 fpsub F2, C4, C3 fpsel C1, F1, C1, C2 fpsel C3, F2, C3, C4 fpsub F1, C3, C1 fpsel C1, F1, C1, C3 fsmtp C2, C1 fsub F1, C2, C1 fsel C1, F1, C1, C2 .align 4 LL(999): li r10, 16 lfpdux f27, SP, r10 lfpdux f26, SP, r10 lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/amin_ppc440.S000066400000000000000000000163341313527062700177310ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define INC1 r6 #define PREX r8 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT sub X, X, INCX li INC1, SIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFDUX f1, X, INCX fabs f0, f1 li PREX, 3 * 16 * SIZE fabs f2, f1 fabs f3, f1 fabs f4, f1 fabs f5, f1 subi N, N, 1 fabs f6, f1 srawi. 
r0, N, 4 fabs f7, f1 mtspr CTR, r0 fabs f1, f1 beq- LL(150) LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX fabs f8, f24 LFDUX f24, X, INCX fabs f9, f25 LFDUX f25, X, INCX fabs f10, f26 LFDUX f26, X, INCX fabs f11, f27 LFDUX f27, X, INCX fabs f12, f28 LFDUX f28, X, INCX fabs f13, f29 LFDUX f29, X, INCX fabs f14, f30 LFDUX f30, X, INCX fabs f15, f31 LFDUX f31, X, INCX bdz LL(120) .align 4 LL(110): fsub f16, f0, f8 #ifdef PPCG4 dcbt X, PREX #endif fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fabs f8, f24 LFDUX f24, X, INCX fsel f1, f17, f9, f1 fabs f9, f25 LFDUX f25, X, INCX fsel f2, f18, f10, f2 fabs f10, f26 LFDUX f26, X, INCX fsel f3, f19, f11, f3 fabs f11, f27 LFDUX f27, X, INCX fsel f4, f20, f12, f4 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PREX #endif fabs f12, f28 LFDUX f28, X, INCX fsel f5, f21, f13, f5 fabs f13, f29 LFDUX f29, X, INCX fsel f6, f22, f14, f6 fabs f14, f30 LFDUX f30, X, INCX fsel f7, f23, f15, f7 fabs f15, f31 LFDUX f31, X, INCX fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 #ifdef PPCG4 dcbt X, PREX #endif fabs f8, f24 LFDUX f24, X, INCX fsel f1, f17, f9, f1 fabs f9, f25 LFDUX f25, X, INCX fsel f2, f18, f10, f2 fabs f10, f26 LFDUX f26, X, INCX fsel f3, f19, f11, f3 fabs f11, f27 LFDUX f27, X, INCX fsel f4, f20, f12, f4 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PREX #endif fabs f12, f28 LFDUX f28, X, INCX fsel f5, f21, f13, f5 fabs f13, f29 LFDUX f29, X, INCX fsel f6, f22, f14, f6 fabs f14, f30 LFDUX f30, X, INCX fsel f7, f23, f15, f7 fabs f15, f31 LFDUX f31, X, INCX bdnz LL(110) .align 4 LL(120): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fabs f8, f24 fsel f1, f17, f9, f1 fabs f9, f25 fsel f2, f18, f10, f2 fabs f10, f26 fsel f3, f19, f11, f3 fabs f11, f27 fsel f4, f20, f12, f4 fabs f12, f28 fsel f5, f21, f13, f5 fabs f13, f29 fsel f6, f22, f14, f6 fabs f14, f30 fsel f7, f23, f15, f7 fabs f15, f31 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fsel f1, f17, f9, f1 fsel f2, f18, f10, f2 fsel f3, f19, f11, f3 fsel f4, f20, f12, f4 fsel f5, f21, f13, f5 fsel f6, f22, f14, f6 fsel f7, f23, f15, f7 .align 4 LL(150): andi. 
r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX fabs f8, f8 fsub f16, f1, f8 fsel f1, f16, f8, f1 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f1, f0 fsel f2, f9, f3, f2 fsel f4, f10, f5, f4 fsel f6, f11, f7, f6 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f2, f0 fsel f4, f9, f6, f4 fsub f8, f0, f4 fsel f1, f8, f4, f0 .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/asum.S000066400000000000000000000205031313527062700166510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
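asum.S and its siblings below (asum_cell.S, asum_hummer.S, asum_ppc440.S) compute the sum of absolute values, sum_i |x_i|, with an unrolled main loop that spreads the additions over several partial accumulators (f0..f7) and folds them together at LL(999). A plain C sketch of the result they produce follows; the name and signature are illustrative, incx > 0 is assumed, and the unrolling and prefetching are intentionally omitted.

#include <math.h>

/* Reference semantics sketch for the ASUM kernels: sum of absolute values
   of a strided vector. */
static double ref_asum(int n, const double *x, int incx)
{
    double s = 0.0;
    if (n <= 0 || incx <= 0) return s;         /* same guards as the assembly */
    for (int i = 0; i < n; i++)
        s += fabs(x[(long)i * incx]);
    return s;
}

Because the kernels add the partial sums in a different order than this strictly sequential loop, the floating-point result can differ from the sketch in the last bits.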
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PREA r8 #define FZERO f0 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT fmr f1, FZERO fmr f2, FZERO fmr f3, FZERO fmr f4, FZERO fmr f5, FZERO fmr f6, FZERO fmr f7, FZERO li PREA, L1_PREFETCHSIZE cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 ble- LL(999) cmpwi cr0, INCX, SIZE bne- cr0, LL(100) srawi. r0, N, 4 mtspr CTR, r0 beq- cr0, LL(50) .align 4 LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) LFD f10, 2 * SIZE(X) LFD f11, 3 * SIZE(X) LFD f12, 4 * SIZE(X) LFD f13, 5 * SIZE(X) LFD f14, 6 * SIZE(X) LFD f15, 7 * SIZE(X) LFD f24, 8 * SIZE(X) LFD f25, 9 * SIZE(X) LFD f26, 10 * SIZE(X) LFD f27, 11 * SIZE(X) LFD f28, 12 * SIZE(X) LFD f29, 13 * SIZE(X) LFD f30, 14 * SIZE(X) LFD f31, 15 * SIZE(X) fabs f16, f8 fabs f17, f9 fabs f18, f10 fabs f19, f11 fabs f20, f12 fabs f21, f13 fabs f22, f14 fabs f23, f15 bdz LL(20) .align 4 LL(10): FADD f0, f0, f16 fabs f16, f24 FADD f1, f1, f17 fabs f17, f25 FADD f2, f2, f18 fabs f18, f26 FADD f3, f3, f19 fabs f19, f27 LFD f8, 16 * SIZE(X) LFD f9, 17 * SIZE(X) LFD f10, 18 * SIZE(X) LFD f11, 19 * SIZE(X) FADD f4, f4, f20 fabs f20, f28 FADD f5, f5, f21 fabs f21, f29 FADD f6, f6, f22 fabs f22, f30 FADD f7, f7, f23 fabs f23, f31 LFD f12, 20 * SIZE(X) LFD f13, 21 * SIZE(X) LFD f14, 22 * SIZE(X) LFD f15, 23 * SIZE(X) FADD f0, f0, f16 fabs f16, f8 FADD f1, f1, f17 fabs f17, f9 FADD f2, f2, f18 fabs f18, f10 FADD f3, f3, f19 fabs f19, f11 LFD f24, 24 * SIZE(X) LFD f25, 25 * SIZE(X) LFD f26, 26 * SIZE(X) LFD f27, 27 * SIZE(X) FADD f4, f4, f20 fabs f20, f12 FADD f5, f5, f21 fabs f21, f13 FADD f6, f6, f22 fabs f22, f14 FADD f7, f7, f23 fabs f23, f15 LFD f28, 28 * SIZE(X) LFD f29, 29 * SIZE(X) LFD f30, 30 * SIZE(X) LFD f31, 31 * SIZE(X) #ifndef POWER6 L1_PREFETCH X, PREA #endif addi X, X, 16 * SIZE #ifdef POWER6 L1_PREFETCH X, PREA #endif bdnz LL(10) .align 4 LL(20): FADD f0, f0, f16 fabs f16, f24 FADD f1, f1, f17 fabs f17, f25 FADD f2, f2, f18 fabs f18, f26 FADD f3, f3, f19 fabs f19, f27 FADD f4, f4, f20 fabs f20, f28 FADD f5, f5, f21 fabs f21, f29 FADD f6, f6, f22 fabs f22, f30 FADD f7, f7, f23 fabs f23, f31 FADD f0, f0, f16 FADD f1, f1, f17 FADD f2, f2, f18 FADD f3, f3, f19 FADD f4, f4, f20 FADD f5, f5, f21 FADD f6, f6, f22 FADD f7, f7, f23 addi X, X, 16 * SIZE .align 4 LL(50): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) addi X, X, 1 * SIZE fabs f8, f8 FADD f0, f0, f8 bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCX srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(150) LFDUX f8, X, INCX LFDUX f9, X, INCX LFDUX f10, X, INCX LFDUX f11, X, INCX LFDUX f12, X, INCX LFDUX f13, X, INCX LFDUX f14, X, INCX LFDUX f15, X, INCX LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX fabs f16, f8 fabs f17, f9 fabs f18, f10 fabs f19, f11 fabs f20, f12 fabs f21, f13 fabs f22, f14 fabs f23, f15 bdz LL(120) .align 4 LL(110): FADD f0, f0, f16 fabs f16, f24 FADD f1, f1, f17 fabs f17, f25 FADD f2, f2, f18 fabs f18, f26 FADD f3, f3, f19 fabs f19, f27 LFDUX f8, X, INCX LFDUX f9, X, INCX LFDUX f10, X, INCX LFDUX f11, X, INCX FADD f4, f4, f20 fabs f20, f28 FADD f5, f5, f21 fabs f21, f29 FADD f6, f6, f22 fabs f22, f30 FADD f7, f7, f23 fabs f23, f31 LFDUX f12, X, INCX LFDUX f13, X, INCX LFDUX f14, X, INCX LFDUX f15, X, INCX FADD f0, f0, f16 fabs f16, f8 FADD f1, f1, f17 fabs f17, f9 FADD f2, f2, f18 fabs f18, f10 FADD f3, f3, f19 fabs f19, f11 LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX FADD f4, f4, f20 fabs f20, f12 FADD f5, f5, f21 fabs f21, f13 FADD f6, f6, f22 fabs f22, f14 FADD f7, f7, f23 fabs f23, f15 LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX bdnz LL(110) .align 4 LL(120): FADD f0, f0, f16 fabs f16, f24 FADD f1, f1, f17 fabs f17, f25 FADD f2, f2, f18 fabs f18, f26 FADD f3, f3, f19 fabs f19, f27 FADD f4, f4, f20 fabs f20, f28 FADD f5, f5, f21 fabs f21, f29 FADD f6, f6, f22 fabs f22, f30 FADD f7, f7, f23 fabs f23, f31 FADD f0, f0, f16 FADD f1, f1, f17 FADD f2, f2, f18 FADD f3, f3, f19 FADD f4, f4, f20 FADD f5, f5, f21 FADD f6, f6, f22 FADD f7, f7, f23 .align 4 LL(150): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX fabs f8, f8 FADD f0, f0, f8 bdnz LL(160) .align 4 LL(999): FADD f0, f0, f1 FADD f2, f2, f3 FADD f4, f4, f5 FADD f6, f6, f7 FADD f0, f0, f2 FADD f4, f4, f6 FADD f1, f0, f4 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/asum_cell.S000066400000000000000000000225261313527062700176570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PREA r8 #define FZERO f0 #define STACKSIZE 16 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stw r0, 0(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif lfs FZERO, 0(SP) slwi INCX, INCX, BASE_SHIFT fmr f1, FZERO li PREA, 8 * 16 * SIZE fmr f2, FZERO cmpwi cr0, N, 0 fmr f3, FZERO ble- LL(999) cmpwi cr0, INCX, 0 ble- LL(999) cmpwi cr0, INCX, SIZE bne- cr0, LL(20) srawi. r0, N, 4 mtspr CTR, r0 beq- cr0, LL(15) .align 4 LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) fabs f4, f8 LFD f10, 2 * SIZE(X) fabs f5, f9 LFD f11, 3 * SIZE(X) fabs f6, f10 LFD f8, 4 * SIZE(X) fabs f7, f11 bdz LL(13) .align 4 LL(12): FADD f0, f0, f4 dcbt X, PREA fabs f4, f8 LFD f9, 5 * SIZE(X) FADD f1, f1, f5 nop fabs f5, f9 LFD f10, 6 * SIZE(X) FADD f2, f2, f6 nop fabs f6, f10 LFD f11, 7 * SIZE(X) FADD f3, f3, f7 nop fabs f7, f11 LFD f8, 8 * SIZE(X) FADD f0, f0, f4 nop fabs f4, f8 LFD f9, 9 * SIZE(X) FADD f1, f1, f5 nop fabs f5, f9 LFD f10, 10 * SIZE(X) FADD f2, f2, f6 nop fabs f6, f10 LFD f11, 11 * SIZE(X) FADD f3, f3, f7 nop fabs f7, f11 LFD f8, 12 * SIZE(X) FADD f0, f0, f4 nop fabs f4, f8 LFD f9, 13 * SIZE(X) FADD f1, f1, f5 nop fabs f5, f9 LFD f10, 14 * SIZE(X) FADD f2, f2, f6 nop fabs f6, f10 LFD f11, 15 * SIZE(X) FADD f3, f3, f7 nop fabs f7, f11 LFD f8, 16 * SIZE(X) FADD f0, f0, f4 nop fabs f4, f8 LFD f9, 17 * SIZE(X) FADD f1, f1, f5 addi X, X, 16 * SIZE fabs f5, f9 LFD f10, 2 * SIZE(X) FADD f2, f2, f6 nop fabs f6, f10 LFD f11, 3 * SIZE(X) FADD f3, f3, f7 LFD f8, 4 * SIZE(X) fabs f7, f11 bdnz LL(12) .align 4 LL(13): FADD f0, f0, f4 nop fabs f4, f8 LFD f9, 5 * SIZE(X) FADD f1, f1, f5 nop fabs f5, f9 LFD f10, 6 * SIZE(X) FADD f2, f2, f6 nop fabs f6, f10 LFD f11, 7 * SIZE(X) FADD f3, f3, f7 nop fabs f7, f11 LFD f8, 8 * SIZE(X) FADD f0, f0, f4 nop fabs f4, f8 LFD f9, 9 * SIZE(X) FADD f1, f1, f5 nop fabs f5, f9 LFD f10, 10 * SIZE(X) FADD f2, f2, f6 nop fabs f6, f10 LFD f11, 11 * SIZE(X) FADD f3, f3, f7 nop fabs f7, f11 LFD f8, 12 * SIZE(X) FADD f0, f0, f4 nop fabs f4, f8 LFD f9, 13 * SIZE(X) FADD f1, f1, f5 nop fabs f5, f9 LFD f10, 14 * SIZE(X) FADD f2, f2, f6 addi X, X, 16 * SIZE fabs f6, f10 LFD f11, -1 * SIZE(X) FADD f3, f3, f7 fabs f7, f11 FADD f0, f0, f4 FADD f1, f1, f5 FADD f2, f2, f6 FADD f3, f3, f7 .align 4 LL(15): andi. r0, N, 15 beq LL(999) andi. 
r0, N, 8 beq LL(16) LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) fabs f4, f8 LFD f10, 2 * SIZE(X) fabs f5, f9 LFD f11, 3 * SIZE(X) fabs f6, f10 LFD f8, 4 * SIZE(X) fabs f7, f11 FADD f0, f0, f4 nop fabs f4, f8 LFD f9, 5 * SIZE(X) FADD f1, f1, f5 nop fabs f5, f9 LFD f10, 6 * SIZE(X) FADD f2, f2, f6 addi X, X, 8 * SIZE fabs f6, f10 LFD f11, -1 * SIZE(X) FADD f3, f3, f7 fabs f7, f11 FADD f0, f0, f4 FADD f1, f1, f5 FADD f2, f2, f6 FADD f3, f3, f7 .align 4 LL(16): andi. r0, N, 4 beq LL(17) LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) fabs f4, f8 LFD f10, 2 * SIZE(X) fabs f5, f9 LFD f11, 3 * SIZE(X) fabs f6, f10 addi X, X, 4 * SIZE fabs f7, f11 nop FADD f0, f0, f4 FADD f1, f1, f5 FADD f2, f2, f6 FADD f3, f3, f7 .align 4 LL(17): andi. r0, N, 2 beq LL(18) LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) fabs f4, f8 fabs f5, f9 FADD f0, f0, f4 addi X, X, 2 * SIZE FADD f1, f1, f5 nop .align 4 LL(18): andi. r0, N, 1 beq LL(999) LFD f8, 0 * SIZE(X) fabs f4, f8 FADD f0, f0, f4 b LL(999) .align 4 LL(20): sub X, X, INCX srawi. r0, N, 4 mtspr CTR, r0 beq- cr0, LL(25) .align 4 LFDUX f8, X, INCX LFDUX f9, X, INCX fabs f4, f8 LFDUX f10, X, INCX fabs f5, f9 LFDUX f11, X, INCX fabs f6, f10 LFDUX f8, X, INCX fabs f7, f11 bdz LL(23) .align 4 LL(22): FADD f0, f0, f4 dcbt X, PREA fabs f4, f8 LFDUX f9, X, INCX FADD f1, f1, f5 nop fabs f5, f9 LFDUX f10, X, INCX FADD f2, f2, f6 nop fabs f6, f10 LFDUX f11, X, INCX FADD f3, f3, f7 nop fabs f7, f11 LFDUX f8, X, INCX FADD f0, f0, f4 nop fabs f4, f8 LFDUX f9, X, INCX FADD f1, f1, f5 nop fabs f5, f9 LFDUX f10, X, INCX FADD f2, f2, f6 nop fabs f6, f10 LFDUX f11, X, INCX FADD f3, f3, f7 nop fabs f7, f11 LFDUX f8, X, INCX FADD f0, f0, f4 nop fabs f4, f8 LFDUX f9, X, INCX FADD f1, f1, f5 nop fabs f5, f9 LFDUX f10, X, INCX FADD f2, f2, f6 nop fabs f6, f10 LFDUX f11, X, INCX FADD f3, f3, f7 nop fabs f7, f11 LFDUX f8, X, INCX FADD f0, f0, f4 nop fabs f4, f8 LFDUX f9, X, INCX FADD f1, f1, f5 nop fabs f5, f9 LFDUX f10, X, INCX FADD f2, f2, f6 nop fabs f6, f10 LFDUX f11, X, INCX FADD f3, f3, f7 LFDUX f8, X, INCX fabs f7, f11 bdnz LL(22) .align 4 LL(23): FADD f0, f0, f4 nop fabs f4, f8 LFDUX f9, X, INCX FADD f1, f1, f5 nop fabs f5, f9 LFDUX f10, X, INCX FADD f2, f2, f6 nop fabs f6, f10 LFDUX f11, X, INCX FADD f3, f3, f7 nop fabs f7, f11 LFDUX f8, X, INCX FADD f0, f0, f4 nop fabs f4, f8 LFDUX f9, X, INCX FADD f1, f1, f5 nop fabs f5, f9 LFDUX f10, X, INCX FADD f2, f2, f6 nop fabs f6, f10 LFDUX f11, X, INCX FADD f3, f3, f7 nop fabs f7, f11 LFDUX f8, X, INCX FADD f0, f0, f4 nop fabs f4, f8 LFDUX f9, X, INCX FADD f1, f1, f5 nop fabs f5, f9 LFDUX f10, X, INCX FADD f2, f2, f6 nop fabs f6, f10 LFDUX f11, X, INCX FADD f3, f3, f7 fabs f7, f11 FADD f0, f0, f4 FADD f1, f1, f5 FADD f2, f2, f6 FADD f3, f3, f7 .align 4 LL(25): andi. r0, N, 15 beq LL(999) andi. r0, N, 8 beq LL(26) LFDUX f8, X, INCX LFDUX f9, X, INCX fabs f4, f8 LFDUX f10, X, INCX fabs f5, f9 LFDUX f11, X, INCX fabs f6, f10 LFDUX f8, X, INCX fabs f7, f11 FADD f0, f0, f4 nop fabs f4, f8 LFDUX f9, X, INCX FADD f1, f1, f5 nop fabs f5, f9 LFDUX f10, X, INCX FADD f2, f2, f6 fabs f6, f10 LFDUX f11, X, INCX FADD f3, f3, f7 fabs f7, f11 FADD f0, f0, f4 FADD f1, f1, f5 FADD f2, f2, f6 FADD f3, f3, f7 .align 4 LL(26): andi. r0, N, 4 beq LL(27) LFDUX f8, X, INCX LFDUX f9, X, INCX fabs f4, f8 LFDUX f10, X, INCX fabs f5, f9 LFDUX f11, X, INCX fabs f6, f10 fabs f7, f11 FADD f0, f0, f4 FADD f1, f1, f5 FADD f2, f2, f6 FADD f3, f3, f7 .align 4 LL(27): andi. 
r0, N, 2 beq LL(28) LFDUX f8, X, INCX LFDUX f9, X, INCX fabs f4, f8 fabs f5, f9 FADD f0, f0, f4 FADD f1, f1, f5 .align 4 LL(28): andi. r0, N, 1 beq LL(999) LFDUX f8, X, INCX fabs f4, f8 FADD f0, f0, f4 .align 4 LL(999): FADD f0, f0, f1 FADD f2, f2, f3 FADD f1, f0, f2 addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/asum_hummer.S000066400000000000000000000202001313527062700202200ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define INCX2 r6 #define X2 r7 #define C1 f1 #define C2 f0 #define C3 f2 #define C4 f3 #define A1 f4 #define A2 f5 #define A3 f6 #define A4 f7 #define A5 f8 #define A6 f9 #define A7 f10 #define A8 f11 #define T1 f12 #define T2 f13 #define T3 f14 #define T4 f15 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 li r10, 0 stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif lfpdx C1, SP, r10 # Zero clear slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX fpmr C2, C1 fpmr C3, C1 fpmr C4, C1 cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, INCX, 0 ble LL(999) cmpwi cr0, INCX, SIZE bne LL(100) andi. r0, X, 2 * SIZE - 1 beq LL(05) LFD C1, 0(X) addi X, X, 1 * SIZE addi N, N, -1 cmpwi cr0, N, 0 fabs C1, C1 ble LL(999) .align 4 LL(05): srawi. 
r0, N, 4 sub X, X, INCX2 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 fpmr T1, C2 LFPDUX A2, X, INCX2 fpmr T2, C2 LFPDUX A3, X, INCX2 fpmr T3, C2 LFPDUX A4, X, INCX2 fpmr T4, C2 LFPDUX A5, X, INCX2 LFPDUX A6, X, INCX2 LFPDUX A7, X, INCX2 LFPDUX A8, X, INCX2 bdz LL(13) .align 4 LL(12): fpadd C1, C1, T1 nop fpabs T1, A1 LFPDUX A1, X, INCX2 fpadd C2, C2, T2 nop fpabs T2, A2 LFPDUX A2, X, INCX2 fpadd C3, C3, T3 nop fpabs T3, A3 LFPDUX A3, X, INCX2 fpadd C4, C4, T4 nop fpabs T4, A4 LFPDUX A4, X, INCX2 fpadd C1, C1, T1 nop fpabs T1, A5 LFPDUX A5, X, INCX2 fpadd C2, C2, T2 nop fpabs T2, A6 LFPDUX A6, X, INCX2 fpadd C3, C3, T3 nop fpabs T3, A7 LFPDUX A7, X, INCX2 fpadd C4, C4, T4 fpabs T4, A8 LFPDUX A8, X, INCX2 bdnz LL(12) .align 4 LL(13): fpadd C1, C1, T1 fpabs T1, A1 fpadd C2, C2, T2 fpabs T2, A2 fpadd C3, C3, T3 fpabs T3, A3 fpadd C4, C4, T4 fpabs T4, A4 fpadd C1, C1, T1 fpabs T1, A5 fpadd C2, C2, T2 fpabs T2, A6 fpadd C3, C3, T3 fpabs T3, A7 fpadd C4, C4, T4 fpabs T4, A8 fpadd C1, C1, T1 fpadd C2, C2, T2 fpadd C3, C3, T3 fpadd C4, C4, T4 .align 4 LL(15): andi. r0, N, 15 beq LL(999) andi. r0, N, 8 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 fpabs T1, A1 fpabs T2, A2 fpabs T3, A3 fpabs T4, A4 fpadd C1, C1, T1 fpadd C2, C2, T2 fpadd C3, C3, T3 fpadd C4, C4, T4 .align 4 LL(16): andi. r0, N, 4 beq LL(17) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 fpabs T1, A1 fpabs T2, A2 fpadd C1, C1, T1 fpadd C2, C2, T2 .align 4 LL(17): andi. r0, N, 2 beq LL(18) LFPDUX A1, X, INCX2 fpabs T1, A1 fpadd C1, C1, T1 .align 4 LL(18): andi. r0, N, 1 beq LL(999) LFDX A1, X, INCX2 fabs T1, A1 fadd C1, C1, T1 b LL(999) .align 4 LL(100): sub X2, X, INCX sub X, X, INCX2 srawi. r0, N, 4 mtspr CTR, r0 beq- LL(115) LFDUX A1, X, INCX2 fpmr T1, C2 LFDUX A2, X, INCX2 fpmr T2, C2 LFDUX A3, X, INCX2 fpmr T3, C2 LFDUX A4, X, INCX2 fpmr T4, C2 LFDUX A5, X, INCX2 LFSDUX A1, X2, INCX2 LFDUX A6, X, INCX2 LFSDUX A2, X2, INCX2 LFDUX A7, X, INCX2 LFSDUX A3, X2, INCX2 LFDUX A8, X, INCX2 LFSDUX A4, X2, INCX2 bdz LL(113) .align 4 LL(112): fpadd C1, C1, T1 LFSDUX A5, X2, INCX2 fpabs T1, A1 LFDUX A1, X, INCX2 fpadd C2, C2, T2 LFSDUX A6, X2, INCX2 fpabs T2, A2 LFDUX A2, X, INCX2 fpadd C3, C3, T3 LFSDUX A7, X2, INCX2 fpabs T3, A3 LFDUX A3, X, INCX2 fpadd C4, C4, T4 LFSDUX A8, X2, INCX2 fpabs T4, A4 LFDUX A4, X, INCX2 fpadd C1, C1, T1 LFSDUX A1, X2, INCX2 fpabs T1, A5 LFDUX A5, X, INCX2 fpadd C2, C2, T2 LFSDUX A2, X2, INCX2 fpabs T2, A6 LFDUX A6, X, INCX2 fpadd C3, C3, T3 LFSDUX A3, X2, INCX2 fpabs T3, A7 LFDUX A7, X, INCX2 fpadd C4, C4, T4 LFSDUX A4, X2, INCX2 fpabs T4, A8 LFDUX A8, X, INCX2 bdnz LL(112) .align 4 LL(113): fpadd C1, C1, T1 nop fpabs T1, A1 LFSDUX A5, X2, INCX2 fpadd C2, C2, T2 nop fpabs T2, A2 LFSDUX A6, X2, INCX2 fpadd C3, C3, T3 nop fpabs T3, A3 LFSDUX A7, X2, INCX2 fpadd C4, C4, T4 nop fpabs T4, A4 LFSDUX A8, X2, INCX2 fpadd C1, C1, T1 fpabs T1, A5 fpadd C2, C2, T2 fpabs T2, A6 fpadd C3, C3, T3 fpabs T3, A7 fpadd C4, C4, T4 fpabs T4, A8 fpadd C1, C1, T1 fpadd C2, C2, T2 fpadd C3, C3, T3 fpadd C4, C4, T4 .align 4 LL(115): andi. r0, N, 15 beq LL(999) andi. r0, N, 8 beq LL(116) LFDUX A1, X, INCX2 LFDUX A2, X2, INCX2 LFDUX A3, X, INCX2 LFDUX A4, X2, INCX2 fabs T1, A1 LFDUX A5, X, INCX2 fabs T2, A2 LFDUX A6, X2, INCX2 fabs T3, A3 LFDUX A7, X, INCX2 fabs T4, A4 LFDUX A8, X2, INCX2 fadd C1, C1, T1 fabs T1, A5 fadd C2, C2, T2 fabs T2, A6 fadd C3, C3, T3 fabs T3, A7 fadd C4, C4, T4 fabs T4, A8 fadd C1, C1, T1 fadd C2, C2, T2 fadd C3, C3, T3 fadd C4, C4, T4 .align 4 LL(116): andi. 
r0, N, 4 beq LL(117) LFDUX A1, X, INCX2 LFDUX A2, X2, INCX2 LFDUX A3, X, INCX2 LFDUX A4, X2, INCX2 fabs T1, A1 fabs T2, A2 fabs T3, A3 fabs T4, A4 fadd C1, C1, T1 fadd C2, C2, T2 fadd C3, C3, T3 fadd C4, C4, T4 .align 4 LL(117): andi. r0, N, 2 beq LL(118) LFDUX A1, X, INCX2 LFDUX A2, X2, INCX2 fabs T1, A1 fabs T2, A2 fadd C1, C1, T1 fadd C2, C2, T2 .align 4 LL(118): andi. r0, N, 1 beq LL(999) LFDX A1, X, INCX2 fabs T1, A1 fadd C1, C1, T1 .align 4 LL(999): fpadd C1, C1, C2 li r10, 16 fpadd C3, C3, C4 fpadd C1, C1, C3 lfpdux f15, SP, r10 fsmtp C2, C1 lfpdux f14, SP, r10 addi SP, SP, 16 fadd C1, C2, C1 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/asum_ppc440.S000066400000000000000000000145031313527062700177460ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PREX r6 #define ATTR r7 #define FZERO f0 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT fmr f1, FZERO li PREX, 3 * 16 * SIZE fmr f2, FZERO sub X, X, INCX fmr f3, FZERO fmr f4, FZERO fmr f5, FZERO fmr f6, FZERO cmpwi cr0, N, 0 fmr f7, FZERO ble- LL(999) cmpwi cr0, INCX, 0 ble- LL(999) srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(150) LFDUX f8, X, INCX LFDUX f9, X, INCX LFDUX f10, X, INCX LFDUX f11, X, INCX LFDUX f12, X, INCX LFDUX f13, X, INCX LFDUX f14, X, INCX LFDUX f15, X, INCX fabs f16, f8 LFDUX f24, X, INCX fabs f17, f9 LFDUX f25, X, INCX fabs f18, f10 LFDUX f26, X, INCX fabs f19, f11 LFDUX f27, X, INCX fabs f20, f12 LFDUX f28, X, INCX fabs f21, f13 LFDUX f29, X, INCX fabs f22, f14 LFDUX f30, X, INCX fabs f23, f15 LFDUX f31, X, INCX bdz LL(120) .align 4 LL(110): LFDUX f8, X, INCX FADD f0, f0, f16 #ifdef PPCG4 dcbt X, PREX #else nop #endif fabs f16, f24 LFDUX f9, X, INCX FADD f1, f1, f17 nop fabs f17, f25 LFDUX f10, X, INCX FADD f2, f2, f18 nop fabs f18, f26 LFDUX f11, X, INCX FADD f3, f3, f19 nop fabs f19, f27 LFDUX f12, X, INCX FADD f4, f4, f20 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PREX #else nop #endif fabs f20, f28 LFDUX f13, X, INCX FADD f5, f5, f21 nop fabs f21, f29 LFDUX f14, X, INCX FADD f6, f6, f22 nop fabs f22, f30 LFDUX f15, X, INCX FADD f7, f7, f23 nop fabs f23, f31 LFDUX f24, X, INCX FADD f0, f0, f16 #ifdef PPCG4 dcbt X, PREX #else nop #endif fabs f16, f8 LFDUX f25, X, INCX FADD f1, f1, f17 nop fabs f17, f9 LFDUX f26, X, INCX FADD f2, f2, f18 nop fabs f18, f10 LFDUX f27, X, INCX FADD f3, f3, f19 nop fabs f19, f11 LFDUX f28, X, INCX FADD f4, f4, f20 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PREX #else nop #endif fabs f20, f12 LFDUX f29, X, INCX FADD f5, f5, f21 nop fabs f21, f13 LFDUX f30, X, INCX FADD f6, f6, f22 nop fabs f22, f14 LFDUX f31, X, INCX FADD f7, f7, f23 fabs f23, f15 bdnz LL(110) .align 4 LL(120): FADD f0, f0, f16 fabs f16, f24 FADD f1, f1, f17 fabs f17, f25 FADD f2, f2, f18 fabs f18, f26 FADD f3, f3, f19 fabs f19, f27 FADD f4, f4, f20 fabs f20, f28 FADD f5, f5, f21 fabs f21, f29 FADD f6, f6, f22 fabs f22, f30 FADD f7, f7, f23 fabs f23, f31 FADD f0, f0, f16 FADD f1, f1, f17 FADD f2, f2, f18 FADD f3, f3, f19 FADD f4, f4, f20 FADD f5, f5, f21 FADD f6, f6, f22 FADD f7, f7, f23 .align 4 LL(150): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX fabs f8, f8 FADD f0, f0, f8 bdnz LL(160) .align 4 LL(999): FADD f0, f0, f1 FADD f2, f2, f3 FADD f4, f4, f5 FADD f6, f6, f7 FADD f0, f0, f2 FADD f4, f4, f6 FADD f1, f0, f4 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/axpy.S000066400000000000000000000265641313527062700167020ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
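axpy.S and the variants that follow (axpy_hummer.S, axpy_ppc440.S) implement y := alpha * x + y over strided vectors, again as an unrolled main loop (16 elements per iteration in axpy.S) plus a scalar tail. The effect, as a plain C sketch, is shown below; the name and signature are illustrative and positive increments are assumed, so negative-stride handling is outside this sketch.

/* Reference semantics sketch for the AXPY kernels: y := alpha * x + y. */
static void ref_axpy(int n, double alpha, const double *x, int incx,
                     double *y, int incy)
{
    if (n <= 0) return;                        /* the kernel also returns early for n <= 0 */
    for (int i = 0; i < n; i++)
        y[(long)i * incy] += alpha * x[(long)i * incx];
}

The assembly uses FMADD, so each update is a fused multiply-add with a single rounding, which can differ in the last bit from the two-step C expression above.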
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef linux #ifndef __64BIT__ #define N r3 #define X r6 #define INCX r7 #define Y r8 #define INCY r9 #define PREA r4 #define YY r5 #else #define N r3 #define X r7 #define INCX r8 #define Y r9 #define INCY r10 #define PREA r4 #define YY r5 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define N r3 #define X r8 #define INCX r9 #define Y r10 #define INCY r4 #define PREA r5 #define YY r6 #else #define N r3 #define X r7 #define INCX r8 #define Y r9 #define INCY r10 #define PREA r4 #define YY r5 #endif #endif #define ALPHA f24 #ifndef NEEDPARAM #define STACKSIZE 96 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) #if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif fmr ALPHA, f1 slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT #ifdef L1_DUALFETCH li PREA, (L1_PREFETCHSIZE) / 2 #else li PREA, (L1_PREFETCHSIZE) #endif cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, SIZE bne- cr0, LL(100) cmpwi cr0, INCY, SIZE bne- cr0, LL(100) srawi. 
r0, N, 4 mtspr CTR, r0 beq- cr0, LL(50) .align 4 LFD f0, 0 * SIZE(X) LFD f1, 1 * SIZE(X) LFD f2, 2 * SIZE(X) LFD f3, 3 * SIZE(X) LFD f8, 0 * SIZE(Y) LFD f9, 1 * SIZE(Y) LFD f10, 2 * SIZE(Y) LFD f11, 3 * SIZE(Y) LFD f4, 4 * SIZE(X) LFD f5, 5 * SIZE(X) LFD f6, 6 * SIZE(X) LFD f7, 7 * SIZE(X) LFD f12, 4 * SIZE(Y) LFD f13, 5 * SIZE(Y) LFD f14, 6 * SIZE(Y) LFD f15, 7 * SIZE(Y) bdz LL(20) .align 4 LL(10): FMADD f16, ALPHA, f0, f8 FMADD f17, ALPHA, f1, f9 FMADD f18, ALPHA, f2, f10 FMADD f19, ALPHA, f3, f11 LFD f0, 8 * SIZE(X) LFD f1, 9 * SIZE(X) LFD f2, 10 * SIZE(X) LFD f3, 11 * SIZE(X) LFD f8, 8 * SIZE(Y) LFD f9, 9 * SIZE(Y) LFD f10, 10 * SIZE(Y) LFD f11, 11 * SIZE(Y) STFD f16, 0 * SIZE(Y) STFD f17, 1 * SIZE(Y) STFD f18, 2 * SIZE(Y) STFD f19, 3 * SIZE(Y) FMADD f20, ALPHA, f4, f12 FMADD f21, ALPHA, f5, f13 FMADD f22, ALPHA, f6, f14 FMADD f23, ALPHA, f7, f15 LFD f4, 12 * SIZE(X) LFD f5, 13 * SIZE(X) LFD f6, 14 * SIZE(X) LFD f7, 15 * SIZE(X) LFD f12, 12 * SIZE(Y) LFD f13, 13 * SIZE(Y) LFD f14, 14 * SIZE(Y) LFD f15, 15 * SIZE(Y) STFD f20, 4 * SIZE(Y) STFD f21, 5 * SIZE(Y) STFD f22, 6 * SIZE(Y) STFD f23, 7 * SIZE(Y) FMADD f16, ALPHA, f0, f8 FMADD f17, ALPHA, f1, f9 FMADD f18, ALPHA, f2, f10 FMADD f19, ALPHA, f3, f11 LFD f0, 16 * SIZE(X) LFD f1, 17 * SIZE(X) LFD f2, 18 * SIZE(X) LFD f3, 19 * SIZE(X) LFD f8, 16 * SIZE(Y) LFD f9, 17 * SIZE(Y) LFD f10, 18 * SIZE(Y) LFD f11, 19 * SIZE(Y) STFD f16, 8 * SIZE(Y) STFD f17, 9 * SIZE(Y) STFD f18, 10 * SIZE(Y) STFD f19, 11 * SIZE(Y) FMADD f20, ALPHA, f4, f12 FMADD f21, ALPHA, f5, f13 FMADD f22, ALPHA, f6, f14 FMADD f23, ALPHA, f7, f15 LFD f4, 20 * SIZE(X) LFD f5, 21 * SIZE(X) LFD f6, 22 * SIZE(X) LFD f7, 23 * SIZE(X) LFD f12, 20 * SIZE(Y) LFD f13, 21 * SIZE(Y) LFD f14, 22 * SIZE(Y) LFD f15, 23 * SIZE(Y) STFD f20, 12 * SIZE(Y) STFD f21, 13 * SIZE(Y) STFD f22, 14 * SIZE(Y) STFD f23, 15 * SIZE(Y) #ifndef POWER6 dcbtst Y, PREA #ifdef L1_DUALFETCH dcbt X, PREA #endif #endif addi X, X, 16 * SIZE addi Y, Y, 16 * SIZE #ifdef POWER6 dcbtst Y, PREA L1_PREFETCH X, PREA #endif bdnz LL(10) .align 4 LL(20): FMADD f16, ALPHA, f0, f8 FMADD f17, ALPHA, f1, f9 FMADD f18, ALPHA, f2, f10 FMADD f19, ALPHA, f3, f11 LFD f0, 8 * SIZE(X) LFD f1, 9 * SIZE(X) LFD f2, 10 * SIZE(X) LFD f3, 11 * SIZE(X) LFD f8, 8 * SIZE(Y) LFD f9, 9 * SIZE(Y) LFD f10, 10 * SIZE(Y) LFD f11, 11 * SIZE(Y) FMADD f20, ALPHA, f4, f12 FMADD f21, ALPHA, f5, f13 FMADD f22, ALPHA, f6, f14 FMADD f23, ALPHA, f7, f15 LFD f4, 12 * SIZE(X) LFD f5, 13 * SIZE(X) LFD f6, 14 * SIZE(X) LFD f7, 15 * SIZE(X) LFD f12, 12 * SIZE(Y) LFD f13, 13 * SIZE(Y) LFD f14, 14 * SIZE(Y) LFD f15, 15 * SIZE(Y) STFD f16, 0 * SIZE(Y) STFD f17, 1 * SIZE(Y) STFD f18, 2 * SIZE(Y) STFD f19, 3 * SIZE(Y) FMADD f16, ALPHA, f0, f8 FMADD f17, ALPHA, f1, f9 FMADD f18, ALPHA, f2, f10 FMADD f19, ALPHA, f3, f11 STFD f20, 4 * SIZE(Y) STFD f21, 5 * SIZE(Y) STFD f22, 6 * SIZE(Y) STFD f23, 7 * SIZE(Y) FMADD f20, ALPHA, f4, f12 FMADD f21, ALPHA, f5, f13 FMADD f22, ALPHA, f6, f14 FMADD f23, ALPHA, f7, f15 STFD f16, 8 * SIZE(Y) STFD f17, 9 * SIZE(Y) STFD f18, 10 * SIZE(Y) STFD f19, 11 * SIZE(Y) STFD f20, 12 * SIZE(Y) STFD f21, 13 * SIZE(Y) STFD f22, 14 * SIZE(Y) STFD f23, 15 * SIZE(Y) addi X, X, 16 * SIZE addi Y, Y, 16 * SIZE .align 4 LL(50): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f0, 0 * SIZE(X) LFD f8, 0 * SIZE(Y) FMADD f16, ALPHA, f0, f8 STFD f16, 0 * SIZE(Y) addi X, X, 1 * SIZE addi Y, Y, 1 * SIZE bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCX sub Y, Y, INCY mr YY, Y srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(150) .align 4 LFDUX f0, X, INCX LFDUX f1, X, INCX LFDUX f2, X, INCX LFDUX f3, X, INCX LFDUX f8, Y, INCY LFDUX f9, Y, INCY LFDUX f10, Y, INCY LFDUX f11, Y, INCY LFDUX f4, X, INCX LFDUX f5, X, INCX LFDUX f6, X, INCX LFDUX f7, X, INCX LFDUX f12, Y, INCY LFDUX f13, Y, INCY LFDUX f14, Y, INCY LFDUX f15, Y, INCY bdz LL(120) .align 4 LL(110): FMADD f16, ALPHA, f0, f8 FMADD f17, ALPHA, f1, f9 FMADD f18, ALPHA, f2, f10 FMADD f19, ALPHA, f3, f11 LFDUX f0, X, INCX LFDUX f1, X, INCX LFDUX f2, X, INCX LFDUX f3, X, INCX LFDUX f8, Y, INCY LFDUX f9, Y, INCY LFDUX f10, Y, INCY LFDUX f11, Y, INCY FMADD f20, ALPHA, f4, f12 FMADD f21, ALPHA, f5, f13 FMADD f22, ALPHA, f6, f14 FMADD f23, ALPHA, f7, f15 LFDUX f4, X, INCX LFDUX f5, X, INCX LFDUX f6, X, INCX LFDUX f7, X, INCX LFDUX f12, Y, INCY LFDUX f13, Y, INCY LFDUX f14, Y, INCY LFDUX f15, Y, INCY STFDUX f16, YY, INCY STFDUX f17, YY, INCY STFDUX f18, YY, INCY STFDUX f19, YY, INCY FMADD f16, ALPHA, f0, f8 FMADD f17, ALPHA, f1, f9 FMADD f18, ALPHA, f2, f10 FMADD f19, ALPHA, f3, f11 LFDUX f0, X, INCX LFDUX f1, X, INCX LFDUX f2, X, INCX LFDUX f3, X, INCX LFDUX f8, Y, INCY LFDUX f9, Y, INCY LFDUX f10, Y, INCY LFDUX f11, Y, INCY STFDUX f20, YY, INCY STFDUX f21, YY, INCY STFDUX f22, YY, INCY STFDUX f23, YY, INCY FMADD f20, ALPHA, f4, f12 FMADD f21, ALPHA, f5, f13 FMADD f22, ALPHA, f6, f14 FMADD f23, ALPHA, f7, f15 LFDUX f4, X, INCX LFDUX f5, X, INCX LFDUX f6, X, INCX LFDUX f7, X, INCX LFDUX f12, Y, INCY LFDUX f13, Y, INCY LFDUX f14, Y, INCY LFDUX f15, Y, INCY STFDUX f16, YY, INCY STFDUX f17, YY, INCY STFDUX f18, YY, INCY STFDUX f19, YY, INCY STFDUX f20, YY, INCY STFDUX f21, YY, INCY STFDUX f22, YY, INCY STFDUX f23, YY, INCY bdnz LL(110) .align 4 LL(120): FMADD f16, ALPHA, f0, f8 FMADD f17, ALPHA, f1, f9 FMADD f18, ALPHA, f2, f10 FMADD f19, ALPHA, f3, f11 LFDUX f0, X, INCX LFDUX f1, X, INCX LFDUX f2, X, INCX LFDUX f3, X, INCX LFDUX f8, Y, INCY LFDUX f9, Y, INCY LFDUX f10, Y, INCY LFDUX f11, Y, INCY FMADD f20, ALPHA, f4, f12 FMADD f21, ALPHA, f5, f13 FMADD f22, ALPHA, f6, f14 FMADD f23, ALPHA, f7, f15 LFDUX f4, X, INCX LFDUX f5, X, INCX LFDUX f6, X, INCX LFDUX f7, X, INCX LFDUX f12, Y, INCY LFDUX f13, Y, INCY LFDUX f14, Y, INCY LFDUX f15, Y, INCY STFDUX f16, YY, INCY STFDUX f17, YY, INCY STFDUX f18, YY, INCY STFDUX f19, YY, INCY FMADD f16, ALPHA, f0, f8 FMADD f17, ALPHA, f1, f9 FMADD f18, ALPHA, f2, f10 FMADD f19, ALPHA, f3, f11 STFDUX f20, YY, INCY STFDUX f21, YY, INCY STFDUX f22, YY, INCY STFDUX f23, YY, INCY FMADD f20, ALPHA, f4, f12 FMADD f21, ALPHA, f5, f13 FMADD f22, ALPHA, f6, f14 FMADD f23, ALPHA, f7, f15 STFDUX f16, YY, INCY STFDUX f17, YY, INCY STFDUX f18, YY, INCY STFDUX f19, YY, INCY STFDUX f20, YY, INCY STFDUX f21, YY, INCY STFDUX f22, YY, INCY STFDUX f23, YY, INCY .align 4 LL(150): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f0, X, INCX LFDUX f8, Y, INCY FMADD f16, ALPHA, f0, f8 STFDUX f16, YY, INCY bdnz LL(160) .align 4 LL(999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/axpy_hummer.S000066400000000000000000000304761313527062700202540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r6 #define INCX r7 #define Y r8 #define INCY r9 #define YY r4 #define INCX2 r5 #define INCY2 r10 #define ALPHA f1 #define A1 f0 #define A2 f8 #define A3 f2 #define A4 f3 #define A5 f4 #define A6 f5 #define A7 f6 #define A8 f7 #define A9 f25 #define B1 f9 #define B2 f10 #define B3 f11 #define B4 f12 #define B5 f13 #define B6 f14 #define B7 f15 #define B8 f16 #define C1 f17 #define C2 f18 #define C3 f19 #define C4 f20 #define C5 f21 #define C6 f22 #define C7 f23 #define C8 f24 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 stfpdux f19, SP, r10 stfpdux f20, SP, r10 stfpdux f21, SP, r10 stfpdux f22, SP, r10 stfpdux f23, SP, r10 stfpdux f24, SP, r10 stfpdux f25, SP, r10 fsmfp ALPHA, ALPHA slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT add INCX2, INCX, INCX add INCY2, INCY, INCY cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, INCX, SIZE bne LL(100) cmpwi cr0, INCY, SIZE bne LL(100) andi. r0, Y, 2 * SIZE - 1 beq LL(05) LFD A1, 0 * SIZE(X) LFD B1, 0 * SIZE(Y) addi X, X, SIZE addi Y, Y, SIZE fmadd C1, ALPHA, A1, B1 addi N, N, -1 STFD C1, -1 * SIZE(Y) LL(05): andi. r0, X, 2 * SIZE - 1 bne LL(20) sub X, X, INCX2 sub Y, Y, INCY2 mr YY, Y srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 LFPDUX A3, X, INCX2 LFPDUX B3, Y, INCY2 LFPDUX A4, X, INCX2 LFPDUX B4, Y, INCY2 LFPDUX A5, X, INCX2 LFPDUX B5, Y, INCY2 LFPDUX A6, X, INCX2 LFPDUX B6, Y, INCY2 LFPDUX A7, X, INCX2 LFPDUX B7, Y, INCY2 LFPDUX A8, X, INCX2 LFPDUX B8, Y, INCY2 bdz LL(13) .align 4 LL(12): fpmadd C1, ALPHA, A1, B1 LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 fpmadd C2, ALPHA, A2, B2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 fpmadd C3, ALPHA, A3, B3 LFPDUX A3, X, INCX2 LFPDUX B3, Y, INCY2 fpmadd C4, ALPHA, A4, B4 LFPDUX A4, X, INCX2 LFPDUX B4, Y, INCY2 fpmadd C5, ALPHA, A5, B5 LFPDUX A5, X, INCX2 LFPDUX B5, Y, INCY2 fpmadd C6, ALPHA, A6, B6 LFPDUX A6, X, INCX2 LFPDUX B6, Y, INCY2 fpmadd C7, ALPHA, A7, B7 LFPDUX A7, X, INCX2 LFPDUX B7, Y, INCY2 fpmadd C8, ALPHA, A8, B8 LFPDUX A8, X, INCX2 LFPDUX B8, Y, INCY2 STFPDUX C1, YY, INCY2 STFPDUX C2, YY, INCY2 STFPDUX C3, YY, INCY2 STFPDUX C4, YY, INCY2 STFPDUX C5, YY, INCY2 STFPDUX C6, YY, INCY2 STFPDUX C7, YY, INCY2 STFPDUX C8, YY, INCY2 bdnz LL(12) .align 4 LL(13): fpmadd C1, ALPHA, A1, B1 fpmadd C2, ALPHA, A2, B2 fpmadd C3, ALPHA, A3, B3 fpmadd C4, ALPHA, A4, B4 fpmadd C5, ALPHA, A5, B5 fpmadd C6, ALPHA, A6, B6 STFPDUX C1, YY, INCY2 fpmadd C7, ALPHA, A7, B7 STFPDUX C2, YY, INCY2 fpmadd C8, ALPHA, A8, B8 STFPDUX C3, YY, INCY2 STFPDUX C4, YY, INCY2 STFPDUX C5, YY, INCY2 STFPDUX C6, YY, INCY2 STFPDUX C7, YY, INCY2 STFPDUX C8, YY, INCY2 .align 4 LL(15): andi. r0, N, 15 beq LL(999) andi. r0, N, 8 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 LFPDUX A3, X, INCX2 LFPDUX B3, Y, INCY2 LFPDUX A4, X, INCX2 LFPDUX B4, Y, INCY2 fpmadd C1, ALPHA, A1, B1 fpmadd C2, ALPHA, A2, B2 fpmadd C3, ALPHA, A3, B3 fpmadd C4, ALPHA, A4, B4 STFPDUX C1, YY, INCY2 STFPDUX C2, YY, INCY2 STFPDUX C3, YY, INCY2 STFPDUX C4, YY, INCY2 .align 4 LL(16): andi. r0, N, 4 beq LL(17) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 fpmadd C1, ALPHA, A1, B1 fpmadd C2, ALPHA, A2, B2 STFPDUX C1, YY, INCY2 STFPDUX C2, YY, INCY2 .align 4 LL(17): andi. r0, N, 2 beq LL(18) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 fpmadd C1, ALPHA, A1, B1 STFPDUX C1, YY, INCY2 .align 4 LL(18): andi. r0, N, 1 beq LL(999) LFDUX A1, X, INCX2 LFDUX B1, Y, INCY2 fmadd C1, ALPHA, A1, B1 STFDUX C1, YY, INCY2 b LL(999) .align 4 /* X is unaliged */ LL(20): LFD A1, 0 * SIZE(X) addi X, X, SIZE sub X, X, INCX2 sub Y, Y, INCY2 mr YY, Y srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(25) LFXDUX A2, X, INCX2 LFPDUX B1, Y, INCY2 LFXDUX A3, X, INCX2 LFPDUX B2, Y, INCY2 LFXDUX A4, X, INCX2 LFPDUX B3, Y, INCY2 LFXDUX A5, X, INCX2 LFPDUX B4, Y, INCY2 LFXDUX A6, X, INCX2 LFPDUX B5, Y, INCY2 LFXDUX A7, X, INCX2 LFPDUX B6, Y, INCY2 fsmr A1, A2 LFXDUX A8, X, INCX2 fsmr A2, A3 LFPDUX B7, Y, INCY2 fsmr A3, A4 LFXDUX A9, X, INCX2 fsmr A4, A5 LFPDUX B8, Y, INCY2 bdz LL(23) .align 4 LL(22): fpmadd C1, ALPHA, A1, B1 fsmr A5, A6 LFPDUX B1, Y, INCY2 fpmadd C2, ALPHA, A2, B2 LFXDUX A2, X, INCX2 fsmr A6, A7 LFPDUX B2, Y, INCY2 fpmadd C3, ALPHA, A3, B3 LFXDUX A3, X, INCX2 fsmr A7, A8 LFPDUX B3, Y, INCY2 fpmadd C4, ALPHA, A4, B4 LFXDUX A4, X, INCX2 fsmr A8, A9 LFPDUX B4, Y, INCY2 fpmadd C5, ALPHA, A5, B5 LFXDUX A5, X, INCX2 LFPDUX B5, Y, INCY2 fpmadd C6, ALPHA, A6, B6 LFXDUX A6, X, INCX2 LFPDUX B6, Y, INCY2 fpmadd C7, ALPHA, A7, B7 LFXDUX A7, X, INCX2 LFPDUX B7, Y, INCY2 fpmadd C8, ALPHA, A8, B8 LFXDUX A8, X, INCX2 LFPDUX B8, Y, INCY2 fpmr A1, A9 LFXDUX A9, X, INCX2 STFPDUX C1, YY, INCY2 STFPDUX C2, YY, INCY2 STFPDUX C3, YY, INCY2 STFPDUX C4, YY, INCY2 fsmr A1, A2 STFPDUX C5, YY, INCY2 fsmr A2, A3 STFPDUX C6, YY, INCY2 fsmr A3, A4 STFPDUX C7, YY, INCY2 fsmr A4, A5 STFPDUX C8, YY, INCY2 bdnz LL(22) .align 4 LL(23): fpmadd C1, ALPHA, A1, B1 fsmr A5, A6 fpmadd C2, ALPHA, A2, B2 fsmr A6, A7 fpmadd C3, ALPHA, A3, B3 fsmr A7, A8 fpmadd C4, ALPHA, A4, B4 fsmr A8, A9 fpmadd C5, ALPHA, A5, B5 fpmadd C6, ALPHA, A6, B6 fpmadd C7, ALPHA, A7, B7 fpmadd C8, ALPHA, A8, B8 fpmr A1, A9 STFPDUX C1, YY, INCY2 STFPDUX C2, YY, INCY2 STFPDUX C3, YY, INCY2 STFPDUX C4, YY, INCY2 STFPDUX C5, YY, INCY2 STFPDUX C6, YY, INCY2 STFPDUX C7, YY, INCY2 STFPDUX C8, YY, INCY2 .align 4 LL(25): andi. r0, N, 15 beq LL(999) andi. r0, N, 8 beq LL(26) LFXDUX A2, X, INCX2 LFPDUX B1, Y, INCY2 LFXDUX A3, X, INCX2 LFPDUX B2, Y, INCY2 LFXDUX A4, X, INCX2 LFPDUX B3, Y, INCY2 LFXDUX A5, X, INCX2 LFPDUX B4, Y, INCY2 fsmr A1, A2 fsmr A2, A3 fsmr A3, A4 fsmr A4, A5 fpmadd C1, ALPHA, A1, B1 fpmadd C2, ALPHA, A2, B2 fpmadd C3, ALPHA, A3, B3 fpmadd C4, ALPHA, A4, B4 fpmr A1, A5 STFPDUX C1, YY, INCY2 STFPDUX C2, YY, INCY2 STFPDUX C3, YY, INCY2 STFPDUX C4, YY, INCY2 .align 4 LL(26): andi. r0, N, 4 beq LL(27) LFXDUX A2, X, INCX2 LFPDUX B1, Y, INCY2 LFXDUX A3, X, INCX2 LFPDUX B2, Y, INCY2 fsmr A1, A2 fsmr A2, A3 fpmadd C1, ALPHA, A1, B1 fpmadd C2, ALPHA, A2, B2 fpmr A1, A3 STFPDUX C1, YY, INCY2 STFPDUX C2, YY, INCY2 .align 4 LL(27): andi. r0, N, 2 beq LL(28) LFXDUX A2, X, INCX2 LFPDUX B1, Y, INCY2 fsmr A1, A2 fpmadd C1, ALPHA, A1, B1 fpmr A1, A2 STFPDUX C1, YY, INCY2 .align 4 LL(28): andi. r0, N, 1 beq LL(999) LFDUX B1, Y, INCY2 fmadd C1, ALPHA, A1, B1 STFDUX C1, YY, INCY2 b LL(999) .align 4 #### LL(100): sub X, X, INCX sub Y, Y, INCY mr YY, Y srawi. 
r0, N, 3 mtspr CTR, r0 beq- LL(115) LFDUX A1, X, INCX LFDUX B1, Y, INCY LFDUX A2, X, INCX LFDUX B2, Y, INCY LFDUX A3, X, INCX LFDUX B3, Y, INCY LFDUX A4, X, INCX LFDUX B4, Y, INCY LFDUX A5, X, INCX LFDUX B5, Y, INCY LFDUX A6, X, INCX LFDUX B6, Y, INCY LFDUX A7, X, INCX LFDUX B7, Y, INCY LFDUX A8, X, INCX LFDUX B8, Y, INCY bdz LL(113) .align 4 LL(112): fmadd C1, ALPHA, A1, B1 LFDUX A1, X, INCX LFDUX B1, Y, INCY fmadd C2, ALPHA, A2, B2 LFDUX A2, X, INCX LFDUX B2, Y, INCY fmadd C3, ALPHA, A3, B3 LFDUX A3, X, INCX LFDUX B3, Y, INCY fmadd C4, ALPHA, A4, B4 LFDUX A4, X, INCX LFDUX B4, Y, INCY fmadd C5, ALPHA, A5, B5 LFDUX A5, X, INCX LFDUX B5, Y, INCY fmadd C6, ALPHA, A6, B6 LFDUX A6, X, INCX LFDUX B6, Y, INCY fmadd C7, ALPHA, A7, B7 LFDUX A7, X, INCX LFDUX B7, Y, INCY fmadd C8, ALPHA, A8, B8 LFDUX A8, X, INCX LFDUX B8, Y, INCY STFDUX C1, YY, INCY STFDUX C2, YY, INCY STFDUX C3, YY, INCY STFDUX C4, YY, INCY STFDUX C5, YY, INCY STFDUX C6, YY, INCY STFDUX C7, YY, INCY STFDUX C8, YY, INCY bdnz LL(112) .align 4 LL(113): fmadd C1, ALPHA, A1, B1 fmadd C2, ALPHA, A2, B2 fmadd C3, ALPHA, A3, B3 fmadd C4, ALPHA, A4, B4 fmadd C5, ALPHA, A5, B5 fmadd C6, ALPHA, A6, B6 STFDUX C1, YY, INCY fmadd C7, ALPHA, A7, B7 STFDUX C2, YY, INCY fmadd C8, ALPHA, A8, B8 STFDUX C3, YY, INCY STFDUX C4, YY, INCY STFDUX C5, YY, INCY STFDUX C6, YY, INCY STFDUX C7, YY, INCY STFDUX C8, YY, INCY .align 4 LL(115): andi. r0, N, 7 beq LL(999) andi. r0, N, 4 beq LL(117) LFDUX A1, X, INCX LFDUX B1, Y, INCY LFDUX A2, X, INCX LFDUX B2, Y, INCY LFDUX A3, X, INCX LFDUX B3, Y, INCY LFDUX A4, X, INCX LFDUX B4, Y, INCY fmadd C1, ALPHA, A1, B1 fmadd C2, ALPHA, A2, B2 fmadd C3, ALPHA, A3, B3 fmadd C4, ALPHA, A4, B4 STFDUX C1, YY, INCY STFDUX C2, YY, INCY STFDUX C3, YY, INCY STFDUX C4, YY, INCY .align 4 LL(117): andi. r0, N, 2 beq LL(118) LFDUX A1, X, INCX LFDUX B1, Y, INCY LFDUX A2, X, INCX LFDUX B2, Y, INCY fmadd C1, ALPHA, A1, B1 fmadd C2, ALPHA, A2, B2 STFDUX C1, YY, INCY STFDUX C2, YY, INCY .align 4 LL(118): andi. r0, N, 1 beq LL(999) LFDUX A1, X, INCX LFDUX B1, Y, INCY fmadd C1, ALPHA, A1, B1 STFDUX C1, YY, INCY .align 4 LL(999): li r10, 16 subi SP, SP, 16 lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/axpy_ppc440.S000066400000000000000000000173741313527062700177730ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
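Both the kernel above (the paired-load "hummer" variant) and axpy_ppc440.S, whose header begins here, are hand-scheduled implementations of the same BLAS operation, y := alpha*x + y: a unit-stride fast path unrolled by 16 elements with an and-mask remainder, a generic strided path, and (in the variant above) extra peeling when X or Y is not aligned to a register pair. A minimal portable C reference of that shape, for orientation only; the function name and the use of double are illustrative, not part of these sources.

#include <stddef.h>

/* Hypothetical reference for the AXPY kernels in this directory:
 * y[i*incy] += alpha * x[i*incx].  The assembly unrolls the unit-stride
 * case by 16 and finishes with an "andi. r0, N, 15"-style remainder;
 * the strided case falls back to a simple update loop (the LL(100) path). */
static void axpy_ref(size_t n, double alpha,
                     const double *x, size_t incx,
                     double *y, size_t incy)
{
    if (n == 0)
        return;

    if (incx == 1 && incy == 1) {
        size_t n1 = n & ~(size_t)15;            /* unrolled body */
        for (size_t i = 0; i < n1; i += 16)
            for (size_t j = 0; j < 16; j++)
                y[i + j] += alpha * x[i + j];
        for (size_t i = n1; i < n; i++)         /* up-to-15-element tail */
            y[i] += alpha * x[i];
    } else {                                    /* general strides */
        for (size_t i = 0; i < n; i++)
            y[i * incy] += alpha * x[i * incx];
    }
}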
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef linux #ifndef __64BIT__ #define N r3 #define X r6 #define INCX r7 #define Y r8 #define INCY r9 #define YY r5 #define PRE r4 #else #define N r3 #define X r7 #define INCX r8 #define Y r9 #define INCY r10 #define YY r5 #define PRE r4 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define N r3 #define X r8 #define INCX r9 #define Y r10 #define INCY r4 #define YY r6 #define PRE r5 #else #define N r3 #define X r7 #define INCX r8 #define Y r9 #define INCY r10 #define YY r5 #define PRE r4 #endif #endif #define ALPHA f24 #define STACKSIZE 96 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) #if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif fmr ALPHA, f1 slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT li PRE, 2 * 16 * SIZE cmpwi cr0, N, 0 ble- LL(999) sub X, X, INCX sub Y, Y, INCY mr YY, Y srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(150) .align 4 LFDUX f0, X, INCX LFDUX f1, X, INCX LFDUX f2, X, INCX LFDUX f3, X, INCX LFDUX f8, Y, INCY LFDUX f9, Y, INCY LFDUX f10, Y, INCY LFDUX f11, Y, INCY LFDUX f4, X, INCX LFDUX f5, X, INCX LFDUX f6, X, INCX LFDUX f7, X, INCX LFDUX f12, Y, INCY LFDUX f13, Y, INCY LFDUX f14, Y, INCY LFDUX f15, Y, INCY bdz LL(120) .align 4 LL(110): FMADD f16, ALPHA, f0, f8 LFDUX f0, X, INCX LFDUX f8, Y, INCY #ifdef PPCG4 dcbt X, PRE #endif FMADD f17, ALPHA, f1, f9 LFDUX f1, X, INCX LFDUX f9, Y, INCY FMADD f18, ALPHA, f2, f10 LFDUX f2, X, INCX LFDUX f10, Y, INCY #ifdef PPCG4 dcbtst Y, PRE #endif FMADD f19, ALPHA, f3, f11 LFDUX f3, X, INCX LFDUX f11, Y, INCY FMADD f20, ALPHA, f4, f12 LFDUX f4, X, INCX LFDUX f12, Y, INCY #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif FMADD f21, ALPHA, f5, f13 LFDUX f5, X, INCX LFDUX f13, Y, INCY FMADD f22, ALPHA, f6, f14 LFDUX f6, X, INCX LFDUX f14, Y, INCY #if defined(PPCG4) && defined(DOUBLE) dcbtst Y, PRE #endif FMADD f23, ALPHA, f7, f15 LFDUX f7, X, INCX LFDUX f15, Y, INCY STFDUX f16, YY, INCY STFDUX f17, YY, INCY STFDUX f18, YY, INCY STFDUX f19, YY, INCY FMADD f16, ALPHA, f0, f8 LFDUX f0, X, INCX LFDUX f8, Y, INCY #ifdef PPCG4 dcbt X, PRE #endif FMADD f17, ALPHA, f1, f9 LFDUX f1, X, INCX LFDUX f9, Y, INCY FMADD f18, ALPHA, f2, f10 LFDUX f2, X, INCX LFDUX f10, Y, INCY #ifdef PPCG4 dcbtst Y, PRE #endif FMADD f19, ALPHA, f3, f11 LFDUX f3, X, INCX LFDUX f11, Y, INCY STFDUX f20, YY, INCY STFDUX f21, YY, INCY STFDUX f22, YY, INCY STFDUX f23, YY, INCY FMADD f20, ALPHA, f4, f12 LFDUX f4, X, INCX LFDUX f12, Y, INCY #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif FMADD f21, ALPHA, f5, f13 LFDUX f5, X, INCX LFDUX f13, Y, INCY FMADD f22, ALPHA, f6, f14 LFDUX f6, X, INCX LFDUX f14, Y, INCY #if defined(PPCG4) && defined(DOUBLE) dcbtst Y, PRE #endif FMADD f23, ALPHA, f7, f15 LFDUX f7, X, INCX LFDUX f15, Y, INCY STFDUX f16, YY, INCY STFDUX f17, YY, INCY STFDUX f18, YY, INCY STFDUX f19, YY, INCY STFDUX f20, YY, INCY STFDUX f21, YY, INCY STFDUX f22, YY, INCY STFDUX f23, YY, INCY bdnz LL(110) .align 4 LL(120): FMADD f16, ALPHA, f0, f8 LFDUX f0, X, INCX LFDUX f8, Y, INCY FMADD f17, ALPHA, f1, f9 LFDUX f1, X, INCX LFDUX f9, Y, INCY FMADD f18, ALPHA, f2, f10 LFDUX f2, X, INCX LFDUX f10, Y, INCY FMADD f19, ALPHA, f3, f11 LFDUX f3, X, INCX LFDUX f11, Y, INCY FMADD f20, ALPHA, f4, f12 LFDUX f4, X, INCX LFDUX f12, Y, INCY FMADD f21, ALPHA, f5, f13 LFDUX f5, X, INCX LFDUX f13, Y, INCY FMADD f22, ALPHA, f6, f14 LFDUX f6, X, INCX LFDUX f14, Y, INCY FMADD f23, ALPHA, f7, f15 LFDUX f7, X, INCX LFDUX f15, Y, INCY STFDUX f16, YY, INCY STFDUX f17, YY, INCY STFDUX f18, YY, INCY STFDUX f19, YY, INCY FMADD f16, ALPHA, f0, f8 STFDUX f20, YY, INCY FMADD f17, ALPHA, f1, f9 STFDUX f21, YY, INCY FMADD f18, ALPHA, f2, f10 STFDUX f22, YY, INCY FMADD f19, ALPHA, f3, f11 STFDUX f23, YY, INCY FMADD f20, ALPHA, f4, f12 STFDUX f16, YY, INCY FMADD f21, ALPHA, f5, f13 STFDUX f17, YY, INCY FMADD f22, ALPHA, f6, f14 STFDUX f18, YY, INCY FMADD f23, ALPHA, f7, f15 STFDUX f19, YY, INCY STFDUX f20, YY, INCY STFDUX f21, YY, INCY STFDUX f22, YY, INCY STFDUX f23, YY, INCY .align 4 LL(150): andi. 
r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f0, X, INCX LFDUX f8, Y, INCY FMADD f16, ALPHA, f0, f8 STFDUX f16, YY, INCY bdnz LL(160) .align 4 LL(999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/cabs.S000066400000000000000000000054161313527062700166220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" PROLOGUE PROFCODE LFD f0, 0 * SIZE(r3) LFD f1, 1 * SIZE(r3) fabs f0, f0 fabs f1, f1 fadd f1, f0, f1 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/casum.c000066400000000000000000000063401313527062700170370ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
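cabs.S above is the smallest kernel in this batch: it loads the real and imaginary parts at the pointer argument, takes their absolute values, and returns their sum in f1. That is the |Re| + |Im| measure (not the Euclidean modulus), the same quantity the complex asum code below accumulates element by element. A one-line C equivalent, with a hypothetical name and double precision chosen for concreteness:

#include <math.h>

/* C rendering of kernel/power/cabs.S: the |Re| + |Im| "absolute value"
 * of the complex number stored at z (z[0] = real, z[1] = imaginary). */
static double cabs1_ref(const double *z)
{
    return fabs(z[0]) + fabs(z[1]);
}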
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/28 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #include #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif #if defined(POWER8) #include "casum_microk_power8.c" #endif #ifndef HAVE_KERNEL_16 static FLOAT casum_kernel_16(BLASLONG n, FLOAT *x1) { BLASLONG i=0; FLOAT *x = x1; FLOAT temp0, temp1, temp2, temp3; FLOAT temp4, temp5, temp6, temp7; FLOAT sum0 = 0.0; FLOAT sum1 = 0.0; FLOAT sum2 = 0.0; FLOAT sum3 = 0.0; while ( i< n ) { temp0 = ABS(x[0]); temp1 = ABS(x[1]); temp2 = ABS(x[2]); temp3 = ABS(x[3]); temp4 = ABS(x[4]); temp5 = ABS(x[5]); temp6 = ABS(x[6]); temp7 = ABS(x[7]); sum0 += temp0; sum1 += temp1; sum2 += temp2; sum3 += temp3; sum0 += temp4; sum1 += temp5; sum2 += temp6; sum3 += temp7; x+=8; i+=4; } return sum0+sum1+sum2+sum3; } #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ip=0; FLOAT sumf = 0.0; BLASLONG n1; BLASLONG inc_x2; if (n <= 0 || inc_x <= 0) return(sumf); if ( inc_x == 1 ) { n1 = n & -16; if ( n1 > 0 ) { sumf = casum_kernel_16(n1, x); i=n1; ip = 2 * n1; } while(i < n) { sumf += ABS(x[ip]) + ABS(x[ip+1]); ip += 2; i++; } } else { inc_x2 = 2 * inc_x; while(i < n) { sumf += ABS(x[ip]) + ABS(x[ip+1]); ip += inc_x2; i++; } } return(sumf); } OpenBLAS-0.2.20/kernel/power/casum_microk_power8.c000066400000000000000000000124011313527062700217020ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
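The C fallback casum_kernel_16 above keeps four independent partial sums so consecutive additions do not serialize; the POWER8 microkernel whose header starts here widens the same idea to eight 4-float VSX accumulators (xvabssp followed by xvaddsp), then folds them together and across lanes at the end. A rough equivalent using the GCC/Clang Altivec vector extension, for orientation only: the real kernel is the inline assembly that follows, and the choice of vec_xl/vec_abs/vec_add intrinsics (and a VSX-enabled compile) is an assumption of this sketch.

#include <altivec.h>

/* Hedged sketch of the POWER8 casum microkernel's shape: eight vector
 * accumulators of |x|, folded into one vector, then summed across lanes.
 * n counts complex elements and must be a multiple of 16 (32 floats),
 * as the caller in casum.c guarantees via n & -16. */
static float casum_vec_sketch(long n, float *x)
{
    vector float acc[8];
    for (int k = 0; k < 8; k++)
        acc[k] = vec_splats(0.0f);

    for (long i = 0; i < 2 * n; i += 32)        /* 32 floats per trip */
        for (int k = 0; k < 8; k++)
            acc[k] = vec_add(acc[k], vec_abs(vec_xl(0, x + i + 4 * k)));

    for (int k = 1; k < 8; k++)                 /* fold 8 vectors into 1 */
        acc[0] = vec_add(acc[0], acc[k]);

    union { vector float v; float f[4]; } u = { acc[0] };
    return u.f[0] + u.f[1] + u.f[2] + u.f[3];   /* horizontal add */
}

The assembly does the final cross-lane step with two xxsldwi rotations and an xscvspdp conversion instead of going through memory.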
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/28 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_16 1 static float casum_kernel_16 (long n, float *x) { float sum; __vector float t0; __vector float t1; __vector float t2; __vector float t3; __asm__ ( "dcbt 0, %2 \n\t" "xxlxor 32, 32, 32 \n\t" "xxlxor 33, 33, 33 \n\t" "xxlxor 34, 34, 34 \n\t" "xxlxor 35, 35, 35 \n\t" "xxlxor 36, 36, 36 \n\t" "xxlxor 37, 37, 37 \n\t" "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" "lxvd2x 40, 0, %2 \n\t" "lxvd2x 41, %8, %2 \n\t" "lxvd2x 42, %9, %2 \n\t" "lxvd2x 43, %10, %2 \n\t" "lxvd2x 44, %11, %2 \n\t" "lxvd2x 45, %12, %2 \n\t" "lxvd2x 46, %13, %2 \n\t" "lxvd2x 47, %14, %2 \n\t" "addi %2, %2, 128 \n\t" "addic. %1, %1, -16 \n\t" "ble 2f \n\t" ".p2align 5 \n" "1: \n\t" "xvabssp 48, 40 \n\t" "xvabssp 49, 41 \n\t" "xvabssp 50, 42 \n\t" "xvabssp 51, 43 \n\t" "lxvd2x 40, 0, %2 \n\t" "lxvd2x 41, %8, %2 \n\t" "xvabssp %x3, 44 \n\t" "xvabssp %x4, 45 \n\t" "lxvd2x 42, %9, %2 \n\t" "lxvd2x 43, %10, %2 \n\t" "xvabssp %x5, 46 \n\t" "xvabssp %x6, 47 \n\t" "lxvd2x 44, %11, %2 \n\t" "lxvd2x 45, %12, %2 \n\t" "xvaddsp 32, 32, 48 \n\t" "xvaddsp 33, 33, 49 \n\t" "lxvd2x 46, %13, %2 \n\t" "lxvd2x 47, %14, %2 \n\t" "xvaddsp 34, 34, 50 \n\t" "xvaddsp 35, 35, 51 \n\t" "addi %2, %2, 128 \n\t" "xvaddsp 36, 36, %x3 \n\t" "xvaddsp 37, 37, %x4 \n\t" "addic. 
%1, %1, -16 \n\t" "xvaddsp 38, 38, %x5 \n\t" "xvaddsp 39, 39, %x6 \n\t" "bgt 1b \n" "2: \n\t" "xvabssp 48, 40 \n\t" "xvabssp 49, 41 \n\t" "xvabssp 50, 42 \n\t" "xvabssp 51, 43 \n\t" "xvabssp %x3, 44 \n\t" "xvabssp %x4, 45 \n\t" "xvabssp %x5, 46 \n\t" "xvabssp %x6, 47 \n\t" "xvaddsp 32, 32, 48 \n\t" "xvaddsp 33, 33, 49 \n\t" "xvaddsp 34, 34, 50 \n\t" "xvaddsp 35, 35, 51 \n\t" "xvaddsp 36, 36, %x3 \n\t" "xvaddsp 37, 37, %x4 \n\t" "xvaddsp 38, 38, %x5 \n\t" "xvaddsp 39, 39, %x6 \n\t" "xvaddsp 32, 32, 33 \n\t" "xvaddsp 34, 34, 35 \n\t" "xvaddsp 36, 36, 37 \n\t" "xvaddsp 38, 38, 39 \n\t" "xvaddsp 32, 32, 34 \n\t" "xvaddsp 36, 36, 38 \n\t" "xvaddsp 32, 32, 36 \n\t" "xxsldwi 33, 32, 32, 2 \n\t" "xvaddsp 32, 32, 33 \n\t" "xxsldwi 33, 32, 32, 1 \n\t" "xvaddsp 32, 32, 33 \n\t" "xscvspdp %x0, 32 \n" "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n" "#t0=%x3 t1=%x4 t2=%x5 t3=%x6" : "=f" (sum), // 0 "+r" (n), // 1 "+b" (x), // 2 "=wa" (t0), // 3 "=wa" (t1), // 4 "=wa" (t2), // 5 "=wa" (t3) // 6 : "m" (*x), "b" (16), // 8 "b" (32), // 9 "b" (48), // 10 "b" (64), // 11 "b" (80), // 12 "b" (96), // 13 "b" (112) // 14 : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48","vs49","vs50","vs51" ); return sum; } OpenBLAS-0.2.20/kernel/power/ccopy.c000066400000000000000000000060471313527062700170500ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
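The ccopy.c driver that follows has the same shape as casum.c above: when both strides are 1 it hands the largest multiple of the unroll width (32 complex elements here) to the POWER8 microkernel and copies the remainder in scalar code, and for general strides it falls back to a plain element loop. A condensed C sketch of that shape, not the verbatim file; the bulk copy is written out inline where the real driver calls ccopy_kernel_32.

/* Hedged sketch of the ccopy.c driver pattern: vectorized bulk for the
 * unit-stride case plus a scalar tail, and a simple loop for general
 * strides.  Indices are in float units; each complex element is 2 floats. */
static int ccopy_sketch(long n, const float *x, long inc_x,
                        float *y, long inc_y)
{
    long i = 0, ix = 0, iy = 0;

    if (n <= 0)
        return 0;

    if (inc_x == 1 && inc_y == 1) {
        long n1 = n & -32;                      /* multiple of the unroll width */
        if (n1 > 0) {
            for (long j = 0; j < 2 * n1; j++)   /* the real driver calls the
                                                   POWER8 ccopy_kernel_32 here */
                y[j] = x[j];
            i = n1; ix = iy = 2 * n1;
        }
        for (; i < n; i++, ix += 2, iy += 2) {  /* scalar tail */
            y[iy]     = x[ix];
            y[iy + 1] = x[ix + 1];
        }
    } else {
        long inc_x2 = 2 * inc_x, inc_y2 = 2 * inc_y;
        for (; i < n; i++, ix += inc_x2, iy += inc_y2) {
            y[iy]     = x[ix];
            y[iy + 1] = x[ix + 1];
        }
    }
    return 0;
}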
*****************************************************************************/ /************************************************************************************** * 2016/03/25 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #if defined(POWER8) #include "ccopy_microk_power8.c" #endif #ifndef HAVE_KERNEL_32 static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { BLASLONG i=0; FLOAT f0, f1, f2, f3, f4, f5, f6, f7; FLOAT *x1=x; FLOAT *y1=y; while ( i 0 ) { ccopy_kernel_32(n1, x, y); i=n1; ix=n1*2; iy=n1*2; } while(i < n) { y[iy] = x[iy] ; y[iy+1] = x[ix+1] ; ix+=2; iy+=2; i++ ; } } else { BLASLONG inc_x2 = 2 * inc_x; BLASLONG inc_y2 = 2 * inc_y; while(i < n) { y[iy] = x[ix] ; y[iy+1] = x[ix+1] ; ix += inc_x2 ; iy += inc_y2 ; i++ ; } } return(0); } OpenBLAS-0.2.20/kernel/power/ccopy_microk_power8.c000066400000000000000000000120021313527062700217040ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/25 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_32 1 static void ccopy_kernel_32 (long n, float *x, float *y) { __asm__ ( "lxvd2x 32, 0, %2 \n\t" "lxvd2x 33, %5, %2 \n\t" "lxvd2x 34, %6, %2 \n\t" "lxvd2x 35, %7, %2 \n\t" "lxvd2x 36, %8, %2 \n\t" "lxvd2x 37, %9, %2 \n\t" "lxvd2x 38, %10, %2 \n\t" "lxvd2x 39, %11, %2 \n\t" "addi %2, %2, 128 \n\t" "lxvd2x 40, 0, %2 \n\t" "lxvd2x 41, %5, %2 \n\t" "lxvd2x 42, %6, %2 \n\t" "lxvd2x 43, %7, %2 \n\t" "lxvd2x 44, %8, %2 \n\t" "lxvd2x 45, %9, %2 \n\t" "lxvd2x 46, %10, %2 \n\t" "lxvd2x 47, %11, %2 \n\t" "addi %2, %2, 128 \n\t" "addic. 
%1, %1, -32 \n\t" "ble 2f \n\t" ".p2align 5 \n" "1: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" "lxvd2x 32, 0, %2 \n\t" "lxvd2x 33, %5, %2 \n\t" "stxvd2x 34, %6, %3 \n\t" "stxvd2x 35, %7, %3 \n\t" "lxvd2x 34, %6, %2 \n\t" "lxvd2x 35, %7, %2 \n\t" "stxvd2x 36, %8, %3 \n\t" "stxvd2x 37, %9, %3 \n\t" "lxvd2x 36, %8, %2 \n\t" "lxvd2x 37, %9, %2 \n\t" "stxvd2x 38, %10, %3 \n\t" "stxvd2x 39, %11, %3 \n\t" "lxvd2x 38, %10, %2 \n\t" "lxvd2x 39, %11, %2 \n\t" "addi %3, %3, 128 \n\t" "addi %2, %2, 128 \n\t" "stxvd2x 40, 0, %3 \n\t" "stxvd2x 41, %5, %3 \n\t" "lxvd2x 40, 0, %2 \n\t" "lxvd2x 41, %5, %2 \n\t" "stxvd2x 42, %6, %3 \n\t" "stxvd2x 43, %7, %3 \n\t" "lxvd2x 42, %6, %2 \n\t" "lxvd2x 43, %7, %2 \n\t" "stxvd2x 44, %8, %3 \n\t" "stxvd2x 45, %9, %3 \n\t" "lxvd2x 44, %8, %2 \n\t" "lxvd2x 45, %9, %2 \n\t" "stxvd2x 46, %10, %3 \n\t" "stxvd2x 47, %11, %3 \n\t" "lxvd2x 46, %10, %2 \n\t" "lxvd2x 47, %11, %2 \n\t" "addi %3, %3, 128 \n\t" "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" "bgt 1b \n" "2: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" "stxvd2x 34, %6, %3 \n\t" "stxvd2x 35, %7, %3 \n\t" "stxvd2x 36, %8, %3 \n\t" "stxvd2x 37, %9, %3 \n\t" "stxvd2x 38, %10, %3 \n\t" "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" "stxvd2x 40, 0, %3 \n\t" "stxvd2x 41, %5, %3 \n\t" "stxvd2x 42, %6, %3 \n\t" "stxvd2x 43, %7, %3 \n\t" "stxvd2x 44, %8, %3 \n\t" "stxvd2x 45, %9, %3 \n\t" "stxvd2x 46, %10, %3 \n\t" "stxvd2x 47, %11, %3 \n" "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : "=m" (*y), "+r" (n), // 1 "+b" (x), // 2 "+b" (y) // 3 : "m" (*x), "b" (16), // 5 "b" (32), // 6 "b" (48), // 7 "b" (64), // 8 "b" (80), // 9 "b" (96), // 10 "b" (112) // 11 : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" ); } OpenBLAS-0.2.20/kernel/power/cgemm_kernel_8x4_power8.S000066400000000000000000000237411313527062700223520ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
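The copy microkernel just shown is software-pipelined: it preloads sixteen VSX registers (vs32-vs47, 256 bytes, i.e. 32 complex elements) before the loop, and every iteration stores the block loaded on the previous trip while refilling the registers with the next one, so loads and stores overlap. A scalar C rendering of that rotation, purely illustrative; the buffer, the guard, and the function name are assumptions beyond what the assembly shows.

#include <string.h>

/* Hedged illustration of the pipelining in ccopy_microk_power8.c: one
 * block always lives in "registers" (buf), so each loop trip overlaps the
 * store of block b with the load of block b+1.  COPY_BLOCK = 64 floats
 * mirrors the sixteen 16-byte VSX registers used by the assembly. */
enum { COPY_BLOCK = 64 };

static void copy_pipelined(long nfloats, const float *x, float *y)
{
    float buf[COPY_BLOCK];
    long blocks = nfloats / COPY_BLOCK;     /* caller sends a multiple of COPY_BLOCK */

    if (blocks <= 0)
        return;

    memcpy(buf, x, sizeof buf);             /* prologue: preload block 0 */
    x += COPY_BLOCK;
    for (long b = 1; b < blocks; b++) {
        memcpy(y, buf, sizeof buf);         /* store the previous block */
        memcpy(buf, x, sizeof buf);         /* load the next block */
        x += COPY_BLOCK;
        y += COPY_BLOCK;
    }
    memcpy(y, buf, sizeof buf);             /* epilogue: store the last block */
}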
*****************************************************************************/ /************************************************************************************** * 2016/04/04 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
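cgemm_kernel_8x4_power8.S never feeds B to the micro-kernels directly. Its prologue below carves a page-aligned scratch area (BBUFFER) out of the stack, and the CGEMM_L*_COPYB loops in the logic file splat every real and imaginary scalar of the current B panel into a full 4-float vector (the lxvw4x / xxspltw / stxvw4x sequence), so the compute macros can multiply whole vectors of A by a broadcast B element with a single xvmaddasp. A C sketch of that expansion; the exact lane order under little-endian word numbering is not modelled here and should be treated as an assumption.

/* Hedged sketch of the CGEMM_L*_COPYB staging step: each float of the
 * packed B panel (alternating real/imag) is broadcast into a 4-wide group
 * of the aligned scratch buffer, quadrupling its size but letting the
 * micro-kernel read B as plain vectors. */
static void copyb_splat_sketch(long nfloats, const float *b, float *bbuffer)
{
    for (long j = 0; j < nfloats; j++)          /* one B scalar ...         */
        for (int lane = 0; lane < 4; lane++)    /* ... one broadcast vector */
            bbuffer[4 * j + lane] = b[j];
}

For the four-column block the copy loop walks 8 floats of B per step (slwi T1, K, 3 with the -8 decrement), i.e. four complex B values per iteration of k.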
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "def_vsx.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 32000 #define ALPHA_R_SP 296(SP) #define ALPHA_I_SP 304(SP) #define FZERO 312(SP) #else #define STACKSIZE 256 #define ALPHA_R_SP 224(SP) #define ALPHA_I_SP 232(SP) #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #define OFFSET r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #define o0 0 #define alpha_dr vs28 #define alpha_di vs29 #define alpha_sr vs30 #define alpha_si vs31 #define FRAMEPOINTER r12 #define BBUFFER r14 #define L r15 #define o12 r16 #define o4 r17 #define T2 r19 #define BBO r20 #define o8 r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define CO r26 #define o16 r27 #define o32 r28 #define o48 r29 #define PRE r30 #define T1 r31 #ifndef NEEDPARAM PROLOGUE PROFCODE mr FRAMEPOINTER, SP addi SP, SP, -STACKSIZE addi SP, SP, -STACKSIZE addi SP, SP, -STACKSIZE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) std r14, 280(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) stw r18, 196(SP) stw r17, 200(SP) stw r16, 204(SP) stw r15, 208(SP) stw r14, 212(SP) #endif stfs f1, ALPHA_R_SP stfs f2, ALPHA_I_SP // stw r0, FZERO #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER) lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER) lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER) #else lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif #endif #ifdef TRMMKERNEL #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER) #else lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #endif #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif #endif #include "cgemm_macros_8x4_power8.S" cmpwi cr0, M, 0 ble L999_H1 cmpwi cr0, N, 0 ble L999_H1 cmpwi cr0, K, 0 ble L999_H1 slwi LDC, LDC, ZBASE_SHIFT li PRE, 384 li o4 , 4 li o8 , 8 li o12 , 12 li o16 , 16 li o32 , 32 li o48 , 48 addi BBUFFER, SP, 512+4096 li T1, -4096 and BBUFFER, BBUFFER, T1 
#ifdef __64BIT__ addi T1 , SP, 296 #else addi T1 , SP, 224 #endif stxsspx vs1, 0, T1 lxsspx alpha_dr, 0, T1 stxsspx vs2, o8 , T1 lxsspx alpha_di, o8, T1 addi T1, SP, 360 li T2, 0 stw T2, 0(T1) stw T2, 4(T1) stw T2, 8(T1) stxsspx alpha_dr, o12, T1 lxvw4x alpha_sr, o0 , T1 addi T1, T1, 16 stw T2, 0(T1) stw T2, 4(T1) stw T2, 8(T1) stxsspx alpha_di, o12, T1 lxvw4x alpha_si, o0 , T1 .align 5 #include "cgemm_logic_8x4_power8.S" L999: addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) lwz r18, 196(SP) lwz r17, 200(SP) lwz r16, 204(SP) lwz r15, 208(SP) lwz r14, 212(SP) #endif addi SP, SP, STACKSIZE addi SP, SP, STACKSIZE addi SP, SP, STACKSIZE addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/cgemm_logic_8x4_power8.S000066400000000000000000000450071313527062700221660ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/04/04 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ srawi. 
J, N, 2 ble CGEMM_L4_END CGEMM_L4_BEGIN: mr BO, B mr BBO, BBUFFER slwi T1, K, 3 CGEMM_L4_COPYB: dcbtst BBO, PRE lxvw4x vs3, o0, BO lxvw4x vs11, o16, BO xxspltw vs4, vs3, 0 xxspltw vs5, vs3, 1 xxspltw vs6, vs3, 2 xxspltw vs7, vs3, 3 xxspltw vs12, vs11, 0 xxspltw vs13, vs11, 1 xxspltw vs14, vs11, 2 xxspltw vs15, vs11, 3 stxvw4x vs4, o0, BBO stxvw4x vs5, o16, BBO stxvw4x vs6, o32, BBO stxvw4x vs7, o48, BBO addi BO, BO, 32 addi BBO, BBO, 64 stxvw4x vs12, o0, BBO stxvw4x vs13, o16, BBO stxvw4x vs14, o32, BBO stxvw4x vs15, o48, BBO addic. T1, T1, -8 addi BBO, BBO, 64 bge CGEMM_L4_COPYB mr CO, C mr AO, A slwi T1, LDC , 2 add C, C, T1 srawi. I, M, 3 ble CGEMM_L4x8_END CGEMM_L4x8_BEGIN: mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L4x8_SUB0 cmpwi cr0, L, 1 ble CGEMM_L4x8_SUB4 CGEMM_L4x8_LOOP_START: dcbt AO, PRE dcbt BO, PRE LOAD4x8_1 dcbt BO, PRE KERNEL4x8_I1 dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 dcbt BO, PRE KERNEL4x8_1 dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 dcbt BO, PRE KERNEL4x8_1 dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 dcbt BO, PRE KERNEL4x8_1 dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 addic. L, L, -2 ble CGEMM_L4x8_LOOP_END .align 5 CGEMM_L4x8_LOOP: dcbt BO, PRE KERNEL4x8_1 dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 dcbt BO, PRE KERNEL4x8_1 dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 dcbt BO, PRE KERNEL4x8_1 dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 dcbt BO, PRE KERNEL4x8_1 dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 addic. L, L, -1 bgt CGEMM_L4x8_LOOP CGEMM_L4x8_LOOP_END: dcbt BO, PRE KERNEL4x8_1 dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_E2 b CGEMM_L4x8_SUB1 CGEMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 b CGEMM_L4x8_SUB1 CGEMM_L4x8_SUB0: andi. L, K, 7 KERNEL4x8_SUBI1 addic. L, L, -1 ble CGEMM_L4x8_SAVE b CGEMM_L4x8_SUB2 CGEMM_L4x8_SUB1: andi. L, K, 7 ble CGEMM_L4x8_SAVE CGEMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. L, L, -1 bgt CGEMM_L4x8_SUB2 CGEMM_L4x8_SAVE: SAVE4x8 addic. I, I, -1 bgt CGEMM_L4x8_BEGIN CGEMM_L4x8_END: CGEMM_L4x4_BEGIN: andi. T2, M, 7 ble CGEMM_L4x1_END andi. T1, M, 4 ble CGEMM_L4x4_END mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L4x4_SUB0 cmpwi cr0, L, 1 ble CGEMM_L4x4_SUB4 CGEMM_L4x4_LOOP_START: LOAD4x4_1 KERNEL4x4_I1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 addic. L, L, -2 ble CGEMM_L4x4_LOOP_END .align 5 CGEMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 addic. L, L, -1 bgt CGEMM_L4x4_LOOP CGEMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_E2 b CGEMM_L4x4_SUB1 CGEMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 b CGEMM_L4x4_SUB1 CGEMM_L4x4_SUB0: andi. L, K, 7 KERNEL4x4_SUBI1 addic. L, L, -1 ble CGEMM_L4x4_SAVE b CGEMM_L4x4_SUB2 CGEMM_L4x4_SUB1: andi. L, K, 7 ble CGEMM_L4x4_SAVE CGEMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. L, L, -1 bgt CGEMM_L4x4_SUB2 CGEMM_L4x4_SAVE: SAVE4x4 CGEMM_L4x4_END: CGEMM_L4x2_BEGIN: andi. T1, M, 2 ble CGEMM_L4x2_END mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L4x2_SUB0 cmpwi cr0, L, 1 ble CGEMM_L4x2_SUB4 CGEMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 addic. 
L, L, -2 ble CGEMM_L4x2_LOOP_END .align 5 CGEMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 addic. L, L, -1 bgt CGEMM_L4x2_LOOP CGEMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_E2 b CGEMM_L4x2_SUB1 CGEMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 b CGEMM_L4x2_SUB1 CGEMM_L4x2_SUB0: andi. L, K, 7 KERNEL4x2_SUBI1 addic. L, L, -1 ble CGEMM_L4x2_SAVE b CGEMM_L4x2_SUB2 CGEMM_L4x2_SUB1: andi. L, K, 7 ble CGEMM_L4x2_SAVE CGEMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. L, L, -1 bgt CGEMM_L4x2_SUB2 CGEMM_L4x2_SAVE: SAVE4x2 CGEMM_L4x2_END: CGEMM_L4x1_BEGIN: andi. T1, M, 1 ble CGEMM_L4x1_END mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L4x1_SUB0 cmpwi cr0, L, 1 ble CGEMM_L4x1_SUB4 CGEMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 addic. L, L, -2 ble CGEMM_L4x1_LOOP_END .align 5 CGEMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 addic. L, L, -1 bgt CGEMM_L4x1_LOOP CGEMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_E2 b CGEMM_L4x1_SUB1 CGEMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 b CGEMM_L4x1_SUB1 CGEMM_L4x1_SUB0: andi. L, K, 7 KERNEL4x1_SUBI1 addic. L, L, -1 ble CGEMM_L4x1_SAVE b CGEMM_L4x1_SUB2 CGEMM_L4x1_SUB1: andi. L, K, 7 ble CGEMM_L4x1_SAVE CGEMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. L, L, -1 bgt CGEMM_L4x1_SUB2 CGEMM_L4x1_SAVE: SAVE4x1 CGEMM_L4x1_END: slwi T1, K, 5 add B, B, T1 addic. J, J, -1 bgt CGEMM_L4_BEGIN andi. T2, N, 3 ble L999_H2 CGEMM_L4_END: b CGEMM_L2_BEGIN L999_H1: b L999_H2 CGEMM_L2_BEGIN: mr BO, B mr BBO, BBUFFER slwi T1, K, 2 CGEMM_L2_COPYB: dcbtst BBO, PRE lxvw4x vs3, o0, BO lxvw4x vs11, o16, BO xxspltw vs4, vs3, 0 xxspltw vs5, vs3, 1 xxspltw vs6, vs3, 2 xxspltw vs7, vs3, 3 xxspltw vs12, vs11, 0 xxspltw vs13, vs11, 1 xxspltw vs14, vs11, 2 xxspltw vs15, vs11, 3 stxvw4x vs4, o0, BBO stxvw4x vs5, o16, BBO stxvw4x vs6, o32, BBO stxvw4x vs7, o48, BBO addi BO, BO, 32 addi BBO, BBO, 64 stxvw4x vs12, o0, BBO stxvw4x vs13, o16, BBO stxvw4x vs14, o32, BBO stxvw4x vs15, o48, BBO addic. T1, T1, -8 addi BBO, BBO, 64 bge CGEMM_L2_COPYB andi. T1, N, 2 ble CGEMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 srawi. I, M, 3 ble CGEMM_L2x8_END CGEMM_L2x8_BEGIN: mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L2x8_SUB0 cmpwi cr0, L, 1 ble CGEMM_L2x8_SUB4 CGEMM_L2x8_LOOP_START: dcbt AO, PRE LOAD2x8_1 KERNEL2x8_I1 dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 addic. L, L, -2 ble CGEMM_L2x8_LOOP_END .align 5 CGEMM_L2x8_LOOP: KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 addic. L, L, -1 bgt CGEMM_L2x8_LOOP CGEMM_L2x8_LOOP_END: KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_E2 b CGEMM_L2x8_SUB1 CGEMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 b CGEMM_L2x8_SUB1 CGEMM_L2x8_SUB0: andi. L, K, 7 KERNEL2x8_SUBI1 addic. 
L, L, -1 ble CGEMM_L2x8_SAVE b CGEMM_L2x8_SUB2 CGEMM_L2x8_SUB1: andi. L, K, 7 ble CGEMM_L2x8_SAVE CGEMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 bgt CGEMM_L2x8_SUB2 CGEMM_L2x8_SAVE: SAVE2x8 addic. I, I, -1 bgt CGEMM_L2x8_BEGIN CGEMM_L2x8_END: CGEMM_L2x4_BEGIN: andi. T2, M, 7 ble CGEMM_L2x1_END andi. T1, M, 4 ble CGEMM_L2x4_END mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L2x4_SUB0 cmpwi cr0, L, 1 ble CGEMM_L2x4_SUB4 CGEMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 addic. L, L, -2 ble CGEMM_L2x4_LOOP_END .align 5 CGEMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 addic. L, L, -1 bgt CGEMM_L2x4_LOOP CGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_E2 b CGEMM_L2x4_SUB1 CGEMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 b CGEMM_L2x4_SUB1 CGEMM_L2x4_SUB0: andi. L, K, 7 KERNEL2x4_SUBI1 addic. L, L, -1 ble CGEMM_L2x4_SAVE b CGEMM_L2x4_SUB2 CGEMM_L2x4_SUB1: andi. L, K, 7 ble CGEMM_L2x4_SAVE CGEMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 bgt CGEMM_L2x4_SUB2 CGEMM_L2x4_SAVE: SAVE2x4 CGEMM_L2x4_END: CGEMM_L2x2_BEGIN: andi. T1, M, 2 ble CGEMM_L2x2_END mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L2x2_SUB0 cmpwi cr0, L, 1 ble CGEMM_L2x2_SUB4 CGEMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 addic. L, L, -2 ble CGEMM_L2x2_LOOP_END .align 5 CGEMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 addic. L, L, -1 bgt CGEMM_L2x2_LOOP CGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_E2 b CGEMM_L2x2_SUB1 CGEMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 b CGEMM_L2x2_SUB1 CGEMM_L2x2_SUB0: andi. L, K, 7 KERNEL2x2_SUBI1 addic. L, L, -1 ble CGEMM_L2x2_SAVE b CGEMM_L2x2_SUB2 CGEMM_L2x2_SUB1: andi. L, K, 7 ble CGEMM_L2x2_SAVE CGEMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 bgt CGEMM_L2x2_SUB2 CGEMM_L2x2_SAVE: SAVE2x2 CGEMM_L2x2_END: CGEMM_L2x1_BEGIN: andi. T1, M, 1 ble CGEMM_L2x1_END mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L2x1_SUB0 cmpwi cr0, L, 1 ble CGEMM_L2x1_SUB4 CGEMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 addic. L, L, -2 ble CGEMM_L2x1_LOOP_END .align 5 CGEMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 addic. L, L, -1 bgt CGEMM_L2x1_LOOP CGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_E2 b CGEMM_L2x1_SUB1 CGEMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 b CGEMM_L2x1_SUB1 CGEMM_L2x1_SUB0: andi. L, K, 7 KERNEL2x1_SUBI1 addic. L, L, -1 ble CGEMM_L2x1_SAVE b CGEMM_L2x1_SUB2 CGEMM_L2x1_SUB1: andi. L, K, 7 ble CGEMM_L2x1_SAVE CGEMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. 
L, L, -1 bgt CGEMM_L2x1_SUB2 CGEMM_L2x1_SAVE: SAVE2x1 CGEMM_L2x1_END: slwi T1, K, 4 add B, B, T1 CGEMM_L2_END: b CGEMM_L1_BEGIN L999_H2: b L999 CGEMM_L1_BEGIN: mr BO, B mr BBO, BBUFFER slwi T1, K, 1 CGEMM_L1_COPYB: dcbtst BBO, PRE lxvw4x vs3, o0, BO lxvw4x vs11, o16, BO xxspltw vs4, vs3, 0 xxspltw vs5, vs3, 1 xxspltw vs6, vs3, 2 xxspltw vs7, vs3, 3 xxspltw vs12, vs11, 0 xxspltw vs13, vs11, 1 xxspltw vs14, vs11, 2 xxspltw vs15, vs11, 3 stxvw4x vs4, o0, BBO stxvw4x vs5, o16, BBO stxvw4x vs6, o32, BBO stxvw4x vs7, o48, BBO addi BO, BO, 32 addi BBO, BBO, 64 stxvw4x vs12, o0, BBO stxvw4x vs13, o16, BBO stxvw4x vs14, o32, BBO stxvw4x vs15, o48, BBO addic. T1, T1, -8 addi BBO, BBO, 64 bge CGEMM_L1_COPYB andi. T1, N, 1 ble CGEMM_L1_END mr CO, C mr AO, A srawi. I, M, 3 ble CGEMM_L1x8_END CGEMM_L1x8_BEGIN: mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L1x8_SUB0 cmpwi cr0, L, 1 ble CGEMM_L1x8_SUB4 CGEMM_L1x8_LOOP_START: dcbt AO, PRE LOAD1x8_1 KERNEL1x8_I1 dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 addic. L, L, -2 ble CGEMM_L1x8_LOOP_END .align 5 CGEMM_L1x8_LOOP: KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 addic. L, L, -1 bgt CGEMM_L1x8_LOOP CGEMM_L1x8_LOOP_END: KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_E2 b CGEMM_L1x8_SUB1 CGEMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 b CGEMM_L1x8_SUB1 CGEMM_L1x8_SUB0: andi. L, K, 7 KERNEL1x8_SUBI1 addic. L, L, -1 ble CGEMM_L1x8_SAVE b CGEMM_L1x8_SUB2 CGEMM_L1x8_SUB1: andi. L, K, 7 ble CGEMM_L1x8_SAVE CGEMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 bgt CGEMM_L1x8_SUB2 CGEMM_L1x8_SAVE: SAVE1x8 addic. I, I, -1 bgt CGEMM_L1x8_BEGIN CGEMM_L1x8_END: CGEMM_L1x4_BEGIN: andi. T2, M, 7 ble CGEMM_L1x1_END andi. T1, M, 4 ble CGEMM_L1x4_END mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L1x4_SUB0 cmpwi cr0, L, 1 ble CGEMM_L1x4_SUB4 CGEMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 addic. L, L, -2 ble CGEMM_L1x4_LOOP_END .align 5 CGEMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 addic. L, L, -1 bgt CGEMM_L1x4_LOOP CGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_E2 b CGEMM_L1x4_SUB1 CGEMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 b CGEMM_L1x4_SUB1 CGEMM_L1x4_SUB0: andi. L, K, 7 KERNEL1x4_SUBI1 addic. L, L, -1 ble CGEMM_L1x4_SAVE b CGEMM_L1x4_SUB2 CGEMM_L1x4_SUB1: andi. L, K, 7 ble CGEMM_L1x4_SAVE CGEMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 bgt CGEMM_L1x4_SUB2 CGEMM_L1x4_SAVE: SAVE1x4 CGEMM_L1x4_END: CGEMM_L1x2_BEGIN: andi. T1, M, 2 ble CGEMM_L1x2_END mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L1x2_SUB0 cmpwi cr0, L, 1 ble CGEMM_L1x2_SUB4 CGEMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 addic. L, L, -2 ble CGEMM_L1x2_LOOP_END .align 5 CGEMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 addic. 
L, L, -1 bgt CGEMM_L1x2_LOOP CGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_E2 b CGEMM_L1x2_SUB1 CGEMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 b CGEMM_L1x2_SUB1 CGEMM_L1x2_SUB0: andi. L, K, 7 KERNEL1x2_SUBI1 addic. L, L, -1 ble CGEMM_L1x2_SAVE b CGEMM_L1x2_SUB2 CGEMM_L1x2_SUB1: andi. L, K, 7 ble CGEMM_L1x2_SAVE CGEMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 bgt CGEMM_L1x2_SUB2 CGEMM_L1x2_SAVE: SAVE1x2 CGEMM_L1x2_END: CGEMM_L1x1_BEGIN: andi. T1, M, 1 ble CGEMM_L1x1_END mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L1x1_SUB0 cmpwi cr0, L, 1 ble CGEMM_L1x1_SUB4 CGEMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 addic. L, L, -2 ble CGEMM_L1x1_LOOP_END .align 5 CGEMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 addic. L, L, -1 bgt CGEMM_L1x1_LOOP CGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_E2 b CGEMM_L1x1_SUB1 CGEMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 b CGEMM_L1x1_SUB1 CGEMM_L1x1_SUB0: andi. L, K, 7 KERNEL1x1_SUBI1 addic. L, L, -1 ble CGEMM_L1x1_SAVE b CGEMM_L1x1_SUB2 CGEMM_L1x1_SUB1: andi. L, K, 7 ble CGEMM_L1x1_SAVE CGEMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 bgt CGEMM_L1x1_SUB2 CGEMM_L1x1_SAVE: SAVE1x1 CGEMM_L1x1_END: CGEMM_L1_END: OpenBLAS-0.2.20/kernel/power/cgemm_macros_8x4_power8.S000066400000000000000000005265141313527062700223640ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
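The logic file that ends here is a fixed tile walk rather than a general loop nest: N is consumed in column blocks of 4, then 2, then 1; inside each block M is consumed in row tiles of 8, 4, 2, 1; and every tile's K loop is unrolled by 8 with a K & 7 remainder (the _SUB0/_SUB4/_SUB1 paths). A small C outline of that walk; it only counts micro-tile invocations to make the decomposition concrete, and the placeholder comments stand for the KERNEL/SAVE macro expansions.

/* Hedged outline of cgemm_logic_8x4_power8.S's tile walk.  Returns the
 * number of micro-tiles visited; the real code expands KERNELnxm bursts
 * and a SAVEnxm at the marked points. */
static long cgemm_tile_walk_sketch(long m, long n, long k)
{
    long tiles = 0;
    long ncols[3] = { n >> 2, (n & 2) ? 1 : 0, (n & 1) ? 1 : 0 };  /* widths 4, 2, 1 */

    for (int jn = 0; jn < 3; jn++) {
        for (long j = 0; j < ncols[jn]; j++) {
            /* CGEMM_L*_COPYB: splat this B panel into BBUFFER */
            long mrows[4] = { m >> 3, (m & 4) ? 1 : 0,
                              (m & 2) ? 1 : 0, (m & 1) ? 1 : 0 };  /* heights 8, 4, 2, 1 */
            for (int im = 0; im < 4; im++) {
                for (long i = 0; i < mrows[im]; i++) {
                    long kbody = k >> 3;        /* unrolled KERNELnxm_1/_2 bursts */
                    long ktail = k & 7;         /* KERNELnxm_SUB1 cleanup         */
                    (void)kbody; (void)ktail;   /* SAVEnxm then writes the C tile */
                    tiles++;
                }
            }
        }
    }
    return tiles;
}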
*****************************************************************************/ /************************************************************************************** * 2016/04/04 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define XSFADD_R1 xsadddp #define XSFADD_R2 xssubdp #define XSFADD_I1 xsadddp #define XSFADD_I2 xsadddp #define XVFADD_R1 xvaddsp #define XVFADD_R2 xvsubsp #define XVFADD_I1 xvaddsp #define XVFADD_I2 xvaddsp #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) #define XSFADD_R1 xsadddp #define XSFADD_R2 xsadddp #define XSFADD_I1 xssubdp #define XSFADD_I2 xsadddp #define XVFADD_R1 xvaddsp #define XVFADD_R2 xvaddsp #define XVFADD_I1 xvsubsp #define XVFADD_I2 xvaddsp #elif defined(NC) || defined(TC) || defined(NR) || defined(TR) #define XSFADD_R1 xsadddp #define XSFADD_R2 xsadddp #define XSFADD_I1 xsadddp #define XSFADD_I2 xssubdp #define XVFADD_R1 xvaddsp #define XVFADD_R2 xvaddsp #define XVFADD_I1 xvaddsp #define XVFADD_I2 xvsubsp #else // CC || CR || RC || RR #define XSFADD_R1 xsadddp #define XSFADD_R2 xssubdp #define XSFADD_I1 xssubdp #define XSFADD_I2 xssubdp #define XVFADD_R1 xvaddsp #define XVFADD_R2 xvsubsp #define XVFADD_I1 xvsubsp #define XVFADD_I2 xvsubsp #endif /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ .macro LOAD4x8_1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 lxvw4x vs12, o0, BO // load b2_r lxvw4x vs13, o16, BO // load b2_i lxvw4x vs14, o32, BO // load b3_r lxvw4x vs15, o48, BO // load b3_i addi BO, BO, 64 .endm .macro KERNEL4x8_I1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 lxvw4x vs6, o32, AO // load a4, a5 lxvw4x vs7, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs16, o0, BO // load b0_r lxvw4x vs17, o16, BO // load b0_i lxvw4x vs18, o32, BO // load b1_r lxvw4x vs19, o48, BO // load b1_i addi BO, BO, 64 lxvw4x vs20, o0, BO // load b2_r lxvw4x vs21, o16, BO // load b2_i lxvw4x vs22, o32, BO // load b3_r lxvw4x vs23, o48, BO // load b3_i addi BO, BO, 64 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, 
a1_r*b1_i, a1_i*b1_i xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmulsp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmulsp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmulsp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x8_1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 lxvw4x vs6, o32, AO // load a4, a5 lxvw4x vs7, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs16, o0, BO // load b0_r lxvw4x vs17, o16, BO // load b0_i lxvw4x vs18, o32, BO // load b1_r lxvw4x vs19, o48, BO // load b1_i addi BO, BO, 64 lxvw4x vs20, o0, BO // load b2_r lxvw4x vs21, o16, BO // load b2_i lxvw4x vs22, o32, BO // load b3_r lxvw4x vs23, o48, BO // load b3_i addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r 
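/* Descriptive note on the accumulator layout (not part of the original source, but   */
/* reflecting what the surrounding FMAs do): vs32..vs63 hold the 32 accumulators of   */
/* the 8x4 complex tile. Even-numbered registers collect products with the splatted   */
/* b*_r vectors, odd-numbered ones the products with b*_i. The FMAs below fill        */
/* vs57..vs63 for the fourth B column (b3). Real/imaginary recombination, with the    */
/* conjugation-dependent signs of the XVFADD_* macros selected at the top of this     */
/* file, is deferred to SAVE4x8.                                                      */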
xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x8_2 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 lxvw4x vs12, o0, BO // load b2_r lxvw4x vs13, o16, BO // load b2_i lxvw4x vs14, o32, BO // load b3_r lxvw4x vs15, o48, BO // load b3_i addi BO, BO, 64 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs48, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs49, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs50, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs51, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs52, vs6, vs20 // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs53, vs6, vs21 // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs56, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x8_E2 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, 
vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs48, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs49, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs50, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs51, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs52, vs6, vs20 // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs53, vs6, vs21 // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs56, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x8_SUBI1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 lxvw4x vs12, o0, BO // load b2_r lxvw4x vs13, o16, BO // load b2_i lxvw4x vs14, o32, BO // load b3_r lxvw4x vs15, o48, BO // load b3_i addi BO, BO, 64 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs46, vs3, vs10 // 
a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmulsp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmulsp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmulsp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x8_SUB1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 lxvw4x vs12, o0, BO // load b2_r lxvw4x vs13, o16, BO // load b2_i lxvw4x vs14, o32, BO // load b3_r lxvw4x vs15, o48, BO // load b3_i addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, 
a0_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro SAVE4x8 mr T1, CO xxlxor vs24, vs24, vs24 // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs32, 0 xxspltw vs9, vs32, 1 xxspltw vs10, vs32, 2 xxspltw vs11, vs32, 3 xxspltw vs12, vs33, 0 xxspltw vs13, vs33, 1 xxspltw vs14, vs33, 2 xxspltw vs15, vs33, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs34, 0 xxspltw vs9, vs34, 1 xxspltw vs10, vs34, 2 xxspltw vs11, vs34, 3 xxspltw vs12, vs35, 0 xxspltw vs13, vs35, 1 xxspltw vs14, vs35, 2 xxspltw vs15, vs35, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, 
vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=4 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs36, 0 xxspltw vs9, vs36, 1 xxspltw vs10, vs36, 2 xxspltw vs11, vs36, 3 xxspltw vs12, vs37, 0 xxspltw vs13, vs37, 1 xxspltw vs14, vs37, 2 xxspltw vs15, vs37, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=6 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs38, 0 xxspltw vs9, vs38, 1 xxspltw vs10, vs38, 2 xxspltw vs11, vs38, 3 xxspltw vs12, vs39, 0 xxspltw vs13, vs39, 1 xxspltw vs14, vs39, 2 xxspltw vs15, vs39, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, 
T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=1 mr T2, T1 // N=1 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs40, 0 xxspltw vs9, vs40, 1 xxspltw vs10, vs40, 2 xxspltw vs11, vs40, 3 xxspltw vs12, vs41, 0 xxspltw vs13, vs41, 1 xxspltw vs14, vs41, 2 xxspltw vs15, vs41, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=1 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs42, 0 xxspltw vs9, vs42, 1 xxspltw vs10, vs42, 2 xxspltw vs11, vs42, 3 xxspltw vs12, vs43, 0 xxspltw vs13, vs43, 1 xxspltw vs14, vs43, 2 xxspltw vs15, vs43, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=1 M=4 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, 
vs44, 0 xxspltw vs9, vs44, 1 xxspltw vs10, vs44, 2 xxspltw vs11, vs44, 3 xxspltw vs12, vs45, 0 xxspltw vs13, vs45, 1 xxspltw vs14, vs45, 2 xxspltw vs15, vs45, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=1 M=6 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs46, 0 xxspltw vs9, vs46, 1 xxspltw vs10, vs46, 2 xxspltw vs11, vs46, 3 xxspltw vs12, vs47, 0 xxspltw vs13, vs47, 1 xxspltw vs14, vs47, 2 xxspltw vs15, vs47, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=2 mr T2, T1 // N=2 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs48, 0 xxspltw vs9, vs48, 1 xxspltw vs10, vs48, 2 xxspltw vs11, vs48, 3 xxspltw vs12, vs49, 0 xxspltw vs13, vs49, 1 xxspltw vs14, vs49, 2 xxspltw vs15, vs49, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, 
vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=2 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs50, 0 xxspltw vs9, vs50, 1 xxspltw vs10, vs50, 2 xxspltw vs11, vs50, 3 xxspltw vs12, vs51, 0 xxspltw vs13, vs51, 1 xxspltw vs14, vs51, 2 xxspltw vs15, vs51, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=2 M=4 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs52, 0 xxspltw vs9, vs52, 1 xxspltw vs10, vs52, 2 xxspltw vs11, vs52, 3 xxspltw vs12, vs53, 0 xxspltw vs13, vs53, 1 xxspltw vs14, vs53, 2 xxspltw vs15, vs53, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 
vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=2 M=6 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs54, 0 xxspltw vs9, vs54, 1 xxspltw vs10, vs54, 2 xxspltw vs11, vs54, 3 xxspltw vs12, vs55, 0 xxspltw vs13, vs55, 1 xxspltw vs14, vs55, 2 xxspltw vs15, vs55, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=3 mr T2, T1 // N=3 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs56, 0 xxspltw vs9, vs56, 1 xxspltw vs10, vs56, 2 xxspltw vs11, vs56, 3 xxspltw vs12, vs57, 0 xxspltw vs13, vs57, 1 xxspltw vs14, vs57, 2 xxspltw vs15, vs57, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r 
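/* Descriptive note (added comment): the four xvmulsp results above are the partial   */
/* products of the complex scaling alpha * r0:                                        */
/*   real = r0_r*alpha_r - r0_i*alpha_i    imag = r0_r*alpha_i + r0_i*alpha_r         */
/* The xvsubsp/xvaddsp pair below forms exactly these two values; r1 is scaled the    */
/* same way immediately afterwards, and the pattern repeats for every N/M sub-block   */
/* of the SAVE macros.                                                                */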
xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=3 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs58, 0 xxspltw vs9, vs58, 1 xxspltw vs10, vs58, 2 xxspltw vs11, vs58, 3 xxspltw vs12, vs59, 0 xxspltw vs13, vs59, 1 xxspltw vs14, vs59, 2 xxspltw vs15, vs59, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=3 M=4 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs60, 0 xxspltw vs9, vs60, 1 xxspltw vs10, vs60, 2 xxspltw vs11, vs60, 3 xxspltw vs12, vs61, 0 xxspltw vs13, vs61, 1 xxspltw vs14, vs61, 2 xxspltw vs15, vs61, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * 
alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=3 M=6 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs62, 0 xxspltw vs9, vs62, 1 xxspltw vs10, vs62, 2 xxspltw vs11, vs62, 3 xxspltw vs12, vs63, 0 xxspltw vs13, vs63, 1 xxspltw vs14, vs63, 2 xxspltw vs15, vs63, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC addi CO, CO, 64 .endm /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ .macro LOAD4x4_1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 lxvw4x vs12, o0, BO // load b2_r lxvw4x vs13, o16, BO // load b2_i lxvw4x vs14, o32, BO // load b3_r lxvw4x vs15, o48, BO // load b3_i addi BO, BO, 64 .endm .macro KERNEL4x4_I1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs16, o0, BO // load b0_r lxvw4x vs17, o16, BO // load b0_i lxvw4x vs18, o32, BO // load b1_r lxvw4x vs19, o48, BO // load b1_i addi BO, BO, 64 lxvw4x vs20, o0, BO // load b2_r lxvw4x vs21, o16, BO // load b2_i lxvw4x vs22, o32, BO // load b3_r lxvw4x vs23, o48, BO // load b3_i addi BO, BO, 64 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs36, vs0, vs10 // a0_r*b1_r, 
a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x4_1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs16, o0, BO // load b0_r lxvw4x vs17, o16, BO // load b0_i lxvw4x vs18, o32, BO // load b1_r lxvw4x vs19, o48, BO // load b1_i addi BO, BO, 64 lxvw4x vs20, o0, BO // load b2_r lxvw4x vs21, o16, BO // load b2_i lxvw4x vs22, o32, BO // load b3_r lxvw4x vs23, o48, BO // load b3_i addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x4_2 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 lxvw4x vs12, o0, BO // load b2_r lxvw4x vs13, o16, BO // load b2_i lxvw4x vs14, o32, BO // load b3_r lxvw4x vs15, o48, BO // load b3_i addi BO, BO, 64 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, 
a1_i*b2_r xvmaddasp vs43, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x4_E2 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs43, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x4_SUBI1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 lxvw4x vs12, o0, BO // load b2_r lxvw4x vs13, o16, BO // load b2_i lxvw4x vs14, o32, BO // load b3_r lxvw4x vs15, o48, BO // load b3_i addi BO, BO, 64 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x4_SUB1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 lxvw4x vs12, o0, BO // load b2_r lxvw4x vs13, o16, BO // load b2_i lxvw4x vs14, o32, BO // load b3_r lxvw4x vs15, o48, BO // load b3_i addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, 
a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro SAVE4x4 mr T1, CO xxlxor vs24, vs24, vs24 // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs32, 0 xxspltw vs9, vs32, 1 xxspltw vs10, vs32, 2 xxspltw vs11, vs32, 3 xxspltw vs12, vs33, 0 xxspltw vs13, vs33, 1 xxspltw vs14, vs33, 2 xxspltw vs15, vs33, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs34, 0 xxspltw vs9, vs34, 1 xxspltw vs10, vs34, 2 xxspltw vs11, vs34, 3 xxspltw vs12, vs35, 0 xxspltw vs13, vs35, 1 xxspltw vs14, vs35, 2 xxspltw vs15, vs35, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r 
xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=1 mr T2, T1 // N=1 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs36, 0 xxspltw vs9, vs36, 1 xxspltw vs10, vs36, 2 xxspltw vs11, vs36, 3 xxspltw vs12, vs37, 0 xxspltw vs13, vs37, 1 xxspltw vs14, vs37, 2 xxspltw vs15, vs37, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=1 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs38, 0 xxspltw vs9, vs38, 1 xxspltw vs10, vs38, 2 xxspltw vs11, vs38, 3 xxspltw vs12, vs39, 0 xxspltw vs13, vs39, 1 xxspltw vs14, vs39, 2 xxspltw vs15, vs39, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 
// r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=2 mr T2, T1 // N=2 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs40, 0 xxspltw vs9, vs40, 1 xxspltw vs10, vs40, 2 xxspltw vs11, vs40, 3 xxspltw vs12, vs41, 0 xxspltw vs13, vs41, 1 xxspltw vs14, vs41, 2 xxspltw vs15, vs41, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=2 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs42, 0 xxspltw vs9, vs42, 1 xxspltw vs10, vs42, 2 xxspltw vs11, vs42, 3 xxspltw vs12, vs43, 0 xxspltw vs13, vs43, 1 xxspltw vs14, vs43, 2 xxspltw vs15, vs43, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r 
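// Complex scaling of the accumulated result by alpha, repeated for each
// pair of complex elements handled by this SAVE macro:
//   re = r_r*alpha_r - r_i*alpha_i   (xvsubsp below)
//   im = r_r*alpha_i + r_i*alpha_r   (xvaddsp below)
// The following xxsldwi/xvaddsp sequence repacks the lanes into
// (r0_r, r0_i, r1_r, r1_i) order and adds the result to the C column held
// in vs0 (zeroed instead of loaded when TRMMKERNEL is defined) before the store.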
xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=3 mr T2, T1 // N=3 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs44, 0 xxspltw vs9, vs44, 1 xxspltw vs10, vs44, 2 xxspltw vs11, vs44, 3 xxspltw vs12, vs45, 0 xxspltw vs13, vs45, 1 xxspltw vs14, vs45, 2 xxspltw vs15, vs45, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=3 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs46, 0 xxspltw vs9, vs46, 1 xxspltw vs10, vs46, 2 xxspltw vs11, vs46, 3 xxspltw vs12, vs47, 0 xxspltw vs13, vs47, 1 xxspltw vs14, vs47, 2 xxspltw vs15, vs47, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // 
r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC addi CO, CO, 32 .endm /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ .macro LOAD4x2_1 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 lxvw4x vs12, o0, BO // load b2_r lxvw4x vs13, o16, BO // load b2_i lxvw4x vs14, o32, BO // load b3_r lxvw4x vs15, o48, BO // load b3_i addi BO, BO, 64 .endm .macro KERNEL4x2_I1 lxvw4x vs4, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs16, o0, BO // load b0_r lxvw4x vs17, o16, BO // load b0_i lxvw4x vs18, o32, BO // load b1_r lxvw4x vs19, o48, BO // load b1_i addi BO, BO, 64 lxvw4x vs20, o0, BO // load b2_r lxvw4x vs21, o16, BO // load b2_i lxvw4x vs22, o32, BO // load b3_r lxvw4x vs23, o48, BO // load b3_i addi BO, BO, 64 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x2_1 lxvw4x vs4, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs16, o0, BO // load b0_r lxvw4x vs17, o16, BO // load b0_i lxvw4x vs18, o32, BO // load b1_r lxvw4x vs19, o48, BO // load b1_i addi BO, BO, 64 lxvw4x vs20, o0, BO // load b2_r lxvw4x vs21, o16, BO // load b2_i lxvw4x vs22, o32, BO // load b3_r lxvw4x vs23, o48, BO // load b3_i addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x2_2 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 lxvw4x vs12, o0, BO // load b2_r lxvw4x vs13, o16, BO // load b2_i lxvw4x vs14, o32, BO // load b3_r lxvw4x vs15, o48, BO // load b3_i addi BO, BO, 64 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs36, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs37, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i 
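// KERNEL4x2_2 is the second half of the unrolled-by-two inner loop: it
// multiplies the operands prefetched into vs4 and vs16-vs23 by KERNEL4x2_1
// while reloading vs0 and vs8-vs15 for the next iteration, so the loads
// overlap with the multiply-adds.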
xvmaddasp vs38, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x2_E2 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs36, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs37, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs38, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x2_SUBI1 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 lxvw4x vs12, o0, BO // load b2_r lxvw4x vs13, o16, BO // load b2_i lxvw4x vs14, o32, BO // load b3_r lxvw4x vs15, o48, BO // load b3_i addi BO, BO, 64 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x2_SUB1 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 lxvw4x vs12, o0, BO // load b2_r lxvw4x vs13, o16, BO // load b2_i lxvw4x vs14, o32, BO // load b3_r lxvw4x vs15, o48, BO // load b3_i addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro SAVE4x2 mr T1, CO xxlxor vs24, vs24, vs24 // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs32, 0 xxspltw vs9, vs32, 1 xxspltw vs10, vs32, 2 xxspltw vs11, vs32, 3 xxspltw vs12, vs33, 0 xxspltw vs13, vs33, 1 xxspltw vs14, vs33, 2 xxspltw vs15, vs33, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp 
vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=1 mr T2, T1 // N=1 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs34, 0 xxspltw vs9, vs34, 1 xxspltw vs10, vs34, 2 xxspltw vs11, vs34, 3 xxspltw vs12, vs35, 0 xxspltw vs13, vs35, 1 xxspltw vs14, vs35, 2 xxspltw vs15, vs35, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=2 mr T2, T1 // N=2 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs36, 0 xxspltw vs9, vs36, 1 xxspltw vs10, vs36, 2 xxspltw vs11, vs36, 3 xxspltw vs12, vs37, 0 xxspltw vs13, vs37, 1 xxspltw vs14, vs37, 2 xxspltw vs15, vs37, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // 
r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=3 mr T2, T1 // N=3 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs38, 0 xxspltw vs9, vs38, 1 xxspltw vs10, vs38, 2 xxspltw vs11, vs38, 3 xxspltw vs12, vs39, 0 xxspltw vs13, vs39, 1 xxspltw vs14, vs39, 2 xxspltw vs15, vs39, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC addi CO, CO, 16 .endm /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ .macro LOAD4x1_1 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 lxsspx vs8, o0, BO // load b0_r lxsspx vs9, o16, BO // load b0_i lxsspx vs10, o32, BO // load b1_r lxsspx vs11, o48, BO // load b1_i addi BO, BO, 64 lxsspx vs12, o0, BO // load b2_r lxsspx vs13, o16, BO // load b2_i lxsspx vs14, o32, BO // load b3_r lxsspx vs15, o48, BO // load b3_i addi BO, BO, 64 .endm .macro KERNEL4x1_I1 lxsspx vs4, o0, AO // load a0_r lxsspx vs5, o4, AO // load a0_i addi AO, AO, 8 lxsspx vs16, o0, BO // load b0_r lxsspx vs17, o16, BO // load b0_i lxsspx vs18, o32, BO // load b1_r lxsspx vs19, o48, BO // load b1_i addi BO, BO, 64 lxsspx vs20, o0, BO // load b2_r lxsspx vs21, o16, BO // load b2_i lxsspx vs22, o32, BO // load b3_r lxsspx vs23, o48, BO // load b3_i addi BO, BO, 64 xsmuldp vs32, vs0, vs8 // a0_r*b0_r xsmuldp vs33, vs1, vs9 // a0_i*b0_i xsmuldp vs34, vs0, vs9 // a0_r*b0_i xsmuldp vs35, vs1, vs8 // a0_i*b0_r xsmuldp vs36, vs0, vs10 // a0_r*b1_r xsmuldp vs37, vs1, 
vs11 // a0_i*b1_i xsmuldp vs38, vs0, vs11 // a0_r*b1_i xsmuldp vs39, vs1, vs10 // a0_i*b1_r xsmuldp vs40, vs0, vs12 // a0_r*b2_r xsmuldp vs41, vs1, vs13 // a0_i*b2_i xsmuldp vs42, vs0, vs13 // a0_r*b2_i xsmuldp vs43, vs1, vs12 // a0_i*b2_r xsmuldp vs44, vs0, vs14 // a0_r*b3_r xsmuldp vs45, vs1, vs15 // a0_i*b3_i xsmuldp vs46, vs0, vs15 // a0_r*b3_i xsmuldp vs47, vs1, vs14 // a0_i*b3_r .endm .macro KERNEL4x1_1 lxsspx vs4, o0, AO // load a0_r lxsspx vs5, o4, AO // load a0_i addi AO, AO, 8 lxsspx vs16, o0, BO // load b0_r lxsspx vs17, o16, BO // load b0_i lxsspx vs18, o32, BO // load b1_r lxsspx vs19, o48, BO // load b1_i addi BO, BO, 64 lxsspx vs20, o0, BO // load b2_r lxsspx vs21, o16, BO // load b2_i lxsspx vs22, o32, BO // load b3_r lxsspx vs23, o48, BO // load b3_i addi BO, BO, 64 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r xsmaddadp vs33, vs1, vs9 // a0_i*b0_i xsmaddadp vs34, vs0, vs9 // a0_r*b0_i xsmaddadp vs35, vs1, vs8 // a0_i*b0_r xsmaddadp vs36, vs0, vs10 // a0_r*b1_r xsmaddadp vs37, vs1, vs11 // a0_i*b1_i xsmaddadp vs38, vs0, vs11 // a0_r*b1_i xsmaddadp vs39, vs1, vs10 // a0_i*b1_r xsmaddadp vs40, vs0, vs12 // a0_r*b2_r xsmaddadp vs41, vs1, vs13 // a0_i*b2_i xsmaddadp vs42, vs0, vs13 // a0_r*b2_i xsmaddadp vs43, vs1, vs12 // a0_i*b2_r xsmaddadp vs44, vs0, vs14 // a0_r*b3_r xsmaddadp vs45, vs1, vs15 // a0_i*b3_i xsmaddadp vs46, vs0, vs15 // a0_r*b3_i xsmaddadp vs47, vs1, vs14 // a0_i*b3_r .endm .macro KERNEL4x1_2 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 lxsspx vs8, o0, BO // load b0_r lxsspx vs9, o16, BO // load b0_i lxsspx vs10, o32, BO // load b1_r lxsspx vs11, o48, BO // load b1_i addi BO, BO, 64 lxsspx vs12, o0, BO // load b2_r lxsspx vs13, o16, BO // load b2_i lxsspx vs14, o32, BO // load b3_r lxsspx vs15, o48, BO // load b3_i addi BO, BO, 64 xsmaddadp vs32, vs4, vs16 // a4_r*b0_r xsmaddadp vs33, vs5, vs17 // a4_i*b0_i xsmaddadp vs34, vs4, vs17 // a4_r*b0_i xsmaddadp vs35, vs5, vs16 // a4_i*b0_r xsmaddadp vs36, vs4, vs18 // a4_r*b1_r xsmaddadp vs37, vs5, vs19 // a4_i*b1_i xsmaddadp vs38, vs4, vs19 // a4_r*b1_i xsmaddadp vs39, vs5, vs18 // a4_i*b1_r xsmaddadp vs40, vs4, vs20 // a4_r*b2_r xsmaddadp vs41, vs5, vs21 // a4_i*b2_i xsmaddadp vs42, vs4, vs21 // a4_r*b2_i xsmaddadp vs43, vs5, vs20 // a4_i*b2_r xsmaddadp vs44, vs4, vs22 // a4_r*b3_r xsmaddadp vs45, vs5, vs23 // a4_i*b3_i xsmaddadp vs46, vs4, vs23 // a4_r*b3_i xsmaddadp vs47, vs5, vs22 // a4_i*b3_r .endm .macro KERNEL4x1_E2 xsmaddadp vs32, vs4, vs16 // a4_r*b0_r xsmaddadp vs33, vs5, vs17 // a4_i*b0_i xsmaddadp vs34, vs4, vs17 // a4_r*b0_i xsmaddadp vs35, vs5, vs16 // a4_i*b0_r xsmaddadp vs36, vs4, vs18 // a4_r*b1_r xsmaddadp vs37, vs5, vs19 // a4_i*b1_i xsmaddadp vs38, vs4, vs19 // a4_r*b1_i xsmaddadp vs39, vs5, vs18 // a4_i*b1_r xsmaddadp vs40, vs4, vs20 // a4_r*b2_r xsmaddadp vs41, vs5, vs21 // a4_i*b2_i xsmaddadp vs42, vs4, vs21 // a4_r*b2_i xsmaddadp vs43, vs5, vs20 // a4_i*b2_r xsmaddadp vs44, vs4, vs22 // a4_r*b3_r xsmaddadp vs45, vs5, vs23 // a4_i*b3_i xsmaddadp vs46, vs4, vs23 // a4_r*b3_i xsmaddadp vs47, vs5, vs22 // a4_i*b3_r .endm .macro KERNEL4x1_SUBI1 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 lxsspx vs8, o0, BO // load b0_r lxsspx vs9, o16, BO // load b0_i lxsspx vs10, o32, BO // load b1_r lxsspx vs11, o48, BO // load b1_i addi BO, BO, 64 lxsspx vs12, o0, BO // load b2_r lxsspx vs13, o16, BO // load b2_i lxsspx vs14, o32, BO // load b3_r lxsspx vs15, o48, BO // load b3_i addi BO, BO, 64 xsmuldp vs32, vs0, vs8 // a0_r*b0_r xsmuldp vs33, vs1, 
vs9 // a0_i*b0_i xsmuldp vs34, vs0, vs9 // a0_r*b0_i xsmuldp vs35, vs1, vs8 // a0_i*b0_r xsmuldp vs36, vs0, vs10 // a0_r*b1_r xsmuldp vs37, vs1, vs11 // a0_i*b1_i xsmuldp vs38, vs0, vs11 // a0_r*b1_i xsmuldp vs39, vs1, vs10 // a0_i*b1_r xsmuldp vs40, vs0, vs12 // a0_r*b2_r xsmuldp vs41, vs1, vs13 // a0_i*b2_i xsmuldp vs42, vs0, vs13 // a0_r*b2_i xsmuldp vs43, vs1, vs12 // a0_i*b2_r xsmuldp vs44, vs0, vs14 // a0_r*b3_r xsmuldp vs45, vs1, vs15 // a0_i*b3_i xsmuldp vs46, vs0, vs15 // a0_r*b3_i xsmuldp vs47, vs1, vs14 // a0_i*b3_r .endm .macro KERNEL4x1_SUB1 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 lxsspx vs8, o0, BO // load b0_r lxsspx vs9, o16, BO // load b0_i lxsspx vs10, o32, BO // load b1_r lxsspx vs11, o48, BO // load b1_i addi BO, BO, 64 lxsspx vs12, o0, BO // load b2_r lxsspx vs13, o16, BO // load b2_i lxsspx vs14, o32, BO // load b3_r lxsspx vs15, o48, BO // load b3_i addi BO, BO, 64 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r xsmaddadp vs33, vs1, vs9 // a0_i*b0_i xsmaddadp vs34, vs0, vs9 // a0_r*b0_i xsmaddadp vs35, vs1, vs8 // a0_i*b0_r xsmaddadp vs36, vs0, vs10 // a0_r*b1_r xsmaddadp vs37, vs1, vs11 // a0_i*b1_i xsmaddadp vs38, vs0, vs11 // a0_r*b1_i xsmaddadp vs39, vs1, vs10 // a0_i*b1_r xsmaddadp vs40, vs0, vs12 // a0_r*b2_r xsmaddadp vs41, vs1, vs13 // a0_i*b2_i xsmaddadp vs42, vs0, vs13 // a0_r*b2_i xsmaddadp vs43, vs1, vs12 // a0_i*b2_r xsmaddadp vs44, vs0, vs14 // a0_r*b3_r xsmaddadp vs45, vs1, vs15 // a0_i*b3_i xsmaddadp vs46, vs0, vs15 // a0_r*b3_i xsmaddadp vs47, vs1, vs14 // a0_i*b3_r .endm .macro SAVE4x1 mr T1, CO xxlxor vs24, vs24, vs24 // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 #ifndef TRMMKERNEL lxsspx vs0, o0, T2 // load c0_r lxsspx vs1, o4, T2 // load c0_i #else xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 #endif XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xsadddp vs0, vs0, vs20 xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r stxsspx vs1, o4, T2 // store c0_i addi T2, T2, 8 add T1, T1, LDC // N=1 mr T2, T1 // N=1 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 #ifndef TRMMKERNEL lxsspx vs0, o0, T2 // load c0_r lxsspx vs1, o4, T2 // load c0_i #else xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 #endif XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xsadddp vs0, vs0, vs20 xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r stxsspx vs1, o4, T2 // store c0_i addi T2, T2, 8 add T1, T1, LDC // N=2 mr T2, T1 // N=2 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 #ifndef TRMMKERNEL lxsspx vs0, o0, T2 // load c0_r lxsspx vs1, o4, T2 // load c0_i #else xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 #endif XSFADD_R1 vs4, vs4, vs40 // add a0_r * b0_r XSFADD_I1 vs5, vs5, 
vs43 // add a0_r * b0_i XSFADD_R2 vs4, vs4, vs41 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs42 // add a0_i * b0_r xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xsadddp vs0, vs0, vs20 xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r stxsspx vs1, o4, T2 // store c0_i addi T2, T2, 8 add T1, T1, LDC // N=3 mr T2, T1 // N=3 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 #ifndef TRMMKERNEL lxsspx vs0, o0, T2 // load c0_r lxsspx vs1, o4, T2 // load c0_i #else xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 #endif XSFADD_R1 vs4, vs4, vs44 // add a0_r * b0_r XSFADD_I1 vs5, vs5, vs47 // add a0_r * b0_i XSFADD_R2 vs4, vs4, vs45 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs46 // add a0_i * b0_r xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xsadddp vs0, vs0, vs20 xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r stxsspx vs1, o4, T2 // store c0_i addi T2, T2, 8 add T1, T1, LDC addi CO, CO, 8 .endm /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ .macro LOAD2x8_1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 .endm .macro KERNEL2x8_I1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 lxvw4x vs6, o32, AO // load a4, a5 lxvw4x vs7, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs16, o0, BO // load b0_r lxvw4x vs17, o16, BO // load b0_i lxvw4x vs18, o32, BO // load b1_r lxvw4x vs19, o48, BO // load b1_i addi BO, BO, 64 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x8_1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 lxvw4x vs6, o32, AO // load a4, a5 lxvw4x vs7, o48, AO // load 
a6, a7 addi AO, AO, 64 lxvw4x vs16, o0, BO // load b0_r lxvw4x vs17, o16, BO // load b0_i lxvw4x vs18, o32, BO // load b1_r lxvw4x vs19, o48, BO // load b1_i addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x8_2 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x8_E2 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp 
vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x8_SUBI1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x8_SUB1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro SAVE2x8 mr T1, CO xxlxor vs24, vs24, vs24 // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, 
c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs32, 0 xxspltw vs9, vs32, 1 xxspltw vs10, vs32, 2 xxspltw vs11, vs32, 3 xxspltw vs12, vs33, 0 xxspltw vs13, vs33, 1 xxspltw vs14, vs33, 2 xxspltw vs15, vs33, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs34, 0 xxspltw vs9, vs34, 1 xxspltw vs10, vs34, 2 xxspltw vs11, vs34, 3 xxspltw vs12, vs35, 0 xxspltw vs13, vs35, 1 xxspltw vs14, vs35, 2 xxspltw vs15, vs35, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=4 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs36, 0 xxspltw vs9, vs36, 1 xxspltw vs10, vs36, 2 xxspltw vs11, vs36, 3 xxspltw vs12, vs37, 0 xxspltw vs13, vs37, 1 xxspltw vs14, vs37, 2 xxspltw vs15, vs37, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r 
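// In each SAVE sub-block the xxspltw instructions broadcast one accumulator
// lane per register, and the XVFADD_R*/XVFADD_I* helpers (defined earlier in
// this file; add or subtract depending on the conjugation variant being
// built) fold those partial products into the real (vs4, vs6) and imaginary
// (vs5, vs7) sums for the two complex elements of the column.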
XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=6 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs38, 0 xxspltw vs9, vs38, 1 xxspltw vs10, vs38, 2 xxspltw vs11, vs38, 3 xxspltw vs12, vs39, 0 xxspltw vs13, vs39, 1 xxspltw vs14, vs39, 2 xxspltw vs15, vs39, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=1 mr T2, T1 // N=1 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs40, 0 xxspltw vs9, vs40, 1 xxspltw vs10, vs40, 2 xxspltw vs11, vs40, 3 xxspltw vs12, vs41, 0 xxspltw vs13, vs41, 1 xxspltw vs14, vs41, 2 xxspltw vs15, vs41, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r 
XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=1 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs42, 0 xxspltw vs9, vs42, 1 xxspltw vs10, vs42, 2 xxspltw vs11, vs42, 3 xxspltw vs12, vs43, 0 xxspltw vs13, vs43, 1 xxspltw vs14, vs43, 2 xxspltw vs15, vs43, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=1 M=4 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs44, 0 xxspltw vs9, vs44, 1 xxspltw vs10, vs44, 2 xxspltw vs11, vs44, 3 xxspltw vs12, vs45, 0 xxspltw vs13, vs45, 1 xxspltw vs14, vs45, 2 xxspltw vs15, vs45, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // 
r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=1 M=6 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs46, 0 xxspltw vs9, vs46, 1 xxspltw vs10, vs46, 2 xxspltw vs11, vs46, 3 xxspltw vs12, vs47, 0 xxspltw vs13, vs47, 1 xxspltw vs14, vs47, 2 xxspltw vs15, vs47, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC addi CO, CO, 64 .endm /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ .macro LOAD2x4_1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 .endm .macro KERNEL2x4_I1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs16, o0, BO // load b0_r lxvw4x vs17, o16, BO // load b0_i lxvw4x vs18, o32, BO // load b1_r lxvw4x vs19, o48, BO // load b1_i addi BO, BO, 64 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp 
vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x4_1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs16, o0, BO // load b0_r lxvw4x vs17, o16, BO // load b0_i lxvw4x vs18, o32, BO // load b1_r lxvw4x vs19, o48, BO // load b1_i addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x4_2 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x4_E2 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x4_SUBI1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x4_SUB1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 
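// KERNEL2x4_SUB1 is used for the K-loop remainder: each call reloads a0/a1
// and both B columns and accumulates directly into vs32-vs39 with xvmaddasp
// (KERNEL2x4_SUBI1 performs the same step but starts the sums with xvmulsp).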
xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro SAVE2x4 mr T1, CO xxlxor vs24, vs24, vs24 // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs32, 0 xxspltw vs9, vs32, 1 xxspltw vs10, vs32, 2 xxspltw vs11, vs32, 3 xxspltw vs12, vs33, 0 xxspltw vs13, vs33, 1 xxspltw vs14, vs33, 2 xxspltw vs15, vs33, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs34, 0 xxspltw vs9, vs34, 1 xxspltw vs10, vs34, 2 xxspltw vs11, vs34, 3 xxspltw vs12, vs35, 0 xxspltw vs13, vs35, 1 xxspltw vs14, vs35, 2 xxspltw vs15, vs35, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp 
vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=1 mr T2, T1 // N=1 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs36, 0 xxspltw vs9, vs36, 1 xxspltw vs10, vs36, 2 xxspltw vs11, vs36, 3 xxspltw vs12, vs37, 0 xxspltw vs13, vs37, 1 xxspltw vs14, vs37, 2 xxspltw vs15, vs37, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=1 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs38, 0 xxspltw vs9, vs38, 1 xxspltw vs10, vs38, 2 xxspltw vs11, vs38, 3 xxspltw vs12, vs39, 0 xxspltw vs13, vs39, 1 xxspltw vs14, vs39, 2 xxspltw vs15, vs39, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // 
r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC addi CO, CO, 32 .endm /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ .macro LOAD2x2_1 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 .endm .macro KERNEL2x2_I1 lxvw4x vs4, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs16, o0, BO // load b0_r lxvw4x vs17, o16, BO // load b0_i lxvw4x vs18, o32, BO // load b1_r lxvw4x vs19, o48, BO // load b1_i addi BO, BO, 64 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x2_1 lxvw4x vs4, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs16, o0, BO // load b0_r lxvw4x vs17, o16, BO // load b0_i lxvw4x vs18, o32, BO // load b1_r lxvw4x vs19, o48, BO // load b1_i addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x2_2 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x2_E2 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x2_SUBI1 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x2_SUB1 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i lxvw4x vs10, o32, BO // load b1_r lxvw4x vs11, o48, BO // load b1_i addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro SAVE2x2 mr T1, CO xxlxor vs24, vs24, 
vs24 // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs32, 0 xxspltw vs9, vs32, 1 xxspltw vs10, vs32, 2 xxspltw vs11, vs32, 3 xxspltw vs12, vs33, 0 xxspltw vs13, vs33, 1 xxspltw vs14, vs33, 2 xxspltw vs15, vs33, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=1 mr T2, T1 // N=1 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs34, 0 xxspltw vs9, vs34, 1 xxspltw vs10, vs34, 2 xxspltw vs11, vs34, 3 xxspltw vs12, vs35, 0 xxspltw vs13, vs35, 1 xxspltw vs14, vs35, 2 xxspltw vs15, vs35, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC addi CO, CO, 16 .endm /********************************************************************************************** * Macros for N=2 and M=1 
**********************************************************************************************/ .macro LOAD2x1_1 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 lxsspx vs8, o0, BO // load b0_r lxsspx vs9, o16, BO // load b0_i lxsspx vs10, o32, BO // load b1_r lxsspx vs11, o48, BO // load b1_i addi BO, BO, 64 .endm .macro KERNEL2x1_I1 lxsspx vs4, o0, AO // load a0_r lxsspx vs5, o4, AO // load a0_i addi AO, AO, 8 lxsspx vs16, o0, BO // load b0_r lxsspx vs17, o16, BO // load b0_i lxsspx vs18, o32, BO // load b1_r lxsspx vs19, o48, BO // load b1_i addi BO, BO, 64 xsmuldp vs32, vs0, vs8 // a0_r*b0_r xsmuldp vs33, vs1, vs9 // a0_i*b0_i xsmuldp vs34, vs0, vs9 // a0_r*b0_i xsmuldp vs35, vs1, vs8 // a0_i*b0_r xsmuldp vs36, vs0, vs10 // a0_r*b1_r xsmuldp vs37, vs1, vs11 // a0_i*b1_i xsmuldp vs38, vs0, vs11 // a0_r*b1_i xsmuldp vs39, vs1, vs10 // a0_i*b1_r .endm .macro KERNEL2x1_1 lxsspx vs4, o0, AO // load a0_r lxsspx vs5, o4, AO // load a0_i addi AO, AO, 8 lxsspx vs16, o0, BO // load b0_r lxsspx vs17, o16, BO // load b0_i lxsspx vs18, o32, BO // load b1_r lxsspx vs19, o48, BO // load b1_i addi BO, BO, 64 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r xsmaddadp vs33, vs1, vs9 // a0_i*b0_i xsmaddadp vs34, vs0, vs9 // a0_r*b0_i xsmaddadp vs35, vs1, vs8 // a0_i*b0_r xsmaddadp vs36, vs0, vs10 // a0_r*b1_r xsmaddadp vs37, vs1, vs11 // a0_i*b1_i xsmaddadp vs38, vs0, vs11 // a0_r*b1_i xsmaddadp vs39, vs1, vs10 // a0_i*b1_r .endm .macro KERNEL2x1_2 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 lxsspx vs8, o0, BO // load b0_r lxsspx vs9, o16, BO // load b0_i lxsspx vs10, o32, BO // load b1_r lxsspx vs11, o48, BO // load b1_i addi BO, BO, 64 xsmaddadp vs32, vs4, vs16 // a4_r*b0_r xsmaddadp vs33, vs5, vs17 // a4_i*b0_i xsmaddadp vs34, vs4, vs17 // a4_r*b0_i xsmaddadp vs35, vs5, vs16 // a4_i*b0_r xsmaddadp vs36, vs4, vs18 // a4_r*b1_r xsmaddadp vs37, vs5, vs19 // a4_i*b1_i xsmaddadp vs38, vs4, vs19 // a4_r*b1_i xsmaddadp vs39, vs5, vs18 // a4_i*b1_r .endm .macro KERNEL2x1_E2 xsmaddadp vs32, vs4, vs16 // a4_r*b0_r xsmaddadp vs33, vs5, vs17 // a4_i*b0_i xsmaddadp vs34, vs4, vs17 // a4_r*b0_i xsmaddadp vs35, vs5, vs16 // a4_i*b0_r xsmaddadp vs36, vs4, vs18 // a4_r*b1_r xsmaddadp vs37, vs5, vs19 // a4_i*b1_i xsmaddadp vs38, vs4, vs19 // a4_r*b1_i xsmaddadp vs39, vs5, vs18 // a4_i*b1_r .endm .macro KERNEL2x1_SUBI1 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 lxsspx vs8, o0, BO // load b0_r lxsspx vs9, o16, BO // load b0_i lxsspx vs10, o32, BO // load b1_r lxsspx vs11, o48, BO // load b1_i addi BO, BO, 64 xsmuldp vs32, vs0, vs8 // a0_r*b0_r xsmuldp vs33, vs1, vs9 // a0_i*b0_i xsmuldp vs34, vs0, vs9 // a0_r*b0_i xsmuldp vs35, vs1, vs8 // a0_i*b0_r xsmuldp vs36, vs0, vs10 // a0_r*b1_r xsmuldp vs37, vs1, vs11 // a0_i*b1_i xsmuldp vs38, vs0, vs11 // a0_r*b1_i xsmuldp vs39, vs1, vs10 // a0_i*b1_r .endm .macro KERNEL2x1_SUB1 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 lxsspx vs8, o0, BO // load b0_r lxsspx vs9, o16, BO // load b0_i lxsspx vs10, o32, BO // load b1_r lxsspx vs11, o48, BO // load b1_i addi BO, BO, 64 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r xsmaddadp vs33, vs1, vs9 // a0_i*b0_i xsmaddadp vs34, vs0, vs9 // a0_r*b0_i xsmaddadp vs35, vs1, vs8 // a0_i*b0_r xsmaddadp vs36, vs0, vs10 // a0_r*b1_r xsmaddadp vs37, vs1, vs11 // a0_i*b1_i xsmaddadp vs38, vs0, vs11 // a0_r*b1_i xsmaddadp vs39, vs1, vs10 // a0_i*b1_r .endm .macro SAVE2x1 mr T1, CO xxlxor vs24, vs24, vs24 // N=0 mr T2, T1 // N=0 M=0 
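// Note: for this M=1 tail the accumulators vs32..vs35 (column 0) and
// vs36..vs39 (column 1) hold the four scalar partial products per output
// element (a_r*b_r, a_i*b_i, a_r*b_i, a_i*b_r).  The XSFADD_* helpers below,
// whose signs depend on the conjugation variant being built, fold them into
// one real/imaginary pair; that pair is then scaled by the complex alpha as
// (r_r*alpha_r - r_i*alpha_i, r_r*alpha_i + r_i*alpha_r) and, unless
// TRMMKERNEL is defined, added to the value already stored in C.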
xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 #ifndef TRMMKERNEL lxsspx vs0, o0, T2 // load c0_r lxsspx vs1, o4, T2 // load c0_i #else xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 #endif XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xsadddp vs0, vs0, vs20 xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r stxsspx vs1, o4, T2 // store c0_i addi T2, T2, 8 add T1, T1, LDC // N=1 mr T2, T1 // N=1 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 #ifndef TRMMKERNEL lxsspx vs0, o0, T2 // load c0_r lxsspx vs1, o4, T2 // load c0_i #else xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 #endif XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xsadddp vs0, vs0, vs20 xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r stxsspx vs1, o4, T2 // store c0_i addi T2, T2, 8 add T1, T1, LDC addi CO, CO, 8 .endm /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ .macro LOAD1x8_1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i addi BO, BO, 32 .endm .macro KERNEL1x8_I1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 lxvw4x vs6, o32, AO // load a4, a5 lxvw4x vs7, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs16, o0, BO // load b0_r lxvw4x vs17, o16, BO // load b0_i addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x8_1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 lxvw4x vs6, o32, AO // load a4, a5 lxvw4x vs7, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs16, o0, BO // load b0_r lxvw4x vs17, o16, BO // load b0_i addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, 
a1_i*b0_r xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x8_2 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x8_E2 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x8_SUBI1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x8_SUB1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro SAVE1x8 mr T1, CO xxlxor vs24, vs24, vs24 // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 
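// in the TRMM build C is not read back: the tile result starts from zero
// and only alpha*(A*B) is stored below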
#endif xxspltw vs8, vs32, 0 xxspltw vs9, vs32, 1 xxspltw vs10, vs32, 2 xxspltw vs11, vs32, 3 xxspltw vs12, vs33, 0 xxspltw vs13, vs33, 1 xxspltw vs14, vs33, 2 xxspltw vs15, vs33, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs34, 0 xxspltw vs9, vs34, 1 xxspltw vs10, vs34, 2 xxspltw vs11, vs34, 3 xxspltw vs12, vs35, 0 xxspltw vs13, vs35, 1 xxspltw vs14, vs35, 2 xxspltw vs15, vs35, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=4 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs36, 0 xxspltw vs9, vs36, 1 xxspltw vs10, vs36, 2 xxspltw vs11, vs36, 3 xxspltw vs12, vs37, 0 xxspltw vs13, vs37, 1 xxspltw vs14, vs37, 2 xxspltw vs15, vs37, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add 
a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=6 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs38, 0 xxspltw vs9, vs38, 1 xxspltw vs10, vs38, 2 xxspltw vs11, vs38, 3 xxspltw vs12, vs39, 0 xxspltw vs13, vs39, 1 xxspltw vs14, vs39, 2 xxspltw vs15, vs39, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC addi CO, CO, 64 .endm /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ .macro LOAD1x4_1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i addi BO, BO, 32 .endm .macro KERNEL1x4_I1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs16, o0, BO // load b0_r lxvw4x vs17, o16, BO // load b0_i addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, 
a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x4_1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs16, o0, BO // load b0_r lxvw4x vs17, o16, BO // load b0_i addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x4_2 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x4_E2 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x4_SUBI1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x4_SUB1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro SAVE1x4 mr T1, CO xxlxor vs24, vs24, vs24 // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs32, 0 xxspltw vs9, vs32, 1 xxspltw vs10, vs32, 2 xxspltw vs11, vs32, 3 xxspltw vs12, vs33, 0 xxspltw vs13, vs33, 1 xxspltw vs14, vs33, 2 xxspltw vs15, vs33, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // 
r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs34, 0 xxspltw vs9, vs34, 1 xxspltw vs10, vs34, 2 xxspltw vs11, vs34, 3 xxspltw vs12, vs35, 0 xxspltw vs13, vs35, 1 xxspltw vs14, vs35, 2 xxspltw vs15, vs35, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC addi CO, CO, 32 .endm /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ .macro LOAD1x2_1 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i addi BO, BO, 32 .endm .macro KERNEL1x2_I1 lxvw4x vs4, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs16, o0, BO // load b0_r lxvw4x vs17, o16, BO // load b0_i addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x2_1 lxvw4x vs4, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs16, o0, BO // load b0_r lxvw4x vs17, o16, BO // load b0_i addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x2_2 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, 
a1_i*b0_i .endm .macro KERNEL1x2_E2 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x2_SUBI1 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x2_SUB1 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs8, o0, BO // load b0_r lxvw4x vs9, o16, BO // load b0_i addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro SAVE1x2 mr T1, CO xxlxor vs24, vs24, vs24 // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs32, 0 xxspltw vs9, vs32, 1 xxspltw vs10, vs32, 2 xxspltw vs11, vs32, 3 xxspltw vs12, vs33, 0 xxspltw vs13, vs33, 1 xxspltw vs14, vs33, 2 xxspltw vs15, vs33, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC addi CO, CO, 16 .endm /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ .macro LOAD1x1_1 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 lxsspx vs8, o0, BO // load b0_r lxsspx vs9, o16, BO // load b0_i addi BO, BO, 32 .endm .macro KERNEL1x1_I1 lxsspx vs4, o0, AO // load a0_r lxsspx vs5, o4, AO // load a0_i addi AO, AO, 8 lxsspx vs16, o0, BO // load b0_r lxsspx vs17, o16, BO // load b0_i addi BO, BO, 32 xsmuldp vs32, vs0, vs8 // a0_r*b0_r xsmuldp vs33, vs1, vs9 // a0_i*b0_i xsmuldp vs34, vs0, vs9 // a0_r*b0_i xsmuldp vs35, vs1, vs8 // a0_i*b0_r .endm .macro KERNEL1x1_1 lxsspx vs4, o0, AO // load a0_r lxsspx vs5, o4, AO // load a0_i addi AO, AO, 8 lxsspx vs16, o0, BO // load b0_r lxsspx vs17, o16, BO // load b0_i addi BO, BO, 32 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r xsmaddadp vs33, vs1, vs9 // a0_i*b0_i xsmaddadp vs34, vs0, vs9 // 
a0_r*b0_i xsmaddadp vs35, vs1, vs8 // a0_i*b0_r .endm .macro KERNEL1x1_2 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 lxsspx vs8, o0, BO // load b0_r lxsspx vs9, o16, BO // load b0_i addi BO, BO, 32 xsmaddadp vs32, vs4, vs16 // a4_r*b0_r xsmaddadp vs33, vs5, vs17 // a4_i*b0_i xsmaddadp vs34, vs4, vs17 // a4_r*b0_i xsmaddadp vs35, vs5, vs16 // a4_i*b0_r .endm .macro KERNEL1x1_E2 xsmaddadp vs32, vs4, vs16 // a4_r*b0_r xsmaddadp vs33, vs5, vs17 // a4_i*b0_i xsmaddadp vs34, vs4, vs17 // a4_r*b0_i xsmaddadp vs35, vs5, vs16 // a4_i*b0_r .endm .macro KERNEL1x1_SUBI1 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 lxsspx vs8, o0, BO // load b0_r lxsspx vs9, o16, BO // load b0_i addi BO, BO, 32 xsmuldp vs32, vs0, vs8 // a0_r*b0_r xsmuldp vs33, vs1, vs9 // a0_i*b0_i xsmuldp vs34, vs0, vs9 // a0_r*b0_i xsmuldp vs35, vs1, vs8 // a0_i*b0_r .endm .macro KERNEL1x1_SUB1 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 lxsspx vs8, o0, BO // load b0_r lxsspx vs9, o16, BO // load b0_i addi BO, BO, 32 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r xsmaddadp vs33, vs1, vs9 // a0_i*b0_i xsmaddadp vs34, vs0, vs9 // a0_r*b0_i xsmaddadp vs35, vs1, vs8 // a0_i*b0_r .endm .macro SAVE1x1 mr T1, CO xxlxor vs24, vs24, vs24 // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 #ifndef TRMMKERNEL lxsspx vs0, o0, T2 // load c0_r lxsspx vs1, o4, T2 // load c0_i #else xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 #endif XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xsadddp vs0, vs0, vs20 xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r stxsspx vs1, o4, T2 // store c0_i addi T2, T2, 8 add T1, T1, LDC addi CO, CO, 8 .endm OpenBLAS-0.2.20/kernel/power/cgemm_tcopy_8_power8.S000066400000000000000000000153131313527062700217500ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/04/23 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "def_vsx.h" #define M r3 #define N r4 #define A r5 #define LDA r6 #define B r7 #define A0 r8 #define A1 r9 #define A2 r10 #define A3 r11 #define J r12 #define PREA r14 #define PREB r15 #define BO r16 #define B8 r17 #define B4 r18 #define B2 r19 #define B1 r20 #define o4 r21 #define T2 r22 #define I r23 #define o16 r24 #define o32 r25 #define o48 r26 #define NOTUS2 r27 #define M8 r30 #define T1 r31 #define o0 0 #include "cgemm_tcopy_macros_8_power8.S" #define STACKSIZE 384 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) std r14, 280(SP) cmpwi cr0, M, 0 ble- L999 cmpwi cr0, N, 0 ble- L999 slwi LDA, LDA, ZBASE_SHIFT slwi M8, M, 3 + ZBASE_SHIFT li T2, -8 li PREA, -4 li PREB, -2 and B4, N, T2 and B2, N, PREA and B1, N, PREB mullw B4, B4, M mullw B2, B2, M mullw B1, B1, M slwi B4, B4, ZBASE_SHIFT slwi B2, B2, ZBASE_SHIFT slwi B1, B1, ZBASE_SHIFT add B4, B4, B add B2, B2, B add B1, B1, B li PREA, 384 addi PREB, M8, 128 li o4, 4 li o16, 16 li o32, 32 li o48, 48 #include "cgemm_tcopy_logic_8_power8.S" L999: li r3, 0 ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/cgemm_tcopy_logic_8_power8.S000066400000000000000000000077211313527062700231310ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/04/23 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ srawi. I, M, 2 ble CCOPYT_L2_BEGIN CCOPYT_L4_BEGIN: mr A0, A add A1, A0, LDA add A2, A1, LDA add A3, A2, LDA add A, A3, LDA mr B8, B addi B, B, 64*SIZE sradi. J, N, 3 ble CCOPYT_L4x4_BEGIN mr BO, B8 CCOPYT_L4x8_LOOP: dcbt A0, PREA dcbt A1, PREA dcbt A2, PREA dcbt A3, PREA dcbtst BO, M8 dcbtst BO, PREB COPY_4x8 add BO, BO, M8 addic. J, J, -1 ble CCOPYT_L4x4_BEGIN COPY_4x8 add BO, BO, M8 addic. J, J, -1 bgt CCOPYT_L4x8_LOOP CCOPYT_L4x4_BEGIN: andi. T1, N, 4 ble CCOPYT_L4x2_BEGIN mr BO, B4 COPY_4x4 addi B4, B4, 32*SIZE CCOPYT_L4x2_BEGIN: andi. T1, N, 2 ble CCOPYT_L4x1_BEGIN mr BO, B2 COPY_4x2 addi B2, B2, 16*SIZE CCOPYT_L4x1_BEGIN: andi. T1, N, 1 ble CCOPYT_L4_END mr BO, B1 COPY_4x1 addi B1, B1, 8*SIZE CCOPYT_L4_END: addic. I, I, -1 bgt CCOPYT_L4_BEGIN CCOPYT_L2_BEGIN: andi. T1, M, 2 ble CCOPYT_L1_BEGIN mr A0, A add A1, A0, LDA add A, A1, LDA mr B8, B addi B, B, 32*SIZE sradi. J, N, 3 ble CCOPYT_L2x4_BEGIN mr BO, B8 CCOPYT_L2x8_LOOP: COPY_2x8 add BO, BO, M8 addic. J, J, -1 bgt CCOPYT_L2x8_LOOP CCOPYT_L2x4_BEGIN: andi. T1, N, 4 ble CCOPYT_L2x2_BEGIN mr BO, B4 COPY_2x4 addi B4, B4, 16*SIZE CCOPYT_L2x2_BEGIN: andi. T1, N, 2 ble CCOPYT_L2x1_BEGIN mr BO, B2 COPY_2x2 addi B2, B2, 8*SIZE CCOPYT_L2x1_BEGIN: andi. T1, N, 1 ble CCOPYT_L2_END mr BO, B1 COPY_2x1 addi B1, B1, 4*SIZE CCOPYT_L2_END: CCOPYT_L1_BEGIN: andi. T1, M, 1 ble L999 mr A0, A add A, A0, LDA mr B8, B addi B, B, 16*SIZE sradi. J, N, 3 ble CCOPYT_L1x4_BEGIN mr BO, B8 CCOPYT_L1x8_LOOP: COPY_1x8 add BO, BO, M8 addic. J, J, -1 bgt CCOPYT_L1x8_LOOP CCOPYT_L1x4_BEGIN: andi. T1, N, 4 ble CCOPYT_L1x2_BEGIN mr BO, B4 COPY_1x4 addi B4, B4, 8*SIZE CCOPYT_L1x2_BEGIN: andi. T1, N, 2 ble CCOPYT_L1x1_BEGIN mr BO, B2 COPY_1x2 addi B2, B2, 4*SIZE CCOPYT_L1x1_BEGIN: andi. T1, N, 1 ble CCOPYT_L1_END mr BO, B1 COPY_1x1 addi B1, B1, 2*SIZE CCOPYT_L1_END: OpenBLAS-0.2.20/kernel/power/cgemm_tcopy_macros_8_power8.S000066400000000000000000000206661313527062700233230ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/04/23 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ .macro COPY_4x8 lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 lxvw4x vs34, o32, A0 lxvw4x vs35, o48, A0 lxvw4x vs36, o0, A1 lxvw4x vs37, o16, A1 lxvw4x vs38, o32, A1 lxvw4x vs39, o48, A1 addi A0, A0, 64 addi A1, A1, 64 lxvw4x vs40, o0, A2 lxvw4x vs41, o16, A2 lxvw4x vs42, o32, A2 lxvw4x vs43, o48, A2 lxvw4x vs44, o0, A3 lxvw4x vs45, o16, A3 lxvw4x vs46, o32, A3 lxvw4x vs47, o48, A3 mr T1, BO addi A2, A2, 64 addi A3, A3, 64 stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 addi T1, T1, 64 stxvw4x vs36, o0, T1 stxvw4x vs37, o16, T1 stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 addi T1, T1, 64 stxvw4x vs40, o0, T1 stxvw4x vs41, o16, T1 stxvw4x vs42, o32, T1 stxvw4x vs43, o48, T1 addi T1, T1, 64 stxvw4x vs44, o0, T1 stxvw4x vs45, o16, T1 stxvw4x vs46, o32, T1 stxvw4x vs47, o48, T1 .endm /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ .macro COPY_4x4 lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 addi A0, A0, 32 lxvw4x vs34, o0, A1 lxvw4x vs35, o16, A1 addi A1, A1, 32 lxvw4x vs36, o0, A2 lxvw4x vs37, o16, A2 addi A2, A2, 32 lxvw4x vs38, o0, A3 lxvw4x vs39, o16, A3 addi A3, A3, 32 mr T1, BO stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 addi T1, T1, 64 stxvw4x vs36, o0, T1 stxvw4x vs37, o16, T1 stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 .endm /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ .macro COPY_4x2 lxvw4x vs32, o0, A0 addi A0, A0, 16 lxvw4x vs33, o0, A1 addi A1, A1, 16 lxvw4x vs34, o0, A2 addi A2, A2, 16 lxvw4x vs35, o0, A3 addi A3, A3, 16 mr T1, BO stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 .endm /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ .macro COPY_4x1 lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 addi A0, A0, 8 lxsspx vs34, o0, A1 lxsspx vs35, o4, A1 addi A1, A1, 8 lxsspx vs36, o0, A2 lxsspx vs37, o4, A2 addi A2, A2, 8 lxsspx vs38, o0, A3 lxsspx vs39, o4, A3 addi A3, A3, 8 mr T1, BO stxsspx vs32, o0, T1 
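// the scalar stores below interleave one complex element (real/imaginary
// float pair) from each of the four source rows A0..A3 into consecutive
// 8-byte slots of the packed buffer at BO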
stxsspx vs33, o4, T1 addi T1, T1, 8 stxsspx vs34, o0, T1 stxsspx vs35, o4, T1 addi T1, T1, 8 stxsspx vs36, o0, T1 stxsspx vs37, o4, T1 addi T1, T1, 8 stxsspx vs38, o0, T1 stxsspx vs39, o4, T1 .endm /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ .macro COPY_2x8 lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 lxvw4x vs34, o32, A0 lxvw4x vs35, o48, A0 addi A0, A0, 64 lxvw4x vs36, o0, A1 lxvw4x vs37, o16, A1 lxvw4x vs38, o32, A1 lxvw4x vs39, o48, A1 addi A1, A1, 64 mr T1, BO stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 addi T1, T1, 64 stxvw4x vs36, o0, T1 stxvw4x vs37, o16, T1 stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 .endm /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ .macro COPY_2x4 lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 addi A0, A0, 32 lxvw4x vs34, o0, A1 lxvw4x vs35, o16, A1 addi A1, A1, 32 mr T1, BO stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 .endm /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ .macro COPY_2x2 lxvw4x vs32, o0, A0 addi A0, A0, 16 lxvw4x vs33, o0, A1 addi A1, A1, 16 mr T1, BO stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 .endm /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ .macro COPY_2x1 lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 addi A0, A0, 8 lxsspx vs34, o0, A1 lxsspx vs35, o4, A1 addi A1, A1, 8 mr T1, BO stxsspx vs32, o0, T1 stxsspx vs33, o4, T1 addi T1, T1, 8 stxsspx vs34, o0, T1 stxsspx vs35, o4, T1 .endm /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ .macro COPY_1x8 lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 lxvw4x vs34, o32, A0 lxvw4x vs35, o48, A0 addi A0, A0, 64 mr T1, BO stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 .endm /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ .macro COPY_1x4 lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 addi A0, A0, 32 mr T1, BO stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 .endm /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ .macro COPY_1x2 lxvw4x vs32, o0, A0 addi A0, A0, 16 mr T1, BO stxvw4x vs32, o0, T1 .endm /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ .macro COPY_1x1 lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 addi A0, A0, 8 mr T1, BO stxsspx vs32, o0, T1 stxsspx vs33, o4, T1 .endm 
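/*
 * Reference sketch only, not part of the build: the COPY_NxM macros above
 * pack an N-row by M-column tile of single-precision complex A into a
 * contiguous block of the packed buffer.  COPY_4x8, for example, is roughly
 * equivalent to the following scalar C helper (treating each complex element
 * as two consecutive floats); the vector version simply moves the same data
 * 16 bytes at a time with lxvw4x/stxvw4x:
 *
 *     static void copy_4x8_ref(const float *a_row[4], float *bo)
 *     {
 *         for (int r = 0; r < 4; r++)          // four source rows
 *             for (int k = 0; k < 16; k++)     // 8 complex = 16 floats per row
 *                 bo[16 * r + k] = a_row[r][k];
 *     }
 */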
OpenBLAS-0.2.20/kernel/power/cnrm2.S000066400000000000000000000211721313527062700167300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PREA r8 #define INCXM1 r9 #define FZERO 144(SP) #define FONE 148(SP) #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r10, 0 lis r11, 0x3f80 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r10, FZERO stw r11, FONE lfs f1, FZERO #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, ZBASE_SHIFT subi INCXM1, INCX, SIZE li PREA, 4 * 16 * SIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) fmr f0, f1 fmr f2, f1 fmr f3, f1 fmr f4, f1 fmr f5, f1 fmr f6, f1 fmr f7, f1 fmr f8, f1 fmr f9, f1 fmr f10, f1 fmr f11, f1 fmr f12, f1 fmr f13, f1 fmr f14, f1 fmr f15, f1 cmpwi cr0, INCX, 2 * SIZE bne- cr0, LL(1000) srawi. 
r0, N, 3 mtspr CTR, r0 beq- cr0, LL(150) LFD f16, 0 * SIZE(X) LFD f17, 1 * SIZE(X) LFD f18, 2 * SIZE(X) LFD f19, 3 * SIZE(X) LFD f20, 4 * SIZE(X) LFD f21, 5 * SIZE(X) LFD f22, 6 * SIZE(X) LFD f23, 7 * SIZE(X) LFD f24, 8 * SIZE(X) LFD f25, 9 * SIZE(X) LFD f26, 10 * SIZE(X) LFD f27, 11 * SIZE(X) LFD f28, 12 * SIZE(X) LFD f29, 13 * SIZE(X) LFD f30, 14 * SIZE(X) LFD f31, 15 * SIZE(X) bdz LL(120) .align 4 LL(110): fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 fmadd f2, f18, f18, f2 fmadd f3, f19, f19, f3 LFD f16, 16 * SIZE(X) LFD f17, 17 * SIZE(X) LFD f18, 18 * SIZE(X) LFD f19, 19 * SIZE(X) fmadd f4, f20, f20, f4 fmadd f5, f21, f21, f5 fmadd f6, f22, f22, f6 fmadd f7, f23, f23, f7 LFD f20, 20 * SIZE(X) LFD f21, 21 * SIZE(X) LFD f22, 22 * SIZE(X) LFD f23, 23 * SIZE(X) fmadd f8, f24, f24, f8 fmadd f9, f25, f25, f9 fmadd f10, f26, f26, f10 fmadd f11, f27, f27, f11 LFD f24, 24 * SIZE(X) LFD f25, 25 * SIZE(X) LFD f26, 26 * SIZE(X) LFD f27, 27 * SIZE(X) fmadd f12, f28, f28, f12 fmadd f13, f29, f29, f13 fmadd f14, f30, f30, f14 fmadd f15, f31, f31, f15 LFD f28, 28 * SIZE(X) LFD f29, 29 * SIZE(X) LFD f30, 30 * SIZE(X) LFD f31, 31 * SIZE(X) #ifndef POWER6 L1_PREFETCH X, PREA #endif addi X, X, 16 * SIZE #ifdef POWER6 L1_PREFETCH X, PREA #endif bdnz LL(110) .align 4 LL(120): fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 fmadd f2, f18, f18, f2 fmadd f3, f19, f19, f3 fmadd f4, f20, f20, f4 fmadd f5, f21, f21, f5 fmadd f6, f22, f22, f6 fmadd f7, f23, f23, f7 fmadd f8, f24, f24, f8 fmadd f9, f25, f25, f9 fmadd f10, f26, f26, f10 fmadd f11, f27, f27, f11 fmadd f12, f28, f28, f12 fmadd f13, f29, f29, f13 fmadd f14, f30, f30, f14 fmadd f15, f31, f31, f15 addi X, X, 16 * SIZE .align 4 LL(150): andi. r0, N, 7 mtspr CTR, r0 beq- cr0, LL(170) .align 4 LL(160): LFD f16, 0 * SIZE(X) LFD f17, 1 * SIZE(X) addi X, X, 2 * SIZE fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 bdnz LL(160) .align 4 LL(170): fadd f0, f0, f1 fadd f2, f2, f3 fadd f4, f4, f5 fadd f6, f6, f7 fadd f8, f8, f9 fadd f10, f10, f11 fadd f12, f12, f13 fadd f14, f14, f15 fadd f0, f0, f2 fadd f4, f4, f6 fadd f8, f8, f10 fadd f12, f12, f14 fadd f0, f0, f4 fadd f8, f8, f12 fadd f0, f0, f8 fsqrt f1, f0 b LL(9999) .align 4 LL(1000): sub X, X, INCXM1 srawi. 
r0, N, 3 mtspr CTR, r0 beq- cr0, LL(1150) LFDX f16, X, INCXM1 LFDUX f17, X, INCX LFDX f18, X, INCXM1 LFDUX f19, X, INCX LFDX f20, X, INCXM1 LFDUX f21, X, INCX LFDX f22, X, INCXM1 LFDUX f23, X, INCX LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX bdz LL(1120) .align 4 LL(1110): fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 fmadd f2, f18, f18, f2 fmadd f3, f19, f19, f3 LFDX f16, X, INCXM1 LFDUX f17, X, INCX LFDX f18, X, INCXM1 LFDUX f19, X, INCX fmadd f4, f20, f20, f4 fmadd f5, f21, f21, f5 fmadd f6, f22, f22, f6 fmadd f7, f23, f23, f7 LFDX f20, X, INCXM1 LFDUX f21, X, INCX LFDX f22, X, INCXM1 LFDUX f23, X, INCX fmadd f8, f24, f24, f8 fmadd f9, f25, f25, f9 fmadd f10, f26, f26, f10 fmadd f11, f27, f27, f11 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX fmadd f12, f28, f28, f12 fmadd f13, f29, f29, f13 fmadd f14, f30, f30, f14 fmadd f15, f31, f31, f15 LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX bdnz LL(1110) .align 4 LL(1120): fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 fmadd f2, f18, f18, f2 fmadd f3, f19, f19, f3 fmadd f4, f20, f20, f4 fmadd f5, f21, f21, f5 fmadd f6, f22, f22, f6 fmadd f7, f23, f23, f7 fmadd f8, f24, f24, f8 fmadd f9, f25, f25, f9 fmadd f10, f26, f26, f10 fmadd f11, f27, f27, f11 fmadd f12, f28, f28, f12 fmadd f13, f29, f29, f13 fmadd f14, f30, f30, f14 fmadd f15, f31, f31, f15 .align 4 LL(1150): andi. r0, N, 7 mtspr CTR, r0 beq- cr0, LL(1170) .align 4 LL(1160): LFDX f16, X, INCXM1 LFDUX f17, X, INCX fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 bdnz LL(1160) .align 4 LL(1170): fadd f0, f0, f1 fadd f2, f2, f3 fadd f4, f4, f5 fadd f6, f6, f7 fadd f8, f8, f9 fadd f10, f10, f11 fadd f12, f12, f13 fadd f14, f14, f15 fadd f0, f0, f2 fadd f4, f4, f6 fadd f8, f8, f10 fadd f12, f12, f14 fadd f0, f0, f4 fadd f8, f8, f12 fadd f0, f0, f8 fsqrt f1, f0 .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/cnrm2_hummer.S000066400000000000000000000363371313527062700203160ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define INCX2 r6 #define X2 r7 #define C1 f1 #define C2 f0 #define C3 f2 #define C4 f3 #define C5 f4 #define C6 f5 #define C7 f6 #define C8 f7 #define A1 f8 #define A2 f9 #define A3 f10 #define A4 f11 #define A5 f12 #define A6 f13 #define A7 f14 #define A8 f15 #define A9 f16 #define A10 f17 #define A11 f18 #define A12 f19 #define A13 f20 #define A14 f21 #define A15 f22 #define A16 f23 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 stfpdux f19, SP, r10 stfpdux f20, SP, r10 stfpdux f21, SP, r10 stfpdux f22, SP, r10 stfpdux f23, SP, r10 li r10, 0 stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif lfpdx C1, SP, r10 # Zero clear slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX fpmr C2, C1 fpmr C3, C1 fpmr C4, C1 fpmr C5, C1 fpmr C6, C1 fpmr C7, C1 fpmr C8, C1 cmpwi cr0, N, 0 ble LL(99) cmpwi cr0, INCX, 0 ble LL(99) andi. r0, X, 2 * SIZE - 1 bne LL(100) srawi. r0, N, 4 sub X, X, INCX2 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 LFPDUX A6, X, INCX2 LFPDUX A7, X, INCX2 LFPDUX A8, X, INCX2 LFPDUX A9, X, INCX2 LFPDUX A10, X, INCX2 LFPDUX A11, X, INCX2 LFPDUX A12, X, INCX2 LFPDUX A13, X, INCX2 LFPDUX A14, X, INCX2 LFPDUX A15, X, INCX2 LFPDUX A16, X, INCX2 bdz LL(13) .align 4 LL(12): fpmadd C1, A1, A1, C1 LFPDUX A1, X, INCX2 fpmadd C2, A2, A2, C2 LFPDUX A2, X, INCX2 fpmadd C3, A3, A3, C3 LFPDUX A3, X, INCX2 fpmadd C4, A4, A4, C4 LFPDUX A4, X, INCX2 fpmadd C5, A5, A5, C5 LFPDUX A5, X, INCX2 fpmadd C6, A6, A6, C6 LFPDUX A6, X, INCX2 fpmadd C7, A7, A7, C7 LFPDUX A7, X, INCX2 fpmadd C8, A8, A8, C8 LFPDUX A8, X, INCX2 fpmadd C1, A9, A9, C1 LFPDUX A9, X, INCX2 fpmadd C2, A10, A10, C2 LFPDUX A10, X, INCX2 fpmadd C3, A11, A11, C3 LFPDUX A11, X, INCX2 fpmadd C4, A12, A12, C4 LFPDUX A12, X, INCX2 fpmadd C5, A13, A13, C5 LFPDUX A13, X, INCX2 fpmadd C6, A14, A14, C6 LFPDUX A14, X, INCX2 fpmadd C7, A15, A15, C7 LFPDUX A15, X, INCX2 fpmadd C8, A16, A16, C8 LFPDUX A16, X, INCX2 bdnz LL(12) .align 4 LL(13): fpmadd C1, A1, A1, C1 fpmadd C2, A2, A2, C2 fpmadd C3, A3, A3, C3 fpmadd C4, A4, A4, C4 fpmadd C5, A5, A5, C5 fpmadd C6, A6, A6, C6 fpmadd C7, A7, A7, C7 fpmadd C8, A8, A8, C8 fpmadd C1, A9, A9, C1 fpmadd C2, A10, A10, C2 fpmadd C3, A11, A11, C3 fpmadd C4, A12, A12, C4 fpmadd C5, A13, A13, C5 fpmadd C6, A14, A14, C6 fpmadd C7, A15, A15, C7 fpmadd C8, A16, A16, C8 .align 4 LL(15): andi. 
r0, N, 15 beq LL(98) andi. r0, N, 8 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 LFPDUX A6, X, INCX2 LFPDUX A7, X, INCX2 LFPDUX A8, X, INCX2 fpmadd C1, A1, A1, C1 fpmadd C2, A2, A2, C2 fpmadd C3, A3, A3, C3 fpmadd C4, A4, A4, C4 fpmadd C5, A5, A5, C5 fpmadd C6, A6, A6, C6 fpmadd C7, A7, A7, C7 fpmadd C8, A8, A8, C8 .align 4 LL(16): andi. r0, N, 4 beq LL(17) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 fpmadd C1, A1, A1, C1 fpmadd C2, A2, A2, C2 fpmadd C3, A3, A3, C3 fpmadd C4, A4, A4, C4 .align 4 LL(17): andi. r0, N, 2 beq LL(18) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 fpmadd C1, A1, A1, C1 fpmadd C2, A2, A2, C2 .align 4 LL(18): andi. r0, N, 1 beq LL(98) LFPDUX A1, X, INCX2 fpmadd C3, A1, A1, C3 .align 4 LL(98): fpadd C1, C1, C5 lis r3, 0x3f00 fpadd C2, C2, C6 lis r4, 0x4040 fpadd C3, C3, C7 stw r3, 4(SP) fpadd C4, C4, C8 stw r4, 8(SP) fpadd C1, C1, C2 lfs f10, 0(SP) fpadd C3, C3, C4 lfs f11, 4(SP) fpadd C1, C1, C3 lfs f12, 8(SP) fsmtp C2, C1 fadd C1, C2, C1 fcmpu cr0, f10, C1 beq cr0, LL(99) #ifndef HUMMER_EMULATOR frsqrte f9, f1 li r10, 16 fmul f2, f1, f9 lfpdux f23, SP, r10 fmul f3, f9, f11 lfpdux f22, SP, r10 fnmsub f4, f2, f9, f12 lfpdux f21, SP, r10 fmul f9, f3, f4 lfpdux f20, SP, r10 fadd f13, f11, f11 lfpdux f19, SP, r10 fmul f12, f1, f9 lfpdux f18, SP, r10 fmul f11, f12, f11 lfpdux f17, SP, r10 fnmsub f1, f12, f9, f13 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 fmadd f1, f11, f1, f12 blr #else fsqrt f1, f1 li r10, 16 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr #endif .align 4 LL(99): li r10, 16 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr .align 4 LL(100): cmpwi cr0, INCX, SIZE bne LL(200) LFD C1, 0(X) addi X, X, 1 * SIZE addi N, N, -1 cmpwi cr0, N, 0 fmul C1, C1, C1 sub X, X, INCX2 ble LL(198) srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(115) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 LFPDUX A6, X, INCX2 LFPDUX A7, X, INCX2 LFPDUX A8, X, INCX2 LFPDUX A9, X, INCX2 LFPDUX A10, X, INCX2 LFPDUX A11, X, INCX2 LFPDUX A12, X, INCX2 LFPDUX A13, X, INCX2 LFPDUX A14, X, INCX2 LFPDUX A15, X, INCX2 LFPDUX A16, X, INCX2 bdz LL(113) .align 4 LL(112): fpmadd C1, A1, A1, C1 LFPDUX A1, X, INCX2 fpmadd C2, A2, A2, C2 LFPDUX A2, X, INCX2 fpmadd C3, A3, A3, C3 LFPDUX A3, X, INCX2 fpmadd C4, A4, A4, C4 LFPDUX A4, X, INCX2 fpmadd C5, A5, A5, C5 LFPDUX A5, X, INCX2 fpmadd C6, A6, A6, C6 LFPDUX A6, X, INCX2 fpmadd C7, A7, A7, C7 LFPDUX A7, X, INCX2 fpmadd C8, A8, A8, C8 LFPDUX A8, X, INCX2 fpmadd C1, A9, A9, C1 LFPDUX A9, X, INCX2 fpmadd C2, A10, A10, C2 LFPDUX A10, X, INCX2 fpmadd C3, A11, A11, C3 LFPDUX A11, X, INCX2 fpmadd C4, A12, A12, C4 LFPDUX A12, X, INCX2 fpmadd C5, A13, A13, C5 LFPDUX A13, X, INCX2 fpmadd C6, A14, A14, C6 LFPDUX A14, X, INCX2 fpmadd C7, A15, A15, C7 LFPDUX A15, X, INCX2 fpmadd C8, A16, A16, C8 LFPDUX A16, X, INCX2 bdnz LL(112) .align 4 LL(113): fpmadd C1, A1, A1, C1 fpmadd C2, A2, A2, C2 fpmadd C3, A3, A3, C3 fpmadd C4, A4, A4, C4 fpmadd C5, A5, A5, C5 fpmadd C6, A6, A6, C6 fpmadd C7, A7, A7, C7 fpmadd C8, A8, A8, C8 fpmadd C1, A9, A9, C1 fpmadd C2, A10, A10, C2 fpmadd C3, A11, A11, C3 fpmadd C4, A12, A12, C4 fpmadd C5, A13, A13, C5 fpmadd C6, A14, A14, C6 fpmadd C7, A15, A15, C7 fpmadd C8, A16, A16, C8 .align 4 LL(115): andi. r0, N, 15 beq LL(198) andi. r0, N, 8 beq LL(116) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 LFPDUX A6, X, INCX2 LFPDUX A7, X, INCX2 LFPDUX A8, X, INCX2 fpmadd C1, A1, A1, C1 fpmadd C2, A2, A2, C2 fpmadd C3, A3, A3, C3 fpmadd C4, A4, A4, C4 fpmadd C5, A5, A5, C5 fpmadd C6, A6, A6, C6 fpmadd C7, A7, A7, C7 fpmadd C8, A8, A8, C8 .align 4 LL(116): andi. r0, N, 4 beq LL(117) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 fpmadd C1, A1, A1, C1 fpmadd C2, A2, A2, C2 fpmadd C3, A3, A3, C3 fpmadd C4, A4, A4, C4 .align 4 LL(117): andi. r0, N, 2 beq LL(118) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 fpmadd C1, A1, A1, C1 fpmadd C2, A2, A2, C2 .align 4 LL(118): andi. 
r0, N, 1 beq LL(198) LFPDUX A1, X, INCX2 fpmadd C3, A1, A1, C3 .align 4 LL(198): LFDX A1, X, INCX2 fmadd C4, A1, A1, C4 fpadd C1, C1, C5 lis r3, 0x3f00 fpadd C2, C2, C6 lis r4, 0x4040 fpadd C3, C3, C7 stw r3, 4(SP) fpadd C4, C4, C8 stw r4, 8(SP) fpadd C1, C1, C2 lfs f10, 0(SP) fpadd C3, C3, C4 lfs f11, 4(SP) fpadd C1, C1, C3 lfs f12, 8(SP) fsmtp C2, C1 fadd C1, C2, C1 fcmpu cr0, f10, C1 beq cr0, LL(199) #ifndef HUMMER_EMULATOR frsqrte f9, f1 li r10, 16 fmul f2, f1, f9 lfpdux f23, SP, r10 fmul f3, f9, f11 lfpdux f22, SP, r10 fnmsub f4, f2, f9, f12 lfpdux f21, SP, r10 fmul f9, f3, f4 lfpdux f20, SP, r10 fadd f13, f11, f11 lfpdux f19, SP, r10 fmul f12, f1, f9 lfpdux f18, SP, r10 fmul f11, f12, f11 lfpdux f17, SP, r10 fnmsub f1, f12, f9, f13 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 fmadd f1, f11, f1, f12 blr #else fsqrt f1, f1 li r10, 16 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr #endif .align 4 LL(199): li r10, 16 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr .align 4 LL(200): sub X, X, INCX2 addi X2, X, SIZE srawi. r0, N, 3 mtspr CTR, r0 beq- LL(215) LFDUX A1, X, INCX2 LFDUX A2, X2, INCX2 LFDUX A3, X, INCX2 LFDUX A4, X2, INCX2 LFDUX A5, X, INCX2 LFDUX A6, X2, INCX2 LFDUX A7, X, INCX2 LFDUX A8, X2, INCX2 LFDUX A9, X, INCX2 LFDUX A10, X2, INCX2 LFDUX A11, X, INCX2 LFDUX A12, X2, INCX2 LFDUX A13, X, INCX2 LFDUX A14, X2, INCX2 LFDUX A15, X, INCX2 LFDUX A16, X2, INCX2 bdz LL(213) .align 4 LL(212): fmadd C1, A1, A1, C1 LFDUX A1, X, INCX2 fmadd C2, A2, A2, C2 LFDUX A2, X2, INCX2 fmadd C3, A3, A3, C3 LFDUX A3, X, INCX2 fmadd C4, A4, A4, C4 LFDUX A4, X2, INCX2 fmadd C5, A5, A5, C5 LFDUX A5, X, INCX2 fmadd C6, A6, A6, C6 LFDUX A6, X2, INCX2 fmadd C7, A7, A7, C7 LFDUX A7, X, INCX2 fmadd C8, A8, A8, C8 LFDUX A8, X2, INCX2 fmadd C1, A9, A9, C1 LFDUX A9, X, INCX2 fmadd C2, A10, A10, C2 LFDUX A10, X2, INCX2 fmadd C3, A11, A11, C3 LFDUX A11, X, INCX2 fmadd C4, A12, A12, C4 LFDUX A12, X2, INCX2 fmadd C5, A13, A13, C5 LFDUX A13, X, INCX2 fmadd C6, A14, A14, C6 LFDUX A14, X2, INCX2 fmadd C7, A15, A15, C7 LFDUX A15, X, INCX2 fmadd C8, A16, A16, C8 LFDUX A16, X2, INCX2 bdnz LL(212) .align 4 LL(213): fmadd C1, A1, A1, C1 fmadd C2, A2, A2, C2 fmadd C3, A3, A3, C3 fmadd C4, A4, A4, C4 fmadd C5, A5, A5, C5 fmadd C6, A6, A6, C6 fmadd C7, A7, A7, C7 fmadd C8, A8, A8, C8 fmadd C1, A9, A9, C1 fmadd C2, A10, A10, C2 fmadd C3, A11, A11, C3 fmadd C4, A12, A12, C4 fmadd C5, A13, A13, C5 fmadd C6, A14, A14, C6 fmadd C7, A15, A15, C7 fmadd C8, A16, A16, C8 .align 4 LL(215): andi. r0, N, 7 beq LL(998) andi. r0, N, 4 beq LL(216) LFDUX A1, X, INCX2 LFDUX A2, X2, INCX2 LFDUX A3, X, INCX2 LFDUX A4, X2, INCX2 LFDUX A5, X, INCX2 LFDUX A6, X2, INCX2 LFDUX A7, X, INCX2 LFDUX A8, X2, INCX2 fmadd C1, A1, A1, C1 fmadd C2, A2, A2, C2 fmadd C3, A3, A3, C3 fmadd C4, A4, A4, C4 fmadd C5, A5, A5, C5 fmadd C6, A6, A6, C6 fmadd C7, A7, A7, C7 fmadd C8, A8, A8, C8 .align 4 LL(216): andi. r0, N, 2 beq LL(217) LFDUX A1, X, INCX2 LFDUX A2, X2, INCX2 LFDUX A3, X, INCX2 LFDUX A4, X2, INCX2 fmadd C1, A1, A1, C1 fmadd C2, A2, A2, C2 fmadd C3, A3, A3, C3 fmadd C4, A4, A4, C4 .align 4 LL(217): andi. 
r0, N, 1 beq LL(998) LFDUX A1, X, INCX2 LFDUX A2, X2, INCX2 fmadd C1, A1, A1, C1 fmadd C2, A2, A2, C2 .align 4 LL(998): fadd C1, C1, C5 lis r3, 0x3f00 fadd C2, C2, C6 lis r4, 0x4040 fadd C3, C3, C7 stw r3, 4(SP) fadd C4, C4, C8 stw r4, 8(SP) fadd C1, C1, C2 lfs f10, 0(SP) fadd C3, C3, C4 lfs f11, 4(SP) fadd C1, C1, C3 lfs f12, 8(SP) fcmpu cr0, f10, C1 beq cr0, LL(99) frsqrte f9, f1 li r10, 16 fmul f2, f1, f9 lfpdux f23, SP, r10 fmul f3, f9, f11 lfpdux f22, SP, r10 fnmsub f4, f2, f9, f12 lfpdux f21, SP, r10 fmul f9, f3, f4 lfpdux f20, SP, r10 fadd f13, f11, f11 lfpdux f19, SP, r10 fmul f12, f1, f9 lfpdux f18, SP, r10 fmul f11, f12, f11 lfpdux f17, SP, r10 fnmsub f1, f12, f9, f13 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 fmadd f1, f11, f1, f12 blr LL(999): li r10, 16 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/cnrm2_ppc440.S000066400000000000000000000151001313527062700200140ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PRE r8 #define INC1 r9 #define FZERO 144(SP) #define FONE 148(SP) #define C1 152(SP) #define C2 156(SP) #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r10, 0 lis r11, 0x3f80 lis r6, 0x3f00 lis r7, 0x4040 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r10, FZERO stw r11, FONE stw r6, C1 stw r7, C2 lfs f1, FZERO #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, ZBASE_SHIFT li INC1, SIZE li PRE, 3 * 16 * SIZE cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 ble- LL(999) fmr f0, f1 sub X, X, INCX fmr f2, f1 fmr f3, f1 fmr f4, f1 fmr f5, f1 fmr f6, f1 fmr f7, f1 fmr f8, f1 fmr f9, f1 fmr f10, f1 fmr f11, f1 fmr f12, f1 fmr f13, f1 fmr f14, f1 fmr f15, f1 srawi. r0, N, 3 mtspr CTR, r0 beq- cr0, LL(1150) LFDUX f16, X, INCX LFDX f17, X, INC1 LFDUX f18, X, INCX LFDX f19, X, INC1 LFDUX f20, X, INCX LFDX f21, X, INC1 LFDUX f22, X, INCX LFDX f23, X, INC1 LFDUX f24, X, INCX LFDX f25, X, INC1 LFDUX f26, X, INCX LFDX f27, X, INC1 LFDUX f28, X, INCX LFDX f29, X, INC1 LFDUX f30, X, INCX LFDX f31, X, INC1 bdz LL(1120) .align 4 LL(1110): fmadd f0, f16, f16, f0 LFDUX f16, X, INCX fmadd f1, f17, f17, f1 LFDX f17, X, INC1 fmadd f2, f18, f18, f2 LFDUX f18, X, INCX fmadd f3, f19, f19, f3 LFDX f19, X, INC1 #ifdef PPCG4 dcbt X, PRE #endif fmadd f4, f20, f20, f4 LFDUX f20, X, INCX fmadd f5, f21, f21, f5 LFDX f21, X, INC1 fmadd f6, f22, f22, f6 LFDUX f22, X, INCX fmadd f7, f23, f23, f7 LFDX f23, X, INC1 fmadd f8, f24, f24, f8 LFDUX f24, X, INCX fmadd f9, f25, f25, f9 LFDX f25, X, INC1 fmadd f10, f26, f26, f10 LFDUX f26, X, INCX fmadd f11, f27, f27, f11 LFDX f27, X, INC1 #ifdef PPCG4 dcbt X, PRE #endif fmadd f12, f28, f28, f12 LFDUX f28, X, INCX fmadd f13, f29, f29, f13 LFDX f29, X, INC1 fmadd f14, f30, f30, f14 LFDUX f30, X, INCX fmadd f15, f31, f31, f15 LFDX f31, X, INC1 bdnz LL(1110) .align 4 LL(1120): fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 fmadd f2, f18, f18, f2 fmadd f3, f19, f19, f3 fmadd f4, f20, f20, f4 fmadd f5, f21, f21, f5 fmadd f6, f22, f22, f6 fmadd f7, f23, f23, f7 fmadd f8, f24, f24, f8 fmadd f9, f25, f25, f9 fmadd f10, f26, f26, f10 fmadd f11, f27, f27, f11 fmadd f12, f28, f28, f12 fmadd f13, f29, f29, f13 fmadd f14, f30, f30, f14 fmadd f15, f31, f31, f15 .align 4 LL(1150): andi. 
r0, N, 7 mtspr CTR, r0 beq- cr0, LL(1170) .align 4 LL(1160): LFDUX f16, X, INCX LFDX f17, X, INC1 fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 bdnz LL(1160) .align 4 LL(1170): fadd f0, f0, f1 fadd f2, f2, f3 fadd f4, f4, f5 fadd f6, f6, f7 fadd f8, f8, f9 fadd f10, f10, f11 fadd f12, f12, f13 fadd f14, f14, f15 fadd f0, f0, f2 fadd f4, f4, f6 fadd f8, f8, f10 fadd f12, f12, f14 fadd f0, f0, f4 fadd f8, f8, f12 fadd f1, f0, f8 lfs f4, FZERO fcmpu cr0, f1, f4 beq cr0, LL(999) frsqrte f0, f1 lfs f8, C1 lfs f9, C2 fmul f2, f1, f0 fadd f7, f8, f8 fmul f3, f0, f8 fnmsub f4, f2, f0, f9 fmul f0, f3, f4 fmul f5, f1, f0 fmul f2, f5, f8 fnmsub f3, f5, f0, f7 fmadd f1, f2, f3, f5 .align 4 LL(999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/copy.S000066400000000000000000000126461313527062700166670ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define Y r6 #define INCY r7 #define PREA r8 #define STACKSIZE 16 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT #ifdef L1_DUALFETCH li PREA, (L1_PREFETCHSIZE) / 2 #else li PREA, (L1_PREFETCHSIZE) #endif cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, SIZE bne- cr0, LL(100) cmpwi cr0, INCY, SIZE bne- cr0, LL(100) srawi. 
r0, N, 4 mtspr CTR, r0 beq- cr0, LL(50) .align 4 LL(10): LFD f0, 0 * SIZE(X) LFD f1, 1 * SIZE(X) LFD f2, 2 * SIZE(X) LFD f3, 3 * SIZE(X) STFD f0, 0 * SIZE(Y) STFD f1, 1 * SIZE(Y) STFD f2, 2 * SIZE(Y) STFD f3, 3 * SIZE(Y) LFD f4, 4 * SIZE(X) LFD f5, 5 * SIZE(X) LFD f6, 6 * SIZE(X) LFD f7, 7 * SIZE(X) STFD f4, 4 * SIZE(Y) STFD f5, 5 * SIZE(Y) STFD f6, 6 * SIZE(Y) STFD f7, 7 * SIZE(Y) LFD f8, 8 * SIZE(X) LFD f9, 9 * SIZE(X) LFD f10, 10 * SIZE(X) LFD f11, 11 * SIZE(X) STFD f8, 8 * SIZE(Y) STFD f9, 9 * SIZE(Y) STFD f10, 10 * SIZE(Y) STFD f11, 11 * SIZE(Y) LFD f12, 12 * SIZE(X) LFD f13, 13 * SIZE(X) LFD f14, 14 * SIZE(X) LFD f15, 15 * SIZE(X) STFD f12, 12 * SIZE(Y) STFD f13, 13 * SIZE(Y) STFD f14, 14 * SIZE(Y) STFD f15, 15 * SIZE(Y) #ifndef POWER6 dcbtst Y, PREA #ifdef L1_DUALFETCH dcbt X, PREA #endif #endif addi X, X, 16 * SIZE addi Y, Y, 16 * SIZE #ifdef POWER6 dcbtst Y, PREA L1_PREFETCH X, PREA #endif bdnz LL(10) .align 4 LL(50): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) addi X, X, 1 * SIZE STFD f8, 0 * SIZE(Y) addi Y, Y, 1 * SIZE bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCX sub Y, Y, INCY srawi. r0, N, 4 mtspr CTR, r0 beq- LL(150) .align 4 LL(110): LFDUX f0, X, INCX LFDUX f1, X, INCX LFDUX f2, X, INCX LFDUX f3, X, INCX LFDUX f4, X, INCX LFDUX f5, X, INCX LFDUX f6, X, INCX LFDUX f7, X, INCX LFDUX f8, X, INCX LFDUX f9, X, INCX LFDUX f10, X, INCX LFDUX f11, X, INCX LFDUX f12, X, INCX LFDUX f13, X, INCX LFDUX f14, X, INCX LFDUX f15, X, INCX STFDUX f0, Y, INCY STFDUX f1, Y, INCY STFDUX f2, Y, INCY STFDUX f3, Y, INCY STFDUX f4, Y, INCY STFDUX f5, Y, INCY STFDUX f6, Y, INCY STFDUX f7, Y, INCY STFDUX f8, Y, INCY STFDUX f9, Y, INCY STFDUX f10, Y, INCY STFDUX f11, Y, INCY STFDUX f12, Y, INCY STFDUX f13, Y, INCY STFDUX f14, Y, INCY STFDUX f15, Y, INCY bdnz LL(110) .align 4 LL(150): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX STFDUX f8, Y, INCY bdnz LL(160) .align 4 LL(999): lfd f14, 0(SP) lfd f15, 8(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/copy_hummer.S000066400000000000000000000402361313527062700202400ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define Y r6 #define INCY r7 #define INCX2 r8 #define INCY2 r9 #define X2 r10 #define Y2 r11 #define A1 f0 #define A2 f1 #define A3 f2 #define A4 f3 #define A5 f4 #define A6 f5 #define A7 f6 #define A8 f7 #define A9 f8 #define T1 f9 #define T2 f10 #define T3 f11 #define T4 f12 #define T5 f13 #define T6 f14 #define T7 f15 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT add INCX2, INCX, INCX add INCY2, INCY, INCY cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, INCY, SIZE bne LL(60) cmpwi cr0, INCX, SIZE bne LL(50) sub X, X, INCX2 sub Y, Y, INCY2 andi. r0, X, 2 * SIZE - 1 bne LL(30) andi. r0, Y, 2 * SIZE - 1 bne LL(20) .align 4 LL(10): /* X : aligned Y : aligned */ srawi. r0, N, 4 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 LFPDUX A6, X, INCX2 LFPDUX A7, X, INCX2 LFPDUX A8, X, INCX2 bdz LL(13) .align 4 LL(12): STFPDUX A1, Y, INCY2 LFPDUX A1, X, INCX2 STFPDUX A2, Y, INCY2 LFPDUX A2, X, INCX2 STFPDUX A3, Y, INCY2 LFPDUX A3, X, INCX2 STFPDUX A4, Y, INCY2 LFPDUX A4, X, INCX2 STFPDUX A5, Y, INCY2 LFPDUX A5, X, INCX2 STFPDUX A6, Y, INCY2 LFPDUX A6, X, INCX2 STFPDUX A7, Y, INCY2 LFPDUX A7, X, INCX2 STFPDUX A8, Y, INCY2 LFPDUX A8, X, INCX2 bdnz LL(12) .align 4 LL(13): STFPDUX A1, Y, INCY2 STFPDUX A2, Y, INCY2 STFPDUX A3, Y, INCY2 STFPDUX A4, Y, INCY2 STFPDUX A5, Y, INCY2 STFPDUX A6, Y, INCY2 STFPDUX A7, Y, INCY2 STFPDUX A8, Y, INCY2 .align 4 LL(15): andi. r0, N, 15 beq LL(999) andi. r0, N, 8 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 STFPDUX A1, Y, INCY2 STFPDUX A2, Y, INCY2 STFPDUX A3, Y, INCY2 STFPDUX A4, Y, INCY2 .align 4 LL(16): andi. r0, N, 4 beq LL(17) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 STFPDUX A1, Y, INCY2 STFPDUX A2, Y, INCY2 .align 4 LL(17): andi. r0, N, 2 beq LL(18) LFPDUX A1, X, INCX2 STFPDUX A1, Y, INCY2 .align 4 LL(18): andi. r0, N, 1 beq LL(999) LFDUX A1, X, INCX2 STFDUX A1, Y, INCY2 .align 4 b LL(999) .align 4 LL(20): /* X ): aligned Y ): unaligned */ LFXDUX A1, X, INCX2 addi N, N, -1 cmpwi cr0, N, 0 STFSDX A1, Y, INCY2 add Y, Y, INCY ble LL(999) .align 4 srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(25) LFXDUX T1, X, INCX2 LFXDUX T2, X, INCX2 LFXDUX T3, X, INCX2 LFXDUX T4, X, INCX2 LFPDUX A6, X, INCX2 fsmr A1, T1 LFPDUX A7, X, INCX2 fsmr T1, T2 LFPDUX A8, X, INCX2 fsmr T2, T3 LFPDUX A9, X, INCX2 fsmr T3, T4 bdz LL(23) .align 4 LL(22): STFPDUX A1, Y, INCY2 fxmr T5, A6 STFPDUX T1, Y, INCY2 fxmr T6, A7 STFPDUX T2, Y, INCY2 fxmr T7, A8 STFPDUX T3, Y, INCY2 fxmr A1, A9 fsmr T4, T5 LFPDUX A2, X, INCX2 fsmr T5, T6 LFPDUX A3, X, INCX2 fsmr T6, T7 LFPDUX A4, X, INCX2 fsmr T7, A1 LFPDUX A5, X, INCX2 STFPDUX T4, Y, INCY2 fxmr T1, A2 STFPDUX T5, Y, INCY2 fxmr T2, A3 STFPDUX T6, Y, INCY2 fxmr T3, A4 STFPDUX T7, Y, INCY2 fxmr T4, A5 LFPDUX A6, X, INCX2 fsmr A1, T1 LFPDUX A7, X, INCX2 fsmr T1, T2 LFPDUX A8, X, INCX2 fsmr T2, T3 LFPDUX A9, X, INCX2 fsmr T3, T4 bdnz LL(22) .align 4 LL(23): STFPDUX A1, Y, INCY2 fxmr T5, A6 STFPDUX T1, Y, INCY2 fxmr T6, A7 STFPDUX T2, Y, INCY2 fxmr T7, A8 STFPDUX T3, Y, INCY2 fxmr A1, A9 fsmr T4, T5 fsmr T5, T6 fsmr T6, T7 fsmr T7, A1 STFPDUX T4, Y, INCY2 STFPDUX T5, Y, INCY2 STFPDUX T6, Y, INCY2 STFPDUX T7, Y, INCY2 .align 4 LL(25): andi. r0, N, 15 beq LL(999) andi. r0, N, 8 beq LL(26) LFXDUX A2, X, INCX2 LFXDUX A3, X, INCX2 LFXDUX A4, X, INCX2 LFXDUX A5, X, INCX2 fsmr A1, A2 fsmr A2, A3 fsmr A3, A4 fsmr A4, A5 STFPDUX A1, Y, INCY2 STFPDUX A2, Y, INCY2 STFPDUX A3, Y, INCY2 STFPDUX A4, Y, INCY2 fpmr A1, A5 .align 4 LL(26): andi. r0, N, 4 beq LL(27) LFXDUX A2, X, INCX2 LFXDUX A3, X, INCX2 fsmr A1, A2 fsmr A2, A3 STFPDUX A1, Y, INCY2 STFPDUX A2, Y, INCY2 fpmr A1, A3 .align 4 LL(27): andi. r0, N, 2 beq LL(28) LFXDUX A2, X, INCX2 fsmr A1, A2 STFPDUX A1, Y, INCY2 fpmr A1, A2 .align 4 LL(28): andi. r0, N, 1 beq LL(999) STFDUX A1, Y, INCY2 b LL(999) .align 4 LL(30): /* X : unaligned Y : aligned */ andi. r0, Y, 2 * SIZE - 1 bne LL(40) LFDX A1, X, INCX2 add X, X, INCX srawi. r0, N, 4 mtspr CTR, r0 beq- LL(35) LFXDUX T1, X, INCX2 LFXDUX T2, X, INCX2 LFXDUX T3, X, INCX2 LFXDUX T4, X, INCX2 LFPDUX A6, X, INCX2 fsmr A1, T1 LFPDUX A7, X, INCX2 fsmr T1, T2 LFPDUX A8, X, INCX2 fsmr T2, T3 LFPDUX A9, X, INCX2 fsmr T3, T4 bdz LL(33) .align 4 LL(32): fxmr T5, A6 STFPDUX A1, Y, INCY2 fxmr T6, A7 STFPDUX T1, Y, INCY2 fxmr T7, A8 STFPDUX T2, Y, INCY2 fxmr A1, A9 STFPDUX T3, Y, INCY2 fsmr T4, T5 LFPDUX A2, X, INCX2 fsmr T5, T6 LFPDUX A3, X, INCX2 fsmr T6, T7 LFPDUX A4, X, INCX2 fsmr T7, A1 LFPDUX A5, X, INCX2 STFPDUX T4, Y, INCY2 fxmr T1, A2 STFPDUX T5, Y, INCY2 fxmr T2, A3 STFPDUX T6, Y, INCY2 fxmr T3, A4 STFPDUX T7, Y, INCY2 fxmr T4, A5 LFPDUX A6, X, INCX2 fsmr A1, T1 LFPDUX A7, X, INCX2 fsmr T1, T2 LFPDUX A8, X, INCX2 fsmr T2, T3 LFPDUX A9, X, INCX2 fsmr T3, T4 bdnz LL(32) .align 4 LL(33): STFPDUX A1, Y, INCY2 fxmr T5, A6 STFPDUX T1, Y, INCY2 fxmr T6, A7 STFPDUX T2, Y, INCY2 fxmr T7, A8 STFPDUX T3, Y, INCY2 fxmr A1, A9 fsmr T4, T5 fsmr T5, T6 fsmr T6, T7 fsmr T7, A1 STFPDUX T4, Y, INCY2 STFPDUX T5, Y, INCY2 STFPDUX T6, Y, INCY2 STFPDUX T7, Y, INCY2 .align 4 LL(35): andi. r0, N, 15 beq LL(999) andi. r0, N, 8 beq LL(36) LFXDUX A2, X, INCX2 LFXDUX A3, X, INCX2 LFXDUX A4, X, INCX2 LFXDUX A5, X, INCX2 fsmr A1, A2 fsmr A2, A3 fsmr A3, A4 fsmr A4, A5 STFPDUX A1, Y, INCY2 STFPDUX A2, Y, INCY2 STFPDUX A3, Y, INCY2 STFPDUX A4, Y, INCY2 fpmr A1, A5 .align 4 LL(36): andi. r0, N, 4 beq LL(37) LFXDUX A2, X, INCX2 LFXDUX A3, X, INCX2 fsmr A1, A2 fsmr A2, A3 STFPDUX A1, Y, INCY2 STFPDUX A2, Y, INCY2 fpmr A1, A3 .align 4 LL(37): andi. r0, N, 2 beq LL(38) LFXDUX A2, X, INCX2 fsmr A1, A2 STFPDUX A1, Y, INCY2 fpmr A1, A2 .align 4 LL(38): andi. 
r0, N, 1 beq LL(999) STFDUX A1, Y, INCY2 b LL(999) .align 4 LL(40): /* X : unaligned Y : unaligned */ LFDX A1, X, INCX2 add X, X, INCX addi N, N, -1 cmpwi cr0, N, 0 STFDX A1, Y, INCY2 add Y, Y, INCY ble LL(999) srawi. r0, N, 4 mtspr CTR, r0 beq- LL(45) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 LFPDUX A6, X, INCX2 LFPDUX A7, X, INCX2 LFPDUX A8, X, INCX2 bdz LL(43) .align 4 LL(42): STFPDUX A1, Y, INCY2 LFPDUX A1, X, INCX2 STFPDUX A2, Y, INCY2 LFPDUX A2, X, INCX2 STFPDUX A3, Y, INCY2 LFPDUX A3, X, INCX2 STFPDUX A4, Y, INCY2 LFPDUX A4, X, INCX2 STFPDUX A5, Y, INCY2 LFPDUX A5, X, INCX2 STFPDUX A6, Y, INCY2 LFPDUX A6, X, INCX2 STFPDUX A7, Y, INCY2 LFPDUX A7, X, INCX2 STFPDUX A8, Y, INCY2 LFPDUX A8, X, INCX2 bdnz LL(42) .align 4 LL(43): STFPDUX A1, Y, INCY2 STFPDUX A2, Y, INCY2 STFPDUX A3, Y, INCY2 STFPDUX A4, Y, INCY2 STFPDUX A5, Y, INCY2 STFPDUX A6, Y, INCY2 STFPDUX A7, Y, INCY2 STFPDUX A8, Y, INCY2 .align 4 LL(45): andi. r0, N, 15 beq LL(999) andi. r0, N, 8 beq LL(46) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 STFPDUX A1, Y, INCY2 STFPDUX A2, Y, INCY2 STFPDUX A3, Y, INCY2 STFPDUX A4, Y, INCY2 .align 4 LL(46): andi. r0, N, 4 beq LL(47) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 STFPDUX A1, Y, INCY2 STFPDUX A2, Y, INCY2 .align 4 LL(47): andi. r0, N, 2 beq LL(48) LFPDUX A1, X, INCX2 STFPDUX A1, Y, INCY2 .align 4 LL(48): andi. r0, N, 1 beq LL(999) LFDUX A1, X, INCX2 STFDUX A1, Y, INCY2 .align 4 b LL(999) .align 4 # INCX != 1, INCY == 1 LL(50): andi. r0, Y, 2 * SIZE - 1 beq LL(51) LFD A1, 0 * SIZE(X) add X, X, INCX STFD A1, 0 * SIZE(Y) add Y, Y, INCY addi N, N, -1 cmpwi cr0, N, 0 ble LL(999) .align 4 LL(51): sub X, X, INCX sub Y, Y, INCY2 srawi. r0, N, 4 mtspr CTR, r0 beq- LL(55) .align 4 LL(52): LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFDUX A5, X, INCX LFDUX A6, X, INCX LFDUX A7, X, INCX LFDUX A8, X, INCX LFDUX A9, X, INCX LFDUX T1, X, INCX LFDUX T2, X, INCX LFDUX T3, X, INCX fsmfp A1, A2 LFDUX T4, X, INCX fsmfp A3, A4 LFDUX T5, X, INCX fsmfp A5, A6 LFDUX T6, X, INCX fsmfp A7, A8 LFDUX T7, X, INCX fsmfp A9, T1 STFPDUX A1, Y, INCY2 fsmfp T2, T3 STFPDUX A3, Y, INCY2 fsmfp T4, T5 STFPDUX A5, Y, INCY2 fsmfp T6, T7 STFPDUX A7, Y, INCY2 STFPDUX A9, Y, INCY2 STFPDUX T2, Y, INCY2 STFPDUX T4, Y, INCY2 STFPDUX T6, Y, INCY2 bdnz LL(52) .align 4 LL(55): andi. r0, N, 15 beq LL(999) andi. r0, N, 8 beq LL(56) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFDUX A5, X, INCX LFDUX A6, X, INCX LFDUX A7, X, INCX LFDUX A8, X, INCX fsmfp A1, A2 fsmfp A3, A4 fsmfp A5, A6 fsmfp A7, A8 STFPDUX A1, Y, INCY2 STFPDUX A3, Y, INCY2 STFPDUX A5, Y, INCY2 STFPDUX A7, Y, INCY2 .align 4 LL(56): andi. r0, N, 4 beq LL(57) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX fsmfp A1, A2 fsmfp A3, A4 STFPDUX A1, Y, INCY2 STFPDUX A3, Y, INCY2 .align 4 LL(57): andi. r0, N, 2 beq LL(58) LFDUX A1, X, INCX LFDUX A2, X, INCX fsmfp A1, A2 STFPDUX A1, Y, INCY2 .align 4 LL(58): andi. r0, N, 1 beq LL(999) LFDUX A1, X, INCX STFDUX A1, Y, INCY2 b LL(999) .align 4 # INCX == 1, INCY != 1 LL(60): cmpwi cr0, INCY, SIZE bne LL(100) andi. r0, X, 2 * SIZE - 1 beq LL(61) LFD A1, 0 * SIZE(X) add X, X, INCX STFD A1, 0 * SIZE(Y) add Y, Y, INCY addi N, N, -1 cmpwi cr0, N, 0 ble LL(999) .align 4 LL(61): sub X, X, INCX2 sub Y, Y, INCY srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(65) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 LFPDUX A6, X, INCX2 LFPDUX A7, X, INCX2 LFPDUX A8, X, INCX2 bdz LL(63) .align 4 LL(62): STFDUX A1, Y, INCY STFSDUX A1, Y, INCY LFPDUX A1, X, INCX2 STFDUX A2, Y, INCY STFSDUX A2, Y, INCY LFPDUX A2, X, INCX2 STFDUX A3, Y, INCY STFSDUX A3, Y, INCY LFPDUX A3, X, INCX2 STFDUX A4, Y, INCY STFSDUX A4, Y, INCY LFPDUX A4, X, INCX2 STFDUX A5, Y, INCY STFSDUX A5, Y, INCY LFPDUX A5, X, INCX2 STFDUX A6, Y, INCY STFSDUX A6, Y, INCY LFPDUX A6, X, INCX2 STFDUX A7, Y, INCY STFSDUX A7, Y, INCY LFPDUX A7, X, INCX2 STFDUX A8, Y, INCY STFSDUX A8, Y, INCY LFPDUX A8, X, INCX2 bdnz LL(62) .align 4 LL(63): STFDUX A1, Y, INCY STFSDUX A1, Y, INCY STFDUX A2, Y, INCY STFSDUX A2, Y, INCY STFDUX A3, Y, INCY STFSDUX A3, Y, INCY STFDUX A4, Y, INCY STFSDUX A4, Y, INCY STFDUX A5, Y, INCY STFSDUX A5, Y, INCY STFDUX A6, Y, INCY STFSDUX A6, Y, INCY STFDUX A7, Y, INCY STFSDUX A7, Y, INCY STFDUX A8, Y, INCY STFSDUX A8, Y, INCY .align 4 LL(65): andi. r0, N, 15 beq LL(999) andi. r0, N, 8 beq LL(66) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 STFDUX A1, Y, INCY STFSDUX A1, Y, INCY STFDUX A2, Y, INCY STFSDUX A2, Y, INCY STFDUX A3, Y, INCY STFSDUX A3, Y, INCY STFDUX A4, Y, INCY STFSDUX A4, Y, INCY .align 4 LL(66): andi. r0, N, 4 beq LL(67) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 STFDUX A1, Y, INCY STFSDUX A1, Y, INCY STFDUX A2, Y, INCY STFSDUX A2, Y, INCY .align 4 LL(67): andi. r0, N, 2 beq LL(68) LFPDUX A1, X, INCX2 STFDUX A1, Y, INCY STFSDUX A1, Y, INCY .align 4 LL(68): andi. r0, N, 1 beq LL(999) LFDUX A1, X, INCX2 STFDUX A1, Y, INCY b LL(999) .align 4 LL(100): sub X, X, INCX sub Y, Y, INCY srawi. r0, N, 3 mtspr CTR, r0 beq- LL(115) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFDUX A5, X, INCX LFDUX A6, X, INCX LFDUX A7, X, INCX LFDUX A8, X, INCX bdz LL(113) .align 4 LL(112): STFDUX A1, Y, INCY LFDUX A1, X, INCX STFDUX A2, Y, INCY LFDUX A2, X, INCX STFDUX A3, Y, INCY LFDUX A3, X, INCX STFDUX A4, Y, INCY LFDUX A4, X, INCX STFDUX A5, Y, INCY LFDUX A5, X, INCX STFDUX A6, Y, INCY LFDUX A6, X, INCX STFDUX A7, Y, INCY LFDUX A7, X, INCX STFDUX A8, Y, INCY LFDUX A8, X, INCX bdnz LL(112) .align 4 LL(113): STFDUX A1, Y, INCY STFDUX A2, Y, INCY STFDUX A3, Y, INCY STFDUX A4, Y, INCY STFDUX A5, Y, INCY STFDUX A6, Y, INCY STFDUX A7, Y, INCY STFDUX A8, Y, INCY .align 4 LL(115): andi. r0, N, 7 beq LL(999) andi. r0, N, 4 beq LL(117) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX STFDUX A1, Y, INCY STFDUX A2, Y, INCY STFDUX A3, Y, INCY STFDUX A4, Y, INCY .align 4 LL(117): andi. r0, N, 2 beq LL(118) LFDUX A1, X, INCX LFDUX A2, X, INCX STFDUX A1, Y, INCY STFDUX A2, Y, INCY .align 4 LL(118): andi. r0, N, 1 beq LL(999) LFDUX A1, X, INCX STFDUX A1, Y, INCY .align 4 LL(999): li r10, 16 addi SP, SP, -16 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/cswap.c000066400000000000000000000076151313527062700170520ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/27 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #if defined(POWER8) #include "cswap_microk_power8.c" #endif #ifndef HAVE_KERNEL_32 static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { BLASLONG i=0; FLOAT f0, f1, f2, f3, f4, f5, f6, f7; FLOAT g0, g1, g2, g3, g4, g5, g6, g7; FLOAT *x1=x; FLOAT *y1=y; while ( i 0 ) { cswap_kernel_32(n1, x, y); i=n1; ix = 2* n1; iy = 2* n1; } while(i < n) { temp[0] = x[ix] ; temp[1] = x[ix+1] ; x[ix] = y[iy] ; x[ix+1] = y[iy+1] ; y[iy] = temp[0] ; y[iy+1] = temp[1] ; ix += 2 ; iy += 2 ; i++ ; } } else { inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; while(i < n) { temp[0] = x[ix] ; temp[1] = x[ix+1] ; x[ix] = y[iy] ; x[ix+1] = y[iy+1] ; y[iy] = temp[0] ; y[iy+1] = temp[1] ; ix += inc_x2 ; iy += inc_y2 ; i++ ; } } return(0); } OpenBLAS-0.2.20/kernel/power/cswap_microk_power8.c000066400000000000000000000121011313527062700217040ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/27 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_32 1 static void cswap_kernel_32 (long n, float *x, float *y) { __asm__ ( ".p2align 5 \n" "1: \n\t" "lxvd2x 32, 0, %4 \n\t" "lxvd2x 33, %5, %4 \n\t" "lxvd2x 34, %6, %4 \n\t" "lxvd2x 35, %7, %4 \n\t" "lxvd2x 36, %8, %4 \n\t" "lxvd2x 37, %9, %4 \n\t" "lxvd2x 38, %10, %4 \n\t" "lxvd2x 39, %11, %4 \n\t" "addi %4, %4, 128 \n\t" "lxvd2x 40, 0, %4 \n\t" "lxvd2x 41, %5, %4 \n\t" "lxvd2x 42, %6, %4 \n\t" "lxvd2x 43, %7, %4 \n\t" "lxvd2x 44, %8, %4 \n\t" "lxvd2x 45, %9, %4 \n\t" "lxvd2x 46, %10, %4 \n\t" "lxvd2x 47, %11, %4 \n\t" "addi %4, %4, -128 \n\t" "lxvd2x 48, 0, %3 \n\t" "lxvd2x 49, %5, %3 \n\t" "lxvd2x 50, %6, %3 \n\t" "lxvd2x 51, %7, %3 \n\t" "lxvd2x 0, %8, %3 \n\t" "lxvd2x 1, %9, %3 \n\t" "lxvd2x 2, %10, %3 \n\t" "lxvd2x 3, %11, %3 \n\t" "addi %3, %3, 128 \n\t" "lxvd2x 4, 0, %3 \n\t" "lxvd2x 5, %5, %3 \n\t" "lxvd2x 6, %6, %3 \n\t" "lxvd2x 7, %7, %3 \n\t" "lxvd2x 8, %8, %3 \n\t" "lxvd2x 9, %9, %3 \n\t" "lxvd2x 10, %10, %3 \n\t" "lxvd2x 11, %11, %3 \n\t" "addi %3, %3, -128 \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" "stxvd2x 34, %6, %3 \n\t" "stxvd2x 35, %7, %3 \n\t" "stxvd2x 36, %8, %3 \n\t" "stxvd2x 37, %9, %3 \n\t" "stxvd2x 38, %10, %3 \n\t" "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" "stxvd2x 40, 0, %3 \n\t" "stxvd2x 41, %5, %3 \n\t" "stxvd2x 42, %6, %3 \n\t" "stxvd2x 43, %7, %3 \n\t" "stxvd2x 44, %8, %3 \n\t" "stxvd2x 45, %9, %3 \n\t" "stxvd2x 46, %10, %3 \n\t" "stxvd2x 47, %11, %3 \n\t" "addi %3, %3, 128 \n\t" "stxvd2x 48, 0, %4 \n\t" "stxvd2x 49, %5, %4 \n\t" "stxvd2x 50, %6, %4 \n\t" "stxvd2x 51, %7, %4 \n\t" "stxvd2x 0, %8, %4 \n\t" "stxvd2x 1, %9, %4 \n\t" "stxvd2x 2, %10, %4 \n\t" "stxvd2x 3, %11, %4 \n\t" "addi %4, %4, 128 \n\t" "stxvd2x 4, 0, %4 \n\t" "stxvd2x 5, %5, %4 \n\t" "stxvd2x 6, %6, %4 \n\t" "stxvd2x 7, %7, %4 \n\t" "stxvd2x 8, %8, %4 \n\t" "stxvd2x 9, %9, %4 \n\t" "stxvd2x 10, %10, %4 \n\t" "stxvd2x 11, %11, %4 \n\t" "addi %4, %4, 128 \n\t" "addic. 
%2, %2, -32 \n\t" "bgt 1b \n" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : "+m" (*x), "+m" (*y), "+r" (n), // 2 "+b" (x), // 3 "+b" (y) // 4 : "b" (16), // 5 "b" (32), // 6 "b" (48), // 7 "b" (64), // 8 "b" (80), // 9 "b" (96), // 10 "b" (112) // 11 : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48","vs49","vs50","vs51","vs0","vs1","vs2","vs3", "vs4","vs5","vs6","vs7","vs8","vs9","vs10","vs11" ); } OpenBLAS-0.2.20/kernel/power/ctrmm_kernel_8x4_power8.S000066400000000000000000000235121313527062700224000ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/04/04 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "def_vsx.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 400 #define ALPHA_R_SP 304(SP) #define ALPHA_I_SP 312(SP) #else #define STACKSIZE 256 #define ALPHA_R_SP 224(SP) #define ALPHA_I_SP 232(SP) #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #define OFFSET r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #define o0 0 #define alpha_dr vs28 #define alpha_di vs29 #define alpha_sr vs30 #define alpha_si vs31 #define o12 r12 #define KKK r13 #define K1 r14 #define L r15 #define o16 r16 #define NOTUSED r17 #define T2 r19 #define KK r20 #define o8 r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define CO r26 #define o4 r27 #define o32 r28 #define o48 r29 #define PRE r30 #define T1 r31 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) std r14, 280(SP) std r13, 288(SP) std r12, 296(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) stw r18, 196(SP) stw r17, 200(SP) stw r16, 204(SP) stw r15, 208(SP) stw r14, 212(SP) stw r13, 216(SP) #endif stfs f1, ALPHA_R_SP stfs f2, ALPHA_I_SP // stw r0, FZERO #ifdef linux #ifdef __64BIT__ ld 
LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef TRMMKERNEL #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif #endif #include "ctrmm_macros_8x4_power8.S" cmpwi cr0, M, 0 ble L999_H1 cmpwi cr0, N, 0 ble L999_H1 cmpwi cr0, K, 0 ble L999_H1 slwi LDC, LDC, ZBASE_SHIFT li PRE, 384 li o4 , 4 li o8 , 8 li o12 , 12 li o16 , 16 li o32 , 32 li o48 , 48 #ifdef __64BIT__ addi T1, SP, 304 #else addi T1, SP, 224 #endif lxsspx alpha_dr, 0, T1 lxsspx alpha_di, o8, T1 addi T1, SP, 360 li T2, 0 stw T2, 0(T1) stw T2, 4(T1) stw T2, 8(T1) stxsspx alpha_dr, o12, T1 lxvw4x alpha_sr, o0 , T1 addi T1, T1, 16 stw T2, 0(T1) stw T2, 4(T1) stw T2, 8(T1) stxsspx alpha_di, o12, T1 lxvw4x alpha_si, o0 , T1 .align 5 #include "ctrmm_logic_8x4_power8.S" L999: addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) ld r13, 288(SP) ld r12, 296(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) lwz r18, 196(SP) lwz r17, 200(SP) lwz r16, 204(SP) lwz r15, 208(SP) lwz r14, 212(SP) lwz r13, 216(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/ctrmm_logic_8x4_power8.S000066400000000000000000000751411313527062700222220ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/04/04 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ srawi. J, N, 2 ble CTRMM_L4_END CTRMM_L4_BEGIN: mr CO, C mr AO, A slwi T1, LDC , 2 add C, C, T1 #if defined(LEFT) mr KK, OFFSET // OFFSET -> KK #endif srawi. I, M, 3 ble CTRMM_L4x8_END CTRMM_L4x8_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 5 // Number of values in B shifted slwi T2, KK, 6 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble CTRMM_L4x8_SUB0 cmpwi cr0, L, 1 ble CTRMM_L4x8_SUB4 CTRMM_L4x8_LOOP_START: dcbt AO, PRE dcbt BO, PRE LOAD4x8_1 KERNEL4x8_I1 dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 dcbt AO, PRE dcbt BO, PRE KERNEL4x8_2 addic. L, L, -2 ble CTRMM_L4x8_LOOP_END .align 5 CTRMM_L4x8_LOOP: KERNEL4x8_1 dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 dcbt AO, PRE dcbt BO, PRE KERNEL4x8_2 addic. L, L, -1 bgt CTRMM_L4x8_LOOP CTRMM_L4x8_LOOP_END: KERNEL4x8_1 dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_E2 b CTRMM_L4x8_SUB1 CTRMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 b CTRMM_L4x8_SUB1 CTRMM_L4x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x8_SUBI1 addic. L, L, -1 ble CTRMM_L4x8_SAVE b CTRMM_L4x8_SUB2 CTRMM_L4x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble CTRMM_L4x8_SAVE CTRMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. L, L, -1 bgt CTRMM_L4x8_SUB2 CTRMM_L4x8_SAVE: SAVE4x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 8 // KK += Number of values in A #endif addic. 
I, I, -1 bgt CTRMM_L4x8_BEGIN CTRMM_L4x8_END: CTRMM_L4x4_BEGIN: andi. T2, M, 7 ble CTRMM_L4x1_END andi. T1, M, 4 ble CTRMM_L4x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 5 // Number of values in B shifted slwi T2, KK, 5 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble CTRMM_L4x4_SUB0 cmpwi cr0, L, 1 ble CTRMM_L4x4_SUB4 CTRMM_L4x4_LOOP_START: LOAD4x4_1 KERNEL4x4_I1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 addic. L, L, -2 ble CTRMM_L4x4_LOOP_END .align 5 CTRMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 addic. L, L, -1 bgt CTRMM_L4x4_LOOP CTRMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_E2 b CTRMM_L4x4_SUB1 CTRMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 b CTRMM_L4x4_SUB1 CTRMM_L4x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x4_SUBI1 addic. L, L, -1 ble CTRMM_L4x4_SAVE b CTRMM_L4x4_SUB2 CTRMM_L4x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble CTRMM_L4x4_SAVE CTRMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. L, L, -1 bgt CTRMM_L4x4_SUB2 CTRMM_L4x4_SAVE: SAVE4x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 4 // KK += Number of values in A #endif CTRMM_L4x4_END: CTRMM_L4x2_BEGIN: andi. T1, M, 2 ble CTRMM_L4x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 5 // Number of values in B shifted slwi T2, KK, 4 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble CTRMM_L4x2_SUB0 cmpwi cr0, L, 1 ble CTRMM_L4x2_SUB4 CTRMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 addic. L, L, -2 ble CTRMM_L4x2_LOOP_END .align 5 CTRMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 addic. 
L, L, -1 bgt CTRMM_L4x2_LOOP CTRMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_E2 b CTRMM_L4x2_SUB1 CTRMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 b CTRMM_L4x2_SUB1 CTRMM_L4x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x2_SUBI1 addic. L, L, -1 ble CTRMM_L4x2_SAVE b CTRMM_L4x2_SUB2 CTRMM_L4x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble CTRMM_L4x2_SAVE CTRMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. L, L, -1 bgt CTRMM_L4x2_SUB2 CTRMM_L4x2_SAVE: SAVE4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 2 // KK += Number of values in A #endif CTRMM_L4x2_END: CTRMM_L4x1_BEGIN: andi. T1, M, 1 ble CTRMM_L4x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 5 // Number of values in B shifted slwi T2, KK, 3 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble CTRMM_L4x1_SUB0 cmpwi cr0, L, 1 ble CTRMM_L4x1_SUB4 CTRMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 addic. L, L, -2 ble CTRMM_L4x1_LOOP_END .align 5 CTRMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 addic. L, L, -1 bgt CTRMM_L4x1_LOOP CTRMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_E2 b CTRMM_L4x1_SUB1 CTRMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 b CTRMM_L4x1_SUB1 CTRMM_L4x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x1_SUBI1 addic. L, L, -1 ble CTRMM_L4x1_SAVE b CTRMM_L4x1_SUB2 CTRMM_L4x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble CTRMM_L4x1_SAVE CTRMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. L, L, -1 bgt CTRMM_L4x1_SUB2 CTRMM_L4x1_SAVE: SAVE4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 1 // KK += Number of values in A #endif CTRMM_L4x1_END: slwi T1, K, 5 add B, B, T1 #if !defined(LEFT) addi KK, KK, 4 // KK += Number of values in B #endif addic. J, J, -1 bgt CTRMM_L4_BEGIN andi. T2, N, 3 ble L999_H2 CTRMM_L4_END: b CTRMM_L2_BEGIN L999_H1: b L999_H2 CTRMM_L2_BEGIN: andi. T1, N, 2 ble CTRMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 #if defined(LEFT) mr KK, OFFSET // OFFSET -> KK #endif srawi. 
I, M, 3 ble CTRMM_L2x8_END CTRMM_L2x8_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 4 // Number of values in B shifted slwi T2, KK, 6 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble CTRMM_L2x8_SUB0 cmpwi cr0, L, 1 ble CTRMM_L2x8_SUB4 CTRMM_L2x8_LOOP_START: LOAD2x8_1 KERNEL2x8_I1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 addic. L, L, -2 ble CTRMM_L2x8_LOOP_END .align 5 CTRMM_L2x8_LOOP: KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 addic. L, L, -1 bgt CTRMM_L2x8_LOOP CTRMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_E2 b CTRMM_L2x8_SUB1 CTRMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 b CTRMM_L2x8_SUB1 CTRMM_L2x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x8_SUBI1 addic. L, L, -1 ble CTRMM_L2x8_SAVE b CTRMM_L2x8_SUB2 CTRMM_L2x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble CTRMM_L2x8_SAVE CTRMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 bgt CTRMM_L2x8_SUB2 CTRMM_L2x8_SAVE: SAVE2x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 8 // KK += Number of values in A #endif addic. I, I, -1 bgt CTRMM_L2x8_BEGIN CTRMM_L2x8_END: CTRMM_L2x4_BEGIN: andi. T2, M, 7 ble CTRMM_L2x1_END andi. T1, M, 4 ble CTRMM_L2x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 4 // Number of values in B shifted slwi T2, KK, 5 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble CTRMM_L2x4_SUB0 cmpwi cr0, L, 1 ble CTRMM_L2x4_SUB4 CTRMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 addic. L, L, -2 ble CTRMM_L2x4_LOOP_END .align 5 CTRMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 addic. 
L, L, -1 bgt CTRMM_L2x4_LOOP CTRMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_E2 b CTRMM_L2x4_SUB1 CTRMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 b CTRMM_L2x4_SUB1 CTRMM_L2x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x4_SUBI1 addic. L, L, -1 ble CTRMM_L2x4_SAVE b CTRMM_L2x4_SUB2 CTRMM_L2x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble CTRMM_L2x4_SAVE CTRMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 bgt CTRMM_L2x4_SUB2 CTRMM_L2x4_SAVE: SAVE2x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 4 // KK += Number of values in A #endif CTRMM_L2x4_END: CTRMM_L2x2_BEGIN: andi. T1, M, 2 ble CTRMM_L2x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 4 // Number of values in B shifted slwi T2, KK, 4 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble CTRMM_L2x2_SUB0 cmpwi cr0, L, 1 ble CTRMM_L2x2_SUB4 CTRMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 addic. L, L, -2 ble CTRMM_L2x2_LOOP_END .align 5 CTRMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 addic. L, L, -1 bgt CTRMM_L2x2_LOOP CTRMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_E2 b CTRMM_L2x2_SUB1 CTRMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 b CTRMM_L2x2_SUB1 CTRMM_L2x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x2_SUBI1 addic. L, L, -1 ble CTRMM_L2x2_SAVE b CTRMM_L2x2_SUB2 CTRMM_L2x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble CTRMM_L2x2_SAVE CTRMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 bgt CTRMM_L2x2_SUB2 CTRMM_L2x2_SAVE: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 2 // KK += Number of values in A #endif CTRMM_L2x2_END: CTRMM_L2x1_BEGIN: andi. 
T1, M, 1 ble CTRMM_L2x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 4 // Number of values in B shifted slwi T2, KK, 3 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble CTRMM_L2x1_SUB0 cmpwi cr0, L, 1 ble CTRMM_L2x1_SUB4 CTRMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 addic. L, L, -2 ble CTRMM_L2x1_LOOP_END .align 5 CTRMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 addic. L, L, -1 bgt CTRMM_L2x1_LOOP CTRMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_E2 b CTRMM_L2x1_SUB1 CTRMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 b CTRMM_L2x1_SUB1 CTRMM_L2x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x1_SUBI1 addic. L, L, -1 ble CTRMM_L2x1_SAVE b CTRMM_L2x1_SUB2 CTRMM_L2x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble CTRMM_L2x1_SAVE CTRMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 bgt CTRMM_L2x1_SUB2 CTRMM_L2x1_SAVE: SAVE2x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 1 // KK += Number of values in A #endif CTRMM_L2x1_END: slwi T1, K, 4 add B, B, T1 #if !defined(LEFT) addi KK, KK, 2 // KK += Number of values in B #endif CTRMM_L2_END: b CTRMM_L1_BEGIN L999_H2: b L999 CTRMM_L1_BEGIN: andi. T1, N, 1 ble CTRMM_L1_END mr CO, C mr AO, A #if defined(LEFT) mr KK, OFFSET // OFFSET -> KK #endif srawi. I, M, 3 ble CTRMM_L1x8_END CTRMM_L1x8_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 3 // Number of values in B shifted slwi T2, KK, 6 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble CTRMM_L1x8_SUB0 cmpwi cr0, L, 1 ble CTRMM_L1x8_SUB4 CTRMM_L1x8_LOOP_START: LOAD1x8_1 KERNEL1x8_I1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 addic. L, L, -2 ble CTRMM_L1x8_LOOP_END .align 5 CTRMM_L1x8_LOOP: KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 addic. 
L, L, -1 bgt CTRMM_L1x8_LOOP CTRMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_E2 b CTRMM_L1x8_SUB1 CTRMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 b CTRMM_L1x8_SUB1 CTRMM_L1x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x8_SUBI1 addic. L, L, -1 ble CTRMM_L1x8_SAVE b CTRMM_L1x8_SUB2 CTRMM_L1x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble CTRMM_L1x8_SAVE CTRMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 bgt CTRMM_L1x8_SUB2 CTRMM_L1x8_SAVE: SAVE1x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 8 // KK += Number of values in A #endif addic. I, I, -1 bgt CTRMM_L1x8_BEGIN CTRMM_L1x8_END: CTRMM_L1x4_BEGIN: andi. T2, M, 7 ble CTRMM_L1x1_END andi. T1, M, 4 ble CTRMM_L1x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 3 // Number of values in B shifted slwi T2, KK, 5 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble CTRMM_L1x4_SUB0 cmpwi cr0, L, 1 ble CTRMM_L1x4_SUB4 CTRMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 addic. L, L, -2 ble CTRMM_L1x4_LOOP_END .align 5 CTRMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 addic. L, L, -1 bgt CTRMM_L1x4_LOOP CTRMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_E2 b CTRMM_L1x4_SUB1 CTRMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 b CTRMM_L1x4_SUB1 CTRMM_L1x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x4_SUBI1 addic. L, L, -1 ble CTRMM_L1x4_SAVE b CTRMM_L1x4_SUB2 CTRMM_L1x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble CTRMM_L1x4_SAVE CTRMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 bgt CTRMM_L1x4_SUB2 CTRMM_L1x4_SAVE: SAVE1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 4 // KK += Number of values in A #endif CTRMM_L1x4_END: CTRMM_L1x2_BEGIN: andi. 
T1, M, 2 ble CTRMM_L1x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 3 // Number of values in B shifted slwi T2, KK, 4 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble CTRMM_L1x2_SUB0 cmpwi cr0, L, 1 ble CTRMM_L1x2_SUB4 CTRMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 addic. L, L, -2 ble CTRMM_L1x2_LOOP_END .align 5 CTRMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 addic. L, L, -1 bgt CTRMM_L1x2_LOOP CTRMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_E2 b CTRMM_L1x2_SUB1 CTRMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 b CTRMM_L1x2_SUB1 CTRMM_L1x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x2_SUBI1 addic. L, L, -1 ble CTRMM_L1x2_SAVE b CTRMM_L1x2_SUB2 CTRMM_L1x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble CTRMM_L1x2_SAVE CTRMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 bgt CTRMM_L1x2_SUB2 CTRMM_L1x2_SAVE: SAVE1x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 2 // KK += Number of values in A #endif CTRMM_L1x2_END: CTRMM_L1x1_BEGIN: andi. T1, M, 1 ble CTRMM_L1x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 3 // Number of values in B shifted slwi T2, KK, 3 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble CTRMM_L1x1_SUB0 cmpwi cr0, L, 1 ble CTRMM_L1x1_SUB4 CTRMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 addic. L, L, -2 ble CTRMM_L1x1_LOOP_END .align 5 CTRMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 addic. L, L, -1 bgt CTRMM_L1x1_LOOP CTRMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_E2 b CTRMM_L1x1_SUB1 CTRMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 b CTRMM_L1x1_SUB1 CTRMM_L1x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x1_SUBI1 addic. 
L, L, -1 ble CTRMM_L1x1_SAVE b CTRMM_L1x1_SUB2 CTRMM_L1x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble CTRMM_L1x1_SAVE CTRMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 bgt CTRMM_L1x1_SUB2 CTRMM_L1x1_SAVE: SAVE1x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 1 // KK += Number of values in A #endif CTRMM_L1x1_END: #if !defined(LEFT) addi KK, KK, 1 // KK += Number of values in B #endif CTRMM_L1_END: OpenBLAS-0.2.20/kernel/power/ctrmm_macros_8x4_power8.S000066400000000000000000005315401313527062700224110ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
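Reader's note on the macros that follow (a summary inferred from the code below, not an upstream comment): the XSFADD_* and XVFADD_* definitions pick either an add or a subtract for each partial product, so the same kernel body covers every conjugation variant of the complex product a*b used by CTRMM:

   NN/NT/TN/TT : real = a_r*b_r - a_i*b_i   imag =  a_r*b_i + a_i*b_r    (a*b)
   CN/CT/RN/RT : real = a_r*b_r + a_i*b_i   imag =  a_r*b_i - a_i*b_r    (conj(a)*b)
   NC/TC/NR/TR : real = a_r*b_r + a_i*b_i   imag =  a_i*b_r - a_r*b_i    (a*conj(b))
   CC/CR/RC/RR : real = a_r*b_r - a_i*b_i   imag = -a_r*b_i - a_i*b_r    (conj(a)*conj(b))

The XS* names expand to scalar double-precision adds/subtracts and the XV* names to vector single-precision ones.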
*****************************************************************************/ /************************************************************************************** * 2016/04/04 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define XSFADD_R1 xsadddp #define XSFADD_R2 xssubdp #define XSFADD_I1 xsadddp #define XSFADD_I2 xsadddp #define XVFADD_R1 xvaddsp #define XVFADD_R2 xvsubsp #define XVFADD_I1 xvaddsp #define XVFADD_I2 xvaddsp #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) #define XSFADD_R1 xsadddp #define XSFADD_R2 xsadddp #define XSFADD_I1 xssubdp #define XSFADD_I2 xsadddp #define XVFADD_R1 xvaddsp #define XVFADD_R2 xvaddsp #define XVFADD_I1 xvsubsp #define XVFADD_I2 xvaddsp #elif defined(NC) || defined(TC) || defined(NR) || defined(TR) #define XSFADD_R1 xsadddp #define XSFADD_R2 xsadddp #define XSFADD_I1 xsadddp #define XSFADD_I2 xssubdp #define XVFADD_R1 xvaddsp #define XVFADD_R2 xvaddsp #define XVFADD_I1 xvaddsp #define XVFADD_I2 xvsubsp #else // CC || CR || RC || RR #define XSFADD_R1 xsadddp #define XSFADD_R2 xssubdp #define XSFADD_I1 xssubdp #define XSFADD_I2 xssubdp #define XVFADD_R1 xvaddsp #define XVFADD_R2 xvsubsp #define XVFADD_I1 xvsubsp #define XVFADD_I2 xvsubsp #endif /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ .macro LOAD4x8_1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 lxvw4x vs25, o16, BO // load b2, b3 xxspltw vs12, vs25, 0 xxspltw vs13, vs25, 1 xxspltw vs14, vs25, 2 xxspltw vs15, vs25, 3 addi BO, BO, 32 .endm .macro KERNEL4x8_I1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 lxvw4x vs6, o32, AO // load a4, a5 lxvw4x vs7, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs16, vs24, 0 xxspltw vs17, vs24, 1 xxspltw vs18, vs24, 2 xxspltw vs19, vs24, 3 lxvw4x vs25, o16, BO // load b2, b3 xxspltw vs20, vs25, 0 xxspltw vs21, vs25, 1 xxspltw vs22, vs25, 2 xxspltw vs23, vs25, 3 addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, 
a1_i*b1_r xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmulsp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmulsp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmulsp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x8_1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 lxvw4x vs6, o32, AO // load a4, a5 lxvw4x vs7, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs16, vs24, 0 xxspltw vs17, vs24, 1 xxspltw vs18, vs24, 2 xxspltw vs19, vs24, 3 lxvw4x vs25, o16, BO // load b2, b3 xxspltw vs20, vs25, 0 xxspltw vs21, vs25, 1 xxspltw vs22, vs25, 2 xxspltw vs23, vs25, 3 addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, 
a1_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x8_2 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 lxvw4x vs25, o16, BO // load b2, b3 xxspltw vs12, vs25, 0 xxspltw vs13, vs25, 1 xxspltw vs14, vs25, 2 xxspltw vs15, vs25, 3 addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs48, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs49, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs50, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs51, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs52, vs6, vs20 // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs53, vs6, vs21 // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs56, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x8_E2 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs6, vs17 // 
a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs48, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs49, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs50, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs51, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs52, vs6, vs20 // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs53, vs6, vs21 // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs56, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x8_SUBI1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 lxvw4x vs25, o16, BO // load b2, b3 xxspltw vs12, vs25, 0 xxspltw vs13, vs25, 1 xxspltw vs14, vs25, 2 xxspltw vs15, vs25, 3 addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp 
vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmulsp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmulsp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmulsp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x8_SUB1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 lxvw4x vs25, o16, BO // load b2, b3 xxspltw vs12, vs25, 0 xxspltw vs13, vs25, 1 xxspltw vs14, vs25, 2 xxspltw vs15, vs25, 3 addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, 
a1_r*b3_r, a1_i*b3_r xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro SAVE4x8 mr T1, CO // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs32, 0 xxspltw vs9, vs32, 1 xxspltw vs10, vs32, 2 xxspltw vs11, vs32, 3 xxspltw vs12, vs33, 0 xxspltw vs13, vs33, 1 xxspltw vs14, vs33, 2 xxspltw vs15, vs33, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs34, 0 xxspltw vs9, vs34, 1 xxspltw vs10, vs34, 2 xxspltw vs11, vs34, 3 xxspltw vs12, vs35, 0 xxspltw vs13, vs35, 1 xxspltw vs14, vs35, 2 xxspltw vs15, vs35, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, 
r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=4 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs36, 0 xxspltw vs9, vs36, 1 xxspltw vs10, vs36, 2 xxspltw vs11, vs36, 3 xxspltw vs12, vs37, 0 xxspltw vs13, vs37, 1 xxspltw vs14, vs37, 2 xxspltw vs15, vs37, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=6 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs38, 0 xxspltw vs9, vs38, 1 xxspltw vs10, vs38, 2 xxspltw vs11, vs38, 3 xxspltw vs12, vs39, 0 xxspltw vs13, vs39, 1 xxspltw vs14, vs39, 2 xxspltw vs15, vs39, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=1 mr T2, T1 // N=1 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 
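// Reader's note (not upstream): the save sequence below is the same one
// repeated for every M/N tile of SAVE4x8.  For each pair of complex
// results the macro zeroes four work registers, loads the matching pair
// of C elements unless this is the TRMM kernel (TRMM overwrites C, so
// vs0 starts from zero instead), reduces the vs32-vs63 partial products
// with the conjugation-aware XVFADD_* macros, and applies complex alpha
// scaling before the store:
//
//   c_r += r_r*alpha_r - r_i*alpha_i
//   c_i += r_r*alpha_i + r_i*alpha_r
//
// A scalar sketch of one tile, as a reading aid only (reduce(), trmm and
// c[idx] are hypothetical names, not symbols from this file):
//
//   r      = reduce(acc_pair);        // XVFADD_R1/R2, XVFADD_I1/I2
//   c0     = trmm ? 0 : c[idx];       // #ifndef TRMMKERNEL load of C
//   c[idx] = c0 + alpha * r;          // xvmulsp/xvsubsp/xvaddsp + stxvw4x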
#ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs40, 0 xxspltw vs9, vs40, 1 xxspltw vs10, vs40, 2 xxspltw vs11, vs40, 3 xxspltw vs12, vs41, 0 xxspltw vs13, vs41, 1 xxspltw vs14, vs41, 2 xxspltw vs15, vs41, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=1 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs42, 0 xxspltw vs9, vs42, 1 xxspltw vs10, vs42, 2 xxspltw vs11, vs42, 3 xxspltw vs12, vs43, 0 xxspltw vs13, vs43, 1 xxspltw vs14, vs43, 2 xxspltw vs15, vs43, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=1 M=4 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs44, 0 xxspltw vs9, vs44, 1 xxspltw vs10, vs44, 2 xxspltw vs11, vs44, 3 xxspltw vs12, vs45, 0 xxspltw vs13, 
vs45, 1 xxspltw vs14, vs45, 2 xxspltw vs15, vs45, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=1 M=6 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs46, 0 xxspltw vs9, vs46, 1 xxspltw vs10, vs46, 2 xxspltw vs11, vs46, 3 xxspltw vs12, vs47, 0 xxspltw vs13, vs47, 1 xxspltw vs14, vs47, 2 xxspltw vs15, vs47, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=2 mr T2, T1 // N=2 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs48, 0 xxspltw vs9, vs48, 1 xxspltw vs10, vs48, 2 xxspltw vs11, vs48, 3 xxspltw vs12, vs49, 0 xxspltw vs13, vs49, 1 xxspltw vs14, vs49, 2 xxspltw vs15, vs49, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r 
* b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=2 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs50, 0 xxspltw vs9, vs50, 1 xxspltw vs10, vs50, 2 xxspltw vs11, vs50, 3 xxspltw vs12, vs51, 0 xxspltw vs13, vs51, 1 xxspltw vs14, vs51, 2 xxspltw vs15, vs51, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=2 M=4 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs52, 0 xxspltw vs9, vs52, 1 xxspltw vs10, vs52, 2 xxspltw vs11, vs52, 3 xxspltw vs12, vs53, 0 xxspltw vs13, vs53, 1 xxspltw vs14, vs53, 2 xxspltw vs15, vs53, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 
// add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=2 M=6 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs54, 0 xxspltw vs9, vs54, 1 xxspltw vs10, vs54, 2 xxspltw vs11, vs54, 3 xxspltw vs12, vs55, 0 xxspltw vs13, vs55, 1 xxspltw vs14, vs55, 2 xxspltw vs15, vs55, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=3 mr T2, T1 // N=3 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs56, 0 xxspltw vs9, vs56, 1 xxspltw vs10, vs56, 2 xxspltw vs11, vs56, 3 xxspltw vs12, vs57, 0 xxspltw vs13, vs57, 1 xxspltw vs14, vs57, 2 xxspltw vs15, vs57, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, 
alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=3 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs58, 0 xxspltw vs9, vs58, 1 xxspltw vs10, vs58, 2 xxspltw vs11, vs58, 3 xxspltw vs12, vs59, 0 xxspltw vs13, vs59, 1 xxspltw vs14, vs59, 2 xxspltw vs15, vs59, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=3 M=4 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs60, 0 xxspltw vs9, vs60, 1 xxspltw vs10, vs60, 2 xxspltw vs11, vs60, 3 xxspltw vs12, vs61, 0 xxspltw vs13, vs61, 1 xxspltw vs14, vs61, 2 xxspltw vs15, vs61, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, 
vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=3 M=6 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs62, 0 xxspltw vs9, vs62, 1 xxspltw vs10, vs62, 2 xxspltw vs11, vs62, 3 xxspltw vs12, vs63, 0 xxspltw vs13, vs63, 1 xxspltw vs14, vs63, 2 xxspltw vs15, vs63, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC addi CO, CO, 64 .endm /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ .macro LOAD4x4_1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 lxvw4x vs25, o16, BO // load b2, b3 xxspltw vs12, vs25, 0 xxspltw vs13, vs25, 1 xxspltw vs14, vs25, 2 xxspltw vs15, vs25, 3 addi BO, BO, 32 .endm .macro KERNEL4x4_I1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs16, vs24, 0 xxspltw vs17, vs24, 1 xxspltw vs18, vs24, 2 xxspltw vs19, vs24, 3 lxvw4x vs25, o16, BO // load b2, b3 xxspltw vs20, vs25, 0 xxspltw vs21, vs25, 1 xxspltw vs22, vs25, 2 xxspltw vs23, vs25, 3 addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i 
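// Register layout used by the 4x4 kernels: vs0 = {a0_r, a0_i, a1_r, a1_i} and
// vs1 = {a2_r, a2_i, a3_r, a3_i}, while each b?_r/b?_i word is splatted across
// vs8-vs15. Every A vector therefore needs two accumulators per b value (one
// real-weighted, one imaginary-weighted), giving vs32-vs47; the real and
// imaginary contributions are separated and combined later in SAVE4x4.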
xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x4_1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs16, vs24, 0 xxspltw vs17, vs24, 1 xxspltw vs18, vs24, 2 xxspltw vs19, vs24, 3 lxvw4x vs25, o16, BO // load b2, b3 xxspltw vs20, vs25, 0 xxspltw vs21, vs25, 1 xxspltw vs22, vs25, 2 xxspltw vs23, vs25, 3 addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x4_2 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 lxvw4x vs25, o16, BO // load b2, b3 xxspltw vs12, vs25, 0 xxspltw vs13, vs25, 1 xxspltw vs14, vs25, 2 xxspltw vs15, vs25, 3 addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs43, vs5, vs21 // 
a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x4_E2 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs43, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x4_SUBI1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 lxvw4x vs25, o16, BO // load b2, b3 xxspltw vs12, vs25, 0 xxspltw vs13, vs25, 1 xxspltw vs14, vs25, 2 xxspltw vs15, vs25, 3 addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x4_SUB1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 lxvw4x vs25, o16, BO // load b2, b3 xxspltw vs12, vs25, 0 xxspltw vs13, vs25, 1 xxspltw vs14, vs25, 2 xxspltw vs15, vs25, 3 addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, 
a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro SAVE4x4 mr T1, CO // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs32, 0 xxspltw vs9, vs32, 1 xxspltw vs10, vs32, 2 xxspltw vs11, vs32, 3 xxspltw vs12, vs33, 0 xxspltw vs13, vs33, 1 xxspltw vs14, vs33, 2 xxspltw vs15, vs33, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs34, 0 xxspltw vs9, vs34, 1 xxspltw vs10, vs34, 2 xxspltw vs11, vs34, 3 xxspltw vs12, vs35, 0 xxspltw vs13, vs35, 1 xxspltw vs14, vs35, 2 xxspltw vs15, vs35, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr 
// r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=1 mr T2, T1 // N=1 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs36, 0 xxspltw vs9, vs36, 1 xxspltw vs10, vs36, 2 xxspltw vs11, vs36, 3 xxspltw vs12, vs37, 0 xxspltw vs13, vs37, 1 xxspltw vs14, vs37, 2 xxspltw vs15, vs37, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=1 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs38, 0 xxspltw vs9, vs38, 1 xxspltw vs10, vs38, 2 xxspltw vs11, vs38, 3 xxspltw vs12, vs39, 0 xxspltw vs13, vs39, 1 xxspltw vs14, vs39, 2 xxspltw vs15, vs39, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * 
alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=2 mr T2, T1 // N=2 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs40, 0 xxspltw vs9, vs40, 1 xxspltw vs10, vs40, 2 xxspltw vs11, vs40, 3 xxspltw vs12, vs41, 0 xxspltw vs13, vs41, 1 xxspltw vs14, vs41, 2 xxspltw vs15, vs41, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=2 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs42, 0 xxspltw vs9, vs42, 1 xxspltw vs10, vs42, 2 xxspltw vs11, vs42, 3 xxspltw vs12, vs43, 0 xxspltw vs13, vs43, 1 xxspltw vs14, vs43, 2 xxspltw vs15, vs43, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // 
r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=3 mr T2, T1 // N=3 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs44, 0 xxspltw vs9, vs44, 1 xxspltw vs10, vs44, 2 xxspltw vs11, vs44, 3 xxspltw vs12, vs45, 0 xxspltw vs13, vs45, 1 xxspltw vs14, vs45, 2 xxspltw vs15, vs45, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=3 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs46, 0 xxspltw vs9, vs46, 1 xxspltw vs10, vs46, 2 xxspltw vs11, vs46, 3 xxspltw vs12, vs47, 0 xxspltw vs13, vs47, 1 xxspltw vs14, vs47, 2 xxspltw vs15, vs47, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, 
vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC addi CO, CO, 32 .endm /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ .macro LOAD4x2_1 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 lxvw4x vs25, o16, BO // load b2, b3 xxspltw vs12, vs25, 0 xxspltw vs13, vs25, 1 xxspltw vs14, vs25, 2 xxspltw vs15, vs25, 3 addi BO, BO, 32 .endm .macro KERNEL4x2_I1 lxvw4x vs4, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs16, vs24, 0 xxspltw vs17, vs24, 1 xxspltw vs18, vs24, 2 xxspltw vs19, vs24, 3 lxvw4x vs25, o16, BO // load b2, b3 xxspltw vs20, vs25, 0 xxspltw vs21, vs25, 1 xxspltw vs22, vs25, 2 xxspltw vs23, vs25, 3 addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x2_1 lxvw4x vs4, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs16, vs24, 0 xxspltw vs17, vs24, 1 xxspltw vs18, vs24, 2 xxspltw vs19, vs24, 3 lxvw4x vs25, o16, BO // load b2, b3 xxspltw vs20, vs25, 0 xxspltw vs21, vs25, 1 xxspltw vs22, vs25, 2 xxspltw vs23, vs25, 3 addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x2_2 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 lxvw4x vs25, o16, BO // load b2, b3 xxspltw vs12, vs25, 0 xxspltw vs13, vs25, 1 xxspltw vs14, vs25, 2 xxspltw vs15, vs25, 3 addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs36, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs37, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs38, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs39, vs4, vs23 // 
a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x2_E2 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs36, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs37, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs38, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x2_SUBI1 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 lxvw4x vs25, o16, BO // load b2, b3 xxspltw vs12, vs25, 0 xxspltw vs13, vs25, 1 xxspltw vs14, vs25, 2 xxspltw vs15, vs25, 3 addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro KERNEL4x2_SUB1 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 lxvw4x vs25, o16, BO // load b2, b3 xxspltw vs12, vs25, 0 xxspltw vs13, vs25, 1 xxspltw vs14, vs25, 2 xxspltw vs15, vs25, 3 addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i .endm .macro SAVE4x2 mr T1, CO // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs32, 0 xxspltw vs9, vs32, 1 xxspltw vs10, vs32, 2 xxspltw vs11, vs32, 3 xxspltw vs12, vs33, 0 xxspltw vs13, vs33, 1 xxspltw vs14, vs33, 2 xxspltw vs15, vs33, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r 
xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=1 mr T2, T1 // N=1 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs34, 0 xxspltw vs9, vs34, 1 xxspltw vs10, vs34, 2 xxspltw vs11, vs34, 3 xxspltw vs12, vs35, 0 xxspltw vs13, vs35, 1 xxspltw vs14, vs35, 2 xxspltw vs15, vs35, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=2 mr T2, T1 // N=2 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs36, 0 xxspltw vs9, vs36, 1 xxspltw vs10, vs36, 2 xxspltw vs11, vs36, 3 xxspltw vs12, vs37, 0 xxspltw vs13, vs37, 1 xxspltw vs14, vs37, 2 xxspltw vs15, vs37, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r 
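// The four products above implement the complex scaling
//   (r_r + i*r_i) * (alpha_r + i*alpha_i)
// so the combine below forms real = r_r*alpha_r - r_i*alpha_i and
// imag = r_r*alpha_i + r_i*alpha_r before the result is merged with the loaded C values.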
xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=3 mr T2, T1 // N=3 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs38, 0 xxspltw vs9, vs38, 1 xxspltw vs10, vs38, 2 xxspltw vs11, vs38, 3 xxspltw vs12, vs39, 0 xxspltw vs13, vs39, 1 xxspltw vs14, vs39, 2 xxspltw vs15, vs39, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC addi CO, CO, 16 .endm /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ .macro LOAD4x1_1 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 // load b0_r lxsspx vs9, o4, T1 // load b0_i addi T1, T1,8 lxsspx vs10, o0, T1 // load b1_r lxsspx vs11, o4, T1 // load b1_i addi T1, T1,8 lxsspx vs12, o0, T1 // load b2_r lxsspx vs13, o4, T1 // load b2_i addi T1, T1,8 lxsspx vs14, o0, T1 // load b3_r lxsspx vs15, o4, T1 // load b3_i addi BO, BO, 32 .endm .macro KERNEL4x1_I1 lxsspx vs4, o0, AO // load a0_r lxsspx vs5, o4, AO // load a0_i addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 // load b0_r lxsspx vs17, o4, T1 // load b0_i addi T1, T1,8 lxsspx vs18, o0, T1 // load b1_r lxsspx vs19, o4, T1 // load b1_i addi T1, T1,8 lxsspx vs20, o0, T1 // load b2_r lxsspx vs21, o4, T1 // load b2_i addi T1, T1,8 lxsspx vs22, o0, T1 // load b3_r lxsspx vs23, o4, T1 // load b3_i addi BO, BO, 32 xsmuldp vs32, vs0, vs8 // a0_r*b0_r xsmuldp vs33, vs1, vs9 // a0_i*b0_i xsmuldp vs34, vs0, vs9 // a0_r*b0_i xsmuldp vs35, vs1, vs8 // a0_i*b0_r xsmuldp vs36, vs0, vs10 // a0_r*b1_r xsmuldp vs37, vs1, vs11 // a0_i*b1_i xsmuldp vs38, vs0, vs11 // 
a0_r*b1_i xsmuldp vs39, vs1, vs10 // a0_i*b1_r xsmuldp vs40, vs0, vs12 // a0_r*b2_r xsmuldp vs41, vs1, vs13 // a0_i*b2_i xsmuldp vs42, vs0, vs13 // a0_r*b2_i xsmuldp vs43, vs1, vs12 // a0_i*b2_r xsmuldp vs44, vs0, vs14 // a0_r*b3_r xsmuldp vs45, vs1, vs15 // a0_i*b3_i xsmuldp vs46, vs0, vs15 // a0_r*b3_i xsmuldp vs47, vs1, vs14 // a0_i*b3_r .endm .macro KERNEL4x1_1 lxsspx vs4, o0, AO // load a0_r lxsspx vs5, o4, AO // load a0_i addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 // load b0_r lxsspx vs17, o4, T1 // load b0_i addi T1, T1,8 lxsspx vs18, o0, T1 // load b1_r lxsspx vs19, o4, T1 // load b1_i addi T1, T1,8 lxsspx vs20, o0, T1 // load b2_r lxsspx vs21, o4, T1 // load b2_i addi T1, T1,8 lxsspx vs22, o0, T1 // load b3_r lxsspx vs23, o4, T1 // load b3_i addi BO, BO, 32 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r xsmaddadp vs33, vs1, vs9 // a0_i*b0_i xsmaddadp vs34, vs0, vs9 // a0_r*b0_i xsmaddadp vs35, vs1, vs8 // a0_i*b0_r xsmaddadp vs36, vs0, vs10 // a0_r*b1_r xsmaddadp vs37, vs1, vs11 // a0_i*b1_i xsmaddadp vs38, vs0, vs11 // a0_r*b1_i xsmaddadp vs39, vs1, vs10 // a0_i*b1_r xsmaddadp vs40, vs0, vs12 // a0_r*b2_r xsmaddadp vs41, vs1, vs13 // a0_i*b2_i xsmaddadp vs42, vs0, vs13 // a0_r*b2_i xsmaddadp vs43, vs1, vs12 // a0_i*b2_r xsmaddadp vs44, vs0, vs14 // a0_r*b3_r xsmaddadp vs45, vs1, vs15 // a0_i*b3_i xsmaddadp vs46, vs0, vs15 // a0_r*b3_i xsmaddadp vs47, vs1, vs14 // a0_i*b3_r .endm .macro KERNEL4x1_2 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 // load b0_r lxsspx vs9, o4, T1 // load b0_i addi T1, T1,8 lxsspx vs10, o0, T1 // load b1_r lxsspx vs11, o4, T1 // load b1_i addi T1, T1,8 lxsspx vs12, o0, T1 // load b2_r lxsspx vs13, o4, T1 // load b2_i addi T1, T1,8 lxsspx vs14, o0, T1 // load b3_r lxsspx vs15, o4, T1 // load b3_i addi BO, BO, 32 xsmaddadp vs32, vs4, vs16 // a4_r*b0_r xsmaddadp vs33, vs5, vs17 // a4_i*b0_i xsmaddadp vs34, vs4, vs17 // a4_r*b0_i xsmaddadp vs35, vs5, vs16 // a4_i*b0_r xsmaddadp vs36, vs4, vs18 // a4_r*b1_r xsmaddadp vs37, vs5, vs19 // a4_i*b1_i xsmaddadp vs38, vs4, vs19 // a4_r*b1_i xsmaddadp vs39, vs5, vs18 // a4_i*b1_r xsmaddadp vs40, vs4, vs20 // a4_r*b2_r xsmaddadp vs41, vs5, vs21 // a4_i*b2_i xsmaddadp vs42, vs4, vs21 // a4_r*b2_i xsmaddadp vs43, vs5, vs20 // a4_i*b2_r xsmaddadp vs44, vs4, vs22 // a4_r*b3_r xsmaddadp vs45, vs5, vs23 // a4_i*b3_i xsmaddadp vs46, vs4, vs23 // a4_r*b3_i xsmaddadp vs47, vs5, vs22 // a4_i*b3_r .endm .macro KERNEL4x1_E2 xsmaddadp vs32, vs4, vs16 // a4_r*b0_r xsmaddadp vs33, vs5, vs17 // a4_i*b0_i xsmaddadp vs34, vs4, vs17 // a4_r*b0_i xsmaddadp vs35, vs5, vs16 // a4_i*b0_r xsmaddadp vs36, vs4, vs18 // a4_r*b1_r xsmaddadp vs37, vs5, vs19 // a4_i*b1_i xsmaddadp vs38, vs4, vs19 // a4_r*b1_i xsmaddadp vs39, vs5, vs18 // a4_i*b1_r xsmaddadp vs40, vs4, vs20 // a4_r*b2_r xsmaddadp vs41, vs5, vs21 // a4_i*b2_i xsmaddadp vs42, vs4, vs21 // a4_r*b2_i xsmaddadp vs43, vs5, vs20 // a4_i*b2_r xsmaddadp vs44, vs4, vs22 // a4_r*b3_r xsmaddadp vs45, vs5, vs23 // a4_i*b3_i xsmaddadp vs46, vs4, vs23 // a4_r*b3_i xsmaddadp vs47, vs5, vs22 // a4_i*b3_r .endm .macro KERNEL4x1_SUBI1 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 // load b0_r lxsspx vs9, o4, T1 // load b0_i addi T1, T1,8 lxsspx vs10, o0, T1 // load b1_r lxsspx vs11, o4, T1 // load b1_i addi T1, T1,8 lxsspx vs12, o0, T1 // load b2_r lxsspx vs13, o4, T1 // load b2_i addi T1, T1,8 lxsspx vs14, o0, T1 // load b3_r lxsspx vs15, o4, T1 // load b3_i addi BO, BO, 32 xsmuldp 
vs32, vs0, vs8 // a0_r*b0_r xsmuldp vs33, vs1, vs9 // a0_i*b0_i xsmuldp vs34, vs0, vs9 // a0_r*b0_i xsmuldp vs35, vs1, vs8 // a0_i*b0_r xsmuldp vs36, vs0, vs10 // a0_r*b1_r xsmuldp vs37, vs1, vs11 // a0_i*b1_i xsmuldp vs38, vs0, vs11 // a0_r*b1_i xsmuldp vs39, vs1, vs10 // a0_i*b1_r xsmuldp vs40, vs0, vs12 // a0_r*b2_r xsmuldp vs41, vs1, vs13 // a0_i*b2_i xsmuldp vs42, vs0, vs13 // a0_r*b2_i xsmuldp vs43, vs1, vs12 // a0_i*b2_r xsmuldp vs44, vs0, vs14 // a0_r*b3_r xsmuldp vs45, vs1, vs15 // a0_i*b3_i xsmuldp vs46, vs0, vs15 // a0_r*b3_i xsmuldp vs47, vs1, vs14 // a0_i*b3_r .endm .macro KERNEL4x1_SUB1 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 // load b0_r lxsspx vs9, o4, T1 // load b0_i addi T1, T1,8 lxsspx vs10, o0, T1 // load b1_r lxsspx vs11, o4, T1 // load b1_i addi T1, T1,8 lxsspx vs12, o0, T1 // load b2_r lxsspx vs13, o4, T1 // load b2_i addi T1, T1,8 lxsspx vs14, o0, T1 // load b3_r lxsspx vs15, o4, T1 // load b3_i addi BO, BO, 32 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r xsmaddadp vs33, vs1, vs9 // a0_i*b0_i xsmaddadp vs34, vs0, vs9 // a0_r*b0_i xsmaddadp vs35, vs1, vs8 // a0_i*b0_r xsmaddadp vs36, vs0, vs10 // a0_r*b1_r xsmaddadp vs37, vs1, vs11 // a0_i*b1_i xsmaddadp vs38, vs0, vs11 // a0_r*b1_i xsmaddadp vs39, vs1, vs10 // a0_i*b1_r xsmaddadp vs40, vs0, vs12 // a0_r*b2_r xsmaddadp vs41, vs1, vs13 // a0_i*b2_i xsmaddadp vs42, vs0, vs13 // a0_r*b2_i xsmaddadp vs43, vs1, vs12 // a0_i*b2_r xsmaddadp vs44, vs0, vs14 // a0_r*b3_r xsmaddadp vs45, vs1, vs15 // a0_i*b3_i xsmaddadp vs46, vs0, vs15 // a0_r*b3_i xsmaddadp vs47, vs1, vs14 // a0_i*b3_r .endm .macro SAVE4x1 mr T1, CO // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 #ifndef TRMMKERNEL lxsspx vs0, o0, T2 // load c0_r lxsspx vs1, o4, T2 // load c0_i #else xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 #endif XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xsadddp vs0, vs0, vs20 xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r stxsspx vs1, o4, T2 // store c0_i addi T2, T2, 8 add T1, T1, LDC // N=1 mr T2, T1 // N=1 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 #ifndef TRMMKERNEL lxsspx vs0, o0, T2 // load c0_r lxsspx vs1, o4, T2 // load c0_i #else xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 #endif XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xsadddp vs0, vs0, vs20 xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r stxsspx vs1, o4, T2 // store c0_i addi T2, T2, 8 add T1, T1, LDC // N=2 mr T2, T1 // N=2 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 #ifndef TRMMKERNEL lxsspx vs0, o0, T2 // load c0_r lxsspx vs1, o4, T2 // load c0_i #else xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 #endif XSFADD_R1 
vs4, vs4, vs40 // add a0_r * b0_r XSFADD_I1 vs5, vs5, vs43 // add a0_r * b0_i XSFADD_R2 vs4, vs4, vs41 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs42 // add a0_i * b0_r xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xsadddp vs0, vs0, vs20 xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r stxsspx vs1, o4, T2 // store c0_i addi T2, T2, 8 add T1, T1, LDC // N=3 mr T2, T1 // N=3 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 #ifndef TRMMKERNEL lxsspx vs0, o0, T2 // load c0_r lxsspx vs1, o4, T2 // load c0_i #else xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 #endif XSFADD_R1 vs4, vs4, vs44 // add a0_r * b0_r XSFADD_I1 vs5, vs5, vs47 // add a0_r * b0_i XSFADD_R2 vs4, vs4, vs45 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs46 // add a0_i * b0_r xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xsadddp vs0, vs0, vs20 xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r stxsspx vs1, o4, T2 // store c0_i addi T2, T2, 8 add T1, T1, LDC addi CO, CO, 8 .endm /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ .macro LOAD2x8_1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 16 .endm .macro KERNEL2x8_I1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 lxvw4x vs6, o32, AO // load a4, a5 lxvw4x vs7, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs16, vs24, 0 xxspltw vs17, vs24, 1 xxspltw vs18, vs24, 2 xxspltw vs19, vs24, 3 addi BO, BO, 16 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x8_1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 lxvw4x vs6, o32, AO // load a4, 
a5 lxvw4x vs7, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs16, vs24, 0 xxspltw vs17, vs24, 1 xxspltw vs18, vs24, 2 xxspltw vs19, vs24, 3 addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x8_2 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 16 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x8_E2 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i 
xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x8_SUBI1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 16 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x8_SUB1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro SAVE2x8 mr T1, CO // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif 
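// With TRMMKERNEL defined the C tile is overwritten rather than accumulated,
// so vs0 starts from zero instead of the loaded c0,c1 pair; the same
// #ifndef TRMMKERNEL pattern repeats for every M/N sub-block of the SAVE* macros.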
xxspltw vs8, vs32, 0 xxspltw vs9, vs32, 1 xxspltw vs10, vs32, 2 xxspltw vs11, vs32, 3 xxspltw vs12, vs33, 0 xxspltw vs13, vs33, 1 xxspltw vs14, vs33, 2 xxspltw vs15, vs33, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs34, 0 xxspltw vs9, vs34, 1 xxspltw vs10, vs34, 2 xxspltw vs11, vs34, 3 xxspltw vs12, vs35, 0 xxspltw vs13, vs35, 1 xxspltw vs14, vs35, 2 xxspltw vs15, vs35, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=4 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs36, 0 xxspltw vs9, vs36, 1 xxspltw vs10, vs36, 2 xxspltw vs11, vs36, 3 xxspltw vs12, vs37, 0 xxspltw vs13, vs37, 1 xxspltw vs14, vs37, 2 xxspltw vs15, vs37, 3 XVFADD_R1 vs4, vs4, vs8 // add 
a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=6 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs38, 0 xxspltw vs9, vs38, 1 xxspltw vs10, vs38, 2 xxspltw vs11, vs38, 3 xxspltw vs12, vs39, 0 xxspltw vs13, vs39, 1 xxspltw vs14, vs39, 2 xxspltw vs15, vs39, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=1 mr T2, T1 // N=1 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs40, 0 xxspltw vs9, vs40, 1 xxspltw vs10, vs40, 2 xxspltw vs11, vs40, 3 xxspltw vs12, vs41, 0 xxspltw vs13, vs41, 1 xxspltw vs14, vs41, 2 xxspltw vs15, vs41, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add 
a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=1 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs42, 0 xxspltw vs9, vs42, 1 xxspltw vs10, vs42, 2 xxspltw vs11, vs42, 3 xxspltw vs12, vs43, 0 xxspltw vs13, vs43, 1 xxspltw vs14, vs43, 2 xxspltw vs15, vs43, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=1 M=4 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs44, 0 xxspltw vs9, vs44, 1 xxspltw vs10, vs44, 2 xxspltw vs11, vs44, 3 xxspltw vs12, vs45, 0 xxspltw vs13, vs45, 1 xxspltw vs14, vs45, 2 xxspltw vs15, vs45, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, 
alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=1 M=6 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs46, 0 xxspltw vs9, vs46, 1 xxspltw vs10, vs46, 2 xxspltw vs11, vs46, 3 xxspltw vs12, vs47, 0 xxspltw vs13, vs47, 1 xxspltw vs14, vs47, 2 xxspltw vs15, vs47, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC addi CO, CO, 64 .endm /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ .macro LOAD2x4_1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 16 .endm .macro KERNEL2x4_I1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs16, vs24, 0 xxspltw vs17, vs24, 1 xxspltw vs18, vs24, 2 xxspltw vs19, vs24, 3 addi BO, BO, 16 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 
// a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x4_1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs16, vs24, 0 xxspltw vs17, vs24, 1 xxspltw vs18, vs24, 2 xxspltw vs19, vs24, 3 addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x4_2 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 16 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x4_E2 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x4_SUBI1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 16 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x4_SUB1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 
xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro SAVE2x4 mr T1, CO // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs32, 0 xxspltw vs9, vs32, 1 xxspltw vs10, vs32, 2 xxspltw vs11, vs32, 3 xxspltw vs12, vs33, 0 xxspltw vs13, vs33, 1 xxspltw vs14, vs33, 2 xxspltw vs15, vs33, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs34, 0 xxspltw vs9, vs34, 1 xxspltw vs10, vs34, 2 xxspltw vs11, vs34, 3 xxspltw vs12, vs35, 0 xxspltw vs13, vs35, 1 xxspltw vs14, vs35, 2 xxspltw vs15, vs35, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * 
alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=1 mr T2, T1 // N=1 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs36, 0 xxspltw vs9, vs36, 1 xxspltw vs10, vs36, 2 xxspltw vs11, vs36, 3 xxspltw vs12, vs37, 0 xxspltw vs13, vs37, 1 xxspltw vs14, vs37, 2 xxspltw vs15, vs37, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=1 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs38, 0 xxspltw vs9, vs38, 1 xxspltw vs10, vs38, 2 xxspltw vs11, vs38, 3 xxspltw vs12, vs39, 0 xxspltw vs13, vs39, 1 xxspltw vs14, vs39, 2 xxspltw vs15, vs39, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, 
vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC addi CO, CO, 32 .endm /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ .macro LOAD2x2_1 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 16 .endm .macro KERNEL2x2_I1 lxvw4x vs4, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs16, vs24, 0 xxspltw vs17, vs24, 1 xxspltw vs18, vs24, 2 xxspltw vs19, vs24, 3 addi BO, BO, 16 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x2_1 lxvw4x vs4, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs16, vs24, 0 xxspltw vs17, vs24, 1 xxspltw vs18, vs24, 2 xxspltw vs19, vs24, 3 addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x2_2 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 16 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x2_E2 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x2_SUBI1 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 16 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro KERNEL2x2_SUB1 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, 
a0_i*b1_i, a1_r*b1_i, a1_i*b1_i .endm .macro SAVE2x2 mr T1, CO // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs32, 0 xxspltw vs9, vs32, 1 xxspltw vs10, vs32, 2 xxspltw vs11, vs32, 3 xxspltw vs12, vs33, 0 xxspltw vs13, vs33, 1 xxspltw vs14, vs33, 2 xxspltw vs15, vs33, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC // N=1 mr T2, T1 // N=1 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs34, 0 xxspltw vs9, vs34, 1 xxspltw vs10, vs34, 2 xxspltw vs11, vs34, 3 xxspltw vs12, vs35, 0 xxspltw vs13, vs35, 1 xxspltw vs14, vs35, 2 xxspltw vs15, vs35, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC addi CO, CO, 16 .endm 
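/**********************************************************************************************
* Structure shared by every "Macros for N=n and M=m" group in this file:
*
*   LOADnxm_1        preload the first A/B slice into the working registers
*   KERNELnxm_I1     first pipelined step: loads the second slice and initializes
*                    the accumulators with xvmulsp / xsmuldp
*   KERNELnxm_1/_2   alternating pipelined steps that ping-pong between the two
*                    buffered register sets while accumulating with xvmaddasp / xsmaddadp
*   KERNELnxm_E2     drains the last preloaded slice without issuing new loads
*   KERNELnxm_SUBI1  non-pipelined initialization for the k remainder
*   KERNELnxm_SUB1   non-pipelined accumulation for the k remainder
*   SAVEnxm          combines the partial products, applies alpha and updates C
*
* B is loaded as packed (real, imag) words and broadcast with xxspltw so that each
* b_r / b_i scalar can multiply a whole vector of packed A elements at once.
*
* Per-element update performed by the SAVE* macros, written as a C-style sketch for
* readability only (plain, non-conjugated variant assumed; the XVFADD_* / XSFADD_*
* helper macros defined outside this section select the signs for the conjugated cases):
*
*     res_r = a_r*b_r - a_i*b_i;               // real part of a*b
*     res_i = a_r*b_i + a_i*b_r;               // imaginary part of a*b
*     c_r  += res_r*alpha_r - res_i*alpha_i;   // C += alpha * (a*b), real
*     c_i  += res_r*alpha_i + res_i*alpha_r;   // C += alpha * (a*b), imag
**********************************************************************************************/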
/********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ .macro LOAD2x1_1 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 // load b0_r lxsspx vs9, o4, T1 // load b0_i addi T1, T1,8 lxsspx vs10, o0, T1 // load b1_r lxsspx vs11, o4, T1 // load b1_i addi BO, BO, 16 .endm .macro KERNEL2x1_I1 lxsspx vs4, o0, AO // load a0_r lxsspx vs5, o4, AO // load a0_i addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 // load b0_r lxsspx vs17, o4, T1 // load b0_i addi T1, T1,8 lxsspx vs18, o0, T1 // load b1_r lxsspx vs19, o4, T1 // load b1_i addi BO, BO, 16 xsmuldp vs32, vs0, vs8 // a0_r*b0_r xsmuldp vs33, vs1, vs9 // a0_i*b0_i xsmuldp vs34, vs0, vs9 // a0_r*b0_i xsmuldp vs35, vs1, vs8 // a0_i*b0_r xsmuldp vs36, vs0, vs10 // a0_r*b1_r xsmuldp vs37, vs1, vs11 // a0_i*b1_i xsmuldp vs38, vs0, vs11 // a0_r*b1_i xsmuldp vs39, vs1, vs10 // a0_i*b1_r .endm .macro KERNEL2x1_1 lxsspx vs4, o0, AO // load a0_r lxsspx vs5, o4, AO // load a0_i addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 // load b0_r lxsspx vs17, o4, T1 // load b0_i addi T1, T1,8 lxsspx vs18, o0, T1 // load b1_r lxsspx vs19, o4, T1 // load b1_i addi BO, BO, 16 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r xsmaddadp vs33, vs1, vs9 // a0_i*b0_i xsmaddadp vs34, vs0, vs9 // a0_r*b0_i xsmaddadp vs35, vs1, vs8 // a0_i*b0_r xsmaddadp vs36, vs0, vs10 // a0_r*b1_r xsmaddadp vs37, vs1, vs11 // a0_i*b1_i xsmaddadp vs38, vs0, vs11 // a0_r*b1_i xsmaddadp vs39, vs1, vs10 // a0_i*b1_r .endm .macro KERNEL2x1_2 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 // load b0_r lxsspx vs9, o4, T1 // load b0_i addi T1, T1,8 lxsspx vs10, o0, T1 // load b1_r lxsspx vs11, o4, T1 // load b1_i addi BO, BO, 16 xsmaddadp vs32, vs4, vs16 // a4_r*b0_r xsmaddadp vs33, vs5, vs17 // a4_i*b0_i xsmaddadp vs34, vs4, vs17 // a4_r*b0_i xsmaddadp vs35, vs5, vs16 // a4_i*b0_r xsmaddadp vs36, vs4, vs18 // a4_r*b1_r xsmaddadp vs37, vs5, vs19 // a4_i*b1_i xsmaddadp vs38, vs4, vs19 // a4_r*b1_i xsmaddadp vs39, vs5, vs18 // a4_i*b1_r .endm .macro KERNEL2x1_E2 xsmaddadp vs32, vs4, vs16 // a4_r*b0_r xsmaddadp vs33, vs5, vs17 // a4_i*b0_i xsmaddadp vs34, vs4, vs17 // a4_r*b0_i xsmaddadp vs35, vs5, vs16 // a4_i*b0_r xsmaddadp vs36, vs4, vs18 // a4_r*b1_r xsmaddadp vs37, vs5, vs19 // a4_i*b1_i xsmaddadp vs38, vs4, vs19 // a4_r*b1_i xsmaddadp vs39, vs5, vs18 // a4_i*b1_r .endm .macro KERNEL2x1_SUBI1 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 // load b0_r lxsspx vs9, o4, T1 // load b0_i addi T1, T1,8 lxsspx vs10, o0, T1 // load b1_r lxsspx vs11, o4, T1 // load b1_i addi BO, BO, 16 xsmuldp vs32, vs0, vs8 // a0_r*b0_r xsmuldp vs33, vs1, vs9 // a0_i*b0_i xsmuldp vs34, vs0, vs9 // a0_r*b0_i xsmuldp vs35, vs1, vs8 // a0_i*b0_r xsmuldp vs36, vs0, vs10 // a0_r*b1_r xsmuldp vs37, vs1, vs11 // a0_i*b1_i xsmuldp vs38, vs0, vs11 // a0_r*b1_i xsmuldp vs39, vs1, vs10 // a0_i*b1_r .endm .macro KERNEL2x1_SUB1 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 // load b0_r lxsspx vs9, o4, T1 // load b0_i addi T1, T1,8 lxsspx vs10, o0, T1 // load b1_r lxsspx vs11, o4, T1 // load b1_i addi BO, BO, 16 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r xsmaddadp vs33, vs1, vs9 // a0_i*b0_i xsmaddadp vs34, vs0, vs9 // a0_r*b0_i xsmaddadp vs35, vs1, vs8 // 
a0_i*b0_r xsmaddadp vs36, vs0, vs10 // a0_r*b1_r xsmaddadp vs37, vs1, vs11 // a0_i*b1_i xsmaddadp vs38, vs0, vs11 // a0_r*b1_i xsmaddadp vs39, vs1, vs10 // a0_i*b1_r .endm .macro SAVE2x1 mr T1, CO // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 #ifndef TRMMKERNEL lxsspx vs0, o0, T2 // load c0_r lxsspx vs1, o4, T2 // load c0_i #else xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 #endif XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xsadddp vs0, vs0, vs20 xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r stxsspx vs1, o4, T2 // store c0_i addi T2, T2, 8 add T1, T1, LDC // N=1 mr T2, T1 // N=1 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 #ifndef TRMMKERNEL lxsspx vs0, o0, T2 // load c0_r lxsspx vs1, o4, T2 // load c0_i #else xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 #endif XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xsadddp vs0, vs0, vs20 xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r stxsspx vs1, o4, T2 // store c0_i addi T2, T2, 8 add T1, T1, LDC addi CO, CO, 8 .endm /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ .macro LOAD1x8_1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 8 .endm .macro KERNEL1x8_I1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 lxvw4x vs6, o32, AO // load a4, a5 lxvw4x vs7, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs16, vs24, 0 xxspltw vs17, vs24, 1 xxspltw vs18, vs24, 2 xxspltw vs19, vs24, 3 addi BO, BO, 8 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x8_1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 lxvw4x vs6, o32, AO // load a4, a5 lxvw4x vs7, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs16, vs24, 0 xxspltw vs17, vs24, 1 xxspltw vs18, 
vs24, 2 xxspltw vs19, vs24, 3 addi BO, BO, 8 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x8_2 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 8 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x8_E2 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x8_SUBI1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 8 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x8_SUB1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 lxvw4x vs2, o32, AO // load a4, a5 lxvw4x vs3, o48, AO // load a6, a7 addi AO, AO, 64 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 8 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, 
a1_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro SAVE1x8 mr T1, CO // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs32, 0 xxspltw vs9, vs32, 1 xxspltw vs10, vs32, 2 xxspltw vs11, vs32, 3 xxspltw vs12, vs33, 0 xxspltw vs13, vs33, 1 xxspltw vs14, vs33, 2 xxspltw vs15, vs33, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs34, 0 xxspltw vs9, vs34, 1 xxspltw vs10, vs34, 2 xxspltw vs11, vs34, 3 xxspltw vs12, vs35, 0 xxspltw vs13, vs35, 1 xxspltw vs14, vs35, 2 xxspltw vs15, vs35, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i 
xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=4 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs36, 0 xxspltw vs9, vs36, 1 xxspltw vs10, vs36, 2 xxspltw vs11, vs36, 3 xxspltw vs12, vs37, 0 xxspltw vs13, vs37, 1 xxspltw vs14, vs37, 2 xxspltw vs15, vs37, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=6 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs38, 0 xxspltw vs9, vs38, 1 xxspltw vs10, vs38, 2 xxspltw vs11, vs38, 3 xxspltw vs12, vs39, 0 xxspltw vs13, vs39, 1 xxspltw vs14, vs39, 2 xxspltw vs15, vs39, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC addi CO, CO, 64 .endm 
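/* A scalar sketch of the alpha scaling and C update performed at the end of
 * SAVE1x8 above (the narrower SAVE macros below repeat the same pattern).
 * Illustrative only: the helper name is hypothetical, res[] stands for the
 * reduced accumulator pair held in vs4/vs5 (or vs6/vs7), and alpha_r/alpha_i
 * correspond to the splatted alpha_sr/alpha_si values.
 *
 *     static void scale_and_update(float c[2], const float res[2],
 *                                  float alpha_r, float alpha_i)
 *     {
 *         float t_r = res[0] * alpha_r - res[1] * alpha_i;  // xvmulsp + xvsubsp
 *         float t_i = res[0] * alpha_i + res[1] * alpha_r;  // xvmulsp + xvaddsp
 *         c[0] += t_r;   // with TRMMKERNEL the previous C is treated as zero
 *         c[1] += t_i;
 *     }
 */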
/********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ .macro LOAD1x4_1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 8 .endm .macro KERNEL1x4_I1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs16, vs24, 0 xxspltw vs17, vs24, 1 xxspltw vs18, vs24, 2 xxspltw vs19, vs24, 3 addi BO, BO, 8 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x4_1 lxvw4x vs4, o0, AO // load a0, a1 lxvw4x vs5, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs16, vs24, 0 xxspltw vs17, vs24, 1 xxspltw vs18, vs24, 2 xxspltw vs19, vs24, 3 addi BO, BO, 8 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x4_2 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 8 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x4_E2 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x4_SUBI1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 8 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x4_SUB1 lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 addi AO, AO, 32 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 8 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro SAVE1x4 mr T1, CO // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, 
vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs32, 0 xxspltw vs9, vs32, 1 xxspltw vs10, vs32, 2 xxspltw vs11, vs32, 3 xxspltw vs12, vs33, 0 xxspltw vs13, vs33, 1 xxspltw vs14, vs33, 2 xxspltw vs15, vs33, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 // N=0 M=2 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs34, 0 xxspltw vs9, vs34, 1 xxspltw vs10, vs34, 2 xxspltw vs11, vs34, 3 xxspltw vs12, vs35, 0 xxspltw vs13, vs35, 1 xxspltw vs14, vs35, 2 xxspltw vs15, vs35, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC addi CO, CO, 32 .endm /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ .macro LOAD1x2_1 lxvw4x vs0, o0, AO // 
load a0, a1 addi AO, AO, 16 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 8 .endm .macro KERNEL1x2_I1 lxvw4x vs4, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs16, vs24, 0 xxspltw vs17, vs24, 1 xxspltw vs18, vs24, 2 xxspltw vs19, vs24, 3 addi BO, BO, 8 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x2_1 lxvw4x vs4, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs16, vs24, 0 xxspltw vs17, vs24, 1 xxspltw vs18, vs24, 2 xxspltw vs19, vs24, 3 addi BO, BO, 8 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x2_2 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 8 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x2_E2 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x2_SUBI1 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 8 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro KERNEL1x2_SUB1 lxvw4x vs0, o0, AO // load a0, a1 addi AO, AO, 16 lxvw4x vs24, o0, BO // load b0, b1 xxspltw vs8, vs24, 0 xxspltw vs9, vs24, 1 xxspltw vs10, vs24, 2 xxspltw vs11, vs24, 3 addi BO, BO, 8 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i .endm .macro SAVE1x2 mr T1, CO // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 xxlxor vs6, vs6, vs6 xxlxor vs7, vs7, vs7 #ifndef TRMMKERNEL lxvw4x vs0, o0, T2 // c0, c1 #else xxlxor vs0, vs0, vs0 #endif xxspltw vs8, vs32, 0 xxspltw vs9, vs32, 1 xxspltw vs10, vs32, 2 xxspltw vs11, vs32, 3 xxspltw vs12, vs33, 0 xxspltw vs13, vs33, 1 xxspltw vs14, vs33, 2 xxspltw vs15, vs33, 3 XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // 
r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r xxsldwi vs23, vs23, vs24, 0 // r1_i xvaddsp vs20, vs20, vs21 // r0_r, r0_i xvaddsp vs22, vs22, vs23 // r1_r, r1_i xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 stxvw4x vs0, o0, T2 // c0, c1 addi T2, T2, 16 add T1, T1, LDC addi CO, CO, 16 .endm /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ .macro LOAD1x1_1 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 // load b0_r lxsspx vs9, o4, T1 // load b0_i addi BO, BO, 8 .endm .macro KERNEL1x1_I1 lxsspx vs4, o0, AO // load a0_r lxsspx vs5, o4, AO // load a0_i addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 // load b0_r lxsspx vs17, o4, T1 // load b0_i addi BO, BO, 8 xsmuldp vs32, vs0, vs8 // a0_r*b0_r xsmuldp vs33, vs1, vs9 // a0_i*b0_i xsmuldp vs34, vs0, vs9 // a0_r*b0_i xsmuldp vs35, vs1, vs8 // a0_i*b0_r .endm .macro KERNEL1x1_1 lxsspx vs4, o0, AO // load a0_r lxsspx vs5, o4, AO // load a0_i addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 // load b0_r lxsspx vs17, o4, T1 // load b0_i addi BO, BO, 8 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r xsmaddadp vs33, vs1, vs9 // a0_i*b0_i xsmaddadp vs34, vs0, vs9 // a0_r*b0_i xsmaddadp vs35, vs1, vs8 // a0_i*b0_r .endm .macro KERNEL1x1_2 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 // load b0_r lxsspx vs9, o4, T1 // load b0_i addi BO, BO, 8 xsmaddadp vs32, vs4, vs16 // a4_r*b0_r xsmaddadp vs33, vs5, vs17 // a4_i*b0_i xsmaddadp vs34, vs4, vs17 // a4_r*b0_i xsmaddadp vs35, vs5, vs16 // a4_i*b0_r .endm .macro KERNEL1x1_E2 xsmaddadp vs32, vs4, vs16 // a4_r*b0_r xsmaddadp vs33, vs5, vs17 // a4_i*b0_i xsmaddadp vs34, vs4, vs17 // a4_r*b0_i xsmaddadp vs35, vs5, vs16 // a4_i*b0_r .endm .macro KERNEL1x1_SUBI1 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 // load b0_r lxsspx vs9, o4, T1 // load b0_i addi BO, BO, 8 xsmuldp vs32, vs0, vs8 // a0_r*b0_r xsmuldp vs33, vs1, vs9 // a0_i*b0_i xsmuldp vs34, vs0, vs9 // a0_r*b0_i xsmuldp vs35, vs1, vs8 // a0_i*b0_r .endm .macro KERNEL1x1_SUB1 lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 // load b0_r lxsspx vs9, o4, T1 // load b0_i addi BO, BO, 8 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r xsmaddadp vs33, vs1, vs9 // a0_i*b0_i xsmaddadp vs34, vs0, vs9 // a0_r*b0_i xsmaddadp vs35, vs1, vs8 // a0_i*b0_r .endm .macro SAVE1x1 mr T1, CO // N=0 mr T2, T1 // N=0 M=0 xxlxor vs4, vs4, vs4 xxlxor vs5, vs5, vs5 #ifndef TRMMKERNEL lxsspx vs0, o0, T2 // load c0_r lxsspx vs1, o4, T2 // load c0_i #else xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 #endif XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r xsadddp vs0, vs0, vs20 xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r stxsspx vs1, o4, T2 // store c0_i addi T2, T2, 8 add T1, T1, LDC addi CO, CO, 8 .endm 
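/* A minimal C model of the 1x1 complex path implemented by the KERNEL1x1 and
 * SAVE1x1 macros above, assuming the interleaved (real, imag) layout implied
 * by the o0/o4 load offsets and the non-conjugated XSFADD_R1/R2/I1/I2
 * definitions; the function name and parameter names are hypothetical.
 *
 *     static void cgemm_1x1_model(long K, const float *a, const float *b,
 *                                 float *c, double alpha_r, double alpha_i)
 *     {
 *         double acc_rr = 0.0, acc_ii = 0.0, acc_ri = 0.0, acc_ir = 0.0;
 *         for (long k = 0; k < K; k++) {          // one KERNEL1x1 step per k
 *             acc_rr += a[2*k]   * b[2*k];        // a_r*b_r -> vs32
 *             acc_ii += a[2*k+1] * b[2*k+1];      // a_i*b_i -> vs33
 *             acc_ri += a[2*k]   * b[2*k+1];      // a_r*b_i -> vs34
 *             acc_ir += a[2*k+1] * b[2*k];        // a_i*b_r -> vs35
 *         }
 *         double res_r = acc_rr - acc_ii;         // XSFADD_R1/R2 combine
 *         double res_i = acc_ri + acc_ir;         // XSFADD_I1/I2 combine
 *         c[0] += (float)(res_r * alpha_r - res_i * alpha_i);  // alpha scaling + C update
 *         c[1] += (float)(res_r * alpha_i + res_i * alpha_r);
 *     }
 */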
OpenBLAS-0.2.20/kernel/power/dasum.c000066400000000000000000000061721313527062700170430ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/28 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #include #if defined(DOUBLE) #define ABS fabs #else #error supports double only #endif #if defined(POWER8) #include "dasum_microk_power8.c" #endif #ifndef HAVE_KERNEL_16 static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) { BLASLONG i=0; FLOAT *x = x1; FLOAT temp0, temp1, temp2, temp3; FLOAT temp4, temp5, temp6, temp7; FLOAT sum0 = 0.0; FLOAT sum1 = 0.0; FLOAT sum2 = 0.0; FLOAT sum3 = 0.0; while ( i< n ) { temp0 = ABS(x[0]); temp1 = ABS(x[1]); temp2 = ABS(x[2]); temp3 = ABS(x[3]); temp4 = ABS(x[4]); temp5 = ABS(x[5]); temp6 = ABS(x[6]); temp7 = ABS(x[7]); sum0 += temp0; sum1 += temp1; sum2 += temp2; sum3 += temp3; sum0 += temp4; sum1 += temp5; sum2 += temp6; sum3 += temp7; x+=8; i+=8; } return sum0+sum1+sum2+sum3; } #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; FLOAT sumf = 0.0; BLASLONG n1; if (n <= 0 || inc_x <= 0) return(sumf); if ( inc_x == 1 ) { n1 = n & -16; if ( n1 > 0 ) { sumf = dasum_kernel_16(n1, x); i=n1; } while(i < n) { sumf += ABS(x[i]); i++; } } else { n *= inc_x; while(i < n) { sumf += ABS(x[i]); i += inc_x; } } return(sumf); } OpenBLAS-0.2.20/kernel/power/dasum_microk_power8.c000066400000000000000000000122341313527062700217070ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/28 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_16 1 static double dasum_kernel_16 (long n, double *x) { double sum; __vector double t0; __vector double t1; __vector double t2; __vector double t3; __asm__ ( "dcbt 0, %2 \n\t" "xxlxor 32, 32, 32 \n\t" "xxlxor 33, 33, 33 \n\t" "xxlxor 34, 34, 34 \n\t" "xxlxor 35, 35, 35 \n\t" "xxlxor 36, 36, 36 \n\t" "xxlxor 37, 37, 37 \n\t" "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" "lxvd2x 40, 0, %2 \n\t" "lxvd2x 41, %8, %2 \n\t" "lxvd2x 42, %9, %2 \n\t" "lxvd2x 43, %10, %2 \n\t" "lxvd2x 44, %11, %2 \n\t" "lxvd2x 45, %12, %2 \n\t" "lxvd2x 46, %13, %2 \n\t" "lxvd2x 47, %14, %2 \n\t" "addi %2, %2, 128 \n\t" "addic. %1, %1, -16 \n\t" "ble 2f \n\t" ".p2align 5 \n" "1: \n\t" "xvabsdp 48, 40 \n\t" "xvabsdp 49, 41 \n\t" "xvabsdp 50, 42 \n\t" "xvabsdp 51, 43 \n\t" "lxvd2x 40, 0, %2 \n\t" "lxvd2x 41, %8, %2 \n\t" "xvabsdp %x3, 44 \n\t" "xvabsdp %x4, 45 \n\t" "lxvd2x 42, %9, %2 \n\t" "lxvd2x 43, %10, %2 \n\t" "xvabsdp %x5, 46 \n\t" "xvabsdp %x6, 47 \n\t" "lxvd2x 44, %11, %2 \n\t" "lxvd2x 45, %12, %2 \n\t" "xvadddp 32, 32, 48 \n\t" "xvadddp 33, 33, 49 \n\t" "lxvd2x 46, %13, %2 \n\t" "lxvd2x 47, %14, %2 \n\t" "xvadddp 34, 34, 50 \n\t" "xvadddp 35, 35, 51 \n\t" "addi %2, %2, 128 \n\t" "xvadddp 36, 36, %x3 \n\t" "xvadddp 37, 37, %x4 \n\t" "addic. 
%1, %1, -16 \n\t" "xvadddp 38, 38, %x5 \n\t" "xvadddp 39, 39, %x6 \n\t" "bgt 1b \n" "2: \n\t" "xvabsdp 48, 40 \n\t" "xvabsdp 49, 41 \n\t" "xvabsdp 50, 42 \n\t" "xvabsdp 51, 43 \n\t" "xvabsdp %x3, 44 \n\t" "xvabsdp %x4, 45 \n\t" "xvabsdp %x5, 46 \n\t" "xvabsdp %x6, 47 \n\t" "xvadddp 32, 32, 48 \n\t" "xvadddp 33, 33, 49 \n\t" "xvadddp 34, 34, 50 \n\t" "xvadddp 35, 35, 51 \n\t" "xvadddp 36, 36, %x3 \n\t" "xvadddp 37, 37, %x4 \n\t" "xvadddp 38, 38, %x5 \n\t" "xvadddp 39, 39, %x6 \n\t" "xvadddp 32, 32, 33 \n\t" "xvadddp 34, 34, 35 \n\t" "xvadddp 36, 36, 37 \n\t" "xvadddp 38, 38, 39 \n\t" "xvadddp 32, 32, 34 \n\t" "xvadddp 36, 36, 38 \n\t" "xvadddp 32, 32, 36 \n\t" "xxswapd 33, 32 \n\t" "xsadddp %x0, 32, 33 \n" "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n" "#t0=%x3 t1=%x4 t2=%x5 t3=%x6" : "=d" (sum), // 0 "+r" (n), // 1 "+b" (x), // 2 "=wa" (t0), // 3 "=wa" (t1), // 4 "=wa" (t2), // 5 "=wa" (t3) // 6 : "m" (*x), "b" (16), // 8 "b" (32), // 9 "b" (48), // 10 "b" (64), // 11 "b" (80), // 12 "b" (96), // 13 "b" (112) // 14 : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48","vs49","vs50","vs51" ); return sum; } OpenBLAS-0.2.20/kernel/power/daxpy.c000066400000000000000000000065121313527062700170550ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/03/22 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #if defined(POWER8) #include "daxpy_microk_power8.c" #endif #ifndef HAVE_KERNEL_8 static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) { BLASLONG register i = 0; while(i < n) { y[i] += alpha * x[i]; y[i+1] += alpha * x[i+1]; y[i+2] += alpha * x[i+2]; y[i+3] += alpha * x[i+3]; y[i+4] += alpha * x[i+4]; y[i+5] += alpha * x[i+5]; y[i+6] += alpha * x[i+6]; y[i+7] += alpha * x[i+7]; i+=8 ; } } #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; BLASLONG ix=0,iy=0; if ( n <= 0 ) return(0); if ( (inc_x == 1) && (inc_y == 1) ) { BLASLONG n1 = n & -16; if ( n1 ) daxpy_kernel_8(n1, x, y, da); i = n1; while(i < n) { y[i] += da * x[i] ; i++ ; } return(0); } BLASLONG n1 = n & -4; while(i < n1) { FLOAT m1 = da * x[ix] ; FLOAT m2 = da * x[ix+inc_x] ; FLOAT m3 = da * x[ix+2*inc_x] ; FLOAT m4 = da * x[ix+3*inc_x] ; y[iy] += m1 ; y[iy+inc_y] += m2 ; y[iy+2*inc_y] += m3 ; y[iy+3*inc_y] += m4 ; ix += inc_x*4 ; iy += inc_y*4 ; i+=4 ; } while(i < n) { y[iy] += da * x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } return(0); } OpenBLAS-0.2.20/kernel/power/daxpy_microk_power8.c000066400000000000000000000142431313527062700217250ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/03/22 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_8 1 static void daxpy_kernel_8 (long n, double *x, double *y, double alpha) { __vector double t0; __vector double t1; __vector double t2; __vector double t3; __vector double t4; __vector double t5; __vector double t6; __vector double t7; __vector double t8; __vector double t9; __vector double t10; __vector double t11; __vector double t12; __vector double t13; __vector double t14; __vector double t15; __vector double t16; __asm__ ( "xxspltd %x4, %x22, 0 \n\t" "dcbt 0, %2 \n\t" "dcbt 0, %3 \n\t" "lxvd2x %x5, 0, %2 \n\t" "lxvd2x %x6, %23, %2 \n\t" "lxvd2x %x7, %24, %2 \n\t" "lxvd2x %x8, %25, %2 \n\t" "lxvd2x %x13, 0, %3 \n\t" "lxvd2x %x14, %23, %3 \n\t" "lxvd2x %x15, %24, %3 \n\t" "lxvd2x %x16, %25, %3 \n\t" "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" "lxvd2x %x9, 0, %2 \n\t" "lxvd2x %x10, %23, %2 \n\t" "lxvd2x %x11, %24, %2 \n\t" "lxvd2x %x12, %25, %2 \n\t" "lxvd2x %x17, 0, %3 \n\t" "lxvd2x %x18, %23, %3 \n\t" "lxvd2x %x19, %24, %3 \n\t" "lxvd2x %x20, %25, %3 \n\t" "addi %2, %2, 64 \n\t" "addi %3, %3, -64 \n\t" "addic. %1, %1, -16 \n\t" "ble 2f \n\t" ".align 5 \n" "1: \n\t" "xvmaddadp %x13, %x5, %x4 \n\t" "xvmaddadp %x14, %x6, %x4 \n\t" "lxvd2x %x5, 0, %2 \n\t" "lxvd2x %x6, %23, %2 \n\t" "stxvd2x %x13, 0, %3 \n\t" "stxvd2x %x14, %23, %3 \n\t" "xvmaddadp %x15, %x7, %x4 \n\t" "xvmaddadp %x16, %x8, %x4 \n\t" "lxvd2x %x7, %24, %2 \n\t" "lxvd2x %x8, %25, %2 \n\t" "stxvd2x %x15, %24, %3 \n\t" "stxvd2x %x16, %25, %3 \n\t" "addi %2, %2, 64 \n\t" "addi %3, %3, 128 \n\t" "lxvd2x %x13, 0, %3 \n\t" "lxvd2x %x14, %23, %3 \n\t" "lxvd2x %x15, %24, %3 \n\t" "lxvd2x %x16, %25, %3 \n\t" "addi %3, %3, -64 \n\t" "xvmaddadp %x17, %x9, %x4 \n\t" "xvmaddadp %x18, %x10, %x4 \n\t" "lxvd2x %x9, 0, %2 \n\t" "lxvd2x %x10, %23, %2 \n\t" "stxvd2x %x17, 0, %3 \n\t" "stxvd2x %x18, %23, %3 \n\t" "xvmaddadp %x19, %x11, %x4 \n\t" "xvmaddadp %x20, %x12, %x4 \n\t" "lxvd2x %x11, %24, %2 \n\t" "lxvd2x %x12, %25, %2 \n\t" "stxvd2x %x19, %24, %3 \n\t" "stxvd2x %x20, %25, %3 \n\t" "addi %2, %2, 64 \n\t" "addi %3, %3, 128 \n\t" "lxvd2x %x17, 0, %3 \n\t" "lxvd2x %x18, %23, %3 \n\t" "lxvd2x %x19, %24, %3 \n\t" "lxvd2x %x20, %25, %3 \n\t" "addi %3, %3, -64 \n\t" "addic. 
%1, %1, -16 \n\t" "bgt 1b \n" "2: \n\t" "xvmaddadp %x13, %x5, %x4 \n\t" "xvmaddadp %x14, %x6, %x4 \n\t" "xvmaddadp %x15, %x7, %x4 \n\t" "xvmaddadp %x16, %x8, %x4 \n\t" "xvmaddadp %x17, %x9, %x4 \n\t" "xvmaddadp %x18, %x10, %x4 \n\t" "xvmaddadp %x19, %x11, %x4 \n\t" "xvmaddadp %x20, %x12, %x4 \n\t" "stxvd2x %x13, 0, %3 \n\t" "stxvd2x %x14, %23, %3 \n\t" "stxvd2x %x15, %24, %3 \n\t" "stxvd2x %x16, %25, %3 \n\t" "addi %3, %3, 64 \n\t" "stxvd2x %x17, 0, %3 \n\t" "stxvd2x %x18, %23, %3 \n\t" "stxvd2x %x19, %24, %3 \n\t" "stxvd2x %x20, %25, %3 \n" "#n=%1 x=%21=%2 y=%0=%3 alpha=%22 o16=%23 o32=%24 o48=%25\n" "#t0=%x4 t1=%x5 t2=%x6 t3=%x7 t4=%x8 t5=%x9 t6=%x10 t7=%x11 t8=%x12 t9=%x13 t10=%x14 t11=%x15 t12=%x16 t13=%x17 t14=%x18 t15=%x19 t16=%x20" : "+m" (*y), "+r" (n), // 1 "+b" (x), // 2 "+b" (y), // 3 "=wa" (t0), // 4 "=wa" (t1), // 5 "=wa" (t2), // 6 "=wa" (t3), // 7 "=wa" (t4), // 8 "=wa" (t5), // 9 "=wa" (t6), // 10 "=wa" (t7), // 11 "=wa" (t8), // 12 "=wa" (t9), // 13 "=wa" (t10), // 14 "=wa" (t11), // 15 "=wa" (t12), // 16 "=wa" (t13), // 17 "=wa" (t14), // 18 "=wa" (t15), // 19 "=wa" (t16) // 20 : "m" (*x), "d" (alpha), // 22 "b" (16), // 23 "b" (32), // 24 "b" (48) // 25 : "cr0" ); } OpenBLAS-0.2.20/kernel/power/dcopy.c000066400000000000000000000056121313527062700170460ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/03/25 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #if defined(POWER8) #include "dcopy_microk_power8.c" #endif #ifndef HAVE_KERNEL_32 static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { BLASLONG i=0; FLOAT f0, f1, f2, f3, f4, f5, f6, f7; FLOAT *x1=x; FLOAT *y1=y; while ( i 0 ) { dcopy_kernel_32(n1, x, y); i=n1; } while(i < n) { y[i] = x[i] ; i++ ; } } else { while(i < n) { y[iy] = x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } } return(0); } OpenBLAS-0.2.20/kernel/power/dcopy_microk_power8.c000066400000000000000000000120041313527062700217070ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/25 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_32 1 static void dcopy_kernel_32 (long n, double *x, double *y) { __asm__ ( "lxvd2x 32, 0, %2 \n\t" "lxvd2x 33, %5, %2 \n\t" "lxvd2x 34, %6, %2 \n\t" "lxvd2x 35, %7, %2 \n\t" "lxvd2x 36, %8, %2 \n\t" "lxvd2x 37, %9, %2 \n\t" "lxvd2x 38, %10, %2 \n\t" "lxvd2x 39, %11, %2 \n\t" "addi %2, %2, 128 \n\t" "lxvd2x 40, 0, %2 \n\t" "lxvd2x 41, %5, %2 \n\t" "lxvd2x 42, %6, %2 \n\t" "lxvd2x 43, %7, %2 \n\t" "lxvd2x 44, %8, %2 \n\t" "lxvd2x 45, %9, %2 \n\t" "lxvd2x 46, %10, %2 \n\t" "lxvd2x 47, %11, %2 \n\t" "addi %2, %2, 128 \n\t" "addic. 
%1, %1, -32 \n\t" "ble 2f \n\t" ".p2align 5 \n" "1: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" "lxvd2x 32, 0, %2 \n\t" "lxvd2x 33, %5, %2 \n\t" "stxvd2x 34, %6, %3 \n\t" "stxvd2x 35, %7, %3 \n\t" "lxvd2x 34, %6, %2 \n\t" "lxvd2x 35, %7, %2 \n\t" "stxvd2x 36, %8, %3 \n\t" "stxvd2x 37, %9, %3 \n\t" "lxvd2x 36, %8, %2 \n\t" "lxvd2x 37, %9, %2 \n\t" "stxvd2x 38, %10, %3 \n\t" "stxvd2x 39, %11, %3 \n\t" "lxvd2x 38, %10, %2 \n\t" "lxvd2x 39, %11, %2 \n\t" "addi %3, %3, 128 \n\t" "addi %2, %2, 128 \n\t" "stxvd2x 40, 0, %3 \n\t" "stxvd2x 41, %5, %3 \n\t" "lxvd2x 40, 0, %2 \n\t" "lxvd2x 41, %5, %2 \n\t" "stxvd2x 42, %6, %3 \n\t" "stxvd2x 43, %7, %3 \n\t" "lxvd2x 42, %6, %2 \n\t" "lxvd2x 43, %7, %2 \n\t" "stxvd2x 44, %8, %3 \n\t" "stxvd2x 45, %9, %3 \n\t" "lxvd2x 44, %8, %2 \n\t" "lxvd2x 45, %9, %2 \n\t" "stxvd2x 46, %10, %3 \n\t" "stxvd2x 47, %11, %3 \n\t" "lxvd2x 46, %10, %2 \n\t" "lxvd2x 47, %11, %2 \n\t" "addi %3, %3, 128 \n\t" "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" "bgt 1b \n" "2: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" "stxvd2x 34, %6, %3 \n\t" "stxvd2x 35, %7, %3 \n\t" "stxvd2x 36, %8, %3 \n\t" "stxvd2x 37, %9, %3 \n\t" "stxvd2x 38, %10, %3 \n\t" "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" "stxvd2x 40, 0, %3 \n\t" "stxvd2x 41, %5, %3 \n\t" "stxvd2x 42, %6, %3 \n\t" "stxvd2x 43, %7, %3 \n\t" "stxvd2x 44, %8, %3 \n\t" "stxvd2x 45, %9, %3 \n\t" "stxvd2x 46, %10, %3 \n\t" "stxvd2x 47, %11, %3 \n" "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : "=m" (*y), "+r" (n), // 1 "+b" (x), // 2 "+b" (y) // 3 : "m" (*x), "b" (16), // 5 "b" (32), // 6 "b" (48), // 7 "b" (64), // 8 "b" (80), // 9 "b" (96), // 10 "b" (112) // 11 : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" ); } OpenBLAS-0.2.20/kernel/power/ddot.c000066400000000000000000000064751313527062700166720ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/03/20 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #if defined(POWER8) #include "ddot_microk_power8.c" #endif #ifndef HAVE_KERNEL_8 static FLOAT ddot_kernel_8 (BLASLONG n, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; FLOAT dot = 0.0; while(i < n) { dot += y[i] * x[i] + y[i+1] * x[i+1] + y[i+2] * x[i+2] + y[i+3] * x[i+3] + y[i+4] * x[i+4] + y[i+5] * x[i+5] + y[i+6] * x[i+6] + y[i+7] * x[i+7] ; i+=8 ; } return dot; } #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix=0,iy=0; FLOAT dot = 0.0 ; if ( n <= 0 ) return(dot); if ( (inc_x == 1) && (inc_y == 1) ) { BLASLONG n1 = n & -16; if ( n1 ) dot = ddot_kernel_8(n1, x, y); i = n1; while(i < n) { dot += y[i] * x[i] ; i++ ; } return(dot); } FLOAT temp1 = 0.0; FLOAT temp2 = 0.0; BLASLONG n1 = n & -4; while(i < n1) { FLOAT m1 = y[iy] * x[ix] ; FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; ix += inc_x*4 ; iy += inc_y*4 ; temp1 += m1+m3; temp2 += m2+m4; i+=4 ; } while(i < n) { temp1 += y[iy] * x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } dot = temp1 + temp2; return(dot); } OpenBLAS-0.2.20/kernel/power/ddot_microk_power8.c000066400000000000000000000125511313527062700215320ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/03/20 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_8 1 static double ddot_kernel_8 (long n, double *x, double *y) { double dot; __vector double t0; __vector double t1; __vector double t2; __vector double t3; __asm__ ( "dcbt 0, %2 \n\t" "dcbt 0, %3 \n\t" "xxlxor 32, 32, 32 \n\t" "xxlxor 33, 33, 33 \n\t" "xxlxor 34, 34, 34 \n\t" "xxlxor 35, 35, 35 \n\t" "xxlxor 36, 36, 36 \n\t" "xxlxor 37, 37, 37 \n\t" "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" "lxvd2x 40, 0, %2 \n\t" "lxvd2x 48, 0, %3 \n\t" "lxvd2x 41, %10, %2 \n\t" "lxvd2x 49, %10, %3 \n\t" "lxvd2x 42, %11, %2 \n\t" "lxvd2x 50, %11, %3 \n\t" "lxvd2x 43, %12, %2 \n\t" "lxvd2x 51, %12, %3 \n\t" "lxvd2x 44, %13, %2 \n\t" "lxvd2x %x4, %13, %3 \n\t" "lxvd2x 45, %14, %2 \n\t" "lxvd2x %x5, %14, %3 \n\t" "lxvd2x 46, %15, %2 \n\t" "lxvd2x %x6, %15, %3 \n\t" "lxvd2x 47, %16, %2 \n\t" "lxvd2x %x7, %16, %3 \n\t" "addi %2, %2, 128 \n\t" "addi %3, %3, 128 \n\t" "addic. %1, %1, -16 \n\t" "ble 2f \n\t" ".p2align 5 \n" "1: \n\t" "xvmaddadp 32, 40, 48 \n\t" "lxvd2x 40, 0, %2 \n\t" "lxvd2x 48, 0, %3 \n\t" "xvmaddadp 33, 41, 49 \n\t" "lxvd2x 41, %10, %2 \n\t" "lxvd2x 49, %10, %3 \n\t" "xvmaddadp 34, 42, 50 \n\t" "lxvd2x 42, %11, %2 \n\t" "lxvd2x 50, %11, %3 \n\t" "xvmaddadp 35, 43, 51 \n\t" "lxvd2x 43, %12, %2 \n\t" "lxvd2x 51, %12, %3 \n\t" "xvmaddadp 36, 44, %x4 \n\t" "lxvd2x 44, %13, %2 \n\t" "lxvd2x %x4, %13, %3 \n\t" "xvmaddadp 37, 45, %x5 \n\t" "lxvd2x 45, %14, %2 \n\t" "lxvd2x %x5, %14, %3 \n\t" "xvmaddadp 38, 46, %x6 \n\t" "lxvd2x 46, %15, %2 \n\t" "lxvd2x %x6, %15, %3 \n\t" "xvmaddadp 39, 47, %x7 \n\t" "lxvd2x 47, %16, %2 \n\t" "lxvd2x %x7, %16, %3 \n\t" "addi %2, %2, 128 \n\t" "addi %3, %3, 128 \n\t" "addic. 
%1, %1, -16 \n\t" "bgt 1b \n" "2: \n\t" "xvmaddadp 32, 40, 48 \n\t" "xvmaddadp 33, 41, 49 \n\t" "xvmaddadp 34, 42, 50 \n\t" "xvmaddadp 35, 43, 51 \n\t" "xvmaddadp 36, 44, %x4 \n\t" "xvmaddadp 37, 45, %x5 \n\t" "xvmaddadp 38, 46, %x6 \n\t" "xvmaddadp 39, 47, %x7 \n\t" "xvadddp 32, 32, 33 \n\t" "xvadddp 34, 34, 35 \n\t" "xvadddp 36, 36, 37 \n\t" "xvadddp 38, 38, 39 \n\t" "xvadddp 32, 32, 34 \n\t" "xvadddp 36, 36, 38 \n\t" "xvadddp 32, 32, 36 \n\t" "xxswapd 33, 32 \n\t" "xsadddp %x0, 32, 33 \n" "#dot=%0 n=%1 x=%8=%2 y=%9=%3 o16=%10 o32=%11 o48=%12 o64=%13 o80=%14 o96=%15 o122=%16\n" "#t0=%x4 t1=%x5 t2=%x6 t3=%x7" : "=d" (dot), // 0 "+r" (n), // 1 "+b" (x), // 2 "+b" (y), // 3 "=wa" (t0), // 4 "=wa" (t1), // 5 "=wa" (t2), // 6 "=wa" (t3) // 7 : "m" (*x), "m" (*y), "b" (16), // 10 "b" (32), // 11 "b" (48), // 12 "b" (64), // 13 "b" (80), // 14 "b" (96), // 15 "b" (112) // 16 : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48","vs49","vs50","vs51" ); return dot; } OpenBLAS-0.2.20/kernel/power/def_vsx.h000066400000000000000000000017541313527062700173760ustar00rootroot00000000000000#define vs0 0 #define vs1 1 #define vs2 2 #define vs3 3 #define vs4 4 #define vs5 5 #define vs6 6 #define vs7 7 #define vs8 8 #define vs9 9 #define vs10 10 #define vs11 11 #define vs12 12 #define vs13 13 #define vs14 14 #define vs15 15 #define vs16 16 #define vs17 17 #define vs18 18 #define vs19 19 #define vs20 20 #define vs21 21 #define vs22 22 #define vs23 23 #define vs24 24 #define vs25 25 #define vs26 26 #define vs27 27 #define vs28 28 #define vs29 29 #define vs30 30 #define vs31 31 #define vs32 32 #define vs33 33 #define vs34 34 #define vs35 35 #define vs36 36 #define vs37 37 #define vs38 38 #define vs39 39 #define vs40 40 #define vs41 41 #define vs42 42 #define vs43 43 #define vs44 44 #define vs45 45 #define vs46 46 #define vs47 47 #define vs48 48 #define vs49 49 #define vs50 50 #define vs51 51 #define vs52 52 #define vs53 53 #define vs54 54 #define vs55 55 #define vs56 56 #define vs57 57 #define vs58 58 #define vs59 59 #define vs60 60 #define vs61 61 #define vs62 62 #define vs63 63 OpenBLAS-0.2.20/kernel/power/dgemm_kernel_16x4_power8.S000066400000000000000000000214451313527062700224310ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/04/21 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "def_vsx.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA_SP 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA_SP 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define alpha_r vs18 #define o0 0 #define T4 r12 #define T3 r11 #define o40 r12 #define o56 r11 #define o112 r14 #define o8 r15 #define o24 r16 #define o64 r17 #define L r18 #define T1 r19 #define o80 r20 #define o96 r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define CO r26 #define o16 r27 #define o32 r28 #define o48 r29 #define PRE r30 #define T2 r31 #include "dgemm_macros_16x4_power8.S" #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) std r14, 280(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) stw r18, 196(SP) stw r17, 200(SP) stw r16, 204(SP) stw r15, 208(SP) stw r14, 212(SP) #endif stfd f1, ALPHA_SP stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #endif cmpwi cr0, M, 0 ble .L999_H1 cmpwi cr0, N, 0 ble .L999_H1 cmpwi cr0, K, 0 ble .L999_H1 #ifdef __64BIT__ addi T1, SP, 296 #else addi T1, SP, 224 #endif li PRE, 384 li o8 , 8 li o16, 16 li o24, 24 li o32, 32 li o48, 48 li o64, 64 li o80, 80 li o96, 96 li o112, 112 lxvdsx alpha_r, 0, T1 #include "dgemm_logic_16x4_power8.S" .L999: addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld 
r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) lwz r18, 196(SP) lwz r17, 200(SP) lwz r16, 204(SP) lwz r15, 208(SP) lwz r14, 212(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/dgemm_logic_16x4_power8.S000066400000000000000000000553601313527062700222510ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/05 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define MY_ALIGN .align 3 srawi. J, N, 2 ble LDGEMM_L4_END LDGEMM_L4_BEGIN: li T1, 128 li T2, 256 mr AO, A mr CO, C slwi T3, LDC , 2 add C, C, T3 dcbt A, T1 dcbt A, T2 srawi. I, M, 4 ble LDGEMM_L4x16_END MY_ALIGN LDGEMM_L4x16_BEGIN_FIRST: li L, -128 mr T1, CO add T2, T1, LDC add T3, T2, LDC add T4, T3, LDC and T1, T1, L and T2, T2, L and T3, T3, L and T4, T4, L dcbt T1, r0 dcbt T2, r0 dcbt T3, r0 dcbt T4, r0 mr BO, B srawi. L, K, 2 addi T1, T1, 128 addi T2, T2, 128 addi T3, T3, 128 addi T4, T4, 128 dcbt T1, r0 dcbt T2, r0 dcbt T3, r0 dcbt T4, r0 ble LDGEMM_L4x16_SUB0_FIRST cmpwi cr0, L, 1 ble LDGEMM_L4x16_SUB4_FIRST MY_ALIGN LDGEMM_L4x16_LOOP_START_FIRST: li T2, 512 li o40, 40 li o56, 56 dcbt AO, PRE dcbt BO, T2 LOAD4x16_1 dcbt AO, PRE KERNEL4x16_I1 dcbt AO, PRE addic. 
L, L, -2 KERNEL4x16_L2 dcbt AO, PRE KERNEL4x16_L1 dcbt AO, PRE dcbt BO, T2 KERNEL4x16_L2 ble LDGEMM_L4x16_LOOP_END_FIRST mtctr L MY_ALIGN LDGEMM_L4x16_LOOP_FIRST: dcbt AO, PRE KERNEL4x16_L1 dcbt AO, PRE KERNEL4x16_L2 dcbt AO, PRE KERNEL4x16_L1 dcbt AO, PRE dcbt BO, T2 KERNEL4x16_L2 bdnz LDGEMM_L4x16_LOOP_FIRST MY_ALIGN LDGEMM_L4x16_LOOP_END_FIRST: KERNEL4x16_L1 KERNEL4x16_L2 KERNEL4x16_1 KERNEL4x16_E2 b LDGEMM_L4x16_SUB1_FIRST LDGEMM_L4x16_SUB4_FIRST: KERNEL4x16_SUBI1 KERNEL4x16_SUB1 KERNEL4x16_SUB1 KERNEL4x16_SUB1 b LDGEMM_L4x16_SUB1_FIRST LDGEMM_L4x16_SUB0_FIRST: andi. L, K, 3 KERNEL4x16_SUBI1 addic. L, L, -1 ble LDGEMM_L4x16_SAVE_FIRST b LDGEMM_L4x16_SUB2_FIRST LDGEMM_L4x16_SUB1_FIRST: andi. L, K, 3 ble LDGEMM_L4x16_SAVE_FIRST LDGEMM_L4x16_SUB2_FIRST: KERNEL4x16_SUB1 addic. L, L, -1 bgt LDGEMM_L4x16_SUB2_FIRST MY_ALIGN LDGEMM_L4x16_SAVE_FIRST: SAVE4x16 addic. I, I, -1 ble LDGEMM_L4x16_END LDGEMM_L4x16_END_FIRST: MY_ALIGN LDGEMM_L4x16_BEGIN: li L, -128 mr T1, CO add T2, T1, LDC add T3, T2, LDC add T4, T3, LDC and T1, T1, L and T2, T2, L and T3, T3, L and T4, T4, L dcbt T1, r0 dcbt T2, r0 dcbt T3, r0 dcbt T4, r0 mr BO, B srawi. L, K, 1 addi T1, T1, 128 addi T2, T2, 128 addi T3, T3, 128 addi T4, T4, 128 dcbt T1, r0 dcbt T2, r0 dcbt T3, r0 dcbt T4, r0 ble- LDGEMM_L4x16_SUB0 cmpwi cr0, L, 1 ble- LDGEMM_L4x16_SUB4 MY_ALIGN LDGEMM_L4x16_LOOP_START: li o40, 40 li o56, 56 dcbt AO, PRE LOAD4x16_1 dcbt AO, PRE KERNEL4x16_I1 dcbt AO, PRE addic. L, L, -2 KERNEL4x16_L2 ble- LDGEMM_L4x16_LOOP_END mtctr L MY_ALIGN LDGEMM_L4x16_LOOP: dcbt AO, PRE KERNEL4x16_L1 dcbt AO, PRE KERNEL4x16_L2 bdnz+ LDGEMM_L4x16_LOOP MY_ALIGN LDGEMM_L4x16_LOOP_END: KERNEL4x16_1 KERNEL4x16_E2 b LDGEMM_L4x16_SUB1 MY_ALIGN LDGEMM_L4x16_SUB4: KERNEL4x16_SUBI1 KERNEL4x16_SUB1 b LDGEMM_L4x16_SUB1 MY_ALIGN LDGEMM_L4x16_SUB0: andi. L, K, 1 KERNEL4x16_SUBI1 addic. L, L, -1 ble LDGEMM_L4x16_SAVE b LDGEMM_L4x16_SUB2 MY_ALIGN LDGEMM_L4x16_SUB1: andi. L, K, 1 ble LDGEMM_L4x16_SAVE MY_ALIGN LDGEMM_L4x16_SUB2: KERNEL4x16_SUB1 addic. L, L, -1 bgt LDGEMM_L4x16_SUB2 MY_ALIGN LDGEMM_L4x16_SAVE: SAVE4x16 addic. I, I, -1 bgt+ LDGEMM_L4x16_BEGIN LDGEMM_L4x16_END: LDGEMM_L4x8_BEGIN: andi. T2, M, 15 ble LDGEMM_L4x1_END andi. T1, M, 8 ble LDGEMM_L4x8_END mr BO, B srawi. L, K, 3 ble LDGEMM_L4x8_SUB0 cmpwi cr0, L, 1 ble LDGEMM_L4x8_SUB4 LDGEMM_L4x8_LOOP_START: dcbt AO, PRE LOAD4x8_1 KERNEL4x8_I1 dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 dcbt AO, PRE KERNEL4x8_2 addic. L, L, -2 ble LDGEMM_L4x8_LOOP_END MY_ALIGN LDGEMM_L4x8_LOOP: KERNEL4x8_1 dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 dcbt AO, PRE KERNEL4x8_2 addic. L, L, -1 bgt LDGEMM_L4x8_LOOP LDGEMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_E2 b LDGEMM_L4x8_SUB1 LDGEMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 b LDGEMM_L4x8_SUB1 LDGEMM_L4x8_SUB0: andi. L, K, 7 KERNEL4x8_SUBI1 addic. L, L, -1 ble LDGEMM_L4x8_SAVE b LDGEMM_L4x8_SUB2 LDGEMM_L4x8_SUB1: andi. L, K, 7 ble LDGEMM_L4x8_SAVE LDGEMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. L, L, -1 bgt LDGEMM_L4x8_SUB2 LDGEMM_L4x8_SAVE: SAVE4x8 LDGEMM_L4x8_END: LDGEMM_L4x4_BEGIN: andi. T1, M, 4 ble LDGEMM_L4x4_END mr BO, B srawi. 
L, K, 3 ble LDGEMM_L4x4_SUB0 cmpwi cr0, L, 1 ble LDGEMM_L4x4_SUB4 LDGEMM_L4x4_LOOP_START: dcbt AO, PRE LOAD4x4_1 KERNEL4x4_I1 KERNEL4x4_2 KERNEL4x4_1 dcbt AO, PRE KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 dcbt AO, PRE KERNEL4x4_2 addic. L, L, -2 ble LDGEMM_L4x4_LOOP_END MY_ALIGN LDGEMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 dcbt AO, PRE KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 dcbt AO, PRE KERNEL4x4_2 addic. L, L, -1 bgt LDGEMM_L4x4_LOOP LDGEMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_E2 b LDGEMM_L4x4_SUB1 LDGEMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 b LDGEMM_L4x4_SUB1 LDGEMM_L4x4_SUB0: andi. L, K, 7 KERNEL4x4_SUBI1 addic. L, L, -1 ble LDGEMM_L4x4_SAVE b LDGEMM_L4x4_SUB2 LDGEMM_L4x4_SUB1: andi. L, K, 7 ble LDGEMM_L4x4_SAVE LDGEMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. L, L, -1 bgt LDGEMM_L4x4_SUB2 LDGEMM_L4x4_SAVE: SAVE4x4 LDGEMM_L4x4_END: LDGEMM_L4x2_BEGIN: andi. T1, M, 2 ble LDGEMM_L4x2_END mr BO, B srawi. L, K, 3 ble LDGEMM_L4x2_SUB0 cmpwi cr0, L, 1 ble LDGEMM_L4x2_SUB4 LDGEMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 addic. L, L, -2 ble LDGEMM_L4x2_LOOP_END MY_ALIGN LDGEMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 addic. L, L, -1 bgt LDGEMM_L4x2_LOOP LDGEMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_E2 b LDGEMM_L4x2_SUB1 LDGEMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 b LDGEMM_L4x2_SUB1 LDGEMM_L4x2_SUB0: andi. L, K, 7 KERNEL4x2_SUBI1 addic. L, L, -1 ble LDGEMM_L4x2_SAVE b LDGEMM_L4x2_SUB2 LDGEMM_L4x2_SUB1: andi. L, K, 7 ble LDGEMM_L4x2_SAVE LDGEMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. L, L, -1 bgt LDGEMM_L4x2_SUB2 LDGEMM_L4x2_SAVE: SAVE4x2 LDGEMM_L4x2_END: LDGEMM_L4x1_BEGIN: andi. T1, M, 1 ble LDGEMM_L4x1_END mr BO, B srawi. L, K, 3 ble LDGEMM_L4x1_SUB0 cmpwi cr0, L, 1 ble LDGEMM_L4x1_SUB4 LDGEMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 addic. L, L, -2 ble LDGEMM_L4x1_LOOP_END MY_ALIGN LDGEMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 addic. L, L, -1 bgt LDGEMM_L4x1_LOOP LDGEMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_E2 b LDGEMM_L4x1_SUB1 LDGEMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 b LDGEMM_L4x1_SUB1 LDGEMM_L4x1_SUB0: andi. L, K, 7 KERNEL4x1_SUBI1 addic. L, L, -1 ble LDGEMM_L4x1_SAVE b LDGEMM_L4x1_SUB2 LDGEMM_L4x1_SUB1: andi. L, K, 7 ble LDGEMM_L4x1_SAVE LDGEMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. L, L, -1 bgt LDGEMM_L4x1_SUB2 LDGEMM_L4x1_SAVE: SAVE4x1 LDGEMM_L4x1_END: slwi T1, K, 5 add B, B, T1 addic. J, J, -1 bgt LDGEMM_L4_BEGIN andi. T2, N, 3 ble .L999 LDGEMM_L4_END: b LDGEMM_L2_BEGIN .L999_H1: b .L999 LDGEMM_L2_BEGIN: andi. T1, N, 2 ble LDGEMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 srawi. I, M, 4 ble LDGEMM_L2x16_END LDGEMM_L2x16_BEGIN: mr BO, B srawi. 
L, K, 3 ble LDGEMM_L2x16_SUB0 cmpwi cr0, L, 1 ble LDGEMM_L2x16_SUB4 LDGEMM_L2x16_LOOP_START: dcbt AO, PRE LOAD2x16_1 dcbt AO, PRE KERNEL2x16_I1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 addic. L, L, -2 ble LDGEMM_L2x16_LOOP_END MY_ALIGN LDGEMM_L2x16_LOOP: dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 addic. L, L, -1 bgt LDGEMM_L2x16_LOOP LDGEMM_L2x16_LOOP_END: dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 KERNEL2x16_E2 b LDGEMM_L2x16_SUB1 LDGEMM_L2x16_SUB4: dcbt AO, PRE KERNEL2x16_SUBI1 dcbt AO, PRE KERNEL2x16_SUB1 dcbt AO, PRE KERNEL2x16_SUB1 dcbt AO, PRE KERNEL2x16_SUB1 KERNEL2x16_SUB1 KERNEL2x16_SUB1 KERNEL2x16_SUB1 KERNEL2x16_SUB1 b LDGEMM_L2x16_SUB1 LDGEMM_L2x16_SUB0: andi. L, K, 7 KERNEL2x16_SUBI1 addic. L, L, -1 ble LDGEMM_L2x16_SAVE b LDGEMM_L2x16_SUB2 LDGEMM_L2x16_SUB1: andi. L, K, 7 ble LDGEMM_L2x16_SAVE LDGEMM_L2x16_SUB2: KERNEL2x16_SUB1 addic. L, L, -1 bgt LDGEMM_L2x16_SUB2 LDGEMM_L2x16_SAVE: SAVE2x16 addic. I, I, -1 bgt LDGEMM_L2x16_BEGIN LDGEMM_L2x16_END: LDGEMM_L2x8_BEGIN: andi. T2, M, 15 ble LDGEMM_L2x1_END andi. T1, M, 8 ble LDGEMM_L2x8_END mr BO, B srawi. L, K, 3 ble LDGEMM_L2x8_SUB0 cmpwi cr0, L, 1 ble LDGEMM_L2x8_SUB4 LDGEMM_L2x8_LOOP_START: dcbt AO, PRE LOAD2x8_1 KERNEL2x8_I1 dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 addic. L, L, -2 ble LDGEMM_L2x8_LOOP_END MY_ALIGN LDGEMM_L2x8_LOOP: KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 addic. L, L, -1 bgt LDGEMM_L2x8_LOOP LDGEMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_E2 b LDGEMM_L2x8_SUB1 LDGEMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 b LDGEMM_L2x8_SUB1 LDGEMM_L2x8_SUB0: andi. L, K, 7 KERNEL2x8_SUBI1 addic. L, L, -1 ble LDGEMM_L2x8_SAVE b LDGEMM_L2x8_SUB2 LDGEMM_L2x8_SUB1: andi. L, K, 7 ble LDGEMM_L2x8_SAVE LDGEMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 bgt LDGEMM_L2x8_SUB2 LDGEMM_L2x8_SAVE: SAVE2x8 LDGEMM_L2x8_END: LDGEMM_L2x4_BEGIN: andi. T1, M, 4 ble LDGEMM_L2x4_END mr BO, B srawi. L, K, 3 ble LDGEMM_L2x4_SUB0 cmpwi cr0, L, 1 ble LDGEMM_L2x4_SUB4 LDGEMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 addic. L, L, -2 ble LDGEMM_L2x4_LOOP_END MY_ALIGN LDGEMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 addic. L, L, -1 bgt LDGEMM_L2x4_LOOP LDGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_E2 b LDGEMM_L2x4_SUB1 LDGEMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 b LDGEMM_L2x4_SUB1 LDGEMM_L2x4_SUB0: andi. L, K, 7 KERNEL2x4_SUBI1 addic. L, L, -1 ble LDGEMM_L2x4_SAVE b LDGEMM_L2x4_SUB2 LDGEMM_L2x4_SUB1: andi. 
L, K, 7 ble LDGEMM_L2x4_SAVE LDGEMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 bgt LDGEMM_L2x4_SUB2 LDGEMM_L2x4_SAVE: SAVE2x4 LDGEMM_L2x4_END: LDGEMM_L2x2_BEGIN: andi. T1, M, 2 ble LDGEMM_L2x2_END mr BO, B srawi. L, K, 3 ble LDGEMM_L2x2_SUB0 cmpwi cr0, L, 1 ble LDGEMM_L2x2_SUB4 LDGEMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 addic. L, L, -2 ble LDGEMM_L2x2_LOOP_END MY_ALIGN LDGEMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 addic. L, L, -1 bgt LDGEMM_L2x2_LOOP LDGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_E2 b LDGEMM_L2x2_SUB1 LDGEMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 b LDGEMM_L2x2_SUB1 LDGEMM_L2x2_SUB0: andi. L, K, 7 KERNEL2x2_SUBI1 addic. L, L, -1 ble LDGEMM_L2x2_SAVE b LDGEMM_L2x2_SUB2 LDGEMM_L2x2_SUB1: andi. L, K, 7 ble LDGEMM_L2x2_SAVE LDGEMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 bgt LDGEMM_L2x2_SUB2 LDGEMM_L2x2_SAVE: SAVE2x2 LDGEMM_L2x2_END: LDGEMM_L2x1_BEGIN: andi. T1, M, 1 ble LDGEMM_L2x1_END mr BO, B srawi. L, K, 3 ble LDGEMM_L2x1_SUB0 cmpwi cr0, L, 1 ble LDGEMM_L2x1_SUB4 LDGEMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 addic. L, L, -2 ble LDGEMM_L2x1_LOOP_END MY_ALIGN LDGEMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 addic. L, L, -1 bgt LDGEMM_L2x1_LOOP LDGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_E2 b LDGEMM_L2x1_SUB1 LDGEMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 b LDGEMM_L2x1_SUB1 LDGEMM_L2x1_SUB0: andi. L, K, 7 KERNEL2x1_SUBI1 addic. L, L, -1 ble LDGEMM_L2x1_SAVE b LDGEMM_L2x1_SUB2 LDGEMM_L2x1_SUB1: andi. L, K, 7 ble LDGEMM_L2x1_SAVE LDGEMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 bgt LDGEMM_L2x1_SUB2 LDGEMM_L2x1_SAVE: SAVE2x1 LDGEMM_L2x1_END: slwi T1, K, 4 add B, B, T1 LDGEMM_L2_END: LDGEMM_L1_BEGIN: andi. T1, N, 1 ble LDGEMM_L1_END mr CO, C mr AO, A srawi. I, M, 4 ble LDGEMM_L1x16_END LDGEMM_L1x16_BEGIN: mr BO, B srawi. L, K, 3 ble LDGEMM_L1x16_SUB0 cmpwi cr0, L, 1 ble LDGEMM_L1x16_SUB4 LDGEMM_L1x16_LOOP_START: dcbt AO, PRE LOAD1x16_1 dcbt AO, PRE KERNEL1x16_I1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 addic. L, L, -2 ble LDGEMM_L1x16_LOOP_END MY_ALIGN LDGEMM_L1x16_LOOP: dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 addic. L, L, -1 bgt LDGEMM_L1x16_LOOP LDGEMM_L1x16_LOOP_END: dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 KERNEL1x16_E2 b LDGEMM_L1x16_SUB1 LDGEMM_L1x16_SUB4: dcbt AO, PRE KERNEL1x16_SUBI1 dcbt AO, PRE KERNEL1x16_SUB1 dcbt AO, PRE KERNEL1x16_SUB1 dcbt AO, PRE KERNEL1x16_SUB1 KERNEL1x16_SUB1 KERNEL1x16_SUB1 KERNEL1x16_SUB1 KERNEL1x16_SUB1 b LDGEMM_L1x16_SUB1 LDGEMM_L1x16_SUB0: andi. L, K, 7 KERNEL1x16_SUBI1 addic. 
L, L, -1 ble LDGEMM_L1x16_SAVE b LDGEMM_L1x16_SUB2 LDGEMM_L1x16_SUB1: andi. L, K, 7 ble LDGEMM_L1x16_SAVE LDGEMM_L1x16_SUB2: KERNEL1x16_SUB1 addic. L, L, -1 bgt LDGEMM_L1x16_SUB2 LDGEMM_L1x16_SAVE: SAVE1x16 addic. I, I, -1 bgt LDGEMM_L1x16_BEGIN LDGEMM_L1x16_END: LDGEMM_L1x8_BEGIN: andi. T2, M, 15 ble LDGEMM_L1x1_END andi. T1, M, 8 ble LDGEMM_L1x8_END mr BO, B srawi. L, K, 3 ble LDGEMM_L1x8_SUB0 cmpwi cr0, L, 1 ble LDGEMM_L1x8_SUB4 LDGEMM_L1x8_LOOP_START: dcbt AO, PRE LOAD1x8_1 KERNEL1x8_I1 dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 addic. L, L, -2 ble LDGEMM_L1x8_LOOP_END MY_ALIGN LDGEMM_L1x8_LOOP: KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 addic. L, L, -1 bgt LDGEMM_L1x8_LOOP LDGEMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_E2 b LDGEMM_L1x8_SUB1 LDGEMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 b LDGEMM_L1x8_SUB1 LDGEMM_L1x8_SUB0: andi. L, K, 7 KERNEL1x8_SUBI1 addic. L, L, -1 ble LDGEMM_L1x8_SAVE b LDGEMM_L1x8_SUB2 LDGEMM_L1x8_SUB1: andi. L, K, 7 ble LDGEMM_L1x8_SAVE LDGEMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 bgt LDGEMM_L1x8_SUB2 LDGEMM_L1x8_SAVE: SAVE1x8 LDGEMM_L1x8_END: LDGEMM_L1x4_BEGIN: andi. T1, M, 4 ble LDGEMM_L1x4_END mr BO, B srawi. L, K, 3 ble LDGEMM_L1x4_SUB0 cmpwi cr0, L, 1 ble LDGEMM_L1x4_SUB4 LDGEMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 addic. L, L, -2 ble LDGEMM_L1x4_LOOP_END MY_ALIGN LDGEMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 addic. L, L, -1 bgt LDGEMM_L1x4_LOOP LDGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_E2 b LDGEMM_L1x4_SUB1 LDGEMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 b LDGEMM_L1x4_SUB1 LDGEMM_L1x4_SUB0: andi. L, K, 7 KERNEL1x4_SUBI1 addic. L, L, -1 ble LDGEMM_L1x4_SAVE b LDGEMM_L1x4_SUB2 LDGEMM_L1x4_SUB1: andi. L, K, 7 ble LDGEMM_L1x4_SAVE LDGEMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 bgt LDGEMM_L1x4_SUB2 LDGEMM_L1x4_SAVE: SAVE1x4 LDGEMM_L1x4_END: LDGEMM_L1x2_BEGIN: andi. T1, M, 2 ble LDGEMM_L1x2_END mr BO, B srawi. L, K, 3 ble LDGEMM_L1x2_SUB0 cmpwi cr0, L, 1 ble LDGEMM_L1x2_SUB4 LDGEMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 addic. L, L, -2 ble LDGEMM_L1x2_LOOP_END MY_ALIGN LDGEMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 addic. L, L, -1 bgt LDGEMM_L1x2_LOOP LDGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_E2 b LDGEMM_L1x2_SUB1 LDGEMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 b LDGEMM_L1x2_SUB1 LDGEMM_L1x2_SUB0: andi. L, K, 7 KERNEL1x2_SUBI1 addic. L, L, -1 ble LDGEMM_L1x2_SAVE b LDGEMM_L1x2_SUB2 LDGEMM_L1x2_SUB1: andi. L, K, 7 ble LDGEMM_L1x2_SAVE LDGEMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 bgt LDGEMM_L1x2_SUB2 LDGEMM_L1x2_SAVE: SAVE1x2 LDGEMM_L1x2_END: LDGEMM_L1x1_BEGIN: andi. 
T1, M, 1 ble LDGEMM_L1x1_END mr BO, B srawi. L, K, 3 ble LDGEMM_L1x1_SUB0 cmpwi cr0, L, 1 ble LDGEMM_L1x1_SUB4 LDGEMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 addic. L, L, -2 ble LDGEMM_L1x1_LOOP_END MY_ALIGN LDGEMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 addic. L, L, -1 bgt LDGEMM_L1x1_LOOP LDGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_E2 b LDGEMM_L1x1_SUB1 LDGEMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 b LDGEMM_L1x1_SUB1 LDGEMM_L1x1_SUB0: andi. L, K, 7 KERNEL1x1_SUBI1 addic. L, L, -1 ble LDGEMM_L1x1_SAVE b LDGEMM_L1x1_SUB2 LDGEMM_L1x1_SUB1: andi. L, K, 7 ble LDGEMM_L1x1_SAVE LDGEMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 bgt LDGEMM_L1x1_SUB2 LDGEMM_L1x1_SAVE: SAVE1x1 LDGEMM_L1x1_END: LDGEMM_L1_END: OpenBLAS-0.2.20/kernel/power/dgemm_macros_16x4_power8.S000066400000000000000000001600441313527062700224340ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/03/05 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /********************************************************************* * Macros for N=4, M=16 * *********************************************************************/ .macro LOAD4x16_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvd2x vs4, o64, AO lxvd2x vs5, o80, AO lxvd2x vs6, o96, AO lxvd2x vs7, o112, AO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 128 addi BO, BO, 32 .endm .macro KERNEL4x16_I1 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 lxvd2x vs8, o0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO xvmuldp vs36, vs4, vs24 xvmuldp vs37, vs5, vs24 xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 xvmuldp vs44, vs4, vs25 xvmuldp vs45, vs5, vs25 xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs49, vs1, vs26 xvmuldp vs50, vs2, vs26 xvmuldp vs51, vs3, vs26 lxvd2x vs12, o64, AO lxvd2x vs13, o80, AO xvmuldp vs52, vs4, vs26 xvmuldp vs53, vs5, vs26 xvmuldp vs54, vs6, vs26 xvmuldp vs55, vs7, vs26 lxvd2x vs14, o96, AO lxvd2x vs15, o112, AO xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 xvmuldp vs58, vs2, vs27 xvmuldp vs59, vs3, vs27 lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO xvmuldp vs60, vs4, vs27 xvmuldp vs61, vs5, vs27 xvmuldp vs62, vs6, vs27 xvmuldp vs63, vs7, vs27 addi AO, AO, 128 .endm .macro KERNEL4x16_1 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 lxvd2x vs8, o0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 xvmaddadp vs44, vs4, vs25 xvmaddadp vs45, vs5, vs25 xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 xvmaddadp vs50, vs2, vs26 xvmaddadp vs51, vs3, vs26 lxvd2x vs12, o64, AO lxvd2x vs13, o80, AO xvmaddadp vs52, vs4, vs26 xvmaddadp vs53, vs5, vs26 xvmaddadp vs54, vs6, vs26 xvmaddadp vs55, vs7, vs26 lxvd2x vs14, o96, AO lxvd2x vs15, o112, AO xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 xvmaddadp vs58, vs2, vs27 xvmaddadp vs59, vs3, vs27 lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO xvmaddadp vs60, vs4, vs27 xvmaddadp vs61, vs5, vs27 xvmaddadp vs62, vs6, vs27 xvmaddadp vs63, vs7, vs27 addi AO, AO, 128 addi BO, BO, 32 .endm .macro KERNEL4x16_2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO xvmaddadp vs36, vs12, vs28 xvmaddadp vs37, vs13, vs28 xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO xvmaddadp vs44, vs12, vs29 xvmaddadp vs45, vs13, vs29 xvmaddadp vs46, 
vs14, vs29 xvmaddadp vs47, vs15, vs29 xvmaddadp vs48, vs8, vs30 xvmaddadp vs49, vs9, vs30 xvmaddadp vs50, vs10, vs30 xvmaddadp vs51, vs11, vs30 lxvd2x vs4, o64, AO lxvd2x vs5, o80, AO xvmaddadp vs52, vs12, vs30 xvmaddadp vs53, vs13, vs30 xvmaddadp vs54, vs14, vs30 xvmaddadp vs55, vs15, vs30 lxvd2x vs6, o96, AO lxvd2x vs7, o112, AO xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 xvmaddadp vs58, vs10, vs31 xvmaddadp vs59, vs11, vs31 lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO xvmaddadp vs60, vs12, vs31 xvmaddadp vs61, vs13, vs31 xvmaddadp vs62, vs14, vs31 xvmaddadp vs63, vs15, vs31 addi AO, AO, 128 addi BO, BO, 32 .endm .macro KERNEL4x16_L1 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 lxvd2x vs8, o0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 xvmaddadp vs44, vs4, vs25 xvmaddadp vs45, vs5, vs25 xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 xvmaddadp vs50, vs2, vs26 xvmaddadp vs51, vs3, vs26 lxvd2x vs12, o64, AO lxvd2x vs13, o80, AO xvmaddadp vs52, vs4, vs26 xvmaddadp vs53, vs5, vs26 xvmaddadp vs54, vs6, vs26 xvmaddadp vs55, vs7, vs26 lxvd2x vs14, o96, AO lxvd2x vs15, o112, AO xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 xvmaddadp vs58, vs2, vs27 xvmaddadp vs59, vs3, vs27 lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO xvmaddadp vs60, vs4, vs27 xvmaddadp vs61, vs5, vs27 xvmaddadp vs62, vs6, vs27 xvmaddadp vs63, vs7, vs27 addi AO, AO, 128 .endm .macro KERNEL4x16_L2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO xvmaddadp vs36, vs12, vs28 xvmaddadp vs37, vs13, vs28 xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 lxvdsx vs24, o32, BO lxvdsx vs25, o40, BO xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO xvmaddadp vs44, vs12, vs29 xvmaddadp vs45, vs13, vs29 xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 xvmaddadp vs48, vs8, vs30 xvmaddadp vs49, vs9, vs30 xvmaddadp vs50, vs10, vs30 xvmaddadp vs51, vs11, vs30 lxvd2x vs4, o64, AO lxvd2x vs5, o80, AO xvmaddadp vs52, vs12, vs30 xvmaddadp vs53, vs13, vs30 xvmaddadp vs54, vs14, vs30 xvmaddadp vs55, vs15, vs30 lxvd2x vs6, o96, AO lxvd2x vs7, o112, AO xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 xvmaddadp vs58, vs10, vs31 xvmaddadp vs59, vs11, vs31 lxvdsx vs26, o48, BO lxvdsx vs27, o56, BO xvmaddadp vs60, vs12, vs31 addi AO, AO, 128 xvmaddadp vs61, vs13, vs31 xvmaddadp vs62, vs14, vs31 addi BO, BO, 64 xvmaddadp vs63, vs15, vs31 .endm .macro KERNEL4x16_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 xvmaddadp vs36, vs12, vs28 xvmaddadp vs37, vs13, vs28 xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 xvmaddadp vs44, vs12, vs29 xvmaddadp vs45, vs13, vs29 xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 xvmaddadp vs48, vs8, vs30 xvmaddadp vs49, vs9, vs30 xvmaddadp vs50, vs10, vs30 xvmaddadp vs51, vs11, vs30 xvmaddadp vs52, vs12, vs30 xvmaddadp vs53, vs13, vs30 xvmaddadp vs54, vs14, vs30 
xvmaddadp vs55, vs15, vs30 xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 xvmaddadp vs58, vs10, vs31 xvmaddadp vs59, vs11, vs31 xvmaddadp vs60, vs12, vs31 xvmaddadp vs61, vs13, vs31 xvmaddadp vs62, vs14, vs31 xvmaddadp vs63, vs15, vs31 .endm .macro KERNEL4x16_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 64 addi BO, BO, 32 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 xvmuldp vs36, vs4, vs24 xvmuldp vs37, vs5, vs24 xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 xvmuldp vs44, vs4, vs25 xvmuldp vs45, vs5, vs25 xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs49, vs1, vs26 xvmuldp vs50, vs2, vs26 xvmuldp vs51, vs3, vs26 xvmuldp vs52, vs4, vs26 xvmuldp vs53, vs5, vs26 xvmuldp vs54, vs6, vs26 xvmuldp vs55, vs7, vs26 xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 xvmuldp vs58, vs2, vs27 xvmuldp vs59, vs3, vs27 xvmuldp vs60, vs4, vs27 xvmuldp vs61, vs5, vs27 xvmuldp vs62, vs6, vs27 xvmuldp vs63, vs7, vs27 .endm .macro KERNEL4x16_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO lxvd2x vs4, o64, AO lxvd2x vs5, o80, AO lxvd2x vs6, o96, AO lxvd2x vs7, o112, AO xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 addi BO, BO, 32 xvmaddadp vs44, vs4, vs25 xvmaddadp vs45, vs5, vs25 xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 xvmaddadp vs50, vs2, vs26 xvmaddadp vs51, vs3, vs26 addi AO, AO, 128 xvmaddadp vs52, vs4, vs26 xvmaddadp vs53, vs5, vs26 xvmaddadp vs54, vs6, vs26 xvmaddadp vs55, vs7, vs26 xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 xvmaddadp vs58, vs2, vs27 xvmaddadp vs59, vs3, vs27 xvmaddadp vs60, vs4, vs27 xvmaddadp vs61, vs5, vs27 xvmaddadp vs62, vs6, vs27 xvmaddadp vs63, vs7, vs27 .endm .macro SAVE4x16 add T2, CO, LDC lxvd2x vs0, 0, CO lxvd2x vs1, o16, CO lxvd2x vs2, o32, CO lxvd2x vs3, o48, CO lxvd2x vs4, o64, CO lxvd2x vs5, o80, CO add T3, T2, LDC lxvd2x vs6, o96, CO lxvd2x vs7, o112, CO lxvd2x vs8, 0, T2 lxvd2x vs9, o16, T2 lxvd2x vs10, o32, T2 lxvd2x vs11, o48, T2 lxvd2x vs12, o64, T2 lxvd2x vs13, o80, T2 add T4, T3, LDC lxvd2x vs14, o96, T2 lxvd2x vs15, o112, T2 lxvd2x vs24, 0, T3 lxvd2x vs25, o16, T3 lxvd2x vs26, o32, T3 lxvd2x vs27, o48, T3 lxvd2x vs28, o64, T3 lxvd2x vs29, o80, T3 lxvd2x vs30, o96, T3 lxvd2x vs31, o112, T3 xvmaddadp vs0, vs32, alpha_r lxvd2x vs32, 0, T4 xvmaddadp vs1, vs33, alpha_r lxvd2x vs33, o16, T4 xvmaddadp vs2, vs34, alpha_r lxvd2x vs34, o32, T4 xvmaddadp vs3, vs35, alpha_r lxvd2x vs35, o48, T4 xvmaddadp vs4, vs36, alpha_r lxvd2x vs36, o64, T4 xvmaddadp vs5, vs37, alpha_r lxvd2x vs37, o80, T4 xvmaddadp vs6, vs38, alpha_r lxvd2x vs38, o96, T4 xvmaddadp vs7, vs39, alpha_r lxvd2x vs39, o112, T4 xvmaddadp vs8, vs40, alpha_r xvmaddadp vs9, vs41, alpha_r xvmaddadp vs10, vs42, alpha_r xvmaddadp vs11, vs43, alpha_r xvmaddadp vs12, 
vs44, alpha_r xvmaddadp vs13, vs45, alpha_r xvmaddadp vs14, vs46, alpha_r xvmaddadp vs15, vs47, alpha_r xvmaddadp vs24, vs48, alpha_r xvmaddadp vs25, vs49, alpha_r xvmaddadp vs26, vs50, alpha_r xvmaddadp vs27, vs51, alpha_r xvmaddadp vs28, vs52, alpha_r xvmaddadp vs29, vs53, alpha_r xvmaddadp vs30, vs54, alpha_r xvmaddadp vs31, vs55, alpha_r stxvd2x vs0, 0, CO stxvd2x vs1, o16, CO stxvd2x vs2, o32, CO stxvd2x vs3, o48, CO stxvd2x vs4, o64, CO stxvd2x vs5, o80, CO stxvd2x vs6, o96, CO stxvd2x vs7, o112, CO xvmaddadp vs32, vs56, alpha_r xvmaddadp vs33, vs57, alpha_r xvmaddadp vs34, vs58, alpha_r xvmaddadp vs35, vs59, alpha_r xvmaddadp vs36, vs60, alpha_r xvmaddadp vs37, vs61, alpha_r xvmaddadp vs38, vs62, alpha_r xvmaddadp vs39, vs63, alpha_r addi CO, CO, 128 stxvd2x vs8, o0, T2 stxvd2x vs9, o16, T2 stxvd2x vs10, o32, T2 stxvd2x vs11, o48, T2 stxvd2x vs12, o64, T2 stxvd2x vs13, o80, T2 stxvd2x vs14, o96, T2 stxvd2x vs15, o112, T2 stxvd2x vs24, 0, T3 stxvd2x vs25, o16, T3 stxvd2x vs28, o64, T3 stxvd2x vs29, o80, T3 stxvd2x vs26, o32, T3 stxvd2x vs27, o48, T3 stxvd2x vs30, o96, T3 stxvd2x vs31, o112, T3 stxvd2x vs32, o0, T4 stxvd2x vs33, o16, T4 stxvd2x vs34, o32, T4 stxvd2x vs35, o48, T4 stxvd2x vs36, o64, T4 stxvd2x vs37, o80, T4 stxvd2x vs38, o96, T4 stxvd2x vs39, o112, T4 .endm /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ .macro LOAD4x8_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 64 addi BO, BO, 32 .endm .macro KERNEL4x8_I1 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs49, vs1, vs26 lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO xvmuldp vs50, vs2, vs26 xvmuldp vs51, vs3, vs26 lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 xvmuldp vs58, vs2, vs27 xvmuldp vs59, vs3, vs27 addi AO, AO, 64 addi BO, BO, 32 .endm .macro KERNEL4x8_1 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO xvmaddadp vs50, vs2, vs26 xvmaddadp vs51, vs3, vs26 lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 xvmaddadp vs58, vs2, vs27 xvmaddadp vs59, vs3, vs27 addi AO, AO, 64 addi BO, BO, 32 .endm .macro KERNEL4x8_2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO xvmaddadp vs48, vs8, vs30 xvmaddadp vs49, vs9, vs30 lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO xvmaddadp vs50, vs10, vs30 xvmaddadp vs51, vs11, vs30 lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 xvmaddadp vs58, vs10, vs31 xvmaddadp vs59, vs11, vs31 addi AO, AO, 64 addi BO, BO, 32 
.endm .macro KERNEL4x8_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 xvmaddadp vs48, vs8, vs30 xvmaddadp vs49, vs9, vs30 xvmaddadp vs50, vs10, vs30 xvmaddadp vs51, vs11, vs30 xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 xvmaddadp vs58, vs10, vs31 xvmaddadp vs59, vs11, vs31 .endm .macro KERNEL4x8_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 64 addi BO, BO, 32 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs49, vs1, vs26 xvmuldp vs50, vs2, vs26 xvmuldp vs51, vs3, vs26 xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 xvmuldp vs58, vs2, vs27 xvmuldp vs59, vs3, vs27 .endm .macro KERNEL4x8_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 64 addi BO, BO, 32 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 xvmaddadp vs50, vs2, vs26 xvmaddadp vs51, vs3, vs26 xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 xvmaddadp vs58, vs2, vs27 xvmaddadp vs59, vs3, vs27 .endm .macro SAVE4x8 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 lxvd2x vs2, o32, T1 lxvd2x vs3, o48, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r xvmaddadp vs2, vs34, alpha_r xvmaddadp vs3, vs35, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r xvmuldp vs2, vs34, alpha_r xvmuldp vs3, vs35, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 stxvd2x vs2, o32, T1 stxvd2x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 lxvd2x vs9, o16, T1 lxvd2x vs10, o32, T1 lxvd2x vs11, o48, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r xvmaddadp vs9, vs41, alpha_r xvmaddadp vs10, vs42, alpha_r xvmaddadp vs11, vs43, alpha_r #else xvmuldp vs8, vs40, alpha_r xvmuldp vs9, vs41, alpha_r xvmuldp vs10, vs42, alpha_r xvmuldp vs11, vs43, alpha_r #endif stxvd2x vs8, 0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 lxvd2x vs2, o32, T1 lxvd2x vs3, o48, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs48, alpha_r xvmaddadp vs1, vs49, alpha_r xvmaddadp vs2, vs50, alpha_r xvmaddadp vs3, vs51, alpha_r #else xvmuldp vs0, vs48, alpha_r xvmuldp vs1, vs49, alpha_r xvmuldp vs2, vs50, alpha_r xvmuldp vs3, vs51, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 stxvd2x vs2, o32, T1 stxvd2x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 lxvd2x vs9, o16, T1 lxvd2x vs10, o32, T1 lxvd2x vs11, o48, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs56, alpha_r xvmaddadp vs9, vs57, alpha_r xvmaddadp vs10, vs58, alpha_r xvmaddadp vs11, vs59, alpha_r #else xvmuldp vs8, vs56, alpha_r xvmuldp vs9, vs57, alpha_r xvmuldp vs10, vs58, alpha_r xvmuldp vs11, vs59, alpha_r #endif stxvd2x vs8, 0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 
addi CO, CO, 64 .endm /********************************************************************* * Macros for N=4, M=4 * *********************************************************************/ .macro LOAD4x4_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 32 addi BO, BO, 32 .endm .macro KERNEL4x4_I1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO addi AO, AO, 32 addi BO, BO, 32 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs49, vs1, vs26 xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 .endm .macro KERNEL4x4_1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO addi AO, AO, 32 addi BO, BO, 32 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 .endm .macro KERNEL4x4_2 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 32 addi BO, BO, 32 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs48, vs8, vs30 xvmaddadp vs49, vs9, vs30 xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 .endm .macro KERNEL4x4_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs48, vs8, vs30 xvmaddadp vs49, vs9, vs30 xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 .endm .macro KERNEL4x4_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 32 addi BO, BO, 32 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs49, vs1, vs26 xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 .endm .macro KERNEL4x4_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 32 addi BO, BO, 32 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 .endm .macro SAVE4x4 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 lxvd2x vs9, o16, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r xvmaddadp vs9, vs41, alpha_r #else xvmuldp vs8, vs40, alpha_r xvmuldp vs9, vs41, alpha_r #endif stxvd2x vs8, 0, T1 stxvd2x vs9, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs48, alpha_r xvmaddadp vs1, vs49, alpha_r #else xvmuldp vs0, vs48, alpha_r xvmuldp vs1, vs49, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 lxvd2x vs9, o16, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs56, alpha_r xvmaddadp vs9, vs57, alpha_r #else xvmuldp vs8, vs56, alpha_r xvmuldp vs9, vs57, alpha_r #endif stxvd2x vs8, 0, T1 stxvd2x vs9, o16, 
T1 addi CO, CO, 32 .endm /********************************************************************* * Macros for N=4, M=2 * *********************************************************************/ .macro LOAD4x2_1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 16 addi BO, BO, 32 .endm .macro KERNEL4x2_I1 lxvd2x vs8, 0, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO addi AO, AO, 16 addi BO, BO, 32 xvmuldp vs32, vs0, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs56, vs0, vs27 .endm .macro KERNEL4x2_1 lxvd2x vs8, 0, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO addi AO, AO, 16 addi BO, BO, 32 xvmaddadp vs32, vs0, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs56, vs0, vs27 .endm .macro KERNEL4x2_2 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 16 addi BO, BO, 32 xvmaddadp vs32, vs8, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs48, vs8, vs30 xvmaddadp vs56, vs8, vs31 .endm .macro KERNEL4x2_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs48, vs8, vs30 xvmaddadp vs56, vs8, vs31 .endm .macro KERNEL4x2_SUBI1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 16 addi BO, BO, 32 xvmuldp vs32, vs0, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs56, vs0, vs27 .endm .macro KERNEL4x2_SUB1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 16 addi BO, BO, 32 xvmaddadp vs32, vs0, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs56, vs0, vs27 .endm .macro SAVE4x2 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r #else xvmuldp vs0, vs32, alpha_r #endif stxvd2x vs0, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r #else xvmuldp vs8, vs40, alpha_r #endif stxvd2x vs8, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs48, alpha_r #else xvmuldp vs0, vs48, alpha_r #endif stxvd2x vs0, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs56, alpha_r #else xvmuldp vs8, vs56, alpha_r #endif stxvd2x vs8, 0, T1 addi CO, CO, 16 .endm /********************************************************************* * Macros for N=4, M=1 * *********************************************************************/ .macro LOAD4x1_1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO lxsdx vs26, o16, BO lxsdx vs27, o24, BO addi AO, AO, 8 addi BO, BO, 32 .endm .macro KERNEL4x1_I1 lxsdx vs8, 0, AO lxsdx vs28, 0, BO lxsdx vs29, o8, BO lxsdx vs30, o16, BO lxsdx vs31, o24, BO addi AO, AO, 8 addi BO, BO, 32 xsmuldp vs32, vs0, vs24 xsmuldp vs40, vs0, vs25 xsmuldp vs48, vs0, vs26 xsmuldp vs56, vs0, vs27 .endm .macro KERNEL4x1_1 lxsdx vs8, 0, AO lxsdx vs28, 0, BO lxsdx vs29, o8, BO lxsdx vs30, o16, BO lxsdx vs31, o24, BO addi AO, AO, 8 addi BO, BO, 32 xsmaddadp vs32, vs0, vs24 xsmaddadp vs40, vs0, vs25 xsmaddadp vs48, vs0, vs26 xsmaddadp vs56, vs0, vs27 .endm .macro KERNEL4x1_2 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO lxsdx vs26, o16, BO lxsdx vs27, o24, BO addi AO, AO, 8 addi BO, BO, 32 xsmaddadp vs32, vs8, vs28 xsmaddadp vs40, vs8, vs29 xsmaddadp vs48, vs8, vs30 xsmaddadp vs56, vs8, vs31 .endm .macro KERNEL4x1_E2 
xsmaddadp vs32, vs8, vs28 xsmaddadp vs40, vs8, vs29 xsmaddadp vs48, vs8, vs30 xsmaddadp vs56, vs8, vs31 .endm .macro KERNEL4x1_SUBI1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO lxsdx vs26, o16, BO lxsdx vs27, o24, BO addi AO, AO, 8 addi BO, BO, 32 xsmuldp vs32, vs0, vs24 xsmuldp vs40, vs0, vs25 xsmuldp vs48, vs0, vs26 xsmuldp vs56, vs0, vs27 .endm .macro KERNEL4x1_SUB1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO lxsdx vs26, o16, BO lxsdx vs27, o24, BO addi AO, AO, 8 addi BO, BO, 32 xsmaddadp vs32, vs0, vs24 xsmaddadp vs40, vs0, vs25 xsmaddadp vs48, vs0, vs26 xsmaddadp vs56, vs0, vs27 .endm .macro SAVE4x1 mr T1, CO #ifndef TRMMKERNEL lxsdx vs0, 0, T1 #endif #ifndef TRMMKERNEL xsmaddadp vs0, vs32, alpha_r #else xsmuldp vs0, vs32, alpha_r #endif stxsdx vs0, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsdx vs8, 0, T1 #endif #ifndef TRMMKERNEL xsmaddadp vs8, vs40, alpha_r #else xsmuldp vs8, vs40, alpha_r #endif stxsdx vs8, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsdx vs0, 0, T1 #endif #ifndef TRMMKERNEL xsmaddadp vs0, vs48, alpha_r #else xsmuldp vs0, vs48, alpha_r #endif stxsdx vs0, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsdx vs8, 0, T1 #endif #ifndef TRMMKERNEL xsmaddadp vs8, vs56, alpha_r #else xsmuldp vs8, vs56, alpha_r #endif stxsdx vs8, 0, T1 addi CO, CO, 8 .endm /********************************************************************* * Macros for N=2, M=16 * *********************************************************************/ .macro LOAD2x16_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 64 addi BO, BO, 16 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 .endm .macro KERNEL2x16_I1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO addi AO, AO, 64 addi BO, BO, 16 lxvd2x vs12, 0, AO lxvd2x vs13, o16, AO lxvd2x vs14, o32, AO lxvd2x vs15, o48, AO addi AO, AO, 64 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 xvmuldp vs36, vs4, vs24 xvmuldp vs37, vs5, vs24 xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 xvmuldp vs44, vs4, vs25 xvmuldp vs45, vs5, vs25 xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 .endm .macro KERNEL2x16_1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO addi AO, AO, 64 addi BO, BO, 16 lxvd2x vs12, 0, AO lxvd2x vs13, o16, AO lxvd2x vs14, o32, AO lxvd2x vs15, o48, AO addi AO, AO, 64 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 xvmaddadp vs44, vs4, vs25 xvmaddadp vs45, vs5, vs25 xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 .endm .macro KERNEL2x16_2 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 64 addi BO, BO, 16 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 xvmaddadp vs36, vs12, vs28 xvmaddadp vs37, vs13, vs28 xvmaddadp vs38, vs14, vs28 xvmaddadp 
vs39, vs15, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 xvmaddadp vs44, vs12, vs29 xvmaddadp vs45, vs13, vs29 xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 .endm .macro KERNEL2x16_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 xvmaddadp vs36, vs12, vs28 xvmaddadp vs37, vs13, vs28 xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 xvmaddadp vs44, vs12, vs29 xvmaddadp vs45, vs13, vs29 xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 .endm .macro KERNEL2x16_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 64 addi BO, BO, 16 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 xvmuldp vs36, vs4, vs24 xvmuldp vs37, vs5, vs24 xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 xvmuldp vs44, vs4, vs25 xvmuldp vs45, vs5, vs25 xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 .endm .macro KERNEL2x16_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 64 addi BO, BO, 16 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 xvmaddadp vs44, vs4, vs25 xvmaddadp vs45, vs5, vs25 xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 .endm .macro SAVE2x16 mr T1, CO addi T2, T1, 64 #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 lxvd2x vs2, o32, T1 lxvd2x vs3, o48, T1 lxvd2x vs4, 0, T2 lxvd2x vs5, o16, T2 lxvd2x vs6, o32, T2 lxvd2x vs7, o48, T2 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r xvmaddadp vs2, vs34, alpha_r xvmaddadp vs3, vs35, alpha_r xvmaddadp vs4, vs36, alpha_r xvmaddadp vs5, vs37, alpha_r xvmaddadp vs6, vs38, alpha_r xvmaddadp vs7, vs39, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r xvmuldp vs2, vs34, alpha_r xvmuldp vs3, vs35, alpha_r xvmuldp vs4, vs36, alpha_r xvmuldp vs5, vs37, alpha_r xvmuldp vs6, vs38, alpha_r xvmuldp vs7, vs39, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 stxvd2x vs2, o32, T1 stxvd2x vs3, o48, T1 stxvd2x vs4, 0, T2 stxvd2x vs5, o16, T2 stxvd2x vs6, o32, T2 stxvd2x vs7, o48, T2 add T1, T1, LDC add T2, T2, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 lxvd2x vs9, o16, T1 lxvd2x vs10, o32, T1 lxvd2x vs11, o48, T1 lxvd2x vs12, 0, T2 lxvd2x vs13, o16, T2 lxvd2x vs14, o32, T2 lxvd2x vs15, o48, T2 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r xvmaddadp vs9, vs41, alpha_r xvmaddadp vs10, vs42, alpha_r xvmaddadp vs11, vs43, alpha_r xvmaddadp vs12, vs44, alpha_r xvmaddadp vs13, vs45, alpha_r xvmaddadp vs14, vs46, alpha_r xvmaddadp vs15, vs47, alpha_r #else xvmuldp vs8, vs40, alpha_r xvmuldp vs9, vs41, alpha_r xvmuldp vs10, vs42, alpha_r xvmuldp vs11, vs43, alpha_r xvmuldp vs12, vs44, alpha_r xvmuldp vs13, vs45, alpha_r 
xvmuldp vs14, vs46, alpha_r xvmuldp vs15, vs47, alpha_r #endif stxvd2x vs8, 0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 stxvd2x vs12, 0, T2 stxvd2x vs13, o16, T2 stxvd2x vs14, o32, T2 stxvd2x vs15, o48, T2 addi CO, CO, 128 .endm /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ .macro LOAD2x8_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 64 addi BO, BO, 16 .endm .macro KERNEL2x8_I1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO addi AO, AO, 64 addi BO, BO, 16 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 .endm .macro KERNEL2x8_1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO addi AO, AO, 64 addi BO, BO, 16 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 .endm .macro KERNEL2x8_2 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 64 addi BO, BO, 16 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 .endm .macro KERNEL2x8_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 .endm .macro KERNEL2x8_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 64 addi BO, BO, 16 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 .endm .macro KERNEL2x8_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 64 addi BO, BO, 16 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 .endm .macro SAVE2x8 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 lxvd2x vs2, o32, T1 lxvd2x vs3, o48, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r xvmaddadp vs2, vs34, alpha_r xvmaddadp vs3, vs35, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r xvmuldp vs2, vs34, alpha_r xvmuldp vs3, vs35, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 stxvd2x vs2, o32, T1 stxvd2x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 lxvd2x vs9, o16, T1 lxvd2x vs10, o32, T1 lxvd2x vs11, o48, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r xvmaddadp vs9, vs41, alpha_r xvmaddadp vs10, vs42, alpha_r xvmaddadp vs11, vs43, alpha_r #else xvmuldp vs8, vs40, alpha_r xvmuldp vs9, vs41, alpha_r xvmuldp vs10, vs42, alpha_r xvmuldp vs11, 
vs43, alpha_r #endif stxvd2x vs8, 0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 addi CO, CO, 64 .endm /********************************************************************* * Macros for N=2, M=4 * *********************************************************************/ .macro LOAD2x4_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 32 addi BO, BO, 16 .endm .macro KERNEL2x4_I1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO addi AO, AO, 32 addi BO, BO, 16 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 .endm .macro KERNEL2x4_1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO addi AO, AO, 32 addi BO, BO, 16 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 .endm .macro KERNEL2x4_2 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 32 addi BO, BO, 16 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 .endm .macro KERNEL2x4_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 .endm .macro KERNEL2x4_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 32 addi BO, BO, 16 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 .endm .macro KERNEL2x4_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 32 addi BO, BO, 16 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 .endm .macro SAVE2x4 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 lxvd2x vs9, o16, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r xvmaddadp vs9, vs41, alpha_r #else xvmuldp vs8, vs40, alpha_r xvmuldp vs9, vs41, alpha_r #endif stxvd2x vs8, 0, T1 stxvd2x vs9, o16, T1 addi CO, CO, 32 .endm /********************************************************************* * Macros for N=2, M=2 * *********************************************************************/ .macro LOAD2x2_1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 16 addi BO, BO, 16 .endm .macro KERNEL2x2_I1 lxvd2x vs8, 0, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO addi AO, AO, 16 addi BO, BO, 16 xvmuldp vs32, vs0, vs24 xvmuldp vs40, vs0, vs25 .endm .macro KERNEL2x2_1 lxvd2x vs8, 0, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO addi AO, AO, 16 addi BO, BO, 16 xvmaddadp vs32, vs0, vs24 xvmaddadp vs40, vs0, vs25 .endm .macro KERNEL2x2_2 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 16 addi BO, BO, 16 xvmaddadp vs32, vs8, vs28 xvmaddadp vs40, vs8, vs29 .endm .macro KERNEL2x2_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs40, vs8, vs29 .endm .macro KERNEL2x2_SUBI1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 16 addi BO, BO, 16 xvmuldp vs32, vs0, vs24 xvmuldp vs40, vs0, vs25 .endm .macro KERNEL2x2_SUB1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 16 addi BO, BO, 16 xvmaddadp vs32, vs0, vs24 xvmaddadp vs40, vs0, vs25 .endm .macro SAVE2x2 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 #endif 
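/* Scale the accumulator by alpha: without TRMMKERNEL the result is added to the C values just loaded (C += alpha*AB), with TRMMKERNEL it overwrites C (C = alpha*AB). */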
#ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r #else xvmuldp vs0, vs32, alpha_r #endif stxvd2x vs0, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r #else xvmuldp vs8, vs40, alpha_r #endif stxvd2x vs8, 0, T1 addi CO, CO, 16 .endm /********************************************************************* * Macros for N=2, M=1 * *********************************************************************/ .macro LOAD2x1_1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO addi AO, AO, 8 addi BO, BO, 16 .endm .macro KERNEL2x1_I1 lxsdx vs8, 0, AO lxsdx vs28, 0, BO lxsdx vs29, o8, BO addi AO, AO, 8 addi BO, BO, 16 xsmuldp vs32, vs0, vs24 xsmuldp vs40, vs0, vs25 .endm .macro KERNEL2x1_1 lxsdx vs8, 0, AO lxsdx vs28, 0, BO lxsdx vs29, o8, BO addi AO, AO, 8 addi BO, BO, 16 xsmaddadp vs32, vs0, vs24 xsmaddadp vs40, vs0, vs25 .endm .macro KERNEL2x1_2 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO addi AO, AO, 8 addi BO, BO, 16 xsmaddadp vs32, vs8, vs28 xsmaddadp vs40, vs8, vs29 .endm .macro KERNEL2x1_E2 xsmaddadp vs32, vs8, vs28 xsmaddadp vs40, vs8, vs29 .endm .macro KERNEL2x1_SUBI1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO addi AO, AO, 8 addi BO, BO, 16 xsmuldp vs32, vs0, vs24 xsmuldp vs40, vs0, vs25 .endm .macro KERNEL2x1_SUB1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO addi AO, AO, 8 addi BO, BO, 16 xsmaddadp vs32, vs0, vs24 xsmaddadp vs40, vs0, vs25 .endm .macro SAVE2x1 mr T1, CO #ifndef TRMMKERNEL lxsdx vs0, 0, T1 #endif #ifndef TRMMKERNEL xsmaddadp vs0, vs32, alpha_r #else xsmuldp vs0, vs32, alpha_r #endif stxsdx vs0, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsdx vs8, 0, T1 #endif #ifndef TRMMKERNEL xsmaddadp vs8, vs40, alpha_r #else xsmuldp vs8, vs40, alpha_r #endif stxsdx vs8, 0, T1 addi CO, CO, 8 .endm /********************************************************************* * Macros for N=1, M=16 * *********************************************************************/ .macro LOAD1x16_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO addi AO, AO, 64 addi BO, BO, 8 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 .endm .macro KERNEL1x16_I1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO lxvdsx vs28, 0, BO addi AO, AO, 64 addi BO, BO, 8 lxvd2x vs12, 0, AO lxvd2x vs13, o16, AO lxvd2x vs14, o32, AO lxvd2x vs15, o48, AO addi AO, AO, 64 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 xvmuldp vs36, vs4, vs24 xvmuldp vs37, vs5, vs24 xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 .endm .macro KERNEL1x16_1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO lxvdsx vs28, 0, BO addi AO, AO, 64 addi BO, BO, 8 lxvd2x vs12, 0, AO lxvd2x vs13, o16, AO lxvd2x vs14, o32, AO lxvd2x vs15, o48, AO addi AO, AO, 64 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 .endm .macro KERNEL1x16_2 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO addi AO, AO, 64 addi BO, BO, 8 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 xvmaddadp vs36, vs12, vs28 xvmaddadp vs37, vs13, vs28 xvmaddadp 
vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 .endm .macro KERNEL1x16_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 xvmaddadp vs36, vs12, vs28 xvmaddadp vs37, vs13, vs28 xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 .endm .macro KERNEL1x16_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO addi AO, AO, 64 addi BO, BO, 8 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 xvmuldp vs36, vs4, vs24 xvmuldp vs37, vs5, vs24 xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 .endm .macro KERNEL1x16_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO addi AO, AO, 64 addi BO, BO, 8 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 .endm .macro SAVE1x16 mr T1, CO addi T2, T1, 64 #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 lxvd2x vs2, o32, T1 lxvd2x vs3, o48, T1 lxvd2x vs4, 0, T2 lxvd2x vs5, o16, T2 lxvd2x vs6, o32, T2 lxvd2x vs7, o48, T2 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r xvmaddadp vs2, vs34, alpha_r xvmaddadp vs3, vs35, alpha_r xvmaddadp vs4, vs36, alpha_r xvmaddadp vs5, vs37, alpha_r xvmaddadp vs6, vs38, alpha_r xvmaddadp vs7, vs39, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r xvmuldp vs2, vs34, alpha_r xvmuldp vs3, vs35, alpha_r xvmuldp vs4, vs36, alpha_r xvmuldp vs5, vs37, alpha_r xvmuldp vs6, vs38, alpha_r xvmuldp vs7, vs39, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 stxvd2x vs2, o32, T1 stxvd2x vs3, o48, T1 stxvd2x vs4, 0, T2 stxvd2x vs5, o16, T2 stxvd2x vs6, o32, T2 stxvd2x vs7, o48, T2 addi CO, CO, 128 .endm /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ .macro LOAD1x8_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO addi AO, AO, 64 addi BO, BO, 8 .endm .macro KERNEL1x8_I1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO lxvdsx vs28, 0, BO addi AO, AO, 64 addi BO, BO, 8 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 .endm .macro KERNEL1x8_1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO lxvdsx vs28, 0, BO addi AO, AO, 64 addi BO, BO, 8 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 .endm .macro KERNEL1x8_2 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO addi AO, AO, 64 addi BO, BO, 8 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 .endm .macro KERNEL1x8_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 .endm .macro KERNEL1x8_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO addi AO, AO, 64 addi BO, BO, 8 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 
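/* _SUBI1 variant: xvmuldp starts the accumulators from scratch on the first iteration; the _SUB1 variant below accumulates into them with xvmaddadp. */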
.endm .macro KERNEL1x8_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO addi AO, AO, 64 addi BO, BO, 8 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 .endm .macro SAVE1x8 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 lxvd2x vs2, o32, T1 lxvd2x vs3, o48, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r xvmaddadp vs2, vs34, alpha_r xvmaddadp vs3, vs35, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r xvmuldp vs2, vs34, alpha_r xvmuldp vs3, vs35, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 stxvd2x vs2, o32, T1 stxvd2x vs3, o48, T1 addi CO, CO, 64 .endm /********************************************************************* * Macros for N=1, M=4 * *********************************************************************/ .macro LOAD1x4_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO addi AO, AO, 32 addi BO, BO, 8 .endm .macro KERNEL1x4_I1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvdsx vs28, 0, BO addi AO, AO, 32 addi BO, BO, 8 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 .endm .macro KERNEL1x4_1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvdsx vs28, 0, BO addi AO, AO, 32 addi BO, BO, 8 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 .endm .macro KERNEL1x4_2 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO addi AO, AO, 32 addi BO, BO, 8 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 .endm .macro KERNEL1x4_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 .endm .macro KERNEL1x4_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO addi AO, AO, 32 addi BO, BO, 8 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 .endm .macro KERNEL1x4_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO addi AO, AO, 32 addi BO, BO, 8 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 .endm .macro SAVE1x4 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 addi CO, CO, 32 .endm /********************************************************************* * Macros for N=1, M=2 * *********************************************************************/ .macro LOAD1x2_1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO addi AO, AO, 16 addi BO, BO, 8 .endm .macro KERNEL1x2_I1 lxvd2x vs8, 0, AO lxvdsx vs28, 0, BO addi AO, AO, 16 addi BO, BO, 8 xvmuldp vs32, vs0, vs24 .endm .macro KERNEL1x2_1 lxvd2x vs8, 0, AO lxvdsx vs28, 0, BO addi AO, AO, 16 addi BO, BO, 8 xvmaddadp vs32, vs0, vs24 .endm .macro KERNEL1x2_2 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO addi AO, AO, 16 addi BO, BO, 8 xvmaddadp vs32, vs8, vs28 .endm .macro KERNEL1x2_E2 xvmaddadp vs32, vs8, vs28 .endm .macro KERNEL1x2_SUBI1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO addi AO, AO, 16 addi BO, BO, 8 xvmuldp vs32, vs0, vs24 .endm .macro KERNEL1x2_SUB1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO addi AO, AO, 16 addi BO, BO, 8 xvmaddadp vs32, vs0, vs24 .endm .macro SAVE1x2 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r #else xvmuldp vs0, vs32, alpha_r #endif stxvd2x vs0, 0, T1 addi CO, CO, 16 .endm /********************************************************************* * Macros for N=1, M=1 * *********************************************************************/ .macro LOAD1x1_1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO 
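/* Scalar (M=1) case: vs0 holds one double of A and vs24 one double of B; both pointers are then advanced by 8 bytes. */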
addi AO, AO, 8 addi BO, BO, 8 .endm .macro KERNEL1x1_I1 lxsdx vs8, 0, AO lxsdx vs28, 0, BO addi AO, AO, 8 addi BO, BO, 8 xsmuldp vs32, vs0, vs24 .endm .macro KERNEL1x1_1 lxsdx vs8, 0, AO lxsdx vs28, 0, BO addi AO, AO, 8 addi BO, BO, 8 xsmaddadp vs32, vs0, vs24 .endm .macro KERNEL1x1_2 lxsdx vs0, 0, AO lxsdx vs24, 0, BO addi AO, AO, 8 addi BO, BO, 8 xsmaddadp vs32, vs8, vs28 .endm .macro KERNEL1x1_E2 xsmaddadp vs32, vs8, vs28 .endm .macro KERNEL1x1_SUBI1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO addi AO, AO, 8 addi BO, BO, 8 xsmuldp vs32, vs0, vs24 .endm .macro KERNEL1x1_SUB1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO addi AO, AO, 8 addi BO, BO, 8 xsmaddadp vs32, vs0, vs24 .endm .macro SAVE1x1 mr T1, CO #ifndef TRMMKERNEL lxsdx vs0, 0, T1 #endif #ifndef TRMMKERNEL xsmaddadp vs0, vs32, alpha_r #else xsmuldp vs0, vs32, alpha_r #endif stxsdx vs0, 0, T1 addi CO, CO, 8 .endm OpenBLAS-0.2.20/kernel/power/dgemm_ncopy_4_power8.S000066400000000000000000000170141313527062700217370ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/04/28 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "def_vsx.h" #define M r3 #define N r4 #define A r5 #define LDA r6 #define B r7 #define A0 r8 #define A1 r9 #define A2 r10 #define A3 r11 #define J r12 #define PREA r14 #define PREB r15 #define BO r16 #define o64 r17 #define o80 r18 #define o96 r19 #define o112 r20 #define o8 r21 #define T2 r22 #define I r23 #define o16 r24 #define o32 r25 #define o48 r26 #define NOTU1 r27 #define NOTU2 r30 #define T1 r31 #define o0 0 #include "dgemm_ncopy_macros_4_power8.S" #define STACKSIZE 384 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) std r14, 280(SP) cmpwi cr0, M, 0 ble- L999 cmpwi cr0, N, 0 ble- L999 slwi LDA, LDA, BASE_SHIFT li PREA, 384 li PREB, 384 li o8, 8 li o16, 16 li o32, 32 li o48, 48 li o64, 64 li o80, 80 li o96, 96 li o112, 112 #include "dgemm_ncopy_logic_4_power8.S" L999: li r3, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) addi SP, SP, STACKSIZE blr EPILOGUE 
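/* Note: the copy logic included above packs a 4-column panel of A with the columns
   interleaved row by row, i.e. the equivalent of this C loop (a sketch for reference,
   not part of this file):
       for (i = 0; i < m; i++) {
         *b++ = a0[i]; *b++ = a1[i]; *b++ = a2[i]; *b++ = a3[i];
       }
   done 16, 8, 4, 2 or 1 rows at a time with VSX loads and xxpermdi merges. */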
OpenBLAS-0.2.20/kernel/power/dgemm_ncopy_logic_4_power8.S000066400000000000000000000076651313527062700231270ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/04/28 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ mr BO, B srawi. I, N, 2 ble DCOPYN_L2_BEGIN DCOPYN_L4_BEGIN: DCOPYN_L4_LOOP: mr A0, A add A1, A0, LDA add A2, A1, LDA add A3, A2, LDA add A, A3, LDA DCOPYN_L4x16_BEGIN: srawi. J, M, 4 ble DCOPYN_L4x16_END DCOPYN_L4x16_LOOP: dcbt A0, PREA dcbt A1, PREA dcbt A2, PREA dcbt A3, PREA COPY_4x16 addic. J, J, -1 bgt DCOPYN_L4x16_LOOP DCOPYN_L4x16_END: DCOPYN_L4x8_BEGIN: andi. J, M, 8 ble DCOPYN_L4x8_END COPY_4x8 DCOPYN_L4x8_END: DCOPYN_L4x4_BEGIN: andi. J, M, 4 ble DCOPYN_L4x4_END COPY_4x4 DCOPYN_L4x4_END: DCOPYN_L4x2_BEGIN: andi. J, M, 2 ble DCOPYN_L4x2_END COPY_4x2 DCOPYN_L4x2_END: DCOPYN_L4x1_BEGIN: andi. J, M, 1 ble DCOPYN_L4x1_END COPY_4x1 DCOPYN_L4x1_END: DCOPYN_L4_END: addic. I, I, -1 bgt DCOPYN_L4_LOOP DCOPYN_L2_BEGIN: andi. T1, 4, 2 ble DCOPYN_L2_END DCOPYN_L2_LOOP: mr A0, A add A1, A0, LDA add A, A1, LDA DCOPYN_L2x16_BEGIN: srawi. J, M, 4 ble DCOPYN_L2x16_END DCOPYN_L2x16_LOOP: COPY_2x16 addic. J, J, -1 bgt DCOPYN_L2x16_LOOP DCOPYN_L2x16_END: DCOPYN_L2x8_BEGIN: andi. J, M, 8 ble DCOPYN_L2x8_END COPY_2x8 DCOPYN_L2x8_END: DCOPYN_L2x4_BEGIN: andi. J, M, 4 ble DCOPYN_L2x4_END COPY_2x4 DCOPYN_L2x4_END: DCOPYN_L2x2_BEGIN: andi. J, M, 2 ble DCOPYN_L2x2_END COPY_2x2 DCOPYN_L2x2_END: DCOPYN_L2x1_BEGIN: andi. J, M, 1 ble DCOPYN_L2x1_END COPY_2x1 DCOPYN_L2x1_END: DCOPYN_L2_END: DCOPYN_L1_BEGIN: andi. T1, 4, 1 ble DCOPYN_L1_END DCOPYN_L1_LOOP: mr A0, A add A, A0, LDA DCOPYN_L1x16_BEGIN: srawi. J, M, 4 ble DCOPYN_L1x16_END DCOPYN_L1x16_LOOP: COPY_1x16 addic. 
J, J, -1 bgt DCOPYN_L1x16_LOOP DCOPYN_L1x16_END: DCOPYN_L1x8_BEGIN: andi. J, M, 8 ble DCOPYN_L1x8_END COPY_1x8 DCOPYN_L1x8_END: DCOPYN_L1x4_BEGIN: andi. J, M, 4 ble DCOPYN_L1x4_END COPY_1x4 DCOPYN_L1x4_END: DCOPYN_L1x2_BEGIN: andi. J, M, 2 ble DCOPYN_L1x2_END COPY_1x2 DCOPYN_L1x2_END: DCOPYN_L1x1_BEGIN: andi. J, M, 1 ble DCOPYN_L1x1_END COPY_1x1 DCOPYN_L1x1_END: DCOPYN_L1_END: OpenBLAS-0.2.20/kernel/power/dgemm_ncopy_macros_4_power8.S000066400000000000000000000344071313527062700233100ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/04/28 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /********************************************************************************************** * Macros for N=4 and M=16 **********************************************************************************************/ .macro COPY_4x16 lxvd2x vs0, o0, A0 lxvd2x vs8, o0, A1 lxvd2x vs24, o0, A3 lxvd2x vs16, o0, A2 lxvd2x vs1, o16, A0 lxvd2x vs9, o16, A1 lxvd2x vs17, o16, A2 lxvd2x vs25, o16, A3 lxvd2x vs2, o32, A0 lxvd2x vs10, o32, A1 lxvd2x vs18, o32, A2 lxvd2x vs26, o32, A3 lxvd2x vs3, o48, A0 lxvd2x vs11, o48, A1 lxvd2x vs19, o48, A2 lxvd2x vs27, o48, A3 lxvd2x vs4, o64, A0 lxvd2x vs12, o64, A1 lxvd2x vs20, o64, A2 lxvd2x vs28, o64, A3 lxvd2x vs5, o80, A0 lxvd2x vs13, o80, A1 lxvd2x vs21, o80, A2 lxvd2x vs29, o80, A3 lxvd2x vs6, o96, A0 lxvd2x vs14, o96, A1 lxvd2x vs22, o96, A2 lxvd2x vs30, o96, A3 lxvd2x vs7, o112, A0 lxvd2x vs15, o112, A1 lxvd2x vs23, o112, A2 lxvd2x vs31, o112, A3 xxpermdi vs32, vs0, vs8, 0 xxpermdi vs33, vs16, vs24, 0 xxpermdi vs34, vs0, vs8, 3 xxpermdi vs35, vs16, vs24, 3 xxpermdi vs36, vs1, vs9, 0 xxpermdi vs37, vs17, vs25, 0 xxpermdi vs38, vs1, vs9, 3 xxpermdi vs39, vs17, vs25, 3 xxpermdi vs40, vs2, vs10, 0 xxpermdi vs41, vs18, vs26, 0 xxpermdi vs42, vs2, vs10, 3 xxpermdi vs43, vs18, vs26, 3 xxpermdi vs44, vs3, vs11, 0 xxpermdi vs45, vs19, vs27, 0 xxpermdi vs46, vs3, vs11, 3 xxpermdi vs47, vs19, vs27, 3 xxpermdi vs48, vs4, vs12, 0 xxpermdi vs49, vs20, vs28, 0 xxpermdi vs50, vs4, vs12, 3 xxpermdi vs51, vs20, vs28, 3 xxpermdi vs52, vs5, vs13, 0 xxpermdi vs53, vs21, vs29, 0 xxpermdi vs54, vs5, vs13, 3 xxpermdi vs55, vs21, vs29, 3 addi A0, A0, 128 addi A1, A1, 128 xxpermdi vs56, vs6, vs14, 0 xxpermdi vs57, vs22, vs30, 0 xxpermdi vs58, vs6, vs14, 3 xxpermdi vs59, vs22, vs30, 3 addi A3, A3, 128 addi A2, A2, 128 xxpermdi vs60, vs7, vs15, 0 xxpermdi vs61, vs23, vs31, 0 xxpermdi vs62, vs7, vs15, 3 xxpermdi vs63, vs23, vs31, 3 dcbt BO, PREB stxvd2x vs32, o0, BO stxvd2x vs33, o16, BO stxvd2x vs34, o32, BO stxvd2x vs35, o48, BO stxvd2x vs36, o64, BO stxvd2x vs37, o80, BO stxvd2x vs38, o96, BO stxvd2x vs39, o112, BO addi BO, BO, 128 dcbt BO, PREB stxvd2x vs40, o0, BO stxvd2x vs41, o16, BO stxvd2x vs42, o32, BO stxvd2x vs43, o48, BO stxvd2x vs44, o64, BO stxvd2x vs45, o80, BO stxvd2x vs46, o96, BO stxvd2x vs47, o112, BO addi BO, BO, 128 dcbt BO, PREB stxvd2x vs48, o0, BO stxvd2x vs49, o16, BO stxvd2x vs50, o32, BO stxvd2x vs51, o48, BO stxvd2x vs52, o64, BO stxvd2x vs53, o80, BO stxvd2x vs54, o96, BO stxvd2x vs55, o112, BO addi BO, BO, 128 dcbt BO, PREB stxvd2x vs56, o0, BO stxvd2x vs57, o16, BO stxvd2x vs58, o32, BO stxvd2x vs59, o48, BO stxvd2x vs60, o64, BO stxvd2x vs61, o80, BO stxvd2x vs62, o96, BO stxvd2x vs63, o112, BO addi BO, BO, 128 .endm /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ .macro COPY_4x8 lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 lxvd2x vs2, o32, A0 lxvd2x vs3, o48, A0 addi A0, A0, 64 lxvd2x vs8, o0, A1 lxvd2x vs9, o16, A1 lxvd2x vs10, o32, A1 lxvd2x vs11, o48, A1 addi A1, A1, 64 lxvd2x vs16, o0, A2 lxvd2x vs17, o16, A2 lxvd2x vs18, 
o32, A2 lxvd2x vs19, o48, A2 addi A2, A2, 64 lxvd2x vs24, o0, A3 lxvd2x vs25, o16, A3 lxvd2x vs26, o32, A3 lxvd2x vs27, o48, A3 addi A3, A3, 64 xxpermdi vs32, vs0, vs8, 0 xxpermdi vs33, vs16, vs24, 0 xxpermdi vs34, vs0, vs8, 3 xxpermdi vs35, vs16, vs24, 3 xxpermdi vs36, vs1, vs9, 0 xxpermdi vs37, vs17, vs25, 0 xxpermdi vs38, vs1, vs9, 3 xxpermdi vs39, vs17, vs25, 3 xxpermdi vs40, vs2, vs10, 0 xxpermdi vs41, vs18, vs26, 0 xxpermdi vs42, vs2, vs10, 3 xxpermdi vs43, vs18, vs26, 3 xxpermdi vs44, vs3, vs11, 0 xxpermdi vs45, vs19, vs27, 0 xxpermdi vs46, vs3, vs11, 3 xxpermdi vs47, vs19, vs27, 3 stxvd2x vs32, o0, BO stxvd2x vs33, o16, BO stxvd2x vs34, o32, BO stxvd2x vs35, o48, BO stxvd2x vs36, o64, BO stxvd2x vs37, o80, BO stxvd2x vs38, o96, BO stxvd2x vs39, o112, BO addi BO, BO, 128 stxvd2x vs40, o0, BO stxvd2x vs41, o16, BO stxvd2x vs42, o32, BO stxvd2x vs43, o48, BO stxvd2x vs44, o64, BO stxvd2x vs45, o80, BO stxvd2x vs46, o96, BO stxvd2x vs47, o112, BO addi BO, BO, 128 .endm /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ .macro COPY_4x4 lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 addi A0, A0, 32 lxvd2x vs8, o0, A1 lxvd2x vs9, o16, A1 addi A1, A1, 32 lxvd2x vs16, o0, A2 lxvd2x vs17, o16, A2 addi A2, A2, 32 lxvd2x vs24, o0, A3 lxvd2x vs25, o16, A3 addi A3, A3, 32 xxpermdi vs32, vs0, vs8, 0 xxpermdi vs33, vs16, vs24, 0 xxpermdi vs34, vs0, vs8, 3 xxpermdi vs35, vs16, vs24, 3 xxpermdi vs36, vs1, vs9, 0 xxpermdi vs37, vs17, vs25, 0 xxpermdi vs38, vs1, vs9, 3 xxpermdi vs39, vs17, vs25, 3 stxvd2x vs32, o0, BO stxvd2x vs33, o16, BO stxvd2x vs34, o32, BO stxvd2x vs35, o48, BO stxvd2x vs36, o64, BO stxvd2x vs37, o80, BO stxvd2x vs38, o96, BO stxvd2x vs39, o112, BO addi BO, BO, 128 .endm /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ .macro COPY_4x2 lxvd2x vs0, o0, A0 addi A0, A0, 16 lxvd2x vs8, o0, A1 addi A1, A1, 16 lxvd2x vs16, o0, A2 addi A2, A2, 16 lxvd2x vs24, o0, A3 addi A3, A3, 16 xxpermdi vs32, vs0, vs8, 0 xxpermdi vs33, vs16, vs24, 0 xxpermdi vs34, vs0, vs8, 3 xxpermdi vs35, vs16, vs24, 3 stxvd2x vs32, o0, BO stxvd2x vs33, o16, BO stxvd2x vs34, o32, BO stxvd2x vs35, o48, BO addi BO, BO, 64 .endm /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ .macro COPY_4x1 lxsdx vs0, o0, A0 addi A0, A0, 8 lxsdx vs8, o0, A1 addi A1, A1, 8 lxsdx vs16, o0, A2 addi A2, A2, 8 lxsdx vs24, o0, A3 addi A3, A3, 8 xxpermdi vs32, vs0, vs8, 0 xxpermdi vs33, vs16, vs24, 0 stxvd2x vs32, o0, BO stxvd2x vs33, o16, BO addi BO, BO, 32 .endm /********************************************************************************************** * Macros for N=2 and M=16 **********************************************************************************************/ .macro COPY_2x16 lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 lxvd2x vs2, o32, A0 lxvd2x vs3, o48, A0 lxvd2x vs4, o64, A0 lxvd2x vs5, o80, A0 lxvd2x vs6, o96, A0 lxvd2x vs7, o112, A0 addi A0, A0, 128 lxvd2x vs8, o0, A1 lxvd2x vs9, o16, A1 lxvd2x vs10, o32, A1 lxvd2x vs11, o48, A1 lxvd2x vs12, o64, A1 lxvd2x vs13, o80, A1 lxvd2x vs14, o96, A1 lxvd2x vs15, o112, A1 addi A1, A1, 128 
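/* Merge the two columns: xxpermdi selector 0 pairs the matching first doublewords of an A0/A1 vector pair and selector 3 the second ones, so the packed buffer receives the two columns element-interleaved. */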
xxpermdi vs32, vs0, vs8, 0 xxpermdi vs33, vs0, vs8, 3 xxpermdi vs34, vs1, vs9, 0 xxpermdi vs35, vs1, vs9, 3 xxpermdi vs36, vs2, vs10, 0 xxpermdi vs37, vs2, vs10, 3 xxpermdi vs38, vs3, vs11, 0 xxpermdi vs39, vs3, vs11, 3 xxpermdi vs40, vs4, vs12, 0 xxpermdi vs41, vs4, vs12, 3 xxpermdi vs42, vs5, vs13, 0 xxpermdi vs43, vs5, vs13, 3 xxpermdi vs44, vs6, vs14, 0 xxpermdi vs45, vs6, vs14, 3 xxpermdi vs46, vs7, vs15, 0 xxpermdi vs47, vs7, vs15, 3 stxvd2x vs32, o0, BO stxvd2x vs33, o16, BO stxvd2x vs34, o32, BO stxvd2x vs35, o48, BO stxvd2x vs36, o64, BO stxvd2x vs37, o80, BO stxvd2x vs38, o96, BO stxvd2x vs39, o112, BO addi BO, BO, 128 stxvd2x vs40, o0, BO stxvd2x vs41, o16, BO stxvd2x vs42, o32, BO stxvd2x vs43, o48, BO stxvd2x vs44, o64, BO stxvd2x vs45, o80, BO stxvd2x vs46, o96, BO stxvd2x vs47, o112, BO addi BO, BO, 128 .endm /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ .macro COPY_2x8 lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 lxvd2x vs2, o32, A0 lxvd2x vs3, o48, A0 addi A0, A0, 64 lxvd2x vs8, o0, A1 lxvd2x vs9, o16, A1 lxvd2x vs10, o32, A1 lxvd2x vs11, o48, A1 addi A1, A1, 64 xxpermdi vs32, vs0, vs8, 0 xxpermdi vs33, vs0, vs8, 3 xxpermdi vs34, vs1, vs9, 0 xxpermdi vs35, vs1, vs9, 3 xxpermdi vs36, vs2, vs10, 0 xxpermdi vs37, vs2, vs10, 3 xxpermdi vs38, vs3, vs11, 0 xxpermdi vs39, vs3, vs11, 3 stxvd2x vs32, o0, BO stxvd2x vs33, o16, BO stxvd2x vs34, o32, BO stxvd2x vs35, o48, BO stxvd2x vs36, o64, BO stxvd2x vs37, o80, BO stxvd2x vs38, o96, BO stxvd2x vs39, o112, BO addi BO, BO, 128 .endm /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ .macro COPY_2x4 lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 addi A0, A0, 32 lxvd2x vs8, o0, A1 lxvd2x vs9, o16, A1 addi A1, A1, 32 xxpermdi vs32, vs0, vs8, 0 xxpermdi vs33, vs0, vs8, 3 xxpermdi vs34, vs1, vs9, 0 xxpermdi vs35, vs1, vs9, 3 stxvd2x vs32, o0, BO stxvd2x vs33, o16, BO stxvd2x vs34, o32, BO stxvd2x vs35, o48, BO addi BO, BO, 64 .endm /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ .macro COPY_2x2 lxvd2x vs0, o0, A0 addi A0, A0, 16 lxvd2x vs8, o0, A1 addi A1, A1, 16 xxpermdi vs32, vs0, vs8, 0 xxpermdi vs33, vs0, vs8, 3 stxvd2x vs32, o0, BO stxvd2x vs33, o16, BO addi BO, BO, 32 .endm /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ .macro COPY_2x1 lxsdx vs0, o0, A0 addi A0, A0, 8 lxsdx vs8, o0, A1 addi A1, A1, 8 xxpermdi vs32, vs0, vs8, 0 stxvd2x vs32, o0, BO addi BO, BO, 16 .endm /********************************************************************************************** * Macros for N=1 and M=16 **********************************************************************************************/ .macro COPY_1x16 lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 lxvd2x vs2, o32, A0 lxvd2x vs3, o48, A0 lxvd2x vs4, o64, A0 lxvd2x vs5, o80, A0 lxvd2x vs6, o96, A0 lxvd2x vs7, o112, A0 addi A0, A0, 128 stxvd2x vs0, o0, BO stxvd2x vs1, o16, BO stxvd2x vs2, o32, BO stxvd2x vs3, o48, BO addi BO, BO, 64 stxvd2x vs4, 
o0, BO stxvd2x vs5, o16, BO stxvd2x vs6, o32, BO stxvd2x vs7, o48, BO addi BO, BO, 64 .endm /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ .macro COPY_1x8 lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 lxvd2x vs2, o32, A0 lxvd2x vs3, o48, A0 addi A0, A0, 64 stxvd2x vs0, o0, BO stxvd2x vs1, o16, BO stxvd2x vs2, o32, BO stxvd2x vs3, o48, BO addi BO, BO, 64 .endm /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ .macro COPY_1x4 lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 addi A0, A0, 32 stxvd2x vs0, o0, BO stxvd2x vs1, o16, BO addi BO, BO, 32 .endm /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ .macro COPY_1x2 lxvd2x vs0, o0, A0 addi A0, A0, 16 stxvd2x vs0, o0, BO addi BO, BO, 16 .endm /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ .macro COPY_1x1 lxsdx vs0, o0, A0 addi A0, A0, 8 stxsdx vs0, o0, BO addi BO, BO, 8 .endm OpenBLAS-0.2.20/kernel/power/dgemm_tcopy_16_power8.S000066400000000000000000000154271313527062700220360ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/04/21 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "def_vsx.h" #define M r3 #define N r4 #define A r5 #define LDA r6 #define B r7 #define A0 r8 #define A1 r9 #define A2 r10 #define A3 r11 #define J r12 #define PREA r14 #define PREB r15 #define BO r16 #define B8 r17 #define B4 r18 #define B2 r19 #define B1 r20 #define o8 r21 #define T2 r22 #define I r23 #define o16 r24 #define o32 r25 #define o48 r26 #define B16 r29 #define M16 r30 #define T1 r31 #define o0 0 #include "dgemm_tcopy_macros_16_power8.S" #define STACKSIZE 384 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) std r14, 280(SP) cmpwi cr0, M, 0 ble- L999 cmpwi cr0, N, 0 ble- L999 slwi LDA, LDA, BASE_SHIFT slwi M16, M, 4 + BASE_SHIFT li T1, -16 li T2, -8 li PREA, -4 li PREB, -2 and B8, N, T1 and B4, N, T2 and B2, N, PREA and B1, N, PREB mullw B8, B8, M mullw B4, B4, M mullw B2, B2, M mullw B1, B1, M slwi B8, B8, BASE_SHIFT slwi B4, B4, BASE_SHIFT slwi B2, B2, BASE_SHIFT slwi B1, B1, BASE_SHIFT add B8, B8, B add B4, B4, B add B2, B2, B add B1, B1, B li PREA, 384 addi PREB, M16, 128 li o8, 8 li o16, 16 li o32, 32 li o48, 48 #include "dgemm_tcopy_logic_16_power8.S" L999: li r3, 0 ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/dgemm_tcopy_logic_16_power8.S000066400000000000000000000104761313527062700232120ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/04/21 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ srawi. I, M, 2 ble DCOPYT_L2_BEGIN DCOPYT_L4_BEGIN: mr A0, A add A1, A0, LDA add A2, A1, LDA add A3, A2, LDA add A, A3, LDA mr B16, B addi B, B, 64*SIZE sradi. J, N, 4 ble DCOPYT_L4x8_BEGIN mr BO, B16 addi T2, M16, 384 mtctr J .align 5 DCOPYT_L4x16_LOOP: addi T1, M16, 256 dcbt A0, PREA dcbt A1, PREA dcbt A2, PREA dcbt A3, PREA dcbt BO, M16 dcbt BO, PREB dcbt BO, T1 dcbt BO, T2 COPY_4x16 add BO, BO, M16 // addic. J, J, -1 bdnz+ DCOPYT_L4x16_LOOP DCOPYT_L4x8_BEGIN: andi. T1, N, 8 ble DCOPYT_L4x4_BEGIN mr BO, B8 COPY_4x8 addi B8, B8, 32*SIZE DCOPYT_L4x4_BEGIN: andi. T1, N, 4 ble DCOPYT_L4x2_BEGIN mr BO, B4 COPY_4x4 addi B4, B4, 16*SIZE DCOPYT_L4x2_BEGIN: andi. T1, N, 2 ble DCOPYT_L4x1_BEGIN mr BO, B2 COPY_4x2 addi B2, B2, 8*SIZE DCOPYT_L4x1_BEGIN: andi. T1, N, 1 ble DCOPYT_L4_END mr BO, B1 COPY_4x1 addi B1, B1, 4*SIZE DCOPYT_L4_END: addic. I, I, -1 bgt DCOPYT_L4_BEGIN DCOPYT_L2_BEGIN: andi. T1, M, 2 ble DCOPYT_L1_BEGIN mr A0, A add A1, A0, LDA add A, A1, LDA mr B16, B addi B, B, 32*SIZE sradi. J, N, 4 ble DCOPYT_L2x8_BEGIN mr BO, B16 DCOPYT_L2x16_LOOP: COPY_2x16 add BO, BO, M16 addic. J, J, -1 bgt DCOPYT_L2x16_LOOP DCOPYT_L2x8_BEGIN: andi. T1, N, 8 ble DCOPYT_L2x4_BEGIN mr BO, B8 COPY_2x8 addi B8, B8, 16*SIZE DCOPYT_L2x4_BEGIN: andi. T1, N, 4 ble DCOPYT_L2x2_BEGIN mr BO, B4 COPY_2x4 addi B4, B4, 8*SIZE DCOPYT_L2x2_BEGIN: andi. T1, N, 2 ble DCOPYT_L2x1_BEGIN mr BO, B2 COPY_2x2 addi B2, B2, 4*SIZE DCOPYT_L2x1_BEGIN: andi. T1, N, 1 ble DCOPYT_L2_END mr BO, B1 COPY_2x1 addi B1, B1, 2*SIZE DCOPYT_L2_END: DCOPYT_L1_BEGIN: andi. T1, M, 1 ble L999 mr A0, A add A, A0, LDA mr B16, B addi B, B, 16*SIZE sradi. J, N, 4 ble DCOPYT_L1x8_BEGIN mr BO, B16 DCOPYT_L1x16_LOOP: COPY_1x16 add BO, BO, M16 addic. J, J, -1 bgt DCOPYT_L1x16_LOOP DCOPYT_L1x8_BEGIN: andi. T1, N, 8 ble DCOPYT_L1x4_BEGIN mr BO, B8 COPY_1x8 addi B8, B8, 8*SIZE DCOPYT_L1x4_BEGIN: andi. T1, N, 4 ble DCOPYT_L1x2_BEGIN mr BO, B4 COPY_1x4 addi B4, B4, 4*SIZE DCOPYT_L1x2_BEGIN: andi. T1, N, 2 ble DCOPYT_L1x1_BEGIN mr BO, B2 COPY_1x2 addi B2, B2, 2*SIZE DCOPYT_L1x1_BEGIN: andi. T1, N, 1 ble DCOPYT_L1_END mr BO, B1 COPY_1x1 addi B1, B1, 1*SIZE DCOPYT_L1_END: OpenBLAS-0.2.20/kernel/power/dgemm_tcopy_macros_16_power8.S000066400000000000000000000275651313527062700234100ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/04/21 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /********************************************************************************************** * Macros for N=4 and M=16 **********************************************************************************************/ .macro COPY_4x16 lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 lxvd2x vs34, o32, A0 lxvd2x vs35, o48, A0 addi A0, A0, 64 lxvd2x vs40, o0, A1 lxvd2x vs41, o16, A1 lxvd2x vs42, o32, A1 lxvd2x vs43, o48, A1 addi A1, A1, 64 lxvd2x vs48, o0, A2 lxvd2x vs49, o16, A2 lxvd2x vs50, o32, A2 lxvd2x vs51, o48, A2 addi A2, A2, 64 lxvd2x vs56, o0, A3 lxvd2x vs57, o16, A3 lxvd2x vs58, o32, A3 lxvd2x vs59, o48, A3 addi A3, A3, 64 lxvd2x vs36, o0, A0 lxvd2x vs37, o16, A0 lxvd2x vs38, o32, A0 lxvd2x vs39, o48, A0 addi A0, A0, 64 lxvd2x vs44, o0, A1 lxvd2x vs45, o16, A1 lxvd2x vs46, o32, A1 lxvd2x vs47, o48, A1 addi A1, A1, 64 lxvd2x vs52, o0, A2 lxvd2x vs53, o16, A2 lxvd2x vs54, o32, A2 lxvd2x vs55, o48, A2 addi A2, A2, 64 lxvd2x vs60, o0, A3 lxvd2x vs61, o16, A3 lxvd2x vs62, o32, A3 lxvd2x vs63, o48, A3 addi A3, A3, 64 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 addi T1, T1, 64 stxvd2x vs36, o0, T1 stxvd2x vs37, o16, T1 stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 addi T1, T1, 64 stxvd2x vs40, o0, T1 stxvd2x vs41, o16, T1 stxvd2x vs42, o32, T1 stxvd2x vs43, o48, T1 addi T1, T1, 64 stxvd2x vs44, o0, T1 stxvd2x vs45, o16, T1 stxvd2x vs46, o32, T1 stxvd2x vs47, o48, T1 addi T1, T1, 64 stxvd2x vs48, o0, T1 stxvd2x vs49, o16, T1 stxvd2x vs50, o32, T1 stxvd2x vs51, o48, T1 addi T1, T1, 64 stxvd2x vs52, o0, T1 stxvd2x vs53, o16, T1 stxvd2x vs54, o32, T1 stxvd2x vs55, o48, T1 addi T1, T1, 64 stxvd2x vs56, o0, T1 stxvd2x vs57, o16, T1 stxvd2x vs58, o32, T1 stxvd2x vs59, o48, T1 addi T1, T1, 64 stxvd2x vs60, o0, T1 stxvd2x vs61, o16, T1 stxvd2x vs62, o32, T1 stxvd2x vs63, o48, T1 .endm /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ .macro COPY_4x8 lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 lxvd2x vs34, o32, A0 lxvd2x vs35, o48, A0 addi A0, A0, 64 lxvd2x vs36, o0, A1 lxvd2x vs37, o16, A1 lxvd2x vs38, o32, A1 lxvd2x vs39, o48, A1 addi A1, A1, 64 lxvd2x vs40, o0, A2 lxvd2x vs41, o16, A2 lxvd2x vs42, o32, A2 lxvd2x vs43, o48, A2 addi A2, A2, 64 lxvd2x vs44, o0, A3 lxvd2x vs45, o16, A3 lxvd2x 
vs46, o32, A3 lxvd2x vs47, o48, A3 addi A3, A3, 64 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 addi T1, T1, 64 stxvd2x vs36, o0, T1 stxvd2x vs37, o16, T1 stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 addi T1, T1, 64 stxvd2x vs40, o0, T1 stxvd2x vs41, o16, T1 stxvd2x vs42, o32, T1 stxvd2x vs43, o48, T1 addi T1, T1, 64 stxvd2x vs44, o0, T1 stxvd2x vs45, o16, T1 stxvd2x vs46, o32, T1 stxvd2x vs47, o48, T1 .endm /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ .macro COPY_4x4 lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 addi A0, A0, 32 lxvd2x vs34, o0, A1 lxvd2x vs35, o16, A1 addi A1, A1, 32 lxvd2x vs36, o0, A2 lxvd2x vs37, o16, A2 addi A2, A2, 32 lxvd2x vs38, o0, A3 lxvd2x vs39, o16, A3 addi A3, A3, 32 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 addi T1, T1, 64 stxvd2x vs36, o0, T1 stxvd2x vs37, o16, T1 stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 .endm /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ .macro COPY_4x2 lxvd2x vs32, o0, A0 addi A0, A0, 16 lxvd2x vs33, o0, A1 addi A1, A1, 16 lxvd2x vs34, o0, A2 addi A2, A2, 16 lxvd2x vs35, o0, A3 addi A3, A3, 16 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 .endm /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ .macro COPY_4x1 lxsdx vs32, o0, A0 addi A0, A0, 8 lxsdx vs33, o0, A1 addi A1, A1, 8 lxsdx vs34, o0, A2 addi A2, A2, 8 lxsdx vs35, o0, A3 addi A3, A3, 8 mr T1, BO stxsdx vs32, o0, T1 stxsdx vs33, o8, T1 addi T1, T1, 16 stxsdx vs34, o0, T1 stxsdx vs35, o8, T1 .endm /********************************************************************************************** * Macros for N=2 and M=16 **********************************************************************************************/ .macro COPY_2x16 lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 lxvd2x vs34, o32, A0 lxvd2x vs35, o48, A0 addi A0, A0, 64 lxvd2x vs36, o0, A0 lxvd2x vs37, o16, A0 lxvd2x vs38, o32, A0 lxvd2x vs39, o48, A0 addi A0, A0, 64 lxvd2x vs40, o0, A1 lxvd2x vs41, o16, A1 lxvd2x vs42, o32, A1 lxvd2x vs43, o48, A1 addi A1, A1, 64 lxvd2x vs44, o0, A1 lxvd2x vs45, o16, A1 lxvd2x vs46, o32, A1 lxvd2x vs47, o48, A1 addi A1, A1, 64 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 addi T1, T1, 64 stxvd2x vs36, o0, T1 stxvd2x vs37, o16, T1 stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 addi T1, T1, 64 stxvd2x vs40, o0, T1 stxvd2x vs41, o16, T1 stxvd2x vs42, o32, T1 stxvd2x vs43, o48, T1 addi T1, T1, 64 stxvd2x vs44, o0, T1 stxvd2x vs45, o16, T1 stxvd2x vs46, o32, T1 stxvd2x vs47, o48, T1 .endm /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ .macro COPY_2x8 lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 lxvd2x vs34, o32, A0 lxvd2x vs35, o48, A0 addi A0, A0, 64 lxvd2x vs36, o0, A1 lxvd2x vs37, o16, A1 lxvd2x vs38, o32, A1 lxvd2x vs39, o48, A1 addi 
A1, A1, 64 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 addi T1, T1, 64 stxvd2x vs36, o0, T1 stxvd2x vs37, o16, T1 stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 .endm /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ .macro COPY_2x4 lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 addi A0, A0, 32 lxvd2x vs34, o0, A1 lxvd2x vs35, o16, A1 addi A1, A1, 32 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 .endm /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ .macro COPY_2x2 lxvd2x vs32, o0, A0 addi A0, A0, 16 lxvd2x vs33, o0, A1 addi A1, A1, 16 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 .endm /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ .macro COPY_2x1 lxsdx vs32, o0, A0 addi A0, A0, 8 lxsdx vs33, o0, A1 addi A1, A1, 8 mr T1, BO stxsdx vs32, o0, T1 stxsdx vs33, o8, T1 .endm /********************************************************************************************** * Macros for N=1 and M=16 **********************************************************************************************/ .macro COPY_1x16 lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 lxvd2x vs34, o32, A0 lxvd2x vs35, o48, A0 addi A0, A0, 64 lxvd2x vs36, o0, A0 lxvd2x vs37, o16, A0 lxvd2x vs38, o32, A0 lxvd2x vs39, o48, A0 addi A0, A0, 64 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 addi T1, T1, 64 stxvd2x vs36, o0, T1 stxvd2x vs37, o16, T1 stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 .endm /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ .macro COPY_1x8 lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 lxvd2x vs34, o32, A0 lxvd2x vs35, o48, A0 addi A0, A0, 64 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 .endm /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ .macro COPY_1x4 lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 addi A0, A0, 32 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 .endm /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ .macro COPY_1x2 lxvd2x vs32, o0, A0 addi A0, A0, 16 mr T1, BO stxvd2x vs32, o0, T1 .endm /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ .macro COPY_1x1 lxsdx vs32, o0, A0 addi A0, A0, 8 mr T1, BO stxsdx vs32, o0, T1 .endm 
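/* Unlike the ncopy macros, these transposed-copy macros need no xxpermdi shuffles: each source row is already contiguous in memory, so the blocks are moved with straight lxvd2x/stxvd2x copies. */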
OpenBLAS-0.2.20/kernel/power/dgemv_n.c000066400000000000000000000175421313527062700173540ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/30 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #if defined(POWER8) #include "dgemv_n_microk_power8.c" #endif #define NBMAX 4096 #ifndef HAVE_KERNEL_4x4 static void dgemv_kernel_4x4(BLASLONG n, FLOAT *a_ptr, BLASLONG lda, FLOAT *xo, FLOAT *y, FLOAT alpha) { BLASLONG i; FLOAT x[4] __attribute__ ((aligned (16)));; FLOAT *a0 = a_ptr; FLOAT *a1 = a0 + lda; FLOAT *a2 = a1 + lda; FLOAT *a3 = a2 + lda; for ( i=0; i<4; i++) x[i] = xo[i] * alpha; for ( i=0; i< n; i+=4 ) { y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; } } #endif #ifndef HAVE_KERNEL_4x2 static void dgemv_kernel_4x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xo, FLOAT *y, FLOAT alpha) { BLASLONG i; FLOAT x[4] __attribute__ ((aligned (16)));; for ( i=0; i<2; i++) x[i] = xo[i] * alpha; for ( i=0; i< n; i+=4 ) { y[i] += a0[i]*x[0] + a1[i]*x[1]; y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1]; y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1]; y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1]; } } #endif #ifndef HAVE_KERNEL_4x1 static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT alpha) { BLASLONG i; FLOAT x[4] __attribute__ ((aligned (16)));; for ( i=0; i<1; i++) x[i] = xo[i] * alpha; for ( i=0; i< n; i+=4 ) { y[i] += a0[i]*x[0]; y[i+1] += a0[i+1]*x[0]; y[i+2] += a0[i+2]*x[0]; y[i+3] += a0[i+3]*x[0]; } } #endif static void add_y(BLASLONG n, FLOAT 
*src, FLOAT *dest, BLASLONG inc_dest) { BLASLONG i; if ( inc_dest != 1 ) { for ( i=0; i> 2 ; n2 = n & 3 ; m3 = m & 3 ; m1 = m & -4 ; m2 = (m & (NBMAX-1)) - m3 ; y_ptr = y; BLASLONG NB = NBMAX; while ( NB == NBMAX ) { m1 -= NB; if ( m1 < 0) { if ( m2 == 0 ) break; NB = m2; } a_ptr = a; x_ptr = x; if ( inc_y != 1 ) memset(ybuffer,0,NB*8); else ybuffer = y_ptr; if ( inc_x == 1 ) { for( i = 0; i < n1 ; i++) { dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha); a_ptr += lda4; x_ptr += 4; } if ( n2 & 2 ) { dgemv_kernel_4x2(NB,a_ptr,a_ptr+lda,x_ptr,ybuffer,alpha); a_ptr += lda*2; x_ptr += 2; } if ( n2 & 1 ) { dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha); a_ptr += lda; x_ptr += 1; } } else { for( i = 0; i < n1 ; i++) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; xbuffer[1] = x_ptr[0]; x_ptr += inc_x; xbuffer[2] = x_ptr[0]; x_ptr += inc_x; xbuffer[3] = x_ptr[0]; x_ptr += inc_x; dgemv_kernel_4x4(NB,a_ptr,lda,xbuffer,ybuffer,alpha); a_ptr += lda4; } for( i = 0; i < n2 ; i++) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); a_ptr += lda; } } a += NB; if ( inc_y != 1 ) { add_y(NB,ybuffer,y_ptr,inc_y); y_ptr += NB * inc_y; } else y_ptr += NB ; } if ( m3 == 0 ) return(0); if ( m3 == 3 ) { a_ptr = a; x_ptr = x; FLOAT temp0 = 0.0; FLOAT temp1 = 0.0; FLOAT temp2 = 0.0; if ( lda == 3 && inc_x ==1 ) { for( i = 0; i < ( n & -4 ); i+=4 ) { temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; a_ptr += 12; x_ptr += 4; } for( ; i < n; i++ ) { temp0 += a_ptr[0] * x_ptr[0]; temp1 += a_ptr[1] * x_ptr[0]; temp2 += a_ptr[2] * x_ptr[0]; a_ptr += 3; x_ptr ++; } } else { for( i = 0; i < n; i++ ) { temp0 += a_ptr[0] * x_ptr[0]; temp1 += a_ptr[1] * x_ptr[0]; temp2 += a_ptr[2] * x_ptr[0]; a_ptr += lda; x_ptr += inc_x; } } y_ptr[0] += alpha * temp0; y_ptr += inc_y; y_ptr[0] += alpha * temp1; y_ptr += inc_y; y_ptr[0] += alpha * temp2; return(0); } if ( m3 == 2 ) { a_ptr = a; x_ptr = x; FLOAT temp0 = 0.0; FLOAT temp1 = 0.0; if ( lda == 2 && inc_x ==1 ) { for( i = 0; i < (n & -4) ; i+=4 ) { temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; a_ptr += 8; x_ptr += 4; } for( ; i < n; i++ ) { temp0 += a_ptr[0] * x_ptr[0]; temp1 += a_ptr[1] * x_ptr[0]; a_ptr += 2; x_ptr ++; } } else { for( i = 0; i < n; i++ ) { temp0 += a_ptr[0] * x_ptr[0]; temp1 += a_ptr[1] * x_ptr[0]; a_ptr += lda; x_ptr += inc_x; } } y_ptr[0] += alpha * temp0; y_ptr += inc_y; y_ptr[0] += alpha * temp1; return(0); } if ( m3 == 1 ) { a_ptr = a; x_ptr = x; FLOAT temp = 0.0; if ( lda == 1 && inc_x ==1 ) { for( i = 0; i < (n & -4); i+=4 ) { temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; } for( ; i < n; i++ ) { temp += a_ptr[i] * x_ptr[i]; } } else { for( i = 0; i < n; i++ ) { temp += a_ptr[0] * x_ptr[0]; a_ptr += lda; x_ptr += inc_x; } } y_ptr[0] += alpha * temp; return(0); } return(0); } OpenBLAS-0.2.20/kernel/power/dgemv_n_microk_power8.c000066400000000000000000000214731313527062700222220ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. 
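/*
 * Illustrative sketch (not part of the original source): the driver in
 * dgemv_n.c above splits y into row panels of at most NBMAX entries and
 * walks the columns of A four at a time, handing each 4-column block to
 * dgemv_kernel_4x4 (leftover columns go to the 4x2/4x1 kernels, and the
 * leftover 1-3 rows are handled by the scalar m3 tail code).  One panel
 * update is conceptually the loop below; the names gemv_n_panel_ref and
 * nb are invented for illustration only.
 */
static void gemv_n_panel_ref(long nb, long n, double alpha,
                             const double *a, long lda,
                             const double *x, double *ybuf)
{
	long i, j;
	for (j = 0; j + 4 <= n; j += 4)            /* 4-column blocks      */
		for (i = 0; i < nb; i++)               /* nb rows of the panel */
			ybuf[i] += alpha * (a[i + (j + 0) * lda] * x[j + 0]
			                  + a[i + (j + 1) * lda] * x[j + 1]
			                  + a[i + (j + 2) * lda] * x[j + 2]
			                  + a[i + (j + 3) * lda] * x[j + 3]);
	for (; j < n; j++)                         /* leftover columns     */
		for (i = 0; i < nb; i++)
			ybuf[i] += alpha * a[i + j * lda] * x[j];
}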
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/30 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_4x4 1 static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha) { double *a0; double *a1; double *a2; double *a3; __asm__ ( "lxvd2x 34, 0, %10 \n\t" // x0, x1 "lxvd2x 35, %11, %10 \n\t" // x2, x3 "xxspltd 32, %x9, 0 \n\t" // alpha, alpha "sldi %6, %13, 3 \n\t" // lda * sizeof (double) "xvmuldp 34, 34, 32 \n\t" // x0 * alpha, x1 * alpha "xvmuldp 35, 35, 32 \n\t" // x2 * alpha, x3 * alpha "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda "add %6, %6, %6 \n\t" // 2 * lda "xxspltd 32, 34, 0 \n\t" // x0 * alpha, x0 * alpha "xxspltd 33, 34, 1 \n\t" // x1 * alpha, x1 * alpha "xxspltd 34, 35, 0 \n\t" // x2 * alpha, x2 * alpha "xxspltd 35, 35, 1 \n\t" // x3 * alpha, x3 * alpha "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda "dcbt 0, %3 \n\t" "dcbt 0, %4 \n\t" "dcbt 0, %5 \n\t" "dcbt 0, %6 \n\t" "lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1] "lxvd2x 41, %11, %3 \n\t" // a0[2], a0[3] "lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1] "lxvd2x 43, %11, %4 \n\t" // a1[2], a1[3] "lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1] "lxvd2x 45, %11, %5 \n\t" // a2[2], a2[3] "lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1] "lxvd2x 47, %11, %6 \n\t" // a3[2], a3[3] "dcbt 0, %2 \n\t" "addi %3, %3, 32 \n\t" "addi %4, %4, 32 \n\t" "addi %5, %5, 32 \n\t" "addi %6, %6, 32 \n\t" "addic. 
%1, %1, -4 \n\t" "ble 2f \n\t" ".p2align 5 \n" "1: \n\t" "lxvd2x 36, 0, %2 \n\t" // y0, y1 "lxvd2x 37, %11, %2 \n\t" // y2, y3 "xvmaddadp 36, 40, 32 \n\t" "xvmaddadp 37, 41, 32 \n\t" "lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1] "lxvd2x 41, %11, %3 \n\t" // a0[2], a0[3] "xvmaddadp 36, 42, 33 \n\t" "addi %3, %3, 32 \n\t" "xvmaddadp 37, 43, 33 \n\t" "lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1] "lxvd2x 43, %11, %4 \n\t" // a1[2], a1[3] "xvmaddadp 36, 44, 34 \n\t" "addi %4, %4, 32 \n\t" "xvmaddadp 37, 45, 34 \n\t" "lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1] "lxvd2x 45, %11, %5 \n\t" // a2[2], a2[3] "xvmaddadp 36, 46, 35 \n\t" "addi %5, %5, 32 \n\t" "xvmaddadp 37, 47, 35 \n\t" "stxvd2x 36, 0, %2 \n\t" // y0, y1 "stxvd2x 37, %11, %2 \n\t" // y2, y3 "lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1] "lxvd2x 47, %11, %6 \n\t" // a3[2], a3[3] "addi %6, %6, 32 \n\t" "addi %2, %2, 32 \n\t" "addic. %1, %1, -4 \n\t" "ble 2f \n\t" "lxvd2x 36, 0, %2 \n\t" // y0, y1 "lxvd2x 37, %11, %2 \n\t" // y2, y3 "xvmaddadp 36, 40, 32 \n\t" "xvmaddadp 37, 41, 32 \n\t" "lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1] "lxvd2x 41, %11, %3 \n\t" // a0[2], a0[3] "xvmaddadp 36, 42, 33 \n\t" "addi %3, %3, 32 \n\t" "xvmaddadp 37, 43, 33 \n\t" "lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1] "lxvd2x 43, %11, %4 \n\t" // a1[2], a1[3] "xvmaddadp 36, 44, 34 \n\t" "addi %4, %4, 32 \n\t" "xvmaddadp 37, 45, 34 \n\t" "lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1] "lxvd2x 45, %11, %5 \n\t" // a2[2], a2[3] "xvmaddadp 36, 46, 35 \n\t" "addi %5, %5, 32 \n\t" "xvmaddadp 37, 47, 35 \n\t" "stxvd2x 36, 0, %2 \n\t" // y0, y1 "stxvd2x 37, %11, %2 \n\t" // y2, y3 "lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1] "lxvd2x 47, %11, %6 \n\t" // a3[2], a3[3] "addi %6, %6, 32 \n\t" "addi %2, %2, 32 \n\t" "addic. %1, %1, -4 \n\t" "ble 2f \n\t" "lxvd2x 36, 0, %2 \n\t" // y0, y1 "lxvd2x 37, %11, %2 \n\t" // y2, y3 "xvmaddadp 36, 40, 32 \n\t" "xvmaddadp 37, 41, 32 \n\t" "lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1] "lxvd2x 41, %11, %3 \n\t" // a0[2], a0[3] "xvmaddadp 36, 42, 33 \n\t" "addi %3, %3, 32 \n\t" "xvmaddadp 37, 43, 33 \n\t" "lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1] "lxvd2x 43, %11, %4 \n\t" // a1[2], a1[3] "xvmaddadp 36, 44, 34 \n\t" "addi %4, %4, 32 \n\t" "xvmaddadp 37, 45, 34 \n\t" "lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1] "lxvd2x 45, %11, %5 \n\t" // a2[2], a2[3] "xvmaddadp 36, 46, 35 \n\t" "addi %5, %5, 32 \n\t" "xvmaddadp 37, 47, 35 \n\t" "stxvd2x 36, 0, %2 \n\t" // y0, y1 "stxvd2x 37, %11, %2 \n\t" // y2, y3 "lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1] "lxvd2x 47, %11, %6 \n\t" // a3[2], a3[3] "addi %6, %6, 32 \n\t" "addi %2, %2, 32 \n\t" "addic. %1, %1, -4 \n\t" "ble 2f \n\t" "lxvd2x 36, 0, %2 \n\t" // y0, y1 "lxvd2x 37, %11, %2 \n\t" // y2, y3 "xvmaddadp 36, 40, 32 \n\t" "xvmaddadp 37, 41, 32 \n\t" "lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1] "lxvd2x 41, %11, %3 \n\t" // a0[2], a0[3] "xvmaddadp 36, 42, 33 \n\t" "addi %3, %3, 32 \n\t" "xvmaddadp 37, 43, 33 \n\t" "lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1] "lxvd2x 43, %11, %4 \n\t" // a1[2], a1[3] "xvmaddadp 36, 44, 34 \n\t" "addi %4, %4, 32 \n\t" "xvmaddadp 37, 45, 34 \n\t" "lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1] "lxvd2x 45, %11, %5 \n\t" // a2[2], a2[3] "xvmaddadp 36, 46, 35 \n\t" "addi %5, %5, 32 \n\t" "xvmaddadp 37, 47, 35 \n\t" "stxvd2x 36, 0, %2 \n\t" // y0, y1 "stxvd2x 37, %11, %2 \n\t" // y2, y3 "lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1] "lxvd2x 47, %11, %6 \n\t" // a3[2], a3[3] "addi %6, %6, 32 \n\t" "addi %2, %2, 32 \n\t" "addic. 
%1, %1, -4 \n\t" "bgt 1b \n" "2: \n\t" "lxvd2x 36, 0, %2 \n\t" // y0, y1 "lxvd2x 37, %11, %2 \n\t" // y2, y3 "xvmaddadp 36, 40, 32 \n\t" "xvmaddadp 37, 41, 32 \n\t" "xvmaddadp 36, 42, 33 \n\t" "xvmaddadp 37, 43, 33 \n\t" "xvmaddadp 36, 44, 34 \n\t" "xvmaddadp 37, 45, 34 \n\t" "xvmaddadp 36, 46, 35 \n\t" "xvmaddadp 37, 47, 35 \n\t" "stxvd2x 36, 0, %2 \n\t" // y0, y1 "stxvd2x 37, %11, %2 \n" // y2, y3 "#n=%1 ap=%8=%12 lda=%13 x=%7=%10 y=%0=%2 alpha=%9 o16=%11\n" "#a0=%3 a1=%4 a2=%5 a3=%6" : "+m" (*y), "+r" (n), // 1 "+b" (y), // 2 "=b" (a0), // 3 "=b" (a1), // 4 "=&b" (a2), // 5 "=&b" (a3) // 6 : "m" (*x), "m" (*ap), "d" (alpha), // 9 "r" (x), // 10 "b" (16), // 11 "3" (ap), // 12 "4" (lda) // 13 : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" ); } OpenBLAS-0.2.20/kernel/power/dnrm2_hummer.S000066400000000000000000000441121313527062700203050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define INCX2 r6 #define X2 r7 #define XX r8 #define C1 f1 #define C2 f0 #define C3 f2 #define C4 f3 #define ALPHA f4 #define ALPHA_R f5 #define A1 f6 #define A2 f7 #define A3 f8 #define A4 f9 #define A5 f10 #define A6 f11 #define A7 f12 #define A8 f13 #define F1 f14 #define F2 f15 #define F3 f16 #define F4 f17 #define F5 f18 #define F6 f19 #define F7 f20 #define F8 f21 #define T1 f22 #define T2 f23 #define T3 f24 #define T4 f25 #define T5 f26 #define T6 f27 #define T7 f28 #define T8 f29 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 stfpdux f19, SP, r10 stfpdux f20, SP, r10 stfpdux f21, SP, r10 stfpdux f22, SP, r10 stfpdux f23, SP, r10 stfpdux f24, SP, r10 stfpdux f25, SP, r10 stfpdux f26, SP, r10 stfpdux f27, SP, r10 stfpdux f28, SP, r10 stfpdux f29, SP, r10 li r10, 0 lis r11, 0x3f80 stwu r11, -4(SP) stwu r11, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif lfpsx C1, SP, r10 # Zero clear slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX fpmr C2, C1 fpmr C3, C1 fpmr C4, C1 cmpwi cr0, N, 0 ble LL(99) cmpwi cr0, INCX, 0 ble LL(99) mr XX, X cmpwi cr0, INCX, SIZE bne LL(100) andi. r0, X, 2 * SIZE - 1 beq LL(05) LFD C1, 0 * SIZE(X) add X, X, INCX addi N, N, -1 cmpwi cr0, N, 0 fabs C1, C1 ble LL(20) .align 4 LL(05): sub X, X, INCX2 srawi. r0, N, 4 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 fpabs T1, A1 LFPDUX A6, X, INCX2 fpabs T2, A2 LFPDUX A7, X, INCX2 fpabs T3, A3 LFPDUX A8, X, INCX2 fpabs T4, A4 bdz LL(13) .align 4 LL(12): fpsub F1, C1, T1 LFPDUX A1, X, INCX2 fpsub F2, C2, T2 LFPDUX A2, X, INCX2 fpsub F3, C3, T3 LFPDUX A3, X, INCX2 fpsub F4, C4, T4 LFPDUX A4, X, INCX2 fpabs T5, A5 fpabs T6, A6 fpabs T7, A7 fpabs T8, A8 fpsel C1, F1, C1, T1 LFPDUX A5, X, INCX2 fpsel C2, F2, C2, T2 LFPDUX A6, X, INCX2 fpsel C3, F3, C3, T3 LFPDUX A7, X, INCX2 fpsel C4, F4, C4, T4 LFPDUX A8, X, INCX2 fpsub F5, C1, T5 fpsub F6, C2, T6 fpsub F7, C3, T7 fpsub F8, C4, T8 fpabs T1, A1 fpabs T2, A2 fpabs T3, A3 fpabs T4, A4 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 bdnz LL(12) .align 4 LL(13): fpabs T5, A5 fpabs T6, A6 fpabs T7, A7 fpabs T8, A8 fpsub F1, C1, T1 fpsub F2, C2, T2 fpsub F3, C3, T3 fpsub F4, C4, T4 fpsel C1, F1, C1, T1 fpsel C2, F2, C2, T2 fpsel C3, F3, C3, T3 fpsel C4, F4, C4, T4 fpsub F5, C1, T5 fpsub F6, C2, T6 fpsub F7, C3, T7 fpsub F8, C4, T8 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 .align 4 LL(15): andi. r0, N, 15 beq LL(20) andi. r0, N, 8 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 fpabs A1, A1 fpabs A2, A2 fpabs A3, A3 fpabs A4, A4 fpsub F1, C1, A1 fpsub F2, C2, A2 fpsub F3, C3, A3 fpsub F4, C4, A4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 .align 4 LL(16): andi. r0, N, 4 beq LL(17) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 fpabs A1, A1 fpabs A2, A2 fpsub F1, C1, A1 fpsub F2, C2, A2 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 .align 4 LL(17): andi. r0, N, 2 beq LL(18) LFPDUX A1, X, INCX2 fpabs A1, A1 fpsub F1, C1, A1 fpsel C1, F1, C1, A1 .align 4 LL(18): andi. 
r0, N, 1 beq LL(20) LFDUX A1, X, INCX2 fabs A1, A1 fsub F1, C1, A1 fsel C1, F1, C1, A1 .align 4 LL(20): fpsub F1, C1, C2 fpsub F2, C3, C4 fpsel C1, F1, C1, C2 fpsel C3, F2, C3, C4 fpsub F1, C1, C3 fpsel C1, F1, C1, C3 fsmtp C2, C1 fsub F1, C1, C2 fsel ALPHA, F1, C1, C2 li r10, 0 lfs ALPHA_R, 8(SP) # load 1.0 fdiv ALPHA_R, ALPHA_R, ALPHA lfpsx C1, SP, r10 # Zero clear fpmr C2, C1 fpmr C3, C1 fpmr C4, C1 fsmfp ALPHA_R, ALPHA_R andi. r0, XX, 2 * SIZE - 1 beq LL(21) LFD C1, 0 * SIZE(XX) add XX, XX, INCX cmpwi cr0, N, 0 fmul C1, ALPHA_R, C1 fmul C1, C1, C1 ble LL(998) .align 4 LL(21): sub XX, XX, INCX2 srawi. r0, N, 4 mtspr CTR, r0 beq- LL(25) LFPDUX A1, XX, INCX2 LFPDUX A2, XX, INCX2 LFPDUX A3, XX, INCX2 LFPDUX A4, XX, INCX2 LFPDUX A5, XX, INCX2 LFPDUX A6, XX, INCX2 LFPDUX A7, XX, INCX2 LFPDUX A8, XX, INCX2 fpmul T1, ALPHA_R, A1 fpmul T2, ALPHA_R, A2 fpmul T3, ALPHA_R, A3 fpmul T4, ALPHA_R, A4 bdz LL(23) .align 4 LL(22): fpmadd C1, T1, T1, C1 LFPDUX A1, XX, INCX2 fpmul T1, ALPHA_R, A5 LFPDUX A2, XX, INCX2 fpmadd C2, T2, T2, C2 LFPDUX A3, XX, INCX2 fpmul T2, ALPHA_R, A6 LFPDUX A4, XX, INCX2 fpmadd C3, T3, T3, C3 fpmul T3, ALPHA_R, A7 fpmadd C4, T4, T4, C4 fpmul T4, ALPHA_R, A8 fpmadd C1, T1, T1, C1 LFPDUX A5, XX, INCX2 fpmul T1, ALPHA_R, A1 LFPDUX A6, XX, INCX2 fpmadd C2, T2, T2, C2 LFPDUX A7, XX, INCX2 fpmul T2, ALPHA_R, A2 LFPDUX A8, XX, INCX2 fpmadd C3, T3, T3, C3 fpmul T3, ALPHA_R, A3 fpmadd C4, T4, T4, C4 fpmul T4, ALPHA_R, A4 bdnz LL(22) .align 4 LL(23): fpmadd C1, T1, T1, C1 fpmul T1, ALPHA_R, A5 fpmadd C2, T2, T2, C2 fpmul T2, ALPHA_R, A6 fpmadd C3, T3, T3, C3 fpmul T3, ALPHA_R, A7 fpmadd C4, T4, T4, C4 fpmul T4, ALPHA_R, A8 fpmadd C1, T1, T1, C1 fpmadd C2, T2, T2, C2 fpmadd C3, T3, T3, C3 fpmadd C4, T4, T4, C4 .align 4 LL(25): andi. r0, N, 15 beq LL(98) andi. r0, N, 8 beq LL(26) LFPDUX A1, XX, INCX2 LFPDUX A2, XX, INCX2 LFPDUX A3, XX, INCX2 LFPDUX A4, XX, INCX2 fpmul A1, ALPHA_R, A1 fpmul A2, ALPHA_R, A2 fpmul A3, ALPHA_R, A3 fpmul A4, ALPHA_R, A4 fpmadd C1, A1, A1, C1 fpmadd C2, A2, A2, C2 fpmadd C3, A3, A3, C3 fpmadd C4, A4, A4, C4 .align 4 LL(26): andi. r0, N, 4 beq LL(27) LFPDUX A1, XX, INCX2 LFPDUX A2, XX, INCX2 fpmul A1, ALPHA_R, A1 fpmul A2, ALPHA_R, A2 fpmadd C1, A1, A1, C1 fpmadd C2, A2, A2, C2 .align 4 LL(27): andi. r0, N, 2 beq LL(28) LFPDUX A1, XX, INCX2 fpmul A1, ALPHA_R, A1 fpmadd C1, A1, A1, C1 .align 4 LL(28): andi. 
r0, N, 1 beq LL(98) LFDUX A1, XX, INCX2 fmul A1, ALPHA_R, A1 fmadd C1, A1, A1, C1 .align 4 LL(98): fpadd C1, C1, C2 lis r3, 0x3f00 fpadd C3, C3, C4 lis r4, 0x4040 stw r3, 4(SP) stw r4, 8(SP) fpadd C1, C1, C3 lfs f10, 0(SP) fsmtp C2, C1 lfs f11, 4(SP) fadd C1, C2, C1 lfs f12, 8(SP) fcmpu cr0, f10, C1 beq cr0, LL(99) #ifndef HUMMER_EMULATOR frsqrte f9, C1 li r10, 16 fmul f2, f1, f9 lfpdux f29, SP, r10 fmul f3, f9, f11 lfpdux f28, SP, r10 fnmsub f7, f2, f9, f12 lfpdux f27, SP, r10 fmul f9, f3, f7 lfpdux f26, SP, r10 fadd f13, f11, f11 lfpdux f25, SP, r10 fmul f12, f1, f9 lfpdux f24, SP, r10 fmul f11, f12, f11 lfpdux f23, SP, r10 lfpdux f22, SP, r10 fnmsub f1, f12, f9, f13 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 fmadd f1, f11, f1, f12 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 fmul C1, ALPHA, C1 addi SP, SP, 16 blr #else fsqrt C1, C1 li r10, 16 lfpdux f29, SP, r10 lfpdux f28, SP, r10 lfpdux f27, SP, r10 lfpdux f26, SP, r10 lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 fmul C1, ALPHA, C1 addi SP, SP, 16 blr #endif .align 4 LL(99): li r10, 16 lfpdux f29, SP, r10 lfpdux f28, SP, r10 lfpdux f27, SP, r10 lfpdux f26, SP, r10 lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr .align 4 LL(100): sub X, X, INCX srawi. r0, N, 4 mtspr CTR, r0 beq- LL(105) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFSDUX A1, X, INCX LFSDUX A2, X, INCX LFSDUX A3, X, INCX LFSDUX A4, X, INCX LFDUX A5, X, INCX LFDUX A6, X, INCX LFDUX A7, X, INCX LFDUX A8, X, INCX LFSDUX A5, X, INCX fpabs T1, A1 LFSDUX A6, X, INCX fpabs T2, A2 LFSDUX A7, X, INCX fpabs T3, A3 LFSDUX A8, X, INCX fpabs T4, A4 bdz LL(103) .align 4 LL(102): fpsub F1, C1, T1 LFDUX A1, X, INCX fpsub F2, C2, T2 LFDUX A2, X, INCX fpsub F3, C3, T3 LFDUX A3, X, INCX fpsub F4, C4, T4 LFDUX A4, X, INCX fpabs T5, A5 LFSDUX A1, X, INCX fpabs T6, A6 LFSDUX A2, X, INCX fpabs T7, A7 LFSDUX A3, X, INCX fpabs T8, A8 LFSDUX A4, X, INCX fpsel C1, F1, C1, T1 LFDUX A5, X, INCX fpsel C2, F2, C2, T2 LFDUX A6, X, INCX fpsel C3, F3, C3, T3 LFDUX A7, X, INCX fpsel C4, F4, C4, T4 LFDUX A8, X, INCX fpsub F5, C1, T5 LFSDUX A5, X, INCX fpsub F6, C2, T6 LFSDUX A6, X, INCX fpsub F7, C3, T7 LFSDUX A7, X, INCX fpsub F8, C4, T8 LFSDUX A8, X, INCX fpabs T1, A1 fpabs T2, A2 fpabs T3, A3 fpabs T4, A4 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 bdnz LL(102) .align 4 LL(103): fpabs T5, A5 fpabs T6, A6 fpabs T7, A7 fpabs T8, A8 fpsub F1, C1, T1 fpsub F2, C2, T2 fpsub F3, C3, T3 fpsub F4, C4, T4 fpsel C1, F1, C1, T1 fpsel C2, F2, C2, T2 fpsel C3, F3, C3, T3 fpsel C4, F4, C4, T4 fpsub F5, C1, T5 fpsub F6, C2, T6 fpsub F7, C3, T7 fpsub F8, C4, T8 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 .align 4 LL(105): andi. r0, N, 15 beq LL(120) andi. 
r0, N, 8 beq LL(106) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFSDUX A1, X, INCX LFSDUX A2, X, INCX LFSDUX A3, X, INCX LFSDUX A4, X, INCX fpabs A1, A1 fpabs A2, A2 fpabs A3, A3 fpabs A4, A4 fpsub F1, C1, A1 fpsub F2, C2, A2 fpsub F3, C3, A3 fpsub F4, C4, A4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 .align 4 LL(106): andi. r0, N, 4 beq LL(107) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX fabs A1, A1 fabs A2, A2 fabs A3, A3 fabs A4, A4 fsub F1, C1, A1 fsub F2, C2, A2 fsub F3, C3, A3 fsub F4, C4, A4 fsel C1, F1, C1, A1 fsel C2, F2, C2, A2 fsel C3, F3, C3, A3 fsel C4, F4, C4, A4 .align 4 LL(107): andi. r0, N, 2 beq LL(108) LFDUX A1, X, INCX LFDUX A2, X, INCX fabs A1, A1 fabs A2, A2 fsub F1, C1, A1 fsub F2, C2, A2 fsel C1, F1, C1, A1 fsel C2, F2, C2, A2 .align 4 LL(108): andi. r0, N, 1 beq LL(120) LFDUX A1, X, INCX fabs A1, A1 fsub F1, C1, A1 fsel C1, F1, C1, A1 .align 4 LL(120): fpsub F1, C1, C2 fpsub F2, C3, C4 fpsel C1, F1, C1, C2 fpsel C3, F2, C3, C4 fpsub F1, C1, C3 fpsel C1, F1, C1, C3 fsmtp C2, C1 fsub F1, C1, C2 fsel ALPHA, F1, C1, C2 li r10, 0 lfs ALPHA_R, 8(SP) # load 1.0 fdiv ALPHA_R, ALPHA_R, ALPHA lfpsx C1, SP, r10 # Zero clear fpmr C2, C1 fpmr C3, C1 fpmr C4, C1 fsmfp ALPHA_R, ALPHA_R sub XX, XX, INCX srawi. r0, N, 4 mtspr CTR, r0 beq- LL(125) LFDUX A1, XX, INCX LFDUX A2, XX, INCX LFDUX A3, XX, INCX LFDUX A4, XX, INCX LFSDUX A1, XX, INCX LFSDUX A2, XX, INCX LFSDUX A3, XX, INCX LFSDUX A4, XX, INCX LFDUX A5, XX, INCX LFDUX A6, XX, INCX LFDUX A7, XX, INCX LFDUX A8, XX, INCX LFSDUX A5, XX, INCX fpmul T1, ALPHA_R, A1 LFSDUX A6, XX, INCX fpmul T2, ALPHA_R, A2 LFSDUX A7, XX, INCX fpmul T3, ALPHA_R, A3 LFSDUX A8, XX, INCX fpmul T4, ALPHA_R, A4 bdz LL(123) .align 4 LL(122): fpmadd C1, T1, T1, C1 LFDUX A1, XX, INCX fpmul T1, ALPHA_R, A5 LFDUX A2, XX, INCX fpmadd C2, T2, T2, C2 LFDUX A3, XX, INCX fpmul T2, ALPHA_R, A6 LFDUX A4, XX, INCX fpmadd C3, T3, T3, C3 LFSDUX A1, XX, INCX fpmul T3, ALPHA_R, A7 LFSDUX A2, XX, INCX fpmadd C4, T4, T4, C4 LFSDUX A3, XX, INCX fpmul T4, ALPHA_R, A8 LFSDUX A4, XX, INCX fpmadd C1, T1, T1, C1 LFDUX A5, XX, INCX fpmul T1, ALPHA_R, A1 LFDUX A6, XX, INCX fpmadd C2, T2, T2, C2 LFDUX A7, XX, INCX fpmul T2, ALPHA_R, A2 LFDUX A8, XX, INCX fpmadd C3, T3, T3, C3 LFSDUX A5, XX, INCX fpmul T3, ALPHA_R, A3 LFSDUX A6, XX, INCX fpmadd C4, T4, T4, C4 LFSDUX A7, XX, INCX fpmul T4, ALPHA_R, A4 LFSDUX A8, XX, INCX bdnz LL(122) .align 4 LL(123): fpmadd C1, T1, T1, C1 fpmul T1, ALPHA_R, A5 fpmadd C2, T2, T2, C2 fpmul T2, ALPHA_R, A6 fpmadd C3, T3, T3, C3 fpmul T3, ALPHA_R, A7 fpmadd C4, T4, T4, C4 fpmul T4, ALPHA_R, A8 fpmadd C1, T1, T1, C1 fpmadd C2, T2, T2, C2 fpmadd C3, T3, T3, C3 fpmadd C4, T4, T4, C4 .align 4 LL(125): andi. r0, N, 15 beq LL(998) andi. r0, N, 8 beq LL(126) LFDUX A1, XX, INCX LFDUX A2, XX, INCX LFDUX A3, XX, INCX LFDUX A4, XX, INCX LFSDUX A1, XX, INCX LFSDUX A2, XX, INCX LFSDUX A3, XX, INCX LFSDUX A4, XX, INCX fpmul A1, ALPHA_R, A1 fpmul A2, ALPHA_R, A2 fpmul A3, ALPHA_R, A3 fpmul A4, ALPHA_R, A4 fpmadd C1, A1, A1, C1 fpmadd C2, A2, A2, C2 fpmadd C3, A3, A3, C3 fpmadd C4, A4, A4, C4 .align 4 LL(126): andi. r0, N, 4 beq LL(127) LFDUX A1, XX, INCX LFDUX A2, XX, INCX LFDUX A3, XX, INCX LFDUX A4, XX, INCX fmul A1, ALPHA_R, A1 fmul A2, ALPHA_R, A2 fmul A3, ALPHA_R, A3 fmul A4, ALPHA_R, A4 fmadd C1, A1, A1, C1 fmadd C2, A2, A2, C2 fmadd C3, A3, A3, C3 fmadd C4, A4, A4, C4 .align 4 LL(127): andi. 
r0, N, 2 beq LL(128) LFDUX A1, XX, INCX LFDUX A2, XX, INCX fmul A1, ALPHA_R, A1 fmul A2, ALPHA_R, A2 fmadd C1, A1, A1, C1 fmadd C2, A2, A2, C2 .align 4 LL(128): andi. r0, N, 1 beq LL(998) LFDUX A1, XX, INCX fmul A1, ALPHA_R, A1 fmadd C1, A1, A1, C1 .align 4 LL(998): fpadd C1, C1, C2 lis r3, 0x3f00 fpadd C3, C3, C4 lis r4, 0x4040 stw r3, 4(SP) stw r4, 8(SP) fpadd C1, C1, C3 lfs f10, 0(SP) fsmtp C2, C1 lfs f11, 4(SP) fadd C1, C2, C1 lfs f12, 8(SP) fcmpu cr0, f10, C1 beq cr0, LL(999) #ifndef HUMMER_EMULATOR frsqrte f9, C1 li r10, 16 fmul f2, f1, f9 lfpdux f29, SP, r10 fmul f3, f9, f11 lfpdux f28, SP, r10 fnmsub f7, f2, f9, f12 lfpdux f27, SP, r10 fmul f9, f3, f7 lfpdux f26, SP, r10 fadd f13, f11, f11 lfpdux f25, SP, r10 fmul f12, f1, f9 lfpdux f24, SP, r10 fmul f11, f12, f11 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 fnmsub f1, f12, f9, f13 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 fmadd f1, f11, f1, f12 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 fmul C1, ALPHA, C1 addi SP, SP, 16 blr #else fsqrt C1, C1 li r10, 16 lfpdux f29, SP, r10 lfpdux f28, SP, r10 lfpdux f27, SP, r10 lfpdux f26, SP, r10 lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 fmul C1, ALPHA, C1 addi SP, SP, 16 blr #endif .align 4 LL(999): li r10, 16 lfpdux f29, SP, r10 lfpdux f28, SP, r10 lfpdux f27, SP, r10 lfpdux f26, SP, r10 lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/dnrm2_ppc440.S000066400000000000000000000253531313527062700200300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
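/*
 * Illustrative sketch (not part of the original source): dnrm2_hummer.S
 * above and dnrm2_ppc440.S below both use the same overflow-safe two-pass
 * scheme: pass 1 finds the largest |x[i]|, pass 2 accumulates the squares
 * of x[i] scaled by 1/max, and the result is max * sqrt(sum) (the assembly
 * forms the square root with frsqrte plus Newton refinement).  A plain-C
 * equivalent is sketched below; the name dnrm2_ref is invented for
 * illustration only.
 */
#include <math.h>
static double dnrm2_ref(long n, const double *x, long inc_x)
{
	double xmax = 0.0, scale, sum = 0.0;
	long i;
	for (i = 0; i < n; i++) {              /* pass 1: max |x[i]|       */
		double t = fabs(x[i * inc_x]);
		if (t > xmax) xmax = t;
	}
	if (xmax == 0.0) return 0.0;           /* early exit, as in LL(99) */
	scale = 1.0 / xmax;
	for (i = 0; i < n; i++) {              /* pass 2: scaled squares   */
		double t = x[i * inc_x] * scale;
		sum += t * t;
	}
	return xmax * sqrt(sum);
}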
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define NN r6 #define XX r7 #define PRE r8 #define FZERO 144(SP) #define FONE 148(SP) #define FMAX 152(SP) #define C1 156(SP) #define C2 160(SP) #define STACKSIZE 168 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r10, 0 lis r11, 0x3f80 lis r12, 0x5fe0 lis r6, 0x3f00 lis r7, 0x4040 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r10, FZERO stw r11, FONE stw r12, FMAX stw r10, 4 + FMAX stw r6, C1 stw r7, C2 lfs f1, FZERO #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT sub X, X, INCX li PRE, 3 * 16 * SIZE cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 ble- LL(999) mr NN, N mr XX, X LFDUX f1, X, INCX fabs f0, f1 fabs f2, f1 fabs f3, f1 fabs f4, f1 fabs f5, f1 fabs f6, f1 fabs f7, f1 fabs f1, f1 subi N, N, 1 cmpwi cr0, N, 0 ble- LL(999) srawi. r0, N, 4 mtspr CTR, r0 beq- LL(50) LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX fabs f8, f24 LFDUX f24, X, INCX fabs f9, f25 LFDUX f25, X, INCX fabs f10, f26 LFDUX f26, X, INCX fabs f11, f27 LFDUX f27, X, INCX fabs f12, f28 LFDUX f28, X, INCX fabs f13, f29 LFDUX f29, X, INCX fabs f14, f30 LFDUX f30, X, INCX fabs f15, f31 LFDUX f31, X, INCX bdz LL(20) .align 4 LL(10): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 #ifdef PPCG4 dcbt X, PRE #endif fabs f8, f24 LFDUX f24, X, INCX fsel f1, f17, f1, f9 fabs f9, f25 LFDUX f25, X, INCX fsel f2, f18, f2, f10 fabs f10, f26 LFDUX f26, X, INCX fsel f3, f19, f3, f11 fabs f11, f27 LFDUX f27, X, INCX fsel f4, f20, f4, f12 #ifdef PPCG4 dcbt X, PRE #endif fabs f12, f28 LFDUX f28, X, INCX fsel f5, f21, f5, f13 fabs f13, f29 LFDUX f29, X, INCX fsel f6, f22, f6, f14 fabs f14, f30 LFDUX f30, X, INCX fsel f7, f23, f7, f15 fabs f15, f31 LFDUX f31, X, INCX fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 #ifdef PPCG4 dcbt X, PRE #endif fabs f8, f24 LFDUX f24, X, INCX fsel f1, f17, f1, f9 fabs f9, f25 LFDUX f25, X, INCX fsel f2, f18, f2, f10 fabs f10, f26 LFDUX f26, X, INCX fsel f3, f19, f3, f11 fabs f11, f27 LFDUX f27, X, INCX fsel f4, f20, f4, f12 #ifdef PPCG4 dcbt X, PRE #endif fabs f12, f28 LFDUX f28, X, INCX fsel f5, f21, f5, f13 fabs f13, f29 LFDUX f29, X, INCX fsel f6, f22, f6, f14 fabs f14, f30 LFDUX f30, X, INCX fsel f7, f23, f7, f15 fabs f15, f31 LFDUX f31, X, INCX bdnz LL(10) .align 4 LL(20): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 fsel f4, f20, 
f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 fsel f6, f22, f6, f14 fsel f7, f23, f7, f15 .align 4 LL(50): andi. r0, N, 15 mtspr CTR, r0 beq LL(99) .align 4 LL(60): LFDUX f8, X, INCX fabs f8, f8 fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(60) .align 4 LL(99): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f0, f1 fsel f2, f9, f2, f3 fsel f4, f10, f4, f5 fsel f6, f11, f6, f7 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f0, f2 fsel f4, f9, f4, f6 fsub f8, f0, f4 fsel f31, f8, f0, f4 lfs f1, FZERO lfs f0, FONE lfd f2, FMAX fcmpu cr0, f1, f31 beq- cr0, LL(999) fdiv f30, f0, f31 fmr f0, f1 fmr f2, f1 fmr f3, f1 fmr f4, f1 fmr f5, f1 fmr f6, f1 fmr f7, f1 srawi. r0, NN, 4 mtspr CTR, r0 beq- cr0, LL(150) LFDUX f8, XX, INCX LFDUX f9, XX, INCX LFDUX f10, XX, INCX LFDUX f11, XX, INCX LFDUX f12, XX, INCX LFDUX f13, XX, INCX LFDUX f14, XX, INCX LFDUX f15, XX, INCX fmul f16, f30, f8 LFDUX f8, XX, INCX fmul f17, f30, f9 LFDUX f9, XX, INCX fmul f18, f30, f10 LFDUX f10, XX, INCX fmul f19, f30, f11 LFDUX f11, XX, INCX fmul f20, f30, f12 LFDUX f12, XX, INCX fmul f21, f30, f13 LFDUX f13, XX, INCX fmul f22, f30, f14 LFDUX f14, XX, INCX fmul f23, f30, f15 LFDUX f15, XX, INCX bdz LL(120) .align 4 LL(110): fmadd f0, f16, f16, f0 #ifdef PPCG4 dcbt XX, PRE #endif fmul f16, f30, f8 LFDUX f8, XX, INCX fmadd f1, f17, f17, f1 fmul f17, f30, f9 LFDUX f9, XX, INCX fmadd f2, f18, f18, f2 fmul f18, f30, f10 LFDUX f10, XX, INCX fmadd f3, f19, f19, f3 fmul f19, f30, f11 LFDUX f11, XX, INCX fmadd f4, f20, f20, f4 #ifdef PPCG4 dcbt XX, PRE #endif fmul f20, f30, f12 LFDUX f12, XX, INCX fmadd f5, f21, f21, f5 fmul f21, f30, f13 LFDUX f13, XX, INCX fmadd f6, f22, f22, f6 fmul f22, f30, f14 LFDUX f14, XX, INCX fmadd f7, f23, f23, f7 fmul f23, f30, f15 LFDUX f15, XX, INCX fmadd f0, f16, f16, f0 #ifdef PPCG4 dcbt XX, PRE #endif fmul f16, f30, f8 LFDUX f8, XX, INCX fmadd f1, f17, f17, f1 fmul f17, f30, f9 LFDUX f9, XX, INCX fmadd f2, f18, f18, f2 fmul f18, f30, f10 LFDUX f10, XX, INCX fmadd f3, f19, f19, f3 fmul f19, f30, f11 LFDUX f11, XX, INCX fmadd f4, f20, f20, f4 #ifdef PPCG4 dcbt XX, PRE #endif fmul f20, f30, f12 LFDUX f12, XX, INCX fmadd f5, f21, f21, f5 fmul f21, f30, f13 LFDUX f13, XX, INCX fmadd f6, f22, f22, f6 fmul f22, f30, f14 LFDUX f14, XX, INCX fmadd f7, f23, f23, f7 fmul f23, f30, f15 LFDUX f15, XX, INCX bdnz LL(110) .align 4 LL(120): fmadd f0, f16, f16, f0 fmul f16, f30, f8 fmadd f1, f17, f17, f1 fmul f17, f30, f9 fmadd f2, f18, f18, f2 fmul f18, f30, f10 fmadd f3, f19, f19, f3 fmul f19, f30, f11 fmadd f4, f20, f20, f4 fmul f20, f30, f12 fmadd f5, f21, f21, f5 fmul f21, f30, f13 fmadd f6, f22, f22, f6 fmul f22, f30, f14 fmadd f7, f23, f23, f7 fmul f23, f30, f15 fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 fmadd f2, f18, f18, f2 fmadd f3, f19, f19, f3 fmadd f4, f20, f20, f4 fmadd f5, f21, f21, f5 fmadd f6, f22, f22, f6 fmadd f7, f23, f23, f7 .align 4 LL(150): andi. 
r0, NN, 15 mtspr CTR, r0 beq- cr0, LL(170) .align 4 LL(160): LFDUX f8, XX, INCX fmul f16, f30, f8 fmadd f0, f16, f16, f0 bdnz LL(160) .align 4 LL(170): fadd f0, f0, f1 fadd f2, f2, f3 fadd f4, f4, f5 fadd f6, f6, f7 fadd f0, f0, f2 fadd f4, f4, f6 fadd f1, f0, f4 frsqrte f0, f1 lfs f8, C1 lfs f9, C2 fmul f2, f1, f0 fadd f7, f8, f8 fmul f3, f0, f8 fnmsub f4, f2, f0, f9 fmul f0, f3, f4 fmul f2, f1, f0 fmul f3, f0, f8 fnmsub f4, f2, f0, f9 fmul f0, f3, f4 fmul f2, f1, f0 fmul f3, f0, f8 fnmsub f4, f2, f0, f9 fmul f0, f3, f4 fmul f5, f1, f0 fmul f2, f5, f8 fnmsub f3, f5, f0, f7 fmadd f1, f2, f3, f5 fmul f1, f31, f1 .align 4 LL(999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/dot.S000066400000000000000000000230601313527062700164730ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define Y r6 #define INCY r7 #define PREA r8 #define FZERO f0 #define STACKSIZE 96 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stw r0, 80(SP) lfs FZERO, 80(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) LDINT INCY, 0(INCY) #endif slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT fmr f1, FZERO fmr f2, FZERO fmr f3, FZERO fmr f4, FZERO fmr f5, FZERO fmr f6, FZERO fmr f7, FZERO #ifdef L1_DUALFETCH li PREA, (L1_PREFETCHSIZE) / 2 #else li PREA, (L1_PREFETCHSIZE) #endif cmpwi cr0, N, 0 ble- cr0, LL(999) cmpwi cr0, INCX, SIZE bne cr0, LL(100) cmpwi cr0, INCY, SIZE bne cr0, LL(100) srawi. r0, N, 4 mtspr CTR, r0 beq- cr0, LL(50) .align 4 LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) LFD f10, 2 * SIZE(X) LFD f11, 3 * SIZE(X) LFD f16, 0 * SIZE(Y) LFD f17, 1 * SIZE(Y) LFD f18, 2 * SIZE(Y) LFD f19, 3 * SIZE(Y) LFD f12, 4 * SIZE(X) LFD f13, 5 * SIZE(X) LFD f14, 6 * SIZE(X) LFD f15, 7 * SIZE(X) LFD f20, 4 * SIZE(Y) LFD f21, 5 * SIZE(Y) LFD f22, 6 * SIZE(Y) LFD f23, 7 * SIZE(Y) bdz LL(20) .align 4 LL(10): FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f10, f18, f2 FMADD f3, f11, f19, f3 LFD f8, 8 * SIZE(X) LFD f9, 9 * SIZE(X) LFD f10, 10 * SIZE(X) LFD f11, 11 * SIZE(X) LFD f16, 8 * SIZE(Y) LFD f17, 9 * SIZE(Y) LFD f18, 10 * SIZE(Y) LFD f19, 11 * SIZE(Y) FMADD f4, f12, f20, f4 FMADD f5, f13, f21, f5 FMADD f6, f14, f22, f6 FMADD f7, f15, f23, f7 LFD f12, 12 * SIZE(X) LFD f13, 13 * SIZE(X) LFD f14, 14 * SIZE(X) LFD f15, 15 * SIZE(X) LFD f20, 12 * SIZE(Y) LFD f21, 13 * SIZE(Y) LFD f22, 14 * SIZE(Y) LFD f23, 15 * SIZE(Y) FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f10, f18, f2 FMADD f3, f11, f19, f3 LFD f8, 16 * SIZE(X) LFD f9, 17 * SIZE(X) LFD f10, 18 * SIZE(X) LFD f11, 19 * SIZE(X) LFD f16, 16 * SIZE(Y) LFD f17, 17 * SIZE(Y) LFD f18, 18 * SIZE(Y) LFD f19, 19 * SIZE(Y) FMADD f4, f12, f20, f4 FMADD f5, f13, f21, f5 FMADD f6, f14, f22, f6 FMADD f7, f15, f23, f7 LFD f12, 20 * SIZE(X) LFD f13, 21 * SIZE(X) LFD f14, 22 * SIZE(X) LFD f15, 23 * SIZE(X) LFD f20, 20 * SIZE(Y) LFD f21, 21 * SIZE(Y) LFD f22, 22 * SIZE(Y) LFD f23, 23 * SIZE(Y) #ifndef POWER6 L1_PREFETCH X, PREA #ifdef L1_DUALFETCH L1_PREFETCH Y, PREA #endif #endif addi X, X, 16 * SIZE addi Y, Y, 16 * SIZE #ifdef POWER6 L1_PREFETCH X, PREA #ifdef L1_DUALFETCH L1_PREFETCH Y, PREA #endif #endif bdnz LL(10) .align 4 LL(20): FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f10, f18, f2 FMADD f3, f11, f19, f3 LFD f8, 8 * SIZE(X) LFD f9, 9 * SIZE(X) LFD f10, 10 * SIZE(X) LFD f11, 11 * SIZE(X) LFD f16, 8 * SIZE(Y) LFD f17, 9 * SIZE(Y) LFD f18, 10 * SIZE(Y) LFD f19, 11 * SIZE(Y) FMADD f4, f12, f20, f4 FMADD f5, f13, f21, f5 FMADD f6, f14, f22, f6 FMADD f7, f15, f23, f7 LFD f12, 12 * SIZE(X) LFD f13, 13 * SIZE(X) LFD f14, 14 * SIZE(X) LFD f15, 15 * SIZE(X) LFD f20, 12 * SIZE(Y) LFD f21, 13 * SIZE(Y) LFD f22, 14 * SIZE(Y) LFD f23, 15 * SIZE(Y) FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f10, f18, f2 FMADD f3, f11, f19, f3 FMADD f4, f12, f20, f4 FMADD f5, f13, f21, f5 FMADD f6, f14, f22, f6 FMADD f7, f15, f23, f7 addi X, X, 16 * SIZE addi Y, Y, 16 * SIZE .align 4 LL(50): andi. 
r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) LFD f16, 0 * SIZE(Y) addi X, X, 1 * SIZE addi Y, Y, 1 * SIZE FMADD f0, f8, f16, f0 bdnz LL(60) b LL(999) .align 4 LL(100): #ifdef F_INTERFACE cmpwi cr0, INCX, 0 bge+ LL(102) subi r0, N, 1 mullw r0, r0, INCX sub X, X, r0 .align 4 LL(102): cmpwi cr0, INCY, 0 bge+ LL(104) subi r0, N, 1 mullw r0, r0, INCY sub Y, Y, r0 .align 4 LL(104): #endif sub X, X, INCX sub Y, Y, INCY srawi. r0, N, 4 mtspr CTR, r0 beq- LL(150) LFDUX f8, X, INCX LFDUX f16, Y, INCY LFDUX f9, X, INCX LFDUX f17, Y, INCY LFDUX f10, X, INCX LFDUX f18, Y, INCY LFDUX f11, X, INCX LFDUX f19, Y, INCY LFDUX f12, X, INCX LFDUX f20, Y, INCY LFDUX f13, X, INCX LFDUX f21, Y, INCY LFDUX f14, X, INCX LFDUX f22, Y, INCY LFDUX f15, X, INCX LFDUX f23, Y, INCY bdz LL(120) .align 4 LL(110): FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f10, f18, f2 FMADD f3, f11, f19, f3 LFDUX f8, X, INCX LFDUX f16, Y, INCY LFDUX f9, X, INCX LFDUX f17, Y, INCY LFDUX f10, X, INCX LFDUX f18, Y, INCY LFDUX f11, X, INCX LFDUX f19, Y, INCY FMADD f4, f12, f20, f4 FMADD f5, f13, f21, f5 FMADD f6, f14, f22, f6 FMADD f7, f15, f23, f7 LFDUX f12, X, INCX LFDUX f20, Y, INCY LFDUX f13, X, INCX LFDUX f21, Y, INCY LFDUX f14, X, INCX LFDUX f22, Y, INCY LFDUX f15, X, INCX LFDUX f23, Y, INCY FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f10, f18, f2 FMADD f3, f11, f19, f3 LFDUX f8, X, INCX LFDUX f16, Y, INCY LFDUX f9, X, INCX LFDUX f17, Y, INCY LFDUX f10, X, INCX LFDUX f18, Y, INCY LFDUX f11, X, INCX LFDUX f19, Y, INCY FMADD f4, f12, f20, f4 FMADD f5, f13, f21, f5 FMADD f6, f14, f22, f6 FMADD f7, f15, f23, f7 LFDUX f12, X, INCX LFDUX f20, Y, INCY LFDUX f13, X, INCX LFDUX f21, Y, INCY LFDUX f14, X, INCX LFDUX f22, Y, INCY LFDUX f15, X, INCX LFDUX f23, Y, INCY bdnz LL(110) .align 4 LL(120): FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f10, f18, f2 FMADD f3, f11, f19, f3 LFDUX f8, X, INCX LFDUX f16, Y, INCY LFDUX f9, X, INCX LFDUX f17, Y, INCY LFDUX f10, X, INCX LFDUX f18, Y, INCY LFDUX f11, X, INCX LFDUX f19, Y, INCY FMADD f4, f12, f20, f4 FMADD f5, f13, f21, f5 FMADD f6, f14, f22, f6 FMADD f7, f15, f23, f7 LFDUX f12, X, INCX LFDUX f20, Y, INCY LFDUX f13, X, INCX LFDUX f21, Y, INCY LFDUX f14, X, INCX LFDUX f22, Y, INCY LFDUX f15, X, INCX LFDUX f23, Y, INCY FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f10, f18, f2 FMADD f3, f11, f19, f3 FMADD f4, f12, f20, f4 FMADD f5, f13, f21, f5 FMADD f6, f14, f22, f6 FMADD f7, f15, f23, f7 .align 4 LL(150): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX LFDUX f16, Y, INCY FMADD f0, f8, f16, f0 bdnz LL(160) .align 4 LL(999): FADD f0, f0, f1 FADD f2, f2, f3 FADD f4, f4, f5 FADD f6, f6, f7 FADD f0, f0, f2 FADD f4, f4, f6 FADD f1, f0, f4 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/dot_cell.S000066400000000000000000000225271313527062700175010ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. 
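/*
 * Illustrative sketch (not part of the original source): dot.S above and
 * dot_cell.S below keep eight partial sums (f0..f7) in flight to hide FMA
 * latency and fold them together at the end; the assembly unrolls by 16,
 * while the sketch below unrolls by 8 for brevity.  The name dot_ref is
 * invented for illustration only.
 */
static double dot_ref(long n, const double *x, long inc_x,
                      const double *y, long inc_y)
{
	double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
	double s4 = 0.0, s5 = 0.0, s6 = 0.0, s7 = 0.0;
	long i = 0;
	if (inc_x == 1 && inc_y == 1) {
		for (; i + 8 <= n; i += 8) {       /* unrolled unit-stride body */
			s0 += x[i + 0] * y[i + 0];  s1 += x[i + 1] * y[i + 1];
			s2 += x[i + 2] * y[i + 2];  s3 += x[i + 3] * y[i + 3];
			s4 += x[i + 4] * y[i + 4];  s5 += x[i + 5] * y[i + 5];
			s6 += x[i + 6] * y[i + 6];  s7 += x[i + 7] * y[i + 7];
		}
		for (; i < n; i++)                 /* remainder, like LL(60)    */
			s0 += x[i] * y[i];
	} else {
		for (; i < n; i++)                 /* strided path, like LL(160) */
			s0 += x[i * inc_x] * y[i * inc_y];
	}
	return ((s0 + s1) + (s2 + s3)) + ((s4 + s5) + (s6 + s7));
}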
*/ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define Y r6 #define INCY r7 #define PREA r8 #define FZERO f0 #define STACKSIZE 96 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stw r0, 80(SP) lfs FZERO, 80(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) LDINT INCY, 0(INCY) #endif slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT fmr f1, FZERO fmr f2, FZERO fmr f3, FZERO fmr f4, FZERO fmr f5, FZERO fmr f6, FZERO fmr f7, FZERO li PREA, 16 * 20 * SIZE cmpwi cr0, N, 0 ble- cr0, LL(999) cmpwi cr0, INCX, SIZE bne cr0, LL(100) cmpwi cr0, INCY, SIZE bne cr0, LL(100) srawi. 
r0, N, 4 mtspr CTR, r0 beq- cr0, LL(50) .align 4 LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) LFD f10, 2 * SIZE(X) LFD f11, 3 * SIZE(X) LFD f16, 0 * SIZE(Y) LFD f17, 1 * SIZE(Y) LFD f18, 2 * SIZE(Y) LFD f19, 3 * SIZE(Y) LFD f12, 4 * SIZE(X) LFD f13, 5 * SIZE(X) LFD f14, 6 * SIZE(X) LFD f15, 7 * SIZE(X) LFD f20, 4 * SIZE(Y) LFD f21, 5 * SIZE(Y) LFD f22, 6 * SIZE(Y) LFD f23, 7 * SIZE(Y) bdz LL(20) .align 4 LL(10): FMADD f0, f8, f16, f0 LFD f8, 8 * SIZE(X) LFD f16, 8 * SIZE(Y) FMADD f1, f9, f17, f1 LFD f9, 9 * SIZE(X) LFD f17, 9 * SIZE(Y) FMADD f2, f10, f18, f2 LFD f10, 10 * SIZE(X) LFD f18, 10 * SIZE(Y) FMADD f3, f11, f19, f3 LFD f11, 11 * SIZE(X) LFD f19, 11 * SIZE(Y) FMADD f4, f12, f20, f4 LFD f12, 12 * SIZE(X) LFD f20, 12 * SIZE(Y) FMADD f5, f13, f21, f5 LFD f13, 13 * SIZE(X) LFD f21, 13 * SIZE(Y) FMADD f6, f14, f22, f6 LFD f14, 14 * SIZE(X) LFD f22, 14 * SIZE(Y) FMADD f7, f15, f23, f7 LFD f15, 15 * SIZE(X) LFD f23, 15 * SIZE(Y) FMADD f0, f8, f16, f0 LFD f8, 16 * SIZE(X) LFD f16, 16 * SIZE(Y) FMADD f1, f9, f17, f1 LFD f9, 17 * SIZE(X) LFD f17, 17 * SIZE(Y) FMADD f2, f10, f18, f2 LFD f10, 18 * SIZE(X) LFD f18, 18 * SIZE(Y) FMADD f3, f11, f19, f3 LFD f11, 19 * SIZE(X) LFD f19, 19 * SIZE(Y) FMADD f4, f12, f20, f4 LFD f12, 20 * SIZE(X) LFD f20, 20 * SIZE(Y) FMADD f5, f13, f21, f5 LFD f13, 21 * SIZE(X) LFD f21, 21 * SIZE(Y) FMADD f6, f14, f22, f6 LFD f14, 22 * SIZE(X) LFD f22, 22 * SIZE(Y) FMADD f7, f15, f23, f7 LFD f15, 23 * SIZE(X) LFD f23, 23 * SIZE(Y) dcbt X, PREA addi X, X, 16 * SIZE dcbt Y, PREA addi Y, Y, 16 * SIZE bdnz LL(10) .align 4 LL(20): FMADD f0, f8, f16, f0 LFD f8, 8 * SIZE(X) LFD f16, 8 * SIZE(Y) FMADD f1, f9, f17, f1 LFD f9, 9 * SIZE(X) LFD f17, 9 * SIZE(Y) FMADD f2, f10, f18, f2 LFD f10, 10 * SIZE(X) LFD f18, 10 * SIZE(Y) FMADD f3, f11, f19, f3 LFD f11, 11 * SIZE(X) LFD f19, 11 * SIZE(Y) FMADD f4, f12, f20, f4 LFD f12, 12 * SIZE(X) LFD f20, 12 * SIZE(Y) FMADD f5, f13, f21, f5 LFD f13, 13 * SIZE(X) LFD f21, 13 * SIZE(Y) FMADD f6, f14, f22, f6 LFD f14, 14 * SIZE(X) LFD f22, 14 * SIZE(Y) FMADD f7, f15, f23, f7 LFD f15, 15 * SIZE(X) LFD f23, 15 * SIZE(Y) FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f10, f18, f2 FMADD f3, f11, f19, f3 FMADD f4, f12, f20, f4 FMADD f5, f13, f21, f5 FMADD f6, f14, f22, f6 addi X, X, 16 * SIZE FMADD f7, f15, f23, f7 addi Y, Y, 16 * SIZE .align 4 LL(50): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) LFD f16, 0 * SIZE(Y) addi X, X, 1 * SIZE addi Y, Y, 1 * SIZE FMADD f0, f8, f16, f0 bdnz LL(60) b LL(999) .align 4 LL(100): #ifdef F_INTERFACE cmpwi cr0, INCX, 0 bge+ LL(102) subi r0, N, 1 mullw r0, r0, INCX sub X, X, r0 .align 4 LL(102): cmpwi cr0, INCY, 0 bge+ LL(104) subi r0, N, 1 mullw r0, r0, INCY sub Y, Y, r0 .align 4 LL(104): #endif sub X, X, INCX sub Y, Y, INCY srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(150) LFDUX f8, X, INCX LFDUX f16, Y, INCY LFDUX f9, X, INCX LFDUX f17, Y, INCY LFDUX f10, X, INCX LFDUX f18, Y, INCY LFDUX f11, X, INCX LFDUX f19, Y, INCY LFDUX f12, X, INCX LFDUX f20, Y, INCY LFDUX f13, X, INCX LFDUX f21, Y, INCY LFDUX f14, X, INCX LFDUX f22, Y, INCY LFDUX f15, X, INCX LFDUX f23, Y, INCY bdz LL(120) .align 4 LL(110): FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f10, f18, f2 FMADD f3, f11, f19, f3 LFDUX f8, X, INCX LFDUX f16, Y, INCY LFDUX f9, X, INCX LFDUX f17, Y, INCY LFDUX f10, X, INCX LFDUX f18, Y, INCY LFDUX f11, X, INCX LFDUX f19, Y, INCY FMADD f4, f12, f20, f4 FMADD f5, f13, f21, f5 FMADD f6, f14, f22, f6 FMADD f7, f15, f23, f7 LFDUX f12, X, INCX LFDUX f20, Y, INCY LFDUX f13, X, INCX LFDUX f21, Y, INCY LFDUX f14, X, INCX LFDUX f22, Y, INCY LFDUX f15, X, INCX LFDUX f23, Y, INCY FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f10, f18, f2 FMADD f3, f11, f19, f3 LFDUX f8, X, INCX LFDUX f16, Y, INCY LFDUX f9, X, INCX LFDUX f17, Y, INCY LFDUX f10, X, INCX LFDUX f18, Y, INCY LFDUX f11, X, INCX LFDUX f19, Y, INCY FMADD f4, f12, f20, f4 FMADD f5, f13, f21, f5 FMADD f6, f14, f22, f6 FMADD f7, f15, f23, f7 LFDUX f12, X, INCX LFDUX f20, Y, INCY LFDUX f13, X, INCX LFDUX f21, Y, INCY LFDUX f14, X, INCX LFDUX f22, Y, INCY LFDUX f15, X, INCX LFDUX f23, Y, INCY bdnz LL(110) .align 4 LL(120): FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f10, f18, f2 FMADD f3, f11, f19, f3 LFDUX f8, X, INCX LFDUX f16, Y, INCY LFDUX f9, X, INCX LFDUX f17, Y, INCY LFDUX f10, X, INCX LFDUX f18, Y, INCY LFDUX f11, X, INCX LFDUX f19, Y, INCY FMADD f4, f12, f20, f4 FMADD f5, f13, f21, f5 FMADD f6, f14, f22, f6 FMADD f7, f15, f23, f7 LFDUX f12, X, INCX LFDUX f20, Y, INCY LFDUX f13, X, INCX LFDUX f21, Y, INCY LFDUX f14, X, INCX LFDUX f22, Y, INCY LFDUX f15, X, INCX LFDUX f23, Y, INCY FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f10, f18, f2 FMADD f3, f11, f19, f3 FMADD f4, f12, f20, f4 FMADD f5, f13, f21, f5 FMADD f6, f14, f22, f6 FMADD f7, f15, f23, f7 .align 4 LL(150): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX LFDUX f16, Y, INCY FMADD f0, f8, f16, f0 bdnz LL(160) .align 4 LL(999): FADD f0, f0, f1 FADD f2, f2, f3 FADD f4, f4, f5 FADD f6, f6, f7 FADD f0, f0, f2 FADD f4, f4, f6 FADD f1, f0, f4 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/dot_hummer.S000066400000000000000000000366261313527062700200640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define Y r6 #define INCY r7 #define INCX2 r8 #define INCY2 r9 #define C1 f1 #define C2 f0 #define C3 f2 #define C4 f3 #define A1 f4 #define A2 f5 #define A3 f6 #define A4 f7 #define A5 f8 #define A6 f9 #define A7 f10 #define A8 f11 #define A9 f20 #define B1 f12 #define B2 f13 #define B3 f14 #define B4 f15 #define B5 f16 #define B6 f17 #define B7 f18 #define B8 f19 #define B9 f20 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 stfpdux f19, SP, r10 stfpdux f20, SP, r10 li r10, 0 stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) LDINT INCY, 0(INCY) #endif lfpdx C1, SP, r10 # Zero clear slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX fpmr C2, C1 slwi INCY, INCY, BASE_SHIFT fpmr C3, C1 add INCY2, INCY, INCY fpmr C4, C1 cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, INCX, SIZE bne LL(100) cmpwi cr0, INCY, SIZE bne LL(100) /* X is aligned, Y is aligned */ LL(10): andi. r0, X, 2 * SIZE - 1 bne LL(30) andi. r0, Y, 2 * SIZE - 1 bne LL(20) sub X, X, INCX2 sub Y, Y, INCY2 srawi. r0, N, 4 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 LFPDUX A3, X, INCX2 LFPDUX B3, Y, INCY2 LFPDUX A4, X, INCX2 LFPDUX B4, Y, INCY2 LFPDUX A5, X, INCX2 LFPDUX B5, Y, INCY2 LFPDUX A6, X, INCX2 LFPDUX B6, Y, INCY2 LFPDUX A7, X, INCX2 LFPDUX B7, Y, INCY2 LFPDUX A8, X, INCX2 LFPDUX B8, Y, INCY2 bdz LL(14) .align 4 LL(13): fpmadd C1, A1, B1, C1 LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 fpmadd C2, A2, B2, C2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 fpmadd C3, A3, B3, C3 LFPDUX A3, X, INCX2 LFPDUX B3, Y, INCY2 fpmadd C4, A4, B4, C4 LFPDUX A4, X, INCX2 LFPDUX B4, Y, INCY2 fpmadd C1, A5, B5, C1 LFPDUX A5, X, INCX2 LFPDUX B5, Y, INCY2 fpmadd C2, A6, B6, C2 LFPDUX A6, X, INCX2 LFPDUX B6, Y, INCY2 fpmadd C3, A7, B7, C3 LFPDUX A7, X, INCX2 LFPDUX B7, Y, INCY2 fpmadd C4, A8, B8, C4 LFPDUX A8, X, INCX2 LFPDUX B8, Y, INCY2 bdnz LL(13) .align 4 LL(14): fpmadd C1, A1, B1, C1 fpmadd C2, A2, B2, C2 fpmadd C3, A3, B3, C3 fpmadd C4, A4, B4, C4 fpmadd C1, A5, B5, C1 fpmadd C2, A6, B6, C2 fpmadd C3, A7, B7, C3 fpmadd C4, A8, B8, C4 .align 4 LL(15): andi. r0, N, 15 beq LL(999) andi. 
r0, N, 8 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 LFPDUX A3, X, INCX2 LFPDUX B3, Y, INCY2 LFPDUX A4, X, INCX2 LFPDUX B4, Y, INCY2 fpmadd C1, A1, B1, C1 fpmadd C2, A2, B2, C2 fpmadd C3, A3, B3, C3 fpmadd C4, A4, B4, C4 .align 4 LL(16): andi. r0, N, 4 beq LL(17) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 fpmadd C1, A1, B1, C1 fpmadd C2, A2, B2, C2 .align 4 LL(17): andi. r0, N, 2 beq LL(18) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 fpmadd C1, A1, B1, C1 .align 4 LL(18): andi. r0, N, 1 beq LL(999) LFDUX A1, X, INCX2 LFDUX B1, Y, INCY2 fmadd C1, A1, B1, C1 b LL(999) .align 4 /* X is aligned, Y is NOT aligned */ LL(20): LFD B1, 0 * SIZE(Y) sub X, X, INCX2 sub Y, Y, INCY srawi. r0, N, 4 mtspr CTR, r0 beq- LL(25) LFPDUX A1, X, INCX2 LFXDUX B2, Y, INCY2 LFPDUX A2, X, INCX2 LFXDUX B3, Y, INCY2 LFPDUX A3, X, INCX2 LFXDUX B4, Y, INCY2 LFPDUX A4, X, INCX2 LFXDUX B5, Y, INCY2 LFPDUX A5, X, INCX2 LFXDUX B6, Y, INCY2 LFPDUX A6, X, INCX2 LFXDUX B7, Y, INCY2 LFPDUX A7, X, INCX2 fsmr B1, B2 LFXDUX B8, Y, INCY2 fsmr B2, B3 LFPDUX A8, X, INCX2 fsmr B3, B4 bdz LL(24) .align 4 LL(23): fpmadd C1, A1, B1, C1 LFPDUX A1, X, INCX2 fsmr B4, B5 LFXDUX B9, Y, INCY2 fpmadd C2, A2, B2, C2 LFPDUX A2, X, INCX2 fsmr B5, B6 LFXDUX B2, Y, INCY2 fpmadd C3, A3, B3, C3 LFXDUX B3, Y, INCY2 fsmr B6, B7 LFPDUX A3, X, INCX2 fpmadd C4, A4, B4, C4 LFXDUX B4, Y, INCY2 fsmr B7, B8 LFPDUX A4, X, INCX2 fpmadd C1, A5, B5, C1 LFXDUX B5, Y, INCY2 fsmr B8, B9 LFPDUX A5, X, INCX2 fpmadd C2, A6, B6, C2 LFXDUX B6, Y, INCY2 fpmr B1, B9 LFPDUX A6, X, INCX2 fpmadd C3, A7, B7, C3 LFXDUX B7, Y, INCY2 fsmr B1, B2 LFPDUX A7, X, INCX2 fpmadd C4, A8, B8, C4 LFXDUX B8, Y, INCY2 fsmr B2, B3 LFPDUX A8, X, INCX2 fsmr B3, B4 bdnz LL(23) .align 4 LL(24): LFXDUX B9, Y, INCY2 fpmadd C1, A1, B1, C1 fsmr B4, B5 fpmadd C2, A2, B2, C2 fsmr B5, B6 fpmadd C3, A3, B3, C3 fsmr B6, B7 fpmadd C4, A4, B4, C4 fsmr B7, B8 fpmadd C1, A5, B5, C1 fsmr B8, B9 fpmadd C2, A6, B6, C2 fpmr B1, B9 fpmadd C3, A7, B7, C3 fpmadd C4, A8, B8, C4 .align 4 LL(25): andi. r0, N, 15 beq LL(999) andi. r0, N, 8 beq LL(26) LFPDUX A1, X, INCX2 LFXDUX B2, Y, INCY2 LFPDUX A2, X, INCX2 LFXDUX B3, Y, INCY2 LFPDUX A3, X, INCX2 LFXDUX B4, Y, INCY2 LFPDUX A4, X, INCX2 LFXDUX B5, Y, INCY2 fsmr B1, B2 fsmr B2, B3 fsmr B3, B4 fsmr B4, B5 fpmadd C1, A1, B1, C1 fpmadd C2, A2, B2, C2 fpmadd C3, A3, B3, C3 fpmadd C4, A4, B4, C4 fpmr B1, B5 .align 4 LL(26): andi. r0, N, 4 beq LL(27) LFPDUX A1, X, INCX2 LFXDUX B2, Y, INCY2 LFPDUX A2, X, INCX2 LFXDUX B3, Y, INCY2 fsmr B1, B2 fsmr B2, B3 fpmadd C1, A1, B1, C1 fpmr B1, B3 fpmadd C2, A2, B2, C2 .align 4 LL(27): andi. r0, N, 2 beq LL(28) LFPDUX A1, X, INCX2 LFXDUX B2, Y, INCY2 fsmr B1, B2 fpmadd C1, A1, B1, C1 fpmr B1, B2 .align 4 LL(28): andi. r0, N, 1 beq LL(999) LFDUX A1, X, INCX2 fmadd C1, A1, B1, C1 b LL(999) .align 4 /* X is not aligned, Y is aligned */ LL(30): andi. r0, Y, 2 * SIZE - 1 bne LL(40) LFD A1, 0 * SIZE(X) sub X, X, INCX sub Y, Y, INCY2 srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(35) LFXDUX A2, X, INCX2 LFPDUX B1, Y, INCY2 LFXDUX A3, X, INCX2 LFPDUX B2, Y, INCY2 LFXDUX A4, X, INCX2 LFPDUX B3, Y, INCY2 LFXDUX A5, X, INCX2 LFPDUX B4, Y, INCY2 LFXDUX A6, X, INCX2 LFPDUX B5, Y, INCY2 LFXDUX A7, X, INCX2 LFPDUX B6, Y, INCY2 LFXDUX A8, X, INCX2 fsmr A1, A2 LFPDUX B7, Y, INCY2 fsmr A2, A3 LFPDUX B8, Y, INCY2 fsmr A3, A4 bdz LL(34) .align 4 LL(33): fpmadd C1, A1, B1, C1 LFXDUX A9, X, INCX2 fsmr A4, A5 LFPDUX B1, Y, INCY2 fpmadd C2, A2, B2, C2 LFXDUX A2, X, INCX2 fsmr A5, A6 LFPDUX B2, Y, INCY2 fpmadd C3, A3, B3, C3 LFXDUX A3, X, INCX2 fsmr A6, A7 LFPDUX B3, Y, INCY2 fpmadd C4, A4, B4, C4 LFXDUX A4, X, INCX2 fsmr A7, A8 LFPDUX B4, Y, INCY2 fpmadd C1, A5, B5, C1 LFXDUX A5, X, INCX2 fsmr A8, A9 LFPDUX B5, Y, INCY2 fpmadd C2, A6, B6, C2 LFXDUX A6, X, INCX2 fpmr A1, A9 LFPDUX B6, Y, INCY2 fpmadd C3, A7, B7, C3 LFXDUX A7, X, INCX2 fsmr A1, A2 LFPDUX B7, Y, INCY2 fpmadd C4, A8, B8, C4 LFXDUX A8, X, INCX2 fsmr A2, A3 LFPDUX B8, Y, INCY2 fsmr A3, A4 bdnz LL(33) .align 4 LL(34): LFXDUX A9, X, INCX2 fpmadd C1, A1, B1, C1 fsmr A4, A5 fpmadd C2, A2, B2, C2 fsmr A5, A6 fpmadd C3, A3, B3, C3 fsmr A6, A7 fpmadd C4, A4, B4, C4 fsmr A7, A8 fpmadd C1, A5, B5, C1 fsmr A8, A9 fpmadd C2, A6, B6, C2 fpmr A1, A9 fpmadd C3, A7, B7, C3 fpmadd C4, A8, B8, C4 .align 4 LL(35): andi. r0, N, 15 beq LL(999) andi. r0, N, 8 beq LL(36) LFXDUX A2, X, INCX2 LFPDUX B1, Y, INCY2 LFXDUX A3, X, INCX2 LFPDUX B2, Y, INCY2 LFXDUX A4, X, INCX2 LFPDUX B3, Y, INCY2 LFXDUX A5, X, INCX2 LFPDUX B4, Y, INCY2 fsmr A1, A2 fsmr A2, A3 fsmr A3, A4 fsmr A4, A5 fpmadd C1, A1, B1, C1 fpmr A1, A5 fpmadd C2, A2, B2, C2 fpmadd C3, A3, B3, C3 fpmadd C4, A4, B4, C4 .align 4 LL(36): andi. r0, N, 4 beq LL(37) LFXDUX A2, X, INCX2 LFPDUX B1, Y, INCY2 LFXDUX A3, X, INCX2 LFPDUX B2, Y, INCY2 fsmr A1, A2 fsmr A2, A3 fpmadd C1, A1, B1, C1 fpmr A1, A3 fpmadd C2, A2, B2, C2 .align 4 LL(37): andi. r0, N, 2 beq LL(38) LFXDUX A2, X, INCX2 LFPDUX B1, Y, INCY2 fsmr A1, A2 fpmadd C1, A1, B1, C1 fpmr A1, A2 .align 4 LL(38): andi. r0, N, 1 beq LL(999) LFDUX B1, Y, INCY2 fmadd C1, A1, B1, C1 b LL(999) .align 4 /* X is NOT aligned, Y is NOT aligned */ LL(40): LFD A1, 0 * SIZE(X) LFD B1, 0 * SIZE(Y) sub X, X, INCX sub Y, Y, INCY addi N, N, -1 cmpwi cr0, N, 0 fmadd C1, A1, B1, C1 ble LL(999) srawi. r0, N, 4 mtspr CTR, r0 beq- LL(45) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 LFPDUX A3, X, INCX2 LFPDUX B3, Y, INCY2 LFPDUX A4, X, INCX2 LFPDUX B4, Y, INCY2 LFPDUX A5, X, INCX2 LFPDUX B5, Y, INCY2 LFPDUX A6, X, INCX2 LFPDUX B6, Y, INCY2 LFPDUX A7, X, INCX2 LFPDUX B7, Y, INCY2 LFPDUX A8, X, INCX2 LFPDUX B8, Y, INCY2 bdz LL(44) .align 4 LL(43): fpmadd C1, A1, B1, C1 LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 fpmadd C2, A2, B2, C2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 fpmadd C3, A3, B3, C3 LFPDUX A3, X, INCX2 LFPDUX B3, Y, INCY2 fpmadd C4, A4, B4, C4 LFPDUX A4, X, INCX2 LFPDUX B4, Y, INCY2 fpmadd C1, A5, B5, C1 LFPDUX A5, X, INCX2 LFPDUX B5, Y, INCY2 fpmadd C2, A6, B6, C2 LFPDUX A6, X, INCX2 LFPDUX B6, Y, INCY2 fpmadd C3, A7, B7, C3 LFPDUX A7, X, INCX2 LFPDUX B7, Y, INCY2 fpmadd C4, A8, B8, C4 LFPDUX A8, X, INCX2 LFPDUX B8, Y, INCY2 bdnz LL(43) .align 4 LL(44): fpmadd C1, A1, B1, C1 fpmadd C2, A2, B2, C2 fpmadd C3, A3, B3, C3 fpmadd C4, A4, B4, C4 fpmadd C1, A5, B5, C1 fpmadd C2, A6, B6, C2 fpmadd C3, A7, B7, C3 fpmadd C4, A8, B8, C4 .align 4 LL(45): andi. r0, N, 15 beq LL(999) andi. 
r0, N, 8 beq LL(46) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 LFPDUX A3, X, INCX2 LFPDUX B3, Y, INCY2 LFPDUX A4, X, INCX2 LFPDUX B4, Y, INCY2 fpmadd C1, A1, B1, C1 fpmadd C2, A2, B2, C2 fpmadd C3, A3, B3, C3 fpmadd C4, A4, B4, C4 .align 4 LL(46): andi. r0, N, 4 beq LL(47) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 fpmadd C1, A1, B1, C1 fpmadd C2, A2, B2, C2 .align 4 LL(47): andi. r0, N, 2 beq LL(48) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 fpmadd C1, A1, B1, C1 .align 4 LL(48): andi. r0, N, 1 beq LL(999) LFDUX A1, X, INCX2 LFDUX B1, Y, INCY2 fmadd C1, A1, B1, C1 b LL(999) .align 4 LL(100): #ifdef F_INTERFACE cmpwi cr0, INCX, 0 bge+ LL(101) subi r0, N, 1 mullw r0, r0, INCX sub X, X, r0 .align 4 LL(101): cmpwi cr0, INCY, 0 bge+ LL(102) subi r0, N, 1 mullw r0, r0, INCY sub Y, Y, r0 .align 4 LL(102): #endif sub X, X, INCX sub Y, Y, INCY srawi. r0, N, 3 mtspr CTR, r0 beq- LL(105) LFDUX A1, X, INCX LFDUX B1, Y, INCY LFDUX A2, X, INCX LFDUX B2, Y, INCY LFDUX A3, X, INCX LFDUX B3, Y, INCY LFDUX A4, X, INCX LFDUX B4, Y, INCY LFDUX A5, X, INCX LFDUX B5, Y, INCY LFDUX A6, X, INCX LFDUX B6, Y, INCY LFDUX A7, X, INCX LFDUX B7, Y, INCY LFDUX A8, X, INCX LFDUX B8, Y, INCY bdz LL(104) .align 4 LL(103): fmadd C1, A1, B1, C1 LFDUX A1, X, INCX LFDUX B1, Y, INCY fmadd C2, A2, B2, C2 LFDUX A2, X, INCX LFDUX B2, Y, INCY fmadd C3, A3, B3, C3 LFDUX A3, X, INCX LFDUX B3, Y, INCY fmadd C4, A4, B4, C4 LFDUX A4, X, INCX LFDUX B4, Y, INCY fmadd C1, A5, B5, C1 LFDUX A5, X, INCX LFDUX B5, Y, INCY fmadd C2, A6, B6, C2 LFDUX A6, X, INCX LFDUX B6, Y, INCY fmadd C3, A7, B7, C3 LFDUX A7, X, INCX LFDUX B7, Y, INCY fmadd C4, A8, B8, C4 LFDUX A8, X, INCX LFDUX B8, Y, INCY bdnz LL(103) .align 4 LL(104): fmadd C1, A1, B1, C1 fmadd C2, A2, B2, C2 fmadd C3, A3, B3, C3 fmadd C4, A4, B4, C4 fmadd C1, A5, B5, C1 fmadd C2, A6, B6, C2 fmadd C3, A7, B7, C3 fmadd C4, A8, B8, C4 .align 4 LL(105): andi. r0, N, 7 beq LL(999) andi. r0, N, 4 beq LL(107) LFDUX A1, X, INCX LFDUX B1, Y, INCY LFDUX A2, X, INCX LFDUX B2, Y, INCY LFDUX A3, X, INCX LFDUX B3, Y, INCY LFDUX A4, X, INCX LFDUX B4, Y, INCY fmadd C1, A1, B1, C1 fmadd C2, A2, B2, C2 fmadd C3, A3, B3, C3 fmadd C4, A4, B4, C4 .align 4 LL(107): andi. r0, N, 2 beq LL(108) LFDUX A1, X, INCX LFDUX B1, Y, INCY LFDUX A2, X, INCX LFDUX B2, Y, INCY fmadd C1, A1, B1, C1 fmadd C2, A2, B2, C2 .align 4 LL(108): andi. r0, N, 1 beq LL(999) LFDUX A1, X, INCX LFDUX B1, Y, INCY fmadd C1, A1, B1, C1 .align 4 LL(999): li r10, 16 fpadd C1, C1, C2 fpadd C3, C3, C4 fpadd C1, C1, C3 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 fsmtp C2, C1 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 fadd C1, C1, C2 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/dot_ppc440.S000066400000000000000000000154661313527062700176000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define Y r6 #define INCY r7 #define PRE r8 #define FZERO f0 #define STACKSIZE 96 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stw r0, 80(SP) lfs FZERO, 80(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) LDINT INCY, 0(INCY) #endif slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT fmr f1, FZERO fmr f2, FZERO fmr f3, FZERO fmr f4, FZERO fmr f5, FZERO fmr f6, FZERO fmr f7, FZERO li PRE, 3 * 16 * SIZE cmpwi cr0, N, 0 ble- LL(999) #ifdef F_INTERFACE cmpwi cr0, INCX, 0 bge+ LL(102) subi r0, N, 1 mullw r0, r0, INCX sub X, X, r0 .align 4 LL(102): cmpwi cr0, INCY, 0 bge+ LL(104) subi r0, N, 1 mullw r0, r0, INCY sub Y, Y, r0 .align 4 LL(104): #endif sub X, X, INCX sub Y, Y, INCY srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(150) LFDUX f8, X, INCX LFDUX f16, Y, INCY LFDUX f9, X, INCX LFDUX f17, Y, INCY LFDUX f10, X, INCX LFDUX f18, Y, INCY LFDUX f11, X, INCX LFDUX f19, Y, INCY LFDUX f12, X, INCX LFDUX f20, Y, INCY LFDUX f13, X, INCX LFDUX f21, Y, INCY LFDUX f14, X, INCX LFDUX f22, Y, INCY LFDUX f15, X, INCX LFDUX f23, Y, INCY bdz LL(120) .align 4 LL(110): FMADD f0, f8, f16, f0 LFDUX f8, X, INCX LFDUX f16, Y, INCY #ifdef PPCG4 dcbt X, PRE #endif FMADD f1, f9, f17, f1 LFDUX f9, X, INCX LFDUX f17, Y, INCY FMADD f2, f10, f18, f2 LFDUX f10, X, INCX LFDUX f18, Y, INCY #ifdef PPCG4 dcbt Y, PRE #endif FMADD f3, f11, f19, f3 LFDUX f11, X, INCX LFDUX f19, Y, INCY FMADD f4, f12, f20, f4 LFDUX f12, X, INCX LFDUX f20, Y, INCY #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif FMADD f5, f13, f21, f5 LFDUX f13, X, INCX LFDUX f21, Y, INCY FMADD f6, f14, f22, f6 LFDUX f14, X, INCX LFDUX f22, Y, INCY #if defined(PPCG4) && defined(DOUBLE) dcbt Y, PRE #endif FMADD f7, f15, f23, f7 LFDUX f15, X, INCX LFDUX f23, Y, INCY FMADD f0, f8, f16, f0 LFDUX f8, X, INCX LFDUX f16, Y, INCY #ifdef PPCG4 dcbt X, PRE #endif FMADD f1, f9, f17, f1 LFDUX f9, X, INCX LFDUX f17, Y, INCY FMADD f2, f10, f18, f2 LFDUX f10, X, INCX LFDUX f18, Y, INCY #ifdef PPCG4 dcbt Y, PRE #endif FMADD f3, f11, f19, f3 LFDUX f11, X, INCX LFDUX f19, Y, INCY FMADD f4, f12, f20, f4 LFDUX f12, X, INCX LFDUX f20, Y, INCY #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif FMADD f5, f13, f21, f5 LFDUX f13, X, INCX LFDUX f21, Y, INCY FMADD f6, f14, f22, f6 LFDUX f14, X, INCX LFDUX f22, Y, INCY #if defined(PPCG4) && defined(DOUBLE) dcbt Y, PRE #endif FMADD f7, f15, f23, f7 LFDUX f15, X, INCX LFDUX f23, Y, INCY bdnz LL(110) .align 4 LL(120): FMADD f0, f8, f16, f0 LFDUX f8, X, INCX LFDUX f16, Y, INCY FMADD f1, f9, f17, f1 LFDUX f9, X, INCX LFDUX f17, Y, INCY FMADD f2, f10, f18, f2 LFDUX f10, X, INCX LFDUX f18, Y, INCY FMADD f3, f11, f19, f3 LFDUX f11, X, INCX LFDUX f19, Y, INCY FMADD f4, f12, f20, f4 LFDUX f12, X, INCX LFDUX f20, Y, INCY FMADD f5, f13, f21, f5 LFDUX f13, X, INCX LFDUX f21, Y, INCY FMADD f6, f14, f22, f6 LFDUX f14, X, INCX LFDUX f22, Y, INCY FMADD f7, f15, f23, f7 LFDUX f15, X, INCX LFDUX f23, Y, INCY FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f10, f18, f2 FMADD f3, f11, f19, f3 FMADD f4, f12, f20, f4 FMADD f5, f13, f21, f5 FMADD f6, f14, f22, f6 FMADD f7, f15, f23, f7 .align 4 LL(150): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX LFDUX f16, Y, INCY FMADD f0, f8, f16, f0 bdnz LL(160) .align 4 LL(999): FADD f0, f0, f1 FADD f2, f2, f3 FADD f4, f4, f5 FADD f6, f6, f7 FADD f0, f0, f2 FADD f4, f4, f6 FADD f1, f0, f4 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/drot.c000066400000000000000000000065521313527062700167040ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/27 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #pragma GCC optimize "O1" #if defined(POWER8) #include "drot_microk_power8.c" #endif #ifndef HAVE_KERNEL_16 static void drot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) { BLASLONG i=0; FLOAT f0, f1, f2, f3; FLOAT x00, x01, x02, x03; FLOAT g0, g1, g2, g3; FLOAT y00, y01, y02, y03; FLOAT *x1=x; FLOAT *y1=y; while ( i 0 ) { drot_kernel_16(n1, x1, y1, c, s); i=n1; } while(i < n) { temp = c*x[i] + s*y[i] ; y[i] = c*y[i] - s*x[i] ; x[i] = temp ; i++ ; } } else { while(i < n) { temp = c*x[ix] + s*y[iy] ; y[iy] = c*y[iy] - s*x[ix] ; x[ix] = temp ; ix += inc_x ; iy += inc_y ; i++ ; } } return(0); } OpenBLAS-0.2.20/kernel/power/drot_microk_power8.c000066400000000000000000000152741313527062700215550ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
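The scalar fallback and strided loop in drot.c above appear truncated here (the loop comparisons are missing); what they compute is the standard Givens plane rotation applied element by element. A hedged reconstruction under a hypothetical name, not the shipped routine:

#include <stddef.h>

/* Sketch of the plane rotation applied to contiguous x and y. */
static void drot_ref(size_t n, double *x, double *y, double c, double s)
{
    for (size_t i = 0; i < n; i++) {
        double temp = c * x[i] + s * y[i];
        y[i]        = c * y[i] - s * x[i];
        x[i]        = temp;
    }
}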
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/27 Werner Saar (wernsaar@googlemail.com) * * I don't use fused multiply-add ( precision problems with lapack ) * * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_16 1 static void drot_kernel_16 (long n, double *x, double *y, double c, double s) { __vector double t0; __vector double t1; __vector double t2; __vector double t3; __vector double t4; __vector double t5; __vector double t6; __vector double t7; __asm__ ( "xxspltd 36, %x13, 0 \n\t" // load c to both dwords "xxspltd 37, %x14, 0 \n\t" // load s to both dwords "lxvd2x 32, 0, %3 \n\t" // load x "lxvd2x 33, %15, %3 \n\t" "lxvd2x 34, %16, %3 \n\t" "lxvd2x 35, %17, %3 \n\t" "lxvd2x 48, 0, %4 \n\t" // load y "lxvd2x 49, %15, %4 \n\t" "lxvd2x 50, %16, %4 \n\t" "lxvd2x 51, %17, %4 \n\t" "addi %3, %3, 64 \n\t" "addi %4, %4, 64 \n\t" "addic. %2, %2, -8 \n\t" "ble 2f \n\t" ".p2align 5 \n" "1: \n\t" "xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 41, 33, 36 \n\t" "xvmuldp 42, 34, 36 \n\t" "xvmuldp 43, 35, 36 \n\t" "xvmuldp %x5, 48, 36 \n\t" // c * y "xvmuldp %x6, 49, 36 \n\t" "xvmuldp %x7, 50, 36 \n\t" "xvmuldp %x8, 51, 36 \n\t" "xvmuldp 44, 32, 37 \n\t" // s * x "xvmuldp 45, 33, 37 \n\t" "lxvd2x 32, 0, %3 \n\t" // load x "lxvd2x 33, %15, %3 \n\t" "xvmuldp 46, 34, 37 \n\t" "xvmuldp 47, 35, 37 \n\t" "lxvd2x 34, %16, %3 \n\t" "lxvd2x 35, %17, %3 \n\t" "xvmuldp %x9, 48, 37 \n\t" // s * y "xvmuldp %x10, 49, 37 \n\t" "lxvd2x 48, 0, %4 \n\t" // load y "lxvd2x 49, %15, %4 \n\t" "xvmuldp %x11, 50, 37 \n\t" "xvmuldp %x12, 51, 37 \n\t" "lxvd2x 50, %16, %4 \n\t" "lxvd2x 51, %17, %4 \n\t" "xvadddp 40, 40, %x9 \n\t" // c * x + s * y "xvadddp 41, 41, %x10 \n\t" // c * x + s * y "addi %3, %3, -64 \n\t" "addi %4, %4, -64 \n\t" "xvadddp 42, 42, %x11 \n\t" // c * x + s * y "xvadddp 43, 43, %x12 \n\t" // c * x + s * y "xvsubdp %x5, %x5, 44 \n\t" // c * y - s * x "xvsubdp %x6, %x6, 45 \n\t" // c * y - s * x "xvsubdp %x7, %x7, 46 \n\t" // c * y - s * x "xvsubdp %x8, %x8, 47 \n\t" // c * y - s * x "stxvd2x 40, 0, %3 \n\t" // store x "stxvd2x 41, %15, %3 \n\t" "stxvd2x 42, %16, %3 \n\t" "stxvd2x 43, %17, %3 \n\t" "stxvd2x %x5, 0, %4 \n\t" // store y "stxvd2x %x6, %15, %4 \n\t" "stxvd2x %x7, %16, %4 \n\t" "stxvd2x %x8, %17, %4 \n\t" "addi %3, %3, 128 \n\t" "addi %4, %4, 128 \n\t" "addic. 
%2, %2, -8 \n\t" "bgt 1b \n" "2: \n\t" "xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 41, 33, 36 \n\t" "xvmuldp 42, 34, 36 \n\t" "xvmuldp 43, 35, 36 \n\t" "xvmuldp %x5, 48, 36 \n\t" // c * y "xvmuldp %x6, 49, 36 \n\t" "xvmuldp %x7, 50, 36 \n\t" "xvmuldp %x8, 51, 36 \n\t" "xvmuldp 44, 32, 37 \n\t" // s * x "xvmuldp 45, 33, 37 \n\t" "xvmuldp 46, 34, 37 \n\t" "xvmuldp 47, 35, 37 \n\t" "xvmuldp %x9, 48, 37 \n\t" // s * y "xvmuldp %x10, 49, 37 \n\t" "xvmuldp %x11, 50, 37 \n\t" "xvmuldp %x12, 51, 37 \n\t" "addi %3, %3, -64 \n\t" "addi %4, %4, -64 \n\t" "xvadddp 40, 40, %x9 \n\t" // c * x + s * y "xvadddp 41, 41, %x10 \n\t" // c * x + s * y "xvadddp 42, 42, %x11 \n\t" // c * x + s * y "xvadddp 43, 43, %x12 \n\t" // c * x + s * y "xvsubdp %x5, %x5, 44 \n\t" // c * y - s * x "xvsubdp %x6, %x6, 45 \n\t" // c * y - s * x "xvsubdp %x7, %x7, 46 \n\t" // c * y - s * x "xvsubdp %x8, %x8, 47 \n\t" // c * y - s * x "stxvd2x 40, 0, %3 \n\t" // store x "stxvd2x 41, %15, %3 \n\t" "stxvd2x 42, %16, %3 \n\t" "stxvd2x 43, %17, %3 \n\t" "stxvd2x %x5, 0, %4 \n\t" // store y "stxvd2x %x6, %15, %4 \n\t" "stxvd2x %x7, %16, %4 \n\t" "stxvd2x %x8, %17, %4 \n" "#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n" "#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12" : "+m" (*x), "+m" (*y), "+r" (n), // 2 "+b" (x), // 3 "+b" (y), // 4 "=wa" (t0), // 5 "=wa" (t1), // 6 "=wa" (t2), // 7 "=wa" (t3), // 8 "=wa" (t4), // 9 "=wa" (t5), // 10 "=wa" (t6), // 11 "=wa" (t7) // 12 : "d" (c), // 13 "d" (s), // 14 "b" (16), // 15 "b" (32), // 16 "b" (48) // 17 : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48","vs49","vs50","vs51" ); } OpenBLAS-0.2.20/kernel/power/dscal.c000066400000000000000000000071661313527062700170240ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
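The POWER8 rotation kernel above deliberately avoids fused multiply-add, as its header note says ("precision problems with lapack"): each element is formed with separate xvmuldp multiplies followed by xvadddp/xvsubdp. A per-element C sketch of that ordering, with a hypothetical helper name:

/* Separate multiply then add/subtract, mirroring xvmuldp + xvadddp/xvsubdp.
   An FMA would round c*x + s*y once instead of twice, which the file's
   header ties to LAPACK test precision issues. */
static inline void drot_elem(double *xp, double *yp, double c, double s)
{
    double cx = c * *xp;
    double cy = c * *yp;
    double sx = s * *xp;
    double sy = s * *yp;

    *xp = cx + sy;   /* c*x + s*y */
    *yp = cy - sx;   /* c*y - s*x */
}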
*****************************************************************************/ /************************************************************************************** * 2016/03/25 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #if defined(POWER8) #include "dscal_microk_power8.c" #endif #if !defined(HAVE_KERNEL_8) static void dscal_kernel_8 (BLASLONG n, FLOAT *x, FLOAT alpha) { BLASLONG i; for( i=0; i 0 ) { dscal_kernel_8_zero(n1, x); j=n1; } while(j < n) { x[j]=0.0; j++; } } else { BLASLONG n1 = n & -16; if ( n1 > 0 ) { dscal_kernel_8(n1, x, da); j=n1; } while(j < n) { x[j] = da * x[j] ; j++; } } } else { if ( da == 0.0 ) { while(j < n) { x[i]=0.0; i += inc_x ; j++; } } else { while(j < n) { x[i] = da * x[i] ; i += inc_x ; j++; } } } return 0; } OpenBLAS-0.2.20/kernel/power/dscal_microk_power8.c000066400000000000000000000125331313527062700216660ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/25 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_8 1 static void dscal_kernel_8 (long n, double *x, double alpha) { __asm__ ( "dcbt 0, %2 \n\t" "xxspltd %x3, %x3, 0 \n\t" "lxvd2x 32, 0, %2 \n\t" "lxvd2x 33, %4, %2 \n\t" "lxvd2x 34, %5, %2 \n\t" "lxvd2x 35, %6, %2 \n\t" "lxvd2x 36, %7, %2 \n\t" "lxvd2x 37, %8, %2 \n\t" "lxvd2x 38, %9, %2 \n\t" "lxvd2x 39, %10, %2 \n\t" "addi %2, %2, 128 \n\t" "addic. 
%1, %1, -16 \n\t" "ble 2f \n\t" ".p2align 5 \n" "1: \n\t" "xvmuldp 40, 32, %x3 \n\t" "xvmuldp 41, 33, %x3 \n\t" "lxvd2x 32, 0, %2 \n\t" "lxvd2x 33, %4, %2 \n\t" "xvmuldp 42, 34, %x3 \n\t" "xvmuldp 43, 35, %x3 \n\t" "lxvd2x 34, %5, %2 \n\t" "lxvd2x 35, %6, %2 \n\t" "xvmuldp 44, 36, %x3 \n\t" "xvmuldp 45, 37, %x3 \n\t" "lxvd2x 36, %7, %2 \n\t" "lxvd2x 37, %8, %2 \n\t" "xvmuldp 46, 38, %x3 \n\t" "xvmuldp 47, 39, %x3 \n\t" "lxvd2x 38, %9, %2 \n\t" "lxvd2x 39, %10, %2 \n\t" "addi %2, %2, -128 \n\t" "stxvd2x 40, 0, %2 \n\t" "stxvd2x 41, %4, %2 \n\t" "stxvd2x 42, %5, %2 \n\t" "stxvd2x 43, %6, %2 \n\t" "stxvd2x 44, %7, %2 \n\t" "stxvd2x 45, %8, %2 \n\t" "stxvd2x 46, %9, %2 \n\t" "stxvd2x 47, %10, %2 \n\t" "addi %2, %2, 256 \n\t" "addic. %1, %1, -16 \n\t" "bgt 1b \n" "2: \n\t" "xvmuldp 40, 32, %x3 \n\t" "xvmuldp 41, 33, %x3 \n\t" "xvmuldp 42, 34, %x3 \n\t" "xvmuldp 43, 35, %x3 \n\t" "addi %2, %2, -128 \n\t" "xvmuldp 44, 36, %x3 \n\t" "xvmuldp 45, 37, %x3 \n\t" "xvmuldp 46, 38, %x3 \n\t" "xvmuldp 47, 39, %x3 \n\t" "stxvd2x 40, 0, %2 \n\t" "stxvd2x 41, %4, %2 \n\t" "stxvd2x 42, %5, %2 \n\t" "stxvd2x 43, %6, %2 \n\t" "stxvd2x 44, %7, %2 \n\t" "stxvd2x 45, %8, %2 \n\t" "stxvd2x 46, %9, %2 \n\t" "stxvd2x 47, %10, %2 \n" "#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" : "+m" (*x), "+r" (n), // 1 "+b" (x) // 2 : "d" (alpha), // 3 "b" (16), // 4 "b" (32), // 5 "b" (48), // 6 "b" (64), // 7 "b" (80), // 8 "b" (96), // 9 "b" (112) // 10 : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" ); } static void dscal_kernel_8_zero (long n, double *x) { __vector double t0; __asm__ ( "xxlxor %x3, %x3, %x3 \n\t" ".p2align 5 \n" "1: \n\t" "stxvd2x %x3, 0, %2 \n\t" "stxvd2x %x3, %4, %2 \n\t" "stxvd2x %x3, %5, %2 \n\t" "stxvd2x %x3, %6, %2 \n\t" "stxvd2x %x3, %7, %2 \n\t" "stxvd2x %x3, %8, %2 \n\t" "stxvd2x %x3, %9, %2 \n\t" "stxvd2x %x3, %10, %2 \n\t" "addi %2, %2, 128 \n\t" "addic. %1, %1, -16 \n\t" "bgt 1b \n" "#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" : "=m" (*x), "+r" (n), // 1 "+b" (x), // 2 "=wa" (t0) // 3 : "b" (16), // 4 "b" (32), // 5 "b" (48), // 6 "b" (64), // 7 "b" (80), // 8 "b" (96), // 9 "b" (112) // 10 : "cr0" ); } OpenBLAS-0.2.20/kernel/power/dswap.c000066400000000000000000000064621313527062700170520ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
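dscal.c above branches on alpha == 0.0, calling dscal_kernel_8_zero for plain stores and dscal_kernel_8 for the multiply, then finishes the tail with scalar code. A hedged sketch of the contiguous-stride behaviour under a hypothetical name (the real kernels work on blocks of 16 doubles; this shows only the arithmetic, not the blocking):

#include <stddef.h>

static void dscal_ref(size_t n, double *x, double alpha)
{
    if (alpha == 0.0) {
        for (size_t i = 0; i < n; i++)   /* zero path, cf. dscal_kernel_8_zero */
            x[i] = 0.0;
    } else {
        for (size_t i = 0; i < n; i++)   /* general path, cf. dscal_kernel_8 */
            x[i] = alpha * x[i];
    }
}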
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/25 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #if defined(POWER8) #include "dswap_microk_power8.c" #endif #ifndef HAVE_KERNEL_32 static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { BLASLONG i=0; FLOAT f0, f1, f2, f3, f4, f5, f6, f7; FLOAT g0, g1, g2, g3, g4, g5, g6, g7; FLOAT *x1=x; FLOAT *y1=y; while ( i 0 ) { dswap_kernel_32(n1, x, y); i=n1; } while(i < n) { temp = y[i]; y[i] = x[i] ; x[i] = temp; i++ ; } } else { while(i < n) { temp = y[iy]; y[iy] = x[ix] ; x[ix] = temp; ix += inc_x ; iy += inc_y ; i++ ; } } return(0); } OpenBLAS-0.2.20/kernel/power/dswap_microk_power8.c000066400000000000000000000121031313527062700217070ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
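dswap_kernel_32 and the scalar tail in dswap.c above amount to a plain element-wise exchange, vectorized over blocks of 32 doubles. A hypothetical reference version for comparison, not the shipped code:

#include <stddef.h>

/* Sketch of the exchange the vector kernel performs 32 elements at a time. */
static void dswap_ref(size_t n, double *x, double *y)
{
    for (size_t i = 0; i < n; i++) {
        double temp = y[i];
        y[i] = x[i];
        x[i] = temp;
    }
}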
*****************************************************************************/ /************************************************************************************** * 2016/03/25 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_32 1 static void dswap_kernel_32 (long n, double *x, double *y) { __asm__ ( ".p2align 5 \n" "1: \n\t" "lxvd2x 32, 0, %4 \n\t" "lxvd2x 33, %5, %4 \n\t" "lxvd2x 34, %6, %4 \n\t" "lxvd2x 35, %7, %4 \n\t" "lxvd2x 36, %8, %4 \n\t" "lxvd2x 37, %9, %4 \n\t" "lxvd2x 38, %10, %4 \n\t" "lxvd2x 39, %11, %4 \n\t" "addi %4, %4, 128 \n\t" "lxvd2x 40, 0, %4 \n\t" "lxvd2x 41, %5, %4 \n\t" "lxvd2x 42, %6, %4 \n\t" "lxvd2x 43, %7, %4 \n\t" "lxvd2x 44, %8, %4 \n\t" "lxvd2x 45, %9, %4 \n\t" "lxvd2x 46, %10, %4 \n\t" "lxvd2x 47, %11, %4 \n\t" "addi %4, %4, -128 \n\t" "lxvd2x 48, 0, %3 \n\t" "lxvd2x 49, %5, %3 \n\t" "lxvd2x 50, %6, %3 \n\t" "lxvd2x 51, %7, %3 \n\t" "lxvd2x 0, %8, %3 \n\t" "lxvd2x 1, %9, %3 \n\t" "lxvd2x 2, %10, %3 \n\t" "lxvd2x 3, %11, %3 \n\t" "addi %3, %3, 128 \n\t" "lxvd2x 4, 0, %3 \n\t" "lxvd2x 5, %5, %3 \n\t" "lxvd2x 6, %6, %3 \n\t" "lxvd2x 7, %7, %3 \n\t" "lxvd2x 8, %8, %3 \n\t" "lxvd2x 9, %9, %3 \n\t" "lxvd2x 10, %10, %3 \n\t" "lxvd2x 11, %11, %3 \n\t" "addi %3, %3, -128 \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" "stxvd2x 34, %6, %3 \n\t" "stxvd2x 35, %7, %3 \n\t" "stxvd2x 36, %8, %3 \n\t" "stxvd2x 37, %9, %3 \n\t" "stxvd2x 38, %10, %3 \n\t" "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" "stxvd2x 40, 0, %3 \n\t" "stxvd2x 41, %5, %3 \n\t" "stxvd2x 42, %6, %3 \n\t" "stxvd2x 43, %7, %3 \n\t" "stxvd2x 44, %8, %3 \n\t" "stxvd2x 45, %9, %3 \n\t" "stxvd2x 46, %10, %3 \n\t" "stxvd2x 47, %11, %3 \n\t" "addi %3, %3, 128 \n\t" "stxvd2x 48, 0, %4 \n\t" "stxvd2x 49, %5, %4 \n\t" "stxvd2x 50, %6, %4 \n\t" "stxvd2x 51, %7, %4 \n\t" "stxvd2x 0, %8, %4 \n\t" "stxvd2x 1, %9, %4 \n\t" "stxvd2x 2, %10, %4 \n\t" "stxvd2x 3, %11, %4 \n\t" "addi %4, %4, 128 \n\t" "stxvd2x 4, 0, %4 \n\t" "stxvd2x 5, %5, %4 \n\t" "stxvd2x 6, %6, %4 \n\t" "stxvd2x 7, %7, %4 \n\t" "stxvd2x 8, %8, %4 \n\t" "stxvd2x 9, %9, %4 \n\t" "stxvd2x 10, %10, %4 \n\t" "stxvd2x 11, %11, %4 \n\t" "addi %4, %4, 128 \n\t" "addic. %2, %2, -32 \n\t" "bgt 1b \n" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : "+m" (*x), "+m" (*y), "+r" (n), // 2 "+b" (x), // 3 "+b" (y) // 4 : "b" (16), // 5 "b" (32), // 6 "b" (48), // 7 "b" (64), // 8 "b" (80), // 9 "b" (96), // 10 "b" (112) // 11 : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48","vs49","vs50","vs51","vs0","vs1","vs2","vs3", "vs4","vs5","vs6","vs7","vs8","vs9","vs10","vs11" ); } OpenBLAS-0.2.20/kernel/power/dtrmm_kernel_16x4_power8.S000066400000000000000000000215541313527062700224640ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/05 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "def_vsx.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA_SP 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA_SP 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define alpha_r vs18 #define o0 0 #define K1 r13 #define KKK r14 #define o8 r15 #define o24 r16 #define ALPHA r17 #define L r18 #define T1 r19 #define KK r20 #define BB r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define CO r26 #define o16 r27 #define o32 r28 #define o48 r29 #define PRE r30 #define T2 r31 #include "dtrmm_macros_16x4_power8.S" #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) std r14, 280(SP) std r13, 288(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) stw r18, 196(SP) stw r17, 200(SP) stw r16, 204(SP) stw r15, 208(SP) stw r14, 212(SP) stw r13, 216(SP) #endif stfd f1, ALPHA_SP stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #endif mr KK, OFFSET #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, KK #endif cmpwi cr0, M, 0 ble .L999_H1 cmpwi cr0, N, 0 ble .L999_H1 cmpwi cr0, K, 0 ble .L999_H1 #ifdef __64BIT__ addi ALPHA, SP, 296 #else addi ALPHA, SP, 224 #endif li PRE, 256 li o8 , 8 li o16, 16 li o24, 24 li o32, 32 li o48, 48 lxvdsx alpha_r, 0, ALPHA #include "dtrmm_logic_16x4_power8.S" .L999: addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 
168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) ld r13, 288(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) lwz r18, 196(SP) lwz r17, 200(SP) lwz r16, 204(SP) lwz r15, 208(SP) lwz r14, 212(SP) lwz r13, 216(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/dtrmm_logic_16x4_power8.S000066400000000000000000001163521313527062700223020ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/05 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ srawi. J, N, 2 ble .LDTRMM_L4_END .LDTRMM_L4_BEGIN: mr CO, C mr AO, A slwi T1, LDC , 2 add C, C, T1 #if defined(LEFT) mr KK, OFFSET // OFFSET -> KK #endif srawi. I, M, 4 ble .LDTRMM_L4x16_END .LDTRMM_L4x16_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 5 // Number of values in B shifted slwi T2, KK, 7 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. 
L, K1, 3 // KTEMP / 8 -> L ble .LDTRMM_L4x16_SUB0 cmpwi cr0, L, 1 ble .LDTRMM_L4x16_SUB4 .LDTRMM_L4x16_LOOP_START: dcbt AO, PRE LOAD4x16_1 dcbt AO, PRE KERNEL4x16_I1 dcbt AO, PRE KERNEL4x16_2 dcbt AO, PRE KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 dcbt AO, PRE KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 dcbt AO, PRE KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 addic. L, L, -2 ble .LDTRMM_L4x16_LOOP_END .align 5 .LDTRMM_L4x16_LOOP: dcbt AO, PRE KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 dcbt AO, PRE KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 dcbt AO, PRE KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 dcbt AO, PRE KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 addic. L, L, -1 bgt .LDTRMM_L4x16_LOOP .LDTRMM_L4x16_LOOP_END: dcbt AO, PRE KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 dcbt AO, PRE KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 dcbt AO, PRE KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 dcbt AO, PRE KERNEL4x16_1 KERNEL4x16_E2 b .LDTRMM_L4x16_SUB1 .LDTRMM_L4x16_SUB4: dcbt AO, PRE KERNEL4x16_SUBI1 dcbt AO, PRE KERNEL4x16_SUB1 dcbt AO, PRE KERNEL4x16_SUB1 dcbt AO, PRE KERNEL4x16_SUB1 KERNEL4x16_SUB1 KERNEL4x16_SUB1 KERNEL4x16_SUB1 KERNEL4x16_SUB1 b .LDTRMM_L4x16_SUB1 .LDTRMM_L4x16_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x16_SUBI1 addic. L, L, -1 ble .LDTRMM_L4x16_SAVE b .LDTRMM_L4x16_SUB2 .LDTRMM_L4x16_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LDTRMM_L4x16_SAVE .LDTRMM_L4x16_SUB2: KERNEL4x16_SUB1 addic. L, L, -1 bgt .LDTRMM_L4x16_SUB2 .LDTRMM_L4x16_SAVE: SAVE4x16 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 16 // KK += Number of values in A #endif addic. I, I, -1 bgt .LDTRMM_L4x16_BEGIN .LDTRMM_L4x16_END: .LDTRMM_L4x8_BEGIN: andi. T2, M, 15 ble .LDTRMM_L4x1_END andi. T1, M, 8 ble .LDTRMM_L4x8_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 5 // Number of values in B shifted slwi T2, KK, 6 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble .LDTRMM_L4x8_SUB0 cmpwi cr0, L, 1 ble .LDTRMM_L4x8_SUB4 .LDTRMM_L4x8_LOOP_START: LOAD4x8_1 KERNEL4x8_I1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 addic. L, L, -2 ble .LDTRMM_L4x8_LOOP_END .align 5 .LDTRMM_L4x8_LOOP: KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 addic. L, L, -1 bgt .LDTRMM_L4x8_LOOP .LDTRMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_E2 b .LDTRMM_L4x8_SUB1 .LDTRMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 b .LDTRMM_L4x8_SUB1 .LDTRMM_L4x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x8_SUBI1 addic. L, L, -1 ble .LDTRMM_L4x8_SAVE b .LDTRMM_L4x8_SUB2 .LDTRMM_L4x8_SUB1: andi. 
L, K1, 7 // K1 & 7 -> L ble .LDTRMM_L4x8_SAVE .LDTRMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. L, L, -1 bgt .LDTRMM_L4x8_SUB2 .LDTRMM_L4x8_SAVE: SAVE4x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 8 // KK += Number of values in A #endif .LDTRMM_L4x8_END: .LDTRMM_L4x4_BEGIN: andi. T1, M, 4 ble .LDTRMM_L4x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 5 // Number of values in B shifted slwi T2, KK, 5 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble .LDTRMM_L4x4_SUB0 cmpwi cr0, L, 1 ble .LDTRMM_L4x4_SUB4 .LDTRMM_L4x4_LOOP_START: LOAD4x4_1 KERNEL4x4_I1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 addic. L, L, -2 ble .LDTRMM_L4x4_LOOP_END .align 5 .LDTRMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 addic. L, L, -1 bgt .LDTRMM_L4x4_LOOP .LDTRMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_E2 b .LDTRMM_L4x4_SUB1 .LDTRMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 b .LDTRMM_L4x4_SUB1 .LDTRMM_L4x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x4_SUBI1 addic. L, L, -1 ble .LDTRMM_L4x4_SAVE b .LDTRMM_L4x4_SUB2 .LDTRMM_L4x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LDTRMM_L4x4_SAVE .LDTRMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. L, L, -1 bgt .LDTRMM_L4x4_SUB2 .LDTRMM_L4x4_SAVE: SAVE4x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 4 // KK += Number of values in A #endif .LDTRMM_L4x4_END: .LDTRMM_L4x2_BEGIN: andi. T1, M, 2 ble .LDTRMM_L4x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 5 // Number of values in B shifted slwi T2, KK, 4 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. 
L, K1, 3 // KTEMP / 8 -> L ble .LDTRMM_L4x2_SUB0 cmpwi cr0, L, 1 ble .LDTRMM_L4x2_SUB4 .LDTRMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 addic. L, L, -2 ble .LDTRMM_L4x2_LOOP_END .align 5 .LDTRMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 addic. L, L, -1 bgt .LDTRMM_L4x2_LOOP .LDTRMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_E2 b .LDTRMM_L4x2_SUB1 .LDTRMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 b .LDTRMM_L4x2_SUB1 .LDTRMM_L4x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x2_SUBI1 addic. L, L, -1 ble .LDTRMM_L4x2_SAVE b .LDTRMM_L4x2_SUB2 .LDTRMM_L4x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LDTRMM_L4x2_SAVE .LDTRMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. L, L, -1 bgt .LDTRMM_L4x2_SUB2 .LDTRMM_L4x2_SAVE: SAVE4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 2 // KK += Number of values in A #endif .LDTRMM_L4x2_END: .LDTRMM_L4x1_BEGIN: andi. T1, M, 1 ble .LDTRMM_L4x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 5 // Number of values in B shifted slwi T2, KK, 3 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble .LDTRMM_L4x1_SUB0 cmpwi cr0, L, 1 ble .LDTRMM_L4x1_SUB4 .LDTRMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 addic. L, L, -2 ble .LDTRMM_L4x1_LOOP_END .align 5 .LDTRMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 addic. L, L, -1 bgt .LDTRMM_L4x1_LOOP .LDTRMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_E2 b .LDTRMM_L4x1_SUB1 .LDTRMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 b .LDTRMM_L4x1_SUB1 .LDTRMM_L4x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x1_SUBI1 addic. L, L, -1 ble .LDTRMM_L4x1_SAVE b .LDTRMM_L4x1_SUB2 .LDTRMM_L4x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LDTRMM_L4x1_SAVE .LDTRMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. 
L, L, -1 bgt .LDTRMM_L4x1_SUB2 .LDTRMM_L4x1_SAVE: SAVE4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 1 // KK += Number of values in A #endif .LDTRMM_L4x1_END: slwi T1, K, 5 add B, B, T1 #if !defined(LEFT) addi KK, KK, 4 // KK += Number of values in B #endif addic. J, J, -1 bgt .LDTRMM_L4_BEGIN andi. T2, N, 3 ble .L999 .LDTRMM_L4_END: b .LDTRMM_L2_BEGIN .L999_H1: b .L999 .LDTRMM_L2_BEGIN: andi. T1, N, 2 ble .LDTRMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 #if defined(LEFT) mr KK, OFFSET // OFFSET -> KK #endif srawi. I, M, 4 ble .LDTRMM_L2x16_END .LDTRMM_L2x16_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 4 // Number of values in B shifted slwi T2, KK, 7 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble .LDTRMM_L2x16_SUB0 cmpwi cr0, L, 1 ble .LDTRMM_L2x16_SUB4 .LDTRMM_L2x16_LOOP_START: dcbt AO, PRE LOAD2x16_1 dcbt AO, PRE KERNEL2x16_I1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 addic. L, L, -2 ble .LDTRMM_L2x16_LOOP_END .align 5 .LDTRMM_L2x16_LOOP: dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 addic. L, L, -1 bgt .LDTRMM_L2x16_LOOP .LDTRMM_L2x16_LOOP_END: dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 KERNEL2x16_E2 b .LDTRMM_L2x16_SUB1 .LDTRMM_L2x16_SUB4: dcbt AO, PRE KERNEL2x16_SUBI1 dcbt AO, PRE KERNEL2x16_SUB1 dcbt AO, PRE KERNEL2x16_SUB1 dcbt AO, PRE KERNEL2x16_SUB1 KERNEL2x16_SUB1 KERNEL2x16_SUB1 KERNEL2x16_SUB1 KERNEL2x16_SUB1 b .LDTRMM_L2x16_SUB1 .LDTRMM_L2x16_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x16_SUBI1 addic. L, L, -1 ble .LDTRMM_L2x16_SAVE b .LDTRMM_L2x16_SUB2 .LDTRMM_L2x16_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LDTRMM_L2x16_SAVE .LDTRMM_L2x16_SUB2: KERNEL2x16_SUB1 addic. L, L, -1 bgt .LDTRMM_L2x16_SUB2 .LDTRMM_L2x16_SAVE: SAVE2x16 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 16 // KK += Number of values in A #endif addic. I, I, -1 bgt .LDTRMM_L2x16_BEGIN .LDTRMM_L2x16_END: .LDTRMM_L2x8_BEGIN: andi. 
T2, M, 15 ble .LDTRMM_L2x1_END andi. T1, M, 8 ble .LDTRMM_L2x8_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 4 // Number of values in B shifted slwi T2, KK, 6 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble .LDTRMM_L2x8_SUB0 cmpwi cr0, L, 1 ble .LDTRMM_L2x8_SUB4 .LDTRMM_L2x8_LOOP_START: LOAD2x8_1 KERNEL2x8_I1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 addic. L, L, -2 ble .LDTRMM_L2x8_LOOP_END .align 5 .LDTRMM_L2x8_LOOP: KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 addic. L, L, -1 bgt .LDTRMM_L2x8_LOOP .LDTRMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_E2 b .LDTRMM_L2x8_SUB1 .LDTRMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 b .LDTRMM_L2x8_SUB1 .LDTRMM_L2x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x8_SUBI1 addic. L, L, -1 ble .LDTRMM_L2x8_SAVE b .LDTRMM_L2x8_SUB2 .LDTRMM_L2x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LDTRMM_L2x8_SAVE .LDTRMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 bgt .LDTRMM_L2x8_SUB2 .LDTRMM_L2x8_SAVE: SAVE2x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 8 // KK += Number of values in A #endif .LDTRMM_L2x8_END: .LDTRMM_L2x4_BEGIN: andi. T1, M, 4 ble .LDTRMM_L2x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 4 // Number of values in B shifted slwi T2, KK, 5 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble .LDTRMM_L2x4_SUB0 cmpwi cr0, L, 1 ble .LDTRMM_L2x4_SUB4 .LDTRMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 addic. L, L, -2 ble .LDTRMM_L2x4_LOOP_END .align 5 .LDTRMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 addic. 
L, L, -1 bgt .LDTRMM_L2x4_LOOP .LDTRMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_E2 b .LDTRMM_L2x4_SUB1 .LDTRMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 b .LDTRMM_L2x4_SUB1 .LDTRMM_L2x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x4_SUBI1 addic. L, L, -1 ble .LDTRMM_L2x4_SAVE b .LDTRMM_L2x4_SUB2 .LDTRMM_L2x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LDTRMM_L2x4_SAVE .LDTRMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 bgt .LDTRMM_L2x4_SUB2 .LDTRMM_L2x4_SAVE: SAVE2x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 4 // KK += Number of values in A #endif .LDTRMM_L2x4_END: .LDTRMM_L2x2_BEGIN: andi. T1, M, 2 ble .LDTRMM_L2x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 4 // Number of values in B shifted slwi T2, KK, 4 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble .LDTRMM_L2x2_SUB0 cmpwi cr0, L, 1 ble .LDTRMM_L2x2_SUB4 .LDTRMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 addic. L, L, -2 ble .LDTRMM_L2x2_LOOP_END .align 5 .LDTRMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 addic. L, L, -1 bgt .LDTRMM_L2x2_LOOP .LDTRMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_E2 b .LDTRMM_L2x2_SUB1 .LDTRMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 b .LDTRMM_L2x2_SUB1 .LDTRMM_L2x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x2_SUBI1 addic. L, L, -1 ble .LDTRMM_L2x2_SAVE b .LDTRMM_L2x2_SUB2 .LDTRMM_L2x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LDTRMM_L2x2_SAVE .LDTRMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 bgt .LDTRMM_L2x2_SUB2 .LDTRMM_L2x2_SAVE: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 2 // KK += Number of values in A #endif .LDTRMM_L2x2_END: .LDTRMM_L2x1_BEGIN: andi. 
T1, M, 1 ble .LDTRMM_L2x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 4 // Number of values in B shifted slwi T2, KK, 3 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble .LDTRMM_L2x1_SUB0 cmpwi cr0, L, 1 ble .LDTRMM_L2x1_SUB4 .LDTRMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 addic. L, L, -2 ble .LDTRMM_L2x1_LOOP_END .align 5 .LDTRMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 addic. L, L, -1 bgt .LDTRMM_L2x1_LOOP .LDTRMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_E2 b .LDTRMM_L2x1_SUB1 .LDTRMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 b .LDTRMM_L2x1_SUB1 .LDTRMM_L2x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x1_SUBI1 addic. L, L, -1 ble .LDTRMM_L2x1_SAVE b .LDTRMM_L2x1_SUB2 .LDTRMM_L2x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LDTRMM_L2x1_SAVE .LDTRMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 bgt .LDTRMM_L2x1_SUB2 .LDTRMM_L2x1_SAVE: SAVE2x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 1 // KK += Number of values in A #endif .LDTRMM_L2x1_END: slwi T1, K, 4 add B, B, T1 #if !defined(LEFT) addi KK, KK, 2 // KK += Number of values in B #endif .LDTRMM_L2_END: .LDTRMM_L1_BEGIN: andi. T1, N, 1 ble .LDTRMM_L1_END mr CO, C mr AO, A #if defined(LEFT) mr KK, OFFSET // OFFSET -> KK #endif srawi. I, M, 4 ble .LDTRMM_L1x16_END .LDTRMM_L1x16_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 3 // Number of values in B shifted slwi T2, KK, 7 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble .LDTRMM_L1x16_SUB0 cmpwi cr0, L, 1 ble .LDTRMM_L1x16_SUB4 .LDTRMM_L1x16_LOOP_START: dcbt AO, PRE LOAD1x16_1 dcbt AO, PRE KERNEL1x16_I1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 addic. 
L, L, -2 ble .LDTRMM_L1x16_LOOP_END .align 5 .LDTRMM_L1x16_LOOP: dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 addic. L, L, -1 bgt .LDTRMM_L1x16_LOOP .LDTRMM_L1x16_LOOP_END: dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 KERNEL1x16_E2 b .LDTRMM_L1x16_SUB1 .LDTRMM_L1x16_SUB4: dcbt AO, PRE KERNEL1x16_SUBI1 dcbt AO, PRE KERNEL1x16_SUB1 dcbt AO, PRE KERNEL1x16_SUB1 dcbt AO, PRE KERNEL1x16_SUB1 KERNEL1x16_SUB1 KERNEL1x16_SUB1 KERNEL1x16_SUB1 KERNEL1x16_SUB1 b .LDTRMM_L1x16_SUB1 .LDTRMM_L1x16_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x16_SUBI1 addic. L, L, -1 ble .LDTRMM_L1x16_SAVE b .LDTRMM_L1x16_SUB2 .LDTRMM_L1x16_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LDTRMM_L1x16_SAVE .LDTRMM_L1x16_SUB2: KERNEL1x16_SUB1 addic. L, L, -1 bgt .LDTRMM_L1x16_SUB2 .LDTRMM_L1x16_SAVE: SAVE1x16 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 16 // KK += Number of values in A #endif addic. I, I, -1 bgt .LDTRMM_L1x16_BEGIN .LDTRMM_L1x16_END: .LDTRMM_L1x8_BEGIN: andi. T2, M, 15 ble .LDTRMM_L1x1_END andi. T1, M, 8 ble .LDTRMM_L1x8_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 3 // Number of values in B shifted slwi T2, KK, 6 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble .LDTRMM_L1x8_SUB0 cmpwi cr0, L, 1 ble .LDTRMM_L1x8_SUB4 .LDTRMM_L1x8_LOOP_START: LOAD1x8_1 KERNEL1x8_I1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 addic. L, L, -2 ble .LDTRMM_L1x8_LOOP_END .align 5 .LDTRMM_L1x8_LOOP: KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 addic. L, L, -1 bgt .LDTRMM_L1x8_LOOP .LDTRMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_E2 b .LDTRMM_L1x8_SUB1 .LDTRMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 b .LDTRMM_L1x8_SUB1 .LDTRMM_L1x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x8_SUBI1 addic. L, L, -1 ble .LDTRMM_L1x8_SAVE b .LDTRMM_L1x8_SUB2 .LDTRMM_L1x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LDTRMM_L1x8_SAVE .LDTRMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. 
L, L, -1 bgt .LDTRMM_L1x8_SUB2 .LDTRMM_L1x8_SAVE: SAVE1x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 8 // KK += Number of values in A #endif .LDTRMM_L1x8_END: .LDTRMM_L1x4_BEGIN: andi. T1, M, 4 ble .LDTRMM_L1x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 3 // Number of values in B shifted slwi T2, KK, 5 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble .LDTRMM_L1x4_SUB0 cmpwi cr0, L, 1 ble .LDTRMM_L1x4_SUB4 .LDTRMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 addic. L, L, -2 ble .LDTRMM_L1x4_LOOP_END .align 5 .LDTRMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 addic. L, L, -1 bgt .LDTRMM_L1x4_LOOP .LDTRMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_E2 b .LDTRMM_L1x4_SUB1 .LDTRMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 b .LDTRMM_L1x4_SUB1 .LDTRMM_L1x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x4_SUBI1 addic. L, L, -1 ble .LDTRMM_L1x4_SAVE b .LDTRMM_L1x4_SUB2 .LDTRMM_L1x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LDTRMM_L1x4_SAVE .LDTRMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 bgt .LDTRMM_L1x4_SUB2 .LDTRMM_L1x4_SAVE: SAVE1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 4 // KK += Number of values in A #endif .LDTRMM_L1x4_END: .LDTRMM_L1x2_BEGIN: andi. T1, M, 2 ble .LDTRMM_L1x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 3 // Number of values in B shifted slwi T2, KK, 4 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. 
L, K1, 3 // KTEMP / 8 -> L ble .LDTRMM_L1x2_SUB0 cmpwi cr0, L, 1 ble .LDTRMM_L1x2_SUB4 .LDTRMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 addic. L, L, -2 ble .LDTRMM_L1x2_LOOP_END .align 5 .LDTRMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 addic. L, L, -1 bgt .LDTRMM_L1x2_LOOP .LDTRMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_E2 b .LDTRMM_L1x2_SUB1 .LDTRMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 b .LDTRMM_L1x2_SUB1 .LDTRMM_L1x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x2_SUBI1 addic. L, L, -1 ble .LDTRMM_L1x2_SAVE b .LDTRMM_L1x2_SUB2 .LDTRMM_L1x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LDTRMM_L1x2_SAVE .LDTRMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 bgt .LDTRMM_L1x2_SUB2 .LDTRMM_L1x2_SAVE: SAVE1x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 2 // KK += Number of values in A #endif .LDTRMM_L1x2_END: .LDTRMM_L1x1_BEGIN: andi. T1, M, 1 ble .LDTRMM_L1x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 3 // Number of values in B shifted slwi T2, KK, 3 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble .LDTRMM_L1x1_SUB0 cmpwi cr0, L, 1 ble .LDTRMM_L1x1_SUB4 .LDTRMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 addic. L, L, -2 ble .LDTRMM_L1x1_LOOP_END .align 5 .LDTRMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 addic. L, L, -1 bgt .LDTRMM_L1x1_LOOP .LDTRMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_E2 b .LDTRMM_L1x1_SUB1 .LDTRMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 b .LDTRMM_L1x1_SUB1 .LDTRMM_L1x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x1_SUBI1 addic. L, L, -1 ble .LDTRMM_L1x1_SAVE b .LDTRMM_L1x1_SUB2 .LDTRMM_L1x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LDTRMM_L1x1_SAVE .LDTRMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. 
L, L, -1 bgt .LDTRMM_L1x1_SUB2 .LDTRMM_L1x1_SAVE: SAVE1x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 1 // KK += Number of values in A #endif .LDTRMM_L1x1_END: #if !defined(LEFT) addi KK, KK, 1 // KK += Number of values in B #endif .LDTRMM_L1_END: OpenBLAS-0.2.20/kernel/power/dtrmm_macros_16x4_power8.S000066400000000000000000001556411313527062700224750ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/03/05 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /********************************************************************* * Macros for N=4, M=16 * *********************************************************************/ .macro LOAD4x16_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 64 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 64 addi BO, BO, 32 .endm .macro KERNEL4x16_I1 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO xvmuldp vs36, vs4, vs24 xvmuldp vs37, vs5, vs24 xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO xvmuldp vs44, vs4, vs25 xvmuldp vs45, vs5, vs25 xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 addi AO, AO, 64 xvmuldp vs48, vs0, vs26 xvmuldp vs49, vs1, vs26 xvmuldp vs50, vs2, vs26 xvmuldp vs51, vs3, vs26 lxvd2x vs12, 0, AO lxvd2x vs13, o16, AO xvmuldp vs52, vs4, vs26 xvmuldp vs53, vs5, vs26 xvmuldp vs54, vs6, vs26 xvmuldp vs55, vs7, vs26 lxvd2x vs14, o32, AO lxvd2x vs15, o48, AO xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 xvmuldp vs58, vs2, vs27 xvmuldp vs59, vs3, vs27 lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO xvmuldp vs60, vs4, vs27 xvmuldp vs61, vs5, vs27 xvmuldp vs62, vs6, vs27 xvmuldp vs63, vs7, vs27 addi AO, AO, 64 addi BO, BO, 32 .endm .macro KERNEL4x16_1 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO xvmaddadp vs44, vs4, vs25 xvmaddadp vs45, vs5, vs25 xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 addi AO, AO, 64 xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 xvmaddadp vs50, vs2, vs26 xvmaddadp vs51, vs3, vs26 lxvd2x vs12, 0, AO lxvd2x vs13, o16, AO xvmaddadp vs52, vs4, vs26 xvmaddadp vs53, vs5, vs26 xvmaddadp vs54, vs6, vs26 xvmaddadp vs55, vs7, vs26 lxvd2x vs14, o32, AO lxvd2x vs15, o48, AO xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 xvmaddadp vs58, vs2, vs27 xvmaddadp vs59, vs3, vs27 lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO xvmaddadp vs60, vs4, vs27 xvmaddadp vs61, vs5, vs27 xvmaddadp vs62, vs6, vs27 xvmaddadp vs63, vs7, vs27 addi AO, AO, 64 addi BO, BO, 32 .endm .macro KERNEL4x16_2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO xvmaddadp vs36, vs12, vs28 xvmaddadp vs37, vs13, vs28 xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO xvmaddadp vs44, 
vs12, vs29 xvmaddadp vs45, vs13, vs29 xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 addi AO, AO, 64 xvmaddadp vs48, vs8, vs30 xvmaddadp vs49, vs9, vs30 xvmaddadp vs50, vs10, vs30 xvmaddadp vs51, vs11, vs30 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO xvmaddadp vs52, vs12, vs30 xvmaddadp vs53, vs13, vs30 xvmaddadp vs54, vs14, vs30 xvmaddadp vs55, vs15, vs30 lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 xvmaddadp vs58, vs10, vs31 xvmaddadp vs59, vs11, vs31 lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO xvmaddadp vs60, vs12, vs31 xvmaddadp vs61, vs13, vs31 xvmaddadp vs62, vs14, vs31 xvmaddadp vs63, vs15, vs31 addi AO, AO, 64 addi BO, BO, 32 .endm .macro KERNEL4x16_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 xvmaddadp vs36, vs12, vs28 xvmaddadp vs37, vs13, vs28 xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 xvmaddadp vs44, vs12, vs29 xvmaddadp vs45, vs13, vs29 xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 xvmaddadp vs48, vs8, vs30 xvmaddadp vs49, vs9, vs30 xvmaddadp vs50, vs10, vs30 xvmaddadp vs51, vs11, vs30 xvmaddadp vs52, vs12, vs30 xvmaddadp vs53, vs13, vs30 xvmaddadp vs54, vs14, vs30 xvmaddadp vs55, vs15, vs30 xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 xvmaddadp vs58, vs10, vs31 xvmaddadp vs59, vs11, vs31 xvmaddadp vs60, vs12, vs31 xvmaddadp vs61, vs13, vs31 xvmaddadp vs62, vs14, vs31 xvmaddadp vs63, vs15, vs31 .endm .macro KERNEL4x16_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 64 addi BO, BO, 32 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 xvmuldp vs36, vs4, vs24 xvmuldp vs37, vs5, vs24 xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 xvmuldp vs44, vs4, vs25 xvmuldp vs45, vs5, vs25 xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs49, vs1, vs26 xvmuldp vs50, vs2, vs26 xvmuldp vs51, vs3, vs26 xvmuldp vs52, vs4, vs26 xvmuldp vs53, vs5, vs26 xvmuldp vs54, vs6, vs26 xvmuldp vs55, vs7, vs26 xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 xvmuldp vs58, vs2, vs27 xvmuldp vs59, vs3, vs27 xvmuldp vs60, vs4, vs27 xvmuldp vs61, vs5, vs27 xvmuldp vs62, vs6, vs27 xvmuldp vs63, vs7, vs27 .endm .macro KERNEL4x16_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 64 addi BO, BO, 32 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 xvmaddadp vs44, vs4, vs25 xvmaddadp vs45, vs5, vs25 xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 xvmaddadp vs50, vs2, vs26 xvmaddadp vs51, vs3, vs26 xvmaddadp vs52, vs4, vs26 xvmaddadp vs53, vs5, vs26 xvmaddadp 
vs54, vs6, vs26 xvmaddadp vs55, vs7, vs26 xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 xvmaddadp vs58, vs2, vs27 xvmaddadp vs59, vs3, vs27 xvmaddadp vs60, vs4, vs27 xvmaddadp vs61, vs5, vs27 xvmaddadp vs62, vs6, vs27 xvmaddadp vs63, vs7, vs27 .endm .macro SAVE4x16 mr T1, CO addi T2, T1, 64 #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 lxvd2x vs2, o32, T1 lxvd2x vs3, o48, T1 lxvd2x vs4, 0, T2 lxvd2x vs5, o16, T2 lxvd2x vs6, o32, T2 lxvd2x vs7, o48, T2 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r xvmaddadp vs2, vs34, alpha_r xvmaddadp vs3, vs35, alpha_r xvmaddadp vs4, vs36, alpha_r xvmaddadp vs5, vs37, alpha_r xvmaddadp vs6, vs38, alpha_r xvmaddadp vs7, vs39, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r xvmuldp vs2, vs34, alpha_r xvmuldp vs3, vs35, alpha_r xvmuldp vs4, vs36, alpha_r xvmuldp vs5, vs37, alpha_r xvmuldp vs6, vs38, alpha_r xvmuldp vs7, vs39, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 stxvd2x vs2, o32, T1 stxvd2x vs3, o48, T1 stxvd2x vs4, 0, T2 stxvd2x vs5, o16, T2 stxvd2x vs6, o32, T2 stxvd2x vs7, o48, T2 add T1, T1, LDC add T2, T2, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 lxvd2x vs9, o16, T1 lxvd2x vs10, o32, T1 lxvd2x vs11, o48, T1 lxvd2x vs12, 0, T2 lxvd2x vs13, o16, T2 lxvd2x vs14, o32, T2 lxvd2x vs15, o48, T2 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r xvmaddadp vs9, vs41, alpha_r xvmaddadp vs10, vs42, alpha_r xvmaddadp vs11, vs43, alpha_r xvmaddadp vs12, vs44, alpha_r xvmaddadp vs13, vs45, alpha_r xvmaddadp vs14, vs46, alpha_r xvmaddadp vs15, vs47, alpha_r #else xvmuldp vs8, vs40, alpha_r xvmuldp vs9, vs41, alpha_r xvmuldp vs10, vs42, alpha_r xvmuldp vs11, vs43, alpha_r xvmuldp vs12, vs44, alpha_r xvmuldp vs13, vs45, alpha_r xvmuldp vs14, vs46, alpha_r xvmuldp vs15, vs47, alpha_r #endif stxvd2x vs8, 0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 stxvd2x vs12, 0, T2 stxvd2x vs13, o16, T2 stxvd2x vs14, o32, T2 stxvd2x vs15, o48, T2 add T1, T1, LDC add T2, T2, LDC #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 lxvd2x vs2, o32, T1 lxvd2x vs3, o48, T1 lxvd2x vs4, 0, T2 lxvd2x vs5, o16, T2 lxvd2x vs6, o32, T2 lxvd2x vs7, o48, T2 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs48, alpha_r xvmaddadp vs1, vs49, alpha_r xvmaddadp vs2, vs50, alpha_r xvmaddadp vs3, vs51, alpha_r xvmaddadp vs4, vs52, alpha_r xvmaddadp vs5, vs53, alpha_r xvmaddadp vs6, vs54, alpha_r xvmaddadp vs7, vs55, alpha_r #else xvmuldp vs0, vs48, alpha_r xvmuldp vs1, vs49, alpha_r xvmuldp vs2, vs50, alpha_r xvmuldp vs3, vs51, alpha_r xvmuldp vs4, vs52, alpha_r xvmuldp vs5, vs53, alpha_r xvmuldp vs6, vs54, alpha_r xvmuldp vs7, vs55, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 stxvd2x vs2, o32, T1 stxvd2x vs3, o48, T1 stxvd2x vs4, 0, T2 stxvd2x vs5, o16, T2 stxvd2x vs6, o32, T2 stxvd2x vs7, o48, T2 add T1, T1, LDC add T2, T2, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 lxvd2x vs9, o16, T1 lxvd2x vs10, o32, T1 lxvd2x vs11, o48, T1 lxvd2x vs12, 0, T2 lxvd2x vs13, o16, T2 lxvd2x vs14, o32, T2 lxvd2x vs15, o48, T2 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs56, alpha_r xvmaddadp vs9, vs57, alpha_r xvmaddadp vs10, vs58, alpha_r xvmaddadp vs11, vs59, alpha_r xvmaddadp vs12, vs60, alpha_r xvmaddadp vs13, vs61, alpha_r xvmaddadp vs14, vs62, alpha_r xvmaddadp vs15, vs63, alpha_r #else xvmuldp vs8, vs56, alpha_r xvmuldp vs9, vs57, alpha_r xvmuldp vs10, vs58, alpha_r xvmuldp vs11, vs59, alpha_r xvmuldp vs12, vs60, alpha_r xvmuldp vs13, vs61, alpha_r xvmuldp vs14, vs62, alpha_r xvmuldp vs15, 
vs63, alpha_r #endif stxvd2x vs8, 0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 stxvd2x vs12, 0, T2 stxvd2x vs13, o16, T2 stxvd2x vs14, o32, T2 stxvd2x vs15, o48, T2 addi CO, CO, 128 .endm /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ .macro LOAD4x8_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 64 addi BO, BO, 32 .endm .macro KERNEL4x8_I1 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs49, vs1, vs26 lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO xvmuldp vs50, vs2, vs26 xvmuldp vs51, vs3, vs26 lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 xvmuldp vs58, vs2, vs27 xvmuldp vs59, vs3, vs27 addi AO, AO, 64 addi BO, BO, 32 .endm .macro KERNEL4x8_1 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO xvmaddadp vs50, vs2, vs26 xvmaddadp vs51, vs3, vs26 lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 xvmaddadp vs58, vs2, vs27 xvmaddadp vs59, vs3, vs27 addi AO, AO, 64 addi BO, BO, 32 .endm .macro KERNEL4x8_2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO xvmaddadp vs48, vs8, vs30 xvmaddadp vs49, vs9, vs30 lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO xvmaddadp vs50, vs10, vs30 xvmaddadp vs51, vs11, vs30 lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 xvmaddadp vs58, vs10, vs31 xvmaddadp vs59, vs11, vs31 addi AO, AO, 64 addi BO, BO, 32 .endm .macro KERNEL4x8_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 xvmaddadp vs48, vs8, vs30 xvmaddadp vs49, vs9, vs30 xvmaddadp vs50, vs10, vs30 xvmaddadp vs51, vs11, vs30 xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 xvmaddadp vs58, vs10, vs31 xvmaddadp vs59, vs11, vs31 .endm .macro KERNEL4x8_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 64 addi BO, BO, 32 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs49, vs1, vs26 xvmuldp vs50, vs2, vs26 xvmuldp vs51, vs3, vs26 xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 xvmuldp vs58, vs2, vs27 xvmuldp vs59, vs3, vs27 .endm .macro KERNEL4x8_SUB1 lxvd2x 
vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 64 addi BO, BO, 32 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 xvmaddadp vs50, vs2, vs26 xvmaddadp vs51, vs3, vs26 xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 xvmaddadp vs58, vs2, vs27 xvmaddadp vs59, vs3, vs27 .endm .macro SAVE4x8 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 lxvd2x vs2, o32, T1 lxvd2x vs3, o48, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r xvmaddadp vs2, vs34, alpha_r xvmaddadp vs3, vs35, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r xvmuldp vs2, vs34, alpha_r xvmuldp vs3, vs35, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 stxvd2x vs2, o32, T1 stxvd2x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 lxvd2x vs9, o16, T1 lxvd2x vs10, o32, T1 lxvd2x vs11, o48, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r xvmaddadp vs9, vs41, alpha_r xvmaddadp vs10, vs42, alpha_r xvmaddadp vs11, vs43, alpha_r #else xvmuldp vs8, vs40, alpha_r xvmuldp vs9, vs41, alpha_r xvmuldp vs10, vs42, alpha_r xvmuldp vs11, vs43, alpha_r #endif stxvd2x vs8, 0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 lxvd2x vs2, o32, T1 lxvd2x vs3, o48, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs48, alpha_r xvmaddadp vs1, vs49, alpha_r xvmaddadp vs2, vs50, alpha_r xvmaddadp vs3, vs51, alpha_r #else xvmuldp vs0, vs48, alpha_r xvmuldp vs1, vs49, alpha_r xvmuldp vs2, vs50, alpha_r xvmuldp vs3, vs51, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 stxvd2x vs2, o32, T1 stxvd2x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 lxvd2x vs9, o16, T1 lxvd2x vs10, o32, T1 lxvd2x vs11, o48, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs56, alpha_r xvmaddadp vs9, vs57, alpha_r xvmaddadp vs10, vs58, alpha_r xvmaddadp vs11, vs59, alpha_r #else xvmuldp vs8, vs56, alpha_r xvmuldp vs9, vs57, alpha_r xvmuldp vs10, vs58, alpha_r xvmuldp vs11, vs59, alpha_r #endif stxvd2x vs8, 0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 addi CO, CO, 64 .endm /********************************************************************* * Macros for N=4, M=4 * *********************************************************************/ .macro LOAD4x4_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 32 addi BO, BO, 32 .endm .macro KERNEL4x4_I1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO addi AO, AO, 32 addi BO, BO, 32 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs49, vs1, vs26 xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 .endm .macro KERNEL4x4_1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO addi AO, AO, 32 addi BO, BO, 32 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 xvmaddadp vs56, vs0, vs27 
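// Annotation (not part of the original source): in these 4x4 macros, vs0/vs1
// hold four doubles of A (lxvd2x loads two doubles per VSX register) and
// vs24-vs27 hold the four B values splatted across both lanes (lxvdsx); the
// xvmaddadp vector FMAs accumulate the 4x4 double-precision C tile in
// vs32/vs33, vs40/vs41, vs48/vs49 and vs56/vs57.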
xvmaddadp vs57, vs1, vs27 .endm .macro KERNEL4x4_2 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 32 addi BO, BO, 32 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs48, vs8, vs30 xvmaddadp vs49, vs9, vs30 xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 .endm .macro KERNEL4x4_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs48, vs8, vs30 xvmaddadp vs49, vs9, vs30 xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 .endm .macro KERNEL4x4_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 32 addi BO, BO, 32 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs49, vs1, vs26 xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 .endm .macro KERNEL4x4_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 32 addi BO, BO, 32 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 .endm .macro SAVE4x4 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 lxvd2x vs9, o16, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r xvmaddadp vs9, vs41, alpha_r #else xvmuldp vs8, vs40, alpha_r xvmuldp vs9, vs41, alpha_r #endif stxvd2x vs8, 0, T1 stxvd2x vs9, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs48, alpha_r xvmaddadp vs1, vs49, alpha_r #else xvmuldp vs0, vs48, alpha_r xvmuldp vs1, vs49, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 lxvd2x vs9, o16, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs56, alpha_r xvmaddadp vs9, vs57, alpha_r #else xvmuldp vs8, vs56, alpha_r xvmuldp vs9, vs57, alpha_r #endif stxvd2x vs8, 0, T1 stxvd2x vs9, o16, T1 addi CO, CO, 32 .endm /********************************************************************* * Macros for N=4, M=2 * *********************************************************************/ .macro LOAD4x2_1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 16 addi BO, BO, 32 .endm .macro KERNEL4x2_I1 lxvd2x vs8, 0, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO addi AO, AO, 16 addi BO, BO, 32 xvmuldp vs32, vs0, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs56, vs0, vs27 .endm .macro KERNEL4x2_1 lxvd2x vs8, 0, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO addi AO, AO, 16 addi BO, BO, 32 xvmaddadp vs32, vs0, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs56, vs0, vs27 .endm .macro KERNEL4x2_2 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 16 addi BO, BO, 32 xvmaddadp vs32, vs8, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs48, vs8, 
vs30 xvmaddadp vs56, vs8, vs31 .endm .macro KERNEL4x2_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs48, vs8, vs30 xvmaddadp vs56, vs8, vs31 .endm .macro KERNEL4x2_SUBI1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 16 addi BO, BO, 32 xvmuldp vs32, vs0, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs56, vs0, vs27 .endm .macro KERNEL4x2_SUB1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 16 addi BO, BO, 32 xvmaddadp vs32, vs0, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs56, vs0, vs27 .endm .macro SAVE4x2 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r #else xvmuldp vs0, vs32, alpha_r #endif stxvd2x vs0, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r #else xvmuldp vs8, vs40, alpha_r #endif stxvd2x vs8, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs48, alpha_r #else xvmuldp vs0, vs48, alpha_r #endif stxvd2x vs0, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs56, alpha_r #else xvmuldp vs8, vs56, alpha_r #endif stxvd2x vs8, 0, T1 addi CO, CO, 16 .endm /********************************************************************* * Macros for N=4, M=1 * *********************************************************************/ .macro LOAD4x1_1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO lxsdx vs26, o16, BO lxsdx vs27, o24, BO addi AO, AO, 8 addi BO, BO, 32 .endm .macro KERNEL4x1_I1 lxsdx vs8, 0, AO lxsdx vs28, 0, BO lxsdx vs29, o8, BO lxsdx vs30, o16, BO lxsdx vs31, o24, BO addi AO, AO, 8 addi BO, BO, 32 xsmuldp vs32, vs0, vs24 xsmuldp vs40, vs0, vs25 xsmuldp vs48, vs0, vs26 xsmuldp vs56, vs0, vs27 .endm .macro KERNEL4x1_1 lxsdx vs8, 0, AO lxsdx vs28, 0, BO lxsdx vs29, o8, BO lxsdx vs30, o16, BO lxsdx vs31, o24, BO addi AO, AO, 8 addi BO, BO, 32 xsmaddadp vs32, vs0, vs24 xsmaddadp vs40, vs0, vs25 xsmaddadp vs48, vs0, vs26 xsmaddadp vs56, vs0, vs27 .endm .macro KERNEL4x1_2 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO lxsdx vs26, o16, BO lxsdx vs27, o24, BO addi AO, AO, 8 addi BO, BO, 32 xsmaddadp vs32, vs8, vs28 xsmaddadp vs40, vs8, vs29 xsmaddadp vs48, vs8, vs30 xsmaddadp vs56, vs8, vs31 .endm .macro KERNEL4x1_E2 xsmaddadp vs32, vs8, vs28 xsmaddadp vs40, vs8, vs29 xsmaddadp vs48, vs8, vs30 xsmaddadp vs56, vs8, vs31 .endm .macro KERNEL4x1_SUBI1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO lxsdx vs26, o16, BO lxsdx vs27, o24, BO addi AO, AO, 8 addi BO, BO, 32 xsmuldp vs32, vs0, vs24 xsmuldp vs40, vs0, vs25 xsmuldp vs48, vs0, vs26 xsmuldp vs56, vs0, vs27 .endm .macro KERNEL4x1_SUB1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO lxsdx vs26, o16, BO lxsdx vs27, o24, BO addi AO, AO, 8 addi BO, BO, 32 xsmaddadp vs32, vs0, vs24 xsmaddadp vs40, vs0, vs25 xsmaddadp vs48, vs0, vs26 xsmaddadp vs56, vs0, vs27 .endm .macro SAVE4x1 mr T1, CO #ifndef TRMMKERNEL lxsdx vs0, 0, T1 #endif #ifndef TRMMKERNEL xsmaddadp vs0, vs32, alpha_r #else xsmuldp vs0, vs32, alpha_r #endif stxsdx vs0, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsdx vs8, 0, T1 #endif #ifndef TRMMKERNEL xsmaddadp vs8, vs40, alpha_r #else xsmuldp vs8, vs40, alpha_r #endif stxsdx vs8, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsdx vs0, 0, T1 #endif #ifndef TRMMKERNEL xsmaddadp vs0, vs48, alpha_r #else xsmuldp 
vs0, vs48, alpha_r #endif stxsdx vs0, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsdx vs8, 0, T1 #endif #ifndef TRMMKERNEL xsmaddadp vs8, vs56, alpha_r #else xsmuldp vs8, vs56, alpha_r #endif stxsdx vs8, 0, T1 addi CO, CO, 8 .endm /********************************************************************* * Macros for N=2, M=16 * *********************************************************************/ .macro LOAD2x16_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 64 addi BO, BO, 16 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 .endm .macro KERNEL2x16_I1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO addi AO, AO, 64 addi BO, BO, 16 lxvd2x vs12, 0, AO lxvd2x vs13, o16, AO lxvd2x vs14, o32, AO lxvd2x vs15, o48, AO addi AO, AO, 64 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 xvmuldp vs36, vs4, vs24 xvmuldp vs37, vs5, vs24 xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 xvmuldp vs44, vs4, vs25 xvmuldp vs45, vs5, vs25 xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 .endm .macro KERNEL2x16_1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO addi AO, AO, 64 addi BO, BO, 16 lxvd2x vs12, 0, AO lxvd2x vs13, o16, AO lxvd2x vs14, o32, AO lxvd2x vs15, o48, AO addi AO, AO, 64 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 xvmaddadp vs44, vs4, vs25 xvmaddadp vs45, vs5, vs25 xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 .endm .macro KERNEL2x16_2 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 64 addi BO, BO, 16 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 xvmaddadp vs36, vs12, vs28 xvmaddadp vs37, vs13, vs28 xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 xvmaddadp vs44, vs12, vs29 xvmaddadp vs45, vs13, vs29 xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 .endm .macro KERNEL2x16_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 xvmaddadp vs36, vs12, vs28 xvmaddadp vs37, vs13, vs28 xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 xvmaddadp vs44, vs12, vs29 xvmaddadp vs45, vs13, vs29 xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 .endm .macro KERNEL2x16_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 64 addi BO, BO, 16 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 xvmuldp vs36, vs4, vs24 
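/* Annotation (not part of the original source): KERNEL2x16_SUBI1 initialises
   the accumulators vs32-vs47 with plain multiplies (xvmuldp) for the first k
   iteration, while KERNEL2x16_SUB1 accumulates into them with FMAs
   (xvmaddadp).  A minimal C model of one such k step, assuming packed A
   (16 doubles per k) and packed B (2 doubles per k); the names a, b and acc
   are illustrative only, not taken from this file:

       for (int j = 0; j < 2; j++)
           for (int i = 0; i < 16; i++)
               acc[j][i] += a[i] * b[j];   // SUBI1 uses '=' instead of '+='
*/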
xvmuldp vs37, vs5, vs24 xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 xvmuldp vs44, vs4, vs25 xvmuldp vs45, vs5, vs25 xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 .endm .macro KERNEL2x16_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 64 addi BO, BO, 16 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 xvmaddadp vs44, vs4, vs25 xvmaddadp vs45, vs5, vs25 xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 .endm .macro SAVE2x16 mr T1, CO addi T2, T1, 64 #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 lxvd2x vs2, o32, T1 lxvd2x vs3, o48, T1 lxvd2x vs4, 0, T2 lxvd2x vs5, o16, T2 lxvd2x vs6, o32, T2 lxvd2x vs7, o48, T2 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r xvmaddadp vs2, vs34, alpha_r xvmaddadp vs3, vs35, alpha_r xvmaddadp vs4, vs36, alpha_r xvmaddadp vs5, vs37, alpha_r xvmaddadp vs6, vs38, alpha_r xvmaddadp vs7, vs39, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r xvmuldp vs2, vs34, alpha_r xvmuldp vs3, vs35, alpha_r xvmuldp vs4, vs36, alpha_r xvmuldp vs5, vs37, alpha_r xvmuldp vs6, vs38, alpha_r xvmuldp vs7, vs39, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 stxvd2x vs2, o32, T1 stxvd2x vs3, o48, T1 stxvd2x vs4, 0, T2 stxvd2x vs5, o16, T2 stxvd2x vs6, o32, T2 stxvd2x vs7, o48, T2 add T1, T1, LDC add T2, T2, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 lxvd2x vs9, o16, T1 lxvd2x vs10, o32, T1 lxvd2x vs11, o48, T1 lxvd2x vs12, 0, T2 lxvd2x vs13, o16, T2 lxvd2x vs14, o32, T2 lxvd2x vs15, o48, T2 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r xvmaddadp vs9, vs41, alpha_r xvmaddadp vs10, vs42, alpha_r xvmaddadp vs11, vs43, alpha_r xvmaddadp vs12, vs44, alpha_r xvmaddadp vs13, vs45, alpha_r xvmaddadp vs14, vs46, alpha_r xvmaddadp vs15, vs47, alpha_r #else xvmuldp vs8, vs40, alpha_r xvmuldp vs9, vs41, alpha_r xvmuldp vs10, vs42, alpha_r xvmuldp vs11, vs43, alpha_r xvmuldp vs12, vs44, alpha_r xvmuldp vs13, vs45, alpha_r xvmuldp vs14, vs46, alpha_r xvmuldp vs15, vs47, alpha_r #endif stxvd2x vs8, 0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 stxvd2x vs12, 0, T2 stxvd2x vs13, o16, T2 stxvd2x vs14, o32, T2 stxvd2x vs15, o48, T2 addi CO, CO, 128 .endm /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ .macro LOAD2x8_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 64 addi BO, BO, 16 .endm .macro KERNEL2x8_I1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO addi AO, AO, 64 addi BO, BO, 16 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 .endm .macro KERNEL2x8_1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO lxvdsx vs28, 0, 
BO lxvdsx vs29, o8, BO addi AO, AO, 64 addi BO, BO, 16 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 .endm .macro KERNEL2x8_2 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 64 addi BO, BO, 16 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 .endm .macro KERNEL2x8_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 .endm .macro KERNEL2x8_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 64 addi BO, BO, 16 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 .endm .macro KERNEL2x8_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 64 addi BO, BO, 16 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 .endm .macro SAVE2x8 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 lxvd2x vs2, o32, T1 lxvd2x vs3, o48, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r xvmaddadp vs2, vs34, alpha_r xvmaddadp vs3, vs35, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r xvmuldp vs2, vs34, alpha_r xvmuldp vs3, vs35, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 stxvd2x vs2, o32, T1 stxvd2x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 lxvd2x vs9, o16, T1 lxvd2x vs10, o32, T1 lxvd2x vs11, o48, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r xvmaddadp vs9, vs41, alpha_r xvmaddadp vs10, vs42, alpha_r xvmaddadp vs11, vs43, alpha_r #else xvmuldp vs8, vs40, alpha_r xvmuldp vs9, vs41, alpha_r xvmuldp vs10, vs42, alpha_r xvmuldp vs11, vs43, alpha_r #endif stxvd2x vs8, 0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 addi CO, CO, 64 .endm /********************************************************************* * Macros for N=2, M=4 * *********************************************************************/ .macro LOAD2x4_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 32 addi BO, BO, 16 .endm .macro KERNEL2x4_I1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO addi AO, AO, 32 addi BO, BO, 16 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 .endm .macro KERNEL2x4_1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO addi AO, AO, 32 addi BO, BO, 16 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 .endm .macro KERNEL2x4_2 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 32 addi BO, BO, 16 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 
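/* vs32/vs33 above accumulate against the first broadcast B element (vs28);
   vs40/vs41 below use the second (vs29), giving one accumulator group per
   column of the N=2 result. */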
xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 .endm .macro KERNEL2x4_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 .endm .macro KERNEL2x4_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 32 addi BO, BO, 16 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 .endm .macro KERNEL2x4_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 32 addi BO, BO, 16 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 .endm .macro SAVE2x4 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 lxvd2x vs9, o16, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r xvmaddadp vs9, vs41, alpha_r #else xvmuldp vs8, vs40, alpha_r xvmuldp vs9, vs41, alpha_r #endif stxvd2x vs8, 0, T1 stxvd2x vs9, o16, T1 addi CO, CO, 32 .endm /********************************************************************* * Macros for N=2, M=2 * *********************************************************************/ .macro LOAD2x2_1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 16 addi BO, BO, 16 .endm .macro KERNEL2x2_I1 lxvd2x vs8, 0, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO addi AO, AO, 16 addi BO, BO, 16 xvmuldp vs32, vs0, vs24 xvmuldp vs40, vs0, vs25 .endm .macro KERNEL2x2_1 lxvd2x vs8, 0, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO addi AO, AO, 16 addi BO, BO, 16 xvmaddadp vs32, vs0, vs24 xvmaddadp vs40, vs0, vs25 .endm .macro KERNEL2x2_2 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 16 addi BO, BO, 16 xvmaddadp vs32, vs8, vs28 xvmaddadp vs40, vs8, vs29 .endm .macro KERNEL2x2_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs40, vs8, vs29 .endm .macro KERNEL2x2_SUBI1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 16 addi BO, BO, 16 xvmuldp vs32, vs0, vs24 xvmuldp vs40, vs0, vs25 .endm .macro KERNEL2x2_SUB1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 16 addi BO, BO, 16 xvmaddadp vs32, vs0, vs24 xvmaddadp vs40, vs0, vs25 .endm .macro SAVE2x2 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r #else xvmuldp vs0, vs32, alpha_r #endif stxvd2x vs0, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r #else xvmuldp vs8, vs40, alpha_r #endif stxvd2x vs8, 0, T1 addi CO, CO, 16 .endm /********************************************************************* * Macros for N=2, M=1 * *********************************************************************/ .macro LOAD2x1_1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO addi AO, AO, 8 addi BO, BO, 16 .endm .macro KERNEL2x1_I1 lxsdx vs8, 0, AO lxsdx vs28, 0, BO lxsdx vs29, o8, BO addi AO, AO, 8 addi BO, BO, 16 xsmuldp vs32, vs0, vs24 xsmuldp vs40, vs0, vs25 .endm .macro KERNEL2x1_1 lxsdx vs8, 0, AO lxsdx vs28, 0, BO lxsdx vs29, o8, BO addi AO, AO, 8 addi BO, BO, 16 xsmaddadp vs32, vs0, vs24 xsmaddadp vs40, vs0, vs25 .endm .macro KERNEL2x1_2 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO addi AO, AO, 8 addi BO, BO, 16 xsmaddadp vs32, vs8, vs28 xsmaddadp vs40, vs8, vs29 .endm .macro KERNEL2x1_E2 
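/* The *_E2 macros are the drain step of the software-pipelined loop: they
   issue only the final multiply-adds on operands loaded by the preceding
   step and perform no further loads or pointer increments. */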
xsmaddadp vs32, vs8, vs28 xsmaddadp vs40, vs8, vs29 .endm .macro KERNEL2x1_SUBI1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO addi AO, AO, 8 addi BO, BO, 16 xsmuldp vs32, vs0, vs24 xsmuldp vs40, vs0, vs25 .endm .macro KERNEL2x1_SUB1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO addi AO, AO, 8 addi BO, BO, 16 xsmaddadp vs32, vs0, vs24 xsmaddadp vs40, vs0, vs25 .endm .macro SAVE2x1 mr T1, CO #ifndef TRMMKERNEL lxsdx vs0, 0, T1 #endif #ifndef TRMMKERNEL xsmaddadp vs0, vs32, alpha_r #else xsmuldp vs0, vs32, alpha_r #endif stxsdx vs0, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsdx vs8, 0, T1 #endif #ifndef TRMMKERNEL xsmaddadp vs8, vs40, alpha_r #else xsmuldp vs8, vs40, alpha_r #endif stxsdx vs8, 0, T1 addi CO, CO, 8 .endm /********************************************************************* * Macros for N=1, M=16 * *********************************************************************/ .macro LOAD1x16_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO addi AO, AO, 64 addi BO, BO, 8 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 .endm .macro KERNEL1x16_I1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO lxvdsx vs28, 0, BO addi AO, AO, 64 addi BO, BO, 8 lxvd2x vs12, 0, AO lxvd2x vs13, o16, AO lxvd2x vs14, o32, AO lxvd2x vs15, o48, AO addi AO, AO, 64 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 xvmuldp vs36, vs4, vs24 xvmuldp vs37, vs5, vs24 xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 .endm .macro KERNEL1x16_1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO lxvdsx vs28, 0, BO addi AO, AO, 64 addi BO, BO, 8 lxvd2x vs12, 0, AO lxvd2x vs13, o16, AO lxvd2x vs14, o32, AO lxvd2x vs15, o48, AO addi AO, AO, 64 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 .endm .macro KERNEL1x16_2 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO addi AO, AO, 64 addi BO, BO, 8 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 xvmaddadp vs36, vs12, vs28 xvmaddadp vs37, vs13, vs28 xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 .endm .macro KERNEL1x16_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 xvmaddadp vs36, vs12, vs28 xvmaddadp vs37, vs13, vs28 xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 .endm .macro KERNEL1x16_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO addi AO, AO, 64 addi BO, BO, 8 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 xvmuldp vs36, vs4, vs24 xvmuldp vs37, vs5, vs24 xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 .endm .macro KERNEL1x16_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO addi AO, AO, 64 addi BO, BO, 8 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, 
vs3, vs24 xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 .endm .macro SAVE1x16 mr T1, CO addi T2, T1, 64 #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 lxvd2x vs2, o32, T1 lxvd2x vs3, o48, T1 lxvd2x vs4, 0, T2 lxvd2x vs5, o16, T2 lxvd2x vs6, o32, T2 lxvd2x vs7, o48, T2 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r xvmaddadp vs2, vs34, alpha_r xvmaddadp vs3, vs35, alpha_r xvmaddadp vs4, vs36, alpha_r xvmaddadp vs5, vs37, alpha_r xvmaddadp vs6, vs38, alpha_r xvmaddadp vs7, vs39, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r xvmuldp vs2, vs34, alpha_r xvmuldp vs3, vs35, alpha_r xvmuldp vs4, vs36, alpha_r xvmuldp vs5, vs37, alpha_r xvmuldp vs6, vs38, alpha_r xvmuldp vs7, vs39, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 stxvd2x vs2, o32, T1 stxvd2x vs3, o48, T1 stxvd2x vs4, 0, T2 stxvd2x vs5, o16, T2 stxvd2x vs6, o32, T2 stxvd2x vs7, o48, T2 addi CO, CO, 128 .endm /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ .macro LOAD1x8_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO addi AO, AO, 64 addi BO, BO, 8 .endm .macro KERNEL1x8_I1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO lxvdsx vs28, 0, BO addi AO, AO, 64 addi BO, BO, 8 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 .endm .macro KERNEL1x8_1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO lxvdsx vs28, 0, BO addi AO, AO, 64 addi BO, BO, 8 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 .endm .macro KERNEL1x8_2 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO addi AO, AO, 64 addi BO, BO, 8 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 .endm .macro KERNEL1x8_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 .endm .macro KERNEL1x8_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO addi AO, AO, 64 addi BO, BO, 8 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 .endm .macro KERNEL1x8_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO addi AO, AO, 64 addi BO, BO, 8 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 .endm .macro SAVE1x8 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 lxvd2x vs2, o32, T1 lxvd2x vs3, o48, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r xvmaddadp vs2, vs34, alpha_r xvmaddadp vs3, vs35, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r xvmuldp vs2, vs34, alpha_r xvmuldp vs3, vs35, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 stxvd2x vs2, o32, T1 stxvd2x vs3, o48, T1 addi CO, CO, 64 .endm /********************************************************************* * Macros for N=1, M=4 * *********************************************************************/ .macro LOAD1x4_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO addi AO, AO, 32 addi BO, BO, 8 .endm .macro KERNEL1x4_I1 lxvd2x vs8, 0, AO lxvd2x 
vs9, o16, AO lxvdsx vs28, 0, BO addi AO, AO, 32 addi BO, BO, 8 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 .endm .macro KERNEL1x4_1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvdsx vs28, 0, BO addi AO, AO, 32 addi BO, BO, 8 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 .endm .macro KERNEL1x4_2 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO addi AO, AO, 32 addi BO, BO, 8 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 .endm .macro KERNEL1x4_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 .endm .macro KERNEL1x4_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO addi AO, AO, 32 addi BO, BO, 8 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 .endm .macro KERNEL1x4_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO addi AO, AO, 32 addi BO, BO, 8 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 .endm .macro SAVE1x4 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 addi CO, CO, 32 .endm /********************************************************************* * Macros for N=1, M=2 * *********************************************************************/ .macro LOAD1x2_1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO addi AO, AO, 16 addi BO, BO, 8 .endm .macro KERNEL1x2_I1 lxvd2x vs8, 0, AO lxvdsx vs28, 0, BO addi AO, AO, 16 addi BO, BO, 8 xvmuldp vs32, vs0, vs24 .endm .macro KERNEL1x2_1 lxvd2x vs8, 0, AO lxvdsx vs28, 0, BO addi AO, AO, 16 addi BO, BO, 8 xvmaddadp vs32, vs0, vs24 .endm .macro KERNEL1x2_2 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO addi AO, AO, 16 addi BO, BO, 8 xvmaddadp vs32, vs8, vs28 .endm .macro KERNEL1x2_E2 xvmaddadp vs32, vs8, vs28 .endm .macro KERNEL1x2_SUBI1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO addi AO, AO, 16 addi BO, BO, 8 xvmuldp vs32, vs0, vs24 .endm .macro KERNEL1x2_SUB1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO addi AO, AO, 16 addi BO, BO, 8 xvmaddadp vs32, vs0, vs24 .endm .macro SAVE1x2 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r #else xvmuldp vs0, vs32, alpha_r #endif stxvd2x vs0, 0, T1 addi CO, CO, 16 .endm /********************************************************************* * Macros for N=1, M=1 * *********************************************************************/ .macro LOAD1x1_1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO addi AO, AO, 8 addi BO, BO, 8 .endm .macro KERNEL1x1_I1 lxsdx vs8, 0, AO lxsdx vs28, 0, BO addi AO, AO, 8 addi BO, BO, 8 xsmuldp vs32, vs0, vs24 .endm .macro KERNEL1x1_1 lxsdx vs8, 0, AO lxsdx vs28, 0, BO addi AO, AO, 8 addi BO, BO, 8 xsmaddadp vs32, vs0, vs24 .endm .macro KERNEL1x1_2 lxsdx vs0, 0, AO lxsdx vs24, 0, BO addi AO, AO, 8 addi BO, BO, 8 xsmaddadp vs32, vs8, vs28 .endm .macro KERNEL1x1_E2 xsmaddadp vs32, vs8, vs28 .endm .macro KERNEL1x1_SUBI1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO addi AO, AO, 8 addi BO, BO, 8 xsmuldp vs32, vs0, vs24 .endm .macro KERNEL1x1_SUB1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO addi AO, AO, 8 addi BO, BO, 8 xsmaddadp vs32, vs0, vs24 .endm .macro SAVE1x1 mr T1, CO #ifndef TRMMKERNEL lxsdx vs0, 0, T1 #endif #ifndef TRMMKERNEL xsmaddadp vs0, vs32, alpha_r #else xsmuldp vs0, vs32, alpha_r #endif stxsdx vs0, 0, T1 addi CO, CO, 8 .endm OpenBLAS-0.2.20/kernel/power/dtrsm_kernel_LT_16x4_power8.S000066400000000000000000000146171313527062700230730ustar00rootroot00000000000000/*********************************************************************/ 
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "def_vsx.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define o0 0 #define PRE r15 #define T4 r16 #define L r17 #define T3 r18 #define T2 r19 #define KK r20 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO r25 #define o8 r26 #define o16 r27 #define o24 r28 #define o32 r29 #define o48 r30 #define T1 r31 #include "dtrsm_macros_LT_16x4_power8.S" #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) std r17, 
256(SP) std r16, 264(SP) std r15, 272(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) stw r18, 196(SP) #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif cmpwi cr0, M, 0 ble L999 cmpwi cr0, N, 0 ble L999 cmpwi cr0, K, 0 ble L999 slwi LDC, LDC, BASE_SHIFT li o8, 8 li o16, 16 li o24, 24 li o32, 32 li o48, 48 li PRE, 384 mr KK, OFFSET #include "dtrsm_logic_LT_16x4_power8.S" L999: addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) lwz r18, 196(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/dtrsm_logic_LT_16x4_power8.S000066400000000000000000000211141313527062700226760ustar00rootroot00000000000000 srawi. J, N, 2 ble DSTRM_LT_L4_END DSTRM_LT_L4_BEGIN: mr CO, C mr AO, A slwi T1, LDC , 2 add C, C, T1 mr KK, OFFSET srawi. I, M, 4 ble DSTRM_LT_L4x16_END DSTRM_LT_L4x16_BEGIN: mr BO, B li L, -128 mr T1, CO add T2, T1, LDC add T3, T2, LDC add T4, T3, LDC and T1, T1, L and T2, T2, L and T3, T3, L and T4, T4, L dcbt T1, r0 dcbt T2, r0 dcbt T3, r0 dcbt T4, r0 addi T1, T1, 128 addi T2, T2, 128 addi T3, T3, 128 addi T4, T4, 128 dcbt T1, r0 dcbt T2, r0 dcbt T3, r0 dcbt T4, r0 DSTRM_LT_L4x16_LOOP_START: INIT_16x4 addic. L, KK, 0 ble- DSTRM_LT_L4x16_SAVE mtctr L DSTRM_LT_L4x16_LOOP: dcbt AO, PRE dcbt BO, PRE KERNEL_16x4 bdz- DSTRM_LT_L4x16_SAVE dcbt AO, PRE KERNEL_16x4 bdz- DSTRM_LT_L4x16_SAVE dcbt AO, PRE KERNEL_16x4 bdz- DSTRM_LT_L4x16_SAVE dcbt AO, PRE KERNEL_16x4 bdnz+ DSTRM_LT_L4x16_LOOP DSTRM_LT_L4x16_SAVE: SOLVE_LT_16x4 addi CO, CO, 16*SIZE sub T3, K, KK sub T4, K, KK slwi T3, T3, 4+BASE_SHIFT slwi T4, T4, 2+BASE_SHIFT add AO, AO, T3 add BO, BO, T4 addi KK, KK, 16 addic. I, I, -1 bgt DSTRM_LT_L4x16_BEGIN DSTRM_LT_L4x16_END: DSTRM_LT_L4x8_BEGIN: andi. T2, M, 15 ble DSTRM_LT_L4x1_END andi. T1, M, 8 ble DSTRM_LT_L4x8_END mr BO, B DSTRM_LT_L4x8_LOOP_START: INIT_8x4 addic. L, KK, 0 ble DSTRM_LT_L4x8_SAVE DSTRM_LT_L4x8_LOOP: KERNEL_8x4 addic. L, L, -1 bgt DSTRM_LT_L4x8_LOOP DSTRM_LT_L4x8_SAVE: SOLVE_LT_8x4 addi CO, CO, 8*SIZE sub T3, K, KK sub T4, K, KK slwi T3, T3, 3+BASE_SHIFT slwi T4, T4, 2+BASE_SHIFT add AO, AO, T3 add BO, BO, T4 addi KK, KK, 8 DSTRM_LT_L4x8_END: DSTRM_LT_L4x4_BEGIN: andi. 
T1, M, 4 ble DSTRM_LT_L4x4_END mr BO, B DSTRM_LT_L4x4_LOOP_START: INIT_4x4 addic. L, KK, 0 ble DSTRM_LT_L4x4_SAVE DSTRM_LT_L4x4_LOOP: KERNEL_4x4 addic. L, L, -1 bgt DSTRM_LT_L4x4_LOOP DSTRM_LT_L4x4_SAVE: SOLVE_LT_4x4 addi CO, CO, 4*SIZE sub T3, K, KK sub T4, K, KK slwi T3, T3, 2+BASE_SHIFT slwi T4, T4, 2+BASE_SHIFT add AO, AO, T3 add BO, BO, T4 addi KK, KK, 4 DSTRM_LT_L4x4_END: DSTRM_LT_L4x2_BEGIN: andi. T1, M, 2 ble DSTRM_LT_L4x2_END mr BO, B DSTRM_LT_L4x2_LOOP_START: INIT_2x4 addic. L, KK, 0 ble DSTRM_LT_L4x2_SAVE DSTRM_LT_L4x2_LOOP: KERNEL_2x4 addic. L, L, -1 bgt DSTRM_LT_L4x2_LOOP DSTRM_LT_L4x2_SAVE: SOLVE_LT_2x4 addi CO, CO, 2*SIZE sub T3, K, KK sub T4, K, KK slwi T3, T3, 1+BASE_SHIFT slwi T4, T4, 2+BASE_SHIFT add AO, AO, T3 add BO, BO, T4 addi KK, KK, 2 DSTRM_LT_L4x2_END: DSTRM_LT_L4x1_BEGIN: andi. T1, M, 1 ble DSTRM_LT_L4x1_END mr BO, B DSTRM_LT_L4x1_LOOP_START: INIT_1x4 addic. L, KK, 0 ble DSTRM_LT_L4x1_SAVE DSTRM_LT_L4x1_LOOP: KERNEL_1x4 addic. L, L, -1 bgt DSTRM_LT_L4x1_LOOP DSTRM_LT_L4x1_SAVE: SOLVE_LT_1x4 addi CO, CO, 1*SIZE sub T3, K, KK sub T4, K, KK slwi T3, T3, 0+BASE_SHIFT slwi T4, T4, 2+BASE_SHIFT add AO, AO, T3 add BO, BO, T4 addi KK, KK, 1 DSTRM_LT_L4x1_END: slwi T1, K, 2+BASE_SHIFT add B, B, T1 addic. J, J, -1 bgt DSTRM_LT_L4_BEGIN andi. T2, N, 3 ble L999 DSTRM_LT_L4_END: b DSTRM_LT_L2_BEGIN L999_H1: b L999 DSTRM_LT_L2_BEGIN: andi. T1, N, 2 ble DSTRM_LT_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 mr KK, OFFSET srawi. I, M, 4 ble DSTRM_LT_L2x16_END DSTRM_LT_L2x16_BEGIN: mr BO, B DSTRM_LT_L2x16_LOOP_START: INIT_16x2 addic. L, KK, 0 ble DSTRM_LT_L2x16_SAVE DSTRM_LT_L2x16_LOOP: KERNEL_16x2 addic. L, L, -1 bgt DSTRM_LT_L2x16_LOOP DSTRM_LT_L2x16_SAVE: SOLVE_LT_16x2 addi CO, CO, 16*SIZE sub T3, K, KK sub T4, K, KK slwi T3, T3, 4+BASE_SHIFT slwi T4, T4, 1+BASE_SHIFT add AO, AO, T3 add BO, BO, T4 addi KK, KK, 16 addic. I, I, -1 bgt DSTRM_LT_L2x16_BEGIN DSTRM_LT_L2x16_END: DSTRM_LT_L2x8_BEGIN: andi. T2, M, 15 ble DSTRM_LT_L2x1_END andi. T1, M, 8 ble DSTRM_LT_L2x8_END mr BO, B DSTRM_LT_L2x8_LOOP_START: INIT_8x2 addic. L, KK, 0 ble DSTRM_LT_L2x8_SAVE DSTRM_LT_L2x8_LOOP: KERNEL_8x2 addic. L, L, -1 bgt DSTRM_LT_L2x8_LOOP DSTRM_LT_L2x8_SAVE: SOLVE_LT_8x2 addi CO, CO, 8*SIZE sub T3, K, KK sub T4, K, KK slwi T3, T3, 3+BASE_SHIFT slwi T4, T4, 1+BASE_SHIFT add AO, AO, T3 add BO, BO, T4 addi KK, KK, 8 DSTRM_LT_L2x8_END: DSTRM_LT_L2x4_BEGIN: andi. T1, M, 4 ble DSTRM_LT_L2x4_END mr BO, B DSTRM_LT_L2x4_LOOP_START: INIT_4x2 addic. L, KK, 0 ble DSTRM_LT_L2x4_SAVE DSTRM_LT_L2x4_LOOP: KERNEL_4x2 addic. L, L, -1 bgt DSTRM_LT_L2x4_LOOP DSTRM_LT_L2x4_SAVE: SOLVE_LT_4x2 addi CO, CO, 4*SIZE sub T3, K, KK sub T4, K, KK slwi T3, T3, 2+BASE_SHIFT slwi T4, T4, 1+BASE_SHIFT add AO, AO, T3 add BO, BO, T4 addi KK, KK, 4 DSTRM_LT_L2x4_END: DSTRM_LT_L2x2_BEGIN: andi. T1, M, 2 ble DSTRM_LT_L2x2_END mr BO, B DSTRM_LT_L2x2_LOOP_START: INIT_2x2 addic. L, KK, 0 ble DSTRM_LT_L2x2_SAVE DSTRM_LT_L2x2_LOOP: KERNEL_2x2 addic. L, L, -1 bgt DSTRM_LT_L2x2_LOOP DSTRM_LT_L2x2_SAVE: SOLVE_LT_2x2 addi CO, CO, 2*SIZE sub T3, K, KK sub T4, K, KK slwi T3, T3, 1+BASE_SHIFT slwi T4, T4, 1+BASE_SHIFT add AO, AO, T3 add BO, BO, T4 addi KK, KK, 2 DSTRM_LT_L2x2_END: DSTRM_LT_L2x1_BEGIN: andi. T1, M, 1 ble DSTRM_LT_L2x1_END mr BO, B DSTRM_LT_L2x1_LOOP_START: INIT_1x2 addic. L, KK, 0 ble DSTRM_LT_L2x1_SAVE DSTRM_LT_L2x1_LOOP: KERNEL_1x2 addic. 
L, L, -1 bgt DSTRM_LT_L2x1_LOOP DSTRM_LT_L2x1_SAVE: SOLVE_LT_1x2 addi CO, CO, 1*SIZE sub T3, K, KK sub T4, K, KK slwi T3, T3, 0+BASE_SHIFT slwi T4, T4, 1+BASE_SHIFT add AO, AO, T3 add BO, BO, T4 addi KK, KK, 1 DSTRM_LT_L2x1_END: slwi T1, K, 1+BASE_SHIFT add B, B, T1 DSTRM_LT_L2_END: DSTRM_LT_L1_BEGIN: andi. T1, N, 1 ble DSTRM_LT_L1_END mr CO, C mr AO, A mr KK, OFFSET srawi. I, M, 4 ble DSTRM_LT_L1x16_END DSTRM_LT_L1x16_BEGIN: mr BO, B DSTRM_LT_L1x16_LOOP_START: INIT_16x1 addic. L, KK, 0 ble DSTRM_LT_L1x16_SAVE DSTRM_LT_L1x16_LOOP: KERNEL_16x1 addic. L, L, -1 bgt DSTRM_LT_L1x16_LOOP DSTRM_LT_L1x16_SAVE: SOLVE_LT_16x1 addi CO, CO, 16*SIZE sub T3, K, KK sub T4, K, KK slwi T3, T3, 4+BASE_SHIFT slwi T4, T4, 0+BASE_SHIFT add AO, AO, T3 add BO, BO, T4 addi KK, KK, 16 addic. I, I, -1 bgt DSTRM_LT_L1x16_BEGIN DSTRM_LT_L1x16_END: DSTRM_LT_L1x8_BEGIN: andi. T1, M, 8 ble DSTRM_LT_L1x8_END mr BO, B DSTRM_LT_L1x8_LOOP_START: INIT_8x1 addic. L, KK, 0 ble DSTRM_LT_L1x8_SAVE DSTRM_LT_L1x8_LOOP: KERNEL_8x1 addic. L, L, -1 bgt DSTRM_LT_L1x8_LOOP DSTRM_LT_L1x8_SAVE: SOLVE_LT_8x1 addi CO, CO, 8*SIZE sub T3, K, KK sub T4, K, KK slwi T3, T3, 3+BASE_SHIFT slwi T4, T4, 0+BASE_SHIFT add AO, AO, T3 add BO, BO, T4 addi KK, KK, 8 DSTRM_LT_L1x8_END: DSTRM_LT_L1x4_BEGIN: andi. T1, M, 4 ble DSTRM_LT_L1x4_END mr BO, B DSTRM_LT_L1x4_LOOP_START: INIT_4x1 addic. L, KK, 0 ble DSTRM_LT_L1x4_SAVE DSTRM_LT_L1x4_LOOP: KERNEL_4x1 addic. L, L, -1 bgt DSTRM_LT_L1x4_LOOP DSTRM_LT_L1x4_SAVE: SOLVE_LT_4x1 addi CO, CO, 4*SIZE sub T3, K, KK sub T4, K, KK slwi T3, T3, 2+BASE_SHIFT slwi T4, T4, 0+BASE_SHIFT add AO, AO, T3 add BO, BO, T4 addi KK, KK, 4 DSTRM_LT_L1x4_END: DSTRM_LT_L1x2_BEGIN: andi. T1, M, 2 ble DSTRM_LT_L1x2_END mr BO, B DSTRM_LT_L1x2_LOOP_START: INIT_2x1 addic. L, KK, 0 ble DSTRM_LT_L1x2_SAVE DSTRM_LT_L1x2_LOOP: KERNEL_2x1 addic. L, L, -1 bgt DSTRM_LT_L1x2_LOOP DSTRM_LT_L1x2_SAVE: SOLVE_LT_2x1 addi CO, CO, 2*SIZE sub T3, K, KK sub T4, K, KK slwi T3, T3, 1+BASE_SHIFT slwi T4, T4, 0+BASE_SHIFT add AO, AO, T3 add BO, BO, T4 addi KK, KK, 2 DSTRM_LT_L1x2_END: DSTRM_LT_L1x1_BEGIN: andi. T1, M, 1 ble DSTRM_LT_L1x1_END mr BO, B DSTRM_LT_L1x1_LOOP_START: INIT_1x1 addic. L, KK, 0 ble DSTRM_LT_L1x1_SAVE DSTRM_LT_L1x1_LOOP: KERNEL_1x1 addic. 
L, L, -1 bgt DSTRM_LT_L1x1_LOOP DSTRM_LT_L1x1_SAVE: SOLVE_LT_1x1 addi CO, CO, 1*SIZE sub T3, K, KK sub T4, K, KK slwi T3, T3, 0+BASE_SHIFT slwi T4, T4, 0+BASE_SHIFT add AO, AO, T3 add BO, BO, T4 addi KK, KK, 1 DSTRM_LT_L1x1_END: DSTRM_LT_L1_END: OpenBLAS-0.2.20/kernel/power/dtrsm_macros_LT_16x4_power8.S000066400000000000000000002356761313527062700231110ustar00rootroot00000000000000 .macro INIT_16x4 xxlxor vs0, vs0, vs0 xvmovdp vs32, vs0 xvmovdp vs33, vs0 xvmovdp vs34, vs0 xvmovdp vs35, vs0 xvmovdp vs36, vs0 xvmovdp vs37, vs0 xvmovdp vs38, vs0 xvmovdp vs39, vs0 xvmovdp vs40, vs0 xvmovdp vs41, vs0 xvmovdp vs42, vs0 xvmovdp vs43, vs0 xvmovdp vs44, vs0 xvmovdp vs45, vs0 xvmovdp vs46, vs0 xvmovdp vs47, vs0 xvmovdp vs48, vs0 xvmovdp vs49, vs0 xvmovdp vs50, vs0 xvmovdp vs51, vs0 xvmovdp vs52, vs0 xvmovdp vs53, vs0 xvmovdp vs54, vs0 xvmovdp vs55, vs0 xvmovdp vs56, vs0 xvmovdp vs57, vs0 xvmovdp vs58, vs0 xvmovdp vs59, vs0 xvmovdp vs60, vs0 xvmovdp vs61, vs0 xvmovdp vs62, vs0 xvmovdp vs63, vs0 .endm .macro KERNEL_16x4 lxvd2x vs0, o0, AO lxvdsx vs16, o0, BO lxvdsx vs17, o8, BO lxvdsx vs18, o16, BO lxvdsx vs19, o24, BO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO addi BO, BO, 32 addi AO, AO, 64 lxvd2x vs4, o0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 xvmaddadp vs34, vs0, vs18 xvmaddadp vs35, vs0, vs19 xvmaddadp vs36, vs1, vs16 xvmaddadp vs37, vs1, vs17 xvmaddadp vs38, vs1, vs18 xvmaddadp vs39, vs1, vs19 xvmaddadp vs40, vs2, vs16 xvmaddadp vs41, vs2, vs17 xvmaddadp vs42, vs2, vs18 xvmaddadp vs43, vs2, vs19 xvmaddadp vs44, vs3, vs16 xvmaddadp vs45, vs3, vs17 xvmaddadp vs46, vs3, vs18 xvmaddadp vs47, vs3, vs19 xvmaddadp vs48, vs4, vs16 xvmaddadp vs49, vs4, vs17 xvmaddadp vs50, vs4, vs18 xvmaddadp vs51, vs4, vs19 xvmaddadp vs52, vs5, vs16 xvmaddadp vs53, vs5, vs17 xvmaddadp vs54, vs5, vs18 xvmaddadp vs55, vs5, vs19 xvmaddadp vs56, vs6, vs16 xvmaddadp vs57, vs6, vs17 xvmaddadp vs58, vs6, vs18 xvmaddadp vs59, vs6, vs19 xvmaddadp vs60, vs7, vs16 xvmaddadp vs61, vs7, vs17 xvmaddadp vs62, vs7, vs18 xvmaddadp vs63, vs7, vs19 .endm .macro INIT_8x4 xxlxor vs0, vs0, vs0 xvmovdp vs32, vs0 xvmovdp vs33, vs0 xvmovdp vs34, vs0 xvmovdp vs35, vs0 xvmovdp vs36, vs0 xvmovdp vs37, vs0 xvmovdp vs38, vs0 xvmovdp vs39, vs0 xvmovdp vs40, vs0 xvmovdp vs41, vs0 xvmovdp vs42, vs0 xvmovdp vs43, vs0 xvmovdp vs44, vs0 xvmovdp vs45, vs0 xvmovdp vs46, vs0 xvmovdp vs47, vs0 .endm .macro KERNEL_8x4 lxvd2x vs0, o0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO addi AO, AO, 64 lxvdsx vs16, o0, BO lxvdsx vs17, o8, BO lxvdsx vs18, o16, BO lxvdsx vs19, o24, BO addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 xvmaddadp vs34, vs0, vs18 xvmaddadp vs35, vs0, vs19 xvmaddadp vs36, vs1, vs16 xvmaddadp vs37, vs1, vs17 xvmaddadp vs38, vs1, vs18 xvmaddadp vs39, vs1, vs19 xvmaddadp vs40, vs2, vs16 xvmaddadp vs41, vs2, vs17 xvmaddadp vs42, vs2, vs18 xvmaddadp vs43, vs2, vs19 xvmaddadp vs44, vs3, vs16 xvmaddadp vs45, vs3, vs17 xvmaddadp vs46, vs3, vs18 xvmaddadp vs47, vs3, vs19 .endm .macro INIT_4x4 xxlxor vs0, vs0, vs0 xvmovdp vs32, vs0 xvmovdp vs33, vs0 xvmovdp vs34, vs0 xvmovdp vs35, vs0 xvmovdp vs36, vs0 xvmovdp vs37, vs0 xvmovdp vs38, vs0 xvmovdp vs39, vs0 .endm .macro KERNEL_4x4 lxvd2x vs0, o0, AO lxvd2x vs1, o16, AO addi AO, AO, 32 lxvdsx vs16, o0, BO lxvdsx vs17, o8, BO lxvdsx vs18, o16, BO lxvdsx vs19, o24, BO addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 xvmaddadp vs34, vs0, vs18 
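// The KERNEL_MxN macros in this file only accumulate multiply-add partial
// sums for the current block; the matching SOLVE_LT_MxN macro then subtracts
// those sums from the packed B tile (xvsubdp), scales each pivot row by the
// packed diagonal entry (xvmuldp), and eliminates the rows below it
// (xvnmsubadp) before storing the results back to B and C.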
xvmaddadp vs35, vs0, vs19 xvmaddadp vs36, vs1, vs16 xvmaddadp vs37, vs1, vs17 xvmaddadp vs38, vs1, vs18 xvmaddadp vs39, vs1, vs19 .endm .macro INIT_2x4 xxlxor vs0, vs0, vs0 xvmovdp vs32, vs0 xvmovdp vs33, vs0 xvmovdp vs34, vs0 xvmovdp vs35, vs0 .endm .macro KERNEL_2x4 lxvd2x vs0, o0, AO addi AO, AO, 16 lxvdsx vs16, o0, BO lxvdsx vs17, o8, BO lxvdsx vs18, o16, BO lxvdsx vs19, o24, BO addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 xvmaddadp vs34, vs0, vs18 xvmaddadp vs35, vs0, vs19 .endm .macro INIT_1x4 xxlxor vs0, vs0, vs0 xvmovdp vs32, vs0 xvmovdp vs33, vs0 xvmovdp vs34, vs0 xvmovdp vs35, vs0 .endm .macro KERNEL_1x4 lxvdsx vs0, o0, AO addi AO, AO, 8 lxvdsx vs16, o0, BO lxvdsx vs17, o8, BO lxvdsx vs18, o16, BO lxvdsx vs19, o24, BO addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 xvmaddadp vs34, vs0, vs18 xvmaddadp vs35, vs0, vs19 .endm /*########################################################################################## SOLVE_LT 16x4 ##########################################################################################*/ .macro SOLVE_LT_16x4 //############### LOAD B ####################### mr T1, BO mr T4, BO xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs34, vs35, 0 xxpermdi vs2, vs32, vs33, 3 xxpermdi vs3, vs34, vs35, 3 lxvd2x vs32, o0, T1 lxvd2x vs33, o16, T1 lxvd2x vs34, o32, T1 lxvd2x vs35, o48, T1 addi T1, T1, 64 xxpermdi vs4, vs36, vs37, 0 xxpermdi vs5, vs38, vs39, 0 xxpermdi vs6, vs36, vs37, 3 xxpermdi vs7, vs38, vs39, 3 lxvd2x vs36, o0, T1 lxvd2x vs37, o16, T1 lxvd2x vs38, o32, T1 lxvd2x vs39, o48, T1 addi T1, T1, 64 xxpermdi vs8, vs40, vs41, 0 xxpermdi vs9, vs42, vs43, 0 xxpermdi vs10, vs40, vs41, 3 xxpermdi vs11, vs42, vs43, 3 lxvd2x vs40, o0, T1 lxvd2x vs41, o16, T1 lxvd2x vs42, o32, T1 lxvd2x vs43, o48, T1 addi T1, T1, 64 xxpermdi vs12, vs44, vs45, 0 xxpermdi vs13, vs46, vs47, 0 xxpermdi vs14, vs44, vs45, 3 xxpermdi vs15, vs46, vs47, 3 lxvd2x vs44, o0, T1 lxvd2x vs45, o16, T1 lxvd2x vs46, o32, T1 lxvd2x vs47, o48, T1 addi T1, T1, 64 xxpermdi vs16, vs48, vs49, 0 xxpermdi vs17, vs50, vs51, 0 xxpermdi vs18, vs48, vs49, 3 xxpermdi vs19, vs50, vs51, 3 lxvd2x vs48, o0, T1 lxvd2x vs49, o16, T1 lxvd2x vs50, o32, T1 lxvd2x vs51, o48, T1 addi T1, T1, 64 xxpermdi vs20, vs52, vs53, 0 xxpermdi vs21, vs54, vs55, 0 xxpermdi vs22, vs52, vs53, 3 xxpermdi vs23, vs54, vs55, 3 lxvd2x vs52, o0, T1 lxvd2x vs53, o16, T1 lxvd2x vs54, o32, T1 lxvd2x vs55, o48, T1 addi T1, T1, 64 xxpermdi vs24, vs56, vs57, 0 xxpermdi vs25, vs58, vs59, 0 xxpermdi vs26, vs56, vs57, 3 xxpermdi vs27, vs58, vs59, 3 lxvd2x vs56, o0, T1 lxvd2x vs57, o16, T1 lxvd2x vs58, o32, T1 lxvd2x vs59, o48, T1 addi T1, T1, 64 xxpermdi vs28, vs60, vs61, 0 xxpermdi vs29, vs62, vs63, 0 xxpermdi vs30, vs60, vs61, 3 xxpermdi vs31, vs62, vs63, 3 lxvd2x vs60, o0, T1 lxvd2x vs61, o16, T1 lxvd2x vs62, o32, T1 lxvd2x vs63, o48, T1 //############### OFFSET 0 ####################### dcbt AO, PRE mr T1, AO xvsubdp vs32, vs32, vs0 xvsubdp vs33, vs33, vs1 xvsubdp vs34, vs34, vs2 xvsubdp vs35, vs35, vs3 lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 xvsubdp vs36, vs36, vs4 xvsubdp vs37, vs37, vs5 xvsubdp vs38, vs38, vs6 xvsubdp vs39, vs39, vs7 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 lxvdsx vs7, o24, T1 addi T1, T1, 32 xvsubdp vs40, vs40, vs8 xvsubdp vs41, vs41, vs9 xvsubdp vs42, vs42, vs10 xvsubdp vs43, vs43, vs11 lxvdsx vs8, o0, T1 lxvdsx vs9, o8, T1 lxvdsx vs10, o16, T1 lxvdsx vs11, o24, T1 addi T1, T1, 32 xvsubdp vs44, vs44, vs12 xvsubdp 
vs45, vs45, vs13 xvsubdp vs46, vs46, vs14 xvsubdp vs47, vs47, vs15 lxvdsx vs12, o0, T1 lxvdsx vs13, o8, T1 lxvdsx vs14, o16, T1 lxvdsx vs15, o24, T1 addi T1, T1, 32 xvsubdp vs48, vs48, vs16 xvsubdp vs49, vs49, vs17 xvsubdp vs50, vs50, vs18 xvsubdp vs51, vs51, vs19 xvsubdp vs52, vs52, vs20 xvsubdp vs53, vs53, vs21 xvsubdp vs54, vs54, vs22 xvsubdp vs55, vs55, vs23 xvsubdp vs56, vs56, vs24 xvsubdp vs57, vs57, vs25 xvsubdp vs58, vs58, vs26 xvsubdp vs59, vs59, vs27 xvsubdp vs60, vs60, vs28 xvsubdp vs61, vs61, vs29 xvsubdp vs62, vs62, vs30 xvsubdp vs63, vs63, vs31 //############### OFFSET 1 ####################### addi T1, T1, 1*SIZE xvmuldp vs32, vs32, vs0 xvmuldp vs33, vs33, vs0 xvnmsubadp vs34, vs32, vs1 xvnmsubadp vs35, vs33, vs1 xvnmsubadp vs36, vs32, vs2 dcbt T1, PRE xvnmsubadp vs37, vs33, vs2 xvnmsubadp vs38, vs32, vs3 xvnmsubadp vs39, vs33, vs3 lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 xvnmsubadp vs40, vs32, vs4 xvnmsubadp vs41, vs33, vs4 xvnmsubadp vs42, vs32, vs5 xvnmsubadp vs43, vs33, vs5 xvnmsubadp vs44, vs32, vs6 xvnmsubadp vs45, vs33, vs6 xvnmsubadp vs46, vs32, vs7 xvnmsubadp vs47, vs33, vs7 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 lxvdsx vs7, o24, T1 addi T1, T1, 32 xvnmsubadp vs48, vs32, vs8 xvnmsubadp vs49, vs33, vs8 xvnmsubadp vs50, vs32, vs9 xvnmsubadp vs51, vs33, vs9 xvnmsubadp vs52, vs32, vs10 xvnmsubadp vs53, vs33, vs10 xvnmsubadp vs54, vs32, vs11 xvnmsubadp vs55, vs33, vs11 lxvdsx vs8, o0, T1 lxvdsx vs9, o8, T1 lxvdsx vs10, o16, T1 lxvdsx vs11, o24, T1 addi T1, T1, 32 xvnmsubadp vs56, vs32, vs12 xvnmsubadp vs57, vs33, vs12 xvnmsubadp vs58, vs32, vs13 xvnmsubadp vs59, vs33, vs13 xvnmsubadp vs60, vs32, vs14 xvnmsubadp vs61, vs33, vs14 xvnmsubadp vs62, vs32, vs15 xvnmsubadp vs63, vs33, vs15 lxvdsx vs12, o0, T1 lxvdsx vs13, o8, T1 lxvdsx vs14, o16, T1 addi T1, T1, 24 //############### OFFSET 2 ####################### xvmuldp vs34, vs34, vs0 xvmuldp vs35, vs35, vs0 addi T1, T1, 2*SIZE xvnmsubadp vs36, vs34, vs1 xvnmsubadp vs37, vs35, vs1 xvnmsubadp vs38, vs34, vs2 dcbt T1, PRE xvnmsubadp vs39, vs35, vs2 xvnmsubadp vs40, vs34, vs3 xvnmsubadp vs41, vs35, vs3 lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 xvnmsubadp vs42, vs34, vs4 xvnmsubadp vs43, vs35, vs4 xvnmsubadp vs44, vs34, vs5 xvnmsubadp vs45, vs35, vs5 xvnmsubadp vs46, vs34, vs6 xvnmsubadp vs47, vs35, vs6 xvnmsubadp vs48, vs34, vs7 xvnmsubadp vs49, vs35, vs7 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 lxvdsx vs7, o24, T1 addi T1, T1, 32 xvnmsubadp vs50, vs34, vs8 xvnmsubadp vs51, vs35, vs8 xvnmsubadp vs52, vs34, vs9 xvnmsubadp vs53, vs35, vs9 xvnmsubadp vs54, vs34, vs10 xvnmsubadp vs55, vs35, vs10 xvnmsubadp vs56, vs34, vs11 xvnmsubadp vs57, vs35, vs11 lxvdsx vs8, o0, T1 lxvdsx vs9, o8, T1 lxvdsx vs10, o16, T1 lxvdsx vs11, o24, T1 addi T1, T1, 32 xvnmsubadp vs58, vs34, vs12 xvnmsubadp vs59, vs35, vs12 xvnmsubadp vs60, vs34, vs13 xvnmsubadp vs61, vs35, vs13 xvnmsubadp vs62, vs34, vs14 xvnmsubadp vs63, vs35, vs14 lxvdsx vs12, o0, T1 lxvdsx vs13, o8, T1 addi T1, T1, 16 //############### OFFSET 3 ####################### xvmuldp vs36, vs36, vs0 xvmuldp vs37, vs37, vs0 addi T1, T1, 3*SIZE xvnmsubadp vs38, vs36, vs1 xvnmsubadp vs39, vs37, vs1 xvnmsubadp vs40, vs36, vs2 dcbt T1, PRE xvnmsubadp vs41, vs37, vs2 xvnmsubadp vs42, vs36, vs3 xvnmsubadp vs43, vs37, vs3 lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 xvnmsubadp vs44, vs36, vs4 xvnmsubadp vs45, 
vs37, vs4 xvnmsubadp vs46, vs36, vs5 xvnmsubadp vs47, vs37, vs5 xvnmsubadp vs48, vs36, vs6 xvnmsubadp vs49, vs37, vs6 xvnmsubadp vs50, vs36, vs7 xvnmsubadp vs51, vs37, vs7 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 lxvdsx vs7, o24, T1 addi T1, T1, 32 xvnmsubadp vs52, vs36, vs8 xvnmsubadp vs53, vs37, vs8 xvnmsubadp vs54, vs36, vs9 xvnmsubadp vs55, vs37, vs9 xvnmsubadp vs56, vs36, vs10 xvnmsubadp vs57, vs37, vs10 xvnmsubadp vs58, vs36, vs11 xvnmsubadp vs59, vs37, vs11 lxvdsx vs8, o0, T1 lxvdsx vs9, o8, T1 lxvdsx vs10, o16, T1 lxvdsx vs11, o24, T1 addi T1, T1, 32 xvnmsubadp vs60, vs36, vs12 xvnmsubadp vs61, vs37, vs12 xvnmsubadp vs62, vs36, vs13 xvnmsubadp vs63, vs37, vs13 lxvdsx vs12, o0, T1 stxvd2x vs32, o0, T4 stxvd2x vs33, o16, T4 stxvd2x vs34, o32, T4 stxvd2x vs35, o48, T4 addi T4, T4, 64 addi T1, T1, 8 //############### OFFSET 4 ####################### xvmuldp vs38, vs38, vs0 xvmuldp vs39, vs39, vs0 addi T1, T1, 4*SIZE xvnmsubadp vs40, vs38, vs1 xvnmsubadp vs41, vs39, vs1 xvnmsubadp vs42, vs38, vs2 dcbt T1, PRE xvnmsubadp vs43, vs39, vs2 xvnmsubadp vs44, vs38, vs3 xvnmsubadp vs45, vs39, vs3 lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 xvnmsubadp vs46, vs38, vs4 xvnmsubadp vs47, vs39, vs4 xvnmsubadp vs48, vs38, vs5 xvnmsubadp vs49, vs39, vs5 xvnmsubadp vs50, vs38, vs6 xvnmsubadp vs51, vs39, vs6 xvnmsubadp vs52, vs38, vs7 xvnmsubadp vs53, vs39, vs7 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 lxvdsx vs7, o24, T1 addi T1, T1, 32 xvnmsubadp vs54, vs38, vs8 xvnmsubadp vs55, vs39, vs8 xvnmsubadp vs56, vs38, vs9 xvnmsubadp vs57, vs39, vs9 xvnmsubadp vs58, vs38, vs10 xvnmsubadp vs59, vs39, vs10 xvnmsubadp vs60, vs38, vs11 xvnmsubadp vs61, vs39, vs11 lxvdsx vs8, o0, T1 lxvdsx vs9, o8, T1 lxvdsx vs10, o16, T1 lxvdsx vs11, o24, T1 addi T1, T1, 32 xvnmsubadp vs62, vs38, vs12 xvnmsubadp vs63, vs39, vs12 //############### OFFSET 5 ####################### xvmuldp vs40, vs40, vs0 xvmuldp vs41, vs41, vs0 addi T1, T1, 5*SIZE xvnmsubadp vs42, vs40, vs1 xvnmsubadp vs43, vs41, vs1 xvnmsubadp vs44, vs40, vs2 dcbt T1, PRE xvnmsubadp vs45, vs41, vs2 xvnmsubadp vs46, vs40, vs3 xvnmsubadp vs47, vs41, vs3 lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 xvnmsubadp vs48, vs40, vs4 xvnmsubadp vs49, vs41, vs4 xvnmsubadp vs50, vs40, vs5 xvnmsubadp vs51, vs41, vs5 xvnmsubadp vs52, vs40, vs6 xvnmsubadp vs53, vs41, vs6 xvnmsubadp vs54, vs40, vs7 xvnmsubadp vs55, vs41, vs7 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 lxvdsx vs7, o24, T1 addi T1, T1, 32 xvnmsubadp vs56, vs40, vs8 xvnmsubadp vs57, vs41, vs8 xvnmsubadp vs58, vs40, vs9 xvnmsubadp vs59, vs41, vs9 xvnmsubadp vs60, vs40, vs10 xvnmsubadp vs61, vs41, vs10 xvnmsubadp vs62, vs40, vs11 xvnmsubadp vs63, vs41, vs11 lxvdsx vs8, o0, T1 lxvdsx vs9, o8, T1 lxvdsx vs10, o16, T1 addi T1, T1, 24 //############### OFFSET 6 ####################### xvmuldp vs42, vs42, vs0 xvmuldp vs43, vs43, vs0 addi T1, T1, 6*SIZE xvnmsubadp vs44, vs42, vs1 xvnmsubadp vs45, vs43, vs1 xvnmsubadp vs46, vs42, vs2 dcbt T1, PRE xvnmsubadp vs47, vs43, vs2 xvnmsubadp vs48, vs42, vs3 xvnmsubadp vs49, vs43, vs3 lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 xvnmsubadp vs50, vs42, vs4 xvnmsubadp vs51, vs43, vs4 xvnmsubadp vs52, vs42, vs5 xvnmsubadp vs53, vs43, vs5 xvnmsubadp vs54, vs42, vs6 xvnmsubadp vs55, vs43, vs6 xvnmsubadp vs56, vs42, vs7 xvnmsubadp vs57, vs43, vs7 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, 
T1 lxvdsx vs7, o24, T1 addi T1, T1, 32 xvnmsubadp vs58, vs42, vs8 xvnmsubadp vs59, vs43, vs8 xvnmsubadp vs60, vs42, vs9 xvnmsubadp vs61, vs43, vs9 xvnmsubadp vs62, vs42, vs10 xvnmsubadp vs63, vs43, vs10 lxvdsx vs8, o0, T1 lxvdsx vs9, o8, T1 addi T1, T1, 16 stxvd2x vs36, o0, T4 stxvd2x vs37, o16, T4 stxvd2x vs38, o32, T4 stxvd2x vs39, o48, T4 addi T4, T4, 64 //############### OFFSET 7 ####################### xvmuldp vs44, vs44, vs0 xvmuldp vs45, vs45, vs0 addi T1, T1, 7*SIZE xvnmsubadp vs46, vs44, vs1 xvnmsubadp vs47, vs45, vs1 xvnmsubadp vs48, vs44, vs2 dcbt T1, PRE xvnmsubadp vs49, vs45, vs2 xvnmsubadp vs50, vs44, vs3 xvnmsubadp vs51, vs45, vs3 lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 xvnmsubadp vs52, vs44, vs4 xvnmsubadp vs53, vs45, vs4 xvnmsubadp vs54, vs44, vs5 xvnmsubadp vs55, vs45, vs5 xvnmsubadp vs56, vs44, vs6 xvnmsubadp vs57, vs45, vs6 xvnmsubadp vs58, vs44, vs7 xvnmsubadp vs59, vs45, vs7 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 lxvdsx vs7, o24, T1 addi T1, T1, 32 xvnmsubadp vs60, vs44, vs8 xvnmsubadp vs61, vs45, vs8 xvnmsubadp vs62, vs44, vs9 xvnmsubadp vs63, vs45, vs9 lxvdsx vs8, o0, T1 addi T1, T1, 8 //############### OFFSET 8 ####################### xvmuldp vs46, vs46, vs0 xvmuldp vs47, vs47, vs0 addi T1, T1, 8*SIZE xvnmsubadp vs48, vs46, vs1 xvnmsubadp vs49, vs47, vs1 xvnmsubadp vs50, vs46, vs2 dcbt T1, PRE xvnmsubadp vs51, vs47, vs2 xvnmsubadp vs52, vs46, vs3 xvnmsubadp vs53, vs47, vs3 lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 xvnmsubadp vs54, vs46, vs4 xvnmsubadp vs55, vs47, vs4 xvnmsubadp vs56, vs46, vs5 xvnmsubadp vs57, vs47, vs5 xvnmsubadp vs58, vs46, vs6 xvnmsubadp vs59, vs47, vs6 xvnmsubadp vs60, vs46, vs7 xvnmsubadp vs61, vs47, vs7 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 lxvdsx vs7, o24, T1 addi T1, T1, 32 stxvd2x vs40, o0, T4 stxvd2x vs41, o16, T4 stxvd2x vs42, o32, T4 stxvd2x vs43, o48, T4 addi T4, T4, 64 xvnmsubadp vs62, vs46, vs8 xvnmsubadp vs63, vs47, vs8 //############### OFFSET 9 ####################### xvmuldp vs48, vs48, vs0 xvmuldp vs49, vs49, vs0 addi T1, T1, 9*SIZE xvnmsubadp vs50, vs48, vs1 xvnmsubadp vs51, vs49, vs1 xvnmsubadp vs52, vs48, vs2 dcbt T1, PRE xvnmsubadp vs53, vs49, vs2 xvnmsubadp vs54, vs48, vs3 xvnmsubadp vs55, vs49, vs3 lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 xvnmsubadp vs56, vs48, vs4 xvnmsubadp vs57, vs49, vs4 xvnmsubadp vs58, vs48, vs5 xvnmsubadp vs59, vs49, vs5 xvnmsubadp vs60, vs48, vs6 xvnmsubadp vs61, vs49, vs6 xvnmsubadp vs62, vs48, vs7 xvnmsubadp vs63, vs49, vs7 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 addi T1, T1, 24 //############### OFFSET 10 ####################### xvmuldp vs50, vs50, vs0 xvmuldp vs51, vs51, vs0 addi T1, T1, 10*SIZE xvnmsubadp vs52, vs50, vs1 xvnmsubadp vs53, vs51, vs1 xvnmsubadp vs54, vs50, vs2 dcbt T1, PRE xvnmsubadp vs55, vs51, vs2 xvnmsubadp vs56, vs50, vs3 xvnmsubadp vs57, vs51, vs3 lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 xvnmsubadp vs58, vs50, vs4 xvnmsubadp vs59, vs51, vs4 xvnmsubadp vs60, vs50, vs5 xvnmsubadp vs61, vs51, vs5 xvnmsubadp vs62, vs50, vs6 xvnmsubadp vs63, vs51, vs6 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 addi T1, T1, 16 stxvd2x vs44, o0, T4 stxvd2x vs45, o16, T4 stxvd2x vs46, o32, T4 stxvd2x vs47, o48, T4 addi T4, T4, 64 //############### OFFSET 11 ####################### xvmuldp vs52, vs52, vs0 xvmuldp vs53, vs53, vs0 addi T1, T1, 
11*SIZE xvnmsubadp vs54, vs52, vs1 xvnmsubadp vs55, vs53, vs1 xvnmsubadp vs56, vs52, vs2 dcbt T1, PRE xvnmsubadp vs57, vs53, vs2 xvnmsubadp vs58, vs52, vs3 xvnmsubadp vs59, vs53, vs3 lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 xvnmsubadp vs60, vs52, vs4 xvnmsubadp vs61, vs53, vs4 xvnmsubadp vs62, vs52, vs5 xvnmsubadp vs63, vs53, vs5 lxvdsx vs4, o0, T1 addi T1, T1, 8 //############### OFFSET 12 ####################### xvmuldp vs54, vs54, vs0 xvmuldp vs55, vs55, vs0 addi T1, T1, 12*SIZE xvnmsubadp vs56, vs54, vs1 xvnmsubadp vs57, vs55, vs1 xvnmsubadp vs58, vs54, vs2 dcbt T1, PRE xvnmsubadp vs59, vs55, vs2 xvnmsubadp vs60, vs54, vs3 xvnmsubadp vs61, vs55, vs3 lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 stxvd2x vs48, o0, T4 stxvd2x vs49, o16, T4 stxvd2x vs50, o32, T4 stxvd2x vs51, o48, T4 addi T4, T4, 64 xvnmsubadp vs62, vs54, vs4 xvnmsubadp vs63, vs55, vs4 //############### OFFSET 13 ####################### xvmuldp vs56, vs56, vs0 xvmuldp vs57, vs57, vs0 addi T1, T1, 13*SIZE xvnmsubadp vs58, vs56, vs1 xvnmsubadp vs59, vs57, vs1 xvnmsubadp vs60, vs56, vs2 xvnmsubadp vs61, vs57, vs2 xvnmsubadp vs62, vs56, vs3 xvnmsubadp vs63, vs57, vs3 lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 addi T1, T1, 24 //############### OFFSET 14 ####################### xvmuldp vs58, vs58, vs0 xvmuldp vs59, vs59, vs0 addi T1, T1, 14*SIZE xvnmsubadp vs60, vs58, vs1 xvnmsubadp vs61, vs59, vs1 xvnmsubadp vs62, vs58, vs2 xvnmsubadp vs63, vs59, vs2 lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 addi T1, T1, 16 stxvd2x vs52, o0, T4 stxvd2x vs53, o16, T4 stxvd2x vs54, o32, T4 stxvd2x vs55, o48, T4 addi T4, T4, 64 //############### OFFSET 15 ####################### xvmuldp vs60, vs60, vs0 xvmuldp vs61, vs61, vs0 addi T1, T1, 15*SIZE xvnmsubadp vs62, vs60, vs1 xvnmsubadp vs63, vs61, vs1 lxvdsx vs0, o0, T1 addi T1, T1, 8 xvmuldp vs62, vs62, vs0 xvmuldp vs63, vs63, vs0 //############### SAVE B ####################### stxvd2x vs56, o0, T4 stxvd2x vs57, o16, T4 stxvd2x vs58, o32, T4 stxvd2x vs59, o48, T4 addi T4, T4, 64 stxvd2x vs60, o0, T4 stxvd2x vs61, o16, T4 stxvd2x vs62, o32, T4 stxvd2x vs63, o48, T4 //############### SAVE C ####################### mr T1, CO add T2, CO, LDC stxsdx vs32, o0, T1 xxswapd vs32, vs32 stxsdx vs34, o8, T1 xxswapd vs34, vs34 stxsdx vs36, o16, T1 xxswapd vs36, vs36 stxsdx vs38, o24, T1 xxswapd vs38, vs38 addi T1, T1, 32 stxsdx vs40, o0, T1 xxswapd vs40, vs40 stxsdx vs42, o8, T1 xxswapd vs42, vs42 stxsdx vs44, o16, T1 xxswapd vs44, vs44 stxsdx vs46, o24, T1 xxswapd vs46, vs46 addi T1, T1, 32 stxsdx vs48, o0, T1 xxswapd vs48, vs48 stxsdx vs50, o8, T1 xxswapd vs50, vs50 stxsdx vs52, o16, T1 xxswapd vs52, vs52 stxsdx vs54, o24, T1 xxswapd vs54, vs54 addi T1, T1, 32 stxsdx vs56, o0, T1 xxswapd vs56, vs56 stxsdx vs58, o8, T1 xxswapd vs58, vs58 stxsdx vs60, o16, T1 xxswapd vs60, vs60 stxsdx vs62, o24, T1 xxswapd vs62, vs62 stxsdx vs32, o0, T2 stxsdx vs34, o8, T2 stxsdx vs36, o16, T2 stxsdx vs38, o24, T2 addi T2, T2, 32 stxsdx vs40, o0, T2 stxsdx vs42, o8, T2 stxsdx vs44, o16, T2 stxsdx vs46, o24, T2 addi T2, T2, 32 stxsdx vs48, o0, T2 stxsdx vs50, o8, T2 stxsdx vs52, o16, T2 stxsdx vs54, o24, T2 addi T2, T2, 32 stxsdx vs56, o0, T2 stxsdx vs58, o8, T2 stxsdx vs60, o16, T2 stxsdx vs62, o24, T2 mr T1, CO add T2, CO, LDC add T1, T2, LDC add T2, T1, LDC stxsdx vs33, o0, T1 xxswapd vs33, vs33 stxsdx vs35, o8, T1 xxswapd vs35, vs35 stxsdx vs37, o16, T1 xxswapd vs37, vs37 stxsdx vs39, o24, T1 xxswapd vs39, vs39 
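// Same store pattern as columns 1 and 2 above, now for columns 3 and 4
// (T1 = CO + 2*LDC, T2 = CO + 3*LDC): each VSX register packs one result for
// each of the two columns, so the first doubleword is stored to the T1
// column, xxswapd exchanges the halves, and the swapped register is stored
// to the T2 column afterwards.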
addi T1, T1, 32 stxsdx vs41, o0, T1 xxswapd vs41, vs41 stxsdx vs43, o8, T1 xxswapd vs43, vs43 stxsdx vs45, o16, T1 xxswapd vs45, vs45 stxsdx vs47, o24, T1 xxswapd vs47, vs47 addi T1, T1, 32 stxsdx vs49, o0, T1 xxswapd vs49, vs49 stxsdx vs51, o8, T1 xxswapd vs51, vs51 stxsdx vs53, o16, T1 xxswapd vs53, vs53 stxsdx vs55, o24, T1 xxswapd vs55, vs55 addi T1, T1, 32 stxsdx vs57, o0, T1 xxswapd vs57, vs57 stxsdx vs59, o8, T1 xxswapd vs59, vs59 stxsdx vs61, o16, T1 xxswapd vs61, vs61 stxsdx vs63, o24, T1 xxswapd vs63, vs63 stxsdx vs33, o0, T2 stxsdx vs35, o8, T2 stxsdx vs37, o16, T2 stxsdx vs39, o24, T2 addi T2, T2, 32 stxsdx vs41, o0, T2 stxsdx vs43, o8, T2 stxsdx vs45, o16, T2 stxsdx vs47, o24, T2 addi T2, T2, 32 stxsdx vs49, o0, T2 stxsdx vs51, o8, T2 stxsdx vs53, o16, T2 stxsdx vs55, o24, T2 addi T2, T2, 32 stxsdx vs57, o0, T2 stxsdx vs59, o8, T2 stxsdx vs61, o16, T2 stxsdx vs63, o24, T2 .endm /*########################################################################################## SOLVE_LT 8x4 ##########################################################################################*/ .macro SOLVE_LT_8x4 xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs34, vs35, 0 xxpermdi vs2, vs32, vs33, 3 xxpermdi vs3, vs34, vs35, 3 xxpermdi vs4, vs36, vs37, 0 xxpermdi vs5, vs38, vs39, 0 xxpermdi vs6, vs36, vs37, 3 xxpermdi vs7, vs38, vs39, 3 xxpermdi vs8, vs40, vs41, 0 xxpermdi vs9, vs42, vs43, 0 xxpermdi vs10, vs40, vs41, 3 xxpermdi vs11, vs42, vs43, 3 xxpermdi vs12, vs44, vs45, 0 xxpermdi vs13, vs46, vs47, 0 xxpermdi vs14, vs44, vs45, 3 xxpermdi vs15, vs46, vs47, 3 //############### LOAD B ####################### mr T1, BO lxvd2x vs32, o0, T1 lxvd2x vs33, o16, T1 lxvd2x vs34, o32, T1 lxvd2x vs35, o48, T1 addi T1, T1, 64 lxvd2x vs36, o0, T1 lxvd2x vs37, o16, T1 lxvd2x vs38, o32, T1 lxvd2x vs39, o48, T1 addi T1, T1, 64 lxvd2x vs40, o0, T1 lxvd2x vs41, o16, T1 lxvd2x vs42, o32, T1 lxvd2x vs43, o48, T1 addi T1, T1, 64 lxvd2x vs44, o0, T1 lxvd2x vs45, o16, T1 lxvd2x vs46, o32, T1 lxvd2x vs47, o48, T1 xvsubdp vs32, vs32, vs0 xvsubdp vs33, vs33, vs1 xvsubdp vs34, vs34, vs2 xvsubdp vs35, vs35, vs3 xvsubdp vs36, vs36, vs4 xvsubdp vs37, vs37, vs5 xvsubdp vs38, vs38, vs6 xvsubdp vs39, vs39, vs7 xvsubdp vs40, vs40, vs8 xvsubdp vs41, vs41, vs9 xvsubdp vs42, vs42, vs10 xvsubdp vs43, vs43, vs11 xvsubdp vs44, vs44, vs12 xvsubdp vs45, vs45, vs13 xvsubdp vs46, vs46, vs14 xvsubdp vs47, vs47, vs15 mr T1, AO //############### OFFSET 0 ####################### lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 lxvdsx vs7, o24, T1 addi T1, T1, 32 xvmuldp vs32, vs32, vs0 xvmuldp vs33, vs33, vs0 xvnmsubadp vs34, vs32, vs1 xvnmsubadp vs35, vs33, vs1 xvnmsubadp vs36, vs32, vs2 xvnmsubadp vs37, vs33, vs2 xvnmsubadp vs38, vs32, vs3 xvnmsubadp vs39, vs33, vs3 xvnmsubadp vs40, vs32, vs4 xvnmsubadp vs41, vs33, vs4 xvnmsubadp vs42, vs32, vs5 xvnmsubadp vs43, vs33, vs5 xvnmsubadp vs44, vs32, vs6 xvnmsubadp vs45, vs33, vs6 xvnmsubadp vs46, vs32, vs7 xvnmsubadp vs47, vs33, vs7 //############### OFFSET 1 ####################### addi T1, T1, 1*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 addi T1, T1, 24 xvmuldp vs34, vs34, vs0 xvmuldp vs35, vs35, vs0 xvnmsubadp vs36, vs34, vs1 xvnmsubadp vs37, vs35, vs1 xvnmsubadp vs38, vs34, vs2 xvnmsubadp vs39, vs35, vs2 xvnmsubadp vs40, vs34, vs3 xvnmsubadp vs41, vs35, vs3 xvnmsubadp vs42, vs34, 
vs4 xvnmsubadp vs43, vs35, vs4 xvnmsubadp vs44, vs34, vs5 xvnmsubadp vs45, vs35, vs5 xvnmsubadp vs46, vs34, vs6 xvnmsubadp vs47, vs35, vs6 //############### OFFSET 2 ####################### addi T1, T1, 2*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 addi T1, T1, 16 xvmuldp vs36, vs36, vs0 xvmuldp vs37, vs37, vs0 xvnmsubadp vs38, vs36, vs1 xvnmsubadp vs39, vs37, vs1 xvnmsubadp vs40, vs36, vs2 xvnmsubadp vs41, vs37, vs2 xvnmsubadp vs42, vs36, vs3 xvnmsubadp vs43, vs37, vs3 xvnmsubadp vs44, vs36, vs4 xvnmsubadp vs45, vs37, vs4 xvnmsubadp vs46, vs36, vs5 xvnmsubadp vs47, vs37, vs5 //############### OFFSET 3 ####################### addi T1, T1, 3*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 lxvdsx vs4, o0, T1 addi T1, T1, 8 xvmuldp vs38, vs38, vs0 xvmuldp vs39, vs39, vs0 xvnmsubadp vs40, vs38, vs1 xvnmsubadp vs41, vs39, vs1 xvnmsubadp vs42, vs38, vs2 xvnmsubadp vs43, vs39, vs2 xvnmsubadp vs44, vs38, vs3 xvnmsubadp vs45, vs39, vs3 xvnmsubadp vs46, vs38, vs4 xvnmsubadp vs47, vs39, vs4 //############### OFFSET 4 ####################### addi T1, T1, 4*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 xvmuldp vs40, vs40, vs0 xvmuldp vs41, vs41, vs0 xvnmsubadp vs42, vs40, vs1 xvnmsubadp vs43, vs41, vs1 xvnmsubadp vs44, vs40, vs2 xvnmsubadp vs45, vs41, vs2 xvnmsubadp vs46, vs40, vs3 xvnmsubadp vs47, vs41, vs3 //############### OFFSET 5 ####################### addi T1, T1, 5*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 addi T1, T1, 24 xvmuldp vs42, vs42, vs0 xvmuldp vs43, vs43, vs0 xvnmsubadp vs44, vs42, vs1 xvnmsubadp vs45, vs43, vs1 xvnmsubadp vs46, vs42, vs2 xvnmsubadp vs47, vs43, vs2 //############### OFFSET 6 ####################### addi T1, T1, 6*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 addi T1, T1, 16 xvmuldp vs44, vs44, vs0 xvmuldp vs45, vs45, vs0 xvnmsubadp vs46, vs44, vs1 xvnmsubadp vs47, vs45, vs1 //############### OFFSET 7 ####################### addi T1, T1, 7*SIZE lxvdsx vs0, o0, T1 addi T1, T1, 8 xvmuldp vs46, vs46, vs0 xvmuldp vs47, vs47, vs0 //############### SAVE B ####################### mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 addi T1, T1, 64 stxvd2x vs36, o0, T1 stxvd2x vs37, o16, T1 stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 addi T1, T1, 64 stxvd2x vs40, o0, T1 stxvd2x vs41, o16, T1 stxvd2x vs42, o32, T1 stxvd2x vs43, o48, T1 addi T1, T1, 64 stxvd2x vs44, o0, T1 stxvd2x vs45, o16, T1 stxvd2x vs46, o32, T1 stxvd2x vs47, o48, T1 //############### SAVE C ####################### mr T1, CO add T2, CO, LDC stxsdx vs32, o0, T1 xxswapd vs32, vs32 stxsdx vs34, o8, T1 xxswapd vs34, vs34 stxsdx vs36, o16, T1 xxswapd vs36, vs36 stxsdx vs38, o24, T1 xxswapd vs38, vs38 addi T1, T1, 32 stxsdx vs40, o0, T1 xxswapd vs40, vs40 stxsdx vs42, o8, T1 xxswapd vs42, vs42 stxsdx vs44, o16, T1 xxswapd vs44, vs44 stxsdx vs46, o24, T1 xxswapd vs46, vs46 stxsdx vs32, o0, T2 stxsdx vs34, o8, T2 stxsdx vs36, o16, T2 stxsdx vs38, o24, T2 addi T2, T2, 32 stxsdx vs40, o0, T2 stxsdx vs42, o8, T2 stxsdx vs44, o16, T2 stxsdx vs46, o24, T2 mr T1, CO add T2, CO, LDC add T1, T2, LDC add T2, T1, LDC stxsdx vs33, o0, T1 xxswapd vs33, vs33 stxsdx vs35, o8, T1 xxswapd vs35, vs35 stxsdx vs37, o16, T1 xxswapd vs37, vs37 stxsdx vs39, o24, T1 xxswapd vs39, vs39 addi T1, T1, 32 stxsdx vs41, o0, T1 xxswapd vs41, vs41 stxsdx vs43, o8, T1 xxswapd vs43, vs43 stxsdx 
vs45, o16, T1 xxswapd vs45, vs45 stxsdx vs47, o24, T1 xxswapd vs47, vs47 stxsdx vs33, o0, T2 stxsdx vs35, o8, T2 stxsdx vs37, o16, T2 stxsdx vs39, o24, T2 addi T2, T2, 32 stxsdx vs41, o0, T2 stxsdx vs43, o8, T2 stxsdx vs45, o16, T2 stxsdx vs47, o24, T2 .endm /*########################################################################################## SOLVE_LT 4x4 ##########################################################################################*/ .macro SOLVE_LT_4x4 xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs34, vs35, 0 xxpermdi vs2, vs32, vs33, 3 xxpermdi vs3, vs34, vs35, 3 xxpermdi vs4, vs36, vs37, 0 xxpermdi vs5, vs38, vs39, 0 xxpermdi vs6, vs36, vs37, 3 xxpermdi vs7, vs38, vs39, 3 //############### LOAD B ####################### mr T1, BO lxvd2x vs32, o0, T1 lxvd2x vs33, o16, T1 lxvd2x vs34, o32, T1 lxvd2x vs35, o48, T1 addi T1, T1, 64 lxvd2x vs36, o0, T1 lxvd2x vs37, o16, T1 lxvd2x vs38, o32, T1 lxvd2x vs39, o48, T1 xvsubdp vs32, vs32, vs0 xvsubdp vs33, vs33, vs1 xvsubdp vs34, vs34, vs2 xvsubdp vs35, vs35, vs3 xvsubdp vs36, vs36, vs4 xvsubdp vs37, vs37, vs5 xvsubdp vs38, vs38, vs6 xvsubdp vs39, vs39, vs7 mr T1, AO //############### OFFSET 0 ####################### lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 xvmuldp vs32, vs32, vs0 xvmuldp vs33, vs33, vs0 xvnmsubadp vs34, vs32, vs1 xvnmsubadp vs35, vs33, vs1 xvnmsubadp vs36, vs32, vs2 xvnmsubadp vs37, vs33, vs2 xvnmsubadp vs38, vs32, vs3 xvnmsubadp vs39, vs33, vs3 //############### OFFSET 1 ####################### addi T1, T1, 1*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 addi T1, T1, 24 xvmuldp vs34, vs34, vs0 xvmuldp vs35, vs35, vs0 xvnmsubadp vs36, vs34, vs1 xvnmsubadp vs37, vs35, vs1 xvnmsubadp vs38, vs34, vs2 xvnmsubadp vs39, vs35, vs2 //############### OFFSET 2 ####################### addi T1, T1, 2*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 addi T1, T1, 16 xvmuldp vs36, vs36, vs0 xvmuldp vs37, vs37, vs0 xvnmsubadp vs38, vs36, vs1 xvnmsubadp vs39, vs37, vs1 //############### OFFSET 3 ####################### addi T1, T1, 3*SIZE lxvdsx vs0, o0, T1 addi T1, T1, 8 xvmuldp vs38, vs38, vs0 xvmuldp vs39, vs39, vs0 //############### SAVE B ####################### mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 addi T1, T1, 64 stxvd2x vs36, o0, T1 stxvd2x vs37, o16, T1 stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 //############### SAVE C ####################### mr T1, CO add T2, CO, LDC stxsdx vs32, o0, T1 xxswapd vs32, vs32 stxsdx vs34, o8, T1 xxswapd vs34, vs34 stxsdx vs36, o16, T1 xxswapd vs36, vs36 stxsdx vs38, o24, T1 xxswapd vs38, vs38 stxsdx vs32, o0, T2 stxsdx vs34, o8, T2 stxsdx vs36, o16, T2 stxsdx vs38, o24, T2 mr T1, CO add T2, CO, LDC add T1, T2, LDC add T2, T1, LDC stxsdx vs33, o0, T1 xxswapd vs33, vs33 stxsdx vs35, o8, T1 xxswapd vs35, vs35 stxsdx vs37, o16, T1 xxswapd vs37, vs37 stxsdx vs39, o24, T1 xxswapd vs39, vs39 stxsdx vs33, o0, T2 stxsdx vs35, o8, T2 stxsdx vs37, o16, T2 stxsdx vs39, o24, T2 .endm /*########################################################################################## SOLVE_LT 2x4 ##########################################################################################*/ .macro SOLVE_LT_2x4 xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs34, vs35, 0 xxpermdi vs2, vs32, vs33, 3 xxpermdi vs3, vs34, vs35, 3 //############### LOAD B ####################### mr T1, BO lxvd2x vs32, o0, T1 lxvd2x vs33, o16, T1 lxvd2x vs34, o32, T1 lxvd2x vs35, o48, T1 xvsubdp vs32, vs32, vs0 xvsubdp 
vs33, vs33, vs1 xvsubdp vs34, vs34, vs2 xvsubdp vs35, vs35, vs3 mr T1, AO //############### OFFSET 0 ####################### lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 addi T1, T1, 16 xvmuldp vs32, vs32, vs0 xvmuldp vs33, vs33, vs0 xvnmsubadp vs34, vs32, vs1 xvnmsubadp vs35, vs33, vs1 //############### OFFSET 1 ####################### addi T1, T1, 1*SIZE lxvdsx vs0, o0, T1 addi T1, T1, 8 xvmuldp vs34, vs34, vs0 xvmuldp vs35, vs35, vs0 //############### SAVE B ####################### mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 //############### SAVE C ####################### mr T1, CO add T2, CO, LDC stxsdx vs32, o0, T1 xxswapd vs32, vs32 stxsdx vs34, o8, T1 xxswapd vs34, vs34 stxsdx vs32, o0, T2 stxsdx vs34, o8, T2 mr T1, CO add T2, CO, LDC add T1, T2, LDC add T2, T1, LDC stxsdx vs33, o0, T1 xxswapd vs33, vs33 stxsdx vs35, o8, T1 xxswapd vs35, vs35 stxsdx vs33, o0, T2 stxsdx vs35, o8, T2 .endm /*########################################################################################## SOLVE_LT 1x4 ##########################################################################################*/ .macro SOLVE_LT_1x4 xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs34, vs35, 0 //############### LOAD B ####################### mr T1, BO lxvd2x vs32, o0, T1 lxvd2x vs33, o16, T1 xvsubdp vs32, vs32, vs0 xvsubdp vs33, vs33, vs1 mr T1, AO //############### OFFSET 0 ####################### lxvdsx vs0, o0, T1 addi T1, T1, 8 xvmuldp vs32, vs32, vs0 xvmuldp vs33, vs33, vs0 //############### SAVE B ####################### mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 //############### SAVE C ####################### mr T1, CO add T2, CO, LDC stxsdx vs32, o0, T1 xxswapd vs32, vs32 stxsdx vs32, o0, T2 mr T1, CO add T2, CO, LDC add T1, T2, LDC add T2, T1, LDC stxsdx vs33, o0, T1 xxswapd vs33, vs33 stxsdx vs33, o0, T2 .endm .macro INIT_16x2 xxlxor vs0, vs0, vs0 xvmovdp vs32, vs0 xvmovdp vs33, vs0 xvmovdp vs34, vs0 xvmovdp vs35, vs0 xvmovdp vs36, vs0 xvmovdp vs37, vs0 xvmovdp vs38, vs0 xvmovdp vs39, vs0 xvmovdp vs40, vs0 xvmovdp vs41, vs0 xvmovdp vs42, vs0 xvmovdp vs43, vs0 xvmovdp vs44, vs0 xvmovdp vs45, vs0 xvmovdp vs46, vs0 xvmovdp vs47, vs0 .endm .macro KERNEL_16x2 lxvd2x vs0, o0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO addi AO, AO, 64 lxvd2x vs4, o0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 lxvdsx vs16, o0, BO lxvdsx vs17, o8, BO addi BO, BO, 16 xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, vs17 xvmaddadp vs36, vs2, vs16 xvmaddadp vs37, vs2, vs17 xvmaddadp vs38, vs3, vs16 xvmaddadp vs39, vs3, vs17 xvmaddadp vs40, vs4, vs16 xvmaddadp vs41, vs4, vs17 xvmaddadp vs42, vs5, vs16 xvmaddadp vs43, vs5, vs17 xvmaddadp vs44, vs6, vs16 xvmaddadp vs45, vs6, vs17 xvmaddadp vs46, vs7, vs16 xvmaddadp vs47, vs7, vs17 .endm .macro INIT_8x2 xxlxor vs0, vs0, vs0 xvmovdp vs32, vs0 xvmovdp vs33, vs0 xvmovdp vs34, vs0 xvmovdp vs35, vs0 xvmovdp vs36, vs0 xvmovdp vs37, vs0 xvmovdp vs38, vs0 xvmovdp vs39, vs0 .endm .macro KERNEL_8x2 lxvd2x vs0, o0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO addi AO, AO, 64 lxvdsx vs16, o0, BO lxvdsx vs17, o8, BO addi BO, BO, 16 xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, vs17 xvmaddadp vs36, vs2, vs16 xvmaddadp vs37, vs2, vs17 xvmaddadp vs38, vs3, vs16 xvmaddadp vs39, vs3, vs17 .endm .macro INIT_4x2 xxlxor vs0, vs0, vs0 xvmovdp vs32, vs0 xvmovdp vs33, vs0 xvmovdp 
vs34, vs0 xvmovdp vs35, vs0 .endm .macro KERNEL_4x2 lxvd2x vs0, o0, AO lxvd2x vs1, o16, AO addi AO, AO, 32 lxvdsx vs16, o0, BO lxvdsx vs17, o8, BO addi BO, BO, 16 xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, vs17 .endm .macro INIT_2x2 xxlxor vs0, vs0, vs0 xvmovdp vs32, vs0 xvmovdp vs33, vs0 .endm .macro KERNEL_2x2 lxvd2x vs0, o0, AO addi AO, AO, 16 lxvdsx vs16, o0, BO lxvdsx vs17, o8, BO addi BO, BO, 16 xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 .endm .macro INIT_1x2 xxlxor vs0, vs0, vs0 xvmovdp vs32, vs0 xvmovdp vs33, vs0 .endm .macro KERNEL_1x2 lxvdsx vs0, o0, AO addi AO, AO, 8 lxvdsx vs16, o0, BO lxvdsx vs17, o8, BO addi BO, BO, 16 xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 .endm /*########################################################################################## SOLVE_LT 16x2 ##########################################################################################*/ .macro SOLVE_LT_16x2 xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs32, vs33, 3 xxpermdi vs2, vs34, vs35, 0 xxpermdi vs3, vs34, vs35, 3 xxpermdi vs4, vs36, vs37, 0 xxpermdi vs5, vs36, vs37, 3 xxpermdi vs6, vs38, vs39, 0 xxpermdi vs7, vs38, vs39, 3 xxpermdi vs8, vs40, vs41, 0 xxpermdi vs9, vs40, vs41, 3 xxpermdi vs10, vs42, vs43, 0 xxpermdi vs11, vs42, vs43, 3 xxpermdi vs12, vs44, vs45, 0 xxpermdi vs13, vs44, vs45, 3 xxpermdi vs14, vs46, vs47, 0 xxpermdi vs15, vs46, vs47, 3 //############### LOAD B ####################### mr T1, BO lxvd2x vs32, o0, T1 lxvd2x vs33, o16, T1 lxvd2x vs34, o32, T1 lxvd2x vs35, o48, T1 addi T1, T1, 64 lxvd2x vs36, o0, T1 lxvd2x vs37, o16, T1 lxvd2x vs38, o32, T1 lxvd2x vs39, o48, T1 addi T1, T1, 64 lxvd2x vs40, o0, T1 lxvd2x vs41, o16, T1 lxvd2x vs42, o32, T1 lxvd2x vs43, o48, T1 addi T1, T1, 64 lxvd2x vs44, o0, T1 lxvd2x vs45, o16, T1 lxvd2x vs46, o32, T1 lxvd2x vs47, o48, T1 xvsubdp vs32, vs32, vs0 xvsubdp vs33, vs33, vs1 xvsubdp vs34, vs34, vs2 xvsubdp vs35, vs35, vs3 xvsubdp vs36, vs36, vs4 xvsubdp vs37, vs37, vs5 xvsubdp vs38, vs38, vs6 xvsubdp vs39, vs39, vs7 xvsubdp vs40, vs40, vs8 xvsubdp vs41, vs41, vs9 xvsubdp vs42, vs42, vs10 xvsubdp vs43, vs43, vs11 xvsubdp vs44, vs44, vs12 xvsubdp vs45, vs45, vs13 xvsubdp vs46, vs46, vs14 xvsubdp vs47, vs47, vs15 mr T1, AO //############### OFFSET 0 ####################### lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 lxvdsx vs7, o24, T1 addi T1, T1, 32 lxvdsx vs8, o0, T1 lxvdsx vs9, o8, T1 lxvdsx vs10, o16, T1 lxvdsx vs11, o24, T1 addi T1, T1, 32 lxvdsx vs12, o0, T1 lxvdsx vs13, o8, T1 lxvdsx vs14, o16, T1 lxvdsx vs15, o24, T1 addi T1, T1, 32 xvmuldp vs32, vs32, vs0 xvnmsubadp vs33, vs32, vs1 xvnmsubadp vs34, vs32, vs2 xvnmsubadp vs35, vs32, vs3 xvnmsubadp vs36, vs32, vs4 xvnmsubadp vs37, vs32, vs5 xvnmsubadp vs38, vs32, vs6 xvnmsubadp vs39, vs32, vs7 xvnmsubadp vs40, vs32, vs8 xvnmsubadp vs41, vs32, vs9 xvnmsubadp vs42, vs32, vs10 xvnmsubadp vs43, vs32, vs11 xvnmsubadp vs44, vs32, vs12 xvnmsubadp vs45, vs32, vs13 xvnmsubadp vs46, vs32, vs14 xvnmsubadp vs47, vs32, vs15 //############### OFFSET 1 ####################### addi T1, T1, 1*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 lxvdsx vs7, o24, T1 addi T1, T1, 32 lxvdsx vs8, o0, T1 lxvdsx vs9, o8, T1 lxvdsx vs10, o16, T1 lxvdsx vs11, o24, T1 addi T1, T1, 32 lxvdsx vs12, o0, T1 lxvdsx vs13, o8, T1 lxvdsx vs14, 
o16, T1 addi T1, T1, 24 xvmuldp vs33, vs33, vs0 xvnmsubadp vs34, vs33, vs1 xvnmsubadp vs35, vs33, vs2 xvnmsubadp vs36, vs33, vs3 xvnmsubadp vs37, vs33, vs4 xvnmsubadp vs38, vs33, vs5 xvnmsubadp vs39, vs33, vs6 xvnmsubadp vs40, vs33, vs7 xvnmsubadp vs41, vs33, vs8 xvnmsubadp vs42, vs33, vs9 xvnmsubadp vs43, vs33, vs10 xvnmsubadp vs44, vs33, vs11 xvnmsubadp vs45, vs33, vs12 xvnmsubadp vs46, vs33, vs13 xvnmsubadp vs47, vs33, vs14 //############### OFFSET 2 ####################### addi T1, T1, 2*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 lxvdsx vs7, o24, T1 addi T1, T1, 32 lxvdsx vs8, o0, T1 lxvdsx vs9, o8, T1 lxvdsx vs10, o16, T1 lxvdsx vs11, o24, T1 addi T1, T1, 32 lxvdsx vs12, o0, T1 lxvdsx vs13, o8, T1 addi T1, T1, 16 xvmuldp vs34, vs34, vs0 xvnmsubadp vs35, vs34, vs1 xvnmsubadp vs36, vs34, vs2 xvnmsubadp vs37, vs34, vs3 xvnmsubadp vs38, vs34, vs4 xvnmsubadp vs39, vs34, vs5 xvnmsubadp vs40, vs34, vs6 xvnmsubadp vs41, vs34, vs7 xvnmsubadp vs42, vs34, vs8 xvnmsubadp vs43, vs34, vs9 xvnmsubadp vs44, vs34, vs10 xvnmsubadp vs45, vs34, vs11 xvnmsubadp vs46, vs34, vs12 xvnmsubadp vs47, vs34, vs13 //############### OFFSET 3 ####################### addi T1, T1, 3*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 lxvdsx vs7, o24, T1 addi T1, T1, 32 lxvdsx vs8, o0, T1 lxvdsx vs9, o8, T1 lxvdsx vs10, o16, T1 lxvdsx vs11, o24, T1 addi T1, T1, 32 lxvdsx vs12, o0, T1 addi T1, T1, 8 xvmuldp vs35, vs35, vs0 xvnmsubadp vs36, vs35, vs1 xvnmsubadp vs37, vs35, vs2 xvnmsubadp vs38, vs35, vs3 xvnmsubadp vs39, vs35, vs4 xvnmsubadp vs40, vs35, vs5 xvnmsubadp vs41, vs35, vs6 xvnmsubadp vs42, vs35, vs7 xvnmsubadp vs43, vs35, vs8 xvnmsubadp vs44, vs35, vs9 xvnmsubadp vs45, vs35, vs10 xvnmsubadp vs46, vs35, vs11 xvnmsubadp vs47, vs35, vs12 //############### OFFSET 4 ####################### addi T1, T1, 4*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 lxvdsx vs7, o24, T1 addi T1, T1, 32 lxvdsx vs8, o0, T1 lxvdsx vs9, o8, T1 lxvdsx vs10, o16, T1 lxvdsx vs11, o24, T1 addi T1, T1, 32 xvmuldp vs36, vs36, vs0 xvnmsubadp vs37, vs36, vs1 xvnmsubadp vs38, vs36, vs2 xvnmsubadp vs39, vs36, vs3 xvnmsubadp vs40, vs36, vs4 xvnmsubadp vs41, vs36, vs5 xvnmsubadp vs42, vs36, vs6 xvnmsubadp vs43, vs36, vs7 xvnmsubadp vs44, vs36, vs8 xvnmsubadp vs45, vs36, vs9 xvnmsubadp vs46, vs36, vs10 xvnmsubadp vs47, vs36, vs11 //############### OFFSET 5 ####################### addi T1, T1, 5*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 lxvdsx vs7, o24, T1 addi T1, T1, 32 lxvdsx vs8, o0, T1 lxvdsx vs9, o8, T1 lxvdsx vs10, o16, T1 addi T1, T1, 24 xvmuldp vs37, vs37, vs0 xvnmsubadp vs38, vs37, vs1 xvnmsubadp vs39, vs37, vs2 xvnmsubadp vs40, vs37, vs3 xvnmsubadp vs41, vs37, vs4 xvnmsubadp vs42, vs37, vs5 xvnmsubadp vs43, vs37, vs6 xvnmsubadp vs44, vs37, vs7 xvnmsubadp vs45, vs37, vs8 xvnmsubadp vs46, vs37, vs9 xvnmsubadp vs47, vs37, vs10 //############### OFFSET 6 ####################### addi T1, T1, 6*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 lxvdsx vs7, o24, T1 addi T1, T1, 32 lxvdsx vs8, o0, T1 lxvdsx 
vs9, o8, T1 addi T1, T1, 16 xvmuldp vs38, vs38, vs0 xvnmsubadp vs39, vs38, vs1 xvnmsubadp vs40, vs38, vs2 xvnmsubadp vs41, vs38, vs3 xvnmsubadp vs42, vs38, vs4 xvnmsubadp vs43, vs38, vs5 xvnmsubadp vs44, vs38, vs6 xvnmsubadp vs45, vs38, vs7 xvnmsubadp vs46, vs38, vs8 xvnmsubadp vs47, vs38, vs9 //############### OFFSET 7 ####################### addi T1, T1, 7*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 lxvdsx vs7, o24, T1 addi T1, T1, 32 lxvdsx vs8, o0, T1 addi T1, T1, 8 xvmuldp vs39, vs39, vs0 xvnmsubadp vs40, vs39, vs1 xvnmsubadp vs41, vs39, vs2 xvnmsubadp vs42, vs39, vs3 xvnmsubadp vs43, vs39, vs4 xvnmsubadp vs44, vs39, vs5 xvnmsubadp vs45, vs39, vs6 xvnmsubadp vs46, vs39, vs7 xvnmsubadp vs47, vs39, vs8 //############### OFFSET 8 ####################### addi T1, T1, 8*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 lxvdsx vs7, o24, T1 addi T1, T1, 32 xvmuldp vs40, vs40, vs0 xvnmsubadp vs41, vs40, vs1 xvnmsubadp vs42, vs40, vs2 xvnmsubadp vs43, vs40, vs3 xvnmsubadp vs44, vs40, vs4 xvnmsubadp vs45, vs40, vs5 xvnmsubadp vs46, vs40, vs6 xvnmsubadp vs47, vs40, vs7 //############### OFFSET 9 ####################### addi T1, T1, 9*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 addi T1, T1, 24 xvmuldp vs41, vs41, vs0 xvnmsubadp vs42, vs41, vs1 xvnmsubadp vs43, vs41, vs2 xvnmsubadp vs44, vs41, vs3 xvnmsubadp vs45, vs41, vs4 xvnmsubadp vs46, vs41, vs5 xvnmsubadp vs47, vs41, vs6 //############### OFFSET 10 ####################### addi T1, T1, 10*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 addi T1, T1, 16 xvmuldp vs42, vs42, vs0 xvnmsubadp vs43, vs42, vs1 xvnmsubadp vs44, vs42, vs2 xvnmsubadp vs45, vs42, vs3 xvnmsubadp vs46, vs42, vs4 xvnmsubadp vs47, vs42, vs5 //############### OFFSET 11 ####################### addi T1, T1, 11*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 lxvdsx vs4, o0, T1 addi T1, T1, 8 xvmuldp vs43, vs43, vs0 xvnmsubadp vs44, vs43, vs1 xvnmsubadp vs45, vs43, vs2 xvnmsubadp vs46, vs43, vs3 xvnmsubadp vs47, vs43, vs4 //############### OFFSET 12 ####################### addi T1, T1, 12*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 xvmuldp vs44, vs44, vs0 xvnmsubadp vs45, vs44, vs1 xvnmsubadp vs46, vs44, vs2 xvnmsubadp vs47, vs44, vs3 //############### OFFSET 13 ####################### addi T1, T1, 13*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 addi T1, T1, 24 xvmuldp vs45, vs45, vs0 xvnmsubadp vs46, vs45, vs1 xvnmsubadp vs47, vs45, vs2 //############### OFFSET 14 ####################### addi T1, T1, 14*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 addi T1, T1, 16 xvmuldp vs46, vs46, vs0 xvnmsubadp vs47, vs46, vs1 //############### OFFSET 15 ####################### addi T1, T1, 15*SIZE lxvdsx vs0, o0, T1 addi T1, T1, 8 xvmuldp vs47, vs47, vs0 //############### SAVE B ####################### mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 addi T1, T1, 64 stxvd2x vs36, o0, T1 stxvd2x vs37, o16, T1 stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 addi T1, T1, 64 stxvd2x vs40, o0, T1 stxvd2x vs41, o16, T1 
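// SAVE B writes the solved 16x2 block back into the packed B panel at BO,
// presumably so that subsequent updates operate on the solved values; the
// SAVE C section that follows then scatters the same results into the two C
// columns using the stxsdx/xxswapd column-interleaved pattern described above.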
stxvd2x vs42, o32, T1 stxvd2x vs43, o48, T1 addi T1, T1, 64 stxvd2x vs44, o0, T1 stxvd2x vs45, o16, T1 stxvd2x vs46, o32, T1 stxvd2x vs47, o48, T1 //############### SAVE C ####################### mr T1, CO add T2, CO, LDC stxsdx vs32, o0, T1 xxswapd vs32, vs32 stxsdx vs33, o8, T1 xxswapd vs33, vs33 stxsdx vs34, o16, T1 xxswapd vs34, vs34 stxsdx vs35, o24, T1 xxswapd vs35, vs35 addi T1, T1, 32 stxsdx vs36, o0, T1 xxswapd vs36, vs36 stxsdx vs37, o8, T1 xxswapd vs37, vs37 stxsdx vs38, o16, T1 xxswapd vs38, vs38 stxsdx vs39, o24, T1 xxswapd vs39, vs39 addi T1, T1, 32 stxsdx vs40, o0, T1 xxswapd vs40, vs40 stxsdx vs41, o8, T1 xxswapd vs41, vs41 stxsdx vs42, o16, T1 xxswapd vs42, vs42 stxsdx vs43, o24, T1 xxswapd vs43, vs43 addi T1, T1, 32 stxsdx vs44, o0, T1 xxswapd vs44, vs44 stxsdx vs45, o8, T1 xxswapd vs45, vs45 stxsdx vs46, o16, T1 xxswapd vs46, vs46 stxsdx vs47, o24, T1 xxswapd vs47, vs47 stxsdx vs32, o0, T2 stxsdx vs33, o8, T2 stxsdx vs34, o16, T2 stxsdx vs35, o24, T2 addi T2, T2, 32 stxsdx vs36, o0, T2 stxsdx vs37, o8, T2 stxsdx vs38, o16, T2 stxsdx vs39, o24, T2 addi T2, T2, 32 stxsdx vs40, o0, T2 stxsdx vs41, o8, T2 stxsdx vs42, o16, T2 stxsdx vs43, o24, T2 addi T2, T2, 32 stxsdx vs44, o0, T2 stxsdx vs45, o8, T2 stxsdx vs46, o16, T2 stxsdx vs47, o24, T2 .endm /*########################################################################################## SOLVE_LT 8x2 ##########################################################################################*/ .macro SOLVE_LT_8x2 xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs32, vs33, 3 xxpermdi vs2, vs34, vs35, 0 xxpermdi vs3, vs34, vs35, 3 xxpermdi vs4, vs36, vs37, 0 xxpermdi vs5, vs36, vs37, 3 xxpermdi vs6, vs38, vs39, 0 xxpermdi vs7, vs38, vs39, 3 //############### LOAD B ####################### mr T1, BO lxvd2x vs32, o0, T1 lxvd2x vs33, o16, T1 lxvd2x vs34, o32, T1 lxvd2x vs35, o48, T1 addi T1, T1, 64 lxvd2x vs36, o0, T1 lxvd2x vs37, o16, T1 lxvd2x vs38, o32, T1 lxvd2x vs39, o48, T1 xvsubdp vs32, vs32, vs0 xvsubdp vs33, vs33, vs1 xvsubdp vs34, vs34, vs2 xvsubdp vs35, vs35, vs3 xvsubdp vs36, vs36, vs4 xvsubdp vs37, vs37, vs5 xvsubdp vs38, vs38, vs6 xvsubdp vs39, vs39, vs7 mr T1, AO //############### OFFSET 0 ####################### lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 lxvdsx vs7, o24, T1 addi T1, T1, 32 xvmuldp vs32, vs32, vs0 xvnmsubadp vs33, vs32, vs1 xvnmsubadp vs34, vs32, vs2 xvnmsubadp vs35, vs32, vs3 xvnmsubadp vs36, vs32, vs4 xvnmsubadp vs37, vs32, vs5 xvnmsubadp vs38, vs32, vs6 xvnmsubadp vs39, vs32, vs7 //############### OFFSET 1 ####################### addi T1, T1, 1*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 lxvdsx vs6, o16, T1 addi T1, T1, 24 xvmuldp vs33, vs33, vs0 xvnmsubadp vs34, vs33, vs1 xvnmsubadp vs35, vs33, vs2 xvnmsubadp vs36, vs33, vs3 xvnmsubadp vs37, vs33, vs4 xvnmsubadp vs38, vs33, vs5 xvnmsubadp vs39, vs33, vs6 //############### OFFSET 2 ####################### addi T1, T1, 2*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 lxvdsx vs4, o0, T1 lxvdsx vs5, o8, T1 addi T1, T1, 16 xvmuldp vs34, vs34, vs0 xvnmsubadp vs35, vs34, vs1 xvnmsubadp vs36, vs34, vs2 xvnmsubadp vs37, vs34, vs3 xvnmsubadp vs38, vs34, vs4 xvnmsubadp vs39, vs34, vs5 //############### OFFSET 3 ####################### addi T1, T1, 3*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 
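// In each OFFSET k block of these solvers the code first skips the leading k
// entries of the packed factor row (addi T1, T1, k*SIZE) and then loads only
// the remaining values: the diagonal (pre-inverted, by the look of the xvmuldp
// that multiplies rather than divides) plus the multipliers for the rows still
// to be updated, which is why each successive OFFSET block loads one value fewer.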
lxvdsx vs3, o24, T1 addi T1, T1, 32 lxvdsx vs4, o0, T1 addi T1, T1, 8 xvmuldp vs35, vs35, vs0 xvnmsubadp vs36, vs35, vs1 xvnmsubadp vs37, vs35, vs2 xvnmsubadp vs38, vs35, vs3 xvnmsubadp vs39, vs35, vs4 //############### OFFSET 4 ####################### addi T1, T1, 4*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 xvmuldp vs36, vs36, vs0 xvnmsubadp vs37, vs36, vs1 xvnmsubadp vs38, vs36, vs2 xvnmsubadp vs39, vs36, vs3 //############### OFFSET 5 ####################### addi T1, T1, 5*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 addi T1, T1, 24 xvmuldp vs37, vs37, vs0 xvnmsubadp vs38, vs37, vs1 xvnmsubadp vs39, vs37, vs2 //############### OFFSET 6 ####################### addi T1, T1, 6*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 addi T1, T1, 16 xvmuldp vs38, vs38, vs0 xvnmsubadp vs39, vs38, vs1 //############### OFFSET 7 ####################### addi T1, T1, 7*SIZE lxvdsx vs0, o0, T1 addi T1, T1, 8 xvmuldp vs39, vs39, vs0 //############### SAVE B ####################### mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 addi T1, T1, 64 stxvd2x vs36, o0, T1 stxvd2x vs37, o16, T1 stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 //############### SAVE C ####################### mr T1, CO add T2, CO, LDC stxsdx vs32, o0, T1 xxswapd vs32, vs32 stxsdx vs33, o8, T1 xxswapd vs33, vs33 stxsdx vs34, o16, T1 xxswapd vs34, vs34 stxsdx vs35, o24, T1 xxswapd vs35, vs35 addi T1, T1, 32 stxsdx vs36, o0, T1 xxswapd vs36, vs36 stxsdx vs37, o8, T1 xxswapd vs37, vs37 stxsdx vs38, o16, T1 xxswapd vs38, vs38 stxsdx vs39, o24, T1 xxswapd vs39, vs39 stxsdx vs32, o0, T2 stxsdx vs33, o8, T2 stxsdx vs34, o16, T2 stxsdx vs35, o24, T2 addi T2, T2, 32 stxsdx vs36, o0, T2 stxsdx vs37, o8, T2 stxsdx vs38, o16, T2 stxsdx vs39, o24, T2 .endm /*########################################################################################## SOLVE_LT 4x2 ##########################################################################################*/ .macro SOLVE_LT_4x2 xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs32, vs33, 3 xxpermdi vs2, vs34, vs35, 0 xxpermdi vs3, vs34, vs35, 3 //############### LOAD B ####################### mr T1, BO lxvd2x vs32, o0, T1 lxvd2x vs33, o16, T1 lxvd2x vs34, o32, T1 lxvd2x vs35, o48, T1 xvsubdp vs32, vs32, vs0 xvsubdp vs33, vs33, vs1 xvsubdp vs34, vs34, vs2 xvsubdp vs35, vs35, vs3 mr T1, AO //############### OFFSET 0 ####################### lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 lxvdsx vs3, o24, T1 addi T1, T1, 32 xvmuldp vs32, vs32, vs0 xvnmsubadp vs33, vs32, vs1 xvnmsubadp vs34, vs32, vs2 xvnmsubadp vs35, vs32, vs3 //############### OFFSET 1 ####################### addi T1, T1, 1*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 lxvdsx vs2, o16, T1 addi T1, T1, 24 xvmuldp vs33, vs33, vs0 xvnmsubadp vs34, vs33, vs1 xvnmsubadp vs35, vs33, vs2 //############### OFFSET 2 ####################### addi T1, T1, 2*SIZE lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 addi T1, T1, 16 xvmuldp vs34, vs34, vs0 xvnmsubadp vs35, vs34, vs1 //############### OFFSET 3 ####################### addi T1, T1, 3*SIZE lxvdsx vs0, o0, T1 addi T1, T1, 8 xvmuldp vs35, vs35, vs0 //############### SAVE B ####################### mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 //############### SAVE C ####################### mr T1, CO add T2, CO, LDC stxsdx vs32, o0, T1 xxswapd vs32, vs32 stxsdx vs33, o8, T1 xxswapd vs33, vs33 stxsdx vs34, o16, T1 xxswapd vs34, vs34 stxsdx vs35, 
o24, T1 xxswapd vs35, vs35 stxsdx vs32, o0, T2 stxsdx vs33, o8, T2 stxsdx vs34, o16, T2 stxsdx vs35, o24, T2 .endm /*########################################################################################## SOLVE_LT 2x2 ##########################################################################################*/ .macro SOLVE_LT_2x2 xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs32, vs33, 3 //############### LOAD B ####################### mr T1, BO lxvd2x vs32, o0, T1 lxvd2x vs33, o16, T1 xvsubdp vs32, vs32, vs0 xvsubdp vs33, vs33, vs1 mr T1, AO //############### OFFSET 0 ####################### lxvdsx vs0, o0, T1 lxvdsx vs1, o8, T1 addi T1, T1, 16 xvmuldp vs32, vs32, vs0 xvnmsubadp vs33, vs32, vs1 //############### OFFSET 1 ####################### addi T1, T1, 1*SIZE lxvdsx vs0, o0, T1 addi T1, T1, 8 xvmuldp vs33, vs33, vs0 //############### SAVE B ####################### mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 //############### SAVE C ####################### mr T1, CO add T2, CO, LDC stxsdx vs32, o0, T1 xxswapd vs32, vs32 stxsdx vs33, o8, T1 xxswapd vs33, vs33 stxsdx vs32, o0, T2 stxsdx vs33, o8, T2 .endm /*########################################################################################## SOLVE_LT 1x2 ##########################################################################################*/ .macro SOLVE_LT_1x2 xxpermdi vs0, vs32, vs33, 0 //############### LOAD B ####################### mr T1, BO lxvd2x vs32, o0, T1 xvsubdp vs32, vs32, vs0 mr T1, AO //############### OFFSET 0 ####################### lxvdsx vs0, o0, T1 addi T1, T1, 8 xvmuldp vs32, vs32, vs0 //############### SAVE B ####################### mr T1, BO stxvd2x vs32, o0, T1 //############### SAVE C ####################### mr T1, CO add T2, CO, LDC stxsdx vs32, o0, T1 xxswapd vs32, vs32 stxsdx vs32, o0, T2 .endm .macro INIT_16x1 xxlxor vs0, vs0, vs0 xvmovdp vs32, vs0 xvmovdp vs33, vs0 xvmovdp vs34, vs0 xvmovdp vs35, vs0 xvmovdp vs36, vs0 xvmovdp vs37, vs0 xvmovdp vs38, vs0 xvmovdp vs39, vs0 xvmovdp vs40, vs0 xvmovdp vs41, vs0 xvmovdp vs42, vs0 xvmovdp vs43, vs0 xvmovdp vs44, vs0 xvmovdp vs45, vs0 xvmovdp vs46, vs0 xvmovdp vs47, vs0 .endm .macro KERNEL_16x1 lxvdsx vs0, o0, AO lxvdsx vs1, o8, AO lxvdsx vs2, o16, AO lxvdsx vs3, o24, AO addi AO, AO, 32 lxvdsx vs4, o0, AO lxvdsx vs5, o8, AO lxvdsx vs6, o16, AO lxvdsx vs7, o24, AO addi AO, AO, 32 lxvdsx vs8, o0, AO lxvdsx vs9, o8, AO lxvdsx vs10, o16, AO lxvdsx vs11, o24, AO addi AO, AO, 32 lxvdsx vs12, o0, AO lxvdsx vs13, o8, AO lxvdsx vs14, o16, AO lxvdsx vs15, o24, AO addi AO, AO, 32 lxvdsx vs16, o0, BO addi BO, BO, 8 xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs1, vs16 xvmaddadp vs34, vs2, vs16 xvmaddadp vs35, vs3, vs16 xvmaddadp vs36, vs4, vs16 xvmaddadp vs37, vs5, vs16 xvmaddadp vs38, vs6, vs16 xvmaddadp vs39, vs7, vs16 xvmaddadp vs40, vs8, vs16 xvmaddadp vs41, vs9, vs16 xvmaddadp vs42, vs10, vs16 xvmaddadp vs43, vs11, vs16 xvmaddadp vs44, vs12, vs16 xvmaddadp vs45, vs13, vs16 xvmaddadp vs46, vs14, vs16 xvmaddadp vs47, vs15, vs16 .endm .macro INIT_8x1 xxlxor vs0, vs0, vs0 xvmovdp vs32, vs0 xvmovdp vs33, vs0 xvmovdp vs34, vs0 xvmovdp vs35, vs0 xvmovdp vs36, vs0 xvmovdp vs37, vs0 xvmovdp vs38, vs0 xvmovdp vs39, vs0 .endm .macro KERNEL_8x1 lxvdsx vs0, o0, AO lxvdsx vs1, o8, AO lxvdsx vs2, o16, AO lxvdsx vs3, o24, AO addi AO, AO, 32 lxvdsx vs4, o0, AO lxvdsx vs5, o8, AO lxvdsx vs6, o16, AO lxvdsx vs7, o24, AO addi AO, AO, 32 lxvdsx vs16, o0, BO addi BO, BO, 8 xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs1, vs16 xvmaddadp vs34, vs2, vs16 xvmaddadp vs35, vs3, vs16 
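// In the Nx1 micro-kernels each A element is splatted into both doubleword
// lanes with lxvdsx and multiplied by the single splatted B value in vs16, so
// the xvmaddadp accumulators effectively perform scalar multiply-adds inside
// vector registers; the Nx2 kernels above instead load A with lxvd2x as true
// vector pairs and splat two B values per iteration.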
xvmaddadp vs36, vs4, vs16 xvmaddadp vs37, vs5, vs16 xvmaddadp vs38, vs6, vs16 xvmaddadp vs39, vs7, vs16 .endm .macro INIT_4x1 xxlxor vs0, vs0, vs0 xvmovdp vs32, vs0 xvmovdp vs33, vs0 xvmovdp vs34, vs0 xvmovdp vs35, vs0 .endm .macro KERNEL_4x1 lxvdsx vs0, o0, AO lxvdsx vs1, o8, AO lxvdsx vs2, o16, AO lxvdsx vs3, o24, AO addi AO, AO, 32 lxvdsx vs16, o0, BO addi BO, BO, 8 xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs1, vs16 xvmaddadp vs34, vs2, vs16 xvmaddadp vs35, vs3, vs16 .endm .macro INIT_2x1 xxlxor vs0, vs0, vs0 xvmovdp vs32, vs0 xvmovdp vs33, vs0 .endm .macro KERNEL_2x1 lxvdsx vs0, o0, AO lxvdsx vs1, o8, AO addi AO, AO, 16 lxvdsx vs16, o0, BO addi BO, BO, 8 xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs1, vs16 .endm .macro INIT_1x1 xxlxor vs0, vs0, vs0 xvmovdp vs32, vs0 .endm .macro KERNEL_1x1 lxvdsx vs0, o0, AO addi AO, AO, 8 lxvdsx vs16, o0, BO addi BO, BO, 8 xvmaddadp vs32, vs0, vs16 .endm /*########################################################################################## SOLVE_LT 16x1 ##########################################################################################*/ .macro SOLVE_LT_16x1 xxswapd vs0, vs32 xxswapd vs1, vs33 xxswapd vs2, vs34 xxswapd vs3, vs35 xxswapd vs4, vs36 xxswapd vs5, vs37 xxswapd vs6, vs38 xxswapd vs7, vs39 xxswapd vs8, vs40 xxswapd vs9, vs41 xxswapd vs10, vs42 xxswapd vs11, vs43 xxswapd vs12, vs44 xxswapd vs13, vs45 xxswapd vs14, vs46 xxswapd vs15, vs47 //############### LOAD B ####################### mr T1, BO lxsdx vs32, o0, T1 lxsdx vs33, o8, T1 lxsdx vs34, o16, T1 lxsdx vs35, o24, T1 addi T1, T1, 32 lxsdx vs36, o0, T1 lxsdx vs37, o8, T1 lxsdx vs38, o16, T1 lxsdx vs39, o24, T1 addi T1, T1, 32 lxsdx vs40, o0, T1 lxsdx vs41, o8, T1 lxsdx vs42, o16, T1 lxsdx vs43, o24, T1 addi T1, T1, 32 lxsdx vs44, o0, T1 lxsdx vs45, o8, T1 lxsdx vs46, o16, T1 lxsdx vs47, o24, T1 xssubdp vs32, vs32, vs0 xssubdp vs33, vs33, vs1 xssubdp vs34, vs34, vs2 xssubdp vs35, vs35, vs3 xssubdp vs36, vs36, vs4 xssubdp vs37, vs37, vs5 xssubdp vs38, vs38, vs6 xssubdp vs39, vs39, vs7 xssubdp vs40, vs40, vs8 xssubdp vs41, vs41, vs9 xssubdp vs42, vs42, vs10 xssubdp vs43, vs43, vs11 xssubdp vs44, vs44, vs12 xssubdp vs45, vs45, vs13 xssubdp vs46, vs46, vs14 xssubdp vs47, vs47, vs15 mr T1, AO //############### OFFSET 0 ####################### lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 lxsdx vs3, o24, T1 addi T1, T1, 32 lxsdx vs4, o0, T1 lxsdx vs5, o8, T1 lxsdx vs6, o16, T1 lxsdx vs7, o24, T1 addi T1, T1, 32 lxsdx vs8, o0, T1 lxsdx vs9, o8, T1 lxsdx vs10, o16, T1 lxsdx vs11, o24, T1 addi T1, T1, 32 lxsdx vs12, o0, T1 lxsdx vs13, o8, T1 lxsdx vs14, o16, T1 lxsdx vs15, o24, T1 addi T1, T1, 32 xsmuldp vs32, vs32, vs0 xsnmsubadp vs33, vs32, vs1 xsnmsubadp vs34, vs32, vs2 xsnmsubadp vs35, vs32, vs3 xsnmsubadp vs36, vs32, vs4 xsnmsubadp vs37, vs32, vs5 xsnmsubadp vs38, vs32, vs6 xsnmsubadp vs39, vs32, vs7 xsnmsubadp vs40, vs32, vs8 xsnmsubadp vs41, vs32, vs9 xsnmsubadp vs42, vs32, vs10 xsnmsubadp vs43, vs32, vs11 xsnmsubadp vs44, vs32, vs12 xsnmsubadp vs45, vs32, vs13 xsnmsubadp vs46, vs32, vs14 xsnmsubadp vs47, vs32, vs15 //############### OFFSET 1 ####################### addi T1, T1, 1*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 lxsdx vs3, o24, T1 addi T1, T1, 32 lxsdx vs4, o0, T1 lxsdx vs5, o8, T1 lxsdx vs6, o16, T1 lxsdx vs7, o24, T1 addi T1, T1, 32 lxsdx vs8, o0, T1 lxsdx vs9, o8, T1 lxsdx vs10, o16, T1 lxsdx vs11, o24, T1 addi T1, T1, 32 lxsdx vs12, o0, T1 lxsdx vs13, o8, T1 lxsdx vs14, o16, T1 addi T1, T1, 24 xsmuldp vs33, vs33, vs0 xsnmsubadp vs34, 
vs33, vs1 xsnmsubadp vs35, vs33, vs2 xsnmsubadp vs36, vs33, vs3 xsnmsubadp vs37, vs33, vs4 xsnmsubadp vs38, vs33, vs5 xsnmsubadp vs39, vs33, vs6 xsnmsubadp vs40, vs33, vs7 xsnmsubadp vs41, vs33, vs8 xsnmsubadp vs42, vs33, vs9 xsnmsubadp vs43, vs33, vs10 xsnmsubadp vs44, vs33, vs11 xsnmsubadp vs45, vs33, vs12 xsnmsubadp vs46, vs33, vs13 xsnmsubadp vs47, vs33, vs14 //############### OFFSET 2 ####################### addi T1, T1, 2*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 lxsdx vs3, o24, T1 addi T1, T1, 32 lxsdx vs4, o0, T1 lxsdx vs5, o8, T1 lxsdx vs6, o16, T1 lxsdx vs7, o24, T1 addi T1, T1, 32 lxsdx vs8, o0, T1 lxsdx vs9, o8, T1 lxsdx vs10, o16, T1 lxsdx vs11, o24, T1 addi T1, T1, 32 lxsdx vs12, o0, T1 lxsdx vs13, o8, T1 addi T1, T1, 16 xsmuldp vs34, vs34, vs0 xsnmsubadp vs35, vs34, vs1 xsnmsubadp vs36, vs34, vs2 xsnmsubadp vs37, vs34, vs3 xsnmsubadp vs38, vs34, vs4 xsnmsubadp vs39, vs34, vs5 xsnmsubadp vs40, vs34, vs6 xsnmsubadp vs41, vs34, vs7 xsnmsubadp vs42, vs34, vs8 xsnmsubadp vs43, vs34, vs9 xsnmsubadp vs44, vs34, vs10 xsnmsubadp vs45, vs34, vs11 xsnmsubadp vs46, vs34, vs12 xsnmsubadp vs47, vs34, vs13 //############### OFFSET 3 ####################### addi T1, T1, 3*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 lxsdx vs3, o24, T1 addi T1, T1, 32 lxsdx vs4, o0, T1 lxsdx vs5, o8, T1 lxsdx vs6, o16, T1 lxsdx vs7, o24, T1 addi T1, T1, 32 lxsdx vs8, o0, T1 lxsdx vs9, o8, T1 lxsdx vs10, o16, T1 lxsdx vs11, o24, T1 addi T1, T1, 32 lxsdx vs12, o0, T1 addi T1, T1, 8 xsmuldp vs35, vs35, vs0 xsnmsubadp vs36, vs35, vs1 xsnmsubadp vs37, vs35, vs2 xsnmsubadp vs38, vs35, vs3 xsnmsubadp vs39, vs35, vs4 xsnmsubadp vs40, vs35, vs5 xsnmsubadp vs41, vs35, vs6 xsnmsubadp vs42, vs35, vs7 xsnmsubadp vs43, vs35, vs8 xsnmsubadp vs44, vs35, vs9 xsnmsubadp vs45, vs35, vs10 xsnmsubadp vs46, vs35, vs11 xsnmsubadp vs47, vs35, vs12 //############### OFFSET 4 ####################### addi T1, T1, 4*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 lxsdx vs3, o24, T1 addi T1, T1, 32 lxsdx vs4, o0, T1 lxsdx vs5, o8, T1 lxsdx vs6, o16, T1 lxsdx vs7, o24, T1 addi T1, T1, 32 lxsdx vs8, o0, T1 lxsdx vs9, o8, T1 lxsdx vs10, o16, T1 lxsdx vs11, o24, T1 addi T1, T1, 32 xsmuldp vs36, vs36, vs0 xsnmsubadp vs37, vs36, vs1 xsnmsubadp vs38, vs36, vs2 xsnmsubadp vs39, vs36, vs3 xsnmsubadp vs40, vs36, vs4 xsnmsubadp vs41, vs36, vs5 xsnmsubadp vs42, vs36, vs6 xsnmsubadp vs43, vs36, vs7 xsnmsubadp vs44, vs36, vs8 xsnmsubadp vs45, vs36, vs9 xsnmsubadp vs46, vs36, vs10 xsnmsubadp vs47, vs36, vs11 //############### OFFSET 5 ####################### addi T1, T1, 5*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 lxsdx vs3, o24, T1 addi T1, T1, 32 lxsdx vs4, o0, T1 lxsdx vs5, o8, T1 lxsdx vs6, o16, T1 lxsdx vs7, o24, T1 addi T1, T1, 32 lxsdx vs8, o0, T1 lxsdx vs9, o8, T1 lxsdx vs10, o16, T1 addi T1, T1, 24 xsmuldp vs37, vs37, vs0 xsnmsubadp vs38, vs37, vs1 xsnmsubadp vs39, vs37, vs2 xsnmsubadp vs40, vs37, vs3 xsnmsubadp vs41, vs37, vs4 xsnmsubadp vs42, vs37, vs5 xsnmsubadp vs43, vs37, vs6 xsnmsubadp vs44, vs37, vs7 xsnmsubadp vs45, vs37, vs8 xsnmsubadp vs46, vs37, vs9 xsnmsubadp vs47, vs37, vs10 //############### OFFSET 6 ####################### addi T1, T1, 6*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 lxsdx vs3, o24, T1 addi T1, T1, 32 lxsdx vs4, o0, T1 lxsdx vs5, o8, T1 lxsdx vs6, o16, T1 lxsdx vs7, o24, T1 addi T1, T1, 32 lxsdx vs8, o0, T1 lxsdx vs9, o8, T1 addi T1, T1, 16 xsmuldp vs38, vs38, vs0 xsnmsubadp vs39, vs38, vs1 xsnmsubadp vs40, vs38, vs2 xsnmsubadp vs41, 
vs38, vs3 xsnmsubadp vs42, vs38, vs4 xsnmsubadp vs43, vs38, vs5 xsnmsubadp vs44, vs38, vs6 xsnmsubadp vs45, vs38, vs7 xsnmsubadp vs46, vs38, vs8 xsnmsubadp vs47, vs38, vs9 //############### OFFSET 7 ####################### addi T1, T1, 7*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 lxsdx vs3, o24, T1 addi T1, T1, 32 lxsdx vs4, o0, T1 lxsdx vs5, o8, T1 lxsdx vs6, o16, T1 lxsdx vs7, o24, T1 addi T1, T1, 32 lxsdx vs8, o0, T1 addi T1, T1, 8 xsmuldp vs39, vs39, vs0 xsnmsubadp vs40, vs39, vs1 xsnmsubadp vs41, vs39, vs2 xsnmsubadp vs42, vs39, vs3 xsnmsubadp vs43, vs39, vs4 xsnmsubadp vs44, vs39, vs5 xsnmsubadp vs45, vs39, vs6 xsnmsubadp vs46, vs39, vs7 xsnmsubadp vs47, vs39, vs8 //############### OFFSET 8 ####################### addi T1, T1, 8*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 lxsdx vs3, o24, T1 addi T1, T1, 32 lxsdx vs4, o0, T1 lxsdx vs5, o8, T1 lxsdx vs6, o16, T1 lxsdx vs7, o24, T1 addi T1, T1, 32 xsmuldp vs40, vs40, vs0 xsnmsubadp vs41, vs40, vs1 xsnmsubadp vs42, vs40, vs2 xsnmsubadp vs43, vs40, vs3 xsnmsubadp vs44, vs40, vs4 xsnmsubadp vs45, vs40, vs5 xsnmsubadp vs46, vs40, vs6 xsnmsubadp vs47, vs40, vs7 //############### OFFSET 9 ####################### addi T1, T1, 9*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 lxsdx vs3, o24, T1 addi T1, T1, 32 lxsdx vs4, o0, T1 lxsdx vs5, o8, T1 lxsdx vs6, o16, T1 addi T1, T1, 24 xsmuldp vs41, vs41, vs0 xsnmsubadp vs42, vs41, vs1 xsnmsubadp vs43, vs41, vs2 xsnmsubadp vs44, vs41, vs3 xsnmsubadp vs45, vs41, vs4 xsnmsubadp vs46, vs41, vs5 xsnmsubadp vs47, vs41, vs6 //############### OFFSET 10 ####################### addi T1, T1, 10*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 lxsdx vs3, o24, T1 addi T1, T1, 32 lxsdx vs4, o0, T1 lxsdx vs5, o8, T1 addi T1, T1, 16 xsmuldp vs42, vs42, vs0 xsnmsubadp vs43, vs42, vs1 xsnmsubadp vs44, vs42, vs2 xsnmsubadp vs45, vs42, vs3 xsnmsubadp vs46, vs42, vs4 xsnmsubadp vs47, vs42, vs5 //############### OFFSET 11 ####################### addi T1, T1, 11*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 lxsdx vs3, o24, T1 addi T1, T1, 32 lxsdx vs4, o0, T1 addi T1, T1, 8 xsmuldp vs43, vs43, vs0 xsnmsubadp vs44, vs43, vs1 xsnmsubadp vs45, vs43, vs2 xsnmsubadp vs46, vs43, vs3 xsnmsubadp vs47, vs43, vs4 //############### OFFSET 12 ####################### addi T1, T1, 12*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 lxsdx vs3, o24, T1 addi T1, T1, 32 xsmuldp vs44, vs44, vs0 xsnmsubadp vs45, vs44, vs1 xsnmsubadp vs46, vs44, vs2 xsnmsubadp vs47, vs44, vs3 //############### OFFSET 13 ####################### addi T1, T1, 13*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 addi T1, T1, 24 xsmuldp vs45, vs45, vs0 xsnmsubadp vs46, vs45, vs1 xsnmsubadp vs47, vs45, vs2 //############### OFFSET 14 ####################### addi T1, T1, 14*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 addi T1, T1, 16 xsmuldp vs46, vs46, vs0 xsnmsubadp vs47, vs46, vs1 //############### OFFSET 15 ####################### addi T1, T1, 15*SIZE lxsdx vs0, o0, T1 addi T1, T1, 8 xsmuldp vs47, vs47, vs0 //############### SAVE B ####################### mr T1, BO stxsdx vs32, o0, T1 stxsdx vs33, o8, T1 stxsdx vs34, o16, T1 stxsdx vs35, o24, T1 addi T1, T1, 32 stxsdx vs36, o0, T1 stxsdx vs37, o8, T1 stxsdx vs38, o16, T1 stxsdx vs39, o24, T1 addi T1, T1, 32 stxsdx vs40, o0, T1 stxsdx vs41, o8, T1 stxsdx vs42, o16, T1 stxsdx vs43, o24, T1 addi T1, T1, 32 stxsdx vs44, o0, T1 stxsdx vs45, o8, T1 stxsdx vs46, o16, T1 stxsdx vs47, o24, T1 //############### SAVE C 
####################### mr T1, CO stxsdx vs32, o0, T1 stxsdx vs33, o8, T1 stxsdx vs34, o16, T1 stxsdx vs35, o24, T1 addi T1, T1, 32 stxsdx vs36, o0, T1 stxsdx vs37, o8, T1 stxsdx vs38, o16, T1 stxsdx vs39, o24, T1 addi T1, T1, 32 stxsdx vs40, o0, T1 stxsdx vs41, o8, T1 stxsdx vs42, o16, T1 stxsdx vs43, o24, T1 addi T1, T1, 32 stxsdx vs44, o0, T1 stxsdx vs45, o8, T1 stxsdx vs46, o16, T1 stxsdx vs47, o24, T1 .endm /*########################################################################################## SOLVE_LT 8x1 ##########################################################################################*/ .macro SOLVE_LT_8x1 xxswapd vs0, vs32 xxswapd vs1, vs33 xxswapd vs2, vs34 xxswapd vs3, vs35 xxswapd vs4, vs36 xxswapd vs5, vs37 xxswapd vs6, vs38 xxswapd vs7, vs39 //############### LOAD B ####################### mr T1, BO lxsdx vs32, o0, T1 lxsdx vs33, o8, T1 lxsdx vs34, o16, T1 lxsdx vs35, o24, T1 addi T1, T1, 32 lxsdx vs36, o0, T1 lxsdx vs37, o8, T1 lxsdx vs38, o16, T1 lxsdx vs39, o24, T1 xssubdp vs32, vs32, vs0 xssubdp vs33, vs33, vs1 xssubdp vs34, vs34, vs2 xssubdp vs35, vs35, vs3 xssubdp vs36, vs36, vs4 xssubdp vs37, vs37, vs5 xssubdp vs38, vs38, vs6 xssubdp vs39, vs39, vs7 mr T1, AO //############### OFFSET 0 ####################### lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 lxsdx vs3, o24, T1 addi T1, T1, 32 lxsdx vs4, o0, T1 lxsdx vs5, o8, T1 lxsdx vs6, o16, T1 lxsdx vs7, o24, T1 addi T1, T1, 32 xsmuldp vs32, vs32, vs0 xsnmsubadp vs33, vs32, vs1 xsnmsubadp vs34, vs32, vs2 xsnmsubadp vs35, vs32, vs3 xsnmsubadp vs36, vs32, vs4 xsnmsubadp vs37, vs32, vs5 xsnmsubadp vs38, vs32, vs6 xsnmsubadp vs39, vs32, vs7 //############### OFFSET 1 ####################### addi T1, T1, 1*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 lxsdx vs3, o24, T1 addi T1, T1, 32 lxsdx vs4, o0, T1 lxsdx vs5, o8, T1 lxsdx vs6, o16, T1 addi T1, T1, 24 xsmuldp vs33, vs33, vs0 xsnmsubadp vs34, vs33, vs1 xsnmsubadp vs35, vs33, vs2 xsnmsubadp vs36, vs33, vs3 xsnmsubadp vs37, vs33, vs4 xsnmsubadp vs38, vs33, vs5 xsnmsubadp vs39, vs33, vs6 //############### OFFSET 2 ####################### addi T1, T1, 2*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 lxsdx vs3, o24, T1 addi T1, T1, 32 lxsdx vs4, o0, T1 lxsdx vs5, o8, T1 addi T1, T1, 16 xsmuldp vs34, vs34, vs0 xsnmsubadp vs35, vs34, vs1 xsnmsubadp vs36, vs34, vs2 xsnmsubadp vs37, vs34, vs3 xsnmsubadp vs38, vs34, vs4 xsnmsubadp vs39, vs34, vs5 //############### OFFSET 3 ####################### addi T1, T1, 3*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 lxsdx vs3, o24, T1 addi T1, T1, 32 lxsdx vs4, o0, T1 addi T1, T1, 8 xsmuldp vs35, vs35, vs0 xsnmsubadp vs36, vs35, vs1 xsnmsubadp vs37, vs35, vs2 xsnmsubadp vs38, vs35, vs3 xsnmsubadp vs39, vs35, vs4 //############### OFFSET 4 ####################### addi T1, T1, 4*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 lxsdx vs3, o24, T1 addi T1, T1, 32 xsmuldp vs36, vs36, vs0 xsnmsubadp vs37, vs36, vs1 xsnmsubadp vs38, vs36, vs2 xsnmsubadp vs39, vs36, vs3 //############### OFFSET 5 ####################### addi T1, T1, 5*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 addi T1, T1, 24 xsmuldp vs37, vs37, vs0 xsnmsubadp vs38, vs37, vs1 xsnmsubadp vs39, vs37, vs2 //############### OFFSET 6 ####################### addi T1, T1, 6*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 addi T1, T1, 16 xsmuldp vs38, vs38, vs0 xsnmsubadp vs39, vs38, vs1 //############### OFFSET 7 ####################### addi T1, T1, 7*SIZE lxsdx vs0, o0, T1 addi T1, T1, 8 xsmuldp vs39, 
vs39, vs0 //############### SAVE B ####################### mr T1, BO stxsdx vs32, o0, T1 stxsdx vs33, o8, T1 stxsdx vs34, o16, T1 stxsdx vs35, o24, T1 addi T1, T1, 32 stxsdx vs36, o0, T1 stxsdx vs37, o8, T1 stxsdx vs38, o16, T1 stxsdx vs39, o24, T1 //############### SAVE C ####################### mr T1, CO stxsdx vs32, o0, T1 stxsdx vs33, o8, T1 stxsdx vs34, o16, T1 stxsdx vs35, o24, T1 addi T1, T1, 32 stxsdx vs36, o0, T1 stxsdx vs37, o8, T1 stxsdx vs38, o16, T1 stxsdx vs39, o24, T1 .endm /*########################################################################################## SOLVE_LT 4x1 ##########################################################################################*/ .macro SOLVE_LT_4x1 xxswapd vs0, vs32 xxswapd vs1, vs33 xxswapd vs2, vs34 xxswapd vs3, vs35 //############### LOAD B ####################### mr T1, BO lxsdx vs32, o0, T1 lxsdx vs33, o8, T1 lxsdx vs34, o16, T1 lxsdx vs35, o24, T1 xssubdp vs32, vs32, vs0 xssubdp vs33, vs33, vs1 xssubdp vs34, vs34, vs2 xssubdp vs35, vs35, vs3 mr T1, AO //############### OFFSET 0 ####################### lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 lxsdx vs3, o24, T1 addi T1, T1, 32 xsmuldp vs32, vs32, vs0 xsnmsubadp vs33, vs32, vs1 xsnmsubadp vs34, vs32, vs2 xsnmsubadp vs35, vs32, vs3 //############### OFFSET 1 ####################### addi T1, T1, 1*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 lxsdx vs2, o16, T1 addi T1, T1, 24 xsmuldp vs33, vs33, vs0 xsnmsubadp vs34, vs33, vs1 xsnmsubadp vs35, vs33, vs2 //############### OFFSET 2 ####################### addi T1, T1, 2*SIZE lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 addi T1, T1, 16 xsmuldp vs34, vs34, vs0 xsnmsubadp vs35, vs34, vs1 //############### OFFSET 3 ####################### addi T1, T1, 3*SIZE lxsdx vs0, o0, T1 addi T1, T1, 8 xsmuldp vs35, vs35, vs0 //############### SAVE B ####################### mr T1, BO stxsdx vs32, o0, T1 stxsdx vs33, o8, T1 stxsdx vs34, o16, T1 stxsdx vs35, o24, T1 //############### SAVE C ####################### mr T1, CO stxsdx vs32, o0, T1 stxsdx vs33, o8, T1 stxsdx vs34, o16, T1 stxsdx vs35, o24, T1 .endm /*########################################################################################## SOLVE_LT 2x1 ##########################################################################################*/ .macro SOLVE_LT_2x1 xxswapd vs0, vs32 xxswapd vs1, vs33 //############### LOAD B ####################### mr T1, BO lxsdx vs32, o0, T1 lxsdx vs33, o8, T1 xssubdp vs32, vs32, vs0 xssubdp vs33, vs33, vs1 mr T1, AO //############### OFFSET 0 ####################### lxsdx vs0, o0, T1 lxsdx vs1, o8, T1 addi T1, T1, 16 xsmuldp vs32, vs32, vs0 xsnmsubadp vs33, vs32, vs1 //############### OFFSET 1 ####################### addi T1, T1, 1*SIZE lxsdx vs0, o0, T1 addi T1, T1, 8 xsmuldp vs33, vs33, vs0 //############### SAVE B ####################### mr T1, BO stxsdx vs32, o0, T1 stxsdx vs33, o8, T1 //############### SAVE C ####################### mr T1, CO stxsdx vs32, o0, T1 stxsdx vs33, o8, T1 .endm /*########################################################################################## SOLVE_LT 1x1 ##########################################################################################*/ .macro SOLVE_LT_1x1 xxswapd vs0, vs32 //############### LOAD B ####################### mr T1, BO lxsdx vs32, o0, T1 xssubdp vs32, vs32, vs0 mr T1, AO //############### OFFSET 0 ####################### lxsdx vs0, o0, T1 addi T1, T1, 8 xsmuldp vs32, vs32, vs0 //############### SAVE B ####################### mr T1, BO stxsdx vs32, o0, T1 //############### SAVE C 
####################### mr T1, CO stxsdx vs32, o0, T1 .endm OpenBLAS-0.2.20/kernel/power/exfunc.S000066400000000000000000000057271313527062700172070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" .machine "any" .globl .rpcc .rpcc: mftb r3 rlinm r3, r3, 3, 0, 31 # ldc(scaling) bcr BO_ALWAYS,CR0_LT .globl .blas_lock .blas_lock: cal r7, 1(r0) LL(0): l r6, 0(r3) cmpi CR0, r6, 0 bne LL(2) lwarx r6, r0, r3 cmpwi CR6, r6, 0 bne LL(2) stwcx. r7, r0, r3 bne- LL(0) LL(1): bcr BO_ALWAYS,CR0_LT LL(2): b LL(0) OpenBLAS-0.2.20/kernel/power/gemm_beta.S000066400000000000000000000142031313527062700176240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M r3 #define N r4 #define C r10 #define LDC r11 #define J r5 #define PRE r6 #define CO1 r7 #define ALPHA f31 #define STACKSIZE 32 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f31, 16(SP) stw r0, 24(SP) #ifdef linux #ifndef __64BIT__ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #else ld C, FRAMESLOT(0) + STACKSIZE(SP) ld LDC, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld C, FRAMESLOT(0) + STACKSIZE(SP) ld LDC, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz C, FRAMESLOT(0) + STACKSIZE(SP) lwz LDC, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif slwi LDC, LDC, BASE_SHIFT fmr ALPHA, f1 lfs f0, 24(SP) cmpwi cr0, M, 0 ble- LL(999) cmpwi cr0, N, 0 ble- LL(999) mr J, N fcmpu cr7, f1, f0 bne cr7, LL(20) .align 4 LL(10): mr CO1, C add C, C, LDC addi PRE, 0, 32 * SIZE srawi. r0, M, 4 mtspr CTR, r0 ble LL(15) .align 4 LL(12): STFD f0, 0 * SIZE(CO1) STFD f0, 1 * SIZE(CO1) STFD f0, 2 * SIZE(CO1) STFD f0, 3 * SIZE(CO1) STFD f0, 4 * SIZE(CO1) STFD f0, 5 * SIZE(CO1) STFD f0, 6 * SIZE(CO1) STFD f0, 7 * SIZE(CO1) STFD f0, 8 * SIZE(CO1) STFD f0, 9 * SIZE(CO1) STFD f0, 10 * SIZE(CO1) STFD f0, 11 * SIZE(CO1) STFD f0, 12 * SIZE(CO1) STFD f0, 13 * SIZE(CO1) STFD f0, 14 * SIZE(CO1) STFD f0, 15 * SIZE(CO1) dcbst PRE, CO1 addi CO1, CO1, 16 * SIZE bdnz LL(12) .align 4 LL(15): andi. r0, M, 15 mtspr CTR, r0 beq LL(19) .align 4 LL(16): STFD f0, 0 * SIZE(CO1) addi CO1, CO1, 1 * SIZE bdnz LL(16) .align 4 LL(19): addic. J, J, -1 bgt LL(10) b LL(999) .align 4 LL(20): mr CO1, C add C, C, LDC addi PRE, 0, 16 * SIZE srawi. 
r0, M, 4 mtspr CTR, r0 ble LL(25) .align 4 LL(22): LFD f0, 0 * SIZE(CO1) LFD f1, 1 * SIZE(CO1) LFD f2, 2 * SIZE(CO1) LFD f3, 3 * SIZE(CO1) LFD f4, 4 * SIZE(CO1) LFD f5, 5 * SIZE(CO1) LFD f6, 6 * SIZE(CO1) LFD f7, 7 * SIZE(CO1) LFD f8, 8 * SIZE(CO1) LFD f9, 9 * SIZE(CO1) LFD f10, 10 * SIZE(CO1) LFD f11, 11 * SIZE(CO1) LFD f12, 12 * SIZE(CO1) LFD f13, 13 * SIZE(CO1) LFD f14, 14 * SIZE(CO1) LFD f15, 15 * SIZE(CO1) FMUL f0, ALPHA, f0 FMUL f1, ALPHA, f1 FMUL f2, ALPHA, f2 FMUL f3, ALPHA, f3 FMUL f4, ALPHA, f4 FMUL f5, ALPHA, f5 FMUL f6, ALPHA, f6 FMUL f7, ALPHA, f7 FMUL f8, ALPHA, f8 FMUL f9, ALPHA, f9 FMUL f10, ALPHA, f10 FMUL f11, ALPHA, f11 FMUL f12, ALPHA, f12 FMUL f13, ALPHA, f13 FMUL f14, ALPHA, f14 FMUL f15, ALPHA, f15 STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 4 * SIZE(CO1) STFD f5, 5 * SIZE(CO1) STFD f6, 6 * SIZE(CO1) STFD f7, 7 * SIZE(CO1) STFD f8, 8 * SIZE(CO1) STFD f9, 9 * SIZE(CO1) STFD f10, 10 * SIZE(CO1) STFD f11, 11 * SIZE(CO1) STFD f12, 12 * SIZE(CO1) STFD f13, 13 * SIZE(CO1) STFD f14, 14 * SIZE(CO1) STFD f15, 15 * SIZE(CO1) addi CO1, CO1, 16 * SIZE dcbtst PRE, CO1 bdnz LL(22) .align 4 LL(25): andi. r0, M, 15 mtspr CTR, r0 ble LL(29) .align 4 LL(26): LFD f0, 0 * SIZE(CO1) FMUL f0, f0, ALPHA STFD f0, 0 * SIZE(CO1) addi CO1, CO1, 1 * SIZE bdnz LL(26) .align 4 LL(29): addic. J, J, -1 bgt LL(20) .align 4 LL(999): li r3, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f31, 16(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/gemm_kernel.S000066400000000000000000001354221313527062700202000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define TEMP r18 #define KK r19 #define BB r20 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define CO3 r27 #define CO4 r28 #define PREA r29 #define PREB r30 #define PREC r31 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) #if defined(TRMMKERNEL) std r19, 240(SP) std r18, 248(SP) #endif #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) #if defined(TRMMKERNEL) stw r19, 192(SP) stw r18, 196(SP) #endif #endif stfd f1, ALPHA stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) #ifndef PREFETCHTEST /* Normal prefetch */ #ifdef PPC970 li PREC, 4 * SIZE #endif #ifdef POWER4 li PREC, 4 * SIZE /* is 12 best? 
*/ #endif #ifdef POWER5 li PREC, 3 * SIZE #endif #else #ifdef linux #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) lwz PREC, FRAMESLOT(1) + STACKSIZE(SP) #else ld PREA, FRAMESLOT(0) + STACKSIZE(SP) ld PREB, FRAMESLOT(1) + STACKSIZE(SP) ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld PREA, FRAMESLOT(0) + STACKSIZE(SP) ld PREB, FRAMESLOT(1) + STACKSIZE(SP) ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #else #ifdef DOUBLE lwz PREA, FRAMESLOT(1) + STACKSIZE(SP) lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else lwz PREA, FRAMESLOT(0) + STACKSIZE(SP) lwz PREB, FRAMESLOT(1) + STACKSIZE(SP) lwz PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #endif #endif #ifndef PREFETCHTEST #ifdef PPC970 #ifdef ALLOC_HUGETLB li PREA, (16 * 1 * SIZE) li PREB, (16 * 5 * SIZE) #else li PREA, (16 * 19 * SIZE) li PREB, (16 * 8 * SIZE) #endif #endif #ifdef POWER4 #ifdef ALLOC_HUGETLB li PREA, (16 * 1 * SIZE) li PREB, (16 * 1 * SIZE) #else li PREA, (16 * 2 * SIZE) li PREB, (16 * 2 * SIZE) #endif #endif #ifdef POWER5 #ifdef ALLOC_HUGETLB li PREA, (16 * 7 * SIZE) li PREB, (16 * 7 * SIZE) #else li PREA, (16 * 12 * SIZE) li PREB, (16 * 6 * SIZE) #endif #endif #endif srawi. J, N, 2 ble LL(40) .align 4 LL(10): mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif slwi BB, K, BASE_SHIFT + 2 lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. I, M, 2 mr AO, A add C, CO4, LDC ble LL(20) .align 4 LL(11): #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) #ifdef POWER5 LFD f28, 4 * SIZE(B) LFD f29, 5 * SIZE(B) LFD f30, 6 * SIZE(B) LFD f31, 7 * SIZE(B) #endif mr BO, B #else slwi r0, KK, 2 + BASE_SHIFT add AO, AO, r0 add BO, B, r0 LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #ifdef POWER5 LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) LFD f31, 7 * SIZE(BO) #endif #endif DCBTST(CO1, PREC) DCBTST(CO2, PREC) DCBTST(CO3, PREC) DCBTST(CO4, PREC) dcbt B, BB addi BB, BB, 16 * SIZE #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 4 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP ble LL(15) #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) #ifdef POWER5 LFD f28, 4 * SIZE(B) LFD f29, 5 * SIZE(B) LFD f30, 6 * SIZE(B) LFD f31, 7 * SIZE(B) #endif DCBTST(CO1, PREC) DCBTST(CO2, PREC) DCBTST(CO3, PREC) DCBTST(CO4, PREC) dcbt B, BB addi BB, BB, 16 * SIZE srawi. 
r0, K, 2 mtspr CTR, r0 mr BO, B ble LL(15) #endif .align 4 LL(12): FMADD f0, f16, f20, f0 FMADD f5, f17, f21, f5 FMADD f10, f18, f22, f10 FMADD f15, f19, f23, f15 #if defined(ALLOC_HUGETLB) && !defined(POWER5) LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) LFD f31, 7 * SIZE(BO) #endif FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 #if !defined(ALLOC_HUGETLB) && !defined(POWER5) LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) LFD f31, 7 * SIZE(BO) #endif LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f11, f19, f22, f11 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 FMADD f14, f18, f23, f14 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f24, f28, f0 FMADD f5, f25, f29, f5 FMADD f10, f26, f30, f10 FMADD f15, f27, f31, f15 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMADD f1, f25, f28, f1 FMADD f2, f26, f28, f2 FMADD f3, f27, f28, f3 FMADD f4, f24, f29, f4 FMADD f6, f26, f29, f6 FMADD f7, f27, f29, f7 FMADD f8, f24, f30, f8 FMADD f9, f25, f30, f9 FMADD f11, f27, f30, f11 FMADD f12, f24, f31, f12 FMADD f13, f25, f31, f13 FMADD f14, f26, f31, f14 LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f5, f17, f21, f5 FMADD f10, f18, f22, f10 FMADD f15, f19, f23, f15 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f11, f19, f22, f11 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 FMADD f14, f18, f23, f14 #ifndef POWER5 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) #else LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) #endif FMADD f0, f24, f28, f0 FMADD f5, f25, f29, f5 FMADD f10, f26, f30, f10 FMADD f15, f27, f31, f15 #ifndef POWER5 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) #else LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) #endif FMADD f1, f25, f28, f1 FMADD f2, f26, f28, f2 FMADD f3, f27, f28, f3 FMADD f4, f24, f29, f4 FMADD f6, f26, f29, f6 FMADD f7, f27, f29, f7 FMADD f8, f24, f30, f8 FMADD f9, f25, f30, f9 FMADD f11, f27, f30, f11 FMADD f12, f24, f31, f12 FMADD f13, f25, f31, f13 FMADD f14, f26, f31, f14 #if (L2_SIZE == 1024976) && defined (ALLOC_HUGETLB) nop nop nop nop #endif #ifdef POWER5 LFD f28, 20 * SIZE(BO) LFD f29, 21 * SIZE(BO) LFD f30, 22 * SIZE(BO) LFD f31, 23 * SIZE(BO) #endif addi AO, AO, 16 * SIZE addi BO, BO, 16 * SIZE #ifdef PPC970 #ifndef ALLOC_HUGETLB DCBT(AO, PREA) #endif DCBT(BO, PREB) #endif #ifdef POWER4 #ifndef ALLOC_HUGETLB DCBT(AO, PREA) #endif DCBT(BO, PREB) #endif #ifdef POWER5 #ifndef ALLOC_HUGETLB DCBT(BO, PREB) DCBT(AO, PREA) #endif #endif bdnz LL(12) .align 4 LL(15): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. 
r0, K, 3 mtspr CTR, r0 #endif ble+ LL(18) .align 4 LL(16): FMADD f0, f16, f20, f0 FMADD f5, f17, f21, f5 FMADD f10, f18, f22, f10 FMADD f15, f19, f23, f15 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f11, f19, f22, f11 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 FMADD f14, f18, f23, f14 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(16) .align 4 LL(18): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f2, f2, f30, f18 FMADD f3, f3, f30, f19 FMADD f4, f4, f30, f20 FMADD f5, f5, f30, f21 FMADD f6, f6, f30, f22 FMADD f7, f7, f30, f23 LFD f16, 0 * SIZE(CO3) LFD f17, 1 * SIZE(CO3) LFD f18, 2 * SIZE(CO3) LFD f19, 3 * SIZE(CO3) LFD f20, 0 * SIZE(CO4) LFD f21, 1 * SIZE(CO4) LFD f22, 2 * SIZE(CO4) LFD f23, 3 * SIZE(CO4) FMADD f8, f8, f30, f16 FMADD f9, f9, f30, f17 FMADD f10, f10, f30, f18 FMADD f11, f11, f30, f19 FMADD f12, f12, f30, f20 FMADD f13, f13, f30, f21 FMADD f14, f14, f30, f22 FMADD f15, f15, f30, f23 #else FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 FMUL f3, f3, f30 FMUL f4, f4, f30 FMUL f5, f5, f30 FMUL f6, f6, f30 FMUL f7, f7, f30 FMUL f8, f8, f30 FMUL f9, f9, f30 FMUL f10, f10, f30 FMUL f11, f11, f30 FMUL f12, f12, f30 FMUL f13, f13, f30 FMUL f14, f14, f30 FMUL f15, f15, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f10, 2 * SIZE(CO3) STFD f11, 3 * SIZE(CO3) fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) STFD f14, 2 * SIZE(CO4) STFD f15, 3 * SIZE(CO4) fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -4 #endif slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif addic. I, I, -1 bgt+ LL(11) .align 4 LL(20): andi. 
I, M, 2 ble LL(30) #if defined(TRMMKERNEL) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 4 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble LL(25) .align 5 LL(22): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 16 * SIZE DCBT(BO, PREB) bdnz LL(22) fadd f0, f2, f0 fadd f1, f3, f1 fadd f4, f6, f4 fadd f5, f7, f5 fadd f8, f10, f8 fadd f9, f11, f9 fadd f12, f14, f12 fadd f13, f15, f13 .align 4 LL(25): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. 
r0, K, 3 mtspr CTR, r0 #endif ble+ LL(28) .align 4 LL(26): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 2 * SIZE bdnz LL(26) .align 4 LL(28): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f4, f4, f30, f18 FMADD f5, f5, f30, f19 LFD f20, 0 * SIZE(CO3) LFD f21, 1 * SIZE(CO3) LFD f22, 0 * SIZE(CO4) LFD f23, 1 * SIZE(CO4) FMADD f8, f8, f30, f20 FMADD f9, f9, f30, f21 FMADD f12, f12, f30, f22 FMADD f13, f13, f30, f23 #else FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f4, f4, f30 FMUL f5, f5, f30 FMUL f8, f8, f30 FMUL f9, f9, f30 FMUL f12, f12, f30 FMUL f13, f13, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -4 #endif slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 LL(30): andi. I, M, 1 ble LL(39) #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 4 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. 
r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble LL(35) .align 5 LL(32): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f1, f17, f24, f1 FMADD f5, f17, f25, f5 FMADD f9, f17, f26, f9 FMADD f13, f17, f27, f13 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f0, f18, f20, f0 FMADD f4, f18, f21, f4 FMADD f8, f18, f22, f8 FMADD f12, f18, f23, f12 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f1, f19, f24, f1 FMADD f5, f19, f25, f5 FMADD f9, f19, f26, f9 FMADD f13, f19, f27, f13 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 16 * SIZE DCBT(BO, PREB) bdnz LL(32) fadd f0, f1, f0 fadd f4, f5, f4 fadd f8, f9, f8 fadd f12, f13, f12 .align 4 LL(35): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ LL(38) .align 4 LL(36): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f16, 1 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 1 * SIZE bdnz LL(36) .align 4 LL(38): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f20, 0 * SIZE(CO3) LFD f22, 0 * SIZE(CO4) FMADD f0, f0, f30, f16 FMADD f4, f4, f30, f18 FMADD f8, f8, f30, f20 FMADD f12, f12, f30, f22 #else FMUL f0, f0, f30 FMUL f4, f4, f30 FMUL f8, f8, f30 FMUL f12, f12, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 fmr f8, f0 fmr f9, f0 fmr f12, f0 fmr f13, f0 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -4 #endif slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 LL(39): #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 4 #endif mr B, BO addic. J, J, -1 bgt LL(10) .align 4 LL(40): mr CO1, C add CO2, C, LDC andi. J, N, 2 ble LL(70) #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
I, M, 2 add C, CO2, LDC mr AO, A ble LL(50) .align 4 LL(41): #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif DCBTST(CO1, PREC) DCBTST(CO2, PREC) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 2 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) DCBTST(CO1, PREC) DCBTST(CO2, PREC) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble LL(45) .align 5 LL(42): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE DCBT(BO, PREB) bdnz LL(42) .align 4 LL(45): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. 
r0, K, 3 mtspr CTR, r0 #endif ble+ LL(48) .align 4 LL(46): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(46) .align 4 LL(48): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f2, f2, f30, f18 FMADD f3, f3, f30, f19 FMADD f4, f4, f30, f20 FMADD f5, f5, f30, f21 FMADD f6, f6, f30, f22 FMADD f7, f7, f30, f23 #else FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 FMUL f3, f3, f30 FMUL f4, f4, f30 FMUL f5, f5, f30 FMUL f6, f6, f30 FMUL f7, f7, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif addic. I, I, -1 bgt+ LL(41) .align 4 LL(50): andi. I, M, 2 ble LL(60) #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. 
r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble LL(55) .align 5 LL(52): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 FMADD f4, f18, f22, f4 FMADD f5, f19, f22, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f16, f24, f0 FMADD f1, f17, f24, f1 FMADD f2, f16, f25, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f26, f4 FMADD f5, f19, f26, f5 FMADD f6, f18, f27, f6 FMADD f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE DCBT(BO, PREB) bdnz LL(52) .align 4 LL(55): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ LL(58) .align 4 LL(56): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 2 * SIZE bdnz LL(56) .align 4 LL(58): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f2, f2, f30, f18 FMADD f3, f3, f30, f19 #else FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 FMUL f3, f3, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 LL(60): andi. I, M, 1 ble LL(69) #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif srawi. 
TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble LL(65) .align 5 LL(62): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f17, f22, f2 FMADD f3, f17, f23, f3 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f19, f26, f2 FMADD f3, f19, f27, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(62) .align 4 LL(65): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ LL(68) .align 4 LL(66): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 LFD f16, 1 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 1 * SIZE bdnz LL(66) .align 4 LL(68): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) FADD f0, f2, f0 FADD f1, f3, f1 FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f18 #else FADD f0, f2, f0 FADD f1, f3, f1 FMUL f0, f0, f30 FMUL f1, f1, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 1 #endif #endif .align 4 LL(69): #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 2 #endif mr B, BO .align 4 LL(70): mr CO1, C andi. J, N, 1 ble LL(999) #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. I, M, 2 mr AO, A ble LL(80) .align 4 LL(71): #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif DCBTST(CO1, PREC) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 1 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) DCBTST(CO1, PREC) srawi. 
r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble LL(75) .align 5 LL(72): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f21, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f23, f0 FMADD f1, f17, f23, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE DCBT(BO, PREB) bdnz LL(72) .align 4 LL(75): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ LL(78) .align 4 LL(76): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 4 * SIZE bdnz LL(76) .align 4 LL(78): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f2, f2, f30, f18 FMADD f3, f3, f30, f19 #else FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 FMUL f3, f3, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -1 #endif slwi r0 , TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif addi CO1, CO1, 4 * SIZE addic. I, I, -1 bgt+ LL(71) .align 4 LL(80): andi. I, M, 2 ble LL(90) #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. 
r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble LL(85) .align 5 LL(82): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 4 * SIZE DCBT(BO, PREB) bdnz LL(82) .align 4 LL(85): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ LL(88) .align 4 LL(86): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 2 * SIZE bdnz LL(86) .align 4 LL(88): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) FADD f0, f2, f0 FADD f1, f3, f1 FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 #else FADD f0, f2, f0 FADD f1, f3, f1 FMUL f0, f0, f30 FMUL f1, f1, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 addi CO1, CO1, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -1 #endif slwi r0 , TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 LL(90): andi. I, M, 1 ble LL(999) #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif srawi. TEMP, TEMP, 3 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. 
r0, K, 3 mtspr CTR, r0 mr BO, B #endif ble LL(95) .align 5 LL(92): FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(92) .align 4 LL(95): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP #else andi. r0, K, 7 mtspr CTR, r0 #endif ble+ LL(98) .align 4 LL(96): FMADD f0, f16, f20, f0 LFD f16, 1 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 1 * SIZE bdnz LL(96) .align 4 LL(98): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 FMADD f0, f0, f30, f16 #else FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 FMUL f0, f0, f30 #endif STFD f0, 0 * SIZE(CO1) .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) #if defined(TRMMKERNEL) || defined(TRSMKERNEL) ld r19, 240(SP) ld r18, 248(SP) #endif #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) #if defined(TRMMKERNEL) || defined(TRSMKERNEL) lwz r19, 192(SP) lwz r18, 196(SP) #endif #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/gemm_kernel_altivec.S000066400000000000000000001375601313527062700217140ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 360 #else #define STACKSIZE 272 #endif #define ALPHA 0 #define FZERO 16 #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #endif #endif #define STACK r11 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define CO3 r27 #define CO4 r28 #define PREA r29 #define PREB r29 #define PREC r30 #define VREG r31 #define LOAD_A lvx #define LOAD_B lvx #define OFFSET_0 0 #define OFFSET_1 r14 #define OFFSET_2 r15 #define OFFSET_3 r16 #define OFFSET_4 r17 #define OFFSET_5 r18 #define OFFSET_6 r19 #define OFFSET_7 r20 #define c01 v0 #define c02 v1 #define c03 v2 #define c04 v3 #define c05 v4 #define c06 v5 #define c07 v6 #define c08 v7 #define c09 v8 #define c10 v9 #define c11 v10 #define c12 v11 #define c13 v12 #define c14 v13 #define c15 v14 #define c16 v15 #define a1 v16 #define a2 v17 #define a3 v18 #define a4 v19 #define a5 v20 #define a6 v21 #define a7 v22 #define a8 v23 #define b1 v24 #define b2 v25 #define bp1 v26 #define bp2 v27 #define C1 v16 #define C2 v17 #define C3 v18 #define C4 v19 #define C5 v20 #define C6 v21 #define C7 v22 #define C8 v23 #define C9 v24 #define c00 v25 #define PERMRSHIFT1 v26 #define PERMRSHIFT2 v27 #define PERMRSHIFT3 v28 #define PERMRSHIFT4 v29 #define VZERO v30 #define alpha v31 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE mr STACK, SP li r0, 0 * 16 stvx v20, SP, r0 li r0, 1 * 16 stvx v21, SP, r0 li r0, 2 * 16 stvx v22, SP, r0 li r0, 3 * 16 stvx v23, SP, r0 li r0, 4 * 16 stvx v24, SP, r0 li r0, 5 * 16 stvx v25, SP, r0 li r0, 6 * 16 stvx v26, SP, r0 li r0, 7 * 16 stvx v27, SP, r0 li r0, 8 * 16 stvx v28, SP, r0 li r0, 9 * 16 stvx v29, SP, r0 li r0, 10 * 16 stvx v30, SP, r0 li r0, 11 * 16 stvx v31, SP, r0 #ifdef __64BIT__ std r31, 192(SP) std r30, 200(SP) std r29, 208(SP) std r28, 216(SP) std r27, 224(SP) std r26, 232(SP) std r25, 240(SP) std r24, 248(SP) std r23, 256(SP) std r22, 264(SP) std r21, 272(SP) std r20, 280(SP) std r19, 288(SP) std r18, 296(SP) std r17, 304(SP) std r16, 312(SP) std r15, 320(SP) std r14, 328(SP) #else stw r31, 192(SP) stw r30, 196(SP) stw r29, 200(SP) stw r28, 204(SP) stw r27, 208(SP) stw r26, 212(SP) stw r25, 216(SP) 
stw r24, 220(SP) stw r23, 224(SP) stw r22, 228(SP) stw r21, 232(SP) stw r20, 236(SP) stw r19, 240(SP) stw r18, 244(SP) stw r17, 248(SP) stw r16, 252(SP) stw r15, 256(SP) stw r14, 260(SP) #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif li r0, -1 mfspr VREG, VRsave mtspr VRsave, r0 addi SP, SP, -128 li r0, -128 and SP, SP, r0 li OFFSET_1, 4 * SIZE li OFFSET_2, 8 * SIZE li OFFSET_3, 12 * SIZE li OFFSET_4, 16 * SIZE li OFFSET_5, 20 * SIZE li OFFSET_6, 24 * SIZE li OFFSET_7, 28 * SIZE stfs f1, ALPHA + 0(SP) stfs f1, ALPHA + 4(SP) stfs f1, ALPHA + 8(SP) stfs f1, ALPHA + 12(SP) li r29, 0 stw r29, FZERO(SP) slwi LDC, LDC, BASE_SHIFT li PREC, (15 * SIZE) #ifdef CELL li PREB, (3 * 32 * SIZE) #else li PREB, (5 * 32 * SIZE) #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) srawi. J, N, 2 ble LL(60) .align 4 LL(01): mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC add C, CO4, LDC mr AO, A srawi. I, M, 4 ble LL(20) .align 4 LL(11): vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 LOAD_A a1, OFFSET_0, AO vxor c03, c03, c03 LOAD_A a2, OFFSET_1, AO vxor c04, c04, c04 LOAD_A a3, OFFSET_2, AO vxor c05, c05, c05 LOAD_A a4, OFFSET_3, AO vxor c06, c06, c06 LOAD_A a5, OFFSET_4, AO vxor c07, c07, c07 nop vxor c08, c08, c08 vxor c09, c09, c09 dcbtst CO1, PREC vxor c10, c10, c10 dcbtst CO2, PREC vxor c11, c11, c11 dcbtst CO3, PREC vxor c12, c12, c12 dcbtst CO4, PREC vxor c13, c13, c13 mr BO, B vxor c14, c14, c14 srawi. r0, K, 2 vxor c15, c15, c15 mtspr CTR, r0 vxor c16, c16, c16 vspltw bp1, b1, 0 ble LL(13) .align 4 #define NOP1 mr r3, r3 #define NOP2 mr r4, r4 LL(12): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 DCBT(A, PREA) vmaddfp c03, a3, bp1, c03 NOP1 vmaddfp c04, a4, bp1, c04 vspltw bp1, b1, 2 vmaddfp c05, a1, bp2, c05 DCBT(B, PREB) vmaddfp c06, a2, bp2, c06 NOP2 vmaddfp c07, a3, bp2, c07 NOP1 vmaddfp c08, a4, bp2, c08 vspltw bp2, b1, 3 vmaddfp c09, a1, bp1, c09 NOP1 vmaddfp c10, a2, bp1, c10 LOAD_B b2, OFFSET_1, BO vmaddfp c11, a3, bp1, c11 addi BO, BO, 8 * SIZE vmaddfp c12, a4, bp1, c12 vspltw bp1, b2, 0 vmaddfp c13, a1, bp2, c13 NOP1 vmaddfp c14, a2, bp2, c14 LOAD_A a5, OFFSET_4, AO vmaddfp c15, a3, bp2, c15 LOAD_A a6, OFFSET_5, AO vmaddfp c16, a4, bp2, c16 vspltw bp2, b2, 1 vmaddfp c01, a5, bp1, c01 LOAD_A a7, OFFSET_6, AO vmaddfp c02, a6, bp1, c02 LOAD_A a8, OFFSET_7, AO vmaddfp c03, a7, bp1, c03 NOP1 vmaddfp c04, a8, bp1, c04 NOP2 vmaddfp c05, a5, bp2, c05 vspltw bp1, b2, 2 vmaddfp c06, a6, bp2, c06 addi AO, AO, 32 * SIZE vmaddfp c07, a7, bp2, c07 LOAD_B b1, OFFSET_0, BO vmaddfp c08, a8, bp2, c08 vspltw bp2, b2, 3 vmaddfp c09, a5, bp1, c09 NOP1 vmaddfp c10, a6, bp1, c10 NOP2 vmaddfp c11, a7, bp1, c11 NOP1 vmaddfp c12, a8, bp1, c12 vspltw bp1, b1, 0 vmaddfp c13, a5, bp2, c13 DCBT(A, PREA) vmaddfp c14, a6, bp2, c14 LOAD_A a1, OFFSET_0, AO vmaddfp c15, a7, bp2, c15 LOAD_A a2, OFFSET_1, AO vmaddfp c16, a8, bp2, c16 vspltw bp2, b1, 1 vmaddfp c01, a1, bp1, c01 LOAD_A a3, OFFSET_2, AO vmaddfp c02, a2, bp1, c02 LOAD_A a4, OFFSET_3, AO vmaddfp c03, a3, bp1, c03 NOP1 vmaddfp c04, a4, bp1, c04 vspltw bp1, b1, 2 vmaddfp c05, a1, bp2, c05 NOP1 vmaddfp c06, a2, bp2, c06 NOP2 vmaddfp c07, a3, bp2, c07 NOP1 vmaddfp c08, a4, bp2, c08 vspltw bp2, b1, 3 vmaddfp c09, a1, bp1, c09 LOAD_B b2, OFFSET_1, BO vmaddfp c10, a2, bp1, c10 NOP2 vmaddfp c11, a3, bp1, c11 NOP1 vmaddfp c12, a4, bp1, c12 addi BO, BO, 8 * SIZE vmaddfp c13, a1, bp2, c13 
vspltw bp1, b2, 0 vmaddfp c14, a2, bp2, c14 LOAD_A a5, OFFSET_4, AO vmaddfp c15, a3, bp2, c15 LOAD_A a6, OFFSET_5, AO vmaddfp c16, a4, bp2, c16 vspltw bp2, b2, 1 vmaddfp c01, a5, bp1, c01 LOAD_A a7, OFFSET_6, AO vmaddfp c02, a6, bp1, c02 LOAD_A a8, OFFSET_7, AO vmaddfp c03, a7, bp1, c03 addi AO, AO, 32 * SIZE vmaddfp c04, a8, bp1, c04 NOP2 vmaddfp c05, a5, bp2, c05 vspltw bp1, b2, 2 vmaddfp c06, a6, bp2, c06 NOP2 vmaddfp c07, a7, bp2, c07 NOP1 vmaddfp c08, a8, bp2, c08 LOAD_B b1, OFFSET_0, BO vmaddfp c09, a5, bp1, c09 vspltw bp2, b2, 3 vmaddfp c10, a6, bp1, c10 LOAD_A a1, OFFSET_0, AO // vmaddfp c11, a7, bp1, c11 LOAD_A a2, OFFSET_1, AO vmaddfp c12, a8, bp1, c12 NOP2 vmaddfp c13, a5, bp2, c13 vspltw bp1, b1, 0 vmaddfp c14, a6, bp2, c14 LOAD_A a3, OFFSET_2, AO vmaddfp c15, a7, bp2, c15 LOAD_A a4, OFFSET_3, AO vmaddfp c16, a8, bp2, c16 bdnz+ LL(12) .align 4 LL(13): andi. r0, K, 2 nop nop ble+ LL(15) .align 4 vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 NOP2 vmaddfp c03, a3, bp1, c03 NOP1 vmaddfp c04, a4, bp1, c04 NOP2 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 NOP2 vmaddfp c07, a3, bp2, c07 NOP1 vmaddfp c08, a4, bp2, c08 LOAD_B b2, OFFSET_1, BO vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 LOAD_A a5, OFFSET_4, AO vmaddfp c11, a3, bp1, c11 LOAD_A a6, OFFSET_5, AO vmaddfp c12, a4, bp1, c12 addi BO, BO, 8 * SIZE vmaddfp c13, a1, bp2, c13 vspltw bp1, b2, 0 vmaddfp c14, a2, bp2, c14 LOAD_A a7, OFFSET_6, AO vmaddfp c15, a3, bp2, c15 LOAD_A a8, OFFSET_7, AO vmaddfp c16, a4, bp2, c16 addi AO, AO, 32 * SIZE vmaddfp c01, a5, bp1, c01 vspltw bp2, b2, 1 vmaddfp c02, a6, bp1, c02 NOP2 vmaddfp c03, a7, bp1, c03 NOP1 vmaddfp c04, a8, bp1, c04 NOP2 vmaddfp c05, a5, bp2, c05 vspltw bp1, b2, 2 vmaddfp c06, a6, bp2, c06 NOP2 vmaddfp c07, a7, bp2, c07 NOP1 vmaddfp c08, a8, bp2, c08 LOAD_B b1, OFFSET_0, BO vmaddfp c09, a5, bp1, c09 vspltw bp2, b2, 3 vmaddfp c10, a6, bp1, c10 LOAD_A a1, OFFSET_0, AO vmaddfp c11, a7, bp1, c11 LOAD_A a2, OFFSET_1, AO vmaddfp c12, a8, bp1, c12 NOP2 vmaddfp c13, a5, bp2, c13 vspltw bp1, b1, 0 vmaddfp c14, a6, bp2, c14 LOAD_A a3, OFFSET_2, AO vmaddfp c15, a7, bp2, c15 LOAD_A a4, OFFSET_3, AO vmaddfp c16, a8, bp2, c16 .align 4 LL(15): andi. 
r0, K, 1 lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO ble+ LL(18) .align 4 vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 nop vmaddfp c03, a3, bp1, c03 nop vmaddfp c04, a4, bp1, c04 nop vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 nop vmaddfp c07, a3, bp2, c07 nop vmaddfp c08, a4, bp2, c08 nop vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 addi AO, AO, 16 * SIZE vmaddfp c11, a3, bp1, c11 addi BO, BO, 4 * SIZE vmaddfp c12, a4, bp1, c12 nop vmaddfp c13, a1, bp2, c13 vmaddfp c14, a2, bp2, c14 vmaddfp c15, a3, bp2, c15 vmaddfp c16, a4, bp2, c16 .align 4 LL(18): lvx C1, OFFSET_0, CO1 cmpwi cr0, LDC, 32 * SIZE lvx C2, OFFSET_1, CO1 lvsr PERMRSHIFT1, 0, CO1 lvx C3, OFFSET_2, CO1 lvsr PERMRSHIFT2, 0, CO2 lvx C4, OFFSET_3, CO1 lvsr PERMRSHIFT3, 0, CO3 lvx C5, OFFSET_4, CO1 lvsr PERMRSHIFT4, 0, CO4 ble LL(19) vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, c03, PERMRSHIFT1 vperm c03, c03, c04, PERMRSHIFT1 vperm c04, c04, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 lvx C1, OFFSET_0, CO2 vmaddfp c01, alpha, c01, C2 lvx C6, OFFSET_1, CO2 vmaddfp c02, alpha, c02, C3 lvx C7, OFFSET_2, CO2 vmaddfp c03, alpha, c03, C4 lvx C8, OFFSET_3, CO2 vmaddfp c04, alpha, c04, C5 lvx C9, OFFSET_4, CO2 stvx c00, OFFSET_0, CO1 vperm c00, VZERO, c05, PERMRSHIFT2 stvx c01, OFFSET_1, CO1 vperm c05, c05, c06, PERMRSHIFT2 stvx c02, OFFSET_2, CO1 vperm c06, c06, c07, PERMRSHIFT2 stvx c03, OFFSET_3, CO1 vperm c07, c07, c08, PERMRSHIFT2 stvx c04, OFFSET_4, CO1 vperm c08, c08, VZERO, PERMRSHIFT2 vmaddfp c00, alpha, c00, C1 lvx C1, OFFSET_0, CO3 vmaddfp c05, alpha, c05, C6 lvx C2, OFFSET_1, CO3 vmaddfp c06, alpha, c06, C7 lvx C3, OFFSET_2, CO3 vmaddfp c07, alpha, c07, C8 lvx C4, OFFSET_3, CO3 vmaddfp c08, alpha, c08, C9 lvx C5, OFFSET_4, CO3 stvx c00, OFFSET_0, CO2 vperm c00, VZERO, c09, PERMRSHIFT3 stvx c05, OFFSET_1, CO2 vperm c09, c09, c10, PERMRSHIFT3 stvx c06, OFFSET_2, CO2 vperm c10, c10, c11, PERMRSHIFT3 stvx c07, OFFSET_3, CO2 vperm c11, c11, c12, PERMRSHIFT3 stvx c08, OFFSET_4, CO2 vperm c12, c12, VZERO, PERMRSHIFT3 vmaddfp c00, alpha, c00, C1 lvx C9, OFFSET_4, CO4 vmaddfp c09, alpha, c09, C2 lvx C1, OFFSET_0, CO4 vmaddfp c10, alpha, c10, C3 lvx C6, OFFSET_1, CO4 vmaddfp c11, alpha, c11, C4 lvx C7, OFFSET_2, CO4 vmaddfp c12, alpha, c12, C5 lvx C8, OFFSET_3, CO4 stvx c00, OFFSET_0, CO3 vperm c00, VZERO, c13, PERMRSHIFT4 stvx c09, OFFSET_1, CO3 vperm c13, c13, c14, PERMRSHIFT4 stvx c10, OFFSET_2, CO3 vperm c14, c14, c15, PERMRSHIFT4 stvx c11, OFFSET_3, CO3 vperm c15, c15, c16, PERMRSHIFT4 stvx c12, OFFSET_4, CO3 vperm c16, c16, VZERO, PERMRSHIFT4 vmaddfp c00, alpha, c00, C1 vmaddfp c13, alpha, c13, C6 vmaddfp c14, alpha, c14, C7 vmaddfp c15, alpha, c15, C8 vmaddfp c16, alpha, c16, C9 stvx c00, OFFSET_0, CO4 stvx c13, OFFSET_1, CO4 stvx c14, OFFSET_2, CO4 stvx c15, OFFSET_3, CO4 stvx c16, OFFSET_4, CO4 addi CO1, CO1, 16 * SIZE addi CO2, CO2, 16 * SIZE addi CO3, CO3, 16 * SIZE addi CO4, CO4, 16 * SIZE addic. 
I, I, -1 bgt+ LL(11) b LL(20) .align 4 LL(19): lvx C6, OFFSET_1, CO2 lvx C7, OFFSET_2, CO2 lvx C8, OFFSET_3, CO2 lvx C9, OFFSET_4, CO2 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, c03, PERMRSHIFT1 vperm c03, c03, c04, PERMRSHIFT1 vperm c04, c04, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 lvx C2, OFFSET_1, CO3 vmaddfp c02, alpha, c02, C3 lvx C3, OFFSET_2, CO3 vmaddfp c03, alpha, c03, C4 lvx C4, OFFSET_3, CO3 vmaddfp c04, alpha, c04, C5 lvx C5, OFFSET_4, CO3 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 stvx c03, OFFSET_3, CO1 stvx c04, OFFSET_4, CO1 lvx C1, OFFSET_0, CO2 vperm c00, VZERO, c05, PERMRSHIFT2 vperm c05, c05, c06, PERMRSHIFT2 vperm c06, c06, c07, PERMRSHIFT2 vperm c07, c07, c08, PERMRSHIFT2 vperm c08, c08, VZERO, PERMRSHIFT2 vmaddfp c00, alpha, c00, C1 vmaddfp c05, alpha, c05, C6 lvx C6, OFFSET_1, CO4 vmaddfp c06, alpha, c06, C7 lvx C7, OFFSET_2, CO4 vmaddfp c07, alpha, c07, C8 lvx C8, OFFSET_3, CO4 vmaddfp c08, alpha, c08, C9 lvx C9, OFFSET_4, CO4 stvx c00, OFFSET_0, CO2 stvx c05, OFFSET_1, CO2 stvx c06, OFFSET_2, CO2 stvx c07, OFFSET_3, CO2 stvx c08, OFFSET_4, CO2 lvx C1, OFFSET_0, CO3 vperm c00, VZERO, c09, PERMRSHIFT3 vperm c09, c09, c10, PERMRSHIFT3 vperm c10, c10, c11, PERMRSHIFT3 vperm c11, c11, c12, PERMRSHIFT3 vperm c12, c12, VZERO, PERMRSHIFT3 vmaddfp c00, alpha, c00, C1 vmaddfp c09, alpha, c09, C2 vmaddfp c10, alpha, c10, C3 vmaddfp c11, alpha, c11, C4 vmaddfp c12, alpha, c12, C5 stvx c00, OFFSET_0, CO3 stvx c09, OFFSET_1, CO3 stvx c10, OFFSET_2, CO3 stvx c11, OFFSET_3, CO3 stvx c12, OFFSET_4, CO3 lvx C1, OFFSET_0, CO4 vperm c00, VZERO, c13, PERMRSHIFT4 vperm c13, c13, c14, PERMRSHIFT4 vperm c14, c14, c15, PERMRSHIFT4 vperm c15, c15, c16, PERMRSHIFT4 vperm c16, c16, VZERO, PERMRSHIFT4 vmaddfp c00, alpha, c00, C1 vmaddfp c13, alpha, c13, C6 vmaddfp c14, alpha, c14, C7 vmaddfp c15, alpha, c15, C8 vmaddfp c16, alpha, c16, C9 stvx c00, OFFSET_0, CO4 stvx c13, OFFSET_1, CO4 stvx c14, OFFSET_2, CO4 stvx c15, OFFSET_3, CO4 stvx c16, OFFSET_4, CO4 addi CO1, CO1, 16 * SIZE addi CO2, CO2, 16 * SIZE addi CO3, CO3, 16 * SIZE addi CO4, CO4, 16 * SIZE addic. I, I, -1 bgt+ LL(11) .align 4 LL(20): andi. I, M, 8 ble LL(30) vxor c01, c01, c01 LOAD_A a1, OFFSET_0, AO vxor c02, c02, c02 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_A a3, OFFSET_2, AO vxor c06, c06, c06 LOAD_A a4, OFFSET_3, AO vxor c09, c09, c09 LOAD_B b1, OFFSET_0, B vxor c10, c10, c10 LOAD_B b2, OFFSET_1, B vxor c13, c13, c13 vxor c14, c14, c14 mr BO, B vspltw bp1, b1, 0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(25) .align 4 LL(22): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 addi AO, AO, 16 * SIZE vmaddfp c02, a2, bp1, c02 addi BO, BO, 8 * SIZE vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 LOAD_B b1, OFFSET_0, BO vmaddfp c10, a2, bp1, c10 vmaddfp c13, a1, bp2, c13 LOAD_A a1, OFFSET_0, AO vspltw bp1, b2, 0 vmaddfp c14, a2, bp2, c14 LOAD_A a2, OFFSET_1, AO vmaddfp c01, a3, bp1, c01 vspltw bp2, b2, 1 vmaddfp c02, a4, bp1, c02 vmaddfp c05, a3, bp2, c05 vspltw bp1, b2, 2 vmaddfp c06, a4, bp2, c06 vmaddfp c09, a3, bp1, c09 vspltw bp2, b2, 3 LOAD_B b2, OFFSET_1, BO vmaddfp c10, a4, bp1, c10 vmaddfp c13, a3, bp2, c13 LOAD_A a3, OFFSET_2, AO vmaddfp c14, a4, bp2, c14 LOAD_A a4, OFFSET_3, AO vspltw bp1, b1, 0 bdnz LL(22) .align 4 LL(25): andi. 
r0, K, 1 lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO ble+ LL(28) .align 4 LL(26): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 nop vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 nop vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 addi AO, AO, 8 * SIZE vmaddfp c13, a1, bp2, c13 addi BO, BO, 4 * SIZE vmaddfp c14, a2, bp2, c14 nop .align 4 LL(28): lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 lvsr PERMRSHIFT3, 0, CO3 lvsr PERMRSHIFT4, 0, CO4 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 vmaddfp c02, alpha, c02, C3 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 lvx C3, OFFSET_2, CO2 vperm c00, VZERO, c05, PERMRSHIFT2 vperm c05, c05, c06, PERMRSHIFT2 vperm c06, c06, VZERO, PERMRSHIFT2 vmaddfp c00, alpha, c00, C1 vmaddfp c05, alpha, c05, C2 vmaddfp c06, alpha, c06, C3 stvx c00, OFFSET_0, CO2 stvx c05, OFFSET_1, CO2 stvx c06, OFFSET_2, CO2 lvx C1, OFFSET_0, CO3 lvx C2, OFFSET_1, CO3 lvx C3, OFFSET_2, CO3 vperm c00, VZERO, c09, PERMRSHIFT3 vperm c09, c09, c10, PERMRSHIFT3 vperm c10, c10, VZERO, PERMRSHIFT3 vmaddfp c00, alpha, c00, C1 vmaddfp c09, alpha, c09, C2 vmaddfp c10, alpha, c10, C3 stvx c00, OFFSET_0, CO3 stvx c09, OFFSET_1, CO3 stvx c10, OFFSET_2, CO3 lvx C1, OFFSET_0, CO4 lvx C2, OFFSET_1, CO4 lvx C3, OFFSET_2, CO4 vperm c00, VZERO, c13, PERMRSHIFT4 vperm c13, c13, c14, PERMRSHIFT4 vperm c14, c14, VZERO, PERMRSHIFT4 vmaddfp c00, alpha, c00, C1 vmaddfp c13, alpha, c13, C2 vmaddfp c14, alpha, c14, C3 stvx c00, OFFSET_0, CO4 stvx c13, OFFSET_1, CO4 stvx c14, OFFSET_2, CO4 addi CO1, CO1, 8 * SIZE addi CO2, CO2, 8 * SIZE addi CO3, CO3, 8 * SIZE addi CO4, CO4, 8 * SIZE .align 4 LL(30): andi. I, M, 4 ble LL(40) vxor c01, c01, c01 LOAD_A a1, OFFSET_0, AO vxor c02, c02, c02 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_B b1, OFFSET_0, B vxor c06, c06, c06 LOAD_B b2, OFFSET_1, B vxor c09, c09, c09 vxor c10, c10, c10 vxor c13, c13, c13 vxor c14, c14, c14 vspltw bp1, b1, 0 mr BO, B srawi. r0, K, 1 mtspr CTR, r0 ble LL(35) .align 4 LL(32): vmaddfp c01, a1, bp1, c01 addi AO, AO, 8 * SIZE vspltw bp2, b1, 1 vmaddfp c05, a1, bp2, c05 addi BO, BO, 8 * SIZE vspltw bp1, b1, 2 vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c13, a1, bp2, c13 LOAD_A a1, OFFSET_0, AO vspltw bp1, b2, 0 LOAD_B b1, OFFSET_0, BO vmaddfp c02, a2, bp1, c02 vspltw bp2, b2, 1 vmaddfp c06, a2, bp2, c06 vspltw bp1, b2, 2 vmaddfp c10, a2, bp1, c10 vspltw bp2, b2, 3 LOAD_B b2, OFFSET_1, BO vmaddfp c14, a2, bp2, c14 LOAD_A a2, OFFSET_1, AO vspltw bp1, b1, 0 bdnz LL(32) .align 4 LL(35): andi. 
r0, K, 1 lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO ble+ LL(38) .align 4 LL(36): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c13, a1, bp2, c13 addi AO, AO, 4 * SIZE addi BO, BO, 4 * SIZE .align 4 LL(38): vaddfp c01, c01, c02 vaddfp c05, c05, c06 vaddfp c09, c09, c10 vaddfp c13, c13, c14 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 lvsr PERMRSHIFT3, 0, CO3 lvsr PERMRSHIFT4, 0, CO4 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 vperm c00, VZERO, c05, PERMRSHIFT2 vperm c05, c05, VZERO, PERMRSHIFT2 vmaddfp c00, alpha, c00, C1 vmaddfp c05, alpha, c05, C2 stvx c00, OFFSET_0, CO2 stvx c05, OFFSET_1, CO2 lvx C1, OFFSET_0, CO3 lvx C2, OFFSET_1, CO3 vperm c00, VZERO, c09, PERMRSHIFT3 vperm c09, c09, VZERO, PERMRSHIFT3 vmaddfp c00, alpha, c00, C1 vmaddfp c09, alpha, c09, C2 stvx c00, OFFSET_0, CO3 stvx c09, OFFSET_1, CO3 lvx C1, OFFSET_0, CO4 lvx C2, OFFSET_1, CO4 vperm c00, VZERO, c13, PERMRSHIFT4 vperm c13, c13, VZERO, PERMRSHIFT4 vmaddfp c00, alpha, c00, C1 vmaddfp c13, alpha, c13, C2 stvx c00, OFFSET_0, CO4 stvx c13, OFFSET_1, CO4 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE .align 4 LL(40): andi. I, M, 2 ble LL(50) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 0 * SIZE(B) LFD f11, 1 * SIZE(B) LFD f12, 2 * SIZE(B) LFD f13, 3 * SIZE(B) lfs f0, FZERO(SP) fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(45) .align 4 LL(42): FMADD f0, f8, f10, f0 FMADD f2, f8, f11, f2 FMADD f4, f8, f12, f4 FMADD f6, f8, f13, f6 FMADD f1, f9, f10, f1 FMADD f3, f9, f11, f3 FMADD f5, f9, f12, f5 FMADD f7, f9, f13, f7 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) FMADD f0, f8, f10, f0 FMADD f2, f8, f11, f2 FMADD f4, f8, f12, f4 FMADD f6, f8, f13, f6 FMADD f1, f9, f10, f1 FMADD f3, f9, f11, f3 FMADD f5, f9, f12, f5 FMADD f7, f9, f13, f7 LFD f8, 4 * SIZE(AO) LFD f9, 5 * SIZE(AO) LFD f10, 8 * SIZE(BO) LFD f11, 9 * SIZE(BO) LFD f12, 10 * SIZE(BO) LFD f13, 11 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(42) .align 4 LL(45): andi. 
r0, K, 1 ble LL(48) .align 4 LL(46): FMADD f0, f8, f10, f0 FMADD f2, f8, f11, f2 FMADD f4, f8, f12, f4 FMADD f6, f8, f13, f6 FMADD f1, f9, f10, f1 FMADD f3, f9, f11, f3 FMADD f5, f9, f12, f5 FMADD f7, f9, f13, f7 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE .align 4 LL(48): lfs f13, ALPHA(SP) LFD f8, 0 * SIZE(CO1) LFD f9, 1 * SIZE(CO1) LFD f10, 0 * SIZE(CO2) LFD f11, 1 * SIZE(CO2) FMADD f0, f0, f13, f8 FMADD f1, f1, f13, f9 FMADD f2, f2, f13, f10 FMADD f3, f3, f13, f11 LFD f8, 0 * SIZE(CO3) LFD f9, 1 * SIZE(CO3) LFD f10, 0 * SIZE(CO4) LFD f11, 1 * SIZE(CO4) FMADD f4, f4, f13, f8 FMADD f5, f5, f13, f9 FMADD f6, f6, f13, f10 FMADD f7, f7, f13, f11 STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) STFD f4, 0 * SIZE(CO3) STFD f5, 1 * SIZE(CO3) STFD f6, 0 * SIZE(CO4) STFD f7, 1 * SIZE(CO4) addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE .align 4 LL(50): andi. I, M, 1 ble LL(59) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 0 * SIZE(B) LFD f11, 1 * SIZE(B) LFD f12, 2 * SIZE(B) LFD f13, 3 * SIZE(B) lfs f0, FZERO(SP) fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(55) .align 4 LL(52): FMADD f0, f8, f10, f0 FMADD f1, f8, f11, f1 FMADD f2, f8, f12, f2 FMADD f3, f8, f13, f3 LFD f8, 2 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) FMADD f0, f9, f10, f0 FMADD f1, f9, f11, f1 FMADD f2, f9, f12, f2 FMADD f3, f9, f13, f3 LFD f9, 3 * SIZE(AO) LFD f10, 8 * SIZE(BO) LFD f11, 9 * SIZE(BO) LFD f12, 10 * SIZE(BO) LFD f13, 11 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 8 * SIZE bdnz LL(52) .align 4 LL(55): andi. r0, K, 1 ble LL(58) .align 4 LL(56): FMADD f0, f8, f10, f0 FMADD f1, f8, f11, f1 FMADD f2, f8, f12, f2 FMADD f3, f8, f13, f3 LFD f8, 2 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) addi AO, AO, 1 * SIZE addi BO, BO, 4 * SIZE .align 4 LL(58): lfs f13, ALPHA(SP) LFD f8, 0 * SIZE(CO1) LFD f9, 0 * SIZE(CO2) LFD f10, 0 * SIZE(CO3) LFD f11, 0 * SIZE(CO4) FMADD f0, f0, f13, f8 FMADD f1, f1, f13, f9 FMADD f2, f2, f13, f10 FMADD f3, f3, f13, f11 STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) STFD f2, 0 * SIZE(CO3) STFD f3, 0 * SIZE(CO4) .align 4 LL(59): mr B, BO addic. J, J, -1 bgt LL(01) .align 4 LL(60): andi. r0, N, 2 ble LL(120) mr CO1, C add CO2, C, LDC add C, CO2, LDC mr AO, A srawi. I, M, 4 ble LL(80) .align 4 LL(71): vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 vxor c03, c03, c03 LOAD_A a1, OFFSET_0, AO vxor c04, c04, c04 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_A a3, OFFSET_2, AO vxor c06, c06, c06 LOAD_A a4, OFFSET_3, AO vxor c07, c07, c07 vxor c08, c08, c08 mr BO, B dcbtst CO1, PREC dcbtst CO2, PREC vspltw bp1, b1, 0 srawi. 
r0, K, 1 mtspr CTR, r0 ble LL(75) .align 4 LL(72): LOAD_A a5, OFFSET_4, AO LOAD_A a6, OFFSET_5, AO LOAD_A a7, OFFSET_6, AO LOAD_A a8, OFFSET_7, AO vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 vmaddfp c07, a3, bp2, c07 vmaddfp c08, a4, bp2, c08 vmaddfp c01, a5, bp1, c01 vspltw bp2, b1, 3 vmaddfp c02, a6, bp1, c02 vmaddfp c03, a7, bp1, c03 vmaddfp c04, a8, bp1, c04 LOAD_B b1, OFFSET_1, BO vspltw bp1, b1, 0 vmaddfp c05, a5, bp2, c05 vmaddfp c06, a6, bp2, c06 vmaddfp c07, a7, bp2, c07 vmaddfp c08, a8, bp2, c08 addi AO, AO, 32 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO bdnz LL(72) .align 4 LL(75): andi. r0, K, 1 lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO ble+ LL(78) .align 4 LL(76): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 addi AO, AO, 16 * SIZE vmaddfp c03, a3, bp1, c03 addi BO, BO, 2 * SIZE vmaddfp c04, a4, bp1, c04 nop vmaddfp c05, a1, bp2, c05 vmaddfp c06, a2, bp2, c06 vmaddfp c07, a3, bp2, c07 vmaddfp c08, a4, bp2, c08 .align 4 LL(78): lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvx C4, OFFSET_3, CO1 lvx C5, OFFSET_4, CO1 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 lvsr PERMRSHIFT3, 0, CO3 lvsr PERMRSHIFT4, 0, CO4 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, c03, PERMRSHIFT1 vperm c03, c03, c04, PERMRSHIFT1 vperm c04, c04, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 vmaddfp c02, alpha, c02, C3 vmaddfp c03, alpha, c03, C4 vmaddfp c04, alpha, c04, C5 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 stvx c03, OFFSET_3, CO1 stvx c04, OFFSET_4, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 lvx C3, OFFSET_2, CO2 lvx C4, OFFSET_3, CO2 lvx C5, OFFSET_4, CO2 vperm c00, VZERO, c05, PERMRSHIFT2 vperm c05, c05, c06, PERMRSHIFT2 vperm c06, c06, c07, PERMRSHIFT2 vperm c07, c07, c08, PERMRSHIFT2 vperm c08, c08, VZERO, PERMRSHIFT2 vmaddfp c00, alpha, c00, C1 vmaddfp c05, alpha, c05, C2 vmaddfp c06, alpha, c06, C3 vmaddfp c07, alpha, c07, C4 vmaddfp c08, alpha, c08, C5 stvx c00, OFFSET_0, CO2 stvx c05, OFFSET_1, CO2 stvx c06, OFFSET_2, CO2 stvx c07, OFFSET_3, CO2 stvx c08, OFFSET_4, CO2 addi CO1, CO1, 16 * SIZE addi CO2, CO2, 16 * SIZE addic. I, I, -1 bgt+ LL(71) .align 4 LL(80): andi. I, M, 8 ble LL(90) vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 vxor c03, c03, c03 LOAD_A a1, OFFSET_0, AO vxor c04, c04, c04 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_A a3, OFFSET_2, AO vxor c06, c06, c06 LOAD_A a4, OFFSET_3, AO vxor c07, c07, c07 vxor c08, c08, c08 mr BO, B vspltw bp1, b1, 0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(85) .align 4 LL(82): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 vmaddfp c03, a3, bp1, c03 vspltw bp2, b1, 3 vmaddfp c04, a4, bp1, c04 LOAD_B b1, OFFSET_1, BO vspltw bp1, b1, 0 vmaddfp c07, a3, bp2, c07 vmaddfp c08, a4, bp2, c08 addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO bdnz LL(82) .align 4 LL(85): andi. 
r0, K, 1 lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO ble+ LL(88) .align 4 LL(86): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 addi AO, AO, 8 * SIZE vmaddfp c05, a1, bp2, c05 addi BO, BO, 2 * SIZE vmaddfp c06, a2, bp2, c06 .align 4 LL(88): lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 vaddfp c01, c01, c03 vaddfp c02, c02, c04 vaddfp c05, c05, c07 vaddfp c06, c06, c08 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 lvsr PERMRSHIFT3, 0, CO3 lvsr PERMRSHIFT4, 0, CO4 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 vmaddfp c02, alpha, c02, C3 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 lvx C3, OFFSET_2, CO2 vperm c00, VZERO, c05, PERMRSHIFT2 vperm c05, c05, c06, PERMRSHIFT2 vperm c06, c06, VZERO, PERMRSHIFT2 vmaddfp c00, alpha, c00, C1 vmaddfp c05, alpha, c05, C2 vmaddfp c06, alpha, c06, C3 stvx c00, OFFSET_0, CO2 stvx c05, OFFSET_1, CO2 stvx c06, OFFSET_2, CO2 addi CO1, CO1, 8 * SIZE addi CO2, CO2, 8 * SIZE .align 4 LL(90): andi. I, M, 4 ble LL(100) vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 vxor c06, c06, c06 mr BO, B vspltw bp1, b1, 0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(95) .align 4 LL(92): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c02, a2, bp1, c02 vspltw bp2, b1, 3 LOAD_B b1, OFFSET_1, BO vspltw bp1, b1, 0 vmaddfp c06, a2, bp2, c06 addi AO, AO, 8 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO bdnz LL(92) .align 4 LL(95): andi. r0, K, 1 lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO ble+ LL(98) .align 4 LL(96): vspltw bp2, b1, 1 vmaddfp c01, a1, bp1, c01 vmaddfp c05, a1, bp2, c05 addi AO, AO, 4 * SIZE addi BO, BO, 2 * SIZE .align 4 LL(98): vaddfp c01, c01, c02 vaddfp c05, c05, c06 vaddfp c09, c09, c10 vaddfp c13, c13, c14 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 lvsr PERMRSHIFT3, 0, CO3 lvsr PERMRSHIFT4, 0, CO4 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 vperm c00, VZERO, c05, PERMRSHIFT2 vperm c05, c05, VZERO, PERMRSHIFT2 vmaddfp c00, alpha, c00, C1 vmaddfp c05, alpha, c05, C2 stvx c00, OFFSET_0, CO2 stvx c05, OFFSET_1, CO2 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE .align 4 LL(100): andi. I, M, 2 ble LL(110) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 0 * SIZE(B) LFD f11, 1 * SIZE(B) LFD f12, 2 * SIZE(B) LFD f13, 3 * SIZE(B) lfs f0, FZERO(SP) fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(105) .align 4 LL(102): FMADD f0, f8, f10, f0 FMADD f1, f9, f10, f1 FMADD f2, f8, f11, f2 FMADD f3, f9, f11, f3 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) FMADD f4, f8, f12, f4 FMADD f5, f9, f12, f5 FMADD f6, f8, f13, f6 FMADD f7, f9, f13, f7 LFD f8, 4 * SIZE(AO) LFD f9, 5 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 4 * SIZE bdnz LL(102) .align 4 LL(105): andi. 
r0, K, 1 lfs f13, ALPHA(SP) ble LL(108) .align 4 LL(106): FMADD f0, f8, f10, f0 FMADD f1, f9, f10, f1 FMADD f2, f8, f11, f2 FMADD f3, f9, f11, f3 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) LFD f10, 2 * SIZE(BO) LFD f11, 3 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 2 * SIZE .align 4 LL(108): LFD f8, 0 * SIZE(CO1) LFD f9, 1 * SIZE(CO1) LFD f10, 0 * SIZE(CO2) LFD f11, 1 * SIZE(CO2) FADD f0, f0, f4 FADD f1, f1, f5 FADD f2, f2, f6 FADD f3, f3, f7 FMADD f0, f0, f13, f8 FMADD f1, f1, f13, f9 FMADD f2, f2, f13, f10 FMADD f3, f3, f13, f11 STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE .align 4 LL(110): andi. I, M, 1 ble LL(119) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 0 * SIZE(B) LFD f11, 1 * SIZE(B) LFD f12, 2 * SIZE(B) LFD f13, 3 * SIZE(B) lfs f0, FZERO(SP) fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(115) .align 4 LL(112): FMADD f0, f8, f10, f0 FMADD f1, f8, f11, f1 FMADD f2, f9, f12, f2 FMADD f3, f9, f13, f3 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE bdnz LL(112) .align 4 LL(115): andi. r0, K, 1 lfs f13, ALPHA(SP) ble LL(118) .align 4 LL(116): FMADD f0, f8, f10, f0 FMADD f1, f8, f11, f1 LFD f8, 1 * SIZE(AO) LFD f10, 2 * SIZE(BO) LFD f11, 3 * SIZE(BO) addi AO, AO, 1 * SIZE addi BO, BO, 2 * SIZE .align 4 LL(118): LFD f8, 0 * SIZE(CO1) LFD f9, 0 * SIZE(CO2) FADD f0, f0, f2 FADD f1, f1, f3 FMADD f0, f0, f13, f8 FMADD f1, f1, f13, f9 STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) .align 4 LL(119): mr B, BO .align 4 LL(120): andi. r0, N, 1 ble LL(999) mr CO1, C mr AO, A srawi. I, M, 4 ble LL(140) .align 4 LL(130): vxor c01, c01, c01 vxor c02, c02, c02 vxor c03, c03, c03 vxor c04, c04, c04 mr BO, B dcbtst CO1, PREC mr J, K andi. r0, B, 15 ble+ LL(131) LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO LOAD_B b1, OFFSET_0, BO vspltw bp1, b1, 2 vspltw bp2, b1, 3 addi AO, AO, 16 * SIZE addi BO, BO, SIZE vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 subi J, J, 1 cmpwi cr0, J, 0 ble LL(138) LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO addi AO, AO, 16 * SIZE addi BO, BO, SIZE vmaddfp c01, a1, bp2, c01 vmaddfp c02, a2, bp2, c02 vmaddfp c03, a3, bp2, c03 vmaddfp c04, a4, bp2, c04 subi J, J, 1 cmpwi cr0, J, 0 ble LL(138) .align 4 LL(131): LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO LOAD_A a5, OFFSET_4, AO LOAD_A a6, OFFSET_5, AO LOAD_A a7, OFFSET_6, AO LOAD_A a8, OFFSET_7, AO LOAD_B b1, OFFSET_0, BO srawi. 
r0, J, 2 mtspr CTR, r0 ble LL(135) .align 4 LL(133): vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 vspltw bp2, b1, 1 vmaddfp c01, a5, bp2, c01 vmaddfp c02, a6, bp2, c02 vmaddfp c03, a7, bp2, c03 vmaddfp c04, a8, bp2, c04 addi AO, AO, 32 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO vspltw bp1, b1, 2 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 LOAD_A a5, OFFSET_4, AO LOAD_A a6, OFFSET_5, AO LOAD_A a7, OFFSET_6, AO LOAD_A a8, OFFSET_7, AO vspltw bp2, b1, 3 vmaddfp c01, a5, bp2, c01 vmaddfp c02, a6, bp2, c02 vmaddfp c03, a7, bp2, c03 vmaddfp c04, a8, bp2, c04 addi AO, AO, 32 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO LOAD_A a5, OFFSET_4, AO LOAD_A a6, OFFSET_5, AO LOAD_A a7, OFFSET_6, AO LOAD_A a8, OFFSET_7, AO LOAD_B b1, OFFSET_0, BO bdnz LL(133) .align 4 LL(135): andi. r0, J, 3 ble+ LL(138) cmpwi cr0, r0, 3 bne LL(136) vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 addi AO, AO, 16 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO vspltw bp2, b1, 1 vmaddfp c01, a1, bp2, c01 vmaddfp c02, a2, bp2, c02 vmaddfp c03, a3, bp2, c03 vmaddfp c04, a4, bp2, c04 addi AO, AO, 16 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO vspltw bp1, b1, 2 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 addi AO, AO, 16 * SIZE addi BO, BO, 3 * SIZE b LL(138) .align 4 LL(136): cmpwi cr0, r0, 2 bne LL(137) vspltw bp1, b1, 0 vspltw bp2, b1, 1 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 LOAD_A a1, OFFSET_4, AO LOAD_A a2, OFFSET_5, AO LOAD_A a3, OFFSET_6, AO LOAD_A a4, OFFSET_7, AO vmaddfp c01, a1, bp2, c01 vmaddfp c02, a2, bp2, c02 vmaddfp c03, a3, bp2, c03 vmaddfp c04, a4, bp2, c04 addi AO, AO, 32 * SIZE addi BO, BO, 2 * SIZE b LL(138) .align 4 LL(137): cmpwi cr0, r0, 1 bne LL(138) vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 addi AO, AO, 16 * SIZE addi BO, BO, 1 * SIZE .align 4 LL(138): lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvx C4, OFFSET_3, CO1 lvx C5, OFFSET_4, CO1 lvsr PERMRSHIFT1, 0, CO1 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, c03, PERMRSHIFT1 vperm c03, c03, c04, PERMRSHIFT1 vperm c04, c04, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 vmaddfp c02, alpha, c02, C3 vmaddfp c03, alpha, c03, C4 vmaddfp c04, alpha, c04, C5 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 stvx c03, OFFSET_3, CO1 stvx c04, OFFSET_4, CO1 addi CO1, CO1, 16 * SIZE addic. I, I, -1 bgt+ LL(130) .align 4 LL(140): andi. I, M, 8 ble LL(150) vxor c01, c01, c01 vxor c02, c02, c02 mr BO, B mr J, K andi. 
r0, B, 15 ble+ LL(141) LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_B b1, OFFSET_0, BO vspltw bp1, b1, 2 vspltw bp2, b1, 3 addi AO, AO, 8 * SIZE addi BO, BO, SIZE vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 subi J, J, 1 cmpwi cr0, J, 0 ble LL(148) LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO addi AO, AO, 8 * SIZE addi BO, BO, SIZE vmaddfp c01, a1, bp2, c01 vmaddfp c02, a2, bp2, c02 subi J, J, 1 cmpwi cr0, J, 0 ble LL(148) .align 4 LL(141): LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO LOAD_A a5, OFFSET_4, AO LOAD_A a6, OFFSET_5, AO LOAD_A a7, OFFSET_6, AO LOAD_A a8, OFFSET_7, AO LOAD_B b1, OFFSET_0, BO srawi. r0, J, 2 mtspr CTR, r0 ble LL(145) .align 4 LL(143): vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vspltw bp2, b1, 1 vmaddfp c01, a3, bp2, c01 vmaddfp c02, a4, bp2, c02 vspltw bp1, b1, 2 vmaddfp c01, a5, bp1, c01 vmaddfp c02, a6, bp1, c02 vspltw bp2, b1, 3 vmaddfp c01, a7, bp2, c01 vmaddfp c02, a8, bp2, c02 addi AO, AO, 32 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO LOAD_A a5, OFFSET_4, AO LOAD_A a6, OFFSET_5, AO LOAD_A a7, OFFSET_6, AO LOAD_A a8, OFFSET_7, AO LOAD_B b1, OFFSET_0, BO bdnz LL(143) .align 4 LL(145): andi. r0, J, 3 ble+ LL(148) cmpwi cr0, r0, 3 bne LL(146) vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vspltw bp2, b1, 1 vmaddfp c01, a3, bp2, c01 vmaddfp c02, a4, bp2, c02 LOAD_A a1, OFFSET_4, AO LOAD_A a2, OFFSET_5, AO vspltw bp1, b1, 2 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 addi AO, AO, 24 * SIZE addi BO, BO, 3 * SIZE b LL(148) .align 4 LL(146): cmpwi cr0, r0, 2 bne LL(147) vspltw bp1, b1, 0 vspltw bp2, b1, 1 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c01, a3, bp2, c01 vmaddfp c02, a4, bp2, c02 addi AO, AO, 16 * SIZE addi BO, BO, 2 * SIZE b LL(148) .align 4 LL(147): cmpwi cr0, r0, 1 bne LL(148) vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 addi AO, AO, 8 * SIZE addi BO, BO, 1 * SIZE .align 4 LL(148): lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvsr PERMRSHIFT1, 0, CO1 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 vmaddfp c02, alpha, c02, C3 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 addi CO1, CO1, 8 * SIZE .align 4 LL(150): andi. I, M, 4 ble LL(160) vxor c01, c01, c01 mr BO, B mr J, K andi. r0, B, 15 ble+ LL(151) LOAD_A a1, OFFSET_0, AO LOAD_B b1, OFFSET_0, BO vspltw bp1, b1, 2 vspltw bp2, b1, 3 addi AO, AO, 4 * SIZE addi BO, BO, SIZE vmaddfp c01, a1, bp1, c01 subi J, J, 1 cmpwi cr0, J, 0 ble LL(158) LOAD_A a1, OFFSET_0, AO addi AO, AO, 4 * SIZE addi BO, BO, SIZE vmaddfp c01, a1, bp2, c01 subi J, J, 1 cmpwi cr0, J, 0 ble LL(158) .align 4 LL(151): LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO LOAD_B b1, OFFSET_0, BO srawi. r0, J, 2 mtspr CTR, r0 ble LL(155) .align 4 LL(153): vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c01, a2, bp2, c01 vspltw bp1, b1, 2 vmaddfp c01, a3, bp1, c01 vspltw bp2, b1, 3 vmaddfp c01, a4, bp2, c01 addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO LOAD_B b1, OFFSET_0, BO bdnz LL(153) .align 4 LL(155): andi. 
r0, J, 3 ble+ LL(158) cmpwi cr0, r0, 3 bne LL(156) vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c01, a2, bp2, c01 vspltw bp1, b1, 2 vmaddfp c01, a3, bp1, c01 addi AO, AO, 12 * SIZE addi BO, BO, 3 * SIZE b LL(158) .align 4 LL(156): cmpwi cr0, r0, 2 bne LL(157) vspltw bp1, b1, 0 vspltw bp2, b1, 1 vmaddfp c01, a1, bp1, c01 vmaddfp c01, a2, bp2, c01 addi AO, AO, 8 * SIZE addi BO, BO, 2 * SIZE b LL(158) .align 4 LL(157): cmpwi cr0, r0, 1 bne LL(158) vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 addi AO, AO, 4 * SIZE addi BO, BO, 1 * SIZE .align 4 LL(158): lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvsr PERMRSHIFT1, 0, CO1 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 addi CO1, CO1, 4 * SIZE .align 4 LL(160): andi. I, M, 2 ble LL(170) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 2 * SIZE(AO) LFD f11, 3 * SIZE(AO) LFD f12, 0 * SIZE(B) LFD f13, 1 * SIZE(B) lfs f0, FZERO(SP) fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(165) .align 4 LL(162): FMADD f0, f8, f12, f0 FMADD f1, f9, f12, f1 FMADD f2, f10, f13, f2 FMADD f3, f11, f13, f3 LFD f8, 4 * SIZE(AO) LFD f9, 5 * SIZE(AO) LFD f10, 6 * SIZE(AO) LFD f11, 7 * SIZE(AO) LFD f12, 2 * SIZE(BO) LFD f13, 3 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 2 * SIZE bdnz LL(162) .align 4 LL(165): andi. r0, K, 1 lfs f13, ALPHA(SP) ble LL(168) .align 4 LL(166): FMADD f0, f8, f12, f0 FMADD f1, f9, f12, f1 addi AO, AO, 2 * SIZE addi BO, BO, 1 * SIZE .align 4 LL(168): LFD f8, 0 * SIZE(CO1) LFD f9, 1 * SIZE(CO1) FADD f0, f0, f2 FADD f1, f1, f3 FMADD f0, f0, f13, f8 FMADD f1, f1, f13, f9 STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) addi CO1, CO1, 2 * SIZE .align 4 LL(170): andi. I, M, 1 ble LL(999) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 0 * SIZE(B) LFD f11, 1 * SIZE(B) lfs f0, FZERO(SP) fmr f1, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(175) .align 4 LL(172): FMADD f0, f8, f10, f0 FMADD f1, f9, f11, f1 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) LFD f10, 2 * SIZE(BO) LFD f11, 3 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 2 * SIZE bdnz LL(172) .align 4 LL(175): andi. 
r0, K, 1 lfs f13, ALPHA(SP) ble LL(178) .align 4 LL(176): FMADD f0, f8, f10, f0 addi AO, AO, 1 * SIZE addi BO, BO, 1 * SIZE .align 4 LL(178): LFD f8, 0 * SIZE(CO1) FADD f0, f0, f1 FMADD f0, f0, f13, f8 STFD f0, 0 * SIZE(CO1) .align 4 LL(999): mr SP, STACK li r0, 0 * 16 lvx v20, SP, r0 li r0, 1 * 16 lvx v21, SP, r0 li r0, 2 * 16 lvx v22, SP, r0 li r0, 3 * 16 lvx v23, SP, r0 li r0, 4 * 16 lvx v24, SP, r0 li r0, 5 * 16 lvx v25, SP, r0 li r0, 6 * 16 lvx v26, SP, r0 li r0, 7 * 16 lvx v27, SP, r0 li r0, 8 * 16 lvx v28, SP, r0 li r0, 9 * 16 lvx v29, SP, r0 li r0, 10 * 16 lvx v30, SP, r0 li r0, 11 * 16 lvx v31, SP, r0 mtspr VRsave, VREG #ifdef __64BIT__ ld r31, 192(SP) ld r30, 200(SP) ld r29, 208(SP) ld r28, 216(SP) ld r27, 224(SP) ld r26, 232(SP) ld r25, 240(SP) ld r24, 248(SP) ld r23, 256(SP) ld r22, 264(SP) ld r21, 272(SP) ld r20, 280(SP) ld r19, 288(SP) ld r18, 296(SP) ld r17, 304(SP) ld r16, 312(SP) ld r15, 320(SP) ld r14, 328(SP) #else lwz r31, 192(SP) lwz r30, 196(SP) lwz r29, 200(SP) lwz r28, 204(SP) lwz r27, 208(SP) lwz r26, 212(SP) lwz r25, 216(SP) lwz r24, 220(SP) lwz r23, 224(SP) lwz r22, 228(SP) lwz r21, 232(SP) lwz r20, 236(SP) lwz r19, 240(SP) lwz r18, 244(SP) lwz r17, 248(SP) lwz r16, 252(SP) lwz r15, 256(SP) lwz r14, 260(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/gemm_kernel_altivec_cell.S000066400000000000000000001376271313527062700227170ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 360 #else #define STACKSIZE 272 #endif #define ALPHA 0 #define FZERO 16 #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #endif #endif #define STACK r11 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define CO3 r27 #define CO4 r28 #define PREA r29 #define PREB r29 #define PREC r30 #define VREG r31 #define LOAD_A lvx #define LOAD_B lvx #define OFFSET_0 0 #define OFFSET_1 r14 #define OFFSET_2 r15 #define OFFSET_3 r16 #define OFFSET_4 r17 #define OFFSET_5 r18 #define OFFSET_6 r19 #define OFFSET_7 r20 #define c01 v0 #define c02 v1 #define c03 v2 #define c04 v3 #define c05 v4 #define c06 v5 #define c07 v6 #define c08 v7 #define c09 v8 #define c10 v9 #define c11 v10 #define c12 v11 #define c13 v12 #define c14 v13 #define c15 v14 #define c16 v15 #define a1 v16 #define a2 v17 #define a3 v18 #define a4 v19 #define a5 v20 #define a6 v21 #define a7 v22 #define a8 v23 #define b1 v24 #define b2 v25 #define bp1 v26 #define bp2 v27 #define C1 v16 #define C2 v17 #define C3 v18 #define C4 v19 #define C5 v20 #define C6 v21 #define C7 v22 #define C8 v23 #define C9 v24 #define c00 v25 #define PERMRSHIFT1 v26 #define PERMRSHIFT2 v27 #define PERMRSHIFT3 v28 #define PERMRSHIFT4 v29 #define VZERO v30 #define alpha v31 #ifndef NEEDPARAM #ifndef DOUBLE #include "../sparam.h" #else #include "../dparam.h" #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE mr STACK, SP li r0, 0 * 16 stvx v20, SP, r0 li r0, 1 * 16 stvx v21, SP, r0 li r0, 2 * 16 stvx v22, SP, r0 li r0, 3 * 16 stvx v23, SP, r0 li r0, 4 * 16 stvx v24, SP, r0 li r0, 5 * 16 stvx v25, SP, r0 li r0, 6 * 16 stvx v26, SP, r0 li r0, 7 * 16 stvx v27, SP, r0 li r0, 8 * 16 stvx v28, SP, r0 li r0, 9 * 16 stvx v29, SP, r0 li r0, 10 * 16 stvx v30, SP, r0 li r0, 11 * 16 stvx v31, SP, r0 #ifdef __64BIT__ std r31, 192(SP) std r30, 200(SP) std r29, 208(SP) std r28, 216(SP) std r27, 224(SP) std r26, 232(SP) std r25, 240(SP) std r24, 248(SP) std r23, 256(SP) std r22, 264(SP) std r21, 272(SP) std r20, 280(SP) std r19, 288(SP) std r18, 296(SP) std r17, 304(SP) std r16, 312(SP) std r15, 320(SP) std r14, 328(SP) #else stw r31, 192(SP) stw r30, 196(SP) stw r29, 200(SP) stw r28, 204(SP) stw r27, 208(SP) stw r26, 212(SP) stw r25, 216(SP) stw r24, 220(SP) stw r23, 224(SP) stw r22, 228(SP) stw r21, 232(SP) stw r20, 236(SP) stw r19, 240(SP) stw r18, 244(SP) stw r17, 248(SP) stw r16, 252(SP) stw r15, 256(SP) stw r14, 260(SP) #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif li r0, -1 mfspr VREG, VRsave mtspr VRsave, r0 addi SP, SP, -128 li r0, -128 and SP, SP, r0 li OFFSET_1, 4 * SIZE li OFFSET_2, 8 * SIZE li OFFSET_3, 12 * SIZE li OFFSET_4, 16 * SIZE li OFFSET_5, 20 * SIZE li OFFSET_6, 24 * SIZE li OFFSET_7, 28 * SIZE stfs f1, ALPHA + 0(SP) stfs f1, ALPHA + 4(SP) stfs f1, ALPHA + 8(SP) stfs f1, ALPHA + 12(SP) li r29, 0 stw r29, FZERO(SP) slwi LDC, LDC, BASE_SHIFT li PREC, (15 * SIZE) #ifdef CELL li 
PREB, (5 * 32 * SIZE) #else li PREB, (5 * 32 * SIZE) #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) srawi. J, N, 2 ble LL(60) .align 4 LL(01): mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC add C, CO4, LDC mr AO, A srawi. I, M, 4 ble LL(20) .align 4 LL(11): vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 LOAD_A a1, OFFSET_0, AO vxor c03, c03, c03 LOAD_A a2, OFFSET_1, AO vxor c04, c04, c04 LOAD_A a3, OFFSET_2, AO vxor c05, c05, c05 vxor c06, c06, c06 vxor c07, c07, c07 vxor c08, c08, c08 vxor c09, c09, c09 dcbtst CO1, PREC vxor c10, c10, c10 dcbtst CO2, PREC vxor c11, c11, c11 dcbtst CO3, PREC vxor c12, c12, c12 dcbtst CO4, PREC vxor c13, c13, c13 mr BO, B vxor c14, c14, c14 srawi. r0, K, 2 vxor c15, c15, c15 mtspr CTR, r0 vxor c16, c16, c16 vspltw bp1, b1, 0 ble LL(13) .align 4 #define NOP1 mr r3, r3 #define NOP2 mr r4, r4 LL(12): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 LOAD_A a4, OFFSET_3, AO vmaddfp c03, a3, bp1, c03 dcbt AO, PREA vmaddfp c04, a4, bp1, c04 NOP2 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 NOP2 vmaddfp c07, a3, bp2, c07 NOP1 vmaddfp c08, a4, bp2, c08 dcbt BO, PREB vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 LOAD_B b2, OFFSET_1, BO vmaddfp c11, a3, bp1, c11 addi BO, BO, 8 * SIZE vmaddfp c12, a4, bp1, c12 NOP1 vmaddfp c13, a1, bp2, c13 vspltw bp1, b2, 0 vmaddfp c14, a2, bp2, c14 LOAD_A a5, OFFSET_4, AO vmaddfp c15, a3, bp2, c15 LOAD_A a6, OFFSET_5, AO vmaddfp c16, a4, bp2, c16 vspltw bp2, b2, 1 vmaddfp c01, a5, bp1, c01 LOAD_A a7, OFFSET_6, AO vmaddfp c02, a6, bp1, c02 LOAD_A a8, OFFSET_7, AO vmaddfp c03, a7, bp1, c03 NOP1 vmaddfp c04, a8, bp1, c04 NOP2 vmaddfp c05, a5, bp2, c05 vspltw bp1, b2, 2 vmaddfp c06, a6, bp2, c06 addi AO, AO, 32 * SIZE vmaddfp c07, a7, bp2, c07 LOAD_B b1, OFFSET_0, BO vmaddfp c08, a8, bp2, c08 NOP1 vmaddfp c09, a5, bp1, c09 vspltw bp2, b2, 3 vmaddfp c10, a6, bp1, c10 NOP2 vmaddfp c11, a7, bp1, c11 NOP1 vmaddfp c12, a8, bp1, c12 dcbt AO, PREA vmaddfp c13, a5, bp2, c13 vspltw bp1, b1, 0 vmaddfp c14, a6, bp2, c14 LOAD_A a1, OFFSET_0, AO // vmaddfp c15, a7, bp2, c15 LOAD_A a2, OFFSET_1, AO vmaddfp c16, a8, bp2, c16 vspltw bp2, b1, 1 vmaddfp c01, a1, bp1, c01 LOAD_A a3, OFFSET_2, AO vmaddfp c02, a2, bp1, c02 LOAD_A a4, OFFSET_3, AO vmaddfp c03, a3, bp1, c03 NOP1 vmaddfp c04, a4, bp1, c04 NOP2 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 NOP2 vmaddfp c07, a3, bp2, c07 NOP1 vmaddfp c08, a4, bp2, c08 LOAD_B b2, OFFSET_1, BO vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 NOP2 vmaddfp c11, a3, bp1, c11 NOP1 vmaddfp c12, a4, bp1, c12 addi BO, BO, 8 * SIZE vmaddfp c13, a1, bp2, c13 vspltw bp1, b2, 0 vmaddfp c14, a2, bp2, c14 LOAD_A a5, OFFSET_4, AO vmaddfp c15, a3, bp2, c15 LOAD_A a6, OFFSET_5, AO vmaddfp c16, a4, bp2, c16 vspltw bp2, b2, 1 vmaddfp c01, a5, bp1, c01 LOAD_A a7, OFFSET_6, AO vmaddfp c02, a6, bp1, c02 LOAD_A a8, OFFSET_7, AO vmaddfp c03, a7, bp1, c03 addi AO, AO, 32 * SIZE vmaddfp c04, a8, bp1, c04 NOP2 vmaddfp c05, a5, bp2, c05 vspltw bp1, b2, 2 vmaddfp c06, a6, bp2, c06 NOP2 vmaddfp c07, a7, bp2, c07 NOP1 vmaddfp c08, a8, bp2, c08 LOAD_B b1, OFFSET_0, BO vmaddfp c09, a5, bp1, c09 vspltw bp2, b2, 3 vmaddfp c10, a6, bp1, c10 LOAD_A a1, OFFSET_0, AO // vmaddfp c11, a7, bp1, c11 NOP2 vmaddfp c12, a8, bp1, c12 vspltw bp1, b1, 0 vmaddfp c13, a5, bp2, c13 LOAD_A a2, OFFSET_1, AO vmaddfp c14, a6, bp2, c14 LOAD_A a3, OFFSET_2, AO vmaddfp c15, a7, bp2, c15 
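/* Descriptive note on the enclosing LL(12) loop (added annotation, inferred  */
/* from the surrounding code): this is the main K loop of the 16x4 micro-     */
/* tile, unrolled 4x over K (CTR = K >> 2).  Accumulators c01-c16 hold a      */
/* 16-row by 4-column block of C as 4-float vectors; vspltw broadcasts one    */
/* element of B from b1/b2 and vmaddfp accumulates A*B into them.  After the  */
/* loop, results are scaled by alpha and merged into (possibly unaligned) C   */
/* via lvsr/vperm before the stvx stores.                                     */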
NOP1 vmaddfp c16, a8, bp2, c16 bdnz+ LL(12) .align 4 LL(13): andi. r0, K, 2 nop nop ble+ LL(15) .align 4 vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 LOAD_A a4, OFFSET_3, AO vmaddfp c03, a3, bp1, c03 NOP1 vmaddfp c04, a4, bp1, c04 NOP2 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 NOP2 vmaddfp c07, a3, bp2, c07 NOP1 vmaddfp c08, a4, bp2, c08 LOAD_B b2, OFFSET_1, BO vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 LOAD_A a5, OFFSET_4, AO vmaddfp c11, a3, bp1, c11 LOAD_A a6, OFFSET_5, AO vmaddfp c12, a4, bp1, c12 addi BO, BO, 8 * SIZE vmaddfp c13, a1, bp2, c13 vspltw bp1, b2, 0 vmaddfp c14, a2, bp2, c14 LOAD_A a7, OFFSET_6, AO vmaddfp c15, a3, bp2, c15 LOAD_A a8, OFFSET_7, AO vmaddfp c16, a4, bp2, c16 addi AO, AO, 32 * SIZE vmaddfp c01, a5, bp1, c01 vspltw bp2, b2, 1 vmaddfp c02, a6, bp1, c02 NOP2 vmaddfp c03, a7, bp1, c03 NOP1 vmaddfp c04, a8, bp1, c04 NOP2 vmaddfp c05, a5, bp2, c05 vspltw bp1, b2, 2 vmaddfp c06, a6, bp2, c06 NOP2 vmaddfp c07, a7, bp2, c07 NOP1 vmaddfp c08, a8, bp2, c08 LOAD_B b1, OFFSET_0, BO vmaddfp c09, a5, bp1, c09 vspltw bp2, b2, 3 vmaddfp c10, a6, bp1, c10 LOAD_A a1, OFFSET_0, AO vmaddfp c11, a7, bp1, c11 LOAD_A a2, OFFSET_1, AO vmaddfp c12, a8, bp1, c12 NOP2 vmaddfp c13, a5, bp2, c13 vspltw bp1, b1, 0 vmaddfp c14, a6, bp2, c14 LOAD_A a3, OFFSET_2, AO vmaddfp c15, a7, bp2, c15 vmaddfp c16, a8, bp2, c16 .align 4 LL(15): andi. r0, K, 1 lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO ble+ LL(18) .align 4 vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 LOAD_A a4, OFFSET_3, AO vmaddfp c03, a3, bp1, c03 nop vmaddfp c04, a4, bp1, c04 nop vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 nop vmaddfp c07, a3, bp2, c07 nop vmaddfp c08, a4, bp2, c08 nop vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 addi AO, AO, 16 * SIZE vmaddfp c11, a3, bp1, c11 addi BO, BO, 4 * SIZE vmaddfp c12, a4, bp1, c12 nop vmaddfp c13, a1, bp2, c13 vmaddfp c14, a2, bp2, c14 vmaddfp c15, a3, bp2, c15 vmaddfp c16, a4, bp2, c16 .align 4 LL(18): lvx C1, OFFSET_0, CO1 cmpwi cr0, LDC, 32 * SIZE lvx C2, OFFSET_1, CO1 lvsr PERMRSHIFT1, 0, CO1 lvx C3, OFFSET_2, CO1 lvsr PERMRSHIFT2, 0, CO2 lvx C4, OFFSET_3, CO1 lvsr PERMRSHIFT3, 0, CO3 lvx C5, OFFSET_4, CO1 lvsr PERMRSHIFT4, 0, CO4 ble LL(19) vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, c03, PERMRSHIFT1 vperm c03, c03, c04, PERMRSHIFT1 vperm c04, c04, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 lvx C1, OFFSET_0, CO2 vmaddfp c01, alpha, c01, C2 lvx C6, OFFSET_1, CO2 vmaddfp c02, alpha, c02, C3 lvx C7, OFFSET_2, CO2 vmaddfp c03, alpha, c03, C4 lvx C8, OFFSET_3, CO2 vmaddfp c04, alpha, c04, C5 lvx C9, OFFSET_4, CO2 stvx c00, OFFSET_0, CO1 vperm c00, VZERO, c05, PERMRSHIFT2 stvx c01, OFFSET_1, CO1 vperm c05, c05, c06, PERMRSHIFT2 stvx c02, OFFSET_2, CO1 vperm c06, c06, c07, PERMRSHIFT2 stvx c03, OFFSET_3, CO1 vperm c07, c07, c08, PERMRSHIFT2 stvx c04, OFFSET_4, CO1 vperm c08, c08, VZERO, PERMRSHIFT2 vmaddfp c00, alpha, c00, C1 lvx C1, OFFSET_0, CO3 vmaddfp c05, alpha, c05, C6 lvx C2, OFFSET_1, CO3 vmaddfp c06, alpha, c06, C7 lvx C3, OFFSET_2, CO3 vmaddfp c07, alpha, c07, C8 lvx C4, OFFSET_3, CO3 vmaddfp c08, alpha, c08, C9 lvx C5, OFFSET_4, CO3 stvx c00, OFFSET_0, CO2 vperm c00, VZERO, c09, PERMRSHIFT3 stvx c05, OFFSET_1, CO2 vperm c09, c09, c10, PERMRSHIFT3 stvx c06, OFFSET_2, CO2 vperm c10, c10, c11, PERMRSHIFT3 stvx c07, OFFSET_3, CO2 vperm c11, c11, c12, PERMRSHIFT3 stvx c08, OFFSET_4, 
CO2 vperm c12, c12, VZERO, PERMRSHIFT3 vmaddfp c00, alpha, c00, C1 lvx C9, OFFSET_4, CO4 vmaddfp c09, alpha, c09, C2 lvx C1, OFFSET_0, CO4 vmaddfp c10, alpha, c10, C3 lvx C6, OFFSET_1, CO4 vmaddfp c11, alpha, c11, C4 lvx C7, OFFSET_2, CO4 vmaddfp c12, alpha, c12, C5 lvx C8, OFFSET_3, CO4 stvx c00, OFFSET_0, CO3 vperm c00, VZERO, c13, PERMRSHIFT4 stvx c09, OFFSET_1, CO3 vperm c13, c13, c14, PERMRSHIFT4 stvx c10, OFFSET_2, CO3 vperm c14, c14, c15, PERMRSHIFT4 stvx c11, OFFSET_3, CO3 vperm c15, c15, c16, PERMRSHIFT4 stvx c12, OFFSET_4, CO3 vperm c16, c16, VZERO, PERMRSHIFT4 vmaddfp c00, alpha, c00, C1 vmaddfp c13, alpha, c13, C6 vmaddfp c14, alpha, c14, C7 vmaddfp c15, alpha, c15, C8 vmaddfp c16, alpha, c16, C9 stvx c00, OFFSET_0, CO4 stvx c13, OFFSET_1, CO4 stvx c14, OFFSET_2, CO4 stvx c15, OFFSET_3, CO4 stvx c16, OFFSET_4, CO4 addi CO1, CO1, 16 * SIZE addi CO2, CO2, 16 * SIZE addi CO3, CO3, 16 * SIZE addi CO4, CO4, 16 * SIZE addic. I, I, -1 bgt+ LL(11) b LL(20) .align 4 LL(19): lvx C6, OFFSET_1, CO2 lvx C7, OFFSET_2, CO2 lvx C8, OFFSET_3, CO2 lvx C9, OFFSET_4, CO2 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, c03, PERMRSHIFT1 vperm c03, c03, c04, PERMRSHIFT1 vperm c04, c04, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 lvx C2, OFFSET_1, CO3 vmaddfp c02, alpha, c02, C3 lvx C3, OFFSET_2, CO3 vmaddfp c03, alpha, c03, C4 lvx C4, OFFSET_3, CO3 vmaddfp c04, alpha, c04, C5 lvx C5, OFFSET_4, CO3 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 stvx c03, OFFSET_3, CO1 stvx c04, OFFSET_4, CO1 lvx C1, OFFSET_0, CO2 vperm c00, VZERO, c05, PERMRSHIFT2 vperm c05, c05, c06, PERMRSHIFT2 vperm c06, c06, c07, PERMRSHIFT2 vperm c07, c07, c08, PERMRSHIFT2 vperm c08, c08, VZERO, PERMRSHIFT2 vmaddfp c00, alpha, c00, C1 vmaddfp c05, alpha, c05, C6 lvx C6, OFFSET_1, CO4 vmaddfp c06, alpha, c06, C7 lvx C7, OFFSET_2, CO4 vmaddfp c07, alpha, c07, C8 lvx C8, OFFSET_3, CO4 vmaddfp c08, alpha, c08, C9 lvx C9, OFFSET_4, CO4 stvx c00, OFFSET_0, CO2 stvx c05, OFFSET_1, CO2 stvx c06, OFFSET_2, CO2 stvx c07, OFFSET_3, CO2 stvx c08, OFFSET_4, CO2 lvx C1, OFFSET_0, CO3 vperm c00, VZERO, c09, PERMRSHIFT3 vperm c09, c09, c10, PERMRSHIFT3 vperm c10, c10, c11, PERMRSHIFT3 vperm c11, c11, c12, PERMRSHIFT3 vperm c12, c12, VZERO, PERMRSHIFT3 vmaddfp c00, alpha, c00, C1 vmaddfp c09, alpha, c09, C2 vmaddfp c10, alpha, c10, C3 vmaddfp c11, alpha, c11, C4 vmaddfp c12, alpha, c12, C5 stvx c00, OFFSET_0, CO3 stvx c09, OFFSET_1, CO3 stvx c10, OFFSET_2, CO3 stvx c11, OFFSET_3, CO3 stvx c12, OFFSET_4, CO3 lvx C1, OFFSET_0, CO4 vperm c00, VZERO, c13, PERMRSHIFT4 vperm c13, c13, c14, PERMRSHIFT4 vperm c14, c14, c15, PERMRSHIFT4 vperm c15, c15, c16, PERMRSHIFT4 vperm c16, c16, VZERO, PERMRSHIFT4 vmaddfp c00, alpha, c00, C1 vmaddfp c13, alpha, c13, C6 vmaddfp c14, alpha, c14, C7 vmaddfp c15, alpha, c15, C8 vmaddfp c16, alpha, c16, C9 stvx c00, OFFSET_0, CO4 stvx c13, OFFSET_1, CO4 stvx c14, OFFSET_2, CO4 stvx c15, OFFSET_3, CO4 stvx c16, OFFSET_4, CO4 addi CO1, CO1, 16 * SIZE addi CO2, CO2, 16 * SIZE addi CO3, CO3, 16 * SIZE addi CO4, CO4, 16 * SIZE addic. I, I, -1 bgt+ LL(11) .align 4 LL(20): andi. I, M, 8 ble LL(30) vxor c01, c01, c01 LOAD_A a1, OFFSET_0, AO vxor c02, c02, c02 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_A a3, OFFSET_2, AO vxor c06, c06, c06 LOAD_A a4, OFFSET_3, AO vxor c09, c09, c09 LOAD_B b1, OFFSET_0, B vxor c10, c10, c10 LOAD_B b2, OFFSET_1, B vxor c13, c13, c13 vxor c14, c14, c14 mr BO, B vspltw bp1, b1, 0 srawi. 
r0, K, 1 mtspr CTR, r0 ble LL(25) .align 4 LL(22): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 addi AO, AO, 16 * SIZE vmaddfp c02, a2, bp1, c02 addi BO, BO, 8 * SIZE vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 LOAD_B b1, OFFSET_0, BO vmaddfp c10, a2, bp1, c10 vmaddfp c13, a1, bp2, c13 LOAD_A a1, OFFSET_0, AO vspltw bp1, b2, 0 vmaddfp c14, a2, bp2, c14 LOAD_A a2, OFFSET_1, AO vmaddfp c01, a3, bp1, c01 vspltw bp2, b2, 1 vmaddfp c02, a4, bp1, c02 vmaddfp c05, a3, bp2, c05 vspltw bp1, b2, 2 vmaddfp c06, a4, bp2, c06 vmaddfp c09, a3, bp1, c09 vspltw bp2, b2, 3 LOAD_B b2, OFFSET_1, BO vmaddfp c10, a4, bp1, c10 vmaddfp c13, a3, bp2, c13 LOAD_A a3, OFFSET_2, AO vmaddfp c14, a4, bp2, c14 LOAD_A a4, OFFSET_3, AO vspltw bp1, b1, 0 bdnz LL(22) .align 4 LL(25): andi. r0, K, 1 lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO ble+ LL(28) .align 4 LL(26): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 nop vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 nop vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 addi AO, AO, 8 * SIZE vmaddfp c13, a1, bp2, c13 addi BO, BO, 4 * SIZE vmaddfp c14, a2, bp2, c14 nop .align 4 LL(28): lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 lvsr PERMRSHIFT3, 0, CO3 lvsr PERMRSHIFT4, 0, CO4 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 vmaddfp c02, alpha, c02, C3 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 lvx C3, OFFSET_2, CO2 vperm c00, VZERO, c05, PERMRSHIFT2 vperm c05, c05, c06, PERMRSHIFT2 vperm c06, c06, VZERO, PERMRSHIFT2 vmaddfp c00, alpha, c00, C1 vmaddfp c05, alpha, c05, C2 vmaddfp c06, alpha, c06, C3 stvx c00, OFFSET_0, CO2 stvx c05, OFFSET_1, CO2 stvx c06, OFFSET_2, CO2 lvx C1, OFFSET_0, CO3 lvx C2, OFFSET_1, CO3 lvx C3, OFFSET_2, CO3 vperm c00, VZERO, c09, PERMRSHIFT3 vperm c09, c09, c10, PERMRSHIFT3 vperm c10, c10, VZERO, PERMRSHIFT3 vmaddfp c00, alpha, c00, C1 vmaddfp c09, alpha, c09, C2 vmaddfp c10, alpha, c10, C3 stvx c00, OFFSET_0, CO3 stvx c09, OFFSET_1, CO3 stvx c10, OFFSET_2, CO3 lvx C1, OFFSET_0, CO4 lvx C2, OFFSET_1, CO4 lvx C3, OFFSET_2, CO4 vperm c00, VZERO, c13, PERMRSHIFT4 vperm c13, c13, c14, PERMRSHIFT4 vperm c14, c14, VZERO, PERMRSHIFT4 vmaddfp c00, alpha, c00, C1 vmaddfp c13, alpha, c13, C2 vmaddfp c14, alpha, c14, C3 stvx c00, OFFSET_0, CO4 stvx c13, OFFSET_1, CO4 stvx c14, OFFSET_2, CO4 addi CO1, CO1, 8 * SIZE addi CO2, CO2, 8 * SIZE addi CO3, CO3, 8 * SIZE addi CO4, CO4, 8 * SIZE .align 4 LL(30): andi. I, M, 4 ble LL(40) vxor c01, c01, c01 LOAD_A a1, OFFSET_0, AO vxor c02, c02, c02 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_B b1, OFFSET_0, B vxor c06, c06, c06 LOAD_B b2, OFFSET_1, B vxor c09, c09, c09 vxor c10, c10, c10 vxor c13, c13, c13 vxor c14, c14, c14 vspltw bp1, b1, 0 mr BO, B srawi. 
r0, K, 1 mtspr CTR, r0 ble LL(35) .align 4 LL(32): vmaddfp c01, a1, bp1, c01 addi AO, AO, 8 * SIZE vspltw bp2, b1, 1 vmaddfp c05, a1, bp2, c05 addi BO, BO, 8 * SIZE vspltw bp1, b1, 2 vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c13, a1, bp2, c13 LOAD_A a1, OFFSET_0, AO vspltw bp1, b2, 0 LOAD_B b1, OFFSET_0, BO vmaddfp c02, a2, bp1, c02 vspltw bp2, b2, 1 vmaddfp c06, a2, bp2, c06 vspltw bp1, b2, 2 vmaddfp c10, a2, bp1, c10 vspltw bp2, b2, 3 LOAD_B b2, OFFSET_1, BO vmaddfp c14, a2, bp2, c14 LOAD_A a2, OFFSET_1, AO vspltw bp1, b1, 0 bdnz LL(32) .align 4 LL(35): andi. r0, K, 1 lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO ble+ LL(38) .align 4 LL(36): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c13, a1, bp2, c13 addi AO, AO, 4 * SIZE addi BO, BO, 4 * SIZE .align 4 LL(38): vaddfp c01, c01, c02 vaddfp c05, c05, c06 vaddfp c09, c09, c10 vaddfp c13, c13, c14 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 lvsr PERMRSHIFT3, 0, CO3 lvsr PERMRSHIFT4, 0, CO4 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 vperm c00, VZERO, c05, PERMRSHIFT2 vperm c05, c05, VZERO, PERMRSHIFT2 vmaddfp c00, alpha, c00, C1 vmaddfp c05, alpha, c05, C2 stvx c00, OFFSET_0, CO2 stvx c05, OFFSET_1, CO2 lvx C1, OFFSET_0, CO3 lvx C2, OFFSET_1, CO3 vperm c00, VZERO, c09, PERMRSHIFT3 vperm c09, c09, VZERO, PERMRSHIFT3 vmaddfp c00, alpha, c00, C1 vmaddfp c09, alpha, c09, C2 stvx c00, OFFSET_0, CO3 stvx c09, OFFSET_1, CO3 lvx C1, OFFSET_0, CO4 lvx C2, OFFSET_1, CO4 vperm c00, VZERO, c13, PERMRSHIFT4 vperm c13, c13, VZERO, PERMRSHIFT4 vmaddfp c00, alpha, c00, C1 vmaddfp c13, alpha, c13, C2 stvx c00, OFFSET_0, CO4 stvx c13, OFFSET_1, CO4 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE .align 4 LL(40): andi. I, M, 2 ble LL(50) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 0 * SIZE(B) LFD f11, 1 * SIZE(B) LFD f12, 2 * SIZE(B) LFD f13, 3 * SIZE(B) lfs f0, FZERO(SP) fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(45) .align 4 LL(42): FMADD f0, f8, f10, f0 FMADD f2, f8, f11, f2 FMADD f4, f8, f12, f4 FMADD f6, f8, f13, f6 FMADD f1, f9, f10, f1 FMADD f3, f9, f11, f3 FMADD f5, f9, f12, f5 FMADD f7, f9, f13, f7 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) FMADD f0, f8, f10, f0 FMADD f2, f8, f11, f2 FMADD f4, f8, f12, f4 FMADD f6, f8, f13, f6 FMADD f1, f9, f10, f1 FMADD f3, f9, f11, f3 FMADD f5, f9, f12, f5 FMADD f7, f9, f13, f7 LFD f8, 4 * SIZE(AO) LFD f9, 5 * SIZE(AO) LFD f10, 8 * SIZE(BO) LFD f11, 9 * SIZE(BO) LFD f12, 10 * SIZE(BO) LFD f13, 11 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(42) .align 4 LL(45): andi. 
r0, K, 1 ble LL(48) .align 4 LL(46): FMADD f0, f8, f10, f0 FMADD f2, f8, f11, f2 FMADD f4, f8, f12, f4 FMADD f6, f8, f13, f6 FMADD f1, f9, f10, f1 FMADD f3, f9, f11, f3 FMADD f5, f9, f12, f5 FMADD f7, f9, f13, f7 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE .align 4 LL(48): lfs f13, ALPHA(SP) LFD f8, 0 * SIZE(CO1) LFD f9, 1 * SIZE(CO1) LFD f10, 0 * SIZE(CO2) LFD f11, 1 * SIZE(CO2) FMADD f0, f0, f13, f8 FMADD f1, f1, f13, f9 FMADD f2, f2, f13, f10 FMADD f3, f3, f13, f11 LFD f8, 0 * SIZE(CO3) LFD f9, 1 * SIZE(CO3) LFD f10, 0 * SIZE(CO4) LFD f11, 1 * SIZE(CO4) FMADD f4, f4, f13, f8 FMADD f5, f5, f13, f9 FMADD f6, f6, f13, f10 FMADD f7, f7, f13, f11 STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) STFD f4, 0 * SIZE(CO3) STFD f5, 1 * SIZE(CO3) STFD f6, 0 * SIZE(CO4) STFD f7, 1 * SIZE(CO4) addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE .align 4 LL(50): andi. I, M, 1 ble LL(59) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 0 * SIZE(B) LFD f11, 1 * SIZE(B) LFD f12, 2 * SIZE(B) LFD f13, 3 * SIZE(B) lfs f0, FZERO(SP) fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(55) .align 4 LL(52): FMADD f0, f8, f10, f0 FMADD f1, f8, f11, f1 FMADD f2, f8, f12, f2 FMADD f3, f8, f13, f3 LFD f8, 2 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) FMADD f0, f9, f10, f0 FMADD f1, f9, f11, f1 FMADD f2, f9, f12, f2 FMADD f3, f9, f13, f3 LFD f9, 3 * SIZE(AO) LFD f10, 8 * SIZE(BO) LFD f11, 9 * SIZE(BO) LFD f12, 10 * SIZE(BO) LFD f13, 11 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 8 * SIZE bdnz LL(52) .align 4 LL(55): andi. r0, K, 1 ble LL(58) .align 4 LL(56): FMADD f0, f8, f10, f0 FMADD f1, f8, f11, f1 FMADD f2, f8, f12, f2 FMADD f3, f8, f13, f3 LFD f8, 2 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) addi AO, AO, 1 * SIZE addi BO, BO, 4 * SIZE .align 4 LL(58): lfs f13, ALPHA(SP) LFD f8, 0 * SIZE(CO1) LFD f9, 0 * SIZE(CO2) LFD f10, 0 * SIZE(CO3) LFD f11, 0 * SIZE(CO4) FMADD f0, f0, f13, f8 FMADD f1, f1, f13, f9 FMADD f2, f2, f13, f10 FMADD f3, f3, f13, f11 STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) STFD f2, 0 * SIZE(CO3) STFD f3, 0 * SIZE(CO4) .align 4 LL(59): mr B, BO addic. J, J, -1 bgt LL(01) .align 4 LL(60): andi. r0, N, 2 ble LL(120) mr CO1, C add CO2, C, LDC add C, CO2, LDC mr AO, A srawi. I, M, 4 ble LL(80) .align 4 LL(71): vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 vxor c03, c03, c03 LOAD_A a1, OFFSET_0, AO vxor c04, c04, c04 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_A a3, OFFSET_2, AO vxor c06, c06, c06 LOAD_A a4, OFFSET_3, AO vxor c07, c07, c07 vxor c08, c08, c08 mr BO, B dcbtst CO1, PREC dcbtst CO2, PREC vspltw bp1, b1, 0 srawi. 
r0, K, 1 mtspr CTR, r0 ble LL(75) .align 4 LL(72): LOAD_A a5, OFFSET_4, AO LOAD_A a6, OFFSET_5, AO LOAD_A a7, OFFSET_6, AO LOAD_A a8, OFFSET_7, AO vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 vmaddfp c07, a3, bp2, c07 vmaddfp c08, a4, bp2, c08 vmaddfp c01, a5, bp1, c01 vspltw bp2, b1, 3 vmaddfp c02, a6, bp1, c02 vmaddfp c03, a7, bp1, c03 vmaddfp c04, a8, bp1, c04 LOAD_B b1, OFFSET_1, BO vspltw bp1, b1, 0 vmaddfp c05, a5, bp2, c05 vmaddfp c06, a6, bp2, c06 vmaddfp c07, a7, bp2, c07 vmaddfp c08, a8, bp2, c08 addi AO, AO, 32 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO bdnz LL(72) .align 4 LL(75): andi. r0, K, 1 lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO ble+ LL(78) .align 4 LL(76): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 addi AO, AO, 16 * SIZE vmaddfp c03, a3, bp1, c03 addi BO, BO, 2 * SIZE vmaddfp c04, a4, bp1, c04 nop vmaddfp c05, a1, bp2, c05 vmaddfp c06, a2, bp2, c06 vmaddfp c07, a3, bp2, c07 vmaddfp c08, a4, bp2, c08 .align 4 LL(78): lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvx C4, OFFSET_3, CO1 lvx C5, OFFSET_4, CO1 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 lvsr PERMRSHIFT3, 0, CO3 lvsr PERMRSHIFT4, 0, CO4 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, c03, PERMRSHIFT1 vperm c03, c03, c04, PERMRSHIFT1 vperm c04, c04, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 vmaddfp c02, alpha, c02, C3 vmaddfp c03, alpha, c03, C4 vmaddfp c04, alpha, c04, C5 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 stvx c03, OFFSET_3, CO1 stvx c04, OFFSET_4, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 lvx C3, OFFSET_2, CO2 lvx C4, OFFSET_3, CO2 lvx C5, OFFSET_4, CO2 vperm c00, VZERO, c05, PERMRSHIFT2 vperm c05, c05, c06, PERMRSHIFT2 vperm c06, c06, c07, PERMRSHIFT2 vperm c07, c07, c08, PERMRSHIFT2 vperm c08, c08, VZERO, PERMRSHIFT2 vmaddfp c00, alpha, c00, C1 vmaddfp c05, alpha, c05, C2 vmaddfp c06, alpha, c06, C3 vmaddfp c07, alpha, c07, C4 vmaddfp c08, alpha, c08, C5 stvx c00, OFFSET_0, CO2 stvx c05, OFFSET_1, CO2 stvx c06, OFFSET_2, CO2 stvx c07, OFFSET_3, CO2 stvx c08, OFFSET_4, CO2 addi CO1, CO1, 16 * SIZE addi CO2, CO2, 16 * SIZE addic. I, I, -1 bgt+ LL(71) .align 4 LL(80): andi. I, M, 8 ble LL(90) vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 vxor c03, c03, c03 LOAD_A a1, OFFSET_0, AO vxor c04, c04, c04 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_A a3, OFFSET_2, AO vxor c06, c06, c06 LOAD_A a4, OFFSET_3, AO vxor c07, c07, c07 vxor c08, c08, c08 mr BO, B vspltw bp1, b1, 0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(85) .align 4 LL(82): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 vmaddfp c03, a3, bp1, c03 vspltw bp2, b1, 3 vmaddfp c04, a4, bp1, c04 LOAD_B b1, OFFSET_1, BO vspltw bp1, b1, 0 vmaddfp c07, a3, bp2, c07 vmaddfp c08, a4, bp2, c08 addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO bdnz LL(82) .align 4 LL(85): andi. 
r0, K, 1 lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO ble+ LL(88) .align 4 LL(86): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 addi AO, AO, 8 * SIZE vmaddfp c05, a1, bp2, c05 addi BO, BO, 2 * SIZE vmaddfp c06, a2, bp2, c06 .align 4 LL(88): lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 vaddfp c01, c01, c03 vaddfp c02, c02, c04 vaddfp c05, c05, c07 vaddfp c06, c06, c08 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 lvsr PERMRSHIFT3, 0, CO3 lvsr PERMRSHIFT4, 0, CO4 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 vmaddfp c02, alpha, c02, C3 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 lvx C3, OFFSET_2, CO2 vperm c00, VZERO, c05, PERMRSHIFT2 vperm c05, c05, c06, PERMRSHIFT2 vperm c06, c06, VZERO, PERMRSHIFT2 vmaddfp c00, alpha, c00, C1 vmaddfp c05, alpha, c05, C2 vmaddfp c06, alpha, c06, C3 stvx c00, OFFSET_0, CO2 stvx c05, OFFSET_1, CO2 stvx c06, OFFSET_2, CO2 addi CO1, CO1, 8 * SIZE addi CO2, CO2, 8 * SIZE .align 4 LL(90): andi. I, M, 4 ble LL(100) vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 vxor c06, c06, c06 mr BO, B vspltw bp1, b1, 0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(95) .align 4 LL(92): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c02, a2, bp1, c02 vspltw bp2, b1, 3 LOAD_B b1, OFFSET_1, BO vspltw bp1, b1, 0 vmaddfp c06, a2, bp2, c06 addi AO, AO, 8 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO bdnz LL(92) .align 4 LL(95): andi. r0, K, 1 lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO ble+ LL(98) .align 4 LL(96): vspltw bp2, b1, 1 vmaddfp c01, a1, bp1, c01 vmaddfp c05, a1, bp2, c05 addi AO, AO, 4 * SIZE addi BO, BO, 2 * SIZE .align 4 LL(98): vaddfp c01, c01, c02 vaddfp c05, c05, c06 vaddfp c09, c09, c10 vaddfp c13, c13, c14 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 lvsr PERMRSHIFT3, 0, CO3 lvsr PERMRSHIFT4, 0, CO4 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 vperm c00, VZERO, c05, PERMRSHIFT2 vperm c05, c05, VZERO, PERMRSHIFT2 vmaddfp c00, alpha, c00, C1 vmaddfp c05, alpha, c05, C2 stvx c00, OFFSET_0, CO2 stvx c05, OFFSET_1, CO2 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE .align 4 LL(100): andi. I, M, 2 ble LL(110) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 0 * SIZE(B) LFD f11, 1 * SIZE(B) LFD f12, 2 * SIZE(B) LFD f13, 3 * SIZE(B) lfs f0, FZERO(SP) fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(105) .align 4 LL(102): FMADD f0, f8, f10, f0 FMADD f1, f9, f10, f1 FMADD f2, f8, f11, f2 FMADD f3, f9, f11, f3 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) FMADD f4, f8, f12, f4 FMADD f5, f9, f12, f5 FMADD f6, f8, f13, f6 FMADD f7, f9, f13, f7 LFD f8, 4 * SIZE(AO) LFD f9, 5 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 4 * SIZE bdnz LL(102) .align 4 LL(105): andi. 
r0, K, 1 lfs f13, ALPHA(SP) ble LL(108) .align 4 LL(106): FMADD f0, f8, f10, f0 FMADD f1, f9, f10, f1 FMADD f2, f8, f11, f2 FMADD f3, f9, f11, f3 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) LFD f10, 2 * SIZE(BO) LFD f11, 3 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 2 * SIZE .align 4 LL(108): LFD f8, 0 * SIZE(CO1) LFD f9, 1 * SIZE(CO1) LFD f10, 0 * SIZE(CO2) LFD f11, 1 * SIZE(CO2) FADD f0, f0, f4 FADD f1, f1, f5 FADD f2, f2, f6 FADD f3, f3, f7 FMADD f0, f0, f13, f8 FMADD f1, f1, f13, f9 FMADD f2, f2, f13, f10 FMADD f3, f3, f13, f11 STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE .align 4 LL(110): andi. I, M, 1 ble LL(119) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 0 * SIZE(B) LFD f11, 1 * SIZE(B) LFD f12, 2 * SIZE(B) LFD f13, 3 * SIZE(B) lfs f0, FZERO(SP) fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(115) .align 4 LL(112): FMADD f0, f8, f10, f0 FMADD f1, f8, f11, f1 FMADD f2, f9, f12, f2 FMADD f3, f9, f13, f3 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE bdnz LL(112) .align 4 LL(115): andi. r0, K, 1 lfs f13, ALPHA(SP) ble LL(118) .align 4 LL(116): FMADD f0, f8, f10, f0 FMADD f1, f8, f11, f1 LFD f8, 1 * SIZE(AO) LFD f10, 2 * SIZE(BO) LFD f11, 3 * SIZE(BO) addi AO, AO, 1 * SIZE addi BO, BO, 2 * SIZE .align 4 LL(118): LFD f8, 0 * SIZE(CO1) LFD f9, 0 * SIZE(CO2) FADD f0, f0, f2 FADD f1, f1, f3 FMADD f0, f0, f13, f8 FMADD f1, f1, f13, f9 STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) .align 4 LL(119): mr B, BO .align 4 LL(120): andi. r0, N, 1 ble LL(999) mr CO1, C mr AO, A srawi. I, M, 4 ble LL(140) .align 4 LL(130): vxor c01, c01, c01 vxor c02, c02, c02 vxor c03, c03, c03 vxor c04, c04, c04 mr BO, B dcbtst CO1, PREC mr J, K andi. r0, B, 15 ble+ LL(131) LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO LOAD_B b1, OFFSET_0, BO vspltw bp1, b1, 2 vspltw bp2, b1, 3 addi AO, AO, 16 * SIZE addi BO, BO, SIZE vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 subi J, J, 1 cmpwi cr0, J, 0 ble LL(138) LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO addi AO, AO, 16 * SIZE addi BO, BO, SIZE vmaddfp c01, a1, bp2, c01 vmaddfp c02, a2, bp2, c02 vmaddfp c03, a3, bp2, c03 vmaddfp c04, a4, bp2, c04 subi J, J, 1 cmpwi cr0, J, 0 ble LL(138) .align 4 LL(131): LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO LOAD_A a5, OFFSET_4, AO LOAD_A a6, OFFSET_5, AO LOAD_A a7, OFFSET_6, AO LOAD_A a8, OFFSET_7, AO LOAD_B b1, OFFSET_0, BO srawi. 
r0, J, 2 mtspr CTR, r0 ble LL(135) .align 4 LL(133): vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 vspltw bp2, b1, 1 vmaddfp c01, a5, bp2, c01 vmaddfp c02, a6, bp2, c02 vmaddfp c03, a7, bp2, c03 vmaddfp c04, a8, bp2, c04 addi AO, AO, 32 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO vspltw bp1, b1, 2 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 LOAD_A a5, OFFSET_4, AO LOAD_A a6, OFFSET_5, AO LOAD_A a7, OFFSET_6, AO LOAD_A a8, OFFSET_7, AO vspltw bp2, b1, 3 vmaddfp c01, a5, bp2, c01 vmaddfp c02, a6, bp2, c02 vmaddfp c03, a7, bp2, c03 vmaddfp c04, a8, bp2, c04 addi AO, AO, 32 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO LOAD_A a5, OFFSET_4, AO LOAD_A a6, OFFSET_5, AO LOAD_A a7, OFFSET_6, AO LOAD_A a8, OFFSET_7, AO LOAD_B b1, OFFSET_0, BO bdnz LL(133) .align 4 LL(135): andi. r0, J, 3 ble+ LL(138) cmpwi cr0, r0, 3 bne LL(136) vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 addi AO, AO, 16 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO vspltw bp2, b1, 1 vmaddfp c01, a1, bp2, c01 vmaddfp c02, a2, bp2, c02 vmaddfp c03, a3, bp2, c03 vmaddfp c04, a4, bp2, c04 addi AO, AO, 16 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO vspltw bp1, b1, 2 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 addi AO, AO, 16 * SIZE addi BO, BO, 3 * SIZE b LL(138) .align 4 LL(136): cmpwi cr0, r0, 2 bne LL(137) vspltw bp1, b1, 0 vspltw bp2, b1, 1 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 LOAD_A a1, OFFSET_4, AO LOAD_A a2, OFFSET_5, AO LOAD_A a3, OFFSET_6, AO LOAD_A a4, OFFSET_7, AO vmaddfp c01, a1, bp2, c01 vmaddfp c02, a2, bp2, c02 vmaddfp c03, a3, bp2, c03 vmaddfp c04, a4, bp2, c04 addi AO, AO, 32 * SIZE addi BO, BO, 2 * SIZE b LL(138) .align 4 LL(137): cmpwi cr0, r0, 1 bne LL(138) vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 addi AO, AO, 16 * SIZE addi BO, BO, 1 * SIZE .align 4 LL(138): lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvx C4, OFFSET_3, CO1 lvx C5, OFFSET_4, CO1 lvsr PERMRSHIFT1, 0, CO1 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, c03, PERMRSHIFT1 vperm c03, c03, c04, PERMRSHIFT1 vperm c04, c04, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 vmaddfp c02, alpha, c02, C3 vmaddfp c03, alpha, c03, C4 vmaddfp c04, alpha, c04, C5 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 stvx c03, OFFSET_3, CO1 stvx c04, OFFSET_4, CO1 addi CO1, CO1, 16 * SIZE addic. I, I, -1 bgt+ LL(130) .align 4 LL(140): andi. I, M, 8 ble LL(150) vxor c01, c01, c01 vxor c02, c02, c02 mr BO, B mr J, K andi. 
r0, B, 15 ble+ LL(141) LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_B b1, OFFSET_0, BO vspltw bp1, b1, 2 vspltw bp2, b1, 3 addi AO, AO, 8 * SIZE addi BO, BO, SIZE vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 subi J, J, 1 cmpwi cr0, J, 0 ble LL(148) LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO addi AO, AO, 8 * SIZE addi BO, BO, SIZE vmaddfp c01, a1, bp2, c01 vmaddfp c02, a2, bp2, c02 subi J, J, 1 cmpwi cr0, J, 0 ble LL(148) .align 4 LL(141): LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO LOAD_A a5, OFFSET_4, AO LOAD_A a6, OFFSET_5, AO LOAD_A a7, OFFSET_6, AO LOAD_A a8, OFFSET_7, AO LOAD_B b1, OFFSET_0, BO srawi. r0, J, 2 mtspr CTR, r0 ble LL(145) .align 4 LL(143): vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vspltw bp2, b1, 1 vmaddfp c01, a3, bp2, c01 vmaddfp c02, a4, bp2, c02 vspltw bp1, b1, 2 vmaddfp c01, a5, bp1, c01 vmaddfp c02, a6, bp1, c02 vspltw bp2, b1, 3 vmaddfp c01, a7, bp2, c01 vmaddfp c02, a8, bp2, c02 addi AO, AO, 32 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO LOAD_A a5, OFFSET_4, AO LOAD_A a6, OFFSET_5, AO LOAD_A a7, OFFSET_6, AO LOAD_A a8, OFFSET_7, AO LOAD_B b1, OFFSET_0, BO bdnz LL(143) .align 4 LL(145): andi. r0, J, 3 ble+ LL(148) cmpwi cr0, r0, 3 bne LL(146) vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vspltw bp2, b1, 1 vmaddfp c01, a3, bp2, c01 vmaddfp c02, a4, bp2, c02 LOAD_A a1, OFFSET_4, AO LOAD_A a2, OFFSET_5, AO vspltw bp1, b1, 2 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 addi AO, AO, 24 * SIZE addi BO, BO, 3 * SIZE b LL(148) .align 4 LL(146): cmpwi cr0, r0, 2 bne LL(147) vspltw bp1, b1, 0 vspltw bp2, b1, 1 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c01, a3, bp2, c01 vmaddfp c02, a4, bp2, c02 addi AO, AO, 16 * SIZE addi BO, BO, 2 * SIZE b LL(148) .align 4 LL(147): cmpwi cr0, r0, 1 bne LL(148) vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 addi AO, AO, 8 * SIZE addi BO, BO, 1 * SIZE .align 4 LL(148): lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvsr PERMRSHIFT1, 0, CO1 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 vmaddfp c02, alpha, c02, C3 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 addi CO1, CO1, 8 * SIZE .align 4 LL(150): andi. I, M, 4 ble LL(160) vxor c01, c01, c01 mr BO, B mr J, K andi. r0, B, 15 ble+ LL(151) LOAD_A a1, OFFSET_0, AO LOAD_B b1, OFFSET_0, BO vspltw bp1, b1, 2 vspltw bp2, b1, 3 addi AO, AO, 4 * SIZE addi BO, BO, SIZE vmaddfp c01, a1, bp1, c01 subi J, J, 1 cmpwi cr0, J, 0 ble LL(158) LOAD_A a1, OFFSET_0, AO addi AO, AO, 4 * SIZE addi BO, BO, SIZE vmaddfp c01, a1, bp2, c01 subi J, J, 1 cmpwi cr0, J, 0 ble LL(158) .align 4 LL(151): LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO LOAD_B b1, OFFSET_0, BO srawi. r0, J, 2 mtspr CTR, r0 ble LL(155) .align 4 LL(153): vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c01, a2, bp2, c01 vspltw bp1, b1, 2 vmaddfp c01, a3, bp1, c01 vspltw bp2, b1, 3 vmaddfp c01, a4, bp2, c01 addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO LOAD_B b1, OFFSET_0, BO bdnz LL(153) .align 4 LL(155): andi. 
r0, J, 3 ble+ LL(158) cmpwi cr0, r0, 3 bne LL(156) vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c01, a2, bp2, c01 vspltw bp1, b1, 2 vmaddfp c01, a3, bp1, c01 addi AO, AO, 12 * SIZE addi BO, BO, 3 * SIZE b LL(158) .align 4 LL(156): cmpwi cr0, r0, 2 bne LL(157) vspltw bp1, b1, 0 vspltw bp2, b1, 1 vmaddfp c01, a1, bp1, c01 vmaddfp c01, a2, bp2, c01 addi AO, AO, 8 * SIZE addi BO, BO, 2 * SIZE b LL(158) .align 4 LL(157): cmpwi cr0, r0, 1 bne LL(158) vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 addi AO, AO, 4 * SIZE addi BO, BO, 1 * SIZE .align 4 LL(158): lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvsr PERMRSHIFT1, 0, CO1 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 addi CO1, CO1, 4 * SIZE .align 4 LL(160): andi. I, M, 2 ble LL(170) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 2 * SIZE(AO) LFD f11, 3 * SIZE(AO) LFD f12, 0 * SIZE(B) LFD f13, 1 * SIZE(B) lfs f0, FZERO(SP) fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(165) .align 4 LL(162): FMADD f0, f8, f12, f0 FMADD f1, f9, f12, f1 FMADD f2, f10, f13, f2 FMADD f3, f11, f13, f3 LFD f8, 4 * SIZE(AO) LFD f9, 5 * SIZE(AO) LFD f10, 6 * SIZE(AO) LFD f11, 7 * SIZE(AO) LFD f12, 2 * SIZE(BO) LFD f13, 3 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 2 * SIZE bdnz LL(162) .align 4 LL(165): andi. r0, K, 1 lfs f13, ALPHA(SP) ble LL(168) .align 4 LL(166): FMADD f0, f8, f12, f0 FMADD f1, f9, f12, f1 addi AO, AO, 2 * SIZE addi BO, BO, 1 * SIZE .align 4 LL(168): LFD f8, 0 * SIZE(CO1) LFD f9, 1 * SIZE(CO1) FADD f0, f0, f2 FADD f1, f1, f3 FMADD f0, f0, f13, f8 FMADD f1, f1, f13, f9 STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) addi CO1, CO1, 2 * SIZE .align 4 LL(170): andi. I, M, 1 ble LL(999) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 0 * SIZE(B) LFD f11, 1 * SIZE(B) lfs f0, FZERO(SP) fmr f1, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(175) .align 4 LL(172): FMADD f0, f8, f10, f0 FMADD f1, f9, f11, f1 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) LFD f10, 2 * SIZE(BO) LFD f11, 3 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 2 * SIZE bdnz LL(172) .align 4 LL(175): andi. 
r0, K, 1 lfs f13, ALPHA(SP) ble LL(178) .align 4 LL(176): FMADD f0, f8, f10, f0 addi AO, AO, 1 * SIZE addi BO, BO, 1 * SIZE .align 4 LL(178): LFD f8, 0 * SIZE(CO1) FADD f0, f0, f1 FMADD f0, f0, f13, f8 STFD f0, 0 * SIZE(CO1) .align 4 LL(999): mr SP, STACK li r0, 0 * 16 lvx v20, SP, r0 li r0, 1 * 16 lvx v21, SP, r0 li r0, 2 * 16 lvx v22, SP, r0 li r0, 3 * 16 lvx v23, SP, r0 li r0, 4 * 16 lvx v24, SP, r0 li r0, 5 * 16 lvx v25, SP, r0 li r0, 6 * 16 lvx v26, SP, r0 li r0, 7 * 16 lvx v27, SP, r0 li r0, 8 * 16 lvx v28, SP, r0 li r0, 9 * 16 lvx v29, SP, r0 li r0, 10 * 16 lvx v30, SP, r0 li r0, 11 * 16 lvx v31, SP, r0 mtspr VRsave, VREG #ifdef __64BIT__ ld r31, 192(SP) ld r30, 200(SP) ld r29, 208(SP) ld r28, 216(SP) ld r27, 224(SP) ld r26, 232(SP) ld r25, 240(SP) ld r24, 248(SP) ld r23, 256(SP) ld r22, 264(SP) ld r21, 272(SP) ld r20, 280(SP) ld r19, 288(SP) ld r18, 296(SP) ld r17, 304(SP) ld r16, 312(SP) ld r15, 320(SP) ld r14, 328(SP) #else lwz r31, 192(SP) lwz r30, 196(SP) lwz r29, 200(SP) lwz r28, 204(SP) lwz r27, 208(SP) lwz r26, 212(SP) lwz r25, 216(SP) lwz r24, 220(SP) lwz r23, 224(SP) lwz r22, 228(SP) lwz r21, 232(SP) lwz r20, 236(SP) lwz r19, 240(SP) lwz r18, 244(SP) lwz r17, 248(SP) lwz r16, 252(SP) lwz r15, 256(SP) lwz r14, 260(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/gemm_kernel_altivec_g4.S000066400000000000000000001355171313527062700223060ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 360 #else #define STACKSIZE 272 #endif #define ALPHA 0 #define FZERO 16 #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #endif #endif #define STACK r11 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define CO3 r27 #define CO4 r28 #define PREA r29 #define PREB r29 #define PREC r30 #define VREG r31 #define LOAD_A lvx #define LOAD_B lvx #define OFFSET_0 0 #define OFFSET_1 r14 #define OFFSET_2 r15 #define OFFSET_3 r16 #define OFFSET_4 r17 #define OFFSET_5 r18 #define OFFSET_6 r19 #define OFFSET_7 r20 #define c01 v0 #define c02 v1 #define c03 v2 #define c04 v3 #define c05 v4 #define c06 v5 #define c07 v6 #define c08 v7 #define c09 v8 #define c10 v9 #define c11 v10 #define c12 v11 #define c13 v12 #define c14 v13 #define c15 v14 #define c16 v15 #define a1 v16 #define a2 v17 #define a3 v18 #define a4 v19 #define a5 v20 #define a6 v21 #define a7 v22 #define a8 v23 #define b1 v24 #define b2 v25 #define bp1 v26 #define bp2 v27 #define C1 v16 #define C2 v17 #define C3 v18 #define C4 v19 #define C5 v20 #define C6 v21 #define C7 v22 #define C8 v23 #define C9 v24 #define c00 v25 #define PERMRSHIFT1 v26 #define PERMRSHIFT2 v27 #define PERMRSHIFT3 v28 #define PERMRSHIFT4 v29 #define VZERO v30 #define alpha v31 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE mr STACK, SP li r0, 0 * 16 stvx v20, SP, r0 li r0, 1 * 16 stvx v21, SP, r0 li r0, 2 * 16 stvx v22, SP, r0 li r0, 3 * 16 stvx v23, SP, r0 li r0, 4 * 16 stvx v24, SP, r0 li r0, 5 * 16 stvx v25, SP, r0 li r0, 6 * 16 stvx v26, SP, r0 li r0, 7 * 16 stvx v27, SP, r0 li r0, 8 * 16 stvx v28, SP, r0 li r0, 9 * 16 stvx v29, SP, r0 li r0, 10 * 16 stvx v30, SP, r0 li r0, 11 * 16 stvx v31, SP, r0 #ifdef __64BIT__ std r31, 192(SP) std r30, 200(SP) std r29, 208(SP) std r28, 216(SP) std r27, 224(SP) std r26, 232(SP) std r25, 240(SP) std r24, 248(SP) std r23, 256(SP) std r22, 264(SP) std r21, 272(SP) std r20, 280(SP) std r19, 288(SP) std r18, 296(SP) std r17, 304(SP) std r16, 312(SP) std r15, 320(SP) std r14, 328(SP) #else stw r31, 192(SP) stw r30, 196(SP) stw r29, 200(SP) stw r28, 204(SP) stw r27, 208(SP) stw r26, 212(SP) stw r25, 216(SP) stw r24, 220(SP) stw r23, 224(SP) stw r22, 228(SP) stw r21, 232(SP) stw r20, 236(SP) stw r19, 240(SP) stw r18, 244(SP) stw r17, 248(SP) stw r16, 252(SP) stw r15, 256(SP) stw r14, 260(SP) #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif li r0, -1 mfspr VREG, VRsave mtspr VRsave, r0 addi SP, SP, -128 li r0, -128 and SP, SP, r0 li OFFSET_1, 4 * SIZE li OFFSET_2, 8 * SIZE li OFFSET_3, 12 * SIZE li OFFSET_4, 16 * SIZE li OFFSET_5, 20 * SIZE li OFFSET_6, 24 * SIZE li OFFSET_7, 28 * SIZE stfs f1, ALPHA + 0(SP) stfs f1, ALPHA + 4(SP) stfs f1, ALPHA + 8(SP) stfs f1, ALPHA + 12(SP) li r29, 0 stw r29, FZERO(SP) slwi LDC, LDC, BASE_SHIFT li PREC, (15 * SIZE) li PREB, (25 * 8 * SIZE) cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, 
K, 0 ble LL(999) srawi. J, N, 2 ble LL(60) .align 4 LL(01): mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC add C, CO4, LDC mr AO, A srawi. I, M, 4 ble LL(20) .align 4 LL(11): vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 LOAD_A a1, OFFSET_0, AO vxor c03, c03, c03 LOAD_A a2, OFFSET_1, AO vxor c04, c04, c04 LOAD_A a3, OFFSET_2, AO vxor c05, c05, c05 LOAD_A a4, OFFSET_3, AO vxor c06, c06, c06 LOAD_B b2, OFFSET_2, B vxor c07, c07, c07 LOAD_A a5, OFFSET_4, AO vxor c08, c08, c08 LOAD_A a6, OFFSET_5, AO vxor c09, c09, c09 dcbtst CO1, PREC vxor c10, c10, c10 dcbtst CO2, PREC vxor c11, c11, c11 dcbtst CO3, PREC vxor c12, c12, c12 dcbtst CO4, PREC vxor c13, c13, c13 mr BO, B vxor c14, c14, c14 srawi. r0, K, 2 vxor c15, c15, c15 mtspr CTR, r0 vxor c16, c16, c16 vspltw bp1, b1, 0 ble LL(15) .align 4 LL(12): /* 1 */ vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 addi AO, AO, 8 * SIZE vmaddfp c03, a3, bp1, c03 LOAD_A a7, OFFSET_4, AO vmaddfp c04, a4, bp1, c04 LOAD_A a8, OFFSET_5, AO /* 2 */ vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 dcbt BO, PREB vmaddfp c07, a3, bp2, c07 dcbt AO, PREB vmaddfp c08, a4, bp2, c08 addi AO, AO, 8 * SIZE /* 3 */ vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 LOAD_B b1, OFFSET_1, BO vmaddfp c11, a3, bp1, c11 dcbt AO, PREB vmaddfp c12, a4, bp1, c12 addi AO, AO, 8 * SIZE /* 4 */ vmaddfp c13, a1, bp2, c13 vspltw bp1, b1, 0 vmaddfp c14, a2, bp2, c14 LOAD_A a1, OFFSET_2, AO vmaddfp c15, a3, bp2, c15 dcbt AO, PREB vmaddfp c16, a4, bp2, c16 addi AO, AO, 8 * SIZE /* 5 */ vmaddfp c01, a5, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a6, bp1, c02 LOAD_A a2, OFFSET_1, AO vmaddfp c03, a7, bp1, c03 LOAD_A a3, OFFSET_2, AO vmaddfp c04, a8, bp1, c04 LOAD_A a4, OFFSET_3, AO /* 6 */ vmaddfp c05, a5, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a6, bp2, c06 nop vmaddfp c07, a7, bp2, c07 dcbt AO, PREA vmaddfp c08, a8, bp2, c08 addi AO, AO, 8 * SIZE /* 7 */ vmaddfp c09, a5, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a6, bp1, c10 LOAD_B b1, OFFSET_4, BO vmaddfp c11, a7, bp1, c11 nop vmaddfp c12, a8, bp1, c12 nop /* 8 */ vmaddfp c13, a5, bp2, c13 vspltw bp1, b2, 0 vmaddfp c14, a6, bp2, c14 LOAD_A a5, OFFSET_2, AO vmaddfp c15, a7, bp2, c15 LOAD_A a6, OFFSET_3, AO vmaddfp c16, a8, bp2, c16 LOAD_A a7, OFFSET_4, AO /* 9 */ vmaddfp c01, a1, bp1, c01 vspltw bp2, b2, 1 vmaddfp c02, a2, bp1, c02 LOAD_A a8, OFFSET_5, AO vmaddfp c03, a3, bp1, c03 addi BO, BO, 8 * SIZE vmaddfp c04, a4, bp1, c04 nop /* 10 */ vmaddfp c05, a1, bp2, c05 vspltw bp1, b2, 2 vmaddfp c06, a2, bp2, c06 nop vmaddfp c07, a3, bp2, c07 nop vmaddfp c08, a4, bp2, c08 nop /* 11 */ vmaddfp c09, a1, bp1, c09 vspltw bp2, b2, 3 vmaddfp c10, a2, bp1, c10 LOAD_B b2, OFFSET_1, BO vmaddfp c11, a3, bp1, c11 dcbt AO, PREA vmaddfp c12, a4, bp1, c12 addi AO, AO, 8 * SIZE /* 12 */ vmaddfp c13, a1, bp2, c13 vspltw bp1, b2, 0 vmaddfp c14, a2, bp2, c14 LOAD_A a1, OFFSET_4, AO vmaddfp c15, a3, bp2, c15 LOAD_A a2, OFFSET_5, AO vmaddfp c16, a4, bp2, c16 LOAD_A a3, OFFSET_6, AO /* 13 */ vmaddfp c01, a5, bp1, c01 vspltw bp2, b2, 1 vmaddfp c02, a6, bp1, c02 LOAD_A a4, OFFSET_7, AO vmaddfp c03, a7, bp1, c03 dcbt AO, PREA vmaddfp c04, a8, bp1, c04 addi AO, AO, 8 * SIZE /* 14 */ vmaddfp c05, a5, bp2, c05 vspltw bp1, b2, 2 vmaddfp c06, a6, bp2, c06 nop vmaddfp c07, a7, bp2, c07 dcbt AO, PREA vmaddfp c08, a8, bp2, c08 addi AO, AO, 8 * SIZE /* 15 */ vmaddfp c09, a5, bp1, c09 vspltw bp2, b2, 3 vmaddfp c10, a6, bp1, c10 LOAD_B b2, OFFSET_4, BO vmaddfp c11, a7, bp1, c11 dcbt AO, PREA 
vmaddfp c12, a8, bp1, c12 addi BO, BO, 8 * SIZE /* 16 */ vmaddfp c13, a5, bp2, c13 vspltw bp1, b1, 0 vmaddfp c14, a6, bp2, c14 LOAD_A a5, OFFSET_4, AO vmaddfp c15, a7, bp2, c15 LOAD_A a6, OFFSET_5, AO vmaddfp c16, a8, bp2, c16 bdnz+ LL(12) .align 4 LL(15): andi. r0, K, 3 lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO mtspr CTR, r0 ble+ LL(18) .align 4 LL(16): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 nop vmaddfp c03, a3, bp1, c03 nop vmaddfp c04, a4, bp1, c04 nop vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 nop vmaddfp c07, a3, bp2, c07 nop vmaddfp c08, a4, bp2, c08 nop vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 LOAD_B b1, OFFSET_1, BO vmaddfp c11, a3, bp1, c11 addi AO, AO, 16 * SIZE vmaddfp c12, a4, bp1, c12 addi BO, BO, 4 * SIZE vmaddfp c13, a1, bp2, c13 vspltw bp1, b1, 0 vmaddfp c14, a2, bp2, c14 LOAD_A a1, OFFSET_0, AO vmaddfp c15, a3, bp2, c15 LOAD_A a2, OFFSET_1, AO vmaddfp c16, a4, bp2, c16 LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO bdnz+ LL(16) .align 4 LL(18): lvx C1, OFFSET_0, CO1 cmpwi cr0, LDC, 32 * SIZE lvx C2, OFFSET_1, CO1 lvsr PERMRSHIFT1, 0, CO1 lvx C3, OFFSET_2, CO1 lvsr PERMRSHIFT2, 0, CO2 lvx C4, OFFSET_3, CO1 lvsr PERMRSHIFT3, 0, CO3 lvx C5, OFFSET_4, CO1 lvsr PERMRSHIFT4, 0, CO4 ble LL(19) vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, c03, PERMRSHIFT1 vperm c03, c03, c04, PERMRSHIFT1 vperm c04, c04, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 lvx C1, OFFSET_0, CO2 vmaddfp c01, alpha, c01, C2 lvx C6, OFFSET_1, CO2 vmaddfp c02, alpha, c02, C3 lvx C7, OFFSET_2, CO2 vmaddfp c03, alpha, c03, C4 lvx C8, OFFSET_3, CO2 vmaddfp c04, alpha, c04, C5 lvx C9, OFFSET_4, CO2 stvx c00, OFFSET_0, CO1 vperm c00, VZERO, c05, PERMRSHIFT2 stvx c01, OFFSET_1, CO1 vperm c05, c05, c06, PERMRSHIFT2 stvx c02, OFFSET_2, CO1 vperm c06, c06, c07, PERMRSHIFT2 stvx c03, OFFSET_3, CO1 vperm c07, c07, c08, PERMRSHIFT2 stvx c04, OFFSET_4, CO1 vperm c08, c08, VZERO, PERMRSHIFT2 vmaddfp c00, alpha, c00, C1 lvx C1, OFFSET_0, CO3 vmaddfp c05, alpha, c05, C6 lvx C2, OFFSET_1, CO3 vmaddfp c06, alpha, c06, C7 lvx C3, OFFSET_2, CO3 vmaddfp c07, alpha, c07, C8 lvx C4, OFFSET_3, CO3 vmaddfp c08, alpha, c08, C9 lvx C5, OFFSET_4, CO3 stvx c00, OFFSET_0, CO2 vperm c00, VZERO, c09, PERMRSHIFT3 stvx c05, OFFSET_1, CO2 vperm c09, c09, c10, PERMRSHIFT3 stvx c06, OFFSET_2, CO2 vperm c10, c10, c11, PERMRSHIFT3 stvx c07, OFFSET_3, CO2 vperm c11, c11, c12, PERMRSHIFT3 stvx c08, OFFSET_4, CO2 vperm c12, c12, VZERO, PERMRSHIFT3 vmaddfp c00, alpha, c00, C1 lvx C9, OFFSET_4, CO4 vmaddfp c09, alpha, c09, C2 lvx C1, OFFSET_0, CO4 vmaddfp c10, alpha, c10, C3 lvx C6, OFFSET_1, CO4 vmaddfp c11, alpha, c11, C4 lvx C7, OFFSET_2, CO4 vmaddfp c12, alpha, c12, C5 lvx C8, OFFSET_3, CO4 stvx c00, OFFSET_0, CO3 vperm c00, VZERO, c13, PERMRSHIFT4 stvx c09, OFFSET_1, CO3 vperm c13, c13, c14, PERMRSHIFT4 stvx c10, OFFSET_2, CO3 vperm c14, c14, c15, PERMRSHIFT4 stvx c11, OFFSET_3, CO3 vperm c15, c15, c16, PERMRSHIFT4 stvx c12, OFFSET_4, CO3 vperm c16, c16, VZERO, PERMRSHIFT4 vmaddfp c00, alpha, c00, C1 vmaddfp c13, alpha, c13, C6 vmaddfp c14, alpha, c14, C7 vmaddfp c15, alpha, c15, C8 vmaddfp c16, alpha, c16, C9 stvx c00, OFFSET_0, CO4 stvx c13, OFFSET_1, CO4 stvx c14, OFFSET_2, CO4 stvx c15, OFFSET_3, CO4 stvx c16, OFFSET_4, CO4 addi CO1, CO1, 16 * SIZE addi CO2, CO2, 16 * SIZE addi CO3, CO3, 16 * SIZE addi CO4, CO4, 16 * SIZE addic. 
I, I, -1 bgt+ LL(11) b LL(20) .align 4 LL(19): lvx C6, OFFSET_1, CO2 lvx C7, OFFSET_2, CO2 lvx C8, OFFSET_3, CO2 lvx C9, OFFSET_4, CO2 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, c03, PERMRSHIFT1 vperm c03, c03, c04, PERMRSHIFT1 vperm c04, c04, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 lvx C2, OFFSET_1, CO3 vmaddfp c02, alpha, c02, C3 lvx C3, OFFSET_2, CO3 vmaddfp c03, alpha, c03, C4 lvx C4, OFFSET_3, CO3 vmaddfp c04, alpha, c04, C5 lvx C5, OFFSET_4, CO3 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 stvx c03, OFFSET_3, CO1 stvx c04, OFFSET_4, CO1 lvx C1, OFFSET_0, CO2 vperm c00, VZERO, c05, PERMRSHIFT2 vperm c05, c05, c06, PERMRSHIFT2 vperm c06, c06, c07, PERMRSHIFT2 vperm c07, c07, c08, PERMRSHIFT2 vperm c08, c08, VZERO, PERMRSHIFT2 vmaddfp c00, alpha, c00, C1 vmaddfp c05, alpha, c05, C6 lvx C6, OFFSET_1, CO4 vmaddfp c06, alpha, c06, C7 lvx C7, OFFSET_2, CO4 vmaddfp c07, alpha, c07, C8 lvx C8, OFFSET_3, CO4 vmaddfp c08, alpha, c08, C9 lvx C9, OFFSET_4, CO4 stvx c00, OFFSET_0, CO2 stvx c05, OFFSET_1, CO2 stvx c06, OFFSET_2, CO2 stvx c07, OFFSET_3, CO2 stvx c08, OFFSET_4, CO2 lvx C1, OFFSET_0, CO3 vperm c00, VZERO, c09, PERMRSHIFT3 vperm c09, c09, c10, PERMRSHIFT3 vperm c10, c10, c11, PERMRSHIFT3 vperm c11, c11, c12, PERMRSHIFT3 vperm c12, c12, VZERO, PERMRSHIFT3 vmaddfp c00, alpha, c00, C1 vmaddfp c09, alpha, c09, C2 vmaddfp c10, alpha, c10, C3 vmaddfp c11, alpha, c11, C4 vmaddfp c12, alpha, c12, C5 stvx c00, OFFSET_0, CO3 stvx c09, OFFSET_1, CO3 stvx c10, OFFSET_2, CO3 stvx c11, OFFSET_3, CO3 stvx c12, OFFSET_4, CO3 lvx C1, OFFSET_0, CO4 vperm c00, VZERO, c13, PERMRSHIFT4 vperm c13, c13, c14, PERMRSHIFT4 vperm c14, c14, c15, PERMRSHIFT4 vperm c15, c15, c16, PERMRSHIFT4 vperm c16, c16, VZERO, PERMRSHIFT4 vmaddfp c00, alpha, c00, C1 vmaddfp c13, alpha, c13, C6 vmaddfp c14, alpha, c14, C7 vmaddfp c15, alpha, c15, C8 vmaddfp c16, alpha, c16, C9 stvx c00, OFFSET_0, CO4 stvx c13, OFFSET_1, CO4 stvx c14, OFFSET_2, CO4 stvx c15, OFFSET_3, CO4 stvx c16, OFFSET_4, CO4 addi CO1, CO1, 16 * SIZE addi CO2, CO2, 16 * SIZE addi CO3, CO3, 16 * SIZE addi CO4, CO4, 16 * SIZE addic. I, I, -1 bgt+ LL(11) .align 4 LL(20): andi. I, M, 8 ble LL(30) vxor c01, c01, c01 LOAD_A a1, OFFSET_0, AO vxor c02, c02, c02 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_A a3, OFFSET_2, AO vxor c06, c06, c06 LOAD_A a4, OFFSET_3, AO vxor c09, c09, c09 LOAD_B b1, OFFSET_0, B vxor c10, c10, c10 LOAD_B b2, OFFSET_1, B vxor c13, c13, c13 vxor c14, c14, c14 mr BO, B vspltw bp1, b1, 0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(25) .align 4 LL(22): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 addi AO, AO, 16 * SIZE vmaddfp c02, a2, bp1, c02 addi BO, BO, 8 * SIZE vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 LOAD_B b1, OFFSET_0, BO vmaddfp c10, a2, bp1, c10 vmaddfp c13, a1, bp2, c13 LOAD_A a1, OFFSET_0, AO vspltw bp1, b2, 0 vmaddfp c14, a2, bp2, c14 LOAD_A a2, OFFSET_1, AO vmaddfp c01, a3, bp1, c01 vspltw bp2, b2, 1 vmaddfp c02, a4, bp1, c02 vmaddfp c05, a3, bp2, c05 vspltw bp1, b2, 2 vmaddfp c06, a4, bp2, c06 vmaddfp c09, a3, bp1, c09 vspltw bp2, b2, 3 LOAD_B b2, OFFSET_1, BO vmaddfp c10, a4, bp1, c10 vmaddfp c13, a3, bp2, c13 LOAD_A a3, OFFSET_2, AO vmaddfp c14, a4, bp2, c14 LOAD_A a4, OFFSET_3, AO vspltw bp1, b1, 0 bdnz LL(22) .align 4 LL(25): andi. 
r0, K, 1 lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO ble+ LL(28) .align 4 LL(26): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 nop vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 nop vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 addi AO, AO, 8 * SIZE vmaddfp c13, a1, bp2, c13 addi BO, BO, 4 * SIZE vmaddfp c14, a2, bp2, c14 nop .align 4 LL(28): lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 lvsr PERMRSHIFT3, 0, CO3 lvsr PERMRSHIFT4, 0, CO4 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 vmaddfp c02, alpha, c02, C3 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 lvx C3, OFFSET_2, CO2 vperm c00, VZERO, c05, PERMRSHIFT2 vperm c05, c05, c06, PERMRSHIFT2 vperm c06, c06, VZERO, PERMRSHIFT2 vmaddfp c00, alpha, c00, C1 vmaddfp c05, alpha, c05, C2 vmaddfp c06, alpha, c06, C3 stvx c00, OFFSET_0, CO2 stvx c05, OFFSET_1, CO2 stvx c06, OFFSET_2, CO2 lvx C1, OFFSET_0, CO3 lvx C2, OFFSET_1, CO3 lvx C3, OFFSET_2, CO3 vperm c00, VZERO, c09, PERMRSHIFT3 vperm c09, c09, c10, PERMRSHIFT3 vperm c10, c10, VZERO, PERMRSHIFT3 vmaddfp c00, alpha, c00, C1 vmaddfp c09, alpha, c09, C2 vmaddfp c10, alpha, c10, C3 stvx c00, OFFSET_0, CO3 stvx c09, OFFSET_1, CO3 stvx c10, OFFSET_2, CO3 lvx C1, OFFSET_0, CO4 lvx C2, OFFSET_1, CO4 lvx C3, OFFSET_2, CO4 vperm c00, VZERO, c13, PERMRSHIFT4 vperm c13, c13, c14, PERMRSHIFT4 vperm c14, c14, VZERO, PERMRSHIFT4 vmaddfp c00, alpha, c00, C1 vmaddfp c13, alpha, c13, C2 vmaddfp c14, alpha, c14, C3 stvx c00, OFFSET_0, CO4 stvx c13, OFFSET_1, CO4 stvx c14, OFFSET_2, CO4 addi CO1, CO1, 8 * SIZE addi CO2, CO2, 8 * SIZE addi CO3, CO3, 8 * SIZE addi CO4, CO4, 8 * SIZE .align 4 LL(30): andi. I, M, 4 ble LL(40) vxor c01, c01, c01 LOAD_A a1, OFFSET_0, AO vxor c02, c02, c02 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_B b1, OFFSET_0, B vxor c06, c06, c06 LOAD_B b2, OFFSET_1, B vxor c09, c09, c09 vxor c10, c10, c10 vxor c13, c13, c13 vxor c14, c14, c14 vspltw bp1, b1, 0 mr BO, B srawi. r0, K, 1 mtspr CTR, r0 ble LL(35) .align 4 LL(32): vmaddfp c01, a1, bp1, c01 addi AO, AO, 8 * SIZE vspltw bp2, b1, 1 vmaddfp c05, a1, bp2, c05 addi BO, BO, 8 * SIZE vspltw bp1, b1, 2 vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c13, a1, bp2, c13 LOAD_A a1, OFFSET_0, AO vspltw bp1, b2, 0 LOAD_B b1, OFFSET_0, BO vmaddfp c02, a2, bp1, c02 vspltw bp2, b2, 1 vmaddfp c06, a2, bp2, c06 vspltw bp1, b2, 2 vmaddfp c10, a2, bp1, c10 vspltw bp2, b2, 3 LOAD_B b2, OFFSET_1, BO vmaddfp c14, a2, bp2, c14 LOAD_A a2, OFFSET_1, AO vspltw bp1, b1, 0 bdnz LL(32) .align 4 LL(35): andi. 
r0, K, 1 lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO ble+ LL(38) .align 4 LL(36): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c13, a1, bp2, c13 addi AO, AO, 4 * SIZE addi BO, BO, 4 * SIZE .align 4 LL(38): vaddfp c01, c01, c02 vaddfp c05, c05, c06 vaddfp c09, c09, c10 vaddfp c13, c13, c14 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 lvsr PERMRSHIFT3, 0, CO3 lvsr PERMRSHIFT4, 0, CO4 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 vperm c00, VZERO, c05, PERMRSHIFT2 vperm c05, c05, VZERO, PERMRSHIFT2 vmaddfp c00, alpha, c00, C1 vmaddfp c05, alpha, c05, C2 stvx c00, OFFSET_0, CO2 stvx c05, OFFSET_1, CO2 lvx C1, OFFSET_0, CO3 lvx C2, OFFSET_1, CO3 vperm c00, VZERO, c09, PERMRSHIFT3 vperm c09, c09, VZERO, PERMRSHIFT3 vmaddfp c00, alpha, c00, C1 vmaddfp c09, alpha, c09, C2 stvx c00, OFFSET_0, CO3 stvx c09, OFFSET_1, CO3 lvx C1, OFFSET_0, CO4 lvx C2, OFFSET_1, CO4 vperm c00, VZERO, c13, PERMRSHIFT4 vperm c13, c13, VZERO, PERMRSHIFT4 vmaddfp c00, alpha, c00, C1 vmaddfp c13, alpha, c13, C2 stvx c00, OFFSET_0, CO4 stvx c13, OFFSET_1, CO4 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE .align 4 LL(40): andi. I, M, 2 ble LL(50) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 0 * SIZE(B) LFD f11, 1 * SIZE(B) LFD f12, 2 * SIZE(B) LFD f13, 3 * SIZE(B) lfs f0, FZERO(SP) fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(45) .align 4 LL(42): FMADD f0, f8, f10, f0 FMADD f2, f8, f11, f2 FMADD f4, f8, f12, f4 FMADD f6, f8, f13, f6 FMADD f1, f9, f10, f1 FMADD f3, f9, f11, f3 FMADD f5, f9, f12, f5 FMADD f7, f9, f13, f7 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) FMADD f0, f8, f10, f0 FMADD f2, f8, f11, f2 FMADD f4, f8, f12, f4 FMADD f6, f8, f13, f6 FMADD f1, f9, f10, f1 FMADD f3, f9, f11, f3 FMADD f5, f9, f12, f5 FMADD f7, f9, f13, f7 LFD f8, 4 * SIZE(AO) LFD f9, 5 * SIZE(AO) LFD f10, 8 * SIZE(BO) LFD f11, 9 * SIZE(BO) LFD f12, 10 * SIZE(BO) LFD f13, 11 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(42) .align 4 LL(45): andi. 
r0, K, 1 ble LL(48) .align 4 LL(46): FMADD f0, f8, f10, f0 FMADD f2, f8, f11, f2 FMADD f4, f8, f12, f4 FMADD f6, f8, f13, f6 FMADD f1, f9, f10, f1 FMADD f3, f9, f11, f3 FMADD f5, f9, f12, f5 FMADD f7, f9, f13, f7 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE .align 4 LL(48): lfs f13, ALPHA(SP) LFD f8, 0 * SIZE(CO1) LFD f9, 1 * SIZE(CO1) LFD f10, 0 * SIZE(CO2) LFD f11, 1 * SIZE(CO2) FMADD f0, f0, f13, f8 FMADD f1, f1, f13, f9 FMADD f2, f2, f13, f10 FMADD f3, f3, f13, f11 LFD f8, 0 * SIZE(CO3) LFD f9, 1 * SIZE(CO3) LFD f10, 0 * SIZE(CO4) LFD f11, 1 * SIZE(CO4) FMADD f4, f4, f13, f8 FMADD f5, f5, f13, f9 FMADD f6, f6, f13, f10 FMADD f7, f7, f13, f11 STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) STFD f4, 0 * SIZE(CO3) STFD f5, 1 * SIZE(CO3) STFD f6, 0 * SIZE(CO4) STFD f7, 1 * SIZE(CO4) addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE .align 4 LL(50): andi. I, M, 1 ble LL(59) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 0 * SIZE(B) LFD f11, 1 * SIZE(B) LFD f12, 2 * SIZE(B) LFD f13, 3 * SIZE(B) lfs f0, FZERO(SP) fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(55) .align 4 LL(52): FMADD f0, f8, f10, f0 FMADD f1, f8, f11, f1 FMADD f2, f8, f12, f2 FMADD f3, f8, f13, f3 LFD f8, 2 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) FMADD f0, f9, f10, f0 FMADD f1, f9, f11, f1 FMADD f2, f9, f12, f2 FMADD f3, f9, f13, f3 LFD f9, 3 * SIZE(AO) LFD f10, 8 * SIZE(BO) LFD f11, 9 * SIZE(BO) LFD f12, 10 * SIZE(BO) LFD f13, 11 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 8 * SIZE bdnz LL(52) .align 4 LL(55): andi. r0, K, 1 ble LL(58) .align 4 LL(56): FMADD f0, f8, f10, f0 FMADD f1, f8, f11, f1 FMADD f2, f8, f12, f2 FMADD f3, f8, f13, f3 LFD f8, 2 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) addi AO, AO, 1 * SIZE addi BO, BO, 4 * SIZE .align 4 LL(58): lfs f13, ALPHA(SP) LFD f8, 0 * SIZE(CO1) LFD f9, 0 * SIZE(CO2) LFD f10, 0 * SIZE(CO3) LFD f11, 0 * SIZE(CO4) FMADD f0, f0, f13, f8 FMADD f1, f1, f13, f9 FMADD f2, f2, f13, f10 FMADD f3, f3, f13, f11 STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) STFD f2, 0 * SIZE(CO3) STFD f3, 0 * SIZE(CO4) .align 4 LL(59): mr B, BO addic. J, J, -1 bgt LL(01) .align 4 LL(60): andi. r0, N, 2 ble LL(120) mr CO1, C add CO2, C, LDC add C, CO2, LDC mr AO, A srawi. I, M, 4 ble LL(80) .align 4 LL(71): vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 vxor c03, c03, c03 LOAD_A a1, OFFSET_0, AO vxor c04, c04, c04 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_A a3, OFFSET_2, AO vxor c06, c06, c06 LOAD_A a4, OFFSET_3, AO vxor c07, c07, c07 vxor c08, c08, c08 mr BO, B dcbtst CO1, PREC dcbtst CO2, PREC vspltw bp1, b1, 0 srawi. 
r0, K, 1 mtspr CTR, r0 ble LL(75) .align 4 LL(72): LOAD_A a5, OFFSET_4, AO LOAD_A a6, OFFSET_5, AO LOAD_A a7, OFFSET_6, AO LOAD_A a8, OFFSET_7, AO vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 vmaddfp c07, a3, bp2, c07 vmaddfp c08, a4, bp2, c08 vmaddfp c01, a5, bp1, c01 vspltw bp2, b1, 3 vmaddfp c02, a6, bp1, c02 vmaddfp c03, a7, bp1, c03 vmaddfp c04, a8, bp1, c04 LOAD_B b1, OFFSET_1, BO vspltw bp1, b1, 0 vmaddfp c05, a5, bp2, c05 vmaddfp c06, a6, bp2, c06 vmaddfp c07, a7, bp2, c07 vmaddfp c08, a8, bp2, c08 addi AO, AO, 32 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO bdnz LL(72) .align 4 LL(75): andi. r0, K, 1 lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO ble+ LL(78) .align 4 LL(76): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 addi AO, AO, 16 * SIZE vmaddfp c03, a3, bp1, c03 addi BO, BO, 2 * SIZE vmaddfp c04, a4, bp1, c04 nop vmaddfp c05, a1, bp2, c05 vmaddfp c06, a2, bp2, c06 vmaddfp c07, a3, bp2, c07 vmaddfp c08, a4, bp2, c08 .align 4 LL(78): lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvx C4, OFFSET_3, CO1 lvx C5, OFFSET_4, CO1 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 lvsr PERMRSHIFT3, 0, CO3 lvsr PERMRSHIFT4, 0, CO4 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, c03, PERMRSHIFT1 vperm c03, c03, c04, PERMRSHIFT1 vperm c04, c04, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 vmaddfp c02, alpha, c02, C3 vmaddfp c03, alpha, c03, C4 vmaddfp c04, alpha, c04, C5 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 stvx c03, OFFSET_3, CO1 stvx c04, OFFSET_4, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 lvx C3, OFFSET_2, CO2 lvx C4, OFFSET_3, CO2 lvx C5, OFFSET_4, CO2 vperm c00, VZERO, c05, PERMRSHIFT2 vperm c05, c05, c06, PERMRSHIFT2 vperm c06, c06, c07, PERMRSHIFT2 vperm c07, c07, c08, PERMRSHIFT2 vperm c08, c08, VZERO, PERMRSHIFT2 vmaddfp c00, alpha, c00, C1 vmaddfp c05, alpha, c05, C2 vmaddfp c06, alpha, c06, C3 vmaddfp c07, alpha, c07, C4 vmaddfp c08, alpha, c08, C5 stvx c00, OFFSET_0, CO2 stvx c05, OFFSET_1, CO2 stvx c06, OFFSET_2, CO2 stvx c07, OFFSET_3, CO2 stvx c08, OFFSET_4, CO2 addi CO1, CO1, 16 * SIZE addi CO2, CO2, 16 * SIZE addic. I, I, -1 bgt+ LL(71) .align 4 LL(80): andi. I, M, 8 ble LL(90) vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 vxor c03, c03, c03 LOAD_A a1, OFFSET_0, AO vxor c04, c04, c04 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_A a3, OFFSET_2, AO vxor c06, c06, c06 LOAD_A a4, OFFSET_3, AO vxor c07, c07, c07 vxor c08, c08, c08 mr BO, B vspltw bp1, b1, 0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(85) .align 4 LL(82): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 vmaddfp c03, a3, bp1, c03 vspltw bp2, b1, 3 vmaddfp c04, a4, bp1, c04 LOAD_B b1, OFFSET_1, BO vspltw bp1, b1, 0 vmaddfp c07, a3, bp2, c07 vmaddfp c08, a4, bp2, c08 addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO bdnz LL(82) .align 4 LL(85): andi. 
r0, K, 1 lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO ble+ LL(88) .align 4 LL(86): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 addi AO, AO, 8 * SIZE vmaddfp c05, a1, bp2, c05 addi BO, BO, 2 * SIZE vmaddfp c06, a2, bp2, c06 .align 4 LL(88): lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 vaddfp c01, c01, c03 vaddfp c02, c02, c04 vaddfp c05, c05, c07 vaddfp c06, c06, c08 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 lvsr PERMRSHIFT3, 0, CO3 lvsr PERMRSHIFT4, 0, CO4 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 vmaddfp c02, alpha, c02, C3 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 lvx C3, OFFSET_2, CO2 vperm c00, VZERO, c05, PERMRSHIFT2 vperm c05, c05, c06, PERMRSHIFT2 vperm c06, c06, VZERO, PERMRSHIFT2 vmaddfp c00, alpha, c00, C1 vmaddfp c05, alpha, c05, C2 vmaddfp c06, alpha, c06, C3 stvx c00, OFFSET_0, CO2 stvx c05, OFFSET_1, CO2 stvx c06, OFFSET_2, CO2 addi CO1, CO1, 8 * SIZE addi CO2, CO2, 8 * SIZE .align 4 LL(90): andi. I, M, 4 ble LL(100) vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 vxor c06, c06, c06 mr BO, B vspltw bp1, b1, 0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(95) .align 4 LL(92): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c02, a2, bp1, c02 vspltw bp2, b1, 3 LOAD_B b1, OFFSET_1, BO vspltw bp1, b1, 0 vmaddfp c06, a2, bp2, c06 addi AO, AO, 8 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO bdnz LL(92) .align 4 LL(95): andi. r0, K, 1 lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO ble+ LL(98) .align 4 LL(96): vspltw bp2, b1, 1 vmaddfp c01, a1, bp1, c01 vmaddfp c05, a1, bp2, c05 addi AO, AO, 4 * SIZE addi BO, BO, 2 * SIZE .align 4 LL(98): vaddfp c01, c01, c02 vaddfp c05, c05, c06 vaddfp c09, c09, c10 vaddfp c13, c13, c14 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 lvsr PERMRSHIFT3, 0, CO3 lvsr PERMRSHIFT4, 0, CO4 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 vperm c00, VZERO, c05, PERMRSHIFT2 vperm c05, c05, VZERO, PERMRSHIFT2 vmaddfp c00, alpha, c00, C1 vmaddfp c05, alpha, c05, C2 stvx c00, OFFSET_0, CO2 stvx c05, OFFSET_1, CO2 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE .align 4 LL(100): andi. I, M, 2 ble LL(110) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 0 * SIZE(B) LFD f11, 1 * SIZE(B) LFD f12, 2 * SIZE(B) LFD f13, 3 * SIZE(B) lfs f0, FZERO(SP) fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(105) .align 4 LL(102): FMADD f0, f8, f10, f0 FMADD f1, f9, f10, f1 FMADD f2, f8, f11, f2 FMADD f3, f9, f11, f3 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) FMADD f4, f8, f12, f4 FMADD f5, f9, f12, f5 FMADD f6, f8, f13, f6 FMADD f7, f9, f13, f7 LFD f8, 4 * SIZE(AO) LFD f9, 5 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 4 * SIZE bdnz LL(102) .align 4 LL(105): andi. 
r0, K, 1 lfs f13, ALPHA(SP) ble LL(108) .align 4 LL(106): FMADD f0, f8, f10, f0 FMADD f1, f9, f10, f1 FMADD f2, f8, f11, f2 FMADD f3, f9, f11, f3 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) LFD f10, 2 * SIZE(BO) LFD f11, 3 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 2 * SIZE .align 4 LL(108): LFD f8, 0 * SIZE(CO1) LFD f9, 1 * SIZE(CO1) LFD f10, 0 * SIZE(CO2) LFD f11, 1 * SIZE(CO2) FADD f0, f0, f4 FADD f1, f1, f5 FADD f2, f2, f6 FADD f3, f3, f7 FMADD f0, f0, f13, f8 FMADD f1, f1, f13, f9 FMADD f2, f2, f13, f10 FMADD f3, f3, f13, f11 STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE .align 4 LL(110): andi. I, M, 1 ble LL(119) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 0 * SIZE(B) LFD f11, 1 * SIZE(B) LFD f12, 2 * SIZE(B) LFD f13, 3 * SIZE(B) lfs f0, FZERO(SP) fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(115) .align 4 LL(112): FMADD f0, f8, f10, f0 FMADD f1, f8, f11, f1 FMADD f2, f9, f12, f2 FMADD f3, f9, f13, f3 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE bdnz LL(112) .align 4 LL(115): andi. r0, K, 1 lfs f13, ALPHA(SP) ble LL(118) .align 4 LL(116): FMADD f0, f8, f10, f0 FMADD f1, f8, f11, f1 LFD f8, 1 * SIZE(AO) LFD f10, 2 * SIZE(BO) LFD f11, 3 * SIZE(BO) addi AO, AO, 1 * SIZE addi BO, BO, 2 * SIZE .align 4 LL(118): LFD f8, 0 * SIZE(CO1) LFD f9, 0 * SIZE(CO2) FADD f0, f0, f2 FADD f1, f1, f3 FMADD f0, f0, f13, f8 FMADD f1, f1, f13, f9 STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) .align 4 LL(119): mr B, BO .align 4 LL(120): andi. r0, N, 1 ble LL(999) mr CO1, C mr AO, A srawi. I, M, 4 ble LL(140) .align 4 LL(130): vxor c01, c01, c01 vxor c02, c02, c02 vxor c03, c03, c03 vxor c04, c04, c04 mr BO, B dcbtst CO1, PREC mr J, K andi. r0, B, 15 ble+ LL(131) LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO LOAD_B b1, OFFSET_0, BO vspltw bp1, b1, 2 vspltw bp2, b1, 3 addi AO, AO, 16 * SIZE addi BO, BO, SIZE vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 subi J, J, 1 cmpwi cr0, J, 0 ble LL(138) LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO addi AO, AO, 16 * SIZE addi BO, BO, SIZE vmaddfp c01, a1, bp2, c01 vmaddfp c02, a2, bp2, c02 vmaddfp c03, a3, bp2, c03 vmaddfp c04, a4, bp2, c04 subi J, J, 1 cmpwi cr0, J, 0 ble LL(138) .align 4 LL(131): LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO LOAD_A a5, OFFSET_4, AO LOAD_A a6, OFFSET_5, AO LOAD_A a7, OFFSET_6, AO LOAD_A a8, OFFSET_7, AO LOAD_B b1, OFFSET_0, BO srawi. 
r0, J, 2 mtspr CTR, r0 ble LL(135) .align 4 LL(133): vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 vspltw bp2, b1, 1 vmaddfp c01, a5, bp2, c01 vmaddfp c02, a6, bp2, c02 vmaddfp c03, a7, bp2, c03 vmaddfp c04, a8, bp2, c04 addi AO, AO, 32 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO vspltw bp1, b1, 2 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 LOAD_A a5, OFFSET_4, AO LOAD_A a6, OFFSET_5, AO LOAD_A a7, OFFSET_6, AO LOAD_A a8, OFFSET_7, AO vspltw bp2, b1, 3 vmaddfp c01, a5, bp2, c01 vmaddfp c02, a6, bp2, c02 vmaddfp c03, a7, bp2, c03 vmaddfp c04, a8, bp2, c04 addi AO, AO, 32 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO LOAD_A a5, OFFSET_4, AO LOAD_A a6, OFFSET_5, AO LOAD_A a7, OFFSET_6, AO LOAD_A a8, OFFSET_7, AO LOAD_B b1, OFFSET_0, BO bdnz LL(133) .align 4 LL(135): andi. r0, J, 3 ble+ LL(138) cmpwi cr0, r0, 3 bne LL(136) vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 addi AO, AO, 16 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO vspltw bp2, b1, 1 vmaddfp c01, a1, bp2, c01 vmaddfp c02, a2, bp2, c02 vmaddfp c03, a3, bp2, c03 vmaddfp c04, a4, bp2, c04 addi AO, AO, 16 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO vspltw bp1, b1, 2 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 addi AO, AO, 16 * SIZE addi BO, BO, 3 * SIZE b LL(138) .align 4 LL(136): cmpwi cr0, r0, 2 bne LL(137) vspltw bp1, b1, 0 vspltw bp2, b1, 1 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 LOAD_A a1, OFFSET_4, AO LOAD_A a2, OFFSET_5, AO LOAD_A a3, OFFSET_6, AO LOAD_A a4, OFFSET_7, AO vmaddfp c01, a1, bp2, c01 vmaddfp c02, a2, bp2, c02 vmaddfp c03, a3, bp2, c03 vmaddfp c04, a4, bp2, c04 addi AO, AO, 32 * SIZE addi BO, BO, 2 * SIZE b LL(138) .align 4 LL(137): cmpwi cr0, r0, 1 bne LL(138) vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 addi AO, AO, 16 * SIZE addi BO, BO, 1 * SIZE .align 4 LL(138): lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvx C4, OFFSET_3, CO1 lvx C5, OFFSET_4, CO1 lvsr PERMRSHIFT1, 0, CO1 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, c03, PERMRSHIFT1 vperm c03, c03, c04, PERMRSHIFT1 vperm c04, c04, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 vmaddfp c02, alpha, c02, C3 vmaddfp c03, alpha, c03, C4 vmaddfp c04, alpha, c04, C5 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 stvx c03, OFFSET_3, CO1 stvx c04, OFFSET_4, CO1 addi CO1, CO1, 16 * SIZE addic. I, I, -1 bgt+ LL(130) .align 4 LL(140): andi. I, M, 8 ble LL(150) vxor c01, c01, c01 vxor c02, c02, c02 mr BO, B mr J, K andi. 
r0, B, 15 ble+ LL(141) LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_B b1, OFFSET_0, BO vspltw bp1, b1, 2 vspltw bp2, b1, 3 addi AO, AO, 8 * SIZE addi BO, BO, SIZE vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 subi J, J, 1 cmpwi cr0, J, 0 ble LL(148) LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO addi AO, AO, 8 * SIZE addi BO, BO, SIZE vmaddfp c01, a1, bp2, c01 vmaddfp c02, a2, bp2, c02 subi J, J, 1 cmpwi cr0, J, 0 ble LL(148) .align 4 LL(141): LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO LOAD_A a5, OFFSET_4, AO LOAD_A a6, OFFSET_5, AO LOAD_A a7, OFFSET_6, AO LOAD_A a8, OFFSET_7, AO LOAD_B b1, OFFSET_0, BO srawi. r0, J, 2 mtspr CTR, r0 ble LL(145) .align 4 LL(143): vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vspltw bp2, b1, 1 vmaddfp c01, a3, bp2, c01 vmaddfp c02, a4, bp2, c02 vspltw bp1, b1, 2 vmaddfp c01, a5, bp1, c01 vmaddfp c02, a6, bp1, c02 vspltw bp2, b1, 3 vmaddfp c01, a7, bp2, c01 vmaddfp c02, a8, bp2, c02 addi AO, AO, 32 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO LOAD_A a5, OFFSET_4, AO LOAD_A a6, OFFSET_5, AO LOAD_A a7, OFFSET_6, AO LOAD_A a8, OFFSET_7, AO LOAD_B b1, OFFSET_0, BO bdnz LL(143) .align 4 LL(145): andi. r0, J, 3 ble+ LL(148) cmpwi cr0, r0, 3 bne LL(146) vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vspltw bp2, b1, 1 vmaddfp c01, a3, bp2, c01 vmaddfp c02, a4, bp2, c02 LOAD_A a1, OFFSET_4, AO LOAD_A a2, OFFSET_5, AO vspltw bp1, b1, 2 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 addi AO, AO, 24 * SIZE addi BO, BO, 3 * SIZE b LL(148) .align 4 LL(146): cmpwi cr0, r0, 2 bne LL(147) vspltw bp1, b1, 0 vspltw bp2, b1, 1 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 vmaddfp c01, a3, bp2, c01 vmaddfp c02, a4, bp2, c02 addi AO, AO, 16 * SIZE addi BO, BO, 2 * SIZE b LL(148) .align 4 LL(147): cmpwi cr0, r0, 1 bne LL(148) vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vmaddfp c02, a2, bp1, c02 addi AO, AO, 8 * SIZE addi BO, BO, 1 * SIZE .align 4 LL(148): lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvsr PERMRSHIFT1, 0, CO1 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 vmaddfp c02, alpha, c02, C3 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 addi CO1, CO1, 8 * SIZE .align 4 LL(150): andi. I, M, 4 ble LL(160) vxor c01, c01, c01 mr BO, B mr J, K andi. r0, B, 15 ble+ LL(151) LOAD_A a1, OFFSET_0, AO LOAD_B b1, OFFSET_0, BO vspltw bp1, b1, 2 vspltw bp2, b1, 3 addi AO, AO, 4 * SIZE addi BO, BO, SIZE vmaddfp c01, a1, bp1, c01 subi J, J, 1 cmpwi cr0, J, 0 ble LL(158) LOAD_A a1, OFFSET_0, AO addi AO, AO, 4 * SIZE addi BO, BO, SIZE vmaddfp c01, a1, bp2, c01 subi J, J, 1 cmpwi cr0, J, 0 ble LL(158) .align 4 LL(151): LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO LOAD_B b1, OFFSET_0, BO srawi. r0, J, 2 mtspr CTR, r0 ble LL(155) .align 4 LL(153): vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c01, a2, bp2, c01 vspltw bp1, b1, 2 vmaddfp c01, a3, bp1, c01 vspltw bp2, b1, 3 vmaddfp c01, a4, bp2, c01 addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO LOAD_B b1, OFFSET_0, BO bdnz LL(153) .align 4 LL(155): andi. 
r0, J, 3 ble+ LL(158) cmpwi cr0, r0, 3 bne LL(156) vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c01, a2, bp2, c01 vspltw bp1, b1, 2 vmaddfp c01, a3, bp1, c01 addi AO, AO, 12 * SIZE addi BO, BO, 3 * SIZE b LL(158) .align 4 LL(156): cmpwi cr0, r0, 2 bne LL(157) vspltw bp1, b1, 0 vspltw bp2, b1, 1 vmaddfp c01, a1, bp1, c01 vmaddfp c01, a2, bp2, c01 addi AO, AO, 8 * SIZE addi BO, BO, 2 * SIZE b LL(158) .align 4 LL(157): cmpwi cr0, r0, 1 bne LL(158) vspltw bp1, b1, 0 vmaddfp c01, a1, bp1, c01 addi AO, AO, 4 * SIZE addi BO, BO, 1 * SIZE .align 4 LL(158): lvx alpha, OFFSET_0, SP vxor VZERO, VZERO, VZERO lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvsr PERMRSHIFT1, 0, CO1 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, VZERO, PERMRSHIFT1 vmaddfp c00, alpha, c00, C1 vmaddfp c01, alpha, c01, C2 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 addi CO1, CO1, 4 * SIZE .align 4 LL(160): andi. I, M, 2 ble LL(170) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 2 * SIZE(AO) LFD f11, 3 * SIZE(AO) LFD f12, 0 * SIZE(B) LFD f13, 1 * SIZE(B) lfs f0, FZERO(SP) fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(165) .align 4 LL(162): FMADD f0, f8, f12, f0 FMADD f1, f9, f12, f1 FMADD f2, f10, f13, f2 FMADD f3, f11, f13, f3 LFD f8, 4 * SIZE(AO) LFD f9, 5 * SIZE(AO) LFD f10, 6 * SIZE(AO) LFD f11, 7 * SIZE(AO) LFD f12, 2 * SIZE(BO) LFD f13, 3 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 2 * SIZE bdnz LL(162) .align 4 LL(165): andi. r0, K, 1 lfs f13, ALPHA(SP) ble LL(168) .align 4 LL(166): FMADD f0, f8, f12, f0 FMADD f1, f9, f12, f1 addi AO, AO, 2 * SIZE addi BO, BO, 1 * SIZE .align 4 LL(168): LFD f8, 0 * SIZE(CO1) LFD f9, 1 * SIZE(CO1) FADD f0, f0, f2 FADD f1, f1, f3 FMADD f0, f0, f13, f8 FMADD f1, f1, f13, f9 STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) addi CO1, CO1, 2 * SIZE .align 4 LL(170): andi. I, M, 1 ble LL(999) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 0 * SIZE(B) LFD f11, 1 * SIZE(B) lfs f0, FZERO(SP) fmr f1, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(175) .align 4 LL(172): FMADD f0, f8, f10, f0 FMADD f1, f9, f11, f1 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) LFD f10, 2 * SIZE(BO) LFD f11, 3 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 2 * SIZE bdnz LL(172) .align 4 LL(175): andi. 
r0, K, 1 lfs f13, ALPHA(SP) ble LL(178) .align 4 LL(176): FMADD f0, f8, f10, f0 addi AO, AO, 1 * SIZE addi BO, BO, 1 * SIZE .align 4 LL(178): LFD f8, 0 * SIZE(CO1) FADD f0, f0, f1 FMADD f0, f0, f13, f8 STFD f0, 0 * SIZE(CO1) .align 4 LL(999): mr SP, STACK li r0, 0 * 16 lvx v20, SP, r0 li r0, 1 * 16 lvx v21, SP, r0 li r0, 2 * 16 lvx v22, SP, r0 li r0, 3 * 16 lvx v23, SP, r0 li r0, 4 * 16 lvx v24, SP, r0 li r0, 5 * 16 lvx v25, SP, r0 li r0, 6 * 16 lvx v26, SP, r0 li r0, 7 * 16 lvx v27, SP, r0 li r0, 8 * 16 lvx v28, SP, r0 li r0, 9 * 16 lvx v29, SP, r0 li r0, 10 * 16 lvx v30, SP, r0 li r0, 11 * 16 lvx v31, SP, r0 mtspr VRsave, VREG #ifdef __64BIT__ ld r31, 192(SP) ld r30, 200(SP) ld r29, 208(SP) ld r28, 216(SP) ld r27, 224(SP) ld r26, 232(SP) ld r25, 240(SP) ld r24, 248(SP) ld r23, 256(SP) ld r22, 264(SP) ld r21, 272(SP) ld r20, 280(SP) ld r19, 288(SP) ld r18, 296(SP) ld r17, 304(SP) ld r16, 312(SP) ld r15, 320(SP) ld r14, 328(SP) #else lwz r31, 192(SP) lwz r30, 196(SP) lwz r29, 200(SP) lwz r28, 204(SP) lwz r27, 208(SP) lwz r26, 212(SP) lwz r25, 216(SP) lwz r24, 220(SP) lwz r23, 224(SP) lwz r22, 228(SP) lwz r21, 232(SP) lwz r20, 236(SP) lwz r19, 240(SP) lwz r18, 244(SP) lwz r17, 248(SP) lwz r16, 252(SP) lwz r15, 256(SP) lwz r14, 260(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/gemm_kernel_cell.S000066400000000000000000001332151313527062700211750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define AORIG r18 #define TEMP r19 #define KK r20 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define CO3 r27 #define CO4 r28 #define PREA r29 #define PREB r30 #define PREC r31 #ifndef NEEDPARAM #ifndef DOUBLE #include "../sparam.h" #else #include "../dparam.h" #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) #if defined(TRMMKERNEL) std r19, 240(SP) std r18, 248(SP) #endif #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) #if defined(TRMMKERNEL) stw r19, 192(SP) stw r18, 196(SP) #endif #endif stfd f1, ALPHA stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) #ifndef PREFETCHTEST /* Normal prefetch */ #ifdef CELL li PREC, 4 * SIZE #endif #ifdef linux #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) lwz PREC, FRAMESLOT(1) + STACKSIZE(SP) #else ld PREA, FRAMESLOT(0) + STACKSIZE(SP) ld PREB, FRAMESLOT(1) + STACKSIZE(SP) ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ xc ld PREA, FRAMESLOT(0) + STACKSIZE(SP) ld PREB, FRAMESLOT(1) + STACKSIZE(SP) ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #else #ifdef DOUBLE lwz PREA, FRAMESLOT(1) + STACKSIZE(SP) lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else lwz PREA, FRAMESLOT(0) + STACKSIZE(SP) lwz PREB, FRAMESLOT(1) + STACKSIZE(SP) lwz PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #endif #endif 
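/* Kernel layout (descriptive comment; structure taken from the code below):
   - In the plain GEMM build (TRMMKERNEL undefined) the kernel computes
     C := alpha * A * B + C, loading and accumulating into C at LL(18)/LL(28)/
     LL(38) and their 2- and 1-column counterparts.  In the TRMM build
     (TRMMKERNEL defined) C is overwritten via the FMUL path and the KK/OFFSET
     logic limits the inner-product length.
   - The J loop starting at LL(10) walks N in blocks of 4 columns; the
     remaining 2- and 1-column cases are handled at LL(40) and LL(70).
   - Inside each column block, the I loop at LL(11) walks M in blocks of
     4 rows (2- and 1-row remainders at LL(20) and LL(30)), accumulating a
     4x4 tile in f0..f15.  The K loop at LL(12) is unrolled by 4, with the
     K mod 4 remainder handled at LL(16). */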
#ifndef PREFETCHTEST li PREC, 3 * SIZE li PREA, 16 * 12 * SIZE li PREB, 16 * 12 * SIZE #endif srawi. J, N, 2 ble LL(40) .align 4 LL(10): mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. I, M, 2 mr AO, A add C, CO4, LDC ble LL(20) .align 4 LL(11): #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f28, 4 * SIZE(B) LFD f29, 5 * SIZE(B) LFD f30, 6 * SIZE(B) mr BO, B #else slwi r0, KK, 2 + BASE_SHIFT add AO, AO, r0 add BO, B, r0 LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) #endif dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 4 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP ble LL(15) #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f28, 4 * SIZE(B) LFD f29, 5 * SIZE(B) LFD f30, 6 * SIZE(B) dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC srawi. 
r0, K, 2 mtspr CTR, r0 mr BO, B ble LL(15) #endif .align 4 #define NOP1 mr r18, r18 #define NOP2 mr r19, r19 LL(12): FMADD f0, f16, f20, f0 dcbt AO, PREA FMADD f4, f16, f21, f4 dcbt BO, PREB FMADD f8, f16, f22, f8 LFD f31, 7 * SIZE(BO) FMADD f12, f16, f23, f12 LFD f27, 7 * SIZE(AO) FMADD f1, f17, f20, f1 LFD f16, 8 * SIZE(AO) FMADD f5, f17, f21, f5 NOP2 FMADD f9, f17, f22, f9 NOP1 FMADD f13, f17, f23, f13 LFD f17, 9 * SIZE(AO) FMADD f2, f18, f20, f2 NOP1 FMADD f6, f18, f21, f6 NOP2 FMADD f10, f18, f22, f10 NOP1 FMADD f14, f18, f23, f14 LFD f18, 10 * SIZE(AO) FMADD f3, f19, f20, f3 LFD f20, 8 * SIZE(BO) FMADD f7, f19, f21, f7 LFD f21, 9 * SIZE(BO) FMADD f11, f19, f22, f11 LFD f22, 10 * SIZE(BO) FMADD f15, f19, f23, f15 LFD f19, 11 * SIZE(AO) FMADD f0, f24, f28, f0 LFD f23, 11 * SIZE(BO) FMADD f4, f24, f29, f4 NOP2 FMADD f8, f24, f30, f8 NOP1 FMADD f12, f24, f31, f12 LFD f24, 12 * SIZE(AO) FMADD f1, f25, f28, f1 NOP1 FMADD f5, f25, f29, f5 NOP2 FMADD f9, f25, f30, f9 NOP1 FMADD f13, f25, f31, f13 LFD f25, 13 * SIZE(AO) FMADD f2, f26, f28, f2 NOP1 FMADD f6, f26, f29, f6 NOP2 FMADD f10, f26, f30, f10 NOP1 FMADD f14, f26, f31, f14 LFD f26, 14 * SIZE(AO) FMADD f3, f27, f28, f3 LFD f28, 12 * SIZE(BO) FMADD f7, f27, f29, f7 LFD f29, 13 * SIZE(BO) FMADD f11, f27, f30, f11 LFD f30, 14 * SIZE(BO) FMADD f15, f27, f31, f15 LFD f27, 15 * SIZE(AO) FMADD f0, f16, f20, f0 LFD f31, 15 * SIZE(BO) FMADD f4, f16, f21, f4 NOP2 FMADD f8, f16, f22, f8 NOP1 FMADD f12, f16, f23, f12 LFD f16, 16 * SIZE(AO) FMADD f1, f17, f20, f1 NOP1 FMADD f5, f17, f21, f5 NOP2 FMADD f9, f17, f22, f9 NOP1 FMADD f13, f17, f23, f13 LFD f17, 17 * SIZE(AO) FMADD f2, f18, f20, f2 NOP1 FMADD f6, f18, f21, f6 NOP2 FMADD f10, f18, f22, f10 NOP1 FMADD f14, f18, f23, f14 LFD f18, 18 * SIZE(AO) FMADD f3, f19, f20, f3 LFD f20, 16 * SIZE(BO) FMADD f7, f19, f21, f7 LFD f21, 17 * SIZE(BO) FMADD f11, f19, f22, f11 LFD f22, 18 * SIZE(BO) FMADD f15, f19, f23, f15 LFD f19, 19 * SIZE(AO) FMADD f0, f24, f28, f0 LFD f23, 19 * SIZE(BO) FMADD f4, f24, f29, f4 NOP2 FMADD f8, f24, f30, f8 NOP1 FMADD f12, f24, f31, f12 LFD f24, 20 * SIZE(AO) FMADD f1, f25, f28, f1 NOP1 FMADD f5, f25, f29, f5 NOP2 FMADD f9, f25, f30, f9 NOP1 FMADD f13, f25, f31, f13 LFD f25, 21 * SIZE(AO) FMADD f2, f26, f28, f2 NOP1 FMADD f6, f26, f29, f6 NOP2 FMADD f10, f26, f30, f10 NOP1 FMADD f14, f26, f31, f14 LFD f26, 22 * SIZE(AO) FMADD f3, f27, f28, f3 LFD f28, 20 * SIZE(BO) FMADD f7, f27, f29, f7 LFD f29, 21 * SIZE(BO) FMADD f11, f27, f30, f11 LFD f30, 22 * SIZE(BO) FMADD f15, f27, f31, f15 addi AO, AO, 16 * SIZE addi BO, BO, 16 * SIZE bdnz LL(12) .align 4 LL(15): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. 
r0, K, 3 mtspr CTR, r0 #endif ble+ LL(18) .align 4 LL(16): FMADD f0, f16, f20, f0 FMADD f5, f17, f21, f5 FMADD f10, f18, f22, f10 FMADD f15, f19, f23, f15 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f11, f19, f22, f11 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 FMADD f14, f18, f23, f14 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(16) .align 4 LL(18): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f2, f2, f30, f18 FMADD f3, f3, f30, f19 FMADD f4, f4, f30, f20 FMADD f5, f5, f30, f21 FMADD f6, f6, f30, f22 FMADD f7, f7, f30, f23 LFD f16, 0 * SIZE(CO3) LFD f17, 1 * SIZE(CO3) LFD f18, 2 * SIZE(CO3) LFD f19, 3 * SIZE(CO3) LFD f20, 0 * SIZE(CO4) LFD f21, 1 * SIZE(CO4) LFD f22, 2 * SIZE(CO4) LFD f23, 3 * SIZE(CO4) FMADD f8, f8, f30, f16 FMADD f9, f9, f30, f17 FMADD f10, f10, f30, f18 FMADD f11, f11, f30, f19 FMADD f12, f12, f30, f20 FMADD f13, f13, f30, f21 FMADD f14, f14, f30, f22 FMADD f15, f15, f30, f23 #else FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 FMUL f3, f3, f30 FMUL f4, f4, f30 FMUL f5, f5, f30 FMUL f6, f6, f30 FMUL f7, f7, f30 FMUL f8, f8, f30 FMUL f9, f9, f30 FMUL f10, f10, f30 FMUL f11, f11, f30 FMUL f12, f12, f30 FMUL f13, f13, f30 FMUL f14, f14, f30 FMUL f15, f15, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f10, 2 * SIZE(CO3) STFD f11, 3 * SIZE(CO3) fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) STFD f14, 2 * SIZE(CO4) STFD f15, 3 * SIZE(CO4) fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -4 #endif slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif addic. I, I, -1 bgt+ LL(11) .align 4 LL(20): andi. 
I, M, 2 ble LL(30) #if defined(TRMMKERNEL) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 4 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble LL(25) .align 5 LL(22): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 16 * SIZE dcbt 0, BO, PREB bdnz LL(22) fadd f0, f2, f0 fadd f1, f3, f1 fadd f4, f6, f4 fadd f5, f7, f5 fadd f8, f10, f8 fadd f9, f11, f9 fadd f12, f14, f12 fadd f13, f15, f13 .align 4 LL(25): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. 
r0, K, 3 mtspr CTR, r0 #endif ble+ LL(28) .align 4 LL(26): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 2 * SIZE bdnz LL(26) .align 4 LL(28): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f4, f4, f30, f18 FMADD f5, f5, f30, f19 LFD f20, 0 * SIZE(CO3) LFD f21, 1 * SIZE(CO3) LFD f22, 0 * SIZE(CO4) LFD f23, 1 * SIZE(CO4) FMADD f8, f8, f30, f20 FMADD f9, f9, f30, f21 FMADD f12, f12, f30, f22 FMADD f13, f13, f30, f23 #else FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f4, f4, f30 FMUL f5, f5, f30 FMUL f8, f8, f30 FMUL f9, f9, f30 FMUL f12, f12, f30 FMUL f13, f13, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -4 #endif slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 LL(30): andi. I, M, 1 ble LL(39) #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 4 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. 
r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble LL(35) .align 5 LL(32): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f1, f17, f24, f1 FMADD f5, f17, f25, f5 FMADD f9, f17, f26, f9 FMADD f13, f17, f27, f13 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f0, f18, f20, f0 FMADD f4, f18, f21, f4 FMADD f8, f18, f22, f8 FMADD f12, f18, f23, f12 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f1, f19, f24, f1 FMADD f5, f19, f25, f5 FMADD f9, f19, f26, f9 FMADD f13, f19, f27, f13 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 16 * SIZE dcbt 0, BO, PREB bdnz LL(32) fadd f0, f1, f0 fadd f4, f5, f4 fadd f8, f9, f8 fadd f12, f13, f12 .align 4 LL(35): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ LL(38) .align 4 LL(36): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f16, 1 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 1 * SIZE bdnz LL(36) .align 4 LL(38): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f20, 0 * SIZE(CO3) LFD f22, 0 * SIZE(CO4) FMADD f0, f0, f30, f16 FMADD f4, f4, f30, f18 FMADD f8, f8, f30, f20 FMADD f12, f12, f30, f22 #else FMUL f0, f0, f30 FMUL f4, f4, f30 FMUL f8, f8, f30 FMUL f12, f12, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 fmr f8, f0 fmr f9, f0 fmr f12, f0 fmr f13, f0 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -4 #endif slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 LL(39): #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 4 #endif mr B, BO addic. J, J, -1 bgt LL(10) .align 4 LL(40): mr CO1, C add CO2, C, LDC andi. J, N, 2 ble LL(70) #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
I, M, 2 add C, CO2, LDC mr AO, A ble LL(50) .align 4 LL(41): #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif dcbt CO1, PREC dcbt CO2, PREC #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 2 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC dcbt CO2, PREC srawi. r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble LL(45) .align 5 LL(42): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE dcbt 0, BO, PREB bdnz LL(42) .align 4 LL(45): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. 
r0, K, 3 mtspr CTR, r0 #endif ble+ LL(48) .align 4 LL(46): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(46) .align 4 LL(48): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f2, f2, f30, f18 FMADD f3, f3, f30, f19 FMADD f4, f4, f30, f20 FMADD f5, f5, f30, f21 FMADD f6, f6, f30, f22 FMADD f7, f7, f30, f23 #else FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 FMUL f3, f3, f30 FMUL f4, f4, f30 FMUL f5, f5, f30 FMUL f6, f6, f30 FMUL f7, f7, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif addic. I, I, -1 bgt+ LL(41) .align 4 LL(50): andi. I, M, 2 ble LL(60) #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. 
r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble LL(55) .align 5 LL(52): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 FMADD f4, f18, f22, f4 FMADD f5, f19, f22, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f16, f24, f0 FMADD f1, f17, f24, f1 FMADD f2, f16, f25, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f26, f4 FMADD f5, f19, f26, f5 FMADD f6, f18, f27, f6 FMADD f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE dcbt 0, BO, PREB bdnz LL(52) .align 4 LL(55): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ LL(58) .align 4 LL(56): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 2 * SIZE bdnz LL(56) .align 4 LL(58): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f2, f2, f30, f18 FMADD f3, f3, f30, f19 #else FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 FMUL f3, f3, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 LL(60): andi. I, M, 1 ble LL(69) #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif srawi. 
TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble LL(65) .align 5 LL(62): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f17, f22, f2 FMADD f3, f17, f23, f3 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f19, f26, f2 FMADD f3, f19, f27, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(62) .align 4 LL(65): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ LL(68) .align 4 LL(66): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 LFD f16, 1 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 1 * SIZE bdnz LL(66) .align 4 LL(68): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) FADD f0, f2, f0 FADD f1, f3, f1 FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f18 #else FADD f0, f2, f0 FADD f1, f3, f1 FMUL f0, f0, f30 FMUL f1, f1, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 1 #endif #endif .align 4 LL(69): #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 2 #endif mr B, BO .align 4 LL(70): mr CO1, C andi. J, N, 1 ble LL(999) #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. I, M, 2 mr AO, A ble LL(80) .align 4 LL(71): #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif dcbt CO1, PREC #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 1 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC srawi. 
r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble LL(75) .align 5 LL(72): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f21, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f23, f0 FMADD f1, f17, f23, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE dcbt 0, BO, PREB bdnz LL(72) .align 4 LL(75): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ LL(78) .align 4 LL(76): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 4 * SIZE bdnz LL(76) .align 4 LL(78): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f2, f2, f30, f18 FMADD f3, f3, f30, f19 #else FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 FMUL f3, f3, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -1 #endif slwi r0 , TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif addi CO1, CO1, 4 * SIZE addic. I, I, -1 bgt+ LL(71) .align 4 LL(80): andi. I, M, 2 ble LL(90) #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. 
r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble LL(85) .align 5 LL(82): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 4 * SIZE dcbt 0, BO, PREB bdnz LL(82) .align 4 LL(85): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ LL(88) .align 4 LL(86): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 2 * SIZE bdnz LL(86) .align 4 LL(88): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) FADD f0, f2, f0 FADD f1, f3, f1 FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 #else FADD f0, f2, f0 FADD f1, f3, f1 FMUL f0, f0, f30 FMUL f1, f1, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 addi CO1, CO1, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -1 #endif slwi r0 , TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 LL(90): andi. I, M, 1 ble LL(999) #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif srawi. TEMP, TEMP, 3 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. 
r0, K, 3 mtspr CTR, r0 mr BO, B #endif ble LL(95) .align 5 LL(92): FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(92) .align 4 LL(95): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP #else andi. r0, K, 7 mtspr CTR, r0 #endif ble+ LL(98) .align 4 LL(96): FMADD f0, f16, f20, f0 LFD f16, 1 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 1 * SIZE bdnz LL(96) .align 4 LL(98): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 FMADD f0, f0, f30, f16 #else FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 FMUL f0, f0, f30 #endif STFD f0, 0 * SIZE(CO1) .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) #if defined(TRMMKERNEL) || defined(TRSMKERNEL) ld r19, 240(SP) ld r18, 248(SP) #endif #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) #if defined(TRMMKERNEL) || defined(TRSMKERNEL) lwz r19, 192(SP) lwz r18, 196(SP) #endif #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/gemm_kernel_g4.S000066400000000000000000001233431313527062700205710ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define AORIG r18 #define TEMP r19 #define KK r20 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define CO3 r27 #define CO4 r28 #define PREA r29 #define PREC r30 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) #if defined(TRMMKERNEL) std r19, 240(SP) std r18, 248(SP) #endif #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) #if defined(TRMMKERNEL) stw r19, 192(SP) stw r18, 196(SP) #endif #endif stfd f1, ALPHA stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif li PREA, 8 * 8 * SIZE li PREC, 3 * SIZE cmpwi cr0, M, 0 ble .L999 cmpwi cr0, N, 0 ble .L999 cmpwi cr0, K, 0 ble .L999 srawi. 
J, N, 2 ble .L40 .align 4 #define A1 f16 #define A2 f17 #define A3 f18 #define A4 f19 #define A5 f20 #define A6 f21 #define B1 f22 #define B2 f23 #define B3 f24 #define B4 f25 #define B5 f26 #define B6 f27 #define B7 f28 #define B8 f29 #define B9 f30 #define B10 f31 .L10: mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. I, M, 2 mr AO, A add C, CO4, LDC ble .L20 .align 4 .L11: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD A1, 0 * SIZE(AO) LFD A2, 1 * SIZE(AO) LFD A3, 2 * SIZE(AO) LFDU A5, 4 * SIZE(AO) LFD B1, 0 * SIZE(B) LFD B2, 1 * SIZE(B) LFD B3, 2 * SIZE(B) LFD B4, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 2 + BASE_SHIFT add AO, AO, r0 add BO, B, r0 LFD A1, 0 * SIZE(AO) LFD A2, 1 * SIZE(AO) LFD A3, 2 * SIZE(AO) LFDU A5, 4 * SIZE(AO) LFD B1, 0 * SIZE(BO) LFD B2, 1 * SIZE(BO) LFD B3, 2 * SIZE(BO) LFD B4, 3 * SIZE(BO) #endif dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 4 #endif srawi. TEMP, TEMP, 1 mtspr CTR, TEMP ble .L15 #else LFD A1, 0 * SIZE(AO) LFD A2, 1 * SIZE(AO) LFD A3, 2 * SIZE(AO) LFDU A5, 4 * SIZE(AO) LFD B1, 0 * SIZE(B) LFD B2, 1 * SIZE(B) LFD B3, 2 * SIZE(B) LFD B4, 3 * SIZE(B) dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC srawi. r0, K, 1 mtspr CTR, r0 mr BO, B ble .L15 #endif .align 4 .L12: FMADD f0, A1, B1, f0 LFDU B5, 4 * SIZE(BO) FMADD f4, A1, B2, f4 dcbt AO, PREA FMADD f8, A1, B3, f8 LFD A4, -1 * SIZE(AO) FMADD f12, A1, B4, f12 dcbt BO, PREA FMADD f1, A2, B1, f1 LFD B6, 1 * SIZE(BO) FMADD f5, A2, B2, f5 nop FMADD f9, A2, B3, f9 LFDU A1, 4 * SIZE(AO) FMADD f13, A2, B4, f13 nop FMADD f2, A3, B1, f2 LFD B7, 2 * SIZE(BO) FMADD f6, A3, B2, f6 nop FMADD f10, A3, B3, f10 LFD A2, -3 * SIZE(AO) FMADD f14, A3, B4, f14 nop FMADD f3, A4, B1, f3 LFD B8, 3 * SIZE(BO) FMADD f7, A4, B2, f7 nop FMADD f11, A4, B3, f11 LFD A3, -2 * SIZE(AO) FMADD f15, A4, B4, f15 nop FMADD f0, A5, B5, f0 LFDU B1, 4 * SIZE(BO) FMADD f4, A5, B6, f4 nop FMADD f8, A5, B7, f8 LFD A4, -1 * SIZE(AO) FMADD f12, A5, B8, f12 #ifdef DOUBLE dcbt BO, PREA #else nop #endif FMADD f1, A2, B5, f1 LFD B2, 1 * SIZE(BO) FMADD f5, A2, B6, f5 nop FMADD f9, A2, B7, f9 LFDU A5, 4 * SIZE(AO) FMADD f13, A2, B8, f13 #ifdef DOUBLE dcbt AO, PREA #else nop #endif FMADD f2, A3, B5, f2 LFD B3, 2 * SIZE(BO) FMADD f6, A3, B6, f6 nop FMADD f10, A3, B7, f10 LFD A2, -3 * SIZE(AO) FMADD f14, A3, B8, f14 nop FMADD f3, A4, B5, f3 LFD B4, 3 * SIZE(BO) FMADD f7, A4, B6, f7 nop FMADD f11, A4, B7, f11 LFD A3, -2 * SIZE(AO) FMADD f15, A4, B8, f15 bdnz .L12 .align 4 .L15: addi AO, AO, -4 * SIZE lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 1 #else andi. 
r0, K, 1 #endif ble+ .L18 .L16: LFD A4, 3 * SIZE(AO) FMADD f0, A1, B1, f0 FMADD f4, A1, B2, f4 FMADD f8, A1, B3, f8 FMADD f12, A1, B4, f12 FMADD f1, A2, B1, f1 FMADD f5, A2, B2, f5 FMADD f9, A2, B3, f9 FMADD f13, A2, B4, f13 FMADD f2, A3, B1, f2 FMADD f6, A3, B2, f6 FMADD f10, A3, B3, f10 FMADD f14, A3, B4, f14 FMADD f3, A4, B1, f3 FMADD f7, A4, B2, f7 FMADD f11, A4, B3, f11 FMADD f15, A4, B4, f15 addi AO, AO, 4 * SIZE addi BO, BO, 4 * SIZE .align 4 .L18: #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) FMADD f0, f0, f30, f16 LFD f16, 0 * SIZE(CO3) FMADD f1, f1, f30, f17 LFD f17, 1 * SIZE(CO3) FMADD f2, f2, f30, f18 LFD f18, 2 * SIZE(CO3) FMADD f3, f3, f30, f19 LFD f19, 3 * SIZE(CO3) FMADD f4, f4, f30, f20 LFD f20, 0 * SIZE(CO4) FMADD f5, f5, f30, f21 LFD f21, 1 * SIZE(CO4) FMADD f6, f6, f30, f22 LFD f22, 2 * SIZE(CO4) FMADD f7, f7, f30, f23 LFD f23, 3 * SIZE(CO4) FMADD f8, f8, f30, f16 FMADD f9, f9, f30, f17 FMADD f10, f10, f30, f18 FMADD f11, f11, f30, f19 FMADD f12, f12, f30, f20 FMADD f13, f13, f30, f21 FMADD f14, f14, f30, f22 FMADD f15, f15, f30, f23 #else FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 FMUL f3, f3, f30 FMUL f4, f4, f30 FMUL f5, f5, f30 FMUL f6, f6, f30 FMUL f7, f7, f30 FMUL f8, f8, f30 FMUL f9, f9, f30 FMUL f10, f10, f30 FMUL f11, f11, f30 FMUL f12, f12, f30 FMUL f13, f13, f30 FMUL f14, f14, f30 FMUL f15, f15, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f4, 0 * SIZE(CO2) fmr f4, f0 STFD f5, 1 * SIZE(CO2) fmr f5, f0 STFD f6, 2 * SIZE(CO2) fmr f6, f0 STFD f7, 3 * SIZE(CO2) fmr f7, f0 STFD f8, 0 * SIZE(CO3) fmr f8, f0 STFD f9, 1 * SIZE(CO3) fmr f9, f0 STFD f10, 2 * SIZE(CO3) fmr f10, f0 STFD f11, 3 * SIZE(CO3) fmr f11, f0 STFD f12, 0 * SIZE(CO4) fmr f12, f0 STFD f13, 1 * SIZE(CO4) fmr f13, f0 STFD f14, 2 * SIZE(CO4) fmr f14, f0 STFD f15, 3 * SIZE(CO4) fmr f15, f0 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -4 #endif slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif addic. I, I, -1 bgt+ .L11 .align 4 .L20: andi. I, M, 2 ble .L30 #if defined(TRMMKERNEL) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 4 #endif srawi. 
TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble .L25 .align 5 .L22: FMADD f0, f16, f20, f0 nop FMADD f1, f17, f20, f1 LFD f20, 8 * SIZE(BO) FMADD f4, f16, f21, f4 nop FMADD f5, f17, f21, f5 LFD f21, 9 * SIZE(BO) FMADD f8, f16, f22, f8 nop FMADD f9, f17, f22, f9 LFD f22, 10 * SIZE(BO) FMADD f12, f16, f23, f12 LFD f16, 4 * SIZE(AO) FMADD f13, f17, f23, f13 LFD f23, 11 * SIZE(BO) FMADD f2, f18, f24, f2 LFD f17, 5 * SIZE(AO) FMADD f3, f19, f24, f3 LFD f24, 12 * SIZE(BO) FMADD f6, f18, f25, f6 nop FMADD f7, f19, f25, f7 LFD f25, 13 * SIZE(BO) FMADD f10, f18, f26, f10 nop FMADD f11, f19, f26, f11 LFD f26, 14 * SIZE(BO) FMADD f14, f18, f27, f14 LFD f18, 6 * SIZE(AO) FMADD f15, f19, f27, f15 LFD f27, 15 * SIZE(BO) FMADD f0, f16, f20, f0 LFD f19, 7 * SIZE(AO) FMADD f1, f17, f20, f1 LFDU f20, 16 * SIZE(BO) FMADD f4, f16, f21, f4 nop FMADD f5, f17, f21, f5 LFD f21, 1 * SIZE(BO) FMADD f8, f16, f22, f8 nop FMADD f9, f17, f22, f9 LFD f22, 2 * SIZE(BO) FMADD f12, f16, f23, f12 LFDU f16, 8 * SIZE(AO) FMADD f13, f17, f23, f13 LFD f23, 3 * SIZE(BO) FMADD f2, f18, f24, f2 LFD f17, 1 * SIZE(AO) FMADD f3, f19, f24, f3 LFD f24, 4 * SIZE(BO) FMADD f6, f18, f25, f6 nop FMADD f7, f19, f25, f7 LFD f25, 5 * SIZE(BO) FMADD f10, f18, f26, f10 nop FMADD f11, f19, f26, f11 LFD f26, 6 * SIZE(BO) FMADD f14, f18, f27, f14 LFD f18, 2 * SIZE(AO) FMADD f15, f19, f27, f15 LFD f19, 3 * SIZE(AO) LFD f27, 7 * SIZE(BO) bdnz .L22 fadd f0, f2, f0 fadd f1, f3, f1 fadd f4, f6, f4 fadd f5, f7, f5 fadd f8, f10, f8 fadd f9, f11, f9 fadd f12, f14, f12 fadd f13, f15, f13 .align 4 .L25: lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. 
r0, K, 3 mtspr CTR, r0 #endif ble+ .L28 .align 4 .L26: FMADD f0, f16, f20, f0 nop FMADD f1, f17, f20, f1 LFDU f20, 4 * SIZE(BO) FMADD f4, f16, f21, f4 nop FMADD f5, f17, f21, f5 LFD f21, 1 * SIZE(BO) FMADD f8, f16, f22, f8 nop FMADD f9, f17, f22, f9 LFD f22, 2 * SIZE(BO) FMADD f12, f16, f23, f12 LFDU f16, 2 * SIZE(AO) FMADD f13, f17, f23, f13 LFD f17, 1 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L26 .align 4 .L28: #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f4, f4, f30, f18 FMADD f5, f5, f30, f19 LFD f20, 0 * SIZE(CO3) LFD f21, 1 * SIZE(CO3) LFD f22, 0 * SIZE(CO4) LFD f23, 1 * SIZE(CO4) FMADD f8, f8, f30, f20 FMADD f9, f9, f30, f21 FMADD f12, f12, f30, f22 FMADD f13, f13, f30, f23 #else FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f4, f4, f30 FMUL f5, f5, f30 FMUL f8, f8, f30 FMUL f9, f9, f30 FMUL f12, f12, f30 FMUL f13, f13, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -4 #endif slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 .L30: andi. I, M, 1 ble .L39 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 4 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. 
r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble .L35 .align 5 .L32: FMADD f0, f16, f20, f0 LFD f20, 8 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f21, 9 * SIZE(BO) FMADD f8, f16, f22, f8 LFD f22, 10 * SIZE(BO) FMADD f12, f16, f23, f12 LFD f23, 11 * SIZE(BO) LFDU f16, 4 * SIZE(AO) FMADD f1, f17, f24, f1 LFD f24, 12 * SIZE(BO) FMADD f5, f17, f25, f5 LFD f25, 13 * SIZE(BO) FMADD f9, f17, f26, f9 LFD f26, 14 * SIZE(BO) FMADD f13, f17, f27, f13 LFD f27, 15 * SIZE(BO) LFD f17, 1 * SIZE(AO) FMADD f0, f18, f20, f0 LFDU f20, 16 * SIZE(BO) FMADD f4, f18, f21, f4 LFD f21, 1 * SIZE(BO) FMADD f8, f18, f22, f8 LFD f22, 2 * SIZE(BO) FMADD f12, f18, f23, f12 LFD f23, 3 * SIZE(BO) LFD f18, 2 * SIZE(AO) FMADD f1, f19, f24, f1 LFD f24, 4 * SIZE(BO) FMADD f5, f19, f25, f5 LFD f25, 5 * SIZE(BO) FMADD f9, f19, f26, f9 LFD f26, 6 * SIZE(BO) FMADD f13, f19, f27, f13 LFD f27, 7 * SIZE(BO) LFD f19, 3 * SIZE(AO) bdnz .L32 fadd f0, f1, f0 fadd f4, f5, f4 fadd f8, f9, f8 fadd f12, f13, f12 .align 4 .L35: lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L38 .align 4 .L36: FMADD f0, f16, f20, f0 LFDU f20, 4 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f21, 1 * SIZE(BO) FMADD f8, f16, f22, f8 LFD f22, 2 * SIZE(BO) FMADD f12, f16, f23, f12 LFDU f16, 1 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L36 .align 4 .L38: #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f20, 0 * SIZE(CO3) LFD f22, 0 * SIZE(CO4) FMADD f0, f0, f30, f16 FMADD f4, f4, f30, f18 FMADD f8, f8, f30, f20 FMADD f12, f12, f30, f22 #else FMUL f0, f0, f30 FMUL f4, f4, f30 FMUL f8, f8, f30 FMUL f12, f12, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 fmr f8, f0 fmr f9, f0 fmr f12, f0 fmr f13, f0 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -4 #endif slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 4 #endif mr B, BO addic. J, J, -1 bgt .L10 .align 4 .L40: mr CO1, C add CO2, C, LDC andi. J, N, 2 ble .L70 #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. I, M, 2 add C, CO2, LDC mr AO, A ble .L50 .align 4 .L41: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 2 #endif srawi. 
TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble .L45 .align 5 .L42: FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f20, 4 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f16, 4 * SIZE(AO) FMADD f5, f17, f21, f5 LFD f17, 5 * SIZE(AO) FMADD f6, f18, f21, f6 LFD f18, 6 * SIZE(AO) FMADD f7, f19, f21, f7 LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 LFD f21, 5 * SIZE(BO) FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f22, 6 * SIZE(BO) FMADD f4, f16, f23, f4 LFD f16, 8 * SIZE(AO) FMADD f5, f17, f23, f5 LFD f17, 9 * SIZE(AO) FMADD f6, f18, f23, f6 LFD f18, 10 * SIZE(AO) FMADD f7, f19, f23, f7 LFD f19, 11 * SIZE(AO) FMADD f0, f16, f20, f0 LFD f23, 7 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFDU f20, 8 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f16, 12 * SIZE(AO) FMADD f5, f17, f21, f5 LFD f17, 13 * SIZE(AO) FMADD f6, f18, f21, f6 LFD f18, 14 * SIZE(AO) FMADD f7, f19, f21, f7 LFD f19, 15 * SIZE(AO) FMADD f0, f16, f22, f0 LFD f21, 1 * SIZE(BO) FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f22, 2 * SIZE(BO) FMADD f4, f16, f23, f4 LFDU f16, 16 * SIZE(AO) FMADD f5, f17, f23, f5 LFD f17, 1 * SIZE(AO) FMADD f6, f18, f23, f6 LFD f18, 2 * SIZE(AO) FMADD f7, f19, f23, f7 LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L42 .align 4 .L45: lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L48 .align 4 .L46: FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFDU f20, 2 * SIZE(BO) FMADD f4, f16, f21, f4 LFDU f16, 4 * SIZE(AO) FMADD f5, f17, f21, f5 LFD f17, 1 * SIZE(AO) FMADD f6, f18, f21, f6 LFD f18, 2 * SIZE(AO) FMADD f7, f19, f21, f7 LFD f19, 3 * SIZE(AO) LFD f21, 1 * SIZE(BO) bdnz .L46 .align 4 .L48: #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f2, f2, f30, f18 FMADD f3, f3, f30, f19 FMADD f4, f4, f30, f20 FMADD f5, f5, f30, f21 FMADD f6, f6, f30, f22 FMADD f7, f7, f30, f23 #else FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 FMUL f3, f3, f30 FMUL f4, f4, f30 FMUL f5, f5, f30 FMUL f6, f6, f30 FMUL f7, f7, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif addic. I, I, -1 bgt+ .L41 .align 4 .L50: andi. 
I, M, 2 ble .L60 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble .L55 .align 5 .L52: FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFDU f20, 8 * SIZE(BO) FMADD f2, f16, f21, f2 LFD f16, 4 * SIZE(AO) FMADD f3, f17, f21, f3 LFD f17, 5 * SIZE(AO) FMADD f4, f18, f22, f4 LFD f21, 1 * SIZE(BO) FMADD f5, f19, f22, f5 LFD f22, 2 * SIZE(BO) FMADD f6, f18, f23, f6 LFD f18, 6 * SIZE(AO) FMADD f7, f19, f23, f7 LFD f19, 7 * SIZE(AO) FMADD f0, f16, f24, f0 LFD f23, 3 * SIZE(BO) FMADD f1, f17, f24, f1 LFD f24, 4 * SIZE(BO) FMADD f2, f16, f25, f2 LFDU f16, 8 * SIZE(AO) FMADD f3, f17, f25, f3 LFD f17, 1 * SIZE(AO) FMADD f4, f18, f26, f4 LFD f25, 5 * SIZE(BO) FMADD f5, f19, f26, f5 LFD f26, 6 * SIZE(BO) FMADD f6, f18, f27, f6 LFD f18, 2 * SIZE(AO) FMADD f7, f19, f27, f7 LFD f19, 3 * SIZE(AO) LFD f27, 7 * SIZE(BO) bdnz .L52 .align 4 .L55: lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L58 .align 4 .L56: FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFDU f20, 2 * SIZE(BO) FMADD f2, f16, f21, f2 LFDU f16, 2 * SIZE(AO) FMADD f3, f17, f21, f3 LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) bdnz .L56 .align 4 .L58: #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f2, f2, f30, f18 FMADD f3, f3, f30, f19 #else FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 FMUL f3, f3, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 .L60: andi. 
I, M, 1 ble .L69 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble .L65 .align 5 .L62: FMADD f0, f16, f20, f0 LFDU f20, 8 * SIZE(BO) FMADD f1, f16, f21, f1 LFDU f16, 4 * SIZE(AO) LFD f21, 1 * SIZE(BO) FMADD f2, f17, f22, f2 LFD f22, 2 * SIZE(BO) FMADD f3, f17, f23, f3 LFD f17, 1 * SIZE(AO) LFD f23, 3 * SIZE(BO) FMADD f0, f18, f24, f0 LFD f24, 4 * SIZE(BO) FMADD f1, f18, f25, f1 LFD f18, 2 * SIZE(AO) LFD f25, 5 * SIZE(BO) FMADD f2, f19, f26, f2 LFD f26, 6 * SIZE(BO) FMADD f3, f19, f27, f3 LFD f19, 3 * SIZE(AO) LFD f27, 7 * SIZE(BO) bdnz .L62 .align 4 .L65: lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L68 .align 4 .L66: FMADD f0, f16, f20, f0 LFDU f20, 2 * SIZE(BO) FMADD f1, f16, f21, f1 LFDU f16, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) bdnz .L66 .align 4 .L68: #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) FADD f0, f2, f0 FADD f1, f3, f1 FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f18 #else FADD f0, f2, f0 FADD f1, f3, f1 FMUL f0, f0, f30 FMUL f1, f1, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 1 #endif #endif .align 4 .L69: #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 2 #endif mr B, BO .align 4 .L70: mr CO1, C andi. J, N, 1 ble .L999 #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. 
I, M, 2 mr AO, A ble .L80 .align 4 .L71: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 1 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B ble .L75 #endif ble .L75 .align 5 .L72: FMADD f0, f16, f20, f0 LFD f16, 4 * SIZE(AO) FMADD f1, f17, f20, f1 LFD f17, 5 * SIZE(AO) FMADD f2, f18, f20, f2 LFD f18, 6 * SIZE(AO) FMADD f3, f19, f20, f3 LFD f19, 7 * SIZE(AO) LFDU f20, 4 * SIZE(BO) FMADD f0, f16, f21, f0 LFD f16, 8 * SIZE(AO) FMADD f1, f17, f21, f1 LFD f17, 9 * SIZE(AO) FMADD f2, f18, f21, f2 LFD f18, 10 * SIZE(AO) FMADD f3, f19, f21, f3 LFD f19, 11 * SIZE(AO) LFD f21, 1 * SIZE(BO) FMADD f0, f16, f22, f0 LFD f16, 12 * SIZE(AO) FMADD f1, f17, f22, f1 LFD f17, 13 * SIZE(AO) FMADD f2, f18, f22, f2 LFD f18, 14 * SIZE(AO) FMADD f3, f19, f22, f3 LFD f19, 15 * SIZE(AO) LFD f22, 2 * SIZE(BO) FMADD f0, f16, f23, f0 LFDU f16, 16 * SIZE(AO) FMADD f1, f17, f23, f1 LFD f17, 1 * SIZE(AO) FMADD f2, f18, f23, f2 LFD f18, 2 * SIZE(AO) FMADD f3, f19, f23, f3 LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L72 .align 4 .L75: lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L78 .align 4 .L76: FMADD f0, f16, f20, f0 LFDU f16, 4 * SIZE(AO) FMADD f1, f17, f20, f1 LFD f17, 1 * SIZE(AO) FMADD f2, f18, f20, f2 LFD f18, 2 * SIZE(AO) FMADD f3, f19, f20, f3 LFDU f20, 1 * SIZE(BO) LFD f19, 3 * SIZE(AO) bdnz .L76 .align 4 .L78: #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f2, f2, f30, f18 FMADD f3, f3, f30, f19 #else FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 FMUL f3, f3, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -1 #endif slwi r0 , TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif addi CO1, CO1, 4 * SIZE addic. I, I, -1 bgt+ .L71 .align 4 .L80: andi. 
I, M, 2 ble .L90 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble .L85 .align 5 .L82: FMADD f0, f16, f20, f0 LFD f16, 4 * SIZE(AO) FMADD f1, f17, f20, f1 LFDU f20, 4 * SIZE(BO) LFD f17, 5 * SIZE(AO) FMADD f2, f18, f21, f2 LFD f18, 6 * SIZE(AO) FMADD f3, f19, f21, f3 LFD f21, 1 * SIZE(BO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 LFDU f16, 8 * SIZE(AO) FMADD f1, f17, f22, f1 LFD f22, 2 * SIZE(BO) LFD f17, 1 * SIZE(AO) FMADD f2, f18, f23, f2 LFD f18, 2 * SIZE(AO) FMADD f3, f19, f23, f3 LFD f23, 3 * SIZE(BO) LFD f19, 3 * SIZE(AO) bdnz .L82 .align 4 .L85: lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L88 .align 4 .L86: FMADD f0, f16, f20, f0 LFDU f16, 2 * SIZE(AO) FMADD f1, f17, f20, f1 LFDU f20, 1 * SIZE(BO) LFD f17, 1 * SIZE(AO) bdnz .L86 .align 4 .L88: #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) FADD f0, f2, f0 FADD f1, f3, f1 FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 #else FADD f0, f2, f0 FADD f1, f3, f1 FMUL f0, f0, f30 FMUL f1, f1, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 addi CO1, CO1, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -1 #endif slwi r0 , TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 .L90: andi. I, M, 1 ble .L999 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif srawi. 
TEMP, TEMP, 3 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, K, 3 mtspr CTR, r0 mr BO, B #endif ble .L95 .align 5 .L92: FMADD f0, f16, f20, f0 LFD f16, 4 * SIZE(AO) LFD f20, 4 * SIZE(BO) FMADD f1, f17, f21, f1 LFD f17, 5 * SIZE(AO) LFD f21, 5 * SIZE(BO) FMADD f2, f18, f22, f2 LFD f18, 6 * SIZE(AO) LFD f22, 6 * SIZE(BO) FMADD f3, f19, f23, f3 LFD f19, 7 * SIZE(AO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 LFDU f16, 8 * SIZE(AO) LFDU f20, 8 * SIZE(BO) FMADD f1, f17, f21, f1 LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) FMADD f2, f18, f22, f2 LFD f18, 2 * SIZE(AO) LFD f22, 2 * SIZE(BO) FMADD f3, f19, f23, f3 LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L92 .align 4 .L95: lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP #else andi. r0, K, 7 mtspr CTR, r0 #endif ble+ .L98 .align 4 .L96: FMADD f0, f16, f20, f0 LFDU f16, 1 * SIZE(AO) LFDU f20, 1 * SIZE(BO) bdnz .L96 .align 4 .L98: #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 FMADD f0, f0, f30, f16 #else FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 FMUL f0, f0, f30 #endif STFD f0, 0 * SIZE(CO1) .align 4 .L999: addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) #if defined(TRMMKERNEL) || defined(TRSMKERNEL) ld r19, 240(SP) ld r18, 248(SP) #endif #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) #if defined(TRMMKERNEL) || defined(TRSMKERNEL) lwz r19, 192(SP) lwz r18, 196(SP) #endif #endif addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/gemm_kernel_hummer.S000066400000000000000000003565561313527062700215720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define ALPHA 0 #define FZERO 8 #define M r3 #define N r4 #define K r5 #ifdef linux #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #endif #define TEMP r11 #define KK r14 #define INCM1 r15 #define INCM3 r16 #define INCM5 r17 #define INCM7 r18 #define INC2 r19 #define INC r20 #define INC4 r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define AO2 r26 #define BO2 r27 #define CO1 r28 #define CO2 r29 #define CO3 r30 #define CO4 r31 #ifndef NEEDPARAM #define A1 f16 #define A2 f17 #define A3 f18 #define A4 f19 #define A5 f20 #define A6 f21 #define A7 f22 #define A8 f23 #define A9 f24 #define A10 f25 #define B1 f26 #define B2 f27 #define B3 f28 #define B4 f29 #define B5 f30 #define B6 f31 #define AP B6 PROLOGUE PROFCODE li r0, -16 stfpdux f14, SP, r0 stfpdux f15, SP, r0 stfpdux f16, SP, r0 stfpdux f17, SP, r0 stfpdux f18, SP, r0 stfpdux f19, SP, r0 stfpdux f20, SP, r0 stfpdux f21, SP, r0 stfpdux f22, SP, r0 stfpdux f23, SP, r0 stfpdux f24, SP, r0 stfpdux f25, SP, r0 stfpdux f26, SP, r0 stfpdux f27, SP, r0 stfpdux f28, SP, r0 stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) stwu r28, -4(SP) stwu r27, -4(SP) stwu r26, -4(SP) stwu r25, -4(SP) stwu r24, -4(SP) stwu r23, -4(SP) stwu r22, -4(SP) stwu r21, -4(SP) stwu r20, -4(SP) stwu r19, -4(SP) stwu r18, -4(SP) stwu r17, -4(SP) stwu r16, -4(SP) stwu r15, -4(SP) stwu r14, -4(SP) # dummy li r0, 0 stwu r0, -4(SP) stwu r0, -4(SP) stfdu f1, -8(SP) slwi LDC, LDC, BASE_SHIFT cmpwi cr0, M, 0 ble .L999 cmpwi cr0, N, 0 ble .L999 cmpwi cr0, K, 0 ble .L999 li INC, 1 * SIZE li INC2, 2 * SIZE li INC4, 4 * SIZE #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif andi. r0, C, 2 * SIZE - 1 bne .L1000 andi. r0, LDC, 2 * SIZE - 1 bne .L1000 /* High performance version */ li INCM3, -2 * SIZE li INCM5, -5 * SIZE li INCM7, -6 * SIZE addi C, C, - 2 * SIZE srawi. J, N, 2 ble .L50 .align 4 .L10: mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC add C, CO4, LDC #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif addi AO, A, -4 * SIZE li r0, FZERO lfpsx f0, SP, r0 srawi. 
I, M, 3 ble .L20 .align 4 .L11: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 #else slwi TEMP, KK, 3 + BASE_SHIFT slwi r0, KK, 2 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 8 #else addi TEMP, KK, 4 #endif srawi. TEMP, TEMP, 2 fpmr f1, f0 mtspr CTR, TEMP ble .L14 #else addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 srawi. r0, K, 2 fpmr f1, f0 mtspr CTR, r0 ble .L14 #endif LFPDUX A1, AO, INC4 fpmr f5, f0 LFPDUX A3, AO, INC4 fpmr f9, f0 LFPDUX B1, BO, INC4 fpmr f13, f0 LFPDUX A5, AO, INC4 fpmr f2, f0 LFPDUX A6, AO, INC4 fpmr f6, f0 LFPDUX B3, BO, INC4 fpmr f10, f0 LFPDUX A7, AO, INC4 fpmr f14, f0 LFPDUX A8, AO, INC4 fpmr f3, f0 LFPDUX B5, BO, INC4 fpmr f7, f0 LFPDUX A9, AO, INC4 fpmr f11, f0 LFPDUX A2, AO2, INC4 fpmr f15, f0 LFPDUX B2, BO2, INC4 bdz- .L13 .align 4 .L12: ## 1 ## fxcpmadd f0, B1, A1, f0 nop fxcsmadd f4, B1, A1, f4 nop fxcpmadd f8, B2, A1, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A1, f12 LFPDUX B6, BO, INC4 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 LFPDUX A10, AO, INC4 fxcsmadd f13, B2, A2, f13 nop fxcpmadd f2, B1, A3, f2 nop fxcsmadd f6, B1, A3, f6 nop fxcpmadd f10, B2, A3, f10 nop fxcsmadd f14, B2, A3, f14 nop fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 LFPDUX A1, AO, INC4 fxcsmadd f15, B2, A4, f15 nop ## 2 ## fxcpmadd f0, B3, A5, f0 nop fxcsmadd f4, B3, A5, f4 nop fxcpmadd f8, B4, A5, f8 LFPDUX B2, BO2, INC4 fxcsmadd f12, B4, A5, f12 LFPDUX B1, BO, INC4 fxcpmadd f1, B3, A2, f1 nop fxcsmadd f5, B3, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 LFPDUX A3, AO, INC4 fxcsmadd f13, B4, A2, f13 nop fxcpmadd f2, B3, A6, f2 nop fxcsmadd f6, B3, A6, f6 nop fxcpmadd f10, B4, A6, f10 nop fxcsmadd f14, B4, A6, f14 nop fxcpmadd f3, B3, A4, f3 nop fxcsmadd f7, B3, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B4, A4, f11 LFPDUX A5, AO, INC4 fxcsmadd f15, B4, A4, f15 nop ## 3 ## fxcpmadd f0, B5, A7, f0 nop fxcsmadd f4, B5, A7, f4 nop fxcpmadd f8, B2, A7, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A7, f12 LFPDUX B3, BO, INC4 fxcpmadd f1, B5, A2, f1 nop fxcsmadd f5, B5, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 LFPDUX A6, AO, INC4 fxcsmadd f13, B2, A2, f13 nop fxcpmadd f2, B5, A8, f2 nop fxcsmadd f6, B5, A8, f6 nop fxcpmadd f10, B2, A8, f10 nop fxcsmadd f14, B2, A8, f14 nop fxcpmadd f3, B5, A4, f3 nop fxcsmadd f7, B5, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 LFPDUX A7, AO, INC4 fxcsmadd f15, B2, A4, f15 nop ## 4 ## fxcpmadd f0, B6, A9, f0 nop fxcsmadd f4, B6, A9, f4 nop fxcpmadd f8, B4, A9, f8 LFPDUX B2, BO2, INC4 fxcsmadd f12, B4, A9, f12 LFPDUX B5, BO, INC4 fxcpmadd f1, B6, A2, f1 nop fxcsmadd f5, B6, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 LFPDUX A8, AO, INC4 fxcsmadd f13, B4, A2, f13 nop fxcpmadd f2, B6, A10, f2 nop fxcsmadd f6, B6, A10, f6 nop fxcpmadd f10, B4, A10, f10 nop fxcsmadd f14, B4, A10, f14 nop fxcpmadd f3, B6, A4, f3 LFPDUX A2, AO2, INC4 fxcsmadd f7, B6, A4, f7 LFPDUX A9, AO, INC4 fxcpmadd f11, B4, A4, f11 nop fxcsmadd f15, B4, A4, f15 bdnz+ .L12 .align 4 .L13: ## 1 ## 
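## Descriptive note (added comment, not in upstream source): .L13 below is the
## final pass of the K loop (unrolled by 4) for the 8x4 micro-kernel. It runs
## the same fxcpmadd/fxcsmadd chain as .L12, but when TRMMKERNEL is not defined
## the spare slots (nop in .L12) are used to load the C tile from CO1..CO4, so
## the alpha*A*B + C update at .L14/.L18 already has its operands in registers.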
fxcpmadd f0, B1, A1, f0 nop fxcsmadd f4, B1, A1, f4 nop fxcpmadd f8, B2, A1, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A1, f12 LFPDUX B6, BO, INC4 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 LFPDUX A10, AO, INC4 fxcsmadd f13, B2, A2, f13 nop fxcpmadd f2, B1, A3, f2 nop fxcsmadd f6, B1, A3, f6 nop fxcpmadd f10, B2, A3, f10 nop fxcsmadd f14, B2, A3, f14 nop fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 #ifndef TRMMKERNEL LFPDUX A1, CO1, INC2 #else nop #endif fxcsmadd f15, B2, A4, f15 nop ## 2 ## fxcpmadd f0, B3, A5, f0 nop fxcsmadd f4, B3, A5, f4 nop fxcpmadd f8, B4, A5, f8 LFPDUX B2, BO2, INC4 fxcsmadd f12, B4, A5, f12 #ifndef TRMMKERNEL LFPDUX B1, CO1, INC4 #else nop #endif fxcpmadd f1, B3, A2, f1 nop fxcsmadd f5, B3, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 #ifndef TRMMKERNEL LFPDUX A3, CO2, INC2 #else nop #endif fxcsmadd f13, B4, A2, f13 nop fxcpmadd f2, B3, A6, f2 nop fxcsmadd f6, B3, A6, f6 nop fxcpmadd f10, B4, A6, f10 nop fxcsmadd f14, B4, A6, f14 nop fxcpmadd f3, B3, A4, f3 nop fxcsmadd f7, B3, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B4, A4, f11 #ifndef TRMMKERNEL LFPDUX A5, CO2, INC4 #else nop #endif fxcsmadd f15, B4, A4, f15 nop ## 3 ## fxcpmadd f0, B5, A7, f0 nop fxcsmadd f4, B5, A7, f4 nop fxcpmadd f8, B2, A7, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A7, f12 #ifndef TRMMKERNEL LFPDUX B3, CO3, INC2 #else nop #endif fxcpmadd f1, B5, A2, f1 nop fxcsmadd f5, B5, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 #ifndef TRMMKERNEL LFPDUX A6, CO3, INC4 #else nop #endif fxcsmadd f13, B2, A2, f13 nop fxcpmadd f2, B5, A8, f2 nop fxcsmadd f6, B5, A8, f6 nop fxcpmadd f10, B2, A8, f10 nop fxcsmadd f14, B2, A8, f14 nop fxcpmadd f3, B5, A4, f3 nop fxcsmadd f7, B5, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 #ifndef TRMMKERNEL LFPDUX A7, CO4, INC2 #else nop #endif fxcsmadd f15, B2, A4, f15 nop ## 4 ## fxcpmadd f0, B6, A9, f0 nop fxcsmadd f4, B6, A9, f4 nop fxcpmadd f8, B4, A9, f8 nop fxcsmadd f12, B4, A9, f12 #ifndef TRMMKERNEL LFPDUX B2, CO4, INC4 #else nop #endif fxcpmadd f1, B6, A2, f1 nop fxcsmadd f5, B6, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 #ifndef TRMMKERNEL LFPDUX B5, CO1, INCM3 #else nop #endif fxcsmadd f13, B4, A2, f13 nop fxcpmadd f2, B6, A10, f2 nop fxcsmadd f6, B6, A10, f6 nop fxcpmadd f10, B4, A10, f10 nop fxcsmadd f14, B4, A10, f14 #ifndef TRMMKERNEL LFPDUX A8, CO1, INC4 #else nop #endif fxcpmadd f3, B6, A4, f3 nop fxcsmadd f7, B6, A4, f7 nop fxcpmadd f11, B4, A4, f11 nop fxcsmadd f15, B4, A4, f15 #ifndef TRMMKERNEL LFPDUX A9, CO2, INCM3 #else nop #endif .align 4 .L14: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 8 #else addi TEMP, KK, 4 #endif andi. r0, TEMP, 3 mtspr CTR, r0 ble+ .L18 cmpwi cr0, TEMP, 3 bgt+ .L15 #else andi. 
r0, K, 3 mtspr CTR, r0 ble+ .L18 cmpwi cr0, K, 3 bgt+ .L15 #endif #ifndef TRMMKERNEL LFPDUX A1, CO1, INC2 fpmr f5, f0 LFPDUX B1, CO1, INC4 fpmr f9, f0 LFPDUX A3, CO2, INC2 fpmr f13, f0 LFPDUX A5, CO2, INC4 fpmr f2, f0 LFPDUX B3, CO3, INC2 fpmr f6, f0 LFPDUX A6, CO3, INC4 fpmr f10, f0 LFPDUX A7, CO4, INC2 fpmr f14, f0 LFPDUX B2, CO4, INC4 fpmr f3, f0 LFPDUX B5, CO1, INCM3 fpmr f7, f0 LFPDUX A8, CO1, INC4 fpmr f11, f0 LFPDUX A9, CO2, INCM3 fpmr f15, f0 #else fpmr f5, f0 fpmr f9, f0 fpmr f13, f0 fpmr f2, f0 fpmr f6, f0 fpmr f10, f0 fpmr f14, f0 fpmr f3, f0 fpmr f7, f0 fpmr f11, f0 fpmr f15, f0 nop #endif .align 4 .L15: LFPDUX A2, AO, INC4 LFPDUX A4, AO2, INC4 LFPDUX A10, BO, INC4 LFPDUX B4, BO2, INC4 bdz- .L17 .align 4 .L16: fxcpmadd f0, A10, A2, f0 fxcsmadd f4, A10, A2, f4 fxcpmadd f8, B4, A2, f8 fxcsmadd f12, B4, A2, f12 LFPDUX A2, AO, INC4 fxcpmadd f1, A10, A4, f1 fxcsmadd f5, A10, A4, f5 fxcpmadd f9, B4, A4, f9 fxcsmadd f13, B4, A4, f13 LFPDUX A4, AO2, INC4 fxcpmadd f2, A10, A2, f2 fxcsmadd f6, A10, A2, f6 fxcpmadd f10, B4, A2, f10 fxcsmadd f14, B4, A2, f14 LFPDUX A2, AO, INC4 fxcpmadd f3, A10, A4, f3 fxcsmadd f7, A10, A4, f7 LFPDUX A10, BO, INC4 fxcpmadd f11, B4, A4, f11 fxcsmadd f15, B4, A4, f15 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 bdnz+ .L16 .align 4 .L17: fxcpmadd f0, A10, A2, f0 fxcsmadd f4, A10, A2, f4 fxcpmadd f8, B4, A2, f8 fxcsmadd f12, B4, A2, f12 LFPDUX A2, AO, INC4 fxcpmadd f1, A10, A4, f1 fxcsmadd f5, A10, A4, f5 fxcpmadd f9, B4, A4, f9 fxcsmadd f13, B4, A4, f13 LFPDUX A4, AO2, INC4 fxcpmadd f2, A10, A2, f2 fxcsmadd f6, A10, A2, f6 fxcpmadd f10, B4, A2, f10 fxcsmadd f14, B4, A2, f14 fxcpmadd f3, A10, A4, f3 fxcsmadd f7, A10, A4, f7 fxcpmadd f11, B4, A4, f11 fxcsmadd f15, B4, A4, f15 .align 4 .L18: #ifndef TRMMKERNEL fxcpmadd f0, AP, f0, A1 LFPDUX B4, CO2, INC4 fxcpmadd f1, AP, f1, B5 LFPDUX A2, CO3, INCM3 fxcpmadd f2, AP, f2, B1 LFPDUX A4, CO3, INC4 fxcpmadd f3, AP, f3, A8 LFPDUX A10, CO4, INCM3 fxcpmadd f4, AP, f4, A3 LFPDUX A1, CO4, INC4 fxcpmadd f5, AP, f5, A9 STFPDUX f0, CO1, INCM7 fxcpmadd f6, AP, f6, A5 STFPDUX f1, CO1, INC2 fxcpmadd f7, AP, f7, B4 STFPDUX f2, CO1, INC2 fxcpmadd f8, AP, f8, B3 STFPDUX f3, CO1, INC2 fxcpmadd f9, AP, f9, A2 STFPDUX f4, CO2, INCM7 fxcpmadd f10, AP, f10, A6 STFPDUX f5, CO2, INC2 fxcpmadd f11, AP, f11, A4 STFPDUX f6, CO2, INC2 fxcpmadd f12, AP, f12, A7 STFPDUX f7, CO2, INC2 fxcpmadd f13, AP, f13, A10 STFPDUX f8, CO3, INCM7 fxcpmadd f14, AP, f14, B2 STFPDUX f9, CO3, INC2 fxcpmadd f15, AP, f15, A1 STFPDUX f10, CO3, INC2 STFPDUX f11, CO3, INC2 STFPDUX f12, CO4, INCM7 STFPDUX f13, CO4, INC2 STFPDUX f14, CO4, INC2 STFPDUX f15, CO4, INC2 #else fpmul f0, AP, f0 fpmul f1, AP, f1 fpmul f2, AP, f2 fpmul f3, AP, f3 fpmul f4, AP, f4 fpmul f5, AP, f5 STFPDUX f0, CO1, INC2 fpmul f6, AP, f6 STFPDUX f1, CO1, INC2 fpmul f7, AP, f7 STFPDUX f2, CO1, INC2 fpmul f8, AP, f8 STFPDUX f3, CO1, INC2 fpmul f9, AP, f9 STFPDUX f4, CO2, INC2 fpmul f10, AP, f10 STFPDUX f5, CO2, INC2 fpmul f11, AP, f11 STFPDUX f6, CO2, INC2 fpmul f12, AP, f12 STFPDUX f7, CO2, INC2 fpmul f13, AP, f13 STFPDUX f8, CO3, INC2 fpmul f14, AP, f14 STFPDUX f9, CO3, INC2 fpmul f15, AP, f15 STFPDUX f10, CO3, INC2 STFPDUX f11, CO3, INC2 STFPDUX f12, CO4, INC2 STFPDUX f13, CO4, INC2 STFPDUX f14, CO4, INC2 STFPDUX f15, CO4, INC2 #endif #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -8 #else addi TEMP, TEMP, -4 #endif slwi r0, TEMP, 3 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, 
BO, TEMP #endif #ifdef LEFT addi KK, KK, 8 #endif #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L11 .align 4 .L20: andi. I, M, 4 beq .L30 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 #else slwi TEMP, KK, 2 + BASE_SHIFT slwi r0, KK, 2 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 4 #endif srawi. TEMP, TEMP, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, TEMP fpmr f13, f0 ble .L24 #else addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 srawi. r0, K, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, r0 fpmr f13, f0 ble .L24 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX B3, BO, INC4 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 LFPDUX A5, AO, INC4 LFPDUX B5, BO, INC4 LFPDUX A6, AO2, INC4 LFPDUX B6, BO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A9, BO, INC4 LFPDUX A10, BO2, INC4 bdz- .L23 .align 4 .L22: fxcpmadd f0, B1, A1, f0 nop fxcsmadd f4, B1, A1, f4 LFPDUX A8, AO2, INC4 fxcpmadd f8, B2, A1, f8 nop fxcsmadd f12, B2, A1, f12 LFPDUX A1, AO, INC4 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX B1, BO, INC4 fxcpmadd f9, B2, A2, f9 nop fxcsmadd f13, B2, A2, f13 LFPDUX B2, BO2, INC4 fxcpmadd f0, B3, A3, f0 nop fxcsmadd f4, B3, A3, f4 LFPDUX A2, AO2, INC4 fxcpmadd f8, B4, A3, f8 nop fxcsmadd f12, B4, A3, f12 LFPDUX A3, AO, INC4 fxcpmadd f1, B3, A4, f1 nop fxcsmadd f5, B3, A4, f5 LFPDUX B3, BO, INC4 fxcpmadd f9, B4, A4, f9 nop fxcsmadd f13, B4, A4, f13 LFPDUX B4, BO2, INC4 fxcpmadd f0, B5, A5, f0 nop fxcsmadd f4, B5, A5, f4 LFPDUX A4, AO2, INC4 fxcpmadd f8, B6, A5, f8 nop fxcsmadd f12, B6, A5, f12 LFPDUX A5, AO, INC4 fxcpmadd f1, B5, A6, f1 nop fxcsmadd f5, B5, A6, f5 LFPDUX B5, BO, INC4 fxcpmadd f9, B6, A6, f9 nop fxcsmadd f13, B6, A6, f13 LFPDUX B6, BO2, INC4 fxcpmadd f0, A9, A7, f0 nop fxcsmadd f4, A9, A7, f4 LFPDUX A6, AO2, INC4 fxcpmadd f8, A10, A7, f8 nop fxcsmadd f12, A10, A7, f12 LFPDUX A7, AO, INC4 fxcpmadd f1, A9, A8, f1 nop fxcsmadd f5, A9, A8, f5 LFPDUX A9, BO, INC4 fxcpmadd f9, A10, A8, f9 nop fxcsmadd f13, A10, A8, f13 LFPDUX A10, BO2, INC4 bdnz+ .L22 .align 4 .L23: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 LFPDUX A8, AO2, INC4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 fxcpmadd f9, B2, A2, f9 fxcsmadd f13, B2, A2, f13 fxcpmadd f0, B3, A3, f0 fxcsmadd f4, B3, A3, f4 fxcpmadd f8, B4, A3, f8 fxcsmadd f12, B4, A3, f12 fxcpmadd f1, B3, A4, f1 fxcsmadd f5, B3, A4, f5 fxcpmadd f9, B4, A4, f9 fxcsmadd f13, B4, A4, f13 fxcpmadd f0, B5, A5, f0 fxcsmadd f4, B5, A5, f4 fxcpmadd f8, B6, A5, f8 fxcsmadd f12, B6, A5, f12 fxcpmadd f1, B5, A6, f1 fxcsmadd f5, B5, A6, f5 fxcpmadd f9, B6, A6, f9 fxcsmadd f13, B6, A6, f13 fxcpmadd f0, A9, A7, f0 fxcsmadd f4, A9, A7, f4 fxcpmadd f8, A10, A7, f8 fxcsmadd f12, A10, A7, f12 fxcpmadd f1, A9, A8, f1 fxcsmadd f5, A9, A8, f5 fxcpmadd f9, A10, A8, f9 fxcsmadd f13, A10, A8, f13 .align 4 .L24: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L28 LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 bdz- .L27 .align 4 .L26: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 LFPDUX A1, AO, INC4 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 LFPDUX B1, BO, INC4 fxcpmadd f9, B2, A2, f9 fxcsmadd f13, B2, A2, f13 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 bdnz+ .L26 .align 4 .L27: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 fxcpmadd f9, B2, A2, f9 fxcsmadd f13, B2, A2, f13 .align 4 .L28: #ifndef TRMMKERNEL LFPDUX A1, CO1, INC2 LFPDUX B1, CO1, INC2 LFPDUX B3, CO2, INC2 LFPDUX A6, CO2, INC2 LFPDUX B5, CO3, INC2 LFPDUX A8, CO3, INC2 LFPDUX A2, CO4, INC2 LFPDUX A4, CO4, INC2 fxcpmadd f0, AP, f0, A1 fxcpmadd f1, AP, f1, B1 fxcpmadd f4, AP, f4, B3 fxcpmadd f5, AP, f5, A6 fxcpmadd f8, AP, f8, B5 fxcpmadd f9, AP, f9, A8 STFPDUX f0, CO1, INCM3 fxcpmadd f12, AP, f12, A2 STFPDUX f1, CO1, INC2 fxcpmadd f13, AP, f13, A4 STFPDUX f4, CO2, INCM3 STFPDUX f5, CO2, INC2 STFPDUX f8, CO3, INCM3 STFPDUX f9, CO3, INC2 STFPDUX f12, CO4, INCM3 STFPDUX f13, CO4, INC2 #else fpmul f0, AP, f0 fpmul f1, AP, f1 fpmul f4, AP, f4 fpmul f5, AP, f5 fpmul f8, AP, f8 fpmul f9, AP, f9 STFPDUX f0, CO1, INC2 fpmul f12, AP, f12 STFPDUX f1, CO1, INC2 fpmul f13, AP, f13 STFPDUX f4, CO2, INC2 STFPDUX f5, CO2, INC2 STFPDUX f8, CO3, INC2 STFPDUX f9, CO3, INC2 STFPDUX f12, CO4, INC2 STFPDUX f13, CO4, INC2 #endif #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -4 #endif slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L30: andi. I, M, 2 beq .L40 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 #else slwi TEMP, KK, 1 + BASE_SHIFT slwi r0, KK, 2 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, BO, - 4 * SIZE fpmr f2, f0 addi BO2, BO, 2 * SIZE fpmr f3, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 4 #endif srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L34 #else addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 srawi. 
r0, K, 2 mtspr CTR, r0 ble .L34 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 LFPDUX A2, AO2, INC4 LFPDUX B3, BO, INC4 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A5, BO, INC4 LFPDUX A6, BO2, INC4 LFPDUX A4, AO2, INC4 LFPDUX A7, BO, INC4 LFPDUX A8, BO2, INC4 bdz- .L33 .align 4 .L32: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 LFPDUX B1, BO, INC4 fxcpmadd f2, B2, A1, f2 fxcsmadd f3, B2, A1, f3 LFPDUX B2, BO2, INC4 LFPDUX A1, AO, INC4 fxcpmadd f0, B3, A2, f0 fxcsmadd f1, B3, A2, f1 LFPDUX B3, BO, INC4 fxcpmadd f2, B4, A2, f2 fxcsmadd f3, B4, A2, f3 LFPDUX B4, BO2, INC4 LFPDUX A2, AO2, INC4 fxcpmadd f0, A5, A3, f0 fxcsmadd f1, A5, A3, f1 LFPDUX A5, BO, INC4 fxcpmadd f2, A6, A3, f2 fxcsmadd f3, A6, A3, f3 LFPDUX A6, BO2, INC4 LFPDUX A3, AO, INC4 fxcpmadd f0, A7, A4, f0 fxcsmadd f1, A7, A4, f1 LFPDUX A7, BO, INC4 fxcpmadd f2, A8, A4, f2 fxcsmadd f3, A8, A4, f3 LFPDUX A8, BO2, INC4 LFPDUX A4, AO2, INC4 bdnz+ .L32 .align 4 .L33: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 fxcpmadd f2, B2, A1, f2 fxcsmadd f3, B2, A1, f3 fxcpmadd f0, B3, A2, f0 fxcsmadd f1, B3, A2, f1 fxcpmadd f2, B4, A2, f2 fxcsmadd f3, B4, A2, f3 fxcpmadd f0, A5, A3, f0 fxcsmadd f1, A5, A3, f1 fxcpmadd f2, A6, A3, f2 fxcsmadd f3, A6, A3, f3 fxcpmadd f0, A7, A4, f0 fxcsmadd f1, A7, A4, f1 fxcpmadd f2, A8, A4, f2 fxcsmadd f3, A8, A4, f3 .align 4 .L34: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L38 LFPDX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdz- .L37 .align 4 .L36: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 LFPDUX B1, BO, INC4 fxcpmadd f2, B2, A1, f2 fxcsmadd f3, B2, A1, f3 LFPDX A1, AO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdnz+ .L36 .align 4 .L37: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 fxcpmadd f2, B2, A1, f2 fxcsmadd f3, B2, A1, f3 .align 4 .L38: #ifndef TRMMKERNEL LFPDX A1, CO1, INC2 LFPDX A2, CO2, INC2 LFPDX A3, CO3, INC2 LFPDX A4, CO4, INC2 fxcpmadd f0, AP, f0, A1 fxcpmadd f1, AP, f1, A2 fxcpmadd f2, AP, f2, A3 fxcpmadd f3, AP, f3, A4 #else fpmul f0, AP, f0 fpmul f1, AP, f1 fpmul f2, AP, f2 fpmul f3, AP, f3 #endif STFPDUX f0, CO1, INC2 STFPDUX f1, CO2, INC2 STFPDUX f2, CO3, INC2 STFPDUX f3, CO4, INC2 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -4 #endif slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L40: andi. I, M, 1 beq .L49 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 #else slwi TEMP, KK, 0 + BASE_SHIFT slwi r0, KK, 2 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, BO, - 4 * SIZE fpmr f2, f0 addi BO2, BO, 2 * SIZE fpmr f3, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 4 #endif srawi. 
r0, TEMP, 3 mtspr CTR, r0 ble .L44 #else addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 srawi. r0, K, 3 mtspr CTR, r0 ble .L44 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 LFPDUX A2, AO2, INC4 LFPDUX B3, BO, INC4 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A5, BO, INC4 LFPDUX A6, BO2, INC4 LFPDUX A4, AO2, INC4 LFPDUX A7, BO, INC4 LFPDUX A8, BO2, INC4 bdz- .L43 .align 4 .L42: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A1, B2, f1 LFPDUX B2, BO2, INC4 fxcsmadd f2, A1, B3, f2 LFPDUX B3, BO, INC4 fxcsmadd f3, A1, B4, f3 LFPDUX B4, BO2, INC4 LFPDUX A1, AO, INC4 fxcpmadd f0, A2, A5, f0 LFPDUX A5, BO, INC4 fxcpmadd f1, A2, A6, f1 LFPDUX A6, BO2, INC4 fxcsmadd f2, A2, A7, f2 LFPDUX A7, BO, INC4 fxcsmadd f3, A2, A8, f3 LFPDUX A8, BO2, INC4 LFPDUX A2, AO2, INC4 fxcpmadd f0, A3, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A3, B2, f1 LFPDUX B2, BO2, INC4 fxcsmadd f2, A3, B3, f2 LFPDUX B3, BO, INC4 fxcsmadd f3, A3, B4, f3 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 fxcpmadd f0, A4, A5, f0 LFPDUX A5, BO, INC4 fxcpmadd f1, A4, A6, f1 LFPDUX A6, BO2, INC4 fxcsmadd f2, A4, A7, f2 LFPDUX A7, BO, INC4 fxcsmadd f3, A4, A8, f3 LFPDUX A8, BO2, INC4 LFPDUX A4, AO2, INC4 bdnz+ .L42 .align 4 .L43: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A1, B2, f1 LFPDUX B2, BO2, INC4 fxcsmadd f2, A1, B3, f2 LFPDUX B3, BO, INC4 fxcsmadd f3, A1, B4, f3 LFPDUX B4, BO2, INC4 fxcpmadd f0, A2, A5, f0 LFPDUX A5, BO, INC4 fxcpmadd f1, A2, A6, f1 LFPDUX A6, BO2, INC4 fxcsmadd f2, A2, A7, f2 LFPDUX A7, BO, INC4 fxcsmadd f3, A2, A8, f3 LFPDUX A8, BO2, INC4 fxcpmadd f0, A3, B1, f0 fxcpmadd f1, A3, B2, f1 fxcsmadd f2, A3, B3, f2 fxcsmadd f3, A3, B4, f3 fxcpmadd f0, A4, A5, f0 fxcpmadd f1, A4, A6, f1 fxcsmadd f2, A4, A7, f2 fxcsmadd f3, A4, A8, f3 .align 4 .L44: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP #else andi. r0, K, 7 mtspr CTR, r0 #endif ble+ .L48 LFDX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC bdz- .L47 .align 4 .L46: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A1, B2, f1 LFDX A1, AO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC bdnz+ .L46 .align 4 .L47: fxcpmadd f0, A1, B1, f0 fxcpmadd f1, A1, B2, f1 .align 4 .L48: #ifndef TRMMKERNEL LFDX A1, CO1, INC2 LFDX A2, CO2, INC2 LFDX A3, CO3, INC2 LFDX A4, CO4, INC2 fpadd f0, f0, f2 fpadd f1, f1, f3 fsmfp A1, A2 fsmfp A3, A4 fxcpmadd f0, AP, f0, A1 fxcpmadd f1, AP, f1, A3 #else fpadd f0, f0, f2 fpadd f1, f1, f3 fpmul f0, AP, f0 fpmul f1, AP, f1 #endif STFDX f0, CO1, INC2 STFSDX f0, CO2, INC2 STFDX f1, CO3, INC2 STFSDX f1, CO4, INC2 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -4 #endif slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 1 #endif #endif .align 4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 4 #endif addi B, BO, 4 * SIZE addic. J, J, -1 bgt+ .L10 .align 4 .L50: andi. J, N, 2 beq .L90 mr CO1, C add CO2, C, LDC add C, CO2, LDC #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif addi AO, A, -2 * SIZE li r0, FZERO lfpsx f0, SP, r0 srawi. 
I, M, 3 ble .L60 .align 4 .L51: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) fpmr f4, f0 addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 #else slwi TEMP, KK, 3 + BASE_SHIFT slwi r0, KK, 1 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 fpmr f4, f0 addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 8 #else addi TEMP, KK, 2 #endif srawi. r0, TEMP, 2 fpmr f3, f0 mtspr CTR, r0 fpmr f7, f0 ble .L54 #else fpmr f4, f0 addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 srawi. r0, K, 2 fpmr f3, f0 mtspr CTR, r0 fpmr f7, f0 ble .L54 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L53 .align 4 .L52: fxcpmadd f0, B1, A1, f0 LFPDUX B4, BO, INC2 fxcsmadd f4, B1, A1, f4 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 nop fxcsmadd f6, B1, A3, f6 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A4, AO, INC2 fxcpmadd f0, B2, A5, f0 LFPDUX B1, BO, INC2 fxcsmadd f4, B2, A5, f4 LFPDUX A5, AO, INC2 fxcpmadd f1, B2, A6, f1 nop fxcsmadd f5, B2, A6, f5 LFPDUX A6, AO, INC2 fxcpmadd f2, B2, A7, f2 nop fxcsmadd f6, B2, A7, f6 LFPDUX A7, AO, INC2 fxcpmadd f3, B2, A8, f3 nop fxcsmadd f7, B2, A8, f7 LFPDUX A8, AO, INC2 fxcpmadd f0, B3, A1, f0 LFPDUX B2, BO, INC2 fxcsmadd f4, B3, A1, f4 LFPDUX A1, AO, INC2 fxcpmadd f1, B3, A2, f1 nop fxcsmadd f5, B3, A2, f5 LFPDUX A2, AO, INC2 fxcpmadd f2, B3, A3, f2 nop fxcsmadd f6, B3, A3, f6 LFPDUX A3, AO, INC2 fxcpmadd f3, B3, A4, f3 nop fxcsmadd f7, B3, A4, f7 LFPDUX A4, AO, INC2 fxcpmadd f0, B4, A5, f0 LFPDUX B3, BO, INC2 fxcsmadd f4, B4, A5, f4 LFPDUX A5, AO, INC2 fxcpmadd f1, B4, A6, f1 nop fxcsmadd f5, B4, A6, f5 LFPDUX A6, AO, INC2 fxcpmadd f2, B4, A7, f2 nop fxcsmadd f6, B4, A7, f6 LFPDUX A7, AO, INC2 fxcpmadd f3, B4, A8, f3 nop fxcsmadd f7, B4, A8, f7 LFPDUX A8, AO, INC2 bdnz+ .L52 .align 4 .L53: fxcpmadd f0, B1, A1, f0 LFPDUX B4, BO, INC2 fxcsmadd f4, B1, A1, f4 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 nop fxcsmadd f6, B1, A3, f6 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A4, AO, INC2 fxcpmadd f0, B2, A5, f0 nop fxcsmadd f4, B2, A5, f4 LFPDUX A5, AO, INC2 fxcpmadd f1, B2, A6, f1 nop fxcsmadd f5, B2, A6, f5 LFPDUX A6, AO, INC2 fxcpmadd f2, B2, A7, f2 nop fxcsmadd f6, B2, A7, f6 LFPDUX A7, AO, INC2 fxcpmadd f3, B2, A8, f3 nop fxcsmadd f7, B2, A8, f7 LFPDUX A8, AO, INC2 fxcpmadd f0, B3, A1, f0 fxcsmadd f4, B3, A1, f4 fxcpmadd f1, B3, A2, f1 fxcsmadd f5, B3, A2, f5 fxcpmadd f2, B3, A3, f2 fxcsmadd f6, B3, A3, f6 fxcpmadd f3, B3, A4, f3 fxcsmadd f7, B3, A4, f7 fxcpmadd f0, B4, A5, f0 fxcsmadd f4, B4, A5, f4 fxcpmadd f1, B4, A6, f1 fxcsmadd f5, B4, A6, f5 fxcpmadd f2, B4, A7, f2 fxcsmadd f6, B4, A7, f6 fxcpmadd f3, B4, A8, f3 fxcsmadd f7, B4, A8, f7 .align 4 .L54: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 8 #else addi TEMP, KK, 2 #endif andi. 
TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L58 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 bdz- .L57 .align 4 .L56: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 fxcsmadd f6, B1, A3, f6 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 fxcsmadd f7, B1, A4, f7 LFPDUX A4, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L56 .align 4 .L57: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 fxcpmadd f2, B1, A3, f2 fxcsmadd f6, B1, A3, f6 fxcpmadd f3, B1, A4, f3 fxcsmadd f7, B1, A4, f7 .align 4 .L58: #ifndef TRMMKERNEL LFPDUX A1, CO1, INC2 LFPDUX B1, CO1, INC2 LFPDUX A3, CO1, INC2 LFPDUX A5, CO1, INC2 LFPDUX B3, CO2, INC2 LFPDUX A6, CO2, INC2 LFPDUX A7, CO2, INC2 LFPDUX B2, CO2, INC2 fxcpmadd f0, AP, f0, A1 fxcpmadd f1, AP, f1, B1 fxcpmadd f2, AP, f2, A3 fxcpmadd f3, AP, f3, A5 fxcpmadd f4, AP, f4, B3 fxcpmadd f5, AP, f5, A6 STFPDUX f0, CO1, INCM7 fxcpmadd f6, AP, f6, A7 STFPDUX f1, CO1, INC2 fxcpmadd f7, AP, f7, B2 STFPDUX f2, CO1, INC2 STFPDUX f3, CO1, INC2 STFPDUX f4, CO2, INCM7 STFPDUX f5, CO2, INC2 STFPDUX f6, CO2, INC2 STFPDUX f7, CO2, INC2 #else fpmul f0, AP, f0 fpmul f1, AP, f1 fpmul f2, AP, f2 fpmul f3, AP, f3 fpmul f4, AP, f4 fpmul f5, AP, f5 STFPDUX f0, CO1, INC2 fpmul f6, AP, f6 STFPDUX f1, CO1, INC2 fpmul f7, AP, f7 STFPDUX f2, CO1, INC2 STFPDUX f3, CO1, INC2 STFPDUX f4, CO2, INC2 STFPDUX f5, CO2, INC2 STFPDUX f6, CO2, INC2 STFPDUX f7, CO2, INC2 #endif #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -8 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 3 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 8 #endif #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L51 .align 4 .L60: andi. I, M, 4 beq .L70 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0 #else slwi TEMP, KK, 2 + BASE_SHIFT slwi r0, KK, 1 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE fpmr f1, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 2 #endif fpmr f2, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 fpmr f3, f0 ble .L64 #else srawi. 
r0, K, 2 fpmr f1, f0 addi BO, B, - 2 * SIZE fpmr f2, f0 mtspr CTR, r0 fpmr f3, f0 ble .L64 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L63 .align 4 .L62: fxcpmadd f0, B1, A1, f0 fxcsmadd f2, B1, A1, f2 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 fxcsmadd f3, B1, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f0, B2, A3, f0 fxcsmadd f2, B2, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f1, B2, A4, f1 fxcsmadd f3, B2, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A5, f0 fxcsmadd f2, B3, A5, f2 LFPDUX A5, AO, INC2 fxcpmadd f1, B3, A6, f1 fxcsmadd f3, B3, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f0, B4, A7, f0 fxcsmadd f2, B4, A7, f2 LFPDUX A7, AO, INC2 fxcpmadd f1, B4, A8, f1 fxcsmadd f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L62 .align 4 .L63: fxcpmadd f0, B1, A1, f0 fxcsmadd f2, B1, A1, f2 fxcpmadd f1, B1, A2, f1 fxcsmadd f3, B1, A2, f3 fxcpmadd f0, B2, A3, f0 fxcsmadd f2, B2, A3, f2 fxcpmadd f1, B2, A4, f1 fxcsmadd f3, B2, A4, f3 fxcpmadd f0, B3, A5, f0 fxcsmadd f2, B3, A5, f2 fxcpmadd f1, B3, A6, f1 fxcsmadd f3, B3, A6, f3 fxcpmadd f0, B4, A7, f0 fxcsmadd f2, B4, A7, f2 fxcpmadd f1, B4, A8, f1 fxcsmadd f3, B4, A8, f3 .align 4 .L64: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L68 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdz- .L67 .align 4 .L66: fxcpmadd f0, B1, A1, f0 fxcsmadd f2, B1, A1, f2 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 fxcsmadd f3, B1, A2, f3 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdnz+ .L66 .align 4 .L67: fxcpmadd f0, B1, A1, f0 fxcsmadd f2, B1, A1, f2 fxcpmadd f1, B1, A2, f1 fxcsmadd f3, B1, A2, f3 .align 4 .L68: #ifndef TRMMKERNEL LFPDUX A1, CO1, INC2 LFPDUX A2, CO1, INC2 LFPDUX A3, CO2, INC2 LFPDUX A4, CO2, INC2 fxcpmadd f0, AP, f0, A1 fxcpmadd f1, AP, f1, A2 fxcpmadd f2, AP, f2, A3 fxcpmadd f3, AP, f3, A4 STFPDUX f0, CO1, INCM3 STFPDUX f1, CO1, INC2 STFPDUX f2, CO2, INCM3 STFPDUX f3, CO2, INC2 #else fpmul f0, AP, f0 fpmul f1, AP, f1 fpmul f2, AP, f2 fpmul f3, AP, f3 STFPDUX f0, CO1, INC2 STFPDUX f1, CO1, INC2 STFPDUX f2, CO2, INC2 STFPDUX f3, CO2, INC2 #endif #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L70: andi. I, M, 2 beq .L80 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0 #else slwi TEMP, KK, 1 + BASE_SHIFT slwi r0, KK, 1 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE fpmr f1, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif srawi. r0, TEMP, 3 fpmr f2, f0 mtspr CTR, r0 fpmr f3, f0 ble .L74 #else addi BO, B, - 2 * SIZE fpmr f1, f0 srawi. 
r0, K, 3 fpmr f2, f0 mtspr CTR, r0 fpmr f3, f0 ble .L74 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdz- .L73 .align 4 .L72: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f2, B2, A2, f2 fxcsmadd f3, B2, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A3, f0 fxcsmadd f1, B3, A3, f1 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f2, B4, A4, f2 fxcsmadd f3, B4, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 fxcpmadd f0, B5, A5, f0 fxcsmadd f1, B5, A5, f1 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 fxcpmadd f2, B6, A6, f2 fxcsmadd f3, B6, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 fxcpmadd f0, A9, A7, f0 fxcsmadd f1, A9, A7, f1 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 fxcpmadd f2, A10, A8, f2 fxcsmadd f3, A10, A8, f3 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdnz+ .L72 .align 4 .L73: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 fxcpmadd f2, B2, A2, f2 fxcsmadd f3, B2, A2, f3 fxcpmadd f0, B3, A3, f0 fxcsmadd f1, B3, A3, f1 fxcpmadd f2, B4, A4, f2 fxcsmadd f3, B4, A4, f3 fxcpmadd f0, B5, A5, f0 fxcsmadd f1, B5, A5, f1 fxcpmadd f2, B6, A6, f2 fxcsmadd f3, B6, A6, f3 fxcpmadd f0, A9, A7, f0 fxcsmadd f1, A9, A7, f1 fxcpmadd f2, A10, A8, f2 fxcsmadd f3, A10, A8, f3 .align 4 .L74: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP #else andi. r0, K, 7 mtspr CTR, r0 #endif ble+ .L78 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdz- .L77 .align 4 .L76: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L76 .align 4 .L77: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 .align 4 .L78: #ifndef TRMMKERNEL LFPDX A1, CO1, INC2 LFPDX B3, CO2, INC2 fpadd f0, f0, f2 fpadd f1, f1, f3 fxcpmadd f0, AP, f0, A1 fxcpmadd f1, AP, f1, B3 #else fpadd f0, f0, f2 fpadd f1, f1, f3 fpmul f0, AP, f0 fpmul f1, AP, f1 #endif STFPDUX f0, CO1, INC2 STFPDUX f1, CO2, INC2 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L80: andi. I, M, 1 beq .L89 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 #else slwi TEMP, KK, 0 + BASE_SHIFT slwi r0, KK, 1 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L84 #else addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. 
r0, K, 3 mtspr CTR, r0 ble .L84 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX B4, BO, INC2 bdz- .L83 .align 4 .L82: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC2 fxcsmadd f1, A1, B2, f1 LFPDUX B2, BO, INC2 LFPDUX A1, AO, INC2 fxcpmadd f2, A2, B3, f2 LFPDUX B3, BO, INC2 fxcsmadd f3, A2, B4, f3 LFPDUX B4, BO, INC2 LFPDUX A2, AO, INC2 fxcpmadd f0, A3, B1, f0 LFPDUX B1, BO, INC2 fxcsmadd f1, A3, B2, f1 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 fxcpmadd f2, A4, B3, f2 LFPDUX B3, BO, INC2 fxcsmadd f3, A4, B4, f3 LFPDUX B4, BO, INC2 LFPDUX A4, AO, INC2 bdnz+ .L82 .align 4 .L83: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC2 fxcsmadd f1, A1, B2, f1 LFPDUX B2, BO, INC2 fxcpmadd f2, A2, B3, f2 LFPDUX B3, BO, INC2 fxcsmadd f3, A2, B4, f3 LFPDUX B4, BO, INC2 fxcpmadd f0, A3, B1, f0 fxcsmadd f1, A3, B2, f1 fxcpmadd f2, A4, B3, f2 fxcsmadd f3, A4, B4, f3 .align 4 .L84: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP #else andi. r0, K, 7 mtspr CTR, r0 #endif ble+ .L88 LFDX A1, AO, INC2 LFPDUX B1, BO, INC2 add AO, AO, INC bdz- .L87 .align 4 .L86: fxcpmadd f0, A1, B1, f0 LFDX A1, AO, INC2 LFPDUX B1, BO, INC2 add AO, AO, INC bdnz+ .L86 .align 4 .L87: fxcpmadd f0, A1, B1, f0 .align 4 .L88: #ifndef TRMMKERNEL LFDX A1, CO1, INC2 LFDX A2, CO2, INC2 fpadd f0, f0, f1 fpadd f2, f2, f3 fsmfp A1, A2 fpadd f0, f0, f2 fxcpmadd f0, AP, f0, A1 #else fpadd f0, f0, f1 fpadd f2, f2, f3 fsmfp A1, A2 fpadd f0, f0, f2 fpmul f0, AP, f0 #endif STFDX f0, CO1, INC2 STFSDX f0, CO2, INC2 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 1 #endif #endif .align 4 .L89: #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 2 #endif addi B, BO, 2 * SIZE .align 4 .L90: andi. J, N, 1 beq .L999 #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif mr CO1, C addi AO, A, -2 * SIZE li r0, FZERO lfpsx f0, SP, r0 srawi. I, M, 3 ble .L100 .align 4 .L91: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0 #else slwi TEMP, KK, 3 + BASE_SHIFT slwi r0, KK, 0 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE fpmr f1, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 8 #else addi TEMP, KK, 1 #endif fpmr f2, f0 srawi. r0, TEMP, 2 fpmr f3, f0 mtspr CTR, r0 ble .L94 #else srawi. 
r0, K, 2 fpmr f1, f0 addi BO, B, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 mtspr CTR, r0 ble .L94 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L93 .align 4 .L92: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 fxcsmadd f0, B1, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B1, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B1, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B1, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f0, B2, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B2, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B2, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B2, A4, f3 LFPDUX A4, AO, INC2 fxcsmadd f0, B2, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B2, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B2, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B2, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B2, BO, INC2 bdnz+ .L92 .align 4 .L93: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 fxcsmadd f0, B1, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B1, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B1, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B1, A8, f3 LFPDUX A8, AO, INC2 fxcpmadd f0, B2, A1, f0 fxcpmadd f1, B2, A2, f1 fxcpmadd f2, B2, A3, f2 fxcpmadd f3, B2, A4, f3 fxcsmadd f0, B2, A5, f0 fxcsmadd f1, B2, A6, f1 fxcsmadd f2, B2, A7, f2 fxcsmadd f3, B2, A8, f3 .align 4 .L94: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 8 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L98 LFDX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 add BO, BO, INC bdz- .L97 .align 4 .L96: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 LFDX B1, BO, INC2 LFPDUX A4, AO, INC2 add BO, BO, INC bdnz+ .L96 .align 4 .L97: fxcpmadd f0, B1, A1, f0 fxcpmadd f1, B1, A2, f1 fxcpmadd f2, B1, A3, f2 fxcpmadd f3, B1, A4, f3 .align 4 .L98: #ifndef TRMMKERNEL LFPDUX A1, CO1, INC2 LFPDUX B1, CO1, INC2 LFPDUX A3, CO1, INC2 LFPDUX A5, CO1, INC2 fxcpmadd f0, AP, f0, A1 fxcpmadd f1, AP, f1, B1 fxcpmadd f2, AP, f2, A3 fxcpmadd f3, AP, f3, A5 STFPDUX f0, CO1, INCM7 STFPDUX f1, CO1, INC2 STFPDUX f2, CO1, INC2 STFPDUX f3, CO1, INC2 #else fpmul f0, AP, f0 fpmul f1, AP, f1 fpmul f2, AP, f2 fpmul f3, AP, f3 STFPDUX f0, CO1, INC2 STFPDUX f1, CO1, INC2 STFPDUX f2, CO1, INC2 STFPDUX f3, CO1, INC2 #endif #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -8 #else addi TEMP, TEMP, -1 #endif slwi r0, TEMP, 3 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 8 #endif #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L91 .align 4 .L100: andi. 
I, M, 4 beq .L110 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 #else slwi TEMP, KK, 2 + BASE_SHIFT slwi r0, KK, 0 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 fpmr f1, f0 addi BO, BO, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 1 #endif srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L104 #else addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, K, 3 mtspr CTR, r0 ble .L104 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX B4, BO, INC2 bdz- .L103 .align 4 .L102: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcsmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f0, B2, A5, f0 LFPDUX A5, AO, INC2 fxcpmadd f1, B2, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B2, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B2, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B3, A2, f1 LFPDUX A2, AO, INC2 fxcsmadd f2, B3, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B3, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f0, B4, A5, f0 LFPDUX A5, AO, INC2 fxcpmadd f1, B4, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B4, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L102 .align 4 .L103: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcsmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 fxcpmadd f0, B2, A5, f0 LFPDUX A5, AO, INC2 fxcpmadd f1, B2, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B2, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B2, A8, f3 LFPDUX A8, AO, INC2 fxcpmadd f0, B3, A1, f0 fxcpmadd f1, B3, A2, f1 fxcsmadd f2, B3, A3, f2 fxcsmadd f3, B3, A4, f3 fxcpmadd f0, B4, A5, f0 fxcpmadd f1, B4, A6, f1 fxcsmadd f2, B4, A7, f2 fxcsmadd f3, B4, A8, f3 .align 4 .L104: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP #else andi. 
r0, K, 7 mtspr CTR, r0 #endif ble+ .L108 LFPDUX A1, AO, INC2 LFDX B1, BO, INC2 LFPDUX A2, AO, INC2 add BO, BO, INC bdz- .L107 .align 4 .L106: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFDX B1, BO, INC2 LFPDUX A2, AO, INC2 add BO, BO, INC bdnz+ .L106 .align 4 .L107: fxcpmadd f0, B1, A1, f0 fxcpmadd f1, B1, A2, f1 .align 4 .L108: #ifndef TRMMKERNEL LFPDUX A1, CO1, INC2 LFPDUX B1, CO1, INC2 fpadd f0, f0, f2 fpadd f1, f1, f3 fxcpmadd f0, AP, f0, A1 fxcpmadd f1, AP, f1, B1 STFPDUX f0, CO1, INCM3 STFPDUX f1, CO1, INC2 #else fpadd f0, f0, f2 fpadd f1, f1, f3 fpmul f0, AP, f0 fpmul f1, AP, f1 STFPDUX f0, CO1, INC2 STFPDUX f1, CO1, INC2 #endif #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -1 #endif slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L110: andi. I, M, 2 beq .L120 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 #else slwi TEMP, KK, 1 + BASE_SHIFT slwi r0, KK, 0 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 fpmr f1, f0 addi BO, BO, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L114 #else addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, K, 3 mtspr CTR, r0 ble .L114 #endif LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdz- .L113 .align 4 .L112: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcsmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f2, B2, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B2, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B3, A6, f1 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f2, B4, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L112 .align 4 .L113: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A2, f1 fxcpmadd f2, B2, A3, f2 fxcsmadd f3, B2, A4, f3 fxcpmadd f0, B3, A5, f0 fxcsmadd f1, B3, A6, f1 fxcpmadd f2, B4, A7, f2 fxcsmadd f3, B4, A8, f3 .align 4 .L114: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP #else andi. 
r0, K, 7 mtspr CTR, r0 #endif ble+ .L118 LFPDUX A1, AO, INC2 LFDX B1, BO, INC2 add BO, BO, INC bdz- .L117 .align 4 .L116: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 LFDX B1, BO, INC2 add BO, BO, INC bdnz+ .L116 .align 4 .L117: fxcpmadd f0, B1, A1, f0 .align 4 .L118: #ifndef TRMMKERNEL LFPDX A1, CO1, INC2 fpadd f0, f0, f1 fpadd f2, f3, f2 fpadd f0, f0, f2 fxcpmadd f1, AP, f0, A1 li r0, FZERO lfpsx f0, SP, r0 STFPDUX f1, CO1, INC2 #else fpadd f0, f0, f1 fpadd f2, f3, f2 fpadd f0, f0, f2 fpmul f1, AP, f0 li r0, FZERO lfpsx f0, SP, r0 STFPDUX f1, CO1, INC2 #endif #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -1 #endif slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 .L120: andi. I, M, 1 beq .L999 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 #else slwi TEMP, KK, 0 + BASE_SHIFT slwi r0, KK, 0 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 fpmr f1, f0 addi BO, BO, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L124 #else addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, K, 3 mtspr CTR, r0 ble .L124 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 bdz- .L123 .align 4 .L122: fpmadd f0, A1, B1, f0 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 fpmadd f1, A2, B2, f1 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 fpmadd f2, A3, B3, f2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 fpmadd f3, A4, B4, f3 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L122 .align 4 .L123: fpmadd f0, A1, B1, f0 fpmadd f1, A2, B2, f1 fpmadd f2, A3, B3, f2 fpmadd f3, A4, B4, f3 .align 4 .L124: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP #else andi. 
r0, K, 7 mtspr CTR, r0 #endif ble+ .L128 LFDX A1, AO, INC2 LFDX B1, BO, INC2 add AO, AO, INC add BO, BO, INC bdz- .L127 .align 4 .L126: fmadd f0, A1, B1, f0 LFDX A1, AO, INC2 LFDX B1, BO, INC2 add AO, AO, INC add BO, BO, INC bdnz+ .L126 .align 4 .L127: fmadd f0, A1, B1, f0 .align 4 .L128: #ifndef TRMMKERNEL LFDX A1, CO1, INC2 fpadd f0, f0, f1 fpadd f2, f2, f3 fpadd f0, f0, f2 fsmtp f1, f0 fadd f0, f0, f1 fmadd f0, AP, f0, A1 #else fpadd f0, f0, f1 fpadd f2, f2, f3 fpadd f0, f0, f2 fsmtp f1, f0 fadd f0, f0, f1 fpmul f0, AP, f0 #endif STFDUX f0, CO1, INC2 .align 4 .L999: addi SP, SP, 12 lwzu r14, 4(SP) lwzu r15, 4(SP) lwzu r16, 4(SP) lwzu r17, 4(SP) lwzu r18, 4(SP) lwzu r19, 4(SP) lwzu r20, 4(SP) lwzu r21, 4(SP) lwzu r22, 4(SP) lwzu r23, 4(SP) lwzu r24, 4(SP) lwzu r25, 4(SP) lwzu r26, 4(SP) lwzu r27, 4(SP) lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f31, SP, r0 lfpdux f30, SP, r0 lfpdux f29, SP, r0 lfpdux f28, SP, r0 lfpdux f27, SP, r0 lfpdux f26, SP, r0 lfpdux f25, SP, r0 lfpdux f24, SP, r0 lfpdux f23, SP, r0 lfpdux f22, SP, r0 lfpdux f21, SP, r0 lfpdux f20, SP, r0 lfpdux f19, SP, r0 lfpdux f18, SP, r0 lfpdux f17, SP, r0 lfpdux f16, SP, r0 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr .align 4 .L1000: li INCM1, -1 * SIZE li INCM3, -3 * SIZE li INCM5, -5 * SIZE li INCM7, -7 * SIZE addi C, C, - 1 * SIZE srawi. J, N, 2 ble .L1050 .align 4 .L1010: mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC add C, CO4, LDC #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif addi AO, A, -4 * SIZE li r0, FZERO lfpsx f0, SP, r0 srawi. I, M, 3 ble .L1020 .align 4 .L1011: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 #else slwi TEMP, KK, 3 + BASE_SHIFT slwi r0, KK, 2 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 8 #else addi TEMP, KK, 4 #endif srawi. TEMP, TEMP, 2 fpmr f1, f0 mtspr CTR, TEMP ble .L1014 #else addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 srawi. 
r0, K, 2 fpmr f1, f0 mtspr CTR, r0 ble .L1014 #endif LFPDUX A1, AO, INC4 fpmr f5, f0 LFPDUX A3, AO, INC4 fpmr f9, f0 LFPDUX B1, BO, INC4 fpmr f13, f0 LFPDUX A5, AO, INC4 fpmr f2, f0 LFPDUX A6, AO, INC4 fpmr f6, f0 LFPDUX B3, BO, INC4 fpmr f10, f0 LFPDUX A7, AO, INC4 fpmr f14, f0 LFPDUX A8, AO, INC4 fpmr f3, f0 LFPDUX B5, BO, INC4 fpmr f7, f0 LFPDUX A9, AO, INC4 fpmr f11, f0 LFPDUX A2, AO2, INC4 fpmr f15, f0 LFPDUX B2, BO2, INC4 bdz- .L1013 .align 4 .L1012: ## 1 ## fxcpmadd f0, B1, A1, f0 nop fxcsmadd f4, B1, A1, f4 nop fxcpmadd f8, B2, A1, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A1, f12 LFPDUX B6, BO, INC4 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 LFPDUX A10, AO, INC4 fxcsmadd f13, B2, A2, f13 nop fxcpmadd f2, B1, A3, f2 nop fxcsmadd f6, B1, A3, f6 nop fxcpmadd f10, B2, A3, f10 nop fxcsmadd f14, B2, A3, f14 nop fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 LFPDUX A1, AO, INC4 fxcsmadd f15, B2, A4, f15 nop ## 2 ## fxcpmadd f0, B3, A5, f0 nop fxcsmadd f4, B3, A5, f4 nop fxcpmadd f8, B4, A5, f8 LFPDUX B2, BO2, INC4 fxcsmadd f12, B4, A5, f12 LFPDUX B1, BO, INC4 fxcpmadd f1, B3, A2, f1 nop fxcsmadd f5, B3, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 LFPDUX A3, AO, INC4 fxcsmadd f13, B4, A2, f13 nop fxcpmadd f2, B3, A6, f2 nop fxcsmadd f6, B3, A6, f6 nop fxcpmadd f10, B4, A6, f10 nop fxcsmadd f14, B4, A6, f14 nop fxcpmadd f3, B3, A4, f3 nop fxcsmadd f7, B3, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B4, A4, f11 LFPDUX A5, AO, INC4 fxcsmadd f15, B4, A4, f15 nop ## 3 ## fxcpmadd f0, B5, A7, f0 nop fxcsmadd f4, B5, A7, f4 nop fxcpmadd f8, B2, A7, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A7, f12 LFPDUX B3, BO, INC4 fxcpmadd f1, B5, A2, f1 nop fxcsmadd f5, B5, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 LFPDUX A6, AO, INC4 fxcsmadd f13, B2, A2, f13 nop fxcpmadd f2, B5, A8, f2 nop fxcsmadd f6, B5, A8, f6 nop fxcpmadd f10, B2, A8, f10 nop fxcsmadd f14, B2, A8, f14 nop fxcpmadd f3, B5, A4, f3 nop fxcsmadd f7, B5, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 LFPDUX A7, AO, INC4 fxcsmadd f15, B2, A4, f15 nop ## 4 ## fxcpmadd f0, B6, A9, f0 nop fxcsmadd f4, B6, A9, f4 nop fxcpmadd f8, B4, A9, f8 LFPDUX B2, BO2, INC4 fxcsmadd f12, B4, A9, f12 LFPDUX B5, BO, INC4 fxcpmadd f1, B6, A2, f1 nop fxcsmadd f5, B6, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 LFPDUX A8, AO, INC4 fxcsmadd f13, B4, A2, f13 nop fxcpmadd f2, B6, A10, f2 nop fxcsmadd f6, B6, A10, f6 nop fxcpmadd f10, B4, A10, f10 nop fxcsmadd f14, B4, A10, f14 nop fxcpmadd f3, B6, A4, f3 LFPDUX A2, AO2, INC4 fxcsmadd f7, B6, A4, f7 LFPDUX A9, AO, INC4 fxcpmadd f11, B4, A4, f11 nop fxcsmadd f15, B4, A4, f15 bdnz+ .L1012 .align 4 .L1013: ## 1 ## fxcpmadd f0, B1, A1, f0 nop fxcsmadd f4, B1, A1, f4 nop fxcpmadd f8, B2, A1, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A1, f12 LFPDUX B6, BO, INC4 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 LFPDUX A10, AO, INC4 fxcsmadd f13, B2, A2, f13 nop fxcpmadd f2, B1, A3, f2 nop fxcsmadd f6, B1, A3, f6 nop fxcpmadd f10, B2, A3, f10 nop fxcsmadd f14, B2, A3, f14 nop fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 #ifndef TRMMKERNEL LFDUX A1, CO1, INC #else nop #endif fxcsmadd f15, B2, A4, f15 nop ## 2 ## fxcpmadd f0, B3, A5, f0 nop fxcsmadd f4, B3, A5, f4 nop fxcpmadd f8, B4, A5, f8 LFPDUX B2, BO2, INC4 fxcsmadd f12, B4, A5, f12 #ifndef TRMMKERNEL LFDUX B1, CO1, INC2 #else nop #endif 
fxcpmadd f1, B3, A2, f1 nop fxcsmadd f5, B3, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 #ifndef TRMMKERNEL LFDUX A3, CO1, INC2 #else nop #endif fxcsmadd f13, B4, A2, f13 nop fxcpmadd f2, B3, A6, f2 nop fxcsmadd f6, B3, A6, f6 nop fxcpmadd f10, B4, A6, f10 nop fxcsmadd f14, B4, A6, f14 nop fxcpmadd f3, B3, A4, f3 nop fxcsmadd f7, B3, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B4, A4, f11 #ifndef TRMMKERNEL LFDUX A5, CO1, INC2 #else nop #endif fxcsmadd f15, B4, A4, f15 nop ## 3 ## fxcpmadd f0, B5, A7, f0 nop fxcsmadd f4, B5, A7, f4 nop fxcpmadd f8, B2, A7, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A7, f12 #ifndef TRMMKERNEL LFSDUX A1, CO1, INCM5 #else nop #endif fxcpmadd f1, B5, A2, f1 nop fxcsmadd f5, B5, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 #ifndef TRMMKERNEL LFSDUX B1, CO1, INC2 #else nop #endif fxcsmadd f13, B2, A2, f13 nop fxcpmadd f2, B5, A8, f2 nop fxcsmadd f6, B5, A8, f6 nop fxcpmadd f10, B2, A8, f10 nop fxcsmadd f14, B2, A8, f14 nop fxcpmadd f3, B5, A4, f3 nop fxcsmadd f7, B5, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 #ifndef TRMMKERNEL LFSDUX A3, CO1, INC2 #else nop #endif fxcsmadd f15, B2, A4, f15 nop ## 4 ## fxcpmadd f0, B6, A9, f0 nop fxcsmadd f4, B6, A9, f4 nop fxcpmadd f8, B4, A9, f8 #ifndef TRMMKERNEL LFSDUX A5, CO1, INC2 #else nop #endif fxcsmadd f12, B4, A9, f12 #ifndef TRMMKERNEL LFDUX B3, CO2, INC #else nop #endif fxcpmadd f1, B6, A2, f1 nop fxcsmadd f5, B6, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 #ifndef TRMMKERNEL LFDUX A6, CO2, INC2 #else nop #endif fxcsmadd f13, B4, A2, f13 nop fxcpmadd f2, B6, A10, f2 nop fxcsmadd f6, B6, A10, f6 nop fxcpmadd f10, B4, A10, f10 nop fxcsmadd f14, B4, A10, f14 #ifndef TRMMKERNEL LFDUX A7, CO2, INC2 #else nop #endif fxcpmadd f3, B6, A4, f3 nop fxcsmadd f7, B6, A4, f7 nop fxcpmadd f11, B4, A4, f11 nop fxcsmadd f15, B4, A4, f15 #ifndef TRMMKERNEL LFDUX B2, CO2, INC2 #else nop #endif .align 4 .L1014: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 8 #else addi TEMP, KK, 4 #endif andi. r0, TEMP, 3 mtspr CTR, r0 ble+ .L1018 cmpwi cr0, TEMP, 3 bgt+ .L1015 #else andi. 
r0, K, 3 mtspr CTR, r0 ble+ .L1018 cmpwi cr0, K, 3 bgt+ .L1015 #endif #ifndef TRMMKERNEL LFDUX A1, CO1, INC fpmr f5, f0 LFDUX B1, CO1, INC2 fpmr f9, f0 LFDUX A3, CO1, INC2 fpmr f13, f0 LFDUX A5, CO1, INC2 fpmr f2, f0 LFSDUX A1, CO1, INCM5 fpmr f6, f0 LFSDUX B1, CO1, INC2 fpmr f10, f0 LFSDUX A3, CO1, INC2 fpmr f14, f0 LFSDUX A5, CO1, INC2 fpmr f3, f0 LFDUX B3, CO2, INC fpmr f7, f0 LFDUX A6, CO2, INC2 fpmr f11, f0 LFDUX A7, CO2, INC2 fpmr f15, f0 LFDUX B2, CO2, INC2 #else fpmr f5, f0 fpmr f9, f0 fpmr f13, f0 fpmr f2, f0 fpmr f6, f0 fpmr f10, f0 fpmr f14, f0 fpmr f3, f0 fpmr f7, f0 fpmr f11, f0 fpmr f15, f0 nop #endif .align 4 .L1015: LFPDUX A2, AO, INC4 LFPDUX A4, AO2, INC4 LFPDUX A10, BO, INC4 LFPDUX B4, BO2, INC4 bdz- .L1017 .align 4 .L1016: fxcpmadd f0, A10, A2, f0 fxcsmadd f4, A10, A2, f4 fxcpmadd f8, B4, A2, f8 fxcsmadd f12, B4, A2, f12 LFPDUX A2, AO, INC4 fxcpmadd f1, A10, A4, f1 fxcsmadd f5, A10, A4, f5 fxcpmadd f9, B4, A4, f9 fxcsmadd f13, B4, A4, f13 LFPDUX A4, AO2, INC4 fxcpmadd f2, A10, A2, f2 fxcsmadd f6, A10, A2, f6 fxcpmadd f10, B4, A2, f10 fxcsmadd f14, B4, A2, f14 LFPDUX A2, AO, INC4 fxcpmadd f3, A10, A4, f3 fxcsmadd f7, A10, A4, f7 LFPDUX A10, BO, INC4 fxcpmadd f11, B4, A4, f11 fxcsmadd f15, B4, A4, f15 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 bdnz+ .L1016 .align 4 .L1017: fxcpmadd f0, A10, A2, f0 fxcsmadd f4, A10, A2, f4 fxcpmadd f8, B4, A2, f8 fxcsmadd f12, B4, A2, f12 LFPDUX A2, AO, INC4 fxcpmadd f1, A10, A4, f1 fxcsmadd f5, A10, A4, f5 fxcpmadd f9, B4, A4, f9 fxcsmadd f13, B4, A4, f13 LFPDUX A4, AO2, INC4 fxcpmadd f2, A10, A2, f2 fxcsmadd f6, A10, A2, f6 fxcpmadd f10, B4, A2, f10 fxcsmadd f14, B4, A2, f14 fxcpmadd f3, A10, A4, f3 fxcsmadd f7, A10, A4, f7 fxcpmadd f11, B4, A4, f11 fxcsmadd f15, B4, A4, f15 .align 4 .L1018: #ifndef TRMMKERNEL LFSDUX B3, CO2, INCM5 LFSDUX A6, CO2, INC2 LFSDUX A7, CO2, INC2 LFSDUX B2, CO2, INC2 LFDUX B5, CO3, INC LFDUX A8, CO3, INC2 LFDUX A9, CO3, INC2 LFDUX B4, CO3, INC2 LFSDUX B5, CO3, INCM5 LFSDUX A8, CO3, INC2 LFSDUX A9, CO3, INC2 LFSDUX B4, CO3, INC2 LFDUX A2, CO4, INC LFDUX A4, CO4, INC2 fxcpmadd f0, AP, f0, A1 LFDUX A10, CO4, INC2 LFDUX A1, CO4, INC2 fxcpmadd f1, AP, f1, B1 LFSDUX A2, CO4, INCM5 LFSDUX A4, CO4, INC2 fxcpmadd f2, AP, f2, A3 LFSDUX A10, CO4, INC2 LFSDUX A1, CO4, INC2 fxcpmadd f3, AP, f3, A5 STFDUX f0, CO1, INCM7 STFSDUX f0, CO1, INC fxcpmadd f4, AP, f4, B3 STFDUX f1, CO1, INC STFSDUX f1, CO1, INC fxcpmadd f5, AP, f5, A6 STFDUX f2, CO1, INC STFSDUX f2, CO1, INC fxcpmadd f6, AP, f6, A7 STFDUX f3, CO1, INC STFSDUX f3, CO1, INC fxcpmadd f7, AP, f7, B2 STFDUX f4, CO2, INCM7 STFSDUX f4, CO2, INC fxcpmadd f8, AP, f8, B5 STFDUX f5, CO2, INC STFSDUX f5, CO2, INC fxcpmadd f9, AP, f9, A8 STFDUX f6, CO2, INC STFSDUX f6, CO2, INC fxcpmadd f10, AP, f10, A9 STFDUX f7, CO2, INC STFSDUX f7, CO2, INC fxcpmadd f11, AP, f11, B4 STFDUX f8, CO3, INCM7 STFSDUX f8, CO3, INC fxcpmadd f12, AP, f12, A2 STFDUX f9, CO3, INC STFSDUX f9, CO3, INC fxcpmadd f13, AP, f13, A4 STFDUX f10, CO3, INC STFSDUX f10, CO3, INC fxcpmadd f14, AP, f14, A10 STFDUX f11, CO3, INC STFSDUX f11, CO3, INC fxcpmadd f15, AP, f15, A1 STFDUX f12, CO4, INCM7 #else fpmul f0, AP, f0 fpmul f1, AP, f1 fpmul f2, AP, f2 fpmul f3, AP, f3 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC fpmul f4, AP, f4 STFDUX f1, CO1, INC STFSDUX f1, CO1, INC fpmul f5, AP, f5 STFDUX f2, CO1, INC STFSDUX f2, CO1, INC fpmul f6, AP, f6 STFDUX f3, CO1, INC STFSDUX f3, CO1, INC fpmul f7, AP, f7 STFDUX f4, CO2, INC STFSDUX f4, CO2, INC fpmul f8, AP, f8 STFDUX f5, CO2, INC STFSDUX f5, CO2, INC fpmul f9, AP, f9 
STFDUX f6, CO2, INC STFSDUX f6, CO2, INC fpmul f10, AP, f10 STFDUX f7, CO2, INC STFSDUX f7, CO2, INC fpmul f11, AP, f11 STFDUX f8, CO3, INC STFSDUX f8, CO3, INC fpmul f12, AP, f12 STFDUX f9, CO3, INC STFSDUX f9, CO3, INC fpmul f13, AP, f13 STFDUX f10, CO3, INC STFSDUX f10, CO3, INC fpmul f14, AP, f14 STFDUX f11, CO3, INC STFSDUX f11, CO3, INC fpmul f15, AP, f15 STFDUX f12, CO4, INC #endif STFSDUX f12, CO4, INC STFDUX f13, CO4, INC STFSDUX f13, CO4, INC STFDUX f14, CO4, INC STFSDUX f14, CO4, INC STFDUX f15, CO4, INC STFSDUX f15, CO4, INC #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -8 #else addi TEMP, TEMP, -4 #endif slwi r0, TEMP, 3 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 8 #endif #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L1011 .align 4 .L1020: andi. I, M, 4 beq .L1030 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 #else slwi TEMP, KK, 2 + BASE_SHIFT slwi r0, KK, 2 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 4 #endif srawi. TEMP, TEMP, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, TEMP fpmr f13, f0 ble .L1024 #else addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 srawi. r0, K, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, r0 fpmr f13, f0 ble .L1024 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX B3, BO, INC4 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 LFPDUX A5, AO, INC4 LFPDUX B5, BO, INC4 LFPDUX A6, AO2, INC4 LFPDUX B6, BO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A9, BO, INC4 LFPDUX A10, BO2, INC4 bdz- .L1023 .align 4 .L1022: fxcpmadd f0, B1, A1, f0 nop fxcsmadd f4, B1, A1, f4 LFPDUX A8, AO2, INC4 fxcpmadd f8, B2, A1, f8 nop fxcsmadd f12, B2, A1, f12 LFPDUX A1, AO, INC4 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX B1, BO, INC4 fxcpmadd f9, B2, A2, f9 nop fxcsmadd f13, B2, A2, f13 LFPDUX B2, BO2, INC4 fxcpmadd f0, B3, A3, f0 nop fxcsmadd f4, B3, A3, f4 LFPDUX A2, AO2, INC4 fxcpmadd f8, B4, A3, f8 nop fxcsmadd f12, B4, A3, f12 LFPDUX A3, AO, INC4 fxcpmadd f1, B3, A4, f1 nop fxcsmadd f5, B3, A4, f5 LFPDUX B3, BO, INC4 fxcpmadd f9, B4, A4, f9 nop fxcsmadd f13, B4, A4, f13 LFPDUX B4, BO2, INC4 fxcpmadd f0, B5, A5, f0 nop fxcsmadd f4, B5, A5, f4 LFPDUX A4, AO2, INC4 fxcpmadd f8, B6, A5, f8 nop fxcsmadd f12, B6, A5, f12 LFPDUX A5, AO, INC4 fxcpmadd f1, B5, A6, f1 nop fxcsmadd f5, B5, A6, f5 LFPDUX B5, BO, INC4 fxcpmadd f9, B6, A6, f9 nop fxcsmadd f13, B6, A6, f13 LFPDUX B6, BO2, INC4 fxcpmadd f0, A9, A7, f0 nop fxcsmadd f4, A9, A7, f4 LFPDUX A6, AO2, INC4 fxcpmadd f8, A10, A7, f8 nop fxcsmadd f12, A10, A7, f12 LFPDUX A7, AO, INC4 fxcpmadd f1, A9, A8, f1 nop fxcsmadd f5, A9, A8, f5 LFPDUX A9, BO, INC4 fxcpmadd f9, A10, A8, f9 nop fxcsmadd f13, A10, A8, f13 LFPDUX A10, BO2, INC4 bdnz+ .L1022 .align 4 .L1023: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 LFPDUX A8, AO2, INC4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 fxcpmadd f1, B1, A2, f1 fxcsmadd 
f5, B1, A2, f5 fxcpmadd f9, B2, A2, f9 fxcsmadd f13, B2, A2, f13 fxcpmadd f0, B3, A3, f0 fxcsmadd f4, B3, A3, f4 fxcpmadd f8, B4, A3, f8 fxcsmadd f12, B4, A3, f12 fxcpmadd f1, B3, A4, f1 fxcsmadd f5, B3, A4, f5 fxcpmadd f9, B4, A4, f9 fxcsmadd f13, B4, A4, f13 fxcpmadd f0, B5, A5, f0 fxcsmadd f4, B5, A5, f4 fxcpmadd f8, B6, A5, f8 fxcsmadd f12, B6, A5, f12 fxcpmadd f1, B5, A6, f1 fxcsmadd f5, B5, A6, f5 fxcpmadd f9, B6, A6, f9 fxcsmadd f13, B6, A6, f13 fxcpmadd f0, A9, A7, f0 fxcsmadd f4, A9, A7, f4 fxcpmadd f8, A10, A7, f8 fxcsmadd f12, A10, A7, f12 fxcpmadd f1, A9, A8, f1 fxcsmadd f5, A9, A8, f5 fxcpmadd f9, A10, A8, f9 fxcsmadd f13, A10, A8, f13 .align 4 .L1024: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L1028 LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 bdz- .L1027 .align 4 .L1026: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 LFPDUX A1, AO, INC4 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 LFPDUX B1, BO, INC4 fxcpmadd f9, B2, A2, f9 fxcsmadd f13, B2, A2, f13 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 bdnz+ .L1026 .align 4 .L1027: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 fxcpmadd f9, B2, A2, f9 fxcsmadd f13, B2, A2, f13 .align 4 .L1028: #ifndef TRMMKERNEL LFDUX A1, CO1, INC LFDUX B1, CO1, INC2 LFDUX B3, CO2, INC LFDUX A6, CO2, INC2 LFSDUX A1, CO1, INCM1 LFSDUX B1, CO1, INC2 LFSDUX B3, CO2, INCM1 LFSDUX A6, CO2, INC2 LFDUX B5, CO3, INC LFDUX A8, CO3, INC2 LFDUX A2, CO4, INC LFDUX A4, CO4, INC2 fxcpmadd f0, AP, f0, A1 LFSDUX B5, CO3, INCM1 LFSDUX A8, CO3, INC2 fxcpmadd f1, AP, f1, B1 LFSDUX A2, CO4, INCM1 LFSDUX A4, CO4, INC2 fxcpmadd f4, AP, f4, B3 STFDUX f0, CO1, INCM3 STFSDUX f0, CO1, INC fxcpmadd f5, AP, f5, A6 STFDUX f1, CO1, INC STFSDUX f1, CO1, INC fxcpmadd f8, AP, f8, B5 STFDUX f4, CO2, INCM3 STFSDUX f4, CO2, INC fxcpmadd f9, AP, f9, A8 STFDUX f5, CO2, INC STFSDUX f5, CO2, INC fxcpmadd f12, AP, f12, A2 STFDUX f8, CO3, INCM3 STFSDUX f8, CO3, INC fxcpmadd f13, AP, f13, A4 STFDUX f9, CO3, INC STFSDUX f9, CO3, INC STFDUX f12, CO4, INCM3 STFSDUX f12, CO4, INC STFDUX f13, CO4, INC STFSDUX f13, CO4, INC #else fpmul f0, AP, f0 fpmul f1, AP, f1 fpmul f4, AP, f4 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC fpmul f5, AP, f5 STFDUX f1, CO1, INC STFSDUX f1, CO1, INC fpmul f8, AP, f8 STFDUX f4, CO2, INC STFSDUX f4, CO2, INC fpmul f9, AP, f9 STFDUX f5, CO2, INC STFSDUX f5, CO2, INC fpmul f12, AP, f12 STFDUX f8, CO3, INC STFSDUX f8, CO3, INC fpmul f13, AP, f13 STFDUX f9, CO3, INC STFSDUX f9, CO3, INC STFDUX f12, CO4, INC STFSDUX f12, CO4, INC STFDUX f13, CO4, INC STFSDUX f13, CO4, INC #endif #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -4 #endif slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L1030: andi. 
I, M, 2 beq .L1040 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 #else slwi TEMP, KK, 1 + BASE_SHIFT slwi r0, KK, 2 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, BO, - 4 * SIZE fpmr f2, f0 addi BO2, BO, 2 * SIZE fpmr f3, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 4 #endif srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L1034 #else addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 srawi. r0, K, 2 mtspr CTR, r0 ble .L1034 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 LFPDUX A2, AO2, INC4 LFPDUX B3, BO, INC4 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A5, BO, INC4 LFPDUX A6, BO2, INC4 LFPDUX A4, AO2, INC4 LFPDUX A7, BO, INC4 LFPDUX A8, BO2, INC4 bdz- .L1033 .align 4 .L1032: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 LFPDUX B1, BO, INC4 fxcpmadd f2, B2, A1, f2 fxcsmadd f3, B2, A1, f3 LFPDUX B2, BO2, INC4 LFPDUX A1, AO, INC4 fxcpmadd f0, B3, A2, f0 fxcsmadd f1, B3, A2, f1 LFPDUX B3, BO, INC4 fxcpmadd f2, B4, A2, f2 fxcsmadd f3, B4, A2, f3 LFPDUX B4, BO2, INC4 LFPDUX A2, AO2, INC4 fxcpmadd f0, A5, A3, f0 fxcsmadd f1, A5, A3, f1 LFPDUX A5, BO, INC4 fxcpmadd f2, A6, A3, f2 fxcsmadd f3, A6, A3, f3 LFPDUX A6, BO2, INC4 LFPDUX A3, AO, INC4 fxcpmadd f0, A7, A4, f0 fxcsmadd f1, A7, A4, f1 LFPDUX A7, BO, INC4 fxcpmadd f2, A8, A4, f2 fxcsmadd f3, A8, A4, f3 LFPDUX A8, BO2, INC4 LFPDUX A4, AO2, INC4 bdnz+ .L1032 .align 4 .L1033: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 fxcpmadd f2, B2, A1, f2 fxcsmadd f3, B2, A1, f3 fxcpmadd f0, B3, A2, f0 fxcsmadd f1, B3, A2, f1 fxcpmadd f2, B4, A2, f2 fxcsmadd f3, B4, A2, f3 fxcpmadd f0, A5, A3, f0 fxcsmadd f1, A5, A3, f1 fxcpmadd f2, A6, A3, f2 fxcsmadd f3, A6, A3, f3 fxcpmadd f0, A7, A4, f0 fxcsmadd f1, A7, A4, f1 fxcpmadd f2, A8, A4, f2 fxcsmadd f3, A8, A4, f3 .align 4 .L1034: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. 
r0, K, 3 mtspr CTR, r0 #endif ble+ .L1038 LFPDX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdz- .L1037 .align 4 .L1036: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 LFPDUX B1, BO, INC4 fxcpmadd f2, B2, A1, f2 fxcsmadd f3, B2, A1, f3 LFPDX A1, AO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdnz+ .L1036 .align 4 .L1037: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 fxcpmadd f2, B2, A1, f2 fxcsmadd f3, B2, A1, f3 .align 4 .L1038: #ifndef TRMMKERNEL LFDUX A1, CO1, INC LFDUX A2, CO2, INC LFDUX A3, CO3, INC LFDUX A4, CO4, INC LFSDUX A1, CO1, INC LFSDUX A2, CO2, INC LFSDUX A3, CO3, INC LFSDUX A4, CO4, INC fxcpmadd f0, AP, f0, A1 fxcpmadd f1, AP, f1, A2 fxcpmadd f2, AP, f2, A3 fxcpmadd f3, AP, f3, A4 STFDUX f0, CO1, INCM1 STFSDUX f0, CO1, INC STFDUX f1, CO2, INCM1 STFSDUX f1, CO2, INC STFDUX f2, CO3, INCM1 STFSDUX f2, CO3, INC STFDUX f3, CO4, INCM1 STFSDUX f3, CO4, INC #else fpmul f0, AP, f0 fpmul f1, AP, f1 fpmul f2, AP, f2 fpmul f3, AP, f3 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO2, INC STFSDUX f1, CO2, INC STFDUX f2, CO3, INC STFSDUX f2, CO3, INC STFDUX f3, CO4, INC STFSDUX f3, CO4, INC #endif #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -4 #endif slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L1040: andi. I, M, 1 beq .L1049 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 #else slwi TEMP, KK, 0 + BASE_SHIFT slwi r0, KK, 2 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, BO, - 4 * SIZE fpmr f2, f0 addi BO2, BO, 2 * SIZE fpmr f3, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 4 #endif srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L1044 #else addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 srawi. 
r0, K, 3 mtspr CTR, r0 ble .L1044 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 LFPDUX A2, AO2, INC4 LFPDUX B3, BO, INC4 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A5, BO, INC4 LFPDUX A6, BO2, INC4 LFPDUX A4, AO2, INC4 LFPDUX A7, BO, INC4 LFPDUX A8, BO2, INC4 bdz- .L1043 .align 4 .L1042: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A1, B2, f1 LFPDUX B2, BO2, INC4 fxcsmadd f2, A1, B3, f2 LFPDUX B3, BO, INC4 fxcsmadd f3, A1, B4, f3 LFPDUX B4, BO2, INC4 LFPDUX A1, AO, INC4 fxcpmadd f0, A2, A5, f0 LFPDUX A5, BO, INC4 fxcpmadd f1, A2, A6, f1 LFPDUX A6, BO2, INC4 fxcsmadd f2, A2, A7, f2 LFPDUX A7, BO, INC4 fxcsmadd f3, A2, A8, f3 LFPDUX A8, BO2, INC4 LFPDUX A2, AO2, INC4 fxcpmadd f0, A3, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A3, B2, f1 LFPDUX B2, BO2, INC4 fxcsmadd f2, A3, B3, f2 LFPDUX B3, BO, INC4 fxcsmadd f3, A3, B4, f3 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 fxcpmadd f0, A4, A5, f0 LFPDUX A5, BO, INC4 fxcpmadd f1, A4, A6, f1 LFPDUX A6, BO2, INC4 fxcsmadd f2, A4, A7, f2 LFPDUX A7, BO, INC4 fxcsmadd f3, A4, A8, f3 LFPDUX A8, BO2, INC4 LFPDUX A4, AO2, INC4 bdnz+ .L1042 .align 4 .L1043: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A1, B2, f1 LFPDUX B2, BO2, INC4 fxcsmadd f2, A1, B3, f2 LFPDUX B3, BO, INC4 fxcsmadd f3, A1, B4, f3 LFPDUX B4, BO2, INC4 fxcpmadd f0, A2, A5, f0 LFPDUX A5, BO, INC4 fxcpmadd f1, A2, A6, f1 LFPDUX A6, BO2, INC4 fxcsmadd f2, A2, A7, f2 LFPDUX A7, BO, INC4 fxcsmadd f3, A2, A8, f3 LFPDUX A8, BO2, INC4 fxcpmadd f0, A3, B1, f0 fxcpmadd f1, A3, B2, f1 fxcsmadd f2, A3, B3, f2 fxcsmadd f3, A3, B4, f3 fxcpmadd f0, A4, A5, f0 fxcpmadd f1, A4, A6, f1 fxcsmadd f2, A4, A7, f2 fxcsmadd f3, A4, A8, f3 .align 4 .L1044: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP #else andi. r0, K, 7 mtspr CTR, r0 #endif ble+ .L1048 LFDX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC bdz- .L1047 .align 4 .L1046: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A1, B2, f1 LFDX A1, AO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC bdnz+ .L1046 .align 4 .L1047: fxcpmadd f0, A1, B1, f0 fxcpmadd f1, A1, B2, f1 .align 4 .L1048: #ifndef TRMMKERNEL LFDX A1, CO1, INC LFDX B3, CO3, INC LFSDX A1, CO2, INC LFSDX B3, CO4, INC fpadd f0, f0, f2 fpadd f1, f1, f3 fxcpmadd f0, AP, f0, A1 fxcpmadd f1, AP, f1, B3 #else fpadd f0, f0, f2 fpadd f1, f1, f3 fpmul f0, AP, f0 fpmul f1, AP, f1 #endif STFDUX f0, CO1, INC STFSDUX f0, CO2, INC STFDUX f1, CO3, INC STFSDUX f1, CO4, INC #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -4 #endif slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 1 #endif #endif .align 4 .L1049: #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 4 #endif addi B, BO, 4 * SIZE addic. J, J, -1 bgt+ .L1010 .align 4 .L1050: andi. J, N, 2 beq .L1090 mr CO1, C add CO2, C, LDC add C, CO2, LDC #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif addi AO, A, -2 * SIZE li r0, FZERO lfpsx f0, SP, r0 srawi. 
I, M, 3 ble .L1060 .align 4 .L1051: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) fpmr f4, f0 addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 #else slwi TEMP, KK, 3 + BASE_SHIFT slwi r0, KK, 1 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 fpmr f4, f0 addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 8 #else addi TEMP, KK, 2 #endif srawi. r0, TEMP, 2 fpmr f3, f0 mtspr CTR, r0 fpmr f7, f0 ble .L1054 #else fpmr f4, f0 addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 srawi. r0, K, 2 fpmr f3, f0 mtspr CTR, r0 fpmr f7, f0 ble .L1054 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L1053 .align 4 .L1052: fxcpmadd f0, B1, A1, f0 LFPDUX B4, BO, INC2 fxcsmadd f4, B1, A1, f4 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 nop fxcsmadd f6, B1, A3, f6 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A4, AO, INC2 fxcpmadd f0, B2, A5, f0 LFPDUX B1, BO, INC2 fxcsmadd f4, B2, A5, f4 LFPDUX A5, AO, INC2 fxcpmadd f1, B2, A6, f1 nop fxcsmadd f5, B2, A6, f5 LFPDUX A6, AO, INC2 fxcpmadd f2, B2, A7, f2 nop fxcsmadd f6, B2, A7, f6 LFPDUX A7, AO, INC2 fxcpmadd f3, B2, A8, f3 nop fxcsmadd f7, B2, A8, f7 LFPDUX A8, AO, INC2 fxcpmadd f0, B3, A1, f0 LFPDUX B2, BO, INC2 fxcsmadd f4, B3, A1, f4 LFPDUX A1, AO, INC2 fxcpmadd f1, B3, A2, f1 nop fxcsmadd f5, B3, A2, f5 LFPDUX A2, AO, INC2 fxcpmadd f2, B3, A3, f2 nop fxcsmadd f6, B3, A3, f6 LFPDUX A3, AO, INC2 fxcpmadd f3, B3, A4, f3 nop fxcsmadd f7, B3, A4, f7 LFPDUX A4, AO, INC2 fxcpmadd f0, B4, A5, f0 LFPDUX B3, BO, INC2 fxcsmadd f4, B4, A5, f4 LFPDUX A5, AO, INC2 fxcpmadd f1, B4, A6, f1 nop fxcsmadd f5, B4, A6, f5 LFPDUX A6, AO, INC2 fxcpmadd f2, B4, A7, f2 nop fxcsmadd f6, B4, A7, f6 LFPDUX A7, AO, INC2 fxcpmadd f3, B4, A8, f3 nop fxcsmadd f7, B4, A8, f7 LFPDUX A8, AO, INC2 bdnz+ .L1052 .align 4 .L1053: fxcpmadd f0, B1, A1, f0 LFPDUX B4, BO, INC2 fxcsmadd f4, B1, A1, f4 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 nop fxcsmadd f6, B1, A3, f6 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A4, AO, INC2 fxcpmadd f0, B2, A5, f0 nop fxcsmadd f4, B2, A5, f4 LFPDUX A5, AO, INC2 fxcpmadd f1, B2, A6, f1 nop fxcsmadd f5, B2, A6, f5 LFPDUX A6, AO, INC2 fxcpmadd f2, B2, A7, f2 nop fxcsmadd f6, B2, A7, f6 LFPDUX A7, AO, INC2 fxcpmadd f3, B2, A8, f3 nop fxcsmadd f7, B2, A8, f7 LFPDUX A8, AO, INC2 fxcpmadd f0, B3, A1, f0 fxcsmadd f4, B3, A1, f4 fxcpmadd f1, B3, A2, f1 fxcsmadd f5, B3, A2, f5 fxcpmadd f2, B3, A3, f2 fxcsmadd f6, B3, A3, f6 fxcpmadd f3, B3, A4, f3 fxcsmadd f7, B3, A4, f7 fxcpmadd f0, B4, A5, f0 fxcsmadd f4, B4, A5, f4 fxcpmadd f1, B4, A6, f1 fxcsmadd f5, B4, A6, f5 fxcpmadd f2, B4, A7, f2 fxcsmadd f6, B4, A7, f6 fxcpmadd f3, B4, A8, f3 fxcsmadd f7, B4, A8, f7 .align 4 .L1054: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 8 #else addi TEMP, KK, 2 #endif andi. 
TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L1058 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 bdz- .L1057 .align 4 .L1056: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 fxcsmadd f6, B1, A3, f6 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 fxcsmadd f7, B1, A4, f7 LFPDUX A4, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L1056 .align 4 .L1057: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 fxcpmadd f2, B1, A3, f2 fxcsmadd f6, B1, A3, f6 fxcpmadd f3, B1, A4, f3 fxcsmadd f7, B1, A4, f7 .align 4 .L1058: #ifndef TRMMKERNEL LFDUX A1, CO1, INC LFDUX B1, CO1, INC2 LFDUX A3, CO1, INC2 LFDUX A5, CO1, INC2 LFSDUX A1, CO1, INCM5 LFSDUX B1, CO1, INC2 LFSDUX A3, CO1, INC2 LFSDUX A5, CO1, INC2 LFDUX B3, CO2, INC LFDUX A6, CO2, INC2 LFDUX A7, CO2, INC2 LFDUX B2, CO2, INC2 fxcpmadd f0, AP, f0, A1 LFSDUX B3, CO2, INCM5 LFSDUX A6, CO2, INC2 fxcpmadd f1, AP, f1, B1 LFSDUX A7, CO2, INC2 LFSDUX B2, CO2, INC2 fxcpmadd f2, AP, f2, A3 STFDUX f0, CO1, INCM7 STFSDUX f0, CO1, INC fxcpmadd f3, AP, f3, A5 STFDUX f1, CO1, INC STFSDUX f1, CO1, INC fxcpmadd f4, AP, f4, B3 STFDUX f2, CO1, INC STFSDUX f2, CO1, INC fxcpmadd f5, AP, f5, A6 STFDUX f3, CO1, INC STFSDUX f3, CO1, INC fxcpmadd f6, AP, f6, A7 STFDUX f4, CO2, INCM7 STFSDUX f4, CO2, INC fxcpmadd f7, AP, f7, B2 STFDUX f5, CO2, INC STFSDUX f5, CO2, INC STFDUX f6, CO2, INC STFSDUX f6, CO2, INC STFDUX f7, CO2, INC STFSDUX f7, CO2, INC #else fpmul f0, AP, f0 fpmul f1, AP, f1 fpmul f2, AP, f2 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC fpmul f3, AP, f3 STFDUX f1, CO1, INC STFSDUX f1, CO1, INC fpmul f4, AP, f4 STFDUX f2, CO1, INC STFSDUX f2, CO1, INC fpmul f5, AP, f5 STFDUX f3, CO1, INC STFSDUX f3, CO1, INC fpmul f6, AP, f6 STFDUX f4, CO2, INC STFSDUX f4, CO2, INC fpmul f7, AP, f7 STFDUX f5, CO2, INC STFSDUX f5, CO2, INC STFDUX f6, CO2, INC STFSDUX f6, CO2, INC STFDUX f7, CO2, INC STFSDUX f7, CO2, INC #endif #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -8 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 3 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 8 #endif #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L1051 .align 4 .L1060: andi. I, M, 4 beq .L1070 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0 #else slwi TEMP, KK, 2 + BASE_SHIFT slwi r0, KK, 1 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE fpmr f1, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 2 #endif fpmr f2, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 fpmr f3, f0 ble .L1064 #else srawi. 
r0, K, 2 fpmr f1, f0 addi BO, B, - 2 * SIZE fpmr f2, f0 mtspr CTR, r0 fpmr f3, f0 ble .L1064 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L1063 .align 4 .L1062: fxcpmadd f0, B1, A1, f0 fxcsmadd f2, B1, A1, f2 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 fxcsmadd f3, B1, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f0, B2, A3, f0 fxcsmadd f2, B2, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f1, B2, A4, f1 fxcsmadd f3, B2, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A5, f0 fxcsmadd f2, B3, A5, f2 LFPDUX A5, AO, INC2 fxcpmadd f1, B3, A6, f1 fxcsmadd f3, B3, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f0, B4, A7, f0 fxcsmadd f2, B4, A7, f2 LFPDUX A7, AO, INC2 fxcpmadd f1, B4, A8, f1 fxcsmadd f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L1062 .align 4 .L1063: fxcpmadd f0, B1, A1, f0 fxcsmadd f2, B1, A1, f2 fxcpmadd f1, B1, A2, f1 fxcsmadd f3, B1, A2, f3 fxcpmadd f0, B2, A3, f0 fxcsmadd f2, B2, A3, f2 fxcpmadd f1, B2, A4, f1 fxcsmadd f3, B2, A4, f3 fxcpmadd f0, B3, A5, f0 fxcsmadd f2, B3, A5, f2 fxcpmadd f1, B3, A6, f1 fxcsmadd f3, B3, A6, f3 fxcpmadd f0, B4, A7, f0 fxcsmadd f2, B4, A7, f2 fxcpmadd f1, B4, A8, f1 fxcsmadd f3, B4, A8, f3 .align 4 .L1064: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L1068 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdz- .L1067 .align 4 .L1066: fxcpmadd f0, B1, A1, f0 fxcsmadd f2, B1, A1, f2 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 fxcsmadd f3, B1, A2, f3 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdnz+ .L1066 .align 4 .L1067: fxcpmadd f0, B1, A1, f0 fxcsmadd f2, B1, A1, f2 fxcpmadd f1, B1, A2, f1 fxcsmadd f3, B1, A2, f3 .align 4 .L1068: #ifndef TRMMKERNEL LFDUX A1, CO1, INC LFDUX A2, CO1, INC2 LFDUX A3, CO2, INC LFDUX A4, CO2, INC2 LFSDUX A1, CO1, INCM1 LFSDUX A2, CO1, INC2 LFSDUX A3, CO2, INCM1 LFSDUX A4, CO2, INC2 fxcpmadd f0, AP, f0, A1 fxcpmadd f1, AP, f1, A2 fxcpmadd f2, AP, f2, A3 STFDUX f0, CO1, INCM3 STFSDUX f0, CO1, INC fxcpmadd f3, AP, f3, A4 STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO2, INCM3 STFSDUX f2, CO2, INC STFDUX f3, CO2, INC STFSDUX f3, CO2, INC #else fpmul f0, AP, f0 fpmul f1, AP, f1 fpmul f2, AP, f2 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC fpmul f3, AP, f3 STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO2, INC STFSDUX f2, CO2, INC STFDUX f3, CO2, INC STFSDUX f3, CO2, INC #endif #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L1070: andi. 
I, M, 2 beq .L1080 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0 #else slwi TEMP, KK, 1 + BASE_SHIFT slwi r0, KK, 1 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE fpmr f1, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif srawi. r0, TEMP, 3 fpmr f2, f0 mtspr CTR, r0 fpmr f3, f0 ble .L1074 #else addi BO, B, - 2 * SIZE fpmr f1, f0 srawi. r0, K, 3 fpmr f2, f0 mtspr CTR, r0 fpmr f3, f0 ble .L1074 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdz- .L1073 .align 4 .L1072: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f2, B2, A2, f2 fxcsmadd f3, B2, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A3, f0 fxcsmadd f1, B3, A3, f1 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f2, B4, A4, f2 fxcsmadd f3, B4, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 fxcpmadd f0, B5, A5, f0 fxcsmadd f1, B5, A5, f1 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 fxcpmadd f2, B6, A6, f2 fxcsmadd f3, B6, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 fxcpmadd f0, A9, A7, f0 fxcsmadd f1, A9, A7, f1 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 fxcpmadd f2, A10, A8, f2 fxcsmadd f3, A10, A8, f3 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdnz+ .L1072 .align 4 .L1073: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 fxcpmadd f2, B2, A2, f2 fxcsmadd f3, B2, A2, f3 fxcpmadd f0, B3, A3, f0 fxcsmadd f1, B3, A3, f1 fxcpmadd f2, B4, A4, f2 fxcsmadd f3, B4, A4, f3 fxcpmadd f0, B5, A5, f0 fxcsmadd f1, B5, A5, f1 fxcpmadd f2, B6, A6, f2 fxcsmadd f3, B6, A6, f3 fxcpmadd f0, A9, A7, f0 fxcsmadd f1, A9, A7, f1 fxcpmadd f2, A10, A8, f2 fxcsmadd f3, A10, A8, f3 .align 4 .L1074: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP #else andi. r0, K, 7 mtspr CTR, r0 #endif ble+ .L1078 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdz- .L1077 .align 4 .L1076: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L1076 .align 4 .L1077: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 .align 4 .L1078: #ifndef TRMMKERNEL LFDUX A1, CO1, INC LFDUX B3, CO2, INC LFSDUX A1, CO1, INC LFSDUX B3, CO2, INC fpadd f0, f0, f2 fpadd f1, f1, f3 fxcpmadd f0, AP, f0, A1 fxcpmadd f1, AP, f1, B3 STFDUX f0, CO1, INCM1 STFSDUX f0, CO1, INC STFDUX f1, CO2, INCM1 STFSDUX f1, CO2, INC #else fpadd f0, f0, f2 fpadd f1, f1, f3 fpmul f0, AP, f0 fpmul f1, AP, f1 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO2, INC STFSDUX f1, CO2, INC #endif #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L1080: andi. 
I, M, 1 beq .L1089 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 #else slwi TEMP, KK, 0 + BASE_SHIFT slwi r0, KK, 1 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L1084 #else addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, K, 3 mtspr CTR, r0 ble .L1084 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX B4, BO, INC2 bdz- .L1083 .align 4 .L1082: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC2 fxcsmadd f1, A1, B2, f1 LFPDUX B2, BO, INC2 LFPDUX A1, AO, INC2 fxcpmadd f2, A2, B3, f2 LFPDUX B3, BO, INC2 fxcsmadd f3, A2, B4, f3 LFPDUX B4, BO, INC2 LFPDUX A2, AO, INC2 fxcpmadd f0, A3, B1, f0 LFPDUX B1, BO, INC2 fxcsmadd f1, A3, B2, f1 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 fxcpmadd f2, A4, B3, f2 LFPDUX B3, BO, INC2 fxcsmadd f3, A4, B4, f3 LFPDUX B4, BO, INC2 LFPDUX A4, AO, INC2 bdnz+ .L1082 .align 4 .L1083: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC2 fxcsmadd f1, A1, B2, f1 LFPDUX B2, BO, INC2 fxcpmadd f2, A2, B3, f2 LFPDUX B3, BO, INC2 fxcsmadd f3, A2, B4, f3 LFPDUX B4, BO, INC2 fxcpmadd f0, A3, B1, f0 fxcsmadd f1, A3, B2, f1 fxcpmadd f2, A4, B3, f2 fxcsmadd f3, A4, B4, f3 .align 4 .L1084: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP #else andi. r0, K, 7 mtspr CTR, r0 #endif ble+ .L1088 LFDX A1, AO, INC2 LFPDUX B1, BO, INC2 add AO, AO, INC bdz- .L1087 .align 4 .L1086: fxcpmadd f0, A1, B1, f0 LFDX A1, AO, INC2 LFPDUX B1, BO, INC2 add AO, AO, INC bdnz+ .L1086 .align 4 .L1087: fxcpmadd f0, A1, B1, f0 .align 4 .L1088: #ifndef TRMMKERNEL LFDX A1, CO1, INC LFDX A2, CO2, INC fpadd f0, f0, f1 fpadd f2, f2, f3 fsmfp A1, A2 fpadd f0, f0, f2 fxcpmadd f0, AP, f0, A1 #else fpadd f0, f0, f1 fpadd f2, f2, f3 fsmfp A1, A2 fpadd f0, f0, f2 fpmul f0, AP, f0 #endif STFDUX f0, CO1, INC STFSDUX f0, CO2, INC #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 1 #endif #endif .align 4 .L1089: #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 2 #endif addi B, BO, 2 * SIZE .align 4 .L1090: andi. J, N, 1 beq .L10999 #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif mr CO1, C addi AO, A, -2 * SIZE li r0, FZERO lfpsx f0, SP, r0 srawi. I, M, 3 ble .L10100 .align 4 .L1091: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0 #else slwi TEMP, KK, 3 + BASE_SHIFT slwi r0, KK, 0 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE fpmr f1, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 8 #else addi TEMP, KK, 1 #endif fpmr f2, f0 srawi. 
r0, TEMP, 2 fpmr f3, f0 mtspr CTR, r0 ble .L1094 #else srawi. r0, K, 2 fpmr f1, f0 addi BO, B, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 mtspr CTR, r0 ble .L1094 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L1093 .align 4 .L1092: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 fxcsmadd f0, B1, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B1, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B1, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B1, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f0, B2, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B2, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B2, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B2, A4, f3 LFPDUX A4, AO, INC2 fxcsmadd f0, B2, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B2, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B2, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B2, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B2, BO, INC2 bdnz+ .L1092 .align 4 .L1093: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 fxcsmadd f0, B1, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B1, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B1, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B1, A8, f3 LFPDUX A8, AO, INC2 fxcpmadd f0, B2, A1, f0 fxcpmadd f1, B2, A2, f1 fxcpmadd f2, B2, A3, f2 fxcpmadd f3, B2, A4, f3 fxcsmadd f0, B2, A5, f0 fxcsmadd f1, B2, A6, f1 fxcsmadd f2, B2, A7, f2 fxcsmadd f3, B2, A8, f3 .align 4 .L1094: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 8 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L1098 LFDX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 add BO, BO, INC bdz- .L1097 .align 4 .L1096: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 LFDX B1, BO, INC2 LFPDUX A4, AO, INC2 add BO, BO, INC bdnz+ .L1096 .align 4 .L1097: fxcpmadd f0, B1, A1, f0 fxcpmadd f1, B1, A2, f1 fxcpmadd f2, B1, A3, f2 fxcpmadd f3, B1, A4, f3 .align 4 .L1098: #ifndef TRMMKERNEL LFDUX A1, CO1, INC LFDUX B1, CO1, INC2 LFDUX A3, CO1, INC2 LFDUX A5, CO1, INC2 LFSDUX A1, CO1, INCM5 LFSDUX B1, CO1, INC2 LFSDUX A3, CO1, INC2 LFSDUX A5, CO1, INC2 fxcpmadd f0, AP, f0, A1 fxcpmadd f1, AP, f1, B1 fxcpmadd f2, AP, f2, A3 STFDUX f0, CO1, INCM7 STFSDUX f0, CO1, INC fxcpmadd f3, AP, f3, A5 #else fpmul f0, AP, f0 fpmul f1, AP, f1 fpmul f2, AP, f2 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC fpmul f3, AP, f3 #endif STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -8 #else addi TEMP, TEMP, -1 #endif slwi r0, TEMP, 3 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 8 #endif #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L1091 .align 4 .L10100: andi. 
I, M, 4 beq .L10110 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 #else slwi TEMP, KK, 2 + BASE_SHIFT slwi r0, KK, 0 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 fpmr f1, f0 addi BO, BO, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 1 #endif srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L10104 #else addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, K, 3 mtspr CTR, r0 ble .L10104 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX B4, BO, INC2 bdz- .L10103 .align 4 .L10102: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcsmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f0, B2, A5, f0 LFPDUX A5, AO, INC2 fxcpmadd f1, B2, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B2, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B2, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B3, A2, f1 LFPDUX A2, AO, INC2 fxcsmadd f2, B3, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B3, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f0, B4, A5, f0 LFPDUX A5, AO, INC2 fxcpmadd f1, B4, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B4, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L10102 .align 4 .L10103: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcsmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 fxcpmadd f0, B2, A5, f0 LFPDUX A5, AO, INC2 fxcpmadd f1, B2, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B2, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B2, A8, f3 LFPDUX A8, AO, INC2 fxcpmadd f0, B3, A1, f0 fxcpmadd f1, B3, A2, f1 fxcsmadd f2, B3, A3, f2 fxcsmadd f3, B3, A4, f3 fxcpmadd f0, B4, A5, f0 fxcpmadd f1, B4, A6, f1 fxcsmadd f2, B4, A7, f2 fxcsmadd f3, B4, A8, f3 .align 4 .L10104: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP #else andi. 
r0, K, 7 mtspr CTR, r0 #endif ble+ .L10108 LFPDUX A1, AO, INC2 LFDX B1, BO, INC2 LFPDUX A2, AO, INC2 add BO, BO, INC bdz- .L10107 .align 4 .L10106: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFDX B1, BO, INC2 LFPDUX A2, AO, INC2 add BO, BO, INC bdnz+ .L10106 .align 4 .L10107: fxcpmadd f0, B1, A1, f0 fxcpmadd f1, B1, A2, f1 .align 4 .L10108: #ifndef TRMMKERNEL LFDUX A1, CO1, INC LFDUX B1, CO1, INC2 LFSDUX A1, CO1, INCM1 LFSDUX B1, CO1, INC2 fpadd f0, f0, f2 fpadd f1, f1, f3 fxcpmadd f0, AP, f0, A1 fxcpmadd f1, AP, f1, B1 STFDUX f0, CO1, INCM3 STFSDUX f0, CO1, INC #else fpadd f0, f0, f2 fpadd f1, f1, f3 fpmul f0, AP, f0 fpmul f1, AP, f1 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC #endif STFDUX f1, CO1, INC STFSDUX f1, CO1, INC #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -1 #endif slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L10110: andi. I, M, 2 beq .L10120 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 #else slwi TEMP, KK, 1 + BASE_SHIFT slwi r0, KK, 0 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 fpmr f1, f0 addi BO, BO, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L10114 #else addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, K, 3 mtspr CTR, r0 ble .L10114 #endif LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdz- .L10113 .align 4 .L10112: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcsmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f2, B2, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B2, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B3, A6, f1 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f2, B4, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L10112 .align 4 .L10113: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A2, f1 fxcpmadd f2, B2, A3, f2 fxcsmadd f3, B2, A4, f3 fxcpmadd f0, B3, A5, f0 fxcsmadd f1, B3, A6, f1 fxcpmadd f2, B4, A7, f2 fxcsmadd f3, B4, A8, f3 .align 4 .L10114: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP #else andi. 
r0, K, 7 mtspr CTR, r0 #endif ble+ .L10118 LFPDUX A1, AO, INC2 LFDX B1, BO, INC2 add BO, BO, INC bdz- .L10117 .align 4 .L10116: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 LFDX B1, BO, INC2 add BO, BO, INC bdnz+ .L10116 .align 4 .L10117: fxcpmadd f0, B1, A1, f0 .align 4 .L10118: #ifndef TRMMKERNEL LFDUX A1, CO1, INC LFDUX A2, CO1, INC fpadd f0, f0, f1 fpadd f2, f3, f2 fsmfp A1, A2 fpadd f0, f0, f2 fxcpmadd f1, AP, f0, A1 li r0, FZERO lfpsx f0, SP, r0 STFDUX f1, CO1, INCM1 STFSDUX f1, CO1, INC #else fpadd f0, f0, f1 fpadd f2, f3, f2 fsmfp A1, A2 fpadd f0, f0, f2 fpmul f1, AP, f0 li r0, FZERO lfpsx f0, SP, r0 STFDUX f1, CO1, INC STFSDUX f1, CO1, INC #endif #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -1 #endif slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 .L10120: andi. I, M, 1 beq .L10999 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 #else slwi TEMP, KK, 0 + BASE_SHIFT slwi r0, KK, 0 + BASE_SHIFT add AO, AO, TEMP add BO, B, r0 fpmr f1, f0 addi BO, BO, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L10124 #else addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, K, 3 mtspr CTR, r0 ble .L10124 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 bdz- .L10123 .align 4 .L10122: fpmadd f0, A1, B1, f0 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 fpmadd f1, A2, B2, f1 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 fpmadd f2, A3, B3, f2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 fpmadd f3, A4, B4, f3 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L10122 .align 4 .L10123: fpmadd f0, A1, B1, f0 fpmadd f1, A2, B2, f1 fpmadd f2, A3, B3, f2 fpmadd f3, A4, B4, f3 .align 4 .L10124: lfd AP, ALPHA(SP) #ifdef TRMMKERNEL fsmfp AP, AP #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP #else andi. 
r0, K, 7 mtspr CTR, r0 #endif ble+ .L10128 LFDX A1, AO, INC2 LFDX B1, BO, INC2 add AO, AO, INC add BO, BO, INC bdz- .L10127 .align 4 .L10126: fmadd f0, A1, B1, f0 LFDX A1, AO, INC2 LFDX B1, BO, INC2 add AO, AO, INC add BO, BO, INC bdnz+ .L10126 .align 4 .L10127: fmadd f0, A1, B1, f0 .align 4 .L10128: #ifndef TRMMKERNEL LFDX A1, CO1, INC fpadd f0, f0, f1 fpadd f2, f2, f3 fpadd f0, f0, f2 fsmtp f1, f0 fadd f0, f0, f1 fmadd f0, AP, f0, A1 STFDUX f0, CO1, INC #else fpadd f0, f0, f1 fpadd f2, f2, f3 fpadd f0, f0, f2 fsmtp f1, f0 fadd f0, f0, f1 fmul f0, AP, f0 STFDUX f0, CO1, INC #endif .align 4 .L10999: addi SP, SP, 12 lwzu r14, 4(SP) lwzu r15, 4(SP) lwzu r16, 4(SP) lwzu r17, 4(SP) lwzu r18, 4(SP) lwzu r19, 4(SP) lwzu r20, 4(SP) lwzu r21, 4(SP) lwzu r22, 4(SP) lwzu r23, 4(SP) lwzu r24, 4(SP) lwzu r25, 4(SP) lwzu r26, 4(SP) lwzu r27, 4(SP) lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f31, SP, r0 lfpdux f30, SP, r0 lfpdux f29, SP, r0 lfpdux f28, SP, r0 lfpdux f27, SP, r0 lfpdux f26, SP, r0 lfpdux f25, SP, r0 lfpdux f24, SP, r0 lfpdux f23, SP, r0 lfpdux f22, SP, r0 lfpdux f21, SP, r0 lfpdux f20, SP, r0 lfpdux f19, SP, r0 lfpdux f18, SP, r0 lfpdux f17, SP, r0 lfpdux f16, SP, r0 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/gemm_kernel_power3.S000066400000000000000000000725641313527062700215060ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #endif #endif #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define CO3 r27 #define CO4 r28 #define PREA r29 #define PREB r30 #define PREC r31 #ifndef NEEDPARAM #ifndef DOUBLE #include "../sparam.h" #else #include "../dparam.h" #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) #endif stfd f1, ALPHA stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) #ifndef PREFETCHTEST li PREA, (16 * 5 * SIZE + 16) li PREB, (16 * 5 * SIZE + 16) li PREC, 4 * SIZE #else #ifdef linux #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) lwz PREC, FRAMESLOT(1) + STACKSIZE(SP) #else ld PREA, FRAMESLOT(0) + STACKSIZE(SP) ld PREB, FRAMESLOT(1) + STACKSIZE(SP) ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld PREA, FRAMESLOT(0) + STACKSIZE(SP) ld PREB, FRAMESLOT(1) + STACKSIZE(SP) ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #else #ifdef DOUBLE lwz PREA, FRAMESLOT(1) + STACKSIZE(SP) lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else lwz PREA, FRAMESLOT(0) + STACKSIZE(SP) lwz PREB, FRAMESLOT(1) + STACKSIZE(SP) lwz PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #endif #endif lfs f0, FZERO srawi. J, N, 2 ble LL(40) .align 4 LL(10): mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. 
I, M, 2 mr AO, A add C, CO4, LDC ble LL(20) .align 4 LL(11): LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) #if 0 PREFETCH_C1 PREFETCH_C2 PREFETCH_C3 PREFETCH_C4 #endif srawi. r0, K, 2 mtspr CTR, r0 mr BO, B ble LL(15) .align 4 LL(12): fmadd f0, f16, f20, f0 fmadd f4, f16, f21, f4 LFD f28, 4 * SIZE(BO) fmadd f8, f16, f22, f8 fmadd f12, f16, f23, f12 LFD f16, 8 * SIZE(AO) fmadd f1, f17, f20, f1 fmadd f5, f17, f21, f5 LFD f29, 5 * SIZE(BO) fmadd f9, f17, f22, f9 fmadd f13, f17, f23, f13 LFD f17, 9 * SIZE(AO) fmadd f2, f18, f20, f2 fmadd f6, f18, f21, f6 LFD f30, 6 * SIZE(BO) fmadd f10, f18, f22, f10 fmadd f14, f18, f23, f14 LFD f18, 10 * SIZE(AO) fmadd f3, f19, f20, f3 fmadd f7, f19, f21, f7 LFD f31, 7 * SIZE(BO) fmadd f11, f19, f22, f11 fmadd f15, f19, f23, f15 LFD f19, 11 * SIZE(AO) fmadd f0, f24, f28, f0 fmadd f4, f24, f29, f4 LFD f20, 8 * SIZE(BO) fmadd f8, f24, f30, f8 fmadd f12, f24, f31, f12 LFD f24, 12 * SIZE(AO) fmadd f1, f25, f28, f1 fmadd f5, f25, f29, f5 LFD f21, 9 * SIZE(BO) fmadd f9, f25, f30, f9 fmadd f13, f25, f31, f13 LFD f25, 13 * SIZE(AO) fmadd f2, f26, f28, f2 fmadd f6, f26, f29, f6 LFD f22, 10 * SIZE(BO) fmadd f10, f26, f30, f10 fmadd f14, f26, f31, f14 LFD f26, 14 * SIZE(AO) fmadd f3, f27, f28, f3 fmadd f7, f27, f29, f7 LFD f23, 11 * SIZE(BO) fmadd f11, f27, f30, f11 fmadd f15, f27, f31, f15 LFD f27, 15 * SIZE(AO) fmadd f0, f16, f20, f0 fmadd f4, f16, f21, f4 LFD f28, 12 * SIZE(BO) fmadd f8, f16, f22, f8 fmadd f12, f16, f23, f12 LFDU f16, 16 * SIZE(AO) fmadd f1, f17, f20, f1 fmadd f5, f17, f21, f5 LFD f29, 13 * SIZE(BO) fmadd f9, f17, f22, f9 fmadd f13, f17, f23, f13 LFD f17, 1 * SIZE(AO) fmadd f2, f18, f20, f2 fmadd f6, f18, f21, f6 LFD f30, 14 * SIZE(BO) fmadd f10, f18, f22, f10 fmadd f14, f18, f23, f14 LFD f18, 2 * SIZE(AO) fmadd f3, f19, f20, f3 fmadd f7, f19, f21, f7 LFD f31, 15 * SIZE(BO) fmadd f11, f19, f22, f11 fmadd f15, f19, f23, f15 LFD f19, 3 * SIZE(AO) fmadd f0, f24, f28, f0 fmadd f4, f24, f29, f4 LFDU f20, 16 * SIZE(BO) fmadd f8, f24, f30, f8 fmadd f12, f24, f31, f12 LFD f24, 4 * SIZE(AO) fmadd f1, f25, f28, f1 fmadd f5, f25, f29, f5 LFD f21, 1 * SIZE(BO) fmadd f9, f25, f30, f9 fmadd f13, f25, f31, f13 LFD f25, 5 * SIZE(AO) fmadd f2, f26, f28, f2 fmadd f6, f26, f29, f6 LFD f22, 2 * SIZE(BO) fmadd f10, f26, f30, f10 fmadd f14, f26, f31, f14 LFD f26, 6 * SIZE(AO) fmadd f3, f27, f28, f3 fmadd f7, f27, f29, f7 LFD f23, 3 * SIZE(BO) fmadd f11, f27, f30, f11 fmadd f15, f27, f31, f15 LFD f27, 7 * SIZE(AO) bdnz LL(12) .align 4 LL(15): andi. 
r0, K, 3 lfd f30, ALPHA lfs f31, FZERO mtspr CTR, r0 ble+ LL(18) .align 4 LL(16): fmadd f0, f16, f20, f0 fmadd f4, f16, f21, f4 fmadd f8, f16, f22, f8 fmadd f12, f16, f23, f12 LFD f16, 4 * SIZE(AO) fmadd f1, f17, f20, f1 fmadd f5, f17, f21, f5 fmadd f9, f17, f22, f9 fmadd f13, f17, f23, f13 LFD f17, 5 * SIZE(AO) fmadd f2, f18, f20, f2 fmadd f6, f18, f21, f6 fmadd f10, f18, f22, f10 fmadd f14, f18, f23, f14 LFD f18, 6 * SIZE(AO) fmadd f3, f19, f20, f3 LFD f20, 4 * SIZE(BO) fmadd f7, f19, f21, f7 LFD f21, 5 * SIZE(BO) fmadd f11, f19, f22, f11 LFD f22, 6 * SIZE(BO) fmadd f15, f19, f23, f15 LFD f19, 7 * SIZE(AO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(16) .align 4 LL(18): LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) fmadd f0, f0, f30, f16 LFD f16, 0 * SIZE(CO3) fmadd f1, f1, f30, f17 LFD f17, 1 * SIZE(CO3) fmadd f2, f2, f30, f18 LFD f18, 2 * SIZE(CO3) fmadd f3, f3, f30, f19 LFD f19, 3 * SIZE(CO3) fmadd f4, f4, f30, f20 LFD f20, 0 * SIZE(CO4) fmadd f5, f5, f30, f21 LFD f21, 1 * SIZE(CO4) fmadd f6, f6, f30, f22 LFD f22, 2 * SIZE(CO4) fmadd f7, f7, f30, f23 LFD f23, 3 * SIZE(CO4) fmadd f8, f8, f30, f16 fmadd f9, f9, f30, f17 STFD f0, 0 * SIZE(CO1) fmadd f10, f10, f30, f18 fmadd f11, f11, f30, f19 STFD f1, 1 * SIZE(CO1) fmadd f12, f12, f30, f20 fmadd f13, f13, f30, f21 STFD f2, 2 * SIZE(CO1) fmadd f14, f14, f30, f22 fmadd f15, f15, f30, f23 STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) fmr f0, f31 fmr f1, f31 STFD f5, 1 * SIZE(CO2) fmr f2, f31 fmr f3, f31 STFD f6, 2 * SIZE(CO2) fmr f4, f31 fmr f5, f31 STFD f7, 3 * SIZE(CO2) fmr f6, f31 fmr f7, f31 STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) addi CO1, CO1, 4 * SIZE fmr f8, f31 fmr f9, f31 STFD f10, 2 * SIZE(CO3) STFD f11, 3 * SIZE(CO3) addi CO2, CO2, 4 * SIZE fmr f10, f31 fmr f11, f31 STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) addi CO3, CO3, 4 * SIZE fmr f12, f31 fmr f13, f31 STFD f14, 2 * SIZE(CO4) STFD f15, 3 * SIZE(CO4) addi CO4, CO4, 4 * SIZE fmr f14, f31 fmr f15, f31 addic. I, I, -1 bgt+ LL(11) .align 4 LL(20): andi. I, M, 2 ble LL(30) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. 
r0, K, 2 mtspr CTR, r0 mr BO, B ble LL(25) .align 5 LL(22): fmadd f0, f16, f20, f0 fmadd f1, f17, f20, f1 fmadd f4, f16, f21, f4 fmadd f5, f17, f21, f5 fmadd f8, f16, f22, f8 fmadd f9, f17, f22, f9 fmadd f12, f16, f23, f12 fmadd f13, f17, f23, f13 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) fmadd f2, f18, f24, f2 fmadd f3, f19, f24, f3 fmadd f6, f18, f25, f6 fmadd f7, f19, f25, f7 fmadd f10, f18, f26, f10 fmadd f11, f19, f26, f11 fmadd f14, f18, f27, f14 fmadd f15, f19, f27, f15 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) fmadd f0, f16, f20, f0 fmadd f1, f17, f20, f1 fmadd f4, f16, f21, f4 fmadd f5, f17, f21, f5 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) fmadd f8, f16, f22, f8 fmadd f9, f17, f22, f9 fmadd f12, f16, f23, f12 fmadd f13, f17, f23, f13 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) fmadd f2, f18, f24, f2 fmadd f3, f19, f24, f3 fmadd f6, f18, f25, f6 fmadd f7, f19, f25, f7 fmadd f10, f18, f26, f10 fmadd f11, f19, f26, f11 fmadd f14, f18, f27, f14 fmadd f15, f19, f27, f15 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 16 * SIZE DCBT(BO, PREB) bdnz LL(22) fadd f0, f2, f0 fadd f1, f3, f1 fadd f4, f6, f4 fadd f5, f7, f5 fadd f8, f10, f8 fadd f9, f11, f9 fadd f12, f14, f12 fadd f13, f15, f13 .align 4 LL(25): lfd f30, ALPHA andi. r0, K, 3 mtspr CTR, r0 ble+ LL(28) .align 4 LL(26): fmadd f0, f16, f20, f0 fmadd f1, f17, f20, f1 fmadd f4, f16, f21, f4 fmadd f5, f17, f21, f5 fmadd f8, f16, f22, f8 fmadd f9, f17, f22, f9 fmadd f12, f16, f23, f12 fmadd f13, f17, f23, f13 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 2 * SIZE bdnz LL(26) .align 4 LL(28): LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) fmadd f0, f0, f30, f16 fmadd f1, f1, f30, f17 fmadd f4, f4, f30, f18 fmadd f5, f5, f30, f19 LFD f20, 0 * SIZE(CO3) LFD f21, 1 * SIZE(CO3) LFD f22, 0 * SIZE(CO4) LFD f23, 1 * SIZE(CO4) fmadd f8, f8, f30, f20 fmadd f9, f9, f30, f21 fmadd f12, f12, f30, f22 fmadd f13, f13, f30, f23 STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE .align 4 LL(30): andi. I, M, 1 ble LL(39) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. 
r0, K, 2 mtspr CTR, r0 mr BO, B ble LL(35) .align 5 LL(32): fmadd f0, f16, f20, f0 fmadd f4, f16, f21, f4 fmadd f8, f16, f22, f8 fmadd f12, f16, f23, f12 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) fmadd f1, f17, f24, f1 fmadd f5, f17, f25, f5 fmadd f9, f17, f26, f9 fmadd f13, f17, f27, f13 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) fmadd f0, f18, f20, f0 fmadd f4, f18, f21, f4 fmadd f8, f18, f22, f8 fmadd f12, f18, f23, f12 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) fmadd f1, f19, f24, f1 fmadd f5, f19, f25, f5 fmadd f9, f19, f26, f9 fmadd f13, f19, f27, f13 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 16 * SIZE DCBT(BO, PREB) bdnz LL(32) fadd f0, f1, f0 fadd f4, f5, f4 fadd f8, f9, f8 fadd f12, f13, f12 .align 4 LL(35): lfd f30, ALPHA andi. r0, K, 3 mtspr CTR, r0 ble+ LL(38) .align 4 LL(36): fmadd f0, f16, f20, f0 fmadd f4, f16, f21, f4 fmadd f8, f16, f22, f8 fmadd f12, f16, f23, f12 LFD f16, 1 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 1 * SIZE bdnz LL(36) .align 4 LL(38): LFD f16, 0 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f20, 0 * SIZE(CO3) LFD f22, 0 * SIZE(CO4) fmadd f0, f0, f30, f16 fmadd f4, f4, f30, f18 fmadd f8, f8, f30, f20 fmadd f12, f12, f30, f22 STFD f0, 0 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 fmr f8, f0 fmr f9, f0 fmr f12, f0 fmr f13, f0 .align 4 LL(39): mr B, BO addic. J, J, -1 lfs f0, FZERO bgt LL(10) .align 4 LL(40): mr CO1, C add CO2, C, LDC andi. J, N, 2 ble LL(70) fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. I, M, 2 add C, CO2, LDC mr AO, A ble LL(50) .align 4 LL(41): LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) PREFETCH_C1 PREFETCH_C2 srawi. 
r0, K, 2 mtspr CTR, r0 mr BO, B ble LL(45) .align 5 LL(42): fmadd f0, f16, f20, f0 fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 fmadd f4, f16, f21, f4 fmadd f5, f17, f21, f5 fmadd f6, f18, f21, f6 fmadd f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) fmadd f0, f16, f22, f0 fmadd f1, f17, f22, f1 fmadd f2, f18, f22, f2 fmadd f3, f19, f22, f3 fmadd f4, f16, f23, f4 fmadd f5, f17, f23, f5 fmadd f6, f18, f23, f6 fmadd f7, f19, f23, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 fmadd f4, f16, f21, f4 fmadd f5, f17, f21, f5 fmadd f6, f18, f21, f6 fmadd f7, f19, f21, f7 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) fmadd f0, f16, f22, f0 fmadd f1, f17, f22, f1 fmadd f2, f18, f22, f2 fmadd f3, f19, f22, f3 fmadd f4, f16, f23, f4 fmadd f5, f17, f23, f5 fmadd f6, f18, f23, f6 fmadd f7, f19, f23, f7 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE DCBT(BO, PREB) bdnz LL(42) .align 4 LL(45): lfd f30, ALPHA andi. r0, K, 3 mtspr CTR, r0 ble+ LL(48) .align 4 LL(46): fmadd f0, f16, f20, f0 fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 fmadd f4, f16, f21, f4 fmadd f5, f17, f21, f5 fmadd f6, f18, f21, f6 fmadd f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(46) .align 4 LL(48): LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) fmadd f0, f0, f30, f16 fmadd f1, f1, f30, f17 fmadd f2, f2, f30, f18 fmadd f3, f3, f30, f19 fmadd f4, f4, f30, f20 fmadd f5, f5, f30, f21 fmadd f6, f6, f30, f22 fmadd f7, f7, f30, f23 STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addic. I, I, -1 bgt+ LL(41) .align 4 LL(50): andi. I, M, 2 ble LL(60) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. 
r0, K, 2 mtspr CTR, r0 mr BO, B ble LL(55) .align 5 LL(52): fmadd f0, f16, f20, f0 fmadd f1, f17, f20, f1 fmadd f2, f16, f21, f2 fmadd f3, f17, f21, f3 fmadd f4, f18, f22, f4 fmadd f5, f19, f22, f5 fmadd f6, f18, f23, f6 fmadd f7, f19, f23, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) fmadd f0, f16, f24, f0 fmadd f1, f17, f24, f1 fmadd f2, f16, f25, f2 fmadd f3, f17, f25, f3 fmadd f4, f18, f26, f4 fmadd f5, f19, f26, f5 fmadd f6, f18, f27, f6 fmadd f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE DCBT(BO, PREB) bdnz LL(52) .align 4 LL(55): lfd f30, ALPHA andi. r0, K, 3 mtspr CTR, r0 ble+ LL(58) .align 4 LL(56): fmadd f0, f16, f20, f0 fmadd f1, f17, f20, f1 fmadd f2, f16, f21, f2 fmadd f3, f17, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 2 * SIZE bdnz LL(56) .align 4 LL(58): LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) fadd f0, f4, f0 fadd f1, f5, f1 fadd f2, f6, f2 fadd f3, f7, f3 fmadd f0, f0, f30, f16 fmadd f1, f1, f30, f17 fmadd f2, f2, f30, f18 fmadd f3, f3, f30, f19 STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE .align 4 LL(60): andi. I, M, 1 ble LL(69) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B ble LL(65) .align 5 LL(62): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f17, f22, f2 fmadd f3, f17, f23, f3 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f19, f26, f2 fmadd f3, f19, f27, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(62) .align 4 LL(65): lfd f30, ALPHA andi. r0, K, 3 mtspr CTR, r0 ble+ LL(68) .align 4 LL(66): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 LFD f16, 1 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 1 * SIZE bdnz LL(66) .align 4 LL(68): LFD f16, 0 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) fadd f0, f2, f0 fadd f1, f3, f1 fmadd f0, f0, f30, f16 fmadd f1, f1, f30, f18 STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 .align 4 LL(69): mr B, BO lfs f0, FZERO .align 4 LL(70): mr CO1, C andi. J, N, 1 ble LL(999) fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. I, M, 2 mr AO, A ble LL(80) .align 4 LL(71): LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) PREFETCH_C1 srawi. 
r0, K, 2 mtspr CTR, r0 mr BO, B ble LL(75) .align 5 LL(72): fmadd f0, f16, f20, f0 fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) fmadd f0, f16, f21, f0 fmadd f1, f17, f21, f1 fmadd f2, f18, f21, f2 fmadd f3, f19, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) fmadd f0, f16, f22, f0 fmadd f1, f17, f22, f1 fmadd f2, f18, f22, f2 fmadd f3, f19, f22, f3 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) fmadd f0, f16, f23, f0 fmadd f1, f17, f23, f1 fmadd f2, f18, f23, f2 fmadd f3, f19, f23, f3 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE DCBT(BO, PREB) bdnz LL(72) .align 4 LL(75): lfd f30, ALPHA andi. r0, K, 3 mtspr CTR, r0 ble+ LL(78) .align 4 LL(76): fmadd f0, f16, f20, f0 fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 4 * SIZE bdnz LL(76) .align 4 LL(78): LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) fmadd f0, f0, f30, f16 fmadd f1, f1, f30, f17 fmadd f2, f2, f30, f18 fmadd f3, f3, f30, f19 STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 addi CO1, CO1, 4 * SIZE addic. I, I, -1 bgt+ LL(71) .align 4 LL(80): andi. I, M, 2 ble LL(90) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B ble LL(85) .align 5 LL(82): fmadd f0, f16, f20, f0 fmadd f1, f17, f20, f1 fmadd f2, f18, f21, f2 fmadd f3, f19, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) fmadd f0, f16, f22, f0 fmadd f1, f17, f22, f1 fmadd f2, f18, f23, f2 fmadd f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 4 * SIZE DCBT(BO, PREB) bdnz LL(82) .align 4 LL(85): lfd f30, ALPHA andi. r0, K, 3 mtspr CTR, r0 ble+ LL(88) .align 4 LL(86): fmadd f0, f16, f20, f0 fmadd f1, f17, f20, f1 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 2 * SIZE bdnz LL(86) .align 4 LL(88): LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) fadd f0, f2, f0 fadd f1, f3, f1 fmadd f0, f0, f30, f16 fmadd f1, f1, f30, f17 STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) addi CO1, CO1, 2 * SIZE lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 .align 4 LL(90): andi. I, M, 1 ble LL(999) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. 
r0, K, 3 mtspr CTR, r0 mr BO, B ble LL(95) .align 5 LL(92): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f18, f22, f2 fmadd f3, f19, f23, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f18, f22, f2 fmadd f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(92) .align 4 LL(95): lfd f30, ALPHA andi. r0, K, 7 mtspr CTR, r0 ble+ LL(98) .align 4 LL(96): fmadd f0, f16, f20, f0 LFD f16, 1 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 1 * SIZE bdnz LL(96) .align 4 LL(98): LFD f16, 0 * SIZE(CO1) fadd f0, f1, f0 fadd f2, f3, f2 fadd f0, f2, f0 fmadd f0, f0, f30, f16 STFD f0, 0 * SIZE(CO1) .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/gemm_kernel_power6.S000066400000000000000000001344371313527062700215070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define TEMP r19 #define KK r20 #define BB r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define CO1 r26 #define CO2 r27 #define CO3 r28 #define CO4 r29 #define PREA r30 #define PREC r31 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) #ifdef TRMMKERNEL std r20, 232(SP) std r19, 240(SP) #endif #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) #ifdef TRMMKERNEL stw r20, 188(SP) stw r19, 192(SP) #endif #endif stfd f1, ALPHA stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) lfs f0, FZERO li PREA, (16 * 3) * SIZE srawi. J, N, 2 li PREC, 3 * SIZE ble LL(40) .align 4 LL(10): mr CO1, C fmr f1, f0 add CO2, C, LDC fmr f2, f0 add CO3, CO2, LDC fmr f3, f0 add CO4, CO3, LDC fmr f4, f0 #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif slwi BB, K, BASE_SHIFT + 2 fmr f5, f0 srawi. 
I, M, 2 fmr f6, f0 mr AO, A fmr f7, f0 add C, CO4, LDC fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 ble LL(20) .align 4 LL(11): #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(B) LFD f18, 2 * SIZE(AO) LFD f22, 2 * SIZE(B) LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 2 + BASE_SHIFT add AO, AO, r0 add BO, B, r0 LFD f16, 0 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) LFD f18, 2 * SIZE(AO) LFD f22, 2 * SIZE(BO) LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(BO) #endif dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 4 #endif srawi. TEMP, TEMP, 3 mtspr CTR, TEMP ble LL(15) #else LFD f16, 0 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(B) LFD f18, 2 * SIZE(AO) LFD f22, 2 * SIZE(B) LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(B) dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC srawi. r0, K, 3 mtctr r0 mr BO, B ble LL(15) #endif .align 4 LL(12): dcbt AO, PREA FMADD f0, f16, f20, f0 nop FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 LFD f30, 6 * SIZE(BO) LFD f31, 7 * SIZE(BO) FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f11, f27, f30, f11 FMADD f15, f27, 
f31, f15 FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 20 * SIZE(AO) LFD f25, 21 * SIZE(AO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 LFD f28, 20 * SIZE(BO) LFD f29, 21 * SIZE(BO) FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 22 * SIZE(AO) LFD f27, 23 * SIZE(AO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 LFD f30, 22 * SIZE(BO) LFD f31, 23 * SIZE(BO) FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 24 * SIZE(AO) LFD f17, 25 * SIZE(AO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 LFD f20, 24 * SIZE(BO) LFD f21, 25 * SIZE(BO) FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 26 * SIZE(AO) LFD f19, 27 * SIZE(AO) FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 LFD f22, 26 * SIZE(BO) LFD f23, 27 * SIZE(BO) FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 28 * SIZE(AO) LFD f25, 29 * SIZE(AO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 LFD f28, 28 * SIZE(BO) LFD f29, 29 * SIZE(BO) FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 30 * SIZE(AO) LFD f27, 31 * SIZE(AO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 LFD f30, 30 * SIZE(BO) LFD f31, 31 * SIZE(BO) FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 32 * SIZE(AO) LFD f17, 33 * SIZE(AO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 LFD f20, 32 * SIZE(BO) LFD f21, 33 * SIZE(BO) FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 LFD f18, 34 * SIZE(AO) LFD f19, 35 * SIZE(AO) FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 LFD f22, 34 * SIZE(BO) LFD f23, 35 * SIZE(BO) FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 addi AO, AO, 32 * SIZE addi BO, BO, 32 * SIZE bdnz LL(12) .align 4 LL(15): lfd f30, ALPHA dcbtst B, BB addi BB, BB, 16 * SIZE dcbtst B, BB addi BB, BB, 16 * SIZE #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP #else andi. 
r0, K, 7 mtspr CTR, r0 #endif ble+ LL(18) .align 4 LL(16): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE nop bdnz LL(16) .align 4 LL(18): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f2, f2, f30, f18 FMADD f3, f3, f30, f19 FMADD f4, f4, f30, f20 FMADD f5, f5, f30, f21 FMADD f6, f6, f30, f22 FMADD f7, f7, f30, f23 LFD f16, 0 * SIZE(CO3) LFD f17, 1 * SIZE(CO3) LFD f18, 2 * SIZE(CO3) LFD f19, 3 * SIZE(CO3) LFD f20, 0 * SIZE(CO4) LFD f21, 1 * SIZE(CO4) LFD f22, 2 * SIZE(CO4) LFD f23, 3 * SIZE(CO4) FMADD f8, f8, f30, f16 FMADD f9, f9, f30, f17 FMADD f10, f10, f30, f18 FMADD f11, f11, f30, f19 FMADD f12, f12, f30, f20 FMADD f13, f13, f30, f21 FMADD f14, f14, f30, f22 FMADD f15, f15, f30, f23 #else FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 FMUL f3, f3, f30 FMUL f4, f4, f30 FMUL f5, f5, f30 FMUL f6, f6, f30 FMUL f7, f7, f30 FMUL f8, f8, f30 FMUL f9, f9, f30 FMUL f10, f10, f30 FMUL f11, f11, f30 FMUL f12, f12, f30 FMUL f13, f13, f30 FMUL f14, f14, f30 FMUL f15, f15, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f10, 2 * SIZE(CO3) STFD f11, 3 * SIZE(CO3) fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) STFD f14, 2 * SIZE(CO4) STFD f15, 3 * SIZE(CO4) fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -4 #endif slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif addic. I, I, -1 bgt+ LL(11) .align 4 LL(20): andi. 
I, M, 2 ble LL(30) #if defined(TRMMKERNEL) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 4 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble LL(25) .align 5 LL(22): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 16 * SIZE bdnz LL(22) fadd f0, f2, f0 fadd f1, f3, f1 fadd f4, f6, f4 fadd f5, f7, f5 fadd f8, f10, f8 fadd f9, f11, f9 fadd f12, f14, f12 fadd f13, f15, f13 .align 4 LL(25): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. 
r0, K, 3 mtspr CTR, r0 #endif ble+ LL(28) .align 4 LL(26): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 2 * SIZE bdnz LL(26) .align 4 LL(28): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f4, f4, f30, f18 FMADD f5, f5, f30, f19 LFD f20, 0 * SIZE(CO3) LFD f21, 1 * SIZE(CO3) LFD f22, 0 * SIZE(CO4) LFD f23, 1 * SIZE(CO4) FMADD f8, f8, f30, f20 FMADD f9, f9, f30, f21 FMADD f12, f12, f30, f22 FMADD f13, f13, f30, f23 #else FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f4, f4, f30 FMUL f5, f5, f30 FMUL f8, f8, f30 FMUL f9, f9, f30 FMUL f12, f12, f30 FMUL f13, f13, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -4 #endif slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 LL(30): andi. I, M, 1 ble LL(39) #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 4 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. 
r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble LL(35) .align 5 LL(32): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f1, f17, f24, f1 FMADD f5, f17, f25, f5 FMADD f9, f17, f26, f9 FMADD f13, f17, f27, f13 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f0, f18, f20, f0 FMADD f4, f18, f21, f4 FMADD f8, f18, f22, f8 FMADD f12, f18, f23, f12 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f1, f19, f24, f1 FMADD f5, f19, f25, f5 FMADD f9, f19, f26, f9 FMADD f13, f19, f27, f13 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 16 * SIZE bdnz LL(32) fadd f0, f1, f0 fadd f4, f5, f4 fadd f8, f9, f8 fadd f12, f13, f12 .align 4 LL(35): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ LL(38) .align 4 LL(36): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f16, 1 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 1 * SIZE bdnz LL(36) .align 4 LL(38): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f20, 0 * SIZE(CO3) LFD f22, 0 * SIZE(CO4) FMADD f0, f0, f30, f16 FMADD f4, f4, f30, f18 FMADD f8, f8, f30, f20 FMADD f12, f12, f30, f22 #else FMUL f0, f0, f30 FMUL f4, f4, f30 FMUL f8, f8, f30 FMUL f12, f12, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 fmr f8, f0 fmr f9, f0 fmr f12, f0 fmr f13, f0 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -4 #endif slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 LL(39): #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 4 #endif lfs f0, FZERO mr B, BO addic. J, J, -1 bgt LL(10) .align 4 LL(40): mr CO1, C add CO2, C, LDC andi. J, N, 2 ble LL(70) #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
I, M, 2 add C, CO2, LDC mr AO, A ble LL(50) .align 4 LL(41): #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif dcbtst CO1, PREC dcbtst CO2, PREC #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 2 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbtst CO1, PREC dcbtst CO2, PREC srawi. r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble LL(45) .align 5 LL(42): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE bdnz LL(42) .align 4 LL(45): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. 
r0, K, 3 mtspr CTR, r0 #endif ble+ LL(48) .align 4 LL(46): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(46) .align 4 LL(48): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f2, f2, f30, f18 FMADD f3, f3, f30, f19 FMADD f4, f4, f30, f20 FMADD f5, f5, f30, f21 FMADD f6, f6, f30, f22 FMADD f7, f7, f30, f23 #else FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 FMUL f3, f3, f30 FMUL f4, f4, f30 FMUL f5, f5, f30 FMUL f6, f6, f30 FMUL f7, f7, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif addic. I, I, -1 bgt+ LL(41) .align 4 LL(50): andi. I, M, 2 ble LL(60) #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. 
r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble LL(55) .align 5 LL(52): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 FMADD f4, f18, f22, f4 FMADD f5, f19, f22, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f16, f24, f0 FMADD f1, f17, f24, f1 FMADD f2, f16, f25, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f26, f4 FMADD f5, f19, f26, f5 FMADD f6, f18, f27, f6 FMADD f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(52) .align 4 LL(55): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ LL(58) .align 4 LL(56): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 2 * SIZE bdnz LL(56) .align 4 LL(58): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f2, f2, f30, f18 FMADD f3, f3, f30, f19 #else FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 FMUL f3, f3, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 LL(60): andi. I, M, 1 ble LL(69) #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif srawi. 
TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble LL(65) .align 5 LL(62): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f17, f22, f2 FMADD f3, f17, f23, f3 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f19, f26, f2 FMADD f3, f19, f27, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(62) .align 4 LL(65): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ LL(68) .align 4 LL(66): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 LFD f16, 1 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 1 * SIZE bdnz LL(66) .align 4 LL(68): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) FADD f0, f2, f0 FADD f1, f3, f1 FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f18 #else FADD f0, f2, f0 FADD f1, f3, f1 FMUL f0, f0, f30 FMUL f1, f1, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 1 #endif #endif .align 4 LL(69): #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 2 #endif mr B, BO .align 4 LL(70): mr CO1, C andi. J, N, 1 ble LL(999) #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. I, M, 2 mr AO, A ble LL(80) .align 4 LL(71): #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif dcbtst CO1, PREC #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 1 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbtst CO1, PREC srawi. 
r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble LL(75) .align 5 LL(72): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f21, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f23, f0 FMADD f1, f17, f23, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE bdnz LL(72) .align 4 LL(75): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ LL(78) .align 4 LL(76): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 4 * SIZE bdnz LL(76) .align 4 LL(78): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f2, f2, f30, f18 FMADD f3, f3, f30, f19 #else FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 FMUL f3, f3, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -1 #endif slwi r0 , TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif addi CO1, CO1, 4 * SIZE addic. I, I, -1 bgt+ LL(71) .align 4 LL(80): andi. I, M, 2 ble LL(90) #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. 
r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble LL(85) .align 5 LL(82): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 4 * SIZE bdnz LL(82) .align 4 LL(85): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ LL(88) .align 4 LL(86): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 2 * SIZE bdnz LL(86) .align 4 LL(88): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) FADD f0, f2, f0 FADD f1, f3, f1 FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 #else FADD f0, f2, f0 FADD f1, f3, f1 FMUL f0, f0, f30 FMUL f1, f1, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 addi CO1, CO1, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -1 #endif slwi r0 , TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 LL(90): andi. I, M, 1 ble LL(999) #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif srawi. TEMP, TEMP, 3 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. 
r0, K, 3 mtspr CTR, r0 mr BO, B #endif ble LL(95) .align 5 LL(92): FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(92) .align 4 LL(95): lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP #else andi. r0, K, 7 mtspr CTR, r0 #endif ble+ LL(98) .align 4 LL(96): FMADD f0, f16, f20, f0 LFD f16, 1 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 1 * SIZE bdnz LL(96) .align 4 LL(98): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 FMADD f0, f0, f30, f16 #else FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 FMUL f0, f0, f30 #endif STFD f0, 0 * SIZE(CO1) .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) #ifdef TRMMKERNEL ld r20, 232(SP) ld r19, 240(SP) #endif #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) #ifdef TRMMKERNEL lwz r20, 188(SP) lwz r19, 192(SP) #endif #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/gemm_kernel_ppc440.S000066400000000000000000001257331313527062700212760ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define AORIG r18 #define TEMP r19 #define KK r20 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define CO3 r27 #define CO4 r28 #define PREA r29 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) #if defined(TRMMKERNEL) std r19, 240(SP) std r18, 248(SP) #endif #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) #if defined(TRMMKERNEL) stw r19, 192(SP) stw r18, 196(SP) #endif #endif stfd f1, ALPHA stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif cmpwi cr0, M, 0 ble .L999 cmpwi cr0, N, 0 ble .L999 cmpwi cr0, K, 0 ble .L999 srawi. 
J, N, 2 ble .L40 .align 4 #define A1 f16 #define A2 f17 #define A3 f18 #define A4 f19 #define A5 f20 #define A6 f21 #define B1 f22 #define B2 f23 #define B3 f24 #define B4 f25 #define B5 f26 #define B6 f27 #define B7 f28 #define B8 f29 #define B9 f30 #define B10 f31 .L10: mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. I, M, 2 mr AO, A add C, CO4, LDC ble .L20 .align 4 .L11: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD A1, 0 * SIZE(AO) LFD A2, 1 * SIZE(AO) LFD A4, 4 * SIZE(AO) LFD A5, 8 * SIZE(AO) LFD B1, 0 * SIZE(B) LFD B2, 1 * SIZE(B) LFD B3, 2 * SIZE(B) LFD B4, 3 * SIZE(B) LFD B5, 4 * SIZE(B) LFD B6, 8 * SIZE(B) LFD B7, 12 * SIZE(B) mr BO, B #else slwi r0, KK, 2 + BASE_SHIFT add AO, AO, r0 add BO, B, r0 LFD A1, 0 * SIZE(AO) LFD A2, 1 * SIZE(AO) LFD A4, 4 * SIZE(AO) LFD A5, 8 * SIZE(AO) LFD B1, 0 * SIZE(BO) LFD B2, 1 * SIZE(BO) LFD B3, 2 * SIZE(BO) LFD B4, 3 * SIZE(BO) LFD B5, 4 * SIZE(BO) LFD B6, 8 * SIZE(BO) LFD B7, 12 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 4 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP ble .L15 #else LFD A1, 0 * SIZE(AO) LFD A2, 1 * SIZE(AO) LFD A4, 4 * SIZE(AO) LFD A5, 8 * SIZE(AO) LFD B1, 0 * SIZE(B) LFD B2, 1 * SIZE(B) LFD B3, 2 * SIZE(B) LFD B4, 3 * SIZE(B) LFD B5, 4 * SIZE(B) LFD B6, 8 * SIZE(B) LFD B7, 12 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B ble .L15 #endif .align 4 .L12: FMADD f0, A1, B1, f0 LFD A3, 2 * SIZE(AO) FMADD f4, A1, B2, f4 LFD A6, 12 * SIZE(AO) FMADD f8, A1, B3, f8 nop FMADD f12, A1, B4, f12 nop FMADD f1, A2, B1, f1 LFD A1, 3 * SIZE(AO) FMADD f5, A2, B2, f5 nop FMADD f9, A2, B3, f9 nop FMADD f13, A2, B4, f13 nop FMADD f2, A3, B1, f2 nop FMADD f6, A3, B2, f6 LFD B8, 5 * SIZE(BO) FMADD f10, A3, B3, f10 LFD B9, 6 * SIZE(BO) FMADD f14, A3, B4, f14 LFD B10, 7 * SIZE(BO) FMADD f3, A1, B1, f3 LFD A2, 5 * SIZE(AO) FMADD f7, A1, B2, f7 LFD B1, 16 * SIZE(BO) FMADD f11, A1, B3, f11 nop FMADD f15, A1, B4, f15 nop FMADD f0, A4, B5, f0 LFD A3, 6 * SIZE(AO) FMADD f4, A4, B8, f4 LFD A1, 16 * SIZE(AO) FMADD f8, A4, B9, f8 nop FMADD f12, A4, B10, f12 nop FMADD f1, A2, B5, f1 LFD A4, 7 * SIZE(AO) FMADD f5, A2, B8, f5 nop FMADD f9, A2, B9, f9 nop FMADD f13, A2, B10, f13 nop FMADD f2, A3, B5, f2 nop FMADD f6, A3, B8, f6 LFD B2, 9 * SIZE(BO) FMADD f10, A3, B9, f10 LFD B3, 10 * SIZE(BO) FMADD f14, A3, B10, f14 LFD B4, 11 * SIZE(BO) FMADD f3, A4, B5, f3 LFD A2, 9 * SIZE(AO) FMADD f7, A4, B8, f7 LFD B5, 20 * SIZE(BO) FMADD f11, A4, B9, f11 nop FMADD f15, A4, B10, f15 nop FMADD f0, A5, B6, f0 LFD A3, 10 * SIZE(AO) FMADD f4, A5, B2, f4 LFD A4, 20 * SIZE(AO) FMADD f8, A5, B3, f8 nop FMADD f12, A5, B4, f12 nop FMADD f1, A2, B6, f1 LFD A5, 11 * SIZE(AO) FMADD f5, A2, B2, f5 nop FMADD f9, A2, B3, f9 nop FMADD f13, A2, B4, f13 nop FMADD f2, A3, B6, f2 nop FMADD f6, A3, B2, f6 LFD B8, 13 * SIZE(BO) FMADD f10, A3, B3, f10 LFD B9, 14 * SIZE(BO) FMADD f14, A3, B4, f14 LFD B10,15 * SIZE(BO) FMADD f3, A5, B6, f3 LFD A2, 13 * SIZE(AO) FMADD f7, A5, B2, f7 LFD B6, 24 * SIZE(BO) FMADD f11, A5, B3, f11 nop FMADD f15, A5, B4, f15 nop FMADD f0, A6, B7, f0 LFD A3, 14 * SIZE(AO) FMADD f4, A6, B8, f4 LFD A5, 24 * SIZE(AO) FMADD f8, 
A6, B9, f8 nop FMADD f12, A6, B10, f12 nop FMADD f1, A2, B7, f1 LFD A6, 15 * SIZE(AO) FMADD f5, A2, B8, f5 nop FMADD f9, A2, B9, f9 nop FMADD f13, A2, B10, f13 nop FMADD f2, A3, B7, f2 addi AO, AO, 16 * SIZE FMADD f6, A3, B8, f6 LFD B2, 17 * SIZE(BO) FMADD f10, A3, B9, f10 LFD B3, 18 * SIZE(BO) FMADD f14, A3, B10, f14 LFD B4, 19 * SIZE(BO) FMADD f3, A6, B7, f3 LFD A2, 1 * SIZE(AO) FMADD f7, A6, B8, f7 LFD B7, 28 * SIZE(BO) FMADD f11, A6, B9, f11 addi BO, BO, 16 * SIZE FMADD f15, A6, B10, f15 bdnz .L12 .align 4 .L15: lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L18 .align 4 .L16: FMADD f0, A1, B1, f0 LFD A3, 2 * SIZE(AO) FMADD f4, A1, B2, f4 FMADD f8, A1, B3, f8 FMADD f12, A1, B4, f12 LFD A4, 3 * SIZE(AO) FMADD f1, A2, B1, f1 FMADD f5, A2, B2, f5 FMADD f9, A2, B3, f9 FMADD f13, A2, B4, f13 LFDU A1, 4 * SIZE(AO) FMADD f2, A3, B1, f2 FMADD f6, A3, B2, f6 FMADD f10, A3, B3, f10 FMADD f14, A3, B4, f14 LFD A2, 1 * SIZE(AO) FMADD f3, A4, B1, f3 LFDU B1, 4 * SIZE(BO) FMADD f7, A4, B2, f7 LFD B2, 1 * SIZE(BO) FMADD f11, A4, B3, f11 LFD B3, 2 * SIZE(BO) FMADD f15, A4, B4, f15 LFD B4, 3 * SIZE(BO) bdnz .L16 .align 4 .L18: #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) FMADD f0, f0, f30, f16 LFD f16, 0 * SIZE(CO3) FMADD f1, f1, f30, f17 LFD f17, 1 * SIZE(CO3) FMADD f2, f2, f30, f18 LFD f18, 2 * SIZE(CO3) FMADD f3, f3, f30, f19 LFD f19, 3 * SIZE(CO3) FMADD f4, f4, f30, f20 LFD f20, 0 * SIZE(CO4) FMADD f5, f5, f30, f21 LFD f21, 1 * SIZE(CO4) FMADD f6, f6, f30, f22 LFD f22, 2 * SIZE(CO4) FMADD f7, f7, f30, f23 LFD f23, 3 * SIZE(CO4) FMADD f8, f8, f30, f16 FMADD f9, f9, f30, f17 FMADD f10, f10, f30, f18 FMADD f11, f11, f30, f19 FMADD f12, f12, f30, f20 FMADD f13, f13, f30, f21 FMADD f14, f14, f30, f22 FMADD f15, f15, f30, f23 #else FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 FMUL f3, f3, f30 FMUL f4, f4, f30 FMUL f5, f5, f30 FMUL f6, f6, f30 FMUL f7, f7, f30 FMUL f8, f8, f30 FMUL f9, f9, f30 FMUL f10, f10, f30 FMUL f11, f11, f30 FMUL f12, f12, f30 FMUL f13, f13, f30 FMUL f14, f14, f30 FMUL f15, f15, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f4, 0 * SIZE(CO2) fmr f4, f0 STFD f5, 1 * SIZE(CO2) fmr f5, f0 STFD f6, 2 * SIZE(CO2) fmr f6, f0 STFD f7, 3 * SIZE(CO2) fmr f7, f0 STFD f8, 0 * SIZE(CO3) fmr f8, f0 STFD f9, 1 * SIZE(CO3) fmr f9, f0 STFD f10, 2 * SIZE(CO3) fmr f10, f0 STFD f11, 3 * SIZE(CO3) fmr f11, f0 STFD f12, 0 * SIZE(CO4) fmr f12, f0 STFD f13, 1 * SIZE(CO4) fmr f13, f0 STFD f14, 2 * SIZE(CO4) fmr f14, f0 STFD f15, 3 * SIZE(CO4) fmr f15, f0 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -4 #endif slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif addic. I, I, -1 bgt+ .L11 .align 4 .L20: andi. 
I, M, 2 ble .L30 #if defined(TRMMKERNEL) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 4 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble .L25 .align 5 .L22: FMADD f0, f16, f20, f0 nop FMADD f1, f17, f20, f1 LFD f20, 8 * SIZE(BO) FMADD f4, f16, f21, f4 nop FMADD f5, f17, f21, f5 LFD f21, 9 * SIZE(BO) FMADD f8, f16, f22, f8 nop FMADD f9, f17, f22, f9 LFD f22, 10 * SIZE(BO) FMADD f12, f16, f23, f12 LFD f16, 4 * SIZE(AO) FMADD f13, f17, f23, f13 LFD f23, 11 * SIZE(BO) FMADD f2, f18, f24, f2 LFD f17, 5 * SIZE(AO) FMADD f3, f19, f24, f3 LFD f24, 12 * SIZE(BO) FMADD f6, f18, f25, f6 nop FMADD f7, f19, f25, f7 LFD f25, 13 * SIZE(BO) FMADD f10, f18, f26, f10 nop FMADD f11, f19, f26, f11 LFD f26, 14 * SIZE(BO) FMADD f14, f18, f27, f14 LFD f18, 6 * SIZE(AO) FMADD f15, f19, f27, f15 LFD f27, 15 * SIZE(BO) FMADD f0, f16, f20, f0 LFD f19, 7 * SIZE(AO) FMADD f1, f17, f20, f1 LFDU f20, 16 * SIZE(BO) FMADD f4, f16, f21, f4 nop FMADD f5, f17, f21, f5 LFD f21, 1 * SIZE(BO) FMADD f8, f16, f22, f8 nop FMADD f9, f17, f22, f9 LFD f22, 2 * SIZE(BO) FMADD f12, f16, f23, f12 LFDU f16, 8 * SIZE(AO) FMADD f13, f17, f23, f13 LFD f23, 3 * SIZE(BO) FMADD f2, f18, f24, f2 LFD f17, 1 * SIZE(AO) FMADD f3, f19, f24, f3 LFD f24, 4 * SIZE(BO) FMADD f6, f18, f25, f6 nop FMADD f7, f19, f25, f7 LFD f25, 5 * SIZE(BO) FMADD f10, f18, f26, f10 nop FMADD f11, f19, f26, f11 LFD f26, 6 * SIZE(BO) FMADD f14, f18, f27, f14 LFD f18, 2 * SIZE(AO) FMADD f15, f19, f27, f15 LFD f19, 3 * SIZE(AO) LFD f27, 7 * SIZE(BO) bdnz .L22 fadd f0, f2, f0 fadd f1, f3, f1 fadd f4, f6, f4 fadd f5, f7, f5 fadd f8, f10, f8 fadd f9, f11, f9 fadd f12, f14, f12 fadd f13, f15, f13 .align 4 .L25: lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. 
r0, K, 3 mtspr CTR, r0 #endif ble+ .L28 .align 4 .L26: FMADD f0, f16, f20, f0 nop FMADD f1, f17, f20, f1 LFDU f20, 4 * SIZE(BO) FMADD f4, f16, f21, f4 nop FMADD f5, f17, f21, f5 LFD f21, 1 * SIZE(BO) FMADD f8, f16, f22, f8 nop FMADD f9, f17, f22, f9 LFD f22, 2 * SIZE(BO) FMADD f12, f16, f23, f12 LFDU f16, 2 * SIZE(AO) FMADD f13, f17, f23, f13 LFD f17, 1 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L26 .align 4 .L28: #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f4, f4, f30, f18 FMADD f5, f5, f30, f19 LFD f20, 0 * SIZE(CO3) LFD f21, 1 * SIZE(CO3) LFD f22, 0 * SIZE(CO4) LFD f23, 1 * SIZE(CO4) FMADD f8, f8, f30, f20 FMADD f9, f9, f30, f21 FMADD f12, f12, f30, f22 FMADD f13, f13, f30, f23 #else FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f4, f4, f30 FMUL f5, f5, f30 FMUL f8, f8, f30 FMUL f9, f9, f30 FMUL f12, f12, f30 FMUL f13, f13, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -4 #endif slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 .L30: andi. I, M, 1 ble .L39 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 4 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. 
r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble .L35 .align 5 .L32: FMADD f0, f16, f20, f0 LFD f20, 8 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f21, 9 * SIZE(BO) FMADD f8, f16, f22, f8 LFD f22, 10 * SIZE(BO) FMADD f12, f16, f23, f12 LFD f23, 11 * SIZE(BO) LFDU f16, 4 * SIZE(AO) FMADD f1, f17, f24, f1 LFD f24, 12 * SIZE(BO) FMADD f5, f17, f25, f5 LFD f25, 13 * SIZE(BO) FMADD f9, f17, f26, f9 LFD f26, 14 * SIZE(BO) FMADD f13, f17, f27, f13 LFD f27, 15 * SIZE(BO) LFD f17, 1 * SIZE(AO) FMADD f0, f18, f20, f0 LFDU f20, 16 * SIZE(BO) FMADD f4, f18, f21, f4 LFD f21, 1 * SIZE(BO) FMADD f8, f18, f22, f8 LFD f22, 2 * SIZE(BO) FMADD f12, f18, f23, f12 LFD f23, 3 * SIZE(BO) LFD f18, 2 * SIZE(AO) FMADD f1, f19, f24, f1 LFD f24, 4 * SIZE(BO) FMADD f5, f19, f25, f5 LFD f25, 5 * SIZE(BO) FMADD f9, f19, f26, f9 LFD f26, 6 * SIZE(BO) FMADD f13, f19, f27, f13 LFD f27, 7 * SIZE(BO) LFD f19, 3 * SIZE(AO) bdnz .L32 fadd f0, f1, f0 fadd f4, f5, f4 fadd f8, f9, f8 fadd f12, f13, f12 .align 4 .L35: lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L38 .align 4 .L36: FMADD f0, f16, f20, f0 LFDU f20, 4 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f21, 1 * SIZE(BO) FMADD f8, f16, f22, f8 LFD f22, 2 * SIZE(BO) FMADD f12, f16, f23, f12 LFDU f16, 1 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L36 .align 4 .L38: #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f20, 0 * SIZE(CO3) LFD f22, 0 * SIZE(CO4) FMADD f0, f0, f30, f16 FMADD f4, f4, f30, f18 FMADD f8, f8, f30, f20 FMADD f12, f12, f30, f22 #else FMUL f0, f0, f30 FMUL f4, f4, f30 FMUL f8, f8, f30 FMUL f12, f12, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 fmr f8, f0 fmr f9, f0 fmr f12, f0 fmr f13, f0 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -4 #endif slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 4 #endif mr B, BO addic. J, J, -1 bgt .L10 .align 4 .L40: mr CO1, C add CO2, C, LDC andi. J, N, 2 ble .L70 #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. I, M, 2 add C, CO2, LDC mr AO, A ble .L50 .align 4 .L41: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 2 #endif srawi. 
TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble .L45 .align 5 .L42: FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f20, 4 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f16, 4 * SIZE(AO) FMADD f5, f17, f21, f5 LFD f17, 5 * SIZE(AO) FMADD f6, f18, f21, f6 LFD f18, 6 * SIZE(AO) FMADD f7, f19, f21, f7 LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 LFD f21, 5 * SIZE(BO) FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f22, 6 * SIZE(BO) FMADD f4, f16, f23, f4 LFD f16, 8 * SIZE(AO) FMADD f5, f17, f23, f5 LFD f17, 9 * SIZE(AO) FMADD f6, f18, f23, f6 LFD f18, 10 * SIZE(AO) FMADD f7, f19, f23, f7 LFD f19, 11 * SIZE(AO) FMADD f0, f16, f20, f0 LFD f23, 7 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFDU f20, 8 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f16, 12 * SIZE(AO) FMADD f5, f17, f21, f5 LFD f17, 13 * SIZE(AO) FMADD f6, f18, f21, f6 LFD f18, 14 * SIZE(AO) FMADD f7, f19, f21, f7 LFD f19, 15 * SIZE(AO) FMADD f0, f16, f22, f0 LFD f21, 1 * SIZE(BO) FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f22, 2 * SIZE(BO) FMADD f4, f16, f23, f4 LFDU f16, 16 * SIZE(AO) FMADD f5, f17, f23, f5 LFD f17, 1 * SIZE(AO) FMADD f6, f18, f23, f6 LFD f18, 2 * SIZE(AO) FMADD f7, f19, f23, f7 LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L42 .align 4 .L45: lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L48 .align 4 .L46: FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFDU f20, 2 * SIZE(BO) FMADD f4, f16, f21, f4 LFDU f16, 4 * SIZE(AO) FMADD f5, f17, f21, f5 LFD f17, 1 * SIZE(AO) FMADD f6, f18, f21, f6 LFD f18, 2 * SIZE(AO) FMADD f7, f19, f21, f7 LFD f19, 3 * SIZE(AO) LFD f21, 1 * SIZE(BO) bdnz .L46 .align 4 .L48: #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f2, f2, f30, f18 FMADD f3, f3, f30, f19 FMADD f4, f4, f30, f20 FMADD f5, f5, f30, f21 FMADD f6, f6, f30, f22 FMADD f7, f7, f30, f23 #else FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 FMUL f3, f3, f30 FMUL f4, f4, f30 FMUL f5, f5, f30 FMUL f6, f6, f30 FMUL f7, f7, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif addic. I, I, -1 bgt+ .L41 .align 4 .L50: andi. 
I, M, 2 ble .L60 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble .L55 .align 5 .L52: FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFDU f20, 8 * SIZE(BO) FMADD f2, f16, f21, f2 LFD f16, 4 * SIZE(AO) FMADD f3, f17, f21, f3 LFD f17, 5 * SIZE(AO) FMADD f4, f18, f22, f4 LFD f21, 1 * SIZE(BO) FMADD f5, f19, f22, f5 LFD f22, 2 * SIZE(BO) FMADD f6, f18, f23, f6 LFD f18, 6 * SIZE(AO) FMADD f7, f19, f23, f7 LFD f19, 7 * SIZE(AO) FMADD f0, f16, f24, f0 LFD f23, 3 * SIZE(BO) FMADD f1, f17, f24, f1 LFD f24, 4 * SIZE(BO) FMADD f2, f16, f25, f2 LFDU f16, 8 * SIZE(AO) FMADD f3, f17, f25, f3 LFD f17, 1 * SIZE(AO) FMADD f4, f18, f26, f4 LFD f25, 5 * SIZE(BO) FMADD f5, f19, f26, f5 LFD f26, 6 * SIZE(BO) FMADD f6, f18, f27, f6 LFD f18, 2 * SIZE(AO) FMADD f7, f19, f27, f7 LFD f19, 3 * SIZE(AO) LFD f27, 7 * SIZE(BO) bdnz .L52 .align 4 .L55: lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L58 .align 4 .L56: FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFDU f20, 2 * SIZE(BO) FMADD f2, f16, f21, f2 LFDU f16, 2 * SIZE(AO) FMADD f3, f17, f21, f3 LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) bdnz .L56 .align 4 .L58: #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f2, f2, f30, f18 FMADD f3, f3, f30, f19 #else FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 FMUL f3, f3, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 .L60: andi. 
I, M, 1 ble .L69 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble .L65 .align 5 .L62: FMADD f0, f16, f20, f0 LFDU f20, 8 * SIZE(BO) FMADD f1, f16, f21, f1 LFDU f16, 4 * SIZE(AO) LFD f21, 1 * SIZE(BO) FMADD f2, f17, f22, f2 LFD f22, 2 * SIZE(BO) FMADD f3, f17, f23, f3 LFD f17, 1 * SIZE(AO) LFD f23, 3 * SIZE(BO) FMADD f0, f18, f24, f0 LFD f24, 4 * SIZE(BO) FMADD f1, f18, f25, f1 LFD f18, 2 * SIZE(AO) LFD f25, 5 * SIZE(BO) FMADD f2, f19, f26, f2 LFD f26, 6 * SIZE(BO) FMADD f3, f19, f27, f3 LFD f19, 3 * SIZE(AO) LFD f27, 7 * SIZE(BO) bdnz .L62 .align 4 .L65: lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L68 .align 4 .L66: FMADD f0, f16, f20, f0 LFDU f20, 2 * SIZE(BO) FMADD f1, f16, f21, f1 LFDU f16, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) bdnz .L66 .align 4 .L68: #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) FADD f0, f2, f0 FADD f1, f3, f1 FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f18 #else FADD f0, f2, f0 FADD f1, f3, f1 FMUL f0, f0, f30 FMUL f1, f1, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 1 #endif #endif .align 4 .L69: #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 2 #endif mr B, BO .align 4 .L70: mr CO1, C andi. J, N, 1 ble .L999 #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. 
I, M, 2 mr AO, A ble .L80 .align 4 .L71: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 1 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B ble .L75 #endif ble .L75 .align 5 .L72: FMADD f0, f16, f20, f0 LFD f16, 4 * SIZE(AO) FMADD f1, f17, f20, f1 LFD f17, 5 * SIZE(AO) FMADD f2, f18, f20, f2 LFD f18, 6 * SIZE(AO) FMADD f3, f19, f20, f3 LFD f19, 7 * SIZE(AO) LFDU f20, 4 * SIZE(BO) FMADD f0, f16, f21, f0 LFD f16, 8 * SIZE(AO) FMADD f1, f17, f21, f1 LFD f17, 9 * SIZE(AO) FMADD f2, f18, f21, f2 LFD f18, 10 * SIZE(AO) FMADD f3, f19, f21, f3 LFD f19, 11 * SIZE(AO) LFD f21, 1 * SIZE(BO) FMADD f0, f16, f22, f0 LFD f16, 12 * SIZE(AO) FMADD f1, f17, f22, f1 LFD f17, 13 * SIZE(AO) FMADD f2, f18, f22, f2 LFD f18, 14 * SIZE(AO) FMADD f3, f19, f22, f3 LFD f19, 15 * SIZE(AO) LFD f22, 2 * SIZE(BO) FMADD f0, f16, f23, f0 LFDU f16, 16 * SIZE(AO) FMADD f1, f17, f23, f1 LFD f17, 1 * SIZE(AO) FMADD f2, f18, f23, f2 LFD f18, 2 * SIZE(AO) FMADD f3, f19, f23, f3 LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L72 .align 4 .L75: lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L78 .align 4 .L76: FMADD f0, f16, f20, f0 LFDU f16, 4 * SIZE(AO) FMADD f1, f17, f20, f1 LFD f17, 1 * SIZE(AO) FMADD f2, f18, f20, f2 LFD f18, 2 * SIZE(AO) FMADD f3, f19, f20, f3 LFDU f20, 1 * SIZE(BO) LFD f19, 3 * SIZE(AO) bdnz .L76 .align 4 .L78: #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 FMADD f2, f2, f30, f18 FMADD f3, f3, f30, f19 #else FMUL f0, f0, f30 FMUL f1, f1, f30 FMUL f2, f2, f30 FMUL f3, f3, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -1 #endif slwi r0 , TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif addi CO1, CO1, 4 * SIZE addic. I, I, -1 bgt+ .L71 .align 4 .L80: andi. 
I, M, 2 ble .L90 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B #endif ble .L85 .align 5 .L82: FMADD f0, f16, f20, f0 LFD f16, 4 * SIZE(AO) FMADD f1, f17, f20, f1 LFDU f20, 4 * SIZE(BO) LFD f17, 5 * SIZE(AO) FMADD f2, f18, f21, f2 LFD f18, 6 * SIZE(AO) FMADD f3, f19, f21, f3 LFD f21, 1 * SIZE(BO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 LFDU f16, 8 * SIZE(AO) FMADD f1, f17, f22, f1 LFD f22, 2 * SIZE(BO) LFD f17, 1 * SIZE(AO) FMADD f2, f18, f23, f2 LFD f18, 2 * SIZE(AO) FMADD f3, f19, f23, f3 LFD f23, 3 * SIZE(BO) LFD f19, 3 * SIZE(AO) bdnz .L82 .align 4 .L85: lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L88 .align 4 .L86: FMADD f0, f16, f20, f0 LFDU f16, 2 * SIZE(AO) FMADD f1, f17, f20, f1 LFDU f20, 1 * SIZE(BO) LFD f17, 1 * SIZE(AO) bdnz .L86 .align 4 .L88: #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) FADD f0, f2, f0 FADD f1, f3, f1 FMADD f0, f0, f30, f16 FMADD f1, f1, f30, f17 #else FADD f0, f2, f0 FADD f1, f3, f1 FMUL f0, f0, f30 FMUL f1, f1, f30 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 addi CO1, CO1, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -1 #endif slwi r0 , TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif .align 4 .L90: andi. I, M, 1 ble .L999 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif srawi. 
TEMP, TEMP, 3 mtspr CTR, TEMP #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, K, 3 mtspr CTR, r0 mr BO, B #endif ble .L95 .align 5 .L92: FMADD f0, f16, f20, f0 LFD f16, 4 * SIZE(AO) LFD f20, 4 * SIZE(BO) FMADD f1, f17, f21, f1 LFD f17, 5 * SIZE(AO) LFD f21, 5 * SIZE(BO) FMADD f2, f18, f22, f2 LFD f18, 6 * SIZE(AO) LFD f22, 6 * SIZE(BO) FMADD f3, f19, f23, f3 LFD f19, 7 * SIZE(AO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 LFDU f16, 8 * SIZE(AO) LFDU f20, 8 * SIZE(BO) FMADD f1, f17, f21, f1 LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) FMADD f2, f18, f22, f2 LFD f18, 2 * SIZE(AO) LFD f22, 2 * SIZE(BO) FMADD f3, f19, f23, f3 LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L92 .align 4 .L95: lfd f30, ALPHA #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP #else andi. r0, K, 7 mtspr CTR, r0 #endif ble+ .L98 .align 4 .L96: FMADD f0, f16, f20, f0 LFDU f16, 1 * SIZE(AO) LFDU f20, 1 * SIZE(BO) bdnz .L96 .align 4 .L98: #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 FMADD f0, f0, f30, f16 #else FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 FMUL f0, f0, f30 #endif STFD f0, 0 * SIZE(CO1) .align 4 .L999: addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) #if defined(TRMMKERNEL) || defined(TRSMKERNEL) ld r19, 240(SP) ld r18, 248(SP) #endif #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) #if defined(TRMMKERNEL) || defined(TRSMKERNEL) lwz r19, 192(SP) lwz r18, 196(SP) #endif #endif addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/gemm_ncopy_4.S000066400000000000000000000170561313527062700202750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M r3 #define N r4 #define A r5 #define LDA r6 #define B r7 #define AO1 r8 #define AO2 r9 #define AO3 r10 #define AO4 r11 #define J r12 #define PREA r14 #define PREB1 r15 #define c01 f0 #define c02 f1 #define c03 f2 #define c04 f3 #define c05 f4 #define c06 f5 #define c07 f6 #define c08 f7 #define c09 f8 #define c10 f9 #define c11 f10 #define c12 f11 #define c13 f12 #define c14 f13 #define c15 f14 #define c16 f15 #define STACKSIZE 32 #ifdef CELL #define PREFETCHSIZE 16 #define PREFETCHWSIZE 72 #endif #ifdef PPC970 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 72 #endif #ifdef PPC440 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 72 #endif #ifdef POWER4 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 72 #endif #ifdef POWER5 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 72 #endif #ifdef POWER6 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 72 #endif #ifdef PPCG4 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 72 #endif #ifdef POWER8 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 72 #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) #ifdef __64BIT__ std r14, 16(SP) std r15, 24(SP) #else stw r14, 16(SP) stw r15, 20(SP) #endif slwi LDA, LDA, BASE_SHIFT li PREA, PREFETCHSIZE * SIZE li PREB1, (PREFETCHWSIZE + 0) * SIZE cmpwi cr0, M, 0 ble- LL(999) cmpwi cr0, N, 0 ble- LL(999) srawi. J, N, 2 ble LL(20) .align 4 LL(10): mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA srawi. r0, M, 2 mtspr CTR, r0 ble LL(15) .align 4 LL(12): LFD c01, 0 * SIZE(AO1) LFD c02, 1 * SIZE(AO1) LFD c03, 2 * SIZE(AO1) LFD c04, 3 * SIZE(AO1) LFD c05, 0 * SIZE(AO2) LFD c06, 1 * SIZE(AO2) LFD c07, 2 * SIZE(AO2) LFD c08, 3 * SIZE(AO2) LFD c09, 0 * SIZE(AO3) LFD c10, 1 * SIZE(AO3) LFD c11, 2 * SIZE(AO3) LFD c12, 3 * SIZE(AO3) LFD c13, 0 * SIZE(AO4) LFD c14, 1 * SIZE(AO4) LFD c15, 2 * SIZE(AO4) LFD c16, 3 * SIZE(AO4) STFD c01, 0 * SIZE(B) STFD c05, 1 * SIZE(B) STFD c09, 2 * SIZE(B) STFD c13, 3 * SIZE(B) STFD c02, 4 * SIZE(B) STFD c06, 5 * SIZE(B) STFD c10, 6 * SIZE(B) STFD c14, 7 * SIZE(B) STFD c03, 8 * SIZE(B) STFD c07, 9 * SIZE(B) STFD c11, 10 * SIZE(B) STFD c15, 11 * SIZE(B) STFD c04, 12 * SIZE(B) STFD c08, 13 * SIZE(B) STFD c12, 14 * SIZE(B) STFD c16, 15 * SIZE(B) #if defined(POWER6) || defined(POWER8) dcbtst PREA, AO1 dcbtst PREA, AO2 dcbtst PREA, AO3 dcbtst PREA, AO4 #else dcbt PREA, AO1 dcbt PREA, AO2 dcbt PREA, AO3 dcbt PREA, AO4 #endif dcbtst PREB1, B addi AO1, AO1, 4 * SIZE addi AO2, AO2, 4 * SIZE addi AO3, AO3, 4 * SIZE addi AO4, AO4, 4 * SIZE addi B, B, 16 * SIZE bdnz LL(12) .align 4 LL(15): andi. 
r0, M, 3 mtspr CTR, r0 ble LL(17) .align 4 LL(16): LFD c01, 0 * SIZE(AO1) LFD c05, 0 * SIZE(AO2) LFD c09, 0 * SIZE(AO3) LFD c13, 0 * SIZE(AO4) STFD c01, 0 * SIZE(B) STFD c05, 1 * SIZE(B) STFD c09, 2 * SIZE(B) STFD c13, 3 * SIZE(B) addi AO1, AO1, 1 * SIZE addi AO2, AO2, 1 * SIZE addi AO3, AO3, 1 * SIZE addi AO4, AO4, 1 * SIZE addi B, B, 4 * SIZE bdnz LL(16) .align 4 LL(17): addic. J, J, -1 bgt LL(10) .align 4 LL(20): andi. J, N, 2 ble LL(30) mr AO1, A add AO2, A, LDA add A, AO2, LDA srawi. r0, M, 2 mtspr CTR, r0 ble LL(25) .align 4 LL(22): LFD c01, 0 * SIZE(AO1) LFD c02, 1 * SIZE(AO1) LFD c03, 2 * SIZE(AO1) LFD c04, 3 * SIZE(AO1) LFD c05, 0 * SIZE(AO2) LFD c06, 1 * SIZE(AO2) LFD c07, 2 * SIZE(AO2) LFD c08, 3 * SIZE(AO2) STFD c01, 0 * SIZE(B) STFD c05, 1 * SIZE(B) STFD c02, 2 * SIZE(B) STFD c06, 3 * SIZE(B) STFD c03, 4 * SIZE(B) STFD c07, 5 * SIZE(B) STFD c04, 6 * SIZE(B) STFD c08, 7 * SIZE(B) addi AO1, AO1, 4 * SIZE addi AO2, AO2, 4 * SIZE addi B, B, 8 * SIZE bdnz LL(22) .align 4 LL(25): andi. r0, M, 3 mtspr CTR, r0 ble LL(30) .align 4 LL(26): LFD c01, 0 * SIZE(AO1) LFD c05, 0 * SIZE(AO2) STFD c01, 0 * SIZE(B) STFD c05, 1 * SIZE(B) addi AO1, AO1, 1 * SIZE addi AO2, AO2, 1 * SIZE addi B, B, 2 * SIZE bdnz LL(26) .align 4 LL(30): andi. J, N, 1 ble LL(999) mr AO1, A srawi. r0, M, 2 mtspr CTR, r0 ble LL(35) .align 4 LL(32): LFD c01, 0 * SIZE(AO1) LFD c02, 1 * SIZE(AO1) LFD c03, 2 * SIZE(AO1) LFD c04, 3 * SIZE(AO1) STFD c01, 0 * SIZE(B) STFD c02, 1 * SIZE(B) STFD c03, 2 * SIZE(B) STFD c04, 3 * SIZE(B) addi AO1, AO1, 4 * SIZE addi B, B, 4 * SIZE bdnz LL(32) .align 4 LL(35): andi. r0, M, 3 mtspr CTR, r0 ble LL(999) .align 4 LL(36): LFD c01, 0 * SIZE(AO1) STFD c01, 0 * SIZE(B) addi AO1, AO1, 1 * SIZE addi B, B, 1 * SIZE bdnz LL(36) .align 4 LL(999): li r3, 0 lfd f14, 0(SP) lfd f15, 8(SP) #ifdef __64BIT__ ld r14, 16(SP) ld r15, 24(SP) #else lwz r14, 16(SP) lwz r15, 20(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/gemm_ncopy_hummer_4.S000066400000000000000000000340661313527062700216520ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M r3 #define N r4 #define A r5 #define LDA r6 #define B r7 #define AO1 r8 #define AO2 r9 #define AO3 r10 #define AO4 r11 #define J r12 #define INC r30 #define INC2 r31 #define c01 f0 #define c02 f1 #define c03 f2 #define c04 f3 #define c05 f4 #define c06 f5 #define c07 f6 #define c08 f7 #define c09 f8 #define c10 f9 #define c11 f10 #define c12 f11 #define c13 f12 #define c14 f13 #define c15 f14 #define c16 f15 #define sel_p f16 #define sel_s f17 #define c17 f18 #define c18 f19 PROLOGUE PROFCODE li r0, -16 stfpdux f14, SP, r0 stfpdux f15, SP, r0 stfpdux f16, SP, r0 stfpdux f17, SP, r0 stfpdux f18, SP, r0 stfpdux f19, SP, r0 stwu r31, -4(SP) stwu r30, -4(SP) lis r9, 0x3f80 lis r10, 0xbf80 stwu r9, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) stwu r9, -4(SP) slwi LDA, LDA, BASE_SHIFT cmpwi cr0, M, 0 ble- .L99 cmpwi cr0, N, 0 ble- .L99 andi. r0, A, 2 * SIZE - 1 bne .L100 andi. r0, LDA, 2 * SIZE - 1 bne .L100 li r0, 8 addi SP, SP, -8 lfpsux sel_p, SP, r0 lfpsux sel_s, SP, r0 li INC, 1 * SIZE li INC2, 2 * SIZE subi A, A, 2 * SIZE subi B, B, 2 * SIZE srawi. J, N, 2 ble .L20 .align 4 .L11: mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA srawi. r0, M, 3 mtspr CTR, r0 ble .L15 .align 4 .L12: LFPDUX c01, AO1, INC2 LFXDUX c05, AO2, INC2 LFPDUX c09, AO3, INC2 LFXDUX c13, AO4, INC2 LFPDUX c02, AO1, INC2 LFXDUX c06, AO2, INC2 LFPDUX c10, AO3, INC2 LFXDUX c14, AO4, INC2 LFPDUX c03, AO1, INC2 LFXDUX c07, AO2, INC2 LFPDUX c11, AO3, INC2 LFXDUX c15, AO4, INC2 LFPDUX c04, AO1, INC2 LFXDUX c08, AO2, INC2 LFPDUX c12, AO3, INC2 LFXDUX c16, AO4, INC2 fpsel c17, sel_p, c01, c05 fpsel c18, sel_p, c09, c13 fpsel c01, sel_s, c01, c05 fpsel c05, sel_s, c09, c13 fpsel c09, sel_p, c02, c06 fpsel c13, sel_p, c10, c14 STFPDUX c17, B, INC2 fpsel c02, sel_s, c02, c06 STFPDUX c18, B, INC2 fpsel c06, sel_s, c10, c14 STFXDUX c01, B, INC2 fpsel c10, sel_p, c03, c07 STFXDUX c05, B, INC2 fpsel c14, sel_p, c11, c15 STFPDUX c09, B, INC2 fpsel c03, sel_s, c03, c07 STFPDUX c13, B, INC2 fpsel c07, sel_s, c11, c15 STFXDUX c02, B, INC2 fpsel c11, sel_p, c04, c08 STFXDUX c06, B, INC2 fpsel c15, sel_p, c12, c16 STFPDUX c10, B, INC2 fpsel c04, sel_s, c04, c08 STFPDUX c14, B, INC2 fpsel c08, sel_s, c12, c16 STFXDUX c03, B, INC2 STFXDUX c07, B, INC2 STFPDUX c11, B, INC2 STFPDUX c15, B, INC2 STFXDUX c04, B, INC2 STFXDUX c08, B, INC2 bdnz .L12 .align 4 .L15: andi. r0, M, 7 ble .L19 andi. 
r0, M, 4 beq .L16 LFPDUX c01, AO1, INC2 LFXDUX c05, AO2, INC2 LFPDUX c09, AO3, INC2 LFXDUX c13, AO4, INC2 LFPDUX c02, AO1, INC2 LFXDUX c06, AO2, INC2 LFPDUX c10, AO3, INC2 LFXDUX c14, AO4, INC2 fpsel c17, sel_p, c01, c05 fpsel c18, sel_p, c09, c13 fpsel c01, sel_s, c01, c05 fpsel c05, sel_s, c09, c13 fpsel c09, sel_p, c02, c06 fpsel c13, sel_p, c10, c14 STFPDUX c17, B, INC2 fpsel c02, sel_s, c02, c06 STFPDUX c18, B, INC2 fpsel c06, sel_s, c10, c14 STFXDUX c01, B, INC2 STFXDUX c05, B, INC2 STFPDUX c09, B, INC2 STFPDUX c13, B, INC2 STFXDUX c02, B, INC2 STFXDUX c06, B, INC2 .align 4 .L16: andi. r0, M, 2 beq .L17 LFPDUX c01, AO1, INC2 LFXDUX c05, AO2, INC2 LFPDUX c09, AO3, INC2 LFXDUX c13, AO4, INC2 fpsel c17, sel_p, c01, c05 fpsel c18, sel_p, c09, c13 fpsel c01, sel_s, c01, c05 fpsel c05, sel_s, c09, c13 STFPDUX c17, B, INC2 STFPDUX c18, B, INC2 STFXDUX c01, B, INC2 STFXDUX c05, B, INC2 .align 4 .L17: andi. r0, M, 1 beq .L19 LFDUX c01, AO1, INC2 LFDUX c02, AO2, INC2 LFDUX c03, AO3, INC2 LFDUX c04, AO4, INC2 fsmfp c01, c02 fsmfp c03, c04 STFPDUX c01, B, INC2 STFPDUX c03, B, INC2 .align 4 .L19: addic. J, J, -1 bgt .L11 .align 4 .L20: andi. J, N, 2 ble .L30 mr AO1, A add AO2, A, LDA add A, AO2, LDA srawi. r0, M, 3 mtspr CTR, r0 ble .L25 .align 4 .L22: LFPDUX c01, AO1, INC2 LFXDUX c05, AO2, INC2 LFPDUX c02, AO1, INC2 LFXDUX c06, AO2, INC2 LFPDUX c03, AO1, INC2 LFXDUX c07, AO2, INC2 LFPDUX c04, AO1, INC2 LFXDUX c08, AO2, INC2 fpsel c17, sel_p, c01, c05 fpsel c01, sel_s, c01, c05 fpsel c09, sel_p, c02, c06 fpsel c02, sel_s, c02, c06 fpsel c10, sel_p, c03, c07 fpsel c03, sel_s, c03, c07 STFPDUX c17, B, INC2 fpsel c11, sel_p, c04, c08 STFXDUX c01, B, INC2 fpsel c04, sel_s, c04, c08 STFPDUX c09, B, INC2 STFXDUX c02, B, INC2 STFPDUX c10, B, INC2 STFXDUX c03, B, INC2 STFPDUX c11, B, INC2 STFXDUX c04, B, INC2 bdnz .L22 .align 4 .L25: andi. r0, M, 7 ble .L30 andi. r0, M, 4 beq .L26 LFPDUX c01, AO1, INC2 LFXDUX c05, AO2, INC2 LFPDUX c02, AO1, INC2 LFXDUX c06, AO2, INC2 fpsel c17, sel_p, c01, c05 fpsel c01, sel_s, c01, c05 fpsel c09, sel_p, c02, c06 fpsel c02, sel_s, c02, c06 STFPDUX c17, B, INC2 STFXDUX c01, B, INC2 STFPDUX c09, B, INC2 STFXDUX c02, B, INC2 .align 4 .L26: andi. r0, M, 2 beq .L27 LFPDUX c01, AO1, INC2 LFXDUX c05, AO2, INC2 fpsel c17, sel_p, c01, c05 fpsel c01, sel_s, c01, c05 STFPDUX c17, B, INC2 STFXDUX c01, B, INC2 .align 4 .L27: andi. r0, M, 1 beq .L30 LFDUX c01, AO1, INC2 LFDUX c02, AO2, INC2 fsmfp c01, c02 STFPDUX c01, B, INC2 .align 4 .L30: andi. J, N, 1 ble .L99 mr AO1, A srawi. r0, M, 3 mtspr CTR, r0 ble .L35 .align 4 .L32: LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO1, INC2 LFPDUX c04, AO1, INC2 STFPDUX c01, B, INC2 STFPDUX c02, B, INC2 STFPDUX c03, B, INC2 STFPDUX c04, B, INC2 bdnz .L32 .align 4 .L35: andi. r0, M, 7 ble .L99 andi. r0, M, 4 beq .L36 LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 STFPDUX c01, B, INC2 STFPDUX c02, B, INC2 .align 4 .L36: andi. r0, M, 2 beq .L37 LFPDUX c01, AO1, INC2 STFPDUX c01, B, INC2 .align 4 .L37: andi. r0, M, 1 beq .L99 LFDX c01, AO1, INC2 STFDX c01, B, INC2 .align 4 .L99: addi SP, SP, 4 lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f19, SP, r0 lfpdux f18, SP, r0 lfpdux f17, SP, r0 lfpdux f16, SP, r0 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr .align 4 .L100: li INC, 1 * SIZE li INC2, 2 * SIZE subi A, A, 1 * SIZE subi B, B, 2 * SIZE srawi. J, N, 2 ble .L120 .align 4 .L111: mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA srawi. 
r0, M, 3 mtspr CTR, r0 ble .L115 .align 4 .L112: LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFDUX c09, AO1, INC LFDUX c10, AO1, INC LFDUX c11, AO1, INC LFDUX c12, AO1, INC LFSDUX c01, AO2, INC LFSDUX c02, AO2, INC LFSDUX c03, AO2, INC LFSDUX c04, AO2, INC LFSDUX c09, AO2, INC LFSDUX c10, AO2, INC LFSDUX c11, AO2, INC LFSDUX c12, AO2, INC LFDUX c05, AO3, INC LFDUX c06, AO3, INC LFDUX c07, AO3, INC LFDUX c08, AO3, INC LFDUX c13, AO3, INC LFDUX c14, AO3, INC LFDUX c15, AO3, INC LFDUX c16, AO3, INC LFSDUX c05, AO4, INC LFSDUX c06, AO4, INC LFSDUX c07, AO4, INC LFSDUX c08, AO4, INC LFSDUX c13, AO4, INC LFSDUX c14, AO4, INC LFSDUX c15, AO4, INC LFSDUX c16, AO4, INC STFPDUX c01, B, INC2 STFPDUX c05, B, INC2 STFPDUX c02, B, INC2 STFPDUX c06, B, INC2 STFPDUX c03, B, INC2 STFPDUX c07, B, INC2 STFPDUX c04, B, INC2 STFPDUX c08, B, INC2 STFPDUX c09, B, INC2 STFPDUX c13, B, INC2 STFPDUX c10, B, INC2 STFPDUX c14, B, INC2 STFPDUX c11, B, INC2 STFPDUX c15, B, INC2 STFPDUX c12, B, INC2 STFPDUX c16, B, INC2 bdnz .L112 .align 4 .L115: andi. r0, M, 7 ble .L119 andi. r0, M, 4 beq .L116 LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFSDUX c01, AO2, INC LFSDUX c02, AO2, INC LFSDUX c03, AO2, INC LFSDUX c04, AO2, INC LFDUX c05, AO3, INC LFDUX c06, AO3, INC LFDUX c07, AO3, INC LFDUX c08, AO3, INC LFSDUX c05, AO4, INC LFSDUX c06, AO4, INC LFSDUX c07, AO4, INC LFSDUX c08, AO4, INC STFPDUX c01, B, INC2 STFPDUX c05, B, INC2 STFPDUX c02, B, INC2 STFPDUX c06, B, INC2 STFPDUX c03, B, INC2 STFPDUX c07, B, INC2 STFPDUX c04, B, INC2 STFPDUX c08, B, INC2 .align 4 .L116: andi. r0, M, 2 beq .L117 LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFSDUX c01, AO2, INC LFSDUX c02, AO2, INC LFDUX c05, AO3, INC LFDUX c06, AO3, INC LFSDUX c05, AO4, INC LFSDUX c06, AO4, INC STFPDUX c01, B, INC2 STFPDUX c05, B, INC2 STFPDUX c02, B, INC2 STFPDUX c06, B, INC2 .align 4 .L117: andi. r0, M, 1 beq .L119 LFDUX c01, AO1, INC LFDUX c05, AO3, INC nop nop LFSDUX c01, AO2, INC LFSDUX c05, AO4, INC STFPDUX c01, B, INC2 STFPDUX c05, B, INC2 .align 4 .L119: addic. J, J, -1 bgt .L111 .align 4 .L120: andi. J, N, 2 ble .L130 mr AO1, A add AO2, A, LDA add A, AO2, LDA srawi. r0, M, 3 mtspr CTR, r0 ble .L125 .align 4 .L122: LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFDUX c09, AO1, INC LFDUX c10, AO1, INC LFDUX c11, AO1, INC LFDUX c12, AO1, INC LFSDUX c01, AO2, INC LFSDUX c02, AO2, INC LFSDUX c03, AO2, INC LFSDUX c04, AO2, INC LFSDUX c09, AO2, INC LFSDUX c10, AO2, INC LFSDUX c11, AO2, INC LFSDUX c12, AO2, INC STFPDUX c01, B, INC2 STFPDUX c02, B, INC2 STFPDUX c03, B, INC2 STFPDUX c04, B, INC2 STFPDUX c09, B, INC2 STFPDUX c10, B, INC2 STFPDUX c11, B, INC2 STFPDUX c12, B, INC2 bdnz .L122 .align 4 .L125: andi. r0, M, 7 ble .L130 andi. r0, M, 4 beq .L126 LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFSDUX c01, AO2, INC LFSDUX c02, AO2, INC LFSDUX c03, AO2, INC LFSDUX c04, AO2, INC STFPDUX c01, B, INC2 STFPDUX c02, B, INC2 STFPDUX c03, B, INC2 STFPDUX c04, B, INC2 .align 4 .L126: andi. r0, M, 2 beq .L127 LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFSDUX c01, AO2, INC LFSDUX c02, AO2, INC STFPDUX c01, B, INC2 STFPDUX c02, B, INC2 .align 4 .L127: andi. r0, M, 1 beq .L130 LFDUX c01, AO1, INC LFDUX c02, AO2, INC fsmfp c01, c02 STFPDUX c01, B, INC2 .align 4 .L130: andi. J, N, 1 ble .L999 mr AO1, A srawi. 
r0, M, 3 mtspr CTR, r0 ble .L135 .align 4 .L132: LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFDUX c05, AO1, INC LFDUX c06, AO1, INC LFDUX c07, AO1, INC LFDUX c08, AO1, INC fsmfp c01, c02 fsmfp c03, c04 fsmfp c05, c06 fsmfp c07, c08 STFPDUX c01, B, INC2 STFPDUX c03, B, INC2 STFPDUX c05, B, INC2 STFPDUX c07, B, INC2 bdnz .L132 .align 4 .L135: andi. r0, M, 7 ble .L999 andi. r0, M, 4 beq .L136 LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC fsmfp c01, c02 fsmfp c03, c04 STFPDUX c01, B, INC2 STFPDUX c03, B, INC2 .align 4 .L136: andi. r0, M, 2 beq .L137 LFDUX c01, AO1, INC LFDUX c02, AO1, INC fsmfp c01, c02 STFPDUX c01, B, INC2 .align 4 .L137: andi. r0, M, 1 beq .L999 LFDX c01, AO1, INC STFDX c01, B, INC2 .align 4 .L999: addi SP, SP, 12 lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f19, SP, r0 lfpdux f18, SP, r0 lfpdux f17, SP, r0 lfpdux f16, SP, r0 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/gemm_ncopy_hummer_8.S000066400000000000000000000535271313527062700216610ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M r3 #define N r4 #define A r5 #define LDA r6 #define B r7 #define AO1 r8 #define AO2 r9 #define AO3 r10 #define AO4 r11 #define J r12 #define AO5 r26 #define AO6 r27 #define AO7 r28 #define AO8 r29 #define INC r30 #define INC2 r31 #define c01 f0 #define c02 f1 #define c03 f2 #define c04 f3 #define c05 f4 #define c06 f5 #define c07 f6 #define c08 f7 #define c09 f8 #define c10 f9 #define c11 f10 #define c12 f11 #define c13 f12 #define c14 f13 #define c15 f14 #define c16 f15 #define c17 f16 #define c18 f17 #define c19 f18 #define c20 f19 #define c21 f20 #define c22 f21 #define c23 f22 #define c24 f23 #define c25 f24 #define c26 f25 #define c27 f26 #define c28 f27 #define c29 f28 #define c30 f29 #define c31 f30 #define c32 f31 #define sel_p f30 #define sel_s f31 PROLOGUE PROFCODE li r0, -16 stfpdux f14, SP, r0 stfpdux f15, SP, r0 stfpdux f16, SP, r0 stfpdux f17, SP, r0 stfpdux f18, SP, r0 stfpdux f19, SP, r0 stfpdux f20, SP, r0 stfpdux f21, SP, r0 stfpdux f22, SP, r0 stfpdux f23, SP, r0 stfpdux f24, SP, r0 stfpdux f25, SP, r0 stfpdux f26, SP, r0 stfpdux f27, SP, r0 stfpdux f28, SP, r0 stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) stwu r28, -4(SP) stwu r27, -4(SP) stwu r26, -4(SP) lis r9, 0x3f80 lis r10, 0xbf80 stwu r9, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) stwu r9, -4(SP) slwi LDA, LDA, BASE_SHIFT li r0, 0 lfpsux sel_p, SP, r0 li r0, 8 lfpsux sel_s, SP, r0 cmpwi cr0, M, 0 ble- .L999 cmpwi cr0, N, 0 ble- .L999 li INC, 1 * SIZE li INC2, 2 * SIZE subi B, B, 2 * SIZE andi. r0, A, 2 * SIZE - 1 bne .L100 andi. r0, LDA, 2 * SIZE - 1 bne .L100 subi A, A, 2 * SIZE srawi. J, N, 3 ble .L20 .align 4 .L11: mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add AO5, AO4, LDA add AO6, AO5, LDA add AO7, AO6, LDA add AO8, AO7, LDA add A, AO8, LDA srawi. r0, M, 2 mtspr CTR, r0 ble .L15 .align 4 .L12: LFPDUX c01, AO1, INC2 LFXDUX c02, AO2, INC2 LFPDUX c03, AO3, INC2 LFXDUX c04, AO4, INC2 LFPDUX c05, AO5, INC2 LFXDUX c06, AO6, INC2 LFPDUX c07, AO7, INC2 LFXDUX c08, AO8, INC2 LFPDUX c09, AO1, INC2 LFXDUX c10, AO2, INC2 LFPDUX c11, AO3, INC2 LFXDUX c12, AO4, INC2 fpsel c17, sel_p, c01, c02 LFPDUX c13, AO5, INC2 fpsel c18, sel_p, c03, c04 LFXDUX c14, AO6, INC2 fpsel c19, sel_p, c05, c06 LFPDUX c15, AO7, INC2 fpsel c20, sel_p, c07, c08 LFXDUX c16, AO8, INC2 fpsel c21, sel_s, c01, c02 fpsel c22, sel_s, c03, c04 STFPDUX c17, B, INC2 fpsel c23, sel_s, c05, c06 STFPDUX c18, B, INC2 fpsel c24, sel_s, c07, c08 STFPDUX c19, B, INC2 fpsel c01, sel_p, c09, c10 STFPDUX c20, B, INC2 fpsel c02, sel_p, c11, c12 STFXDUX c21, B, INC2 fpsel c03, sel_p, c13, c14 STFXDUX c22, B, INC2 fpsel c04, sel_p, c15, c16 STFXDUX c23, B, INC2 fpsel c05, sel_s, c09, c10 STFXDUX c24, B, INC2 fpsel c06, sel_s, c11, c12 STFPDUX c01, B, INC2 fpsel c07, sel_s, c13, c14 STFPDUX c02, B, INC2 fpsel c08, sel_s, c15, c16 STFPDUX c03, B, INC2 STFPDUX c04, B, INC2 STFXDUX c05, B, INC2 STFXDUX c06, B, INC2 STFXDUX c07, B, INC2 STFXDUX c08, B, INC2 bdnz .L12 .align 4 .L15: andi. r0, M, 3 ble .L19 andi. 
r0, M, 2 beq .L17 LFPDUX c01, AO1, INC2 LFXDUX c02, AO2, INC2 LFPDUX c03, AO3, INC2 LFXDUX c04, AO4, INC2 LFPDUX c05, AO5, INC2 fpsel c09, sel_p, c01, c02 LFXDUX c06, AO6, INC2 fpsel c10, sel_p, c03, c04 LFPDUX c07, AO7, INC2 fpsel c11, sel_p, c05, c06 LFXDUX c08, AO8, INC2 fpsel c12, sel_p, c07, c08 fpsel c13, sel_s, c01, c02 fpsel c14, sel_s, c03, c04 STFPDUX c09, B, INC2 fpsel c15, sel_s, c05, c06 STFPDUX c10, B, INC2 fpsel c16, sel_s, c07, c08 STFPDUX c11, B, INC2 STFPDUX c12, B, INC2 STFXDUX c13, B, INC2 STFXDUX c14, B, INC2 STFXDUX c15, B, INC2 STFXDUX c16, B, INC2 .align 4 .L17: andi. r0, M, 1 beq .L19 LFDUX c01, AO1, INC2 LFDUX c02, AO3, INC2 LFDUX c03, AO5, INC2 LFDUX c04, AO7, INC2 LFSDUX c01, AO2, INC2 LFSDUX c02, AO4, INC2 LFSDUX c03, AO6, INC2 LFSDUX c04, AO8, INC2 STFPDUX c01, B, INC2 STFPDUX c02, B, INC2 STFPDUX c03, B, INC2 STFPDUX c04, B, INC2 .align 4 .L19: addic. J, J, -1 bgt .L11 .align 4 .L20: andi. J, N, 4 ble .L30 .align 4 .L21: mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA srawi. r0, M, 3 mtspr CTR, r0 ble .L25 .align 4 .L22: LFPDUX c01, AO1, INC2 LFXDUX c02, AO2, INC2 LFPDUX c03, AO3, INC2 LFXDUX c04, AO4, INC2 LFPDUX c05, AO1, INC2 LFXDUX c06, AO2, INC2 LFPDUX c07, AO3, INC2 LFXDUX c08, AO4, INC2 LFPDUX c09, AO1, INC2 LFXDUX c10, AO2, INC2 LFPDUX c11, AO3, INC2 LFXDUX c12, AO4, INC2 fpsel c17, sel_p, c01, c02 LFPDUX c13, AO1, INC2 fpsel c18, sel_p, c03, c04 LFXDUX c14, AO2, INC2 fpsel c19, sel_s, c01, c02 LFPDUX c15, AO3, INC2 fpsel c20, sel_s, c03, c04 LFXDUX c16, AO4, INC2 fpsel c21, sel_p, c05, c06 fpsel c22, sel_p, c07, c08 STFPDUX c17, B, INC2 fpsel c23, sel_s, c05, c06 STFPDUX c18, B, INC2 fpsel c24, sel_s, c07, c08 STFXDUX c19, B, INC2 fpsel c01, sel_p, c09, c10 STFXDUX c20, B, INC2 fpsel c02, sel_p, c11, c12 STFPDUX c21, B, INC2 fpsel c03, sel_s, c09, c10 STFPDUX c22, B, INC2 fpsel c04, sel_s, c11, c12 STFXDUX c23, B, INC2 fpsel c05, sel_p, c13, c14 STFXDUX c24, B, INC2 fpsel c06, sel_p, c15, c16 STFPDUX c01, B, INC2 fpsel c07, sel_s, c13, c14 STFPDUX c02, B, INC2 fpsel c08, sel_s, c15, c16 STFXDUX c03, B, INC2 STFXDUX c04, B, INC2 STFPDUX c05, B, INC2 STFPDUX c06, B, INC2 STFXDUX c07, B, INC2 STFXDUX c08, B, INC2 bdnz .L22 .align 4 .L25: andi. r0, M, 7 ble .L30 andi. r0, M, 4 beq .L26 LFPDUX c01, AO1, INC2 LFXDUX c02, AO2, INC2 LFPDUX c03, AO3, INC2 LFXDUX c04, AO4, INC2 LFPDUX c05, AO1, INC2 fpsel c09, sel_p, c01, c02 LFXDUX c06, AO2, INC2 fpsel c10, sel_p, c03, c04 LFPDUX c07, AO3, INC2 fpsel c11, sel_s, c01, c02 LFXDUX c08, AO4, INC2 fpsel c12, sel_s, c03, c04 fpsel c13, sel_p, c05, c06 fpsel c14, sel_p, c07, c08 STFPDUX c09, B, INC2 fpsel c15, sel_s, c05, c06 STFPDUX c10, B, INC2 fpsel c16, sel_s, c07, c08 STFXDUX c11, B, INC2 STFXDUX c12, B, INC2 STFPDUX c13, B, INC2 STFPDUX c14, B, INC2 STFXDUX c15, B, INC2 STFXDUX c16, B, INC2 .align 4 .L26: andi. r0, M, 2 beq .L27 LFPDUX c01, AO1, INC2 LFXDUX c02, AO2, INC2 LFPDUX c03, AO3, INC2 LFXDUX c04, AO4, INC2 fpsel c05, sel_p, c01, c02 fpsel c06, sel_p, c03, c04 fpsel c07, sel_s, c01, c02 fpsel c08, sel_s, c03, c04 STFPDUX c05, B, INC2 STFPDUX c06, B, INC2 STFXDUX c07, B, INC2 STFXDUX c08, B, INC2 .align 4 .L27: andi. r0, M, 1 beq .L30 LFDUX c01, AO1, INC2 LFDUX c02, AO2, INC2 LFDUX c03, AO3, INC2 LFDUX c04, AO4, INC2 fsmfp c01, c02 fsmfp c03, c04 STFPDUX c01, B, INC2 STFPDUX c03, B, INC2 .align 4 .L30: andi. J, N, 2 ble .L40 mr AO1, A add AO2, A, LDA add A, AO2, LDA srawi. 
r0, M, 3 mtspr CTR, r0 ble .L35 .align 4 .L32: LFPDUX c01, AO1, INC2 LFXDUX c05, AO2, INC2 LFPDUX c02, AO1, INC2 LFXDUX c06, AO2, INC2 LFPDUX c03, AO1, INC2 fpsel c09, sel_p, c01, c05 LFXDUX c07, AO2, INC2 fpsel c10, sel_s, c01, c05 LFPDUX c04, AO1, INC2 fpsel c11, sel_p, c02, c06 LFXDUX c08, AO2, INC2 fpsel c12, sel_s, c02, c06 fpsel c13, sel_p, c03, c07 fpsel c14, sel_s, c03, c07 STFPDUX c09, B, INC2 fpsel c15, sel_p, c04, c08 STFXDUX c10, B, INC2 fpsel c16, sel_s, c04, c08 STFPDUX c11, B, INC2 STFXDUX c12, B, INC2 STFPDUX c13, B, INC2 STFXDUX c14, B, INC2 STFPDUX c15, B, INC2 STFXDUX c16, B, INC2 bdnz .L32 .align 4 .L35: andi. r0, M, 7 ble .L40 andi. r0, M, 4 beq .L36 LFPDUX c01, AO1, INC2 LFXDUX c03, AO2, INC2 LFPDUX c02, AO1, INC2 LFXDUX c04, AO2, INC2 fpsel c05, sel_p, c01, c03 fpsel c06, sel_s, c01, c03 fpsel c07, sel_p, c02, c04 fpsel c08, sel_s, c02, c04 STFPDUX c05, B, INC2 STFXDUX c06, B, INC2 STFPDUX c07, B, INC2 STFXDUX c08, B, INC2 .align 4 .L36: andi. r0, M, 2 beq .L37 LFPDUX c01, AO1, INC2 LFXDUX c02, AO2, INC2 fpsel c03, sel_p, c01, c02 fpsel c04, sel_s, c01, c02 STFPDUX c03, B, INC2 STFXDUX c04, B, INC2 .align 4 .L37: andi. r0, M, 1 beq .L40 LFDUX c01, AO1, INC2 LFDUX c02, AO2, INC2 fsmfp c01, c02 STFPDUX c01, B, INC2 .align 4 .L40: andi. J, N, 1 ble .L999 mr AO1, A srawi. r0, M, 3 mtspr CTR, r0 ble .L45 .align 4 .L42: LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO1, INC2 LFPDUX c04, AO1, INC2 STFPDUX c01, B, INC2 STFPDUX c02, B, INC2 STFPDUX c03, B, INC2 STFPDUX c04, B, INC2 bdnz .L42 .align 4 .L45: andi. r0, M, 7 ble .L999 andi. r0, M, 4 beq .L46 LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 STFPDUX c01, B, INC2 STFPDUX c02, B, INC2 .align 4 .L46: andi. r0, M, 2 beq .L47 LFPDUX c01, AO1, INC2 STFPDUX c01, B, INC2 .align 4 .L47: andi. r0, M, 1 beq .L999 LFDX c01, AO1, INC2 STFDX c01, B, INC2 b .L999 .align 4 .L100: subi A, A, 1 * SIZE srawi. J, N, 3 ble .L120 .align 4 .L111: mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add AO5, AO4, LDA add AO6, AO5, LDA add AO7, AO6, LDA add AO8, AO7, LDA add A, AO8, LDA srawi. 
r0, M, 3 mtspr CTR, r0 ble .L115 .align 4 .L112: LFDUX c01, AO1, INC LFDUX c05, AO1, INC LFDUX c09, AO1, INC LFDUX c13, AO1, INC LFDUX c17, AO1, INC LFDUX c21, AO1, INC LFDUX c25, AO1, INC LFDUX c29, AO1, INC LFSDUX c01, AO2, INC LFSDUX c05, AO2, INC LFSDUX c09, AO2, INC LFSDUX c13, AO2, INC LFSDUX c17, AO2, INC LFSDUX c21, AO2, INC LFSDUX c25, AO2, INC LFSDUX c29, AO2, INC LFDUX c02, AO3, INC LFDUX c06, AO3, INC LFDUX c10, AO3, INC LFDUX c14, AO3, INC LFDUX c18, AO3, INC LFDUX c22, AO3, INC LFDUX c26, AO3, INC LFDUX c30, AO3, INC LFSDUX c02, AO4, INC LFSDUX c06, AO4, INC LFSDUX c10, AO4, INC LFSDUX c14, AO4, INC LFSDUX c18, AO4, INC LFSDUX c22, AO4, INC LFSDUX c26, AO4, INC LFSDUX c30, AO4, INC LFDUX c03, AO5, INC LFDUX c07, AO5, INC LFDUX c11, AO5, INC LFDUX c15, AO5, INC LFDUX c19, AO5, INC LFDUX c23, AO5, INC LFDUX c27, AO5, INC LFDUX c31, AO5, INC LFSDUX c03, AO6, INC LFSDUX c07, AO6, INC LFSDUX c11, AO6, INC LFSDUX c15, AO6, INC LFSDUX c19, AO6, INC LFSDUX c23, AO6, INC LFSDUX c27, AO6, INC LFSDUX c31, AO6, INC LFDUX c04, AO7, INC LFDUX c08, AO7, INC LFDUX c12, AO7, INC LFDUX c16, AO7, INC LFDUX c20, AO7, INC LFDUX c24, AO7, INC LFDUX c28, AO7, INC LFDUX c32, AO7, INC LFSDUX c04, AO8, INC LFSDUX c08, AO8, INC LFSDUX c12, AO8, INC LFSDUX c16, AO8, INC LFSDUX c20, AO8, INC LFSDUX c24, AO8, INC LFSDUX c28, AO8, INC LFSDUX c32, AO8, INC STFPDUX c01, B, INC2 STFPDUX c02, B, INC2 STFPDUX c03, B, INC2 STFPDUX c04, B, INC2 STFPDUX c05, B, INC2 STFPDUX c06, B, INC2 STFPDUX c07, B, INC2 STFPDUX c08, B, INC2 STFPDUX c09, B, INC2 STFPDUX c10, B, INC2 STFPDUX c11, B, INC2 STFPDUX c12, B, INC2 STFPDUX c13, B, INC2 STFPDUX c14, B, INC2 STFPDUX c15, B, INC2 STFPDUX c16, B, INC2 STFPDUX c17, B, INC2 STFPDUX c18, B, INC2 STFPDUX c19, B, INC2 STFPDUX c20, B, INC2 STFPDUX c21, B, INC2 STFPDUX c22, B, INC2 STFPDUX c23, B, INC2 STFPDUX c24, B, INC2 STFPDUX c25, B, INC2 STFPDUX c26, B, INC2 STFPDUX c27, B, INC2 STFPDUX c28, B, INC2 STFPDUX c29, B, INC2 STFPDUX c30, B, INC2 STFPDUX c31, B, INC2 STFPDUX c32, B, INC2 bdnz .L112 .align 4 .L115: andi. r0, M, 7 ble .L119 andi. r0, M, 4 beq .L116 LFDUX c01, AO1, INC LFDUX c05, AO1, INC LFDUX c09, AO1, INC LFDUX c13, AO1, INC LFSDUX c01, AO2, INC LFSDUX c05, AO2, INC LFSDUX c09, AO2, INC LFSDUX c13, AO2, INC LFDUX c02, AO3, INC LFDUX c06, AO3, INC LFDUX c10, AO3, INC LFDUX c14, AO3, INC LFSDUX c02, AO4, INC LFSDUX c06, AO4, INC LFSDUX c10, AO4, INC LFSDUX c14, AO4, INC LFDUX c03, AO5, INC LFDUX c07, AO5, INC LFDUX c11, AO5, INC LFDUX c15, AO5, INC LFSDUX c03, AO6, INC LFSDUX c07, AO6, INC LFSDUX c11, AO6, INC LFSDUX c15, AO6, INC LFDUX c04, AO7, INC LFDUX c08, AO7, INC LFDUX c12, AO7, INC LFDUX c16, AO7, INC LFSDUX c04, AO8, INC LFSDUX c08, AO8, INC LFSDUX c12, AO8, INC LFSDUX c16, AO8, INC STFPDUX c01, B, INC2 STFPDUX c02, B, INC2 STFPDUX c03, B, INC2 STFPDUX c04, B, INC2 STFPDUX c05, B, INC2 STFPDUX c06, B, INC2 STFPDUX c07, B, INC2 STFPDUX c08, B, INC2 STFPDUX c09, B, INC2 STFPDUX c10, B, INC2 STFPDUX c11, B, INC2 STFPDUX c12, B, INC2 STFPDUX c13, B, INC2 STFPDUX c14, B, INC2 STFPDUX c15, B, INC2 STFPDUX c16, B, INC2 .align 4 .L116: andi. 
r0, M, 2 beq .L117 LFDUX c01, AO1, INC LFDUX c05, AO1, INC LFDUX c02, AO3, INC LFDUX c06, AO3, INC LFSDUX c01, AO2, INC LFSDUX c05, AO2, INC LFSDUX c02, AO4, INC LFSDUX c06, AO4, INC LFDUX c03, AO5, INC LFDUX c07, AO5, INC LFDUX c04, AO7, INC LFDUX c08, AO7, INC LFSDUX c03, AO6, INC LFSDUX c07, AO6, INC LFSDUX c04, AO8, INC LFSDUX c08, AO8, INC STFPDUX c01, B, INC2 STFPDUX c02, B, INC2 STFPDUX c03, B, INC2 STFPDUX c04, B, INC2 STFPDUX c05, B, INC2 STFPDUX c06, B, INC2 STFPDUX c07, B, INC2 STFPDUX c08, B, INC2 .align 4 .L117: andi. r0, M, 1 beq .L119 LFDUX c01, AO1, INC LFDUX c02, AO3, INC LFDUX c03, AO5, INC LFDUX c04, AO7, INC LFSDUX c01, AO2, INC LFSDUX c02, AO4, INC LFSDUX c03, AO6, INC LFSDUX c04, AO8, INC STFPDUX c01, B, INC2 STFPDUX c02, B, INC2 STFPDUX c03, B, INC2 STFPDUX c04, B, INC2 .align 4 .L119: addic. J, J, -1 bgt .L111 .align 4 .L120: andi. J, N, 4 ble .L130 .align 4 .L121: mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA srawi. r0, M, 3 mtspr CTR, r0 ble .L125 .align 4 .L122: LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFDUX c09, AO1, INC LFDUX c10, AO1, INC LFDUX c11, AO1, INC LFDUX c12, AO1, INC LFSDUX c01, AO2, INC LFSDUX c02, AO2, INC LFSDUX c03, AO2, INC LFSDUX c04, AO2, INC LFSDUX c09, AO2, INC LFSDUX c10, AO2, INC LFSDUX c11, AO2, INC LFSDUX c12, AO2, INC LFDUX c05, AO3, INC LFDUX c06, AO3, INC LFDUX c07, AO3, INC LFDUX c08, AO3, INC LFDUX c13, AO3, INC LFDUX c14, AO3, INC LFDUX c15, AO3, INC LFDUX c16, AO3, INC LFSDUX c05, AO4, INC LFSDUX c06, AO4, INC LFSDUX c07, AO4, INC LFSDUX c08, AO4, INC LFSDUX c13, AO4, INC LFSDUX c14, AO4, INC LFSDUX c15, AO4, INC LFSDUX c16, AO4, INC STFPDUX c01, B, INC2 STFPDUX c05, B, INC2 STFPDUX c02, B, INC2 STFPDUX c06, B, INC2 STFPDUX c03, B, INC2 STFPDUX c07, B, INC2 STFPDUX c04, B, INC2 STFPDUX c08, B, INC2 STFPDUX c09, B, INC2 STFPDUX c13, B, INC2 STFPDUX c10, B, INC2 STFPDUX c14, B, INC2 STFPDUX c11, B, INC2 STFPDUX c15, B, INC2 STFPDUX c12, B, INC2 STFPDUX c16, B, INC2 bdnz .L122 .align 4 .L125: andi. r0, M, 7 ble .L130 andi. r0, M, 4 beq .L126 LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFSDUX c01, AO2, INC LFSDUX c02, AO2, INC LFSDUX c03, AO2, INC LFSDUX c04, AO2, INC LFDUX c05, AO3, INC LFDUX c06, AO3, INC LFDUX c07, AO3, INC LFDUX c08, AO3, INC LFSDUX c05, AO4, INC LFSDUX c06, AO4, INC LFSDUX c07, AO4, INC LFSDUX c08, AO4, INC STFPDUX c01, B, INC2 STFPDUX c05, B, INC2 STFPDUX c02, B, INC2 STFPDUX c06, B, INC2 STFPDUX c03, B, INC2 STFPDUX c07, B, INC2 STFPDUX c04, B, INC2 STFPDUX c08, B, INC2 .align 4 .L126: andi. r0, M, 2 beq .L127 LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFSDUX c01, AO2, INC LFSDUX c02, AO2, INC LFDUX c05, AO3, INC LFDUX c06, AO3, INC LFSDUX c05, AO4, INC LFSDUX c06, AO4, INC STFPDUX c01, B, INC2 STFPDUX c05, B, INC2 STFPDUX c02, B, INC2 STFPDUX c06, B, INC2 .align 4 .L127: andi. r0, M, 1 beq .L130 LFDUX c01, AO1, INC LFDUX c05, AO3, INC nop nop LFSDUX c01, AO2, INC LFSDUX c05, AO4, INC STFPDUX c01, B, INC2 STFPDUX c05, B, INC2 .align 4 .L130: andi. J, N, 2 ble .L140 mr AO1, A add AO2, A, LDA add A, AO2, LDA srawi. 
r0, M, 3 mtspr CTR, r0 ble .L135 .align 4 .L132: LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFDUX c09, AO1, INC LFDUX c10, AO1, INC LFDUX c11, AO1, INC LFDUX c12, AO1, INC LFSDUX c01, AO2, INC LFSDUX c02, AO2, INC LFSDUX c03, AO2, INC LFSDUX c04, AO2, INC LFSDUX c09, AO2, INC LFSDUX c10, AO2, INC LFSDUX c11, AO2, INC LFSDUX c12, AO2, INC STFPDUX c01, B, INC2 STFPDUX c02, B, INC2 STFPDUX c03, B, INC2 STFPDUX c04, B, INC2 STFPDUX c09, B, INC2 STFPDUX c10, B, INC2 STFPDUX c11, B, INC2 STFPDUX c12, B, INC2 bdnz .L132 .align 4 .L135: andi. r0, M, 7 ble .L140 andi. r0, M, 4 beq .L136 LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFSDUX c01, AO2, INC LFSDUX c02, AO2, INC LFSDUX c03, AO2, INC LFSDUX c04, AO2, INC STFPDUX c01, B, INC2 STFPDUX c02, B, INC2 STFPDUX c03, B, INC2 STFPDUX c04, B, INC2 .align 4 .L136: andi. r0, M, 2 beq .L137 LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFSDUX c01, AO2, INC LFSDUX c02, AO2, INC STFPDUX c01, B, INC2 STFPDUX c02, B, INC2 .align 4 .L137: andi. r0, M, 1 beq .L140 LFDUX c01, AO1, INC LFDUX c02, AO2, INC fsmfp c01, c02 STFPDUX c01, B, INC2 .align 4 .L140: andi. J, N, 1 ble .L999 mr AO1, A srawi. r0, M, 3 mtspr CTR, r0 ble .L145 .align 4 .L142: LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFDUX c05, AO1, INC LFDUX c06, AO1, INC LFDUX c07, AO1, INC LFDUX c08, AO1, INC fsmfp c01, c02 fsmfp c03, c04 fsmfp c05, c06 fsmfp c07, c08 STFPDUX c01, B, INC2 STFPDUX c03, B, INC2 STFPDUX c05, B, INC2 STFPDUX c07, B, INC2 bdnz .L142 .align 4 .L145: andi. r0, M, 7 ble .L999 andi. r0, M, 4 beq .L146 LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC fsmfp c01, c02 fsmfp c03, c04 STFPDUX c01, B, INC2 STFPDUX c03, B, INC2 .align 4 .L146: andi. r0, M, 2 beq .L147 LFDUX c01, AO1, INC LFDUX c02, AO1, INC fsmfp c01, c02 STFPDUX c01, B, INC2 .align 4 .L147: andi. r0, M, 1 beq .L999 LFDX c01, AO1, INC STFDX c01, B, INC2 .align 4 .L999: addi SP, SP, 4 lwzu r26, 4(SP) lwzu r27, 4(SP) lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f31, SP, r0 lfpdux f30, SP, r0 lfpdux f29, SP, r0 lfpdux f28, SP, r0 lfpdux f27, SP, r0 lfpdux f26, SP, r0 lfpdux f25, SP, r0 lfpdux f24, SP, r0 lfpdux f23, SP, r0 lfpdux f22, SP, r0 lfpdux f21, SP, r0 lfpdux f20, SP, r0 lfpdux f19, SP, r0 lfpdux f18, SP, r0 lfpdux f17, SP, r0 lfpdux f16, SP, r0 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/gemm_tcopy_4.S000066400000000000000000000217351313527062700203020ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M r3 #define N r4 #define A r5 #define LDA r6 #define B r7 #define AO1 r8 #define AO2 r9 #define AO3 r10 #define AO4 r11 #define J r12 #define PREA r14 #define PREB1 r15 #define B1 r16 #define B2 r17 #define B3 r18 #define M4 r19 #define c01 f0 #define c02 f1 #define c03 f2 #define c04 f3 #define c05 f4 #define c06 f5 #define c07 f6 #define c08 f7 #define c09 f8 #define c10 f9 #define c11 f10 #define c12 f11 #define c13 f12 #define c14 f13 #define c15 f14 #define c16 f15 #define STACKSIZE 64 #ifdef CELL #define PREFETCHSIZE 16 #define PREFETCHWSIZE 48 #endif #ifdef PPC970 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 48 #endif #ifdef PPC440 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 48 #endif #ifdef POWER4 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 48 #endif #ifdef POWER5 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 48 #endif #ifdef POWER6 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 48 #endif #ifdef PPCG4 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 48 #endif #ifdef POWER8 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 48 #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) #ifdef __64BIT__ std r14, 16(SP) std r15, 24(SP) std r16, 32(SP) std r17, 40(SP) std r18, 48(SP) std r19, 56(SP) #else stw r14, 16(SP) stw r15, 20(SP) stw r16, 24(SP) stw r17, 28(SP) stw r18, 32(SP) stw r19, 36(SP) #endif slwi LDA, LDA, BASE_SHIFT slwi M4, M, 2 + BASE_SHIFT li PREA, -4 li PREB1, -2 and B2, N, PREA and B3, N, PREB1 mullw B2, B2, M mullw B3, B3, M slwi B2, B2, BASE_SHIFT slwi B3, B3, BASE_SHIFT add B2, B2, B add B3, B3, B li PREA, PREFETCHSIZE * SIZE li PREB1, (PREFETCHWSIZE + 0) * SIZE cmpwi cr0, M, 0 ble- LL(999) cmpwi cr0, N, 0 ble- LL(999) srawi. J, M, 2 ble LL(20) .align 4 LL(10): mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA mr B1, B addi B, B, 16 * SIZE srawi. 
r0, N, 2 mtspr CTR, r0 ble LL(13) .align 4 LL(12): LFD c01, 0 * SIZE(AO1) LFD c02, 1 * SIZE(AO1) LFD c03, 2 * SIZE(AO1) LFD c04, 3 * SIZE(AO1) LFD c05, 0 * SIZE(AO2) LFD c06, 1 * SIZE(AO2) LFD c07, 2 * SIZE(AO2) LFD c08, 3 * SIZE(AO2) LFD c09, 0 * SIZE(AO3) LFD c10, 1 * SIZE(AO3) LFD c11, 2 * SIZE(AO3) LFD c12, 3 * SIZE(AO3) LFD c13, 0 * SIZE(AO4) LFD c14, 1 * SIZE(AO4) LFD c15, 2 * SIZE(AO4) LFD c16, 3 * SIZE(AO4) STFD c01, 0 * SIZE(B1) STFD c02, 1 * SIZE(B1) STFD c03, 2 * SIZE(B1) STFD c04, 3 * SIZE(B1) STFD c05, 4 * SIZE(B1) STFD c06, 5 * SIZE(B1) STFD c07, 6 * SIZE(B1) STFD c08, 7 * SIZE(B1) STFD c09, 8 * SIZE(B1) STFD c10, 9 * SIZE(B1) STFD c11, 10 * SIZE(B1) STFD c12, 11 * SIZE(B1) STFD c13, 12 * SIZE(B1) STFD c14, 13 * SIZE(B1) STFD c15, 14 * SIZE(B1) STFD c16, 15 * SIZE(B1) #if defined(POWER6) || defined(POWER8) dcbtst PREA, AO1 dcbtst PREA, AO2 dcbtst PREA, AO3 dcbtst PREA, AO4 #else dcbt PREA, AO1 dcbt PREA, AO2 dcbt PREA, AO3 dcbt PREA, AO4 #endif dcbtst PREB1, B addi AO1, AO1, 4 * SIZE addi AO2, AO2, 4 * SIZE addi AO3, AO3, 4 * SIZE addi AO4, AO4, 4 * SIZE add B1, B1, M4 bdnz LL(12) .align 4 LL(13): andi. r0, N, 2 ble LL(14) LFD c01, 0 * SIZE(AO1) LFD c02, 1 * SIZE(AO1) LFD c03, 0 * SIZE(AO2) LFD c04, 1 * SIZE(AO2) LFD c05, 0 * SIZE(AO3) LFD c06, 1 * SIZE(AO3) LFD c07, 0 * SIZE(AO4) LFD c08, 1 * SIZE(AO4) STFD c01, 0 * SIZE(B2) STFD c02, 1 * SIZE(B2) STFD c03, 2 * SIZE(B2) STFD c04, 3 * SIZE(B2) STFD c05, 4 * SIZE(B2) STFD c06, 5 * SIZE(B2) STFD c07, 6 * SIZE(B2) STFD c08, 7 * SIZE(B2) addi AO1, AO1, 2 * SIZE addi AO2, AO2, 2 * SIZE addi AO3, AO3, 2 * SIZE addi AO4, AO4, 2 * SIZE addi B2, B2, 8 * SIZE .align 4 LL(14): andi. r0, N, 1 ble LL(17) LFD c01, 0 * SIZE(AO1) LFD c02, 0 * SIZE(AO2) LFD c03, 0 * SIZE(AO3) LFD c04, 0 * SIZE(AO4) STFD c01, 0 * SIZE(B3) STFD c02, 1 * SIZE(B3) STFD c03, 2 * SIZE(B3) STFD c04, 3 * SIZE(B3) addi B3, B3, 4 * SIZE .align 4 LL(17): addic. J, J, -1 bgt LL(10) .align 4 LL(20): andi. J, M, 2 ble LL(30) mr AO1, A add AO2, A, LDA add A, AO2, LDA mr B1, B addi B, B, 8 * SIZE srawi. r0, N, 2 mtspr CTR, r0 ble LL(23) .align 4 LL(22): LFD c01, 0 * SIZE(AO1) LFD c02, 1 * SIZE(AO1) LFD c03, 2 * SIZE(AO1) LFD c04, 3 * SIZE(AO1) LFD c05, 0 * SIZE(AO2) LFD c06, 1 * SIZE(AO2) LFD c07, 2 * SIZE(AO2) LFD c08, 3 * SIZE(AO2) STFD c01, 0 * SIZE(B1) STFD c02, 1 * SIZE(B1) STFD c03, 2 * SIZE(B1) STFD c04, 3 * SIZE(B1) STFD c05, 4 * SIZE(B1) STFD c06, 5 * SIZE(B1) STFD c07, 6 * SIZE(B1) STFD c08, 7 * SIZE(B1) addi AO1, AO1, 4 * SIZE addi AO2, AO2, 4 * SIZE add B1, B1, M4 bdnz LL(22) .align 4 LL(23): andi. r0, N, 2 ble LL(24) LFD c01, 0 * SIZE(AO1) LFD c02, 1 * SIZE(AO1) LFD c03, 0 * SIZE(AO2) LFD c04, 1 * SIZE(AO2) STFD c01, 0 * SIZE(B2) STFD c02, 1 * SIZE(B2) STFD c03, 2 * SIZE(B2) STFD c04, 3 * SIZE(B2) addi AO1, AO1, 2 * SIZE addi AO2, AO2, 2 * SIZE addi B2, B2, 4 * SIZE .align 4 LL(24): andi. r0, N, 1 ble LL(30) LFD c01, 0 * SIZE(AO1) LFD c02, 0 * SIZE(AO2) STFD c01, 0 * SIZE(B3) STFD c02, 1 * SIZE(B3) addi B3, B3, 2 * SIZE .align 4 LL(30): andi. J, M, 1 ble LL(999) mr AO1, A mr B1, B srawi. r0, N, 2 mtspr CTR, r0 ble LL(33) .align 4 LL(32): LFD c01, 0 * SIZE(AO1) LFD c02, 1 * SIZE(AO1) LFD c03, 2 * SIZE(AO1) LFD c04, 3 * SIZE(AO1) STFD c01, 0 * SIZE(B1) STFD c02, 1 * SIZE(B1) STFD c03, 2 * SIZE(B1) STFD c04, 3 * SIZE(B1) addi AO1, AO1, 4 * SIZE add B1, B1, M4 bdnz LL(32) .align 4 LL(33): andi. 
r0, N, 2 ble LL(34) LFD c01, 0 * SIZE(AO1) LFD c02, 1 * SIZE(AO1) STFD c01, 0 * SIZE(B2) STFD c02, 1 * SIZE(B2) addi AO1, AO1, 2 * SIZE addi B2, B2, 2 * SIZE .align 4 LL(34): andi. r0, N, 1 ble LL(999) LFD c01, 0 * SIZE(AO1) STFD c01, 0 * SIZE(B3) .align 4 LL(999): li r3, 0 lfd f14, 0(SP) lfd f15, 8(SP) #ifdef __64BIT__ ld r14, 16(SP) ld r15, 24(SP) ld r16, 32(SP) ld r17, 40(SP) ld r18, 48(SP) ld r19, 56(SP) #else lwz r14, 16(SP) lwz r15, 20(SP) lwz r16, 24(SP) lwz r17, 28(SP) lwz r18, 32(SP) lwz r19, 36(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/gemm_tcopy_hummer_4.S000066400000000000000000000222421313527062700216510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M r3 #define N r4 #define A r5 #define LDA r6 #define B r7 #define AO1 r8 #define AO2 r9 #define AO3 r10 #define AO4 r11 #define J r25 #define B1 r26 #define B2 r27 #define B3 r28 #define M4 r29 #define INC r30 #define INC2 r31 #define c01 f0 #define c02 f1 #define c03 f2 #define c04 f3 #define c05 f4 #define c06 f5 #define c07 f6 #define c08 f7 PROLOGUE PROFCODE stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) stwu r28, -4(SP) stwu r27, -4(SP) stwu r26, -4(SP) stwu r25, -4(SP) slwi LDA, LDA, BASE_SHIFT slwi M4, M, 2 + BASE_SHIFT li r8, -4 li r9, -2 and B2, N, r8 and B3, N, r9 mullw B2, B2, M mullw B3, B3, M slwi B2, B2, BASE_SHIFT slwi B3, B3, BASE_SHIFT add B2, B2, B add B3, B3, B cmpwi cr0, M, 0 ble- .L99 cmpwi cr0, N, 0 ble- .L99 subi B2, B2, 2 * SIZE subi B3, B3, 2 * SIZE subi M4, M4, 14 * SIZE li INC, 1 * SIZE li INC2, 2 * SIZE andi. r0, A, 2 * SIZE - 1 bne .L100 andi. 
r0, LDA, 2 * SIZE - 1 bne .L100 subi A, A, 2 * SIZE srawi. J, M, 2 ble .L20 .align 4 .L10: mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA sub B1, B, M4 addi B, B, 16 * SIZE srawi. r0, N, 2 mtspr CTR, r0 ble .L15 .align 4 .L12: LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO2, INC2 LFPDUX c04, AO2, INC2 LFPDUX c05, AO3, INC2 LFPDUX c06, AO3, INC2 LFPDUX c07, AO4, INC2 LFPDUX c08, AO4, INC2 STFPDUX c01, B1, M4 STFPDUX c02, B1, INC2 STFPDUX c03, B1, INC2 STFPDUX c04, B1, INC2 STFPDUX c05, B1, INC2 STFPDUX c06, B1, INC2 STFPDUX c07, B1, INC2 STFPDUX c08, B1, INC2 bdnz .L12 .align 4 .L15: andi. r0, N, 3 ble .L19 andi. r0, N, 2 ble .L17 LFPDUX c01, AO1, INC2 LFPDUX c03, AO2, INC2 LFPDUX c05, AO3, INC2 LFPDUX c07, AO4, INC2 STFPDUX c01, B2, INC2 STFPDUX c03, B2, INC2 STFPDUX c05, B2, INC2 STFPDUX c07, B2, INC2 .align 4 .L17: andi. r0, N, 1 ble .L19 LFDUX c01, AO1, INC2 LFDUX c02, AO2, INC2 LFDUX c03, AO3, INC2 LFDUX c04, AO4, INC2 fsmfp c01, c02 fsmfp c03, c04 STFPDUX c01, B3, INC2 STFPDUX c03, B3, INC2 .align 4 .L19: addic. J, J, -1 bgt .L10 .align 4 .L20: andi. J, M, 2 addi M4, M4, 8 * SIZE ble .L30 mr AO1, A add AO2, A, LDA add A, AO2, LDA sub B1, B, M4 addi B, B, 8 * SIZE srawi. r0, N, 2 mtspr CTR, r0 ble .L23 .align 4 .L22: LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO2, INC2 LFPDUX c04, AO2, INC2 STFPDUX c01, B1, M4 STFPDUX c02, B1, INC2 STFPDUX c03, B1, INC2 STFPDUX c04, B1, INC2 bdnz .L22 .align 4 .L23: andi. r0, N, 2 ble .L24 LFPDUX c01, AO1, INC2 LFPDUX c03, AO2, INC2 STFPDUX c01, B2, INC2 STFPDUX c03, B2, INC2 .align 4 .L24: andi. r0, N, 1 ble .L30 LFDUX c01, AO1, INC2 LFDUX c02, AO2, INC2 fsmfp c01, c02 STFPDUX c01, B3, INC2 .align 4 .L30: andi. J, M, 1 addi M4, M4, 4 * SIZE ble .L99 mr AO1, A sub B1, B, M4 srawi. r0, N, 2 mtspr CTR, r0 ble .L33 .align 4 .L32: LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 STFPDUX c01, B1, M4 STFPDUX c02, B1, INC2 bdnz .L32 .align 4 .L33: andi. r0, N, 2 ble .L34 LFPDUX c01, AO1, INC2 STFPDUX c01, B2, INC2 .align 4 .L34: andi. r0, N, 1 ble .L99 LFDX c01, AO1, INC2 STFDX c01, B3, INC2 .align 4 .L99: addi SP, SP, -4 lwzu r25, 4(SP) lwzu r26, 4(SP) lwzu r27, 4(SP) lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) addi SP, SP, 4 blr .L100: subi A, A, SIZE srawi. J, M, 2 ble .L120 .align 4 .L110: mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA sub B1, B, M4 addi B, B, 16 * SIZE srawi. r0, N, 2 mtspr CTR, r0 ble .L115 .align 4 .L112: LFDUX c01, AO1, INC LFDUX c03, AO2, INC LFDUX c05, AO3, INC LFDUX c07, AO4, INC LFSDUX c01, AO1, INC LFSDUX c03, AO2, INC LFSDUX c05, AO3, INC LFSDUX c07, AO4, INC LFDUX c02, AO1, INC LFDUX c04, AO2, INC LFDUX c06, AO3, INC LFDUX c08, AO4, INC LFSDUX c02, AO1, INC LFSDUX c04, AO2, INC LFSDUX c06, AO3, INC LFSDUX c08, AO4, INC STFPDUX c01, B1, M4 STFPDUX c02, B1, INC2 STFPDUX c03, B1, INC2 STFPDUX c04, B1, INC2 STFPDUX c05, B1, INC2 STFPDUX c06, B1, INC2 STFPDUX c07, B1, INC2 STFPDUX c08, B1, INC2 bdnz .L112 .align 4 .L115: andi. r0, N, 3 ble .L119 andi. r0, N, 2 ble .L117 LFDUX c01, AO1, INC LFDUX c03, AO2, INC LFDUX c05, AO3, INC LFDUX c07, AO4, INC LFSDUX c01, AO1, INC LFSDUX c03, AO2, INC LFSDUX c05, AO3, INC LFSDUX c07, AO4, INC STFPDUX c01, B2, INC2 STFPDUX c03, B2, INC2 STFPDUX c05, B2, INC2 STFPDUX c07, B2, INC2 .align 4 .L117: andi. r0, N, 1 ble .L119 LFDUX c01, AO1, INC LFDUX c02, AO2, INC LFDUX c03, AO3, INC LFDUX c04, AO4, INC fsmfp c01, c02 fsmfp c03, c04 STFPDUX c01, B3, INC2 STFPDUX c03, B3, INC2 .align 4 .L119: addic. 
J, J, -1 bgt .L110 .align 4 .L120: andi. J, M, 2 addi M4, M4, 8 * SIZE ble .L130 mr AO1, A add AO2, A, LDA add A, AO2, LDA sub B1, B, M4 addi B, B, 8 * SIZE srawi. r0, N, 2 mtspr CTR, r0 ble .L123 .align 4 .L122: LFDUX c01, AO1, INC LFDUX c03, AO2, INC LFSDUX c01, AO1, INC LFSDUX c03, AO2, INC LFDUX c02, AO1, INC LFDUX c04, AO2, INC LFSDUX c02, AO1, INC LFSDUX c04, AO2, INC STFPDUX c01, B1, M4 STFPDUX c02, B1, INC2 STFPDUX c03, B1, INC2 STFPDUX c04, B1, INC2 bdnz .L122 .align 4 .L123: andi. r0, N, 2 ble .L124 LFDUX c01, AO1, INC LFDUX c03, AO2, INC LFSDUX c01, AO1, INC LFSDUX c03, AO2, INC STFPDUX c01, B2, INC2 STFPDUX c03, B2, INC2 .align 4 .L124: andi. r0, N, 1 ble .L130 LFDUX c01, AO1, INC LFDUX c02, AO2, INC fsmfp c01, c02 STFPDUX c01, B3, INC2 .align 4 .L130: andi. J, M, 1 addi M4, M4, 4 * SIZE ble .L999 mr AO1, A sub B1, B, M4 srawi. r0, N, 2 mtspr CTR, r0 ble .L133 .align 4 .L132: LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC fsmfp c01, c02 fsmfp c03, c04 STFPDUX c01, B1, M4 STFPDUX c03, B1, INC2 bdnz .L132 .align 4 .L133: andi. r0, N, 2 ble .L134 LFDUX c01, AO1, INC LFDUX c02, AO1, INC fsmfp c01, c02 STFPDUX c01, B2, INC2 .align 4 .L134: andi. r0, N, 1 ble .L999 LFDX c01, AO1, INC STFDX c01, B3, INC2 .align 4 .L999: addi SP, SP, -4 lwzu r25, 4(SP) lwzu r26, 4(SP) lwzu r27, 4(SP) lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) addi SP, SP, 4 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/gemm_tcopy_hummer_8.S000066400000000000000000000555311313527062700216640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M r3 #define N r4 #define A r5 #define LDA r6 #define B r7 #define AO1 r8 #define AO2 r9 #define AO3 r10 #define AO4 r11 #define J r12 #define B1 r21 #define B2 r22 #define B3 r23 #define B4 r24 #define M8 r25 #define AO5 r26 #define AO6 r27 #define AO7 r28 #define AO8 r29 #define INC r30 #define INC2 r31 #define c01 f0 #define c02 f1 #define c03 f2 #define c04 f3 #define c05 f4 #define c06 f5 #define c07 f6 #define c08 f7 #define c09 f8 #define c10 f9 #define c11 f10 #define c12 f11 #define c13 f12 #define c14 f13 #define c15 f14 #define c16 f15 #define c17 f16 #define c18 f17 #define c19 f18 #define c20 f19 #define c21 f20 #define c22 f21 #define c23 f22 #define c24 f23 #define c25 f24 #define c26 f25 #define c27 f26 #define c28 f27 #define c29 f28 #define c30 f29 #define c31 f30 #define c32 f31 #define STACKSIZE 64 PROLOGUE PROFCODE li r0, -16 stfpdux f14, SP, r0 stfpdux f15, SP, r0 stfpdux f16, SP, r0 stfpdux f17, SP, r0 stfpdux f18, SP, r0 stfpdux f19, SP, r0 stfpdux f20, SP, r0 stfpdux f21, SP, r0 stfpdux f22, SP, r0 stfpdux f23, SP, r0 stfpdux f24, SP, r0 stfpdux f25, SP, r0 stfpdux f26, SP, r0 stfpdux f27, SP, r0 stfpdux f28, SP, r0 stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) stwu r28, -4(SP) stwu r27, -4(SP) stwu r26, -4(SP) stwu r25, -4(SP) stwu r24, -4(SP) stwu r23, -4(SP) stwu r22, -4(SP) stwu r21, -4(SP) slwi LDA, LDA, BASE_SHIFT slwi M8, M, 3 + BASE_SHIFT li r8, -8 li r9, -4 li r10, -2 and B2, N, r8 and B3, N, r9 and B4, N, r10 mullw B2, B2, M mullw B3, B3, M mullw B4, B4, M slwi B2, B2, BASE_SHIFT slwi B3, B3, BASE_SHIFT slwi B4, B4, BASE_SHIFT add B2, B2, B add B3, B3, B add B4, B4, B cmpwi cr0, M, 0 ble- .L999 cmpwi cr0, N, 0 ble- .L999 subi B2, B2, 2 * SIZE subi B3, B3, 2 * SIZE subi B4, B4, 2 * SIZE subi M8, M8, 62 * SIZE li INC, 1 * SIZE li INC2, 2 * SIZE andi. r0, A, 2 * SIZE - 1 bne .L100 andi. r0, LDA, 2 * SIZE - 1 bne .L100 subi A, A, 2 * SIZE srawi. J, M, 3 ble .L20 .align 4 .L10: mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add AO5, AO4, LDA add AO6, AO5, LDA add AO7, AO6, LDA add AO8, AO7, LDA add A, AO8, LDA sub B1, B, M8 addi B, B, 64 * SIZE srawi. 
r0, N, 3 mtspr CTR, r0 ble .L15 .align 4 .L12: LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO1, INC2 LFPDUX c04, AO1, INC2 LFPDUX c05, AO2, INC2 LFPDUX c06, AO2, INC2 LFPDUX c07, AO2, INC2 LFPDUX c08, AO2, INC2 LFPDUX c09, AO3, INC2 LFPDUX c10, AO3, INC2 LFPDUX c11, AO3, INC2 LFPDUX c12, AO3, INC2 LFPDUX c13, AO4, INC2 LFPDUX c14, AO4, INC2 LFPDUX c15, AO4, INC2 LFPDUX c16, AO4, INC2 LFPDUX c17, AO5, INC2 LFPDUX c18, AO5, INC2 LFPDUX c19, AO5, INC2 LFPDUX c20, AO5, INC2 LFPDUX c21, AO6, INC2 LFPDUX c22, AO6, INC2 LFPDUX c23, AO6, INC2 LFPDUX c24, AO6, INC2 LFPDUX c25, AO7, INC2 LFPDUX c26, AO7, INC2 LFPDUX c27, AO7, INC2 LFPDUX c28, AO7, INC2 LFPDUX c29, AO8, INC2 LFPDUX c30, AO8, INC2 LFPDUX c31, AO8, INC2 LFPDUX c32, AO8, INC2 STFPDUX c01, B1, M8 STFPDUX c02, B1, INC2 STFPDUX c03, B1, INC2 STFPDUX c04, B1, INC2 STFPDUX c05, B1, INC2 STFPDUX c06, B1, INC2 STFPDUX c07, B1, INC2 STFPDUX c08, B1, INC2 STFPDUX c09, B1, INC2 STFPDUX c10, B1, INC2 STFPDUX c11, B1, INC2 STFPDUX c12, B1, INC2 STFPDUX c13, B1, INC2 STFPDUX c14, B1, INC2 STFPDUX c15, B1, INC2 STFPDUX c16, B1, INC2 STFPDUX c17, B1, INC2 STFPDUX c18, B1, INC2 STFPDUX c19, B1, INC2 STFPDUX c20, B1, INC2 STFPDUX c21, B1, INC2 STFPDUX c22, B1, INC2 STFPDUX c23, B1, INC2 STFPDUX c24, B1, INC2 STFPDUX c25, B1, INC2 STFPDUX c26, B1, INC2 STFPDUX c27, B1, INC2 STFPDUX c28, B1, INC2 STFPDUX c29, B1, INC2 STFPDUX c30, B1, INC2 STFPDUX c31, B1, INC2 STFPDUX c32, B1, INC2 bdnz .L12 .align 4 .L15: andi. r0, N, 7 ble .L19 andi. r0, N, 4 ble .L16 LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO2, INC2 LFPDUX c04, AO2, INC2 LFPDUX c05, AO3, INC2 LFPDUX c06, AO3, INC2 LFPDUX c07, AO4, INC2 LFPDUX c08, AO4, INC2 LFPDUX c09, AO5, INC2 LFPDUX c10, AO5, INC2 LFPDUX c11, AO6, INC2 LFPDUX c12, AO6, INC2 LFPDUX c13, AO7, INC2 LFPDUX c14, AO7, INC2 LFPDUX c15, AO8, INC2 LFPDUX c16, AO8, INC2 STFPDUX c01, B2, INC2 STFPDUX c02, B2, INC2 STFPDUX c03, B2, INC2 STFPDUX c04, B2, INC2 STFPDUX c05, B2, INC2 STFPDUX c06, B2, INC2 STFPDUX c07, B2, INC2 STFPDUX c08, B2, INC2 STFPDUX c09, B2, INC2 STFPDUX c10, B2, INC2 STFPDUX c11, B2, INC2 STFPDUX c12, B2, INC2 STFPDUX c13, B2, INC2 STFPDUX c14, B2, INC2 STFPDUX c15, B2, INC2 STFPDUX c16, B2, INC2 .align 4 .L16: andi. r0, N, 2 ble .L17 LFPDUX c01, AO1, INC2 LFPDUX c03, AO2, INC2 LFPDUX c05, AO3, INC2 LFPDUX c07, AO4, INC2 LFPDUX c09, AO5, INC2 LFPDUX c11, AO6, INC2 LFPDUX c13, AO7, INC2 LFPDUX c15, AO8, INC2 STFPDUX c01, B3, INC2 STFPDUX c03, B3, INC2 STFPDUX c05, B3, INC2 STFPDUX c07, B3, INC2 STFPDUX c09, B3, INC2 STFPDUX c11, B3, INC2 STFPDUX c13, B3, INC2 STFPDUX c15, B3, INC2 .align 4 .L17: andi. r0, N, 1 ble .L19 LFDUX c01, AO1, INC2 LFDUX c02, AO3, INC2 LFDUX c03, AO5, INC2 LFDUX c04, AO7, INC2 LFSDUX c01, AO2, INC2 LFSDUX c02, AO4, INC2 LFSDUX c03, AO6, INC2 LFSDUX c04, AO8, INC2 STFPDUX c01, B4, INC2 STFPDUX c02, B4, INC2 STFPDUX c03, B4, INC2 STFPDUX c04, B4, INC2 .align 4 .L19: addic. J, J, -1 bgt .L10 .align 4 .L20: andi. J, M, 4 addi M8, M8, 32 * SIZE ble .L30 mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA sub B1, B, M8 addi B, B, 32 * SIZE srawi. 
r0, N, 3 mtspr CTR, r0 ble .L25 .align 4 .L22: LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO1, INC2 LFPDUX c04, AO1, INC2 LFPDUX c05, AO2, INC2 LFPDUX c06, AO2, INC2 LFPDUX c07, AO2, INC2 LFPDUX c08, AO2, INC2 LFPDUX c09, AO3, INC2 LFPDUX c10, AO3, INC2 LFPDUX c11, AO3, INC2 LFPDUX c12, AO3, INC2 LFPDUX c13, AO4, INC2 LFPDUX c14, AO4, INC2 LFPDUX c15, AO4, INC2 LFPDUX c16, AO4, INC2 STFPDUX c01, B1, M8 STFPDUX c02, B1, INC2 STFPDUX c03, B1, INC2 STFPDUX c04, B1, INC2 STFPDUX c05, B1, INC2 STFPDUX c06, B1, INC2 STFPDUX c07, B1, INC2 STFPDUX c08, B1, INC2 STFPDUX c09, B1, INC2 STFPDUX c10, B1, INC2 STFPDUX c11, B1, INC2 STFPDUX c12, B1, INC2 STFPDUX c13, B1, INC2 STFPDUX c14, B1, INC2 STFPDUX c15, B1, INC2 STFPDUX c16, B1, INC2 bdnz .L22 .align 4 .L25: andi. r0, N, 7 ble .L30 andi. r0, N, 4 ble .L26 LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO2, INC2 LFPDUX c04, AO2, INC2 LFPDUX c05, AO3, INC2 LFPDUX c06, AO3, INC2 LFPDUX c07, AO4, INC2 LFPDUX c08, AO4, INC2 STFPDUX c01, B2, INC2 STFPDUX c02, B2, INC2 STFPDUX c03, B2, INC2 STFPDUX c04, B2, INC2 STFPDUX c05, B2, INC2 STFPDUX c06, B2, INC2 STFPDUX c07, B2, INC2 STFPDUX c08, B2, INC2 .align 4 .L26: andi. r0, N, 2 ble .L27 LFPDUX c01, AO1, INC2 LFPDUX c03, AO2, INC2 LFPDUX c05, AO3, INC2 LFPDUX c07, AO4, INC2 STFPDUX c01, B3, INC2 STFPDUX c03, B3, INC2 STFPDUX c05, B3, INC2 STFPDUX c07, B3, INC2 .align 4 .L27: andi. r0, N, 1 ble .L30 LFDUX c01, AO1, INC2 LFDUX c02, AO2, INC2 LFDUX c03, AO3, INC2 LFDUX c04, AO4, INC2 fsmfp c01, c02 fsmfp c03, c04 STFPDUX c01, B4, INC2 STFPDUX c03, B4, INC2 .align 4 .L30: andi. J, M, 2 addi M8, M8, 16 * SIZE ble .L40 mr AO1, A add AO2, A, LDA add A, AO2, LDA sub B1, B, M8 addi B, B, 16 * SIZE srawi. r0, N, 3 mtspr CTR, r0 ble .L35 .align 4 .L32: LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO1, INC2 LFPDUX c04, AO1, INC2 LFPDUX c05, AO2, INC2 LFPDUX c06, AO2, INC2 LFPDUX c07, AO2, INC2 LFPDUX c08, AO2, INC2 STFPDUX c01, B1, M8 STFPDUX c02, B1, INC2 STFPDUX c03, B1, INC2 STFPDUX c04, B1, INC2 STFPDUX c05, B1, INC2 STFPDUX c06, B1, INC2 STFPDUX c07, B1, INC2 STFPDUX c08, B1, INC2 bdnz .L32 .align 4 .L35: andi. r0, N, 7 ble .L40 andi. r0, N, 4 ble .L36 LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO2, INC2 LFPDUX c04, AO2, INC2 STFPDUX c01, B2, INC2 STFPDUX c02, B2, INC2 STFPDUX c03, B2, INC2 STFPDUX c04, B2, INC2 .align 4 .L36: andi. r0, N, 2 ble .L37 LFPDUX c01, AO1, INC2 LFPDUX c02, AO2, INC2 STFPDUX c01, B3, INC2 STFPDUX c02, B3, INC2 .align 4 .L37: andi. r0, N, 1 ble .L40 LFDUX c01, AO1, INC2 LFDUX c02, AO2, INC2 fsmfp c01, c02 STFPDUX c01, B4, INC2 .align 4 .L40: andi. J, M, 1 addi M8, M8, 8 * SIZE ble .L999 mr AO1, A sub B1, B, M8 srawi. r0, N, 3 mtspr CTR, r0 ble .L45 .align 4 .L42: LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO1, INC2 LFPDUX c04, AO1, INC2 STFPDUX c01, B1, M8 STFPDUX c02, B1, INC2 STFPDUX c03, B1, INC2 STFPDUX c04, B1, INC2 bdnz .L42 .align 4 .L45: andi. r0, N, 7 ble .L999 andi. r0, N, 4 ble .L46 LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 STFPDUX c01, B2, INC2 STFPDUX c02, B2, INC2 .align 4 .L46: andi. r0, N, 2 ble .L47 LFPDUX c01, AO1, INC2 STFPDUX c01, B3, INC2 .align 4 .L47: andi. r0, N, 1 ble .L999 LFDX c01, AO1, INC2 STFDX c01, B4, INC2 b .L999 .align 4 .L100: subi A, A, SIZE srawi. J, M, 3 ble .L120 .align 4 .L110: mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add AO5, AO4, LDA add AO6, AO5, LDA add AO7, AO6, LDA add AO8, AO7, LDA add A, AO8, LDA sub B1, B, M8 addi B, B, 64 * SIZE srawi. 
r0, N, 3 mtspr CTR, r0 ble .L115 .align 4 .L112: LFDUX c01, AO1, INC LFDUX c05, AO2, INC LFDUX c09, AO3, INC LFDUX c13, AO4, INC LFSDUX c01, AO1, INC LFSDUX c05, AO2, INC LFSDUX c09, AO3, INC LFSDUX c13, AO4, INC LFDUX c02, AO1, INC LFDUX c06, AO2, INC LFDUX c10, AO3, INC LFDUX c14, AO4, INC LFSDUX c02, AO1, INC LFSDUX c06, AO2, INC LFSDUX c10, AO3, INC LFSDUX c14, AO4, INC LFDUX c03, AO1, INC LFDUX c07, AO2, INC LFDUX c11, AO3, INC LFDUX c15, AO4, INC LFSDUX c03, AO1, INC LFSDUX c07, AO2, INC LFSDUX c11, AO3, INC LFSDUX c15, AO4, INC LFDUX c04, AO1, INC LFDUX c08, AO2, INC LFDUX c12, AO3, INC LFDUX c16, AO4, INC LFSDUX c04, AO1, INC LFSDUX c08, AO2, INC LFSDUX c12, AO3, INC LFSDUX c16, AO4, INC LFDUX c17, AO5, INC LFDUX c21, AO6, INC LFDUX c25, AO7, INC LFDUX c29, AO8, INC LFSDUX c17, AO5, INC LFSDUX c21, AO6, INC LFSDUX c25, AO7, INC LFSDUX c29, AO8, INC LFDUX c18, AO5, INC LFDUX c22, AO6, INC LFDUX c26, AO7, INC LFDUX c30, AO8, INC LFSDUX c18, AO5, INC LFSDUX c22, AO6, INC LFSDUX c26, AO7, INC LFSDUX c30, AO8, INC LFDUX c19, AO5, INC LFDUX c23, AO6, INC LFDUX c27, AO7, INC LFDUX c31, AO8, INC LFSDUX c19, AO5, INC LFSDUX c23, AO6, INC LFSDUX c27, AO7, INC LFSDUX c31, AO8, INC LFDUX c20, AO5, INC LFDUX c24, AO6, INC LFDUX c28, AO7, INC LFDUX c32, AO8, INC LFSDUX c20, AO5, INC LFSDUX c24, AO6, INC LFSDUX c28, AO7, INC LFSDUX c32, AO8, INC STFPDUX c01, B1, M8 STFPDUX c02, B1, INC2 STFPDUX c03, B1, INC2 STFPDUX c04, B1, INC2 STFPDUX c05, B1, INC2 STFPDUX c06, B1, INC2 STFPDUX c07, B1, INC2 STFPDUX c08, B1, INC2 STFPDUX c09, B1, INC2 STFPDUX c10, B1, INC2 STFPDUX c11, B1, INC2 STFPDUX c12, B1, INC2 STFPDUX c13, B1, INC2 STFPDUX c14, B1, INC2 STFPDUX c15, B1, INC2 STFPDUX c16, B1, INC2 STFPDUX c17, B1, INC2 STFPDUX c18, B1, INC2 STFPDUX c19, B1, INC2 STFPDUX c20, B1, INC2 STFPDUX c21, B1, INC2 STFPDUX c22, B1, INC2 STFPDUX c23, B1, INC2 STFPDUX c24, B1, INC2 STFPDUX c25, B1, INC2 STFPDUX c26, B1, INC2 STFPDUX c27, B1, INC2 STFPDUX c28, B1, INC2 STFPDUX c29, B1, INC2 STFPDUX c30, B1, INC2 STFPDUX c31, B1, INC2 STFPDUX c32, B1, INC2 bdnz .L112 .align 4 .L115: andi. r0, N, 7 ble .L119 andi. r0, N, 4 ble .L116 LFDUX c01, AO1, INC LFDUX c03, AO2, INC LFDUX c05, AO3, INC LFDUX c07, AO4, INC LFSDUX c01, AO1, INC LFSDUX c03, AO2, INC LFSDUX c05, AO3, INC LFSDUX c07, AO4, INC LFDUX c02, AO1, INC LFDUX c04, AO2, INC LFDUX c06, AO3, INC LFDUX c08, AO4, INC LFSDUX c02, AO1, INC LFSDUX c04, AO2, INC LFSDUX c06, AO3, INC LFSDUX c08, AO4, INC LFDUX c09, AO5, INC LFDUX c11, AO6, INC LFDUX c13, AO7, INC LFDUX c15, AO8, INC LFSDUX c09, AO5, INC LFSDUX c11, AO6, INC LFSDUX c13, AO7, INC LFSDUX c15, AO8, INC LFDUX c10, AO5, INC LFDUX c12, AO6, INC LFDUX c14, AO7, INC LFDUX c16, AO8, INC LFSDUX c10, AO5, INC LFSDUX c12, AO6, INC LFSDUX c14, AO7, INC LFSDUX c16, AO8, INC STFPDUX c01, B2, INC2 STFPDUX c02, B2, INC2 STFPDUX c03, B2, INC2 STFPDUX c04, B2, INC2 STFPDUX c05, B2, INC2 STFPDUX c06, B2, INC2 STFPDUX c07, B2, INC2 STFPDUX c08, B2, INC2 STFPDUX c09, B2, INC2 STFPDUX c10, B2, INC2 STFPDUX c11, B2, INC2 STFPDUX c12, B2, INC2 STFPDUX c13, B2, INC2 STFPDUX c14, B2, INC2 STFPDUX c15, B2, INC2 STFPDUX c16, B2, INC2 .align 4 .L116: andi. 
r0, N, 2 ble .L117 LFDUX c01, AO1, INC LFDUX c03, AO2, INC LFDUX c05, AO3, INC LFDUX c07, AO4, INC LFSDUX c01, AO1, INC LFSDUX c03, AO2, INC LFSDUX c05, AO3, INC LFSDUX c07, AO4, INC LFDUX c09, AO5, INC LFDUX c11, AO6, INC LFDUX c13, AO7, INC LFDUX c15, AO8, INC LFSDUX c09, AO5, INC LFSDUX c11, AO6, INC LFSDUX c13, AO7, INC LFSDUX c15, AO8, INC STFPDUX c01, B3, INC2 STFPDUX c03, B3, INC2 STFPDUX c05, B3, INC2 STFPDUX c07, B3, INC2 STFPDUX c09, B3, INC2 STFPDUX c11, B3, INC2 STFPDUX c13, B3, INC2 STFPDUX c15, B3, INC2 .align 4 .L117: andi. r0, N, 1 ble .L119 LFDUX c01, AO1, INC LFDUX c02, AO3, INC LFDUX c03, AO5, INC LFDUX c04, AO7, INC LFSDUX c01, AO2, INC LFSDUX c02, AO4, INC LFSDUX c03, AO6, INC LFSDUX c04, AO8, INC STFPDUX c01, B4, INC2 STFPDUX c02, B4, INC2 STFPDUX c03, B4, INC2 STFPDUX c04, B4, INC2 .align 4 .L119: addic. J, J, -1 bgt .L110 .align 4 .L120: andi. J, M, 4 addi M8, M8, 32 * SIZE ble .L130 mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA sub B1, B, M8 addi B, B, 32 * SIZE srawi. r0, N, 3 mtspr CTR, r0 ble .L125 .align 4 .L122: LFDUX c01, AO1, INC LFDUX c05, AO2, INC LFDUX c09, AO3, INC LFDUX c13, AO4, INC LFSDUX c01, AO1, INC LFSDUX c05, AO2, INC LFSDUX c09, AO3, INC LFSDUX c13, AO4, INC LFDUX c02, AO1, INC LFDUX c06, AO2, INC LFDUX c10, AO3, INC LFDUX c14, AO4, INC LFSDUX c02, AO1, INC LFSDUX c06, AO2, INC LFSDUX c10, AO3, INC LFSDUX c14, AO4, INC LFDUX c03, AO1, INC LFDUX c07, AO2, INC LFDUX c11, AO3, INC LFDUX c15, AO4, INC LFSDUX c03, AO1, INC LFSDUX c07, AO2, INC LFSDUX c11, AO3, INC LFSDUX c15, AO4, INC LFDUX c04, AO1, INC LFDUX c08, AO2, INC LFDUX c12, AO3, INC LFDUX c16, AO4, INC LFSDUX c04, AO1, INC LFSDUX c08, AO2, INC LFSDUX c12, AO3, INC LFSDUX c16, AO4, INC STFPDUX c01, B1, M8 STFPDUX c02, B1, INC2 STFPDUX c03, B1, INC2 STFPDUX c04, B1, INC2 STFPDUX c05, B1, INC2 STFPDUX c06, B1, INC2 STFPDUX c07, B1, INC2 STFPDUX c08, B1, INC2 STFPDUX c09, B1, INC2 STFPDUX c10, B1, INC2 STFPDUX c11, B1, INC2 STFPDUX c12, B1, INC2 STFPDUX c13, B1, INC2 STFPDUX c14, B1, INC2 STFPDUX c15, B1, INC2 STFPDUX c16, B1, INC2 bdnz .L122 .align 4 .L125: andi. r0, N, 7 ble .L130 andi. r0, N, 4 ble .L126 LFDUX c01, AO1, INC LFDUX c03, AO2, INC LFDUX c05, AO3, INC LFDUX c07, AO4, INC LFSDUX c01, AO1, INC LFSDUX c03, AO2, INC LFSDUX c05, AO3, INC LFSDUX c07, AO4, INC LFDUX c02, AO1, INC LFDUX c04, AO2, INC LFDUX c06, AO3, INC LFDUX c08, AO4, INC LFSDUX c02, AO1, INC LFSDUX c04, AO2, INC LFSDUX c06, AO3, INC LFSDUX c08, AO4, INC STFPDUX c01, B2, INC2 STFPDUX c02, B2, INC2 STFPDUX c03, B2, INC2 STFPDUX c04, B2, INC2 STFPDUX c05, B2, INC2 STFPDUX c06, B2, INC2 STFPDUX c07, B2, INC2 STFPDUX c08, B2, INC2 .align 4 .L126: andi. r0, N, 2 ble .L127 LFDUX c01, AO1, INC LFDUX c03, AO2, INC LFDUX c05, AO3, INC LFDUX c07, AO4, INC LFSDUX c01, AO1, INC LFSDUX c03, AO2, INC LFSDUX c05, AO3, INC LFSDUX c07, AO4, INC STFPDUX c01, B3, INC2 STFPDUX c03, B3, INC2 STFPDUX c05, B3, INC2 STFPDUX c07, B3, INC2 .align 4 .L127: andi. r0, N, 1 ble .L130 LFDUX c01, AO1, INC LFDUX c02, AO2, INC LFDUX c03, AO3, INC LFDUX c04, AO4, INC fsmfp c01, c02 fsmfp c03, c04 STFPDUX c01, B4, INC2 STFPDUX c03, B4, INC2 .align 4 .L130: andi. J, M, 2 addi M8, M8, 16 * SIZE ble .L140 mr AO1, A add AO2, A, LDA add A, AO2, LDA sub B1, B, M8 addi B, B, 16 * SIZE srawi. 
r0, N, 3 mtspr CTR, r0 ble .L135 .align 4 .L132: LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFDUX c05, AO1, INC LFDUX c06, AO1, INC LFDUX c07, AO1, INC LFDUX c08, AO1, INC LFDUX c09, AO2, INC LFDUX c10, AO2, INC LFDUX c11, AO2, INC LFDUX c12, AO2, INC LFDUX c13, AO2, INC LFDUX c14, AO2, INC LFDUX c15, AO2, INC LFDUX c16, AO2, INC fsmfp c01, c02 fsmfp c03, c04 fsmfp c05, c06 fsmfp c07, c08 fsmfp c09, c10 fsmfp c11, c12 fsmfp c13, c14 fsmfp c15, c16 STFPDUX c01, B1, M8 STFPDUX c03, B1, INC2 STFPDUX c05, B1, INC2 STFPDUX c07, B1, INC2 STFPDUX c09, B1, INC2 STFPDUX c11, B1, INC2 STFPDUX c13, B1, INC2 STFPDUX c15, B1, INC2 bdnz .L132 .align 4 .L135: andi. r0, N, 7 ble .L140 andi. r0, N, 4 ble .L136 LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFDUX c09, AO2, INC LFDUX c10, AO2, INC LFDUX c11, AO2, INC LFDUX c12, AO2, INC fsmfp c01, c02 fsmfp c03, c04 fsmfp c09, c10 fsmfp c11, c12 STFPDUX c01, B2, INC2 STFPDUX c03, B2, INC2 STFPDUX c09, B2, INC2 STFPDUX c11, B2, INC2 .align 4 .L136: andi. r0, N, 2 ble .L137 LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c09, AO2, INC LFDUX c10, AO2, INC fsmfp c01, c02 fsmfp c09, c10 STFPDUX c01, B3, INC2 STFPDUX c09, B3, INC2 .align 4 .L137: andi. r0, N, 1 ble .L140 LFDUX c01, AO1, INC LFDUX c02, AO2, INC fsmfp c01, c02 STFPDUX c01, B4, INC2 .align 4 .L140: andi. J, M, 1 addi M8, M8, 8 * SIZE ble .L999 mr AO1, A sub B1, B, M8 srawi. r0, N, 3 mtspr CTR, r0 ble .L145 .align 4 .L142: LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFDUX c05, AO1, INC LFDUX c06, AO1, INC LFDUX c07, AO1, INC LFDUX c08, AO1, INC fsmfp c01, c02 fsmfp c03, c04 fsmfp c05, c06 fsmfp c07, c08 STFPDUX c01, B1, M8 STFPDUX c03, B1, INC2 STFPDUX c05, B1, INC2 STFPDUX c07, B1, INC2 bdnz .L142 .align 4 .L145: andi. r0, N, 7 ble .L999 andi. r0, N, 4 ble .L146 LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC fsmfp c01, c02 fsmfp c03, c04 STFPDUX c01, B2, INC2 STFPDUX c03, B2, INC2 .align 4 .L146: andi. r0, N, 2 ble .L147 LFDUX c01, AO1, INC LFDUX c02, AO1, INC fsmfp c01, c02 STFPDUX c01, B3, INC2 .align 4 .L147: andi. r0, N, 1 ble .L999 LFDX c01, AO1, INC STFDX c01, B4, INC2 .align 4 .L999: addi SP, SP, -4 lwzu r21, 4(SP) lwzu r22, 4(SP) lwzu r23, 4(SP) lwzu r24, 4(SP) lwzu r25, 4(SP) lwzu r26, 4(SP) lwzu r27, 4(SP) lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f31, SP, r0 lfpdux f30, SP, r0 lfpdux f29, SP, r0 lfpdux f28, SP, r0 lfpdux f27, SP, r0 lfpdux f26, SP, r0 lfpdux f25, SP, r0 lfpdux f24, SP, r0 lfpdux f23, SP, r0 lfpdux f22, SP, r0 lfpdux f21, SP, r0 lfpdux f20, SP, r0 lfpdux f19, SP, r0 lfpdux f18, SP, r0 lfpdux f17, SP, r0 lfpdux f16, SP, r0 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/gemv_hummer_n.S000066400000000000000000000766611313527062700205540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M r3 #define N r4 #define A r6 #define LDA r7 #define X r8 #define INCX r9 #define Y r10 #define INCY r5 #define I r11 #define J r12 #define INCY2 r24 #define A1 r25 #define A2 r26 #define A3 r27 #define A4 r28 #define YL r29 #define YS r30 #define INC2 r31 #define yl1 f0 #define yl2 f2 #define yl3 f3 #define yl4 f4 #define ys1 f5 #define ys2 f6 #define ys3 f7 #define ys4 f8 #define yl5 f27 #define ys5 f28 #define alpha1 f9 #define alpha2 f10 #define a1 f11 #define a2 f12 #define a3 f13 #define a4 f14 #define a5 f15 #define a6 f16 #define a7 f17 #define a8 f18 #define a9 f19 #define a10 f20 #define a11 f21 #define a12 f22 #define a13 f23 #define a14 f24 #define a15 f25 #define a16 f26 #define alpha f1 PROLOGUE PROFCODE li r0, -16 lwz INCY, 8(SP) stfpdux f14, SP, r0 stfpdux f15, SP, r0 stfpdux f16, SP, r0 stfpdux f17, SP, r0 stfpdux f18, SP, r0 stfpdux f19, SP, r0 stfpdux f20, SP, r0 stfpdux f21, SP, r0 stfpdux f22, SP, r0 stfpdux f23, SP, r0 stfpdux f24, SP, r0 stfpdux f25, SP, r0 stfpdux f26, SP, r0 stfpdux f27, SP, r0 stfpdux f28, SP, r0 stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) stwu r28, -4(SP) stwu r27, -4(SP) stwu r26, -4(SP) stwu r25, -4(SP) stwu r24, -4(SP) stwu r23, -4(SP) stwu r22, -4(SP) stwu r21, -4(SP) stwu r20, -4(SP) stwu r19, -4(SP) stwu r18, -4(SP) stwu r17, -4(SP) stwu r16, -4(SP) slwi LDA, LDA, BASE_SHIFT slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT fsmfp alpha, alpha cmpwi cr0, M, 0 ble- .L999 cmpwi cr0, N, 0 ble- .L999 add INCY2, INCY, INCY li INC2, 2 * SIZE sub X, X, INCX andi. r0, A, 2 * SIZE - 1 # bne .L100 # All cases for aligned A, even LDA cmpwi cr0, INCY, SIZE bne .L70 andi. r0, Y, 2 * SIZE - 1 bne .L40 # A : aligned LDA : even Y : Unit Aligned sub A, A, INC2 sub Y, Y, INCY2 srawi. J, N, 2 ble .L20 .align 4 .L11: LFDUX alpha1, X, INCX mr A1, A add A2, A, LDA add A3, A2, LDA LFSDUX alpha1, X, INCX LFDUX alpha2, X, INCX add A4, A3, LDA add A, A4, LDA mr YL, Y LFSDUX alpha2, X, INCX fpmul alpha1, alpha, alpha1 mr YS, Y srawi. 
r0, M, 3 mtspr CTR, r0 fpmul alpha2, alpha, alpha2 ble .L15 LFPDUX yl1, YL, INCY2 LFPDUX yl2, YL, INCY2 LFPDUX yl3, YL, INCY2 LFPDUX yl4, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX a5, A1, INC2 LFPDUX a9, A1, INC2 LFPDUX a13, A1, INC2 LFPDUX a2, A2, INC2 LFPDUX a6, A2, INC2 LFPDUX a10, A2, INC2 LFPDUX a14, A2, INC2 LFPDUX a3, A3, INC2 LFPDUX a7, A3, INC2 LFPDUX a11, A3, INC2 LFPDUX a15, A3, INC2 LFPDUX a4, A4, INC2 fxcpmadd ys1, alpha1, a1, yl1 LFPDUX a8, A4, INC2 fxcpmadd ys2, alpha1, a5, yl2 LFPDUX a12, A4, INC2 fxcpmadd ys3, alpha1, a9, yl3 LFPDUX a16, A4, INC2 fxcpmadd ys4, alpha1, a13, yl4 bdz .L13 .align 4 .L12: LFPDUX yl1, YL, INCY2 fxcsmadd ys1, alpha1, a2, ys1 LFPDUX a1, A1, INC2 fxcsmadd ys2, alpha1, a6, ys2 LFPDUX a5, A1, INC2 fxcsmadd ys3, alpha1, a10, ys3 LFPDUX a9, A1, INC2 fxcsmadd ys4, alpha1, a14, ys4 LFPDUX a13, A1, INC2 LFPDUX yl2, YL, INCY2 fxcpmadd ys1, alpha2, a3, ys1 LFPDUX a2, A2, INC2 fxcpmadd ys2, alpha2, a7, ys2 LFPDUX a6, A2, INC2 fxcpmadd ys3, alpha2, a11, ys3 LFPDUX a10, A2, INC2 fxcpmadd ys4, alpha2, a15, ys4 LFPDUX a14, A2, INC2 LFPDUX yl3, YL, INCY2 fxcsmadd ys1, alpha2, a4, ys1 LFPDUX a3, A3, INC2 fxcsmadd ys2, alpha2, a8, ys2 LFPDUX a7, A3, INC2 fxcsmadd ys3, alpha2, a12, ys3 LFPDUX a11, A3, INC2 fxcsmadd ys4, alpha2, a16, ys4 LFPDUX a15, A3, INC2 LFPDUX yl4, YL, INCY2 STFPDUX ys1, YS, INCY2 STFPDUX ys2, YS, INCY2 STFPDUX ys3, YS, INCY2 STFPDUX ys4, YS, INCY2 LFPDUX a4, A4, INC2 fxcpmadd ys1, alpha1, a1, yl1 LFPDUX a8, A4, INC2 fxcpmadd ys2, alpha1, a5, yl2 LFPDUX a12, A4, INC2 fxcpmadd ys3, alpha1, a9, yl3 LFPDUX a16, A4, INC2 fxcpmadd ys4, alpha1, a13, yl4 bdnz .L12 .align 4 .L13: fxcsmadd ys1, alpha1, a2, ys1 fxcsmadd ys2, alpha1, a6, ys2 fxcsmadd ys3, alpha1, a10, ys3 fxcsmadd ys4, alpha1, a14, ys4 fxcpmadd ys1, alpha2, a3, ys1 fxcpmadd ys2, alpha2, a7, ys2 fxcpmadd ys3, alpha2, a11, ys3 fxcpmadd ys4, alpha2, a15, ys4 fxcsmadd ys1, alpha2, a4, ys1 fxcsmadd ys2, alpha2, a8, ys2 fxcsmadd ys3, alpha2, a12, ys3 fxcsmadd ys4, alpha2, a16, ys4 STFPDUX ys1, YS, INCY2 STFPDUX ys2, YS, INCY2 STFPDUX ys3, YS, INCY2 STFPDUX ys4, YS, INCY2 .align 4 .L15: andi. r0, M, 7 ble .L19 andi. r0, M, 4 ble .L17 LFPDUX yl1, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX yl2, YL, INCY2 LFPDUX a5, A1, INC2 LFPDUX a2, A2, INC2 LFPDUX a6, A2, INC2 LFPDUX a3, A3, INC2 LFPDUX a7, A3, INC2 LFPDUX a4, A4, INC2 LFPDUX a8, A4, INC2 fxcpmadd ys1, alpha1, a1, yl1 fxcpmadd ys2, alpha1, a5, yl2 fxcsmadd ys1, alpha1, a2, ys1 fxcsmadd ys2, alpha1, a6, ys2 fxcpmadd ys1, alpha2, a3, ys1 fxcpmadd ys2, alpha2, a7, ys2 fxcsmadd ys1, alpha2, a4, ys1 fxcsmadd ys2, alpha2, a8, ys2 STFPDUX ys1, YS, INCY2 STFPDUX ys2, YS, INCY2 .align 4 .L17: andi. r0, M, 2 ble .L18 LFPDUX yl1, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX a2, A2, INC2 LFPDUX a3, A3, INC2 LFPDUX a4, A4, INC2 fxcpmadd ys1, alpha1, a1, yl1 fxcsmadd ys1, alpha1, a2, ys1 fxcpmadd ys1, alpha2, a3, ys1 fxcsmadd ys1, alpha2, a4, ys1 STFPDUX ys1, YS, INCY2 .align 4 .L18: andi. r0, M, 1 ble .L19 LFDUX yl1, YL, INCY2 LFDUX a1, A1, INC2 LFDUX a2, A2, INC2 LFDUX a3, A3, INC2 LFDUX a4, A4, INC2 fxcpmadd ys1, alpha1, a1, yl1 fxcsmadd ys1, alpha1, a2, ys1 fxcpmadd ys1, alpha2, a3, ys1 fxcsmadd ys1, alpha2, a4, ys1 STFDUX ys1, YS, INCY2 .align 4 .L19: addi J, J, -1 cmpi cr0, 0, J, 0 bgt .L11 .align 4 .L20: andi. J, N, 2 ble .L30 LFDUX alpha1, X, INCX mr A1, A add A2, A, LDA add A, A2, LDA LFSDUX alpha1, X, INCX mr YL, Y mr YS, Y fpmul alpha1, alpha, alpha1 srawi. 
r0, M, 3 mtspr CTR, r0 ble .L25 LFPDUX yl1, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX yl2, YL, INCY2 LFPDUX a5, A1, INC2 LFPDUX yl3, YL, INCY2 LFPDUX a9, A1, INC2 LFPDUX yl4, YL, INCY2 LFPDUX a13, A1, INC2 LFPDUX a2, A2, INC2 LFPDUX a6, A2, INC2 LFPDUX a10, A2, INC2 LFPDUX a14, A2, INC2 bdz .L23 .align 4 .L22: fxcpmadd ys1, alpha1, a1, yl1 LFPDUX a1, A1, INC2 LFPDUX yl1, YL, INCY2 fxcpmadd ys2, alpha1, a5, yl2 LFPDUX a5, A1, INC2 LFPDUX yl2, YL, INCY2 fxcpmadd ys3, alpha1, a9, yl3 LFPDUX a9, A1, INC2 LFPDUX yl3, YL, INCY2 fxcpmadd ys4, alpha1, a13, yl4 LFPDUX a13, A1, INC2 LFPDUX yl4, YL, INCY2 fxcsmadd ys1, alpha1, a2, ys1 LFPDUX a2, A2, INC2 fxcsmadd ys2, alpha1, a6, ys2 LFPDUX a6, A2, INC2 fxcsmadd ys3, alpha1, a10, ys3 LFPDUX a10, A2, INC2 fxcsmadd ys4, alpha1, a14, ys4 LFPDUX a14, A2, INC2 STFPDUX ys1, YS, INCY2 STFPDUX ys2, YS, INCY2 STFPDUX ys3, YS, INCY2 STFPDUX ys4, YS, INCY2 bdnz .L22 .align 4 .L23: fxcpmadd ys1, alpha1, a1, yl1 fxcpmadd ys2, alpha1, a5, yl2 fxcpmadd ys3, alpha1, a9, yl3 fxcpmadd ys4, alpha1, a13, yl4 fxcsmadd ys1, alpha1, a2, ys1 fxcsmadd ys2, alpha1, a6, ys2 fxcsmadd ys3, alpha1, a10, ys3 fxcsmadd ys4, alpha1, a14, ys4 STFPDUX ys1, YS, INCY2 STFPDUX ys2, YS, INCY2 STFPDUX ys3, YS, INCY2 STFPDUX ys4, YS, INCY2 .align 4 .L25: andi. r0, M, 7 ble .L30 andi. r0, M, 4 ble .L27 LFPDUX yl1, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX a2, A2, INC2 LFPDUX yl2, YL, INCY2 LFPDUX a5, A1, INC2 LFPDUX a6, A2, INC2 fxcpmadd ys1, alpha1, a1, yl1 fxcsmadd ys1, alpha1, a2, ys1 fxcpmadd ys2, alpha1, a5, yl2 fxcsmadd ys2, alpha1, a6, ys2 STFPDUX ys1, YS, INCY2 STFPDUX ys2, YS, INCY2 .align 4 .L27: andi. r0, M, 2 ble .L28 LFPDUX yl1, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX a2, A2, INC2 fxcpmadd ys1, alpha1, a1, yl1 fxcsmadd ys1, alpha1, a2, ys1 STFPDUX ys1, YS, INCY2 .align 4 .L28: andi. r0, M, 1 ble .L30 LFDUX yl1, YL, INCY2 LFDUX a1, A1, INC2 LFDUX a2, A2, INC2 fxcpmadd ys1, alpha1, a1, yl1 fxcsmadd ys1, alpha1, a2, ys1 STFDUX ys1, YS, INCY2 .align 4 .L30: andi. J, N, 1 ble .L999 LFDUX alpha1, X, INCX mr A1, A mr YL, Y mr YS, Y fmul alpha1, alpha, alpha1 srawi. r0, M, 3 mtspr CTR, r0 ble .L35 LFPDUX yl1, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX yl2, YL, INCY2 LFPDUX a5, A1, INC2 LFPDUX yl3, YL, INCY2 LFPDUX a9, A1, INC2 LFPDUX yl4, YL, INCY2 LFPDUX a13, A1, INC2 bdz .L33 .align 4 .L32: fxcpmadd ys1, alpha1, a1, yl1 LFPDUX yl1, YL, INCY2 LFPDUX a1, A1, INC2 fxcpmadd ys2, alpha1, a5, yl2 LFPDUX yl2, YL, INCY2 LFPDUX a5, A1, INC2 fxcpmadd ys3, alpha1, a9, yl3 LFPDUX yl3, YL, INCY2 LFPDUX a9, A1, INC2 fxcpmadd ys4, alpha1, a13, yl4 LFPDUX yl4, YL, INCY2 LFPDUX a13, A1, INC2 STFPDUX ys1, YS, INCY2 STFPDUX ys2, YS, INCY2 STFPDUX ys3, YS, INCY2 STFPDUX ys4, YS, INCY2 bdnz .L32 .align 4 .L33: fxcpmadd ys1, alpha1, a1, yl1 fxcpmadd ys2, alpha1, a5, yl2 fxcpmadd ys3, alpha1, a9, yl3 fxcpmadd ys4, alpha1, a13, yl4 STFPDUX ys1, YS, INCY2 STFPDUX ys2, YS, INCY2 STFPDUX ys3, YS, INCY2 STFPDUX ys4, YS, INCY2 .align 4 .L35: andi. r0, M, 7 ble .L999 andi. r0, M, 4 ble .L37 LFPDUX yl1, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX yl2, YL, INCY2 LFPDUX a5, A1, INC2 fxcpmadd ys1, alpha1, a1, yl1 fxcpmadd ys2, alpha1, a5, yl2 STFPDUX ys1, YS, INCY2 STFPDUX ys2, YS, INCY2 .align 4 .L37: andi. r0, M, 2 ble .L38 LFPDUX yl1, YL, INCY2 LFPDUX a1, A1, INC2 fxcpmadd ys1, alpha1, a1, yl1 STFPDUX ys1, YS, INCY2 .align 4 .L38: andi. 
r0, M, 1 ble .L999 LFDUX yl1, YL, INCY2 LFDUX a1, A1, INC2 fxcpmadd ys1, alpha1, a1, yl1 STFDUX ys1, YS, INCY2 b .L999 .align 4 .L40: # A : aligned LDA : even Y : Unaligned sub A, A, INC2 sub Y, Y, INCY srawi. J, N, 2 ble .L50 .align 4 .L41: LFDUX alpha1, X, INCX LFSDUX alpha1, X, INCX LFDUX alpha2, X, INCX LFSDUX alpha2, X, INCX fpmul alpha1, alpha, alpha1 fpmul alpha2, alpha, alpha2 mr A1, A add A2, A, LDA add A3, A2, LDA add A4, A3, LDA add A, A4, LDA mr YL, Y sub YS, Y, INCY2 LFSDX ys1, YS, INCY2 LFDX yl1, YL, INCY srawi. r0, M, 3 mtspr CTR, r0 ble .L45 LFPDUX a1, A1, INC2 LFPDUX a5, A1, INC2 LFPDUX a9, A1, INC2 LFPDUX a13, A1, INC2 LFXDUX yl2, YL, INCY2 LFXDUX yl3, YL, INCY2 LFXDUX yl4, YL, INCY2 LFXDUX yl5, YL, INCY2 LFPDUX a2, A2, INC2 LFPDUX a6, A2, INC2 LFPDUX a10, A2, INC2 LFPDUX a14, A2, INC2 LFPDUX a3, A3, INC2 LFPDUX a7, A3, INC2 LFPDUX a11, A3, INC2 LFPDUX a15, A3, INC2 LFPDUX a4, A4, INC2 fsmr yl1, yl2 LFPDUX a8, A4, INC2 fsmr yl2, yl3 LFPDUX a12, A4, INC2 fsmr yl3, yl4 LFPDUX a16, A4, INC2 fsmr yl4, yl5 bdz .L43 .align 4 .L42: fxcpmadd ys2, alpha1, a1, yl1 LFPDUX a1, A1, INC2 fxcpmadd ys3, alpha1, a5, yl2 LFPDUX a5, A1, INC2 fxcpmadd ys4, alpha1, a9, yl3 LFPDUX a9, A1, INC2 fxcpmadd ys5, alpha1, a13, yl4 LFPDUX a13, A1, INC2 fxcsmadd ys2, alpha1, a2, ys2 LFPDUX a2, A2, INC2 fxcsmadd ys3, alpha1, a6, ys3 LFPDUX a6, A2, INC2 fxcsmadd ys4, alpha1, a10, ys4 LFPDUX a10, A2, INC2 fxcsmadd ys5, alpha1, a14, ys5 LFPDUX a14, A2, INC2 fxcpmadd ys2, alpha2, a3, ys2 LFPDUX a3, A3, INC2 fxcpmadd ys3, alpha2, a7, ys3 LFPDUX a7, A3, INC2 fxcpmadd ys4, alpha2, a11, ys4 LFPDUX a11, A3, INC2 fxcpmadd ys5, alpha2, a15, ys5 LFPDUX a15, A3, INC2 fxcsmadd ys2, alpha2, a4, ys2 LFPDUX a4, A4, INC2 fxcsmadd ys3, alpha2, a8, ys3 LFPDUX a8, A4, INC2 fxcsmadd ys4, alpha2, a12, ys4 LFPDUX a12, A4, INC2 fxcsmadd ys5, alpha2, a16, ys5 LFPDUX a16, A4, INC2 fmr yl1, yl5 LFXDUX yl2, YL, INCY2 fmr ys1, ys2 LFXDUX yl3, YL, INCY2 fmr ys2, ys3 LFXDUX yl4, YL, INCY2 fmr ys3, ys4 LFXDUX yl5, YL, INCY2 fmr ys4, ys5 STFXDUX ys1, YS, INCY2 fsmr ys1, ys5 STFXDUX ys2, YS, INCY2 fsmr yl1, yl2 STFXDUX ys3, YS, INCY2 fsmr yl2, yl3 STFXDUX ys4, YS, INCY2 fsmr yl3, yl4 fsmr yl4, yl5 bdnz .L42 .align 4 .L43: fxcpmadd ys2, alpha1, a1, yl1 fxcpmadd ys3, alpha1, a5, yl2 fxcpmadd ys4, alpha1, a9, yl3 fxcpmadd ys5, alpha1, a13, yl4 fxcsmadd ys2, alpha1, a2, ys2 fxcsmadd ys3, alpha1, a6, ys3 fxcsmadd ys4, alpha1, a10, ys4 fxcsmadd ys5, alpha1, a14, ys5 fxcpmadd ys2, alpha2, a3, ys2 fxcpmadd ys3, alpha2, a7, ys3 fxcpmadd ys4, alpha2, a11, ys4 fxcpmadd ys5, alpha2, a15, ys5 fxcsmadd ys2, alpha2, a4, ys2 fxcsmadd ys3, alpha2, a8, ys3 fxcsmadd ys4, alpha2, a12, ys4 fxcsmadd ys5, alpha2, a16, ys5 fmr ys1, ys2 fmr ys2, ys3 fmr ys3, ys4 fmr ys4, ys5 fmr yl1, yl5 STFXDUX ys1, YS, INCY2 fsmr ys1, ys5 STFXDUX ys2, YS, INCY2 STFXDUX ys3, YS, INCY2 STFXDUX ys4, YS, INCY2 .align 4 .L45: andi. r0, M, 7 ble .L48 andi. r0, M, 4 ble .L46 LFXDUX yl2, YL, INCY2 LFXDUX yl3, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX a5, A1, INC2 LFPDUX a2, A2, INC2 LFPDUX a6, A2, INC2 LFPDUX a3, A3, INC2 LFPDUX a7, A3, INC2 LFPDUX a4, A4, INC2 fsmr yl1, yl2 LFPDUX a8, A4, INC2 fsmr yl2, yl3 fxcpmadd ys2, alpha1, a1, yl1 fxcpmadd ys3, alpha1, a5, yl2 fxcsmadd ys2, alpha1, a2, ys2 fxcsmadd ys3, alpha1, a6, ys3 fxcpmadd ys2, alpha2, a3, ys2 fxcpmadd ys3, alpha2, a7, ys3 fxcsmadd ys2, alpha2, a4, ys2 fxcsmadd ys3, alpha2, a8, ys3 fmr yl1, yl3 fmr ys1, ys2 fmr ys2, ys3 STFXDUX ys1, YS, INCY2 fsmr ys1, ys3 STFXDUX ys2, YS, INCY2 .align 4 .L46: andi. 
r0, M, 2 ble .L47 LFXDUX yl2, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX a2, A2, INC2 LFPDUX a3, A3, INC2 LFPDUX a4, A4, INC2 fsmr yl1, yl2 fxcpmadd ys2, alpha1, a1, yl1 fxcsmadd ys2, alpha1, a2, ys2 fxcpmadd ys2, alpha2, a3, ys2 fxcsmadd ys2, alpha2, a4, ys2 fmr yl1, yl2 fmr ys1, ys2 STFXDUX ys1, YS, INCY2 fsmr ys1, ys2 .align 4 .L47: andi. r0, M, 1 ble .L48 LFDUX a1, A1, INC2 LFDUX a2, A2, INC2 LFDUX a3, A3, INC2 LFDUX a4, A4, INC2 fxcpmadd ys2, alpha1, a1, yl1 fxcsmadd ys2, alpha1, a2, ys2 fxcpmadd ys2, alpha2, a3, ys2 fxcsmadd ys2, alpha2, a4, ys2 STFSDX ys1, YS, INCY2 add YS, YS, INCY STFDX ys2, YS, INCY2 b .L49 .align 4 .L48: STFSDUX ys1, YS, INCY2 .align 4 .L49: addi J, J, -1 cmpi cr0, 0, J, 0 bgt .L41 .align 4 .L50: andi. J, N, 2 ble .L60 LFDUX alpha1, X, INCX mr A1, A add A2, A, LDA add A, A2, LDA LFSDUX alpha1, X, INCX mr YL, Y sub YS, Y, INCY2 fpmul alpha1, alpha, alpha1 LFSDX ys1, YS, INCY2 LFDX yl1, YL, INCY srawi. r0, M, 3 mtspr CTR, r0 ble .L55 LFPDUX a1, A1, INC2 LFPDUX a5, A1, INC2 LFPDUX a9, A1, INC2 LFPDUX a13, A1, INC2 LFXDUX yl2, YL, INCY2 LFXDUX yl3, YL, INCY2 LFXDUX yl4, YL, INCY2 LFXDUX yl5, YL, INCY2 LFPDUX a2, A2, INC2 fsmr yl1, yl2 LFPDUX a6, A2, INC2 fsmr yl2, yl3 LFPDUX a10, A2, INC2 fsmr yl3, yl4 LFPDUX a14, A2, INC2 fsmr yl4, yl5 bdz .L53 .align 4 .L52: fxcpmadd ys2, alpha1, a1, yl1 LFPDUX a1, A1, INC2 fxcpmadd ys3, alpha1, a5, yl2 LFPDUX a5, A1, INC2 fxcpmadd ys4, alpha1, a9, yl3 LFPDUX a9, A1, INC2 fxcpmadd ys5, alpha1, a13, yl4 LFPDUX a13, A1, INC2 fxcsmadd ys2, alpha1, a2, ys2 LFPDUX a2, A2, INC2 fxcsmadd ys3, alpha1, a6, ys3 LFPDUX a6, A2, INC2 fxcsmadd ys4, alpha1, a10, ys4 LFPDUX a10, A2, INC2 fxcsmadd ys5, alpha1, a14, ys5 LFPDUX a14, A2, INC2 fmr yl1, yl5 LFXDUX yl2, YL, INCY2 fmr ys1, ys2 LFXDUX yl3, YL, INCY2 fmr ys2, ys3 LFXDUX yl4, YL, INCY2 fmr ys3, ys4 LFXDUX yl5, YL, INCY2 fmr ys4, ys5 STFXDUX ys1, YS, INCY2 fsmr ys1, ys5 STFXDUX ys2, YS, INCY2 fsmr yl1, yl2 STFXDUX ys3, YS, INCY2 fsmr yl2, yl3 STFXDUX ys4, YS, INCY2 fsmr yl3, yl4 fsmr yl4, yl5 bdnz .L52 .align 4 .L53: fxcpmadd ys2, alpha1, a1, yl1 fxcpmadd ys3, alpha1, a5, yl2 fxcpmadd ys4, alpha1, a9, yl3 fxcpmadd ys5, alpha1, a13, yl4 fxcsmadd ys2, alpha1, a2, ys2 fxcsmadd ys3, alpha1, a6, ys3 fxcsmadd ys4, alpha1, a10, ys4 fxcsmadd ys5, alpha1, a14, ys5 fmr yl1, yl5 fmr ys1, ys2 fmr ys2, ys3 fmr ys3, ys4 fmr ys4, ys5 STFXDUX ys1, YS, INCY2 fsmr ys1, ys5 STFXDUX ys2, YS, INCY2 STFXDUX ys3, YS, INCY2 STFXDUX ys4, YS, INCY2 .align 4 .L55: andi. r0, M, 7 ble .L59 andi. r0, M, 4 ble .L57 LFXDUX yl2, YL, INCY2 LFXDUX yl3, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX a2, A2, INC2 LFPDUX a5, A1, INC2 LFPDUX a6, A2, INC2 fsmr yl1, yl2 fsmr yl2, yl3 fxcpmadd ys2, alpha1, a1, yl1 fxcsmadd ys2, alpha1, a2, ys2 fxcpmadd ys3, alpha1, a5, yl2 fxcsmadd ys3, alpha1, a6, ys3 fmr yl1, yl3 fmr ys1, ys2 fmr ys2, ys3 STFXDUX ys1, YS, INCY2 STFXDUX ys2, YS, INCY2 fsmr ys1, ys3 .align 4 .L57: andi. r0, M, 2 ble .L58 LFXDUX yl2, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX a2, A2, INC2 fsmr yl1, yl2 fxcpmadd ys2, alpha1, a1, yl1 fxcsmadd ys2, alpha1, a2, ys2 fmr yl1, yl2 fmr ys1, ys2 STFXDUX ys1, YS, INCY2 fsmr ys1, ys2 .align 4 .L58: andi. r0, M, 1 ble .L59 LFDUX a1, A1, INC2 LFDUX a2, A2, INC2 fxmr alpha2, alpha1 fmadd ys1, alpha1, a1, yl1 fmadd ys1, alpha2, a2, ys1 STFXDUX ys1, YS, INCY2 b .L60 .align 4 .L59: STFSDUX ys1, YS, INCY2 .align 4 .L60: andi. J, N, 1 ble .L999 LFDUX alpha1, X, INCX mr A1, A mr YL, Y sub YS, Y, INCY2 fmul alpha1, alpha, alpha1 LFSDX ys1, YS, INCY2 LFDX yl1, YL, INCY srawi. 
r0, M, 3 mtspr CTR, r0 ble .L65 LFXDUX yl2, YL, INCY2 LFXDUX yl3, YL, INCY2 LFXDUX yl4, YL, INCY2 LFXDUX yl5, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX a5, A1, INC2 LFPDUX a9, A1, INC2 LFPDUX a13, A1, INC2 fsmr yl1, yl2 fsmr yl2, yl3 fsmr yl3, yl4 fsmr yl4, yl5 bdz .L63 .align 4 .L62: fxcpmadd ys2, alpha1, a1, yl1 LFPDUX a1, A1, INC2 fxcpmadd ys3, alpha1, a5, yl2 LFXDUX yl2, YL, INCY2 fxcpmadd ys4, alpha1, a9, yl3 LFXDUX yl3, YL, INCY2 fxcpmadd ys5, alpha1, a13, yl4 LFXDUX yl4, YL, INCY2 fmr yl1, yl5 LFXDUX yl5, YL, INCY2 fmr ys1, ys2 LFPDUX a5, A1, INC2 fmr ys2, ys3 LFPDUX a9, A1, INC2 fmr ys3, ys4 LFPDUX a13, A1, INC2 fmr ys4, ys5 STFXDUX ys1, YS, INCY2 fsmr ys1, ys5 STFXDUX ys2, YS, INCY2 fsmr yl1, yl2 STFXDUX ys3, YS, INCY2 fsmr yl2, yl3 STFXDUX ys4, YS, INCY2 fsmr yl3, yl4 fsmr yl4, yl5 bdnz .L62 .align 4 .L63: fxcpmadd ys2, alpha1, a1, yl1 fxcpmadd ys3, alpha1, a5, yl2 fxcpmadd ys4, alpha1, a9, yl3 fxcpmadd ys5, alpha1, a13, yl4 fmr yl1, yl5 fmr ys1, ys2 fmr ys2, ys3 fmr ys3, ys4 fmr ys4, ys5 STFXDUX ys1, YS, INCY2 fsmr ys1, ys5 STFXDUX ys2, YS, INCY2 STFXDUX ys3, YS, INCY2 STFXDUX ys4, YS, INCY2 .align 4 .L65: andi. r0, M, 7 ble .L69 andi. r0, M, 4 ble .L67 LFXDUX yl2, YL, INCY2 LFXDUX yl3, YL, INCY2 LFPDUX a1, A1, INC2 LFPDUX a5, A1, INC2 fsmr yl1, yl2 fsmr yl2, yl3 fxcpmadd ys2, alpha1, a1, yl1 fxcpmadd ys3, alpha1, a5, yl2 fmr yl1, yl3 fmr ys1, ys2 fmr ys2, ys3 STFXDUX ys1, YS, INCY2 fsmr ys1, ys3 STFXDUX ys2, YS, INCY2 .align 4 .L67: andi. r0, M, 2 ble .L68 LFPDUX a1, A1, INC2 LFXDUX yl2, YL, INCY2 fsmr yl1, yl2 fxcpmadd ys2, alpha1, a1, yl1 fmr yl1, yl2 fmr ys1, ys2 STFXDUX ys1, YS, INCY2 fsmr ys1, ys2 .align 4 .L68: andi. r0, M, 1 ble .L69 LFDUX a1, A1, INC2 fmadd ys1, alpha1, a1, yl1 STFXDUX ys1, YS, INCY2 b .L999 .align 4 .L69: STFSDUX ys1, YS, INCY2 b .L999 .align 4 .L70: sub A, A, INC2 sub Y, Y, INCY srawi. J, N, 2 ble .L80 .align 4 .L71: LFDUX alpha1, X, INCX mr A1, A add A2, A, LDA add A3, A2, LDA LFSDUX alpha1, X, INCX LFDUX alpha2, X, INCX add A4, A3, LDA add A, A4, LDA mr YL, Y LFSDUX alpha2, X, INCX fpmul alpha1, alpha, alpha1 mr YS, Y srawi. 
r0, M, 3 mtspr CTR, r0 fpmul alpha2, alpha, alpha2 ble .L75 LFDUX yl1, YL, INCY LFPDUX a1, A1, INC2 LFPDUX a5, A1, INC2 LFPDUX a9, A1, INC2 LFPDUX a13, A1, INC2 LFSDUX yl1, YL, INCY LFDUX yl2, YL, INCY LFPDUX a2, A2, INC2 LFPDUX a6, A2, INC2 LFPDUX a10, A2, INC2 LFPDUX a14, A2, INC2 LFSDUX yl2, YL, INCY LFDUX yl3, YL, INCY LFPDUX a3, A3, INC2 LFPDUX a7, A3, INC2 LFPDUX a11, A3, INC2 LFPDUX a15, A3, INC2 LFSDUX yl3, YL, INCY LFDUX yl4, YL, INCY LFPDUX a4, A4, INC2 LFPDUX a8, A4, INC2 LFPDUX a12, A4, INC2 LFPDUX a16, A4, INC2 LFSDUX yl4, YL, INCY bdz .L73 .align 4 .L72: fxcpmadd ys1, alpha1, a1, yl1 LFPDUX a1, A1, INC2 LFDUX yl1, YL, INCY fxcpmadd ys2, alpha1, a5, yl2 LFPDUX a5, A1, INC2 fxcpmadd ys3, alpha1, a9, yl3 LFPDUX a9, A1, INC2 fxcpmadd ys4, alpha1, a13, yl4 LFPDUX a13, A1, INC2 LFSDUX yl1, YL, INCY fxcsmadd ys1, alpha1, a2, ys1 LFPDUX a2, A2, INC2 LFDUX yl2, YL, INCY fxcsmadd ys2, alpha1, a6, ys2 LFPDUX a6, A2, INC2 fxcsmadd ys3, alpha1, a10, ys3 LFPDUX a10, A2, INC2 fxcsmadd ys4, alpha1, a14, ys4 LFPDUX a14, A2, INC2 LFSDUX yl2, YL, INCY fxcpmadd ys1, alpha2, a3, ys1 LFPDUX a3, A3, INC2 LFDUX yl3, YL, INCY fxcpmadd ys2, alpha2, a7, ys2 LFPDUX a7, A3, INC2 fxcpmadd ys3, alpha2, a11, ys3 LFPDUX a11, A3, INC2 fxcpmadd ys4, alpha2, a15, ys4 LFPDUX a15, A3, INC2 LFSDUX yl3, YL, INCY fxcsmadd ys1, alpha2, a4, ys1 LFPDUX a4, A4, INC2 LFDUX yl4, YL, INCY fxcsmadd ys2, alpha2, a8, ys2 LFPDUX a8, A4, INC2 fxcsmadd ys3, alpha2, a12, ys3 LFPDUX a12, A4, INC2 fxcsmadd ys4, alpha2, a16, ys4 LFPDUX a16, A4, INC2 LFSDUX yl4, YL, INCY STFDUX ys1, YS, INCY STFSDUX ys1, YS, INCY STFDUX ys2, YS, INCY STFSDUX ys2, YS, INCY STFDUX ys3, YS, INCY STFSDUX ys3, YS, INCY STFDUX ys4, YS, INCY STFSDUX ys4, YS, INCY bdnz .L72 .align 4 .L73: fxcpmadd ys1, alpha1, a1, yl1 fxcpmadd ys2, alpha1, a5, yl2 fxcpmadd ys3, alpha1, a9, yl3 fxcpmadd ys4, alpha1, a13, yl4 fxcsmadd ys1, alpha1, a2, ys1 fxcsmadd ys2, alpha1, a6, ys2 fxcsmadd ys3, alpha1, a10, ys3 fxcsmadd ys4, alpha1, a14, ys4 fxcpmadd ys1, alpha2, a3, ys1 fxcpmadd ys2, alpha2, a7, ys2 fxcpmadd ys3, alpha2, a11, ys3 fxcpmadd ys4, alpha2, a15, ys4 fxcsmadd ys1, alpha2, a4, ys1 fxcsmadd ys2, alpha2, a8, ys2 fxcsmadd ys3, alpha2, a12, ys3 fxcsmadd ys4, alpha2, a16, ys4 STFDUX ys1, YS, INCY STFSDUX ys1, YS, INCY STFDUX ys2, YS, INCY STFSDUX ys2, YS, INCY STFDUX ys3, YS, INCY STFSDUX ys3, YS, INCY STFDUX ys4, YS, INCY STFSDUX ys4, YS, INCY .align 4 .L75: andi. r0, M, 7 ble .L79 andi. r0, M, 4 ble .L77 LFDUX yl1, YL, INCY LFPDUX a1, A1, INC2 LFPDUX a5, A1, INC2 LFSDUX yl1, YL, INCY LFPDUX a2, A2, INC2 LFPDUX a6, A2, INC2 LFDUX yl2, YL, INCY LFPDUX a3, A3, INC2 LFPDUX a7, A3, INC2 LFSDUX yl2, YL, INCY LFPDUX a4, A4, INC2 LFPDUX a8, A4, INC2 fxcpmadd ys1, alpha1, a1, yl1 fxcpmadd ys2, alpha1, a5, yl2 fxcsmadd ys1, alpha1, a2, ys1 fxcsmadd ys2, alpha1, a6, ys2 fxcpmadd ys1, alpha2, a3, ys1 fxcpmadd ys2, alpha2, a7, ys2 fxcsmadd ys1, alpha2, a4, ys1 fxcsmadd ys2, alpha2, a8, ys2 STFDUX ys1, YS, INCY STFSDUX ys1, YS, INCY STFDUX ys2, YS, INCY STFSDUX ys2, YS, INCY .align 4 .L77: andi. r0, M, 2 ble .L78 LFDUX yl1, YL, INCY LFPDUX a1, A1, INC2 LFPDUX a2, A2, INC2 LFSDUX yl1, YL, INCY LFPDUX a3, A3, INC2 LFPDUX a4, A4, INC2 fxcpmadd ys1, alpha1, a1, yl1 fxcsmadd ys1, alpha1, a2, ys1 fxcpmadd ys1, alpha2, a3, ys1 fxcsmadd ys1, alpha2, a4, ys1 STFDUX ys1, YS, INCY STFSDUX ys1, YS, INCY .align 4 .L78: andi. 
r0, M, 1 ble .L79 LFDUX yl1, YL, INCY LFDUX a1, A1, INC2 LFDUX a2, A2, INC2 LFDUX a3, A3, INC2 LFDUX a4, A4, INC2 fxcpmadd ys1, alpha1, a1, yl1 fxcsmadd ys1, alpha1, a2, ys1 fxcpmadd ys1, alpha2, a3, ys1 fxcsmadd ys1, alpha2, a4, ys1 STFDUX ys1, YS, INCY .align 4 .L79: addi J, J, -1 cmpi cr0, 0, J, 0 bgt .L71 .align 4 .L80: andi. J, N, 2 ble .L90 LFDUX alpha1, X, INCX mr A1, A add A2, A, LDA add A, A2, LDA LFSDUX alpha1, X, INCX mr YL, Y mr YS, Y fpmul alpha1, alpha, alpha1 srawi. r0, M, 3 mtspr CTR, r0 ble .L85 LFDUX yl1, YL, INCY LFDUX a9, YL, INCY LFDUX yl2, YL, INCY LFDUX a10, YL, INCY LFPDUX a1, A1, INC2 LFPDUX a5, A1, INC2 LFPDUX a3, A1, INC2 LFPDUX a7, A1, INC2 LFDUX yl3, YL, INCY LFDUX a11, YL, INCY LFDUX yl4, YL, INCY LFDUX a12, YL, INCY LFPDUX a2, A2, INC2 LFPDUX a6, A2, INC2 LFPDUX a4, A2, INC2 LFPDUX a8, A2, INC2 bdz .L83 .align 4 .L82: fsmfp yl1, a9 fsmfp yl2, a10 fsmfp yl3, a11 fsmfp yl4, a12 fxcpmadd ys1, alpha1, a1, yl1 LFDUX yl1, YL, INCY LFDUX a9, YL, INCY LFPDUX a1, A1, INC2 fxcpmadd ys2, alpha1, a5, yl2 LFDUX yl2, YL, INCY LFDUX a10, YL, INCY LFPDUX a5, A1, INC2 fxcpmadd ys3, alpha1, a3, yl3 LFDUX yl3, YL, INCY LFDUX a11, YL, INCY LFPDUX a3, A1, INC2 fxcpmadd ys4, alpha1, a7, yl4 LFDUX yl4, YL, INCY LFDUX a12, YL, INCY LFPDUX a7, A1, INC2 fxcsmadd ys1, alpha1, a2, ys1 LFPDUX a2, A2, INC2 fxcsmadd ys2, alpha1, a6, ys2 LFPDUX a6, A2, INC2 fxcsmadd ys3, alpha1, a4, ys3 LFPDUX a4, A2, INC2 fxcsmadd ys4, alpha1, a8, ys4 LFPDUX a8, A2, INC2 STFDUX ys1, YS, INCY STFSDUX ys1, YS, INCY STFDUX ys2, YS, INCY STFSDUX ys2, YS, INCY STFDUX ys3, YS, INCY STFSDUX ys3, YS, INCY STFDUX ys4, YS, INCY STFSDUX ys4, YS, INCY bdnz .L82 .align 4 .L83: fsmfp yl1, a9 fsmfp yl2, a10 fsmfp yl3, a11 fsmfp yl4, a12 fxcpmadd ys1, alpha1, a1, yl1 fxcpmadd ys2, alpha1, a5, yl2 fxcpmadd ys3, alpha1, a3, yl3 fxcpmadd ys4, alpha1, a7, yl4 fxcsmadd ys1, alpha1, a2, ys1 fxcsmadd ys2, alpha1, a6, ys2 fxcsmadd ys3, alpha1, a4, ys3 fxcsmadd ys4, alpha1, a8, ys4 STFDUX ys1, YS, INCY STFSDUX ys1, YS, INCY STFDUX ys2, YS, INCY STFSDUX ys2, YS, INCY STFDUX ys3, YS, INCY STFSDUX ys3, YS, INCY STFDUX ys4, YS, INCY STFSDUX ys4, YS, INCY .align 4 .L85: andi. r0, M, 7 ble .L90 andi. r0, M, 4 ble .L87 LFDUX yl1, YL, INCY LFPDUX a1, A1, INC2 LFPDUX a2, A2, INC2 LFSDUX yl1, YL, INCY LFDUX yl2, YL, INCY LFPDUX a5, A1, INC2 LFPDUX a6, A2, INC2 LFSDUX yl2, YL, INCY fxcpmadd ys1, alpha1, a1, yl1 fxcpmadd ys2, alpha1, a5, yl2 fxcsmadd ys1, alpha1, a2, ys1 fxcsmadd ys2, alpha1, a6, ys2 STFDUX ys1, YS, INCY STFSDUX ys1, YS, INCY STFDUX ys2, YS, INCY STFSDUX ys2, YS, INCY .align 4 .L87: andi. r0, M, 2 ble .L88 LFDUX yl1, YL, INCY LFPDUX a1, A1, INC2 LFPDUX a2, A2, INC2 LFSDUX yl1, YL, INCY fxcpmadd ys1, alpha1, a1, yl1 fxcsmadd ys1, alpha1, a2, ys1 STFDUX ys1, YS, INCY STFSDUX ys1, YS, INCY .align 4 .L88: andi. r0, M, 1 ble .L90 LFDUX yl1, YL, INCY LFDUX a1, A1, INC2 LFDUX a2, A2, INC2 fxcpmadd ys1, alpha1, a1, yl1 fxcsmadd ys1, alpha1, a2, ys1 STFDUX ys1, YS, INCY .align 4 .L90: andi. J, N, 1 ble .L999 LFDUX alpha1, X, INCX mr A1, A mr YL, Y mr YS, Y fmul alpha1, alpha, alpha1 srawi. 
r0, M, 3 mtspr CTR, r0 ble .L95 LFDUX yl1, YL, INCY LFSDUX a2, YL, INCY LFDUX yl2, YL, INCY LFSDUX a4, YL, INCY LFDUX yl3, YL, INCY LFSDUX a6, YL, INCY LFDUX yl4, YL, INCY LFSDUX a8, YL, INCY LFPDUX a1, A1, INC2 LFPDUX a5, A1, INC2 LFPDUX a9, A1, INC2 LFPDUX a13, A1, INC2 bdz .L93 .align 4 .L92: fmr a2, yl1 fmr a4, yl2 fmr a6, yl3 fmr a8, yl4 fxcpmadd ys1, alpha1, a1, a2 LFDUX yl1, YL, INCY LFSDUX a2, YL, INCY fxcpmadd ys2, alpha1, a5, a4 LFDUX yl2, YL, INCY LFSDUX a4, YL, INCY fxcpmadd ys3, alpha1, a9, a6 LFDUX yl3, YL, INCY LFSDUX a6, YL, INCY fxcpmadd ys4, alpha1, a13, a8 LFDUX yl4, YL, INCY LFSDUX a8, YL, INCY LFPDUX a1, A1, INC2 LFPDUX a5, A1, INC2 LFPDUX a9, A1, INC2 LFPDUX a13, A1, INC2 STFDUX ys1, YS, INCY STFSDUX ys1, YS, INCY STFDUX ys2, YS, INCY STFSDUX ys2, YS, INCY STFDUX ys3, YS, INCY STFSDUX ys3, YS, INCY STFDUX ys4, YS, INCY STFSDUX ys4, YS, INCY bdnz .L92 .align 4 .L93: fmr a2, yl1 fmr a4, yl2 fmr a6, yl3 fmr a8, yl4 fxcpmadd ys1, alpha1, a1, a2 fxcpmadd ys2, alpha1, a5, a4 fxcpmadd ys3, alpha1, a9, a6 fxcpmadd ys4, alpha1, a13, a8 STFDUX ys1, YS, INCY STFSDUX ys1, YS, INCY STFDUX ys2, YS, INCY STFSDUX ys2, YS, INCY STFDUX ys3, YS, INCY STFSDUX ys3, YS, INCY STFDUX ys4, YS, INCY STFSDUX ys4, YS, INCY .align 4 .L95: andi. r0, M, 7 ble .L999 andi. r0, M, 4 ble .L97 LFPDUX a1, A1, INC2 LFDUX yl1, YL, INCY LFDUX yl2, YL, INCY LFPDUX a2, A1, INC2 LFDUX yl3, YL, INCY LFDUX yl4, YL, INCY fxcpmadd ys1, a1, alpha1, yl1 fxcsmadd ys2, a1, alpha1, yl2 fxcpmadd ys3, a2, alpha1, yl3 fxcsmadd ys4, a2, alpha1, yl4 STFDUX ys1, YS, INCY STFDUX ys2, YS, INCY STFDUX ys3, YS, INCY STFDUX ys4, YS, INCY .align 4 .L97: andi. r0, M, 2 ble .L98 LFPDUX a1, A1, INC2 LFDUX yl1, YL, INCY LFDUX yl2, YL, INCY fxcpmadd ys1, a1, alpha1, yl1 fxcsmadd ys2, a1, alpha1, yl2 STFDUX ys1, YS, INCY STFDUX ys2, YS, INCY .align 4 .L98: andi. r0, M, 1 ble .L999 LFDUX yl1, YL, INCY LFDUX a1, A1, INC2 fxcpmadd ys1, alpha1, a1, yl1 STFDUX ys1, YS, INCY b .L999 .align 4 .L999: addi SP, SP, -4 lwzu r16, 4(SP) lwzu r17, 4(SP) lwzu r18, 4(SP) lwzu r19, 4(SP) lwzu r20, 4(SP) lwzu r21, 4(SP) lwzu r22, 4(SP) lwzu r23, 4(SP) lwzu r24, 4(SP) lwzu r25, 4(SP) lwzu r26, 4(SP) lwzu r27, 4(SP) lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f31, SP, r0 lfpdux f30, SP, r0 lfpdux f29, SP, r0 lfpdux f28, SP, r0 lfpdux f27, SP, r0 lfpdux f26, SP, r0 lfpdux f25, SP, r0 lfpdux f24, SP, r0 lfpdux f23, SP, r0 lfpdux f22, SP, r0 lfpdux f21, SP, r0 lfpdux f20, SP, r0 lfpdux f19, SP, r0 lfpdux f18, SP, r0 lfpdux f17, SP, r0 lfpdux f16, SP, r0 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/gemv_n.S000066400000000000000000001732761313527062700171770ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 #define A r6 #define LDA r7 #define X r8 #define INCX r9 #define Y r10 #define INCY r5 #else #define M r3 #define N r4 #define A r7 #define LDA r8 #define X r9 #define INCX r10 #define Y r5 #define INCY r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define M r3 #define N r4 #define A r8 #define LDA r9 #define X r10 #define INCX r5 #define Y r6 #define INCY r7 #else #define M r3 #define N r4 #define A r7 #define LDA r8 #define X r9 #define INCX r10 #define Y r5 #define INCY r6 #endif #endif #define I r11 #define J r12 #define AO1 r14 #define AO2 r15 #define AO3 r16 #define AO4 r17 #define AO5 r18 #define AO6 r19 #define AO7 r20 #define AO8 r21 #define LDA8 r22 #define Y1 r23 #define PREA r24 #define PREC r25 #define YY r26 #define BUFFER r27 #define y01 f0 #define y02 f1 #define y03 f2 #define y04 f3 #define y05 f4 #define y06 f5 #define y07 f6 #define y08 f7 #define y09 f8 #define y10 f9 #define y11 f10 #define y12 f11 #define y13 f12 #define y14 f13 #define y15 f14 #define y16 f15 #define alpha1 f16 #define alpha2 f17 #define alpha3 f18 #define alpha4 f19 #define alpha5 f20 #define alpha6 f21 #define alpha7 f22 #define alpha8 f23 #define a1 f24 #define a2 f25 #define a3 f26 #define a4 f27 #define a5 f28 #define a6 f29 #define a7 f30 #define a8 f31 #define alpha f31 #if defined(PPCG4) #define PREFETCHSIZE_A 24 #define PREFETCHSIZE_C 16 #endif #if defined(PPC440) || defined(PPC440FP2) #define PREFETCHSIZE_A 24 #define PREFETCHSIZE_C 16 #endif #ifdef PPC970 #define PREFETCHSIZE_A 16 #define PREFETCHSIZE_C 16 #endif #ifdef CELL #define PREFETCHSIZE_A 16 #define PREFETCHSIZE_C 16 #endif #ifdef POWER4 #define PREFETCHSIZE_A 16 #define PREFETCHSIZE_C 16 #endif #ifdef POWER5 #define PREFETCHSIZE_A 40 #define PREFETCHSIZE_C 24 #endif #ifdef POWER6 #define PREFETCHSIZE_A 96 #define PREFETCHSIZE_C 40 #endif #ifdef POWER8 #define PREFETCHSIZE_A 96 #define PREFETCHSIZE_C 40 #endif #ifndef NEEDPARAM #ifndef __64BIT__ #define STACKSIZE 224 #define ALPHA 200(SP) #define FZERO 208(SP) #else #define STACKSIZE 280 #define ALPHA 256(SP) #define FZERO 264(SP) #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) 
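/*
   This file implements the non-transposed GEMV update y := alpha * A * x + y
   for column-major A.  The main path unrolls over 8 columns of A per pass
   (alpha1..alpha8 hold alpha * x[j..j+7], AO1..AO8 walk the columns) and 16
   elements of y per inner iteration (y01..y16).  When INCY is not one
   element, the product is accumulated into the contiguous BUFFER (zeroed at
   LL(02)) and added back to Y in the LL(990) tail.  A rough C sketch of the
   operation, for orientation only (the names below are illustrative and are
   not the macros used in this file):

       for (j = 0; j < n; j++) {
           FLOAT t = alpha * x[j * incx];          // scale one element of x
           for (i = 0; i < m; i++)
               y[i * incy] += t * a[i + j * lda];  // add t * column j of A
       }
*/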
stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r0, FZERO std r14, 144(SP) std r15, 152(SP) std r16, 160(SP) std r17, 168(SP) std r18, 176(SP) std r19, 184(SP) std r20, 192(SP) std r21, 200(SP) std r22, 208(SP) std r23, 216(SP) std r24, 224(SP) std r25, 232(SP) std r26, 240(SP) std r27, 248(SP) #else stw r0, 0 + FZERO stw r0, 4 + FZERO stw r14, 144(SP) stw r15, 148(SP) stw r16, 152(SP) stw r17, 156(SP) stw r18, 160(SP) stw r19, 164(SP) stw r20, 168(SP) stw r21, 172(SP) stw r22, 176(SP) stw r23, 180(SP) stw r24, 184(SP) stw r25, 188(SP) stw r26, 192(SP) stw r27, 196(SP) #endif #ifdef linux #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #else ld Y, FRAMESLOT(0) + STACKSIZE(SP) ld INCY, FRAMESLOT(1) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) lwz Y, FRAMESLOT(1) + STACKSIZE(SP) lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #else lwz Y, FRAMESLOT(0) + STACKSIZE(SP) lwz INCY, FRAMESLOT(1) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #else ld Y, FRAMESLOT(0) + STACKSIZE(SP) ld INCY, FRAMESLOT(1) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif stfd f1, ALPHA fmr alpha, f1 slwi LDA8, LDA, BASE_SHIFT + 3 slwi LDA, LDA, BASE_SHIFT slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT li PREA, PREFETCHSIZE_A * SIZE li PREC, PREFETCHSIZE_C * SIZE cmpwi cr0, M, 0 ble- LL(999) cmpwi cr0, N, 0 ble- LL(999) mr YY, Y lfd f0, FZERO cmpi cr0, 0, INCY, SIZE beq LL(10) mr YY, BUFFER mr Y1, BUFFER addi r0, M, 7 srawi. r0, r0, 3 mtspr CTR, r0 .align 4 LL(02): STFD f0, 0 * SIZE(Y1) STFD f0, 1 * SIZE(Y1) STFD f0, 2 * SIZE(Y1) STFD f0, 3 * SIZE(Y1) STFD f0, 4 * SIZE(Y1) STFD f0, 5 * SIZE(Y1) STFD f0, 6 * SIZE(Y1) STFD f0, 7 * SIZE(Y1) addi Y1, Y1, 8 * SIZE bdnz LL(02) .align 4 LL(10): srawi. J, N, 3 ble LL(20) .align 4 LL(11): LFD alpha1, 0 * SIZE(X) add X, X, INCX LFD alpha2, 0 * SIZE(X) add X, X, INCX LFD alpha3, 0 * SIZE(X) add X, X, INCX LFD alpha4, 0 * SIZE(X) add X, X, INCX LFD alpha5, 0 * SIZE(X) add X, X, INCX LFD alpha6, 0 * SIZE(X) add X, X, INCX LFD alpha7, 0 * SIZE(X) add X, X, INCX LFD alpha8, 0 * SIZE(X) add X, X, INCX FMUL alpha1, alpha, alpha1 FMUL alpha2, alpha, alpha2 FMUL alpha3, alpha, alpha3 FMUL alpha4, alpha, alpha4 FMUL alpha5, alpha, alpha5 FMUL alpha6, alpha, alpha6 FMUL alpha7, alpha, alpha7 FMUL alpha8, alpha, alpha8 mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add AO5, AO4, LDA add AO6, AO5, LDA add AO7, AO6, LDA add AO8, AO7, LDA add A, AO8, LDA mr Y1, YY srawi. 
r0, M, 4 mtspr CTR, r0 ble LL(15) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) LFD y09, 8 * SIZE(Y1) LFD y10, 9 * SIZE(Y1) LFD y11, 10 * SIZE(Y1) LFD y12, 11 * SIZE(Y1) LFD y13, 12 * SIZE(Y1) LFD y14, 13 * SIZE(Y1) LFD y15, 14 * SIZE(Y1) LFD y16, 15 * SIZE(Y1) FMADD y01, alpha1, a1, y01 FMADD y02, alpha1, a2, y02 FMADD y03, alpha1, a3, y03 FMADD y04, alpha1, a4, y04 LFD a1, 8 * SIZE(AO1) LFD a2, 9 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) FMADD y05, alpha1, a5, y05 FMADD y06, alpha1, a6, y06 FMADD y07, alpha1, a7, y07 FMADD y08, alpha1, a8, y08 LFD a5, 12 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) addi AO1, AO1, 16 * SIZE nop nop DCBT(AO1, PREA) FMADD y09, alpha1, a1, y09 FMADD y10, alpha1, a2, y10 FMADD y11, alpha1, a3, y11 FMADD y12, alpha1, a4, y12 LFD a1, 0 * SIZE(AO2) LFD a2, 1 * SIZE(AO2) LFD a3, 2 * SIZE(AO2) LFD a4, 3 * SIZE(AO2) FMADD y13, alpha1, a5, y13 FMADD y14, alpha1, a6, y14 FMADD y15, alpha1, a7, y15 FMADD y16, alpha1, a8, y16 LFD a5, 4 * SIZE(AO2) LFD a6, 5 * SIZE(AO2) LFD a7, 6 * SIZE(AO2) LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2, a1, y01 FMADD y02, alpha2, a2, y02 FMADD y03, alpha2, a3, y03 FMADD y04, alpha2, a4, y04 LFD a1, 8 * SIZE(AO2) LFD a2, 9 * SIZE(AO2) LFD a3, 10 * SIZE(AO2) LFD a4, 11 * SIZE(AO2) FMADD y05, alpha2, a5, y05 FMADD y06, alpha2, a6, y06 FMADD y07, alpha2, a7, y07 FMADD y08, alpha2, a8, y08 LFD a5, 12 * SIZE(AO2) LFD a6, 13 * SIZE(AO2) LFD a7, 14 * SIZE(AO2) LFD a8, 15 * SIZE(AO2) addi AO2, AO2, 16 * SIZE nop nop DCBT(AO2, PREA) FMADD y09, alpha2, a1, y09 FMADD y10, alpha2, a2, y10 FMADD y11, alpha2, a3, y11 FMADD y12, alpha2, a4, y12 LFD a1, 0 * SIZE(AO3) LFD a2, 1 * SIZE(AO3) LFD a3, 2 * SIZE(AO3) LFD a4, 3 * SIZE(AO3) FMADD y13, alpha2, a5, y13 FMADD y14, alpha2, a6, y14 FMADD y15, alpha2, a7, y15 FMADD y16, alpha2, a8, y16 LFD a5, 4 * SIZE(AO3) LFD a6, 5 * SIZE(AO3) LFD a7, 6 * SIZE(AO3) LFD a8, 7 * SIZE(AO3) FMADD y01, alpha3, a1, y01 FMADD y02, alpha3, a2, y02 FMADD y03, alpha3, a3, y03 FMADD y04, alpha3, a4, y04 LFD a1, 8 * SIZE(AO3) LFD a2, 9 * SIZE(AO3) LFD a3, 10 * SIZE(AO3) LFD a4, 11 * SIZE(AO3) FMADD y05, alpha3, a5, y05 FMADD y06, alpha3, a6, y06 FMADD y07, alpha3, a7, y07 FMADD y08, alpha3, a8, y08 LFD a5, 12 * SIZE(AO3) LFD a6, 13 * SIZE(AO3) LFD a7, 14 * SIZE(AO3) LFD a8, 15 * SIZE(AO3) addi AO3, AO3, 16 * SIZE nop nop DCBT(AO3, PREA) FMADD y09, alpha3, a1, y09 FMADD y10, alpha3, a2, y10 FMADD y11, alpha3, a3, y11 FMADD y12, alpha3, a4, y12 LFD a1, 0 * SIZE(AO4) LFD a2, 1 * SIZE(AO4) LFD a3, 2 * SIZE(AO4) LFD a4, 3 * SIZE(AO4) FMADD y13, alpha3, a5, y13 FMADD y14, alpha3, a6, y14 FMADD y15, alpha3, a7, y15 FMADD y16, alpha3, a8, y16 LFD a5, 4 * SIZE(AO4) LFD a6, 5 * SIZE(AO4) LFD a7, 6 * SIZE(AO4) LFD a8, 7 * SIZE(AO4) FMADD y01, alpha4, a1, y01 FMADD y02, alpha4, a2, y02 FMADD y03, alpha4, a3, y03 FMADD y04, alpha4, a4, y04 LFD a1, 8 * SIZE(AO4) LFD a2, 9 * SIZE(AO4) LFD a3, 10 * SIZE(AO4) LFD a4, 11 * SIZE(AO4) FMADD y05, alpha4, a5, y05 FMADD y06, alpha4, a6, y06 FMADD y07, alpha4, a7, y07 FMADD y08, alpha4, a8, y08 LFD a5, 12 * SIZE(AO4) LFD a6, 13 * SIZE(AO4) LFD a7, 14 * SIZE(AO4) LFD a8, 15 * SIZE(AO4) addi AO4, AO4, 16 * SIZE nop nop DCBT(AO4, PREA) FMADD y09, alpha4, a1, 
y09 FMADD y10, alpha4, a2, y10 FMADD y11, alpha4, a3, y11 FMADD y12, alpha4, a4, y12 LFD a1, 0 * SIZE(AO5) LFD a2, 1 * SIZE(AO5) LFD a3, 2 * SIZE(AO5) LFD a4, 3 * SIZE(AO5) FMADD y13, alpha4, a5, y13 FMADD y14, alpha4, a6, y14 FMADD y15, alpha4, a7, y15 FMADD y16, alpha4, a8, y16 LFD a5, 4 * SIZE(AO5) LFD a6, 5 * SIZE(AO5) LFD a7, 6 * SIZE(AO5) LFD a8, 7 * SIZE(AO5) FMADD y01, alpha5, a1, y01 FMADD y02, alpha5, a2, y02 FMADD y03, alpha5, a3, y03 FMADD y04, alpha5, a4, y04 LFD a1, 8 * SIZE(AO5) LFD a2, 9 * SIZE(AO5) LFD a3, 10 * SIZE(AO5) LFD a4, 11 * SIZE(AO5) FMADD y05, alpha5, a5, y05 FMADD y06, alpha5, a6, y06 FMADD y07, alpha5, a7, y07 FMADD y08, alpha5, a8, y08 LFD a5, 12 * SIZE(AO5) LFD a6, 13 * SIZE(AO5) LFD a7, 14 * SIZE(AO5) LFD a8, 15 * SIZE(AO5) addi AO5, AO5, 16 * SIZE nop nop DCBT(AO5, PREA) FMADD y09, alpha5, a1, y09 FMADD y10, alpha5, a2, y10 FMADD y11, alpha5, a3, y11 FMADD y12, alpha5, a4, y12 LFD a1, 0 * SIZE(AO6) LFD a2, 1 * SIZE(AO6) LFD a3, 2 * SIZE(AO6) LFD a4, 3 * SIZE(AO6) FMADD y13, alpha5, a5, y13 FMADD y14, alpha5, a6, y14 FMADD y15, alpha5, a7, y15 FMADD y16, alpha5, a8, y16 LFD a5, 4 * SIZE(AO6) LFD a6, 5 * SIZE(AO6) LFD a7, 6 * SIZE(AO6) LFD a8, 7 * SIZE(AO6) FMADD y01, alpha6, a1, y01 FMADD y02, alpha6, a2, y02 FMADD y03, alpha6, a3, y03 FMADD y04, alpha6, a4, y04 LFD a1, 8 * SIZE(AO6) LFD a2, 9 * SIZE(AO6) LFD a3, 10 * SIZE(AO6) LFD a4, 11 * SIZE(AO6) FMADD y05, alpha6, a5, y05 FMADD y06, alpha6, a6, y06 FMADD y07, alpha6, a7, y07 FMADD y08, alpha6, a8, y08 LFD a5, 12 * SIZE(AO6) LFD a6, 13 * SIZE(AO6) LFD a7, 14 * SIZE(AO6) LFD a8, 15 * SIZE(AO6) addi AO6, AO6, 16 * SIZE nop nop DCBT(AO6, PREA) FMADD y09, alpha6, a1, y09 FMADD y10, alpha6, a2, y10 FMADD y11, alpha6, a3, y11 FMADD y12, alpha6, a4, y12 LFD a1, 0 * SIZE(AO7) LFD a2, 1 * SIZE(AO7) LFD a3, 2 * SIZE(AO7) LFD a4, 3 * SIZE(AO7) FMADD y13, alpha6, a5, y13 FMADD y14, alpha6, a6, y14 FMADD y15, alpha6, a7, y15 FMADD y16, alpha6, a8, y16 LFD a5, 4 * SIZE(AO7) LFD a6, 5 * SIZE(AO7) LFD a7, 6 * SIZE(AO7) LFD a8, 7 * SIZE(AO7) FMADD y01, alpha7, a1, y01 FMADD y02, alpha7, a2, y02 FMADD y03, alpha7, a3, y03 FMADD y04, alpha7, a4, y04 LFD a1, 8 * SIZE(AO7) LFD a2, 9 * SIZE(AO7) LFD a3, 10 * SIZE(AO7) LFD a4, 11 * SIZE(AO7) FMADD y05, alpha7, a5, y05 FMADD y06, alpha7, a6, y06 FMADD y07, alpha7, a7, y07 FMADD y08, alpha7, a8, y08 LFD a5, 12 * SIZE(AO7) LFD a6, 13 * SIZE(AO7) LFD a7, 14 * SIZE(AO7) LFD a8, 15 * SIZE(AO7) addi AO7, AO7, 16 * SIZE nop nop DCBT(AO7, PREA) FMADD y09, alpha7, a1, y09 FMADD y10, alpha7, a2, y10 FMADD y11, alpha7, a3, y11 FMADD y12, alpha7, a4, y12 LFD a1, 0 * SIZE(AO8) LFD a2, 1 * SIZE(AO8) LFD a3, 2 * SIZE(AO8) LFD a4, 3 * SIZE(AO8) FMADD y13, alpha7, a5, y13 FMADD y14, alpha7, a6, y14 FMADD y15, alpha7, a7, y15 FMADD y16, alpha7, a8, y16 LFD a5, 4 * SIZE(AO8) LFD a6, 5 * SIZE(AO8) LFD a7, 6 * SIZE(AO8) LFD a8, 7 * SIZE(AO8) FMADD y01, alpha8, a1, y01 FMADD y02, alpha8, a2, y02 FMADD y03, alpha8, a3, y03 FMADD y04, alpha8, a4, y04 LFD a1, 8 * SIZE(AO8) LFD a2, 9 * SIZE(AO8) LFD a3, 10 * SIZE(AO8) LFD a4, 11 * SIZE(AO8) FMADD y05, alpha8, a5, y05 FMADD y06, alpha8, a6, y06 FMADD y07, alpha8, a7, y07 FMADD y08, alpha8, a8, y08 LFD a5, 12 * SIZE(AO8) LFD a6, 13 * SIZE(AO8) LFD a7, 14 * SIZE(AO8) LFD a8, 15 * SIZE(AO8) addi AO8, AO8, 16 * SIZE nop nop DCBT(AO8, PREA) FMADD y09, alpha8, a1, y09 FMADD y10, alpha8, a2, y10 FMADD y11, alpha8, a3, y11 FMADD y12, alpha8, a4, y12 LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) FMADD y13, alpha8, a5, 
y13 FMADD y14, alpha8, a6, y14 FMADD y15, alpha8, a7, y15 FMADD y16, alpha8, a8, y16 LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) LFD y01, 16 * SIZE(Y1) LFD y02, 17 * SIZE(Y1) LFD y03, 18 * SIZE(Y1) LFD y04, 19 * SIZE(Y1) DCBT(Y1, PREC) bdz LL(13) .align 4 LL(12): FMADD y01, alpha1, a1, y01 FMADD y02, alpha1, a2, y02 FMADD y03, alpha1, a3, y03 FMADD y04, alpha1, a4, y04 LFD a1, 8 * SIZE(AO1) LFD a2, 9 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) STFD y05, 4 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) LFD y05, 20 * SIZE(Y1) LFD y06, 21 * SIZE(Y1) LFD y07, 22 * SIZE(Y1) LFD y08, 23 * SIZE(Y1) FMADD y05, alpha1, a5, y05 FMADD y06, alpha1, a6, y06 FMADD y07, alpha1, a7, y07 FMADD y08, alpha1, a8, y08 LFD a5, 12 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) STFD y09, 8 * SIZE(Y1) STFD y10, 9 * SIZE(Y1) STFD y11, 10 * SIZE(Y1) STFD y12, 11 * SIZE(Y1) LFD y09, 24 * SIZE(Y1) LFD y10, 25 * SIZE(Y1) LFD y11, 26 * SIZE(Y1) LFD y12, 27 * SIZE(Y1) FMADD y09, alpha1, a1, y09 FMADD y10, alpha1, a2, y10 FMADD y11, alpha1, a3, y11 FMADD y12, alpha1, a4, y12 LFD a1, 0 * SIZE(AO2) LFD a2, 1 * SIZE(AO2) LFD a3, 2 * SIZE(AO2) LFD a4, 3 * SIZE(AO2) STFD y13, 12 * SIZE(Y1) STFD y14, 13 * SIZE(Y1) STFD y15, 14 * SIZE(Y1) STFD y16, 15 * SIZE(Y1) LFD y13, 28 * SIZE(Y1) LFD y14, 29 * SIZE(Y1) LFD y15, 30 * SIZE(Y1) LFD y16, 31 * SIZE(Y1) FMADD y13, alpha1, a5, y13 FMADD y14, alpha1, a6, y14 FMADD y15, alpha1, a7, y15 FMADD y16, alpha1, a8, y16 LFD a5, 4 * SIZE(AO2) LFD a6, 5 * SIZE(AO2) LFD a7, 6 * SIZE(AO2) LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2, a1, y01 FMADD y02, alpha2, a2, y02 FMADD y03, alpha2, a3, y03 FMADD y04, alpha2, a4, y04 LFD a1, 8 * SIZE(AO2) LFD a2, 9 * SIZE(AO2) LFD a3, 10 * SIZE(AO2) LFD a4, 11 * SIZE(AO2) FMADD y05, alpha2, a5, y05 FMADD y06, alpha2, a6, y06 FMADD y07, alpha2, a7, y07 FMADD y08, alpha2, a8, y08 LFD a5, 12 * SIZE(AO2) LFD a6, 13 * SIZE(AO2) LFD a7, 14 * SIZE(AO2) LFD a8, 15 * SIZE(AO2) FMADD y09, alpha2, a1, y09 FMADD y10, alpha2, a2, y10 FMADD y11, alpha2, a3, y11 FMADD y12, alpha2, a4, y12 LFD a1, 0 * SIZE(AO3) LFD a2, 1 * SIZE(AO3) LFD a3, 2 * SIZE(AO3) LFD a4, 3 * SIZE(AO3) FMADD y13, alpha2, a5, y13 FMADD y14, alpha2, a6, y14 FMADD y15, alpha2, a7, y15 FMADD y16, alpha2, a8, y16 LFD a5, 4 * SIZE(AO3) LFD a6, 5 * SIZE(AO3) LFD a7, 6 * SIZE(AO3) LFD a8, 7 * SIZE(AO3) FMADD y01, alpha3, a1, y01 FMADD y02, alpha3, a2, y02 FMADD y03, alpha3, a3, y03 FMADD y04, alpha3, a4, y04 LFD a1, 8 * SIZE(AO3) LFD a2, 9 * SIZE(AO3) LFD a3, 10 * SIZE(AO3) LFD a4, 11 * SIZE(AO3) FMADD y05, alpha3, a5, y05 FMADD y06, alpha3, a6, y06 FMADD y07, alpha3, a7, y07 FMADD y08, alpha3, a8, y08 LFD a5, 12 * SIZE(AO3) LFD a6, 13 * SIZE(AO3) LFD a7, 14 * SIZE(AO3) LFD a8, 15 * SIZE(AO3) FMADD y09, alpha3, a1, y09 FMADD y10, alpha3, a2, y10 FMADD y11, alpha3, a3, y11 FMADD y12, alpha3, a4, y12 LFD a1, 0 * SIZE(AO4) LFD a2, 1 * SIZE(AO4) LFD a3, 2 * SIZE(AO4) LFD a4, 3 * SIZE(AO4) FMADD y13, alpha3, a5, y13 FMADD y14, alpha3, a6, y14 FMADD y15, alpha3, a7, y15 FMADD y16, alpha3, a8, y16 LFD a5, 4 * SIZE(AO4) LFD a6, 5 * SIZE(AO4) LFD a7, 6 * SIZE(AO4) LFD a8, 7 * SIZE(AO4) FMADD y01, alpha4, a1, y01 FMADD y02, alpha4, a2, y02 FMADD y03, alpha4, a3, y03 FMADD y04, alpha4, a4, y04 LFD a1, 8 * SIZE(AO4) LFD a2, 9 * SIZE(AO4) LFD a3, 10 * SIZE(AO4) LFD a4, 11 * SIZE(AO4) FMADD y05, alpha4, a5, y05 
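/*
   Note on the LL(12) body: each column's FMADD block (alpha1..alpha8 times
   a1..a8 folded into y01..y16) is interleaved with the LFD loads of the
   operands needed a few instructions later, a software-pipelining pattern
   that keeps the FPU busy while the next 16-row tile streams in.  The DCBT
   touches prefetch the A columns PREFETCHSIZE_A elements ahead and Y1
   PREFETCHSIZE_C elements ahead (PREA and PREC, scaled by SIZE in the
   prologue).
*/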
FMADD y06, alpha4, a6, y06 FMADD y07, alpha4, a7, y07 FMADD y08, alpha4, a8, y08 LFD a5, 12 * SIZE(AO4) LFD a6, 13 * SIZE(AO4) LFD a7, 14 * SIZE(AO4) LFD a8, 15 * SIZE(AO4) addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi AO3, AO3, 16 * SIZE addi AO4, AO4, 16 * SIZE DCBT(AO1, PREA) DCBT(AO2, PREA) DCBT(AO3, PREA) DCBT(AO4, PREA) FMADD y09, alpha4, a1, y09 FMADD y10, alpha4, a2, y10 FMADD y11, alpha4, a3, y11 FMADD y12, alpha4, a4, y12 LFD a1, 0 * SIZE(AO5) LFD a2, 1 * SIZE(AO5) LFD a3, 2 * SIZE(AO5) LFD a4, 3 * SIZE(AO5) FMADD y13, alpha4, a5, y13 FMADD y14, alpha4, a6, y14 FMADD y15, alpha4, a7, y15 FMADD y16, alpha4, a8, y16 LFD a5, 4 * SIZE(AO5) LFD a6, 5 * SIZE(AO5) LFD a7, 6 * SIZE(AO5) LFD a8, 7 * SIZE(AO5) FMADD y01, alpha5, a1, y01 FMADD y02, alpha5, a2, y02 FMADD y03, alpha5, a3, y03 FMADD y04, alpha5, a4, y04 LFD a1, 8 * SIZE(AO5) LFD a2, 9 * SIZE(AO5) LFD a3, 10 * SIZE(AO5) LFD a4, 11 * SIZE(AO5) FMADD y05, alpha5, a5, y05 FMADD y06, alpha5, a6, y06 FMADD y07, alpha5, a7, y07 FMADD y08, alpha5, a8, y08 LFD a5, 12 * SIZE(AO5) LFD a6, 13 * SIZE(AO5) LFD a7, 14 * SIZE(AO5) LFD a8, 15 * SIZE(AO5) FMADD y09, alpha5, a1, y09 FMADD y10, alpha5, a2, y10 FMADD y11, alpha5, a3, y11 FMADD y12, alpha5, a4, y12 LFD a1, 0 * SIZE(AO6) LFD a2, 1 * SIZE(AO6) LFD a3, 2 * SIZE(AO6) LFD a4, 3 * SIZE(AO6) FMADD y13, alpha5, a5, y13 FMADD y14, alpha5, a6, y14 FMADD y15, alpha5, a7, y15 FMADD y16, alpha5, a8, y16 LFD a5, 4 * SIZE(AO6) LFD a6, 5 * SIZE(AO6) LFD a7, 6 * SIZE(AO6) LFD a8, 7 * SIZE(AO6) FMADD y01, alpha6, a1, y01 FMADD y02, alpha6, a2, y02 FMADD y03, alpha6, a3, y03 FMADD y04, alpha6, a4, y04 LFD a1, 8 * SIZE(AO6) LFD a2, 9 * SIZE(AO6) LFD a3, 10 * SIZE(AO6) LFD a4, 11 * SIZE(AO6) FMADD y05, alpha6, a5, y05 FMADD y06, alpha6, a6, y06 FMADD y07, alpha6, a7, y07 FMADD y08, alpha6, a8, y08 LFD a5, 12 * SIZE(AO6) LFD a6, 13 * SIZE(AO6) LFD a7, 14 * SIZE(AO6) LFD a8, 15 * SIZE(AO6) FMADD y09, alpha6, a1, y09 FMADD y10, alpha6, a2, y10 FMADD y11, alpha6, a3, y11 FMADD y12, alpha6, a4, y12 LFD a1, 0 * SIZE(AO7) LFD a2, 1 * SIZE(AO7) LFD a3, 2 * SIZE(AO7) LFD a4, 3 * SIZE(AO7) FMADD y13, alpha6, a5, y13 FMADD y14, alpha6, a6, y14 FMADD y15, alpha6, a7, y15 FMADD y16, alpha6, a8, y16 LFD a5, 4 * SIZE(AO7) LFD a6, 5 * SIZE(AO7) LFD a7, 6 * SIZE(AO7) LFD a8, 7 * SIZE(AO7) FMADD y01, alpha7, a1, y01 FMADD y02, alpha7, a2, y02 FMADD y03, alpha7, a3, y03 FMADD y04, alpha7, a4, y04 LFD a1, 8 * SIZE(AO7) LFD a2, 9 * SIZE(AO7) LFD a3, 10 * SIZE(AO7) LFD a4, 11 * SIZE(AO7) FMADD y05, alpha7, a5, y05 FMADD y06, alpha7, a6, y06 FMADD y07, alpha7, a7, y07 FMADD y08, alpha7, a8, y08 LFD a5, 12 * SIZE(AO7) LFD a6, 13 * SIZE(AO7) LFD a7, 14 * SIZE(AO7) LFD a8, 15 * SIZE(AO7) FMADD y09, alpha7, a1, y09 FMADD y10, alpha7, a2, y10 FMADD y11, alpha7, a3, y11 FMADD y12, alpha7, a4, y12 LFD a1, 0 * SIZE(AO8) LFD a2, 1 * SIZE(AO8) LFD a3, 2 * SIZE(AO8) LFD a4, 3 * SIZE(AO8) FMADD y13, alpha7, a5, y13 FMADD y14, alpha7, a6, y14 FMADD y15, alpha7, a7, y15 FMADD y16, alpha7, a8, y16 LFD a5, 4 * SIZE(AO8) LFD a6, 5 * SIZE(AO8) LFD a7, 6 * SIZE(AO8) LFD a8, 7 * SIZE(AO8) FMADD y01, alpha8, a1, y01 FMADD y02, alpha8, a2, y02 FMADD y03, alpha8, a3, y03 FMADD y04, alpha8, a4, y04 LFD a1, 8 * SIZE(AO8) LFD a2, 9 * SIZE(AO8) LFD a3, 10 * SIZE(AO8) LFD a4, 11 * SIZE(AO8) FMADD y05, alpha8, a5, y05 FMADD y06, alpha8, a6, y06 FMADD y07, alpha8, a7, y07 FMADD y08, alpha8, a8, y08 LFD a5, 12 * SIZE(AO8) LFD a6, 13 * SIZE(AO8) LFD a7, 14 * SIZE(AO8) LFD a8, 15 * SIZE(AO8) addi AO5, AO5, 16 * SIZE addi AO6, AO6, 16 * SIZE addi 
AO7, AO7, 16 * SIZE addi AO8, AO8, 16 * SIZE DCBT(AO5, PREA) DCBT(AO6, PREA) DCBT(AO7, PREA) DCBT(AO8, PREA) FMADD y09, alpha8, a1, y09 FMADD y10, alpha8, a2, y10 FMADD y11, alpha8, a3, y11 FMADD y12, alpha8, a4, y12 LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) FMADD y13, alpha8, a5, y13 FMADD y14, alpha8, a6, y14 FMADD y15, alpha8, a7, y15 FMADD y16, alpha8, a8, y16 LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) STFD y01, 16 * SIZE(Y1) STFD y02, 17 * SIZE(Y1) STFD y03, 18 * SIZE(Y1) STFD y04, 19 * SIZE(Y1) LFD y01, 32 * SIZE(Y1) LFD y02, 33 * SIZE(Y1) LFD y03, 34 * SIZE(Y1) LFD y04, 35 * SIZE(Y1) DCBT(Y1, PREC) addi Y1, Y1, 16 * SIZE bdnz LL(12) .align 4 LL(13): STFD y05, 4 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) STFD y09, 8 * SIZE(Y1) STFD y10, 9 * SIZE(Y1) STFD y11, 10 * SIZE(Y1) STFD y12, 11 * SIZE(Y1) STFD y13, 12 * SIZE(Y1) STFD y14, 13 * SIZE(Y1) STFD y15, 14 * SIZE(Y1) STFD y16, 15 * SIZE(Y1) addi Y1, Y1, 16 * SIZE .align 4 LL(15): andi. r0, M, 15 ble LL(19) andi. r0, M, 8 ble LL(16) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) FMADD y01, alpha1, a1, y01 LFD a1, 0 * SIZE(AO2) FMADD y02, alpha1, a2, y02 LFD a2, 1 * SIZE(AO2) FMADD y03, alpha1, a3, y03 LFD a3, 2 * SIZE(AO2) FMADD y04, alpha1, a4, y04 LFD a4, 3 * SIZE(AO2) FMADD y05, alpha1, a5, y05 LFD a5, 4 * SIZE(AO2) FMADD y06, alpha1, a6, y06 LFD a6, 5 * SIZE(AO2) FMADD y07, alpha1, a7, y07 LFD a7, 6 * SIZE(AO2) FMADD y08, alpha1, a8, y08 LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2, a1, y01 LFD a1, 0 * SIZE(AO3) FMADD y02, alpha2, a2, y02 LFD a2, 1 * SIZE(AO3) FMADD y03, alpha2, a3, y03 LFD a3, 2 * SIZE(AO3) FMADD y04, alpha2, a4, y04 LFD a4, 3 * SIZE(AO3) FMADD y05, alpha2, a5, y05 LFD a5, 4 * SIZE(AO3) FMADD y06, alpha2, a6, y06 LFD a6, 5 * SIZE(AO3) FMADD y07, alpha2, a7, y07 LFD a7, 6 * SIZE(AO3) FMADD y08, alpha2, a8, y08 LFD a8, 7 * SIZE(AO3) FMADD y01, alpha3, a1, y01 LFD a1, 0 * SIZE(AO4) FMADD y02, alpha3, a2, y02 LFD a2, 1 * SIZE(AO4) FMADD y03, alpha3, a3, y03 LFD a3, 2 * SIZE(AO4) FMADD y04, alpha3, a4, y04 LFD a4, 3 * SIZE(AO4) FMADD y05, alpha3, a5, y05 LFD a5, 4 * SIZE(AO4) FMADD y06, alpha3, a6, y06 LFD a6, 5 * SIZE(AO4) FMADD y07, alpha3, a7, y07 LFD a7, 6 * SIZE(AO4) FMADD y08, alpha3, a8, y08 LFD a8, 7 * SIZE(AO4) FMADD y01, alpha4, a1, y01 LFD a1, 0 * SIZE(AO5) FMADD y02, alpha4, a2, y02 LFD a2, 1 * SIZE(AO5) FMADD y03, alpha4, a3, y03 LFD a3, 2 * SIZE(AO5) FMADD y04, alpha4, a4, y04 LFD a4, 3 * SIZE(AO5) FMADD y05, alpha4, a5, y05 LFD a5, 4 * SIZE(AO5) FMADD y06, alpha4, a6, y06 LFD a6, 5 * SIZE(AO5) FMADD y07, alpha4, a7, y07 LFD a7, 6 * SIZE(AO5) FMADD y08, alpha4, a8, y08 LFD a8, 7 * SIZE(AO5) FMADD y01, alpha5, a1, y01 LFD a1, 0 * SIZE(AO6) FMADD y02, alpha5, a2, y02 LFD a2, 1 * SIZE(AO6) FMADD y03, alpha5, a3, y03 LFD a3, 2 * SIZE(AO6) FMADD y04, alpha5, a4, y04 LFD a4, 3 * SIZE(AO6) FMADD y05, alpha5, a5, y05 LFD a5, 4 * SIZE(AO6) FMADD y06, alpha5, a6, y06 LFD a6, 5 * SIZE(AO6) FMADD y07, alpha5, a7, y07 LFD a7, 6 * SIZE(AO6) FMADD y08, alpha5, a8, y08 LFD a8, 7 * SIZE(AO6) FMADD y01, alpha6, a1, y01 LFD a1, 0 * SIZE(AO7) FMADD y02, alpha6, a2, y02 LFD a2, 1 * SIZE(AO7) FMADD y03, 
alpha6, a3, y03 LFD a3, 2 * SIZE(AO7) FMADD y04, alpha6, a4, y04 LFD a4, 3 * SIZE(AO7) FMADD y05, alpha6, a5, y05 LFD a5, 4 * SIZE(AO7) FMADD y06, alpha6, a6, y06 LFD a6, 5 * SIZE(AO7) FMADD y07, alpha6, a7, y07 LFD a7, 6 * SIZE(AO7) FMADD y08, alpha6, a8, y08 LFD a8, 7 * SIZE(AO7) FMADD y01, alpha7, a1, y01 LFD a1, 0 * SIZE(AO8) FMADD y02, alpha7, a2, y02 LFD a2, 1 * SIZE(AO8) FMADD y03, alpha7, a3, y03 LFD a3, 2 * SIZE(AO8) FMADD y04, alpha7, a4, y04 LFD a4, 3 * SIZE(AO8) FMADD y05, alpha7, a5, y05 LFD a5, 4 * SIZE(AO8) FMADD y06, alpha7, a6, y06 LFD a6, 5 * SIZE(AO8) FMADD y07, alpha7, a7, y07 LFD a7, 6 * SIZE(AO8) FMADD y08, alpha7, a8, y08 LFD a8, 7 * SIZE(AO8) FMADD y01, alpha8, a1, y01 addi AO1, AO1, 8 * SIZE FMADD y02, alpha8, a2, y02 addi AO2, AO2, 8 * SIZE FMADD y03, alpha8, a3, y03 addi AO3, AO3, 8 * SIZE FMADD y04, alpha8, a4, y04 addi AO4, AO4, 8 * SIZE STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) FMADD y05, alpha8, a5, y05 addi AO5, AO5, 8 * SIZE FMADD y06, alpha8, a6, y06 addi AO6, AO6, 8 * SIZE FMADD y07, alpha8, a7, y07 addi AO7, AO7, 8 * SIZE FMADD y08, alpha8, a8, y08 addi AO8, AO8, 8 * SIZE STFD y05, 4 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) addi Y1, Y1, 8 * SIZE .align 4 LL(16): andi. r0, M, 4 ble LL(17) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 0 * SIZE(AO2) LFD a6, 1 * SIZE(AO2) LFD a7, 2 * SIZE(AO2) LFD a8, 3 * SIZE(AO2) FMADD y01, alpha1, a1, y01 LFD a1, 0 * SIZE(AO3) FMADD y02, alpha1, a2, y02 LFD a2, 1 * SIZE(AO3) FMADD y03, alpha1, a3, y03 LFD a3, 2 * SIZE(AO3) FMADD y04, alpha1, a4, y04 LFD a4, 3 * SIZE(AO3) FMADD y01, alpha2, a5, y01 LFD a5, 0 * SIZE(AO4) FMADD y02, alpha2, a6, y02 LFD a6, 1 * SIZE(AO4) FMADD y03, alpha2, a7, y03 LFD a7, 2 * SIZE(AO4) FMADD y04, alpha2, a8, y04 LFD a8, 3 * SIZE(AO4) FMADD y01, alpha3, a1, y01 LFD a1, 0 * SIZE(AO5) FMADD y02, alpha3, a2, y02 LFD a2, 1 * SIZE(AO5) FMADD y03, alpha3, a3, y03 LFD a3, 2 * SIZE(AO5) FMADD y04, alpha3, a4, y04 LFD a4, 3 * SIZE(AO5) FMADD y01, alpha4, a5, y01 LFD a5, 0 * SIZE(AO6) FMADD y02, alpha4, a6, y02 LFD a6, 1 * SIZE(AO6) FMADD y03, alpha4, a7, y03 LFD a7, 2 * SIZE(AO6) FMADD y04, alpha4, a8, y04 LFD a8, 3 * SIZE(AO6) FMADD y01, alpha5, a1, y01 LFD a1, 0 * SIZE(AO7) FMADD y02, alpha5, a2, y02 LFD a2, 1 * SIZE(AO7) FMADD y03, alpha5, a3, y03 LFD a3, 2 * SIZE(AO7) FMADD y04, alpha5, a4, y04 LFD a4, 3 * SIZE(AO7) FMADD y01, alpha6, a5, y01 LFD a5, 0 * SIZE(AO8) FMADD y02, alpha6, a6, y02 LFD a6, 1 * SIZE(AO8) FMADD y03, alpha6, a7, y03 LFD a7, 2 * SIZE(AO8) FMADD y04, alpha6, a8, y04 LFD a8, 3 * SIZE(AO8) FMADD y01, alpha7, a1, y01 addi AO1, AO1, 4 * SIZE FMADD y02, alpha7, a2, y02 addi AO2, AO2, 4 * SIZE FMADD y03, alpha7, a3, y03 addi AO3, AO3, 4 * SIZE FMADD y04, alpha7, a4, y04 addi AO4, AO4, 4 * SIZE FMADD y01, alpha8, a5, y01 addi AO5, AO5, 4 * SIZE FMADD y02, alpha8, a6, y02 addi AO6, AO6, 4 * SIZE FMADD y03, alpha8, a7, y03 addi AO7, AO7, 4 * SIZE FMADD y04, alpha8, a8, y04 addi AO8, AO8, 4 * SIZE STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) addi Y1, Y1, 4 * SIZE .align 4 LL(17): andi. 
r0, M, 2 ble LL(18) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 0 * SIZE(AO2) LFD a4, 1 * SIZE(AO2) LFD a5, 0 * SIZE(AO3) LFD a6, 1 * SIZE(AO3) LFD a7, 0 * SIZE(AO4) LFD a8, 1 * SIZE(AO4) FMADD y01, alpha1, a1, y01 LFD a1, 0 * SIZE(AO5) FMADD y02, alpha1, a2, y02 LFD a2, 1 * SIZE(AO5) FMADD y01, alpha2, a3, y01 LFD a3, 0 * SIZE(AO6) FMADD y02, alpha2, a4, y02 LFD a4, 1 * SIZE(AO6) FMADD y01, alpha3, a5, y01 LFD a5, 0 * SIZE(AO7) FMADD y02, alpha3, a6, y02 LFD a6, 1 * SIZE(AO7) FMADD y01, alpha4, a7, y01 LFD a7, 0 * SIZE(AO8) FMADD y02, alpha4, a8, y02 LFD a8, 1 * SIZE(AO8) FMADD y01, alpha5, a1, y01 addi AO1, AO1, 2 * SIZE FMADD y02, alpha5, a2, y02 addi AO2, AO2, 2 * SIZE FMADD y01, alpha6, a3, y01 addi AO3, AO3, 2 * SIZE FMADD y02, alpha6, a4, y02 addi AO4, AO4, 2 * SIZE FMADD y01, alpha7, a5, y01 addi AO5, AO5, 2 * SIZE FMADD y02, alpha7, a6, y02 addi AO6, AO6, 2 * SIZE FMADD y01, alpha8, a7, y01 addi AO7, AO7, 2 * SIZE FMADD y02, alpha8, a8, y02 addi AO8, AO8, 2 * SIZE STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) addi Y1, Y1, 2 * SIZE .align 4 LL(18): andi. r0, M, 1 ble LL(19) LFD y01, 0 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 0 * SIZE(AO2) LFD a3, 0 * SIZE(AO3) LFD a4, 0 * SIZE(AO4) LFD a5, 0 * SIZE(AO5) LFD a6, 0 * SIZE(AO6) LFD a7, 0 * SIZE(AO7) LFD a8, 0 * SIZE(AO8) FMADD y01, alpha1, a1, y01 FMADD y01, alpha2, a2, y01 FMADD y01, alpha3, a3, y01 FMADD y01, alpha4, a4, y01 FMADD y01, alpha5, a5, y01 FMADD y01, alpha6, a6, y01 FMADD y01, alpha7, a7, y01 FMADD y01, alpha8, a8, y01 STFD y01, 0 * SIZE(Y1) .align 4 LL(19): addi J, J, -1 lfd alpha, ALPHA cmpi cr0, 0, J, 0 bgt LL(11) .align 4 LL(20): andi. J, N, 4 mr AO1, A add AO2, A, LDA ble LL(30) .align 4 LFD alpha1, 0 * SIZE(X) add X, X, INCX LFD alpha2, 0 * SIZE(X) add X, X, INCX LFD alpha3, 0 * SIZE(X) add X, X, INCX LFD alpha4, 0 * SIZE(X) add X, X, INCX FMUL alpha1, alpha, alpha1 add AO3, AO2, LDA FMUL alpha2, alpha, alpha2 add AO4, AO3, LDA FMUL alpha3, alpha, alpha3 add A, AO4, LDA FMUL alpha4, alpha, alpha4 mr Y1, YY srawi. 
r0, M, 4 mtspr CTR, r0 ble LL(25) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD y09, 8 * SIZE(Y1) LFD y10, 9 * SIZE(Y1) LFD y11, 10 * SIZE(Y1) LFD y12, 11 * SIZE(Y1) LFD y13, 12 * SIZE(Y1) LFD y14, 13 * SIZE(Y1) LFD y15, 14 * SIZE(Y1) LFD y16, 15 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) bdz LL(23) .align 4 LL(22): FMADD y01, alpha1, a1, y01 LFD a1, 8 * SIZE(AO1) FMADD y02, alpha1, a2, y02 LFD a2, 9 * SIZE(AO1) FMADD y03, alpha1, a3, y03 LFD a3, 10 * SIZE(AO1) FMADD y04, alpha1, a4, y04 LFD a4, 11 * SIZE(AO1) FMADD y05, alpha1, a5, y05 LFD a5, 12 * SIZE(AO1) FMADD y06, alpha1, a6, y06 LFD a6, 13 * SIZE(AO1) FMADD y07, alpha1, a7, y07 LFD a7, 14 * SIZE(AO1) FMADD y08, alpha1, a8, y08 LFD a8, 15 * SIZE(AO1) FMADD y09, alpha1, a1, y09 LFD a1, 0 * SIZE(AO2) FMADD y10, alpha1, a2, y10 LFD a2, 1 * SIZE(AO2) FMADD y11, alpha1, a3, y11 LFD a3, 2 * SIZE(AO2) FMADD y12, alpha1, a4, y12 LFD a4, 3 * SIZE(AO2) FMADD y13, alpha1, a5, y13 LFD a5, 4 * SIZE(AO2) FMADD y14, alpha1, a6, y14 LFD a6, 5 * SIZE(AO2) FMADD y15, alpha1, a7, y15 LFD a7, 6 * SIZE(AO2) FMADD y16, alpha1, a8, y16 LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2, a1, y01 LFD a1, 8 * SIZE(AO2) FMADD y02, alpha2, a2, y02 LFD a2, 9 * SIZE(AO2) FMADD y03, alpha2, a3, y03 LFD a3, 10 * SIZE(AO2) FMADD y04, alpha2, a4, y04 LFD a4, 11 * SIZE(AO2) FMADD y05, alpha2, a5, y05 LFD a5, 12 * SIZE(AO2) FMADD y06, alpha2, a6, y06 LFD a6, 13 * SIZE(AO2) FMADD y07, alpha2, a7, y07 LFD a7, 14 * SIZE(AO2) FMADD y08, alpha2, a8, y08 LFD a8, 15 * SIZE(AO2) addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE DCBT(AO1, PREA) DCBT(AO2, PREA) FMADD y09, alpha2, a1, y09 LFD a1, 0 * SIZE(AO3) FMADD y10, alpha2, a2, y10 LFD a2, 1 * SIZE(AO3) FMADD y11, alpha2, a3, y11 LFD a3, 2 * SIZE(AO3) FMADD y12, alpha2, a4, y12 LFD a4, 3 * SIZE(AO3) FMADD y13, alpha2, a5, y13 LFD a5, 4 * SIZE(AO3) FMADD y14, alpha2, a6, y14 LFD a6, 5 * SIZE(AO3) FMADD y15, alpha2, a7, y15 LFD a7, 6 * SIZE(AO3) FMADD y16, alpha2, a8, y16 LFD a8, 7 * SIZE(AO3) FMADD y01, alpha3, a1, y01 LFD a1, 8 * SIZE(AO3) FMADD y02, alpha3, a2, y02 LFD a2, 9 * SIZE(AO3) FMADD y03, alpha3, a3, y03 LFD a3, 10 * SIZE(AO3) FMADD y04, alpha3, a4, y04 LFD a4, 11 * SIZE(AO3) FMADD y05, alpha3, a5, y05 LFD a5, 12 * SIZE(AO3) FMADD y06, alpha3, a6, y06 LFD a6, 13 * SIZE(AO3) FMADD y07, alpha3, a7, y07 LFD a7, 14 * SIZE(AO3) FMADD y08, alpha3, a8, y08 LFD a8, 15 * SIZE(AO3) FMADD y09, alpha3, a1, y09 LFD a1, 0 * SIZE(AO4) FMADD y10, alpha3, a2, y10 LFD a2, 1 * SIZE(AO4) FMADD y11, alpha3, a3, y11 LFD a3, 2 * SIZE(AO4) FMADD y12, alpha3, a4, y12 LFD a4, 3 * SIZE(AO4) FMADD y13, alpha3, a5, y13 LFD a5, 4 * SIZE(AO4) FMADD y14, alpha3, a6, y14 LFD a6, 5 * SIZE(AO4) FMADD y15, alpha3, a7, y15 LFD a7, 6 * SIZE(AO4) FMADD y16, alpha3, a8, y16 LFD a8, 7 * SIZE(AO4) FMADD y01, alpha4, a1, y01 LFD a1, 8 * SIZE(AO4) FMADD y02, alpha4, a2, y02 LFD a2, 9 * SIZE(AO4) FMADD y03, alpha4, a3, y03 LFD a3, 10 * SIZE(AO4) FMADD y04, alpha4, a4, y04 LFD a4, 11 * SIZE(AO4) STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) LFD y01, 16 * SIZE(Y1) LFD y02, 17 * SIZE(Y1) LFD y03, 18 * SIZE(Y1) LFD y04, 19 * SIZE(Y1) FMADD y05, alpha4, a5, y05 LFD a5, 12 * SIZE(AO4) FMADD y06, alpha4, a6, y06 LFD a6, 13 * SIZE(AO4) FMADD y07, alpha4, a7, 
y07 LFD a7, 14 * SIZE(AO4) FMADD y08, alpha4, a8, y08 LFD a8, 15 * SIZE(AO4) STFD y05, 4 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) LFD y05, 20 * SIZE(Y1) LFD y06, 21 * SIZE(Y1) LFD y07, 22 * SIZE(Y1) LFD y08, 23 * SIZE(Y1) addi AO3, AO3, 16 * SIZE addi AO4, AO4, 16 * SIZE DCBT(AO3, PREA) DCBT(AO4, PREA) FMADD y09, alpha4, a1, y09 LFD a1, 0 * SIZE(AO1) FMADD y10, alpha4, a2, y10 LFD a2, 1 * SIZE(AO1) FMADD y11, alpha4, a3, y11 LFD a3, 2 * SIZE(AO1) FMADD y12, alpha4, a4, y12 LFD a4, 3 * SIZE(AO1) STFD y09, 8 * SIZE(Y1) STFD y10, 9 * SIZE(Y1) STFD y11, 10 * SIZE(Y1) STFD y12, 11 * SIZE(Y1) LFD y09, 24 * SIZE(Y1) LFD y10, 25 * SIZE(Y1) LFD y11, 26 * SIZE(Y1) LFD y12, 27 * SIZE(Y1) FMADD y13, alpha4, a5, y13 LFD a5, 4 * SIZE(AO1) FMADD y14, alpha4, a6, y14 LFD a6, 5 * SIZE(AO1) FMADD y15, alpha4, a7, y15 LFD a7, 6 * SIZE(AO1) FMADD y16, alpha4, a8, y16 LFD a8, 7 * SIZE(AO1) STFD y13, 12 * SIZE(Y1) STFD y14, 13 * SIZE(Y1) STFD y15, 14 * SIZE(Y1) STFD y16, 15 * SIZE(Y1) LFD y13, 28 * SIZE(Y1) LFD y14, 29 * SIZE(Y1) LFD y15, 30 * SIZE(Y1) LFD y16, 31 * SIZE(Y1) addi Y1, Y1, 16 * SIZE DCBT(Y1, PREC) bdnz LL(22) .align 4 LL(23): FMADD y01, alpha1, a1, y01 LFD a1, 8 * SIZE(AO1) FMADD y02, alpha1, a2, y02 LFD a2, 9 * SIZE(AO1) FMADD y03, alpha1, a3, y03 LFD a3, 10 * SIZE(AO1) FMADD y04, alpha1, a4, y04 LFD a4, 11 * SIZE(AO1) FMADD y05, alpha1, a5, y05 LFD a5, 12 * SIZE(AO1) FMADD y06, alpha1, a6, y06 LFD a6, 13 * SIZE(AO1) FMADD y07, alpha1, a7, y07 LFD a7, 14 * SIZE(AO1) FMADD y08, alpha1, a8, y08 LFD a8, 15 * SIZE(AO1) FMADD y09, alpha1, a1, y09 LFD a1, 0 * SIZE(AO2) FMADD y10, alpha1, a2, y10 LFD a2, 1 * SIZE(AO2) FMADD y11, alpha1, a3, y11 LFD a3, 2 * SIZE(AO2) FMADD y12, alpha1, a4, y12 LFD a4, 3 * SIZE(AO2) FMADD y13, alpha1, a5, y13 LFD a5, 4 * SIZE(AO2) FMADD y14, alpha1, a6, y14 LFD a6, 5 * SIZE(AO2) FMADD y15, alpha1, a7, y15 LFD a7, 6 * SIZE(AO2) FMADD y16, alpha1, a8, y16 LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2, a1, y01 LFD a1, 8 * SIZE(AO2) FMADD y02, alpha2, a2, y02 LFD a2, 9 * SIZE(AO2) FMADD y03, alpha2, a3, y03 LFD a3, 10 * SIZE(AO2) FMADD y04, alpha2, a4, y04 LFD a4, 11 * SIZE(AO2) FMADD y05, alpha2, a5, y05 LFD a5, 12 * SIZE(AO2) FMADD y06, alpha2, a6, y06 LFD a6, 13 * SIZE(AO2) FMADD y07, alpha2, a7, y07 LFD a7, 14 * SIZE(AO2) FMADD y08, alpha2, a8, y08 LFD a8, 15 * SIZE(AO2) FMADD y09, alpha2, a1, y09 LFD a1, 0 * SIZE(AO3) FMADD y10, alpha2, a2, y10 LFD a2, 1 * SIZE(AO3) FMADD y11, alpha2, a3, y11 LFD a3, 2 * SIZE(AO3) FMADD y12, alpha2, a4, y12 LFD a4, 3 * SIZE(AO3) FMADD y13, alpha2, a5, y13 LFD a5, 4 * SIZE(AO3) FMADD y14, alpha2, a6, y14 LFD a6, 5 * SIZE(AO3) FMADD y15, alpha2, a7, y15 LFD a7, 6 * SIZE(AO3) FMADD y16, alpha2, a8, y16 LFD a8, 7 * SIZE(AO3) FMADD y01, alpha3, a1, y01 LFD a1, 8 * SIZE(AO3) FMADD y02, alpha3, a2, y02 LFD a2, 9 * SIZE(AO3) FMADD y03, alpha3, a3, y03 LFD a3, 10 * SIZE(AO3) FMADD y04, alpha3, a4, y04 LFD a4, 11 * SIZE(AO3) FMADD y05, alpha3, a5, y05 LFD a5, 12 * SIZE(AO3) FMADD y06, alpha3, a6, y06 LFD a6, 13 * SIZE(AO3) FMADD y07, alpha3, a7, y07 LFD a7, 14 * SIZE(AO3) FMADD y08, alpha3, a8, y08 LFD a8, 15 * SIZE(AO3) FMADD y09, alpha3, a1, y09 LFD a1, 0 * SIZE(AO4) FMADD y10, alpha3, a2, y10 LFD a2, 1 * SIZE(AO4) FMADD y11, alpha3, a3, y11 LFD a3, 2 * SIZE(AO4) FMADD y12, alpha3, a4, y12 LFD a4, 3 * SIZE(AO4) FMADD y13, alpha3, a5, y13 LFD a5, 4 * SIZE(AO4) FMADD y14, alpha3, a6, y14 LFD a6, 5 * SIZE(AO4) FMADD y15, alpha3, a7, y15 LFD a7, 6 * SIZE(AO4) FMADD y16, alpha3, a8, y16 LFD a8, 7 * SIZE(AO4) FMADD 
y01, alpha4, a1, y01 LFD a1, 8 * SIZE(AO4) FMADD y02, alpha4, a2, y02 LFD a2, 9 * SIZE(AO4) FMADD y03, alpha4, a3, y03 LFD a3, 10 * SIZE(AO4) FMADD y04, alpha4, a4, y04 LFD a4, 11 * SIZE(AO4) FMADD y05, alpha4, a5, y05 LFD a5, 12 * SIZE(AO4) FMADD y06, alpha4, a6, y06 LFD a6, 13 * SIZE(AO4) FMADD y07, alpha4, a7, y07 LFD a7, 14 * SIZE(AO4) FMADD y08, alpha4, a8, y08 LFD a8, 15 * SIZE(AO4) FMADD y09, alpha4, a1, y09 addi AO1, AO1, 16 * SIZE FMADD y10, alpha4, a2, y10 addi AO2, AO2, 16 * SIZE FMADD y11, alpha4, a3, y11 addi AO3, AO3, 16 * SIZE FMADD y12, alpha4, a4, y12 addi AO4, AO4, 16 * SIZE FMADD y13, alpha4, a5, y13 FMADD y14, alpha4, a6, y14 FMADD y15, alpha4, a7, y15 FMADD y16, alpha4, a8, y16 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) STFD y05, 4 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) STFD y09, 8 * SIZE(Y1) STFD y10, 9 * SIZE(Y1) STFD y11, 10 * SIZE(Y1) STFD y12, 11 * SIZE(Y1) STFD y13, 12 * SIZE(Y1) STFD y14, 13 * SIZE(Y1) STFD y15, 14 * SIZE(Y1) STFD y16, 15 * SIZE(Y1) addi Y1, Y1, 16 * SIZE .align 4 LL(25): andi. r0, M, 15 ble LL(30) andi. r0, M, 8 ble LL(26) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) FMADD y01, alpha1, a1, y01 LFD a1, 0 * SIZE(AO2) FMADD y02, alpha1, a2, y02 LFD a2, 1 * SIZE(AO2) FMADD y03, alpha1, a3, y03 LFD a3, 2 * SIZE(AO2) FMADD y04, alpha1, a4, y04 LFD a4, 3 * SIZE(AO2) FMADD y05, alpha1, a5, y05 LFD a5, 4 * SIZE(AO2) FMADD y06, alpha1, a6, y06 LFD a6, 5 * SIZE(AO2) FMADD y07, alpha1, a7, y07 LFD a7, 6 * SIZE(AO2) FMADD y08, alpha1, a8, y08 LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2, a1, y01 LFD a1, 0 * SIZE(AO3) FMADD y02, alpha2, a2, y02 LFD a2, 1 * SIZE(AO3) FMADD y03, alpha2, a3, y03 LFD a3, 2 * SIZE(AO3) FMADD y04, alpha2, a4, y04 LFD a4, 3 * SIZE(AO3) FMADD y05, alpha2, a5, y05 LFD a5, 4 * SIZE(AO3) FMADD y06, alpha2, a6, y06 LFD a6, 5 * SIZE(AO3) FMADD y07, alpha2, a7, y07 LFD a7, 6 * SIZE(AO3) FMADD y08, alpha2, a8, y08 LFD a8, 7 * SIZE(AO3) FMADD y01, alpha3, a1, y01 LFD a1, 0 * SIZE(AO4) FMADD y02, alpha3, a2, y02 LFD a2, 1 * SIZE(AO4) FMADD y03, alpha3, a3, y03 LFD a3, 2 * SIZE(AO4) FMADD y04, alpha3, a4, y04 LFD a4, 3 * SIZE(AO4) FMADD y05, alpha3, a5, y05 LFD a5, 4 * SIZE(AO4) FMADD y06, alpha3, a6, y06 LFD a6, 5 * SIZE(AO4) FMADD y07, alpha3, a7, y07 LFD a7, 6 * SIZE(AO4) FMADD y08, alpha3, a8, y08 LFD a8, 7 * SIZE(AO4) FMADD y01, alpha4, a1, y01 addi AO1, AO1, 8 * SIZE FMADD y02, alpha4, a2, y02 addi AO2, AO2, 8 * SIZE FMADD y03, alpha4, a3, y03 addi AO3, AO3, 8 * SIZE FMADD y04, alpha4, a4, y04 addi AO4, AO4, 8 * SIZE STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) FMADD y05, alpha4, a5, y05 FMADD y06, alpha4, a6, y06 FMADD y07, alpha4, a7, y07 FMADD y08, alpha4, a8, y08 STFD y05, 4 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) addi Y1, Y1, 8 * SIZE .align 4 LL(26): andi. 
r0, M, 4 ble LL(27) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 0 * SIZE(AO2) LFD a6, 1 * SIZE(AO2) LFD a7, 2 * SIZE(AO2) LFD a8, 3 * SIZE(AO2) FMADD y01, alpha1, a1, y01 LFD a1, 0 * SIZE(AO3) FMADD y02, alpha1, a2, y02 LFD a2, 1 * SIZE(AO3) FMADD y03, alpha1, a3, y03 LFD a3, 2 * SIZE(AO3) FMADD y04, alpha1, a4, y04 LFD a4, 3 * SIZE(AO3) FMADD y01, alpha2, a5, y01 LFD a5, 0 * SIZE(AO4) FMADD y02, alpha2, a6, y02 LFD a6, 1 * SIZE(AO4) FMADD y03, alpha2, a7, y03 LFD a7, 2 * SIZE(AO4) FMADD y04, alpha2, a8, y04 LFD a8, 3 * SIZE(AO4) FMADD y01, alpha3, a1, y01 addi AO1, AO1, 4 * SIZE FMADD y02, alpha3, a2, y02 addi AO2, AO2, 4 * SIZE FMADD y03, alpha3, a3, y03 addi AO3, AO3, 4 * SIZE FMADD y04, alpha3, a4, y04 addi AO4, AO4, 4 * SIZE FMADD y01, alpha4, a5, y01 FMADD y02, alpha4, a6, y02 FMADD y03, alpha4, a7, y03 FMADD y04, alpha4, a8, y04 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) addi Y1, Y1, 4 * SIZE .align 4 LL(27): andi. r0, M, 2 ble LL(28) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 0 * SIZE(AO2) LFD a4, 1 * SIZE(AO2) LFD a5, 0 * SIZE(AO3) LFD a6, 1 * SIZE(AO3) LFD a7, 0 * SIZE(AO4) LFD a8, 1 * SIZE(AO4) FMADD y01, alpha1, a1, y01 addi AO1, AO1, 2 * SIZE FMADD y02, alpha1, a2, y02 addi AO2, AO2, 2 * SIZE FMADD y01, alpha2, a3, y01 addi AO3, AO3, 2 * SIZE FMADD y02, alpha2, a4, y02 addi AO4, AO4, 2 * SIZE FMADD y01, alpha3, a5, y01 FMADD y02, alpha3, a6, y02 FMADD y01, alpha4, a7, y01 FMADD y02, alpha4, a8, y02 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) addi Y1, Y1, 2 * SIZE .align 4 LL(28): andi. r0, M, 1 ble LL(30) LFD y01, 0 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 0 * SIZE(AO2) LFD a3, 0 * SIZE(AO3) LFD a4, 0 * SIZE(AO4) FMADD y01, alpha1, a1, y01 FMADD y01, alpha2, a2, y01 FMADD y01, alpha3, a3, y01 FMADD y01, alpha4, a4, y01 STFD y01, 0 * SIZE(Y1) .align 4 LL(30): andi. J, N, 2 lfd alpha, ALPHA ble LL(40) .align 4 LFD alpha1, 0 * SIZE(X) add X, X, INCX LFD alpha2, 0 * SIZE(X) add X, X, INCX FMUL alpha1, alpha, alpha1 FMUL alpha2, alpha, alpha2 mr AO1, A add AO2, A, LDA add A, AO2, LDA mr Y1, YY srawi. 
r0, M, 4 mtspr CTR, r0 ble LL(35) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD y09, 8 * SIZE(Y1) LFD y10, 9 * SIZE(Y1) LFD y11, 10 * SIZE(Y1) LFD y12, 11 * SIZE(Y1) LFD y13, 12 * SIZE(Y1) LFD y14, 13 * SIZE(Y1) LFD y15, 14 * SIZE(Y1) LFD y16, 15 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) bdz LL(33) .align 4 LL(32): FMADD y01, alpha1, a1, y01 LFD a1, 8 * SIZE(AO1) FMADD y02, alpha1, a2, y02 LFD a2, 9 * SIZE(AO1) FMADD y03, alpha1, a3, y03 LFD a3, 10 * SIZE(AO1) FMADD y04, alpha1, a4, y04 LFD a4, 11 * SIZE(AO1) FMADD y05, alpha1, a5, y05 LFD a5, 12 * SIZE(AO1) FMADD y06, alpha1, a6, y06 LFD a6, 13 * SIZE(AO1) FMADD y07, alpha1, a7, y07 LFD a7, 14 * SIZE(AO1) FMADD y08, alpha1, a8, y08 LFD a8, 15 * SIZE(AO1) FMADD y09, alpha1, a1, y09 LFD a1, 0 * SIZE(AO2) FMADD y10, alpha1, a2, y10 LFD a2, 1 * SIZE(AO2) FMADD y11, alpha1, a3, y11 LFD a3, 2 * SIZE(AO2) FMADD y12, alpha1, a4, y12 LFD a4, 3 * SIZE(AO2) FMADD y13, alpha1, a5, y13 LFD a5, 4 * SIZE(AO2) FMADD y14, alpha1, a6, y14 LFD a6, 5 * SIZE(AO2) FMADD y15, alpha1, a7, y15 LFD a7, 6 * SIZE(AO2) FMADD y16, alpha1, a8, y16 LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2, a1, y01 LFD a1, 8 * SIZE(AO2) FMADD y02, alpha2, a2, y02 LFD a2, 9 * SIZE(AO2) FMADD y03, alpha2, a3, y03 LFD a3, 10 * SIZE(AO2) FMADD y04, alpha2, a4, y04 LFD a4, 11 * SIZE(AO2) FMADD y05, alpha2, a5, y05 LFD a5, 12 * SIZE(AO2) FMADD y06, alpha2, a6, y06 LFD a6, 13 * SIZE(AO2) FMADD y07, alpha2, a7, y07 LFD a7, 14 * SIZE(AO2) FMADD y08, alpha2, a8, y08 LFD a8, 15 * SIZE(AO2) FMADD y09, alpha2, a1, y09 LFD a1, 16 * SIZE(AO1) FMADD y10, alpha2, a2, y10 LFD a2, 17 * SIZE(AO1) FMADD y11, alpha2, a3, y11 LFD a3, 18 * SIZE(AO1) FMADD y12, alpha2, a4, y12 LFD a4, 19 * SIZE(AO1) FMADD y13, alpha2, a5, y13 LFD a5, 20 * SIZE(AO1) FMADD y14, alpha2, a6, y14 LFD a6, 21 * SIZE(AO1) FMADD y15, alpha2, a7, y15 LFD a7, 22 * SIZE(AO1) FMADD y16, alpha2, a8, y16 LFD a8, 23 * SIZE(AO1) STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) LFD y01, 16 * SIZE(Y1) LFD y02, 17 * SIZE(Y1) LFD y03, 18 * SIZE(Y1) LFD y04, 19 * SIZE(Y1) STFD y05, 4 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) LFD y05, 20 * SIZE(Y1) LFD y06, 21 * SIZE(Y1) LFD y07, 22 * SIZE(Y1) LFD y08, 23 * SIZE(Y1) STFD y09, 8 * SIZE(Y1) STFD y10, 9 * SIZE(Y1) STFD y11, 10 * SIZE(Y1) STFD y12, 11 * SIZE(Y1) LFD y09, 24 * SIZE(Y1) LFD y10, 25 * SIZE(Y1) LFD y11, 26 * SIZE(Y1) LFD y12, 27 * SIZE(Y1) STFD y13, 12 * SIZE(Y1) STFD y14, 13 * SIZE(Y1) STFD y15, 14 * SIZE(Y1) STFD y16, 15 * SIZE(Y1) LFD y13, 28 * SIZE(Y1) LFD y14, 29 * SIZE(Y1) LFD y15, 30 * SIZE(Y1) LFD y16, 31 * SIZE(Y1) addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi Y1, Y1, 16 * SIZE DCBT(AO1, PREA) DCBT(AO2, PREA) DCBT(Y1, PREC) bdnz LL(32) .align 4 LL(33): FMADD y01, alpha1, a1, y01 LFD a1, 8 * SIZE(AO1) FMADD y02, alpha1, a2, y02 LFD a2, 9 * SIZE(AO1) FMADD y03, alpha1, a3, y03 LFD a3, 10 * SIZE(AO1) FMADD y04, alpha1, a4, y04 LFD a4, 11 * SIZE(AO1) FMADD y05, alpha1, a5, y05 LFD a5, 12 * SIZE(AO1) FMADD y06, alpha1, a6, y06 LFD a6, 13 * SIZE(AO1) FMADD y07, alpha1, a7, y07 LFD a7, 14 * SIZE(AO1) FMADD y08, alpha1, a8, y08 LFD a8, 15 * SIZE(AO1) FMADD y09, alpha1, a1, y09 LFD a1, 0 * SIZE(AO2) FMADD y10, alpha1, a2, y10 LFD 
a2, 1 * SIZE(AO2) FMADD y11, alpha1, a3, y11 LFD a3, 2 * SIZE(AO2) FMADD y12, alpha1, a4, y12 LFD a4, 3 * SIZE(AO2) FMADD y13, alpha1, a5, y13 LFD a5, 4 * SIZE(AO2) FMADD y14, alpha1, a6, y14 LFD a6, 5 * SIZE(AO2) FMADD y15, alpha1, a7, y15 LFD a7, 6 * SIZE(AO2) FMADD y16, alpha1, a8, y16 LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2, a1, y01 LFD a1, 8 * SIZE(AO2) FMADD y02, alpha2, a2, y02 LFD a2, 9 * SIZE(AO2) FMADD y03, alpha2, a3, y03 LFD a3, 10 * SIZE(AO2) FMADD y04, alpha2, a4, y04 LFD a4, 11 * SIZE(AO2) FMADD y05, alpha2, a5, y05 LFD a5, 12 * SIZE(AO2) FMADD y06, alpha2, a6, y06 LFD a6, 13 * SIZE(AO2) FMADD y07, alpha2, a7, y07 LFD a7, 14 * SIZE(AO2) FMADD y08, alpha2, a8, y08 LFD a8, 15 * SIZE(AO2) FMADD y09, alpha2, a1, y09 FMADD y10, alpha2, a2, y10 FMADD y11, alpha2, a3, y11 FMADD y12, alpha2, a4, y12 FMADD y13, alpha2, a5, y13 FMADD y14, alpha2, a6, y14 FMADD y15, alpha2, a7, y15 FMADD y16, alpha2, a8, y16 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) STFD y05, 4 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) STFD y09, 8 * SIZE(Y1) STFD y10, 9 * SIZE(Y1) STFD y11, 10 * SIZE(Y1) STFD y12, 11 * SIZE(Y1) STFD y13, 12 * SIZE(Y1) STFD y14, 13 * SIZE(Y1) STFD y15, 14 * SIZE(Y1) STFD y16, 15 * SIZE(Y1) addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi Y1, Y1, 16 * SIZE .align 4 LL(35): andi. r0, M, 15 ble LL(40) andi. r0, M, 8 ble LL(36) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) FMADD y01, alpha1, a1, y01 LFD a1, 0 * SIZE(AO2) FMADD y02, alpha1, a2, y02 LFD a2, 1 * SIZE(AO2) FMADD y03, alpha1, a3, y03 LFD a3, 2 * SIZE(AO2) FMADD y04, alpha1, a4, y04 LFD a4, 3 * SIZE(AO2) FMADD y05, alpha1, a5, y05 LFD a5, 4 * SIZE(AO2) FMADD y06, alpha1, a6, y06 LFD a6, 5 * SIZE(AO2) FMADD y07, alpha1, a7, y07 LFD a7, 6 * SIZE(AO2) FMADD y08, alpha1, a8, y08 LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2, a1, y01 FMADD y02, alpha2, a2, y02 FMADD y03, alpha2, a3, y03 FMADD y04, alpha2, a4, y04 FMADD y05, alpha2, a5, y05 FMADD y06, alpha2, a6, y06 FMADD y07, alpha2, a7, y07 FMADD y08, alpha2, a8, y08 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) STFD y05, 4 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) addi AO1, AO1, 8 * SIZE addi AO2, AO2, 8 * SIZE addi Y1, Y1, 8 * SIZE .align 4 LL(36): andi. r0, M, 4 ble LL(37) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 0 * SIZE(AO2) LFD a6, 1 * SIZE(AO2) LFD a7, 2 * SIZE(AO2) LFD a8, 3 * SIZE(AO2) FMADD y01, alpha1, a1, y01 FMADD y02, alpha1, a2, y02 FMADD y03, alpha1, a3, y03 FMADD y04, alpha1, a4, y04 FMADD y01, alpha2, a5, y01 FMADD y02, alpha2, a6, y02 FMADD y03, alpha2, a7, y03 FMADD y04, alpha2, a8, y04 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) addi AO1, AO1, 4 * SIZE addi AO2, AO2, 4 * SIZE addi Y1, Y1, 4 * SIZE .align 4 LL(37): andi. 
r0, M, 2 ble LL(38) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 0 * SIZE(AO2) LFD a4, 1 * SIZE(AO2) FMADD y01, alpha1, a1, y01 FMADD y02, alpha1, a2, y02 FMADD y01, alpha2, a3, y01 FMADD y02, alpha2, a4, y02 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) addi AO1, AO1, 2 * SIZE addi AO2, AO2, 2 * SIZE addi Y1, Y1, 2 * SIZE .align 4 LL(38): andi. r0, M, 1 ble LL(40) LFD y01, 0 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 0 * SIZE(AO2) FMADD y01, alpha1, a1, y01 FMADD y01, alpha2, a2, y01 STFD y01, 0 * SIZE(Y1) .align 4 LL(40): andi. J, N, 1 lfd alpha, ALPHA ble LL(990) .align 4 LFD alpha1, 0 * SIZE(X) FMUL alpha1, alpha, alpha1 mr AO1, A mr Y1, YY srawi. r0, M, 4 mtspr CTR, r0 ble LL(45) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) LFD y09, 8 * SIZE(Y1) LFD y10, 9 * SIZE(Y1) LFD y11, 10 * SIZE(Y1) LFD y12, 11 * SIZE(Y1) LFD y13, 12 * SIZE(Y1) LFD y14, 13 * SIZE(Y1) LFD y15, 14 * SIZE(Y1) LFD y16, 15 * SIZE(Y1) bdz LL(43) .align 4 LL(42): FMADD y01, alpha1, a1, y01 LFD a1, 8 * SIZE(AO1) FMADD y02, alpha1, a2, y02 LFD a2, 9 * SIZE(AO1) FMADD y03, alpha1, a3, y03 LFD a3, 10 * SIZE(AO1) FMADD y04, alpha1, a4, y04 LFD a4, 11 * SIZE(AO1) FMADD y05, alpha1, a5, y05 LFD a5, 12 * SIZE(AO1) FMADD y06, alpha1, a6, y06 LFD a6, 13 * SIZE(AO1) FMADD y07, alpha1, a7, y07 LFD a7, 14 * SIZE(AO1) FMADD y08, alpha1, a8, y08 LFD a8, 15 * SIZE(AO1) FMADD y09, alpha1, a1, y09 LFD a1, 16 * SIZE(AO1) FMADD y10, alpha1, a2, y10 LFD a2, 17 * SIZE(AO1) FMADD y11, alpha1, a3, y11 LFD a3, 18 * SIZE(AO1) FMADD y12, alpha1, a4, y12 LFD a4, 19 * SIZE(AO1) FMADD y13, alpha1, a5, y13 LFD a5, 20 * SIZE(AO1) FMADD y14, alpha1, a6, y14 LFD a6, 21 * SIZE(AO1) FMADD y15, alpha1, a7, y15 LFD a7, 22 * SIZE(AO1) FMADD y16, alpha1, a8, y16 LFD a8, 23 * SIZE(AO1) STFD y01, 0 * SIZE(Y1) LFD y01, 16 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) LFD y02, 17 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) LFD y03, 18 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) LFD y04, 19 * SIZE(Y1) STFD y05, 4 * SIZE(Y1) LFD y05, 20 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) LFD y06, 21 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) LFD y07, 22 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) LFD y08, 23 * SIZE(Y1) STFD y09, 8 * SIZE(Y1) LFD y09, 24 * SIZE(Y1) STFD y10, 9 * SIZE(Y1) LFD y10, 25 * SIZE(Y1) STFD y11, 10 * SIZE(Y1) LFD y11, 26 * SIZE(Y1) STFD y12, 11 * SIZE(Y1) LFD y12, 27 * SIZE(Y1) STFD y13, 12 * SIZE(Y1) LFD y13, 28 * SIZE(Y1) STFD y14, 13 * SIZE(Y1) LFD y14, 29 * SIZE(Y1) STFD y15, 14 * SIZE(Y1) LFD y15, 30 * SIZE(Y1) STFD y16, 15 * SIZE(Y1) LFD y16, 31 * SIZE(Y1) addi AO1, AO1, 16 * SIZE addi Y1, Y1, 16 * SIZE DCBT(AO1, PREA) DCBT(Y1, PREC) bdnz LL(42) .align 4 LL(43): FMADD y01, alpha1, a1, y01 LFD a1, 8 * SIZE(AO1) FMADD y02, alpha1, a2, y02 LFD a2, 9 * SIZE(AO1) FMADD y03, alpha1, a3, y03 LFD a3, 10 * SIZE(AO1) FMADD y04, alpha1, a4, y04 LFD a4, 11 * SIZE(AO1) FMADD y05, alpha1, a5, y05 LFD a5, 12 * SIZE(AO1) FMADD y06, alpha1, a6, y06 LFD a6, 13 * SIZE(AO1) FMADD y07, alpha1, a7, y07 LFD a7, 14 * SIZE(AO1) FMADD y08, alpha1, a8, y08 LFD a8, 15 * SIZE(AO1) FMADD y09, alpha1, a1, y09 FMADD y10, alpha1, a2, y10 FMADD y11, alpha1, a3, y11 FMADD y12, alpha1, a4, y12 FMADD y13, alpha1, a5, y13 FMADD y14, alpha1, a6, y14 FMADD y15, alpha1, a7, y15 FMADD y16, 
alpha1, a8, y16 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) STFD y05, 4 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) STFD y09, 8 * SIZE(Y1) STFD y10, 9 * SIZE(Y1) STFD y11, 10 * SIZE(Y1) STFD y12, 11 * SIZE(Y1) STFD y13, 12 * SIZE(Y1) STFD y14, 13 * SIZE(Y1) STFD y15, 14 * SIZE(Y1) STFD y16, 15 * SIZE(Y1) addi AO1, AO1, 16 * SIZE addi Y1, Y1, 16 * SIZE .align 4 LL(45): andi. r0, M, 15 ble LL(990) andi. r0, M, 8 ble LL(46) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) FMADD y01, alpha1, a1, y01 FMADD y02, alpha1, a2, y02 FMADD y03, alpha1, a3, y03 FMADD y04, alpha1, a4, y04 FMADD y05, alpha1, a5, y05 FMADD y06, alpha1, a6, y06 FMADD y07, alpha1, a7, y07 FMADD y08, alpha1, a8, y08 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) STFD y05, 4 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) addi AO1, AO1, 8 * SIZE addi Y1, Y1, 8 * SIZE .align 4 LL(46): andi. r0, M, 4 ble LL(47) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) FMADD y01, alpha1, a1, y01 FMADD y02, alpha1, a2, y02 FMADD y03, alpha1, a3, y03 FMADD y04, alpha1, a4, y04 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) addi AO1, AO1, 4 * SIZE addi Y1, Y1, 4 * SIZE .align 4 LL(47): andi. r0, M, 2 ble LL(48) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) FMADD y01, alpha1, a1, y01 FMADD y02, alpha1, a2, y02 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) addi AO1, AO1, 2 * SIZE addi Y1, Y1, 2 * SIZE .align 4 LL(48): andi. r0, M, 1 ble LL(990) LFD y01, 0 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) FMADD y01, alpha1, a1, y01 STFD y01, 0 * SIZE(Y1) .align 4 LL(990): cmpi cr0, 0, INCY, SIZE beq LL(999) mr YY, BUFFER mr Y1, Y srawi. r0, M, 3 mtspr CTR, r0 ble LL(995) .align 4 LL(991): LFD f0, 0 * SIZE(Y) add Y, Y, INCY LFD f1, 0 * SIZE(Y) add Y, Y, INCY LFD f2, 0 * SIZE(Y) add Y, Y, INCY LFD f3, 0 * SIZE(Y) add Y, Y, INCY LFD f4, 0 * SIZE(Y) add Y, Y, INCY LFD f5, 0 * SIZE(Y) add Y, Y, INCY LFD f6, 0 * SIZE(Y) add Y, Y, INCY LFD f7, 0 * SIZE(Y) add Y, Y, INCY LFD f8, 0 * SIZE(YY) LFD f9, 1 * SIZE(YY) LFD f10, 2 * SIZE(YY) LFD f11, 3 * SIZE(YY) LFD f12, 4 * SIZE(YY) LFD f13, 5 * SIZE(YY) LFD f14, 6 * SIZE(YY) LFD f15, 7 * SIZE(YY) addi YY, YY, 8 * SIZE FADD f8, f8, f0 FADD f9, f9, f1 FADD f10, f10, f2 FADD f11, f11, f3 FADD f12, f12, f4 FADD f13, f13, f5 FADD f14, f14, f6 FADD f15, f15, f7 STFD f8, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f9, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f10, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f11, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f12, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f13, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f14, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f15, 0 * SIZE(Y1) add Y1, Y1, INCY bdnz LL(991) .align 4 LL(995): andi. 
J, M, 4 ble LL(996) LFD f0, 0 * SIZE(Y) add Y, Y, INCY LFD f1, 0 * SIZE(Y) add Y, Y, INCY LFD f2, 0 * SIZE(Y) add Y, Y, INCY LFD f3, 0 * SIZE(Y) add Y, Y, INCY LFD f8, 0 * SIZE(YY) LFD f9, 1 * SIZE(YY) LFD f10, 2 * SIZE(YY) LFD f11, 3 * SIZE(YY) addi YY, YY, 4 * SIZE FADD f8, f8, f0 FADD f9, f9, f1 FADD f10, f10, f2 FADD f11, f11, f3 STFD f8, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f9, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f10, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f11, 0 * SIZE(Y1) add Y1, Y1, INCY .align 4 LL(996): andi. J, M, 2 ble LL(997) LFD f0, 0 * SIZE(Y) add Y, Y, INCY LFD f1, 0 * SIZE(Y) add Y, Y, INCY LFD f8, 0 * SIZE(YY) LFD f9, 1 * SIZE(YY) addi YY, YY, 2 * SIZE FADD f8, f8, f0 FADD f9, f9, f1 STFD f8, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f9, 0 * SIZE(Y1) add Y1, Y1, INCY .align 4 LL(997): andi. J, M, 1 ble LL(999) LFD f0, 0 * SIZE(Y) LFD f8, 0 * SIZE(YY) FADD f8, f8, f0 STFD f8, 0 * SIZE(Y1) .align 4 LL(999): li r3, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r14, 144(SP) ld r15, 152(SP) ld r16, 160(SP) ld r17, 168(SP) ld r18, 176(SP) ld r19, 184(SP) ld r20, 192(SP) ld r21, 200(SP) ld r22, 208(SP) ld r23, 216(SP) ld r24, 224(SP) ld r25, 232(SP) ld r26, 240(SP) ld r27, 248(SP) #else lwz r14, 144(SP) lwz r15, 148(SP) lwz r16, 152(SP) lwz r17, 156(SP) lwz r18, 160(SP) lwz r19, 164(SP) lwz r20, 168(SP) lwz r21, 172(SP) lwz r22, 176(SP) lwz r23, 180(SP) lwz r24, 184(SP) lwz r25, 188(SP) lwz r26, 192(SP) lwz r27, 196(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/gemv_n_ppc440.S000066400000000000000000000561651313527062700202660ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 #define A r6 #define LDA r7 #define X r8 #define INCX r9 #define Y r10 #define INCY r5 #else #define M r3 #define N r4 #define A r7 #define LDA r8 #define X r9 #define INCX r10 #define Y r5 #define INCY r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define M r3 #define N r4 #define A r8 #define LDA r9 #define X r10 #define INCX r5 #define Y r6 #define INCY r7 #else #define M r3 #define N r4 #define A r7 #define LDA r8 #define X r9 #define INCX r10 #define Y r5 #define INCY r6 #endif #endif #define I r11 #define J r12 #define AO1 r14 #define AO2 r15 #define AO3 r16 #define AO4 r17 #define LDA8 r18 #define Y1 r19 #define Y2 r20 #define PREA r21 #define YY r22 #define BUFFER r23 #define y01 f0 #define y02 f1 #define y03 f2 #define y04 f3 #define y05 f4 #define y06 f5 #define y07 f6 #define y08 f7 #define y09 f8 #define y10 f9 #define y11 f10 #define y12 f11 #define y13 f12 #define y14 f13 #define y15 f14 #define y16 f15 #define alpha1 f16 #define alpha2 f17 #define alpha3 f18 #define alpha4 f19 #define a1 f20 #define a2 f21 #define a3 f22 #define a4 f23 #define a5 f24 #define a6 f25 #define a7 f26 #define a8 f27 #define alpha f27 #if defined(PPCG4) #define PREFETCHSIZE_A (3 * 4) #endif #if defined(POWER6) #define PREFETCHSIZE_A (3 * 4) #endif #ifndef NEEDPARAM #ifndef __64BIT__ #define STACKSIZE 224 #define ALPHA 200(SP) #define FZERO 208(SP) #else #define STACKSIZE 280 #define ALPHA 256(SP) #define FZERO 264(SP) #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) #ifdef __64BIT__ std r0, FZERO std r14, 144(SP) std r15, 152(SP) std r16, 160(SP) std r17, 168(SP) std r18, 176(SP) std r19, 184(SP) std r20, 192(SP) std r21, 200(SP) std r22, 208(SP) std r23, 216(SP) #else stw r0, 0 + FZERO stw r0, 4 + FZERO stw r14, 144(SP) stw r15, 148(SP) stw r16, 152(SP) stw r17, 156(SP) stw r18, 160(SP) stw r19, 164(SP) stw r20, 168(SP) stw r21, 172(SP) stw r22, 176(SP) stw r23, 180(SP) #endif #ifdef linux #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #else ld Y, FRAMESLOT(0) + STACKSIZE(SP) ld INCY, FRAMESLOT(1) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) lwz Y, FRAMESLOT(1) + STACKSIZE(SP) lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #else lwz Y, FRAMESLOT(0) + STACKSIZE(SP) lwz INCY, FRAMESLOT(1) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #else ld Y, FRAMESLOT(0) + STACKSIZE(SP) ld INCY, FRAMESLOT(1) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif stfd f1, ALPHA fmr alpha, f1 slwi LDA, LDA, BASE_SHIFT slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT li PREA, PREFETCHSIZE_A * SIZE cmpwi cr0, M, 0 ble- LL(999) cmpwi cr0, N, 0 ble- 
LL(999) addi A, A, -SIZE sub X, X, INCX sub Y, Y, INCY mr YY, Y lfd f0, FZERO cmpi cr0, 0, INCY, SIZE beq LL(10) addi YY, BUFFER, -SIZE addi Y1, BUFFER, -SIZE addi r0, M, 7 srawi. r0, r0, 3 mtspr CTR, r0 .align 4 LL(02): STFDU f0, 1 * SIZE(Y1) STFDU f0, 1 * SIZE(Y1) STFDU f0, 1 * SIZE(Y1) STFDU f0, 1 * SIZE(Y1) STFDU f0, 1 * SIZE(Y1) STFDU f0, 1 * SIZE(Y1) STFDU f0, 1 * SIZE(Y1) STFDU f0, 1 * SIZE(Y1) bdnz LL(02) .align 4 LL(10): srawi. J, N, 2 ble LL(30) .align 4 LL(21): mr AO1, A add AO2, A, LDA LFDUX alpha1, X, INCX LFDUX alpha2, X, INCX LFDUX alpha3, X, INCX LFDUX alpha4, X, INCX FMUL alpha1, alpha, alpha1 add AO3, AO2, LDA FMUL alpha2, alpha, alpha2 add AO4, AO3, LDA FMUL alpha3, alpha, alpha3 add A, AO4, LDA FMUL alpha4, alpha, alpha4 mr Y1, YY mr Y2, YY srawi. r0, M, 3 mtspr CTR, r0 ble LL(25) LFDU y01, 1 * SIZE(Y1) LFDU a1, 1 * SIZE(AO1) LFDU y02, 1 * SIZE(Y1) LFDU a2, 1 * SIZE(AO1) LFDU y03, 1 * SIZE(Y1) LFDU a3, 1 * SIZE(AO1) LFDU y04, 1 * SIZE(Y1) LFDU a4, 1 * SIZE(AO1) LFDU y05, 1 * SIZE(Y1) LFDU a5, 1 * SIZE(AO1) LFDU y06, 1 * SIZE(Y1) LFDU a6, 1 * SIZE(AO1) LFDU y07, 1 * SIZE(Y1) LFDU a7, 1 * SIZE(AO1) LFDU y08, 1 * SIZE(Y1) LFDU a8, 1 * SIZE(AO1) bdz LL(23) .align 4 LL(22): #ifdef PPCG4 dcbtst Y1, PREA #endif FMADD y09, alpha1, a1, y01 LFDU a1, 1 * SIZE(AO2) FMADD y10, alpha1, a2, y02 LFDU a2, 1 * SIZE(AO2) FMADD y11, alpha1, a3, y03 LFDU a3, 1 * SIZE(AO2) FMADD y12, alpha1, a4, y04 LFDU a4, 1 * SIZE(AO2) LFDU y01, 1 * SIZE(Y1) #ifdef PPCG4 dcbt AO2, PREA #endif FMADD y13, alpha1, a5, y05 LFDU a5, 1 * SIZE(AO2) FMADD y14, alpha1, a6, y06 LFDU a6, 1 * SIZE(AO2) FMADD y15, alpha1, a7, y07 LFDU a7, 1 * SIZE(AO2) FMADD y16, alpha1, a8, y08 LFDU a8, 1 * SIZE(AO2) LFDU y02, 1 * SIZE(Y1) #if defined(PPCG4) && defined(DOUBLE) dcbt AO2, PREA #endif FMADD y09, alpha2, a1, y09 LFDU a1, 1 * SIZE(AO3) FMADD y10, alpha2, a2, y10 LFDU a2, 1 * SIZE(AO3) FMADD y11, alpha2, a3, y11 LFDU a3, 1 * SIZE(AO3) FMADD y12, alpha2, a4, y12 LFDU a4, 1 * SIZE(AO3) LFDU y03, 1 * SIZE(Y1) #ifdef PPCG4 dcbt AO3, PREA #endif FMADD y13, alpha2, a5, y13 LFDU a5, 1 * SIZE(AO3) FMADD y14, alpha2, a6, y14 LFDU a6, 1 * SIZE(AO3) FMADD y15, alpha2, a7, y15 LFDU a7, 1 * SIZE(AO3) FMADD y16, alpha2, a8, y16 LFDU a8, 1 * SIZE(AO3) LFDU y04, 1 * SIZE(Y1) #if defined(PPCG4) && defined(DOUBLE) dcbt AO3, PREA #endif FMADD y09, alpha3, a1, y09 LFDU a1, 1 * SIZE(AO4) FMADD y10, alpha3, a2, y10 LFDU a2, 1 * SIZE(AO4) FMADD y11, alpha3, a3, y11 LFDU a3, 1 * SIZE(AO4) FMADD y12, alpha3, a4, y12 LFDU a4, 1 * SIZE(AO4) #if defined(PPCG4) && defined(DOUBLE) dcbtst Y1, PREA #endif LFDU y05, 1 * SIZE(Y1) #ifdef PPCG4 dcbt AO4, PREA #endif FMADD y13, alpha3, a5, y13 LFDU a5, 1 * SIZE(AO4) FMADD y14, alpha3, a6, y14 LFDU a6, 1 * SIZE(AO4) FMADD y15, alpha3, a7, y15 LFDU a7, 1 * SIZE(AO4) FMADD y16, alpha3, a8, y16 LFDU a8, 1 * SIZE(AO4) LFDU y06, 1 * SIZE(Y1) #if defined(PPCG4) && defined(DOUBLE) dcbt AO4, PREA #endif FMADD y09, alpha4, a1, y09 LFDU a1, 1 * SIZE(AO1) FMADD y10, alpha4, a2, y10 LFDU a2, 1 * SIZE(AO1) FMADD y11, alpha4, a3, y11 LFDU a3, 1 * SIZE(AO1) FMADD y12, alpha4, a4, y12 LFDU a4, 1 * SIZE(AO1) LFDU y07, 1 * SIZE(Y1) #ifdef PPCG4 dcbt AO1, PREA #endif STFDU y09, 1 * SIZE(Y2) STFDU y10, 1 * SIZE(Y2) STFDU y11, 1 * SIZE(Y2) STFDU y12, 1 * SIZE(Y2) FMADD y13, alpha4, a5, y13 LFDU a5, 1 * SIZE(AO1) FMADD y14, alpha4, a6, y14 LFDU a6, 1 * SIZE(AO1) FMADD y15, alpha4, a7, y15 LFDU a7, 1 * SIZE(AO1) FMADD y16, alpha4, a8, y16 LFDU a8, 1 * SIZE(AO1) LFDU y08, 1 * SIZE(Y1) #if defined(PPCG4) && defined(DOUBLE) dcbt AO1, 
PREA #endif STFDU y13, 1 * SIZE(Y2) STFDU y14, 1 * SIZE(Y2) STFDU y15, 1 * SIZE(Y2) STFDU y16, 1 * SIZE(Y2) bdnz LL(22) .align 4 LL(23): FMADD y01, alpha1, a1, y01 LFDU a1, 1 * SIZE(AO2) FMADD y02, alpha1, a2, y02 LFDU a2, 1 * SIZE(AO2) FMADD y03, alpha1, a3, y03 LFDU a3, 1 * SIZE(AO2) FMADD y04, alpha1, a4, y04 LFDU a4, 1 * SIZE(AO2) FMADD y05, alpha1, a5, y05 LFDU a5, 1 * SIZE(AO2) FMADD y06, alpha1, a6, y06 LFDU a6, 1 * SIZE(AO2) FMADD y07, alpha1, a7, y07 LFDU a7, 1 * SIZE(AO2) FMADD y08, alpha1, a8, y08 LFDU a8, 1 * SIZE(AO2) FMADD y01, alpha2, a1, y01 LFDU a1, 1 * SIZE(AO3) FMADD y02, alpha2, a2, y02 LFDU a2, 1 * SIZE(AO3) FMADD y03, alpha2, a3, y03 LFDU a3, 1 * SIZE(AO3) FMADD y04, alpha2, a4, y04 LFDU a4, 1 * SIZE(AO3) FMADD y05, alpha2, a5, y05 LFDU a5, 1 * SIZE(AO3) FMADD y06, alpha2, a6, y06 LFDU a6, 1 * SIZE(AO3) FMADD y07, alpha2, a7, y07 LFDU a7, 1 * SIZE(AO3) FMADD y08, alpha2, a8, y08 LFDU a8, 1 * SIZE(AO3) FMADD y01, alpha3, a1, y01 LFDU a1, 1 * SIZE(AO4) FMADD y02, alpha3, a2, y02 LFDU a2, 1 * SIZE(AO4) FMADD y03, alpha3, a3, y03 LFDU a3, 1 * SIZE(AO4) FMADD y04, alpha3, a4, y04 LFDU a4, 1 * SIZE(AO4) FMADD y05, alpha3, a5, y05 LFDU a5, 1 * SIZE(AO4) FMADD y06, alpha3, a6, y06 LFDU a6, 1 * SIZE(AO4) FMADD y07, alpha3, a7, y07 LFDU a7, 1 * SIZE(AO4) FMADD y08, alpha3, a8, y08 LFDU a8, 1 * SIZE(AO4) FMADD y01, alpha4, a1, y01 FMADD y02, alpha4, a2, y02 FMADD y03, alpha4, a3, y03 FMADD y04, alpha4, a4, y04 FMADD y05, alpha4, a5, y05 STFDU y01, 1 * SIZE(Y2) FMADD y06, alpha4, a6, y06 STFDU y02, 1 * SIZE(Y2) FMADD y07, alpha4, a7, y07 STFDU y03, 1 * SIZE(Y2) FMADD y08, alpha4, a8, y08 STFDU y04, 1 * SIZE(Y2) STFDU y05, 1 * SIZE(Y2) STFDU y06, 1 * SIZE(Y2) STFDU y07, 1 * SIZE(Y2) STFDU y08, 1 * SIZE(Y2) .align 4 LL(25): andi. r0, M, 7 ble LL(29) andi. r0, M, 4 ble LL(27) LFDU a1, 1 * SIZE(AO1) LFDU y01, 1 * SIZE(Y1) LFDU a2, 1 * SIZE(AO1) LFDU y02, 1 * SIZE(Y1) LFDU a3, 1 * SIZE(AO1) LFDU y03, 1 * SIZE(Y1) LFDU a4, 1 * SIZE(AO1) LFDU y04, 1 * SIZE(Y1) FMADD y01, alpha1, a1, y01 LFDU a5, 1 * SIZE(AO2) FMADD y02, alpha1, a2, y02 LFDU a6, 1 * SIZE(AO2) FMADD y03, alpha1, a3, y03 LFDU a7, 1 * SIZE(AO2) FMADD y04, alpha1, a4, y04 LFDU a8, 1 * SIZE(AO2) FMADD y01, alpha2, a5, y01 LFDU a1, 1 * SIZE(AO3) FMADD y02, alpha2, a6, y02 LFDU a2, 1 * SIZE(AO3) FMADD y03, alpha2, a7, y03 LFDU a3, 1 * SIZE(AO3) FMADD y04, alpha2, a8, y04 LFDU a4, 1 * SIZE(AO3) FMADD y01, alpha3, a1, y01 LFDU a5, 1 * SIZE(AO4) FMADD y02, alpha3, a2, y02 LFDU a6, 1 * SIZE(AO4) FMADD y03, alpha3, a3, y03 LFDU a7, 1 * SIZE(AO4) FMADD y04, alpha3, a4, y04 LFDU a8, 1 * SIZE(AO4) FMADD y01, alpha4, a5, y01 FMADD y02, alpha4, a6, y02 FMADD y03, alpha4, a7, y03 FMADD y04, alpha4, a8, y04 STFDU y01, 1 * SIZE(Y2) STFDU y02, 1 * SIZE(Y2) STFDU y03, 1 * SIZE(Y2) STFDU y04, 1 * SIZE(Y2) .align 4 LL(27): andi. r0, M, 2 ble LL(28) LFDU a1, 1 * SIZE(AO1) LFDU y01, 1 * SIZE(Y1) LFDU a2, 1 * SIZE(AO1) LFDU y02, 1 * SIZE(Y1) LFDU a3, 1 * SIZE(AO2) LFDU a4, 1 * SIZE(AO2) FMADD y01, alpha1, a1, y01 LFDU a5, 1 * SIZE(AO3) FMADD y02, alpha1, a2, y02 LFDU a6, 1 * SIZE(AO3) FMADD y01, alpha2, a3, y01 LFDU a7, 1 * SIZE(AO4) FMADD y02, alpha2, a4, y02 LFDU a8, 1 * SIZE(AO4) FMADD y01, alpha3, a5, y01 FMADD y02, alpha3, a6, y02 FMADD y01, alpha4, a7, y01 FMADD y02, alpha4, a8, y02 STFDU y01, 1 * SIZE(Y2) STFDU y02, 1 * SIZE(Y2) .align 4 LL(28): andi. 
r0, M, 1 ble LL(29) LFDU a1, 1 * SIZE(AO1) LFDU y01, 1 * SIZE(Y1) LFDU a2, 1 * SIZE(AO2) LFDU a3, 1 * SIZE(AO3) LFDU a4, 1 * SIZE(AO4) FMADD y01, alpha1, a1, y01 FMADD y01, alpha2, a2, y01 FMADD y01, alpha3, a3, y01 FMADD y01, alpha4, a4, y01 STFDU y01, 1 * SIZE(Y2) .align 4 LL(29): addi J, J, -1 lfd alpha, ALPHA cmpi cr0, 0, J, 0 bgt LL(21) .align 4 LL(30): andi. J, N, 2 ble LL(40) LFDUX alpha1, X, INCX LFDUX alpha2, X, INCX mr AO1, A add AO2, A, LDA add A, AO2, LDA FMUL alpha1, alpha, alpha1 mr Y1, YY FMUL alpha2, alpha, alpha2 mr Y2, YY srawi. r0, M, 3 mtspr CTR, r0 ble LL(35) LFDU y01, 1 * SIZE(Y1) LFDU a1, 1 * SIZE(AO1) LFDU y02, 1 * SIZE(Y1) LFDU a2, 1 * SIZE(AO1) LFDU y03, 1 * SIZE(Y1) LFDU a3, 1 * SIZE(AO1) LFDU y04, 1 * SIZE(Y1) LFDU a4, 1 * SIZE(AO1) LFDU y05, 1 * SIZE(Y1) LFDU a5, 1 * SIZE(AO1) LFDU y06, 1 * SIZE(Y1) LFDU a6, 1 * SIZE(AO1) LFDU y07, 1 * SIZE(Y1) LFDU a7, 1 * SIZE(AO1) LFDU y08, 1 * SIZE(Y1) LFDU a8, 1 * SIZE(AO1) bdz LL(33) .align 4 LL(32): #ifdef PPCG4 dcbtst Y1, PREA #endif FMADD y09, alpha1, a1, y01 LFDU a1, 1 * SIZE(AO2) FMADD y10, alpha1, a2, y02 LFDU a2, 1 * SIZE(AO2) FMADD y11, alpha1, a3, y03 LFDU a3, 1 * SIZE(AO2) FMADD y12, alpha1, a4, y04 LFDU a4, 1 * SIZE(AO2) LFDU y01, 1 * SIZE(Y1) LFDU y02, 1 * SIZE(Y1) #ifdef PPCG4 dcbt AO2, PREA #endif FMADD y13, alpha1, a5, y05 LFDU a5, 1 * SIZE(AO2) FMADD y14, alpha1, a6, y06 LFDU a6, 1 * SIZE(AO2) FMADD y15, alpha1, a7, y07 LFDU a7, 1 * SIZE(AO2) FMADD y16, alpha1, a8, y08 LFDU a8, 1 * SIZE(AO2) LFDU y03, 1 * SIZE(Y1) LFDU y04, 1 * SIZE(Y1) #if defined(PPCG4) && defined(DOUBLE) dcbt AO2, PREA #endif FMADD y09, alpha2, a1, y09 LFDU a1, 1 * SIZE(AO1) FMADD y10, alpha2, a2, y10 LFDU a2, 1 * SIZE(AO1) FMADD y11, alpha2, a3, y11 LFDU a3, 1 * SIZE(AO1) FMADD y12, alpha2, a4, y12 LFDU a4, 1 * SIZE(AO1) #if defined(PPCG4) && defined(DOUBLE) dcbtst Y1, PREA #endif LFDU y05, 1 * SIZE(Y1) LFDU y06, 1 * SIZE(Y1) #ifdef PPCG4 dcbt AO1, PREA #endif FMADD y13, alpha2, a5, y13 LFDU a5, 1 * SIZE(AO1) FMADD y14, alpha2, a6, y14 LFDU a6, 1 * SIZE(AO1) FMADD y15, alpha2, a7, y15 LFDU a7, 1 * SIZE(AO1) FMADD y16, alpha2, a8, y16 LFDU a8, 1 * SIZE(AO1) LFDU y07, 1 * SIZE(Y1) LFDU y08, 1 * SIZE(Y1) #if defined(PPCG4) && defined(DOUBLE) dcbt AO1, PREA #endif STFDU y09, 1 * SIZE(Y2) STFDU y10, 1 * SIZE(Y2) STFDU y11, 1 * SIZE(Y2) STFDU y12, 1 * SIZE(Y2) STFDU y13, 1 * SIZE(Y2) STFDU y14, 1 * SIZE(Y2) STFDU y15, 1 * SIZE(Y2) STFDU y16, 1 * SIZE(Y2) bdnz LL(32) .align 4 LL(33): FMADD y01, alpha1, a1, y01 LFDU a1, 1 * SIZE(AO2) FMADD y02, alpha1, a2, y02 LFDU a2, 1 * SIZE(AO2) FMADD y03, alpha1, a3, y03 LFDU a3, 1 * SIZE(AO2) FMADD y04, alpha1, a4, y04 LFDU a4, 1 * SIZE(AO2) FMADD y05, alpha1, a5, y05 LFDU a5, 1 * SIZE(AO2) FMADD y06, alpha1, a6, y06 LFDU a6, 1 * SIZE(AO2) FMADD y07, alpha1, a7, y07 LFDU a7, 1 * SIZE(AO2) FMADD y08, alpha1, a8, y08 LFDU a8, 1 * SIZE(AO2) FMADD y01, alpha2, a1, y01 FMADD y02, alpha2, a2, y02 FMADD y03, alpha2, a3, y03 FMADD y04, alpha2, a4, y04 FMADD y05, alpha2, a5, y05 STFDU y01, 1 * SIZE(Y2) FMADD y06, alpha2, a6, y06 STFDU y02, 1 * SIZE(Y2) FMADD y07, alpha2, a7, y07 STFDU y03, 1 * SIZE(Y2) FMADD y08, alpha2, a8, y08 STFDU y04, 1 * SIZE(Y2) STFDU y05, 1 * SIZE(Y2) STFDU y06, 1 * SIZE(Y2) STFDU y07, 1 * SIZE(Y2) STFDU y08, 1 * SIZE(Y2) .align 4 LL(35): andi. r0, M, 7 ble LL(40) andi. 
r0, M, 4 ble LL(37) LFDU a1, 1 * SIZE(AO1) LFDU y01, 1 * SIZE(Y1) LFDU a2, 1 * SIZE(AO1) LFDU y02, 1 * SIZE(Y1) LFDU a3, 1 * SIZE(AO1) LFDU y03, 1 * SIZE(Y1) LFDU a4, 1 * SIZE(AO1) LFDU y04, 1 * SIZE(Y1) FMADD y01, alpha1, a1, y01 LFDU a5, 1 * SIZE(AO2) FMADD y02, alpha1, a2, y02 LFDU a6, 1 * SIZE(AO2) FMADD y03, alpha1, a3, y03 LFDU a7, 1 * SIZE(AO2) FMADD y04, alpha1, a4, y04 LFDU a8, 1 * SIZE(AO2) FMADD y01, alpha2, a5, y01 FMADD y02, alpha2, a6, y02 FMADD y03, alpha2, a7, y03 FMADD y04, alpha2, a8, y04 STFDU y01, 1 * SIZE(Y2) STFDU y02, 1 * SIZE(Y2) STFDU y03, 1 * SIZE(Y2) STFDU y04, 1 * SIZE(Y2) .align 4 LL(37): andi. r0, M, 2 ble LL(38) LFDU a1, 1 * SIZE(AO1) LFDU y01, 1 * SIZE(Y1) LFDU a2, 1 * SIZE(AO1) LFDU y02, 1 * SIZE(Y1) LFDU a3, 1 * SIZE(AO2) LFDU a4, 1 * SIZE(AO2) FMADD y01, alpha1, a1, y01 FMADD y02, alpha1, a2, y02 FMADD y01, alpha2, a3, y01 FMADD y02, alpha2, a4, y02 STFDU y01, 1 * SIZE(Y2) STFDU y02, 1 * SIZE(Y2) .align 4 LL(38): andi. r0, M, 1 ble LL(40) LFDU a1, 1 * SIZE(AO1) LFDU y01, 1 * SIZE(Y1) LFDU a2, 1 * SIZE(AO2) FMADD y01, alpha1, a1, y01 FMADD y01, alpha2, a2, y01 STFDU y01, 1 * SIZE(Y2) .align 4 LL(40): andi. J, N, 1 lfd alpha, ALPHA ble LL(990) LFDUX alpha1, X, INCX mr AO1, A add A, A, LDA FMUL alpha1, alpha, alpha1 mr Y1, YY mr Y2, YY srawi. r0, M, 3 mtspr CTR, r0 ble LL(45) LFDU y01, 1 * SIZE(Y1) LFDU a1, 1 * SIZE(AO1) LFDU y02, 1 * SIZE(Y1) LFDU a2, 1 * SIZE(AO1) LFDU y03, 1 * SIZE(Y1) LFDU a3, 1 * SIZE(AO1) LFDU y04, 1 * SIZE(Y1) LFDU a4, 1 * SIZE(AO1) LFDU y05, 1 * SIZE(Y1) LFDU a5, 1 * SIZE(AO1) LFDU y06, 1 * SIZE(Y1) LFDU a6, 1 * SIZE(AO1) LFDU y07, 1 * SIZE(Y1) LFDU a7, 1 * SIZE(AO1) LFDU y08, 1 * SIZE(Y1) LFDU a8, 1 * SIZE(AO1) bdz LL(43) .align 4 LL(42): #ifdef PPCG4 dcbtst Y1, PREA #endif FMADD y09, alpha1, a1, y01 LFDU a1, 1 * SIZE(AO1) FMADD y10, alpha1, a2, y02 LFDU a2, 1 * SIZE(AO1) FMADD y11, alpha1, a3, y03 LFDU a3, 1 * SIZE(AO1) FMADD y12, alpha1, a4, y04 LFDU a4, 1 * SIZE(AO1) LFDU y01, 1 * SIZE(Y1) LFDU y02, 1 * SIZE(Y1) LFDU y03, 1 * SIZE(Y1) LFDU y04, 1 * SIZE(Y1) #ifdef PPCG4 dcbt AO1, PREA #endif FMADD y13, alpha1, a5, y05 LFDU a5, 1 * SIZE(AO1) FMADD y14, alpha1, a6, y06 LFDU a6, 1 * SIZE(AO1) FMADD y15, alpha1, a7, y07 LFDU a7, 1 * SIZE(AO1) FMADD y16, alpha1, a8, y08 LFDU a8, 1 * SIZE(AO1) #if defined(PPCG4) && defined(DOUBLE) dcbtst Y1, PREA #endif LFDU y05, 1 * SIZE(Y1) LFDU y06, 1 * SIZE(Y1) LFDU y07, 1 * SIZE(Y1) LFDU y08, 1 * SIZE(Y1) #if defined(PPCG4) && defined(DOUBLE) dcbt AO1, PREA #endif STFDU y09, 1 * SIZE(Y2) STFDU y10, 1 * SIZE(Y2) STFDU y11, 1 * SIZE(Y2) STFDU y12, 1 * SIZE(Y2) STFDU y13, 1 * SIZE(Y2) STFDU y14, 1 * SIZE(Y2) STFDU y15, 1 * SIZE(Y2) STFDU y16, 1 * SIZE(Y2) bdnz LL(42) .align 4 LL(43): FMADD y01, alpha1, a1, y01 FMADD y02, alpha1, a2, y02 FMADD y03, alpha1, a3, y03 FMADD y04, alpha1, a4, y04 FMADD y05, alpha1, a5, y05 STFDU y01, 1 * SIZE(Y2) FMADD y06, alpha1, a6, y06 STFDU y02, 1 * SIZE(Y2) FMADD y07, alpha1, a7, y07 STFDU y03, 1 * SIZE(Y2) FMADD y08, alpha1, a8, y08 STFDU y04, 1 * SIZE(Y2) STFDU y05, 1 * SIZE(Y2) STFDU y06, 1 * SIZE(Y2) STFDU y07, 1 * SIZE(Y2) STFDU y08, 1 * SIZE(Y2) .align 4 LL(45): andi. r0, M, 7 ble LL(990) andi. 
r0, M, 4 ble LL(47) LFDU a1, 1 * SIZE(AO1) LFDU y01, 1 * SIZE(Y1) LFDU a2, 1 * SIZE(AO1) LFDU y02, 1 * SIZE(Y1) LFDU a3, 1 * SIZE(AO1) LFDU y03, 1 * SIZE(Y1) LFDU a4, 1 * SIZE(AO1) LFDU y04, 1 * SIZE(Y1) FMADD y01, alpha1, a1, y01 FMADD y02, alpha1, a2, y02 FMADD y03, alpha1, a3, y03 FMADD y04, alpha1, a4, y04 STFDU y01, 1 * SIZE(Y2) STFDU y02, 1 * SIZE(Y2) STFDU y03, 1 * SIZE(Y2) STFDU y04, 1 * SIZE(Y2) .align 4 LL(47): andi. r0, M, 2 ble LL(48) LFDU a1, 1 * SIZE(AO1) LFDU y01, 1 * SIZE(Y1) LFDU a2, 1 * SIZE(AO1) LFDU y02, 1 * SIZE(Y1) FMADD y01, alpha1, a1, y01 FMADD y02, alpha1, a2, y02 STFDU y01, 1 * SIZE(Y2) STFDU y02, 1 * SIZE(Y2) .align 4 LL(48): andi. r0, M, 1 ble LL(990) LFDU a1, 1 * SIZE(AO1) LFDU y01, 1 * SIZE(Y1) FMADD y01, alpha1, a1, y01 STFDU y01, 1 * SIZE(Y2) .align 4 LL(990): cmpi cr0, 0, INCY, SIZE beq LL(999) addi YY, BUFFER, -SIZE mr Y1, Y srawi. r0, M, 3 mtspr CTR, r0 ble LL(995) .align 4 LL(991): LFDUX f0, Y, INCY LFDUX f1, Y, INCY LFDUX f2, Y, INCY LFDUX f3, Y, INCY LFDUX f4, Y, INCY LFDUX f5, Y, INCY LFDUX f6, Y, INCY LFDUX f7, Y, INCY LFDU f8, 1 * SIZE(YY) LFDU f9, 1 * SIZE(YY) LFDU f10, 1 * SIZE(YY) LFDU f11, 1 * SIZE(YY) LFDU f12, 1 * SIZE(YY) LFDU f13, 1 * SIZE(YY) LFDU f14, 1 * SIZE(YY) LFDU f15, 1 * SIZE(YY) FADD f8, f8, f0 FADD f9, f9, f1 FADD f10, f10, f2 FADD f11, f11, f3 FADD f12, f12, f4 FADD f13, f13, f5 FADD f14, f14, f6 FADD f15, f15, f7 STFDUX f8, Y1, INCY STFDUX f9, Y1, INCY STFDUX f10, Y1, INCY STFDUX f11, Y1, INCY STFDUX f12, Y1, INCY STFDUX f13, Y1, INCY STFDUX f14, Y1, INCY STFDUX f15, Y1, INCY bdnz LL(991) .align 4 LL(995): andi. J, M, 4 ble LL(996) LFDUX f0, Y, INCY LFDUX f1, Y, INCY LFDUX f2, Y, INCY LFDUX f3, Y, INCY LFDU f8, 1 * SIZE(YY) LFDU f9, 1 * SIZE(YY) LFDU f10, 1 * SIZE(YY) LFDU f11, 1 * SIZE(YY) FADD f8, f8, f0 FADD f9, f9, f1 FADD f10, f10, f2 FADD f11, f11, f3 STFDUX f8, Y1, INCY STFDUX f9, Y1, INCY STFDUX f10, Y1, INCY STFDUX f11, Y1, INCY .align 4 LL(996): andi. J, M, 2 ble LL(997) LFDUX f0, Y, INCY LFDUX f1, Y, INCY LFDU f8, 1 * SIZE(YY) LFDU f9, 1 * SIZE(YY) FADD f8, f8, f0 FADD f9, f9, f1 STFDUX f8, Y1, INCY STFDUX f9, Y1, INCY .align 4 LL(997): andi. J, M, 1 ble LL(999) LFDUX f0, Y, INCY LFDU f8, 1 * SIZE(YY) FADD f8, f8, f0 STFDUX f8, Y1, INCY .align 4 LL(999): li r3, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) #ifdef __64BIT__ ld r14, 144(SP) ld r15, 152(SP) ld r16, 160(SP) ld r17, 168(SP) ld r18, 176(SP) ld r19, 184(SP) ld r20, 192(SP) ld r21, 200(SP) ld r22, 208(SP) ld r23, 216(SP) #else lwz r14, 144(SP) lwz r15, 148(SP) lwz r16, 152(SP) lwz r17, 156(SP) lwz r18, 160(SP) lwz r19, 164(SP) lwz r20, 168(SP) lwz r21, 172(SP) lwz r22, 176(SP) lwz r23, 180(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/gemv_t.S000066400000000000000000001575511313527062700172030ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 #define A r6 #define LDA r7 #define X r8 #define INCX r9 #define Y r10 #define INCY r5 #else #define M r3 #define N r4 #define A r7 #define LDA r8 #define X r9 #define INCX r10 #define Y r5 #define INCY r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define M r3 #define N r4 #define A r8 #define LDA r9 #define X r10 #define INCX r5 #define Y r6 #define INCY r7 #else #define M r3 #define N r4 #define A r7 #define LDA r8 #define X r9 #define INCX r10 #define Y r5 #define INCY r6 #endif #endif #define BUFFER r11 #define XP r12 #define AO1 r14 #define AO2 r15 #define AO3 r16 #define AO4 r17 #define AO5 r18 #define AO6 r19 #define AO7 r20 #define AO8 r21 #define MIN_N r22 #define J r23 #define CO r24 #define PREA r25 #define PREC r26 #define BO r27 #define PLDA_M r28 #define IS r29 #define Y1 CO #if defined(PPCG4) #define PREFETCHSIZE_A 42 #define PREFETCHSIZE_C 16 #endif #if defined(PPC440) || defined(PPC440FP2) #define PREFETCHSIZE_A 42 #define PREFETCHSIZE_C 16 #endif #ifdef PPC970 #define PREFETCHSIZE_A 42 #define PREFETCHSIZE_C 16 #endif #ifdef CELL #define PREFETCHSIZE_A 42 #define PREFETCHSIZE_C 16 #endif #ifdef POWER4 #define PREFETCHSIZE_A 48 #define PREFETCHSIZE_C 16 #endif #ifdef POWER5 #define PREFETCHSIZE_A 40 #define PREFETCHSIZE_C 8 #endif #ifdef POWER6 #define PREFETCHSIZE_A 96 #define PREFETCHSIZE_C 8 #endif #ifdef POWER8 #define PREFETCHSIZE_A 96 #define PREFETCHSIZE_C 8 #endif #define y01 f0 #define y02 f1 #define y03 f2 #define y04 f3 #define y05 f4 #define y06 f5 #define y07 f6 #define y08 f7 #define y09 f8 #define y10 f9 #define y11 f10 #define y12 f11 #define y13 f12 #define y14 f13 #define y15 f14 #define y16 f15 #define a1 f16 #define a2 f17 #define a3 f18 #define a4 f19 #define a5 f20 #define a6 f21 #define a7 f22 #define a8 f23 #define b1 f24 #define b2 f25 #define b3 f26 #define b4 f27 #define b5 f28 #define b6 f29 #define b7 f30 #define b8 f31 #define alpha f31 #ifndef NEEDPARAM #define P 2048 #ifndef __64BIT__ #define STACKSIZE 224 #else 
#define STACKSIZE 288 #endif #define FZERO 144(SP) #define ALPHA 152(SP) PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r0, FZERO stfd f1, ALPHA std r14, 160(SP) std r15, 168(SP) std r16, 176(SP) std r17, 184(SP) std r18, 192(SP) std r19, 200(SP) std r20, 208(SP) std r21, 216(SP) std r22, 224(SP) std r23, 232(SP) std r24, 240(SP) std r25, 248(SP) std r26, 256(SP) std r27, 264(SP) std r28, 272(SP) std r29, 280(SP) #else stw r0, 0 + FZERO stw r0, 4 + FZERO stfd f1, ALPHA stw r14, 160(SP) stw r15, 164(SP) stw r16, 168(SP) stw r17, 172(SP) stw r18, 176(SP) stw r19, 180(SP) stw r20, 184(SP) stw r21, 188(SP) stw r22, 192(SP) stw r23, 196(SP) stw r24, 200(SP) stw r25, 204(SP) stw r26, 208(SP) stw r27, 212(SP) stw r28, 216(SP) stw r29, 220(SP) #endif #ifdef linux #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #else ld Y, FRAMESLOT(0) + STACKSIZE(SP) ld INCY, FRAMESLOT(1) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) lwz Y, FRAMESLOT(1) + STACKSIZE(SP) lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #else lwz Y, FRAMESLOT(0) + STACKSIZE(SP) lwz INCY, FRAMESLOT(1) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #else ld Y, FRAMESLOT(0) + STACKSIZE(SP) ld INCY, FRAMESLOT(1) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif mullw PLDA_M, LDA, N li XP, P subf PLDA_M, XP, PLDA_M slwi PLDA_M, PLDA_M, BASE_SHIFT slwi LDA, LDA, BASE_SHIFT slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT subf Y, INCY, Y li IS, 0 addi A, A, -SIZE li PREA, PREFETCHSIZE_A * SIZE li PREC, PREFETCHSIZE_C * SIZE cmpi cr0, 0, M, 0 ble LL(999) cmpi cr0, 0, N, 0 ble LL(999) .align 4 LL(ISLoop): subf MIN_N, IS, M slwi r0, IS, BASE_SHIFT cmpi cr0, 0, MIN_N, P ble+ LL(min_nP) li MIN_N, P LL(min_nP): add XP, X, r0 cmpi cr0, 0, INCX, SIZE beq LL(10) mr XP, BUFFER addi CO, BUFFER, -SIZE srawi. r0, MIN_N, 3 mtspr CTR, r0 ble LL(CopyRemain) .align 4 LL(CopyKernel): LFD f0, 0 * SIZE(X) add X, X, INCX LFD f1, 0 * SIZE(X) add X, X, INCX LFD f2, 0 * SIZE(X) add X, X, INCX LFD f3, 0 * SIZE(X) add X, X, INCX LFD f4, 0 * SIZE(X) add X, X, INCX LFD f5, 0 * SIZE(X) add X, X, INCX LFD f6, 0 * SIZE(X) add X, X, INCX LFD f7, 0 * SIZE(X) add X, X, INCX STFD f0, 1 * SIZE(CO) STFD f1, 2 * SIZE(CO) STFD f2, 3 * SIZE(CO) STFD f3, 4 * SIZE(CO) STFD f4, 5 * SIZE(CO) STFD f5, 6 * SIZE(CO) STFD f6, 7 * SIZE(CO) STFDU f7, 8 * SIZE(CO) bdnz LL(CopyKernel) .align 4 LL(CopyRemain): andi. r0, MIN_N, 7 mtspr CTR, r0 ble LL(10) .align 4 LL(CopySub): LFD f0, 0 * SIZE(X) add X, X, INCX STFDU f0, 1 * SIZE(CO) bdnz LL(CopySub) .align 4 LL(10): mr CO, Y addi XP, XP, -SIZE srawi. 
J, N, 3 ble LL(20) .align 4 LL(11): mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add AO5, AO4, LDA add AO6, AO5, LDA add AO7, AO6, LDA add AO8, AO7, LDA add A, AO8, LDA mr BO, XP lfd y01, FZERO fmr y02, y01 fmr y03, y01 fmr y04, y01 fmr y05, y01 fmr y06, y01 fmr y07, y01 fmr y08, y01 fmr y09, y01 fmr y10, y01 fmr y11, y01 fmr y12, y01 fmr y13, y01 fmr y14, y01 fmr y15, y01 fmr y16, y01 DCBT(Y1, PREC) srawi. r0, MIN_N, 4 mtspr CTR, r0 ble LL(14) LFD a1, 1 * SIZE(AO1) LFD a2, 1 * SIZE(AO2) LFD a3, 1 * SIZE(AO3) LFD a4, 1 * SIZE(AO4) LFD a5, 1 * SIZE(AO5) LFD a6, 1 * SIZE(AO6) LFD a7, 1 * SIZE(AO7) LFD a8, 1 * SIZE(AO8) LFD b1, 1 * SIZE(BO) LFD b2, 2 * SIZE(BO) LFD b3, 3 * SIZE(BO) LFD b4, 4 * SIZE(BO) LFD b5, 5 * SIZE(BO) LFD b6, 6 * SIZE(BO) LFD b7, 7 * SIZE(BO) LFD b8, 8 * SIZE(BO) bdz LL(13) .align 4 LL(12): FMADD y01, a1, b1, y01 LFD a1, 2 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 2 * SIZE(AO2) FMADD y03, a3, b1, y03 LFD a3, 2 * SIZE(AO3) FMADD y04, a4, b1, y04 LFD a4, 2 * SIZE(AO4) FMADD y05, a5, b1, y05 LFD a5, 2 * SIZE(AO5) FMADD y06, a6, b1, y06 LFD a6, 2 * SIZE(AO6) FMADD y07, a7, b1, y07 LFD a7, 2 * SIZE(AO7) FMADD y08, a8, b1, y08 LFD a8, 2 * SIZE(AO8) FMADD y09, a1, b2, y09 LFD a1, 3 * SIZE(AO1) FMADD y10, a2, b2, y10 LFD a2, 3 * SIZE(AO2) FMADD y11, a3, b2, y11 LFD a3, 3 * SIZE(AO3) FMADD y12, a4, b2, y12 LFD a4, 3 * SIZE(AO4) FMADD y13, a5, b2, y13 LFD a5, 3 * SIZE(AO5) FMADD y14, a6, b2, y14 LFD a6, 3 * SIZE(AO6) FMADD y15, a7, b2, y15 LFD a7, 3 * SIZE(AO7) FMADD y16, a8, b2, y16 LFD a8, 3 * SIZE(AO8) FMADD y01, a1, b3, y01 LFD a1, 4 * SIZE(AO1) FMADD y02, a2, b3, y02 LFD a2, 4 * SIZE(AO2) FMADD y03, a3, b3, y03 LFD a3, 4 * SIZE(AO3) FMADD y04, a4, b3, y04 LFD a4, 4 * SIZE(AO4) FMADD y05, a5, b3, y05 LFD a5, 4 * SIZE(AO5) FMADD y06, a6, b3, y06 LFD a6, 4 * SIZE(AO6) FMADD y07, a7, b3, y07 LFD a7, 4 * SIZE(AO7) FMADD y08, a8, b3, y08 LFD a8, 4 * SIZE(AO8) FMADD y09, a1, b4, y09 LFD a1, 5 * SIZE(AO1) FMADD y10, a2, b4, y10 LFD a2, 5 * SIZE(AO2) FMADD y11, a3, b4, y11 LFD a3, 5 * SIZE(AO3) FMADD y12, a4, b4, y12 LFD a4, 5 * SIZE(AO4) FMADD y13, a5, b4, y13 LFD a5, 5 * SIZE(AO5) FMADD y14, a6, b4, y14 LFD a6, 5 * SIZE(AO6) FMADD y15, a7, b4, y15 LFD a7, 5 * SIZE(AO7) FMADD y16, a8, b4, y16 LFD a8, 5 * SIZE(AO8) LFD b1, 9 * SIZE(BO) LFD b2, 10 * SIZE(BO) LFD b3, 11 * SIZE(BO) LFD b4, 12 * SIZE(BO) FMADD y01, a1, b5, y01 LFD a1, 6 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 6 * SIZE(AO2) FMADD y03, a3, b5, y03 LFD a3, 6 * SIZE(AO3) FMADD y04, a4, b5, y04 LFD a4, 6 * SIZE(AO4) FMADD y05, a5, b5, y05 LFD a5, 6 * SIZE(AO5) FMADD y06, a6, b5, y06 LFD a6, 6 * SIZE(AO6) FMADD y07, a7, b5, y07 LFD a7, 6 * SIZE(AO7) FMADD y08, a8, b5, y08 LFD a8, 6 * SIZE(AO8) FMADD y09, a1, b6, y09 LFD a1, 7 * SIZE(AO1) FMADD y10, a2, b6, y10 LFD a2, 7 * SIZE(AO2) FMADD y11, a3, b6, y11 LFD a3, 7 * SIZE(AO3) FMADD y12, a4, b6, y12 LFD a4, 7 * SIZE(AO4) FMADD y13, a5, b6, y13 LFD a5, 7 * SIZE(AO5) FMADD y14, a6, b6, y14 LFD a6, 7 * SIZE(AO6) FMADD y15, a7, b6, y15 LFD a7, 7 * SIZE(AO7) FMADD y16, a8, b6, y16 LFD a8, 7 * SIZE(AO8) FMADD y01, a1, b7, y01 LFD a1, 8 * SIZE(AO1) FMADD y02, a2, b7, y02 LFD a2, 8 * SIZE(AO2) FMADD y03, a3, b7, y03 LFD a3, 8 * SIZE(AO3) FMADD y04, a4, b7, y04 LFD a4, 8 * SIZE(AO4) FMADD y05, a5, b7, y05 LFD a5, 8 * SIZE(AO5) FMADD y06, a6, b7, y06 LFD a6, 8 * SIZE(AO6) FMADD y07, a7, b7, y07 LFD a7, 8 * SIZE(AO7) FMADD y08, a8, b7, y08 LFD a8, 8 * SIZE(AO8) FMADD y09, a1, b8, y09 LFD a1, 9 * SIZE(AO1) FMADD y10, a2, b8, y10 LFD a2, 9 * SIZE(AO2) FMADD y11, a3, b8, 
y11 LFD a3, 9 * SIZE(AO3) FMADD y12, a4, b8, y12 LFD a4, 9 * SIZE(AO4) FMADD y13, a5, b8, y13 LFD a5, 9 * SIZE(AO5) FMADD y14, a6, b8, y14 LFD a6, 9 * SIZE(AO6) FMADD y15, a7, b8, y15 LFD a7, 9 * SIZE(AO7) FMADD y16, a8, b8, y16 LFD a8, 9 * SIZE(AO8) LFD b5, 13 * SIZE(BO) LFD b6, 14 * SIZE(BO) LFD b7, 15 * SIZE(BO) LFD b8, 16 * SIZE(BO) DCBT(AO1, PREA) DCBT(AO2, PREA) DCBT(AO3, PREA) DCBT(AO4, PREA) FMADD y01, a1, b1, y01 LFD a1, 10 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 10 * SIZE(AO2) FMADD y03, a3, b1, y03 LFD a3, 10 * SIZE(AO3) FMADD y04, a4, b1, y04 LFD a4, 10 * SIZE(AO4) FMADD y05, a5, b1, y05 LFD a5, 10 * SIZE(AO5) FMADD y06, a6, b1, y06 LFD a6, 10 * SIZE(AO6) FMADD y07, a7, b1, y07 LFD a7, 10 * SIZE(AO7) FMADD y08, a8, b1, y08 LFD a8, 10 * SIZE(AO8) FMADD y09, a1, b2, y09 LFD a1, 11 * SIZE(AO1) FMADD y10, a2, b2, y10 LFD a2, 11 * SIZE(AO2) FMADD y11, a3, b2, y11 LFD a3, 11 * SIZE(AO3) FMADD y12, a4, b2, y12 LFD a4, 11 * SIZE(AO4) FMADD y13, a5, b2, y13 LFD a5, 11 * SIZE(AO5) FMADD y14, a6, b2, y14 LFD a6, 11 * SIZE(AO6) FMADD y15, a7, b2, y15 LFD a7, 11 * SIZE(AO7) FMADD y16, a8, b2, y16 LFD a8, 11 * SIZE(AO8) FMADD y01, a1, b3, y01 LFD a1, 12 * SIZE(AO1) FMADD y02, a2, b3, y02 LFD a2, 12 * SIZE(AO2) FMADD y03, a3, b3, y03 LFD a3, 12 * SIZE(AO3) FMADD y04, a4, b3, y04 LFD a4, 12 * SIZE(AO4) FMADD y05, a5, b3, y05 LFD a5, 12 * SIZE(AO5) FMADD y06, a6, b3, y06 LFD a6, 12 * SIZE(AO6) FMADD y07, a7, b3, y07 LFD a7, 12 * SIZE(AO7) FMADD y08, a8, b3, y08 LFD a8, 12 * SIZE(AO8) FMADD y09, a1, b4, y09 LFD a1, 13 * SIZE(AO1) FMADD y10, a2, b4, y10 LFD a2, 13 * SIZE(AO2) FMADD y11, a3, b4, y11 LFD a3, 13 * SIZE(AO3) FMADD y12, a4, b4, y12 LFD a4, 13 * SIZE(AO4) FMADD y13, a5, b4, y13 LFD a5, 13 * SIZE(AO5) FMADD y14, a6, b4, y14 LFD a6, 13 * SIZE(AO6) FMADD y15, a7, b4, y15 LFD a7, 13 * SIZE(AO7) FMADD y16, a8, b4, y16 LFD a8, 13 * SIZE(AO8) LFD b1, 17 * SIZE(BO) LFD b2, 18 * SIZE(BO) LFD b3, 19 * SIZE(BO) LFD b4, 20 * SIZE(BO) FMADD y01, a1, b5, y01 LFD a1, 14 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 14 * SIZE(AO2) FMADD y03, a3, b5, y03 LFD a3, 14 * SIZE(AO3) FMADD y04, a4, b5, y04 LFD a4, 14 * SIZE(AO4) FMADD y05, a5, b5, y05 LFD a5, 14 * SIZE(AO5) FMADD y06, a6, b5, y06 LFD a6, 14 * SIZE(AO6) FMADD y07, a7, b5, y07 LFD a7, 14 * SIZE(AO7) FMADD y08, a8, b5, y08 LFD a8, 14 * SIZE(AO8) FMADD y09, a1, b6, y09 LFD a1, 15 * SIZE(AO1) FMADD y10, a2, b6, y10 LFD a2, 15 * SIZE(AO2) FMADD y11, a3, b6, y11 LFD a3, 15 * SIZE(AO3) FMADD y12, a4, b6, y12 LFD a4, 15 * SIZE(AO4) FMADD y13, a5, b6, y13 LFD a5, 15 * SIZE(AO5) FMADD y14, a6, b6, y14 LFD a6, 15 * SIZE(AO6) FMADD y15, a7, b6, y15 LFD a7, 15 * SIZE(AO7) FMADD y16, a8, b6, y16 LFD a8, 15 * SIZE(AO8) FMADD y01, a1, b7, y01 LFD a1, 16 * SIZE(AO1) FMADD y02, a2, b7, y02 LFD a2, 16 * SIZE(AO2) FMADD y03, a3, b7, y03 LFD a3, 16 * SIZE(AO3) FMADD y04, a4, b7, y04 LFD a4, 16 * SIZE(AO4) FMADD y05, a5, b7, y05 LFD a5, 16 * SIZE(AO5) FMADD y06, a6, b7, y06 LFD a6, 16 * SIZE(AO6) FMADD y07, a7, b7, y07 LFD a7, 16 * SIZE(AO7) FMADD y08, a8, b7, y08 LFD a8, 16 * SIZE(AO8) FMADD y09, a1, b8, y09 LFD a1, 17 * SIZE(AO1) FMADD y10, a2, b8, y10 LFD a2, 17 * SIZE(AO2) FMADD y11, a3, b8, y11 LFD a3, 17 * SIZE(AO3) FMADD y12, a4, b8, y12 LFD a4, 17 * SIZE(AO4) addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi AO3, AO3, 16 * SIZE addi AO4, AO4, 16 * SIZE FMADD y13, a5, b8, y13 LFD a5, 17 * SIZE(AO5) FMADD y14, a6, b8, y14 LFD a6, 17 * SIZE(AO6) FMADD y15, a7, b8, y15 LFD a7, 17 * SIZE(AO7) FMADD y16, a8, b8, y16 LFD a8, 17 * SIZE(AO8) LFD b5, 21 * 
SIZE(BO) LFD b6, 22 * SIZE(BO) LFD b7, 23 * SIZE(BO) LFD b8, 24 * SIZE(BO) addi AO5, AO5, 16 * SIZE addi AO6, AO6, 16 * SIZE DCBT(AO5, PREA) DCBT(AO6, PREA) addi AO7, AO7, 16 * SIZE addi AO8, AO8, 16 * SIZE DCBT(AO7, PREA) DCBT(AO8, PREA) addi BO, BO, 16 * SIZE bdnz LL(12) .align 4 LL(13): FMADD y01, a1, b1, y01 LFD a1, 2 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 2 * SIZE(AO2) FMADD y03, a3, b1, y03 LFD a3, 2 * SIZE(AO3) FMADD y04, a4, b1, y04 LFD a4, 2 * SIZE(AO4) FMADD y05, a5, b1, y05 LFD a5, 2 * SIZE(AO5) FMADD y06, a6, b1, y06 LFD a6, 2 * SIZE(AO6) FMADD y07, a7, b1, y07 LFD a7, 2 * SIZE(AO7) FMADD y08, a8, b1, y08 LFD a8, 2 * SIZE(AO8) FMADD y09, a1, b2, y09 LFD a1, 3 * SIZE(AO1) FMADD y10, a2, b2, y10 LFD a2, 3 * SIZE(AO2) FMADD y11, a3, b2, y11 LFD a3, 3 * SIZE(AO3) FMADD y12, a4, b2, y12 LFD a4, 3 * SIZE(AO4) FMADD y13, a5, b2, y13 LFD a5, 3 * SIZE(AO5) FMADD y14, a6, b2, y14 LFD a6, 3 * SIZE(AO6) FMADD y15, a7, b2, y15 LFD a7, 3 * SIZE(AO7) FMADD y16, a8, b2, y16 LFD a8, 3 * SIZE(AO8) FMADD y01, a1, b3, y01 LFD a1, 4 * SIZE(AO1) FMADD y02, a2, b3, y02 LFD a2, 4 * SIZE(AO2) FMADD y03, a3, b3, y03 LFD a3, 4 * SIZE(AO3) FMADD y04, a4, b3, y04 LFD a4, 4 * SIZE(AO4) FMADD y05, a5, b3, y05 LFD a5, 4 * SIZE(AO5) FMADD y06, a6, b3, y06 LFD a6, 4 * SIZE(AO6) FMADD y07, a7, b3, y07 LFD a7, 4 * SIZE(AO7) FMADD y08, a8, b3, y08 LFD a8, 4 * SIZE(AO8) FMADD y09, a1, b4, y09 LFD a1, 5 * SIZE(AO1) FMADD y10, a2, b4, y10 LFD a2, 5 * SIZE(AO2) FMADD y11, a3, b4, y11 LFD a3, 5 * SIZE(AO3) FMADD y12, a4, b4, y12 LFD a4, 5 * SIZE(AO4) FMADD y13, a5, b4, y13 LFD a5, 5 * SIZE(AO5) FMADD y14, a6, b4, y14 LFD a6, 5 * SIZE(AO6) FMADD y15, a7, b4, y15 LFD a7, 5 * SIZE(AO7) FMADD y16, a8, b4, y16 LFD a8, 5 * SIZE(AO8) LFD b1, 9 * SIZE(BO) LFD b2, 10 * SIZE(BO) LFD b3, 11 * SIZE(BO) LFD b4, 12 * SIZE(BO) FMADD y01, a1, b5, y01 LFD a1, 6 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 6 * SIZE(AO2) FMADD y03, a3, b5, y03 LFD a3, 6 * SIZE(AO3) FMADD y04, a4, b5, y04 LFD a4, 6 * SIZE(AO4) FMADD y05, a5, b5, y05 LFD a5, 6 * SIZE(AO5) FMADD y06, a6, b5, y06 LFD a6, 6 * SIZE(AO6) FMADD y07, a7, b5, y07 LFD a7, 6 * SIZE(AO7) FMADD y08, a8, b5, y08 LFD a8, 6 * SIZE(AO8) FMADD y09, a1, b6, y09 LFD a1, 7 * SIZE(AO1) FMADD y10, a2, b6, y10 LFD a2, 7 * SIZE(AO2) FMADD y11, a3, b6, y11 LFD a3, 7 * SIZE(AO3) FMADD y12, a4, b6, y12 LFD a4, 7 * SIZE(AO4) FMADD y13, a5, b6, y13 LFD a5, 7 * SIZE(AO5) FMADD y14, a6, b6, y14 LFD a6, 7 * SIZE(AO6) FMADD y15, a7, b6, y15 LFD a7, 7 * SIZE(AO7) FMADD y16, a8, b6, y16 LFD a8, 7 * SIZE(AO8) FMADD y01, a1, b7, y01 LFD a1, 8 * SIZE(AO1) FMADD y02, a2, b7, y02 LFD a2, 8 * SIZE(AO2) FMADD y03, a3, b7, y03 LFD a3, 8 * SIZE(AO3) FMADD y04, a4, b7, y04 LFD a4, 8 * SIZE(AO4) FMADD y05, a5, b7, y05 LFD a5, 8 * SIZE(AO5) FMADD y06, a6, b7, y06 LFD a6, 8 * SIZE(AO6) FMADD y07, a7, b7, y07 LFD a7, 8 * SIZE(AO7) FMADD y08, a8, b7, y08 LFD a8, 8 * SIZE(AO8) FMADD y09, a1, b8, y09 LFD a1, 9 * SIZE(AO1) FMADD y10, a2, b8, y10 LFD a2, 9 * SIZE(AO2) FMADD y11, a3, b8, y11 LFD a3, 9 * SIZE(AO3) FMADD y12, a4, b8, y12 LFD a4, 9 * SIZE(AO4) FMADD y13, a5, b8, y13 LFD a5, 9 * SIZE(AO5) FMADD y14, a6, b8, y14 LFD a6, 9 * SIZE(AO6) FMADD y15, a7, b8, y15 LFD a7, 9 * SIZE(AO7) FMADD y16, a8, b8, y16 LFD a8, 9 * SIZE(AO8) LFD b5, 13 * SIZE(BO) LFD b6, 14 * SIZE(BO) LFD b7, 15 * SIZE(BO) LFD b8, 16 * SIZE(BO) FMADD y01, a1, b1, y01 LFD a1, 10 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 10 * SIZE(AO2) FMADD y03, a3, b1, y03 LFD a3, 10 * SIZE(AO3) FMADD y04, a4, b1, y04 LFD a4, 10 * SIZE(AO4) FMADD y05, a5, b1, y05 
LFD a5, 10 * SIZE(AO5) FMADD y06, a6, b1, y06 LFD a6, 10 * SIZE(AO6) FMADD y07, a7, b1, y07 LFD a7, 10 * SIZE(AO7) FMADD y08, a8, b1, y08 LFD a8, 10 * SIZE(AO8) FMADD y09, a1, b2, y09 LFD a1, 11 * SIZE(AO1) FMADD y10, a2, b2, y10 LFD a2, 11 * SIZE(AO2) FMADD y11, a3, b2, y11 LFD a3, 11 * SIZE(AO3) FMADD y12, a4, b2, y12 LFD a4, 11 * SIZE(AO4) FMADD y13, a5, b2, y13 LFD a5, 11 * SIZE(AO5) FMADD y14, a6, b2, y14 LFD a6, 11 * SIZE(AO6) FMADD y15, a7, b2, y15 LFD a7, 11 * SIZE(AO7) FMADD y16, a8, b2, y16 LFD a8, 11 * SIZE(AO8) FMADD y01, a1, b3, y01 LFD a1, 12 * SIZE(AO1) FMADD y02, a2, b3, y02 LFD a2, 12 * SIZE(AO2) FMADD y03, a3, b3, y03 LFD a3, 12 * SIZE(AO3) FMADD y04, a4, b3, y04 LFD a4, 12 * SIZE(AO4) FMADD y05, a5, b3, y05 LFD a5, 12 * SIZE(AO5) FMADD y06, a6, b3, y06 LFD a6, 12 * SIZE(AO6) FMADD y07, a7, b3, y07 LFD a7, 12 * SIZE(AO7) FMADD y08, a8, b3, y08 LFD a8, 12 * SIZE(AO8) FMADD y09, a1, b4, y09 LFD a1, 13 * SIZE(AO1) FMADD y10, a2, b4, y10 LFD a2, 13 * SIZE(AO2) FMADD y11, a3, b4, y11 LFD a3, 13 * SIZE(AO3) FMADD y12, a4, b4, y12 LFD a4, 13 * SIZE(AO4) FMADD y13, a5, b4, y13 LFD a5, 13 * SIZE(AO5) FMADD y14, a6, b4, y14 LFD a6, 13 * SIZE(AO6) FMADD y15, a7, b4, y15 LFD a7, 13 * SIZE(AO7) FMADD y16, a8, b4, y16 LFD a8, 13 * SIZE(AO8) FMADD y01, a1, b5, y01 LFD a1, 14 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 14 * SIZE(AO2) FMADD y03, a3, b5, y03 LFD a3, 14 * SIZE(AO3) FMADD y04, a4, b5, y04 LFD a4, 14 * SIZE(AO4) FMADD y05, a5, b5, y05 LFD a5, 14 * SIZE(AO5) FMADD y06, a6, b5, y06 LFD a6, 14 * SIZE(AO6) FMADD y07, a7, b5, y07 LFD a7, 14 * SIZE(AO7) FMADD y08, a8, b5, y08 LFD a8, 14 * SIZE(AO8) FMADD y09, a1, b6, y09 LFD a1, 15 * SIZE(AO1) FMADD y10, a2, b6, y10 LFD a2, 15 * SIZE(AO2) FMADD y11, a3, b6, y11 LFD a3, 15 * SIZE(AO3) FMADD y12, a4, b6, y12 LFD a4, 15 * SIZE(AO4) FMADD y13, a5, b6, y13 LFD a5, 15 * SIZE(AO5) FMADD y14, a6, b6, y14 LFD a6, 15 * SIZE(AO6) FMADD y15, a7, b6, y15 LFD a7, 15 * SIZE(AO7) FMADD y16, a8, b6, y16 LFD a8, 15 * SIZE(AO8) FMADD y01, a1, b7, y01 LFD a1, 16 * SIZE(AO1) FMADD y02, a2, b7, y02 LFD a2, 16 * SIZE(AO2) FMADD y03, a3, b7, y03 LFD a3, 16 * SIZE(AO3) FMADD y04, a4, b7, y04 LFD a4, 16 * SIZE(AO4) FMADD y05, a5, b7, y05 LFD a5, 16 * SIZE(AO5) FMADD y06, a6, b7, y06 LFD a6, 16 * SIZE(AO6) FMADD y07, a7, b7, y07 LFD a7, 16 * SIZE(AO7) FMADD y08, a8, b7, y08 LFD a8, 16 * SIZE(AO8) FMADD y09, a1, b8, y09 FMADD y10, a2, b8, y10 FMADD y11, a3, b8, y11 FMADD y12, a4, b8, y12 addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi AO3, AO3, 16 * SIZE addi AO4, AO4, 16 * SIZE FMADD y13, a5, b8, y13 FMADD y14, a6, b8, y14 FMADD y15, a7, b8, y15 FMADD y16, a8, b8, y16 addi AO5, AO5, 16 * SIZE addi AO6, AO6, 16 * SIZE addi AO7, AO7, 16 * SIZE addi AO8, AO8, 16 * SIZE addi BO, BO, 16 * SIZE .align 4 LL(14): andi. r0, MIN_N, 15 ble LL(18) andi. 
r0, MIN_N, 8 ble LL(15) LFD a1, 1 * SIZE(AO1) LFD b1, 1 * SIZE(BO) LFD a2, 1 * SIZE(AO2) LFD a3, 1 * SIZE(AO3) LFD a4, 1 * SIZE(AO4) LFD a5, 1 * SIZE(AO5) LFD a6, 1 * SIZE(AO6) LFD a7, 1 * SIZE(AO7) LFD a8, 1 * SIZE(AO8) LFD b2, 2 * SIZE(BO) LFD b3, 3 * SIZE(BO) LFD b4, 4 * SIZE(BO) FMADD y01, a1, b1, y01 LFD a1, 2 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 2 * SIZE(AO2) FMADD y03, a3, b1, y03 LFD a3, 2 * SIZE(AO3) FMADD y04, a4, b1, y04 LFD a4, 2 * SIZE(AO4) FMADD y05, a5, b1, y05 LFD a5, 2 * SIZE(AO5) FMADD y06, a6, b1, y06 LFD a6, 2 * SIZE(AO6) FMADD y07, a7, b1, y07 LFD a7, 2 * SIZE(AO7) FMADD y08, a8, b1, y08 LFD a8, 2 * SIZE(AO8) FMADD y09, a1, b2, y09 LFD a1, 3 * SIZE(AO1) FMADD y10, a2, b2, y10 LFD a2, 3 * SIZE(AO2) FMADD y11, a3, b2, y11 LFD a3, 3 * SIZE(AO3) FMADD y12, a4, b2, y12 LFD a4, 3 * SIZE(AO4) FMADD y13, a5, b2, y13 LFD a5, 3 * SIZE(AO5) FMADD y14, a6, b2, y14 LFD a6, 3 * SIZE(AO6) FMADD y15, a7, b2, y15 LFD a7, 3 * SIZE(AO7) FMADD y16, a8, b2, y16 LFD a8, 3 * SIZE(AO8) LFD b5, 5 * SIZE(BO) LFD b6, 6 * SIZE(BO) LFD b7, 7 * SIZE(BO) LFD b8, 8 * SIZE(BO) FMADD y01, a1, b3, y01 LFD a1, 4 * SIZE(AO1) FMADD y02, a2, b3, y02 LFD a2, 4 * SIZE(AO2) FMADD y03, a3, b3, y03 LFD a3, 4 * SIZE(AO3) FMADD y04, a4, b3, y04 LFD a4, 4 * SIZE(AO4) FMADD y05, a5, b3, y05 LFD a5, 4 * SIZE(AO5) FMADD y06, a6, b3, y06 LFD a6, 4 * SIZE(AO6) FMADD y07, a7, b3, y07 LFD a7, 4 * SIZE(AO7) FMADD y08, a8, b3, y08 LFD a8, 4 * SIZE(AO8) FMADD y09, a1, b4, y09 LFD a1, 5 * SIZE(AO1) FMADD y10, a2, b4, y10 LFD a2, 5 * SIZE(AO2) FMADD y11, a3, b4, y11 LFD a3, 5 * SIZE(AO3) FMADD y12, a4, b4, y12 LFD a4, 5 * SIZE(AO4) FMADD y13, a5, b4, y13 LFD a5, 5 * SIZE(AO5) FMADD y14, a6, b4, y14 LFD a6, 5 * SIZE(AO6) FMADD y15, a7, b4, y15 LFD a7, 5 * SIZE(AO7) FMADD y16, a8, b4, y16 LFD a8, 5 * SIZE(AO8) FMADD y01, a1, b5, y01 LFD a1, 6 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 6 * SIZE(AO2) FMADD y03, a3, b5, y03 LFD a3, 6 * SIZE(AO3) FMADD y04, a4, b5, y04 LFD a4, 6 * SIZE(AO4) FMADD y05, a5, b5, y05 LFD a5, 6 * SIZE(AO5) FMADD y06, a6, b5, y06 LFD a6, 6 * SIZE(AO6) FMADD y07, a7, b5, y07 LFD a7, 6 * SIZE(AO7) FMADD y08, a8, b5, y08 LFD a8, 6 * SIZE(AO8) FMADD y09, a1, b6, y09 LFD a1, 7 * SIZE(AO1) FMADD y10, a2, b6, y10 LFD a2, 7 * SIZE(AO2) FMADD y11, a3, b6, y11 LFD a3, 7 * SIZE(AO3) FMADD y12, a4, b6, y12 LFD a4, 7 * SIZE(AO4) FMADD y13, a5, b6, y13 LFD a5, 7 * SIZE(AO5) FMADD y14, a6, b6, y14 LFD a6, 7 * SIZE(AO6) FMADD y15, a7, b6, y15 LFD a7, 7 * SIZE(AO7) FMADD y16, a8, b6, y16 LFD a8, 7 * SIZE(AO8) FMADD y01, a1, b7, y01 LFD a1, 8 * SIZE(AO1) FMADD y02, a2, b7, y02 LFD a2, 8 * SIZE(AO2) FMADD y03, a3, b7, y03 LFD a3, 8 * SIZE(AO3) FMADD y04, a4, b7, y04 LFD a4, 8 * SIZE(AO4) FMADD y05, a5, b7, y05 LFD a5, 8 * SIZE(AO5) FMADD y06, a6, b7, y06 LFD a6, 8 * SIZE(AO6) FMADD y07, a7, b7, y07 LFD a7, 8 * SIZE(AO7) FMADD y08, a8, b7, y08 LFD a8, 8 * SIZE(AO8) FMADD y09, a1, b8, y09 addi AO1, AO1, 8 * SIZE FMADD y10, a2, b8, y10 addi AO2, AO2, 8 * SIZE FMADD y11, a3, b8, y11 addi AO3, AO3, 8 * SIZE FMADD y12, a4, b8, y12 addi AO4, AO4, 8 * SIZE FMADD y13, a5, b8, y13 addi AO5, AO5, 8 * SIZE FMADD y14, a6, b8, y14 addi AO6, AO6, 8 * SIZE FMADD y15, a7, b8, y15 addi AO7, AO7, 8 * SIZE FMADD y16, a8, b8, y16 addi AO8, AO8, 8 * SIZE addi BO, BO, 8 * SIZE .align 4 LL(15): andi. 
r0, MIN_N, 4 ble LL(16) LFD a1, 1 * SIZE(AO1) LFD b1, 1 * SIZE(BO) LFD a2, 1 * SIZE(AO2) LFD a3, 1 * SIZE(AO3) LFD a4, 1 * SIZE(AO4) LFD a5, 1 * SIZE(AO5) LFD a6, 1 * SIZE(AO6) LFD a7, 1 * SIZE(AO7) LFD a8, 1 * SIZE(AO8) LFD b2, 2 * SIZE(BO) LFD b3, 3 * SIZE(BO) LFD b4, 4 * SIZE(BO) FMADD y01, a1, b1, y01 LFD a1, 2 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 2 * SIZE(AO2) FMADD y03, a3, b1, y03 LFD a3, 2 * SIZE(AO3) FMADD y04, a4, b1, y04 LFD a4, 2 * SIZE(AO4) FMADD y05, a5, b1, y05 LFD a5, 2 * SIZE(AO5) FMADD y06, a6, b1, y06 LFD a6, 2 * SIZE(AO6) FMADD y07, a7, b1, y07 LFD a7, 2 * SIZE(AO7) FMADD y08, a8, b1, y08 LFD a8, 2 * SIZE(AO8) FMADD y09, a1, b2, y09 LFD a1, 3 * SIZE(AO1) FMADD y10, a2, b2, y10 LFD a2, 3 * SIZE(AO2) FMADD y11, a3, b2, y11 LFD a3, 3 * SIZE(AO3) FMADD y12, a4, b2, y12 LFD a4, 3 * SIZE(AO4) FMADD y13, a5, b2, y13 LFD a5, 3 * SIZE(AO5) FMADD y14, a6, b2, y14 LFD a6, 3 * SIZE(AO6) FMADD y15, a7, b2, y15 LFD a7, 3 * SIZE(AO7) FMADD y16, a8, b2, y16 LFD a8, 3 * SIZE(AO8) FMADD y01, a1, b3, y01 LFD a1, 4 * SIZE(AO1) FMADD y02, a2, b3, y02 LFD a2, 4 * SIZE(AO2) FMADD y03, a3, b3, y03 LFD a3, 4 * SIZE(AO3) FMADD y04, a4, b3, y04 LFD a4, 4 * SIZE(AO4) FMADD y05, a5, b3, y05 LFD a5, 4 * SIZE(AO5) FMADD y06, a6, b3, y06 LFD a6, 4 * SIZE(AO6) FMADD y07, a7, b3, y07 LFD a7, 4 * SIZE(AO7) FMADD y08, a8, b3, y08 LFD a8, 4 * SIZE(AO8) FMADD y09, a1, b4, y09 addi AO1, AO1, 4 * SIZE FMADD y10, a2, b4, y10 addi AO2, AO2, 4 * SIZE FMADD y11, a3, b4, y11 addi AO3, AO3, 4 * SIZE FMADD y12, a4, b4, y12 addi AO4, AO4, 4 * SIZE FMADD y13, a5, b4, y13 addi AO5, AO5, 4 * SIZE FMADD y14, a6, b4, y14 addi AO6, AO6, 4 * SIZE FMADD y15, a7, b4, y15 addi AO7, AO7, 4 * SIZE FMADD y16, a8, b4, y16 addi AO8, AO8, 4 * SIZE addi BO, BO, 4 * SIZE .align 4 LL(16): andi. r0, MIN_N, 2 ble LL(17) LFD a1, 1 * SIZE(AO1) LFD b1, 1 * SIZE(BO) LFD a2, 1 * SIZE(AO2) LFD a3, 1 * SIZE(AO3) LFD a4, 1 * SIZE(AO4) LFD a5, 1 * SIZE(AO5) LFD a6, 1 * SIZE(AO6) LFD a7, 1 * SIZE(AO7) LFD a8, 1 * SIZE(AO8) LFD b2, 2 * SIZE(BO) FMADD y01, a1, b1, y01 LFD a1, 2 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 2 * SIZE(AO2) FMADD y03, a3, b1, y03 LFD a3, 2 * SIZE(AO3) FMADD y04, a4, b1, y04 LFD a4, 2 * SIZE(AO4) FMADD y05, a5, b1, y05 LFD a5, 2 * SIZE(AO5) FMADD y06, a6, b1, y06 LFD a6, 2 * SIZE(AO6) FMADD y07, a7, b1, y07 LFD a7, 2 * SIZE(AO7) FMADD y08, a8, b1, y08 LFD a8, 2 * SIZE(AO8) FMADD y09, a1, b2, y09 addi AO1, AO1, 2 * SIZE addi AO2, AO2, 2 * SIZE FMADD y10, a2, b2, y10 addi AO3, AO3, 2 * SIZE addi AO4, AO4, 2 * SIZE FMADD y11, a3, b2, y11 FMADD y12, a4, b2, y12 addi AO5, AO5, 2 * SIZE addi AO6, AO6, 2 * SIZE FMADD y13, a5, b2, y13 FMADD y14, a6, b2, y14 addi AO7, AO7, 2 * SIZE addi AO8, AO8, 2 * SIZE FMADD y15, a7, b2, y15 FMADD y16, a8, b2, y16 addi BO, BO, 2 * SIZE .align 4 LL(17): andi. 
r0, MIN_N, 1 ble LL(18) LFD a1, 1 * SIZE(AO1) LFD b1, 1 * SIZE(BO) LFD a2, 1 * SIZE(AO2) LFD a3, 1 * SIZE(AO3) LFD a4, 1 * SIZE(AO4) LFD a5, 1 * SIZE(AO5) LFD a6, 1 * SIZE(AO6) LFD a7, 1 * SIZE(AO7) LFD a8, 1 * SIZE(AO8) FMADD y01, a1, b1, y01 FMADD y02, a2, b1, y02 FMADD y03, a3, b1, y03 FMADD y04, a4, b1, y04 FMADD y05, a5, b1, y05 FMADD y06, a6, b1, y06 FMADD y07, a7, b1, y07 FMADD y08, a8, b1, y08 .align 4 LL(18): mr BO, CO lfd alpha, ALPHA cmpi cr0, 0, INCY, SIZE bne LL(19) LFD a1, 1 * SIZE(CO) LFD a2, 2 * SIZE(CO) LFD a3, 3 * SIZE(CO) LFD a4, 4 * SIZE(CO) LFD a5, 5 * SIZE(CO) LFD a6, 6 * SIZE(CO) LFD a7, 7 * SIZE(CO) LFD a8, 8 * SIZE(CO) FADD y01, y09, y01 FADD y02, y10, y02 FADD y03, y11, y03 FADD y04, y12, y04 FADD y05, y13, y05 FADD y06, y14, y06 FADD y07, y15, y07 FADD y08, y16, y08 FMADD a1, alpha, y01, a1 FMADD a2, alpha, y02, a2 FMADD a3, alpha, y03, a3 FMADD a4, alpha, y04, a4 FMADD a5, alpha, y05, a5 FMADD a6, alpha, y06, a6 FMADD a7, alpha, y07, a7 FMADD a8, alpha, y08, a8 STFD a1, 1 * SIZE(CO) STFD a2, 2 * SIZE(CO) STFD a3, 3 * SIZE(CO) STFD a4, 4 * SIZE(CO) STFD a5, 5 * SIZE(CO) STFD a6, 6 * SIZE(CO) STFD a7, 7 * SIZE(CO) STFD a8, 8 * SIZE(CO) addi J, J, -1 addi CO, CO, 8 * SIZE cmpi cr0, 0, J, 0 bgt LL(11) b LL(20) .align 4 LL(19): LFDUX a1, CO, INCY LFDUX a2, CO, INCY LFDUX a3, CO, INCY LFDUX a4, CO, INCY LFDUX a5, CO, INCY LFDUX a6, CO, INCY LFDUX a7, CO, INCY LFDUX a8, CO, INCY FADD y01, y09, y01 FADD y02, y10, y02 FADD y03, y11, y03 FADD y04, y12, y04 FADD y05, y13, y05 FADD y06, y14, y06 FADD y07, y15, y07 FADD y08, y16, y08 FMADD a1, alpha, f0, a1 FMADD a2, alpha, f1, a2 FMADD a3, alpha, f2, a3 FMADD a4, alpha, f3, a4 FMADD a5, alpha, f4, a5 FMADD a6, alpha, f5, a6 FMADD a7, alpha, f6, a7 FMADD a8, alpha, f7, a8 STFDUX a1, BO, INCY STFDUX a2, BO, INCY STFDUX a3, BO, INCY STFDUX a4, BO, INCY STFDUX a5, BO, INCY STFDUX a6, BO, INCY STFDUX a7, BO, INCY STFDUX a8, BO, INCY addi J, J, -1 cmpi cr0, 0, J, 0 bgt LL(11) .align 4 LL(20): andi. J, N, 7 ble LL(99) andi. J, N, 4 ble LL(30) mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA mr BO, XP lfd y01, FZERO fmr y02, y01 fmr y03, y01 fmr y04, y01 fmr y09, y01 fmr y10, y01 fmr y11, y01 fmr y12, y01 DCBT(Y1, PREC) srawi. 
r0, MIN_N, 4 mtspr CTR, r0 ble LL(24) LFD a1, 1 * SIZE(AO1) LFD a2, 1 * SIZE(AO2) LFD a3, 1 * SIZE(AO3) LFD a4, 1 * SIZE(AO4) LFD a5, 2 * SIZE(AO1) LFD a6, 2 * SIZE(AO2) LFD a7, 2 * SIZE(AO3) LFD a8, 2 * SIZE(AO4) LFD b1, 1 * SIZE(BO) LFD b2, 2 * SIZE(BO) LFD b3, 3 * SIZE(BO) LFD b4, 4 * SIZE(BO) LFD b5, 5 * SIZE(BO) LFD b6, 6 * SIZE(BO) LFD b7, 7 * SIZE(BO) LFD b8, 8 * SIZE(BO) bdz LL(23) .align 4 LL(22): FMADD y01, a1, b1, y01 LFD a1, 3 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 3 * SIZE(AO2) FMADD y03, a3, b1, y03 LFD a3, 3 * SIZE(AO3) FMADD y04, a4, b1, y04 LFD a4, 3 * SIZE(AO4) FMADD y09, a5, b2, y09 LFD a5, 4 * SIZE(AO1) FMADD y10, a6, b2, y10 LFD a6, 4 * SIZE(AO2) FMADD y11, a7, b2, y11 LFD a7, 4 * SIZE(AO3) FMADD y12, a8, b2, y12 LFD a8, 4 * SIZE(AO4) FMADD y01, a1, b3, y01 LFD a1, 5 * SIZE(AO1) FMADD y02, a2, b3, y02 LFD a2, 5 * SIZE(AO2) FMADD y03, a3, b3, y03 LFD a3, 5 * SIZE(AO3) FMADD y04, a4, b3, y04 LFD a4, 5 * SIZE(AO4) FMADD y09, a5, b4, y09 LFD a5, 6 * SIZE(AO1) FMADD y10, a6, b4, y10 LFD a6, 6 * SIZE(AO2) FMADD y11, a7, b4, y11 LFD a7, 6 * SIZE(AO3) FMADD y12, a8, b4, y12 LFD a8, 6 * SIZE(AO4) LFD b1, 9 * SIZE(BO) LFD b2, 10 * SIZE(BO) LFD b3, 11 * SIZE(BO) LFD b4, 12 * SIZE(BO) FMADD y01, a1, b5, y01 LFD a1, 7 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 7 * SIZE(AO2) FMADD y03, a3, b5, y03 LFD a3, 7 * SIZE(AO3) FMADD y04, a4, b5, y04 LFD a4, 7 * SIZE(AO4) FMADD y09, a5, b6, y09 LFD a5, 8 * SIZE(AO1) FMADD y10, a6, b6, y10 LFD a6, 8 * SIZE(AO2) FMADD y11, a7, b6, y11 LFD a7, 8 * SIZE(AO3) FMADD y12, a8, b6, y12 LFD a8, 8 * SIZE(AO4) FMADD y01, a1, b7, y01 LFD a1, 9 * SIZE(AO1) FMADD y02, a2, b7, y02 LFD a2, 9 * SIZE(AO2) FMADD y03, a3, b7, y03 LFD a3, 9 * SIZE(AO3) FMADD y04, a4, b7, y04 LFD a4, 9 * SIZE(AO4) FMADD y09, a5, b8, y09 LFD a5, 10 * SIZE(AO1) FMADD y10, a6, b8, y10 LFD a6, 10 * SIZE(AO2) FMADD y11, a7, b8, y11 LFD a7, 10 * SIZE(AO3) FMADD y12, a8, b8, y12 LFD a8, 10 * SIZE(AO4) LFD b5, 13 * SIZE(BO) LFD b6, 14 * SIZE(BO) LFD b7, 15 * SIZE(BO) LFD b8, 16 * SIZE(BO) FMADD y01, a1, b1, y01 LFD a1, 11 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 11 * SIZE(AO2) FMADD y03, a3, b1, y03 LFD a3, 11 * SIZE(AO3) FMADD y04, a4, b1, y04 LFD a4, 11 * SIZE(AO4) FMADD y09, a5, b2, y09 LFD a5, 12 * SIZE(AO1) FMADD y10, a6, b2, y10 LFD a6, 12 * SIZE(AO2) FMADD y11, a7, b2, y11 LFD a7, 12 * SIZE(AO3) FMADD y12, a8, b2, y12 LFD a8, 12 * SIZE(AO4) FMADD y01, a1, b3, y01 LFD a1, 13 * SIZE(AO1) FMADD y02, a2, b3, y02 LFD a2, 13 * SIZE(AO2) FMADD y03, a3, b3, y03 LFD a3, 13 * SIZE(AO3) FMADD y04, a4, b3, y04 LFD a4, 13 * SIZE(AO4) FMADD y09, a5, b4, y09 LFD a5, 14 * SIZE(AO1) FMADD y10, a6, b4, y10 LFD a6, 14 * SIZE(AO2) FMADD y11, a7, b4, y11 LFD a7, 14 * SIZE(AO3) FMADD y12, a8, b4, y12 LFD a8, 14 * SIZE(AO4) LFD b1, 17 * SIZE(BO) LFD b2, 18 * SIZE(BO) LFD b3, 19 * SIZE(BO) LFD b4, 20 * SIZE(BO) FMADD y01, a1, b5, y01 LFD a1, 15 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 15 * SIZE(AO2) FMADD y03, a3, b5, y03 LFD a3, 15 * SIZE(AO3) FMADD y04, a4, b5, y04 LFD a4, 15 * SIZE(AO4) FMADD y09, a5, b6, y09 LFD a5, 16 * SIZE(AO1) FMADD y10, a6, b6, y10 LFD a6, 16 * SIZE(AO2) FMADD y11, a7, b6, y11 LFD a7, 16 * SIZE(AO3) FMADD y12, a8, b6, y12 LFD a8, 16 * SIZE(AO4) FMADD y01, a1, b7, y01 LFD a1, 17 * SIZE(AO1) FMADD y02, a2, b7, y02 LFD a2, 17 * SIZE(AO2) FMADD y03, a3, b7, y03 LFD a3, 17 * SIZE(AO3) FMADD y04, a4, b7, y04 LFD a4, 17 * SIZE(AO4) FMADD y09, a5, b8, y09 LFD a5, 18 * SIZE(AO1) FMADD y10, a6, b8, y10 LFD a6, 18 * SIZE(AO2) FMADD y11, a7, b8, y11 LFD a7, 18 * SIZE(AO3) FMADD y12, 
a8, b8, y12 LFD a8, 18 * SIZE(AO4) LFD b5, 21 * SIZE(BO) LFD b6, 22 * SIZE(BO) LFD b7, 23 * SIZE(BO) LFD b8, 24 * SIZE(BO) addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE DCBT(AO1, PREA) DCBT(AO2, PREA) addi AO3, AO3, 16 * SIZE addi AO4, AO4, 16 * SIZE DCBT(AO3, PREA) DCBT(AO4, PREA) addi BO, BO, 16 * SIZE bdnz LL(22) .align 4 LL(23): FMADD y01, a1, b1, y01 LFD a1, 3 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 3 * SIZE(AO2) FMADD y03, a3, b1, y03 LFD a3, 3 * SIZE(AO3) FMADD y04, a4, b1, y04 LFD a4, 3 * SIZE(AO4) FMADD y09, a5, b2, y09 LFD a5, 4 * SIZE(AO1) FMADD y10, a6, b2, y10 LFD a6, 4 * SIZE(AO2) FMADD y11, a7, b2, y11 LFD a7, 4 * SIZE(AO3) FMADD y12, a8, b2, y12 LFD a8, 4 * SIZE(AO4) FMADD y01, a1, b3, y01 LFD a1, 5 * SIZE(AO1) FMADD y02, a2, b3, y02 LFD a2, 5 * SIZE(AO2) FMADD y03, a3, b3, y03 LFD a3, 5 * SIZE(AO3) FMADD y04, a4, b3, y04 LFD a4, 5 * SIZE(AO4) FMADD y09, a5, b4, y09 LFD a5, 6 * SIZE(AO1) FMADD y10, a6, b4, y10 LFD a6, 6 * SIZE(AO2) FMADD y11, a7, b4, y11 LFD a7, 6 * SIZE(AO3) FMADD y12, a8, b4, y12 LFD a8, 6 * SIZE(AO4) LFD b1, 9 * SIZE(BO) LFD b2, 10 * SIZE(BO) LFD b3, 11 * SIZE(BO) LFD b4, 12 * SIZE(BO) FMADD y01, a1, b5, y01 LFD a1, 7 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 7 * SIZE(AO2) FMADD y03, a3, b5, y03 LFD a3, 7 * SIZE(AO3) FMADD y04, a4, b5, y04 LFD a4, 7 * SIZE(AO4) FMADD y09, a5, b6, y09 LFD a5, 8 * SIZE(AO1) FMADD y10, a6, b6, y10 LFD a6, 8 * SIZE(AO2) FMADD y11, a7, b6, y11 LFD a7, 8 * SIZE(AO3) FMADD y12, a8, b6, y12 LFD a8, 8 * SIZE(AO4) FMADD y01, a1, b7, y01 LFD a1, 9 * SIZE(AO1) FMADD y02, a2, b7, y02 LFD a2, 9 * SIZE(AO2) FMADD y03, a3, b7, y03 LFD a3, 9 * SIZE(AO3) FMADD y04, a4, b7, y04 LFD a4, 9 * SIZE(AO4) FMADD y09, a5, b8, y09 LFD a5, 10 * SIZE(AO1) FMADD y10, a6, b8, y10 LFD a6, 10 * SIZE(AO2) FMADD y11, a7, b8, y11 LFD a7, 10 * SIZE(AO3) FMADD y12, a8, b8, y12 LFD a8, 10 * SIZE(AO4) LFD b5, 13 * SIZE(BO) LFD b6, 14 * SIZE(BO) LFD b7, 15 * SIZE(BO) LFD b8, 16 * SIZE(BO) FMADD y01, a1, b1, y01 LFD a1, 11 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 11 * SIZE(AO2) FMADD y03, a3, b1, y03 LFD a3, 11 * SIZE(AO3) FMADD y04, a4, b1, y04 LFD a4, 11 * SIZE(AO4) FMADD y09, a5, b2, y09 LFD a5, 12 * SIZE(AO1) FMADD y10, a6, b2, y10 LFD a6, 12 * SIZE(AO2) FMADD y11, a7, b2, y11 LFD a7, 12 * SIZE(AO3) FMADD y12, a8, b2, y12 LFD a8, 12 * SIZE(AO4) FMADD y01, a1, b3, y01 LFD a1, 13 * SIZE(AO1) FMADD y02, a2, b3, y02 LFD a2, 13 * SIZE(AO2) FMADD y03, a3, b3, y03 LFD a3, 13 * SIZE(AO3) FMADD y04, a4, b3, y04 LFD a4, 13 * SIZE(AO4) FMADD y09, a5, b4, y09 LFD a5, 14 * SIZE(AO1) FMADD y10, a6, b4, y10 LFD a6, 14 * SIZE(AO2) FMADD y11, a7, b4, y11 LFD a7, 14 * SIZE(AO3) FMADD y12, a8, b4, y12 LFD a8, 14 * SIZE(AO4) FMADD y01, a1, b5, y01 LFD a1, 15 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 15 * SIZE(AO2) FMADD y03, a3, b5, y03 LFD a3, 15 * SIZE(AO3) FMADD y04, a4, b5, y04 LFD a4, 15 * SIZE(AO4) FMADD y09, a5, b6, y09 LFD a5, 16 * SIZE(AO1) FMADD y10, a6, b6, y10 LFD a6, 16 * SIZE(AO2) FMADD y11, a7, b6, y11 LFD a7, 16 * SIZE(AO3) FMADD y12, a8, b6, y12 LFD a8, 16 * SIZE(AO4) FMADD y01, a1, b7, y01 FMADD y02, a2, b7, y02 FMADD y03, a3, b7, y03 FMADD y04, a4, b7, y04 FMADD y09, a5, b8, y09 FMADD y10, a6, b8, y10 FMADD y11, a7, b8, y11 FMADD y12, a8, b8, y12 addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi AO3, AO3, 16 * SIZE addi AO4, AO4, 16 * SIZE addi BO, BO, 16 * SIZE .align 4 LL(24): andi. r0, MIN_N, 15 ble LL(28) andi. 
r0, MIN_N, 8 ble LL(25) LFD a1, 1 * SIZE(AO1) LFD a2, 1 * SIZE(AO2) LFD a3, 1 * SIZE(AO3) LFD a4, 1 * SIZE(AO4) LFD b1, 1 * SIZE(BO) LFD b2, 2 * SIZE(BO) LFD b3, 3 * SIZE(BO) LFD b4, 4 * SIZE(BO) LFD a5, 2 * SIZE(AO1) LFD a6, 2 * SIZE(AO2) LFD a7, 2 * SIZE(AO3) LFD a8, 2 * SIZE(AO4) FMADD y01, a1, b1, y01 LFD a1, 3 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 3 * SIZE(AO2) FMADD y03, a3, b1, y03 LFD a3, 3 * SIZE(AO3) FMADD y04, a4, b1, y04 LFD a4, 3 * SIZE(AO4) FMADD y09, a5, b2, y09 LFD a5, 4 * SIZE(AO1) FMADD y10, a6, b2, y10 LFD a6, 4 * SIZE(AO2) FMADD y11, a7, b2, y11 LFD a7, 4 * SIZE(AO3) FMADD y12, a8, b2, y12 LFD a8, 4 * SIZE(AO4) FMADD y01, a1, b3, y01 LFD a1, 5 * SIZE(AO1) FMADD y02, a2, b3, y02 LFD a2, 5 * SIZE(AO2) FMADD y03, a3, b3, y03 LFD a3, 5 * SIZE(AO3) FMADD y04, a4, b3, y04 LFD a4, 5 * SIZE(AO4) FMADD y09, a5, b4, y09 LFD a5, 6 * SIZE(AO1) FMADD y10, a6, b4, y10 LFD a6, 6 * SIZE(AO2) FMADD y11, a7, b4, y11 LFD a7, 6 * SIZE(AO3) FMADD y12, a8, b4, y12 LFD a8, 6 * SIZE(AO4) LFD b1, 5 * SIZE(BO) LFD b2, 6 * SIZE(BO) LFD b3, 7 * SIZE(BO) LFD b4, 8 * SIZE(BO) FMADD y01, a1, b1, y01 LFD a1, 7 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 7 * SIZE(AO2) FMADD y03, a3, b1, y03 LFD a3, 7 * SIZE(AO3) FMADD y04, a4, b1, y04 LFD a4, 7 * SIZE(AO4) FMADD y09, a5, b2, y09 LFD a5, 8 * SIZE(AO1) FMADD y10, a6, b2, y10 LFD a6, 8 * SIZE(AO2) FMADD y11, a7, b2, y11 LFD a7, 8 * SIZE(AO3) FMADD y12, a8, b2, y12 LFD a8, 8 * SIZE(AO4) FMADD y01, a1, b3, y01 FMADD y02, a2, b3, y02 FMADD y03, a3, b3, y03 FMADD y04, a4, b3, y04 FMADD y09, a5, b4, y09 addi AO1, AO1, 8 * SIZE FMADD y10, a6, b4, y10 addi AO2, AO2, 8 * SIZE FMADD y11, a7, b4, y11 addi AO3, AO3, 8 * SIZE FMADD y12, a8, b4, y12 addi AO4, AO4, 8 * SIZE addi BO, BO, 8 * SIZE .align 4 LL(25): andi. r0, MIN_N, 4 ble LL(26) LFD a1, 1 * SIZE(AO1) LFD a2, 1 * SIZE(AO2) LFD a3, 1 * SIZE(AO3) LFD a4, 1 * SIZE(AO4) LFD b1, 1 * SIZE(BO) LFD b2, 2 * SIZE(BO) LFD b3, 3 * SIZE(BO) LFD b4, 4 * SIZE(BO) LFD a5, 2 * SIZE(AO1) LFD a6, 2 * SIZE(AO2) LFD a7, 2 * SIZE(AO3) LFD a8, 2 * SIZE(AO4) FMADD y01, a1, b1, y01 LFD a1, 3 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 3 * SIZE(AO2) FMADD y03, a3, b1, y03 LFD a3, 3 * SIZE(AO3) FMADD y04, a4, b1, y04 LFD a4, 3 * SIZE(AO4) FMADD y09, a5, b2, y09 LFD a5, 4 * SIZE(AO1) FMADD y10, a6, b2, y10 LFD a6, 4 * SIZE(AO2) FMADD y11, a7, b2, y11 LFD a7, 4 * SIZE(AO3) FMADD y12, a8, b2, y12 LFD a8, 4 * SIZE(AO4) FMADD y01, a1, b3, y01 FMADD y02, a2, b3, y02 FMADD y03, a3, b3, y03 FMADD y04, a4, b3, y04 FMADD y09, a5, b4, y09 addi AO1, AO1, 4 * SIZE FMADD y10, a6, b4, y10 addi AO2, AO2, 4 * SIZE FMADD y11, a7, b4, y11 addi AO3, AO3, 4 * SIZE FMADD y12, a8, b4, y12 addi AO4, AO4, 4 * SIZE addi BO, BO, 4 * SIZE .align 4 LL(26): andi. r0, MIN_N, 2 ble LL(27) LFD a1, 1 * SIZE(AO1) LFD a2, 1 * SIZE(AO2) LFD b1, 1 * SIZE(BO) LFD b2, 2 * SIZE(BO) LFD a3, 1 * SIZE(AO3) LFD a4, 1 * SIZE(AO4) LFD a5, 2 * SIZE(AO1) LFD a6, 2 * SIZE(AO2) LFD a7, 2 * SIZE(AO3) LFD a8, 2 * SIZE(AO4) FMADD y01, a1, b1, y01 FMADD y02, a2, b1, y02 FMADD y03, a3, b1, y03 FMADD y04, a4, b1, y04 FMADD y09, a5, b2, y09 addi AO1, AO1, 2 * SIZE FMADD y10, a6, b2, y10 addi AO2, AO2, 2 * SIZE FMADD y11, a7, b2, y11 addi AO3, AO3, 2 * SIZE FMADD y12, a8, b2, y12 addi AO4, AO4, 2 * SIZE addi BO, BO, 2 * SIZE .align 4 LL(27): andi. 
r0, MIN_N, 1 ble LL(28) LFD a1, 1 * SIZE(AO1) LFD b1, 1 * SIZE(BO) LFD a2, 1 * SIZE(AO2) LFD a3, 1 * SIZE(AO3) LFD a4, 1 * SIZE(AO4) FMADD y01, a1, b1, y01 FMADD y02, a2, b1, y02 FMADD y03, a3, b1, y03 FMADD y04, a4, b1, y04 .align 4 LL(28): mr BO, CO lfd alpha, ALPHA cmpi cr0, 0, INCY, SIZE bne LL(29) LFD a1, 1 * SIZE(CO) LFD a2, 2 * SIZE(CO) LFD a3, 3 * SIZE(CO) LFD a4, 4 * SIZE(CO) FADD y01, y09, y01 FADD y02, y10, y02 FADD y03, y11, y03 FADD y04, y12, y04 FMADD a1, alpha, y01, a1 FMADD a2, alpha, y02, a2 FMADD a3, alpha, y03, a3 FMADD a4, alpha, y04, a4 STFD a1, 1 * SIZE(CO) STFD a2, 2 * SIZE(CO) STFD a3, 3 * SIZE(CO) STFD a4, 4 * SIZE(CO) addi CO, CO, 4 * SIZE b LL(30) .align 4 LL(29): LFDUX a1, CO, INCY LFDUX a2, CO, INCY LFDUX a3, CO, INCY LFDUX a4, CO, INCY FADD y01, y09, y01 FADD y02, y10, y02 FADD y03, y11, y03 FADD y04, y12, y04 FMADD a1, alpha, f0, a1 FMADD a2, alpha, f1, a2 FMADD a3, alpha, f2, a3 FMADD a4, alpha, f3, a4 STFDUX a1, BO, INCY STFDUX a2, BO, INCY STFDUX a3, BO, INCY STFDUX a4, BO, INCY .align 4 LL(30): andi. J, N, 2 ble LL(40) mr AO1, A add AO2, A, LDA add A, AO2, LDA mr BO, XP lfd y01, FZERO fmr y02, y01 fmr y03, y01 fmr y04, y01 fmr y09, y01 fmr y10, y01 fmr y11, y01 fmr y12, y01 DCBT(Y1, PREC) srawi. r0, MIN_N, 4 mtspr CTR, r0 ble LL(34) LFD a1, 1 * SIZE(AO1) LFD a2, 1 * SIZE(AO2) LFD a3, 2 * SIZE(AO1) LFD a4, 2 * SIZE(AO2) LFD a5, 3 * SIZE(AO1) LFD a6, 3 * SIZE(AO2) LFD a7, 4 * SIZE(AO1) LFD a8, 4 * SIZE(AO2) LFD b1, 1 * SIZE(BO) LFD b2, 2 * SIZE(BO) LFD b3, 3 * SIZE(BO) LFD b4, 4 * SIZE(BO) LFD b5, 5 * SIZE(BO) LFD b6, 6 * SIZE(BO) LFD b7, 7 * SIZE(BO) LFD b8, 8 * SIZE(BO) bdz LL(33) .align 4 LL(32): FMADD y01, a1, b1, y01 LFD a1, 5 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 5 * SIZE(AO2) FMADD y03, a3, b2, y03 LFD a3, 6 * SIZE(AO1) FMADD y04, a4, b2, y04 LFD a4, 6 * SIZE(AO2) FMADD y09, a5, b3, y09 LFD a5, 7 * SIZE(AO1) FMADD y10, a6, b3, y10 LFD a6, 7 * SIZE(AO2) FMADD y11, a7, b4, y11 LFD a7, 8 * SIZE(AO1) FMADD y12, a8, b4, y12 LFD a8, 8 * SIZE(AO2) LFD b1, 9 * SIZE(BO) LFD b2, 10 * SIZE(BO) LFD b3, 11 * SIZE(BO) LFD b4, 12 * SIZE(BO) FMADD y01, a1, b5, y01 LFD a1, 9 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 9 * SIZE(AO2) FMADD y03, a3, b6, y03 LFD a3, 10 * SIZE(AO1) FMADD y04, a4, b6, y04 LFD a4, 10 * SIZE(AO2) FMADD y09, a5, b7, y09 LFD a5, 11 * SIZE(AO1) FMADD y10, a6, b7, y10 LFD a6, 11 * SIZE(AO2) FMADD y11, a7, b8, y11 LFD a7, 12 * SIZE(AO1) FMADD y12, a8, b8, y12 LFD a8, 12 * SIZE(AO2) LFD b5, 13 * SIZE(BO) LFD b6, 14 * SIZE(BO) LFD b7, 15 * SIZE(BO) LFD b8, 16 * SIZE(BO) FMADD y01, a1, b1, y01 LFD a1, 13 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 13 * SIZE(AO2) FMADD y03, a3, b2, y03 LFD a3, 14 * SIZE(AO1) FMADD y04, a4, b2, y04 LFD a4, 14 * SIZE(AO2) FMADD y09, a5, b3, y09 LFD a5, 15 * SIZE(AO1) FMADD y10, a6, b3, y10 LFD a6, 15 * SIZE(AO2) FMADD y11, a7, b4, y11 LFD a7, 16 * SIZE(AO1) FMADD y12, a8, b4, y12 LFD a8, 16 * SIZE(AO2) LFD b1, 17 * SIZE(BO) LFD b2, 18 * SIZE(BO) LFD b3, 19 * SIZE(BO) LFD b4, 20 * SIZE(BO) FMADD y01, a1, b5, y01 LFD a1, 17 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 17 * SIZE(AO2) FMADD y03, a3, b6, y03 LFD a3, 18 * SIZE(AO1) FMADD y04, a4, b6, y04 LFD a4, 18 * SIZE(AO2) FMADD y09, a5, b7, y09 LFD a5, 19 * SIZE(AO1) FMADD y10, a6, b7, y10 LFD a6, 19 * SIZE(AO2) FMADD y11, a7, b8, y11 LFD a7, 20 * SIZE(AO1) FMADD y12, a8, b8, y12 LFD a8, 20 * SIZE(AO2) LFD b5, 21 * SIZE(BO) LFD b6, 22 * SIZE(BO) LFD b7, 23 * SIZE(BO) LFD b8, 24 * SIZE(BO) addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE DCBT(AO1, PREA) DCBT(AO2, PREA) 
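/* Descriptive note (added comment): this is the body of the LL(32) main loop for the
   two-column (N & 2) case. Each iteration processes 16 elements of the two A columns,
   accumulating into y01..y04 and y09..y12 with FMADD while the next A and X(BO) values
   are streamed in with LFD and the column pointers are prefetched via DCBT(AOx, PREA). */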
addi BO, BO, 16 * SIZE bdnz LL(32) .align 4 LL(33): FMADD y01, a1, b1, y01 LFD a1, 5 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 5 * SIZE(AO2) FMADD y03, a3, b2, y03 LFD a3, 6 * SIZE(AO1) FMADD y04, a4, b2, y04 LFD a4, 6 * SIZE(AO2) FMADD y09, a5, b3, y09 LFD a5, 7 * SIZE(AO1) FMADD y10, a6, b3, y10 LFD a6, 7 * SIZE(AO2) FMADD y11, a7, b4, y11 LFD a7, 8 * SIZE(AO1) FMADD y12, a8, b4, y12 LFD a8, 8 * SIZE(AO2) LFD b1, 9 * SIZE(BO) LFD b2, 10 * SIZE(BO) LFD b3, 11 * SIZE(BO) LFD b4, 12 * SIZE(BO) FMADD y01, a1, b5, y01 LFD a1, 9 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 9 * SIZE(AO2) FMADD y03, a3, b6, y03 LFD a3, 10 * SIZE(AO1) FMADD y04, a4, b6, y04 LFD a4, 10 * SIZE(AO2) FMADD y09, a5, b7, y09 LFD a5, 11 * SIZE(AO1) FMADD y10, a6, b7, y10 LFD a6, 11 * SIZE(AO2) FMADD y11, a7, b8, y11 LFD a7, 12 * SIZE(AO1) FMADD y12, a8, b8, y12 LFD a8, 12 * SIZE(AO2) LFD b5, 13 * SIZE(BO) LFD b6, 14 * SIZE(BO) LFD b7, 15 * SIZE(BO) LFD b8, 16 * SIZE(BO) FMADD y01, a1, b1, y01 LFD a1, 13 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 13 * SIZE(AO2) FMADD y03, a3, b2, y03 LFD a3, 14 * SIZE(AO1) FMADD y04, a4, b2, y04 LFD a4, 14 * SIZE(AO2) FMADD y09, a5, b3, y09 LFD a5, 15 * SIZE(AO1) FMADD y10, a6, b3, y10 LFD a6, 15 * SIZE(AO2) FMADD y11, a7, b4, y11 LFD a7, 16 * SIZE(AO1) FMADD y12, a8, b4, y12 LFD a8, 16 * SIZE(AO2) FMADD y01, a1, b5, y01 FMADD y02, a2, b5, y02 FMADD y03, a3, b6, y03 FMADD y04, a4, b6, y04 FMADD y09, a5, b7, y09 FMADD y10, a6, b7, y10 FMADD y11, a7, b8, y11 FMADD y12, a8, b8, y12 addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi BO, BO, 16 * SIZE .align 4 LL(34): andi. r0, MIN_N, 15 ble LL(38) andi. r0, MIN_N, 8 ble LL(35) LFD a1, 1 * SIZE(AO1) LFD a2, 1 * SIZE(AO2) LFD a3, 2 * SIZE(AO1) LFD a4, 2 * SIZE(AO2) LFD b1, 1 * SIZE(BO) LFD b2, 2 * SIZE(BO) LFD b3, 3 * SIZE(BO) LFD b4, 4 * SIZE(BO) LFD a5, 3 * SIZE(AO1) LFD a6, 3 * SIZE(AO2) LFD a7, 4 * SIZE(AO1) LFD a8, 4 * SIZE(AO2) LFD b5, 5 * SIZE(BO) LFD b6, 6 * SIZE(BO) LFD b7, 7 * SIZE(BO) LFD b8, 8 * SIZE(BO) FMADD y01, a1, b1, y01 LFD a1, 5 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 5 * SIZE(AO2) FMADD y09, a3, b2, y09 LFD a3, 6 * SIZE(AO1) FMADD y10, a4, b2, y10 LFD a4, 6 * SIZE(AO2) FMADD y01, a5, b3, y01 LFD a5, 7 * SIZE(AO1) FMADD y02, a6, b3, y02 LFD a6, 7 * SIZE(AO2) FMADD y09, a7, b4, y09 LFD a7, 8 * SIZE(AO1) FMADD y10, a8, b4, y10 LFD a8, 8 * SIZE(AO2) FMADD y01, a1, b5, y01 FMADD y02, a2, b5, y02 FMADD y09, a3, b6, y09 FMADD y10, a4, b6, y10 FMADD y01, a5, b7, y01 addi AO1, AO1, 8 * SIZE FMADD y02, a6, b7, y02 addi AO2, AO2, 8 * SIZE FMADD y09, a7, b8, y09 addi BO, BO, 8 * SIZE FMADD y10, a8, b8, y10 nop .align 4 LL(35): andi. r0, MIN_N, 4 ble LL(36) LFD a1, 1 * SIZE(AO1) LFD a2, 1 * SIZE(AO2) LFD a3, 2 * SIZE(AO1) LFD a4, 2 * SIZE(AO2) LFD a5, 3 * SIZE(AO1) LFD a6, 3 * SIZE(AO2) LFD a7, 4 * SIZE(AO1) LFD a8, 4 * SIZE(AO2) LFD b1, 1 * SIZE(BO) LFD b2, 2 * SIZE(BO) LFD b3, 3 * SIZE(BO) LFD b4, 4 * SIZE(BO) FMADD y01, a1, b1, y01 FMADD y02, a2, b1, y02 FMADD y09, a3, b2, y09 FMADD y10, a4, b2, y10 FMADD y01, a5, b3, y01 addi AO1, AO1, 4 * SIZE FMADD y02, a6, b3, y02 addi AO2, AO2, 4 * SIZE FMADD y09, a7, b4, y09 addi BO, BO, 4 * SIZE FMADD y10, a8, b4, y10 .align 4 LL(36): andi. r0, MIN_N, 2 ble LL(37) LFD a1, 1 * SIZE(AO1) LFD a2, 1 * SIZE(AO2) LFD b1, 1 * SIZE(BO) LFD b2, 2 * SIZE(BO) LFD a3, 2 * SIZE(AO1) LFD a4, 2 * SIZE(AO2) FMADD y01, a1, b1, y01 FMADD y02, a2, b1, y02 FMADD y09, a3, b2, y09 FMADD y10, a4, b2, y10 addi AO1, AO1, 2 * SIZE addi AO2, AO2, 2 * SIZE addi BO, BO, 2 * SIZE .align 4 LL(37): andi. 
r0, MIN_N, 1 ble LL(38) LFD a1, 1 * SIZE(AO1) LFD b1, 1 * SIZE(BO) LFD a2, 1 * SIZE(AO2) FMADD y01, a1, b1, y01 FMADD y02, a2, b1, y02 .align 4 LL(38): mr BO, CO lfd alpha, ALPHA cmpi cr0, 0, INCY, SIZE bne LL(39) LFD a1, 1 * SIZE(CO) LFD a2, 2 * SIZE(CO) FADD y01, y03, y01 FADD y02, y04, y02 FADD y09, y11, y09 FADD y10, y12, y10 FADD y01, y09, y01 FADD y02, y10, y02 FMADD a1, alpha, y01, a1 FMADD a2, alpha, y02, a2 STFD a1, 1 * SIZE(CO) STFD a2, 2 * SIZE(CO) addi CO, CO, 2 * SIZE b LL(40) .align 4 LL(39): LFDUX a1, CO, INCY LFDUX a2, CO, INCY FADD y01, y03, y01 FADD y02, y04, y02 FADD y09, y11, y09 FADD y10, y12, y10 FADD y01, y09, y01 FADD y02, y10, y02 FMADD a1, alpha, f0, a1 FMADD a2, alpha, f1, a2 STFDUX a1, BO, INCY STFDUX a2, BO, INCY .align 4 LL(40): andi. J, N, 1 ble LL(99) mr AO1, A add A, A, LDA mr BO, XP lfd y01, FZERO fmr y02, y01 fmr y03, y01 fmr y04, y01 fmr y09, y01 fmr y10, y01 fmr y11, y01 fmr y12, y01 DCBT(Y1, PREC) srawi. r0, MIN_N, 4 mtspr CTR, r0 ble LL(44) LFD a1, 1 * SIZE(AO1) LFD a2, 2 * SIZE(AO1) LFD a3, 3 * SIZE(AO1) LFD a4, 4 * SIZE(AO1) LFD a5, 5 * SIZE(AO1) LFD a6, 6 * SIZE(AO1) LFD a7, 7 * SIZE(AO1) LFD a8, 8 * SIZE(AO1) LFD b1, 1 * SIZE(BO) LFD b2, 2 * SIZE(BO) LFD b3, 3 * SIZE(BO) LFD b4, 4 * SIZE(BO) LFD b5, 5 * SIZE(BO) LFD b6, 6 * SIZE(BO) LFD b7, 7 * SIZE(BO) LFD b8, 8 * SIZE(BO) bdz LL(43) .align 4 LL(42): FMADD y01, a1, b1, y01 nop LFD a1, 9 * SIZE(AO1) LFD b1, 9 * SIZE(BO) FMADD y02, a2, b2, y02 nop LFD a2, 10 * SIZE(AO1) LFD b2, 10 * SIZE(BO) FMADD y03, a3, b3, y03 nop LFD a3, 11 * SIZE(AO1) LFD b3, 11 * SIZE(BO) FMADD y04, a4, b4, y04 nop LFD a4, 12 * SIZE(AO1) LFD b4, 12 * SIZE(BO) FMADD y01, a5, b5, y01 nop LFD a5, 13 * SIZE(AO1) LFD b5, 13 * SIZE(BO) FMADD y02, a6, b6, y02 nop LFD a6, 14 * SIZE(AO1) LFD b6, 14 * SIZE(BO) FMADD y03, a7, b7, y03 nop LFD a7, 15 * SIZE(AO1) LFD b7, 15 * SIZE(BO) FMADD y04, a8, b8, y04 nop LFD a8, 16 * SIZE(AO1) LFD b8, 16 * SIZE(BO) FMADD y01, a1, b1, y01 nop LFD a1, 17 * SIZE(AO1) LFD b1, 17 * SIZE(BO) FMADD y02, a2, b2, y02 nop LFD a2, 18 * SIZE(AO1) LFD b2, 18 * SIZE(BO) FMADD y03, a3, b3, y03 nop LFD a3, 19 * SIZE(AO1) LFD b3, 19 * SIZE(BO) FMADD y04, a4, b4, y04 nop LFD a4, 20 * SIZE(AO1) LFD b4, 20 * SIZE(BO) FMADD y01, a5, b5, y01 nop LFD a5, 21 * SIZE(AO1) LFD b5, 21 * SIZE(BO) FMADD y02, a6, b6, y02 nop LFD a6, 22 * SIZE(AO1) LFD b6, 22 * SIZE(BO) FMADD y03, a7, b7, y03 nop LFD a7, 23 * SIZE(AO1) LFD b7, 23 * SIZE(BO) FMADD y04, a8, b8, y04 nop LFD a8, 24 * SIZE(AO1) LFD b8, 24 * SIZE(BO) addi AO1, AO1, 16 * SIZE addi BO, BO, 16 * SIZE DCBT(AO1, PREA) bdnz LL(42) .align 4 LL(43): FMADD y01, a1, b1, y01 nop LFD a1, 9 * SIZE(AO1) LFD b1, 9 * SIZE(BO) FMADD y02, a2, b2, y02 nop LFD a2, 10 * SIZE(AO1) LFD b2, 10 * SIZE(BO) FMADD y03, a3, b3, y03 nop LFD a3, 11 * SIZE(AO1) LFD b3, 11 * SIZE(BO) FMADD y04, a4, b4, y04 nop LFD a4, 12 * SIZE(AO1) LFD b4, 12 * SIZE(BO) FMADD y01, a5, b5, y01 nop LFD a5, 13 * SIZE(AO1) LFD b5, 13 * SIZE(BO) FMADD y02, a6, b6, y02 nop LFD a6, 14 * SIZE(AO1) LFD b6, 14 * SIZE(BO) FMADD y03, a7, b7, y03 nop LFD a7, 15 * SIZE(AO1) LFD b7, 15 * SIZE(BO) FMADD y04, a8, b8, y04 nop LFD a8, 16 * SIZE(AO1) LFD b8, 16 * SIZE(BO) FMADD y01, a1, b1, y01 FMADD y02, a2, b2, y02 FMADD y03, a3, b3, y03 FMADD y04, a4, b4, y04 FMADD y01, a5, b5, y01 addi AO1, AO1, 16 * SIZE FMADD y02, a6, b6, y02 addi BO, BO, 16 * SIZE FMADD y03, a7, b7, y03 nop FMADD y04, a8, b8, y04 nop .align 4 LL(44): andi. r0, MIN_N, 15 ble LL(48) andi. 
r0, MIN_N, 8 ble LL(45) LFD a1, 1 * SIZE(AO1) LFD a2, 2 * SIZE(AO1) LFD a3, 3 * SIZE(AO1) LFD a4, 4 * SIZE(AO1) LFD b1, 1 * SIZE(BO) LFD b2, 2 * SIZE(BO) LFD b3, 3 * SIZE(BO) LFD b4, 4 * SIZE(BO) LFD a5, 5 * SIZE(AO1) LFD a6, 6 * SIZE(AO1) LFD a7, 7 * SIZE(AO1) LFD a8, 8 * SIZE(AO1) LFD b5, 5 * SIZE(BO) LFD b6, 6 * SIZE(BO) LFD b7, 7 * SIZE(BO) LFD b8, 8 * SIZE(BO) FMADD y01, a1, b1, y01 FMADD y02, a2, b2, y02 FMADD y03, a3, b3, y03 FMADD y04, a4, b4, y04 FMADD y01, a5, b5, y01 addi AO1, AO1, 8 * SIZE FMADD y02, a6, b6, y02 addi BO, BO, 8 * SIZE FMADD y03, a7, b7, y03 nop FMADD y04, a8, b8, y04 nop .align 4 LL(45): andi. r0, MIN_N, 4 ble LL(46) LFD a1, 1 * SIZE(AO1) LFD b1, 1 * SIZE(BO) LFD a2, 2 * SIZE(AO1) LFD b2, 2 * SIZE(BO) LFD a3, 3 * SIZE(AO1) LFD b3, 3 * SIZE(BO) LFD a4, 4 * SIZE(AO1) LFD b4, 4 * SIZE(BO) FMADD y01, a1, b1, y01 addi AO1, AO1, 4 * SIZE FMADD y02, a2, b2, y02 addi AO2, AO2, 4 * SIZE FMADD y03, a3, b3, y03 addi BO, BO, 4 * SIZE FMADD y04, a4, b4, y04 nop .align 4 LL(46): andi. r0, MIN_N, 2 ble LL(47) LFD a1, 1 * SIZE(AO1) LFD b1, 1 * SIZE(BO) LFD a2, 2 * SIZE(AO1) LFD b2, 2 * SIZE(BO) FMADD y01, a1, b1, y01 addi AO1, AO1, 2 * SIZE FMADD y02, a2, b2, y02 addi BO, BO, 2 * SIZE .align 4 LL(47): andi. r0, MIN_N, 1 ble LL(48) LFD a1, 1 * SIZE(AO1) LFD b1, 1 * SIZE(BO) FMADD y01, a1, b1, y01 .align 4 LL(48): mr BO, CO lfd alpha, ALPHA cmpi cr0, 0, INCY, SIZE bne LL(49) LFD a1, 1 * SIZE(CO) FADD y01, y02, y01 FADD y03, y04, y03 FADD y01, y03, y01 FMADD a1, alpha, y01, a1 STFD a1, 1 * SIZE(CO) b LL(99) .align 4 LL(49): LFDUX a1, CO, INCY FADD y01, y02, y01 FADD y03, y04, y03 FADD y01, y03, y01 FMADD a1, alpha, f0, a1 STFDUX a1, BO, INCY .align 4 LL(99): subf A, PLDA_M, A addi IS, IS, P cmp cr0, 0, IS, M blt LL(ISLoop) .align 4 LL(999): li r3, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r14, 160(SP) ld r15, 168(SP) ld r16, 176(SP) ld r17, 184(SP) ld r18, 192(SP) ld r19, 200(SP) ld r20, 208(SP) ld r21, 216(SP) ld r22, 224(SP) ld r23, 232(SP) ld r24, 240(SP) ld r25, 248(SP) ld r26, 256(SP) ld r27, 264(SP) ld r28, 272(SP) ld r29, 280(SP) #else lwz r14, 160(SP) lwz r15, 164(SP) lwz r16, 168(SP) lwz r17, 172(SP) lwz r18, 176(SP) lwz r19, 180(SP) lwz r20, 184(SP) lwz r21, 188(SP) lwz r22, 192(SP) lwz r23, 196(SP) lwz r24, 200(SP) lwz r25, 204(SP) lwz r26, 208(SP) lwz r27, 212(SP) lwz r28, 216(SP) lwz r29, 220(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/gemv_t_ppc440.S000066400000000000000000000474551313527062700202760ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 #define A r6 #define LDA r7 #define X r8 #define INCX r9 #define Y r10 #define INCY r5 #else #define M r3 #define N r4 #define A r7 #define LDA r8 #define X r9 #define INCX r10 #define Y r5 #define INCY r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define M r3 #define N r4 #define A r8 #define LDA r9 #define X r10 #define INCX r5 #define Y r6 #define INCY r7 #else #define M r3 #define N r4 #define A r7 #define LDA r8 #define X r9 #define INCX r10 #define Y r5 #define INCY r6 #endif #endif #define BUFFER r11 #define XP r12 #define AO1 r14 #define AO2 r15 #define AO3 r16 #define AO4 r17 #define J r18 #define YY r19 #define PREA r20 #define PREC r21 #define X1 r22 #if defined(PPCG4) #define PREFETCHSIZE_A 42 #define PREFETCHSIZE_C 7 #endif #if defined(POWER6) #define PREFETCHSIZE_A 42 #define PREFETCHSIZE_C 7 #endif #define y01 f0 #define y02 f1 #define y03 f2 #define y04 f3 #define y05 f4 #define y06 f5 #define y07 f6 #define y08 f7 #define a1 f8 #define a2 f9 #define a3 f10 #define a4 f11 #define a5 f12 #define a6 f13 #define a7 f14 #define a8 f15 #define b1 f16 #define b2 f17 #define b3 f18 #define b4 f19 #define b5 f20 #define b6 f21 #define b7 f22 #define b8 f23 #define alpha f23 #ifndef NEEDPARAM #ifndef __64BIT__ #define STACKSIZE 224 #else #define STACKSIZE 288 #endif #define FZERO 144(SP) #define ALPHA 152(SP) PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) #ifdef __64BIT__ std r0, FZERO stfd f1, ALPHA std r14, 160(SP) std r15, 168(SP) std r16, 176(SP) std r17, 184(SP) std r18, 192(SP) std r19, 200(SP) std r20, 208(SP) std r21, 216(SP) std r22, 224(SP) #else stw r0, 0 + FZERO stw r0, 4 + FZERO stfd f1, ALPHA stw r14, 160(SP) stw r15, 164(SP) stw r16, 168(SP) stw r17, 172(SP) stw r18, 176(SP) stw r19, 180(SP) stw r20, 184(SP) stw r21, 188(SP) stw r22, 192(SP) #endif #ifdef linux #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #else ld Y, FRAMESLOT(0) + STACKSIZE(SP) ld INCY, FRAMESLOT(1) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif 
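/* Descriptive note (added comment): Y, INCY and the BUFFER work area are passed on the
   stack; which FRAMESLOT each one occupies differs between the Linux block above and the
   AIX / OS X block below, and between the 32-bit and 64-bit (and DOUBLE) calling conventions. */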
#if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) lwz Y, FRAMESLOT(1) + STACKSIZE(SP) lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #else lwz Y, FRAMESLOT(0) + STACKSIZE(SP) lwz INCY, FRAMESLOT(1) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #else ld Y, FRAMESLOT(0) + STACKSIZE(SP) ld INCY, FRAMESLOT(1) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif slwi LDA, LDA, BASE_SHIFT slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT addi A, A, -SIZE sub X, X, INCX sub Y, Y, INCY li PREA, PREFETCHSIZE_A * SIZE li PREC, PREFETCHSIZE_C * SIZE cmpi cr0, 0, M, 0 ble LL(999) cmpi cr0, 0, N, 0 ble LL(999) mr XP, X cmpi cr0, 0, INCX, SIZE beq LL(10) addi XP, BUFFER, -SIZE addi X1, BUFFER, -SIZE srawi. r0, M, 3 mtspr CTR, r0 ble LL(CopyRemain) .align 4 LL(CopyKernel): LFDUX f0, X, INCX LFDUX f1, X, INCX LFDUX f2, X, INCX LFDUX f3, X, INCX LFDUX f4, X, INCX LFDUX f5, X, INCX LFDUX f6, X, INCX LFDUX f7, X, INCX STFDU f0, 1 * SIZE(X1) STFDU f1, 1 * SIZE(X1) STFDU f2, 1 * SIZE(X1) STFDU f3, 1 * SIZE(X1) STFDU f4, 1 * SIZE(X1) STFDU f5, 1 * SIZE(X1) STFDU f6, 1 * SIZE(X1) STFDU f7, 1 * SIZE(X1) bdnz LL(CopyKernel) .align 4 LL(CopyRemain): andi. r0, M, 7 mtspr CTR, r0 ble LL(10) .align 4 LL(CopySub): LFDUX f0, X, INCX STFDU f0, 1 * SIZE(X1) bdnz LL(CopySub) .align 4 LL(10): mr YY, Y srawi. J, N, 2 ble LL(30) .align 4 LL(21): mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA mr X1, XP lfd y01, FZERO fmr y02, y01 fmr y03, y01 fmr y04, y01 fmr y05, y01 fmr y06, y01 fmr y07, y01 fmr y08, y01 dcbtst Y, PREC srawi. r0, M, 3 mtspr CTR, r0 ble LL(24) LFDU a1, 1 * SIZE(AO1) LFDU a2, 1 * SIZE(AO2) LFDU a3, 1 * SIZE(AO3) LFDU a4, 1 * SIZE(AO4) LFDU b1, 1 * SIZE(X1) LFDU b2, 1 * SIZE(X1) LFDU a5, 1 * SIZE(AO1) LFDU a6, 1 * SIZE(AO2) LFDU a7, 1 * SIZE(AO3) LFDU a8, 1 * SIZE(AO4) LFDU b3, 1 * SIZE(X1) LFDU b4, 1 * SIZE(X1) bdz LL(23) .align 4 LL(22): #ifdef PPCG4 dcbt X1, PREA #endif FMADD y01, a1, b1, y01 LFDU a1, 1 * SIZE(AO1) FMADD y02, a2, b1, y02 LFDU a2, 1 * SIZE(AO2) FMADD y03, a3, b1, y03 LFDU a3, 1 * SIZE(AO3) FMADD y04, a4, b1, y04 LFDU a4, 1 * SIZE(AO4) LFDU b1, 1 * SIZE(X1) #ifdef PPCG4 dcbt AO1, PREA #endif FMADD y05, a5, b2, y05 LFDU a5, 1 * SIZE(AO1) FMADD y06, a6, b2, y06 LFDU a6, 1 * SIZE(AO2) FMADD y07, a7, b2, y07 LFDU a7, 1 * SIZE(AO3) FMADD y08, a8, b2, y08 LFDU a8, 1 * SIZE(AO4) LFDU b2, 1 * SIZE(X1) #ifdef PPCG4 dcbt AO2, PREA #endif FMADD y01, a1, b3, y01 LFDU a1, 1 * SIZE(AO1) FMADD y02, a2, b3, y02 LFDU a2, 1 * SIZE(AO2) FMADD y03, a3, b3, y03 LFDU a3, 1 * SIZE(AO3) FMADD y04, a4, b3, y04 LFDU a4, 1 * SIZE(AO4) LFDU b3, 1 * SIZE(X1) #ifdef PPCG4 dcbt AO3, PREA #endif FMADD y05, a5, b4, y05 LFDU a5, 1 * SIZE(AO1) FMADD y06, a6, b4, y06 LFDU a6, 1 * SIZE(AO2) FMADD y07, a7, b4, y07 LFDU a7, 1 * SIZE(AO3) FMADD y08, a8, b4, y08 LFDU a8, 1 * SIZE(AO4) #ifdef PPCG4 dcbt AO4, PREA #endif LFDU b4, 1 * SIZE(X1) #if defined(PPCG4) && defined(DOUBLE) dcbt X1, PREA #endif FMADD y01, a1, b1, y01 LFDU a1, 1 * SIZE(AO1) FMADD y02, a2, b1, y02 LFDU a2, 1 * SIZE(AO2) FMADD y03, a3, b1, y03 LFDU a3, 1 * SIZE(AO3) FMADD y04, a4, b1, y04 LFDU a4, 1 * SIZE(AO4) LFDU b1, 1 * SIZE(X1) #if defined(PPCG4) && defined(DOUBLE) dcbt AO1, PREA #endif FMADD y05, a5, b2, y05 LFDU a5, 1 * SIZE(AO1) FMADD y06, a6, b2, y06 LFDU a6, 1 * SIZE(AO2) FMADD y07, a7, b2, y07 LFDU a7, 1 * SIZE(AO3) FMADD y08, a8, b2, y08 LFDU a8, 1 * SIZE(AO4) LFDU b2, 1 * SIZE(X1) #if 
defined(PPCG4) && defined(DOUBLE) dcbt AO2, PREA #endif FMADD y01, a1, b3, y01 LFDU a1, 1 * SIZE(AO1) FMADD y02, a2, b3, y02 LFDU a2, 1 * SIZE(AO2) FMADD y03, a3, b3, y03 LFDU a3, 1 * SIZE(AO3) FMADD y04, a4, b3, y04 LFDU a4, 1 * SIZE(AO4) LFDU b3, 1 * SIZE(X1) #if defined(PPCG4) && defined(DOUBLE) dcbt AO3, PREA #endif FMADD y05, a5, b4, y05 LFDU a5, 1 * SIZE(AO1) FMADD y06, a6, b4, y06 LFDU a6, 1 * SIZE(AO2) FMADD y07, a7, b4, y07 LFDU a7, 1 * SIZE(AO3) FMADD y08, a8, b4, y08 LFDU a8, 1 * SIZE(AO4) LFDU b4, 1 * SIZE(X1) #if defined(PPCG4) && defined(DOUBLE) dcbt AO4, PREA #endif bdnz LL(22) .align 4 LL(23): FMADD y01, a1, b1, y01 LFDU a1, 1 * SIZE(AO1) FMADD y02, a2, b1, y02 LFDU a2, 1 * SIZE(AO2) FMADD y03, a3, b1, y03 LFDU a3, 1 * SIZE(AO3) FMADD y04, a4, b1, y04 LFDU a4, 1 * SIZE(AO4) LFDU b1, 1 * SIZE(X1) FMADD y05, a5, b2, y05 LFDU a5, 1 * SIZE(AO1) FMADD y06, a6, b2, y06 LFDU a6, 1 * SIZE(AO2) FMADD y07, a7, b2, y07 LFDU a7, 1 * SIZE(AO3) FMADD y08, a8, b2, y08 LFDU a8, 1 * SIZE(AO4) LFDU b2, 1 * SIZE(X1) FMADD y01, a1, b3, y01 LFDU a1, 1 * SIZE(AO1) FMADD y02, a2, b3, y02 LFDU a2, 1 * SIZE(AO2) FMADD y03, a3, b3, y03 LFDU a3, 1 * SIZE(AO3) FMADD y04, a4, b3, y04 LFDU a4, 1 * SIZE(AO4) LFDU b3, 1 * SIZE(X1) FMADD y05, a5, b4, y05 LFDU a5, 1 * SIZE(AO1) FMADD y06, a6, b4, y06 LFDU a6, 1 * SIZE(AO2) FMADD y07, a7, b4, y07 LFDU a7, 1 * SIZE(AO3) FMADD y08, a8, b4, y08 LFDU a8, 1 * SIZE(AO4) LFDU b4, 1 * SIZE(X1) FMADD y01, a1, b1, y01 LFDU a1, 1 * SIZE(AO1) FMADD y02, a2, b1, y02 LFDU a2, 1 * SIZE(AO2) FMADD y03, a3, b1, y03 LFDU a3, 1 * SIZE(AO3) FMADD y04, a4, b1, y04 LFDU a4, 1 * SIZE(AO4) FMADD y05, a5, b2, y05 LFDU a5, 1 * SIZE(AO1) FMADD y06, a6, b2, y06 LFDU a6, 1 * SIZE(AO2) FMADD y07, a7, b2, y07 LFDU a7, 1 * SIZE(AO3) FMADD y08, a8, b2, y08 LFDU a8, 1 * SIZE(AO4) FMADD y01, a1, b3, y01 FMADD y02, a2, b3, y02 FMADD y03, a3, b3, y03 FMADD y04, a4, b3, y04 FMADD y05, a5, b4, y05 FMADD y06, a6, b4, y06 FMADD y07, a7, b4, y07 FMADD y08, a8, b4, y08 .align 4 LL(24): andi. r0, M, 7 ble LL(28) andi. r0, M, 4 ble LL(26) LFDU a1, 1 * SIZE(AO1) LFDU a2, 1 * SIZE(AO2) LFDU b1, 1 * SIZE(X1) LFDU a3, 1 * SIZE(AO3) LFDU a4, 1 * SIZE(AO4) LFDU b2, 1 * SIZE(X1) FMADD y01, a1, b1, y01 LFDU a5, 1 * SIZE(AO1) FMADD y02, a2, b1, y02 LFDU a6, 1 * SIZE(AO2) FMADD y03, a3, b1, y03 LFDU a7, 1 * SIZE(AO3) FMADD y04, a4, b1, y04 LFDU a8, 1 * SIZE(AO4) LFDU b3, 1 * SIZE(X1) FMADD y05, a5, b2, y05 LFDU a1, 1 * SIZE(AO1) FMADD y06, a6, b2, y06 LFDU a2, 1 * SIZE(AO2) FMADD y07, a7, b2, y07 LFDU a3, 1 * SIZE(AO3) FMADD y08, a8, b2, y08 LFDU a4, 1 * SIZE(AO4) LFDU b4, 1 * SIZE(X1) FMADD y01, a1, b3, y01 LFDU a5, 1 * SIZE(AO1) FMADD y02, a2, b3, y02 LFDU a6, 1 * SIZE(AO2) FMADD y03, a3, b3, y03 LFDU a7, 1 * SIZE(AO3) FMADD y04, a4, b3, y04 LFDU a8, 1 * SIZE(AO4) FMADD y05, a5, b4, y05 FMADD y06, a6, b4, y06 FMADD y07, a7, b4, y07 FMADD y08, a8, b4, y08 .align 4 LL(26): andi. r0, M, 2 ble LL(27) LFDU b1, 1 * SIZE(X1) LFDU a1, 1 * SIZE(AO1) LFDU a2, 1 * SIZE(AO2) LFDU a3, 1 * SIZE(AO3) LFDU a4, 1 * SIZE(AO4) LFDU b2, 1 * SIZE(X1) FMADD y01, a1, b1, y01 LFDU a5, 1 * SIZE(AO1) FMADD y02, a2, b1, y02 LFDU a6, 1 * SIZE(AO2) FMADD y03, a3, b1, y03 LFDU a7, 1 * SIZE(AO3) FMADD y04, a4, b1, y04 LFDU a8, 1 * SIZE(AO4) FMADD y05, a5, b2, y05 FMADD y06, a6, b2, y06 FMADD y07, a7, b2, y07 FMADD y08, a8, b2, y08 .align 4 LL(27): andi. 
r0, M, 1 ble LL(28) LFDU a1, 1 * SIZE(AO1) LFDU b1, 1 * SIZE(X1) LFDU a2, 1 * SIZE(AO2) LFDU a3, 1 * SIZE(AO3) LFDU a4, 1 * SIZE(AO4) FMADD y01, a1, b1, y01 FMADD y02, a2, b1, y02 FMADD y03, a3, b1, y03 FMADD y04, a4, b1, y04 .align 4 LL(28): lfd alpha, ALPHA LFDUX a1, Y, INCY LFDUX a2, Y, INCY LFDUX a3, Y, INCY LFDUX a4, Y, INCY FADD y01, y05, y01 FADD y02, y06, y02 FADD y03, y07, y03 FADD y04, y08, y04 FMADD a1, alpha, f0, a1 FMADD a2, alpha, f1, a2 FMADD a3, alpha, f2, a3 FMADD a4, alpha, f3, a4 STFDUX a1, YY, INCY addi J, J, -1 STFDUX a2, YY, INCY cmpi cr0, 0, J, 0 STFDUX a3, YY, INCY STFDUX a4, YY, INCY bgt LL(21) .align 4 LL(30): andi. J, N, 2 ble LL(40) mr AO1, A add AO2, A, LDA add A, AO2, LDA mr X1, XP lfd y01, FZERO fmr y02, y01 fmr y03, y01 fmr y04, y01 srawi. r0, M, 3 mtspr CTR, r0 ble LL(34) LFDU a1, 1 * SIZE(AO1) LFDU a2, 1 * SIZE(AO2) LFDU b1, 1 * SIZE(X1) LFDU b2, 1 * SIZE(X1) LFDU a5, 1 * SIZE(AO1) LFDU a6, 1 * SIZE(AO2) LFDU b3, 1 * SIZE(X1) LFDU b4, 1 * SIZE(X1) bdz LL(33) .align 4 LL(32): #ifdef PPCG4 dcbt X1, PREA #endif FMADD y01, a1, b1, y01 LFDU a1, 1 * SIZE(AO1) FMADD y02, a2, b1, y02 LFDU a2, 1 * SIZE(AO2) LFDU b1, 1 * SIZE(X1) #ifdef PPCG4 dcbt AO1, PREA #endif FMADD y03, a5, b2, y03 LFDU a5, 1 * SIZE(AO1) FMADD y04, a6, b2, y04 LFDU a6, 1 * SIZE(AO2) LFDU b2, 1 * SIZE(X1) FMADD y01, a1, b3, y01 LFDU a1, 1 * SIZE(AO1) FMADD y02, a2, b3, y02 LFDU a2, 1 * SIZE(AO2) LFDU b3, 1 * SIZE(X1) #ifdef PPCG4 dcbt AO2, PREA #endif FMADD y03, a5, b4, y03 LFDU a5, 1 * SIZE(AO1) FMADD y04, a6, b4, y04 LFDU a6, 1 * SIZE(AO2) LFDU b4, 1 * SIZE(X1) FMADD y01, a1, b1, y01 LFDU a1, 1 * SIZE(AO1) FMADD y02, a2, b1, y02 LFDU a2, 1 * SIZE(AO2) #if defined(PPCG4) && defined(DOUBLE) dcbt X1, PREA #endif LFDU b1, 1 * SIZE(X1) #if defined(PPCG4) && defined(DOUBLE) dcbt AO1, PREA #endif FMADD y03, a5, b2, y03 LFDU a5, 1 * SIZE(AO1) FMADD y04, a6, b2, y04 LFDU a6, 1 * SIZE(AO2) LFDU b2, 1 * SIZE(X1) FMADD y01, a1, b3, y01 LFDU a1, 1 * SIZE(AO1) FMADD y02, a2, b3, y02 LFDU a2, 1 * SIZE(AO2) LFDU b3, 1 * SIZE(X1) #if defined(PPCG4) && defined(DOUBLE) dcbt AO2, PREA #endif FMADD y03, a5, b4, y03 LFDU a5, 1 * SIZE(AO1) FMADD y04, a6, b4, y04 LFDU a6, 1 * SIZE(AO2) LFDU b4, 1 * SIZE(X1) bdnz LL(32) .align 4 LL(33): FMADD y01, a1, b1, y01 LFDU a1, 1 * SIZE(AO1) FMADD y02, a2, b1, y02 LFDU a2, 1 * SIZE(AO2) LFDU b1, 1 * SIZE(X1) FMADD y03, a5, b2, y03 LFDU a5, 1 * SIZE(AO1) FMADD y04, a6, b2, y04 LFDU a6, 1 * SIZE(AO2) LFDU b2, 1 * SIZE(X1) FMADD y01, a1, b3, y01 LFDU a1, 1 * SIZE(AO1) FMADD y02, a2, b3, y02 LFDU a2, 1 * SIZE(AO2) LFDU b3, 1 * SIZE(X1) FMADD y03, a5, b4, y03 LFDU a5, 1 * SIZE(AO1) FMADD y04, a6, b4, y04 LFDU a6, 1 * SIZE(AO2) LFDU b4, 1 * SIZE(X1) FMADD y01, a1, b1, y01 LFDU a1, 1 * SIZE(AO1) FMADD y02, a2, b1, y02 LFDU a2, 1 * SIZE(AO2) FMADD y03, a5, b2, y03 LFDU a5, 1 * SIZE(AO1) FMADD y04, a6, b2, y04 LFDU a6, 1 * SIZE(AO2) FMADD y01, a1, b3, y01 FMADD y02, a2, b3, y02 FMADD y03, a5, b4, y03 FMADD y04, a6, b4, y04 .align 4 LL(34): andi. r0, M, 7 ble LL(38) andi. r0, M, 4 ble LL(36) LFDU a1, 1 * SIZE(AO1) LFDU a2, 1 * SIZE(AO2) LFDU b1, 1 * SIZE(X1) LFDU b2, 1 * SIZE(X1) FMADD y01, a1, b1, y01 LFDU a5, 1 * SIZE(AO1) FMADD y02, a2, b1, y02 LFDU a6, 1 * SIZE(AO2) LFDU b3, 1 * SIZE(X1) FMADD y03, a5, b2, y03 LFDU a1, 1 * SIZE(AO1) FMADD y04, a6, b2, y04 LFDU a2, 1 * SIZE(AO2) LFDU b4, 1 * SIZE(X1) FMADD y01, a1, b3, y01 LFDU a5, 1 * SIZE(AO1) FMADD y02, a2, b3, y02 LFDU a6, 1 * SIZE(AO2) FMADD y03, a5, b4, y03 FMADD y04, a6, b4, y04 .align 4 LL(36): andi. 
r0, M, 2 ble LL(37) LFDU b1, 1 * SIZE(X1) LFDU a1, 1 * SIZE(AO1) LFDU a2, 1 * SIZE(AO2) LFDU b2, 1 * SIZE(X1) LFDU a3, 1 * SIZE(AO1) LFDU a4, 1 * SIZE(AO2) FMADD y01, a1, b1, y01 FMADD y02, a2, b1, y02 FMADD y03, a3, b2, y03 FMADD y04, a4, b2, y04 .align 4 LL(37): andi. r0, M, 1 ble LL(38) LFDU a1, 1 * SIZE(AO1) LFDU b1, 1 * SIZE(X1) LFDU a2, 1 * SIZE(AO2) FMADD y01, a1, b1, y01 FMADD y02, a2, b1, y02 .align 4 LL(38): lfd alpha, ALPHA LFDUX a1, Y, INCY LFDUX a2, Y, INCY FADD y01, y03, y01 FADD y02, y04, y02 FMADD a1, alpha, f0, a1 FMADD a2, alpha, f1, a2 STFDUX a1, YY, INCY STFDUX a2, YY, INCY .align 4 LL(40): andi. J, N, 1 ble LL(999) mr AO1, A add A, A, LDA mr X1, XP lfd y01, FZERO fmr y02, y01 srawi. r0, M, 3 mtspr CTR, r0 ble LL(44) LFDU a1, 1 * SIZE(AO1) LFDU a2, 1 * SIZE(AO1) LFDU a3, 1 * SIZE(AO1) LFDU a4, 1 * SIZE(AO1) LFDU b1, 1 * SIZE(X1) LFDU b2, 1 * SIZE(X1) LFDU b3, 1 * SIZE(X1) LFDU b4, 1 * SIZE(X1) bdz LL(43) .align 4 LL(42): FMADD y01, a1, b1, y01 LFDU a1, 1 * SIZE(AO1) LFDU b1, 1 * SIZE(X1) #ifdef PPCG4 dcbt X1, PREA #endif FMADD y02, a2, b2, y02 LFDU a2, 1 * SIZE(AO1) LFDU b2, 1 * SIZE(X1) #ifdef PPCG4 dcbt AO1, PREA #endif FMADD y01, a3, b3, y01 LFDU a3, 1 * SIZE(AO1) LFDU b3, 1 * SIZE(X1) FMADD y02, a4, b4, y02 LFDU a4, 1 * SIZE(AO1) LFDU b4, 1 * SIZE(X1) FMADD y01, a1, b1, y01 LFDU a1, 1 * SIZE(AO1) LFDU b1, 1 * SIZE(X1) FMADD y02, a2, b2, y02 LFDU a2, 1 * SIZE(AO1) LFDU b2, 1 * SIZE(X1) #if defined(PPCG4) && defined(DOUBLE) dcbt AO1, PREA #endif FMADD y01, a3, b3, y01 LFDU a3, 1 * SIZE(AO1) LFDU b3, 1 * SIZE(X1) #if defined(PPCG4) && defined(DOUBLE) dcbt X1, PREA #endif FMADD y02, a4, b4, y02 LFDU a4, 1 * SIZE(AO1) LFDU b4, 1 * SIZE(X1) bdnz LL(42) .align 4 LL(43): FMADD y01, a1, b1, y01 LFDU a1, 1 * SIZE(AO1) LFDU b1, 1 * SIZE(X1) FMADD y02, a2, b2, y02 LFDU a2, 1 * SIZE(AO1) LFDU b2, 1 * SIZE(X1) FMADD y01, a3, b3, y01 LFDU a3, 1 * SIZE(AO1) LFDU b3, 1 * SIZE(X1) FMADD y02, a4, b4, y02 LFDU a4, 1 * SIZE(AO1) LFDU b4, 1 * SIZE(X1) FMADD y01, a1, b1, y01 FMADD y02, a2, b2, y02 FMADD y01, a3, b3, y01 FMADD y02, a4, b4, y02 .align 4 LL(44): andi. r0, M, 7 ble LL(48) andi. r0, M, 4 ble LL(46) LFDU a1, 1 * SIZE(AO1) LFDU b1, 1 * SIZE(X1) LFDU a2, 1 * SIZE(AO1) LFDU b2, 1 * SIZE(X1) FMADD y01, a1, b1, y01 LFDU a3, 1 * SIZE(AO1) LFDU b3, 1 * SIZE(X1) FMADD y02, a2, b2, y02 LFDU a4, 1 * SIZE(AO1) LFDU b4, 1 * SIZE(X1) FMADD y01, a3, b3, y01 FMADD y02, a4, b4, y02 .align 4 LL(46): andi. r0, M, 2 ble LL(47) LFDU b1, 1 * SIZE(X1) LFDU a1, 1 * SIZE(AO1) LFDU b2, 1 * SIZE(X1) LFDU a2, 1 * SIZE(AO1) FMADD y01, a1, b1, y01 FMADD y02, a2, b2, y02 .align 4 LL(47): andi. 
r0, M, 1 ble LL(48) LFDU a1, 1 * SIZE(AO1) LFDU b1, 1 * SIZE(X1) FMADD y01, a1, b1, y01 .align 4 LL(48): lfd alpha, ALPHA LFDUX a1, Y, INCY FADD y01, y02, y01 FMADD a1, alpha, f0, a1 STFDUX a1, YY, INCY .align 4 LL(999): li r3, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) #ifdef __64BIT__ ld r14, 160(SP) ld r15, 168(SP) ld r16, 176(SP) ld r17, 184(SP) ld r18, 192(SP) ld r19, 200(SP) ld r20, 208(SP) ld r21, 216(SP) ld r22, 224(SP) #else lwz r14, 160(SP) lwz r15, 164(SP) lwz r16, 168(SP) lwz r17, 172(SP) lwz r18, 176(SP) lwz r19, 180(SP) lwz r20, 184(SP) lwz r21, 188(SP) lwz r22, 192(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/ger.S000066400000000000000000000572231313527062700164720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef NEEDPARAM #ifndef DOUBLE #include "sparam.h" #else #include "dparam.h" #endif #endif #ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 #define X r6 #define INCX r7 #define Y r8 #define INCY r9 #define A r10 #define LDA r5 #else #define M r3 #define N r4 #define X r7 #define INCX r8 #define Y r9 #define INCY r10 #define A r5 #define LDA r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define M r3 #define N r4 #define X r8 #define INCX r9 #define Y r10 #define INCY r5 #define A r6 #define LDA r7 #else #define M r3 #define N r4 #define X r7 #define INCX r8 #define Y r9 #define INCY r10 #define A r5 #define LDA r6 #endif #endif #define I r11 #define J r12 #define AO1 r14 #define AO2 r15 #define AO3 r16 #define AO4 r17 #define AO5 r18 #define AO6 r19 #define AO7 r20 #define AO8 r21 #define X1 r22 #define PREA r23 #define PREC r24 #define XX r25 #define BUFFER r26 #define y01 f0 #define y02 f1 #define y03 f2 #define y04 f3 #define y05 f4 #define y06 f5 #define y07 f6 #define y08 f7 #define alpha1 f8 #define alpha2 f9 #define a1 f12 #define a2 f13 #define a3 f14 #define a4 f15 #define a5 f16 #define a6 f17 #define a7 f18 #define a8 f19 #define a9 f20 #define a10 f21 #define a11 f22 #define a12 f23 #define a13 f24 #define a14 f25 #define a15 f26 #define a16 f27 #define alpha f31 #if defined(PPC440) || defined(PPC440FP2) #define PREFETCHSIZE_A 24 #define PREFETCHSIZE_C 16 #endif #ifdef PPC970 #define PREFETCHSIZE_A 16 #define PREFETCHSIZE_C 16 #endif #ifdef POWER4 #define PREFETCHSIZE_A 16 #define PREFETCHSIZE_C 16 #endif #ifdef POWER5 #define PREFETCHSIZE_A 16 #define PREFETCHSIZE_C 16 #endif #ifndef NEEDPARAM #ifndef __64BIT__ #define STACKSIZE 224 #else #define STACKSIZE 280 #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r14, 144(SP) std r15, 152(SP) std r16, 160(SP) std r17, 168(SP) std r18, 176(SP) std r19, 184(SP) std r20, 192(SP) std r21, 200(SP) std r22, 208(SP) std r23, 216(SP) std r24, 224(SP) std r25, 232(SP) std r26, 240(SP) std r27, 248(SP) #else stw r14, 144(SP) stw r15, 148(SP) stw r16, 152(SP) stw r17, 156(SP) stw r18, 160(SP) stw r19, 164(SP) stw r20, 168(SP) stw r21, 172(SP) stw r22, 176(SP) stw r23, 180(SP) stw r24, 184(SP) stw r25, 188(SP) stw r26, 192(SP) stw r27, 196(SP) #endif #ifdef linux #ifndef __64BIT__ lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #else ld A, FRAMESLOT(0) + STACKSIZE(SP) ld LDA, FRAMESLOT(1) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz A, FRAMESLOT(1) + STACKSIZE(SP) lwz LDA, FRAMESLOT(2) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #else lwz A, FRAMESLOT(0) + STACKSIZE(SP) lwz LDA, FRAMESLOT(1) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #else ld A, FRAMESLOT(0) + STACKSIZE(SP) ld LDA, FRAMESLOT(1) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif fmr alpha, f1 slwi LDA, LDA, BASE_SHIFT slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, 
BASE_SHIFT li PREA, PREFETCHSIZE_A * SIZE li PREC, PREFETCHSIZE_C * SIZE cmpwi cr0, M, 0 ble- LL(999) cmpwi cr0, N, 0 ble- LL(999) mr XX, X cmpi cr0, 0, INCX, SIZE beq LL(10) mr XX, BUFFER mr X1, BUFFER srawi. r0, M, 3 mtspr CTR, r0 ble LL(05) .align 4 LL(01): LFD a1, 0 * SIZE(X) add X, X, INCX LFD a2, 0 * SIZE(X) add X, X, INCX LFD a3, 0 * SIZE(X) add X, X, INCX LFD a4, 0 * SIZE(X) add X, X, INCX LFD a5, 0 * SIZE(X) add X, X, INCX LFD a6, 0 * SIZE(X) add X, X, INCX LFD a7, 0 * SIZE(X) add X, X, INCX LFD a8, 0 * SIZE(X) add X, X, INCX STFD a1, 0 * SIZE(X1) STFD a2, 1 * SIZE(X1) STFD a3, 2 * SIZE(X1) STFD a4, 3 * SIZE(X1) STFD a5, 4 * SIZE(X1) STFD a6, 5 * SIZE(X1) STFD a7, 6 * SIZE(X1) STFD a8, 7 * SIZE(X1) addi X1, X1, 8 * SIZE bdnz+ LL(01) .align 4 LL(05): andi. r0, M, 7 mtspr CTR, r0 ble LL(10) .align 4 LL(06): LFD a1, 0 * SIZE(X) add X, X, INCX STFD a1, 0 * SIZE(X1) addi X1, X1, SIZE bdnz+ LL(06) .align 4 LL(10): srawi. J, N, 1 ble LL(20) .align 4 LL(11): LFD alpha1, 0 * SIZE(Y) add Y, Y, INCY LFD alpha2, 0 * SIZE(Y) add Y, Y, INCY FMUL alpha1, alpha, alpha1 FMUL alpha2, alpha, alpha2 mr AO1, A add AO2, A, LDA add A, AO2, LDA mr X1, XX srawi. r0, M, 4 mtspr CTR, r0 ble LL(15) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) LFD y01, 0 * SIZE(X1) LFD y02, 1 * SIZE(X1) LFD y03, 2 * SIZE(X1) LFD y04, 3 * SIZE(X1) LFD y05, 4 * SIZE(X1) LFD y06, 5 * SIZE(X1) LFD y07, 6 * SIZE(X1) LFD y08, 7 * SIZE(X1) LFD a9, 0 * SIZE(AO2) LFD a10, 1 * SIZE(AO2) LFD a11, 2 * SIZE(AO2) LFD a12, 3 * SIZE(AO2) LFD a13, 4 * SIZE(AO2) LFD a14, 5 * SIZE(AO2) LFD a15, 6 * SIZE(AO2) LFD a16, 7 * SIZE(AO2) bdz LL(13) .align 4 LL(12): FMADD a1, alpha1, y01, a1 FMADD a2, alpha1, y02, a2 FMADD a3, alpha1, y03, a3 FMADD a4, alpha1, y04, a4 FMADD a5, alpha1, y05, a5 FMADD a6, alpha1, y06, a6 FMADD a7, alpha1, y07, a7 FMADD a8, alpha1, y08, a8 STFD a1, 0 * SIZE(AO1) STFD a2, 1 * SIZE(AO1) STFD a3, 2 * SIZE(AO1) STFD a4, 3 * SIZE(AO1) LFD a1, 8 * SIZE(AO1) LFD a2, 9 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) STFD a5, 4 * SIZE(AO1) STFD a6, 5 * SIZE(AO1) STFD a7, 6 * SIZE(AO1) STFD a8, 7 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) FMADD a9, alpha2, y01, a9 FMADD a10, alpha2, y02, a10 FMADD a11, alpha2, y03, a11 FMADD a12, alpha2, y04, a12 LFD y01, 8 * SIZE(X1) LFD y02, 9 * SIZE(X1) LFD y03, 10 * SIZE(X1) LFD y04, 11 * SIZE(X1) FMADD a13, alpha2, y05, a13 FMADD a14, alpha2, y06, a14 FMADD a15, alpha2, y07, a15 FMADD a16, alpha2, y08, a16 LFD y05, 12 * SIZE(X1) LFD y06, 13 * SIZE(X1) LFD y07, 14 * SIZE(X1) LFD y08, 15 * SIZE(X1) STFD a9, 0 * SIZE(AO2) STFD a10, 1 * SIZE(AO2) STFD a11, 2 * SIZE(AO2) STFD a12, 3 * SIZE(AO2) LFD a9, 8 * SIZE(AO2) LFD a10, 9 * SIZE(AO2) LFD a11, 10 * SIZE(AO2) LFD a12, 11 * SIZE(AO2) STFD a13, 4 * SIZE(AO2) STFD a14, 5 * SIZE(AO2) STFD a15, 6 * SIZE(AO2) STFD a16, 7 * SIZE(AO2) LFD a13, 12 * SIZE(AO2) LFD a14, 13 * SIZE(AO2) LFD a15, 14 * SIZE(AO2) LFD a16, 15 * SIZE(AO2) FMADD a1, alpha1, y01, a1 FMADD a2, alpha1, y02, a2 FMADD a3, alpha1, y03, a3 FMADD a4, alpha1, y04, a4 FMADD a5, alpha1, y05, a5 FMADD a6, alpha1, y06, a6 FMADD a7, alpha1, y07, a7 FMADD a8, alpha1, y08, a8 STFD a1, 8 * SIZE(AO1) STFD a2, 9 * SIZE(AO1) STFD a3, 10 * SIZE(AO1) STFD a4, 11 * SIZE(AO1) LFD a1, 16 * SIZE(AO1) LFD a2, 17 * SIZE(AO1) LFD a3, 18 * SIZE(AO1) LFD a4, 19 * SIZE(AO1) STFD a5, 12 * SIZE(AO1) STFD a6, 13 * 
SIZE(AO1) STFD a7, 14 * SIZE(AO1) STFD a8, 15 * SIZE(AO1) LFD a5, 20 * SIZE(AO1) LFD a6, 21 * SIZE(AO1) LFD a7, 22 * SIZE(AO1) LFD a8, 23 * SIZE(AO1) FMADD a9, alpha2, y01, a9 FMADD a10, alpha2, y02, a10 FMADD a11, alpha2, y03, a11 FMADD a12, alpha2, y04, a12 LFD y01, 16 * SIZE(X1) LFD y02, 17 * SIZE(X1) LFD y03, 18 * SIZE(X1) LFD y04, 19 * SIZE(X1) FMADD a13, alpha2, y05, a13 FMADD a14, alpha2, y06, a14 FMADD a15, alpha2, y07, a15 FMADD a16, alpha2, y08, a16 LFD y05, 20 * SIZE(X1) LFD y06, 21 * SIZE(X1) LFD y07, 22 * SIZE(X1) LFD y08, 23 * SIZE(X1) STFD a9, 8 * SIZE(AO2) STFD a10, 9 * SIZE(AO2) STFD a11, 10 * SIZE(AO2) STFD a12, 11 * SIZE(AO2) LFD a9, 16 * SIZE(AO2) LFD a10, 17 * SIZE(AO2) LFD a11, 18 * SIZE(AO2) LFD a12, 19 * SIZE(AO2) STFD a13, 12 * SIZE(AO2) STFD a14, 13 * SIZE(AO2) STFD a15, 14 * SIZE(AO2) STFD a16, 15 * SIZE(AO2) LFD a13, 20 * SIZE(AO2) LFD a14, 21 * SIZE(AO2) LFD a15, 22 * SIZE(AO2) LFD a16, 23 * SIZE(AO2) addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi X1, X1, 16 * SIZE DCBT(AO1, PREA) DCBT(AO2, PREA) DCBT(Y1, PREY) bdnz+ LL(12) .align 4 LL(13): FMADD a1, alpha1, y01, a1 FMADD a2, alpha1, y02, a2 FMADD a3, alpha1, y03, a3 FMADD a4, alpha1, y04, a4 FMADD a5, alpha1, y05, a5 FMADD a6, alpha1, y06, a6 FMADD a7, alpha1, y07, a7 FMADD a8, alpha1, y08, a8 STFD a1, 0 * SIZE(AO1) STFD a2, 1 * SIZE(AO1) STFD a3, 2 * SIZE(AO1) STFD a4, 3 * SIZE(AO1) LFD a1, 8 * SIZE(AO1) LFD a2, 9 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) STFD a5, 4 * SIZE(AO1) STFD a6, 5 * SIZE(AO1) STFD a7, 6 * SIZE(AO1) STFD a8, 7 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) FMADD a9, alpha2, y01, a9 FMADD a10, alpha2, y02, a10 FMADD a11, alpha2, y03, a11 FMADD a12, alpha2, y04, a12 LFD y01, 8 * SIZE(X1) LFD y02, 9 * SIZE(X1) LFD y03, 10 * SIZE(X1) LFD y04, 11 * SIZE(X1) FMADD a13, alpha2, y05, a13 FMADD a14, alpha2, y06, a14 FMADD a15, alpha2, y07, a15 FMADD a16, alpha2, y08, a16 LFD y05, 12 * SIZE(X1) LFD y06, 13 * SIZE(X1) LFD y07, 14 * SIZE(X1) LFD y08, 15 * SIZE(X1) STFD a9, 0 * SIZE(AO2) STFD a10, 1 * SIZE(AO2) STFD a11, 2 * SIZE(AO2) STFD a12, 3 * SIZE(AO2) LFD a9, 8 * SIZE(AO2) LFD a10, 9 * SIZE(AO2) LFD a11, 10 * SIZE(AO2) LFD a12, 11 * SIZE(AO2) STFD a13, 4 * SIZE(AO2) STFD a14, 5 * SIZE(AO2) STFD a15, 6 * SIZE(AO2) STFD a16, 7 * SIZE(AO2) LFD a13, 12 * SIZE(AO2) LFD a14, 13 * SIZE(AO2) LFD a15, 14 * SIZE(AO2) LFD a16, 15 * SIZE(AO2) FMADD a1, alpha1, y01, a1 FMADD a2, alpha1, y02, a2 FMADD a3, alpha1, y03, a3 FMADD a4, alpha1, y04, a4 FMADD a5, alpha1, y05, a5 FMADD a6, alpha1, y06, a6 FMADD a7, alpha1, y07, a7 FMADD a8, alpha1, y08, a8 STFD a1, 8 * SIZE(AO1) STFD a2, 9 * SIZE(AO1) STFD a3, 10 * SIZE(AO1) STFD a4, 11 * SIZE(AO1) LFD a1, 16 * SIZE(AO1) LFD a2, 17 * SIZE(AO1) LFD a3, 18 * SIZE(AO1) LFD a4, 19 * SIZE(AO1) STFD a5, 12 * SIZE(AO1) STFD a6, 13 * SIZE(AO1) STFD a7, 14 * SIZE(AO1) STFD a8, 15 * SIZE(AO1) LFD a5, 20 * SIZE(AO1) LFD a6, 21 * SIZE(AO1) LFD a7, 22 * SIZE(AO1) LFD a8, 23 * SIZE(AO1) FMADD a9, alpha2, y01, a9 FMADD a10, alpha2, y02, a10 FMADD a11, alpha2, y03, a11 FMADD a12, alpha2, y04, a12 FMADD a13, alpha2, y05, a13 FMADD a14, alpha2, y06, a14 FMADD a15, alpha2, y07, a15 FMADD a16, alpha2, y08, a16 STFD a9, 8 * SIZE(AO2) STFD a10, 9 * SIZE(AO2) STFD a11, 10 * SIZE(AO2) STFD a12, 11 * SIZE(AO2) STFD a13, 12 * SIZE(AO2) STFD a14, 13 * SIZE(AO2) STFD a15, 14 * SIZE(AO2) STFD a16, 15 * SIZE(AO2) addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi X1, X1, 16 * SIZE .align 4 LL(15): andi. 
r0, M, 15 ble LL(19) andi. r0, M, 8 ble LL(16) LFD y01, 0 * SIZE(X1) LFD y02, 1 * SIZE(X1) LFD y03, 2 * SIZE(X1) LFD y04, 3 * SIZE(X1) LFD y05, 4 * SIZE(X1) LFD y06, 5 * SIZE(X1) LFD y07, 6 * SIZE(X1) LFD y08, 7 * SIZE(X1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) LFD a9, 0 * SIZE(AO2) LFD a10, 1 * SIZE(AO2) LFD a11, 2 * SIZE(AO2) LFD a12, 3 * SIZE(AO2) LFD a13, 4 * SIZE(AO2) LFD a14, 5 * SIZE(AO2) LFD a15, 6 * SIZE(AO2) LFD a16, 7 * SIZE(AO2) FMADD a1, alpha1, y01, a1 FMADD a2, alpha1, y02, a2 FMADD a3, alpha1, y03, a3 FMADD a4, alpha1, y04, a4 STFD a1, 0 * SIZE(AO1) STFD a2, 1 * SIZE(AO1) STFD a3, 2 * SIZE(AO1) STFD a4, 3 * SIZE(AO1) FMADD a5, alpha1, y05, a5 FMADD a6, alpha1, y06, a6 FMADD a7, alpha1, y07, a7 FMADD a8, alpha1, y08, a8 STFD a5, 4 * SIZE(AO1) STFD a6, 5 * SIZE(AO1) STFD a7, 6 * SIZE(AO1) STFD a8, 7 * SIZE(AO1) FMADD a9, alpha2, y01, a9 FMADD a10, alpha2, y02, a10 FMADD a11, alpha2, y03, a11 FMADD a12, alpha2, y04, a12 STFD a9, 0 * SIZE(AO2) STFD a10, 1 * SIZE(AO2) STFD a11, 2 * SIZE(AO2) STFD a12, 3 * SIZE(AO2) FMADD a13, alpha2, y05, a13 FMADD a14, alpha2, y06, a14 FMADD a15, alpha2, y07, a15 FMADD a16, alpha2, y08, a16 STFD a13, 4 * SIZE(AO2) STFD a14, 5 * SIZE(AO2) STFD a15, 6 * SIZE(AO2) STFD a16, 7 * SIZE(AO2) addi AO1, AO1, 8 * SIZE addi AO2, AO2, 8 * SIZE addi X1, X1, 8 * SIZE .align 4 LL(16): andi. r0, M, 4 ble LL(17) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD y01, 0 * SIZE(X1) LFD y02, 1 * SIZE(X1) LFD y03, 2 * SIZE(X1) LFD y04, 3 * SIZE(X1) LFD a5, 0 * SIZE(AO2) LFD a6, 1 * SIZE(AO2) LFD a7, 2 * SIZE(AO2) LFD a8, 3 * SIZE(AO2) FMADD a1, alpha1, y01, a1 FMADD a2, alpha1, y02, a2 FMADD a3, alpha1, y03, a3 FMADD a4, alpha1, y04, a4 STFD a1, 0 * SIZE(AO1) STFD a2, 1 * SIZE(AO1) STFD a3, 2 * SIZE(AO1) STFD a4, 3 * SIZE(AO1) FMADD a5, alpha2, y01, a5 FMADD a6, alpha2, y02, a6 FMADD a7, alpha2, y03, a7 FMADD a8, alpha2, y04, a8 STFD a5, 0 * SIZE(AO2) STFD a6, 1 * SIZE(AO2) STFD a7, 2 * SIZE(AO2) STFD a8, 3 * SIZE(AO2) addi AO1, AO1, 4 * SIZE addi AO2, AO2, 4 * SIZE addi X1, X1, 4 * SIZE .align 4 LL(17): andi. r0, M, 2 ble LL(18) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 0 * SIZE(AO2) LFD a4, 1 * SIZE(AO2) LFD y01, 0 * SIZE(X1) LFD y02, 1 * SIZE(X1) FMADD a1, alpha1, y01, a1 FMADD a2, alpha1, y02, a2 FMADD a3, alpha2, y01, a3 FMADD a4, alpha2, y02, a4 STFD a1, 0 * SIZE(AO1) STFD a2, 1 * SIZE(AO1) STFD a3, 0 * SIZE(AO2) STFD a4, 1 * SIZE(AO2) addi AO1, AO1, 2 * SIZE addi AO2, AO2, 2 * SIZE addi X1, X1, 2 * SIZE .align 4 LL(18): andi. r0, M, 1 ble LL(19) LFD y01, 0 * SIZE(X1) LFD a1, 0 * SIZE(AO1) LFD a2, 0 * SIZE(AO2) FMADD a1, alpha1, y01, a1 FMADD a2, alpha2, y01, a2 STFD a1, 0 * SIZE(AO1) STFD a2, 0 * SIZE(AO2) .align 4 LL(19): addi J, J, -1 cmpi cr0, 0, J, 0 bgt LL(11) .align 4 LL(20): andi. J, N, 1 ble LL(999) .align 4 LL(21): LFD alpha1, 0 * SIZE(Y) FMUL alpha1, alpha, alpha1 mr AO1, A mr X1, XX srawi. 
r0, M, 4 mtspr CTR, r0 ble LL(25) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) LFD y01, 0 * SIZE(X1) LFD y02, 1 * SIZE(X1) LFD y03, 2 * SIZE(X1) LFD y04, 3 * SIZE(X1) LFD y05, 4 * SIZE(X1) LFD y06, 5 * SIZE(X1) LFD y07, 6 * SIZE(X1) LFD y08, 7 * SIZE(X1) bdz LL(23) .align 4 LL(22): FMADD a1, alpha1, y01, a1 FMADD a2, alpha1, y02, a2 FMADD a3, alpha1, y03, a3 FMADD a4, alpha1, y04, a4 FMADD a5, alpha1, y05, a5 FMADD a6, alpha1, y06, a6 FMADD a7, alpha1, y07, a7 FMADD a8, alpha1, y08, a8 STFD a1, 0 * SIZE(AO1) STFD a2, 1 * SIZE(AO1) STFD a3, 2 * SIZE(AO1) STFD a4, 3 * SIZE(AO1) LFD a1, 8 * SIZE(AO1) LFD a2, 9 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) STFD a5, 4 * SIZE(AO1) STFD a6, 5 * SIZE(AO1) STFD a7, 6 * SIZE(AO1) STFD a8, 7 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) LFD y01, 8 * SIZE(X1) LFD y02, 9 * SIZE(X1) LFD y03, 10 * SIZE(X1) LFD y04, 11 * SIZE(X1) LFD y05, 12 * SIZE(X1) LFD y06, 13 * SIZE(X1) LFD y07, 14 * SIZE(X1) LFD y08, 15 * SIZE(X1) FMADD a1, alpha1, y01, a1 FMADD a2, alpha1, y02, a2 FMADD a3, alpha1, y03, a3 FMADD a4, alpha1, y04, a4 FMADD a5, alpha1, y05, a5 FMADD a6, alpha1, y06, a6 FMADD a7, alpha1, y07, a7 FMADD a8, alpha1, y08, a8 STFD a1, 8 * SIZE(AO1) STFD a2, 9 * SIZE(AO1) STFD a3, 10 * SIZE(AO1) STFD a4, 11 * SIZE(AO1) LFD a1, 16 * SIZE(AO1) LFD a2, 17 * SIZE(AO1) LFD a3, 18 * SIZE(AO1) LFD a4, 19 * SIZE(AO1) STFD a5, 12 * SIZE(AO1) STFD a6, 13 * SIZE(AO1) STFD a7, 14 * SIZE(AO1) STFD a8, 15 * SIZE(AO1) LFD a5, 20 * SIZE(AO1) LFD a6, 21 * SIZE(AO1) LFD a7, 22 * SIZE(AO1) LFD a8, 23 * SIZE(AO1) LFD y01, 16 * SIZE(X1) LFD y02, 17 * SIZE(X1) LFD y03, 18 * SIZE(X1) LFD y04, 19 * SIZE(X1) LFD y05, 20 * SIZE(X1) LFD y06, 21 * SIZE(X1) LFD y07, 22 * SIZE(X1) LFD y08, 23 * SIZE(X1) addi AO1, AO1, 16 * SIZE addi X1, X1, 16 * SIZE DCBT(AO1, PREA) DCBT(Y1, PREY) bdnz+ LL(22) .align 4 LL(23): FMADD a1, alpha1, y01, a1 FMADD a2, alpha1, y02, a2 FMADD a3, alpha1, y03, a3 FMADD a4, alpha1, y04, a4 FMADD a5, alpha1, y05, a5 FMADD a6, alpha1, y06, a6 FMADD a7, alpha1, y07, a7 FMADD a8, alpha1, y08, a8 STFD a1, 0 * SIZE(AO1) STFD a2, 1 * SIZE(AO1) STFD a3, 2 * SIZE(AO1) STFD a4, 3 * SIZE(AO1) LFD a1, 8 * SIZE(AO1) LFD a2, 9 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) STFD a5, 4 * SIZE(AO1) STFD a6, 5 * SIZE(AO1) STFD a7, 6 * SIZE(AO1) STFD a8, 7 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) LFD y01, 8 * SIZE(X1) LFD y02, 9 * SIZE(X1) LFD y03, 10 * SIZE(X1) LFD y04, 11 * SIZE(X1) LFD y05, 12 * SIZE(X1) LFD y06, 13 * SIZE(X1) LFD y07, 14 * SIZE(X1) LFD y08, 15 * SIZE(X1) FMADD a1, alpha1, y01, a1 FMADD a2, alpha1, y02, a2 FMADD a3, alpha1, y03, a3 FMADD a4, alpha1, y04, a4 FMADD a5, alpha1, y05, a5 FMADD a6, alpha1, y06, a6 FMADD a7, alpha1, y07, a7 FMADD a8, alpha1, y08, a8 STFD a1, 8 * SIZE(AO1) STFD a2, 9 * SIZE(AO1) STFD a3, 10 * SIZE(AO1) STFD a4, 11 * SIZE(AO1) LFD a1, 16 * SIZE(AO1) LFD a2, 17 * SIZE(AO1) LFD a3, 18 * SIZE(AO1) LFD a4, 19 * SIZE(AO1) STFD a5, 12 * SIZE(AO1) STFD a6, 13 * SIZE(AO1) STFD a7, 14 * SIZE(AO1) STFD a8, 15 * SIZE(AO1) LFD a5, 20 * SIZE(AO1) LFD a6, 21 * SIZE(AO1) LFD a7, 22 * SIZE(AO1) LFD a8, 23 * SIZE(AO1) addi AO1, AO1, 16 * SIZE addi X1, X1, 16 * SIZE .align 4 LL(25): andi. r0, M, 15 ble LL(999) andi. 
r0, M, 8 ble LL(26) LFD y01, 0 * SIZE(X1) LFD y02, 1 * SIZE(X1) LFD y03, 2 * SIZE(X1) LFD y04, 3 * SIZE(X1) LFD y05, 4 * SIZE(X1) LFD y06, 5 * SIZE(X1) LFD y07, 6 * SIZE(X1) LFD y08, 7 * SIZE(X1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) FMADD a1, alpha1, y01, a1 FMADD a2, alpha1, y02, a2 FMADD a3, alpha1, y03, a3 FMADD a4, alpha1, y04, a4 STFD a1, 0 * SIZE(AO1) STFD a2, 1 * SIZE(AO1) STFD a3, 2 * SIZE(AO1) STFD a4, 3 * SIZE(AO1) FMADD a5, alpha1, y05, a5 FMADD a6, alpha1, y06, a6 FMADD a7, alpha1, y07, a7 FMADD a8, alpha1, y08, a8 STFD a5, 4 * SIZE(AO1) STFD a6, 5 * SIZE(AO1) STFD a7, 6 * SIZE(AO1) STFD a8, 7 * SIZE(AO1) addi AO1, AO1, 8 * SIZE addi X1, X1, 8 * SIZE .align 4 LL(26): andi. r0, M, 4 ble LL(27) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD y01, 0 * SIZE(X1) LFD y02, 1 * SIZE(X1) LFD y03, 2 * SIZE(X1) LFD y04, 3 * SIZE(X1) FMADD a1, alpha1, y01, a1 FMADD a2, alpha1, y02, a2 FMADD a3, alpha1, y03, a3 FMADD a4, alpha1, y04, a4 STFD a1, 0 * SIZE(AO1) STFD a2, 1 * SIZE(AO1) STFD a3, 2 * SIZE(AO1) STFD a4, 3 * SIZE(AO1) addi AO1, AO1, 4 * SIZE addi X1, X1, 4 * SIZE .align 4 LL(27): andi. r0, M, 2 ble LL(28) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD y01, 0 * SIZE(X1) LFD y02, 1 * SIZE(X1) FMADD a1, alpha1, y01, a1 FMADD a2, alpha1, y02, a2 STFD a1, 0 * SIZE(AO1) STFD a2, 1 * SIZE(AO1) addi AO1, AO1, 2 * SIZE addi X1, X1, 2 * SIZE .align 4 LL(28): andi. r0, M, 1 ble LL(999) LFD y01, 0 * SIZE(X1) LFD a1, 0 * SIZE(AO1) FMADD a1, alpha1, y01, a1 STFD a1, 0 * SIZE(AO1) .align 4 LL(999): li r3, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r14, 144(SP) ld r15, 152(SP) ld r16, 160(SP) ld r17, 168(SP) ld r18, 176(SP) ld r19, 184(SP) ld r20, 192(SP) ld r21, 200(SP) ld r22, 208(SP) ld r23, 216(SP) ld r24, 224(SP) ld r25, 232(SP) ld r26, 240(SP) ld r27, 248(SP) #else lwz r14, 144(SP) lwz r15, 148(SP) lwz r16, 152(SP) lwz r17, 156(SP) lwz r18, 160(SP) lwz r19, 164(SP) lwz r20, 168(SP) lwz r21, 172(SP) lwz r22, 176(SP) lwz r23, 180(SP) lwz r24, 184(SP) lwz r25, 188(SP) lwz r26, 192(SP) lwz r27, 196(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/iamax.S000066400000000000000000000342531313527062700170120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define RET r3 #define X r4 #define INCX r5 #define N r6 #define NN r7 #define XX r8 #define PREA r9 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(r3) LDINT INCX, 0(INCX) #else mr N, r3 #endif li RET, 0 mr NN, N mr XX, X slwi INCX, INCX, BASE_SHIFT li PREA, L1_PREFETCHSIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFD f1, 0 * SIZE(X) add X, X, INCX fabs f0, f1 fabs f2, f1 fabs f3, f1 fabs f4, f1 fabs f5, f1 fabs f6, f1 fabs f7, f1 fabs f1, f1 subi N, N, 1 cmpwi cr0, INCX, SIZE bne- cr0, LL(100) srawi. 
r0, N, 4 mtspr CTR, r0 beq- cr0, LL(50) LFD f24, 0 * SIZE(X) LFD f25, 1 * SIZE(X) LFD f26, 2 * SIZE(X) LFD f27, 3 * SIZE(X) LFD f28, 4 * SIZE(X) LFD f29, 5 * SIZE(X) LFD f30, 6 * SIZE(X) LFD f31, 7 * SIZE(X) fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 8 * SIZE(X) LFD f25, 9 * SIZE(X) LFD f26, 10 * SIZE(X) LFD f27, 11 * SIZE(X) fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 12 * SIZE(X) LFD f29, 13 * SIZE(X) LFD f30, 14 * SIZE(X) LFD f31, 15 * SIZE(X) bdz LL(20) .align 4 LL(10): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 LFD f24, 16 * SIZE(X) LFD f25, 17 * SIZE(X) LFD f26, 18 * SIZE(X) LFD f27, 19 * SIZE(X) fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 LFD f28, 20 * SIZE(X) LFD f29, 21 * SIZE(X) LFD f30, 22 * SIZE(X) LFD f31, 23 * SIZE(X) fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 LFD f24, 24 * SIZE(X) LFD f25, 25 * SIZE(X) LFD f26, 26 * SIZE(X) LFD f27, 27 * SIZE(X) fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 LFD f28, 28 * SIZE(X) LFD f29, 29 * SIZE(X) LFD f30, 30 * SIZE(X) LFD f31, 31 * SIZE(X) #ifndef POWER6 L1_PREFETCH X, PREA #endif addi X, X, 16 * SIZE #ifdef POWER6 L1_PREFETCH X, PREA #endif bdnz LL(10) .align 4 LL(20): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 fsel f6, f22, f6, f14 fsel f7, f23, f7, f15 addi X, X, 16 * SIZE .align 4 LL(50): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) addi X, X, 1 * SIZE fabs f8, f8 fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCX srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(150) LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX bdz LL(120) .align 4 LL(110): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX bdnz LL(110) .align 4 LL(120): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 fsel f6, f22, f6, f14 fsel f7, f23, f7, f15 .align 4 LL(150): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX fabs f8, f8 fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f0, f1 fsel f2, f9, f2, f3 fsel f4, f10, f4, f5 fsel f6, f11, f6, f7 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f0, f2 fsel f4, f9, f4, f6 fsub f8, f0, f4 fsel f1, f8, f0, f4 .align 4 LL(1000): cmpwi cr0, INCX, SIZE bne- cr0, LL(1100) srawi. 
r0, NN, 3 mtspr CTR, r0 beq- cr0, LL(1050) LFD f24, 0 * SIZE(XX) LFD f25, 1 * SIZE(XX) LFD f26, 2 * SIZE(XX) LFD f27, 3 * SIZE(XX) LFD f28, 4 * SIZE(XX) LFD f29, 5 * SIZE(XX) LFD f30, 6 * SIZE(XX) LFD f31, 7 * SIZE(XX) bdz LL(1020) .align 4 LL(1010): fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 8 * SIZE(XX) LFD f25, 9 * SIZE(XX) LFD f26, 10 * SIZE(XX) LFD f27, 11 * SIZE(XX) fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 12 * SIZE(XX) LFD f29, 13 * SIZE(XX) LFD f30, 14 * SIZE(XX) LFD f31, 15 * SIZE(XX) addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) addi XX, XX, 8 * SIZE bdnz LL(1010) .align 4 LL(1020): fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 addi XX, XX, 8 * SIZE addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) .align 4 LL(1050): andi. r0, NN, 7 mtspr CTR, r0 beq LL(9999) .align 4 LL(1060): LFD f8, 0 * SIZE(XX) addi XX, XX, 1 * SIZE fabs f8, f8 addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) bdnz LL(1060) b LL(9999) .align 4 LL(1100): sub XX, XX, INCX srawi. r0, NN, 3 mtspr CTR, r0 beq- LL(1150) LFDUX f24, XX, INCX LFDUX f25, XX, INCX LFDUX f26, XX, INCX LFDUX f27, XX, INCX LFDUX f28, XX, INCX LFDUX f29, XX, INCX LFDUX f30, XX, INCX LFDUX f31, XX, INCX bdz LL(1120) .align 4 LL(1110): fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDUX f24, XX, INCX LFDUX f25, XX, INCX LFDUX f26, XX, INCX LFDUX f27, XX, INCX fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDUX f28, XX, INCX LFDUX f29, XX, INCX LFDUX f30, XX, INCX LFDUX f31, XX, INCX addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) bdnz LL(1110) .align 4 LL(1120): fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) .align 4 LL(1150): andi. 
r0, NN, 7 mtspr CTR, r0 beq LL(9999) .align 4 LL(1160): LFDUX f8, XX, INCX fabs f8, f8 addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) bdnz LL(1160) .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/iamax_hummer.S000066400000000000000000000401451313527062700203640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
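   Overview of this kernel: iamax_hummer.S implements the i?amax primitive
   (index of the first element with the largest absolute value) for targets
   with the paired ("double hummer") floating-point unit. Like the other
   iamax/iamin kernels in this directory, it works in two passes: an
   unrolled first pass reduces the vector to max|x(i)| with fabs/fpabs,
   fsub/fpsub and fsel/fpsel, and a second pass over the same data counts
   elements until it reaches the first one whose absolute value equals that
   maximum; the count is returned in r3 as a 1-based (Fortran-style) index,
   with 0 returned for n <= 0 or incx <= 0. A minimal C sketch of the same
   two-pass index rule, for reference only (the name iamax_ref and the plain
   long/double types are illustrative placeholders, not the project's
   interface):

       #include <math.h>

       static long iamax_ref(long n, const double *x, long incx) {
           if (n <= 0 || incx <= 0) return 0;
           double maxval = fabs(x[0]);            // pass 1: largest |x[i]|
           for (long i = 1; i < n; i++) {
               double v = fabs(x[i * incx]);
               if (v > maxval) maxval = v;
           }
           for (long i = 0; i < n; i++)           // pass 2: first matching index
               if (fabs(x[i * incx]) == maxval) return i + 1;
           return n;                              // not reached for finite input
       }
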
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define INCX2 r6 #define X2 r7 #define XX r8 #define RET r9 #define NN r10 #define C1 f1 #define C2 f0 #define C3 f2 #define C4 f3 #define A1 f4 #define A2 f5 #define A3 f6 #define A4 f7 #define A5 f8 #define A6 f9 #define A7 f10 #define A8 f11 #define F1 f12 #define F2 f13 #define F3 f14 #define F4 f15 #define F5 f16 #define F6 f17 #define F7 f18 #define F8 f19 #define T1 f20 #define T2 f21 #define T3 f22 #define T4 f23 #define T5 f24 #define T6 f25 #define T7 f26 #define T8 f27 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 stfpdux f19, SP, r10 stfpdux f20, SP, r10 stfpdux f21, SP, r10 stfpdux f22, SP, r10 stfpdux f23, SP, r10 stfpdux f24, SP, r10 stfpdux f25, SP, r10 stfpdux f26, SP, r10 stfpdux f27, SP, r10 #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX li RET, 0 cmpwi cr0, N, 0 ble LL(999) mr NN, N cmpwi cr0, INCX, 0 ble LL(999) LFD C1, 0 * SIZE(X) addi N, N, -1 cmpwi cr0, N, 0 li RET, 1 fabs C1, C1 ble LL(999) fsmfp C1, C1 mr XX, X fpmr C2, C1 add X, X, INCX fpmr C3, C1 fpmr C4, C1 cmpwi cr0, INCX, SIZE bne LL(100) andi. r0, X, 2 * SIZE - 1 beq LL(05) LFD C2, 0 * SIZE(X) add X, X, INCX addi N, N, -1 cmpwi cr0, N, 0 fabs C2, C2 ble LL(20) .align 4 LL(05): sub X, X, INCX2 srawi. r0, N, 4 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 fpabs T1, A1 LFPDUX A6, X, INCX2 fpabs T2, A2 LFPDUX A7, X, INCX2 fpabs T3, A3 LFPDUX A8, X, INCX2 fpabs T4, A4 bdz LL(13) .align 4 LL(12): fpsub F1, C1, T1 LFPDUX A1, X, INCX2 fpsub F2, C2, T2 LFPDUX A2, X, INCX2 fpsub F3, C3, T3 LFPDUX A3, X, INCX2 fpsub F4, C4, T4 LFPDUX A4, X, INCX2 fpabs T5, A5 fpabs T6, A6 fpabs T7, A7 fpabs T8, A8 fpsel C1, F1, C1, T1 LFPDUX A5, X, INCX2 fpsel C2, F2, C2, T2 LFPDUX A6, X, INCX2 fpsel C3, F3, C3, T3 LFPDUX A7, X, INCX2 fpsel C4, F4, C4, T4 LFPDUX A8, X, INCX2 fpsub F5, C1, T5 fpsub F6, C2, T6 fpsub F7, C3, T7 fpsub F8, C4, T8 fpabs T1, A1 fpabs T2, A2 fpabs T3, A3 fpabs T4, A4 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 bdnz LL(12) .align 4 LL(13): fpabs T5, A5 fpabs T6, A6 fpabs T7, A7 fpabs T8, A8 fpsub F1, C1, T1 fpsub F2, C2, T2 fpsub F3, C3, T3 fpsub F4, C4, T4 fpsel C1, F1, C1, T1 fpsel C2, F2, C2, T2 fpsel C3, F3, C3, T3 fpsel C4, F4, C4, T4 fpsub F5, C1, T5 fpsub F6, C2, T6 fpsub F7, C3, T7 fpsub F8, C4, T8 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 .align 4 LL(15): andi. r0, N, 15 beq LL(20) andi. r0, N, 8 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 fpabs A1, A1 fpabs A2, A2 fpabs A3, A3 fpabs A4, A4 fpsub F1, C1, A1 fpsub F2, C2, A2 fpsub F3, C3, A3 fpsub F4, C4, A4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 .align 4 LL(16): andi. r0, N, 4 beq LL(17) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 fpabs A1, A1 fpabs A2, A2 fpsub F1, C1, A1 fpsub F2, C2, A2 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 .align 4 LL(17): andi. r0, N, 2 beq LL(18) LFPDUX A1, X, INCX2 fpabs A1, A1 fpsub F1, C1, A1 fpsel C1, F1, C1, A1 .align 4 LL(18): andi. 
r0, N, 1 beq LL(20) LFDUX A1, X, INCX2 fabs A1, A1 fsub F1, C1, A1 fsel C1, F1, C1, A1 .align 4 LL(20): fpsub F1, C1, C2 fpsub F2, C3, C4 fpsel C1, F1, C1, C2 fpsel C3, F2, C3, C4 fpsub F1, C1, C3 fpsel C1, F1, C1, C3 fsmtp C2, C1 fsub F1, C1, C2 fsel C1, F1, C1, C2 li RET, 0 fsmfp C1, C1 andi. r0, XX, 2 * SIZE - 1 beq LL(21) LFD A1, 0 * SIZE(XX) add XX, XX, INCX addi NN, NN, -1 addi RET, RET, 1 fabs A1, A1 fcmpu cr0, C1, A1 beq cr0, LL(999) .align 4 LL(21): sub XX, XX, INCX2 srawi. r0, NN, 4 mtspr CTR, r0 beq- LL(25) LFPDUX A1, XX, INCX2 LFPDUX A2, XX, INCX2 LFPDUX A3, XX, INCX2 LFPDUX A4, XX, INCX2 LFPDUX A5, XX, INCX2 LFPDUX A6, XX, INCX2 LFPDUX A7, XX, INCX2 LFPDUX A8, XX, INCX2 fpabs T1, A1 fpabs T2, A2 fpabs T3, A3 fpabs T4, A4 fpabs T5, A5 fpabs T6, A6 fpabs T7, A7 fpabs T8, A8 bdz LL(23) .align 4 LL(22): addi RET, RET, 1 fcmpu cr0, C1, T1 LFPDUX A1, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T1 LFPDUX A2, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T2 LFPDUX A3, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T2 LFPDUX A4, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T3 LFPDUX A5, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T3 LFPDUX A6, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T4 LFPDUX A7, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T4 LFPDUX A8, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T5 fpabs T1, A1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T5 fpabs T2, A2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T6 fpabs T3, A3 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T6 fpabs T4, A4 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T7 fpabs T5, A5 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T7 fpabs T6, A6 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T8 fpabs T7, A7 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T8 fpabs T8, A8 beq cr0, LL(999) bdnz LL(22) .align 4 LL(23): addi RET, RET, 1 fcmpu cr0, C1, T1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T3 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T3 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T4 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T4 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T5 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T5 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T6 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T6 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T7 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T7 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T8 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T8 beq cr0, LL(999) .align 4 LL(25): andi. r0, NN, 8 beq LL(26) LFPDUX A1, XX, INCX2 LFPDUX A2, XX, INCX2 LFPDUX A3, XX, INCX2 LFPDUX A4, XX, INCX2 fpabs T1, A1 fpabs T2, A2 fpabs T3, A3 fpabs T4, A4 addi RET, RET, 1 fcmpu cr0, C1, T1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T3 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T3 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T4 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T4 beq cr0, LL(999) .align 4 LL(26): andi. 
r0, NN, 4 beq LL(27) LFPDUX A1, XX, INCX2 LFPDUX A2, XX, INCX2 fpabs T1, A1 fpabs T2, A2 addi RET, RET, 1 fcmpu cr0, C1, T1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T2 beq cr0, LL(999) .align 4 LL(27): andi. r0, NN, 2 beq LL(28) LFPDUX A1, XX, INCX2 fpabs T1, A1 addi RET, RET, 1 fcmpu cr0, C1, T1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T1 beq cr0, LL(999) .align 4 LL(28): andi. r0, NN, 1 beq LL(999) addi RET, RET, 1 b LL(999) .align 4 LL(100): sub X, X, INCX srawi. r0, N, 4 mtspr CTR, r0 beq- LL(105) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFSDUX A1, X, INCX LFSDUX A2, X, INCX LFSDUX A3, X, INCX LFSDUX A4, X, INCX LFDUX A5, X, INCX LFDUX A6, X, INCX LFDUX A7, X, INCX LFDUX A8, X, INCX LFSDUX A5, X, INCX fpabs T1, A1 LFSDUX A6, X, INCX fpabs T2, A2 LFSDUX A7, X, INCX fpabs T3, A3 LFSDUX A8, X, INCX fpabs T4, A4 bdz LL(103) .align 4 LL(102): fpsub F1, C1, T1 LFDUX A1, X, INCX fpsub F2, C2, T2 LFDUX A2, X, INCX fpsub F3, C3, T3 LFDUX A3, X, INCX fpsub F4, C4, T4 LFDUX A4, X, INCX fpabs T5, A5 LFSDUX A1, X, INCX fpabs T6, A6 LFSDUX A2, X, INCX fpabs T7, A7 LFSDUX A3, X, INCX fpabs T8, A8 LFSDUX A4, X, INCX fpsel C1, F1, C1, T1 LFDUX A5, X, INCX fpsel C2, F2, C2, T2 LFDUX A6, X, INCX fpsel C3, F3, C3, T3 LFDUX A7, X, INCX fpsel C4, F4, C4, T4 LFDUX A8, X, INCX fpsub F5, C1, T5 LFSDUX A5, X, INCX fpsub F6, C2, T6 LFSDUX A6, X, INCX fpsub F7, C3, T7 LFSDUX A7, X, INCX fpsub F8, C4, T8 LFSDUX A8, X, INCX fpabs T1, A1 fpabs T2, A2 fpabs T3, A3 fpabs T4, A4 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 bdnz LL(102) .align 4 LL(103): fpabs T5, A5 fpabs T6, A6 fpabs T7, A7 fpabs T8, A8 fpsub F1, C1, T1 fpsub F2, C2, T2 fpsub F3, C3, T3 fpsub F4, C4, T4 fpsel C1, F1, C1, T1 fpsel C2, F2, C2, T2 fpsel C3, F3, C3, T3 fpsel C4, F4, C4, T4 fpsub F5, C1, T5 fpsub F6, C2, T6 fpsub F7, C3, T7 fpsub F8, C4, T8 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 .align 4 LL(105): andi. r0, N, 15 beq LL(120) andi. r0, N, 8 beq LL(106) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFSDUX A1, X, INCX LFSDUX A2, X, INCX LFSDUX A3, X, INCX LFSDUX A4, X, INCX fpabs A1, A1 fpabs A2, A2 fpabs A3, A3 fpabs A4, A4 fpsub F1, C1, A1 fpsub F2, C2, A2 fpsub F3, C3, A3 fpsub F4, C4, A4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 .align 4 LL(106): andi. r0, N, 4 beq LL(107) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX fabs A1, A1 fabs A2, A2 fabs A3, A3 fabs A4, A4 fsub F1, C1, A1 fsub F2, C2, A2 fsub F3, C3, A3 fsub F4, C4, A4 fsel C1, F1, C1, A1 fsel C2, F2, C2, A2 fsel C3, F3, C3, A3 fsel C4, F4, C4, A4 .align 4 LL(107): andi. r0, N, 2 beq LL(108) LFDUX A1, X, INCX LFDUX A2, X, INCX fabs A1, A1 fabs A2, A2 fsub F1, C1, A1 fsub F2, C2, A2 fsel C1, F1, C1, A1 fsel C2, F2, C2, A2 .align 4 LL(108): andi. r0, N, 1 beq LL(120) LFDUX A1, X, INCX fabs A1, A1 fsub F1, C1, A1 fsel C1, F1, C1, A1 .align 4 LL(120): fpsub F1, C1, C2 fpsub F2, C3, C4 fpsel C1, F1, C1, C2 fpsel C3, F2, C3, C4 fpsub F1, C1, C3 fpsel C1, F1, C1, C3 fsmtp C2, C1 fsub F1, C1, C2 fsel C1, F1, C1, C2 li RET, 0 sub XX, XX, INCX srawi. 
r0, NN, 3 mtspr CTR, r0 beq- LL(126) LFDUX A1, XX, INCX LFDUX A2, XX, INCX LFDUX A3, XX, INCX LFDUX A4, XX, INCX fabs T1, A1 fabs T2, A2 LFDUX A5, XX, INCX LFDUX A6, XX, INCX LFDUX A7, XX, INCX LFDUX A8, XX, INCX bdz LL(123) .align 4 LL(122): LFDUX A1, XX, INCX fabs T3, A3 addi RET, RET, 1 fcmpu cr0, C1, T1 beq cr0, LL(999) LFDUX A2, XX, INCX fabs T4, A4 addi RET, RET, 1 fcmpu cr0, C1, T2 beq cr0, LL(999) LFDUX A3, XX, INCX fabs T1, A5 addi RET, RET, 1 fcmpu cr0, C1, T3 beq cr0, LL(999) LFDUX A4, XX, INCX fabs T2, A6 addi RET, RET, 1 fcmpu cr0, C1, T4 beq cr0, LL(999) LFDUX A5, XX, INCX fabs T3, A7 addi RET, RET, 1 fcmpu cr0, C1, T1 beq cr0, LL(999) LFDUX A6, XX, INCX fabs T4, A8 addi RET, RET, 1 fcmpu cr0, C1, T2 beq cr0, LL(999) LFDUX A7, XX, INCX fabs T1, A1 addi RET, RET, 1 fcmpu cr0, C1, T3 beq cr0, LL(999) LFDUX A8, XX, INCX fabs T2, A2 addi RET, RET, 1 fcmpu cr0, C1, T4 beq cr0, LL(999) bdnz LL(122) .align 4 LL(123): fabs T3, A3 fabs T4, A4 addi RET, RET, 1 fcmpu cr0, C1, T1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T2 beq cr0, LL(999) fabs T1, A5 addi RET, RET, 1 fcmpu cr0, C1, T3 beq cr0, LL(999) fabs T2, A6 addi RET, RET, 1 fcmpu cr0, C1, T4 beq cr0, LL(999) fabs T3, A7 addi RET, RET, 1 fcmpu cr0, C1, T1 beq cr0, LL(999) fabs T4, A8 addi RET, RET, 1 fcmpu cr0, C1, T2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T3 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T4 beq cr0, LL(999) .align 4 LL(126): andi. r0, NN, 4 beq LL(127) LFDUX A1, XX, INCX LFDUX A2, XX, INCX LFDUX A3, XX, INCX LFDUX A4, XX, INCX fabs T1, A1 fabs T2, A2 fabs T3, A3 fabs T4, A4 addi RET, RET, 1 fcmpu cr0, C1, T1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T3 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T4 beq cr0, LL(999) .align 4 LL(127): andi. r0, NN, 2 beq LL(128) LFDUX A1, XX, INCX LFDUX A2, XX, INCX fabs T1, A1 fabs T2, A2 addi RET, RET, 1 fcmpu cr0, C1, T1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T2 beq cr0, LL(999) .align 4 LL(128): addi RET, RET, 1 .align 4 LL(999): li r10, 16 addi SP, SP, -16 mr r3, RET lfpdux f27, SP, r10 lfpdux f26, SP, r10 lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/iamax_ppc440.S000066400000000000000000000224241313527062700201010ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define RET r3 #define X r4 #define INCX r5 #define N r6 #define NN r7 #define XX r8 #define PRE r9 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(r3) LDINT INCX, 0(INCX) #else mr N, r3 #endif li RET, 0 slwi INCX, INCX, BASE_SHIFT sub X, X, INCX li PRE, 3 * 16 * SIZE mr NN, N mr XX, X cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFDUX f1, X, INCX fabs f0, f1 fabs f2, f1 fabs f3, f1 fabs f4, f1 fabs f5, f1 subi N, N, 1 fabs f6, f1 srawi. 
r0, N, 4 fabs f7, f1 mtspr CTR, r0 fabs f1, f1 beq- LL(150) LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX fabs f8, f24 LFDUX f24, X, INCX fabs f9, f25 LFDUX f25, X, INCX fabs f10, f26 LFDUX f26, X, INCX fabs f11, f27 LFDUX f27, X, INCX fabs f12, f28 LFDUX f28, X, INCX fabs f13, f29 LFDUX f29, X, INCX fabs f14, f30 LFDUX f30, X, INCX fabs f15, f31 LFDUX f31, X, INCX bdz LL(120) .align 4 LL(110): fsub f16, f0, f8 #ifdef PPCG4 dcbt X, PRE #endif fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 LFDUX f24, X, INCX fsel f1, f17, f1, f9 fabs f9, f25 LFDUX f25, X, INCX fsel f2, f18, f2, f10 fabs f10, f26 LFDUX f26, X, INCX fsel f3, f19, f3, f11 fabs f11, f27 LFDUX f27, X, INCX fsel f4, f20, f4, f12 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif fabs f12, f28 LFDUX f28, X, INCX fsel f5, f21, f5, f13 fabs f13, f29 LFDUX f29, X, INCX fsel f6, f22, f6, f14 fabs f14, f30 LFDUX f30, X, INCX fsel f7, f23, f7, f15 fabs f15, f31 LFDUX f31, X, INCX fsub f16, f0, f8 #ifdef PPCG4 dcbt X, PRE #endif fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 LFDUX f24, X, INCX fsel f1, f17, f1, f9 fabs f9, f25 LFDUX f25, X, INCX fsel f2, f18, f2, f10 fabs f10, f26 LFDUX f26, X, INCX fsel f3, f19, f3, f11 fabs f11, f27 LFDUX f27, X, INCX fsel f4, f20, f4, f12 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif fabs f12, f28 LFDUX f28, X, INCX fsel f5, f21, f5, f13 fabs f13, f29 LFDUX f29, X, INCX fsel f6, f22, f6, f14 fabs f14, f30 LFDUX f30, X, INCX fsel f7, f23, f7, f15 fabs f15, f31 LFDUX f31, X, INCX bdnz LL(110) .align 4 LL(120): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 fsel f6, f22, f6, f14 fsel f7, f23, f7, f15 .align 4 LL(150): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX fabs f8, f8 fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f0, f1 fsel f2, f9, f2, f3 fsel f4, f10, f4, f5 fsel f6, f11, f6, f7 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f0, f2 fsel f4, f9, f4, f6 fsub f8, f0, f4 fsel f1, f8, f0, f4 .align 4 LL(1000): srawi. 
r0, NN, 3 mtspr CTR, r0 beq- LL(1150) LFDUX f24, XX, INCX LFDUX f25, XX, INCX LFDUX f26, XX, INCX LFDUX f27, XX, INCX LFDUX f28, XX, INCX LFDUX f29, XX, INCX LFDUX f30, XX, INCX LFDUX f31, XX, INCX bdz LL(1120) .align 4 LL(1110): fabs f8, f24 LFDUX f24, XX, INCX fabs f9, f25 LFDUX f25, XX, INCX fabs f10, f26 LFDUX f26, XX, INCX fabs f11, f27 LFDUX f27, XX, INCX #ifdef PPCG4 dcbt XX, PRE #endif fabs f12, f28 LFDUX f28, XX, INCX fabs f13, f29 LFDUX f29, XX, INCX fabs f14, f30 LFDUX f30, XX, INCX fabs f15, f31 LFDUX f31, XX, INCX #if defined(PPCG4) && defined(DOUBLE) dcbt XX, PRE #endif addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) bdnz LL(1110) .align 4 LL(1120): fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) .align 4 LL(1150): andi. r0, NN, 7 mtspr CTR, r0 beq LL(9999) .align 4 LL(1160): LFDUX f8, XX, INCX fabs f8, f8 addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) bdnz LL(1160) .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/iamin.S000066400000000000000000000342141313527062700170050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define RET r3 #define X r4 #define INCX r5 #define N r6 #define NN r7 #define XX r8 #define PREA r9 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(r3) LDINT INCX, 0(INCX) #else mr N, r3 #endif li RET, 0 mr NN, N mr XX, X slwi INCX, INCX, BASE_SHIFT li PREA, L1_PREFETCHSIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFD f1, 0 * SIZE(X) add X, X, INCX fabs f0, f1 fabs f2, f1 fabs f3, f1 fabs f4, f1 fabs f5, f1 fabs f6, f1 fabs f7, f1 fabs f1, f1 subi N, N, 1 cmpwi cr0, INCX, SIZE bne- cr0, LL(100) srawi. 
r0, N, 4 mtspr CTR, r0 beq- cr0, LL(50) LFD f24, 0 * SIZE(X) LFD f25, 1 * SIZE(X) LFD f26, 2 * SIZE(X) LFD f27, 3 * SIZE(X) LFD f28, 4 * SIZE(X) LFD f29, 5 * SIZE(X) LFD f30, 6 * SIZE(X) LFD f31, 7 * SIZE(X) fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 8 * SIZE(X) LFD f25, 9 * SIZE(X) LFD f26, 10 * SIZE(X) LFD f27, 11 * SIZE(X) fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 12 * SIZE(X) LFD f29, 13 * SIZE(X) LFD f30, 14 * SIZE(X) LFD f31, 15 * SIZE(X) bdz LL(20) .align 4 LL(10): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fabs f8, f24 fsel f1, f17, f9, f1 fabs f9, f25 fsel f2, f18, f10, f2 fabs f10, f26 fsel f3, f19, f11, f3 fabs f11, f27 LFD f24, 16 * SIZE(X) LFD f25, 17 * SIZE(X) LFD f26, 18 * SIZE(X) LFD f27, 19 * SIZE(X) fsel f4, f20, f12, f4 fabs f12, f28 fsel f5, f21, f13, f5 fabs f13, f29 fsel f6, f22, f14, f6 fabs f14, f30 fsel f7, f23, f15, f7 fabs f15, f31 LFD f28, 20 * SIZE(X) LFD f29, 21 * SIZE(X) LFD f30, 22 * SIZE(X) LFD f31, 23 * SIZE(X) fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fabs f8, f24 fsel f1, f17, f9, f1 fabs f9, f25 fsel f2, f18, f10, f2 fabs f10, f26 fsel f3, f19, f11, f3 fabs f11, f27 LFD f24, 24 * SIZE(X) LFD f25, 25 * SIZE(X) LFD f26, 26 * SIZE(X) LFD f27, 27 * SIZE(X) fsel f4, f20, f12, f4 fabs f12, f28 fsel f5, f21, f13, f5 fabs f13, f29 fsel f6, f22, f14, f6 fabs f14, f30 fsel f7, f23, f15, f7 fabs f15, f31 LFD f28, 28 * SIZE(X) LFD f29, 29 * SIZE(X) LFD f30, 30 * SIZE(X) LFD f31, 31 * SIZE(X) #ifndef POWER6 L1_PREFETCH X, PREA #endif addi X, X, 16 * SIZE #ifdef POWER6 L1_PREFETCH X, PREA #endif bdnz LL(10) .align 4 LL(20): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fabs f8, f24 fsel f1, f17, f9, f1 fabs f9, f25 fsel f2, f18, f10, f2 fabs f10, f26 fsel f3, f19, f11, f3 fabs f11, f27 fsel f4, f20, f12, f4 fabs f12, f28 fsel f5, f21, f13, f5 fabs f13, f29 fsel f6, f22, f14, f6 fabs f14, f30 fsel f7, f23, f15, f7 fabs f15, f31 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fsel f1, f17, f9, f1 fsel f2, f18, f10, f2 fsel f3, f19, f11, f3 fsel f4, f20, f12, f4 fsel f5, f21, f13, f5 fsel f6, f22, f14, f6 fsel f7, f23, f15, f7 addi X, X, 16 * SIZE .align 4 LL(50): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) addi X, X, 1 * SIZE fabs f8, f8 fsub f16, f1, f8 fsel f1, f16, f8, f1 bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCX srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(150) LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX bdz LL(120) .align 4 LL(110): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fabs f8, f24 fsel f1, f17, f9, f1 fabs f9, f25 fsel f2, f18, f10, f2 fabs f10, f26 fsel f3, f19, f11, f3 fabs f11, f27 LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX fsel f4, f20, f12, f4 fabs f12, f28 fsel f5, f21, f13, f5 fabs f13, f29 fsel f6, f22, f14, f6 fabs f14, f30 fsel f7, f23, f15, f7 fabs f15, f31 LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fabs f8, f24 fsel f1, f17, f9, f1 fabs f9, f25 fsel f2, f18, f10, f2 fabs f10, f26 fsel f3, f19, f11, f3 fabs f11, f27 LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX fsel f4, f20, f12, f4 fabs f12, f28 fsel f5, f21, f13, f5 fabs f13, f29 fsel f6, f22, f14, f6 fabs f14, f30 fsel f7, f23, f15, f7 fabs f15, f31 LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX bdnz LL(110) .align 4 LL(120): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fabs f8, f24 fsel f1, f17, f9, f1 fabs f9, f25 fsel f2, f18, f10, f2 fabs f10, f26 fsel f3, f19, f11, f3 fabs f11, f27 fsel f4, f20, f12, f4 fabs f12, f28 fsel f5, f21, f13, f5 fabs f13, f29 fsel f6, f22, f14, f6 fabs f14, f30 fsel f7, f23, f15, f7 fabs f15, f31 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fsel f1, f17, f9, f1 fsel f2, f18, f10, f2 fsel f3, f19, f11, f3 fsel f4, f20, f12, f4 fsel f5, f21, f13, f5 fsel f6, f22, f14, f6 fsel f7, f23, f15, f7 .align 4 LL(150): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX fabs f8, f8 fsub f16, f1, f8 fsel f1, f16, f8, f1 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f1, f0 fsel f2, f9, f3, f2 fsel f4, f10, f5, f4 fsel f6, f11, f7, f6 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f2, f0 fsel f4, f9, f6, f4 fsub f8, f0, f4 fsel f1, f8, f4, f0 .align 4 LL(1000): cmpwi cr0, INCX, SIZE bne- cr0, LL(1100) srawi. 
r0, NN, 3 mtspr CTR, r0 beq- cr0, LL(1050) LFD f24, 0 * SIZE(XX) LFD f25, 1 * SIZE(XX) LFD f26, 2 * SIZE(XX) LFD f27, 3 * SIZE(XX) LFD f28, 4 * SIZE(XX) LFD f29, 5 * SIZE(XX) LFD f30, 6 * SIZE(XX) LFD f31, 7 * SIZE(XX) bdz LL(1020) .align 4 LL(1010): fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 8 * SIZE(XX) LFD f25, 9 * SIZE(XX) LFD f26, 10 * SIZE(XX) LFD f27, 11 * SIZE(XX) fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 12 * SIZE(XX) LFD f29, 13 * SIZE(XX) LFD f30, 14 * SIZE(XX) LFD f31, 15 * SIZE(XX) addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) addi XX, XX, 8 * SIZE bdnz LL(1010) .align 4 LL(1020): fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 addi XX, XX, 8 * SIZE addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) .align 4 LL(1050): andi. r0, NN, 7 mtspr CTR, r0 beq LL(9999) .align 4 LL(1060): LFD f8, 0 * SIZE(XX) addi XX, XX, 1 * SIZE fabs f8, f8 addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) bdnz LL(1060) b LL(9999) .align 4 LL(1100): sub XX, XX, INCX srawi. r0, NN, 3 mtspr CTR, r0 beq- LL(1150) LFDUX f24, XX, INCX LFDUX f25, XX, INCX LFDUX f26, XX, INCX LFDUX f27, XX, INCX LFDUX f28, XX, INCX LFDUX f29, XX, INCX LFDUX f30, XX, INCX LFDUX f31, XX, INCX bdz LL(1120) .align 4 LL(1110): fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDUX f24, XX, INCX LFDUX f25, XX, INCX LFDUX f26, XX, INCX LFDUX f27, XX, INCX fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDUX f28, XX, INCX LFDUX f29, XX, INCX LFDUX f30, XX, INCX LFDUX f31, XX, INCX addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) bdnz LL(1110) .align 4 LL(1120): fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) .align 4 LL(1150): andi. 
r0, NN, 7 mtspr CTR, r0 beq LL(9999) .align 4 LL(1160): LFDUX f8, XX, INCX fabs f8, f8 addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) bdnz LL(1160) .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/iamin_hummer.S000066400000000000000000000401361313527062700203620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
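   Overview of this kernel: iamin_hummer.S is the i?amin counterpart of
   iamax_hummer.S and returns the 1-based index of the first element with
   the smallest absolute value (again 0 for n <= 0 or incx <= 0). The code
   structure is the same; only the compare-and-select direction changes,
   i.e. the fsub/fpsub operands are swapped so that fsel/fpsel keeps the
   smaller magnitude during the reduction pass, before the same second pass
   locates the first index that attains it. A minimal C sketch of that index
   rule, for reference only (iamin_ref and the plain long/double types are
   illustrative placeholders, not the project's interface):

       #include <math.h>

       static long iamin_ref(long n, const double *x, long incx) {
           if (n <= 0 || incx <= 0) return 0;
           double minval = fabs(x[0]);            // pass 1: smallest |x[i]|
           for (long i = 1; i < n; i++) {
               double v = fabs(x[i * incx]);
               if (v < minval) minval = v;
           }
           for (long i = 0; i < n; i++)           // pass 2: first matching index
               if (fabs(x[i * incx]) == minval) return i + 1;
           return n;                              // not reached for finite input
       }
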
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define INCX2 r6 #define X2 r7 #define XX r8 #define RET r9 #define NN r10 #define C1 f1 #define C2 f0 #define C3 f2 #define C4 f3 #define A1 f4 #define A2 f5 #define A3 f6 #define A4 f7 #define A5 f8 #define A6 f9 #define A7 f10 #define A8 f11 #define F1 f12 #define F2 f13 #define F3 f14 #define F4 f15 #define F5 f16 #define F6 f17 #define F7 f18 #define F8 f19 #define T1 f20 #define T2 f21 #define T3 f22 #define T4 f23 #define T5 f24 #define T6 f25 #define T7 f26 #define T8 f27 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 stfpdux f19, SP, r10 stfpdux f20, SP, r10 stfpdux f21, SP, r10 stfpdux f22, SP, r10 stfpdux f23, SP, r10 stfpdux f24, SP, r10 stfpdux f25, SP, r10 stfpdux f26, SP, r10 stfpdux f27, SP, r10 #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX li RET, 0 cmpwi cr0, N, 0 ble LL(999) mr NN, N cmpwi cr0, INCX, 0 ble LL(999) LFD C1, 0 * SIZE(X) addi N, N, -1 cmpwi cr0, N, 0 li RET, 1 fabs C1, C1 ble LL(999) fsmfp C1, C1 mr XX, X fpmr C2, C1 add X, X, INCX fpmr C3, C1 fpmr C4, C1 cmpwi cr0, INCX, SIZE bne LL(100) andi. r0, X, 2 * SIZE - 1 beq LL(05) LFD C2, 0 * SIZE(X) add X, X, INCX addi N, N, -1 cmpwi cr0, N, 0 fabs C2, C2 ble LL(20) .align 4 LL(05): sub X, X, INCX2 srawi. r0, N, 4 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 fpabs T1, A1 LFPDUX A6, X, INCX2 fpabs T2, A2 LFPDUX A7, X, INCX2 fpabs T3, A3 LFPDUX A8, X, INCX2 fpabs T4, A4 bdz LL(13) .align 4 LL(12): fpsub F1, T1, C1 LFPDUX A1, X, INCX2 fpsub F2, T2, C2 LFPDUX A2, X, INCX2 fpsub F3, T3, C3 LFPDUX A3, X, INCX2 fpsub F4, T4, C4 LFPDUX A4, X, INCX2 fpabs T5, A5 fpabs T6, A6 fpabs T7, A7 fpabs T8, A8 fpsel C1, F1, C1, T1 LFPDUX A5, X, INCX2 fpsel C2, F2, C2, T2 LFPDUX A6, X, INCX2 fpsel C3, F3, C3, T3 LFPDUX A7, X, INCX2 fpsel C4, F4, C4, T4 LFPDUX A8, X, INCX2 fpsub F5, T5, C1 fpsub F6, T6, C2 fpsub F7, T7, C3 fpsub F8, T8, C4 fpabs T1, A1 fpabs T2, A2 fpabs T3, A3 fpabs T4, A4 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 bdnz LL(12) .align 4 LL(13): fpabs T5, A5 fpabs T6, A6 fpabs T7, A7 fpabs T8, A8 fpsub F1, T1, C1 fpsub F2, T2, C2 fpsub F3, T3, C3 fpsub F4, T4, C4 fpsel C1, F1, C1, T1 fpsel C2, F2, C2, T2 fpsel C3, F3, C3, T3 fpsel C4, F4, C4, T4 fpsub F5, T5, C1 fpsub F6, T6, C2 fpsub F7, T7, C3 fpsub F8, T8, C4 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 .align 4 LL(15): andi. r0, N, 15 beq LL(20) andi. r0, N, 8 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 fpabs A1, A1 fpabs A2, A2 fpabs A3, A3 fpabs A4, A4 fpsub F1, A1, C1 fpsub F2, A2, C2 fpsub F3, A3, C3 fpsub F4, A4, C4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 .align 4 LL(16): andi. r0, N, 4 beq LL(17) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 fpabs A1, A1 fpabs A2, A2 fpsub F1, A1, C1 fpsub F2, A2, C2 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 .align 4 LL(17): andi. r0, N, 2 beq LL(18) LFPDUX A1, X, INCX2 fpabs A1, A1 fpsub F1, A1, C1 fpsel C1, F1, C1, A1 .align 4 LL(18): andi. 
r0, N, 1 beq LL(20) LFDUX A1, X, INCX2 fabs A1, A1 fsub F1, A1, C1 fsel C1, F1, C1, A1 .align 4 LL(20): fpsub F1, C2, C1 fpsub F2, C4, C3 fpsel C1, F1, C1, C2 fpsel C3, F2, C3, C4 fpsub F1, C3, C1 fpsel C1, F1, C1, C3 fsmtp C2, C1 fsub F1, C2, C1 fsel C1, F1, C1, C2 li RET, 0 fsmfp C1, C1 andi. r0, XX, 2 * SIZE - 1 beq LL(21) LFD A1, 0 * SIZE(XX) add XX, XX, INCX addi NN, NN, -1 addi RET, RET, 1 fabs A1, A1 fcmpu cr0, C1, A1 beq cr0, LL(999) .align 4 LL(21): sub XX, XX, INCX2 srawi. r0, NN, 4 mtspr CTR, r0 beq- LL(25) LFPDUX A1, XX, INCX2 LFPDUX A2, XX, INCX2 LFPDUX A3, XX, INCX2 LFPDUX A4, XX, INCX2 LFPDUX A5, XX, INCX2 LFPDUX A6, XX, INCX2 LFPDUX A7, XX, INCX2 LFPDUX A8, XX, INCX2 fpabs T1, A1 fpabs T2, A2 fpabs T3, A3 fpabs T4, A4 fpabs T5, A5 fpabs T6, A6 fpabs T7, A7 fpabs T8, A8 bdz LL(23) .align 4 LL(22): addi RET, RET, 1 fcmpu cr0, C1, T1 LFPDUX A1, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T1 LFPDUX A2, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T2 LFPDUX A3, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T2 LFPDUX A4, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T3 LFPDUX A5, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T3 LFPDUX A6, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T4 LFPDUX A7, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T4 LFPDUX A8, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T5 fpabs T1, A1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T5 fpabs T2, A2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T6 fpabs T3, A3 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T6 fpabs T4, A4 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T7 fpabs T5, A5 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T7 fpabs T6, A6 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T8 fpabs T7, A7 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T8 fpabs T8, A8 beq cr0, LL(999) bdnz LL(22) .align 4 LL(23): addi RET, RET, 1 fcmpu cr0, C1, T1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T3 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T3 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T4 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T4 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T5 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T5 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T6 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T6 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T7 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T7 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T8 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T8 beq cr0, LL(999) .align 4 LL(25): andi. r0, NN, 8 beq LL(26) LFPDUX A1, XX, INCX2 LFPDUX A2, XX, INCX2 LFPDUX A3, XX, INCX2 LFPDUX A4, XX, INCX2 fpabs T1, A1 fpabs T2, A2 fpabs T3, A3 fpabs T4, A4 addi RET, RET, 1 fcmpu cr0, C1, T1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T3 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T3 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T4 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T4 beq cr0, LL(999) .align 4 LL(26): andi. 
r0, NN, 4 beq LL(27) LFPDUX A1, XX, INCX2 LFPDUX A2, XX, INCX2 fpabs T1, A1 fpabs T2, A2 addi RET, RET, 1 fcmpu cr0, C1, T1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T2 beq cr0, LL(999) .align 4 LL(27): andi. r0, NN, 2 beq LL(28) LFPDUX A1, XX, INCX2 fpabs T1, A1 addi RET, RET, 1 fcmpu cr0, C1, T1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, T1 beq cr0, LL(999) .align 4 LL(28): andi. r0, NN, 1 beq LL(999) addi RET, RET, 1 b LL(999) .align 4 LL(100): sub X, X, INCX srawi. r0, N, 4 mtspr CTR, r0 beq- LL(105) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFSDUX A1, X, INCX LFSDUX A2, X, INCX LFSDUX A3, X, INCX LFSDUX A4, X, INCX LFDUX A5, X, INCX LFDUX A6, X, INCX LFDUX A7, X, INCX LFDUX A8, X, INCX LFSDUX A5, X, INCX fpabs T1, A1 LFSDUX A6, X, INCX fpabs T2, A2 LFSDUX A7, X, INCX fpabs T3, A3 LFSDUX A8, X, INCX fpabs T4, A4 bdz LL(103) .align 4 LL(102): fpsub F1, T1, C1 LFDUX A1, X, INCX fpsub F2, T2, C2 LFDUX A2, X, INCX fpsub F3, T3, C3 LFDUX A3, X, INCX fpsub F4, T4, C4 LFDUX A4, X, INCX fpabs T5, A5 LFSDUX A1, X, INCX fpabs T6, A6 LFSDUX A2, X, INCX fpabs T7, A7 LFSDUX A3, X, INCX fpabs T8, A8 LFSDUX A4, X, INCX fpsel C1, F1, C1, T1 LFDUX A5, X, INCX fpsel C2, F2, C2, T2 LFDUX A6, X, INCX fpsel C3, F3, C3, T3 LFDUX A7, X, INCX fpsel C4, F4, C4, T4 LFDUX A8, X, INCX fpsub F5, T5, C1 LFSDUX A5, X, INCX fpsub F6, T6, C2 LFSDUX A6, X, INCX fpsub F7, T7, C3 LFSDUX A7, X, INCX fpsub F8, T8, C4 LFSDUX A8, X, INCX fpabs T1, A1 fpabs T2, A2 fpabs T3, A3 fpabs T4, A4 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 bdnz LL(102) .align 4 LL(103): fpabs T5, A5 fpabs T6, A6 fpabs T7, A7 fpabs T8, A8 fpsub F1, T1, C1 fpsub F2, T2, C2 fpsub F3, T3, C3 fpsub F4, T4, C4 fpsel C1, F1, C1, T1 fpsel C2, F2, C2, T2 fpsel C3, F3, C3, T3 fpsel C4, F4, C4, T4 fpsub F5, T5, C1 fpsub F6, T6, C2 fpsub F7, T7, C3 fpsub F8, T8, C4 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 .align 4 LL(105): andi. r0, N, 15 beq LL(120) andi. r0, N, 8 beq LL(106) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFSDUX A1, X, INCX LFSDUX A2, X, INCX LFSDUX A3, X, INCX LFSDUX A4, X, INCX fpabs A1, A1 fpabs A2, A2 fpabs A3, A3 fpabs A4, A4 fpsub F1, A1, C1 fpsub F2, A2, C2 fpsub F3, A3, C3 fpsub F4, A4, C4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 .align 4 LL(106): andi. r0, N, 4 beq LL(107) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX fabs A1, A1 fabs A2, A2 fabs A3, A3 fabs A4, A4 fsub F1, A1, C1 fsub F2, A2, C2 fsub F3, A3, C3 fsub F4, A4, C4 fsel C1, F1, C1, A1 fsel C2, F2, C2, A2 fsel C3, F3, C3, A3 fsel C4, F4, C4, A4 .align 4 LL(107): andi. r0, N, 2 beq LL(108) LFDUX A1, X, INCX LFDUX A2, X, INCX fabs A1, A1 fabs A2, A2 fsub F1, A1, C1 fsub F2, A2, C2 fsel C1, F1, C1, A1 fsel C2, F2, C2, A2 .align 4 LL(108): andi. r0, N, 1 beq LL(120) LFDUX A1, X, INCX fabs A1, A1 fsub F1, A1, C1 fsel C1, F1, C1, A1 .align 4 LL(120): fpsub F1, C2, C1 fpsub F2, C4, C3 fpsel C1, F1, C1, C2 fpsel C3, F2, C3, C4 fpsub F1, C3, C1 fpsel C1, F1, C1, C3 fsmtp C2, C1 fsub F1, C2, C1 fsel C1, F1, C1, C2 li RET, 0 sub XX, XX, INCX srawi. 
r0, NN, 3 mtspr CTR, r0 beq- LL(126) LFDUX A1, XX, INCX LFDUX A2, XX, INCX LFDUX A3, XX, INCX LFDUX A4, XX, INCX fabs T1, A1 fabs T2, A2 LFDUX A5, XX, INCX LFDUX A6, XX, INCX LFDUX A7, XX, INCX LFDUX A8, XX, INCX bdz LL(123) .align 4 LL(122): LFDUX A1, XX, INCX fabs T3, A3 addi RET, RET, 1 fcmpu cr0, C1, T1 beq cr0, LL(999) LFDUX A2, XX, INCX fabs T4, A4 addi RET, RET, 1 fcmpu cr0, C1, T2 beq cr0, LL(999) LFDUX A3, XX, INCX fabs T1, A5 addi RET, RET, 1 fcmpu cr0, C1, T3 beq cr0, LL(999) LFDUX A4, XX, INCX fabs T2, A6 addi RET, RET, 1 fcmpu cr0, C1, T4 beq cr0, LL(999) LFDUX A5, XX, INCX fabs T3, A7 addi RET, RET, 1 fcmpu cr0, C1, T1 beq cr0, LL(999) LFDUX A6, XX, INCX fabs T4, A8 addi RET, RET, 1 fcmpu cr0, C1, T2 beq cr0, LL(999) LFDUX A7, XX, INCX fabs T1, A1 addi RET, RET, 1 fcmpu cr0, C1, T3 beq cr0, LL(999) LFDUX A8, XX, INCX fabs T2, A2 addi RET, RET, 1 fcmpu cr0, C1, T4 beq cr0, LL(999) bdnz LL(122) .align 4 LL(123): fabs T3, A3 fabs T4, A4 addi RET, RET, 1 fcmpu cr0, C1, T1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T2 beq cr0, LL(999) fabs T1, A5 addi RET, RET, 1 fcmpu cr0, C1, T3 beq cr0, LL(999) fabs T2, A6 addi RET, RET, 1 fcmpu cr0, C1, T4 beq cr0, LL(999) fabs T3, A7 addi RET, RET, 1 fcmpu cr0, C1, T1 beq cr0, LL(999) fabs T4, A8 addi RET, RET, 1 fcmpu cr0, C1, T2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T3 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T4 beq cr0, LL(999) .align 4 LL(126): andi. r0, NN, 4 beq LL(127) LFDUX A1, XX, INCX LFDUX A2, XX, INCX LFDUX A3, XX, INCX LFDUX A4, XX, INCX fabs T1, A1 fabs T2, A2 fabs T3, A3 fabs T4, A4 addi RET, RET, 1 fcmpu cr0, C1, T1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T3 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T4 beq cr0, LL(999) .align 4 LL(127): andi. r0, NN, 2 beq LL(128) LFDUX A1, XX, INCX LFDUX A2, XX, INCX fabs T1, A1 fabs T2, A2 addi RET, RET, 1 fcmpu cr0, C1, T1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, T2 beq cr0, LL(999) .align 4 LL(128): addi RET, RET, 1 .align 4 LL(999): li r10, 16 addi SP, SP, -16 mr r3, RET lfpdux f27, SP, r10 lfpdux f26, SP, r10 lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/iamin_ppc440.S000066400000000000000000000223651313527062700201030ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define RET r3 #define X r4 #define INCX r5 #define N r6 #define NN r7 #define XX r8 #define PRE r9 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(r3) LDINT INCX, 0(INCX) #else mr N, r3 #endif li RET, 0 slwi INCX, INCX, BASE_SHIFT sub X, X, INCX li PRE, 3 * 16 * SIZE mr NN, N mr XX, X cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFDUX f1, X, INCX fabs f0, f1 fabs f2, f1 fabs f3, f1 fabs f4, f1 fabs f5, f1 subi N, N, 1 fabs f6, f1 srawi. 
r0, N, 4 fabs f7, f1 mtspr CTR, r0 fabs f1, f1 beq- LL(150) LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX fabs f8, f24 LFDUX f24, X, INCX fabs f9, f25 LFDUX f25, X, INCX fabs f10, f26 LFDUX f26, X, INCX fabs f11, f27 LFDUX f27, X, INCX fabs f12, f28 LFDUX f28, X, INCX fabs f13, f29 LFDUX f29, X, INCX fabs f14, f30 LFDUX f30, X, INCX fabs f15, f31 LFDUX f31, X, INCX bdz LL(120) .align 4 LL(110): fsub f16, f0, f8 #ifdef PPCG4 dcbt X, PRE #endif fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fabs f8, f24 LFDUX f24, X, INCX fsel f1, f17, f9, f1 fabs f9, f25 LFDUX f25, X, INCX fsel f2, f18, f10, f2 fabs f10, f26 LFDUX f26, X, INCX fsel f3, f19, f11, f3 fabs f11, f27 LFDUX f27, X, INCX fsel f4, f20, f12, f4 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif fabs f12, f28 LFDUX f28, X, INCX fsel f5, f21, f13, f5 fabs f13, f29 LFDUX f29, X, INCX fsel f6, f22, f14, f6 fabs f14, f30 LFDUX f30, X, INCX fsel f7, f23, f15, f7 fabs f15, f31 LFDUX f31, X, INCX fsub f16, f0, f8 #ifdef PPCG4 dcbt X, PRE #endif fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fabs f8, f24 LFDUX f24, X, INCX fsel f1, f17, f9, f1 fabs f9, f25 LFDUX f25, X, INCX fsel f2, f18, f10, f2 fabs f10, f26 LFDUX f26, X, INCX fsel f3, f19, f11, f3 fabs f11, f27 LFDUX f27, X, INCX fsel f4, f20, f12, f4 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif fabs f12, f28 LFDUX f28, X, INCX fsel f5, f21, f13, f5 fabs f13, f29 LFDUX f29, X, INCX fsel f6, f22, f14, f6 fabs f14, f30 LFDUX f30, X, INCX fsel f7, f23, f15, f7 fabs f15, f31 LFDUX f31, X, INCX bdnz LL(110) .align 4 LL(120): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fabs f8, f24 fsel f1, f17, f9, f1 fabs f9, f25 fsel f2, f18, f10, f2 fabs f10, f26 fsel f3, f19, f11, f3 fabs f11, f27 fsel f4, f20, f12, f4 fabs f12, f28 fsel f5, f21, f13, f5 fabs f13, f29 fsel f6, f22, f14, f6 fabs f14, f30 fsel f7, f23, f15, f7 fabs f15, f31 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f8, f0 fsel f1, f17, f9, f1 fsel f2, f18, f10, f2 fsel f3, f19, f11, f3 fsel f4, f20, f12, f4 fsel f5, f21, f13, f5 fsel f6, f22, f14, f6 fsel f7, f23, f15, f7 .align 4 LL(150): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX fabs f8, f8 fsub f16, f1, f8 fsel f1, f16, f8, f1 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f1, f0 fsel f2, f9, f3, f2 fsel f4, f10, f5, f4 fsel f6, f11, f7, f6 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f2, f0 fsel f4, f9, f6, f4 fsub f8, f0, f4 fsel f1, f8, f4, f0 .align 4 LL(1000): srawi. 
r0, NN, 3 mtspr CTR, r0 beq- LL(1150) LFDUX f24, XX, INCX LFDUX f25, XX, INCX LFDUX f26, XX, INCX LFDUX f27, XX, INCX LFDUX f28, XX, INCX LFDUX f29, XX, INCX LFDUX f30, XX, INCX LFDUX f31, XX, INCX bdz LL(1120) .align 4 LL(1110): fabs f8, f24 LFDUX f24, XX, INCX fabs f9, f25 LFDUX f25, XX, INCX fabs f10, f26 LFDUX f26, XX, INCX fabs f11, f27 LFDUX f27, XX, INCX #ifdef PPCG4 dcbt XX, PRE #endif fabs f12, f28 LFDUX f28, XX, INCX fabs f13, f29 LFDUX f29, XX, INCX fabs f14, f30 LFDUX f30, XX, INCX fabs f15, f31 LFDUX f31, XX, INCX #if defined(PPCG4) && defined(DOUBLE) dcbt XX, PRE #endif addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) bdnz LL(1110) .align 4 LL(1120): fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) .align 4 LL(1150): andi. r0, NN, 7 mtspr CTR, r0 beq LL(9999) .align 4 LL(1160): LFDUX f8, XX, INCX fabs f8, f8 addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) bdnz LL(1160) .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/imax.S000066400000000000000000000313001313527062700166370ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define RET r3 #define X r4 #define INCX r5 #define N r6 #define NN r7 #define XX r8 #define PREA r9 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(r3) LDINT INCX, 0(INCX) #else mr N, r3 #endif li RET, 0 mr NN, N mr XX, X slwi INCX, INCX, BASE_SHIFT li PREA, L1_PREFETCHSIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFD f1, 0 * SIZE(X) add X, X, INCX fmr f0, f1 fmr f2, f1 fmr f3, f1 fmr f4, f1 fmr f5, f1 fmr f6, f1 fmr f7, f1 subi N, N, 1 cmpwi cr0, INCX, SIZE bne- cr0, LL(100) srawi. 
r0, N, 4 mtspr CTR, r0 beq- cr0, LL(50) LFD f16, 0 * SIZE(X) LFD f17, 1 * SIZE(X) LFD f18, 2 * SIZE(X) LFD f19, 3 * SIZE(X) LFD f20, 4 * SIZE(X) LFD f21, 5 * SIZE(X) LFD f22, 6 * SIZE(X) LFD f23, 7 * SIZE(X) LFD f24, 8 * SIZE(X) LFD f25, 9 * SIZE(X) LFD f26, 10 * SIZE(X) LFD f27, 11 * SIZE(X) LFD f28, 12 * SIZE(X) LFD f29, 13 * SIZE(X) LFD f30, 14 * SIZE(X) LFD f31, 15 * SIZE(X) fsub f8, f0, f16 fsub f9, f1, f17 fsub f10, f2, f18 fsub f11, f3, f19 fsub f12, f4, f20 fsub f13, f5, f21 fsub f14, f6, f22 fsub f15, f7, f23 bdz LL(20) .align 4 LL(10): fsel f0, f8, f0, f16 fsub f8, f0, f24 fsel f1, f9, f1, f17 fsub f9, f1, f25 fsel f2, f10, f2, f18 fsub f10, f2, f26 fsel f3, f11, f3, f19 fsub f11, f3, f27 LFD f16, 16 * SIZE(X) LFD f17, 17 * SIZE(X) LFD f18, 18 * SIZE(X) LFD f19, 19 * SIZE(X) fsel f4, f12, f4, f20 fsub f12, f4, f28 fsel f5, f13, f5, f21 fsub f13, f5, f29 fsel f6, f14, f6, f22 fsub f14, f6, f30 fsel f7, f15, f7, f23 fsub f15, f7, f31 LFD f20, 20 * SIZE(X) LFD f21, 21 * SIZE(X) LFD f22, 22 * SIZE(X) LFD f23, 23 * SIZE(X) fsel f0, f8, f0, f24 fsub f8, f0, f16 fsel f1, f9, f1, f25 fsub f9, f1, f17 fsel f2, f10, f2, f26 fsub f10, f2, f18 fsel f3, f11, f3, f27 fsub f11, f3, f19 LFD f24, 24 * SIZE(X) LFD f25, 25 * SIZE(X) LFD f26, 26 * SIZE(X) LFD f27, 27 * SIZE(X) fsel f4, f12, f4, f28 fsub f12, f4, f20 fsel f5, f13, f5, f29 fsub f13, f5, f21 fsel f6, f14, f6, f30 fsub f14, f6, f22 fsel f7, f15, f7, f31 fsub f15, f7, f23 LFD f28, 28 * SIZE(X) LFD f29, 29 * SIZE(X) LFD f30, 30 * SIZE(X) LFD f31, 31 * SIZE(X) #ifndef POWER6 L1_PREFETCH X, PREA #endif addi X, X, 16 * SIZE #ifdef POWER6 L1_PREFETCH X, PREA #endif bdnz LL(10) .align 4 LL(20): fsel f0, f8, f0, f16 fsub f8, f0, f24 fsel f1, f9, f1, f17 fsub f9, f1, f25 fsel f2, f10, f2, f18 fsub f10, f2, f26 fsel f3, f11, f3, f19 fsub f11, f3, f27 fsel f4, f12, f4, f20 fsub f12, f4, f28 fsel f5, f13, f5, f21 fsub f13, f5, f29 fsel f6, f14, f6, f22 fsub f14, f6, f30 fsel f7, f15, f7, f23 fsub f15, f7, f31 fsel f0, f8, f0, f24 fsel f1, f9, f1, f25 fsel f2, f10, f2, f26 fsel f3, f11, f3, f27 fsel f4, f12, f4, f28 fsel f5, f13, f5, f29 fsel f6, f14, f6, f30 fsel f7, f15, f7, f31 addi X, X, 16 * SIZE .align 4 LL(50): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) addi X, X, 1 * SIZE fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCX srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(150) LFDUX f16, X, INCX LFDUX f17, X, INCX LFDUX f18, X, INCX LFDUX f19, X, INCX LFDUX f20, X, INCX LFDUX f21, X, INCX LFDUX f22, X, INCX LFDUX f23, X, INCX LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX fsub f8, f0, f16 fsub f9, f1, f17 fsub f10, f2, f18 fsub f11, f3, f19 fsub f12, f4, f20 fsub f13, f5, f21 fsub f14, f6, f22 fsub f15, f7, f23 bdz LL(120) .align 4 LL(110): fsel f0, f8, f0, f16 fsub f8, f0, f24 fsel f1, f9, f1, f17 fsub f9, f1, f25 fsel f2, f10, f2, f18 fsub f10, f2, f26 fsel f3, f11, f3, f19 fsub f11, f3, f27 LFDUX f16, X, INCX LFDUX f17, X, INCX LFDUX f18, X, INCX LFDUX f19, X, INCX fsel f4, f12, f4, f20 fsub f12, f4, f28 fsel f5, f13, f5, f21 fsub f13, f5, f29 fsel f6, f14, f6, f22 fsub f14, f6, f30 fsel f7, f15, f7, f23 fsub f15, f7, f31 LFDUX f20, X, INCX LFDUX f21, X, INCX LFDUX f22, X, INCX LFDUX f23, X, INCX fsel f0, f8, f0, f24 fsub f8, f0, f16 fsel f1, f9, f1, f25 fsub f9, f1, f17 fsel f2, f10, f2, f26 fsub f10, f2, f18 fsel f3, f11, f3, f27 fsub f11, f3, f19 LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX fsel f4, f12, f4, f28 fsub f12, f4, f20 fsel f5, f13, f5, f29 fsub f13, f5, f21 fsel f6, f14, f6, f30 fsub f14, f6, f22 fsel f7, f15, f7, f31 fsub f15, f7, f23 LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX bdnz LL(110) .align 4 LL(120): fsel f0, f8, f0, f16 fsub f8, f0, f24 fsel f1, f9, f1, f17 fsub f9, f1, f25 fsel f2, f10, f2, f18 fsub f10, f2, f26 fsel f3, f11, f3, f19 fsub f11, f3, f27 fsel f4, f12, f4, f20 fsub f12, f4, f28 fsel f5, f13, f5, f21 fsub f13, f5, f29 fsel f6, f14, f6, f22 fsub f14, f6, f30 fsel f7, f15, f7, f23 fsub f15, f7, f31 fsel f0, f8, f0, f24 fsel f1, f9, f1, f25 fsel f2, f10, f2, f26 fsel f3, f11, f3, f27 fsel f4, f12, f4, f28 fsel f5, f13, f5, f29 fsel f6, f14, f6, f30 fsel f7, f15, f7, f31 .align 4 LL(150): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f0, f1 fsel f2, f9, f2, f3 fsel f4, f10, f4, f5 fsel f6, f11, f6, f7 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f0, f2 fsel f4, f9, f4, f6 fsub f8, f0, f4 fsel f1, f8, f0, f4 .align 4 LL(1000): cmpwi cr0, INCX, SIZE bne- cr0, LL(1100) srawi. 
r0, NN, 3 mtspr CTR, r0 beq- cr0, LL(1050) LFD f8, 0 * SIZE(XX) LFD f9, 1 * SIZE(XX) LFD f10, 2 * SIZE(XX) LFD f11, 3 * SIZE(XX) LFD f12, 4 * SIZE(XX) LFD f13, 5 * SIZE(XX) LFD f14, 6 * SIZE(XX) LFD f15, 7 * SIZE(XX) bdz LL(1020) .align 4 LL(1010): addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) LFD f8, 8 * SIZE(XX) LFD f9, 9 * SIZE(XX) LFD f10, 10 * SIZE(XX) LFD f11, 11 * SIZE(XX) addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) LFD f12, 12 * SIZE(XX) LFD f13, 13 * SIZE(XX) LFD f14, 14 * SIZE(XX) LFD f15, 15 * SIZE(XX) addi XX, XX, 8 * SIZE bdnz LL(1010) .align 4 LL(1020): addi XX, XX, 8 * SIZE addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) .align 4 LL(1050): andi. r0, NN, 7 mtspr CTR, r0 beq LL(9999) .align 4 LL(1060): LFD f8, 0 * SIZE(XX) addi XX, XX, 1 * SIZE addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) bdnz LL(1060) b LL(9999) .align 4 LL(1100): sub XX, XX, INCX srawi. r0, NN, 3 mtspr CTR, r0 beq- LL(1150) LFDUX f8, XX, INCX LFDUX f9, XX, INCX LFDUX f10, XX, INCX LFDUX f11, XX, INCX LFDUX f12, XX, INCX LFDUX f13, XX, INCX LFDUX f14, XX, INCX LFDUX f15, XX, INCX bdz LL(1120) .align 4 LL(1110): addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) LFDUX f8, XX, INCX LFDUX f9, XX, INCX LFDUX f10, XX, INCX LFDUX f11, XX, INCX addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) LFDUX f12, XX, INCX LFDUX f13, XX, INCX LFDUX f14, XX, INCX LFDUX f15, XX, INCX bdnz LL(1110) .align 4 LL(1120): addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) .align 4 LL(1150): andi. 
r0, NN, 7 mtspr CTR, r0 beq LL(9999) .align 4 LL(1160): LFDUX f8, XX, INCX addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) bdnz LL(1160) .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/imax_hummer.S000066400000000000000000000344251313527062700202270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define INCX2 r6 #define X2 r7 #define XX r8 #define RET r9 #define NN r10 #define C1 f1 #define C2 f0 #define C3 f2 #define C4 f3 #define A1 f4 #define A2 f5 #define A3 f6 #define A4 f7 #define A5 f8 #define A6 f9 #define A7 f10 #define A8 f11 #define F1 f12 #define F2 f13 #define F3 f14 #define F4 f15 #define F5 f16 #define F6 f17 #define F7 f18 #define F8 f19 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 stfpdux f19, SP, r10 #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX li RET, 0 cmpwi cr0, N, 0 ble LL(999) mr NN, N cmpwi cr0, INCX, 0 ble LL(999) LFD C1, 0 * SIZE(X) addi N, N, -1 cmpwi cr0, N, 0 li RET, 1 ble LL(999) fsmfp C1, C1 mr XX, X fpmr C2, C1 add X, X, INCX fpmr C3, C1 fpmr C4, C1 cmpwi cr0, INCX, SIZE bne LL(100) andi. 
r0, X, 2 * SIZE - 1 beq LL(05) LFD C2, 0 * SIZE(X) add X, X, INCX addi N, N, -1 cmpwi cr0, N, 0 ble LL(20) .align 4 LL(05): sub X, X, INCX2 srawi. r0, N, 4 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 LFPDUX A6, X, INCX2 LFPDUX A7, X, INCX2 LFPDUX A8, X, INCX2 bdz LL(13) .align 4 LL(12): fpsub F1, C1, A1 fpsub F2, C2, A2 fpsub F3, C3, A3 fpsub F4, C4, A4 fpsel C1, F1, C1, A1 LFPDUX A1, X, INCX2 fpsel C2, F2, C2, A2 LFPDUX A2, X, INCX2 fpsel C3, F3, C3, A3 LFPDUX A3, X, INCX2 fpsel C4, F4, C4, A4 LFPDUX A4, X, INCX2 fpsub F5, C1, A5 fpsub F6, C2, A6 fpsub F7, C3, A7 fpsub F8, C4, A8 fpsel C1, F5, C1, A5 LFPDUX A5, X, INCX2 fpsel C2, F6, C2, A6 LFPDUX A6, X, INCX2 fpsel C3, F7, C3, A7 LFPDUX A7, X, INCX2 fpsel C4, F8, C4, A8 LFPDUX A8, X, INCX2 bdnz LL(12) .align 4 LL(13): fpsub F1, C1, A1 fpsub F2, C2, A2 fpsub F3, C3, A3 fpsub F4, C4, A4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 fpsub F5, C1, A5 fpsub F6, C2, A6 fpsub F7, C3, A7 fpsub F8, C4, A8 fpsel C1, F5, C1, A5 fpsel C2, F6, C2, A6 fpsel C3, F7, C3, A7 fpsel C4, F8, C4, A8 .align 4 LL(15): andi. r0, N, 15 beq LL(20) andi. r0, N, 8 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 fpsub F1, C1, A1 fpsub F2, C2, A2 fpsub F3, C3, A3 fpsub F4, C4, A4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 .align 4 LL(16): andi. r0, N, 4 beq LL(17) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 fpsub F1, C1, A1 fpsub F2, C2, A2 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 .align 4 LL(17): andi. r0, N, 2 beq LL(18) LFPDUX A1, X, INCX2 fpsub F1, C1, A1 fpsel C1, F1, C1, A1 .align 4 LL(18): andi. r0, N, 1 beq LL(20) LFDUX A1, X, INCX2 fsub F1, C1, A1 fsel C1, F1, C1, A1 .align 4 LL(20): fpsub F1, C1, C2 fpsub F2, C3, C4 fpsel C1, F1, C1, C2 fpsel C3, F2, C3, C4 fpsub F1, C1, C3 fpsel C1, F1, C1, C3 fsmtp C2, C1 fsub F1, C1, C2 fsel C1, F1, C1, C2 li RET, 0 fsmfp C1, C1 andi. r0, XX, 2 * SIZE - 1 beq LL(21) LFD A1, 0 * SIZE(XX) add XX, XX, INCX addi NN, NN, -1 addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) .align 4 LL(21): sub XX, XX, INCX2 srawi. 
r0, NN, 4 mtspr CTR, r0 beq- LL(25) LFPDUX A1, XX, INCX2 LFPDUX A2, XX, INCX2 LFPDUX A3, XX, INCX2 LFPDUX A4, XX, INCX2 LFPDUX A5, XX, INCX2 LFPDUX A6, XX, INCX2 LFPDUX A7, XX, INCX2 LFPDUX A8, XX, INCX2 bdz LL(23) .align 4 LL(22): addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A1 LFPDUX A1, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A2 LFPDUX A2, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A3 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A3 LFPDUX A3, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A4 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A4 LFPDUX A4, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A5 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A5 LFPDUX A5, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A6 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A6 LFPDUX A6, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A7 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A7 LFPDUX A7, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A8 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A8 LFPDUX A8, XX, INCX2 beq cr0, LL(999) bdnz LL(22) .align 4 LL(23): addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A3 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A3 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A4 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A4 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A5 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A5 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A6 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A6 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A7 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A7 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A8 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A8 beq cr0, LL(999) .align 4 LL(25): andi. r0, NN, 8 beq LL(26) LFPDUX A1, XX, INCX2 LFPDUX A2, XX, INCX2 LFPDUX A3, XX, INCX2 LFPDUX A4, XX, INCX2 addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A3 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A3 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A4 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A4 beq cr0, LL(999) .align 4 LL(26): andi. r0, NN, 4 beq LL(27) LFPDUX A1, XX, INCX2 LFPDUX A2, XX, INCX2 addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A2 beq cr0, LL(999) .align 4 LL(27): andi. r0, NN, 2 beq LL(28) LFPDUX A1, XX, INCX2 addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A1 beq cr0, LL(999) .align 4 LL(28): addi RET, RET, 1 b LL(999) .align 4 LL(100): sub X, X, INCX srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(105) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFSDUX A1, X, INCX LFSDUX A2, X, INCX LFSDUX A3, X, INCX LFSDUX A4, X, INCX LFDUX A5, X, INCX LFDUX A6, X, INCX LFDUX A7, X, INCX LFDUX A8, X, INCX bdz LL(103) .align 4 LL(102): fpsub F1, C1, A1 LFSDUX A5, X, INCX fpsub F2, C2, A2 LFSDUX A6, X, INCX fpsub F3, C3, A3 LFSDUX A7, X, INCX fpsub F4, C4, A4 LFSDUX A8, X, INCX fpsel C1, F1, C1, A1 LFDUX A1, X, INCX fpsel C2, F2, C2, A2 LFDUX A2, X, INCX fpsel C3, F3, C3, A3 LFDUX A3, X, INCX fpsel C4, F4, C4, A4 LFDUX A4, X, INCX fpsub F5, C1, A5 LFSDUX A1, X, INCX fpsub F6, C2, A6 LFSDUX A2, X, INCX fpsub F7, C3, A7 LFSDUX A3, X, INCX fpsub F8, C4, A8 LFSDUX A4, X, INCX fpsel C1, F5, C1, A5 LFDUX A5, X, INCX fpsel C2, F6, C2, A6 LFDUX A6, X, INCX fpsel C3, F7, C3, A7 LFDUX A7, X, INCX fpsel C4, F8, C4, A8 LFDUX A8, X, INCX bdnz LL(102) .align 4 LL(103): fpsub F1, C1, A1 LFSDUX A5, X, INCX fpsub F2, C2, A2 LFSDUX A6, X, INCX fpsub F3, C3, A3 LFSDUX A7, X, INCX fpsub F4, C4, A4 LFSDUX A8, X, INCX fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 fpsub F5, C1, A5 fpsub F6, C2, A6 fpsub F7, C3, A7 fpsub F8, C4, A8 fpsel C1, F5, C1, A5 fpsel C2, F6, C2, A6 fpsel C3, F7, C3, A7 fpsel C4, F8, C4, A8 .align 4 LL(105): andi. r0, N, 15 beq LL(120) andi. r0, N, 8 beq LL(106) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFSDUX A1, X, INCX LFSDUX A2, X, INCX LFSDUX A3, X, INCX LFSDUX A4, X, INCX fpsub F1, C1, A1 fpsub F2, C2, A2 fpsub F3, C3, A3 fpsub F4, C4, A4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 .align 4 LL(106): andi. r0, N, 4 beq LL(107) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX fsub F1, C1, A1 fsub F2, C2, A2 fsub F3, C3, A3 fsub F4, C4, A4 fsel C1, F1, C1, A1 fsel C2, F2, C2, A2 fsel C3, F3, C3, A3 fsel C4, F4, C4, A4 .align 4 LL(107): andi. r0, N, 2 beq LL(108) LFDUX A1, X, INCX LFDUX A2, X, INCX fsub F1, C1, A1 fsub F2, C2, A2 fsel C1, F1, C1, A1 fsel C2, F2, C2, A2 .align 4 LL(108): andi. r0, N, 1 beq LL(120) LFDUX A1, X, INCX fsub F1, C1, A1 fsel C1, F1, C1, A1 .align 4 LL(120): fpsub F1, C1, C2 fpsub F2, C3, C4 fpsel C1, F1, C1, C2 fpsel C3, F2, C3, C4 fpsub F1, C1, C3 fpsel C1, F1, C1, C3 fsmtp C2, C1 fsub F1, C1, C2 fsel C1, F1, C1, C2 li RET, 0 sub XX, XX, INCX srawi. 
r0, NN, 3 mtspr CTR, r0 beq- LL(126) LFDUX A1, XX, INCX LFDUX A2, XX, INCX LFDUX A3, XX, INCX LFDUX A4, XX, INCX LFDUX A5, XX, INCX LFDUX A6, XX, INCX LFDUX A7, XX, INCX LFDUX A8, XX, INCX bdz LL(123) .align 4 LL(122): addi RET, RET, 1 fcmpu cr0, C1, A1 LFDUX A1, XX, INCX beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A2 LFDUX A2, XX, INCX beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A3 LFDUX A3, XX, INCX beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A4 LFDUX A4, XX, INCX beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A5 LFDUX A5, XX, INCX beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A6 LFDUX A6, XX, INCX beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A7 LFDUX A7, XX, INCX beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A8 LFDUX A8, XX, INCX beq cr0, LL(999) bdnz LL(122) .align 4 LL(123): addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A3 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A4 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A5 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A6 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A7 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A8 beq cr0, LL(999) .align 4 LL(126): andi. r0, NN, 4 beq LL(127) LFDUX A1, XX, INCX LFDUX A2, XX, INCX LFDUX A3, XX, INCX LFDUX A4, XX, INCX addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A3 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A4 beq cr0, LL(999) .align 4 LL(127): andi. r0, NN, 2 beq LL(128) LFDUX A1, XX, INCX LFDUX A2, XX, INCX addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A2 beq cr0, LL(999) .align 4 LL(128): addi RET, RET, 1 .align 4 LL(999): li r10, 16 addi SP, SP, -16 mr r3, RET lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/imax_ppc440.S000066400000000000000000000211021313527062700177300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define RET r3 #define X r4 #define INCX r5 #define N r6 #define NN r7 #define XX r8 #define PRE r9 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(r3) LDINT INCX, 0(INCX) #else mr N, r3 #endif li RET, 0 li PRE, 3 * 16 * SIZE slwi INCX, INCX, BASE_SHIFT sub X, X, INCX mr NN, N mr XX, X cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFDUX f1, X, INCX fmr f0, f1 fmr f2, f1 fmr f3, f1 fmr f4, f1 subi N, N, 1 fmr f5, f1 srawi. 
r0, N, 4 fmr f6, f1 mtspr CTR, r0 fmr f7, f1 beq- LL(150) LFDUX f16, X, INCX LFDUX f17, X, INCX LFDUX f18, X, INCX LFDUX f19, X, INCX LFDUX f20, X, INCX LFDUX f21, X, INCX LFDUX f22, X, INCX LFDUX f23, X, INCX LFDUX f24, X, INCX fsub f8, f0, f16 LFDUX f25, X, INCX fsub f9, f1, f17 LFDUX f26, X, INCX fsub f10, f2, f18 LFDUX f27, X, INCX fsub f11, f3, f19 LFDUX f28, X, INCX fsub f12, f4, f20 LFDUX f29, X, INCX fsub f13, f5, f21 LFDUX f30, X, INCX fsub f14, f6, f22 LFDUX f31, X, INCX fsub f15, f7, f23 bdz LL(120) .align 4 LL(110): fsel f0, f8, f0, f16 LFDUX f16, X, INCX fsub f8, f0, f24 #ifdef PPCG4 dcbt X, PRE #endif fsel f1, f9, f1, f17 LFDUX f17, X, INCX fsub f9, f1, f25 fsel f2, f10, f2, f18 LFDUX f18, X, INCX fsub f10, f2, f26 fsel f3, f11, f3, f19 LFDUX f19, X, INCX fsub f11, f3, f27 fsel f4, f12, f4, f20 LFDUX f20, X, INCX fsub f12, f4, f28 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif fsel f5, f13, f5, f21 LFDUX f21, X, INCX fsub f13, f5, f29 fsel f6, f14, f6, f22 LFDUX f22, X, INCX fsub f14, f6, f30 fsel f7, f15, f7, f23 LFDUX f23, X, INCX fsub f15, f7, f31 fsel f0, f8, f0, f24 LFDUX f24, X, INCX fsub f8, f0, f16 #ifdef PPCG4 dcbt X, PRE #endif fsel f1, f9, f1, f25 LFDUX f25, X, INCX fsub f9, f1, f17 fsel f2, f10, f2, f26 LFDUX f26, X, INCX fsub f10, f2, f18 fsel f3, f11, f3, f27 LFDUX f27, X, INCX fsub f11, f3, f19 fsel f4, f12, f4, f28 LFDUX f28, X, INCX fsub f12, f4, f20 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif fsel f5, f13, f5, f29 LFDUX f29, X, INCX fsub f13, f5, f21 fsel f6, f14, f6, f30 LFDUX f30, X, INCX fsub f14, f6, f22 fsel f7, f15, f7, f31 LFDUX f31, X, INCX fsub f15, f7, f23 bdnz LL(110) .align 4 LL(120): fsel f0, f8, f0, f16 fsub f8, f0, f24 fsel f1, f9, f1, f17 fsub f9, f1, f25 fsel f2, f10, f2, f18 fsub f10, f2, f26 fsel f3, f11, f3, f19 fsub f11, f3, f27 fsel f4, f12, f4, f20 fsub f12, f4, f28 fsel f5, f13, f5, f21 fsub f13, f5, f29 fsel f6, f14, f6, f22 fsub f14, f6, f30 fsel f7, f15, f7, f23 fsub f15, f7, f31 fsel f0, f8, f0, f24 fsel f1, f9, f1, f25 fsel f2, f10, f2, f26 fsel f3, f11, f3, f27 fsel f4, f12, f4, f28 fsel f5, f13, f5, f29 fsel f6, f14, f6, f30 fsel f7, f15, f7, f31 .align 4 LL(150): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f0, f1 fsel f2, f9, f2, f3 fsel f4, f10, f4, f5 fsel f6, f11, f6, f7 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f0, f2 fsel f4, f9, f4, f6 fsub f8, f0, f4 fsel f1, f8, f0, f4 .align 4 LL(1000): srawi. 
r0, NN, 3 mtspr CTR, r0 beq- LL(1150) LFDUX f8, XX, INCX LFDUX f9, XX, INCX LFDUX f10, XX, INCX LFDUX f11, XX, INCX LFDUX f12, XX, INCX LFDUX f13, XX, INCX LFDUX f14, XX, INCX LFDUX f15, XX, INCX bdz LL(1120) .align 4 LL(1110): addi RET, RET, 1 fcmpu cr0, f1, f8 LFDUX f8, XX, INCX beq cr0, LL(9999) #ifdef PPCG4 dcbt XX, PRE #endif addi RET, RET, 1 fcmpu cr0, f1, f9 LFDUX f9, XX, INCX beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 LFDUX f10, XX, INCX beq cr0, LL(9999) #ifdef PPCG4 dcbt XX, PRE #endif addi RET, RET, 1 fcmpu cr0, f1, f11 LFDUX f11, XX, INCX beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f12 LFDUX f12, XX, INCX beq cr0, LL(9999) #ifdef PPCG4 dcbt XX, PRE #endif addi RET, RET, 1 fcmpu cr0, f1, f13 LFDUX f13, XX, INCX beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 LFDUX f14, XX, INCX beq cr0, LL(9999) #ifdef PPCG4 dcbt XX, PRE #endif addi RET, RET, 1 fcmpu cr0, f1, f15 LFDUX f15, XX, INCX beq cr0, LL(9999) bdnz LL(1110) .align 4 LL(1120): addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) .align 4 LL(1150): andi. r0, NN, 7 mtspr CTR, r0 beq LL(9999) .align 4 LL(1160): LFDUX f8, XX, INCX addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) bdnz LL(1160) .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/imin.S000066400000000000000000000312101313527062700166350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define RET r3 #define X r4 #define INCX r5 #define N r6 #define NN r7 #define XX r8 #define PREA r9 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(r3) LDINT INCX, 0(INCX) #else mr N, r3 #endif li RET, 0 mr NN, N mr XX, X slwi INCX, INCX, BASE_SHIFT li PREA, L1_PREFETCHSIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFD f1, 0 * SIZE(X) add X, X, INCX fmr f0, f1 fmr f2, f1 fmr f3, f1 fmr f4, f1 fmr f5, f1 fmr f6, f1 fmr f7, f1 subi N, N, 1 cmpwi cr0, INCX, SIZE bne- cr0, LL(100) srawi. 
r0, N, 4 mtspr CTR, r0 beq- cr0, LL(50) LFD f16, 0 * SIZE(X) LFD f17, 1 * SIZE(X) LFD f18, 2 * SIZE(X) LFD f19, 3 * SIZE(X) LFD f20, 4 * SIZE(X) LFD f21, 5 * SIZE(X) LFD f22, 6 * SIZE(X) LFD f23, 7 * SIZE(X) LFD f24, 8 * SIZE(X) LFD f25, 9 * SIZE(X) LFD f26, 10 * SIZE(X) LFD f27, 11 * SIZE(X) LFD f28, 12 * SIZE(X) LFD f29, 13 * SIZE(X) LFD f30, 14 * SIZE(X) LFD f31, 15 * SIZE(X) fsub f8, f0, f16 fsub f9, f1, f17 fsub f10, f2, f18 fsub f11, f3, f19 fsub f12, f4, f20 fsub f13, f5, f21 fsub f14, f6, f22 fsub f15, f7, f23 bdz LL(20) .align 4 LL(10): fsel f0, f8, f16, f0 fsub f8, f0, f24 fsel f1, f9, f17, f1 fsub f9, f1, f25 fsel f2, f10, f18, f2 fsub f10, f2, f26 fsel f3, f11, f19, f3 fsub f11, f3, f27 LFD f16, 16 * SIZE(X) LFD f17, 17 * SIZE(X) LFD f18, 18 * SIZE(X) LFD f19, 19 * SIZE(X) fsel f4, f12, f20, f4 fsub f12, f4, f28 fsel f5, f13, f21, f5 fsub f13, f5, f29 fsel f6, f14, f22, f6 fsub f14, f6, f30 fsel f7, f15, f23, f7 fsub f15, f7, f31 LFD f20, 20 * SIZE(X) LFD f21, 21 * SIZE(X) LFD f22, 22 * SIZE(X) LFD f23, 23 * SIZE(X) fsel f0, f8, f24, f0 fsub f8, f0, f16 fsel f1, f9, f25, f1 fsub f9, f1, f17 fsel f2, f10, f26, f2 fsub f10, f2, f18 fsel f3, f11, f27, f3 fsub f11, f3, f19 LFD f24, 24 * SIZE(X) LFD f25, 25 * SIZE(X) LFD f26, 26 * SIZE(X) LFD f27, 27 * SIZE(X) fsel f4, f12, f28, f4 fsub f12, f4, f20 fsel f5, f13, f29, f5 fsub f13, f5, f21 fsel f6, f14, f30, f6 fsub f14, f6, f22 fsel f7, f15, f31, f7 fsub f15, f7, f23 LFD f28, 28 * SIZE(X) LFD f29, 29 * SIZE(X) LFD f30, 30 * SIZE(X) LFD f31, 31 * SIZE(X) #ifndef POWER6 L1_PREFETCH X, PREA #endif addi X, X, 16 * SIZE #ifdef POWER6 L1_PREFETCH X, PREA #endif bdnz LL(10) .align 4 LL(20): fsel f0, f8, f16, f0 fsub f8, f0, f24 fsel f1, f9, f17, f1 fsub f9, f1, f25 fsel f2, f10, f18, f2 fsub f10, f2, f26 fsel f3, f11, f19, f3 fsub f11, f3, f27 fsel f4, f12, f20, f4 fsub f12, f4, f28 fsel f5, f13, f21, f5 fsub f13, f5, f29 fsel f6, f14, f22, f6 fsub f14, f6, f30 fsel f7, f15, f23, f7 fsub f15, f7, f31 fsel f0, f8, f24, f0 fsel f1, f9, f25, f1 fsel f2, f10, f26, f2 fsel f3, f11, f27, f3 fsel f4, f12, f28, f4 fsel f5, f13, f29, f5 fsel f6, f14, f30, f6 fsel f7, f15, f31, f7 addi X, X, 16 * SIZE .align 4 LL(50): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) addi X, X, 1 * SIZE fsub f16, f1, f8 fsel f1, f16, f8, f1 bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCX srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(150) LFDUX f16, X, INCX LFDUX f17, X, INCX LFDUX f18, X, INCX LFDUX f19, X, INCX LFDUX f20, X, INCX LFDUX f21, X, INCX LFDUX f22, X, INCX LFDUX f23, X, INCX LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX fsub f8, f0, f16 fsub f9, f1, f17 fsub f10, f2, f18 fsub f11, f3, f19 fsub f12, f4, f20 fsub f13, f5, f21 fsub f14, f6, f22 fsub f15, f7, f23 bdz LL(120) .align 4 LL(110): fsel f0, f8, f16, f0 fsub f8, f0, f24 fsel f1, f9, f17, f1 fsub f9, f1, f25 fsel f2, f10, f18, f2 fsub f10, f2, f26 fsel f3, f11, f19, f3 fsub f11, f3, f27 LFDUX f16, X, INCX LFDUX f17, X, INCX LFDUX f18, X, INCX LFDUX f19, X, INCX fsel f4, f12, f20, f4 fsub f12, f4, f28 fsel f5, f13, f21, f5 fsub f13, f5, f29 fsel f6, f14, f22, f6 fsub f14, f6, f30 fsel f7, f15, f23, f7 fsub f15, f7, f31 LFDUX f20, X, INCX LFDUX f21, X, INCX LFDUX f22, X, INCX LFDUX f23, X, INCX fsel f0, f8, f24, f0 fsub f8, f0, f16 fsel f1, f9, f25, f1 fsub f9, f1, f17 fsel f2, f10, f26, f2 fsub f10, f2, f18 fsel f3, f11, f27, f3 fsub f11, f3, f19 LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX fsel f4, f12, f28, f4 fsub f12, f4, f20 fsel f5, f13, f29, f5 fsub f13, f5, f21 fsel f6, f14, f30, f6 fsub f14, f6, f22 fsel f7, f15, f31, f7 fsub f15, f7, f23 LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX bdnz LL(110) .align 4 LL(120): fsel f0, f8, f16, f0 fsub f8, f0, f24 fsel f1, f9, f17, f1 fsub f9, f1, f25 fsel f2, f10, f18, f2 fsub f10, f2, f26 fsel f3, f11, f19, f3 fsub f11, f3, f27 fsel f4, f12, f20, f4 fsub f12, f4, f28 fsel f5, f13, f21, f5 fsub f13, f5, f29 fsel f6, f14, f22, f6 fsub f14, f6, f30 fsel f7, f15, f23, f7 fsub f15, f7, f31 fsel f0, f8, f24, f0 fsel f1, f9, f25, f1 fsel f2, f10, f26, f2 fsel f3, f11, f27, f3 fsel f4, f12, f28, f4 fsel f5, f13, f29, f5 fsel f6, f14, f30, f6 fsel f7, f15, f31, f7 .align 4 LL(150): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX fsub f16, f1, f8 fsel f1, f16, f8, f1 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f1, f0 fsel f2, f9, f3, f2 fsel f4, f10, f5, f4 fsel f6, f11, f7, f6 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f2, f0 fsel f4, f9, f6, f4 fsub f8, f0, f4 fsel f1, f8, f4, f0 .align 4 LL(1000): cmpwi cr0, INCX, SIZE bne- cr0, LL(1100) srawi. 
r0, NN, 3 mtspr CTR, r0 beq- cr0, LL(1050) LFD f8, 0 * SIZE(XX) LFD f9, 1 * SIZE(XX) LFD f10, 2 * SIZE(XX) LFD f11, 3 * SIZE(XX) LFD f12, 4 * SIZE(XX) LFD f13, 5 * SIZE(XX) LFD f14, 6 * SIZE(XX) LFD f15, 7 * SIZE(XX) bdz LL(1020) .align 4 LL(1010): addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) LFD f8, 8 * SIZE(XX) LFD f9, 9 * SIZE(XX) LFD f10, 10 * SIZE(XX) LFD f11, 11 * SIZE(XX) addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) LFD f12, 12 * SIZE(XX) LFD f13, 13 * SIZE(XX) LFD f14, 14 * SIZE(XX) LFD f15, 15 * SIZE(XX) addi XX, XX, 8 * SIZE bdnz LL(1010) .align 4 LL(1020): addi XX, XX, 8 * SIZE addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) .align 4 LL(1050): andi. r0, NN, 7 mtspr CTR, r0 beq LL(9999) .align 4 LL(1060): LFD f8, 0 * SIZE(XX) addi XX, XX, 1 * SIZE addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) bdnz LL(1060) b LL(9999) .align 4 LL(1100): sub XX, XX, INCX srawi. r0, NN, 3 mtspr CTR, r0 beq- LL(1150) LFDUX f8, XX, INCX LFDUX f9, XX, INCX LFDUX f10, XX, INCX LFDUX f11, XX, INCX LFDUX f12, XX, INCX LFDUX f13, XX, INCX LFDUX f14, XX, INCX LFDUX f15, XX, INCX bdz LL(1120) .align 4 LL(1110): addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) LFDUX f8, XX, INCX LFDUX f9, XX, INCX LFDUX f10, XX, INCX LFDUX f11, XX, INCX addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) LFDUX f12, XX, INCX LFDUX f13, XX, INCX LFDUX f14, XX, INCX LFDUX f15, XX, INCX bdnz LL(1110) .align 4 LL(1120): addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) .align 4 LL(1150): andi. 
r0, NN, 7 mtspr CTR, r0 beq LL(9999) .align 4 LL(1160): LFDUX f8, XX, INCX addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) bdnz LL(1160) .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/imin_hummer.S000066400000000000000000000344151313527062700202240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define INCX2 r6 #define X2 r7 #define XX r8 #define RET r9 #define NN r10 #define C1 f1 #define C2 f0 #define C3 f2 #define C4 f3 #define A1 f4 #define A2 f5 #define A3 f6 #define A4 f7 #define A5 f8 #define A6 f9 #define A7 f10 #define A8 f11 #define F1 f12 #define F2 f13 #define F3 f14 #define F4 f15 #define F5 f16 #define F6 f17 #define F7 f18 #define F8 f19 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 stfpdux f19, SP, r10 #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX li RET, 0 cmpwi cr0, N, 0 ble LL(999) mr NN, N cmpwi cr0, INCX, 0 ble LL(999) LFD C1, 0 * SIZE(X) addi N, N, -1 cmpwi cr0, N, 0 li RET, 1 ble LL(999) fsmfp C1, C1 mr XX, X fpmr C2, C1 add X, X, INCX fpmr C3, C1 fpmr C4, C1 cmpwi cr0, INCX, SIZE bne LL(100) andi. 
r0, X, 2 * SIZE - 1 beq LL(05) LFD C2, 0 * SIZE(X) add X, X, INCX addi N, N, -1 cmpwi cr0, N, 0 ble LL(20) .align 4 LL(05): sub X, X, INCX2 srawi. r0, N, 4 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 LFPDUX A6, X, INCX2 LFPDUX A7, X, INCX2 LFPDUX A8, X, INCX2 bdz LL(13) .align 4 LL(12): fpsub F1, A1, C1 fpsub F2, A2, C2 fpsub F3, A3, C3 fpsub F4, A4, C4 fpsel C1, F1, C1, A1 LFPDUX A1, X, INCX2 fpsel C2, F2, C2, A2 LFPDUX A2, X, INCX2 fpsel C3, F3, C3, A3 LFPDUX A3, X, INCX2 fpsel C4, F4, C4, A4 LFPDUX A4, X, INCX2 fpsub F5, A5, C1 fpsub F6, A6, C2 fpsub F7, A7, C3 fpsub F8, A8, C4 fpsel C1, F5, C1, A5 LFPDUX A5, X, INCX2 fpsel C2, F6, C2, A6 LFPDUX A6, X, INCX2 fpsel C3, F7, C3, A7 LFPDUX A7, X, INCX2 fpsel C4, F8, C4, A8 LFPDUX A8, X, INCX2 bdnz LL(12) .align 4 LL(13): fpsub F1, A1, C1 fpsub F2, A2, C2 fpsub F3, A3, C3 fpsub F4, A4, C4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 fpsub F5, A5, C1 fpsub F6, A6, C2 fpsub F7, A7, C3 fpsub F8, A8, C4 fpsel C1, F5, C1, A5 fpsel C2, F6, C2, A6 fpsel C3, F7, C3, A7 fpsel C4, F8, C4, A8 .align 4 LL(15): andi. r0, N, 15 beq LL(20) andi. r0, N, 8 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 fpsub F1, A1, C1 fpsub F2, A2, C2 fpsub F3, A3, C3 fpsub F4, A4, C4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 .align 4 LL(16): andi. r0, N, 4 beq LL(17) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 fpsub F1, A1, C1 fpsub F2, A2, C2 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 .align 4 LL(17): andi. r0, N, 2 beq LL(18) LFPDUX A1, X, INCX2 fpsub F1, A1, C1 fpsel C1, F1, C1, A1 .align 4 LL(18): andi. r0, N, 1 beq LL(20) LFDUX A1, X, INCX2 fsub F1, A1, C1 fsel C1, F1, C1, A1 .align 4 LL(20): fpsub F1, C2, C1 fpsub F2, C4, C3 fpsel C1, F1, C1, C2 fpsel C3, F2, C3, C4 fpsub F1, C3, C1 fpsel C1, F1, C1, C3 fsmtp C2, C1 fsub F1, C2, C1 fsel C1, F1, C1, C2 li RET, 0 fsmfp C1, C1 andi. r0, XX, 2 * SIZE - 1 beq LL(21) LFD A1, 0 * SIZE(XX) add XX, XX, INCX addi NN, NN, -1 addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) .align 4 LL(21): sub XX, XX, INCX2 srawi. 
r0, NN, 4 mtspr CTR, r0 beq- LL(25) LFPDUX A1, XX, INCX2 LFPDUX A2, XX, INCX2 LFPDUX A3, XX, INCX2 LFPDUX A4, XX, INCX2 LFPDUX A5, XX, INCX2 LFPDUX A6, XX, INCX2 LFPDUX A7, XX, INCX2 LFPDUX A8, XX, INCX2 bdz LL(23) .align 4 LL(22): addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A1 LFPDUX A1, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A2 LFPDUX A2, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A3 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A3 LFPDUX A3, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A4 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A4 LFPDUX A4, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A5 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A5 LFPDUX A5, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A6 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A6 LFPDUX A6, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A7 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A7 LFPDUX A7, XX, INCX2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A8 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A8 LFPDUX A8, XX, INCX2 beq cr0, LL(999) bdnz LL(22) .align 4 LL(23): addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A3 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A3 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A4 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A4 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A5 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A5 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A6 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A6 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A7 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A7 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A8 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A8 beq cr0, LL(999) .align 4 LL(25): andi. r0, NN, 8 beq LL(26) LFPDUX A1, XX, INCX2 LFPDUX A2, XX, INCX2 LFPDUX A3, XX, INCX2 LFPDUX A4, XX, INCX2 addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A3 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A3 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A4 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A4 beq cr0, LL(999) .align 4 LL(26): andi. r0, NN, 4 beq LL(27) LFPDUX A1, XX, INCX2 LFPDUX A2, XX, INCX2 addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A2 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A2 beq cr0, LL(999) .align 4 LL(27): andi. r0, NN, 2 beq LL(28) LFPDUX A1, XX, INCX2 addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A1 beq cr0, LL(999) .align 4 LL(28): addi RET, RET, 1 b LL(999) .align 4 LL(100): sub X, X, INCX srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(105) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFSDUX A1, X, INCX LFSDUX A2, X, INCX LFSDUX A3, X, INCX LFSDUX A4, X, INCX LFDUX A5, X, INCX LFDUX A6, X, INCX LFDUX A7, X, INCX LFDUX A8, X, INCX bdz LL(103) .align 4 LL(102): fpsub F1, A1, C1 LFSDUX A5, X, INCX fpsub F2, A2, C2 LFSDUX A6, X, INCX fpsub F3, A3, C3 LFSDUX A7, X, INCX fpsub F4, A4, C4 LFSDUX A8, X, INCX fpsel C1, F1, C1, A1 LFDUX A1, X, INCX fpsel C2, F2, C2, A2 LFDUX A2, X, INCX fpsel C3, F3, C3, A3 LFDUX A3, X, INCX fpsel C4, F4, C4, A4 LFDUX A4, X, INCX fpsub F5, A5, C1 LFSDUX A1, X, INCX fpsub F6, A6, C2 LFSDUX A2, X, INCX fpsub F7, A7, C3 LFSDUX A3, X, INCX fpsub F8, A8, C4 LFSDUX A4, X, INCX fpsel C1, F5, C1, A5 LFDUX A5, X, INCX fpsel C2, F6, C2, A6 LFDUX A6, X, INCX fpsel C3, F7, C3, A7 LFDUX A7, X, INCX fpsel C4, F8, C4, A8 LFDUX A8, X, INCX bdnz LL(102) .align 4 LL(103): fpsub F1, A1, C1 LFSDUX A5, X, INCX fpsub F2, A2, C2 LFSDUX A6, X, INCX fpsub F3, A3, C3 LFSDUX A7, X, INCX fpsub F4, A4, C4 LFSDUX A8, X, INCX fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 fpsub F5, A5, C1 fpsub F6, A6, C2 fpsub F7, A7, C3 fpsub F8, A8, C4 fpsel C1, F5, C1, A5 fpsel C2, F6, C2, A6 fpsel C3, F7, C3, A7 fpsel C4, F8, C4, A8 .align 4 LL(105): andi. r0, N, 15 beq LL(120) andi. r0, N, 8 beq LL(106) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFSDUX A1, X, INCX LFSDUX A2, X, INCX LFSDUX A3, X, INCX LFSDUX A4, X, INCX fpsub F1, A1, C1 fpsub F2, A2, C2 fpsub F3, A3, C3 fpsub F4, A4, C4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 .align 4 LL(106): andi. r0, N, 4 beq LL(107) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX fsub F1, A1, C1 fsub F2, A2, C2 fsub F3, A3, C3 fsub F4, A4, C4 fsel C1, F1, C1, A1 fsel C2, F2, C2, A2 fsel C3, F3, C3, A3 fsel C4, F4, C4, A4 .align 4 LL(107): andi. r0, N, 2 beq LL(108) LFDUX A1, X, INCX LFDUX A2, X, INCX fsub F1, A1, C1 fsub F2, A2, C2 fsel C1, F1, C1, A1 fsel C2, F2, C2, A2 .align 4 LL(108): andi. r0, N, 1 beq LL(120) LFDUX A1, X, INCX fsub F1, A1, C1 fsel C1, F1, C1, A1 .align 4 LL(120): fpsub F1, C2, C1 fpsub F2, C4, C3 fpsel C1, F1, C1, C2 fpsel C3, F2, C3, C4 fpsub F1, C3, C1 fpsel C1, F1, C1, C3 fsmtp C2, C1 fsub F1, C2, C1 fsel C1, F1, C1, C2 li RET, 0 sub XX, XX, INCX srawi. 
r0, NN, 3 mtspr CTR, r0 beq- LL(126) LFDUX A1, XX, INCX LFDUX A2, XX, INCX LFDUX A3, XX, INCX LFDUX A4, XX, INCX LFDUX A5, XX, INCX LFDUX A6, XX, INCX LFDUX A7, XX, INCX LFDUX A8, XX, INCX bdz LL(123) .align 4 LL(122): addi RET, RET, 1 fcmpu cr0, C1, A1 LFDUX A1, XX, INCX beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A2 LFDUX A2, XX, INCX beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A3 LFDUX A3, XX, INCX beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A4 LFDUX A4, XX, INCX beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A5 LFDUX A5, XX, INCX beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A6 LFDUX A6, XX, INCX beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A7 LFDUX A7, XX, INCX beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A8 LFDUX A8, XX, INCX beq cr0, LL(999) bdnz LL(122) .align 4 LL(123): addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A3 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A4 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A5 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A6 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A7 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A8 beq cr0, LL(999) .align 4 LL(126): andi. r0, NN, 4 beq LL(127) LFDUX A1, XX, INCX LFDUX A2, XX, INCX LFDUX A3, XX, INCX LFDUX A4, XX, INCX addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A2 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A3 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A4 beq cr0, LL(999) .align 4 LL(127): andi. r0, NN, 2 beq LL(128) LFDUX A1, XX, INCX LFDUX A2, XX, INCX addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A2 beq cr0, LL(999) .align 4 LL(128): addi RET, RET, 1 .align 4 LL(999): li r10, 16 addi SP, SP, -16 mr r3, RET lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/imin_ppc440.S000066400000000000000000000206301313527062700177330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define RET r3 #define X r4 #define INCX r5 #define N r6 #define NN r7 #define XX r8 #define PRE r9 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(r3) LDINT INCX, 0(INCX) #else mr N, r3 #endif li RET, 0 li PRE, 3 * 16 * SIZE slwi INCX, INCX, BASE_SHIFT sub X, X, INCX mr NN, N mr XX, X cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFDUX f1, X, INCX fmr f0, f1 fmr f2, f1 fmr f3, f1 fmr f4, f1 subi N, N, 1 fmr f5, f1 srawi. 
r0, N, 4 fmr f6, f1 mtspr CTR, r0 fmr f7, f1 beq- LL(150) LFDUX f16, X, INCX LFDUX f17, X, INCX LFDUX f18, X, INCX LFDUX f19, X, INCX LFDUX f20, X, INCX LFDUX f21, X, INCX LFDUX f22, X, INCX LFDUX f23, X, INCX LFDUX f24, X, INCX fsub f8, f0, f16 LFDUX f25, X, INCX fsub f9, f1, f17 LFDUX f26, X, INCX fsub f10, f2, f18 LFDUX f27, X, INCX fsub f11, f3, f19 LFDUX f28, X, INCX fsub f12, f4, f20 LFDUX f29, X, INCX fsub f13, f5, f21 LFDUX f30, X, INCX fsub f14, f6, f22 LFDUX f31, X, INCX fsub f15, f7, f23 bdz LL(120) .align 4 LL(110): fsel f0, f8, f16, f0 LFDUX f16, X, INCX fsub f8, f0, f24 #ifdef PPCG4 dcbt X, PRE #endif fsel f1, f9, f17, f1 LFDUX f17, X, INCX fsub f9, f1, f25 fsel f2, f10, f18, f2 LFDUX f18, X, INCX fsub f10, f2, f26 fsel f3, f11, f19, f3 LFDUX f19, X, INCX fsub f11, f3, f27 fsel f4, f12, f20, f4 LFDUX f20, X, INCX fsub f12, f4, f28 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif fsel f5, f13, f21, f5 LFDUX f21, X, INCX fsub f13, f5, f29 fsel f6, f14, f22, f6 LFDUX f22, X, INCX fsub f14, f6, f30 fsel f7, f15, f23, f7 LFDUX f23, X, INCX fsub f15, f7, f31 fsel f0, f8, f24, f0 LFDUX f24, X, INCX fsub f8, f0, f16 #ifdef PPCG4 dcbt X, PRE #endif fsel f1, f9, f25, f1 LFDUX f25, X, INCX fsub f9, f1, f17 fsel f2, f10, f26, f2 LFDUX f26, X, INCX fsub f10, f2, f18 fsel f3, f11, f27, f3 LFDUX f27, X, INCX fsub f11, f3, f19 fsel f4, f12, f28, f4 LFDUX f28, X, INCX fsub f12, f4, f20 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif fsel f5, f13, f29, f5 LFDUX f29, X, INCX fsub f13, f5, f21 fsel f6, f14, f30, f6 LFDUX f30, X, INCX fsub f14, f6, f22 fsel f7, f15, f31, f7 LFDUX f31, X, INCX fsub f15, f7, f23 bdnz LL(110) .align 4 LL(120): fsel f0, f8, f16, f0 fsub f8, f0, f24 fsel f1, f9, f17, f1 fsub f9, f1, f25 fsel f2, f10, f18, f2 fsub f10, f2, f26 fsel f3, f11, f19, f3 fsub f11, f3, f27 fsel f4, f12, f20, f4 fsub f12, f4, f28 fsel f5, f13, f21, f5 fsub f13, f5, f29 fsel f6, f14, f22, f6 fsub f14, f6, f30 fsel f7, f15, f23, f7 fsub f15, f7, f31 fsel f0, f8, f24, f0 fsel f1, f9, f25, f1 fsel f2, f10, f26, f2 fsel f3, f11, f27, f3 fsel f4, f12, f28, f4 fsel f5, f13, f29, f5 fsel f6, f14, f30, f6 fsel f7, f15, f31, f7 .align 4 LL(150): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX fsub f16, f1, f8 fsel f1, f16, f8, f1 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f1, f0 fsel f2, f9, f3, f2 fsel f4, f10, f5, f4 fsel f6, f11, f7, f6 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f2, f0 fsel f4, f9, f6, f4 fsub f8, f0, f4 fsel f1, f8, f4, f0 .align 4 LL(1000): srawi. 
r0, NN, 3 mtspr CTR, r0 beq- LL(1150) LFDUX f8, XX, INCX LFDUX f9, XX, INCX LFDUX f10, XX, INCX LFDUX f11, XX, INCX LFDUX f12, XX, INCX LFDUX f13, XX, INCX LFDUX f14, XX, INCX LFDUX f15, XX, INCX bdz LL(1120) .align 4 LL(1110): addi RET, RET, 1 fcmpu cr0, f1, f8 LFDUX f8, XX, INCX beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 LFDUX f9, XX, INCX beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 LFDUX f10, XX, INCX beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 LFDUX f11, XX, INCX beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f12 LFDUX f12, XX, INCX beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 LFDUX f13, XX, INCX beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 LFDUX f14, XX, INCX beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 LFDUX f15, XX, INCX beq cr0, LL(9999) bdnz LL(1110) .align 4 LL(1120): addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f9 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f10 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f11 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f12 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f13 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f14 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f15 beq cr0, LL(9999) .align 4 LL(1150): andi. r0, NN, 7 mtspr CTR, r0 beq LL(9999) .align 4 LL(1160): LFDUX f8, XX, INCX addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) bdnz LL(1160) .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/izamax.S000066400000000000000000000365271313527062700172120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define RET r3 #define X r4 #define INCX r5 #define N r6 #define NN r7 #define XX r8 #define PREA r9 #define INCXM1 r10 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(r3) LDINT INCX, 0(INCX) #else mr N, r3 #endif li RET, 0 mr NN, N mr XX, X slwi INCX, INCX, ZBASE_SHIFT subi INCXM1, INCX, SIZE li PREA, L1_PREFETCHSIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFD f1, 0 * SIZE(X) LFD f2, 1 * SIZE(X) add X, X, INCX fabs f1, f1 fabs f2, f2 fadd f1, f1, f2 fmr f0, f1 fmr f2, f1 fmr f3, f1 subi N, N, 1 cmpwi cr0, INCX, 2 * SIZE bne- cr0, LL(100) srawi. r0, N, 3 mtspr CTR, r0 beq- cr0, LL(50) .align 4 LFD f24, 0 * SIZE(X) LFD f25, 1 * SIZE(X) LFD f26, 2 * SIZE(X) LFD f27, 3 * SIZE(X) LFD f28, 4 * SIZE(X) LFD f29, 5 * SIZE(X) LFD f30, 6 * SIZE(X) LFD f31, 7 * SIZE(X) fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f24, 8 * SIZE(X) LFD f25, 9 * SIZE(X) LFD f26, 10 * SIZE(X) LFD f27, 11 * SIZE(X) LFD f28, 12 * SIZE(X) LFD f29, 13 * SIZE(X) LFD f30, 14 * SIZE(X) LFD f31, 15 * SIZE(X) bdz LL(20) .align 4 LL(10): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 16 * SIZE(X) LFD f25, 17 * SIZE(X) LFD f26, 18 * SIZE(X) LFD f27, 19 * SIZE(X) fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 20 * SIZE(X) LFD f29, 21 * SIZE(X) LFD f30, 22 * SIZE(X) LFD f31, 23 * SIZE(X) fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 24 * SIZE(X) LFD f25, 25 * SIZE(X) LFD f26, 26 * SIZE(X) LFD f27, 27 * SIZE(X) fsel f0, f16, f0, f4 fsel f1, f17, f1, f5 fsel f2, f18, f2, f6 fsel f3, f19, f3, f7 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 28 * SIZE(X) LFD f29, 29 * SIZE(X) LFD f30, 30 * SIZE(X) LFD f31, 31 * SIZE(X) fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f0, f20 fsel f1, f17, f1, f21 fsel f2, f18, f2, f22 fsel f3, f19, f3, f23 #ifndef POWER6 L1_PREFETCH X, PREA #endif addi X, X, 16 * SIZE #ifdef POWER6 L1_PREFETCH X, PREA #endif bdnz LL(10) .align 4 LL(20): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fsel f0, f16, f0, f4 fsel f1, f17, f1, f5 fsel f2, f18, f2, f6 fsel f3, f19, f3, f7 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f0, f20 fsel 
f1, f17, f1, f21 fsel f2, f18, f2, f22 fsel f3, f19, f3, f23 addi X, X, 16 * SIZE .align 4 LL(50): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) addi X, X, 2 * SIZE fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCXM1 srawi. r0, N, 3 mtspr CTR, r0 beq- LL(150) LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX bdz LL(120) .align 4 LL(110): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX fsel f0, f16, f0, f4 fsel f1, f17, f1, f5 fsel f2, f18, f2, f6 fsel f3, f19, f3, f7 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f0, f20 fsel f1, f17, f1, f21 fsel f2, f18, f2, f22 fsel f3, f19, f3, f23 bdnz LL(110) .align 4 LL(120): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fsel f0, f16, f0, f4 fsel f1, f17, f1, f5 fsel f2, f18, f2, f6 fsel f3, f19, f3, f7 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f0, f20 fsel f1, f17, f1, f21 fsel f2, f18, f2, f22 fsel f3, f19, f3, f23 .align 4 LL(150): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDX f8, X, INCXM1 LFDUX f9, X, INCX fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsel f0, f8, f0, f1 fsel f2, f9, f2, f3 fsub f8, f0, f2 fsel f1, f8, f0, f2 .align 4 LL(1000): cmpwi cr0, INCX, SIZE * 2 bne- cr0, LL(1100) srawi. 
r0, NN, 3 mtspr CTR, r0 beq- cr0, LL(1050) LFD f24, 0 * SIZE(XX) LFD f25, 1 * SIZE(XX) LFD f26, 2 * SIZE(XX) LFD f27, 3 * SIZE(XX) LFD f28, 4 * SIZE(XX) LFD f29, 5 * SIZE(XX) LFD f30, 6 * SIZE(XX) LFD f31, 7 * SIZE(XX) bdz LL(1020) .align 4 LL(1010): fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 8 * SIZE(XX) LFD f25, 9 * SIZE(XX) LFD f26, 10 * SIZE(XX) LFD f27, 11 * SIZE(XX) fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 12 * SIZE(XX) LFD f29, 13 * SIZE(XX) LFD f30, 14 * SIZE(XX) LFD f31, 15 * SIZE(XX) fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 16 * SIZE(XX) LFD f25, 17 * SIZE(XX) LFD f26, 18 * SIZE(XX) LFD f27, 19 * SIZE(XX) fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 20 * SIZE(XX) LFD f29, 21 * SIZE(XX) LFD f30, 22 * SIZE(XX) LFD f31, 23 * SIZE(XX) fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) addi XX, XX, 16 * SIZE bdnz LL(1010) .align 4 LL(1020): fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 8 * SIZE(XX) LFD f25, 9 * SIZE(XX) LFD f26, 10 * SIZE(XX) LFD f27, 11 * SIZE(XX) fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 12 * SIZE(XX) LFD f29, 13 * SIZE(XX) LFD f30, 14 * SIZE(XX) LFD f31, 15 * SIZE(XX) fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) addi XX, XX, 16 * SIZE .align 4 LL(1050): andi. r0, NN, 7 mtspr CTR, r0 beq LL(9999) .align 4 LL(1060): LFD f8, 0 * SIZE(XX) LFD f9, 1 * SIZE(XX) addi XX, XX, 2 * SIZE fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) bdnz LL(1060) b LL(9999) .align 4 LL(1100): sub XX, XX, INCXM1 srawi. 
r0, NN, 3 mtspr CTR, r0 beq- LL(1150) LFDX f24, XX, INCXM1 LFDUX f25, XX, INCX LFDX f26, XX, INCXM1 LFDUX f27, XX, INCX LFDX f28, XX, INCXM1 LFDUX f29, XX, INCX LFDX f30, XX, INCXM1 LFDUX f31, XX, INCX bdz LL(1120) .align 4 LL(1110): fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDX f24, XX, INCXM1 LFDUX f25, XX, INCX LFDX f26, XX, INCXM1 LFDUX f27, XX, INCX fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f28, XX, INCXM1 LFDUX f29, XX, INCX LFDX f30, XX, INCXM1 LFDUX f31, XX, INCX fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDX f24, XX, INCXM1 LFDUX f25, XX, INCX LFDX f26, XX, INCXM1 LFDUX f27, XX, INCX fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f28, XX, INCXM1 LFDUX f29, XX, INCX LFDX f30, XX, INCXM1 LFDUX f31, XX, INCX fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) bdnz LL(1110) .align 4 LL(1120): fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDX f24, XX, INCXM1 LFDUX f25, XX, INCX LFDX f26, XX, INCXM1 LFDUX f27, XX, INCX fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f28, XX, INCXM1 LFDUX f29, XX, INCX LFDX f30, XX, INCXM1 LFDUX f31, XX, INCX fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) .align 4 LL(1150): andi. r0, NN, 7 mtspr CTR, r0 beq LL(9999) .align 4 LL(1160): LFDX f8, XX, INCXM1 LFDUX f9, XX, INCX fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) bdnz LL(1160) .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/izamax_hummer.S000066400000000000000000000236501313527062700205600ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define INCX2 r6 #define X2 r7 #define XX r8 #define RET r9 #define NN r10 #define C1 f1 #define C2 f0 #define C3 f2 #define C4 f3 #define A1 f4 #define A2 f5 #define A3 f6 #define A4 f7 #define A5 f8 #define A6 f9 #define A7 f10 #define A8 f11 #define F1 f12 #define F2 f13 #define F3 f14 #define F4 f15 #define T1 f16 #define T2 f17 #define T3 f18 #define T4 f19 #define B1 f20 #define B2 f21 #define B3 f22 #define B4 f23 #define B5 f24 #define B6 f25 #define B7 f26 #define B8 f27 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 stfpdux f19, SP, r10 stfpdux f20, SP, r10 stfpdux f21, SP, r10 stfpdux f22, SP, r10 stfpdux f23, SP, r10 stfpdux f24, SP, r10 stfpdux f25, SP, r10 stfpdux f26, SP, r10 stfpdux f27, SP, r10 #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX li RET, 0 cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, INCX, 0 mr NN, N ble LL(999) mr XX, X LFD A1, 0 * SIZE(X) LFD A2, 1 * SIZE(X) add X, X, INCX2 li RET, 1 fabs A1, A1 fabs A2, A2 subi INCX2, INCX2, SIZE addi N, N, -1 cmpwi cr0, N, 0 fadd C1, A1, A2 ble LL(999) fsmfp C1, C1 li INCX, SIZE fpmr C2, C1 sub X, X, INCX2 fpmr C3, C1 srawi. 
r0, N, 3 fpmr C4, C1 mtspr CTR, r0 beq- LL(105) LFDUX A1, X, INCX2 LFDUX A2, X, INCX LFDUX A3, X, INCX2 LFDUX A4, X, INCX LFSDUX A1, X, INCX2 LFSDUX A2, X, INCX LFSDUX A3, X, INCX2 LFSDUX A4, X, INCX LFDUX A5, X, INCX2 LFDUX A6, X, INCX LFDUX A7, X, INCX2 LFDUX A8, X, INCX LFSDUX A5, X, INCX2 LFSDUX A6, X, INCX LFSDUX A7, X, INCX2 LFSDUX A8, X, INCX bdz LL(103) .align 4 LL(102): fpabs B1, A1 LFDUX A1, X, INCX2 fpabs B2, A2 LFDUX A2, X, INCX fpabs B3, A3 LFDUX A3, X, INCX2 fpabs B4, A4 LFDUX A4, X, INCX fpabs B5, A5 LFSDUX A1, X, INCX2 fpabs B6, A6 LFSDUX A2, X, INCX fpabs B7, A7 LFSDUX A3, X, INCX2 fpabs B8, A8 LFSDUX A4, X, INCX fpadd T1, B1, B2 LFDUX A5, X, INCX2 fpadd T2, B3, B4 LFDUX A6, X, INCX fpadd T3, B5, B6 LFDUX A7, X, INCX2 fpadd T4, B7, B8 LFDUX A8, X, INCX fpsub F1, C1, T1 LFSDUX A5, X, INCX2 fpsub F2, C2, T2 LFSDUX A6, X, INCX fpsub F3, C3, T3 LFSDUX A7, X, INCX2 fpsub F4, C4, T4 LFSDUX A8, X, INCX fpsel C1, F1, C1, T1 fpsel C2, F2, C2, T2 fpsel C3, F3, C3, T3 fpsel C4, F4, C4, T4 bdnz LL(102) .align 4 LL(103): fpabs B1, A1 fpabs B2, A2 fpabs B3, A3 fpabs B4, A4 fpabs B5, A5 fpabs B6, A6 fpabs B7, A7 fpabs B8, A8 fpadd T1, B1, B2 fpadd T2, B3, B4 fpadd T3, B5, B6 fpadd T4, B7, B8 fpsub F1, C1, T1 fpsub F2, C2, T2 fpsub F3, C3, T3 fpsub F4, C4, T4 fpsel C1, F1, C1, T1 fpsel C2, F2, C2, T2 fpsel C3, F3, C3, T3 fpsel C4, F4, C4, T4 .align 4 LL(105): andi. r0, N, 7 beq LL(120) andi. r0, N, 4 beq LL(106) LFDUX A1, X, INCX2 LFDUX A2, X, INCX LFDUX A3, X, INCX2 LFDUX A4, X, INCX LFSDUX A1, X, INCX2 LFSDUX A2, X, INCX LFSDUX A3, X, INCX2 LFSDUX A4, X, INCX fpabs A1, A1 fpabs A2, A2 fpabs A3, A3 fpabs A4, A4 fpadd A1, A1, A2 fpadd A3, A3, A4 fpsub F1, C1, A1 fpsub F2, C2, A3 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A3 .align 4 LL(106): andi. r0, N, 2 beq LL(107) LFDUX A1, X, INCX2 LFDUX A2, X, INCX LFSDUX A1, X, INCX2 LFSDUX A2, X, INCX fpabs A1, A1 fpabs A2, A2 fpadd A1, A1, A2 fpsub F1, C1, A1 fpsel C1, F1, C1, A1 .align 4 LL(107): andi. r0, N, 1 beq LL(120) LFDUX A1, X, INCX2 LFDUX A2, X, INCX fabs A1, A1 fabs A2, A2 fadd A1, A1, A2 fsub F1, C1, A1 fsel C1, F1, C1, A1 .align 4 LL(120): fpsub F1, C1, C2 fpsub F2, C3, C4 fpsel C1, F1, C1, C2 fpsel C3, F2, C3, C4 fpsub F1, C1, C3 fpsel C1, F1, C1, C3 fsmtp C2, C1 li RET, 0 fsub F1, C1, C2 fsel C1, F1, C1, C2 fsmfp C1, C1 sub XX, XX, INCX2 srawi. 
r0, NN, 3 mtspr CTR, r0 beq- LL(125) LFDUX A1, XX, INCX2 LFDUX A2, XX, INCX LFDUX A3, XX, INCX2 LFDUX A4, XX, INCX LFSDUX A1, XX, INCX2 LFSDUX A2, XX, INCX LFSDUX A3, XX, INCX2 LFSDUX A4, XX, INCX LFDUX A5, XX, INCX2 LFDUX A6, XX, INCX LFDUX A7, XX, INCX2 LFDUX A8, XX, INCX LFSDUX A5, XX, INCX2 LFSDUX A6, XX, INCX LFSDUX A7, XX, INCX2 LFSDUX A8, XX, INCX fpabs T1, A1 fpabs T2, A2 fpabs T3, A3 fpabs T4, A4 fpadd B1, T1, T2 fpadd B2, T3, T4 bdz LL(123) .align 4 LL(122): LFDUX A1, XX, INCX2 fpabs T1, A5 addi RET, RET, 1 fcmpu cr0, C1, B1 LFDUX A2, XX, INCX beq cr0, LL(999) LFDUX A3, XX, INCX2 fpabs T2, A6 addi RET, RET, 1 fcmpu cr0, C1, B2 LFDUX A4, XX, INCX beq cr0, LL(999) LFSDUX A1, XX, INCX2 fpabs T3, A7 addi RET, RET, 1 fscmp cr0, C1, B1 LFSDUX A2, XX, INCX beq cr0, LL(999) LFSDUX A3, XX, INCX2 fpabs T4, A8 addi RET, RET, 1 fscmp cr0, C1, B2 LFSDUX A4, XX, INCX beq cr0, LL(999) fpadd B3, T1, T2 fpadd B4, T3, T4 LFDUX A5, XX, INCX2 fpabs T1, A1 addi RET, RET, 1 fcmpu cr0, C1, B3 LFDUX A6, XX, INCX beq cr0, LL(999) LFDUX A7, XX, INCX2 fpabs T2, A2 addi RET, RET, 1 fcmpu cr0, C1, B4 LFDUX A8, XX, INCX beq cr0, LL(999) LFSDUX A5, XX, INCX2 fpabs T3, A3 addi RET, RET, 1 fscmp cr0, C1, B3 LFSDUX A6, XX, INCX beq cr0, LL(999) LFSDUX A7, XX, INCX2 fpabs T4, A4 addi RET, RET, 1 fscmp cr0, C1, B4 LFSDUX A8, XX, INCX beq cr0, LL(999) fpadd B1, T1, T2 fpadd B2, T3, T4 bdnz LL(122) .align 4 LL(123): fpabs T1, A5 addi RET, RET, 1 fcmpu cr0, C1, B1 beq cr0, LL(999) fpabs T2, A6 addi RET, RET, 1 fcmpu cr0, C1, B2 beq cr0, LL(999) fpabs T3, A7 addi RET, RET, 1 fscmp cr0, C1, B1 beq cr0, LL(999) fpabs T4, A8 addi RET, RET, 1 fscmp cr0, C1, B2 beq cr0, LL(999) fpadd B3, T1, T2 fpadd B4, T3, T4 addi RET, RET, 1 fcmpu cr0, C1, B3 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, B4 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, B3 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, B4 beq cr0, LL(999) .align 4 LL(125): andi. r0, NN, 4 beq LL(126) LFDUX A1, XX, INCX2 LFDUX A2, XX, INCX LFDUX A3, XX, INCX2 LFDUX A4, XX, INCX LFSDUX A1, XX, INCX2 LFSDUX A2, XX, INCX LFSDUX A3, XX, INCX2 LFSDUX A4, XX, INCX fpabs A1, A1 fpabs A2, A2 fpabs A3, A3 fpabs A4, A4 fpadd A1, A1, A2 fpadd A3, A3, A4 addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A3 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A3 beq cr0, LL(999) .align 4 LL(126): andi. r0, NN, 2 beq LL(127) LFDUX A1, XX, INCX2 LFDUX A2, XX, INCX LFDUX A3, XX, INCX2 LFDUX A4, XX, INCX fabs A1, A1 fabs A2, A2 fabs A3, A3 fabs A4, A4 fadd A1, A1, A2 fadd A3, A3, A4 addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A3 beq cr0, LL(999) .align 4 LL(127): addi RET, RET, 1 .align 4 LL(999): li r10, 16 addi SP, SP, -16 mr r3, RET lfpdux f27, SP, r10 lfpdux f26, SP, r10 lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/izamax_ppc440.S000066400000000000000000000234421313527062700202740ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define RET r3 #define X r4 #define INCX r5 #define N r6 #define NN r7 #define XX r8 #define PRE r9 #define INC1 r10 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(r3) LDINT INCX, 0(INCX) #else mr N, r3 #endif li RET, 0 slwi INCX, INCX, ZBASE_SHIFT sub X, X, INCX li INC1, SIZE li PRE, 3 * 16 * SIZE mr NN, N mr XX, X cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFDUX f1, X, INCX LFDX f2, X, INC1 fabs f1, f1 fabs f2, f2 fadd f1, f1, f2 subi N, N, 1 fmr f0, f1 srawi. 
r0, N, 3 fmr f2, f1 mtspr CTR, r0 fmr f3, f1 beq- LL(150) LFDUX f24, X, INCX LFDX f25, X, INC1 LFDUX f26, X, INCX LFDX f27, X, INC1 LFDUX f28, X, INCX LFDX f29, X, INC1 LFDUX f30, X, INCX LFDX f31, X, INC1 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDUX f24, X, INCX LFDX f25, X, INC1 LFDUX f26, X, INCX LFDX f27, X, INC1 LFDUX f28, X, INCX LFDX f29, X, INC1 LFDUX f30, X, INCX LFDX f31, X, INC1 bdz LL(120) .align 4 LL(110): fadd f4, f8, f9 #ifdef PPCG4 dcbt X, PRE #endif fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 LFDUX f24, X, INCX fabs f9, f25 LFDX f25, X, INC1 fabs f10, f26 LFDUX f26, X, INCX fabs f11, f27 LFDX f27, X, INC1 fabs f12, f28 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif fabs f13, f29 LFDUX f28, X, INCX fabs f14, f30 LFDX f29, X, INC1 fabs f15, f31 LFDUX f30, X, INCX fsub f16, f0, f4 LFDX f31, X, INC1 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 #ifdef PPCG4 dcbt X, PRE #endif fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fabs f8, f24 LFDUX f24, X, INCX fabs f9, f25 LFDX f25, X, INC1 fabs f10, f26 LFDUX f26, X, INCX fabs f11, f27 LFDX f27, X, INC1 fsel f0, f16, f0, f4 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif fsel f1, f17, f1, f5 fsel f2, f18, f2, f6 fsel f3, f19, f3, f7 fabs f12, f28 LFDUX f28, X, INCX fabs f13, f29 LFDX f29, X, INC1 fabs f14, f30 LFDUX f30, X, INCX fabs f15, f31 LFDX f31, X, INC1 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f0, f20 fsel f1, f17, f1, f21 fsel f2, f18, f2, f22 fsel f3, f19, f3, f23 bdnz LL(110) .align 4 LL(120): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fsel f0, f16, f0, f4 fsel f1, f17, f1, f5 fsel f2, f18, f2, f6 fsel f3, f19, f3, f7 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f0, f20 fsel f1, f17, f1, f21 fsel f2, f18, f2, f22 fsel f3, f19, f3, f23 .align 4 LL(150): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX LFDX f9, X, INC1 fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsel f0, f8, f0, f1 fsel f2, f9, f2, f3 fsub f8, f0, f2 fsel f1, f8, f0, f2 .align 4 LL(1000): srawi. 
r0, NN, 3 mtspr CTR, r0 beq- LL(1150) LFDUX f24, XX, INCX LFDX f25, XX, INC1 LFDUX f26, XX, INCX LFDX f27, XX, INC1 LFDUX f28, XX, INCX LFDX f29, XX, INC1 LFDUX f30, XX, INCX LFDX f31, XX, INC1 bdz LL(1120) .align 4 LL(1110): fabs f8, f24 LFDUX f24, XX, INCX fabs f9, f25 LFDX f25, XX, INC1 fabs f10, f26 LFDUX f26, XX, INCX fabs f11, f27 LFDX f27, XX, INC1 #ifdef PPCG4 dcbt XX, PRE #endif fabs f12, f28 LFDUX f28, XX, INCX fabs f13, f29 LFDX f29, XX, INC1 fabs f14, f30 LFDUX f30, XX, INCX fabs f15, f31 LFDX f31, XX, INC1 fadd f4, f8, f9 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) fabs f8, f24 LFDUX f24, XX, INCX fabs f9, f25 LFDX f25, XX, INC1 fabs f10, f26 LFDUX f26, XX, INCX fabs f11, f27 LFDX f27, XX, INC1 #ifdef PPCG4 dcbt XX, PRE #endif fabs f12, f28 LFDUX f28, XX, INCX fabs f13, f29 LFDX f29, XX, INC1 fabs f14, f30 LFDUX f30, XX, INCX fabs f15, f31 LFDX f31, XX, INC1 fadd f4, f8, f9 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) bdnz LL(1110) .align 4 LL(1120): fabs f8, f24 LFDUX f24, XX, INCX fabs f9, f25 LFDX f25, XX, INC1 fabs f10, f26 LFDUX f26, XX, INCX fabs f11, f27 LFDX f27, XX, INC1 fabs f12, f28 LFDUX f28, XX, INCX fabs f13, f29 LFDX f29, XX, INC1 fabs f14, f30 LFDUX f30, XX, INCX fabs f15, f31 LFDX f31, XX, INC1 fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) .align 4 LL(1150): andi. r0, NN, 7 mtspr CTR, r0 beq LL(9999) .align 4 LL(1160): LFDUX f8, XX, INCX LFDX f9, XX, INC1 fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) bdnz LL(1160) .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/izamin.S000066400000000000000000000365101313527062700172000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define RET r3 #define X r4 #define INCX r5 #define N r6 #define NN r7 #define XX r8 #define PREA r9 #define INCXM1 r10 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(r3) LDINT INCX, 0(INCX) #else mr N, r3 #endif li RET, 0 mr NN, N mr XX, X slwi INCX, INCX, ZBASE_SHIFT subi INCXM1, INCX, SIZE li PREA, L1_PREFETCHSIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFD f1, 0 * SIZE(X) LFD f2, 1 * SIZE(X) add X, X, INCX fabs f1, f1 fabs f2, f2 fadd f1, f1, f2 fmr f0, f1 fmr f2, f1 fmr f3, f1 subi N, N, 1 cmpwi cr0, INCX, 2 * SIZE bne- cr0, LL(100) srawi. 
r0, N, 3 mtspr CTR, r0 beq- cr0, LL(50) .align 4 LFD f24, 0 * SIZE(X) LFD f25, 1 * SIZE(X) LFD f26, 2 * SIZE(X) LFD f27, 3 * SIZE(X) LFD f28, 4 * SIZE(X) LFD f29, 5 * SIZE(X) LFD f30, 6 * SIZE(X) LFD f31, 7 * SIZE(X) fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f24, 8 * SIZE(X) LFD f25, 9 * SIZE(X) LFD f26, 10 * SIZE(X) LFD f27, 11 * SIZE(X) LFD f28, 12 * SIZE(X) LFD f29, 13 * SIZE(X) LFD f30, 14 * SIZE(X) LFD f31, 15 * SIZE(X) bdz LL(20) .align 4 LL(10): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 16 * SIZE(X) LFD f25, 17 * SIZE(X) LFD f26, 18 * SIZE(X) LFD f27, 19 * SIZE(X) fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 20 * SIZE(X) LFD f29, 21 * SIZE(X) LFD f30, 22 * SIZE(X) LFD f31, 23 * SIZE(X) fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 24 * SIZE(X) LFD f25, 25 * SIZE(X) LFD f26, 26 * SIZE(X) LFD f27, 27 * SIZE(X) fsel f0, f16, f4, f0 fsel f1, f17, f5, f1 fsel f2, f18, f6, f2 fsel f3, f19, f7, f3 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 28 * SIZE(X) LFD f29, 29 * SIZE(X) LFD f30, 30 * SIZE(X) LFD f31, 31 * SIZE(X) fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f20, f0 fsel f1, f17, f21, f1 fsel f2, f18, f22, f2 fsel f3, f19, f23, f3 #ifndef POWER6 L1_PREFETCH X, PREA #endif addi X, X, 16 * SIZE #ifdef POWER6 L1_PREFETCH X, PREA #endif bdnz LL(10) .align 4 LL(20): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fsel f0, f16, f4, f0 fsel f1, f17, f5, f1 fsel f2, f18, f6, f2 fsel f3, f19, f7, f3 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f20, f0 fsel f1, f17, f21, f1 fsel f2, f18, f22, f2 fsel f3, f19, f23, f3 addi X, X, 16 * SIZE .align 4 LL(50): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) addi X, X, 2 * SIZE fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 fsub f16, f1, f8 fsel f1, f16, f8, f1 bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCXM1 srawi. 
r0, N, 3 mtspr CTR, r0 beq- LL(150) LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX bdz LL(120) .align 4 LL(110): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX fsel f0, f16, f4, f0 fsel f1, f17, f5, f1 fsel f2, f18, f6, f2 fsel f3, f19, f7, f3 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f20, f0 fsel f1, f17, f21, f1 fsel f2, f18, f22, f2 fsel f3, f19, f23, f3 bdnz LL(110) .align 4 LL(120): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fsel f0, f16, f4, f0 fsel f1, f17, f5, f1 fsel f2, f18, f6, f2 fsel f3, f19, f7, f3 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f20, f0 fsel f1, f17, f21, f1 fsel f2, f18, f22, f2 fsel f3, f19, f23, f3 .align 4 LL(150): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDX f8, X, INCXM1 LFDUX f9, X, INCX fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 fsub f16, f1, f8 fsel f1, f16, f8, f1 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsel f0, f8, f1, f0 fsel f2, f9, f3, f2 fsub f8, f0, f2 fsel f1, f8, f2, f0 .align 4 LL(1000): cmpwi cr0, INCX, SIZE * 2 bne- cr0, LL(1100) srawi. 
r0, NN, 3 mtspr CTR, r0 beq- cr0, LL(1050) LFD f24, 0 * SIZE(XX) LFD f25, 1 * SIZE(XX) LFD f26, 2 * SIZE(XX) LFD f27, 3 * SIZE(XX) LFD f28, 4 * SIZE(XX) LFD f29, 5 * SIZE(XX) LFD f30, 6 * SIZE(XX) LFD f31, 7 * SIZE(XX) bdz LL(1020) .align 4 LL(1010): fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 8 * SIZE(XX) LFD f25, 9 * SIZE(XX) LFD f26, 10 * SIZE(XX) LFD f27, 11 * SIZE(XX) fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 12 * SIZE(XX) LFD f29, 13 * SIZE(XX) LFD f30, 14 * SIZE(XX) LFD f31, 15 * SIZE(XX) fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 16 * SIZE(XX) LFD f25, 17 * SIZE(XX) LFD f26, 18 * SIZE(XX) LFD f27, 19 * SIZE(XX) fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 20 * SIZE(XX) LFD f29, 21 * SIZE(XX) LFD f30, 22 * SIZE(XX) LFD f31, 23 * SIZE(XX) fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) addi XX, XX, 16 * SIZE bdnz LL(1010) .align 4 LL(1020): fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 8 * SIZE(XX) LFD f25, 9 * SIZE(XX) LFD f26, 10 * SIZE(XX) LFD f27, 11 * SIZE(XX) fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 12 * SIZE(XX) LFD f29, 13 * SIZE(XX) LFD f30, 14 * SIZE(XX) LFD f31, 15 * SIZE(XX) fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) addi XX, XX, 16 * SIZE .align 4 LL(1050): andi. r0, NN, 7 mtspr CTR, r0 beq LL(9999) .align 4 LL(1060): LFD f8, 0 * SIZE(XX) LFD f9, 1 * SIZE(XX) addi XX, XX, 2 * SIZE fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) bdnz LL(1060) b LL(9999) .align 4 LL(1100): sub XX, XX, INCXM1 srawi. 
r0, NN, 3 mtspr CTR, r0 beq- LL(1150) LFDX f24, XX, INCXM1 LFDUX f25, XX, INCX LFDX f26, XX, INCXM1 LFDUX f27, XX, INCX LFDX f28, XX, INCXM1 LFDUX f29, XX, INCX LFDX f30, XX, INCXM1 LFDUX f31, XX, INCX bdz LL(1120) .align 4 LL(1110): fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDX f24, XX, INCXM1 LFDUX f25, XX, INCX LFDX f26, XX, INCXM1 LFDUX f27, XX, INCX fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f28, XX, INCXM1 LFDUX f29, XX, INCX LFDX f30, XX, INCXM1 LFDUX f31, XX, INCX fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDX f24, XX, INCXM1 LFDUX f25, XX, INCX LFDX f26, XX, INCXM1 LFDUX f27, XX, INCX fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f28, XX, INCXM1 LFDUX f29, XX, INCX LFDX f30, XX, INCXM1 LFDUX f31, XX, INCX fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) bdnz LL(1110) .align 4 LL(1120): fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDX f24, XX, INCXM1 LFDUX f25, XX, INCX LFDX f26, XX, INCXM1 LFDUX f27, XX, INCX fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f28, XX, INCXM1 LFDUX f29, XX, INCX LFDX f30, XX, INCXM1 LFDUX f31, XX, INCX fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) .align 4 LL(1150): andi. r0, NN, 7 mtspr CTR, r0 beq LL(9999) .align 4 LL(1160): LFDX f8, XX, INCXM1 LFDUX f9, XX, INCX fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) bdnz LL(1160) .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/izamin_hummer.S000066400000000000000000000236441313527062700205610ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define INCX2 r6 #define X2 r7 #define XX r8 #define RET r9 #define NN r10 #define C1 f1 #define C2 f0 #define C3 f2 #define C4 f3 #define A1 f4 #define A2 f5 #define A3 f6 #define A4 f7 #define A5 f8 #define A6 f9 #define A7 f10 #define A8 f11 #define F1 f12 #define F2 f13 #define F3 f14 #define F4 f15 #define T1 f16 #define T2 f17 #define T3 f18 #define T4 f19 #define B1 f20 #define B2 f21 #define B3 f22 #define B4 f23 #define B5 f24 #define B6 f25 #define B7 f26 #define B8 f27 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 stfpdux f19, SP, r10 stfpdux f20, SP, r10 stfpdux f21, SP, r10 stfpdux f22, SP, r10 stfpdux f23, SP, r10 stfpdux f24, SP, r10 stfpdux f25, SP, r10 stfpdux f26, SP, r10 stfpdux f27, SP, r10 #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX li RET, 0 cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, INCX, 0 mr NN, N ble LL(999) mr XX, X LFD A1, 0 * SIZE(X) LFD A2, 1 * SIZE(X) add X, X, INCX2 li RET, 1 fabs A1, A1 fabs A2, A2 subi INCX2, INCX2, SIZE addi N, N, -1 cmpwi cr0, N, 0 fadd C1, A1, A2 ble LL(999) fsmfp C1, C1 li INCX, SIZE fpmr C2, C1 sub X, X, INCX2 fpmr C3, C1 srawi. 
r0, N, 3 fpmr C4, C1 mtspr CTR, r0 beq- LL(105) LFDUX A1, X, INCX2 LFDUX A2, X, INCX LFDUX A3, X, INCX2 LFDUX A4, X, INCX LFSDUX A1, X, INCX2 LFSDUX A2, X, INCX LFSDUX A3, X, INCX2 LFSDUX A4, X, INCX LFDUX A5, X, INCX2 LFDUX A6, X, INCX LFDUX A7, X, INCX2 LFDUX A8, X, INCX LFSDUX A5, X, INCX2 LFSDUX A6, X, INCX LFSDUX A7, X, INCX2 LFSDUX A8, X, INCX bdz LL(103) .align 4 LL(102): fpabs B1, A1 LFDUX A1, X, INCX2 fpabs B2, A2 LFDUX A2, X, INCX fpabs B3, A3 LFDUX A3, X, INCX2 fpabs B4, A4 LFDUX A4, X, INCX fpabs B5, A5 LFSDUX A1, X, INCX2 fpabs B6, A6 LFSDUX A2, X, INCX fpabs B7, A7 LFSDUX A3, X, INCX2 fpabs B8, A8 LFSDUX A4, X, INCX fpadd T1, B1, B2 LFDUX A5, X, INCX2 fpadd T2, B3, B4 LFDUX A6, X, INCX fpadd T3, B5, B6 LFDUX A7, X, INCX2 fpadd T4, B7, B8 LFDUX A8, X, INCX fpsub F1, T1, C1 LFSDUX A5, X, INCX2 fpsub F2, T2, C2 LFSDUX A6, X, INCX fpsub F3, T3, C3 LFSDUX A7, X, INCX2 fpsub F4, T4, C4 LFSDUX A8, X, INCX fpsel C1, F1, C1, T1 fpsel C2, F2, C2, T2 fpsel C3, F3, C3, T3 fpsel C4, F4, C4, T4 bdnz LL(102) .align 4 LL(103): fpabs B1, A1 fpabs B2, A2 fpabs B3, A3 fpabs B4, A4 fpabs B5, A5 fpabs B6, A6 fpabs B7, A7 fpabs B8, A8 fpadd T1, B1, B2 fpadd T2, B3, B4 fpadd T3, B5, B6 fpadd T4, B7, B8 fpsub F1, T1, C1 fpsub F2, T2, C2 fpsub F3, T3, C3 fpsub F4, T4, C4 fpsel C1, F1, C1, T1 fpsel C2, F2, C2, T2 fpsel C3, F3, C3, T3 fpsel C4, F4, C4, T4 .align 4 LL(105): andi. r0, N, 7 beq LL(120) andi. r0, N, 4 beq LL(106) LFDUX A1, X, INCX2 LFDUX A2, X, INCX LFDUX A3, X, INCX2 LFDUX A4, X, INCX LFSDUX A1, X, INCX2 LFSDUX A2, X, INCX LFSDUX A3, X, INCX2 LFSDUX A4, X, INCX fpabs A1, A1 fpabs A2, A2 fpabs A3, A3 fpabs A4, A4 fpadd A1, A1, A2 fpadd A3, A3, A4 fpsub F1, A1, C1 fpsub F2, A3, C2 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A3 .align 4 LL(106): andi. r0, N, 2 beq LL(107) LFDUX A1, X, INCX2 LFDUX A2, X, INCX LFSDUX A1, X, INCX2 LFSDUX A2, X, INCX fpabs A1, A1 fpabs A2, A2 fpadd A1, A1, A2 fpsub F1, A1, C1 fpsel C1, F1, C1, A1 .align 4 LL(107): andi. r0, N, 1 beq LL(120) LFDUX A1, X, INCX2 LFDUX A2, X, INCX fabs A1, A1 fabs A2, A2 fadd A1, A1, A2 fsub F1, A1, C1 fsel C1, F1, C1, A1 .align 4 LL(120): fpsub F1, C2, C1 fpsub F2, C4, C3 fpsel C1, F1, C1, C2 fpsel C3, F2, C3, C4 fpsub F1, C3, C1 fpsel C1, F1, C1, C3 fsmtp C2, C1 li RET, 0 fsub F1, C2, C1 fsel C1, F1, C1, C2 fsmfp C1, C1 sub XX, XX, INCX2 srawi. 
r0, NN, 3 mtspr CTR, r0 beq- LL(125) LFDUX A1, XX, INCX2 LFDUX A2, XX, INCX LFDUX A3, XX, INCX2 LFDUX A4, XX, INCX LFSDUX A1, XX, INCX2 LFSDUX A2, XX, INCX LFSDUX A3, XX, INCX2 LFSDUX A4, XX, INCX LFDUX A5, XX, INCX2 LFDUX A6, XX, INCX LFDUX A7, XX, INCX2 LFDUX A8, XX, INCX LFSDUX A5, XX, INCX2 LFSDUX A6, XX, INCX LFSDUX A7, XX, INCX2 LFSDUX A8, XX, INCX fpabs T1, A1 fpabs T2, A2 fpabs T3, A3 fpabs T4, A4 fpadd B1, T1, T2 fpadd B2, T3, T4 bdz LL(123) .align 4 LL(122): LFDUX A1, XX, INCX2 fpabs T1, A5 addi RET, RET, 1 fcmpu cr0, C1, B1 LFDUX A2, XX, INCX beq cr0, LL(999) LFDUX A3, XX, INCX2 fpabs T2, A6 addi RET, RET, 1 fcmpu cr0, C1, B2 LFDUX A4, XX, INCX beq cr0, LL(999) LFSDUX A1, XX, INCX2 fpabs T3, A7 addi RET, RET, 1 fscmp cr0, C1, B1 LFSDUX A2, XX, INCX beq cr0, LL(999) LFSDUX A3, XX, INCX2 fpabs T4, A8 addi RET, RET, 1 fscmp cr0, C1, B2 LFSDUX A4, XX, INCX beq cr0, LL(999) fpadd B3, T1, T2 fpadd B4, T3, T4 LFDUX A5, XX, INCX2 fpabs T1, A1 addi RET, RET, 1 fcmpu cr0, C1, B3 LFDUX A6, XX, INCX beq cr0, LL(999) LFDUX A7, XX, INCX2 fpabs T2, A2 addi RET, RET, 1 fcmpu cr0, C1, B4 LFDUX A8, XX, INCX beq cr0, LL(999) LFSDUX A5, XX, INCX2 fpabs T3, A3 addi RET, RET, 1 fscmp cr0, C1, B3 LFSDUX A6, XX, INCX beq cr0, LL(999) LFSDUX A7, XX, INCX2 fpabs T4, A4 addi RET, RET, 1 fscmp cr0, C1, B4 LFSDUX A8, XX, INCX beq cr0, LL(999) fpadd B1, T1, T2 fpadd B2, T3, T4 bdnz LL(122) .align 4 LL(123): fpabs T1, A5 addi RET, RET, 1 fcmpu cr0, C1, B1 beq cr0, LL(999) fpabs T2, A6 addi RET, RET, 1 fcmpu cr0, C1, B2 beq cr0, LL(999) fpabs T3, A7 addi RET, RET, 1 fscmp cr0, C1, B1 beq cr0, LL(999) fpabs T4, A8 addi RET, RET, 1 fscmp cr0, C1, B2 beq cr0, LL(999) fpadd B3, T1, T2 fpadd B4, T3, T4 addi RET, RET, 1 fcmpu cr0, C1, B3 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, B4 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, B3 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, B4 beq cr0, LL(999) .align 4 LL(125): andi. r0, NN, 4 beq LL(126) LFDUX A1, XX, INCX2 LFDUX A2, XX, INCX LFDUX A3, XX, INCX2 LFDUX A4, XX, INCX LFSDUX A1, XX, INCX2 LFSDUX A2, XX, INCX LFSDUX A3, XX, INCX2 LFSDUX A4, XX, INCX fpabs A1, A1 fpabs A2, A2 fpabs A3, A3 fpabs A4, A4 fpadd A1, A1, A2 fpadd A3, A3, A4 addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A3 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fscmp cr0, C1, A3 beq cr0, LL(999) .align 4 LL(126): andi. r0, NN, 2 beq LL(127) LFDUX A1, XX, INCX2 LFDUX A2, XX, INCX LFDUX A3, XX, INCX2 LFDUX A4, XX, INCX fabs A1, A1 fabs A2, A2 fabs A3, A3 fabs A4, A4 fadd A1, A1, A2 fadd A3, A3, A4 addi RET, RET, 1 fcmpu cr0, C1, A1 beq cr0, LL(999) addi RET, RET, 1 fcmpu cr0, C1, A3 beq cr0, LL(999) .align 4 LL(127): addi RET, RET, 1 .align 4 LL(999): li r10, 16 addi SP, SP, -16 mr r3, RET lfpdux f27, SP, r10 lfpdux f26, SP, r10 lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/izamin_ppc440.S000066400000000000000000000234171313527062700202740ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
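/*
 * Editorial annotation (not part of the upstream file): the izamin kernels in
 * this directory return the 1-based index of the complex element with the
 * smallest |Re| + |Im|.  The assembly appears to do this in two passes: a
 * branch-free fsel/fpsel reduction to the minimum value of fabs(re)+fabs(im),
 * then a rescan (the RET-counting loops over the saved XX/NN copies) that
 * stops at the first element equal to that minimum.  A minimal portable C
 * sketch of that scheme, assuming interleaved double-precision data and a
 * hypothetical izamin_ref() name:
 */
#include <math.h>

static long izamin_ref(long n, const double *x, long incx) {
    if (n <= 0 || incx <= 0) return 0;          /* the kernels also return 0 here */
    double best = fabs(x[0]) + fabs(x[1]);      /* |re| + |im| of element 0 */
    for (long i = 1; i < n; i++) {              /* pass 1: value of the minimum */
        double v = fabs(x[2 * i * incx]) + fabs(x[2 * i * incx + 1]);
        if (v < best) best = v;
    }
    for (long i = 0; i < n; i++)                /* pass 2: first index attaining it */
        if (fabs(x[2 * i * incx]) + fabs(x[2 * i * incx + 1]) == best)
            return i + 1;                       /* BLAS indices are 1-based */
    return n;                                   /* not reached: best comes from x */
}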
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define RET r3 #define X r4 #define INCX r5 #define N r6 #define NN r7 #define XX r8 #define PRE r9 #define INC1 r10 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(r3) LDINT INCX, 0(INCX) #else mr N, r3 #endif li RET, 0 slwi INCX, INCX, ZBASE_SHIFT sub X, X, INCX li INC1, SIZE li PRE, 3 * 16 * SIZE mr NN, N mr XX, X cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFDUX f1, X, INCX LFDX f2, X, INC1 fabs f1, f1 fabs f2, f2 fadd f1, f1, f2 subi N, N, 1 fmr f0, f1 srawi. 
r0, N, 3 fmr f2, f1 mtspr CTR, r0 fmr f3, f1 beq- LL(150) LFDUX f24, X, INCX LFDX f25, X, INC1 LFDUX f26, X, INCX LFDX f27, X, INC1 LFDUX f28, X, INCX LFDX f29, X, INC1 LFDUX f30, X, INCX LFDX f31, X, INC1 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDUX f24, X, INCX LFDX f25, X, INC1 LFDUX f26, X, INCX LFDX f27, X, INC1 LFDUX f28, X, INCX LFDX f29, X, INC1 LFDUX f30, X, INCX LFDX f31, X, INC1 bdz LL(120) .align 4 LL(110): fadd f4, f8, f9 #ifdef PPCG4 dcbt X, PRE #endif fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 LFDUX f24, X, INCX fabs f9, f25 LFDX f25, X, INC1 fabs f10, f26 LFDUX f26, X, INCX fabs f11, f27 LFDX f27, X, INC1 fabs f12, f28 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif fabs f13, f29 LFDUX f28, X, INCX fabs f14, f30 LFDX f29, X, INC1 fabs f15, f31 LFDUX f30, X, INCX fsub f16, f0, f4 LFDX f31, X, INC1 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 #ifdef PPCG4 dcbt X, PRE #endif fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fabs f8, f24 LFDUX f24, X, INCX fabs f9, f25 LFDX f25, X, INC1 fabs f10, f26 LFDUX f26, X, INCX fabs f11, f27 LFDX f27, X, INC1 fsel f0, f16, f4, f0 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif fsel f1, f17, f5, f1 fsel f2, f18, f6, f2 fsel f3, f19, f7, f3 fabs f12, f28 LFDUX f28, X, INCX fabs f13, f29 LFDX f29, X, INC1 fabs f14, f30 LFDUX f30, X, INCX fabs f15, f31 LFDX f31, X, INC1 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f20, f0 fsel f1, f17, f21, f1 fsel f2, f18, f22, f2 fsel f3, f19, f23, f3 bdnz LL(110) .align 4 LL(120): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fsel f0, f16, f4, f0 fsel f1, f17, f5, f1 fsel f2, f18, f6, f2 fsel f3, f19, f7, f3 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f20, f0 fsel f1, f17, f21, f1 fsel f2, f18, f22, f2 fsel f3, f19, f23, f3 .align 4 LL(150): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX LFDX f9, X, INC1 fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 fsub f16, f1, f8 fsel f1, f16, f8, f1 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsel f0, f8, f1, f0 fsel f2, f9, f3, f2 fsub f8, f0, f2 fsel f1, f8, f2, f0 .align 4 LL(1000): srawi. 
r0, NN, 3 mtspr CTR, r0 beq- LL(1150) LFDUX f24, XX, INCX LFDX f25, XX, INC1 LFDUX f26, XX, INCX LFDX f27, XX, INC1 LFDUX f28, XX, INCX LFDX f29, XX, INC1 LFDUX f30, XX, INCX LFDX f31, XX, INC1 bdz LL(1120) .align 4 LL(1110): fabs f8, f24 LFDUX f24, XX, INCX fabs f9, f25 LFDX f25, XX, INC1 fabs f10, f26 LFDUX f26, XX, INCX fabs f11, f27 LFDX f27, XX, INC1 #ifdef PPCG4 dcbt XX, PRE #endif fabs f12, f28 LFDUX f28, XX, INCX fabs f13, f29 LFDX f29, XX, INC1 fabs f14, f30 LFDUX f30, XX, INCX fabs f15, f31 LFDX f31, XX, INC1 fadd f4, f8, f9 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) fabs f8, f24 LFDUX f24, XX, INCX fabs f9, f25 LFDX f25, XX, INC1 fabs f10, f26 LFDUX f26, XX, INCX fabs f11, f27 LFDX f27, XX, INC1 #ifdef PPCG4 dcbt XX, PRE #endif fabs f12, f28 LFDUX f28, XX, INCX fabs f13, f29 LFDX f29, XX, INC1 fabs f14, f30 LFDUX f30, XX, INCX fabs f15, f31 LFDX f31, XX, INC1 fadd f4, f8, f9 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) bdnz LL(1110) .align 4 LL(1120): fabs f8, f24 LFDUX f24, XX, INCX fabs f9, f25 LFDX f25, XX, INC1 fabs f10, f26 LFDUX f26, XX, INCX fabs f11, f27 LFDX f27, XX, INC1 fabs f12, f28 LFDUX f28, XX, INCX fabs f13, f29 LFDX f29, XX, INC1 fabs f14, f30 LFDUX f30, XX, INCX fabs f15, f31 LFDX f31, XX, INC1 fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 addi RET, RET, 1 fcmpu cr0, f1, f4 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f5 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f6 beq cr0, LL(9999) addi RET, RET, 1 fcmpu cr0, f1, f7 beq cr0, LL(9999) .align 4 LL(1150): andi. r0, NN, 7 mtspr CTR, r0 beq LL(9999) .align 4 LL(1160): LFDUX f8, XX, INCX LFDX f9, XX, INC1 fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 addi RET, RET, 1 fcmpu cr0, f1, f8 beq cr0, LL(9999) bdnz LL(1160) .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/lock.c000066400000000000000000000060171313527062700166600ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ static void __inline blas_lock(volatile BLASULONG *address){ #ifdef __GNUC__ BLASLONG int ret, val = 1; __asm__ __volatile__ ( " .machine \"any\" ;" "0: lwarx %0,0, %1 ;" " cmpwi 0,%0,0;" " bne 1f;" " stwcx. %2,0, %1 ;" " bne- 0b;" "1: " : "=&r"(ret) : "r"(address), "r" (val) : "cr0", "memory"); #else while (*address) {}; *address = 1; #endif } OpenBLAS-0.2.20/kernel/power/lsame.S000066400000000000000000000060421313527062700170070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
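/*
 * Editorial annotation (not part of the upstream file): blas_lock() above is a
 * spin lock.  The lwarx/stwcx. loop keeps re-reading the word at *address
 * until it observes 0 and then atomically stores 1, retrying whenever the
 * reservation is lost.  A rough portable sketch of the same acquire semantics,
 * using the GCC/Clang __atomic builtins and an unconditional exchange instead
 * of load-reserve/store-conditional (a sketch only, not the code OpenBLAS
 * actually builds):
 */
#include <stdint.h>

static inline void blas_lock_sketch(volatile uintptr_t *address) {
    /* __atomic_exchange_n returns the previous value; 0 means the lock was free */
    while (__atomic_exchange_n(address, (uintptr_t)1, __ATOMIC_ACQUIRE) != 0) {
        /* spin until the holder writes 0 back */
    }
}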
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" PROLOGUE PROFCODE lbz r3, 0(r3) lbz r4, 0(r4) cmplwi cr0, r3, 96 cmplwi cr6, r4, 96 addi r0, r3, -32 addi r11,r4, -32 ble- cr0, LL(2) #ifdef __64BIT__ rldicl r3, r0, 0, 56 #else rlwinm r3, r0, 0, 0xff #endif LL(2): ble- cr6, LL(3) #ifdef __64BIT__ rldicl r4, r11, 0, 56 #else rlwinm r4, r11, 0, 0xff #endif LL(3): xor r3, r3, r4 subfic r0, r3, 0 adde r3, r0, r3 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/max.S000066400000000000000000000224321313527062700164740ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
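/*
 * Editorial annotation (not part of the upstream file): max.S below, together
 * with max_hummer.S and max_ppc440.S, computes the largest signed value of a
 * real vector (MAX, not AMAX: no fabs is applied).  The loops are unrolled and
 * branch-free, using fsub/fsel pairs to keep the larger of two candidates and
 * folding the eight partial maxima down to one at the end.  A minimal C sketch
 * of the semantics, with a hypothetical max_ref() name:
 */
static double max_ref(long n, const double *x, long incx) {
    if (n <= 0 || incx <= 0) return 0.0;   /* the kernel returns 0 in this case */
    double m = x[0];                       /* seed with the first element */
    for (long i = 1; i < n; i++)
        if (x[i * incx] > m) m = x[i * incx];
    return m;
}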
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PREA r8 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT li PREA, L1_PREFETCHSIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFD f1, 0 * SIZE(X) add X, X, INCX fmr f0, f1 fmr f2, f1 fmr f3, f1 fmr f4, f1 fmr f5, f1 fmr f6, f1 fmr f7, f1 subi N, N, 1 cmpwi cr0, INCX, SIZE bne- cr0, LL(100) srawi. r0, N, 4 mtspr CTR, r0 beq- cr0, LL(50) LFD f16, 0 * SIZE(X) LFD f17, 1 * SIZE(X) LFD f18, 2 * SIZE(X) LFD f19, 3 * SIZE(X) LFD f20, 4 * SIZE(X) LFD f21, 5 * SIZE(X) LFD f22, 6 * SIZE(X) LFD f23, 7 * SIZE(X) LFD f24, 8 * SIZE(X) LFD f25, 9 * SIZE(X) LFD f26, 10 * SIZE(X) LFD f27, 11 * SIZE(X) LFD f28, 12 * SIZE(X) LFD f29, 13 * SIZE(X) LFD f30, 14 * SIZE(X) LFD f31, 15 * SIZE(X) fsub f8, f0, f16 fsub f9, f1, f17 fsub f10, f2, f18 fsub f11, f3, f19 fsub f12, f4, f20 fsub f13, f5, f21 fsub f14, f6, f22 fsub f15, f7, f23 bdz LL(20) .align 4 LL(10): fsel f0, f8, f0, f16 fsub f8, f0, f24 fsel f1, f9, f1, f17 fsub f9, f1, f25 fsel f2, f10, f2, f18 fsub f10, f2, f26 fsel f3, f11, f3, f19 fsub f11, f3, f27 LFD f16, 16 * SIZE(X) LFD f17, 17 * SIZE(X) LFD f18, 18 * SIZE(X) LFD f19, 19 * SIZE(X) fsel f4, f12, f4, f20 fsub f12, f4, f28 fsel f5, f13, f5, f21 fsub f13, f5, f29 fsel f6, f14, f6, f22 fsub f14, f6, f30 fsel f7, f15, f7, f23 fsub f15, f7, f31 LFD f20, 20 * SIZE(X) LFD f21, 21 * SIZE(X) LFD f22, 22 * SIZE(X) LFD f23, 23 * SIZE(X) fsel f0, f8, f0, f24 fsub f8, f0, f16 fsel f1, f9, f1, f25 fsub f9, f1, f17 fsel f2, f10, f2, f26 fsub f10, f2, f18 fsel f3, f11, f3, f27 fsub f11, f3, f19 LFD f24, 24 * SIZE(X) LFD f25, 25 * SIZE(X) LFD f26, 26 * SIZE(X) LFD f27, 27 * SIZE(X) fsel f4, f12, f4, f28 fsub f12, f4, f20 fsel f5, f13, f5, f29 fsub f13, f5, f21 fsel f6, f14, f6, f30 fsub f14, f6, f22 fsel f7, f15, f7, f31 fsub f15, f7, f23 LFD f28, 28 * SIZE(X) LFD f29, 29 * SIZE(X) LFD f30, 30 * SIZE(X) LFD f31, 31 * SIZE(X) #ifndef POWER6 L1_PREFETCH X, PREA #endif addi X, X, 16 * SIZE #ifdef POWER6 L1_PREFETCH X, PREA #endif bdnz LL(10) .align 4 LL(20): fsel f0, f8, f0, f16 fsub f8, f0, f24 fsel f1, f9, f1, f17 fsub f9, f1, f25 fsel f2, f10, f2, f18 fsub f10, f2, f26 fsel f3, f11, f3, f19 fsub f11, f3, f27 fsel f4, f12, f4, f20 fsub f12, f4, f28 fsel f5, f13, f5, f21 fsub f13, f5, f29 fsel f6, f14, f6, f22 fsub f14, f6, f30 fsel f7, f15, f7, f23 fsub f15, f7, f31 fsel f0, f8, f0, f24 fsel f1, f9, f1, f25 fsel f2, f10, f2, f26 fsel f3, f11, f3, f27 fsel f4, f12, f4, f28 fsel f5, f13, f5, f29 fsel f6, f14, f6, f30 fsel f7, f15, f7, f31 addi X, X, 16 * SIZE .align 4 LL(50): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) addi X, X, 1 * SIZE fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCX srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(150) LFDUX f16, X, INCX LFDUX f17, X, INCX LFDUX f18, X, INCX LFDUX f19, X, INCX LFDUX f20, X, INCX LFDUX f21, X, INCX LFDUX f22, X, INCX LFDUX f23, X, INCX LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX fsub f8, f0, f16 fsub f9, f1, f17 fsub f10, f2, f18 fsub f11, f3, f19 fsub f12, f4, f20 fsub f13, f5, f21 fsub f14, f6, f22 fsub f15, f7, f23 bdz LL(120) .align 4 LL(110): fsel f0, f8, f0, f16 fsub f8, f0, f24 fsel f1, f9, f1, f17 fsub f9, f1, f25 fsel f2, f10, f2, f18 fsub f10, f2, f26 fsel f3, f11, f3, f19 fsub f11, f3, f27 LFDUX f16, X, INCX LFDUX f17, X, INCX LFDUX f18, X, INCX LFDUX f19, X, INCX fsel f4, f12, f4, f20 fsub f12, f4, f28 fsel f5, f13, f5, f21 fsub f13, f5, f29 fsel f6, f14, f6, f22 fsub f14, f6, f30 fsel f7, f15, f7, f23 fsub f15, f7, f31 LFDUX f20, X, INCX LFDUX f21, X, INCX LFDUX f22, X, INCX LFDUX f23, X, INCX fsel f0, f8, f0, f24 fsub f8, f0, f16 fsel f1, f9, f1, f25 fsub f9, f1, f17 fsel f2, f10, f2, f26 fsub f10, f2, f18 fsel f3, f11, f3, f27 fsub f11, f3, f19 LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX fsel f4, f12, f4, f28 fsub f12, f4, f20 fsel f5, f13, f5, f29 fsub f13, f5, f21 fsel f6, f14, f6, f30 fsub f14, f6, f22 fsel f7, f15, f7, f31 fsub f15, f7, f23 LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX bdnz LL(110) .align 4 LL(120): fsel f0, f8, f0, f16 fsub f8, f0, f24 fsel f1, f9, f1, f17 fsub f9, f1, f25 fsel f2, f10, f2, f18 fsub f10, f2, f26 fsel f3, f11, f3, f19 fsub f11, f3, f27 fsel f4, f12, f4, f20 fsub f12, f4, f28 fsel f5, f13, f5, f21 fsub f13, f5, f29 fsel f6, f14, f6, f22 fsub f14, f6, f30 fsel f7, f15, f7, f23 fsub f15, f7, f31 fsel f0, f8, f0, f24 fsel f1, f9, f1, f25 fsel f2, f10, f2, f26 fsel f3, f11, f3, f27 fsel f4, f12, f4, f28 fsel f5, f13, f5, f29 fsel f6, f14, f6, f30 fsel f7, f15, f7, f31 .align 4 LL(150): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f0, f1 fsel f2, f9, f2, f3 fsel f4, f10, f4, f5 fsel f6, f11, f6, f7 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f0, f2 fsel f4, f9, f4, f6 fsub f8, f0, f4 fsel f1, f8, f0, f4 .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/max_hummer.S000066400000000000000000000217421313527062700200540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define INCX2 r6 #define X2 r7 #define C1 f1 #define C2 f0 #define C3 f2 #define C4 f3 #define A1 f4 #define A2 f5 #define A3 f6 #define A4 f7 #define A5 f8 #define A6 f9 #define A7 f10 #define A8 f11 #define F1 f12 #define F2 f13 #define F3 f14 #define F4 f15 #define F5 f16 #define F6 f17 #define F7 f18 #define F8 f19 #define T1 f20 #define T2 f21 #define T3 f22 #define T4 f23 #define T5 f24 #define T6 f25 #define T7 f26 #define T8 f27 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 stfpdux f19, SP, r10 stfpdux f20, SP, r10 stfpdux f21, SP, r10 stfpdux f22, SP, r10 stfpdux f23, SP, r10 stfpdux f24, SP, r10 stfpdux f25, SP, r10 stfpdux f26, SP, r10 stfpdux f27, SP, r10 li r10, 0 stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif lfpdx C1, SP, r10 # Zero clear slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, INCX, 0 ble LL(999) LFD C1, 0 * SIZE(X) add X, X, INCX addi N, N, -1 cmpwi cr0, N, 0 ble LL(999) fsmfp C1, C1 fpmr C2, C1 fpmr C3, C1 fpmr C4, C1 cmpwi cr0, INCX, SIZE bne LL(100) andi. r0, X, 2 * SIZE - 1 beq LL(05) LFD C2, 0 * SIZE(X) add X, X, INCX addi N, N, -1 cmpwi cr0, N, 0 ble LL(998) .align 4 LL(05): sub X, X, INCX2 srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 LFPDUX A6, X, INCX2 LFPDUX A7, X, INCX2 LFPDUX A8, X, INCX2 bdz LL(13) .align 4 LL(12): fpsub F1, C1, A1 fpsub F2, C2, A2 fpsub F3, C3, A3 fpsub F4, C4, A4 fpsel C1, F1, C1, A1 LFPDUX A1, X, INCX2 fpsel C2, F2, C2, A2 LFPDUX A2, X, INCX2 fpsel C3, F3, C3, A3 LFPDUX A3, X, INCX2 fpsel C4, F4, C4, A4 LFPDUX A4, X, INCX2 fpsub F5, C1, A5 fpsub F6, C2, A6 fpsub F7, C3, A7 fpsub F8, C4, A8 fpsel C1, F5, C1, A5 LFPDUX A5, X, INCX2 fpsel C2, F6, C2, A6 LFPDUX A6, X, INCX2 fpsel C3, F7, C3, A7 LFPDUX A7, X, INCX2 fpsel C4, F8, C4, A8 LFPDUX A8, X, INCX2 bdnz LL(12) .align 4 LL(13): fpsub F1, C1, A1 fpsub F2, C2, A2 fpsub F3, C3, A3 fpsub F4, C4, A4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 fpsub F5, C1, A5 fpsub F6, C2, A6 fpsub F7, C3, A7 fpsub F8, C4, A8 fpsel C1, F5, C1, A5 fpsel C2, F6, C2, A6 fpsel C3, F7, C3, A7 fpsel C4, F8, C4, A8 .align 4 LL(15): andi. r0, N, 15 beq LL(998) andi. r0, N, 8 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 fpsub F1, C1, A1 fpsub F2, C2, A2 fpsub F3, C3, A3 fpsub F4, C4, A4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 .align 4 LL(16): andi. r0, N, 4 beq LL(17) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 fpsub F1, C1, A1 fpsub F2, C2, A2 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 .align 4 LL(17): andi. r0, N, 2 beq LL(18) LFPDUX A1, X, INCX2 fpsub F1, C1, A1 fpsel C1, F1, C1, A1 .align 4 LL(18): andi. r0, N, 1 beq LL(998) LFDUX A1, X, INCX2 fsub F1, C1, A1 fsel C1, F1, C1, A1 b LL(998) .align 4 LL(100): sub X, X, INCX srawi. r0, N, 4 mtspr CTR, r0 beq- LL(105) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFSDUX A1, X, INCX LFSDUX A2, X, INCX LFSDUX A3, X, INCX LFSDUX A4, X, INCX LFDUX A5, X, INCX LFDUX A6, X, INCX LFDUX A7, X, INCX LFDUX A8, X, INCX LFSDUX A5, X, INCX LFSDUX A6, X, INCX LFSDUX A7, X, INCX LFSDUX A8, X, INCX fpsub F1, C1, A1 fpsub F2, C2, A2 fpsub F3, C3, A3 fpsub F4, C4, A4 bdz LL(103) .align 4 LL(102): fpsel C1, F1, C1, A1 LFDUX A1, X, INCX fpsel C2, F2, C2, A2 LFDUX A2, X, INCX fpsel C3, F3, C3, A3 LFDUX A3, X, INCX fpsel C4, F4, C4, A4 LFDUX A4, X, INCX fpsub F5, C1, A5 LFSDUX A1, X, INCX fpsub F6, C2, A6 LFSDUX A2, X, INCX fpsub F7, C3, A7 LFSDUX A3, X, INCX fpsub F8, C4, A8 LFSDUX A4, X, INCX fpsel C1, F5, C1, A5 LFDUX A5, X, INCX fpsel C2, F6, C2, A6 LFDUX A6, X, INCX fpsel C3, F7, C3, A7 LFDUX A7, X, INCX fpsel C4, F8, C4, A8 LFDUX A8, X, INCX fpsub F1, C1, A1 LFSDUX A5, X, INCX fpsub F2, C2, A2 LFSDUX A6, X, INCX fpsub F3, C3, A3 LFSDUX A7, X, INCX fpsub F4, C4, A4 LFSDUX A8, X, INCX bdnz LL(102) .align 4 LL(103): fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 fpsub F5, C1, A5 fpsub F6, C2, A6 fpsub F7, C3, A7 fpsub F8, C4, A8 fpsel C1, F5, C1, A5 fpsel C2, F6, C2, A6 fpsel C3, F7, C3, A7 fpsel C4, F8, C4, A8 .align 4 LL(105): andi. r0, N, 15 beq LL(998) andi. r0, N, 8 beq LL(106) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFSDUX A1, X, INCX LFSDUX A2, X, INCX LFSDUX A3, X, INCX LFSDUX A4, X, INCX fpsub F1, C1, A1 fpsub F2, C2, A2 fpsub F3, C3, A3 fpsub F4, C4, A4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 .align 4 LL(106): andi. 
r0, N, 4 beq LL(107) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX fsub F1, C1, A1 fsub F2, C2, A2 fsub F3, C3, A3 fsub F4, C4, A4 fsel C1, F1, C1, A1 fsel C2, F2, C2, A2 fsel C3, F3, C3, A3 fsel C4, F4, C4, A4 .align 4 LL(107): andi. r0, N, 2 beq LL(108) LFDUX A1, X, INCX LFDUX A2, X, INCX fsub F1, C1, A1 fsub F2, C2, A2 fsel C1, F1, C1, A1 fsel C2, F2, C2, A2 .align 4 LL(108): andi. r0, N, 1 beq LL(998) LFDUX A1, X, INCX fsub F1, C1, A1 fsel C1, F1, C1, A1 .align 4 LL(998): fpsub F1, C1, C2 fpsub F2, C3, C4 fpsel C1, F1, C1, C2 fpsel C3, F2, C3, C4 fpsub F1, C1, C3 fpsel C1, F1, C1, C3 fsmtp C2, C1 fsub F1, C1, C2 fsel C1, F1, C1, C2 .align 4 LL(999): li r10, 16 lfpdux f27, SP, r10 lfpdux f26, SP, r10 lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/max_ppc440.S000066400000000000000000000151061313527062700175660ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PREA r8 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT sub X, X, INCX cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFDUX f1, X, INCX fmr f0, f1 fmr f2, f1 subi N, N, 1 fmr f3, f1 fmr f4, f1 fmr f5, f1 srawi. r0, N, 4 fmr f6, f1 mtspr CTR, r0 fmr f7, f1 beq- LL(150) LFDUX f16, X, INCX LFDUX f17, X, INCX LFDUX f18, X, INCX LFDUX f19, X, INCX LFDUX f20, X, INCX LFDUX f21, X, INCX LFDUX f22, X, INCX LFDUX f23, X, INCX LFDUX f24, X, INCX fsub f8, f0, f16 LFDUX f25, X, INCX fsub f9, f1, f17 LFDUX f26, X, INCX fsub f10, f2, f18 LFDUX f27, X, INCX fsub f11, f3, f19 LFDUX f28, X, INCX fsub f12, f4, f20 LFDUX f29, X, INCX fsub f13, f5, f21 LFDUX f30, X, INCX fsub f14, f6, f22 LFDUX f31, X, INCX fsub f15, f7, f23 bdz LL(120) .align 4 LL(110): fsel f0, f8, f0, f16 LFDUX f16, X, INCX fsub f8, f0, f24 fsel f1, f9, f1, f17 LFDUX f17, X, INCX fsub f9, f1, f25 fsel f2, f10, f2, f18 LFDUX f18, X, INCX fsub f10, f2, f26 fsel f3, f11, f3, f19 LFDUX f19, X, INCX fsub f11, f3, f27 fsel f4, f12, f4, f20 LFDUX f20, X, INCX fsub f12, f4, f28 fsel f5, f13, f5, f21 LFDUX f21, X, INCX fsub f13, f5, f29 fsel f6, f14, f6, f22 LFDUX f22, X, INCX fsub f14, f6, f30 fsel f7, f15, f7, f23 LFDUX f23, X, INCX fsub f15, f7, f31 fsel f0, f8, f0, f24 LFDUX f24, X, INCX fsub f8, f0, f16 fsel f1, f9, f1, f25 LFDUX f25, X, INCX fsub f9, f1, f17 fsel f2, f10, f2, f26 LFDUX f26, X, INCX fsub f10, f2, f18 fsel f3, f11, f3, f27 LFDUX f27, X, INCX fsub f11, f3, f19 fsel f4, f12, f4, f28 LFDUX f28, X, INCX fsub f12, f4, f20 fsel f5, f13, f5, f29 LFDUX f29, X, INCX fsub f13, f5, f21 fsel f6, f14, f6, f30 LFDUX f30, X, INCX fsub f14, f6, f22 fsel f7, f15, f7, f31 LFDUX f31, X, INCX fsub f15, f7, f23 bdnz LL(110) .align 4 LL(120): fsel f0, f8, f0, f16 fsub f8, f0, f24 fsel f1, f9, f1, f17 fsub f9, f1, f25 fsel f2, f10, f2, f18 fsub f10, f2, f26 fsel f3, f11, f3, f19 fsub f11, f3, f27 fsel f4, f12, f4, f20 fsub f12, f4, f28 fsel f5, f13, f5, f21 fsub f13, f5, f29 fsel f6, f14, f6, f22 fsub f14, f6, f30 fsel f7, f15, f7, f23 fsub f15, f7, f31 fsel f0, f8, f0, f24 fsel f1, f9, f1, f25 fsel f2, f10, f2, f26 fsel f3, f11, f3, f27 fsel f4, f12, f4, f28 fsel f5, f13, f5, f29 fsel f6, f14, f6, f30 fsel f7, f15, f7, f31 .align 4 LL(150): andi. 
r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f0, f1 fsel f2, f9, f2, f3 fsel f4, f10, f4, f5 fsel f6, f11, f6, f7 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f0, f2 fsel f4, f9, f4, f6 fsub f8, f0, f4 fsel f1, f8, f0, f4 .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/min.S000066400000000000000000000223421313527062700164720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
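/*
 * Editorial annotation (not part of the upstream file): min.S below, together
 * with min_hummer.S and min_ppc440.S, mirrors the max kernels above: the same
 * unrolled fsub/fsel reduction, but with the fsel operands swapped so the
 * smaller candidate survives each select.  A minimal C sketch, with a
 * hypothetical min_ref() name:
 */
static double min_ref(long n, const double *x, long incx) {
    if (n <= 0 || incx <= 0) return 0.0;   /* the kernel returns 0 in this case */
    double m = x[0];
    for (long i = 1; i < n; i++)
        if (x[i * incx] < m) m = x[i * incx];
    return m;
}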
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PREA r8 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT li PREA, L1_PREFETCHSIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFD f1, 0 * SIZE(X) add X, X, INCX fmr f0, f1 fmr f2, f1 fmr f3, f1 fmr f4, f1 fmr f5, f1 fmr f6, f1 fmr f7, f1 subi N, N, 1 cmpwi cr0, INCX, SIZE bne- cr0, LL(100) srawi. r0, N, 4 mtspr CTR, r0 beq- cr0, LL(50) LFD f16, 0 * SIZE(X) LFD f17, 1 * SIZE(X) LFD f18, 2 * SIZE(X) LFD f19, 3 * SIZE(X) LFD f20, 4 * SIZE(X) LFD f21, 5 * SIZE(X) LFD f22, 6 * SIZE(X) LFD f23, 7 * SIZE(X) LFD f24, 8 * SIZE(X) LFD f25, 9 * SIZE(X) LFD f26, 10 * SIZE(X) LFD f27, 11 * SIZE(X) LFD f28, 12 * SIZE(X) LFD f29, 13 * SIZE(X) LFD f30, 14 * SIZE(X) LFD f31, 15 * SIZE(X) fsub f8, f0, f16 fsub f9, f1, f17 fsub f10, f2, f18 fsub f11, f3, f19 fsub f12, f4, f20 fsub f13, f5, f21 fsub f14, f6, f22 fsub f15, f7, f23 bdz LL(20) .align 4 LL(10): fsel f0, f8, f16, f0 fsub f8, f0, f24 fsel f1, f9, f17, f1 fsub f9, f1, f25 fsel f2, f10, f18, f2 fsub f10, f2, f26 fsel f3, f11, f19, f3 fsub f11, f3, f27 LFD f16, 16 * SIZE(X) LFD f17, 17 * SIZE(X) LFD f18, 18 * SIZE(X) LFD f19, 19 * SIZE(X) fsel f4, f12, f20, f4 fsub f12, f4, f28 fsel f5, f13, f21, f5 fsub f13, f5, f29 fsel f6, f14, f22, f6 fsub f14, f6, f30 fsel f7, f15, f23, f7 fsub f15, f7, f31 LFD f20, 20 * SIZE(X) LFD f21, 21 * SIZE(X) LFD f22, 22 * SIZE(X) LFD f23, 23 * SIZE(X) fsel f0, f8, f24, f0 fsub f8, f0, f16 fsel f1, f9, f25, f1 fsub f9, f1, f17 fsel f2, f10, f26, f2 fsub f10, f2, f18 fsel f3, f11, f27, f3 fsub f11, f3, f19 LFD f24, 24 * SIZE(X) LFD f25, 25 * SIZE(X) LFD f26, 26 * SIZE(X) LFD f27, 27 * SIZE(X) fsel f4, f12, f28, f4 fsub f12, f4, f20 fsel f5, f13, f29, f5 fsub f13, f5, f21 fsel f6, f14, f30, f6 fsub f14, f6, f22 fsel f7, f15, f31, f7 fsub f15, f7, f23 LFD f28, 28 * SIZE(X) LFD f29, 29 * SIZE(X) LFD f30, 30 * SIZE(X) LFD f31, 31 * SIZE(X) #ifndef POWER6 L1_PREFETCH X, PREA #endif addi X, X, 16 * SIZE #ifdef POWER6 L1_PREFETCH X, PREA #endif bdnz LL(10) .align 4 LL(20): fsel f0, f8, f16, f0 fsub f8, f0, f24 fsel f1, f9, f17, f1 fsub f9, f1, f25 fsel f2, f10, f18, f2 fsub f10, f2, f26 fsel f3, f11, f19, f3 fsub f11, f3, f27 fsel f4, f12, f20, f4 fsub f12, f4, f28 fsel f5, f13, f21, f5 fsub f13, f5, f29 fsel f6, f14, f22, f6 fsub f14, f6, f30 fsel f7, f15, f23, f7 fsub f15, f7, f31 fsel f0, f8, f24, f0 fsel f1, f9, f25, f1 fsel f2, f10, f26, f2 fsel f3, f11, f27, f3 fsel f4, f12, f28, f4 fsel f5, f13, f29, f5 fsel f6, f14, f30, f6 fsel f7, f15, f31, f7 addi X, X, 16 * SIZE .align 4 LL(50): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) addi X, X, 1 * SIZE fsub f16, f1, f8 fsel f1, f16, f8, f1 bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCX srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(150) LFDUX f16, X, INCX LFDUX f17, X, INCX LFDUX f18, X, INCX LFDUX f19, X, INCX LFDUX f20, X, INCX LFDUX f21, X, INCX LFDUX f22, X, INCX LFDUX f23, X, INCX LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX fsub f8, f0, f16 fsub f9, f1, f17 fsub f10, f2, f18 fsub f11, f3, f19 fsub f12, f4, f20 fsub f13, f5, f21 fsub f14, f6, f22 fsub f15, f7, f23 bdz LL(120) .align 4 LL(110): fsel f0, f8, f16, f0 fsub f8, f0, f24 fsel f1, f9, f17, f1 fsub f9, f1, f25 fsel f2, f10, f18, f2 fsub f10, f2, f26 fsel f3, f11, f19, f3 fsub f11, f3, f27 LFDUX f16, X, INCX LFDUX f17, X, INCX LFDUX f18, X, INCX LFDUX f19, X, INCX fsel f4, f12, f20, f4 fsub f12, f4, f28 fsel f5, f13, f21, f5 fsub f13, f5, f29 fsel f6, f14, f22, f6 fsub f14, f6, f30 fsel f7, f15, f23, f7 fsub f15, f7, f31 LFDUX f20, X, INCX LFDUX f21, X, INCX LFDUX f22, X, INCX LFDUX f23, X, INCX fsel f0, f8, f24, f0 fsub f8, f0, f16 fsel f1, f9, f25, f1 fsub f9, f1, f17 fsel f2, f10, f26, f2 fsub f10, f2, f18 fsel f3, f11, f27, f3 fsub f11, f3, f19 LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX fsel f4, f12, f28, f4 fsub f12, f4, f20 fsel f5, f13, f29, f5 fsub f13, f5, f21 fsel f6, f14, f30, f6 fsub f14, f6, f22 fsel f7, f15, f31, f7 fsub f15, f7, f23 LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX bdnz LL(110) .align 4 LL(120): fsel f0, f8, f16, f0 fsub f8, f0, f24 fsel f1, f9, f17, f1 fsub f9, f1, f25 fsel f2, f10, f18, f2 fsub f10, f2, f26 fsel f3, f11, f19, f3 fsub f11, f3, f27 fsel f4, f12, f20, f4 fsub f12, f4, f28 fsel f5, f13, f21, f5 fsub f13, f5, f29 fsel f6, f14, f22, f6 fsub f14, f6, f30 fsel f7, f15, f23, f7 fsub f15, f7, f31 fsel f0, f8, f24, f0 fsel f1, f9, f25, f1 fsel f2, f10, f26, f2 fsel f3, f11, f27, f3 fsel f4, f12, f28, f4 fsel f5, f13, f29, f5 fsel f6, f14, f30, f6 fsel f7, f15, f31, f7 .align 4 LL(150): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX fsub f16, f1, f8 fsel f1, f16, f8, f1 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f1, f0 fsel f2, f9, f3, f2 fsel f4, f10, f5, f4 fsel f6, f11, f7, f6 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f2, f0 fsel f4, f9, f6, f4 fsub f8, f0, f4 fsel f1, f8, f4, f0 .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/min_hummer.S000066400000000000000000000217361313527062700200550ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define INCX2 r6 #define X2 r7 #define C1 f1 #define C2 f0 #define C3 f2 #define C4 f3 #define A1 f4 #define A2 f5 #define A3 f6 #define A4 f7 #define A5 f8 #define A6 f9 #define A7 f10 #define A8 f11 #define F1 f12 #define F2 f13 #define F3 f14 #define F4 f15 #define F5 f16 #define F6 f17 #define F7 f18 #define F8 f19 #define T1 f20 #define T2 f21 #define T3 f22 #define T4 f23 #define T5 f24 #define T6 f25 #define T7 f26 #define T8 f27 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 stfpdux f19, SP, r10 stfpdux f20, SP, r10 stfpdux f21, SP, r10 stfpdux f22, SP, r10 stfpdux f23, SP, r10 stfpdux f24, SP, r10 stfpdux f25, SP, r10 stfpdux f26, SP, r10 stfpdux f27, SP, r10 li r10, 0 stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif lfpdx C1, SP, r10 # Zero clear slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, INCX, 0 ble LL(999) LFD C1, 0 * SIZE(X) add X, X, INCX addi N, N, -1 cmpwi cr0, N, 0 ble LL(999) fsmfp C1, C1 fpmr C2, C1 fpmr C3, C1 fpmr C4, C1 cmpwi cr0, INCX, SIZE bne LL(100) andi. r0, X, 2 * SIZE - 1 beq LL(05) LFD C2, 0 * SIZE(X) add X, X, INCX addi N, N, -1 cmpwi cr0, N, 0 ble LL(998) .align 4 LL(05): sub X, X, INCX2 srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 LFPDUX A6, X, INCX2 LFPDUX A7, X, INCX2 LFPDUX A8, X, INCX2 bdz LL(13) .align 4 LL(12): fpsub F1, A1, C1 fpsub F2, A2, C2 fpsub F3, A3, C3 fpsub F4, A4, C4 fpsel C1, F1, C1, A1 LFPDUX A1, X, INCX2 fpsel C2, F2, C2, A2 LFPDUX A2, X, INCX2 fpsel C3, F3, C3, A3 LFPDUX A3, X, INCX2 fpsel C4, F4, C4, A4 LFPDUX A4, X, INCX2 fpsub F5, A5, C1 fpsub F6, A6, C2 fpsub F7, A7, C3 fpsub F8, A8, C4 fpsel C1, F5, C1, A5 LFPDUX A5, X, INCX2 fpsel C2, F6, C2, A6 LFPDUX A6, X, INCX2 fpsel C3, F7, C3, A7 LFPDUX A7, X, INCX2 fpsel C4, F8, C4, A8 LFPDUX A8, X, INCX2 bdnz LL(12) .align 4 LL(13): fpsub F1, A1, C1 fpsub F2, A2, C2 fpsub F3, A3, C3 fpsub F4, A4, C4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 fpsub F5, A5, C1 fpsub F6, A6, C2 fpsub F7, A7, C3 fpsub F8, A8, C4 fpsel C1, F5, C1, A5 fpsel C2, F6, C2, A6 fpsel C3, F7, C3, A7 fpsel C4, F8, C4, A8 .align 4 LL(15): andi. r0, N, 15 beq LL(998) andi. r0, N, 8 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 fpsub F1, A1, C1 fpsub F2, A2, C2 fpsub F3, A3, C3 fpsub F4, A4, C4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 .align 4 LL(16): andi. r0, N, 4 beq LL(17) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 fpsub F1, A1, C1 fpsub F2, A2, C2 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 .align 4 LL(17): andi. r0, N, 2 beq LL(18) LFPDUX A1, X, INCX2 fpsub F1, A1, C1 fpsel C1, F1, C1, A1 .align 4 LL(18): andi. r0, N, 1 beq LL(998) LFDUX A1, X, INCX2 fsub F1, A1, C1 fsel C1, F1, C1, A1 b LL(998) .align 4 LL(100): sub X, X, INCX srawi. r0, N, 4 mtspr CTR, r0 beq- LL(105) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFSDUX A1, X, INCX LFSDUX A2, X, INCX LFSDUX A3, X, INCX LFSDUX A4, X, INCX LFDUX A5, X, INCX LFDUX A6, X, INCX LFDUX A7, X, INCX LFDUX A8, X, INCX LFSDUX A5, X, INCX LFSDUX A6, X, INCX LFSDUX A7, X, INCX LFSDUX A8, X, INCX fpsub F1, A1, C1 fpsub F2, A2, C2 fpsub F3, A3, C3 fpsub F4, A4, C4 bdz LL(103) .align 4 LL(102): fpsel C1, F1, C1, A1 LFDUX A1, X, INCX fpsel C2, F2, C2, A2 LFDUX A2, X, INCX fpsel C3, F3, C3, A3 LFDUX A3, X, INCX fpsel C4, F4, C4, A4 LFDUX A4, X, INCX fpsub F5, A5, C1 LFSDUX A1, X, INCX fpsub F6, A6, C2 LFSDUX A2, X, INCX fpsub F7, A7, C3 LFSDUX A3, X, INCX fpsub F8, A8, C4 LFSDUX A4, X, INCX fpsel C1, F5, C1, A5 LFDUX A5, X, INCX fpsel C2, F6, C2, A6 LFDUX A6, X, INCX fpsel C3, F7, C3, A7 LFDUX A7, X, INCX fpsel C4, F8, C4, A8 LFDUX A8, X, INCX fpsub F1, A1, C1 LFSDUX A5, X, INCX fpsub F2, A2, C2 LFSDUX A6, X, INCX fpsub F3, A3, C3 LFSDUX A7, X, INCX fpsub F4, A4, C4 LFSDUX A8, X, INCX bdnz LL(102) .align 4 LL(103): fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 fpsub F5, A5, C1 fpsub F6, A6, C2 fpsub F7, A7, C3 fpsub F8, A8, C4 fpsel C1, F5, C1, A5 fpsel C2, F6, C2, A6 fpsel C3, F7, C3, A7 fpsel C4, F8, C4, A8 .align 4 LL(105): andi. r0, N, 15 beq LL(998) andi. r0, N, 8 beq LL(106) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFSDUX A1, X, INCX LFSDUX A2, X, INCX LFSDUX A3, X, INCX LFSDUX A4, X, INCX fpsub F1, A1, C1 fpsub F2, A2, C2 fpsub F3, A3, C3 fpsub F4, A4, C4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 .align 4 LL(106): andi. 
r0, N, 4 beq LL(107) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX fsub F1, A1, C1 fsub F2, A2, C2 fsub F3, A3, C3 fsub F4, A4, C4 fsel C1, F1, C1, A1 fsel C2, F2, C2, A2 fsel C3, F3, C3, A3 fsel C4, F4, C4, A4 .align 4 LL(107): andi. r0, N, 2 beq LL(108) LFDUX A1, X, INCX LFDUX A2, X, INCX fsub F1, A1, C1 fsub F2, A2, C2 fsel C1, F1, C1, A1 fsel C2, F2, C2, A2 .align 4 LL(108): andi. r0, N, 1 beq LL(998) LFDUX A1, X, INCX fsub F1, A1, C1 fsel C1, F1, C1, A1 .align 4 LL(998): fpsub F1, C2, C1 fpsub F2, C4, C3 fpsel C1, F1, C1, C2 fpsel C3, F2, C3, C4 fpsub F1, C3, C1 fpsel C1, F1, C1, C3 fsmtp C2, C1 fsub F1, C2, C1 fsel C1, F1, C1, C2 .align 4 LL(999): li r10, 16 lfpdux f27, SP, r10 lfpdux f26, SP, r10 lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/min_ppc440.S000066400000000000000000000150521313527062700175640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PREA r8 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT sub X, X, INCX cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFDUX f1, X, INCX fmr f0, f1 subi N, N, 1 fmr f2, f1 fmr f3, f1 fmr f4, f1 fmr f5, f1 srawi. r0, N, 4 fmr f6, f1 mtspr CTR, r0 fmr f7, f1 beq- LL(150) LFDUX f16, X, INCX LFDUX f17, X, INCX LFDUX f18, X, INCX LFDUX f19, X, INCX LFDUX f20, X, INCX LFDUX f21, X, INCX LFDUX f22, X, INCX LFDUX f23, X, INCX LFDUX f24, X, INCX fsub f8, f0, f16 LFDUX f25, X, INCX fsub f9, f1, f17 LFDUX f26, X, INCX fsub f10, f2, f18 LFDUX f27, X, INCX fsub f11, f3, f19 LFDUX f28, X, INCX fsub f12, f4, f20 LFDUX f29, X, INCX fsub f13, f5, f21 LFDUX f30, X, INCX fsub f14, f6, f22 LFDUX f31, X, INCX fsub f15, f7, f23 bdz LL(120) .align 4 LL(110): fsel f0, f8, f16, f0 LFDUX f16, X, INCX fsub f8, f0, f24 fsel f1, f9, f17, f1 LFDUX f17, X, INCX fsub f9, f1, f25 fsel f2, f10, f18, f2 LFDUX f18, X, INCX fsub f10, f2, f26 fsel f3, f11, f19, f3 LFDUX f19, X, INCX fsub f11, f3, f27 fsel f4, f12, f20, f4 LFDUX f20, X, INCX fsub f12, f4, f28 fsel f5, f13, f21, f5 LFDUX f21, X, INCX fsub f13, f5, f29 fsel f6, f14, f22, f6 LFDUX f22, X, INCX fsub f14, f6, f30 fsel f7, f15, f23, f7 LFDUX f23, X, INCX fsub f15, f7, f31 fsel f0, f8, f24, f0 LFDUX f24, X, INCX fsub f8, f0, f16 fsel f1, f9, f25, f1 LFDUX f25, X, INCX fsub f9, f1, f17 fsel f2, f10, f26, f2 LFDUX f26, X, INCX fsub f10, f2, f18 fsel f3, f11, f27, f3 LFDUX f27, X, INCX fsub f11, f3, f19 fsel f4, f12, f28, f4 LFDUX f28, X, INCX fsub f12, f4, f20 fsel f5, f13, f29, f5 LFDUX f29, X, INCX fsub f13, f5, f21 fsel f6, f14, f30, f6 LFDUX f30, X, INCX fsub f14, f6, f22 fsel f7, f15, f31, f7 LFDUX f31, X, INCX fsub f15, f7, f23 bdnz LL(110) .align 4 LL(120): fsel f0, f8, f16, f0 fsub f8, f0, f24 fsel f1, f9, f17, f1 fsub f9, f1, f25 fsel f2, f10, f18, f2 fsub f10, f2, f26 fsel f3, f11, f19, f3 fsub f11, f3, f27 fsel f4, f12, f20, f4 fsub f12, f4, f28 fsel f5, f13, f21, f5 fsub f13, f5, f29 fsel f6, f14, f22, f6 fsub f14, f6, f30 fsel f7, f15, f23, f7 fsub f15, f7, f31 fsel f0, f8, f24, f0 fsel f1, f9, f25, f1 fsel f2, f10, f26, f2 fsel f3, f11, f27, f3 fsel f4, f12, f28, f4 fsel f5, f13, f29, f5 fsel f6, f14, f30, f6 fsel f7, f15, f31, f7 .align 4 LL(150): andi. 
r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX fsub f16, f1, f8 fsel f1, f16, f8, f1 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f1, f0 fsel f2, f9, f3, f2 fsel f4, f10, f5, f4 fsel f6, f11, f7, f6 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f2, f0 fsel f4, f9, f6, f4 fsub f8, f0, f4 fsel f1, f8, f4, f0 .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/nrm2.S000066400000000000000000000410361313527062700165660ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define NN r6 #define XX r7 #define PREA r8 #define FZERO 144(SP) #define FONE 148(SP) #define FMAX 152(SP) #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r10, 0 lis r11, 0x3f80 lis r12, 0x5fe0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r10, FZERO stw r11, FONE stw r12, FMAX stw r10, 4 + FMAX lfs f1, FZERO #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT li PREA, L1_PREFETCHSIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) mr NN, N mr XX, X LFD f1, 0 * SIZE(X) add X, X, INCX fabs f0, f1 fabs f2, f1 fabs f3, f1 fabs f4, f1 fabs f5, f1 fabs f6, f1 fabs f7, f1 fabs f1, f1 subi N, N, 1 cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, SIZE bne- cr0, LL(1000) srawi. r0, N, 4 mtspr CTR, r0 beq- cr0, LL(50) LFD f24, 0 * SIZE(X) LFD f25, 1 * SIZE(X) LFD f26, 2 * SIZE(X) LFD f27, 3 * SIZE(X) LFD f28, 4 * SIZE(X) LFD f29, 5 * SIZE(X) LFD f30, 6 * SIZE(X) LFD f31, 7 * SIZE(X) fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 8 * SIZE(X) LFD f25, 9 * SIZE(X) LFD f26, 10 * SIZE(X) LFD f27, 11 * SIZE(X) fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 12 * SIZE(X) LFD f29, 13 * SIZE(X) LFD f30, 14 * SIZE(X) LFD f31, 15 * SIZE(X) bdz LL(20) .align 4 LL(10): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 LFD f24, 16 * SIZE(X) LFD f25, 17 * SIZE(X) LFD f26, 18 * SIZE(X) LFD f27, 19 * SIZE(X) fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 LFD f28, 20 * SIZE(X) LFD f29, 21 * SIZE(X) LFD f30, 22 * SIZE(X) LFD f31, 23 * SIZE(X) fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 LFD f24, 24 * SIZE(X) LFD f25, 25 * SIZE(X) LFD f26, 26 * SIZE(X) LFD f27, 27 * SIZE(X) fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 LFD f28, 28 * SIZE(X) LFD f29, 29 * SIZE(X) LFD f30, 30 * SIZE(X) LFD f31, 31 * SIZE(X) #ifndef POWER6 L1_PREFETCH X, PREA #endif addi X, X, 16 * SIZE #ifdef POWER6 L1_PREFETCH X, PREA #endif bdnz LL(10) .align 4 LL(20): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, 
f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 fsel f6, f22, f6, f14 fsel f7, f23, f7, f15 addi X, X, 16 * SIZE .align 4 LL(50): andi. r0, N, 15 mtspr CTR, r0 beq LL(100) .align 4 LL(60): LFD f8, 0 * SIZE(X) addi X, X, 1 * SIZE fabs f8, f8 fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(60) .align 4 LL(100): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f0, f1 fsel f2, f9, f2, f3 fsel f4, f10, f4, f5 fsel f6, f11, f6, f7 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f0, f2 fsel f4, f9, f4, f6 fsub f8, f0, f4 fsel f31, f8, f0, f4 lfs f1, FZERO lfs f0, FONE fcmpu cr0, f1, f31 beq- cr0, LL(9999) fdiv f30, f0, f31 fmr f0, f1 fmr f2, f1 fmr f3, f1 fmr f4, f1 fmr f5, f1 fmr f6, f1 fmr f7, f1 srawi. r0, NN, 4 mtspr CTR, r0 beq- cr0, LL(250) LFD f8, 0 * SIZE(XX) LFD f9, 1 * SIZE(XX) LFD f10, 2 * SIZE(XX) LFD f11, 3 * SIZE(XX) LFD f12, 4 * SIZE(XX) LFD f13, 5 * SIZE(XX) LFD f14, 6 * SIZE(XX) LFD f15, 7 * SIZE(XX) fmul f16, f30, f8 fmul f17, f30, f9 fmul f18, f30, f10 fmul f19, f30, f11 LFD f8, 8 * SIZE(XX) LFD f9, 9 * SIZE(XX) LFD f10, 10 * SIZE(XX) LFD f11, 11 * SIZE(XX) fmul f20, f30, f12 fmul f21, f30, f13 fmul f22, f30, f14 fmul f23, f30, f15 LFD f12, 12 * SIZE(XX) LFD f13, 13 * SIZE(XX) LFD f14, 14 * SIZE(XX) LFD f15, 15 * SIZE(XX) bdz LL(220) .align 4 LL(210): fmadd f0, f16, f16, f0 fmul f16, f30, f8 fmadd f1, f17, f17, f1 fmul f17, f30, f9 fmadd f2, f18, f18, f2 fmul f18, f30, f10 fmadd f3, f19, f19, f3 fmul f19, f30, f11 LFD f8, 16 * SIZE(XX) LFD f9, 17 * SIZE(XX) LFD f10, 18 * SIZE(XX) LFD f11, 19 * SIZE(XX) fmadd f4, f20, f20, f4 fmul f20, f30, f12 fmadd f5, f21, f21, f5 fmul f21, f30, f13 fmadd f6, f22, f22, f6 fmul f22, f30, f14 fmadd f7, f23, f23, f7 fmul f23, f30, f15 LFD f12, 20 * SIZE(XX) LFD f13, 21 * SIZE(XX) LFD f14, 22 * SIZE(XX) LFD f15, 23 * SIZE(XX) fmadd f0, f16, f16, f0 fmul f16, f30, f8 fmadd f1, f17, f17, f1 fmul f17, f30, f9 fmadd f2, f18, f18, f2 fmul f18, f30, f10 fmadd f3, f19, f19, f3 fmul f19, f30, f11 LFD f8, 24 * SIZE(XX) LFD f9, 25 * SIZE(XX) LFD f10, 26 * SIZE(XX) LFD f11, 27 * SIZE(XX) fmadd f4, f20, f20, f4 fmul f20, f30, f12 fmadd f5, f21, f21, f5 fmul f21, f30, f13 fmadd f6, f22, f22, f6 fmul f22, f30, f14 fmadd f7, f23, f23, f7 fmul f23, f30, f15 LFD f12, 28 * SIZE(XX) LFD f13, 29 * SIZE(XX) LFD f14, 30 * SIZE(XX) LFD f15, 31 * SIZE(XX) #ifndef POWER6 L1_PREFETCH XX, PREA #endif addi XX, XX, 16 * SIZE #ifdef POWER6 L1_PREFETCH XX, PREA #endif bdnz LL(210) .align 4 LL(220): fmadd f0, f16, f16, f0 fmul f16, f30, f8 fmadd f1, f17, f17, f1 fmul f17, f30, f9 fmadd f2, f18, f18, f2 fmul f18, f30, f10 fmadd f3, f19, f19, f3 fmul f19, f30, f11 fmadd f4, f20, f20, f4 fmul f20, f30, f12 fmadd f5, f21, f21, f5 fmul f21, f30, f13 fmadd f6, f22, f22, f6 fmul f22, f30, f14 fmadd f7, f23, f23, f7 fmul f23, f30, f15 fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 fmadd f2, f18, f18, f2 fmadd f3, f19, f19, f3 fmadd f4, f20, f20, f4 fmadd f5, f21, f21, f5 fmadd f6, f22, f22, f6 fmadd f7, f23, f23, f7 addi XX, XX, 16 * SIZE .align 4 LL(250): andi. r0, NN, 15 mtspr CTR, r0 beq- cr0, LL(270) .align 4 LL(260): LFD f8, 0 * SIZE(XX) addi XX, XX, 1 * SIZE fmul f16, f30, f8 fmadd f0, f16, f16, f0 bdnz LL(260) .align 4 LL(270): fadd f0, f0, f1 fadd f2, f2, f3 fadd f4, f4, f5 fadd f6, f6, f7 fadd f0, f0, f2 fadd f4, f4, f6 fadd f0, f0, f4 fsqrt f0, f0 fmul f1, f31, f0 b LL(9999) .align 4 LL(1000): sub X, X, INCX srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(1050) LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX bdz LL(1020) .align 4 LL(1010): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX bdnz LL(1010) .align 4 LL(1020): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 fsel f6, f22, f6, f14 fsel f7, f23, f7, f15 .align 4 LL(1050): andi. r0, N, 15 mtspr CTR, r0 beq LL(1999) .align 4 LL(1060): LFDUX f8, X, INCX fabs f8, f8 fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(1060) .align 4 LL(1999): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f0, f1 fsel f2, f9, f2, f3 fsel f4, f10, f4, f5 fsel f6, f11, f6, f7 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f0, f2 fsel f4, f9, f4, f6 fsub f8, f0, f4 fsel f31, f8, f0, f4 lfs f1, FZERO lfs f0, FONE lfd f2, FMAX fcmpu cr0, f1, f31 beq- cr0, LL(9999) fdiv f30, f0, f31 fmr f0, f1 fmr f2, f1 fmr f3, f1 fmr f4, f1 fmr f5, f1 fmr f6, f1 fmr f7, f1 sub XX, XX, INCX srawi. 
r0, NN, 4 mtspr CTR, r0 beq- cr0, LL(2150) LFDUX f8, XX, INCX LFDUX f9, XX, INCX LFDUX f10, XX, INCX LFDUX f11, XX, INCX LFDUX f12, XX, INCX LFDUX f13, XX, INCX LFDUX f14, XX, INCX LFDUX f15, XX, INCX fmul f16, f30, f8 fmul f17, f30, f9 fmul f18, f30, f10 fmul f19, f30, f11 LFDUX f8, XX, INCX LFDUX f9, XX, INCX LFDUX f10, XX, INCX LFDUX f11, XX, INCX fmul f20, f30, f12 fmul f21, f30, f13 fmul f22, f30, f14 fmul f23, f30, f15 LFDUX f12, XX, INCX LFDUX f13, XX, INCX LFDUX f14, XX, INCX LFDUX f15, XX, INCX bdz LL(2120) .align 4 LL(2110): fmadd f0, f16, f16, f0 fmul f16, f30, f8 fmadd f1, f17, f17, f1 fmul f17, f30, f9 fmadd f2, f18, f18, f2 fmul f18, f30, f10 fmadd f3, f19, f19, f3 fmul f19, f30, f11 LFDUX f8, XX, INCX LFDUX f9, XX, INCX LFDUX f10, XX, INCX LFDUX f11, XX, INCX fmadd f4, f20, f20, f4 fmul f20, f30, f12 fmadd f5, f21, f21, f5 fmul f21, f30, f13 fmadd f6, f22, f22, f6 fmul f22, f30, f14 fmadd f7, f23, f23, f7 fmul f23, f30, f15 LFDUX f12, XX, INCX LFDUX f13, XX, INCX LFDUX f14, XX, INCX LFDUX f15, XX, INCX fmadd f0, f16, f16, f0 fmul f16, f30, f8 fmadd f1, f17, f17, f1 fmul f17, f30, f9 fmadd f2, f18, f18, f2 fmul f18, f30, f10 fmadd f3, f19, f19, f3 fmul f19, f30, f11 LFDUX f8, XX, INCX LFDUX f9, XX, INCX LFDUX f10, XX, INCX LFDUX f11, XX, INCX fmadd f4, f20, f20, f4 fmul f20, f30, f12 fmadd f5, f21, f21, f5 fmul f21, f30, f13 fmadd f6, f22, f22, f6 fmul f22, f30, f14 fmadd f7, f23, f23, f7 fmul f23, f30, f15 LFDUX f12, XX, INCX LFDUX f13, XX, INCX LFDUX f14, XX, INCX LFDUX f15, XX, INCX bdnz LL(2110) .align 4 LL(2120): fmadd f0, f16, f16, f0 fmul f16, f30, f8 fmadd f1, f17, f17, f1 fmul f17, f30, f9 fmadd f2, f18, f18, f2 fmul f18, f30, f10 fmadd f3, f19, f19, f3 fmul f19, f30, f11 fmadd f4, f20, f20, f4 fmul f20, f30, f12 fmadd f5, f21, f21, f5 fmul f21, f30, f13 fmadd f6, f22, f22, f6 fmul f22, f30, f14 fmadd f7, f23, f23, f7 fmul f23, f30, f15 fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 fmadd f2, f18, f18, f2 fmadd f3, f19, f19, f3 fmadd f4, f20, f20, f4 fmadd f5, f21, f21, f5 fmadd f6, f22, f22, f6 fmadd f7, f23, f23, f7 .align 4 LL(2150): andi. r0, NN, 15 mtspr CTR, r0 beq- cr0, LL(2170) .align 4 LL(2160): LFDUX f8, XX, INCX fmul f16, f30, f8 fmadd f0, f16, f16, f0 bdnz LL(2160) .align 4 LL(2170): fadd f0, f0, f1 fadd f2, f2, f3 fadd f4, f4, f5 fadd f6, f6, f7 fadd f0, f0, f2 fadd f4, f4, f6 fadd f0, f0, f4 fsqrt f0, f0 fmul f1, f31, f0 .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/rot.S000066400000000000000000000265001313527062700165130ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define Y r6 #define INCY r7 #define PREA r8 #define XX r9 #define YY r10 #define C f1 #define S f2 #define STACKSIZE 32 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT li PREA, L1_PREFETCHSIZE cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, SIZE bne- cr0, LL(100) cmpwi cr0, INCY, SIZE bne- cr0, LL(100) srawi. 
r0, N, 4 mtspr CTR, r0 beq- cr0, LL(50) LFD f0, 0 * SIZE(X) LFD f4, 1 * SIZE(X) LFD f6, 2 * SIZE(X) LFD f8, 3 * SIZE(X) LFD f3, 0 * SIZE(Y) LFD f5, 1 * SIZE(Y) LFD f7, 2 * SIZE(Y) LFD f9, 3 * SIZE(Y) bdz LL(12) .align 4 LL(10): FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMUL f14, C, f6 FMUL f15, C, f7 FMUL f16, C, f8 FMUL f17, C, f9 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMADD f14, S, f7, f14 FNMSUB f15, S, f6, f15 FMADD f16, S, f9, f16 FNMSUB f17, S, f8, f17 LFD f0, 4 * SIZE(X) LFD f4, 5 * SIZE(X) LFD f6, 6 * SIZE(X) LFD f8, 7 * SIZE(X) LFD f3, 4 * SIZE(Y) LFD f5, 5 * SIZE(Y) LFD f7, 6 * SIZE(Y) LFD f9, 7 * SIZE(Y) STFD f10, 0 * SIZE(X) STFD f12, 1 * SIZE(X) STFD f14, 2 * SIZE(X) STFD f16, 3 * SIZE(X) STFD f11, 0 * SIZE(Y) STFD f13, 1 * SIZE(Y) STFD f15, 2 * SIZE(Y) STFD f17, 3 * SIZE(Y) FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMUL f14, C, f6 FMUL f15, C, f7 FMUL f16, C, f8 FMUL f17, C, f9 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMADD f14, S, f7, f14 FNMSUB f15, S, f6, f15 FMADD f16, S, f9, f16 FNMSUB f17, S, f8, f17 LFD f0, 8 * SIZE(X) LFD f4, 9 * SIZE(X) LFD f6, 10 * SIZE(X) LFD f8, 11 * SIZE(X) LFD f3, 8 * SIZE(Y) LFD f5, 9 * SIZE(Y) LFD f7, 10 * SIZE(Y) LFD f9, 11 * SIZE(Y) STFD f10, 4 * SIZE(X) STFD f12, 5 * SIZE(X) STFD f14, 6 * SIZE(X) STFD f16, 7 * SIZE(X) STFD f11, 4 * SIZE(Y) STFD f13, 5 * SIZE(Y) STFD f15, 6 * SIZE(Y) STFD f17, 7 * SIZE(Y) FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMUL f14, C, f6 FMUL f15, C, f7 FMUL f16, C, f8 FMUL f17, C, f9 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMADD f14, S, f7, f14 FNMSUB f15, S, f6, f15 FMADD f16, S, f9, f16 FNMSUB f17, S, f8, f17 LFD f0, 12 * SIZE(X) LFD f4, 13 * SIZE(X) LFD f6, 14 * SIZE(X) LFD f8, 15 * SIZE(X) LFD f3, 12 * SIZE(Y) LFD f5, 13 * SIZE(Y) LFD f7, 14 * SIZE(Y) LFD f9, 15 * SIZE(Y) STFD f10, 8 * SIZE(X) STFD f12, 9 * SIZE(X) STFD f14, 10 * SIZE(X) STFD f16, 11 * SIZE(X) STFD f11, 8 * SIZE(Y) STFD f13, 9 * SIZE(Y) STFD f15, 10 * SIZE(Y) STFD f17, 11 * SIZE(Y) FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMUL f14, C, f6 FMUL f15, C, f7 FMUL f16, C, f8 FMUL f17, C, f9 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMADD f14, S, f7, f14 FNMSUB f15, S, f6, f15 FMADD f16, S, f9, f16 FNMSUB f17, S, f8, f17 LFD f0, 16 * SIZE(X) LFD f4, 17 * SIZE(X) LFD f6, 18 * SIZE(X) LFD f8, 19 * SIZE(X) LFD f3, 16 * SIZE(Y) LFD f5, 17 * SIZE(Y) LFD f7, 18 * SIZE(Y) LFD f9, 19 * SIZE(Y) STFD f10, 12 * SIZE(X) STFD f12, 13 * SIZE(X) STFD f14, 14 * SIZE(X) STFD f16, 15 * SIZE(X) STFD f11, 12 * SIZE(Y) STFD f13, 13 * SIZE(Y) STFD f15, 14 * SIZE(Y) STFD f17, 15 * SIZE(Y) #ifndef POWER6 dcbtst X, PREA #endif addi X, X, 16 * SIZE addi Y, Y, 16 * SIZE #ifdef POWER6 dcbtst X, PREA dcbtst X, PREA #endif bdnz LL(10) .align 4 LL(12): FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMUL f14, C, f6 FMUL f15, C, f7 FMUL f16, C, f8 FMUL f17, C, f9 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMADD f14, S, f7, f14 FNMSUB f15, S, f6, f15 FMADD f16, S, f9, f16 FNMSUB f17, S, f8, f17 STFD f10, 0 * SIZE(X) STFD f12, 1 * SIZE(X) STFD f14, 2 * SIZE(X) STFD f16, 3 * SIZE(X) STFD f11, 0 * SIZE(Y) STFD f13, 1 * SIZE(Y) STFD f15, 2 * SIZE(Y) STFD f17, 3 * SIZE(Y) LFD f0, 4 * SIZE(X) LFD f4, 5 * SIZE(X) LFD f6, 6 * SIZE(X) LFD f8, 7 * 
SIZE(X) LFD f3, 4 * SIZE(Y) LFD f5, 5 * SIZE(Y) LFD f7, 6 * SIZE(Y) LFD f9, 7 * SIZE(Y) FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMUL f14, C, f6 FMUL f15, C, f7 FMUL f16, C, f8 FMUL f17, C, f9 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMADD f14, S, f7, f14 FNMSUB f15, S, f6, f15 FMADD f16, S, f9, f16 FNMSUB f17, S, f8, f17 STFD f10, 4 * SIZE(X) STFD f12, 5 * SIZE(X) STFD f14, 6 * SIZE(X) STFD f16, 7 * SIZE(X) STFD f11, 4 * SIZE(Y) STFD f13, 5 * SIZE(Y) STFD f15, 6 * SIZE(Y) STFD f17, 7 * SIZE(Y) LFD f0, 8 * SIZE(X) LFD f4, 9 * SIZE(X) LFD f6, 10 * SIZE(X) LFD f8, 11 * SIZE(X) LFD f3, 8 * SIZE(Y) LFD f5, 9 * SIZE(Y) LFD f7, 10 * SIZE(Y) LFD f9, 11 * SIZE(Y) FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMUL f14, C, f6 FMUL f15, C, f7 FMUL f16, C, f8 FMUL f17, C, f9 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMADD f14, S, f7, f14 FNMSUB f15, S, f6, f15 FMADD f16, S, f9, f16 FNMSUB f17, S, f8, f17 STFD f10, 8 * SIZE(X) STFD f12, 9 * SIZE(X) STFD f14, 10 * SIZE(X) STFD f16, 11 * SIZE(X) STFD f11, 8 * SIZE(Y) STFD f13, 9 * SIZE(Y) STFD f15, 10 * SIZE(Y) STFD f17, 11 * SIZE(Y) LFD f0, 12 * SIZE(X) LFD f4, 13 * SIZE(X) LFD f6, 14 * SIZE(X) LFD f8, 15 * SIZE(X) LFD f3, 12 * SIZE(Y) LFD f5, 13 * SIZE(Y) LFD f7, 14 * SIZE(Y) LFD f9, 15 * SIZE(Y) FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMUL f14, C, f6 FMUL f15, C, f7 FMUL f16, C, f8 FMUL f17, C, f9 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMADD f14, S, f7, f14 FNMSUB f15, S, f6, f15 FMADD f16, S, f9, f16 FNMSUB f17, S, f8, f17 STFD f10, 12 * SIZE(X) STFD f12, 13 * SIZE(X) STFD f14, 14 * SIZE(X) STFD f16, 15 * SIZE(X) STFD f11, 12 * SIZE(Y) STFD f13, 13 * SIZE(Y) STFD f15, 14 * SIZE(Y) STFD f17, 15 * SIZE(Y) addi X, X, 16 * SIZE addi Y, Y, 16 * SIZE .align 4 LL(50): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f3, 0 * SIZE(X) LFD f4, 0 * SIZE(Y) FMUL f10, C, f3 FMUL f11, C, f4 FMADD f10, S, f4, f10 FNMSUB f11, S, f3, f11 STFD f10, 0 * SIZE(X) STFD f11, 0 * SIZE(Y) addi X, X, 1 * SIZE addi Y, Y, 1 * SIZE bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCX sub Y, Y, INCY mr XX, X mr YY, Y srawi. 
r0, N, 3 mtspr CTR, r0 beq- LL(150) .align 4 LL(110): LFDUX f0, X, INCX LFDUX f3, Y, INCY LFDUX f4, X, INCX LFDUX f5, Y, INCY LFDUX f6, X, INCX LFDUX f7, Y, INCY LFDUX f8, X, INCX LFDUX f9, Y, INCY FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMUL f14, C, f6 FMUL f15, C, f7 FMUL f16, C, f8 FMUL f17, C, f9 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMADD f14, S, f7, f14 FNMSUB f15, S, f6, f15 FMADD f16, S, f9, f16 FNMSUB f17, S, f8, f17 STFDUX f10, XX, INCX STFDUX f11, YY, INCY STFDUX f12, XX, INCX STFDUX f13, YY, INCY STFDUX f14, XX, INCX STFDUX f15, YY, INCY STFDUX f16, XX, INCX STFDUX f17, YY, INCY LFDUX f0, X, INCX LFDUX f3, Y, INCY LFDUX f4, X, INCX LFDUX f5, Y, INCY LFDUX f6, X, INCX LFDUX f7, Y, INCY LFDUX f8, X, INCX LFDUX f9, Y, INCY FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMUL f14, C, f6 FMUL f15, C, f7 FMUL f16, C, f8 FMUL f17, C, f9 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMADD f14, S, f7, f14 FNMSUB f15, S, f6, f15 FMADD f16, S, f9, f16 FNMSUB f17, S, f8, f17 STFDUX f10, XX, INCX STFDUX f11, YY, INCY STFDUX f12, XX, INCX STFDUX f13, YY, INCY STFDUX f14, XX, INCX STFDUX f15, YY, INCY STFDUX f16, XX, INCX STFDUX f17, YY, INCY bdnz LL(110) .align 4 LL(150): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f0, X, INCX LFDUX f3, Y, INCY FMUL f10, C, f0 FMUL f11, C, f3 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 STFDUX f10, XX, INCX STFDUX f11, YY, INCY bdnz LL(160) .align 4 LL(999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/rot_ppc440.S000066400000000000000000000151361313527062700176100ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define Y r6 #define INCY r7 #define PRE r8 #define XX r9 #define YY r10 #define C f1 #define S f2 #define STACKSIZE 32 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT li PRE, 2 * 16 * SIZE cmpwi cr0, N, 0 ble- LL(999) sub X, X, INCX sub Y, Y, INCY mr XX, X mr YY, Y srawi. r0, N, 3 mtspr CTR, r0 beq- LL(150) LFDUX f0, X, INCX LFDUX f3, Y, INCY LFDUX f4, X, INCX FMUL f10, C, f0 LFDUX f5, Y, INCY FMUL f11, C, f3 LFDUX f6, X, INCX FMUL f12, C, f4 LFDUX f7, Y, INCY FMUL f13, C, f5 LFDUX f8, X, INCX FMADD f10, S, f3, f10 LFDUX f9, Y, INCY FNMSUB f11, S, f0, f11 LFDUX f0, X, INCX FMADD f12, S, f5, f12 LFDUX f3, Y, INCY FNMSUB f13, S, f4, f13 LFDUX f4, X, INCX bdz LL(111) .align 4 LL(110): FMUL f14, C, f6 LFDUX f5, Y, INCY FMUL f15, C, f7 STFDUX f10, XX, INCX FMUL f16, C, f8 STFDUX f11, YY, INCY FMUL f17, C, f9 STFDUX f12, XX, INCX #ifdef PPCG4 dcbtst X, PRE #endif FMADD f14, S, f7, f14 STFDUX f13, YY, INCY FNMSUB f15, S, f6, f15 LFDUX f6, X, INCX FMADD f16, S, f9, f16 LFDUX f7, Y, INCY FNMSUB f17, S, f8, f17 LFDUX f8, X, INCX FMUL f10, C, f0 LFDUX f9, Y, INCY FMUL f11, C, f3 STFDUX f14, XX, INCX FMUL f12, C, f4 STFDUX f15, YY, INCY FMUL f13, C, f5 STFDUX f16, XX, INCX #ifdef PPCG4 dcbtst Y, PRE #endif FMADD f10, S, f3, f10 STFDUX f17, YY, INCY FNMSUB f11, S, f0, f11 LFDUX f0, X, INCX FMADD f12, S, f5, f12 LFDUX f3, Y, INCY FNMSUB f13, S, f4, f13 LFDUX f4, X, INCX FMUL f14, C, f6 LFDUX f5, Y, INCY FMUL f15, C, f7 STFDUX f10, XX, INCX FMUL f16, C, f8 STFDUX f11, YY, INCY FMUL f17, C, f9 STFDUX f12, XX, INCX #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif FMADD f14, S, f7, f14 STFDUX f13, YY, INCY FNMSUB f15, S, f6, f15 LFDUX f6, X, INCX FMADD f16, S, f9, f16 LFDUX f7, Y, INCY FNMSUB f17, S, f8, f17 LFDUX f8, X, INCX FMUL f10, C, f0 LFDUX f9, Y, INCY FMUL f11, C, f3 STFDUX f14, XX, INCX FMUL f12, C, f4 STFDUX f15, YY, INCY FMUL f13, C, f5 STFDUX f16, XX, INCX #if defined(PPCG4) && defined(DOUBLE) dcbtst Y, PRE #endif FMADD f10, S, f3, f10 STFDUX f17, YY, INCY FNMSUB f11, S, f0, f11 LFDUX f0, X, INCX FMADD f12, S, f5, f12 LFDUX f3, Y, INCY FNMSUB f13, S, f4, f13 LFDUX f4, X, INCX bdnz LL(110) .align 4 LL(111): FMUL f14, C, f6 LFDUX f5, Y, INCY FMUL f15, C, f7 STFDUX f10, XX, INCX FMUL f16, C, f8 STFDUX f11, YY, INCY FMUL f17, C, f9 STFDUX f12, XX, INCX FMADD f14, S, f7, f14 STFDUX f13, YY, INCY FNMSUB f15, S, f6, f15 LFDUX f6, X, INCX FMADD f16, S, f9, f16 LFDUX f7, Y, INCY FNMSUB f17, S, f8, f17 LFDUX f8, X, INCX FMUL f10, C, f0 LFDUX f9, Y, INCY FMUL f11, C, f3 STFDUX f14, XX, INCX FMUL f12, C, f4 STFDUX f15, YY, INCY FMUL f13, C, f5 STFDUX f16, XX, INCX FMUL f14, C, f6 STFDUX f17, YY, INCY FMUL f15, C, f7 FMUL f16, C, f8 FMUL f17, C, f9 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMADD f14, S, f7, f14 STFDUX f10, XX, INCX FNMSUB f15, S, f6, f15 STFDUX f11, YY, INCY FMADD f16, S, f9, f16 STFDUX f12, XX, INCX FNMSUB f17, S, f8, f17 STFDUX f13, YY, INCY STFDUX f14, XX, INCX STFDUX f15, YY, INCY STFDUX f16, XX, INCX 
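/* Drain of the software-pipelined loop: the FMADD/FNMSUB pairs above apply the plane rotation x' = c*x + s*y, y' = c*y - s*x to the elements already loaded, and the surrounding STFDUX instructions write the rotated values back through the trailing store pointers XX and YY. */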
STFDUX f17, YY, INCY .align 4 LL(150): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f0, X, INCX LFDUX f3, Y, INCY FMUL f10, C, f0 FMUL f11, C, f3 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 STFDUX f10, XX, INCX STFDUX f11, YY, INCY bdnz LL(160) .align 4 LL(999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/sasum.c000066400000000000000000000061721313527062700170620ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/03/28 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #include <math.h> #if defined(DOUBLE) #error supports float only #else #define ABS fabsf #endif #if defined(POWER8) #include "sasum_microk_power8.c" #endif #ifndef HAVE_KERNEL_32 static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1) { BLASLONG i=0; FLOAT *x = x1; FLOAT temp0, temp1, temp2, temp3; FLOAT temp4, temp5, temp6, temp7; FLOAT sum0 = 0.0; FLOAT sum1 = 0.0; FLOAT sum2 = 0.0; FLOAT sum3 = 0.0; while ( i< n ) { temp0 = ABS(x[0]); temp1 = ABS(x[1]); temp2 = ABS(x[2]); temp3 = ABS(x[3]); temp4 = ABS(x[4]); temp5 = ABS(x[5]); temp6 = ABS(x[6]); temp7 = ABS(x[7]); sum0 += temp0; sum1 += temp1; sum2 += temp2; sum3 += temp3; sum0 += temp4; sum1 += temp5; sum2 += temp6; sum3 += temp7; x+=8; i+=8; } return sum0+sum1+sum2+sum3; } #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; FLOAT sumf = 0.0; BLASLONG n1; if (n <= 0 || inc_x <= 0) return(sumf); if ( inc_x == 1 ) { n1 = n & -32; if ( n1 > 0 ) { sumf = sasum_kernel_32(n1, x); i=n1; } while(i < n) { sumf += ABS(x[i]); i++; } } else { n *= inc_x; while(i < n) { sumf += ABS(x[i]); i += inc_x; } } return(sumf); } OpenBLAS-0.2.20/kernel/power/sasum_microk_power8.c000066400000000000000000000124011313527062700217220ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ /************************************************************************************** * 2016/03/28 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_32 1 static float sasum_kernel_32 (long n, float *x) { float sum; __vector float t0; __vector float t1; __vector float t2; __vector float t3; __asm__ ( "dcbt 0, %2 \n\t" "xxlxor 32, 32, 32 \n\t" "xxlxor 33, 33, 33 \n\t" "xxlxor 34, 34, 34 \n\t" "xxlxor 35, 35, 35 \n\t" "xxlxor 36, 36, 36 \n\t" "xxlxor 37, 37, 37 \n\t" "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" "lxvd2x 40, 0, %2 \n\t" "lxvd2x 41, %8, %2 \n\t" "lxvd2x 42, %9, %2 \n\t" "lxvd2x 43, %10, %2 \n\t" "lxvd2x 44, %11, %2 \n\t" "lxvd2x 45, %12, %2 \n\t" "lxvd2x 46, %13, %2 \n\t" "lxvd2x 47, %14, %2 \n\t" "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" "ble 2f \n\t" ".p2align 5 \n" "1: \n\t" "xvabssp 48, 40 \n\t" "xvabssp 49, 41 \n\t" "xvabssp 50, 42 \n\t" "xvabssp 51, 43 \n\t" "lxvd2x 40, 0, %2 \n\t" "lxvd2x 41, %8, %2 \n\t" "xvabssp %x3, 44 \n\t" "xvabssp %x4, 45 \n\t" "lxvd2x 42, %9, %2 \n\t" "lxvd2x 43, %10, %2 \n\t" "xvabssp %x5, 46 \n\t" "xvabssp %x6, 47 \n\t" "lxvd2x 44, %11, %2 \n\t" "lxvd2x 45, %12, %2 \n\t" "xvaddsp 32, 32, 48 \n\t" "xvaddsp 33, 33, 49 \n\t" "lxvd2x 46, %13, %2 \n\t" "lxvd2x 47, %14, %2 \n\t" "xvaddsp 34, 34, 50 \n\t" "xvaddsp 35, 35, 51 \n\t" "addi %2, %2, 128 \n\t" "xvaddsp 36, 36, %x3 \n\t" "xvaddsp 37, 37, %x4 \n\t" "addic. %1, %1, -32 \n\t" "xvaddsp 38, 38, %x5 \n\t" "xvaddsp 39, 39, %x6 \n\t" "bgt 1b \n" "2: \n\t" "xvabssp 48, 40 \n\t" "xvabssp 49, 41 \n\t" "xvabssp 50, 42 \n\t" "xvabssp 51, 43 \n\t" "xvabssp %x3, 44 \n\t" "xvabssp %x4, 45 \n\t" "xvabssp %x5, 46 \n\t" "xvabssp %x6, 47 \n\t" "xvaddsp 32, 32, 48 \n\t" "xvaddsp 33, 33, 49 \n\t" "xvaddsp 34, 34, 50 \n\t" "xvaddsp 35, 35, 51 \n\t" "xvaddsp 36, 36, %x3 \n\t" "xvaddsp 37, 37, %x4 \n\t" "xvaddsp 38, 38, %x5 \n\t" "xvaddsp 39, 39, %x6 \n\t" "xvaddsp 32, 32, 33 \n\t" "xvaddsp 34, 34, 35 \n\t" "xvaddsp 36, 36, 37 \n\t" "xvaddsp 38, 38, 39 \n\t" "xvaddsp 32, 32, 34 \n\t" "xvaddsp 36, 36, 38 \n\t" "xvaddsp 32, 32, 36 \n\t" "xxsldwi 33, 32, 32, 2 \n\t" "xvaddsp 32, 32, 33 \n\t" "xxsldwi 33, 32, 32, 1 \n\t" "xvaddsp 32, 32, 33 \n\t" "xscvspdp %x0, 32 \n" "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n" "#t0=%x3 t1=%x4 t2=%x5 t3=%x6" : "=f" (sum), // 0 "+r" (n), // 1 "+b" (x), // 2 "=wa" (t0), // 3 "=wa" (t1), // 4 "=wa" (t2), // 5 "=wa" (t3) // 6 : "m" (*x), "b" (16), // 8 "b" (32), // 9 "b" (48), // 10 "b" (64), // 11 "b" (80), // 12 "b" (96), // 13 "b" (112) // 14 : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48","vs49","vs50","vs51" ); return sum; } OpenBLAS-0.2.20/kernel/power/scal.S000066400000000000000000000205551313527062700166350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define XX r4 #define PREA r5 #ifdef linux #ifndef __64BIT__ #define X r6 #define INCX r7 #else #define X r7 #define INCX r8 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define X r8 #define INCX r9 #else #define X r7 #define INCX r8 #endif #endif #define FZERO f0 #define ALPHA f1 PROLOGUE PROFCODE addi SP, SP, -8 li r0, 0 stw r0, 0(SP) lfs FZERO, 0(SP) addi SP, SP, 8 slwi INCX, INCX, BASE_SHIFT li PREA, L1_PREFETCHSIZE cmpwi cr0, N, 0 blelr- cr0 fcmpu cr0, FZERO, ALPHA bne- cr0, LL(A1I1) cmpwi cr0, INCX, SIZE bne- cr0, LL(A0IN) srawi. r0, N, 4 mtspr CTR, r0 beq- cr0, LL(A0I1_Remain) .align 4 LL(A0I1_kernel): STFD FZERO, 0 * SIZE(X) STFD FZERO, 1 * SIZE(X) STFD FZERO, 2 * SIZE(X) STFD FZERO, 3 * SIZE(X) STFD FZERO, 4 * SIZE(X) STFD FZERO, 5 * SIZE(X) STFD FZERO, 6 * SIZE(X) STFD FZERO, 7 * SIZE(X) STFD FZERO, 8 * SIZE(X) STFD FZERO, 9 * SIZE(X) STFD FZERO, 10 * SIZE(X) STFD FZERO, 11 * SIZE(X) STFD FZERO, 12 * SIZE(X) STFD FZERO, 13 * SIZE(X) STFD FZERO, 14 * SIZE(X) STFD FZERO, 15 * SIZE(X) addi X, X, 16 * SIZE bdnz LL(A0I1_kernel) .align 4 LL(A0I1_Remain): andi. r0, N, 15 mtspr CTR, r0 beqlr+ .align 4 LL(A0I1_RemainKernel): STFD FZERO, 0 * SIZE(X) addi X, X, 1 * SIZE bdnz LL(A0I1_RemainKernel) blr .align 4 LL(A0IN): srawi. r0, N, 3 mtspr CTR, r0 beq- LL(A0IN_Remain) .align 4 LL(A0IN_Kernel): dcbtst X, PREA STFD FZERO, 0 * SIZE(X) add X, X, INCX STFD FZERO, 0 * SIZE(X) add X, X, INCX STFD FZERO, 0 * SIZE(X) add X, X, INCX STFD FZERO, 0 * SIZE(X) add X, X, INCX STFD FZERO, 0 * SIZE(X) add X, X, INCX STFD FZERO, 0 * SIZE(X) add X, X, INCX STFD FZERO, 0 * SIZE(X) add X, X, INCX STFD FZERO, 0 * SIZE(X) add X, X, INCX bdnz LL(A0IN_Kernel) .align 4 LL(A0IN_Remain): andi. r0, N, 7 mtspr CTR, r0 beqlr+ .align 4 LL(A0IN_RemainKernel): STFD FZERO, 0 * SIZE(X) add X, X, INCX bdnz LL(A0IN_RemainKernel) blr .align 4 LL(A1I1): cmpwi cr0, INCX, SIZE bne- LL(A1IN) mr XX, X srawi. 
r0, N, 4 mtspr CTR, r0 beq+ LL(A1I1_Remain) LFD f2, 0 * SIZE(X) LFD f3, 1 * SIZE(X) LFD f4, 2 * SIZE(X) LFD f5, 3 * SIZE(X) LFD f6, 4 * SIZE(X) LFD f7, 5 * SIZE(X) LFD f8, 6 * SIZE(X) LFD f9, 7 * SIZE(X) bdz LL(13) .align 4 LL(A1I1_kernel): FMUL f10, ALPHA, f2 FMUL f11, ALPHA, f3 FMUL f12, ALPHA, f4 FMUL f13, ALPHA, f5 LFD f2, 8 * SIZE(X) LFD f3, 9 * SIZE(X) LFD f4, 10 * SIZE(X) LFD f5, 11 * SIZE(X) STFD f10, 0 * SIZE(X) STFD f11, 1 * SIZE(X) STFD f12, 2 * SIZE(X) STFD f13, 3 * SIZE(X) FMUL f10, ALPHA, f6 FMUL f11, ALPHA, f7 FMUL f12, ALPHA, f8 FMUL f13, ALPHA, f9 LFD f6, 12 * SIZE(X) LFD f7, 13 * SIZE(X) LFD f8, 14 * SIZE(X) LFD f9, 15 * SIZE(X) STFD f10, 4 * SIZE(X) STFD f11, 5 * SIZE(X) STFD f12, 6 * SIZE(X) STFD f13, 7 * SIZE(X) FMUL f10, ALPHA, f2 FMUL f11, ALPHA, f3 FMUL f12, ALPHA, f4 FMUL f13, ALPHA, f5 LFD f2, 16 * SIZE(X) LFD f3, 17 * SIZE(X) LFD f4, 18 * SIZE(X) LFD f5, 19 * SIZE(X) STFD f10, 8 * SIZE(X) STFD f11, 9 * SIZE(X) STFD f12, 10 * SIZE(X) STFD f13, 11 * SIZE(X) FMUL f10, ALPHA, f6 FMUL f11, ALPHA, f7 FMUL f12, ALPHA, f8 FMUL f13, ALPHA, f9 LFD f6, 20 * SIZE(X) LFD f7, 21 * SIZE(X) LFD f8, 22 * SIZE(X) LFD f9, 23 * SIZE(X) STFD f10, 12 * SIZE(X) STFD f11, 13 * SIZE(X) STFD f12, 14 * SIZE(X) STFD f13, 15 * SIZE(X) addi X, X, 16 * SIZE dcbtst X, PREA bdnz LL(A1I1_kernel) .align 4 LL(13): FMUL f10, ALPHA, f2 FMUL f11, ALPHA, f3 FMUL f12, ALPHA, f4 FMUL f13, ALPHA, f5 LFD f2, 8 * SIZE(X) LFD f3, 9 * SIZE(X) LFD f4, 10 * SIZE(X) LFD f5, 11 * SIZE(X) STFD f10, 0 * SIZE(X) STFD f11, 1 * SIZE(X) STFD f12, 2 * SIZE(X) STFD f13, 3 * SIZE(X) FMUL f10, ALPHA, f6 FMUL f11, ALPHA, f7 FMUL f12, ALPHA, f8 FMUL f13, ALPHA, f9 LFD f6, 12 * SIZE(X) LFD f7, 13 * SIZE(X) LFD f8, 14 * SIZE(X) LFD f9, 15 * SIZE(X) STFD f10, 4 * SIZE(X) STFD f11, 5 * SIZE(X) STFD f12, 6 * SIZE(X) STFD f13, 7 * SIZE(X) FMUL f10, ALPHA, f2 FMUL f11, ALPHA, f3 FMUL f12, ALPHA, f4 FMUL f13, ALPHA, f5 STFD f10, 8 * SIZE(X) STFD f11, 9 * SIZE(X) STFD f12, 10 * SIZE(X) STFD f13, 11 * SIZE(X) FMUL f10, ALPHA, f6 FMUL f11, ALPHA, f7 FMUL f12, ALPHA, f8 FMUL f13, ALPHA, f9 STFD f10, 12 * SIZE(X) STFD f11, 13 * SIZE(X) STFD f12, 14 * SIZE(X) STFD f13, 15 * SIZE(X) addi X, X, 16 * SIZE .align 4 LL(A1I1_Remain): andi. r0, N, 15 mtspr CTR, r0 beqlr+ .align 4 LL(A1I1_RemainKernel): LFD f2, 0 * SIZE(X) FMUL f2, ALPHA, f2 STFD f2, 0 * SIZE(X) addi X, X, 1 * SIZE bdnz LL(A1I1_RemainKernel) blr .align 4 LL(A1IN): mr XX, X srawi. r0, N, 3 mtspr CTR, r0 beq- LL(A1IN_Remain) .align 4 LL(A1IN_Kernel): LFD f2, 0 * SIZE(XX) add XX, XX, INCX LFD f3, 0 * SIZE(XX) add XX, XX, INCX LFD f4, 0 * SIZE(XX) add XX, XX, INCX LFD f5, 0 * SIZE(XX) add XX, XX, INCX FMUL f2, ALPHA, f2 FMUL f3, ALPHA, f3 FMUL f4, ALPHA, f4 FMUL f5, ALPHA, f5 LFD f6, 0 * SIZE(XX) add XX, XX, INCX LFD f7, 0 * SIZE(XX) add XX, XX, INCX LFD f8, 0 * SIZE(XX) add XX, XX, INCX LFD f9, 0 * SIZE(XX) add XX, XX, INCX FMUL f6, ALPHA, f6 FMUL f7, ALPHA, f7 FMUL f8, ALPHA, f8 FMUL f9, ALPHA, f9 STFD f2, 0 * SIZE(X) add X, X, INCX STFD f3, 0 * SIZE(X) add X, X, INCX STFD f4, 0 * SIZE(X) add X, X, INCX STFD f5, 0 * SIZE(X) add X, X, INCX STFD f6, 0 * SIZE(X) add X, X, INCX STFD f7, 0 * SIZE(X) add X, X, INCX STFD f8, 0 * SIZE(X) add X, X, INCX STFD f9, 0 * SIZE(X) add X, X, INCX bdnz LL(A1IN_Kernel) .align 4 LL(A1IN_Remain): andi. 
r0, N, 7 mtspr CTR, r0 beqlr+ .align 4 LL(A1IN_RemainKernel): LFD f2, 0 * SIZE(XX) add XX, XX, INCX FMUL f2, ALPHA, f2 STFD f2, 0 * SIZE(X) add X, X, INCX bdnz LL(A1IN_RemainKernel) blr EPILOGUE OpenBLAS-0.2.20/kernel/power/scal_hummer.S000066400000000000000000000217031313527062700202060ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r6 #define INCX r7 #define INCX2 r4 #define X2 r5 #define ALPHA f1 #define A1 f0 #define A2 f16 #define A3 f2 #define A4 f3 #define A5 f4 #define A6 f5 #define A7 f6 #define A8 f7 #define B1 f8 #define B2 f9 #define B3 f10 #define B4 f11 #define B5 f12 #define B6 f13 #define B7 f14 #define B8 f15 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 li r10, 0 stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) lfpdx A1, SP, r10 # Zero clear fsmfp ALPHA, ALPHA slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, INCX, SIZE bne LL(100) fcmpu cr7, ALPHA, A1 bne cr7, LL(50) sub X, X, INCX2 andi. r0, X, 2 * SIZE - 1 beq LL(11) STFDX A1, X, INCX2 addi X, X, 1 * SIZE addi N, N, -1 cmpwi cr0, N, 0 ble LL(999) .align 4 LL(11): srawi. r0, N, 4 mtspr CTR, r0 beq- LL(15) .align 4 LL(12): STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 bdnz LL(12) .align 4 LL(15): andi. r0, N, 15 beq LL(999) andi. r0, N, 8 beq LL(16) STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 .align 4 LL(16): andi. 
r0, N, 4 beq LL(17) STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 .align 4 LL(17): andi. r0, N, 2 beq LL(18) STFPDUX A1, X, INCX2 .align 4 LL(18): andi. r0, N, 1 beq LL(999) STFDUX A1, X, INCX2 b LL(999) .align 4 LL(50): sub X2, X, INCX2 sub X, X, INCX2 andi. r0, X, 2 * SIZE - 1 beq LL(51) LFDX A1, X, INCX2 addi X, X, 1 * SIZE fmul B1, ALPHA, A1 addi N, N, -1 cmpwi cr0, N, 0 STFDX B1, X2, INCX2 addi X2, X2, 1 * SIZE ble LL(999) .align 4 LL(51): srawi. r0, N, 4 mtspr CTR, r0 beq- LL(55) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 LFPDUX A6, X, INCX2 LFPDUX A7, X, INCX2 LFPDUX A8, X, INCX2 bdz LL(53) .align 4 LL(52): fpmul B1, ALPHA, A1 LFPDUX A1, X, INCX2 fpmul B2, ALPHA, A2 LFPDUX A2, X, INCX2 fpmul B3, ALPHA, A3 LFPDUX A3, X, INCX2 fpmul B4, ALPHA, A4 LFPDUX A4, X, INCX2 fpmul B5, ALPHA, A5 LFPDUX A5, X, INCX2 fpmul B6, ALPHA, A6 LFPDUX A6, X, INCX2 fpmul B7, ALPHA, A7 LFPDUX A7, X, INCX2 fpmul B8, ALPHA, A8 LFPDUX A8, X, INCX2 STFPDUX B1, X2, INCX2 STFPDUX B2, X2, INCX2 STFPDUX B3, X2, INCX2 STFPDUX B4, X2, INCX2 STFPDUX B5, X2, INCX2 STFPDUX B6, X2, INCX2 STFPDUX B7, X2, INCX2 STFPDUX B8, X2, INCX2 bdnz LL(52) .align 4 LL(53): fpmul B1, ALPHA, A1 fpmul B2, ALPHA, A2 fpmul B3, ALPHA, A3 fpmul B4, ALPHA, A4 fpmul B5, ALPHA, A5 fpmul B6, ALPHA, A6 STFPDUX B1, X2, INCX2 fpmul B7, ALPHA, A7 STFPDUX B2, X2, INCX2 fpmul B8, ALPHA, A8 STFPDUX B3, X2, INCX2 STFPDUX B4, X2, INCX2 STFPDUX B5, X2, INCX2 STFPDUX B6, X2, INCX2 STFPDUX B7, X2, INCX2 STFPDUX B8, X2, INCX2 .align 4 LL(55): andi. r0, N, 15 beq LL(999) andi. r0, N, 8 beq LL(56) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 fpmul B1, ALPHA, A1 fpmul B2, ALPHA, A2 fpmul B3, ALPHA, A3 fpmul B4, ALPHA, A4 STFPDUX B1, X2, INCX2 STFPDUX B2, X2, INCX2 STFPDUX B3, X2, INCX2 STFPDUX B4, X2, INCX2 .align 4 LL(56): andi. r0, N, 4 beq LL(57) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 fpmul B1, ALPHA, A1 fpmul B2, ALPHA, A2 STFPDUX B1, X2, INCX2 STFPDUX B2, X2, INCX2 .align 4 LL(57): andi. r0, N, 2 beq LL(58) LFPDUX A1, X, INCX2 fpmul B1, ALPHA, A1 STFPDUX B1, X2, INCX2 .align 4 LL(58): andi. r0, N, 1 beq LL(999) LFDX A1, X, INCX2 fmul B1, ALPHA, A1 STFDX B1, X2, INCX2 b LL(999) .align 4 LL(100): fcmpu cr7, ALPHA, A1 bne cr7, LL(200) sub X, X, INCX srawi. r0, N, 3 mtspr CTR, r0 beq- LL(115) .align 4 LL(112): STFDUX A1, X, INCX STFDUX A1, X, INCX STFDUX A1, X, INCX STFDUX A1, X, INCX STFDUX A1, X, INCX STFDUX A1, X, INCX STFDUX A1, X, INCX STFDUX A1, X, INCX bdnz LL(112) .align 4 LL(115): andi. r0, N, 7 beq LL(999) andi. r0, N, 4 beq LL(117) STFDUX A1, X, INCX STFDUX A1, X, INCX STFDUX A1, X, INCX STFDUX A1, X, INCX .align 4 LL(117): andi. r0, N, 2 beq LL(118) STFDUX A1, X, INCX STFDUX A1, X, INCX .align 4 LL(118): andi. r0, N, 1 beq LL(999) STFDUX A1, X, INCX b LL(999) .align 4 LL(200): sub X2, X, INCX sub X, X, INCX srawi. 
r0, N, 3 mtspr CTR, r0 beq- LL(215) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX LFDUX A5, X, INCX LFDUX A6, X, INCX LFDUX A7, X, INCX LFDUX A8, X, INCX bdz LL(213) .align 4 LL(212): fmul B1, ALPHA, A1 LFDUX A1, X, INCX fmul B2, ALPHA, A2 LFDUX A2, X, INCX fmul B3, ALPHA, A3 LFDUX A3, X, INCX fmul B4, ALPHA, A4 LFDUX A4, X, INCX fmul B5, ALPHA, A5 LFDUX A5, X, INCX fmul B6, ALPHA, A6 LFDUX A6, X, INCX fmul B7, ALPHA, A7 LFDUX A7, X, INCX fmul B8, ALPHA, A8 LFDUX A8, X, INCX STFDUX B1, X2, INCX STFDUX B2, X2, INCX STFDUX B3, X2, INCX STFDUX B4, X2, INCX STFDUX B5, X2, INCX STFDUX B6, X2, INCX STFDUX B7, X2, INCX STFDUX B8, X2, INCX bdnz LL(212) .align 4 LL(213): fmul B1, ALPHA, A1 fmul B2, ALPHA, A2 fmul B3, ALPHA, A3 fmul B4, ALPHA, A4 fmul B5, ALPHA, A5 fmul B6, ALPHA, A6 STFDUX B1, X2, INCX fmul B7, ALPHA, A7 STFDUX B2, X2, INCX fmul B8, ALPHA, A8 STFDUX B3, X2, INCX STFDUX B4, X2, INCX STFDUX B5, X2, INCX STFDUX B6, X2, INCX STFDUX B7, X2, INCX STFDUX B8, X2, INCX .align 4 LL(215): andi. r0, N, 7 beq LL(999) andi. r0, N, 4 beq LL(217) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX A3, X, INCX LFDUX A4, X, INCX fmul B1, ALPHA, A1 fmul B2, ALPHA, A2 fmul B3, ALPHA, A3 fmul B4, ALPHA, A4 STFDUX B1, X2, INCX STFDUX B2, X2, INCX STFDUX B3, X2, INCX STFDUX B4, X2, INCX .align 4 LL(217): andi. r0, N, 2 beq LL(218) LFDUX A1, X, INCX LFDUX A2, X, INCX fmul B1, ALPHA, A1 fmul B2, ALPHA, A2 STFDUX B1, X2, INCX STFDUX B2, X2, INCX .align 4 LL(218): andi. r0, N, 1 beq LL(999) LFDUX A1, X, INCX fmul B1, ALPHA, A1 STFDUX B1, X2, INCX .align 4 LL(999): li r10, 16 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/scal_ppc440.S000066400000000000000000000127061313527062700177260ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define XX r4 #define PRE r5 #ifdef linux #ifndef __64BIT__ #define X r6 #define INCX r7 #else #define X r7 #define INCX r8 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define X r8 #define INCX r9 #else #define X r7 #define INCX r8 #endif #endif #define FZERO f0 #define ALPHA f1 PROLOGUE PROFCODE addi SP, SP, -8 li r0, 0 stw r0, 0(SP) lfs FZERO, 0(SP) addi SP, SP, 8 slwi INCX, INCX, BASE_SHIFT li PRE, 3 * 16 * SIZE cmpwi cr0, N, 0 blelr- cr0 sub X, X, INCX fcmpu cr0, FZERO, ALPHA bne- cr0, LL(A1I1) srawi. r0, N, 4 mtspr CTR, r0 beq- cr0, LL(A0I1_Remain) .align 4 LL(A0I1_kernel): #ifdef PPCG4 dcbtst X, PRE #endif STFDUX FZERO, X, INCX STFDUX FZERO, X, INCX STFDUX FZERO, X, INCX STFDUX FZERO, X, INCX #if defined(PPCG4) && defined(DOUBLE) dcbtst X, PRE #endif STFDUX FZERO, X, INCX STFDUX FZERO, X, INCX STFDUX FZERO, X, INCX STFDUX FZERO, X, INCX #ifdef PPCG4 dcbtst X, PRE #endif STFDUX FZERO, X, INCX STFDUX FZERO, X, INCX STFDUX FZERO, X, INCX STFDUX FZERO, X, INCX #if defined(PPCG4) && defined(DOUBLE) dcbtst X, PRE #endif STFDUX FZERO, X, INCX STFDUX FZERO, X, INCX STFDUX FZERO, X, INCX STFDUX FZERO, X, INCX bdnz LL(A0I1_kernel) .align 4 LL(A0I1_Remain): andi. r0, N, 15 mtspr CTR, r0 beqlr+ .align 4 LL(A0I1_RemainKernel): STFDUX FZERO, X, INCX bdnz LL(A0I1_RemainKernel) blr .align 4 LL(A1I1): mr XX, X srawi. r0, N, 3 mtspr CTR, r0 beq+ LL(A1I1_Remain) LFDUX f2, X, INCX LFDUX f3, X, INCX LFDUX f4, X, INCX LFDUX f5, X, INCX bdz LL(12) .align 4 LL(11): LFDUX f6, X, INCX FMUL f2, ALPHA, f2 LFDUX f7, X, INCX FMUL f3, ALPHA, f3 LFDUX f8, X, INCX FMUL f4, ALPHA, f4 LFDUX f9, X, INCX FMUL f5, ALPHA, f5 #ifdef PPCG4 dcbtst X, PRE #endif STFDUX f2, XX, INCX STFDUX f3, XX, INCX STFDUX f4, XX, INCX STFDUX f5, XX, INCX LFDUX f2, X, INCX FMUL f6, ALPHA, f6 LFDUX f3, X, INCX FMUL f7, ALPHA, f7 LFDUX f4, X, INCX FMUL f8, ALPHA, f8 LFDUX f5, X, INCX FMUL f9, ALPHA, f9 STFDUX f6, XX, INCX STFDUX f7, XX, INCX STFDUX f8, XX, INCX STFDUX f9, XX, INCX #if defined(PPCG4) && defined(DOUBLE) dcbtst X, PRE #endif bdnz LL(11) .align 4 LL(12): LFDUX f6, X, INCX FMUL f2, ALPHA, f2 LFDUX f7, X, INCX FMUL f3, ALPHA, f3 LFDUX f8, X, INCX FMUL f4, ALPHA, f4 LFDUX f9, X, INCX FMUL f5, ALPHA, f5 STFDUX f2, XX, INCX FMUL f6, ALPHA, f6 STFDUX f3, XX, INCX FMUL f7, ALPHA, f7 STFDUX f4, XX, INCX FMUL f8, ALPHA, f8 STFDUX f5, XX, INCX FMUL f9, ALPHA, f9 STFDUX f6, XX, INCX STFDUX f7, XX, INCX STFDUX f8, XX, INCX STFDUX f9, XX, INCX .align 4 LL(A1I1_Remain): andi. r0, N, 7 mtspr CTR, r0 beqlr+ .align 4 LL(A1I1_RemainKernel): LFDUX f2, X, INCX FMUL f2, ALPHA, f2 STFDUX f2, XX, INCX bdnz LL(A1I1_RemainKernel) blr .align 4 EPILOGUE OpenBLAS-0.2.20/kernel/power/scopy.c000066400000000000000000000056121313527062700170650ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/25 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #if defined(POWER8) #include "scopy_microk_power8.c" #endif #ifndef HAVE_KERNEL_32 static void scopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { BLASLONG i=0; FLOAT f0, f1, f2, f3, f4, f5, f6, f7; FLOAT *x1=x; FLOAT *y1=y; while ( i<n ) { f0 = x1[0]; f1 = x1[1]; f2 = x1[2]; f3 = x1[3]; f4 = x1[4]; f5 = x1[5]; f6 = x1[6]; f7 = x1[7]; y1[0] = f0; y1[1] = f1; y1[2] = f2; y1[3] = f3; y1[4] = f4; y1[5] = f5; y1[6] = f6; y1[7] = f7; x1 += 8; y1 += 8; i+=8; } return; } #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix=0,iy=0; if ( n <= 0 ) return(0); if ( (inc_x == 1) && (inc_y == 1) ) { BLASLONG n1 = n & -32; if ( n1 > 0 ) { scopy_kernel_32(n1, x, y); i=n1; } while(i < n) { y[i] = x[i] ; i++ ; } } else { while(i < n) { y[iy] = x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } } return(0); } OpenBLAS-0.2.20/kernel/power/scopy_microk_power8.c000066400000000000000000000074101313527062700217330ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/25 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_32 1 static void scopy_kernel_32 (long n, float *x, float *y) { __asm__ ( "lxvd2x 40, 0, %2 \n\t" "lxvd2x 41, %5, %2 \n\t" "lxvd2x 42, %6, %2 \n\t" "lxvd2x 43, %7, %2 \n\t" "lxvd2x 44, %8, %2 \n\t" "lxvd2x 45, %9, %2 \n\t" "lxvd2x 46, %10, %2 \n\t" "lxvd2x 47, %11, %2 \n\t" "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" "ble 2f \n\t" ".p2align 5 \n" "1: \n\t" "stxvd2x 40, 0, %3 \n\t" "stxvd2x 41, %5, %3 \n\t" "lxvd2x 40, 0, %2 \n\t" "lxvd2x 41, %5, %2 \n\t" "stxvd2x 42, %6, %3 \n\t" "stxvd2x 43, %7, %3 \n\t" "lxvd2x 42, %6, %2 \n\t" "lxvd2x 43, %7, %2 \n\t" "stxvd2x 44, %8, %3 \n\t" "stxvd2x 45, %9, %3 \n\t" "lxvd2x 44, %8, %2 \n\t" "lxvd2x 45, %9, %2 \n\t" "stxvd2x 46, %10, %3 \n\t" "stxvd2x 47, %11, %3 \n\t" "lxvd2x 46, %10, %2 \n\t" "lxvd2x 47, %11, %2 \n\t" "addi %3, %3, 128 \n\t" "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" "bgt 1b \n" "2: \n\t" "stxvd2x 40, 0, %3 \n\t" "stxvd2x 41, %5, %3 \n\t" "stxvd2x 42, %6, %3 \n\t" "stxvd2x 43, %7, %3 \n\t" "stxvd2x 44, %8, %3 \n\t" "stxvd2x 45, %9, %3 \n\t" "stxvd2x 46, %10, %3 \n\t" "stxvd2x 47, %11, %3 \n" "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : "=m" (*y), "+r" (n), // 1 "+b" (x), // 2 "+b" (y) // 3 : "m" (*x), "b" (16), // 5 "b" (32), // 6 "b" (48), // 7 "b" (64), // 8 "b" (80), // 9 "b" (96), // 10 "b" (112) // 11 : "cr0", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" ); } OpenBLAS-0.2.20/kernel/power/sdot.c000066400000000000000000000061331313527062700167000ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/21 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #if defined(POWER8) #include "sdot_microk_power8.c" #endif #ifndef HAVE_KERNEL_16 static FLOAT sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; FLOAT dot = 0.0; while(i < n) { dot += y[i] * x[i] + y[i+1] * x[i+1] + y[i+2] * x[i+2] + y[i+3] * x[i+3] + y[i+4] * x[i+4] + y[i+5] * x[i+5] + y[i+6] * x[i+6] + y[i+7] * x[i+7] ; i+=8 ; } return dot; } #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix=0,iy=0; FLOAT dot = 0.0 ; if ( n <= 0 ) return(dot); if ( (inc_x == 1) && (inc_y == 1) ) { BLASLONG n1 = n & -32; if ( n1 ) dot = sdot_kernel_16(n1, x, y); i = n1; while(i < n) { dot += y[i] * x[i] ; i++ ; } return(dot); } BLASLONG n1 = n & -2; while(i < n1) { dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; ix += inc_x*2 ; iy += inc_y*2 ; i+=2 ; } while(i < n) { dot += y[iy] * x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } return(dot); } OpenBLAS-0.2.20/kernel/power/sdot_microk_power8.c000066400000000000000000000127201313527062700215470ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/03/21 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_16 1 static float sdot_kernel_16 (long n, float *x, float *y) { float dot; __vector float t0; __vector float t1; __vector float t2; __vector float t3; __asm__ ( "dcbt 0, %2 \n\t" "dcbt 0, %3 \n\t" "xxlxor 32, 32, 32 \n\t" "xxlxor 33, 33, 33 \n\t" "xxlxor 34, 34, 34 \n\t" "xxlxor 35, 35, 35 \n\t" "xxlxor 36, 36, 36 \n\t" "xxlxor 37, 37, 37 \n\t" "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" "lxvd2x 40, 0, %2 \n\t" "lxvd2x 48, 0, %3 \n\t" "lxvd2x 41, %10, %2 \n\t" "lxvd2x 49, %10, %3 \n\t" "lxvd2x 42, %11, %2 \n\t" "lxvd2x 50, %11, %3 \n\t" "lxvd2x 43, %12, %2 \n\t" "lxvd2x 51, %12, %3 \n\t" "lxvd2x 44, %13, %2 \n\t" "lxvd2x %x4, %13, %3 \n\t" "lxvd2x 45, %14, %2 \n\t" "lxvd2x %x5, %14, %3 \n\t" "lxvd2x 46, %15, %2 \n\t" "lxvd2x %x6, %15, %3 \n\t" "lxvd2x 47, %16, %2 \n\t" "lxvd2x %x7, %16, %3 \n\t" "addi %2, %2, 128 \n\t" "addi %3, %3, 128 \n\t" "addic. %1, %1, -32 \n\t" "ble 2f \n\t" ".p2align 5 \n" "1: \n\t" "xvmaddasp 32, 40, 48 \n\t" "lxvd2x 40, 0, %2 \n\t" "lxvd2x 48, 0, %3 \n\t" "xvmaddasp 33, 41, 49 \n\t" "lxvd2x 41, %10, %2 \n\t" "lxvd2x 49, %10, %3 \n\t" "xvmaddasp 34, 42, 50 \n\t" "lxvd2x 42, %11, %2 \n\t" "lxvd2x 50, %11, %3 \n\t" "xvmaddasp 35, 43, 51 \n\t" "lxvd2x 43, %12, %2 \n\t" "lxvd2x 51, %12, %3 \n\t" "xvmaddasp 36, 44, %x4 \n\t" "lxvd2x 44, %13, %2 \n\t" "lxvd2x %x4, %13, %3 \n\t" "xvmaddasp 37, 45, %x5 \n\t" "lxvd2x 45, %14, %2 \n\t" "lxvd2x %x5, %14, %3 \n\t" "xvmaddasp 38, 46, %x6 \n\t" "lxvd2x 46, %15, %2 \n\t" "lxvd2x %x6, %15, %3 \n\t" "xvmaddasp 39, 47, %x7 \n\t" "lxvd2x 47, %16, %2 \n\t" "lxvd2x %x7, %16, %3 \n\t" "addi %2, %2, 128 \n\t" "addi %3, %3, 128 \n\t" "addic. %1, %1, -32 \n\t" "bgt 1b \n" "2: \n\t" "xvmaddasp 32, 40, 48 \n\t" "xvmaddasp 33, 41, 49 \n\t" "xvmaddasp 34, 42, 50 \n\t" "xvmaddasp 35, 43, 51 \n\t" "xvmaddasp 36, 44, %x4 \n\t" "xvmaddasp 37, 45, %x5 \n\t" "xvmaddasp 38, 46, %x6 \n\t" "xvmaddasp 39, 47, %x7 \n\t" "xvaddsp 32, 32, 33 \n\t" "xvaddsp 34, 34, 35 \n\t" "xvaddsp 36, 36, 37 \n\t" "xvaddsp 38, 38, 39 \n\t" "xvaddsp 32, 32, 34 \n\t" "xvaddsp 36, 36, 38 \n\t" "xvaddsp 32, 32, 36 \n\t" "xxsldwi 33, 32, 32, 2 \n\t" "xvaddsp 32, 32, 33 \n\t" "xxsldwi 33, 32, 32, 1 \n\t" "xvaddsp 32, 32, 33 \n\t" "xscvspdp %x0, 32 \n" "#dot=%0 n=%1 x=%8=%2 y=%9=%3 o16=%10 o32=%11 o48=%12 o64=%13 o80=%14 o96=%15 o122=%16\n" "#t0=%x4 t1=%x5 t2=%x6 t3=%x7" : "=f" (dot), // 0 "+r" (n), // 1 "+b" (x), // 2 "+b" (y), // 3 "=wa" (t0), // 4 "=wa" (t1), // 5 "=wa" (t2), // 6 "=wa" (t3) // 7 : "m" (*x), "m" (*y), "b" (16), // 10 "b" (32), // 11 "b" (48), // 12 "b" (64), // 13 "b" (80), // 14 "b" (96), // 15 "b" (112) // 16 : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48","vs49","vs50","vs51" ); return dot; } OpenBLAS-0.2.20/kernel/power/sgemm_kernel_16x8_power8.S000066400000000000000000000221561313527062700224540ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/04/21 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "def_vsx.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 32752 #define ALPHA_SP 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA_SP 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define alpha_r vs30 #define alpha_vr vs31 #define o0 0 #define FRAMEPOINTER r12 #define BBUFFER r14 #define o4 r15 #define o12 r16 #define o8 r17 #define L r18 #define T1 r19 #define KK r20 #define BBO r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define CO r26 #define o16 r27 #define o32 r28 #define o48 r29 #define PRE r30 #define T2 r31 #include "sgemm_macros_16x8_power8.S" #ifndef NEEDPARAM PROLOGUE PROFCODE mr FRAMEPOINTER, SP addi SP, SP, -STACKSIZE addi SP, SP, -STACKSIZE addi SP, SP, -STACKSIZE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) std r14, 280(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) stw r18, 196(SP) stw r17, 200(SP) stw r16, 204(SP) stw r15, 208(SP) stw r14, 212(SP) #endif // stfd f1, ALPHA_SP // stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif slwi LDC, LDC, 2 #if defined(TRMMKERNEL) #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #else lwz OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif #endif #endif cmpwi cr0, M, 0 ble L999_H1 cmpwi cr0, N, 0 ble L999_H1 cmpwi cr0, K, 0 ble L999_H1 li PRE, 256 li o4 , 4 li o8 , 8 li o12, 12 li o16, 16 li o32, 32 li o48, 48 addi BBUFFER, SP, 512+4096 li T1, -4096 and BBUFFER, BBUFFER, T1 addi T1, SP, 300 stxsspx f1, o0 , T1 stxsspx f1, o4 , T1 stxsspx f1, o8 , T1 stxsspx f1, o12 , T1 lxsspx alpha_r, o0, T1 lxvw4x alpha_vr, o0, T1 #include 
"sgemm_logic_16x8_power8.S" L999: addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) lwz r18, 196(SP) lwz r17, 200(SP) lwz r16, 204(SP) lwz r15, 208(SP) lwz r14, 212(SP) #endif addi SP, SP, STACKSIZE addi SP, SP, STACKSIZE addi SP, SP, STACKSIZE addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/sgemm_logic_16x8_power8.S000066400000000000000000000711471313527062700222750ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/04/21 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ srawi. J, N, 3 ble SGEMM_L8_END SGEMM_L8_BEGIN: mr BO, B mr BBO, BBUFFER srawi. T1, K, 2 ble SGEMM_L8_COPYB1 SGEMM_L8_COPYB4: dcbt BO, PRE dcbtst BBO, PRE COPYB_4x8 addic. T1, T1, -1 ble SGEMM_L8_COPYB1 dcbtst BBO, PRE COPYB_4x8 addic. T1, T1, -1 ble SGEMM_L8_COPYB1 dcbtst BBO, PRE COPYB_4x8 addic. T1, T1, -1 ble SGEMM_L8_COPYB1 dcbtst BBO, PRE COPYB_4x8 addic. 
T1, T1, -1 bgt SGEMM_L8_COPYB4 SGEMM_L8_COPYB1: andi. T1, K, 3 ble SGEMM_L8_COPYB_END SGEMM_L8_COPYB1_LOOP: COPYB_1x8 addic. T1, T1, -1 bgt SGEMM_L8_COPYB1_LOOP SGEMM_L8_COPYB_END: mr CO, C mr AO, A slwi T1, LDC , 3 add C, C, T1 srawi. I, M, 4 ble SGEMM_L8x16_END SGEMM_L8x16_BEGIN: mr BO, BBUFFER srawi. L, K, 3 ble SGEMM_L8x16_SUB0 cmpwi cr0, L, 1 ble SGEMM_L8x16_SUB4 SGEMM_L8x16_LOOP_START: dcbt AO, PRE dcbt BO, PRE LOAD8x16_1 dcbt BO, PRE KERNEL8x16_I1 dcbt AO, PRE dcbt BO, PRE KERNEL8x16_2 dcbt BO, PRE KERNEL8x16_1 dcbt AO, PRE dcbt BO, PRE KERNEL8x16_2 dcbt BO, PRE KERNEL8x16_1 dcbt AO, PRE dcbt BO, PRE KERNEL8x16_2 dcbt BO, PRE KERNEL8x16_1 dcbt AO, PRE dcbt BO, PRE KERNEL8x16_2 addic. L, L, -2 ble SGEMM_L8x16_LOOP_END .align 5 SGEMM_L8x16_LOOP: dcbt BO, PRE KERNEL8x16_1 dcbt AO, PRE dcbt BO, PRE KERNEL8x16_2 dcbt BO, PRE KERNEL8x16_1 dcbt AO, PRE dcbt BO, PRE KERNEL8x16_2 dcbt BO, PRE KERNEL8x16_1 dcbt AO, PRE dcbt BO, PRE KERNEL8x16_2 dcbt BO, PRE KERNEL8x16_1 dcbt AO, PRE dcbt BO, PRE KERNEL8x16_2 addic. L, L, -1 bgt SGEMM_L8x16_LOOP SGEMM_L8x16_LOOP_END: dcbt BO, PRE KERNEL8x16_1 dcbt AO, PRE dcbt BO, PRE KERNEL8x16_2 dcbt BO, PRE KERNEL8x16_1 dcbt AO, PRE KERNEL8x16_2 KERNEL8x16_1 dcbt AO, PRE KERNEL8x16_2 KERNEL8x16_1 KERNEL8x16_E2 b SGEMM_L8x16_SUB1 SGEMM_L8x16_SUB4: dcbt AO, PRE KERNEL8x16_SUBI1 KERNEL8x16_SUB1 dcbt AO, PRE KERNEL8x16_SUB1 KERNEL8x16_SUB1 KERNEL8x16_SUB1 KERNEL8x16_SUB1 KERNEL8x16_SUB1 KERNEL8x16_SUB1 b SGEMM_L8x16_SUB1 SGEMM_L8x16_SUB0: andi. L, K, 7 KERNEL8x16_SUBI1 addic. L, L, -1 ble SGEMM_L8x16_SAVE b SGEMM_L8x16_SUB2 SGEMM_L8x16_SUB1: andi. L, K, 7 ble SGEMM_L8x16_SAVE SGEMM_L8x16_SUB2: KERNEL8x16_SUB1 addic. L, L, -1 bgt SGEMM_L8x16_SUB2 SGEMM_L8x16_SAVE: SAVE8x16 addic. I, I, -1 bgt SGEMM_L8x16_BEGIN SGEMM_L8x16_END: SGEMM_L8x8_BEGIN: andi. T2, M, 15 ble SGEMM_L8x1_END andi. T1, M, 8 ble SGEMM_L8x8_END mr BO, BBUFFER srawi. L, K, 3 ble SGEMM_L8x8_SUB0 cmpwi cr0, L, 1 ble SGEMM_L8x8_SUB4 SGEMM_L8x8_LOOP_START: LOAD8x8_1 KERNEL8x8_I1 KERNEL8x8_2 KERNEL8x8_1 KERNEL8x8_2 KERNEL8x8_1 KERNEL8x8_2 KERNEL8x8_1 KERNEL8x8_2 addic. L, L, -2 ble SGEMM_L8x8_LOOP_END .align 5 SGEMM_L8x8_LOOP: KERNEL8x8_1 KERNEL8x8_2 KERNEL8x8_1 KERNEL8x8_2 KERNEL8x8_1 KERNEL8x8_2 KERNEL8x8_1 KERNEL8x8_2 addic. L, L, -1 bgt SGEMM_L8x8_LOOP SGEMM_L8x8_LOOP_END: KERNEL8x8_1 KERNEL8x8_2 KERNEL8x8_1 KERNEL8x8_2 KERNEL8x8_1 KERNEL8x8_2 KERNEL8x8_1 KERNEL8x8_E2 b SGEMM_L8x8_SUB1 SGEMM_L8x8_SUB4: KERNEL8x8_SUBI1 KERNEL8x8_SUB1 KERNEL8x8_SUB1 KERNEL8x8_SUB1 KERNEL8x8_SUB1 KERNEL8x8_SUB1 KERNEL8x8_SUB1 KERNEL8x8_SUB1 b SGEMM_L8x8_SUB1 SGEMM_L8x8_SUB0: andi. L, K, 7 KERNEL8x8_SUBI1 addic. L, L, -1 ble SGEMM_L8x8_SAVE b SGEMM_L8x8_SUB2 SGEMM_L8x8_SUB1: andi. L, K, 7 ble SGEMM_L8x8_SAVE SGEMM_L8x8_SUB2: KERNEL8x8_SUB1 addic. L, L, -1 bgt SGEMM_L8x8_SUB2 SGEMM_L8x8_SAVE: SAVE8x8 SGEMM_L8x8_END: SGEMM_L8x4_BEGIN: andi. T1, M, 4 ble SGEMM_L8x4_END mr BO, BBUFFER srawi. L, K, 3 ble SGEMM_L8x4_SUB0 cmpwi cr0, L, 1 ble SGEMM_L8x4_SUB4 SGEMM_L8x4_LOOP_START: LOAD8x4_1 KERNEL8x4_I1 KERNEL8x4_2 KERNEL8x4_1 KERNEL8x4_2 KERNEL8x4_1 KERNEL8x4_2 KERNEL8x4_1 KERNEL8x4_2 addic. L, L, -2 ble SGEMM_L8x4_LOOP_END .align 5 SGEMM_L8x4_LOOP: KERNEL8x4_1 KERNEL8x4_2 KERNEL8x4_1 KERNEL8x4_2 KERNEL8x4_1 KERNEL8x4_2 KERNEL8x4_1 KERNEL8x4_2 addic. 
L, L, -1 bgt SGEMM_L8x4_LOOP SGEMM_L8x4_LOOP_END: KERNEL8x4_1 KERNEL8x4_2 KERNEL8x4_1 KERNEL8x4_2 KERNEL8x4_1 KERNEL8x4_2 KERNEL8x4_1 KERNEL8x4_E2 b SGEMM_L8x4_SUB1 SGEMM_L8x4_SUB4: KERNEL8x4_SUBI1 KERNEL8x4_SUB1 KERNEL8x4_SUB1 KERNEL8x4_SUB1 KERNEL8x4_SUB1 KERNEL8x4_SUB1 KERNEL8x4_SUB1 KERNEL8x4_SUB1 b SGEMM_L8x4_SUB1 SGEMM_L8x4_SUB0: andi. L, K, 7 KERNEL8x4_SUBI1 addic. L, L, -1 ble SGEMM_L8x4_SAVE b SGEMM_L8x4_SUB2 SGEMM_L8x4_SUB1: andi. L, K, 7 ble SGEMM_L8x4_SAVE SGEMM_L8x4_SUB2: KERNEL8x4_SUB1 addic. L, L, -1 bgt SGEMM_L8x4_SUB2 SGEMM_L8x4_SAVE: SAVE8x4 SGEMM_L8x4_END: SGEMM_L8x2_BEGIN: andi. T1, M, 2 ble SGEMM_L8x2_END mr BO, BBUFFER srawi. L, K, 3 ble SGEMM_L8x2_SUB0 cmpwi cr0, L, 1 ble SGEMM_L8x2_SUB4 SGEMM_L8x2_LOOP_START: LOAD8x2_1 KERNEL8x2_I1 KERNEL8x2_2 KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_1 KERNEL8x2_2 addic. L, L, -2 ble SGEMM_L8x2_LOOP_END .align 5 SGEMM_L8x2_LOOP: KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_1 KERNEL8x2_2 addic. L, L, -1 bgt SGEMM_L8x2_LOOP SGEMM_L8x2_LOOP_END: KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_1 KERNEL8x2_E2 b SGEMM_L8x2_SUB1 SGEMM_L8x2_SUB4: KERNEL8x2_SUBI1 KERNEL8x2_SUB1 KERNEL8x2_SUB1 KERNEL8x2_SUB1 KERNEL8x2_SUB1 KERNEL8x2_SUB1 KERNEL8x2_SUB1 KERNEL8x2_SUB1 b SGEMM_L8x2_SUB1 SGEMM_L8x2_SUB0: andi. L, K, 7 KERNEL8x2_SUBI1 addic. L, L, -1 ble SGEMM_L8x2_SAVE b SGEMM_L8x2_SUB2 SGEMM_L8x2_SUB1: andi. L, K, 7 ble SGEMM_L8x2_SAVE SGEMM_L8x2_SUB2: KERNEL8x2_SUB1 addic. L, L, -1 bgt SGEMM_L8x2_SUB2 SGEMM_L8x2_SAVE: SAVE8x2 SGEMM_L8x2_END: SGEMM_L8x1_BEGIN: andi. T1, M, 1 ble SGEMM_L8x1_END mr BO, BBUFFER srawi. L, K, 3 ble SGEMM_L8x1_SUB0 cmpwi cr0, L, 1 ble SGEMM_L8x1_SUB4 SGEMM_L8x1_LOOP_START: LOAD8x1_1 KERNEL8x1_I1 KERNEL8x1_2 KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_1 KERNEL8x1_2 addic. L, L, -2 ble SGEMM_L8x1_LOOP_END .align 5 SGEMM_L8x1_LOOP: KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_1 KERNEL8x1_2 addic. L, L, -1 bgt SGEMM_L8x1_LOOP SGEMM_L8x1_LOOP_END: KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_1 KERNEL8x1_E2 b SGEMM_L8x1_SUB1 SGEMM_L8x1_SUB4: KERNEL8x1_SUBI1 KERNEL8x1_SUB1 KERNEL8x1_SUB1 KERNEL8x1_SUB1 KERNEL8x1_SUB1 KERNEL8x1_SUB1 KERNEL8x1_SUB1 KERNEL8x1_SUB1 b SGEMM_L8x1_SUB1 SGEMM_L8x1_SUB0: andi. L, K, 7 KERNEL8x1_SUBI1 addic. L, L, -1 ble SGEMM_L8x1_SAVE b SGEMM_L8x1_SUB2 SGEMM_L8x1_SUB1: andi. L, K, 7 ble SGEMM_L8x1_SAVE SGEMM_L8x1_SUB2: KERNEL8x1_SUB1 addic. L, L, -1 bgt SGEMM_L8x1_SUB2 SGEMM_L8x1_SAVE: SAVE8x1 SGEMM_L8x1_END: slwi T1, K, 5 add B, B, T1 addic. J, J, -1 bgt SGEMM_L8_BEGIN andi. T2, N, 7 ble L999 SGEMM_L8_END: b SGEMM_L4_BEGIN L999_H1: b L999 SGEMM_L4_BEGIN: mr BO, B mr BBO, BBUFFER slwi T1, K, 2 SGEMM_L4_COPYB: dcbtst BBO, PRE lxvw4x vs3, o0, BO lxvw4x vs11, o16, BO xxspltw vs4, vs3, 0 xxspltw vs5, vs3, 1 xxspltw vs6, vs3, 2 xxspltw vs7, vs3, 3 xxspltw vs12, vs11, 0 xxspltw vs13, vs11, 1 xxspltw vs14, vs11, 2 xxspltw vs15, vs11, 3 stxvw4x vs4, o0, BBO stxvw4x vs5, o16, BBO stxvw4x vs6, o32, BBO stxvw4x vs7, o48, BBO addi BO, BO, 32 addi BBO, BBO, 64 stxvw4x vs12, o0, BBO stxvw4x vs13, o16, BBO stxvw4x vs14, o32, BBO stxvw4x vs15, o48, BBO addic. T1, T1, -8 addi BBO, BBO, 64 bge SGEMM_L4_COPYB andi. T1, N, 4 ble SGEMM_L4_END mr CO, C mr AO, A slwi T1, LDC , 2 add C, C, T1 srawi. I, M, 4 ble SGEMM_L4x16_END SGEMM_L4x16_BEGIN: mr BO, BBUFFER srawi. 
L, K, 3 ble SGEMM_L4x16_SUB0 cmpwi cr0, L, 1 ble SGEMM_L4x16_SUB4 SGEMM_L4x16_LOOP_START: dcbt AO, PRE LOAD4x16_1 KERNEL4x16_I1 dcbt AO, PRE KERNEL4x16_2 KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 addic. L, L, -2 ble SGEMM_L4x16_LOOP_END .align 5 SGEMM_L4x16_LOOP: KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 addic. L, L, -1 bgt SGEMM_L4x16_LOOP SGEMM_L4x16_LOOP_END: KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 KERNEL4x16_1 KERNEL4x16_E2 b SGEMM_L4x16_SUB1 SGEMM_L4x16_SUB4: dcbt AO, PRE KERNEL4x16_SUBI1 KERNEL4x16_SUB1 dcbt AO, PRE KERNEL4x16_SUB1 KERNEL4x16_SUB1 KERNEL4x16_SUB1 KERNEL4x16_SUB1 KERNEL4x16_SUB1 KERNEL4x16_SUB1 b SGEMM_L4x16_SUB1 SGEMM_L4x16_SUB0: andi. L, K, 7 KERNEL4x16_SUBI1 addic. L, L, -1 ble SGEMM_L4x16_SAVE b SGEMM_L4x16_SUB2 SGEMM_L4x16_SUB1: andi. L, K, 7 ble SGEMM_L4x16_SAVE SGEMM_L4x16_SUB2: KERNEL4x16_SUB1 addic. L, L, -1 bgt SGEMM_L4x16_SUB2 SGEMM_L4x16_SAVE: SAVE4x16 addic. I, I, -1 bgt SGEMM_L4x16_BEGIN SGEMM_L4x16_END: SGEMM_L4x8_BEGIN: andi. T2, M, 15 ble SGEMM_L4x1_END andi. T1, M, 8 ble SGEMM_L4x8_END mr BO, BBUFFER srawi. L, K, 3 ble SGEMM_L4x8_SUB0 cmpwi cr0, L, 1 ble SGEMM_L4x8_SUB4 SGEMM_L4x8_LOOP_START: LOAD4x8_1 KERNEL4x8_I1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 addic. L, L, -2 ble SGEMM_L4x8_LOOP_END .align 5 SGEMM_L4x8_LOOP: KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 addic. L, L, -1 bgt SGEMM_L4x8_LOOP SGEMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_E2 b SGEMM_L4x8_SUB1 SGEMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 b SGEMM_L4x8_SUB1 SGEMM_L4x8_SUB0: andi. L, K, 7 KERNEL4x8_SUBI1 addic. L, L, -1 ble SGEMM_L4x8_SAVE b SGEMM_L4x8_SUB2 SGEMM_L4x8_SUB1: andi. L, K, 7 ble SGEMM_L4x8_SAVE SGEMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. L, L, -1 bgt SGEMM_L4x8_SUB2 SGEMM_L4x8_SAVE: SAVE4x8 SGEMM_L4x8_END: SGEMM_L4x4_BEGIN: andi. T1, M, 4 ble SGEMM_L4x4_END mr BO, BBUFFER srawi. L, K, 3 ble SGEMM_L4x4_SUB0 cmpwi cr0, L, 1 ble SGEMM_L4x4_SUB4 SGEMM_L4x4_LOOP_START: LOAD4x4_1 KERNEL4x4_I1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 addic. L, L, -2 ble SGEMM_L4x4_LOOP_END .align 5 SGEMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 addic. L, L, -1 bgt SGEMM_L4x4_LOOP SGEMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_E2 b SGEMM_L4x4_SUB1 SGEMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 b SGEMM_L4x4_SUB1 SGEMM_L4x4_SUB0: andi. L, K, 7 KERNEL4x4_SUBI1 addic. L, L, -1 ble SGEMM_L4x4_SAVE b SGEMM_L4x4_SUB2 SGEMM_L4x4_SUB1: andi. L, K, 7 ble SGEMM_L4x4_SAVE SGEMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. L, L, -1 bgt SGEMM_L4x4_SUB2 SGEMM_L4x4_SAVE: SAVE4x4 SGEMM_L4x4_END: SGEMM_L4x2_BEGIN: andi. T1, M, 2 ble SGEMM_L4x2_END mr BO, BBUFFER srawi. 
L, K, 3 ble SGEMM_L4x2_SUB0 cmpwi cr0, L, 1 ble SGEMM_L4x2_SUB4 SGEMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 addic. L, L, -2 ble SGEMM_L4x2_LOOP_END .align 5 SGEMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 addic. L, L, -1 bgt SGEMM_L4x2_LOOP SGEMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_E2 b SGEMM_L4x2_SUB1 SGEMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 b SGEMM_L4x2_SUB1 SGEMM_L4x2_SUB0: andi. L, K, 7 KERNEL4x2_SUBI1 addic. L, L, -1 ble SGEMM_L4x2_SAVE b SGEMM_L4x2_SUB2 SGEMM_L4x2_SUB1: andi. L, K, 7 ble SGEMM_L4x2_SAVE SGEMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. L, L, -1 bgt SGEMM_L4x2_SUB2 SGEMM_L4x2_SAVE: SAVE4x2 SGEMM_L4x2_END: SGEMM_L4x1_BEGIN: andi. T1, M, 1 ble SGEMM_L4x1_END mr BO, BBUFFER srawi. L, K, 3 ble SGEMM_L4x1_SUB0 cmpwi cr0, L, 1 ble SGEMM_L4x1_SUB4 SGEMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 addic. L, L, -2 ble SGEMM_L4x1_LOOP_END .align 5 SGEMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 addic. L, L, -1 bgt SGEMM_L4x1_LOOP SGEMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_E2 b SGEMM_L4x1_SUB1 SGEMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 b SGEMM_L4x1_SUB1 SGEMM_L4x1_SUB0: andi. L, K, 7 KERNEL4x1_SUBI1 addic. L, L, -1 ble SGEMM_L4x1_SAVE b SGEMM_L4x1_SUB2 SGEMM_L4x1_SUB1: andi. L, K, 7 ble SGEMM_L4x1_SAVE SGEMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. L, L, -1 bgt SGEMM_L4x1_SUB2 SGEMM_L4x1_SAVE: SAVE4x1 SGEMM_L4x1_END: slwi T1, K, 4 add B, B, T1 SGEMM_L4_END: SGEMM_L2_BEGIN: mr BO, B mr BBO, BBUFFER slwi T1, K, 1 SGEMM_L2_COPYB: dcbtst BBO, PRE lxvw4x vs3, o0, BO lxvw4x vs11, o16, BO xxspltw vs4, vs3, 0 xxspltw vs5, vs3, 1 xxspltw vs6, vs3, 2 xxspltw vs7, vs3, 3 xxspltw vs12, vs11, 0 xxspltw vs13, vs11, 1 xxspltw vs14, vs11, 2 xxspltw vs15, vs11, 3 stxvw4x vs4, o0, BBO stxvw4x vs5, o16, BBO stxvw4x vs6, o32, BBO stxvw4x vs7, o48, BBO addi BO, BO, 32 addi BBO, BBO, 64 stxvw4x vs12, o0, BBO stxvw4x vs13, o16, BBO stxvw4x vs14, o32, BBO stxvw4x vs15, o48, BBO addic. T1, T1, -8 addi BBO, BBO, 64 bge SGEMM_L2_COPYB andi. T1, N, 2 ble SGEMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 srawi. I, M, 4 ble SGEMM_L2x16_END SGEMM_L2x16_BEGIN: mr BO, BBUFFER srawi. L, K, 3 ble SGEMM_L2x16_SUB0 cmpwi cr0, L, 1 ble SGEMM_L2x16_SUB4 SGEMM_L2x16_LOOP_START: dcbt AO, PRE LOAD2x16_1 KERNEL2x16_I1 dcbt AO, PRE KERNEL2x16_2 KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 addic. L, L, -2 ble SGEMM_L2x16_LOOP_END .align 5 SGEMM_L2x16_LOOP: KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 addic. 
L, L, -1 bgt SGEMM_L2x16_LOOP SGEMM_L2x16_LOOP_END: KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 KERNEL2x16_1 KERNEL2x16_E2 b SGEMM_L2x16_SUB1 SGEMM_L2x16_SUB4: dcbt AO, PRE KERNEL2x16_SUBI1 KERNEL2x16_SUB1 dcbt AO, PRE KERNEL2x16_SUB1 KERNEL2x16_SUB1 KERNEL2x16_SUB1 KERNEL2x16_SUB1 KERNEL2x16_SUB1 KERNEL2x16_SUB1 b SGEMM_L2x16_SUB1 SGEMM_L2x16_SUB0: andi. L, K, 7 KERNEL2x16_SUBI1 addic. L, L, -1 ble SGEMM_L2x16_SAVE b SGEMM_L2x16_SUB2 SGEMM_L2x16_SUB1: andi. L, K, 7 ble SGEMM_L2x16_SAVE SGEMM_L2x16_SUB2: KERNEL2x16_SUB1 addic. L, L, -1 bgt SGEMM_L2x16_SUB2 SGEMM_L2x16_SAVE: SAVE2x16 addic. I, I, -1 bgt SGEMM_L2x16_BEGIN SGEMM_L2x16_END: SGEMM_L2x8_BEGIN: andi. T2, M, 15 ble SGEMM_L2x1_END andi. T1, M, 8 ble SGEMM_L2x8_END mr BO, BBUFFER srawi. L, K, 3 ble SGEMM_L2x8_SUB0 cmpwi cr0, L, 1 ble SGEMM_L2x8_SUB4 SGEMM_L2x8_LOOP_START: LOAD2x8_1 KERNEL2x8_I1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 addic. L, L, -2 ble SGEMM_L2x8_LOOP_END .align 5 SGEMM_L2x8_LOOP: KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 addic. L, L, -1 bgt SGEMM_L2x8_LOOP SGEMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_E2 b SGEMM_L2x8_SUB1 SGEMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 b SGEMM_L2x8_SUB1 SGEMM_L2x8_SUB0: andi. L, K, 7 KERNEL2x8_SUBI1 addic. L, L, -1 ble SGEMM_L2x8_SAVE b SGEMM_L2x8_SUB2 SGEMM_L2x8_SUB1: andi. L, K, 7 ble SGEMM_L2x8_SAVE SGEMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 bgt SGEMM_L2x8_SUB2 SGEMM_L2x8_SAVE: SAVE2x8 SGEMM_L2x8_END: SGEMM_L2x4_BEGIN: andi. T1, M, 4 ble SGEMM_L2x4_END mr BO, BBUFFER srawi. L, K, 3 ble SGEMM_L2x4_SUB0 cmpwi cr0, L, 1 ble SGEMM_L2x4_SUB4 SGEMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 addic. L, L, -2 ble SGEMM_L2x4_LOOP_END .align 5 SGEMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 addic. L, L, -1 bgt SGEMM_L2x4_LOOP SGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_E2 b SGEMM_L2x4_SUB1 SGEMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 b SGEMM_L2x4_SUB1 SGEMM_L2x4_SUB0: andi. L, K, 7 KERNEL2x4_SUBI1 addic. L, L, -1 ble SGEMM_L2x4_SAVE b SGEMM_L2x4_SUB2 SGEMM_L2x4_SUB1: andi. L, K, 7 ble SGEMM_L2x4_SAVE SGEMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 bgt SGEMM_L2x4_SUB2 SGEMM_L2x4_SAVE: SAVE2x4 SGEMM_L2x4_END: SGEMM_L2x2_BEGIN: andi. T1, M, 2 ble SGEMM_L2x2_END mr BO, BBUFFER srawi. L, K, 3 ble SGEMM_L2x2_SUB0 cmpwi cr0, L, 1 ble SGEMM_L2x2_SUB4 SGEMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 addic. L, L, -2 ble SGEMM_L2x2_LOOP_END .align 5 SGEMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 addic. 
L, L, -1 bgt SGEMM_L2x2_LOOP SGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_E2 b SGEMM_L2x2_SUB1 SGEMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 b SGEMM_L2x2_SUB1 SGEMM_L2x2_SUB0: andi. L, K, 7 KERNEL2x2_SUBI1 addic. L, L, -1 ble SGEMM_L2x2_SAVE b SGEMM_L2x2_SUB2 SGEMM_L2x2_SUB1: andi. L, K, 7 ble SGEMM_L2x2_SAVE SGEMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 bgt SGEMM_L2x2_SUB2 SGEMM_L2x2_SAVE: SAVE2x2 SGEMM_L2x2_END: SGEMM_L2x1_BEGIN: andi. T1, M, 1 ble SGEMM_L2x1_END mr BO, BBUFFER srawi. L, K, 3 ble SGEMM_L2x1_SUB0 cmpwi cr0, L, 1 ble SGEMM_L2x1_SUB4 SGEMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 addic. L, L, -2 ble SGEMM_L2x1_LOOP_END .align 5 SGEMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 addic. L, L, -1 bgt SGEMM_L2x1_LOOP SGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_E2 b SGEMM_L2x1_SUB1 SGEMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 b SGEMM_L2x1_SUB1 SGEMM_L2x1_SUB0: andi. L, K, 7 KERNEL2x1_SUBI1 addic. L, L, -1 ble SGEMM_L2x1_SAVE b SGEMM_L2x1_SUB2 SGEMM_L2x1_SUB1: andi. L, K, 7 ble SGEMM_L2x1_SAVE SGEMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 bgt SGEMM_L2x1_SUB2 SGEMM_L2x1_SAVE: SAVE2x1 SGEMM_L2x1_END: slwi T1, K, 3 add B, B, T1 SGEMM_L2_END: SGEMM_L1_BEGIN: mr BO, B mr BBO, BBUFFER slwi T1, K, 0 SGEMM_L1_COPYB: dcbtst BBO, PRE lxvw4x vs3, o0, BO lxvw4x vs11, o16, BO xxspltw vs4, vs3, 0 xxspltw vs5, vs3, 1 xxspltw vs6, vs3, 2 xxspltw vs7, vs3, 3 xxspltw vs12, vs11, 0 xxspltw vs13, vs11, 1 xxspltw vs14, vs11, 2 xxspltw vs15, vs11, 3 stxvw4x vs4, o0, BBO stxvw4x vs5, o16, BBO stxvw4x vs6, o32, BBO stxvw4x vs7, o48, BBO addi BO, BO, 32 addi BBO, BBO, 64 stxvw4x vs12, o0, BBO stxvw4x vs13, o16, BBO stxvw4x vs14, o32, BBO stxvw4x vs15, o48, BBO addic. T1, T1, -8 addi BBO, BBO, 64 bge SGEMM_L1_COPYB andi. T1, N, 1 ble SGEMM_L1_END mr CO, C mr AO, A srawi. I, M, 4 ble SGEMM_L1x16_END SGEMM_L1x16_BEGIN: mr BO, BBUFFER srawi. L, K, 3 ble SGEMM_L1x16_SUB0 cmpwi cr0, L, 1 ble SGEMM_L1x16_SUB4 SGEMM_L1x16_LOOP_START: dcbt AO, PRE LOAD1x16_1 KERNEL1x16_I1 dcbt AO, PRE KERNEL1x16_2 KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 addic. L, L, -2 ble SGEMM_L1x16_LOOP_END .align 5 SGEMM_L1x16_LOOP: KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 addic. L, L, -1 bgt SGEMM_L1x16_LOOP SGEMM_L1x16_LOOP_END: KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 KERNEL1x16_1 KERNEL1x16_E2 b SGEMM_L1x16_SUB1 SGEMM_L1x16_SUB4: dcbt AO, PRE KERNEL1x16_SUBI1 KERNEL1x16_SUB1 dcbt AO, PRE KERNEL1x16_SUB1 KERNEL1x16_SUB1 KERNEL1x16_SUB1 KERNEL1x16_SUB1 KERNEL1x16_SUB1 KERNEL1x16_SUB1 b SGEMM_L1x16_SUB1 SGEMM_L1x16_SUB0: andi. L, K, 7 KERNEL1x16_SUBI1 addic. L, L, -1 ble SGEMM_L1x16_SAVE b SGEMM_L1x16_SUB2 SGEMM_L1x16_SUB1: andi. L, K, 7 ble SGEMM_L1x16_SAVE SGEMM_L1x16_SUB2: KERNEL1x16_SUB1 addic. L, L, -1 bgt SGEMM_L1x16_SUB2 SGEMM_L1x16_SAVE: SAVE1x16 addic. 
I, I, -1 bgt SGEMM_L1x16_BEGIN SGEMM_L1x16_END: SGEMM_L1x8_BEGIN: andi. T2, M, 15 ble SGEMM_L1x1_END andi. T1, M, 8 ble SGEMM_L1x8_END mr BO, BBUFFER srawi. L, K, 3 ble SGEMM_L1x8_SUB0 cmpwi cr0, L, 1 ble SGEMM_L1x8_SUB4 SGEMM_L1x8_LOOP_START: LOAD1x8_1 KERNEL1x8_I1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 addic. L, L, -2 ble SGEMM_L1x8_LOOP_END .align 5 SGEMM_L1x8_LOOP: KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 addic. L, L, -1 bgt SGEMM_L1x8_LOOP SGEMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_E2 b SGEMM_L1x8_SUB1 SGEMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 b SGEMM_L1x8_SUB1 SGEMM_L1x8_SUB0: andi. L, K, 7 KERNEL1x8_SUBI1 addic. L, L, -1 ble SGEMM_L1x8_SAVE b SGEMM_L1x8_SUB2 SGEMM_L1x8_SUB1: andi. L, K, 7 ble SGEMM_L1x8_SAVE SGEMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 bgt SGEMM_L1x8_SUB2 SGEMM_L1x8_SAVE: SAVE1x8 SGEMM_L1x8_END: SGEMM_L1x4_BEGIN: andi. T1, M, 4 ble SGEMM_L1x4_END mr BO, BBUFFER srawi. L, K, 3 ble SGEMM_L1x4_SUB0 cmpwi cr0, L, 1 ble SGEMM_L1x4_SUB4 SGEMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 addic. L, L, -2 ble SGEMM_L1x4_LOOP_END .align 5 SGEMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 addic. L, L, -1 bgt SGEMM_L1x4_LOOP SGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_E2 b SGEMM_L1x4_SUB1 SGEMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 b SGEMM_L1x4_SUB1 SGEMM_L1x4_SUB0: andi. L, K, 7 KERNEL1x4_SUBI1 addic. L, L, -1 ble SGEMM_L1x4_SAVE b SGEMM_L1x4_SUB2 SGEMM_L1x4_SUB1: andi. L, K, 7 ble SGEMM_L1x4_SAVE SGEMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 bgt SGEMM_L1x4_SUB2 SGEMM_L1x4_SAVE: SAVE1x4 SGEMM_L1x4_END: SGEMM_L1x2_BEGIN: andi. T1, M, 2 ble SGEMM_L1x2_END mr BO, BBUFFER srawi. L, K, 3 ble SGEMM_L1x2_SUB0 cmpwi cr0, L, 1 ble SGEMM_L1x2_SUB4 SGEMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 addic. L, L, -2 ble SGEMM_L1x2_LOOP_END .align 5 SGEMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 addic. L, L, -1 bgt SGEMM_L1x2_LOOP SGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_E2 b SGEMM_L1x2_SUB1 SGEMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 b SGEMM_L1x2_SUB1 SGEMM_L1x2_SUB0: andi. L, K, 7 KERNEL1x2_SUBI1 addic. L, L, -1 ble SGEMM_L1x2_SAVE b SGEMM_L1x2_SUB2 SGEMM_L1x2_SUB1: andi. L, K, 7 ble SGEMM_L1x2_SAVE SGEMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 bgt SGEMM_L1x2_SUB2 SGEMM_L1x2_SAVE: SAVE1x2 SGEMM_L1x2_END: SGEMM_L1x1_BEGIN: andi. T1, M, 1 ble SGEMM_L1x1_END mr BO, BBUFFER srawi. L, K, 3 ble SGEMM_L1x1_SUB0 cmpwi cr0, L, 1 ble SGEMM_L1x1_SUB4 SGEMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 addic. 
L, L, -2 ble SGEMM_L1x1_LOOP_END .align 5 SGEMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 addic. L, L, -1 bgt SGEMM_L1x1_LOOP SGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_E2 b SGEMM_L1x1_SUB1 SGEMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 b SGEMM_L1x1_SUB1 SGEMM_L1x1_SUB0: andi. L, K, 7 KERNEL1x1_SUBI1 addic. L, L, -1 ble SGEMM_L1x1_SAVE b SGEMM_L1x1_SUB2 SGEMM_L1x1_SUB1: andi. L, K, 7 ble SGEMM_L1x1_SAVE SGEMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 bgt SGEMM_L1x1_SUB2 SGEMM_L1x1_SAVE: SAVE1x1 SGEMM_L1x1_END: SGEMM_L1_END: OpenBLAS-0.2.20/kernel/power/sgemm_macros_16x8_power8.S000066400000000000000000002406201313527062700224560ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/04/21 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /********************************************************************************************** * Macros for N=8 and M=16 **********************************************************************************************/ .macro LOAD8x16_1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi T1, T1, 64 lxvw4x vs12, o0, T1 lxvw4x vs13, o16, T1 lxvw4x vs14, o32, T1 lxvw4x vs15, o48, T1 addi BO, BO, 128 .endm .macro KERNEL8x16_I1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO lxvw4x vs6, o32, AO lxvw4x vs7, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs16, o0, T1 lxvw4x vs17, o16, T1 lxvw4x vs18, o32, T1 lxvw4x vs19, o48, T1 addi T1, T1, 64 lxvw4x vs20, o0, T1 lxvw4x vs21, o16, T1 lxvw4x vs22, o32, T1 lxvw4x vs23, o48, T1 addi BO, BO, 128 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs2, vs8 xvmulsp vs35, vs3, vs8 xvmulsp vs36, vs0, vs9 xvmulsp vs37, vs1, vs9 xvmulsp vs38, vs2, vs9 xvmulsp vs39, vs3, vs9 xvmulsp vs40, vs0, vs10 xvmulsp vs41, vs1, vs10 xvmulsp vs42, vs2, vs10 xvmulsp vs43, vs3, vs10 xvmulsp vs44, vs0, vs11 xvmulsp vs45, vs1, vs11 xvmulsp vs46, vs2, vs11 xvmulsp vs47, vs3, vs11 xvmulsp vs48, vs0, vs12 xvmulsp vs49, vs1, vs12 xvmulsp vs50, vs2, vs12 xvmulsp vs51, vs3, vs12 xvmulsp vs52, vs0, vs13 xvmulsp vs53, vs1, vs13 xvmulsp vs54, vs2, vs13 xvmulsp vs55, vs3, vs13 xvmulsp vs56, vs0, vs14 xvmulsp vs57, vs1, vs14 xvmulsp vs58, vs2, vs14 xvmulsp vs59, vs3, vs14 xvmulsp vs60, vs0, vs15 xvmulsp vs61, vs1, vs15 xvmulsp vs62, vs2, vs15 xvmulsp vs63, vs3, vs15 .endm .macro KERNEL8x16_1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO lxvw4x vs6, o32, AO lxvw4x vs7, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs16, o0, T1 lxvw4x vs17, o16, T1 lxvw4x vs18, o32, T1 lxvw4x vs19, o48, T1 addi T1, T1, 64 lxvw4x vs20, o0, T1 lxvw4x vs21, o16, T1 lxvw4x vs22, o32, T1 lxvw4x vs23, o48, T1 addi BO, BO, 128 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs2, vs8 xvmaddasp vs35, vs3, vs8 xvmaddasp vs36, vs0, vs9 xvmaddasp vs37, vs1, vs9 xvmaddasp vs38, vs2, vs9 xvmaddasp vs39, vs3, vs9 xvmaddasp vs40, vs0, vs10 xvmaddasp vs41, vs1, vs10 xvmaddasp vs42, vs2, vs10 xvmaddasp vs43, vs3, vs10 xvmaddasp vs44, vs0, vs11 xvmaddasp vs45, vs1, vs11 xvmaddasp vs46, vs2, vs11 xvmaddasp vs47, vs3, vs11 xvmaddasp vs48, vs0, vs12 xvmaddasp vs49, vs1, vs12 xvmaddasp vs50, vs2, vs12 xvmaddasp vs51, vs3, vs12 xvmaddasp vs52, vs0, vs13 xvmaddasp vs53, vs1, vs13 xvmaddasp vs54, vs2, vs13 xvmaddasp vs55, vs3, vs13 xvmaddasp vs56, vs0, vs14 xvmaddasp vs57, vs1, vs14 xvmaddasp vs58, vs2, vs14 xvmaddasp vs59, vs3, vs14 xvmaddasp vs60, vs0, vs15 xvmaddasp vs61, vs1, vs15 xvmaddasp vs62, vs2, vs15 xvmaddasp vs63, vs3, vs15 .endm .macro KERNEL8x16_2 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi T1, T1, 64 lxvw4x vs12, o0, T1 lxvw4x vs13, o16, T1 lxvw4x vs14, o32, T1 lxvw4x vs15, o48, T1 addi BO, BO, 128 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs6, vs16 
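/* The 16x8 kernels keep the full 16x8 tile of C in vs32..vs63 and double-buffer
 * their inputs: the "_1" step computes FMAs from vs0-vs3 (A) and vs8-vs15 (B)
 * while loading the next k step into vs4-vs7/vs16-vs23, and the "_2" step (this
 * macro) does the reverse.  KERNEL8x16_I1 starts the accumulators with xvmulsp
 * instead of xvmaddasp, and KERNEL8x16_E2 drains the last buffered inputs
 * without issuing further loads.  In scalar terms one k step is (sketch only;
 * each vsX holds four floats, and the 128-byte BO stride per step suggests the
 * B panel is stored with every element splatted across a full vector):
 *
 *   for (j = 0; j < 8; j++)        // columns; B in vs8..vs15 / vs16..vs23
 *     for (i = 0; i < 16; i++)     // rows;    A in vs0..vs3  / vs4..vs7
 *       acc[j][i] += a[i] * b[j];  // acc[j] lives in vs(32+4*j)..vs(35+4*j)
 */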
xvmaddasp vs35, vs7, vs16 xvmaddasp vs36, vs4, vs17 xvmaddasp vs37, vs5, vs17 xvmaddasp vs38, vs6, vs17 xvmaddasp vs39, vs7, vs17 xvmaddasp vs40, vs4, vs18 xvmaddasp vs41, vs5, vs18 xvmaddasp vs42, vs6, vs18 xvmaddasp vs43, vs7, vs18 xvmaddasp vs44, vs4, vs19 xvmaddasp vs45, vs5, vs19 xvmaddasp vs46, vs6, vs19 xvmaddasp vs47, vs7, vs19 xvmaddasp vs48, vs4, vs20 xvmaddasp vs49, vs5, vs20 xvmaddasp vs50, vs6, vs20 xvmaddasp vs51, vs7, vs20 xvmaddasp vs52, vs4, vs21 xvmaddasp vs53, vs5, vs21 xvmaddasp vs54, vs6, vs21 xvmaddasp vs55, vs7, vs21 xvmaddasp vs56, vs4, vs22 xvmaddasp vs57, vs5, vs22 xvmaddasp vs58, vs6, vs22 xvmaddasp vs59, vs7, vs22 xvmaddasp vs60, vs4, vs23 xvmaddasp vs61, vs5, vs23 xvmaddasp vs62, vs6, vs23 xvmaddasp vs63, vs7, vs23 .endm .macro KERNEL8x16_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs6, vs16 xvmaddasp vs35, vs7, vs16 xvmaddasp vs36, vs4, vs17 xvmaddasp vs37, vs5, vs17 xvmaddasp vs38, vs6, vs17 xvmaddasp vs39, vs7, vs17 xvmaddasp vs40, vs4, vs18 xvmaddasp vs41, vs5, vs18 xvmaddasp vs42, vs6, vs18 xvmaddasp vs43, vs7, vs18 xvmaddasp vs44, vs4, vs19 xvmaddasp vs45, vs5, vs19 xvmaddasp vs46, vs6, vs19 xvmaddasp vs47, vs7, vs19 xvmaddasp vs48, vs4, vs20 xvmaddasp vs49, vs5, vs20 xvmaddasp vs50, vs6, vs20 xvmaddasp vs51, vs7, vs20 xvmaddasp vs52, vs4, vs21 xvmaddasp vs53, vs5, vs21 xvmaddasp vs54, vs6, vs21 xvmaddasp vs55, vs7, vs21 xvmaddasp vs56, vs4, vs22 xvmaddasp vs57, vs5, vs22 xvmaddasp vs58, vs6, vs22 xvmaddasp vs59, vs7, vs22 xvmaddasp vs60, vs4, vs23 xvmaddasp vs61, vs5, vs23 xvmaddasp vs62, vs6, vs23 xvmaddasp vs63, vs7, vs23 .endm .macro KERNEL8x16_SUBI1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi T1, T1, 64 lxvw4x vs12, o0, T1 lxvw4x vs13, o16, T1 lxvw4x vs14, o32, T1 lxvw4x vs15, o48, T1 addi BO, BO, 128 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs2, vs8 xvmulsp vs35, vs3, vs8 xvmulsp vs36, vs0, vs9 xvmulsp vs37, vs1, vs9 xvmulsp vs38, vs2, vs9 xvmulsp vs39, vs3, vs9 xvmulsp vs40, vs0, vs10 xvmulsp vs41, vs1, vs10 xvmulsp vs42, vs2, vs10 xvmulsp vs43, vs3, vs10 xvmulsp vs44, vs0, vs11 xvmulsp vs45, vs1, vs11 xvmulsp vs46, vs2, vs11 xvmulsp vs47, vs3, vs11 xvmulsp vs48, vs0, vs12 xvmulsp vs49, vs1, vs12 xvmulsp vs50, vs2, vs12 xvmulsp vs51, vs3, vs12 xvmulsp vs52, vs0, vs13 xvmulsp vs53, vs1, vs13 xvmulsp vs54, vs2, vs13 xvmulsp vs55, vs3, vs13 xvmulsp vs56, vs0, vs14 xvmulsp vs57, vs1, vs14 xvmulsp vs58, vs2, vs14 xvmulsp vs59, vs3, vs14 xvmulsp vs60, vs0, vs15 xvmulsp vs61, vs1, vs15 xvmulsp vs62, vs2, vs15 xvmulsp vs63, vs3, vs15 .endm .macro KERNEL8x16_SUB1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi T1, T1, 64 lxvw4x vs12, o0, T1 lxvw4x vs13, o16, T1 lxvw4x vs14, o32, T1 lxvw4x vs15, o48, T1 addi BO, BO, 128 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs2, vs8 xvmaddasp vs35, vs3, vs8 xvmaddasp vs36, vs0, vs9 xvmaddasp vs37, vs1, vs9 xvmaddasp vs38, vs2, vs9 xvmaddasp vs39, vs3, vs9 xvmaddasp vs40, vs0, vs10 xvmaddasp vs41, vs1, vs10 xvmaddasp vs42, vs2, vs10 xvmaddasp vs43, vs3, vs10 xvmaddasp vs44, vs0, vs11 xvmaddasp vs45, vs1, vs11 xvmaddasp vs46, vs2, vs11 xvmaddasp vs47, vs3, vs11 xvmaddasp vs48, vs0, vs12 xvmaddasp vs49, vs1, vs12 xvmaddasp vs50, vs2, vs12 xvmaddasp vs51, vs3, vs12 
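/* KERNEL8x16_SUBI1 / KERNEL8x16_SUB1 process a single k step without the
 * double buffering above: SUBI1 initialises vs32..vs63 with xvmulsp, SUB1
 * accumulates with xvmaddasp.  The driver uses them for small K and for the
 * K mod 8 remainder left over after the unrolled loop. */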
xvmaddasp vs52, vs0, vs13 xvmaddasp vs53, vs1, vs13 xvmaddasp vs54, vs2, vs13 xvmaddasp vs55, vs3, vs13 xvmaddasp vs56, vs0, vs14 xvmaddasp vs57, vs1, vs14 xvmaddasp vs58, vs2, vs14 xvmaddasp vs59, vs3, vs14 xvmaddasp vs60, vs0, vs15 xvmaddasp vs61, vs1, vs15 xvmaddasp vs62, vs2, vs15 xvmaddasp vs63, vs3, vs15 .endm .macro SAVE8x16 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr xvmulsp vs1, vs33, alpha_vr xvmulsp vs2, vs34, alpha_vr xvmulsp vs3, vs35, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr xvmaddasp vs1, vs33, alpha_vr xvmaddasp vs2, vs34, alpha_vr xvmaddasp vs3, vs35, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs36, alpha_vr xvmulsp vs1, vs37, alpha_vr xvmulsp vs2, vs38, alpha_vr xvmulsp vs3, vs39, alpha_vr #else xvmaddasp vs0, vs36, alpha_vr xvmaddasp vs1, vs37, alpha_vr xvmaddasp vs2, vs38, alpha_vr xvmaddasp vs3, vs39, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs40, alpha_vr xvmulsp vs1, vs41, alpha_vr xvmulsp vs2, vs42, alpha_vr xvmulsp vs3, vs43, alpha_vr #else xvmaddasp vs0, vs40, alpha_vr xvmaddasp vs1, vs41, alpha_vr xvmaddasp vs2, vs42, alpha_vr xvmaddasp vs3, vs43, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs44, alpha_vr xvmulsp vs1, vs45, alpha_vr xvmulsp vs2, vs46, alpha_vr xvmulsp vs3, vs47, alpha_vr #else xvmaddasp vs0, vs44, alpha_vr xvmaddasp vs1, vs45, alpha_vr xvmaddasp vs2, vs46, alpha_vr xvmaddasp vs3, vs47, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs48, alpha_vr xvmulsp vs1, vs49, alpha_vr xvmulsp vs2, vs50, alpha_vr xvmulsp vs3, vs51, alpha_vr #else xvmaddasp vs0, vs48, alpha_vr xvmaddasp vs1, vs49, alpha_vr xvmaddasp vs2, vs50, alpha_vr xvmaddasp vs3, vs51, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs52, alpha_vr xvmulsp vs1, vs53, alpha_vr xvmulsp vs2, vs54, alpha_vr xvmulsp vs3, vs55, alpha_vr #else xvmaddasp vs0, vs52, alpha_vr xvmaddasp vs1, vs53, alpha_vr xvmaddasp vs2, vs54, alpha_vr xvmaddasp vs3, vs55, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs56, alpha_vr xvmulsp vs1, vs57, alpha_vr xvmulsp vs2, vs58, alpha_vr xvmulsp vs3, vs59, alpha_vr #else xvmaddasp vs0, vs56, alpha_vr xvmaddasp vs1, vs57, alpha_vr xvmaddasp vs2, vs58, alpha_vr xvmaddasp vs3, vs59, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 
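/* SAVE8x16 walks the eight columns of the C tile: unless TRMMKERNEL is
 * defined it first loads the existing C column (lxvw4x vs0..vs3), then scales
 * the accumulators by alpha_vr (xvmulsp overwrites for TRMM, xvmaddasp
 * accumulates into C otherwise), stores the 16 results and steps T1 by LDC;
 * CO finally advances by 64 bytes (16 floats).  A C-level sketch of the same
 * update (the acc and ldc names are illustrative, not from this source):
 *
 *   for (j = 0; j < 8; j++)
 *     for (i = 0; i < 16; i++)
 *   #ifdef TRMMKERNEL
 *       C[j*ldc + i]  = alpha * acc[j][i];
 *   #else
 *       C[j*ldc + i] += alpha * acc[j][i];
 *   #endif
 */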
stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs60, alpha_vr xvmulsp vs1, vs61, alpha_vr xvmulsp vs2, vs62, alpha_vr xvmulsp vs3, vs63, alpha_vr #else xvmaddasp vs0, vs60, alpha_vr xvmaddasp vs1, vs61, alpha_vr xvmaddasp vs2, vs62, alpha_vr xvmaddasp vs3, vs63, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC addi CO, CO, 64 .endm /********************************************************************************************** * Macros for N=8 and M=8 **********************************************************************************************/ .macro LOAD8x8_1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi T1, T1, 64 lxvw4x vs12, o0, T1 lxvw4x vs13, o16, T1 lxvw4x vs14, o32, T1 lxvw4x vs15, o48, T1 addi BO, BO, 128 .endm .macro KERNEL8x8_I1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs16, o0, T1 lxvw4x vs17, o16, T1 lxvw4x vs18, o32, T1 lxvw4x vs19, o48, T1 addi T1, T1, 64 lxvw4x vs20, o0, T1 lxvw4x vs21, o16, T1 lxvw4x vs22, o32, T1 lxvw4x vs23, o48, T1 addi BO, BO, 128 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs0, vs9 xvmulsp vs35, vs1, vs9 xvmulsp vs36, vs0, vs10 xvmulsp vs37, vs1, vs10 xvmulsp vs38, vs0, vs11 xvmulsp vs39, vs1, vs11 xvmulsp vs40, vs0, vs12 xvmulsp vs41, vs1, vs12 xvmulsp vs42, vs0, vs13 xvmulsp vs43, vs1, vs13 xvmulsp vs44, vs0, vs14 xvmulsp vs45, vs1, vs14 xvmulsp vs46, vs0, vs15 xvmulsp vs47, vs1, vs15 .endm .macro KERNEL8x8_1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs16, o0, T1 lxvw4x vs17, o16, T1 lxvw4x vs18, o32, T1 lxvw4x vs19, o48, T1 addi T1, T1, 64 lxvw4x vs20, o0, T1 lxvw4x vs21, o16, T1 lxvw4x vs22, o32, T1 lxvw4x vs23, o48, T1 addi BO, BO, 128 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs0, vs9 xvmaddasp vs35, vs1, vs9 xvmaddasp vs36, vs0, vs10 xvmaddasp vs37, vs1, vs10 xvmaddasp vs38, vs0, vs11 xvmaddasp vs39, vs1, vs11 xvmaddasp vs40, vs0, vs12 xvmaddasp vs41, vs1, vs12 xvmaddasp vs42, vs0, vs13 xvmaddasp vs43, vs1, vs13 xvmaddasp vs44, vs0, vs14 xvmaddasp vs45, vs1, vs14 xvmaddasp vs46, vs0, vs15 xvmaddasp vs47, vs1, vs15 .endm .macro KERNEL8x8_2 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi T1, T1, 64 lxvw4x vs12, o0, T1 lxvw4x vs13, o16, T1 lxvw4x vs14, o32, T1 lxvw4x vs15, o48, T1 addi BO, BO, 128 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs4, vs17 xvmaddasp vs35, vs5, vs17 xvmaddasp vs36, vs4, vs18 xvmaddasp vs37, vs5, vs18 xvmaddasp vs38, vs4, vs19 xvmaddasp vs39, vs5, vs19 xvmaddasp vs40, vs4, vs20 xvmaddasp vs41, vs5, vs20 xvmaddasp vs42, vs4, vs21 xvmaddasp vs43, vs5, vs21 xvmaddasp vs44, vs4, vs22 xvmaddasp vs45, vs5, vs22 xvmaddasp vs46, vs4, vs23 xvmaddasp vs47, vs5, vs23 .endm .macro KERNEL8x8_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs4, vs17 xvmaddasp vs35, vs5, vs17 xvmaddasp vs36, vs4, vs18 xvmaddasp vs37, vs5, vs18 xvmaddasp vs38, vs4, vs19 xvmaddasp vs39, vs5, vs19 xvmaddasp vs40, vs4, vs20 xvmaddasp vs41, vs5, vs20 xvmaddasp vs42, vs4, vs21 xvmaddasp vs43, vs5, vs21 xvmaddasp vs44, vs4, vs22 xvmaddasp vs45, vs5, vs22 xvmaddasp vs46, vs4, vs23 xvmaddasp vs47, vs5, 
vs23 .endm .macro KERNEL8x8_SUBI1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi T1, T1, 64 lxvw4x vs12, o0, T1 lxvw4x vs13, o16, T1 lxvw4x vs14, o32, T1 lxvw4x vs15, o48, T1 addi BO, BO, 128 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs0, vs9 xvmulsp vs35, vs1, vs9 xvmulsp vs36, vs0, vs10 xvmulsp vs37, vs1, vs10 xvmulsp vs38, vs0, vs11 xvmulsp vs39, vs1, vs11 xvmulsp vs40, vs0, vs12 xvmulsp vs41, vs1, vs12 xvmulsp vs42, vs0, vs13 xvmulsp vs43, vs1, vs13 xvmulsp vs44, vs0, vs14 xvmulsp vs45, vs1, vs14 xvmulsp vs46, vs0, vs15 xvmulsp vs47, vs1, vs15 .endm .macro KERNEL8x8_SUB1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi T1, T1, 64 lxvw4x vs12, o0, T1 lxvw4x vs13, o16, T1 lxvw4x vs14, o32, T1 lxvw4x vs15, o48, T1 addi BO, BO, 128 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs0, vs9 xvmaddasp vs35, vs1, vs9 xvmaddasp vs36, vs0, vs10 xvmaddasp vs37, vs1, vs10 xvmaddasp vs38, vs0, vs11 xvmaddasp vs39, vs1, vs11 xvmaddasp vs40, vs0, vs12 xvmaddasp vs41, vs1, vs12 xvmaddasp vs42, vs0, vs13 xvmaddasp vs43, vs1, vs13 xvmaddasp vs44, vs0, vs14 xvmaddasp vs45, vs1, vs14 xvmaddasp vs46, vs0, vs15 xvmaddasp vs47, vs1, vs15 .endm .macro SAVE8x8 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr xvmulsp vs1, vs33, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr xvmaddasp vs1, vs33, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs34, alpha_vr xvmulsp vs1, vs35, alpha_vr #else xvmaddasp vs0, vs34, alpha_vr xvmaddasp vs1, vs35, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs36, alpha_vr xvmulsp vs1, vs37, alpha_vr #else xvmaddasp vs0, vs36, alpha_vr xvmaddasp vs1, vs37, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs38, alpha_vr xvmulsp vs1, vs39, alpha_vr #else xvmaddasp vs0, vs38, alpha_vr xvmaddasp vs1, vs39, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs40, alpha_vr xvmulsp vs1, vs41, alpha_vr #else xvmaddasp vs0, vs40, alpha_vr xvmaddasp vs1, vs41, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs42, alpha_vr xvmulsp vs1, vs43, alpha_vr #else xvmaddasp vs0, vs42, alpha_vr xvmaddasp vs1, vs43, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs44, alpha_vr xvmulsp vs1, vs45, alpha_vr #else xvmaddasp vs0, vs44, alpha_vr xvmaddasp vs1, vs45, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs46, alpha_vr xvmulsp vs1, vs47, alpha_vr #else xvmaddasp vs0, vs46, alpha_vr xvmaddasp vs1, vs47, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, 
T1, LDC addi CO, CO, 32 .endm /********************************************************************************************** * Macros for N=8 and M=4 **********************************************************************************************/ .macro LOAD8x4_1 lxvw4x vs0, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi T1, T1, 64 lxvw4x vs12, o0, T1 lxvw4x vs13, o16, T1 lxvw4x vs14, o32, T1 lxvw4x vs15, o48, T1 addi BO, BO, 128 .endm .macro KERNEL8x4_I1 lxvw4x vs4, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs16, o0, T1 lxvw4x vs17, o16, T1 lxvw4x vs18, o32, T1 lxvw4x vs19, o48, T1 addi T1, T1, 64 lxvw4x vs20, o0, T1 lxvw4x vs21, o16, T1 lxvw4x vs22, o32, T1 lxvw4x vs23, o48, T1 addi BO, BO, 128 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs0, vs9 xvmulsp vs34, vs0, vs10 xvmulsp vs35, vs0, vs11 xvmulsp vs36, vs0, vs12 xvmulsp vs37, vs0, vs13 xvmulsp vs38, vs0, vs14 xvmulsp vs39, vs0, vs15 .endm .macro KERNEL8x4_1 lxvw4x vs4, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs16, o0, T1 lxvw4x vs17, o16, T1 lxvw4x vs18, o32, T1 lxvw4x vs19, o48, T1 addi T1, T1, 64 lxvw4x vs20, o0, T1 lxvw4x vs21, o16, T1 lxvw4x vs22, o32, T1 lxvw4x vs23, o48, T1 addi BO, BO, 128 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs0, vs9 xvmaddasp vs34, vs0, vs10 xvmaddasp vs35, vs0, vs11 xvmaddasp vs36, vs0, vs12 xvmaddasp vs37, vs0, vs13 xvmaddasp vs38, vs0, vs14 xvmaddasp vs39, vs0, vs15 .endm .macro KERNEL8x4_2 lxvw4x vs0, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi T1, T1, 64 lxvw4x vs12, o0, T1 lxvw4x vs13, o16, T1 lxvw4x vs14, o32, T1 lxvw4x vs15, o48, T1 addi BO, BO, 128 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs4, vs17 xvmaddasp vs34, vs4, vs18 xvmaddasp vs35, vs4, vs19 xvmaddasp vs36, vs4, vs20 xvmaddasp vs37, vs4, vs21 xvmaddasp vs38, vs4, vs22 xvmaddasp vs39, vs4, vs23 .endm .macro KERNEL8x4_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs4, vs17 xvmaddasp vs34, vs4, vs18 xvmaddasp vs35, vs4, vs19 xvmaddasp vs36, vs4, vs20 xvmaddasp vs37, vs4, vs21 xvmaddasp vs38, vs4, vs22 xvmaddasp vs39, vs4, vs23 .endm .macro KERNEL8x4_SUBI1 lxvw4x vs0, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi T1, T1, 64 lxvw4x vs12, o0, T1 lxvw4x vs13, o16, T1 lxvw4x vs14, o32, T1 lxvw4x vs15, o48, T1 addi BO, BO, 128 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs0, vs9 xvmulsp vs34, vs0, vs10 xvmulsp vs35, vs0, vs11 xvmulsp vs36, vs0, vs12 xvmulsp vs37, vs0, vs13 xvmulsp vs38, vs0, vs14 xvmulsp vs39, vs0, vs15 .endm .macro KERNEL8x4_SUB1 lxvw4x vs0, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi T1, T1, 64 lxvw4x vs12, o0, T1 lxvw4x vs13, o16, T1 lxvw4x vs14, o32, T1 lxvw4x vs15, o48, T1 addi BO, BO, 128 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs0, vs9 xvmaddasp vs34, vs0, vs10 xvmaddasp vs35, vs0, vs11 xvmaddasp vs36, vs0, vs12 xvmaddasp vs37, vs0, vs13 xvmaddasp vs38, vs0, vs14 xvmaddasp vs39, vs0, vs15 .endm .macro SAVE8x4 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs33, alpha_vr #else xvmaddasp vs0, vs33, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs34, alpha_vr #else 
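/* SAVE8x4 repeats the same load/scale/store pattern with a single vector
 * (4 floats of C) per column; this #else branch is the plain GEMM case,
 * accumulating alpha * accumulator into the C values loaded above. */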
xvmaddasp vs0, vs34, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs35, alpha_vr #else xvmaddasp vs0, vs35, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs36, alpha_vr #else xvmaddasp vs0, vs36, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs37, alpha_vr #else xvmaddasp vs0, vs37, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs38, alpha_vr #else xvmaddasp vs0, vs38, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs39, alpha_vr #else xvmaddasp vs0, vs39, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC addi CO, CO, 16 .endm /********************************************************************************************** * Macros for N=8 and M=2 **********************************************************************************************/ .macro LOAD8x2_1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 lxsspx vs10, o32, T1 lxsspx vs11, o48, T1 addi T1, T1, 64 lxsspx vs12, o0, T1 lxsspx vs13, o16, T1 lxsspx vs14, o32, T1 lxsspx vs15, o48, T1 addi BO, BO, 128 .endm .macro KERNEL8x2_I1 lxsspx vs4, o0, AO lxsspx vs5, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o16, T1 lxsspx vs18, o32, T1 lxsspx vs19, o48, T1 addi T1, T1, 64 lxsspx vs20, o0, T1 lxsspx vs21, o16, T1 lxsspx vs22, o32, T1 lxsspx vs23, o48, T1 addi BO, BO, 128 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs1, vs8 xsmuldp vs34, vs0, vs9 xsmuldp vs35, vs1, vs9 xsmuldp vs36, vs0, vs10 xsmuldp vs37, vs1, vs10 xsmuldp vs38, vs0, vs11 xsmuldp vs39, vs1, vs11 xsmuldp vs40, vs0, vs12 xsmuldp vs41, vs1, vs12 xsmuldp vs42, vs0, vs13 xsmuldp vs43, vs1, vs13 xsmuldp vs44, vs0, vs14 xsmuldp vs45, vs1, vs14 xsmuldp vs46, vs0, vs15 xsmuldp vs47, vs1, vs15 .endm .macro KERNEL8x2_1 lxsspx vs4, o0, AO lxsspx vs5, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o16, T1 lxsspx vs18, o32, T1 lxsspx vs19, o48, T1 addi T1, T1, 64 lxsspx vs20, o0, T1 lxsspx vs21, o16, T1 lxsspx vs22, o32, T1 lxsspx vs23, o48, T1 addi BO, BO, 128 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs1, vs8 xsmaddadp vs34, vs0, vs9 xsmaddadp vs35, vs1, vs9 xsmaddadp vs36, vs0, vs10 xsmaddadp vs37, vs1, vs10 xsmaddadp vs38, vs0, vs11 xsmaddadp vs39, vs1, vs11 xsmaddadp vs40, vs0, vs12 xsmaddadp vs41, vs1, vs12 xsmaddadp vs42, vs0, vs13 xsmaddadp vs43, vs1, vs13 xsmaddadp vs44, vs0, vs14 xsmaddadp vs45, vs1, vs14 xsmaddadp vs46, vs0, vs15 xsmaddadp vs47, vs1, vs15 .endm .macro KERNEL8x2_2 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 lxsspx vs10, o32, T1 lxsspx vs11, o48, T1 addi T1, T1, 64 lxsspx vs12, o0, T1 lxsspx vs13, o16, T1 lxsspx vs14, o32, T1 lxsspx vs15, o48, T1 addi BO, BO, 128 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs5, vs16 xsmaddadp vs34, vs4, vs17 xsmaddadp vs35, vs5, vs17 xsmaddadp vs36, vs4, vs18 xsmaddadp vs37, vs5, vs18 xsmaddadp vs38, vs4, vs19 xsmaddadp vs39, vs5, vs19 xsmaddadp vs40, vs4, vs20 xsmaddadp vs41, vs5, vs20 xsmaddadp vs42, vs4, vs21 xsmaddadp vs43, vs5, vs21 xsmaddadp vs44, vs4, vs22 xsmaddadp vs45, vs5, vs22 xsmaddadp vs46, vs4, vs23 xsmaddadp vs47, vs5, vs23 .endm .macro KERNEL8x2_E2 xsmaddadp vs32, vs4, 
vs16 xsmaddadp vs33, vs5, vs16 xsmaddadp vs34, vs4, vs17 xsmaddadp vs35, vs5, vs17 xsmaddadp vs36, vs4, vs18 xsmaddadp vs37, vs5, vs18 xsmaddadp vs38, vs4, vs19 xsmaddadp vs39, vs5, vs19 xsmaddadp vs40, vs4, vs20 xsmaddadp vs41, vs5, vs20 xsmaddadp vs42, vs4, vs21 xsmaddadp vs43, vs5, vs21 xsmaddadp vs44, vs4, vs22 xsmaddadp vs45, vs5, vs22 xsmaddadp vs46, vs4, vs23 xsmaddadp vs47, vs5, vs23 .endm .macro KERNEL8x2_SUBI1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 lxsspx vs10, o32, T1 lxsspx vs11, o48, T1 addi T1, T1, 64 lxsspx vs12, o0, T1 lxsspx vs13, o16, T1 lxsspx vs14, o32, T1 lxsspx vs15, o48, T1 addi BO, BO, 128 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs1, vs8 xsmuldp vs34, vs0, vs9 xsmuldp vs35, vs1, vs9 xsmuldp vs36, vs0, vs10 xsmuldp vs37, vs1, vs10 xsmuldp vs38, vs0, vs11 xsmuldp vs39, vs1, vs11 xsmuldp vs40, vs0, vs12 xsmuldp vs41, vs1, vs12 xsmuldp vs42, vs0, vs13 xsmuldp vs43, vs1, vs13 xsmuldp vs44, vs0, vs14 xsmuldp vs45, vs1, vs14 xsmuldp vs46, vs0, vs15 xsmuldp vs47, vs1, vs15 .endm .macro KERNEL8x2_SUB1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 lxsspx vs10, o32, T1 lxsspx vs11, o48, T1 addi T1, T1, 64 lxsspx vs12, o0, T1 lxsspx vs13, o16, T1 lxsspx vs14, o32, T1 lxsspx vs15, o48, T1 addi BO, BO, 128 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs1, vs8 xsmaddadp vs34, vs0, vs9 xsmaddadp vs35, vs1, vs9 xsmaddadp vs36, vs0, vs10 xsmaddadp vs37, vs1, vs10 xsmaddadp vs38, vs0, vs11 xsmaddadp vs39, vs1, vs11 xsmaddadp vs40, vs0, vs12 xsmaddadp vs41, vs1, vs12 xsmaddadp vs42, vs0, vs13 xsmaddadp vs43, vs1, vs13 xsmaddadp vs44, vs0, vs14 xsmaddadp vs45, vs1, vs14 xsmaddadp vs46, vs0, vs15 xsmaddadp vs47, vs1, vs15 .endm .macro SAVE8x2 mr T1, CO #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs32, alpha_r xsmuldp vs1, vs33, alpha_r #else xsmaddadp vs0, vs32, alpha_r xsmaddadp vs1, vs33, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs34, alpha_r xsmuldp vs1, vs35, alpha_r #else xsmaddadp vs0, vs34, alpha_r xsmaddadp vs1, vs35, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs36, alpha_r xsmuldp vs1, vs37, alpha_r #else xsmaddadp vs0, vs36, alpha_r xsmaddadp vs1, vs37, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs38, alpha_r xsmuldp vs1, vs39, alpha_r #else xsmaddadp vs0, vs38, alpha_r xsmaddadp vs1, vs39, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs40, alpha_r xsmuldp vs1, vs41, alpha_r #else xsmaddadp vs0, vs40, alpha_r xsmaddadp vs1, vs41, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs42, alpha_r xsmuldp vs1, vs43, alpha_r #else xsmaddadp vs0, vs42, alpha_r xsmaddadp vs1, vs43, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs44, alpha_r xsmuldp vs1, vs45, alpha_r #else xsmaddadp vs0, vs44, alpha_r xsmaddadp 
vs1, vs45, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs46, alpha_r xsmuldp vs1, vs47, alpha_r #else xsmaddadp vs0, vs46, alpha_r xsmaddadp vs1, vs47, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC addi CO, CO, 8 .endm /********************************************************************************************** * Macros for N=8 and M=1 **********************************************************************************************/ .macro LOAD8x1_1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 lxsspx vs10, o32, T1 lxsspx vs11, o48, T1 addi T1, T1, 64 lxsspx vs12, o0, T1 lxsspx vs13, o16, T1 lxsspx vs14, o32, T1 lxsspx vs15, o48, T1 addi BO, BO, 128 .endm .macro KERNEL8x1_I1 lxsspx vs4, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o16, T1 lxsspx vs18, o32, T1 lxsspx vs19, o48, T1 addi T1, T1, 64 lxsspx vs20, o0, T1 lxsspx vs21, o16, T1 lxsspx vs22, o32, T1 lxsspx vs23, o48, T1 addi BO, BO, 128 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs0, vs9 xsmuldp vs34, vs0, vs10 xsmuldp vs35, vs0, vs11 xsmuldp vs36, vs0, vs12 xsmuldp vs37, vs0, vs13 xsmuldp vs38, vs0, vs14 xsmuldp vs39, vs0, vs15 .endm .macro KERNEL8x1_1 lxsspx vs4, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o16, T1 lxsspx vs18, o32, T1 lxsspx vs19, o48, T1 addi T1, T1, 64 lxsspx vs20, o0, T1 lxsspx vs21, o16, T1 lxsspx vs22, o32, T1 lxsspx vs23, o48, T1 addi BO, BO, 128 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs0, vs9 xsmaddadp vs34, vs0, vs10 xsmaddadp vs35, vs0, vs11 xsmaddadp vs36, vs0, vs12 xsmaddadp vs37, vs0, vs13 xsmaddadp vs38, vs0, vs14 xsmaddadp vs39, vs0, vs15 .endm .macro KERNEL8x1_2 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 lxsspx vs10, o32, T1 lxsspx vs11, o48, T1 addi T1, T1, 64 lxsspx vs12, o0, T1 lxsspx vs13, o16, T1 lxsspx vs14, o32, T1 lxsspx vs15, o48, T1 addi BO, BO, 128 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs4, vs17 xsmaddadp vs34, vs4, vs18 xsmaddadp vs35, vs4, vs19 xsmaddadp vs36, vs4, vs20 xsmaddadp vs37, vs4, vs21 xsmaddadp vs38, vs4, vs22 xsmaddadp vs39, vs4, vs23 .endm .macro KERNEL8x1_E2 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs4, vs17 xsmaddadp vs34, vs4, vs18 xsmaddadp vs35, vs4, vs19 xsmaddadp vs36, vs4, vs20 xsmaddadp vs37, vs4, vs21 xsmaddadp vs38, vs4, vs22 xsmaddadp vs39, vs4, vs23 .endm .macro KERNEL8x1_SUBI1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 lxsspx vs10, o32, T1 lxsspx vs11, o48, T1 addi T1, T1, 64 lxsspx vs12, o0, T1 lxsspx vs13, o16, T1 lxsspx vs14, o32, T1 lxsspx vs15, o48, T1 addi BO, BO, 128 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs0, vs9 xsmuldp vs34, vs0, vs10 xsmuldp vs35, vs0, vs11 xsmuldp vs36, vs0, vs12 xsmuldp vs37, vs0, vs13 xsmuldp vs38, vs0, vs14 xsmuldp vs39, vs0, vs15 .endm .macro KERNEL8x1_SUB1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 lxsspx vs10, o32, T1 lxsspx vs11, o48, T1 addi T1, T1, 64 lxsspx vs12, o0, T1 lxsspx vs13, o16, T1 lxsspx vs14, o32, T1 lxsspx vs15, o48, T1 addi BO, BO, 128 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs0, vs9 xsmaddadp vs34, vs0, vs10 xsmaddadp vs35, vs0, vs11 xsmaddadp vs36, vs0, vs12 xsmaddadp vs37, vs0, vs13 xsmaddadp vs38, vs0, vs14 xsmaddadp vs39, vs0, vs15 .endm .macro SAVE8x1 mr T1, CO #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs32, alpha_r #else xsmaddadp 
vs0, vs32, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs33, alpha_r #else xsmaddadp vs0, vs33, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs34, alpha_r #else xsmaddadp vs0, vs34, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs35, alpha_r #else xsmaddadp vs0, vs35, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs36, alpha_r #else xsmaddadp vs0, vs36, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs37, alpha_r #else xsmaddadp vs0, vs37, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs38, alpha_r #else xsmaddadp vs0, vs38, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs39, alpha_r #else xsmaddadp vs0, vs39, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC addi CO, CO, 4 .endm /********************************************************************************************** * Macros for N=4 and M=16 **********************************************************************************************/ .macro LOAD4x16_1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi BO, BO, 64 .endm .macro KERNEL4x16_I1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO lxvw4x vs6, o32, AO lxvw4x vs7, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs16, o0, T1 lxvw4x vs17, o16, T1 lxvw4x vs18, o32, T1 lxvw4x vs19, o48, T1 addi BO, BO, 64 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs2, vs8 xvmulsp vs35, vs3, vs8 xvmulsp vs36, vs0, vs9 xvmulsp vs37, vs1, vs9 xvmulsp vs38, vs2, vs9 xvmulsp vs39, vs3, vs9 xvmulsp vs40, vs0, vs10 xvmulsp vs41, vs1, vs10 xvmulsp vs42, vs2, vs10 xvmulsp vs43, vs3, vs10 xvmulsp vs44, vs0, vs11 xvmulsp vs45, vs1, vs11 xvmulsp vs46, vs2, vs11 xvmulsp vs47, vs3, vs11 .endm .macro KERNEL4x16_1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO lxvw4x vs6, o32, AO lxvw4x vs7, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs16, o0, T1 lxvw4x vs17, o16, T1 lxvw4x vs18, o32, T1 lxvw4x vs19, o48, T1 addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs2, vs8 xvmaddasp vs35, vs3, vs8 xvmaddasp vs36, vs0, vs9 xvmaddasp vs37, vs1, vs9 xvmaddasp vs38, vs2, vs9 xvmaddasp vs39, vs3, vs9 xvmaddasp vs40, vs0, vs10 xvmaddasp vs41, vs1, vs10 xvmaddasp vs42, vs2, vs10 xvmaddasp vs43, vs3, vs10 xvmaddasp vs44, vs0, vs11 xvmaddasp vs45, vs1, vs11 xvmaddasp vs46, vs2, vs11 xvmaddasp vs47, vs3, vs11 .endm .macro KERNEL4x16_2 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi BO, BO, 64 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs6, vs16 xvmaddasp vs35, vs7, vs16 xvmaddasp vs36, vs4, vs17 xvmaddasp vs37, vs5, vs17 xvmaddasp vs38, vs6, vs17 xvmaddasp vs39, vs7, vs17 xvmaddasp vs40, vs4, vs18 xvmaddasp vs41, vs5, vs18 xvmaddasp vs42, vs6, vs18 xvmaddasp vs43, vs7, vs18 xvmaddasp vs44, vs4, vs19 xvmaddasp vs45, vs5, vs19 xvmaddasp vs46, vs6, 
vs19 xvmaddasp vs47, vs7, vs19 .endm .macro KERNEL4x16_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs6, vs16 xvmaddasp vs35, vs7, vs16 xvmaddasp vs36, vs4, vs17 xvmaddasp vs37, vs5, vs17 xvmaddasp vs38, vs6, vs17 xvmaddasp vs39, vs7, vs17 xvmaddasp vs40, vs4, vs18 xvmaddasp vs41, vs5, vs18 xvmaddasp vs42, vs6, vs18 xvmaddasp vs43, vs7, vs18 xvmaddasp vs44, vs4, vs19 xvmaddasp vs45, vs5, vs19 xvmaddasp vs46, vs6, vs19 xvmaddasp vs47, vs7, vs19 .endm .macro KERNEL4x16_SUBI1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi BO, BO, 64 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs2, vs8 xvmulsp vs35, vs3, vs8 xvmulsp vs36, vs0, vs9 xvmulsp vs37, vs1, vs9 xvmulsp vs38, vs2, vs9 xvmulsp vs39, vs3, vs9 xvmulsp vs40, vs0, vs10 xvmulsp vs41, vs1, vs10 xvmulsp vs42, vs2, vs10 xvmulsp vs43, vs3, vs10 xvmulsp vs44, vs0, vs11 xvmulsp vs45, vs1, vs11 xvmulsp vs46, vs2, vs11 xvmulsp vs47, vs3, vs11 .endm .macro KERNEL4x16_SUB1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs2, vs8 xvmaddasp vs35, vs3, vs8 xvmaddasp vs36, vs0, vs9 xvmaddasp vs37, vs1, vs9 xvmaddasp vs38, vs2, vs9 xvmaddasp vs39, vs3, vs9 xvmaddasp vs40, vs0, vs10 xvmaddasp vs41, vs1, vs10 xvmaddasp vs42, vs2, vs10 xvmaddasp vs43, vs3, vs10 xvmaddasp vs44, vs0, vs11 xvmaddasp vs45, vs1, vs11 xvmaddasp vs46, vs2, vs11 xvmaddasp vs47, vs3, vs11 .endm .macro SAVE4x16 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr xvmulsp vs1, vs33, alpha_vr xvmulsp vs2, vs34, alpha_vr xvmulsp vs3, vs35, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr xvmaddasp vs1, vs33, alpha_vr xvmaddasp vs2, vs34, alpha_vr xvmaddasp vs3, vs35, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs36, alpha_vr xvmulsp vs1, vs37, alpha_vr xvmulsp vs2, vs38, alpha_vr xvmulsp vs3, vs39, alpha_vr #else xvmaddasp vs0, vs36, alpha_vr xvmaddasp vs1, vs37, alpha_vr xvmaddasp vs2, vs38, alpha_vr xvmaddasp vs3, vs39, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs40, alpha_vr xvmulsp vs1, vs41, alpha_vr xvmulsp vs2, vs42, alpha_vr xvmulsp vs3, vs43, alpha_vr #else xvmaddasp vs0, vs40, alpha_vr xvmaddasp vs1, vs41, alpha_vr xvmaddasp vs2, vs42, alpha_vr xvmaddasp vs3, vs43, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs44, alpha_vr xvmulsp vs1, vs45, alpha_vr xvmulsp vs2, vs46, alpha_vr xvmulsp vs3, vs47, alpha_vr #else xvmaddasp vs0, vs44, alpha_vr xvmaddasp vs1, vs45, alpha_vr xvmaddasp vs2, vs46, alpha_vr xvmaddasp vs3, vs47, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, 
o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC addi CO, CO, 64 .endm /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ .macro LOAD4x8_1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi BO, BO, 64 .endm .macro KERNEL4x8_I1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs16, o0, T1 lxvw4x vs17, o16, T1 lxvw4x vs18, o32, T1 lxvw4x vs19, o48, T1 addi BO, BO, 64 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs0, vs9 xvmulsp vs35, vs1, vs9 xvmulsp vs36, vs0, vs10 xvmulsp vs37, vs1, vs10 xvmulsp vs38, vs0, vs11 xvmulsp vs39, vs1, vs11 .endm .macro KERNEL4x8_1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs16, o0, T1 lxvw4x vs17, o16, T1 lxvw4x vs18, o32, T1 lxvw4x vs19, o48, T1 addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs0, vs9 xvmaddasp vs35, vs1, vs9 xvmaddasp vs36, vs0, vs10 xvmaddasp vs37, vs1, vs10 xvmaddasp vs38, vs0, vs11 xvmaddasp vs39, vs1, vs11 .endm .macro KERNEL4x8_2 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi BO, BO, 64 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs4, vs17 xvmaddasp vs35, vs5, vs17 xvmaddasp vs36, vs4, vs18 xvmaddasp vs37, vs5, vs18 xvmaddasp vs38, vs4, vs19 xvmaddasp vs39, vs5, vs19 .endm .macro KERNEL4x8_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs4, vs17 xvmaddasp vs35, vs5, vs17 xvmaddasp vs36, vs4, vs18 xvmaddasp vs37, vs5, vs18 xvmaddasp vs38, vs4, vs19 xvmaddasp vs39, vs5, vs19 .endm .macro KERNEL4x8_SUBI1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi BO, BO, 64 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs0, vs9 xvmulsp vs35, vs1, vs9 xvmulsp vs36, vs0, vs10 xvmulsp vs37, vs1, vs10 xvmulsp vs38, vs0, vs11 xvmulsp vs39, vs1, vs11 .endm .macro KERNEL4x8_SUB1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs0, vs9 xvmaddasp vs35, vs1, vs9 xvmaddasp vs36, vs0, vs10 xvmaddasp vs37, vs1, vs10 xvmaddasp vs38, vs0, vs11 xvmaddasp vs39, vs1, vs11 .endm .macro SAVE4x8 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr xvmulsp vs1, vs33, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr xvmaddasp vs1, vs33, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs34, alpha_vr xvmulsp vs1, vs35, alpha_vr #else xvmaddasp vs0, vs34, alpha_vr xvmaddasp vs1, vs35, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs36, alpha_vr xvmulsp vs1, vs37, alpha_vr #else xvmaddasp vs0, vs36, alpha_vr xvmaddasp vs1, vs37, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 
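/* Fourth column of SAVE4x8: vs0/vs1 now hold the existing 8 C values for this
 * column (the load is compiled out for TRMMKERNEL, which overwrites C instead
 * of accumulating into it). */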
#endif #ifdef TRMMKERNEL xvmulsp vs0, vs38, alpha_vr xvmulsp vs1, vs39, alpha_vr #else xvmaddasp vs0, vs38, alpha_vr xvmaddasp vs1, vs39, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC addi CO, CO, 32 .endm /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ .macro LOAD4x4_1 lxvw4x vs0, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi BO, BO, 64 .endm .macro KERNEL4x4_I1 lxvw4x vs4, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs16, o0, T1 lxvw4x vs17, o16, T1 lxvw4x vs18, o32, T1 lxvw4x vs19, o48, T1 addi BO, BO, 64 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs0, vs9 xvmulsp vs34, vs0, vs10 xvmulsp vs35, vs0, vs11 .endm .macro KERNEL4x4_1 lxvw4x vs4, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs16, o0, T1 lxvw4x vs17, o16, T1 lxvw4x vs18, o32, T1 lxvw4x vs19, o48, T1 addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs0, vs9 xvmaddasp vs34, vs0, vs10 xvmaddasp vs35, vs0, vs11 .endm .macro KERNEL4x4_2 lxvw4x vs0, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi BO, BO, 64 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs4, vs17 xvmaddasp vs34, vs4, vs18 xvmaddasp vs35, vs4, vs19 .endm .macro KERNEL4x4_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs4, vs17 xvmaddasp vs34, vs4, vs18 xvmaddasp vs35, vs4, vs19 .endm .macro KERNEL4x4_SUBI1 lxvw4x vs0, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi BO, BO, 64 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs0, vs9 xvmulsp vs34, vs0, vs10 xvmulsp vs35, vs0, vs11 .endm .macro KERNEL4x4_SUB1 lxvw4x vs0, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 lxvw4x vs10, o32, T1 lxvw4x vs11, o48, T1 addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs0, vs9 xvmaddasp vs34, vs0, vs10 xvmaddasp vs35, vs0, vs11 .endm .macro SAVE4x4 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs33, alpha_vr #else xvmaddasp vs0, vs33, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs34, alpha_vr #else xvmaddasp vs0, vs34, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs35, alpha_vr #else xvmaddasp vs0, vs35, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC addi CO, CO, 16 .endm /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ .macro LOAD4x2_1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 lxsspx vs10, o32, T1 lxsspx vs11, o48, T1 addi BO, BO, 64 .endm .macro KERNEL4x2_I1 lxsspx vs4, o0, AO lxsspx vs5, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o16, T1 lxsspx vs18, o32, T1 lxsspx vs19, o48, T1 addi BO, BO, 64 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs1, vs8 xsmuldp vs34, vs0, vs9 xsmuldp vs35, vs1, vs9 xsmuldp vs36, vs0, vs10 xsmuldp vs37, vs1, vs10 xsmuldp vs38, 
vs0, vs11 xsmuldp vs39, vs1, vs11 .endm .macro KERNEL4x2_1 lxsspx vs4, o0, AO lxsspx vs5, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o16, T1 lxsspx vs18, o32, T1 lxsspx vs19, o48, T1 addi BO, BO, 64 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs1, vs8 xsmaddadp vs34, vs0, vs9 xsmaddadp vs35, vs1, vs9 xsmaddadp vs36, vs0, vs10 xsmaddadp vs37, vs1, vs10 xsmaddadp vs38, vs0, vs11 xsmaddadp vs39, vs1, vs11 .endm .macro KERNEL4x2_2 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 lxsspx vs10, o32, T1 lxsspx vs11, o48, T1 addi BO, BO, 64 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs5, vs16 xsmaddadp vs34, vs4, vs17 xsmaddadp vs35, vs5, vs17 xsmaddadp vs36, vs4, vs18 xsmaddadp vs37, vs5, vs18 xsmaddadp vs38, vs4, vs19 xsmaddadp vs39, vs5, vs19 .endm .macro KERNEL4x2_E2 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs5, vs16 xsmaddadp vs34, vs4, vs17 xsmaddadp vs35, vs5, vs17 xsmaddadp vs36, vs4, vs18 xsmaddadp vs37, vs5, vs18 xsmaddadp vs38, vs4, vs19 xsmaddadp vs39, vs5, vs19 .endm .macro KERNEL4x2_SUBI1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 lxsspx vs10, o32, T1 lxsspx vs11, o48, T1 addi BO, BO, 64 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs1, vs8 xsmuldp vs34, vs0, vs9 xsmuldp vs35, vs1, vs9 xsmuldp vs36, vs0, vs10 xsmuldp vs37, vs1, vs10 xsmuldp vs38, vs0, vs11 xsmuldp vs39, vs1, vs11 .endm .macro KERNEL4x2_SUB1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 lxsspx vs10, o32, T1 lxsspx vs11, o48, T1 addi BO, BO, 64 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs1, vs8 xsmaddadp vs34, vs0, vs9 xsmaddadp vs35, vs1, vs9 xsmaddadp vs36, vs0, vs10 xsmaddadp vs37, vs1, vs10 xsmaddadp vs38, vs0, vs11 xsmaddadp vs39, vs1, vs11 .endm .macro SAVE4x2 mr T1, CO #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs32, alpha_r xsmuldp vs1, vs33, alpha_r #else xsmaddadp vs0, vs32, alpha_r xsmaddadp vs1, vs33, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs34, alpha_r xsmuldp vs1, vs35, alpha_r #else xsmaddadp vs0, vs34, alpha_r xsmaddadp vs1, vs35, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs36, alpha_r xsmuldp vs1, vs37, alpha_r #else xsmaddadp vs0, vs36, alpha_r xsmaddadp vs1, vs37, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs38, alpha_r xsmuldp vs1, vs39, alpha_r #else xsmaddadp vs0, vs38, alpha_r xsmaddadp vs1, vs39, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC addi CO, CO, 8 .endm /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ .macro LOAD4x1_1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 lxsspx vs10, o32, T1 lxsspx vs11, o48, T1 addi BO, BO, 64 .endm .macro KERNEL4x1_I1 lxsspx vs4, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o16, T1 lxsspx vs18, o32, T1 lxsspx vs19, o48, T1 addi BO, BO, 64 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs0, vs9 xsmuldp vs34, vs0, vs10 xsmuldp vs35, 
vs0, vs11 .endm .macro KERNEL4x1_1 lxsspx vs4, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o16, T1 lxsspx vs18, o32, T1 lxsspx vs19, o48, T1 addi BO, BO, 64 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs0, vs9 xsmaddadp vs34, vs0, vs10 xsmaddadp vs35, vs0, vs11 .endm .macro KERNEL4x1_2 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 lxsspx vs10, o32, T1 lxsspx vs11, o48, T1 addi BO, BO, 64 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs4, vs17 xsmaddadp vs34, vs4, vs18 xsmaddadp vs35, vs4, vs19 .endm .macro KERNEL4x1_E2 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs4, vs17 xsmaddadp vs34, vs4, vs18 xsmaddadp vs35, vs4, vs19 .endm .macro KERNEL4x1_SUBI1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 lxsspx vs10, o32, T1 lxsspx vs11, o48, T1 addi BO, BO, 64 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs0, vs9 xsmuldp vs34, vs0, vs10 xsmuldp vs35, vs0, vs11 .endm .macro KERNEL4x1_SUB1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 lxsspx vs10, o32, T1 lxsspx vs11, o48, T1 addi BO, BO, 64 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs0, vs9 xsmaddadp vs34, vs0, vs10 xsmaddadp vs35, vs0, vs11 .endm .macro SAVE4x1 mr T1, CO #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs32, alpha_r #else xsmaddadp vs0, vs32, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs33, alpha_r #else xsmaddadp vs0, vs33, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs34, alpha_r #else xsmaddadp vs0, vs34, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs35, alpha_r #else xsmaddadp vs0, vs35, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC addi CO, CO, 4 .endm /********************************************************************************************** * Macros for N=2 and M=16 **********************************************************************************************/ .macro LOAD2x16_1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 addi BO, BO, 32 .endm .macro KERNEL2x16_I1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO lxvw4x vs6, o32, AO lxvw4x vs7, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs16, o0, T1 lxvw4x vs17, o16, T1 addi BO, BO, 32 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs2, vs8 xvmulsp vs35, vs3, vs8 xvmulsp vs36, vs0, vs9 xvmulsp vs37, vs1, vs9 xvmulsp vs38, vs2, vs9 xvmulsp vs39, vs3, vs9 .endm .macro KERNEL2x16_1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO lxvw4x vs6, o32, AO lxvw4x vs7, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs16, o0, T1 lxvw4x vs17, o16, T1 addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs2, vs8 xvmaddasp vs35, vs3, vs8 xvmaddasp vs36, vs0, vs9 xvmaddasp vs37, vs1, vs9 xvmaddasp vs38, vs2, vs9 xvmaddasp vs39, vs3, vs9 .endm .macro KERNEL2x16_2 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs6, vs16 xvmaddasp vs35, vs7, vs16 xvmaddasp vs36, vs4, vs17 xvmaddasp vs37, vs5, vs17 xvmaddasp vs38, vs6, vs17 xvmaddasp vs39, vs7, vs17 .endm .macro KERNEL2x16_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp 
vs33, vs5, vs16 xvmaddasp vs34, vs6, vs16 xvmaddasp vs35, vs7, vs16 xvmaddasp vs36, vs4, vs17 xvmaddasp vs37, vs5, vs17 xvmaddasp vs38, vs6, vs17 xvmaddasp vs39, vs7, vs17 .endm .macro KERNEL2x16_SUBI1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 addi BO, BO, 32 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs2, vs8 xvmulsp vs35, vs3, vs8 xvmulsp vs36, vs0, vs9 xvmulsp vs37, vs1, vs9 xvmulsp vs38, vs2, vs9 xvmulsp vs39, vs3, vs9 .endm .macro KERNEL2x16_SUB1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs2, vs8 xvmaddasp vs35, vs3, vs8 xvmaddasp vs36, vs0, vs9 xvmaddasp vs37, vs1, vs9 xvmaddasp vs38, vs2, vs9 xvmaddasp vs39, vs3, vs9 .endm .macro SAVE2x16 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr xvmulsp vs1, vs33, alpha_vr xvmulsp vs2, vs34, alpha_vr xvmulsp vs3, vs35, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr xvmaddasp vs1, vs33, alpha_vr xvmaddasp vs2, vs34, alpha_vr xvmaddasp vs3, vs35, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs36, alpha_vr xvmulsp vs1, vs37, alpha_vr xvmulsp vs2, vs38, alpha_vr xvmulsp vs3, vs39, alpha_vr #else xvmaddasp vs0, vs36, alpha_vr xvmaddasp vs1, vs37, alpha_vr xvmaddasp vs2, vs38, alpha_vr xvmaddasp vs3, vs39, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC addi CO, CO, 64 .endm /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ .macro LOAD2x8_1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 addi BO, BO, 32 .endm .macro KERNEL2x8_I1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs16, o0, T1 lxvw4x vs17, o16, T1 addi BO, BO, 32 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs0, vs9 xvmulsp vs35, vs1, vs9 .endm .macro KERNEL2x8_1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs16, o0, T1 lxvw4x vs17, o16, T1 addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs0, vs9 xvmaddasp vs35, vs1, vs9 .endm .macro KERNEL2x8_2 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs4, vs17 xvmaddasp vs35, vs5, vs17 .endm .macro KERNEL2x8_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs4, vs17 xvmaddasp vs35, vs5, vs17 .endm .macro KERNEL2x8_SUBI1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 addi BO, BO, 32 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs0, vs9 xvmulsp vs35, vs1, vs9 .endm .macro KERNEL2x8_SUB1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 addi BO, BO, 32 xvmaddasp vs32, vs0, 
vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs0, vs9 xvmaddasp vs35, vs1, vs9 .endm .macro SAVE2x8 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr xvmulsp vs1, vs33, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr xvmaddasp vs1, vs33, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs34, alpha_vr xvmulsp vs1, vs35, alpha_vr #else xvmaddasp vs0, vs34, alpha_vr xvmaddasp vs1, vs35, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC addi CO, CO, 32 .endm /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ .macro LOAD2x4_1 lxvw4x vs0, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 addi BO, BO, 32 .endm .macro KERNEL2x4_I1 lxvw4x vs4, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs16, o0, T1 lxvw4x vs17, o16, T1 addi BO, BO, 32 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs0, vs9 .endm .macro KERNEL2x4_1 lxvw4x vs4, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs16, o0, T1 lxvw4x vs17, o16, T1 addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs0, vs9 .endm .macro KERNEL2x4_2 lxvw4x vs0, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs4, vs17 .endm .macro KERNEL2x4_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs4, vs17 .endm .macro KERNEL2x4_SUBI1 lxvw4x vs0, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 addi BO, BO, 32 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs0, vs9 .endm .macro KERNEL2x4_SUB1 lxvw4x vs0, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs8, o0, T1 lxvw4x vs9, o16, T1 addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs0, vs9 .endm .macro SAVE2x4 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs33, alpha_vr #else xvmaddasp vs0, vs33, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC addi CO, CO, 16 .endm /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ .macro LOAD2x2_1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 addi BO, BO, 32 .endm .macro KERNEL2x2_I1 lxsspx vs4, o0, AO lxsspx vs5, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o16, T1 addi BO, BO, 32 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs1, vs8 xsmuldp vs34, vs0, vs9 xsmuldp vs35, vs1, vs9 .endm .macro KERNEL2x2_1 lxsspx vs4, o0, AO lxsspx vs5, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o16, T1 addi BO, BO, 32 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs1, vs8 xsmaddadp vs34, vs0, vs9 xsmaddadp vs35, vs1, vs9 .endm .macro KERNEL2x2_2 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 addi BO, BO, 32 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs5, vs16 xsmaddadp vs34, vs4, vs17 xsmaddadp vs35, vs5, vs17 .endm .macro KERNEL2x2_E2 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs5, vs16 xsmaddadp vs34, vs4, vs17 xsmaddadp 
vs35, vs5, vs17 .endm .macro KERNEL2x2_SUBI1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 addi BO, BO, 32 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs1, vs8 xsmuldp vs34, vs0, vs9 xsmuldp vs35, vs1, vs9 .endm .macro KERNEL2x2_SUB1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 addi BO, BO, 32 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs1, vs8 xsmaddadp vs34, vs0, vs9 xsmaddadp vs35, vs1, vs9 .endm .macro SAVE2x2 mr T1, CO #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs32, alpha_r xsmuldp vs1, vs33, alpha_r #else xsmaddadp vs0, vs32, alpha_r xsmaddadp vs1, vs33, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs34, alpha_r xsmuldp vs1, vs35, alpha_r #else xsmaddadp vs0, vs34, alpha_r xsmaddadp vs1, vs35, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC addi CO, CO, 8 .endm /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ .macro LOAD2x1_1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 addi BO, BO, 32 .endm .macro KERNEL2x1_I1 lxsspx vs4, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o16, T1 addi BO, BO, 32 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs0, vs9 .endm .macro KERNEL2x1_1 lxsspx vs4, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o16, T1 addi BO, BO, 32 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs0, vs9 .endm .macro KERNEL2x1_2 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 addi BO, BO, 32 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs4, vs17 .endm .macro KERNEL2x1_E2 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs4, vs17 .endm .macro KERNEL2x1_SUBI1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 addi BO, BO, 32 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs0, vs9 .endm .macro KERNEL2x1_SUB1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o16, T1 addi BO, BO, 32 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs0, vs9 .endm .macro SAVE2x1 mr T1, CO #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs32, alpha_r #else xsmaddadp vs0, vs32, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs33, alpha_r #else xsmaddadp vs0, vs33, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC addi CO, CO, 4 .endm /********************************************************************************************** * Macros for N=1 and M=16 **********************************************************************************************/ .macro LOAD1x16_1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs8, o0, T1 addi BO, BO, 16 .endm .macro KERNEL1x16_I1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO lxvw4x vs6, o32, AO lxvw4x vs7, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs16, o0, T1 addi BO, BO, 16 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs2, vs8 xvmulsp vs35, vs3, vs8 .endm .macro KERNEL1x16_1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO lxvw4x vs6, o32, AO lxvw4x vs7, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs16, o0, T1 addi BO, BO, 
16 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs2, vs8 xvmaddasp vs35, vs3, vs8 .endm .macro KERNEL1x16_2 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs8, o0, T1 addi BO, BO, 16 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs6, vs16 xvmaddasp vs35, vs7, vs16 .endm .macro KERNEL1x16_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs6, vs16 xvmaddasp vs35, vs7, vs16 .endm .macro KERNEL1x16_SUBI1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs8, o0, T1 addi BO, BO, 16 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs2, vs8 xvmulsp vs35, vs3, vs8 .endm .macro KERNEL1x16_SUB1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 mr T1, BO lxvw4x vs8, o0, T1 addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs2, vs8 xvmaddasp vs35, vs3, vs8 .endm .macro SAVE1x16 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr xvmulsp vs1, vs33, alpha_vr xvmulsp vs2, vs34, alpha_vr xvmulsp vs3, vs35, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr xvmaddasp vs1, vs33, alpha_vr xvmaddasp vs2, vs34, alpha_vr xvmaddasp vs3, vs35, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC addi CO, CO, 64 .endm /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ .macro LOAD1x8_1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs8, o0, T1 addi BO, BO, 16 .endm .macro KERNEL1x8_I1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs16, o0, T1 addi BO, BO, 16 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 .endm .macro KERNEL1x8_1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs16, o0, T1 addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 .endm .macro KERNEL1x8_2 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs8, o0, T1 addi BO, BO, 16 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 .endm .macro KERNEL1x8_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 .endm .macro KERNEL1x8_SUBI1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs8, o0, T1 addi BO, BO, 16 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 .endm .macro KERNEL1x8_SUB1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 mr T1, BO lxvw4x vs8, o0, T1 addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 .endm .macro SAVE1x8 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr xvmulsp vs1, vs33, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr xvmaddasp vs1, vs33, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC addi CO, CO, 32 .endm /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ .macro LOAD1x4_1 lxvw4x vs0, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs8, o0, T1 addi BO, BO, 16 .endm .macro KERNEL1x4_I1 lxvw4x vs4, o0, AO addi AO, AO, 16 mr T1, BO 
lxvw4x vs16, o0, T1 addi BO, BO, 16 xvmulsp vs32, vs0, vs8 .endm .macro KERNEL1x4_1 lxvw4x vs4, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs16, o0, T1 addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 .endm .macro KERNEL1x4_2 lxvw4x vs0, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs8, o0, T1 addi BO, BO, 16 xvmaddasp vs32, vs4, vs16 .endm .macro KERNEL1x4_E2 xvmaddasp vs32, vs4, vs16 .endm .macro KERNEL1x4_SUBI1 lxvw4x vs0, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs8, o0, T1 addi BO, BO, 16 xvmulsp vs32, vs0, vs8 .endm .macro KERNEL1x4_SUB1 lxvw4x vs0, o0, AO addi AO, AO, 16 mr T1, BO lxvw4x vs8, o0, T1 addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 .endm .macro SAVE1x4 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC addi CO, CO, 16 .endm /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ .macro LOAD1x2_1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 addi BO, BO, 16 .endm .macro KERNEL1x2_I1 lxsspx vs4, o0, AO lxsspx vs5, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 addi BO, BO, 16 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs1, vs8 .endm .macro KERNEL1x2_1 lxsspx vs4, o0, AO lxsspx vs5, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 addi BO, BO, 16 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs1, vs8 .endm .macro KERNEL1x2_2 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 addi BO, BO, 16 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs5, vs16 .endm .macro KERNEL1x2_E2 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs5, vs16 .endm .macro KERNEL1x2_SUBI1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 addi BO, BO, 16 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs1, vs8 .endm .macro KERNEL1x2_SUB1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 addi BO, BO, 16 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs1, vs8 .endm .macro SAVE1x2 mr T1, CO #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs32, alpha_r xsmuldp vs1, vs33, alpha_r #else xsmaddadp vs0, vs32, alpha_r xsmaddadp vs1, vs33, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC addi CO, CO, 8 .endm /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ .macro LOAD1x1_1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 addi BO, BO, 16 .endm .macro KERNEL1x1_I1 lxsspx vs4, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs16, o0, T1 addi BO, BO, 16 xsmuldp vs32, vs0, vs8 .endm .macro KERNEL1x1_1 lxsspx vs4, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs16, o0, T1 addi BO, BO, 16 xsmaddadp vs32, vs0, vs8 .endm .macro KERNEL1x1_2 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 addi BO, BO, 16 xsmaddadp vs32, vs4, vs16 .endm .macro KERNEL1x1_E2 xsmaddadp vs32, vs4, vs16 .endm .macro KERNEL1x1_SUBI1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 addi BO, BO, 16 xsmuldp vs32, vs0, vs8 .endm .macro KERNEL1x1_SUB1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 addi BO, BO, 16 xsmaddadp vs32, vs0, vs8 .endm .macro SAVE1x1 mr T1, CO #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef 
TRMMKERNEL xsmuldp vs0, vs32, alpha_r #else xsmaddadp vs0, vs32, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC addi CO, CO, 4 .endm .macro COPYB_4x8 lxvw4x vs5, o0, BO xxspltw vs6, vs5, 0 xxspltw vs7, vs5, 1 xxspltw vs8, vs5, 2 xxspltw vs9, vs5, 3 lxvw4x vs10, o16, BO xxspltw vs11, vs10, 0 xxspltw vs12, vs10, 1 xxspltw vs13, vs10, 2 xxspltw vs14, vs10, 3 lxvw4x vs15, o32, BO xxspltw vs16, vs15, 0 xxspltw vs17, vs15, 1 xxspltw vs18, vs15, 2 xxspltw vs19, vs15, 3 lxvw4x vs20, o48, BO xxspltw vs21, vs20, 0 xxspltw vs22, vs20, 1 xxspltw vs23, vs20, 2 xxspltw vs24, vs20, 3 addi BO, BO, 64 lxvw4x vs35, o0, BO xxspltw vs36, vs35, 0 xxspltw vs37, vs35, 1 xxspltw vs38, vs35, 2 xxspltw vs39, vs35, 3 lxvw4x vs40, o16, BO xxspltw vs41, vs40, 0 xxspltw vs42, vs40, 1 xxspltw vs43, vs40, 2 xxspltw vs44, vs40, 3 lxvw4x vs45, o32, BO xxspltw vs46, vs45, 0 xxspltw vs47, vs45, 1 xxspltw vs48, vs45, 2 xxspltw vs49, vs45, 3 lxvw4x vs50, o48, BO xxspltw vs51, vs50, 0 xxspltw vs52, vs50, 1 xxspltw vs53, vs50, 2 xxspltw vs54, vs50, 3 addi BO, BO, 64 stxvw4x vs6, o0, BBO stxvw4x vs7, o16, BBO stxvw4x vs8, o32, BBO stxvw4x vs9, o48, BBO addi BBO, BBO, 64 stxvw4x vs11, o0, BBO stxvw4x vs12, o16, BBO stxvw4x vs13, o32, BBO stxvw4x vs14, o48, BBO addi BBO, BBO, 64 stxvw4x vs16, o0, BBO stxvw4x vs17, o16, BBO stxvw4x vs18, o32, BBO stxvw4x vs19, o48, BBO addi BBO, BBO, 64 stxvw4x vs21, o0, BBO stxvw4x vs22, o16, BBO stxvw4x vs23, o32, BBO stxvw4x vs24, o48, BBO addi BBO, BBO, 64 stxvw4x vs36, o0, BBO stxvw4x vs37, o16, BBO stxvw4x vs38, o32, BBO stxvw4x vs39, o48, BBO addi BBO, BBO, 64 stxvw4x vs41, o0, BBO stxvw4x vs42, o16, BBO stxvw4x vs43, o32, BBO stxvw4x vs44, o48, BBO addi BBO, BBO, 64 stxvw4x vs46, o0, BBO stxvw4x vs47, o16, BBO stxvw4x vs48, o32, BBO stxvw4x vs49, o48, BBO addi BBO, BBO, 64 stxvw4x vs51, o0, BBO stxvw4x vs52, o16, BBO stxvw4x vs53, o32, BBO stxvw4x vs54, o48, BBO addi BBO, BBO, 64 .endm .macro COPYB_1x8 lxvw4x vs5, o0, BO xxspltw vs6, vs5, 0 xxspltw vs7, vs5, 1 xxspltw vs8, vs5, 2 xxspltw vs9, vs5, 3 lxvw4x vs10, o16, BO xxspltw vs11, vs10, 0 xxspltw vs12, vs10, 1 xxspltw vs13, vs10, 2 xxspltw vs14, vs10, 3 addi BO, BO, 32 stxvw4x vs6, o0, BBO stxvw4x vs7, o16, BBO stxvw4x vs8, o32, BBO stxvw4x vs9, o48, BBO addi BBO, BBO, 64 stxvw4x vs11, o0, BBO stxvw4x vs12, o16, BBO stxvw4x vs13, o32, BBO stxvw4x vs14, o48, BBO addi BBO, BBO, 64 .endm OpenBLAS-0.2.20/kernel/power/sgemm_tcopy_16_power8.S000066400000000000000000000154301313527062700220470ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/04/21 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "def_vsx.h" #define M r3 #define N r4 #define A r5 #define LDA r6 #define B r7 #define A0 r8 #define A1 r9 #define A2 r10 #define A3 r11 #define J r12 #define PREA r14 #define PREB r15 #define BO r16 #define B8 r17 #define B4 r18 #define B2 r19 #define B1 r20 #define o4 r21 #define T2 r22 #define I r23 #define o16 r24 #define o32 r25 #define o48 r26 #define B16 r29 #define M16 r30 #define T1 r31 #define o0 0 #include "sgemm_tcopy_macros_16_power8.S" #define STACKSIZE 384 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) std r14, 280(SP) cmpwi cr0, M, 0 ble- L999 cmpwi cr0, N, 0 ble- L999 slwi LDA, LDA, BASE_SHIFT slwi M16, M, 4 + BASE_SHIFT li T1, -16 li T2, -8 li PREA, -4 li PREB, -2 and B8, N, T1 and B4, N, T2 and B2, N, PREA and B1, N, PREB mullw B8, B8, M mullw B4, B4, M mullw B2, B2, M mullw B1, B1, M slwi B8, B8, BASE_SHIFT slwi B4, B4, BASE_SHIFT slwi B2, B2, BASE_SHIFT slwi B1, B1, BASE_SHIFT add B8, B8, B add B4, B4, B add B2, B2, B add B1, B1, B li PREA, 768 addi PREB, M16, 128 li o4, 4 li o16, 16 li o32, 32 li o48, 48 #include "sgemm_tcopy_logic_16_power8.S" L999: li r3, 0 ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/sgemm_tcopy_8_power8.S000066400000000000000000000153011313527062700217650ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/04/23 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "def_vsx.h" #define M r3 #define N r4 #define A r5 #define LDA r6 #define B r7 #define A0 r8 #define A1 r9 #define A2 r10 #define A3 r11 #define J r12 #define PREA r14 #define PREB r15 #define BO r16 #define B8 r17 #define B4 r18 #define B2 r19 #define B1 r20 #define o4 r21 #define T2 r22 #define I r23 #define o16 r24 #define o32 r25 #define o48 r26 #define NOTU1 r29 #define M8 r30 #define T1 r31 #define o0 0 #include "sgemm_tcopy_macros_8_power8.S" #define STACKSIZE 384 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) std r14, 280(SP) cmpwi cr0, M, 0 ble- L999 cmpwi cr0, N, 0 ble- L999 slwi LDA, LDA, BASE_SHIFT slwi M8, M, 3 + BASE_SHIFT li T2, -8 li PREA, -4 li PREB, -2 and B4, N, T2 and B2, N, PREA and B1, N, PREB mullw B4, B4, M mullw B2, B2, M mullw B1, B1, M slwi B4, B4, BASE_SHIFT slwi B2, B2, BASE_SHIFT slwi B1, B1, BASE_SHIFT add B4, B4, B add B2, B2, B add B1, B1, B li PREA, 384 addi PREB, M8, 128 li o4, 4 li o16, 16 li o32, 32 li o48, 48 #include "sgemm_tcopy_logic_8_power8.S" L999: li r3, 0 ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/sgemm_tcopy_logic_16_power8.S000066400000000000000000000122701313527062700232230ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/04/21 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ srawi. I, M, 2 ble SCOPYT_L2_BEGIN SCOPYT_L4_BEGIN: mr A0, A add A1, A0, LDA add A2, A1, LDA add A3, A2, LDA add A, A3, LDA mr B16, B addi B, B, 64*SIZE sradi. J, N, 4 ble SCOPYT_L4x8_BEGIN mr BO, B16 SCOPYT_L4x16_LOOP: dcbtst BO, M16 dcbtst BO, PREB dcbt A0, PREA dcbt A1, PREA dcbt A2, PREA dcbt A3, PREA COPY_4x16 addi A0, A0, 16*SIZE addi A1, A1, 16*SIZE addi A2, A2, 16*SIZE addi A3, A3, 16*SIZE add BO, BO, M16 addic. J, J, -1 ble SCOPYT_L4x8_BEGIN dcbtst BO, M16 dcbtst BO, PREB COPY_4x16 addi A0, A0, 16*SIZE addi A1, A1, 16*SIZE addi A2, A2, 16*SIZE addi A3, A3, 16*SIZE add BO, BO, M16 addic. J, J, -1 bgt SCOPYT_L4x16_LOOP SCOPYT_L4x8_BEGIN: andi. T1, N, 8 ble SCOPYT_L4x4_BEGIN mr BO, B8 COPY_4x8 addi A0, A0, 8*SIZE addi A1, A1, 8*SIZE addi A2, A2, 8*SIZE addi A3, A3, 8*SIZE addi B8, B8, 32*SIZE SCOPYT_L4x4_BEGIN: andi. T1, N, 4 ble SCOPYT_L4x2_BEGIN mr BO, B4 COPY_4x4 addi A0, A0, 4*SIZE addi A1, A1, 4*SIZE addi A2, A2, 4*SIZE addi A3, A3, 4*SIZE addi B4, B4, 16*SIZE SCOPYT_L4x2_BEGIN: andi. T1, N, 2 ble SCOPYT_L4x1_BEGIN mr BO, B2 COPY_4x2 addi A0, A0, 2*SIZE addi A1, A1, 2*SIZE addi A2, A2, 2*SIZE addi A3, A3, 2*SIZE addi B2, B2, 8*SIZE SCOPYT_L4x1_BEGIN: andi. T1, N, 1 ble SCOPYT_L4_END mr BO, B1 COPY_4x1 addi A0, A0, 1*SIZE addi A1, A1, 1*SIZE addi A2, A2, 1*SIZE addi A3, A3, 1*SIZE addi B1, B1, 4*SIZE SCOPYT_L4_END: addic. I, I, -1 bgt SCOPYT_L4_BEGIN SCOPYT_L2_BEGIN: andi. T1, M, 2 ble SCOPYT_L1_BEGIN mr A0, A add A1, A0, LDA add A, A1, LDA mr B16, B addi B, B, 32*SIZE sradi. J, N, 4 ble SCOPYT_L2x8_BEGIN mr BO, B16 SCOPYT_L2x16_LOOP: COPY_2x16 addi A0, A0, 16*SIZE addi A1, A1, 16*SIZE add BO, BO, M16 addic. J, J, -1 bgt SCOPYT_L2x16_LOOP SCOPYT_L2x8_BEGIN: andi. T1, N, 8 ble SCOPYT_L2x4_BEGIN mr BO, B8 COPY_2x8 addi A0, A0, 8*SIZE addi A1, A1, 8*SIZE addi B8, B8, 16*SIZE SCOPYT_L2x4_BEGIN: andi. T1, N, 4 ble SCOPYT_L2x2_BEGIN mr BO, B4 COPY_2x4 addi A0, A0, 4*SIZE addi A1, A1, 4*SIZE addi B4, B4, 8*SIZE SCOPYT_L2x2_BEGIN: andi. T1, N, 2 ble SCOPYT_L2x1_BEGIN mr BO, B2 COPY_2x2 addi A0, A0, 2*SIZE addi A1, A1, 2*SIZE addi B2, B2, 4*SIZE SCOPYT_L2x1_BEGIN: andi. T1, N, 1 ble SCOPYT_L2_END mr BO, B1 COPY_2x1 addi A0, A0, 1*SIZE addi A1, A1, 1*SIZE addi B1, B1, 2*SIZE SCOPYT_L2_END: SCOPYT_L1_BEGIN: andi. T1, M, 1 ble L999 mr A0, A add A, A0, LDA mr B16, B addi B, B, 16*SIZE sradi. J, N, 4 ble SCOPYT_L1x8_BEGIN mr BO, B16 SCOPYT_L1x16_LOOP: COPY_1x16 addi A0, A0, 16*SIZE add BO, BO, M16 addic. J, J, -1 bgt SCOPYT_L1x16_LOOP SCOPYT_L1x8_BEGIN: andi. T1, N, 8 ble SCOPYT_L1x4_BEGIN mr BO, B8 COPY_1x8 addi A0, A0, 8*SIZE addi B8, B8, 8*SIZE SCOPYT_L1x4_BEGIN: andi. T1, N, 4 ble SCOPYT_L1x2_BEGIN mr BO, B4 COPY_1x4 addi A0, A0, 4*SIZE addi B4, B4, 4*SIZE SCOPYT_L1x2_BEGIN: andi. T1, N, 2 ble SCOPYT_L1x1_BEGIN mr BO, B2 COPY_1x2 addi A0, A0, 2*SIZE addi B2, B2, 2*SIZE SCOPYT_L1x1_BEGIN: andi. 
T1, N, 1 ble SCOPYT_L1_END mr BO, B1 COPY_1x1 addi A0, A0, 1*SIZE addi B1, B1, 1*SIZE SCOPYT_L1_END: OpenBLAS-0.2.20/kernel/power/sgemm_tcopy_logic_8_power8.S000066400000000000000000000117341313527062700231500ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/04/23 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ srawi. I, M, 2 ble SCOPYOT_L2_BEGIN SCOPYOT_L4_BEGIN: mr A0, A add A1, A0, LDA add A2, A1, LDA add A3, A2, LDA add A, A3, LDA mr B8, B addi B, B, 32*SIZE sradi. J, N, 3 ble SCOPYOT_L4x4_BEGIN mr BO, B8 .align 5 SCOPYOT_L4x8_LOOP: dcbt A0, PREA dcbt A1, PREA dcbt A2, PREA dcbt A3, PREA COPY_4x8 addi A0, A0, 8*SIZE addi A1, A1, 8*SIZE addi A2, A2, 8*SIZE addi A3, A3, 8*SIZE add BO, BO, M8 addic. J, J, -1 ble SCOPYOT_L4x4_BEGIN COPY_4x8 addi A0, A0, 8*SIZE addi A1, A1, 8*SIZE addi A2, A2, 8*SIZE addi A3, A3, 8*SIZE add BO, BO, M8 addic. J, J, -1 ble SCOPYOT_L4x4_BEGIN COPY_4x8 addi A0, A0, 8*SIZE addi A1, A1, 8*SIZE addi A2, A2, 8*SIZE addi A3, A3, 8*SIZE add BO, BO, M8 addic. J, J, -1 ble SCOPYOT_L4x4_BEGIN COPY_4x8 addi A0, A0, 8*SIZE addi A1, A1, 8*SIZE addi A2, A2, 8*SIZE addi A3, A3, 8*SIZE add BO, BO, M8 addic. J, J, -1 bgt SCOPYOT_L4x8_LOOP SCOPYOT_L4x4_BEGIN: andi. T1, N, 4 ble SCOPYOT_L4x2_BEGIN mr BO, B4 COPY_4x4 addi A0, A0, 4*SIZE addi A1, A1, 4*SIZE addi A2, A2, 4*SIZE addi A3, A3, 4*SIZE addi B4, B4, 16*SIZE SCOPYOT_L4x2_BEGIN: andi. T1, N, 2 ble SCOPYOT_L4x1_BEGIN mr BO, B2 COPY_4x2 addi A0, A0, 2*SIZE addi A1, A1, 2*SIZE addi A2, A2, 2*SIZE addi A3, A3, 2*SIZE addi B2, B2, 8*SIZE SCOPYOT_L4x1_BEGIN: andi. 
T1, N, 1 ble SCOPYOT_L4_END mr BO, B1 COPY_4x1 addi A0, A0, 1*SIZE addi A1, A1, 1*SIZE addi A2, A2, 1*SIZE addi A3, A3, 1*SIZE addi B1, B1, 4*SIZE SCOPYOT_L4_END: addic. I, I, -1 bgt SCOPYOT_L4_BEGIN SCOPYOT_L2_BEGIN: andi. T1, M, 2 ble SCOPYOT_L1_BEGIN mr A0, A add A1, A0, LDA add A, A1, LDA mr B8, B addi B, B, 16*SIZE sradi. J, N, 3 ble SCOPYOT_L2x4_BEGIN mr BO, B8 SCOPYOT_L2x8_LOOP: COPY_2x8 addi A0, A0, 8*SIZE addi A1, A1, 8*SIZE add BO, BO, M8 addic. J, J, -1 bgt SCOPYOT_L2x8_LOOP SCOPYOT_L2x4_BEGIN: andi. T1, N, 4 ble SCOPYOT_L2x2_BEGIN mr BO, B4 COPY_2x4 addi A0, A0, 4*SIZE addi A1, A1, 4*SIZE addi B4, B4, 8*SIZE SCOPYOT_L2x2_BEGIN: andi. T1, N, 2 ble SCOPYOT_L2x1_BEGIN mr BO, B2 COPY_2x2 addi A0, A0, 2*SIZE addi A1, A1, 2*SIZE addi B2, B2, 4*SIZE SCOPYOT_L2x1_BEGIN: andi. T1, N, 1 ble SCOPYOT_L2_END mr BO, B1 COPY_2x1 addi A0, A0, 1*SIZE addi A1, A1, 1*SIZE addi B1, B1, 2*SIZE SCOPYOT_L2_END: SCOPYOT_L1_BEGIN: andi. T1, M, 1 ble L999 mr A0, A add A, A0, LDA mr B8, B addi B, B, 8*SIZE sradi. J, N, 3 ble SCOPYOT_L1x4_BEGIN mr BO, B8 SCOPYOT_L1x8_LOOP: COPY_1x8 addi A0, A0, 8*SIZE add BO, BO, M8 addic. J, J, -1 bgt SCOPYOT_L1x8_LOOP SCOPYOT_L1x4_BEGIN: andi. T1, N, 4 ble SCOPYOT_L1x2_BEGIN mr BO, B4 COPY_1x4 addi A0, A0, 4*SIZE addi B4, B4, 4*SIZE SCOPYOT_L1x2_BEGIN: andi. T1, N, 2 ble SCOPYOT_L1x1_BEGIN mr BO, B2 COPY_1x2 addi A0, A0, 2*SIZE addi B2, B2, 2*SIZE SCOPYOT_L1x1_BEGIN: andi. T1, N, 1 ble SCOPYOT_L1_END mr BO, B1 COPY_1x1 addi A0, A0, 1*SIZE addi B1, B1, 1*SIZE SCOPYOT_L1_END: OpenBLAS-0.2.20/kernel/power/sgemm_tcopy_macros_16_power8.S000066400000000000000000000220421313527062700234100ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/04/21 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /********************************************************************************************** * Macros for N=4 and M=16 **********************************************************************************************/ .macro COPY_4x16 lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 lxvw4x vs34, o32, A0 lxvw4x vs35, o48, A0 lxvw4x vs36, o0, A1 lxvw4x vs37, o16, A1 lxvw4x vs38, o32, A1 lxvw4x vs39, o48, A1 lxvw4x vs40, o0, A2 lxvw4x vs41, o16, A2 lxvw4x vs42, o32, A2 lxvw4x vs43, o48, A2 lxvw4x vs44, o0, A3 lxvw4x vs45, o16, A3 lxvw4x vs46, o32, A3 lxvw4x vs47, o48, A3 mr T1, BO stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 addi T1, T1, 64 stxvw4x vs36, o0, T1 stxvw4x vs37, o16, T1 stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 addi T1, T1, 64 stxvw4x vs40, o0, T1 stxvw4x vs41, o16, T1 stxvw4x vs42, o32, T1 stxvw4x vs43, o48, T1 addi T1, T1, 64 stxvw4x vs44, o0, T1 stxvw4x vs45, o16, T1 stxvw4x vs46, o32, T1 stxvw4x vs47, o48, T1 .endm /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ .macro COPY_4x8 lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 lxvw4x vs34, o0, A1 lxvw4x vs35, o16, A1 lxvw4x vs36, o0, A2 lxvw4x vs37, o16, A2 lxvw4x vs38, o0, A3 lxvw4x vs39, o16, A3 mr T1, BO stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 addi T1, T1, 64 stxvw4x vs36, o0, T1 stxvw4x vs37, o16, T1 stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 .endm /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ .macro COPY_4x4 lxvw4x vs32, o0, A0 lxvw4x vs33, o0, A1 lxvw4x vs34, o0, A2 lxvw4x vs35, o0, A3 mr T1, BO stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 .endm /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ .macro COPY_4x2 lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 lxsspx vs34, o0, A1 lxsspx vs35, o4, A1 lxsspx vs36, o0, A2 lxsspx vs37, o4, A2 lxsspx vs38, o0, A3 lxsspx vs39, o4, A3 mr T1, BO stxsspx vs32, o0, T1 stxsspx vs33, o4, T1 addi T1, T1, 8 stxsspx vs34, o0, T1 stxsspx vs35, o4, T1 addi T1, T1, 8 stxsspx vs36, o0, T1 stxsspx vs37, o4, T1 addi T1, T1, 8 stxsspx vs38, o0, T1 stxsspx vs39, o4, T1 .endm /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ .macro COPY_4x1 lxsspx vs32, o0, A0 lxsspx vs33, o0, A1 lxsspx vs34, o0, A2 lxsspx vs35, o0, A3 mr T1, BO stxsspx vs32, o0, T1 stxsspx vs33, o4, T1 addi T1, T1, 8 stxsspx vs34, o0, T1 stxsspx vs35, o4, T1 .endm /********************************************************************************************** * Macros for N=2 and M=16 
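*
* Overview: COPY_2x16 packs a 2-row by 16-column single-precision tile.
* It loads 64 bytes from each of the two row pointers A0 and A1
* (lxvw4x moves 4 floats per load) and stores them as 32 consecutive
* floats at the packed-buffer pointer BO, using T1 as a scratch store
* pointer. Roughly equivalent C, treating A0, A1 and BO as float
* pointers (illustrative sketch only, not part of the build):
*
*   for (j = 0; j < 16; j++) BO[j]      = A0[j];
*   for (j = 0; j < 16; j++) BO[16 + j] = A1[j];
*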
**********************************************************************************************/ .macro COPY_2x16 lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 lxvw4x vs34, o32, A0 lxvw4x vs35, o48, A0 lxvw4x vs36, o0, A1 lxvw4x vs37, o16, A1 lxvw4x vs38, o32, A1 lxvw4x vs39, o48, A1 mr T1, BO stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 addi T1, T1, 64 stxvw4x vs36, o0, T1 stxvw4x vs37, o16, T1 stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 .endm /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ .macro COPY_2x8 lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 lxvw4x vs34, o0, A1 lxvw4x vs35, o16, A1 mr T1, BO stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 .endm /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ .macro COPY_2x4 lxvw4x vs32, o0, A0 lxvw4x vs33, o0, A1 mr T1, BO stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 .endm /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ .macro COPY_2x2 lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 lxsspx vs34, o0, A1 lxsspx vs35, o4, A1 mr T1, BO stxsspx vs32, o0, T1 stxsspx vs33, o4, T1 addi T1, T1, 8 stxsspx vs34, o0, T1 stxsspx vs35, o4, T1 .endm /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ .macro COPY_2x1 lxsspx vs32, o0, A0 lxsspx vs33, o0, A1 mr T1, BO stxsspx vs32, o0, T1 stxsspx vs33, o4, T1 .endm /********************************************************************************************** * Macros for N=1 and M=16 **********************************************************************************************/ .macro COPY_1x16 lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 lxvw4x vs34, o32, A0 lxvw4x vs35, o48, A0 mr T1, BO stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 .endm /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ .macro COPY_1x8 lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 mr T1, BO stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 .endm /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ .macro COPY_1x4 lxvw4x vs32, o0, A0 mr T1, BO stxvw4x vs32, o0, T1 .endm /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ .macro COPY_1x2 lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 mr T1, BO stxsspx vs32, o0, T1 stxsspx vs33, o4, T1 .endm /********************************************************************************************** * Macros for N=1 and M=1 
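*
* Overview: COPY_1x1 handles the final leftover element, copying one
* float from the row pointer A0 to the packed-buffer pointer BO
* (lxsspx and stxsspx move a single single-precision value). The
* COPY_NxM macros above all follow the same pattern: read an N-row by
* M-column tile of A and write it out contiguously at BO. Roughly
* equivalent C for this 1x1 case, treating A0 and BO as float
* pointers (illustrative sketch only, not part of the build):
*
*   BO[0] = A0[0];
*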
**********************************************************************************************/ .macro COPY_1x1 lxsspx vs32, o0, A0 mr T1, BO stxsspx vs32, o0, T1 .endm OpenBLAS-0.2.20/kernel/power/sgemm_tcopy_macros_8_power8.S000066400000000000000000000156571313527062700233470ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/04/23 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ .macro COPY_4x8 lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 lxvw4x vs34, o0, A1 lxvw4x vs35, o16, A1 lxvw4x vs36, o0, A2 lxvw4x vs37, o16, A2 lxvw4x vs38, o0, A3 lxvw4x vs39, o16, A3 mr T1, BO stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 addi T1, T1, 64 stxvw4x vs36, o0, T1 stxvw4x vs37, o16, T1 stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 .endm /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ .macro COPY_4x4 lxvw4x vs32, o0, A0 lxvw4x vs33, o0, A1 lxvw4x vs34, o0, A2 lxvw4x vs35, o0, A3 mr T1, BO stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 .endm /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ .macro COPY_4x2 lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 lxsspx vs34, o0, A1 lxsspx vs35, o4, A1 lxsspx vs36, o0, A2 lxsspx vs37, o4, A2 lxsspx vs38, o0, A3 lxsspx vs39, o4, A3 mr T1, BO stxsspx vs32, o0, T1 stxsspx vs33, o4, T1 addi T1, T1, 8 stxsspx vs34, o0, T1 stxsspx vs35, o4, T1 addi T1, T1, 8 stxsspx vs36, o0, T1 stxsspx vs37, o4, T1 addi T1, T1, 8 stxsspx vs38, o0, T1 stxsspx vs39, o4, T1 .endm /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ .macro COPY_4x1 lxsspx vs32, o0, A0 lxsspx vs33, o0, A1 lxsspx vs34, o0, A2 lxsspx vs35, o0, A3 mr T1, BO stxsspx vs32, o0, T1 stxsspx vs33, o4, T1 addi T1, T1, 8 stxsspx vs34, o0, T1 stxsspx vs35, o4, T1 .endm /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ .macro COPY_2x8 lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 lxvw4x vs34, o0, A1 lxvw4x vs35, o16, A1 mr T1, BO stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 .endm /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ .macro COPY_2x4 lxvw4x vs32, o0, A0 lxvw4x vs33, o0, A1 mr T1, BO stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 .endm /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ .macro COPY_2x2 lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 lxsspx vs34, o0, A1 lxsspx vs35, o4, A1 mr T1, BO stxsspx vs32, o0, T1 stxsspx vs33, o4, T1 addi 
T1, T1, 8 stxsspx vs34, o0, T1 stxsspx vs35, o4, T1 .endm /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ .macro COPY_2x1 lxsspx vs32, o0, A0 lxsspx vs33, o0, A1 mr T1, BO stxsspx vs32, o0, T1 stxsspx vs33, o4, T1 .endm /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ .macro COPY_1x8 lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 mr T1, BO stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 .endm /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ .macro COPY_1x4 lxvw4x vs32, o0, A0 mr T1, BO stxvw4x vs32, o0, T1 .endm /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ .macro COPY_1x2 lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 mr T1, BO stxsspx vs32, o0, T1 stxsspx vs33, o4, T1 .endm /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ .macro COPY_1x1 lxsspx vs32, o0, A0 mr T1, BO stxsspx vs32, o0, T1 .endm OpenBLAS-0.2.20/kernel/power/snrm2.S000066400000000000000000000207321313527062700167510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PREA r8 #define FZERO 144(SP) #define FONE 148(SP) #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r10, 0 lis r11, 0x3f80 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r10, FZERO stw r11, FONE lfs f1, FZERO #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT li PREA, 4 * 16 * SIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) fmr f0, f1 fmr f2, f1 fmr f3, f1 fmr f4, f1 fmr f5, f1 fmr f6, f1 fmr f7, f1 fmr f8, f1 fmr f9, f1 fmr f10, f1 fmr f11, f1 fmr f12, f1 fmr f13, f1 fmr f14, f1 fmr f15, f1 cmpwi cr0, INCX, SIZE bne- cr0, LL(1000) srawi. r0, N, 4 mtspr CTR, r0 beq- cr0, LL(150) LFD f16, 0 * SIZE(X) LFD f17, 1 * SIZE(X) LFD f18, 2 * SIZE(X) LFD f19, 3 * SIZE(X) LFD f20, 4 * SIZE(X) LFD f21, 5 * SIZE(X) LFD f22, 6 * SIZE(X) LFD f23, 7 * SIZE(X) LFD f24, 8 * SIZE(X) LFD f25, 9 * SIZE(X) LFD f26, 10 * SIZE(X) LFD f27, 11 * SIZE(X) LFD f28, 12 * SIZE(X) LFD f29, 13 * SIZE(X) LFD f30, 14 * SIZE(X) LFD f31, 15 * SIZE(X) bdz LL(120) .align 4 LL(110): fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 fmadd f2, f18, f18, f2 fmadd f3, f19, f19, f3 LFD f16, 16 * SIZE(X) LFD f17, 17 * SIZE(X) LFD f18, 18 * SIZE(X) LFD f19, 19 * SIZE(X) fmadd f4, f20, f20, f4 fmadd f5, f21, f21, f5 fmadd f6, f22, f22, f6 fmadd f7, f23, f23, f7 LFD f20, 20 * SIZE(X) LFD f21, 21 * SIZE(X) LFD f22, 22 * SIZE(X) LFD f23, 23 * SIZE(X) fmadd f8, f24, f24, f8 fmadd f9, f25, f25, f9 fmadd f10, f26, f26, f10 fmadd f11, f27, f27, f11 LFD f24, 24 * SIZE(X) LFD f25, 25 * SIZE(X) LFD f26, 26 * SIZE(X) LFD f27, 27 * SIZE(X) fmadd f12, f28, f28, f12 fmadd f13, f29, f29, f13 fmadd f14, f30, f30, f14 fmadd f15, f31, f31, f15 LFD f28, 28 * SIZE(X) LFD f29, 29 * SIZE(X) LFD f30, 30 * SIZE(X) LFD f31, 31 * SIZE(X) #ifndef POWER6 L1_PREFETCH X, PREA #endif addi X, X, 16 * SIZE #ifdef POWER6 L1_PREFETCH X, PREA #endif bdnz LL(110) .align 4 LL(120): fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 fmadd f2, f18, f18, f2 fmadd f3, f19, f19, f3 fmadd f4, f20, f20, f4 fmadd f5, f21, f21, f5 fmadd f6, f22, f22, f6 fmadd f7, f23, f23, f7 fmadd f8, f24, f24, f8 fmadd f9, f25, f25, f9 fmadd f10, f26, f26, f10 fmadd f11, f27, f27, f11 fmadd f12, f28, f28, f12 fmadd f13, f29, f29, f13 fmadd f14, f30, f30, f14 fmadd f15, f31, f31, f15 addi X, X, 16 * SIZE .align 4 LL(150): andi. r0, N, 15 mtspr CTR, r0 beq- cr0, LL(170) .align 4 LL(160): LFD f16, 0 * SIZE(X) addi X, X, 1 * SIZE fmadd f0, f16, f16, f0 bdnz LL(160) .align 4 LL(170): fadd f0, f0, f1 fadd f2, f2, f3 fadd f4, f4, f5 fadd f6, f6, f7 fadd f8, f8, f9 fadd f10, f10, f11 fadd f12, f12, f13 fadd f14, f14, f15 fadd f0, f0, f2 fadd f4, f4, f6 fadd f8, f8, f10 fadd f12, f12, f14 fadd f0, f0, f4 fadd f8, f8, f12 fadd f0, f0, f8 fsqrts f1, f0 b LL(9999) .align 4 LL(1000): sub X, X, INCX srawi. 
r0, N, 4 mtspr CTR, r0 beq- cr0, LL(1150) LFDUX f16, X, INCX LFDUX f17, X, INCX LFDUX f18, X, INCX LFDUX f19, X, INCX LFDUX f20, X, INCX LFDUX f21, X, INCX LFDUX f22, X, INCX LFDUX f23, X, INCX LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX bdz LL(1120) .align 4 LL(1110): fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 fmadd f2, f18, f18, f2 fmadd f3, f19, f19, f3 LFDUX f16, X, INCX LFDUX f17, X, INCX LFDUX f18, X, INCX LFDUX f19, X, INCX fmadd f4, f20, f20, f4 fmadd f5, f21, f21, f5 fmadd f6, f22, f22, f6 fmadd f7, f23, f23, f7 LFDUX f20, X, INCX LFDUX f21, X, INCX LFDUX f22, X, INCX LFDUX f23, X, INCX fmadd f8, f24, f24, f8 fmadd f9, f25, f25, f9 fmadd f10, f26, f26, f10 fmadd f11, f27, f27, f11 LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX fmadd f12, f28, f28, f12 fmadd f13, f29, f29, f13 fmadd f14, f30, f30, f14 fmadd f15, f31, f31, f15 LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX bdnz LL(1110) .align 4 LL(1120): fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 fmadd f2, f18, f18, f2 fmadd f3, f19, f19, f3 fmadd f4, f20, f20, f4 fmadd f5, f21, f21, f5 fmadd f6, f22, f22, f6 fmadd f7, f23, f23, f7 fmadd f8, f24, f24, f8 fmadd f9, f25, f25, f9 fmadd f10, f26, f26, f10 fmadd f11, f27, f27, f11 fmadd f12, f28, f28, f12 fmadd f13, f29, f29, f13 fmadd f14, f30, f30, f14 fmadd f15, f31, f31, f15 .align 4 LL(1150): andi. r0, N, 15 mtspr CTR, r0 beq- cr0, LL(1170) .align 4 LL(1160): LFDUX f16, X, INCX fmadd f0, f16, f16, f0 bdnz LL(1160) .align 4 LL(1170): fadd f0, f0, f1 fadd f2, f2, f3 fadd f4, f4, f5 fadd f6, f6, f7 fadd f8, f8, f9 fadd f10, f10, f11 fadd f12, f12, f13 fadd f14, f14, f15 fadd f0, f0, f2 fadd f4, f4, f6 fadd f8, f8, f10 fadd f12, f12, f14 fadd f0, f0, f4 fadd f8, f8, f12 fadd f0, f0, f8 fsqrts f1, f0 .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/snrm2_hummer.S000066400000000000000000000273721313527062700203350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define INCX2 r6 #define X2 r7 #define C1 f1 #define C2 f0 #define C3 f2 #define C4 f3 #define C5 f4 #define C6 f5 #define C7 f6 #define C8 f7 #define A1 f8 #define A2 f9 #define A3 f10 #define A4 f11 #define A5 f12 #define A6 f13 #define A7 f14 #define A8 f15 #define A9 f16 #define A10 f17 #define A11 f18 #define A12 f19 #define A13 f20 #define A14 f21 #define A15 f22 #define A16 f23 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 stfpdux f19, SP, r10 stfpdux f20, SP, r10 stfpdux f21, SP, r10 stfpdux f22, SP, r10 stfpdux f23, SP, r10 li r10, 0 stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif lfpdx C1, SP, r10 # Zero clear slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX fpmr C2, C1 fpmr C3, C1 fpmr C4, C1 fpmr C5, C1 fpmr C6, C1 fpmr C7, C1 fpmr C8, C1 cmpwi cr0, N, 0 ble LL(99) cmpwi cr0, INCX, 0 ble LL(99) cmpwi cr0, INCX, SIZE bne LL(100) andi. r0, X, 2 * SIZE - 1 beq LL(05) LFD C1, 0(X) addi X, X, 1 * SIZE addi N, N, -1 cmpwi cr0, N, 0 fmul C1, C1, C1 ble LL(998) .align 4 LL(05): srawi. 
r0, N, 5 sub X, X, INCX2 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 LFPDUX A6, X, INCX2 LFPDUX A7, X, INCX2 LFPDUX A8, X, INCX2 LFPDUX A9, X, INCX2 LFPDUX A10, X, INCX2 LFPDUX A11, X, INCX2 LFPDUX A12, X, INCX2 LFPDUX A13, X, INCX2 LFPDUX A14, X, INCX2 LFPDUX A15, X, INCX2 LFPDUX A16, X, INCX2 bdz LL(13) .align 4 LL(12): fpmadd C1, A1, A1, C1 LFPDUX A1, X, INCX2 fpmadd C2, A2, A2, C2 LFPDUX A2, X, INCX2 fpmadd C3, A3, A3, C3 LFPDUX A3, X, INCX2 fpmadd C4, A4, A4, C4 LFPDUX A4, X, INCX2 fpmadd C5, A5, A5, C5 LFPDUX A5, X, INCX2 fpmadd C6, A6, A6, C6 LFPDUX A6, X, INCX2 fpmadd C7, A7, A7, C7 LFPDUX A7, X, INCX2 fpmadd C8, A8, A8, C8 LFPDUX A8, X, INCX2 fpmadd C1, A9, A9, C1 LFPDUX A9, X, INCX2 fpmadd C2, A10, A10, C2 LFPDUX A10, X, INCX2 fpmadd C3, A11, A11, C3 LFPDUX A11, X, INCX2 fpmadd C4, A12, A12, C4 LFPDUX A12, X, INCX2 fpmadd C5, A13, A13, C5 LFPDUX A13, X, INCX2 fpmadd C6, A14, A14, C6 LFPDUX A14, X, INCX2 fpmadd C7, A15, A15, C7 LFPDUX A15, X, INCX2 fpmadd C8, A16, A16, C8 LFPDUX A16, X, INCX2 bdnz LL(12) .align 4 LL(13): fpmadd C1, A1, A1, C1 fpmadd C2, A2, A2, C2 fpmadd C3, A3, A3, C3 fpmadd C4, A4, A4, C4 fpmadd C5, A5, A5, C5 fpmadd C6, A6, A6, C6 fpmadd C7, A7, A7, C7 fpmadd C8, A8, A8, C8 fpmadd C1, A9, A9, C1 fpmadd C2, A10, A10, C2 fpmadd C3, A11, A11, C3 fpmadd C4, A12, A12, C4 fpmadd C5, A13, A13, C5 fpmadd C6, A14, A14, C6 fpmadd C7, A15, A15, C7 fpmadd C8, A16, A16, C8 .align 4 LL(15): andi. r0, N, 31 beq LL(98) andi. r0, N, 16 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 LFPDUX A6, X, INCX2 LFPDUX A7, X, INCX2 LFPDUX A8, X, INCX2 fpmadd C1, A1, A1, C1 fpmadd C2, A2, A2, C2 fpmadd C3, A3, A3, C3 fpmadd C4, A4, A4, C4 fpmadd C5, A5, A5, C5 fpmadd C6, A6, A6, C6 fpmadd C7, A7, A7, C7 fpmadd C8, A8, A8, C8 .align 4 LL(16): andi. r0, N, 8 beq LL(17) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 fpmadd C1, A1, A1, C1 fpmadd C2, A2, A2, C2 fpmadd C3, A3, A3, C3 fpmadd C4, A4, A4, C4 .align 4 LL(17): andi. r0, N, 4 beq LL(18) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 fpmadd C1, A1, A1, C1 fpmadd C2, A2, A2, C2 .align 4 LL(18): andi. r0, N, 2 beq LL(19) LFPDUX A1, X, INCX2 fpmadd C3, A1, A1, C3 .align 4 LL(19): andi. 
r0, N, 1 beq LL(98) LFDX A1, X, INCX2 fmadd C4, A1, A1, C4 .align 4 LL(98): fpadd C1, C1, C5 lis r3, 0x3f00 fpadd C2, C2, C6 lis r4, 0x4040 fpadd C3, C3, C7 stw r3, 4(SP) fpadd C4, C4, C8 stw r4, 8(SP) fpadd C1, C1, C2 fpadd C3, C3, C4 lfs f10, 4(SP) fpadd C1, C1, C3 lfs f11, 4(SP) lfs f12, 8(SP) fsmtp C2, C1 fadd C1, C2, C1 fcmpu cr0, f10, C1 beq cr0, LL(99) #ifndef HUMMER_EMULATOR frsqrte f9, f1 li r10, 16 fmul f2, f1, f9 lfpdux f23, SP, r10 fmul f3, f9, f11 lfpdux f22, SP, r10 fnmsub f4, f2, f9, f12 lfpdux f21, SP, r10 fmul f9, f3, f4 lfpdux f20, SP, r10 fadd f13, f11, f11 lfpdux f19, SP, r10 fmul f12, f1, f9 lfpdux f18, SP, r10 fmul f11, f12, f11 lfpdux f17, SP, r10 fnmsub f1, f12, f9, f13 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 fmadd f1, f11, f1, f12 blr #else fsqrt f1, f1 li r10, 16 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr #endif .align 4 LL(99): li r10, 16 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr .align 4 LL(100): sub X2, X, INCX sub X, X, INCX2 srawi. r0, N, 4 mtspr CTR, r0 beq- LL(115) LFDUX A1, X, INCX2 LFDUX A2, X2, INCX2 LFDUX A3, X, INCX2 LFDUX A4, X2, INCX2 LFDUX A5, X, INCX2 LFDUX A6, X2, INCX2 LFDUX A7, X, INCX2 LFDUX A8, X2, INCX2 LFDUX A9, X, INCX2 LFDUX A10, X2, INCX2 LFDUX A11, X, INCX2 LFDUX A12, X2, INCX2 LFDUX A13, X, INCX2 LFDUX A14, X2, INCX2 LFDUX A15, X, INCX2 LFDUX A16, X2, INCX2 bdz LL(113) .align 4 LL(112): fmadd C1, A1, A1, C1 LFDUX A1, X, INCX2 fmadd C2, A2, A2, C2 LFDUX A2, X2, INCX2 fmadd C3, A3, A3, C3 LFDUX A3, X, INCX2 fmadd C4, A4, A4, C4 LFDUX A4, X2, INCX2 fmadd C5, A5, A5, C5 LFDUX A5, X, INCX2 fmadd C6, A6, A6, C6 LFDUX A6, X2, INCX2 fmadd C7, A7, A7, C7 LFDUX A7, X, INCX2 fmadd C8, A8, A8, C8 LFDUX A8, X2, INCX2 fmadd C1, A9, A9, C1 LFDUX A9, X, INCX2 fmadd C2, A10, A10, C2 LFDUX A10, X2, INCX2 fmadd C3, A11, A11, C3 LFDUX A11, X, INCX2 fmadd C4, A12, A12, C4 LFDUX A12, X2, INCX2 fmadd C5, A13, A13, C5 LFDUX A13, X, INCX2 fmadd C6, A14, A14, C6 LFDUX A14, X2, INCX2 fmadd C7, A15, A15, C7 LFDUX A15, X, INCX2 fmadd C8, A16, A16, C8 LFDUX A16, X2, INCX2 bdnz LL(112) .align 4 LL(113): fmadd C1, A1, A1, C1 fmadd C2, A2, A2, C2 fmadd C3, A3, A3, C3 fmadd C4, A4, A4, C4 fmadd C5, A5, A5, C5 fmadd C6, A6, A6, C6 fmadd C7, A7, A7, C7 fmadd C8, A8, A8, C8 fmadd C1, A9, A9, C1 fmadd C2, A10, A10, C2 fmadd C3, A11, A11, C3 fmadd C4, A12, A12, C4 fmadd C5, A13, A13, C5 fmadd C6, A14, A14, C6 fmadd C7, A15, A15, C7 fmadd C8, A16, A16, C8 .align 4 LL(115): andi. r0, N, 15 beq LL(998) andi. r0, N, 8 beq LL(116) LFDUX A1, X, INCX2 LFDUX A2, X2, INCX2 LFDUX A3, X, INCX2 LFDUX A4, X2, INCX2 LFDUX A5, X, INCX2 LFDUX A6, X2, INCX2 LFDUX A7, X, INCX2 LFDUX A8, X2, INCX2 fmadd C1, A1, A1, C1 fmadd C2, A2, A2, C2 fmadd C3, A3, A3, C3 fmadd C4, A4, A4, C4 fmadd C5, A5, A5, C5 fmadd C6, A6, A6, C6 fmadd C7, A7, A7, C7 fmadd C8, A8, A8, C8 .align 4 LL(116): andi. r0, N, 4 beq LL(117) LFDUX A1, X, INCX2 LFDUX A2, X2, INCX2 LFDUX A3, X, INCX2 LFDUX A4, X2, INCX2 fmadd C1, A1, A1, C1 fmadd C2, A2, A2, C2 fmadd C3, A3, A3, C3 fmadd C4, A4, A4, C4 .align 4 LL(117): andi. 
r0, N, 2 beq LL(118) LFDUX A1, X, INCX2 LFDUX A2, X2, INCX2 fmadd C1, A1, A1, C1 fmadd C2, A2, A2, C2 .align 4 LL(118): andi. r0, N, 1 beq LL(998) LFDX A1, X, INCX2 fmadd C1, A1, A1, C1 .align 4 LL(998): fadd C1, C1, C5 lis r3, 0x3f00 fadd C2, C2, C6 lis r4, 0x4040 fadd C3, C3, C7 stw r3, 4(SP) fadd C4, C4, C8 stw r4, 8(SP) fadd C1, C1, C2 lfs f10, 0(SP) fadd C3, C3, C4 lfs f11, 4(SP) lfs f12, 8(SP) fadd C1, C1, C3 fcmpu cr0, f10, C1 beq cr0, LL(999) #ifndef HUMMER_EMULATOR frsqrte f9, f1 li r10, 16 fmul f2, f1, f9 lfpdux f23, SP, r10 fmul f3, f9, f11 lfpdux f22, SP, r10 fnmsub f4, f2, f9, f12 lfpdux f21, SP, r10 fmul f9, f3, f4 lfpdux f20, SP, r10 fadd f13, f11, f11 lfpdux f19, SP, r10 fmul f12, f1, f9 lfpdux f18, SP, r10 fmul f11, f12, f11 lfpdux f17, SP, r10 fnmsub f1, f12, f9, f13 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 fmadd f1, f11, f1, f12 blr #else fsqrt f1, f1 li r10, 16 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr #endif .align 4 LL(999): li r10, 16 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/snrm2_ppc440.S000066400000000000000000000147611313527062700200500ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PRE r8 #define FZERO 144(SP) #define FONE 148(SP) #define C1 152(SP) #define C2 156(SP) #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r10, 0 lis r11, 0x3f80 lis r6, 0x3f00 lis r7, 0x4040 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r10, FZERO stw r11, FONE stw r6, C1 stw r7, C2 lfs f1, FZERO #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, BASE_SHIFT li PRE, 3 * 16 * SIZE sub X, X, INCX cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 ble- LL(999) fmr f0, f1 fmr f2, f1 fmr f3, f1 fmr f4, f1 fmr f5, f1 fmr f6, f1 fmr f7, f1 fmr f8, f1 fmr f9, f1 fmr f10, f1 fmr f11, f1 fmr f12, f1 fmr f13, f1 fmr f14, f1 fmr f15, f1 srawi. r0, N, 4 mtspr CTR, r0 beq- cr0, LL(50) LFDUX f16, X, INCX LFDUX f17, X, INCX LFDUX f18, X, INCX LFDUX f19, X, INCX LFDUX f20, X, INCX LFDUX f21, X, INCX LFDUX f22, X, INCX LFDUX f23, X, INCX LFDUX f24, X, INCX LFDUX f25, X, INCX LFDUX f26, X, INCX LFDUX f27, X, INCX LFDUX f28, X, INCX LFDUX f29, X, INCX LFDUX f30, X, INCX LFDUX f31, X, INCX bdz LL(20) .align 4 LL(10): fmadd f0, f16, f16, f0 LFDUX f16, X, INCX fmadd f1, f17, f17, f1 LFDUX f17, X, INCX fmadd f2, f18, f18, f2 LFDUX f18, X, INCX fmadd f3, f19, f19, f3 LFDUX f19, X, INCX #ifdef PPCG4 dcbt X, PRE #endif fmadd f4, f20, f20, f4 LFDUX f20, X, INCX fmadd f5, f21, f21, f5 LFDUX f21, X, INCX fmadd f6, f22, f22, f6 LFDUX f22, X, INCX fmadd f7, f23, f23, f7 LFDUX f23, X, INCX fmadd f8, f24, f24, f8 LFDUX f24, X, INCX fmadd f9, f25, f25, f9 LFDUX f25, X, INCX fmadd f10, f26, f26, f10 LFDUX f26, X, INCX fmadd f11, f27, f27, f11 LFDUX f27, X, INCX #ifdef PPCG4 dcbt X, PRE #endif fmadd f12, f28, f28, f12 LFDUX f28, X, INCX fmadd f13, f29, f29, f13 LFDUX f29, X, INCX fmadd f14, f30, f30, f14 LFDUX f30, X, INCX fmadd f15, f31, f31, f15 LFDUX f31, X, INCX bdnz LL(10) .align 4 LL(20): fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 fmadd f2, f18, f18, f2 fmadd f3, f19, f19, f3 fmadd f4, f20, f20, f4 fmadd f5, f21, f21, f5 fmadd f6, f22, f22, f6 fmadd f7, f23, f23, f7 fmadd f8, f24, f24, f8 fmadd f9, f25, f25, f9 fmadd f10, f26, f26, f10 fmadd f11, f27, f27, f11 fmadd f12, f28, f28, f12 fmadd f13, f29, f29, f13 fmadd f14, f30, f30, f14 fmadd f15, f31, f31, f15 .align 4 LL(50): andi. 
r0, N, 15 mtspr CTR, r0 beq- cr0, LL(70) .align 4 LL(60): LFDUX f16, X, INCX fmadd f0, f16, f16, f0 bdnz LL(60) .align 4 LL(70): fadd f0, f0, f1 fadd f2, f2, f3 fadd f4, f4, f5 fadd f6, f6, f7 fadd f8, f8, f9 fadd f10, f10, f11 fadd f12, f12, f13 fadd f14, f14, f15 fadd f0, f0, f2 fadd f4, f4, f6 fadd f8, f8, f10 fadd f12, f12, f14 fadd f0, f0, f4 fadd f8, f8, f12 fadd f1, f0, f8 lfs f4, FZERO fcmpu cr0, f1, f4 beq cr0, LL(999) frsqrte f0, f1 lfs f8, C1 lfs f9, C2 fmul f2, f1, f0 fadd f7, f8, f8 fmul f3, f0, f8 fnmsub f4, f2, f0, f9 fmul f0, f3, f4 fmul f5, f1, f0 fmul f2, f5, f8 fnmsub f3, f5, f0, f7 fmadd f1, f2, f3, f5 .align 4 LL(999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/srot.c000066400000000000000000000065521313527062700167230ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
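// Sketch of the frsqrte + Newton-Raphson epilogue the snrm2 kernels
// above (snrm2_hummer.S and snrm2_ppc440.S) use to turn the accumulated
// sum of squares s into sqrt(s) without an fsqrt instruction. This is a
// hedged reconstruction of the register math around the 0.5 and 3.0
// constants; nrm2_finish_sketch and the y0 parameter, which stands for
// the hardware frsqrte estimate, are illustrative names only.
//
//   y1 = 0.5 * y0 * (3 - s*y0^2)              one Newton step toward 1/sqrt(s)
//   r  = s*y1 + 0.5*(s*y1)*(1 - (s*y1)*y1)    second step, folded into s*(1/sqrt(s))
//
// The compare against zero just before this block keeps s == 0 out of
// the reciprocal estimate.
static double nrm2_finish_sketch(double s, double y0)
{
    double y1 = 0.5 * y0 * (3.0 - s * y0 * y0);  /* refine the estimate */
    double t  = s * y1;                          /* ~ sqrt(s)           */
    return t + 0.5 * t * (1.0 - t * y1);         /* one more refinement */
}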
*****************************************************************************/ /************************************************************************************** * 2016/03/26 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #pragma GCC optimize "O1" #if defined(POWER8) #include "srot_microk_power8.c" #endif #ifndef HAVE_KERNEL_16 static void srot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) { BLASLONG i=0; FLOAT f0, f1, f2, f3; FLOAT x00, x01, x02, x03; FLOAT g0, g1, g2, g3; FLOAT y00, y01, y02, y03; FLOAT *x1=x; FLOAT *y1=y; while ( i 0 ) { srot_kernel_16(n1, x1, y1, c, s); i=n1; } while(i < n) { temp = c*x[i] + s*y[i] ; y[i] = c*y[i] - s*x[i] ; x[i] = temp ; i++ ; } } else { while(i < n) { temp = c*x[ix] + s*y[iy] ; y[iy] = c*y[iy] - s*x[ix] ; x[ix] = temp ; ix += inc_x ; iy += inc_y ; i++ ; } } return(0); } OpenBLAS-0.2.20/kernel/power/srot_microk_power8.c000066400000000000000000000153551313527062700215740ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
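// Minimal sketch of the generic srot fallback in srot.c above (the
// !HAVE_KERNEL_16 path): the same Givens rotation the strided loop
// applies with ix/iy, here over a contiguous block whose length is a
// multiple of 16. srot_kernel_16_sketch, float and long are
// illustrative stand-ins for the real kernel name, FLOAT and BLASLONG.
static void srot_kernel_16_sketch(long n, float *x, float *y, float c, float s)
{
    for (long i = 0; i < n; i++) {
        float temp = c * x[i] + s * y[i];   /* rotated x */
        y[i] = c * y[i] - s * x[i];         /* rotated y */
        x[i] = temp;
    }
}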
*****************************************************************************/ /************************************************************************************** * 2016/03/27 Werner Saar (wernsaar@googlemail.com) * * I don't use fused multiply-add ( precision problems with lapack ) * * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_16 1 static void srot_kernel_16 (long n, float *x, float *y, float c, float s) { __vector float t0; __vector float t1; __vector float t2; __vector float t3; __vector float t4; __vector float t5; __vector float t6; __vector float t7; __asm__ ( "xscvdpspn 36, %x13 \n\t" // load c to all words "xxspltw 36, 36, 0 \n\t" "xscvdpspn 37, %x14 \n\t" // load s to all words "xxspltw 37, 37, 0 \n\t" "lxvd2x 32, 0, %3 \n\t" // load x "lxvd2x 33, %15, %3 \n\t" "lxvd2x 34, %16, %3 \n\t" "lxvd2x 35, %17, %3 \n\t" "lxvd2x 48, 0, %4 \n\t" // load y "lxvd2x 49, %15, %4 \n\t" "lxvd2x 50, %16, %4 \n\t" "lxvd2x 51, %17, %4 \n\t" "addi %3, %3, 64 \n\t" "addi %4, %4, 64 \n\t" "addic. %2, %2, -16 \n\t" "ble 2f \n\t" ".p2align 5 \n" "1: \n\t" "xvmulsp 40, 32, 36 \n\t" // c * x "xvmulsp 41, 33, 36 \n\t" "xvmulsp 42, 34, 36 \n\t" "xvmulsp 43, 35, 36 \n\t" "xvmulsp %x5, 48, 36 \n\t" // c * y "xvmulsp %x6, 49, 36 \n\t" "xvmulsp %x7, 50, 36 \n\t" "xvmulsp %x8, 51, 36 \n\t" "xvmulsp 44, 32, 37 \n\t" // s * x "xvmulsp 45, 33, 37 \n\t" "lxvd2x 32, 0, %3 \n\t" // load x "lxvd2x 33, %15, %3 \n\t" "xvmulsp 46, 34, 37 \n\t" "xvmulsp 47, 35, 37 \n\t" "lxvd2x 34, %16, %3 \n\t" "lxvd2x 35, %17, %3 \n\t" "xvmulsp %x9, 48, 37 \n\t" // s * y "xvmulsp %x10, 49, 37 \n\t" "lxvd2x 48, 0, %4 \n\t" // load y "lxvd2x 49, %15, %4 \n\t" "xvmulsp %x11, 50, 37 \n\t" "xvmulsp %x12, 51, 37 \n\t" "lxvd2x 50, %16, %4 \n\t" "lxvd2x 51, %17, %4 \n\t" "xvaddsp 40, 40, %x9 \n\t" // c * x + s * y "xvaddsp 41, 41, %x10 \n\t" // c * x + s * y "addi %3, %3, -64 \n\t" "addi %4, %4, -64 \n\t" "xvaddsp 42, 42, %x11 \n\t" // c * x + s * y "xvaddsp 43, 43, %x12 \n\t" // c * x + s * y "xvsubsp %x5, %x5, 44 \n\t" // c * y - s * x "xvsubsp %x6, %x6, 45 \n\t" // c * y - s * x "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x "stxvd2x 40, 0, %3 \n\t" // store x "stxvd2x 41, %15, %3 \n\t" "stxvd2x 42, %16, %3 \n\t" "stxvd2x 43, %17, %3 \n\t" "stxvd2x %x5, 0, %4 \n\t" // store y "stxvd2x %x6, %15, %4 \n\t" "stxvd2x %x7, %16, %4 \n\t" "stxvd2x %x8, %17, %4 \n\t" "addi %3, %3, 128 \n\t" "addi %4, %4, 128 \n\t" "addic. 
%2, %2, -16 \n\t" "bgt 1b \n" "2: \n\t" "xvmulsp 40, 32, 36 \n\t" // c * x "xvmulsp 41, 33, 36 \n\t" "xvmulsp 42, 34, 36 \n\t" "xvmulsp 43, 35, 36 \n\t" "xvmulsp %x5, 48, 36 \n\t" // c * y "xvmulsp %x6, 49, 36 \n\t" "xvmulsp %x7, 50, 36 \n\t" "xvmulsp %x8, 51, 36 \n\t" "xvmulsp 44, 32, 37 \n\t" // s * x "xvmulsp 45, 33, 37 \n\t" "xvmulsp 46, 34, 37 \n\t" "xvmulsp 47, 35, 37 \n\t" "xvmulsp %x9, 48, 37 \n\t" // s * y "xvmulsp %x10, 49, 37 \n\t" "xvmulsp %x11, 50, 37 \n\t" "xvmulsp %x12, 51, 37 \n\t" "addi %3, %3, -64 \n\t" "addi %4, %4, -64 \n\t" "xvaddsp 40, 40, %x9 \n\t" // c * x + s * y "xvaddsp 41, 41, %x10 \n\t" // c * x + s * y "xvaddsp 42, 42, %x11 \n\t" // c * x + s * y "xvaddsp 43, 43, %x12 \n\t" // c * x + s * y "xvsubsp %x5, %x5, 44 \n\t" // c * y - s * x "xvsubsp %x6, %x6, 45 \n\t" // c * y - s * x "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x "stxvd2x 40, 0, %3 \n\t" // store x "stxvd2x 41, %15, %3 \n\t" "stxvd2x 42, %16, %3 \n\t" "stxvd2x 43, %17, %3 \n\t" "stxvd2x %x5, 0, %4 \n\t" // store y "stxvd2x %x6, %15, %4 \n\t" "stxvd2x %x7, %16, %4 \n\t" "stxvd2x %x8, %17, %4 \n" "#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n" "#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12" : "+m" (*x), "+m" (*y), "+r" (n), // 2 "+b" (x), // 3 "+b" (y), // 4 "=wa" (t0), // 5 "=wa" (t1), // 6 "=wa" (t2), // 7 "=wa" (t3), // 8 "=wa" (t4), // 9 "=wa" (t5), // 10 "=wa" (t6), // 11 "=wa" (t7) // 12 : "f" (c), // 13 "f" (s), // 14 "b" (16), // 15 "b" (32), // 16 "b" (48) // 17 : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48","vs49","vs50","vs51" ); } OpenBLAS-0.2.20/kernel/power/sscal.c000066400000000000000000000071761313527062700170440ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
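// Intrinsics sketch of one 4-float slice of the srot_microk_power8.c
// inline assembly above, assuming GCC/clang <altivec.h> on a
// VSX-capable POWER target (an illustration, not the routine OpenBLAS
// builds; srot_vec4_sketch is an illustrative name). Like the assembly,
// it keeps the multiplies and the add/sub separate instead of using
// fused multiply-add, matching the author's note about LAPACK precision
// expectations.
#include <altivec.h>

static void srot_vec4_sketch(float *x, float *y, float c, float s)
{
    __vector float vc = vec_splats(c);     /* broadcast c to all lanes */
    __vector float vs = vec_splats(s);     /* broadcast s to all lanes */
    __vector float vx = vec_xl(0, x);      /* load 4 floats of x       */
    __vector float vy = vec_xl(0, y);      /* load 4 floats of y       */

    __vector float nx = vec_add(vec_mul(vc, vx), vec_mul(vs, vy)); /* c*x + s*y */
    __vector float ny = vec_sub(vec_mul(vc, vy), vec_mul(vs, vx)); /* c*y - s*x */

    vec_xst(nx, 0, x);                     /* store rotated x */
    vec_xst(ny, 0, y);                     /* store rotated y */
}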
*****************************************************************************/ /************************************************************************************** * 2016/03/27 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #if defined(POWER8) #include "sscal_microk_power8.c" #endif #if !defined(HAVE_KERNEL_16) static void sscal_kernel_16 (BLASLONG n, FLOAT *x, FLOAT alpha) { BLASLONG i; for( i=0; i 0 ) { sscal_kernel_16_zero(n1, x); j=n1; } while(j < n) { x[j]=0.0; j++; } } else { BLASLONG n1 = n & -32; if ( n1 > 0 ) { sscal_kernel_16(n1, x, da); j=n1; } while(j < n) { x[j] = da * x[j] ; j++; } } } else { if ( da == 0.0 ) { while(j < n) { x[i]=0.0; i += inc_x ; j++; } } else { while(j < n) { x[i] = da * x[i] ; i += inc_x ; j++; } } } return 0; } OpenBLAS-0.2.20/kernel/power/sscal_microk_power8.c000066400000000000000000000125731313527062700217110ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/27 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_16 1 static void sscal_kernel_16 (long n, float *x, float alpha) { __asm__ ( "dcbt 0, %2 \n\t" "xscvdpspn %x3, %x3 \n\t" "xxspltw %x3, %x3, 0 \n\t" "lxvd2x 32, 0, %2 \n\t" "lxvd2x 33, %4, %2 \n\t" "lxvd2x 34, %5, %2 \n\t" "lxvd2x 35, %6, %2 \n\t" "lxvd2x 36, %7, %2 \n\t" "lxvd2x 37, %8, %2 \n\t" "lxvd2x 38, %9, %2 \n\t" "lxvd2x 39, %10, %2 \n\t" "addi %2, %2, 128 \n\t" "addic. 
%1, %1, -32 \n\t" "ble 2f \n\t" ".p2align 5 \n" "1: \n\t" "xvmulsp 40, 32, %x3 \n\t" "xvmulsp 41, 33, %x3 \n\t" "lxvd2x 32, 0, %2 \n\t" "lxvd2x 33, %4, %2 \n\t" "xvmulsp 42, 34, %x3 \n\t" "xvmulsp 43, 35, %x3 \n\t" "lxvd2x 34, %5, %2 \n\t" "lxvd2x 35, %6, %2 \n\t" "xvmulsp 44, 36, %x3 \n\t" "xvmulsp 45, 37, %x3 \n\t" "lxvd2x 36, %7, %2 \n\t" "lxvd2x 37, %8, %2 \n\t" "xvmulsp 46, 38, %x3 \n\t" "xvmulsp 47, 39, %x3 \n\t" "lxvd2x 38, %9, %2 \n\t" "lxvd2x 39, %10, %2 \n\t" "addi %2, %2, -128 \n\t" "stxvd2x 40, 0, %2 \n\t" "stxvd2x 41, %4, %2 \n\t" "stxvd2x 42, %5, %2 \n\t" "stxvd2x 43, %6, %2 \n\t" "stxvd2x 44, %7, %2 \n\t" "stxvd2x 45, %8, %2 \n\t" "stxvd2x 46, %9, %2 \n\t" "stxvd2x 47, %10, %2 \n\t" "addi %2, %2, 256 \n\t" "addic. %1, %1, -32 \n\t" "bgt 1b \n" "2: \n\t" "xvmulsp 40, 32, %x3 \n\t" "xvmulsp 41, 33, %x3 \n\t" "xvmulsp 42, 34, %x3 \n\t" "xvmulsp 43, 35, %x3 \n\t" "addi %2, %2, -128 \n\t" "xvmulsp 44, 36, %x3 \n\t" "xvmulsp 45, 37, %x3 \n\t" "xvmulsp 46, 38, %x3 \n\t" "xvmulsp 47, 39, %x3 \n\t" "stxvd2x 40, 0, %2 \n\t" "stxvd2x 41, %4, %2 \n\t" "stxvd2x 42, %5, %2 \n\t" "stxvd2x 43, %6, %2 \n\t" "stxvd2x 44, %7, %2 \n\t" "stxvd2x 45, %8, %2 \n\t" "stxvd2x 46, %9, %2 \n\t" "stxvd2x 47, %10, %2 \n" "#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" : "+m" (*x), "+r" (n), // 1 "+b" (x), // 2 "+f" (alpha) // 3 : "b" (16), // 4 "b" (32), // 5 "b" (48), // 6 "b" (64), // 7 "b" (80), // 8 "b" (96), // 9 "b" (112) // 10 : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" ); } static void sscal_kernel_16_zero (long n, float *x) { __vector float t0; __asm__ ( "xxlxor %x3, %x3, %x3 \n\t" ".p2align 5 \n" "1: \n\t" "stxvd2x %x3, 0, %2 \n\t" "stxvd2x %x3, %4, %2 \n\t" "stxvd2x %x3, %5, %2 \n\t" "stxvd2x %x3, %6, %2 \n\t" "stxvd2x %x3, %7, %2 \n\t" "stxvd2x %x3, %8, %2 \n\t" "stxvd2x %x3, %9, %2 \n\t" "stxvd2x %x3, %10, %2 \n\t" "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" "bgt 1b \n" "#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" : "=m" (*x), "+r" (n), // 1 "+b" (x), // 2 "=wa" (t0) // 3 : "b" (16), // 4 "b" (32), // 5 "b" (48), // 6 "b" (64), // 7 "b" (80), // 8 "b" (96), // 9 "b" (112) // 10 : "cr0" ); } OpenBLAS-0.2.20/kernel/power/sswap.c000066400000000000000000000064621313527062700170710ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
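// Minimal sketch of the scalar tail in sscal.c above, after the
// 32-element vector blocks (n & -32) have been handled: alpha == 0 is
// treated as a plain store of zeros, everything else as an in-place
// multiply, and inc_x is the element stride. sscal_tail_sketch, float
// and long are illustrative stand-ins for FLOAT and BLASLONG.
static void sscal_tail_sketch(long n, float alpha, float *x, long inc_x)
{
    if (alpha == 0.0f) {
        for (long j = 0, i = 0; j < n; j++, i += inc_x)
            x[i] = 0.0f;                    /* zero fill */
    } else {
        for (long j = 0, i = 0; j < n; j++, i += inc_x)
            x[i] = alpha * x[i];            /* in-place scale */
    }
}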
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/25 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #if defined(POWER8) #include "sswap_microk_power8.c" #endif #ifndef HAVE_KERNEL_32 static void sswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { BLASLONG i=0; FLOAT f0, f1, f2, f3, f4, f5, f6, f7; FLOAT g0, g1, g2, g3, g4, g5, g6, g7; FLOAT *x1=x; FLOAT *y1=y; while ( i 0 ) { sswap_kernel_32(n1, x, y); i=n1; } while(i < n) { temp = y[i]; y[i] = x[i] ; x[i] = temp; i++ ; } } else { while(i < n) { temp = y[iy]; y[iy] = x[ix] ; x[ix] = temp; ix += inc_x ; iy += inc_y ; i++ ; } } return(0); } OpenBLAS-0.2.20/kernel/power/sswap_microk_power8.c000066400000000000000000000073371313527062700217430ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
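// Minimal sketch of the sswap tail loop in sswap.c above: once the
// 32-element vector kernel has covered n & -32 entries, the remaining
// elements of x and y are exchanged one at a time (the strided branch
// does the same while stepping ix/iy by inc_x/inc_y). sswap_tail_sketch,
// float and long are illustrative stand-ins for FLOAT and BLASLONG.
static void sswap_tail_sketch(long n, float *x, float *y)
{
    for (long i = 0; i < n; i++) {
        float temp = y[i];
        y[i] = x[i];
        x[i] = temp;
    }
}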
*****************************************************************************/ /************************************************************************************** * 2016/03/25 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_32 1 static void sswap_kernel_32 (long n, float *x, float *y) { __asm__ ( ".p2align 5 \n" "1: \n\t" "lxvd2x 32, 0, %4 \n\t" "lxvd2x 33, %5, %4 \n\t" "lxvd2x 34, %6, %4 \n\t" "lxvd2x 35, %7, %4 \n\t" "lxvd2x 36, %8, %4 \n\t" "lxvd2x 37, %9, %4 \n\t" "lxvd2x 38, %10, %4 \n\t" "lxvd2x 39, %11, %4 \n\t" "lxvd2x 40, 0, %3 \n\t" "lxvd2x 41, %5, %3 \n\t" "lxvd2x 42, %6, %3 \n\t" "lxvd2x 43, %7, %3 \n\t" "lxvd2x 44, %8, %3 \n\t" "lxvd2x 45, %9, %3 \n\t" "lxvd2x 46, %10, %3 \n\t" "lxvd2x 47, %11, %3 \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" "stxvd2x 34, %6, %3 \n\t" "stxvd2x 35, %7, %3 \n\t" "stxvd2x 36, %8, %3 \n\t" "stxvd2x 37, %9, %3 \n\t" "stxvd2x 38, %10, %3 \n\t" "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" "stxvd2x 40, 0, %4 \n\t" "stxvd2x 41, %5, %4 \n\t" "stxvd2x 42, %6, %4 \n\t" "stxvd2x 43, %7, %4 \n\t" "stxvd2x 44, %8, %4 \n\t" "stxvd2x 45, %9, %4 \n\t" "stxvd2x 46, %10, %4 \n\t" "stxvd2x 47, %11, %4 \n\t" "addi %4, %4, 128 \n\t" "addic. %2, %2, -32 \n\t" "bgt 1b \n" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : "+m" (*x), "+m" (*y), "+r" (n), // 2 "+b" (x), // 3 "+b" (y) // 4 : "b" (16), // 5 "b" (32), // 6 "b" (48), // 7 "b" (64), // 8 "b" (80), // 9 "b" (96), // 10 "b" (112) // 11 : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" ); } OpenBLAS-0.2.20/kernel/power/staticbuffer.S000066400000000000000000000053531313527062700203730ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef ALLOC_STATIC .align 8 .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 16384 #endif OpenBLAS-0.2.20/kernel/power/strmm_kernel_16x8_power8.S000066400000000000000000000220571313527062700225060ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/04/02 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "def_vsx.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 340 #define ALPHA_SP 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA_SP 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define alpha_r vs30 #define alpha_vr vs31 #define o0 0 #define TBUFFER r13 #define o12 r14 #define o4 r15 #define K1 r16 #define o8 r17 #define L r18 #define T1 r19 #define KK r20 #define KKK r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define CO r26 #define o16 r27 #define o32 r28 #define o48 r29 #define PRE r30 #define T2 r31 #include "strmm_macros_16x8_power8.S" #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) std r14, 280(SP) std r13, 288(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) stw r18, 196(SP) stw r17, 200(SP) stw r16, 204(SP) stw r15, 208(SP) stw r14, 212(SP) stw r13, 216(SP) #endif // stfd f1, ALPHA_SP // stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef 
DOUBLE lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #endif mr KK, OFFSET #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, KK #endif cmpwi cr0, M, 0 ble L999_H1 cmpwi cr0, N, 0 ble L999_H1 cmpwi cr0, K, 0 ble L999_H1 li PRE, 256 li o4 , 4 li o8 , 8 li o12, 12 li o16, 16 li o32, 32 li o48, 48 addi TBUFFER, SP, 320 addi T1, SP, 300 stxsspx f1, o0 , T1 stxsspx f1, o4 , T1 stxsspx f1, o8 , T1 stxsspx f1, o12 , T1 lxsspx alpha_r, o0, T1 lxvw4x alpha_vr, o0, T1 #include "strmm_logic_16x8_power8.S" L999: addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) ld r13, 288(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) lwz r18, 196(SP) lwz r17, 200(SP) lwz r16, 204(SP) lwz r15, 208(SP) lwz r14, 212(SP) lwz r13, 216(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/strmm_logic_16x8_power8.S000066400000000000000000001454441313527062700223310ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
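// Sketch of the TRMM bookkeeping set up by the kernel prologue above
// (KK = OFFSET, negated when LEFT is not defined) and consumed by the
// STRMM_L8x16 blocks of strmm_logic_16x8_power8.S that follows. This is
// a hedged reconstruction from the register comments; trmm_tile_k_extent
// is an illustrative name, mr and nr stand for the 16x8 tile sizes, and
// the returned offsets are in elements of the packed A and B panels.
static long trmm_tile_k_extent(long k, long kk, int left, int transa,
                               long mr /* 16 */, long nr /* 8 */,
                               long *a_off, long *b_off)
{
    if ((left && transa) || (!left && !transa)) {
        *a_off = 0;                 /* start at the panel origin         */
        *b_off = 0;
    } else {
        *a_off = kk * mr;           /* skip kk packed columns of A ...   */
        *b_off = kk * nr;           /* ... and kk packed rows of B       */
    }

    if ((left && !transa) || (!left && transa))
        return k - kk;              /* run the remaining part of K       */
    return kk + (left ? mr : nr);   /* run only up to the diagonal block */
}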
*****************************************************************************/ /************************************************************************************** * 2016/04/02 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ srawi. J, N, 3 ble STRMM_L8_END STRMM_L8_BEGIN: mr CO, C mr AO, A slwi T1, LDC , 3 add C, C, T1 #if defined(LEFT) mr KK, OFFSET // OFFSET -> KK #endif srawi. I, M, 4 ble STRMM_L8x16_END STRMM_L8x16_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 5 // Number of values in B shifted slwi T2, KK, 6 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble STRMM_L8x16_SUB0 cmpwi cr0, L, 1 ble STRMM_L8x16_SUB4 STRMM_L8x16_LOOP_START: dcbt AO, PRE LOAD8x16_1 dcbt AO, PRE KERNEL8x16_I1 dcbt AO, PRE KERNEL8x16_2 dcbt AO, PRE KERNEL8x16_1 dcbt AO, PRE KERNEL8x16_2 dcbt AO, PRE KERNEL8x16_1 dcbt AO, PRE KERNEL8x16_2 dcbt AO, PRE KERNEL8x16_1 dcbt AO, PRE KERNEL8x16_2 addic. L, L, -2 ble STRMM_L8x16_LOOP_END .align 5 STRMM_L8x16_LOOP: dcbt AO, PRE KERNEL8x16_1 dcbt AO, PRE KERNEL8x16_2 dcbt AO, PRE KERNEL8x16_1 dcbt AO, PRE KERNEL8x16_2 dcbt AO, PRE KERNEL8x16_1 dcbt AO, PRE KERNEL8x16_2 dcbt AO, PRE KERNEL8x16_1 dcbt AO, PRE KERNEL8x16_2 addic. L, L, -1 bgt STRMM_L8x16_LOOP STRMM_L8x16_LOOP_END: dcbt AO, PRE KERNEL8x16_1 dcbt AO, PRE KERNEL8x16_2 dcbt AO, PRE KERNEL8x16_1 dcbt AO, PRE KERNEL8x16_2 dcbt AO, PRE KERNEL8x16_1 dcbt AO, PRE KERNEL8x16_2 dcbt AO, PRE KERNEL8x16_1 KERNEL8x16_E2 b STRMM_L8x16_SUB1 STRMM_L8x16_SUB4: dcbt AO, PRE KERNEL8x16_SUBI1 dcbt AO, PRE KERNEL8x16_SUB1 dcbt AO, PRE KERNEL8x16_SUB1 dcbt AO, PRE KERNEL8x16_SUB1 KERNEL8x16_SUB1 KERNEL8x16_SUB1 KERNEL8x16_SUB1 KERNEL8x16_SUB1 b STRMM_L8x16_SUB1 STRMM_L8x16_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL8x16_SUBI1 addic. L, L, -1 ble STRMM_L8x16_SAVE b STRMM_L8x16_SUB2 STRMM_L8x16_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble STRMM_L8x16_SAVE STRMM_L8x16_SUB2: KERNEL8x16_SUB1 addic. L, L, -1 bgt STRMM_L8x16_SUB2 STRMM_L8x16_SAVE: SAVE8x16 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 16 // KK += Number of values in A #endif addic. I, I, -1 bgt STRMM_L8x16_BEGIN STRMM_L8x16_END: STRMM_L8x8_BEGIN: andi. T2, M, 15 ble STRMM_L8x1_END andi. 
T1, M, 8 ble STRMM_L8x8_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 5 // Number of values in B shifted slwi T2, KK, 5 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble STRMM_L8x8_SUB0 cmpwi cr0, L, 1 ble STRMM_L8x8_SUB4 STRMM_L8x8_LOOP_START: LOAD8x8_1 KERNEL8x8_I1 KERNEL8x8_2 KERNEL8x8_1 KERNEL8x8_2 KERNEL8x8_1 KERNEL8x8_2 KERNEL8x8_1 KERNEL8x8_2 addic. L, L, -2 ble STRMM_L8x8_LOOP_END .align 5 STRMM_L8x8_LOOP: KERNEL8x8_1 KERNEL8x8_2 KERNEL8x8_1 KERNEL8x8_2 KERNEL8x8_1 KERNEL8x8_2 KERNEL8x8_1 KERNEL8x8_2 addic. L, L, -1 bgt STRMM_L8x8_LOOP STRMM_L8x8_LOOP_END: KERNEL8x8_1 KERNEL8x8_2 KERNEL8x8_1 KERNEL8x8_2 KERNEL8x8_1 KERNEL8x8_2 KERNEL8x8_1 KERNEL8x8_E2 b STRMM_L8x8_SUB1 STRMM_L8x8_SUB4: KERNEL8x8_SUBI1 KERNEL8x8_SUB1 KERNEL8x8_SUB1 KERNEL8x8_SUB1 KERNEL8x8_SUB1 KERNEL8x8_SUB1 KERNEL8x8_SUB1 KERNEL8x8_SUB1 b STRMM_L8x8_SUB1 STRMM_L8x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL8x8_SUBI1 addic. L, L, -1 ble STRMM_L8x8_SAVE b STRMM_L8x8_SUB2 STRMM_L8x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble STRMM_L8x8_SAVE STRMM_L8x8_SUB2: KERNEL8x8_SUB1 addic. L, L, -1 bgt STRMM_L8x8_SUB2 STRMM_L8x8_SAVE: SAVE8x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 8 // KK += Number of values in A #endif STRMM_L8x8_END: STRMM_L8x4_BEGIN: andi. T1, M, 4 ble STRMM_L8x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 5 // Number of values in B shifted slwi T2, KK, 4 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble STRMM_L8x4_SUB0 cmpwi cr0, L, 1 ble STRMM_L8x4_SUB4 STRMM_L8x4_LOOP_START: LOAD8x4_1 KERNEL8x4_I1 KERNEL8x4_2 KERNEL8x4_1 KERNEL8x4_2 KERNEL8x4_1 KERNEL8x4_2 KERNEL8x4_1 KERNEL8x4_2 addic. L, L, -2 ble STRMM_L8x4_LOOP_END .align 5 STRMM_L8x4_LOOP: KERNEL8x4_1 KERNEL8x4_2 KERNEL8x4_1 KERNEL8x4_2 KERNEL8x4_1 KERNEL8x4_2 KERNEL8x4_1 KERNEL8x4_2 addic. L, L, -1 bgt STRMM_L8x4_LOOP STRMM_L8x4_LOOP_END: KERNEL8x4_1 KERNEL8x4_2 KERNEL8x4_1 KERNEL8x4_2 KERNEL8x4_1 KERNEL8x4_2 KERNEL8x4_1 KERNEL8x4_E2 b STRMM_L8x4_SUB1 STRMM_L8x4_SUB4: KERNEL8x4_SUBI1 KERNEL8x4_SUB1 KERNEL8x4_SUB1 KERNEL8x4_SUB1 KERNEL8x4_SUB1 KERNEL8x4_SUB1 KERNEL8x4_SUB1 KERNEL8x4_SUB1 b STRMM_L8x4_SUB1 STRMM_L8x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL8x4_SUBI1 addic. 
L, L, -1 ble STRMM_L8x4_SAVE b STRMM_L8x4_SUB2 STRMM_L8x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble STRMM_L8x4_SAVE STRMM_L8x4_SUB2: KERNEL8x4_SUB1 addic. L, L, -1 bgt STRMM_L8x4_SUB2 STRMM_L8x4_SAVE: SAVE8x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 4 // KK += Number of values in A #endif STRMM_L8x4_END: STRMM_L8x2_BEGIN: andi. T1, M, 2 ble STRMM_L8x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 5 // Number of values in B shifted slwi T2, KK, 3 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble STRMM_L8x2_SUB0 cmpwi cr0, L, 1 ble STRMM_L8x2_SUB4 STRMM_L8x2_LOOP_START: LOAD8x2_1 KERNEL8x2_I1 KERNEL8x2_2 KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_1 KERNEL8x2_2 addic. L, L, -2 ble STRMM_L8x2_LOOP_END .align 5 STRMM_L8x2_LOOP: KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_1 KERNEL8x2_2 addic. L, L, -1 bgt STRMM_L8x2_LOOP STRMM_L8x2_LOOP_END: KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_1 KERNEL8x2_E2 b STRMM_L8x2_SUB1 STRMM_L8x2_SUB4: KERNEL8x2_SUBI1 KERNEL8x2_SUB1 KERNEL8x2_SUB1 KERNEL8x2_SUB1 KERNEL8x2_SUB1 KERNEL8x2_SUB1 KERNEL8x2_SUB1 KERNEL8x2_SUB1 b STRMM_L8x2_SUB1 STRMM_L8x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL8x2_SUBI1 addic. L, L, -1 ble STRMM_L8x2_SAVE b STRMM_L8x2_SUB2 STRMM_L8x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble STRMM_L8x2_SAVE STRMM_L8x2_SUB2: KERNEL8x2_SUB1 addic. L, L, -1 bgt STRMM_L8x2_SUB2 STRMM_L8x2_SAVE: SAVE8x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 2 // KK += Number of values in A #endif STRMM_L8x2_END: STRMM_L8x1_BEGIN: andi. T1, M, 1 ble STRMM_L8x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 5 // Number of values in B shifted slwi T2, KK, 2 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. 
L, K1, 3 // KTEMP / 8 -> L ble STRMM_L8x1_SUB0 cmpwi cr0, L, 1 ble STRMM_L8x1_SUB4 STRMM_L8x1_LOOP_START: LOAD8x1_1 KERNEL8x1_I1 KERNEL8x1_2 KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_1 KERNEL8x1_2 addic. L, L, -2 ble STRMM_L8x1_LOOP_END .align 5 STRMM_L8x1_LOOP: KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_1 KERNEL8x1_2 addic. L, L, -1 bgt STRMM_L8x1_LOOP STRMM_L8x1_LOOP_END: KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_1 KERNEL8x1_E2 b STRMM_L8x1_SUB1 STRMM_L8x1_SUB4: KERNEL8x1_SUBI1 KERNEL8x1_SUB1 KERNEL8x1_SUB1 KERNEL8x1_SUB1 KERNEL8x1_SUB1 KERNEL8x1_SUB1 KERNEL8x1_SUB1 KERNEL8x1_SUB1 b STRMM_L8x1_SUB1 STRMM_L8x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL8x1_SUBI1 addic. L, L, -1 ble STRMM_L8x1_SAVE b STRMM_L8x1_SUB2 STRMM_L8x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble STRMM_L8x1_SAVE STRMM_L8x1_SUB2: KERNEL8x1_SUB1 addic. L, L, -1 bgt STRMM_L8x1_SUB2 STRMM_L8x1_SAVE: SAVE8x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 1 // KK += Number of values in A #endif STRMM_L8x1_END: slwi T1, K, 5 add B, B, T1 #if !defined(LEFT) addi KK, KK, 8 // KK += Number of values in B #endif addic. J, J, -1 bgt STRMM_L8_BEGIN andi. T2, N, 7 ble L999 STRMM_L8_END: b STRMM_L4_BEGIN L999_H1: b L999 STRMM_L4_BEGIN: andi. T1, N, 4 ble STRMM_L4_END mr CO, C mr AO, A slwi T1, LDC , 2 add C, C, T1 #if defined(LEFT) mr KK, OFFSET // OFFSET -> KK #endif srawi. I, M, 4 ble STRMM_L4x16_END STRMM_L4x16_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 4 // Number of values in B shifted slwi T2, KK, 6 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble STRMM_L4x16_SUB0 cmpwi cr0, L, 1 ble STRMM_L4x16_SUB4 STRMM_L4x16_LOOP_START: dcbt AO, PRE LOAD4x16_1 dcbt AO, PRE KERNEL4x16_I1 dcbt AO, PRE KERNEL4x16_2 dcbt AO, PRE KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 dcbt AO, PRE KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 dcbt AO, PRE KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 addic. L, L, -2 ble STRMM_L4x16_LOOP_END .align 5 STRMM_L4x16_LOOP: dcbt AO, PRE KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 dcbt AO, PRE KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 dcbt AO, PRE KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 dcbt AO, PRE KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 addic. 
L, L, -1 bgt STRMM_L4x16_LOOP STRMM_L4x16_LOOP_END: dcbt AO, PRE KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 dcbt AO, PRE KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 dcbt AO, PRE KERNEL4x16_1 dcbt AO, PRE KERNEL4x16_2 dcbt AO, PRE KERNEL4x16_1 KERNEL4x16_E2 b STRMM_L4x16_SUB1 STRMM_L4x16_SUB4: dcbt AO, PRE KERNEL4x16_SUBI1 dcbt AO, PRE KERNEL4x16_SUB1 dcbt AO, PRE KERNEL4x16_SUB1 dcbt AO, PRE KERNEL4x16_SUB1 KERNEL4x16_SUB1 KERNEL4x16_SUB1 KERNEL4x16_SUB1 KERNEL4x16_SUB1 b STRMM_L4x16_SUB1 STRMM_L4x16_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x16_SUBI1 addic. L, L, -1 ble STRMM_L4x16_SAVE b STRMM_L4x16_SUB2 STRMM_L4x16_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble STRMM_L4x16_SAVE STRMM_L4x16_SUB2: KERNEL4x16_SUB1 addic. L, L, -1 bgt STRMM_L4x16_SUB2 STRMM_L4x16_SAVE: SAVE4x16 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 16 // KK += Number of values in A #endif addic. I, I, -1 bgt STRMM_L4x16_BEGIN STRMM_L4x16_END: STRMM_L4x8_BEGIN: andi. T2, M, 15 ble STRMM_L4x1_END andi. T1, M, 8 ble STRMM_L4x8_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 4 // Number of values in B shifted slwi T2, KK, 5 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble STRMM_L4x8_SUB0 cmpwi cr0, L, 1 ble STRMM_L4x8_SUB4 STRMM_L4x8_LOOP_START: LOAD4x8_1 KERNEL4x8_I1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 addic. L, L, -2 ble STRMM_L4x8_LOOP_END .align 5 STRMM_L4x8_LOOP: KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 addic. L, L, -1 bgt STRMM_L4x8_LOOP STRMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_2 KERNEL4x8_1 KERNEL4x8_E2 b STRMM_L4x8_SUB1 STRMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 KERNEL4x8_SUB1 b STRMM_L4x8_SUB1 STRMM_L4x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x8_SUBI1 addic. L, L, -1 ble STRMM_L4x8_SAVE b STRMM_L4x8_SUB2 STRMM_L4x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble STRMM_L4x8_SAVE STRMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. L, L, -1 bgt STRMM_L4x8_SUB2 STRMM_L4x8_SAVE: SAVE4x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 8 // KK += Number of values in A #endif STRMM_L4x8_END: STRMM_L4x4_BEGIN: andi. 
T1, M, 4 ble STRMM_L4x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 4 // Number of values in B shifted slwi T2, KK, 4 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble STRMM_L4x4_SUB0 cmpwi cr0, L, 1 ble STRMM_L4x4_SUB4 STRMM_L4x4_LOOP_START: LOAD4x4_1 KERNEL4x4_I1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 addic. L, L, -2 ble STRMM_L4x4_LOOP_END .align 5 STRMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 addic. L, L, -1 bgt STRMM_L4x4_LOOP STRMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_2 KERNEL4x4_1 KERNEL4x4_E2 b STRMM_L4x4_SUB1 STRMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 KERNEL4x4_SUB1 b STRMM_L4x4_SUB1 STRMM_L4x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x4_SUBI1 addic. L, L, -1 ble STRMM_L4x4_SAVE b STRMM_L4x4_SUB2 STRMM_L4x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble STRMM_L4x4_SAVE STRMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. L, L, -1 bgt STRMM_L4x4_SUB2 STRMM_L4x4_SAVE: SAVE4x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 4 // KK += Number of values in A #endif STRMM_L4x4_END: STRMM_L4x2_BEGIN: andi. T1, M, 2 ble STRMM_L4x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 4 // Number of values in B shifted slwi T2, KK, 3 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble STRMM_L4x2_SUB0 cmpwi cr0, L, 1 ble STRMM_L4x2_SUB4 STRMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 addic. L, L, -2 ble STRMM_L4x2_LOOP_END .align 5 STRMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 addic. L, L, -1 bgt STRMM_L4x2_LOOP STRMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_1 KERNEL4x2_E2 b STRMM_L4x2_SUB1 STRMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 KERNEL4x2_SUB1 b STRMM_L4x2_SUB1 STRMM_L4x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x2_SUBI1 addic. 
L, L, -1 ble STRMM_L4x2_SAVE b STRMM_L4x2_SUB2 STRMM_L4x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble STRMM_L4x2_SAVE STRMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. L, L, -1 bgt STRMM_L4x2_SUB2 STRMM_L4x2_SAVE: SAVE4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 2 // KK += Number of values in A #endif STRMM_L4x2_END: STRMM_L4x1_BEGIN: andi. T1, M, 1 ble STRMM_L4x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 4 // Number of values in B shifted slwi T2, KK, 2 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble STRMM_L4x1_SUB0 cmpwi cr0, L, 1 ble STRMM_L4x1_SUB4 STRMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 addic. L, L, -2 ble STRMM_L4x1_LOOP_END .align 5 STRMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 addic. L, L, -1 bgt STRMM_L4x1_LOOP STRMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_1 KERNEL4x1_E2 b STRMM_L4x1_SUB1 STRMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 KERNEL4x1_SUB1 b STRMM_L4x1_SUB1 STRMM_L4x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x1_SUBI1 addic. L, L, -1 ble STRMM_L4x1_SAVE b STRMM_L4x1_SUB2 STRMM_L4x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble STRMM_L4x1_SAVE STRMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. L, L, -1 bgt STRMM_L4x1_SUB2 STRMM_L4x1_SAVE: SAVE4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 1 // KK += Number of values in A #endif STRMM_L4x1_END: slwi T1, K, 4 add B, B, T1 #if !defined(LEFT) addi KK, KK, 4 // KK += Number of values in B #endif STRMM_L4_END: STRMM_L2_BEGIN: andi. T1, N, 2 ble STRMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 #if defined(LEFT) mr KK, OFFSET // OFFSET -> KK #endif srawi. 
I, M, 4 ble STRMM_L2x16_END STRMM_L2x16_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 3 // Number of values in B shifted slwi T2, KK, 6 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble STRMM_L2x16_SUB0 cmpwi cr0, L, 1 ble STRMM_L2x16_SUB4 STRMM_L2x16_LOOP_START: dcbt AO, PRE LOAD2x16_1 dcbt AO, PRE KERNEL2x16_I1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 addic. L, L, -2 ble STRMM_L2x16_LOOP_END .align 5 STRMM_L2x16_LOOP: dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 addic. L, L, -1 bgt STRMM_L2x16_LOOP STRMM_L2x16_LOOP_END: dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 dcbt AO, PRE KERNEL2x16_2 dcbt AO, PRE KERNEL2x16_1 KERNEL2x16_E2 b STRMM_L2x16_SUB1 STRMM_L2x16_SUB4: dcbt AO, PRE KERNEL2x16_SUBI1 dcbt AO, PRE KERNEL2x16_SUB1 dcbt AO, PRE KERNEL2x16_SUB1 dcbt AO, PRE KERNEL2x16_SUB1 KERNEL2x16_SUB1 KERNEL2x16_SUB1 KERNEL2x16_SUB1 KERNEL2x16_SUB1 b STRMM_L2x16_SUB1 STRMM_L2x16_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x16_SUBI1 addic. L, L, -1 ble STRMM_L2x16_SAVE b STRMM_L2x16_SUB2 STRMM_L2x16_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble STRMM_L2x16_SAVE STRMM_L2x16_SUB2: KERNEL2x16_SUB1 addic. L, L, -1 bgt STRMM_L2x16_SUB2 STRMM_L2x16_SAVE: SAVE2x16 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 16 // KK += Number of values in A #endif addic. I, I, -1 bgt STRMM_L2x16_BEGIN STRMM_L2x16_END: STRMM_L2x8_BEGIN: andi. T2, M, 15 ble STRMM_L2x1_END andi. T1, M, 8 ble STRMM_L2x8_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 3 // Number of values in B shifted slwi T2, KK, 5 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble STRMM_L2x8_SUB0 cmpwi cr0, L, 1 ble STRMM_L2x8_SUB4 STRMM_L2x8_LOOP_START: LOAD2x8_1 KERNEL2x8_I1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 addic. 
L, L, -2 ble STRMM_L2x8_LOOP_END .align 5 STRMM_L2x8_LOOP: KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 addic. L, L, -1 bgt STRMM_L2x8_LOOP STRMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_2 KERNEL2x8_1 KERNEL2x8_E2 b STRMM_L2x8_SUB1 STRMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 b STRMM_L2x8_SUB1 STRMM_L2x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x8_SUBI1 addic. L, L, -1 ble STRMM_L2x8_SAVE b STRMM_L2x8_SUB2 STRMM_L2x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble STRMM_L2x8_SAVE STRMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 bgt STRMM_L2x8_SUB2 STRMM_L2x8_SAVE: SAVE2x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 8 // KK += Number of values in A #endif STRMM_L2x8_END: STRMM_L2x4_BEGIN: andi. T1, M, 4 ble STRMM_L2x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 3 // Number of values in B shifted slwi T2, KK, 4 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble STRMM_L2x4_SUB0 cmpwi cr0, L, 1 ble STRMM_L2x4_SUB4 STRMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 addic. L, L, -2 ble STRMM_L2x4_LOOP_END .align 5 STRMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 addic. L, L, -1 bgt STRMM_L2x4_LOOP STRMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_E2 b STRMM_L2x4_SUB1 STRMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 b STRMM_L2x4_SUB1 STRMM_L2x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x4_SUBI1 addic. L, L, -1 ble STRMM_L2x4_SAVE b STRMM_L2x4_SUB2 STRMM_L2x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble STRMM_L2x4_SAVE STRMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 bgt STRMM_L2x4_SUB2 STRMM_L2x4_SAVE: SAVE2x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 4 // KK += Number of values in A #endif STRMM_L2x4_END: STRMM_L2x2_BEGIN: andi. 
T1, M, 2 ble STRMM_L2x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 3 // Number of values in B shifted slwi T2, KK, 3 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble STRMM_L2x2_SUB0 cmpwi cr0, L, 1 ble STRMM_L2x2_SUB4 STRMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 addic. L, L, -2 ble STRMM_L2x2_LOOP_END .align 5 STRMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 addic. L, L, -1 bgt STRMM_L2x2_LOOP STRMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_E2 b STRMM_L2x2_SUB1 STRMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 b STRMM_L2x2_SUB1 STRMM_L2x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x2_SUBI1 addic. L, L, -1 ble STRMM_L2x2_SAVE b STRMM_L2x2_SUB2 STRMM_L2x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble STRMM_L2x2_SAVE STRMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 bgt STRMM_L2x2_SUB2 STRMM_L2x2_SAVE: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 2 // KK += Number of values in A #endif STRMM_L2x2_END: STRMM_L2x1_BEGIN: andi. T1, M, 1 ble STRMM_L2x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 3 // Number of values in B shifted slwi T2, KK, 2 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble STRMM_L2x1_SUB0 cmpwi cr0, L, 1 ble STRMM_L2x1_SUB4 STRMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 addic. L, L, -2 ble STRMM_L2x1_LOOP_END .align 5 STRMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 addic. L, L, -1 bgt STRMM_L2x1_LOOP STRMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_E2 b STRMM_L2x1_SUB1 STRMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 b STRMM_L2x1_SUB1 STRMM_L2x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x1_SUBI1 addic. 
L, L, -1 ble STRMM_L2x1_SAVE b STRMM_L2x1_SUB2 STRMM_L2x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble STRMM_L2x1_SAVE STRMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 bgt STRMM_L2x1_SUB2 STRMM_L2x1_SAVE: SAVE2x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 1 // KK += Number of values in A #endif STRMM_L2x1_END: slwi T1, K, 3 add B, B, T1 #if !defined(LEFT) addi KK, KK, 2 // KK += Number of values in B #endif STRMM_L2_END: STRMM_L1_BEGIN: andi. T1, N, 1 ble STRMM_L1_END mr CO, C mr AO, A #if defined(LEFT) mr KK, OFFSET // OFFSET -> KK #endif srawi. I, M, 4 ble STRMM_L1x16_END STRMM_L1x16_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 2 // Number of values in B shifted slwi T2, KK, 6 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble STRMM_L1x16_SUB0 cmpwi cr0, L, 1 ble STRMM_L1x16_SUB4 STRMM_L1x16_LOOP_START: dcbt AO, PRE LOAD1x16_1 dcbt AO, PRE KERNEL1x16_I1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 addic. L, L, -2 ble STRMM_L1x16_LOOP_END .align 5 STRMM_L1x16_LOOP: dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 addic. L, L, -1 bgt STRMM_L1x16_LOOP STRMM_L1x16_LOOP_END: dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 dcbt AO, PRE KERNEL1x16_2 dcbt AO, PRE KERNEL1x16_1 KERNEL1x16_E2 b STRMM_L1x16_SUB1 STRMM_L1x16_SUB4: dcbt AO, PRE KERNEL1x16_SUBI1 dcbt AO, PRE KERNEL1x16_SUB1 dcbt AO, PRE KERNEL1x16_SUB1 dcbt AO, PRE KERNEL1x16_SUB1 KERNEL1x16_SUB1 KERNEL1x16_SUB1 KERNEL1x16_SUB1 KERNEL1x16_SUB1 b STRMM_L1x16_SUB1 STRMM_L1x16_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x16_SUBI1 addic. L, L, -1 ble STRMM_L1x16_SAVE b STRMM_L1x16_SUB2 STRMM_L1x16_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble STRMM_L1x16_SAVE STRMM_L1x16_SUB2: KERNEL1x16_SUB1 addic. L, L, -1 bgt STRMM_L1x16_SUB2 STRMM_L1x16_SAVE: SAVE1x16 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 16 // KK += Number of values in A #endif addic. I, I, -1 bgt STRMM_L1x16_BEGIN STRMM_L1x16_END: STRMM_L1x8_BEGIN: andi. T2, M, 15 ble STRMM_L1x1_END andi. 
T1, M, 8 ble STRMM_L1x8_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 2 // Number of values in B shifted slwi T2, KK, 5 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble STRMM_L1x8_SUB0 cmpwi cr0, L, 1 ble STRMM_L1x8_SUB4 STRMM_L1x8_LOOP_START: LOAD1x8_1 KERNEL1x8_I1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 addic. L, L, -2 ble STRMM_L1x8_LOOP_END .align 5 STRMM_L1x8_LOOP: KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 addic. L, L, -1 bgt STRMM_L1x8_LOOP STRMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_2 KERNEL1x8_1 KERNEL1x8_E2 b STRMM_L1x8_SUB1 STRMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 b STRMM_L1x8_SUB1 STRMM_L1x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x8_SUBI1 addic. L, L, -1 ble STRMM_L1x8_SAVE b STRMM_L1x8_SUB2 STRMM_L1x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble STRMM_L1x8_SAVE STRMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 bgt STRMM_L1x8_SUB2 STRMM_L1x8_SAVE: SAVE1x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 8 // KK += Number of values in A #endif STRMM_L1x8_END: STRMM_L1x4_BEGIN: andi. T1, M, 4 ble STRMM_L1x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 2 // Number of values in B shifted slwi T2, KK, 4 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble STRMM_L1x4_SUB0 cmpwi cr0, L, 1 ble STRMM_L1x4_SUB4 STRMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 addic. L, L, -2 ble STRMM_L1x4_LOOP_END .align 5 STRMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 addic. L, L, -1 bgt STRMM_L1x4_LOOP STRMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_E2 b STRMM_L1x4_SUB1 STRMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 b STRMM_L1x4_SUB1 STRMM_L1x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x4_SUBI1 addic. 
L, L, -1 ble STRMM_L1x4_SAVE b STRMM_L1x4_SUB2 STRMM_L1x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble STRMM_L1x4_SAVE STRMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 bgt STRMM_L1x4_SUB2 STRMM_L1x4_SAVE: SAVE1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 4 // KK += Number of values in A #endif STRMM_L1x4_END: STRMM_L1x2_BEGIN: andi. T1, M, 2 ble STRMM_L1x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 2 // Number of values in B shifted slwi T2, KK, 3 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble STRMM_L1x2_SUB0 cmpwi cr0, L, 1 ble STRMM_L1x2_SUB4 STRMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 addic. L, L, -2 ble STRMM_L1x2_LOOP_END .align 5 STRMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 addic. L, L, -1 bgt STRMM_L1x2_LOOP STRMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_E2 b STRMM_L1x2_SUB1 STRMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 b STRMM_L1x2_SUB1 STRMM_L1x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x2_SUBI1 addic. L, L, -1 ble STRMM_L1x2_SAVE b STRMM_L1x2_SUB2 STRMM_L1x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble STRMM_L1x2_SAVE STRMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 bgt STRMM_L1x2_SUB2 STRMM_L1x2_SAVE: SAVE1x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 2 // KK += Number of values in A #endif STRMM_L1x2_END: STRMM_L1x1_BEGIN: andi. T1, M, 1 ble STRMM_L1x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 2 // Number of values in B shifted slwi T2, KK, 2 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. 
L, K1, 3 // KTEMP / 8 -> L ble STRMM_L1x1_SUB0 cmpwi cr0, L, 1 ble STRMM_L1x1_SUB4 STRMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 addic. L, L, -2 ble STRMM_L1x1_LOOP_END .align 5 STRMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 addic. L, L, -1 bgt STRMM_L1x1_LOOP STRMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_E2 b STRMM_L1x1_SUB1 STRMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 b STRMM_L1x1_SUB1 STRMM_L1x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x1_SUBI1 addic. L, L, -1 ble STRMM_L1x1_SAVE b STRMM_L1x1_SUB2 STRMM_L1x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble STRMM_L1x1_SAVE STRMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 bgt STRMM_L1x1_SUB2 STRMM_L1x1_SAVE: SAVE1x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 1 // KK += Number of values in A #endif STRMM_L1x1_END: #if !defined(LEFT) addi KK, KK, 1 // KK += Number of values in B #endif STRMM_L1_END: OpenBLAS-0.2.20/kernel/power/strmm_macros_16x8_power8.S000066400000000000000000002356171313527062700225220ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/04/02 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /********************************************************************************************** * Macros for N=8 and M=16 **********************************************************************************************/ .macro LOAD8x16_1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 lxvw4x vs29, o16, BO xxspltw vs12, vs29, 0 xxspltw vs13, vs29, 1 xxspltw vs14, vs29, 2 xxspltw vs15, vs29, 3 addi BO, BO, 32 .endm .macro KERNEL8x16_I1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO lxvw4x vs6, o32, AO lxvw4x vs7, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 xxspltw vs17, vs28, 1 xxspltw vs18, vs28, 2 xxspltw vs19, vs28, 3 lxvw4x vs29, o16, BO xxspltw vs20, vs29, 0 xxspltw vs21, vs29, 1 xxspltw vs22, vs29, 2 xxspltw vs23, vs29, 3 addi BO, BO, 32 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs2, vs8 xvmulsp vs35, vs3, vs8 xvmulsp vs36, vs0, vs9 xvmulsp vs37, vs1, vs9 xvmulsp vs38, vs2, vs9 xvmulsp vs39, vs3, vs9 xvmulsp vs40, vs0, vs10 xvmulsp vs41, vs1, vs10 xvmulsp vs42, vs2, vs10 xvmulsp vs43, vs3, vs10 xvmulsp vs44, vs0, vs11 xvmulsp vs45, vs1, vs11 xvmulsp vs46, vs2, vs11 xvmulsp vs47, vs3, vs11 xvmulsp vs48, vs0, vs12 xvmulsp vs49, vs1, vs12 xvmulsp vs50, vs2, vs12 xvmulsp vs51, vs3, vs12 xvmulsp vs52, vs0, vs13 xvmulsp vs53, vs1, vs13 xvmulsp vs54, vs2, vs13 xvmulsp vs55, vs3, vs13 xvmulsp vs56, vs0, vs14 xvmulsp vs57, vs1, vs14 xvmulsp vs58, vs2, vs14 xvmulsp vs59, vs3, vs14 xvmulsp vs60, vs0, vs15 xvmulsp vs61, vs1, vs15 xvmulsp vs62, vs2, vs15 xvmulsp vs63, vs3, vs15 .endm .macro KERNEL8x16_1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO lxvw4x vs6, o32, AO lxvw4x vs7, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 xxspltw vs17, vs28, 1 xxspltw vs18, vs28, 2 xxspltw vs19, vs28, 3 lxvw4x vs29, o16, BO xxspltw vs20, vs29, 0 xxspltw vs21, vs29, 1 xxspltw vs22, vs29, 2 xxspltw vs23, vs29, 3 addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs2, vs8 xvmaddasp vs35, vs3, vs8 xvmaddasp vs36, vs0, vs9 xvmaddasp vs37, vs1, vs9 xvmaddasp vs38, vs2, vs9 xvmaddasp vs39, vs3, vs9 xvmaddasp vs40, vs0, vs10 xvmaddasp vs41, vs1, vs10 xvmaddasp vs42, vs2, vs10 xvmaddasp vs43, vs3, vs10 xvmaddasp vs44, vs0, vs11 xvmaddasp vs45, vs1, vs11 xvmaddasp vs46, vs2, vs11 xvmaddasp vs47, vs3, vs11 xvmaddasp vs48, vs0, vs12 xvmaddasp vs49, vs1, vs12 xvmaddasp vs50, vs2, vs12 xvmaddasp vs51, vs3, vs12 xvmaddasp vs52, vs0, vs13 xvmaddasp vs53, vs1, vs13 xvmaddasp vs54, vs2, vs13 xvmaddasp vs55, vs3, vs13 xvmaddasp vs56, vs0, vs14 xvmaddasp vs57, vs1, vs14 xvmaddasp vs58, vs2, vs14 xvmaddasp vs59, vs3, vs14 xvmaddasp vs60, vs0, vs15 xvmaddasp vs61, vs1, vs15 xvmaddasp vs62, vs2, vs15 xvmaddasp vs63, vs3, vs15 .endm .macro KERNEL8x16_2 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 lxvw4x vs29, o16, BO xxspltw vs12, vs29, 0 xxspltw vs13, vs29, 1 xxspltw vs14, vs29, 2 xxspltw vs15, vs29, 
3 addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs6, vs16 xvmaddasp vs35, vs7, vs16 xvmaddasp vs36, vs4, vs17 xvmaddasp vs37, vs5, vs17 xvmaddasp vs38, vs6, vs17 xvmaddasp vs39, vs7, vs17 xvmaddasp vs40, vs4, vs18 xvmaddasp vs41, vs5, vs18 xvmaddasp vs42, vs6, vs18 xvmaddasp vs43, vs7, vs18 xvmaddasp vs44, vs4, vs19 xvmaddasp vs45, vs5, vs19 xvmaddasp vs46, vs6, vs19 xvmaddasp vs47, vs7, vs19 xvmaddasp vs48, vs4, vs20 xvmaddasp vs49, vs5, vs20 xvmaddasp vs50, vs6, vs20 xvmaddasp vs51, vs7, vs20 xvmaddasp vs52, vs4, vs21 xvmaddasp vs53, vs5, vs21 xvmaddasp vs54, vs6, vs21 xvmaddasp vs55, vs7, vs21 xvmaddasp vs56, vs4, vs22 xvmaddasp vs57, vs5, vs22 xvmaddasp vs58, vs6, vs22 xvmaddasp vs59, vs7, vs22 xvmaddasp vs60, vs4, vs23 xvmaddasp vs61, vs5, vs23 xvmaddasp vs62, vs6, vs23 xvmaddasp vs63, vs7, vs23 .endm .macro KERNEL8x16_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs6, vs16 xvmaddasp vs35, vs7, vs16 xvmaddasp vs36, vs4, vs17 xvmaddasp vs37, vs5, vs17 xvmaddasp vs38, vs6, vs17 xvmaddasp vs39, vs7, vs17 xvmaddasp vs40, vs4, vs18 xvmaddasp vs41, vs5, vs18 xvmaddasp vs42, vs6, vs18 xvmaddasp vs43, vs7, vs18 xvmaddasp vs44, vs4, vs19 xvmaddasp vs45, vs5, vs19 xvmaddasp vs46, vs6, vs19 xvmaddasp vs47, vs7, vs19 xvmaddasp vs48, vs4, vs20 xvmaddasp vs49, vs5, vs20 xvmaddasp vs50, vs6, vs20 xvmaddasp vs51, vs7, vs20 xvmaddasp vs52, vs4, vs21 xvmaddasp vs53, vs5, vs21 xvmaddasp vs54, vs6, vs21 xvmaddasp vs55, vs7, vs21 xvmaddasp vs56, vs4, vs22 xvmaddasp vs57, vs5, vs22 xvmaddasp vs58, vs6, vs22 xvmaddasp vs59, vs7, vs22 xvmaddasp vs60, vs4, vs23 xvmaddasp vs61, vs5, vs23 xvmaddasp vs62, vs6, vs23 xvmaddasp vs63, vs7, vs23 .endm .macro KERNEL8x16_SUBI1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 lxvw4x vs29, o16, BO xxspltw vs12, vs29, 0 xxspltw vs13, vs29, 1 xxspltw vs14, vs29, 2 xxspltw vs15, vs29, 3 addi BO, BO, 32 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs2, vs8 xvmulsp vs35, vs3, vs8 xvmulsp vs36, vs0, vs9 xvmulsp vs37, vs1, vs9 xvmulsp vs38, vs2, vs9 xvmulsp vs39, vs3, vs9 xvmulsp vs40, vs0, vs10 xvmulsp vs41, vs1, vs10 xvmulsp vs42, vs2, vs10 xvmulsp vs43, vs3, vs10 xvmulsp vs44, vs0, vs11 xvmulsp vs45, vs1, vs11 xvmulsp vs46, vs2, vs11 xvmulsp vs47, vs3, vs11 xvmulsp vs48, vs0, vs12 xvmulsp vs49, vs1, vs12 xvmulsp vs50, vs2, vs12 xvmulsp vs51, vs3, vs12 xvmulsp vs52, vs0, vs13 xvmulsp vs53, vs1, vs13 xvmulsp vs54, vs2, vs13 xvmulsp vs55, vs3, vs13 xvmulsp vs56, vs0, vs14 xvmulsp vs57, vs1, vs14 xvmulsp vs58, vs2, vs14 xvmulsp vs59, vs3, vs14 xvmulsp vs60, vs0, vs15 xvmulsp vs61, vs1, vs15 xvmulsp vs62, vs2, vs15 xvmulsp vs63, vs3, vs15 .endm .macro KERNEL8x16_SUB1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 lxvw4x vs29, o16, BO xxspltw vs12, vs29, 0 xxspltw vs13, vs29, 1 xxspltw vs14, vs29, 2 xxspltw vs15, vs29, 3 addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs2, vs8 xvmaddasp vs35, vs3, vs8 xvmaddasp vs36, vs0, vs9 xvmaddasp vs37, vs1, vs9 xvmaddasp vs38, vs2, vs9 xvmaddasp vs39, vs3, vs9 xvmaddasp vs40, vs0, vs10 xvmaddasp vs41, vs1, vs10 xvmaddasp vs42, vs2, vs10 xvmaddasp vs43, vs3, vs10 xvmaddasp vs44, vs0, vs11 xvmaddasp vs45, vs1, vs11 xvmaddasp vs46, 
vs2, vs11 xvmaddasp vs47, vs3, vs11 xvmaddasp vs48, vs0, vs12 xvmaddasp vs49, vs1, vs12 xvmaddasp vs50, vs2, vs12 xvmaddasp vs51, vs3, vs12 xvmaddasp vs52, vs0, vs13 xvmaddasp vs53, vs1, vs13 xvmaddasp vs54, vs2, vs13 xvmaddasp vs55, vs3, vs13 xvmaddasp vs56, vs0, vs14 xvmaddasp vs57, vs1, vs14 xvmaddasp vs58, vs2, vs14 xvmaddasp vs59, vs3, vs14 xvmaddasp vs60, vs0, vs15 xvmaddasp vs61, vs1, vs15 xvmaddasp vs62, vs2, vs15 xvmaddasp vs63, vs3, vs15 .endm .macro SAVE8x16 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr xvmulsp vs1, vs33, alpha_vr xvmulsp vs2, vs34, alpha_vr xvmulsp vs3, vs35, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr xvmaddasp vs1, vs33, alpha_vr xvmaddasp vs2, vs34, alpha_vr xvmaddasp vs3, vs35, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs36, alpha_vr xvmulsp vs1, vs37, alpha_vr xvmulsp vs2, vs38, alpha_vr xvmulsp vs3, vs39, alpha_vr #else xvmaddasp vs0, vs36, alpha_vr xvmaddasp vs1, vs37, alpha_vr xvmaddasp vs2, vs38, alpha_vr xvmaddasp vs3, vs39, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs40, alpha_vr xvmulsp vs1, vs41, alpha_vr xvmulsp vs2, vs42, alpha_vr xvmulsp vs3, vs43, alpha_vr #else xvmaddasp vs0, vs40, alpha_vr xvmaddasp vs1, vs41, alpha_vr xvmaddasp vs2, vs42, alpha_vr xvmaddasp vs3, vs43, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs44, alpha_vr xvmulsp vs1, vs45, alpha_vr xvmulsp vs2, vs46, alpha_vr xvmulsp vs3, vs47, alpha_vr #else xvmaddasp vs0, vs44, alpha_vr xvmaddasp vs1, vs45, alpha_vr xvmaddasp vs2, vs46, alpha_vr xvmaddasp vs3, vs47, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs48, alpha_vr xvmulsp vs1, vs49, alpha_vr xvmulsp vs2, vs50, alpha_vr xvmulsp vs3, vs51, alpha_vr #else xvmaddasp vs0, vs48, alpha_vr xvmaddasp vs1, vs49, alpha_vr xvmaddasp vs2, vs50, alpha_vr xvmaddasp vs3, vs51, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs52, alpha_vr xvmulsp vs1, vs53, alpha_vr xvmulsp vs2, vs54, alpha_vr xvmulsp vs3, vs55, alpha_vr #else xvmaddasp vs0, vs52, alpha_vr xvmaddasp vs1, vs53, alpha_vr xvmaddasp vs2, vs54, alpha_vr xvmaddasp vs3, vs55, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs56, alpha_vr xvmulsp vs1, vs57, alpha_vr xvmulsp vs2, vs58, alpha_vr xvmulsp vs3, vs59, alpha_vr #else xvmaddasp vs0, vs56, 
alpha_vr xvmaddasp vs1, vs57, alpha_vr xvmaddasp vs2, vs58, alpha_vr xvmaddasp vs3, vs59, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs60, alpha_vr xvmulsp vs1, vs61, alpha_vr xvmulsp vs2, vs62, alpha_vr xvmulsp vs3, vs63, alpha_vr #else xvmaddasp vs0, vs60, alpha_vr xvmaddasp vs1, vs61, alpha_vr xvmaddasp vs2, vs62, alpha_vr xvmaddasp vs3, vs63, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC addi CO, CO, 64 .endm /********************************************************************************************** * Macros for N=8 and M=8 **********************************************************************************************/ .macro LOAD8x8_1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 lxvw4x vs29, o16, BO xxspltw vs12, vs29, 0 xxspltw vs13, vs29, 1 xxspltw vs14, vs29, 2 xxspltw vs15, vs29, 3 addi BO, BO, 32 .endm .macro KERNEL8x8_I1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 xxspltw vs17, vs28, 1 xxspltw vs18, vs28, 2 xxspltw vs19, vs28, 3 lxvw4x vs29, o16, BO xxspltw vs20, vs29, 0 xxspltw vs21, vs29, 1 xxspltw vs22, vs29, 2 xxspltw vs23, vs29, 3 addi BO, BO, 32 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs0, vs9 xvmulsp vs35, vs1, vs9 xvmulsp vs36, vs0, vs10 xvmulsp vs37, vs1, vs10 xvmulsp vs38, vs0, vs11 xvmulsp vs39, vs1, vs11 xvmulsp vs40, vs0, vs12 xvmulsp vs41, vs1, vs12 xvmulsp vs42, vs0, vs13 xvmulsp vs43, vs1, vs13 xvmulsp vs44, vs0, vs14 xvmulsp vs45, vs1, vs14 xvmulsp vs46, vs0, vs15 xvmulsp vs47, vs1, vs15 .endm .macro KERNEL8x8_1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 xxspltw vs17, vs28, 1 xxspltw vs18, vs28, 2 xxspltw vs19, vs28, 3 lxvw4x vs29, o16, BO xxspltw vs20, vs29, 0 xxspltw vs21, vs29, 1 xxspltw vs22, vs29, 2 xxspltw vs23, vs29, 3 addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs0, vs9 xvmaddasp vs35, vs1, vs9 xvmaddasp vs36, vs0, vs10 xvmaddasp vs37, vs1, vs10 xvmaddasp vs38, vs0, vs11 xvmaddasp vs39, vs1, vs11 xvmaddasp vs40, vs0, vs12 xvmaddasp vs41, vs1, vs12 xvmaddasp vs42, vs0, vs13 xvmaddasp vs43, vs1, vs13 xvmaddasp vs44, vs0, vs14 xvmaddasp vs45, vs1, vs14 xvmaddasp vs46, vs0, vs15 xvmaddasp vs47, vs1, vs15 .endm .macro KERNEL8x8_2 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 lxvw4x vs29, o16, BO xxspltw vs12, vs29, 0 xxspltw vs13, vs29, 1 xxspltw vs14, vs29, 2 xxspltw vs15, vs29, 3 addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs4, vs17 xvmaddasp vs35, vs5, vs17 xvmaddasp vs36, vs4, vs18 xvmaddasp vs37, vs5, vs18 xvmaddasp vs38, vs4, vs19 xvmaddasp vs39, vs5, vs19 xvmaddasp vs40, vs4, vs20 xvmaddasp vs41, vs5, vs20 xvmaddasp vs42, vs4, vs21 xvmaddasp vs43, vs5, vs21 xvmaddasp vs44, vs4, vs22 xvmaddasp vs45, vs5, vs22 xvmaddasp vs46, vs4, vs23 xvmaddasp vs47, vs5, vs23 .endm .macro KERNEL8x8_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs4, vs17 xvmaddasp vs35, vs5, vs17 xvmaddasp vs36, vs4, vs18 xvmaddasp vs37, vs5, vs18 xvmaddasp 
vs38, vs4, vs19 xvmaddasp vs39, vs5, vs19 xvmaddasp vs40, vs4, vs20 xvmaddasp vs41, vs5, vs20 xvmaddasp vs42, vs4, vs21 xvmaddasp vs43, vs5, vs21 xvmaddasp vs44, vs4, vs22 xvmaddasp vs45, vs5, vs22 xvmaddasp vs46, vs4, vs23 xvmaddasp vs47, vs5, vs23 .endm .macro KERNEL8x8_SUBI1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 lxvw4x vs29, o16, BO xxspltw vs12, vs29, 0 xxspltw vs13, vs29, 1 xxspltw vs14, vs29, 2 xxspltw vs15, vs29, 3 addi BO, BO, 32 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs0, vs9 xvmulsp vs35, vs1, vs9 xvmulsp vs36, vs0, vs10 xvmulsp vs37, vs1, vs10 xvmulsp vs38, vs0, vs11 xvmulsp vs39, vs1, vs11 xvmulsp vs40, vs0, vs12 xvmulsp vs41, vs1, vs12 xvmulsp vs42, vs0, vs13 xvmulsp vs43, vs1, vs13 xvmulsp vs44, vs0, vs14 xvmulsp vs45, vs1, vs14 xvmulsp vs46, vs0, vs15 xvmulsp vs47, vs1, vs15 .endm .macro KERNEL8x8_SUB1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 lxvw4x vs29, o16, BO xxspltw vs12, vs29, 0 xxspltw vs13, vs29, 1 xxspltw vs14, vs29, 2 xxspltw vs15, vs29, 3 addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs0, vs9 xvmaddasp vs35, vs1, vs9 xvmaddasp vs36, vs0, vs10 xvmaddasp vs37, vs1, vs10 xvmaddasp vs38, vs0, vs11 xvmaddasp vs39, vs1, vs11 xvmaddasp vs40, vs0, vs12 xvmaddasp vs41, vs1, vs12 xvmaddasp vs42, vs0, vs13 xvmaddasp vs43, vs1, vs13 xvmaddasp vs44, vs0, vs14 xvmaddasp vs45, vs1, vs14 xvmaddasp vs46, vs0, vs15 xvmaddasp vs47, vs1, vs15 .endm .macro SAVE8x8 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr xvmulsp vs1, vs33, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr xvmaddasp vs1, vs33, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs34, alpha_vr xvmulsp vs1, vs35, alpha_vr #else xvmaddasp vs0, vs34, alpha_vr xvmaddasp vs1, vs35, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs36, alpha_vr xvmulsp vs1, vs37, alpha_vr #else xvmaddasp vs0, vs36, alpha_vr xvmaddasp vs1, vs37, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs38, alpha_vr xvmulsp vs1, vs39, alpha_vr #else xvmaddasp vs0, vs38, alpha_vr xvmaddasp vs1, vs39, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs40, alpha_vr xvmulsp vs1, vs41, alpha_vr #else xvmaddasp vs0, vs40, alpha_vr xvmaddasp vs1, vs41, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs42, alpha_vr xvmulsp vs1, vs43, alpha_vr #else xvmaddasp vs0, vs42, alpha_vr xvmaddasp vs1, vs43, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs44, alpha_vr xvmulsp vs1, vs45, alpha_vr #else xvmaddasp vs0, vs44, alpha_vr xvmaddasp vs1, vs45, alpha_vr #endif stxvw4x vs0, o0, T1 
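/* Each row of the 8x8 C tile in SAVE8x8 is written back with the same pattern seen
 * above and below: when TRMMKERNEL is defined, C is not read and the result is the
 * accumulator (vs32..vs47) scaled by alpha_vr via xvmulsp; otherwise the existing C
 * values are loaded with lxvw4x, updated with a fused multiply-add by alpha_vr
 * (xvmaddasp), stored back with stxvw4x, and T1 then advances by LDC to the next row. */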
stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs46, alpha_vr xvmulsp vs1, vs47, alpha_vr #else xvmaddasp vs0, vs46, alpha_vr xvmaddasp vs1, vs47, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC addi CO, CO, 32 .endm /********************************************************************************************** * Macros for N=8 and M=4 **********************************************************************************************/ .macro LOAD8x4_1 lxvw4x vs0, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 lxvw4x vs29, o16, BO xxspltw vs12, vs29, 0 xxspltw vs13, vs29, 1 xxspltw vs14, vs29, 2 xxspltw vs15, vs29, 3 addi BO, BO, 32 .endm .macro KERNEL8x4_I1 lxvw4x vs4, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 xxspltw vs17, vs28, 1 xxspltw vs18, vs28, 2 xxspltw vs19, vs28, 3 lxvw4x vs29, o16, BO xxspltw vs20, vs29, 0 xxspltw vs21, vs29, 1 xxspltw vs22, vs29, 2 xxspltw vs23, vs29, 3 addi BO, BO, 32 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs0, vs9 xvmulsp vs34, vs0, vs10 xvmulsp vs35, vs0, vs11 xvmulsp vs36, vs0, vs12 xvmulsp vs37, vs0, vs13 xvmulsp vs38, vs0, vs14 xvmulsp vs39, vs0, vs15 .endm .macro KERNEL8x4_1 lxvw4x vs4, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 xxspltw vs17, vs28, 1 xxspltw vs18, vs28, 2 xxspltw vs19, vs28, 3 lxvw4x vs29, o16, BO xxspltw vs20, vs29, 0 xxspltw vs21, vs29, 1 xxspltw vs22, vs29, 2 xxspltw vs23, vs29, 3 addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs0, vs9 xvmaddasp vs34, vs0, vs10 xvmaddasp vs35, vs0, vs11 xvmaddasp vs36, vs0, vs12 xvmaddasp vs37, vs0, vs13 xvmaddasp vs38, vs0, vs14 xvmaddasp vs39, vs0, vs15 .endm .macro KERNEL8x4_2 lxvw4x vs0, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 lxvw4x vs29, o16, BO xxspltw vs12, vs29, 0 xxspltw vs13, vs29, 1 xxspltw vs14, vs29, 2 xxspltw vs15, vs29, 3 addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs4, vs17 xvmaddasp vs34, vs4, vs18 xvmaddasp vs35, vs4, vs19 xvmaddasp vs36, vs4, vs20 xvmaddasp vs37, vs4, vs21 xvmaddasp vs38, vs4, vs22 xvmaddasp vs39, vs4, vs23 .endm .macro KERNEL8x4_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs4, vs17 xvmaddasp vs34, vs4, vs18 xvmaddasp vs35, vs4, vs19 xvmaddasp vs36, vs4, vs20 xvmaddasp vs37, vs4, vs21 xvmaddasp vs38, vs4, vs22 xvmaddasp vs39, vs4, vs23 .endm .macro KERNEL8x4_SUBI1 lxvw4x vs0, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 lxvw4x vs29, o16, BO xxspltw vs12, vs29, 0 xxspltw vs13, vs29, 1 xxspltw vs14, vs29, 2 xxspltw vs15, vs29, 3 addi BO, BO, 32 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs0, vs9 xvmulsp vs34, vs0, vs10 xvmulsp vs35, vs0, vs11 xvmulsp vs36, vs0, vs12 xvmulsp vs37, vs0, vs13 xvmulsp vs38, vs0, vs14 xvmulsp vs39, vs0, vs15 .endm .macro KERNEL8x4_SUB1 lxvw4x vs0, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 lxvw4x vs29, o16, BO xxspltw vs12, vs29, 0 xxspltw vs13, vs29, 1 xxspltw vs14, vs29, 2 xxspltw vs15, vs29, 3 addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs0, vs9 xvmaddasp vs34, vs0, vs10 xvmaddasp vs35, vs0, vs11 xvmaddasp vs36, vs0, vs12 xvmaddasp vs37, vs0, vs13 xvmaddasp vs38, vs0, vs14 xvmaddasp vs39, vs0, vs15 .endm .macro SAVE8x4 mr 
T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs33, alpha_vr #else xvmaddasp vs0, vs33, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs34, alpha_vr #else xvmaddasp vs0, vs34, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs35, alpha_vr #else xvmaddasp vs0, vs35, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs36, alpha_vr #else xvmaddasp vs0, vs36, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs37, alpha_vr #else xvmaddasp vs0, vs37, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs38, alpha_vr #else xvmaddasp vs0, vs38, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs39, alpha_vr #else xvmaddasp vs0, vs39, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC addi CO, CO, 16 .endm /********************************************************************************************** * Macros for N=8 and M=2 **********************************************************************************************/ .macro LOAD8x2_1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 lxsspx vs10, o8, T1 lxsspx vs11, o12, T1 addi T1, T1, 16 lxsspx vs12, o0, T1 lxsspx vs13, o4, T1 lxsspx vs14, o8, T1 lxsspx vs15, o12, T1 addi BO, BO, 32 .endm .macro KERNEL8x2_I1 lxsspx vs4, o0, AO lxsspx vs5, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o4, T1 lxsspx vs18, o8, T1 lxsspx vs19, o12, T1 addi T1, T1, 16 lxsspx vs20, o0, T1 lxsspx vs21, o4, T1 lxsspx vs22, o8, T1 lxsspx vs23, o12, T1 addi BO, BO, 32 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs1, vs8 xsmuldp vs34, vs0, vs9 xsmuldp vs35, vs1, vs9 xsmuldp vs36, vs0, vs10 xsmuldp vs37, vs1, vs10 xsmuldp vs38, vs0, vs11 xsmuldp vs39, vs1, vs11 xsmuldp vs40, vs0, vs12 xsmuldp vs41, vs1, vs12 xsmuldp vs42, vs0, vs13 xsmuldp vs43, vs1, vs13 xsmuldp vs44, vs0, vs14 xsmuldp vs45, vs1, vs14 xsmuldp vs46, vs0, vs15 xsmuldp vs47, vs1, vs15 .endm .macro KERNEL8x2_1 lxsspx vs4, o0, AO lxsspx vs5, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o4, T1 lxsspx vs18, o8, T1 lxsspx vs19, o12, T1 addi T1, T1, 16 lxsspx vs20, o0, T1 lxsspx vs21, o4, T1 lxsspx vs22, o8, T1 lxsspx vs23, o12, T1 addi BO, BO, 32 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs1, vs8 xsmaddadp vs34, vs0, vs9 xsmaddadp vs35, vs1, vs9 xsmaddadp vs36, vs0, vs10 xsmaddadp vs37, vs1, vs10 xsmaddadp vs38, vs0, vs11 xsmaddadp vs39, vs1, vs11 xsmaddadp vs40, vs0, vs12 xsmaddadp vs41, vs1, vs12 xsmaddadp vs42, vs0, vs13 xsmaddadp vs43, vs1, vs13 xsmaddadp vs44, vs0, vs14 xsmaddadp vs45, vs1, vs14 xsmaddadp vs46, vs0, vs15 xsmaddadp vs47, vs1, vs15 .endm .macro KERNEL8x2_2 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 lxsspx vs10, o8, T1 lxsspx vs11, o12, T1 addi T1, T1, 16 lxsspx vs12, o0, T1 lxsspx vs13, o4, T1 lxsspx vs14, o8, T1 lxsspx vs15, o12, T1 addi BO, BO, 32 xsmaddadp vs32, vs4, vs16 xsmaddadp 
vs33, vs5, vs16 xsmaddadp vs34, vs4, vs17 xsmaddadp vs35, vs5, vs17 xsmaddadp vs36, vs4, vs18 xsmaddadp vs37, vs5, vs18 xsmaddadp vs38, vs4, vs19 xsmaddadp vs39, vs5, vs19 xsmaddadp vs40, vs4, vs20 xsmaddadp vs41, vs5, vs20 xsmaddadp vs42, vs4, vs21 xsmaddadp vs43, vs5, vs21 xsmaddadp vs44, vs4, vs22 xsmaddadp vs45, vs5, vs22 xsmaddadp vs46, vs4, vs23 xsmaddadp vs47, vs5, vs23 .endm .macro KERNEL8x2_E2 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs5, vs16 xsmaddadp vs34, vs4, vs17 xsmaddadp vs35, vs5, vs17 xsmaddadp vs36, vs4, vs18 xsmaddadp vs37, vs5, vs18 xsmaddadp vs38, vs4, vs19 xsmaddadp vs39, vs5, vs19 xsmaddadp vs40, vs4, vs20 xsmaddadp vs41, vs5, vs20 xsmaddadp vs42, vs4, vs21 xsmaddadp vs43, vs5, vs21 xsmaddadp vs44, vs4, vs22 xsmaddadp vs45, vs5, vs22 xsmaddadp vs46, vs4, vs23 xsmaddadp vs47, vs5, vs23 .endm .macro KERNEL8x2_SUBI1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 lxsspx vs10, o8, T1 lxsspx vs11, o12, T1 addi T1, T1, 16 lxsspx vs12, o0, T1 lxsspx vs13, o4, T1 lxsspx vs14, o8, T1 lxsspx vs15, o12, T1 addi BO, BO, 32 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs1, vs8 xsmuldp vs34, vs0, vs9 xsmuldp vs35, vs1, vs9 xsmuldp vs36, vs0, vs10 xsmuldp vs37, vs1, vs10 xsmuldp vs38, vs0, vs11 xsmuldp vs39, vs1, vs11 xsmuldp vs40, vs0, vs12 xsmuldp vs41, vs1, vs12 xsmuldp vs42, vs0, vs13 xsmuldp vs43, vs1, vs13 xsmuldp vs44, vs0, vs14 xsmuldp vs45, vs1, vs14 xsmuldp vs46, vs0, vs15 xsmuldp vs47, vs1, vs15 .endm .macro KERNEL8x2_SUB1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 lxsspx vs10, o8, T1 lxsspx vs11, o12, T1 addi T1, T1, 16 lxsspx vs12, o0, T1 lxsspx vs13, o4, T1 lxsspx vs14, o8, T1 lxsspx vs15, o12, T1 addi BO, BO, 32 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs1, vs8 xsmaddadp vs34, vs0, vs9 xsmaddadp vs35, vs1, vs9 xsmaddadp vs36, vs0, vs10 xsmaddadp vs37, vs1, vs10 xsmaddadp vs38, vs0, vs11 xsmaddadp vs39, vs1, vs11 xsmaddadp vs40, vs0, vs12 xsmaddadp vs41, vs1, vs12 xsmaddadp vs42, vs0, vs13 xsmaddadp vs43, vs1, vs13 xsmaddadp vs44, vs0, vs14 xsmaddadp vs45, vs1, vs14 xsmaddadp vs46, vs0, vs15 xsmaddadp vs47, vs1, vs15 .endm .macro SAVE8x2 mr T1, CO #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs32, alpha_r xsmuldp vs1, vs33, alpha_r #else xsmaddadp vs0, vs32, alpha_r xsmaddadp vs1, vs33, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs34, alpha_r xsmuldp vs1, vs35, alpha_r #else xsmaddadp vs0, vs34, alpha_r xsmaddadp vs1, vs35, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs36, alpha_r xsmuldp vs1, vs37, alpha_r #else xsmaddadp vs0, vs36, alpha_r xsmaddadp vs1, vs37, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs38, alpha_r xsmuldp vs1, vs39, alpha_r #else xsmaddadp vs0, vs38, alpha_r xsmaddadp vs1, vs39, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs40, alpha_r xsmuldp vs1, vs41, alpha_r #else xsmaddadp vs0, vs40, alpha_r xsmaddadp vs1, vs41, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx 
vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs42, alpha_r xsmuldp vs1, vs43, alpha_r #else xsmaddadp vs0, vs42, alpha_r xsmaddadp vs1, vs43, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs44, alpha_r xsmuldp vs1, vs45, alpha_r #else xsmaddadp vs0, vs44, alpha_r xsmaddadp vs1, vs45, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs46, alpha_r xsmuldp vs1, vs47, alpha_r #else xsmaddadp vs0, vs46, alpha_r xsmaddadp vs1, vs47, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC addi CO, CO, 8 .endm /********************************************************************************************** * Macros for N=8 and M=1 **********************************************************************************************/ .macro LOAD8x1_1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 lxsspx vs10, o8, T1 lxsspx vs11, o12, T1 addi T1, T1, 16 lxsspx vs12, o0, T1 lxsspx vs13, o4, T1 lxsspx vs14, o8, T1 lxsspx vs15, o12, T1 addi BO, BO, 32 .endm .macro KERNEL8x1_I1 lxsspx vs4, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o4, T1 lxsspx vs18, o8, T1 lxsspx vs19, o12, T1 addi T1, T1, 16 lxsspx vs20, o0, T1 lxsspx vs21, o4, T1 lxsspx vs22, o8, T1 lxsspx vs23, o12, T1 addi BO, BO, 32 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs0, vs9 xsmuldp vs34, vs0, vs10 xsmuldp vs35, vs0, vs11 xsmuldp vs36, vs0, vs12 xsmuldp vs37, vs0, vs13 xsmuldp vs38, vs0, vs14 xsmuldp vs39, vs0, vs15 .endm .macro KERNEL8x1_1 lxsspx vs4, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o4, T1 lxsspx vs18, o8, T1 lxsspx vs19, o12, T1 addi T1, T1, 16 lxsspx vs20, o0, T1 lxsspx vs21, o4, T1 lxsspx vs22, o8, T1 lxsspx vs23, o12, T1 addi BO, BO, 32 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs0, vs9 xsmaddadp vs34, vs0, vs10 xsmaddadp vs35, vs0, vs11 xsmaddadp vs36, vs0, vs12 xsmaddadp vs37, vs0, vs13 xsmaddadp vs38, vs0, vs14 xsmaddadp vs39, vs0, vs15 .endm .macro KERNEL8x1_2 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 lxsspx vs10, o8, T1 lxsspx vs11, o12, T1 addi T1, T1, 16 lxsspx vs12, o0, T1 lxsspx vs13, o4, T1 lxsspx vs14, o8, T1 lxsspx vs15, o12, T1 addi BO, BO, 32 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs4, vs17 xsmaddadp vs34, vs4, vs18 xsmaddadp vs35, vs4, vs19 xsmaddadp vs36, vs4, vs20 xsmaddadp vs37, vs4, vs21 xsmaddadp vs38, vs4, vs22 xsmaddadp vs39, vs4, vs23 .endm .macro KERNEL8x1_E2 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs4, vs17 xsmaddadp vs34, vs4, vs18 xsmaddadp vs35, vs4, vs19 xsmaddadp vs36, vs4, vs20 xsmaddadp vs37, vs4, vs21 xsmaddadp vs38, vs4, vs22 xsmaddadp vs39, vs4, vs23 .endm .macro KERNEL8x1_SUBI1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 lxsspx vs10, o8, T1 lxsspx vs11, o12, T1 addi T1, T1, 16 lxsspx vs12, o0, T1 lxsspx vs13, o4, T1 lxsspx vs14, o8, T1 lxsspx vs15, o12, T1 addi BO, BO, 32 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs0, vs9 xsmuldp vs34, vs0, vs10 xsmuldp vs35, vs0, vs11 xsmuldp vs36, vs0, vs12 xsmuldp vs37, vs0, vs13 xsmuldp vs38, vs0, vs14 xsmuldp vs39, vs0, vs15 .endm .macro KERNEL8x1_SUB1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 lxsspx vs10, o8, T1 lxsspx vs11, o12, T1 addi T1, T1, 16 lxsspx vs12, o0, T1 lxsspx vs13, o4, T1 lxsspx vs14, 
o8, T1 lxsspx vs15, o12, T1 addi BO, BO, 32 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs0, vs9 xsmaddadp vs34, vs0, vs10 xsmaddadp vs35, vs0, vs11 xsmaddadp vs36, vs0, vs12 xsmaddadp vs37, vs0, vs13 xsmaddadp vs38, vs0, vs14 xsmaddadp vs39, vs0, vs15 .endm .macro SAVE8x1 mr T1, CO #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs32, alpha_r #else xsmaddadp vs0, vs32, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs33, alpha_r #else xsmaddadp vs0, vs33, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs34, alpha_r #else xsmaddadp vs0, vs34, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs35, alpha_r #else xsmaddadp vs0, vs35, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs36, alpha_r #else xsmaddadp vs0, vs36, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs37, alpha_r #else xsmaddadp vs0, vs37, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs38, alpha_r #else xsmaddadp vs0, vs38, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs39, alpha_r #else xsmaddadp vs0, vs39, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC addi CO, CO, 4 .endm /********************************************************************************************** * Macros for N=4 and M=16 **********************************************************************************************/ .macro LOAD4x16_1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 addi BO, BO, 16 .endm .macro KERNEL4x16_I1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO lxvw4x vs6, o32, AO lxvw4x vs7, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 xxspltw vs17, vs28, 1 xxspltw vs18, vs28, 2 xxspltw vs19, vs28, 3 addi BO, BO, 16 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs2, vs8 xvmulsp vs35, vs3, vs8 xvmulsp vs36, vs0, vs9 xvmulsp vs37, vs1, vs9 xvmulsp vs38, vs2, vs9 xvmulsp vs39, vs3, vs9 xvmulsp vs40, vs0, vs10 xvmulsp vs41, vs1, vs10 xvmulsp vs42, vs2, vs10 xvmulsp vs43, vs3, vs10 xvmulsp vs44, vs0, vs11 xvmulsp vs45, vs1, vs11 xvmulsp vs46, vs2, vs11 xvmulsp vs47, vs3, vs11 .endm .macro KERNEL4x16_1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO lxvw4x vs6, o32, AO lxvw4x vs7, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 xxspltw vs17, vs28, 1 xxspltw vs18, vs28, 2 xxspltw vs19, vs28, 3 addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs2, vs8 xvmaddasp vs35, vs3, vs8 xvmaddasp vs36, vs0, vs9 xvmaddasp vs37, vs1, vs9 xvmaddasp vs38, vs2, vs9 xvmaddasp vs39, vs3, vs9 xvmaddasp vs40, vs0, vs10 xvmaddasp vs41, vs1, vs10 xvmaddasp vs42, vs2, vs10 xvmaddasp vs43, vs3, vs10 xvmaddasp vs44, vs0, vs11 xvmaddasp vs45, vs1, vs11 xvmaddasp vs46, vs2, vs11 xvmaddasp vs47, vs3, vs11 .endm .macro KERNEL4x16_2 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 
1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 addi BO, BO, 16 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs6, vs16 xvmaddasp vs35, vs7, vs16 xvmaddasp vs36, vs4, vs17 xvmaddasp vs37, vs5, vs17 xvmaddasp vs38, vs6, vs17 xvmaddasp vs39, vs7, vs17 xvmaddasp vs40, vs4, vs18 xvmaddasp vs41, vs5, vs18 xvmaddasp vs42, vs6, vs18 xvmaddasp vs43, vs7, vs18 xvmaddasp vs44, vs4, vs19 xvmaddasp vs45, vs5, vs19 xvmaddasp vs46, vs6, vs19 xvmaddasp vs47, vs7, vs19 .endm .macro KERNEL4x16_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs6, vs16 xvmaddasp vs35, vs7, vs16 xvmaddasp vs36, vs4, vs17 xvmaddasp vs37, vs5, vs17 xvmaddasp vs38, vs6, vs17 xvmaddasp vs39, vs7, vs17 xvmaddasp vs40, vs4, vs18 xvmaddasp vs41, vs5, vs18 xvmaddasp vs42, vs6, vs18 xvmaddasp vs43, vs7, vs18 xvmaddasp vs44, vs4, vs19 xvmaddasp vs45, vs5, vs19 xvmaddasp vs46, vs6, vs19 xvmaddasp vs47, vs7, vs19 .endm .macro KERNEL4x16_SUBI1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 addi BO, BO, 16 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs2, vs8 xvmulsp vs35, vs3, vs8 xvmulsp vs36, vs0, vs9 xvmulsp vs37, vs1, vs9 xvmulsp vs38, vs2, vs9 xvmulsp vs39, vs3, vs9 xvmulsp vs40, vs0, vs10 xvmulsp vs41, vs1, vs10 xvmulsp vs42, vs2, vs10 xvmulsp vs43, vs3, vs10 xvmulsp vs44, vs0, vs11 xvmulsp vs45, vs1, vs11 xvmulsp vs46, vs2, vs11 xvmulsp vs47, vs3, vs11 .endm .macro KERNEL4x16_SUB1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs2, vs8 xvmaddasp vs35, vs3, vs8 xvmaddasp vs36, vs0, vs9 xvmaddasp vs37, vs1, vs9 xvmaddasp vs38, vs2, vs9 xvmaddasp vs39, vs3, vs9 xvmaddasp vs40, vs0, vs10 xvmaddasp vs41, vs1, vs10 xvmaddasp vs42, vs2, vs10 xvmaddasp vs43, vs3, vs10 xvmaddasp vs44, vs0, vs11 xvmaddasp vs45, vs1, vs11 xvmaddasp vs46, vs2, vs11 xvmaddasp vs47, vs3, vs11 .endm .macro SAVE4x16 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr xvmulsp vs1, vs33, alpha_vr xvmulsp vs2, vs34, alpha_vr xvmulsp vs3, vs35, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr xvmaddasp vs1, vs33, alpha_vr xvmaddasp vs2, vs34, alpha_vr xvmaddasp vs3, vs35, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs36, alpha_vr xvmulsp vs1, vs37, alpha_vr xvmulsp vs2, vs38, alpha_vr xvmulsp vs3, vs39, alpha_vr #else xvmaddasp vs0, vs36, alpha_vr xvmaddasp vs1, vs37, alpha_vr xvmaddasp vs2, vs38, alpha_vr xvmaddasp vs3, vs39, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs40, alpha_vr xvmulsp vs1, vs41, alpha_vr xvmulsp vs2, vs42, alpha_vr xvmulsp vs3, vs43, alpha_vr #else xvmaddasp vs0, vs40, alpha_vr xvmaddasp vs1, vs41, alpha_vr xvmaddasp vs2, vs42, alpha_vr xvmaddasp vs3, vs43, alpha_vr #endif stxvw4x vs0, o0, T1 
stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs44, alpha_vr xvmulsp vs1, vs45, alpha_vr xvmulsp vs2, vs46, alpha_vr xvmulsp vs3, vs47, alpha_vr #else xvmaddasp vs0, vs44, alpha_vr xvmaddasp vs1, vs45, alpha_vr xvmaddasp vs2, vs46, alpha_vr xvmaddasp vs3, vs47, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC addi CO, CO, 64 .endm /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ .macro LOAD4x8_1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 addi BO, BO, 16 .endm .macro KERNEL4x8_I1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 xxspltw vs17, vs28, 1 xxspltw vs18, vs28, 2 xxspltw vs19, vs28, 3 addi BO, BO, 16 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs0, vs9 xvmulsp vs35, vs1, vs9 xvmulsp vs36, vs0, vs10 xvmulsp vs37, vs1, vs10 xvmulsp vs38, vs0, vs11 xvmulsp vs39, vs1, vs11 .endm .macro KERNEL4x8_1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 xxspltw vs17, vs28, 1 xxspltw vs18, vs28, 2 xxspltw vs19, vs28, 3 addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs0, vs9 xvmaddasp vs35, vs1, vs9 xvmaddasp vs36, vs0, vs10 xvmaddasp vs37, vs1, vs10 xvmaddasp vs38, vs0, vs11 xvmaddasp vs39, vs1, vs11 .endm .macro KERNEL4x8_2 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 addi BO, BO, 16 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs4, vs17 xvmaddasp vs35, vs5, vs17 xvmaddasp vs36, vs4, vs18 xvmaddasp vs37, vs5, vs18 xvmaddasp vs38, vs4, vs19 xvmaddasp vs39, vs5, vs19 .endm .macro KERNEL4x8_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs4, vs17 xvmaddasp vs35, vs5, vs17 xvmaddasp vs36, vs4, vs18 xvmaddasp vs37, vs5, vs18 xvmaddasp vs38, vs4, vs19 xvmaddasp vs39, vs5, vs19 .endm .macro KERNEL4x8_SUBI1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 addi BO, BO, 16 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs0, vs9 xvmulsp vs35, vs1, vs9 xvmulsp vs36, vs0, vs10 xvmulsp vs37, vs1, vs10 xvmulsp vs38, vs0, vs11 xvmulsp vs39, vs1, vs11 .endm .macro KERNEL4x8_SUB1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs0, vs9 xvmaddasp vs35, vs1, vs9 xvmaddasp vs36, vs0, vs10 xvmaddasp vs37, vs1, vs10 xvmaddasp vs38, vs0, vs11 xvmaddasp vs39, vs1, vs11 .endm .macro SAVE4x8 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr xvmulsp vs1, vs33, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr xvmaddasp vs1, vs33, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x 
vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs34, alpha_vr xvmulsp vs1, vs35, alpha_vr #else xvmaddasp vs0, vs34, alpha_vr xvmaddasp vs1, vs35, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs36, alpha_vr xvmulsp vs1, vs37, alpha_vr #else xvmaddasp vs0, vs36, alpha_vr xvmaddasp vs1, vs37, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs38, alpha_vr xvmulsp vs1, vs39, alpha_vr #else xvmaddasp vs0, vs38, alpha_vr xvmaddasp vs1, vs39, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC addi CO, CO, 32 .endm /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ .macro LOAD4x4_1 lxvw4x vs0, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 addi BO, BO, 16 .endm .macro KERNEL4x4_I1 lxvw4x vs4, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 xxspltw vs17, vs28, 1 xxspltw vs18, vs28, 2 xxspltw vs19, vs28, 3 addi BO, BO, 16 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs0, vs9 xvmulsp vs34, vs0, vs10 xvmulsp vs35, vs0, vs11 .endm .macro KERNEL4x4_1 lxvw4x vs4, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 xxspltw vs17, vs28, 1 xxspltw vs18, vs28, 2 xxspltw vs19, vs28, 3 addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs0, vs9 xvmaddasp vs34, vs0, vs10 xvmaddasp vs35, vs0, vs11 .endm .macro KERNEL4x4_2 lxvw4x vs0, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 addi BO, BO, 16 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs4, vs17 xvmaddasp vs34, vs4, vs18 xvmaddasp vs35, vs4, vs19 .endm .macro KERNEL4x4_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs4, vs17 xvmaddasp vs34, vs4, vs18 xvmaddasp vs35, vs4, vs19 .endm .macro KERNEL4x4_SUBI1 lxvw4x vs0, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 addi BO, BO, 16 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs0, vs9 xvmulsp vs34, vs0, vs10 xvmulsp vs35, vs0, vs11 .endm .macro KERNEL4x4_SUB1 lxvw4x vs0, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs0, vs9 xvmaddasp vs34, vs0, vs10 xvmaddasp vs35, vs0, vs11 .endm .macro SAVE4x4 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs33, alpha_vr #else xvmaddasp vs0, vs33, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs34, alpha_vr #else xvmaddasp vs0, vs34, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs35, alpha_vr #else xvmaddasp vs0, vs35, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC addi CO, CO, 16 .endm 
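/**********************************************************************************************
* Note on the macro scheme used in this file (a summary of the surrounding code; the pseudo-C
* below is illustrative only and is not assembled):
*
* For each tile size NxM, LOAD<N>x<M>_1 preloads the first A vectors and splats the B values,
* KERNEL<N>x<M>_I1 starts the accumulators with xvmulsp/xsmuldp, KERNEL<N>x<M>_1 and
* KERNEL<N>x<M>_2 are the two halves of a software-pipelined inner loop that alternate between
* the vs0../vs8.. and vs4../vs16.. register sets, KERNEL<N>x<M>_E2 drains the final pipelined
* stage without loading further data, KERNEL<N>x<M>_SUBI1 and KERNEL<N>x<M>_SUB1 are the
* unpipelined single-step variants (initialising and accumulating, respectively) used for the
* remaining K iterations, and SAVE<N>x<M> scales the accumulators by alpha and writes the C
* tile, roughly:
*
*     for (j = 0; j < N; j++)
*         for (i = 0; i < M; i++)
*             C[j*ldc + i] = alpha*acc[j][i] + (TRMMKERNEL ? 0.0f : C[j*ldc + i]);
**********************************************************************************************/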
/********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ .macro LOAD4x2_1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 lxsspx vs10, o8, T1 lxsspx vs11, o12, T1 addi BO, BO, 16 .endm .macro KERNEL4x2_I1 lxsspx vs4, o0, AO lxsspx vs5, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o4, T1 lxsspx vs18, o8, T1 lxsspx vs19, o12, T1 addi BO, BO, 16 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs1, vs8 xsmuldp vs34, vs0, vs9 xsmuldp vs35, vs1, vs9 xsmuldp vs36, vs0, vs10 xsmuldp vs37, vs1, vs10 xsmuldp vs38, vs0, vs11 xsmuldp vs39, vs1, vs11 .endm .macro KERNEL4x2_1 lxsspx vs4, o0, AO lxsspx vs5, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o4, T1 lxsspx vs18, o8, T1 lxsspx vs19, o12, T1 addi BO, BO, 16 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs1, vs8 xsmaddadp vs34, vs0, vs9 xsmaddadp vs35, vs1, vs9 xsmaddadp vs36, vs0, vs10 xsmaddadp vs37, vs1, vs10 xsmaddadp vs38, vs0, vs11 xsmaddadp vs39, vs1, vs11 .endm .macro KERNEL4x2_2 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 lxsspx vs10, o8, T1 lxsspx vs11, o12, T1 addi BO, BO, 16 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs5, vs16 xsmaddadp vs34, vs4, vs17 xsmaddadp vs35, vs5, vs17 xsmaddadp vs36, vs4, vs18 xsmaddadp vs37, vs5, vs18 xsmaddadp vs38, vs4, vs19 xsmaddadp vs39, vs5, vs19 .endm .macro KERNEL4x2_E2 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs5, vs16 xsmaddadp vs34, vs4, vs17 xsmaddadp vs35, vs5, vs17 xsmaddadp vs36, vs4, vs18 xsmaddadp vs37, vs5, vs18 xsmaddadp vs38, vs4, vs19 xsmaddadp vs39, vs5, vs19 .endm .macro KERNEL4x2_SUBI1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 lxsspx vs10, o8, T1 lxsspx vs11, o12, T1 addi BO, BO, 16 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs1, vs8 xsmuldp vs34, vs0, vs9 xsmuldp vs35, vs1, vs9 xsmuldp vs36, vs0, vs10 xsmuldp vs37, vs1, vs10 xsmuldp vs38, vs0, vs11 xsmuldp vs39, vs1, vs11 .endm .macro KERNEL4x2_SUB1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 lxsspx vs10, o8, T1 lxsspx vs11, o12, T1 addi BO, BO, 16 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs1, vs8 xsmaddadp vs34, vs0, vs9 xsmaddadp vs35, vs1, vs9 xsmaddadp vs36, vs0, vs10 xsmaddadp vs37, vs1, vs10 xsmaddadp vs38, vs0, vs11 xsmaddadp vs39, vs1, vs11 .endm .macro SAVE4x2 mr T1, CO #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs32, alpha_r xsmuldp vs1, vs33, alpha_r #else xsmaddadp vs0, vs32, alpha_r xsmaddadp vs1, vs33, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs34, alpha_r xsmuldp vs1, vs35, alpha_r #else xsmaddadp vs0, vs34, alpha_r xsmaddadp vs1, vs35, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs36, alpha_r xsmuldp vs1, vs37, alpha_r #else xsmaddadp vs0, vs36, alpha_r xsmaddadp vs1, vs37, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs38, alpha_r xsmuldp vs1, vs39, alpha_r #else xsmaddadp vs0, vs38, alpha_r xsmaddadp vs1, 
vs39, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC addi CO, CO, 8 .endm /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ .macro LOAD4x1_1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 lxsspx vs10, o8, T1 lxsspx vs11, o12, T1 addi BO, BO, 16 .endm .macro KERNEL4x1_I1 lxsspx vs4, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o4, T1 lxsspx vs18, o8, T1 lxsspx vs19, o12, T1 addi BO, BO, 16 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs0, vs9 xsmuldp vs34, vs0, vs10 xsmuldp vs35, vs0, vs11 .endm .macro KERNEL4x1_1 lxsspx vs4, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o4, T1 lxsspx vs18, o8, T1 lxsspx vs19, o12, T1 addi BO, BO, 16 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs0, vs9 xsmaddadp vs34, vs0, vs10 xsmaddadp vs35, vs0, vs11 .endm .macro KERNEL4x1_2 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 lxsspx vs10, o8, T1 lxsspx vs11, o12, T1 addi BO, BO, 16 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs4, vs17 xsmaddadp vs34, vs4, vs18 xsmaddadp vs35, vs4, vs19 .endm .macro KERNEL4x1_E2 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs4, vs17 xsmaddadp vs34, vs4, vs18 xsmaddadp vs35, vs4, vs19 .endm .macro KERNEL4x1_SUBI1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 lxsspx vs10, o8, T1 lxsspx vs11, o12, T1 addi BO, BO, 16 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs0, vs9 xsmuldp vs34, vs0, vs10 xsmuldp vs35, vs0, vs11 .endm .macro KERNEL4x1_SUB1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 lxsspx vs10, o8, T1 lxsspx vs11, o12, T1 addi BO, BO, 16 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs0, vs9 xsmaddadp vs34, vs0, vs10 xsmaddadp vs35, vs0, vs11 .endm .macro SAVE4x1 mr T1, CO #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs32, alpha_r #else xsmaddadp vs0, vs32, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs33, alpha_r #else xsmaddadp vs0, vs33, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs34, alpha_r #else xsmaddadp vs0, vs34, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs35, alpha_r #else xsmaddadp vs0, vs35, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC addi CO, CO, 4 .endm /********************************************************************************************** * Macros for N=2 and M=16 **********************************************************************************************/ .macro LOAD2x16_1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 addi BO, BO, 8 .endm .macro KERNEL2x16_I1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO lxvw4x vs6, o32, AO lxvw4x vs7, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 xxspltw vs17, vs28, 1 addi BO, BO, 8 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs2, vs8 xvmulsp vs35, vs3, vs8 xvmulsp vs36, vs0, vs9 xvmulsp vs37, vs1, vs9 xvmulsp vs38, vs2, vs9 xvmulsp vs39, vs3, vs9 .endm .macro KERNEL2x16_1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO lxvw4x vs6, o32, AO lxvw4x vs7, o48, AO 
addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 xxspltw vs17, vs28, 1 addi BO, BO, 8 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs2, vs8 xvmaddasp vs35, vs3, vs8 xvmaddasp vs36, vs0, vs9 xvmaddasp vs37, vs1, vs9 xvmaddasp vs38, vs2, vs9 xvmaddasp vs39, vs3, vs9 .endm .macro KERNEL2x16_2 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 addi BO, BO, 8 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs6, vs16 xvmaddasp vs35, vs7, vs16 xvmaddasp vs36, vs4, vs17 xvmaddasp vs37, vs5, vs17 xvmaddasp vs38, vs6, vs17 xvmaddasp vs39, vs7, vs17 .endm .macro KERNEL2x16_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs6, vs16 xvmaddasp vs35, vs7, vs16 xvmaddasp vs36, vs4, vs17 xvmaddasp vs37, vs5, vs17 xvmaddasp vs38, vs6, vs17 xvmaddasp vs39, vs7, vs17 .endm .macro KERNEL2x16_SUBI1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 addi BO, BO, 8 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs2, vs8 xvmulsp vs35, vs3, vs8 xvmulsp vs36, vs0, vs9 xvmulsp vs37, vs1, vs9 xvmulsp vs38, vs2, vs9 xvmulsp vs39, vs3, vs9 .endm .macro KERNEL2x16_SUB1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 addi BO, BO, 8 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs2, vs8 xvmaddasp vs35, vs3, vs8 xvmaddasp vs36, vs0, vs9 xvmaddasp vs37, vs1, vs9 xvmaddasp vs38, vs2, vs9 xvmaddasp vs39, vs3, vs9 .endm .macro SAVE2x16 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr xvmulsp vs1, vs33, alpha_vr xvmulsp vs2, vs34, alpha_vr xvmulsp vs3, vs35, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr xvmaddasp vs1, vs33, alpha_vr xvmaddasp vs2, vs34, alpha_vr xvmaddasp vs3, vs35, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs36, alpha_vr xvmulsp vs1, vs37, alpha_vr xvmulsp vs2, vs38, alpha_vr xvmulsp vs3, vs39, alpha_vr #else xvmaddasp vs0, vs36, alpha_vr xvmaddasp vs1, vs37, alpha_vr xvmaddasp vs2, vs38, alpha_vr xvmaddasp vs3, vs39, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC addi CO, CO, 64 .endm /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ .macro LOAD2x8_1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 addi BO, BO, 8 .endm .macro KERNEL2x8_I1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 xxspltw vs17, vs28, 1 addi BO, BO, 8 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs0, vs9 xvmulsp vs35, vs1, vs9 .endm .macro KERNEL2x8_1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 xxspltw vs17, vs28, 1 addi BO, BO, 8 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, 
vs0, vs9 xvmaddasp vs35, vs1, vs9 .endm .macro KERNEL2x8_2 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 addi BO, BO, 8 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs4, vs17 xvmaddasp vs35, vs5, vs17 .endm .macro KERNEL2x8_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs4, vs17 xvmaddasp vs35, vs5, vs17 .endm .macro KERNEL2x8_SUBI1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 addi BO, BO, 8 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs0, vs9 xvmulsp vs35, vs1, vs9 .endm .macro KERNEL2x8_SUB1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 addi BO, BO, 8 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs0, vs9 xvmaddasp vs35, vs1, vs9 .endm .macro SAVE2x8 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr xvmulsp vs1, vs33, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr xvmaddasp vs1, vs33, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs34, alpha_vr xvmulsp vs1, vs35, alpha_vr #else xvmaddasp vs0, vs34, alpha_vr xvmaddasp vs1, vs35, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC addi CO, CO, 32 .endm /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ .macro LOAD2x4_1 lxvw4x vs0, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 addi BO, BO, 8 .endm .macro KERNEL2x4_I1 lxvw4x vs4, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 xxspltw vs17, vs28, 1 addi BO, BO, 8 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs0, vs9 .endm .macro KERNEL2x4_1 lxvw4x vs4, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 xxspltw vs17, vs28, 1 addi BO, BO, 8 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs0, vs9 .endm .macro KERNEL2x4_2 lxvw4x vs0, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 addi BO, BO, 8 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs4, vs17 .endm .macro KERNEL2x4_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs4, vs17 .endm .macro KERNEL2x4_SUBI1 lxvw4x vs0, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 addi BO, BO, 8 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs0, vs9 .endm .macro KERNEL2x4_SUB1 lxvw4x vs0, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 addi BO, BO, 8 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs0, vs9 .endm .macro SAVE2x4 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs33, alpha_vr #else xvmaddasp vs0, vs33, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC addi CO, CO, 16 .endm /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ .macro LOAD2x2_1 lxsspx vs0, o0, AO lxsspx vs1, 
o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 addi BO, BO, 8 .endm .macro KERNEL2x2_I1 lxsspx vs4, o0, AO lxsspx vs5, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o4, T1 addi BO, BO, 8 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs1, vs8 xsmuldp vs34, vs0, vs9 xsmuldp vs35, vs1, vs9 .endm .macro KERNEL2x2_1 lxsspx vs4, o0, AO lxsspx vs5, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o4, T1 addi BO, BO, 8 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs1, vs8 xsmaddadp vs34, vs0, vs9 xsmaddadp vs35, vs1, vs9 .endm .macro KERNEL2x2_2 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 addi BO, BO, 8 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs5, vs16 xsmaddadp vs34, vs4, vs17 xsmaddadp vs35, vs5, vs17 .endm .macro KERNEL2x2_E2 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs5, vs16 xsmaddadp vs34, vs4, vs17 xsmaddadp vs35, vs5, vs17 .endm .macro KERNEL2x2_SUBI1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 addi BO, BO, 8 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs1, vs8 xsmuldp vs34, vs0, vs9 xsmuldp vs35, vs1, vs9 .endm .macro KERNEL2x2_SUB1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 addi BO, BO, 8 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs1, vs8 xsmaddadp vs34, vs0, vs9 xsmaddadp vs35, vs1, vs9 .endm .macro SAVE2x2 mr T1, CO #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs32, alpha_r xsmuldp vs1, vs33, alpha_r #else xsmaddadp vs0, vs32, alpha_r xsmaddadp vs1, vs33, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs34, alpha_r xsmuldp vs1, vs35, alpha_r #else xsmaddadp vs0, vs34, alpha_r xsmaddadp vs1, vs35, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC addi CO, CO, 8 .endm /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ .macro LOAD2x1_1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 addi BO, BO, 8 .endm .macro KERNEL2x1_I1 lxsspx vs4, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o4, T1 addi BO, BO, 8 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs0, vs9 .endm .macro KERNEL2x1_1 lxsspx vs4, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs16, o0, T1 lxsspx vs17, o4, T1 addi BO, BO, 8 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs0, vs9 .endm .macro KERNEL2x1_2 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 addi BO, BO, 8 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs4, vs17 .endm .macro KERNEL2x1_E2 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs4, vs17 .endm .macro KERNEL2x1_SUBI1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 addi BO, BO, 8 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs0, vs9 .endm .macro KERNEL2x1_SUB1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 lxsspx vs9, o4, T1 addi BO, BO, 8 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs0, vs9 .endm .macro SAVE2x1 mr T1, CO #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs32, alpha_r #else xsmaddadp vs0, vs32, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs33, 
alpha_r #else xsmaddadp vs0, vs33, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC addi CO, CO, 4 .endm /********************************************************************************************** * Macros for N=1 and M=16 **********************************************************************************************/ .macro LOAD1x16_1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 addi BO, BO, 4 .endm .macro KERNEL1x16_I1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO lxvw4x vs6, o32, AO lxvw4x vs7, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 addi BO, BO, 4 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs2, vs8 xvmulsp vs35, vs3, vs8 .endm .macro KERNEL1x16_1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO lxvw4x vs6, o32, AO lxvw4x vs7, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 addi BO, BO, 4 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs2, vs8 xvmaddasp vs35, vs3, vs8 .endm .macro KERNEL1x16_2 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 addi BO, BO, 4 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs6, vs16 xvmaddasp vs35, vs7, vs16 .endm .macro KERNEL1x16_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs6, vs16 xvmaddasp vs35, vs7, vs16 .endm .macro KERNEL1x16_SUBI1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 addi BO, BO, 4 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 xvmulsp vs34, vs2, vs8 xvmulsp vs35, vs3, vs8 .endm .macro KERNEL1x16_SUB1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO addi AO, AO, 64 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 addi BO, BO, 4 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 xvmaddasp vs34, vs2, vs8 xvmaddasp vs35, vs3, vs8 .endm .macro SAVE1x16 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 lxvw4x vs2, o32, T1 lxvw4x vs3, o48, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr xvmulsp vs1, vs33, alpha_vr xvmulsp vs2, vs34, alpha_vr xvmulsp vs3, vs35, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr xvmaddasp vs1, vs33, alpha_vr xvmaddasp vs2, vs34, alpha_vr xvmaddasp vs3, vs35, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 add T1, T1, LDC addi CO, CO, 64 .endm /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ .macro LOAD1x8_1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 addi BO, BO, 4 .endm .macro KERNEL1x8_I1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 addi BO, BO, 4 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 .endm .macro KERNEL1x8_1 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 addi BO, BO, 4 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 .endm .macro KERNEL1x8_2 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 addi BO, BO, 4 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 .endm .macro KERNEL1x8_E2 xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 .endm .macro KERNEL1x8_SUBI1 lxvw4x vs0, 
o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 addi BO, BO, 4 xvmulsp vs32, vs0, vs8 xvmulsp vs33, vs1, vs8 .endm .macro KERNEL1x8_SUB1 lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO addi AO, AO, 32 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 addi BO, BO, 4 xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 .endm .macro SAVE1x8 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 lxvw4x vs1, o16, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr xvmulsp vs1, vs33, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr xvmaddasp vs1, vs33, alpha_vr #endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 add T1, T1, LDC addi CO, CO, 32 .endm /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ .macro LOAD1x4_1 lxvw4x vs0, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 addi BO, BO, 4 .endm .macro KERNEL1x4_I1 lxvw4x vs4, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 addi BO, BO, 4 xvmulsp vs32, vs0, vs8 .endm .macro KERNEL1x4_1 lxvw4x vs4, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs16, vs28, 0 addi BO, BO, 4 xvmaddasp vs32, vs0, vs8 .endm .macro KERNEL1x4_2 lxvw4x vs0, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 addi BO, BO, 4 xvmaddasp vs32, vs4, vs16 .endm .macro KERNEL1x4_E2 xvmaddasp vs32, vs4, vs16 .endm .macro KERNEL1x4_SUBI1 lxvw4x vs0, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 addi BO, BO, 4 xvmulsp vs32, vs0, vs8 .endm .macro KERNEL1x4_SUB1 lxvw4x vs0, o0, AO addi AO, AO, 16 lxvw4x vs28, o0, BO xxspltw vs8, vs28, 0 addi BO, BO, 4 xvmaddasp vs32, vs0, vs8 .endm .macro SAVE1x4 mr T1, CO #ifndef TRMMKERNEL lxvw4x vs0, o0, T1 #endif #ifdef TRMMKERNEL xvmulsp vs0, vs32, alpha_vr #else xvmaddasp vs0, vs32, alpha_vr #endif stxvw4x vs0, o0, T1 add T1, T1, LDC addi CO, CO, 16 .endm /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ .macro LOAD1x2_1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 addi BO, BO, 4 .endm .macro KERNEL1x2_I1 lxsspx vs4, o0, AO lxsspx vs5, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 addi BO, BO, 4 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs1, vs8 .endm .macro KERNEL1x2_1 lxsspx vs4, o0, AO lxsspx vs5, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs16, o0, T1 addi BO, BO, 4 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs1, vs8 .endm .macro KERNEL1x2_2 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 addi BO, BO, 4 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs5, vs16 .endm .macro KERNEL1x2_E2 xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs5, vs16 .endm .macro KERNEL1x2_SUBI1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 addi BO, BO, 4 xsmuldp vs32, vs0, vs8 xsmuldp vs33, vs1, vs8 .endm .macro KERNEL1x2_SUB1 lxsspx vs0, o0, AO lxsspx vs1, o4, AO addi AO, AO, 8 mr T1, BO lxsspx vs8, o0, T1 addi BO, BO, 4 xsmaddadp vs32, vs0, vs8 xsmaddadp vs33, vs1, vs8 .endm .macro SAVE1x2 mr T1, CO #ifndef TRMMKERNEL lxsspx vs0, o0, T1 lxsspx vs1, o4, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs32, alpha_r xsmuldp vs1, vs33, alpha_r #else xsmaddadp vs0, vs32, alpha_r xsmaddadp vs1, vs33, alpha_r #endif stxsspx vs0, o0, T1 stxsspx vs1, o4, T1 add T1, T1, LDC addi CO, CO, 8 
.endm /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ .macro LOAD1x1_1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 addi BO, BO, 4 .endm .macro KERNEL1x1_I1 lxsspx vs4, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs16, o0, T1 addi BO, BO, 4 xsmuldp vs32, vs0, vs8 .endm .macro KERNEL1x1_1 lxsspx vs4, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs16, o0, T1 addi BO, BO, 4 xsmaddadp vs32, vs0, vs8 .endm .macro KERNEL1x1_2 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 addi BO, BO, 4 xsmaddadp vs32, vs4, vs16 .endm .macro KERNEL1x1_E2 xsmaddadp vs32, vs4, vs16 .endm .macro KERNEL1x1_SUBI1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 addi BO, BO, 4 xsmuldp vs32, vs0, vs8 .endm .macro KERNEL1x1_SUB1 lxsspx vs0, o0, AO addi AO, AO, 4 mr T1, BO lxsspx vs8, o0, T1 addi BO, BO, 4 xsmaddadp vs32, vs0, vs8 .endm .macro SAVE1x1 mr T1, CO #ifndef TRMMKERNEL lxsspx vs0, o0, T1 #endif #ifdef TRMMKERNEL xsmuldp vs0, vs32, alpha_r #else xsmaddadp vs0, vs32, alpha_r #endif stxsspx vs0, o0, T1 add T1, T1, LDC addi CO, CO, 4 .endm OpenBLAS-0.2.20/kernel/power/swap.S000066400000000000000000000203561313527062700166640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef linux #ifndef __64BIT__ #define N r3 #define X r6 #define INCX r7 #define Y r8 #define INCY r9 #define PREA r4 #define XX r10 #define YY r11 #else #define N r3 #define X r7 #define INCX r8 #define Y r9 #define INCY r10 #define PREA r4 #define XX r5 #define YY r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define N r3 #define X r8 #define INCX r9 #define Y r10 #define INCY r4 #define PREA r5 #define XX r6 #define YY r11 #else #define N r3 #define X r7 #define INCX r8 #define Y r9 #define INCY r10 #define PREA r4 #define XX r5 #define YY r6 #endif #endif #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT #ifdef L1_DUALFETCH li PREA, (L1_PREFETCHSIZE) / 2 #else li PREA, (L1_PREFETCHSIZE) #endif cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, SIZE bne- cr0, LL(100) cmpwi cr0, INCY, SIZE bne- cr0, LL(100) srawi. r0, N, 4 mtspr CTR, r0 beq- cr0, LL(50) .align 4 LL(10): LFD f0, 0 * SIZE(X) LFD f1, 1 * SIZE(X) LFD f2, 2 * SIZE(X) LFD f3, 3 * SIZE(X) LFD f16, 0 * SIZE(Y) LFD f17, 1 * SIZE(Y) LFD f18, 2 * SIZE(Y) LFD f19, 3 * SIZE(Y) LFD f4, 4 * SIZE(X) LFD f5, 5 * SIZE(X) LFD f6, 6 * SIZE(X) LFD f7, 7 * SIZE(X) LFD f20, 4 * SIZE(Y) LFD f21, 5 * SIZE(Y) LFD f22, 6 * SIZE(Y) LFD f23, 7 * SIZE(Y) LFD f8, 8 * SIZE(X) LFD f9, 9 * SIZE(X) LFD f10, 10 * SIZE(X) LFD f11, 11 * SIZE(X) LFD f24, 8 * SIZE(Y) LFD f25, 9 * SIZE(Y) LFD f26, 10 * SIZE(Y) LFD f27, 11 * SIZE(Y) LFD f12, 12 * SIZE(X) LFD f13, 13 * SIZE(X) LFD f14, 14 * SIZE(X) LFD f15, 15 * SIZE(X) LFD f28, 12 * SIZE(Y) LFD f29, 13 * SIZE(Y) LFD f30, 14 * SIZE(Y) LFD f31, 15 * SIZE(Y) STFD f16, 0 * SIZE(X) STFD f17, 1 * SIZE(X) STFD f18, 2 * SIZE(X) STFD f19, 3 * SIZE(X) STFD f0, 0 * SIZE(Y) STFD f1, 1 * SIZE(Y) STFD f2, 2 * SIZE(Y) STFD f3, 3 * SIZE(Y) STFD f20, 4 * SIZE(X) STFD f21, 5 * SIZE(X) STFD f22, 6 * SIZE(X) STFD f23, 7 * SIZE(X) STFD f4, 4 * SIZE(Y) STFD f5, 5 * SIZE(Y) STFD f6, 6 * SIZE(Y) STFD f7, 7 * SIZE(Y) STFD f24, 8 * SIZE(X) STFD f25, 9 * SIZE(X) STFD f26, 10 * SIZE(X) STFD f27, 11 * SIZE(X) STFD f8, 8 * SIZE(Y) STFD f9, 9 * SIZE(Y) STFD f10, 10 * SIZE(Y) STFD f11, 11 * SIZE(Y) STFD f28, 12 * SIZE(X) STFD f29, 13 * SIZE(X) STFD f30, 14 * SIZE(X) STFD f31, 15 * SIZE(X) STFD f12, 12 * SIZE(Y) STFD f13, 13 * SIZE(Y) STFD f14, 14 * SIZE(Y) STFD f15, 15 * SIZE(Y) addi X, X, 16 * SIZE addi Y, Y, 16 * SIZE dcbtst X, PREA #ifdef L1_DUALFETCH dcbtst Y, PREA #endif bdnz LL(10) .align 4 LL(50): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) LFD f9, 0 * SIZE(Y) STFD f9, 0 * SIZE(X) STFD f8, 0 * SIZE(Y) addi X, X, 1 * SIZE addi Y, Y, 1 * SIZE bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCX sub Y, Y, INCY mr XX, X mr YY, Y srawi. 
r0, N, 4 mtspr CTR, r0 beq- LL(150) .align 4 LL(110): LFDUX f0, X, INCX LFDUX f1, X, INCX LFDUX f2, X, INCX LFDUX f3, X, INCX LFDUX f16, Y, INCY LFDUX f17, Y, INCY LFDUX f18, Y, INCY LFDUX f19, Y, INCY LFDUX f4, X, INCX LFDUX f5, X, INCX LFDUX f6, X, INCX LFDUX f7, X, INCX LFDUX f20, Y, INCY LFDUX f21, Y, INCY LFDUX f22, Y, INCY LFDUX f23, Y, INCY LFDUX f8, X, INCX LFDUX f9, X, INCX LFDUX f10, X, INCX LFDUX f11, X, INCX LFDUX f24, Y, INCY LFDUX f25, Y, INCY LFDUX f26, Y, INCY LFDUX f27, Y, INCY LFDUX f12, X, INCX LFDUX f13, X, INCX LFDUX f14, X, INCX LFDUX f15, X, INCX LFDUX f28, Y, INCY LFDUX f29, Y, INCY LFDUX f30, Y, INCY LFDUX f31, Y, INCY STFDUX f16, XX, INCX STFDUX f17, XX, INCX STFDUX f18, XX, INCX STFDUX f19, XX, INCX STFDUX f0, YY, INCY STFDUX f1, YY, INCY STFDUX f2, YY, INCY STFDUX f3, YY, INCY STFDUX f20, XX, INCX STFDUX f21, XX, INCX STFDUX f22, XX, INCX STFDUX f23, XX, INCX STFDUX f4, YY, INCY STFDUX f5, YY, INCY STFDUX f6, YY, INCY STFDUX f7, YY, INCY STFDUX f24, XX, INCX STFDUX f25, XX, INCX STFDUX f26, XX, INCX STFDUX f27, XX, INCX STFDUX f8, YY, INCY STFDUX f9, YY, INCY STFDUX f10, YY, INCY STFDUX f11, YY, INCY STFDUX f28, XX, INCX STFDUX f29, XX, INCX STFDUX f30, XX, INCX STFDUX f31, XX, INCX STFDUX f12, YY, INCY STFDUX f13, YY, INCY STFDUX f14, YY, INCY STFDUX f15, YY, INCY bdnz LL(110) .align 4 LL(150): andi. r0, N, 15 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX LFDUX f9, Y, INCY STFDUX f9, XX, INCX STFDUX f8, YY, INCY bdnz LL(160) .align 4 LL(999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/swap_hummer.S000066400000000000000000000312561313527062700202420ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r6 #define INCX r7 #define Y r8 #define INCY r9 #define INCX2 r4 #define INCY2 r5 #define X2 r10 #define Y2 r11 #define A1 f0 #define A2 f1 #define A3 f2 #define A4 f3 #define A5 f4 #define B1 f5 #define B2 f6 #define B3 f7 #define B4 f8 #define B5 f9 #define T1 f10 #define T2 f11 #define T3 f12 #define T4 f13 #define T5 f14 #define T6 f15 #define T7 f16 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT add INCX2, INCX, INCX add INCY2, INCY, INCY cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, INCX, SIZE bne LL(100) cmpwi cr0, INCY, SIZE bne LL(100) sub X, X, INCX2 sub Y, Y, INCY2 mr X2, X mr Y2, Y andi. r0, X, 2 * SIZE - 1 bne LL(30) andi. r0, Y, 2 * SIZE - 1 bne LL(20) .align 4 LL(10): /* X : aligned Y : aligned */ srawi. r0, N, 3 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 LFPDUX A3, X, INCX2 LFPDUX B3, Y, INCY2 LFPDUX A4, X, INCX2 LFPDUX B4, Y, INCY2 bdz LL(13) .align 4 LL(12): STFPDUX B1, X2, INCY2 LFPDUX B1, Y, INCY2 STFPDUX A1, Y2, INCY2 LFPDUX A1, X, INCX2 STFPDUX B2, X2, INCY2 LFPDUX B2, Y, INCY2 STFPDUX A2, Y2, INCY2 LFPDUX A2, X, INCX2 STFPDUX B3, X2, INCY2 LFPDUX B3, Y, INCY2 STFPDUX A3, Y2, INCY2 LFPDUX A3, X, INCX2 STFPDUX B4, X2, INCY2 LFPDUX B4, Y, INCY2 STFPDUX A4, Y2, INCY2 LFPDUX A4, X, INCX2 bdnz LL(12) .align 4 LL(13): STFPDUX B1, X2, INCY2 STFPDUX A1, Y2, INCY2 STFPDUX B2, X2, INCY2 STFPDUX A2, Y2, INCY2 STFPDUX B3, X2, INCY2 STFPDUX A3, Y2, INCY2 STFPDUX B4, X2, INCY2 STFPDUX A4, Y2, INCY2 .align 4 LL(15): andi. r0, N, 7 beq LL(999) andi. r0, N, 4 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 STFPDUX B1, X2, INCY2 STFPDUX A1, Y2, INCY2 STFPDUX B2, X2, INCY2 STFPDUX A2, Y2, INCY2 .align 4 LL(16): andi. r0, N, 2 beq LL(17) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 STFPDUX B1, X2, INCY2 STFPDUX A1, Y2, INCY2 .align 4 LL(17): andi. r0, N, 1 beq LL(999) LFDUX A1, X, INCX2 LFDUX B1, Y, INCY2 STFDUX B1, X2, INCY2 STFDUX A1, Y2, INCY2 b LL(999) .align 4 LL(20): /* X : aligned Y : unaligned */ LFXDUX A1, X, INCX2 LFDX B1, Y, INCY2 STFSDX A1, Y2, INCY2 add Y, Y, INCY add Y2, Y2, INCY addi N, N, -1 cmpwi cr0, N, 0 ble LL(29) .align 4 srawi. 
r0, N, 3 mtspr CTR, r0 beq- LL(25) LFXDUX T1, X, INCX2 LFXDUX T2, Y, INCY2 LFXDUX T3, X, INCX2 LFXDUX T4, Y, INCY2 LFPDUX A4, X, INCX2 fsmr A1, T1 LFPDUX B4, Y, INCY2 fsmr B1, T2 LFPDUX A5, X, INCX2 fsmr T1, T3 LFPDUX B5, Y, INCY2 fsmr T2, T4 bdz LL(23) .align 4 LL(22): fxmr T5, A4 STFPDUX A1, Y2, INCY2 fxmr T6, B4 STFPDUX B1, X2, INCX2 fxmr A1, A5 STFPDUX T1, Y2, INCY2 fxmr B1, B5 STFPDUX T2, X2, INCX2 fsmr T3, T5 LFPDUX A2, X, INCX2 fsmr T4, T6 LFPDUX B2, Y, INCY2 fsmr T5, A1 LFPDUX A3, X, INCX2 fsmr T6, B1 LFPDUX B3, Y, INCY2 fxmr T1, A2 STFPDUX T3, Y2, INCY2 fxmr T2, B2 STFPDUX T4, X2, INCX2 fxmr T3, A3 STFPDUX T5, Y2, INCY2 fxmr T4, B3 STFPDUX T6, X2, INCX2 fsmr A1, T1 LFPDUX A4, X, INCX2 fsmr B1, T2 LFPDUX B4, Y, INCY2 fsmr T1, T3 LFPDUX A5, X, INCX2 fsmr T2, T4 LFPDUX B5, Y, INCY2 bdnz LL(22) .align 4 LL(23): fxmr T5, A4 STFPDUX A1, Y2, INCY2 fxmr T6, B4 STFPDUX B1, X2, INCX2 fxmr A1, A5 STFPDUX T1, Y2, INCY2 fxmr B1, B5 STFPDUX T2, X2, INCX2 fsmr T3, T5 fsmr T4, T6 fsmr T5, A1 fsmr T6, B1 STFPDUX T3, Y2, INCY2 STFPDUX T4, X2, INCX2 STFPDUX T5, Y2, INCY2 STFPDUX T6, X2, INCX2 .align 4 LL(25): andi. r0, N, 7 beq LL(29) andi. r0, N, 4 beq LL(27) LFXDUX A2, X, INCX2 LFXDUX B2, Y, INCY2 LFXDUX A3, X, INCX2 LFXDUX B3, Y, INCY2 fsmr A1, A2 fsmr B1, B2 fsmr A2, A3 fsmr B2, B3 STFPDUX A1, Y2, INCY2 STFPDUX B1, X2, INCX2 STFPDUX A2, Y2, INCY2 fpmr A1, A3 STFPDUX B2, X2, INCX2 fpmr B1, B3 .align 4 LL(27): andi. r0, N, 2 beq LL(28) LFXDUX A2, X, INCX2 LFXDUX B2, Y, INCY2 fsmr A1, A2 fsmr B1, B2 STFPDUX A1, Y2, INCY2 fpmr A1, A2 STFPDUX B1, X2, INCX2 fpmr B1, B2 .align 4 LL(28): andi. r0, N, 1 beq LL(29) LFSDX B1, Y, INCY2 STFDX A1, Y2, INCY2 STFDX B1, X2, INCX2 add X2, X2, INCX fsmtp B1, B1 .align 4 LL(29): STFDX B1, X2, INCX2 b LL(999) .align 4 LL(30): /* X : unaligned Y : aligned */ andi. r0, Y, 2 * SIZE - 1 bne LL(40) LFXDUX A1, Y, INCY2 LFDX B1, X, INCX2 STFSDX A1, X2, INCX2 add X, X, INCX add X2, X2, INCX addi N, N, -1 cmpwi cr0, N, 0 ble LL(39) .align 4 srawi. r0, N, 3 mtspr CTR, r0 beq- LL(35) LFXDUX T1, Y, INCY2 LFXDUX T2, X, INCX2 LFXDUX T3, Y, INCY2 LFXDUX T4, X, INCX2 LFPDUX A4, Y, INCY2 fsmr A1, T1 LFPDUX B4, X, INCX2 fsmr B1, T2 LFPDUX A5, Y, INCY2 fsmr T1, T3 LFPDUX B5, X, INCX2 fsmr T2, T4 bdz LL(33) .align 4 LL(32): fxmr T5, A4 STFPDUX A1, X2, INCX2 fxmr T6, B4 STFPDUX B1, Y2, INCY2 fxmr A1, A5 STFPDUX T1, X2, INCX2 fxmr B1, B5 STFPDUX T2, Y2, INCY2 fsmr T3, T5 LFPDUX A2, Y, INCY2 fsmr T4, T6 LFPDUX B2, X, INCX2 fsmr T5, A1 LFPDUX A3, Y, INCY2 fsmr T6, B1 LFPDUX B3, X, INCX2 fxmr T1, A2 STFPDUX T3, X2, INCX2 fxmr T2, B2 STFPDUX T4, Y2, INCY2 fxmr T3, A3 STFPDUX T5, X2, INCX2 fxmr T4, B3 STFPDUX T6, Y2, INCY2 fsmr A1, T1 LFPDUX A4, Y, INCY2 fsmr B1, T2 LFPDUX B4, X, INCX2 fsmr T1, T3 LFPDUX A5, Y, INCY2 fsmr T2, T4 LFPDUX B5, X, INCX2 bdnz LL(32) .align 4 LL(33): fxmr T5, A4 STFPDUX A1, X2, INCX2 fxmr T6, B4 STFPDUX B1, Y2, INCY2 fxmr A1, A5 STFPDUX T1, X2, INCX2 fxmr B1, B5 STFPDUX T2, Y2, INCY2 fsmr T3, T5 fsmr T4, T6 fsmr T5, A1 fsmr T6, B1 STFPDUX T3, X2, INCX2 STFPDUX T4, Y2, INCY2 STFPDUX T5, X2, INCX2 STFPDUX T6, Y2, INCY2 .align 4 LL(35): andi. r0, N, 7 beq LL(39) andi. r0, N, 4 beq LL(37) LFXDUX A2, Y, INCY2 LFXDUX B2, X, INCX2 LFXDUX A3, Y, INCY2 LFXDUX B3, X, INCX2 fsmr A1, A2 fsmr B1, B2 fsmr A2, A3 fsmr B2, B3 STFPDUX A1, X2, INCX2 STFPDUX B1, Y2, INCY2 STFPDUX A2, X2, INCX2 fpmr A1, A3 STFPDUX B2, Y2, INCY2 fpmr B1, B3 .align 4 LL(37): andi. 
r0, N, 2 beq LL(38) LFXDUX A2, Y, INCY2 LFXDUX B2, X, INCX2 fsmr A1, A2 fsmr B1, B2 STFPDUX A1, X2, INCX2 fpmr A1, A2 STFPDUX B1, Y2, INCY2 fpmr B1, B2 .align 4 LL(38): andi. r0, N, 1 beq LL(39) LFSDX B1, X, INCX2 STFDX A1, X2, INCX2 STFDX B1, Y2, INCY2 add Y2, Y2, INCY fsmtp B1, B1 .align 4 LL(39): STFDX B1, Y2, INCY2 b LL(999) .align 4 LL(40): /* X : unaligned Y : unaligned */ LFDX A1, Y, INCY2 LFDX B1, X, INCX2 add X, X, INCX add Y, Y, INCY addi N, N, -1 cmpwi cr0, N, 0 STFDX A1, X2, INCX2 STFDX B1, Y2, INCY2 add X2, X2, INCX add Y2, Y2, INCY ble LL(999) srawi. r0, N, 3 mtspr CTR, r0 beq- LL(45) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 LFPDUX A3, X, INCX2 LFPDUX B3, Y, INCY2 LFPDUX A4, X, INCX2 LFPDUX B4, Y, INCY2 bdz LL(43) .align 4 LL(42): STFPDUX B1, X2, INCY2 LFPDUX B1, Y, INCY2 STFPDUX A1, Y2, INCY2 LFPDUX A1, X, INCX2 STFPDUX B2, X2, INCY2 LFPDUX B2, Y, INCY2 STFPDUX A2, Y2, INCY2 LFPDUX A2, X, INCX2 STFPDUX B3, X2, INCY2 LFPDUX B3, Y, INCY2 STFPDUX A3, Y2, INCY2 LFPDUX A3, X, INCX2 STFPDUX B4, X2, INCY2 LFPDUX B4, Y, INCY2 STFPDUX A4, Y2, INCY2 LFPDUX A4, X, INCX2 bdnz LL(42) .align 4 LL(43): STFPDUX B1, X2, INCY2 STFPDUX A1, Y2, INCY2 STFPDUX B2, X2, INCY2 STFPDUX A2, Y2, INCY2 STFPDUX B3, X2, INCY2 STFPDUX A3, Y2, INCY2 STFPDUX B4, X2, INCY2 STFPDUX A4, Y2, INCY2 .align 4 LL(45): andi. r0, N, 7 beq LL(999) andi. r0, N, 4 beq LL(46) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 STFPDUX B1, X2, INCY2 STFPDUX A1, Y2, INCY2 STFPDUX B2, X2, INCY2 STFPDUX A2, Y2, INCY2 .align 4 LL(46): andi. r0, N, 2 beq LL(47) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 STFPDUX B1, X2, INCY2 STFPDUX A1, Y2, INCY2 .align 4 LL(47): andi. r0, N, 1 beq LL(999) LFDUX A1, X, INCX2 LFDUX B1, Y, INCY2 STFDUX B1, X2, INCY2 STFDUX A1, Y2, INCY2 b LL(999) .align 4 LL(100): sub X, X, INCX sub Y, Y, INCY mr X2, X mr Y2, Y srawi. r0, N, 2 mtspr CTR, r0 beq- LL(115) LFDUX A1, X, INCX LFDUX B1, Y, INCY LFDUX A2, X, INCX LFDUX B2, Y, INCY LFDUX A3, X, INCX LFDUX B3, Y, INCY LFDUX A4, X, INCX LFDUX B4, Y, INCY bdz LL(113) .align 4 LL(112): STFDUX B1, X2, INCX LFDUX B1, Y, INCY STFDUX A1, Y2, INCY LFDUX A1, X, INCX STFDUX B2, X2, INCX LFDUX B2, Y, INCY STFDUX A2, Y2, INCY LFDUX A2, X, INCX STFDUX B3, X2, INCX LFDUX B3, Y, INCY STFDUX A3, Y2, INCY LFDUX A3, X, INCX STFDUX B4, X2, INCX LFDUX B4, Y, INCY STFDUX A4, Y2, INCY LFDUX A4, X, INCX bdnz LL(112) .align 4 LL(113): STFDUX B1, X2, INCX STFDUX A1, Y2, INCY STFDUX B2, X2, INCX STFDUX A2, Y2, INCY STFDUX B3, X2, INCX STFDUX A3, Y2, INCY STFDUX B4, X2, INCX STFDUX A4, Y2, INCY .align 4 LL(115): andi. r0, N, 3 beq LL(999) andi. r0, N, 2 beq LL(117) LFDUX A1, X, INCX LFDUX A2, X, INCX LFDUX B1, Y, INCY LFDUX B2, Y, INCY STFDUX B1, X2, INCX STFDUX B2, X2, INCX STFDUX A1, Y2, INCY STFDUX A2, Y2, INCY .align 4 LL(117): andi. r0, N, 1 beq LL(999) LFDUX A1, X, INCX LFDUX B1, Y, INCY STFDUX B1, X2, INCX STFDUX A1, Y2, INCY .align 4 LL(999): li r10, 16 addi SP, SP, -16 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/symv_L.S000066400000000000000000000717271313527062700171730ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 #define A r5 #define LDA r6 #define X r7 #define INCX r8 #define Y r9 #define INCY r10 #define BUFFER r14 #else #define M r3 #define N r4 #define A r6 #define LDA r7 #define X r8 #define INCX r9 #define Y r10 #define INCY r5 #define BUFFER r14 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define M r3 #define N r4 #define A r7 #define LDA r8 #define X r9 #define INCX r10 #define Y r5 #define INCY r6 #define BUFFER r14 #else #define M r3 #define N r4 #define A r6 #define LDA r7 #define X r8 #define INCX r9 #define Y r10 #define INCY r5 #define BUFFER r14 #endif #endif #define I r11 #define J r12 #define AO1 r15 #define AO2 r16 #define AO3 r17 #define AO4 r18 #define XX r19 #define YY r20 #define NEW_Y r21 #define TEMP r22 #define PREA r24 #define IS r25 #define y01 f0 #define y02 f1 #define y03 f2 #define y04 f3 #define atemp1 f4 #define atemp2 f5 #define atemp3 f6 #define atemp4 f7 #define xtemp1 f8 #define xtemp2 f9 #define xtemp3 f10 #define xtemp4 f11 #define xsum1 f12 #define xsum2 f13 #define xsum3 f14 #define xsum4 f15 #define a1 f16 #define a2 f17 #define a3 f18 #define a4 f19 #define a5 f20 #define a6 f21 #define a7 f22 #define a8 f23 #define a9 f24 #define a10 f25 #define a11 f26 #define a12 f27 #define a13 f28 #define a14 f29 #define a15 f30 #define a16 f31 #define alpha f1 #if defined(PPCG4) #define PREFETCHSIZE_A 24 #endif #if defined(PPC440) || defined(PPC440FP2) #define PREFETCHSIZE_A 24 #endif #ifdef PPC970 #define PREFETCHSIZE_A 64 #endif #ifdef CELL #define PREFETCHSIZE_A 72 #endif #ifdef POWER4 #define PREFETCHSIZE_A 16 #endif #ifdef POWER5 #define PREFETCHSIZE_A 96 #endif #ifdef POWER6 #define PREFETCHSIZE_A 40 #endif #if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) #define NOP1 #define NOP2 #else #define NOP1 mr LDA, LDA #define NOP2 mr INCX, INCX #endif #ifndef 
NEEDPARAM #ifndef __64BIT__ #define STACKSIZE 224 #define ALPHA 200(SP) #define FZERO 208(SP) #else #define STACKSIZE 280 #define ALPHA 256(SP) #define FZERO 264(SP) #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r0, FZERO std r14, 144(SP) std r15, 152(SP) std r16, 160(SP) std r17, 168(SP) std r18, 176(SP) std r19, 184(SP) std r20, 192(SP) std r21, 200(SP) std r22, 208(SP) std r23, 216(SP) std r24, 224(SP) std r25, 232(SP) std r26, 240(SP) std r27, 248(SP) #else stw r0, 0 + FZERO stw r0, 4 + FZERO stw r14, 144(SP) stw r15, 148(SP) stw r16, 152(SP) stw r17, 156(SP) stw r18, 160(SP) stw r19, 164(SP) stw r20, 168(SP) stw r21, 172(SP) stw r22, 176(SP) stw r23, 180(SP) stw r24, 184(SP) stw r25, 188(SP) stw r26, 192(SP) stw r27, 196(SP) #endif #ifdef linux #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else ld INCY, FRAMESLOT(0) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE lwz Y, FRAMESLOT(0) + STACKSIZE(SP) lwz INCY, FRAMESLOT(1) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #else lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #endif #else ld INCY, FRAMESLOT(0) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif STFD alpha, ALPHA slwi LDA, LDA, BASE_SHIFT slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT li PREA, PREFETCHSIZE_A * SIZE cmpwi cr0, M, 0 ble- LL(999) cmpwi cr0, INCX, SIZE beq LL(05) mr XX, X mr X, BUFFER srawi. r0, M, 3 mtspr CTR, r0 ble LL(03) .align 4 LL(01): LFD a1, 0 * SIZE(XX) add XX, XX, INCX LFD a2, 0 * SIZE(XX) add XX, XX, INCX LFD a3, 0 * SIZE(XX) add XX, XX, INCX LFD a4, 0 * SIZE(XX) add XX, XX, INCX LFD a5, 0 * SIZE(XX) add XX, XX, INCX LFD a6, 0 * SIZE(XX) add XX, XX, INCX LFD a7, 0 * SIZE(XX) add XX, XX, INCX LFD a8, 0 * SIZE(XX) add XX, XX, INCX dcbt XX, PREA dcbtst BUFFER, PREA STFD a1, 0 * SIZE(BUFFER) STFD a2, 1 * SIZE(BUFFER) STFD a3, 2 * SIZE(BUFFER) STFD a4, 3 * SIZE(BUFFER) STFD a5, 4 * SIZE(BUFFER) STFD a6, 5 * SIZE(BUFFER) STFD a7, 6 * SIZE(BUFFER) STFD a8, 7 * SIZE(BUFFER) addi BUFFER, BUFFER, 8 * SIZE bdnz LL(01) .align 4 LL(03): andi. r0, M, 7 mtspr CTR, r0 ble LL(05) .align 4 LL(04): LFD a1, 0 * SIZE(XX) add XX, XX, INCX STFD a1, 0 * SIZE(BUFFER) addi BUFFER, BUFFER, 1 * SIZE bdnz LL(04) .align 4 LL(05): mr NEW_Y, Y lfd f0, FZERO cmpwi cr0, INCY, SIZE beq LL(10) mr NEW_Y, BUFFER addi r0, M, 7 srawi. 
r0, r0, 3 mtspr CTR, r0 .align 4 LL(06): STFD f0, 0 * SIZE(BUFFER) STFD f0, 1 * SIZE(BUFFER) STFD f0, 2 * SIZE(BUFFER) STFD f0, 3 * SIZE(BUFFER) STFD f0, 4 * SIZE(BUFFER) STFD f0, 5 * SIZE(BUFFER) STFD f0, 6 * SIZE(BUFFER) STFD f0, 7 * SIZE(BUFFER) addi BUFFER, BUFFER, 8 * SIZE bdnz LL(06) .align 4 LL(10): li IS, 0 cmpwi cr0, N, 4 blt LL(20) .align 4 LL(11): mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA addi A, A, 4 * SIZE slwi TEMP, IS, BASE_SHIFT add XX, X, TEMP add YY, NEW_Y, TEMP LFD atemp1, 0 * SIZE(XX) LFD atemp2, 1 * SIZE(XX) LFD atemp3, 2 * SIZE(XX) LFD atemp4, 3 * SIZE(XX) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 1 * SIZE(AO2) LFD a7, 2 * SIZE(AO2) LFD a8, 3 * SIZE(AO2) LFD a11, 2 * SIZE(AO3) LFD a12, 3 * SIZE(AO3) LFD a16, 3 * SIZE(AO4) LFD a5, ALPHA FMUL xsum1, atemp1, a1 FMUL xsum2, atemp1, a2 FMUL xsum3, atemp1, a3 FMUL xsum4, atemp1, a4 FMADD xsum1, atemp2, a2, xsum1 FMADD xsum2, atemp2, a6, xsum2 FMADD xsum3, atemp2, a7, xsum3 FMADD xsum4, atemp2, a8, xsum4 FMADD xsum1, atemp3, a3, xsum1 FMADD xsum2, atemp3, a7, xsum2 FMADD xsum3, atemp3, a11, xsum3 FMADD xsum4, atemp3, a12, xsum4 FMADD xsum1, atemp4, a4, xsum1 FMADD xsum2, atemp4, a8, xsum2 FMADD xsum3, atemp4, a12, xsum3 FMADD xsum4, atemp4, a16, xsum4 FMUL atemp1, a5, atemp1 FMUL atemp2, a5, atemp2 FMUL atemp3, a5, atemp3 FMUL atemp4, a5, atemp4 LFD xtemp1, 4 * SIZE(XX) LFD xtemp2, 5 * SIZE(XX) LFD xtemp3, 6 * SIZE(XX) LFD xtemp4, 7 * SIZE(XX) LFD y01, 4 * SIZE(YY) LFD y02, 5 * SIZE(YY) LFD y03, 6 * SIZE(YY) LFD y04, 7 * SIZE(YY) LFD a1, 4 * SIZE(AO1) LFD a2, 5 * SIZE(AO1) LFD a3, 6 * SIZE(AO1) LFD a4, 7 * SIZE(AO1) LFD a5, 4 * SIZE(AO2) LFD a6, 5 * SIZE(AO2) LFD a7, 6 * SIZE(AO2) LFD a8, 7 * SIZE(AO2) LFD a9, 4 * SIZE(AO3) LFD a10, 5 * SIZE(AO3) LFD a11, 6 * SIZE(AO3) LFD a12, 7 * SIZE(AO3) LFD a13, 4 * SIZE(AO4) LFD a14, 5 * SIZE(AO4) LFD a15, 6 * SIZE(AO4) LFD a16, 7 * SIZE(AO4) addi AO1, AO1, 4 * SIZE addi AO2, AO2, 4 * SIZE addi AO3, AO3, 4 * SIZE addi AO4, AO4, 4 * SIZE addi XX, XX, 4 * SIZE addi YY, YY, 4 * SIZE sub TEMP, M, IS addi TEMP, TEMP, -4 srawi. 
r0, TEMP, 4 mtspr CTR, r0 ble LL(14) .align 4 LL(12): FMADD xsum1, xtemp1, a1, xsum1 DCBT(AO1, PREA) FMADD y01, atemp1, a1, y01 LFD a1, 4 * SIZE(AO1) FMADD xsum2, xtemp1, a5, xsum2 NOP1 FMADD y02, atemp1, a2, y02 NOP2 FMADD xsum3, xtemp1, a9, xsum3 NOP1 FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp1, a13, xsum4 LFD xtemp1, 4 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp2, a2, xsum1 LFD a2, 5 * SIZE(AO1) FMADD y01, atemp2, a5, y01 LFD a5, 4 * SIZE(AO2) FMADD xsum2, xtemp2, a6, xsum2 NOP1 FMADD y02, atemp2, a6, y02 LFD a6, 5 * SIZE(AO2) FMADD xsum3, xtemp2, a10, xsum3 NOP1 FMADD y03, atemp2, a7, y03 NOP2 FMADD xsum4, xtemp2, a14, xsum4 LFD xtemp2, 5 * SIZE(XX) FMADD y04, atemp2, a8, y04 # DCBT(X, PREX) NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD a3, 6 * SIZE(AO1) FMADD y01, atemp3, a9, y01 LFD a9, 4 * SIZE(AO3) FMADD xsum2, xtemp3, a7, xsum2 LFD a7, 6 * SIZE(AO2) FMADD y02, atemp3, a10, y02 LFD a10, 5 * SIZE(AO3) FMADD xsum3, xtemp3, a11, xsum3 NOP1 FMADD y03, atemp3, a11, y03 LFD a11, 6 * SIZE(AO3) FMADD xsum4, xtemp3, a15, xsum4 LFD xtemp3, 6 * SIZE(XX) FMADD y04, atemp3, a12, y04 NOP2 FMADD xsum1, xtemp4, a4, xsum1 LFD a4, 7 * SIZE(AO1) FMADD y01, atemp4, a13, y01 LFD a13, 4 * SIZE(AO4) FMADD xsum2, xtemp4, a8, xsum2 LFD a8, 7 * SIZE(AO2) FMADD y02, atemp4, a14, y02 LFD a14, 5 * SIZE(AO4) FMADD xsum3, xtemp4, a12, xsum3 LFD a12, 7 * SIZE(AO3) FMADD y03, atemp4, a15, y03 LFD a15, 6 * SIZE(AO4) FMADD xsum4, xtemp4, a16, xsum4 LFD xtemp4, 7 * SIZE(XX) FMADD y04, atemp4, a16, y04 LFD a16, 7 * SIZE(AO4) STFD y01, 0 * SIZE(YY) LFD y01, 4 * SIZE(YY) STFD y02, 1 * SIZE(YY) LFD y02, 5 * SIZE(YY) STFD y03, 2 * SIZE(YY) LFD y03, 6 * SIZE(YY) STFD y04, 3 * SIZE(YY) LFD y04, 7 * SIZE(YY) FMADD xsum1, xtemp1, a1, xsum1 DCBT(AO2, PREA) FMADD y01, atemp1, a1, y01 LFD a1, 8 * SIZE(AO1) FMADD xsum2, xtemp1, a5, xsum2 NOP1 FMADD y02, atemp1, a2, y02 NOP2 FMADD xsum3, xtemp1, a9, xsum3 NOP1 FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp1, a13, xsum4 LFD xtemp1, 8 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp2, a2, xsum1 LFD a2, 9 * SIZE(AO1) FMADD y01, atemp2, a5, y01 LFD a5, 8 * SIZE(AO2) FMADD xsum2, xtemp2, a6, xsum2 NOP1 FMADD y02, atemp2, a6, y02 LFD a6, 9 * SIZE(AO2) FMADD xsum3, xtemp2, a10, xsum3 NOP1 FMADD y03, atemp2, a7, y03 NOP2 FMADD xsum4, xtemp2, a14, xsum4 LFD xtemp2, 9 * SIZE(XX) FMADD y04, atemp2, a8, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD a3, 10 * SIZE(AO1) FMADD y01, atemp3, a9, y01 LFD a9, 8 * SIZE(AO3) FMADD xsum2, xtemp3, a7, xsum2 LFD a7, 10 * SIZE(AO2) FMADD y02, atemp3, a10, y02 LFD a10, 9 * SIZE(AO3) FMADD xsum3, xtemp3, a11, xsum3 NOP1 FMADD y03, atemp3, a11, y03 LFD a11, 10 * SIZE(AO3) FMADD xsum4, xtemp3, a15, xsum4 LFD xtemp3, 10 * SIZE(XX) FMADD y04, atemp3, a12, y04 NOP2 FMADD xsum1, xtemp4, a4, xsum1 LFD a4, 11 * SIZE(AO1) FMADD y01, atemp4, a13, y01 LFD a13, 8 * SIZE(AO4) FMADD xsum2, xtemp4, a8, xsum2 LFD a8, 11 * SIZE(AO2) FMADD y02, atemp4, a14, y02 LFD a14, 9 * SIZE(AO4) FMADD xsum3, xtemp4, a12, xsum3 LFD a12, 11 * SIZE(AO3) FMADD y03, atemp4, a15, y03 LFD a15, 10 * SIZE(AO4) FMADD xsum4, xtemp4, a16, xsum4 LFD xtemp4, 11 * SIZE(XX) FMADD y04, atemp4, a16, y04 LFD a16, 11 * SIZE(AO4) STFD y01, 4 * SIZE(YY) LFD y01, 8 * SIZE(YY) STFD y02, 5 * SIZE(YY) LFD y02, 9 * SIZE(YY) STFD y03, 6 * SIZE(YY) LFD y03, 10 * SIZE(YY) STFD y04, 7 * SIZE(YY) LFD y04, 11 * SIZE(YY) FMADD xsum1, xtemp1, a1, xsum1 DCBT(AO3, PREA) FMADD y01, atemp1, a1, y01 LFD a1, 12 * SIZE(AO1) FMADD xsum2, xtemp1, a5, xsum2 NOP1 FMADD y02, atemp1, a2, y02 NOP2 
FMADD xsum3, xtemp1, a9, xsum3 NOP1 FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp1, a13, xsum4 LFD xtemp1, 12 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp2, a2, xsum1 LFD a2, 13 * SIZE(AO1) FMADD y01, atemp2, a5, y01 LFD a5, 12 * SIZE(AO2) FMADD xsum2, xtemp2, a6, xsum2 NOP1 FMADD y02, atemp2, a6, y02 LFD a6, 13 * SIZE(AO2) FMADD xsum3, xtemp2, a10, xsum3 NOP1 FMADD y03, atemp2, a7, y03 # DCBT(Y1, PREY) NOP2 FMADD xsum4, xtemp2, a14, xsum4 LFD xtemp2, 13 * SIZE(XX) FMADD y04, atemp2, a8, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD a3, 14 * SIZE(AO1) FMADD y01, atemp3, a9, y01 LFD a9, 12 * SIZE(AO3) FMADD xsum2, xtemp3, a7, xsum2 LFD a7, 14 * SIZE(AO2) FMADD y02, atemp3, a10, y02 LFD a10,13 * SIZE(AO3) FMADD xsum3, xtemp3, a11, xsum3 NOP1 FMADD y03, atemp3, a11, y03 LFD a11, 14 * SIZE(AO3) FMADD xsum4, xtemp3, a15, xsum4 LFD xtemp3, 14 * SIZE(XX) FMADD y04, atemp3, a12, y04 NOP2 FMADD xsum1, xtemp4, a4, xsum1 LFD a4, 15 * SIZE(AO1) FMADD y01, atemp4, a13, y01 LFD a13,12 * SIZE(AO4) FMADD xsum2, xtemp4, a8, xsum2 LFD a8, 15 * SIZE(AO2) FMADD y02, atemp4, a14, y02 LFD a14, 13 * SIZE(AO4) FMADD xsum3, xtemp4, a12, xsum3 LFD a12, 15 * SIZE(AO3) FMADD y03, atemp4, a15, y03 LFD a15, 14 * SIZE(AO4) FMADD xsum4, xtemp4, a16, xsum4 LFD xtemp4, 15 * SIZE(XX) FMADD y04, atemp4, a16, y04 LFD a16, 15 * SIZE(AO4) STFD y01, 8 * SIZE(YY) LFD y01, 12 * SIZE(YY) STFD y02, 9 * SIZE(YY) LFD y02, 13 * SIZE(YY) STFD y03, 10 * SIZE(YY) LFD y03, 14 * SIZE(YY) STFD y04, 11 * SIZE(YY) LFD y04, 15 * SIZE(YY) FMADD xsum1, xtemp1, a1, xsum1 DCBT(AO4, PREA) FMADD y01, atemp1, a1, y01 LFD a1, 16 * SIZE(AO1) FMADD xsum2, xtemp1, a5, xsum2 NOP1 FMADD y02, atemp1, a2, y02 NOP2 FMADD xsum3, xtemp1, a9, xsum3 NOP1 FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp1, a13, xsum4 LFD xtemp1, 16 * SIZE(XX) FMADD y04, atemp1, a4, y04 addi YY, YY, 16 * SIZE FMADD xsum1, xtemp2, a2, xsum1 LFD a2, 17 * SIZE(AO1) FMADD y01, atemp2, a5, y01 LFD a5, 16 * SIZE(AO2) FMADD xsum2, xtemp2, a6, xsum2 addi AO3, AO3, 16 * SIZE FMADD y02, atemp2, a6, y02 LFD a6, 17 * SIZE(AO2) FMADD xsum3, xtemp2, a10, xsum3 addi AO1, AO1, 16 * SIZE FMADD y03, atemp2, a7, y03 addi AO2, AO2, 16 * SIZE FMADD xsum4, xtemp2, a14, xsum4 LFD xtemp2, 17 * SIZE(XX) FMADD y04, atemp2, a8, y04 addi AO4, AO4, 16 * SIZE FMADD xsum1, xtemp3, a3, xsum1 LFD a3, 2 * SIZE(AO1) FMADD y01, atemp3, a9, y01 LFD a9, 0 * SIZE(AO3) FMADD xsum2, xtemp3, a7, xsum2 LFD a7, 2 * SIZE(AO2) FMADD y02, atemp3, a10, y02 LFD a10, 1 * SIZE(AO3) FMADD xsum3, xtemp3, a11, xsum3 NOP1 FMADD y03, atemp3, a11, y03 LFD a11, 2 * SIZE(AO3) FMADD xsum4, xtemp3, a15, xsum4 LFD xtemp3, 18 * SIZE(XX) FMADD y04, atemp3, a12, y04 addi XX, XX, 16 * SIZE FMADD xsum1, xtemp4, a4, xsum1 LFD a4, 3 * SIZE(AO1) FMADD y01, atemp4, a13, y01 LFD a13, 0 * SIZE(AO4) FMADD xsum2, xtemp4, a8, xsum2 LFD a8, 3 * SIZE(AO2) FMADD y02, atemp4, a14, y02 LFD a14, 1 * SIZE(AO4) FMADD xsum3, xtemp4, a12, xsum3 LFD a12, 3 * SIZE(AO3) FMADD y03, atemp4, a15, y03 LFD a15, 2 * SIZE(AO4) FMADD xsum4, xtemp4, a16, xsum4 LFD xtemp4, 3 * SIZE(XX) FMADD y04, atemp4, a16, y04 LFD a16, 3 * SIZE(AO4) STFD y01, -4 * SIZE(YY) LFD y01, 0 * SIZE(YY) STFD y02, -3 * SIZE(YY) LFD y02, 1 * SIZE(YY) STFD y03, -2 * SIZE(YY) LFD y03, 2 * SIZE(YY) STFD y04, -1 * SIZE(YY) LFD y04, 3 * SIZE(YY) bdnz LL(12) .align 4 LL(14): sub TEMP, M, IS addi TEMP, TEMP, -4 andi. 
r0, TEMP, 8 ble LL(15) FMADD xsum1, xtemp1, a1, xsum1 NOP1 FMADD y01, atemp1, a1, y01 LFD a1, 4 * SIZE(AO1) FMADD xsum2, xtemp1, a5, xsum2 NOP1 FMADD y02, atemp1, a2, y02 NOP2 FMADD xsum3, xtemp1, a9, xsum3 NOP1 FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp1, a13, xsum4 LFD xtemp1, 4 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp2, a2, xsum1 LFD a2, 5 * SIZE(AO1) FMADD y01, atemp2, a5, y01 LFD a5, 4 * SIZE(AO2) FMADD xsum2, xtemp2, a6, xsum2 NOP1 FMADD y02, atemp2, a6, y02 LFD a6, 5 * SIZE(AO2) FMADD xsum3, xtemp2, a10, xsum3 NOP1 FMADD y03, atemp2, a7, y03 NOP2 FMADD xsum4, xtemp2, a14, xsum4 LFD xtemp2, 5 * SIZE(XX) FMADD y04, atemp2, a8, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD a3, 6 * SIZE(AO1) FMADD y01, atemp3, a9, y01 LFD a9, 4 * SIZE(AO3) FMADD xsum2, xtemp3, a7, xsum2 LFD a7, 6 * SIZE(AO2) FMADD y02, atemp3, a10, y02 LFD a10, 5 * SIZE(AO3) FMADD xsum3, xtemp3, a11, xsum3 NOP1 FMADD y03, atemp3, a11, y03 LFD a11, 6 * SIZE(AO3) FMADD xsum4, xtemp3, a15, xsum4 LFD xtemp3, 6 * SIZE(XX) FMADD y04, atemp3, a12, y04 NOP2 FMADD xsum1, xtemp4, a4, xsum1 LFD a4, 7 * SIZE(AO1) FMADD y01, atemp4, a13, y01 LFD a13, 4 * SIZE(AO4) FMADD xsum2, xtemp4, a8, xsum2 LFD a8, 7 * SIZE(AO2) FMADD y02, atemp4, a14, y02 LFD a14, 5 * SIZE(AO4) FMADD xsum3, xtemp4, a12, xsum3 LFD a12, 7 * SIZE(AO3) FMADD y03, atemp4, a15, y03 LFD a15, 6 * SIZE(AO4) FMADD xsum4, xtemp4, a16, xsum4 LFD xtemp4, 7 * SIZE(XX) FMADD y04, atemp4, a16, y04 LFD a16, 7 * SIZE(AO4) STFD y01, 0 * SIZE(YY) LFD y01, 4 * SIZE(YY) STFD y02, 1 * SIZE(YY) LFD y02, 5 * SIZE(YY) STFD y03, 2 * SIZE(YY) LFD y03, 6 * SIZE(YY) STFD y04, 3 * SIZE(YY) LFD y04, 7 * SIZE(YY) FMADD xsum1, xtemp1, a1, xsum1 NOP1 FMADD y01, atemp1, a1, y01 LFD a1, 8 * SIZE(AO1) FMADD xsum2, xtemp1, a5, xsum2 NOP1 FMADD y02, atemp1, a2, y02 NOP2 FMADD xsum3, xtemp1, a9, xsum3 NOP1 FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp1, a13, xsum4 LFD xtemp1, 8 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp2, a2, xsum1 LFD a2, 9 * SIZE(AO1) FMADD y01, atemp2, a5, y01 LFD a5, 8 * SIZE(AO2) FMADD xsum2, xtemp2, a6, xsum2 NOP1 FMADD y02, atemp2, a6, y02 LFD a6, 9 * SIZE(AO2) FMADD xsum3, xtemp2, a10, xsum3 NOP1 FMADD y03, atemp2, a7, y03 NOP2 FMADD xsum4, xtemp2, a14, xsum4 LFD xtemp2, 9 * SIZE(XX) FMADD y04, atemp2, a8, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD a3, 10 * SIZE(AO1) FMADD y01, atemp3, a9, y01 LFD a9, 8 * SIZE(AO3) FMADD xsum2, xtemp3, a7, xsum2 LFD a7, 10 * SIZE(AO2) FMADD y02, atemp3, a10, y02 LFD a10, 9 * SIZE(AO3) FMADD xsum3, xtemp3, a11, xsum3 NOP1 FMADD y03, atemp3, a11, y03 LFD a11, 10 * SIZE(AO3) FMADD xsum4, xtemp3, a15, xsum4 LFD xtemp3, 10 * SIZE(XX) FMADD y04, atemp3, a12, y04 NOP2 FMADD xsum1, xtemp4, a4, xsum1 LFD a4, 11 * SIZE(AO1) FMADD y01, atemp4, a13, y01 LFD a13, 8 * SIZE(AO4) FMADD xsum2, xtemp4, a8, xsum2 LFD a8, 11 * SIZE(AO2) FMADD y02, atemp4, a14, y02 LFD a14, 9 * SIZE(AO4) FMADD xsum3, xtemp4, a12, xsum3 LFD a12, 11 * SIZE(AO3) FMADD y03, atemp4, a15, y03 LFD a15, 10 * SIZE(AO4) FMADD xsum4, xtemp4, a16, xsum4 LFD xtemp4, 11 * SIZE(XX) FMADD y04, atemp4, a16, y04 LFD a16, 11 * SIZE(AO4) addi AO1, AO1, 8 * SIZE addi AO2, AO2, 8 * SIZE addi AO3, AO3, 8 * SIZE addi AO4, AO4, 8 * SIZE STFD y01, 4 * SIZE(YY) LFD y01, 8 * SIZE(YY) STFD y02, 5 * SIZE(YY) LFD y02, 9 * SIZE(YY) STFD y03, 6 * SIZE(YY) LFD y03, 10 * SIZE(YY) STFD y04, 7 * SIZE(YY) LFD y04, 11 * SIZE(YY) addi XX, XX, 8 * SIZE addi YY, YY, 8 * SIZE .align 4 LL(15): sub TEMP, M, IS addi TEMP, TEMP, -4 andi. 
r0, TEMP, 4 ble LL(16) FMADD xsum1, xtemp1, a1, xsum1 NOP1 FMADD y01, atemp1, a1, y01 LFD a1, 4 * SIZE(AO1) FMADD xsum2, xtemp1, a5, xsum2 NOP1 FMADD y02, atemp1, a2, y02 NOP2 FMADD xsum3, xtemp1, a9, xsum3 NOP1 FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp1, a13, xsum4 LFD xtemp1, 4 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp2, a2, xsum1 LFD a2, 5 * SIZE(AO1) FMADD y01, atemp2, a5, y01 LFD a5, 4 * SIZE(AO2) FMADD xsum2, xtemp2, a6, xsum2 NOP1 FMADD y02, atemp2, a6, y02 LFD a6, 5 * SIZE(AO2) FMADD xsum3, xtemp2, a10, xsum3 NOP1 FMADD y03, atemp2, a7, y03 NOP2 FMADD xsum4, xtemp2, a14, xsum4 LFD xtemp2, 5 * SIZE(XX) FMADD y04, atemp2, a8, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD a3, 6 * SIZE(AO1) FMADD y01, atemp3, a9, y01 LFD a9, 4 * SIZE(AO3) FMADD xsum2, xtemp3, a7, xsum2 LFD a7, 6 * SIZE(AO2) FMADD y02, atemp3, a10, y02 LFD a10, 5 * SIZE(AO3) FMADD xsum3, xtemp3, a11, xsum3 NOP1 FMADD y03, atemp3, a11, y03 LFD a11, 6 * SIZE(AO3) FMADD xsum4, xtemp3, a15, xsum4 LFD xtemp3, 6 * SIZE(XX) FMADD y04, atemp3, a12, y04 NOP2 FMADD xsum1, xtemp4, a4, xsum1 LFD a4, 7 * SIZE(AO1) FMADD y01, atemp4, a13, y01 LFD a13, 4 * SIZE(AO4) FMADD xsum2, xtemp4, a8, xsum2 LFD a8, 7 * SIZE(AO2) FMADD y02, atemp4, a14, y02 LFD a14, 5 * SIZE(AO4) FMADD xsum3, xtemp4, a12, xsum3 LFD a12, 7 * SIZE(AO3) FMADD y03, atemp4, a15, y03 LFD a15, 6 * SIZE(AO4) FMADD xsum4, xtemp4, a16, xsum4 LFD xtemp4, 7 * SIZE(XX) FMADD y04, atemp4, a16, y04 LFD a16, 7 * SIZE(AO4) addi AO1, AO1, 4 * SIZE addi AO2, AO2, 4 * SIZE addi AO3, AO3, 4 * SIZE addi AO4, AO4, 4 * SIZE STFD y01, 0 * SIZE(YY) LFD y01, 4 * SIZE(YY) STFD y02, 1 * SIZE(YY) LFD y02, 5 * SIZE(YY) STFD y03, 2 * SIZE(YY) LFD y03, 6 * SIZE(YY) STFD y04, 3 * SIZE(YY) LFD y04, 7 * SIZE(YY) addi XX, XX, 4 * SIZE addi YY, YY, 4 * SIZE .align 4 LL(16): andi. r0, M, 2 ble LL(17) FMADD xsum1, xtemp1, a1, xsum1 FMADD y01, atemp1, a1, y01 LFD a1, 2 * SIZE(AO1) FMADD xsum2, xtemp1, a5, xsum2 FMADD y02, atemp1, a2, y02 FMADD xsum3, xtemp1, a9, xsum3 FMADD y01, atemp2, a5, y01 LFD a5, 2 * SIZE(AO2) FMADD xsum4, xtemp1, a13, xsum4 LFD xtemp1, 2 * SIZE(XX) FMADD y02, atemp2, a6, y02 FMADD xsum1, xtemp2, a2, xsum1 FMADD y01, atemp3, a9, y01 LFD a9, 2 * SIZE(AO3) FMADD xsum2, xtemp2, a6, xsum2 FMADD y02, atemp3, a10, y02 FMADD xsum3, xtemp2, a10, xsum3 FMADD y01, atemp4, a13, y01 LFD a13, 2 * SIZE(AO4) FMADD xsum4, xtemp2, a14, xsum4 FMADD y02, atemp4, a14, y02 STFD y01, 0 * SIZE(YY) LFD y01, 2 * SIZE(YY) STFD y02, 1 * SIZE(YY) addi YY, YY, 2 * SIZE .align 4 LL(17): andi. r0, M, 1 ble LL(18) FMADD xsum1, xtemp1, a1, xsum1 FMADD y01, atemp1, a1, y01 FMADD xsum2, xtemp1, a5, xsum2 FMADD y01, atemp2, a5, y01 FMADD xsum3, xtemp1, a9, xsum3 FMADD y01, atemp3, a9, y01 FMADD xsum4, xtemp1, a13, xsum4 FMADD y01, atemp4, a13, y01 STFD y01, 0 * SIZE(YY) .align 4 LL(18): slwi TEMP, IS, BASE_SHIFT add YY, NEW_Y, TEMP LFD y01, 0 * SIZE(YY) LFD y02, 1 * SIZE(YY) LFD y03, 2 * SIZE(YY) LFD y04, 3 * SIZE(YY) LFD xtemp1, ALPHA FMUL xsum1, xtemp1, xsum1 FMUL xsum2, xtemp1, xsum2 FMUL xsum3, xtemp1, xsum3 FMUL xsum4, xtemp1, xsum4 FADD y01, y01, xsum1 FADD y02, y02, xsum2 FADD y03, y03, xsum3 FADD y04, y04, xsum4 STFD y01, 0 * SIZE(YY) STFD y02, 1 * SIZE(YY) STFD y03, 2 * SIZE(YY) STFD y04, 3 * SIZE(YY) addi TEMP, IS, 8 addi IS, IS, 4 cmpw cr0, TEMP, N ble LL(11) .align 4 LL(20): andi. 
TEMP, N, 2 ble LL(30) mr AO1, A add AO2, A, LDA add A, AO2, LDA addi A, A, 2 * SIZE slwi TEMP, IS, BASE_SHIFT add XX, X, TEMP add YY, NEW_Y, TEMP LFD atemp1, 0 * SIZE(XX) LFD atemp2, 1 * SIZE(XX) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a6, 1 * SIZE(AO2) LFD a5, ALPHA FMUL xsum1, atemp1, a1 FMUL xsum2, atemp1, a2 FMADD xsum1, atemp2, a2, xsum1 FMADD xsum2, atemp2, a6, xsum2 FMUL atemp1, a5, atemp1 FMUL atemp2, a5, atemp2 LFD xtemp1, 2 * SIZE(XX) LFD y01, 2 * SIZE(YY) LFD a1, 2 * SIZE(AO1) LFD a5, 2 * SIZE(AO2) andi. r0, M, 1 ble LL(28) FMADD xsum1, xtemp1, a1, xsum1 FMADD y01, atemp1, a1, y01 FMADD xsum2, xtemp1, a5, xsum2 FMADD y01, atemp2, a5, y01 STFD y01, 2 * SIZE(YY) .align 4 LL(28): slwi TEMP, IS, BASE_SHIFT add YY, NEW_Y, TEMP LFD y01, 0 * SIZE(YY) LFD y02, 1 * SIZE(YY) LFD xtemp1, ALPHA FMUL xsum1, xtemp1, xsum1 FMUL xsum2, xtemp1, xsum2 FADD y01, y01, xsum1 FADD y02, y02, xsum2 STFD y01, 0 * SIZE(YY) STFD y02, 1 * SIZE(YY) addi IS, IS, 2 .align 4 LL(30): andi. TEMP, N, 1 ble LL(990) mr AO1, A slwi TEMP, IS, BASE_SHIFT add XX, X, TEMP add YY, NEW_Y, TEMP LFD atemp1, 0 * SIZE(XX) LFD a1, 0 * SIZE(AO1) LFD xtemp1, ALPHA LFD y01, 0 * SIZE(YY) FMUL xsum1, atemp1, a1 FMUL xsum1, xtemp1, xsum1 FADD y01, y01, xsum1 STFD y01, 0 * SIZE(YY) .align 4 LL(990): cmpwi cr0, INCY, SIZE beq LL(999) mr YY, Y srawi. r0, M, 3 mtspr CTR, r0 ble LL(995) .align 4 LL(991): LFD f0, 0 * SIZE(Y) add Y, Y, INCY LFD f1, 0 * SIZE(Y) add Y, Y, INCY LFD f2, 0 * SIZE(Y) add Y, Y, INCY LFD f3, 0 * SIZE(Y) add Y, Y, INCY LFD f4, 0 * SIZE(Y) add Y, Y, INCY LFD f5, 0 * SIZE(Y) add Y, Y, INCY LFD f6, 0 * SIZE(Y) add Y, Y, INCY LFD f7, 0 * SIZE(Y) add Y, Y, INCY LFD f8, 0 * SIZE(NEW_Y) LFD f9, 1 * SIZE(NEW_Y) LFD f10, 2 * SIZE(NEW_Y) LFD f11, 3 * SIZE(NEW_Y) LFD f12, 4 * SIZE(NEW_Y) LFD f13, 5 * SIZE(NEW_Y) LFD f14, 6 * SIZE(NEW_Y) LFD f15, 7 * SIZE(NEW_Y) addi NEW_Y, NEW_Y, 8 * SIZE FADD f8, f8, f0 FADD f9, f9, f1 FADD f10, f10, f2 FADD f11, f11, f3 FADD f12, f12, f4 FADD f13, f13, f5 FADD f14, f14, f6 FADD f15, f15, f7 STFD f8, 0 * SIZE(YY) add YY, YY, INCY STFD f9, 0 * SIZE(YY) add YY, YY, INCY STFD f10, 0 * SIZE(YY) add YY, YY, INCY STFD f11, 0 * SIZE(YY) add YY, YY, INCY STFD f12, 0 * SIZE(YY) add YY, YY, INCY STFD f13, 0 * SIZE(YY) add YY, YY, INCY STFD f14, 0 * SIZE(YY) add YY, YY, INCY STFD f15, 0 * SIZE(YY) add YY, YY, INCY bdnz LL(991) .align 4 LL(995): andi. J, M, 4 ble LL(996) LFD f0, 0 * SIZE(Y) add Y, Y, INCY LFD f1, 0 * SIZE(Y) add Y, Y, INCY LFD f2, 0 * SIZE(Y) add Y, Y, INCY LFD f3, 0 * SIZE(Y) add Y, Y, INCY LFD f8, 0 * SIZE(NEW_Y) LFD f9, 1 * SIZE(NEW_Y) LFD f10, 2 * SIZE(NEW_Y) LFD f11, 3 * SIZE(NEW_Y) addi NEW_Y, NEW_Y, 4 * SIZE FADD f8, f8, f0 FADD f9, f9, f1 FADD f10, f10, f2 FADD f11, f11, f3 STFD f8, 0 * SIZE(YY) add YY, YY, INCY STFD f9, 0 * SIZE(YY) add YY, YY, INCY STFD f10, 0 * SIZE(YY) add YY, YY, INCY STFD f11, 0 * SIZE(YY) add YY, YY, INCY .align 4 LL(996): andi. J, M, 2 ble LL(997) LFD f0, 0 * SIZE(Y) add Y, Y, INCY LFD f1, 0 * SIZE(Y) add Y, Y, INCY LFD f8, 0 * SIZE(NEW_Y) LFD f9, 1 * SIZE(NEW_Y) addi NEW_Y, NEW_Y, 2 * SIZE FADD f8, f8, f0 FADD f9, f9, f1 STFD f8, 0 * SIZE(YY) add YY, YY, INCY STFD f9, 0 * SIZE(YY) add YY, YY, INCY .align 4 LL(997): andi. 
J, M, 1 ble LL(999) LFD f0, 0 * SIZE(Y) LFD f8, 0 * SIZE(NEW_Y) FADD f8, f8, f0 STFD f8, 0 * SIZE(YY) .align 4 LL(999): li r3, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r14, 144(SP) ld r15, 152(SP) ld r16, 160(SP) ld r17, 168(SP) ld r18, 176(SP) ld r19, 184(SP) ld r20, 192(SP) ld r21, 200(SP) ld r22, 208(SP) ld r23, 216(SP) ld r24, 224(SP) ld r25, 232(SP) ld r26, 240(SP) ld r27, 248(SP) #else lwz r14, 144(SP) lwz r15, 148(SP) lwz r16, 152(SP) lwz r17, 156(SP) lwz r18, 160(SP) lwz r19, 164(SP) lwz r20, 168(SP) lwz r21, 172(SP) lwz r22, 176(SP) lwz r23, 180(SP) lwz r24, 184(SP) lwz r25, 188(SP) lwz r26, 192(SP) lwz r27, 196(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/symv_U.S000066400000000000000000000704301313527062700171720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef linux #ifndef __64BIT__ #define M r3 #define IS r4 #define A r5 #define LDA r6 #define X r7 #define INCX r8 #define Y r9 #define INCY r10 #define BUFFER r14 #else #define M r3 #define IS r4 #define A r6 #define LDA r7 #define X r8 #define INCX r9 #define Y r10 #define INCY r5 #define BUFFER r14 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define M r3 #define IS r4 #define A r7 #define LDA r8 #define X r9 #define INCX r10 #define Y r5 #define INCY r6 #define BUFFER r14 #else #define M r3 #define IS r4 #define A r6 #define LDA r7 #define X r8 #define INCX r9 #define Y r10 #define INCY r5 #define BUFFER r14 #endif #endif #define I r11 #define J r12 #define AO1 r15 #define AO2 r16 #define AO3 r17 #define AO4 r18 #define XX r19 #define YY r20 #define NEW_Y r21 #define TEMP r22 #define PREA r24 #define y01 f0 #define y02 f1 #define y03 f2 #define y04 f3 #define atemp1 f4 #define atemp2 f5 #define atemp3 f6 #define atemp4 f7 #define xtemp1 f8 #define xtemp2 f9 #define xtemp3 f10 #define xtemp4 f11 #define xsum1 f12 #define xsum2 f13 #define xsum3 f14 #define xsum4 f15 #define a1 f16 #define a2 f17 #define a3 f18 #define a4 f19 #define a5 f20 #define a6 f21 #define a7 f22 #define a8 f23 #define a9 f24 #define a10 f25 #define a11 f26 #define a12 f27 #define a13 f28 #define a14 f29 #define a15 f30 #define a16 f31 #define alpha f1 #if defined(PPCG4) #define PREFETCHSIZE_A 24 #endif #if defined(PPC440) || defined(PPC440FP2) #define PREFETCHSIZE_A 24 #endif #ifdef PPC970 #define PREFETCHSIZE_A 64 #endif #ifdef CELL #define PREFETCHSIZE_A 72 #endif #ifdef POWER4 #define PREFETCHSIZE_A 16 #endif #ifdef POWER5 #define PREFETCHSIZE_A 96 #endif #ifdef POWER6 #define PREFETCHSIZE_A 40 #endif #if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) #define NOP1 #define NOP2 #else #define NOP1 mr LDA, LDA #define NOP2 mr INCX, INCX #endif #ifndef NEEDPARAM #ifndef __64BIT__ #define STACKSIZE 224 #define ALPHA 200(SP) #define FZERO 208(SP) #else #define STACKSIZE 280 #define ALPHA 256(SP) #define FZERO 264(SP) #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r0, FZERO std r14, 144(SP) std r15, 152(SP) std r16, 160(SP) std r17, 168(SP) std r18, 176(SP) std r19, 184(SP) std r20, 192(SP) std r21, 200(SP) std r22, 208(SP) std r23, 216(SP) std r24, 224(SP) std r25, 232(SP) std r26, 240(SP) std r27, 248(SP) #else stw r0, 0 + FZERO stw r0, 4 + FZERO stw r14, 144(SP) stw r15, 148(SP) stw r16, 152(SP) stw r17, 156(SP) stw r18, 160(SP) stw r19, 164(SP) stw r20, 168(SP) stw r21, 172(SP) stw r22, 176(SP) stw r23, 180(SP) stw r24, 184(SP) stw r25, 188(SP) stw r26, 192(SP) stw r27, 196(SP) #endif #ifdef linux #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else ld INCY, FRAMESLOT(0) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE lwz Y, FRAMESLOT(0) + STACKSIZE(SP) lwz INCY, FRAMESLOT(1) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #else lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, 
FRAMESLOT(1) + STACKSIZE(SP) #endif #else ld INCY, FRAMESLOT(0) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif STFD alpha, ALPHA slwi LDA, LDA, BASE_SHIFT slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT li PREA, PREFETCHSIZE_A * SIZE sub IS, M, IS cmpwi cr0, M, 0 ble- LL(999) mullw TEMP, IS, LDA add A, A, TEMP cmpwi cr0, INCX, SIZE beq LL(05) mr XX, X mr X, BUFFER srawi. r0, M, 3 mtspr CTR, r0 ble LL(03) .align 4 LL(01): LFD a1, 0 * SIZE(XX) add XX, XX, INCX LFD a2, 0 * SIZE(XX) add XX, XX, INCX LFD a3, 0 * SIZE(XX) add XX, XX, INCX LFD a4, 0 * SIZE(XX) add XX, XX, INCX LFD a5, 0 * SIZE(XX) add XX, XX, INCX LFD a6, 0 * SIZE(XX) add XX, XX, INCX LFD a7, 0 * SIZE(XX) add XX, XX, INCX LFD a8, 0 * SIZE(XX) add XX, XX, INCX dcbt XX, PREA dcbtst BUFFER, PREA STFD a1, 0 * SIZE(BUFFER) STFD a2, 1 * SIZE(BUFFER) STFD a3, 2 * SIZE(BUFFER) STFD a4, 3 * SIZE(BUFFER) STFD a5, 4 * SIZE(BUFFER) STFD a6, 5 * SIZE(BUFFER) STFD a7, 6 * SIZE(BUFFER) STFD a8, 7 * SIZE(BUFFER) addi BUFFER, BUFFER, 8 * SIZE bdnz LL(01) .align 4 LL(03): andi. r0, M, 7 mtspr CTR, r0 ble LL(05) .align 4 LL(04): LFD a1, 0 * SIZE(XX) add XX, XX, INCX STFD a1, 0 * SIZE(BUFFER) addi BUFFER, BUFFER, 1 * SIZE bdnz LL(04) .align 4 LL(05): mr NEW_Y, Y lfd f0, FZERO cmpwi cr0, INCY, SIZE beq LL(10) mr NEW_Y, BUFFER addi r0, M, 7 srawi. r0, r0, 3 mtspr CTR, r0 .align 4 LL(06): STFD f0, 0 * SIZE(BUFFER) STFD f0, 1 * SIZE(BUFFER) STFD f0, 2 * SIZE(BUFFER) STFD f0, 3 * SIZE(BUFFER) STFD f0, 4 * SIZE(BUFFER) STFD f0, 5 * SIZE(BUFFER) STFD f0, 6 * SIZE(BUFFER) STFD f0, 7 * SIZE(BUFFER) addi BUFFER, BUFFER, 8 * SIZE bdnz LL(06) .align 4 LL(10): addi TEMP, IS, 4 cmpw cr0, TEMP, M bgt LL(20) .align 4 LL(11): mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA slwi TEMP, IS, BASE_SHIFT add TEMP, X, TEMP LFD a16, ALPHA lfd xsum1, FZERO LFD atemp1, 0 * SIZE(TEMP) LFD atemp2, 1 * SIZE(TEMP) LFD atemp3, 2 * SIZE(TEMP) LFD atemp4, 3 * SIZE(TEMP) LFD xtemp1, 0 * SIZE(X) LFD xtemp2, 1 * SIZE(X) LFD xtemp3, 2 * SIZE(X) LFD xtemp4, 3 * SIZE(X) LFD y01, 0 * SIZE(NEW_Y) LFD y02, 1 * SIZE(NEW_Y) LFD y03, 2 * SIZE(NEW_Y) LFD y04, 3 * SIZE(NEW_Y) LFD a1, 0 * SIZE(AO1) FMUL atemp1, a16, atemp1 LFD a2, 1 * SIZE(AO1) FMUL atemp2, a16, atemp2 LFD a3, 2 * SIZE(AO1) FMUL atemp3, a16, atemp3 LFD a4, 3 * SIZE(AO1) FMUL atemp4, a16, atemp4 LFD a5, 0 * SIZE(AO2) fmr xsum2, xsum1 LFD a6, 1 * SIZE(AO2) fmr xsum3, xsum1 LFD a7, 2 * SIZE(AO2) fmr xsum4, xsum1 LFD a8, 3 * SIZE(AO2) LFD a9, 0 * SIZE(AO3) LFD a10, 1 * SIZE(AO3) LFD a11, 2 * SIZE(AO3) LFD a12, 3 * SIZE(AO3) LFD a13, 0 * SIZE(AO4) LFD a14, 1 * SIZE(AO4) LFD a15, 2 * SIZE(AO4) LFD a16, 3 * SIZE(AO4) mr XX, X mr YY, NEW_Y srawi. 
r0, IS, 4 mtspr CTR, r0 ble LL(14) .align 4 LL(12): FMADD xsum1, xtemp1, a1, xsum1 DCBT(AO1, PREA) FMADD y01, atemp1, a1, y01 LFD a1, 4 * SIZE(AO1) FMADD xsum2, xtemp1, a5, xsum2 NOP1 FMADD y02, atemp1, a2, y02 NOP2 FMADD xsum3, xtemp1, a9, xsum3 NOP1 FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp1, a13, xsum4 LFD xtemp1, 4 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp2, a2, xsum1 LFD a2, 5 * SIZE(AO1) FMADD y01, atemp2, a5, y01 LFD a5, 4 * SIZE(AO2) FMADD xsum2, xtemp2, a6, xsum2 NOP1 FMADD y02, atemp2, a6, y02 LFD a6, 5 * SIZE(AO2) FMADD xsum3, xtemp2, a10, xsum3 NOP1 FMADD y03, atemp2, a7, y03 NOP2 FMADD xsum4, xtemp2, a14, xsum4 LFD xtemp2, 5 * SIZE(XX) FMADD y04, atemp2, a8, y04 # DCBT(X, PREX) NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD a3, 6 * SIZE(AO1) FMADD y01, atemp3, a9, y01 LFD a9, 4 * SIZE(AO3) FMADD xsum2, xtemp3, a7, xsum2 LFD a7, 6 * SIZE(AO2) FMADD y02, atemp3, a10, y02 LFD a10, 5 * SIZE(AO3) FMADD xsum3, xtemp3, a11, xsum3 NOP1 FMADD y03, atemp3, a11, y03 LFD a11, 6 * SIZE(AO3) FMADD xsum4, xtemp3, a15, xsum4 LFD xtemp3, 6 * SIZE(XX) FMADD y04, atemp3, a12, y04 NOP2 FMADD xsum1, xtemp4, a4, xsum1 LFD a4, 7 * SIZE(AO1) FMADD y01, atemp4, a13, y01 LFD a13, 4 * SIZE(AO4) FMADD xsum2, xtemp4, a8, xsum2 LFD a8, 7 * SIZE(AO2) FMADD y02, atemp4, a14, y02 LFD a14, 5 * SIZE(AO4) FMADD xsum3, xtemp4, a12, xsum3 LFD a12, 7 * SIZE(AO3) FMADD y03, atemp4, a15, y03 LFD a15, 6 * SIZE(AO4) FMADD xsum4, xtemp4, a16, xsum4 LFD xtemp4, 7 * SIZE(XX) FMADD y04, atemp4, a16, y04 LFD a16, 7 * SIZE(AO4) STFD y01, 0 * SIZE(YY) LFD y01, 4 * SIZE(YY) STFD y02, 1 * SIZE(YY) LFD y02, 5 * SIZE(YY) STFD y03, 2 * SIZE(YY) LFD y03, 6 * SIZE(YY) STFD y04, 3 * SIZE(YY) LFD y04, 7 * SIZE(YY) FMADD xsum1, xtemp1, a1, xsum1 DCBT(AO2, PREA) FMADD y01, atemp1, a1, y01 LFD a1, 8 * SIZE(AO1) FMADD xsum2, xtemp1, a5, xsum2 NOP1 FMADD y02, atemp1, a2, y02 NOP2 FMADD xsum3, xtemp1, a9, xsum3 NOP1 FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp1, a13, xsum4 LFD xtemp1, 8 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp2, a2, xsum1 LFD a2, 9 * SIZE(AO1) FMADD y01, atemp2, a5, y01 LFD a5, 8 * SIZE(AO2) FMADD xsum2, xtemp2, a6, xsum2 NOP1 FMADD y02, atemp2, a6, y02 LFD a6, 9 * SIZE(AO2) FMADD xsum3, xtemp2, a10, xsum3 NOP1 FMADD y03, atemp2, a7, y03 NOP2 FMADD xsum4, xtemp2, a14, xsum4 LFD xtemp2, 9 * SIZE(XX) FMADD y04, atemp2, a8, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD a3, 10 * SIZE(AO1) FMADD y01, atemp3, a9, y01 LFD a9, 8 * SIZE(AO3) FMADD xsum2, xtemp3, a7, xsum2 LFD a7, 10 * SIZE(AO2) FMADD y02, atemp3, a10, y02 LFD a10, 9 * SIZE(AO3) FMADD xsum3, xtemp3, a11, xsum3 NOP1 FMADD y03, atemp3, a11, y03 LFD a11, 10 * SIZE(AO3) FMADD xsum4, xtemp3, a15, xsum4 LFD xtemp3, 10 * SIZE(XX) FMADD y04, atemp3, a12, y04 NOP2 FMADD xsum1, xtemp4, a4, xsum1 LFD a4, 11 * SIZE(AO1) FMADD y01, atemp4, a13, y01 LFD a13, 8 * SIZE(AO4) FMADD xsum2, xtemp4, a8, xsum2 LFD a8, 11 * SIZE(AO2) FMADD y02, atemp4, a14, y02 LFD a14, 9 * SIZE(AO4) FMADD xsum3, xtemp4, a12, xsum3 LFD a12, 11 * SIZE(AO3) FMADD y03, atemp4, a15, y03 LFD a15, 10 * SIZE(AO4) FMADD xsum4, xtemp4, a16, xsum4 LFD xtemp4, 11 * SIZE(XX) FMADD y04, atemp4, a16, y04 LFD a16, 11 * SIZE(AO4) STFD y01, 4 * SIZE(YY) LFD y01, 8 * SIZE(YY) STFD y02, 5 * SIZE(YY) LFD y02, 9 * SIZE(YY) STFD y03, 6 * SIZE(YY) LFD y03, 10 * SIZE(YY) STFD y04, 7 * SIZE(YY) LFD y04, 11 * SIZE(YY) FMADD xsum1, xtemp1, a1, xsum1 DCBT(AO3, PREA) FMADD y01, atemp1, a1, y01 LFD a1, 12 * SIZE(AO1) FMADD xsum2, xtemp1, a5, xsum2 NOP1 FMADD y02, atemp1, a2, y02 NOP2 FMADD 
xsum3, xtemp1, a9, xsum3 NOP1 FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp1, a13, xsum4 LFD xtemp1, 12 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp2, a2, xsum1 LFD a2, 13 * SIZE(AO1) FMADD y01, atemp2, a5, y01 LFD a5, 12 * SIZE(AO2) FMADD xsum2, xtemp2, a6, xsum2 NOP1 FMADD y02, atemp2, a6, y02 LFD a6, 13 * SIZE(AO2) FMADD xsum3, xtemp2, a10, xsum3 NOP1 FMADD y03, atemp2, a7, y03 # DCBT(Y1, PREY) NOP2 FMADD xsum4, xtemp2, a14, xsum4 LFD xtemp2, 13 * SIZE(XX) FMADD y04, atemp2, a8, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD a3, 14 * SIZE(AO1) FMADD y01, atemp3, a9, y01 LFD a9, 12 * SIZE(AO3) FMADD xsum2, xtemp3, a7, xsum2 LFD a7, 14 * SIZE(AO2) FMADD y02, atemp3, a10, y02 LFD a10,13 * SIZE(AO3) FMADD xsum3, xtemp3, a11, xsum3 NOP1 FMADD y03, atemp3, a11, y03 LFD a11, 14 * SIZE(AO3) FMADD xsum4, xtemp3, a15, xsum4 LFD xtemp3, 14 * SIZE(XX) FMADD y04, atemp3, a12, y04 NOP2 FMADD xsum1, xtemp4, a4, xsum1 LFD a4, 15 * SIZE(AO1) FMADD y01, atemp4, a13, y01 LFD a13,12 * SIZE(AO4) FMADD xsum2, xtemp4, a8, xsum2 LFD a8, 15 * SIZE(AO2) FMADD y02, atemp4, a14, y02 LFD a14, 13 * SIZE(AO4) FMADD xsum3, xtemp4, a12, xsum3 LFD a12, 15 * SIZE(AO3) FMADD y03, atemp4, a15, y03 LFD a15, 14 * SIZE(AO4) FMADD xsum4, xtemp4, a16, xsum4 LFD xtemp4, 15 * SIZE(XX) FMADD y04, atemp4, a16, y04 LFD a16, 15 * SIZE(AO4) STFD y01, 8 * SIZE(YY) LFD y01, 12 * SIZE(YY) STFD y02, 9 * SIZE(YY) LFD y02, 13 * SIZE(YY) STFD y03, 10 * SIZE(YY) LFD y03, 14 * SIZE(YY) STFD y04, 11 * SIZE(YY) LFD y04, 15 * SIZE(YY) FMADD xsum1, xtemp1, a1, xsum1 DCBT(AO4, PREA) FMADD y01, atemp1, a1, y01 LFD a1, 16 * SIZE(AO1) FMADD xsum2, xtemp1, a5, xsum2 NOP1 FMADD y02, atemp1, a2, y02 NOP2 FMADD xsum3, xtemp1, a9, xsum3 NOP1 FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp1, a13, xsum4 LFD xtemp1, 16 * SIZE(XX) FMADD y04, atemp1, a4, y04 addi YY, YY, 16 * SIZE FMADD xsum1, xtemp2, a2, xsum1 LFD a2, 17 * SIZE(AO1) FMADD y01, atemp2, a5, y01 LFD a5, 16 * SIZE(AO2) FMADD xsum2, xtemp2, a6, xsum2 addi AO3, AO3, 16 * SIZE FMADD y02, atemp2, a6, y02 LFD a6, 17 * SIZE(AO2) FMADD xsum3, xtemp2, a10, xsum3 addi AO1, AO1, 16 * SIZE FMADD y03, atemp2, a7, y03 addi AO2, AO2, 16 * SIZE FMADD xsum4, xtemp2, a14, xsum4 LFD xtemp2, 17 * SIZE(XX) FMADD y04, atemp2, a8, y04 addi AO4, AO4, 16 * SIZE FMADD xsum1, xtemp3, a3, xsum1 LFD a3, 2 * SIZE(AO1) FMADD y01, atemp3, a9, y01 LFD a9, 0 * SIZE(AO3) FMADD xsum2, xtemp3, a7, xsum2 LFD a7, 2 * SIZE(AO2) FMADD y02, atemp3, a10, y02 LFD a10, 1 * SIZE(AO3) FMADD xsum3, xtemp3, a11, xsum3 NOP1 FMADD y03, atemp3, a11, y03 LFD a11, 2 * SIZE(AO3) FMADD xsum4, xtemp3, a15, xsum4 LFD xtemp3, 18 * SIZE(XX) FMADD y04, atemp3, a12, y04 addi XX, XX, 16 * SIZE FMADD xsum1, xtemp4, a4, xsum1 LFD a4, 3 * SIZE(AO1) FMADD y01, atemp4, a13, y01 LFD a13, 0 * SIZE(AO4) FMADD xsum2, xtemp4, a8, xsum2 LFD a8, 3 * SIZE(AO2) FMADD y02, atemp4, a14, y02 LFD a14, 1 * SIZE(AO4) FMADD xsum3, xtemp4, a12, xsum3 LFD a12, 3 * SIZE(AO3) FMADD y03, atemp4, a15, y03 LFD a15, 2 * SIZE(AO4) FMADD xsum4, xtemp4, a16, xsum4 LFD xtemp4, 3 * SIZE(XX) FMADD y04, atemp4, a16, y04 LFD a16, 3 * SIZE(AO4) STFD y01, -4 * SIZE(YY) LFD y01, 0 * SIZE(YY) STFD y02, -3 * SIZE(YY) LFD y02, 1 * SIZE(YY) STFD y03, -2 * SIZE(YY) LFD y03, 2 * SIZE(YY) STFD y04, -1 * SIZE(YY) LFD y04, 3 * SIZE(YY) bdnz LL(12) .align 4 LL(14): andi. 
r0, IS, 8 ble LL(15) FMADD xsum1, xtemp1, a1, xsum1 NOP1 FMADD y01, atemp1, a1, y01 LFD a1, 4 * SIZE(AO1) FMADD xsum2, xtemp1, a5, xsum2 NOP1 FMADD y02, atemp1, a2, y02 NOP2 FMADD xsum3, xtemp1, a9, xsum3 NOP1 FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp1, a13, xsum4 LFD xtemp1, 4 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp2, a2, xsum1 LFD a2, 5 * SIZE(AO1) FMADD y01, atemp2, a5, y01 LFD a5, 4 * SIZE(AO2) FMADD xsum2, xtemp2, a6, xsum2 NOP1 FMADD y02, atemp2, a6, y02 LFD a6, 5 * SIZE(AO2) FMADD xsum3, xtemp2, a10, xsum3 NOP1 FMADD y03, atemp2, a7, y03 NOP2 FMADD xsum4, xtemp2, a14, xsum4 LFD xtemp2, 5 * SIZE(XX) FMADD y04, atemp2, a8, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD a3, 6 * SIZE(AO1) FMADD y01, atemp3, a9, y01 LFD a9, 4 * SIZE(AO3) FMADD xsum2, xtemp3, a7, xsum2 LFD a7, 6 * SIZE(AO2) FMADD y02, atemp3, a10, y02 LFD a10, 5 * SIZE(AO3) FMADD xsum3, xtemp3, a11, xsum3 NOP1 FMADD y03, atemp3, a11, y03 LFD a11, 6 * SIZE(AO3) FMADD xsum4, xtemp3, a15, xsum4 LFD xtemp3, 6 * SIZE(XX) FMADD y04, atemp3, a12, y04 NOP2 FMADD xsum1, xtemp4, a4, xsum1 LFD a4, 7 * SIZE(AO1) FMADD y01, atemp4, a13, y01 LFD a13, 4 * SIZE(AO4) FMADD xsum2, xtemp4, a8, xsum2 LFD a8, 7 * SIZE(AO2) FMADD y02, atemp4, a14, y02 LFD a14, 5 * SIZE(AO4) FMADD xsum3, xtemp4, a12, xsum3 LFD a12, 7 * SIZE(AO3) FMADD y03, atemp4, a15, y03 LFD a15, 6 * SIZE(AO4) FMADD xsum4, xtemp4, a16, xsum4 LFD xtemp4, 7 * SIZE(XX) FMADD y04, atemp4, a16, y04 LFD a16, 7 * SIZE(AO4) STFD y01, 0 * SIZE(YY) LFD y01, 4 * SIZE(YY) STFD y02, 1 * SIZE(YY) LFD y02, 5 * SIZE(YY) STFD y03, 2 * SIZE(YY) LFD y03, 6 * SIZE(YY) STFD y04, 3 * SIZE(YY) LFD y04, 7 * SIZE(YY) FMADD xsum1, xtemp1, a1, xsum1 NOP1 FMADD y01, atemp1, a1, y01 LFD a1, 8 * SIZE(AO1) FMADD xsum2, xtemp1, a5, xsum2 NOP1 FMADD y02, atemp1, a2, y02 NOP2 FMADD xsum3, xtemp1, a9, xsum3 NOP1 FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp1, a13, xsum4 LFD xtemp1, 8 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp2, a2, xsum1 LFD a2, 9 * SIZE(AO1) FMADD y01, atemp2, a5, y01 LFD a5, 8 * SIZE(AO2) FMADD xsum2, xtemp2, a6, xsum2 NOP1 FMADD y02, atemp2, a6, y02 LFD a6, 9 * SIZE(AO2) FMADD xsum3, xtemp2, a10, xsum3 NOP1 FMADD y03, atemp2, a7, y03 NOP2 FMADD xsum4, xtemp2, a14, xsum4 LFD xtemp2, 9 * SIZE(XX) FMADD y04, atemp2, a8, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD a3, 10 * SIZE(AO1) FMADD y01, atemp3, a9, y01 LFD a9, 8 * SIZE(AO3) FMADD xsum2, xtemp3, a7, xsum2 LFD a7, 10 * SIZE(AO2) FMADD y02, atemp3, a10, y02 LFD a10, 9 * SIZE(AO3) FMADD xsum3, xtemp3, a11, xsum3 NOP1 FMADD y03, atemp3, a11, y03 LFD a11, 10 * SIZE(AO3) FMADD xsum4, xtemp3, a15, xsum4 LFD xtemp3, 10 * SIZE(XX) FMADD y04, atemp3, a12, y04 NOP2 FMADD xsum1, xtemp4, a4, xsum1 LFD a4, 11 * SIZE(AO1) FMADD y01, atemp4, a13, y01 LFD a13, 8 * SIZE(AO4) FMADD xsum2, xtemp4, a8, xsum2 LFD a8, 11 * SIZE(AO2) FMADD y02, atemp4, a14, y02 LFD a14, 9 * SIZE(AO4) FMADD xsum3, xtemp4, a12, xsum3 LFD a12, 11 * SIZE(AO3) FMADD y03, atemp4, a15, y03 LFD a15, 10 * SIZE(AO4) FMADD xsum4, xtemp4, a16, xsum4 LFD xtemp4, 11 * SIZE(XX) FMADD y04, atemp4, a16, y04 LFD a16, 11 * SIZE(AO4) addi AO1, AO1, 8 * SIZE addi AO2, AO2, 8 * SIZE addi AO3, AO3, 8 * SIZE addi AO4, AO4, 8 * SIZE STFD y01, 4 * SIZE(YY) LFD y01, 8 * SIZE(YY) STFD y02, 5 * SIZE(YY) LFD y02, 9 * SIZE(YY) STFD y03, 6 * SIZE(YY) LFD y03, 10 * SIZE(YY) STFD y04, 7 * SIZE(YY) LFD y04, 11 * SIZE(YY) addi XX, XX, 8 * SIZE addi YY, YY, 8 * SIZE .align 4 LL(15): andi. 
r0, IS, 4 ble LL(18) FMADD xsum1, xtemp1, a1, xsum1 NOP1 FMADD y01, atemp1, a1, y01 LFD a1, 4 * SIZE(AO1) FMADD xsum2, xtemp1, a5, xsum2 NOP1 FMADD y02, atemp1, a2, y02 NOP2 FMADD xsum3, xtemp1, a9, xsum3 NOP1 FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp1, a13, xsum4 LFD xtemp1, 4 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp2, a2, xsum1 LFD a2, 5 * SIZE(AO1) FMADD y01, atemp2, a5, y01 LFD a5, 4 * SIZE(AO2) FMADD xsum2, xtemp2, a6, xsum2 NOP1 FMADD y02, atemp2, a6, y02 LFD a6, 5 * SIZE(AO2) FMADD xsum3, xtemp2, a10, xsum3 NOP1 FMADD y03, atemp2, a7, y03 NOP2 FMADD xsum4, xtemp2, a14, xsum4 LFD xtemp2, 5 * SIZE(XX) FMADD y04, atemp2, a8, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD a3, 6 * SIZE(AO1) FMADD y01, atemp3, a9, y01 LFD a9, 4 * SIZE(AO3) FMADD xsum2, xtemp3, a7, xsum2 LFD a7, 6 * SIZE(AO2) FMADD y02, atemp3, a10, y02 LFD a10, 5 * SIZE(AO3) FMADD xsum3, xtemp3, a11, xsum3 NOP1 FMADD y03, atemp3, a11, y03 LFD a11, 6 * SIZE(AO3) FMADD xsum4, xtemp3, a15, xsum4 LFD xtemp3, 6 * SIZE(XX) FMADD y04, atemp3, a12, y04 NOP2 FMADD xsum1, xtemp4, a4, xsum1 LFD a4, 7 * SIZE(AO1) FMADD y01, atemp4, a13, y01 LFD a13, 4 * SIZE(AO4) FMADD xsum2, xtemp4, a8, xsum2 LFD a8, 7 * SIZE(AO2) FMADD y02, atemp4, a14, y02 LFD a14, 5 * SIZE(AO4) FMADD xsum3, xtemp4, a12, xsum3 LFD a12, 7 * SIZE(AO3) FMADD y03, atemp4, a15, y03 LFD a15, 6 * SIZE(AO4) FMADD xsum4, xtemp4, a16, xsum4 LFD xtemp4, 7 * SIZE(XX) FMADD y04, atemp4, a16, y04 LFD a16, 7 * SIZE(AO4) addi AO1, AO1, 4 * SIZE addi AO2, AO2, 4 * SIZE addi AO3, AO3, 4 * SIZE addi AO4, AO4, 4 * SIZE STFD y01, 0 * SIZE(YY) LFD y01, 4 * SIZE(YY) STFD y02, 1 * SIZE(YY) LFD y02, 5 * SIZE(YY) STFD y03, 2 * SIZE(YY) LFD y03, 6 * SIZE(YY) STFD y04, 3 * SIZE(YY) LFD y04, 7 * SIZE(YY) addi XX, XX, 4 * SIZE addi YY, YY, 4 * SIZE .align 4 LL(18): LFD xtemp1, ALPHA FMUL xsum1, xtemp1, xsum1 FMUL xsum2, xtemp1, xsum2 FMUL xsum3, xtemp1, xsum3 FMUL xsum4, xtemp1, xsum4 FMADD xsum1, atemp1, a1, xsum1 FMADD xsum2, atemp1, a5, xsum2 FMADD xsum3, atemp1, a9, xsum3 FMADD xsum4, atemp1, a13, xsum4 FMADD xsum1, atemp2, a5, xsum1 FMADD xsum2, atemp2, a6, xsum2 FMADD xsum3, atemp2, a10, xsum3 FMADD xsum4, atemp2, a14, xsum4 FMADD xsum1, atemp3, a9, xsum1 FMADD xsum2, atemp3, a10, xsum2 FMADD xsum3, atemp3, a11, xsum3 FMADD xsum4, atemp3, a15, xsum4 FMADD xsum1, atemp4, a13, xsum1 FMADD xsum2, atemp4, a14, xsum2 FMADD xsum3, atemp4, a15, xsum3 FMADD xsum4, atemp4, a16, xsum4 FADD y01, y01, xsum1 FADD y02, y02, xsum2 FADD y03, y03, xsum3 FADD y04, y04, xsum4 STFD y01, 0 * SIZE(YY) STFD y02, 1 * SIZE(YY) STFD y03, 2 * SIZE(YY) STFD y04, 3 * SIZE(YY) addi TEMP, IS, 8 addi IS, IS, 4 cmpw cr0, TEMP, M ble LL(11) .align 4 LL(20): andi. TEMP, M, 2 ble LL(30) mr AO1, A add AO2, A, LDA add A, AO2, LDA slwi TEMP, IS, BASE_SHIFT add TEMP, X, TEMP LFD atemp1, 0 * SIZE(TEMP) LFD atemp2, 1 * SIZE(TEMP) LFD a1, ALPHA FMUL atemp1, a1, atemp1 FMUL atemp2, a1, atemp2 lfd xsum1, FZERO fmr xsum2, xsum1 mr XX, X mr YY, NEW_Y LFD xtemp1, 0 * SIZE(XX) LFD xtemp2, 1 * SIZE(XX) LFD y01, 0 * SIZE(YY) LFD y02, 1 * SIZE(YY) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a5, 0 * SIZE(AO2) LFD a6, 1 * SIZE(AO2) srawi. 
r0, IS, 1 mtspr CTR, r0 ble LL(28) .align 4 LL(22): FMADD xsum1, xtemp1, a1, xsum1 FMADD xsum2, xtemp1, a5, xsum2 FMADD xsum1, xtemp2, a2, xsum1 FMADD xsum2, xtemp2, a6, xsum2 FMADD y01, atemp1, a1, y01 FMADD y02, atemp1, a2, y02 FMADD y01, atemp2, a5, y01 FMADD y02, atemp2, a6, y02 LFD xtemp1, 2 * SIZE(XX) LFD xtemp2, 3 * SIZE(XX) LFD a1, 2 * SIZE(AO1) LFD a2, 3 * SIZE(AO1) LFD a5, 2 * SIZE(AO2) LFD a6, 3 * SIZE(AO2) STFD y01, 0 * SIZE(YY) STFD y02, 1 * SIZE(YY) LFD y01, 2 * SIZE(YY) LFD y02, 3 * SIZE(YY) addi AO1, AO1, 2 * SIZE addi AO2, AO2, 2 * SIZE addi XX, XX, 2 * SIZE addi YY, YY, 2 * SIZE bdnz LL(22) .align 4 LL(28): LFD xtemp1, ALPHA FMUL xsum1, xtemp1, xsum1 FMUL xsum2, xtemp1, xsum2 FMADD xsum1, atemp1, a1, xsum1 FMADD xsum2, atemp1, a5, xsum2 FMADD xsum1, atemp2, a5, xsum1 FMADD xsum2, atemp2, a6, xsum2 FADD y01, y01, xsum1 FADD y02, y02, xsum2 STFD y01, 0 * SIZE(YY) STFD y02, 1 * SIZE(YY) addi IS, IS, 2 .align 4 LL(30): andi. TEMP, M, 1 ble LL(990) mr AO1, A slwi TEMP, IS, BASE_SHIFT add TEMP, X, TEMP LFD atemp1, 0 * SIZE(TEMP) LFD a1, ALPHA FMUL atemp1, a1, atemp1 lfd xsum1, FZERO mr XX, X mr YY, NEW_Y LFD xtemp1, 0 * SIZE(XX) LFD y01, 0 * SIZE(YY) LFD a1, 0 * SIZE(AO1) mtspr CTR, IS cmpwi cr0, IS, 0 ble LL(38) .align 4 LL(32): FMADD xsum1, xtemp1, a1, xsum1 FMADD y01, atemp1, a1, y01 LFD xtemp1, 1 * SIZE(XX) LFD a1, 1 * SIZE(AO1) STFD y01, 0 * SIZE(YY) LFD y01, 1 * SIZE(YY) addi AO1, AO1, 1 * SIZE addi XX, XX, 1 * SIZE addi YY, YY, 1 * SIZE bdnz LL(32) .align 4 LL(38): LFD xtemp1, ALPHA FMUL xsum1, xtemp1, xsum1 FMADD xsum1, atemp1, a1, xsum1 FADD y01, y01, xsum1 STFD y01, 0 * SIZE(YY) .align 4 LL(990): cmpwi cr0, INCY, SIZE beq LL(999) mr YY, Y srawi. r0, M, 3 mtspr CTR, r0 ble LL(995) .align 4 LL(991): LFD f0, 0 * SIZE(Y) add Y, Y, INCY LFD f1, 0 * SIZE(Y) add Y, Y, INCY LFD f2, 0 * SIZE(Y) add Y, Y, INCY LFD f3, 0 * SIZE(Y) add Y, Y, INCY LFD f4, 0 * SIZE(Y) add Y, Y, INCY LFD f5, 0 * SIZE(Y) add Y, Y, INCY LFD f6, 0 * SIZE(Y) add Y, Y, INCY LFD f7, 0 * SIZE(Y) add Y, Y, INCY LFD f8, 0 * SIZE(NEW_Y) LFD f9, 1 * SIZE(NEW_Y) LFD f10, 2 * SIZE(NEW_Y) LFD f11, 3 * SIZE(NEW_Y) LFD f12, 4 * SIZE(NEW_Y) LFD f13, 5 * SIZE(NEW_Y) LFD f14, 6 * SIZE(NEW_Y) LFD f15, 7 * SIZE(NEW_Y) addi NEW_Y, NEW_Y, 8 * SIZE FADD f8, f8, f0 FADD f9, f9, f1 FADD f10, f10, f2 FADD f11, f11, f3 FADD f12, f12, f4 FADD f13, f13, f5 FADD f14, f14, f6 FADD f15, f15, f7 STFD f8, 0 * SIZE(YY) add YY, YY, INCY STFD f9, 0 * SIZE(YY) add YY, YY, INCY STFD f10, 0 * SIZE(YY) add YY, YY, INCY STFD f11, 0 * SIZE(YY) add YY, YY, INCY STFD f12, 0 * SIZE(YY) add YY, YY, INCY STFD f13, 0 * SIZE(YY) add YY, YY, INCY STFD f14, 0 * SIZE(YY) add YY, YY, INCY STFD f15, 0 * SIZE(YY) add YY, YY, INCY bdnz LL(991) .align 4 LL(995): andi. J, M, 4 ble LL(996) LFD f0, 0 * SIZE(Y) add Y, Y, INCY LFD f1, 0 * SIZE(Y) add Y, Y, INCY LFD f2, 0 * SIZE(Y) add Y, Y, INCY LFD f3, 0 * SIZE(Y) add Y, Y, INCY LFD f8, 0 * SIZE(NEW_Y) LFD f9, 1 * SIZE(NEW_Y) LFD f10, 2 * SIZE(NEW_Y) LFD f11, 3 * SIZE(NEW_Y) addi NEW_Y, NEW_Y, 4 * SIZE FADD f8, f8, f0 FADD f9, f9, f1 FADD f10, f10, f2 FADD f11, f11, f3 STFD f8, 0 * SIZE(YY) add YY, YY, INCY STFD f9, 0 * SIZE(YY) add YY, YY, INCY STFD f10, 0 * SIZE(YY) add YY, YY, INCY STFD f11, 0 * SIZE(YY) add YY, YY, INCY .align 4 LL(996): andi. 
J, M, 2 ble LL(997) LFD f0, 0 * SIZE(Y) add Y, Y, INCY LFD f1, 0 * SIZE(Y) add Y, Y, INCY LFD f8, 0 * SIZE(NEW_Y) LFD f9, 1 * SIZE(NEW_Y) addi NEW_Y, NEW_Y, 2 * SIZE FADD f8, f8, f0 FADD f9, f9, f1 STFD f8, 0 * SIZE(YY) add YY, YY, INCY STFD f9, 0 * SIZE(YY) add YY, YY, INCY .align 4 LL(997): andi. J, M, 1 ble LL(999) LFD f0, 0 * SIZE(Y) LFD f8, 0 * SIZE(NEW_Y) FADD f8, f8, f0 STFD f8, 0 * SIZE(YY) .align 4 LL(999): li r3, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r14, 144(SP) ld r15, 152(SP) ld r16, 160(SP) ld r17, 168(SP) ld r18, 176(SP) ld r19, 184(SP) ld r20, 192(SP) ld r21, 200(SP) ld r22, 208(SP) ld r23, 216(SP) ld r24, 224(SP) ld r25, 232(SP) ld r26, 240(SP) ld r27, 248(SP) #else lwz r14, 144(SP) lwz r15, 148(SP) lwz r16, 152(SP) lwz r17, 156(SP) lwz r18, 160(SP) lwz r19, 164(SP) lwz r20, 168(SP) lwz r21, 172(SP) lwz r22, 176(SP) lwz r23, 180(SP) lwz r24, 184(SP) lwz r25, 188(SP) lwz r26, 192(SP) lwz r27, 196(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/trsm_kernel_LN.S000066400000000000000000001711461313527062700206340ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define AORIG r18 #define TEMP r19 #define KK r20 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define CO3 r27 #define CO4 r28 #define PREA r29 #define PREB r30 #define PREC r31 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) stw r18, 196(SP) #endif stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef LN mullw r0, M, K slwi r0, r0, BASE_SHIFT add A, A, r0 slwi r0, M, BASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, BASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) #ifndef PREFETCHTEST #ifdef LN li PREC, -4 * SIZE #else li PREC, 4 * SIZE #endif #else #ifdef linux #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) lwz PREC, FRAMESLOT(1) + STACKSIZE(SP) #else ld PREA, FRAMESLOT(0) + STACKSIZE(SP) ld PREB, FRAMESLOT(1) + STACKSIZE(SP) ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld PREA, FRAMESLOT(0) + STACKSIZE(SP) ld PREB, FRAMESLOT(1) + STACKSIZE(SP) ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #else #ifdef DOUBLE lwz PREA, FRAMESLOT(1) + STACKSIZE(SP) lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else lwz PREA, FRAMESLOT(0) + STACKSIZE(SP) lwz PREB, FRAMESLOT(1) + STACKSIZE(SP) lwz PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #endif 
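/* Prefetch distance setup (descriptive note): when PREFETCHTEST is defined,
   PREA/PREB/PREC are read from the extra stack arguments above (FRAMESLOT)
   so the distances can be tuned at run time; otherwise fixed immediates are
   used instead -- PREC was set above, and PREA/PREB are loaded with
   CPU-specific defaults in the #ifndef PREFETCHTEST block that follows
   (PPC970, POWER4, POWER5). */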
#endif #ifndef PREFETCHTEST #ifdef PPC970 #ifdef ALLOC_HUGETLB li PREA, (16 * 5 * SIZE | 1) li PREB, (16 * 5 * SIZE | 3) #else li PREA, (16 * 14 * SIZE | 1) li PREB, (16 * 8 * SIZE | 3) #endif #endif #ifdef POWER4 #ifdef ALLOC_HUGETLB li PREA, (16 * 1 * SIZE + 16) li PREB, (16 * 1 * SIZE + 16) #else li PREA, (16 * 2 * SIZE + 16) li PREB, (16 * 2 * SIZE + 16) #endif #endif #ifdef POWER5 #ifdef ALLOC_HUGETLB li PREA, (16 * 7 * SIZE | 1) li PREB, (16 * 7 * SIZE | 3) #else li PREA, (16 * 12 * SIZE | 1) li PREB, (16 * 6 * SIZE | 3) #endif #endif #endif lfs f0, FZERO srawi. J, N, 2 ble LL(40) .align 4 LL(10): #ifdef RT slwi r0, K, 2 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 2 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO4, LDC #endif LL(30): andi. I, M, 1 ble LL(20) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(35) .align 5 LL(32): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f1, f17, f24, f1 FMADD f5, f17, f25, f5 FMADD f9, f17, f26, f9 FMADD f13, f17, f27, f13 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f0, f18, f20, f0 FMADD f4, f18, f21, f4 FMADD f8, f18, f22, f8 FMADD f12, f18, f23, f12 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f1, f19, f24, f1 FMADD f5, f19, f25, f5 FMADD f9, f19, f26, f9 FMADD f13, f19, f27, f13 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 16 * SIZE DCBT(BO, PREB) bdnz LL(32) fadd f0, f1, f0 fadd f4, f5, f4 fadd f8, f9, f8 fadd f12, f13, f12 .align 4 LL(35): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(38) .align 4 LL(36): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f16, 1 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 1 * SIZE bdnz LL(36) .align 4 LL(38): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 4 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) LFD f24, 2 * SIZE(AO) LFD f28, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f4, f20, f4 FSUB f8, f24, f8 FSUB f12, f28, f12 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f4, f17, f0, f4 FNMSUB f8, f18, f0, f8 FNMSUB f12, f19, f0, f12 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FNMSUB f8, f17, f4, f8 FNMSUB f12, f18, f4, f12 FMUL f8, f19, f8 FNMSUB f12, f20, f8, f12 FMUL f12, f21, f12 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FNMSUB f8, f17, f12, f8 FNMSUB f4, f18, f12, f4 FNMSUB f0, f19, f12, f0 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f0, f18, f8, f0 FMUL f4, f19, f4 FNMSUB f0, f20, f4, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE subi CO3, CO3, 1 * SIZE subi CO4, CO4, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f4, 1 * SIZE(AO) STFD f8, 2 * SIZE(AO) STFD f12, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 fmr f8, f0 fmr f9, f0 fmr f12, f0 fmr f13, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE addi CO3, CO3, 1 * SIZE addi CO4, CO4, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(20): andi. I, M, 2 ble LL(09) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. 
r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(25) .align 5 LL(22): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 16 * SIZE DCBT(BO, PREB) bdnz LL(22) fadd f0, f2, f0 fadd f1, f3, f1 fadd f4, f6, f4 fadd f5, f7, f5 fadd f8, f10, f8 fadd f9, f11, f9 fadd f12, f14, f12 fadd f13, f15, f13 .align 4 LL(25): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(28) .align 4 LL(26): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 2 * SIZE bdnz LL(26) .align 4 LL(28): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 4 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f28, 6 * SIZE(AO) LFD f29, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f12, f28, f12 FSUB f13, f29, f13 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, f5 FMUL f9, f17, f9 FMUL f13, f17, f13 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FMUL f8, f19, f8 FMUL f9, f19, f9 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FMUL f12, f21, f12 FMUL f13, f21, f13 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FMUL f4, f19, f4 FMUL f5, f19, f5 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE subi CO3, CO3, 2 * SIZE subi CO4, CO4, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * 
SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f4, 2 * SIZE(AO) STFD f5, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f12, 6 * SIZE(AO) STFD f13, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(09): srawi. I, M, 2 ble LL(39) .align 4 LL(11): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC dcbt CO2, PREC dcbt CO3, PREC dcbt CO4, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbt CO1, PREC dcbt CO2, PREC dcbt CO3, PREC dcbt CO4, PREC srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(15) .align 4 LL(12): FMADD f0, f16, f20, f0 FMADD f5, f17, f21, f5 FMADD f10, f18, f22, f10 FMADD f15, f19, f23, f15 LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) LFD f31, 7 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f11, f19, f22, f11 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 FMADD f14, f18, f23, f14 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f24, f28, f0 FMADD f5, f25, f29, f5 FMADD f10, f26, f30, f10 FMADD f15, f27, f31, f15 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMADD f1, f25, f28, f1 FMADD f2, f26, f28, f2 FMADD f3, f27, f28, f3 FMADD f4, f24, f29, f4 FMADD f6, f26, f29, f6 FMADD f7, f27, f29, f7 FMADD f8, f24, f30, f8 FMADD f9, f25, f30, f9 FMADD f11, f27, f30, f11 FMADD f12, f24, f31, f12 FMADD f13, f25, f31, f13 FMADD f14, f26, f31, f14 LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f5, f17, f21, f5 FMADD f10, f18, f22, f10 FMADD f15, f19, f23, f15 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f11, f19, f22, f11 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 FMADD f14, f18, f23, f14 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) 
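/* Second half of the 4x-unrolled LL(12) loop: two more rank-1 updates of the
   4x4 accumulators f0-f15 follow, after which AO and BO advance by 16 * SIZE
   and DCBT (on PPC970/POWER4/POWER5) prefetches the next part of the packed
   A and B panels. */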
LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f0, f24, f28, f0 FMADD f5, f25, f29, f5 FMADD f10, f26, f30, f10 FMADD f15, f27, f31, f15 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) FMADD f1, f25, f28, f1 FMADD f2, f26, f28, f2 FMADD f3, f27, f28, f3 FMADD f4, f24, f29, f4 FMADD f6, f26, f29, f6 FMADD f7, f27, f29, f7 FMADD f8, f24, f30, f8 FMADD f9, f25, f30, f9 FMADD f11, f27, f30, f11 FMADD f12, f24, f31, f12 FMADD f13, f25, f31, f13 FMADD f14, f26, f31, f14 addi AO, AO, 16 * SIZE addi BO, BO, 16 * SIZE #ifdef PPC970 #ifndef ALLOC_HUGETLB DCBT(AO, PREA) #endif DCBT(BO, PREB) #endif #ifdef POWER4 #ifndef ALLOC_HUGETLB DCBT(AO, PREA) #endif DCBT(BO, PREB) #endif #ifdef POWER5 DCBT(AO, PREA) DCBT(BO, PREB) #endif bdnz LL(12) .align 4 LL(15): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(18) .align 4 LL(16): FMADD f0, f16, f20, f0 FMADD f5, f17, f21, f5 FMADD f10, f18, f22, f10 FMADD f15, f19, f23, f15 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f11, f19, f22, f11 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 FMADD f14, f18, f23, f14 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(16) .align 4 LL(18): #if defined(LN) || defined(RT) subi r0, KK, 4 slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f24, 8 * SIZE(BO) LFD f25, 9 * SIZE(BO) LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f10, f26, f10 FSUB f14, f27, f14 FSUB f3, f28, f3 FSUB f7, f29, f7 FSUB f11, f30, f11 FSUB f15, f31, f15 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f24, 8 * SIZE(AO) LFD f25, 9 * SIZE(AO) LFD f26, 10 * SIZE(AO) LFD f27, 11 * SIZE(AO) LFD f28, 12 * SIZE(AO) LFD f29, 13 * SIZE(AO) LFD f30, 14 * SIZE(AO) LFD f31, 15 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f10, f26, f10 FSUB f11, f27, f11 FSUB f12, f28, f12 FSUB f13, f29, f13 FSUB f14, f30, f14 FSUB f15, f31, f15 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FMUL f11, f16, f11 FMUL f15, f16, f15 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f10, f17, f11, f10 FNMSUB f14, f17, f15, f14 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f9, f18, f11, f9 FNMSUB f13, f18, f15, f13 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 FNMSUB f8, f19, f11, f8 FNMSUB f12, f19, f15, f12 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) FMUL 
f2, f16, f2 FMUL f6, f16, f6 FMUL f10, f16, f10 FMUL f14, f16, f14 LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f9, f17, f10, f9 FNMSUB f13, f17, f14, f13 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FNMSUB f8, f18, f10, f8 FNMSUB f12, f18, f14, f12 FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB f10, f18, f8, f10 FNMSUB f14, f18, f12, f14 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 FNMSUB f11, f19, f8, f11 FNMSUB f15, f19, f12, f15 LFD f16, 5 * SIZE(AO) LFD f17, 6 * SIZE(AO) LFD f18, 7 * SIZE(AO) LFD f19, 10 * SIZE(AO) FMUL f1, f16, f1 FMUL f5, f16, f5 FMUL f9, f16, f9 FMUL f13, f16, f13 LFD f20, 11 * SIZE(AO) LFD f21, 15 * SIZE(AO) FNMSUB f2, f17, f1, f2 FNMSUB f6, f17, f5, f6 FNMSUB f10, f17, f9, f10 FNMSUB f14, f17, f13, f14 FNMSUB f3, f18, f1, f3 FNMSUB f7, f18, f5, f7 FNMSUB f11, f18, f9, f11 FNMSUB f15, f18, f13, f15 FMUL f2, f19, f2 FMUL f6, f19, f6 FMUL f10, f19, f10 FMUL f14, f19, f14 FNMSUB f3, f20, f2, f3 FNMSUB f7, f20, f6, f7 FNMSUB f11, f20, f10, f11 FNMSUB f15, f20, f14, f15 FMUL f3, f21, f3 FMUL f7, f21, f7 FMUL f11, f21, f11 FMUL f15, f21, f15 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 FNMSUB f14, f19, f2, f14 FNMSUB f15, f19, f3, f15 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FMUL f6, f16, f6 FMUL f7, f16, f7 LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f10, f17, f6, f10 FNMSUB f11, f17, f7, f11 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FNMSUB f14, f18, f6, f14 FNMSUB f15, f18, f7, f15 FMUL f8, f19, f8 FMUL f9, f19, f9 FMUL f10, f19, f10 FMUL f11, f19, f11 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FNMSUB f14, f20, f10, f14 FNMSUB f15, f20, f11, f15 FMUL f12, f21, f12 FMUL f13, f21, f13 FMUL f14, f21, f14 FMUL f15, f21, f15 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FMUL f14, f16, f14 FMUL f15, f16, f15 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f10, f17, f14, f10 FNMSUB f11, f17, f15, f11 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f6, f18, f14, f6 FNMSUB f7, f18, f15, f7 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 FNMSUB f2, f19, f14, f2 FNMSUB f3, f19, f15, f3 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FMUL f10, f16, f10 FMUL f11, f16, f11 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f6, f17, f10, f6 FNMSUB f7, f17, f11, f7 FNMSUB f0, f18, f8, f0 
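/* RT path for the 4x4 block: back-substitution against the triangular 4x4
   block of the packed B panel, working from the last set of accumulators
   (f12-f15, written to CO4) back to the first (f0-f3, written to CO1).
   The diagonal entries are expected to be stored as reciprocals by the
   packing step, which is why FMUL takes the place of a divide here. */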
FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE subi CO3, CO3, 4 * SIZE subi CO4, CO4, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) STFD f2, 8 * SIZE(BO) STFD f6, 9 * SIZE(BO) STFD f10, 10 * SIZE(BO) STFD f14, 11 * SIZE(BO) STFD f3, 12 * SIZE(BO) STFD f7, 13 * SIZE(BO) STFD f11, 14 * SIZE(BO) STFD f15, 15 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) STFD f8, 8 * SIZE(AO) STFD f9, 9 * SIZE(AO) STFD f10, 10 * SIZE(AO) STFD f11, 11 * SIZE(AO) STFD f12, 12 * SIZE(AO) STFD f13, 13 * SIZE(AO) STFD f14, 14 * SIZE(AO) STFD f15, 15 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f10, 2 * SIZE(CO3) STFD f11, 3 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) STFD f14, 2 * SIZE(CO4) STFD f15, 3 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(11) .align 4 LL(39): #ifdef LN slwi r0, K, 2 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 4 #endif #ifdef RT subi KK, KK, 4 #endif addic. J, J, -1 lfs f0, FZERO bgt LL(10) .align 4 LL(40): andi. J, N, 2 ble LL(70) #ifdef RT slwi r0, K, 1 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif LL(60): andi. I, M, 1 ble LL(50) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. 
r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(65) .align 5 LL(62): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f17, f22, f2 FMADD f3, f17, f23, f3 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f19, f26, f2 FMADD f3, f19, f27, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(62) .align 4 LL(65): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(68) .align 4 LL(66): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 LFD f16, 1 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 1 * SIZE bdnz LL(66) .align 4 LL(68): FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f20, f1 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FMUL f1, f18, f1 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(50): andi. I, M, 2 ble LL(41) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. 
r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(55) .align 5 LL(52): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 FMADD f4, f18, f22, f4 FMADD f5, f19, f22, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f16, f24, f0 FMADD f1, f17, f24, f1 FMADD f2, f16, f25, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f26, f4 FMADD f5, f19, f26, f5 FMADD f6, f18, f27, f6 FMADD f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE DCBT(BO, PREB) bdnz LL(52) .align 4 LL(55): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(58) .align 4 LL(56): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 2 * SIZE bdnz LL(56) .align 4 LL(58): FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 2 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f2, f17, f2 FSUB f1, f20, f1 FSUB f3, f21, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f3, f19, f3 FNMSUB f0, f20, f1, f0 FNMSUB f2, f20, f3, f2 FMUL f0, f21, f0 FMUL f2, f21, f2 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f2, f16, f2 FNMSUB f1, f17, f0, f1 FNMSUB f3, f17, f2, f3 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f3, f17, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f2, f17, f0, f2 FNMSUB f3, f17, f1, f3 FMUL f2, f18, f2 FMUL f3, f18, f3 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f2, f19, f2 FMUL f3, f19, f3 FNMSUB f0, f20, f2, f0 FNMSUB f1, f20, f3, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f2, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr 
f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(41): srawi. I, M, 2 ble LL(69) .align 4 LL(42): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC dcbt CO2, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbt CO1, PREC dcbt CO2, PREC srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(45) .align 5 LL(43): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE DCBT(BO, PREB) bdnz LL(43) .align 4 LL(45): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(48) .align 4 LL(46): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(46) .align 4 LL(48): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 2 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f28, 6 * SIZE(BO) LFD f29, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f3, f28, f3 FSUB f7, f29, f7 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FMUL f1, f19, f1 FMUL f5, f19, f5 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FMUL f0, f21, f0 FMUL f4, f21, f4 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, f5 FNMSUB f2, f18, f1, f2 FNMSUB f6, f18, f5, f6 FNMSUB f3, f19, f1, f3 FNMSUB f7, f19, f5, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FMUL f6, f18, f6 FNMSUB f3, f19, f2, f3 FNMSUB f7, f19, f6, f7 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 FMUL f7, f19, f7 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FMUL f4, f18, f4 FMUL f5, f18, f5 FMUL f6, f18, f6 FMUL f7, f18, f7 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f5, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f6, 5 * SIZE(BO) STFD f3, 6 * SIZE(BO) STFD f7, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) 
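/* Write the solved 4x2 block back into the packed buffer (BO for LN/LT,
   AO for RN/RT) before storing it to the C columns CO1/CO2 below. */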
STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(42) .align 4 LL(69): #ifdef LN slwi r0, K, 1 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif lfs f0, FZERO .align 4 LL(70): andi. J, N, 1 ble LL(999) #ifdef RT slwi r0, K, 0 + BASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO1, LDC #endif .align 4 LL(90): andi. I, M, 1 ble LL(80) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 3 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 3 mtspr CTR, r0 #endif ble LL(95) .align 5 LL(92): FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(92) .align 4 LL(95): #if defined(LT) || defined(RN) andi. r0, KK, 7 #else andi. 
r0, TEMP, 7 #endif mtspr CTR, r0 ble+ LL(98) .align 4 LL(96): FMADD f0, f16, f20, f0 LFD f16, 1 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 1 * SIZE bdnz LL(96) .align 4 LL(98): FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 1 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) FSUB f0, f16, f0 #else LFD f16, 0 * SIZE(AO) FSUB f0, f16, f0 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(80): andi. I, M, 2 ble LL(71) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(85) .align 5 LL(82): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 4 * SIZE DCBT(BO, PREB) bdnz LL(82) .align 4 LL(85): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(88) .align 4 LL(86): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 2 * SIZE bdnz LL(86) .align 4 LL(88): FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(71): srawi. I, M, 2 ble LL(999) .align 4 LL(72): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbt CO1, PREC srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(75) .align 5 LL(73): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f21, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f23, f0 FMADD f1, f17, f23, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE DCBT(BO, PREB) bdnz LL(73) .align 4 LL(75): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(78) .align 4 LL(76): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 4 * SIZE bdnz LL(76) .align 4 LL(78): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 1 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) LFD f24, 2 * SIZE(BO) LFD f28, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 FSUB f2, f24, f2 FSUB f3, f28, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FNMSUB f2, f17, f3, f2 FNMSUB f1, f18, f3, f1 FNMSUB f0, f19, f3, f0 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FNMSUB f1, f17, f2, f1 FNMSUB f0, f18, f2, f0 FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FNMSUB f2, f18, f0, f2 FNMSUB f3, f19, f0, f3 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FNMSUB f2, f18, f1, f2 FNMSUB f3, f19, f1, f3 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FNMSUB f3, f19, f2, f3 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. 
I, I, -1
	bgt+	LL(72)
	.align 4

LL(999):
	addi	r3, 0, 0

	lfd	f14, 0(SP)
	lfd	f15, 8(SP)
	lfd	f16, 16(SP)
	lfd	f17, 24(SP)
	lfd	f18, 32(SP)
	lfd	f19, 40(SP)
	lfd	f20, 48(SP)
	lfd	f21, 56(SP)
	lfd	f22, 64(SP)
	lfd	f23, 72(SP)
	lfd	f24, 80(SP)
	lfd	f25, 88(SP)
	lfd	f26, 96(SP)
	lfd	f27, 104(SP)
	lfd	f28, 112(SP)
	lfd	f29, 120(SP)
	lfd	f30, 128(SP)
	lfd	f31, 136(SP)

#ifdef __64BIT__
	ld	r31, 144(SP)
	ld	r30, 152(SP)
	ld	r29, 160(SP)
	ld	r28, 168(SP)
	ld	r27, 176(SP)
	ld	r26, 184(SP)
	ld	r25, 192(SP)
	ld	r24, 200(SP)
	ld	r23, 208(SP)
	ld	r22, 216(SP)
	ld	r21, 224(SP)
	ld	r20, 232(SP)
	ld	r19, 240(SP)
	ld	r18, 248(SP)
#else
	lwz	r31, 144(SP)
	lwz	r30, 148(SP)
	lwz	r29, 152(SP)
	lwz	r28, 156(SP)
	lwz	r27, 160(SP)
	lwz	r26, 164(SP)
	lwz	r25, 168(SP)
	lwz	r24, 172(SP)
	lwz	r23, 176(SP)
	lwz	r22, 180(SP)
	lwz	r21, 184(SP)
	lwz	r20, 188(SP)
	lwz	r19, 192(SP)
	lwz	r18, 196(SP)
#endif

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE
#endif
OpenBLAS-0.2.20/kernel/power/trsm_kernel_LT.S000066400000000000000000001716611313527062700206420ustar00rootroot00000000000000
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin.
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define AORIG r18 #define TEMP r19 #define KK r20 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define CO3 r27 #define CO4 r28 #define PREA r29 #define PREB r30 #define PREC r31 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) stw r18, 196(SP) #endif stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef LN mullw r0, M, K slwi r0, r0, BASE_SHIFT add A, A, r0 slwi r0, M, BASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, BASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) #ifndef PREFETCHTEST #if defined(TRSMKERNEL) && defined(LN) /* Direction is special */ #ifdef PPC970 li PREC, -4 * SIZE #endif #ifdef POWER4 li PREC, -4 * SIZE #endif #ifdef POWER5 li PREC, -4 * SIZE #endif #ifdef CELL li PREC, -4 * SIZE #endif #else /* Normal prefetch */ #ifdef PPC970 li PREC, 4 * SIZE #endif #ifdef POWER4 li PREC, 4 * SIZE /* is 12 best? 
*/ #endif #ifdef POWER5 li PREC, 3 * SIZE #endif #endif #else #ifdef linux #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) lwz PREC, FRAMESLOT(1) + STACKSIZE(SP) #else ld PREA, FRAMESLOT(0) + STACKSIZE(SP) ld PREB, FRAMESLOT(1) + STACKSIZE(SP) ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld PREA, FRAMESLOT(0) + STACKSIZE(SP) ld PREB, FRAMESLOT(1) + STACKSIZE(SP) ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #else #ifdef DOUBLE lwz PREA, FRAMESLOT(1) + STACKSIZE(SP) lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else lwz PREA, FRAMESLOT(0) + STACKSIZE(SP) lwz PREB, FRAMESLOT(1) + STACKSIZE(SP) lwz PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #endif #endif #ifndef PREFETCHTEST #ifdef PPC970 #ifdef ALLOC_HUGETLB li PREA, (16 * 5 * SIZE | 1) li PREB, (16 * 5 * SIZE | 3) #else li PREA, (16 * 14 * SIZE | 1) li PREB, (16 * 8 * SIZE | 3) #endif #endif #ifdef POWER4 #ifdef ALLOC_HUGETLB li PREA, (16 * 1 * SIZE + 16) li PREB, (16 * 1 * SIZE + 16) #else li PREA, (16 * 2 * SIZE + 16) li PREB, (16 * 2 * SIZE + 16) #endif #endif #ifdef POWER5 #ifdef ALLOC_HUGETLB li PREA, (16 * 7 * SIZE | 1) li PREB, (16 * 7 * SIZE | 3) #else li PREA, (16 * 12 * SIZE | 1) li PREB, (16 * 6 * SIZE | 3) #endif #endif #ifdef CELL li PREA, (16 * 12 * SIZE) li PREB, (16 * 12 * SIZE) #endif #endif lfs f0, FZERO srawi. J, N, 2 ble LL(40) .align 4 LL(10): #ifdef RT slwi r0, K, 2 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 2 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO4, LDC #endif ble LL(20) .align 4 LL(11): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC dcbt CO2, PREC dcbt CO3, PREC dcbt CO4, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbt CO1, PREC dcbt CO2, PREC dcbt CO3, PREC dcbt CO4, PREC srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(15) .align 4 LL(12): FMADD f0, f16, f20, f0 FMADD f5, f17, f21, f5 FMADD f10, f18, f22, f10 FMADD f15, f19, f23, f15 LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) LFD f31, 7 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f11, f19, f22, f11 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 FMADD f14, f18, f23, f14 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f24, f28, f0 FMADD f5, f25, f29, f5 FMADD f10, f26, f30, f10 FMADD f15, f27, f31, f15 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMADD f1, f25, f28, f1 FMADD f2, f26, f28, f2 FMADD f3, f27, f28, f3 FMADD f4, f24, f29, f4 FMADD f6, f26, f29, f6 FMADD f7, f27, f29, f7 FMADD f8, f24, f30, f8 FMADD f9, f25, f30, f9 FMADD f11, f27, f30, f11 FMADD f12, f24, f31, f12 FMADD f13, f25, f31, f13 FMADD f14, f26, f31, f14 LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f5, f17, f21, f5 FMADD f10, f18, f22, f10 FMADD f15, f19, f23, f15 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f11, f19, f22, f11 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 FMADD f14, f18, f23, f14 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f0, f24, f28, f0 FMADD f5, f25, f29, f5 FMADD f10, f26, f30, f10 FMADD f15, f27, f31, f15 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) FMADD f1, f25, f28, f1 FMADD f2, f26, f28, f2 FMADD f3, f27, f28, f3 FMADD f4, f24, f29, f4 FMADD f6, f26, f29, f6 FMADD f7, f27, f29, f7 FMADD f8, f24, f30, f8 FMADD f9, f25, f30, f9 FMADD f11, f27, f30, f11 FMADD f12, f24, f31, f12 FMADD f13, f25, f31, f13 FMADD f14, f26, f31, f14 addi AO, AO, 16 * SIZE addi BO, BO, 16 * SIZE #ifdef PPC970 #ifndef ALLOC_HUGETLB DCBT(AO, PREA) #endif DCBT(BO, PREB) #endif #ifdef POWER4 #ifndef ALLOC_HUGETLB DCBT(AO, PREA) #endif DCBT(BO, PREB) #endif #ifdef POWER5 DCBT(AO, PREA) DCBT(BO, PREB) #endif bdnz LL(12) .align 4 LL(15): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(18) .align 4 LL(16): FMADD f0, f16, f20, f0 FMADD f5, f17, f21, f5 FMADD f10, f18, f22, f10 FMADD f15, f19, f23, f15 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f11, f19, f22, f11 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 FMADD f14, f18, f23, f14 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(16) .align 4 LL(18): #if defined(LN) || defined(RT) subi r0, KK, 4 slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f24, 8 * SIZE(BO) LFD f25, 9 * SIZE(BO) LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f10, f26, f10 FSUB f14, f27, f14 FSUB f3, f28, f3 FSUB f7, f29, f7 FSUB f11, f30, f11 FSUB f15, f31, f15 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f24, 8 * SIZE(AO) LFD f25, 9 * SIZE(AO) LFD f26, 10 * SIZE(AO) LFD f27, 11 * SIZE(AO) LFD f28, 12 * SIZE(AO) LFD f29, 13 * SIZE(AO) LFD f30, 14 * SIZE(AO) LFD f31, 15 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f10, f26, f10 FSUB f11, f27, f11 FSUB f12, f28, f12 FSUB f13, f29, f13 FSUB f14, f30, f14 FSUB f15, f31, f15 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FMUL f11, f16, f11 FMUL f15, f16, f15 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f10, f17, f11, f10 FNMSUB f14, f17, f15, f14 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f9, f18, f11, f9 FNMSUB f13, f18, f15, f13 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 FNMSUB f8, f19, f11, f8 FNMSUB f12, f19, f15, f12 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FMUL f10, f16, f10 FMUL f14, f16, f14 LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f9, f17, f10, f9 FNMSUB f13, f17, f14, f13 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FNMSUB f8, f18, f10, f8 FNMSUB f12, f18, f14, f12 FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB 
f10, f18, f8, f10 FNMSUB f14, f18, f12, f14 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 FNMSUB f11, f19, f8, f11 FNMSUB f15, f19, f12, f15 LFD f16, 5 * SIZE(AO) LFD f17, 6 * SIZE(AO) LFD f18, 7 * SIZE(AO) LFD f19, 10 * SIZE(AO) FMUL f1, f16, f1 FMUL f5, f16, f5 FMUL f9, f16, f9 FMUL f13, f16, f13 LFD f20, 11 * SIZE(AO) LFD f21, 15 * SIZE(AO) FNMSUB f2, f17, f1, f2 FNMSUB f6, f17, f5, f6 FNMSUB f10, f17, f9, f10 FNMSUB f14, f17, f13, f14 FNMSUB f3, f18, f1, f3 FNMSUB f7, f18, f5, f7 FNMSUB f11, f18, f9, f11 FNMSUB f15, f18, f13, f15 FMUL f2, f19, f2 FMUL f6, f19, f6 FMUL f10, f19, f10 FMUL f14, f19, f14 FNMSUB f3, f20, f2, f3 FNMSUB f7, f20, f6, f7 FNMSUB f11, f20, f10, f11 FNMSUB f15, f20, f14, f15 FMUL f3, f21, f3 FMUL f7, f21, f7 FMUL f11, f21, f11 FMUL f15, f21, f15 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 FNMSUB f14, f19, f2, f14 FNMSUB f15, f19, f3, f15 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FMUL f6, f16, f6 FMUL f7, f16, f7 LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f10, f17, f6, f10 FNMSUB f11, f17, f7, f11 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FNMSUB f14, f18, f6, f14 FNMSUB f15, f18, f7, f15 FMUL f8, f19, f8 FMUL f9, f19, f9 FMUL f10, f19, f10 FMUL f11, f19, f11 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FNMSUB f14, f20, f10, f14 FNMSUB f15, f20, f11, f15 FMUL f12, f21, f12 FMUL f13, f21, f13 FMUL f14, f21, f14 FMUL f15, f21, f15 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FMUL f14, f16, f14 FMUL f15, f16, f15 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f10, f17, f14, f10 FNMSUB f11, f17, f15, f11 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f6, f18, f14, f6 FNMSUB f7, f18, f15, f7 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 FNMSUB f2, f19, f14, f2 FNMSUB f3, f19, f15, f3 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FMUL f10, f16, f10 FMUL f11, f16, f11 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f6, f17, f10, f6 FNMSUB f7, f17, f11, f7 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE subi CO3, CO3, 4 * SIZE subi CO4, CO4, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) STFD f2, 8 * SIZE(BO) STFD f6, 9 * SIZE(BO) STFD f10, 10 * SIZE(BO) STFD f14, 11 * SIZE(BO) STFD f3, 12 * SIZE(BO) STFD f7, 13 * SIZE(BO) STFD f11, 14 * SIZE(BO) STFD f15, 15 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) 
STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) STFD f8, 8 * SIZE(AO) STFD f9, 9 * SIZE(AO) STFD f10, 10 * SIZE(AO) STFD f11, 11 * SIZE(AO) STFD f12, 12 * SIZE(AO) STFD f13, 13 * SIZE(AO) STFD f14, 14 * SIZE(AO) STFD f15, 15 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f10, 2 * SIZE(CO3) STFD f11, 3 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) STFD f14, 2 * SIZE(CO4) STFD f15, 3 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(11) .align 4 LL(20): andi. I, M, 2 ble LL(30) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(25) .align 5 LL(22): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 16 * SIZE DCBT(BO, PREB) bdnz LL(22) fadd f0, f2, f0 fadd f1, f3, f1 fadd f4, f6, f4 fadd f5, f7, f5 fadd f8, f10, f8 fadd f9, f11, f9 fadd f12, f14, f12 fadd f13, f15, f13 .align 4 LL(25): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(28) .align 4 LL(26): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 2 * SIZE bdnz LL(26) .align 4 LL(28): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 4 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f28, 6 * SIZE(AO) LFD f29, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f12, f28, f12 FSUB f13, f29, f13 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, 
f5 FMUL f9, f17, f9 FMUL f13, f17, f13 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FMUL f8, f19, f8 FMUL f9, f19, f9 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FMUL f12, f21, f12 FMUL f13, f21, f13 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FMUL f4, f19, f4 FMUL f5, f19, f5 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE subi CO3, CO3, 2 * SIZE subi CO4, CO4, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f4, 2 * SIZE(AO) STFD f5, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f12, 6 * SIZE(AO) STFD f13, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(30): andi. I, M, 1 ble LL(39) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(35) .align 5 LL(32): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f1, f17, f24, f1 FMADD f5, f17, f25, f5 FMADD f9, f17, f26, f9 FMADD f13, f17, f27, f13 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f0, f18, f20, f0 FMADD f4, f18, f21, f4 FMADD f8, f18, f22, f8 FMADD f12, f18, f23, f12 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f1, f19, f24, f1 FMADD f5, f19, f25, f5 FMADD f9, f19, f26, f9 FMADD f13, f19, f27, f13 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 16 * SIZE DCBT(BO, PREB) bdnz LL(32) fadd f0, f1, f0 fadd f4, f5, f4 fadd f8, f9, f8 fadd f12, f13, f12 .align 4 LL(35): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(38) .align 4 LL(36): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f16, 1 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 1 * SIZE bdnz LL(36) .align 4 LL(38): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 4 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) LFD f24, 2 * SIZE(AO) LFD f28, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f4, f20, f4 FSUB f8, f24, f8 FSUB f12, f28, f12 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f4, f17, f0, f4 FNMSUB f8, f18, f0, f8 FNMSUB f12, f19, f0, f12 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FNMSUB f8, f17, f4, f8 FNMSUB f12, f18, f4, f12 FMUL f8, f19, f8 FNMSUB f12, f20, f8, f12 FMUL f12, f21, f12 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FNMSUB f8, f17, f12, f8 FNMSUB f4, f18, f12, f4 FNMSUB f0, f19, f12, f0 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f0, f18, f8, f0 FMUL f4, f19, f4 FNMSUB f0, f20, f4, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE subi CO3, CO3, 1 * SIZE subi CO4, CO4, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f4, 1 * SIZE(AO) STFD f8, 2 * SIZE(AO) STFD f12, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) lfs f0, FZERO 
fmr f1, f0 fmr f4, f0 fmr f5, f0 fmr f8, f0 fmr f9, f0 fmr f12, f0 fmr f13, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE addi CO3, CO3, 1 * SIZE addi CO4, CO4, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(39): #ifdef LN slwi r0, K, 2 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 4 #endif #ifdef RT subi KK, KK, 4 #endif addic. J, J, -1 lfs f0, FZERO bgt LL(10) .align 4 LL(40): andi. J, N, 2 ble LL(70) #ifdef RT slwi r0, K, 1 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif ble LL(50) .align 4 LL(41): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC dcbt CO2, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbt CO1, PREC dcbt CO2, PREC srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(45) .align 5 LL(42): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE DCBT(BO, PREB) bdnz LL(42) .align 4 LL(45): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(48) .align 4 LL(46): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(46) .align 4 LL(48): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 2 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f28, 6 * SIZE(BO) LFD f29, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f3, f28, f3 FSUB f7, f29, f7 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FMUL f1, f19, f1 FMUL f5, f19, f5 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FMUL f0, f21, f0 FMUL f4, f21, f4 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, f5 FNMSUB f2, f18, f1, f2 FNMSUB f6, f18, f5, f6 FNMSUB f3, f19, f1, f3 FNMSUB f7, f19, f5, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FMUL f6, f18, f6 FNMSUB f3, f19, f2, f3 FNMSUB f7, f19, f6, f7 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 FMUL f7, f19, f7 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FMUL f4, f18, f4 FMUL f5, f18, f5 FMUL f6, f18, f6 FMUL f7, f18, f7 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f5, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f6, 5 * SIZE(BO) STFD f3, 6 * SIZE(BO) STFD f7, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) 
STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(41) .align 4 LL(50): andi. I, M, 2 ble LL(60) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(55) .align 5 LL(52): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 FMADD f4, f18, f22, f4 FMADD f5, f19, f22, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f16, f24, f0 FMADD f1, f17, f24, f1 FMADD f2, f16, f25, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f26, f4 FMADD f5, f19, f26, f5 FMADD f6, f18, f27, f6 FMADD f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE DCBT(BO, PREB) bdnz LL(52) .align 4 LL(55): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(58) .align 4 LL(56): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 2 * SIZE bdnz LL(56) .align 4 LL(58): FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 2 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f2, f17, f2 FSUB f1, f20, f1 FSUB f3, f21, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f3, f19, f3 FNMSUB f0, f20, f1, f0 FNMSUB f2, f20, f3, f2 FMUL f0, f21, f0 FMUL f2, f21, f2 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f2, f16, f2 FNMSUB f1, f17, f0, f1 FNMSUB f3, f17, f2, f3 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f3, f17, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f2, f17, f0, f2 FNMSUB f3, f17, f1, f3 FMUL f2, f18, f2 FMUL f3, f18, f3 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f2, f19, f2 FMUL f3, f19, f3 FNMSUB f0, f20, f2, f0 FNMSUB f1, f20, f3, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f2, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(60): andi. I, M, 1 ble LL(69) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(65) .align 5 LL(62): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f17, f22, f2 FMADD f3, f17, f23, f3 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f19, f26, f2 FMADD f3, f19, f27, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(62) .align 4 LL(65): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(68) .align 4 LL(66): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 LFD f16, 1 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 1 * SIZE bdnz LL(66) .align 4 LL(68): FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f20, f1 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FMUL f1, f18, f1 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(69): #ifdef LN slwi r0, K, 1 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif lfs f0, FZERO .align 4 LL(70): andi. J, N, 1 ble LL(999) #ifdef RT slwi r0, K, 0 + BASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO1, LDC #endif ble LL(80) .align 4 LL(71): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC srawi. 
r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbt CO1, PREC srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(75) .align 5 LL(72): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f21, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f23, f0 FMADD f1, f17, f23, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE DCBT(BO, PREB) bdnz LL(72) .align 4 LL(75): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(78) .align 4 LL(76): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 4 * SIZE bdnz LL(76) .align 4 LL(78): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 1 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) LFD f24, 2 * SIZE(BO) LFD f28, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 FSUB f2, f24, f2 FSUB f3, f28, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FNMSUB f2, f17, f3, f2 FNMSUB f1, f18, f3, f1 FNMSUB f0, f19, f3, f0 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FNMSUB f1, f17, f2, f1 FNMSUB f0, f18, f2, f0 FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FNMSUB f2, f18, f0, f2 FNMSUB f3, f19, f0, f3 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FNMSUB f2, f18, f1, f2 FNMSUB f3, f19, f1, f3 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FNMSUB f3, f19, f2, f3 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * 
SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(71) .align 4 LL(80): andi. I, M, 2 ble LL(90) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(85) .align 5 LL(82): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 4 * SIZE DCBT(BO, PREB) bdnz LL(82) .align 4 LL(85): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(88) .align 4 LL(86): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 2 * SIZE bdnz LL(86) .align 4 LL(88): FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(90): andi. I, M, 1 ble LL(999) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 3 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 3 mtspr CTR, r0 #endif ble LL(95) .align 5 LL(92): FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(92) .align 4 LL(95): #if defined(LT) || defined(RN) andi. r0, KK, 7 #else andi. 
r0, TEMP, 7 #endif mtspr CTR, r0 ble+ LL(98) .align 4 LL(96): FMADD f0, f16, f20, f0 LFD f16, 1 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 1 * SIZE bdnz LL(96) .align 4 LL(98): FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 1 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) FSUB f0, f16, f0 #else LFD f16, 0 * SIZE(AO) FSUB f0, f16, f0 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) #ifndef LN addi CO1, CO1, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) lwz r18, 196(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/trsm_kernel_RT.S000066400000000000000000001720271313527062700206470ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define AORIG r18 #define TEMP r19 #define KK r20 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define CO3 r27 #define CO4 r28 #define PREA r29 #define PREB r30 #define PREC r31 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) stw r18, 196(SP) #endif stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef LN mullw r0, M, K slwi r0, r0, BASE_SHIFT add A, A, r0 slwi r0, M, BASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, BASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 
0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) #ifndef PREFETCHTEST #if defined(TRSMKERNEL) && defined(LN) /* Direction is special */ #ifdef PPC970 li PREC, -4 * SIZE #endif #ifdef POWER4 li PREC, -4 * SIZE #endif #ifdef POWER5 li PREC, -4 * SIZE #endif #else /* Normal prefetch */ #ifdef PPC970 li PREC, 4 * SIZE #endif #ifdef POWER4 li PREC, 4 * SIZE /* is 12 best? */ #endif #ifdef POWER5 li PREC, 3 * SIZE #endif #endif #else #ifdef linux #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) lwz PREC, FRAMESLOT(1) + STACKSIZE(SP) #else ld PREA, FRAMESLOT(0) + STACKSIZE(SP) ld PREB, FRAMESLOT(1) + STACKSIZE(SP) ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld PREA, FRAMESLOT(0) + STACKSIZE(SP) ld PREB, FRAMESLOT(1) + STACKSIZE(SP) ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #else #ifdef DOUBLE lwz PREA, FRAMESLOT(1) + STACKSIZE(SP) lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else lwz PREA, FRAMESLOT(0) + STACKSIZE(SP) lwz PREB, FRAMESLOT(1) + STACKSIZE(SP) lwz PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #endif #endif #ifndef PREFETCHTEST #ifdef PPC970 #ifdef ALLOC_HUGETLB li PREA, (16 * 5 * SIZE | 1) li PREB, (16 * 5 * SIZE | 3) #else li PREA, (16 * 14 * SIZE | 1) li PREB, (16 * 8 * SIZE | 3) #endif #endif #ifdef POWER4 #ifdef ALLOC_HUGETLB li PREA, (16 * 1 * SIZE + 16) li PREB, (16 * 1 * SIZE + 16) #else li PREA, (16 * 2 * SIZE + 16) li PREB, (16 * 2 * SIZE + 16) #endif #endif #ifdef POWER5 #ifdef ALLOC_HUGETLB li PREA, (16 * 7 * SIZE | 1) li PREB, (16 * 7 * SIZE | 3) #else li PREA, (16 * 12 * SIZE | 1) li PREB, (16 * 6 * SIZE | 3) #endif #endif #endif lfs f0, FZERO LL(70): andi. J, N, 1 ble LL(40) #ifdef RT slwi r0, K, 0 + BASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO1, LDC #endif ble LL(80) .align 4 LL(71): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbt CO1, PREC srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(75) .align 5 LL(72): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f21, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f23, f0 FMADD f1, f17, f23, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE DCBT(BO, PREB) bdnz LL(72) .align 4 LL(75): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(78) .align 4 LL(76): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 4 * SIZE bdnz LL(76) .align 4 LL(78): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 1 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) LFD f24, 2 * SIZE(BO) LFD f28, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 FSUB f2, f24, f2 FSUB f3, f28, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FNMSUB f2, f17, f3, f2 FNMSUB f1, f18, f3, f1 FNMSUB f0, f19, f3, f0 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FNMSUB f1, f17, f2, f1 FNMSUB f0, f18, f2, f0 FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FNMSUB f2, f18, f0, f2 FNMSUB f3, f19, f0, f3 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FNMSUB f2, f18, f1, f2 FNMSUB f3, f19, f1, f3 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FNMSUB f3, f19, f2, f3 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub 
TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(71) .align 4 LL(80): andi. I, M, 2 ble LL(90) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(85) .align 5 LL(82): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 4 * SIZE DCBT(BO, PREB) bdnz LL(82) .align 4 LL(85): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(88) .align 4 LL(86): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 2 * SIZE bdnz LL(86) .align 4 LL(88): FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(90): andi. I, M, 1 ble LL(99) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. 
r0, KK, 3 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 3 mtspr CTR, r0 #endif ble LL(95) .align 5 LL(92): FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(92) .align 4 LL(95): #if defined(LT) || defined(RN) andi. r0, KK, 7 #else andi. r0, TEMP, 7 #endif mtspr CTR, r0 ble+ LL(98) .align 4 LL(96): FMADD f0, f16, f20, f0 LFD f16, 1 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 1 * SIZE bdnz LL(96) .align 4 LL(98): FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 1 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) FSUB f0, f16, f0 #else LFD f16, 0 * SIZE(AO) FSUB f0, f16, f0 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) lfs f0, FZERO #ifndef LN addi CO1, CO1, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(99): #ifdef LN slwi r0, K, 0 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 LL(40): andi. J, N, 2 ble LL(09) #ifdef RT slwi r0, K, 1 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif ble LL(50) .align 4 LL(41): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC dcbt CO2, PREC srawi. 
r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbt CO1, PREC dcbt CO2, PREC srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(45) .align 5 LL(42): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE DCBT(BO, PREB) bdnz LL(42) .align 4 LL(45): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(48) .align 4 LL(46): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(46) .align 4 LL(48): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 2 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f28, 6 * SIZE(BO) LFD f29, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f3, f28, f3 FSUB f7, f29, f7 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FMUL f1, f19, f1 FMUL f5, f19, f5 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FMUL f0, f21, f0 FMUL f4, f21, f4 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, f5 FNMSUB f2, f18, f1, f2 FNMSUB f6, f18, f5, f6 FNMSUB f3, f19, f1, f3 FNMSUB f7, f19, f5, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FMUL f6, f18, f6 FNMSUB f3, f19, f2, f3 FNMSUB f7, f19, f6, f7 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 FMUL f7, f19, f7 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FMUL f4, f18, f4 FMUL f5, f18, f5 FMUL f6, f18, f6 FMUL f7, f18, f7 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f5, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f6, 5 * SIZE(BO) STFD f3, 6 * SIZE(BO) STFD f7, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) 
STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(41) .align 4 LL(50): andi. I, M, 2 ble LL(60) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(55) .align 5 LL(52): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 FMADD f4, f18, f22, f4 FMADD f5, f19, f22, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f16, f24, f0 FMADD f1, f17, f24, f1 FMADD f2, f16, f25, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f26, f4 FMADD f5, f19, f26, f5 FMADD f6, f18, f27, f6 FMADD f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE DCBT(BO, PREB) bdnz LL(52) .align 4 LL(55): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(58) .align 4 LL(56): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 2 * SIZE bdnz LL(56) .align 4 LL(58): FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 2 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f2, f17, f2 FSUB f1, f20, f1 FSUB f3, f21, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f3, f19, f3 FNMSUB f0, f20, f1, f0 FNMSUB f2, f20, f3, f2 FMUL f0, f21, f0 FMUL f2, f21, f2 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f2, f16, f2 FNMSUB f1, f17, f0, f1 FNMSUB f3, f17, f2, f3 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f3, f17, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f2, f17, f0, f2 FNMSUB f3, f17, f1, f3 FMUL f2, f18, f2 FMUL f3, f18, f3 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f2, f19, f2 FMUL f3, f19, f3 FNMSUB f0, f20, f2, f0 FNMSUB f1, f20, f3, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f2, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(60): andi. I, M, 1 ble LL(69) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(65) .align 5 LL(62): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f17, f22, f2 FMADD f3, f17, f23, f3 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f19, f26, f2 FMADD f3, f19, f27, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(62) .align 4 LL(65): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(68) .align 4 LL(66): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 LFD f16, 1 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 1 * SIZE bdnz LL(66) .align 4 LL(68): FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f20, f1 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FMUL f1, f18, f1 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(69): #ifdef LN slwi r0, K, 1 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif lfs f0, FZERO .align 4 LL(09): srawi. J, N, 2 ble LL(999) .align 4 LL(10): #ifdef RT slwi r0, K, 2 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 2 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO4, LDC #endif ble LL(20) .align 4 LL(11): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC dcbt CO2, PREC dcbt CO3, PREC dcbt CO4, PREC srawi. 
r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbt CO1, PREC dcbt CO2, PREC dcbt CO3, PREC dcbt CO4, PREC srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(15) .align 4 LL(12): FMADD f0, f16, f20, f0 FMADD f5, f17, f21, f5 FMADD f10, f18, f22, f10 FMADD f15, f19, f23, f15 LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) LFD f31, 7 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f11, f19, f22, f11 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 FMADD f14, f18, f23, f14 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f24, f28, f0 FMADD f5, f25, f29, f5 FMADD f10, f26, f30, f10 FMADD f15, f27, f31, f15 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMADD f1, f25, f28, f1 FMADD f2, f26, f28, f2 FMADD f3, f27, f28, f3 FMADD f4, f24, f29, f4 FMADD f6, f26, f29, f6 FMADD f7, f27, f29, f7 FMADD f8, f24, f30, f8 FMADD f9, f25, f30, f9 FMADD f11, f27, f30, f11 FMADD f12, f24, f31, f12 FMADD f13, f25, f31, f13 FMADD f14, f26, f31, f14 LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f5, f17, f21, f5 FMADD f10, f18, f22, f10 FMADD f15, f19, f23, f15 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f11, f19, f22, f11 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 FMADD f14, f18, f23, f14 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f0, f24, f28, f0 FMADD f5, f25, f29, f5 FMADD f10, f26, f30, f10 FMADD f15, f27, f31, f15 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) FMADD f1, f25, f28, f1 FMADD f2, f26, f28, f2 FMADD f3, f27, f28, f3 FMADD f4, f24, f29, f4 FMADD f6, f26, f29, f6 FMADD f7, f27, f29, f7 FMADD f8, f24, f30, f8 FMADD f9, f25, f30, f9 FMADD f11, f27, f30, f11 FMADD f12, f24, f31, f12 FMADD f13, f25, f31, f13 FMADD f14, f26, f31, f14 addi AO, AO, 16 * SIZE addi BO, BO, 16 * SIZE #ifdef PPC970 #ifndef ALLOC_HUGETLB DCBT(AO, PREA) #endif DCBT(BO, PREB) #endif #ifdef POWER4 #ifndef ALLOC_HUGETLB DCBT(AO, PREA) #endif DCBT(BO, PREB) #endif #ifdef POWER5 DCBT(AO, PREA) DCBT(BO, PREB) #endif bdnz LL(12) .align 4 LL(15): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(18) .align 4 LL(16): FMADD f0, f16, f20, f0 FMADD f5, f17, f21, f5 FMADD f10, f18, f22, f10 FMADD f15, f19, f23, f15 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f11, f19, f22, f11 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 FMADD f14, f18, f23, f14 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(16) .align 4 LL(18): #if defined(LN) || defined(RT) subi r0, KK, 4 slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f24, 8 * SIZE(BO) LFD f25, 9 * SIZE(BO) LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f10, f26, f10 FSUB f14, f27, f14 FSUB f3, f28, f3 FSUB f7, f29, f7 FSUB f11, f30, f11 FSUB f15, f31, f15 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f24, 8 * SIZE(AO) LFD f25, 9 * SIZE(AO) LFD f26, 10 * SIZE(AO) LFD f27, 11 * SIZE(AO) LFD f28, 12 * SIZE(AO) LFD f29, 13 * SIZE(AO) LFD f30, 14 * SIZE(AO) LFD f31, 15 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f10, f26, f10 FSUB f11, f27, f11 FSUB f12, f28, f12 FSUB f13, f29, f13 FSUB f14, f30, f14 FSUB f15, f31, f15 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FMUL f11, f16, f11 FMUL f15, f16, f15 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f10, f17, f11, f10 FNMSUB f14, f17, f15, f14 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f9, f18, f11, f9 FNMSUB f13, f18, f15, f13 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 FNMSUB f8, f19, f11, f8 FNMSUB f12, f19, f15, f12 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FMUL f10, f16, f10 FMUL f14, f16, f14 LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f9, f17, f10, f9 FNMSUB f13, f17, f14, f13 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FNMSUB f8, f18, f10, f8 FNMSUB f12, f18, f14, f12 FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB 
f10, f18, f8, f10 FNMSUB f14, f18, f12, f14 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 FNMSUB f11, f19, f8, f11 FNMSUB f15, f19, f12, f15 LFD f16, 5 * SIZE(AO) LFD f17, 6 * SIZE(AO) LFD f18, 7 * SIZE(AO) LFD f19, 10 * SIZE(AO) FMUL f1, f16, f1 FMUL f5, f16, f5 FMUL f9, f16, f9 FMUL f13, f16, f13 LFD f20, 11 * SIZE(AO) LFD f21, 15 * SIZE(AO) FNMSUB f2, f17, f1, f2 FNMSUB f6, f17, f5, f6 FNMSUB f10, f17, f9, f10 FNMSUB f14, f17, f13, f14 FNMSUB f3, f18, f1, f3 FNMSUB f7, f18, f5, f7 FNMSUB f11, f18, f9, f11 FNMSUB f15, f18, f13, f15 FMUL f2, f19, f2 FMUL f6, f19, f6 FMUL f10, f19, f10 FMUL f14, f19, f14 FNMSUB f3, f20, f2, f3 FNMSUB f7, f20, f6, f7 FNMSUB f11, f20, f10, f11 FNMSUB f15, f20, f14, f15 FMUL f3, f21, f3 FMUL f7, f21, f7 FMUL f11, f21, f11 FMUL f15, f21, f15 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 FNMSUB f14, f19, f2, f14 FNMSUB f15, f19, f3, f15 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FMUL f6, f16, f6 FMUL f7, f16, f7 LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f10, f17, f6, f10 FNMSUB f11, f17, f7, f11 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FNMSUB f14, f18, f6, f14 FNMSUB f15, f18, f7, f15 FMUL f8, f19, f8 FMUL f9, f19, f9 FMUL f10, f19, f10 FMUL f11, f19, f11 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FNMSUB f14, f20, f10, f14 FNMSUB f15, f20, f11, f15 FMUL f12, f21, f12 FMUL f13, f21, f13 FMUL f14, f21, f14 FMUL f15, f21, f15 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FMUL f14, f16, f14 FMUL f15, f16, f15 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f10, f17, f14, f10 FNMSUB f11, f17, f15, f11 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f6, f18, f14, f6 FNMSUB f7, f18, f15, f7 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 FNMSUB f2, f19, f14, f2 FNMSUB f3, f19, f15, f3 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FMUL f10, f16, f10 FMUL f11, f16, f11 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f6, f17, f10, f6 FNMSUB f7, f17, f11, f7 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE subi CO3, CO3, 4 * SIZE subi CO4, CO4, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) STFD f2, 8 * SIZE(BO) STFD f6, 9 * SIZE(BO) STFD f10, 10 * SIZE(BO) STFD f14, 11 * SIZE(BO) STFD f3, 12 * SIZE(BO) STFD f7, 13 * SIZE(BO) STFD f11, 14 * SIZE(BO) STFD f15, 15 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) 
STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) STFD f8, 8 * SIZE(AO) STFD f9, 9 * SIZE(AO) STFD f10, 10 * SIZE(AO) STFD f11, 11 * SIZE(AO) STFD f12, 12 * SIZE(AO) STFD f13, 13 * SIZE(AO) STFD f14, 14 * SIZE(AO) STFD f15, 15 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f10, 2 * SIZE(CO3) STFD f11, 3 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) STFD f14, 2 * SIZE(CO4) STFD f15, 3 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(11) .align 4 LL(20): andi. I, M, 2 ble LL(30) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(25) .align 5 LL(22): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 16 * SIZE DCBT(BO, PREB) bdnz LL(22) fadd f0, f2, f0 fadd f1, f3, f1 fadd f4, f6, f4 fadd f5, f7, f5 fadd f8, f10, f8 fadd f9, f11, f9 fadd f12, f14, f12 fadd f13, f15, f13 .align 4 LL(25): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(28) .align 4 LL(26): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 2 * SIZE bdnz LL(26) .align 4 LL(28): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 4 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f28, 6 * SIZE(AO) LFD f29, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f12, f28, f12 FSUB f13, f29, f13 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, 
f5 FMUL f9, f17, f9 FMUL f13, f17, f13 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FMUL f8, f19, f8 FMUL f9, f19, f9 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FMUL f12, f21, f12 FMUL f13, f21, f13 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FMUL f4, f19, f4 FMUL f5, f19, f5 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE subi CO3, CO3, 2 * SIZE subi CO4, CO4, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f4, 2 * SIZE(AO) STFD f5, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f12, 6 * SIZE(AO) STFD f13, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(30): andi. I, M, 1 ble LL(39) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(35) .align 5 LL(32): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f1, f17, f24, f1 FMADD f5, f17, f25, f5 FMADD f9, f17, f26, f9 FMADD f13, f17, f27, f13 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f0, f18, f20, f0 FMADD f4, f18, f21, f4 FMADD f8, f18, f22, f8 FMADD f12, f18, f23, f12 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f1, f19, f24, f1 FMADD f5, f19, f25, f5 FMADD f9, f19, f26, f9 FMADD f13, f19, f27, f13 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 16 * SIZE DCBT(BO, PREB) bdnz LL(32) fadd f0, f1, f0 fadd f4, f5, f4 fadd f8, f9, f8 fadd f12, f13, f12 .align 4 LL(35): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(38) .align 4 LL(36): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f16, 1 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 1 * SIZE bdnz LL(36) .align 4 LL(38): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 4 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) LFD f24, 2 * SIZE(AO) LFD f28, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f4, f20, f4 FSUB f8, f24, f8 FSUB f12, f28, f12 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f4, f17, f0, f4 FNMSUB f8, f18, f0, f8 FNMSUB f12, f19, f0, f12 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FNMSUB f8, f17, f4, f8 FNMSUB f12, f18, f4, f12 FMUL f8, f19, f8 FNMSUB f12, f20, f8, f12 FMUL f12, f21, f12 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FNMSUB f8, f17, f12, f8 FNMSUB f4, f18, f12, f4 FNMSUB f0, f19, f12, f0 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f0, f18, f8, f0 FMUL f4, f19, f4 FNMSUB f0, f20, f4, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE subi CO3, CO3, 1 * SIZE subi CO4, CO4, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f4, 1 * SIZE(AO) STFD f8, 2 * SIZE(AO) STFD f12, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) lfs f0, FZERO 
fmr f1, f0 fmr f4, f0 fmr f5, f0 fmr f8, f0 fmr f9, f0 fmr f12, f0 fmr f13, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE addi CO3, CO3, 1 * SIZE addi CO4, CO4, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(39): #ifdef LN slwi r0, K, 2 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 4 #endif #ifdef RT subi KK, KK, 4 #endif addic. J, J, -1 lfs f0, FZERO bgt LL(10) .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) lwz r18, 196(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/trsm_kernel_cell_LN.S000066400000000000000000001714771313527062700216420ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define AORIG r18 #define TEMP r19 #define KK r20 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define CO3 r27 #define CO4 r28 #define PREA r29 #define PREB r30 #define PREC r31 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) stw r18, 196(SP) #endif stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef LN mullw r0, M, K slwi r0, r0, BASE_SHIFT add A, A, r0 slwi r0, M, BASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, BASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) #ifndef PREFETCHTEST li PREC, -4 * SIZE #else #ifdef linux #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) lwz PREC, FRAMESLOT(1) + STACKSIZE(SP) #else ld PREA, FRAMESLOT(0) + STACKSIZE(SP) ld PREB, FRAMESLOT(1) + STACKSIZE(SP) ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld PREA, FRAMESLOT(0) + STACKSIZE(SP) ld PREB, FRAMESLOT(1) + STACKSIZE(SP) ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #else #ifdef DOUBLE lwz PREA, FRAMESLOT(1) + STACKSIZE(SP) lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else lwz PREA, FRAMESLOT(0) + STACKSIZE(SP) lwz PREB, FRAMESLOT(1) + STACKSIZE(SP) lwz PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #endif #endif #ifndef PREFETCHTEST #ifdef PPC970 
#ifdef ALLOC_HUGETLB li PREA, (16 * 5 * SIZE | 1) li PREB, (16 * 5 * SIZE | 3) #else li PREA, (16 * 14 * SIZE | 1) li PREB, (16 * 8 * SIZE | 3) #endif #endif #ifdef POWER4 #ifdef ALLOC_HUGETLB li PREA, (16 * 1 * SIZE + 16) li PREB, (16 * 1 * SIZE + 16) #else li PREA, (16 * 2 * SIZE + 16) li PREB, (16 * 2 * SIZE + 16) #endif #endif #ifdef POWER5 #ifdef ALLOC_HUGETLB li PREA, (16 * 7 * SIZE | 1) li PREB, (16 * 7 * SIZE | 3) #else li PREA, (16 * 12 * SIZE | 1) li PREB, (16 * 6 * SIZE | 3) #endif #endif #ifdef CELL li PREA, (16 * 12 * SIZE) li PREB, (16 * 12 * SIZE) #endif #endif lfs f0, FZERO srawi. J, N, 2 ble LL(40) .align 4 LL(10): #ifdef RT slwi r0, K, 2 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 2 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO4, LDC #endif LL(30): andi. I, M, 1 ble LL(20) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(35) .align 5 LL(32): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f1, f17, f24, f1 FMADD f5, f17, f25, f5 FMADD f9, f17, f26, f9 FMADD f13, f17, f27, f13 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f0, f18, f20, f0 FMADD f4, f18, f21, f4 FMADD f8, f18, f22, f8 FMADD f12, f18, f23, f12 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f1, f19, f24, f1 FMADD f5, f19, f25, f5 FMADD f9, f19, f26, f9 FMADD f13, f19, f27, f13 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 16 * SIZE DCBT(BO, PREB) bdnz LL(32) fadd f0, f1, f0 fadd f4, f5, f4 fadd f8, f9, f8 fadd f12, f13, f12 .align 4 LL(35): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(38) .align 4 LL(36): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f16, 1 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 1 * SIZE bdnz LL(36) .align 4 LL(38): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 4 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) LFD f24, 2 * SIZE(AO) LFD f28, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f4, f20, f4 FSUB f8, f24, f8 FSUB f12, f28, f12 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f4, f17, f0, f4 FNMSUB f8, f18, f0, f8 FNMSUB f12, f19, f0, f12 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FNMSUB f8, f17, f4, f8 FNMSUB f12, f18, f4, f12 FMUL f8, f19, f8 FNMSUB f12, f20, f8, f12 FMUL f12, f21, f12 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FNMSUB f8, f17, f12, f8 FNMSUB f4, f18, f12, f4 FNMSUB f0, f19, f12, f0 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f0, f18, f8, f0 FMUL f4, f19, f4 FNMSUB f0, f20, f4, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE subi CO3, CO3, 1 * SIZE subi CO4, CO4, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f4, 1 * SIZE(AO) STFD f8, 2 * SIZE(AO) STFD f12, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 fmr f8, f0 fmr f9, f0 fmr f12, f0 fmr f13, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE addi CO3, CO3, 1 * SIZE addi CO4, CO4, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(20): andi. I, M, 2 ble LL(09) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. 
r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(25) .align 5 LL(22): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 16 * SIZE DCBT(BO, PREB) bdnz LL(22) fadd f0, f2, f0 fadd f1, f3, f1 fadd f4, f6, f4 fadd f5, f7, f5 fadd f8, f10, f8 fadd f9, f11, f9 fadd f12, f14, f12 fadd f13, f15, f13 .align 4 LL(25): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(28) .align 4 LL(26): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 2 * SIZE bdnz LL(26) .align 4 LL(28): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 4 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f28, 6 * SIZE(AO) LFD f29, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f12, f28, f12 FSUB f13, f29, f13 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, f5 FMUL f9, f17, f9 FMUL f13, f17, f13 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FMUL f8, f19, f8 FMUL f9, f19, f9 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FMUL f12, f21, f12 FMUL f13, f21, f13 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FMUL f4, f19, f4 FMUL f5, f19, f5 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE subi CO3, CO3, 2 * SIZE subi CO4, CO4, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * 
SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f4, 2 * SIZE(AO) STFD f5, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f12, 6 * SIZE(AO) STFD f13, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(09): srawi. I, M, 2 ble LL(39) .align 4 LL(11): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC dcbt CO2, PREC dcbt CO3, PREC dcbt CO4, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(15) .align 4 #define NOP1 mr r18, r18 #define NOP2 mr r19, r19 LL(12): FMADD f0, f16, f20, f0 dcbt AO, PREA FMADD f4, f16, f21, f4 dcbt BO, PREB FMADD f8, f16, f22, f8 LFD f31, 7 * SIZE(BO) FMADD f12, f16, f23, f12 LFD f27, 7 * SIZE(AO) FMADD f1, f17, f20, f1 LFD f16, 8 * SIZE(AO) FMADD f5, f17, f21, f5 NOP2 FMADD f9, f17, f22, f9 NOP1 FMADD f13, f17, f23, f13 LFD f17, 9 * SIZE(AO) FMADD f2, f18, f20, f2 NOP1 FMADD f6, f18, f21, f6 NOP2 FMADD f10, f18, f22, f10 NOP1 FMADD f14, f18, f23, f14 LFD f18, 10 * SIZE(AO) FMADD f3, f19, f20, f3 LFD f20, 8 * SIZE(BO) FMADD f7, f19, f21, f7 LFD f21, 9 * SIZE(BO) FMADD f11, f19, f22, f11 LFD f22, 10 * SIZE(BO) FMADD f15, f19, f23, f15 LFD f19, 11 * SIZE(AO) FMADD f0, f24, f28, f0 LFD f23, 11 * SIZE(BO) FMADD f4, f24, f29, f4 NOP2 FMADD f8, f24, f30, f8 NOP1 FMADD f12, f24, f31, f12 LFD f24, 12 * SIZE(AO) FMADD f1, f25, f28, f1 NOP1 FMADD f5, f25, f29, f5 NOP2 FMADD f9, f25, f30, f9 NOP1 FMADD f13, f25, f31, f13 LFD f25, 13 * SIZE(AO) FMADD f2, f26, f28, f2 NOP1 FMADD f6, f26, f29, f6 NOP2 FMADD f10, f26, f30, f10 NOP1 FMADD f14, f26, f31, f14 LFD f26, 14 * SIZE(AO) FMADD f3, f27, f28, f3 LFD f28, 12 * SIZE(BO) FMADD f7, f27, f29, f7 LFD f29, 13 * SIZE(BO) FMADD f11, f27, f30, f11 LFD f30, 14 * SIZE(BO) FMADD f15, f27, f31, f15 LFD f27, 15 * SIZE(AO) FMADD f0, f16, f20, f0 LFD f31, 15 * SIZE(BO) FMADD f4, f16, f21, f4 NOP2 FMADD f8, f16, f22, f8 NOP1 FMADD f12, f16, f23, f12 LFD f16, 16 * SIZE(AO) FMADD f1, f17, f20, f1 NOP1 FMADD f5, f17, f21, f5 NOP2 FMADD f9, f17, f22, f9 NOP1 FMADD f13, f17, f23, f13 LFD f17, 17 * SIZE(AO) FMADD f2, f18, f20, f2 NOP1 FMADD f6, f18, f21, f6 NOP2 FMADD f10, f18, f22, f10 NOP1 FMADD f14, f18, f23, f14 LFD f18, 18 * SIZE(AO) FMADD f3, f19, f20, f3 LFD f20, 16 * SIZE(BO) FMADD f7, f19, f21, f7 LFD f21, 17 * SIZE(BO) FMADD f11, f19, f22, f11 LFD f22, 18 * SIZE(BO) FMADD f15, f19, f23, f15 LFD f19, 19 * SIZE(AO) FMADD f0, f24, f28, f0 LFD f23, 19 * SIZE(BO) FMADD f4, f24, f29, f4 NOP2 FMADD f8, f24, f30, f8 NOP1 FMADD f12, f24, f31, f12 LFD f24, 20 * SIZE(AO) FMADD f1, f25, f28, f1 NOP1 FMADD f5, f25, f29, f5 NOP2 FMADD f9, f25, f30, f9 NOP1 FMADD f13, f25, f31, f13 LFD f25, 21 * SIZE(AO) FMADD f2, f26, f28, f2 NOP1 FMADD f6, f26, f29, f6 NOP2 FMADD f10, f26, f30, f10 NOP1 FMADD f14, f26, f31, f14 LFD f26, 22 * SIZE(AO) FMADD f3, f27, f28, f3 LFD f28, 20 * SIZE(BO) FMADD f7, f27, f29, f7 LFD f29, 21 * SIZE(BO) FMADD f11, f27, f30, f11 LFD f30, 22 * SIZE(BO) FMADD f15, f27, f31, f15 addi AO, AO, 16 * SIZE addi BO, BO, 16 * SIZE bdnz LL(12) .align 4 LL(15): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(18) .align 4 LL(16): FMADD f0, f16, f20, f0 FMADD f5, f17, f21, f5 FMADD f10, f18, f22, f10 FMADD f15, f19, f23, f15 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f11, f19, f22, f11 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 FMADD f14, f18, f23, f14 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(16) .align 4 LL(18): #if defined(LN) || defined(RT) subi r0, KK, 4 slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f24, 8 * SIZE(BO) LFD f25, 9 * SIZE(BO) LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f10, f26, f10 FSUB f14, f27, f14 FSUB f3, f28, f3 FSUB f7, f29, f7 FSUB f11, f30, f11 FSUB f15, f31, f15 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f24, 8 * SIZE(AO) LFD f25, 9 * SIZE(AO) LFD f26, 10 * SIZE(AO) LFD f27, 11 * SIZE(AO) LFD f28, 12 * SIZE(AO) LFD f29, 13 * SIZE(AO) LFD f30, 14 * SIZE(AO) LFD f31, 15 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f10, f26, f10 FSUB f11, f27, f11 FSUB f12, f28, f12 FSUB f13, f29, f13 FSUB f14, f30, f14 FSUB f15, f31, f15 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FMUL f11, f16, f11 FMUL f15, f16, f15 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f10, f17, f11, f10 FNMSUB f14, f17, f15, f14 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f9, f18, f11, f9 FNMSUB f13, f18, f15, f13 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 FNMSUB f8, f19, f11, f8 FNMSUB f12, f19, f15, f12 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FMUL f10, f16, f10 FMUL f14, f16, f14 LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f9, f17, f10, f9 FNMSUB f13, f17, f14, f13 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FNMSUB f8, f18, f10, f8 FNMSUB f12, f18, f14, f12 FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB 
f10, f18, f8, f10 FNMSUB f14, f18, f12, f14 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 FNMSUB f11, f19, f8, f11 FNMSUB f15, f19, f12, f15 LFD f16, 5 * SIZE(AO) LFD f17, 6 * SIZE(AO) LFD f18, 7 * SIZE(AO) LFD f19, 10 * SIZE(AO) FMUL f1, f16, f1 FMUL f5, f16, f5 FMUL f9, f16, f9 FMUL f13, f16, f13 LFD f20, 11 * SIZE(AO) LFD f21, 15 * SIZE(AO) FNMSUB f2, f17, f1, f2 FNMSUB f6, f17, f5, f6 FNMSUB f10, f17, f9, f10 FNMSUB f14, f17, f13, f14 FNMSUB f3, f18, f1, f3 FNMSUB f7, f18, f5, f7 FNMSUB f11, f18, f9, f11 FNMSUB f15, f18, f13, f15 FMUL f2, f19, f2 FMUL f6, f19, f6 FMUL f10, f19, f10 FMUL f14, f19, f14 FNMSUB f3, f20, f2, f3 FNMSUB f7, f20, f6, f7 FNMSUB f11, f20, f10, f11 FNMSUB f15, f20, f14, f15 FMUL f3, f21, f3 FMUL f7, f21, f7 FMUL f11, f21, f11 FMUL f15, f21, f15 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 FNMSUB f14, f19, f2, f14 FNMSUB f15, f19, f3, f15 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FMUL f6, f16, f6 FMUL f7, f16, f7 LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f10, f17, f6, f10 FNMSUB f11, f17, f7, f11 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FNMSUB f14, f18, f6, f14 FNMSUB f15, f18, f7, f15 FMUL f8, f19, f8 FMUL f9, f19, f9 FMUL f10, f19, f10 FMUL f11, f19, f11 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FNMSUB f14, f20, f10, f14 FNMSUB f15, f20, f11, f15 FMUL f12, f21, f12 FMUL f13, f21, f13 FMUL f14, f21, f14 FMUL f15, f21, f15 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FMUL f14, f16, f14 FMUL f15, f16, f15 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f10, f17, f14, f10 FNMSUB f11, f17, f15, f11 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f6, f18, f14, f6 FNMSUB f7, f18, f15, f7 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 FNMSUB f2, f19, f14, f2 FNMSUB f3, f19, f15, f3 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FMUL f10, f16, f10 FMUL f11, f16, f11 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f6, f17, f10, f6 FNMSUB f7, f17, f11, f7 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE subi CO3, CO3, 4 * SIZE subi CO4, CO4, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) STFD f2, 8 * SIZE(BO) STFD f6, 9 * SIZE(BO) STFD f10, 10 * SIZE(BO) STFD f14, 11 * SIZE(BO) STFD f3, 12 * SIZE(BO) STFD f7, 13 * SIZE(BO) STFD f11, 14 * SIZE(BO) STFD f15, 15 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) 
STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) STFD f8, 8 * SIZE(AO) STFD f9, 9 * SIZE(AO) STFD f10, 10 * SIZE(AO) STFD f11, 11 * SIZE(AO) STFD f12, 12 * SIZE(AO) STFD f13, 13 * SIZE(AO) STFD f14, 14 * SIZE(AO) STFD f15, 15 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f10, 2 * SIZE(CO3) STFD f11, 3 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) STFD f14, 2 * SIZE(CO4) STFD f15, 3 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(11) .align 4 LL(39): #ifdef LN slwi r0, K, 2 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 4 #endif #ifdef RT subi KK, KK, 4 #endif addic. J, J, -1 lfs f0, FZERO bgt LL(10) .align 4 LL(40): andi. J, N, 2 ble LL(70) #ifdef RT slwi r0, K, 1 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif LL(60): andi. I, M, 1 ble LL(50) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(65) .align 5 LL(62): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f17, f22, f2 FMADD f3, f17, f23, f3 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f19, f26, f2 FMADD f3, f19, f27, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(62) .align 4 LL(65): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(68) .align 4 LL(66): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 LFD f16, 1 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 1 * SIZE bdnz LL(66) .align 4 LL(68): FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f20, f1 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FMUL f1, f18, f1 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(50): andi. I, M, 2 ble LL(41) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(55) .align 5 LL(52): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 FMADD f4, f18, f22, f4 FMADD f5, f19, f22, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f16, f24, f0 FMADD f1, f17, f24, f1 FMADD f2, f16, f25, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f26, f4 FMADD f5, f19, f26, f5 FMADD f6, f18, f27, f6 FMADD f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE DCBT(BO, PREB) bdnz LL(52) .align 4 LL(55): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(58) .align 4 LL(56): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 2 * SIZE bdnz LL(56) .align 4 LL(58): FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 2 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f2, f17, f2 FSUB f1, f20, f1 FSUB f3, f21, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f3, f19, f3 FNMSUB f0, f20, f1, f0 FNMSUB f2, f20, f3, f2 FMUL f0, f21, f0 FMUL f2, f21, f2 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f2, f16, f2 FNMSUB f1, f17, f0, f1 FNMSUB f3, f17, f2, f3 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f3, f17, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f2, f17, f0, f2 FNMSUB f3, f17, f1, f3 FMUL f2, f18, f2 FMUL f3, f18, f3 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f2, f19, f2 FMUL f3, f19, f3 FNMSUB f0, f20, f2, f0 FNMSUB f1, f20, f3, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f2, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(41): srawi. I, M, 2 ble LL(69) .align 4 LL(42): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC dcbt CO2, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbt CO1, PREC dcbt CO2, PREC srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(45) .align 5 LL(43): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE DCBT(BO, PREB) bdnz LL(43) .align 4 LL(45): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(48) .align 4 LL(46): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(46) .align 4 LL(48): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 2 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f28, 6 * SIZE(BO) LFD f29, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f3, f28, f3 FSUB f7, f29, f7 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FMUL f1, f19, f1 FMUL f5, f19, f5 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FMUL f0, f21, f0 FMUL f4, f21, f4 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, 
f0 FMUL f4, f16, f4 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, f5 FNMSUB f2, f18, f1, f2 FNMSUB f6, f18, f5, f6 FNMSUB f3, f19, f1, f3 FNMSUB f7, f19, f5, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FMUL f6, f18, f6 FNMSUB f3, f19, f2, f3 FNMSUB f7, f19, f6, f7 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 FMUL f7, f19, f7 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FMUL f4, f18, f4 FMUL f5, f18, f5 FMUL f6, f18, f6 FMUL f7, f18, f7 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f5, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f6, 5 * SIZE(BO) STFD f3, 6 * SIZE(BO) STFD f7, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(42) .align 4 LL(69): #ifdef LN slwi r0, K, 1 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif lfs f0, FZERO .align 4 LL(70): andi. J, N, 1 ble LL(999) #ifdef RT slwi r0, K, 0 + BASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO1, LDC #endif .align 4 LL(90): andi. I, M, 1 ble LL(80) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 3 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. 
r0, TEMP, 3 mtspr CTR, r0 #endif ble LL(95) .align 5 LL(92): FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(92) .align 4 LL(95): #if defined(LT) || defined(RN) andi. r0, KK, 7 #else andi. r0, TEMP, 7 #endif mtspr CTR, r0 ble+ LL(98) .align 4 LL(96): FMADD f0, f16, f20, f0 LFD f16, 1 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 1 * SIZE bdnz LL(96) .align 4 LL(98): FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 1 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) FSUB f0, f16, f0 #else LFD f16, 0 * SIZE(AO) FSUB f0, f16, f0 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(80): andi. I, M, 2 ble LL(71) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(85) .align 5 LL(82): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 4 * SIZE DCBT(BO, PREB) bdnz LL(82) .align 4 LL(85): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(88) .align 4 LL(86): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 2 * SIZE bdnz LL(86) .align 4 LL(88): FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(71): srawi. I, M, 2 ble LL(999) .align 4 LL(72): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbt CO1, PREC srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(75) .align 5 LL(73): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f21, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f23, f0 FMADD f1, f17, f23, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE DCBT(BO, PREB) bdnz LL(73) .align 4 LL(75): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(78) .align 4 LL(76): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 4 * SIZE bdnz LL(76) .align 4 LL(78): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 1 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) LFD f24, 2 * SIZE(BO) LFD f28, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 FSUB f2, f24, f2 FSUB f3, f28, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FNMSUB f2, f17, f3, f2 FNMSUB f1, f18, f3, f1 FNMSUB f0, f19, f3, f0 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FNMSUB f1, f17, f2, f1 FNMSUB f0, f18, f2, f0 FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FNMSUB f2, f18, f0, f2 FNMSUB f3, f19, f0, f3 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FNMSUB f2, f18, f1, f2 FNMSUB f3, f19, f1, f3 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FNMSUB f3, f19, f2, f3 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. 
I, I, -1 bgt+ LL(72) .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) lwz r18, 196(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/trsm_kernel_cell_LT.S000066400000000000000000001721471313527062700216430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define AORIG r18 #define TEMP r19 #define KK r20 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define CO3 r27 #define CO4 r28 #define PREA r29 #define PREB r30 #define PREC r31 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) stw r18, 196(SP) #endif stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef LN mullw r0, M, K slwi r0, r0, BASE_SHIFT add A, A, r0 slwi r0, M, BASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, BASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) #ifndef PREFETCHTEST #if defined(TRSMKERNEL) && defined(LN) /* Direction is special */ #ifdef PPC970 li PREC, -4 * SIZE #endif #ifdef POWER4 li PREC, -4 * SIZE #endif #ifdef POWER5 li PREC, -4 * SIZE #endif #ifdef CELL li PREC, -4 * SIZE #endif #else /* Normal prefetch */ #ifdef PPC970 li PREC, 4 * SIZE #endif #ifdef POWER4 li PREC, 4 * SIZE /* is 12 best? 
*/ #endif #ifdef POWER5 li PREC, 3 * SIZE #endif #endif #else #ifdef linux #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) lwz PREC, FRAMESLOT(1) + STACKSIZE(SP) #else ld PREA, FRAMESLOT(0) + STACKSIZE(SP) ld PREB, FRAMESLOT(1) + STACKSIZE(SP) ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld PREA, FRAMESLOT(0) + STACKSIZE(SP) ld PREB, FRAMESLOT(1) + STACKSIZE(SP) ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #else #ifdef DOUBLE lwz PREA, FRAMESLOT(1) + STACKSIZE(SP) lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else lwz PREA, FRAMESLOT(0) + STACKSIZE(SP) lwz PREB, FRAMESLOT(1) + STACKSIZE(SP) lwz PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #endif #endif #ifndef PREFETCHTEST #ifdef PPC970 #ifdef ALLOC_HUGETLB li PREA, (16 * 5 * SIZE | 1) li PREB, (16 * 5 * SIZE | 3) #else li PREA, (16 * 14 * SIZE | 1) li PREB, (16 * 8 * SIZE | 3) #endif #endif #ifdef POWER4 #ifdef ALLOC_HUGETLB li PREA, (16 * 1 * SIZE + 16) li PREB, (16 * 1 * SIZE + 16) #else li PREA, (16 * 2 * SIZE + 16) li PREB, (16 * 2 * SIZE + 16) #endif #endif #ifdef POWER5 #ifdef ALLOC_HUGETLB li PREA, (16 * 7 * SIZE | 1) li PREB, (16 * 7 * SIZE | 3) #else li PREA, (16 * 12 * SIZE | 1) li PREB, (16 * 6 * SIZE | 3) #endif #endif #ifdef CELL li PREA, (16 * 12 * SIZE) li PREB, (16 * 12 * SIZE) #endif #endif lfs f0, FZERO srawi. J, N, 2 ble LL(40) .align 4 LL(10): #ifdef RT slwi r0, K, 2 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 2 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO4, LDC #endif ble LL(20) .align 4 LL(11): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f28, 4 * SIZE(B) LFD f29, 5 * SIZE(B) LFD f30, 6 * SIZE(B) dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbt CO1, PREC dcbt CO2, PREC dcbt CO3, PREC dcbt CO4, PREC srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(15) .align 4 #define NOP1 mr r18, r18 #define NOP2 mr r19, r19 LL(12): FMADD f0, f16, f20, f0 dcbt AO, PREA FMADD f4, f16, f21, f4 dcbt BO, PREB FMADD f8, f16, f22, f8 LFD f31, 7 * SIZE(BO) FMADD f12, f16, f23, f12 LFD f27, 7 * SIZE(AO) FMADD f1, f17, f20, f1 LFD f16, 8 * SIZE(AO) FMADD f5, f17, f21, f5 NOP2 FMADD f9, f17, f22, f9 NOP1 FMADD f13, f17, f23, f13 LFD f17, 9 * SIZE(AO) FMADD f2, f18, f20, f2 NOP1 FMADD f6, f18, f21, f6 NOP2 FMADD f10, f18, f22, f10 NOP1 FMADD f14, f18, f23, f14 LFD f18, 10 * SIZE(AO) FMADD f3, f19, f20, f3 LFD f20, 8 * SIZE(BO) FMADD f7, f19, f21, f7 LFD f21, 9 * SIZE(BO) FMADD f11, f19, f22, f11 LFD f22, 10 * SIZE(BO) FMADD f15, f19, f23, f15 LFD f19, 11 * SIZE(AO) FMADD f0, f24, f28, f0 LFD f23, 11 * SIZE(BO) FMADD f4, f24, f29, f4 NOP2 FMADD f8, f24, f30, f8 NOP1 FMADD f12, f24, f31, f12 LFD f24, 12 * SIZE(AO) FMADD f1, f25, f28, f1 NOP1 FMADD f5, f25, f29, f5 NOP2 FMADD f9, f25, f30, f9 NOP1 FMADD f13, f25, f31, f13 LFD f25, 13 * SIZE(AO) FMADD f2, f26, f28, f2 NOP1 FMADD f6, f26, f29, f6 NOP2 FMADD f10, f26, f30, f10 NOP1 FMADD f14, f26, f31, f14 LFD f26, 14 * SIZE(AO) FMADD f3, f27, f28, f3 LFD f28, 12 * SIZE(BO) FMADD f7, f27, f29, f7 LFD f29, 13 * SIZE(BO) FMADD f11, f27, f30, f11 LFD f30, 14 * SIZE(BO) FMADD f15, f27, f31, f15 LFD f27, 15 * SIZE(AO) FMADD f0, f16, f20, f0 LFD f31, 15 * SIZE(BO) FMADD f4, f16, f21, f4 NOP2 FMADD f8, f16, f22, f8 NOP1 FMADD f12, f16, f23, f12 LFD f16, 16 * SIZE(AO) FMADD f1, f17, f20, f1 NOP1 FMADD f5, f17, f21, f5 NOP2 FMADD f9, f17, f22, f9 NOP1 FMADD f13, f17, f23, f13 LFD f17, 17 * SIZE(AO) FMADD f2, f18, f20, f2 NOP1 FMADD f6, f18, f21, f6 NOP2 FMADD f10, f18, f22, f10 NOP1 FMADD f14, f18, f23, f14 LFD f18, 18 * SIZE(AO) FMADD f3, f19, f20, f3 LFD f20, 16 * SIZE(BO) FMADD f7, f19, f21, f7 LFD f21, 17 * SIZE(BO) FMADD f11, f19, f22, f11 LFD f22, 18 * SIZE(BO) FMADD f15, f19, f23, f15 LFD f19, 19 * SIZE(AO) FMADD f0, f24, f28, f0 LFD f23, 19 * SIZE(BO) FMADD f4, f24, f29, f4 NOP2 FMADD f8, f24, f30, f8 NOP1 FMADD f12, f24, f31, f12 LFD f24, 20 * SIZE(AO) FMADD f1, f25, f28, f1 NOP1 FMADD f5, f25, f29, f5 NOP2 FMADD f9, f25, f30, f9 NOP1 FMADD f13, f25, f31, f13 LFD f25, 21 * SIZE(AO) FMADD f2, f26, f28, f2 NOP1 FMADD f6, f26, f29, f6 NOP2 FMADD f10, f26, f30, f10 NOP1 FMADD f14, f26, f31, f14 LFD f26, 22 * SIZE(AO) FMADD f3, f27, f28, f3 LFD f28, 20 * SIZE(BO) FMADD f7, f27, f29, f7 LFD f29, 21 * SIZE(BO) FMADD f11, f27, f30, f11 LFD f30, 22 * SIZE(BO) FMADD f15, f27, f31, f15 addi AO, AO, 16 * SIZE addi BO, BO, 16 * SIZE bdnz LL(12) .align 4 LL(15): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(18) .align 4 LL(16): FMADD f0, f16, f20, f0 FMADD f5, f17, f21, f5 FMADD f10, f18, f22, f10 FMADD f15, f19, f23, f15 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f11, f19, f22, f11 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 FMADD f14, f18, f23, f14 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(16) .align 4 LL(18): #if defined(LN) || defined(RT) subi r0, KK, 4 slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f24, 8 * SIZE(BO) LFD f25, 9 * SIZE(BO) LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f10, f26, f10 FSUB f14, f27, f14 FSUB f3, f28, f3 FSUB f7, f29, f7 FSUB f11, f30, f11 FSUB f15, f31, f15 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f24, 8 * SIZE(AO) LFD f25, 9 * SIZE(AO) LFD f26, 10 * SIZE(AO) LFD f27, 11 * SIZE(AO) LFD f28, 12 * SIZE(AO) LFD f29, 13 * SIZE(AO) LFD f30, 14 * SIZE(AO) LFD f31, 15 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f10, f26, f10 FSUB f11, f27, f11 FSUB f12, f28, f12 FSUB f13, f29, f13 FSUB f14, f30, f14 FSUB f15, f31, f15 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FMUL f11, f16, f11 FMUL f15, f16, f15 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f10, f17, f11, f10 FNMSUB f14, f17, f15, f14 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f9, f18, f11, f9 FNMSUB f13, f18, f15, f13 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 FNMSUB f8, f19, f11, f8 FNMSUB f12, f19, f15, f12 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FMUL f10, f16, f10 FMUL f14, f16, f14 LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f9, f17, f10, f9 FNMSUB f13, f17, f14, f13 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FNMSUB f8, f18, f10, f8 FNMSUB f12, f18, f14, f12 FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB 
f10, f18, f8, f10 FNMSUB f14, f18, f12, f14 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 FNMSUB f11, f19, f8, f11 FNMSUB f15, f19, f12, f15 LFD f16, 5 * SIZE(AO) LFD f17, 6 * SIZE(AO) LFD f18, 7 * SIZE(AO) LFD f19, 10 * SIZE(AO) FMUL f1, f16, f1 FMUL f5, f16, f5 FMUL f9, f16, f9 FMUL f13, f16, f13 LFD f20, 11 * SIZE(AO) LFD f21, 15 * SIZE(AO) FNMSUB f2, f17, f1, f2 FNMSUB f6, f17, f5, f6 FNMSUB f10, f17, f9, f10 FNMSUB f14, f17, f13, f14 FNMSUB f3, f18, f1, f3 FNMSUB f7, f18, f5, f7 FNMSUB f11, f18, f9, f11 FNMSUB f15, f18, f13, f15 FMUL f2, f19, f2 FMUL f6, f19, f6 FMUL f10, f19, f10 FMUL f14, f19, f14 FNMSUB f3, f20, f2, f3 FNMSUB f7, f20, f6, f7 FNMSUB f11, f20, f10, f11 FNMSUB f15, f20, f14, f15 FMUL f3, f21, f3 FMUL f7, f21, f7 FMUL f11, f21, f11 FMUL f15, f21, f15 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 FNMSUB f14, f19, f2, f14 FNMSUB f15, f19, f3, f15 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FMUL f6, f16, f6 FMUL f7, f16, f7 LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f10, f17, f6, f10 FNMSUB f11, f17, f7, f11 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FNMSUB f14, f18, f6, f14 FNMSUB f15, f18, f7, f15 FMUL f8, f19, f8 FMUL f9, f19, f9 FMUL f10, f19, f10 FMUL f11, f19, f11 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FNMSUB f14, f20, f10, f14 FNMSUB f15, f20, f11, f15 FMUL f12, f21, f12 FMUL f13, f21, f13 FMUL f14, f21, f14 FMUL f15, f21, f15 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FMUL f14, f16, f14 FMUL f15, f16, f15 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f10, f17, f14, f10 FNMSUB f11, f17, f15, f11 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f6, f18, f14, f6 FNMSUB f7, f18, f15, f7 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 FNMSUB f2, f19, f14, f2 FNMSUB f3, f19, f15, f3 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FMUL f10, f16, f10 FMUL f11, f16, f11 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f6, f17, f10, f6 FNMSUB f7, f17, f11, f7 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE subi CO3, CO3, 4 * SIZE subi CO4, CO4, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) STFD f2, 8 * SIZE(BO) STFD f6, 9 * SIZE(BO) STFD f10, 10 * SIZE(BO) STFD f14, 11 * SIZE(BO) STFD f3, 12 * SIZE(BO) STFD f7, 13 * SIZE(BO) STFD f11, 14 * SIZE(BO) STFD f15, 15 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) 
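/* The solved 4x4 tile is written back to the work buffer (BO for LN/LT, AO for RN/RT) and then copied out to C through CO1..CO4 below. */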
STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) STFD f8, 8 * SIZE(AO) STFD f9, 9 * SIZE(AO) STFD f10, 10 * SIZE(AO) STFD f11, 11 * SIZE(AO) STFD f12, 12 * SIZE(AO) STFD f13, 13 * SIZE(AO) STFD f14, 14 * SIZE(AO) STFD f15, 15 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f10, 2 * SIZE(CO3) STFD f11, 3 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) STFD f14, 2 * SIZE(CO4) STFD f15, 3 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(11) .align 4 LL(20): andi. I, M, 2 ble LL(30) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(25) .align 5 LL(22): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 16 * SIZE DCBT(BO, PREB) bdnz LL(22) fadd f0, f2, f0 fadd f1, f3, f1 fadd f4, f6, f4 fadd f5, f7, f5 fadd f8, f10, f8 fadd f9, f11, f9 fadd f12, f14, f12 fadd f13, f15, f13 .align 4 LL(25): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(28) .align 4 LL(26): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 2 * SIZE bdnz LL(26) .align 4 LL(28): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 4 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f28, 6 * SIZE(AO) LFD f29, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f12, f28, f12 FSUB f13, f29, f13 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, 
f5 FMUL f9, f17, f9 FMUL f13, f17, f13 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FMUL f8, f19, f8 FMUL f9, f19, f9 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FMUL f12, f21, f12 FMUL f13, f21, f13 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FMUL f4, f19, f4 FMUL f5, f19, f5 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE subi CO3, CO3, 2 * SIZE subi CO4, CO4, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f4, 2 * SIZE(AO) STFD f5, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f12, 6 * SIZE(AO) STFD f13, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(30): andi. I, M, 1 ble LL(39) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(35) .align 5 LL(32): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f1, f17, f24, f1 FMADD f5, f17, f25, f5 FMADD f9, f17, f26, f9 FMADD f13, f17, f27, f13 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f0, f18, f20, f0 FMADD f4, f18, f21, f4 FMADD f8, f18, f22, f8 FMADD f12, f18, f23, f12 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f1, f19, f24, f1 FMADD f5, f19, f25, f5 FMADD f9, f19, f26, f9 FMADD f13, f19, f27, f13 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 16 * SIZE DCBT(BO, PREB) bdnz LL(32) fadd f0, f1, f0 fadd f4, f5, f4 fadd f8, f9, f8 fadd f12, f13, f12 .align 4 LL(35): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(38) .align 4 LL(36): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f16, 1 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 1 * SIZE bdnz LL(36) .align 4 LL(38): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 4 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) LFD f24, 2 * SIZE(AO) LFD f28, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f4, f20, f4 FSUB f8, f24, f8 FSUB f12, f28, f12 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f4, f17, f0, f4 FNMSUB f8, f18, f0, f8 FNMSUB f12, f19, f0, f12 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FNMSUB f8, f17, f4, f8 FNMSUB f12, f18, f4, f12 FMUL f8, f19, f8 FNMSUB f12, f20, f8, f12 FMUL f12, f21, f12 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FNMSUB f8, f17, f12, f8 FNMSUB f4, f18, f12, f4 FNMSUB f0, f19, f12, f0 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f0, f18, f8, f0 FMUL f4, f19, f4 FNMSUB f0, f20, f4, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE subi CO3, CO3, 1 * SIZE subi CO4, CO4, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f4, 1 * SIZE(AO) STFD f8, 2 * SIZE(AO) STFD f12, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) lfs f0, FZERO 
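/* f0 holds zero again (reloaded from FZERO); the fmr sequence below clears the accumulator registers before the next block. */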
fmr f1, f0 fmr f4, f0 fmr f5, f0 fmr f8, f0 fmr f9, f0 fmr f12, f0 fmr f13, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE addi CO3, CO3, 1 * SIZE addi CO4, CO4, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(39): #ifdef LN slwi r0, K, 2 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 4 #endif #ifdef RT subi KK, KK, 4 #endif addic. J, J, -1 lfs f0, FZERO bgt LL(10) .align 4 LL(40): andi. J, N, 2 ble LL(70) #ifdef RT slwi r0, K, 1 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif ble LL(50) .align 4 LL(41): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC dcbt CO2, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbt CO1, PREC dcbt CO2, PREC srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(45) .align 5 LL(42): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE DCBT(BO, PREB) bdnz LL(42) .align 4 LL(45): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(48) .align 4 LL(46): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(46) .align 4 LL(48): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 2 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f28, 6 * SIZE(BO) LFD f29, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f3, f28, f3 FSUB f7, f29, f7 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FMUL f1, f19, f1 FMUL f5, f19, f5 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FMUL f0, f21, f0 FMUL f4, f21, f4 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, f5 FNMSUB f2, f18, f1, f2 FNMSUB f6, f18, f5, f6 FNMSUB f3, f19, f1, f3 FNMSUB f7, f19, f5, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FMUL f6, f18, f6 FNMSUB f3, f19, f2, f3 FNMSUB f7, f19, f6, f7 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 FMUL f7, f19, f7 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FMUL f4, f18, f4 FMUL f5, f18, f5 FMUL f6, f18, f6 FMUL f7, f18, f7 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f5, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f6, 5 * SIZE(BO) STFD f3, 6 * SIZE(BO) STFD f7, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) 
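/* Solved 4x2 tile: stored interleaved into BO for LN/LT, in row order into AO otherwise, then written to C via CO1/CO2 below. */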
STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(41) .align 4 LL(50): andi. I, M, 2 ble LL(60) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(55) .align 5 LL(52): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 FMADD f4, f18, f22, f4 FMADD f5, f19, f22, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f16, f24, f0 FMADD f1, f17, f24, f1 FMADD f2, f16, f25, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f26, f4 FMADD f5, f19, f26, f5 FMADD f6, f18, f27, f6 FMADD f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE DCBT(BO, PREB) bdnz LL(52) .align 4 LL(55): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(58) .align 4 LL(56): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 2 * SIZE bdnz LL(56) .align 4 LL(58): FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 2 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f2, f17, f2 FSUB f1, f20, f1 FSUB f3, f21, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f3, f19, f3 FNMSUB f0, f20, f1, f0 FNMSUB f2, f20, f3, f2 FMUL f0, f21, f0 FMUL f2, f21, f2 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f2, f16, f2 FNMSUB f1, f17, f0, f1 FNMSUB f3, f17, f2, f3 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f3, f17, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f2, f17, f0, f2 FNMSUB f3, f17, f1, f3 FMUL f2, f18, f2 FMUL f3, f18, f3 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f2, f19, f2 FMUL f3, f19, f3 FNMSUB f0, f20, f2, f0 FNMSUB f1, f20, f3, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f2, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(60): andi. I, M, 1 ble LL(69) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(65) .align 5 LL(62): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f17, f22, f2 FMADD f3, f17, f23, f3 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f19, f26, f2 FMADD f3, f19, f27, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(62) .align 4 LL(65): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(68) .align 4 LL(66): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 LFD f16, 1 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 1 * SIZE bdnz LL(66) .align 4 LL(68): FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f20, f1 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FMUL f1, f18, f1 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(69): #ifdef LN slwi r0, K, 1 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif lfs f0, FZERO .align 4 LL(70): andi. J, N, 1 ble LL(999) #ifdef RT slwi r0, K, 0 + BASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO1, LDC #endif ble LL(80) .align 4 LL(71): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC srawi. 
r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbt CO1, PREC srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(75) .align 5 LL(72): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f21, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f23, f0 FMADD f1, f17, f23, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE DCBT(BO, PREB) bdnz LL(72) .align 4 LL(75): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(78) .align 4 LL(76): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 4 * SIZE bdnz LL(76) .align 4 LL(78): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 1 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) LFD f24, 2 * SIZE(BO) LFD f28, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 FSUB f2, f24, f2 FSUB f3, f28, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FNMSUB f2, f17, f3, f2 FNMSUB f1, f18, f3, f1 FNMSUB f0, f19, f3, f0 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FNMSUB f1, f17, f2, f1 FNMSUB f0, f18, f2, f0 FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FNMSUB f2, f18, f0, f2 FNMSUB f3, f19, f0, f3 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FNMSUB f2, f18, f1, f2 FNMSUB f3, f19, f1, f3 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FNMSUB f3, f19, f2, f3 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * 
SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(71) .align 4 LL(80): andi. I, M, 2 ble LL(90) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(85) .align 5 LL(82): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 4 * SIZE DCBT(BO, PREB) bdnz LL(82) .align 4 LL(85): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(88) .align 4 LL(86): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 2 * SIZE bdnz LL(86) .align 4 LL(88): FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(90): andi. I, M, 1 ble LL(999) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 3 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 3 mtspr CTR, r0 #endif ble LL(95) .align 5 LL(92): FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(92) .align 4 LL(95): #if defined(LT) || defined(RN) andi. r0, KK, 7 #else andi. 
r0, TEMP, 7 #endif mtspr CTR, r0 ble+ LL(98) .align 4 LL(96): FMADD f0, f16, f20, f0 LFD f16, 1 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 1 * SIZE bdnz LL(96) .align 4 LL(98): FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 1 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) FSUB f0, f16, f0 #else LFD f16, 0 * SIZE(AO) FSUB f0, f16, f0 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) #ifndef LN addi CO1, CO1, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) lwz r18, 196(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/trsm_kernel_cell_RT.S000066400000000000000000001716641313527062700216540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define AORIG r18 #define TEMP r19 #define KK r20 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define CO3 r27 #define CO4 r28 #define PREA r29 #define PREB r30 #define PREC r31 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) stw r18, 196(SP) #endif stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef LN mullw r0, M, K slwi r0, r0, BASE_SHIFT add A, A, r0 slwi r0, M, BASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, BASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 
0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) #ifndef PREFETCHTEST li PREC, -4 * SIZE #else #ifdef linux #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) lwz PREC, FRAMESLOT(1) + STACKSIZE(SP) #else ld PREA, FRAMESLOT(0) + STACKSIZE(SP) ld PREB, FRAMESLOT(1) + STACKSIZE(SP) ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld PREA, FRAMESLOT(0) + STACKSIZE(SP) ld PREB, FRAMESLOT(1) + STACKSIZE(SP) ld PREC, FRAMESLOT(2) + STACKSIZE(SP) #else #ifdef DOUBLE lwz PREA, FRAMESLOT(1) + STACKSIZE(SP) lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else lwz PREA, FRAMESLOT(0) + STACKSIZE(SP) lwz PREB, FRAMESLOT(1) + STACKSIZE(SP) lwz PREC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #endif #endif #ifndef PREFETCHTEST #ifdef PPC970 #ifdef ALLOC_HUGETLB li PREA, (16 * 5 * SIZE | 1) li PREB, (16 * 5 * SIZE | 3) #else li PREA, (16 * 14 * SIZE | 1) li PREB, (16 * 8 * SIZE | 3) #endif #endif #ifdef POWER4 #ifdef ALLOC_HUGETLB li PREA, (16 * 1 * SIZE + 16) li PREB, (16 * 1 * SIZE + 16) #else li PREA, (16 * 2 * SIZE + 16) li PREB, (16 * 2 * SIZE + 16) #endif #endif #ifdef POWER5 #ifdef ALLOC_HUGETLB li PREA, (16 * 7 * SIZE | 1) li PREB, (16 * 7 * SIZE | 3) #else li PREA, (16 * 12 * SIZE | 1) li PREB, (16 * 6 * SIZE | 3) #endif #endif #ifdef CELL li PREA, (16 * 12 * SIZE) li PREB, (16 * 12 * SIZE) #endif #endif lfs f0, FZERO LL(70): andi. J, N, 1 ble LL(40) #ifdef RT slwi r0, K, 0 + BASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO1, LDC #endif ble LL(80) .align 4 LL(71): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbt CO1, PREC srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(75) .align 5 LL(72): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f21, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f23, f0 FMADD f1, f17, f23, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE DCBT(BO, PREB) bdnz LL(72) .align 4 LL(75): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(78) .align 4 LL(76): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 4 * SIZE bdnz LL(76) .align 4 LL(78): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 1 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) LFD f24, 2 * SIZE(BO) LFD f28, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 FSUB f2, f24, f2 FSUB f3, f28, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FNMSUB f2, f17, f3, f2 FNMSUB f1, f18, f3, f1 FNMSUB f0, f19, f3, f0 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FNMSUB f1, f17, f2, f1 FNMSUB f0, f18, f2, f0 FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FNMSUB f2, f18, f0, f2 FNMSUB f3, f19, f0, f3 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FNMSUB f2, f18, f1, f2 FNMSUB f3, f19, f1, f3 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FNMSUB f3, f19, f2, f3 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(71) .align 4 LL(80): andi. I, M, 2 ble LL(90) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(85) .align 5 LL(82): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 4 * SIZE DCBT(BO, PREB) bdnz LL(82) .align 4 LL(85): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(88) .align 4 LL(86): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 2 * SIZE bdnz LL(86) .align 4 LL(88): FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(90): andi. I, M, 1 ble LL(99) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 3 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. 
r0, TEMP, 3 mtspr CTR, r0 #endif ble LL(95) .align 5 LL(92): FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(92) .align 4 LL(95): #if defined(LT) || defined(RN) andi. r0, KK, 7 #else andi. r0, TEMP, 7 #endif mtspr CTR, r0 ble+ LL(98) .align 4 LL(96): FMADD f0, f16, f20, f0 LFD f16, 1 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 1 * SIZE bdnz LL(96) .align 4 LL(98): FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 1 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) FSUB f0, f16, f0 #else LFD f16, 0 * SIZE(AO) FSUB f0, f16, f0 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) lfs f0, FZERO #ifndef LN addi CO1, CO1, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(99): #ifdef LN slwi r0, K, 0 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 LL(40): andi. J, N, 2 ble LL(09) #ifdef RT slwi r0, K, 1 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif ble LL(50) .align 4 LL(41): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC dcbt CO2, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbt CO1, PREC dcbt CO2, PREC srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(45) .align 5 LL(42): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE DCBT(BO, PREB) bdnz LL(42) .align 4 LL(45): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(48) .align 4 LL(46): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(46) .align 4 LL(48): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 2 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f28, 6 * SIZE(BO) LFD f29, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f3, f28, f3 FSUB f7, f29, f7 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FMUL f1, f19, f1 FMUL f5, f19, f5 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FMUL f0, f21, f0 FMUL f4, f21, f4 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, 
f0 FMUL f4, f16, f4 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, f5 FNMSUB f2, f18, f1, f2 FNMSUB f6, f18, f5, f6 FNMSUB f3, f19, f1, f3 FNMSUB f7, f19, f5, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FMUL f6, f18, f6 FNMSUB f3, f19, f2, f3 FNMSUB f7, f19, f6, f7 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 FMUL f7, f19, f7 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FMUL f4, f18, f4 FMUL f5, f18, f5 FMUL f6, f18, f6 FMUL f7, f18, f7 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f5, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f6, 5 * SIZE(BO) STFD f3, 6 * SIZE(BO) STFD f7, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(41) .align 4 LL(50): andi. I, M, 2 ble LL(60) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(55) .align 5 LL(52): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 FMADD f4, f18, f22, f4 FMADD f5, f19, f22, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f16, f24, f0 FMADD f1, f17, f24, f1 FMADD f2, f16, f25, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f26, f4 FMADD f5, f19, f26, f5 FMADD f6, f18, f27, f6 FMADD f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE DCBT(BO, PREB) bdnz LL(52) .align 4 LL(55): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(58) .align 4 LL(56): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 2 * SIZE bdnz LL(56) .align 4 LL(58): FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 2 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f2, f17, f2 FSUB f1, f20, f1 FSUB f3, f21, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f3, f19, f3 FNMSUB f0, f20, f1, f0 FNMSUB f2, f20, f3, f2 FMUL f0, f21, f0 FMUL f2, f21, f2 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f2, f16, f2 FNMSUB f1, f17, f0, f1 FNMSUB f3, f17, f2, f3 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f3, f17, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f2, f17, f0, f2 FNMSUB f3, f17, f1, f3 FMUL f2, f18, f2 FMUL f3, f18, f3 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f2, f19, f2 FMUL f3, f19, f3 FNMSUB f0, f20, f2, f0 FNMSUB f1, f20, f3, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f2, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(60): andi. 
I, M, 1 ble LL(69) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(65) .align 5 LL(62): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f17, f22, f2 FMADD f3, f17, f23, f3 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f19, f26, f2 FMADD f3, f19, f27, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(62) .align 4 LL(65): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(68) .align 4 LL(66): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 LFD f16, 1 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 1 * SIZE bdnz LL(66) .align 4 LL(68): FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f20, f1 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FMUL f1, f18, f1 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(69): #ifdef LN slwi r0, K, 1 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif lfs f0, FZERO .align 4 LL(09): srawi. 
J, N, 2 ble LL(999) .align 4 LL(10): #ifdef RT slwi r0, K, 2 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 2 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO4, LDC #endif ble LL(20) .align 4 LL(11): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC dcbt CO2, PREC dcbt CO3, PREC dcbt CO4, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) dcbt CO1, PREC dcbt CO2, PREC dcbt CO3, PREC dcbt CO4, PREC srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(15) .align 4 #define NOP1 mr r18, r18 #define NOP2 mr r19, r19 LL(12): FMADD f0, f16, f20, f0 dcbt AO, PREA FMADD f4, f16, f21, f4 dcbt BO, PREB FMADD f8, f16, f22, f8 LFD f31, 7 * SIZE(BO) FMADD f12, f16, f23, f12 LFD f27, 7 * SIZE(AO) FMADD f1, f17, f20, f1 LFD f16, 8 * SIZE(AO) FMADD f5, f17, f21, f5 NOP2 FMADD f9, f17, f22, f9 NOP1 FMADD f13, f17, f23, f13 LFD f17, 9 * SIZE(AO) FMADD f2, f18, f20, f2 NOP1 FMADD f6, f18, f21, f6 NOP2 FMADD f10, f18, f22, f10 NOP1 FMADD f14, f18, f23, f14 LFD f18, 10 * SIZE(AO) FMADD f3, f19, f20, f3 LFD f20, 8 * SIZE(BO) FMADD f7, f19, f21, f7 LFD f21, 9 * SIZE(BO) FMADD f11, f19, f22, f11 LFD f22, 10 * SIZE(BO) FMADD f15, f19, f23, f15 LFD f19, 11 * SIZE(AO) FMADD f0, f24, f28, f0 LFD f23, 11 * SIZE(BO) FMADD f4, f24, f29, f4 NOP2 FMADD f8, f24, f30, f8 NOP1 FMADD f12, f24, f31, f12 LFD f24, 12 * SIZE(AO) FMADD f1, f25, f28, f1 NOP1 FMADD f5, f25, f29, f5 NOP2 FMADD f9, f25, f30, f9 NOP1 FMADD f13, f25, f31, f13 LFD f25, 13 * SIZE(AO) FMADD f2, f26, f28, f2 NOP1 FMADD f6, f26, f29, f6 NOP2 FMADD f10, f26, f30, f10 NOP1 FMADD f14, f26, f31, f14 LFD f26, 14 * SIZE(AO) FMADD f3, f27, f28, f3 LFD f28, 12 * SIZE(BO) FMADD f7, f27, f29, f7 LFD f29, 13 * SIZE(BO) FMADD f11, f27, f30, f11 LFD f30, 14 * SIZE(BO) FMADD f15, f27, f31, f15 LFD f27, 15 * SIZE(AO) FMADD f0, f16, f20, f0 LFD f31, 15 * SIZE(BO) FMADD f4, f16, f21, f4 NOP2 FMADD f8, f16, f22, f8 NOP1 FMADD f12, f16, f23, f12 LFD f16, 16 * SIZE(AO) FMADD f1, f17, f20, f1 NOP1 FMADD f5, f17, f21, f5 NOP2 FMADD f9, f17, f22, f9 NOP1 FMADD f13, f17, f23, f13 LFD f17, 17 * SIZE(AO) FMADD f2, f18, f20, f2 NOP1 FMADD f6, f18, f21, f6 NOP2 FMADD f10, f18, f22, f10 NOP1 FMADD f14, f18, f23, f14 LFD f18, 18 * SIZE(AO) FMADD f3, f19, f20, f3 LFD f20, 16 * SIZE(BO) FMADD f7, f19, f21, f7 LFD f21, 17 * SIZE(BO) FMADD f11, f19, f22, f11 LFD f22, 18 * SIZE(BO) FMADD f15, f19, f23, f15 LFD f19, 19 * SIZE(AO) FMADD f0, f24, f28, f0 LFD f23, 19 * SIZE(BO) FMADD f4, f24, f29, f4 NOP2 FMADD f8, f24, f30, f8 NOP1 FMADD f12, f24, f31, f12 LFD f24, 20 * SIZE(AO) FMADD f1, f25, f28, f1 NOP1 FMADD f5, f25, f29, f5 NOP2 
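/* NOP1 and NOP2 are defined just above as "mr r18, r18" and "mr r19, r19": harmless register copies interleaved with the FMADD/LFD stream, presumably to pad dispatch groups on these PowerPC cores. Throughout LL(12), f0-f15 accumulate the 4x4 block of partial products while f16-f31 stream operands from the A and B panels. */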
FMADD f9, f25, f30, f9 NOP1 FMADD f13, f25, f31, f13 LFD f25, 21 * SIZE(AO) FMADD f2, f26, f28, f2 NOP1 FMADD f6, f26, f29, f6 NOP2 FMADD f10, f26, f30, f10 NOP1 FMADD f14, f26, f31, f14 LFD f26, 22 * SIZE(AO) FMADD f3, f27, f28, f3 LFD f28, 20 * SIZE(BO) FMADD f7, f27, f29, f7 LFD f29, 21 * SIZE(BO) FMADD f11, f27, f30, f11 LFD f30, 22 * SIZE(BO) FMADD f15, f27, f31, f15 addi AO, AO, 16 * SIZE addi BO, BO, 16 * SIZE bdnz LL(12) .align 4 LL(15): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(18) .align 4 LL(16): FMADD f0, f16, f20, f0 FMADD f5, f17, f21, f5 FMADD f10, f18, f22, f10 FMADD f15, f19, f23, f15 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f11, f19, f22, f11 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 FMADD f14, f18, f23, f14 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(16) .align 4 LL(18): #if defined(LN) || defined(RT) subi r0, KK, 4 slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f24, 8 * SIZE(BO) LFD f25, 9 * SIZE(BO) LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f10, f26, f10 FSUB f14, f27, f14 FSUB f3, f28, f3 FSUB f7, f29, f7 FSUB f11, f30, f11 FSUB f15, f31, f15 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f24, 8 * SIZE(AO) LFD f25, 9 * SIZE(AO) LFD f26, 10 * SIZE(AO) LFD f27, 11 * SIZE(AO) LFD f28, 12 * SIZE(AO) LFD f29, 13 * SIZE(AO) LFD f30, 14 * SIZE(AO) LFD f31, 15 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f10, f26, f10 FSUB f11, f27, f11 FSUB f12, f28, f12 FSUB f13, f29, f13 FSUB f14, f30, f14 FSUB f15, f31, f15 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FMUL f11, f16, f11 FMUL f15, f16, f15 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f10, f17, f11, f10 FNMSUB f14, f17, f15, f14 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f9, f18, f11, f9 FNMSUB f13, f18, f15, f13 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 FNMSUB f8, f19, f11, f8 FNMSUB f12, f19, f15, f12 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FMUL f10, f16, f10 FMUL f14, f16, f14 LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f9, f17, f10, f9 FNMSUB f13, f17, f14, f13 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FNMSUB f8, f18, f10, f8 FNMSUB f12, f18, f14, f12 FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 
FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB f10, f18, f8, f10 FNMSUB f14, f18, f12, f14 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 FNMSUB f11, f19, f8, f11 FNMSUB f15, f19, f12, f15 LFD f16, 5 * SIZE(AO) LFD f17, 6 * SIZE(AO) LFD f18, 7 * SIZE(AO) LFD f19, 10 * SIZE(AO) FMUL f1, f16, f1 FMUL f5, f16, f5 FMUL f9, f16, f9 FMUL f13, f16, f13 LFD f20, 11 * SIZE(AO) LFD f21, 15 * SIZE(AO) FNMSUB f2, f17, f1, f2 FNMSUB f6, f17, f5, f6 FNMSUB f10, f17, f9, f10 FNMSUB f14, f17, f13, f14 FNMSUB f3, f18, f1, f3 FNMSUB f7, f18, f5, f7 FNMSUB f11, f18, f9, f11 FNMSUB f15, f18, f13, f15 FMUL f2, f19, f2 FMUL f6, f19, f6 FMUL f10, f19, f10 FMUL f14, f19, f14 FNMSUB f3, f20, f2, f3 FNMSUB f7, f20, f6, f7 FNMSUB f11, f20, f10, f11 FNMSUB f15, f20, f14, f15 FMUL f3, f21, f3 FMUL f7, f21, f7 FMUL f11, f21, f11 FMUL f15, f21, f15 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 FNMSUB f14, f19, f2, f14 FNMSUB f15, f19, f3, f15 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FMUL f6, f16, f6 FMUL f7, f16, f7 LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f10, f17, f6, f10 FNMSUB f11, f17, f7, f11 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FNMSUB f14, f18, f6, f14 FNMSUB f15, f18, f7, f15 FMUL f8, f19, f8 FMUL f9, f19, f9 FMUL f10, f19, f10 FMUL f11, f19, f11 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FNMSUB f14, f20, f10, f14 FNMSUB f15, f20, f11, f15 FMUL f12, f21, f12 FMUL f13, f21, f13 FMUL f14, f21, f14 FMUL f15, f21, f15 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FMUL f14, f16, f14 FMUL f15, f16, f15 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f10, f17, f14, f10 FNMSUB f11, f17, f15, f11 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f6, f18, f14, f6 FNMSUB f7, f18, f15, f7 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 FNMSUB f2, f19, f14, f2 FNMSUB f3, f19, f15, f3 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FMUL f10, f16, f10 FMUL f11, f16, f11 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f6, f17, f10, f6 FNMSUB f7, f17, f11, f7 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 
* SIZE subi CO3, CO3, 4 * SIZE subi CO4, CO4, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) STFD f2, 8 * SIZE(BO) STFD f6, 9 * SIZE(BO) STFD f10, 10 * SIZE(BO) STFD f14, 11 * SIZE(BO) STFD f3, 12 * SIZE(BO) STFD f7, 13 * SIZE(BO) STFD f11, 14 * SIZE(BO) STFD f15, 15 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) STFD f8, 8 * SIZE(AO) STFD f9, 9 * SIZE(AO) STFD f10, 10 * SIZE(AO) STFD f11, 11 * SIZE(AO) STFD f12, 12 * SIZE(AO) STFD f13, 13 * SIZE(AO) STFD f14, 14 * SIZE(AO) STFD f15, 15 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f10, 2 * SIZE(CO3) STFD f11, 3 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) STFD f14, 2 * SIZE(CO4) STFD f15, 3 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(11) .align 4 LL(20): andi. I, M, 2 ble LL(30) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(25) .align 5 LL(22): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 16 * SIZE DCBT(BO, PREB) bdnz LL(22) fadd f0, f2, f0 fadd f1, f3, f1 fadd f4, f6, f4 fadd f5, f7, f5 fadd f8, f10, f8 fadd f9, f11, f9 fadd f12, f14, f12 fadd f13, f15, f13 .align 4 LL(25): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(28) .align 4 LL(26): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 2 * SIZE bdnz LL(26) .align 4 LL(28): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 4 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f28, 6 * SIZE(AO) LFD f29, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f12, f28, f12 FSUB f13, f29, f13 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, 
f5 FMUL f9, f17, f9 FMUL f13, f17, f13 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FMUL f8, f19, f8 FMUL f9, f19, f9 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FMUL f12, f21, f12 FMUL f13, f21, f13 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FMUL f4, f19, f4 FMUL f5, f19, f5 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE subi CO3, CO3, 2 * SIZE subi CO4, CO4, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f4, 2 * SIZE(AO) STFD f5, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f12, 6 * SIZE(AO) STFD f13, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(30): andi. I, M, 1 ble LL(39) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(35) .align 5 LL(32): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f1, f17, f24, f1 FMADD f5, f17, f25, f5 FMADD f9, f17, f26, f9 FMADD f13, f17, f27, f13 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f0, f18, f20, f0 FMADD f4, f18, f21, f4 FMADD f8, f18, f22, f8 FMADD f12, f18, f23, f12 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f1, f19, f24, f1 FMADD f5, f19, f25, f5 FMADD f9, f19, f26, f9 FMADD f13, f19, f27, f13 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 16 * SIZE DCBT(BO, PREB) bdnz LL(32) fadd f0, f1, f0 fadd f4, f5, f4 fadd f8, f9, f8 fadd f12, f13, f12 .align 4 LL(35): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(38) .align 4 LL(36): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f16, 1 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 1 * SIZE bdnz LL(36) .align 4 LL(38): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 4 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) LFD f24, 2 * SIZE(AO) LFD f28, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f4, f20, f4 FSUB f8, f24, f8 FSUB f12, f28, f12 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f4, f17, f0, f4 FNMSUB f8, f18, f0, f8 FNMSUB f12, f19, f0, f12 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FNMSUB f8, f17, f4, f8 FNMSUB f12, f18, f4, f12 FMUL f8, f19, f8 FNMSUB f12, f20, f8, f12 FMUL f12, f21, f12 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FNMSUB f8, f17, f12, f8 FNMSUB f4, f18, f12, f4 FNMSUB f0, f19, f12, f0 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f0, f18, f8, f0 FMUL f4, f19, f4 FNMSUB f0, f20, f4, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE subi CO3, CO3, 1 * SIZE subi CO4, CO4, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f4, 1 * SIZE(AO) STFD f8, 2 * SIZE(AO) STFD f12, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) lfs f0, FZERO 
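/* After the tile has been stored to CO1..CO4, f0 is reloaded with zero from the FZERO stack slot (written in the prologue) and the fmr chain below fans it out to clear the accumulators before the next tile. */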
fmr f1, f0 fmr f4, f0 fmr f5, f0 fmr f8, f0 fmr f9, f0 fmr f12, f0 fmr f13, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE addi CO3, CO3, 1 * SIZE addi CO4, CO4, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(39): #ifdef LN slwi r0, K, 2 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 4 #endif #ifdef RT subi KK, KK, 4 #endif addic. J, J, -1 lfs f0, FZERO bgt LL(10) .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) lwz r18, 196(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/trsm_kernel_hummer_LN.S000066400000000000000000002762471313527062700222210ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define ALPHA 0 #define FZERO 8 #define M r3 #define N r4 #define K r5 #ifdef linux #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #endif #define TEMP r11 #define AORIG r12 #define KK r14 #define INCM1 r15 #define INCM4 r16 #define INCM2 r17 #define INC2 r19 #define INC r20 #define INC4 r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define AO2 r26 #define BO2 r27 #define CO1 r28 #define CO2 r29 #define CO3 r30 #define CO4 r31 #ifndef NEEDPARAM #define A1 f16 #define A2 f17 #define A3 f18 #define A4 f19 #define A5 f20 #define A6 f21 #define A7 f22 #define A8 f23 #define A9 f24 #define A10 f25 #define B1 f26 #define B2 f27 #define B3 f28 #define B4 f29 #define B5 f30 #define B6 f31 #define AP B6 PROLOGUE PROFCODE li r0, -16 stfpdux f14, SP, r0 stfpdux f15, SP, r0 stfpdux f16, SP, r0 stfpdux f17, SP, r0 stfpdux f18, SP, r0 stfpdux f19, SP, r0 stfpdux f20, SP, r0 stfpdux f21, SP, r0 stfpdux f22, SP, r0 stfpdux f23, SP, r0 stfpdux f24, SP, r0 stfpdux f25, SP, r0 stfpdux f26, SP, r0 stfpdux f27, SP, r0 stfpdux f28, SP, r0 stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) stwu r28, -4(SP) stwu r27, -4(SP) stwu r26, -4(SP) stwu r25, -4(SP) stwu r24, -4(SP) stwu r23, -4(SP) stwu r22, -4(SP) stwu r21, -4(SP) stwu r20, -4(SP) stwu r19, -4(SP) stwu r18, -4(SP) stwu r17, -4(SP) stwu r16, -4(SP) stwu r15, -4(SP) stwu r14, -4(SP) # dummy li r0, 0 stwu r0, -4(SP) stwu r0, -4(SP) stfdu f1, -8(SP) slwi LDC, LDC, BASE_SHIFT cmpwi cr0, M, 0 ble .L999 cmpwi cr0, N, 0 ble .L999 cmpwi cr0, K, 0 ble .L999 li INC, 1 * SIZE li INC2, 2 * SIZE li INC4, 4 * SIZE li INCM1, -1 * SIZE li INCM2, -2 * SIZE li INCM4, -4 * SIZE addi C, C, - 1 * SIZE #ifdef LN mullw r0, M, K slwi r0, r0, BASE_SHIFT add A, A, r0 slwi r0, M, BASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, BASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif srawi. J, N, 2 ble .L50 .align 4 .L10: #ifdef RT slwi r0, K, 2 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 2 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) addi AORIG, A, -4 * SIZE #else addi AO, A, -4 * SIZE #endif #ifndef RT add C, CO4, LDC #endif li r0, FZERO lfpsx f0, SP, r0 andi. I, M, 1 beq .L20 #if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 srawi. r0, KK, 3 mtspr CTR, r0 ble .L44 #else #ifdef LN slwi r0, K, 0 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 0 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, BO, - 4 * SIZE fpmr f2, f0 addi BO2, BO, 2 * SIZE fpmr f3, f0 srawi. 
r0, TEMP, 3 mtspr CTR, r0 ble .L44 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 LFPDUX A2, AO2, INC4 LFPDUX B3, BO, INC4 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A5, BO, INC4 LFPDUX A6, BO2, INC4 LFPDUX A4, AO2, INC4 LFPDUX A7, BO, INC4 LFPDUX A8, BO2, INC4 bdz- .L43 .align 4 .L42: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A1, B2, f1 LFPDUX B2, BO2, INC4 fxcsmadd f2, A1, B3, f2 LFPDUX B3, BO, INC4 fxcsmadd f3, A1, B4, f3 LFPDUX B4, BO2, INC4 LFPDUX A1, AO, INC4 fxcpmadd f0, A2, A5, f0 LFPDUX A5, BO, INC4 fxcpmadd f1, A2, A6, f1 LFPDUX A6, BO2, INC4 fxcsmadd f2, A2, A7, f2 LFPDUX A7, BO, INC4 fxcsmadd f3, A2, A8, f3 LFPDUX A8, BO2, INC4 LFPDUX A2, AO2, INC4 fxcpmadd f0, A3, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A3, B2, f1 LFPDUX B2, BO2, INC4 fxcsmadd f2, A3, B3, f2 LFPDUX B3, BO, INC4 fxcsmadd f3, A3, B4, f3 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 fxcpmadd f0, A4, A5, f0 LFPDUX A5, BO, INC4 fxcpmadd f1, A4, A6, f1 LFPDUX A6, BO2, INC4 fxcsmadd f2, A4, A7, f2 LFPDUX A7, BO, INC4 fxcsmadd f3, A4, A8, f3 LFPDUX A8, BO2, INC4 LFPDUX A4, AO2, INC4 bdnz+ .L42 .align 4 .L43: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A1, B2, f1 LFPDUX B2, BO2, INC4 fxcsmadd f2, A1, B3, f2 LFPDUX B3, BO, INC4 fxcsmadd f3, A1, B4, f3 LFPDUX B4, BO2, INC4 fxcpmadd f0, A2, A5, f0 LFPDUX A5, BO, INC4 fxcpmadd f1, A2, A6, f1 LFPDUX A6, BO2, INC4 fxcsmadd f2, A2, A7, f2 LFPDUX A7, BO, INC4 fxcsmadd f3, A2, A8, f3 LFPDUX A8, BO2, INC4 fxcpmadd f0, A3, B1, f0 fxcpmadd f1, A3, B2, f1 fxcsmadd f2, A3, B3, f2 fxcsmadd f3, A3, B4, f3 fxcpmadd f0, A4, A5, f0 fxcpmadd f1, A4, A6, f1 fxcsmadd f2, A4, A7, f2 fxcsmadd f3, A4, A8, f3 .align 4 .L44: #if defined(LT) || defined(RN) andi. r0, KK, 7 mtspr CTR, r0 ble+ .L48 #else andi. 
r0, TEMP, 7 mtspr CTR, r0 ble+ .L48 #endif LFDX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC bdz- .L47 .align 4 .L46: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A1, B2, f1 LFDX A1, AO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC bdnz+ .L46 .align 4 .L47: fxcpmadd f0, A1, B1, f0 fxcpmadd f1, A1, B2, f1 addi AO2, AO, 2 * SIZE .align 4 .L48: fpadd f0, f0, f2 fpadd f1, f1, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 4 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE addi BO, BO, - 4 * SIZE addi BO2, BO, 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDX f16, BO, INC4 LFPDX f17, BO2, INC4 fpsub f0, f16, f0 fpsub f1, f17, f1 #else LFPDX f16, AO, INC4 LFPDX f17, AO2, INC4 fpsub f0, f16, f0 fpsub f1, f17, f1 #endif #if defined(LN) || defined(LT) LFPDX A1, AO, INC4 fxpmul f0, A1, f0 fxpmul f1, A1, f1 #endif #ifdef RN LFD A1, (4 + 0) * SIZE(BO) LFD A2, (4 + 1) * SIZE(BO) LFD A3, (4 + 2) * SIZE(BO) LFD A4, (4 + 3) * SIZE(BO) LFD A5, (4 + 5) * SIZE(BO) LFD A6, (4 + 6) * SIZE(BO) LFD A7, (4 + 7) * SIZE(BO) LFD A8, (4 + 10) * SIZE(BO) LFD A9, (4 + 11) * SIZE(BO) LFD A10, (4 + 15) * SIZE(BO) fsmtp f2, f0 fsmtp f3, f1 fmul f0, A1, f0 fnmsub f2, A2, f0, f2 fnmsub f1, A3, f0, f1 fnmsub f3, A4, f0, f3 fmul f2, A5, f2 fnmsub f1, A6, f2, f1 fnmsub f3, A7, f2, f3 fmul f1, A8, f1 fnmsub f3, A9, f1, f3 fmul f3, A10, f3 fsmfp f0, f2 fsmfp f1, f3 #endif #ifdef RT LFD A1, (4 + 15) * SIZE(BO) LFD A2, (4 + 14) * SIZE(BO) LFD A3, (4 + 13) * SIZE(BO) LFD A4, (4 + 12) * SIZE(BO) LFD A5, (4 + 10) * SIZE(BO) LFD A6, (4 + 9) * SIZE(BO) LFD A7, (4 + 8) * SIZE(BO) LFD A8, (4 + 5) * SIZE(BO) LFD A9, (4 + 4) * SIZE(BO) LFD A10, (4 + 0) * SIZE(BO) fsmtp f2, f0 fsmtp f3, f1 fmul f3, A1, f3 fnmsub f1, A2, f3, f1 fnmsub f2, A3, f3, f2 fnmsub f0, A4, f3, f0 fmul f1, A5, f1 fnmsub f2, A6, f1, f2 fnmsub f0, A7, f1, f0 fmul f2, A8, f2 fnmsub f0, A9, f2, f0 fmul f0, A10, f0 fsmfp f0, f2 fsmfp f1, f3 #endif #if defined(LN) || defined(LT) STFPDX f0, BO, INC4 STFPDX f1, BO2, INC4 #else STFPDX f0, AO, INC4 STFPDX f1, AO2, INC4 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE subi CO3, CO3, 1 * SIZE subi CO4, CO4, 1 * SIZE #endif STFDX f0, CO1, INC STFSDX f0, CO2, INC STFDX f1, CO3, INC STFSDX f1, CO4, INC #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L20: andi. I, M, 2 beq .L30 #if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 srawi. r0, KK, 2 mtspr CTR, r0 ble .L34 #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 1 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 ble .L34 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 LFPDUX A2, AO2, INC4 LFPDUX B3, BO, INC4 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A5, BO, INC4 LFPDUX A6, BO2, INC4 LFPDUX A4, AO2, INC4 LFPDUX A7, BO, INC4 LFPDUX A8, BO2, INC4 bdz- .L33 .align 4 .L32: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 LFPDUX B1, BO, INC4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 LFPDUX B2, BO2, INC4 LFPDUX A1, AO, INC4 fxcpmadd f0, B3, A2, f0 fxcsmadd f4, B3, A2, f4 LFPDUX B3, BO, INC4 fxcpmadd f8, B4, A2, f8 fxcsmadd f12, B4, A2, f12 LFPDUX B4, BO2, INC4 LFPDUX A2, AO2, INC4 fxcpmadd f0, A5, A3, f0 fxcsmadd f4, A5, A3, f4 LFPDUX A5, BO, INC4 fxcpmadd f8, A6, A3, f8 fxcsmadd f12, A6, A3, f12 LFPDUX A6, BO2, INC4 LFPDUX A3, AO, INC4 fxcpmadd f0, A7, A4, f0 fxcsmadd f4, A7, A4, f4 LFPDUX A7, BO, INC4 fxcpmadd f8, A8, A4, f8 fxcsmadd f12, A8, A4, f12 LFPDUX A8, BO2, INC4 LFPDUX A4, AO2, INC4 bdnz+ .L32 .align 4 .L33: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 fxcpmadd f0, B3, A2, f0 fxcsmadd f4, B3, A2, f4 fxcpmadd f8, B4, A2, f8 fxcsmadd f12, B4, A2, f12 fxcpmadd f0, A5, A3, f0 fxcsmadd f4, A5, A3, f4 fxcpmadd f8, A6, A3, f8 fxcsmadd f12, A6, A3, f12 fxcpmadd f0, A7, A4, f0 fxcsmadd f4, A7, A4, f4 fxcpmadd f8, A8, A4, f8 fxcsmadd f12, A8, A4, f12 .align 4 .L34: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L38 #else andi. r0, TEMP, 3 mtspr CTR, r0 ble+ .L38 #endif LFPDX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdz- .L37 .align 4 .L36: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 LFPDUX B1, BO, INC4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 LFPDX A1, AO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdnz+ .L36 .align 4 .L37: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 .align 4 .L38: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 4 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE addi BO, BO, - 4 * SIZE addi BO2, BO, 2 * SIZE #endif #if defined(LN) || defined(LT) fpmr f24, f0 fpmr f28, f8 fsmfp f0, f4 fsmfp f8, f12 fsmtp f4, f24 fsmtp f12, f28 LFPDUX f16, BO, INC4 LFPDUX f17, BO2, INC4 LFPDUX f18, BO, INC4 LFPDUX f19, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fpsub f0, f16, f0 fpsub f8, f17, f8 fpsub f4, f18, f4 fpsub f12, f19, f12 #else LFPDUX f16, AO, INC4 LFPDUX f17, AO2, INC4 LFPDUX f18, AO, INC4 LFPDUX f19, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE fpsub f0, f16, f0 fpsub f4, f17, f4 fpsub f8, f18, f8 fpsub f12, f19, f12 #endif #ifdef LN addi AO, AO, 8 * SIZE addi AO2, AO2, 8 * SIZE LFPDUX A1, AO2, INCM4 LFPDUX A2, AO, INCM4 addi AO, AO, -4 * SIZE addi AO2, AO2, -4 * SIZE fxsmul f4, A1, f4 fxsmul f12, A1, f12 fxcpnmsub f0, A1, f4, f0 fxcpnmsub f8, A1, f12, f8 fxpmul f0, A2, f0 fxpmul f8, A2, f8 #endif #ifdef LT LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 subi AO, AO, 4 * SIZE subi AO2, AO2, 4 * SIZE fxpmul f0, A1, f0 fxpmul f8, A1, f8 fxcsnmsub f4, A1, f0, f4 fxcsnmsub f12, A1, f8, f12 fxsmul f4, A2, f4 fxsmul f12, A2, f12 #endif #ifdef RN LFPDUX A1, BO, INC4 LFPDUX A2, BO2, INC4 LFPDUX A3, BO, INC4 LFPDUX A4, BO2, INC4 add BO, BO, INC4 LFPDUX A5, BO2, INC4 add BO, BO, INC4 LFPDUX A6, BO2, INC4 subi BO, BO, 16 * SIZE subi BO2, BO2, 16 * SIZE fxpmul f0, A1, f0 fxcsnmsub f4, A1, f0, f4 fxcpnmsub f8, A2, f0, f8 fxcsnmsub f12, A2, f0, f12 fxsmul f4, A3, f4 
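/* RN path, 2-wide block: forward substitution with the packed 4x4 upper
   triangular factor held in BO/BO2.  The diagonal entries are applied with
   a multiply (fxpmul/fxsmul), which suggests the packing step stores them
   pre-inverted; the fxc*nmsub updates eliminate each freshly solved column
   from the accumulators that are still pending. */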
fxcpnmsub f8, A4, f4, f8 fxcsnmsub f12, A4, f4, f12 fxpmul f8, A5, f8 fxcsnmsub f12, A5, f8, f12 fxsmul f12, A6, f12 #endif #ifdef RT addi BO, BO, 20 * SIZE addi BO2, BO2, 20 * SIZE LFPDUX A1, BO2, INCM4 LFPDUX A2, BO, INCM4 LFPDUX A3, BO2, INCM4 LFPDUX A4, BO, INCM4 add BO2, BO2, INCM4 LFPDUX A5, BO, INCM4 add BO2, BO2, INCM4 LFPDUX A6, BO, INCM4 subi BO, BO, 4 * SIZE subi BO2, BO2, 4 * SIZE fxsmul f12, A1, f12 fxcpnmsub f8, A1, f12, f8 fxcsnmsub f4, A2, f12, f4 fxcpnmsub f0, A2, f12, f0 fxpmul f8, A3, f8 fxcsnmsub f4, A4, f8, f4 fxcpnmsub f0, A4, f8, f0 fxsmul f4, A5, f4 fxcpnmsub f0, A5, f4, f0 fxpmul f0, A6, f0 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE subi CO3, CO3, 2 * SIZE subi CO4, CO4, 2 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC4 STFPDUX f8, BO2, INC4 STFPDUX f4, BO, INC4 STFPDUX f12, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE STFDUX f0, CO1, INC STFDUX f4, CO1, INC STFSDUX f0, CO2, INC STFSDUX f4, CO2, INC STFDUX f8, CO3, INC STFDUX f12, CO3, INC STFSDUX f8, CO4, INC STFSDUX f12, CO4, INC #else STFPDUX f0, AO, INC4 STFPDUX f4, AO2, INC4 STFPDUX f8, AO, INC4 STFPDUX f12, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f4, CO2, INC STFSDUX f4, CO2, INC STFDUX f8, CO3, INC STFSDUX f8, CO3, INC STFDUX f12, CO4, INC STFSDUX f12, CO4, INC #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE subi CO3, CO3, 2 * SIZE subi CO4, CO4, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L30: andi. I, M, 4 beq .L40 #if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 srawi. r0, KK, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, r0 fpmr f13, f0 ble .L24 #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 2 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 srawi. 
r0, TEMP, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, r0 fpmr f13, f0 ble .L24 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX B3, BO, INC4 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 LFPDUX A5, AO, INC4 LFPDUX B5, BO, INC4 LFPDUX A6, AO2, INC4 LFPDUX B6, BO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A9, BO, INC4 LFPDUX A10, BO2, INC4 bdz- .L23 .align 4 .L22: fxcpmadd f0, B1, A1, f0 nop fxcsmadd f4, B1, A1, f4 LFPDUX A8, AO2, INC4 fxcpmadd f8, B2, A1, f8 nop fxcsmadd f12, B2, A1, f12 LFPDUX A1, AO, INC4 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX B1, BO, INC4 fxcpmadd f9, B2, A2, f9 nop fxcsmadd f13, B2, A2, f13 LFPDUX B2, BO2, INC4 fxcpmadd f0, B3, A3, f0 nop fxcsmadd f4, B3, A3, f4 LFPDUX A2, AO2, INC4 fxcpmadd f8, B4, A3, f8 nop fxcsmadd f12, B4, A3, f12 LFPDUX A3, AO, INC4 fxcpmadd f1, B3, A4, f1 nop fxcsmadd f5, B3, A4, f5 LFPDUX B3, BO, INC4 fxcpmadd f9, B4, A4, f9 nop fxcsmadd f13, B4, A4, f13 LFPDUX B4, BO2, INC4 fxcpmadd f0, B5, A5, f0 nop fxcsmadd f4, B5, A5, f4 LFPDUX A4, AO2, INC4 fxcpmadd f8, B6, A5, f8 nop fxcsmadd f12, B6, A5, f12 LFPDUX A5, AO, INC4 fxcpmadd f1, B5, A6, f1 nop fxcsmadd f5, B5, A6, f5 LFPDUX B5, BO, INC4 fxcpmadd f9, B6, A6, f9 nop fxcsmadd f13, B6, A6, f13 LFPDUX B6, BO2, INC4 fxcpmadd f0, A9, A7, f0 nop fxcsmadd f4, A9, A7, f4 LFPDUX A6, AO2, INC4 fxcpmadd f8, A10, A7, f8 nop fxcsmadd f12, A10, A7, f12 LFPDUX A7, AO, INC4 fxcpmadd f1, A9, A8, f1 nop fxcsmadd f5, A9, A8, f5 LFPDUX A9, BO, INC4 fxcpmadd f9, A10, A8, f9 nop fxcsmadd f13, A10, A8, f13 LFPDUX A10, BO2, INC4 bdnz+ .L22 .align 4 .L23: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 LFPDUX A8, AO2, INC4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 fxcpmadd f9, B2, A2, f9 fxcsmadd f13, B2, A2, f13 fxcpmadd f0, B3, A3, f0 fxcsmadd f4, B3, A3, f4 fxcpmadd f8, B4, A3, f8 fxcsmadd f12, B4, A3, f12 fxcpmadd f1, B3, A4, f1 fxcsmadd f5, B3, A4, f5 fxcpmadd f9, B4, A4, f9 fxcsmadd f13, B4, A4, f13 fxcpmadd f0, B5, A5, f0 fxcsmadd f4, B5, A5, f4 fxcpmadd f8, B6, A5, f8 fxcsmadd f12, B6, A5, f12 fxcpmadd f1, B5, A6, f1 fxcsmadd f5, B5, A6, f5 fxcpmadd f9, B6, A6, f9 fxcsmadd f13, B6, A6, f13 fxcpmadd f0, A9, A7, f0 fxcsmadd f4, A9, A7, f4 fxcpmadd f8, A10, A7, f8 fxcsmadd f12, A10, A7, f12 fxcpmadd f1, A9, A8, f1 fxcsmadd f5, A9, A8, f5 fxcpmadd f9, A10, A8, f9 fxcsmadd f13, A10, A8, f13 .align 4 .L24: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L28 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L28 #endif LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 bdz- .L27 .align 4 .L26: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 LFPDUX A1, AO, INC4 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 LFPDUX B1, BO, INC4 fxcpmadd f9, B2, A2, f9 fxcsmadd f13, B2, A2, f13 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 bdnz+ .L26 .align 4 .L27: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 fxcpmadd f9, B2, A2, f9 fxcsmadd f13, B2, A2, f13 .align 4 .L28: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 4 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE addi BO, BO, - 4 * SIZE addi BO2, BO, 2 * SIZE #endif #if defined(LN) || defined(LT) fpmr f24, f0 fpmr f25, f1 fpmr f28, f8 fpmr f29, f9 fsmfp f0, f4 fsmfp f1, f5 fsmfp f8, f12 fsmfp f9, f13 fsmtp f4, f24 fsmtp f5, f25 fsmtp f12, f28 fsmtp f13, f29 LFPDUX f16, BO, INC4 LFPDUX f17, BO2, INC4 LFPDUX f18, BO, INC4 LFPDUX f19, BO2, INC4 LFPDUX f20, BO, INC4 LFPDUX f21, BO2, INC4 LFPDUX f22, BO, INC4 LFPDUX f23, BO2, INC4 subi BO, BO, 16 * SIZE subi BO2, BO2, 16 * SIZE fpsub f0, f16, f0 fpsub f8, f17, f8 fpsub f4, f18, f4 fpsub f12, f19, f12 fpsub f1, f20, f1 fpsub f9, f21, f9 fpsub f5, f22, f5 fpsub f13, f23, f13 #else LFPDUX f16, AO, INC4 LFPDUX f17, AO2, INC4 LFPDUX f18, AO, INC4 LFPDUX f19, AO2, INC4 LFPDUX f20, AO, INC4 LFPDUX f21, AO2, INC4 LFPDUX f22, AO, INC4 LFPDUX f23, AO2, INC4 subi AO, AO, 16 * SIZE subi AO2, AO2, 16 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f4, f18, f4 fpsub f5, f19, f5 fpsub f8, f20, f8 fpsub f9, f21, f9 fpsub f12, f22, f12 fpsub f13, f23, f13 #endif #ifdef LN addi AO, AO, 20 * SIZE addi AO2, AO2, 20 * SIZE LFPDUX A1, AO2, INCM4 LFPDUX A2, AO, INCM4 LFPDUX A3, AO2, INCM4 LFPDUX A4, AO, INCM4 add AO2, AO2, INCM4 LFPDUX A5, AO, INCM4 add AO2, AO2, INCM4 LFPDUX A6, AO, INCM4 addi AO, AO, -4 * SIZE addi AO2, AO2, -4 * SIZE fxsmul f5, A1, f5 fxsmul f13, A1, f13 fxcpnmsub f1, A1, f5, f1 fxcpnmsub f9, A1, f13, f9 fxcsnmsub f4, A2, f5, f4 fxcsnmsub f12, A2, f13, f12 fxcpnmsub f0, A2, f5, f0 fxcpnmsub f8, A2, f13, f8 fxpmul f1, A3, f1 fxpmul f9, A3, f9 fxcsnmsub f4, A4, f1, f4 fxcsnmsub f12, A4, f9, f12 fxcpnmsub f0, A4, f1, f0 fxcpnmsub f8, A4, f9, f8 fxsmul f4, A5, f4 fxsmul f12, A5, f12 fxcpnmsub f0, A5, f4, f0 fxcpnmsub f8, A5, f12, f8 fxpmul f0, A6, f0 fxpmul f8, A6, f8 #endif #ifdef LT LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A4, AO2, INC4 add AO, AO, INC4 LFPDUX A5, AO2, INC4 add AO, AO, INC4 LFPDUX A6, AO2, INC4 subi AO, AO, 16 * SIZE subi AO2, AO2, 16 * SIZE fxpmul f0, A1, f0 fxpmul f8, A1, f8 fxcsnmsub f4, A1, f0, f4 fxcsnmsub f12, A1, f8, f12 fxcpnmsub f1, A2, f0, f1 fxcpnmsub f9, A2, f8, f9 fxcsnmsub f5, A2, f0, f5 fxcsnmsub f13, A2, f8, f13 fxsmul f4, A3, f4 fxsmul f12, A3, f12 fxcpnmsub f1, A4, f4, f1 fxcpnmsub f9, A4, f12, f9 fxcsnmsub f5, A4, f4, f5 fxcsnmsub f13, A4, f12, f13 fxpmul f1, A5, f1 fxpmul f9, A5, f9 fxcsnmsub f5, A5, f1, f5 fxcsnmsub f13, A5, f9, f13 fxsmul f5, A6, f5 fxsmul f13, A6, f13 #endif #ifdef RN LFPDUX A1, BO, INC4 LFPDUX A2, BO2, INC4 LFPDUX A3, BO, INC4 LFPDUX A4, BO2, INC4 add BO, BO, INC4 LFPDUX A5, BO2, INC4 add BO, BO, INC4 LFPDUX A6, BO2, INC4 subi BO, BO, 16 * SIZE subi BO2, BO2, 16 * SIZE fxpmul f0, A1, f0 fxpmul f1, A1, f1 fxcsnmsub f4, A1, f0, 
f4 fxcsnmsub f5, A1, f1, f5 fxcpnmsub f8, A2, f0, f8 fxcpnmsub f9, A2, f1, f9 fxcsnmsub f12, A2, f0, f12 fxcsnmsub f13, A2, f1, f13 fxsmul f4, A3, f4 fxsmul f5, A3, f5 fxcpnmsub f8, A4, f4, f8 fxcpnmsub f9, A4, f5, f9 fxcsnmsub f12, A4, f4, f12 fxcsnmsub f13, A4, f5, f13 fxpmul f8, A5, f8 fxpmul f9, A5, f9 fxcsnmsub f12, A5, f8, f12 fxcsnmsub f13, A5, f9, f13 fxsmul f12, A6, f12 fxsmul f13, A6, f13 #endif #ifdef RT addi BO, BO, 20 * SIZE addi BO2, BO2, 20 * SIZE LFPDUX A1, BO2, INCM4 LFPDUX A2, BO, INCM4 LFPDUX A3, BO2, INCM4 LFPDUX A4, BO, INCM4 add BO2, BO2, INCM4 LFPDUX A5, BO, INCM4 add BO2, BO2, INCM4 LFPDUX A6, BO, INCM4 subi BO, BO, 4 * SIZE subi BO2, BO2, 4 * SIZE fxsmul f12, A1, f12 fxsmul f13, A1, f13 fxcpnmsub f8, A1, f12, f8 fxcpnmsub f9, A1, f13, f9 fxcsnmsub f4, A2, f12, f4 fxcsnmsub f5, A2, f13, f5 fxcpnmsub f0, A2, f12, f0 fxcpnmsub f1, A2, f13, f1 fxpmul f8, A3, f8 fxpmul f9, A3, f9 fxcsnmsub f4, A4, f8, f4 fxcsnmsub f5, A4, f9, f5 fxcpnmsub f0, A4, f8, f0 fxcpnmsub f1, A4, f9, f1 fxsmul f4, A5, f4 fxsmul f5, A5, f5 fxcpnmsub f0, A5, f4, f0 fxcpnmsub f1, A5, f5, f1 fxpmul f0, A6, f0 fxpmul f1, A6, f1 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE subi CO3, CO3, 4 * SIZE subi CO4, CO4, 4 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC4 STFPDUX f8, BO2, INC4 STFPDUX f4, BO, INC4 STFPDUX f12, BO2, INC4 STFPDUX f1, BO, INC4 STFPDUX f9, BO2, INC4 STFPDUX f5, BO, INC4 STFPDUX f13, BO2, INC4 subi BO, BO, 16 * SIZE subi BO2, BO2, 16 * SIZE STFDUX f0, CO1, INC STFDUX f4, CO1, INC STFDUX f1, CO1, INC STFDUX f5, CO1, INC STFSDUX f0, CO2, INC STFSDUX f4, CO2, INC STFSDUX f1, CO2, INC STFSDUX f5, CO2, INC STFDUX f8, CO3, INC STFDUX f12, CO3, INC STFDUX f9, CO3, INC STFDUX f13, CO3, INC STFSDUX f8, CO4, INC STFSDUX f12, CO4, INC STFSDUX f9, CO4, INC STFSDUX f13, CO4, INC #else STFPDUX f0, AO, INC4 STFPDUX f1, AO2, INC4 STFPDUX f4, AO, INC4 STFPDUX f5, AO2, INC4 STFPDUX f8, AO, INC4 STFPDUX f9, AO2, INC4 STFPDUX f12, AO, INC4 STFPDUX f13, AO2, INC4 subi AO, AO, 16 * SIZE subi AO2, AO2, 16 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f4, CO2, INC STFSDUX f4, CO2, INC STFDUX f5, CO2, INC STFSDUX f5, CO2, INC STFDUX f8, CO3, INC STFSDUX f8, CO3, INC STFDUX f9, CO3, INC STFSDUX f9, CO3, INC STFDUX f12, CO4, INC STFSDUX f12, CO4, INC STFDUX f13, CO4, INC STFSDUX f13, CO4, INC #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE subi CO3, CO3, 4 * SIZE subi CO4, CO4, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L40: srawi. I, M, 3 ble .L49 .align 4 .L11: #if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 fpmr f5, f0 fpmr f9, f0 fpmr f13, f0 fpmr f2, f0 fpmr f6, f0 fpmr f10, f0 fpmr f14, f0 fpmr f3, f0 fpmr f7, f0 fpmr f11, f0 fpmr f15, f0 nop srawi. 
r0, KK, 2 fpmr f1, f0 mtspr CTR, r0 ble .L14 #else #ifdef LN slwi r0, K, 3 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 3 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 fpmr f5, f0 fpmr f9, f0 fpmr f13, f0 fpmr f2, f0 fpmr f6, f0 fpmr f10, f0 fpmr f14, f0 fpmr f3, f0 fpmr f7, f0 fpmr f11, f0 fpmr f15, f0 nop srawi. r0, TEMP, 2 fpmr f1, f0 mtspr CTR, r0 ble .L14 #endif LFPDUX A1, AO, INC4 fpmr f5, f0 LFPDUX A3, AO, INC4 fpmr f9, f0 LFPDUX B1, BO, INC4 fpmr f13, f0 LFPDUX A5, AO, INC4 fpmr f2, f0 LFPDUX A6, AO, INC4 fpmr f6, f0 LFPDUX B3, BO, INC4 fpmr f10, f0 LFPDUX A7, AO, INC4 fpmr f14, f0 LFPDUX A8, AO, INC4 fpmr f3, f0 LFPDUX B5, BO, INC4 fpmr f7, f0 LFPDUX A9, AO, INC4 fpmr f11, f0 LFPDUX A2, AO2, INC4 fpmr f15, f0 LFPDUX B2, BO2, INC4 bdz- .L13 .align 4 .L12: ## 1 ## fxcpmadd f0, B1, A1, f0 nop fxcsmadd f4, B1, A1, f4 nop fxcpmadd f8, B2, A1, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A1, f12 LFPDUX B6, BO, INC4 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 LFPDUX A10, AO, INC4 fxcsmadd f13, B2, A2, f13 nop fxcpmadd f2, B1, A3, f2 nop fxcsmadd f6, B1, A3, f6 nop fxcpmadd f10, B2, A3, f10 nop fxcsmadd f14, B2, A3, f14 nop fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 LFPDUX A1, AO, INC4 fxcsmadd f15, B2, A4, f15 nop ## 2 ## fxcpmadd f0, B3, A5, f0 nop fxcsmadd f4, B3, A5, f4 nop fxcpmadd f8, B4, A5, f8 LFPDUX B2, BO2, INC4 fxcsmadd f12, B4, A5, f12 LFPDUX B1, BO, INC4 fxcpmadd f1, B3, A2, f1 nop fxcsmadd f5, B3, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 LFPDUX A3, AO, INC4 fxcsmadd f13, B4, A2, f13 nop fxcpmadd f2, B3, A6, f2 nop fxcsmadd f6, B3, A6, f6 nop fxcpmadd f10, B4, A6, f10 nop fxcsmadd f14, B4, A6, f14 nop fxcpmadd f3, B3, A4, f3 nop fxcsmadd f7, B3, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B4, A4, f11 LFPDUX A5, AO, INC4 fxcsmadd f15, B4, A4, f15 nop ## 3 ## fxcpmadd f0, B5, A7, f0 nop fxcsmadd f4, B5, A7, f4 nop fxcpmadd f8, B2, A7, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A7, f12 LFPDUX B3, BO, INC4 fxcpmadd f1, B5, A2, f1 nop fxcsmadd f5, B5, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 LFPDUX A6, AO, INC4 fxcsmadd f13, B2, A2, f13 nop fxcpmadd f2, B5, A8, f2 nop fxcsmadd f6, B5, A8, f6 nop fxcpmadd f10, B2, A8, f10 nop fxcsmadd f14, B2, A8, f14 nop fxcpmadd f3, B5, A4, f3 nop fxcsmadd f7, B5, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 LFPDUX A7, AO, INC4 fxcsmadd f15, B2, A4, f15 nop ## 4 ## fxcpmadd f0, B6, A9, f0 nop fxcsmadd f4, B6, A9, f4 nop fxcpmadd f8, B4, A9, f8 LFPDUX B2, BO2, INC4 fxcsmadd f12, B4, A9, f12 LFPDUX B5, BO, INC4 fxcpmadd f1, B6, A2, f1 nop fxcsmadd f5, B6, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 LFPDUX A8, AO, INC4 fxcsmadd f13, B4, A2, f13 nop fxcpmadd f2, B6, A10, f2 nop fxcsmadd f6, B6, A10, f6 nop fxcpmadd f10, B4, A10, f10 nop fxcsmadd f14, B4, A10, f14 nop fxcpmadd f3, B6, A4, f3 LFPDUX A2, AO2, INC4 fxcsmadd f7, B6, A4, f7 LFPDUX A9, AO, INC4 fxcpmadd f11, B4, A4, f11 nop fxcsmadd f15, B4, A4, f15 bdnz+ .L12 .align 4 .L13: ## 1 ## fxcpmadd f0, B1, A1, f0 nop fxcsmadd f4, B1, A1, f4 nop fxcpmadd f8, B2, A1, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A1, f12 LFPDUX B6, BO, INC4 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 LFPDUX A10, AO, INC4 fxcsmadd f13, B2, A2, f13 nop fxcpmadd f2, B1, 
A3, f2 nop fxcsmadd f6, B1, A3, f6 nop fxcpmadd f10, B2, A3, f10 nop fxcsmadd f14, B2, A3, f14 nop fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 nop fxcsmadd f15, B2, A4, f15 nop ## 2 ## fxcpmadd f0, B3, A5, f0 nop fxcsmadd f4, B3, A5, f4 nop fxcpmadd f8, B4, A5, f8 LFPDUX B2, BO2, INC4 fxcsmadd f12, B4, A5, f12 nop fxcpmadd f1, B3, A2, f1 nop fxcsmadd f5, B3, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 nop fxcsmadd f13, B4, A2, f13 nop fxcpmadd f2, B3, A6, f2 nop fxcsmadd f6, B3, A6, f6 nop fxcpmadd f10, B4, A6, f10 nop fxcsmadd f14, B4, A6, f14 nop fxcpmadd f3, B3, A4, f3 nop fxcsmadd f7, B3, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B4, A4, f11 nop fxcsmadd f15, B4, A4, f15 nop ## 3 ## fxcpmadd f0, B5, A7, f0 nop fxcsmadd f4, B5, A7, f4 nop fxcpmadd f8, B2, A7, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A7, f12 nop fxcpmadd f1, B5, A2, f1 nop fxcsmadd f5, B5, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 nop fxcsmadd f13, B2, A2, f13 fxcpmadd f2, B5, A8, f2 nop fxcsmadd f6, B5, A8, f6 nop fxcpmadd f10, B2, A8, f10 nop fxcsmadd f14, B2, A8, f14 nop fxcpmadd f3, B5, A4, f3 nop fxcsmadd f7, B5, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 nop fxcsmadd f15, B2, A4, f15 nop ## 4 ## fxcpmadd f0, B6, A9, f0 nop fxcsmadd f4, B6, A9, f4 nop fxcpmadd f8, B4, A9, f8 nop fxcsmadd f12, B4, A9, f12 nop fxcpmadd f1, B6, A2, f1 nop fxcsmadd f5, B6, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 nop fxcsmadd f13, B4, A2, f13 nop fxcpmadd f2, B6, A10, f2 nop fxcsmadd f6, B6, A10, f6 nop fxcpmadd f10, B4, A10, f10 nop fxcsmadd f14, B4, A10, f14 nop fxcpmadd f3, B6, A4, f3 nop fxcsmadd f7, B6, A4, f7 nop fxcpmadd f11, B4, A4, f11 nop fxcsmadd f15, B4, A4, f15 nop .align 4 .L14: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L18 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L18 #endif .align 4 .L15: LFPDUX A2, AO, INC4 LFPDUX A4, AO2, INC4 LFPDUX A10, BO, INC4 LFPDUX B4, BO2, INC4 bdz- .L17 .align 4 .L16: fxcpmadd f0, A10, A2, f0 fxcsmadd f4, A10, A2, f4 fxcpmadd f8, B4, A2, f8 fxcsmadd f12, B4, A2, f12 LFPDUX A2, AO, INC4 fxcpmadd f1, A10, A4, f1 fxcsmadd f5, A10, A4, f5 fxcpmadd f9, B4, A4, f9 fxcsmadd f13, B4, A4, f13 LFPDUX A4, AO2, INC4 fxcpmadd f2, A10, A2, f2 fxcsmadd f6, A10, A2, f6 fxcpmadd f10, B4, A2, f10 fxcsmadd f14, B4, A2, f14 LFPDUX A2, AO, INC4 fxcpmadd f3, A10, A4, f3 fxcsmadd f7, A10, A4, f7 LFPDUX A10, BO, INC4 fxcpmadd f11, B4, A4, f11 fxcsmadd f15, B4, A4, f15 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 bdnz+ .L16 .align 4 .L17: fxcpmadd f0, A10, A2, f0 fxcsmadd f4, A10, A2, f4 fxcpmadd f8, B4, A2, f8 fxcsmadd f12, B4, A2, f12 LFPDUX A2, AO, INC4 fxcpmadd f1, A10, A4, f1 fxcsmadd f5, A10, A4, f5 fxcpmadd f9, B4, A4, f9 fxcsmadd f13, B4, A4, f13 LFPDUX A4, AO2, INC4 fxcpmadd f2, A10, A2, f2 fxcsmadd f6, A10, A2, f6 fxcpmadd f10, B4, A2, f10 fxcsmadd f14, B4, A2, f14 fxcpmadd f3, A10, A4, f3 fxcsmadd f7, A10, A4, f7 fxcpmadd f11, B4, A4, f11 fxcsmadd f15, B4, A4, f15 .align 4 .L18: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 8 #else subi r0, KK, 4 #endif slwi TEMP, r0, 3 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE addi BO, BO, - 4 * SIZE addi BO2, BO, 2 * SIZE #endif #if defined(LN) || defined(LT) fpmr f24, f0 LFPDUX f16, BO, INC4 fpmr f25, f1 nop fpmr f26, f2 LFPDUX f17, BO2, INC4 fpmr f27, f3 nop fpmr f28, f8 LFPDUX f18, BO, INC4 fpmr f29, f9 nop fpmr f30, f10 LFPDUX f19, BO2, INC4 fpmr f31, f11 nop fsmfp f0, f4 LFPDUX f20, BO, INC4 fsmfp f1, f5 nop fsmfp f2, f6 LFPDUX f21, BO2, INC4 fsmfp f3, f7 nop fsmfp f8, f12 LFPDUX f22, BO, INC4 fsmfp f9, f13 nop fsmfp f10, f14 LFPDUX f23, BO2, INC4 fsmfp f11, f15 nop fsmtp f4, f24 LFPDUX f24, BO, INC4 fsmtp f5, f25 nop fsmtp f6, f26 LFPDUX f25, BO2, INC4 fsmtp f7, f27 nop fsmtp f12, f28 LFPDUX f26, BO, INC4 fsmtp f13, f29 nop fsmtp f14, f30 LFPDUX f27, BO2, INC4 fsmtp f15, f31 nop fpsub f0, f16, f0 LFPDUX f28, BO, INC4 fpsub f8, f17, f8 nop fpsub f4, f18, f4 LFPDUX f29, BO2, INC4 fpsub f12, f19, f12 nop fpsub f1, f20, f1 LFPDUX f30, BO, INC4 fpsub f9, f21, f9 subi BO, BO, 32 * SIZE fpsub f5, f22, f5 LFPDUX f31, BO2, INC4 fpsub f13, f23, f13 subi BO2, BO2, 32 * SIZE fpsub f2, f24, f2 fpsub f10, f25, f10 fpsub f6, f26, f6 fpsub f14, f27, f14 fpsub f3, f28, f3 fpsub f11, f29, f11 fpsub f7, f30, f7 fpsub f15, f31, f15 #else LFPDUX f16, AO, INC4 LFPDUX f17, AO2, INC4 LFPDUX f18, AO, INC4 LFPDUX f19, AO2, INC4 LFPDUX f20, AO, INC4 LFPDUX f21, AO2, INC4 LFPDUX f22, AO, INC4 LFPDUX f23, AO2, INC4 fpsub f0, f16, f0 LFPDUX f24, AO, INC4 fpsub f1, f17, f1 LFPDUX f25, AO2, INC4 fpsub f2, f18, f2 LFPDUX f26, AO, INC4 fpsub f3, f19, f3 LFPDUX f27, AO2, INC4 fpsub f4, f20, f4 LFPDUX f28, AO, INC4 fpsub f5, f21, f5 LFPDUX f29, AO2, INC4 fpsub f6, f22, f6 LFPDUX f30, AO, INC4 fpsub f7, f23, f7 LFPDUX f31, AO2, INC4 fpsub f8, f24, f8 subi AO, AO, 32 * SIZE fpsub f9, f25, f9 subi AO2, AO2, 32 * SIZE fpsub f10, f26, f10 fpsub f11, f27, f11 fpsub f12, f28, f12 fpsub f13, f29, f13 fpsub f14, f30, f14 fpsub f15, f31, f15 #endif #ifdef LN addi AO, AO, 68 * SIZE addi AO2, AO2, 68 * SIZE LFPDUX A1, AO2, INCM4 LFPDUX A2, AO, INCM4 LFPDUX A3, AO2, INCM4 LFPDUX A4, AO, INCM4 LFPDUX A5, AO2, INCM4 LFPDUX A6, AO, INCM4 LFPDUX A7, AO2, INCM4 LFPDUX A8, AO, INCM4 fxsmul f7, A1, f7 fxsmul f15, A1, f15 fxcpnmsub f3, A1, f7, f3 fxcpnmsub 
f11, A1, f15, f11 fxcsnmsub f6, A2, f7, f6 fxcsnmsub f14, A2, f15, f14 fxcpnmsub f2, A2, f7, f2 fxcpnmsub f10, A2, f15, f10 fxcsnmsub f5, A3, f7, f5 fxcsnmsub f13, A3, f15, f13 fxcpnmsub f1, A3, f7, f1 fxcpnmsub f9, A3, f15, f9 fxcsnmsub f4, A4, f7, f4 fxcsnmsub f12, A4, f15, f12 fxcpnmsub f0, A4, f7, f0 fxcpnmsub f8, A4, f15, f8 fxpmul f3, A5, f3 fxpmul f11, A5, f11 fxcsnmsub f6, A6, f3, f6 fxcsnmsub f14, A6, f11, f14 fxcpnmsub f2, A6, f3, f2 fxcpnmsub f10, A6, f11, f10 fxcsnmsub f5, A7, f3, f5 fxcsnmsub f13, A7, f11, f13 fxcpnmsub f1, A7, f3, f1 fxcpnmsub f9, A7, f11, f9 fxcsnmsub f4, A8, f3, f4 fxcsnmsub f12, A8, f11, f12 fxcpnmsub f0, A8, f3, f0 fxcpnmsub f8, A8, f11, f8 add AO2, AO2, INCM4 LFPDUX A1, AO, INCM4 LFPDUX A2, AO2, INCM4 LFPDUX A3, AO, INCM4 add AO2, AO2, INCM4 LFPDUX A4, AO, INCM4 LFPDUX A5, AO2, INCM4 LFPDUX A6, AO, INCM4 add AO2, AO2, INCM4 add AO, AO, INCM4 LFPDUX A7, AO2, INCM4 LFPDUX A8, AO, INCM4 fxsmul f6, A1, f6 fxsmul f14, A1, f14 fxcpnmsub f2, A1, f6, f2 fxcpnmsub f10, A1, f14, f10 fxcsnmsub f5, A2, f6, f5 fxcsnmsub f13, A2, f14, f13 fxcpnmsub f1, A2, f6, f1 fxcpnmsub f9, A2, f14, f9 fxcsnmsub f4, A3, f6, f4 fxcsnmsub f12, A3, f14, f12 fxcpnmsub f0, A3, f6, f0 fxcpnmsub f8, A3, f14, f8 fxpmul f2, A4, f2 fxpmul f10, A4, f10 fxcsnmsub f5, A5, f2, f5 fxcsnmsub f13, A5, f10, f13 fxcpnmsub f1, A5, f2, f1 fxcpnmsub f9, A5, f10, f9 fxcsnmsub f4, A6, f2, f4 fxcsnmsub f12, A6, f10, f12 fxcpnmsub f0, A6, f2, f0 fxcpnmsub f8, A6, f10, f8 fxsmul f5, A7, f5 fxsmul f13, A7, f13 fxcpnmsub f1, A7, f5, f1 fxcpnmsub f9, A7, f13, f9 fxcsnmsub f4, A8, f5, f4 fxcsnmsub f12, A8, f13, f12 fxcpnmsub f0, A8, f5, f0 fxcpnmsub f8, A8, f13, f8 add AO2, AO2, INCM4 add AO, AO, INCM4 LFPDUX A1, AO2, INCM4 LFPDUX A2, AO, INCM4 subi AO2, AO2, 8 * SIZE add AO, AO, INCM4 LFPDUX A3, AO, INCM4 subi AO2, AO2, 8 * SIZE add AO, AO, INCM4 LFPDUX A4, AO, INCM4 addi AO, AO, -4 * SIZE addi AO2, AO2, -4 * SIZE fxpmul f1, A1, f1 fxpmul f9, A1, f9 fxcsnmsub f4, A2, f1, f4 fxcsnmsub f12, A2, f9, f12 fxcpnmsub f0, A2, f1, f0 fxcpnmsub f8, A2, f9, f8 fxsmul f4, A3, f4 fxsmul f12, A3, f12 fxcpnmsub f0, A3, f4, f0 fxcpnmsub f8, A3, f12, f8 fxpmul f0, A4, f0 fxpmul f8, A4, f8 #endif #ifdef LT LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A4, AO2, INC4 LFPDUX A5, AO, INC4 LFPDUX A6, AO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A8, AO2, INC4 fxpmul f0, A1, f0 fxpmul f8, A1, f8 fxcsnmsub f4, A1, f0, f4 fxcsnmsub f12, A1, f8, f12 fxcpnmsub f1, A2, f0, f1 fxcpnmsub f9, A2, f8, f9 fxcsnmsub f5, A2, f0, f5 fxcsnmsub f13, A2, f8, f13 fxcpnmsub f2, A3, f0, f2 fxcpnmsub f10, A3, f8, f10 fxcsnmsub f6, A3, f0, f6 fxcsnmsub f14, A3, f8, f14 fxcpnmsub f3, A4, f0, f3 fxcpnmsub f11, A4, f8, f11 fxcsnmsub f7, A4, f0, f7 fxcsnmsub f15, A4, f8, f15 fxsmul f4, A5, f4 fxsmul f12, A5, f12 fxcpnmsub f1, A6, f4, f1 fxcpnmsub f9, A6, f12, f9 fxcsnmsub f5, A6, f4, f5 fxcsnmsub f13, A6, f12, f13 fxcpnmsub f2, A7, f4, f2 fxcpnmsub f10, A7, f12, f10 fxcsnmsub f6, A7, f4, f6 fxcsnmsub f14, A7, f12, f14 fxcpnmsub f3, A8, f4, f3 fxcpnmsub f11, A8, f12, f11 fxcsnmsub f7, A8, f4, f7 fxcsnmsub f15, A8, f12, f15 add AO, AO, INC4 LFPDUX A1, AO2, INC4 LFPDUX A2, AO, INC4 LFPDUX A3, AO2, INC4 add AO, AO, INC4 LFPDUX A4, AO2, INC4 LFPDUX A5, AO, INC4 LFPDUX A6, AO2, INC4 add AO, AO, INC4 add AO2, AO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A8, AO2, INC4 fxpmul f1, A1, f1 fxpmul f9, A1, f9 fxcsnmsub f5, A1, f1, f5 fxcsnmsub f13, A1, f9, f13 fxcpnmsub f2, A2, f1, f2 fxcpnmsub f10, A2, f9, f10 fxcsnmsub f6, A2, f1, f6 fxcsnmsub f14, A2, f9, f14 
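/* LT path, 8-wide block: the solve walks the packed 8x8 triangular factor in
   groups; each diagonal multiply (fxpmul/fxsmul) is followed by fxc*nmsub
   updates that fold the freshly solved values into the accumulators that
   have not been solved yet. */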
fxcpnmsub f3, A3, f1, f3 fxcpnmsub f11, A3, f9, f11 fxcsnmsub f7, A3, f1, f7 fxcsnmsub f15, A3, f9, f15 fxsmul f5, A4, f5 fxsmul f13, A4, f13 fxcpnmsub f2, A5, f5, f2 fxcpnmsub f10, A5, f13, f10 fxcsnmsub f6, A5, f5, f6 fxcsnmsub f14, A5, f13, f14 fxcpnmsub f3, A6, f5, f3 fxcpnmsub f11, A6, f13, f11 fxcsnmsub f7, A6, f5, f7 fxcsnmsub f15, A6, f13, f15 fxpmul f2, A7, f2 fxpmul f10, A7, f10 fxcsnmsub f6, A7, f2, f6 fxcsnmsub f14, A7, f10, f14 fxcpnmsub f3, A8, f2, f3 fxcpnmsub f11, A8, f10, f11 fxcsnmsub f7, A8, f2, f7 fxcsnmsub f15, A8, f10, f15 add AO, AO, INC4 add AO2, AO2, INC4 LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 addi AO, AO, 8 * SIZE addi AO2, AO2, 4 * SIZE LFPDUX A3, AO2, INC4 addi AO, AO, 8 * SIZE addi AO2, AO2, 4 * SIZE LFPDUX A4, AO2, INC4 subi AO, AO, 64 * SIZE subi AO2, AO2, 64 * SIZE fxsmul f6, A1, f6 fxsmul f14, A1, f14 fxcpnmsub f3, A2, f6, f3 fxcpnmsub f11, A2, f14, f11 fxcsnmsub f7, A2, f6, f7 fxcsnmsub f15, A2, f14, f15 fxpmul f3, A3, f3 fxpmul f11, A3, f11 fxcsnmsub f7, A3, f3, f7 fxcsnmsub f15, A3, f11, f15 fxsmul f7, A4, f7 fxsmul f15, A4, f15 #endif #ifdef RN LFPDUX A1, BO, INC4 LFPDUX A2, BO2, INC4 LFPDUX A3, BO, INC4 LFPDUX A4, BO2, INC4 add BO, BO, INC4 LFPDUX A5, BO2, INC4 add BO, BO, INC4 LFPDUX A6, BO2, INC4 subi BO, BO, 16 * SIZE subi BO2, BO2, 16 * SIZE fxpmul f0, A1, f0 fxpmul f1, A1, f1 fxpmul f2, A1, f2 fxpmul f3, A1, f3 fxcsnmsub f4, A1, f0, f4 fxcsnmsub f5, A1, f1, f5 fxcsnmsub f6, A1, f2, f6 fxcsnmsub f7, A1, f3, f7 fxcpnmsub f8, A2, f0, f8 fxcpnmsub f9, A2, f1, f9 fxcpnmsub f10, A2, f2, f10 fxcpnmsub f11, A2, f3, f11 fxcsnmsub f12, A2, f0, f12 fxcsnmsub f13, A2, f1, f13 fxcsnmsub f14, A2, f2, f14 fxcsnmsub f15, A2, f3, f15 fxsmul f4, A3, f4 fxsmul f5, A3, f5 fxsmul f6, A3, f6 fxsmul f7, A3, f7 fxcpnmsub f8, A4, f4, f8 fxcpnmsub f9, A4, f5, f9 fxcpnmsub f10, A4, f6, f10 fxcpnmsub f11, A4, f7, f11 fxcsnmsub f12, A4, f4, f12 fxcsnmsub f13, A4, f5, f13 fxcsnmsub f14, A4, f6, f14 fxcsnmsub f15, A4, f7, f15 fxpmul f8, A5, f8 fxpmul f9, A5, f9 fxpmul f10, A5, f10 fxpmul f11, A5, f11 fxcsnmsub f12, A5, f8, f12 fxcsnmsub f13, A5, f9, f13 fxcsnmsub f14, A5, f10, f14 fxcsnmsub f15, A5, f11, f15 fxsmul f12, A6, f12 fxsmul f13, A6, f13 fxsmul f14, A6, f14 fxsmul f15, A6, f15 #endif #ifdef RT addi BO, BO, 20 * SIZE addi BO2, BO2, 20 * SIZE LFPDUX A1, BO2, INCM4 LFPDUX A2, BO, INCM4 LFPDUX A3, BO2, INCM4 LFPDUX A4, BO, INCM4 add BO2, BO2, INCM4 LFPDUX A5, BO, INCM4 add BO2, BO2, INCM4 LFPDUX A6, BO, INCM4 subi BO, BO, 4 * SIZE subi BO2, BO2, 4 * SIZE fxsmul f12, A1, f12 fxsmul f13, A1, f13 fxsmul f14, A1, f14 fxsmul f15, A1, f15 fxcpnmsub f8, A1, f12, f8 fxcpnmsub f9, A1, f13, f9 fxcpnmsub f10, A1, f14, f10 fxcpnmsub f11, A1, f15, f11 fxcsnmsub f4, A2, f12, f4 fxcsnmsub f5, A2, f13, f5 fxcsnmsub f6, A2, f14, f6 fxcsnmsub f7, A2, f15, f7 fxcpnmsub f0, A2, f12, f0 fxcpnmsub f1, A2, f13, f1 fxcpnmsub f2, A2, f14, f2 fxcpnmsub f3, A2, f15, f3 fxpmul f8, A3, f8 fxpmul f9, A3, f9 fxpmul f10, A3, f10 fxpmul f11, A3, f11 fxcsnmsub f4, A4, f8, f4 fxcsnmsub f5, A4, f9, f5 fxcsnmsub f6, A4, f10, f6 fxcsnmsub f7, A4, f11, f7 fxcpnmsub f0, A4, f8, f0 fxcpnmsub f1, A4, f9, f1 fxcpnmsub f2, A4, f10, f2 fxcpnmsub f3, A4, f11, f3 fxsmul f4, A5, f4 fxsmul f5, A5, f5 fxsmul f6, A5, f6 fxsmul f7, A5, f7 fxcpnmsub f0, A5, f4, f0 fxcpnmsub f1, A5, f5, f1 fxcpnmsub f2, A5, f6, f2 fxcpnmsub f3, A5, f7, f3 fxpmul f0, A6, f0 fxpmul f1, A6, f1 fxpmul f2, A6, f2 fxpmul f3, A6, f3 #endif #ifdef LN subi CO1, CO1, 8 * SIZE subi CO2, CO2, 8 * SIZE subi CO3, CO3, 8 * SIZE subi CO4, CO4, 8 * SIZE 
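/* LN variant: C is traversed backwards, so the column pointers are stepped
   back over the 8 elements that are about to be stored (and rewound once
   more after the stores). */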
#endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC4 STFPDUX f8, BO2, INC4 STFPDUX f4, BO, INC4 STFPDUX f12, BO2, INC4 STFPDUX f1, BO, INC4 STFPDUX f9, BO2, INC4 STFPDUX f5, BO, INC4 STFPDUX f13, BO2, INC4 STFPDUX f2, BO, INC4 STFPDUX f10, BO2, INC4 STFPDUX f6, BO, INC4 STFPDUX f14, BO2, INC4 STFPDUX f3, BO, INC4 STFPDUX f11, BO2, INC4 STFPDUX f7, BO, INC4 STFPDUX f15, BO2, INC4 subi BO, BO, 32 * SIZE subi BO2, BO2, 32 * SIZE STFDUX f0, CO1, INC STFDUX f4, CO1, INC STFDUX f1, CO1, INC STFDUX f5, CO1, INC STFDUX f2, CO1, INC STFDUX f6, CO1, INC STFDUX f3, CO1, INC STFDUX f7, CO1, INC STFSDUX f0, CO2, INC STFSDUX f4, CO2, INC STFSDUX f1, CO2, INC STFSDUX f5, CO2, INC STFSDUX f2, CO2, INC STFSDUX f6, CO2, INC STFSDUX f3, CO2, INC STFSDUX f7, CO2, INC STFDUX f8, CO3, INC STFDUX f12, CO3, INC STFDUX f9, CO3, INC STFDUX f13, CO3, INC STFDUX f10, CO3, INC STFDUX f14, CO3, INC STFDUX f11, CO3, INC STFDUX f15, CO3, INC STFSDUX f8, CO4, INC STFSDUX f12, CO4, INC STFSDUX f9, CO4, INC STFSDUX f13, CO4, INC STFSDUX f10, CO4, INC STFSDUX f14, CO4, INC STFSDUX f11, CO4, INC STFSDUX f15, CO4, INC #else STFPDUX f0, AO, INC4 STFPDUX f1, AO2, INC4 STFPDUX f2, AO, INC4 STFPDUX f3, AO2, INC4 STFPDUX f4, AO, INC4 STFPDUX f5, AO2, INC4 STFPDUX f6, AO, INC4 STFPDUX f7, AO2, INC4 STFPDUX f8, AO, INC4 STFPDUX f9, AO2, INC4 STFPDUX f10, AO, INC4 STFPDUX f11, AO2, INC4 STFPDUX f12, AO, INC4 STFPDUX f13, AO2, INC4 STFPDUX f14, AO, INC4 STFPDUX f15, AO2, INC4 subi AO, AO, 32 * SIZE subi AO2, AO2, 32 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC STFDUX f4, CO2, INC STFSDUX f4, CO2, INC STFDUX f5, CO2, INC STFSDUX f5, CO2, INC STFDUX f6, CO2, INC STFSDUX f6, CO2, INC STFDUX f7, CO2, INC STFSDUX f7, CO2, INC STFDUX f8, CO3, INC STFSDUX f8, CO3, INC STFDUX f9, CO3, INC STFSDUX f9, CO3, INC STFDUX f10, CO3, INC STFSDUX f10, CO3, INC STFDUX f11, CO3, INC STFSDUX f11, CO3, INC STFDUX f12, CO4, INC STFSDUX f12, CO4, INC STFDUX f13, CO4, INC STFSDUX f13, CO4, INC STFDUX f14, CO4, INC STFSDUX f14, CO4, INC STFDUX f15, CO4, INC STFSDUX f15, CO4, INC #endif #ifdef LN subi CO1, CO1, 8 * SIZE subi CO2, CO2, 8 * SIZE subi CO3, CO3, 8 * SIZE subi CO4, CO4, 8 * SIZE #endif #ifdef RT slwi r0, K, 3 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 3 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 8 #endif #ifdef LN subi KK, KK, 8 #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L11 .align 4 .L49: #ifdef LN slwi r0, K, 2 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) addi B, BO, 4 * SIZE #endif #ifdef RN addi KK, KK, 4 #endif #ifdef RT subi KK, KK, 4 #endif addic. J, J, -1 bgt+ .L10 .align 4 .L50: andi. J, N, 2 beq .L90 #ifdef RT slwi r0, K, 1 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) addi AORIG, A, -2 * SIZE #else addi AO, A, -2 * SIZE #endif #ifndef RT add C, CO2, LDC #endif li r0, FZERO lfpsx f0, SP, r0 andi. I, M, 1 beq .L60 #if defined(LT) || defined(RN) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. 
r0, KK, 3 mtspr CTR, r0 ble .L84 #else #ifdef LN slwi r0, K, 0 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 0 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L84 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX B4, BO, INC2 bdz- .L83 .align 4 .L82: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC2 fxcsmadd f1, A1, B2, f1 LFPDUX B2, BO, INC2 LFPDUX A1, AO, INC2 fxcpmadd f2, A2, B3, f2 LFPDUX B3, BO, INC2 fxcsmadd f3, A2, B4, f3 LFPDUX B4, BO, INC2 LFPDUX A2, AO, INC2 fxcpmadd f0, A3, B1, f0 LFPDUX B1, BO, INC2 fxcsmadd f1, A3, B2, f1 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 fxcpmadd f2, A4, B3, f2 LFPDUX B3, BO, INC2 fxcsmadd f3, A4, B4, f3 LFPDUX B4, BO, INC2 LFPDUX A4, AO, INC2 bdnz+ .L82 .align 4 .L83: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC2 fxcsmadd f1, A1, B2, f1 LFPDUX B2, BO, INC2 fxcpmadd f2, A2, B3, f2 LFPDUX B3, BO, INC2 fxcsmadd f3, A2, B4, f3 LFPDUX B4, BO, INC2 fxcpmadd f0, A3, B1, f0 fxcsmadd f1, A3, B2, f1 fxcpmadd f2, A4, B3, f2 fxcsmadd f3, A4, B4, f3 .align 4 .L84: #if defined(LT) || defined(RN) andi. r0, KK, 7 mtspr CTR, r0 ble+ .L88 #else andi. r0, TEMP, 7 mtspr CTR, r0 ble+ .L88 #endif LFDX A1, AO, INC2 LFPDUX B1, BO, INC2 add AO, AO, INC bdz- .L87 .align 4 .L86: fxcpmadd f0, A1, B1, f0 LFDX A1, AO, INC2 LFPDUX B1, BO, INC2 add AO, AO, INC bdnz+ .L86 .align 4 .L87: fxcpmadd f0, A1, B1, f0 .align 4 .L88: fpadd f0, f0, f1 fpadd f2, f2, f3 fpadd f0, f0, f2 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDX f16, BO, INC2 fpsub f0, f16, f0 #else LFPDX f16, AO, INC2 fpsub f0, f16, f0 #endif #ifdef LN LFPDX A1, AO, INC2 fxpmul f0, A1, f0 #endif #ifdef LT LFPDX A1, AO, INC2 fxpmul f0, A1, f0 #endif #ifdef RN LFD A1, (2 + 0) * SIZE(BO) LFD A2, (2 + 1) * SIZE(BO) LFD A3, (2 + 3) * SIZE(BO) fsmtp f1, f0 fmul f0, A1, f0 fnmsub f1, A2, f0, f1 fmul f1, A3, f1 fsmfp f0, f1 #endif #ifdef RT LFD A1, (2 + 3) * SIZE(BO) LFD A2, (2 + 2) * SIZE(BO) LFD A3, (2 + 0) * SIZE(BO) fsmtp f1, f0 fmul f1, A1, f1 fnmsub f0, A2, f1, f0 fmul f0, A3, f0 fsmfp f0, f1 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE #endif #if defined(LN) || defined(LT) STFPDX f0, BO, INC2 STFDUX f0, CO1, INC STFSDUX f0, CO2, INC #else STFPDX f0, AO, INC2 STFDUX f0, CO1, INC STFDUX f1, CO2, INC #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L60: andi. I, M, 2 beq .L70 #if defined(LT) || defined(RN) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 3 mtspr CTR, r0 ble .L74 #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 1 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. 
r0, TEMP, 3 mtspr CTR, r0 ble .L74 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdz- .L73 .align 4 .L72: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f2, B2, A2, f2 fxcsmadd f3, B2, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A3, f0 fxcsmadd f1, B3, A3, f1 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f2, B4, A4, f2 fxcsmadd f3, B4, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 fxcpmadd f0, B5, A5, f0 fxcsmadd f1, B5, A5, f1 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 fxcpmadd f2, B6, A6, f2 fxcsmadd f3, B6, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 fxcpmadd f0, A9, A7, f0 fxcsmadd f1, A9, A7, f1 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 fxcpmadd f2, A10, A8, f2 fxcsmadd f3, A10, A8, f3 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdnz+ .L72 .align 4 .L73: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 fxcpmadd f2, B2, A2, f2 fxcsmadd f3, B2, A2, f3 fxcpmadd f0, B3, A3, f0 fxcsmadd f1, B3, A3, f1 fxcpmadd f2, B4, A4, f2 fxcsmadd f3, B4, A4, f3 fxcpmadd f0, B5, A5, f0 fxcsmadd f1, B5, A5, f1 fxcpmadd f2, B6, A6, f2 fxcsmadd f3, B6, A6, f3 fxcpmadd f0, A9, A7, f0 fxcsmadd f1, A9, A7, f1 fxcpmadd f2, A10, A8, f2 fxcsmadd f3, A10, A8, f3 .align 4 .L74: #if defined(LT) || defined(RN) andi. r0, KK, 7 mtspr CTR, r0 ble+ .L78 #else andi. r0, TEMP, 7 mtspr CTR, r0 ble+ .L78 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdz- .L77 .align 4 .L76: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L76 .align 4 .L77: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 .align 4 .L78: fpadd f0, f0, f2 fpadd f1, f1, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 2 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) fpmr f24, f0 fsmfp f0, f1 fsmtp f1, f24 LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 subi BO, BO, 4 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 #else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 subi AO, AO, 4 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 #endif #ifdef LN LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 addi AO, AO, -4 * SIZE fxsmul f1, A2, f1 fxcpnmsub f0, A2, f1, f0 fxpmul f0, A1, f0 #endif #ifdef LT LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 addi AO, AO, -4 * SIZE fxpmul f0, A1, f0 fxcsnmsub f1, A1, f0, f1 fxsmul f1, A2, f1 #endif #ifdef RN LFPDUX A1, BO, INC2 LFPDUX A2, BO, INC2 subi BO, BO, 4 * SIZE fxpmul f0, A1, f0 fxcsnmsub f1, A1, f0, f1 fxsmul f1, A2, f1 #endif #ifdef RT LFPDUX A2, BO, INC2 LFPDUX A1, BO, INC2 subi BO, BO, 4 * SIZE fxsmul f1, A1, f1 fxcpnmsub f0, A1, f1, f0 fxpmul f0, A2, f0 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f1, BO, INC2 subi BO, BO, 4 * SIZE STFDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f0, CO2, INC STFSDUX f1, CO2, INC #else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 subi AO, AO, 4 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO2, INC STFSDUX f1, CO2, INC #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK 
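/* LT/RN bookkeeping for the 2x2 tail: TEMP = K - KK is the number of k
   iterations this sub-problem did not consume, and AO/BO are advanced past
   them (both shifts are 1 + BASE_SHIFT because the M and N block widths are
   both 2 here). */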
slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L70: andi. I, M, 4 beq .L80 #if defined(LT) || defined(RN) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 2 mtspr CTR, r0 ble .L64 #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 2 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK fpmr f1, f0 addi BO, BO, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L64 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L63 .align 4 .L62: fxcpmadd f0, B1, A1, f0 fxcsmadd f2, B1, A1, f2 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 fxcsmadd f3, B1, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f0, B2, A3, f0 fxcsmadd f2, B2, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f1, B2, A4, f1 fxcsmadd f3, B2, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A5, f0 fxcsmadd f2, B3, A5, f2 LFPDUX A5, AO, INC2 fxcpmadd f1, B3, A6, f1 fxcsmadd f3, B3, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f0, B4, A7, f0 fxcsmadd f2, B4, A7, f2 LFPDUX A7, AO, INC2 fxcpmadd f1, B4, A8, f1 fxcsmadd f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L62 .align 4 .L63: fxcpmadd f0, B1, A1, f0 fxcsmadd f2, B1, A1, f2 fxcpmadd f1, B1, A2, f1 fxcsmadd f3, B1, A2, f3 fxcpmadd f0, B2, A3, f0 fxcsmadd f2, B2, A3, f2 fxcpmadd f1, B2, A4, f1 fxcsmadd f3, B2, A4, f3 fxcpmadd f0, B3, A5, f0 fxcsmadd f2, B3, A5, f2 fxcpmadd f1, B3, A6, f1 fxcsmadd f3, B3, A6, f3 fxcpmadd f0, B4, A7, f0 fxcsmadd f2, B4, A7, f2 fxcpmadd f1, B4, A8, f1 fxcsmadd f3, B4, A8, f3 .align 4 .L64: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L68 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L68 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdz- .L67 .align 4 .L66: fxcpmadd f0, B1, A1, f0 fxcsmadd f2, B1, A1, f2 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 fxcsmadd f3, B1, A2, f3 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdnz+ .L66 .align 4 .L67: fxcpmadd f0, B1, A1, f0 fxcsmadd f2, B1, A1, f2 fxcpmadd f1, B1, A2, f1 fxcsmadd f3, B1, A2, f3 .align 4 .L68: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 2 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) fpmr f24, f0 fpmr f25, f1 fsmfp f0, f2 fsmfp f1, f3 fsmtp f2, f24 fsmtp f3, f25 LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 LFPDUX f18, BO, INC2 LFPDUX f19, BO, INC2 subi BO, BO, 8 * SIZE fpsub f0, f16, f0 fpsub f2, f17, f2 fpsub f1, f18, f1 fpsub f3, f19, f3 #else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 LFPDUX f18, AO, INC2 LFPDUX f19, AO, INC2 subi AO, AO, 8 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f2, f18, f2 fpsub f3, f19, f3 #endif #ifdef LN addi AO, AO, 18 * SIZE LFPDUX A1, AO, INCM2 LFPDUX A2, AO, INCM2 LFPDUX A3, AO, INCM2 LFPDUX A4, AO, INCM2 add AO, AO, INCM2 LFPDUX A5, AO, INCM2 add AO, AO, INCM2 LFPDUX A6, AO, INCM2 subi AO, AO, 2 * SIZE fxsmul f3, A1, f3 fxcpnmsub f1, A1, f3, f1 fxcsnmsub f2, A2, f3, f2 fxcpnmsub f0, A2, f3, f0 fxpmul f1, A3, f1 fxcsnmsub f2, A4, f1, f2 fxcpnmsub f0, A4, f1, f0 fxsmul f2, A5, f2 fxcpnmsub f0, A5, f2, f0 fxpmul f0, A6, f0 #endif #ifdef LT LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 add AO, AO, INC2 LFPDUX A5, AO, INC2 add AO, AO, INC2 LFPDUX A6, AO, INC2 subi AO, AO, 16 * SIZE fxpmul f0, A1, f0 fxcsnmsub f2, A1, f0, f2 fxcpnmsub f1, A2, f0, f1 fxcsnmsub f3, A2, f0, f3 fxsmul f2, A3, f2 fxcpnmsub f1, A4, f2, f1 fxcsnmsub f3, A4, f2, f3 fxpmul f1, A5, f1 fxcsnmsub f3, A5, f1, f3 fxsmul f3, A6, f3 #endif #ifdef RN LFPDUX A1, BO, INC2 LFPDUX A2, BO, INC2 subi BO, BO, 4 * SIZE fxpmul f0, A1, f0 fxpmul f1, A1, f1 fxcsnmsub f2, A1, f0, f2 fxcsnmsub f3, A1, f1, f3 fxsmul f2, A2, f2 fxsmul f3, A2, f3 #endif #ifdef RT LFPDUX A2, BO, INC2 LFPDUX A1, BO, INC2 subi BO, BO, 4 * SIZE fxsmul f2, A1, f2 fxsmul f3, A1, f3 fxcpnmsub f0, A1, f2, f0 fxcpnmsub f1, A1, f3, f1 fxpmul f0, A2, f0 fxpmul f1, A2, f1 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f2, BO, INC2 STFPDUX f1, BO, INC2 STFPDUX f3, BO, INC2 subi BO, BO, 8 * SIZE STFDUX f0, CO1, INC STFDUX f2, CO1, INC STFDUX f1, CO1, INC STFDUX f3, CO1, INC STFSDUX f0, CO2, INC STFSDUX f2, CO2, INC STFSDUX f1, CO2, INC STFSDUX f3, CO2, INC #else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 STFPDUX f2, AO, INC2 STFPDUX f3, AO, INC2 subi AO, AO, 8 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO2, INC STFSDUX f2, CO2, INC STFDUX f3, CO2, INC STFSDUX f3, CO2, INC #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L80: srawi. 
I, M, 3 ble .L89 .align 4 .L51: #if defined(LT) || defined(RN) fpmr f4, f0 addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 srawi. r0, KK, 2 fpmr f3, f0 mtspr CTR, r0 fpmr f7, f0 ble .L54 #else #ifdef LN slwi r0, K, 3 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 3 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK fpmr f4, f0 addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 srawi. r0, TEMP, 2 fpmr f3, f0 mtspr CTR, r0 fpmr f7, f0 ble .L54 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L53 .align 4 .L52: fxcpmadd f0, B1, A1, f0 LFPDUX B4, BO, INC2 fxcsmadd f4, B1, A1, f4 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 nop fxcsmadd f6, B1, A3, f6 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A4, AO, INC2 fxcpmadd f0, B2, A5, f0 LFPDUX B1, BO, INC2 fxcsmadd f4, B2, A5, f4 LFPDUX A5, AO, INC2 fxcpmadd f1, B2, A6, f1 nop fxcsmadd f5, B2, A6, f5 LFPDUX A6, AO, INC2 fxcpmadd f2, B2, A7, f2 nop fxcsmadd f6, B2, A7, f6 LFPDUX A7, AO, INC2 fxcpmadd f3, B2, A8, f3 nop fxcsmadd f7, B2, A8, f7 LFPDUX A8, AO, INC2 fxcpmadd f0, B3, A1, f0 LFPDUX B2, BO, INC2 fxcsmadd f4, B3, A1, f4 LFPDUX A1, AO, INC2 fxcpmadd f1, B3, A2, f1 nop fxcsmadd f5, B3, A2, f5 LFPDUX A2, AO, INC2 fxcpmadd f2, B3, A3, f2 nop fxcsmadd f6, B3, A3, f6 LFPDUX A3, AO, INC2 fxcpmadd f3, B3, A4, f3 nop fxcsmadd f7, B3, A4, f7 LFPDUX A4, AO, INC2 fxcpmadd f0, B4, A5, f0 LFPDUX B3, BO, INC2 fxcsmadd f4, B4, A5, f4 LFPDUX A5, AO, INC2 fxcpmadd f1, B4, A6, f1 nop fxcsmadd f5, B4, A6, f5 LFPDUX A6, AO, INC2 fxcpmadd f2, B4, A7, f2 nop fxcsmadd f6, B4, A7, f6 LFPDUX A7, AO, INC2 fxcpmadd f3, B4, A8, f3 nop fxcsmadd f7, B4, A8, f7 LFPDUX A8, AO, INC2 bdnz+ .L52 .align 4 .L53: fxcpmadd f0, B1, A1, f0 LFPDUX B4, BO, INC2 fxcsmadd f4, B1, A1, f4 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 nop fxcsmadd f6, B1, A3, f6 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A4, AO, INC2 fxcpmadd f0, B2, A5, f0 nop fxcsmadd f4, B2, A5, f4 LFPDUX A5, AO, INC2 fxcpmadd f1, B2, A6, f1 nop fxcsmadd f5, B2, A6, f5 LFPDUX A6, AO, INC2 fxcpmadd f2, B2, A7, f2 nop fxcsmadd f6, B2, A7, f6 LFPDUX A7, AO, INC2 fxcpmadd f3, B2, A8, f3 nop fxcsmadd f7, B2, A8, f7 LFPDUX A8, AO, INC2 fxcpmadd f0, B3, A1, f0 fxcsmadd f4, B3, A1, f4 fxcpmadd f1, B3, A2, f1 fxcsmadd f5, B3, A2, f5 fxcpmadd f2, B3, A3, f2 fxcsmadd f6, B3, A3, f6 fxcpmadd f3, B3, A4, f3 fxcsmadd f7, B3, A4, f7 fxcpmadd f0, B4, A5, f0 fxcsmadd f4, B4, A5, f4 fxcpmadd f1, B4, A6, f1 fxcsmadd f5, B4, A6, f5 fxcpmadd f2, B4, A7, f2 fxcsmadd f6, B4, A7, f6 fxcpmadd f3, B4, A8, f3 fxcsmadd f7, B4, A8, f7 .align 4 .L54: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L58 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L58 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 bdz- .L57 .align 4 .L56: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 fxcsmadd f6, B1, A3, f6 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 fxcsmadd f7, B1, A4, f7 LFPDUX A4, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L56 .align 4 .L57: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 fxcpmadd f2, B1, A3, f2 fxcsmadd f6, B1, A3, f6 fxcpmadd f3, B1, A4, f3 fxcsmadd f7, B1, A4, f7 .align 4 .L58: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 8 #else subi r0, KK, 2 #endif slwi TEMP, r0, 3 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) fpmr f24, f0 fpmr f25, f1 fpmr f26, f2 fpmr f27, f3 fsmfp f0, f4 fsmfp f1, f5 fsmfp f2, f6 fsmfp f3, f7 fsmtp f4, f24 fsmtp f5, f25 fsmtp f6, f26 fsmtp f7, f27 LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 LFPDUX f18, BO, INC2 LFPDUX f19, BO, INC2 LFPDUX f20, BO, INC2 LFPDUX f21, BO, INC2 LFPDUX f22, BO, INC2 LFPDUX f23, BO, INC2 subi BO, BO, 16 * SIZE fpsub f0, f16, f0 fpsub f4, f17, f4 fpsub f1, f18, f1 fpsub f5, f19, f5 fpsub f2, f20, f2 fpsub f6, f21, f6 fpsub f3, f22, f3 fpsub f7, f23, f7 #else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 LFPDUX f18, AO, INC2 LFPDUX f19, AO, INC2 LFPDUX f20, AO, INC2 LFPDUX f21, AO, INC2 LFPDUX f22, AO, INC2 LFPDUX f23, AO, INC2 subi AO, AO, 16 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f2, f18, f2 fpsub f3, f19, f3 fpsub f4, f20, f4 fpsub f5, f21, f5 fpsub f6, f22, f6 fpsub f7, f23, f7 #endif #ifdef LN addi AO, AO, 66 * SIZE LFPDUX A1, AO, INCM2 LFPDUX A2, AO, INCM2 LFPDUX A3, AO, INCM2 LFPDUX A4, AO, INCM2 LFPDUX A5, AO, INCM2 LFPDUX A6, AO, INCM2 LFPDUX A7, AO, INCM2 LFPDUX A8, AO, INCM2 fxsmul f7, A1, f7 fxcpnmsub f3, A1, f7, f3 fxcsnmsub f6, A2, f7, f6 fxcpnmsub f2, A2, f7, f2 fxcsnmsub f5, A3, f7, f5 fxcpnmsub f1, A3, f7, f1 fxcsnmsub f4, A4, f7, f4 fxcpnmsub f0, A4, f7, f0 fxpmul f3, A5, f3 fxcsnmsub f6, A6, f3, f6 fxcpnmsub f2, A6, f3, f2 fxcsnmsub f5, A7, f3, f5 fxcpnmsub f1, A7, f3, f1 fxcsnmsub f4, A8, f3, f4 fxcpnmsub f0, A8, f3, f0 add AO, AO, INCM2 LFPDUX A1, AO, INCM2 LFPDUX A2, AO, INCM2 LFPDUX A3, AO, INCM2 add AO, AO, INCM2 LFPDUX A4, AO, INCM2 LFPDUX A5, AO, INCM2 LFPDUX A6, AO, INCM2 add AO, AO, INCM2 add AO, AO, INCM2 LFPDUX A7, AO, INCM2 LFPDUX A8, AO, INCM2 fxsmul f6, A1, f6 fxcpnmsub f2, A1, f6, f2 fxcsnmsub f5, A2, f6, f5 fxcpnmsub f1, A2, f6, f1 fxcsnmsub f4, A3, f6, f4 fxcpnmsub f0, A3, f6, f0 fxpmul f2, A4, f2 fxcsnmsub f5, A5, f2, f5 fxcpnmsub f1, A5, f2, f1 fxcsnmsub f4, A6, f2, f4 fxcpnmsub f0, A6, f2, f0 fxsmul f5, A7, f5 fxcpnmsub f1, A7, f5, f1 fxcsnmsub f4, A8, f5, f4 fxcpnmsub f0, A8, f5, f0 add AO, AO, INCM2 add AO, AO, INCM2 LFPDUX A1, AO, INCM2 LFPDUX A2, AO, INCM2 subi AO, AO, 6 * SIZE LFPDUX A3, AO, INCM2 subi AO, AO, 6 * SIZE LFPDUX A4, AO, INCM2 addi AO, AO, -2 * SIZE fxpmul f1, A1, f1 fxcsnmsub f4, A2, f1, f4 fxcpnmsub f0, A2, f1, f0 fxsmul f4, A3, f4 fxcpnmsub f0, A3, f4, f0 fxpmul f0, A4, f0 #endif #ifdef LT LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 fxpmul f0, A1, f0 fxcsnmsub f4, A1, f0, f4 fxcpnmsub f1, A2, f0, f1 fxcsnmsub f5, A2, f0, f5 fxcpnmsub f2, A3, f0, f2 fxcsnmsub f6, 
A3, f0, f6 fxcpnmsub f3, A4, f0, f3 fxcsnmsub f7, A4, f0, f7 fxsmul f4, A5, f4 fxcpnmsub f1, A6, f4, f1 fxcsnmsub f5, A6, f4, f5 fxcpnmsub f2, A7, f4, f2 fxcsnmsub f6, A7, f4, f6 fxcpnmsub f3, A8, f4, f3 fxcsnmsub f7, A8, f4, f7 add AO, AO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 add AO, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 add AO, AO, INC2 add AO, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 fxpmul f1, A1, f1 fxcsnmsub f5, A1, f1, f5 fxcpnmsub f2, A2, f1, f2 fxcsnmsub f6, A2, f1, f6 fxcpnmsub f3, A3, f1, f3 fxcsnmsub f7, A3, f1, f7 fxsmul f5, A4, f5 fxcpnmsub f2, A5, f5, f2 fxcsnmsub f6, A5, f5, f6 fxcpnmsub f3, A6, f5, f3 fxcsnmsub f7, A6, f5, f7 fxpmul f2, A7, f2 fxcsnmsub f6, A7, f2, f6 fxcpnmsub f3, A8, f2, f3 fxcsnmsub f7, A8, f2, f7 add AO, AO, INC2 add AO, AO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 addi AO, AO, 6 * SIZE LFPDUX A3, AO, INC2 addi AO, AO, 6 * SIZE LFPDUX A4, AO, INC2 subi AO, AO, 64 * SIZE fxsmul f6, A1, f6 fxcpnmsub f3, A2, f6, f3 fxcsnmsub f7, A2, f6, f7 fxpmul f3, A3, f3 fxcsnmsub f7, A3, f3, f7 fxsmul f7, A4, f7 #endif #ifdef RN LFPDUX A1, BO, INC2 LFPDUX A2, BO, INC2 subi BO, BO, 4 * SIZE fxpmul f0, A1, f0 fxpmul f1, A1, f1 fxpmul f2, A1, f2 fxpmul f3, A1, f3 fxcsnmsub f4, A1, f0, f4 fxcsnmsub f5, A1, f1, f5 fxcsnmsub f6, A1, f2, f6 fxcsnmsub f7, A1, f3, f7 fxsmul f4, A2, f4 fxsmul f5, A2, f5 fxsmul f6, A2, f6 fxsmul f7, A2, f7 #endif #ifdef RT LFPDUX A2, BO, INC2 LFPDUX A1, BO, INC2 subi BO, BO, 4 * SIZE fxsmul f4, A1, f4 fxsmul f5, A1, f5 fxsmul f6, A1, f6 fxsmul f7, A1, f7 fxcpnmsub f0, A1, f4, f0 fxcpnmsub f1, A1, f5, f1 fxcpnmsub f2, A1, f6, f2 fxcpnmsub f3, A1, f7, f3 fxpmul f0, A2, f0 fxpmul f1, A2, f1 fxpmul f2, A2, f2 fxpmul f3, A2, f3 #endif #ifdef LN subi CO1, CO1, 8 * SIZE subi CO2, CO2, 8 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f4, BO, INC2 STFPDUX f1, BO, INC2 STFPDUX f5, BO, INC2 STFPDUX f2, BO, INC2 STFPDUX f6, BO, INC2 STFPDUX f3, BO, INC2 STFPDUX f7, BO, INC2 subi BO, BO, 16 * SIZE STFDUX f0, CO1, INC STFDUX f4, CO1, INC STFDUX f1, CO1, INC STFDUX f5, CO1, INC STFDUX f2, CO1, INC STFDUX f6, CO1, INC STFDUX f3, CO1, INC STFDUX f7, CO1, INC STFSDUX f0, CO2, INC STFSDUX f4, CO2, INC STFSDUX f1, CO2, INC STFSDUX f5, CO2, INC STFSDUX f2, CO2, INC STFSDUX f6, CO2, INC STFSDUX f3, CO2, INC STFSDUX f7, CO2, INC #else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 STFPDUX f2, AO, INC2 STFPDUX f3, AO, INC2 STFPDUX f4, AO, INC2 STFPDUX f5, AO, INC2 STFPDUX f6, AO, INC2 STFPDUX f7, AO, INC2 subi AO, AO, 16 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC STFDUX f4, CO2, INC STFSDUX f4, CO2, INC STFDUX f5, CO2, INC STFSDUX f5, CO2, INC STFDUX f6, CO2, INC STFSDUX f6, CO2, INC STFDUX f7, CO2, INC STFSDUX f7, CO2, INC #endif #ifdef LN subi CO1, CO1, 8 * SIZE subi CO2, CO2, 8 * SIZE #endif #ifdef RT slwi r0, K, 3 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 3 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 8 #endif #ifdef LN subi KK, KK, 8 #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L51 .align 4 .L89: #ifdef LN slwi r0, K, 1 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) addi B, BO, 2 * SIZE #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif .align 4 .L90: andi. 
J, N, 1 beq .L999 #ifdef RT slwi r0, K, 0 + BASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) addi AORIG, A, -2 * SIZE #else addi AO, A, -2 * SIZE #endif #ifndef RT add C, CO1, LDC #endif li r0, FZERO lfpsx f0, SP, r0 andi. I, M, 1 beq .L100 #if defined(LT) || defined(RN) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 3 mtspr CTR, r0 ble .L124 #else #ifdef LN slwi r0, K, 0 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 0 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L124 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 bdz- .L123 .align 4 .L122: fpmadd f0, A1, B1, f0 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 fpmadd f1, A2, B2, f1 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 fpmadd f2, A3, B3, f2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 fpmadd f3, A4, B4, f3 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L122 .align 4 .L123: fpmadd f0, A1, B1, f0 fpmadd f1, A2, B2, f1 fpmadd f2, A3, B3, f2 fpmadd f3, A4, B4, f3 .align 4 .L124: #if defined(LT) || defined(RN) andi. r0, KK, 7 mtspr CTR, r0 ble+ .L128 #else andi. r0, TEMP, 7 mtspr CTR, r0 ble+ .L128 #endif LFDX A1, AO, INC2 LFDX B1, BO, INC2 add AO, AO, INC add BO, BO, INC bdz- .L127 .align 4 .L126: fmadd f0, A1, B1, f0 LFDX A1, AO, INC2 LFDX B1, BO, INC2 add AO, AO, INC add BO, BO, INC bdnz+ .L126 .align 4 .L127: fmadd f0, A1, B1, f0 .align 4 .L128: fpadd f0, f0, f1 fpadd f2, f2, f3 fpadd f0, f0, f2 fsmtp f1, f0 fadd f0, f0, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 1 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFDX f16, BO, INC2 fsub f0, f16, f0 #else LFDX f16, AO, INC2 fsub f0, f16, f0 #endif #ifdef LN LFD A1, (2 + 0) * SIZE(AO) fmul f0, A1, f0 #endif #ifdef LT LFD A1, (2 + 0) * SIZE(AO) fmul f0, A1, f0 #endif #ifdef RN LFDX A1, BO, INC2 fmul f0, A1, f0 #endif #ifdef RT LFDX A1, BO, INC2 fmul f0, A1, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE #endif #if defined(LN) || defined(LT) STFDX f0, BO, INC2 STFDUX f0, CO1, INC #else STFDX f0, AO, INC2 STFDUX f0, CO1, INC #endif #ifdef LN subi CO1, CO1, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L100: andi. I, M, 2 beq .L110 #if defined(LT) || defined(RN) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 3 mtspr CTR, r0 ble .L114 #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 1 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. 
r0, TEMP, 3 mtspr CTR, r0 ble .L114 #endif LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdz- .L113 .align 4 .L112: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcsmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f2, B2, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B2, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B3, A6, f1 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f2, B4, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L112 .align 4 .L113: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A2, f1 fxcpmadd f2, B2, A3, f2 fxcsmadd f3, B2, A4, f3 fxcpmadd f0, B3, A5, f0 fxcsmadd f1, B3, A6, f1 fxcpmadd f2, B4, A7, f2 fxcsmadd f3, B4, A8, f3 .align 4 .L114: #if defined(LT) || defined(RN) andi. r0, KK, 7 mtspr CTR, r0 ble+ .L118 #else andi. r0, TEMP, 7 mtspr CTR, r0 ble+ .L118 #endif LFPDUX A1, AO, INC2 LFDX B1, BO, INC2 add BO, BO, INC bdz- .L117 .align 4 .L116: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 LFDX B1, BO, INC2 add BO, BO, INC bdnz+ .L116 .align 4 .L117: fxcpmadd f0, B1, A1, f0 .align 4 .L118: fpadd f0, f0, f1 fpadd f2, f3, f2 fpadd f0, f0, f2 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDX f16, BO, INC2 fpsub f0, f16, f0 #else LFPDX f16, AO, INC2 fpsub f0, f16, f0 #endif #ifdef LN fsmtp f4, f0 LFD A1, (2 + 3) * SIZE(AO) LFD A2, (2 + 2) * SIZE(AO) LFD A3, (2 + 0) * SIZE(AO) fmul f4, A1, f4 fnmsub f0, A2, f4, f0 fmul f0, A3, f0 fsmfp f0, f4 #endif #ifdef LT fsmtp f4, f0 LFD A1, (2 + 0) * SIZE(AO) LFD A2, (2 + 1) * SIZE(AO) LFD A3, (2 + 3) * SIZE(AO) fmul f0, A1, f0 fnmsub f4, A2, f0, f4 fmul f4, A3, f4 fsmfp f0, f4 #endif #ifdef RN LFPDX A1, BO, INC2 fxpmul f0, A1, f0 #endif #ifdef RT LFPDX A1, BO, INC2 fxpmul f0, A1, f0 #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFPDX f0, BO, INC2 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC #else STFPDX f0, AO, INC2 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L110: andi. I, M, 4 beq .L120 #if defined(LT) || defined(RN) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 3 mtspr CTR, r0 ble .L104 #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 2 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. 
r0, TEMP, 3 mtspr CTR, r0 ble .L104 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX B4, BO, INC2 bdz- .L103 .align 4 .L102: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcsmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f0, B2, A5, f0 LFPDUX A5, AO, INC2 fxcpmadd f1, B2, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B2, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B2, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B3, A2, f1 LFPDUX A2, AO, INC2 fxcsmadd f2, B3, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B3, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f0, B4, A5, f0 LFPDUX A5, AO, INC2 fxcpmadd f1, B4, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B4, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L102 .align 4 .L103: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcsmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 fxcpmadd f0, B2, A5, f0 LFPDUX A5, AO, INC2 fxcpmadd f1, B2, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B2, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B2, A8, f3 LFPDUX A8, AO, INC2 fxcpmadd f0, B3, A1, f0 fxcpmadd f1, B3, A2, f1 fxcsmadd f2, B3, A3, f2 fxcsmadd f3, B3, A4, f3 fxcpmadd f0, B4, A5, f0 fxcpmadd f1, B4, A6, f1 fxcsmadd f2, B4, A7, f2 fxcsmadd f3, B4, A8, f3 .align 4 .L104: #if defined(LT) || defined(RN) andi. r0, KK, 7 mtspr CTR, r0 ble+ .L108 #else andi. r0, TEMP, 7 mtspr CTR, r0 ble+ .L108 #endif LFPDUX A1, AO, INC2 LFDX B1, BO, INC2 LFPDUX A2, AO, INC2 add BO, BO, INC bdz- .L107 .align 4 .L106: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFDX B1, BO, INC2 LFPDUX A2, AO, INC2 add BO, BO, INC bdnz+ .L106 .align 4 .L107: fxcpmadd f0, B1, A1, f0 fxcpmadd f1, B1, A2, f1 .align 4 .L108: fpadd f0, f0, f2 fpadd f1, f1, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 1 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 subi BO, BO, 4 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 #else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 subi AO, AO, 4 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 #endif #ifdef LN fsmtp f4, f0 fsmtp f5, f1 LFD A1, (2 + 15) * SIZE(AO) LFD A2, (2 + 14) * SIZE(AO) LFD A3, (2 + 13) * SIZE(AO) LFD A4, (2 + 12) * SIZE(AO) fmul f5, A1, f5 fnmsub f1, A2, f5, f1 fnmsub f4, A3, f5, f4 fnmsub f0, A4, f5, f0 LFD A1, (2 + 10) * SIZE(AO) LFD A2, (2 + 9) * SIZE(AO) LFD A3, (2 + 8) * SIZE(AO) fmul f1, A1, f1 fnmsub f4, A2, f1, f4 fnmsub f0, A3, f1, f0 LFD A1, (2 + 5) * SIZE(AO) LFD A2, (2 + 4) * SIZE(AO) fmul f4, A1, f4 fnmsub f0, A2, f4, f0 LFD A1, (2 + 0) * SIZE(AO) fmul f0, A1, f0 fsmfp f0, f4 fsmfp f1, f5 #endif #ifdef LT fsmtp f4, f0 fsmtp f5, f1 LFD A1, (2 + 0) * SIZE(AO) LFD A2, (2 + 1) * SIZE(AO) LFD A3, (2 + 2) * SIZE(AO) LFD A4, (2 + 3) * SIZE(AO) fmul f0, A1, f0 fnmsub f4, A2, f0, f4 fnmsub f1, A3, f0, f1 fnmsub f5, A4, f0, f5 LFD A1, (2 + 5) * SIZE(AO) LFD A2, (2 + 6) * SIZE(AO) LFD A3, (2 + 7) * SIZE(AO) fmul f4, A1, f4 fnmsub f1, A2, f4, f1 fnmsub f5, A3, f4, f5 LFD A1, (2 + 10) * SIZE(AO) LFD A2, 
(2 + 11) * SIZE(AO) fmul f1, A1, f1 fnmsub f5, A2, f1, f5 LFD A1, (2 + 15) * SIZE(AO) fmul f5, A1, f5 fsmfp f0, f4 fsmfp f1, f5 #endif #ifdef RN LFPDX A1, BO, INC2 fxpmul f0, A1, f0 fxpmul f1, A1, f1 #endif #ifdef RT LFPDX A1, BO, INC2 fxpmul f0, A1, f0 fxpmul f1, A1, f1 #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f1, BO, INC2 subi BO, BO, 4 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC #else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 subi AO, AO, 4 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L120: srawi. I, M, 3 ble .L129 .align 4 .L91: #if defined(LT) || defined(RN) fpmr f1, f0 addi BO, B, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 2 mtspr CTR, r0 ble .L94 #else #ifdef LN slwi r0, K, 3 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 3 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK fpmr f1, f0 addi BO, BO, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L94 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L93 .align 4 .L92: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 fxcsmadd f0, B1, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B1, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B1, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B1, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f0, B2, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B2, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B2, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B2, A4, f3 LFPDUX A4, AO, INC2 fxcsmadd f0, B2, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B2, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B2, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B2, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B2, BO, INC2 bdnz+ .L92 .align 4 .L93: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 fxcsmadd f0, B1, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B1, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B1, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B1, A8, f3 LFPDUX A8, AO, INC2 fxcpmadd f0, B2, A1, f0 fxcpmadd f1, B2, A2, f1 fxcpmadd f2, B2, A3, f2 fxcpmadd f3, B2, A4, f3 fxcsmadd f0, B2, A5, f0 fxcsmadd f1, B2, A6, f1 fxcsmadd f2, B2, A7, f2 fxcsmadd f3, B2, A8, f3 .align 4 .L94: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L98 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L98 #endif LFDX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 add BO, BO, INC bdz- .L97 .align 4 .L96: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 LFDX B1, BO, INC2 LFPDUX A4, AO, INC2 add BO, BO, INC bdnz+ .L96 .align 4 .L97: fxcpmadd f0, B1, A1, f0 fxcpmadd f1, B1, A2, f1 fxcpmadd f2, B1, A3, f2 fxcpmadd f3, B1, A4, f3 .align 4 .L98: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 8 #else subi r0, KK, 1 #endif slwi TEMP, r0, 3 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 LFPDUX f18, BO, INC2 LFPDUX f19, BO, INC2 subi BO, BO, 8 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f2, f18, f2 fpsub f3, f19, f3 #else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 LFPDUX f18, AO, INC2 LFPDUX f19, AO, INC2 subi AO, AO, 8 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f2, f18, f2 fpsub f3, f19, f3 #endif #ifdef LN fsmtp f4, f0 fsmtp f5, f1 fsmtp f6, f2 fsmtp f7, f3 LFD A1, (2 + 63) * SIZE(AO) LFD A2, (2 + 62) * SIZE(AO) LFD A3, (2 + 61) * SIZE(AO) LFD A4, (2 + 60) * SIZE(AO) LFD A5, (2 + 59) * SIZE(AO) LFD A6, (2 + 58) * SIZE(AO) LFD A7, (2 + 57) * SIZE(AO) LFD A8, (2 + 56) * SIZE(AO) fmul f7, A1, f7 fnmsub f3, A2, f7, f3 fnmsub f6, A3, f7, f6 fnmsub f2, A4, f7, f2 fnmsub f5, A5, f7, f5 fnmsub f1, A6, f7, f1 fnmsub f4, A7, f7, f4 fnmsub f0, A8, f7, f0 LFD A1, (2 + 54) * SIZE(AO) LFD A2, (2 + 53) * SIZE(AO) LFD A3, (2 + 52) * SIZE(AO) LFD A4, (2 + 51) * SIZE(AO) LFD A5, (2 + 50) * SIZE(AO) LFD A6, (2 + 49) * SIZE(AO) LFD A7, (2 + 48) * SIZE(AO) fmul f3, A1, f3 fnmsub f6, A2, f3, f6 fnmsub f2, A3, f3, f2 fnmsub f5, A4, f3, f5 fnmsub f1, A5, f3, f1 fnmsub f4, A6, f3, f4 fnmsub f0, A7, f3, f0 LFD A1, (2 + 45) * SIZE(AO) LFD A2, (2 + 44) * SIZE(AO) LFD A3, (2 + 43) * SIZE(AO) LFD A4, (2 + 42) * SIZE(AO) LFD A5, (2 + 41) * SIZE(AO) LFD A6, (2 + 40) * SIZE(AO) fmul f6, A1, f6 fnmsub f2, A2, f6, f2 fnmsub f5, A3, f6, f5 fnmsub f1, A4, f6, f1 fnmsub f4, A5, f6, f4 fnmsub f0, A6, f6, f0 LFD A1, (2 + 36) * SIZE(AO) LFD A2, (2 + 35) * SIZE(AO) LFD A3, (2 + 34) * SIZE(AO) LFD A4, (2 + 33) * SIZE(AO) LFD A5, (2 + 32) * SIZE(AO) fmul f2, A1, f2 fnmsub f5, A2, f2, f5 fnmsub f1, A3, f2, f1 fnmsub f4, A4, f2, f4 fnmsub f0, A5, f2, f0 LFD A1, (2 + 27) * SIZE(AO) LFD A2, (2 + 26) * SIZE(AO) LFD A3, (2 + 25) * SIZE(AO) LFD A4, (2 + 24) * SIZE(AO) fmul f5, A1, f5 fnmsub f1, A2, f5, f1 fnmsub f4, A3, f5, f4 fnmsub f0, A4, f5, f0 LFD A1, (2 + 18) * SIZE(AO) LFD A2, (2 + 17) * SIZE(AO) LFD A3, (2 + 16) * SIZE(AO) fmul f1, A1, f1 fnmsub f4, A2, f1, f4 fnmsub f0, A3, f1, f0 LFD A1, (2 + 9) * SIZE(AO) LFD A2, (2 + 8) * SIZE(AO) fmul f4, A1, f4 fnmsub f0, A2, f4, f0 LFD A1, (2 + 0) * SIZE(AO) fmul f0, A1, f0 fsmfp f0, f4 fsmfp f1, f5 fsmfp f2, f6 fsmfp f3, f7 #endif #ifdef LT fsmtp f4, f0 fsmtp f5, f1 fsmtp f6, f2 fsmtp f7, f3 LFD A1, (2 + 0) * SIZE(AO) LFD A2, (2 + 1) * SIZE(AO) LFD A3, (2 + 2) * SIZE(AO) LFD A4, (2 + 3) * SIZE(AO) LFD A5, (2 + 4) * SIZE(AO) LFD A6, (2 + 5) * SIZE(AO) LFD A7, (2 + 6) * SIZE(AO) LFD A8, (2 + 7) * SIZE(AO) fmul f0, A1, f0 fnmsub f4, A2, f0, f4 fnmsub f1, A3, f0, f1 fnmsub f5, A4, f0, f5 fnmsub f2, A5, f0, f2 fnmsub f6, A6, f0, f6 fnmsub f3, A7, f0, f3 fnmsub f7, A8, f0, f7 LFD A1, (2 + 9) * SIZE(AO) LFD A2, (2 + 10) * SIZE(AO) LFD A3, (2 + 11) * SIZE(AO) LFD A4, (2 + 12) 
* SIZE(AO) LFD A5, (2 + 13) * SIZE(AO) LFD A6, (2 + 14) * SIZE(AO) LFD A7, (2 + 15) * SIZE(AO) fmul f4, A1, f4 fnmsub f1, A2, f4, f1 fnmsub f5, A3, f4, f5 fnmsub f2, A4, f4, f2 fnmsub f6, A5, f4, f6 fnmsub f3, A6, f4, f3 fnmsub f7, A7, f4, f7 LFD A1, (2 + 18) * SIZE(AO) LFD A2, (2 + 19) * SIZE(AO) LFD A3, (2 + 20) * SIZE(AO) LFD A4, (2 + 21) * SIZE(AO) LFD A5, (2 + 22) * SIZE(AO) LFD A6, (2 + 23) * SIZE(AO) fmul f1, A1, f1 fnmsub f5, A2, f1, f5 fnmsub f2, A3, f1, f2 fnmsub f6, A4, f1, f6 fnmsub f3, A5, f1, f3 fnmsub f7, A6, f1, f7 LFD A1, (2 + 27) * SIZE(AO) LFD A2, (2 + 28) * SIZE(AO) LFD A3, (2 + 29) * SIZE(AO) LFD A4, (2 + 30) * SIZE(AO) LFD A5, (2 + 31) * SIZE(AO) fmul f5, A1, f5 fnmsub f2, A2, f5, f2 fnmsub f6, A3, f5, f6 fnmsub f3, A4, f5, f3 fnmsub f7, A5, f5, f7 LFD A1, (2 + 36) * SIZE(AO) LFD A2, (2 + 37) * SIZE(AO) LFD A3, (2 + 38) * SIZE(AO) LFD A4, (2 + 39) * SIZE(AO) fmul f2, A1, f2 fnmsub f6, A2, f2, f6 fnmsub f3, A3, f2, f3 fnmsub f7, A4, f2, f7 LFD A1, (2 + 45) * SIZE(AO) LFD A2, (2 + 46) * SIZE(AO) LFD A3, (2 + 47) * SIZE(AO) fmul f6, A1, f6 fnmsub f3, A2, f6, f3 fnmsub f7, A3, f6, f7 LFD A1, (2 + 54) * SIZE(AO) LFD A2, (2 + 55) * SIZE(AO) fmul f3, A1, f3 fnmsub f7, A2, f3, f7 LFD A1, (2 + 63) * SIZE(AO) fmul f7, A1, f7 fsmfp f0, f4 fsmfp f1, f5 fsmfp f2, f6 fsmfp f3, f7 #endif #ifdef RN LFPDX A1, BO, INC2 fxpmul f0, A1, f0 fxpmul f1, A1, f1 fxpmul f2, A1, f2 fxpmul f3, A1, f3 #endif #ifdef RT LFPDX A1, BO, INC2 fxpmul f0, A1, f0 fxpmul f1, A1, f1 fxpmul f2, A1, f2 fxpmul f3, A1, f3 #endif #ifdef LN subi CO1, CO1, 8 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f1, BO, INC2 STFPDUX f2, BO, INC2 STFPDUX f3, BO, INC2 subi BO, BO, 8 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC #else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 STFPDUX f2, AO, INC2 STFPDUX f3, AO, INC2 subi AO, AO, 8 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC #endif #ifdef LN subi CO1, CO1, 8 * SIZE #endif #ifdef RT slwi r0, K, 3 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 3 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 8 #endif #ifdef LN subi KK, KK, 8 #endif addic. 
I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L91 .align 4 .L129: #ifdef LN slwi r0, K, 0 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) addi B, BO, 2 * SIZE #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 .L999: addi SP, SP, 12 lwzu r14, 4(SP) lwzu r15, 4(SP) lwzu r16, 4(SP) lwzu r17, 4(SP) lwzu r18, 4(SP) lwzu r19, 4(SP) lwzu r20, 4(SP) lwzu r21, 4(SP) lwzu r22, 4(SP) lwzu r23, 4(SP) lwzu r24, 4(SP) lwzu r25, 4(SP) lwzu r26, 4(SP) lwzu r27, 4(SP) lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f31, SP, r0 lfpdux f30, SP, r0 lfpdux f29, SP, r0 lfpdux f28, SP, r0 lfpdux f27, SP, r0 lfpdux f26, SP, r0 lfpdux f25, SP, r0 lfpdux f24, SP, r0 lfpdux f23, SP, r0 lfpdux f22, SP, r0 lfpdux f21, SP, r0 lfpdux f20, SP, r0 lfpdux f19, SP, r0 lfpdux f18, SP, r0 lfpdux f17, SP, r0 lfpdux f16, SP, r0 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/trsm_kernel_hummer_LT.S000066400000000000000000002762721313527062700222250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define ALPHA 0 #define FZERO 8 #define M r3 #define N r4 #define K r5 #ifdef linux #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #endif #define TEMP r11 #define AORIG r12 #define KK r14 #define INCM1 r15 #define INCM4 r16 #define INCM2 r17 #define INC2 r19 #define INC r20 #define INC4 r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define AO2 r26 #define BO2 r27 #define CO1 r28 #define CO2 r29 #define CO3 r30 #define CO4 r31 #ifndef NEEDPARAM #define A1 f16 #define A2 f17 #define A3 f18 #define A4 f19 #define A5 f20 #define A6 f21 #define A7 f22 #define A8 f23 #define A9 f24 #define A10 f25 #define B1 f26 #define B2 f27 #define B3 f28 #define B4 f29 #define B5 f30 #define B6 f31 #define AP B6 PROLOGUE PROFCODE li r0, -16 stfpdux f14, SP, r0 stfpdux f15, SP, r0 stfpdux f16, SP, r0 stfpdux f17, SP, r0 stfpdux f18, SP, r0 stfpdux f19, SP, r0 stfpdux f20, SP, r0 stfpdux f21, SP, r0 stfpdux f22, SP, r0 stfpdux f23, SP, r0 stfpdux f24, SP, r0 stfpdux f25, SP, r0 stfpdux f26, SP, r0 stfpdux f27, SP, r0 stfpdux f28, SP, r0 stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) stwu r28, -4(SP) stwu r27, -4(SP) stwu r26, -4(SP) stwu r25, -4(SP) stwu r24, -4(SP) stwu r23, -4(SP) stwu r22, -4(SP) stwu r21, -4(SP) stwu r20, -4(SP) stwu r19, -4(SP) stwu r18, -4(SP) stwu r17, -4(SP) stwu r16, -4(SP) stwu r15, -4(SP) stwu r14, -4(SP) # dummy li r0, 0 stwu r0, -4(SP) stwu r0, -4(SP) stfdu f1, -8(SP) slwi LDC, LDC, BASE_SHIFT cmpwi cr0, M, 0 ble .L999 cmpwi cr0, N, 0 ble .L999 cmpwi cr0, K, 0 ble .L999 li INC, 1 * SIZE li INC2, 2 * SIZE li INC4, 4 * SIZE li INCM1, -1 * SIZE li INCM2, -2 * SIZE li INCM4, -4 * SIZE addi C, C, - 1 * SIZE #ifdef LN mullw r0, M, K slwi r0, r0, BASE_SHIFT add A, A, r0 slwi r0, M, BASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, BASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif srawi. J, N, 2 ble .L50 .align 4 .L10: #ifdef RT slwi r0, K, 2 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 2 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) addi AORIG, A, -4 * SIZE #else addi AO, A, -4 * SIZE #endif #ifndef RT add C, CO4, LDC #endif li r0, FZERO lfpsx f0, SP, r0 srawi. I, M, 3 ble .L20 .align 4 .L11: #if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 fpmr f5, f0 fpmr f9, f0 fpmr f13, f0 fpmr f2, f0 fpmr f6, f0 fpmr f10, f0 fpmr f14, f0 fpmr f3, f0 fpmr f7, f0 fpmr f11, f0 fpmr f15, f0 nop srawi. r0, KK, 2 fpmr f1, f0 mtspr CTR, r0 ble .L14 #else #ifdef LN slwi r0, K, 3 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 3 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 fpmr f5, f0 fpmr f9, f0 fpmr f13, f0 fpmr f2, f0 fpmr f6, f0 fpmr f10, f0 fpmr f14, f0 fpmr f3, f0 fpmr f7, f0 fpmr f11, f0 fpmr f15, f0 nop srawi. 
r0, TEMP, 2 fpmr f1, f0 mtspr CTR, r0 ble .L14 #endif LFPDUX A1, AO, INC4 fpmr f5, f0 LFPDUX A3, AO, INC4 fpmr f9, f0 LFPDUX B1, BO, INC4 fpmr f13, f0 LFPDUX A5, AO, INC4 fpmr f2, f0 LFPDUX A6, AO, INC4 fpmr f6, f0 LFPDUX B3, BO, INC4 fpmr f10, f0 LFPDUX A7, AO, INC4 fpmr f14, f0 LFPDUX A8, AO, INC4 fpmr f3, f0 LFPDUX B5, BO, INC4 fpmr f7, f0 LFPDUX A9, AO, INC4 fpmr f11, f0 LFPDUX A2, AO2, INC4 fpmr f15, f0 LFPDUX B2, BO2, INC4 bdz- .L13 .align 4 .L12: ## 1 ## fxcpmadd f0, B1, A1, f0 nop fxcsmadd f4, B1, A1, f4 nop fxcpmadd f8, B2, A1, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A1, f12 LFPDUX B6, BO, INC4 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 LFPDUX A10, AO, INC4 fxcsmadd f13, B2, A2, f13 nop fxcpmadd f2, B1, A3, f2 nop fxcsmadd f6, B1, A3, f6 nop fxcpmadd f10, B2, A3, f10 nop fxcsmadd f14, B2, A3, f14 nop fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 LFPDUX A1, AO, INC4 fxcsmadd f15, B2, A4, f15 nop ## 2 ## fxcpmadd f0, B3, A5, f0 nop fxcsmadd f4, B3, A5, f4 nop fxcpmadd f8, B4, A5, f8 LFPDUX B2, BO2, INC4 fxcsmadd f12, B4, A5, f12 LFPDUX B1, BO, INC4 fxcpmadd f1, B3, A2, f1 nop fxcsmadd f5, B3, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 LFPDUX A3, AO, INC4 fxcsmadd f13, B4, A2, f13 nop fxcpmadd f2, B3, A6, f2 nop fxcsmadd f6, B3, A6, f6 nop fxcpmadd f10, B4, A6, f10 nop fxcsmadd f14, B4, A6, f14 nop fxcpmadd f3, B3, A4, f3 nop fxcsmadd f7, B3, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B4, A4, f11 LFPDUX A5, AO, INC4 fxcsmadd f15, B4, A4, f15 nop ## 3 ## fxcpmadd f0, B5, A7, f0 nop fxcsmadd f4, B5, A7, f4 nop fxcpmadd f8, B2, A7, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A7, f12 LFPDUX B3, BO, INC4 fxcpmadd f1, B5, A2, f1 nop fxcsmadd f5, B5, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 LFPDUX A6, AO, INC4 fxcsmadd f13, B2, A2, f13 nop fxcpmadd f2, B5, A8, f2 nop fxcsmadd f6, B5, A8, f6 nop fxcpmadd f10, B2, A8, f10 nop fxcsmadd f14, B2, A8, f14 nop fxcpmadd f3, B5, A4, f3 nop fxcsmadd f7, B5, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 LFPDUX A7, AO, INC4 fxcsmadd f15, B2, A4, f15 nop ## 4 ## fxcpmadd f0, B6, A9, f0 nop fxcsmadd f4, B6, A9, f4 nop fxcpmadd f8, B4, A9, f8 LFPDUX B2, BO2, INC4 fxcsmadd f12, B4, A9, f12 LFPDUX B5, BO, INC4 fxcpmadd f1, B6, A2, f1 nop fxcsmadd f5, B6, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 LFPDUX A8, AO, INC4 fxcsmadd f13, B4, A2, f13 nop fxcpmadd f2, B6, A10, f2 nop fxcsmadd f6, B6, A10, f6 nop fxcpmadd f10, B4, A10, f10 nop fxcsmadd f14, B4, A10, f14 nop fxcpmadd f3, B6, A4, f3 LFPDUX A2, AO2, INC4 fxcsmadd f7, B6, A4, f7 LFPDUX A9, AO, INC4 fxcpmadd f11, B4, A4, f11 nop fxcsmadd f15, B4, A4, f15 bdnz+ .L12 .align 4 .L13: ## 1 ## fxcpmadd f0, B1, A1, f0 nop fxcsmadd f4, B1, A1, f4 nop fxcpmadd f8, B2, A1, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A1, f12 LFPDUX B6, BO, INC4 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 LFPDUX A10, AO, INC4 fxcsmadd f13, B2, A2, f13 nop fxcpmadd f2, B1, A3, f2 nop fxcsmadd f6, B1, A3, f6 nop fxcpmadd f10, B2, A3, f10 nop fxcsmadd f14, B2, A3, f14 nop fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 nop fxcsmadd f15, B2, A4, f15 nop ## 2 ## fxcpmadd f0, B3, A5, f0 nop fxcsmadd f4, B3, A5, f4 nop fxcpmadd f8, B4, A5, f8 LFPDUX B2, BO2, INC4 fxcsmadd f12, B4, A5, f12 nop fxcpmadd f1, B3, A2, f1 nop fxcsmadd f5, B3, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 nop fxcsmadd 
f13, B4, A2, f13 nop fxcpmadd f2, B3, A6, f2 nop fxcsmadd f6, B3, A6, f6 nop fxcpmadd f10, B4, A6, f10 nop fxcsmadd f14, B4, A6, f14 nop fxcpmadd f3, B3, A4, f3 nop fxcsmadd f7, B3, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B4, A4, f11 nop fxcsmadd f15, B4, A4, f15 nop ## 3 ## fxcpmadd f0, B5, A7, f0 nop fxcsmadd f4, B5, A7, f4 nop fxcpmadd f8, B2, A7, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A7, f12 nop fxcpmadd f1, B5, A2, f1 nop fxcsmadd f5, B5, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 nop fxcsmadd f13, B2, A2, f13 fxcpmadd f2, B5, A8, f2 nop fxcsmadd f6, B5, A8, f6 nop fxcpmadd f10, B2, A8, f10 nop fxcsmadd f14, B2, A8, f14 nop fxcpmadd f3, B5, A4, f3 nop fxcsmadd f7, B5, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 nop fxcsmadd f15, B2, A4, f15 nop ## 4 ## fxcpmadd f0, B6, A9, f0 nop fxcsmadd f4, B6, A9, f4 nop fxcpmadd f8, B4, A9, f8 nop fxcsmadd f12, B4, A9, f12 nop fxcpmadd f1, B6, A2, f1 nop fxcsmadd f5, B6, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 nop fxcsmadd f13, B4, A2, f13 nop fxcpmadd f2, B6, A10, f2 nop fxcsmadd f6, B6, A10, f6 nop fxcpmadd f10, B4, A10, f10 nop fxcsmadd f14, B4, A10, f14 nop fxcpmadd f3, B6, A4, f3 nop fxcsmadd f7, B6, A4, f7 nop fxcpmadd f11, B4, A4, f11 nop fxcsmadd f15, B4, A4, f15 nop .align 4 .L14: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L18 #else andi. r0, TEMP, 3 mtspr CTR, r0 ble+ .L18 #endif .align 4 .L15: LFPDUX A2, AO, INC4 LFPDUX A4, AO2, INC4 LFPDUX A10, BO, INC4 LFPDUX B4, BO2, INC4 bdz- .L17 .align 4 .L16: fxcpmadd f0, A10, A2, f0 fxcsmadd f4, A10, A2, f4 fxcpmadd f8, B4, A2, f8 fxcsmadd f12, B4, A2, f12 LFPDUX A2, AO, INC4 fxcpmadd f1, A10, A4, f1 fxcsmadd f5, A10, A4, f5 fxcpmadd f9, B4, A4, f9 fxcsmadd f13, B4, A4, f13 LFPDUX A4, AO2, INC4 fxcpmadd f2, A10, A2, f2 fxcsmadd f6, A10, A2, f6 fxcpmadd f10, B4, A2, f10 fxcsmadd f14, B4, A2, f14 LFPDUX A2, AO, INC4 fxcpmadd f3, A10, A4, f3 fxcsmadd f7, A10, A4, f7 LFPDUX A10, BO, INC4 fxcpmadd f11, B4, A4, f11 fxcsmadd f15, B4, A4, f15 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 bdnz+ .L16 .align 4 .L17: fxcpmadd f0, A10, A2, f0 fxcsmadd f4, A10, A2, f4 fxcpmadd f8, B4, A2, f8 fxcsmadd f12, B4, A2, f12 LFPDUX A2, AO, INC4 fxcpmadd f1, A10, A4, f1 fxcsmadd f5, A10, A4, f5 fxcpmadd f9, B4, A4, f9 fxcsmadd f13, B4, A4, f13 LFPDUX A4, AO2, INC4 fxcpmadd f2, A10, A2, f2 fxcsmadd f6, A10, A2, f6 fxcpmadd f10, B4, A2, f10 fxcsmadd f14, B4, A2, f14 fxcpmadd f3, A10, A4, f3 fxcsmadd f7, A10, A4, f7 fxcpmadd f11, B4, A4, f11 fxcsmadd f15, B4, A4, f15 .align 4 .L18: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 8 #else subi r0, KK, 4 #endif slwi TEMP, r0, 3 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE addi BO, BO, - 4 * SIZE addi BO2, BO, 2 * SIZE #endif #if defined(LN) || defined(LT) fpmr f24, f0 LFPDUX f16, BO, INC4 fpmr f25, f1 nop fpmr f26, f2 LFPDUX f17, BO2, INC4 fpmr f27, f3 nop fpmr f28, f8 LFPDUX f18, BO, INC4 fpmr f29, f9 nop fpmr f30, f10 LFPDUX f19, BO2, INC4 fpmr f31, f11 nop fsmfp f0, f4 LFPDUX f20, BO, INC4 fsmfp f1, f5 nop fsmfp f2, f6 LFPDUX f21, BO2, INC4 fsmfp f3, f7 nop fsmfp f8, f12 LFPDUX f22, BO, INC4 fsmfp f9, f13 nop fsmfp f10, f14 LFPDUX f23, BO2, INC4 fsmfp f11, f15 nop fsmtp f4, f24 LFPDUX f24, BO, INC4 fsmtp f5, f25 nop fsmtp f6, f26 LFPDUX f25, BO2, INC4 fsmtp f7, f27 nop fsmtp f12, f28 LFPDUX f26, BO, INC4 fsmtp f13, f29 nop fsmtp f14, f30 LFPDUX f27, BO2, INC4 fsmtp f15, f31 nop fpsub f0, f16, f0 LFPDUX f28, BO, INC4 fpsub f8, f17, f8 nop fpsub f4, f18, f4 
LFPDUX f29, BO2, INC4 fpsub f12, f19, f12 nop fpsub f1, f20, f1 LFPDUX f30, BO, INC4 fpsub f9, f21, f9 subi BO, BO, 32 * SIZE fpsub f5, f22, f5 LFPDUX f31, BO2, INC4 fpsub f13, f23, f13 subi BO2, BO2, 32 * SIZE fpsub f2, f24, f2 fpsub f10, f25, f10 fpsub f6, f26, f6 fpsub f14, f27, f14 fpsub f3, f28, f3 fpsub f11, f29, f11 fpsub f7, f30, f7 fpsub f15, f31, f15 #else LFPDUX f16, AO, INC4 LFPDUX f17, AO2, INC4 LFPDUX f18, AO, INC4 LFPDUX f19, AO2, INC4 LFPDUX f20, AO, INC4 LFPDUX f21, AO2, INC4 LFPDUX f22, AO, INC4 LFPDUX f23, AO2, INC4 fpsub f0, f16, f0 LFPDUX f24, AO, INC4 fpsub f1, f17, f1 LFPDUX f25, AO2, INC4 fpsub f2, f18, f2 LFPDUX f26, AO, INC4 fpsub f3, f19, f3 LFPDUX f27, AO2, INC4 fpsub f4, f20, f4 LFPDUX f28, AO, INC4 fpsub f5, f21, f5 LFPDUX f29, AO2, INC4 fpsub f6, f22, f6 LFPDUX f30, AO, INC4 fpsub f7, f23, f7 LFPDUX f31, AO2, INC4 fpsub f8, f24, f8 subi AO, AO, 32 * SIZE fpsub f9, f25, f9 subi AO2, AO2, 32 * SIZE fpsub f10, f26, f10 fpsub f11, f27, f11 fpsub f12, f28, f12 fpsub f13, f29, f13 fpsub f14, f30, f14 fpsub f15, f31, f15 #endif #ifdef LN addi AO, AO, 68 * SIZE addi AO2, AO2, 68 * SIZE LFPDUX A1, AO2, INCM4 LFPDUX A2, AO, INCM4 LFPDUX A3, AO2, INCM4 LFPDUX A4, AO, INCM4 LFPDUX A5, AO2, INCM4 LFPDUX A6, AO, INCM4 LFPDUX A7, AO2, INCM4 LFPDUX A8, AO, INCM4 fxsmul f7, A1, f7 fxsmul f15, A1, f15 fxcpnmsub f3, A1, f7, f3 fxcpnmsub f11, A1, f15, f11 fxcsnmsub f6, A2, f7, f6 fxcsnmsub f14, A2, f15, f14 fxcpnmsub f2, A2, f7, f2 fxcpnmsub f10, A2, f15, f10 fxcsnmsub f5, A3, f7, f5 fxcsnmsub f13, A3, f15, f13 fxcpnmsub f1, A3, f7, f1 fxcpnmsub f9, A3, f15, f9 fxcsnmsub f4, A4, f7, f4 fxcsnmsub f12, A4, f15, f12 fxcpnmsub f0, A4, f7, f0 fxcpnmsub f8, A4, f15, f8 fxpmul f3, A5, f3 fxpmul f11, A5, f11 fxcsnmsub f6, A6, f3, f6 fxcsnmsub f14, A6, f11, f14 fxcpnmsub f2, A6, f3, f2 fxcpnmsub f10, A6, f11, f10 fxcsnmsub f5, A7, f3, f5 fxcsnmsub f13, A7, f11, f13 fxcpnmsub f1, A7, f3, f1 fxcpnmsub f9, A7, f11, f9 fxcsnmsub f4, A8, f3, f4 fxcsnmsub f12, A8, f11, f12 fxcpnmsub f0, A8, f3, f0 fxcpnmsub f8, A8, f11, f8 add AO2, AO2, INCM4 LFPDUX A1, AO, INCM4 LFPDUX A2, AO2, INCM4 LFPDUX A3, AO, INCM4 add AO2, AO2, INCM4 LFPDUX A4, AO, INCM4 LFPDUX A5, AO2, INCM4 LFPDUX A6, AO, INCM4 add AO2, AO2, INCM4 add AO, AO, INCM4 LFPDUX A7, AO2, INCM4 LFPDUX A8, AO, INCM4 fxsmul f6, A1, f6 fxsmul f14, A1, f14 fxcpnmsub f2, A1, f6, f2 fxcpnmsub f10, A1, f14, f10 fxcsnmsub f5, A2, f6, f5 fxcsnmsub f13, A2, f14, f13 fxcpnmsub f1, A2, f6, f1 fxcpnmsub f9, A2, f14, f9 fxcsnmsub f4, A3, f6, f4 fxcsnmsub f12, A3, f14, f12 fxcpnmsub f0, A3, f6, f0 fxcpnmsub f8, A3, f14, f8 fxpmul f2, A4, f2 fxpmul f10, A4, f10 fxcsnmsub f5, A5, f2, f5 fxcsnmsub f13, A5, f10, f13 fxcpnmsub f1, A5, f2, f1 fxcpnmsub f9, A5, f10, f9 fxcsnmsub f4, A6, f2, f4 fxcsnmsub f12, A6, f10, f12 fxcpnmsub f0, A6, f2, f0 fxcpnmsub f8, A6, f10, f8 fxsmul f5, A7, f5 fxsmul f13, A7, f13 fxcpnmsub f1, A7, f5, f1 fxcpnmsub f9, A7, f13, f9 fxcsnmsub f4, A8, f5, f4 fxcsnmsub f12, A8, f13, f12 fxcpnmsub f0, A8, f5, f0 fxcpnmsub f8, A8, f13, f8 add AO2, AO2, INCM4 add AO, AO, INCM4 LFPDUX A1, AO2, INCM4 LFPDUX A2, AO, INCM4 subi AO2, AO2, 8 * SIZE add AO, AO, INCM4 LFPDUX A3, AO, INCM4 subi AO2, AO2, 8 * SIZE add AO, AO, INCM4 LFPDUX A4, AO, INCM4 addi AO, AO, -4 * SIZE addi AO2, AO2, -4 * SIZE fxpmul f1, A1, f1 fxpmul f9, A1, f9 fxcsnmsub f4, A2, f1, f4 fxcsnmsub f12, A2, f9, f12 fxcpnmsub f0, A2, f1, f0 fxcpnmsub f8, A2, f9, f8 fxsmul f4, A3, f4 fxsmul f12, A3, f12 fxcpnmsub f0, A3, f4, f0 fxcpnmsub f8, A3, f12, f8 fxpmul f0, A4, f0 fxpmul f8, 
A4, f8 #endif #ifdef LT LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A4, AO2, INC4 LFPDUX A5, AO, INC4 LFPDUX A6, AO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A8, AO2, INC4 fxpmul f0, A1, f0 fxpmul f8, A1, f8 fxcsnmsub f4, A1, f0, f4 fxcsnmsub f12, A1, f8, f12 fxcpnmsub f1, A2, f0, f1 fxcpnmsub f9, A2, f8, f9 fxcsnmsub f5, A2, f0, f5 fxcsnmsub f13, A2, f8, f13 fxcpnmsub f2, A3, f0, f2 fxcpnmsub f10, A3, f8, f10 fxcsnmsub f6, A3, f0, f6 fxcsnmsub f14, A3, f8, f14 fxcpnmsub f3, A4, f0, f3 fxcpnmsub f11, A4, f8, f11 fxcsnmsub f7, A4, f0, f7 fxcsnmsub f15, A4, f8, f15 fxsmul f4, A5, f4 fxsmul f12, A5, f12 fxcpnmsub f1, A6, f4, f1 fxcpnmsub f9, A6, f12, f9 fxcsnmsub f5, A6, f4, f5 fxcsnmsub f13, A6, f12, f13 fxcpnmsub f2, A7, f4, f2 fxcpnmsub f10, A7, f12, f10 fxcsnmsub f6, A7, f4, f6 fxcsnmsub f14, A7, f12, f14 fxcpnmsub f3, A8, f4, f3 fxcpnmsub f11, A8, f12, f11 fxcsnmsub f7, A8, f4, f7 fxcsnmsub f15, A8, f12, f15 add AO, AO, INC4 LFPDUX A1, AO2, INC4 LFPDUX A2, AO, INC4 LFPDUX A3, AO2, INC4 add AO, AO, INC4 LFPDUX A4, AO2, INC4 LFPDUX A5, AO, INC4 LFPDUX A6, AO2, INC4 add AO, AO, INC4 add AO2, AO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A8, AO2, INC4 fxpmul f1, A1, f1 fxpmul f9, A1, f9 fxcsnmsub f5, A1, f1, f5 fxcsnmsub f13, A1, f9, f13 fxcpnmsub f2, A2, f1, f2 fxcpnmsub f10, A2, f9, f10 fxcsnmsub f6, A2, f1, f6 fxcsnmsub f14, A2, f9, f14 fxcpnmsub f3, A3, f1, f3 fxcpnmsub f11, A3, f9, f11 fxcsnmsub f7, A3, f1, f7 fxcsnmsub f15, A3, f9, f15 fxsmul f5, A4, f5 fxsmul f13, A4, f13 fxcpnmsub f2, A5, f5, f2 fxcpnmsub f10, A5, f13, f10 fxcsnmsub f6, A5, f5, f6 fxcsnmsub f14, A5, f13, f14 fxcpnmsub f3, A6, f5, f3 fxcpnmsub f11, A6, f13, f11 fxcsnmsub f7, A6, f5, f7 fxcsnmsub f15, A6, f13, f15 fxpmul f2, A7, f2 fxpmul f10, A7, f10 fxcsnmsub f6, A7, f2, f6 fxcsnmsub f14, A7, f10, f14 fxcpnmsub f3, A8, f2, f3 fxcpnmsub f11, A8, f10, f11 fxcsnmsub f7, A8, f2, f7 fxcsnmsub f15, A8, f10, f15 add AO, AO, INC4 add AO2, AO2, INC4 LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 addi AO, AO, 8 * SIZE addi AO2, AO2, 4 * SIZE LFPDUX A3, AO2, INC4 addi AO, AO, 8 * SIZE addi AO2, AO2, 4 * SIZE LFPDUX A4, AO2, INC4 subi AO, AO, 64 * SIZE subi AO2, AO2, 64 * SIZE fxsmul f6, A1, f6 fxsmul f14, A1, f14 fxcpnmsub f3, A2, f6, f3 fxcpnmsub f11, A2, f14, f11 fxcsnmsub f7, A2, f6, f7 fxcsnmsub f15, A2, f14, f15 fxpmul f3, A3, f3 fxpmul f11, A3, f11 fxcsnmsub f7, A3, f3, f7 fxcsnmsub f15, A3, f11, f15 fxsmul f7, A4, f7 fxsmul f15, A4, f15 #endif #ifdef RN LFPDUX A1, BO, INC4 LFPDUX A2, BO2, INC4 LFPDUX A3, BO, INC4 LFPDUX A4, BO2, INC4 add BO, BO, INC4 LFPDUX A5, BO2, INC4 add BO, BO, INC4 LFPDUX A6, BO2, INC4 subi BO, BO, 16 * SIZE subi BO2, BO2, 16 * SIZE fxpmul f0, A1, f0 fxpmul f1, A1, f1 fxpmul f2, A1, f2 fxpmul f3, A1, f3 fxcsnmsub f4, A1, f0, f4 fxcsnmsub f5, A1, f1, f5 fxcsnmsub f6, A1, f2, f6 fxcsnmsub f7, A1, f3, f7 fxcpnmsub f8, A2, f0, f8 fxcpnmsub f9, A2, f1, f9 fxcpnmsub f10, A2, f2, f10 fxcpnmsub f11, A2, f3, f11 fxcsnmsub f12, A2, f0, f12 fxcsnmsub f13, A2, f1, f13 fxcsnmsub f14, A2, f2, f14 fxcsnmsub f15, A2, f3, f15 fxsmul f4, A3, f4 fxsmul f5, A3, f5 fxsmul f6, A3, f6 fxsmul f7, A3, f7 fxcpnmsub f8, A4, f4, f8 fxcpnmsub f9, A4, f5, f9 fxcpnmsub f10, A4, f6, f10 fxcpnmsub f11, A4, f7, f11 fxcsnmsub f12, A4, f4, f12 fxcsnmsub f13, A4, f5, f13 fxcsnmsub f14, A4, f6, f14 fxcsnmsub f15, A4, f7, f15 fxpmul f8, A5, f8 fxpmul f9, A5, f9 fxpmul f10, A5, f10 fxpmul f11, A5, f11 fxcsnmsub f12, A5, f8, f12 fxcsnmsub f13, A5, f9, f13 fxcsnmsub f14, A5, f10, f14 fxcsnmsub f15, A5, f11, f15 fxsmul f12, A6, f12 fxsmul 
f13, A6, f13 fxsmul f14, A6, f14 fxsmul f15, A6, f15 #endif #ifdef RT addi BO, BO, 20 * SIZE addi BO2, BO2, 20 * SIZE LFPDUX A1, BO2, INCM4 LFPDUX A2, BO, INCM4 LFPDUX A3, BO2, INCM4 LFPDUX A4, BO, INCM4 add BO2, BO2, INCM4 LFPDUX A5, BO, INCM4 add BO2, BO2, INCM4 LFPDUX A6, BO, INCM4 subi BO, BO, 4 * SIZE subi BO2, BO2, 4 * SIZE fxsmul f12, A1, f12 fxsmul f13, A1, f13 fxsmul f14, A1, f14 fxsmul f15, A1, f15 fxcpnmsub f8, A1, f12, f8 fxcpnmsub f9, A1, f13, f9 fxcpnmsub f10, A1, f14, f10 fxcpnmsub f11, A1, f15, f11 fxcsnmsub f4, A2, f12, f4 fxcsnmsub f5, A2, f13, f5 fxcsnmsub f6, A2, f14, f6 fxcsnmsub f7, A2, f15, f7 fxcpnmsub f0, A2, f12, f0 fxcpnmsub f1, A2, f13, f1 fxcpnmsub f2, A2, f14, f2 fxcpnmsub f3, A2, f15, f3 fxpmul f8, A3, f8 fxpmul f9, A3, f9 fxpmul f10, A3, f10 fxpmul f11, A3, f11 fxcsnmsub f4, A4, f8, f4 fxcsnmsub f5, A4, f9, f5 fxcsnmsub f6, A4, f10, f6 fxcsnmsub f7, A4, f11, f7 fxcpnmsub f0, A4, f8, f0 fxcpnmsub f1, A4, f9, f1 fxcpnmsub f2, A4, f10, f2 fxcpnmsub f3, A4, f11, f3 fxsmul f4, A5, f4 fxsmul f5, A5, f5 fxsmul f6, A5, f6 fxsmul f7, A5, f7 fxcpnmsub f0, A5, f4, f0 fxcpnmsub f1, A5, f5, f1 fxcpnmsub f2, A5, f6, f2 fxcpnmsub f3, A5, f7, f3 fxpmul f0, A6, f0 fxpmul f1, A6, f1 fxpmul f2, A6, f2 fxpmul f3, A6, f3 #endif #ifdef LN subi CO1, CO1, 8 * SIZE subi CO2, CO2, 8 * SIZE subi CO3, CO3, 8 * SIZE subi CO4, CO4, 8 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC4 STFPDUX f8, BO2, INC4 STFPDUX f4, BO, INC4 STFPDUX f12, BO2, INC4 STFPDUX f1, BO, INC4 STFPDUX f9, BO2, INC4 STFPDUX f5, BO, INC4 STFPDUX f13, BO2, INC4 STFPDUX f2, BO, INC4 STFPDUX f10, BO2, INC4 STFPDUX f6, BO, INC4 STFPDUX f14, BO2, INC4 STFPDUX f3, BO, INC4 STFPDUX f11, BO2, INC4 STFPDUX f7, BO, INC4 STFPDUX f15, BO2, INC4 subi BO, BO, 32 * SIZE subi BO2, BO2, 32 * SIZE STFDUX f0, CO1, INC STFDUX f4, CO1, INC STFDUX f1, CO1, INC STFDUX f5, CO1, INC STFDUX f2, CO1, INC STFDUX f6, CO1, INC STFDUX f3, CO1, INC STFDUX f7, CO1, INC STFSDUX f0, CO2, INC STFSDUX f4, CO2, INC STFSDUX f1, CO2, INC STFSDUX f5, CO2, INC STFSDUX f2, CO2, INC STFSDUX f6, CO2, INC STFSDUX f3, CO2, INC STFSDUX f7, CO2, INC STFDUX f8, CO3, INC STFDUX f12, CO3, INC STFDUX f9, CO3, INC STFDUX f13, CO3, INC STFDUX f10, CO3, INC STFDUX f14, CO3, INC STFDUX f11, CO3, INC STFDUX f15, CO3, INC STFSDUX f8, CO4, INC STFSDUX f12, CO4, INC STFSDUX f9, CO4, INC STFSDUX f13, CO4, INC STFSDUX f10, CO4, INC STFSDUX f14, CO4, INC STFSDUX f11, CO4, INC STFSDUX f15, CO4, INC #else STFPDUX f0, AO, INC4 STFPDUX f1, AO2, INC4 STFPDUX f2, AO, INC4 STFPDUX f3, AO2, INC4 STFPDUX f4, AO, INC4 STFPDUX f5, AO2, INC4 STFPDUX f6, AO, INC4 STFPDUX f7, AO2, INC4 STFPDUX f8, AO, INC4 STFPDUX f9, AO2, INC4 STFPDUX f10, AO, INC4 STFPDUX f11, AO2, INC4 STFPDUX f12, AO, INC4 STFPDUX f13, AO2, INC4 STFPDUX f14, AO, INC4 STFPDUX f15, AO2, INC4 subi AO, AO, 32 * SIZE subi AO2, AO2, 32 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC STFDUX f4, CO2, INC STFSDUX f4, CO2, INC STFDUX f5, CO2, INC STFSDUX f5, CO2, INC STFDUX f6, CO2, INC STFSDUX f6, CO2, INC STFDUX f7, CO2, INC STFSDUX f7, CO2, INC STFDUX f8, CO3, INC STFSDUX f8, CO3, INC STFDUX f9, CO3, INC STFSDUX f9, CO3, INC STFDUX f10, CO3, INC STFSDUX f10, CO3, INC STFDUX f11, CO3, INC STFSDUX f11, CO3, INC STFDUX f12, CO4, INC STFSDUX f12, CO4, INC STFDUX f13, CO4, INC STFSDUX f13, CO4, INC STFDUX f14, CO4, INC STFSDUX f14, CO4, INC STFDUX f15, CO4, INC STFSDUX f15, CO4, INC #endif #ifdef 
LN subi CO1, CO1, 8 * SIZE subi CO2, CO2, 8 * SIZE subi CO3, CO3, 8 * SIZE subi CO4, CO4, 8 * SIZE #endif #ifdef RT slwi r0, K, 3 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 3 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 8 #endif #ifdef LN subi KK, KK, 8 #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L11 .align 4 .L20: andi. I, M, 4 beq .L30 #if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 srawi. r0, KK, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, r0 fpmr f13, f0 ble .L24 #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 2 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 srawi. r0, TEMP, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, r0 fpmr f13, f0 ble .L24 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX B3, BO, INC4 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 LFPDUX A5, AO, INC4 LFPDUX B5, BO, INC4 LFPDUX A6, AO2, INC4 LFPDUX B6, BO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A9, BO, INC4 LFPDUX A10, BO2, INC4 bdz- .L23 .align 4 .L22: fxcpmadd f0, B1, A1, f0 nop fxcsmadd f4, B1, A1, f4 LFPDUX A8, AO2, INC4 fxcpmadd f8, B2, A1, f8 nop fxcsmadd f12, B2, A1, f12 LFPDUX A1, AO, INC4 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX B1, BO, INC4 fxcpmadd f9, B2, A2, f9 nop fxcsmadd f13, B2, A2, f13 LFPDUX B2, BO2, INC4 fxcpmadd f0, B3, A3, f0 nop fxcsmadd f4, B3, A3, f4 LFPDUX A2, AO2, INC4 fxcpmadd f8, B4, A3, f8 nop fxcsmadd f12, B4, A3, f12 LFPDUX A3, AO, INC4 fxcpmadd f1, B3, A4, f1 nop fxcsmadd f5, B3, A4, f5 LFPDUX B3, BO, INC4 fxcpmadd f9, B4, A4, f9 nop fxcsmadd f13, B4, A4, f13 LFPDUX B4, BO2, INC4 fxcpmadd f0, B5, A5, f0 nop fxcsmadd f4, B5, A5, f4 LFPDUX A4, AO2, INC4 fxcpmadd f8, B6, A5, f8 nop fxcsmadd f12, B6, A5, f12 LFPDUX A5, AO, INC4 fxcpmadd f1, B5, A6, f1 nop fxcsmadd f5, B5, A6, f5 LFPDUX B5, BO, INC4 fxcpmadd f9, B6, A6, f9 nop fxcsmadd f13, B6, A6, f13 LFPDUX B6, BO2, INC4 fxcpmadd f0, A9, A7, f0 nop fxcsmadd f4, A9, A7, f4 LFPDUX A6, AO2, INC4 fxcpmadd f8, A10, A7, f8 nop fxcsmadd f12, A10, A7, f12 LFPDUX A7, AO, INC4 fxcpmadd f1, A9, A8, f1 nop fxcsmadd f5, A9, A8, f5 LFPDUX A9, BO, INC4 fxcpmadd f9, A10, A8, f9 nop fxcsmadd f13, A10, A8, f13 LFPDUX A10, BO2, INC4 bdnz+ .L22 .align 4 .L23: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 LFPDUX A8, AO2, INC4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 fxcpmadd f9, B2, A2, f9 fxcsmadd f13, B2, A2, f13 fxcpmadd f0, B3, A3, f0 fxcsmadd f4, B3, A3, f4 fxcpmadd f8, B4, A3, f8 fxcsmadd f12, B4, A3, f12 fxcpmadd f1, B3, A4, f1 fxcsmadd f5, B3, A4, f5 fxcpmadd f9, B4, A4, f9 fxcsmadd f13, B4, A4, f13 fxcpmadd f0, B5, A5, f0 fxcsmadd f4, B5, A5, f4 fxcpmadd f8, B6, A5, f8 fxcsmadd f12, B6, A5, f12 fxcpmadd f1, B5, A6, f1 fxcsmadd f5, B5, A6, f5 fxcpmadd f9, B6, A6, f9 fxcsmadd f13, B6, A6, f13 fxcpmadd f0, A9, A7, f0 fxcsmadd f4, A9, A7, f4 fxcpmadd f8, A10, A7, f8 fxcsmadd f12, A10, A7, f12 fxcpmadd f1, A9, A8, f1 fxcsmadd f5, A9, A8, f5 fxcpmadd f9, A10, A8, f9 fxcsmadd f13, A10, A8, f13 .align 4 .L24: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L28 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L28 #endif LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 bdz- .L27 .align 4 .L26: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 LFPDUX A1, AO, INC4 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 LFPDUX B1, BO, INC4 fxcpmadd f9, B2, A2, f9 fxcsmadd f13, B2, A2, f13 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 bdnz+ .L26 .align 4 .L27: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 fxcpmadd f9, B2, A2, f9 fxcsmadd f13, B2, A2, f13 .align 4 .L28: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 4 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE addi BO, BO, - 4 * SIZE addi BO2, BO, 2 * SIZE #endif #if defined(LN) || defined(LT) fpmr f24, f0 fpmr f25, f1 fpmr f28, f8 fpmr f29, f9 fsmfp f0, f4 fsmfp f1, f5 fsmfp f8, f12 fsmfp f9, f13 fsmtp f4, f24 fsmtp f5, f25 fsmtp f12, f28 fsmtp f13, f29 LFPDUX f16, BO, INC4 LFPDUX f17, BO2, INC4 LFPDUX f18, BO, INC4 LFPDUX f19, BO2, INC4 LFPDUX f20, BO, INC4 LFPDUX f21, BO2, INC4 LFPDUX f22, BO, INC4 LFPDUX f23, BO2, INC4 subi BO, BO, 16 * SIZE subi BO2, BO2, 16 * SIZE fpsub f0, f16, f0 fpsub f8, f17, f8 fpsub f4, f18, f4 fpsub f12, f19, f12 fpsub f1, f20, f1 fpsub f9, f21, f9 fpsub f5, f22, f5 fpsub f13, f23, f13 #else LFPDUX f16, AO, INC4 LFPDUX f17, AO2, INC4 LFPDUX f18, AO, INC4 LFPDUX f19, AO2, INC4 LFPDUX f20, AO, INC4 LFPDUX f21, AO2, INC4 LFPDUX f22, AO, INC4 LFPDUX f23, AO2, INC4 subi AO, AO, 16 * SIZE subi AO2, AO2, 16 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f4, f18, f4 fpsub f5, f19, f5 fpsub f8, f20, f8 fpsub f9, f21, f9 fpsub f12, f22, f12 fpsub f13, f23, f13 #endif #ifdef LN addi AO, AO, 20 * SIZE addi AO2, AO2, 20 * SIZE LFPDUX A1, AO2, INCM4 LFPDUX A2, AO, INCM4 LFPDUX A3, AO2, INCM4 LFPDUX A4, AO, INCM4 add AO2, AO2, INCM4 LFPDUX A5, AO, INCM4 add AO2, AO2, INCM4 LFPDUX A6, AO, INCM4 addi AO, AO, -4 * SIZE addi AO2, AO2, -4 * SIZE fxsmul f5, A1, f5 fxsmul f13, A1, f13 fxcpnmsub f1, A1, f5, f1 fxcpnmsub f9, A1, f13, f9 fxcsnmsub f4, A2, f5, f4 fxcsnmsub f12, A2, f13, f12 fxcpnmsub f0, A2, f5, f0 fxcpnmsub f8, A2, f13, f8 fxpmul f1, A3, f1 fxpmul f9, A3, f9 fxcsnmsub f4, A4, f1, f4 fxcsnmsub f12, A4, f9, f12 fxcpnmsub f0, A4, f1, f0 fxcpnmsub f8, A4, f9, f8 fxsmul f4, A5, f4 fxsmul f12, A5, f12 fxcpnmsub f0, A5, f4, f0 fxcpnmsub f8, A5, f12, f8 fxpmul f0, A6, f0 fxpmul f8, A6, f8 #endif #ifdef LT LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A4, AO2, INC4 add AO, AO, INC4 LFPDUX A5, AO2, INC4 add AO, AO, INC4 LFPDUX A6, AO2, INC4 subi AO, AO, 16 * SIZE subi AO2, AO2, 16 * SIZE fxpmul f0, A1, f0 fxpmul f8, A1, f8 fxcsnmsub f4, A1, f0, f4 fxcsnmsub f12, A1, f8, f12 fxcpnmsub f1, A2, f0, f1 fxcpnmsub f9, A2, f8, f9 fxcsnmsub f5, A2, f0, f5 fxcsnmsub f13, A2, f8, f13 fxsmul f4, A3, f4 fxsmul f12, A3, f12 fxcpnmsub f1, A4, f4, f1 fxcpnmsub f9, A4, f12, f9 fxcsnmsub f5, A4, f4, f5 fxcsnmsub f13, A4, f12, f13 fxpmul f1, A5, f1 fxpmul f9, A5, f9 fxcsnmsub f5, A5, f1, f5 fxcsnmsub f13, A5, f9, f13 fxsmul f5, A6, f5 fxsmul f13, A6, f13 #endif #ifdef RN LFPDUX A1, BO, INC4 LFPDUX A2, BO2, INC4 LFPDUX A3, BO, INC4 LFPDUX A4, BO2, INC4 add BO, BO, INC4 LFPDUX A5, BO2, INC4 add BO, BO, INC4 LFPDUX A6, BO2, INC4 subi BO, BO, 16 * SIZE subi BO2, BO2, 16 * SIZE fxpmul f0, A1, f0 fxpmul f1, A1, f1 fxcsnmsub f4, A1, f0, 
f4 fxcsnmsub f5, A1, f1, f5 fxcpnmsub f8, A2, f0, f8 fxcpnmsub f9, A2, f1, f9 fxcsnmsub f12, A2, f0, f12 fxcsnmsub f13, A2, f1, f13 fxsmul f4, A3, f4 fxsmul f5, A3, f5 fxcpnmsub f8, A4, f4, f8 fxcpnmsub f9, A4, f5, f9 fxcsnmsub f12, A4, f4, f12 fxcsnmsub f13, A4, f5, f13 fxpmul f8, A5, f8 fxpmul f9, A5, f9 fxcsnmsub f12, A5, f8, f12 fxcsnmsub f13, A5, f9, f13 fxsmul f12, A6, f12 fxsmul f13, A6, f13 #endif #ifdef RT addi BO, BO, 20 * SIZE addi BO2, BO2, 20 * SIZE LFPDUX A1, BO2, INCM4 LFPDUX A2, BO, INCM4 LFPDUX A3, BO2, INCM4 LFPDUX A4, BO, INCM4 add BO2, BO2, INCM4 LFPDUX A5, BO, INCM4 add BO2, BO2, INCM4 LFPDUX A6, BO, INCM4 subi BO, BO, 4 * SIZE subi BO2, BO2, 4 * SIZE fxsmul f12, A1, f12 fxsmul f13, A1, f13 fxcpnmsub f8, A1, f12, f8 fxcpnmsub f9, A1, f13, f9 fxcsnmsub f4, A2, f12, f4 fxcsnmsub f5, A2, f13, f5 fxcpnmsub f0, A2, f12, f0 fxcpnmsub f1, A2, f13, f1 fxpmul f8, A3, f8 fxpmul f9, A3, f9 fxcsnmsub f4, A4, f8, f4 fxcsnmsub f5, A4, f9, f5 fxcpnmsub f0, A4, f8, f0 fxcpnmsub f1, A4, f9, f1 fxsmul f4, A5, f4 fxsmul f5, A5, f5 fxcpnmsub f0, A5, f4, f0 fxcpnmsub f1, A5, f5, f1 fxpmul f0, A6, f0 fxpmul f1, A6, f1 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE subi CO3, CO3, 4 * SIZE subi CO4, CO4, 4 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC4 STFPDUX f8, BO2, INC4 STFPDUX f4, BO, INC4 STFPDUX f12, BO2, INC4 STFPDUX f1, BO, INC4 STFPDUX f9, BO2, INC4 STFPDUX f5, BO, INC4 STFPDUX f13, BO2, INC4 subi BO, BO, 16 * SIZE subi BO2, BO2, 16 * SIZE STFDUX f0, CO1, INC STFDUX f4, CO1, INC STFDUX f1, CO1, INC STFDUX f5, CO1, INC STFSDUX f0, CO2, INC STFSDUX f4, CO2, INC STFSDUX f1, CO2, INC STFSDUX f5, CO2, INC STFDUX f8, CO3, INC STFDUX f12, CO3, INC STFDUX f9, CO3, INC STFDUX f13, CO3, INC STFSDUX f8, CO4, INC STFSDUX f12, CO4, INC STFSDUX f9, CO4, INC STFSDUX f13, CO4, INC #else STFPDUX f0, AO, INC4 STFPDUX f1, AO2, INC4 STFPDUX f4, AO, INC4 STFPDUX f5, AO2, INC4 STFPDUX f8, AO, INC4 STFPDUX f9, AO2, INC4 STFPDUX f12, AO, INC4 STFPDUX f13, AO2, INC4 subi AO, AO, 16 * SIZE subi AO2, AO2, 16 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f4, CO2, INC STFSDUX f4, CO2, INC STFDUX f5, CO2, INC STFSDUX f5, CO2, INC STFDUX f8, CO3, INC STFSDUX f8, CO3, INC STFDUX f9, CO3, INC STFSDUX f9, CO3, INC STFDUX f12, CO4, INC STFSDUX f12, CO4, INC STFDUX f13, CO4, INC STFSDUX f13, CO4, INC #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE subi CO3, CO3, 4 * SIZE subi CO4, CO4, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L30: andi. I, M, 2 beq .L40 #if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 srawi. r0, KK, 2 mtspr CTR, r0 ble .L34 #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 1 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 ble .L34 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 LFPDUX A2, AO2, INC4 LFPDUX B3, BO, INC4 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A5, BO, INC4 LFPDUX A6, BO2, INC4 LFPDUX A4, AO2, INC4 LFPDUX A7, BO, INC4 LFPDUX A8, BO2, INC4 bdz- .L33 .align 4 .L32: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 LFPDUX B1, BO, INC4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 LFPDUX B2, BO2, INC4 LFPDUX A1, AO, INC4 fxcpmadd f0, B3, A2, f0 fxcsmadd f4, B3, A2, f4 LFPDUX B3, BO, INC4 fxcpmadd f8, B4, A2, f8 fxcsmadd f12, B4, A2, f12 LFPDUX B4, BO2, INC4 LFPDUX A2, AO2, INC4 fxcpmadd f0, A5, A3, f0 fxcsmadd f4, A5, A3, f4 LFPDUX A5, BO, INC4 fxcpmadd f8, A6, A3, f8 fxcsmadd f12, A6, A3, f12 LFPDUX A6, BO2, INC4 LFPDUX A3, AO, INC4 fxcpmadd f0, A7, A4, f0 fxcsmadd f4, A7, A4, f4 LFPDUX A7, BO, INC4 fxcpmadd f8, A8, A4, f8 fxcsmadd f12, A8, A4, f12 LFPDUX A8, BO2, INC4 LFPDUX A4, AO2, INC4 bdnz+ .L32 .align 4 .L33: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 fxcpmadd f0, B3, A2, f0 fxcsmadd f4, B3, A2, f4 fxcpmadd f8, B4, A2, f8 fxcsmadd f12, B4, A2, f12 fxcpmadd f0, A5, A3, f0 fxcsmadd f4, A5, A3, f4 fxcpmadd f8, A6, A3, f8 fxcsmadd f12, A6, A3, f12 fxcpmadd f0, A7, A4, f0 fxcsmadd f4, A7, A4, f4 fxcpmadd f8, A8, A4, f8 fxcsmadd f12, A8, A4, f12 .align 4 .L34: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L38 #else andi. r0, TEMP, 3 mtspr CTR, r0 ble+ .L38 #endif LFPDX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdz- .L37 .align 4 .L36: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 LFPDUX B1, BO, INC4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 LFPDX A1, AO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdnz+ .L36 .align 4 .L37: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 .align 4 .L38: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 4 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE addi BO, BO, - 4 * SIZE addi BO2, BO, 2 * SIZE #endif #if defined(LN) || defined(LT) fpmr f24, f0 fpmr f28, f8 fsmfp f0, f4 fsmfp f8, f12 fsmtp f4, f24 fsmtp f12, f28 LFPDUX f16, BO, INC4 LFPDUX f17, BO2, INC4 LFPDUX f18, BO, INC4 LFPDUX f19, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fpsub f0, f16, f0 fpsub f8, f17, f8 fpsub f4, f18, f4 fpsub f12, f19, f12 #else LFPDUX f16, AO, INC4 LFPDUX f17, AO2, INC4 LFPDUX f18, AO, INC4 LFPDUX f19, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE fpsub f0, f16, f0 fpsub f4, f17, f4 fpsub f8, f18, f8 fpsub f12, f19, f12 #endif #ifdef LN addi AO, AO, 8 * SIZE addi AO2, AO2, 8 * SIZE LFPDUX A1, AO2, INCM4 LFPDUX A2, AO, INCM4 addi AO, AO, -4 * SIZE addi AO2, AO2, -4 * SIZE fxsmul f4, A1, f4 fxsmul f12, A1, f12 fxcpnmsub f0, A1, f4, f0 fxcpnmsub f8, A1, f12, f8 fxpmul f0, A2, f0 fxpmul f8, A2, f8 #endif #ifdef LT LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 subi AO, AO, 4 * SIZE subi AO2, AO2, 4 * SIZE fxpmul f0, A1, f0 fxpmul f8, A1, f8 fxcsnmsub f4, A1, f0, f4 fxcsnmsub f12, A1, f8, f12 fxsmul f4, A2, f4 fxsmul f12, A2, f12 #endif #ifdef RN LFPDUX A1, BO, INC4 LFPDUX A2, BO2, INC4 LFPDUX A3, BO, INC4 LFPDUX A4, BO2, INC4 add BO, BO, INC4 LFPDUX A5, BO2, INC4 add BO, BO, INC4 LFPDUX A6, BO2, INC4 subi BO, BO, 16 * SIZE subi BO2, BO2, 16 * SIZE fxpmul f0, A1, f0 fxcsnmsub f4, A1, f0, f4 fxcpnmsub f8, A2, f0, f8 fxcsnmsub f12, A2, f0, f12 fxsmul f4, A3, f4 
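/* Descriptive note (editorial sketch, not from the original source): in this RN
   branch each paired FP register holds the two M rows of one column of the 2x4
   tile (f0/f4/f8/f12 map to CO1..CO4). The columns are solved in sequence against
   the 4x4 triangular block of B; the diagonal entries are assumed to be
   pre-inverted by the trsm packing routines, so the solve needs only
   fxpmul/fxsmul (scale by the inverted pivot) and fxcpnmsub/fxcsnmsub
   (eliminate the pivot column from the remaining ones), with no divides. */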
fxcpnmsub f8, A4, f4, f8 fxcsnmsub f12, A4, f4, f12 fxpmul f8, A5, f8 fxcsnmsub f12, A5, f8, f12 fxsmul f12, A6, f12 #endif #ifdef RT addi BO, BO, 20 * SIZE addi BO2, BO2, 20 * SIZE LFPDUX A1, BO2, INCM4 LFPDUX A2, BO, INCM4 LFPDUX A3, BO2, INCM4 LFPDUX A4, BO, INCM4 add BO2, BO2, INCM4 LFPDUX A5, BO, INCM4 add BO2, BO2, INCM4 LFPDUX A6, BO, INCM4 subi BO, BO, 4 * SIZE subi BO2, BO2, 4 * SIZE fxsmul f12, A1, f12 fxcpnmsub f8, A1, f12, f8 fxcsnmsub f4, A2, f12, f4 fxcpnmsub f0, A2, f12, f0 fxpmul f8, A3, f8 fxcsnmsub f4, A4, f8, f4 fxcpnmsub f0, A4, f8, f0 fxsmul f4, A5, f4 fxcpnmsub f0, A5, f4, f0 fxpmul f0, A6, f0 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE subi CO3, CO3, 2 * SIZE subi CO4, CO4, 2 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC4 STFPDUX f8, BO2, INC4 STFPDUX f4, BO, INC4 STFPDUX f12, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE STFDUX f0, CO1, INC STFDUX f4, CO1, INC STFSDUX f0, CO2, INC STFSDUX f4, CO2, INC STFDUX f8, CO3, INC STFDUX f12, CO3, INC STFSDUX f8, CO4, INC STFSDUX f12, CO4, INC #else STFPDUX f0, AO, INC4 STFPDUX f4, AO2, INC4 STFPDUX f8, AO, INC4 STFPDUX f12, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f4, CO2, INC STFSDUX f4, CO2, INC STFDUX f8, CO3, INC STFSDUX f8, CO3, INC STFDUX f12, CO4, INC STFSDUX f12, CO4, INC #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE subi CO3, CO3, 2 * SIZE subi CO4, CO4, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L40: andi. I, M, 1 beq .L49 #if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 srawi. r0, KK, 3 mtspr CTR, r0 ble .L44 #else #ifdef LN slwi r0, K, 0 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 0 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, BO, - 4 * SIZE fpmr f2, f0 addi BO2, BO, 2 * SIZE fpmr f3, f0 srawi. 
r0, TEMP, 3 mtspr CTR, r0 ble .L44 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 LFPDUX A2, AO2, INC4 LFPDUX B3, BO, INC4 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A5, BO, INC4 LFPDUX A6, BO2, INC4 LFPDUX A4, AO2, INC4 LFPDUX A7, BO, INC4 LFPDUX A8, BO2, INC4 bdz- .L43 .align 4 .L42: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A1, B2, f1 LFPDUX B2, BO2, INC4 fxcsmadd f2, A1, B3, f2 LFPDUX B3, BO, INC4 fxcsmadd f3, A1, B4, f3 LFPDUX B4, BO2, INC4 LFPDUX A1, AO, INC4 fxcpmadd f0, A2, A5, f0 LFPDUX A5, BO, INC4 fxcpmadd f1, A2, A6, f1 LFPDUX A6, BO2, INC4 fxcsmadd f2, A2, A7, f2 LFPDUX A7, BO, INC4 fxcsmadd f3, A2, A8, f3 LFPDUX A8, BO2, INC4 LFPDUX A2, AO2, INC4 fxcpmadd f0, A3, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A3, B2, f1 LFPDUX B2, BO2, INC4 fxcsmadd f2, A3, B3, f2 LFPDUX B3, BO, INC4 fxcsmadd f3, A3, B4, f3 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 fxcpmadd f0, A4, A5, f0 LFPDUX A5, BO, INC4 fxcpmadd f1, A4, A6, f1 LFPDUX A6, BO2, INC4 fxcsmadd f2, A4, A7, f2 LFPDUX A7, BO, INC4 fxcsmadd f3, A4, A8, f3 LFPDUX A8, BO2, INC4 LFPDUX A4, AO2, INC4 bdnz+ .L42 .align 4 .L43: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A1, B2, f1 LFPDUX B2, BO2, INC4 fxcsmadd f2, A1, B3, f2 LFPDUX B3, BO, INC4 fxcsmadd f3, A1, B4, f3 LFPDUX B4, BO2, INC4 fxcpmadd f0, A2, A5, f0 LFPDUX A5, BO, INC4 fxcpmadd f1, A2, A6, f1 LFPDUX A6, BO2, INC4 fxcsmadd f2, A2, A7, f2 LFPDUX A7, BO, INC4 fxcsmadd f3, A2, A8, f3 LFPDUX A8, BO2, INC4 fxcpmadd f0, A3, B1, f0 fxcpmadd f1, A3, B2, f1 fxcsmadd f2, A3, B3, f2 fxcsmadd f3, A3, B4, f3 fxcpmadd f0, A4, A5, f0 fxcpmadd f1, A4, A6, f1 fxcsmadd f2, A4, A7, f2 fxcsmadd f3, A4, A8, f3 .align 4 .L44: #if defined(LT) || defined(RN) andi. r0, KK, 7 mtspr CTR, r0 ble+ .L48 #else andi. 
r0, TEMP, 7 mtspr CTR, r0 ble+ .L48 #endif LFDX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC bdz- .L47 .align 4 .L46: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A1, B2, f1 LFDX A1, AO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC bdnz+ .L46 .align 4 .L47: fxcpmadd f0, A1, B1, f0 fxcpmadd f1, A1, B2, f1 addi AO2, AO, 2 * SIZE .align 4 .L48: fpadd f0, f0, f2 fpadd f1, f1, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 4 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE addi BO, BO, - 4 * SIZE addi BO2, BO, 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDX f16, BO, INC4 LFPDX f17, BO2, INC4 fpsub f0, f16, f0 fpsub f1, f17, f1 #else LFPDX f16, AO, INC4 LFPDX f17, AO2, INC4 fpsub f0, f16, f0 fpsub f1, f17, f1 #endif #if defined(LN) || defined(LT) LFPDX A1, AO, INC4 fxpmul f0, A1, f0 fxpmul f1, A1, f1 #endif #ifdef RN LFD A1, (4 + 0) * SIZE(BO) LFD A2, (4 + 1) * SIZE(BO) LFD A3, (4 + 2) * SIZE(BO) LFD A4, (4 + 3) * SIZE(BO) LFD A5, (4 + 5) * SIZE(BO) LFD A6, (4 + 6) * SIZE(BO) LFD A7, (4 + 7) * SIZE(BO) LFD A8, (4 + 10) * SIZE(BO) LFD A9, (4 + 11) * SIZE(BO) LFD A10, (4 + 15) * SIZE(BO) fsmtp f2, f0 fsmtp f3, f1 fmul f0, A1, f0 fnmsub f2, A2, f0, f2 fnmsub f1, A3, f0, f1 fnmsub f3, A4, f0, f3 fmul f2, A5, f2 fnmsub f1, A6, f2, f1 fnmsub f3, A7, f2, f3 fmul f1, A8, f1 fnmsub f3, A9, f1, f3 fmul f3, A10, f3 fsmfp f0, f2 fsmfp f1, f3 #endif #ifdef RT LFD A1, (4 + 15) * SIZE(BO) LFD A2, (4 + 14) * SIZE(BO) LFD A3, (4 + 13) * SIZE(BO) LFD A4, (4 + 12) * SIZE(BO) LFD A5, (4 + 10) * SIZE(BO) LFD A6, (4 + 9) * SIZE(BO) LFD A7, (4 + 8) * SIZE(BO) LFD A8, (4 + 5) * SIZE(BO) LFD A9, (4 + 4) * SIZE(BO) LFD A10, (4 + 0) * SIZE(BO) fsmtp f2, f0 fsmtp f3, f1 fmul f3, A1, f3 fnmsub f1, A2, f3, f1 fnmsub f2, A3, f3, f2 fnmsub f0, A4, f3, f0 fmul f1, A5, f1 fnmsub f2, A6, f1, f2 fnmsub f0, A7, f1, f0 fmul f2, A8, f2 fnmsub f0, A9, f2, f0 fmul f0, A10, f0 fsmfp f0, f2 fsmfp f1, f3 #endif #if defined(LN) || defined(LT) STFPDX f0, BO, INC4 STFPDX f1, BO2, INC4 #else STFPDX f0, AO, INC4 STFPDX f1, AO2, INC4 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE subi CO3, CO3, 1 * SIZE subi CO4, CO4, 1 * SIZE #endif STFDUX f0, CO1, INC STFSDUX f0, CO2, INC STFDUX f1, CO3, INC STFSDUX f1, CO4, INC #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE subi CO3, CO3, 1 * SIZE subi CO4, CO4, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 .L49: #ifdef LN slwi r0, K, 2 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) addi B, BO, 4 * SIZE #endif #ifdef RN addi KK, KK, 4 #endif #ifdef RT subi KK, KK, 4 #endif addic. J, J, -1 bgt+ .L10 .align 4 .L50: andi. J, N, 2 beq .L90 #ifdef RT slwi r0, K, 1 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) addi AORIG, A, -2 * SIZE #else addi AO, A, -2 * SIZE #endif #ifndef RT add C, CO2, LDC #endif li r0, FZERO lfpsx f0, SP, r0 srawi. I, M, 3 ble .L60 .align 4 .L51: #if defined(LT) || defined(RN) fpmr f4, f0 addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 srawi. 
r0, KK, 2 fpmr f3, f0 mtspr CTR, r0 fpmr f7, f0 ble .L54 #else #ifdef LN slwi r0, K, 3 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 3 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK fpmr f4, f0 addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 srawi. r0, TEMP, 2 fpmr f3, f0 mtspr CTR, r0 fpmr f7, f0 ble .L54 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L53 .align 4 .L52: fxcpmadd f0, B1, A1, f0 LFPDUX B4, BO, INC2 fxcsmadd f4, B1, A1, f4 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 nop fxcsmadd f6, B1, A3, f6 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A4, AO, INC2 fxcpmadd f0, B2, A5, f0 LFPDUX B1, BO, INC2 fxcsmadd f4, B2, A5, f4 LFPDUX A5, AO, INC2 fxcpmadd f1, B2, A6, f1 nop fxcsmadd f5, B2, A6, f5 LFPDUX A6, AO, INC2 fxcpmadd f2, B2, A7, f2 nop fxcsmadd f6, B2, A7, f6 LFPDUX A7, AO, INC2 fxcpmadd f3, B2, A8, f3 nop fxcsmadd f7, B2, A8, f7 LFPDUX A8, AO, INC2 fxcpmadd f0, B3, A1, f0 LFPDUX B2, BO, INC2 fxcsmadd f4, B3, A1, f4 LFPDUX A1, AO, INC2 fxcpmadd f1, B3, A2, f1 nop fxcsmadd f5, B3, A2, f5 LFPDUX A2, AO, INC2 fxcpmadd f2, B3, A3, f2 nop fxcsmadd f6, B3, A3, f6 LFPDUX A3, AO, INC2 fxcpmadd f3, B3, A4, f3 nop fxcsmadd f7, B3, A4, f7 LFPDUX A4, AO, INC2 fxcpmadd f0, B4, A5, f0 LFPDUX B3, BO, INC2 fxcsmadd f4, B4, A5, f4 LFPDUX A5, AO, INC2 fxcpmadd f1, B4, A6, f1 nop fxcsmadd f5, B4, A6, f5 LFPDUX A6, AO, INC2 fxcpmadd f2, B4, A7, f2 nop fxcsmadd f6, B4, A7, f6 LFPDUX A7, AO, INC2 fxcpmadd f3, B4, A8, f3 nop fxcsmadd f7, B4, A8, f7 LFPDUX A8, AO, INC2 bdnz+ .L52 .align 4 .L53: fxcpmadd f0, B1, A1, f0 LFPDUX B4, BO, INC2 fxcsmadd f4, B1, A1, f4 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 nop fxcsmadd f6, B1, A3, f6 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A4, AO, INC2 fxcpmadd f0, B2, A5, f0 nop fxcsmadd f4, B2, A5, f4 LFPDUX A5, AO, INC2 fxcpmadd f1, B2, A6, f1 nop fxcsmadd f5, B2, A6, f5 LFPDUX A6, AO, INC2 fxcpmadd f2, B2, A7, f2 nop fxcsmadd f6, B2, A7, f6 LFPDUX A7, AO, INC2 fxcpmadd f3, B2, A8, f3 nop fxcsmadd f7, B2, A8, f7 LFPDUX A8, AO, INC2 fxcpmadd f0, B3, A1, f0 fxcsmadd f4, B3, A1, f4 fxcpmadd f1, B3, A2, f1 fxcsmadd f5, B3, A2, f5 fxcpmadd f2, B3, A3, f2 fxcsmadd f6, B3, A3, f6 fxcpmadd f3, B3, A4, f3 fxcsmadd f7, B3, A4, f7 fxcpmadd f0, B4, A5, f0 fxcsmadd f4, B4, A5, f4 fxcpmadd f1, B4, A6, f1 fxcsmadd f5, B4, A6, f5 fxcpmadd f2, B4, A7, f2 fxcsmadd f6, B4, A7, f6 fxcpmadd f3, B4, A8, f3 fxcsmadd f7, B4, A8, f7 .align 4 .L54: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L58 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L58 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 bdz- .L57 .align 4 .L56: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 fxcsmadd f6, B1, A3, f6 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 fxcsmadd f7, B1, A4, f7 LFPDUX A4, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L56 .align 4 .L57: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 fxcpmadd f2, B1, A3, f2 fxcsmadd f6, B1, A3, f6 fxcpmadd f3, B1, A4, f3 fxcsmadd f7, B1, A4, f7 .align 4 .L58: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 8 #else subi r0, KK, 2 #endif slwi TEMP, r0, 3 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) fpmr f24, f0 fpmr f25, f1 fpmr f26, f2 fpmr f27, f3 fsmfp f0, f4 fsmfp f1, f5 fsmfp f2, f6 fsmfp f3, f7 fsmtp f4, f24 fsmtp f5, f25 fsmtp f6, f26 fsmtp f7, f27 LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 LFPDUX f18, BO, INC2 LFPDUX f19, BO, INC2 LFPDUX f20, BO, INC2 LFPDUX f21, BO, INC2 LFPDUX f22, BO, INC2 LFPDUX f23, BO, INC2 subi BO, BO, 16 * SIZE fpsub f0, f16, f0 fpsub f4, f17, f4 fpsub f1, f18, f1 fpsub f5, f19, f5 fpsub f2, f20, f2 fpsub f6, f21, f6 fpsub f3, f22, f3 fpsub f7, f23, f7 #else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 LFPDUX f18, AO, INC2 LFPDUX f19, AO, INC2 LFPDUX f20, AO, INC2 LFPDUX f21, AO, INC2 LFPDUX f22, AO, INC2 LFPDUX f23, AO, INC2 subi AO, AO, 16 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f2, f18, f2 fpsub f3, f19, f3 fpsub f4, f20, f4 fpsub f5, f21, f5 fpsub f6, f22, f6 fpsub f7, f23, f7 #endif #ifdef LN addi AO, AO, 66 * SIZE LFPDUX A1, AO, INCM2 LFPDUX A2, AO, INCM2 LFPDUX A3, AO, INCM2 LFPDUX A4, AO, INCM2 LFPDUX A5, AO, INCM2 LFPDUX A6, AO, INCM2 LFPDUX A7, AO, INCM2 LFPDUX A8, AO, INCM2 fxsmul f7, A1, f7 fxcpnmsub f3, A1, f7, f3 fxcsnmsub f6, A2, f7, f6 fxcpnmsub f2, A2, f7, f2 fxcsnmsub f5, A3, f7, f5 fxcpnmsub f1, A3, f7, f1 fxcsnmsub f4, A4, f7, f4 fxcpnmsub f0, A4, f7, f0 fxpmul f3, A5, f3 fxcsnmsub f6, A6, f3, f6 fxcpnmsub f2, A6, f3, f2 fxcsnmsub f5, A7, f3, f5 fxcpnmsub f1, A7, f3, f1 fxcsnmsub f4, A8, f3, f4 fxcpnmsub f0, A8, f3, f0 add AO, AO, INCM2 LFPDUX A1, AO, INCM2 LFPDUX A2, AO, INCM2 LFPDUX A3, AO, INCM2 add AO, AO, INCM2 LFPDUX A4, AO, INCM2 LFPDUX A5, AO, INCM2 LFPDUX A6, AO, INCM2 add AO, AO, INCM2 add AO, AO, INCM2 LFPDUX A7, AO, INCM2 LFPDUX A8, AO, INCM2 fxsmul f6, A1, f6 fxcpnmsub f2, A1, f6, f2 fxcsnmsub f5, A2, f6, f5 fxcpnmsub f1, A2, f6, f1 fxcsnmsub f4, A3, f6, f4 fxcpnmsub f0, A3, f6, f0 fxpmul f2, A4, f2 fxcsnmsub f5, A5, f2, f5 fxcpnmsub f1, A5, f2, f1 fxcsnmsub f4, A6, f2, f4 fxcpnmsub f0, A6, f2, f0 fxsmul f5, A7, f5 fxcpnmsub f1, A7, f5, f1 fxcsnmsub f4, A8, f5, f4 fxcpnmsub f0, A8, f5, f0 add AO, AO, INCM2 add AO, AO, INCM2 LFPDUX A1, AO, INCM2 LFPDUX A2, AO, INCM2 subi AO, AO, 6 * SIZE LFPDUX A3, AO, INCM2 subi AO, AO, 6 * SIZE LFPDUX A4, AO, INCM2 addi AO, AO, -2 * SIZE fxpmul f1, A1, f1 fxcsnmsub f4, A2, f1, f4 fxcpnmsub f0, A2, f1, f0 fxsmul f4, A3, f4 fxcpnmsub f0, A3, f4, f0 fxpmul f0, A4, f0 #endif #ifdef LT LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 fxpmul f0, A1, f0 fxcsnmsub f4, A1, f0, f4 fxcpnmsub f1, A2, f0, f1 fxcsnmsub f5, A2, f0, f5 fxcpnmsub f2, A3, f0, f2 fxcsnmsub f6, 
A3, f0, f6 fxcpnmsub f3, A4, f0, f3 fxcsnmsub f7, A4, f0, f7 fxsmul f4, A5, f4 fxcpnmsub f1, A6, f4, f1 fxcsnmsub f5, A6, f4, f5 fxcpnmsub f2, A7, f4, f2 fxcsnmsub f6, A7, f4, f6 fxcpnmsub f3, A8, f4, f3 fxcsnmsub f7, A8, f4, f7 add AO, AO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 add AO, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 add AO, AO, INC2 add AO, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 fxpmul f1, A1, f1 fxcsnmsub f5, A1, f1, f5 fxcpnmsub f2, A2, f1, f2 fxcsnmsub f6, A2, f1, f6 fxcpnmsub f3, A3, f1, f3 fxcsnmsub f7, A3, f1, f7 fxsmul f5, A4, f5 fxcpnmsub f2, A5, f5, f2 fxcsnmsub f6, A5, f5, f6 fxcpnmsub f3, A6, f5, f3 fxcsnmsub f7, A6, f5, f7 fxpmul f2, A7, f2 fxcsnmsub f6, A7, f2, f6 fxcpnmsub f3, A8, f2, f3 fxcsnmsub f7, A8, f2, f7 add AO, AO, INC2 add AO, AO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 addi AO, AO, 6 * SIZE LFPDUX A3, AO, INC2 addi AO, AO, 6 * SIZE LFPDUX A4, AO, INC2 subi AO, AO, 64 * SIZE fxsmul f6, A1, f6 fxcpnmsub f3, A2, f6, f3 fxcsnmsub f7, A2, f6, f7 fxpmul f3, A3, f3 fxcsnmsub f7, A3, f3, f7 fxsmul f7, A4, f7 #endif #ifdef RN LFPDUX A1, BO, INC2 LFPDUX A2, BO, INC2 subi BO, BO, 4 * SIZE fxpmul f0, A1, f0 fxpmul f1, A1, f1 fxpmul f2, A1, f2 fxpmul f3, A1, f3 fxcsnmsub f4, A1, f0, f4 fxcsnmsub f5, A1, f1, f5 fxcsnmsub f6, A1, f2, f6 fxcsnmsub f7, A1, f3, f7 fxsmul f4, A2, f4 fxsmul f5, A2, f5 fxsmul f6, A2, f6 fxsmul f7, A2, f7 #endif #ifdef RT LFPDUX A2, BO, INC2 LFPDUX A1, BO, INC2 subi BO, BO, 4 * SIZE fxsmul f4, A1, f4 fxsmul f5, A1, f5 fxsmul f6, A1, f6 fxsmul f7, A1, f7 fxcpnmsub f0, A1, f4, f0 fxcpnmsub f1, A1, f5, f1 fxcpnmsub f2, A1, f6, f2 fxcpnmsub f3, A1, f7, f3 fxpmul f0, A2, f0 fxpmul f1, A2, f1 fxpmul f2, A2, f2 fxpmul f3, A2, f3 #endif #ifdef LN subi CO1, CO1, 8 * SIZE subi CO2, CO2, 8 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f4, BO, INC2 STFPDUX f1, BO, INC2 STFPDUX f5, BO, INC2 STFPDUX f2, BO, INC2 STFPDUX f6, BO, INC2 STFPDUX f3, BO, INC2 STFPDUX f7, BO, INC2 subi BO, BO, 16 * SIZE STFDUX f0, CO1, INC STFDUX f4, CO1, INC STFDUX f1, CO1, INC STFDUX f5, CO1, INC STFDUX f2, CO1, INC STFDUX f6, CO1, INC STFDUX f3, CO1, INC STFDUX f7, CO1, INC STFSDUX f0, CO2, INC STFSDUX f4, CO2, INC STFSDUX f1, CO2, INC STFSDUX f5, CO2, INC STFSDUX f2, CO2, INC STFSDUX f6, CO2, INC STFSDUX f3, CO2, INC STFSDUX f7, CO2, INC #else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 STFPDUX f2, AO, INC2 STFPDUX f3, AO, INC2 STFPDUX f4, AO, INC2 STFPDUX f5, AO, INC2 STFPDUX f6, AO, INC2 STFPDUX f7, AO, INC2 subi AO, AO, 16 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC STFDUX f4, CO2, INC STFSDUX f4, CO2, INC STFDUX f5, CO2, INC STFSDUX f5, CO2, INC STFDUX f6, CO2, INC STFSDUX f6, CO2, INC STFDUX f7, CO2, INC STFSDUX f7, CO2, INC #endif #ifdef LN subi CO1, CO1, 8 * SIZE subi CO2, CO2, 8 * SIZE #endif #ifdef RT slwi r0, K, 3 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 3 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 8 #endif #ifdef LN subi KK, KK, 8 #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L51 .align 4 .L60: andi. I, M, 4 beq .L70 #if defined(LT) || defined(RN) fpmr f1, f0 addi BO, B, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 srawi. 
r0, KK, 2 mtspr CTR, r0 ble .L64 #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 2 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK fpmr f1, f0 addi BO, B, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L64 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L63 .align 4 .L62: fxcpmadd f0, B1, A1, f0 fxcsmadd f2, B1, A1, f2 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 fxcsmadd f3, B1, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f0, B2, A3, f0 fxcsmadd f2, B2, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f1, B2, A4, f1 fxcsmadd f3, B2, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A5, f0 fxcsmadd f2, B3, A5, f2 LFPDUX A5, AO, INC2 fxcpmadd f1, B3, A6, f1 fxcsmadd f3, B3, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f0, B4, A7, f0 fxcsmadd f2, B4, A7, f2 LFPDUX A7, AO, INC2 fxcpmadd f1, B4, A8, f1 fxcsmadd f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L62 .align 4 .L63: fxcpmadd f0, B1, A1, f0 fxcsmadd f2, B1, A1, f2 fxcpmadd f1, B1, A2, f1 fxcsmadd f3, B1, A2, f3 fxcpmadd f0, B2, A3, f0 fxcsmadd f2, B2, A3, f2 fxcpmadd f1, B2, A4, f1 fxcsmadd f3, B2, A4, f3 fxcpmadd f0, B3, A5, f0 fxcsmadd f2, B3, A5, f2 fxcpmadd f1, B3, A6, f1 fxcsmadd f3, B3, A6, f3 fxcpmadd f0, B4, A7, f0 fxcsmadd f2, B4, A7, f2 fxcpmadd f1, B4, A8, f1 fxcsmadd f3, B4, A8, f3 .align 4 .L64: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L68 #else andi. r0, TEMP, 3 mtspr CTR, r0 ble+ .L68 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdz- .L67 .align 4 .L66: fxcpmadd f0, B1, A1, f0 fxcsmadd f2, B1, A1, f2 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 fxcsmadd f3, B1, A2, f3 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdnz+ .L66 .align 4 .L67: fxcpmadd f0, B1, A1, f0 fxcsmadd f2, B1, A1, f2 fxcpmadd f1, B1, A2, f1 fxcsmadd f3, B1, A2, f3 .align 4 .L68: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 2 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) fpmr f24, f0 fpmr f25, f1 fsmfp f0, f2 fsmfp f1, f3 fsmtp f2, f24 fsmtp f3, f25 LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 LFPDUX f18, BO, INC2 LFPDUX f19, BO, INC2 subi BO, BO, 8 * SIZE fpsub f0, f16, f0 fpsub f2, f17, f2 fpsub f1, f18, f1 fpsub f3, f19, f3 #else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 LFPDUX f18, AO, INC2 LFPDUX f19, AO, INC2 subi AO, AO, 8 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f2, f18, f2 fpsub f3, f19, f3 #endif #ifdef LN addi AO, AO, 18 * SIZE LFPDUX A1, AO, INCM2 LFPDUX A2, AO, INCM2 LFPDUX A3, AO, INCM2 LFPDUX A4, AO, INCM2 add AO, AO, INCM2 LFPDUX A5, AO, INCM2 add AO, AO, INCM2 LFPDUX A6, AO, INCM2 subi AO, AO, 2 * SIZE fxsmul f3, A1, f3 fxcpnmsub f1, A1, f3, f1 fxcsnmsub f2, A2, f3, f2 fxcpnmsub f0, A2, f3, f0 fxpmul f1, A3, f1 fxcsnmsub f2, A4, f1, f2 fxcpnmsub f0, A4, f1, f0 fxsmul f2, A5, f2 fxcpnmsub f0, A5, f2, f0 fxpmul f0, A6, f0 #endif #ifdef LT LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 add AO, AO, INC2 LFPDUX A5, AO, INC2 add AO, AO, INC2 LFPDUX A6, AO, INC2 subi AO, AO, 16 * SIZE fxpmul f0, A1, f0 fxcsnmsub f2, A1, f0, f2 fxcpnmsub f1, A2, f0, f1 fxcsnmsub f3, A2, f0, f3 
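/* Descriptive note (editorial sketch, not from the original source): this LT
   branch performs triangular substitution with the packed 4x4 block of A, one
   pivot at a time. After the fsmfp/fsmtp shuffle above, the primary half of each
   paired register belongs to the CO1 column and the secondary half to the CO2
   column, so every fxpmul/fxsmul plus fxcpnmsub/fxcsnmsub step updates both
   result columns at once; the diagonal of A is assumed pre-inverted by the
   trsm packing convention. */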
fxsmul f2, A3, f2 fxcpnmsub f1, A4, f2, f1 fxcsnmsub f3, A4, f2, f3 fxpmul f1, A5, f1 fxcsnmsub f3, A5, f1, f3 fxsmul f3, A6, f3 #endif #ifdef RN LFPDUX A1, BO, INC2 LFPDUX A2, BO, INC2 subi BO, BO, 4 * SIZE fxpmul f0, A1, f0 fxpmul f1, A1, f1 fxcsnmsub f2, A1, f0, f2 fxcsnmsub f3, A1, f1, f3 fxsmul f2, A2, f2 fxsmul f3, A2, f3 #endif #ifdef RT LFPDUX A2, BO, INC2 LFPDUX A1, BO, INC2 subi BO, BO, 4 * SIZE fxsmul f2, A1, f2 fxsmul f3, A1, f3 fxcpnmsub f0, A1, f2, f0 fxcpnmsub f1, A1, f3, f1 fxpmul f0, A2, f0 fxpmul f1, A2, f1 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f2, BO, INC2 STFPDUX f1, BO, INC2 STFPDUX f3, BO, INC2 subi BO, BO, 8 * SIZE STFDUX f0, CO1, INC STFDUX f2, CO1, INC STFDUX f1, CO1, INC STFDUX f3, CO1, INC STFSDUX f0, CO2, INC STFSDUX f2, CO2, INC STFSDUX f1, CO2, INC STFSDUX f3, CO2, INC #else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 STFPDUX f2, AO, INC2 STFPDUX f3, AO, INC2 subi AO, AO, 8 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO2, INC STFSDUX f2, CO2, INC STFDUX f3, CO2, INC STFSDUX f3, CO2, INC #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L70: andi. I, M, 2 beq .L80 #if defined(LT) || defined(RN) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 3 mtspr CTR, r0 ble .L74 #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 1 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L74 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdz- .L73 .align 4 .L72: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f2, B2, A2, f2 fxcsmadd f3, B2, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A3, f0 fxcsmadd f1, B3, A3, f1 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f2, B4, A4, f2 fxcsmadd f3, B4, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 fxcpmadd f0, B5, A5, f0 fxcsmadd f1, B5, A5, f1 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 fxcpmadd f2, B6, A6, f2 fxcsmadd f3, B6, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 fxcpmadd f0, A9, A7, f0 fxcsmadd f1, A9, A7, f1 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 fxcpmadd f2, A10, A8, f2 fxcsmadd f3, A10, A8, f3 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdnz+ .L72 .align 4 .L73: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 fxcpmadd f2, B2, A2, f2 fxcsmadd f3, B2, A2, f3 fxcpmadd f0, B3, A3, f0 fxcsmadd f1, B3, A3, f1 fxcpmadd f2, B4, A4, f2 fxcsmadd f3, B4, A4, f3 fxcpmadd f0, B5, A5, f0 fxcsmadd f1, B5, A5, f1 fxcpmadd f2, B6, A6, f2 fxcsmadd f3, B6, A6, f3 fxcpmadd f0, A9, A7, f0 fxcsmadd f1, A9, A7, f1 fxcpmadd f2, A10, A8, f2 fxcsmadd f3, A10, A8, f3 .align 4 .L74: #if defined(LT) || defined(RN) andi. 
r0, KK, 7 mtspr CTR, r0 ble+ .L78 #else andi. r0, TEMP, 7 mtspr CTR, r0 ble+ .L78 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdz- .L77 .align 4 .L76: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L76 .align 4 .L77: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 .align 4 .L78: fpadd f0, f0, f2 fpadd f1, f1, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 2 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) fpmr f24, f0 fsmfp f0, f1 fsmtp f1, f24 LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 subi BO, BO, 4 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 #else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 subi AO, AO, 4 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 #endif #ifdef LN LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 addi AO, AO, -4 * SIZE fxsmul f1, A2, f1 fxcpnmsub f0, A2, f1, f0 fxpmul f0, A1, f0 #endif #ifdef LT LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 addi AO, AO, -4 * SIZE fxpmul f0, A1, f0 fxcsnmsub f1, A1, f0, f1 fxsmul f1, A2, f1 #endif #ifdef RN LFPDUX A1, BO, INC2 LFPDUX A2, BO, INC2 subi BO, BO, 4 * SIZE fxpmul f0, A1, f0 fxcsnmsub f1, A1, f0, f1 fxsmul f1, A2, f1 #endif #ifdef RT LFPDUX A2, BO, INC2 LFPDUX A1, BO, INC2 subi BO, BO, 4 * SIZE fxsmul f1, A1, f1 fxcpnmsub f0, A1, f1, f0 fxpmul f0, A2, f0 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f1, BO, INC2 subi BO, BO, 4 * SIZE STFDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f0, CO2, INC STFSDUX f1, CO2, INC #else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 subi AO, AO, 4 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO2, INC STFSDUX f1, CO2, INC #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L80: andi. I, M, 1 beq .L89 #if defined(LT) || defined(RN) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 3 mtspr CTR, r0 ble .L84 #else #ifdef LN slwi r0, K, 0 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 0 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. 
r0, TEMP, 3 mtspr CTR, r0 ble .L84 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX B4, BO, INC2 bdz- .L83 .align 4 .L82: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC2 fxcsmadd f1, A1, B2, f1 LFPDUX B2, BO, INC2 LFPDUX A1, AO, INC2 fxcpmadd f2, A2, B3, f2 LFPDUX B3, BO, INC2 fxcsmadd f3, A2, B4, f3 LFPDUX B4, BO, INC2 LFPDUX A2, AO, INC2 fxcpmadd f0, A3, B1, f0 LFPDUX B1, BO, INC2 fxcsmadd f1, A3, B2, f1 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 fxcpmadd f2, A4, B3, f2 LFPDUX B3, BO, INC2 fxcsmadd f3, A4, B4, f3 LFPDUX B4, BO, INC2 LFPDUX A4, AO, INC2 bdnz+ .L82 .align 4 .L83: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC2 fxcsmadd f1, A1, B2, f1 LFPDUX B2, BO, INC2 fxcpmadd f2, A2, B3, f2 LFPDUX B3, BO, INC2 fxcsmadd f3, A2, B4, f3 LFPDUX B4, BO, INC2 fxcpmadd f0, A3, B1, f0 fxcsmadd f1, A3, B2, f1 fxcpmadd f2, A4, B3, f2 fxcsmadd f3, A4, B4, f3 .align 4 .L84: #if defined(LT) || defined(RN) andi. r0, KK, 7 mtspr CTR, r0 ble+ .L88 #else andi. r0, TEMP, 7 mtspr CTR, r0 ble+ .L88 #endif LFDX A1, AO, INC2 LFPDUX B1, BO, INC2 add AO, AO, INC bdz- .L87 .align 4 .L86: fxcpmadd f0, A1, B1, f0 LFDX A1, AO, INC2 LFPDUX B1, BO, INC2 add AO, AO, INC bdnz+ .L86 .align 4 .L87: fxcpmadd f0, A1, B1, f0 .align 4 .L88: fpadd f0, f0, f1 fpadd f2, f2, f3 fpadd f0, f0, f2 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDX f16, BO, INC2 fpsub f0, f16, f0 #else LFPDX f16, AO, INC2 fpsub f0, f16, f0 #endif #ifdef LN LFPDX A1, AO, INC2 fxpmul f0, A1, f0 #endif #ifdef LT LFPDX A1, AO, INC2 fxpmul f0, A1, f0 #endif #ifdef RN LFD A1, (2 + 0) * SIZE(BO) LFD A2, (2 + 1) * SIZE(BO) LFD A3, (2 + 3) * SIZE(BO) fsmtp f1, f0 fmul f0, A1, f0 fnmsub f1, A2, f0, f1 fmul f1, A3, f1 fsmfp f0, f1 #endif #ifdef RT LFD A1, (2 + 3) * SIZE(BO) LFD A2, (2 + 2) * SIZE(BO) LFD A3, (2 + 0) * SIZE(BO) fsmtp f1, f0 fmul f1, A1, f1 fnmsub f0, A2, f1, f0 fmul f0, A3, f0 fsmfp f0, f1 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE #endif #if defined(LN) || defined(LT) STFPDX f0, BO, INC2 STFDUX f0, CO1, INC STFSDUX f0, CO2, INC #else STFPDX f0, AO, INC2 STFDUX f0, CO1, INC STFDUX f1, CO2, INC #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 .L89: #ifdef LN slwi r0, K, 1 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) addi B, BO, 2 * SIZE #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif .align 4 .L90: andi. J, N, 1 beq .L999 #ifdef RT slwi r0, K, 0 + BASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) addi AORIG, A, -2 * SIZE #else addi AO, A, -2 * SIZE #endif #ifndef RT add C, CO1, LDC #endif li r0, FZERO lfpsx f0, SP, r0 srawi. I, M, 3 ble .L100 .align 4 .L91: #if defined(LT) || defined(RN) fpmr f1, f0 addi BO, B, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 srawi. 
r0, KK, 2 mtspr CTR, r0 ble .L94 #else #ifdef LN slwi r0, K, 3 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 3 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK fpmr f1, f0 addi BO, B, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L94 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L93 .align 4 .L92: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 fxcsmadd f0, B1, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B1, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B1, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B1, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f0, B2, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B2, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B2, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B2, A4, f3 LFPDUX A4, AO, INC2 fxcsmadd f0, B2, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B2, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B2, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B2, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B2, BO, INC2 bdnz+ .L92 .align 4 .L93: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 fxcsmadd f0, B1, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B1, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B1, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B1, A8, f3 LFPDUX A8, AO, INC2 fxcpmadd f0, B2, A1, f0 fxcpmadd f1, B2, A2, f1 fxcpmadd f2, B2, A3, f2 fxcpmadd f3, B2, A4, f3 fxcsmadd f0, B2, A5, f0 fxcsmadd f1, B2, A6, f1 fxcsmadd f2, B2, A7, f2 fxcsmadd f3, B2, A8, f3 .align 4 .L94: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L98 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L98 #endif LFDX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 add BO, BO, INC bdz- .L97 .align 4 .L96: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 LFDX B1, BO, INC2 LFPDUX A4, AO, INC2 add BO, BO, INC bdnz+ .L96 .align 4 .L97: fxcpmadd f0, B1, A1, f0 fxcpmadd f1, B1, A2, f1 fxcpmadd f2, B1, A3, f2 fxcpmadd f3, B1, A4, f3 .align 4 .L98: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 8 #else subi r0, KK, 1 #endif slwi TEMP, r0, 3 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 LFPDUX f18, BO, INC2 LFPDUX f19, BO, INC2 subi BO, BO, 8 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f2, f18, f2 fpsub f3, f19, f3 #else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 LFPDUX f18, AO, INC2 LFPDUX f19, AO, INC2 subi AO, AO, 8 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f2, f18, f2 fpsub f3, f19, f3 #endif #ifdef LN fsmtp f4, f0 fsmtp f5, f1 fsmtp f6, f2 fsmtp f7, f3 LFD A1, (2 + 63) * SIZE(AO) LFD A2, (2 + 62) * SIZE(AO) LFD A3, (2 + 61) * SIZE(AO) LFD A4, (2 + 60) * SIZE(AO) LFD A5, (2 + 59) * SIZE(AO) LFD A6, (2 + 58) * SIZE(AO) LFD A7, (2 + 57) * SIZE(AO) LFD A8, (2 + 56) * SIZE(AO) fmul f7, A1, f7 fnmsub f3, A2, f7, f3 fnmsub f6, A3, f7, f6 fnmsub f2, A4, f7, f2 fnmsub f5, A5, f7, f5 fnmsub f1, A6, f7, f1 fnmsub f4, A7, f7, f4 fnmsub f0, A8, f7, f0 LFD A1, (2 + 54) * SIZE(AO) LFD A2, (2 + 53) * SIZE(AO) LFD A3, (2 + 52) * SIZE(AO) LFD A4, (2 + 51) * SIZE(AO) LFD A5, (2 + 50) * SIZE(AO) LFD A6, (2 + 49) * SIZE(AO) LFD A7, (2 + 48) * SIZE(AO) fmul f3, A1, f3 fnmsub f6, A2, f3, f6 fnmsub f2, A3, f3, f2 fnmsub f5, A4, f3, f5 fnmsub f1, A5, f3, f1 fnmsub f4, A6, f3, f4 fnmsub f0, A7, f3, f0 LFD A1, (2 + 45) * SIZE(AO) LFD A2, (2 + 44) * SIZE(AO) LFD A3, (2 + 43) * SIZE(AO) LFD A4, (2 + 42) * SIZE(AO) LFD A5, (2 + 41) * SIZE(AO) LFD A6, (2 + 40) * SIZE(AO) fmul f6, A1, f6 fnmsub f2, A2, f6, f2 fnmsub f5, A3, f6, f5 fnmsub f1, A4, f6, f1 fnmsub f4, A5, f6, f4 fnmsub f0, A6, f6, f0 LFD A1, (2 + 36) * SIZE(AO) LFD A2, (2 + 35) * SIZE(AO) LFD A3, (2 + 34) * SIZE(AO) LFD A4, (2 + 33) * SIZE(AO) LFD A5, (2 + 32) * SIZE(AO) fmul f2, A1, f2 fnmsub f5, A2, f2, f5 fnmsub f1, A3, f2, f1 fnmsub f4, A4, f2, f4 fnmsub f0, A5, f2, f0 LFD A1, (2 + 27) * SIZE(AO) LFD A2, (2 + 26) * SIZE(AO) LFD A3, (2 + 25) * SIZE(AO) LFD A4, (2 + 24) * SIZE(AO) fmul f5, A1, f5 fnmsub f1, A2, f5, f1 fnmsub f4, A3, f5, f4 fnmsub f0, A4, f5, f0 LFD A1, (2 + 18) * SIZE(AO) LFD A2, (2 + 17) * SIZE(AO) LFD A3, (2 + 16) * SIZE(AO) fmul f1, A1, f1 fnmsub f4, A2, f1, f4 fnmsub f0, A3, f1, f0 LFD A1, (2 + 9) * SIZE(AO) LFD A2, (2 + 8) * SIZE(AO) fmul f4, A1, f4 fnmsub f0, A2, f4, f0 LFD A1, (2 + 0) * SIZE(AO) fmul f0, A1, f0 fsmfp f0, f4 fsmfp f1, f5 fsmfp f2, f6 fsmfp f3, f7 #endif #ifdef LT fsmtp f4, f0 fsmtp f5, f1 fsmtp f6, f2 fsmtp f7, f3 LFD A1, (2 + 0) * SIZE(AO) LFD A2, (2 + 1) * SIZE(AO) LFD A3, (2 + 2) * SIZE(AO) LFD A4, (2 + 3) * SIZE(AO) LFD A5, (2 + 4) * SIZE(AO) LFD A6, (2 + 5) * SIZE(AO) LFD A7, (2 + 6) * SIZE(AO) LFD A8, (2 + 7) * SIZE(AO) fmul f0, A1, f0 fnmsub f4, A2, f0, f4 fnmsub f1, A3, f0, f1 fnmsub f5, A4, f0, f5 fnmsub f2, A5, f0, f2 fnmsub f6, A6, f0, f6 fnmsub f3, A7, f0, f3 fnmsub f7, A8, f0, f7 LFD A1, (2 + 9) * SIZE(AO) LFD A2, (2 + 10) * SIZE(AO) LFD A3, (2 + 11) * SIZE(AO) LFD A4, (2 + 12) 
* SIZE(AO) LFD A5, (2 + 13) * SIZE(AO) LFD A6, (2 + 14) * SIZE(AO) LFD A7, (2 + 15) * SIZE(AO) fmul f4, A1, f4 fnmsub f1, A2, f4, f1 fnmsub f5, A3, f4, f5 fnmsub f2, A4, f4, f2 fnmsub f6, A5, f4, f6 fnmsub f3, A6, f4, f3 fnmsub f7, A7, f4, f7 LFD A1, (2 + 18) * SIZE(AO) LFD A2, (2 + 19) * SIZE(AO) LFD A3, (2 + 20) * SIZE(AO) LFD A4, (2 + 21) * SIZE(AO) LFD A5, (2 + 22) * SIZE(AO) LFD A6, (2 + 23) * SIZE(AO) fmul f1, A1, f1 fnmsub f5, A2, f1, f5 fnmsub f2, A3, f1, f2 fnmsub f6, A4, f1, f6 fnmsub f3, A5, f1, f3 fnmsub f7, A6, f1, f7 LFD A1, (2 + 27) * SIZE(AO) LFD A2, (2 + 28) * SIZE(AO) LFD A3, (2 + 29) * SIZE(AO) LFD A4, (2 + 30) * SIZE(AO) LFD A5, (2 + 31) * SIZE(AO) fmul f5, A1, f5 fnmsub f2, A2, f5, f2 fnmsub f6, A3, f5, f6 fnmsub f3, A4, f5, f3 fnmsub f7, A5, f5, f7 LFD A1, (2 + 36) * SIZE(AO) LFD A2, (2 + 37) * SIZE(AO) LFD A3, (2 + 38) * SIZE(AO) LFD A4, (2 + 39) * SIZE(AO) fmul f2, A1, f2 fnmsub f6, A2, f2, f6 fnmsub f3, A3, f2, f3 fnmsub f7, A4, f2, f7 LFD A1, (2 + 45) * SIZE(AO) LFD A2, (2 + 46) * SIZE(AO) LFD A3, (2 + 47) * SIZE(AO) fmul f6, A1, f6 fnmsub f3, A2, f6, f3 fnmsub f7, A3, f6, f7 LFD A1, (2 + 54) * SIZE(AO) LFD A2, (2 + 55) * SIZE(AO) fmul f3, A1, f3 fnmsub f7, A2, f3, f7 LFD A1, (2 + 63) * SIZE(AO) fmul f7, A1, f7 fsmfp f0, f4 fsmfp f1, f5 fsmfp f2, f6 fsmfp f3, f7 #endif #ifdef RN LFPDX A1, BO, INC2 fxpmul f0, A1, f0 fxpmul f1, A1, f1 fxpmul f2, A1, f2 fxpmul f3, A1, f3 #endif #ifdef RT LFPDX A1, BO, INC2 fxpmul f0, A1, f0 fxpmul f1, A1, f1 fxpmul f2, A1, f2 fxpmul f3, A1, f3 #endif #ifdef LN subi CO1, CO1, 8 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f1, BO, INC2 STFPDUX f2, BO, INC2 STFPDUX f3, BO, INC2 subi BO, BO, 8 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC #else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 STFPDUX f2, AO, INC2 STFPDUX f3, AO, INC2 subi AO, AO, 8 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC #endif #ifdef LN subi CO1, CO1, 8 * SIZE #endif #ifdef RT slwi r0, K, 3 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 3 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 8 #endif #ifdef LN subi KK, KK, 8 #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L91 .align 4 .L100: andi. I, M, 4 beq .L110 #if defined(LT) || defined(RN) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 3 mtspr CTR, r0 ble .L104 #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 2 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. 
r0, TEMP, 3 mtspr CTR, r0 ble .L104 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX B4, BO, INC2 bdz- .L103 .align 4 .L102: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcsmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f0, B2, A5, f0 LFPDUX A5, AO, INC2 fxcpmadd f1, B2, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B2, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B2, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B3, A2, f1 LFPDUX A2, AO, INC2 fxcsmadd f2, B3, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B3, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f0, B4, A5, f0 LFPDUX A5, AO, INC2 fxcpmadd f1, B4, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B4, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L102 .align 4 .L103: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcsmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 fxcpmadd f0, B2, A5, f0 LFPDUX A5, AO, INC2 fxcpmadd f1, B2, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B2, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B2, A8, f3 LFPDUX A8, AO, INC2 fxcpmadd f0, B3, A1, f0 fxcpmadd f1, B3, A2, f1 fxcsmadd f2, B3, A3, f2 fxcsmadd f3, B3, A4, f3 fxcpmadd f0, B4, A5, f0 fxcpmadd f1, B4, A6, f1 fxcsmadd f2, B4, A7, f2 fxcsmadd f3, B4, A8, f3 .align 4 .L104: #if defined(LT) || defined(RN) andi. r0, KK, 7 mtspr CTR, r0 ble+ .L108 #else andi. r0, TEMP, 7 mtspr CTR, r0 ble+ .L108 #endif LFPDUX A1, AO, INC2 LFDX B1, BO, INC2 LFPDUX A2, AO, INC2 add BO, BO, INC bdz- .L107 .align 4 .L106: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFDX B1, BO, INC2 LFPDUX A2, AO, INC2 add BO, BO, INC bdnz+ .L106 .align 4 .L107: fxcpmadd f0, B1, A1, f0 fxcpmadd f1, B1, A2, f1 .align 4 .L108: fpadd f0, f0, f2 fpadd f1, f1, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 1 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 subi BO, BO, 4 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 #else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 subi AO, AO, 4 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 #endif #ifdef LN fsmtp f4, f0 fsmtp f5, f1 LFD A1, (2 + 15) * SIZE(AO) LFD A2, (2 + 14) * SIZE(AO) LFD A3, (2 + 13) * SIZE(AO) LFD A4, (2 + 12) * SIZE(AO) fmul f5, A1, f5 fnmsub f1, A2, f5, f1 fnmsub f4, A3, f5, f4 fnmsub f0, A4, f5, f0 LFD A1, (2 + 10) * SIZE(AO) LFD A2, (2 + 9) * SIZE(AO) LFD A3, (2 + 8) * SIZE(AO) fmul f1, A1, f1 fnmsub f4, A2, f1, f4 fnmsub f0, A3, f1, f0 LFD A1, (2 + 5) * SIZE(AO) LFD A2, (2 + 4) * SIZE(AO) fmul f4, A1, f4 fnmsub f0, A2, f4, f0 LFD A1, (2 + 0) * SIZE(AO) fmul f0, A1, f0 fsmfp f0, f4 fsmfp f1, f5 #endif #ifdef LT fsmtp f4, f0 fsmtp f5, f1 LFD A1, (2 + 0) * SIZE(AO) LFD A2, (2 + 1) * SIZE(AO) LFD A3, (2 + 2) * SIZE(AO) LFD A4, (2 + 3) * SIZE(AO) fmul f0, A1, f0 fnmsub f4, A2, f0, f4 fnmsub f1, A3, f0, f1 fnmsub f5, A4, f0, f5 LFD A1, (2 + 5) * SIZE(AO) LFD A2, (2 + 6) * SIZE(AO) LFD A3, (2 + 7) * SIZE(AO) fmul f4, A1, f4 fnmsub f1, A2, f4, f1 fnmsub f5, A3, f4, f5 LFD A1, (2 + 10) * SIZE(AO) LFD A2, 
(2 + 11) * SIZE(AO) fmul f1, A1, f1 fnmsub f5, A2, f1, f5 LFD A1, (2 + 15) * SIZE(AO) fmul f5, A1, f5 fsmfp f0, f4 fsmfp f1, f5 #endif #ifdef RN LFPDX A1, BO, INC2 fxpmul f0, A1, f0 fxpmul f1, A1, f1 #endif #ifdef RT LFPDX A1, BO, INC2 fxpmul f0, A1, f0 fxpmul f1, A1, f1 #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f1, BO, INC2 subi BO, BO, 4 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC #else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 subi AO, AO, 4 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L110: andi. I, M, 2 beq .L120 #if defined(LT) || defined(RN) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 3 mtspr CTR, r0 ble .L114 #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 1 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L114 #endif LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdz- .L113 .align 4 .L112: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcsmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f2, B2, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B2, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B3, A6, f1 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f2, B4, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L112 .align 4 .L113: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A2, f1 fxcpmadd f2, B2, A3, f2 fxcsmadd f3, B2, A4, f3 fxcpmadd f0, B3, A5, f0 fxcsmadd f1, B3, A6, f1 fxcpmadd f2, B4, A7, f2 fxcsmadd f3, B4, A8, f3 .align 4 .L114: #if defined(LT) || defined(RN) andi. r0, KK, 7 mtspr CTR, r0 ble+ .L118 #else andi. 
r0, TEMP, 7 mtspr CTR, r0 ble+ .L118 #endif LFPDUX A1, AO, INC2 LFDX B1, BO, INC2 add BO, BO, INC bdz- .L117 .align 4 .L116: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 LFDX B1, BO, INC2 add BO, BO, INC bdnz+ .L116 .align 4 .L117: fxcpmadd f0, B1, A1, f0 .align 4 .L118: fpadd f0, f0, f1 fpadd f2, f3, f2 fpadd f0, f0, f2 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDX f16, BO, INC2 fpsub f0, f16, f0 #else LFPDX f16, AO, INC2 fpsub f0, f16, f0 #endif #ifdef LN fsmtp f4, f0 LFD A1, (2 + 3) * SIZE(AO) LFD A2, (2 + 2) * SIZE(AO) LFD A3, (2 + 0) * SIZE(AO) fmul f4, A1, f4 fnmsub f0, A2, f4, f0 fmul f0, A3, f0 fsmfp f0, f4 #endif #ifdef LT fsmtp f4, f0 LFD A1, (2 + 0) * SIZE(AO) LFD A2, (2 + 1) * SIZE(AO) LFD A3, (2 + 3) * SIZE(AO) fmul f0, A1, f0 fnmsub f4, A2, f0, f4 fmul f4, A3, f4 fsmfp f0, f4 #endif #ifdef RN LFPDX A1, BO, INC2 fxpmul f0, A1, f0 #endif #ifdef RT LFPDX A1, BO, INC2 fxpmul f0, A1, f0 #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFPDX f0, BO, INC2 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC #else STFPDX f0, AO, INC2 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L120: andi. I, M, 1 beq .L129 #if defined(LT) || defined(RN) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 3 mtspr CTR, r0 ble .L124 #else #ifdef LN slwi r0, K, 0 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 0 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L124 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 bdz- .L123 .align 4 .L122: fpmadd f0, A1, B1, f0 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 fpmadd f1, A2, B2, f1 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 fpmadd f2, A3, B3, f2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 fpmadd f3, A4, B4, f3 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L122 .align 4 .L123: fpmadd f0, A1, B1, f0 fpmadd f1, A2, B2, f1 fpmadd f2, A3, B3, f2 fpmadd f3, A4, B4, f3 .align 4 .L124: #if defined(LT) || defined(RN) andi. r0, KK, 7 mtspr CTR, r0 ble+ .L128 #else andi. 
r0, TEMP, 7 mtspr CTR, r0 ble+ .L128 #endif LFDX A1, AO, INC2 LFDX B1, BO, INC2 add AO, AO, INC add BO, BO, INC bdz- .L127 .align 4 .L126: fmadd f0, A1, B1, f0 LFDX A1, AO, INC2 LFDX B1, BO, INC2 add AO, AO, INC add BO, BO, INC bdnz+ .L126 .align 4 .L127: fmadd f0, A1, B1, f0 .align 4 .L128: fpadd f0, f0, f1 fpadd f2, f2, f3 fpadd f0, f0, f2 fsmtp f1, f0 fadd f0, f0, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 1 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFDX f16, BO, INC2 fsub f0, f16, f0 #else LFDX f16, AO, INC2 fsub f0, f16, f0 #endif #ifdef LN LFD A1, (2 + 0) * SIZE(AO) fmul f0, A1, f0 #endif #ifdef LT LFD A1, (2 + 0) * SIZE(AO) fmul f0, A1, f0 #endif #ifdef RN LFDX A1, BO, INC2 fmul f0, A1, f0 #endif #ifdef RT LFDX A1, BO, INC2 fmul f0, A1, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE #endif #if defined(LN) || defined(LT) STFDX f0, BO, INC2 STFDUX f0, CO1, INC #else STFDX f0, AO, INC2 STFDUX f0, CO1, INC #endif #ifdef LN subi CO1, CO1, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 .L129: #ifdef LN slwi r0, K, 0 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) addi B, BO, 2 * SIZE #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 .L999: addi SP, SP, 12 lwzu r14, 4(SP) lwzu r15, 4(SP) lwzu r16, 4(SP) lwzu r17, 4(SP) lwzu r18, 4(SP) lwzu r19, 4(SP) lwzu r20, 4(SP) lwzu r21, 4(SP) lwzu r22, 4(SP) lwzu r23, 4(SP) lwzu r24, 4(SP) lwzu r25, 4(SP) lwzu r26, 4(SP) lwzu r27, 4(SP) lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f31, SP, r0 lfpdux f30, SP, r0 lfpdux f29, SP, r0 lfpdux f28, SP, r0 lfpdux f27, SP, r0 lfpdux f26, SP, r0 lfpdux f25, SP, r0 lfpdux f24, SP, r0 lfpdux f23, SP, r0 lfpdux f22, SP, r0 lfpdux f21, SP, r0 lfpdux f20, SP, r0 lfpdux f19, SP, r0 lfpdux f18, SP, r0 lfpdux f17, SP, r0 lfpdux f16, SP, r0 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/trsm_kernel_hummer_RT.S000066400000000000000000002763001313527062700222230ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define ALPHA 0 #define FZERO 8 #define M r3 #define N r4 #define K r5 #ifdef linux #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #endif #define TEMP r11 #define AORIG r12 #define KK r14 #define INCM1 r15 #define INCM4 r16 #define INCM2 r17 #define INC2 r19 #define INC r20 #define INC4 r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define AO2 r26 #define BO2 r27 #define CO1 r28 #define CO2 r29 #define CO3 r30 #define CO4 r31 #ifndef NEEDPARAM #define A1 f16 #define A2 f17 #define A3 f18 #define A4 f19 #define A5 f20 #define A6 f21 #define A7 f22 #define A8 f23 #define A9 f24 #define A10 f25 #define B1 f26 #define B2 f27 #define B3 f28 #define B4 f29 #define B5 f30 #define B6 f31 #define AP B6 PROLOGUE PROFCODE li r0, -16 stfpdux f14, SP, r0 stfpdux f15, SP, r0 stfpdux f16, SP, r0 stfpdux f17, SP, r0 stfpdux f18, SP, r0 stfpdux f19, SP, r0 stfpdux f20, SP, r0 stfpdux f21, SP, r0 stfpdux f22, SP, r0 stfpdux f23, SP, r0 stfpdux f24, SP, r0 stfpdux f25, SP, r0 stfpdux f26, SP, r0 stfpdux f27, SP, r0 stfpdux f28, SP, r0 stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) stwu r28, -4(SP) stwu r27, -4(SP) stwu r26, -4(SP) stwu r25, -4(SP) stwu r24, -4(SP) stwu r23, -4(SP) stwu r22, -4(SP) stwu r21, -4(SP) stwu r20, -4(SP) stwu r19, -4(SP) stwu r18, -4(SP) stwu r17, -4(SP) stwu r16, -4(SP) stwu r15, -4(SP) stwu r14, -4(SP) # dummy li r0, 0 stwu r0, -4(SP) stwu r0, -4(SP) stfdu f1, -8(SP) slwi LDC, LDC, BASE_SHIFT cmpwi cr0, M, 0 ble .L999 cmpwi cr0, N, 0 ble .L999 cmpwi cr0, K, 0 ble .L999 li INC, 1 * SIZE li INC2, 2 * SIZE li INC4, 4 * SIZE li INCM1, -1 * SIZE li INCM2, -2 * SIZE li INCM4, -4 * SIZE addi C, C, - 1 * SIZE #ifdef LN mullw r0, M, K slwi r0, r0, BASE_SHIFT add A, A, r0 slwi r0, M, BASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, BASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif andi. J, N, 1 beq .L50 #ifdef RT slwi r0, K, 0 + BASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) addi AORIG, A, -2 * SIZE #else addi AO, A, -2 * SIZE #endif #ifndef RT add C, CO1, LDC #endif li r0, FZERO lfpsx f0, SP, r0 srawi. I, M, 3 ble .L100 .align 4 .L91: #if defined(LT) || defined(RN) fpmr f1, f0 addi BO, B, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 srawi. 
r0, KK, 2 mtspr CTR, r0 ble .L94 #else #ifdef LN slwi r0, K, 3 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 3 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK fpmr f1, f0 addi BO, BO, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L94 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L93 .align 4 .L92: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 fxcsmadd f0, B1, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B1, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B1, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B1, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f0, B2, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B2, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B2, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B2, A4, f3 LFPDUX A4, AO, INC2 fxcsmadd f0, B2, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B2, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B2, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B2, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B2, BO, INC2 bdnz+ .L92 .align 4 .L93: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 fxcsmadd f0, B1, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B1, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B1, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B1, A8, f3 LFPDUX A8, AO, INC2 fxcpmadd f0, B2, A1, f0 fxcpmadd f1, B2, A2, f1 fxcpmadd f2, B2, A3, f2 fxcpmadd f3, B2, A4, f3 fxcsmadd f0, B2, A5, f0 fxcsmadd f1, B2, A6, f1 fxcsmadd f2, B2, A7, f2 fxcsmadd f3, B2, A8, f3 .align 4 .L94: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L98 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L98 #endif LFDX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 add BO, BO, INC bdz- .L97 .align 4 .L96: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 LFDX B1, BO, INC2 LFPDUX A4, AO, INC2 add BO, BO, INC bdnz+ .L96 .align 4 .L97: fxcpmadd f0, B1, A1, f0 fxcpmadd f1, B1, A2, f1 fxcpmadd f2, B1, A3, f2 fxcpmadd f3, B1, A4, f3 .align 4 .L98: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 8 #else subi r0, KK, 1 #endif slwi TEMP, r0, 3 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 LFPDUX f18, BO, INC2 LFPDUX f19, BO, INC2 subi BO, BO, 8 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f2, f18, f2 fpsub f3, f19, f3 #else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 LFPDUX f18, AO, INC2 LFPDUX f19, AO, INC2 subi AO, AO, 8 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f2, f18, f2 fpsub f3, f19, f3 #endif #ifdef LN fsmtp f4, f0 fsmtp f5, f1 fsmtp f6, f2 fsmtp f7, f3 LFD A1, (2 + 63) * SIZE(AO) LFD A2, (2 + 62) * SIZE(AO) LFD A3, (2 + 61) * SIZE(AO) LFD A4, (2 + 60) * SIZE(AO) LFD A5, (2 + 59) * SIZE(AO) LFD A6, (2 + 58) * SIZE(AO) LFD A7, (2 + 57) * SIZE(AO) LFD A8, (2 + 56) * SIZE(AO) fmul f7, A1, f7 fnmsub f3, A2, f7, f3 fnmsub f6, A3, f7, f6 fnmsub f2, A4, f7, f2 fnmsub f5, A5, f7, f5 fnmsub f1, A6, f7, f1 fnmsub f4, A7, f7, f4 fnmsub f0, A8, f7, f0 LFD A1, (2 + 54) * SIZE(AO) LFD A2, (2 + 53) * SIZE(AO) LFD A3, (2 + 52) * SIZE(AO) LFD A4, (2 + 51) * SIZE(AO) LFD A5, (2 + 50) * SIZE(AO) LFD A6, (2 + 49) * SIZE(AO) LFD A7, (2 + 48) * SIZE(AO) fmul f3, A1, f3 fnmsub f6, A2, f3, f6 fnmsub f2, A3, f3, f2 fnmsub f5, A4, f3, f5 fnmsub f1, A5, f3, f1 fnmsub f4, A6, f3, f4 fnmsub f0, A7, f3, f0 LFD A1, (2 + 45) * SIZE(AO) LFD A2, (2 + 44) * SIZE(AO) LFD A3, (2 + 43) * SIZE(AO) LFD A4, (2 + 42) * SIZE(AO) LFD A5, (2 + 41) * SIZE(AO) LFD A6, (2 + 40) * SIZE(AO) fmul f6, A1, f6 fnmsub f2, A2, f6, f2 fnmsub f5, A3, f6, f5 fnmsub f1, A4, f6, f1 fnmsub f4, A5, f6, f4 fnmsub f0, A6, f6, f0 LFD A1, (2 + 36) * SIZE(AO) LFD A2, (2 + 35) * SIZE(AO) LFD A3, (2 + 34) * SIZE(AO) LFD A4, (2 + 33) * SIZE(AO) LFD A5, (2 + 32) * SIZE(AO) fmul f2, A1, f2 fnmsub f5, A2, f2, f5 fnmsub f1, A3, f2, f1 fnmsub f4, A4, f2, f4 fnmsub f0, A5, f2, f0 LFD A1, (2 + 27) * SIZE(AO) LFD A2, (2 + 26) * SIZE(AO) LFD A3, (2 + 25) * SIZE(AO) LFD A4, (2 + 24) * SIZE(AO) fmul f5, A1, f5 fnmsub f1, A2, f5, f1 fnmsub f4, A3, f5, f4 fnmsub f0, A4, f5, f0 LFD A1, (2 + 18) * SIZE(AO) LFD A2, (2 + 17) * SIZE(AO) LFD A3, (2 + 16) * SIZE(AO) fmul f1, A1, f1 fnmsub f4, A2, f1, f4 fnmsub f0, A3, f1, f0 LFD A1, (2 + 9) * SIZE(AO) LFD A2, (2 + 8) * SIZE(AO) fmul f4, A1, f4 fnmsub f0, A2, f4, f0 LFD A1, (2 + 0) * SIZE(AO) fmul f0, A1, f0 fsmfp f0, f4 fsmfp f1, f5 fsmfp f2, f6 fsmfp f3, f7 #endif #ifdef LT fsmtp f4, f0 fsmtp f5, f1 fsmtp f6, f2 fsmtp f7, f3 LFD A1, (2 + 0) * SIZE(AO) LFD A2, (2 + 1) * SIZE(AO) LFD A3, (2 + 2) * SIZE(AO) LFD A4, (2 + 3) * SIZE(AO) LFD A5, (2 + 4) * SIZE(AO) LFD A6, (2 + 5) * SIZE(AO) LFD A7, (2 + 6) * SIZE(AO) LFD A8, (2 + 7) * SIZE(AO) fmul f0, A1, f0 fnmsub f4, A2, f0, f4 fnmsub f1, A3, f0, f1 fnmsub f5, A4, f0, f5 fnmsub f2, A5, f0, f2 fnmsub f6, A6, f0, f6 fnmsub f3, A7, f0, f3 fnmsub f7, A8, f0, f7 LFD A1, (2 + 9) * SIZE(AO) LFD A2, (2 + 10) * SIZE(AO) LFD A3, (2 + 11) * SIZE(AO) LFD A4, (2 + 12) 
* SIZE(AO) LFD A5, (2 + 13) * SIZE(AO) LFD A6, (2 + 14) * SIZE(AO) LFD A7, (2 + 15) * SIZE(AO) fmul f4, A1, f4 fnmsub f1, A2, f4, f1 fnmsub f5, A3, f4, f5 fnmsub f2, A4, f4, f2 fnmsub f6, A5, f4, f6 fnmsub f3, A6, f4, f3 fnmsub f7, A7, f4, f7 LFD A1, (2 + 18) * SIZE(AO) LFD A2, (2 + 19) * SIZE(AO) LFD A3, (2 + 20) * SIZE(AO) LFD A4, (2 + 21) * SIZE(AO) LFD A5, (2 + 22) * SIZE(AO) LFD A6, (2 + 23) * SIZE(AO) fmul f1, A1, f1 fnmsub f5, A2, f1, f5 fnmsub f2, A3, f1, f2 fnmsub f6, A4, f1, f6 fnmsub f3, A5, f1, f3 fnmsub f7, A6, f1, f7 LFD A1, (2 + 27) * SIZE(AO) LFD A2, (2 + 28) * SIZE(AO) LFD A3, (2 + 29) * SIZE(AO) LFD A4, (2 + 30) * SIZE(AO) LFD A5, (2 + 31) * SIZE(AO) fmul f5, A1, f5 fnmsub f2, A2, f5, f2 fnmsub f6, A3, f5, f6 fnmsub f3, A4, f5, f3 fnmsub f7, A5, f5, f7 LFD A1, (2 + 36) * SIZE(AO) LFD A2, (2 + 37) * SIZE(AO) LFD A3, (2 + 38) * SIZE(AO) LFD A4, (2 + 39) * SIZE(AO) fmul f2, A1, f2 fnmsub f6, A2, f2, f6 fnmsub f3, A3, f2, f3 fnmsub f7, A4, f2, f7 LFD A1, (2 + 45) * SIZE(AO) LFD A2, (2 + 46) * SIZE(AO) LFD A3, (2 + 47) * SIZE(AO) fmul f6, A1, f6 fnmsub f3, A2, f6, f3 fnmsub f7, A3, f6, f7 LFD A1, (2 + 54) * SIZE(AO) LFD A2, (2 + 55) * SIZE(AO) fmul f3, A1, f3 fnmsub f7, A2, f3, f7 LFD A1, (2 + 63) * SIZE(AO) fmul f7, A1, f7 fsmfp f0, f4 fsmfp f1, f5 fsmfp f2, f6 fsmfp f3, f7 #endif #ifdef RN LFPDX A1, BO, INC2 fxpmul f0, A1, f0 fxpmul f1, A1, f1 fxpmul f2, A1, f2 fxpmul f3, A1, f3 #endif #ifdef RT LFPDX A1, BO, INC2 fxpmul f0, A1, f0 fxpmul f1, A1, f1 fxpmul f2, A1, f2 fxpmul f3, A1, f3 #endif #ifdef LN subi CO1, CO1, 8 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f1, BO, INC2 STFPDUX f2, BO, INC2 STFPDUX f3, BO, INC2 subi BO, BO, 8 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC #else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 STFPDUX f2, AO, INC2 STFPDUX f3, AO, INC2 subi AO, AO, 8 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC #endif #ifdef LN subi CO1, CO1, 8 * SIZE #endif #ifdef RT slwi r0, K, 3 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 3 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 8 #endif #ifdef LN subi KK, KK, 8 #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L91 .align 4 .L100: andi. I, M, 4 beq .L110 #if defined(LT) || defined(RN) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 3 mtspr CTR, r0 ble .L104 #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 2 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. 
r0, TEMP, 3 mtspr CTR, r0 ble .L104 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX B4, BO, INC2 bdz- .L103 .align 4 .L102: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcsmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f0, B2, A5, f0 LFPDUX A5, AO, INC2 fxcpmadd f1, B2, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B2, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B2, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B3, A2, f1 LFPDUX A2, AO, INC2 fxcsmadd f2, B3, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B3, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f0, B4, A5, f0 LFPDUX A5, AO, INC2 fxcpmadd f1, B4, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B4, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L102 .align 4 .L103: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 fxcsmadd f2, B1, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B1, A4, f3 LFPDUX A4, AO, INC2 fxcpmadd f0, B2, A5, f0 LFPDUX A5, AO, INC2 fxcpmadd f1, B2, A6, f1 LFPDUX A6, AO, INC2 fxcsmadd f2, B2, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B2, A8, f3 LFPDUX A8, AO, INC2 fxcpmadd f0, B3, A1, f0 fxcpmadd f1, B3, A2, f1 fxcsmadd f2, B3, A3, f2 fxcsmadd f3, B3, A4, f3 fxcpmadd f0, B4, A5, f0 fxcpmadd f1, B4, A6, f1 fxcsmadd f2, B4, A7, f2 fxcsmadd f3, B4, A8, f3 .align 4 .L104: #if defined(LT) || defined(RN) andi. r0, KK, 7 mtspr CTR, r0 ble+ .L108 #else andi. r0, TEMP, 7 mtspr CTR, r0 ble+ .L108 #endif LFPDUX A1, AO, INC2 LFDX B1, BO, INC2 LFPDUX A2, AO, INC2 add BO, BO, INC bdz- .L107 .align 4 .L106: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 LFDX B1, BO, INC2 LFPDUX A2, AO, INC2 add BO, BO, INC bdnz+ .L106 .align 4 .L107: fxcpmadd f0, B1, A1, f0 fxcpmadd f1, B1, A2, f1 .align 4 .L108: fpadd f0, f0, f2 fpadd f1, f1, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 1 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 subi BO, BO, 4 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 #else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 subi AO, AO, 4 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 #endif #ifdef LN fsmtp f4, f0 fsmtp f5, f1 LFD A1, (2 + 15) * SIZE(AO) LFD A2, (2 + 14) * SIZE(AO) LFD A3, (2 + 13) * SIZE(AO) LFD A4, (2 + 12) * SIZE(AO) fmul f5, A1, f5 fnmsub f1, A2, f5, f1 fnmsub f4, A3, f5, f4 fnmsub f0, A4, f5, f0 LFD A1, (2 + 10) * SIZE(AO) LFD A2, (2 + 9) * SIZE(AO) LFD A3, (2 + 8) * SIZE(AO) fmul f1, A1, f1 fnmsub f4, A2, f1, f4 fnmsub f0, A3, f1, f0 LFD A1, (2 + 5) * SIZE(AO) LFD A2, (2 + 4) * SIZE(AO) fmul f4, A1, f4 fnmsub f0, A2, f4, f0 LFD A1, (2 + 0) * SIZE(AO) fmul f0, A1, f0 fsmfp f0, f4 fsmfp f1, f5 #endif #ifdef LT fsmtp f4, f0 fsmtp f5, f1 LFD A1, (2 + 0) * SIZE(AO) LFD A2, (2 + 1) * SIZE(AO) LFD A3, (2 + 2) * SIZE(AO) LFD A4, (2 + 3) * SIZE(AO) fmul f0, A1, f0 fnmsub f4, A2, f0, f4 fnmsub f1, A3, f0, f1 fnmsub f5, A4, f0, f5 LFD A1, (2 + 5) * SIZE(AO) LFD A2, (2 + 6) * SIZE(AO) LFD A3, (2 + 7) * SIZE(AO) fmul f4, A1, f4 fnmsub f1, A2, f4, f1 fnmsub f5, A3, f4, f5 LFD A1, (2 + 10) * SIZE(AO) LFD A2, 
(2 + 11) * SIZE(AO) fmul f1, A1, f1 fnmsub f5, A2, f1, f5 LFD A1, (2 + 15) * SIZE(AO) fmul f5, A1, f5 fsmfp f0, f4 fsmfp f1, f5 #endif #ifdef RN LFPDX A1, BO, INC2 fxpmul f0, A1, f0 fxpmul f1, A1, f1 #endif #ifdef RT LFPDX A1, BO, INC2 fxpmul f0, A1, f0 fxpmul f1, A1, f1 #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f1, BO, INC2 subi BO, BO, 4 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC #else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 subi AO, AO, 4 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L110: andi. I, M, 2 beq .L120 #if defined(LT) || defined(RN) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 3 mtspr CTR, r0 ble .L114 #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 1 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L114 #endif LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdz- .L113 .align 4 .L112: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 fxcsmadd f1, B1, A2, f1 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f2, B2, A3, f2 LFPDUX A3, AO, INC2 fxcsmadd f3, B2, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A5, f0 LFPDUX A5, AO, INC2 fxcsmadd f1, B3, A6, f1 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f2, B4, A7, f2 LFPDUX A7, AO, INC2 fxcsmadd f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L112 .align 4 .L113: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A2, f1 fxcpmadd f2, B2, A3, f2 fxcsmadd f3, B2, A4, f3 fxcpmadd f0, B3, A5, f0 fxcsmadd f1, B3, A6, f1 fxcpmadd f2, B4, A7, f2 fxcsmadd f3, B4, A8, f3 .align 4 .L114: #if defined(LT) || defined(RN) andi. r0, KK, 7 mtspr CTR, r0 ble+ .L118 #else andi. 
r0, TEMP, 7 mtspr CTR, r0 ble+ .L118 #endif LFPDUX A1, AO, INC2 LFDX B1, BO, INC2 add BO, BO, INC bdz- .L117 .align 4 .L116: fxcpmadd f0, B1, A1, f0 LFPDUX A1, AO, INC2 LFDX B1, BO, INC2 add BO, BO, INC bdnz+ .L116 .align 4 .L117: fxcpmadd f0, B1, A1, f0 .align 4 .L118: fpadd f0, f0, f1 fpadd f2, f3, f2 fpadd f0, f0, f2 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDX f16, BO, INC2 fpsub f0, f16, f0 #else LFPDX f16, AO, INC2 fpsub f0, f16, f0 #endif #ifdef LN fsmtp f4, f0 LFD A1, (2 + 3) * SIZE(AO) LFD A2, (2 + 2) * SIZE(AO) LFD A3, (2 + 0) * SIZE(AO) fmul f4, A1, f4 fnmsub f0, A2, f4, f0 fmul f0, A3, f0 fsmfp f0, f4 #endif #ifdef LT fsmtp f4, f0 LFD A1, (2 + 0) * SIZE(AO) LFD A2, (2 + 1) * SIZE(AO) LFD A3, (2 + 3) * SIZE(AO) fmul f0, A1, f0 fnmsub f4, A2, f0, f4 fmul f4, A3, f4 fsmfp f0, f4 #endif #ifdef RN LFPDX A1, BO, INC2 fxpmul f0, A1, f0 #endif #ifdef RT LFPDX A1, BO, INC2 fxpmul f0, A1, f0 #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFPDX f0, BO, INC2 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC #else STFPDX f0, AO, INC2 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L120: andi. I, M, 1 beq .L129 #if defined(LT) || defined(RN) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 3 mtspr CTR, r0 ble .L124 #else #ifdef LN slwi r0, K, 0 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 0 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L124 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 bdz- .L123 .align 4 .L122: fpmadd f0, A1, B1, f0 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 fpmadd f1, A2, B2, f1 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 fpmadd f2, A3, B3, f2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 fpmadd f3, A4, B4, f3 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L122 .align 4 .L123: fpmadd f0, A1, B1, f0 fpmadd f1, A2, B2, f1 fpmadd f2, A3, B3, f2 fpmadd f3, A4, B4, f3 .align 4 .L124: #if defined(LT) || defined(RN) andi. r0, KK, 7 mtspr CTR, r0 ble+ .L128 #else andi. 
r0, TEMP, 7 mtspr CTR, r0 ble+ .L128 #endif LFDX A1, AO, INC2 LFDX B1, BO, INC2 add AO, AO, INC add BO, BO, INC bdz- .L127 .align 4 .L126: fmadd f0, A1, B1, f0 LFDX A1, AO, INC2 LFDX B1, BO, INC2 add AO, AO, INC add BO, BO, INC bdnz+ .L126 .align 4 .L127: fmadd f0, A1, B1, f0 .align 4 .L128: fpadd f0, f0, f1 fpadd f2, f2, f3 fpadd f0, f0, f2 fsmtp f1, f0 fadd f0, f0, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 1 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFDX f16, BO, INC2 fsub f0, f16, f0 #else LFDX f16, AO, INC2 fsub f0, f16, f0 #endif #ifdef LN LFD A1, (2 + 0) * SIZE(AO) fmul f0, A1, f0 #endif #ifdef LT LFD A1, (2 + 0) * SIZE(AO) fmul f0, A1, f0 #endif #ifdef RN LFDX A1, BO, INC2 fmul f0, A1, f0 #endif #ifdef RT LFDX A1, BO, INC2 fmul f0, A1, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE #endif #if defined(LN) || defined(LT) STFDX f0, BO, INC2 STFDUX f0, CO1, INC #else STFDX f0, AO, INC2 STFDUX f0, CO1, INC #endif #ifdef LN subi CO1, CO1, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 .L129: #ifdef LN slwi r0, K, 0 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) addi B, BO, 2 * SIZE #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 .L50: andi. J, N, 2 beq .L90 #ifdef RT slwi r0, K, 1 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) addi AORIG, A, -2 * SIZE #else addi AO, A, -2 * SIZE #endif #ifndef RT add C, CO2, LDC #endif li r0, FZERO lfpsx f0, SP, r0 srawi. I, M, 3 ble .L60 .align 4 .L51: #if defined(LT) || defined(RN) fpmr f4, f0 addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 srawi. r0, KK, 2 fpmr f3, f0 mtspr CTR, r0 fpmr f7, f0 ble .L54 #else #ifdef LN slwi r0, K, 3 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 3 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK fpmr f4, f0 addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 srawi. 
r0, TEMP, 2 fpmr f3, f0 mtspr CTR, r0 fpmr f7, f0 ble .L54 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L53 .align 4 .L52: fxcpmadd f0, B1, A1, f0 LFPDUX B4, BO, INC2 fxcsmadd f4, B1, A1, f4 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 nop fxcsmadd f6, B1, A3, f6 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A4, AO, INC2 fxcpmadd f0, B2, A5, f0 LFPDUX B1, BO, INC2 fxcsmadd f4, B2, A5, f4 LFPDUX A5, AO, INC2 fxcpmadd f1, B2, A6, f1 nop fxcsmadd f5, B2, A6, f5 LFPDUX A6, AO, INC2 fxcpmadd f2, B2, A7, f2 nop fxcsmadd f6, B2, A7, f6 LFPDUX A7, AO, INC2 fxcpmadd f3, B2, A8, f3 nop fxcsmadd f7, B2, A8, f7 LFPDUX A8, AO, INC2 fxcpmadd f0, B3, A1, f0 LFPDUX B2, BO, INC2 fxcsmadd f4, B3, A1, f4 LFPDUX A1, AO, INC2 fxcpmadd f1, B3, A2, f1 nop fxcsmadd f5, B3, A2, f5 LFPDUX A2, AO, INC2 fxcpmadd f2, B3, A3, f2 nop fxcsmadd f6, B3, A3, f6 LFPDUX A3, AO, INC2 fxcpmadd f3, B3, A4, f3 nop fxcsmadd f7, B3, A4, f7 LFPDUX A4, AO, INC2 fxcpmadd f0, B4, A5, f0 LFPDUX B3, BO, INC2 fxcsmadd f4, B4, A5, f4 LFPDUX A5, AO, INC2 fxcpmadd f1, B4, A6, f1 nop fxcsmadd f5, B4, A6, f5 LFPDUX A6, AO, INC2 fxcpmadd f2, B4, A7, f2 nop fxcsmadd f6, B4, A7, f6 LFPDUX A7, AO, INC2 fxcpmadd f3, B4, A8, f3 nop fxcsmadd f7, B4, A8, f7 LFPDUX A8, AO, INC2 bdnz+ .L52 .align 4 .L53: fxcpmadd f0, B1, A1, f0 LFPDUX B4, BO, INC2 fxcsmadd f4, B1, A1, f4 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 nop fxcsmadd f6, B1, A3, f6 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A4, AO, INC2 fxcpmadd f0, B2, A5, f0 nop fxcsmadd f4, B2, A5, f4 LFPDUX A5, AO, INC2 fxcpmadd f1, B2, A6, f1 nop fxcsmadd f5, B2, A6, f5 LFPDUX A6, AO, INC2 fxcpmadd f2, B2, A7, f2 nop fxcsmadd f6, B2, A7, f6 LFPDUX A7, AO, INC2 fxcpmadd f3, B2, A8, f3 nop fxcsmadd f7, B2, A8, f7 LFPDUX A8, AO, INC2 fxcpmadd f0, B3, A1, f0 fxcsmadd f4, B3, A1, f4 fxcpmadd f1, B3, A2, f1 fxcsmadd f5, B3, A2, f5 fxcpmadd f2, B3, A3, f2 fxcsmadd f6, B3, A3, f6 fxcpmadd f3, B3, A4, f3 fxcsmadd f7, B3, A4, f7 fxcpmadd f0, B4, A5, f0 fxcsmadd f4, B4, A5, f4 fxcpmadd f1, B4, A6, f1 fxcsmadd f5, B4, A6, f5 fxcpmadd f2, B4, A7, f2 fxcsmadd f6, B4, A7, f6 fxcpmadd f3, B4, A8, f3 fxcsmadd f7, B4, A8, f7 .align 4 .L54: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L58 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L58 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 bdz- .L57 .align 4 .L56: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 LFPDUX A2, AO, INC2 fxcpmadd f2, B1, A3, f2 fxcsmadd f6, B1, A3, f6 LFPDUX A3, AO, INC2 fxcpmadd f3, B1, A4, f3 fxcsmadd f7, B1, A4, f7 LFPDUX A4, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L56 .align 4 .L57: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 fxcpmadd f2, B1, A3, f2 fxcsmadd f6, B1, A3, f6 fxcpmadd f3, B1, A4, f3 fxcsmadd f7, B1, A4, f7 .align 4 .L58: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 8 #else subi r0, KK, 2 #endif slwi TEMP, r0, 3 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) fpmr f24, f0 fpmr f25, f1 fpmr f26, f2 fpmr f27, f3 fsmfp f0, f4 fsmfp f1, f5 fsmfp f2, f6 fsmfp f3, f7 fsmtp f4, f24 fsmtp f5, f25 fsmtp f6, f26 fsmtp f7, f27 LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 LFPDUX f18, BO, INC2 LFPDUX f19, BO, INC2 LFPDUX f20, BO, INC2 LFPDUX f21, BO, INC2 LFPDUX f22, BO, INC2 LFPDUX f23, BO, INC2 subi BO, BO, 16 * SIZE fpsub f0, f16, f0 fpsub f4, f17, f4 fpsub f1, f18, f1 fpsub f5, f19, f5 fpsub f2, f20, f2 fpsub f6, f21, f6 fpsub f3, f22, f3 fpsub f7, f23, f7 #else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 LFPDUX f18, AO, INC2 LFPDUX f19, AO, INC2 LFPDUX f20, AO, INC2 LFPDUX f21, AO, INC2 LFPDUX f22, AO, INC2 LFPDUX f23, AO, INC2 subi AO, AO, 16 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f2, f18, f2 fpsub f3, f19, f3 fpsub f4, f20, f4 fpsub f5, f21, f5 fpsub f6, f22, f6 fpsub f7, f23, f7 #endif #ifdef LN addi AO, AO, 66 * SIZE LFPDUX A1, AO, INCM2 LFPDUX A2, AO, INCM2 LFPDUX A3, AO, INCM2 LFPDUX A4, AO, INCM2 LFPDUX A5, AO, INCM2 LFPDUX A6, AO, INCM2 LFPDUX A7, AO, INCM2 LFPDUX A8, AO, INCM2 fxsmul f7, A1, f7 fxcpnmsub f3, A1, f7, f3 fxcsnmsub f6, A2, f7, f6 fxcpnmsub f2, A2, f7, f2 fxcsnmsub f5, A3, f7, f5 fxcpnmsub f1, A3, f7, f1 fxcsnmsub f4, A4, f7, f4 fxcpnmsub f0, A4, f7, f0 fxpmul f3, A5, f3 fxcsnmsub f6, A6, f3, f6 fxcpnmsub f2, A6, f3, f2 fxcsnmsub f5, A7, f3, f5 fxcpnmsub f1, A7, f3, f1 fxcsnmsub f4, A8, f3, f4 fxcpnmsub f0, A8, f3, f0 add AO, AO, INCM2 LFPDUX A1, AO, INCM2 LFPDUX A2, AO, INCM2 LFPDUX A3, AO, INCM2 add AO, AO, INCM2 LFPDUX A4, AO, INCM2 LFPDUX A5, AO, INCM2 LFPDUX A6, AO, INCM2 add AO, AO, INCM2 add AO, AO, INCM2 LFPDUX A7, AO, INCM2 LFPDUX A8, AO, INCM2 fxsmul f6, A1, f6 fxcpnmsub f2, A1, f6, f2 fxcsnmsub f5, A2, f6, f5 fxcpnmsub f1, A2, f6, f1 fxcsnmsub f4, A3, f6, f4 fxcpnmsub f0, A3, f6, f0 fxpmul f2, A4, f2 fxcsnmsub f5, A5, f2, f5 fxcpnmsub f1, A5, f2, f1 fxcsnmsub f4, A6, f2, f4 fxcpnmsub f0, A6, f2, f0 fxsmul f5, A7, f5 fxcpnmsub f1, A7, f5, f1 fxcsnmsub f4, A8, f5, f4 fxcpnmsub f0, A8, f5, f0 add AO, AO, INCM2 add AO, AO, INCM2 LFPDUX A1, AO, INCM2 LFPDUX A2, AO, INCM2 subi AO, AO, 6 * SIZE LFPDUX A3, AO, INCM2 subi AO, AO, 6 * SIZE LFPDUX A4, AO, INCM2 addi AO, AO, -2 * SIZE fxpmul f1, A1, f1 fxcsnmsub f4, A2, f1, f4 fxcpnmsub f0, A2, f1, f0 fxsmul f4, A3, f4 fxcpnmsub f0, A3, f4, f0 fxpmul f0, A4, f0 #endif #ifdef LT LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 fxpmul f0, A1, f0 fxcsnmsub f4, A1, f0, f4 fxcpnmsub f1, A2, f0, f1 fxcsnmsub f5, A2, f0, f5 fxcpnmsub f2, A3, f0, f2 fxcsnmsub f6, 
A3, f0, f6 fxcpnmsub f3, A4, f0, f3 fxcsnmsub f7, A4, f0, f7 fxsmul f4, A5, f4 fxcpnmsub f1, A6, f4, f1 fxcsnmsub f5, A6, f4, f5 fxcpnmsub f2, A7, f4, f2 fxcsnmsub f6, A7, f4, f6 fxcpnmsub f3, A8, f4, f3 fxcsnmsub f7, A8, f4, f7 add AO, AO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 add AO, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 add AO, AO, INC2 add AO, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 fxpmul f1, A1, f1 fxcsnmsub f5, A1, f1, f5 fxcpnmsub f2, A2, f1, f2 fxcsnmsub f6, A2, f1, f6 fxcpnmsub f3, A3, f1, f3 fxcsnmsub f7, A3, f1, f7 fxsmul f5, A4, f5 fxcpnmsub f2, A5, f5, f2 fxcsnmsub f6, A5, f5, f6 fxcpnmsub f3, A6, f5, f3 fxcsnmsub f7, A6, f5, f7 fxpmul f2, A7, f2 fxcsnmsub f6, A7, f2, f6 fxcpnmsub f3, A8, f2, f3 fxcsnmsub f7, A8, f2, f7 add AO, AO, INC2 add AO, AO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 addi AO, AO, 6 * SIZE LFPDUX A3, AO, INC2 addi AO, AO, 6 * SIZE LFPDUX A4, AO, INC2 subi AO, AO, 64 * SIZE fxsmul f6, A1, f6 fxcpnmsub f3, A2, f6, f3 fxcsnmsub f7, A2, f6, f7 fxpmul f3, A3, f3 fxcsnmsub f7, A3, f3, f7 fxsmul f7, A4, f7 #endif #ifdef RN LFPDUX A1, BO, INC2 LFPDUX A2, BO, INC2 subi BO, BO, 4 * SIZE fxpmul f0, A1, f0 fxpmul f1, A1, f1 fxpmul f2, A1, f2 fxpmul f3, A1, f3 fxcsnmsub f4, A1, f0, f4 fxcsnmsub f5, A1, f1, f5 fxcsnmsub f6, A1, f2, f6 fxcsnmsub f7, A1, f3, f7 fxsmul f4, A2, f4 fxsmul f5, A2, f5 fxsmul f6, A2, f6 fxsmul f7, A2, f7 #endif #ifdef RT LFPDUX A2, BO, INC2 LFPDUX A1, BO, INC2 subi BO, BO, 4 * SIZE fxsmul f4, A1, f4 fxsmul f5, A1, f5 fxsmul f6, A1, f6 fxsmul f7, A1, f7 fxcpnmsub f0, A1, f4, f0 fxcpnmsub f1, A1, f5, f1 fxcpnmsub f2, A1, f6, f2 fxcpnmsub f3, A1, f7, f3 fxpmul f0, A2, f0 fxpmul f1, A2, f1 fxpmul f2, A2, f2 fxpmul f3, A2, f3 #endif #ifdef LN subi CO1, CO1, 8 * SIZE subi CO2, CO2, 8 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f4, BO, INC2 STFPDUX f1, BO, INC2 STFPDUX f5, BO, INC2 STFPDUX f2, BO, INC2 STFPDUX f6, BO, INC2 STFPDUX f3, BO, INC2 STFPDUX f7, BO, INC2 subi BO, BO, 16 * SIZE STFDUX f0, CO1, INC STFDUX f4, CO1, INC STFDUX f1, CO1, INC STFDUX f5, CO1, INC STFDUX f2, CO1, INC STFDUX f6, CO1, INC STFDUX f3, CO1, INC STFDUX f7, CO1, INC STFSDUX f0, CO2, INC STFSDUX f4, CO2, INC STFSDUX f1, CO2, INC STFSDUX f5, CO2, INC STFSDUX f2, CO2, INC STFSDUX f6, CO2, INC STFSDUX f3, CO2, INC STFSDUX f7, CO2, INC #else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 STFPDUX f2, AO, INC2 STFPDUX f3, AO, INC2 STFPDUX f4, AO, INC2 STFPDUX f5, AO, INC2 STFPDUX f6, AO, INC2 STFPDUX f7, AO, INC2 subi AO, AO, 16 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC STFDUX f4, CO2, INC STFSDUX f4, CO2, INC STFDUX f5, CO2, INC STFSDUX f5, CO2, INC STFDUX f6, CO2, INC STFSDUX f6, CO2, INC STFDUX f7, CO2, INC STFSDUX f7, CO2, INC #endif #ifdef LN subi CO1, CO1, 8 * SIZE subi CO2, CO2, 8 * SIZE #endif #ifdef RT slwi r0, K, 3 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 3 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 8 #endif #ifdef LN subi KK, KK, 8 #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L51 .align 4 .L60: andi. I, M, 4 beq .L70 #if defined(LT) || defined(RN) fpmr f1, f0 addi BO, B, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 srawi. 
r0, KK, 2 mtspr CTR, r0 ble .L64 #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 2 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK fpmr f1, f0 addi BO, BO, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L64 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L63 .align 4 .L62: fxcpmadd f0, B1, A1, f0 fxcsmadd f2, B1, A1, f2 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 fxcsmadd f3, B1, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f0, B2, A3, f0 fxcsmadd f2, B2, A3, f2 LFPDUX A3, AO, INC2 fxcpmadd f1, B2, A4, f1 fxcsmadd f3, B2, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A5, f0 fxcsmadd f2, B3, A5, f2 LFPDUX A5, AO, INC2 fxcpmadd f1, B3, A6, f1 fxcsmadd f3, B3, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f0, B4, A7, f0 fxcsmadd f2, B4, A7, f2 LFPDUX A7, AO, INC2 fxcpmadd f1, B4, A8, f1 fxcsmadd f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L62 .align 4 .L63: fxcpmadd f0, B1, A1, f0 fxcsmadd f2, B1, A1, f2 fxcpmadd f1, B1, A2, f1 fxcsmadd f3, B1, A2, f3 fxcpmadd f0, B2, A3, f0 fxcsmadd f2, B2, A3, f2 fxcpmadd f1, B2, A4, f1 fxcsmadd f3, B2, A4, f3 fxcpmadd f0, B3, A5, f0 fxcsmadd f2, B3, A5, f2 fxcpmadd f1, B3, A6, f1 fxcsmadd f3, B3, A6, f3 fxcpmadd f0, B4, A7, f0 fxcsmadd f2, B4, A7, f2 fxcpmadd f1, B4, A8, f1 fxcsmadd f3, B4, A8, f3 .align 4 .L64: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L68 #else andi. r0, TEMP, 3 mtspr CTR, r0 ble+ .L68 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdz- .L67 .align 4 .L66: fxcpmadd f0, B1, A1, f0 fxcsmadd f2, B1, A1, f2 LFPDUX A1, AO, INC2 fxcpmadd f1, B1, A2, f1 fxcsmadd f3, B1, A2, f3 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdnz+ .L66 .align 4 .L67: fxcpmadd f0, B1, A1, f0 fxcsmadd f2, B1, A1, f2 fxcpmadd f1, B1, A2, f1 fxcsmadd f3, B1, A2, f3 .align 4 .L68: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 2 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) fpmr f24, f0 fpmr f25, f1 fsmfp f0, f2 fsmfp f1, f3 fsmtp f2, f24 fsmtp f3, f25 LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 LFPDUX f18, BO, INC2 LFPDUX f19, BO, INC2 subi BO, BO, 8 * SIZE fpsub f0, f16, f0 fpsub f2, f17, f2 fpsub f1, f18, f1 fpsub f3, f19, f3 #else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 LFPDUX f18, AO, INC2 LFPDUX f19, AO, INC2 subi AO, AO, 8 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f2, f18, f2 fpsub f3, f19, f3 #endif #ifdef LN addi AO, AO, 18 * SIZE LFPDUX A1, AO, INCM2 LFPDUX A2, AO, INCM2 LFPDUX A3, AO, INCM2 LFPDUX A4, AO, INCM2 add AO, AO, INCM2 LFPDUX A5, AO, INCM2 add AO, AO, INCM2 LFPDUX A6, AO, INCM2 subi AO, AO, 2 * SIZE fxsmul f3, A1, f3 fxcpnmsub f1, A1, f3, f1 fxcsnmsub f2, A2, f3, f2 fxcpnmsub f0, A2, f3, f0 fxpmul f1, A3, f1 fxcsnmsub f2, A4, f1, f2 fxcpnmsub f0, A4, f1, f0 fxsmul f2, A5, f2 fxcpnmsub f0, A5, f2, f0 fxpmul f0, A6, f0 #endif #ifdef LT LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 add AO, AO, INC2 LFPDUX A5, AO, INC2 add AO, AO, INC2 LFPDUX A6, AO, INC2 subi AO, AO, 16 * SIZE fxpmul f0, A1, f0 fxcsnmsub f2, A1, f0, f2 fxcpnmsub f1, A2, f0, f1 fxcsnmsub f3, A2, f0, f3 
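/* LT solve of the 4x2 tile (descriptive note): row 0 has just been scaled and
   eliminated from rows 1-3; the fxsmul/fxpmul and fxc*nmsub steps that follow
   repeat that forward substitution for rows 1, 2 and 3 of the packed 4x4
   triangular panel loaded into A1-A6.  Both right-hand-side columns travel
   together in the primary/secondary halves of f0-f3, and the panel's diagonal
   entries are evidently stored pre-inverted (the solve only ever multiplies,
   never divides), presumably by the accompanying trsm copy routines. */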
fxsmul f2, A3, f2 fxcpnmsub f1, A4, f2, f1 fxcsnmsub f3, A4, f2, f3 fxpmul f1, A5, f1 fxcsnmsub f3, A5, f1, f3 fxsmul f3, A6, f3 #endif #ifdef RN LFPDUX A1, BO, INC2 LFPDUX A2, BO, INC2 subi BO, BO, 4 * SIZE fxpmul f0, A1, f0 fxpmul f1, A1, f1 fxcsnmsub f2, A1, f0, f2 fxcsnmsub f3, A1, f1, f3 fxsmul f2, A2, f2 fxsmul f3, A2, f3 #endif #ifdef RT LFPDUX A2, BO, INC2 LFPDUX A1, BO, INC2 subi BO, BO, 4 * SIZE fxsmul f2, A1, f2 fxsmul f3, A1, f3 fxcpnmsub f0, A1, f2, f0 fxcpnmsub f1, A1, f3, f1 fxpmul f0, A2, f0 fxpmul f1, A2, f1 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f2, BO, INC2 STFPDUX f1, BO, INC2 STFPDUX f3, BO, INC2 subi BO, BO, 8 * SIZE STFDUX f0, CO1, INC STFDUX f2, CO1, INC STFDUX f1, CO1, INC STFDUX f3, CO1, INC STFSDUX f0, CO2, INC STFSDUX f2, CO2, INC STFSDUX f1, CO2, INC STFSDUX f3, CO2, INC #else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 STFPDUX f2, AO, INC2 STFPDUX f3, AO, INC2 subi AO, AO, 8 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO2, INC STFSDUX f2, CO2, INC STFDUX f3, CO2, INC STFSDUX f3, CO2, INC #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L70: andi. I, M, 2 beq .L80 #if defined(LT) || defined(RN) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 3 mtspr CTR, r0 ble .L74 #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 1 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L74 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdz- .L73 .align 4 .L72: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 fxcpmadd f2, B2, A2, f2 fxcsmadd f3, B2, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 fxcpmadd f0, B3, A3, f0 fxcsmadd f1, B3, A3, f1 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 fxcpmadd f2, B4, A4, f2 fxcsmadd f3, B4, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 fxcpmadd f0, B5, A5, f0 fxcsmadd f1, B5, A5, f1 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 fxcpmadd f2, B6, A6, f2 fxcsmadd f3, B6, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 fxcpmadd f0, A9, A7, f0 fxcsmadd f1, A9, A7, f1 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 fxcpmadd f2, A10, A8, f2 fxcsmadd f3, A10, A8, f3 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdnz+ .L72 .align 4 .L73: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 fxcpmadd f2, B2, A2, f2 fxcsmadd f3, B2, A2, f3 fxcpmadd f0, B3, A3, f0 fxcsmadd f1, B3, A3, f1 fxcpmadd f2, B4, A4, f2 fxcsmadd f3, B4, A4, f3 fxcpmadd f0, B5, A5, f0 fxcsmadd f1, B5, A5, f1 fxcpmadd f2, B6, A6, f2 fxcsmadd f3, B6, A6, f3 fxcpmadd f0, A9, A7, f0 fxcsmadd f1, A9, A7, f1 fxcpmadd f2, A10, A8, f2 fxcsmadd f3, A10, A8, f3 .align 4 .L74: #if defined(LT) || defined(RN) andi. 
r0, KK, 7 mtspr CTR, r0 ble+ .L78 #else andi. r0, TEMP, 7 mtspr CTR, r0 ble+ .L78 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdz- .L77 .align 4 .L76: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L76 .align 4 .L77: fxcpmadd f0, B1, A1, f0 fxcsmadd f1, B1, A1, f1 .align 4 .L78: fpadd f0, f0, f2 fpadd f1, f1, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 2 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) fpmr f24, f0 fsmfp f0, f1 fsmtp f1, f24 LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 subi BO, BO, 4 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 #else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 subi AO, AO, 4 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 #endif #ifdef LN LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 addi AO, AO, -4 * SIZE fxsmul f1, A2, f1 fxcpnmsub f0, A2, f1, f0 fxpmul f0, A1, f0 #endif #ifdef LT LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 addi AO, AO, -4 * SIZE fxpmul f0, A1, f0 fxcsnmsub f1, A1, f0, f1 fxsmul f1, A2, f1 #endif #ifdef RN LFPDUX A1, BO, INC2 LFPDUX A2, BO, INC2 subi BO, BO, 4 * SIZE fxpmul f0, A1, f0 fxcsnmsub f1, A1, f0, f1 fxsmul f1, A2, f1 #endif #ifdef RT LFPDUX A2, BO, INC2 LFPDUX A1, BO, INC2 subi BO, BO, 4 * SIZE fxsmul f1, A1, f1 fxcpnmsub f0, A1, f1, f0 fxpmul f0, A2, f0 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f1, BO, INC2 subi BO, BO, 4 * SIZE STFDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f0, CO2, INC STFSDUX f1, CO2, INC #else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 subi AO, AO, 4 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO2, INC STFSDUX f1, CO2, INC #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L80: andi. I, M, 1 beq .L89 #if defined(LT) || defined(RN) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 3 mtspr CTR, r0 ble .L84 #else #ifdef LN slwi r0, K, 0 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 0 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. 
r0, TEMP, 3 mtspr CTR, r0 ble .L84 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX B4, BO, INC2 bdz- .L83 .align 4 .L82: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC2 fxcsmadd f1, A1, B2, f1 LFPDUX B2, BO, INC2 LFPDUX A1, AO, INC2 fxcpmadd f2, A2, B3, f2 LFPDUX B3, BO, INC2 fxcsmadd f3, A2, B4, f3 LFPDUX B4, BO, INC2 LFPDUX A2, AO, INC2 fxcpmadd f0, A3, B1, f0 LFPDUX B1, BO, INC2 fxcsmadd f1, A3, B2, f1 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 fxcpmadd f2, A4, B3, f2 LFPDUX B3, BO, INC2 fxcsmadd f3, A4, B4, f3 LFPDUX B4, BO, INC2 LFPDUX A4, AO, INC2 bdnz+ .L82 .align 4 .L83: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC2 fxcsmadd f1, A1, B2, f1 LFPDUX B2, BO, INC2 fxcpmadd f2, A2, B3, f2 LFPDUX B3, BO, INC2 fxcsmadd f3, A2, B4, f3 LFPDUX B4, BO, INC2 fxcpmadd f0, A3, B1, f0 fxcsmadd f1, A3, B2, f1 fxcpmadd f2, A4, B3, f2 fxcsmadd f3, A4, B4, f3 .align 4 .L84: #if defined(LT) || defined(RN) andi. r0, KK, 7 mtspr CTR, r0 ble+ .L88 #else andi. r0, TEMP, 7 mtspr CTR, r0 ble+ .L88 #endif LFDX A1, AO, INC2 LFPDUX B1, BO, INC2 add AO, AO, INC bdz- .L87 .align 4 .L86: fxcpmadd f0, A1, B1, f0 LFDX A1, AO, INC2 LFPDUX B1, BO, INC2 add AO, AO, INC bdnz+ .L86 .align 4 .L87: fxcpmadd f0, A1, B1, f0 .align 4 .L88: fpadd f0, f0, f1 fpadd f2, f2, f3 fpadd f0, f0, f2 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDX f16, BO, INC2 fpsub f0, f16, f0 #else LFPDX f16, AO, INC2 fpsub f0, f16, f0 #endif #ifdef LN LFPDX A1, AO, INC2 fxpmul f0, A1, f0 #endif #ifdef LT LFPDX A1, AO, INC2 fxpmul f0, A1, f0 #endif #ifdef RN LFD A1, (2 + 0) * SIZE(BO) LFD A2, (2 + 1) * SIZE(BO) LFD A3, (2 + 3) * SIZE(BO) fsmtp f1, f0 fmul f0, A1, f0 fnmsub f1, A2, f0, f1 fmul f1, A3, f1 fsmfp f0, f1 #endif #ifdef RT LFD A1, (2 + 3) * SIZE(BO) LFD A2, (2 + 2) * SIZE(BO) LFD A3, (2 + 0) * SIZE(BO) fsmtp f1, f0 fmul f1, A1, f1 fnmsub f0, A2, f1, f0 fmul f0, A3, f0 fsmfp f0, f1 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE #endif #if defined(LN) || defined(LT) STFPDX f0, BO, INC2 STFDUX f0, CO1, INC STFSDUX f0, CO2, INC #else STFPDX f0, AO, INC2 STFDUX f0, CO1, INC STFDUX f1, CO2, INC #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 .L89: #ifdef LN slwi r0, K, 1 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) addi B, BO, 2 * SIZE #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif .align 4 .L90: srawi. J, N, 2 ble .L999 .align 4 .L10: #ifdef RT slwi r0, K, 2 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 2 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) addi AORIG, A, -4 * SIZE #else addi AO, A, -4 * SIZE #endif #ifndef RT add C, CO4, LDC #endif li r0, FZERO lfpsx f0, SP, r0 srawi. 
I, M, 3 ble .L20 .align 4 .L11: #if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 fpmr f5, f0 fpmr f9, f0 fpmr f13, f0 fpmr f2, f0 fpmr f6, f0 fpmr f10, f0 fpmr f14, f0 fpmr f3, f0 fpmr f7, f0 fpmr f11, f0 fpmr f15, f0 nop srawi. r0, KK, 2 fpmr f1, f0 mtspr CTR, r0 ble .L14 #else #ifdef LN slwi r0, K, 3 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 3 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 fpmr f5, f0 fpmr f9, f0 fpmr f13, f0 fpmr f2, f0 fpmr f6, f0 fpmr f10, f0 fpmr f14, f0 fpmr f3, f0 fpmr f7, f0 fpmr f11, f0 fpmr f15, f0 nop srawi. r0, TEMP, 2 fpmr f1, f0 mtspr CTR, r0 ble .L14 #endif LFPDUX A1, AO, INC4 fpmr f5, f0 LFPDUX A3, AO, INC4 fpmr f9, f0 LFPDUX B1, BO, INC4 fpmr f13, f0 LFPDUX A5, AO, INC4 fpmr f2, f0 LFPDUX A6, AO, INC4 fpmr f6, f0 LFPDUX B3, BO, INC4 fpmr f10, f0 LFPDUX A7, AO, INC4 fpmr f14, f0 LFPDUX A8, AO, INC4 fpmr f3, f0 LFPDUX B5, BO, INC4 fpmr f7, f0 LFPDUX A9, AO, INC4 fpmr f11, f0 LFPDUX A2, AO2, INC4 fpmr f15, f0 LFPDUX B2, BO2, INC4 bdz- .L13 .align 4 .L12: ## 1 ## fxcpmadd f0, B1, A1, f0 nop fxcsmadd f4, B1, A1, f4 nop fxcpmadd f8, B2, A1, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A1, f12 LFPDUX B6, BO, INC4 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 LFPDUX A10, AO, INC4 fxcsmadd f13, B2, A2, f13 nop fxcpmadd f2, B1, A3, f2 nop fxcsmadd f6, B1, A3, f6 nop fxcpmadd f10, B2, A3, f10 nop fxcsmadd f14, B2, A3, f14 nop fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 LFPDUX A1, AO, INC4 fxcsmadd f15, B2, A4, f15 nop ## 2 ## fxcpmadd f0, B3, A5, f0 nop fxcsmadd f4, B3, A5, f4 nop fxcpmadd f8, B4, A5, f8 LFPDUX B2, BO2, INC4 fxcsmadd f12, B4, A5, f12 LFPDUX B1, BO, INC4 fxcpmadd f1, B3, A2, f1 nop fxcsmadd f5, B3, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 LFPDUX A3, AO, INC4 fxcsmadd f13, B4, A2, f13 nop fxcpmadd f2, B3, A6, f2 nop fxcsmadd f6, B3, A6, f6 nop fxcpmadd f10, B4, A6, f10 nop fxcsmadd f14, B4, A6, f14 nop fxcpmadd f3, B3, A4, f3 nop fxcsmadd f7, B3, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B4, A4, f11 LFPDUX A5, AO, INC4 fxcsmadd f15, B4, A4, f15 nop ## 3 ## fxcpmadd f0, B5, A7, f0 nop fxcsmadd f4, B5, A7, f4 nop fxcpmadd f8, B2, A7, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A7, f12 LFPDUX B3, BO, INC4 fxcpmadd f1, B5, A2, f1 nop fxcsmadd f5, B5, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 LFPDUX A6, AO, INC4 fxcsmadd f13, B2, A2, f13 nop fxcpmadd f2, B5, A8, f2 nop fxcsmadd f6, B5, A8, f6 nop fxcpmadd f10, B2, A8, f10 nop fxcsmadd f14, B2, A8, f14 nop fxcpmadd f3, B5, A4, f3 nop fxcsmadd f7, B5, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 LFPDUX A7, AO, INC4 fxcsmadd f15, B2, A4, f15 nop ## 4 ## fxcpmadd f0, B6, A9, f0 nop fxcsmadd f4, B6, A9, f4 nop fxcpmadd f8, B4, A9, f8 LFPDUX B2, BO2, INC4 fxcsmadd f12, B4, A9, f12 LFPDUX B5, BO, INC4 fxcpmadd f1, B6, A2, f1 nop fxcsmadd f5, B6, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 LFPDUX A8, AO, INC4 fxcsmadd f13, B4, A2, f13 nop fxcpmadd f2, B6, A10, f2 nop fxcsmadd f6, B6, A10, f6 nop fxcpmadd f10, B4, A10, f10 nop fxcsmadd f14, B4, A10, f14 nop fxcpmadd f3, B6, A4, f3 LFPDUX A2, AO2, INC4 fxcsmadd f7, B6, A4, f7 LFPDUX A9, AO, INC4 fxcpmadd f11, B4, A4, f11 nop fxcsmadd f15, B4, A4, f15 bdnz+ .L12 .align 4 .L13: ## 
1 ## fxcpmadd f0, B1, A1, f0 nop fxcsmadd f4, B1, A1, f4 nop fxcpmadd f8, B2, A1, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A1, f12 LFPDUX B6, BO, INC4 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 LFPDUX A10, AO, INC4 fxcsmadd f13, B2, A2, f13 nop fxcpmadd f2, B1, A3, f2 nop fxcsmadd f6, B1, A3, f6 nop fxcpmadd f10, B2, A3, f10 nop fxcsmadd f14, B2, A3, f14 nop fxcpmadd f3, B1, A4, f3 nop fxcsmadd f7, B1, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 nop fxcsmadd f15, B2, A4, f15 nop ## 2 ## fxcpmadd f0, B3, A5, f0 nop fxcsmadd f4, B3, A5, f4 nop fxcpmadd f8, B4, A5, f8 LFPDUX B2, BO2, INC4 fxcsmadd f12, B4, A5, f12 nop fxcpmadd f1, B3, A2, f1 nop fxcsmadd f5, B3, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 nop fxcsmadd f13, B4, A2, f13 nop fxcpmadd f2, B3, A6, f2 nop fxcsmadd f6, B3, A6, f6 nop fxcpmadd f10, B4, A6, f10 nop fxcsmadd f14, B4, A6, f14 nop fxcpmadd f3, B3, A4, f3 nop fxcsmadd f7, B3, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B4, A4, f11 nop fxcsmadd f15, B4, A4, f15 nop ## 3 ## fxcpmadd f0, B5, A7, f0 nop fxcsmadd f4, B5, A7, f4 nop fxcpmadd f8, B2, A7, f8 LFPDUX B4, BO2, INC4 fxcsmadd f12, B2, A7, f12 nop fxcpmadd f1, B5, A2, f1 nop fxcsmadd f5, B5, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B2, A2, f9 nop fxcsmadd f13, B2, A2, f13 fxcpmadd f2, B5, A8, f2 nop fxcsmadd f6, B5, A8, f6 nop fxcpmadd f10, B2, A8, f10 nop fxcsmadd f14, B2, A8, f14 nop fxcpmadd f3, B5, A4, f3 nop fxcsmadd f7, B5, A4, f7 LFPDUX A2, AO2, INC4 fxcpmadd f11, B2, A4, f11 nop fxcsmadd f15, B2, A4, f15 nop ## 4 ## fxcpmadd f0, B6, A9, f0 nop fxcsmadd f4, B6, A9, f4 nop fxcpmadd f8, B4, A9, f8 nop fxcsmadd f12, B4, A9, f12 nop fxcpmadd f1, B6, A2, f1 nop fxcsmadd f5, B6, A2, f5 LFPDUX A4, AO2, INC4 fxcpmadd f9, B4, A2, f9 nop fxcsmadd f13, B4, A2, f13 nop fxcpmadd f2, B6, A10, f2 nop fxcsmadd f6, B6, A10, f6 nop fxcpmadd f10, B4, A10, f10 nop fxcsmadd f14, B4, A10, f14 nop fxcpmadd f3, B6, A4, f3 nop fxcsmadd f7, B6, A4, f7 nop fxcpmadd f11, B4, A4, f11 nop fxcsmadd f15, B4, A4, f15 nop .align 4 .L14: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L18 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L18 #endif .align 4 .L15: LFPDUX A2, AO, INC4 LFPDUX A4, AO2, INC4 LFPDUX A10, BO, INC4 LFPDUX B4, BO2, INC4 bdz- .L17 .align 4 .L16: fxcpmadd f0, A10, A2, f0 fxcsmadd f4, A10, A2, f4 fxcpmadd f8, B4, A2, f8 fxcsmadd f12, B4, A2, f12 LFPDUX A2, AO, INC4 fxcpmadd f1, A10, A4, f1 fxcsmadd f5, A10, A4, f5 fxcpmadd f9, B4, A4, f9 fxcsmadd f13, B4, A4, f13 LFPDUX A4, AO2, INC4 fxcpmadd f2, A10, A2, f2 fxcsmadd f6, A10, A2, f6 fxcpmadd f10, B4, A2, f10 fxcsmadd f14, B4, A2, f14 LFPDUX A2, AO, INC4 fxcpmadd f3, A10, A4, f3 fxcsmadd f7, A10, A4, f7 LFPDUX A10, BO, INC4 fxcpmadd f11, B4, A4, f11 fxcsmadd f15, B4, A4, f15 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 bdnz+ .L16 .align 4 .L17: fxcpmadd f0, A10, A2, f0 fxcsmadd f4, A10, A2, f4 fxcpmadd f8, B4, A2, f8 fxcsmadd f12, B4, A2, f12 LFPDUX A2, AO, INC4 fxcpmadd f1, A10, A4, f1 fxcsmadd f5, A10, A4, f5 fxcpmadd f9, B4, A4, f9 fxcsmadd f13, B4, A4, f13 LFPDUX A4, AO2, INC4 fxcpmadd f2, A10, A2, f2 fxcsmadd f6, A10, A2, f6 fxcpmadd f10, B4, A2, f10 fxcsmadd f14, B4, A2, f14 fxcpmadd f3, A10, A4, f3 fxcsmadd f7, A10, A4, f7 fxcpmadd f11, B4, A4, f11 fxcsmadd f15, B4, A4, f15 .align 4 .L18: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 8 #else subi r0, KK, 4 #endif slwi TEMP, r0, 3 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE addi BO, BO, - 4 * SIZE addi BO2, BO, 2 * SIZE #endif #if defined(LN) || defined(LT) fpmr f24, f0 LFPDUX f16, BO, INC4 fpmr f25, f1 nop fpmr f26, f2 LFPDUX f17, BO2, INC4 fpmr f27, f3 nop fpmr f28, f8 LFPDUX f18, BO, INC4 fpmr f29, f9 nop fpmr f30, f10 LFPDUX f19, BO2, INC4 fpmr f31, f11 nop fsmfp f0, f4 LFPDUX f20, BO, INC4 fsmfp f1, f5 nop fsmfp f2, f6 LFPDUX f21, BO2, INC4 fsmfp f3, f7 nop fsmfp f8, f12 LFPDUX f22, BO, INC4 fsmfp f9, f13 nop fsmfp f10, f14 LFPDUX f23, BO2, INC4 fsmfp f11, f15 nop fsmtp f4, f24 LFPDUX f24, BO, INC4 fsmtp f5, f25 nop fsmtp f6, f26 LFPDUX f25, BO2, INC4 fsmtp f7, f27 nop fsmtp f12, f28 LFPDUX f26, BO, INC4 fsmtp f13, f29 nop fsmtp f14, f30 LFPDUX f27, BO2, INC4 fsmtp f15, f31 nop fpsub f0, f16, f0 LFPDUX f28, BO, INC4 fpsub f8, f17, f8 nop fpsub f4, f18, f4 LFPDUX f29, BO2, INC4 fpsub f12, f19, f12 nop fpsub f1, f20, f1 LFPDUX f30, BO, INC4 fpsub f9, f21, f9 subi BO, BO, 32 * SIZE fpsub f5, f22, f5 LFPDUX f31, BO2, INC4 fpsub f13, f23, f13 subi BO2, BO2, 32 * SIZE fpsub f2, f24, f2 fpsub f10, f25, f10 fpsub f6, f26, f6 fpsub f14, f27, f14 fpsub f3, f28, f3 fpsub f11, f29, f11 fpsub f7, f30, f7 fpsub f15, f31, f15 #else LFPDUX f16, AO, INC4 LFPDUX f17, AO2, INC4 LFPDUX f18, AO, INC4 LFPDUX f19, AO2, INC4 LFPDUX f20, AO, INC4 LFPDUX f21, AO2, INC4 LFPDUX f22, AO, INC4 LFPDUX f23, AO2, INC4 fpsub f0, f16, f0 LFPDUX f24, AO, INC4 fpsub f1, f17, f1 LFPDUX f25, AO2, INC4 fpsub f2, f18, f2 LFPDUX f26, AO, INC4 fpsub f3, f19, f3 LFPDUX f27, AO2, INC4 fpsub f4, f20, f4 LFPDUX f28, AO, INC4 fpsub f5, f21, f5 LFPDUX f29, AO2, INC4 fpsub f6, f22, f6 LFPDUX f30, AO, INC4 fpsub f7, f23, f7 LFPDUX f31, AO2, INC4 fpsub f8, f24, f8 subi AO, AO, 32 * SIZE fpsub f9, f25, f9 subi AO2, AO2, 32 * SIZE fpsub f10, f26, f10 fpsub f11, f27, f11 fpsub f12, f28, f12 fpsub f13, f29, f13 fpsub f14, f30, f14 fpsub f15, f31, f15 #endif #ifdef LN addi AO, AO, 68 * SIZE addi AO2, AO2, 68 * SIZE LFPDUX A1, AO2, INCM4 LFPDUX A2, AO, INCM4 LFPDUX A3, AO2, INCM4 LFPDUX A4, AO, INCM4 LFPDUX A5, AO2, INCM4 LFPDUX A6, AO, INCM4 LFPDUX A7, AO2, INCM4 LFPDUX A8, AO, INCM4 fxsmul f7, A1, f7 fxsmul f15, A1, f15 fxcpnmsub f3, A1, f7, f3 fxcpnmsub 
f11, A1, f15, f11 fxcsnmsub f6, A2, f7, f6 fxcsnmsub f14, A2, f15, f14 fxcpnmsub f2, A2, f7, f2 fxcpnmsub f10, A2, f15, f10 fxcsnmsub f5, A3, f7, f5 fxcsnmsub f13, A3, f15, f13 fxcpnmsub f1, A3, f7, f1 fxcpnmsub f9, A3, f15, f9 fxcsnmsub f4, A4, f7, f4 fxcsnmsub f12, A4, f15, f12 fxcpnmsub f0, A4, f7, f0 fxcpnmsub f8, A4, f15, f8 fxpmul f3, A5, f3 fxpmul f11, A5, f11 fxcsnmsub f6, A6, f3, f6 fxcsnmsub f14, A6, f11, f14 fxcpnmsub f2, A6, f3, f2 fxcpnmsub f10, A6, f11, f10 fxcsnmsub f5, A7, f3, f5 fxcsnmsub f13, A7, f11, f13 fxcpnmsub f1, A7, f3, f1 fxcpnmsub f9, A7, f11, f9 fxcsnmsub f4, A8, f3, f4 fxcsnmsub f12, A8, f11, f12 fxcpnmsub f0, A8, f3, f0 fxcpnmsub f8, A8, f11, f8 add AO2, AO2, INCM4 LFPDUX A1, AO, INCM4 LFPDUX A2, AO2, INCM4 LFPDUX A3, AO, INCM4 add AO2, AO2, INCM4 LFPDUX A4, AO, INCM4 LFPDUX A5, AO2, INCM4 LFPDUX A6, AO, INCM4 add AO2, AO2, INCM4 add AO, AO, INCM4 LFPDUX A7, AO2, INCM4 LFPDUX A8, AO, INCM4 fxsmul f6, A1, f6 fxsmul f14, A1, f14 fxcpnmsub f2, A1, f6, f2 fxcpnmsub f10, A1, f14, f10 fxcsnmsub f5, A2, f6, f5 fxcsnmsub f13, A2, f14, f13 fxcpnmsub f1, A2, f6, f1 fxcpnmsub f9, A2, f14, f9 fxcsnmsub f4, A3, f6, f4 fxcsnmsub f12, A3, f14, f12 fxcpnmsub f0, A3, f6, f0 fxcpnmsub f8, A3, f14, f8 fxpmul f2, A4, f2 fxpmul f10, A4, f10 fxcsnmsub f5, A5, f2, f5 fxcsnmsub f13, A5, f10, f13 fxcpnmsub f1, A5, f2, f1 fxcpnmsub f9, A5, f10, f9 fxcsnmsub f4, A6, f2, f4 fxcsnmsub f12, A6, f10, f12 fxcpnmsub f0, A6, f2, f0 fxcpnmsub f8, A6, f10, f8 fxsmul f5, A7, f5 fxsmul f13, A7, f13 fxcpnmsub f1, A7, f5, f1 fxcpnmsub f9, A7, f13, f9 fxcsnmsub f4, A8, f5, f4 fxcsnmsub f12, A8, f13, f12 fxcpnmsub f0, A8, f5, f0 fxcpnmsub f8, A8, f13, f8 add AO2, AO2, INCM4 add AO, AO, INCM4 LFPDUX A1, AO2, INCM4 LFPDUX A2, AO, INCM4 subi AO2, AO2, 8 * SIZE add AO, AO, INCM4 LFPDUX A3, AO, INCM4 subi AO2, AO2, 8 * SIZE add AO, AO, INCM4 LFPDUX A4, AO, INCM4 addi AO, AO, -4 * SIZE addi AO2, AO2, -4 * SIZE fxpmul f1, A1, f1 fxpmul f9, A1, f9 fxcsnmsub f4, A2, f1, f4 fxcsnmsub f12, A2, f9, f12 fxcpnmsub f0, A2, f1, f0 fxcpnmsub f8, A2, f9, f8 fxsmul f4, A3, f4 fxsmul f12, A3, f12 fxcpnmsub f0, A3, f4, f0 fxcpnmsub f8, A3, f12, f8 fxpmul f0, A4, f0 fxpmul f8, A4, f8 #endif #ifdef LT LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A4, AO2, INC4 LFPDUX A5, AO, INC4 LFPDUX A6, AO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A8, AO2, INC4 fxpmul f0, A1, f0 fxpmul f8, A1, f8 fxcsnmsub f4, A1, f0, f4 fxcsnmsub f12, A1, f8, f12 fxcpnmsub f1, A2, f0, f1 fxcpnmsub f9, A2, f8, f9 fxcsnmsub f5, A2, f0, f5 fxcsnmsub f13, A2, f8, f13 fxcpnmsub f2, A3, f0, f2 fxcpnmsub f10, A3, f8, f10 fxcsnmsub f6, A3, f0, f6 fxcsnmsub f14, A3, f8, f14 fxcpnmsub f3, A4, f0, f3 fxcpnmsub f11, A4, f8, f11 fxcsnmsub f7, A4, f0, f7 fxcsnmsub f15, A4, f8, f15 fxsmul f4, A5, f4 fxsmul f12, A5, f12 fxcpnmsub f1, A6, f4, f1 fxcpnmsub f9, A6, f12, f9 fxcsnmsub f5, A6, f4, f5 fxcsnmsub f13, A6, f12, f13 fxcpnmsub f2, A7, f4, f2 fxcpnmsub f10, A7, f12, f10 fxcsnmsub f6, A7, f4, f6 fxcsnmsub f14, A7, f12, f14 fxcpnmsub f3, A8, f4, f3 fxcpnmsub f11, A8, f12, f11 fxcsnmsub f7, A8, f4, f7 fxcsnmsub f15, A8, f12, f15 add AO, AO, INC4 LFPDUX A1, AO2, INC4 LFPDUX A2, AO, INC4 LFPDUX A3, AO2, INC4 add AO, AO, INC4 LFPDUX A4, AO2, INC4 LFPDUX A5, AO, INC4 LFPDUX A6, AO2, INC4 add AO, AO, INC4 add AO2, AO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A8, AO2, INC4 fxpmul f1, A1, f1 fxpmul f9, A1, f9 fxcsnmsub f5, A1, f1, f5 fxcsnmsub f13, A1, f9, f13 fxcpnmsub f2, A2, f1, f2 fxcpnmsub f10, A2, f9, f10 fxcsnmsub f6, A2, f1, f6 fxcsnmsub f14, A2, f9, f14 
fxcpnmsub f3, A3, f1, f3 fxcpnmsub f11, A3, f9, f11 fxcsnmsub f7, A3, f1, f7 fxcsnmsub f15, A3, f9, f15 fxsmul f5, A4, f5 fxsmul f13, A4, f13 fxcpnmsub f2, A5, f5, f2 fxcpnmsub f10, A5, f13, f10 fxcsnmsub f6, A5, f5, f6 fxcsnmsub f14, A5, f13, f14 fxcpnmsub f3, A6, f5, f3 fxcpnmsub f11, A6, f13, f11 fxcsnmsub f7, A6, f5, f7 fxcsnmsub f15, A6, f13, f15 fxpmul f2, A7, f2 fxpmul f10, A7, f10 fxcsnmsub f6, A7, f2, f6 fxcsnmsub f14, A7, f10, f14 fxcpnmsub f3, A8, f2, f3 fxcpnmsub f11, A8, f10, f11 fxcsnmsub f7, A8, f2, f7 fxcsnmsub f15, A8, f10, f15 add AO, AO, INC4 add AO2, AO2, INC4 LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 addi AO, AO, 8 * SIZE addi AO2, AO2, 4 * SIZE LFPDUX A3, AO2, INC4 addi AO, AO, 8 * SIZE addi AO2, AO2, 4 * SIZE LFPDUX A4, AO2, INC4 subi AO, AO, 64 * SIZE subi AO2, AO2, 64 * SIZE fxsmul f6, A1, f6 fxsmul f14, A1, f14 fxcpnmsub f3, A2, f6, f3 fxcpnmsub f11, A2, f14, f11 fxcsnmsub f7, A2, f6, f7 fxcsnmsub f15, A2, f14, f15 fxpmul f3, A3, f3 fxpmul f11, A3, f11 fxcsnmsub f7, A3, f3, f7 fxcsnmsub f15, A3, f11, f15 fxsmul f7, A4, f7 fxsmul f15, A4, f15 #endif #ifdef RN LFPDUX A1, BO, INC4 LFPDUX A2, BO2, INC4 LFPDUX A3, BO, INC4 LFPDUX A4, BO2, INC4 add BO, BO, INC4 LFPDUX A5, BO2, INC4 add BO, BO, INC4 LFPDUX A6, BO2, INC4 subi BO, BO, 16 * SIZE subi BO2, BO2, 16 * SIZE fxpmul f0, A1, f0 fxpmul f1, A1, f1 fxpmul f2, A1, f2 fxpmul f3, A1, f3 fxcsnmsub f4, A1, f0, f4 fxcsnmsub f5, A1, f1, f5 fxcsnmsub f6, A1, f2, f6 fxcsnmsub f7, A1, f3, f7 fxcpnmsub f8, A2, f0, f8 fxcpnmsub f9, A2, f1, f9 fxcpnmsub f10, A2, f2, f10 fxcpnmsub f11, A2, f3, f11 fxcsnmsub f12, A2, f0, f12 fxcsnmsub f13, A2, f1, f13 fxcsnmsub f14, A2, f2, f14 fxcsnmsub f15, A2, f3, f15 fxsmul f4, A3, f4 fxsmul f5, A3, f5 fxsmul f6, A3, f6 fxsmul f7, A3, f7 fxcpnmsub f8, A4, f4, f8 fxcpnmsub f9, A4, f5, f9 fxcpnmsub f10, A4, f6, f10 fxcpnmsub f11, A4, f7, f11 fxcsnmsub f12, A4, f4, f12 fxcsnmsub f13, A4, f5, f13 fxcsnmsub f14, A4, f6, f14 fxcsnmsub f15, A4, f7, f15 fxpmul f8, A5, f8 fxpmul f9, A5, f9 fxpmul f10, A5, f10 fxpmul f11, A5, f11 fxcsnmsub f12, A5, f8, f12 fxcsnmsub f13, A5, f9, f13 fxcsnmsub f14, A5, f10, f14 fxcsnmsub f15, A5, f11, f15 fxsmul f12, A6, f12 fxsmul f13, A6, f13 fxsmul f14, A6, f14 fxsmul f15, A6, f15 #endif #ifdef RT addi BO, BO, 20 * SIZE addi BO2, BO2, 20 * SIZE LFPDUX A1, BO2, INCM4 LFPDUX A2, BO, INCM4 LFPDUX A3, BO2, INCM4 LFPDUX A4, BO, INCM4 add BO2, BO2, INCM4 LFPDUX A5, BO, INCM4 add BO2, BO2, INCM4 LFPDUX A6, BO, INCM4 subi BO, BO, 4 * SIZE subi BO2, BO2, 4 * SIZE fxsmul f12, A1, f12 fxsmul f13, A1, f13 fxsmul f14, A1, f14 fxsmul f15, A1, f15 fxcpnmsub f8, A1, f12, f8 fxcpnmsub f9, A1, f13, f9 fxcpnmsub f10, A1, f14, f10 fxcpnmsub f11, A1, f15, f11 fxcsnmsub f4, A2, f12, f4 fxcsnmsub f5, A2, f13, f5 fxcsnmsub f6, A2, f14, f6 fxcsnmsub f7, A2, f15, f7 fxcpnmsub f0, A2, f12, f0 fxcpnmsub f1, A2, f13, f1 fxcpnmsub f2, A2, f14, f2 fxcpnmsub f3, A2, f15, f3 fxpmul f8, A3, f8 fxpmul f9, A3, f9 fxpmul f10, A3, f10 fxpmul f11, A3, f11 fxcsnmsub f4, A4, f8, f4 fxcsnmsub f5, A4, f9, f5 fxcsnmsub f6, A4, f10, f6 fxcsnmsub f7, A4, f11, f7 fxcpnmsub f0, A4, f8, f0 fxcpnmsub f1, A4, f9, f1 fxcpnmsub f2, A4, f10, f2 fxcpnmsub f3, A4, f11, f3 fxsmul f4, A5, f4 fxsmul f5, A5, f5 fxsmul f6, A5, f6 fxsmul f7, A5, f7 fxcpnmsub f0, A5, f4, f0 fxcpnmsub f1, A5, f5, f1 fxcpnmsub f2, A5, f6, f2 fxcpnmsub f3, A5, f7, f3 fxpmul f0, A6, f0 fxpmul f1, A6, f1 fxpmul f2, A6, f2 fxpmul f3, A6, f3 #endif #ifdef LN subi CO1, CO1, 8 * SIZE subi CO2, CO2, 8 * SIZE subi CO3, CO3, 8 * SIZE subi CO4, CO4, 8 * SIZE 
#endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC4 STFPDUX f8, BO2, INC4 STFPDUX f4, BO, INC4 STFPDUX f12, BO2, INC4 STFPDUX f1, BO, INC4 STFPDUX f9, BO2, INC4 STFPDUX f5, BO, INC4 STFPDUX f13, BO2, INC4 STFPDUX f2, BO, INC4 STFPDUX f10, BO2, INC4 STFPDUX f6, BO, INC4 STFPDUX f14, BO2, INC4 STFPDUX f3, BO, INC4 STFPDUX f11, BO2, INC4 STFPDUX f7, BO, INC4 STFPDUX f15, BO2, INC4 subi BO, BO, 32 * SIZE subi BO2, BO2, 32 * SIZE STFDUX f0, CO1, INC STFDUX f4, CO1, INC STFDUX f1, CO1, INC STFDUX f5, CO1, INC STFDUX f2, CO1, INC STFDUX f6, CO1, INC STFDUX f3, CO1, INC STFDUX f7, CO1, INC STFSDUX f0, CO2, INC STFSDUX f4, CO2, INC STFSDUX f1, CO2, INC STFSDUX f5, CO2, INC STFSDUX f2, CO2, INC STFSDUX f6, CO2, INC STFSDUX f3, CO2, INC STFSDUX f7, CO2, INC STFDUX f8, CO3, INC STFDUX f12, CO3, INC STFDUX f9, CO3, INC STFDUX f13, CO3, INC STFDUX f10, CO3, INC STFDUX f14, CO3, INC STFDUX f11, CO3, INC STFDUX f15, CO3, INC STFSDUX f8, CO4, INC STFSDUX f12, CO4, INC STFSDUX f9, CO4, INC STFSDUX f13, CO4, INC STFSDUX f10, CO4, INC STFSDUX f14, CO4, INC STFSDUX f11, CO4, INC STFSDUX f15, CO4, INC #else STFPDUX f0, AO, INC4 STFPDUX f1, AO2, INC4 STFPDUX f2, AO, INC4 STFPDUX f3, AO2, INC4 STFPDUX f4, AO, INC4 STFPDUX f5, AO2, INC4 STFPDUX f6, AO, INC4 STFPDUX f7, AO2, INC4 STFPDUX f8, AO, INC4 STFPDUX f9, AO2, INC4 STFPDUX f10, AO, INC4 STFPDUX f11, AO2, INC4 STFPDUX f12, AO, INC4 STFPDUX f13, AO2, INC4 STFPDUX f14, AO, INC4 STFPDUX f15, AO2, INC4 subi AO, AO, 32 * SIZE subi AO2, AO2, 32 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC STFDUX f4, CO2, INC STFSDUX f4, CO2, INC STFDUX f5, CO2, INC STFSDUX f5, CO2, INC STFDUX f6, CO2, INC STFSDUX f6, CO2, INC STFDUX f7, CO2, INC STFSDUX f7, CO2, INC STFDUX f8, CO3, INC STFSDUX f8, CO3, INC STFDUX f9, CO3, INC STFSDUX f9, CO3, INC STFDUX f10, CO3, INC STFSDUX f10, CO3, INC STFDUX f11, CO3, INC STFSDUX f11, CO3, INC STFDUX f12, CO4, INC STFSDUX f12, CO4, INC STFDUX f13, CO4, INC STFSDUX f13, CO4, INC STFDUX f14, CO4, INC STFSDUX f14, CO4, INC STFDUX f15, CO4, INC STFSDUX f15, CO4, INC #endif #ifdef LN subi CO1, CO1, 8 * SIZE subi CO2, CO2, 8 * SIZE subi CO3, CO3, 8 * SIZE subi CO4, CO4, 8 * SIZE #endif #ifdef RT slwi r0, K, 3 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 3 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 8 #endif #ifdef LN subi KK, KK, 8 #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L11 .align 4 .L20: andi. I, M, 4 beq .L30 #if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 srawi. r0, KK, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, r0 fpmr f13, f0 ble .L24 #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 2 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 srawi. 
r0, TEMP, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, r0 fpmr f13, f0 ble .L24 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX B3, BO, INC4 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 LFPDUX A5, AO, INC4 LFPDUX B5, BO, INC4 LFPDUX A6, AO2, INC4 LFPDUX B6, BO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A9, BO, INC4 LFPDUX A10, BO2, INC4 bdz- .L23 .align 4 .L22: fxcpmadd f0, B1, A1, f0 nop fxcsmadd f4, B1, A1, f4 LFPDUX A8, AO2, INC4 fxcpmadd f8, B2, A1, f8 nop fxcsmadd f12, B2, A1, f12 LFPDUX A1, AO, INC4 fxcpmadd f1, B1, A2, f1 nop fxcsmadd f5, B1, A2, f5 LFPDUX B1, BO, INC4 fxcpmadd f9, B2, A2, f9 nop fxcsmadd f13, B2, A2, f13 LFPDUX B2, BO2, INC4 fxcpmadd f0, B3, A3, f0 nop fxcsmadd f4, B3, A3, f4 LFPDUX A2, AO2, INC4 fxcpmadd f8, B4, A3, f8 nop fxcsmadd f12, B4, A3, f12 LFPDUX A3, AO, INC4 fxcpmadd f1, B3, A4, f1 nop fxcsmadd f5, B3, A4, f5 LFPDUX B3, BO, INC4 fxcpmadd f9, B4, A4, f9 nop fxcsmadd f13, B4, A4, f13 LFPDUX B4, BO2, INC4 fxcpmadd f0, B5, A5, f0 nop fxcsmadd f4, B5, A5, f4 LFPDUX A4, AO2, INC4 fxcpmadd f8, B6, A5, f8 nop fxcsmadd f12, B6, A5, f12 LFPDUX A5, AO, INC4 fxcpmadd f1, B5, A6, f1 nop fxcsmadd f5, B5, A6, f5 LFPDUX B5, BO, INC4 fxcpmadd f9, B6, A6, f9 nop fxcsmadd f13, B6, A6, f13 LFPDUX B6, BO2, INC4 fxcpmadd f0, A9, A7, f0 nop fxcsmadd f4, A9, A7, f4 LFPDUX A6, AO2, INC4 fxcpmadd f8, A10, A7, f8 nop fxcsmadd f12, A10, A7, f12 LFPDUX A7, AO, INC4 fxcpmadd f1, A9, A8, f1 nop fxcsmadd f5, A9, A8, f5 LFPDUX A9, BO, INC4 fxcpmadd f9, A10, A8, f9 nop fxcsmadd f13, A10, A8, f13 LFPDUX A10, BO2, INC4 bdnz+ .L22 .align 4 .L23: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 LFPDUX A8, AO2, INC4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 fxcpmadd f9, B2, A2, f9 fxcsmadd f13, B2, A2, f13 fxcpmadd f0, B3, A3, f0 fxcsmadd f4, B3, A3, f4 fxcpmadd f8, B4, A3, f8 fxcsmadd f12, B4, A3, f12 fxcpmadd f1, B3, A4, f1 fxcsmadd f5, B3, A4, f5 fxcpmadd f9, B4, A4, f9 fxcsmadd f13, B4, A4, f13 fxcpmadd f0, B5, A5, f0 fxcsmadd f4, B5, A5, f4 fxcpmadd f8, B6, A5, f8 fxcsmadd f12, B6, A5, f12 fxcpmadd f1, B5, A6, f1 fxcsmadd f5, B5, A6, f5 fxcpmadd f9, B6, A6, f9 fxcsmadd f13, B6, A6, f13 fxcpmadd f0, A9, A7, f0 fxcsmadd f4, A9, A7, f4 fxcpmadd f8, A10, A7, f8 fxcsmadd f12, A10, A7, f12 fxcpmadd f1, A9, A8, f1 fxcsmadd f5, A9, A8, f5 fxcpmadd f9, A10, A8, f9 fxcsmadd f13, A10, A8, f13 .align 4 .L24: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L28 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L28 #endif LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 bdz- .L27 .align 4 .L26: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 LFPDUX A1, AO, INC4 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 LFPDUX B1, BO, INC4 fxcpmadd f9, B2, A2, f9 fxcsmadd f13, B2, A2, f13 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 bdnz+ .L26 .align 4 .L27: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 fxcpmadd f1, B1, A2, f1 fxcsmadd f5, B1, A2, f5 fxcpmadd f9, B2, A2, f9 fxcsmadd f13, B2, A2, f13 .align 4 .L28: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 4 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE addi BO, BO, - 4 * SIZE addi BO2, BO, 2 * SIZE #endif #if defined(LN) || defined(LT) fpmr f24, f0 fpmr f25, f1 fpmr f28, f8 fpmr f29, f9 fsmfp f0, f4 fsmfp f1, f5 fsmfp f8, f12 fsmfp f9, f13 fsmtp f4, f24 fsmtp f5, f25 fsmtp f12, f28 fsmtp f13, f29 LFPDUX f16, BO, INC4 LFPDUX f17, BO2, INC4 LFPDUX f18, BO, INC4 LFPDUX f19, BO2, INC4 LFPDUX f20, BO, INC4 LFPDUX f21, BO2, INC4 LFPDUX f22, BO, INC4 LFPDUX f23, BO2, INC4 subi BO, BO, 16 * SIZE subi BO2, BO2, 16 * SIZE fpsub f0, f16, f0 fpsub f8, f17, f8 fpsub f4, f18, f4 fpsub f12, f19, f12 fpsub f1, f20, f1 fpsub f9, f21, f9 fpsub f5, f22, f5 fpsub f13, f23, f13 #else LFPDUX f16, AO, INC4 LFPDUX f17, AO2, INC4 LFPDUX f18, AO, INC4 LFPDUX f19, AO2, INC4 LFPDUX f20, AO, INC4 LFPDUX f21, AO2, INC4 LFPDUX f22, AO, INC4 LFPDUX f23, AO2, INC4 subi AO, AO, 16 * SIZE subi AO2, AO2, 16 * SIZE fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f4, f18, f4 fpsub f5, f19, f5 fpsub f8, f20, f8 fpsub f9, f21, f9 fpsub f12, f22, f12 fpsub f13, f23, f13 #endif #ifdef LN addi AO, AO, 20 * SIZE addi AO2, AO2, 20 * SIZE LFPDUX A1, AO2, INCM4 LFPDUX A2, AO, INCM4 LFPDUX A3, AO2, INCM4 LFPDUX A4, AO, INCM4 add AO2, AO2, INCM4 LFPDUX A5, AO, INCM4 add AO2, AO2, INCM4 LFPDUX A6, AO, INCM4 addi AO, AO, -4 * SIZE addi AO2, AO2, -4 * SIZE fxsmul f5, A1, f5 fxsmul f13, A1, f13 fxcpnmsub f1, A1, f5, f1 fxcpnmsub f9, A1, f13, f9 fxcsnmsub f4, A2, f5, f4 fxcsnmsub f12, A2, f13, f12 fxcpnmsub f0, A2, f5, f0 fxcpnmsub f8, A2, f13, f8 fxpmul f1, A3, f1 fxpmul f9, A3, f9 fxcsnmsub f4, A4, f1, f4 fxcsnmsub f12, A4, f9, f12 fxcpnmsub f0, A4, f1, f0 fxcpnmsub f8, A4, f9, f8 fxsmul f4, A5, f4 fxsmul f12, A5, f12 fxcpnmsub f0, A5, f4, f0 fxcpnmsub f8, A5, f12, f8 fxpmul f0, A6, f0 fxpmul f8, A6, f8 #endif #ifdef LT LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A4, AO2, INC4 add AO, AO, INC4 LFPDUX A5, AO2, INC4 add AO, AO, INC4 LFPDUX A6, AO2, INC4 subi AO, AO, 16 * SIZE subi AO2, AO2, 16 * SIZE fxpmul f0, A1, f0 fxpmul f8, A1, f8 fxcsnmsub f4, A1, f0, f4 fxcsnmsub f12, A1, f8, f12 fxcpnmsub f1, A2, f0, f1 fxcpnmsub f9, A2, f8, f9 fxcsnmsub f5, A2, f0, f5 fxcsnmsub f13, A2, f8, f13 fxsmul f4, A3, f4 fxsmul f12, A3, f12 fxcpnmsub f1, A4, f4, f1 fxcpnmsub f9, A4, f12, f9 fxcsnmsub f5, A4, f4, f5 fxcsnmsub f13, A4, f12, f13 fxpmul f1, A5, f1 fxpmul f9, A5, f9 fxcsnmsub f5, A5, f1, f5 fxcsnmsub f13, A5, f9, f13 fxsmul f5, A6, f5 fxsmul f13, A6, f13 #endif #ifdef RN LFPDUX A1, BO, INC4 LFPDUX A2, BO2, INC4 LFPDUX A3, BO, INC4 LFPDUX A4, BO2, INC4 add BO, BO, INC4 LFPDUX A5, BO2, INC4 add BO, BO, INC4 LFPDUX A6, BO2, INC4 subi BO, BO, 16 * SIZE subi BO2, BO2, 16 * SIZE fxpmul f0, A1, f0 fxpmul f1, A1, f1 fxcsnmsub f4, A1, f0, 
f4 fxcsnmsub f5, A1, f1, f5 fxcpnmsub f8, A2, f0, f8 fxcpnmsub f9, A2, f1, f9 fxcsnmsub f12, A2, f0, f12 fxcsnmsub f13, A2, f1, f13 fxsmul f4, A3, f4 fxsmul f5, A3, f5 fxcpnmsub f8, A4, f4, f8 fxcpnmsub f9, A4, f5, f9 fxcsnmsub f12, A4, f4, f12 fxcsnmsub f13, A4, f5, f13 fxpmul f8, A5, f8 fxpmul f9, A5, f9 fxcsnmsub f12, A5, f8, f12 fxcsnmsub f13, A5, f9, f13 fxsmul f12, A6, f12 fxsmul f13, A6, f13 #endif #ifdef RT addi BO, BO, 20 * SIZE addi BO2, BO2, 20 * SIZE LFPDUX A1, BO2, INCM4 LFPDUX A2, BO, INCM4 LFPDUX A3, BO2, INCM4 LFPDUX A4, BO, INCM4 add BO2, BO2, INCM4 LFPDUX A5, BO, INCM4 add BO2, BO2, INCM4 LFPDUX A6, BO, INCM4 subi BO, BO, 4 * SIZE subi BO2, BO2, 4 * SIZE fxsmul f12, A1, f12 fxsmul f13, A1, f13 fxcpnmsub f8, A1, f12, f8 fxcpnmsub f9, A1, f13, f9 fxcsnmsub f4, A2, f12, f4 fxcsnmsub f5, A2, f13, f5 fxcpnmsub f0, A2, f12, f0 fxcpnmsub f1, A2, f13, f1 fxpmul f8, A3, f8 fxpmul f9, A3, f9 fxcsnmsub f4, A4, f8, f4 fxcsnmsub f5, A4, f9, f5 fxcpnmsub f0, A4, f8, f0 fxcpnmsub f1, A4, f9, f1 fxsmul f4, A5, f4 fxsmul f5, A5, f5 fxcpnmsub f0, A5, f4, f0 fxcpnmsub f1, A5, f5, f1 fxpmul f0, A6, f0 fxpmul f1, A6, f1 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE subi CO3, CO3, 4 * SIZE subi CO4, CO4, 4 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC4 STFPDUX f8, BO2, INC4 STFPDUX f4, BO, INC4 STFPDUX f12, BO2, INC4 STFPDUX f1, BO, INC4 STFPDUX f9, BO2, INC4 STFPDUX f5, BO, INC4 STFPDUX f13, BO2, INC4 subi BO, BO, 16 * SIZE subi BO2, BO2, 16 * SIZE STFDUX f0, CO1, INC STFDUX f4, CO1, INC STFDUX f1, CO1, INC STFDUX f5, CO1, INC STFSDUX f0, CO2, INC STFSDUX f4, CO2, INC STFSDUX f1, CO2, INC STFSDUX f5, CO2, INC STFDUX f8, CO3, INC STFDUX f12, CO3, INC STFDUX f9, CO3, INC STFDUX f13, CO3, INC STFSDUX f8, CO4, INC STFSDUX f12, CO4, INC STFSDUX f9, CO4, INC STFSDUX f13, CO4, INC #else STFPDUX f0, AO, INC4 STFPDUX f1, AO2, INC4 STFPDUX f4, AO, INC4 STFPDUX f5, AO2, INC4 STFPDUX f8, AO, INC4 STFPDUX f9, AO2, INC4 STFPDUX f12, AO, INC4 STFPDUX f13, AO2, INC4 subi AO, AO, 16 * SIZE subi AO2, AO2, 16 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f4, CO2, INC STFSDUX f4, CO2, INC STFDUX f5, CO2, INC STFSDUX f5, CO2, INC STFDUX f8, CO3, INC STFSDUX f8, CO3, INC STFDUX f9, CO3, INC STFSDUX f9, CO3, INC STFDUX f12, CO4, INC STFSDUX f12, CO4, INC STFDUX f13, CO4, INC STFSDUX f13, CO4, INC #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE subi CO3, CO3, 4 * SIZE subi CO4, CO4, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L30: andi. I, M, 2 beq .L40 #if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 srawi. r0, KK, 2 mtspr CTR, r0 ble .L34 #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 1 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 ble .L34 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 LFPDUX A2, AO2, INC4 LFPDUX B3, BO, INC4 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A5, BO, INC4 LFPDUX A6, BO2, INC4 LFPDUX A4, AO2, INC4 LFPDUX A7, BO, INC4 LFPDUX A8, BO2, INC4 bdz- .L33 .align 4 .L32: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 LFPDUX B1, BO, INC4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 LFPDUX B2, BO2, INC4 LFPDUX A1, AO, INC4 fxcpmadd f0, B3, A2, f0 fxcsmadd f4, B3, A2, f4 LFPDUX B3, BO, INC4 fxcpmadd f8, B4, A2, f8 fxcsmadd f12, B4, A2, f12 LFPDUX B4, BO2, INC4 LFPDUX A2, AO2, INC4 fxcpmadd f0, A5, A3, f0 fxcsmadd f4, A5, A3, f4 LFPDUX A5, BO, INC4 fxcpmadd f8, A6, A3, f8 fxcsmadd f12, A6, A3, f12 LFPDUX A6, BO2, INC4 LFPDUX A3, AO, INC4 fxcpmadd f0, A7, A4, f0 fxcsmadd f4, A7, A4, f4 LFPDUX A7, BO, INC4 fxcpmadd f8, A8, A4, f8 fxcsmadd f12, A8, A4, f12 LFPDUX A8, BO2, INC4 LFPDUX A4, AO2, INC4 bdnz+ .L32 .align 4 .L33: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 fxcpmadd f0, B3, A2, f0 fxcsmadd f4, B3, A2, f4 fxcpmadd f8, B4, A2, f8 fxcsmadd f12, B4, A2, f12 fxcpmadd f0, A5, A3, f0 fxcsmadd f4, A5, A3, f4 fxcpmadd f8, A6, A3, f8 fxcsmadd f12, A6, A3, f12 fxcpmadd f0, A7, A4, f0 fxcsmadd f4, A7, A4, f4 fxcpmadd f8, A8, A4, f8 fxcsmadd f12, A8, A4, f12 .align 4 .L34: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L38 #else andi. r0, TEMP, 3 mtspr CTR, r0 ble+ .L38 #endif LFPDX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdz- .L37 .align 4 .L36: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 LFPDUX B1, BO, INC4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 LFPDX A1, AO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdnz+ .L36 .align 4 .L37: fxcpmadd f0, B1, A1, f0 fxcsmadd f4, B1, A1, f4 fxcpmadd f8, B2, A1, f8 fxcsmadd f12, B2, A1, f12 .align 4 .L38: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 4 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE addi BO, BO, - 4 * SIZE addi BO2, BO, 2 * SIZE #endif #if defined(LN) || defined(LT) fpmr f24, f0 fpmr f28, f8 fsmfp f0, f4 fsmfp f8, f12 fsmtp f4, f24 fsmtp f12, f28 LFPDUX f16, BO, INC4 LFPDUX f17, BO2, INC4 LFPDUX f18, BO, INC4 LFPDUX f19, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fpsub f0, f16, f0 fpsub f8, f17, f8 fpsub f4, f18, f4 fpsub f12, f19, f12 #else LFPDUX f16, AO, INC4 LFPDUX f17, AO2, INC4 LFPDUX f18, AO, INC4 LFPDUX f19, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE fpsub f0, f16, f0 fpsub f4, f17, f4 fpsub f8, f18, f8 fpsub f12, f19, f12 #endif #ifdef LN addi AO, AO, 8 * SIZE addi AO2, AO2, 8 * SIZE LFPDUX A1, AO2, INCM4 LFPDUX A2, AO, INCM4 addi AO, AO, -4 * SIZE addi AO2, AO2, -4 * SIZE fxsmul f4, A1, f4 fxsmul f12, A1, f12 fxcpnmsub f0, A1, f4, f0 fxcpnmsub f8, A1, f12, f8 fxpmul f0, A2, f0 fxpmul f8, A2, f8 #endif #ifdef LT LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 subi AO, AO, 4 * SIZE subi AO2, AO2, 4 * SIZE fxpmul f0, A1, f0 fxpmul f8, A1, f8 fxcsnmsub f4, A1, f0, f4 fxcsnmsub f12, A1, f8, f12 fxsmul f4, A2, f4 fxsmul f12, A2, f12 #endif #ifdef RN LFPDUX A1, BO, INC4 LFPDUX A2, BO2, INC4 LFPDUX A3, BO, INC4 LFPDUX A4, BO2, INC4 add BO, BO, INC4 LFPDUX A5, BO2, INC4 add BO, BO, INC4 LFPDUX A6, BO2, INC4 subi BO, BO, 16 * SIZE subi BO2, BO2, 16 * SIZE fxpmul f0, A1, f0 fxcsnmsub f4, A1, f0, f4 fxcpnmsub f8, A2, f0, f8 fxcsnmsub f12, A2, f0, f12 fxsmul f4, A3, f4 
fxcpnmsub f8, A4, f4, f8 fxcsnmsub f12, A4, f4, f12 fxpmul f8, A5, f8 fxcsnmsub f12, A5, f8, f12 fxsmul f12, A6, f12 #endif #ifdef RT addi BO, BO, 20 * SIZE addi BO2, BO2, 20 * SIZE LFPDUX A1, BO2, INCM4 LFPDUX A2, BO, INCM4 LFPDUX A3, BO2, INCM4 LFPDUX A4, BO, INCM4 add BO2, BO2, INCM4 LFPDUX A5, BO, INCM4 add BO2, BO2, INCM4 LFPDUX A6, BO, INCM4 subi BO, BO, 4 * SIZE subi BO2, BO2, 4 * SIZE fxsmul f12, A1, f12 fxcpnmsub f8, A1, f12, f8 fxcsnmsub f4, A2, f12, f4 fxcpnmsub f0, A2, f12, f0 fxpmul f8, A3, f8 fxcsnmsub f4, A4, f8, f4 fxcpnmsub f0, A4, f8, f0 fxsmul f4, A5, f4 fxcpnmsub f0, A5, f4, f0 fxpmul f0, A6, f0 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE subi CO3, CO3, 2 * SIZE subi CO4, CO4, 2 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC4 STFPDUX f8, BO2, INC4 STFPDUX f4, BO, INC4 STFPDUX f12, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE STFDUX f0, CO1, INC STFDUX f4, CO1, INC STFSDUX f0, CO2, INC STFSDUX f4, CO2, INC STFDUX f8, CO3, INC STFDUX f12, CO3, INC STFSDUX f8, CO4, INC STFSDUX f12, CO4, INC #else STFPDUX f0, AO, INC4 STFPDUX f4, AO2, INC4 STFPDUX f8, AO, INC4 STFPDUX f12, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f4, CO2, INC STFSDUX f4, CO2, INC STFDUX f8, CO3, INC STFSDUX f8, CO3, INC STFDUX f12, CO4, INC STFSDUX f12, CO4, INC #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE subi CO3, CO3, 2 * SIZE subi CO4, CO4, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L40: andi. I, M, 1 beq .L49 #if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 srawi. r0, KK, 3 mtspr CTR, r0 ble .L44 #else #ifdef LN slwi r0, K, 0 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 0 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, BO, - 4 * SIZE fpmr f2, f0 addi BO2, BO, 2 * SIZE fpmr f3, f0 srawi. 
r0, TEMP, 3 mtspr CTR, r0 ble .L44 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 LFPDUX A2, AO2, INC4 LFPDUX B3, BO, INC4 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A5, BO, INC4 LFPDUX A6, BO2, INC4 LFPDUX A4, AO2, INC4 LFPDUX A7, BO, INC4 LFPDUX A8, BO2, INC4 bdz- .L43 .align 4 .L42: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A1, B2, f1 LFPDUX B2, BO2, INC4 fxcsmadd f2, A1, B3, f2 LFPDUX B3, BO, INC4 fxcsmadd f3, A1, B4, f3 LFPDUX B4, BO2, INC4 LFPDUX A1, AO, INC4 fxcpmadd f0, A2, A5, f0 LFPDUX A5, BO, INC4 fxcpmadd f1, A2, A6, f1 LFPDUX A6, BO2, INC4 fxcsmadd f2, A2, A7, f2 LFPDUX A7, BO, INC4 fxcsmadd f3, A2, A8, f3 LFPDUX A8, BO2, INC4 LFPDUX A2, AO2, INC4 fxcpmadd f0, A3, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A3, B2, f1 LFPDUX B2, BO2, INC4 fxcsmadd f2, A3, B3, f2 LFPDUX B3, BO, INC4 fxcsmadd f3, A3, B4, f3 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 fxcpmadd f0, A4, A5, f0 LFPDUX A5, BO, INC4 fxcpmadd f1, A4, A6, f1 LFPDUX A6, BO2, INC4 fxcsmadd f2, A4, A7, f2 LFPDUX A7, BO, INC4 fxcsmadd f3, A4, A8, f3 LFPDUX A8, BO2, INC4 LFPDUX A4, AO2, INC4 bdnz+ .L42 .align 4 .L43: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A1, B2, f1 LFPDUX B2, BO2, INC4 fxcsmadd f2, A1, B3, f2 LFPDUX B3, BO, INC4 fxcsmadd f3, A1, B4, f3 LFPDUX B4, BO2, INC4 fxcpmadd f0, A2, A5, f0 LFPDUX A5, BO, INC4 fxcpmadd f1, A2, A6, f1 LFPDUX A6, BO2, INC4 fxcsmadd f2, A2, A7, f2 LFPDUX A7, BO, INC4 fxcsmadd f3, A2, A8, f3 LFPDUX A8, BO2, INC4 fxcpmadd f0, A3, B1, f0 fxcpmadd f1, A3, B2, f1 fxcsmadd f2, A3, B3, f2 fxcsmadd f3, A3, B4, f3 fxcpmadd f0, A4, A5, f0 fxcpmadd f1, A4, A6, f1 fxcsmadd f2, A4, A7, f2 fxcsmadd f3, A4, A8, f3 .align 4 .L44: #if defined(LT) || defined(RN) andi. r0, KK, 7 mtspr CTR, r0 ble+ .L48 #else andi. 
r0, TEMP, 7 mtspr CTR, r0 ble+ .L48 #endif LFDX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC bdz- .L47 .align 4 .L46: fxcpmadd f0, A1, B1, f0 LFPDUX B1, BO, INC4 fxcpmadd f1, A1, B2, f1 LFDX A1, AO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC bdnz+ .L46 .align 4 .L47: fxcpmadd f0, A1, B1, f0 fxcpmadd f1, A1, B2, f1 addi AO2, AO, 2 * SIZE .align 4 .L48: fpadd f0, f0, f2 fpadd f1, f1, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 4 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE addi BO, BO, - 4 * SIZE addi BO2, BO, 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDX f16, BO, INC4 LFPDX f17, BO2, INC4 fpsub f0, f16, f0 fpsub f1, f17, f1 #else LFPDX f16, AO, INC4 LFPDX f17, AO2, INC4 fpsub f0, f16, f0 fpsub f1, f17, f1 #endif #if defined(LN) || defined(LT) LFPDX A1, AO, INC4 fxpmul f0, A1, f0 fxpmul f1, A1, f1 #endif #ifdef RN LFD A1, (4 + 0) * SIZE(BO) LFD A2, (4 + 1) * SIZE(BO) LFD A3, (4 + 2) * SIZE(BO) LFD A4, (4 + 3) * SIZE(BO) LFD A5, (4 + 5) * SIZE(BO) LFD A6, (4 + 6) * SIZE(BO) LFD A7, (4 + 7) * SIZE(BO) LFD A8, (4 + 10) * SIZE(BO) LFD A9, (4 + 11) * SIZE(BO) LFD A10, (4 + 15) * SIZE(BO) fsmtp f2, f0 fsmtp f3, f1 fmul f0, A1, f0 fnmsub f2, A2, f0, f2 fnmsub f1, A3, f0, f1 fnmsub f3, A4, f0, f3 fmul f2, A5, f2 fnmsub f1, A6, f2, f1 fnmsub f3, A7, f2, f3 fmul f1, A8, f1 fnmsub f3, A9, f1, f3 fmul f3, A10, f3 fsmfp f0, f2 fsmfp f1, f3 #endif #ifdef RT LFD A1, (4 + 15) * SIZE(BO) LFD A2, (4 + 14) * SIZE(BO) LFD A3, (4 + 13) * SIZE(BO) LFD A4, (4 + 12) * SIZE(BO) LFD A5, (4 + 10) * SIZE(BO) LFD A6, (4 + 9) * SIZE(BO) LFD A7, (4 + 8) * SIZE(BO) LFD A8, (4 + 5) * SIZE(BO) LFD A9, (4 + 4) * SIZE(BO) LFD A10, (4 + 0) * SIZE(BO) fsmtp f2, f0 fsmtp f3, f1 fmul f3, A1, f3 fnmsub f1, A2, f3, f1 fnmsub f2, A3, f3, f2 fnmsub f0, A4, f3, f0 fmul f1, A5, f1 fnmsub f2, A6, f1, f2 fnmsub f0, A7, f1, f0 fmul f2, A8, f2 fnmsub f0, A9, f2, f0 fmul f0, A10, f0 fsmfp f0, f2 fsmfp f1, f3 #endif #if defined(LN) || defined(LT) STFPDX f0, BO, INC4 STFPDX f1, BO2, INC4 #else STFPDX f0, AO, INC4 STFPDX f1, AO2, INC4 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE subi CO3, CO3, 1 * SIZE subi CO4, CO4, 1 * SIZE #endif STFDUX f0, CO1, INC STFSDUX f0, CO2, INC STFDUX f1, CO3, INC STFSDUX f1, CO4, INC #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE subi CO3, CO3, 1 * SIZE subi CO4, CO4, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 .L49: #ifdef LN slwi r0, K, 2 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) addi B, BO, 4 * SIZE #endif #ifdef RN addi KK, KK, 4 #endif #ifdef RT subi KK, KK, 4 #endif addic. 
J, J, -1 bgt+ .L10 .align 4 .L999: addi SP, SP, 12 lwzu r14, 4(SP) lwzu r15, 4(SP) lwzu r16, 4(SP) lwzu r17, 4(SP) lwzu r18, 4(SP) lwzu r19, 4(SP) lwzu r20, 4(SP) lwzu r21, 4(SP) lwzu r22, 4(SP) lwzu r23, 4(SP) lwzu r24, 4(SP) lwzu r25, 4(SP) lwzu r26, 4(SP) lwzu r27, 4(SP) lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f31, SP, r0 lfpdux f30, SP, r0 lfpdux f29, SP, r0 lfpdux f28, SP, r0 lfpdux f27, SP, r0 lfpdux f26, SP, r0 lfpdux f25, SP, r0 lfpdux f24, SP, r0 lfpdux f23, SP, r0 lfpdux f22, SP, r0 lfpdux f21, SP, r0 lfpdux f20, SP, r0 lfpdux f19, SP, r0 lfpdux f18, SP, r0 lfpdux f17, SP, r0 lfpdux f16, SP, r0 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/trsm_kernel_power6_LN.S000066400000000000000000001726721313527062700221430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define AORIG r18 #define TEMP r19 #define KK r20 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define CO3 r27 #define CO4 r28 #define PREA r29 #define PREC r31 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) stw r18, 196(SP) #endif stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef LN mullw r0, M, K slwi r0, r0, BASE_SHIFT add A, A, r0 slwi r0, M, BASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, BASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) li PREA, (16 * 3 * SIZE) li PREC, -4 * SIZE lfs f0, FZERO srawi. J, N, 2 ble LL(40) .align 4 LL(10): #ifdef RT slwi r0, K, 2 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 2 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO4, LDC #endif LL(30): andi. 
I, M, 1 ble LL(20) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(35) .align 5 LL(32): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f1, f17, f24, f1 FMADD f5, f17, f25, f5 FMADD f9, f17, f26, f9 FMADD f13, f17, f27, f13 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f0, f18, f20, f0 FMADD f4, f18, f21, f4 FMADD f8, f18, f22, f8 FMADD f12, f18, f23, f12 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f1, f19, f24, f1 FMADD f5, f19, f25, f5 FMADD f9, f19, f26, f9 FMADD f13, f19, f27, f13 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 16 * SIZE dcbtst AO, PREA bdnz LL(32) fadd f0, f1, f0 fadd f4, f5, f4 fadd f8, f9, f8 fadd f12, f13, f12 .align 4 LL(35): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(38) .align 4 LL(36): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f16, 1 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 1 * SIZE bdnz LL(36) .align 4 LL(38): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 4 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) LFD f24, 2 * SIZE(AO) LFD f28, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f4, f20, f4 FSUB f8, f24, f8 FSUB f12, f28, f12 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f4, f17, f0, f4 FNMSUB f8, f18, f0, f8 FNMSUB f12, f19, f0, f12 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FNMSUB f8, f17, f4, f8 FNMSUB f12, f18, f4, f12 FMUL f8, f19, f8 FNMSUB f12, f20, f8, f12 FMUL f12, f21, f12 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FNMSUB f8, f17, f12, f8 FNMSUB f4, f18, f12, f4 FNMSUB f0, f19, f12, f0 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f0, f18, f8, f0 FMUL f4, f19, f4 FNMSUB f0, f20, f4, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE subi CO3, CO3, 1 * SIZE subi CO4, CO4, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f4, 1 * SIZE(AO) STFD f8, 2 * SIZE(AO) STFD f12, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 fmr f8, f0 fmr f9, f0 fmr f12, f0 fmr f13, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE addi CO3, CO3, 1 * SIZE addi CO4, CO4, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(20): andi. I, M, 2 ble LL(09) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. 
r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(25) .align 5 LL(22): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 16 * SIZE dcbtst AO, PREA bdnz LL(22) fadd f0, f2, f0 fadd f1, f3, f1 fadd f4, f6, f4 fadd f5, f7, f5 fadd f8, f10, f8 fadd f9, f11, f9 fadd f12, f14, f12 fadd f13, f15, f13 .align 4 LL(25): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(28) .align 4 LL(26): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 2 * SIZE bdnz LL(26) .align 4 LL(28): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 4 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f28, 6 * SIZE(AO) LFD f29, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f12, f28, f12 FSUB f13, f29, f13 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, f5 FMUL f9, f17, f9 FMUL f13, f17, f13 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FMUL f8, f19, f8 FMUL f9, f19, f9 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FMUL f12, f21, f12 FMUL f13, f21, f13 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FMUL f4, f19, f4 FMUL f5, f19, f5 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE subi CO3, CO3, 2 * SIZE subi CO4, CO4, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * 
SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f4, 2 * SIZE(AO) STFD f5, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f12, 6 * SIZE(AO) STFD f13, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(09): srawi. I, M, 2 ble LL(39) .align 4 LL(11): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC srawi. r0, KK, 3 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC srawi. 
r0, TEMP, 3 mtspr CTR, r0 #endif ble LL(15) .align 4 LL(12): dcbt AO, PREA dcbtst BO, PREA FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 4 * SIZE(AO) LFD f28, 4 * SIZE(BO) LFD f25, 5 * SIZE(AO) LFD f29, 5 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 6 * SIZE(AO) LFD f30, 6 * SIZE(BO) LFD f27, 7 * SIZE(AO) LFD f31, 7 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 8 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f17, 9 * SIZE(AO) LFD f21, 9 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 10 * SIZE(AO) LFD f22, 10 * SIZE(BO) LFD f19, 11 * SIZE(AO) LFD f23, 11 * SIZE(BO) FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 12 * SIZE(AO) LFD f28, 12 * SIZE(BO) LFD f25, 13 * SIZE(AO) LFD f29, 13 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 14 * SIZE(AO) LFD f30, 14 * SIZE(BO) LFD f27, 15 * SIZE(AO) LFD f31, 15 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 16 * SIZE(AO) LFD f20, 16 * SIZE(BO) LFD f17, 17 * SIZE(AO) LFD f21, 17 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 18 * SIZE(AO) LFD f22, 18 * SIZE(BO) LFD f19, 19 * SIZE(AO) LFD f23, 19 * SIZE(BO) FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 20 * SIZE(AO) LFD f28, 20 * SIZE(BO) LFD f25, 21 * SIZE(AO) LFD f29, 21 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 22 * SIZE(AO) LFD f30, 22 * SIZE(BO) LFD f27, 23 * SIZE(AO) LFD f31, 23 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 24 * SIZE(AO) LFD f20, 24 * SIZE(BO) LFD f17, 25 * SIZE(AO) LFD f21, 25 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 26 * SIZE(AO) LFD f22, 26 * SIZE(BO) LFD f19, 27 * SIZE(AO) LFD f23, 27 * SIZE(BO) FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, 
f12 LFD f24, 28 * SIZE(AO) LFD f28, 28 * SIZE(BO) LFD f25, 29 * SIZE(AO) LFD f29, 29 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 30 * SIZE(AO) LFD f30, 30 * SIZE(BO) LFD f27, 31 * SIZE(AO) LFD f31, 31 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 32 * SIZE(AO) LFD f20, 32 * SIZE(BO) LFD f17, 33 * SIZE(AO) LFD f21, 33 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 34 * SIZE(AO) LFD f22, 34 * SIZE(BO) LFD f19, 35 * SIZE(AO) LFD f23, 35 * SIZE(BO) addi AO, AO, 32 * SIZE addi BO, BO, 32 * SIZE FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 bdnz LL(12) .align 4 LL(15): #if defined(LT) || defined(RN) andi. r0, KK, 7 #else andi. r0, TEMP, 7 #endif mtspr CTR, r0 ble+ LL(18) .align 4 LL(16): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(16) .align 4 LL(18): #if defined(LN) || defined(RT) subi r0, KK, 4 slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f24, 8 * SIZE(BO) LFD f25, 9 * SIZE(BO) LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f10, f26, f10 FSUB f14, f27, f14 FSUB f3, f28, f3 FSUB f7, f29, f7 FSUB f11, f30, f11 FSUB f15, f31, f15 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f24, 8 * SIZE(AO) LFD f25, 9 * SIZE(AO) LFD f26, 10 * SIZE(AO) LFD f27, 11 * SIZE(AO) LFD f28, 12 * SIZE(AO) LFD f29, 13 * SIZE(AO) LFD f30, 14 * SIZE(AO) LFD f31, 15 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f10, f26, f10 FSUB f11, f27, f11 FSUB f12, f28, f12 FSUB f13, f29, f13 FSUB f14, f30, f14 FSUB f15, f31, f15 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FMUL f11, f16, f11 FMUL f15, f16, f15 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f10, f17, f11, f10 
FNMSUB f14, f17, f15, f14 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f9, f18, f11, f9 FNMSUB f13, f18, f15, f13 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 FNMSUB f8, f19, f11, f8 FNMSUB f12, f19, f15, f12 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FMUL f10, f16, f10 FMUL f14, f16, f14 LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f9, f17, f10, f9 FNMSUB f13, f17, f14, f13 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FNMSUB f8, f18, f10, f8 FNMSUB f12, f18, f14, f12 FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB f10, f18, f8, f10 FNMSUB f14, f18, f12, f14 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 FNMSUB f11, f19, f8, f11 FNMSUB f15, f19, f12, f15 LFD f16, 5 * SIZE(AO) LFD f17, 6 * SIZE(AO) LFD f18, 7 * SIZE(AO) LFD f19, 10 * SIZE(AO) FMUL f1, f16, f1 FMUL f5, f16, f5 FMUL f9, f16, f9 FMUL f13, f16, f13 LFD f20, 11 * SIZE(AO) LFD f21, 15 * SIZE(AO) FNMSUB f2, f17, f1, f2 FNMSUB f6, f17, f5, f6 FNMSUB f10, f17, f9, f10 FNMSUB f14, f17, f13, f14 FNMSUB f3, f18, f1, f3 FNMSUB f7, f18, f5, f7 FNMSUB f11, f18, f9, f11 FNMSUB f15, f18, f13, f15 FMUL f2, f19, f2 FMUL f6, f19, f6 FMUL f10, f19, f10 FMUL f14, f19, f14 FNMSUB f3, f20, f2, f3 FNMSUB f7, f20, f6, f7 FNMSUB f11, f20, f10, f11 FNMSUB f15, f20, f14, f15 FMUL f3, f21, f3 FMUL f7, f21, f7 FMUL f11, f21, f11 FMUL f15, f21, f15 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 FNMSUB f14, f19, f2, f14 FNMSUB f15, f19, f3, f15 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FMUL f6, f16, f6 FMUL f7, f16, f7 LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f10, f17, f6, f10 FNMSUB f11, f17, f7, f11 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FNMSUB f14, f18, f6, f14 FNMSUB f15, f18, f7, f15 FMUL f8, f19, f8 FMUL f9, f19, f9 FMUL f10, f19, f10 FMUL f11, f19, f11 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FNMSUB f14, f20, f10, f14 FNMSUB f15, f20, f11, f15 FMUL f12, f21, f12 FMUL f13, f21, f13 FMUL f14, f21, f14 FMUL f15, f21, f15 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FMUL f14, f16, f14 FMUL f15, f16, f15 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f10, f17, f14, f10 FNMSUB f11, f17, f15, f11 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f6, f18, f14, f6 FNMSUB f7, f18, f15, f7 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 FNMSUB f2, f19, f14, f2 FNMSUB f3, f19, f15, f3 LFD f16, 10 
* SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FMUL f10, f16, f10 FMUL f11, f16, f11 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f6, f17, f10, f6 FNMSUB f7, f17, f11, f7 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE subi CO3, CO3, 4 * SIZE subi CO4, CO4, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) STFD f2, 8 * SIZE(BO) STFD f6, 9 * SIZE(BO) STFD f10, 10 * SIZE(BO) STFD f14, 11 * SIZE(BO) STFD f3, 12 * SIZE(BO) STFD f7, 13 * SIZE(BO) STFD f11, 14 * SIZE(BO) STFD f15, 15 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) STFD f8, 8 * SIZE(AO) STFD f9, 9 * SIZE(AO) STFD f10, 10 * SIZE(AO) STFD f11, 11 * SIZE(AO) STFD f12, 12 * SIZE(AO) STFD f13, 13 * SIZE(AO) STFD f14, 14 * SIZE(AO) STFD f15, 15 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f10, 2 * SIZE(CO3) STFD f11, 3 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) STFD f14, 2 * SIZE(CO4) STFD f15, 3 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(11) .align 4 LL(39): #ifdef LN slwi r0, K, 2 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 4 #endif #ifdef RT subi KK, KK, 4 #endif addic. J, J, -1 lfs f0, FZERO bgt LL(10) .align 4 LL(40): andi. J, N, 2 ble LL(70) #ifdef RT slwi r0, K, 1 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif LL(60): andi. I, M, 1 ble LL(50) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. 
r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(65) .align 5 LL(62): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f17, f22, f2 FMADD f3, f17, f23, f3 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f19, f26, f2 FMADD f3, f19, f27, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(62) .align 4 LL(65): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(68) .align 4 LL(66): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 LFD f16, 1 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 1 * SIZE bdnz LL(66) .align 4 LL(68): FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f20, f1 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FMUL f1, f18, f1 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(50): andi. I, M, 2 ble LL(41) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. 
r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(55) .align 5 LL(52): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 FMADD f4, f18, f22, f4 FMADD f5, f19, f22, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f16, f24, f0 FMADD f1, f17, f24, f1 FMADD f2, f16, f25, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f26, f4 FMADD f5, f19, f26, f5 FMADD f6, f18, f27, f6 FMADD f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE dcbtst AO, PREA bdnz LL(52) .align 4 LL(55): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(58) .align 4 LL(56): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 2 * SIZE bdnz LL(56) .align 4 LL(58): FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 2 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f2, f17, f2 FSUB f1, f20, f1 FSUB f3, f21, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f3, f19, f3 FNMSUB f0, f20, f1, f0 FNMSUB f2, f20, f3, f2 FMUL f0, f21, f0 FMUL f2, f21, f2 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f2, f16, f2 FNMSUB f1, f17, f0, f1 FNMSUB f3, f17, f2, f3 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f3, f17, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f2, f17, f0, f2 FNMSUB f3, f17, f1, f3 FMUL f2, f18, f2 FMUL f3, f18, f3 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f2, f19, f2 FMUL f3, f19, f3 FNMSUB f0, f20, f2, f0 FNMSUB f1, f20, f3, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f2, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr 
f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(41): srawi. I, M, 2 ble LL(69) .align 4 LL(42): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC dcbt CO2, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbt CO1, PREC dcbt CO2, PREC srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(45) .align 5 LL(43): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE dcbtst AO, PREA bdnz LL(43) .align 4 LL(45): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(48) .align 4 LL(46): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(46) .align 4 LL(48): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 2 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f28, 6 * SIZE(BO) LFD f29, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f3, f28, f3 FSUB f7, f29, f7 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FMUL f1, f19, f1 FMUL f5, f19, f5 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FMUL f0, f21, f0 FMUL f4, f21, f4 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, f5 FNMSUB f2, f18, f1, f2 FNMSUB f6, f18, f5, f6 FNMSUB f3, f19, f1, f3 FNMSUB f7, f19, f5, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FMUL f6, f18, f6 FNMSUB f3, f19, f2, f3 FNMSUB f7, f19, f6, f7 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 FMUL f7, f19, f7 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FMUL f4, f18, f4 FMUL f5, f18, f5 FMUL f6, f18, f6 FMUL f7, f18, f7 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f5, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f6, 5 * SIZE(BO) STFD f3, 6 * SIZE(BO) STFD f7, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) 
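/* Write-back for this 4x2 cell: the solved values go both to the packed buffer (BO when the triangular factor is on the left, AO as here when it is on the right) and to the C tile addressed by CO1/CO2. The FMUL instructions in the solve above stand in for divisions, on the assumption that the TRSM packing routines supply reciprocal diagonal entries. */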
STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(42) .align 4 LL(69): #ifdef LN slwi r0, K, 1 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif lfs f0, FZERO .align 4 LL(70): andi. J, N, 1 ble LL(999) #ifdef RT slwi r0, K, 0 + BASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO1, LDC #endif .align 4 LL(90): andi. I, M, 1 ble LL(80) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 3 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 3 mtspr CTR, r0 #endif ble LL(95) .align 5 LL(92): FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(92) .align 4 LL(95): #if defined(LT) || defined(RN) andi. r0, KK, 7 #else andi. 
r0, TEMP, 7 #endif mtspr CTR, r0 ble+ LL(98) .align 4 LL(96): FMADD f0, f16, f20, f0 LFD f16, 1 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 1 * SIZE bdnz LL(96) .align 4 LL(98): FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 1 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) FSUB f0, f16, f0 #else LFD f16, 0 * SIZE(AO) FSUB f0, f16, f0 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(80): andi. I, M, 2 ble LL(71) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(85) .align 5 LL(82): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 4 * SIZE dcbtst AO, PREA bdnz LL(82) .align 4 LL(85): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(88) .align 4 LL(86): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 2 * SIZE bdnz LL(86) .align 4 LL(88): FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(71): srawi. I, M, 2 ble LL(999) .align 4 LL(72): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbt CO1, PREC srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(75) .align 5 LL(73): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f21, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f23, f0 FMADD f1, f17, f23, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE dcbtst AO, PREA bdnz LL(73) .align 4 LL(75): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(78) .align 4 LL(76): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 4 * SIZE bdnz LL(76) .align 4 LL(78): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 1 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) LFD f24, 2 * SIZE(BO) LFD f28, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 FSUB f2, f24, f2 FSUB f3, f28, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FNMSUB f2, f17, f3, f2 FNMSUB f1, f18, f3, f1 FNMSUB f0, f19, f3, f0 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FNMSUB f1, f17, f2, f1 FNMSUB f0, f18, f2, f0 FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FNMSUB f2, f18, f0, f2 FNMSUB f3, f19, f0, f3 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FNMSUB f2, f18, f1, f2 FNMSUB f3, f19, f1, f3 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FNMSUB f3, f19, f2, f3 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. 
I, I, -1 bgt+ LL(72) .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) lwz r18, 196(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/trsm_kernel_power6_LT.S000066400000000000000000001725521313527062700221460ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define AORIG r18 #define TEMP r19 #define BB r20 #define KK r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define CO1 r26 #define CO2 r27 #define CO3 r28 #define CO4 r29 #define PREA r30 #define PREC r31 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) stw r18, 196(SP) #endif stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef LN mullw r0, M, K slwi r0, r0, BASE_SHIFT add A, A, r0 slwi r0, M, BASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, BASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) li PREA, (16 * 3 * SIZE) li PREC, 4 * SIZE lfs f0, FZERO srawi. J, N, 2 ble LL(40) .align 4 LL(10): #ifdef RT slwi r0, K, 2 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 2 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. 
I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO4, LDC #endif ble LL(20) .align 4 LL(11): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC srawi. r0, KK, 3 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC srawi. r0, TEMP, 3 mtspr CTR, r0 #endif ble LL(15) .align 4 LL(12): dcbt AO, PREA dcbtst BO, PREA FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 4 * SIZE(AO) LFD f28, 4 * SIZE(BO) LFD f25, 5 * SIZE(AO) LFD f29, 5 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 6 * SIZE(AO) LFD f30, 6 * SIZE(BO) LFD f27, 7 * SIZE(AO) LFD f31, 7 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 8 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f17, 9 * SIZE(AO) LFD f21, 9 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 10 * SIZE(AO) LFD f22, 10 * SIZE(BO) LFD f19, 11 * SIZE(AO) LFD f23, 11 * SIZE(BO) FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 12 * SIZE(AO) LFD f28, 12 * SIZE(BO) LFD f25, 13 * SIZE(AO) LFD f29, 13 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 14 * SIZE(AO) LFD f30, 14 * SIZE(BO) LFD f27, 15 * SIZE(AO) LFD f31, 15 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 16 * SIZE(AO) LFD f20, 16 * SIZE(BO) LFD f17, 17 * SIZE(AO) LFD f21, 17 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 18 * SIZE(AO) LFD f22, 18 * SIZE(BO) LFD f19, 19 * SIZE(AO) LFD f23, 19 * SIZE(BO) FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 20 * SIZE(AO) LFD f28, 20 * SIZE(BO) LFD f25, 21 * SIZE(AO) LFD f29, 21 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, 
f23, f14 LFD f26, 22 * SIZE(AO) LFD f30, 22 * SIZE(BO) LFD f27, 23 * SIZE(AO) LFD f31, 23 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 24 * SIZE(AO) LFD f20, 24 * SIZE(BO) LFD f17, 25 * SIZE(AO) LFD f21, 25 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 26 * SIZE(AO) LFD f22, 26 * SIZE(BO) LFD f19, 27 * SIZE(AO) LFD f23, 27 * SIZE(BO) FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 28 * SIZE(AO) LFD f28, 28 * SIZE(BO) LFD f25, 29 * SIZE(AO) LFD f29, 29 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 30 * SIZE(AO) LFD f30, 30 * SIZE(BO) LFD f27, 31 * SIZE(AO) LFD f31, 31 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 32 * SIZE(AO) LFD f20, 32 * SIZE(BO) LFD f17, 33 * SIZE(AO) LFD f21, 33 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 34 * SIZE(AO) LFD f22, 34 * SIZE(BO) LFD f19, 35 * SIZE(AO) LFD f23, 35 * SIZE(BO) addi AO, AO, 32 * SIZE addi BO, BO, 32 * SIZE FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 bdnz LL(12) .align 4 LL(15): #if defined(LT) || defined(RN) andi. r0, KK, 7 #else andi. 
r0, TEMP, 7 #endif mtspr CTR, r0 ble+ LL(18) .align 4 LL(16): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(16) .align 4 LL(18): #if defined(LN) || defined(RT) subi r0, KK, 4 slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f24, 8 * SIZE(BO) LFD f25, 9 * SIZE(BO) LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f10, f26, f10 FSUB f14, f27, f14 FSUB f3, f28, f3 FSUB f7, f29, f7 FSUB f11, f30, f11 FSUB f15, f31, f15 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f24, 8 * SIZE(AO) LFD f25, 9 * SIZE(AO) LFD f26, 10 * SIZE(AO) LFD f27, 11 * SIZE(AO) LFD f28, 12 * SIZE(AO) LFD f29, 13 * SIZE(AO) LFD f30, 14 * SIZE(AO) LFD f31, 15 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f10, f26, f10 FSUB f11, f27, f11 FSUB f12, f28, f12 FSUB f13, f29, f13 FSUB f14, f30, f14 FSUB f15, f31, f15 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FMUL f11, f16, f11 FMUL f15, f16, f15 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f10, f17, f11, f10 FNMSUB f14, f17, f15, f14 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f9, f18, f11, f9 FNMSUB f13, f18, f15, f13 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 FNMSUB f8, f19, f11, f8 FNMSUB f12, f19, f15, f12 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FMUL f10, f16, f10 FMUL f14, f16, f14 LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f9, f17, f10, f9 FNMSUB f13, f17, f14, f13 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FNMSUB f8, f18, f10, f8 FNMSUB f12, f18, f14, f12 FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB 
f10, f18, f8, f10 FNMSUB f14, f18, f12, f14 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 FNMSUB f11, f19, f8, f11 FNMSUB f15, f19, f12, f15 LFD f16, 5 * SIZE(AO) LFD f17, 6 * SIZE(AO) LFD f18, 7 * SIZE(AO) LFD f19, 10 * SIZE(AO) FMUL f1, f16, f1 FMUL f5, f16, f5 FMUL f9, f16, f9 FMUL f13, f16, f13 LFD f20, 11 * SIZE(AO) LFD f21, 15 * SIZE(AO) FNMSUB f2, f17, f1, f2 FNMSUB f6, f17, f5, f6 FNMSUB f10, f17, f9, f10 FNMSUB f14, f17, f13, f14 FNMSUB f3, f18, f1, f3 FNMSUB f7, f18, f5, f7 FNMSUB f11, f18, f9, f11 FNMSUB f15, f18, f13, f15 FMUL f2, f19, f2 FMUL f6, f19, f6 FMUL f10, f19, f10 FMUL f14, f19, f14 FNMSUB f3, f20, f2, f3 FNMSUB f7, f20, f6, f7 FNMSUB f11, f20, f10, f11 FNMSUB f15, f20, f14, f15 FMUL f3, f21, f3 FMUL f7, f21, f7 FMUL f11, f21, f11 FMUL f15, f21, f15 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 FNMSUB f14, f19, f2, f14 FNMSUB f15, f19, f3, f15 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FMUL f6, f16, f6 FMUL f7, f16, f7 LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f10, f17, f6, f10 FNMSUB f11, f17, f7, f11 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FNMSUB f14, f18, f6, f14 FNMSUB f15, f18, f7, f15 FMUL f8, f19, f8 FMUL f9, f19, f9 FMUL f10, f19, f10 FMUL f11, f19, f11 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FNMSUB f14, f20, f10, f14 FNMSUB f15, f20, f11, f15 FMUL f12, f21, f12 FMUL f13, f21, f13 FMUL f14, f21, f14 FMUL f15, f21, f15 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FMUL f14, f16, f14 FMUL f15, f16, f15 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f10, f17, f14, f10 FNMSUB f11, f17, f15, f11 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f6, f18, f14, f6 FNMSUB f7, f18, f15, f7 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 FNMSUB f2, f19, f14, f2 FNMSUB f3, f19, f15, f3 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FMUL f10, f16, f10 FMUL f11, f16, f11 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f6, f17, f10, f6 FNMSUB f7, f17, f11, f7 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE subi CO3, CO3, 4 * SIZE subi CO4, CO4, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) STFD f2, 8 * SIZE(BO) STFD f6, 9 * SIZE(BO) STFD f10, 10 * SIZE(BO) STFD f14, 11 * SIZE(BO) STFD f3, 12 * SIZE(BO) STFD f7, 13 * SIZE(BO) STFD f11, 14 * SIZE(BO) STFD f15, 15 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) 
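/* End of the 4x4 cell: the unrolled loop at LL(12) accumulates the GEMM update with FMADD, the LN/LT/RN/RT blocks above perform the forward or backward substitution (FMUL by what is presumably a pre-inverted diagonal entry, FNMSUB to eliminate already-solved values), and the result is stored back to the packed buffer as well as to the four C columns CO1..CO4. */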
STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) STFD f8, 8 * SIZE(AO) STFD f9, 9 * SIZE(AO) STFD f10, 10 * SIZE(AO) STFD f11, 11 * SIZE(AO) STFD f12, 12 * SIZE(AO) STFD f13, 13 * SIZE(AO) STFD f14, 14 * SIZE(AO) STFD f15, 15 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f10, 2 * SIZE(CO3) STFD f11, 3 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) STFD f14, 2 * SIZE(CO4) STFD f15, 3 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(11) .align 4 LL(20): andi. I, M, 2 ble LL(30) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(25) .align 5 LL(22): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 16 * SIZE dcbtst AO, PREA bdnz LL(22) fadd f0, f2, f0 fadd f1, f3, f1 fadd f4, f6, f4 fadd f5, f7, f5 fadd f8, f10, f8 fadd f9, f11, f9 fadd f12, f14, f12 fadd f13, f15, f13 .align 4 LL(25): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(28) .align 4 LL(26): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 2 * SIZE bdnz LL(26) .align 4 LL(28): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 4 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f28, 6 * SIZE(AO) LFD f29, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f12, f28, f12 FSUB f13, f29, f13 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, 
f5 FMUL f9, f17, f9 FMUL f13, f17, f13 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FMUL f8, f19, f8 FMUL f9, f19, f9 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FMUL f12, f21, f12 FMUL f13, f21, f13 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FMUL f4, f19, f4 FMUL f5, f19, f5 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE subi CO3, CO3, 2 * SIZE subi CO4, CO4, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f4, 2 * SIZE(AO) STFD f5, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f12, 6 * SIZE(AO) STFD f13, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(30): andi. I, M, 1 ble LL(39) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(35) .align 5 LL(32): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f1, f17, f24, f1 FMADD f5, f17, f25, f5 FMADD f9, f17, f26, f9 FMADD f13, f17, f27, f13 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f0, f18, f20, f0 FMADD f4, f18, f21, f4 FMADD f8, f18, f22, f8 FMADD f12, f18, f23, f12 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f1, f19, f24, f1 FMADD f5, f19, f25, f5 FMADD f9, f19, f26, f9 FMADD f13, f19, f27, f13 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 16 * SIZE dcbtst AO, PREA bdnz LL(32) fadd f0, f1, f0 fadd f4, f5, f4 fadd f8, f9, f8 fadd f12, f13, f12 .align 4 LL(35): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(38) .align 4 LL(36): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f16, 1 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 1 * SIZE bdnz LL(36) .align 4 LL(38): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 4 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) LFD f24, 2 * SIZE(AO) LFD f28, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f4, f20, f4 FSUB f8, f24, f8 FSUB f12, f28, f12 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f4, f17, f0, f4 FNMSUB f8, f18, f0, f8 FNMSUB f12, f19, f0, f12 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FNMSUB f8, f17, f4, f8 FNMSUB f12, f18, f4, f12 FMUL f8, f19, f8 FNMSUB f12, f20, f8, f12 FMUL f12, f21, f12 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FNMSUB f8, f17, f12, f8 FNMSUB f4, f18, f12, f4 FNMSUB f0, f19, f12, f0 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f0, f18, f8, f0 FMUL f4, f19, f4 FNMSUB f0, f20, f4, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE subi CO3, CO3, 1 * SIZE subi CO4, CO4, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f4, 1 * SIZE(AO) STFD f8, 2 * SIZE(AO) STFD f12, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) lfs f0, 
FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 fmr f8, f0 fmr f9, f0 fmr f12, f0 fmr f13, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE addi CO3, CO3, 1 * SIZE addi CO4, CO4, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(39): #ifdef LN slwi r0, K, 2 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 4 #endif #ifdef RT subi KK, KK, 4 #endif addic. J, J, -1 lfs f0, FZERO bgt LL(10) .align 4 LL(40): andi. J, N, 2 ble LL(70) #ifdef RT slwi r0, K, 1 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif ble LL(50) .align 4 LL(41): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC dcbt CO2, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbt CO1, PREC dcbt CO2, PREC srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(45) .align 5 LL(42): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE dcbtst AO, PREA bdnz LL(42) .align 4 LL(45): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(48) .align 4 LL(46): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(46) .align 4 LL(48): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 2 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f28, 6 * SIZE(BO) LFD f29, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f3, f28, f3 FSUB f7, f29, f7 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FMUL f1, f19, f1 FMUL f5, f19, f5 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FMUL f0, f21, f0 FMUL f4, f21, f4 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, f5 FNMSUB f2, f18, f1, f2 FNMSUB f6, f18, f5, f6 FNMSUB f3, f19, f1, f3 FNMSUB f7, f19, f5, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FMUL f6, f18, f6 FNMSUB f3, f19, f2, f3 FNMSUB f7, f19, f6, f7 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 FMUL f7, f19, f7 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FMUL f4, f18, f4 FMUL f5, f18, f5 FMUL f6, f18, f6 FMUL f7, f18, f7 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f5, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f6, 5 * SIZE(BO) STFD f3, 6 * SIZE(BO) STFD f7, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) 
STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(41) .align 4 LL(50): andi. I, M, 2 ble LL(60) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(55) .align 5 LL(52): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 FMADD f4, f18, f22, f4 FMADD f5, f19, f22, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f16, f24, f0 FMADD f1, f17, f24, f1 FMADD f2, f16, f25, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f26, f4 FMADD f5, f19, f26, f5 FMADD f6, f18, f27, f6 FMADD f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE dcbtst AO, PREA bdnz LL(52) .align 4 LL(55): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(58) .align 4 LL(56): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 2 * SIZE bdnz LL(56) .align 4 LL(58): FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 2 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f2, f17, f2 FSUB f1, f20, f1 FSUB f3, f21, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f3, f19, f3 FNMSUB f0, f20, f1, f0 FNMSUB f2, f20, f3, f2 FMUL f0, f21, f0 FMUL f2, f21, f2 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f2, f16, f2 FNMSUB f1, f17, f0, f1 FNMSUB f3, f17, f2, f3 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f3, f17, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f2, f17, f0, f2 FNMSUB f3, f17, f1, f3 FMUL f2, f18, f2 FMUL f3, f18, f3 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f2, f19, f2 FMUL f3, f19, f3 FNMSUB f0, f20, f2, f0 FNMSUB f1, f20, f3, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f2, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(60): andi. I, M, 1 ble LL(69) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(65) .align 5 LL(62): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f17, f22, f2 FMADD f3, f17, f23, f3 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f19, f26, f2 FMADD f3, f19, f27, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(62) .align 4 LL(65): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(68) .align 4 LL(66): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 LFD f16, 1 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 1 * SIZE bdnz LL(66) .align 4 LL(68): FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f20, f1 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FMUL f1, f18, f1 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(69): #ifdef LN slwi r0, K, 1 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif lfs f0, FZERO .align 4 LL(70): andi. J, N, 1 ble LL(999) #ifdef RT slwi r0, K, 0 + BASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO1, LDC #endif ble LL(80) .align 4 LL(71): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbt CO1, PREC srawi. 
r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbt CO1, PREC srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(75) .align 5 LL(72): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f21, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f23, f0 FMADD f1, f17, f23, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE dcbtst AO, PREA bdnz LL(72) .align 4 LL(75): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(78) .align 4 LL(76): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 4 * SIZE bdnz LL(76) .align 4 LL(78): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 1 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) LFD f24, 2 * SIZE(BO) LFD f28, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 FSUB f2, f24, f2 FSUB f3, f28, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FNMSUB f2, f17, f3, f2 FNMSUB f1, f18, f3, f1 FNMSUB f0, f19, f3, f0 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FNMSUB f1, f17, f2, f1 FNMSUB f0, f18, f2, f0 FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FNMSUB f2, f18, f0, f2 FNMSUB f3, f19, f0, f3 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FNMSUB f2, f18, f1, f2 FNMSUB f3, f19, f1, f3 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FNMSUB f3, f19, f2, f3 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * 
SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(71) .align 4 LL(80): andi. I, M, 2 ble LL(90) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(85) .align 5 LL(82): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 4 * SIZE dcbtst AO, PREA bdnz LL(82) .align 4 LL(85): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(88) .align 4 LL(86): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 2 * SIZE bdnz LL(86) .align 4 LL(88): FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(90): andi. I, M, 1 ble LL(999) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 3 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 3 mtspr CTR, r0 #endif ble LL(95) .align 5 LL(92): FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(92) .align 4 LL(95): #if defined(LT) || defined(RN) andi. r0, KK, 7 #else andi. 
r0, TEMP, 7 #endif mtspr CTR, r0 ble+ LL(98) .align 4 LL(96): FMADD f0, f16, f20, f0 LFD f16, 1 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 1 * SIZE bdnz LL(96) .align 4 LL(98): FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 1 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) FSUB f0, f16, f0 #else LFD f16, 0 * SIZE(AO) FSUB f0, f16, f0 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) #ifndef LN addi CO1, CO1, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) lwz r18, 196(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/trsm_kernel_power6_RT.S000066400000000000000000001730671313527062700221560ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define AORIG r18 #define TEMP r19 #define KK r20 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define CO3 r27 #define CO4 r28 #define PREA r29 #define PREC r31 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) stw r18, 196(SP) #endif stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef LN mullw r0, M, K slwi r0, r0, BASE_SHIFT add A, A, r0 slwi r0, M, BASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, BASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) 
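/*********************************************************************/
/* Annotation (not in the original source): overview of the kernel   */
/* body below, as read from the code itself.                         */
/*   - The column loop over N handles the N%1 tail first             */
/*     (LL(71)-LL(99)), then the N%2 tail (LL(40)-LL(69)), and       */
/*     finally full blocks of four columns starting at LL(10).       */
/*   - Within each column block, I walks the rows of A in blocks of  */
/*     4/2/1 (srawi. I, M, 2 / andi. I, M, 2 / andi. I, M, 1).       */
/*   - Each block first accumulates A*B with FMADD in a bdnz loop    */
/*     (unrolled 4x or 8x, remainder handled after the andi. mask),  */
/*     then applies the triangular solve with FMUL/FNMSUB against    */
/*     the diagonal block along the LN/LT/RN/RT preprocessor paths,  */
/*     and stores the result both to the packed buffer (AO or BO)    */
/*     and to the C tiles (CO1..CO4).                                */
/*********************************************************************/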
cmpwi cr0, K, 0 ble LL(999) li PREA, (16 * 3 * SIZE) li PREC, 4 * SIZE lfs f0, FZERO andi. J, N, 1 ble LL(40) #ifdef RT slwi r0, K, 0 + BASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO1, LDC #endif ble LL(80) .align 4 LL(71): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbtst CO1, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbtst CO1, PREC srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(75) .align 5 LL(72): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f21, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f23, f0 FMADD f1, f17, f23, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE dcbtst AO, PREA bdnz LL(72) .align 4 LL(75): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(78) .align 4 LL(76): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 4 * SIZE bdnz LL(76) .align 4 LL(78): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 1 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) LFD f24, 2 * SIZE(BO) LFD f28, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 FSUB f2, f24, f2 FSUB f3, f28, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FNMSUB f2, f17, f3, f2 FNMSUB f1, f18, f3, f1 FNMSUB f0, f19, f3, f0 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FNMSUB f1, f17, f2, f1 FNMSUB f0, f18, f2, f0 FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FNMSUB f2, f18, f0, f2 FNMSUB f3, f19, f0, f3 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FNMSUB f2, f18, f1, f2 FNMSUB f3, f19, f1, f3 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FNMSUB f3, f19, f2, f3 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(71) .align 4 LL(80): andi. I, M, 2 ble LL(90) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(85) .align 5 LL(82): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f21, f2 FMADD f3, f19, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f23, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 4 * SIZE dcbt AO, PREA bdnz LL(82) .align 4 LL(85): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(88) .align 4 LL(86): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 2 * SIZE bdnz LL(86) .align 4 LL(88): FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(90): andi. I, M, 1 ble LL(99) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 3 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. 
r0, TEMP, 3 mtspr CTR, r0 #endif ble LL(95) .align 5 LL(92): FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(92) .align 4 LL(95): #if defined(LT) || defined(RN) andi. r0, KK, 7 #else andi. r0, TEMP, 7 #endif mtspr CTR, r0 ble+ LL(98) .align 4 LL(96): FMADD f0, f16, f20, f0 LFD f16, 1 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 1 * SIZE bdnz LL(96) .align 4 LL(98): FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 1 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) FSUB f0, f16, f0 #else LFD f16, 0 * SIZE(AO) FSUB f0, f16, f0 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) lfs f0, FZERO #ifndef LN addi CO1, CO1, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(99): #ifdef LN slwi r0, K, 0 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 LL(40): andi. J, N, 2 ble LL(09) #ifdef RT slwi r0, K, 1 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif ble LL(50) .align 4 LL(41): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbtst CO1, PREC dcbtst CO2, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbtst CO1, PREC dcbtst CO2, PREC srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(45) .align 5 LL(42): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMADD f0, f16, f22, f0 FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 FMADD f4, f16, f23, f4 FMADD f5, f17, f23, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE dcbtst AO, PREA bdnz LL(42) .align 4 LL(45): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(48) .align 4 LL(46): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(46) .align 4 LL(48): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 2 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f28, 6 * SIZE(BO) LFD f29, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f3, f28, f3 FSUB f7, f29, f7 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FMUL f1, f19, f1 FMUL f5, f19, f5 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FMUL f0, f21, f0 FMUL f4, f21, f4 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, 
f0 FMUL f4, f16, f4 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, f5 FNMSUB f2, f18, f1, f2 FNMSUB f6, f18, f5, f6 FNMSUB f3, f19, f1, f3 FNMSUB f7, f19, f5, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FMUL f6, f18, f6 FNMSUB f3, f19, f2, f3 FNMSUB f7, f19, f6, f7 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 FMUL f7, f19, f7 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FMUL f4, f18, f4 FMUL f5, f18, f5 FMUL f6, f18, f6 FMUL f7, f18, f7 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f5, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f6, 5 * SIZE(BO) STFD f3, 6 * SIZE(BO) STFD f7, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(41) .align 4 LL(50): andi. I, M, 2 ble LL(60) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(55) .align 5 LL(52): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 FMADD f4, f18, f22, f4 FMADD f5, f19, f22, f5 FMADD f6, f18, f23, f6 FMADD f7, f19, f23, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f16, f24, f0 FMADD f1, f17, f24, f1 FMADD f2, f16, f25, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f26, f4 FMADD f5, f19, f26, f5 FMADD f6, f18, f27, f6 FMADD f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE dcbt AO, PREA bdnz LL(52) .align 4 LL(55): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(58) .align 4 LL(56): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f16, f21, f2 FMADD f3, f17, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 2 * SIZE bdnz LL(56) .align 4 LL(58): FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 2 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f2, f17, f2 FSUB f1, f20, f1 FSUB f3, f21, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f3, f19, f3 FNMSUB f0, f20, f1, f0 FNMSUB f2, f20, f3, f2 FMUL f0, f21, f0 FMUL f2, f21, f2 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f2, f16, f2 FNMSUB f1, f17, f0, f1 FNMSUB f3, f17, f2, f3 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f3, f17, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f2, f17, f0, f2 FNMSUB f3, f17, f1, f3 FMUL f2, f18, f2 FMUL f3, f18, f3 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f2, f19, f2 FMUL f3, f19, f3 FNMSUB f0, f20, f2, f0 FNMSUB f1, f20, f3, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f2, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(60): andi. 
I, M, 1 ble LL(69) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(65) .align 5 LL(62): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f17, f22, f2 FMADD f3, f17, f23, f3 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f19, f26, f2 FMADD f3, f19, f27, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(62) .align 4 LL(65): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(68) .align 4 LL(66): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 LFD f16, 1 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 1 * SIZE bdnz LL(66) .align 4 LL(68): FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f20, f1 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FMUL f1, f18, f1 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(69): #ifdef LN slwi r0, K, 1 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif lfs f0, FZERO .align 4 LL(09): srawi. 
J, N, 2 ble LL(999) .align 4 LL(10): #ifdef RT slwi r0, K, 2 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 2 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO4, LDC #endif ble LL(20) .align 4 LL(11): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC srawi. r0, KK, 3 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC srawi. r0, TEMP, 3 mtspr CTR, r0 #endif ble LL(15) .align 4 LL(12): dcbt AO, PREA dcbtst BO, PREA FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 4 * SIZE(AO) LFD f28, 4 * SIZE(BO) LFD f25, 5 * SIZE(AO) LFD f29, 5 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 6 * SIZE(AO) LFD f30, 6 * SIZE(BO) LFD f27, 7 * SIZE(AO) LFD f31, 7 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 8 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f17, 9 * SIZE(AO) LFD f21, 9 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 10 * SIZE(AO) LFD f22, 10 * SIZE(BO) LFD f19, 11 * SIZE(AO) LFD f23, 11 * SIZE(BO) FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 12 * SIZE(AO) LFD f28, 12 * SIZE(BO) LFD f25, 13 * SIZE(AO) LFD f29, 13 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 14 * SIZE(AO) LFD f30, 14 * SIZE(BO) LFD f27, 15 * SIZE(AO) LFD f31, 15 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 16 * SIZE(AO) LFD f20, 16 * SIZE(BO) LFD f17, 17 * SIZE(AO) LFD f21, 17 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 18 * SIZE(AO) LFD f22, 18 * SIZE(BO) LFD f19, 19 * SIZE(AO) LFD f23, 19 * SIZE(BO) FMADD f3, f27, f28, f3 FMADD f7, 
f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 20 * SIZE(AO) LFD f28, 20 * SIZE(BO) LFD f25, 21 * SIZE(AO) LFD f29, 21 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 22 * SIZE(AO) LFD f30, 22 * SIZE(BO) LFD f27, 23 * SIZE(AO) LFD f31, 23 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 24 * SIZE(AO) LFD f20, 24 * SIZE(BO) LFD f17, 25 * SIZE(AO) LFD f21, 25 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 26 * SIZE(AO) LFD f22, 26 * SIZE(BO) LFD f19, 27 * SIZE(AO) LFD f23, 27 * SIZE(BO) FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 28 * SIZE(AO) LFD f28, 28 * SIZE(BO) LFD f25, 29 * SIZE(AO) LFD f29, 29 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 30 * SIZE(AO) LFD f30, 30 * SIZE(BO) LFD f27, 31 * SIZE(AO) LFD f31, 31 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 32 * SIZE(AO) LFD f20, 32 * SIZE(BO) LFD f17, 33 * SIZE(AO) LFD f21, 33 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 34 * SIZE(AO) LFD f22, 34 * SIZE(BO) LFD f19, 35 * SIZE(AO) LFD f23, 35 * SIZE(BO) addi AO, AO, 32 * SIZE addi BO, BO, 32 * SIZE FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 bdnz LL(12) .align 4 LL(15): #if defined(LT) || defined(RN) andi. r0, KK, 7 #else andi. 
r0, TEMP, 7 #endif mtspr CTR, r0 ble+ LL(18) .align 4 LL(16): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(16) .align 4 LL(18): #if defined(LN) || defined(RT) subi r0, KK, 4 slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f24, 8 * SIZE(BO) LFD f25, 9 * SIZE(BO) LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f10, f26, f10 FSUB f14, f27, f14 FSUB f3, f28, f3 FSUB f7, f29, f7 FSUB f11, f30, f11 FSUB f15, f31, f15 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f24, 8 * SIZE(AO) LFD f25, 9 * SIZE(AO) LFD f26, 10 * SIZE(AO) LFD f27, 11 * SIZE(AO) LFD f28, 12 * SIZE(AO) LFD f29, 13 * SIZE(AO) LFD f30, 14 * SIZE(AO) LFD f31, 15 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f10, f26, f10 FSUB f11, f27, f11 FSUB f12, f28, f12 FSUB f13, f29, f13 FSUB f14, f30, f14 FSUB f15, f31, f15 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FMUL f11, f16, f11 FMUL f15, f16, f15 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f10, f17, f11, f10 FNMSUB f14, f17, f15, f14 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f9, f18, f11, f9 FNMSUB f13, f18, f15, f13 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 FNMSUB f8, f19, f11, f8 FNMSUB f12, f19, f15, f12 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FMUL f10, f16, f10 FMUL f14, f16, f14 LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f9, f17, f10, f9 FNMSUB f13, f17, f14, f13 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FNMSUB f8, f18, f10, f8 FNMSUB f12, f18, f14, f12 FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB 
f10, f18, f8, f10 FNMSUB f14, f18, f12, f14 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 FNMSUB f11, f19, f8, f11 FNMSUB f15, f19, f12, f15 LFD f16, 5 * SIZE(AO) LFD f17, 6 * SIZE(AO) LFD f18, 7 * SIZE(AO) LFD f19, 10 * SIZE(AO) FMUL f1, f16, f1 FMUL f5, f16, f5 FMUL f9, f16, f9 FMUL f13, f16, f13 LFD f20, 11 * SIZE(AO) LFD f21, 15 * SIZE(AO) FNMSUB f2, f17, f1, f2 FNMSUB f6, f17, f5, f6 FNMSUB f10, f17, f9, f10 FNMSUB f14, f17, f13, f14 FNMSUB f3, f18, f1, f3 FNMSUB f7, f18, f5, f7 FNMSUB f11, f18, f9, f11 FNMSUB f15, f18, f13, f15 FMUL f2, f19, f2 FMUL f6, f19, f6 FMUL f10, f19, f10 FMUL f14, f19, f14 FNMSUB f3, f20, f2, f3 FNMSUB f7, f20, f6, f7 FNMSUB f11, f20, f10, f11 FNMSUB f15, f20, f14, f15 FMUL f3, f21, f3 FMUL f7, f21, f7 FMUL f11, f21, f11 FMUL f15, f21, f15 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 FNMSUB f14, f19, f2, f14 FNMSUB f15, f19, f3, f15 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FMUL f6, f16, f6 FMUL f7, f16, f7 LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f10, f17, f6, f10 FNMSUB f11, f17, f7, f11 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FNMSUB f14, f18, f6, f14 FNMSUB f15, f18, f7, f15 FMUL f8, f19, f8 FMUL f9, f19, f9 FMUL f10, f19, f10 FMUL f11, f19, f11 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FNMSUB f14, f20, f10, f14 FNMSUB f15, f20, f11, f15 FMUL f12, f21, f12 FMUL f13, f21, f13 FMUL f14, f21, f14 FMUL f15, f21, f15 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FMUL f14, f16, f14 FMUL f15, f16, f15 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f10, f17, f14, f10 FNMSUB f11, f17, f15, f11 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f6, f18, f14, f6 FNMSUB f7, f18, f15, f7 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 FNMSUB f2, f19, f14, f2 FNMSUB f3, f19, f15, f3 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FMUL f10, f16, f10 FMUL f11, f16, f11 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f6, f17, f10, f6 FNMSUB f7, f17, f11, f7 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE subi CO3, CO3, 4 * SIZE subi CO4, CO4, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) STFD f2, 8 * SIZE(BO) STFD f6, 9 * SIZE(BO) STFD f10, 10 * SIZE(BO) STFD f14, 11 * SIZE(BO) STFD f3, 12 * SIZE(BO) STFD f7, 13 * SIZE(BO) STFD f11, 14 * SIZE(BO) STFD f15, 15 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) 
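/* (RN/RT branch) The solved 4x4 block is written back into the packed A
   panel through AO here, while the LN/LT branch above writes it into the
   packed B panel through BO; both branches then copy the block out to C
   via CO1..CO4 below.  Inferred from the surrounding LN/LT conditionals. */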
STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) STFD f8, 8 * SIZE(AO) STFD f9, 9 * SIZE(AO) STFD f10, 10 * SIZE(AO) STFD f11, 11 * SIZE(AO) STFD f12, 12 * SIZE(AO) STFD f13, 13 * SIZE(AO) STFD f14, 14 * SIZE(AO) STFD f15, 15 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f10, 2 * SIZE(CO3) STFD f11, 3 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) STFD f14, 2 * SIZE(CO4) STFD f15, 3 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif addic. I, I, -1 bgt+ LL(11) .align 4 LL(20): andi. I, M, 2 ble LL(30) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(25) .align 5 LL(22): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f2, f18, f24, f2 FMADD f3, f19, f24, f3 FMADD f6, f18, f25, f6 FMADD f7, f19, f25, f7 FMADD f10, f18, f26, f10 FMADD f11, f19, f26, f11 FMADD f14, f18, f27, f14 FMADD f15, f19, f27, f15 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 16 * SIZE dcbtst AO, PREA bdnz LL(22) fadd f0, f2, f0 fadd f1, f3, f1 fadd f4, f6, f4 fadd f5, f7, f5 fadd f8, f10, f8 fadd f9, f11, f9 fadd f12, f14, f12 fadd f13, f15, f13 .align 4 LL(25): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(28) .align 4 LL(26): FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f4, f16, f21, f4 FMADD f5, f17, f21, f5 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 2 * SIZE bdnz LL(26) .align 4 LL(28): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 4 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f28, 6 * SIZE(AO) LFD f29, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f12, f28, f12 FSUB f13, f29, f13 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, 
f5 FMUL f9, f17, f9 FMUL f13, f17, f13 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FMUL f8, f19, f8 FMUL f9, f19, f9 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FMUL f12, f21, f12 FMUL f13, f21, f13 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FMUL f4, f19, f4 FMUL f5, f19, f5 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE subi CO3, CO3, 2 * SIZE subi CO4, CO4, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f4, 2 * SIZE(AO) STFD f5, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f12, 6 * SIZE(AO) STFD f13, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 LL(30): andi. I, M, 1 ble LL(39) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(35) .align 5 LL(32): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f1, f17, f24, f1 FMADD f5, f17, f25, f5 FMADD f9, f17, f26, f9 FMADD f13, f17, f27, f13 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMADD f0, f18, f20, f0 FMADD f4, f18, f21, f4 FMADD f8, f18, f22, f8 FMADD f12, f18, f23, f12 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f1, f19, f24, f1 FMADD f5, f19, f25, f5 FMADD f9, f19, f26, f9 FMADD f13, f19, f27, f13 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 16 * SIZE dcbtst AO, PREA bdnz LL(32) fadd f0, f1, f0 fadd f4, f5, f4 fadd f8, f9, f8 fadd f12, f13, f12 .align 4 LL(35): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ LL(38) .align 4 LL(36): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f16, 1 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 1 * SIZE bdnz LL(36) .align 4 LL(38): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 4 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) LFD f24, 2 * SIZE(AO) LFD f28, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f4, f20, f4 FSUB f8, f24, f8 FSUB f12, f28, f12 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f4, f17, f0, f4 FNMSUB f8, f18, f0, f8 FNMSUB f12, f19, f0, f12 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FNMSUB f8, f17, f4, f8 FNMSUB f12, f18, f4, f12 FMUL f8, f19, f8 FNMSUB f12, f20, f8, f12 FMUL f12, f21, f12 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FNMSUB f8, f17, f12, f8 FNMSUB f4, f18, f12, f4 FNMSUB f0, f19, f12, f0 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f0, f18, f8, f0 FMUL f4, f19, f4 FNMSUB f0, f20, f4, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE subi CO3, CO3, 1 * SIZE subi CO4, CO4, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f4, 1 * SIZE(AO) STFD f8, 2 * SIZE(AO) STFD f12, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) lfs f0, 
FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 fmr f8, f0 fmr f9, f0 fmr f12, f0 fmr f13, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE addi CO3, CO3, 1 * SIZE addi CO4, CO4, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 LL(39): #ifdef LN slwi r0, K, 2 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 4 #endif #ifdef RT subi KK, KK, 4 #endif addic. J, J, -1 lfs f0, FZERO bgt LL(10) .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) lwz r18, 196(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/trsm_kernel_ppc440_LN.S000066400000000000000000001636411313527062700217270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define AORIG r18 #define TEMP r19 #define KK r20 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define CO3 r27 #define CO4 r28 #define A1 f16 #define A2 f17 #define A3 f18 #define A4 f19 #define A5 f20 #define A6 f21 #define B1 f22 #define B2 f23 #define B3 f24 #define B4 f25 #define B5 f26 #define B6 f27 #define B7 f28 #define B8 f29 #define B9 f30 #define B10 f31 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) stw r18, 196(SP) #endif stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef LN mullw r0, M, K slwi r0, r0, BASE_SHIFT add A, A, r0 slwi r0, M, BASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, BASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble .L999 cmpwi cr0, N, 0 ble .L999 cmpwi cr0, K, 0 ble .L999 lfs f0, FZERO srawi. J, N, 2 ble .L40 .align 4 .L10: #ifdef RT slwi r0, K, 2 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 2 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO4, LDC #endif .L30: andi. 
I, M, 1 ble .L20 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L35 .align 5 .L32: FMADD f0, f16, f20, f0 LFD f20, 8 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f21, 9 * SIZE(BO) FMADD f8, f16, f22, f8 LFD f22, 10 * SIZE(BO) FMADD f12, f16, f23, f12 LFD f23, 11 * SIZE(BO) LFDU f16, 4 * SIZE(AO) FMADD f1, f17, f24, f1 LFD f24, 12 * SIZE(BO) FMADD f5, f17, f25, f5 LFD f25, 13 * SIZE(BO) FMADD f9, f17, f26, f9 LFD f26, 14 * SIZE(BO) FMADD f13, f17, f27, f13 LFD f27, 15 * SIZE(BO) LFD f17, 1 * SIZE(AO) FMADD f0, f18, f20, f0 LFDU f20, 16 * SIZE(BO) FMADD f4, f18, f21, f4 LFD f21, 1 * SIZE(BO) FMADD f8, f18, f22, f8 LFD f22, 2 * SIZE(BO) FMADD f12, f18, f23, f12 LFD f23, 3 * SIZE(BO) LFD f18, 2 * SIZE(AO) FMADD f1, f19, f24, f1 LFD f24, 4 * SIZE(BO) FMADD f5, f19, f25, f5 LFD f25, 5 * SIZE(BO) FMADD f9, f19, f26, f9 LFD f26, 6 * SIZE(BO) FMADD f13, f19, f27, f13 LFD f27, 7 * SIZE(BO) LFD f19, 3 * SIZE(AO) bdnz .L32 fadd f0, f1, f0 fadd f4, f5, f4 fadd f8, f9, f8 fadd f12, f13, f12 .align 4 .L35: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L38 .align 4 .L36: FMADD f0, f16, f20, f0 LFDU f20, 4 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f21, 1 * SIZE(BO) FMADD f8, f16, f22, f8 LFD f22, 2 * SIZE(BO) FMADD f12, f16, f23, f12 LFDU f16, 1 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L36 .align 4 .L38: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 4 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) LFD f24, 2 * SIZE(AO) LFD f28, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f4, f20, f4 FSUB f8, f24, f8 FSUB f12, f28, f12 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f4, f17, f0, f4 FNMSUB f8, f18, f0, f8 FNMSUB f12, f19, f0, f12 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FNMSUB f8, f17, f4, f8 FNMSUB f12, f18, f4, f12 FMUL f8, f19, f8 FNMSUB f12, f20, f8, f12 FMUL f12, f21, f12 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FNMSUB f8, f17, f12, f8 FNMSUB f4, f18, f12, f4 FNMSUB f0, f19, f12, f0 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f0, f18, f8, f0 FMUL f4, f19, f4 FNMSUB f0, f20, f4, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE subi CO3, CO3, 1 * SIZE subi CO4, CO4, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f4, 1 * SIZE(AO) STFD f8, 2 * SIZE(AO) STFD f12, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 fmr f8, f0 fmr f9, f0 fmr f12, f0 fmr f13, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE addi CO3, CO3, 1 * SIZE addi CO4, CO4, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 .L20: andi. I, M, 2 ble .L09 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. 
r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L25 .align 5 .L22: FMADD f0, f16, f20, f0 nop FMADD f1, f17, f20, f1 LFD f20, 8 * SIZE(BO) FMADD f4, f16, f21, f4 nop FMADD f5, f17, f21, f5 LFD f21, 9 * SIZE(BO) FMADD f8, f16, f22, f8 nop FMADD f9, f17, f22, f9 LFD f22, 10 * SIZE(BO) FMADD f12, f16, f23, f12 LFD f16, 4 * SIZE(AO) FMADD f13, f17, f23, f13 LFD f23, 11 * SIZE(BO) FMADD f2, f18, f24, f2 LFD f17, 5 * SIZE(AO) FMADD f3, f19, f24, f3 LFD f24, 12 * SIZE(BO) FMADD f6, f18, f25, f6 nop FMADD f7, f19, f25, f7 LFD f25, 13 * SIZE(BO) FMADD f10, f18, f26, f10 nop FMADD f11, f19, f26, f11 LFD f26, 14 * SIZE(BO) FMADD f14, f18, f27, f14 LFD f18, 6 * SIZE(AO) FMADD f15, f19, f27, f15 LFD f27, 15 * SIZE(BO) FMADD f0, f16, f20, f0 LFD f19, 7 * SIZE(AO) FMADD f1, f17, f20, f1 LFDU f20, 16 * SIZE(BO) FMADD f4, f16, f21, f4 nop FMADD f5, f17, f21, f5 LFD f21, 1 * SIZE(BO) FMADD f8, f16, f22, f8 nop FMADD f9, f17, f22, f9 LFD f22, 2 * SIZE(BO) FMADD f12, f16, f23, f12 LFDU f16, 8 * SIZE(AO) FMADD f13, f17, f23, f13 LFD f23, 3 * SIZE(BO) FMADD f2, f18, f24, f2 LFD f17, 1 * SIZE(AO) FMADD f3, f19, f24, f3 LFD f24, 4 * SIZE(BO) FMADD f6, f18, f25, f6 nop FMADD f7, f19, f25, f7 LFD f25, 5 * SIZE(BO) FMADD f10, f18, f26, f10 nop FMADD f11, f19, f26, f11 LFD f26, 6 * SIZE(BO) FMADD f14, f18, f27, f14 LFD f18, 2 * SIZE(AO) FMADD f15, f19, f27, f15 LFD f19, 3 * SIZE(AO) LFD f27, 7 * SIZE(BO) bdnz .L22 fadd f0, f2, f0 fadd f1, f3, f1 fadd f4, f6, f4 fadd f5, f7, f5 fadd f8, f10, f8 fadd f9, f11, f9 fadd f12, f14, f12 fadd f13, f15, f13 .align 4 .L25: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L28 .align 4 .L26: FMADD f0, f16, f20, f0 nop FMADD f1, f17, f20, f1 LFDU f20, 4 * SIZE(BO) FMADD f4, f16, f21, f4 nop FMADD f5, f17, f21, f5 LFD f21, 1 * SIZE(BO) FMADD f8, f16, f22, f8 nop FMADD f9, f17, f22, f9 LFD f22, 2 * SIZE(BO) FMADD f12, f16, f23, f12 LFDU f16, 2 * SIZE(AO) FMADD f13, f17, f23, f13 LFD f17, 1 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L26 .align 4 .L28: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 4 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f28, 6 * SIZE(AO) LFD f29, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f12, f28, f12 FSUB f13, f29, f13 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, f5 FMUL f9, f17, f9 FMUL f13, f17, f13 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FMUL f8, f19, f8 FMUL f9, f19, f9 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FMUL f12, f21, f12 FMUL f13, f21, f13 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FMUL f4, f19, f4 FMUL f5, f19, f5 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE subi CO3, CO3, 2 * SIZE subi CO4, CO4, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 
* SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f4, 2 * SIZE(AO) STFD f5, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f12, 6 * SIZE(AO) STFD f13, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 .L09: srawi. I, M, 2 ble .L39 .align 4 .L11: #if defined(LT) || defined(RN) LFD A1, 0 * SIZE(AO) LFD A2, 1 * SIZE(AO) LFD A4, 4 * SIZE(AO) LFD A5, 8 * SIZE(AO) LFD B1, 0 * SIZE(B) LFD B2, 1 * SIZE(B) LFD B3, 2 * SIZE(B) LFD B4, 3 * SIZE(B) LFD B5, 4 * SIZE(B) LFD B6, 8 * SIZE(B) LFD B7, 12 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD A1, 0 * SIZE(AO) LFD A2, 1 * SIZE(AO) LFD A4, 4 * SIZE(AO) LFD A5, 8 * SIZE(AO) LFD B1, 0 * SIZE(BO) LFD B2, 1 * SIZE(BO) LFD B3, 2 * SIZE(BO) LFD B4, 3 * SIZE(BO) LFD B5, 4 * SIZE(BO) LFD B6, 8 * SIZE(BO) LFD B7, 12 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L15 .align 4 .L12: FMADD f0, A1, B1, f0 LFD A3, 2 * SIZE(AO) FMADD f4, A1, B2, f4 LFD A6, 12 * SIZE(AO) FMADD f8, A1, B3, f8 nop FMADD f12, A1, B4, f12 nop FMADD f1, A2, B1, f1 LFD A1, 3 * SIZE(AO) FMADD f5, A2, B2, f5 nop FMADD f9, A2, B3, f9 nop FMADD f13, A2, B4, f13 nop FMADD f2, A3, B1, f2 nop FMADD f6, A3, B2, f6 LFD B8, 5 * SIZE(BO) FMADD f10, A3, B3, f10 LFD B9, 6 * SIZE(BO) FMADD f14, A3, B4, f14 LFD B10, 7 * SIZE(BO) FMADD f3, A1, B1, f3 LFD A2, 5 * SIZE(AO) FMADD f7, A1, B2, f7 LFD B1, 16 * SIZE(BO) FMADD f11, A1, B3, f11 nop FMADD f15, A1, B4, f15 nop FMADD f0, A4, B5, f0 LFD A3, 6 * SIZE(AO) FMADD f4, A4, B8, f4 LFD A1, 16 * SIZE(AO) FMADD f8, A4, B9, f8 nop FMADD f12, A4, B10, f12 nop FMADD f1, A2, B5, f1 LFD A4, 7 * SIZE(AO) FMADD f5, A2, B8, f5 nop FMADD f9, A2, B9, f9 nop FMADD f13, A2, B10, f13 nop FMADD f2, A3, B5, f2 nop FMADD f6, A3, B8, f6 LFD B2, 9 * SIZE(BO) FMADD f10, A3, B9, f10 LFD B3, 10 * SIZE(BO) FMADD f14, A3, B10, f14 LFD B4, 11 * SIZE(BO) FMADD f3, A4, B5, f3 LFD A2, 9 * SIZE(AO) FMADD f7, A4, B8, f7 LFD B5, 20 * SIZE(BO) FMADD f11, A4, B9, f11 nop FMADD f15, A4, B10, f15 nop FMADD f0, A5, B6, f0 LFD A3, 10 * SIZE(AO) FMADD f4, A5, B2, f4 LFD A4, 20 * SIZE(AO) FMADD f8, A5, B3, f8 nop FMADD f12, A5, B4, f12 nop FMADD f1, A2, B6, f1 LFD A5, 11 * SIZE(AO) FMADD f5, A2, B2, f5 nop FMADD f9, A2, B3, f9 nop FMADD f13, A2, B4, f13 nop FMADD f2, A3, B6, f2 nop FMADD f6, A3, B2, f6 LFD B8, 13 * SIZE(BO) FMADD f10, A3, B3, f10 LFD B9, 14 * SIZE(BO) FMADD f14, A3, B4, f14 LFD B10,15 * SIZE(BO) FMADD f3, A5, B6, f3 LFD A2, 13 * SIZE(AO) FMADD f7, A5, B2, f7 LFD B6, 24 * SIZE(BO) FMADD f11, A5, B3, f11 nop FMADD f15, A5, B4, f15 nop FMADD f0, A6, B7, f0 LFD A3, 14 * SIZE(AO) FMADD f4, A6, B8, f4 LFD A5, 24 * SIZE(AO) FMADD f8, A6, B9, f8 nop FMADD f12, A6, 
B10, f12 nop FMADD f1, A2, B7, f1 LFD A6, 15 * SIZE(AO) FMADD f5, A2, B8, f5 nop FMADD f9, A2, B9, f9 nop FMADD f13, A2, B10, f13 nop FMADD f2, A3, B7, f2 addi AO, AO, 16 * SIZE FMADD f6, A3, B8, f6 LFD B2, 17 * SIZE(BO) FMADD f10, A3, B9, f10 LFD B3, 18 * SIZE(BO) FMADD f14, A3, B10, f14 LFD B4, 19 * SIZE(BO) FMADD f3, A6, B7, f3 LFD A2, 1 * SIZE(AO) FMADD f7, A6, B8, f7 LFD B7, 28 * SIZE(BO) FMADD f11, A6, B9, f11 addi BO, BO, 16 * SIZE FMADD f15, A6, B10, f15 bdnz .L12 .align 4 .L15: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L18 .align 4 .L16: FMADD f0, A1, B1, f0 LFD A3, 2 * SIZE(AO) FMADD f4, A1, B2, f4 FMADD f8, A1, B3, f8 FMADD f12, A1, B4, f12 LFD A4, 3 * SIZE(AO) FMADD f1, A2, B1, f1 FMADD f5, A2, B2, f5 FMADD f9, A2, B3, f9 FMADD f13, A2, B4, f13 LFDU A1, 4 * SIZE(AO) FMADD f2, A3, B1, f2 FMADD f6, A3, B2, f6 FMADD f10, A3, B3, f10 FMADD f14, A3, B4, f14 LFD A2, 1 * SIZE(AO) FMADD f3, A4, B1, f3 LFDU B1, 4 * SIZE(BO) FMADD f7, A4, B2, f7 LFD B2, 1 * SIZE(BO) FMADD f11, A4, B3, f11 LFD B3, 2 * SIZE(BO) FMADD f15, A4, B4, f15 LFD B4, 3 * SIZE(BO) bdnz .L16 .align 4 .L18: #if defined(LN) || defined(RT) subi r0, KK, 4 slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f24, 8 * SIZE(BO) LFD f25, 9 * SIZE(BO) LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f10, f26, f10 FSUB f14, f27, f14 FSUB f3, f28, f3 FSUB f7, f29, f7 FSUB f11, f30, f11 FSUB f15, f31, f15 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f24, 8 * SIZE(AO) LFD f25, 9 * SIZE(AO) LFD f26, 10 * SIZE(AO) LFD f27, 11 * SIZE(AO) LFD f28, 12 * SIZE(AO) LFD f29, 13 * SIZE(AO) LFD f30, 14 * SIZE(AO) LFD f31, 15 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f10, f26, f10 FSUB f11, f27, f11 FSUB f12, f28, f12 FSUB f13, f29, f13 FSUB f14, f30, f14 FSUB f15, f31, f15 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FMUL f11, f16, f11 FMUL f15, f16, f15 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f10, f17, f11, f10 FNMSUB f14, f17, f15, f14 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f9, f18, f11, f9 FNMSUB f13, f18, f15, f13 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 FNMSUB f8, f19, f11, f8 FNMSUB f12, f19, f15, f12 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FMUL f10, f16, f10 FMUL f14, f16, f14 LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f9, f17, f10, f9 FNMSUB f13, f17, f14, f13 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FNMSUB f8, f18, f10, f8 FNMSUB f12, f18, f14, f12 FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB 
f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB f10, f18, f8, f10 FNMSUB f14, f18, f12, f14 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 FNMSUB f11, f19, f8, f11 FNMSUB f15, f19, f12, f15 LFD f16, 5 * SIZE(AO) LFD f17, 6 * SIZE(AO) LFD f18, 7 * SIZE(AO) LFD f19, 10 * SIZE(AO) FMUL f1, f16, f1 FMUL f5, f16, f5 FMUL f9, f16, f9 FMUL f13, f16, f13 LFD f20, 11 * SIZE(AO) LFD f21, 15 * SIZE(AO) FNMSUB f2, f17, f1, f2 FNMSUB f6, f17, f5, f6 FNMSUB f10, f17, f9, f10 FNMSUB f14, f17, f13, f14 FNMSUB f3, f18, f1, f3 FNMSUB f7, f18, f5, f7 FNMSUB f11, f18, f9, f11 FNMSUB f15, f18, f13, f15 FMUL f2, f19, f2 FMUL f6, f19, f6 FMUL f10, f19, f10 FMUL f14, f19, f14 FNMSUB f3, f20, f2, f3 FNMSUB f7, f20, f6, f7 FNMSUB f11, f20, f10, f11 FNMSUB f15, f20, f14, f15 FMUL f3, f21, f3 FMUL f7, f21, f7 FMUL f11, f21, f11 FMUL f15, f21, f15 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 FNMSUB f14, f19, f2, f14 FNMSUB f15, f19, f3, f15 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FMUL f6, f16, f6 FMUL f7, f16, f7 LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f10, f17, f6, f10 FNMSUB f11, f17, f7, f11 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FNMSUB f14, f18, f6, f14 FNMSUB f15, f18, f7, f15 FMUL f8, f19, f8 FMUL f9, f19, f9 FMUL f10, f19, f10 FMUL f11, f19, f11 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FNMSUB f14, f20, f10, f14 FNMSUB f15, f20, f11, f15 FMUL f12, f21, f12 FMUL f13, f21, f13 FMUL f14, f21, f14 FMUL f15, f21, f15 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FMUL f14, f16, f14 FMUL f15, f16, f15 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f10, f17, f14, f10 FNMSUB f11, f17, f15, f11 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f6, f18, f14, f6 FNMSUB f7, f18, f15, f7 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 FNMSUB f2, f19, f14, f2 FNMSUB f3, f19, f15, f3 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FMUL f10, f16, f10 FMUL f11, f16, f11 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f6, f17, f10, f6 FNMSUB f7, f17, f11, f7 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE subi CO3, CO3, 4 * SIZE subi CO4, CO4, 4 
* SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) STFD f2, 8 * SIZE(BO) STFD f6, 9 * SIZE(BO) STFD f10, 10 * SIZE(BO) STFD f14, 11 * SIZE(BO) STFD f3, 12 * SIZE(BO) STFD f7, 13 * SIZE(BO) STFD f11, 14 * SIZE(BO) STFD f15, 15 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) STFD f8, 8 * SIZE(AO) STFD f9, 9 * SIZE(AO) STFD f10, 10 * SIZE(AO) STFD f11, 11 * SIZE(AO) STFD f12, 12 * SIZE(AO) STFD f13, 13 * SIZE(AO) STFD f14, 14 * SIZE(AO) STFD f15, 15 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f10, 2 * SIZE(CO3) STFD f11, 3 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) STFD f14, 2 * SIZE(CO4) STFD f15, 3 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif addic. I, I, -1 bgt+ .L11 .align 4 .L39: #ifdef LN slwi r0, K, 2 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 4 #endif #ifdef RT subi KK, KK, 4 #endif addic. J, J, -1 lfs f0, FZERO bgt .L10 .align 4 .L40: andi. J, N, 2 ble .L70 #ifdef RT slwi r0, K, 1 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif .L60: andi. I, M, 1 ble .L50 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble .L65 .align 5 .L62: FMADD f0, f16, f20, f0 LFDU f20, 8 * SIZE(BO) FMADD f1, f16, f21, f1 LFDU f16, 4 * SIZE(AO) LFD f21, 1 * SIZE(BO) FMADD f2, f17, f22, f2 LFD f22, 2 * SIZE(BO) FMADD f3, f17, f23, f3 LFD f17, 1 * SIZE(AO) LFD f23, 3 * SIZE(BO) FMADD f0, f18, f24, f0 LFD f24, 4 * SIZE(BO) FMADD f1, f18, f25, f1 LFD f18, 2 * SIZE(AO) LFD f25, 5 * SIZE(BO) FMADD f2, f19, f26, f2 LFD f26, 6 * SIZE(BO) FMADD f3, f19, f27, f3 LFD f19, 3 * SIZE(AO) LFD f27, 7 * SIZE(BO) bdnz .L62 .align 4 .L65: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L68 .align 4 .L66: FMADD f0, f16, f20, f0 LFDU f20, 2 * SIZE(BO) FMADD f1, f16, f21, f1 LFDU f16, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) bdnz .L66 .align 4 .L68: FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f20, f1 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FMUL f1, f18, f1 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 .L50: andi. I, M, 2 ble .L41 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble .L55 .align 5 .L52: FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFDU f20, 8 * SIZE(BO) FMADD f2, f16, f21, f2 LFD f16, 4 * SIZE(AO) FMADD f3, f17, f21, f3 LFD f17, 5 * SIZE(AO) FMADD f4, f18, f22, f4 LFD f21, 1 * SIZE(BO) FMADD f5, f19, f22, f5 LFD f22, 2 * SIZE(BO) FMADD f6, f18, f23, f6 LFD f18, 6 * SIZE(AO) FMADD f7, f19, f23, f7 LFD f19, 7 * SIZE(AO) FMADD f0, f16, f24, f0 LFD f23, 3 * SIZE(BO) FMADD f1, f17, f24, f1 LFD f24, 4 * SIZE(BO) FMADD f2, f16, f25, f2 LFDU f16, 8 * SIZE(AO) FMADD f3, f17, f25, f3 LFD f17, 1 * SIZE(AO) FMADD f4, f18, f26, f4 LFD f25, 5 * SIZE(BO) FMADD f5, f19, f26, f5 LFD f26, 6 * SIZE(BO) FMADD f6, f18, f27, f6 LFD f18, 2 * SIZE(AO) FMADD f7, f19, f27, f7 LFD f19, 3 * SIZE(AO) LFD f27, 7 * SIZE(BO) bdnz .L52 .align 4 .L55: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L58 .align 4 .L56: FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFDU f20, 2 * SIZE(BO) FMADD f2, f16, f21, f2 LFDU f16, 2 * SIZE(AO) FMADD f3, f17, f21, f3 LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) bdnz .L56 .align 4 .L58: FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 2 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f2, f17, f2 FSUB f1, f20, f1 FSUB f3, f21, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f3, f19, f3 FNMSUB f0, f20, f1, f0 FNMSUB f2, f20, f3, f2 FMUL f0, f21, f0 FMUL f2, f21, f2 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f2, f16, f2 FNMSUB f1, f17, f0, f1 FNMSUB f3, f17, f2, f3 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f3, f17, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f2, f17, f0, f2 FNMSUB f3, f17, f1, f3 FMUL f2, f18, f2 FMUL f3, f18, f3 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f2, f19, f2 FMUL f3, f19, f3 FNMSUB f0, f20, f2, f0 FNMSUB f1, f20, f3, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f2, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 .L41: srawi. 
I, M, 2 ble .L69 .align 4 .L42: #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L45 .align 5 .L43: FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f20, 4 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f16, 4 * SIZE(AO) FMADD f5, f17, f21, f5 LFD f17, 5 * SIZE(AO) FMADD f6, f18, f21, f6 LFD f18, 6 * SIZE(AO) FMADD f7, f19, f21, f7 LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 LFD f21, 5 * SIZE(BO) FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f22, 6 * SIZE(BO) FMADD f4, f16, f23, f4 LFD f16, 8 * SIZE(AO) FMADD f5, f17, f23, f5 LFD f17, 9 * SIZE(AO) FMADD f6, f18, f23, f6 LFD f18, 10 * SIZE(AO) FMADD f7, f19, f23, f7 LFD f19, 11 * SIZE(AO) FMADD f0, f16, f20, f0 LFD f23, 7 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFDU f20, 8 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f16, 12 * SIZE(AO) FMADD f5, f17, f21, f5 LFD f17, 13 * SIZE(AO) FMADD f6, f18, f21, f6 LFD f18, 14 * SIZE(AO) FMADD f7, f19, f21, f7 LFD f19, 15 * SIZE(AO) FMADD f0, f16, f22, f0 LFD f21, 1 * SIZE(BO) FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f22, 2 * SIZE(BO) FMADD f4, f16, f23, f4 LFDU f16, 16 * SIZE(AO) FMADD f5, f17, f23, f5 LFD f17, 1 * SIZE(AO) FMADD f6, f18, f23, f6 LFD f18, 2 * SIZE(AO) FMADD f7, f19, f23, f7 LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L43 .align 4 .L45: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L48 .align 4 .L46: FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFDU f20, 2 * SIZE(BO) FMADD f4, f16, f21, f4 LFDU f16, 4 * SIZE(AO) FMADD f5, f17, f21, f5 LFD f17, 1 * SIZE(AO) FMADD f6, f18, f21, f6 LFD f18, 2 * SIZE(AO) FMADD f7, f19, f21, f7 LFD f19, 3 * SIZE(AO) LFD f21, 1 * SIZE(BO) bdnz .L46 .align 4 .L48: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 2 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f28, 6 * SIZE(BO) LFD f29, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f3, f28, f3 FSUB f7, f29, f7 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FMUL f1, f19, f1 FMUL f5, f19, f5 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FMUL f0, f21, f0 FMUL f4, f21, f4 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, f5 FNMSUB f2, f18, f1, f2 FNMSUB f6, f18, f5, f6 FNMSUB f3, f19, f1, f3 FNMSUB f7, f19, f5, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FMUL f6, f18, f6 FNMSUB f3, f19, f2, f3 FNMSUB f7, f19, f6, f7 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 FMUL f7, f19, f7 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FMUL f4, f18, f4 FMUL f5, f18, f5 FMUL f6, f18, f6 FMUL f7, f18, f7 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f5, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f6, 5 * SIZE(BO) STFD f3, 6 * SIZE(BO) STFD f7, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 
4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. I, I, -1 bgt+ .L42 .align 4 .L69: #ifdef LN slwi r0, K, 1 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif lfs f0, FZERO .align 4 .L70: andi. J, N, 1 ble .L999 #ifdef RT slwi r0, K, 0 + BASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO1, LDC #endif .align 4 .L90: andi. I, M, 1 ble .L80 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 3 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 3 mtspr CTR, r0 #endif ble .L95 .align 5 .L92: FMADD f0, f16, f20, f0 LFD f16, 4 * SIZE(AO) LFD f20, 4 * SIZE(BO) FMADD f1, f17, f21, f1 LFD f17, 5 * SIZE(AO) LFD f21, 5 * SIZE(BO) FMADD f2, f18, f22, f2 LFD f18, 6 * SIZE(AO) LFD f22, 6 * SIZE(BO) FMADD f3, f19, f23, f3 LFD f19, 7 * SIZE(AO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 LFDU f16, 8 * SIZE(AO) LFDU f20, 8 * SIZE(BO) FMADD f1, f17, f21, f1 LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) FMADD f2, f18, f22, f2 LFD f18, 2 * SIZE(AO) LFD f22, 2 * SIZE(BO) FMADD f3, f19, f23, f3 LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L92 .align 4 .L95: #if defined(LT) || defined(RN) andi. r0, KK, 7 #else andi. 
r0, TEMP, 7 #endif mtspr CTR, r0 ble+ .L98 .align 4 .L96: FMADD f0, f16, f20, f0 LFDU f16, 1 * SIZE(AO) LFDU f20, 1 * SIZE(BO) bdnz .L96 .align 4 .L98: FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 1 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) FSUB f0, f16, f0 #else LFD f16, 0 * SIZE(AO) FSUB f0, f16, f0 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 .L80: andi. I, M, 2 ble .L71 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L85 .align 5 .L82: FMADD f0, f16, f20, f0 LFD f16, 4 * SIZE(AO) FMADD f1, f17, f20, f1 LFDU f20, 4 * SIZE(BO) LFD f17, 5 * SIZE(AO) FMADD f2, f18, f21, f2 LFD f18, 6 * SIZE(AO) FMADD f3, f19, f21, f3 LFD f21, 1 * SIZE(BO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 LFDU f16, 8 * SIZE(AO) FMADD f1, f17, f22, f1 LFD f22, 2 * SIZE(BO) LFD f17, 1 * SIZE(AO) FMADD f2, f18, f23, f2 LFD f18, 2 * SIZE(AO) FMADD f3, f19, f23, f3 LFD f23, 3 * SIZE(BO) LFD f19, 3 * SIZE(AO) bdnz .L82 .align 4 .L85: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L88 .align 4 .L86: FMADD f0, f16, f20, f0 LFDU f16, 2 * SIZE(AO) FMADD f1, f17, f20, f1 LFDU f20, 1 * SIZE(BO) LFD f17, 1 * SIZE(AO) bdnz .L86 .align 4 .L88: FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 .L71: srawi. I, M, 2 ble .L999 .align 4 .L72: #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L75 .align 5 .L73: FMADD f0, f16, f20, f0 LFD f16, 4 * SIZE(AO) FMADD f1, f17, f20, f1 LFD f17, 5 * SIZE(AO) FMADD f2, f18, f20, f2 LFD f18, 6 * SIZE(AO) FMADD f3, f19, f20, f3 LFD f19, 7 * SIZE(AO) LFDU f20, 4 * SIZE(BO) FMADD f0, f16, f21, f0 LFD f16, 8 * SIZE(AO) FMADD f1, f17, f21, f1 LFD f17, 9 * SIZE(AO) FMADD f2, f18, f21, f2 LFD f18, 10 * SIZE(AO) FMADD f3, f19, f21, f3 LFD f19, 11 * SIZE(AO) LFD f21, 1 * SIZE(BO) FMADD f0, f16, f22, f0 LFD f16, 12 * SIZE(AO) FMADD f1, f17, f22, f1 LFD f17, 13 * SIZE(AO) FMADD f2, f18, f22, f2 LFD f18, 14 * SIZE(AO) FMADD f3, f19, f22, f3 LFD f19, 15 * SIZE(AO) LFD f22, 2 * SIZE(BO) FMADD f0, f16, f23, f0 LFDU f16, 16 * SIZE(AO) FMADD f1, f17, f23, f1 LFD f17, 1 * SIZE(AO) FMADD f2, f18, f23, f2 LFD f18, 2 * SIZE(AO) FMADD f3, f19, f23, f3 LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L73 .align 4 .L75: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L78 .align 4 .L76: FMADD f0, f16, f20, f0 LFDU f16, 4 * SIZE(AO) FMADD f1, f17, f20, f1 LFD f17, 1 * SIZE(AO) FMADD f2, f18, f20, f2 LFD f18, 2 * SIZE(AO) FMADD f3, f19, f20, f3 LFDU f20, 1 * SIZE(BO) LFD f19, 3 * SIZE(AO) bdnz .L76 .align 4 .L78: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 1 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) LFD f24, 2 * SIZE(BO) LFD f28, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 FSUB f2, f24, f2 FSUB f3, f28, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FNMSUB f2, f17, f3, f2 FNMSUB f1, f18, f3, f1 FNMSUB f0, f19, f3, f0 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FNMSUB f1, f17, f2, f1 FNMSUB f0, f18, f2, f0 FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FNMSUB f2, f18, f0, f2 FNMSUB f3, f19, f0, f3 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FNMSUB f2, f18, f1, f2 FNMSUB f3, f19, f1, f3 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FNMSUB f3, f19, f2, f3 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. 
I, I, -1 bgt+ .L72 .align 4 .L999: addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) lwz r18, 196(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/trsm_kernel_ppc440_LT.S000066400000000000000000001635421313527062700217350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define AORIG r18 #define TEMP r19 #define KK r20 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define CO3 r27 #define CO4 r28 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) stw r18, 196(SP) #endif stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #define A1 f16 #define A2 f17 #define A3 f18 #define A4 f19 #define A5 f20 #define A6 f21 #define B1 f22 #define B2 f23 #define B3 f24 #define B4 f25 #define B5 f26 #define B6 f27 #define B7 f28 #define B8 f29 #define B9 f30 #define B10 f31 #ifdef LN mullw r0, M, K slwi r0, r0, BASE_SHIFT add A, A, r0 slwi r0, M, BASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, BASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble .L999 cmpwi cr0, N, 0 ble .L999 cmpwi cr0, K, 0 ble .L999 lfs f0, FZERO srawi. J, N, 2 ble .L40 .align 4 .L10: #ifdef RT slwi r0, K, 2 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 2 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. 
I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO4, LDC #endif ble .L20 .align 4 .L11: #if defined(LT) || defined(RN) LFD A1, 0 * SIZE(AO) LFD A2, 1 * SIZE(AO) LFD A4, 4 * SIZE(AO) LFD A5, 8 * SIZE(AO) LFD B1, 0 * SIZE(B) LFD B2, 1 * SIZE(B) LFD B3, 2 * SIZE(B) LFD B4, 3 * SIZE(B) LFD B5, 4 * SIZE(B) LFD B6, 8 * SIZE(B) LFD B7, 12 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD A1, 0 * SIZE(AO) LFD A2, 1 * SIZE(AO) LFD A4, 4 * SIZE(AO) LFD A5, 8 * SIZE(AO) LFD B1, 0 * SIZE(BO) LFD B2, 1 * SIZE(BO) LFD B3, 2 * SIZE(BO) LFD B4, 3 * SIZE(BO) LFD B5, 4 * SIZE(BO) LFD B6, 8 * SIZE(BO) LFD B7, 12 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L15 .align 4 .L12: FMADD f0, A1, B1, f0 LFD A3, 2 * SIZE(AO) FMADD f4, A1, B2, f4 LFD A6, 12 * SIZE(AO) FMADD f8, A1, B3, f8 nop FMADD f12, A1, B4, f12 nop FMADD f1, A2, B1, f1 LFD A1, 3 * SIZE(AO) FMADD f5, A2, B2, f5 nop FMADD f9, A2, B3, f9 nop FMADD f13, A2, B4, f13 nop FMADD f2, A3, B1, f2 nop FMADD f6, A3, B2, f6 LFD B8, 5 * SIZE(BO) FMADD f10, A3, B3, f10 LFD B9, 6 * SIZE(BO) FMADD f14, A3, B4, f14 LFD B10, 7 * SIZE(BO) FMADD f3, A1, B1, f3 LFD A2, 5 * SIZE(AO) FMADD f7, A1, B2, f7 LFD B1, 16 * SIZE(BO) FMADD f11, A1, B3, f11 nop FMADD f15, A1, B4, f15 nop FMADD f0, A4, B5, f0 LFD A3, 6 * SIZE(AO) FMADD f4, A4, B8, f4 LFD A1, 16 * SIZE(AO) FMADD f8, A4, B9, f8 nop FMADD f12, A4, B10, f12 nop FMADD f1, A2, B5, f1 LFD A4, 7 * SIZE(AO) FMADD f5, A2, B8, f5 nop FMADD f9, A2, B9, f9 nop FMADD f13, A2, B10, f13 nop FMADD f2, A3, B5, f2 nop FMADD f6, A3, B8, f6 LFD B2, 9 * SIZE(BO) FMADD f10, A3, B9, f10 LFD B3, 10 * SIZE(BO) FMADD f14, A3, B10, f14 LFD B4, 11 * SIZE(BO) FMADD f3, A4, B5, f3 LFD A2, 9 * SIZE(AO) FMADD f7, A4, B8, f7 LFD B5, 20 * SIZE(BO) FMADD f11, A4, B9, f11 nop FMADD f15, A4, B10, f15 nop FMADD f0, A5, B6, f0 LFD A3, 10 * SIZE(AO) FMADD f4, A5, B2, f4 LFD A4, 20 * SIZE(AO) FMADD f8, A5, B3, f8 nop FMADD f12, A5, B4, f12 nop FMADD f1, A2, B6, f1 LFD A5, 11 * SIZE(AO) FMADD f5, A2, B2, f5 nop FMADD f9, A2, B3, f9 nop FMADD f13, A2, B4, f13 nop FMADD f2, A3, B6, f2 nop FMADD f6, A3, B2, f6 LFD B8, 13 * SIZE(BO) FMADD f10, A3, B3, f10 LFD B9, 14 * SIZE(BO) FMADD f14, A3, B4, f14 LFD B10,15 * SIZE(BO) FMADD f3, A5, B6, f3 LFD A2, 13 * SIZE(AO) FMADD f7, A5, B2, f7 LFD B6, 24 * SIZE(BO) FMADD f11, A5, B3, f11 nop FMADD f15, A5, B4, f15 nop FMADD f0, A6, B7, f0 LFD A3, 14 * SIZE(AO) FMADD f4, A6, B8, f4 LFD A5, 24 * SIZE(AO) FMADD f8, A6, B9, f8 nop FMADD f12, A6, B10, f12 nop FMADD f1, A2, B7, f1 LFD A6, 15 * SIZE(AO) FMADD f5, A2, B8, f5 nop FMADD f9, A2, B9, f9 nop FMADD f13, A2, B10, f13 nop FMADD f2, A3, B7, f2 addi AO, AO, 16 * SIZE FMADD f6, A3, B8, f6 LFD B2, 17 * SIZE(BO) FMADD f10, A3, B9, f10 LFD B3, 18 * SIZE(BO) FMADD f14, A3, B10, f14 LFD B4, 19 * SIZE(BO) FMADD f3, A6, B7, f3 LFD A2, 1 * SIZE(AO) FMADD f7, A6, B8, f7 LFD B7, 28 * SIZE(BO) FMADD f11, A6, B9, f11 addi BO, BO, 16 * SIZE FMADD f15, A6, B10, f15 bdnz .L12 .align 4 .L15: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L18 .align 4 .L16: FMADD f0, A1, B1, f0 LFD A3, 2 * SIZE(AO) FMADD f4, A1, B2, f4 FMADD f8, A1, B3, f8 FMADD f12, A1, B4, f12 LFD A4, 3 * SIZE(AO) FMADD f1, A2, B1, f1 FMADD f5, A2, B2, f5 FMADD f9, A2, B3, f9 FMADD f13, A2, B4, f13 LFDU A1, 4 * SIZE(AO) FMADD f2, A3, B1, f2 FMADD f6, A3, B2, f6 FMADD f10, A3, B3, f10 FMADD f14, A3, B4, f14 LFD A2, 1 * SIZE(AO) FMADD f3, A4, B1, f3 LFDU B1, 4 * SIZE(BO) FMADD f7, A4, B2, f7 LFD B2, 1 * SIZE(BO) FMADD f11, A4, B3, f11 LFD B3, 2 * SIZE(BO) FMADD f15, A4, B4, f15 LFD B4, 3 * SIZE(BO) bdnz .L16 .align 4 .L18: #if defined(LN) || defined(RT) subi r0, KK, 4 slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f24, 8 * SIZE(BO) LFD f25, 9 * SIZE(BO) LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f10, f26, f10 FSUB f14, f27, f14 FSUB f3, f28, f3 FSUB f7, f29, f7 FSUB f11, f30, f11 FSUB f15, f31, f15 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f24, 8 * SIZE(AO) LFD f25, 9 * SIZE(AO) LFD f26, 10 * SIZE(AO) LFD f27, 11 * SIZE(AO) LFD f28, 12 * SIZE(AO) LFD f29, 13 * SIZE(AO) LFD f30, 14 * SIZE(AO) LFD f31, 15 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f10, f26, f10 FSUB f11, f27, f11 FSUB f12, f28, f12 FSUB f13, f29, f13 FSUB f14, f30, f14 FSUB f15, f31, f15 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FMUL f11, f16, f11 FMUL f15, f16, f15 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f10, f17, f11, f10 FNMSUB f14, f17, f15, f14 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f9, f18, f11, f9 FNMSUB f13, f18, f15, f13 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 FNMSUB f8, f19, f11, f8 FNMSUB f12, f19, f15, f12 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FMUL f10, f16, f10 FMUL f14, f16, f14 LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f9, f17, f10, f9 FNMSUB f13, f17, f14, f13 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FNMSUB f8, f18, f10, f8 FNMSUB f12, f18, f14, f12 FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB f10, f18, f8, f10 FNMSUB f14, f18, f12, f14 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 
FNMSUB f11, f19, f8, f11 FNMSUB f15, f19, f12, f15 LFD f16, 5 * SIZE(AO) LFD f17, 6 * SIZE(AO) LFD f18, 7 * SIZE(AO) LFD f19, 10 * SIZE(AO) FMUL f1, f16, f1 FMUL f5, f16, f5 FMUL f9, f16, f9 FMUL f13, f16, f13 LFD f20, 11 * SIZE(AO) LFD f21, 15 * SIZE(AO) FNMSUB f2, f17, f1, f2 FNMSUB f6, f17, f5, f6 FNMSUB f10, f17, f9, f10 FNMSUB f14, f17, f13, f14 FNMSUB f3, f18, f1, f3 FNMSUB f7, f18, f5, f7 FNMSUB f11, f18, f9, f11 FNMSUB f15, f18, f13, f15 FMUL f2, f19, f2 FMUL f6, f19, f6 FMUL f10, f19, f10 FMUL f14, f19, f14 FNMSUB f3, f20, f2, f3 FNMSUB f7, f20, f6, f7 FNMSUB f11, f20, f10, f11 FNMSUB f15, f20, f14, f15 FMUL f3, f21, f3 FMUL f7, f21, f7 FMUL f11, f21, f11 FMUL f15, f21, f15 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 FNMSUB f14, f19, f2, f14 FNMSUB f15, f19, f3, f15 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FMUL f6, f16, f6 FMUL f7, f16, f7 LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f10, f17, f6, f10 FNMSUB f11, f17, f7, f11 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FNMSUB f14, f18, f6, f14 FNMSUB f15, f18, f7, f15 FMUL f8, f19, f8 FMUL f9, f19, f9 FMUL f10, f19, f10 FMUL f11, f19, f11 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FNMSUB f14, f20, f10, f14 FNMSUB f15, f20, f11, f15 FMUL f12, f21, f12 FMUL f13, f21, f13 FMUL f14, f21, f14 FMUL f15, f21, f15 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FMUL f14, f16, f14 FMUL f15, f16, f15 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f10, f17, f14, f10 FNMSUB f11, f17, f15, f11 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f6, f18, f14, f6 FNMSUB f7, f18, f15, f7 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 FNMSUB f2, f19, f14, f2 FNMSUB f3, f19, f15, f3 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FMUL f10, f16, f10 FMUL f11, f16, f11 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f6, f17, f10, f6 FNMSUB f7, f17, f11, f7 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE subi CO3, CO3, 4 * SIZE subi CO4, CO4, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) STFD f2, 8 * SIZE(BO) STFD f6, 9 * SIZE(BO) STFD f10, 10 * SIZE(BO) STFD f14, 11 * SIZE(BO) STFD f3, 12 * SIZE(BO) STFD f7, 13 * SIZE(BO) STFD f11, 14 * SIZE(BO) STFD f15, 15 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) 
STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) STFD f8, 8 * SIZE(AO) STFD f9, 9 * SIZE(AO) STFD f10, 10 * SIZE(AO) STFD f11, 11 * SIZE(AO) STFD f12, 12 * SIZE(AO) STFD f13, 13 * SIZE(AO) STFD f14, 14 * SIZE(AO) STFD f15, 15 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f10, 2 * SIZE(CO3) STFD f11, 3 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) STFD f14, 2 * SIZE(CO4) STFD f15, 3 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif addic. I, I, -1 bgt+ .L11 .align 4 .L20: andi. I, M, 2 ble .L30 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble .L25 .align 5 .L22: FMADD f0, f16, f20, f0 nop FMADD f1, f17, f20, f1 LFD f20, 8 * SIZE(BO) FMADD f4, f16, f21, f4 nop FMADD f5, f17, f21, f5 LFD f21, 9 * SIZE(BO) FMADD f8, f16, f22, f8 nop FMADD f9, f17, f22, f9 LFD f22, 10 * SIZE(BO) FMADD f12, f16, f23, f12 LFD f16, 4 * SIZE(AO) FMADD f13, f17, f23, f13 LFD f23, 11 * SIZE(BO) FMADD f2, f18, f24, f2 LFD f17, 5 * SIZE(AO) FMADD f3, f19, f24, f3 LFD f24, 12 * SIZE(BO) FMADD f6, f18, f25, f6 nop FMADD f7, f19, f25, f7 LFD f25, 13 * SIZE(BO) FMADD f10, f18, f26, f10 nop FMADD f11, f19, f26, f11 LFD f26, 14 * SIZE(BO) FMADD f14, f18, f27, f14 LFD f18, 6 * SIZE(AO) FMADD f15, f19, f27, f15 LFD f27, 15 * SIZE(BO) FMADD f0, f16, f20, f0 LFD f19, 7 * SIZE(AO) FMADD f1, f17, f20, f1 LFDU f20, 16 * SIZE(BO) FMADD f4, f16, f21, f4 nop FMADD f5, f17, f21, f5 LFD f21, 1 * SIZE(BO) FMADD f8, f16, f22, f8 nop FMADD f9, f17, f22, f9 LFD f22, 2 * SIZE(BO) FMADD f12, f16, f23, f12 LFDU f16, 8 * SIZE(AO) FMADD f13, f17, f23, f13 LFD f23, 3 * SIZE(BO) FMADD f2, f18, f24, f2 LFD f17, 1 * SIZE(AO) FMADD f3, f19, f24, f3 LFD f24, 4 * SIZE(BO) FMADD f6, f18, f25, f6 nop FMADD f7, f19, f25, f7 LFD f25, 5 * SIZE(BO) FMADD f10, f18, f26, f10 nop FMADD f11, f19, f26, f11 LFD f26, 6 * SIZE(BO) FMADD f14, f18, f27, f14 LFD f18, 2 * SIZE(AO) FMADD f15, f19, f27, f15 LFD f19, 3 * SIZE(AO) LFD f27, 7 * SIZE(BO) bdnz .L22 fadd f0, f2, f0 fadd f1, f3, f1 fadd f4, f6, f4 fadd f5, f7, f5 fadd f8, f10, f8 fadd f9, f11, f9 fadd f12, f14, f12 fadd f13, f15, f13 .align 4 .L25: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L28 .align 4 .L26: FMADD f0, f16, f20, f0 nop FMADD f1, f17, f20, f1 LFDU f20, 4 * SIZE(BO) FMADD f4, f16, f21, f4 nop FMADD f5, f17, f21, f5 LFD f21, 1 * SIZE(BO) FMADD f8, f16, f22, f8 nop FMADD f9, f17, f22, f9 LFD f22, 2 * SIZE(BO) FMADD f12, f16, f23, f12 LFDU f16, 2 * SIZE(AO) FMADD f13, f17, f23, f13 LFD f17, 1 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L26 .align 4 .L28: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 4 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f28, 6 * SIZE(AO) LFD f29, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f12, f28, f12 FSUB f13, f29, f13 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, f5 FMUL f9, f17, f9 FMUL f13, f17, f13 #endif #ifdef RN LFD f16, 0 * SIZE(BO) 
LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FMUL f8, f19, f8 FMUL f9, f19, f9 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FMUL f12, f21, f12 FMUL f13, f21, f13 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FMUL f4, f19, f4 FMUL f5, f19, f5 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE subi CO3, CO3, 2 * SIZE subi CO4, CO4, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f4, 2 * SIZE(AO) STFD f5, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f12, 6 * SIZE(AO) STFD f13, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 .L30: andi. I, M, 1 ble .L39 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble .L35 .align 5 .L32: FMADD f0, f16, f20, f0 LFD f20, 8 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f21, 9 * SIZE(BO) FMADD f8, f16, f22, f8 LFD f22, 10 * SIZE(BO) FMADD f12, f16, f23, f12 LFD f23, 11 * SIZE(BO) LFDU f16, 4 * SIZE(AO) FMADD f1, f17, f24, f1 LFD f24, 12 * SIZE(BO) FMADD f5, f17, f25, f5 LFD f25, 13 * SIZE(BO) FMADD f9, f17, f26, f9 LFD f26, 14 * SIZE(BO) FMADD f13, f17, f27, f13 LFD f27, 15 * SIZE(BO) LFD f17, 1 * SIZE(AO) FMADD f0, f18, f20, f0 LFDU f20, 16 * SIZE(BO) FMADD f4, f18, f21, f4 LFD f21, 1 * SIZE(BO) FMADD f8, f18, f22, f8 LFD f22, 2 * SIZE(BO) FMADD f12, f18, f23, f12 LFD f23, 3 * SIZE(BO) LFD f18, 2 * SIZE(AO) FMADD f1, f19, f24, f1 LFD f24, 4 * SIZE(BO) FMADD f5, f19, f25, f5 LFD f25, 5 * SIZE(BO) FMADD f9, f19, f26, f9 LFD f26, 6 * SIZE(BO) FMADD f13, f19, f27, f13 LFD f27, 7 * SIZE(BO) LFD f19, 3 * SIZE(AO) bdnz .L32 fadd f0, f1, f0 fadd f4, f5, f4 fadd f8, f9, f8 fadd f12, f13, f12 .align 4 .L35: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L38 .align 4 .L36: FMADD f0, f16, f20, f0 LFDU f20, 4 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f21, 1 * SIZE(BO) FMADD f8, f16, f22, f8 LFD f22, 2 * SIZE(BO) FMADD f12, f16, f23, f12 LFDU f16, 1 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L36 .align 4 .L38: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 4 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) LFD f24, 2 * SIZE(AO) LFD f28, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f4, f20, f4 FSUB f8, f24, f8 FSUB f12, f28, f12 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f4, f17, f0, f4 FNMSUB f8, f18, f0, f8 FNMSUB f12, f19, f0, f12 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FNMSUB f8, f17, f4, f8 FNMSUB f12, f18, f4, f12 FMUL f8, f19, f8 FNMSUB f12, f20, f8, f12 FMUL f12, f21, f12 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FNMSUB f8, f17, f12, f8 FNMSUB f4, f18, f12, f4 FNMSUB f0, f19, f12, f0 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f0, f18, f8, f0 FMUL f4, f19, f4 FNMSUB f0, f20, f4, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE subi CO3, CO3, 1 * SIZE subi CO4, CO4, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f4, 1 * SIZE(AO) STFD f8, 2 * SIZE(AO) STFD f12, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 fmr f8, f0 fmr f9, f0 fmr f12, f0 fmr f13, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi 
CO2, CO2, 1 * SIZE addi CO3, CO3, 1 * SIZE addi CO4, CO4, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 .L39: #ifdef LN slwi r0, K, 2 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 4 #endif #ifdef RT subi KK, KK, 4 #endif addic. J, J, -1 lfs f0, FZERO bgt .L10 .align 4 .L40: andi. J, N, 2 ble .L70 #ifdef RT slwi r0, K, 1 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif ble .L50 .align 4 .L41: #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L45 .align 5 .L42: FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f20, 4 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f16, 4 * SIZE(AO) FMADD f5, f17, f21, f5 LFD f17, 5 * SIZE(AO) FMADD f6, f18, f21, f6 LFD f18, 6 * SIZE(AO) FMADD f7, f19, f21, f7 LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 LFD f21, 5 * SIZE(BO) FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f22, 6 * SIZE(BO) FMADD f4, f16, f23, f4 LFD f16, 8 * SIZE(AO) FMADD f5, f17, f23, f5 LFD f17, 9 * SIZE(AO) FMADD f6, f18, f23, f6 LFD f18, 10 * SIZE(AO) FMADD f7, f19, f23, f7 LFD f19, 11 * SIZE(AO) FMADD f0, f16, f20, f0 LFD f23, 7 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFDU f20, 8 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f16, 12 * SIZE(AO) FMADD f5, f17, f21, f5 LFD f17, 13 * SIZE(AO) FMADD f6, f18, f21, f6 LFD f18, 14 * SIZE(AO) FMADD f7, f19, f21, f7 LFD f19, 15 * SIZE(AO) FMADD f0, f16, f22, f0 LFD f21, 1 * SIZE(BO) FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f22, 2 * SIZE(BO) FMADD f4, f16, f23, f4 LFDU f16, 16 * SIZE(AO) FMADD f5, f17, f23, f5 LFD f17, 1 * SIZE(AO) FMADD f6, f18, f23, f6 LFD f18, 2 * SIZE(AO) FMADD f7, f19, f23, f7 LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L42 .align 4 .L45: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L48 .align 4 .L46: FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFDU f20, 2 * SIZE(BO) FMADD f4, f16, f21, f4 LFDU f16, 4 * SIZE(AO) FMADD f5, f17, f21, f5 LFD f17, 1 * SIZE(AO) FMADD f6, f18, f21, f6 LFD f18, 2 * SIZE(AO) FMADD f7, f19, f21, f7 LFD f19, 3 * SIZE(AO) LFD f21, 1 * SIZE(BO) bdnz .L46 .align 4 .L48: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 2 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f28, 6 * SIZE(BO) LFD f29, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f3, f28, f3 FSUB f7, f29, f7 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FMUL f1, f19, f1 FMUL f5, f19, f5 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FMUL f0, f21, f0 FMUL f4, f21, f4 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, f5 FNMSUB f2, f18, f1, f2 FNMSUB f6, f18, f5, f6 FNMSUB f3, f19, f1, f3 FNMSUB f7, f19, f5, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FMUL f6, f18, f6 FNMSUB f3, f19, f2, f3 FNMSUB f7, f19, f6, f7 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 FMUL f7, f19, f7 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FMUL f4, f18, f4 FMUL f5, f18, f5 FMUL f6, f18, f6 FMUL f7, f18, f7 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f5, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f6, 5 * SIZE(BO) STFD f3, 6 * SIZE(BO) STFD f7, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 
4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. I, I, -1 bgt+ .L41 .align 4 .L50: andi. I, M, 2 ble .L60 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L55 .align 5 .L52: FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFDU f20, 8 * SIZE(BO) FMADD f2, f16, f21, f2 LFD f16, 4 * SIZE(AO) FMADD f3, f17, f21, f3 LFD f17, 5 * SIZE(AO) FMADD f4, f18, f22, f4 LFD f21, 1 * SIZE(BO) FMADD f5, f19, f22, f5 LFD f22, 2 * SIZE(BO) FMADD f6, f18, f23, f6 LFD f18, 6 * SIZE(AO) FMADD f7, f19, f23, f7 LFD f19, 7 * SIZE(AO) FMADD f0, f16, f24, f0 LFD f23, 3 * SIZE(BO) FMADD f1, f17, f24, f1 LFD f24, 4 * SIZE(BO) FMADD f2, f16, f25, f2 LFDU f16, 8 * SIZE(AO) FMADD f3, f17, f25, f3 LFD f17, 1 * SIZE(AO) FMADD f4, f18, f26, f4 LFD f25, 5 * SIZE(BO) FMADD f5, f19, f26, f5 LFD f26, 6 * SIZE(BO) FMADD f6, f18, f27, f6 LFD f18, 2 * SIZE(AO) FMADD f7, f19, f27, f7 LFD f19, 3 * SIZE(AO) LFD f27, 7 * SIZE(BO) bdnz .L52 .align 4 .L55: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L58 .align 4 .L56: FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFDU f20, 2 * SIZE(BO) FMADD f2, f16, f21, f2 LFDU f16, 2 * SIZE(AO) FMADD f3, f17, f21, f3 LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) bdnz .L56 .align 4 .L58: FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 2 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f2, f17, f2 FSUB f1, f20, f1 FSUB f3, f21, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f3, f19, f3 FNMSUB f0, f20, f1, f0 FNMSUB f2, f20, f3, f2 FMUL f0, f21, f0 FMUL f2, f21, f2 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f2, f16, f2 FNMSUB f1, f17, f0, f1 FNMSUB f3, f17, f2, f3 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f3, f17, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f2, f17, f0, f2 FNMSUB f3, f17, f1, f3 FMUL f2, f18, f2 FMUL f3, f18, f3 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f2, f19, f2 FMUL f3, f19, f3 FNMSUB f0, f20, f2, f0 FNMSUB f1, f20, f3, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f2, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 .L60: andi. I, M, 1 ble .L69 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble .L65 .align 5 .L62: FMADD f0, f16, f20, f0 LFDU f20, 8 * SIZE(BO) FMADD f1, f16, f21, f1 LFDU f16, 4 * SIZE(AO) LFD f21, 1 * SIZE(BO) FMADD f2, f17, f22, f2 LFD f22, 2 * SIZE(BO) FMADD f3, f17, f23, f3 LFD f17, 1 * SIZE(AO) LFD f23, 3 * SIZE(BO) FMADD f0, f18, f24, f0 LFD f24, 4 * SIZE(BO) FMADD f1, f18, f25, f1 LFD f18, 2 * SIZE(AO) LFD f25, 5 * SIZE(BO) FMADD f2, f19, f26, f2 LFD f26, 6 * SIZE(BO) FMADD f3, f19, f27, f3 LFD f19, 3 * SIZE(AO) LFD f27, 7 * SIZE(BO) bdnz .L62 .align 4 .L65: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L68 .align 4 .L66: FMADD f0, f16, f20, f0 LFDU f20, 2 * SIZE(BO) FMADD f1, f16, f21, f1 LFDU f16, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) bdnz .L66 .align 4 .L68: FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f20, f1 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FMUL f1, f18, f1 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 .L69: #ifdef LN slwi r0, K, 1 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif lfs f0, FZERO .align 4 .L70: andi. J, N, 1 ble .L999 #ifdef RT slwi r0, K, 0 + BASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO1, LDC #endif ble .L80 .align 4 .L71: #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble .L75 .align 5 .L72: FMADD f0, f16, f20, f0 LFD f16, 4 * SIZE(AO) FMADD f1, f17, f20, f1 LFD f17, 5 * SIZE(AO) FMADD f2, f18, f20, f2 LFD f18, 6 * SIZE(AO) FMADD f3, f19, f20, f3 LFD f19, 7 * SIZE(AO) LFDU f20, 4 * SIZE(BO) FMADD f0, f16, f21, f0 LFD f16, 8 * SIZE(AO) FMADD f1, f17, f21, f1 LFD f17, 9 * SIZE(AO) FMADD f2, f18, f21, f2 LFD f18, 10 * SIZE(AO) FMADD f3, f19, f21, f3 LFD f19, 11 * SIZE(AO) LFD f21, 1 * SIZE(BO) FMADD f0, f16, f22, f0 LFD f16, 12 * SIZE(AO) FMADD f1, f17, f22, f1 LFD f17, 13 * SIZE(AO) FMADD f2, f18, f22, f2 LFD f18, 14 * SIZE(AO) FMADD f3, f19, f22, f3 LFD f19, 15 * SIZE(AO) LFD f22, 2 * SIZE(BO) FMADD f0, f16, f23, f0 LFDU f16, 16 * SIZE(AO) FMADD f1, f17, f23, f1 LFD f17, 1 * SIZE(AO) FMADD f2, f18, f23, f2 LFD f18, 2 * SIZE(AO) FMADD f3, f19, f23, f3 LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L72 .align 4 .L75: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L78 .align 4 .L76: FMADD f0, f16, f20, f0 LFDU f16, 4 * SIZE(AO) FMADD f1, f17, f20, f1 LFD f17, 1 * SIZE(AO) FMADD f2, f18, f20, f2 LFD f18, 2 * SIZE(AO) FMADD f3, f19, f20, f3 LFDU f20, 1 * SIZE(BO) LFD f19, 3 * SIZE(AO) bdnz .L76 .align 4 .L78: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 1 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) LFD f24, 2 * SIZE(BO) LFD f28, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 FSUB f2, f24, f2 FSUB f3, f28, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FNMSUB f2, f17, f3, f2 FNMSUB f1, f18, f3, f1 FNMSUB f0, f19, f3, f0 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FNMSUB f1, f17, f2, f1 FNMSUB f0, f18, f2, f0 FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FNMSUB f2, f18, f0, f2 FNMSUB f3, f19, f0, f3 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FNMSUB f2, f18, f1, f2 FNMSUB f3, f19, f1, f3 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FNMSUB f3, f19, f2, f3 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef 
LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. I, I, -1 bgt+ .L71 .align 4 .L80: andi. I, M, 2 ble .L90 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L85 .align 5 .L82: FMADD f0, f16, f20, f0 LFD f16, 4 * SIZE(AO) FMADD f1, f17, f20, f1 LFDU f20, 4 * SIZE(BO) LFD f17, 5 * SIZE(AO) FMADD f2, f18, f21, f2 LFD f18, 6 * SIZE(AO) FMADD f3, f19, f21, f3 LFD f21, 1 * SIZE(BO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 LFDU f16, 8 * SIZE(AO) FMADD f1, f17, f22, f1 LFD f22, 2 * SIZE(BO) LFD f17, 1 * SIZE(AO) FMADD f2, f18, f23, f2 LFD f18, 2 * SIZE(AO) FMADD f3, f19, f23, f3 LFD f23, 3 * SIZE(BO) LFD f19, 3 * SIZE(AO) bdnz .L82 .align 4 .L85: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L88 .align 4 .L86: FMADD f0, f16, f20, f0 LFDU f16, 2 * SIZE(AO) FMADD f1, f17, f20, f1 LFDU f20, 1 * SIZE(BO) LFD f17, 1 * SIZE(AO) bdnz .L86 .align 4 .L88: FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 .L90: andi. I, M, 1 ble .L999 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. 
r0, KK, 3 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 3 mtspr CTR, r0 #endif ble .L95 .align 5 .L92: FMADD f0, f16, f20, f0 LFD f16, 4 * SIZE(AO) LFD f20, 4 * SIZE(BO) FMADD f1, f17, f21, f1 LFD f17, 5 * SIZE(AO) LFD f21, 5 * SIZE(BO) FMADD f2, f18, f22, f2 LFD f18, 6 * SIZE(AO) LFD f22, 6 * SIZE(BO) FMADD f3, f19, f23, f3 LFD f19, 7 * SIZE(AO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 LFDU f16, 8 * SIZE(AO) LFDU f20, 8 * SIZE(BO) FMADD f1, f17, f21, f1 LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) FMADD f2, f18, f22, f2 LFD f18, 2 * SIZE(AO) LFD f22, 2 * SIZE(BO) FMADD f3, f19, f23, f3 LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L92 .align 4 .L95: #if defined(LT) || defined(RN) andi. r0, KK, 7 #else andi. r0, TEMP, 7 #endif mtspr CTR, r0 ble+ .L98 .align 4 .L96: FMADD f0, f16, f20, f0 LFDU f16, 1 * SIZE(AO) LFDU f20, 1 * SIZE(BO) bdnz .L96 .align 4 .L98: FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 1 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) FSUB f0, f16, f0 #else LFD f16, 0 * SIZE(AO) FSUB f0, f16, f0 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) #ifndef LN addi CO1, CO1, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 .L999: addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) lwz r18, 196(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/trsm_kernel_ppc440_RT.S000066400000000000000000001640401313527062700217350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA 296(SP) #define FZERO 304(SP) #else #define STACKSIZE 240 #define ALPHA 224(SP) #define FZERO 232(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r8 #define B r9 #define C r10 #define LDC r7 #define OFFSET r6 #else #define A r7 #define B r8 #define C r9 #define LDC r10 #define OFFSET r6 #endif #endif #define AORIG r18 #define TEMP r19 #define KK r20 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define CO3 r27 #define CO4 r28 #define A1 f16 #define A2 f17 #define A3 f18 #define A4 f19 #define A5 f20 #define A6 f21 #define B1 f22 #define B2 f23 #define B3 f24 #define B4 f25 #define B5 f26 #define B6 f27 #define B7 f28 #define B8 f29 #define B9 f30 #define B10 f31 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) #else stw r31, 144(SP) stw r30, 
148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) stw r18, 196(SP) #endif stw r0, FZERO #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif slwi LDC, LDC, BASE_SHIFT #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef LN mullw r0, M, K slwi r0, r0, BASE_SHIFT add A, A, r0 slwi r0, M, BASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, BASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble .L999 cmpwi cr0, N, 0 ble .L999 cmpwi cr0, K, 0 ble .L999 lfs f0, FZERO .L70: andi. J, N, 1 ble .L40 #ifdef RT slwi r0, K, 0 + BASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO1, LDC #endif ble .L80 .align 4 .L71: #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L75 .align 5 .L72: FMADD f0, f16, f20, f0 LFD f16, 4 * SIZE(AO) FMADD f1, f17, f20, f1 LFD f17, 5 * SIZE(AO) FMADD f2, f18, f20, f2 LFD f18, 6 * SIZE(AO) FMADD f3, f19, f20, f3 LFD f19, 7 * SIZE(AO) LFDU f20, 4 * SIZE(BO) FMADD f0, f16, f21, f0 LFD f16, 8 * SIZE(AO) FMADD f1, f17, f21, f1 LFD f17, 9 * SIZE(AO) FMADD f2, f18, f21, f2 LFD f18, 10 * SIZE(AO) FMADD f3, f19, f21, f3 LFD f19, 11 * SIZE(AO) LFD f21, 1 * SIZE(BO) FMADD f0, f16, f22, f0 LFD f16, 12 * SIZE(AO) FMADD f1, f17, f22, f1 LFD f17, 13 * SIZE(AO) FMADD f2, f18, f22, f2 LFD f18, 14 * SIZE(AO) FMADD f3, f19, f22, f3 LFD f19, 15 * SIZE(AO) LFD f22, 2 * SIZE(BO) FMADD f0, f16, f23, f0 LFDU f16, 16 * SIZE(AO) FMADD f1, f17, f23, f1 LFD f17, 1 * SIZE(AO) FMADD f2, f18, f23, f2 LFD f18, 2 * SIZE(AO) FMADD f3, f19, f23, f3 LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L72 .align 4 .L75: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L78 .align 4 .L76: FMADD f0, f16, f20, f0 LFDU f16, 4 * SIZE(AO) FMADD f1, f17, f20, f1 LFD f17, 1 * SIZE(AO) FMADD f2, f18, f20, f2 LFD f18, 2 * SIZE(AO) FMADD f3, f19, f20, f3 LFDU f20, 1 * SIZE(BO) LFD f19, 3 * SIZE(AO) bdnz .L76 .align 4 .L78: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 1 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) LFD f24, 2 * SIZE(BO) LFD f28, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 FSUB f2, f24, f2 FSUB f3, f28, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FNMSUB f2, f17, f3, f2 FNMSUB f1, f18, f3, f1 FNMSUB f0, f19, f3, f0 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FNMSUB f1, f17, f2, f1 FNMSUB f0, f18, f2, f0 FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FNMSUB f2, f18, f0, f2 FNMSUB f3, f19, f0, f3 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FNMSUB f2, f18, f1, f2 FNMSUB f3, f19, f1, f3 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FNMSUB f3, f19, f2, f3 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. I, I, -1 bgt+ .L71 .align 4 .L80: andi. I, M, 2 ble .L90 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble .L85 .align 5 .L82: FMADD f0, f16, f20, f0 LFD f16, 4 * SIZE(AO) FMADD f1, f17, f20, f1 LFDU f20, 4 * SIZE(BO) LFD f17, 5 * SIZE(AO) FMADD f2, f18, f21, f2 LFD f18, 6 * SIZE(AO) FMADD f3, f19, f21, f3 LFD f21, 1 * SIZE(BO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 LFDU f16, 8 * SIZE(AO) FMADD f1, f17, f22, f1 LFD f22, 2 * SIZE(BO) LFD f17, 1 * SIZE(AO) FMADD f2, f18, f23, f2 LFD f18, 2 * SIZE(AO) FMADD f3, f19, f23, f3 LFD f23, 3 * SIZE(BO) LFD f19, 3 * SIZE(AO) bdnz .L82 .align 4 .L85: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L88 .align 4 .L86: FMADD f0, f16, f20, f0 LFDU f16, 2 * SIZE(AO) FMADD f1, f17, f20, f1 LFDU f20, 1 * SIZE(BO) LFD f17, 1 * SIZE(AO) bdnz .L86 .align 4 .L88: FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f20, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f20, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 .L90: andi. I, M, 1 ble .L99 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 3 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 0 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. r0, TEMP, 3 mtspr CTR, r0 #endif ble .L95 .align 5 .L92: FMADD f0, f16, f20, f0 LFD f16, 4 * SIZE(AO) LFD f20, 4 * SIZE(BO) FMADD f1, f17, f21, f1 LFD f17, 5 * SIZE(AO) LFD f21, 5 * SIZE(BO) FMADD f2, f18, f22, f2 LFD f18, 6 * SIZE(AO) LFD f22, 6 * SIZE(BO) FMADD f3, f19, f23, f3 LFD f19, 7 * SIZE(AO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 LFDU f16, 8 * SIZE(AO) LFDU f20, 8 * SIZE(BO) FMADD f1, f17, f21, f1 LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) FMADD f2, f18, f22, f2 LFD f18, 2 * SIZE(AO) LFD f22, 2 * SIZE(BO) FMADD f3, f19, f23, f3 LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L92 .align 4 .L95: #if defined(LT) || defined(RN) andi. r0, KK, 7 #else andi. 
r0, TEMP, 7 #endif mtspr CTR, r0 ble+ .L98 .align 4 .L96: FMADD f0, f16, f20, f0 LFDU f16, 1 * SIZE(AO) LFDU f20, 1 * SIZE(BO) bdnz .L96 .align 4 .L98: FADD f0, f1, f0 FADD f2, f3, f2 FADD f0, f2, f0 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 1 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 0 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) FSUB f0, f16, f0 #else LFD f16, 0 * SIZE(AO) FSUB f0, f16, f0 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 #endif #ifdef RN LFD f16, 0 * SIZE(BO) FMUL f0, f16, f0 #endif #ifdef RT LFD f21, 0 * SIZE(BO) FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) lfs f0, FZERO #ifndef LN addi CO1, CO1, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 0 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 .L99: #ifdef LN slwi r0, K, 0 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 .L40: andi. J, N, 2 ble .L09 #ifdef RT slwi r0, K, 1 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif ble .L50 .align 4 .L41: #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 2 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble .L45 .align 5 .L42: FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFD f20, 4 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f16, 4 * SIZE(AO) FMADD f5, f17, f21, f5 LFD f17, 5 * SIZE(AO) FMADD f6, f18, f21, f6 LFD f18, 6 * SIZE(AO) FMADD f7, f19, f21, f7 LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 LFD f21, 5 * SIZE(BO) FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f22, 6 * SIZE(BO) FMADD f4, f16, f23, f4 LFD f16, 8 * SIZE(AO) FMADD f5, f17, f23, f5 LFD f17, 9 * SIZE(AO) FMADD f6, f18, f23, f6 LFD f18, 10 * SIZE(AO) FMADD f7, f19, f23, f7 LFD f19, 11 * SIZE(AO) FMADD f0, f16, f20, f0 LFD f23, 7 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFDU f20, 8 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f16, 12 * SIZE(AO) FMADD f5, f17, f21, f5 LFD f17, 13 * SIZE(AO) FMADD f6, f18, f21, f6 LFD f18, 14 * SIZE(AO) FMADD f7, f19, f21, f7 LFD f19, 15 * SIZE(AO) FMADD f0, f16, f22, f0 LFD f21, 1 * SIZE(BO) FMADD f1, f17, f22, f1 FMADD f2, f18, f22, f2 FMADD f3, f19, f22, f3 LFD f22, 2 * SIZE(BO) FMADD f4, f16, f23, f4 LFDU f16, 16 * SIZE(AO) FMADD f5, f17, f23, f5 LFD f17, 1 * SIZE(AO) FMADD f6, f18, f23, f6 LFD f18, 2 * SIZE(AO) FMADD f7, f19, f23, f7 LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L42 .align 4 .L45: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L48 .align 4 .L46: FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 LFDU f20, 2 * SIZE(BO) FMADD f4, f16, f21, f4 LFDU f16, 4 * SIZE(AO) FMADD f5, f17, f21, f5 LFD f17, 1 * SIZE(AO) FMADD f6, f18, f21, f6 LFD f18, 2 * SIZE(AO) FMADD f7, f19, f21, f7 LFD f19, 3 * SIZE(AO) LFD f21, 1 * SIZE(BO) bdnz .L46 .align 4 .L48: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 2 #endif slwi TEMP, r0, 2 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f28, 6 * SIZE(BO) LFD f29, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f3, f28, f3 FSUB f7, f29, f7 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FMUL f1, f19, f1 FMUL f5, f19, f5 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FMUL f0, f21, f0 FMUL f4, f21, f4 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB f3, 
f19, f0, f3 FNMSUB f7, f19, f4, f7 LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, f5 FNMSUB f2, f18, f1, f2 FNMSUB f6, f18, f5, f6 FNMSUB f3, f19, f1, f3 FNMSUB f7, f19, f5, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMUL f2, f18, f2 FMUL f6, f18, f6 FNMSUB f3, f19, f2, f3 FNMSUB f7, f19, f6, f7 LFD f19, 15 * SIZE(AO) FMUL f3, f19, f3 FMUL f7, f19, f7 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FMUL f4, f18, f4 FMUL f5, f18, f5 FMUL f6, f18, f6 FMUL f7, f18, f7 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f5, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f6, 5 * SIZE(BO) STFD f3, 6 * SIZE(BO) STFD f7, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 4 #endif #ifdef LT addi KK, KK, 4 #endif addic. I, I, -1 bgt+ .L41 .align 4 .L50: andi. I, M, 2 ble .L60 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble .L55 .align 5 .L52: FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFDU f20, 8 * SIZE(BO) FMADD f2, f16, f21, f2 LFD f16, 4 * SIZE(AO) FMADD f3, f17, f21, f3 LFD f17, 5 * SIZE(AO) FMADD f4, f18, f22, f4 LFD f21, 1 * SIZE(BO) FMADD f5, f19, f22, f5 LFD f22, 2 * SIZE(BO) FMADD f6, f18, f23, f6 LFD f18, 6 * SIZE(AO) FMADD f7, f19, f23, f7 LFD f19, 7 * SIZE(AO) FMADD f0, f16, f24, f0 LFD f23, 3 * SIZE(BO) FMADD f1, f17, f24, f1 LFD f24, 4 * SIZE(BO) FMADD f2, f16, f25, f2 LFDU f16, 8 * SIZE(AO) FMADD f3, f17, f25, f3 LFD f17, 1 * SIZE(AO) FMADD f4, f18, f26, f4 LFD f25, 5 * SIZE(BO) FMADD f5, f19, f26, f5 LFD f26, 6 * SIZE(BO) FMADD f6, f18, f27, f6 LFD f18, 2 * SIZE(AO) FMADD f7, f19, f27, f7 LFD f19, 3 * SIZE(AO) LFD f27, 7 * SIZE(BO) bdnz .L52 .align 4 .L55: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L58 .align 4 .L56: FMADD f0, f16, f20, f0 FMADD f1, f17, f20, f1 LFDU f20, 2 * SIZE(BO) FMADD f2, f16, f21, f2 LFDU f16, 2 * SIZE(AO) FMADD f3, f17, f21, f3 LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) bdnz .L56 .align 4 .L58: FADD f0, f4, f0 FADD f1, f5, f1 FADD f2, f6, f2 FADD f3, f7, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 2 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f2, f17, f2 FSUB f1, f20, f1 FSUB f3, f21, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f3, f19, f3 FNMSUB f0, f20, f1, f0 FNMSUB f2, f20, f3, f2 FMUL f0, f21, f0 FMUL f2, f21, f2 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f2, f16, f2 FNMSUB f1, f17, f0, f1 FNMSUB f3, f17, f2, f3 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f3, f17, f3 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f2, f17, f0, f2 FNMSUB f3, f17, f1, f3 FMUL f2, f18, f2 FMUL f3, f18, f3 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f2, f19, f2 FMUL f3, f19, f3 FNMSUB f0, f20, f2, f0 FNMSUB f1, f20, f3, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f2, 1 * SIZE(BO) STFD f1, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 .L60: andi. 
I, M, 1 ble .L69 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 1 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L65 .align 5 .L62: FMADD f0, f16, f20, f0 LFDU f20, 8 * SIZE(BO) FMADD f1, f16, f21, f1 LFDU f16, 4 * SIZE(AO) LFD f21, 1 * SIZE(BO) FMADD f2, f17, f22, f2 LFD f22, 2 * SIZE(BO) FMADD f3, f17, f23, f3 LFD f17, 1 * SIZE(AO) LFD f23, 3 * SIZE(BO) FMADD f0, f18, f24, f0 LFD f24, 4 * SIZE(BO) FMADD f1, f18, f25, f1 LFD f18, 2 * SIZE(AO) LFD f25, 5 * SIZE(BO) FMADD f2, f19, f26, f2 LFD f26, 6 * SIZE(BO) FMADD f3, f19, f27, f3 LFD f19, 3 * SIZE(AO) LFD f27, 7 * SIZE(BO) bdnz .L62 .align 4 .L65: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L68 .align 4 .L66: FMADD f0, f16, f20, f0 LFDU f20, 2 * SIZE(BO) FMADD f1, f16, f21, f1 LFDU f16, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) bdnz .L66 .align 4 .L68: FADD f0, f2, f0 FADD f1, f3, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 1 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f20, f1 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f1, f16, f1 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f1, f17, f0, f1 FMUL f1, f18, f1 #endif #ifdef RT LFD f19, 3 * SIZE(BO) LFD f20, 2 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f1, f19, f1 FNMSUB f0, f20, f1, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi CO2, CO2, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 1 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 .L69: #ifdef LN slwi r0, K, 1 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif lfs f0, FZERO .align 4 .L09: srawi. 
J, N, 2 ble .L999 .align 4 .L10: #ifdef RT slwi r0, K, 2 + BASE_SHIFT sub B, B, r0 slwi r0, LDC, 2 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO4, LDC #endif ble .L20 .align 4 .L11: #if defined(LT) || defined(RN) LFD A1, 0 * SIZE(AO) LFD A2, 1 * SIZE(AO) LFD A4, 4 * SIZE(AO) LFD A5, 8 * SIZE(AO) LFD B1, 0 * SIZE(B) LFD B2, 1 * SIZE(B) LFD B3, 2 * SIZE(B) LFD B4, 3 * SIZE(B) LFD B5, 4 * SIZE(B) LFD B6, 8 * SIZE(B) LFD B7, 12 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 2 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD A1, 0 * SIZE(AO) LFD A2, 1 * SIZE(AO) LFD A4, 4 * SIZE(AO) LFD A5, 8 * SIZE(AO) LFD B1, 0 * SIZE(BO) LFD B2, 1 * SIZE(BO) LFD B3, 2 * SIZE(BO) LFD B4, 3 * SIZE(BO) LFD B5, 4 * SIZE(BO) LFD B6, 8 * SIZE(BO) LFD B7, 12 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L15 .align 4 .L12: FMADD f0, A1, B1, f0 LFD A3, 2 * SIZE(AO) FMADD f4, A1, B2, f4 LFD A6, 12 * SIZE(AO) FMADD f8, A1, B3, f8 nop FMADD f12, A1, B4, f12 nop FMADD f1, A2, B1, f1 LFD A1, 3 * SIZE(AO) FMADD f5, A2, B2, f5 nop FMADD f9, A2, B3, f9 nop FMADD f13, A2, B4, f13 nop FMADD f2, A3, B1, f2 nop FMADD f6, A3, B2, f6 LFD B8, 5 * SIZE(BO) FMADD f10, A3, B3, f10 LFD B9, 6 * SIZE(BO) FMADD f14, A3, B4, f14 LFD B10, 7 * SIZE(BO) FMADD f3, A1, B1, f3 LFD A2, 5 * SIZE(AO) FMADD f7, A1, B2, f7 LFD B1, 16 * SIZE(BO) FMADD f11, A1, B3, f11 nop FMADD f15, A1, B4, f15 nop FMADD f0, A4, B5, f0 LFD A3, 6 * SIZE(AO) FMADD f4, A4, B8, f4 LFD A1, 16 * SIZE(AO) FMADD f8, A4, B9, f8 nop FMADD f12, A4, B10, f12 nop FMADD f1, A2, B5, f1 LFD A4, 7 * SIZE(AO) FMADD f5, A2, B8, f5 nop FMADD f9, A2, B9, f9 nop FMADD f13, A2, B10, f13 nop FMADD f2, A3, B5, f2 nop FMADD f6, A3, B8, f6 LFD B2, 9 * SIZE(BO) FMADD f10, A3, B9, f10 LFD B3, 10 * SIZE(BO) FMADD f14, A3, B10, f14 LFD B4, 11 * SIZE(BO) FMADD f3, A4, B5, f3 LFD A2, 9 * SIZE(AO) FMADD f7, A4, B8, f7 LFD B5, 20 * SIZE(BO) FMADD f11, A4, B9, f11 nop FMADD f15, A4, B10, f15 nop FMADD f0, A5, B6, f0 LFD A3, 10 * SIZE(AO) FMADD f4, A5, B2, f4 LFD A4, 20 * SIZE(AO) FMADD f8, A5, B3, f8 nop FMADD f12, A5, B4, f12 nop FMADD f1, A2, B6, f1 LFD A5, 11 * SIZE(AO) FMADD f5, A2, B2, f5 nop FMADD f9, A2, B3, f9 nop FMADD f13, A2, B4, f13 nop FMADD f2, A3, B6, f2 nop FMADD f6, A3, B2, f6 LFD B8, 13 * SIZE(BO) FMADD f10, A3, B3, f10 LFD B9, 14 * SIZE(BO) FMADD f14, A3, B4, f14 LFD B10,15 * SIZE(BO) FMADD f3, A5, B6, f3 LFD A2, 13 * SIZE(AO) FMADD f7, A5, B2, f7 LFD B6, 24 * SIZE(BO) FMADD f11, A5, B3, f11 nop FMADD f15, A5, B4, f15 nop FMADD f0, A6, B7, f0 LFD A3, 14 * SIZE(AO) FMADD f4, A6, B8, f4 LFD A5, 24 * SIZE(AO) FMADD f8, A6, B9, f8 nop FMADD f12, A6, B10, f12 nop FMADD f1, A2, B7, f1 LFD A6, 15 * SIZE(AO) FMADD f5, A2, B8, f5 nop FMADD f9, A2, B9, f9 nop FMADD f13, A2, B10, f13 nop FMADD f2, A3, B7, f2 addi AO, AO, 16 * SIZE FMADD f6, A3, B8, f6 LFD B2, 17 * SIZE(BO) FMADD f10, A3, B9, f10 LFD B3, 18 * SIZE(BO) FMADD f14, A3, B10, f14 LFD B4, 19 * SIZE(BO) FMADD f3, A6, B7, f3 LFD A2, 1 * SIZE(AO) FMADD f7, A6, B8, f7 LFD B7, 28 * SIZE(BO) FMADD f11, A6, B9, f11 addi BO, BO, 16 * SIZE FMADD f15, A6, B10, 
f15 bdnz .L12 .align 4 .L15: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L18 .align 4 .L16: FMADD f0, A1, B1, f0 LFD A3, 2 * SIZE(AO) FMADD f4, A1, B2, f4 FMADD f8, A1, B3, f8 FMADD f12, A1, B4, f12 LFD A4, 3 * SIZE(AO) FMADD f1, A2, B1, f1 FMADD f5, A2, B2, f5 FMADD f9, A2, B3, f9 FMADD f13, A2, B4, f13 LFDU A1, 4 * SIZE(AO) FMADD f2, A3, B1, f2 FMADD f6, A3, B2, f6 FMADD f10, A3, B3, f10 FMADD f14, A3, B4, f14 LFD A2, 1 * SIZE(AO) FMADD f3, A4, B1, f3 LFDU B1, 4 * SIZE(BO) FMADD f7, A4, B2, f7 LFD B2, 1 * SIZE(BO) FMADD f11, A4, B3, f11 LFD B3, 2 * SIZE(BO) FMADD f15, A4, B4, f15 LFD B4, 3 * SIZE(BO) bdnz .L16 .align 4 .L18: #if defined(LN) || defined(RT) subi r0, KK, 4 slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f24, 8 * SIZE(BO) LFD f25, 9 * SIZE(BO) LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 FSUB f2, f24, f2 FSUB f6, f25, f6 FSUB f10, f26, f10 FSUB f14, f27, f14 FSUB f3, f28, f3 FSUB f7, f29, f7 FSUB f11, f30, f11 FSUB f15, f31, f15 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f24, 8 * SIZE(AO) LFD f25, 9 * SIZE(AO) LFD f26, 10 * SIZE(AO) LFD f27, 11 * SIZE(AO) LFD f28, 12 * SIZE(AO) LFD f29, 13 * SIZE(AO) LFD f30, 14 * SIZE(AO) LFD f31, 15 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f10, f26, f10 FSUB f11, f27, f11 FSUB f12, f28, f12 FSUB f13, f29, f13 FSUB f14, f30, f14 FSUB f15, f31, f15 #endif #ifdef LN LFD f16, 15 * SIZE(AO) LFD f17, 14 * SIZE(AO) LFD f18, 13 * SIZE(AO) LFD f19, 12 * SIZE(AO) FMUL f3, f16, f3 FMUL f7, f16, f7 FMUL f11, f16, f11 FMUL f15, f16, f15 FNMSUB f2, f17, f3, f2 FNMSUB f6, f17, f7, f6 FNMSUB f10, f17, f11, f10 FNMSUB f14, f17, f15, f14 FNMSUB f1, f18, f3, f1 FNMSUB f5, f18, f7, f5 FNMSUB f9, f18, f11, f9 FNMSUB f13, f18, f15, f13 FNMSUB f0, f19, f3, f0 FNMSUB f4, f19, f7, f4 FNMSUB f8, f19, f11, f8 FNMSUB f12, f19, f15, f12 LFD f16, 10 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 8 * SIZE(AO) LFD f19, 5 * SIZE(AO) FMUL f2, f16, f2 FMUL f6, f16, f6 FMUL f10, f16, f10 FMUL f14, f16, f14 LFD f20, 4 * SIZE(AO) LFD f21, 0 * SIZE(AO) FNMSUB f1, f17, f2, f1 FNMSUB f5, f17, f6, f5 FNMSUB f9, f17, f10, f9 FNMSUB f13, f17, f14, f13 FNMSUB f0, f18, f2, f0 FNMSUB f4, f18, f6, f4 FNMSUB f8, f18, f10, f8 FNMSUB f12, f18, f14, f12 FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 FNMSUB f2, f18, f0, f2 FNMSUB f6, f18, f4, f6 FNMSUB f10, 
f18, f8, f10 FNMSUB f14, f18, f12, f14 FNMSUB f3, f19, f0, f3 FNMSUB f7, f19, f4, f7 FNMSUB f11, f19, f8, f11 FNMSUB f15, f19, f12, f15 LFD f16, 5 * SIZE(AO) LFD f17, 6 * SIZE(AO) LFD f18, 7 * SIZE(AO) LFD f19, 10 * SIZE(AO) FMUL f1, f16, f1 FMUL f5, f16, f5 FMUL f9, f16, f9 FMUL f13, f16, f13 LFD f20, 11 * SIZE(AO) LFD f21, 15 * SIZE(AO) FNMSUB f2, f17, f1, f2 FNMSUB f6, f17, f5, f6 FNMSUB f10, f17, f9, f10 FNMSUB f14, f17, f13, f14 FNMSUB f3, f18, f1, f3 FNMSUB f7, f18, f5, f7 FNMSUB f11, f18, f9, f11 FNMSUB f15, f18, f13, f15 FMUL f2, f19, f2 FMUL f6, f19, f6 FMUL f10, f19, f10 FMUL f14, f19, f14 FNMSUB f3, f20, f2, f3 FNMSUB f7, f20, f6, f7 FNMSUB f11, f20, f10, f11 FNMSUB f15, f20, f14, f15 FMUL f3, f21, f3 FMUL f7, f21, f7 FMUL f11, f21, f11 FMUL f15, f21, f15 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FMUL f2, f16, f2 FMUL f3, f16, f3 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f6, f17, f2, f6 FNMSUB f7, f17, f3, f7 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 FNMSUB f14, f19, f2, f14 FNMSUB f15, f19, f3, f15 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FMUL f6, f16, f6 FMUL f7, f16, f7 LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f10, f17, f6, f10 FNMSUB f11, f17, f7, f11 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FNMSUB f14, f18, f6, f14 FNMSUB f15, f18, f7, f15 FMUL f8, f19, f8 FMUL f9, f19, f9 FMUL f10, f19, f10 FMUL f11, f19, f11 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FNMSUB f14, f20, f10, f14 FNMSUB f15, f20, f11, f15 FMUL f12, f21, f12 FMUL f13, f21, f13 FMUL f14, f21, f14 FMUL f15, f21, f15 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FMUL f14, f16, f14 FMUL f15, f16, f15 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f10, f17, f14, f10 FNMSUB f11, f17, f15, f11 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f6, f18, f14, f6 FNMSUB f7, f18, f15, f7 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 FNMSUB f2, f19, f14, f2 FNMSUB f3, f19, f15, f3 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FMUL f10, f16, f10 FMUL f11, f16, f11 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f6, f17, f10, f6 FNMSUB f7, f17, f11, f7 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f19, f4 FMUL f5, f19, f5 FMUL f6, f19, f6 FMUL f7, f19, f7 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FNMSUB f2, f20, f6, f2 FNMSUB f3, f20, f7, f3 FMUL f0, f21, f0 FMUL f1, f21, f1 FMUL f2, f21, f2 FMUL f3, f21, f3 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE subi CO3, CO3, 4 * SIZE subi CO4, CO4, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) STFD f2, 8 * SIZE(BO) STFD f6, 9 * SIZE(BO) STFD f10, 10 * SIZE(BO) STFD f14, 11 * SIZE(BO) STFD f3, 12 * SIZE(BO) STFD f7, 13 * SIZE(BO) STFD f11, 14 * SIZE(BO) STFD f15, 15 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD 
f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) STFD f8, 8 * SIZE(AO) STFD f9, 9 * SIZE(AO) STFD f10, 10 * SIZE(AO) STFD f11, 11 * SIZE(AO) STFD f12, 12 * SIZE(AO) STFD f13, 13 * SIZE(AO) STFD f14, 14 * SIZE(AO) STFD f15, 15 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f10, 2 * SIZE(CO3) STFD f11, 3 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) STFD f14, 2 * SIZE(CO4) STFD f15, 3 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif addic. I, I, -1 bgt+ .L11 .align 4 .L20: andi. I, M, 2 ble .L30 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble .L25 .align 5 .L22: FMADD f0, f16, f20, f0 nop FMADD f1, f17, f20, f1 LFD f20, 8 * SIZE(BO) FMADD f4, f16, f21, f4 nop FMADD f5, f17, f21, f5 LFD f21, 9 * SIZE(BO) FMADD f8, f16, f22, f8 nop FMADD f9, f17, f22, f9 LFD f22, 10 * SIZE(BO) FMADD f12, f16, f23, f12 LFD f16, 4 * SIZE(AO) FMADD f13, f17, f23, f13 LFD f23, 11 * SIZE(BO) FMADD f2, f18, f24, f2 LFD f17, 5 * SIZE(AO) FMADD f3, f19, f24, f3 LFD f24, 12 * SIZE(BO) FMADD f6, f18, f25, f6 nop FMADD f7, f19, f25, f7 LFD f25, 13 * SIZE(BO) FMADD f10, f18, f26, f10 nop FMADD f11, f19, f26, f11 LFD f26, 14 * SIZE(BO) FMADD f14, f18, f27, f14 LFD f18, 6 * SIZE(AO) FMADD f15, f19, f27, f15 LFD f27, 15 * SIZE(BO) FMADD f0, f16, f20, f0 LFD f19, 7 * SIZE(AO) FMADD f1, f17, f20, f1 LFDU f20, 16 * SIZE(BO) FMADD f4, f16, f21, f4 nop FMADD f5, f17, f21, f5 LFD f21, 1 * SIZE(BO) FMADD f8, f16, f22, f8 nop FMADD f9, f17, f22, f9 LFD f22, 2 * SIZE(BO) FMADD f12, f16, f23, f12 LFDU f16, 8 * SIZE(AO) FMADD f13, f17, f23, f13 LFD f23, 3 * SIZE(BO) FMADD f2, f18, f24, f2 LFD f17, 1 * SIZE(AO) FMADD f3, f19, f24, f3 LFD f24, 4 * SIZE(BO) FMADD f6, f18, f25, f6 nop FMADD f7, f19, f25, f7 LFD f25, 5 * SIZE(BO) FMADD f10, f18, f26, f10 nop FMADD f11, f19, f26, f11 LFD f26, 6 * SIZE(BO) FMADD f14, f18, f27, f14 LFD f18, 2 * SIZE(AO) FMADD f15, f19, f27, f15 LFD f19, 3 * SIZE(AO) LFD f27, 7 * SIZE(BO) bdnz .L22 fadd f0, f2, f0 fadd f1, f3, f1 fadd f4, f6, f4 fadd f5, f7, f5 fadd f8, f10, f8 fadd f9, f11, f9 fadd f12, f14, f12 fadd f13, f15, f13 .align 4 .L25: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L28 .align 4 .L26: FMADD f0, f16, f20, f0 nop FMADD f1, f17, f20, f1 LFDU f20, 4 * SIZE(BO) FMADD f4, f16, f21, f4 nop FMADD f5, f17, f21, f5 LFD f21, 1 * SIZE(BO) FMADD f8, f16, f22, f8 nop FMADD f9, f17, f22, f9 LFD f22, 2 * SIZE(BO) FMADD f12, f16, f23, f12 LFDU f16, 2 * SIZE(AO) FMADD f13, f17, f23, f13 LFD f17, 1 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L26 .align 4 .L28: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 4 #endif slwi TEMP, r0, 1 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 FSUB f1, f20, f1 FSUB f5, f21, f5 FSUB f9, f22, f9 FSUB f13, f23, f13 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f28, 6 * SIZE(AO) LFD f29, 7 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f12, f28, f12 FSUB f13, f29, f13 #endif #ifdef LN LFD f19, 3 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 0 * SIZE(AO) FMUL f1, f19, f1 FMUL f5, f19, f5 FMUL f9, f19, f9 FMUL f13, f19, f13 FNMSUB f0, f20, f1, f0 FNMSUB f4, f20, f5, f4 FNMSUB f8, f20, f9, f8 FNMSUB f12, f20, f13, f12 FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 FNMSUB f1, f17, f0, f1 FNMSUB f5, f17, f4, f5 FNMSUB f9, f17, f8, f9 FNMSUB f13, f17, f12, f13 LFD f17, 3 * SIZE(AO) FMUL f1, f17, f1 FMUL f5, f17, f5 FMUL f9, f17, f9 FMUL f13, f17, f13 #endif #ifdef RN LFD f16, 0 * SIZE(BO) 
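/* Descriptive note (added, hedged): RN solve for this 2-row cell.  The  */
/* 2x4 accumulator block (f0/f1, f4/f5, f8/f9, f12/f13 = columns 0..3)   */
/* is forward-substituted against the 4x4 upper-triangular block of B.   */
/* The packed B diagonal is assumed to already hold reciprocals          */
/* (1/b(j,j)), as produced by the trsm packing routines, so each         */
/* diagonal step is an FMUL rather than a divide, and the FNMSUBs        */
/* remove the contributions of the columns already solved.               */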
LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FMUL f1, f16, f1 FNMSUB f4, f17, f0, f4 FNMSUB f5, f17, f1, f5 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f12, f19, f0, f12 FNMSUB f13, f19, f1, f13 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FMUL f5, f16, f5 FNMSUB f8, f17, f4, f8 FNMSUB f9, f17, f5, f9 FNMSUB f12, f18, f4, f12 FNMSUB f13, f18, f5, f13 FMUL f8, f19, f8 FMUL f9, f19, f9 FNMSUB f12, f20, f8, f12 FNMSUB f13, f20, f9, f13 FMUL f12, f21, f12 FMUL f13, f21, f13 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FMUL f13, f16, f13 FNMSUB f8, f17, f12, f8 FNMSUB f9, f17, f13, f9 FNMSUB f4, f18, f12, f4 FNMSUB f5, f18, f13, f5 FNMSUB f0, f19, f12, f0 FNMSUB f1, f19, f13, f1 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FMUL f8, f16, f8 FMUL f9, f16, f9 FNMSUB f4, f17, f8, f4 FNMSUB f5, f17, f9, f5 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FMUL f4, f19, f4 FMUL f5, f19, f5 FNMSUB f0, f20, f4, f0 FNMSUB f1, f20, f5, f1 FMUL f0, f21, f0 FMUL f1, f21, f1 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE subi CO3, CO3, 2 * SIZE subi CO4, CO4, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) STFD f1, 4 * SIZE(BO) STFD f5, 5 * SIZE(BO) STFD f9, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f4, 2 * SIZE(AO) STFD f5, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f12, 6 * SIZE(AO) STFD f13, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #endif #ifdef RT slwi r0, K, 1 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 2 #endif #ifdef LT addi KK, KK, 2 #endif .align 4 .L30: andi. I, M, 1 ble .L39 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, BASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + BASE_SHIFT slwi TEMP, KK, 2 + BASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble .L35 .align 5 .L32: FMADD f0, f16, f20, f0 LFD f20, 8 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f21, 9 * SIZE(BO) FMADD f8, f16, f22, f8 LFD f22, 10 * SIZE(BO) FMADD f12, f16, f23, f12 LFD f23, 11 * SIZE(BO) LFDU f16, 4 * SIZE(AO) FMADD f1, f17, f24, f1 LFD f24, 12 * SIZE(BO) FMADD f5, f17, f25, f5 LFD f25, 13 * SIZE(BO) FMADD f9, f17, f26, f9 LFD f26, 14 * SIZE(BO) FMADD f13, f17, f27, f13 LFD f27, 15 * SIZE(BO) LFD f17, 1 * SIZE(AO) FMADD f0, f18, f20, f0 LFDU f20, 16 * SIZE(BO) FMADD f4, f18, f21, f4 LFD f21, 1 * SIZE(BO) FMADD f8, f18, f22, f8 LFD f22, 2 * SIZE(BO) FMADD f12, f18, f23, f12 LFD f23, 3 * SIZE(BO) LFD f18, 2 * SIZE(AO) FMADD f1, f19, f24, f1 LFD f24, 4 * SIZE(BO) FMADD f5, f19, f25, f5 LFD f25, 5 * SIZE(BO) FMADD f9, f19, f26, f9 LFD f26, 6 * SIZE(BO) FMADD f13, f19, f27, f13 LFD f27, 7 * SIZE(BO) LFD f19, 3 * SIZE(AO) bdnz .L32 fadd f0, f1, f0 fadd f4, f5, f4 fadd f8, f9, f8 fadd f12, f13, f12 .align 4 .L35: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble+ .L38 .align 4 .L36: FMADD f0, f16, f20, f0 LFDU f20, 4 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f21, 1 * SIZE(BO) FMADD f8, f16, f22, f8 LFD f22, 2 * SIZE(BO) FMADD f12, f16, f23, f12 LFDU f16, 1 * SIZE(AO) LFD f23, 3 * SIZE(BO) bdnz .L36 .align 4 .L38: #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 4 #endif slwi TEMP, r0, 0 + BASE_SHIFT slwi r0, r0, 2 + BASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f4, f17, f4 FSUB f8, f18, f8 FSUB f12, f19, f12 #else LFD f16, 0 * SIZE(AO) LFD f20, 1 * SIZE(AO) LFD f24, 2 * SIZE(AO) LFD f28, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f4, f20, f4 FSUB f8, f24, f8 FSUB f12, f28, f12 #endif #ifdef LN LFD f21, 0 * SIZE(AO) FMUL f0, f21, f0 FMUL f4, f21, f4 FMUL f8, f21, f8 FMUL f12, f21, f12 #endif #ifdef LT LFD f16, 0 * SIZE(AO) FMUL f0, f16, f0 FMUL f4, f16, f4 FMUL f8, f16, f8 FMUL f12, f16, f12 #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FMUL f0, f16, f0 FNMSUB f4, f17, f0, f4 FNMSUB f8, f18, f0, f8 FNMSUB f12, f19, f0, f12 LFD f16, 5 * SIZE(BO) LFD f17, 6 * SIZE(BO) LFD f18, 7 * SIZE(BO) LFD f19, 10 * SIZE(BO) LFD f20, 11 * SIZE(BO) LFD f21, 15 * SIZE(BO) FMUL f4, f16, f4 FNMSUB f8, f17, f4, f8 FNMSUB f12, f18, f4, f12 FMUL f8, f19, f8 FNMSUB f12, f20, f8, f12 FMUL f12, f21, f12 #endif #ifdef RT LFD f16, 15 * SIZE(BO) LFD f17, 14 * SIZE(BO) LFD f18, 13 * SIZE(BO) LFD f19, 12 * SIZE(BO) FMUL f12, f16, f12 FNMSUB f8, f17, f12, f8 FNMSUB f4, f18, f12, f4 FNMSUB f0, f19, f12, f0 LFD f16, 10 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 8 * SIZE(BO) LFD f19, 5 * SIZE(BO) FMUL f8, f16, f8 LFD f20, 4 * SIZE(BO) LFD f21, 0 * SIZE(BO) FNMSUB f4, f17, f8, f4 FNMSUB f0, f18, f8, f0 FMUL f4, f19, f4 FNMSUB f0, f20, f4, f0 FMUL f0, f21, f0 #endif #ifdef LN subi CO1, CO1, 1 * SIZE subi CO2, CO2, 1 * SIZE subi CO3, CO3, 1 * SIZE subi CO4, CO4, 1 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f4, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f12, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f4, 1 * SIZE(AO) STFD f8, 2 * SIZE(AO) STFD f12, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 fmr f8, f0 fmr f9, f0 fmr f12, f0 fmr f13, f0 #ifndef LN addi CO1, CO1, 1 * SIZE addi 
CO2, CO2, 1 * SIZE addi CO3, CO3, 1 * SIZE addi CO4, CO4, 1 * SIZE #endif #ifdef RT slwi r0, K, 0 + BASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + BASE_SHIFT slwi TEMP, TEMP, 2 + BASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LN subi KK, KK, 1 #endif #ifdef LT addi KK, KK, 1 #endif .align 4 .L39: #ifdef LN slwi r0, K, 2 + BASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 4 #endif #ifdef RT subi KK, KK, 4 #endif addic. J, J, -1 lfs f0, FZERO bgt .L10 .align 4 .L999: addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) lwz r18, 196(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zamax.S000066400000000000000000000227341313527062700170340ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PREA r8 #define INCXM1 r9 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, ZBASE_SHIFT subi INCXM1, INCX, SIZE li PREA, L1_PREFETCHSIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFD f1, 0 * SIZE(X) LFD f2, 1 * SIZE(X) add X, X, INCX fabs f1, f1 fabs f2, f2 fadd f1, f1, f2 fmr f0, f1 fmr f2, f1 fmr f3, f1 subi N, N, 1 cmpwi cr0, INCX, 2 * SIZE bne- cr0, LL(100) srawi. r0, N, 3 mtspr CTR, r0 beq- cr0, LL(50) .align 4 LFD f24, 0 * SIZE(X) LFD f25, 1 * SIZE(X) LFD f26, 2 * SIZE(X) LFD f27, 3 * SIZE(X) LFD f28, 4 * SIZE(X) LFD f29, 5 * SIZE(X) LFD f30, 6 * SIZE(X) LFD f31, 7 * SIZE(X) fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f24, 8 * SIZE(X) LFD f25, 9 * SIZE(X) LFD f26, 10 * SIZE(X) LFD f27, 11 * SIZE(X) LFD f28, 12 * SIZE(X) LFD f29, 13 * SIZE(X) LFD f30, 14 * SIZE(X) LFD f31, 15 * SIZE(X) bdz LL(20) .align 4 LL(10): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 16 * SIZE(X) LFD f25, 17 * SIZE(X) LFD f26, 18 * SIZE(X) LFD f27, 19 * SIZE(X) fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 20 * SIZE(X) LFD f29, 21 * SIZE(X) LFD f30, 22 * SIZE(X) LFD f31, 23 * SIZE(X) fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 24 * SIZE(X) LFD f25, 25 * SIZE(X) LFD f26, 26 * SIZE(X) LFD f27, 27 * SIZE(X) fsel f0, f16, f0, f4 fsel f1, f17, f1, f5 fsel f2, f18, f2, f6 fsel f3, f19, f3, f7 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 28 * SIZE(X) LFD f29, 29 * SIZE(X) LFD f30, 30 * SIZE(X) LFD f31, 31 * SIZE(X) fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f0, f20 fsel f1, f17, f1, f21 fsel f2, f18, f2, f22 fsel f3, f19, f3, f23 #ifndef POWER6 L1_PREFETCH X, PREA #endif addi X, X, 16 * SIZE #ifdef POWER6 L1_PREFETCH X, PREA #endif bdnz LL(10) .align 4 LL(20): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fsel f0, f16, f0, f4 fsel f1, f17, f1, f5 fsel f2, f18, f2, f6 fsel f3, f19, f3, f7 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f0, f20 fsel f1, f17, f1, f21 fsel f2, f18, f2, f22 fsel f3, f19, f3, f23 addi X, X, 16 * SIZE .align 4 LL(50): andi. 
r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) addi X, X, 2 * SIZE fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCXM1 srawi. r0, N, 3 mtspr CTR, r0 beq- LL(150) LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX bdz LL(120) .align 4 LL(110): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX fsel f0, f16, f0, f4 fsel f1, f17, f1, f5 fsel f2, f18, f2, f6 fsel f3, f19, f3, f7 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f0, f20 fsel f1, f17, f1, f21 fsel f2, f18, f2, f22 fsel f3, f19, f3, f23 bdnz LL(110) .align 4 LL(120): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fsel f0, f16, f0, f4 fsel f1, f17, f1, f5 fsel f2, f18, f2, f6 fsel f3, f19, f3, f7 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f0, f20 fsel f1, f17, f1, f21 fsel f2, f18, f2, f22 fsel f3, f19, f3, f23 .align 4 LL(150): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDX f8, X, INCXM1 LFDUX f9, X, INCX fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsel f0, f8, f0, f1 fsel f2, f9, f2, f3 fsub f8, f0, f2 fsel f1, f8, f0, f2 .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zamax_cell.S000066400000000000000000000226271313527062700200340ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PREA r8 #define INCXM1 r9 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, ZBASE_SHIFT subi INCXM1, INCX, SIZE li PREA, 10 * 16 * SIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFD f1, 0 * SIZE(X) LFD f2, 1 * SIZE(X) add X, X, INCX fabs f1, f1 fabs f2, f2 fadd f1, f1, f2 fmr f0, f1 fmr f2, f1 fmr f3, f1 subi N, N, 1 cmpwi cr0, INCX, 2 * SIZE bne- cr0, LL(100) srawi. 
r0, N, 3 mtspr CTR, r0 beq- cr0, LL(50) .align 4 LFD f24, 0 * SIZE(X) LFD f25, 1 * SIZE(X) fabs f8, f24 LFD f26, 2 * SIZE(X) fabs f9, f25 LFD f27, 3 * SIZE(X) fabs f10, f26 LFD f28, 4 * SIZE(X) fabs f11, f27 LFD f29, 5 * SIZE(X) fabs f12, f28 LFD f30, 6 * SIZE(X) fabs f13, f29 LFD f31, 7 * SIZE(X) fabs f14, f30 nop fabs f15, f31 bdz LL(20) .align 4 LL(10): fadd f4, f8, f9 dcbt X, PREA fadd f5, f10, f11 nop fadd f6, f12, f13 LFD f24, 8 * SIZE(X) fadd f7, f14, f15 LFD f25, 9 * SIZE(X) fabs f8, f24 LFD f26, 10 * SIZE(X) fabs f9, f25 LFD f27, 11 * SIZE(X) fabs f10, f26 fabs f11, f27 fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 LFD f28, 12 * SIZE(X) fsub f19, f3, f7 LFD f29, 13 * SIZE(X) fabs f12, f28 LFD f30, 14 * SIZE(X) fabs f13, f29 LFD f31, 15 * SIZE(X) fabs f14, f30 fabs f15, f31 fsel f0, f16, f0, f4 fsel f1, f17, f1, f5 fsel f2, f18, f2, f6 fsel f3, f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 LFD f24, 16 * SIZE(X) fadd f23, f14, f15 LFD f25, 17 * SIZE(X) fabs f8, f24 LFD f26, 18 * SIZE(X) fabs f9, f25 LFD f27, 19 * SIZE(X) fabs f10, f26 fabs f11, f27 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 LFD f28, 20 * SIZE(X) fsub f19, f3, f23 LFD f29, 21 * SIZE(X) fabs f12, f28 LFD f30, 22 * SIZE(X) fabs f13, f29 LFD f31, 23 * SIZE(X) fabs f14, f30 addi X, X, 16 * SIZE fabs f15, f31 fsel f0, f16, f0, f20 fsel f1, f17, f1, f21 fsel f2, f18, f2, f22 fsel f3, f19, f3, f23 bdnz LL(10) .align 4 LL(20): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 LFD f24, 8 * SIZE(X) fadd f7, f14, f15 LFD f25, 9 * SIZE(X) fabs f8, f24 LFD f26, 10 * SIZE(X) fabs f9, f25 LFD f27, 11 * SIZE(X) fabs f10, f26 fabs f11, f27 fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 LFD f28, 12 * SIZE(X) fsub f19, f3, f7 LFD f29, 13 * SIZE(X) fabs f12, f28 LFD f30, 14 * SIZE(X) fabs f13, f29 LFD f31, 15 * SIZE(X) fabs f14, f30 fabs f15, f31 fsel f0, f16, f0, f4 fsel f1, f17, f1, f5 fsel f2, f18, f2, f6 fsel f3, f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f0, f20 fsel f1, f17, f1, f21 fsel f2, f18, f2, f22 fsel f3, f19, f3, f23 addi X, X, 16 * SIZE .align 4 LL(50): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) addi X, X, 2 * SIZE fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCXM1 srawi. 
r0, N, 3 mtspr CTR, r0 beq- LL(150) LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX bdz LL(120) .align 4 LL(110): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX fsel f0, f16, f0, f4 fsel f1, f17, f1, f5 fsel f2, f18, f2, f6 fsel f3, f19, f3, f7 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f0, f20 fsel f1, f17, f1, f21 fsel f2, f18, f2, f22 fsel f3, f19, f3, f23 bdnz LL(110) .align 4 LL(120): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fsel f0, f16, f0, f4 fsel f1, f17, f1, f5 fsel f2, f18, f2, f6 fsel f3, f19, f3, f7 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f0, f20 fsel f1, f17, f1, f21 fsel f2, f18, f2, f22 fsel f3, f19, f3, f23 .align 4 LL(150): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDX f8, X, INCXM1 LFDUX f9, X, INCX fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsel f0, f8, f0, f1 fsel f2, f9, f2, f3 fsub f8, f0, f2 fsel f1, f8, f0, f2 .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zamax_hummer.S000066400000000000000000000156001313527062700204030ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define INCX2 r6 #define C1 f1 #define C2 f0 #define C3 f2 #define C4 f3 #define A1 f4 #define A2 f5 #define A3 f6 #define A4 f7 #define A5 f8 #define A6 f9 #define A7 f10 #define A8 f11 #define F1 f12 #define F2 f13 #define F3 f14 #define F4 f15 #define T1 f16 #define T2 f17 #define T3 f18 #define T4 f19 #define B1 f20 #define B2 f21 #define B3 f22 #define B4 f23 #define B5 f24 #define B6 f25 #define B7 f26 #define B8 f27 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 stfpdux f19, SP, r10 stfpdux f20, SP, r10 stfpdux f21, SP, r10 stfpdux f22, SP, r10 stfpdux f23, SP, r10 stfpdux f24, SP, r10 stfpdux f25, SP, r10 stfpdux f26, SP, r10 stfpdux f27, SP, r10 li r10, 0 stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif lfpdx C1, SP, r10 # Zero clear slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, INCX, 0 ble LL(999) LFD A1, 0 * SIZE(X) LFD A2, 1 * SIZE(X) add X, X, INCX2 fabs A1, A1 fabs A2, A2 addi N, N, -1 cmpwi cr0, N, 0 fadd C1, A1, A2 ble LL(999) subi INCX2, INCX2, SIZE fsmfp C1, C1 li INCX, SIZE fpmr C2, C1 sub X, X, INCX2 fpmr C3, C1 srawi. 
r0, N, 3 fpmr C4, C1 mtspr CTR, r0 beq- LL(105) LFDUX A1, X, INCX2 LFDUX A2, X, INCX LFDUX A3, X, INCX2 LFDUX A4, X, INCX LFSDUX A1, X, INCX2 LFSDUX A2, X, INCX LFSDUX A3, X, INCX2 LFSDUX A4, X, INCX LFDUX A5, X, INCX2 LFDUX A6, X, INCX LFDUX A7, X, INCX2 LFDUX A8, X, INCX LFSDUX A5, X, INCX2 LFSDUX A6, X, INCX LFSDUX A7, X, INCX2 LFSDUX A8, X, INCX bdz LL(103) .align 4 LL(102): fpabs B1, A1 LFDUX A1, X, INCX2 fpabs B2, A2 LFDUX A2, X, INCX fpabs B3, A3 LFDUX A3, X, INCX2 fpabs B4, A4 LFDUX A4, X, INCX fpabs B5, A5 LFSDUX A1, X, INCX2 fpabs B6, A6 LFSDUX A2, X, INCX fpabs B7, A7 LFSDUX A3, X, INCX2 fpabs B8, A8 LFSDUX A4, X, INCX fpadd T1, B1, B2 LFDUX A5, X, INCX2 fpadd T2, B3, B4 LFDUX A6, X, INCX fpadd T3, B5, B6 LFDUX A7, X, INCX2 fpadd T4, B7, B8 LFDUX A8, X, INCX fpsub F1, C1, T1 LFSDUX A5, X, INCX2 fpsub F2, C2, T2 LFSDUX A6, X, INCX fpsub F3, C3, T3 LFSDUX A7, X, INCX2 fpsub F4, C4, T4 LFSDUX A8, X, INCX fpsel C1, F1, C1, T1 fpsel C2, F2, C2, T2 fpsel C3, F3, C3, T3 fpsel C4, F4, C4, T4 bdnz LL(102) .align 4 LL(103): fpabs B1, A1 fpabs B2, A2 fpabs B3, A3 fpabs B4, A4 fpabs B5, A5 fpabs B6, A6 fpabs B7, A7 fpabs B8, A8 fpadd T1, B1, B2 fpadd T2, B3, B4 fpadd T3, B5, B6 fpadd T4, B7, B8 fpsub F1, C1, T1 fpsub F2, C2, T2 fpsub F3, C3, T3 fpsub F4, C4, T4 fpsel C1, F1, C1, T1 fpsel C2, F2, C2, T2 fpsel C3, F3, C3, T3 fpsel C4, F4, C4, T4 .align 4 LL(105): andi. r0, N, 7 beq LL(998) andi. r0, N, 4 beq LL(106) LFDUX A1, X, INCX2 LFDUX A2, X, INCX LFDUX A3, X, INCX2 LFDUX A4, X, INCX LFSDUX A1, X, INCX2 LFSDUX A2, X, INCX LFSDUX A3, X, INCX2 LFSDUX A4, X, INCX fpabs A1, A1 fpabs A2, A2 fpabs A3, A3 fpabs A4, A4 fpadd A1, A1, A2 fpadd A3, A3, A4 fpsub F1, C1, A1 fpsub F2, C2, A3 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A3 .align 4 LL(106): andi. r0, N, 2 beq LL(107) LFDUX A1, X, INCX2 LFDUX A2, X, INCX LFSDUX A1, X, INCX2 LFSDUX A2, X, INCX fpabs A1, A1 fpabs A2, A2 fpadd A1, A1, A2 fpsub F1, C1, A1 fpsel C1, F1, C1, A1 .align 4 LL(107): andi. r0, N, 1 beq LL(998) LFDUX A1, X, INCX2 LFDUX A2, X, INCX fabs A1, A1 fabs A2, A2 fadd A1, A1, A2 fsub F1, C1, A1 fsel C1, F1, C1, A1 .align 4 LL(998): fpsub F1, C1, C2 fpsub F2, C3, C4 fpsel C1, F1, C1, C2 fpsel C3, F2, C3, C4 fpsub F1, C1, C3 fpsel C1, F1, C1, C3 fsmtp C2, C1 fsub F1, C1, C2 fsel C1, F1, C1, C2 .align 4 LL(999): li r10, 16 lfpdux f27, SP, r10 lfpdux f26, SP, r10 lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zamax_ppc440.S000066400000000000000000000152461313527062700201260ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PREX r8 #define INC1 r9 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, ZBASE_SHIFT sub X, X, INCX li INC1, SIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFDUX f1, X, INCX LFDX f2, X, INC1 fabs f1, f1 li PREX, 4 * 8 * SIZE fabs f2, f2 fadd f1, f1, f2 fmr f0, f1 fmr f2, f1 fmr f3, f1 subi N, N, 1 srawi. 
r0, N, 3 mtspr CTR, r0 beq- LL(150) LFDUX f24, X, INCX LFDX f25, X, INC1 LFDUX f26, X, INCX LFDX f27, X, INC1 LFDUX f28, X, INCX LFDX f29, X, INC1 LFDUX f30, X, INCX LFDX f31, X, INC1 fabs f8, f24 LFDUX f24, X, INCX fabs f9, f25 LFDX f25, X, INC1 fabs f10, f26 LFDUX f26, X, INCX fabs f11, f27 LFDX f27, X, INC1 fabs f12, f28 LFDUX f28, X, INCX fabs f13, f29 LFDX f29, X, INC1 fabs f14, f30 LFDUX f30, X, INCX fabs f15, f31 LFDX f31, X, INC1 bdz LL(120) .align 4 LL(110): fadd f4, f8, f9 #ifdef PPCG4 dcbt X, PREX #endif fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 LFDUX f24, X, INCX fabs f9, f25 LFDX f25, X, INC1 fabs f10, f26 LFDUX f26, X, INCX fabs f11, f27 LFDX f27, X, INC1 fabs f12, f28 #ifdef PPCG4 dcbt X, PREX #endif fabs f13, f29 LFDUX f28, X, INCX fabs f14, f30 LFDX f29, X, INC1 fabs f15, f31 LFDUX f30, X, INCX fsub f16, f0, f4 LFDX f31, X, INC1 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 #ifdef PPCG4 dcbt X, PREX #endif fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fabs f8, f24 LFDUX f24, X, INCX fabs f9, f25 LFDX f25, X, INC1 fabs f10, f26 LFDUX f26, X, INCX fabs f11, f27 LFDX f27, X, INC1 fsel f0, f16, f0, f4 #ifdef PPCG4 dcbt X, PREX #endif fsel f1, f17, f1, f5 fsel f2, f18, f2, f6 fsel f3, f19, f3, f7 fabs f12, f28 LFDUX f28, X, INCX fabs f13, f29 LFDX f29, X, INC1 fabs f14, f30 LFDUX f30, X, INCX fabs f15, f31 LFDX f31, X, INC1 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f0, f20 fsel f1, f17, f1, f21 fsel f2, f18, f2, f22 fsel f3, f19, f3, f23 bdnz LL(110) .align 4 LL(120): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fsel f0, f16, f0, f4 fsel f1, f17, f1, f5 fsel f2, f18, f2, f6 fsel f3, f19, f3, f7 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f0, f20 fsel f1, f17, f1, f21 fsel f2, f18, f2, f22 fsel f3, f19, f3, f23 .align 4 LL(150): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX LFDX f9, X, INC1 fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 fsub f16, f1, f8 fsel f1, f16, f1, f8 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsel f0, f8, f0, f1 fsel f2, f9, f2, f3 fsub f8, f0, f2 fsel f1, f8, f0, f2 .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zamin.S000066400000000000000000000227141313527062700170300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PREA r8 #define INCXM1 r9 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, ZBASE_SHIFT subi INCXM1, INCX, SIZE li PREA, L1_PREFETCHSIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFD f1, 0 * SIZE(X) LFD f2, 1 * SIZE(X) add X, X, INCX fabs f1, f1 fabs f2, f2 fadd f1, f1, f2 fmr f0, f1 fmr f2, f1 fmr f3, f1 subi N, N, 1 cmpwi cr0, INCX, 2 * SIZE bne- cr0, LL(100) srawi. 
r0, N, 3 mtspr CTR, r0 beq- cr0, LL(50) .align 4 LFD f24, 0 * SIZE(X) LFD f25, 1 * SIZE(X) LFD f26, 2 * SIZE(X) LFD f27, 3 * SIZE(X) LFD f28, 4 * SIZE(X) LFD f29, 5 * SIZE(X) LFD f30, 6 * SIZE(X) LFD f31, 7 * SIZE(X) fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f24, 8 * SIZE(X) LFD f25, 9 * SIZE(X) LFD f26, 10 * SIZE(X) LFD f27, 11 * SIZE(X) LFD f28, 12 * SIZE(X) LFD f29, 13 * SIZE(X) LFD f30, 14 * SIZE(X) LFD f31, 15 * SIZE(X) bdz LL(20) .align 4 LL(10): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 16 * SIZE(X) LFD f25, 17 * SIZE(X) LFD f26, 18 * SIZE(X) LFD f27, 19 * SIZE(X) fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 20 * SIZE(X) LFD f29, 21 * SIZE(X) LFD f30, 22 * SIZE(X) LFD f31, 23 * SIZE(X) fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 24 * SIZE(X) LFD f25, 25 * SIZE(X) LFD f26, 26 * SIZE(X) LFD f27, 27 * SIZE(X) fsel f0, f16, f4, f0 fsel f1, f17, f5, f1 fsel f2, f18, f6, f2 fsel f3, f19, f7, f3 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 28 * SIZE(X) LFD f29, 29 * SIZE(X) LFD f30, 30 * SIZE(X) LFD f31, 31 * SIZE(X) fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f20, f0 fsel f1, f17, f21, f1 fsel f2, f18, f22, f2 fsel f3, f19, f23, f3 #ifndef POWER6 L1_PREFETCH X, PREA #endif addi X, X, 16 * SIZE #ifdef POWER6 L1_PREFETCH X, PREA #endif bdnz LL(10) .align 4 LL(20): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fsel f0, f16, f4, f0 fsel f1, f17, f5, f1 fsel f2, f18, f6, f2 fsel f3, f19, f7, f3 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f20, f0 fsel f1, f17, f21, f1 fsel f2, f18, f22, f2 fsel f3, f19, f23, f3 addi X, X, 16 * SIZE .align 4 LL(50): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) addi X, X, 2 * SIZE fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 fsub f16, f1, f8 fsel f1, f16, f8, f1 bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCXM1 srawi. 
r0, N, 3 mtspr CTR, r0 beq- LL(150) LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX bdz LL(120) .align 4 LL(110): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX fsel f0, f16, f4, f0 fsel f1, f17, f5, f1 fsel f2, f18, f6, f2 fsel f3, f19, f7, f3 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f20, f0 fsel f1, f17, f21, f1 fsel f2, f18, f22, f2 fsel f3, f19, f23, f3 bdnz LL(110) .align 4 LL(120): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fsel f0, f16, f4, f0 fsel f1, f17, f5, f1 fsel f2, f18, f6, f2 fsel f3, f19, f7, f3 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f20, f0 fsel f1, f17, f21, f1 fsel f2, f18, f22, f2 fsel f3, f19, f23, f3 .align 4 LL(150): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDX f8, X, INCXM1 LFDUX f9, X, INCX fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 fsub f16, f1, f8 fsel f1, f16, f8, f1 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsel f0, f8, f1, f0 fsel f2, f9, f3, f2 fsub f8, f0, f2 fsel f1, f8, f2, f0 .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zamin_cell.S000066400000000000000000000225641313527062700200320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PREA r8 #define INCXM1 r9 #define FZERO f1 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, ZBASE_SHIFT subi INCXM1, INCX, SIZE li PREA, 10 * 16 * SIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFD f1, 0 * SIZE(X) LFD f2, 1 * SIZE(X) add X, X, INCX fabs f1, f1 fabs f2, f2 fadd f1, f1, f2 fmr f0, f1 fmr f2, f1 fmr f3, f1 subi N, N, 1 cmpwi cr0, INCX, 2 * SIZE bne- cr0, LL(100) srawi. 
r0, N, 3 mtspr CTR, r0 beq- cr0, LL(50) .align 4 LFD f24, 0 * SIZE(X) LFD f25, 1 * SIZE(X) fabs f8, f24 LFD f26, 2 * SIZE(X) fabs f9, f25 LFD f27, 3 * SIZE(X) fabs f10, f26 LFD f28, 4 * SIZE(X) fabs f11, f27 LFD f29, 5 * SIZE(X) fabs f12, f28 LFD f30, 6 * SIZE(X) fabs f13, f29 LFD f31, 7 * SIZE(X) fabs f14, f30 nop fabs f15, f31 bdz LL(20) .align 4 LL(10): fadd f4, f8, f9 dcbt X, PREA fadd f5, f10, f11 nop fadd f6, f12, f13 LFD f24, 8 * SIZE(X) fadd f7, f14, f15 LFD f25, 9 * SIZE(X) fabs f8, f24 LFD f26, 10 * SIZE(X) fabs f9, f25 LFD f27, 11 * SIZE(X) fabs f10, f26 fabs f11, f27 fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 LFD f28, 12 * SIZE(X) fsub f19, f3, f7 LFD f29, 13 * SIZE(X) fabs f12, f28 LFD f30, 14 * SIZE(X) fabs f13, f29 LFD f31, 15 * SIZE(X) fabs f14, f30 fabs f15, f31 fsel f0, f16, f4, f0 fsel f1, f17, f5, f1 fsel f2, f18, f6, f2 fsel f3, f19, f7, f3 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 LFD f24, 16 * SIZE(X) fadd f23, f14, f15 LFD f25, 17 * SIZE(X) fabs f8, f24 LFD f26, 18 * SIZE(X) fabs f9, f25 LFD f27, 19 * SIZE(X) fabs f10, f26 fabs f11, f27 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 LFD f28, 20 * SIZE(X) fsub f19, f3, f23 LFD f29, 21 * SIZE(X) fabs f12, f28 LFD f30, 22 * SIZE(X) fabs f13, f29 LFD f31, 23 * SIZE(X) fabs f14, f30 addi X, X, 16 * SIZE fabs f15, f31 fsel f0, f16, f20, f0 fsel f1, f17, f21, f1 fsel f2, f18, f22, f2 fsel f3, f19, f23, f3 bdnz LL(10) .align 4 LL(20): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 LFD f24, 8 * SIZE(X) fadd f7, f14, f15 LFD f25, 9 * SIZE(X) fabs f8, f24 LFD f26, 10 * SIZE(X) fabs f9, f25 LFD f27, 11 * SIZE(X) fabs f10, f26 fabs f11, f27 fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 LFD f28, 12 * SIZE(X) fsub f19, f3, f7 LFD f29, 13 * SIZE(X) fabs f12, f28 LFD f30, 14 * SIZE(X) fabs f13, f29 LFD f31, 15 * SIZE(X) fabs f14, f30 fabs f15, f31 fsel f0, f16, f4, f0 fsel f1, f17, f5, f1 fsel f2, f18, f6, f2 fsel f3, f19, f7, f3 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f20, f0 fsel f1, f17, f21, f1 fsel f2, f18, f22, f2 fsel f3, f19, f23, f3 addi X, X, 16 * SIZE .align 4 LL(50): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) addi X, X, 2 * SIZE fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 fsub f16, f1, f8 fsel f1, f16, f8, f1 bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCXM1 srawi. 
r0, N, 3 mtspr CTR, r0 beq- LL(150) LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX bdz LL(120) .align 4 LL(110): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX fsel f0, f16, f4, f0 fsel f1, f17, f5, f1 fsel f2, f18, f6, f2 fsel f3, f19, f7, f3 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f20, f0 fsel f1, f17, f21, f1 fsel f2, f18, f22, f2 fsel f3, f19, f23, f3 bdnz LL(110) .align 4 LL(120): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fsel f0, f16, f4, f0 fsel f1, f17, f5, f1 fsel f2, f18, f6, f2 fsel f3, f19, f7, f3 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f20, f0 fsel f1, f17, f21, f1 fsel f2, f18, f22, f2 fsel f3, f19, f23, f3 .align 4 LL(150): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDX f8, X, INCXM1 LFDUX f9, X, INCX fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 fsub f16, f1, f8 fsel f1, f16, f8, f1 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsel f0, f8, f1, f0 fsel f2, f9, f3, f2 fsub f8, f0, f2 fsel f1, f8, f2, f0 .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zamin_hummer.S000066400000000000000000000155741313527062700204130ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define INCX2 r6 #define C1 f1 #define C2 f0 #define C3 f2 #define C4 f3 #define A1 f4 #define A2 f5 #define A3 f6 #define A4 f7 #define A5 f8 #define A6 f9 #define A7 f10 #define A8 f11 #define F1 f12 #define F2 f13 #define F3 f14 #define F4 f15 #define T1 f16 #define T2 f17 #define T3 f18 #define T4 f19 #define B1 f20 #define B2 f21 #define B3 f22 #define B4 f23 #define B5 f24 #define B6 f25 #define B7 f26 #define B8 f27 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 stfpdux f19, SP, r10 stfpdux f20, SP, r10 stfpdux f21, SP, r10 stfpdux f22, SP, r10 stfpdux f23, SP, r10 stfpdux f24, SP, r10 stfpdux f25, SP, r10 stfpdux f26, SP, r10 stfpdux f27, SP, r10 li r10, 0 stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif lfpdx C1, SP, r10 # Zero clear slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, INCX, 0 ble LL(999) LFD A1, 0 * SIZE(X) LFD A2, 1 * SIZE(X) add X, X, INCX2 fabs A1, A1 fabs A2, A2 addi N, N, -1 cmpwi cr0, N, 0 fadd C1, A1, A2 ble LL(999) subi INCX2, INCX2, SIZE fsmfp C1, C1 li INCX, SIZE fpmr C2, C1 sub X, X, INCX2 fpmr C3, C1 srawi. 
r0, N, 3 fpmr C4, C1 mtspr CTR, r0 beq- LL(105) LFDUX A1, X, INCX2 LFDUX A2, X, INCX LFDUX A3, X, INCX2 LFDUX A4, X, INCX LFSDUX A1, X, INCX2 LFSDUX A2, X, INCX LFSDUX A3, X, INCX2 LFSDUX A4, X, INCX LFDUX A5, X, INCX2 LFDUX A6, X, INCX LFDUX A7, X, INCX2 LFDUX A8, X, INCX LFSDUX A5, X, INCX2 LFSDUX A6, X, INCX LFSDUX A7, X, INCX2 LFSDUX A8, X, INCX bdz LL(103) .align 4 LL(102): fpabs B1, A1 LFDUX A1, X, INCX2 fpabs B2, A2 LFDUX A2, X, INCX fpabs B3, A3 LFDUX A3, X, INCX2 fpabs B4, A4 LFDUX A4, X, INCX fpabs B5, A5 LFSDUX A1, X, INCX2 fpabs B6, A6 LFSDUX A2, X, INCX fpabs B7, A7 LFSDUX A3, X, INCX2 fpabs B8, A8 LFSDUX A4, X, INCX fpadd T1, B1, B2 LFDUX A5, X, INCX2 fpadd T2, B3, B4 LFDUX A6, X, INCX fpadd T3, B5, B6 LFDUX A7, X, INCX2 fpadd T4, B7, B8 LFDUX A8, X, INCX fpsub F1, T1, C1 LFSDUX A5, X, INCX2 fpsub F2, T2, C2 LFSDUX A6, X, INCX fpsub F3, T3, C3 LFSDUX A7, X, INCX2 fpsub F4, T4, C4 LFSDUX A8, X, INCX fpsel C1, F1, C1, T1 fpsel C2, F2, C2, T2 fpsel C3, F3, C3, T3 fpsel C4, F4, C4, T4 bdnz LL(102) .align 4 LL(103): fpabs B1, A1 fpabs B2, A2 fpabs B3, A3 fpabs B4, A4 fpabs B5, A5 fpabs B6, A6 fpabs B7, A7 fpabs B8, A8 fpadd T1, B1, B2 fpadd T2, B3, B4 fpadd T3, B5, B6 fpadd T4, B7, B8 fpsub F1, T1, C1 fpsub F2, T2, C2 fpsub F3, T3, C3 fpsub F4, T4, C4 fpsel C1, F1, C1, T1 fpsel C2, F2, C2, T2 fpsel C3, F3, C3, T3 fpsel C4, F4, C4, T4 .align 4 LL(105): andi. r0, N, 7 beq LL(998) andi. r0, N, 4 beq LL(106) LFDUX A1, X, INCX2 LFDUX A2, X, INCX LFDUX A3, X, INCX2 LFDUX A4, X, INCX LFSDUX A1, X, INCX2 LFSDUX A2, X, INCX LFSDUX A3, X, INCX2 LFSDUX A4, X, INCX fpabs A1, A1 fpabs A2, A2 fpabs A3, A3 fpabs A4, A4 fpadd A1, A1, A2 fpadd A3, A3, A4 fpsub F1, A1, C1 fpsub F2, A3, C2 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A3 .align 4 LL(106): andi. r0, N, 2 beq LL(107) LFDUX A1, X, INCX2 LFDUX A2, X, INCX LFSDUX A1, X, INCX2 LFSDUX A2, X, INCX fpabs A1, A1 fpabs A2, A2 fpadd A1, A1, A2 fpsub F1, A1, C1 fpsel C1, F1, C1, A1 .align 4 LL(107): andi. r0, N, 1 beq LL(998) LFDUX A1, X, INCX2 LFDUX A2, X, INCX fabs A1, A1 fabs A2, A2 fadd A1, A1, A2 fsub F1, A1, C1 fsel C1, F1, C1, A1 .align 4 LL(998): fpsub F1, C2, C1 fpsub F2, C4, C3 fpsel C1, F1, C1, C2 fpsel C3, F2, C3, C4 fpsub F1, C3, C1 fpsel C1, F1, C1, C3 fsmtp C2, C1 fsub F1, C2, C1 fsel C1, F1, C1, C2 .align 4 LL(999): li r10, 16 lfpdux f27, SP, r10 lfpdux f26, SP, r10 lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zamin_ppc440.S000066400000000000000000000152351313527062700201220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PREX r8 #define INC1 r9 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, ZBASE_SHIFT sub X, X, INCX li INC1, SIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) LFDUX f1, X, INCX LFDX f2, X, INC1 fabs f1, f1 li PREX, 4 * 8 * SIZE fabs f2, f2 subi N, N, 1 fadd f1, f1, f2 fmr f0, f1 srawi. 
r0, N, 3 fmr f2, f1 mtspr CTR, r0 fmr f3, f1 beq- LL(150) LFDUX f24, X, INCX LFDX f25, X, INC1 LFDUX f26, X, INCX LFDX f27, X, INC1 LFDUX f28, X, INCX LFDX f29, X, INC1 LFDUX f30, X, INCX LFDX f31, X, INC1 fabs f8, f24 LFDUX f24, X, INCX fabs f9, f25 LFDX f25, X, INC1 fabs f10, f26 LFDUX f26, X, INCX fabs f11, f27 LFDX f27, X, INC1 fabs f12, f28 LFDUX f28, X, INCX fabs f13, f29 LFDX f29, X, INC1 fabs f14, f30 LFDUX f30, X, INCX fabs f15, f31 LFDX f31, X, INC1 bdz LL(120) .align 4 LL(110): fadd f4, f8, f9 #ifdef PPCG4 dcbt X, PREX #endif fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 LFDUX f24, X, INCX fabs f9, f25 LFDX f25, X, INC1 fabs f10, f26 LFDUX f26, X, INCX fabs f11, f27 LFDX f27, X, INC1 fabs f12, f28 #ifdef PPCG4 dcbt X, PREX #endif fabs f13, f29 LFDUX f28, X, INCX fabs f14, f30 LFDX f29, X, INC1 fabs f15, f31 LFDUX f30, X, INCX fsub f16, f0, f4 LFDX f31, X, INC1 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 #ifdef PPCG4 dcbt X, PREX #endif fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fabs f8, f24 LFDUX f24, X, INCX fabs f9, f25 LFDX f25, X, INC1 fabs f10, f26 LFDUX f26, X, INCX fabs f11, f27 LFDX f27, X, INC1 fsel f0, f16, f4, f0 #ifdef PPCG4 dcbt X, PREX #endif fsel f1, f17, f5, f1 fsel f2, f18, f6, f2 fsel f3, f19, f7, f3 fabs f12, f28 LFDUX f28, X, INCX fabs f13, f29 LFDX f29, X, INC1 fabs f14, f30 LFDUX f30, X, INCX fabs f15, f31 LFDX f31, X, INC1 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f20, f0 fsel f1, f17, f21, f1 fsel f2, f18, f22, f2 fsel f3, f19, f23, f3 bdnz LL(110) .align 4 LL(120): fadd f4, f8, f9 fadd f5, f10, f11 fadd f6, f12, f13 fadd f7, f14, f15 fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 fsub f16, f0, f4 fsub f17, f1, f5 fsub f18, f2, f6 fsub f19, f3, f7 fadd f20, f8, f9 fadd f21, f10, f11 fadd f22, f12, f13 fadd f23, f14, f15 fsel f0, f16, f4, f0 fsel f1, f17, f5, f1 fsel f2, f18, f6, f2 fsel f3, f19, f7, f3 fsub f16, f0, f20 fsub f17, f1, f21 fsub f18, f2, f22 fsub f19, f3, f23 fsel f0, f16, f20, f0 fsel f1, f17, f21, f1 fsel f2, f18, f22, f2 fsel f3, f19, f23, f3 .align 4 LL(150): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDUX f8, X, INCX LFDX f9, X, INC1 fabs f8, f8 fabs f9, f9 fadd f8, f8, f9 fsub f16, f1, f8 fsel f1, f16, f8, f1 bdnz LL(160) .align 4 LL(999): fsub f8, f0, f1 fsub f9, f2, f3 fsel f0, f8, f1, f0 fsel f2, f9, f3, f2 fsub f8, f0, f2 fsel f1, f8, f2, f0 .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zasum.S000066400000000000000000000207561313527062700170550ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define INCXM1 r9 #define PREA r8 #define FZERO f0 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, ZBASE_SHIFT subi INCXM1, INCX, SIZE fmr f1, FZERO fmr f2, FZERO fmr f3, FZERO fmr f4, FZERO fmr f5, FZERO fmr f6, FZERO fmr f7, FZERO li PREA, L1_PREFETCHSIZE cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 ble- LL(999) cmpwi cr0, INCX, 2 * SIZE bne- cr0, LL(100) srawi. 
r0, N, 3 mtspr CTR, r0 beq- cr0, LL(50) .align 4 LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) LFD f10, 2 * SIZE(X) LFD f11, 3 * SIZE(X) LFD f12, 4 * SIZE(X) LFD f13, 5 * SIZE(X) LFD f14, 6 * SIZE(X) LFD f15, 7 * SIZE(X) LFD f24, 8 * SIZE(X) LFD f25, 9 * SIZE(X) LFD f26, 10 * SIZE(X) LFD f27, 11 * SIZE(X) LFD f28, 12 * SIZE(X) LFD f29, 13 * SIZE(X) LFD f30, 14 * SIZE(X) LFD f31, 15 * SIZE(X) fabs f16, f8 fabs f17, f9 fabs f18, f10 fabs f19, f11 fabs f20, f12 fabs f21, f13 fabs f22, f14 fabs f23, f15 bdz LL(20) .align 4 LL(10): FADD f0, f0, f16 fabs f16, f24 FADD f1, f1, f17 fabs f17, f25 FADD f2, f2, f18 fabs f18, f26 FADD f3, f3, f19 fabs f19, f27 LFD f8, 16 * SIZE(X) LFD f9, 17 * SIZE(X) LFD f10, 18 * SIZE(X) LFD f11, 19 * SIZE(X) FADD f4, f4, f20 fabs f20, f28 FADD f5, f5, f21 fabs f21, f29 FADD f6, f6, f22 fabs f22, f30 FADD f7, f7, f23 fabs f23, f31 LFD f12, 20 * SIZE(X) LFD f13, 21 * SIZE(X) LFD f14, 22 * SIZE(X) LFD f15, 23 * SIZE(X) FADD f0, f0, f16 fabs f16, f8 FADD f1, f1, f17 fabs f17, f9 FADD f2, f2, f18 fabs f18, f10 FADD f3, f3, f19 fabs f19, f11 LFD f24, 24 * SIZE(X) LFD f25, 25 * SIZE(X) LFD f26, 26 * SIZE(X) LFD f27, 27 * SIZE(X) FADD f4, f4, f20 fabs f20, f12 FADD f5, f5, f21 fabs f21, f13 FADD f6, f6, f22 fabs f22, f14 FADD f7, f7, f23 fabs f23, f15 LFD f28, 28 * SIZE(X) LFD f29, 29 * SIZE(X) LFD f30, 30 * SIZE(X) LFD f31, 31 * SIZE(X) #ifndef POWER6 L1_PREFETCH X, PREA #endif addi X, X, 16 * SIZE #ifdef POWER6 L1_PREFETCH X, PREA #endif bdnz LL(10) .align 4 LL(20): FADD f0, f0, f16 fabs f16, f24 FADD f1, f1, f17 fabs f17, f25 FADD f2, f2, f18 fabs f18, f26 FADD f3, f3, f19 fabs f19, f27 FADD f4, f4, f20 fabs f20, f28 FADD f5, f5, f21 fabs f21, f29 FADD f6, f6, f22 fabs f22, f30 FADD f7, f7, f23 fabs f23, f31 FADD f0, f0, f16 FADD f1, f1, f17 FADD f2, f2, f18 FADD f3, f3, f19 FADD f4, f4, f20 FADD f5, f5, f21 FADD f6, f6, f22 FADD f7, f7, f23 addi X, X, 16 * SIZE .align 4 LL(50): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) addi X, X, 2 * SIZE fabs f8, f8 fabs f9, f9 FADD f0, f0, f8 FADD f1, f1, f9 bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCXM1 srawi. 
r0, N, 3 mtspr CTR, r0 beq- LL(150) LFDX f8, X, INCXM1 LFDUX f9, X, INCX LFDX f10, X, INCXM1 LFDUX f11, X, INCX LFDX f12, X, INCXM1 LFDUX f13, X, INCX LFDX f14, X, INCXM1 LFDUX f15, X, INCX LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX fabs f16, f8 fabs f17, f9 fabs f18, f10 fabs f19, f11 fabs f20, f12 fabs f21, f13 fabs f22, f14 fabs f23, f15 bdz LL(120) .align 4 LL(110): FADD f0, f0, f16 fabs f16, f24 FADD f1, f1, f17 fabs f17, f25 FADD f2, f2, f18 fabs f18, f26 FADD f3, f3, f19 fabs f19, f27 LFDX f8, X, INCXM1 LFDUX f9, X, INCX LFDX f10, X, INCXM1 LFDUX f11, X, INCX FADD f4, f4, f20 fabs f20, f28 FADD f5, f5, f21 fabs f21, f29 FADD f6, f6, f22 fabs f22, f30 FADD f7, f7, f23 fabs f23, f31 LFDX f12, X, INCXM1 LFDUX f13, X, INCX LFDX f14, X, INCXM1 LFDUX f15, X, INCX FADD f0, f0, f16 fabs f16, f8 FADD f1, f1, f17 fabs f17, f9 FADD f2, f2, f18 fabs f18, f10 FADD f3, f3, f19 fabs f19, f11 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX FADD f4, f4, f20 fabs f20, f12 FADD f5, f5, f21 fabs f21, f13 FADD f6, f6, f22 fabs f22, f14 FADD f7, f7, f23 fabs f23, f15 LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX bdnz LL(110) .align 4 LL(120): FADD f0, f0, f16 fabs f16, f24 FADD f1, f1, f17 fabs f17, f25 FADD f2, f2, f18 fabs f18, f26 FADD f3, f3, f19 fabs f19, f27 FADD f4, f4, f20 fabs f20, f28 FADD f5, f5, f21 fabs f21, f29 FADD f6, f6, f22 fabs f22, f30 FADD f7, f7, f23 fabs f23, f31 FADD f0, f0, f16 FADD f1, f1, f17 FADD f2, f2, f18 FADD f3, f3, f19 FADD f4, f4, f20 FADD f5, f5, f21 FADD f6, f6, f22 FADD f7, f7, f23 .align 4 LL(150): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDX f8, X, INCXM1 LFDUX f9, X, INCX fabs f8, f8 fabs f9, f9 FADD f0, f0, f8 FADD f1, f1, f9 bdnz LL(160) .align 4 LL(999): FADD f0, f0, f1 FADD f2, f2, f3 FADD f4, f4, f5 FADD f6, f6, f7 FADD f0, f0, f2 FADD f4, f4, f6 FADD f1, f0, f4 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zasum.c000066400000000000000000000063231313527062700170670ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/28 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #include #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif #if defined(POWER8) #include "zasum_microk_power8.c" #endif #ifndef HAVE_KERNEL_8 static FLOAT zasum_kernel_8(BLASLONG n, FLOAT *x1) { BLASLONG i=0; FLOAT *x = x1; FLOAT temp0, temp1, temp2, temp3; FLOAT temp4, temp5, temp6, temp7; FLOAT sum0 = 0.0; FLOAT sum1 = 0.0; FLOAT sum2 = 0.0; FLOAT sum3 = 0.0; while ( i< n ) { temp0 = ABS(x[0]); temp1 = ABS(x[1]); temp2 = ABS(x[2]); temp3 = ABS(x[3]); temp4 = ABS(x[4]); temp5 = ABS(x[5]); temp6 = ABS(x[6]); temp7 = ABS(x[7]); sum0 += temp0; sum1 += temp1; sum2 += temp2; sum3 += temp3; sum0 += temp4; sum1 += temp5; sum2 += temp6; sum3 += temp7; x+=8; i+=4; } return sum0+sum1+sum2+sum3; } #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ip=0; FLOAT sumf = 0.0; BLASLONG n1; BLASLONG inc_x2; if (n <= 0 || inc_x <= 0) return(sumf); if ( inc_x == 1 ) { n1 = n & -8; if ( n1 > 0 ) { sumf = zasum_kernel_8(n1, x); i=n1; ip=2*n1; } while(i < n) { sumf += ABS(x[ip]) + ABS(x[ip+1]); i++; ip+=2; } } else { inc_x2 = 2* inc_x; while(i < n) { sumf += ABS(x[ip]) + ABS(x[ip+1]); ip+=inc_x2; i++; } } return(sumf); } OpenBLAS-0.2.20/kernel/power/zasum_cell.S000066400000000000000000000223021313527062700200410ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
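For reference, every zasum kernel in this directory computes the same scalar: the sum over all n complex elements of |Re(x_i)| + |Im(x_i)|, with the stride counted in complex elements. The zasum.c driver above hands unit-stride runs of 8 complex elements at a time to zasum_kernel_8 (the POWER8 inline-assembly version when available, otherwise the unrolled C fallback) and finishes the remainder in plain C. A minimal portable sketch of that reference computation, given only as an illustration (the name ref_zasum and its exact signature are not part of this archive):

#include <math.h>
#include <stddef.h>

/* Unoptimized reference for the complex absolute-value sum:
   x holds n complex elements as interleaved re,im doubles,
   inc_x is the stride in complex elements (>= 1). */
double ref_zasum(size_t n, const double *x, size_t inc_x)
{
    double sum = 0.0;
    for (size_t i = 0; i < n; i++, x += 2 * inc_x)
        sum += fabs(x[0]) + fabs(x[1]);
    return sum;
}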
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define PREA r8 #define INCXM1 r9 #define FZERO f0 #define STACKSIZE 16 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stw r0, 0(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif lfs FZERO, 0(SP) slwi INCX, INCX, ZBASE_SHIFT fmr f1, FZERO li PREA, 8 * 16 * SIZE fmr f2, FZERO subi INCXM1, INCX, SIZE cmpwi cr0, N, 0 fmr f3, FZERO ble- LL(999) cmpwi cr0, INCX, 0 ble- LL(999) cmpwi cr0, INCX, SIZE * 2 bne- cr0, LL(20) srawi. r0, N, 3 mtspr CTR, r0 beq- cr0, LL(15) .align 4 LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) fabs f4, f8 LFD f10, 2 * SIZE(X) fabs f5, f9 LFD f11, 3 * SIZE(X) fabs f6, f10 LFD f8, 4 * SIZE(X) fabs f7, f11 bdz LL(13) .align 4 LL(12): FADD f0, f0, f4 dcbt X, PREA fabs f4, f8 LFD f9, 5 * SIZE(X) FADD f1, f1, f5 nop fabs f5, f9 LFD f10, 6 * SIZE(X) FADD f2, f2, f6 nop fabs f6, f10 LFD f11, 7 * SIZE(X) FADD f3, f3, f7 nop fabs f7, f11 LFD f8, 8 * SIZE(X) FADD f0, f0, f4 nop fabs f4, f8 LFD f9, 9 * SIZE(X) FADD f1, f1, f5 nop fabs f5, f9 LFD f10, 10 * SIZE(X) FADD f2, f2, f6 nop fabs f6, f10 LFD f11, 11 * SIZE(X) FADD f3, f3, f7 nop fabs f7, f11 LFD f8, 12 * SIZE(X) FADD f0, f0, f4 nop fabs f4, f8 LFD f9, 13 * SIZE(X) FADD f1, f1, f5 nop fabs f5, f9 LFD f10, 14 * SIZE(X) FADD f2, f2, f6 nop fabs f6, f10 LFD f11, 15 * SIZE(X) FADD f3, f3, f7 nop fabs f7, f11 LFD f8, 16 * SIZE(X) FADD f0, f0, f4 nop fabs f4, f8 LFD f9, 17 * SIZE(X) FADD f1, f1, f5 addi X, X, 16 * SIZE fabs f5, f9 LFD f10, 2 * SIZE(X) FADD f2, f2, f6 nop fabs f6, f10 LFD f11, 3 * SIZE(X) FADD f3, f3, f7 LFD f8, 4 * SIZE(X) fabs f7, f11 bdnz LL(12) .align 4 LL(13): FADD f0, f0, f4 nop fabs f4, f8 LFD f9, 5 * SIZE(X) FADD f1, f1, f5 nop fabs f5, f9 LFD f10, 6 * SIZE(X) FADD f2, f2, f6 nop fabs f6, f10 LFD f11, 7 * SIZE(X) FADD f3, f3, f7 nop fabs f7, f11 LFD f8, 8 * SIZE(X) FADD f0, f0, f4 nop fabs f4, f8 LFD f9, 9 * SIZE(X) FADD f1, f1, f5 nop fabs f5, f9 LFD f10, 10 * SIZE(X) FADD f2, f2, f6 nop fabs f6, f10 LFD f11, 11 * SIZE(X) FADD f3, f3, f7 nop fabs f7, f11 LFD f8, 12 * SIZE(X) FADD f0, f0, f4 nop fabs f4, f8 LFD f9, 13 * SIZE(X) FADD f1, f1, f5 nop fabs f5, f9 LFD f10, 14 * SIZE(X) FADD f2, f2, f6 addi X, X, 16 * SIZE fabs f6, f10 LFD f11, -1 * SIZE(X) FADD f3, f3, f7 fabs f7, f11 FADD f0, f0, f4 FADD f1, f1, f5 FADD f2, f2, f6 FADD f3, f3, f7 .align 4 LL(15): andi. r0, N, 7 beq LL(999) andi. 
r0, N, 4 beq LL(16) LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) fabs f4, f8 LFD f10, 2 * SIZE(X) fabs f5, f9 LFD f11, 3 * SIZE(X) fabs f6, f10 LFD f8, 4 * SIZE(X) fabs f7, f11 FADD f0, f0, f4 nop fabs f4, f8 LFD f9, 5 * SIZE(X) FADD f1, f1, f5 nop fabs f5, f9 LFD f10, 6 * SIZE(X) FADD f2, f2, f6 addi X, X, 8 * SIZE fabs f6, f10 LFD f11, -1 * SIZE(X) FADD f3, f3, f7 fabs f7, f11 FADD f0, f0, f4 FADD f1, f1, f5 FADD f2, f2, f6 FADD f3, f3, f7 .align 4 LL(16): andi. r0, N, 2 beq LL(17) LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) fabs f4, f8 LFD f10, 2 * SIZE(X) fabs f5, f9 LFD f11, 3 * SIZE(X) fabs f6, f10 addi X, X, 4 * SIZE fabs f7, f11 nop FADD f0, f0, f4 FADD f1, f1, f5 FADD f2, f2, f6 FADD f3, f3, f7 .align 4 LL(17): andi. r0, N, 1 beq LL(999) LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) fabs f4, f8 fabs f5, f9 FADD f0, f0, f4 addi X, X, 2 * SIZE FADD f1, f1, f5 b LL(999) .align 4 LL(20): sub X, X, INCXM1 srawi. r0, N, 3 mtspr CTR, r0 beq- cr0, LL(25) LFDX f8, X, INCXM1 LFDUX f9, X, INCX fabs f4, f8 LFDX f10, X, INCXM1 fabs f5, f9 LFDUX f11, X, INCX fabs f6, f10 LFDX f8, X, INCXM1 fabs f7, f11 bdz LL(23) .align 4 LL(22): FADD f0, f0, f4 dcbt X, PREA fabs f4, f8 LFDUX f9, X, INCX FADD f1, f1, f5 nop fabs f5, f9 LFDX f10, X, INCXM1 FADD f2, f2, f6 nop fabs f6, f10 LFDUX f11, X, INCX FADD f3, f3, f7 nop fabs f7, f11 LFDX f8, X, INCXM1 FADD f0, f0, f4 nop fabs f4, f8 LFDUX f9, X, INCX FADD f1, f1, f5 nop fabs f5, f9 LFDX f10, X, INCXM1 FADD f2, f2, f6 nop fabs f6, f10 LFDUX f11, X, INCX FADD f3, f3, f7 nop fabs f7, f11 LFDX f8, X, INCXM1 FADD f0, f0, f4 nop fabs f4, f8 LFDUX f9, X, INCX FADD f1, f1, f5 nop fabs f5, f9 LFDX f10, X, INCXM1 FADD f2, f2, f6 nop fabs f6, f10 LFDUX f11, X, INCX FADD f3, f3, f7 nop fabs f7, f11 LFDX f8, X, INCXM1 FADD f0, f0, f4 nop fabs f4, f8 LFDUX f9, X, INCX FADD f1, f1, f5 nop fabs f5, f9 LFDX f10, X, INCXM1 FADD f2, f2, f6 nop fabs f6, f10 LFDUX f11, X, INCX FADD f3, f3, f7 LFDX f8, X, INCXM1 fabs f7, f11 bdnz LL(22) .align 4 LL(23): FADD f0, f0, f4 nop fabs f4, f8 LFDUX f9, X, INCX FADD f1, f1, f5 nop fabs f5, f9 LFDX f10, X, INCXM1 FADD f2, f2, f6 nop fabs f6, f10 LFDUX f11, X, INCX FADD f3, f3, f7 nop fabs f7, f11 LFDX f8, X, INCXM1 FADD f0, f0, f4 nop fabs f4, f8 LFDUX f9, X, INCX FADD f1, f1, f5 nop fabs f5, f9 LFDX f10, X, INCXM1 FADD f2, f2, f6 nop fabs f6, f10 LFDUX f11, X, INCX FADD f3, f3, f7 nop fabs f7, f11 LFDX f8, X, INCXM1 FADD f0, f0, f4 nop fabs f4, f8 LFDUX f9, X, INCX FADD f1, f1, f5 nop fabs f5, f9 LFDX f10, X, INCXM1 FADD f2, f2, f6 nop fabs f6, f10 LFDUX f11, X, INCX FADD f3, f3, f7 fabs f7, f11 FADD f0, f0, f4 FADD f1, f1, f5 FADD f2, f2, f6 FADD f3, f3, f7 .align 4 LL(25): andi. r0, N, 7 beq LL(999) andi. r0, N, 4 beq LL(26) LFDX f8, X, INCXM1 LFDUX f9, X, INCX fabs f4, f8 LFDX f10, X, INCXM1 fabs f5, f9 LFDUX f11, X, INCX fabs f6, f10 LFDX f8, X, INCXM1 fabs f7, f11 FADD f0, f0, f4 nop fabs f4, f8 LFDUX f9, X, INCX FADD f1, f1, f5 nop fabs f5, f9 LFDX f10, X, INCXM1 FADD f2, f2, f6 fabs f6, f10 LFDUX f11, X, INCX FADD f3, f3, f7 fabs f7, f11 FADD f0, f0, f4 FADD f1, f1, f5 FADD f2, f2, f6 FADD f3, f3, f7 .align 4 LL(26): andi. r0, N, 2 beq LL(27) LFDX f8, X, INCXM1 LFDUX f9, X, INCX fabs f4, f8 LFDX f10, X, INCXM1 fabs f5, f9 LFDUX f11, X, INCX fabs f6, f10 fabs f7, f11 FADD f0, f0, f4 FADD f1, f1, f5 FADD f2, f2, f6 FADD f3, f3, f7 .align 4 LL(27): andi. 
r0, N, 1 beq LL(999) LFDX f8, X, INCXM1 LFDUX f9, X, INCX fabs f4, f8 fabs f5, f9 FADD f0, f0, f4 FADD f1, f1, f5 .align 4 LL(999): FADD f0, f0, f1 FADD f2, f2, f3 FADD f1, f0, f2 addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zasum_hummer.S000066400000000000000000000235131313527062700204240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define INCX2 r6 #define X2 r7 #define FLAG r8 #define C1 f1 #define C2 f0 #define C3 f2 #define C4 f3 #define A1 f4 #define A2 f5 #define A3 f6 #define A4 f7 #define A5 f8 #define A6 f9 #define A7 f10 #define A8 f11 #define T1 f12 #define T2 f13 #define T3 f14 #define T4 f15 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 li r10, 0 stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif lfpdx C1, SP, r10 # Zero clear slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX fpmr C2, C1 fpmr C3, C1 li FLAG, 0 fpmr C4, C1 cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, INCX, 0 ble LL(999) sub X, X, INCX2 cmpwi cr0, INCX, SIZE bne LL(100) andi. r0, X, 2 * SIZE - 1 beq LL(05) LFD C1, 2 * SIZE(X) li FLAG, 1 addi X, X, 1 * SIZE addi N, N, -1 cmpwi cr0, N, 0 fabs C1, C1 ble LL(99) .align 4 LL(05): srawi. 
r0, N, 3 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 fpmr T1, C2 LFPDUX A2, X, INCX2 fpmr T2, C2 LFPDUX A3, X, INCX2 fpmr T3, C2 LFPDUX A4, X, INCX2 fpmr T4, C2 LFPDUX A5, X, INCX2 LFPDUX A6, X, INCX2 LFPDUX A7, X, INCX2 LFPDUX A8, X, INCX2 bdz LL(13) .align 4 LL(12): fpadd C1, C1, T1 nop fpabs T1, A1 LFPDUX A1, X, INCX2 fpadd C2, C2, T2 nop fpabs T2, A2 LFPDUX A2, X, INCX2 fpadd C3, C3, T3 nop fpabs T3, A3 LFPDUX A3, X, INCX2 fpadd C4, C4, T4 nop fpabs T4, A4 LFPDUX A4, X, INCX2 fpadd C1, C1, T1 nop fpabs T1, A5 LFPDUX A5, X, INCX2 fpadd C2, C2, T2 nop fpabs T2, A6 LFPDUX A6, X, INCX2 fpadd C3, C3, T3 nop fpabs T3, A7 LFPDUX A7, X, INCX2 fpadd C4, C4, T4 fpabs T4, A8 LFPDUX A8, X, INCX2 bdnz LL(12) .align 4 LL(13): fpadd C1, C1, T1 fpabs T1, A1 fpadd C2, C2, T2 fpabs T2, A2 fpadd C3, C3, T3 fpabs T3, A3 fpadd C4, C4, T4 fpabs T4, A4 fpadd C1, C1, T1 fpabs T1, A5 fpadd C2, C2, T2 fpabs T2, A6 fpadd C3, C3, T3 fpabs T3, A7 fpadd C4, C4, T4 fpabs T4, A8 fpadd C1, C1, T1 fpadd C2, C2, T2 fpadd C3, C3, T3 fpadd C4, C4, T4 .align 4 LL(15): andi. r0, N, 7 beq LL(99) andi. r0, N, 4 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 fpabs T1, A1 fpabs T2, A2 fpabs T3, A3 fpabs T4, A4 fpadd C1, C1, T1 fpadd C2, C2, T2 fpadd C3, C3, T3 fpadd C4, C4, T4 .align 4 LL(16): andi. r0, N, 2 beq LL(17) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 fpabs T1, A1 fpabs T2, A2 fpadd C1, C1, T1 fpadd C2, C2, T2 .align 4 LL(17): andi. r0, N, 1 beq LL(99) LFPDUX A1, X, INCX2 fpabs T1, A1 fpadd C1, C1, T1 .align 4 LL(99): cmpwi cr0, FLAG, 0 beq LL(999) LFD A1, 2 * SIZE(X) fabs T1, A1 fadd C2, C2, T1 b LL(999) .align 4 LL(100): addi X2, X, SIZE andi. r0, X, 2 * SIZE - 1 bne LL(200) srawi. r0, N, 3 mtspr CTR, r0 beq- LL(115) LFPDUX A1, X, INCX2 fpmr T1, C2 LFPDUX A2, X, INCX2 fpmr T2, C2 LFPDUX A3, X, INCX2 fpmr T3, C2 LFPDUX A4, X, INCX2 fpmr T4, C2 LFPDUX A5, X, INCX2 LFPDUX A6, X, INCX2 LFPDUX A7, X, INCX2 LFPDUX A8, X, INCX2 bdz LL(113) .align 4 LL(112): fpadd C1, C1, T1 nop fpabs T1, A1 LFPDUX A1, X, INCX2 fpadd C2, C2, T2 nop fpabs T2, A2 LFPDUX A2, X, INCX2 fpadd C3, C3, T3 nop fpabs T3, A3 LFPDUX A3, X, INCX2 fpadd C4, C4, T4 nop fpabs T4, A4 LFPDUX A4, X, INCX2 fpadd C1, C1, T1 nop fpabs T1, A5 LFPDUX A5, X, INCX2 fpadd C2, C2, T2 nop fpabs T2, A6 LFPDUX A6, X, INCX2 fpadd C3, C3, T3 nop fpabs T3, A7 LFPDUX A7, X, INCX2 fpadd C4, C4, T4 fpabs T4, A8 LFPDUX A8, X, INCX2 bdnz LL(112) .align 4 LL(113): fpadd C1, C1, T1 fpabs T1, A1 fpadd C2, C2, T2 fpabs T2, A2 fpadd C3, C3, T3 fpabs T3, A3 fpadd C4, C4, T4 fpabs T4, A4 fpadd C1, C1, T1 fpabs T1, A5 fpadd C2, C2, T2 fpabs T2, A6 fpadd C3, C3, T3 fpabs T3, A7 fpadd C4, C4, T4 fpabs T4, A8 fpadd C1, C1, T1 fpadd C2, C2, T2 fpadd C3, C3, T3 fpadd C4, C4, T4 .align 4 LL(115): andi. r0, N, 7 beq LL(999) andi. r0, N, 4 beq LL(116) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 fpabs T1, A1 fpabs T2, A2 fpabs T3, A3 fpabs T4, A4 fpadd C1, C1, T1 fpadd C2, C2, T2 fpadd C3, C3, T3 fpadd C4, C4, T4 .align 4 LL(116): andi. r0, N, 2 beq LL(117) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 fpabs T1, A1 fpabs T2, A2 fpadd C1, C1, T1 fpadd C2, C2, T2 .align 4 LL(117): andi. r0, N, 1 beq LL(999) LFPDUX A1, X, INCX2 fpabs T1, A1 fpadd C1, C1, T1 b LL(999) .align 4 LL(200): srawi. 
r0, N, 3 mtspr CTR, r0 beq- LL(215) LFDUX A1, X, INCX2 fpmr T1, C2 LFDUX A2, X, INCX2 fpmr T2, C2 LFDUX A3, X, INCX2 fpmr T3, C2 LFDUX A4, X, INCX2 fpmr T4, C2 LFDUX A5, X, INCX2 LFSDUX A1, X2, INCX2 LFDUX A6, X, INCX2 LFSDUX A2, X2, INCX2 LFDUX A7, X, INCX2 LFSDUX A3, X2, INCX2 LFDUX A8, X, INCX2 LFSDUX A4, X2, INCX2 bdz LL(213) .align 4 LL(212): fpadd C1, C1, T1 LFSDUX A5, X2, INCX2 fpabs T1, A1 LFDUX A1, X, INCX2 fpadd C2, C2, T2 LFSDUX A6, X2, INCX2 fpabs T2, A2 LFDUX A2, X, INCX2 fpadd C3, C3, T3 LFSDUX A7, X2, INCX2 fpabs T3, A3 LFDUX A3, X, INCX2 fpadd C4, C4, T4 LFSDUX A8, X2, INCX2 fpabs T4, A4 LFDUX A4, X, INCX2 fpadd C1, C1, T1 LFSDUX A1, X2, INCX2 fpabs T1, A5 LFDUX A5, X, INCX2 fpadd C2, C2, T2 LFSDUX A2, X2, INCX2 fpabs T2, A6 LFDUX A6, X, INCX2 fpadd C3, C3, T3 LFSDUX A3, X2, INCX2 fpabs T3, A7 LFDUX A7, X, INCX2 fpadd C4, C4, T4 LFSDUX A4, X2, INCX2 fpabs T4, A8 LFDUX A8, X, INCX2 bdnz LL(212) .align 4 LL(213): fpadd C1, C1, T1 nop fpabs T1, A1 LFSDUX A5, X2, INCX2 fpadd C2, C2, T2 nop fpabs T2, A2 LFSDUX A6, X2, INCX2 fpadd C3, C3, T3 nop fpabs T3, A3 LFSDUX A7, X2, INCX2 fpadd C4, C4, T4 nop fpabs T4, A4 LFSDUX A8, X2, INCX2 fpadd C1, C1, T1 fpabs T1, A5 fpadd C2, C2, T2 fpabs T2, A6 fpadd C3, C3, T3 fpabs T3, A7 fpadd C4, C4, T4 fpabs T4, A8 fpadd C1, C1, T1 fpadd C2, C2, T2 fpadd C3, C3, T3 fpadd C4, C4, T4 .align 4 LL(215): andi. r0, N, 7 beq LL(999) andi. r0, N, 4 beq LL(216) LFDUX A1, X, INCX2 LFDUX A2, X2, INCX2 LFDUX A3, X, INCX2 LFDUX A4, X2, INCX2 fabs T1, A1 LFDUX A5, X, INCX2 fabs T2, A2 LFDUX A6, X2, INCX2 fabs T3, A3 LFDUX A7, X, INCX2 fabs T4, A4 LFDUX A8, X2, INCX2 fadd C1, C1, T1 fabs T1, A5 fadd C2, C2, T2 fabs T2, A6 fadd C3, C3, T3 fabs T3, A7 fadd C4, C4, T4 fabs T4, A8 fadd C1, C1, T1 fadd C2, C2, T2 fadd C3, C3, T3 fadd C4, C4, T4 .align 4 LL(216): andi. r0, N, 2 beq LL(217) LFDUX A1, X, INCX2 LFDUX A2, X2, INCX2 LFDUX A3, X, INCX2 LFDUX A4, X2, INCX2 fabs T1, A1 fabs T2, A2 fabs T3, A3 fabs T4, A4 fadd C1, C1, T1 fadd C2, C2, T2 fadd C3, C3, T3 fadd C4, C4, T4 .align 4 LL(217): andi. r0, N, 1 beq LL(999) LFDUX A1, X, INCX2 LFDUX A2, X2, INCX2 fabs T1, A1 fabs T2, A2 fadd C1, C1, T1 fadd C2, C2, T2 .align 4 LL(999): fpadd C1, C1, C2 li r10, 16 fpadd C3, C3, C4 fpadd C1, C1, C3 lfpdux f15, SP, r10 fsmtp C2, C1 lfpdux f14, SP, r10 addi SP, SP, 16 fadd C1, C2, C1 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zasum_microk_power8.c000066400000000000000000000122261313527062700217360ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/28 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_8 1 static double zasum_kernel_8 (long n, double *x) { double sum; __vector double t0; __vector double t1; __vector double t2; __vector double t3; __asm__ ( "dcbt 0, %2 \n\t" "xxlxor 32, 32, 32 \n\t" "xxlxor 33, 33, 33 \n\t" "xxlxor 34, 34, 34 \n\t" "xxlxor 35, 35, 35 \n\t" "xxlxor 36, 36, 36 \n\t" "xxlxor 37, 37, 37 \n\t" "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" "lxvd2x 40, 0, %2 \n\t" "lxvd2x 41, %8, %2 \n\t" "lxvd2x 42, %9, %2 \n\t" "lxvd2x 43, %10, %2 \n\t" "lxvd2x 44, %11, %2 \n\t" "lxvd2x 45, %12, %2 \n\t" "lxvd2x 46, %13, %2 \n\t" "lxvd2x 47, %14, %2 \n\t" "addi %2, %2, 128 \n\t" "addic. %1, %1, -8 \n\t" "ble 2f \n\t" ".p2align 5 \n" "1: \n\t" "xvabsdp 48, 40 \n\t" "xvabsdp 49, 41 \n\t" "xvabsdp 50, 42 \n\t" "xvabsdp 51, 43 \n\t" "lxvd2x 40, 0, %2 \n\t" "lxvd2x 41, %8, %2 \n\t" "xvabsdp %x3, 44 \n\t" "xvabsdp %x4, 45 \n\t" "lxvd2x 42, %9, %2 \n\t" "lxvd2x 43, %10, %2 \n\t" "xvabsdp %x5, 46 \n\t" "xvabsdp %x6, 47 \n\t" "lxvd2x 44, %11, %2 \n\t" "lxvd2x 45, %12, %2 \n\t" "xvadddp 32, 32, 48 \n\t" "xvadddp 33, 33, 49 \n\t" "lxvd2x 46, %13, %2 \n\t" "lxvd2x 47, %14, %2 \n\t" "xvadddp 34, 34, 50 \n\t" "xvadddp 35, 35, 51 \n\t" "addi %2, %2, 128 \n\t" "xvadddp 36, 36, %x3 \n\t" "xvadddp 37, 37, %x4 \n\t" "addic. 
%1, %1, -8 \n\t" "xvadddp 38, 38, %x5 \n\t" "xvadddp 39, 39, %x6 \n\t" "bgt 1b \n" "2: \n\t" "xvabsdp 48, 40 \n\t" "xvabsdp 49, 41 \n\t" "xvabsdp 50, 42 \n\t" "xvabsdp 51, 43 \n\t" "xvabsdp %x3, 44 \n\t" "xvabsdp %x4, 45 \n\t" "xvabsdp %x5, 46 \n\t" "xvabsdp %x6, 47 \n\t" "xvadddp 32, 32, 48 \n\t" "xvadddp 33, 33, 49 \n\t" "xvadddp 34, 34, 50 \n\t" "xvadddp 35, 35, 51 \n\t" "xvadddp 36, 36, %x3 \n\t" "xvadddp 37, 37, %x4 \n\t" "xvadddp 38, 38, %x5 \n\t" "xvadddp 39, 39, %x6 \n\t" "xvadddp 32, 32, 33 \n\t" "xvadddp 34, 34, 35 \n\t" "xvadddp 36, 36, 37 \n\t" "xvadddp 38, 38, 39 \n\t" "xvadddp 32, 32, 34 \n\t" "xvadddp 36, 36, 38 \n\t" "xvadddp 32, 32, 36 \n\t" "xxswapd 33, 32 \n\t" "xsadddp %x0, 32, 33 \n" "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n" "#t0=%x3 t1=%x4 t2=%x5 t3=%x6" : "=d" (sum), // 0 "+r" (n), // 1 "+b" (x), // 2 "=wa" (t0), // 3 "=wa" (t1), // 4 "=wa" (t2), // 5 "=wa" (t3) // 6 : "m" (*x), "b" (16), // 8 "b" (32), // 9 "b" (48), // 10 "b" (64), // 11 "b" (80), // 12 "b" (96), // 13 "b" (112) // 14 : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48","vs49","vs50","vs51" ); return sum; } OpenBLAS-0.2.20/kernel/power/zasum_ppc440.S000066400000000000000000000146521313527062700201450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
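The POWER8 microkernel above keeps eight separate vector accumulators (vs32 through vs39) inside the loop and only combines them after the last iteration; the independent partial sums break the dependency chain of the floating-point adds. The following portable sketch shows roughly the same accumulation pattern under stated assumptions (unit stride, n a multiple of 8 complex elements; the lane assignment is simplified and does not match the VSX registers one-to-one):

#include <math.h>

/* Eight scalar partial sums over unit-stride complex data;
   each iteration consumes 8 complex elements (16 doubles),
   mirroring the microkernel's 128-byte step per loop. */
double zasum_partial_sums_sketch(long n, const double *x)
{
    double s[8] = {0.0};
    for (long i = 0; i < n; i += 8, x += 16)
        for (int k = 0; k < 8; k++)
            s[k] += fabs(x[2 * k]) + fabs(x[2 * k + 1]);
    return ((s[0] + s[1]) + (s[2] + s[3]))
         + ((s[4] + s[5]) + (s[6] + s[7]));
}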
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define INCXM1 r9 #define PREX r8 #define FZERO f0 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif fmr f1, FZERO slwi INCX, INCX, ZBASE_SHIFT fmr f2, FZERO fmr f3, FZERO subi INCXM1, INCX, SIZE fmr f4, FZERO sub X, X, INCXM1 fmr f5, FZERO li PREX, 3 * 16 * SIZE fmr f6, FZERO cmpwi cr0, N, 0 fmr f7, FZERO ble- LL(999) cmpwi cr0, INCX, 0 ble- LL(999) srawi. r0, N, 3 mtspr CTR, r0 beq- LL(150) LFDX f8, X, INCXM1 LFDUX f9, X, INCX LFDX f10, X, INCXM1 LFDUX f11, X, INCX LFDX f12, X, INCXM1 LFDUX f13, X, INCX LFDX f14, X, INCXM1 LFDUX f15, X, INCX fabs f16, f8 LFDX f24, X, INCXM1 fabs f17, f9 LFDUX f25, X, INCX fabs f18, f10 LFDX f26, X, INCXM1 fabs f19, f11 LFDUX f27, X, INCX fabs f20, f12 LFDX f28, X, INCXM1 fabs f21, f13 LFDUX f29, X, INCX fabs f22, f14 LFDX f30, X, INCXM1 fabs f23, f15 LFDUX f31, X, INCX bdz LL(120) .align 4 LL(110): LFDX f8, X, INCXM1 FADD f0, f0, f16 #ifdef PPCG4 dcbt X, PREX #else nop #endif fabs f16, f24 LFDUX f9, X, INCX FADD f1, f1, f17 nop fabs f17, f25 LFDX f10, X, INCXM1 FADD f2, f2, f18 nop fabs f18, f26 LFDUX f11, X, INCX FADD f3, f3, f19 nop fabs f19, f27 LFDX f12, X, INCXM1 FADD f4, f4, f20 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PREX #else nop #endif fabs f20, f28 LFDUX f13, X, INCX FADD f5, f5, f21 nop fabs f21, f29 LFDX f14, X, INCXM1 FADD f6, f6, f22 nop fabs f22, f30 LFDUX f15, X, INCX FADD f7, f7, f23 nop fabs f23, f31 LFDX f24, X, INCXM1 FADD f0, f0, f16 #ifdef PPCG4 dcbt X, PREX #else nop #endif fabs f16, f8 LFDUX f25, X, INCX FADD f1, f1, f17 nop fabs f17, f9 LFDX f26, X, INCXM1 FADD f2, f2, f18 nop fabs f18, f10 LFDUX f27, X, INCX FADD f3, f3, f19 nop fabs f19, f11 LFDX f28, X, INCXM1 FADD f4, f4, f20 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PREX #else nop #endif fabs f20, f12 LFDUX f29, X, INCX FADD f5, f5, f21 nop fabs f21, f13 LFDX f30, X, INCXM1 FADD f6, f6, f22 nop fabs f22, f14 LFDUX f31, X, INCX FADD f7, f7, f23 fabs f23, f15 bdnz LL(110) .align 4 LL(120): FADD f0, f0, f16 fabs f16, f24 FADD f1, f1, f17 fabs f17, f25 FADD f2, f2, f18 fabs f18, f26 FADD f3, f3, f19 fabs f19, f27 FADD f4, f4, f20 fabs f20, f28 FADD f5, f5, f21 fabs f21, f29 FADD f6, f6, f22 fabs f22, f30 FADD f7, f7, f23 fabs f23, f31 FADD f0, f0, f16 FADD f1, f1, f17 FADD f2, f2, f18 FADD f3, f3, f19 FADD f4, f4, f20 FADD f5, f5, f21 FADD f6, f6, f22 FADD f7, f7, f23 .align 4 LL(150): andi. 
r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDX f8, X, INCXM1 LFDUX f9, X, INCX fabs f8, f8 fabs f9, f9 FADD f0, f0, f8 FADD f1, f1, f9 bdnz LL(160) .align 4 LL(999): FADD f0, f0, f1 FADD f2, f2, f3 FADD f4, f4, f5 FADD f6, f6, f7 FADD f0, f0, f2 FADD f4, f4, f6 FADD f1, f0, f4 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zaxpy.S000066400000000000000000000345141313527062700170660ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef linux #ifndef __64BIT__ #define N r3 #define X r6 #define INCX r7 #define Y r8 #define INCY r9 #define INCXM1 r4 #define INCYM1 r5 #define PREA r10 #define YY r11 #else #define N r3 #define X r8 #define INCX r9 #define Y r10 #define INCY r4 #define INCXM1 r5 #define INCYM1 r6 #define PREA r7 #define YY r11 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define N r3 #define X r10 #define INCX r4 #define Y r5 #define INCY r6 #define INCXM1 r7 #define INCYM1 r8 #define PREA r9 #define YY r11 #else #define N r3 #define X r8 #define INCX r9 #define Y r10 #define INCY r4 #define INCXM1 r5 #define INCYM1 r6 #define PREA r7 #define YY r11 #endif #endif #define ALPHA_R f24 #define ALPHA_I f25 #ifndef CONJ #define ADD1 FNMSUB #define ADD2 FMADD #else #define ADD1 FMADD #define ADD2 FNMSUB #endif #ifndef NEEDPARAM #define STACKSIZE 96 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) #if defined(linux) && defined(__64BIT__) ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) lwz Y, FRAMESLOT(1) + STACKSIZE(SP) lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) #else lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif fmr ALPHA_R, f1 fmr ALPHA_I, f2 slwi INCX, INCX, ZBASE_SHIFT slwi INCY, INCY, ZBASE_SHIFT subi INCXM1, INCX, SIZE subi INCYM1, INCY, SIZE #ifdef L1_DUALFETCH li PREA, (L1_PREFETCHSIZE) / 2 #else li PREA, (L1_PREFETCHSIZE) #endif cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 2 * SIZE bne- cr0, LL(100) cmpwi cr0, INCY, 2 * SIZE bne- cr0, LL(100) srawi. 
r0, N, 3 mtspr CTR, r0 beq- cr0, LL(50) .align 4 LFD f0, 0 * SIZE(X) LFD f1, 1 * SIZE(X) LFD f2, 2 * SIZE(X) LFD f3, 3 * SIZE(X) LFD f8, 0 * SIZE(Y) LFD f9, 1 * SIZE(Y) LFD f10, 2 * SIZE(Y) LFD f11, 3 * SIZE(Y) LFD f4, 4 * SIZE(X) LFD f5, 5 * SIZE(X) LFD f6, 6 * SIZE(X) LFD f7, 7 * SIZE(X) LFD f12, 4 * SIZE(Y) LFD f13, 5 * SIZE(Y) LFD f14, 6 * SIZE(Y) LFD f15, 7 * SIZE(Y) bdz LL(20) .align 4 LL(10): FMADD f16, ALPHA_R, f0, f8 FMADD f17, ALPHA_I, f0, f9 FMADD f18, ALPHA_R, f2, f10 FMADD f19, ALPHA_I, f2, f11 ADD1 f16, ALPHA_I, f1, f16 ADD2 f17, ALPHA_R, f1, f17 ADD1 f18, ALPHA_I, f3, f18 ADD2 f19, ALPHA_R, f3, f19 LFD f0, 8 * SIZE(X) LFD f1, 9 * SIZE(X) LFD f2, 10 * SIZE(X) LFD f3, 11 * SIZE(X) LFD f8, 8 * SIZE(Y) LFD f9, 9 * SIZE(Y) LFD f10, 10 * SIZE(Y) LFD f11, 11 * SIZE(Y) STFD f16, 0 * SIZE(Y) STFD f17, 1 * SIZE(Y) STFD f18, 2 * SIZE(Y) STFD f19, 3 * SIZE(Y) FMADD f20, ALPHA_R, f4, f12 FMADD f21, ALPHA_I, f4, f13 FMADD f22, ALPHA_R, f6, f14 FMADD f23, ALPHA_I, f6, f15 ADD1 f20, ALPHA_I, f5, f20 ADD2 f21, ALPHA_R, f5, f21 ADD1 f22, ALPHA_I, f7, f22 ADD2 f23, ALPHA_R, f7, f23 LFD f4, 12 * SIZE(X) LFD f5, 13 * SIZE(X) LFD f6, 14 * SIZE(X) LFD f7, 15 * SIZE(X) LFD f12, 12 * SIZE(Y) LFD f13, 13 * SIZE(Y) LFD f14, 14 * SIZE(Y) LFD f15, 15 * SIZE(Y) STFD f20, 4 * SIZE(Y) STFD f21, 5 * SIZE(Y) STFD f22, 6 * SIZE(Y) STFD f23, 7 * SIZE(Y) FMADD f16, ALPHA_R, f0, f8 FMADD f17, ALPHA_I, f0, f9 FMADD f18, ALPHA_R, f2, f10 FMADD f19, ALPHA_I, f2, f11 ADD1 f16, ALPHA_I, f1, f16 ADD2 f17, ALPHA_R, f1, f17 ADD1 f18, ALPHA_I, f3, f18 ADD2 f19, ALPHA_R, f3, f19 LFD f0, 16 * SIZE(X) LFD f1, 17 * SIZE(X) LFD f2, 18 * SIZE(X) LFD f3, 19 * SIZE(X) LFD f8, 16 * SIZE(Y) LFD f9, 17 * SIZE(Y) LFD f10, 18 * SIZE(Y) LFD f11, 19 * SIZE(Y) STFD f16, 8 * SIZE(Y) STFD f17, 9 * SIZE(Y) STFD f18, 10 * SIZE(Y) STFD f19, 11 * SIZE(Y) FMADD f20, ALPHA_R, f4, f12 FMADD f21, ALPHA_I, f4, f13 FMADD f22, ALPHA_R, f6, f14 FMADD f23, ALPHA_I, f6, f15 ADD1 f20, ALPHA_I, f5, f20 ADD2 f21, ALPHA_R, f5, f21 ADD1 f22, ALPHA_I, f7, f22 ADD2 f23, ALPHA_R, f7, f23 LFD f4, 20 * SIZE(X) LFD f5, 21 * SIZE(X) LFD f6, 22 * SIZE(X) LFD f7, 23 * SIZE(X) LFD f12, 20 * SIZE(Y) LFD f13, 21 * SIZE(Y) LFD f14, 22 * SIZE(Y) LFD f15, 23 * SIZE(Y) STFD f20, 12 * SIZE(Y) STFD f21, 13 * SIZE(Y) STFD f22, 14 * SIZE(Y) STFD f23, 15 * SIZE(Y) #ifndef POWER6 dcbtst Y, PREA #ifdef L1_DUALFETCH dcbt X, PREA #endif #endif addi X, X, 16 * SIZE addi Y, Y, 16 * SIZE #ifdef POWER6 dcbtst Y, PREA L1_PREFETCH X, PREA #endif bdnz LL(10) .align 4 LL(20): FMADD f16, ALPHA_R, f0, f8 FMADD f17, ALPHA_I, f0, f9 FMADD f18, ALPHA_R, f2, f10 FMADD f19, ALPHA_I, f2, f11 ADD1 f16, ALPHA_I, f1, f16 ADD2 f17, ALPHA_R, f1, f17 ADD1 f18, ALPHA_I, f3, f18 ADD2 f19, ALPHA_R, f3, f19 LFD f0, 8 * SIZE(X) LFD f1, 9 * SIZE(X) LFD f2, 10 * SIZE(X) LFD f3, 11 * SIZE(X) LFD f8, 8 * SIZE(Y) LFD f9, 9 * SIZE(Y) LFD f10, 10 * SIZE(Y) LFD f11, 11 * SIZE(Y) FMADD f20, ALPHA_R, f4, f12 FMADD f21, ALPHA_I, f4, f13 FMADD f22, ALPHA_R, f6, f14 FMADD f23, ALPHA_I, f6, f15 ADD1 f20, ALPHA_I, f5, f20 ADD2 f21, ALPHA_R, f5, f21 ADD1 f22, ALPHA_I, f7, f22 ADD2 f23, ALPHA_R, f7, f23 LFD f4, 12 * SIZE(X) LFD f5, 13 * SIZE(X) LFD f6, 14 * SIZE(X) LFD f7, 15 * SIZE(X) LFD f12, 12 * SIZE(Y) LFD f13, 13 * SIZE(Y) LFD f14, 14 * SIZE(Y) LFD f15, 15 * SIZE(Y) STFD f16, 0 * SIZE(Y) STFD f17, 1 * SIZE(Y) STFD f18, 2 * SIZE(Y) STFD f19, 3 * SIZE(Y) FMADD f16, ALPHA_R, f0, f8 FMADD f17, ALPHA_I, f0, f9 FMADD f18, ALPHA_R, f2, f10 FMADD f19, ALPHA_I, f2, f11 ADD1 f16, ALPHA_I, f1, f16 ADD2 f17, ALPHA_R, f1, f17 ADD1 
f18, ALPHA_I, f3, f18 ADD2 f19, ALPHA_R, f3, f19 STFD f20, 4 * SIZE(Y) STFD f21, 5 * SIZE(Y) STFD f22, 6 * SIZE(Y) STFD f23, 7 * SIZE(Y) FMADD f20, ALPHA_R, f4, f12 FMADD f21, ALPHA_I, f4, f13 FMADD f22, ALPHA_R, f6, f14 FMADD f23, ALPHA_I, f6, f15 ADD1 f20, ALPHA_I, f5, f20 ADD2 f21, ALPHA_R, f5, f21 ADD1 f22, ALPHA_I, f7, f22 ADD2 f23, ALPHA_R, f7, f23 STFD f16, 8 * SIZE(Y) STFD f17, 9 * SIZE(Y) STFD f18, 10 * SIZE(Y) STFD f19, 11 * SIZE(Y) STFD f20, 12 * SIZE(Y) STFD f21, 13 * SIZE(Y) STFD f22, 14 * SIZE(Y) STFD f23, 15 * SIZE(Y) addi X, X, 16 * SIZE addi Y, Y, 16 * SIZE .align 4 LL(50): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f0, 0 * SIZE(X) LFD f1, 1 * SIZE(X) LFD f8, 0 * SIZE(Y) LFD f9, 1 * SIZE(Y) FMADD f16, ALPHA_R, f0, f8 FMADD f17, ALPHA_I, f0, f9 ADD1 f16, ALPHA_I, f1, f16 ADD2 f17, ALPHA_R, f1, f17 STFD f16, 0 * SIZE(Y) STFD f17, 1 * SIZE(Y) addi X, X, 2 * SIZE addi Y, Y, 2 * SIZE bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCXM1 sub Y, Y, INCYM1 mr YY, Y srawi. r0, N, 3 mtspr CTR, r0 beq- LL(150) .align 4 LFDX f0, X, INCXM1 LFDUX f1, X, INCX LFDX f2, X, INCXM1 LFDUX f3, X, INCX LFDX f8, Y, INCYM1 LFDUX f9, Y, INCY LFDX f10, Y, INCYM1 LFDUX f11, Y, INCY LFDX f4, X, INCXM1 LFDUX f5, X, INCX LFDX f6, X, INCXM1 LFDUX f7, X, INCX LFDX f12, Y, INCYM1 LFDUX f13, Y, INCY LFDX f14, Y, INCYM1 LFDUX f15, Y, INCY bdz LL(120) .align 4 LL(110): FMADD f16, ALPHA_R, f0, f8 FMADD f17, ALPHA_I, f0, f9 FMADD f18, ALPHA_R, f2, f10 FMADD f19, ALPHA_I, f2, f11 ADD1 f16, ALPHA_I, f1, f16 ADD2 f17, ALPHA_R, f1, f17 ADD1 f18, ALPHA_I, f3, f18 ADD2 f19, ALPHA_R, f3, f19 LFDX f0, X, INCXM1 LFDUX f1, X, INCX LFDX f2, X, INCXM1 LFDUX f3, X, INCX LFDX f8, Y, INCYM1 LFDUX f9, Y, INCY LFDX f10, Y, INCYM1 LFDUX f11, Y, INCY FMADD f20, ALPHA_R, f4, f12 FMADD f21, ALPHA_I, f4, f13 FMADD f22, ALPHA_R, f6, f14 FMADD f23, ALPHA_I, f6, f15 ADD1 f20, ALPHA_I, f5, f20 ADD2 f21, ALPHA_R, f5, f21 ADD1 f22, ALPHA_I, f7, f22 ADD2 f23, ALPHA_R, f7, f23 LFDX f4, X, INCXM1 LFDUX f5, X, INCX LFDX f6, X, INCXM1 LFDUX f7, X, INCX LFDX f12, Y, INCYM1 LFDUX f13, Y, INCY LFDX f14, Y, INCYM1 LFDUX f15, Y, INCY STFDX f16, YY, INCYM1 STFDUX f17, YY, INCY STFDX f18, YY, INCYM1 STFDUX f19, YY, INCY FMADD f16, ALPHA_R, f0, f8 FMADD f17, ALPHA_I, f0, f9 FMADD f18, ALPHA_R, f2, f10 FMADD f19, ALPHA_I, f2, f11 ADD1 f16, ALPHA_I, f1, f16 ADD2 f17, ALPHA_R, f1, f17 ADD1 f18, ALPHA_I, f3, f18 ADD2 f19, ALPHA_R, f3, f19 LFDX f0, X, INCXM1 LFDUX f1, X, INCX LFDX f2, X, INCXM1 LFDUX f3, X, INCX LFDX f8, Y, INCYM1 LFDUX f9, Y, INCY LFDX f10, Y, INCYM1 LFDUX f11, Y, INCY STFDX f20, YY, INCYM1 STFDUX f21, YY, INCY STFDX f22, YY, INCYM1 STFDUX f23, YY, INCY FMADD f20, ALPHA_R, f4, f12 FMADD f21, ALPHA_I, f4, f13 FMADD f22, ALPHA_R, f6, f14 FMADD f23, ALPHA_I, f6, f15 ADD1 f20, ALPHA_I, f5, f20 ADD2 f21, ALPHA_R, f5, f21 ADD1 f22, ALPHA_I, f7, f22 ADD2 f23, ALPHA_R, f7, f23 LFDX f4, X, INCXM1 LFDUX f5, X, INCX LFDX f6, X, INCXM1 LFDUX f7, X, INCX LFDX f12, Y, INCYM1 LFDUX f13, Y, INCY LFDX f14, Y, INCYM1 LFDUX f15, Y, INCY STFDX f16, YY, INCYM1 STFDUX f17, YY, INCY STFDX f18, YY, INCYM1 STFDUX f19, YY, INCY STFDX f20, YY, INCYM1 STFDUX f21, YY, INCY STFDX f22, YY, INCYM1 STFDUX f23, YY, INCY bdnz LL(110) .align 4 LL(120): FMADD f16, ALPHA_R, f0, f8 FMADD f17, ALPHA_I, f0, f9 FMADD f18, ALPHA_R, f2, f10 FMADD f19, ALPHA_I, f2, f11 ADD1 f16, ALPHA_I, f1, f16 ADD2 f17, ALPHA_R, f1, f17 ADD1 f18, ALPHA_I, f3, f18 ADD2 f19, ALPHA_R, f3, f19 LFDX f0, X, INCXM1 LFDUX f1, X, INCX LFDX f2, X, INCXM1 LFDUX f3, X, INCX LFDX f8, Y, 
INCYM1 LFDUX f9, Y, INCY LFDX f10, Y, INCYM1 LFDUX f11, Y, INCY FMADD f20, ALPHA_R, f4, f12 FMADD f21, ALPHA_I, f4, f13 FMADD f22, ALPHA_R, f6, f14 FMADD f23, ALPHA_I, f6, f15 ADD1 f20, ALPHA_I, f5, f20 ADD2 f21, ALPHA_R, f5, f21 ADD1 f22, ALPHA_I, f7, f22 ADD2 f23, ALPHA_R, f7, f23 LFDX f4, X, INCXM1 LFDUX f5, X, INCX LFDX f6, X, INCXM1 LFDUX f7, X, INCX LFDX f12, Y, INCYM1 LFDUX f13, Y, INCY LFDX f14, Y, INCYM1 LFDUX f15, Y, INCY STFDX f16, YY, INCYM1 STFDUX f17, YY, INCY STFDX f18, YY, INCYM1 STFDUX f19, YY, INCY FMADD f16, ALPHA_R, f0, f8 FMADD f17, ALPHA_I, f0, f9 FMADD f18, ALPHA_R, f2, f10 FMADD f19, ALPHA_I, f2, f11 ADD1 f16, ALPHA_I, f1, f16 ADD2 f17, ALPHA_R, f1, f17 ADD1 f18, ALPHA_I, f3, f18 ADD2 f19, ALPHA_R, f3, f19 STFDX f20, YY, INCYM1 STFDUX f21, YY, INCY STFDX f22, YY, INCYM1 STFDUX f23, YY, INCY FMADD f20, ALPHA_R, f4, f12 FMADD f21, ALPHA_I, f4, f13 FMADD f22, ALPHA_R, f6, f14 FMADD f23, ALPHA_I, f6, f15 ADD1 f20, ALPHA_I, f5, f20 ADD2 f21, ALPHA_R, f5, f21 ADD1 f22, ALPHA_I, f7, f22 ADD2 f23, ALPHA_R, f7, f23 STFDX f16, YY, INCYM1 STFDUX f17, YY, INCY STFDX f18, YY, INCYM1 STFDUX f19, YY, INCY STFDX f20, YY, INCYM1 STFDUX f21, YY, INCY STFDX f22, YY, INCYM1 STFDUX f23, YY, INCY .align 4 LL(150): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDX f0, X, INCXM1 LFDUX f1, X, INCX LFDX f8, Y, INCYM1 LFDUX f9, Y, INCY FMADD f16, ALPHA_R, f0, f8 FMADD f17, ALPHA_I, f0, f9 ADD1 f16, ALPHA_I, f1, f16 ADD2 f17, ALPHA_R, f1, f17 STFDX f16, YY, INCYM1 STFDUX f17, YY, INCY bdnz LL(160) .align 4 LL(999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/zaxpy.c000066400000000000000000000076461313527062700171140ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/03/23 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #if defined(POWER8) #include "zaxpy_microk_power8.c" #endif #ifndef HAVE_KERNEL_4 static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; BLASLONG register ix = 0; FLOAT da_r = alpha[0]; FLOAT da_i = alpha[1]; while(i < n) { #if !defined(CONJ) y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ; y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ; #else y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ; y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ; #endif ix+=4 ; i+=2 ; } } #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; BLASLONG ix=0,iy=0; if ( n <= 0 ) return(0); if ( (inc_x == 1) && (inc_y == 1) ) { BLASLONG n1 = n & -16; if ( n1 ) { zaxpy_kernel_4 (n1, x, y, da_r, da_i); ix = 2 * n1; } i = n1; while(i < n) { #if !defined(CONJ) y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; #else y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; #endif i++ ; ix += 2; } return(0); } inc_x *=2; inc_y *=2; while(i < n) { #if !defined(CONJ) y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; #else y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; #endif ix += inc_x ; iy += inc_y ; i++ ; } return(0); } OpenBLAS-0.2.20/kernel/power/zaxpy_hummer.S000066400000000000000000000250431313527062700204400ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
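As the scalar fallback in zaxpy.c above shows, these kernels compute y := alpha*x + y with a full complex multiply, and the CONJ build instead accumulates alpha*conj(x). A small portable sketch of both cases, given only as an illustration (ref_zaxpy is a hypothetical name, and unit stride is assumed):

#include <stddef.h>

/* Reference complex axpy over unit-stride interleaved re,im data:
   y += alpha * x, or y += alpha * conj(x) when conj_x is nonzero. */
void ref_zaxpy(size_t n, double alpha_r, double alpha_i,
               const double *x, double *y, int conj_x)
{
    for (size_t i = 0; i < 2 * n; i += 2) {
        double xr = x[i];
        double xi = conj_x ? -x[i + 1] : x[i + 1];
        y[i]     += alpha_r * xr - alpha_i * xi;
        y[i + 1] += alpha_r * xi + alpha_i * xr;
    }
}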
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r6 #define INCX r7 #define Y r8 #define INCY r9 #define YY r4 #define INCX2 r5 #define INCY2 r10 #define X1 r11 #define Y1 INCX #define YY1 INCY #define ALPHA f1 #define A1 f0 #define A2 f8 #define A3 f2 #define A4 f3 #define A5 f4 #define A6 f5 #define A7 f6 #define A8 f7 #define A9 f25 #define B1 f9 #define B2 f10 #define B3 f11 #define B4 f12 #define B5 f13 #define B6 f14 #define B7 f15 #define B8 f16 #define C1 f17 #define C2 f18 #define C3 f19 #define C4 f20 #define C5 f21 #define C6 f22 #define C7 f23 #define C8 f24 #define ALPHA_R ALPHA #define ALPHA_I A9 #ifndef CONJ #define ADD1 FNMSUB #define ADD2 FMADD #else #define ADD1 FMADD #define ADD2 FNMSUB #endif #ifndef CONJ #define FXMADD1 fxcpmadd #define FXMADD2 fxcxnpma #else #define FXMADD1 fxcpnsma #define FXMADD2 fxcxma #endif PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 stfpdux f19, SP, r10 stfpdux f20, SP, r10 stfpdux f21, SP, r10 stfpdux f22, SP, r10 stfpdux f23, SP, r10 stfpdux f24, SP, r10 stfpdux f25, SP, r10 fsmfp ALPHA, f2 slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT add INCX2, INCX, INCX add INCY2, INCY, INCY cmpwi cr0, N, 0 ble LL(999) andi. r0, X, 2 * SIZE - 1 bne LL(100) andi. r0, Y, 2 * SIZE - 1 bne LL(100) sub X, X, INCX2 sub Y, Y, INCY2 mr YY, Y srawi. 
r0, N, 3 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 LFPDUX A3, X, INCX2 LFPDUX B3, Y, INCY2 LFPDUX A4, X, INCX2 LFPDUX B4, Y, INCY2 LFPDUX A5, X, INCX2 LFPDUX B5, Y, INCY2 LFPDUX A6, X, INCX2 LFPDUX B6, Y, INCY2 LFPDUX A7, X, INCX2 LFPDUX B7, Y, INCY2 LFPDUX A8, X, INCX2 LFPDUX B8, Y, INCY2 bdz LL(13) .align 4 LL(12): FXMADD1 C1, ALPHA, A1, B1 LFPDUX B1, Y, INCY2 FXMADD1 C2, ALPHA, A2, B2 LFPDUX B2, Y, INCY2 FXMADD1 C3, ALPHA, A3, B3 LFPDUX B3, Y, INCY2 FXMADD1 C4, ALPHA, A4, B4 LFPDUX B4, Y, INCY2 FXMADD1 C5, ALPHA, A5, B5 LFPDUX B5, Y, INCY2 FXMADD1 C6, ALPHA, A6, B6 LFPDUX B6, Y, INCY2 FXMADD1 C7, ALPHA, A7, B7 LFPDUX B7, Y, INCY2 FXMADD1 C8, ALPHA, A8, B8 LFPDUX B8, Y, INCY2 FXMADD2 C1, ALPHA, A1, C1 LFPDUX A1, X, INCX2 FXMADD2 C2, ALPHA, A2, C2 LFPDUX A2, X, INCX2 FXMADD2 C3, ALPHA, A3, C3 LFPDUX A3, X, INCX2 FXMADD2 C4, ALPHA, A4, C4 LFPDUX A4, X, INCX2 FXMADD2 C5, ALPHA, A5, C5 LFPDUX A5, X, INCX2 FXMADD2 C6, ALPHA, A6, C6 LFPDUX A6, X, INCX2 FXMADD2 C7, ALPHA, A7, C7 LFPDUX A7, X, INCX2 FXMADD2 C8, ALPHA, A8, C8 LFPDUX A8, X, INCX2 STFPDUX C1, YY, INCY2 STFPDUX C2, YY, INCY2 STFPDUX C3, YY, INCY2 STFPDUX C4, YY, INCY2 STFPDUX C5, YY, INCY2 STFPDUX C6, YY, INCY2 STFPDUX C7, YY, INCY2 STFPDUX C8, YY, INCY2 bdnz LL(12) .align 4 LL(13): FXMADD1 C1, ALPHA, A1, B1 FXMADD1 C2, ALPHA, A2, B2 FXMADD1 C3, ALPHA, A3, B3 FXMADD1 C4, ALPHA, A4, B4 FXMADD1 C5, ALPHA, A5, B5 FXMADD1 C6, ALPHA, A6, B6 FXMADD1 C7, ALPHA, A7, B7 FXMADD1 C8, ALPHA, A8, B8 FXMADD2 C1, ALPHA, A1, C1 FXMADD2 C2, ALPHA, A2, C2 FXMADD2 C3, ALPHA, A3, C3 FXMADD2 C4, ALPHA, A4, C4 FXMADD2 C5, ALPHA, A5, C5 FXMADD2 C6, ALPHA, A6, C6 STFPDUX C1, YY, INCY2 FXMADD2 C7, ALPHA, A7, C7 STFPDUX C2, YY, INCY2 FXMADD2 C8, ALPHA, A8, C8 STFPDUX C3, YY, INCY2 STFPDUX C4, YY, INCY2 STFPDUX C5, YY, INCY2 STFPDUX C6, YY, INCY2 STFPDUX C7, YY, INCY2 STFPDUX C8, YY, INCY2 .align 4 LL(15): andi. r0, N, 7 beq LL(999) andi. r0, N, 4 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 LFPDUX A3, X, INCX2 LFPDUX B3, Y, INCY2 LFPDUX A4, X, INCX2 LFPDUX B4, Y, INCY2 FXMADD1 C1, ALPHA, A1, B1 FXMADD1 C2, ALPHA, A2, B2 FXMADD1 C3, ALPHA, A3, B3 FXMADD1 C4, ALPHA, A4, B4 FXMADD2 C1, ALPHA, A1, C1 FXMADD2 C2, ALPHA, A2, C2 FXMADD2 C3, ALPHA, A3, C3 FXMADD2 C4, ALPHA, A4, C4 STFPDUX C1, YY, INCY2 STFPDUX C2, YY, INCY2 STFPDUX C3, YY, INCY2 STFPDUX C4, YY, INCY2 .align 4 LL(16): andi. r0, N, 2 beq LL(17) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 FXMADD1 C1, ALPHA, A1, B1 FXMADD1 C2, ALPHA, A2, B2 FXMADD2 C1, ALPHA, A1, C1 FXMADD2 C2, ALPHA, A2, C2 STFPDUX C1, YY, INCY2 STFPDUX C2, YY, INCY2 .align 4 LL(17): andi. r0, N, 1 beq LL(999) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 FXMADD1 C1, ALPHA, A1, B1 FXMADD2 C1, ALPHA, A1, C1 STFPDUX C1, YY, INCY2 b LL(999) .align 4 LL(100): fsmtp ALPHA_I, ALPHA_R sub X, X, INCX2 sub Y, Y, INCY2 addi X1, X, SIZE addi Y1, Y, SIZE mr YY, Y mr YY1, Y1 srawi. 
r0, N, 2 mtspr CTR, r0 beq- LL(115) LFDUX A1, X, INCX2 LFDUX A2, X1, INCX2 LFDUX B1, Y, INCY2 LFDUX B2, Y1, INCY2 LFDUX A3, X, INCX2 LFDUX A4, X1, INCX2 LFDUX B3, Y, INCY2 LFDUX B4, Y1, INCY2 LFDUX A5, X, INCX2 LFDUX A6, X1, INCX2 LFDUX B5, Y, INCY2 LFDUX B6, Y1, INCY2 LFDUX A7, X, INCX2 LFDUX A8, X1, INCX2 LFDUX B7, Y, INCY2 LFDUX B8, Y1, INCY2 bdz LL(113) .align 4 LL(112): FMADD C1, ALPHA_R, A1, B1 LFDUX B1, Y, INCY2 FMADD C2, ALPHA_I, A1, B2 LFDUX A1, X, INCX2 FMADD C3, ALPHA_R, A3, B3 LFDUX B3, Y, INCY2 FMADD C4, ALPHA_I, A3, B4 LFDUX A3, X, INCX2 FMADD C5, ALPHA_R, A5, B5 LFDUX B5, Y, INCY2 FMADD C6, ALPHA_I, A5, B6 LFDUX A5, X, INCX2 FMADD C7, ALPHA_R, A7, B7 LFDUX B7, Y, INCY2 FMADD C8, ALPHA_I, A7, B8 LFDUX A7, X, INCX2 ADD1 C1, ALPHA_I, A2, C1 LFDUX B2, Y1, INCY2 ADD2 C2, ALPHA_R, A2, C2 LFDUX A2, X1, INCX2 ADD1 C3, ALPHA_I, A4, C3 LFDUX B4, Y1, INCY2 ADD2 C4, ALPHA_R, A4, C4 LFDUX A4, X1, INCX2 ADD1 C5, ALPHA_I, A6, C5 LFDUX B6, Y1, INCY2 ADD2 C6, ALPHA_R, A6, C6 LFDUX A6, X1, INCX2 ADD1 C7, ALPHA_I, A8, C7 LFDUX B8, Y1, INCY2 ADD2 C8, ALPHA_R, A8, C8 LFDUX A8, X1, INCX2 STFDUX C1, YY, INCY2 STFDUX C2, YY1, INCY2 STFDUX C3, YY, INCY2 STFDUX C4, YY1, INCY2 STFDUX C5, YY, INCY2 STFDUX C6, YY1, INCY2 STFDUX C7, YY, INCY2 STFDUX C8, YY1, INCY2 bdnz LL(112) .align 4 LL(113): FMADD C1, ALPHA_R, A1, B1 FMADD C2, ALPHA_I, A1, B2 FMADD C3, ALPHA_R, A3, B3 FMADD C4, ALPHA_I, A3, B4 FMADD C5, ALPHA_R, A5, B5 FMADD C6, ALPHA_I, A5, B6 FMADD C7, ALPHA_R, A7, B7 FMADD C8, ALPHA_I, A7, B8 ADD1 C1, ALPHA_I, A2, C1 ADD2 C2, ALPHA_R, A2, C2 ADD1 C3, ALPHA_I, A4, C3 ADD2 C4, ALPHA_R, A4, C4 ADD1 C5, ALPHA_I, A6, C5 ADD2 C6, ALPHA_R, A6, C6 STFDUX C1, YY, INCY2 ADD1 C7, ALPHA_I, A8, C7 STFDUX C2, YY1, INCY2 ADD2 C8, ALPHA_R, A8, C8 STFDUX C3, YY, INCY2 STFDUX C4, YY1, INCY2 STFDUX C5, YY, INCY2 STFDUX C6, YY1, INCY2 STFDUX C7, YY, INCY2 STFDUX C8, YY1, INCY2 .align 4 LL(115): andi. r0, N, 3 beq LL(999) andi. r0, N, 2 beq LL(117) LFDUX A1, X, INCX2 LFDUX A2, X1, INCX2 LFDUX B1, Y, INCY2 LFDUX B2, Y1, INCY2 LFDUX A3, X, INCX2 FMADD C1, ALPHA_R, A1, B1 LFDUX A4, X1, INCX2 FMADD C2, ALPHA_I, A1, B2 LFDUX B3, Y, INCY2 FMADD C3, ALPHA_R, A3, B3 LFDUX B4, Y1, INCY2 FMADD C4, ALPHA_I, A3, B4 ADD1 C1, ALPHA_I, A2, C1 ADD2 C2, ALPHA_R, A2, C2 STFDUX C1, YY, INCY2 ADD1 C3, ALPHA_I, A4, C3 STFDUX C2, YY1, INCY2 ADD2 C4, ALPHA_R, A4, C4 STFDUX C3, YY, INCY2 STFDUX C4, YY1, INCY2 .align 4 LL(117): andi. r0, N, 1 beq LL(999) LFDUX A1, X, INCX2 LFDUX A2, X1, INCX2 LFDUX B1, Y, INCY2 LFDUX B2, Y1, INCY2 FMADD C1, ALPHA_R, A1, B1 FMADD C2, ALPHA_I, A1, B2 ADD1 C1, ALPHA_I, A2, C1 ADD2 C2, ALPHA_R, A2, C2 STFDUX C1, YY, INCY2 STFDUX C2, YY1, INCY2 .align 4 LL(999): li r10, 16 subi SP, SP, 16 lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zaxpy_microk_power8.c000066400000000000000000000206151313527062700217530ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/23 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_4 1 static void zaxpy_kernel_4 (long n, double *x, double *y, double alpha_r, double alpha_i) { #if !defined(CONJ) static const double mvec[2] = { -1.0, 1.0 }; #else static const double mvec[2] = { 1.0, -1.0 }; #endif const double *mvecp = mvec; __vector double t0; __vector double t1; __vector double t2; __vector double t3; __vector double t4; __vector double t5; __vector double t6; __vector double t7; __vector double t8; __vector double t9; __vector double t10; __vector double t11; long ytmp; __asm__ ( "xxspltd 32, %x19, 0 \n\t" // alpha_r "xxspltd 33, %x20, 0 \n\t" // alpha_i "lxvd2x 36, 0, %21 \n\t" // mvec #if !defined(CONJ) "xvmuldp 33, 33, 36 \n\t" // alpha_i * mvec #else "xvmuldp 32, 32, 36 \n\t" // alpha_r * mvec #endif "mr %16, %3 \n\t" "dcbt 0, %2 \n\t" "dcbt 0, %3 \n\t" "lxvd2x 40, 0, %2 \n\t" // x0 "lxvd2x 41, %22, %2 \n\t" // x1 "lxvd2x 42, %23, %2 \n\t" // x2 "lxvd2x 43, %24, %2 \n\t" // x3 "lxvd2x 48, 0, %3 \n\t" // y0 "lxvd2x 49, %22, %3 \n\t" // y1 "lxvd2x 50, %23, %3 \n\t" // y2 "lxvd2x 51, %24, %3 \n\t" // y3 "xxswapd %x8, 40 \n\t" // exchange real and imag part "xxswapd %x9, 41 \n\t" // exchange real and imag part "xxswapd %x10, 42 \n\t" // exchange real and imag part "xxswapd %x11, 43 \n\t" // exchange real and imag part "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" "lxvd2x 44, 0, %2 \n\t" // x4 "lxvd2x 45, %22, %2 \n\t" // x5 "lxvd2x 46, %23, %2 \n\t" // x6 "lxvd2x 47, %24, %2 \n\t" // x7 "lxvd2x %x4, 0, %3 \n\t" // y4 "lxvd2x %x5, %22, %3 \n\t" // y5 "lxvd2x %x6, %23, %3 \n\t" // y6 "lxvd2x %x7, %24, %3 \n\t" // y7 "xxswapd %x12, 44 \n\t" // exchange real and imag part "xxswapd %x13, 45 \n\t" // exchange real and imag part "xxswapd %x14, 46 \n\t" // exchange real and imag part "xxswapd %x15, 47 \n\t" // exchange real and imag part "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" "addic. 
%1, %1, -8 \n\t" "ble 2f \n\t" ".p2align 5 \n" "1: \n\t" "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i "xvmaddadp 49, 41, 32 \n\t" "lxvd2x 40, 0, %2 \n\t" // x0 "lxvd2x 41, %22, %2 \n\t" // x1 "xvmaddadp 50, 42, 32 \n\t" "xvmaddadp 51, 43, 32 \n\t" "lxvd2x 42, %23, %2 \n\t" // x2 "lxvd2x 43, %24, %2 \n\t" // x3 "xvmaddadp %x4, 44, 32 \n\t" "addi %2, %2, 64 \n\t" "xvmaddadp %x5, 45, 32 \n\t" "lxvd2x 44, 0, %2 \n\t" // x4 "lxvd2x 45, %22, %2 \n\t" // x5 "xvmaddadp %x6, 46, 32 \n\t" "xvmaddadp %x7, 47, 32 \n\t" "lxvd2x 46, %23, %2 \n\t" // x6 "lxvd2x 47, %24, %2 \n\t" // x7 "xvmaddadp 48, %x8, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r "addi %2, %2, 64 \n\t" "xvmaddadp 49, %x9, 33 \n\t" "xvmaddadp 50, %x10, 33 \n\t" "xvmaddadp 51, %x11, 33 \n\t" "xvmaddadp %x4, %x12, 33 \n\t" "xvmaddadp %x5, %x13, 33 \n\t" "xvmaddadp %x6, %x14, 33 \n\t" "xvmaddadp %x7, %x15, 33 \n\t" "stxvd2x 48, 0, %16 \n\t" "stxvd2x 49, %22, %16 \n\t" "stxvd2x 50, %23, %16 \n\t" "stxvd2x 51, %24, %16 \n\t" "addi %16, %16, 64 \n\t" "stxvd2x %x4, 0, %16 \n\t" "stxvd2x %x5, %22, %16 \n\t" "stxvd2x %x6, %23, %16 \n\t" "stxvd2x %x7, %24, %16 \n\t" "addi %16, %16, 64 \n\t" "xxswapd %x8, 40 \n\t" // exchange real and imag part "xxswapd %x9, 41 \n\t" // exchange real and imag part "lxvd2x 48, 0, %3 \n\t" // y0 "lxvd2x 49, %22, %3 \n\t" // y1 "xxswapd %x10, 42 \n\t" // exchange real and imag part "xxswapd %x11, 43 \n\t" // exchange real and imag part "lxvd2x 50, %23, %3 \n\t" // y2 "lxvd2x 51, %24, %3 \n\t" // y3 "xxswapd %x12, 44 \n\t" // exchange real and imag part "addi %3, %3, 64 \n\t" "xxswapd %x13, 45 \n\t" // exchange real and imag part "lxvd2x %x4, 0, %3 \n\t" // y4 "lxvd2x %x5, %22, %3 \n\t" // y5 "xxswapd %x14, 46 \n\t" // exchange real and imag part "xxswapd %x15, 47 \n\t" // exchange real and imag part "lxvd2x %x6, %23, %3 \n\t" // y6 "lxvd2x %x7, %24, %3 \n\t" // y7 "addi %3, %3, 64 \n\t" "addic. 
%1, %1, -8 \n\t" "bgt 1b \n" "2: \n\t" "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i "xvmaddadp 49, 41, 32 \n\t" "xvmaddadp 50, 42, 32 \n\t" "xvmaddadp 51, 43, 32 \n\t" "xvmaddadp %x4, 44, 32 \n\t" "xvmaddadp %x5, 45, 32 \n\t" "xvmaddadp %x6, 46, 32 \n\t" "xvmaddadp %x7, 47, 32 \n\t" "xvmaddadp 48, %x8, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r "xvmaddadp 49, %x9, 33 \n\t" "xvmaddadp 50, %x10, 33 \n\t" "xvmaddadp 51, %x11, 33 \n\t" "xvmaddadp %x4, %x12, 33 \n\t" "xvmaddadp %x5, %x13, 33 \n\t" "xvmaddadp %x6, %x14, 33 \n\t" "xvmaddadp %x7, %x15, 33 \n\t" "stxvd2x 48, 0, %16 \n\t" "stxvd2x 49, %22, %16 \n\t" "stxvd2x 50, %23, %16 \n\t" "stxvd2x 51, %24, %16 \n\t" "addi %16, %16, 64 \n\t" "stxvd2x %x4, 0, %16 \n\t" "stxvd2x %x5, %22, %16 \n\t" "stxvd2x %x6, %23, %16 \n\t" "stxvd2x %x7, %24, %16 \n" "#n=%1 x=%17=%2 y=%0=%3 alpha=(%19,%20) mvecp=%18=%16 o16=%22 o32=%23 o48=%24 ytmp=%16\n" "#t0=%x4 t1=%x5 t2=%x6 t3=%x7 t4=%x8 t5=%x9 t6=%x10 t7=%x11 t8=%x12 t9=%x13 t10=%x14 t11=%x15" : "+m" (*y), "+r" (n), // 1 "+b" (x), // 2 "+b" (y), // 3 "=wa" (t0), // 4 "=wa" (t1), // 5 "=wa" (t2), // 6 "=wa" (t3), // 7 "=wa" (t4), // 8 "=wa" (t5), // 9 "=wa" (t6), // 10 "=wa" (t7), // 11 "=wa" (t8), // 12 "=wa" (t9), // 13 "=wa" (t10), // 14 "=wa" (t11), // 15 "=b" (ytmp) // 16 : "m" (*x), "m" (*mvecp), "d" (alpha_r), // 19 "d" (alpha_i), // 20 "16" (mvecp), // 21 "b" (16), // 22 "b" (32), // 23 "b" (48) // 24 : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48","vs49","vs50","vs51" ); } OpenBLAS-0.2.20/kernel/power/zaxpy_ppc440.S000066400000000000000000000230071313527062700201530ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef linux #ifndef __64BIT__ #define N r3 #define X r6 #define INCX r7 #define Y r8 #define INCY r9 #define YY r4 #define PRE r5 #else #define N r3 #define X r8 #define INCX r9 #define Y r5 #define INCY r4 #define YY r6 #define PRE r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define N r3 #define X r10 #define INCX r4 #define Y r5 #define INCY r6 #define YY r7 #define PRE r8 #else #define N r3 #define X r8 #define INCX r9 #define Y r10 #define INCY r4 #define YY r5 #define PRE r6 #endif #endif #define ALPHA_R f24 #define ALPHA_I f25 #ifndef CONJ #define ADD1 FNMSUB #define ADD2 FMADD #else #define ADD1 FMADD #define ADD2 FNMSUB #endif #define STACKSIZE 96 PROLOGUE PROFCODE subi SP, SP, STACKSIZE stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) #if defined(linux) && defined(__64BIT__) ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) lwz Y, FRAMESLOT(1) + STACKSIZE(SP) lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) #else lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif fmr ALPHA_R, f1 slwi INCX, INCX, ZBASE_SHIFT fmr ALPHA_I, f2 slwi INCY, INCY, ZBASE_SHIFT subi INCX, INCX, SIZE subi INCY, INCY, SIZE li PRE, 2 * 16 * SIZE cmpwi cr0, N, 0 ble- LL(999) sub X, X, INCX sub Y, Y, INCY mr YY, Y srawi. r0, N, 3 mtspr CTR, r0 ble- LL(150) .align 4 LFDUX f0, X, INCX LFDU f1, 1 * SIZE(X) LFDUX f2, X, INCX LFDU f3, 1 * SIZE(X) LFDUX f8, Y, INCY LFDU f9, 1 * SIZE(Y) LFDUX f10, Y, INCY LFDU f11, 1 * SIZE(Y) LFDUX f4, X, INCX LFDU f5, 1 * SIZE(X) LFDUX f6, X, INCX LFDU f7, 1 * SIZE(X) LFDUX f12, Y, INCY LFDU f13, 1 * SIZE(Y) LFDUX f14, Y, INCY LFDU f15, 1 * SIZE(Y) bdz LL(120) .align 4 LL(110): FMADD f16, ALPHA_R, f0, f8 LFDUX f8, Y, INCY FMADD f17, ALPHA_I, f0, f9 LFDU f9, 1 * SIZE(Y) FMADD f18, ALPHA_R, f2, f10 LFDUX f10, Y, INCY FMADD f19, ALPHA_I, f2, f11 LFDU f11, 1 * SIZE(Y) #ifdef PPCG4 dcbt X, PRE #endif ADD1 f16, ALPHA_I, f1, f16 LFDUX f0, X, INCX ADD2 f17, ALPHA_R, f1, f17 LFDU f1, 1 * SIZE(X) ADD1 f18, ALPHA_I, f3, f18 LFDUX f2, X, INCX ADD2 f19, ALPHA_R, f3, f19 LFDU f3, 1 * SIZE(X) #ifdef PPCG4 dcbtst Y, PRE #endif FMADD f20, ALPHA_R, f4, f12 LFDUX f12, Y, INCY FMADD f21, ALPHA_I, f4, f13 LFDU f13, 1 * SIZE(Y) FMADD f22, ALPHA_R, f6, f14 LFDUX f14, Y, INCY FMADD f23, ALPHA_I, f6, f15 LFDU f15, 1 * SIZE(Y) #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif ADD1 f20, ALPHA_I, f5, f20 LFDUX f4, X, INCX ADD2 f21, ALPHA_R, f5, f21 LFDU f5, 1 * SIZE(X) ADD1 f22, ALPHA_I, f7, f22 LFDUX f6, X, INCX ADD2 f23, ALPHA_R, f7, f23 LFDU f7, 1 * SIZE(X) #if defined(PPCG4) && defined(DOUBLE) dcbtst Y, PRE #endif STFDUX f16, YY, INCY STFDU f17, 1 * SIZE(YY) STFDUX f18, YY, INCY STFDU f19, 1 * SIZE(YY) FMADD f16, ALPHA_R, f0, f8 LFDUX f8, Y, INCY FMADD f17, ALPHA_I, f0, f9 LFDU f9, 1 * SIZE(Y) FMADD f18, ALPHA_R, f2, f10 LFDUX f10, Y, INCY FMADD f19, ALPHA_I, f2, f11 LFDU f11, 1 * SIZE(Y) #ifdef PPCG4 dcbt X, PRE #endif ADD1 f16, ALPHA_I, f1, f16 LFDUX f0, X, INCX ADD2 f17, ALPHA_R, f1, f17 LFDU f1, 1 * SIZE(X) ADD1 f18, ALPHA_I, f3, f18 LFDUX f2, X, INCX ADD2 f19, ALPHA_R, f3, f19 LFDU f3, 1 * SIZE(X) #ifdef PPCG4 dcbtst Y, PRE #endif 
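/* Note: with the non-CONJ macro settings above (ADD1 = FNMSUB, ADD2 = FMADD),
   each complex element is updated in two steps: the FMADD pair forms
   alpha_r*x_r + y_r and alpha_i*x_r + y_i, then ADD1/ADD2 fold in the x
   imaginary part, giving y_r += alpha_r*x_r - alpha_i*x_i and
   y_i += alpha_i*x_r + alpha_r*x_i.  LL(110) software-pipelines this over
   eight complex elements per iteration, prefetching X and Y with
   dcbt/dcbtst on PPCG4 builds. */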
STFDUX f20, YY, INCY STFDU f21, 1 * SIZE(YY) STFDUX f22, YY, INCY STFDU f23, 1 * SIZE(YY) FMADD f20, ALPHA_R, f4, f12 LFDUX f12, Y, INCY FMADD f21, ALPHA_I, f4, f13 LFDU f13, 1 * SIZE(Y) FMADD f22, ALPHA_R, f6, f14 LFDUX f14, Y, INCY FMADD f23, ALPHA_I, f6, f15 LFDU f15, 1 * SIZE(Y) #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif ADD1 f20, ALPHA_I, f5, f20 LFDUX f4, X, INCX ADD2 f21, ALPHA_R, f5, f21 LFDU f5, 1 * SIZE(X) ADD1 f22, ALPHA_I, f7, f22 LFDUX f6, X, INCX ADD2 f23, ALPHA_R, f7, f23 LFDU f7, 1 * SIZE(X) #if defined(PPCG4) && defined(DOUBLE) dcbtst Y, PRE #endif STFDUX f16, YY, INCY STFDU f17, 1 * SIZE(YY) STFDUX f18, YY, INCY STFDU f19, 1 * SIZE(YY) STFDUX f20, YY, INCY STFDU f21, 1 * SIZE(YY) STFDUX f22, YY, INCY STFDU f23, 1 * SIZE(YY) bdnz LL(110) .align 4 LL(120): FMADD f16, ALPHA_R, f0, f8 LFDUX f8, Y, INCY FMADD f17, ALPHA_I, f0, f9 LFDU f9, 1 * SIZE(Y) FMADD f18, ALPHA_R, f2, f10 LFDUX f10, Y, INCY FMADD f19, ALPHA_I, f2, f11 LFDU f11, 1 * SIZE(Y) ADD1 f16, ALPHA_I, f1, f16 LFDUX f0, X, INCX ADD2 f17, ALPHA_R, f1, f17 LFDU f1, 1 * SIZE(X) ADD1 f18, ALPHA_I, f3, f18 LFDUX f2, X, INCX ADD2 f19, ALPHA_R, f3, f19 LFDU f3, 1 * SIZE(X) FMADD f20, ALPHA_R, f4, f12 LFDUX f12, Y, INCY FMADD f21, ALPHA_I, f4, f13 LFDU f13, 1 * SIZE(Y) FMADD f22, ALPHA_R, f6, f14 LFDUX f14, Y, INCY FMADD f23, ALPHA_I, f6, f15 LFDU f15, 1 * SIZE(Y) ADD1 f20, ALPHA_I, f5, f20 LFDUX f4, X, INCX ADD2 f21, ALPHA_R, f5, f21 LFDU f5, 1 * SIZE(X) ADD1 f22, ALPHA_I, f7, f22 LFDUX f6, X, INCX ADD2 f23, ALPHA_R, f7, f23 LFDU f7, 1 * SIZE(X) STFDUX f16, YY, INCY FMADD f16, ALPHA_R, f0, f8 STFDU f17, 1 * SIZE(YY) FMADD f17, ALPHA_I, f0, f9 STFDUX f18, YY, INCY FMADD f18, ALPHA_R, f2, f10 STFDU f19, 1 * SIZE(YY) FMADD f19, ALPHA_I, f2, f11 ADD1 f16, ALPHA_I, f1, f16 ADD2 f17, ALPHA_R, f1, f17 ADD1 f18, ALPHA_I, f3, f18 ADD2 f19, ALPHA_R, f3, f19 STFDUX f20, YY, INCY FMADD f20, ALPHA_R, f4, f12 STFDU f21, 1 * SIZE(YY) FMADD f21, ALPHA_I, f4, f13 STFDUX f22, YY, INCY FMADD f22, ALPHA_R, f6, f14 STFDU f23, 1 * SIZE(YY) FMADD f23, ALPHA_I, f6, f15 ADD1 f20, ALPHA_I, f5, f20 STFDUX f16, YY, INCY ADD2 f21, ALPHA_R, f5, f21 STFDU f17, 1 * SIZE(YY) ADD1 f22, ALPHA_I, f7, f22 STFDUX f18, YY, INCY ADD2 f23, ALPHA_R, f7, f23 STFDU f19, 1 * SIZE(YY) STFDUX f20, YY, INCY STFDU f21, 1 * SIZE(YY) STFDUX f22, YY, INCY STFDU f23, 1 * SIZE(YY) .align 4 LL(150): andi. r0, N, 7 mtspr CTR, r0 ble LL(999) .align 4 LL(160): LFDUX f0, X, INCX LFDU f1, 1 * SIZE(X) LFDUX f8, Y, INCY LFDU f9, 1 * SIZE(Y) FMADD f16, ALPHA_R, f0, f8 FMADD f17, ALPHA_I, f0, f9 ADD1 f16, ALPHA_I, f1, f16 ADD2 f17, ALPHA_R, f1, f17 STFDUX f16, YY, INCY STFDU f17, 1 * SIZE(YY) bdnz LL(160) .align 4 LL(999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) addi SP, SP, STACKSIZE li r0, 0 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zcopy.S000066400000000000000000000131671313527062700170600ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define Y r6 #define INCY r7 #define PREA r8 #define INCXM1 r9 #define INCYM1 r10 #define STACKSIZE 16 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) slwi INCX, INCX, ZBASE_SHIFT slwi INCY, INCY, ZBASE_SHIFT subi INCXM1, INCX, SIZE subi INCYM1, INCY, SIZE #ifdef L1_DUALFETCH li PREA, (L1_PREFETCHSIZE) / 2 #else li PREA, (L1_PREFETCHSIZE) #endif cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 2 * SIZE bne- cr0, LL(100) cmpwi cr0, INCY, 2 * SIZE bne- cr0, LL(100) srawi. r0, N, 3 mtspr CTR, r0 beq- cr0, LL(50) .align 4 LL(10): LFD f0, 0 * SIZE(X) LFD f1, 1 * SIZE(X) LFD f2, 2 * SIZE(X) LFD f3, 3 * SIZE(X) STFD f0, 0 * SIZE(Y) STFD f1, 1 * SIZE(Y) STFD f2, 2 * SIZE(Y) STFD f3, 3 * SIZE(Y) LFD f4, 4 * SIZE(X) LFD f5, 5 * SIZE(X) LFD f6, 6 * SIZE(X) LFD f7, 7 * SIZE(X) STFD f4, 4 * SIZE(Y) STFD f5, 5 * SIZE(Y) STFD f6, 6 * SIZE(Y) STFD f7, 7 * SIZE(Y) LFD f8, 8 * SIZE(X) LFD f9, 9 * SIZE(X) LFD f10, 10 * SIZE(X) LFD f11, 11 * SIZE(X) STFD f8, 8 * SIZE(Y) STFD f9, 9 * SIZE(Y) STFD f10, 10 * SIZE(Y) STFD f11, 11 * SIZE(Y) LFD f12, 12 * SIZE(X) LFD f13, 13 * SIZE(X) LFD f14, 14 * SIZE(X) LFD f15, 15 * SIZE(X) STFD f12, 12 * SIZE(Y) STFD f13, 13 * SIZE(Y) STFD f14, 14 * SIZE(Y) STFD f15, 15 * SIZE(Y) #ifndef POWER6 dcbtst Y, PREA #ifdef L1_DUALFETCH dcbt X, PREA #endif #endif addi X, X, 16 * SIZE addi Y, Y, 16 * SIZE #ifdef POWER6 dcbtst Y, PREA L1_PREFETCH X, PREA #endif bdnz LL(10) .align 4 LL(50): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) addi X, X, 2 * SIZE STFD f8, 0 * SIZE(Y) STFD f9, 1 * SIZE(Y) addi Y, Y, 2 * SIZE bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCXM1 sub Y, Y, INCYM1 srawi. 
r0, N, 3 mtspr CTR, r0 beq- LL(150) .align 4 LL(110): LFDX f0, X, INCXM1 LFDUX f1, X, INCX LFDX f2, X, INCXM1 LFDUX f3, X, INCX LFDX f4, X, INCXM1 LFDUX f5, X, INCX LFDX f6, X, INCXM1 LFDUX f7, X, INCX LFDX f8, X, INCXM1 LFDUX f9, X, INCX LFDX f10, X, INCXM1 LFDUX f11, X, INCX LFDX f12, X, INCXM1 LFDUX f13, X, INCX LFDX f14, X, INCXM1 LFDUX f15, X, INCX STFDX f0, Y, INCYM1 STFDUX f1, Y, INCY STFDX f2, Y, INCYM1 STFDUX f3, Y, INCY STFDX f4, Y, INCYM1 STFDUX f5, Y, INCY STFDX f6, Y, INCYM1 STFDUX f7, Y, INCY STFDX f8, Y, INCYM1 STFDUX f9, Y, INCY STFDX f10, Y, INCYM1 STFDUX f11, Y, INCY STFDX f12, Y, INCYM1 STFDUX f13, Y, INCY STFDX f14, Y, INCYM1 STFDUX f15, Y, INCY bdnz LL(110) .align 4 LL(150): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDX f8, X, INCXM1 LFDUX f9, X, INCX STFDX f8, Y, INCYM1 STFDUX f9, Y, INCY bdnz LL(160) .align 4 LL(999): lfd f14, 0(SP) lfd f15, 8(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zcopy.c000066400000000000000000000060471313527062700170770ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/03/25 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #if defined(POWER8) #include "zcopy_microk_power8.c" #endif #ifndef HAVE_KERNEL_16 static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { BLASLONG i=0; FLOAT f0, f1, f2, f3, f4, f5, f6, f7; FLOAT *x1=x; FLOAT *y1=y; while ( i 0 ) { zcopy_kernel_16(n1, x, y); i=n1; ix=n1*2; iy=n1*2; } while(i < n) { y[iy] = x[iy] ; y[iy+1] = x[ix+1] ; ix+=2; iy+=2; i++ ; } } else { BLASLONG inc_x2 = 2 * inc_x; BLASLONG inc_y2 = 2 * inc_y; while(i < n) { y[iy] = x[ix] ; y[iy+1] = x[ix+1] ; ix += inc_x2 ; iy += inc_y2 ; i++ ; } } return(0); } OpenBLAS-0.2.20/kernel/power/zcopy_hummer.S000066400000000000000000000276071313527062700204410ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
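The unit-stride path of the zcopy driver above blocks the length down to a multiple of 16 complex elements (n & -16), hands that block to zcopy_kernel_16, and finishes the remainder with a scalar loop; strided calls fall through to the plain index-stepping loop. A minimal sketch of that block-then-remainder pattern, with illustrative names and assuming interleaved double storage (the zaxpy driver earlier uses the same shape):

#include <stddef.h>

// Stand-in for the unrolled/vectorized kernel: n must be a multiple of 16.
static void zcopy_block16(size_t n, const double *x, double *y)
{
    for (size_t i = 0; i < 2 * n; i++)
        y[i] = x[i];
}

static void zcopy_ref(size_t n, const double *x, double *y)
{
    size_t n1 = n & ~(size_t)15;          // largest multiple of 16 <= n
    if (n1)
        zcopy_block16(n1, x, y);
    for (size_t i = n1; i < n; i++) {     // scalar tail, one complex element at a time
        y[2 * i]     = x[2 * i];
        y[2 * i + 1] = x[2 * i + 1];
    }
}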
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define Y r6 #define INCY r7 #define INCX2 r8 #define INCY2 r9 #define X2 r10 #define Y2 r11 #define A1 f0 #define A2 f1 #define A3 f2 #define A4 f3 #define A5 f4 #define A6 f5 #define A7 f6 #define A8 f7 #define A9 f8 #define T1 f9 #define T2 f10 #define T3 f11 #define T4 f12 #define T5 f13 #define T6 f14 #define T7 f15 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT add INCX2, INCX, INCX add INCY2, INCY, INCY cmpwi cr0, N, 0 ble LL(999) sub X, X, INCX2 sub Y, Y, INCY2 cmpwi cr0, INCX, SIZE bne LL(100) cmpwi cr0, INCY, SIZE bne LL(100) andi. r0, X, 2 * SIZE - 1 bne LL(30) andi. r0, Y, 2 * SIZE - 1 bne LL(20) .align 4 LL(10): /* X ): aligned Y ): aligned */ srawi. r0, N, 3 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 LFPDUX A6, X, INCX2 LFPDUX A7, X, INCX2 LFPDUX A8, X, INCX2 bdz LL(13) .align 4 LL(12): STFPDUX A1, Y, INCY2 LFPDUX A1, X, INCX2 STFPDUX A2, Y, INCY2 LFPDUX A2, X, INCX2 STFPDUX A3, Y, INCY2 LFPDUX A3, X, INCX2 STFPDUX A4, Y, INCY2 LFPDUX A4, X, INCX2 STFPDUX A5, Y, INCY2 LFPDUX A5, X, INCX2 STFPDUX A6, Y, INCY2 LFPDUX A6, X, INCX2 STFPDUX A7, Y, INCY2 LFPDUX A7, X, INCX2 STFPDUX A8, Y, INCY2 LFPDUX A8, X, INCX2 bdnz LL(12) .align 4 LL(13): STFPDUX A1, Y, INCY2 STFPDUX A2, Y, INCY2 STFPDUX A3, Y, INCY2 STFPDUX A4, Y, INCY2 STFPDUX A5, Y, INCY2 STFPDUX A6, Y, INCY2 STFPDUX A7, Y, INCY2 STFPDUX A8, Y, INCY2 .align 4 LL(15): andi. r0, N, 7 beq LL(999) andi. r0, N, 4 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 STFPDUX A1, Y, INCY2 STFPDUX A2, Y, INCY2 STFPDUX A3, Y, INCY2 STFPDUX A4, Y, INCY2 .align 4 LL(16): andi. r0, N, 2 beq LL(17) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 STFPDUX A1, Y, INCY2 STFPDUX A2, Y, INCY2 .align 4 LL(17): andi. r0, N, 1 beq LL(999) LFPDUX A1, X, INCX2 STFPDUX A1, Y, INCY2 b LL(999) .align 4 LL(20): /* X : aligned Y : unaligned */ LFXDUX A1, X, INCX2 addi N, N, -1 cmpwi cr0, N, 0 STFSDX A1, Y, INCY2 add Y, Y, INCY ble LL(29) .align 4 srawi. r0, N, 3 mtspr CTR, r0 beq- LL(25) LFXDUX T1, X, INCX2 LFXDUX T2, X, INCX2 LFXDUX T3, X, INCX2 LFXDUX T4, X, INCX2 LFPDUX A6, X, INCX2 fsmr A1, T1 LFPDUX A7, X, INCX2 fsmr T1, T2 LFPDUX A8, X, INCX2 fsmr T2, T3 LFPDUX A9, X, INCX2 fsmr T3, T4 bdz LL(23) .align 4 LL(22): STFPDUX A1, Y, INCY2 fxmr T5, A6 STFPDUX T1, Y, INCY2 fxmr T6, A7 STFPDUX T2, Y, INCY2 fxmr T7, A8 STFPDUX T3, Y, INCY2 fxmr A1, A9 fsmr T4, T5 LFPDUX A2, X, INCX2 fsmr T5, T6 LFPDUX A3, X, INCX2 fsmr T6, T7 LFPDUX A4, X, INCX2 fsmr T7, A1 LFPDUX A5, X, INCX2 STFPDUX T4, Y, INCY2 fxmr T1, A2 STFPDUX T5, Y, INCY2 fxmr T2, A3 STFPDUX T6, Y, INCY2 fxmr T3, A4 STFPDUX T7, Y, INCY2 fxmr T4, A5 LFPDUX A6, X, INCX2 fsmr A1, T1 LFPDUX A7, X, INCX2 fsmr T1, T2 LFPDUX A8, X, INCX2 fsmr T2, T3 LFPDUX A9, X, INCX2 fsmr T3, T4 bdnz LL(22) .align 4 LL(23): STFPDUX A1, Y, INCY2 fxmr T5, A6 STFPDUX T1, Y, INCY2 fxmr T6, A7 STFPDUX T2, Y, INCY2 fxmr T7, A8 STFPDUX T3, Y, INCY2 fxmr A1, A9 fsmr T4, T5 fsmr T5, T6 fsmr T6, T7 fsmr T7, A1 STFPDUX T4, Y, INCY2 STFPDUX T5, Y, INCY2 STFPDUX T6, Y, INCY2 STFPDUX T7, Y, INCY2 .align 4 LL(25): andi. r0, N, 7 beq LL(29) andi. 
r0, N, 4 beq LL(26) LFXDUX A2, X, INCX2 LFXDUX A3, X, INCX2 LFXDUX A4, X, INCX2 LFXDUX A5, X, INCX2 fsmr A1, A2 fsmr A2, A3 fsmr A3, A4 fsmr A4, A5 STFPDUX A1, Y, INCY2 STFPDUX A2, Y, INCY2 STFPDUX A3, Y, INCY2 STFPDUX A4, Y, INCY2 fpmr A1, A5 .align 4 LL(26): andi. r0, N, 2 beq LL(27) LFXDUX A2, X, INCX2 LFXDUX A3, X, INCX2 fsmr A1, A2 fsmr A2, A3 STFPDUX A1, Y, INCY2 STFPDUX A2, Y, INCY2 fpmr A1, A3 .align 4 LL(27): andi. r0, N, 1 beq LL(29) LFXDUX A2, X, INCX2 fsmr A1, A2 STFPDUX A1, Y, INCY2 fpmr A1, A2 .align 4 LL(29): STFDUX A1, Y, INCY2 b LL(999) .align 4 LL(30): /* X ): unaligned Y ): aligned */ andi. r0, Y, 2 * SIZE - 1 bne LL(40) LFDX A1, X, INCX2 add X, X, INCX srawi. r0, N, 3 mtspr CTR, r0 beq- LL(35) LFXDUX T1, X, INCX2 LFXDUX T2, X, INCX2 LFXDUX T3, X, INCX2 LFXDUX T4, X, INCX2 LFPDUX A6, X, INCX2 fsmr A1, T1 LFPDUX A7, X, INCX2 fsmr T1, T2 LFPDUX A8, X, INCX2 fsmr T2, T3 LFPDUX A9, X, INCX2 fsmr T3, T4 bdz LL(33) .align 4 LL(32): fxmr T5, A6 STFPDUX A1, Y, INCY2 fxmr T6, A7 STFPDUX T1, Y, INCY2 fxmr T7, A8 STFPDUX T2, Y, INCY2 fxmr A1, A9 STFPDUX T3, Y, INCY2 LFPDUX A2, X, INCX2 fsmr T4, T5 LFPDUX A3, X, INCX2 fsmr T5, T6 LFPDUX A4, X, INCX2 fsmr T6, T7 LFPDUX A5, X, INCX2 fsmr T7, A1 fxmr T1, A2 STFPDUX T4, Y, INCY2 fxmr T2, A3 STFPDUX T5, Y, INCY2 fxmr T3, A4 STFPDUX T6, Y, INCY2 fxmr T4, A5 STFPDUX T7, Y, INCY2 fsmr A1, T1 LFPDUX A6, X, INCX2 fsmr T1, T2 LFPDUX A7, X, INCX2 fsmr T2, T3 LFPDUX A8, X, INCX2 fsmr T3, T4 LFPDUX A9, X, INCX2 bdnz LL(32) .align 4 LL(33): STFPDUX A1, Y, INCY2 fxmr T5, A6 STFPDUX T1, Y, INCY2 fxmr T6, A7 STFPDUX T2, Y, INCY2 fxmr T7, A8 STFPDUX T3, Y, INCY2 fxmr A1, A9 fsmr T4, T5 fsmr T5, T6 fsmr T6, T7 fsmr T7, A1 STFPDUX T4, Y, INCY2 STFPDUX T5, Y, INCY2 STFPDUX T6, Y, INCY2 STFPDUX T7, Y, INCY2 .align 4 LL(35): andi. r0, N, 7 beq LL(999) andi. r0, N, 4 beq LL(36) LFXDUX A2, X, INCX2 LFXDUX A3, X, INCX2 LFXDUX A4, X, INCX2 LFXDUX A5, X, INCX2 fsmr A1, A2 fsmr A2, A3 fsmr A3, A4 fsmr A4, A5 STFPDUX A1, Y, INCY2 STFPDUX A2, Y, INCY2 STFPDUX A3, Y, INCY2 STFPDUX A4, Y, INCY2 fpmr A1, A5 .align 4 LL(36): andi. r0, N, 2 beq LL(37) LFXDUX A2, X, INCX2 LFXDUX A3, X, INCX2 fsmr A1, A2 fsmr A2, A3 STFPDUX A1, Y, INCY2 STFPDUX A2, Y, INCY2 fpmr A1, A3 .align 4 LL(37): andi. r0, N, 1 beq LL(999) LFXDUX A2, X, INCX2 fsmr A1, A2 STFPDUX A1, Y, INCY2 b LL(999) .align 4 LL(40): /* X : unaligned Y : unaligned */ LFDX A1, X, INCX2 add X, X, INCX addi N, N, -1 cmpwi cr0, N, 0 STFDX A1, Y, INCY2 add Y, Y, INCY ble LL(49) srawi. r0, N, 3 mtspr CTR, r0 beq- LL(45) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 LFPDUX A6, X, INCX2 LFPDUX A7, X, INCX2 LFPDUX A8, X, INCX2 bdz LL(43) .align 4 LL(42): STFPDUX A1, Y, INCY2 LFPDUX A1, X, INCX2 STFPDUX A2, Y, INCY2 LFPDUX A2, X, INCX2 STFPDUX A3, Y, INCY2 LFPDUX A3, X, INCX2 STFPDUX A4, Y, INCY2 LFPDUX A4, X, INCX2 STFPDUX A5, Y, INCY2 LFPDUX A5, X, INCX2 STFPDUX A6, Y, INCY2 LFPDUX A6, X, INCX2 STFPDUX A7, Y, INCY2 LFPDUX A7, X, INCX2 STFPDUX A8, Y, INCY2 LFPDUX A8, X, INCX2 bdnz LL(42) .align 4 LL(43): STFPDUX A1, Y, INCY2 STFPDUX A2, Y, INCY2 STFPDUX A3, Y, INCY2 STFPDUX A4, Y, INCY2 STFPDUX A5, Y, INCY2 STFPDUX A6, Y, INCY2 STFPDUX A7, Y, INCY2 STFPDUX A8, Y, INCY2 .align 4 LL(45): andi. r0, N, 7 beq LL(49) andi. r0, N, 4 beq LL(46) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 STFPDUX A1, Y, INCY2 STFPDUX A2, Y, INCY2 STFPDUX A3, Y, INCY2 STFPDUX A4, Y, INCY2 .align 4 LL(46): andi. 
r0, N, 2 beq LL(47) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 STFPDUX A1, Y, INCY2 STFPDUX A2, Y, INCY2 .align 4 LL(47): andi. r0, N, 1 beq LL(49) LFPDUX A1, X, INCX2 STFPDUX A1, Y, INCY2 LL(49): LFDUX A1, X, INCX2 STFDUX A1, Y, INCY2 b LL(999) .align 4 LL(100): addi X2, X, SIZE addi Y2, Y, SIZE srawi. r0, N, 2 mtspr CTR, r0 beq- LL(115) LFDUX A1, X, INCX2 LFDUX A2, X2, INCX2 LFDUX A3, X, INCX2 LFDUX A4, X2, INCX2 LFDUX A5, X, INCX2 LFDUX A6, X2, INCX2 LFDUX A7, X, INCX2 LFDUX A8, X2, INCX2 bdz LL(113) .align 4 LL(112): STFDUX A1, Y, INCY2 LFDUX A1, X, INCX2 STFDUX A2, Y2, INCY2 LFDUX A2, X2, INCX2 STFDUX A3, Y, INCY2 LFDUX A3, X, INCX2 STFDUX A4, Y2, INCY2 LFDUX A4, X2, INCX2 STFDUX A5, Y, INCY2 LFDUX A5, X, INCX2 STFDUX A6, Y2, INCY2 LFDUX A6, X2, INCX2 STFDUX A7, Y, INCY2 LFDUX A7, X, INCX2 STFDUX A8, Y2, INCY2 LFDUX A8, X2, INCX2 bdnz LL(112) .align 4 LL(113): STFDUX A1, Y, INCY2 STFDUX A2, Y2, INCY2 STFDUX A3, Y, INCY2 STFDUX A4, Y2, INCY2 STFDUX A5, Y, INCY2 STFDUX A6, Y2, INCY2 STFDUX A7, Y, INCY2 STFDUX A8, Y2, INCY2 .align 4 LL(115): andi. r0, N, 3 beq LL(999) andi. r0, N, 2 beq LL(117) LFDUX A1, X, INCX2 LFDUX A2, X2, INCX2 LFDUX A3, X, INCX2 LFDUX A4, X2, INCX2 STFDUX A1, Y, INCY2 STFDUX A2, Y2, INCY2 STFDUX A3, Y, INCY2 STFDUX A4, Y2, INCY2 .align 4 LL(117): andi. r0, N, 1 beq LL(999) LFDUX A1, X, INCX2 LFDUX A2, X2, INCX2 STFDUX A1, Y, INCY2 STFDUX A2, Y2, INCY2 .align 4 LL(999): li r10, 16 addi SP, SP, -16 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zcopy_microk_power8.c000066400000000000000000000120021313527062700217330ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
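The POWER8 microkernel that follows moves 16 interleaved complex doubles (two 128-byte blocks) per iteration with lxvd2x/stxvd2x VSX loads and stores, overlapping the next block's loads with the current block's stores. Written as plain C, one iteration amounts to the sketch below (illustrative only; the real kernel is the inline-asm routine that follows):

// 16 complex (double) elements = 32 doubles = 256 bytes per step.
static void zcopy_step16(const double *x, double *y)
{
    for (int k = 0; k < 32; k++)
        y[k] = x[k];
}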
*****************************************************************************/ /************************************************************************************** * 2016/03/25 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_16 1 static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y) { __asm__ ( "lxvd2x 32, 0, %2 \n\t" "lxvd2x 33, %5, %2 \n\t" "lxvd2x 34, %6, %2 \n\t" "lxvd2x 35, %7, %2 \n\t" "lxvd2x 36, %8, %2 \n\t" "lxvd2x 37, %9, %2 \n\t" "lxvd2x 38, %10, %2 \n\t" "lxvd2x 39, %11, %2 \n\t" "addi %2, %2, 128 \n\t" "lxvd2x 40, 0, %2 \n\t" "lxvd2x 41, %5, %2 \n\t" "lxvd2x 42, %6, %2 \n\t" "lxvd2x 43, %7, %2 \n\t" "lxvd2x 44, %8, %2 \n\t" "lxvd2x 45, %9, %2 \n\t" "lxvd2x 46, %10, %2 \n\t" "lxvd2x 47, %11, %2 \n\t" "addi %2, %2, 128 \n\t" "addic. %1, %1, -16 \n\t" "ble 2f \n\t" ".p2align 5 \n" "1: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" "lxvd2x 32, 0, %2 \n\t" "lxvd2x 33, %5, %2 \n\t" "stxvd2x 34, %6, %3 \n\t" "stxvd2x 35, %7, %3 \n\t" "lxvd2x 34, %6, %2 \n\t" "lxvd2x 35, %7, %2 \n\t" "stxvd2x 36, %8, %3 \n\t" "stxvd2x 37, %9, %3 \n\t" "lxvd2x 36, %8, %2 \n\t" "lxvd2x 37, %9, %2 \n\t" "stxvd2x 38, %10, %3 \n\t" "stxvd2x 39, %11, %3 \n\t" "lxvd2x 38, %10, %2 \n\t" "lxvd2x 39, %11, %2 \n\t" "addi %3, %3, 128 \n\t" "addi %2, %2, 128 \n\t" "stxvd2x 40, 0, %3 \n\t" "stxvd2x 41, %5, %3 \n\t" "lxvd2x 40, 0, %2 \n\t" "lxvd2x 41, %5, %2 \n\t" "stxvd2x 42, %6, %3 \n\t" "stxvd2x 43, %7, %3 \n\t" "lxvd2x 42, %6, %2 \n\t" "lxvd2x 43, %7, %2 \n\t" "stxvd2x 44, %8, %3 \n\t" "stxvd2x 45, %9, %3 \n\t" "lxvd2x 44, %8, %2 \n\t" "lxvd2x 45, %9, %2 \n\t" "stxvd2x 46, %10, %3 \n\t" "stxvd2x 47, %11, %3 \n\t" "lxvd2x 46, %10, %2 \n\t" "lxvd2x 47, %11, %2 \n\t" "addi %3, %3, 128 \n\t" "addi %2, %2, 128 \n\t" "addic. %1, %1, -16 \n\t" "bgt 1b \n" "2: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" "stxvd2x 34, %6, %3 \n\t" "stxvd2x 35, %7, %3 \n\t" "stxvd2x 36, %8, %3 \n\t" "stxvd2x 37, %9, %3 \n\t" "stxvd2x 38, %10, %3 \n\t" "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" "stxvd2x 40, 0, %3 \n\t" "stxvd2x 41, %5, %3 \n\t" "stxvd2x 42, %6, %3 \n\t" "stxvd2x 43, %7, %3 \n\t" "stxvd2x 44, %8, %3 \n\t" "stxvd2x 45, %9, %3 \n\t" "stxvd2x 46, %10, %3 \n\t" "stxvd2x 47, %11, %3 \n" "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : "=m" (*y), "+r" (n), // 1 "+b" (x), // 2 "+b" (y) // 3 : "m" (*x), "b" (16), // 5 "b" (32), // 6 "b" (48), // 7 "b" (64), // 8 "b" (80), // 9 "b" (96), // 10 "b" (112) // 11 : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" ); } OpenBLAS-0.2.20/kernel/power/zdot.S000066400000000000000000000317561313527062700167000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) #define RESULT r3 #define N r4 #define X r5 #define INCX r6 #define Y r7 #define INCY r8 #define PREA r9 #else #define N r3 #define X r4 #define INCX r5 #define Y r6 #define INCY r7 #define PREA r8 #endif #define INCXM1 r10 #define INCYM1 r11 #define FZERO f0 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) LDINT INCY, 0(INCY) #endif slwi INCX, INCX, ZBASE_SHIFT slwi INCY, INCY, ZBASE_SHIFT subi INCXM1, INCX, SIZE subi INCYM1, INCY, SIZE fmr f1, FZERO fmr f2, FZERO fmr f3, FZERO fmr f4, FZERO fmr f5, FZERO fmr f6, FZERO fmr f7, FZERO fmr f24, FZERO fmr f25, FZERO fmr f26, FZERO fmr f27, FZERO fmr f28, FZERO fmr f29, FZERO fmr f30, FZERO fmr f31, FZERO #ifdef L1_DUALFETCH li PREA, (L1_PREFETCHSIZE) / 2 #else li PREA, (L1_PREFETCHSIZE) #endif cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 2 * SIZE bne- cr0, LL(100) cmpwi cr0, INCY, 2 * SIZE bne- cr0, LL(100) srawi. 
r0, N, 3 mtspr CTR, r0 beq- cr0, LL(50) .align 4 LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) LFD f10, 2 * SIZE(X) LFD f11, 3 * SIZE(X) LFD f16, 0 * SIZE(Y) LFD f17, 1 * SIZE(Y) LFD f18, 2 * SIZE(Y) LFD f19, 3 * SIZE(Y) LFD f12, 4 * SIZE(X) LFD f13, 5 * SIZE(X) LFD f14, 6 * SIZE(X) LFD f15, 7 * SIZE(X) LFD f20, 4 * SIZE(Y) LFD f21, 5 * SIZE(Y) LFD f22, 6 * SIZE(Y) LFD f23, 7 * SIZE(Y) bdz LL(20) .align 4 LL(10): FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f9, f16, f2 FMADD f3, f8, f17, f3 FMADD f4, f10, f18, f4 FMADD f5, f11, f19, f5 FMADD f6, f11, f18, f6 FMADD f7, f10, f19, f7 LFD f8, 8 * SIZE(X) LFD f9, 9 * SIZE(X) LFD f10, 10 * SIZE(X) LFD f11, 11 * SIZE(X) LFD f16, 8 * SIZE(Y) LFD f17, 9 * SIZE(Y) LFD f18, 10 * SIZE(Y) LFD f19, 11 * SIZE(Y) FMADD f24, f12, f20, f24 FMADD f25, f13, f21, f25 FMADD f26, f13, f20, f26 FMADD f27, f12, f21, f27 FMADD f28, f14, f22, f28 FMADD f29, f15, f23, f29 FMADD f30, f15, f22, f30 FMADD f31, f14, f23, f31 LFD f12, 12 * SIZE(X) LFD f13, 13 * SIZE(X) LFD f14, 14 * SIZE(X) LFD f15, 15 * SIZE(X) LFD f20, 12 * SIZE(Y) LFD f21, 13 * SIZE(Y) LFD f22, 14 * SIZE(Y) LFD f23, 15 * SIZE(Y) FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f9, f16, f2 FMADD f3, f8, f17, f3 FMADD f4, f10, f18, f4 FMADD f5, f11, f19, f5 FMADD f6, f11, f18, f6 FMADD f7, f10, f19, f7 LFD f8, 16 * SIZE(X) LFD f9, 17 * SIZE(X) LFD f10, 18 * SIZE(X) LFD f11, 19 * SIZE(X) LFD f16, 16 * SIZE(Y) LFD f17, 17 * SIZE(Y) LFD f18, 18 * SIZE(Y) LFD f19, 19 * SIZE(Y) FMADD f24, f12, f20, f24 FMADD f25, f13, f21, f25 FMADD f26, f13, f20, f26 FMADD f27, f12, f21, f27 FMADD f28, f14, f22, f28 FMADD f29, f15, f23, f29 FMADD f30, f15, f22, f30 FMADD f31, f14, f23, f31 LFD f12, 20 * SIZE(X) LFD f13, 21 * SIZE(X) LFD f14, 22 * SIZE(X) LFD f15, 23 * SIZE(X) LFD f20, 20 * SIZE(Y) LFD f21, 21 * SIZE(Y) LFD f22, 22 * SIZE(Y) LFD f23, 23 * SIZE(Y) #ifndef POWER6 L1_PREFETCH X, PREA #ifdef L1_DUALFETCH L1_PREFETCH Y, PREA #endif #endif addi X, X, 16 * SIZE addi Y, Y, 16 * SIZE #ifdef POWER6 L1_PREFETCH X, PREA #ifdef L1_DUALFETCH L1_PREFETCH Y, PREA #endif #endif bdnz LL(10) .align 4 LL(20): FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f9, f16, f2 FMADD f3, f8, f17, f3 FMADD f4, f10, f18, f4 FMADD f5, f11, f19, f5 FMADD f6, f11, f18, f6 FMADD f7, f10, f19, f7 LFD f8, 8 * SIZE(X) LFD f9, 9 * SIZE(X) LFD f10, 10 * SIZE(X) LFD f11, 11 * SIZE(X) LFD f16, 8 * SIZE(Y) LFD f17, 9 * SIZE(Y) LFD f18, 10 * SIZE(Y) LFD f19, 11 * SIZE(Y) FMADD f24, f12, f20, f24 FMADD f25, f13, f21, f25 FMADD f26, f13, f20, f26 FMADD f27, f12, f21, f27 FMADD f28, f14, f22, f28 FMADD f29, f15, f23, f29 FMADD f30, f15, f22, f30 FMADD f31, f14, f23, f31 LFD f12, 12 * SIZE(X) LFD f13, 13 * SIZE(X) LFD f14, 14 * SIZE(X) LFD f15, 15 * SIZE(X) LFD f20, 12 * SIZE(Y) LFD f21, 13 * SIZE(Y) LFD f22, 14 * SIZE(Y) LFD f23, 15 * SIZE(Y) FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f9, f16, f2 FMADD f3, f8, f17, f3 FMADD f4, f10, f18, f4 FMADD f5, f11, f19, f5 FMADD f6, f11, f18, f6 FMADD f7, f10, f19, f7 FMADD f24, f12, f20, f24 FMADD f25, f13, f21, f25 FMADD f26, f13, f20, f26 FMADD f27, f12, f21, f27 FMADD f28, f14, f22, f28 FMADD f29, f15, f23, f29 FMADD f30, f15, f22, f30 FMADD f31, f14, f23, f31 addi X, X, 16 * SIZE addi Y, Y, 16 * SIZE .align 4 LL(50): andi. 
r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) LFD f16, 0 * SIZE(Y) LFD f17, 1 * SIZE(Y) addi X, X, 2 * SIZE addi Y, Y, 2 * SIZE FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f9, f16, f2 FMADD f3, f8, f17, f3 bdnz LL(60) b LL(999) .align 4 LL(100): #ifdef F_INTERFACE cmpwi cr0, INCX, 0 bge+ LL(102) subi r0, N, 1 mullw r0, r0, INCX sub X, X, r0 .align 4 LL(102): cmpwi cr0, INCY, 0 bge+ LL(104) subi r0, N, 1 mullw r0, r0, INCY sub Y, Y, r0 .align 4 LL(104): #endif sub X, X, INCXM1 sub Y, Y, INCYM1 srawi. r0, N, 3 mtspr CTR, r0 beq- LL(150) LFDX f8, X, INCXM1 LFDX f16, Y, INCYM1 LFDUX f9, X, INCX LFDUX f17, Y, INCY LFDX f10, X, INCXM1 LFDX f18, Y, INCYM1 LFDUX f11, X, INCX LFDUX f19, Y, INCY LFDX f12, X, INCXM1 LFDX f20, Y, INCYM1 LFDUX f13, X, INCX LFDUX f21, Y, INCY LFDX f14, X, INCXM1 LFDX f22, Y, INCYM1 LFDUX f15, X, INCX LFDUX f23, Y, INCY bdz LL(120) .align 4 LL(110): FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f9, f16, f2 FMADD f3, f8, f17, f3 FMADD f4, f10, f18, f4 FMADD f5, f11, f19, f5 FMADD f6, f11, f18, f6 FMADD f7, f10, f19, f7 LFDX f8, X, INCXM1 LFDX f16, Y, INCYM1 LFDUX f9, X, INCX LFDUX f17, Y, INCY LFDX f10, X, INCXM1 LFDX f18, Y, INCYM1 LFDUX f11, X, INCX LFDUX f19, Y, INCY FMADD f24, f12, f20, f24 FMADD f25, f13, f21, f25 FMADD f26, f13, f20, f26 FMADD f27, f12, f21, f27 FMADD f28, f14, f22, f28 FMADD f29, f15, f23, f29 FMADD f30, f15, f22, f30 FMADD f31, f14, f23, f31 LFDX f12, X, INCXM1 LFDX f20, Y, INCYM1 LFDUX f13, X, INCX LFDUX f21, Y, INCY LFDX f14, X, INCXM1 LFDX f22, Y, INCYM1 LFDUX f15, X, INCX LFDUX f23, Y, INCY FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f9, f16, f2 FMADD f3, f8, f17, f3 FMADD f4, f10, f18, f4 FMADD f5, f11, f19, f5 FMADD f6, f11, f18, f6 FMADD f7, f10, f19, f7 LFDX f8, X, INCXM1 LFDX f16, Y, INCYM1 LFDUX f9, X, INCX LFDUX f17, Y, INCY LFDX f10, X, INCXM1 LFDX f18, Y, INCYM1 LFDUX f11, X, INCX LFDUX f19, Y, INCY FMADD f24, f12, f20, f24 FMADD f25, f13, f21, f25 FMADD f26, f13, f20, f26 FMADD f27, f12, f21, f27 FMADD f28, f14, f22, f28 FMADD f29, f15, f23, f29 FMADD f30, f15, f22, f30 FMADD f31, f14, f23, f31 LFDX f12, X, INCXM1 LFDX f20, Y, INCYM1 LFDUX f13, X, INCX LFDUX f21, Y, INCY LFDX f14, X, INCXM1 LFDX f22, Y, INCYM1 LFDUX f15, X, INCX LFDUX f23, Y, INCY bdnz LL(110) .align 4 LL(120): FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f9, f16, f2 FMADD f3, f8, f17, f3 FMADD f4, f10, f18, f4 FMADD f5, f11, f19, f5 FMADD f6, f11, f18, f6 FMADD f7, f10, f19, f7 LFDX f8, X, INCXM1 LFDX f16, Y, INCYM1 LFDUX f9, X, INCX LFDUX f17, Y, INCY LFDX f10, X, INCXM1 LFDX f18, Y, INCYM1 LFDUX f11, X, INCX LFDUX f19, Y, INCY FMADD f24, f12, f20, f24 FMADD f25, f13, f21, f25 FMADD f26, f13, f20, f26 FMADD f27, f12, f21, f27 FMADD f28, f14, f22, f28 FMADD f29, f15, f23, f29 FMADD f30, f15, f22, f30 FMADD f31, f14, f23, f31 LFDX f12, X, INCXM1 LFDX f20, Y, INCYM1 LFDUX f13, X, INCX LFDUX f21, Y, INCY LFDX f14, X, INCXM1 LFDX f22, Y, INCYM1 LFDUX f15, X, INCX LFDUX f23, Y, INCY FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f9, f16, f2 FMADD f3, f8, f17, f3 FMADD f4, f10, f18, f4 FMADD f5, f11, f19, f5 FMADD f6, f11, f18, f6 FMADD f7, f10, f19, f7 FMADD f24, f12, f20, f24 FMADD f25, f13, f21, f25 FMADD f26, f13, f20, f26 FMADD f27, f12, f21, f27 FMADD f28, f14, f22, f28 FMADD f29, f15, f23, f29 FMADD f30, f15, f22, f30 FMADD f31, f14, f23, f31 .align 4 LL(150): andi. 
r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDX f8, X, INCXM1 LFDUX f9, X, INCX LFDX f16, Y, INCYM1 LFDUX f17, Y, INCY FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f9, f16, f2 FMADD f3, f8, f17, f3 bdnz LL(160) .align 4 LL(999): FADD f0, f0, f4 FADD f1, f1, f5 FADD f2, f2, f6 FADD f3, f3, f7 FADD f24, f28, f24 FADD f25, f29, f25 FADD f26, f30, f26 FADD f27, f31, f27 FADD f0, f0, f24 FADD f1, f1, f25 FADD f2, f2, f26 FADD f3, f3, f27 #ifndef CONJ FSUB f1, f0, f1 FADD f2, f2, f3 #else FADD f1, f0, f1 FSUB f2, f3, f2 #endif #if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) STFD f1, 0 * SIZE(RESULT) STFD f2, 1 * SIZE(RESULT) #endif #if defined(F_INTERFACE) && defined(F_INTERFACE_GFORT) #ifndef __64BIT__ #ifndef DOUBLE stfs f1, 144(SP) stfs f2, 148(SP) lwz r3, 144(SP) lwz r4, 148(SP) #else stfd f1, 144(SP) stfd f2, 152(SP) lwz r3, 144(SP) lwz r4, 148(SP) lwz r5, 152(SP) lwz r6, 156(SP) #endif #else #ifndef DOUBLE stfs f1, 144(SP) stfs f2, 148(SP) ld r3, 144(SP) #else stfd f1, 144(SP) stfd f2, 152(SP) ld r3, 144(SP) ld r4, 152(SP) #endif #endif #endif lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zdot.c000066400000000000000000000077611313527062700167170ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/03/21 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #if defined(POWER8) #include "zdot_microk_power8.c" #endif #ifndef HAVE_KERNEL_8 static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { BLASLONG register i = 0; FLOAT dot[4] = { 0.0, 0.0, 0.0, 0.0 }; BLASLONG j=0; while( i < n ) { dot[0] += x[j] * y[j] ; dot[1] += x[j+1] * y[j+1] ; dot[2] += x[j] * y[j+1] ; dot[3] += x[j+1] * y[j] ; dot[0] += x[j+2] * y[j+2] ; dot[1] += x[j+3] * y[j+3] ; dot[2] += x[j+2] * y[j+3] ; dot[3] += x[j+3] * y[j+2] ; dot[0] += x[j+4] * y[j+4] ; dot[1] += x[j+5] * y[j+5] ; dot[2] += x[j+4] * y[j+5] ; dot[3] += x[j+5] * y[j+4] ; dot[0] += x[j+6] * y[j+6] ; dot[1] += x[j+7] * y[j+7] ; dot[2] += x[j+6] * y[j+7] ; dot[3] += x[j+7] * y[j+6] ; j+=8; i+=4; } d[0] = dot[0]; d[1] = dot[1]; d[2] = dot[2]; d[3] = dot[3]; } #endif FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i; BLASLONG ix,iy; FLOAT _Complex result; FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; if ( n <= 0 ) { __real__ result = 0.0 ; __imag__ result = 0.0 ; return(result); } if ( (inc_x == 1) && (inc_y == 1) ) { BLASLONG n1 = n & -8; if ( n1 ) zdot_kernel_8(n1, x, y , dot ); i = n1; BLASLONG j = i * 2; while( i < n ) { dot[0] += x[j] * y[j] ; dot[1] += x[j+1] * y[j+1] ; dot[2] += x[j] * y[j+1] ; dot[3] += x[j+1] * y[j] ; j+=2; i++ ; } } else { i=0; ix=0; iy=0; inc_x <<= 1; inc_y <<= 1; while(i < n) { dot[0] += x[ix] * y[iy] ; dot[1] += x[ix+1] * y[iy+1] ; dot[2] += x[ix] * y[iy+1] ; dot[3] += x[ix+1] * y[iy] ; ix += inc_x ; iy += inc_y ; i++ ; } } #if !defined(CONJ) __real__ result = dot[0] - dot[1]; __imag__ result = dot[2] + dot[3]; #else __real__ result = dot[0] + dot[1]; __imag__ result = dot[2] - dot[3]; #endif return(result); } OpenBLAS-0.2.20/kernel/power/zdot_cell.S000066400000000000000000000313771313527062700176760ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
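   About this file: an assembly variant of the complex dot product in zdot.c.
   The fast path requires INCX == INCY == 2*SIZE and unrolls by eight complex
   elements, spreading the four kinds of partial products (x_r*y_r, x_i*y_i,
   x_r*y_i, x_i*y_r) across f0-f7 and f24-f31 with dcbt software prefetch;
   anything else branches to the strided loop at LL(100). LL(999) folds the
   partial sums together, with the sign pattern selected by the CONJ macro.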
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) #define RESULT r3 #define N r4 #define X r5 #define INCX r6 #define Y r7 #define INCY r8 #define PREA r9 #else #define N r3 #define X r4 #define INCX r5 #define Y r6 #define INCY r7 #define PREA r8 #endif #define INCXM1 r10 #define INCYM1 r11 #define FZERO f0 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r0, 144(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) LDINT INCY, 0(INCY) #endif slwi INCX, INCX, ZBASE_SHIFT slwi INCY, INCY, ZBASE_SHIFT subi INCXM1, INCX, SIZE subi INCYM1, INCY, SIZE fmr f1, FZERO fmr f2, FZERO fmr f3, FZERO fmr f4, FZERO fmr f5, FZERO fmr f6, FZERO fmr f7, FZERO fmr f24, FZERO fmr f25, FZERO fmr f26, FZERO fmr f27, FZERO fmr f28, FZERO fmr f29, FZERO fmr f30, FZERO fmr f31, FZERO li PREA, 16 * 10 * SIZE cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 2 * SIZE bne- cr0, LL(100) cmpwi cr0, INCY, 2 * SIZE bne- cr0, LL(100) srawi. 
r0, N, 3 mtspr CTR, r0 beq- cr0, LL(50) .align 4 LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) LFD f10, 2 * SIZE(X) LFD f11, 3 * SIZE(X) LFD f16, 0 * SIZE(Y) LFD f17, 1 * SIZE(Y) LFD f18, 2 * SIZE(Y) LFD f19, 3 * SIZE(Y) LFD f12, 4 * SIZE(X) LFD f13, 5 * SIZE(X) LFD f14, 6 * SIZE(X) LFD f15, 7 * SIZE(X) LFD f20, 4 * SIZE(Y) LFD f21, 5 * SIZE(Y) LFD f23, 7 * SIZE(Y) bdz LL(20) .align 4 LL(10): FMADD f0, f8, f16, f0 LFD f22, 6 * SIZE(Y) FMADD f3, f8, f17, f3 LFD f8, 8 * SIZE(X) FMADD f1, f9, f17, f1 LFD f17, 9 * SIZE(Y) FMADD f2, f9, f16, f2 LFD f9, 9 * SIZE(X) FMADD f4, f10, f18, f4 LFD f16, 8 * SIZE(Y) FMADD f7, f10, f19, f7 LFD f10, 10 * SIZE(X) FMADD f5, f11, f19, f5 LFD f19, 11 * SIZE(Y) FMADD f6, f11, f18, f6 LFD f11, 11 * SIZE(X) FMADD f24, f12, f20, f24 LFD f18, 10 * SIZE(Y) FMADD f27, f12, f21, f27 LFD f12, 12 * SIZE(X) FMADD f25, f13, f21, f25 LFD f21, 13 * SIZE(Y) FMADD f26, f13, f20, f26 LFD f13, 13 * SIZE(X) FMADD f28, f14, f22, f28 LFD f20, 12 * SIZE(Y) FMADD f31, f14, f23, f31 LFD f14, 14 * SIZE(X) FMADD f29, f15, f23, f29 LFD f23, 15 * SIZE(Y) FMADD f30, f15, f22, f30 LFD f15, 15 * SIZE(X) FMADD f0, f8, f16, f0 LFD f22, 14 * SIZE(Y) FMADD f3, f8, f17, f3 LFD f8, 16 * SIZE(X) FMADD f1, f9, f17, f1 LFD f17, 17 * SIZE(Y) FMADD f2, f9, f16, f2 LFD f9, 17 * SIZE(X) FMADD f4, f10, f18, f4 LFD f16, 16 * SIZE(Y) FMADD f7, f10, f19, f7 LFD f10, 18 * SIZE(X) FMADD f5, f11, f19, f5 LFD f19, 19 * SIZE(Y) FMADD f6, f11, f18, f6 LFD f11, 19 * SIZE(X) FMADD f24, f12, f20, f24 LFD f18, 18 * SIZE(Y) FMADD f27, f12, f21, f27 LFD f12, 20 * SIZE(X) FMADD f25, f13, f21, f25 LFD f21, 21 * SIZE(Y) FMADD f26, f13, f20, f26 LFD f13, 21 * SIZE(X) FMADD f28, f14, f22, f28 LFD f20, 20 * SIZE(Y) FMADD f31, f14, f23, f31 LFD f14, 22 * SIZE(X) FMADD f29, f15, f23, f29 LFD f23, 23 * SIZE(Y) FMADD f30, f15, f22, f30 LFD f15, 23 * SIZE(X) dcbt X, PREA addi X, X, 16 * SIZE dcbt Y, PREA addi Y, Y, 16 * SIZE bdnz LL(10) .align 4 LL(20): FMADD f0, f8, f16, f0 LFD f22, 6 * SIZE(Y) FMADD f3, f8, f17, f3 LFD f8, 8 * SIZE(X) FMADD f1, f9, f17, f1 LFD f17, 9 * SIZE(Y) FMADD f2, f9, f16, f2 LFD f9, 9 * SIZE(X) FMADD f4, f10, f18, f4 LFD f16, 8 * SIZE(Y) FMADD f7, f10, f19, f7 LFD f10, 10 * SIZE(X) FMADD f5, f11, f19, f5 LFD f19, 11 * SIZE(Y) FMADD f6, f11, f18, f6 LFD f11, 11 * SIZE(X) FMADD f24, f12, f20, f24 LFD f18, 10 * SIZE(Y) FMADD f27, f12, f21, f27 LFD f12, 12 * SIZE(X) FMADD f25, f13, f21, f25 LFD f21, 13 * SIZE(Y) FMADD f26, f13, f20, f26 LFD f13, 13 * SIZE(X) FMADD f28, f14, f22, f28 LFD f20, 12 * SIZE(Y) FMADD f31, f14, f23, f31 LFD f14, 14 * SIZE(X) FMADD f29, f15, f23, f29 LFD f23, 15 * SIZE(Y) FMADD f30, f15, f22, f30 LFD f15, 15 * SIZE(X) FMADD f0, f8, f16, f0 LFD f22, 14 * SIZE(Y) FMADD f3, f8, f17, f3 addi X, X, 16 * SIZE FMADD f1, f9, f17, f1 addi Y, Y, 16 * SIZE FMADD f2, f9, f16, f2 nop FMADD f4, f10, f18, f4 FMADD f7, f10, f19, f7 FMADD f5, f11, f19, f5 FMADD f6, f11, f18, f6 FMADD f24, f12, f20, f24 FMADD f27, f12, f21, f27 FMADD f25, f13, f21, f25 FMADD f26, f13, f20, f26 FMADD f28, f14, f22, f28 FMADD f31, f14, f23, f31 FMADD f29, f15, f23, f29 FMADD f30, f15, f22, f30 .align 4 LL(50): andi. 
r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) LFD f16, 0 * SIZE(Y) LFD f17, 1 * SIZE(Y) addi X, X, 2 * SIZE addi Y, Y, 2 * SIZE FMADD f0, f8, f16, f0 FMADD f3, f8, f17, f3 FMADD f1, f9, f17, f1 FMADD f2, f9, f16, f2 bdnz LL(60) b LL(999) .align 4 LL(100): #ifdef F_INTERFACE cmpwi cr0, INCX, 0 bge+ LL(102) subi r0, N, 1 mullw r0, r0, INCX sub X, X, r0 .align 4 LL(102): cmpwi cr0, INCY, 0 bge+ LL(104) subi r0, N, 1 mullw r0, r0, INCY sub Y, Y, r0 .align 4 LL(104): #endif sub X, X, INCXM1 sub Y, Y, INCYM1 srawi. r0, N, 3 mtspr CTR, r0 beq- LL(150) LFDX f8, X, INCXM1 LFDX f16, Y, INCYM1 LFDUX f9, X, INCX LFDUX f17, Y, INCY LFDX f10, X, INCXM1 LFDX f18, Y, INCYM1 LFDUX f11, X, INCX LFDUX f19, Y, INCY LFDX f12, X, INCXM1 LFDX f20, Y, INCYM1 LFDUX f13, X, INCX LFDUX f21, Y, INCY LFDX f14, X, INCXM1 LFDUX f15, X, INCX bdz LL(120) .align 4 LL(110): FMADD f0, f8, f16, f0 LFDX f22, Y, INCYM1 FMADD f3, f8, f17, f3 LFDX f8, X, INCXM1 FMADD f1, f9, f17, f1 LFDUX f23, Y, INCY FMADD f2, f9, f16, f2 LFDUX f9, X, INCX FMADD f4, f10, f18, f4 LFDX f16, Y, INCYM1 FMADD f7, f10, f19, f7 LFDX f10, X, INCXM1 FMADD f5, f11, f19, f5 LFDUX f17, Y, INCY FMADD f6, f11, f18, f6 LFDUX f11, X, INCX FMADD f24, f12, f20, f24 LFDX f18, Y, INCYM1 FMADD f27, f12, f21, f27 LFDX f12, X, INCXM1 FMADD f25, f13, f21, f25 LFDUX f19, Y, INCY FMADD f26, f13, f20, f26 LFDUX f13, X, INCX FMADD f28, f14, f22, f28 LFDX f20, Y, INCYM1 FMADD f31, f14, f23, f31 LFDX f14, X, INCXM1 FMADD f29, f15, f23, f29 LFDUX f21, Y, INCY FMADD f30, f15, f22, f30 LFDUX f15, X, INCX FMADD f0, f8, f16, f0 LFDX f22, Y, INCYM1 FMADD f3, f8, f17, f3 LFDX f8, X, INCXM1 FMADD f1, f9, f17, f1 LFDUX f23, Y, INCY FMADD f2, f9, f16, f2 LFDUX f9, X, INCX FMADD f4, f10, f18, f4 LFDX f16, Y, INCYM1 FMADD f7, f10, f19, f7 LFDX f10, X, INCXM1 FMADD f5, f11, f19, f5 LFDUX f17, Y, INCY FMADD f6, f11, f18, f6 LFDUX f11, X, INCX FMADD f24, f12, f20, f24 LFDX f18, Y, INCYM1 FMADD f27, f12, f21, f27 LFDX f12, X, INCXM1 FMADD f25, f13, f21, f25 LFDUX f19, Y, INCY FMADD f26, f13, f20, f26 LFDUX f13, X, INCX FMADD f28, f14, f22, f28 LFDX f20, Y, INCYM1 FMADD f31, f14, f23, f31 LFDX f14, X, INCXM1 FMADD f29, f15, f23, f29 LFDUX f21, Y, INCY FMADD f30, f15, f22, f30 LFDUX f15, X, INCX bdnz LL(110) .align 4 LL(120): FMADD f0, f8, f16, f0 LFDX f22, Y, INCYM1 FMADD f3, f8, f17, f3 LFDX f8, X, INCXM1 FMADD f1, f9, f17, f1 LFDUX f23, Y, INCY FMADD f2, f9, f16, f2 LFDUX f9, X, INCX FMADD f4, f10, f18, f4 LFDX f16, Y, INCYM1 FMADD f7, f10, f19, f7 LFDX f10, X, INCXM1 FMADD f5, f11, f19, f5 LFDUX f17, Y, INCY FMADD f6, f11, f18, f6 LFDUX f11, X, INCX FMADD f24, f12, f20, f24 LFDX f18, Y, INCYM1 FMADD f27, f12, f21, f27 LFDX f12, X, INCXM1 FMADD f25, f13, f21, f25 LFDUX f19, Y, INCY FMADD f26, f13, f20, f26 LFDUX f13, X, INCX FMADD f28, f14, f22, f28 LFDX f20, Y, INCYM1 FMADD f31, f14, f23, f31 LFDX f14, X, INCXM1 FMADD f29, f15, f23, f29 LFDUX f21, Y, INCY FMADD f30, f15, f22, f30 LFDUX f15, X, INCX FMADD f0, f8, f16, f0 LFDX f22, Y, INCYM1 FMADD f3, f8, f17, f3 LFDUX f23, Y, INCY FMADD f1, f9, f17, f1 FMADD f2, f9, f16, f2 FMADD f4, f10, f18, f4 FMADD f7, f10, f19, f7 FMADD f5, f11, f19, f5 FMADD f6, f11, f18, f6 FMADD f24, f12, f20, f24 FMADD f27, f12, f21, f27 FMADD f25, f13, f21, f25 FMADD f26, f13, f20, f26 FMADD f28, f14, f22, f28 FMADD f31, f14, f23, f31 FMADD f29, f15, f23, f29 FMADD f30, f15, f22, f30 .align 4 LL(150): andi. 
r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDX f8, X, INCXM1 LFDUX f9, X, INCX LFDX f16, Y, INCYM1 LFDUX f17, Y, INCY FMADD f0, f8, f16, f0 FMADD f3, f8, f17, f3 FMADD f1, f9, f17, f1 FMADD f2, f9, f16, f2 bdnz LL(160) .align 4 LL(999): FADD f0, f0, f4 FADD f1, f1, f5 FADD f2, f2, f6 FADD f3, f3, f7 FADD f24, f28, f24 FADD f25, f29, f25 FADD f26, f30, f26 FADD f27, f31, f27 FADD f0, f0, f24 FADD f1, f1, f25 FADD f2, f2, f26 FADD f3, f3, f27 #ifndef CONJ FSUB f1, f0, f1 FADD f2, f2, f3 #else FADD f1, f0, f1 FSUB f2, f3, f2 #endif #if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) STFD f1, 0 * SIZE(RESULT) STFD f2, 1 * SIZE(RESULT) #endif #if defined(F_INTERFACE) && defined(F_INTERFACE_GFORT) #ifndef __64BIT__ #ifndef DOUBLE stfs f1, 144(SP) stfs f2, 148(SP) lwz r3, 144(SP) lwz r4, 148(SP) #else stfd f1, 144(SP) stfd f2, 152(SP) lwz r3, 144(SP) lwz r4, 148(SP) lwz r5, 152(SP) lwz r6, 156(SP) #endif #else #ifndef DOUBLE stfs f1, 144(SP) stfs f2, 148(SP) ld r3, 144(SP) #else stfd f1, 144(SP) stfd f2, 152(SP) ld r3, 144(SP) ld r4, 152(SP) #endif #endif #endif lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zdot_hummer.S000066400000000000000000000244621313527062700202510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
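   About this file: a complex dot product written for the paired
   ("double hummer") floating-point unit, presumably the Blue Gene class of
   cores. LFPDUX loads a real/imaginary pair per instruction, and fxcpmadd
   together with FXCXNPMA (fxcxnpma, or fxcxnsma when CONJ is defined)
   accumulate both halves of each complex product at once. If X or Y is not
   aligned to 2*SIZE, or the strides are not unit, the code falls back to the
   scalar loop at LL(100).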
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) #define RESULT r3 #define N r4 #define X r5 #define INCX r6 #define Y r7 #define INCY r8 #else #define N r3 #define X r4 #define INCX r5 #define Y r6 #define INCY r7 #endif #define INCX2 r9 #define INCY2 r10 #define C1 f1 #define C2 f2 #define C3 f0 #define C4 f3 #define C5 f4 #define C6 f5 #define C7 f6 #define C8 f7 #define A1 f8 #define A2 f9 #define A3 f10 #define A4 f11 #define A5 f12 #define A6 f13 #define A7 f14 #define A8 f15 #define B1 f16 #define B2 f17 #define B3 f18 #define B4 f19 #define B5 f20 #define B6 f21 #define B7 f22 #define B8 f23 #ifndef CONJ #define FXCXNPMA fxcxnpma #else #define FXCXNPMA fxcxnsma #endif PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 stfpdux f19, SP, r10 stfpdux f20, SP, r10 stfpdux f21, SP, r10 stfpdux f22, SP, r10 stfpdux f23, SP, r10 li r10, 0 stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) LDINT INCY, 0(INCY) #endif lfpdx C1, SP, r10 # Zero clear slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX fpmr C2, C1 slwi INCY, INCY, BASE_SHIFT fpmr C3, C1 add INCY2, INCY, INCY fpmr C4, C1 fpmr C5, C1 fpmr C6, C1 fpmr C7, C1 fpmr C8, C1 cmpwi cr0, N, 0 ble LL(99) #ifdef F_INTERFACE cmpwi cr0, INCX, 0 bge+ LL(05) subi r0, N, 1 mullw r0, r0, INCX2 sub X, X, r0 .align 4 LL(05): cmpwi cr0, INCY, 0 bge+ LL(06) subi r0, N, 1 mullw r0, r0, INCY2 sub Y, Y, r0 .align 4 LL(06): #endif andi. r0, X, 2 * SIZE - 1 bne LL(100) andi. r0, Y, 2 * SIZE - 1 bne LL(100) /* X is aligned, Y is aligned */ LL(10): sub X, X, INCX2 sub Y, Y, INCY2 srawi. r0, N, 3 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 LFPDUX A3, X, INCX2 LFPDUX B3, Y, INCY2 LFPDUX A4, X, INCX2 LFPDUX B4, Y, INCY2 LFPDUX A5, X, INCX2 LFPDUX B5, Y, INCY2 LFPDUX A6, X, INCX2 LFPDUX B6, Y, INCY2 LFPDUX A7, X, INCX2 LFPDUX B7, Y, INCY2 LFPDUX A8, X, INCX2 bdz LL(14) .align 4 LL(13): fxcpmadd C1, A1, B1, C1 LFPDUX B8, Y, INCY2 FXCXNPMA C2, A1, B1, C2 LFPDUX A1, X, INCX2 fxcpmadd C3, A2, B2, C3 LFPDUX B1, Y, INCY2 FXCXNPMA C4, A2, B2, C4 LFPDUX A2, X, INCX2 fxcpmadd C5, A3, B3, C5 LFPDUX B2, Y, INCY2 FXCXNPMA C6, A3, B3, C6 LFPDUX A3, X, INCX2 fxcpmadd C7, A4, B4, C7 LFPDUX B3, Y, INCY2 FXCXNPMA C8, A4, B4, C8 LFPDUX A4, X, INCX2 fxcpmadd C1, A5, B5, C1 LFPDUX B4, Y, INCY2 FXCXNPMA C2, A5, B5, C2 LFPDUX A5, X, INCX2 fxcpmadd C3, A6, B6, C3 LFPDUX B5, Y, INCY2 FXCXNPMA C4, A6, B6, C4 LFPDUX A6, X, INCX2 fxcpmadd C5, A7, B7, C5 LFPDUX B6, Y, INCY2 FXCXNPMA C6, A7, B7, C6 LFPDUX A7, X, INCX2 fxcpmadd C7, A8, B8, C7 LFPDUX B7, Y, INCY2 FXCXNPMA C8, A8, B8, C8 LFPDUX A8, X, INCX2 bdnz LL(13) .align 4 LL(14): LFPDUX B8, Y, INCY2 fxcpmadd C1, A1, B1, C1 FXCXNPMA C2, A1, B1, C2 fxcpmadd C3, A2, B2, C3 FXCXNPMA C4, A2, B2, C4 fxcpmadd C5, A3, B3, C5 FXCXNPMA C6, A3, B3, C6 fxcpmadd C7, A4, B4, C7 FXCXNPMA C8, A4, B4, C8 fxcpmadd C1, A5, B5, C1 FXCXNPMA C2, A5, B5, C2 fxcpmadd C3, A6, B6, C3 FXCXNPMA C4, A6, B6, C4 fxcpmadd C5, A7, B7, C5 FXCXNPMA C6, A7, B7, C6 fxcpmadd C7, A8, B8, C7 FXCXNPMA C8, A8, B8, C8 .align 4 LL(15): andi. r0, N, 7 beq LL(99) andi. 
r0, N, 4 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 LFPDUX A3, X, INCX2 LFPDUX B3, Y, INCY2 LFPDUX A4, X, INCX2 LFPDUX B4, Y, INCY2 fxcpmadd C1, A1, B1, C1 FXCXNPMA C2, A1, B1, C2 fxcpmadd C3, A2, B2, C3 FXCXNPMA C4, A2, B2, C4 fxcpmadd C5, A3, B3, C5 FXCXNPMA C6, A3, B3, C6 fxcpmadd C7, A4, B4, C7 FXCXNPMA C8, A4, B4, C8 .align 4 LL(16): andi. r0, N, 2 beq LL(17) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 fxcpmadd C1, A1, B1, C1 FXCXNPMA C2, A1, B1, C2 fxcpmadd C3, A2, B2, C3 FXCXNPMA C4, A2, B2, C4 .align 4 LL(17): andi. r0, N, 1 beq LL(99) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 fxcpmadd C1, A1, B1, C1 FXCXNPMA C2, A1, B1, C2 .align 4 LL(99): li r10, 16 fpadd C1, C1, C5 lfpdux f23, SP, r10 fpadd C2, C2, C6 lfpdux f22, SP, r10 fpadd C3, C3, C7 lfpdux f21, SP, r10 fpadd C4, C4, C8 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 fpadd C1, C1, C3 lfpdux f17, SP, r10 fpadd C2, C2, C4 lfpdux f16, SP, r10 fpadd C1, C1, C2 lfpdux f15, SP, r10 lfpdux f14, SP, r10 fsmtp C2, C1 #if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) STFD C1, 0 * SIZE(RESULT) STFD C2, 1 * SIZE(RESULT) #endif addi SP, SP, 16 blr .align 4 /* X is aligned, Y is NOT aligned */ LL(100): subi INCX2, INCX2, SIZE subi INCY2, INCY2, SIZE li INCX, SIZE li INCY, SIZE sub X, X, INCX2 sub Y, Y, INCY2 srawi. r0, N, 2 mtspr CTR, r0 beq- LL(105) LFDUX A1, X, INCX2 LFDUX B1, Y, INCY2 LFDUX A2, X, INCX LFDUX B2, Y, INCY LFDUX A3, X, INCX2 LFDUX B3, Y, INCY2 LFDUX A4, X, INCX LFDUX B4, Y, INCY LFDUX A5, X, INCX2 LFDUX B5, Y, INCY2 LFDUX A6, X, INCX LFDUX B6, Y, INCY LFDUX A7, X, INCX2 LFDUX B7, Y, INCY2 LFDUX A8, X, INCX bdz LL(104) .align 4 LL(103): fmadd C1, A1, B1, C1 LFDUX B8, Y, INCY fmadd C2, A1, B2, C2 LFDUX A1, X, INCX2 fmadd C3, A2, B1, C3 LFDUX B1, Y, INCY2 fmadd C4, A2, B2, C4 LFDUX A2, X, INCX fmadd C5, A3, B3, C5 LFDUX B2, Y, INCY fmadd C6, A3, B4, C6 LFDUX A3, X, INCX2 fmadd C7, A4, B3, C7 LFDUX B3, Y, INCY2 fmadd C8, A4, B4, C8 LFDUX A4, X, INCX fmadd C1, A5, B5, C1 LFDUX B4, Y, INCY fmadd C2, A5, B6, C2 LFDUX A5, X, INCX2 fmadd C3, A6, B5, C3 LFDUX B5, Y, INCY2 fmadd C4, A6, B6, C4 LFDUX A6, X, INCX fmadd C5, A7, B7, C5 LFDUX B6, Y, INCY fmadd C6, A7, B8, C6 LFDUX A7, X, INCX2 fmadd C7, A8, B7, C7 LFDUX B7, Y, INCY2 fmadd C8, A8, B8, C8 LFDUX A8, X, INCX bdnz LL(103) .align 4 LL(104): LFDUX B8, Y, INCY fmadd C1, A1, B1, C1 fmadd C2, A1, B2, C2 fmadd C3, A2, B1, C3 fmadd C4, A2, B2, C4 fmadd C5, A3, B3, C5 fmadd C6, A3, B4, C6 fmadd C7, A4, B3, C7 fmadd C8, A4, B4, C8 fmadd C1, A5, B5, C1 fmadd C2, A5, B6, C2 fmadd C3, A6, B5, C3 fmadd C4, A6, B6, C4 fmadd C5, A7, B7, C5 fmadd C6, A7, B8, C6 fmadd C7, A8, B7, C7 fmadd C8, A8, B8, C8 .align 4 LL(105): andi. r0, N, 3 beq LL(999) andi. r0, N, 2 beq LL(107) LFDUX A1, X, INCX2 LFDUX B1, Y, INCY2 LFDUX A2, X, INCX LFDUX B2, Y, INCY LFDUX A3, X, INCX2 LFDUX B3, Y, INCY2 LFDUX A4, X, INCX LFDUX B4, Y, INCY fmadd C1, A1, B1, C1 fmadd C2, A1, B2, C2 fmadd C3, A2, B1, C3 fmadd C4, A2, B2, C4 fmadd C5, A3, B3, C5 fmadd C6, A3, B4, C6 fmadd C7, A4, B3, C7 fmadd C8, A4, B4, C8 .align 4 LL(107): andi. 
r0, N, 1 beq LL(999) LFDUX A1, X, INCX2 LFDUX B1, Y, INCY2 LFDUX A2, X, INCX LFDUX B2, Y, INCY fmadd C1, A1, B1, C1 fmadd C2, A1, B2, C2 fmadd C3, A2, B1, C3 fmadd C4, A2, B2, C4 .align 4 LL(999): li r10, 16 fadd C1, C1, C5 lfpdux f23, SP, r10 fadd C2, C2, C6 lfpdux f22, SP, r10 fadd C3, C3, C7 lfpdux f21, SP, r10 fadd C4, C4, C8 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 #ifndef CONJ FSUB C1, C1, C4 FADD C2, C2, C3 #else FADD C1, C1, C4 FSUB C2, C2, C3 #endif lfpdux f15, SP, r10 lfpdux f14, SP, r10 #if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) STFD C1, 0 * SIZE(RESULT) STFD C2, 1 * SIZE(RESULT) #endif addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zdot_microk_power8.c000066400000000000000000000172731313527062700215660ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
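   Note on the kernel below: this is the POWER8/VSX counterpart of the generic
   zdot_kernel_8 in zdot.c. Each iteration streams eight complex doubles from
   x and y with lxvd2x, builds (imag, real) swapped copies of the y vectors
   with xxswapd, and uses xvmaddadp to accumulate the r*r and i*i products in
   vs32, vs34, vs36, vs38 and the cross products in vs33, vs35, vs37, vs39.
   The trailing xvadddp reductions and the two stxvd2x stores leave the same
   dot[0..3] partial sums that the C wrapper combines into the final result.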
*****************************************************************************/ /************************************************************************************** * 2016/03/21 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_8 1 static void zdot_kernel_8 (long n, double *x, double *y, double *dot) { __asm__ ( "dcbt 0, %2 \n\t" "dcbt 0, %3 \n\t" "xxlxor 32, 32, 32 \n\t" "xxlxor 33, 33, 33 \n\t" "xxlxor 34, 34, 34 \n\t" "xxlxor 35, 35, 35 \n\t" "xxlxor 36, 36, 36 \n\t" "xxlxor 37, 37, 37 \n\t" "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i "lxvd2x 41, %7, %2 \n\t" // x1_r, x1_i "lxvd2x 49, %7, %3 \n\t" // y1_r, y1_i "lxvd2x 42, %8, %2 \n\t" // x2_r, x2_i "lxvd2x 50, %8, %3 \n\t" // y2_r, y2_i "lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i "lxvd2x 51, %9, %3 \n\t" // y3_r, y3_i "xxswapd 0, 48 \n\t" // y0_i, y0_r "xxswapd 1, 49 \n\t" // y1_i, y1_r "xxswapd 2, 50 \n\t" // y2_i, y2_r "xxswapd 3, 51 \n\t" // y3_i, y3_r "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i "lxvd2x 4, 0, %3 \n\t" // y0_r, y0_i "lxvd2x 45, %7, %2 \n\t" // x1_r, x1_i "lxvd2x 5, %7, %3 \n\t" // y1_r, y1_i "lxvd2x 46, %8, %2 \n\t" // x2_r, x2_i "lxvd2x 6, %8, %3 \n\t" // y2_r, y2_i "lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i "lxvd2x 7, %9, %3 \n\t" // y3_r, y3_i "xxswapd 8, 4 \n\t" // y0_i, y0_r "xxswapd 9, 5 \n\t" // y1_i, y1_r "xxswapd 10, 6 \n\t" // y2_i, y2_r "xxswapd 11, 7 \n\t" // y3_i, y3_r "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" "addic. %1, %1, -8 \n\t" "ble 2f \n\t" ".p2align 5 \n" "1: \n\t" "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i "lxvd2x 49, %7, %3 \n\t" // y1_r, y1_i "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i "lxvd2x 50, %8, %3 \n\t" // y2_r, y2_i "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i "lxvd2x 51, %9, %3 \n\t" // y3_r, y3_i "xvmaddadp 33, 40, 0 \n\t" // x0_r * y0_i , x0_i * y0_r "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i "xvmaddadp 35, 41, 1 \n\t" // x1_r * y1_i , x1_i * y1_r "lxvd2x 41, %7, %2 \n\t" // x1_r, x1_i "xvmaddadp 37, 42, 2 \n\t" // x2_r * y2_i , x2_i * y2_r "lxvd2x 42, %8, %2 \n\t" // x2_r, x2_i "xvmaddadp 39, 43, 3 \n\t" // x3_r * y3_i , x3_i * y3_r "lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i "xxswapd 0,48 \n\t" // y0_i, y0_r "xxswapd 1,49 \n\t" // y1_i, y1_r "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" "xxswapd 2,50 \n\t" // y2_i, y2_r "xxswapd 3,51 \n\t" // y3_i, y3_r "xvmaddadp 32, 44, 4 \n\t" // x0_r * y0_r , x0_i * y0_i "lxvd2x 4, 0, %3 \n\t" // y0_r, y0_i "xvmaddadp 34, 45, 5 \n\t" // x1_r * y1_r , x1_i * y1_i "lxvd2x 5, %7, %3 \n\t" // y1_r, y1_i "xvmaddadp 36, 46, 6 \n\t" // x2_r * y2_r , x2_i * y2_i "lxvd2x 6, %8, %3 \n\t" // y2_r, y2_i "xvmaddadp 38, 47, 7 \n\t" // x3_r * y3_r , x3_i * y3_i "lxvd2x 7, %9, %3 \n\t" // y3_r, y3_i "xvmaddadp 33, 44, 8 \n\t" // x0_r * y0_i , x0_i * y0_r "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i "xvmaddadp 35, 45, 9 \n\t" // x1_r * y1_i , x1_i * y1_r "lxvd2x 45, %7, %2 \n\t" // x1_r, x1_i "xvmaddadp 37, 46, 10 \n\t" // x2_r * y2_i , x2_i * y2_r "lxvd2x 46, %8, %2 \n\t" // x2_r, x2_i "xvmaddadp 39, 47, 11 \n\t" // x3_r * y3_i , x3_i * y3_r "lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i "xxswapd 8,4 \n\t" // y0_i, y0_r "xxswapd 9,5 \n\t" // y1_i, y1_r "addi %2, %2, 64 \n\t" "addi 
%3, %3, 64 \n\t" "xxswapd 10,6 \n\t" // y2_i, y2_r "xxswapd 11,7 \n\t" // y3_i, y3_r "addic. %1, %1, -8 \n\t" "bgt 1b \n" "2: \n\t" "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i "xvmaddadp 33, 40, 0 \n\t" // x0_r * y0_i , x0_i * y0_r "xvmaddadp 35, 41, 1 \n\t" // x1_r * y1_i , x1_i * y1_r "xvmaddadp 37, 42, 2 \n\t" // x2_r * y2_i , x2_i * y2_r "xvmaddadp 39, 43, 3 \n\t" // x3_r * y3_i , x3_i * y3_r "xvmaddadp 32, 44, 4 \n\t" // x0_r * y0_r , x0_i * y0_i "xvmaddadp 34, 45, 5 \n\t" // x1_r * y1_r , x1_i * y1_i "xvmaddadp 36, 46, 6 \n\t" // x2_r * y2_r , x2_i * y2_i "xvmaddadp 38, 47, 7 \n\t" // x3_r * y3_r , x3_i * y3_i "xvmaddadp 33, 44, 8 \n\t" // x0_r * y0_i , x0_i * y0_r "xvmaddadp 35, 45, 9 \n\t" // x1_r * y1_i , x1_i * y1_r "xvmaddadp 37, 46, 10 \n\t" // x2_r * y2_i , x2_i * y2_r "xvmaddadp 39, 47, 11 \n\t" // x3_r * y3_i , x3_i * y3_r "xvadddp 32, 32, 34 \n\t" "xvadddp 36, 36, 38 \n\t" "xvadddp 33, 33, 35 \n\t" "xvadddp 37, 37, 39 \n\t" "xvadddp 32, 32, 36 \n\t" "xvadddp 33, 33, 37 \n\t" "stxvd2x 32, 0, %6 \n\t" "stxvd2x 33, %7, %6 \n" "#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6 o16=%7 o32=%8 o48=%9" : "=m" (*dot), "+r" (n), // 1 "+b" (x), // 2 "+b" (y) // 3 : "m" (*x), "m" (*y), "b" (dot), // 6 "b" (16), // 7 "b" (32), // 8 "b" (48) // 9 : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48","vs49","vs50","vs51","vs0","vs1","vs2","vs3", "vs4","vs5","vs6","vs7","vs8","vs9","vs10","vs11" ); } OpenBLAS-0.2.20/kernel/power/zdot_ppc440.S000066400000000000000000000223761313527062700177700ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
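   About this file: another complex dot product, built entirely around indexed
   loads (LFDX relative to INCXM1/INCYM1, LFDUX with the full stride), so the
   unit-stride and strided cases share one unrolled loop; the dcbt prefetch
   hints in the main loop are only emitted when PPCG4 is defined. The
   accumulator layout and the CONJ combination at LL(999) match zdot_cell.S.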
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) #define RESULT r3 #define N r4 #define X r5 #define INCX r6 #define Y r7 #define INCY r8 #define PRE r9 #else #define N r3 #define X r4 #define INCX r5 #define Y r6 #define INCY r7 #define PRE r8 #endif #define INCXM1 r10 #define INCYM1 r11 #define FZERO f0 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stw r0, 144(SP) stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) lfs FZERO,144(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) LDINT INCY, 0(INCY) #endif slwi INCX, INCX, ZBASE_SHIFT slwi INCY, INCY, ZBASE_SHIFT subi INCXM1, INCX, SIZE subi INCYM1, INCY, SIZE fmr f1, FZERO fmr f2, FZERO fmr f3, FZERO fmr f4, FZERO fmr f5, FZERO fmr f6, FZERO fmr f7, FZERO fmr f24, FZERO fmr f25, FZERO fmr f26, FZERO fmr f27, FZERO fmr f28, FZERO fmr f29, FZERO fmr f30, FZERO fmr f31, FZERO li PRE, 3 * 16 * SIZE cmpwi cr0, N, 0 ble- LL(999) #ifdef F_INTERFACE cmpwi cr0, INCX, 0 bge+ LL(102) subi r0, N, 1 mullw r0, r0, INCX sub X, X, r0 .align 4 LL(102): cmpwi cr0, INCY, 0 bge+ LL(104) subi r0, N, 1 mullw r0, r0, INCY sub Y, Y, r0 .align 4 LL(104): #endif sub X, X, INCXM1 sub Y, Y, INCYM1 srawi. 
r0, N, 3 mtspr CTR, r0 beq- LL(150) LFDX f8, X, INCXM1 LFDX f16, Y, INCYM1 LFDUX f9, X, INCX LFDUX f17, Y, INCY LFDX f10, X, INCXM1 LFDX f18, Y, INCYM1 LFDUX f11, X, INCX LFDUX f19, Y, INCY LFDX f12, X, INCXM1 LFDX f20, Y, INCYM1 LFDUX f13, X, INCX LFDUX f21, Y, INCY LFDX f14, X, INCXM1 LFDUX f15, X, INCX bdz LL(120) .align 4 LL(110): FMADD f0, f8, f16, f0 LFDX f22, Y, INCYM1 #ifdef PPCG4 dcbt X, PRE #endif FMADD f3, f8, f17, f3 LFDX f8, X, INCXM1 FMADD f1, f9, f17, f1 LFDUX f23, Y, INCY FMADD f2, f9, f16, f2 LFDUX f9, X, INCX FMADD f4, f10, f18, f4 LFDX f16, Y, INCYM1 #ifdef PPCG4 dcbt Y, PRE #endif FMADD f7, f10, f19, f7 LFDX f10, X, INCXM1 FMADD f5, f11, f19, f5 LFDUX f17, Y, INCY FMADD f6, f11, f18, f6 LFDUX f11, X, INCX FMADD f24, f12, f20, f24 LFDX f18, Y, INCYM1 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif FMADD f27, f12, f21, f27 LFDX f12, X, INCXM1 FMADD f25, f13, f21, f25 LFDUX f19, Y, INCY FMADD f26, f13, f20, f26 LFDUX f13, X, INCX FMADD f28, f14, f22, f28 LFDX f20, Y, INCYM1 #if defined(PPCG4) && defined(DOUBLE) dcbt Y, PRE #endif FMADD f31, f14, f23, f31 LFDX f14, X, INCXM1 FMADD f29, f15, f23, f29 LFDUX f21, Y, INCY FMADD f30, f15, f22, f30 LFDUX f15, X, INCX FMADD f0, f8, f16, f0 LFDX f22, Y, INCYM1 #ifdef PPCG4 dcbt X, PRE #endif FMADD f3, f8, f17, f3 LFDX f8, X, INCXM1 FMADD f1, f9, f17, f1 LFDUX f23, Y, INCY FMADD f2, f9, f16, f2 LFDUX f9, X, INCX FMADD f4, f10, f18, f4 LFDX f16, Y, INCYM1 #ifdef PPCG4 dcbt Y, PRE #endif FMADD f7, f10, f19, f7 LFDX f10, X, INCXM1 FMADD f5, f11, f19, f5 LFDUX f17, Y, INCY FMADD f6, f11, f18, f6 LFDUX f11, X, INCX FMADD f24, f12, f20, f24 LFDX f18, Y, INCYM1 #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif FMADD f27, f12, f21, f27 LFDX f12, X, INCXM1 FMADD f25, f13, f21, f25 LFDUX f19, Y, INCY FMADD f26, f13, f20, f26 LFDUX f13, X, INCX FMADD f28, f14, f22, f28 LFDX f20, Y, INCYM1 #if defined(PPCG4) && defined(DOUBLE) dcbt Y, PRE #endif FMADD f31, f14, f23, f31 LFDX f14, X, INCXM1 FMADD f29, f15, f23, f29 LFDUX f21, Y, INCY FMADD f30, f15, f22, f30 LFDUX f15, X, INCX bdnz LL(110) .align 4 LL(120): FMADD f0, f8, f16, f0 LFDX f22, Y, INCYM1 FMADD f3, f8, f17, f3 LFDX f8, X, INCXM1 FMADD f1, f9, f17, f1 LFDUX f23, Y, INCY FMADD f2, f9, f16, f2 LFDUX f9, X, INCX FMADD f4, f10, f18, f4 LFDX f16, Y, INCYM1 FMADD f7, f10, f19, f7 LFDX f10, X, INCXM1 FMADD f5, f11, f19, f5 LFDUX f17, Y, INCY FMADD f6, f11, f18, f6 LFDUX f11, X, INCX FMADD f24, f12, f20, f24 LFDX f18, Y, INCYM1 FMADD f27, f12, f21, f27 LFDX f12, X, INCXM1 FMADD f25, f13, f21, f25 LFDUX f19, Y, INCY FMADD f26, f13, f20, f26 LFDUX f13, X, INCX FMADD f28, f14, f22, f28 LFDX f20, Y, INCYM1 FMADD f31, f14, f23, f31 LFDX f14, X, INCXM1 FMADD f29, f15, f23, f29 LFDUX f21, Y, INCY FMADD f30, f15, f22, f30 LFDUX f15, X, INCX LFDX f22, Y, INCYM1 FMADD f0, f8, f16, f0 LFDUX f23, Y, INCY FMADD f3, f8, f17, f3 FMADD f1, f9, f17, f1 FMADD f2, f9, f16, f2 FMADD f4, f10, f18, f4 FMADD f7, f10, f19, f7 FMADD f5, f11, f19, f5 FMADD f6, f11, f18, f6 FMADD f24, f12, f20, f24 FMADD f27, f12, f21, f27 FMADD f25, f13, f21, f25 FMADD f26, f13, f20, f26 FMADD f28, f14, f22, f28 FMADD f31, f14, f23, f31 FMADD f29, f15, f23, f29 FMADD f30, f15, f22, f30 .align 4 LL(150): andi. 
r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDX f8, X, INCXM1 LFDUX f9, X, INCX LFDX f16, Y, INCYM1 LFDUX f17, Y, INCY FMADD f0, f8, f16, f0 FMADD f1, f9, f17, f1 FMADD f2, f9, f16, f2 FMADD f3, f8, f17, f3 bdnz LL(160) .align 4 LL(999): FADD f0, f0, f4 FADD f1, f1, f5 FADD f2, f2, f6 FADD f3, f3, f7 FADD f24, f28, f24 FADD f25, f29, f25 FADD f26, f30, f26 FADD f27, f31, f27 FADD f0, f0, f24 FADD f1, f1, f25 FADD f2, f2, f26 FADD f3, f3, f27 #ifndef CONJ FSUB f1, f0, f1 FADD f2, f2, f3 #else FADD f1, f0, f1 FSUB f2, f3, f2 #endif #if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) STFD f1, 0 * SIZE(RESULT) STFD f2, 1 * SIZE(RESULT) #endif #if defined(F_INTERFACE) && defined(F_INTERFACE_GFORT) #ifndef __64BIT__ #ifndef DOUBLE stfs f1, 144(SP) stfs f2, 148(SP) lwz r3, 144(SP) lwz r4, 148(SP) #else stfd f1, 144(SP) stfd f2, 152(SP) lwz r3, 144(SP) lwz r4, 148(SP) lwz r5, 152(SP) lwz r6, 156(SP) #endif #else #ifndef DOUBLE stfs f1, 144(SP) stfs f2, 148(SP) ld r3, 144(SP) #else stfd f1, 144(SP) stfd f2, 152(SP) ld r3, 144(SP) ld r4, 152(SP) #endif #endif #endif lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zgemm_beta.S000066400000000000000000000136631313527062700200270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
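   About this file: scales the complex matrix C by the scalar passed in f1/f2
   (kept in ALPHA_R/ALPHA_I) before the GEMM accumulation. When both parts of
   the scalar are zero, the LL(10) path simply stores zeros column by column
   behind a dcbst hint; otherwise LL(20) rewrites each element as
   (s_r*c_r - s_i*c_i, s_r*c_i + s_i*c_r), prefetching the store stream with
   dcbtst.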
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M r3 #define N r4 #define C r10 #define LDC r11 #define J r5 #define PRE r6 #define CO1 r7 #define ALPHA_R f30 #define ALPHA_I f31 #define STACKSIZE 32 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f30, 0(SP) stfd f31, 8(SP) stw r0, 16(SP) #ifdef linux #ifndef __64BIT__ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #else ld C, FRAMESLOT(1) + STACKSIZE(SP) ld LDC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld C, FRAMESLOT(1) + STACKSIZE(SP) ld LDC, FRAMESLOT(2) + STACKSIZE(SP) #else #ifdef DOUBLE lwz C, FRAMESLOT(3) + STACKSIZE(SP) lwz LDC, FRAMESLOT(4) + STACKSIZE(SP) #else lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #endif slwi LDC, LDC, ZBASE_SHIFT lfs f0, 16(SP) fmr ALPHA_R, f1 fmr ALPHA_I, f2 cmpwi cr0, M, 0 ble- LL(999) cmpwi cr0, N, 0 ble- LL(999) mr J, N fcmpu cr7, f1, f0 bne cr7, LL(20) fcmpu cr7, f2, f0 bne cr7, LL(20) .align 4 LL(10): mr CO1, C add C, C, LDC addi PRE, 0, 32 * SIZE srawi. r0, M, 3 mtspr CTR, r0 ble LL(15) .align 4 LL(12): STFD f0, 0 * SIZE(CO1) STFD f0, 1 * SIZE(CO1) STFD f0, 2 * SIZE(CO1) STFD f0, 3 * SIZE(CO1) STFD f0, 4 * SIZE(CO1) STFD f0, 5 * SIZE(CO1) STFD f0, 6 * SIZE(CO1) STFD f0, 7 * SIZE(CO1) STFD f0, 8 * SIZE(CO1) STFD f0, 9 * SIZE(CO1) STFD f0, 10 * SIZE(CO1) STFD f0, 11 * SIZE(CO1) STFD f0, 12 * SIZE(CO1) STFD f0, 13 * SIZE(CO1) STFD f0, 14 * SIZE(CO1) STFD f0, 15 * SIZE(CO1) dcbst PRE, CO1 addi CO1, CO1, 16 * SIZE bdnz LL(12) .align 4 LL(15): andi. r0, M, 7 mtspr CTR, r0 beq LL(19) .align 4 LL(16): STFD f0, 0 * SIZE(CO1) STFD f0, 1 * SIZE(CO1) addi CO1, CO1, 2 * SIZE bdnz LL(16) .align 4 LL(19): addic. J, J, -1 bgt LL(10) b LL(999) .align 4 LL(20): mr CO1, C add C, C, LDC addi PRE, 0, 16 * SIZE srawi. r0, M, 2 mtspr CTR, r0 ble LL(25) .align 4 LL(22): LFD f3, 0 * SIZE(CO1) LFD f4, 1 * SIZE(CO1) LFD f5, 2 * SIZE(CO1) LFD f6, 3 * SIZE(CO1) LFD f7, 4 * SIZE(CO1) LFD f8, 5 * SIZE(CO1) LFD f9, 6 * SIZE(CO1) LFD f10, 7 * SIZE(CO1) FMUL f0, ALPHA_I, f4 FMUL f4, ALPHA_R, f4 FMUL f11, ALPHA_I, f6 FMUL f6, ALPHA_R, f6 FMUL f12, ALPHA_I, f8 FMUL f8, ALPHA_R, f8 FMUL f13, ALPHA_I, f10 FMUL f10, ALPHA_R, f10 FMADD f4, ALPHA_I, f3, f4 FMSUB f3, ALPHA_R, f3, f0 FMADD f6, ALPHA_I, f5, f6 FMSUB f5, ALPHA_R, f5, f11 FMADD f8, ALPHA_I, f7, f8 FMSUB f7, ALPHA_R, f7, f12 FMADD f10, ALPHA_I, f9, f10 FMSUB f9, ALPHA_R, f9, f13 STFD f3, 0 * SIZE(CO1) STFD f4, 1 * SIZE(CO1) STFD f5, 2 * SIZE(CO1) STFD f6, 3 * SIZE(CO1) STFD f7, 4 * SIZE(CO1) STFD f8, 5 * SIZE(CO1) STFD f9, 6 * SIZE(CO1) STFD f10, 7 * SIZE(CO1) addi CO1, CO1, 8 * SIZE dcbtst PRE, CO1 bdnz LL(22) .align 4 LL(25): andi. r0, M, 3 mtspr CTR, r0 ble LL(29) .align 4 LL(26): LFD f0, 0 * SIZE(CO1) LFD f1, 1 * SIZE(CO1) FMUL f5, ALPHA_I, f1 FMUL f1, ALPHA_R, f1 FMADD f1, ALPHA_I, f0, f1 FMSUB f0, ALPHA_R, f0, f5 STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) addi CO1, CO1, 2 * SIZE bdnz LL(26) .align 4 LL(29): addic. J, J, -1 bgt LL(20) .align 4 LL(999): li r3, 0 lfd f30, 0(SP) lfd f31, 8(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zgemm_kernel.S000066400000000000000000001037071313527062700203730ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
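   About this file: the complex GEMM inner kernel for the PPC970/POWER4/POWER5
   targets. It blocks C into 2x2 complex tiles held in f0-f15, walks the K
   dimension four iterations at a time with DCBT/DCBTST prefetch distances
   chosen per core, then combines the partial products with the sign pattern
   selected by the NN/CN/NC conjugation macros and applies the complex alpha
   from ALPHA_R/ALPHA_I. The TRMMKERNEL sections reuse the same loops with the
   KK/OFFSET bookkeeping required by the triangular variants.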
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA_R 296(SP) #define ALPHA_I 304(SP) #define FZERO 312(SP) #else #define STACKSIZE 256 #define ALPHA_R 224(SP) #define ALPHA_I 232(SP) #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #define OFFSET r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #define TEMP r22 #define KK r23 #define I r24 #define J r25 #define AO r26 #define BO r27 #define CO1 r28 #define CO2 r29 #define PREA r30 #define PREC r31 #define PREB PREA #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) #ifdef TRMMKERNEL std r23, 208(SP) std r22, 216(SP) #endif #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) #ifdef TRMMKERNEL stw r23, 176(SP) stw r22, 180(SP) #endif #endif stfd f1, ALPHA_R stfd f2, ALPHA_I stw 
r0, FZERO #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef TRMMKERNEL #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif #endif slwi LDC, LDC, ZBASE_SHIFT cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) #ifndef PREFETCHTEST #ifdef PPC970 li PREC, 4 * SIZE #endif #ifdef POWER4 li PREC, 4 * SIZE /* is 12 best? */ #endif #ifdef POWER5 li PREC, 4 * SIZE /* is 12 best? */ #endif #else #ifdef linux #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else ld PREA, FRAMESLOT(3) + STACKSIZE(SP) ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld PREA, FRAMESLOT(3) + STACKSIZE(SP) ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #else #ifdef DOUBLE lwz PREA, FRAMESLOT(4) + STACKSIZE(SP) lwz PREC, FRAMESLOT(5) + STACKSIZE(SP) #else lwz PREA, FRAMESLOT(3) + STACKSIZE(SP) lwz PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #endif #endif #ifndef PREFETCHTEST #ifdef PPC970 #ifdef ALLOC_HUGETLB li PREA, (16 * 1 * SIZE) li PREB, (16 * 5 * SIZE) #else li PREA, (16 * 15 * SIZE) li PREB, (16 * 8 * SIZE) #endif #endif #ifdef POWER4 #ifdef ALLOC_HUGETLB li PREA, (16 * 1 * SIZE) li PREB, (16 * 1 * SIZE) #else li PREA, (16 * 2 * SIZE) li PREB, (16 * 2 * SIZE) #endif #endif #ifdef POWER5 #ifdef ALLOC_HUGETLB li PREA, (16 * 7 * SIZE) li PREB, (16 * 7 * SIZE) #else li PREA, (16 * 12 * SIZE) li PREB, (16 * 6 * SIZE) #endif #endif #endif lfs f0, FZERO srawi. J, N, 1 ble LL(30) .align 4 LL(10): fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 mr CO1, C add CO2, C, LDC add C, CO2, LDC #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif srawi. I, M, 1 mr AO, A ble LL(20) .align 4 LL(11): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) #ifdef POWER5 LFD f28, 4 * SIZE(B) LFD f29, 5 * SIZE(B) LFD f30, 6 * SIZE(B) LFD f31, 7 * SIZE(B) #endif DCBTST(CO1, PREC) nop nop DCBTST(CO2, PREC) srawi. 
r0, K, 2 mr BO, B mtspr CTR, r0 ble LL(15) #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) #ifdef POWER5 LFD f28, 4 * SIZE(B) LFD f29, 5 * SIZE(B) LFD f30, 6 * SIZE(B) LFD f31, 7 * SIZE(B) #endif mr BO, B #else slwi r0, KK, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, B, r0 LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #ifdef POWER5 LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) LFD f31, 7 * SIZE(BO) #endif #endif DCBTST(CO1, PREC) nop nop DCBTST(CO2, PREC) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP ble LL(15) #endif .align 4 LL(12): FMADD f0, f16, f20, f0 FMADD f5, f17, f21, f5 FMADD f10, f18, f22, f10 FMADD f15, f19, f23, f15 #if defined(ALLOC_HUGETLB) && !defined(POWER5) LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) LFD f31, 7 * SIZE(BO) #endif FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 #if !defined(ALLOC_HUGETLB) && !defined(POWER5) LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) LFD f31, 7 * SIZE(BO) #endif LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f11, f19, f22, f11 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 FMADD f14, f18, f23, f14 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f24, f28, f0 FMADD f5, f25, f29, f5 FMADD f10, f26, f30, f10 FMADD f15, f27, f31, f15 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMADD f1, f25, f28, f1 FMADD f2, f26, f28, f2 FMADD f3, f27, f28, f3 FMADD f4, f24, f29, f4 FMADD f6, f26, f29, f6 FMADD f7, f27, f29, f7 FMADD f8, f24, f30, f8 FMADD f9, f25, f30, f9 FMADD f11, f27, f30, f11 FMADD f12, f24, f31, f12 FMADD f13, f25, f31, f13 FMADD f14, f26, f31, f14 LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f5, f17, f21, f5 FMADD f10, f18, f22, f10 FMADD f15, f19, f23, f15 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f11, f19, f22, f11 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 FMADD f14, f18, f23, f14 #ifndef POWER5 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) #else LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) #endif FMADD f0, f24, f28, f0 FMADD f5, f25, f29, f5 FMADD f10, f26, f30, f10 FMADD f15, f27, f31, f15 #ifndef POWER5 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) #else LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) #endif FMADD f1, f25, f28, f1 FMADD f2, f26, f28, f2 FMADD f3, f27, 
f28, f3 FMADD f4, f24, f29, f4 FMADD f6, f26, f29, f6 FMADD f7, f27, f29, f7 FMADD f8, f24, f30, f8 FMADD f9, f25, f30, f9 FMADD f11, f27, f30, f11 FMADD f12, f24, f31, f12 FMADD f13, f25, f31, f13 FMADD f14, f26, f31, f14 #ifdef POWER5 LFD f28, 20 * SIZE(BO) LFD f29, 21 * SIZE(BO) LFD f30, 22 * SIZE(BO) LFD f31, 23 * SIZE(BO) #endif addi AO, AO, 16 * SIZE addi BO, BO, 16 * SIZE #ifdef PPC970 #ifndef ALLOC_HUGETLB DCBT(AO, PREA) #endif DCBT(BO, PREB) #endif #ifdef POWER4 #ifndef ALLOC_HUGETLB DCBT(AO, PREA) #endif DCBT(BO, PREB) #endif #ifdef POWER5 #ifndef ALLOC_HUGETLB DCBT(BO, PREB) DCBT(AO, PREA) #endif #endif bdnz LL(12) .align 4 LL(15): #ifndef TRMMKERNEL andi. r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, r0 ble LL(KERNEL_MainFinish) #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, TEMP ble LL(KERNEL_MainFinish) #endif .align 4 LL(16): fmadd f0, f16, f20, f0 fmadd f5, f17, f21, f5 fmadd f10, f18, f22, f10 fmadd f15, f19, f23, f15 fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 fmadd f4, f16, f21, f4 fmadd f6, f18, f21, f6 fmadd f7, f19, f21, f7 fmadd f8, f16, f22, f8 fmadd f9, f17, f22, f9 fmadd f11, f19, f22, f11 fmadd f12, f16, f23, f12 fmadd f13, f17, f23, f13 fmadd f14, f18, f23, f14 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(16) .align 4 LL(KERNEL_MainFinish): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #ifndef TRMMKERNEL LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) #endif FSUB f8, f8, f13 FADD f9, f9, f12 FSUB f10, f10, f15 FADD f11, f11, f14 #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #ifndef TRMMKERNEL LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) #endif FADD f8, f8, f13 FSUB f9, f9, f12 FADD f10, f10, f15 FSUB f11, f11, f14 #else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #ifndef TRMMKERNEL LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) #endif FADD f8, f8, f13 FSUB f9, f12, f9 FADD f10, f10, f15 FSUB f11, f14, f11 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FMADD f17, f30, f1, f17 FMADD f18, f30, f2, f18 FMADD f19, f30, f3, f19 FMADD f20, f30, f8, f20 FMADD f21, f30, f9, f21 FMADD f22, f30, f10, f22 FMADD f23, f30, f11, f23 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 FMUL f20, f30, f8 FMUL f21, f30, f9 FMUL f22, f30, f10 FMUL f23, f30, f11 #endif FNMSUB f16, f31, f1, f16 FMADD f17, f31, f0, f17 FNMSUB f18, f31, f3, f18 FMADD f19, f31, f2, f19 FNMSUB f20, f31, f9, f20 FMADD f21, f31, f8, f21 FNMSUB f22, f31, f11, f22 FMADD f23, f31, f10, f23 #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* 
defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC)|| defined(RR) */ #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FNMSUB f17, f30, f1, f17 FMADD f18, f30, f2, f18 FNMSUB f19, f30, f3, f19 FMADD f20, f30, f8, f20 FNMSUB f21, f30, f9, f21 FMADD f22, f30, f10, f22 FNMSUB f23, f30, f11, f23 FMADD f16, f31, f1, f16 FMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FMADD f19, f31, f2, f19 FMADD f20, f31, f9, f20 FMADD f21, f31, f8, f21 FMADD f22, f31, f11, f22 FMADD f23, f31, f10, f23 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 FMUL f20, f30, f8 FMUL f21, f30, f9 FMUL f22, f30, f10 FMUL f23, f30, f11 FMADD f16, f31, f1, f16 FNMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FNMADD f19, f31, f2, f19 FMADD f20, f31, f9, f20 FNMADD f21, f31, f8, f21 FMADD f22, f31, f11, f22 FNMADD f23, f31, f10, f23 #endif #endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) STFD f18, 2 * SIZE(CO1) STFD f19, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f20, 0 * SIZE(CO2) STFD f21, 1 * SIZE(CO2) STFD f22, 2 * SIZE(CO2) STFD f23, 3 * SIZE(CO2) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -2 #endif slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif addic. I, I, -1 bgt LL(11) .align 4 LL(20): andi. I, M, 1 ble LL(29) #ifndef TRMMKERNEL LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, K, 2 mr BO, B mtspr CTR, r0 ble LL(25) #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif srawi. 
TEMP, TEMP, 2 mtspr CTR, TEMP ble LL(25) #endif .align 4 LL(22): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi BO, BO, 16 * SIZE addi AO, AO, 8 * SIZE bdnz LL(22) .align 4 LL(25): #ifndef TRMMKERNEL andi. r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, r0 ble LL(27) #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, TEMP ble LL(27) #endif .align 4 LL(26): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE bdnz LL(26) .align 4 LL(27): #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #endif #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FMADD f17, f30, f1, f17 FMADD f18, f30, f2, f18 FMADD f19, f30, f3, f19 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 #endif FNMSUB f16, f31, f1, f16 FMADD f17, f31, f0, f17 FNMSUB f18, f31, f3, f18 FMADD f19, f31, f2, f19 #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC)|| defined(RR) */ #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FNMSUB f17, f30, f1, f17 FMADD f18, f30, f2, f18 FNMSUB f19, f30, f3, f19 FMADD f16, f31, f1, f16 FMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FMADD f19, f31, f2, f19 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, 
f2 FMUL f19, f30, f3 FMADD f16, f31, f1, f16 FNMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FNMADD f19, f31, f2, f19 #endif #endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) STFD f18, 0 * SIZE(CO2) STFD f19, 1 * SIZE(CO2) addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 1 #endif #endif .align 4 LL(29): #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 2 #endif mr B, BO addic. J, J, -1 lfs f0, FZERO bgt LL(10) .align 4 LL(30): andi. J, N, 1 ble LL(999) #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif srawi. I, M, 1 mr CO1, C add C, C, LDC mr AO, A ble LL(40) .align 4 LL(31): #ifndef TRMMKERNEL LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, K, 2 mr BO, B mtspr CTR, r0 ble LL(35) #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 mr BO, B #else slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #endif DCBTST(CO1, PREC) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif srawi. 
TEMP, TEMP, 2 mtspr CTR, TEMP ble LL(35) #endif .align 4 LL(32): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 8 * SIZE(AO) LFD f21, 9 * SIZE(AO) LFD f22, 10 * SIZE(AO) LFD f23, 11 * SIZE(AO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) LFD f16, 4 * SIZE(BO) LFD f17, 5 * SIZE(BO) LFD f18, 6 * SIZE(BO) LFD f19, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 16 * SIZE(AO) LFD f21, 17 * SIZE(AO) LFD f22, 18 * SIZE(AO) LFD f23, 19 * SIZE(AO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 20 * SIZE(AO) LFD f25, 21 * SIZE(AO) LFD f26, 22 * SIZE(AO) LFD f27, 23 * SIZE(AO) LFD f16, 8 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 10 * SIZE(BO) LFD f19, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE DCBT(AO, PREA) DCBT(BO, PREB) bdnz LL(32) .align 4 LL(35): #ifndef TRMMKERNEL andi. r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, r0 ble LL(37) #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, TEMP ble LL(37) #endif .align 4 LL(36): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f16, 2 * SIZE(BO) LFD f17, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(36) .align 4 LL(37): #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FMADD f17, f30, f1, f17 FMADD f18, f30, f2, f18 FMADD f19, f30, f3, f19 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 #endif FNMSUB f16, f31, f1, f16 FMADD f17, f31, f0, f17 FNMSUB f18, f31, f3, f18 FMADD f19, f31, f2, f19 #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC)|| defined(RR) */ #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FNMSUB f17, f30, f1, f17 FMADD f18, f30, f2, f18 FNMSUB f19, f30, f3, f19 FMADD f16, f31, f1, f16 FMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FMADD f19, f31, f2, f19 #else FMUL f16, f30, f0 FMUL 
f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 FMADD f16, f31, f1, f16 FNMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FNMADD f19, f31, f2, f19 #endif #endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) STFD f18, 2 * SIZE(CO1) STFD f19, 3 * SIZE(CO1) addi CO1, CO1, 4 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -1 #endif slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif addic. I, I, -1 bgt LL(31) .align 4 LL(40): andi. I, M, 1 ble LL(999) #ifndef TRMMKERNEL LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, K, 2 mr BO, B mtspr CTR, r0 ble LL(45) #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 mr BO, B #else slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP ble LL(45) #endif .align 4 LL(42): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) fmadd f4, f18, f22, f4 fmadd f5, f19, f23, f5 fmadd f6, f19, f22, f6 fmadd f7, f18, f23, f7 LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) fmadd f4, f18, f22, f4 fmadd f5, f19, f23, f5 fmadd f6, f19, f22, f6 fmadd f7, f18, f23, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(42) .align 4 LL(45): fadd f0, f0, f4 fadd f1, f1, f5 fadd f2, f2, f6 fadd f3, f3, f7 #ifndef TRMMKERNEL andi. r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR,r0 ble LL(47) #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif andi. 
TEMP, TEMP, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR,TEMP ble LL(47) #endif .align 4 LL(46): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 2 * SIZE bdnz LL(46) .align 4 LL(47): #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) fsub f0, f0, f1 fadd f2, f2, f3 #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) fadd f0, f0, f1 fsub f2, f2, f3 #else fadd f0, f0, f1 fsub f2, f3, f2 #endif #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FMADD f17, f30, f2, f17 #else FMUL f16, f30, f0 FMUL f17, f30, f2 #endif FNMSUB f16, f31, f2, f16 FMADD f17, f31, f0, f17 #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC) || defined(RR) */ #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FNMSUB f17, f30, f2, f17 FMADD f16, f31, f2, f16 FMADD f17, f31, f0, f17 #else FMUL f16, f30, f0 FMUL f17, f30, f2 FMADD f16, f31, f2, f16 FNMADD f17, f31, f0, f17 #endif #endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) #ifdef TRMMKERNEL ld r23, 208(SP) ld r22, 216(SP) #endif #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) #ifdef TRMMKERNEL lwz r23, 176(SP) lwz r22, 180(SP) #endif #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/zgemm_kernel_8x2_power8.S000066400000000000000000000265151313527062700224010ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/04/22 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/04/22 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "def_vsx.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 32000 #define ALPHA_R_SP 296(SP) #define ALPHA_I_SP 304(SP) #define FZERO 312(SP) #else #define STACKSIZE 256 #define ALPHA_R_SP 224(SP) #define ALPHA_I_SP 232(SP) #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #define OFFSET r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #define o0 0 #define alpha_r vs30 #define alpha_i vs31 #define FRAMEPOINTER r12 #define BBUFFER r14 #define L r15 #define ALPHA r16 #define o24 r17 #define T2 r19 #define BBO r20 #define o8 r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define CO r26 #define o16 r27 #define o32 r28 #define o48 r29 #define PRE r30 #define T1 r31 #ifndef NEEDPARAM PROLOGUE PROFCODE mr FRAMEPOINTER, SP addi SP, SP, -STACKSIZE addi SP, SP, -STACKSIZE addi SP, SP, -STACKSIZE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) std r14, 280(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) stw r18, 196(SP) stw r17, 200(SP) stw r16, 204(SP) stw r15, 208(SP) #endif stfd f1, ALPHA_R_SP stfd f2, ALPHA_I_SP stw r0, FZERO #ifdef linux #ifdef __64BIT__ ld 
LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER) lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER) lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER) #else lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif #endif #ifdef TRMMKERNEL #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER) #else lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #endif #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif #endif #include "zgemm_macros_8x2_power8.S" cmpwi cr0, M, 0 ble L999 cmpwi cr0, N, 0 ble L999 cmpwi cr0, K, 0 ble L999 slwi LDC, LDC, ZBASE_SHIFT li PRE, 512 li o8 , 8 li o16 , 16 li o24 , 24 li o32 , 32 li o48 , 48 addi BBUFFER, SP, 512+4096 li T1, -4096 and BBUFFER, BBUFFER, T1 #ifdef __64BIT__ addi ALPHA, SP, 296 #else addi ALPHA, SP, 224 #endif lxsdx alpha_r, 0, ALPHA lxsdx alpha_i, o8, ALPHA .align 4 #include "zgemm_logic_8x2_power8.S" L999: addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) lwz r18, 196(SP) lwz r17, 200(SP) lwz r16, 204(SP) lwz r15, 208(SP) #endif addi SP, SP, STACKSIZE addi SP, SP, STACKSIZE addi SP, SP, STACKSIZE addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/zgemm_kernel_altivec.S000066400000000000000000000764651313527062700221140ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 360 #else #define STACKSIZE 272 #endif #define ALIGN_SIZE 0xffff #define SWAP 0 #define NEG 16 #define ALPHA_R 32 #define ALPHA_I 48 #define FZERO 64 #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #endif #endif #define STACK r11 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define PREA r29 #define PREB r29 #define PREC r30 #define VREG r31 #define LOAD_A lvx #define LOAD_B lvx #define OFFSET_0 0 #define OFFSET_1 r14 #define OFFSET_2 r15 #define OFFSET_3 r16 #define OFFSET_4 r17 #define OFFSET_5 r18 #define OFFSET_6 r19 #define OFFSET_7 r20 #define c01 v0 #define c02 v1 #define c03 v2 #define c04 v3 #define c05 v4 #define c06 v5 #define c07 v6 #define c08 v7 #define c09 v8 #define c10 v9 #define c11 v10 #define c12 v11 #define c13 v12 #define c14 v13 #define c15 v14 #define c16 v15 #define a1 v16 #define a2 v17 #define a3 v18 #define a4 v19 #define a5 v20 #define a6 v21 #define a7 v22 #define a8 v23 #define b1 v24 #define b2 v25 #define bp1 v26 #define bp2 v27 #define C1 v16 #define C2 v17 #define C3 v18 #define C4 v19 #define C5 v20 #define c00 v24 #define VZERO v25 #define PERMRSHIFT1 v26 #define PERMRSHIFT2 v27 #define swap v28 #define neg v29 #define alpha_r v30 #define alpha_i v31 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE mr STACK, SP li r0, 0 * 16 stvx v20, SP, r0 li r0, 1 * 16 stvx v21, SP, r0 li r0, 2 * 16 stvx v22, SP, r0 li r0, 3 * 16 stvx v23, SP, r0 li r0, 4 * 16 stvx v24, SP, r0 li r0, 5 * 16 stvx v25, SP, r0 li r0, 6 * 16 stvx v26, SP, r0 li r0, 7 * 16 stvx v27, SP, r0 li r0, 8 * 16 stvx v28, SP, r0 li r0, 9 * 16 stvx v29, SP, r0 li r0, 10 * 16 stvx v30, SP, r0 li r0, 11 * 16 stvx v31, SP, r0 #ifdef __64BIT__ std r31, 192(SP) std r30, 200(SP) std r29, 208(SP) std r28, 216(SP) std r27, 224(SP) std r26, 232(SP) std r25, 240(SP) std r24, 248(SP) std r23, 256(SP) std r22, 264(SP) std r21, 272(SP) std r20, 280(SP) std r19, 288(SP) std r18, 296(SP) std r17, 304(SP) std r16, 312(SP) std r15, 320(SP) std r14, 328(SP) #else stw r31, 192(SP) stw r30, 196(SP) stw r29, 200(SP) stw r28, 204(SP) stw r27, 208(SP) stw r26, 212(SP) stw r25, 216(SP) stw r24, 
220(SP) stw r23, 224(SP) stw r22, 228(SP) stw r21, 232(SP) stw r20, 236(SP) stw r19, 240(SP) stw r18, 244(SP) stw r17, 248(SP) stw r16, 252(SP) stw r15, 256(SP) stw r14, 260(SP) #endif #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifndef PREFETCHTEST #ifdef PPC970 li PREC, 16 * SIZE #endif #else #ifdef linux #ifndef __64BIT__ lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else ld PREB, FRAMESLOT(3) + STACKSIZE(SP) ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld PREB, FRAMESLOT(3) + STACKSIZE(SP) ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #else #ifdef DOUBLE lwz PREB, FRAMESLOT(4) + STACKSIZE(SP) lwz PREC, FRAMESLOT(5) + STACKSIZE(SP) #else lwz PREB, FRAMESLOT(3) + STACKSIZE(SP) lwz PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #endif #endif #ifndef PREFETCHTEST #ifdef CELL li PREB, (3 * 32 * SIZE) #else li PREB, (5 * 32 * SIZE) #endif #endif li r0, -1 mfspr VREG, VRsave mtspr VRsave, r0 addi SP, SP, -128 li r0, -8192 and SP, SP, r0 fneg f3, f1 fneg f4, f2 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NC) || defined(TC) || defined(NR) || defined(TR) stfs f1, ALPHA_R + 0(SP) stfs f1, ALPHA_R + 4(SP) stfs f1, ALPHA_R + 8(SP) stfs f1, ALPHA_R + 12(SP) stfs f4, ALPHA_I + 0(SP) stfs f2, ALPHA_I + 4(SP) stfs f4, ALPHA_I + 8(SP) stfs f2, ALPHA_I + 12(SP) #else stfs f1, ALPHA_R + 0(SP) stfs f3, ALPHA_R + 4(SP) stfs f1, ALPHA_R + 8(SP) stfs f3, ALPHA_R + 12(SP) stfs f2, ALPHA_I + 0(SP) stfs f2, ALPHA_I + 4(SP) stfs f2, ALPHA_I + 8(SP) stfs f2, ALPHA_I + 12(SP) #endif li I, Address_L(0x04050607) addis I, I, Address_H(0x04050607) stw I, SWAP + 0(SP) li I, Address_L(0x00010203) addis I, I, Address_H(0x00010203) stw I, SWAP + 4(SP) li I, Address_L(0x0c0d0e0f) addis I, I, Address_H(0x0c0d0e0f) stw I, SWAP + 8(SP) li I, Address_L(0x08090a0b) addis I, I, Address_H(0x08090a0b) stw I, SWAP + 12(SP) #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) lis I, 0x8000 stw I, NEG + 0(SP) stw I, NEG + 8(SP) li I, 0 stw I, NEG + 4(SP) stw I, NEG + 12(SP) #else li I, 0 stw I, NEG + 0(SP) stw I, NEG + 8(SP) lis I, 0x8000 stw I, NEG + 4(SP) stw I, NEG + 12(SP) #endif li r0, 0 stw r0, FZERO(SP) slwi LDC, LDC, ZBASE_SHIFT li OFFSET_1, 4 * SIZE li OFFSET_2, 8 * SIZE li OFFSET_3, 12 * SIZE li OFFSET_4, 16 * SIZE li OFFSET_5, 20 * SIZE li OFFSET_6, 24 * SIZE li OFFSET_7, 28 * SIZE cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) srawi. J, N, 1 ble LL(50) .align 4 LL(01): mr CO1, C add CO2, C, LDC add C, CO2, LDC mr AO, A srawi. I, M, 3 ble LL(20) .align 4 LL(11): vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 LOAD_B b2, OFFSET_1, B vxor c03, c03, c03 LOAD_A a1, OFFSET_0, AO vxor c04, c04, c04 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_A a3, OFFSET_2, AO vxor c06, c06, c06 LOAD_A a4, OFFSET_3, AO vxor c07, c07, c07 LOAD_A a5, OFFSET_4, AO vxor c08, c08, c08 vxor c09, c09, c09 dcbtst CO1, PREC vxor c10, c10, c10 dcbtst CO2, PREC vxor c11, c11, c11 vxor c12, c12, c12 vxor c13, c13, c13 mr BO, B vxor c14, c14, c14 srawi. 
r0, K, 1 vxor c15, c15, c15 mtspr CTR, r0 vxor c16, c16, c16 vspltw bp1, b1, 0 ble LL(15) .align 4 LL(12): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 DCBT(BO, PREB) vmaddfp c03, a3, bp1, c03 nop vmaddfp c04, a4, bp1, c04 LOAD_A a6, OFFSET_5, AO vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 #ifdef CELL DCBT(AO, PREA) #else nop #endif vmaddfp c07, a3, bp2, c07 nop vmaddfp c08, a4, bp2, c08 LOAD_A a7, OFFSET_6, AO vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 LOAD_B b1, OFFSET_2, BO vmaddfp c11, a3, bp1, c11 nop vmaddfp c12, a4, bp1, c12 LOAD_A a8, OFFSET_7, AO vmaddfp c13, a1, bp2, c13 vspltw bp1, b2, 0 vmaddfp c14, a2, bp2, c14 addi AO, AO, 32 * SIZE vmaddfp c15, a3, bp2, c15 nop vmaddfp c16, a4, bp2, c16 LOAD_A a1, OFFSET_0, AO vmaddfp c01, a5, bp1, c01 vspltw bp2, b2, 1 vmaddfp c02, a6, bp1, c02 nop vmaddfp c03, a7, bp1, c03 nop vmaddfp c04, a8, bp1, c04 LOAD_A a2, OFFSET_1, AO vmaddfp c05, a5, bp2, c05 vspltw bp1, b2, 2 vmaddfp c06, a6, bp2, c06 nop vmaddfp c07, a7, bp2, c07 nop vmaddfp c08, a8, bp2, c08 LOAD_A a3, OFFSET_2, AO vmaddfp c09, a5, bp1, c09 vspltw bp2, b2, 3 vmaddfp c10, a6, bp1, c10 LOAD_B b2, OFFSET_3, BO vmaddfp c11, a7, bp1, c11 nop vmaddfp c12, a8, bp1, c12 LOAD_A a4, OFFSET_3, AO vmaddfp c13, a5, bp2, c13 vspltw bp1, b1, 0 vmaddfp c14, a6, bp2, c14 addi BO, BO, 8 * SIZE vmaddfp c15, a7, bp2, c15 LOAD_A a5, OFFSET_4, AO vmaddfp c16, a8, bp2, c16 bdnz+ LL(12) .align 4 LL(15): lvx swap, OFFSET_0, SP lvx neg, OFFSET_1, SP lvx alpha_r, OFFSET_2, SP lvx alpha_i, OFFSET_3, SP andi. r0, K, 1 ble+ LL(18) .align 4 LL(16): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 nop vmaddfp c03, a3, bp1, c03 nop vmaddfp c04, a4, bp1, c04 nop vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 nop vmaddfp c07, a3, bp2, c07 nop vmaddfp c08, a4, bp2, c08 nop vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 addi AO, AO, 16 * SIZE vmaddfp c11, a3, bp1, c11 addi BO, BO, 4 * SIZE vmaddfp c12, a4, bp1, c12 nop vmaddfp c13, a1, bp2, c13 vmaddfp c14, a2, bp2, c14 vmaddfp c15, a3, bp2, c15 vmaddfp c16, a4, bp2, c16 .align 4 LL(18): vxor VZERO, VZERO, VZERO vperm c05, c05, c05, swap vperm c06, c06, c06, swap vperm c07, c07, c07, swap vperm c08, c08, c08, swap vperm c13, c13, c13, swap vperm c14, c14, c14, swap vperm c15, c15, c15, swap vperm c16, c16, c16, swap vxor c05, c05, neg vxor c06, c06, neg vxor c07, c07, neg vxor c08, c08, neg vxor c13, c13, neg vxor c14, c14, neg vxor c15, c15, neg vxor c16, c16, neg vaddfp c01, c01, c05 vaddfp c02, c02, c06 vaddfp c03, c03, c07 vaddfp c04, c04, c08 vaddfp c09, c09, c13 vaddfp c10, c10, c14 vaddfp c11, c11, c15 vaddfp c12, c12, c16 vperm c05, c01, c01, swap vperm c06, c02, c02, swap vperm c07, c03, c03, swap vperm c08, c04, c04, swap vperm c13, c09, c09, swap vperm c14, c10, c10, swap vperm c15, c11, c11, swap vperm c16, c12, c12, swap vmaddfp c01, alpha_r, c01, VZERO vmaddfp c02, alpha_r, c02, VZERO vmaddfp c03, alpha_r, c03, VZERO vmaddfp c04, alpha_r, c04, VZERO vmaddfp c01, alpha_i, c05, c01 vmaddfp c02, alpha_i, c06, c02 vmaddfp c03, alpha_i, c07, c03 vmaddfp c04, alpha_i, c08, c04 vmaddfp c09, alpha_r, c09, VZERO vmaddfp c10, alpha_r, c10, VZERO vmaddfp c11, alpha_r, c11, VZERO vmaddfp c12, alpha_r, c12, VZERO vmaddfp c09, alpha_i, c13, c09 vmaddfp c10, alpha_i, c14, c10 vmaddfp c11, alpha_i, c15, c11 vmaddfp c12, alpha_i, c16, c12 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvx C4, OFFSET_3, CO1 
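/* The C pointers need not be 16-byte aligned here, so the tile update is
   done as an aligned read-modify-write: lvsr derives a byte-permute mask
   from the low bits of CO1/CO2, the vperm chain below shifts the already
   alpha-scaled accumulators into that alignment (VZERO pads the head and
   tail lanes), and the vaddfp/stvx pairs merge the result into the loaded
   C vectors without disturbing memory outside the tile. */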
lvx C5, OFFSET_4, CO1 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, c03, PERMRSHIFT1 vperm c03, c03, c04, PERMRSHIFT1 vperm c04, c04, VZERO, PERMRSHIFT1 vaddfp c00, c00, C1 vaddfp c01, c01, C2 vaddfp c02, c02, C3 vaddfp c03, c03, C4 vaddfp c04, c04, C5 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 stvx c03, OFFSET_3, CO1 stvx c04, OFFSET_4, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 lvx C3, OFFSET_2, CO2 lvx C4, OFFSET_3, CO2 lvx C5, OFFSET_4, CO2 vperm c00, VZERO, c09, PERMRSHIFT2 vperm c09, c09, c10, PERMRSHIFT2 vperm c10, c10, c11, PERMRSHIFT2 vperm c11, c11, c12, PERMRSHIFT2 vperm c12, c12, VZERO, PERMRSHIFT2 vaddfp c00, c00, C1 vaddfp c09, c09, C2 vaddfp c10, c10, C3 vaddfp c11, c11, C4 vaddfp c12, c12, C5 stvx c00, OFFSET_0, CO2 stvx c09, OFFSET_1, CO2 stvx c10, OFFSET_2, CO2 stvx c11, OFFSET_3, CO2 stvx c12, OFFSET_4, CO2 addi CO1, CO1, 16 * SIZE addi CO2, CO2, 16 * SIZE addic. I, I, -1 bgt+ LL(11) .align 4 LL(20): andi. I, M, 4 ble LL(30) vxor c01, c01, c01 LOAD_A a1, OFFSET_0, AO vxor c02, c02, c02 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_A a3, OFFSET_2, AO vxor c06, c06, c06 LOAD_A a4, OFFSET_3, AO vxor c09, c09, c09 LOAD_B b1, OFFSET_0, B vxor c10, c10, c10 LOAD_B b2, OFFSET_1, B vxor c13, c13, c13 vxor c14, c14, c14 mr BO, B vspltw bp1, b1, 0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(25) .align 4 LL(22): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 addi AO, AO, 16 * SIZE vmaddfp c02, a2, bp1, c02 addi BO, BO, 8 * SIZE vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 LOAD_B b1, OFFSET_0, BO vmaddfp c10, a2, bp1, c10 vmaddfp c13, a1, bp2, c13 LOAD_A a1, OFFSET_0, AO vspltw bp1, b2, 0 vmaddfp c14, a2, bp2, c14 LOAD_A a2, OFFSET_1, AO vmaddfp c01, a3, bp1, c01 vspltw bp2, b2, 1 vmaddfp c02, a4, bp1, c02 vmaddfp c05, a3, bp2, c05 vspltw bp1, b2, 2 vmaddfp c06, a4, bp2, c06 vmaddfp c09, a3, bp1, c09 vspltw bp2, b2, 3 LOAD_B b2, OFFSET_1, BO vmaddfp c10, a4, bp1, c10 vmaddfp c13, a3, bp2, c13 LOAD_A a3, OFFSET_2, AO vmaddfp c14, a4, bp2, c14 LOAD_A a4, OFFSET_3, AO vspltw bp1, b1, 0 bdnz LL(22) .align 4 LL(25): andi. 
r0, K, 1 ble+ LL(28) .align 4 LL(26): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 nop vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 nop vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 addi AO, AO, 8 * SIZE vmaddfp c13, a1, bp2, c13 addi BO, BO, 4 * SIZE vmaddfp c14, a2, bp2, c14 nop .align 4 LL(28): vxor VZERO, VZERO, VZERO lvx swap, OFFSET_0, SP lvx neg, OFFSET_1, SP lvx alpha_r, OFFSET_2, SP lvx alpha_i, OFFSET_3, SP vperm c05, c05, c05, swap vperm c06, c06, c06, swap vperm c13, c13, c13, swap vperm c14, c14, c14, swap vxor c05, c05, neg vxor c06, c06, neg vxor c13, c13, neg vxor c14, c14, neg vaddfp c01, c01, c05 vaddfp c02, c02, c06 vaddfp c09, c09, c13 vaddfp c10, c10, c14 vperm c05, c01, c01, swap vperm c06, c02, c02, swap vperm c13, c09, c09, swap vperm c14, c10, c10, swap vmaddfp c01, alpha_r, c01, VZERO vmaddfp c02, alpha_r, c02, VZERO vmaddfp c01, alpha_i, c05, c01 vmaddfp c02, alpha_i, c06, c02 vmaddfp c09, alpha_r, c09, VZERO vmaddfp c10, alpha_r, c10, VZERO vmaddfp c09, alpha_i, c13, c09 vmaddfp c10, alpha_i, c14, c10 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, VZERO, PERMRSHIFT1 vaddfp c00, c00, C1 vaddfp c01, c01, C2 vaddfp c02, c02, C3 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 lvx C3, OFFSET_2, CO2 vperm c00, VZERO, c09, PERMRSHIFT2 vperm c09, c09, c10, PERMRSHIFT2 vperm c10, c10, VZERO, PERMRSHIFT2 vaddfp c00, c00, C1 vaddfp c09, c09, C2 vaddfp c10, c10, C3 stvx c00, OFFSET_0, CO2 stvx c09, OFFSET_1, CO2 stvx c10, OFFSET_2, CO2 addi CO1, CO1, 8 * SIZE addi CO2, CO2, 8 * SIZE .align 4 LL(30): andi. I, M, 2 ble LL(40) vxor c01, c01, c01 LOAD_A a1, OFFSET_0, AO vxor c02, c02, c02 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_B b1, OFFSET_0, B vxor c06, c06, c06 LOAD_B b2, OFFSET_1, B vxor c09, c09, c09 vxor c10, c10, c10 vxor c13, c13, c13 vxor c14, c14, c14 vspltw bp1, b1, 0 mr BO, B srawi. r0, K, 1 mtspr CTR, r0 ble LL(35) .align 4 LL(32): vmaddfp c01, a1, bp1, c01 addi AO, AO, 8 * SIZE vspltw bp2, b1, 1 vmaddfp c05, a1, bp2, c05 addi BO, BO, 8 * SIZE vspltw bp1, b1, 2 vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c13, a1, bp2, c13 LOAD_A a1, OFFSET_0, AO vspltw bp1, b2, 0 LOAD_B b1, OFFSET_0, BO vmaddfp c02, a2, bp1, c02 vspltw bp2, b2, 1 vmaddfp c06, a2, bp2, c06 vspltw bp1, b2, 2 vmaddfp c10, a2, bp1, c10 vspltw bp2, b2, 3 LOAD_B b2, OFFSET_1, BO vmaddfp c14, a2, bp2, c14 LOAD_A a2, OFFSET_1, AO vspltw bp1, b1, 0 bdnz LL(32) .align 4 LL(35): andi. 
r0, K, 1 ble+ LL(38) .align 4 LL(36): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c13, a1, bp2, c13 addi AO, AO, 4 * SIZE addi BO, BO, 4 * SIZE .align 4 LL(38): vaddfp c01, c01, c02 vaddfp c05, c05, c06 vaddfp c09, c09, c10 vaddfp c13, c13, c14 vxor VZERO, VZERO, VZERO lvx swap, OFFSET_0, SP lvx neg, OFFSET_1, SP lvx alpha_r, OFFSET_2, SP lvx alpha_i, OFFSET_3, SP vperm c05, c05, c05, swap vperm c13, c13, c13, swap vxor c05, c05, neg vxor c13, c13, neg vaddfp c01, c01, c05 vaddfp c09, c09, c13 vperm c05, c01, c01, swap vperm c13, c09, c09, swap vmaddfp c01, alpha_r, c01, VZERO vmaddfp c01, alpha_i, c05, c01 vmaddfp c09, alpha_r, c09, VZERO vmaddfp c09, alpha_i, c13, c09 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, VZERO, PERMRSHIFT1 vaddfp c00, c00, C1 vaddfp c01, c01, C2 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 vperm c00, VZERO, c09, PERMRSHIFT2 vperm c09, c09, VZERO, PERMRSHIFT2 vaddfp c00, c00, C1 vaddfp c09, c09, C2 stvx c00, OFFSET_0, CO2 stvx c09, OFFSET_1, CO2 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE .align 4 LL(40): andi. I, M, 1 ble LL(49) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 0 * SIZE(BO) LFD f11, 1 * SIZE(BO) LFD f12, 2 * SIZE(BO) LFD f13, 3 * SIZE(BO) lfs f0, FZERO(SP) fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(45) .align 4 LL(42): fmadd f0, f8, f10, f0 fmadd f2, f8, f11, f2 fmadd f4, f8, f12, f4 fmadd f6, f8, f13, f6 fmadd f1, f9, f10, f1 fmadd f3, f9, f11, f3 fmadd f5, f9, f12, f5 fmadd f7, f9, f13, f7 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) fmadd f0, f8, f10, f0 fmadd f2, f8, f11, f2 fmadd f4, f8, f12, f4 fmadd f6, f8, f13, f6 fmadd f1, f9, f10, f1 fmadd f3, f9, f11, f3 fmadd f5, f9, f12, f5 fmadd f7, f9, f13, f7 LFD f8, 4 * SIZE(AO) LFD f9, 5 * SIZE(AO) LFD f10, 8 * SIZE(BO) LFD f11, 9 * SIZE(BO) LFD f12, 10 * SIZE(BO) LFD f13, 11 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(42) .align 4 LL(45): andi. 
r0, K, 1 ble LL(48) .align 4 LL(46): fmadd f0, f8, f10, f0 fmadd f2, f8, f11, f2 fmadd f4, f8, f12, f4 fmadd f6, f8, f13, f6 fmadd f1, f9, f10, f1 fmadd f3, f9, f11, f3 fmadd f5, f9, f12, f5 fmadd f7, f9, f13, f7 addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE .align 4 LL(48): #if defined(NN) || defined(NT) || defined(TN) || defined(TT) fsub f0, f0, f3 fadd f1, f1, f2 fsub f4, f4, f7 fadd f5, f5, f6 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) fadd f0, f0, f3 fsub f1, f1, f2 fadd f4, f4, f7 fsub f5, f5, f6 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) fadd f0, f0, f3 fsub f1, f2, f1 fadd f4, f4, f7 fsub f5, f6, f5 #else /* RR, RC, CR, CC */ fsub f0, f0, f3 fadd f1, f1, f2 fsub f4, f4, f7 fadd f5, f5, f6 #endif LFD f8, 0 * SIZE(CO1) LFD f9, 1 * SIZE(CO1) LFD f10, 0 * SIZE(CO2) LFD f11, 1 * SIZE(CO2) lfs f12, ALPHA_R + 0(SP) lfs f13, ALPHA_I + 4(SP) #if defined(RR) || defined(RC) || defined(CR) || defined(CC) fmadd f8, f12, f0, f8 fnmsub f9, f12, f1, f9 fmadd f10, f12, f4, f10 fnmsub f11, f12, f5, f11 fmadd f8, f13, f1, f8 fmadd f9, f13, f0, f9 fmadd f10, f13, f5, f10 fmadd f11, f13, f4, f11 #else fmadd f8, f12, f0, f8 fmadd f9, f12, f1, f9 fmadd f10, f12, f4, f10 fmadd f11, f12, f5, f11 fnmsub f8, f13, f1, f8 fmadd f9, f13, f0, f9 fnmsub f10, f13, f5, f10 fmadd f11, f13, f4, f11 #endif STFD f8, 0 * SIZE(CO1) STFD f9, 1 * SIZE(CO1) STFD f10, 0 * SIZE(CO2) STFD f11, 1 * SIZE(CO2) LL(49): mr B, BO addic. J, J, -1 bgt LL(01) .align 4 LL(50): andi. J, N, 1 ble LL(999) mr CO1, C mr AO, A srawi. I, M, 3 ble LL(70) .align 4 LL(61): vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 vxor c03, c03, c03 LOAD_A a1, OFFSET_0, AO vxor c04, c04, c04 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_A a3, OFFSET_2, AO vxor c06, c06, c06 LOAD_A a4, OFFSET_3, AO vxor c07, c07, c07 vxor c08, c08, c08 mr BO, B dcbtst CO1, PREC dcbtst CO2, PREC vspltw bp1, b1, 0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(65) .align 4 LL(62): LOAD_A a5, OFFSET_4, AO LOAD_A a6, OFFSET_5, AO LOAD_A a7, OFFSET_6, AO LOAD_A a8, OFFSET_7, AO vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 vmaddfp c07, a3, bp2, c07 vmaddfp c08, a4, bp2, c08 vmaddfp c01, a5, bp1, c01 vspltw bp2, b1, 3 vmaddfp c02, a6, bp1, c02 vmaddfp c03, a7, bp1, c03 vmaddfp c04, a8, bp1, c04 LOAD_B b1, OFFSET_1, BO vspltw bp1, b1, 0 vmaddfp c05, a5, bp2, c05 vmaddfp c06, a6, bp2, c06 vmaddfp c07, a7, bp2, c07 vmaddfp c08, a8, bp2, c08 addi AO, AO, 32 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO bdnz LL(62) .align 4 LL(65): andi. 
r0, K, 1 ble+ LL(68) .align 4 LL(66): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 addi AO, AO, 16 * SIZE vmaddfp c03, a3, bp1, c03 addi BO, BO, 2 * SIZE vmaddfp c04, a4, bp1, c04 nop vmaddfp c05, a1, bp2, c05 vmaddfp c06, a2, bp2, c06 vmaddfp c07, a3, bp2, c07 vmaddfp c08, a4, bp2, c08 .align 4 LL(68): vxor VZERO, VZERO, VZERO lvx swap, OFFSET_0, SP lvx neg, OFFSET_1, SP lvx alpha_r, OFFSET_2, SP lvx alpha_i, OFFSET_3, SP vperm c05, c05, c05, swap vperm c06, c06, c06, swap vperm c07, c07, c07, swap vperm c08, c08, c08, swap vxor c05, c05, neg vxor c06, c06, neg vxor c07, c07, neg vxor c08, c08, neg vaddfp c01, c01, c05 vaddfp c02, c02, c06 vaddfp c03, c03, c07 vaddfp c04, c04, c08 vperm c05, c01, c01, swap vperm c06, c02, c02, swap vperm c07, c03, c03, swap vperm c08, c04, c04, swap vmaddfp c01, alpha_r, c01, VZERO vmaddfp c02, alpha_r, c02, VZERO vmaddfp c03, alpha_r, c03, VZERO vmaddfp c04, alpha_r, c04, VZERO vmaddfp c01, alpha_i, c05, c01 vmaddfp c02, alpha_i, c06, c02 vmaddfp c03, alpha_i, c07, c03 vmaddfp c04, alpha_i, c08, c04 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvx C4, OFFSET_3, CO1 lvx C5, OFFSET_4, CO1 lvsr PERMRSHIFT1, 0, CO1 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, c03, PERMRSHIFT1 vperm c03, c03, c04, PERMRSHIFT1 vperm c04, c04, VZERO, PERMRSHIFT1 vaddfp c00, c00, C1 vaddfp c01, c01, C2 vaddfp c02, c02, C3 vaddfp c03, c03, C4 vaddfp c04, c04, C5 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 stvx c03, OFFSET_3, CO1 stvx c04, OFFSET_4, CO1 addi CO1, CO1, 16 * SIZE addic. I, I, -1 bgt+ LL(61) .align 4 LL(70): andi. I, M, 4 ble LL(80) vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 vxor c03, c03, c03 LOAD_A a1, OFFSET_0, AO vxor c04, c04, c04 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_A a3, OFFSET_2, AO vxor c06, c06, c06 LOAD_A a4, OFFSET_3, AO vxor c07, c07, c07 vxor c08, c08, c08 mr BO, B vspltw bp1, b1, 0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(75) .align 4 LL(72): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 vmaddfp c03, a3, bp1, c03 vspltw bp2, b1, 3 vmaddfp c04, a4, bp1, c04 LOAD_B b1, OFFSET_1, BO vspltw bp1, b1, 0 vmaddfp c07, a3, bp2, c07 vmaddfp c08, a4, bp2, c08 addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO bdnz LL(72) .align 4 LL(75): andi. 
r0, K, 1 ble+ LL(78) .align 4 LL(76): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 addi AO, AO, 8 * SIZE vmaddfp c05, a1, bp2, c05 addi BO, BO, 2 * SIZE vmaddfp c06, a2, bp2, c06 .align 4 LL(78): vaddfp c01, c01, c03 vaddfp c02, c02, c04 vaddfp c05, c05, c07 vaddfp c06, c06, c08 vxor VZERO, VZERO, VZERO lvx swap, OFFSET_0, SP lvx neg, OFFSET_1, SP lvx alpha_r, OFFSET_2, SP lvx alpha_i, OFFSET_3, SP vperm c05, c05, c05, swap vperm c06, c06, c06, swap vxor c05, c05, neg vxor c06, c06, neg vaddfp c01, c01, c05 vaddfp c02, c02, c06 vperm c05, c01, c01, swap vperm c06, c02, c02, swap vmaddfp c01, alpha_r, c01, VZERO vmaddfp c02, alpha_r, c02, VZERO vmaddfp c01, alpha_i, c05, c01 vmaddfp c02, alpha_i, c06, c02 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvsr PERMRSHIFT1, 0, CO1 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, VZERO, PERMRSHIFT1 vaddfp c00, c00, C1 vaddfp c01, c01, C2 vaddfp c02, c02, C3 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 addi CO1, CO1, 8 * SIZE .align 4 LL(80): andi. I, M, 2 ble LL(90) vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 vxor c06, c06, c06 mr BO, B vspltw bp1, b1, 0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(85) .align 4 LL(82): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c02, a2, bp1, c02 vspltw bp2, b1, 3 LOAD_B b1, OFFSET_1, BO vspltw bp1, b1, 0 vmaddfp c06, a2, bp2, c06 addi AO, AO, 8 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO bdnz LL(82) .align 4 LL(85): andi. r0, K, 1 ble+ LL(88) .align 4 LL(86): vspltw bp2, b1, 1 vmaddfp c01, a1, bp1, c01 vmaddfp c05, a1, bp2, c05 addi AO, AO, 4 * SIZE addi BO, BO, 2 * SIZE .align 4 LL(88): vaddfp c01, c01, c02 vaddfp c05, c05, c06 vaddfp c09, c09, c10 vaddfp c13, c13, c14 vxor VZERO, VZERO, VZERO lvx swap, OFFSET_0, SP lvx neg, OFFSET_1, SP lvx alpha_r, OFFSET_2, SP lvx alpha_i, OFFSET_3, SP vperm c05, c05, c05, swap vxor c05, c05, neg vaddfp c01, c01, c05 vperm c05, c01, c01, swap vmaddfp c01, alpha_r, c01, VZERO vmaddfp c01, alpha_i, c05, c01 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvsr PERMRSHIFT1, 0, CO1 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, VZERO, PERMRSHIFT1 vaddfp c00, c00, C1 vaddfp c01, c01, C2 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 addi CO1, CO1, 4 * SIZE .align 4 LL(90): andi. I, M, 1 ble LL(999) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 0 * SIZE(BO) LFD f11, 1 * SIZE(BO) LFD f12, 2 * SIZE(BO) LFD f13, 3 * SIZE(BO) lfs f0, FZERO(SP) fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(95) .align 4 LL(92): fmadd f0, f8, f10, f0 fmadd f2, f8, f11, f2 fmadd f1, f9, f10, f1 fmadd f3, f9, f11, f3 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) fmadd f0, f8, f12, f0 fmadd f2, f8, f13, f2 fmadd f1, f9, f12, f1 fmadd f3, f9, f13, f3 LFD f8, 4 * SIZE(AO) LFD f9, 5 * SIZE(AO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 4 * SIZE bdnz LL(92) .align 4 LL(95): andi. 
r0, K, 1 ble LL(98) .align 4 LL(96): fmadd f0, f8, f10, f0 fmadd f2, f8, f11, f2 fmadd f1, f9, f10, f1 fmadd f3, f9, f11, f3 .align 4 LL(98): #if defined(NN) || defined(NT) || defined(TN) || defined(TT) fsub f0, f0, f3 fadd f1, f1, f2 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) fadd f0, f0, f3 fsub f1, f1, f2 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) fadd f0, f0, f3 fsub f1, f2, f1 #else /* RR, RC, CR, CC */ fsub f0, f0, f3 fadd f1, f1, f2 #endif LFD f8, 0 * SIZE(CO1) LFD f9, 1 * SIZE(CO1) lfs f12, ALPHA_R + 0(SP) lfs f13, ALPHA_I + 4(SP) #if defined(RR) || defined(RC) || defined(CR) || defined(CC) fmadd f8, f12, f0, f8 fnmsub f9, f12, f1, f9 fmadd f8, f13, f1, f8 fmadd f9, f13, f0, f9 #else fmadd f8, f12, f0, f8 fmadd f9, f12, f1, f9 fnmsub f8, f13, f1, f8 fmadd f9, f13, f0, f9 #endif STFD f8, 0 * SIZE(CO1) STFD f9, 1 * SIZE(CO1) .align 4 LL(999): mr SP, STACK li r0, 0 * 16 lvx v20, SP, r0 li r0, 1 * 16 lvx v21, SP, r0 li r0, 2 * 16 lvx v22, SP, r0 li r0, 3 * 16 lvx v23, SP, r0 li r0, 4 * 16 lvx v24, SP, r0 li r0, 5 * 16 lvx v25, SP, r0 li r0, 6 * 16 lvx v26, SP, r0 li r0, 7 * 16 lvx v27, SP, r0 li r0, 8 * 16 lvx v28, SP, r0 li r0, 9 * 16 lvx v29, SP, r0 li r0, 10 * 16 lvx v30, SP, r0 li r0, 11 * 16 lvx v31, SP, r0 mtspr VRsave, VREG #ifdef __64BIT__ ld r31, 192(SP) ld r30, 200(SP) ld r29, 208(SP) ld r28, 216(SP) ld r27, 224(SP) ld r26, 232(SP) ld r25, 240(SP) ld r24, 248(SP) ld r23, 256(SP) ld r22, 264(SP) ld r21, 272(SP) ld r20, 280(SP) ld r19, 288(SP) ld r18, 296(SP) ld r17, 304(SP) ld r16, 312(SP) ld r15, 320(SP) ld r14, 328(SP) #else lwz r31, 192(SP) lwz r30, 196(SP) lwz r29, 200(SP) lwz r28, 204(SP) lwz r27, 208(SP) lwz r26, 212(SP) lwz r25, 216(SP) lwz r24, 220(SP) lwz r23, 224(SP) lwz r22, 228(SP) lwz r21, 232(SP) lwz r20, 236(SP) lwz r19, 240(SP) lwz r18, 244(SP) lwz r17, 248(SP) lwz r16, 252(SP) lwz r15, 256(SP) lwz r14, 260(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/zgemm_kernel_altivec_cell.S000066400000000000000000001042751313527062700231020ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 360 #else #define STACKSIZE 272 #endif #define ALIGN_SIZE 0xffff #define SWAP 0 #define NEG 16 #define ALPHA_R 32 #define ALPHA_I 48 #define FZERO 64 #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #endif #endif #define STACK r11 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define PREA r29 #define PREB r29 #define PREC r30 #define VREG r31 #define LOAD_A lvx #define LOAD_B lvx #define OFFSET_0 0 #define OFFSET_1 r14 #define OFFSET_2 r15 #define OFFSET_3 r16 #define OFFSET_4 r17 #define OFFSET_5 r18 #define OFFSET_6 r19 #define OFFSET_7 r20 #define c01 v0 #define c02 v1 #define c03 v2 #define c04 v3 #define c05 v4 #define c06 v5 #define c07 v6 #define c08 v7 #define c09 v8 #define c10 v9 #define c11 v10 #define c12 v11 #define c13 v12 #define c14 v13 #define c15 v14 #define c16 v15 #define a1 v16 #define a2 v17 #define a3 v18 #define a4 v19 #define a5 v20 #define a6 v21 #define a7 v22 #define a8 v23 #define b1 v24 #define b2 v25 #define bp1 v26 #define bp2 v27 #define C1 v16 #define C2 v17 #define C3 v18 #define C4 v19 #define C5 v20 #define c00 v24 #define VZERO v25 #define PERMRSHIFT1 v26 #define PERMRSHIFT2 v27 #define swap v28 #define neg v29 #define alpha_r v30 #define alpha_i v31 #ifndef NEEDPARAM #ifndef DOUBLE #include "../cparam.h" #else #include "../zparam.h" #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE mr STACK, SP li r0, 0 * 16 stvx v20, SP, r0 li r0, 1 * 16 stvx v21, SP, r0 li r0, 2 * 16 stvx v22, SP, r0 li r0, 3 * 16 stvx v23, SP, r0 li r0, 4 * 16 stvx v24, SP, r0 li r0, 5 * 16 stvx v25, SP, r0 li r0, 6 * 16 stvx v26, SP, r0 li r0, 7 * 16 stvx v27, SP, r0 li r0, 8 * 16 stvx v28, SP, r0 li r0, 9 * 16 stvx v29, SP, r0 li r0, 10 * 16 stvx v30, SP, r0 li r0, 11 * 16 stvx v31, SP, r0 #ifdef __64BIT__ std r31, 192(SP) std r30, 200(SP) std r29, 208(SP) std r28, 216(SP) std r27, 224(SP) std r26, 232(SP) std r25, 240(SP) std r24, 248(SP) std r23, 256(SP) std r22, 264(SP) std r21, 272(SP) std r20, 280(SP) std r19, 288(SP) std r18, 296(SP) std r17, 304(SP) std r16, 312(SP) std r15, 320(SP) std r14, 328(SP) #else stw r31, 192(SP) stw r30, 196(SP) stw r29, 200(SP) 
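/* 32-bit path: spill the full callee-saved GPR range r14-r31 up front;
   most of these registers are repurposed below as vector load offsets
   (r14-r20), loop counters and matrix/column pointers (r21-r26), and
   prefetch/VRsave scratch (r29-r31). */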
stw r28, 204(SP) stw r27, 208(SP) stw r26, 212(SP) stw r25, 216(SP) stw r24, 220(SP) stw r23, 224(SP) stw r22, 228(SP) stw r21, 232(SP) stw r20, 236(SP) stw r19, 240(SP) stw r18, 244(SP) stw r17, 248(SP) stw r16, 252(SP) stw r15, 256(SP) stw r14, 260(SP) #endif #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifndef PREFETCHTEST #ifdef PPC970 li PREC, 16 * SIZE #endif #else #ifdef linux #ifndef __64BIT__ lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else ld PREB, FRAMESLOT(3) + STACKSIZE(SP) ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld PREB, FRAMESLOT(3) + STACKSIZE(SP) ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #else #ifdef DOUBLE lwz PREB, FRAMESLOT(4) + STACKSIZE(SP) lwz PREC, FRAMESLOT(5) + STACKSIZE(SP) #else lwz PREB, FRAMESLOT(3) + STACKSIZE(SP) lwz PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #endif #endif #ifndef PREFETCHTEST #ifdef CELL li PREB, (3 * 32 * SIZE) #else li PREB, (5 * 32 * SIZE) #endif #endif li r0, -1 mfspr VREG, VRsave mtspr VRsave, r0 addi SP, SP, -128 li r0, -8192 and SP, SP, r0 fneg f3, f1 fneg f4, f2 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NC) || defined(TC) || defined(NR) || defined(TR) stfs f1, ALPHA_R + 0(SP) stfs f1, ALPHA_R + 4(SP) stfs f1, ALPHA_R + 8(SP) stfs f1, ALPHA_R + 12(SP) stfs f4, ALPHA_I + 0(SP) stfs f2, ALPHA_I + 4(SP) stfs f4, ALPHA_I + 8(SP) stfs f2, ALPHA_I + 12(SP) #else stfs f1, ALPHA_R + 0(SP) stfs f3, ALPHA_R + 4(SP) stfs f1, ALPHA_R + 8(SP) stfs f3, ALPHA_R + 12(SP) stfs f2, ALPHA_I + 0(SP) stfs f2, ALPHA_I + 4(SP) stfs f2, ALPHA_I + 8(SP) stfs f2, ALPHA_I + 12(SP) #endif li I, Address_L(0x04050607) addis I, I, Address_H(0x04050607) stw I, SWAP + 0(SP) li I, Address_L(0x00010203) addis I, I, Address_H(0x00010203) stw I, SWAP + 4(SP) li I, Address_L(0x0c0d0e0f) addis I, I, Address_H(0x0c0d0e0f) stw I, SWAP + 8(SP) li I, Address_L(0x08090a0b) addis I, I, Address_H(0x08090a0b) stw I, SWAP + 12(SP) #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) lis I, 0x8000 stw I, NEG + 0(SP) stw I, NEG + 8(SP) li I, 0 stw I, NEG + 4(SP) stw I, NEG + 12(SP) #else li I, 0 stw I, NEG + 0(SP) stw I, NEG + 8(SP) lis I, 0x8000 stw I, NEG + 4(SP) stw I, NEG + 12(SP) #endif li r0, 0 stw r0, FZERO(SP) slwi LDC, LDC, ZBASE_SHIFT li OFFSET_1, 4 * SIZE li OFFSET_2, 8 * SIZE li OFFSET_3, 12 * SIZE li OFFSET_4, 16 * SIZE li OFFSET_5, 20 * SIZE li OFFSET_6, 24 * SIZE li OFFSET_7, 28 * SIZE cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) srawi. J, N, 1 ble LL(50) .align 4 LL(01): mr CO1, C add CO2, C, LDC add C, CO2, LDC mr AO, A srawi. I, M, 3 ble LL(20) .align 4 LL(11): vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 LOAD_A a1, OFFSET_0, AO vxor c03, c03, c03 LOAD_A a2, OFFSET_1, AO vxor c04, c04, c04 LOAD_A a3, OFFSET_2, AO vxor c04, c04, c04 vxor c05, c05, c05 vxor c06, c06, c06 vxor c07, c07, c07 vxor c08, c08, c08 vxor c09, c09, c09 dcbtst CO1, PREC vxor c10, c10, c10 dcbtst CO2, PREC vxor c11, c11, c11 vxor c12, c12, c12 vxor c13, c13, c13 mr BO, B vxor c14, c14, c14 srawi. 
r0, K, 2 vxor c15, c15, c15 mtspr CTR, r0 vxor c16, c16, c16 vspltw bp1, b1, 0 ble LL(13) .align 4 #define NOP1 mr r3, r3 #define NOP2 mr r4, r4 LL(12): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 LOAD_A a4, OFFSET_3, AO vmaddfp c03, a3, bp1, c03 dcbt AO, PREA vmaddfp c04, a4, bp1, c04 NOP2 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 NOP2 vmaddfp c07, a3, bp2, c07 NOP1 vmaddfp c08, a4, bp2, c08 dcbt BO, PREB vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 LOAD_B b2, OFFSET_1, BO vmaddfp c11, a3, bp1, c11 addi BO, BO, 8 * SIZE vmaddfp c12, a4, bp1, c12 NOP1 vmaddfp c13, a1, bp2, c13 vspltw bp1, b2, 0 vmaddfp c14, a2, bp2, c14 LOAD_A a5, OFFSET_4, AO vmaddfp c15, a3, bp2, c15 LOAD_A a6, OFFSET_5, AO vmaddfp c16, a4, bp2, c16 vspltw bp2, b2, 1 vmaddfp c01, a5, bp1, c01 LOAD_A a7, OFFSET_6, AO vmaddfp c02, a6, bp1, c02 LOAD_A a8, OFFSET_7, AO vmaddfp c03, a7, bp1, c03 NOP1 vmaddfp c04, a8, bp1, c04 NOP2 vmaddfp c05, a5, bp2, c05 vspltw bp1, b2, 2 vmaddfp c06, a6, bp2, c06 addi AO, AO, 32 * SIZE vmaddfp c07, a7, bp2, c07 LOAD_B b1, OFFSET_0, BO vmaddfp c08, a8, bp2, c08 NOP1 vmaddfp c09, a5, bp1, c09 vspltw bp2, b2, 3 vmaddfp c10, a6, bp1, c10 NOP2 vmaddfp c11, a7, bp1, c11 NOP1 vmaddfp c12, a8, bp1, c12 dcbt AO, PREA vmaddfp c13, a5, bp2, c13 vspltw bp1, b1, 0 vmaddfp c14, a6, bp2, c14 LOAD_A a1, OFFSET_0, AO // vmaddfp c15, a7, bp2, c15 LOAD_A a2, OFFSET_1, AO vmaddfp c16, a8, bp2, c16 vspltw bp2, b1, 1 vmaddfp c01, a1, bp1, c01 LOAD_A a3, OFFSET_2, AO vmaddfp c02, a2, bp1, c02 LOAD_A a4, OFFSET_3, AO vmaddfp c03, a3, bp1, c03 NOP1 vmaddfp c04, a4, bp1, c04 NOP2 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 NOP2 vmaddfp c07, a3, bp2, c07 NOP1 vmaddfp c08, a4, bp2, c08 LOAD_B b2, OFFSET_1, BO vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 NOP2 vmaddfp c11, a3, bp1, c11 NOP1 vmaddfp c12, a4, bp1, c12 addi BO, BO, 8 * SIZE vmaddfp c13, a1, bp2, c13 vspltw bp1, b2, 0 vmaddfp c14, a2, bp2, c14 LOAD_A a5, OFFSET_4, AO vmaddfp c15, a3, bp2, c15 LOAD_A a6, OFFSET_5, AO vmaddfp c16, a4, bp2, c16 vspltw bp2, b2, 1 vmaddfp c01, a5, bp1, c01 LOAD_A a7, OFFSET_6, AO vmaddfp c02, a6, bp1, c02 LOAD_A a8, OFFSET_7, AO vmaddfp c03, a7, bp1, c03 addi AO, AO, 32 * SIZE vmaddfp c04, a8, bp1, c04 NOP2 vmaddfp c05, a5, bp2, c05 vspltw bp1, b2, 2 vmaddfp c06, a6, bp2, c06 NOP2 vmaddfp c07, a7, bp2, c07 NOP1 vmaddfp c08, a8, bp2, c08 LOAD_B b1, OFFSET_0, BO vmaddfp c09, a5, bp1, c09 vspltw bp2, b2, 3 vmaddfp c10, a6, bp1, c10 LOAD_A a1, OFFSET_0, AO // vmaddfp c11, a7, bp1, c11 NOP2 vmaddfp c12, a8, bp1, c12 vspltw bp1, b1, 0 vmaddfp c13, a5, bp2, c13 LOAD_A a2, OFFSET_1, AO vmaddfp c14, a6, bp2, c14 LOAD_A a3, OFFSET_2, AO vmaddfp c15, a7, bp2, c15 NOP1 vmaddfp c16, a8, bp2, c16 bdnz+ LL(12) .align 4 LL(13): andi. 
r0, K, 2 nop nop ble+ LL(15) .align 4 vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 LOAD_A a4, OFFSET_3, AO vmaddfp c03, a3, bp1, c03 NOP1 vmaddfp c04, a4, bp1, c04 NOP2 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 NOP2 vmaddfp c07, a3, bp2, c07 NOP1 vmaddfp c08, a4, bp2, c08 LOAD_B b2, OFFSET_1, BO vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 LOAD_A a5, OFFSET_4, AO vmaddfp c11, a3, bp1, c11 LOAD_A a6, OFFSET_5, AO vmaddfp c12, a4, bp1, c12 addi BO, BO, 8 * SIZE vmaddfp c13, a1, bp2, c13 vspltw bp1, b2, 0 vmaddfp c14, a2, bp2, c14 LOAD_A a7, OFFSET_6, AO vmaddfp c15, a3, bp2, c15 LOAD_A a8, OFFSET_7, AO vmaddfp c16, a4, bp2, c16 addi AO, AO, 32 * SIZE vmaddfp c01, a5, bp1, c01 vspltw bp2, b2, 1 vmaddfp c02, a6, bp1, c02 NOP2 vmaddfp c03, a7, bp1, c03 NOP1 vmaddfp c04, a8, bp1, c04 NOP2 vmaddfp c05, a5, bp2, c05 vspltw bp1, b2, 2 vmaddfp c06, a6, bp2, c06 NOP2 vmaddfp c07, a7, bp2, c07 NOP1 vmaddfp c08, a8, bp2, c08 LOAD_B b1, OFFSET_0, BO vmaddfp c09, a5, bp1, c09 vspltw bp2, b2, 3 vmaddfp c10, a6, bp1, c10 LOAD_A a1, OFFSET_0, AO vmaddfp c11, a7, bp1, c11 LOAD_A a2, OFFSET_1, AO vmaddfp c12, a8, bp1, c12 NOP2 vmaddfp c13, a5, bp2, c13 vspltw bp1, b1, 0 vmaddfp c14, a6, bp2, c14 LOAD_A a3, OFFSET_2, AO vmaddfp c15, a7, bp2, c15 vmaddfp c16, a8, bp2, c16 .align 4 LL(15): andi. r0, K, 1 vxor VZERO, VZERO, VZERO ble+ LL(18) .align 4 vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 LOAD_A a4, OFFSET_3, AO vmaddfp c03, a3, bp1, c03 nop vmaddfp c04, a4, bp1, c04 nop vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 nop vmaddfp c07, a3, bp2, c07 nop vmaddfp c08, a4, bp2, c08 nop vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 addi AO, AO, 16 * SIZE vmaddfp c11, a3, bp1, c11 addi BO, BO, 4 * SIZE vmaddfp c12, a4, bp1, c12 nop vmaddfp c13, a1, bp2, c13 vmaddfp c14, a2, bp2, c14 vmaddfp c15, a3, bp2, c15 vmaddfp c16, a4, bp2, c16 .align 4 LL(18): lvx swap, OFFSET_0, SP lvx neg, OFFSET_1, SP lvx alpha_r, OFFSET_2, SP lvx alpha_i, OFFSET_3, SP vxor VZERO, VZERO, VZERO vperm c05, c05, c05, swap vperm c06, c06, c06, swap vperm c07, c07, c07, swap vperm c08, c08, c08, swap vperm c13, c13, c13, swap vperm c14, c14, c14, swap vperm c15, c15, c15, swap vperm c16, c16, c16, swap vxor c05, c05, neg vxor c06, c06, neg vxor c07, c07, neg vxor c08, c08, neg vxor c13, c13, neg vxor c14, c14, neg vxor c15, c15, neg vxor c16, c16, neg vaddfp c01, c01, c05 vaddfp c02, c02, c06 vaddfp c03, c03, c07 vaddfp c04, c04, c08 vaddfp c09, c09, c13 vaddfp c10, c10, c14 vaddfp c11, c11, c15 vaddfp c12, c12, c16 vperm c05, c01, c01, swap vperm c06, c02, c02, swap vperm c07, c03, c03, swap vperm c08, c04, c04, swap vperm c13, c09, c09, swap vperm c14, c10, c10, swap vperm c15, c11, c11, swap vperm c16, c12, c12, swap vmaddfp c01, alpha_r, c01, VZERO vmaddfp c02, alpha_r, c02, VZERO vmaddfp c03, alpha_r, c03, VZERO vmaddfp c04, alpha_r, c04, VZERO vmaddfp c01, alpha_i, c05, c01 vmaddfp c02, alpha_i, c06, c02 vmaddfp c03, alpha_i, c07, c03 vmaddfp c04, alpha_i, c08, c04 vmaddfp c09, alpha_r, c09, VZERO vmaddfp c10, alpha_r, c10, VZERO vmaddfp c11, alpha_r, c11, VZERO vmaddfp c12, alpha_r, c12, VZERO vmaddfp c09, alpha_i, c13, c09 vmaddfp c10, alpha_i, c14, c10 vmaddfp c11, alpha_i, c15, c11 vmaddfp c12, alpha_i, c16, c12 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvx C4, OFFSET_3, CO1 lvx C5, OFFSET_4, CO1 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 vperm c00, VZERO, 
c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, c03, PERMRSHIFT1 vperm c03, c03, c04, PERMRSHIFT1 vperm c04, c04, VZERO, PERMRSHIFT1 vaddfp c00, c00, C1 vaddfp c01, c01, C2 vaddfp c02, c02, C3 vaddfp c03, c03, C4 vaddfp c04, c04, C5 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 stvx c03, OFFSET_3, CO1 stvx c04, OFFSET_4, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 lvx C3, OFFSET_2, CO2 lvx C4, OFFSET_3, CO2 lvx C5, OFFSET_4, CO2 vperm c00, VZERO, c09, PERMRSHIFT2 vperm c09, c09, c10, PERMRSHIFT2 vperm c10, c10, c11, PERMRSHIFT2 vperm c11, c11, c12, PERMRSHIFT2 vperm c12, c12, VZERO, PERMRSHIFT2 vaddfp c00, c00, C1 vaddfp c09, c09, C2 vaddfp c10, c10, C3 vaddfp c11, c11, C4 vaddfp c12, c12, C5 stvx c00, OFFSET_0, CO2 stvx c09, OFFSET_1, CO2 stvx c10, OFFSET_2, CO2 stvx c11, OFFSET_3, CO2 stvx c12, OFFSET_4, CO2 addi CO1, CO1, 16 * SIZE addi CO2, CO2, 16 * SIZE addic. I, I, -1 bgt+ LL(11) .align 4 LL(20): andi. I, M, 4 ble LL(30) vxor c01, c01, c01 LOAD_A a1, OFFSET_0, AO vxor c02, c02, c02 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_A a3, OFFSET_2, AO vxor c06, c06, c06 LOAD_A a4, OFFSET_3, AO vxor c09, c09, c09 LOAD_B b1, OFFSET_0, B vxor c10, c10, c10 LOAD_B b2, OFFSET_1, B vxor c13, c13, c13 vxor c14, c14, c14 mr BO, B vspltw bp1, b1, 0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(25) .align 4 LL(22): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 addi AO, AO, 16 * SIZE vmaddfp c02, a2, bp1, c02 addi BO, BO, 8 * SIZE vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 LOAD_B b1, OFFSET_0, BO vmaddfp c10, a2, bp1, c10 vmaddfp c13, a1, bp2, c13 LOAD_A a1, OFFSET_0, AO vspltw bp1, b2, 0 vmaddfp c14, a2, bp2, c14 LOAD_A a2, OFFSET_1, AO vmaddfp c01, a3, bp1, c01 vspltw bp2, b2, 1 vmaddfp c02, a4, bp1, c02 vmaddfp c05, a3, bp2, c05 vspltw bp1, b2, 2 vmaddfp c06, a4, bp2, c06 vmaddfp c09, a3, bp1, c09 vspltw bp2, b2, 3 LOAD_B b2, OFFSET_1, BO vmaddfp c10, a4, bp1, c10 vmaddfp c13, a3, bp2, c13 LOAD_A a3, OFFSET_2, AO vmaddfp c14, a4, bp2, c14 LOAD_A a4, OFFSET_3, AO vspltw bp1, b1, 0 bdnz LL(22) .align 4 LL(25): andi. 
r0, K, 1 ble+ LL(28) .align 4 LL(26): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 nop vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 nop vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 addi AO, AO, 8 * SIZE vmaddfp c13, a1, bp2, c13 addi BO, BO, 4 * SIZE vmaddfp c14, a2, bp2, c14 nop .align 4 LL(28): vxor VZERO, VZERO, VZERO lvx swap, OFFSET_0, SP lvx neg, OFFSET_1, SP lvx alpha_r, OFFSET_2, SP lvx alpha_i, OFFSET_3, SP vperm c05, c05, c05, swap vperm c06, c06, c06, swap vperm c13, c13, c13, swap vperm c14, c14, c14, swap vxor c05, c05, neg vxor c06, c06, neg vxor c13, c13, neg vxor c14, c14, neg vaddfp c01, c01, c05 vaddfp c02, c02, c06 vaddfp c09, c09, c13 vaddfp c10, c10, c14 vperm c05, c01, c01, swap vperm c06, c02, c02, swap vperm c13, c09, c09, swap vperm c14, c10, c10, swap vmaddfp c01, alpha_r, c01, VZERO vmaddfp c02, alpha_r, c02, VZERO vmaddfp c01, alpha_i, c05, c01 vmaddfp c02, alpha_i, c06, c02 vmaddfp c09, alpha_r, c09, VZERO vmaddfp c10, alpha_r, c10, VZERO vmaddfp c09, alpha_i, c13, c09 vmaddfp c10, alpha_i, c14, c10 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, VZERO, PERMRSHIFT1 vaddfp c00, c00, C1 vaddfp c01, c01, C2 vaddfp c02, c02, C3 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 lvx C3, OFFSET_2, CO2 vperm c00, VZERO, c09, PERMRSHIFT2 vperm c09, c09, c10, PERMRSHIFT2 vperm c10, c10, VZERO, PERMRSHIFT2 vaddfp c00, c00, C1 vaddfp c09, c09, C2 vaddfp c10, c10, C3 stvx c00, OFFSET_0, CO2 stvx c09, OFFSET_1, CO2 stvx c10, OFFSET_2, CO2 addi CO1, CO1, 8 * SIZE addi CO2, CO2, 8 * SIZE .align 4 LL(30): andi. I, M, 2 ble LL(40) vxor c01, c01, c01 LOAD_A a1, OFFSET_0, AO vxor c02, c02, c02 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_B b1, OFFSET_0, B vxor c06, c06, c06 LOAD_B b2, OFFSET_1, B vxor c09, c09, c09 vxor c10, c10, c10 vxor c13, c13, c13 vxor c14, c14, c14 vspltw bp1, b1, 0 mr BO, B srawi. r0, K, 1 mtspr CTR, r0 ble LL(35) .align 4 LL(32): vmaddfp c01, a1, bp1, c01 addi AO, AO, 8 * SIZE vspltw bp2, b1, 1 vmaddfp c05, a1, bp2, c05 addi BO, BO, 8 * SIZE vspltw bp1, b1, 2 vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c13, a1, bp2, c13 LOAD_A a1, OFFSET_0, AO vspltw bp1, b2, 0 LOAD_B b1, OFFSET_0, BO vmaddfp c02, a2, bp1, c02 vspltw bp2, b2, 1 vmaddfp c06, a2, bp2, c06 vspltw bp1, b2, 2 vmaddfp c10, a2, bp1, c10 vspltw bp2, b2, 3 LOAD_B b2, OFFSET_1, BO vmaddfp c14, a2, bp2, c14 LOAD_A a2, OFFSET_1, AO vspltw bp1, b1, 0 bdnz LL(32) .align 4 LL(35): andi. 
r0, K, 1 ble+ LL(38) .align 4 LL(36): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c13, a1, bp2, c13 addi AO, AO, 4 * SIZE addi BO, BO, 4 * SIZE .align 4 LL(38): vaddfp c01, c01, c02 vaddfp c05, c05, c06 vaddfp c09, c09, c10 vaddfp c13, c13, c14 vxor VZERO, VZERO, VZERO lvx swap, OFFSET_0, SP lvx neg, OFFSET_1, SP lvx alpha_r, OFFSET_2, SP lvx alpha_i, OFFSET_3, SP vperm c05, c05, c05, swap vperm c13, c13, c13, swap vxor c05, c05, neg vxor c13, c13, neg vaddfp c01, c01, c05 vaddfp c09, c09, c13 vperm c05, c01, c01, swap vperm c13, c09, c09, swap vmaddfp c01, alpha_r, c01, VZERO vmaddfp c01, alpha_i, c05, c01 vmaddfp c09, alpha_r, c09, VZERO vmaddfp c09, alpha_i, c13, c09 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, VZERO, PERMRSHIFT1 vaddfp c00, c00, C1 vaddfp c01, c01, C2 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 vperm c00, VZERO, c09, PERMRSHIFT2 vperm c09, c09, VZERO, PERMRSHIFT2 vaddfp c00, c00, C1 vaddfp c09, c09, C2 stvx c00, OFFSET_0, CO2 stvx c09, OFFSET_1, CO2 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE .align 4 LL(40): andi. I, M, 1 ble LL(49) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 0 * SIZE(BO) LFD f11, 1 * SIZE(BO) LFD f12, 2 * SIZE(BO) LFD f13, 3 * SIZE(BO) lfs f0, FZERO(SP) fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(45) .align 4 LL(42): fmadd f0, f8, f10, f0 fmadd f2, f8, f11, f2 fmadd f4, f8, f12, f4 fmadd f6, f8, f13, f6 fmadd f1, f9, f10, f1 fmadd f3, f9, f11, f3 fmadd f5, f9, f12, f5 fmadd f7, f9, f13, f7 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) fmadd f0, f8, f10, f0 fmadd f2, f8, f11, f2 fmadd f4, f8, f12, f4 fmadd f6, f8, f13, f6 fmadd f1, f9, f10, f1 fmadd f3, f9, f11, f3 fmadd f5, f9, f12, f5 fmadd f7, f9, f13, f7 LFD f8, 4 * SIZE(AO) LFD f9, 5 * SIZE(AO) LFD f10, 8 * SIZE(BO) LFD f11, 9 * SIZE(BO) LFD f12, 10 * SIZE(BO) LFD f13, 11 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(42) .align 4 LL(45): andi. 
r0, K, 1 ble LL(48) .align 4 LL(46): fmadd f0, f8, f10, f0 fmadd f2, f8, f11, f2 fmadd f4, f8, f12, f4 fmadd f6, f8, f13, f6 fmadd f1, f9, f10, f1 fmadd f3, f9, f11, f3 fmadd f5, f9, f12, f5 fmadd f7, f9, f13, f7 addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE .align 4 LL(48): #if defined(NN) || defined(NT) || defined(TN) || defined(TT) fsub f0, f0, f3 fadd f1, f1, f2 fsub f4, f4, f7 fadd f5, f5, f6 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) fadd f0, f0, f3 fsub f1, f1, f2 fadd f4, f4, f7 fsub f5, f5, f6 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) fadd f0, f0, f3 fsub f1, f2, f1 fadd f4, f4, f7 fsub f5, f6, f5 #else /* RR, RC, CR, CC */ fsub f0, f0, f3 fadd f1, f1, f2 fsub f4, f4, f7 fadd f5, f5, f6 #endif LFD f8, 0 * SIZE(CO1) LFD f9, 1 * SIZE(CO1) LFD f10, 0 * SIZE(CO2) LFD f11, 1 * SIZE(CO2) lfs f12, ALPHA_R + 0(SP) lfs f13, ALPHA_I + 4(SP) #if defined(RR) || defined(RC) || defined(CR) || defined(CC) fmadd f8, f12, f0, f8 fnmsub f9, f12, f1, f9 fmadd f10, f12, f4, f10 fnmsub f11, f12, f5, f11 fmadd f8, f13, f1, f8 fmadd f9, f13, f0, f9 fmadd f10, f13, f5, f10 fmadd f11, f13, f4, f11 #else fmadd f8, f12, f0, f8 fmadd f9, f12, f1, f9 fmadd f10, f12, f4, f10 fmadd f11, f12, f5, f11 fnmsub f8, f13, f1, f8 fmadd f9, f13, f0, f9 fnmsub f10, f13, f5, f10 fmadd f11, f13, f4, f11 #endif STFD f8, 0 * SIZE(CO1) STFD f9, 1 * SIZE(CO1) STFD f10, 0 * SIZE(CO2) STFD f11, 1 * SIZE(CO2) LL(49): mr B, BO addic. J, J, -1 bgt LL(01) .align 4 LL(50): andi. J, N, 1 ble LL(999) mr CO1, C mr AO, A srawi. I, M, 3 ble LL(70) .align 4 LL(61): vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 vxor c03, c03, c03 LOAD_A a1, OFFSET_0, AO vxor c04, c04, c04 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_A a3, OFFSET_2, AO vxor c06, c06, c06 LOAD_A a4, OFFSET_3, AO vxor c07, c07, c07 vxor c08, c08, c08 mr BO, B dcbtst CO1, PREC dcbtst CO2, PREC vspltw bp1, b1, 0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(65) .align 4 LL(62): LOAD_A a5, OFFSET_4, AO LOAD_A a6, OFFSET_5, AO LOAD_A a7, OFFSET_6, AO LOAD_A a8, OFFSET_7, AO vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 vmaddfp c07, a3, bp2, c07 vmaddfp c08, a4, bp2, c08 vmaddfp c01, a5, bp1, c01 vspltw bp2, b1, 3 vmaddfp c02, a6, bp1, c02 vmaddfp c03, a7, bp1, c03 vmaddfp c04, a8, bp1, c04 LOAD_B b1, OFFSET_1, BO vspltw bp1, b1, 0 vmaddfp c05, a5, bp2, c05 vmaddfp c06, a6, bp2, c06 vmaddfp c07, a7, bp2, c07 vmaddfp c08, a8, bp2, c08 addi AO, AO, 32 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO bdnz LL(62) .align 4 LL(65): andi. 
r0, K, 1 ble+ LL(68) .align 4 LL(66): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 addi AO, AO, 16 * SIZE vmaddfp c03, a3, bp1, c03 addi BO, BO, 2 * SIZE vmaddfp c04, a4, bp1, c04 nop vmaddfp c05, a1, bp2, c05 vmaddfp c06, a2, bp2, c06 vmaddfp c07, a3, bp2, c07 vmaddfp c08, a4, bp2, c08 .align 4 LL(68): vxor VZERO, VZERO, VZERO lvx swap, OFFSET_0, SP lvx neg, OFFSET_1, SP lvx alpha_r, OFFSET_2, SP lvx alpha_i, OFFSET_3, SP vperm c05, c05, c05, swap vperm c06, c06, c06, swap vperm c07, c07, c07, swap vperm c08, c08, c08, swap vxor c05, c05, neg vxor c06, c06, neg vxor c07, c07, neg vxor c08, c08, neg vaddfp c01, c01, c05 vaddfp c02, c02, c06 vaddfp c03, c03, c07 vaddfp c04, c04, c08 vperm c05, c01, c01, swap vperm c06, c02, c02, swap vperm c07, c03, c03, swap vperm c08, c04, c04, swap vmaddfp c01, alpha_r, c01, VZERO vmaddfp c02, alpha_r, c02, VZERO vmaddfp c03, alpha_r, c03, VZERO vmaddfp c04, alpha_r, c04, VZERO vmaddfp c01, alpha_i, c05, c01 vmaddfp c02, alpha_i, c06, c02 vmaddfp c03, alpha_i, c07, c03 vmaddfp c04, alpha_i, c08, c04 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvx C4, OFFSET_3, CO1 lvx C5, OFFSET_4, CO1 lvsr PERMRSHIFT1, 0, CO1 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, c03, PERMRSHIFT1 vperm c03, c03, c04, PERMRSHIFT1 vperm c04, c04, VZERO, PERMRSHIFT1 vaddfp c00, c00, C1 vaddfp c01, c01, C2 vaddfp c02, c02, C3 vaddfp c03, c03, C4 vaddfp c04, c04, C5 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 stvx c03, OFFSET_3, CO1 stvx c04, OFFSET_4, CO1 addi CO1, CO1, 16 * SIZE addic. I, I, -1 bgt+ LL(61) .align 4 LL(70): andi. I, M, 4 ble LL(80) vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 vxor c03, c03, c03 LOAD_A a1, OFFSET_0, AO vxor c04, c04, c04 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_A a3, OFFSET_2, AO vxor c06, c06, c06 LOAD_A a4, OFFSET_3, AO vxor c07, c07, c07 vxor c08, c08, c08 mr BO, B vspltw bp1, b1, 0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(75) .align 4 LL(72): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 vmaddfp c03, a3, bp1, c03 vspltw bp2, b1, 3 vmaddfp c04, a4, bp1, c04 LOAD_B b1, OFFSET_1, BO vspltw bp1, b1, 0 vmaddfp c07, a3, bp2, c07 vmaddfp c08, a4, bp2, c08 addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO bdnz LL(72) .align 4 LL(75): andi. 
r0, K, 1 ble+ LL(78) .align 4 LL(76): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 addi AO, AO, 8 * SIZE vmaddfp c05, a1, bp2, c05 addi BO, BO, 2 * SIZE vmaddfp c06, a2, bp2, c06 .align 4 LL(78): vaddfp c01, c01, c03 vaddfp c02, c02, c04 vaddfp c05, c05, c07 vaddfp c06, c06, c08 vxor VZERO, VZERO, VZERO lvx swap, OFFSET_0, SP lvx neg, OFFSET_1, SP lvx alpha_r, OFFSET_2, SP lvx alpha_i, OFFSET_3, SP vperm c05, c05, c05, swap vperm c06, c06, c06, swap vxor c05, c05, neg vxor c06, c06, neg vaddfp c01, c01, c05 vaddfp c02, c02, c06 vperm c05, c01, c01, swap vperm c06, c02, c02, swap vmaddfp c01, alpha_r, c01, VZERO vmaddfp c02, alpha_r, c02, VZERO vmaddfp c01, alpha_i, c05, c01 vmaddfp c02, alpha_i, c06, c02 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvsr PERMRSHIFT1, 0, CO1 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, VZERO, PERMRSHIFT1 vaddfp c00, c00, C1 vaddfp c01, c01, C2 vaddfp c02, c02, C3 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 addi CO1, CO1, 8 * SIZE .align 4 LL(80): andi. I, M, 2 ble LL(90) vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 vxor c06, c06, c06 mr BO, B vspltw bp1, b1, 0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(85) .align 4 LL(82): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c02, a2, bp1, c02 vspltw bp2, b1, 3 LOAD_B b1, OFFSET_1, BO vspltw bp1, b1, 0 vmaddfp c06, a2, bp2, c06 addi AO, AO, 8 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO bdnz LL(82) .align 4 LL(85): andi. r0, K, 1 ble+ LL(88) .align 4 LL(86): vspltw bp2, b1, 1 vmaddfp c01, a1, bp1, c01 vmaddfp c05, a1, bp2, c05 addi AO, AO, 4 * SIZE addi BO, BO, 2 * SIZE .align 4 LL(88): vaddfp c01, c01, c02 vaddfp c05, c05, c06 vaddfp c09, c09, c10 vaddfp c13, c13, c14 vxor VZERO, VZERO, VZERO lvx swap, OFFSET_0, SP lvx neg, OFFSET_1, SP lvx alpha_r, OFFSET_2, SP lvx alpha_i, OFFSET_3, SP vperm c05, c05, c05, swap vxor c05, c05, neg vaddfp c01, c01, c05 vperm c05, c01, c01, swap vmaddfp c01, alpha_r, c01, VZERO vmaddfp c01, alpha_i, c05, c01 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvsr PERMRSHIFT1, 0, CO1 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, VZERO, PERMRSHIFT1 vaddfp c00, c00, C1 vaddfp c01, c01, C2 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 addi CO1, CO1, 4 * SIZE .align 4 LL(90): andi. I, M, 1 ble LL(999) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 0 * SIZE(BO) LFD f11, 1 * SIZE(BO) LFD f12, 2 * SIZE(BO) LFD f13, 3 * SIZE(BO) lfs f0, FZERO(SP) fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(95) .align 4 LL(92): fmadd f0, f8, f10, f0 fmadd f2, f8, f11, f2 fmadd f1, f9, f10, f1 fmadd f3, f9, f11, f3 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) fmadd f0, f8, f12, f0 fmadd f2, f8, f13, f2 fmadd f1, f9, f12, f1 fmadd f3, f9, f13, f3 LFD f8, 4 * SIZE(AO) LFD f9, 5 * SIZE(AO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 4 * SIZE bdnz LL(92) .align 4 LL(95): andi. 
r0, K, 1 ble LL(98) .align 4 LL(96): fmadd f0, f8, f10, f0 fmadd f2, f8, f11, f2 fmadd f1, f9, f10, f1 fmadd f3, f9, f11, f3 .align 4 LL(98): #if defined(NN) || defined(NT) || defined(TN) || defined(TT) fsub f0, f0, f3 fadd f1, f1, f2 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) fadd f0, f0, f3 fsub f1, f1, f2 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) fadd f0, f0, f3 fsub f1, f2, f1 #else /* RR, RC, CR, CC */ fsub f0, f0, f3 fadd f1, f1, f2 #endif LFD f8, 0 * SIZE(CO1) LFD f9, 1 * SIZE(CO1) lfs f12, ALPHA_R + 0(SP) lfs f13, ALPHA_I + 4(SP) #if defined(RR) || defined(RC) || defined(CR) || defined(CC) fmadd f8, f12, f0, f8 fnmsub f9, f12, f1, f9 fmadd f8, f13, f1, f8 fmadd f9, f13, f0, f9 #else fmadd f8, f12, f0, f8 fmadd f9, f12, f1, f9 fnmsub f8, f13, f1, f8 fmadd f9, f13, f0, f9 #endif STFD f8, 0 * SIZE(CO1) STFD f9, 1 * SIZE(CO1) .align 4 LL(999): mr SP, STACK li r0, 0 * 16 lvx v20, SP, r0 li r0, 1 * 16 lvx v21, SP, r0 li r0, 2 * 16 lvx v22, SP, r0 li r0, 3 * 16 lvx v23, SP, r0 li r0, 4 * 16 lvx v24, SP, r0 li r0, 5 * 16 lvx v25, SP, r0 li r0, 6 * 16 lvx v26, SP, r0 li r0, 7 * 16 lvx v27, SP, r0 li r0, 8 * 16 lvx v28, SP, r0 li r0, 9 * 16 lvx v29, SP, r0 li r0, 10 * 16 lvx v30, SP, r0 li r0, 11 * 16 lvx v31, SP, r0 mtspr VRsave, VREG #ifdef __64BIT__ ld r31, 192(SP) ld r30, 200(SP) ld r29, 208(SP) ld r28, 216(SP) ld r27, 224(SP) ld r26, 232(SP) ld r25, 240(SP) ld r24, 248(SP) ld r23, 256(SP) ld r22, 264(SP) ld r21, 272(SP) ld r20, 280(SP) ld r19, 288(SP) ld r18, 296(SP) ld r17, 304(SP) ld r16, 312(SP) ld r15, 320(SP) ld r14, 328(SP) #else lwz r31, 192(SP) lwz r30, 196(SP) lwz r29, 200(SP) lwz r28, 204(SP) lwz r27, 208(SP) lwz r26, 212(SP) lwz r25, 216(SP) lwz r24, 220(SP) lwz r23, 224(SP) lwz r22, 228(SP) lwz r21, 232(SP) lwz r20, 236(SP) lwz r19, 240(SP) lwz r18, 244(SP) lwz r17, 248(SP) lwz r16, 252(SP) lwz r15, 256(SP) lwz r14, 260(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/zgemm_kernel_altivec_g4.S000066400000000000000000001006711313527062700224710ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 360 #else #define STACKSIZE 272 #endif #define ALIGN_SIZE 0xffff #define SWAP 0 #define NEG 16 #define ALPHA_R 32 #define ALPHA_I 48 #define FZERO 64 #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #endif #endif #define STACK r11 #define I r21 #define J r22 #define AO r23 #define BO r24 #define CO1 r25 #define CO2 r26 #define PREA r29 #define PREB r29 #define PREC r30 #define VREG r31 #define LOAD_A lvx #define LOAD_B lvx #define OFFSET_0 0 #define OFFSET_1 r14 #define OFFSET_2 r15 #define OFFSET_3 r16 #define OFFSET_4 r17 #define OFFSET_5 r18 #define OFFSET_6 r19 #define OFFSET_7 r20 #define c01 v0 #define c02 v1 #define c03 v2 #define c04 v3 #define c05 v4 #define c06 v5 #define c07 v6 #define c08 v7 #define c09 v8 #define c10 v9 #define c11 v10 #define c12 v11 #define c13 v12 #define c14 v13 #define c15 v14 #define c16 v15 #define a1 v16 #define a2 v17 #define a3 v18 #define a4 v19 #define a5 v20 #define a6 v21 #define a7 v22 #define a8 v23 #define b1 v24 #define b2 v25 #define bp1 v26 #define bp2 v27 #define C1 v16 #define C2 v17 #define C3 v18 #define C4 v19 #define C5 v20 #define c00 v24 #define VZERO v25 #define PERMRSHIFT1 v26 #define PERMRSHIFT2 v27 #define swap v28 #define neg v29 #define alpha_r v30 #define alpha_i v31 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE mr STACK, SP li r0, 0 * 16 stvx v20, SP, r0 li r0, 1 * 16 stvx v21, SP, r0 li r0, 2 * 16 stvx v22, SP, r0 li r0, 3 * 16 stvx v23, SP, r0 li r0, 4 * 16 stvx v24, SP, r0 li r0, 5 * 16 stvx v25, SP, r0 li r0, 6 * 16 stvx v26, SP, r0 li r0, 7 * 16 stvx v27, SP, r0 li r0, 8 * 16 stvx v28, SP, r0 li r0, 9 * 16 stvx v29, SP, r0 li r0, 10 * 16 stvx v30, SP, r0 li r0, 11 * 16 stvx v31, SP, r0 #ifdef __64BIT__ std r31, 192(SP) std r30, 200(SP) std r29, 208(SP) std r28, 216(SP) std r27, 224(SP) std r26, 232(SP) std r25, 240(SP) std r24, 248(SP) std r23, 256(SP) std r22, 264(SP) std r21, 272(SP) std r20, 280(SP) std r19, 288(SP) std r18, 296(SP) std r17, 304(SP) std r16, 312(SP) std r15, 320(SP) std r14, 328(SP) #else stw r31, 192(SP) stw r30, 196(SP) stw r29, 200(SP) stw r28, 204(SP) stw r27, 208(SP) stw r26, 212(SP) stw r25, 216(SP) stw r24, 
220(SP) stw r23, 224(SP) stw r22, 228(SP) stw r21, 232(SP) stw r20, 236(SP) stw r19, 240(SP) stw r18, 244(SP) stw r17, 248(SP) stw r16, 252(SP) stw r15, 256(SP) stw r14, 260(SP) #endif #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif li r0, -1 mfspr VREG, VRsave mtspr VRsave, r0 addi SP, SP, -128 li r0, -8192 and SP, SP, r0 fneg f3, f1 fneg f4, f2 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NC) || defined(TC) || defined(NR) || defined(TR) stfs f1, ALPHA_R + 0(SP) stfs f1, ALPHA_R + 4(SP) stfs f1, ALPHA_R + 8(SP) stfs f1, ALPHA_R + 12(SP) stfs f4, ALPHA_I + 0(SP) stfs f2, ALPHA_I + 4(SP) stfs f4, ALPHA_I + 8(SP) stfs f2, ALPHA_I + 12(SP) #else stfs f1, ALPHA_R + 0(SP) stfs f3, ALPHA_R + 4(SP) stfs f1, ALPHA_R + 8(SP) stfs f3, ALPHA_R + 12(SP) stfs f2, ALPHA_I + 0(SP) stfs f2, ALPHA_I + 4(SP) stfs f2, ALPHA_I + 8(SP) stfs f2, ALPHA_I + 12(SP) #endif li I, Address_L(0x04050607) addis I, I, Address_H(0x04050607) stw I, SWAP + 0(SP) li I, Address_L(0x00010203) addis I, I, Address_H(0x00010203) stw I, SWAP + 4(SP) li I, Address_L(0x0c0d0e0f) addis I, I, Address_H(0x0c0d0e0f) stw I, SWAP + 8(SP) li I, Address_L(0x08090a0b) addis I, I, Address_H(0x08090a0b) stw I, SWAP + 12(SP) #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) lis I, 0x8000 stw I, NEG + 0(SP) stw I, NEG + 8(SP) li I, 0 stw I, NEG + 4(SP) stw I, NEG + 12(SP) #else li I, 0 stw I, NEG + 0(SP) stw I, NEG + 8(SP) lis I, 0x8000 stw I, NEG + 4(SP) stw I, NEG + 12(SP) #endif li r0, 0 stw r0, FZERO(SP) slwi LDC, LDC, ZBASE_SHIFT li PREC, (15 * SIZE) li PREB, (25 * 8 * SIZE) li OFFSET_1, 4 * SIZE li OFFSET_2, 8 * SIZE li OFFSET_3, 12 * SIZE li OFFSET_4, 16 * SIZE li OFFSET_5, 20 * SIZE li OFFSET_6, 24 * SIZE li OFFSET_7, 28 * SIZE cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) srawi. J, N, 1 ble LL(50) .align 4 LL(01): mr CO1, C add CO2, C, LDC add C, CO2, LDC mr AO, A srawi. I, M, 3 ble LL(20) .align 4 LL(11): vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 LOAD_A a1, OFFSET_0, AO vxor c03, c03, c03 LOAD_A a2, OFFSET_1, AO vxor c04, c04, c04 LOAD_A a3, OFFSET_2, AO vxor c05, c05, c05 LOAD_A a4, OFFSET_3, AO vxor c06, c06, c06 LOAD_B b2, OFFSET_2, B vxor c07, c07, c07 LOAD_A a5, OFFSET_4, AO vxor c08, c08, c08 LOAD_A a6, OFFSET_5, AO vxor c09, c09, c09 dcbtst CO1, PREC vxor c10, c10, c10 dcbtst CO2, PREC vxor c11, c11, c11 vxor c12, c12, c12 vxor c13, c13, c13 mr BO, B vxor c14, c14, c14 srawi. 
r0, K, 2 vxor c15, c15, c15 mtspr CTR, r0 vxor c16, c16, c16 vspltw bp1, b1, 0 ble LL(15) .align 4 LL(12): /* 1 */ vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 addi AO, AO, 8 * SIZE vmaddfp c03, a3, bp1, c03 LOAD_A a7, OFFSET_4, AO vmaddfp c04, a4, bp1, c04 LOAD_A a8, OFFSET_5, AO /* 2 */ vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 dcbt BO, PREB vmaddfp c07, a3, bp2, c07 dcbt AO, PREB vmaddfp c08, a4, bp2, c08 addi AO, AO, 8 * SIZE /* 3 */ vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 LOAD_B b1, OFFSET_1, BO vmaddfp c11, a3, bp1, c11 dcbt AO, PREB vmaddfp c12, a4, bp1, c12 addi AO, AO, 8 * SIZE /* 4 */ vmaddfp c13, a1, bp2, c13 vspltw bp1, b1, 0 vmaddfp c14, a2, bp2, c14 LOAD_A a1, OFFSET_2, AO vmaddfp c15, a3, bp2, c15 dcbt AO, PREB vmaddfp c16, a4, bp2, c16 addi AO, AO, 8 * SIZE /* 5 */ vmaddfp c01, a5, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a6, bp1, c02 LOAD_A a2, OFFSET_1, AO vmaddfp c03, a7, bp1, c03 LOAD_A a3, OFFSET_2, AO vmaddfp c04, a8, bp1, c04 LOAD_A a4, OFFSET_3, AO /* 6 */ vmaddfp c05, a5, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a6, bp2, c06 nop vmaddfp c07, a7, bp2, c07 dcbt AO, PREA vmaddfp c08, a8, bp2, c08 addi AO, AO, 8 * SIZE /* 7 */ vmaddfp c09, a5, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a6, bp1, c10 LOAD_B b1, OFFSET_4, BO vmaddfp c11, a7, bp1, c11 nop vmaddfp c12, a8, bp1, c12 nop /* 8 */ vmaddfp c13, a5, bp2, c13 vspltw bp1, b2, 0 vmaddfp c14, a6, bp2, c14 LOAD_A a5, OFFSET_2, AO vmaddfp c15, a7, bp2, c15 LOAD_A a6, OFFSET_3, AO vmaddfp c16, a8, bp2, c16 LOAD_A a7, OFFSET_4, AO /* 9 */ vmaddfp c01, a1, bp1, c01 vspltw bp2, b2, 1 vmaddfp c02, a2, bp1, c02 LOAD_A a8, OFFSET_5, AO vmaddfp c03, a3, bp1, c03 addi BO, BO, 8 * SIZE vmaddfp c04, a4, bp1, c04 nop /* 10 */ vmaddfp c05, a1, bp2, c05 vspltw bp1, b2, 2 vmaddfp c06, a2, bp2, c06 nop vmaddfp c07, a3, bp2, c07 nop vmaddfp c08, a4, bp2, c08 nop /* 11 */ vmaddfp c09, a1, bp1, c09 vspltw bp2, b2, 3 vmaddfp c10, a2, bp1, c10 LOAD_B b2, OFFSET_1, BO vmaddfp c11, a3, bp1, c11 dcbt AO, PREA vmaddfp c12, a4, bp1, c12 addi AO, AO, 8 * SIZE /* 12 */ vmaddfp c13, a1, bp2, c13 vspltw bp1, b2, 0 vmaddfp c14, a2, bp2, c14 LOAD_A a1, OFFSET_4, AO vmaddfp c15, a3, bp2, c15 LOAD_A a2, OFFSET_5, AO vmaddfp c16, a4, bp2, c16 LOAD_A a3, OFFSET_6, AO /* 13 */ vmaddfp c01, a5, bp1, c01 vspltw bp2, b2, 1 vmaddfp c02, a6, bp1, c02 LOAD_A a4, OFFSET_7, AO vmaddfp c03, a7, bp1, c03 dcbt AO, PREA vmaddfp c04, a8, bp1, c04 addi AO, AO, 8 * SIZE /* 14 */ vmaddfp c05, a5, bp2, c05 vspltw bp1, b2, 2 vmaddfp c06, a6, bp2, c06 nop vmaddfp c07, a7, bp2, c07 dcbt AO, PREA vmaddfp c08, a8, bp2, c08 addi AO, AO, 8 * SIZE /* 15 */ vmaddfp c09, a5, bp1, c09 vspltw bp2, b2, 3 vmaddfp c10, a6, bp1, c10 LOAD_B b2, OFFSET_4, BO vmaddfp c11, a7, bp1, c11 dcbt AO, PREA vmaddfp c12, a8, bp1, c12 addi BO, BO, 8 * SIZE /* 16 */ vmaddfp c13, a5, bp2, c13 vspltw bp1, b1, 0 vmaddfp c14, a6, bp2, c14 LOAD_A a5, OFFSET_4, AO vmaddfp c15, a7, bp2, c15 LOAD_A a6, OFFSET_5, AO vmaddfp c16, a8, bp2, c16 bdnz+ LL(12) .align 4 LL(15): lvx swap, OFFSET_0, SP lvx neg, OFFSET_1, SP lvx alpha_r, OFFSET_2, SP lvx alpha_i, OFFSET_3, SP andi. 
r0, K, 3 mtspr CTR, r0 ble+ LL(18) .align 4 LL(16): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 nop vmaddfp c03, a3, bp1, c03 nop vmaddfp c04, a4, bp1, c04 nop vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 nop vmaddfp c07, a3, bp2, c07 nop vmaddfp c08, a4, bp2, c08 nop vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 LOAD_B b1, OFFSET_1, BO vmaddfp c11, a3, bp1, c11 addi AO, AO, 16 * SIZE vmaddfp c12, a4, bp1, c12 addi BO, BO, 4 * SIZE vmaddfp c13, a1, bp2, c13 vspltw bp1, b1, 0 vmaddfp c14, a2, bp2, c14 LOAD_A a1, OFFSET_0, AO vmaddfp c15, a3, bp2, c15 LOAD_A a2, OFFSET_1, AO vmaddfp c16, a4, bp2, c16 LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO bdnz+ LL(16) .align 4 LL(18): vxor VZERO, VZERO, VZERO vperm c05, c05, c05, swap vperm c06, c06, c06, swap vperm c07, c07, c07, swap vperm c08, c08, c08, swap vperm c13, c13, c13, swap vperm c14, c14, c14, swap vperm c15, c15, c15, swap vperm c16, c16, c16, swap vxor c05, c05, neg vxor c06, c06, neg vxor c07, c07, neg vxor c08, c08, neg vxor c13, c13, neg vxor c14, c14, neg vxor c15, c15, neg vxor c16, c16, neg vaddfp c01, c01, c05 vaddfp c02, c02, c06 vaddfp c03, c03, c07 vaddfp c04, c04, c08 vaddfp c09, c09, c13 vaddfp c10, c10, c14 vaddfp c11, c11, c15 vaddfp c12, c12, c16 vperm c05, c01, c01, swap vperm c06, c02, c02, swap vperm c07, c03, c03, swap vperm c08, c04, c04, swap vperm c13, c09, c09, swap vperm c14, c10, c10, swap vperm c15, c11, c11, swap vperm c16, c12, c12, swap vmaddfp c01, alpha_r, c01, VZERO vmaddfp c02, alpha_r, c02, VZERO vmaddfp c03, alpha_r, c03, VZERO vmaddfp c04, alpha_r, c04, VZERO vmaddfp c01, alpha_i, c05, c01 vmaddfp c02, alpha_i, c06, c02 vmaddfp c03, alpha_i, c07, c03 vmaddfp c04, alpha_i, c08, c04 vmaddfp c09, alpha_r, c09, VZERO vmaddfp c10, alpha_r, c10, VZERO vmaddfp c11, alpha_r, c11, VZERO vmaddfp c12, alpha_r, c12, VZERO vmaddfp c09, alpha_i, c13, c09 vmaddfp c10, alpha_i, c14, c10 vmaddfp c11, alpha_i, c15, c11 vmaddfp c12, alpha_i, c16, c12 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvx C4, OFFSET_3, CO1 lvx C5, OFFSET_4, CO1 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, c03, PERMRSHIFT1 vperm c03, c03, c04, PERMRSHIFT1 vperm c04, c04, VZERO, PERMRSHIFT1 vaddfp c00, c00, C1 vaddfp c01, c01, C2 vaddfp c02, c02, C3 vaddfp c03, c03, C4 vaddfp c04, c04, C5 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 stvx c03, OFFSET_3, CO1 stvx c04, OFFSET_4, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 lvx C3, OFFSET_2, CO2 lvx C4, OFFSET_3, CO2 lvx C5, OFFSET_4, CO2 vperm c00, VZERO, c09, PERMRSHIFT2 vperm c09, c09, c10, PERMRSHIFT2 vperm c10, c10, c11, PERMRSHIFT2 vperm c11, c11, c12, PERMRSHIFT2 vperm c12, c12, VZERO, PERMRSHIFT2 vaddfp c00, c00, C1 vaddfp c09, c09, C2 vaddfp c10, c10, C3 vaddfp c11, c11, C4 vaddfp c12, c12, C5 stvx c00, OFFSET_0, CO2 stvx c09, OFFSET_1, CO2 stvx c10, OFFSET_2, CO2 stvx c11, OFFSET_3, CO2 stvx c12, OFFSET_4, CO2 addi CO1, CO1, 16 * SIZE addi CO2, CO2, 16 * SIZE addic. I, I, -1 bgt+ LL(11) .align 4 LL(20): andi. I, M, 4 ble LL(30) vxor c01, c01, c01 LOAD_A a1, OFFSET_0, AO vxor c02, c02, c02 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_A a3, OFFSET_2, AO vxor c06, c06, c06 LOAD_A a4, OFFSET_3, AO vxor c09, c09, c09 LOAD_B b1, OFFSET_0, B vxor c10, c10, c10 LOAD_B b2, OFFSET_1, B vxor c13, c13, c13 vxor c14, c14, c14 mr BO, B vspltw bp1, b1, 0 srawi. 
r0, K, 1 mtspr CTR, r0 ble LL(25) .align 4 LL(22): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 addi AO, AO, 16 * SIZE vmaddfp c02, a2, bp1, c02 addi BO, BO, 8 * SIZE vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 LOAD_B b1, OFFSET_0, BO vmaddfp c10, a2, bp1, c10 vmaddfp c13, a1, bp2, c13 LOAD_A a1, OFFSET_0, AO vspltw bp1, b2, 0 vmaddfp c14, a2, bp2, c14 LOAD_A a2, OFFSET_1, AO vmaddfp c01, a3, bp1, c01 vspltw bp2, b2, 1 vmaddfp c02, a4, bp1, c02 vmaddfp c05, a3, bp2, c05 vspltw bp1, b2, 2 vmaddfp c06, a4, bp2, c06 vmaddfp c09, a3, bp1, c09 vspltw bp2, b2, 3 LOAD_B b2, OFFSET_1, BO vmaddfp c10, a4, bp1, c10 vmaddfp c13, a3, bp2, c13 LOAD_A a3, OFFSET_2, AO vmaddfp c14, a4, bp2, c14 LOAD_A a4, OFFSET_3, AO vspltw bp1, b1, 0 bdnz LL(22) .align 4 LL(25): andi. r0, K, 1 ble+ LL(28) .align 4 LL(26): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 nop vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 nop vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c10, a2, bp1, c10 addi AO, AO, 8 * SIZE vmaddfp c13, a1, bp2, c13 addi BO, BO, 4 * SIZE vmaddfp c14, a2, bp2, c14 nop .align 4 LL(28): vxor VZERO, VZERO, VZERO lvx swap, OFFSET_0, SP lvx neg, OFFSET_1, SP lvx alpha_r, OFFSET_2, SP lvx alpha_i, OFFSET_3, SP vperm c05, c05, c05, swap vperm c06, c06, c06, swap vperm c13, c13, c13, swap vperm c14, c14, c14, swap vxor c05, c05, neg vxor c06, c06, neg vxor c13, c13, neg vxor c14, c14, neg vaddfp c01, c01, c05 vaddfp c02, c02, c06 vaddfp c09, c09, c13 vaddfp c10, c10, c14 vperm c05, c01, c01, swap vperm c06, c02, c02, swap vperm c13, c09, c09, swap vperm c14, c10, c10, swap vmaddfp c01, alpha_r, c01, VZERO vmaddfp c02, alpha_r, c02, VZERO vmaddfp c01, alpha_i, c05, c01 vmaddfp c02, alpha_i, c06, c02 vmaddfp c09, alpha_r, c09, VZERO vmaddfp c10, alpha_r, c10, VZERO vmaddfp c09, alpha_i, c13, c09 vmaddfp c10, alpha_i, c14, c10 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, VZERO, PERMRSHIFT1 vaddfp c00, c00, C1 vaddfp c01, c01, C2 vaddfp c02, c02, C3 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 lvx C3, OFFSET_2, CO2 vperm c00, VZERO, c09, PERMRSHIFT2 vperm c09, c09, c10, PERMRSHIFT2 vperm c10, c10, VZERO, PERMRSHIFT2 vaddfp c00, c00, C1 vaddfp c09, c09, C2 vaddfp c10, c10, C3 stvx c00, OFFSET_0, CO2 stvx c09, OFFSET_1, CO2 stvx c10, OFFSET_2, CO2 addi CO1, CO1, 8 * SIZE addi CO2, CO2, 8 * SIZE .align 4 LL(30): andi. I, M, 2 ble LL(40) vxor c01, c01, c01 LOAD_A a1, OFFSET_0, AO vxor c02, c02, c02 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_B b1, OFFSET_0, B vxor c06, c06, c06 LOAD_B b2, OFFSET_1, B vxor c09, c09, c09 vxor c10, c10, c10 vxor c13, c13, c13 vxor c14, c14, c14 vspltw bp1, b1, 0 mr BO, B srawi. r0, K, 1 mtspr CTR, r0 ble LL(35) .align 4 LL(32): vmaddfp c01, a1, bp1, c01 addi AO, AO, 8 * SIZE vspltw bp2, b1, 1 vmaddfp c05, a1, bp2, c05 addi BO, BO, 8 * SIZE vspltw bp1, b1, 2 vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c13, a1, bp2, c13 LOAD_A a1, OFFSET_0, AO vspltw bp1, b2, 0 LOAD_B b1, OFFSET_0, BO vmaddfp c02, a2, bp1, c02 vspltw bp2, b2, 1 vmaddfp c06, a2, bp2, c06 vspltw bp1, b2, 2 vmaddfp c10, a2, bp1, c10 vspltw bp2, b2, 3 LOAD_B b2, OFFSET_1, BO vmaddfp c14, a2, bp2, c14 LOAD_A a2, OFFSET_1, AO vspltw bp1, b1, 0 bdnz LL(32) .align 4 LL(35): andi. 
r0, K, 1 ble+ LL(38) .align 4 LL(36): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c09, a1, bp1, c09 vspltw bp2, b1, 3 vmaddfp c13, a1, bp2, c13 addi AO, AO, 4 * SIZE addi BO, BO, 4 * SIZE .align 4 LL(38): vaddfp c01, c01, c02 vaddfp c05, c05, c06 vaddfp c09, c09, c10 vaddfp c13, c13, c14 vxor VZERO, VZERO, VZERO lvx swap, OFFSET_0, SP lvx neg, OFFSET_1, SP lvx alpha_r, OFFSET_2, SP lvx alpha_i, OFFSET_3, SP vperm c05, c05, c05, swap vperm c13, c13, c13, swap vxor c05, c05, neg vxor c13, c13, neg vaddfp c01, c01, c05 vaddfp c09, c09, c13 vperm c05, c01, c01, swap vperm c13, c09, c09, swap vmaddfp c01, alpha_r, c01, VZERO vmaddfp c01, alpha_i, c05, c01 vmaddfp c09, alpha_r, c09, VZERO vmaddfp c09, alpha_i, c13, c09 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvsr PERMRSHIFT1, 0, CO1 lvsr PERMRSHIFT2, 0, CO2 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, VZERO, PERMRSHIFT1 vaddfp c00, c00, C1 vaddfp c01, c01, C2 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 lvx C1, OFFSET_0, CO2 lvx C2, OFFSET_1, CO2 vperm c00, VZERO, c09, PERMRSHIFT2 vperm c09, c09, VZERO, PERMRSHIFT2 vaddfp c00, c00, C1 vaddfp c09, c09, C2 stvx c00, OFFSET_0, CO2 stvx c09, OFFSET_1, CO2 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE .align 4 LL(40): andi. I, M, 1 ble LL(49) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 0 * SIZE(BO) LFD f11, 1 * SIZE(BO) LFD f12, 2 * SIZE(BO) LFD f13, 3 * SIZE(BO) lfs f0, FZERO(SP) fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(45) .align 4 LL(42): fmadd f0, f8, f10, f0 fmadd f2, f8, f11, f2 fmadd f4, f8, f12, f4 fmadd f6, f8, f13, f6 fmadd f1, f9, f10, f1 fmadd f3, f9, f11, f3 fmadd f5, f9, f12, f5 fmadd f7, f9, f13, f7 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) fmadd f0, f8, f10, f0 fmadd f2, f8, f11, f2 fmadd f4, f8, f12, f4 fmadd f6, f8, f13, f6 fmadd f1, f9, f10, f1 fmadd f3, f9, f11, f3 fmadd f5, f9, f12, f5 fmadd f7, f9, f13, f7 LFD f8, 4 * SIZE(AO) LFD f9, 5 * SIZE(AO) LFD f10, 8 * SIZE(BO) LFD f11, 9 * SIZE(BO) LFD f12, 10 * SIZE(BO) LFD f13, 11 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(42) .align 4 LL(45): andi. 
r0, K, 1 ble LL(48) .align 4 LL(46): fmadd f0, f8, f10, f0 fmadd f2, f8, f11, f2 fmadd f4, f8, f12, f4 fmadd f6, f8, f13, f6 fmadd f1, f9, f10, f1 fmadd f3, f9, f11, f3 fmadd f5, f9, f12, f5 fmadd f7, f9, f13, f7 addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE .align 4 LL(48): #if defined(NN) || defined(NT) || defined(TN) || defined(TT) fsub f0, f0, f3 fadd f1, f1, f2 fsub f4, f4, f7 fadd f5, f5, f6 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) fadd f0, f0, f3 fsub f1, f1, f2 fadd f4, f4, f7 fsub f5, f5, f6 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) fadd f0, f0, f3 fsub f1, f2, f1 fadd f4, f4, f7 fsub f5, f6, f5 #else /* RR, RC, CR, CC */ fsub f0, f0, f3 fadd f1, f1, f2 fsub f4, f4, f7 fadd f5, f5, f6 #endif LFD f8, 0 * SIZE(CO1) LFD f9, 1 * SIZE(CO1) LFD f10, 0 * SIZE(CO2) LFD f11, 1 * SIZE(CO2) lfs f12, ALPHA_R + 0(SP) lfs f13, ALPHA_I + 4(SP) #if defined(RR) || defined(RC) || defined(CR) || defined(CC) fmadd f8, f12, f0, f8 fnmsub f9, f12, f1, f9 fmadd f10, f12, f4, f10 fnmsub f11, f12, f5, f11 fmadd f8, f13, f1, f8 fmadd f9, f13, f0, f9 fmadd f10, f13, f5, f10 fmadd f11, f13, f4, f11 #else fmadd f8, f12, f0, f8 fmadd f9, f12, f1, f9 fmadd f10, f12, f4, f10 fmadd f11, f12, f5, f11 fnmsub f8, f13, f1, f8 fmadd f9, f13, f0, f9 fnmsub f10, f13, f5, f10 fmadd f11, f13, f4, f11 #endif STFD f8, 0 * SIZE(CO1) STFD f9, 1 * SIZE(CO1) STFD f10, 0 * SIZE(CO2) STFD f11, 1 * SIZE(CO2) LL(49): mr B, BO addic. J, J, -1 bgt LL(01) .align 4 LL(50): andi. J, N, 1 ble LL(999) mr CO1, C mr AO, A srawi. I, M, 3 ble LL(70) .align 4 LL(61): vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 vxor c03, c03, c03 LOAD_A a1, OFFSET_0, AO vxor c04, c04, c04 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_A a3, OFFSET_2, AO vxor c06, c06, c06 LOAD_A a4, OFFSET_3, AO vxor c07, c07, c07 vxor c08, c08, c08 mr BO, B dcbtst CO1, PREC dcbtst CO2, PREC vspltw bp1, b1, 0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(65) .align 4 LL(62): LOAD_A a5, OFFSET_4, AO LOAD_A a6, OFFSET_5, AO LOAD_A a7, OFFSET_6, AO LOAD_A a8, OFFSET_7, AO vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 vmaddfp c03, a3, bp1, c03 vmaddfp c04, a4, bp1, c04 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 vmaddfp c07, a3, bp2, c07 vmaddfp c08, a4, bp2, c08 vmaddfp c01, a5, bp1, c01 vspltw bp2, b1, 3 vmaddfp c02, a6, bp1, c02 vmaddfp c03, a7, bp1, c03 vmaddfp c04, a8, bp1, c04 LOAD_B b1, OFFSET_1, BO vspltw bp1, b1, 0 vmaddfp c05, a5, bp2, c05 vmaddfp c06, a6, bp2, c06 vmaddfp c07, a7, bp2, c07 vmaddfp c08, a8, bp2, c08 addi AO, AO, 32 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO bdnz LL(62) .align 4 LL(65): andi. 
r0, K, 1 ble+ LL(68) .align 4 LL(66): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 addi AO, AO, 16 * SIZE vmaddfp c03, a3, bp1, c03 addi BO, BO, 2 * SIZE vmaddfp c04, a4, bp1, c04 nop vmaddfp c05, a1, bp2, c05 vmaddfp c06, a2, bp2, c06 vmaddfp c07, a3, bp2, c07 vmaddfp c08, a4, bp2, c08 .align 4 LL(68): vxor VZERO, VZERO, VZERO lvx swap, OFFSET_0, SP lvx neg, OFFSET_1, SP lvx alpha_r, OFFSET_2, SP lvx alpha_i, OFFSET_3, SP vperm c05, c05, c05, swap vperm c06, c06, c06, swap vperm c07, c07, c07, swap vperm c08, c08, c08, swap vxor c05, c05, neg vxor c06, c06, neg vxor c07, c07, neg vxor c08, c08, neg vaddfp c01, c01, c05 vaddfp c02, c02, c06 vaddfp c03, c03, c07 vaddfp c04, c04, c08 vperm c05, c01, c01, swap vperm c06, c02, c02, swap vperm c07, c03, c03, swap vperm c08, c04, c04, swap vmaddfp c01, alpha_r, c01, VZERO vmaddfp c02, alpha_r, c02, VZERO vmaddfp c03, alpha_r, c03, VZERO vmaddfp c04, alpha_r, c04, VZERO vmaddfp c01, alpha_i, c05, c01 vmaddfp c02, alpha_i, c06, c02 vmaddfp c03, alpha_i, c07, c03 vmaddfp c04, alpha_i, c08, c04 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvx C4, OFFSET_3, CO1 lvx C5, OFFSET_4, CO1 lvsr PERMRSHIFT1, 0, CO1 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, c03, PERMRSHIFT1 vperm c03, c03, c04, PERMRSHIFT1 vperm c04, c04, VZERO, PERMRSHIFT1 vaddfp c00, c00, C1 vaddfp c01, c01, C2 vaddfp c02, c02, C3 vaddfp c03, c03, C4 vaddfp c04, c04, C5 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 stvx c03, OFFSET_3, CO1 stvx c04, OFFSET_4, CO1 addi CO1, CO1, 16 * SIZE addic. I, I, -1 bgt+ LL(61) .align 4 LL(70): andi. I, M, 4 ble LL(80) vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 vxor c03, c03, c03 LOAD_A a1, OFFSET_0, AO vxor c04, c04, c04 LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 LOAD_A a3, OFFSET_2, AO vxor c06, c06, c06 LOAD_A a4, OFFSET_3, AO vxor c07, c07, c07 vxor c08, c08, c08 mr BO, B vspltw bp1, b1, 0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(75) .align 4 LL(72): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c06, a2, bp2, c06 vmaddfp c03, a3, bp1, c03 vspltw bp2, b1, 3 vmaddfp c04, a4, bp1, c04 LOAD_B b1, OFFSET_1, BO vspltw bp1, b1, 0 vmaddfp c07, a3, bp2, c07 vmaddfp c08, a4, bp2, c08 addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO LOAD_A a3, OFFSET_2, AO LOAD_A a4, OFFSET_3, AO bdnz LL(72) .align 4 LL(75): andi. 
r0, K, 1 ble+ LL(78) .align 4 LL(76): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c02, a2, bp1, c02 addi AO, AO, 8 * SIZE vmaddfp c05, a1, bp2, c05 addi BO, BO, 2 * SIZE vmaddfp c06, a2, bp2, c06 .align 4 LL(78): vaddfp c01, c01, c03 vaddfp c02, c02, c04 vaddfp c05, c05, c07 vaddfp c06, c06, c08 vxor VZERO, VZERO, VZERO lvx swap, OFFSET_0, SP lvx neg, OFFSET_1, SP lvx alpha_r, OFFSET_2, SP lvx alpha_i, OFFSET_3, SP vperm c05, c05, c05, swap vperm c06, c06, c06, swap vxor c05, c05, neg vxor c06, c06, neg vaddfp c01, c01, c05 vaddfp c02, c02, c06 vperm c05, c01, c01, swap vperm c06, c02, c02, swap vmaddfp c01, alpha_r, c01, VZERO vmaddfp c02, alpha_r, c02, VZERO vmaddfp c01, alpha_i, c05, c01 vmaddfp c02, alpha_i, c06, c02 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvx C3, OFFSET_2, CO1 lvsr PERMRSHIFT1, 0, CO1 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, c02, PERMRSHIFT1 vperm c02, c02, VZERO, PERMRSHIFT1 vaddfp c00, c00, C1 vaddfp c01, c01, C2 vaddfp c02, c02, C3 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 stvx c02, OFFSET_2, CO1 addi CO1, CO1, 8 * SIZE .align 4 LL(80): andi. I, M, 2 ble LL(90) vxor c01, c01, c01 LOAD_B b1, OFFSET_0, B vxor c02, c02, c02 LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO vxor c05, c05, c05 vxor c06, c06, c06 mr BO, B vspltw bp1, b1, 0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(85) .align 4 LL(82): vmaddfp c01, a1, bp1, c01 vspltw bp2, b1, 1 vmaddfp c05, a1, bp2, c05 vspltw bp1, b1, 2 vmaddfp c02, a2, bp1, c02 vspltw bp2, b1, 3 LOAD_B b1, OFFSET_1, BO vspltw bp1, b1, 0 vmaddfp c06, a2, bp2, c06 addi AO, AO, 8 * SIZE addi BO, BO, 4 * SIZE LOAD_A a1, OFFSET_0, AO LOAD_A a2, OFFSET_1, AO bdnz LL(82) .align 4 LL(85): andi. r0, K, 1 ble+ LL(88) .align 4 LL(86): vspltw bp2, b1, 1 vmaddfp c01, a1, bp1, c01 vmaddfp c05, a1, bp2, c05 addi AO, AO, 4 * SIZE addi BO, BO, 2 * SIZE .align 4 LL(88): vaddfp c01, c01, c02 vaddfp c05, c05, c06 vaddfp c09, c09, c10 vaddfp c13, c13, c14 vxor VZERO, VZERO, VZERO lvx swap, OFFSET_0, SP lvx neg, OFFSET_1, SP lvx alpha_r, OFFSET_2, SP lvx alpha_i, OFFSET_3, SP vperm c05, c05, c05, swap vxor c05, c05, neg vaddfp c01, c01, c05 vperm c05, c01, c01, swap vmaddfp c01, alpha_r, c01, VZERO vmaddfp c01, alpha_i, c05, c01 lvx C1, OFFSET_0, CO1 lvx C2, OFFSET_1, CO1 lvsr PERMRSHIFT1, 0, CO1 vperm c00, VZERO, c01, PERMRSHIFT1 vperm c01, c01, VZERO, PERMRSHIFT1 vaddfp c00, c00, C1 vaddfp c01, c01, C2 stvx c00, OFFSET_0, CO1 stvx c01, OFFSET_1, CO1 addi CO1, CO1, 4 * SIZE .align 4 LL(90): andi. I, M, 1 ble LL(999) mr BO, B LFD f8, 0 * SIZE(AO) LFD f9, 1 * SIZE(AO) LFD f10, 0 * SIZE(BO) LFD f11, 1 * SIZE(BO) LFD f12, 2 * SIZE(BO) LFD f13, 3 * SIZE(BO) lfs f0, FZERO(SP) fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. r0, K, 1 mtspr CTR, r0 ble LL(95) .align 4 LL(92): fmadd f0, f8, f10, f0 fmadd f2, f8, f11, f2 fmadd f1, f9, f10, f1 fmadd f3, f9, f11, f3 LFD f8, 2 * SIZE(AO) LFD f9, 3 * SIZE(AO) LFD f10, 4 * SIZE(BO) LFD f11, 5 * SIZE(BO) fmadd f0, f8, f12, f0 fmadd f2, f8, f13, f2 fmadd f1, f9, f12, f1 fmadd f3, f9, f13, f3 LFD f8, 4 * SIZE(AO) LFD f9, 5 * SIZE(AO) LFD f12, 6 * SIZE(BO) LFD f13, 7 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 4 * SIZE bdnz LL(92) .align 4 LL(95): andi. 
r0, K, 1 ble LL(98) .align 4 LL(96): fmadd f0, f8, f10, f0 fmadd f2, f8, f11, f2 fmadd f1, f9, f10, f1 fmadd f3, f9, f11, f3 .align 4 LL(98): #if defined(NN) || defined(NT) || defined(TN) || defined(TT) fsub f0, f0, f3 fadd f1, f1, f2 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) fadd f0, f0, f3 fsub f1, f1, f2 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) fadd f0, f0, f3 fsub f1, f2, f1 #else /* RR, RC, CR, CC */ fsub f0, f0, f3 fadd f1, f1, f2 #endif LFD f8, 0 * SIZE(CO1) LFD f9, 1 * SIZE(CO1) lfs f12, ALPHA_R + 0(SP) lfs f13, ALPHA_I + 4(SP) #if defined(RR) || defined(RC) || defined(CR) || defined(CC) fmadd f8, f12, f0, f8 fnmsub f9, f12, f1, f9 fmadd f8, f13, f1, f8 fmadd f9, f13, f0, f9 #else fmadd f8, f12, f0, f8 fmadd f9, f12, f1, f9 fnmsub f8, f13, f1, f8 fmadd f9, f13, f0, f9 #endif STFD f8, 0 * SIZE(CO1) STFD f9, 1 * SIZE(CO1) .align 4 LL(999): mr SP, STACK li r0, 0 * 16 lvx v20, SP, r0 li r0, 1 * 16 lvx v21, SP, r0 li r0, 2 * 16 lvx v22, SP, r0 li r0, 3 * 16 lvx v23, SP, r0 li r0, 4 * 16 lvx v24, SP, r0 li r0, 5 * 16 lvx v25, SP, r0 li r0, 6 * 16 lvx v26, SP, r0 li r0, 7 * 16 lvx v27, SP, r0 li r0, 8 * 16 lvx v28, SP, r0 li r0, 9 * 16 lvx v29, SP, r0 li r0, 10 * 16 lvx v30, SP, r0 li r0, 11 * 16 lvx v31, SP, r0 mtspr VRsave, VREG #ifdef __64BIT__ ld r31, 192(SP) ld r30, 200(SP) ld r29, 208(SP) ld r28, 216(SP) ld r27, 224(SP) ld r26, 232(SP) ld r25, 240(SP) ld r24, 248(SP) ld r23, 256(SP) ld r22, 264(SP) ld r21, 272(SP) ld r20, 280(SP) ld r19, 288(SP) ld r18, 296(SP) ld r17, 304(SP) ld r16, 312(SP) ld r15, 320(SP) ld r14, 328(SP) #else lwz r31, 192(SP) lwz r30, 196(SP) lwz r29, 200(SP) lwz r28, 204(SP) lwz r27, 208(SP) lwz r26, 212(SP) lwz r25, 216(SP) lwz r24, 220(SP) lwz r23, 224(SP) lwz r22, 228(SP) lwz r21, 232(SP) lwz r20, 236(SP) lwz r19, 240(SP) lwz r18, 244(SP) lwz r17, 248(SP) lwz r16, 252(SP) lwz r15, 256(SP) lwz r14, 260(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/zgemm_kernel_cell.S000066400000000000000000001015641313527062700213710ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA_R 296(SP) #define ALPHA_I 304(SP) #define FZERO 312(SP) #else #define STACKSIZE 256 #define ALPHA_R 224(SP) #define ALPHA_I 232(SP) #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #define OFFSET r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #define TEMP r22 #define KK r23 #define I r24 #define J r25 #define AO r26 #define BO r27 #define CO1 r28 #define CO2 r29 #define PREA r30 #define PREC r31 #define PREB PREA #ifndef NEEDPARAM #ifndef DOUBLE #include "../cparam.h" #else #include "../zparam.h" #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) #ifdef TRMMKERNEL std r23, 208(SP) std r22, 216(SP) #endif #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) #ifdef TRMMKERNEL stw r23, 176(SP) stw r22, 180(SP) #endif #endif stfd f1, ALPHA_R stfd f2, ALPHA_I stw r0, FZERO #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef TRMMKERNEL #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif #if defined(TRMMKERNEL) && !defined(LEFT) 
neg KK, OFFSET #endif #endif slwi LDC, LDC, ZBASE_SHIFT cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) #ifndef PREFETCHTEST li PREC, 3 * SIZE li PREA, 16 * 12 * SIZE #else #ifdef linux #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else ld PREA, FRAMESLOT(3) + STACKSIZE(SP) ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld PREA, FRAMESLOT(3) + STACKSIZE(SP) ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #else #ifdef DOUBLE lwz PREA, FRAMESLOT(4) + STACKSIZE(SP) lwz PREC, FRAMESLOT(5) + STACKSIZE(SP) #else lwz PREA, FRAMESLOT(3) + STACKSIZE(SP) lwz PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #endif #endif lfs f0, FZERO srawi. J, N, 1 ble LL(30) .align 4 LL(10): fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 mr CO1, C add CO2, C, LDC add C, CO2, LDC #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif srawi. I, M, 1 mr AO, A ble LL(20) .align 4 LL(11): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f28, 4 * SIZE(B) LFD f29, 5 * SIZE(B) LFD f30, 6 * SIZE(B) PREFETCH_C1 nop nop PREFETCH_C2 srawi. r0, K, 2 mr BO, B mtspr CTR, r0 ble LL(15) #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f28, 4 * SIZE(B) LFD f29, 5 * SIZE(B) LFD f30, 6 * SIZE(B) mr BO, B #else slwi r0, KK, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, B, r0 LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) #endif PREFETCH_C1 PREFETCH_C2 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif srawi. 
TEMP, TEMP, 2 mtspr CTR, TEMP ble LL(15) #endif .align 4 #define NOP1 mr r18, r18 #define NOP2 mr r19, r19 LL(12): FMADD f0, f16, f20, f0 dcbt AO, PREA FMADD f4, f16, f21, f4 dcbt BO, PREB FMADD f8, f16, f22, f8 LFD f31, 7 * SIZE(BO) FMADD f12, f16, f23, f12 LFD f27, 7 * SIZE(AO) FMADD f1, f17, f20, f1 LFD f16, 8 * SIZE(AO) FMADD f5, f17, f21, f5 NOP2 FMADD f9, f17, f22, f9 NOP1 FMADD f13, f17, f23, f13 LFD f17, 9 * SIZE(AO) FMADD f2, f18, f20, f2 NOP1 FMADD f6, f18, f21, f6 NOP2 FMADD f10, f18, f22, f10 NOP1 FMADD f14, f18, f23, f14 LFD f18, 10 * SIZE(AO) FMADD f3, f19, f20, f3 LFD f20, 8 * SIZE(BO) FMADD f7, f19, f21, f7 LFD f21, 9 * SIZE(BO) FMADD f11, f19, f22, f11 LFD f22, 10 * SIZE(BO) FMADD f15, f19, f23, f15 LFD f19, 11 * SIZE(AO) FMADD f0, f24, f28, f0 LFD f23, 11 * SIZE(BO) FMADD f4, f24, f29, f4 NOP2 FMADD f8, f24, f30, f8 NOP1 FMADD f12, f24, f31, f12 LFD f24, 12 * SIZE(AO) FMADD f1, f25, f28, f1 NOP1 FMADD f5, f25, f29, f5 NOP2 FMADD f9, f25, f30, f9 NOP1 FMADD f13, f25, f31, f13 LFD f25, 13 * SIZE(AO) FMADD f2, f26, f28, f2 NOP1 FMADD f6, f26, f29, f6 NOP2 FMADD f10, f26, f30, f10 NOP1 FMADD f14, f26, f31, f14 LFD f26, 14 * SIZE(AO) FMADD f3, f27, f28, f3 LFD f28, 12 * SIZE(BO) FMADD f7, f27, f29, f7 LFD f29, 13 * SIZE(BO) FMADD f11, f27, f30, f11 LFD f30, 14 * SIZE(BO) FMADD f15, f27, f31, f15 LFD f27, 15 * SIZE(AO) FMADD f0, f16, f20, f0 LFD f31, 15 * SIZE(BO) FMADD f4, f16, f21, f4 NOP2 FMADD f8, f16, f22, f8 NOP1 FMADD f12, f16, f23, f12 LFD f16, 16 * SIZE(AO) FMADD f1, f17, f20, f1 NOP1 FMADD f5, f17, f21, f5 NOP2 FMADD f9, f17, f22, f9 NOP1 FMADD f13, f17, f23, f13 LFD f17, 17 * SIZE(AO) FMADD f2, f18, f20, f2 NOP1 FMADD f6, f18, f21, f6 NOP2 FMADD f10, f18, f22, f10 NOP1 FMADD f14, f18, f23, f14 LFD f18, 18 * SIZE(AO) FMADD f3, f19, f20, f3 LFD f20, 16 * SIZE(BO) FMADD f7, f19, f21, f7 LFD f21, 17 * SIZE(BO) FMADD f11, f19, f22, f11 LFD f22, 18 * SIZE(BO) FMADD f15, f19, f23, f15 LFD f19, 19 * SIZE(AO) FMADD f0, f24, f28, f0 LFD f23, 19 * SIZE(BO) FMADD f4, f24, f29, f4 NOP2 FMADD f8, f24, f30, f8 NOP1 FMADD f12, f24, f31, f12 LFD f24, 20 * SIZE(AO) FMADD f1, f25, f28, f1 NOP1 FMADD f5, f25, f29, f5 NOP2 FMADD f9, f25, f30, f9 NOP1 FMADD f13, f25, f31, f13 LFD f25, 21 * SIZE(AO) FMADD f2, f26, f28, f2 NOP1 FMADD f6, f26, f29, f6 NOP2 FMADD f10, f26, f30, f10 NOP1 FMADD f14, f26, f31, f14 LFD f26, 22 * SIZE(AO) FMADD f3, f27, f28, f3 LFD f28, 20 * SIZE(BO) FMADD f7, f27, f29, f7 LFD f29, 21 * SIZE(BO) FMADD f11, f27, f30, f11 LFD f30, 22 * SIZE(BO) FMADD f15, f27, f31, f15 addi AO, AO, 16 * SIZE addi BO, BO, 16 * SIZE bdnz LL(12) .align 4 LL(15): #ifndef TRMMKERNEL andi. r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, r0 ble LL(KERNEL_MainFinish) #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif andi. 
TEMP, TEMP, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, TEMP ble LL(KERNEL_MainFinish) #endif .align 4 LL(16): FMADD f0, f16, f20, f0 FMADD f5, f17, f21, f5 FMADD f10, f18, f22, f10 FMADD f15, f19, f23, f15 FMADD f1, f17, f20, f1 FMADD f2, f18, f20, f2 FMADD f3, f19, f20, f3 FMADD f4, f16, f21, f4 FMADD f6, f18, f21, f6 FMADD f7, f19, f21, f7 FMADD f8, f16, f22, f8 FMADD f9, f17, f22, f9 FMADD f11, f19, f22, f11 FMADD f12, f16, f23, f12 FMADD f13, f17, f23, f13 FMADD f14, f18, f23, f14 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(16) .align 4 LL(KERNEL_MainFinish): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #ifndef TRMMKERNEL LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) #endif FSUB f8, f8, f13 FADD f9, f9, f12 FSUB f10, f10, f15 FADD f11, f11, f14 #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #ifndef TRMMKERNEL LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) #endif FADD f8, f8, f13 FSUB f9, f9, f12 FADD f10, f10, f15 FSUB f11, f11, f14 #else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #ifndef TRMMKERNEL LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) #endif FADD f8, f8, f13 FSUB f9, f12, f9 FADD f10, f10, f15 FSUB f11, f14, f11 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FMADD f17, f30, f1, f17 FMADD f18, f30, f2, f18 FMADD f19, f30, f3, f19 FMADD f20, f30, f8, f20 FMADD f21, f30, f9, f21 FMADD f22, f30, f10, f22 FMADD f23, f30, f11, f23 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 FMUL f20, f30, f8 FMUL f21, f30, f9 FMUL f22, f30, f10 FMUL f23, f30, f11 #endif FNMSUB f16, f31, f1, f16 FMADD f17, f31, f0, f17 FNMSUB f18, f31, f3, f18 FMADD f19, f31, f2, f19 FNMSUB f20, f31, f9, f20 FMADD f21, f31, f8, f21 FNMSUB f22, f31, f11, f22 FMADD f23, f31, f10, f23 #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC)|| defined(RR) */ #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FNMSUB f17, f30, f1, f17 FMADD f18, f30, f2, f18 FNMSUB f19, f30, f3, f19 FMADD f20, f30, f8, f20 FNMSUB f21, f30, f9, f21 FMADD f22, f30, f10, f22 FNMSUB f23, f30, f11, f23 FMADD f16, f31, f1, f16 FMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FMADD f19, f31, f2, f19 FMADD f20, f31, f9, f20 FMADD f21, f31, f8, f21 FMADD f22, f31, f11, f22 FMADD f23, f31, f10, f23 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 FMUL f20, f30, f8 FMUL f21, f30, f9 FMUL f22, f30, f10 FMUL f23, f30, f11 FMADD f16, f31, f1, f16 FNMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FNMADD f19, f31, f2, f19 FMADD f20, f31, f9, f20 FNMADD f21, f31, f8, f21 FMADD f22, f31, f11, f22 FNMADD f23, f31, f10, f23 #endif #endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) STFD f18, 2 * 
SIZE(CO1) STFD f19, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f20, 0 * SIZE(CO2) STFD f21, 1 * SIZE(CO2) STFD f22, 2 * SIZE(CO2) STFD f23, 3 * SIZE(CO2) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -2 #endif slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif addic. I, I, -1 bgt LL(11) .align 4 LL(20): andi. I, M, 1 ble LL(29) #ifndef TRMMKERNEL LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, K, 2 mr BO, B mtspr CTR, r0 ble LL(25) #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP ble LL(25) #endif .align 4 LL(22): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi BO, BO, 16 * SIZE addi AO, AO, 8 * SIZE bdnz LL(22) .align 4 LL(25): #ifndef TRMMKERNEL andi. 
r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, r0 ble LL(27) #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, TEMP ble LL(27) #endif .align 4 LL(26): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE bdnz LL(26) .align 4 LL(27): #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #endif #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FMADD f17, f30, f1, f17 FMADD f18, f30, f2, f18 FMADD f19, f30, f3, f19 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 #endif FNMSUB f16, f31, f1, f16 FMADD f17, f31, f0, f17 FNMSUB f18, f31, f3, f18 FMADD f19, f31, f2, f19 #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC)|| defined(RR) */ #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FNMSUB f17, f30, f1, f17 FMADD f18, f30, f2, f18 FNMSUB f19, f30, f3, f19 FMADD f16, f31, f1, f16 FMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FMADD f19, f31, f2, f19 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 FMADD f16, f31, f1, f16 FNMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FNMADD f19, f31, f2, f19 #endif #endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) STFD f18, 0 * SIZE(CO2) STFD f19, 1 * SIZE(CO2) addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 1 #endif #endif .align 4 LL(29): #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 2 #endif mr B, BO addic. J, J, -1 lfs f0, FZERO bgt LL(10) .align 4 LL(30): andi. J, N, 1 ble LL(999) #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif srawi. I, M, 1 mr CO1, C add C, C, LDC mr AO, A ble LL(40) .align 4 LL(31): #ifndef TRMMKERNEL LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
r0, K, 2 mr BO, B mtspr CTR, r0 ble LL(35) #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 mr BO, B #else slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #endif PREFETCH_C1 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP ble LL(35) #endif .align 4 LL(32): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 8 * SIZE(AO) LFD f21, 9 * SIZE(AO) LFD f22, 10 * SIZE(AO) LFD f23, 11 * SIZE(AO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) LFD f16, 4 * SIZE(BO) LFD f17, 5 * SIZE(BO) LFD f18, 6 * SIZE(BO) LFD f19, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 16 * SIZE(AO) LFD f21, 17 * SIZE(AO) LFD f22, 18 * SIZE(AO) LFD f23, 19 * SIZE(AO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 20 * SIZE(AO) LFD f25, 21 * SIZE(AO) LFD f26, 22 * SIZE(AO) LFD f27, 23 * SIZE(AO) LFD f16, 8 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 10 * SIZE(BO) LFD f19, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE dcbt PREA, AO dcbt PREA, BO bdnz LL(32) .align 4 LL(35): #ifndef TRMMKERNEL andi. r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, r0 ble LL(37) #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif andi. 
TEMP, TEMP, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, TEMP ble LL(37) #endif .align 4 LL(36): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f16, 2 * SIZE(BO) LFD f17, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(36) .align 4 LL(37): #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FMADD f17, f30, f1, f17 FMADD f18, f30, f2, f18 FMADD f19, f30, f3, f19 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 #endif FNMSUB f16, f31, f1, f16 FMADD f17, f31, f0, f17 FNMSUB f18, f31, f3, f18 FMADD f19, f31, f2, f19 #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC)|| defined(RR) */ #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FNMSUB f17, f30, f1, f17 FMADD f18, f30, f2, f18 FNMSUB f19, f30, f3, f19 FMADD f16, f31, f1, f16 FMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FMADD f19, f31, f2, f19 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 FMADD f16, f31, f1, f16 FNMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FNMADD f19, f31, f2, f19 #endif #endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) STFD f18, 2 * SIZE(CO1) STFD f19, 3 * SIZE(CO1) addi CO1, CO1, 4 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -1 #endif slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif addic. I, I, -1 bgt LL(31) .align 4 LL(40): andi. I, M, 1 ble LL(999) #ifndef TRMMKERNEL LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
r0, K, 2 mr BO, B mtspr CTR, r0 ble LL(45) #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 mr BO, B #else slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP ble LL(45) #endif .align 4 LL(42): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) fmadd f4, f18, f22, f4 fmadd f5, f19, f23, f5 fmadd f6, f19, f22, f6 fmadd f7, f18, f23, f7 LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) fmadd f4, f18, f22, f4 fmadd f5, f19, f23, f5 fmadd f6, f19, f22, f6 fmadd f7, f18, f23, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(42) .align 4 LL(45): fadd f0, f0, f4 fadd f1, f1, f5 fadd f2, f2, f6 fadd f3, f3, f7 #ifndef TRMMKERNEL andi. r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR,r0 ble LL(47) #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif andi. 
TEMP, TEMP, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR,TEMP ble LL(47) #endif .align 4 LL(46): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 2 * SIZE bdnz LL(46) .align 4 LL(47): #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) fsub f0, f0, f1 fadd f2, f2, f3 #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) fadd f0, f0, f1 fsub f2, f2, f3 #else fadd f0, f0, f1 fsub f2, f3, f2 #endif #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FMADD f17, f30, f2, f17 #else FMUL f16, f30, f0 FMUL f17, f30, f2 #endif FNMSUB f16, f31, f2, f16 FMADD f17, f31, f0, f17 #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC) || defined(RR) */ #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FNMSUB f17, f30, f2, f17 FMADD f16, f31, f2, f16 FMADD f17, f31, f0, f17 #else FMUL f16, f30, f0 FMUL f17, f30, f2 FMADD f16, f31, f2, f16 FNMADD f17, f31, f0, f17 #endif #endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) #ifdef TRMMKERNEL ld r23, 208(SP) ld r22, 216(SP) #endif #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) #ifdef TRMMKERNEL lwz r23, 176(SP) lwz r22, 180(SP) #endif #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/zgemm_kernel_g4.S000066400000000000000000000740231313527062700207630ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA_R 296(SP) #define ALPHA_I 304(SP) #define FZERO 312(SP) #else #define STACKSIZE 256 #define ALPHA_R 224(SP) #define ALPHA_I 232(SP) #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #define OFFSET r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #define TEMP r22 #define KK r23 #define I r24 #define J r25 #define AO r26 #define BO r27 #define CO1 r28 #define CO2 r29 #define PREA r30 #define PREC r31 #define A1 f16 #define A2 f17 #define A3 f18 #define A4 f19 #define A5 f20 #define A6 f21 #define B1 f22 #define B2 f23 #define B3 f24 #define B4 f25 #define B5 f26 #define B6 f27 #define B7 f28 #define B8 f29 #define B9 f30 #define B10 f31 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) #ifdef TRMMKERNEL std r23, 208(SP) std r22, 216(SP) #endif #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) #ifdef TRMMKERNEL stw r23, 176(SP) stw r22, 180(SP) #endif #endif stfd f1, ALPHA_R stfd f2, ALPHA_I stw r0, FZERO #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef TRMMKERNEL #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE lwz 
OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif #endif slwi LDC, LDC, ZBASE_SHIFT li PREA, 8 * 8 * SIZE li PREC, 3 * SIZE cmpwi cr0, M, 0 ble .L999 cmpwi cr0, N, 0 ble .L999 cmpwi cr0, K, 0 ble .L999 lfs f0, FZERO srawi. J, N, 1 ble .L30 .align 4 .L10: fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 mr CO1, C add CO2, C, LDC add C, CO2, LDC #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif srawi. I, M, 1 mr AO, A ble .L20 .align 4 .L11: #ifndef TRMMKERNEL LFD A1, 0 * SIZE(AO) LFD A2, 1 * SIZE(AO) LFD A3, 2 * SIZE(AO) LFDU A5, 4 * SIZE(AO) LFD B1, 0 * SIZE(B) LFD B2, 1 * SIZE(B) LFD B3, 2 * SIZE(B) LFD B4, 3 * SIZE(B) dcbtst CO1, PREC dcbtst CO2, PREC srawi. r0, K, 1 mr BO, B mtspr CTR, r0 ble .L15 #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD A1, 0 * SIZE(AO) LFD A2, 1 * SIZE(AO) LFD A3, 2 * SIZE(AO) LFDU A5, 4 * SIZE(AO) LFD B1, 0 * SIZE(B) LFD B2, 1 * SIZE(B) LFD B3, 2 * SIZE(B) LFD B4, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, B, r0 LFD A1, 0 * SIZE(AO) LFD A2, 1 * SIZE(AO) LFD A3, 2 * SIZE(AO) LFDU A5, 4 * SIZE(AO) LFD B1, 0 * SIZE(BO) LFD B2, 1 * SIZE(BO) LFD B3, 2 * SIZE(BO) LFD B4, 3 * SIZE(BO) #endif dcbtst CO1, PREC dcbtst CO2, PREC #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif srawi. TEMP, TEMP, 1 mtspr CTR, TEMP ble .L15 #endif .align 4 .L12: FMADD f0, A1, B1, f0 dcbt AO, PREA FMADD f4, A1, B2, f4 LFDU B5, 4 * SIZE(BO) FMADD f8, A1, B3, f8 dcbt BO, PREA FMADD f12, A1, B4, f12 LFD A4, -1 * SIZE(AO) FMADD f1, A2, B1, f1 nop FMADD f5, A2, B2, f5 LFD B6, 1 * SIZE(BO) FMADD f9, A2, B3, f9 LFDU A1, 4 * SIZE(AO) FMADD f13, A2, B4, f13 nop FMADD f2, A3, B1, f2 nop FMADD f6, A3, B2, f6 LFD B7, 2 * SIZE(BO) FMADD f10, A3, B3, f10 LFD A2, -3 * SIZE(AO) FMADD f14, A3, B4, f14 nop FMADD f3, A4, B1, f3 nop FMADD f7, A4, B2, f7 LFD B8, 3 * SIZE(BO) FMADD f11, A4, B3, f11 LFD A3, -2 * SIZE(AO) FMADD f15, A4, B4, f15 nop FMADD f0, A5, B5, f0 #ifdef DOUBLE dcbt AO, PREA #else nop #endif FMADD f4, A5, B6, f4 LFDU B1, 4 * SIZE(BO) FMADD f8, A5, B7, f8 #ifdef DOUBLE dcbt BO, PREA #else nop #endif FMADD f12, A5, B8, f12 LFD A4, -1 * SIZE(AO) FMADD f1, A2, B5, f1 nop FMADD f5, A2, B6, f5 LFD B2, 1 * SIZE(BO) FMADD f9, A2, B7, f9 LFDU A5, 4 * SIZE(AO) FMADD f13, A2, B8, f13 nop FMADD f2, A3, B5, f2 nop FMADD f6, A3, B6, f6 LFD B3, 2 * SIZE(BO) FMADD f10, A3, B7, f10 LFD A2, -3 * SIZE(AO) FMADD f14, A3, B8, f14 nop FMADD f3, A4, B5, f3 nop FMADD f7, A4, B6, f7 LFD B4, 3 * SIZE(BO) FMADD f11, A4, B7, f11 LFD A3, -2 * SIZE(AO) FMADD f15, A4, B8, f15 bdnz .L12 .align 4 .align 4 .L15: addi AO, AO, -4 * SIZE #ifndef TRMMKERNEL andi. r0, K, 1 lfd f30, ALPHA_R lfd f31, ALPHA_I ble .LKERNEL_MainFinish #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif andi. 
TEMP, TEMP, 1 lfd f30, ALPHA_R lfd f31, ALPHA_I ble .LKERNEL_MainFinish #endif .align 4 .L16: FMADD f0, A1, B1, f0 LFD A4, 3 * SIZE(AO) FMADD f4, A1, B2, f4 FMADD f8, A1, B3, f8 FMADD f12, A1, B4, f12 FMADD f1, A2, B1, f1 FMADD f5, A2, B2, f5 FMADD f9, A2, B3, f9 FMADD f13, A2, B4, f13 FMADD f2, A3, B1, f2 FMADD f6, A3, B2, f6 FMADD f10, A3, B3, f10 FMADD f14, A3, B4, f14 FMADD f3, A4, B1, f3 FMADD f7, A4, B2, f7 FMADD f11, A4, B3, f11 addi AO, AO, 4 * SIZE FMADD f15, A4, B4, f15 addi BO, BO, 4 * SIZE .align 4 .LKERNEL_MainFinish: #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #ifndef TRMMKERNEL LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) #endif FSUB f8, f8, f13 FADD f9, f9, f12 FSUB f10, f10, f15 FADD f11, f11, f14 #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #ifndef TRMMKERNEL LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) #endif FADD f8, f8, f13 FSUB f9, f9, f12 FADD f10, f10, f15 FSUB f11, f11, f14 #else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #ifndef TRMMKERNEL LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) #endif FADD f8, f8, f13 FSUB f9, f12, f9 FADD f10, f10, f15 FSUB f11, f14, f11 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FMADD f17, f30, f1, f17 FMADD f18, f30, f2, f18 FMADD f19, f30, f3, f19 FMADD f20, f30, f8, f20 FMADD f21, f30, f9, f21 FMADD f22, f30, f10, f22 FMADD f23, f30, f11, f23 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 FMUL f20, f30, f8 FMUL f21, f30, f9 FMUL f22, f30, f10 FMUL f23, f30, f11 #endif FNMSUB f16, f31, f1, f16 FMADD f17, f31, f0, f17 FNMSUB f18, f31, f3, f18 FMADD f19, f31, f2, f19 FNMSUB f20, f31, f9, f20 FMADD f21, f31, f8, f21 FNMSUB f22, f31, f11, f22 FMADD f23, f31, f10, f23 #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC)|| defined(RR) */ #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FNMSUB f17, f30, f1, f17 FMADD f18, f30, f2, f18 FNMSUB f19, f30, f3, f19 FMADD f20, f30, f8, f20 FNMSUB f21, f30, f9, f21 FMADD f22, f30, f10, f22 FNMSUB f23, f30, f11, f23 FMADD f16, f31, f1, f16 FMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FMADD f19, f31, f2, f19 FMADD f20, f31, f9, f20 FMADD f21, f31, f8, f21 FMADD f22, f31, f11, f22 FMADD f23, f31, f10, f23 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 FMUL f20, f30, f8 FMUL f21, f30, f9 FMUL f22, f30, f10 FMUL f23, f30, f11 FMADD f16, f31, f1, f16 FNMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FNMADD f19, f31, f2, f19 FMADD f20, f31, f9, f20 FNMADD f21, f31, f8, f21 FMADD f22, f31, f11, f22 FNMADD f23, f31, f10, f23 #endif #endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) STFD f18, 2 * SIZE(CO1) STFD f19, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f20, 0 * SIZE(CO2) STFD f21, 1 * SIZE(CO2) STFD f22, 2 * SIZE(CO2) STFD f23, 3 * SIZE(CO2) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, 
f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -2 #endif slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif addic. I, I, -1 bgt .L11 .align 4 .L20: andi. I, M, 1 ble .L29 #ifndef TRMMKERNEL LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, K, 2 mr BO, B mtspr CTR, r0 ble .L25 #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP ble .L25 #endif .align 4 .L22: fmadd f0, f16, f20, f0 LFD f27, 7 * SIZE(BO) fmadd f1, f16, f21, f1 LFD f19, 3 * SIZE(AO) fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFD f16, 4 * SIZE(AO) fmadd f4, f17, f20, f4 LFD f20, 8 * SIZE(BO) fmadd f5, f17, f21, f5 LFD f21, 9 * SIZE(BO) fmadd f6, f17, f22, f6 LFD f22, 10 * SIZE(BO) fmadd f7, f17, f23, f7 LFD f23, 11 * SIZE(BO) fmadd f0, f18, f24, f0 LFD f17, 5 * SIZE(AO) fmadd f1, f18, f25, f1 nop fmadd f2, f18, f26, f2 nop fmadd f3, f18, f27, f3 LFD f18, 6 * SIZE(AO) fmadd f4, f19, f24, f4 LFD f24, 12 * SIZE(BO) fmadd f5, f19, f25, f5 LFD f25, 13 * SIZE(BO) fmadd f6, f19, f26, f6 LFD f26, 14 * SIZE(BO) fmadd f7, f19, f27, f7 LFD f27, 15 * SIZE(BO) fmadd f0, f16, f20, f0 LFD f19, 7 * SIZE(AO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFDU f16, 8 * SIZE(AO) fmadd f4, f17, f20, f4 LFDU f20, 16 * SIZE(BO) fmadd f5, f17, f21, f5 LFD f21, 1 * SIZE(BO) fmadd f6, f17, f22, f6 LFD f22, 2 * SIZE(BO) fmadd f7, f17, f23, f7 LFD f23, 3 * SIZE(BO) fmadd f0, f18, f24, f0 LFD f17, 1 * SIZE(AO) fmadd f1, f18, f25, f1 nop fmadd f2, f18, f26, f2 nop fmadd f3, f18, f27, f3 LFD f18, 2 * SIZE(AO) fmadd f4, f19, f24, f4 LFD f24, 4 * SIZE(BO) fmadd f5, f19, f25, f5 LFD f25, 5 * SIZE(BO) fmadd f6, f19, f26, f6 LFD f26, 6 * SIZE(BO) fmadd f7, f19, f27, f7 bdnz .L22 .align 4 .L25: #ifndef TRMMKERNEL andi. r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, r0 ble .L27 #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif andi. 
TEMP, TEMP, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, TEMP ble .L27 #endif .align 4 .L26: fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 LFDU f16, 2 * SIZE(AO) fmadd f4, f17, f20, f4 LFDU f20, 4 * SIZE(BO) fmadd f5, f17, f21, f5 LFD f21, 1 * SIZE(BO) fmadd f6, f17, f22, f6 LFD f22, 2 * SIZE(BO) fmadd f7, f17, f23, f7 LFD f23, 3 * SIZE(BO) LFD f17, 1 * SIZE(AO) bdnz .L26 .align 4 .L27: #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #endif #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FMADD f17, f30, f1, f17 FMADD f18, f30, f2, f18 FMADD f19, f30, f3, f19 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 #endif FNMSUB f16, f31, f1, f16 FMADD f17, f31, f0, f17 FNMSUB f18, f31, f3, f18 FMADD f19, f31, f2, f19 #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC)|| defined(RR) */ #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FNMSUB f17, f30, f1, f17 FMADD f18, f30, f2, f18 FNMSUB f19, f30, f3, f19 FMADD f16, f31, f1, f16 FMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FMADD f19, f31, f2, f19 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 FMADD f16, f31, f1, f16 FNMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FNMADD f19, f31, f2, f19 #endif #endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) STFD f18, 0 * SIZE(CO2) STFD f19, 1 * SIZE(CO2) addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 1 #endif #endif .align 4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 2 #endif mr B, BO addic. J, J, -1 lfs f0, FZERO bgt .L10 .align 4 .L30: andi. J, N, 1 ble .L999 #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif srawi. I, M, 1 mr CO1, C add C, C, LDC mr AO, A ble .L40 .align 4 .L31: #ifndef TRMMKERNEL LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
r0, K, 2 mr BO, B mtspr CTR, r0 ble .L35 #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 mr BO, B #else slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP ble .L35 #endif .align 4 .L32: fmadd f0, f16, f20, f0 LFD f27, 7 * SIZE(AO) fmadd f1, f16, f21, f1 LFD f19, 3 * SIZE(BO) fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFD f16, 4 * SIZE(BO) fmadd f4, f17, f20, f4 LFD f20, 8 * SIZE(AO) fmadd f5, f17, f21, f5 LFD f21, 9 * SIZE(AO) fmadd f6, f17, f22, f6 LFD f22, 10 * SIZE(AO) fmadd f7, f17, f23, f7 LFD f23, 11 * SIZE(AO) fmadd f0, f18, f24, f0 LFD f17, 5 * SIZE(BO) fmadd f1, f18, f25, f1 nop fmadd f2, f18, f26, f2 nop fmadd f3, f18, f27, f3 LFD f18, 6 * SIZE(BO) fmadd f4, f19, f24, f4 LFD f24, 12 * SIZE(AO) fmadd f5, f19, f25, f5 LFD f25, 13 * SIZE(AO) fmadd f6, f19, f26, f6 LFD f26, 14 * SIZE(AO) fmadd f7, f19, f27, f7 LFD f27, 15 * SIZE(AO) fmadd f0, f16, f20, f0 LFD f19, 7 * SIZE(BO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFDU f16, 8 * SIZE(BO) fmadd f4, f17, f20, f4 LFDU f20, 16 * SIZE(AO) fmadd f5, f17, f21, f5 LFD f21, 1 * SIZE(AO) fmadd f6, f17, f22, f6 LFD f22, 2 * SIZE(AO) fmadd f7, f17, f23, f7 LFD f23, 3 * SIZE(AO) fmadd f0, f18, f24, f0 LFD f17, 1 * SIZE(BO) fmadd f1, f18, f25, f1 nop fmadd f2, f18, f26, f2 nop fmadd f3, f18, f27, f3 LFD f18, 2 * SIZE(BO) fmadd f4, f19, f24, f4 LFD f24, 4 * SIZE(AO) fmadd f5, f19, f25, f5 LFD f25, 5 * SIZE(AO) fmadd f6, f19, f26, f6 LFD f26, 6 * SIZE(AO) fmadd f7, f19, f27, f7 bdnz .L32 .align 4 .L35: #ifndef TRMMKERNEL andi. r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, r0 ble .L37 #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif andi. 
TEMP, TEMP, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, TEMP ble .L37 #endif .align 4 .L36: fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 LFDU f16, 2 * SIZE(BO) fmadd f4, f17, f20, f4 LFDU f20, 4 * SIZE(AO) fmadd f5, f17, f21, f5 LFD f21, 1 * SIZE(AO) fmadd f6, f17, f22, f6 LFD f22, 2 * SIZE(AO) fmadd f7, f17, f23, f7 LFD f23, 3 * SIZE(AO) LFD f17, 1 * SIZE(BO) bdnz .L36 .align 4 .L37: #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FMADD f17, f30, f1, f17 FMADD f18, f30, f2, f18 FMADD f19, f30, f3, f19 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 #endif FNMSUB f16, f31, f1, f16 FMADD f17, f31, f0, f17 FNMSUB f18, f31, f3, f18 FMADD f19, f31, f2, f19 #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC)|| defined(RR) */ #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FNMSUB f17, f30, f1, f17 FMADD f18, f30, f2, f18 FNMSUB f19, f30, f3, f19 FMADD f16, f31, f1, f16 FMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FMADD f19, f31, f2, f19 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 FMADD f16, f31, f1, f16 FNMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FNMADD f19, f31, f2, f19 #endif #endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) STFD f18, 2 * SIZE(CO1) STFD f19, 3 * SIZE(CO1) addi CO1, CO1, 4 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -1 #endif slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif addic. I, I, -1 bgt .L31 .align 4 .L40: andi. I, M, 1 ble .L999 #ifndef TRMMKERNEL LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
r0, K, 2 mr BO, B mtspr CTR, r0 ble .L45 #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 mr BO, B #else slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP ble .L45 #endif .align 4 .L42: fmadd f0, f16, f20, f0 LFD f23, 3 * SIZE(BO) fmadd f3, f16, f21, f3 LFD f16, 4 * SIZE(AO) fmadd f2, f17, f20, f2 LFD f20, 4 * SIZE(BO) fmadd f1, f17, f21, f1 LFD f17, 5 * SIZE(AO) fmadd f4, f18, f22, f4 LFD f21, 5 * SIZE(BO) fmadd f7, f18, f23, f7 LFD f18, 6 * SIZE(AO) fmadd f6, f19, f22, f6 LFD f22, 6 * SIZE(BO) fmadd f5, f19, f23, f5 LFD f19, 7 * SIZE(AO) fmadd f0, f16, f20, f0 LFD f23, 7 * SIZE(BO) fmadd f3, f16, f21, f3 LFDU f16, 8 * SIZE(AO) fmadd f2, f17, f20, f2 LFDU f20, 8 * SIZE(BO) fmadd f1, f17, f21, f1 LFD f17, 1 * SIZE(AO) fmadd f4, f18, f22, f4 LFD f21, 1 * SIZE(BO) fmadd f7, f18, f23, f7 LFD f18, 2 * SIZE(AO) fmadd f6, f19, f22, f6 LFD f22, 2 * SIZE(BO) fmadd f5, f19, f23, f5 LFD f19, 3 * SIZE(AO) bdnz .L42 .align 4 .L45: fadd f0, f0, f4 fadd f1, f1, f5 fadd f2, f2, f6 fadd f3, f3, f7 #ifndef TRMMKERNEL andi. r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR,r0 ble .L47 #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif andi. 
TEMP, TEMP, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR,TEMP ble .L47 #endif .align 4 .L46: fmadd f0, f16, f20, f0 fmadd f3, f16, f21, f3 LFDU f16, 2 * SIZE(AO) fmadd f2, f17, f20, f2 LFDU f20, 2 * SIZE(BO) fmadd f1, f17, f21, f1 LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) bdnz .L46 .align 4 .L47: #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) fsub f0, f0, f1 fadd f2, f2, f3 #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) fadd f0, f0, f1 fsub f2, f2, f3 #else fadd f0, f0, f1 fsub f2, f3, f2 #endif #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FMADD f17, f30, f2, f17 #else FMUL f16, f30, f0 FMUL f17, f30, f2 #endif FNMSUB f16, f31, f2, f16 FMADD f17, f31, f0, f17 #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC) || defined(RR) */ #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FNMSUB f17, f30, f2, f17 FMADD f16, f31, f2, f16 FMADD f17, f31, f0, f17 #else FMUL f16, f30, f0 FMUL f17, f30, f2 FMADD f16, f31, f2, f16 FNMADD f17, f31, f0, f17 #endif #endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) .align 4 .L999: addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) #ifdef TRMMKERNEL ld r23, 208(SP) ld r22, 216(SP) #endif #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) #ifdef TRMMKERNEL lwz r23, 176(SP) lwz r22, 180(SP) #endif #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/zgemm_kernel_hummer.S000066400000000000000000002324071313527062700217500ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #undef ZERO #define ALPHA 0 #define FZERO 16 #define M r3 #define N r4 #define K r5 #ifdef linux #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #endif #define TEMP r11 #define KK r14 #define INCM1 r15 #define INCM3 r16 #define INCM5 r17 #define INCM7 r18 #define INC2 r19 #define INC r20 #define INC4 r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define AO2 r26 #define BO2 r27 #define CO1 r28 #define CO2 r29 #define ZERO r31 #ifndef NEEDPARAM #define A1 f16 #define A2 f17 #define A3 f18 #define A4 f19 #define A5 f20 #define A6 f21 #define A7 f22 #define A8 f23 #define A9 f24 #define A10 f25 #define B1 f26 #define B2 f27 #define B3 f28 #define B4 f29 #define B5 f30 #define B6 f31 #define AP B6 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) #define FXCPMADD fxcpmadd #define FXCSMADD fxcxnpma #else #define FXCPMADD fxcpnsma #define FXCSMADD fxcxma #endif PROLOGUE PROFCODE li r0, -16 stfpdux f14, SP, r0 stfpdux f15, SP, r0 stfpdux f16, SP, r0 stfpdux f17, SP, r0 stfpdux f18, SP, r0 stfpdux f19, SP, r0 stfpdux f20, SP, r0 stfpdux f21, SP, r0 stfpdux f22, SP, r0 stfpdux f23, SP, r0 stfpdux f24, SP, r0 stfpdux f25, SP, r0 stfpdux f26, SP, r0 stfpdux f27, SP, r0 stfpdux f28, SP, r0 stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) stwu r28, -4(SP) stwu r27, -4(SP) stwu r26, -4(SP) stwu r25, -4(SP) stwu r24, -4(SP) stwu r23, -4(SP) stwu r22, -4(SP) stwu r21, -4(SP) stwu r20, -4(SP) stwu r19, -4(SP) stwu r18, -4(SP) stwu r17, -4(SP) stwu r16, -4(SP) stwu r15, -4(SP) stwu r14, -4(SP) li r0, 0 stwu r0, -4(SP) stwu r0, -4(SP) stfdu f2, -8(SP) stfdu f1, -8(SP) slwi LDC, LDC, ZBASE_SHIFT cmpwi cr0, M, 0 ble .L999 cmpwi cr0, N, 0 ble .L999 cmpwi cr0, K, 0 ble .L999 #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif andi. r0, C, 2 * SIZE - 1 bne .L1000 li INC, 1 * SIZE li INC2, 2 * SIZE li INC4, 4 * SIZE li INCM1, -1 * SIZE li INCM3, -2 * SIZE li INCM5, -4 * SIZE li INCM7, -6 * SIZE addi C, C, - 2 * SIZE srawi. J, N, 1 ble .L50 .align 4 .L10: mr CO1, C add CO2, C, LDC add C, CO2, LDC #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif addi AO, A, -4 * SIZE li r0, FZERO lfpsx f0, SP, r0 srawi. 
I, M, 2 ble .L20 .align 4 .L11: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 #else slwi TEMP, KK, 2 + ZBASE_SHIFT slwi r0, KK, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 2 #endif srawi. r0, TEMP, 2 fpmr f1, f0 mtspr CTR, r0 ble .L14 #else addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 srawi. r0, K, 2 fpmr f1, f0 mtspr CTR, r0 ble .L14 #endif LFPDUX A1, AO, INC4 fpmr f5, f0 LFPDUX A3, AO, INC4 fpmr f9, f0 LFPDUX B1, BO, INC4 fpmr f13, f0 LFPDUX A5, AO, INC4 fpmr f2, f0 LFPDUX A6, AO, INC4 fpmr f6, f0 LFPDUX B3, BO, INC4 fpmr f10, f0 LFPDUX A7, AO, INC4 fpmr f14, f0 LFPDUX A8, AO, INC4 fpmr f3, f0 LFPDUX B5, BO, INC4 fpmr f7, f0 LFPDUX A9, AO, INC4 fpmr f11, f0 LFPDUX A2, AO2, INC4 fpmr f15, f0 LFPDUX B2, BO2, INC4 bdz- .L13 .align 4 .L12: ## 1 ## FXCPMADD f0, B1, A1, f0 nop FXCSMADD f4, B1, A1, f4 nop FXCPMADD f8, B2, A1, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A1, f12 LFPDUX B6, BO, INC4 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 LFPDUX A10, AO, INC4 FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 nop FXCPMADD f10, B2, A3, f10 nop FXCSMADD f14, B2, A3, f14 nop FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 LFPDUX A1, AO, INC4 FXCSMADD f15, B2, A4, f15 nop ## 2 ## FXCPMADD f0, B3, A5, f0 nop FXCSMADD f4, B3, A5, f4 nop FXCPMADD f8, B4, A5, f8 LFPDUX B2, BO2, INC4 FXCSMADD f12, B4, A5, f12 LFPDUX B1, BO, INC4 FXCPMADD f1, B3, A2, f1 nop FXCSMADD f5, B3, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 LFPDUX A3, AO, INC4 FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B3, A6, f2 nop FXCSMADD f6, B3, A6, f6 nop FXCPMADD f10, B4, A6, f10 nop FXCSMADD f14, B4, A6, f14 nop FXCPMADD f3, B3, A4, f3 nop FXCSMADD f7, B3, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B4, A4, f11 LFPDUX A5, AO, INC4 FXCSMADD f15, B4, A4, f15 nop ## 3 ## FXCPMADD f0, B5, A7, f0 nop FXCSMADD f4, B5, A7, f4 nop FXCPMADD f8, B2, A7, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A7, f12 LFPDUX B3, BO, INC4 FXCPMADD f1, B5, A2, f1 nop FXCSMADD f5, B5, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 LFPDUX A6, AO, INC4 FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B5, A8, f2 nop FXCSMADD f6, B5, A8, f6 nop FXCPMADD f10, B2, A8, f10 nop FXCSMADD f14, B2, A8, f14 nop FXCPMADD f3, B5, A4, f3 nop FXCSMADD f7, B5, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 LFPDUX A7, AO, INC4 FXCSMADD f15, B2, A4, f15 nop ## 4 ## FXCPMADD f0, B6, A9, f0 nop FXCSMADD f4, B6, A9, f4 nop FXCPMADD f8, B4, A9, f8 LFPDUX B2, BO2, INC4 FXCSMADD f12, B4, A9, f12 LFPDUX B5, BO, INC4 FXCPMADD f1, B6, A2, f1 nop FXCSMADD f5, B6, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 LFPDUX A8, AO, INC4 FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B6, A10, f2 nop FXCSMADD f6, B6, A10, f6 nop FXCPMADD f10, B4, A10, f10 nop FXCSMADD f14, B4, A10, f14 nop FXCPMADD f3, B6, A4, f3 LFPDUX A2, AO2, INC4 FXCSMADD f7, B6, A4, f7 LFPDUX A9, AO, INC4 FXCPMADD f11, B4, A4, f11 nop FXCSMADD f15, B4, A4, f15 bdnz+ .L12 .align 4 .L13: ## 1 ## 
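/* Annotation (editor's note, inferred from the code below): .L13 is the final
   unrolled pass of the 4x2 complex micro-kernel.  It repeats the four
   FXCPMADD/FXCSMADD groups of .L12, but the load slots that .L12 spends
   prefetching the next A/B panel are used here (non-TRMM build) to start
   fetching the C tile that .L18 will scale by alpha and store back. */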
FXCPMADD f0, B1, A1, f0 nop FXCSMADD f4, B1, A1, f4 nop FXCPMADD f8, B2, A1, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A1, f12 LFPDUX B6, BO, INC4 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 LFPDUX A10, AO, INC4 FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 nop FXCPMADD f10, B2, A3, f10 nop FXCSMADD f14, B2, A3, f14 nop FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 #ifndef TRMMKERNEL LFPDUX A1, CO1, INC2 #else nop #endif FXCSMADD f15, B2, A4, f15 nop ## 2 ## FXCPMADD f0, B3, A5, f0 nop FXCSMADD f4, B3, A5, f4 nop FXCPMADD f8, B4, A5, f8 LFPDUX B2, BO2, INC4 FXCSMADD f12, B4, A5, f12 #ifndef TRMMKERNEL LFPDUX B1, CO1, INC2 #else nop #endif FXCPMADD f1, B3, A2, f1 nop FXCSMADD f5, B3, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 #ifndef TRMMKERNEL LFPDUX A3, CO1, INC2 #else nop #endif FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B3, A6, f2 nop FXCSMADD f6, B3, A6, f6 nop FXCPMADD f10, B4, A6, f10 nop FXCSMADD f14, B4, A6, f14 nop FXCPMADD f3, B3, A4, f3 nop FXCSMADD f7, B3, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B4, A4, f11 #ifndef TRMMKERNEL LFPDUX A5, CO1, INC2 #else nop #endif FXCSMADD f15, B4, A4, f15 nop ## 3 ## FXCPMADD f0, B5, A7, f0 nop FXCSMADD f4, B5, A7, f4 nop FXCPMADD f8, B2, A7, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A7, f12 #ifndef TRMMKERNEL LFPDUX B3, CO2, INC2 #else nop #endif FXCPMADD f1, B5, A2, f1 nop FXCSMADD f5, B5, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 #ifndef TRMMKERNEL LFPDUX A6, CO2, INC2 #else nop #endif FXCSMADD f13, B2, A2, f13 FXCPMADD f2, B5, A8, f2 nop FXCSMADD f6, B5, A8, f6 nop FXCPMADD f10, B2, A8, f10 nop FXCSMADD f14, B2, A8, f14 nop FXCPMADD f3, B5, A4, f3 nop FXCSMADD f7, B5, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 #ifndef TRMMKERNEL LFPDUX A7, CO2, INC2 #else nop #endif FXCSMADD f15, B2, A4, f15 nop ## 4 ## FXCPMADD f0, B6, A9, f0 nop FXCSMADD f4, B6, A9, f4 nop FXCPMADD f8, B4, A9, f8 #ifndef TRMMKERNEL LFPDUX B2, CO2, INC2 #else nop #endif FXCSMADD f12, B4, A9, f12 FXCPMADD f1, B6, A2, f1 nop FXCSMADD f5, B6, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 nop FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B6, A10, f2 FXCSMADD f6, B6, A10, f6 FXCPMADD f10, B4, A10, f10 FXCSMADD f14, B4, A10, f14 FXCPMADD f3, B6, A4, f3 FXCSMADD f7, B6, A4, f7 FXCPMADD f11, B4, A4, f11 FXCSMADD f15, B4, A4, f15 .align 4 .L14: li r0, ALPHA lfpdx AP, SP, r0 #ifdef TRMMKERNEL li r0, FZERO lfpsx f30, SP, r0 #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 2 #endif andi. r0, TEMP, 3 mtspr CTR, r0 ble+ .L18 cmpwi cr0, TEMP, 3 bgt+ .L15 #else andi. 
r0, K, 3 mtspr CTR, r0 ble+ .L18 cmpwi cr0, K, 3 bgt+ .L15 #endif #ifndef TRMMKERNEL LFPDUX A1, CO1, INC2 fpmr f5, f0 LFPDUX B1, CO1, INC2 fpmr f9, f0 LFPDUX A3, CO1, INC2 fpmr f13, f0 LFPDUX A5, CO1, INC2 fpmr f2, f0 LFPDUX B3, CO2, INC2 fpmr f6, f0 LFPDUX A6, CO2, INC2 fpmr f10, f0 LFPDUX A7, CO2, INC2 fpmr f14, f0 LFPDUX B2, CO2, INC2 fpmr f3, f0 #else fpmr f5, f0 fpmr f9, f0 fpmr f13, f0 fpmr f2, f0 fpmr f6, f0 fpmr f10, f0 fpmr f14, f0 fpmr f3, f0 #endif fpmr f7, f0 fpmr f11, f0 fpmr f15, f0 .align 4 .L15: LFPDUX A2, AO, INC4 LFPDUX A4, AO2, INC4 LFPDUX A10, BO, INC4 LFPDUX B4, BO2, INC4 bdz- .L17 .align 4 .L16: FXCPMADD f0, A10, A2, f0 FXCSMADD f4, A10, A2, f4 FXCPMADD f8, B4, A2, f8 FXCSMADD f12, B4, A2, f12 LFPDUX A2, AO, INC4 FXCPMADD f1, A10, A4, f1 FXCSMADD f5, A10, A4, f5 FXCPMADD f9, B4, A4, f9 FXCSMADD f13, B4, A4, f13 LFPDUX A4, AO2, INC4 FXCPMADD f2, A10, A2, f2 FXCSMADD f6, A10, A2, f6 FXCPMADD f10, B4, A2, f10 FXCSMADD f14, B4, A2, f14 LFPDUX A2, AO, INC4 FXCPMADD f3, A10, A4, f3 FXCSMADD f7, A10, A4, f7 LFPDUX A10, BO, INC4 FXCPMADD f11, B4, A4, f11 FXCSMADD f15, B4, A4, f15 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 bdnz+ .L16 .align 4 .L17: FXCPMADD f0, A10, A2, f0 FXCSMADD f4, A10, A2, f4 FXCPMADD f8, B4, A2, f8 FXCSMADD f12, B4, A2, f12 LFPDUX A2, AO, INC4 FXCPMADD f1, A10, A4, f1 FXCSMADD f5, A10, A4, f5 FXCPMADD f9, B4, A4, f9 FXCSMADD f13, B4, A4, f13 LFPDUX A4, AO2, INC4 FXCPMADD f2, A10, A2, f2 FXCSMADD f6, A10, A2, f6 FXCPMADD f10, B4, A2, f10 FXCSMADD f14, B4, A2, f14 FXCPMADD f3, A10, A4, f3 FXCSMADD f7, A10, A4, f7 FXCPMADD f11, B4, A4, f11 FXCSMADD f15, B4, A4, f15 .align 4 .L18: #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) fpadd f0, f0, f4 fpadd f8, f8, f12 fpadd f1, f1, f5 fpadd f9, f9, f13 fpadd f2, f2, f6 fpadd f10, f10, f14 fpadd f3, f3, f7 fpadd f11, f11, f15 #else fpsub f0, f0, f4 fpsub f8, f8, f12 fpsub f1, f1, f5 fpsub f9, f9, f13 fpsub f2, f2, f6 fpsub f10, f10, f14 fpsub f3, f3, f7 fpsub f11, f11, f15 #endif #ifndef TRMMKERNEL fxcpmadd A1, f0, AP, A1 fxcpmadd B1, f1, AP, B1 fxcpmadd A3, f2, AP, A3 fxcpmadd A5, f3, AP, A5 fxcxnpma f0, f0, AP, A1 fxcpmadd B3, f8, AP, B3 fxcxnpma f1, f1, AP, B1 fxcpmadd A6, f9, AP, A6 fxcxnpma f2, f2, AP, A3 fxcpmadd A7, f10, AP, A7 fxcxnpma f3, f3, AP, A5 fxcpmadd B2, f11, AP, B2 fxcxnpma f8, f8, AP, B3 STFPDUX f0, CO1, INCM7 fxcxnpma f9, f9, AP, A6 STFPDUX f1, CO1, INC2 fxcxnpma f10, f10, AP, A7 STFPDUX f2, CO1, INC2 fxcxnpma f11, f11, AP, B2 STFPDUX f3, CO1, INC2 STFPDUX f8, CO2, INCM7 STFPDUX f9, CO2, INC2 STFPDUX f10, CO2, INC2 STFPDUX f11, CO2, INC2 #else fxcpmadd f12, f0, AP, f30 fxcpmadd f13, f1, AP, f30 fxcpmadd f14, f2, AP, f30 fxcpmadd f15, f3, AP, f30 fxcxnpma f0, f0, AP, f12 fxcxnpma f1, f1, AP, f13 fxcxnpma f2, f2, AP, f14 fxcxnpma f3, f3, AP, f15 fxcpmadd f16, f8, AP, f30 fxcpmadd f17, f9, AP, f30 fxcpmadd f18, f10, AP, f30 fxcpmadd f19, f11, AP, f30 fxcxnpma f8, f8, AP, f16 fxcxnpma f9, f9, AP, f17 fxcxnpma f10, f10, AP, f18 fxcxnpma f11, f11, AP, f19 STFPDUX f0, CO1, INC2 STFPDUX f1, CO1, INC2 STFPDUX f2, CO1, INC2 STFPDUX f3, CO1, INC2 STFPDUX f8, CO2, INC2 STFPDUX f9, CO2, INC2 STFPDUX f10, CO2, INC2 STFPDUX f11, CO2, INC2 #endif #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 2 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT 
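/* LEFT-sided TRMM bookkeeping: KK advances by the 4 rows this block consumed. */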
addi KK, KK, 4 #endif #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L11 .align 4 .L20: andi. I, M, 2 beq .L30 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 #else slwi TEMP, KK, 1 + ZBASE_SHIFT slwi r0, KK, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif srawi. r0, TEMP, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, r0 fpmr f13, f0 ble .L24 #else addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 srawi. r0, K, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, r0 fpmr f13, f0 ble .L24 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX B3, BO, INC4 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 LFPDUX A5, AO, INC4 LFPDUX B5, BO, INC4 LFPDUX A6, AO2, INC4 LFPDUX B6, BO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A9, BO, INC4 LFPDUX A10, BO2, INC4 bdz- .L23 .align 4 .L22: FXCPMADD f0, B1, A1, f0 nop FXCSMADD f4, B1, A1, f4 LFPDUX A8, AO2, INC4 FXCPMADD f8, B2, A1, f8 nop FXCSMADD f12, B2, A1, f12 LFPDUX A1, AO, INC4 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX B1, BO, INC4 FXCPMADD f9, B2, A2, f9 nop FXCSMADD f13, B2, A2, f13 LFPDUX B2, BO2, INC4 FXCPMADD f0, B3, A3, f0 nop FXCSMADD f4, B3, A3, f4 LFPDUX A2, AO2, INC4 FXCPMADD f8, B4, A3, f8 nop FXCSMADD f12, B4, A3, f12 LFPDUX A3, AO, INC4 FXCPMADD f1, B3, A4, f1 nop FXCSMADD f5, B3, A4, f5 LFPDUX B3, BO, INC4 FXCPMADD f9, B4, A4, f9 nop FXCSMADD f13, B4, A4, f13 LFPDUX B4, BO2, INC4 FXCPMADD f0, B5, A5, f0 nop FXCSMADD f4, B5, A5, f4 LFPDUX A4, AO2, INC4 FXCPMADD f8, B6, A5, f8 nop FXCSMADD f12, B6, A5, f12 LFPDUX A5, AO, INC4 FXCPMADD f1, B5, A6, f1 nop FXCSMADD f5, B5, A6, f5 LFPDUX B5, BO, INC4 FXCPMADD f9, B6, A6, f9 nop FXCSMADD f13, B6, A6, f13 LFPDUX B6, BO2, INC4 FXCPMADD f0, A9, A7, f0 nop FXCSMADD f4, A9, A7, f4 LFPDUX A6, AO2, INC4 FXCPMADD f8, A10, A7, f8 nop FXCSMADD f12, A10, A7, f12 LFPDUX A7, AO, INC4 FXCPMADD f1, A9, A8, f1 nop FXCSMADD f5, A9, A8, f5 LFPDUX A9, BO, INC4 FXCPMADD f9, A10, A8, f9 nop FXCSMADD f13, A10, A8, f13 LFPDUX A10, BO2, INC4 bdnz+ .L22 .align 4 .L23: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 LFPDUX A8, AO2, INC4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 FXCPMADD f0, B3, A3, f0 FXCSMADD f4, B3, A3, f4 FXCPMADD f8, B4, A3, f8 FXCSMADD f12, B4, A3, f12 FXCPMADD f1, B3, A4, f1 FXCSMADD f5, B3, A4, f5 FXCPMADD f9, B4, A4, f9 FXCSMADD f13, B4, A4, f13 FXCPMADD f0, B5, A5, f0 FXCSMADD f4, B5, A5, f4 FXCPMADD f8, B6, A5, f8 FXCSMADD f12, B6, A5, f12 FXCPMADD f1, B5, A6, f1 FXCSMADD f5, B5, A6, f5 FXCPMADD f9, B6, A6, f9 FXCSMADD f13, B6, A6, f13 FXCPMADD f0, A9, A7, f0 FXCSMADD f4, A9, A7, f4 FXCPMADD f8, A10, A7, f8 FXCSMADD f12, A10, A7, f12 FXCPMADD f1, A9, A8, f1 FXCSMADD f5, A9, A8, f5 FXCPMADD f9, A10, A8, f9 FXCSMADD f13, A10, A8, f13 .align 4 .L24: li r0, ALPHA lfpdx AP, SP, r0 #ifdef TRMMKERNEL li r0, FZERO lfpsx f30, SP, r0 #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif andi. r0, TEMP, 3 mtspr CTR, r0 #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L28 LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 bdz- .L27 .align 4 .L26: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 LFPDUX A1, AO, INC4 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 LFPDUX B1, BO, INC4 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 bdnz+ .L26 .align 4 .L27: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 .align 4 .L28: #ifndef TRMMKERNEL LFPDUX A1, CO1, INC2 LFPDUX A2, CO1, INC2 LFPDUX A3, CO2, INC2 LFPDUX A4, CO2, INC2 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) fpadd f0, f0, f4 fpadd f8, f8, f12 fpadd f1, f1, f5 fpadd f9, f9, f13 #else fpsub f0, f0, f4 fpsub f8, f8, f12 fpsub f1, f1, f5 fpsub f9, f9, f13 #endif #ifndef TRMMKERNEL fxcpmadd A1, f0, AP, A1 fxcpmadd A2, f1, AP, A2 fxcpmadd A3, f8, AP, A3 fxcpmadd A4, f9, AP, A4 fxcxnpma f0, f0, AP, A1 fxcxnpma f1, f1, AP, A2 fxcxnpma f8, f8, AP, A3 fxcxnpma f9, f9, AP, A4 STFPDUX f0, CO1, INCM3 STFPDUX f1, CO1, INC2 STFPDUX f8, CO2, INCM3 STFPDUX f9, CO2, INC2 #else fxcpmadd f12, f0, AP, f30 fxcpmadd f13, f1, AP, f30 fxcpmadd f14, f8, AP, f30 fxcpmadd f15, f9, AP, f30 fxcxnpma f0, f0, AP, f12 fxcxnpma f1, f1, AP, f13 fxcxnpma f8, f8, AP, f14 fxcxnpma f9, f9, AP, f15 STFPDUX f0, CO1, INC2 STFPDUX f1, CO1, INC2 STFPDUX f8, CO2, INC2 STFPDUX f9, CO2, INC2 #endif #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L30: andi. I, M, 1 beq .L49 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 #else slwi TEMP, KK, 0 + ZBASE_SHIFT slwi r0, KK, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, BO, - 4 * SIZE fpmr f2, f0 addi BO2, BO, 2 * SIZE fpmr f3, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L34 #else addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 srawi. 
r0, K, 2 mtspr CTR, r0 ble .L34 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 LFPDUX A2, AO2, INC4 LFPDUX B3, BO, INC4 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A5, BO, INC4 LFPDUX A6, BO2, INC4 LFPDUX A4, AO2, INC4 LFPDUX A7, BO, INC4 LFPDUX A8, BO2, INC4 bdz- .L33 .align 4 .L32: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX B1, BO, INC4 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 LFPDUX B2, BO2, INC4 LFPDUX A1, AO, INC4 FXCPMADD f0, B3, A2, f0 FXCSMADD f1, B3, A2, f1 LFPDUX B3, BO, INC4 FXCPMADD f2, B4, A2, f2 FXCSMADD f3, B4, A2, f3 LFPDUX B4, BO2, INC4 LFPDUX A2, AO2, INC4 FXCPMADD f0, A5, A3, f0 FXCSMADD f1, A5, A3, f1 LFPDUX A5, BO, INC4 FXCPMADD f2, A6, A3, f2 FXCSMADD f3, A6, A3, f3 LFPDUX A6, BO2, INC4 LFPDUX A3, AO, INC4 FXCPMADD f0, A7, A4, f0 FXCSMADD f1, A7, A4, f1 LFPDUX A7, BO, INC4 FXCPMADD f2, A8, A4, f2 FXCSMADD f3, A8, A4, f3 LFPDUX A8, BO2, INC4 LFPDUX A4, AO2, INC4 bdnz+ .L32 .align 4 .L33: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 FXCPMADD f0, B3, A2, f0 FXCSMADD f1, B3, A2, f1 FXCPMADD f2, B4, A2, f2 FXCSMADD f3, B4, A2, f3 FXCPMADD f0, A5, A3, f0 FXCSMADD f1, A5, A3, f1 FXCPMADD f2, A6, A3, f2 FXCSMADD f3, A6, A3, f3 FXCPMADD f0, A7, A4, f0 FXCSMADD f1, A7, A4, f1 FXCPMADD f2, A8, A4, f2 FXCSMADD f3, A8, A4, f3 .align 4 .L34: li r0, ALPHA lfpdx AP, SP, r0 #ifdef TRMMKERNEL li r0, FZERO lfpsx f30, SP, r0 #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif andi. r0, TEMP, 3 mtspr CTR, r0 #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L38 LFPDX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdz- .L37 .align 4 .L36: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX B1, BO, INC4 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 LFPDX A1, AO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdnz+ .L36 .align 4 .L37: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 .align 4 .L38: #ifndef TRMMKERNEL LFPDX A1, CO1, INC2 LFPDX A2, CO2, INC2 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) fpadd f0, f0, f1 fpadd f2, f2, f3 #else fpsub f0, f0, f1 fpsub f2, f2, f3 #endif #ifndef TRMMKERNEL fxcpmadd A1, f0, AP, A1 fxcpmadd A2, f2, AP, A2 fxcxnpma f0, f0, AP, A1 fxcxnpma f2, f2, AP, A2 #else fxcpmadd f12, f0, AP, f30 fxcpmadd f13, f2, AP, f30 fxcxnpma f0, f0, AP, f12 fxcxnpma f2, f2, AP, f13 #endif STFPDUX f0, CO1, INC2 STFPDUX f2, CO2, INC2 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 1 #endif #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 2 #endif addi B, BO, 4 * SIZE addic. J, J, -1 bgt+ .L10 .align 4 .L50: andi. J, N, 1 beq .L999 mr CO1, C #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif addi AO, A, -2 * SIZE li r0, FZERO lfpsx f0, SP, r0 srawi. 
I, M, 2 ble .L60 .align 4 .L51: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) fpmr f4, f0 addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 #else slwi TEMP, KK, 2 + ZBASE_SHIFT slwi r0, KK, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, B, r0 fpmr f4, f0 addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 1 #endif srawi. r0, TEMP, 2 fpmr f3, f0 mtspr CTR, r0 fpmr f7, f0 ble .L54 #else srawi. r0, K, 2 fpmr f4, f0 addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 fpmr f3, f0 mtspr CTR, r0 fpmr f7, f0 ble .L54 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L53 .align 4 .L52: FXCPMADD f0, B1, A1, f0 LFPDUX B4, BO, INC2 FXCSMADD f4, B1, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A4, AO, INC2 FXCPMADD f0, B2, A5, f0 LFPDUX B1, BO, INC2 FXCSMADD f4, B2, A5, f4 LFPDUX A5, AO, INC2 FXCPMADD f1, B2, A6, f1 nop FXCSMADD f5, B2, A6, f5 LFPDUX A6, AO, INC2 FXCPMADD f2, B2, A7, f2 nop FXCSMADD f6, B2, A7, f6 LFPDUX A7, AO, INC2 FXCPMADD f3, B2, A8, f3 nop FXCSMADD f7, B2, A8, f7 LFPDUX A8, AO, INC2 FXCPMADD f0, B3, A1, f0 LFPDUX B2, BO, INC2 FXCSMADD f4, B3, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B3, A2, f1 nop FXCSMADD f5, B3, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B3, A3, f2 nop FXCSMADD f6, B3, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B3, A4, f3 nop FXCSMADD f7, B3, A4, f7 LFPDUX A4, AO, INC2 FXCPMADD f0, B4, A5, f0 LFPDUX B3, BO, INC2 FXCSMADD f4, B4, A5, f4 LFPDUX A5, AO, INC2 FXCPMADD f1, B4, A6, f1 nop FXCSMADD f5, B4, A6, f5 LFPDUX A6, AO, INC2 FXCPMADD f2, B4, A7, f2 nop FXCSMADD f6, B4, A7, f6 LFPDUX A7, AO, INC2 FXCPMADD f3, B4, A8, f3 nop FXCSMADD f7, B4, A8, f7 LFPDUX A8, AO, INC2 bdnz+ .L52 .align 4 .L53: FXCPMADD f0, B1, A1, f0 LFPDUX B4, BO, INC2 FXCSMADD f4, B1, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A4, AO, INC2 FXCPMADD f0, B2, A5, f0 nop FXCSMADD f4, B2, A5, f4 LFPDUX A5, AO, INC2 FXCPMADD f1, B2, A6, f1 nop FXCSMADD f5, B2, A6, f5 LFPDUX A6, AO, INC2 FXCPMADD f2, B2, A7, f2 nop FXCSMADD f6, B2, A7, f6 LFPDUX A7, AO, INC2 FXCPMADD f3, B2, A8, f3 nop FXCSMADD f7, B2, A8, f7 LFPDUX A8, AO, INC2 FXCPMADD f0, B3, A1, f0 FXCSMADD f4, B3, A1, f4 FXCPMADD f1, B3, A2, f1 FXCSMADD f5, B3, A2, f5 FXCPMADD f2, B3, A3, f2 FXCSMADD f6, B3, A3, f6 FXCPMADD f3, B3, A4, f3 FXCSMADD f7, B3, A4, f7 FXCPMADD f0, B4, A5, f0 FXCSMADD f4, B4, A5, f4 FXCPMADD f1, B4, A6, f1 FXCSMADD f5, B4, A6, f5 FXCPMADD f2, B4, A7, f2 FXCSMADD f6, B4, A7, f6 FXCPMADD f3, B4, A8, f3 FXCSMADD f7, B4, A8, f7 .align 4 .L54: li r0, ALPHA lfpdx AP, SP, r0 #ifdef TRMMKERNEL li r0, FZERO lfpsx f30, SP, r0 #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 1 #endif andi. 
r0, TEMP, 3 mtspr CTR, r0 #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L58 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 bdz- .L57 .align 4 .L56: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B1, A3, f2 FXCSMADD f6, B1, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B1, A4, f3 FXCSMADD f7, B1, A4, f7 LFPDUX A4, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L56 .align 4 .L57: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f2, B1, A3, f2 FXCSMADD f6, B1, A3, f6 FXCPMADD f3, B1, A4, f3 FXCSMADD f7, B1, A4, f7 .align 4 .L58: #ifndef TRMMKERNEL LFPDUX A1, CO1, INC2 LFPDUX A2, CO1, INC2 LFPDUX A3, CO1, INC2 LFPDUX A4, CO1, INC2 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) fpadd f0, f0, f4 fpadd f1, f1, f5 fpadd f2, f2, f6 fpadd f3, f3, f7 #else fpsub f0, f0, f4 fpsub f1, f1, f5 fpsub f2, f2, f6 fpsub f3, f3, f7 #endif #ifndef TRMMKERNEL fxcpmadd A1, f0, AP, A1 fxcpmadd A2, f1, AP, A2 fxcpmadd A3, f2, AP, A3 fxcpmadd A4, f3, AP, A4 fxcxnpma f0, f0, AP, A1 fxcxnpma f1, f1, AP, A2 fxcxnpma f2, f2, AP, A3 fxcxnpma f3, f3, AP, A4 STFPDUX f0, CO1, INCM7 STFPDUX f1, CO1, INC2 STFPDUX f2, CO1, INC2 STFPDUX f3, CO1, INC2 #else fxcpmadd f12, f0, AP, f30 fxcpmadd f13, f1, AP, f30 fxcpmadd f14, f2, AP, f30 fxcpmadd f15, f3, AP, f30 fxcxnpma f0, f0, AP, f12 fxcxnpma f1, f1, AP, f13 fxcxnpma f2, f2, AP, f14 fxcxnpma f3, f3, AP, f15 STFPDUX f0, CO1, INC2 STFPDUX f1, CO1, INC2 STFPDUX f2, CO1, INC2 STFPDUX f3, CO1, INC2 #endif #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -1 #endif slwi r0, TEMP, 2 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L51 .align 4 .L60: andi. I, M, 2 beq .L70 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0 #else slwi TEMP, KK, 1 + ZBASE_SHIFT slwi r0, KK, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE fpmr f1, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif srawi. r0, TEMP, 2 fpmr f2, f0 mtspr CTR, r0 fpmr f3, f0 ble .L64 #else srawi. 
r0, K, 2 fpmr f1, f0 addi BO, B, - 2 * SIZE fpmr f2, f0 mtspr CTR, r0 fpmr f3, f0 ble .L64 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L63 .align 4 .L62: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 FXCPMADD f0, B2, A3, f0 FXCSMADD f2, B2, A3, f2 LFPDUX A3, AO, INC2 FXCPMADD f1, B2, A4, f1 FXCSMADD f3, B2, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 FXCPMADD f0, B3, A5, f0 FXCSMADD f2, B3, A5, f2 LFPDUX A5, AO, INC2 FXCPMADD f1, B3, A6, f1 FXCSMADD f3, B3, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 FXCPMADD f0, B4, A7, f0 FXCSMADD f2, B4, A7, f2 LFPDUX A7, AO, INC2 FXCPMADD f1, B4, A8, f1 FXCSMADD f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L62 .align 4 .L63: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 FXCPMADD f0, B2, A3, f0 FXCSMADD f2, B2, A3, f2 FXCPMADD f1, B2, A4, f1 FXCSMADD f3, B2, A4, f3 FXCPMADD f0, B3, A5, f0 FXCSMADD f2, B3, A5, f2 FXCPMADD f1, B3, A6, f1 FXCSMADD f3, B3, A6, f3 FXCPMADD f0, B4, A7, f0 FXCSMADD f2, B4, A7, f2 FXCPMADD f1, B4, A8, f1 FXCSMADD f3, B4, A8, f3 .align 4 .L64: li r0, ALPHA lfpdx AP, SP, r0 #ifdef TRMMKERNEL li r0, FZERO lfpsx f30, SP, r0 #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif andi. r0, TEMP, 3 mtspr CTR, r0 #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L68 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdz- .L67 .align 4 .L66: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdnz+ .L66 .align 4 .L67: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 .align 4 .L68: #ifndef TRMMKERNEL LFPDUX A1, CO1, INC2 LFPDUX A2, CO1, INC2 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) fpadd f0, f0, f2 fpadd f1, f1, f3 #else fpsub f0, f0, f2 fpsub f1, f1, f3 #endif #ifndef TRMMKERNEL fxcpmadd A1, f0, AP, A1 fxcpmadd A2, f1, AP, A2 fxcxnpma f0, f0, AP, A1 fxcxnpma f1, f1, AP, A2 STFPDUX f0, CO1, INCM3 STFPDUX f1, CO1, INC2 #else fxcpmadd f12, f0, AP, f30 fxcpmadd f13, f1, AP, f30 fxcxnpma f0, f0, AP, f12 fxcxnpma f1, f1, AP, f13 STFPDUX f0, CO1, INC2 STFPDUX f1, CO1, INC2 #endif #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -1 #endif slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L70: andi. 
I, M, 1 beq .L89 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0 #else slwi TEMP, KK, 0 + ZBASE_SHIFT slwi r0, KK, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE fpmr f1, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif srawi. r0, TEMP, 3 fpmr f2, f0 mtspr CTR, r0 fpmr f3, f0 ble .L74 #else addi BO, B, - 2 * SIZE fpmr f1, f0 srawi. r0, K, 3 fpmr f2, f0 mtspr CTR, r0 fpmr f3, f0 ble .L74 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdz- .L73 .align 4 .L72: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 FXCPMADD f2, B2, A2, f2 FXCSMADD f3, B2, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 FXCPMADD f0, B3, A3, f0 FXCSMADD f1, B3, A3, f1 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 FXCPMADD f2, B4, A4, f2 FXCSMADD f3, B4, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 FXCPMADD f0, B5, A5, f0 FXCSMADD f1, B5, A5, f1 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 FXCPMADD f2, B6, A6, f2 FXCSMADD f3, B6, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 FXCPMADD f0, A9, A7, f0 FXCSMADD f1, A9, A7, f1 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 FXCPMADD f2, A10, A8, f2 FXCSMADD f3, A10, A8, f3 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdnz+ .L72 .align 4 .L73: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A2, f2 FXCSMADD f3, B2, A2, f3 FXCPMADD f0, B3, A3, f0 FXCSMADD f1, B3, A3, f1 FXCPMADD f2, B4, A4, f2 FXCSMADD f3, B4, A4, f3 FXCPMADD f0, B5, A5, f0 FXCSMADD f1, B5, A5, f1 FXCPMADD f2, B6, A6, f2 FXCSMADD f3, B6, A6, f3 FXCPMADD f0, A9, A7, f0 FXCSMADD f1, A9, A7, f1 FXCPMADD f2, A10, A8, f2 FXCSMADD f3, A10, A8, f3 .align 4 .L74: li r0, ALPHA lfpdx AP, SP, r0 #ifdef TRMMKERNEL li r0, FZERO lfpsx f30, SP, r0 #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif andi. r0, TEMP, 7 mtspr CTR, r0 #else andi. 
r0, K, 7 mtspr CTR, r0 #endif ble+ .L78 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdz- .L77 .align 4 .L76: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L76 .align 4 .L77: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 .align 4 .L78: #ifndef TRMMKERNEL LFPDX A1, CO1, INC2 #endif fpadd f0, f0, f2 fpadd f1, f1, f3 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) fpadd f0, f0, f1 #else fpsub f0, f0, f1 #endif #ifndef TRMMKERNEL fxcpmadd A1, f0, AP, A1 fxcxnpma f0, f0, AP, A1 #else fxcpmadd f12, f0, AP, f30 fxcxnpma f0, f0, AP, f12 #endif STFPDUX f0, CO1, INC2 li r0, FZERO lfpsx f0, SP, r0 .align 4 .L89: addi B, BO, 2 * SIZE .align 4 .L999: addi SP, SP, 20 lwzu r14, 4(SP) lwzu r15, 4(SP) lwzu r16, 4(SP) lwzu r17, 4(SP) lwzu r18, 4(SP) lwzu r19, 4(SP) lwzu r20, 4(SP) lwzu r21, 4(SP) lwzu r22, 4(SP) lwzu r23, 4(SP) lwzu r24, 4(SP) lwzu r25, 4(SP) lwzu r26, 4(SP) lwzu r27, 4(SP) lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f31, SP, r0 lfpdux f30, SP, r0 lfpdux f29, SP, r0 lfpdux f28, SP, r0 lfpdux f27, SP, r0 lfpdux f26, SP, r0 lfpdux f25, SP, r0 lfpdux f24, SP, r0 lfpdux f23, SP, r0 lfpdux f22, SP, r0 lfpdux f21, SP, r0 lfpdux f20, SP, r0 lfpdux f19, SP, r0 lfpdux f18, SP, r0 lfpdux f17, SP, r0 lfpdux f16, SP, r0 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr .align 4 .L1000: li INC, 1 * SIZE li INC2, 2 * SIZE li INC4, 4 * SIZE li INCM1, -1 * SIZE li INCM3, -3 * SIZE li INCM5, -5 * SIZE li INCM7, -7 * SIZE addi C, C, - 1 * SIZE srawi. J, N, 1 ble .L1050 .align 4 .L1010: mr CO1, C add CO2, C, LDC add C, CO2, LDC #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif addi AO, A, -4 * SIZE li r0, FZERO lfpsx f0, SP, r0 srawi. I, M, 2 ble .L1020 .align 4 .L1011: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 #else slwi TEMP, KK, 2 + ZBASE_SHIFT slwi r0, KK, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 2 #endif srawi. r0, TEMP, 2 fpmr f1, f0 mtspr CTR, r0 ble .L1014 #else addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 srawi. 
r0, K, 2 fpmr f1, f0 mtspr CTR, r0 ble .L1014 #endif LFPDUX A1, AO, INC4 fpmr f5, f0 LFPDUX A3, AO, INC4 fpmr f9, f0 LFPDUX B1, BO, INC4 fpmr f13, f0 LFPDUX A5, AO, INC4 fpmr f2, f0 LFPDUX A6, AO, INC4 fpmr f6, f0 LFPDUX B3, BO, INC4 fpmr f10, f0 LFPDUX A7, AO, INC4 fpmr f14, f0 LFPDUX A8, AO, INC4 fpmr f3, f0 LFPDUX B5, BO, INC4 fpmr f7, f0 LFPDUX A9, AO, INC4 fpmr f11, f0 LFPDUX A2, AO2, INC4 fpmr f15, f0 LFPDUX B2, BO2, INC4 bdz- .L1013 .align 4 .L1012: ## 1 ## FXCPMADD f0, B1, A1, f0 nop FXCSMADD f4, B1, A1, f4 nop FXCPMADD f8, B2, A1, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A1, f12 LFPDUX B6, BO, INC4 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 LFPDUX A10, AO, INC4 FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 nop FXCPMADD f10, B2, A3, f10 nop FXCSMADD f14, B2, A3, f14 nop FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 LFPDUX A1, AO, INC4 FXCSMADD f15, B2, A4, f15 nop ## 2 ## FXCPMADD f0, B3, A5, f0 nop FXCSMADD f4, B3, A5, f4 nop FXCPMADD f8, B4, A5, f8 LFPDUX B2, BO2, INC4 FXCSMADD f12, B4, A5, f12 LFPDUX B1, BO, INC4 FXCPMADD f1, B3, A2, f1 nop FXCSMADD f5, B3, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 LFPDUX A3, AO, INC4 FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B3, A6, f2 nop FXCSMADD f6, B3, A6, f6 nop FXCPMADD f10, B4, A6, f10 nop FXCSMADD f14, B4, A6, f14 nop FXCPMADD f3, B3, A4, f3 nop FXCSMADD f7, B3, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B4, A4, f11 LFPDUX A5, AO, INC4 FXCSMADD f15, B4, A4, f15 nop ## 3 ## FXCPMADD f0, B5, A7, f0 nop FXCSMADD f4, B5, A7, f4 nop FXCPMADD f8, B2, A7, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A7, f12 LFPDUX B3, BO, INC4 FXCPMADD f1, B5, A2, f1 nop FXCSMADD f5, B5, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 LFPDUX A6, AO, INC4 FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B5, A8, f2 nop FXCSMADD f6, B5, A8, f6 nop FXCPMADD f10, B2, A8, f10 nop FXCSMADD f14, B2, A8, f14 nop FXCPMADD f3, B5, A4, f3 nop FXCSMADD f7, B5, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 LFPDUX A7, AO, INC4 FXCSMADD f15, B2, A4, f15 nop ## 4 ## FXCPMADD f0, B6, A9, f0 nop FXCSMADD f4, B6, A9, f4 nop FXCPMADD f8, B4, A9, f8 LFPDUX B2, BO2, INC4 FXCSMADD f12, B4, A9, f12 LFPDUX B5, BO, INC4 FXCPMADD f1, B6, A2, f1 nop FXCSMADD f5, B6, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 LFPDUX A8, AO, INC4 FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B6, A10, f2 nop FXCSMADD f6, B6, A10, f6 nop FXCPMADD f10, B4, A10, f10 nop FXCSMADD f14, B4, A10, f14 nop FXCPMADD f3, B6, A4, f3 LFPDUX A2, AO2, INC4 FXCSMADD f7, B6, A4, f7 LFPDUX A9, AO, INC4 FXCPMADD f11, B4, A4, f11 nop FXCSMADD f15, B4, A4, f15 bdnz+ .L1012 .align 4 .L1013: ## 1 ## FXCPMADD f0, B1, A1, f0 nop FXCSMADD f4, B1, A1, f4 nop FXCPMADD f8, B2, A1, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A1, f12 LFPDUX B6, BO, INC4 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 LFPDUX A10, AO, INC4 FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 nop FXCPMADD f10, B2, A3, f10 nop FXCSMADD f14, B2, A3, f14 nop FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 #ifndef TRMMKERNEL LFDUX A1, CO1, INC #else nop #endif FXCSMADD f15, B2, A4, f15 nop ## 2 ## FXCPMADD f0, B3, A5, f0 nop FXCSMADD f4, B3, A5, f4 nop FXCPMADD f8, B4, A5, f8 LFPDUX B2, BO2, INC4 FXCSMADD f12, B4, A5, f12 #ifndef TRMMKERNEL LFDUX B1, CO1, INC2 #else nop #endif 
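/* Annotation (editor's note): from .L1000 on, C is not aligned to a 2*SIZE
   boundary, so the C tile is read with scalar LFDUX/LFSDUX pairs (primary and,
   presumably, secondary halves of the paired FP registers on this double-FPU
   target) instead of the paired LFPDUX loads used by the aligned path above. */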
FXCPMADD f1, B3, A2, f1 nop FXCSMADD f5, B3, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 #ifndef TRMMKERNEL LFDUX A3, CO1, INC2 #else nop #endif FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B3, A6, f2 nop FXCSMADD f6, B3, A6, f6 nop FXCPMADD f10, B4, A6, f10 nop FXCSMADD f14, B4, A6, f14 nop FXCPMADD f3, B3, A4, f3 nop FXCSMADD f7, B3, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B4, A4, f11 #ifndef TRMMKERNEL LFDUX A5, CO1, INC2 #else nop #endif FXCSMADD f15, B4, A4, f15 nop ## 3 ## FXCPMADD f0, B5, A7, f0 nop FXCSMADD f4, B5, A7, f4 nop FXCPMADD f8, B2, A7, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A7, f12 #ifndef TRMMKERNEL LFSDUX A1, CO1, INCM5 #else nop #endif FXCPMADD f1, B5, A2, f1 nop FXCSMADD f5, B5, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 #ifndef TRMMKERNEL LFSDUX B1, CO1, INC2 #else nop #endif FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B5, A8, f2 nop FXCSMADD f6, B5, A8, f6 nop FXCPMADD f10, B2, A8, f10 nop FXCSMADD f14, B2, A8, f14 nop FXCPMADD f3, B5, A4, f3 nop FXCSMADD f7, B5, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 #ifndef TRMMKERNEL LFSDUX A3, CO1, INC2 #else nop #endif FXCSMADD f15, B2, A4, f15 nop ## 4 ## FXCPMADD f0, B6, A9, f0 nop FXCSMADD f4, B6, A9, f4 nop FXCPMADD f8, B4, A9, f8 #ifndef TRMMKERNEL LFSDUX A5, CO1, INC2 #else nop #endif FXCSMADD f12, B4, A9, f12 #ifndef TRMMKERNEL LFDUX B3, CO2, INC #else nop #endif FXCPMADD f1, B6, A2, f1 nop FXCSMADD f5, B6, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 #ifndef TRMMKERNEL LFDUX A6, CO2, INC2 #else nop #endif FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B6, A10, f2 nop FXCSMADD f6, B6, A10, f6 nop FXCPMADD f10, B4, A10, f10 nop FXCSMADD f14, B4, A10, f14 #ifndef TRMMKERNEL LFDUX A7, CO2, INC2 #else nop #endif FXCPMADD f3, B6, A4, f3 nop FXCSMADD f7, B6, A4, f7 nop FXCPMADD f11, B4, A4, f11 nop FXCSMADD f15, B4, A4, f15 #ifndef TRMMKERNEL LFDUX B2, CO2, INC2 #else nop #endif .align 4 .L1014: li r0, ALPHA lfpdx AP, SP, r0 #ifdef TRMMKERNEL li r0, FZERO lfpsx f30, SP, r0 #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 2 #endif andi. r0, TEMP, 3 mtspr CTR, r0 ble+ .L1018 cmpwi cr0, TEMP, 3 bgt+ .L1015 #else andi. 
r0, K, 3 mtspr CTR, r0 ble+ .L1018 cmpwi cr0, K, 3 bgt+ .L1015 #endif #ifndef TRMMKERNEL LFDUX A1, CO1, INC fpmr f5, f0 LFDUX B1, CO1, INC2 fpmr f9, f0 LFDUX A3, CO1, INC2 fpmr f13, f0 LFDUX A5, CO1, INC2 fpmr f2, f0 LFSDUX A1, CO1, INCM5 fpmr f6, f0 LFSDUX B1, CO1, INC2 fpmr f10, f0 LFSDUX A3, CO1, INC2 fpmr f14, f0 LFSDUX A5, CO1, INC2 fpmr f3, f0 LFDUX B3, CO2, INC fpmr f7, f0 LFDUX A6, CO2, INC2 fpmr f11, f0 LFDUX A7, CO2, INC2 fpmr f15, f0 LFDUX B2, CO2, INC2 #else fpmr f5, f0 fpmr f9, f0 fpmr f13, f0 fpmr f2, f0 fpmr f6, f0 fpmr f10, f0 fpmr f14, f0 fpmr f3, f0 fpmr f7, f0 fpmr f11, f0 fpmr f15, f0 #endif .align 4 .L1015: LFPDUX A2, AO, INC4 LFPDUX A4, AO2, INC4 LFPDUX A10, BO, INC4 LFPDUX B4, BO2, INC4 bdz- .L1017 .align 4 .L1016: FXCPMADD f0, A10, A2, f0 FXCSMADD f4, A10, A2, f4 FXCPMADD f8, B4, A2, f8 FXCSMADD f12, B4, A2, f12 LFPDUX A2, AO, INC4 FXCPMADD f1, A10, A4, f1 FXCSMADD f5, A10, A4, f5 FXCPMADD f9, B4, A4, f9 FXCSMADD f13, B4, A4, f13 LFPDUX A4, AO2, INC4 FXCPMADD f2, A10, A2, f2 FXCSMADD f6, A10, A2, f6 FXCPMADD f10, B4, A2, f10 FXCSMADD f14, B4, A2, f14 LFPDUX A2, AO, INC4 FXCPMADD f3, A10, A4, f3 FXCSMADD f7, A10, A4, f7 LFPDUX A10, BO, INC4 FXCPMADD f11, B4, A4, f11 FXCSMADD f15, B4, A4, f15 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 bdnz+ .L1016 .align 4 .L1017: FXCPMADD f0, A10, A2, f0 FXCSMADD f4, A10, A2, f4 FXCPMADD f8, B4, A2, f8 FXCSMADD f12, B4, A2, f12 LFPDUX A2, AO, INC4 FXCPMADD f1, A10, A4, f1 FXCSMADD f5, A10, A4, f5 FXCPMADD f9, B4, A4, f9 FXCSMADD f13, B4, A4, f13 LFPDUX A4, AO2, INC4 FXCPMADD f2, A10, A2, f2 FXCSMADD f6, A10, A2, f6 FXCPMADD f10, B4, A2, f10 FXCSMADD f14, B4, A2, f14 FXCPMADD f3, A10, A4, f3 FXCSMADD f7, A10, A4, f7 FXCPMADD f11, B4, A4, f11 FXCSMADD f15, B4, A4, f15 .align 4 .L1018: #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) fpadd f0, f0, f4 fpadd f8, f8, f12 fpadd f1, f1, f5 fpadd f9, f9, f13 fpadd f2, f2, f6 fpadd f10, f10, f14 fpadd f3, f3, f7 fpadd f11, f11, f15 #else fpsub f0, f0, f4 fpsub f8, f8, f12 fpsub f1, f1, f5 fpsub f9, f9, f13 fpsub f2, f2, f6 fpsub f10, f10, f14 fpsub f3, f3, f7 fpsub f11, f11, f15 #endif #ifndef TRMMKERNEL fxcpmadd A1, f0, AP, A1 LFSDUX B3, CO2, INCM5 fxcpmadd B1, f1, AP, B1 LFSDUX A6, CO2, INC2 fxcpmadd A3, f2, AP, A3 LFSDUX A7, CO2, INC2 fxcpmadd A5, f3, AP, A5 LFSDUX B2, CO2, INC2 fxcxnpma f0, f0, AP, A1 fxcpmadd B3, f8, AP, B3 fxcxnpma f1, f1, AP, B1 fxcpmadd A6, f9, AP, A6 fxcxnpma f2, f2, AP, A3 fxcpmadd A7, f10, AP, A7 fxcxnpma f3, f3, AP, A5 STFDUX f0, CO1, INCM7 fxcpmadd B2, f11, AP, B2 STFSDUX f0, CO1, INC fxcxnpma f8, f8, AP, B3 STFDUX f1, CO1, INC STFSDUX f1, CO1, INC fxcxnpma f9, f9, AP, A6 STFDUX f2, CO1, INC STFSDUX f2, CO1, INC fxcxnpma f10, f10, AP, A7 STFDUX f3, CO1, INC STFSDUX f3, CO1, INC fxcxnpma f11, f11, AP, B2 STFDUX f8, CO2, INCM7 #else fxcpmadd f12, f0, AP, f30 fxcpmadd f13, f1, AP, f30 fxcpmadd f14, f2, AP, f30 fxcpmadd f15, f3, AP, f30 fxcxnpma f0, f0, AP, f12 fxcxnpma f1, f1, AP, f13 fxcxnpma f2, f2, AP, f14 fxcxnpma f3, f3, AP, f15 fxcpmadd f16, f8, AP, f30 fxcpmadd f17, f9, AP, f30 fxcpmadd f18, f10, AP, f30 fxcpmadd f19, f11, AP, f30 fxcxnpma f8, f8, AP, f16 fxcxnpma f9, f9, AP, f17 fxcxnpma f10, f10, AP, f18 fxcxnpma f11, f11, AP, f19 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC STFDUX f8, CO2, INC #endif STFSDUX f8, CO2, INC STFDUX f9, CO2, INC STFSDUX f9, CO2, INC STFDUX 
f10, CO2, INC STFSDUX f10, CO2, INC STFDUX f11, CO2, INC STFSDUX f11, CO2, INC #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 2 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L1011 .align 4 .L1020: andi. I, M, 2 beq .L1030 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 #else slwi TEMP, KK, 1 + ZBASE_SHIFT slwi r0, KK, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif srawi. r0, TEMP, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, r0 fpmr f13, f0 ble .L1024 #else addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 srawi. r0, K, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, r0 fpmr f13, f0 ble .L1024 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX B3, BO, INC4 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 LFPDUX A5, AO, INC4 LFPDUX B5, BO, INC4 LFPDUX A6, AO2, INC4 LFPDUX B6, BO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A9, BO, INC4 LFPDUX A10, BO2, INC4 bdz- .L1023 .align 4 .L1022: FXCPMADD f0, B1, A1, f0 nop FXCSMADD f4, B1, A1, f4 LFPDUX A8, AO2, INC4 FXCPMADD f8, B2, A1, f8 nop FXCSMADD f12, B2, A1, f12 LFPDUX A1, AO, INC4 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX B1, BO, INC4 FXCPMADD f9, B2, A2, f9 nop FXCSMADD f13, B2, A2, f13 LFPDUX B2, BO2, INC4 FXCPMADD f0, B3, A3, f0 nop FXCSMADD f4, B3, A3, f4 LFPDUX A2, AO2, INC4 FXCPMADD f8, B4, A3, f8 nop FXCSMADD f12, B4, A3, f12 LFPDUX A3, AO, INC4 FXCPMADD f1, B3, A4, f1 nop FXCSMADD f5, B3, A4, f5 LFPDUX B3, BO, INC4 FXCPMADD f9, B4, A4, f9 nop FXCSMADD f13, B4, A4, f13 LFPDUX B4, BO2, INC4 FXCPMADD f0, B5, A5, f0 nop FXCSMADD f4, B5, A5, f4 LFPDUX A4, AO2, INC4 FXCPMADD f8, B6, A5, f8 nop FXCSMADD f12, B6, A5, f12 LFPDUX A5, AO, INC4 FXCPMADD f1, B5, A6, f1 nop FXCSMADD f5, B5, A6, f5 LFPDUX B5, BO, INC4 FXCPMADD f9, B6, A6, f9 nop FXCSMADD f13, B6, A6, f13 LFPDUX B6, BO2, INC4 FXCPMADD f0, A9, A7, f0 nop FXCSMADD f4, A9, A7, f4 LFPDUX A6, AO2, INC4 FXCPMADD f8, A10, A7, f8 nop FXCSMADD f12, A10, A7, f12 LFPDUX A7, AO, INC4 FXCPMADD f1, A9, A8, f1 nop FXCSMADD f5, A9, A8, f5 LFPDUX A9, BO, INC4 FXCPMADD f9, A10, A8, f9 nop FXCSMADD f13, A10, A8, f13 LFPDUX A10, BO2, INC4 bdnz+ .L1022 .align 4 .L1023: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 LFPDUX A8, AO2, INC4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 FXCPMADD f0, B3, A3, f0 FXCSMADD f4, B3, A3, f4 FXCPMADD f8, B4, A3, f8 FXCSMADD f12, B4, A3, f12 FXCPMADD f1, B3, A4, f1 FXCSMADD f5, B3, A4, f5 FXCPMADD f9, B4, A4, f9 FXCSMADD f13, B4, A4, f13 FXCPMADD f0, B5, A5, f0 FXCSMADD f4, B5, A5, f4 FXCPMADD f8, B6, A5, f8 FXCSMADD f12, B6, A5, f12 FXCPMADD f1, B5, A6, f1 FXCSMADD f5, B5, A6, f5 FXCPMADD f9, B6, A6, f9 FXCSMADD f13, B6, A6, f13 
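/* Fourth and last k-step of this unrolled pass; its B-panel operands were
   preloaded into A9/A10. */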
FXCPMADD f0, A9, A7, f0 FXCSMADD f4, A9, A7, f4 FXCPMADD f8, A10, A7, f8 FXCSMADD f12, A10, A7, f12 FXCPMADD f1, A9, A8, f1 FXCSMADD f5, A9, A8, f5 FXCPMADD f9, A10, A8, f9 FXCSMADD f13, A10, A8, f13 .align 4 .L1024: li r0, ALPHA lfpdx AP, SP, r0 #ifdef TRMMKERNEL li r0, FZERO lfpsx f30, SP, r0 #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif andi. r0, TEMP, 3 mtspr CTR, r0 #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L1028 LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 bdz- .L1027 .align 4 .L1026: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 LFPDUX A1, AO, INC4 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 LFPDUX B1, BO, INC4 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 bdnz+ .L1026 .align 4 .L1027: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 .align 4 .L1028: #ifndef TRMMKERNEL LFDUX A1, CO1, INC LFDUX A2, CO1, INC2 LFDUX A3, CO2, INC LFDUX A4, CO2, INC2 LFSDUX A1, CO1, INCM1 LFSDUX A2, CO1, INC2 LFSDUX A3, CO2, INCM1 LFSDUX A4, CO2, INC2 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) fpadd f0, f0, f4 fpadd f8, f8, f12 fpadd f1, f1, f5 fpadd f9, f9, f13 #else fpsub f0, f0, f4 fpsub f8, f8, f12 fpsub f1, f1, f5 fpsub f9, f9, f13 #endif #ifndef TRMMKERNEL fxcpmadd A1, f0, AP, A1 fxcpmadd A2, f1, AP, A2 fxcpmadd A3, f8, AP, A3 fxcpmadd A4, f9, AP, A4 fxcxnpma f0, f0, AP, A1 fxcxnpma f1, f1, AP, A2 fxcxnpma f8, f8, AP, A3 fxcxnpma f9, f9, AP, A4 STFDUX f0, CO1, INCM3 STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f8, CO2, INCM3 STFSDUX f8, CO2, INC STFDUX f9, CO2, INC STFSDUX f9, CO2, INC #else fxcpmadd f12, f0, AP, f30 fxcpmadd f13, f1, AP, f30 fxcpmadd f14, f8, AP, f30 fxcpmadd f15, f9, AP, f30 fxcxnpma f0, f0, AP, f12 fxcxnpma f1, f1, AP, f13 fxcxnpma f8, f8, AP, f14 fxcxnpma f9, f9, AP, f15 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f8, CO2, INC STFSDUX f8, CO2, INC STFDUX f9, CO2, INC STFSDUX f9, CO2, INC #endif #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L1030: andi. I, M, 1 beq .L1049 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 #else slwi TEMP, KK, 0 + ZBASE_SHIFT slwi r0, KK, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, BO, - 4 * SIZE fpmr f2, f0 addi BO2, BO, 2 * SIZE fpmr f3, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif srawi. 
r0, TEMP, 2 mtspr CTR, r0 ble .L1034 #else addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 srawi. r0, K, 2 mtspr CTR, r0 ble .L1034 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 LFPDUX A2, AO2, INC4 LFPDUX B3, BO, INC4 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A5, BO, INC4 LFPDUX A6, BO2, INC4 LFPDUX A4, AO2, INC4 LFPDUX A7, BO, INC4 LFPDUX A8, BO2, INC4 bdz- .L1033 .align 4 .L1032: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX B1, BO, INC4 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 LFPDUX B2, BO2, INC4 LFPDUX A1, AO, INC4 FXCPMADD f0, B3, A2, f0 FXCSMADD f1, B3, A2, f1 LFPDUX B3, BO, INC4 FXCPMADD f2, B4, A2, f2 FXCSMADD f3, B4, A2, f3 LFPDUX B4, BO2, INC4 LFPDUX A2, AO2, INC4 FXCPMADD f0, A5, A3, f0 FXCSMADD f1, A5, A3, f1 LFPDUX A5, BO, INC4 FXCPMADD f2, A6, A3, f2 FXCSMADD f3, A6, A3, f3 LFPDUX A6, BO2, INC4 LFPDUX A3, AO, INC4 FXCPMADD f0, A7, A4, f0 FXCSMADD f1, A7, A4, f1 LFPDUX A7, BO, INC4 FXCPMADD f2, A8, A4, f2 FXCSMADD f3, A8, A4, f3 LFPDUX A8, BO2, INC4 LFPDUX A4, AO2, INC4 bdnz+ .L1032 .align 4 .L1033: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 FXCPMADD f0, B3, A2, f0 FXCSMADD f1, B3, A2, f1 FXCPMADD f2, B4, A2, f2 FXCSMADD f3, B4, A2, f3 FXCPMADD f0, A5, A3, f0 FXCSMADD f1, A5, A3, f1 FXCPMADD f2, A6, A3, f2 FXCSMADD f3, A6, A3, f3 FXCPMADD f0, A7, A4, f0 FXCSMADD f1, A7, A4, f1 FXCPMADD f2, A8, A4, f2 FXCSMADD f3, A8, A4, f3 .align 4 .L1034: li r0, ALPHA lfpdx AP, SP, r0 #ifdef TRMMKERNEL li r0, FZERO lfpsx f30, SP, r0 #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif andi. r0, TEMP, 3 mtspr CTR, r0 #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L1038 LFPDX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdz- .L1037 .align 4 .L1036: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX B1, BO, INC4 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 LFPDX A1, AO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdnz+ .L1036 .align 4 .L1037: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 .align 4 .L1038: #ifndef TRMMKERNEL LFDUX A1, CO1, INC LFDUX A2, CO2, INC LFSDUX A1, CO1, INC LFSDUX A2, CO2, INC #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) fpadd f0, f0, f1 fpadd f2, f2, f3 #else fpsub f0, f0, f1 fpsub f2, f2, f3 #endif #ifndef TRMMKERNEL fxcpmadd A1, f0, AP, A1 fxcpmadd A2, f2, AP, A2 fxcxnpma f0, f0, AP, A1 fxcxnpma f2, f2, AP, A2 STFDUX f0, CO1, INCM1 STFSDUX f0, CO1, INC STFDUX f2, CO2, INCM1 STFSDUX f2, CO2, INC #else fxcpmadd f12, f0, AP, f30 fxcpmadd f13, f2, AP, f30 fxcxnpma f0, f0, AP, f12 fxcxnpma f2, f2, AP, f13 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f2, CO2, INC STFSDUX f2, CO2, INC #endif #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 1 #endif #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L1049: #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 2 #endif addi B, BO, 4 * SIZE addic. J, J, -1 bgt+ .L1010 .align 4 .L1050: andi. 
J, N, 1 beq .L10999 mr CO1, C #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif addi AO, A, -2 * SIZE li r0, FZERO lfpsx f0, SP, r0 srawi. I, M, 2 ble .L1060 .align 4 .L1051: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) fpmr f4, f0 addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 #else slwi TEMP, KK, 2 + ZBASE_SHIFT slwi r0, KK, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, B, r0 fpmr f4, f0 addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 1 #endif srawi. r0, TEMP, 2 fpmr f3, f0 mtspr CTR, r0 fpmr f7, f0 ble .L1054 #else srawi. r0, K, 2 fpmr f4, f0 addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 fpmr f3, f0 mtspr CTR, r0 fpmr f7, f0 ble .L1054 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L1053 .align 4 .L1052: FXCPMADD f0, B1, A1, f0 LFPDUX B4, BO, INC2 FXCSMADD f4, B1, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A4, AO, INC2 FXCPMADD f0, B2, A5, f0 LFPDUX B1, BO, INC2 FXCSMADD f4, B2, A5, f4 LFPDUX A5, AO, INC2 FXCPMADD f1, B2, A6, f1 nop FXCSMADD f5, B2, A6, f5 LFPDUX A6, AO, INC2 FXCPMADD f2, B2, A7, f2 nop FXCSMADD f6, B2, A7, f6 LFPDUX A7, AO, INC2 FXCPMADD f3, B2, A8, f3 nop FXCSMADD f7, B2, A8, f7 LFPDUX A8, AO, INC2 FXCPMADD f0, B3, A1, f0 LFPDUX B2, BO, INC2 FXCSMADD f4, B3, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B3, A2, f1 nop FXCSMADD f5, B3, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B3, A3, f2 nop FXCSMADD f6, B3, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B3, A4, f3 nop FXCSMADD f7, B3, A4, f7 LFPDUX A4, AO, INC2 FXCPMADD f0, B4, A5, f0 LFPDUX B3, BO, INC2 FXCSMADD f4, B4, A5, f4 LFPDUX A5, AO, INC2 FXCPMADD f1, B4, A6, f1 nop FXCSMADD f5, B4, A6, f5 LFPDUX A6, AO, INC2 FXCPMADD f2, B4, A7, f2 nop FXCSMADD f6, B4, A7, f6 LFPDUX A7, AO, INC2 FXCPMADD f3, B4, A8, f3 nop FXCSMADD f7, B4, A8, f7 LFPDUX A8, AO, INC2 bdnz+ .L1052 .align 4 .L1053: FXCPMADD f0, B1, A1, f0 LFPDUX B4, BO, INC2 FXCSMADD f4, B1, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A4, AO, INC2 FXCPMADD f0, B2, A5, f0 nop FXCSMADD f4, B2, A5, f4 LFPDUX A5, AO, INC2 FXCPMADD f1, B2, A6, f1 nop FXCSMADD f5, B2, A6, f5 LFPDUX A6, AO, INC2 FXCPMADD f2, B2, A7, f2 nop FXCSMADD f6, B2, A7, f6 LFPDUX A7, AO, INC2 FXCPMADD f3, B2, A8, f3 nop FXCSMADD f7, B2, A8, f7 LFPDUX A8, AO, INC2 FXCPMADD f0, B3, A1, f0 FXCSMADD f4, B3, A1, f4 FXCPMADD f1, B3, A2, f1 FXCSMADD f5, B3, A2, f5 FXCPMADD f2, B3, A3, f2 FXCSMADD f6, B3, A3, f6 FXCPMADD f3, B3, A4, f3 FXCSMADD f7, B3, A4, f7 FXCPMADD f0, B4, A5, f0 FXCSMADD f4, B4, A5, f4 FXCPMADD f1, B4, A6, f1 FXCSMADD f5, B4, A6, f5 FXCPMADD f2, B4, A7, f2 FXCSMADD f6, B4, A7, f6 FXCPMADD f3, B4, A8, f3 FXCSMADD f7, B4, A8, f7 .align 4 .L1054: li r0, ALPHA lfpdx AP, SP, r0 #ifdef TRMMKERNEL li r0, FZERO lfpsx f30, SP, r0 #endif #if defined(TRMMKERNEL) #if 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 4 #else addi TEMP, KK, 1 #endif andi. r0, TEMP, 3 mtspr CTR, r0 #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L1058 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 bdz- .L1057 .align 4 .L1056: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B1, A3, f2 FXCSMADD f6, B1, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B1, A4, f3 FXCSMADD f7, B1, A4, f7 LFPDUX A4, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L1056 .align 4 .L1057: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f2, B1, A3, f2 FXCSMADD f6, B1, A3, f6 FXCPMADD f3, B1, A4, f3 FXCSMADD f7, B1, A4, f7 .align 4 .L1058: #ifndef TRMMKERNEL LFDUX A1, CO1, INC LFDUX A2, CO1, INC2 LFDUX A3, CO1, INC2 LFDUX A4, CO1, INC2 LFSDUX A1, CO1, INCM5 LFSDUX A2, CO1, INC2 LFSDUX A3, CO1, INC2 LFSDUX A4, CO1, INC2 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) fpadd f0, f0, f4 fpadd f1, f1, f5 fpadd f2, f2, f6 fpadd f3, f3, f7 #else fpsub f0, f0, f4 fpsub f1, f1, f5 fpsub f2, f2, f6 fpsub f3, f3, f7 #endif #ifndef TRMMKERNEL fxcpmadd A1, f0, AP, A1 fxcpmadd A2, f1, AP, A2 fxcpmadd A3, f2, AP, A3 fxcpmadd A4, f3, AP, A4 fxcxnpma f0, f0, AP, A1 fxcxnpma f1, f1, AP, A2 fxcxnpma f2, f2, AP, A3 fxcxnpma f3, f3, AP, A4 STFDUX f0, CO1, INCM7 STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC #else fxcpmadd f12, f0, AP, f30 fxcpmadd f13, f1, AP, f30 fxcpmadd f14, f2, AP, f30 fxcpmadd f15, f3, AP, f30 fxcxnpma f0, f0, AP, f12 fxcxnpma f1, f1, AP, f13 fxcxnpma f2, f2, AP, f14 fxcxnpma f3, f3, AP, f15 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC #endif #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -4 #else addi TEMP, TEMP, -1 #endif slwi r0, TEMP, 2 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 4 #endif #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L1051 .align 4 .L1060: andi. I, M, 2 beq .L1070 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0 #else slwi TEMP, KK, 1 + ZBASE_SHIFT slwi r0, KK, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE fpmr f1, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif srawi. r0, TEMP, 2 fpmr f2, f0 mtspr CTR, r0 fpmr f3, f0 ble .L1064 #else srawi. 
r0, K, 2 fpmr f1, f0 addi BO, B, - 2 * SIZE fpmr f2, f0 mtspr CTR, r0 fpmr f3, f0 ble .L1064 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L1063 .align 4 .L1062: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 FXCPMADD f0, B2, A3, f0 FXCSMADD f2, B2, A3, f2 LFPDUX A3, AO, INC2 FXCPMADD f1, B2, A4, f1 FXCSMADD f3, B2, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 FXCPMADD f0, B3, A5, f0 FXCSMADD f2, B3, A5, f2 LFPDUX A5, AO, INC2 FXCPMADD f1, B3, A6, f1 FXCSMADD f3, B3, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 FXCPMADD f0, B4, A7, f0 FXCSMADD f2, B4, A7, f2 LFPDUX A7, AO, INC2 FXCPMADD f1, B4, A8, f1 FXCSMADD f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L1062 .align 4 .L1063: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 FXCPMADD f0, B2, A3, f0 FXCSMADD f2, B2, A3, f2 FXCPMADD f1, B2, A4, f1 FXCSMADD f3, B2, A4, f3 FXCPMADD f0, B3, A5, f0 FXCSMADD f2, B3, A5, f2 FXCPMADD f1, B3, A6, f1 FXCSMADD f3, B3, A6, f3 FXCPMADD f0, B4, A7, f0 FXCSMADD f2, B4, A7, f2 FXCPMADD f1, B4, A8, f1 FXCSMADD f3, B4, A8, f3 .align 4 .L1064: li r0, ALPHA lfpdx AP, SP, r0 #ifdef TRMMKERNEL li r0, FZERO lfpsx f30, SP, r0 #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif andi. r0, TEMP, 3 mtspr CTR, r0 #else andi. r0, K, 3 mtspr CTR, r0 #endif ble+ .L1068 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdz- .L1067 .align 4 .L1066: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdnz+ .L1066 .align 4 .L1067: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 .align 4 .L1068: #ifndef TRMMKERNEL LFDUX A1, CO1, INC LFDUX A2, CO1, INC2 LFSDUX A1, CO1, INCM1 LFSDUX A2, CO1, INC2 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) fpadd f0, f0, f2 fpadd f1, f1, f3 #else fpsub f0, f0, f2 fpsub f1, f1, f3 #endif #ifndef TRMMKERNEL fxcpmadd A1, f0, AP, A1 fxcpmadd A2, f1, AP, A2 fxcxnpma f0, f0, AP, A1 fxcxnpma f1, f1, AP, A2 STFDUX f0, CO1, INCM3 STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC #else fxcpmadd f12, f0, AP, f30 fxcpmadd f13, f1, AP, f30 fxcxnpma f0, f0, AP, f12 fxcxnpma f1, f1, AP, f13 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC #endif #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -1 #endif slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L1070: andi. 
I, M, 1 beq .L1089 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) addi BO, B, - 2 * SIZE fpmr f1, f0 #else slwi TEMP, KK, 0 + ZBASE_SHIFT slwi r0, KK, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE fpmr f1, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif srawi. r0, TEMP, 3 fpmr f2, f0 mtspr CTR, r0 fpmr f3, f0 ble .L1074 #else addi BO, B, - 2 * SIZE fpmr f1, f0 srawi. r0, K, 3 fpmr f2, f0 mtspr CTR, r0 fpmr f3, f0 ble .L1074 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdz- .L1073 .align 4 .L1072: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 FXCPMADD f2, B2, A2, f2 FXCSMADD f3, B2, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 FXCPMADD f0, B3, A3, f0 FXCSMADD f1, B3, A3, f1 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 FXCPMADD f2, B4, A4, f2 FXCSMADD f3, B4, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 FXCPMADD f0, B5, A5, f0 FXCSMADD f1, B5, A5, f1 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 FXCPMADD f2, B6, A6, f2 FXCSMADD f3, B6, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 FXCPMADD f0, A9, A7, f0 FXCSMADD f1, A9, A7, f1 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 FXCPMADD f2, A10, A8, f2 FXCSMADD f3, A10, A8, f3 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdnz+ .L1072 .align 4 .L1073: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A2, f2 FXCSMADD f3, B2, A2, f3 FXCPMADD f0, B3, A3, f0 FXCSMADD f1, B3, A3, f1 FXCPMADD f2, B4, A4, f2 FXCSMADD f3, B4, A4, f3 FXCPMADD f0, B5, A5, f0 FXCSMADD f1, B5, A5, f1 FXCPMADD f2, B6, A6, f2 FXCSMADD f3, B6, A6, f3 FXCPMADD f0, A9, A7, f0 FXCSMADD f1, A9, A7, f1 FXCPMADD f2, A10, A8, f2 FXCSMADD f3, A10, A8, f3 .align 4 .L1074: li r0, ALPHA lfpdx AP, SP, r0 #ifdef TRMMKERNEL li r0, FZERO lfpsx f30, SP, r0 #endif #if defined(TRMMKERNEL) #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif andi. r0, TEMP, 7 mtspr CTR, r0 #else andi. 
r0, K, 7 mtspr CTR, r0 #endif ble+ .L1078 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdz- .L1077 .align 4 .L1076: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L1076 .align 4 .L1077: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 .align 4 .L1078: #ifndef TRMMKERNEL LFDUX A1, CO1, INC LFDUX A2, CO1, INC #endif fpadd f0, f0, f2 fpadd f1, f1, f3 fsmfp A1, A2 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) fpadd f0, f0, f1 #else fpsub f0, f0, f1 #endif #ifndef TRMMKERNEL fxcpmadd A1, f0, AP, A1 fxcxnpma f0, f0, AP, A1 STFDUX f0, CO1, INCM1 STFSDUX f0, CO1, INC #else fxcpmadd f12, f0, AP, f30 fxcxnpma f0, f0, AP, f12 STFDUX f0, CO1, INC STFSDUX f0, CO1, INC #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L1089: addi B, BO, 2 * SIZE .align 4 .L10999: addi SP, SP, 20 lwzu r14, 4(SP) lwzu r15, 4(SP) lwzu r16, 4(SP) lwzu r17, 4(SP) lwzu r18, 4(SP) lwzu r19, 4(SP) lwzu r20, 4(SP) lwzu r21, 4(SP) lwzu r22, 4(SP) lwzu r23, 4(SP) lwzu r24, 4(SP) lwzu r25, 4(SP) lwzu r26, 4(SP) lwzu r27, 4(SP) lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f31, SP, r0 lfpdux f30, SP, r0 lfpdux f29, SP, r0 lfpdux f28, SP, r0 lfpdux f27, SP, r0 lfpdux f26, SP, r0 lfpdux f25, SP, r0 lfpdux f24, SP, r0 lfpdux f23, SP, r0 lfpdux f22, SP, r0 lfpdux f21, SP, r0 lfpdux f20, SP, r0 lfpdux f19, SP, r0 lfpdux f18, SP, r0 lfpdux f17, SP, r0 lfpdux f16, SP, r0 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr .align 4 EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/zgemm_kernel_power3.S000066400000000000000000000616121313527062700216700ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA_R 296(SP) #define ALPHA_I 304(SP) #define FZERO 312(SP) #else #define STACKSIZE 256 #define ALPHA_R 224(SP) #define ALPHA_I 232(SP) #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #endif #endif #define I r24 #define J r25 #define AO r26 #define BO r27 #define CO1 r28 #define CO2 r29 #define PREA r30 #define PREC r31 #define PREB PREA #ifndef NEEDPARAM #ifndef DOUBLE #include "../cparam.h" #else #include "../zparam.h" #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) #endif stfd f1, ALPHA_R stfd f2, ALPHA_I stw r0, FZERO #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif slwi LDC, LDC, ZBASE_SHIFT cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) #ifndef PREFETCHTEST #ifdef PPC970 li PREC, 4 * SIZE #endif #ifdef POWER4 li PREC, 4 * SIZE /* is 12 best? */ #endif #ifdef POWER5 li PREC, 4 * SIZE /* is 12 best? */ #endif #else #ifdef linux #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else ld PREA, FRAMESLOT(3) + STACKSIZE(SP) ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld PREA, FRAMESLOT(3) + STACKSIZE(SP) ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #else #ifdef DOUBLE lwz PREA, FRAMESLOT(4) + STACKSIZE(SP) lwz PREC, FRAMESLOT(5) + STACKSIZE(SP) #else lwz PREA, FRAMESLOT(3) + STACKSIZE(SP) lwz PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #endif #endif #ifndef PREFETCHTEST #ifdef PPC970 #ifdef ALLOC_HUGETLB li PREA, (16 * 5 * SIZE + 16) #else li PREA, (16 * 9 * SIZE + 16) #endif #endif #ifdef POWER4 #ifdef ALLOC_HUGETLB li PREA, (16 * 1 * SIZE + 16) #else li PREA, (16 * 2 * SIZE + 16) #endif #endif #ifdef POWER5 li PREA, 16 * 9 * SIZE #endif #endif lfs f0, FZERO srawi. 
J, N, 1 ble LL(KERNEL_N_AND_3_HEAD) .align 4 LL(KERNEL_MainHead): fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 mr CO1, C add CO2, C, LDC add C, CO2, LDC srawi. I, M, 1 mr AO, A ble LL(KERNEL_M_AND_3) .align 4 LL(KERNEL_MainSubHead): LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) srawi. r0, K, 2 mr BO, B mtspr CTR, r0 ble LL(KERNEL_K_AND_7) .align 4 LL(KERNEL_MainLoop): fmadd f0, f16, f20, f0 fmadd f4, f16, f21, f4 LFD f28, 4 * SIZE(BO) fmadd f8, f16, f22, f8 fmadd f12, f16, f23, f12 LFD f16, 8 * SIZE(AO) fmadd f1, f17, f20, f1 fmadd f5, f17, f21, f5 LFD f29, 5 * SIZE(BO) fmadd f9, f17, f22, f9 fmadd f13, f17, f23, f13 LFD f17, 9 * SIZE(AO) fmadd f2, f18, f20, f2 fmadd f6, f18, f21, f6 LFD f30, 6 * SIZE(BO) fmadd f10, f18, f22, f10 fmadd f14, f18, f23, f14 LFD f18, 10 * SIZE(AO) fmadd f3, f19, f20, f3 fmadd f7, f19, f21, f7 LFD f31, 7 * SIZE(BO) fmadd f11, f19, f22, f11 fmadd f15, f19, f23, f15 LFD f19, 11 * SIZE(AO) fmadd f0, f24, f28, f0 fmadd f4, f24, f29, f4 LFD f20, 8 * SIZE(BO) fmadd f8, f24, f30, f8 fmadd f12, f24, f31, f12 LFD f24, 12 * SIZE(AO) fmadd f1, f25, f28, f1 fmadd f5, f25, f29, f5 LFD f21, 9 * SIZE(BO) fmadd f9, f25, f30, f9 fmadd f13, f25, f31, f13 LFD f25, 13 * SIZE(AO) fmadd f2, f26, f28, f2 fmadd f6, f26, f29, f6 LFD f22, 10 * SIZE(BO) fmadd f10, f26, f30, f10 fmadd f14, f26, f31, f14 LFD f26, 14 * SIZE(AO) fmadd f3, f27, f28, f3 fmadd f7, f27, f29, f7 LFD f23, 11 * SIZE(BO) fmadd f11, f27, f30, f11 fmadd f15, f27, f31, f15 LFD f27, 15 * SIZE(AO) fmadd f0, f16, f20, f0 fmadd f4, f16, f21, f4 LFD f28, 12 * SIZE(BO) fmadd f8, f16, f22, f8 fmadd f12, f16, f23, f12 LFDU f16, 16 * SIZE(AO) fmadd f1, f17, f20, f1 fmadd f5, f17, f21, f5 LFD f29, 13 * SIZE(BO) fmadd f9, f17, f22, f9 fmadd f13, f17, f23, f13 LFD f17, 1 * SIZE(AO) fmadd f2, f18, f20, f2 fmadd f6, f18, f21, f6 LFD f30, 14 * SIZE(BO) fmadd f10, f18, f22, f10 fmadd f14, f18, f23, f14 LFD f18, 2 * SIZE(AO) fmadd f3, f19, f20, f3 fmadd f7, f19, f21, f7 LFD f31, 15 * SIZE(BO) fmadd f11, f19, f22, f11 fmadd f15, f19, f23, f15 LFD f19, 3 * SIZE(AO) fmadd f0, f24, f28, f0 fmadd f4, f24, f29, f4 LFDU f20, 16 * SIZE(BO) fmadd f8, f24, f30, f8 fmadd f12, f24, f31, f12 LFD f24, 4 * SIZE(AO) fmadd f1, f25, f28, f1 fmadd f5, f25, f29, f5 LFD f21, 1 * SIZE(BO) fmadd f9, f25, f30, f9 fmadd f13, f25, f31, f13 LFD f25, 5 * SIZE(AO) fmadd f2, f26, f28, f2 fmadd f6, f26, f29, f6 LFD f22, 2 * SIZE(BO) fmadd f10, f26, f30, f10 fmadd f14, f26, f31, f14 LFD f26, 6 * SIZE(AO) fmadd f3, f27, f28, f3 fmadd f7, f27, f29, f7 LFD f23, 3 * SIZE(BO) fmadd f11, f27, f30, f11 fmadd f15, f27, f31, f15 LFD f27, 7 * SIZE(AO) bdnz LL(KERNEL_MainLoop) .align 4 LL(KERNEL_K_AND_7): andi. 
r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, r0 ble LL(KERNEL_MainFinish) .align 4 LL(KERNEL_SubLoop): fmadd f0, f16, f20, f0 fmadd f4, f16, f21, f4 fmadd f8, f16, f22, f8 fmadd f12, f16, f23, f12 LFD f16, 4 * SIZE(AO) fmadd f1, f17, f20, f1 fmadd f5, f17, f21, f5 fmadd f9, f17, f22, f9 fmadd f13, f17, f23, f13 LFD f17, 5 * SIZE(AO) fmadd f2, f18, f20, f2 fmadd f6, f18, f21, f6 fmadd f10, f18, f22, f10 fmadd f14, f18, f23, f14 LFD f18, 6 * SIZE(AO) fmadd f3, f19, f20, f3 LFD f20, 4 * SIZE(BO) fmadd f7, f19, f21, f7 LFD f21, 5 * SIZE(BO) fmadd f11, f19, f22, f11 LFD f22, 6 * SIZE(BO) fmadd f15, f19, f23, f15 LFD f19, 7 * SIZE(AO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(KERNEL_SubLoop) .align 4 LL(KERNEL_MainFinish): LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) FSUB f8, f8, f13 FADD f9, f9, f12 FSUB f10, f10, f15 FADD f11, f11, f14 #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) FADD f8, f8, f13 FSUB f9, f9, f12 FADD f10, f10, f15 FSUB f11, f11, f14 #else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) FADD f8, f8, f13 FSUB f9, f12, f9 FADD f10, f10, f15 FSUB f11, f14, f11 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) FMADD f16, f30, f0, f16 FMADD f17, f30, f1, f17 FMADD f18, f30, f2, f18 FMADD f19, f30, f3, f19 FMADD f20, f30, f8, f20 FMADD f21, f30, f9, f21 FMADD f22, f30, f10, f22 FMADD f23, f30, f11, f23 FNMSUB f16, f31, f1, f16 FMADD f17, f31, f0, f17 FNMSUB f18, f31, f3, f18 FMADD f19, f31, f2, f19 FNMSUB f20, f31, f9, f20 FMADD f21, f31, f8, f21 FNMSUB f22, f31, f11, f22 FMADD f23, f31, f10, f23 #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC)|| defined(RR) */ FMADD f16, f30, f0, f16 FNMSUB f17, f30, f1, f17 FMADD f18, f30, f2, f18 FNMSUB f19, f30, f3, f19 FMADD f20, f30, f8, f20 FNMSUB f21, f30, f9, f21 FMADD f22, f30, f10, f22 FNMSUB f23, f30, f11, f23 FMADD f16, f31, f1, f16 FMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FMADD f19, f31, f2, f19 FMADD f20, f31, f9, f20 FMADD f21, f31, f8, f21 FMADD f22, f31, f11, f22 FMADD f23, f31, f10, f23 #endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) STFD f18, 2 * SIZE(CO1) STFD f19, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f20, 0 * SIZE(CO2) STFD f21, 1 * SIZE(CO2) STFD f22, 2 * SIZE(CO2) STFD f23, 3 * SIZE(CO2) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addic. I, I, -1 bgt LL(KERNEL_MainSubHead) .align 4 LL(KERNEL_M_AND_3): andi. 
I, M, 1 ble LL(KERNEL_MainTail) .align 4 LL(KERNEL_M_AND_3_SubHead): LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, K, 2 mr BO, B mtspr CTR, r0 ble LL(KERNEL_M_AND_3_K_AND_3) .align 4 LL(KERNEL_M_AND_3_MainLoop): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi BO, BO, 16 * SIZE addi AO, AO, 8 * SIZE bdnz LL(KERNEL_M_AND_3_MainLoop) .align 4 LL(KERNEL_M_AND_3_K_AND_3): andi. 
r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, r0 ble LL(KERNEL_M_AND3_Finish) .align 4 LL(KERNEL_M_AND_3_SubLoop): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE bdnz LL(KERNEL_M_AND_3_SubLoop) .align 4 LL(KERNEL_M_AND3_Finish): #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #endif LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) #if defined(NN) || defined(NT) || defined(TN) || defined(TT) FMADD f16, f30, f0, f16 FMADD f17, f30, f1, f17 FMADD f18, f30, f2, f18 FMADD f19, f30, f3, f19 FNMSUB f16, f31, f1, f16 FMADD f17, f31, f0, f17 FNMSUB f18, f31, f3, f18 FMADD f19, f31, f2, f19 #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC)|| defined(RR) */ FMADD f16, f30, f0, f16 FNMSUB f17, f30, f1, f17 FMADD f18, f30, f2, f18 FNMSUB f19, f30, f3, f19 FMADD f16, f31, f1, f16 FMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FMADD f19, f31, f2, f19 #endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) STFD f18, 0 * SIZE(CO2) STFD f19, 1 * SIZE(CO2) addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addic. I, I, -1 bgt LL(KERNEL_M_AND_3_SubHead) .align 4 LL(KERNEL_MainTail): mr B, BO addic. J, J, -1 lfs f0, FZERO bgt LL(KERNEL_MainHead) .align 4 LL(KERNEL_N_AND_3_HEAD): andi. J, N, 1 ble LL(999) .align 4 LL(KERNEL_N_AND_3_MainHead): srawi. I, M, 1 mr CO1, C add C, C, LDC mr AO, A ble LL(KERNEL_MN_AND_3_Head) .align 4 LL(KERNEL_N_AND_3_SubHead): LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
r0, K, 2 mr BO, B mtspr CTR, r0 ble LL(KERNEL_N_AND_3_K_AND_3) .align 4 LL(KERNEL_N_AND_3_MainLoop): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 8 * SIZE(AO) LFD f21, 9 * SIZE(AO) LFD f22, 10 * SIZE(AO) LFD f23, 11 * SIZE(AO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) LFD f16, 4 * SIZE(BO) LFD f17, 5 * SIZE(BO) LFD f18, 6 * SIZE(BO) LFD f19, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 16 * SIZE(AO) LFD f21, 17 * SIZE(AO) LFD f22, 18 * SIZE(AO) LFD f23, 19 * SIZE(AO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 20 * SIZE(AO) LFD f25, 21 * SIZE(AO) LFD f26, 22 * SIZE(AO) LFD f27, 23 * SIZE(AO) LFD f16, 8 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 10 * SIZE(BO) LFD f19, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE dcbt PREA, AO dcbt PREA, BO bdnz LL(KERNEL_N_AND_3_MainLoop) .align 4 LL(KERNEL_N_AND_3_K_AND_3): andi. r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, r0 ble LL(KERNEL_N_AND_3_Finish) .align 4 LL(KERNEL_N_AND_3_SubLoop): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f16, 2 * SIZE(BO) LFD f17, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(KERNEL_N_AND_3_SubLoop) .align 4 LL(KERNEL_N_AND_3_Finish): #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) #if defined(NN) || defined(NT) || defined(TN) || defined(TT) FMADD f16, f30, f0, f16 FMADD f17, f30, f1, f17 FMADD f18, f30, f2, f18 FMADD f19, f30, f3, f19 FNMSUB f16, f31, f1, f16 FMADD f17, f31, f0, f17 FNMSUB f18, f31, f3, f18 FMADD f19, f31, f2, f19 #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC)|| defined(RR) */ FMADD f16, f30, f0, f16 FNMSUB f17, f30, f1, f17 FMADD f18, f30, f2, f18 FNMSUB f19, f30, f3, f19 FMADD f16, f31, f1, f16 FMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FMADD f19, f31, f2, f19 #endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) STFD f18, 2 * SIZE(CO1) STFD f19, 3 * SIZE(CO1) addi CO1, CO1, 4 * SIZE addic. I, I, -1 bgt LL(KERNEL_N_AND_3_SubHead) .align 4 LL(KERNEL_MN_AND_3_Head): andi. 
I, M, 1 ble LL(KERNEL_SubEnd) .align 4 LL(KERNEL_MN_AND_3_SubHead): LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, K, 2 mr BO, B mtspr CTR, r0 ble LL(KERNEL_MN_AND_3_K_AND_3) .align 4 LL(KERNEL_MN_AND_3_MainLoop): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) fmadd f4, f18, f22, f4 fmadd f5, f19, f23, f5 fmadd f6, f19, f22, f6 fmadd f7, f18, f23, f7 LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) fmadd f4, f18, f22, f4 fmadd f5, f19, f23, f5 fmadd f6, f19, f22, f6 fmadd f7, f18, f23, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(KERNEL_MN_AND_3_MainLoop) .align 4 LL(KERNEL_MN_AND_3_K_AND_3): fadd f0, f0, f4 fadd f1, f1, f5 fadd f2, f2, f6 fadd f3, f3, f7 andi. r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR,r0 ble LL(KERNEL_MN_AND_3_Finish) .align 4 LL(KERNEL_MN_AND_3_SubLoop): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 2 * SIZE bdnz LL(KERNEL_MN_AND_3_SubLoop) .align 4 LL(KERNEL_MN_AND_3_Finish): #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) fsub f0, f0, f1 fadd f2, f2, f3 #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) fadd f0, f0, f1 fsub f2, f2, f3 #else fadd f0, f0, f1 fsub f2, f3, f2 #endif LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) #if defined(NN) || defined(NT) || defined(TN) || defined(TT) FMADD f16, f30, f0, f16 FMADD f17, f30, f2, f17 FNMSUB f16, f31, f2, f16 FMADD f17, f31, f0, f17 #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC) || defined(RR) */ FMADD f16, f30, f0, f16 FNMSUB f17, f30, f2, f17 FMADD f16, f31, f2, f16 FMADD f17, f31, f0, f17 #endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) addi CO1, CO1, 2 * SIZE addic. I, I, -1 bgt LL(KERNEL_MN_AND_3_SubHead) .align 4 LL(KERNEL_SubEnd): mr B, BO addic. 
J, J, -1 bgt LL(KERNEL_N_AND_3_MainHead) .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/zgemm_kernel_power6.S000066400000000000000000001520541313527062700216740ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA_R 296(SP) #define ALPHA_I 304(SP) #define FZERO 312(SP) #else #define STACKSIZE 256 #define ALPHA_R 224(SP) #define ALPHA_I 232(SP) #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #define OFFSET r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #define TEMP r19 #define KK r20 #define BB r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define CO1 r26 #define CO2 r27 #define CO3 r28 #define CO4 r29 #define PREA r30 #define PREC r31 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define FMA1 FMADD #define FMA2 FMADD #define FMA3 FNMSUB #define FMA4 FMADD #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define FMA1 FMADD #define FMA2 FNMSUB #define FMA3 FMADD #define FMA4 FMADD #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define FMA1 FMADD #define FMA2 FMADD #define FMA3 FMADD #define FMA4 FNMSUB #else #define FMA1 FMADD #define FMA2 FNMSUB #define FMA3 FNMSUB #define FMA4 FNMSUB #endif #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) #ifdef TRMMKERNEL std r20, 232(SP) std r19, 240(SP) #endif #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) #ifdef TRMMKERNEL stw r20, 188(SP) stw r19, 192(SP) #endif #endif stfd f1, ALPHA_R stfd f2, ALPHA_I stw r0, FZERO #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef TRMMKERNEL #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif #endif slwi LDC, LDC, ZBASE_SHIFT li PREA, (16 * 3) * SIZE li PREC, 3 * SIZE cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) srawi. 
J, N, 2 ble LL(30) .align 4 LL(10): mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC add C, CO4, LDC #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif slwi BB, K, ZBASE_SHIFT + 2 mr AO, A lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. I, M, 1 ble LL(20) .align 4 LL(11): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(B) LFD f18, 2 * SIZE(AO) LFD f22, 2 * SIZE(B) LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC srawi. r0, K, 3 mr BO, B mtspr CTR, r0 ble LL(15) #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(B) LFD f18, 2 * SIZE(AO) LFD f22, 2 * SIZE(B) LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 2 + ZBASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) LFD f18, 2 * SIZE(AO) LFD f22, 2 * SIZE(BO) LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 4 #endif srawi. 
TEMP, TEMP, 3 mtspr CTR, TEMP ble LL(15) #endif .align 4 LL(12): dcbt AO, PREA FMA1 f0, f16, f20, f0 nop FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 LFD f28, 4 * SIZE(AO) LFD f29, 5 * SIZE(AO) LFD f30, 6 * SIZE(AO) LFD f31, 7 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA1 f8, f16, f24, f8 FMA1 f10, f18, f24, f10 FMA2 f9, f16, f25, f9 FMA2 f11, f18, f25, f11 FMA1 f12, f16, f26, f12 FMA1 f14, f18, f26, f14 FMA2 f13, f16, f27, f13 FMA2 f15, f18, f27, f15 FMA4 f1, f17, f20, f1 FMA4 f3, f19, f20, f3 FMA3 f0, f17, f21, f0 FMA3 f2, f19, f21, f2 FMA4 f5, f17, f22, f5 FMA4 f7, f19, f22, f7 FMA3 f4, f17, f23, f4 FMA3 f6, f19, f23, f6 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMA4 f9, f17, f24, f9 FMA4 f11, f19, f24, f11 FMA3 f8, f17, f25, f8 FMA3 f10, f19, f25, f10 FMA4 f13, f17, f26, f13 FMA4 f15, f19, f26, f15 FMA3 f12, f17, f27, f12 FMA3 f14, f19, f27, f14 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMA1 f0, f28, f20, f0 FMA1 f2, f30, f20, f2 FMA2 f1, f28, f21, f1 FMA2 f3, f30, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMA1 f4, f28, f22, f4 FMA1 f6, f30, f22, f6 FMA2 f5, f28, f23, f5 FMA2 f7, f30, f23, f7 FMA1 f8, f28, f24, f8 FMA1 f10, f30, f24, f10 FMA2 f9, f28, f25, f9 FMA2 f11, f30, f25, f11 FMA1 f12, f28, f26, f12 FMA1 f14, f30, f26, f14 FMA2 f13, f28, f27, f13 FMA2 f15, f30, f27, f15 FMA4 f1, f29, f20, f1 FMA4 f3, f31, f20, f3 FMA3 f0, f29, f21, f0 FMA3 f2, f31, f21, f2 FMA4 f5, f29, f22, f5 FMA4 f7, f31, f22, f7 FMA3 f4, f29, f23, f4 FMA3 f6, f31, f23, f6 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMA4 f9, f29, f24, f9 FMA4 f11, f31, f24, f11 FMA3 f8, f29, f25, f8 FMA3 f10, f31, f25, f10 FMA4 f13, f29, f26, f13 FMA4 f15, f31, f26, f15 FMA3 f12, f29, f27, f12 FMA3 f14, f31, f27, f14 LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 LFD f28, 12 * SIZE(AO) LFD f29, 13 * SIZE(AO) LFD f30, 14 * SIZE(AO) LFD f31, 15 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA1 f8, f16, f24, f8 FMA1 f10, f18, f24, f10 FMA2 f9, f16, f25, f9 FMA2 f11, f18, f25, f11 FMA1 f12, f16, f26, f12 FMA1 f14, f18, f26, f14 FMA2 f13, f16, f27, f13 FMA2 f15, f18, f27, f15 FMA4 f1, f17, f20, f1 FMA4 f3, f19, f20, f3 FMA3 f0, f17, f21, f0 FMA3 f2, f19, f21, f2 FMA4 f5, f17, f22, f5 FMA4 f7, f19, f22, f7 FMA3 f4, f17, f23, f4 FMA3 f6, f19, f23, f6 LFD f20, 24 * SIZE(BO) LFD f21, 25 * SIZE(BO) LFD f22, 26 * SIZE(BO) LFD f23, 27 * SIZE(BO) FMA4 f9, f17, f24, f9 FMA4 f11, f19, f24, f11 FMA3 f8, f17, f25, f8 FMA3 f10, f19, f25, f10 FMA4 f13, f17, f26, f13 FMA4 f15, f19, f26, f15 FMA3 f12, f17, f27, f12 FMA3 f14, f19, f27, f14 LFD f24, 28 * SIZE(BO) LFD f25, 29 * SIZE(BO) LFD f26, 30 * SIZE(BO) LFD f27, 31 * SIZE(BO) FMA1 f0, f28, f20, f0 FMA1 f2, f30, f20, f2 FMA2 f1, f28, f21, f1 FMA2 f3, f30, f21, f3 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) FMA1 f4, f28, f22, f4 FMA1 f6, f30, f22, f6 FMA2 f5, f28, f23, f5 FMA2 f7, f30, f23, f7 FMA1 f8, f28, f24, f8 FMA1 f10, f30, f24, f10 FMA2 f9, f28, f25, f9 FMA2 f11, f30, f25, f11 FMA1 f12, f28, f26, f12 FMA1 f14, f30, f26, f14 FMA2 f13, f28, f27, f13 FMA2 f15, f30, f27, f15 FMA4 f1, 
f29, f20, f1 FMA4 f3, f31, f20, f3 FMA3 f0, f29, f21, f0 FMA3 f2, f31, f21, f2 FMA4 f5, f29, f22, f5 FMA4 f7, f31, f22, f7 FMA3 f4, f29, f23, f4 FMA3 f6, f31, f23, f6 LFD f20, 32 * SIZE(BO) LFD f21, 33 * SIZE(BO) LFD f22, 34 * SIZE(BO) LFD f23, 35 * SIZE(BO) FMA4 f9, f29, f24, f9 FMA4 f11, f31, f24, f11 FMA3 f8, f29, f25, f8 FMA3 f10, f31, f25, f10 FMA4 f13, f29, f26, f13 FMA4 f15, f31, f26, f15 FMA3 f12, f29, f27, f12 FMA3 f14, f31, f27, f14 LFD f24, 36 * SIZE(BO) LFD f25, 37 * SIZE(BO) LFD f26, 38 * SIZE(BO) LFD f27, 39 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 LFD f28, 20 * SIZE(AO) LFD f29, 21 * SIZE(AO) LFD f30, 22 * SIZE(AO) LFD f31, 23 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA1 f8, f16, f24, f8 FMA1 f10, f18, f24, f10 FMA2 f9, f16, f25, f9 FMA2 f11, f18, f25, f11 FMA1 f12, f16, f26, f12 FMA1 f14, f18, f26, f14 FMA2 f13, f16, f27, f13 FMA2 f15, f18, f27, f15 FMA4 f1, f17, f20, f1 FMA4 f3, f19, f20, f3 FMA3 f0, f17, f21, f0 FMA3 f2, f19, f21, f2 FMA4 f5, f17, f22, f5 FMA4 f7, f19, f22, f7 FMA3 f4, f17, f23, f4 FMA3 f6, f19, f23, f6 LFD f20, 40 * SIZE(BO) LFD f21, 41 * SIZE(BO) LFD f22, 42 * SIZE(BO) LFD f23, 43 * SIZE(BO) FMA4 f9, f17, f24, f9 FMA4 f11, f19, f24, f11 FMA3 f8, f17, f25, f8 FMA3 f10, f19, f25, f10 FMA4 f13, f17, f26, f13 FMA4 f15, f19, f26, f15 FMA3 f12, f17, f27, f12 FMA3 f14, f19, f27, f14 LFD f24, 44 * SIZE(BO) LFD f25, 45 * SIZE(BO) LFD f26, 46 * SIZE(BO) LFD f27, 47 * SIZE(BO) FMA1 f0, f28, f20, f0 FMA1 f2, f30, f20, f2 FMA2 f1, f28, f21, f1 FMA2 f3, f30, f21, f3 LFD f16, 24 * SIZE(AO) LFD f17, 25 * SIZE(AO) LFD f18, 26 * SIZE(AO) LFD f19, 27 * SIZE(AO) FMA1 f4, f28, f22, f4 FMA1 f6, f30, f22, f6 FMA2 f5, f28, f23, f5 FMA2 f7, f30, f23, f7 FMA1 f8, f28, f24, f8 FMA1 f10, f30, f24, f10 FMA2 f9, f28, f25, f9 FMA2 f11, f30, f25, f11 FMA1 f12, f28, f26, f12 FMA1 f14, f30, f26, f14 FMA2 f13, f28, f27, f13 FMA2 f15, f30, f27, f15 FMA4 f1, f29, f20, f1 FMA4 f3, f31, f20, f3 FMA3 f0, f29, f21, f0 FMA3 f2, f31, f21, f2 FMA4 f5, f29, f22, f5 FMA4 f7, f31, f22, f7 FMA3 f4, f29, f23, f4 FMA3 f6, f31, f23, f6 LFD f20, 48 * SIZE(BO) LFD f21, 49 * SIZE(BO) LFD f22, 50 * SIZE(BO) LFD f23, 51 * SIZE(BO) FMA4 f9, f29, f24, f9 FMA4 f11, f31, f24, f11 FMA3 f8, f29, f25, f8 FMA3 f10, f31, f25, f10 FMA4 f13, f29, f26, f13 FMA4 f15, f31, f26, f15 FMA3 f12, f29, f27, f12 FMA3 f14, f31, f27, f14 LFD f24, 52 * SIZE(BO) LFD f25, 53 * SIZE(BO) LFD f26, 54 * SIZE(BO) LFD f27, 55 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 LFD f28, 28 * SIZE(AO) LFD f29, 29 * SIZE(AO) LFD f30, 30 * SIZE(AO) LFD f31, 31 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA1 f8, f16, f24, f8 FMA1 f10, f18, f24, f10 FMA2 f9, f16, f25, f9 FMA2 f11, f18, f25, f11 FMA1 f12, f16, f26, f12 FMA1 f14, f18, f26, f14 FMA2 f13, f16, f27, f13 FMA2 f15, f18, f27, f15 FMA4 f1, f17, f20, f1 FMA4 f3, f19, f20, f3 FMA3 f0, f17, f21, f0 FMA3 f2, f19, f21, f2 FMA4 f5, f17, f22, f5 FMA4 f7, f19, f22, f7 FMA3 f4, f17, f23, f4 FMA3 f6, f19, f23, f6 LFD f20, 56 * SIZE(BO) LFD f21, 57 * SIZE(BO) LFD f22, 58 * SIZE(BO) LFD f23, 59 * SIZE(BO) FMA4 f9, f17, f24, f9 FMA4 f11, f19, f24, f11 FMA3 f8, f17, f25, f8 FMA3 f10, f19, f25, f10 FMA4 f13, f17, f26, f13 FMA4 f15, f19, f26, f15 FMA3 f12, f17, f27, f12 FMA3 f14, f19, f27, f14 LFD f24, 60 * SIZE(BO) LFD f25, 61 * SIZE(BO) LFD f26, 62 * SIZE(BO) LFD f27, 63 * SIZE(BO) 
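/* Descriptive note (still inside the 8-way unrolled LL(12) loop, K is        */
/* consumed 8 iterations per pass as set up by the srawi-by-3 above): each    */
/* K step adds a rank-one update into the 2x4 complex block of C held in      */
/* f0-f15.  FMA1/FMA2 multiply the real parts of A (f16/f18 and f28/f30)      */
/* into the accumulators, FMA3/FMA4 the imaginary parts (f17/f19 and          */
/* f29/f31), while B values cycle through f20-f27.  The FMA1..FMA4 macros     */
/* defined near the top of this file expand to FMADD or FNMSUB according      */
/* to the conjugation case (NN/NT/.../CC), so this single loop body serves    */
/* every transpose/conjugate variant of ZGEMM.                                */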
FMA1 f0, f28, f20, f0 FMA1 f2, f30, f20, f2 FMA2 f1, f28, f21, f1 FMA2 f3, f30, f21, f3 LFD f16, 32 * SIZE(AO) LFD f17, 33 * SIZE(AO) LFD f18, 34 * SIZE(AO) LFD f19, 35 * SIZE(AO) FMA1 f4, f28, f22, f4 FMA1 f6, f30, f22, f6 FMA2 f5, f28, f23, f5 FMA2 f7, f30, f23, f7 FMA1 f8, f28, f24, f8 FMA1 f10, f30, f24, f10 FMA2 f9, f28, f25, f9 FMA2 f11, f30, f25, f11 FMA1 f12, f28, f26, f12 FMA1 f14, f30, f26, f14 FMA2 f13, f28, f27, f13 FMA2 f15, f30, f27, f15 FMA4 f1, f29, f20, f1 FMA4 f3, f31, f20, f3 FMA3 f0, f29, f21, f0 FMA3 f2, f31, f21, f2 FMA4 f5, f29, f22, f5 FMA4 f7, f31, f22, f7 FMA3 f4, f29, f23, f4 FMA3 f6, f31, f23, f6 LFD f20, 64 * SIZE(BO) LFD f21, 65 * SIZE(BO) LFD f22, 66 * SIZE(BO) LFD f23, 67 * SIZE(BO) FMA4 f9, f29, f24, f9 FMA4 f11, f31, f24, f11 FMA3 f8, f29, f25, f8 FMA3 f10, f31, f25, f10 FMA4 f13, f29, f26, f13 FMA4 f15, f31, f26, f15 FMA3 f12, f29, f27, f12 FMA3 f14, f31, f27, f14 LFD f24, 68 * SIZE(BO) LFD f25, 69 * SIZE(BO) LFD f26, 70 * SIZE(BO) LFD f27, 71 * SIZE(BO) addi AO, AO, 32 * SIZE addi BO, BO, 64 * SIZE bdnz LL(12) .align 4 LL(15): lfd f30, ALPHA_R lfd f31, ALPHA_I dcbtst B, BB addi BB, BB, 16 * SIZE dcbtst B, BB addi BB, BB, 16 * SIZE #ifndef TRMMKERNEL andi. r0, K, 7 mtspr CTR, r0 ble LL(18) #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 4 #endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP ble LL(18) #endif .align 4 LL(16): FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA1 f8, f16, f24, f8 FMA1 f10, f18, f24, f10 FMA2 f9, f16, f25, f9 FMA2 f11, f18, f25, f11 FMA1 f12, f16, f26, f12 FMA1 f14, f18, f26, f14 FMA2 f13, f16, f27, f13 FMA2 f15, f18, f27, f15 FMA4 f1, f17, f20, f1 FMA4 f3, f19, f20, f3 FMA3 f0, f17, f21, f0 FMA3 f2, f19, f21, f2 FMA4 f5, f17, f22, f5 FMA4 f7, f19, f22, f7 FMA3 f4, f17, f23, f4 FMA3 f6, f19, f23, f6 FMA4 f9, f17, f24, f9 FMA4 f11, f19, f24, f11 FMA3 f8, f17, f25, f8 FMA3 f10, f19, f25, f10 FMA4 f13, f17, f26, f13 FMA4 f15, f19, f26, f15 FMA3 f12, f17, f27, f12 FMA3 f14, f19, f27, f14 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(16) .align 4 LL(18): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) FNMSUB f24, f31, f1, f16 FMADD f25, f31, f0, f17 FNMSUB f26, f31, f3, f18 FMADD f27, f31, f2, f19 LFD f16, 0 * SIZE(CO3) LFD f17, 1 * SIZE(CO3) LFD f18, 2 * SIZE(CO3) LFD f19, 3 * SIZE(CO3) FMADD f0, f30, f0, f24 FMADD f1, f30, f1, f25 FMADD f2, f30, f2, f26 FMADD f3, f30, f3, f27 FNMSUB f24, f31, f5, f20 FMADD f25, f31, f4, f21 FNMSUB f26, f31, f7, f22 FMADD f27, f31, f6, f23 LFD f20, 0 * SIZE(CO4) LFD f21, 1 * SIZE(CO4) LFD f22, 2 * SIZE(CO4) LFD f23, 3 * SIZE(CO4) FMADD f4, f30, f4, f24 FMADD f5, f30, f5, f25 FMADD f6, f30, f6, f26 FMADD f7, f30, f7, f27 FNMSUB f24, f31, f9, f16 FMADD f25, f31, f8, f17 FNMSUB f26, f31, f11, f18 FMADD f27, f31, f10, f19 FMADD f8, f30, f8, f24 FMADD f9, f30, f9, f25 FMADD f10, f30, f10, f26 FMADD f11, f30, f11, f27 FNMSUB f24, f31, f13, f20 FMADD f25, 
f31, f12, f21 FNMSUB f26, f31, f15, f22 FMADD f27, f31, f14, f23 FMADD f12, f30, f12, f24 FMADD f13, f30, f13, f25 FMADD f14, f30, f14, f26 FMADD f15, f30, f15, f27 #else FMUL f16, f31, f1 FMUL f17, f31, f0 FMUL f18, f31, f3 FMUL f19, f31, f2 FMUL f20, f31, f5 FMUL f21, f31, f4 FMUL f22, f31, f7 FMUL f23, f31, f6 FMSUB f0, f30, f0, f16 FMADD f1, f30, f1, f17 FMADD f2, f30, f2, f18 FMADD f3, f30, f3, f19 FMSUB f4, f30, f4, f20 FMADD f5, f30, f5, f21 FMADD f6, f30, f6, f22 FMADD f7, f30, f7, f23 FMUL f16, f31, f9 FMUL f17, f31, f8 FMUL f18, f31, f11 FMUL f19, f31, f10 FMUL f20, f31, f13 FMUL f21, f31, f12 FMUL f22, f31, f15 FMUL f23, f31, f14 FMSUB f8, f30, f8, f16 FMADD f9, f30, f9, f17 FMADD f10, f30, f10, f18 FMADD f11, f30, f11, f19 FMSUB f12, f30, f12, f20 FMADD f13, f30, f13, f21 FMADD f14, f30, f14, f22 FMADD f15, f30, f15, f23 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f10, 2 * SIZE(CO3) STFD f11, 3 * SIZE(CO3) fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) STFD f14, 2 * SIZE(CO4) STFD f15, 3 * SIZE(CO4) fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -4 #endif slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 2 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif addic. I, I, -1 bgt LL(11) .align 4 LL(20): andi. I, M, 1 ble LL(29) #ifndef TRMMKERNEL LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, K, 2 mr BO, B mtspr CTR, r0 ble LL(25) #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 2 + ZBASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 4 #endif srawi. 
TEMP, TEMP, 2 mtspr CTR, TEMP ble LL(25) #endif .align 4 LL(22): FMA1 f0, f16, f20, f0 FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 FMA3 f2, f17, f21, f2 LFD f28, 4 * SIZE(AO) LFD f29, 5 * SIZE(AO) LFD f30, 6 * SIZE(AO) LFD f31, 7 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 FMA3 f6, f17, f23, f6 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMA1 f8, f16, f24, f8 FMA4 f11, f17, f24, f11 FMA2 f9, f16, f25, f9 FMA3 f10, f17, f25, f10 FMA1 f12, f16, f26, f12 FMA4 f15, f17, f26, f15 FMA2 f13, f16, f27, f13 FMA3 f14, f17, f27, f14 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMA1 f0, f18, f20, f0 FMA4 f3, f19, f20, f3 FMA2 f1, f18, f21, f1 FMA3 f2, f19, f21, f2 FMA1 f4, f18, f22, f4 FMA4 f7, f19, f22, f7 FMA2 f5, f18, f23, f5 FMA3 f6, f19, f23, f6 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMA1 f8, f18, f24, f8 FMA4 f11, f19, f24, f11 FMA2 f9, f18, f25, f9 FMA3 f10, f19, f25, f10 FMA1 f12, f18, f26, f12 FMA4 f15, f19, f26, f15 FMA2 f13, f18, f27, f13 FMA3 f14, f19, f27, f14 LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) FMA1 f0, f28, f20, f0 FMA4 f3, f29, f20, f3 FMA2 f1, f28, f21, f1 FMA3 f2, f29, f21, f2 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMA1 f4, f28, f22, f4 FMA4 f7, f29, f22, f7 FMA2 f5, f28, f23, f5 FMA3 f6, f29, f23, f6 LFD f20, 24 * SIZE(BO) LFD f21, 25 * SIZE(BO) LFD f22, 26 * SIZE(BO) LFD f23, 27 * SIZE(BO) FMA1 f8, f28, f24, f8 FMA4 f11, f29, f24, f11 FMA2 f9, f28, f25, f9 FMA3 f10, f29, f25, f10 FMA1 f12, f28, f26, f12 FMA4 f15, f29, f26, f15 FMA2 f13, f28, f27, f13 FMA3 f14, f29, f27, f14 LFD f24, 28 * SIZE(BO) LFD f25, 29 * SIZE(BO) LFD f26, 30 * SIZE(BO) LFD f27, 31 * SIZE(BO) FMA1 f0, f30, f20, f0 FMA4 f3, f31, f20, f3 FMA2 f1, f30, f21, f1 FMA3 f2, f31, f21, f2 FMA1 f4, f30, f22, f4 FMA4 f7, f31, f22, f7 FMA2 f5, f30, f23, f5 FMA3 f6, f31, f23, f6 LFD f20, 32 * SIZE(BO) LFD f21, 33 * SIZE(BO) LFD f22, 34 * SIZE(BO) LFD f23, 35 * SIZE(BO) FMA1 f8, f30, f24, f8 FMA4 f11, f31, f24, f11 FMA2 f9, f30, f25, f9 FMA3 f10, f31, f25, f10 FMA1 f12, f30, f26, f12 FMA4 f15, f31, f26, f15 FMA2 f13, f30, f27, f13 FMA3 f14, f31, f27, f14 LFD f24, 36 * SIZE(BO) LFD f25, 37 * SIZE(BO) LFD f26, 38 * SIZE(BO) LFD f27, 39 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 32 * SIZE bdnz LL(22) .align 4 LL(25): lfd f30, ALPHA_R lfd f31, ALPHA_I #ifndef TRMMKERNEL andi. r0, K, 3 mtspr CTR, r0 ble LL(28) #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 4 #endif andi. 
TEMP, TEMP, 3 mtspr CTR, TEMP ble LL(28) #endif .align 4 LL(26): FMA1 f0, f16, f20, f0 FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 FMA3 f2, f17, f21, f2 FMA1 f4, f16, f22, f4 FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 FMA3 f6, f17, f23, f6 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMA1 f8, f16, f24, f8 FMA4 f11, f17, f24, f11 FMA2 f9, f16, f25, f9 FMA3 f10, f17, f25, f10 FMA1 f12, f16, f26, f12 FMA4 f15, f17, f26, f15 FMA2 f13, f16, f27, f13 FMA3 f14, f17, f27, f14 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 8 * SIZE bdnz LL(26) .align 4 LL(28): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) FADD f0, f0, f2 FADD f1, f1, f3 FADD f4, f4, f6 FADD f5, f5, f7 LFD f20, 0 * SIZE(CO3) LFD f21, 1 * SIZE(CO3) LFD f22, 0 * SIZE(CO4) LFD f23, 1 * SIZE(CO4) FADD f8, f8, f10 FADD f9, f9, f11 FADD f12, f12, f14 FADD f13, f13, f15 FNMSUB f24, f31, f1, f16 FMADD f25, f31, f0, f17 FNMSUB f26, f31, f5, f18 FMADD f27, f31, f4, f19 FMADD f0, f30, f0, f24 FMADD f1, f30, f1, f25 FMADD f4, f30, f4, f26 FMADD f5, f30, f5, f27 FNMSUB f24, f31, f9, f20 FMADD f25, f31, f8, f21 FNMSUB f26, f31, f13, f22 FMADD f27, f31, f12, f23 FMADD f8, f30, f8, f24 FMADD f9, f30, f9, f25 FMADD f12, f30, f12, f26 FMADD f13, f30, f13, f27 #else FADD f0, f0, f2 FADD f1, f1, f3 FADD f4, f4, f6 FADD f5, f5, f7 FMUL f16, f31, f1 FMUL f17, f31, f0 FMUL f18, f31, f5 FMUL f19, f31, f4 FMSUB f0, f30, f0, f16 FMADD f1, f30, f1, f17 FMSUB f4, f30, f4, f18 FMADD f5, f30, f5, f19 FADD f8, f8, f10 FADD f9, f9, f11 FADD f12, f12, f14 FADD f13, f13, f15 FMUL f20, f31, f9 FMUL f21, f31, f8 FMUL f22, f31, f13 FMUL f23, f31, f12 FMSUB f8, f30, f8, f20 FMADD f9, f30, f9, f21 FMSUB f12, f30, f12, f22 FMADD f13, f30, f13, f23 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -4 #endif slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 2 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 1 #endif #endif .align 4 LL(29): #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 4 #endif mr B, BO addic. J, J, -1 bgt LL(10) .align 4 LL(30): andi. J, N, 2 ble LL(50) mr CO1, C add CO2, C, LDC add C, CO2, LDC #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif slwi BB, K, ZBASE_SHIFT + 1 mr AO, A lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. I, M, 1 ble LL(40) .align 4 LL(31): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) dcbtst CO1, PREC dcbtst CO2, PREC srawi. 
r0, K, 3 mr BO, B mtspr CTR, r0 ble LL(35) #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif dcbtst CO1, PREC dcbtst CO2, PREC #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif srawi. TEMP, TEMP, 3 mtspr CTR, TEMP ble LL(35) #endif .align 4 LL(32): dcbt AO, PREA dcbtst BO, PREA FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 LFD f28, 4 * SIZE(AO) LFD f29, 5 * SIZE(AO) LFD f30, 6 * SIZE(AO) LFD f31, 7 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA4 f9, f17, f20, f9 FMA4 f11, f19, f20, f11 FMA3 f8, f17, f21, f8 FMA3 f10, f19, f21, f10 FMA4 f13, f17, f22, f13 FMA4 f15, f19, f22, f15 FMA3 f12, f17, f23, f12 FMA3 f14, f19, f23, f14 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMA1 f0, f28, f24, f0 FMA1 f2, f30, f24, f2 FMA2 f1, f28, f25, f1 FMA2 f3, f30, f25, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMA1 f4, f28, f26, f4 FMA1 f6, f30, f26, f6 FMA2 f5, f28, f27, f5 FMA2 f7, f30, f27, f7 FMA4 f9, f29, f24, f9 FMA4 f11, f31, f24, f11 FMA3 f8, f29, f25, f8 FMA3 f10, f31, f25, f10 FMA4 f13, f29, f26, f13 FMA4 f15, f31, f26, f15 FMA3 f12, f29, f27, f12 FMA3 f14, f31, f27, f14 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 LFD f28, 12 * SIZE(AO) LFD f29, 13 * SIZE(AO) LFD f30, 14 * SIZE(AO) LFD f31, 15 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA4 f9, f17, f20, f9 FMA4 f11, f19, f20, f11 FMA3 f8, f17, f21, f8 FMA3 f10, f19, f21, f10 FMA4 f13, f17, f22, f13 FMA4 f15, f19, f22, f15 FMA3 f12, f17, f23, f12 FMA3 f14, f19, f23, f14 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMA1 f0, f28, f24, f0 FMA1 f2, f30, f24, f2 FMA2 f1, f28, f25, f1 FMA2 f3, f30, f25, f3 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) FMA1 f4, f28, f26, f4 FMA1 f6, f30, f26, f6 FMA2 f5, f28, f27, f5 FMA2 f7, f30, f27, f7 FMA4 f9, f29, f24, f9 FMA4 f11, f31, f24, f11 FMA3 f8, f29, f25, f8 FMA3 f10, f31, f25, f10 FMA4 f13, f29, f26, f13 FMA4 f15, f31, f26, f15 FMA3 f12, f29, f27, f12 FMA3 f14, f31, f27, f14 LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 LFD f28, 20 * SIZE(AO) LFD f29, 21 * SIZE(AO) LFD f30, 22 * SIZE(AO) LFD f31, 23 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, 
f23, f7 FMA4 f9, f17, f20, f9 FMA4 f11, f19, f20, f11 FMA3 f8, f17, f21, f8 FMA3 f10, f19, f21, f10 FMA4 f13, f17, f22, f13 FMA4 f15, f19, f22, f15 FMA3 f12, f17, f23, f12 FMA3 f14, f19, f23, f14 LFD f20, 24 * SIZE(BO) LFD f21, 25 * SIZE(BO) LFD f22, 26 * SIZE(BO) LFD f23, 27 * SIZE(BO) FMA1 f0, f28, f24, f0 FMA1 f2, f30, f24, f2 FMA2 f1, f28, f25, f1 FMA2 f3, f30, f25, f3 LFD f16, 24 * SIZE(AO) LFD f17, 25 * SIZE(AO) LFD f18, 26 * SIZE(AO) LFD f19, 27 * SIZE(AO) FMA1 f4, f28, f26, f4 FMA1 f6, f30, f26, f6 FMA2 f5, f28, f27, f5 FMA2 f7, f30, f27, f7 FMA4 f9, f29, f24, f9 FMA4 f11, f31, f24, f11 FMA3 f8, f29, f25, f8 FMA3 f10, f31, f25, f10 FMA4 f13, f29, f26, f13 FMA4 f15, f31, f26, f15 FMA3 f12, f29, f27, f12 FMA3 f14, f31, f27, f14 LFD f24, 28 * SIZE(BO) LFD f25, 29 * SIZE(BO) LFD f26, 30 * SIZE(BO) LFD f27, 31 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 LFD f28, 28 * SIZE(AO) LFD f29, 29 * SIZE(AO) LFD f30, 30 * SIZE(AO) LFD f31, 31 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA4 f9, f17, f20, f9 FMA4 f11, f19, f20, f11 FMA3 f8, f17, f21, f8 FMA3 f10, f19, f21, f10 FMA4 f13, f17, f22, f13 FMA4 f15, f19, f22, f15 FMA3 f12, f17, f23, f12 FMA3 f14, f19, f23, f14 LFD f20, 32 * SIZE(BO) LFD f21, 33 * SIZE(BO) LFD f22, 34 * SIZE(BO) LFD f23, 35 * SIZE(BO) FMA1 f0, f28, f24, f0 FMA1 f2, f30, f24, f2 FMA2 f1, f28, f25, f1 FMA2 f3, f30, f25, f3 LFD f16, 32 * SIZE(AO) LFD f17, 33 * SIZE(AO) LFD f18, 34 * SIZE(AO) LFD f19, 35 * SIZE(AO) FMA1 f4, f28, f26, f4 FMA1 f6, f30, f26, f6 FMA2 f5, f28, f27, f5 FMA2 f7, f30, f27, f7 FMA4 f9, f29, f24, f9 FMA4 f11, f31, f24, f11 FMA3 f8, f29, f25, f8 FMA3 f10, f31, f25, f10 FMA4 f13, f29, f26, f13 FMA4 f15, f31, f26, f15 FMA3 f12, f29, f27, f12 FMA3 f14, f31, f27, f14 LFD f24, 36 * SIZE(BO) LFD f25, 37 * SIZE(BO) LFD f26, 38 * SIZE(BO) LFD f27, 39 * SIZE(BO) addi AO, AO, 32 * SIZE addi BO, BO, 32 * SIZE bdnz LL(32) .align 4 LL(35): lfd f30, ALPHA_R lfd f31, ALPHA_I dcbtst B, BB addi BB, BB, 16 * SIZE #ifndef TRMMKERNEL andi. r0, K, 7 mtspr CTR, r0 ble LL(38) #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif andi. 
TEMP, TEMP, 7 mtspr CTR, TEMP ble LL(38) #endif .align 4 LL(36): FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA4 f9, f17, f20, f9 FMA4 f11, f19, f20, f11 FMA3 f8, f17, f21, f8 FMA3 f10, f19, f21, f10 LFD f16, 4 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) FMA4 f13, f17, f22, f13 FMA4 f15, f19, f22, f15 FMA3 f12, f17, f23, f12 FMA3 f14, f19, f23, f14 LFD f17, 5 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 4 * SIZE bdnz LL(36) .align 4 LL(38): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) FADD f0, f0, f8 FADD f1, f1, f9 FADD f2, f2, f10 FADD f3, f3, f11 LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) FADD f4, f4, f12 FADD f5, f5, f13 FADD f6, f6, f14 FADD f7, f7, f15 FNMSUB f24, f31, f1, f16 FMADD f25, f31, f0, f17 FNMSUB f26, f31, f3, f18 FMADD f27, f31, f2, f19 FMADD f0, f30, f0, f24 FMADD f1, f30, f1, f25 FMADD f2, f30, f2, f26 FMADD f3, f30, f3, f27 FNMSUB f24, f31, f5, f20 FMADD f25, f31, f4, f21 FNMSUB f26, f31, f7, f22 FMADD f27, f31, f6, f23 FMADD f4, f30, f4, f24 FMADD f5, f30, f5, f25 FMADD f6, f30, f6, f26 FMADD f7, f30, f7, f27 #else FADD f0, f0, f8 FADD f1, f1, f9 FADD f2, f2, f10 FADD f3, f3, f11 FADD f4, f4, f12 FADD f5, f5, f13 FADD f6, f6, f14 FADD f7, f7, f15 FMUL f16, f31, f1 FMUL f17, f31, f0 FMUL f18, f31, f3 FMUL f19, f31, f2 FMUL f20, f31, f5 FMUL f21, f31, f4 FMUL f22, f31, f7 FMUL f23, f31, f6 FMSUB f0, f30, f0, f16 FMADD f1, f30, f1, f17 FMADD f2, f30, f2, f18 FMADD f3, f30, f3, f19 FMSUB f4, f30, f4, f20 FMADD f5, f30, f5, f21 FMADD f6, f30, f6, f22 FMADD f7, f30, f7, f23 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif addic. I, I, -1 bgt LL(31) .align 4 LL(40): andi. I, M, 1 ble LL(49) #ifndef TRMMKERNEL LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, K, 2 mr BO, B mtspr CTR, r0 ble LL(45) #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif srawi. 
TEMP, TEMP, 2 mtspr CTR, TEMP ble LL(45) #endif .align 4 LL(42): FMA1 f0, f16, f20, f0 FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 FMA3 f2, f17, f21, f2 FMA1 f4, f16, f22, f4 FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 FMA3 f6, f17, f23, f6 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 FMA3 f2, f17, f21, f2 FMA1 f4, f16, f22, f4 FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 FMA3 f6, f17, f23, f6 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 FMA3 f2, f17, f21, f2 FMA1 f4, f16, f22, f4 FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 FMA3 f6, f17, f23, f6 LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f20, 12 * SIZE(BO) LFD f21, 13 * SIZE(BO) LFD f22, 14 * SIZE(BO) LFD f23, 15 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 FMA3 f2, f17, f21, f2 FMA1 f4, f16, f22, f4 FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 FMA3 f6, f17, f23, f6 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 16 * SIZE bdnz LL(42) .align 4 LL(45): lfd f30, ALPHA_R lfd f31, ALPHA_I #ifndef TRMMKERNEL andi. r0, K, 3 mtspr CTR, r0 ble LL(48) #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP ble LL(48) #endif .align 4 LL(46): FMA1 f0, f16, f20, f0 FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 FMA3 f2, f17, f21, f2 FMA1 f4, f16, f22, f4 FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 FMA3 f6, f17, f23, f6 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE bdnz LL(46) .align 4 LL(48): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) FADD f0, f0, f2 FADD f1, f1, f3 FADD f4, f4, f6 FADD f5, f5, f7 FNMSUB f24, f31, f1, f16 FMADD f25, f31, f0, f17 FNMSUB f26, f31, f5, f20 FMADD f27, f31, f4, f21 FMADD f0, f30, f0, f24 FMADD f1, f30, f1, f25 FMADD f4, f30, f4, f26 FMADD f5, f30, f5, f27 #else FADD f0, f0, f2 FADD f1, f1, f3 FADD f4, f4, f6 FADD f5, f5, f7 FMUL f16, f31, f1 FMUL f17, f31, f0 FMUL f18, f31, f5 FMUL f19, f31, f4 FMSUB f0, f30, f0, f16 FMADD f1, f30, f1, f17 FMSUB f4, f30, f4, f18 FMADD f5, f30, f5, f19 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 1 #endif #endif .align 4 LL(49): #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 2 #endif mr B, BO .align 4 LL(50): andi. J, N, 1 ble LL(999) mr CO1, C add C, CO1, LDC #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif mr AO, A lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 srawi. 
I, M, 1 ble LL(60) .align 4 LL(51): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) dcbtst CO1, PREC srawi. r0, K, 3 mr BO, B mtspr CTR, r0 ble LL(55) #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif dcbtst CO1, PREC #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif srawi. TEMP, TEMP, 3 mtspr CTR, TEMP ble LL(55) #endif .align 4 LL(52): dcbt AO, PREA dcbtst BO, PREA FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 FMA4 f9, f17, f20, f9 FMA4 f11, f19, f20, f11 FMA3 f8, f17, f21, f8 FMA3 f10, f19, f21, f10 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMA1 f0, f16, f22, f0 FMA1 f2, f18, f22, f2 FMA2 f1, f16, f23, f1 FMA2 f3, f18, f23, f3 FMA4 f9, f17, f22, f9 FMA4 f11, f19, f22, f11 FMA3 f8, f17, f23, f8 FMA3 f10, f19, f23, f10 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 FMA4 f9, f17, f20, f9 FMA4 f11, f19, f20, f11 FMA3 f8, f17, f21, f8 FMA3 f10, f19, f21, f10 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) FMA1 f0, f16, f22, f0 FMA1 f2, f18, f22, f2 FMA2 f1, f16, f23, f1 FMA2 f3, f18, f23, f3 FMA4 f9, f17, f22, f9 FMA4 f11, f19, f22, f11 FMA3 f8, f17, f23, f8 FMA3 f10, f19, f23, f10 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 FMA4 f9, f17, f20, f9 FMA4 f11, f19, f20, f11 FMA3 f8, f17, f21, f8 FMA3 f10, f19, f21, f10 LFD f16, 20 * SIZE(AO) LFD f17, 21 * SIZE(AO) LFD f18, 22 * SIZE(AO) LFD f19, 23 * SIZE(AO) FMA1 f0, f16, f22, f0 FMA1 f2, f18, f22, f2 FMA2 f1, f16, f23, f1 FMA2 f3, f18, f23, f3 FMA4 f9, f17, f22, f9 FMA4 f11, f19, f22, f11 FMA3 f8, f17, f23, f8 FMA3 f10, f19, f23, f10 LFD f16, 24 * SIZE(AO) LFD f17, 25 * SIZE(AO) LFD f18, 26 * SIZE(AO) LFD f19, 27 * SIZE(AO) LFD f20, 12 * SIZE(BO) LFD f21, 13 * SIZE(BO) LFD f22, 14 * SIZE(BO) LFD f23, 15 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 FMA4 f9, f17, f20, f9 FMA4 f11, f19, f20, f11 FMA3 f8, f17, f21, f8 FMA3 f10, f19, f21, f10 LFD f16, 28 * SIZE(AO) LFD f17, 29 * SIZE(AO) LFD f18, 30 * SIZE(AO) LFD f19, 31 * SIZE(AO) FMA1 f0, f16, f22, f0 FMA1 f2, f18, f22, f2 FMA2 f1, f16, f23, f1 FMA2 f3, f18, f23, f3 FMA4 f9, f17, f22, f9 FMA4 f11, f19, f22, f11 FMA3 f8, f17, f23, f8 FMA3 f10, f19, f23, f10 LFD f16, 32 * SIZE(AO) LFD f17, 33 * 
SIZE(AO) LFD f18, 34 * SIZE(AO) LFD f19, 35 * SIZE(AO) LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) addi AO, AO, 32 * SIZE addi BO, BO, 16 * SIZE bdnz LL(52) .align 4 LL(55): lfd f30, ALPHA_R lfd f31, ALPHA_I #ifndef TRMMKERNEL andi. r0, K, 7 mtspr CTR, r0 ble LL(58) #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 7 mtspr CTR, TEMP ble LL(58) #endif .align 4 LL(56): FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 LFD f16, 4 * SIZE(AO) LFD f18, 6 * SIZE(AO) FMA4 f9, f17, f20, f9 FMA4 f11, f19, f20, f11 FMA3 f8, f17, f21, f8 FMA3 f10, f19, f21, f10 LFD f17, 5 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 2 * SIZE bdnz LL(56) .align 4 LL(58): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) FADD f0, f0, f8 FADD f1, f1, f9 FADD f2, f2, f10 FADD f3, f3, f11 FNMSUB f24, f31, f1, f16 FMADD f25, f31, f0, f17 FNMSUB f26, f31, f3, f18 FMADD f27, f31, f2, f19 FMADD f0, f30, f0, f24 FMADD f1, f30, f1, f25 FMADD f2, f30, f2, f26 FMADD f3, f30, f3, f27 #else FADD f0, f0, f8 FADD f1, f1, f9 FADD f2, f2, f10 FADD f3, f3, f11 FMUL f16, f31, f1 FMUL f17, f31, f0 FMUL f18, f31, f3 FMUL f19, f31, f2 FMSUB f0, f30, f0, f16 FMADD f1, f30, f1, f17 FMADD f2, f30, f2, f18 FMADD f3, f30, f3, f19 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 addi CO1, CO1, 4 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -1 #endif slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif addic. I, I, -1 bgt LL(51) .align 4 LL(60): andi. I, M, 1 ble LL(999) #ifndef TRMMKERNEL LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, K, 2 mr BO, B mtspr CTR, r0 ble LL(65) #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif srawi. 
TEMP, TEMP, 2 mtspr CTR, TEMP ble LL(65) #endif .align 4 LL(62): FMA1 f0, f16, f20, f0 FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 FMA3 f2, f17, f21, f2 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) FMA1 f0, f18, f22, f0 FMA4 f3, f19, f22, f3 FMA2 f1, f18, f23, f1 FMA3 f2, f19, f23, f2 LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 FMA3 f2, f17, f21, f2 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) FMA1 f0, f18, f22, f0 FMA4 f3, f19, f22, f3 FMA2 f1, f18, f23, f1 FMA3 f2, f19, f23, f2 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(62) .align 4 LL(65): lfd f30, ALPHA_R lfd f31, ALPHA_I #ifndef TRMMKERNEL andi. r0, K, 3 mtspr CTR, r0 ble LL(68) #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif andi. TEMP, TEMP, 3 mtspr CTR, TEMP ble LL(68) #endif .align 4 LL(66): FMA1 f0, f16, f20, f0 FMA4 f3, f17, f20, f3 LFD f20, 2 * SIZE(BO) FMA2 f1, f16, f21, f1 LFD f16, 2 * SIZE(AO) FMA3 f2, f17, f21, f2 LFD f17, 3 * SIZE(AO) LFD f21, 3 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 2 * SIZE bdnz LL(66) .align 4 LL(68): #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) FADD f0, f0, f2 FADD f1, f1, f3 FNMSUB f24, f31, f1, f16 FMADD f25, f31, f0, f17 FMADD f0, f30, f0, f24 FMADD f1, f30, f1, f25 #else FADD f0, f0, f2 FADD f1, f1, f3 FMUL f16, f31, f1 FMUL f17, f31, f0 FMSUB f0, f30, f0, f16 FMADD f1, f30, f1, f17 #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) addi CO1, CO1, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -1 #endif slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 1 #endif #endif .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) #ifdef TRMMKERNEL ld r20, 232(SP) ld r19, 240(SP) #endif #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) #ifdef TRMMKERNEL lwz r20, 188(SP) lwz r19, 192(SP) #endif #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/zgemm_kernel_ppc440.S000066400000000000000000000767121313527062700214720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA_R 296(SP) #define ALPHA_I 304(SP) #define FZERO 312(SP) #else #define STACKSIZE 256 #define ALPHA_R 224(SP) #define ALPHA_I 232(SP) #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #define OFFSET r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #define TEMP r22 #define KK r23 #define I r24 #define J r25 #define AO r26 #define BO r27 #define CO1 r28 #define CO2 r29 #define A1 f16 #define A2 f17 #define A3 f18 #define A4 f19 #define A5 f20 #define A6 f21 #define B1 f22 #define B2 f23 #define B3 f24 #define B4 f25 #define B5 f26 #define B6 f27 #define B7 f28 #define B8 f29 #define B9 f30 #define B10 f31 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) #ifdef TRMMKERNEL std r23, 208(SP) std r22, 216(SP) #endif #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) #ifdef TRMMKERNEL stw r23, 176(SP) stw r22, 180(SP) #endif #endif stfd f1, ALPHA_R stfd f2, 
ALPHA_I stw r0, FZERO #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef TRMMKERNEL #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif #endif slwi LDC, LDC, ZBASE_SHIFT cmpwi cr0, M, 0 ble .L999 cmpwi cr0, N, 0 ble .L999 cmpwi cr0, K, 0 ble .L999 lfs f0, FZERO srawi. J, N, 1 ble .L30 .align 4 .L10: fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 mr CO1, C add CO2, C, LDC add C, CO2, LDC #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif srawi. I, M, 1 mr AO, A ble .L20 .align 4 .L11: #ifndef TRMMKERNEL LFD A1, 0 * SIZE(AO) ### LFD A2, 1 * SIZE(AO) LFD A4, 4 * SIZE(AO) ### LFD A5, 8 * SIZE(AO) ### LFD B1, 0 * SIZE(B) ### LFD B2, 1 * SIZE(B) LFD B3, 2 * SIZE(B) LFD B4, 3 * SIZE(B) LFD B5, 4 * SIZE(B) ### LFD B6, 8 * SIZE(B) ### LFD B7, 12 * SIZE(B) ### srawi. r0, K, 2 mr BO, B mtspr CTR, r0 ble .L15 #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD A1, 0 * SIZE(AO) ### LFD A2, 1 * SIZE(AO) LFD A4, 4 * SIZE(AO) ### LFD A5, 8 * SIZE(AO) ### LFD B1, 0 * SIZE(B) ### LFD B2, 1 * SIZE(B) LFD B3, 2 * SIZE(B) LFD B4, 3 * SIZE(B) LFD B5, 4 * SIZE(B) ### LFD B6, 8 * SIZE(B) ### LFD B7, 12 * SIZE(B) ### mr BO, B #else slwi r0, KK, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, B, r0 LFD A1, 0 * SIZE(AO) ### LFD A2, 1 * SIZE(AO) LFD A4, 4 * SIZE(AO) ### LFD A5, 8 * SIZE(AO) ### LFD B1, 0 * SIZE(BO) ### LFD B2, 1 * SIZE(BO) LFD B3, 2 * SIZE(BO) LFD B4, 3 * SIZE(BO) LFD B5, 4 * SIZE(BO) ### LFD B6, 8 * SIZE(BO) ### LFD B7, 12 * SIZE(BO) ### #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif srawi. 
TEMP, TEMP, 2 mtspr CTR, TEMP ble .L15 #endif .align 4 .L12: FMADD f0, A1, B1, f0 LFD A3, 2 * SIZE(AO) FMADD f4, A1, B2, f4 LFD A6, 12 * SIZE(AO) ### FMADD f8, A1, B3, f8 nop FMADD f12, A1, B4, f12 nop FMADD f1, A2, B1, f1 LFD A1, 3 * SIZE(AO) FMADD f5, A2, B2, f5 nop FMADD f9, A2, B3, f9 nop FMADD f13, A2, B4, f13 nop FMADD f2, A3, B1, f2 nop FMADD f6, A3, B2, f6 LFD B8, 5 * SIZE(BO) FMADD f10, A3, B3, f10 LFD B9, 6 * SIZE(BO) FMADD f14, A3, B4, f14 LFD B10, 7 * SIZE(BO) FMADD f3, A1, B1, f3 LFD A2, 5 * SIZE(AO) FMADD f7, A1, B2, f7 LFD B1, 16 * SIZE(BO) ### FMADD f11, A1, B3, f11 nop FMADD f15, A1, B4, f15 nop ############ FMADD f0, A4, B5, f0 LFD A3, 6 * SIZE(AO) FMADD f4, A4, B8, f4 LFD A1, 16 * SIZE(AO) ### FMADD f8, A4, B9, f8 nop FMADD f12, A4, B10, f12 nop FMADD f1, A2, B5, f1 LFD A4, 7 * SIZE(AO) FMADD f5, A2, B8, f5 nop FMADD f9, A2, B9, f9 nop FMADD f13, A2, B10, f13 nop FMADD f2, A3, B5, f2 nop FMADD f6, A3, B8, f6 LFD B2, 9 * SIZE(BO) FMADD f10, A3, B9, f10 LFD B3, 10 * SIZE(BO) FMADD f14, A3, B10, f14 LFD B4, 11 * SIZE(BO) FMADD f3, A4, B5, f3 LFD A2, 9 * SIZE(AO) FMADD f7, A4, B8, f7 LFD B5, 20 * SIZE(BO) ### FMADD f11, A4, B9, f11 nop FMADD f15, A4, B10, f15 nop ############ FMADD f0, A5, B6, f0 LFD A3, 10 * SIZE(AO) FMADD f4, A5, B2, f4 LFD A4, 20 * SIZE(AO) ### FMADD f8, A5, B3, f8 nop FMADD f12, A5, B4, f12 nop FMADD f1, A2, B6, f1 LFD A5, 11 * SIZE(AO) FMADD f5, A2, B2, f5 nop FMADD f9, A2, B3, f9 nop FMADD f13, A2, B4, f13 nop FMADD f2, A3, B6, f2 nop FMADD f6, A3, B2, f6 LFD B8, 13 * SIZE(BO) FMADD f10, A3, B3, f10 LFD B9, 14 * SIZE(BO) FMADD f14, A3, B4, f14 LFD B10,15 * SIZE(BO) FMADD f3, A5, B6, f3 LFD A2, 13 * SIZE(AO) FMADD f7, A5, B2, f7 LFD B6, 24 * SIZE(BO) ### FMADD f11, A5, B3, f11 nop FMADD f15, A5, B4, f15 nop ############ FMADD f0, A6, B7, f0 LFD A3, 14 * SIZE(AO) FMADD f4, A6, B8, f4 LFD A5, 24 * SIZE(AO) ### FMADD f8, A6, B9, f8 nop FMADD f12, A6, B10, f12 nop FMADD f1, A2, B7, f1 LFD A6, 15 * SIZE(AO) FMADD f5, A2, B8, f5 nop FMADD f9, A2, B9, f9 nop FMADD f13, A2, B10, f13 nop FMADD f2, A3, B7, f2 addi AO, AO, 16 * SIZE FMADD f6, A3, B8, f6 LFD B2, 17 * SIZE(BO) FMADD f10, A3, B9, f10 LFD B3, 18 * SIZE(BO) FMADD f14, A3, B10, f14 LFD B4, 19 * SIZE(BO) FMADD f3, A6, B7, f3 LFD A2, 1 * SIZE(AO) FMADD f7, A6, B8, f7 LFD B7, 28 * SIZE(BO) ### FMADD f11, A6, B9, f11 addi BO, BO, 16 * SIZE FMADD f15, A6, B10, f15 bdnz .L12 .align 4 .L15: #ifndef TRMMKERNEL andi. r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, r0 ble .LKERNEL_MainFinish #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 2 #endif andi. 
TEMP, TEMP, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, TEMP ble .LKERNEL_MainFinish #endif .align 4 .L16: FMADD f0, A1, B1, f0 LFD A3, 2 * SIZE(AO) FMADD f4, A1, B2, f4 FMADD f8, A1, B3, f8 FMADD f12, A1, B4, f12 LFD A4, 3 * SIZE(AO) FMADD f1, A2, B1, f1 FMADD f5, A2, B2, f5 FMADD f9, A2, B3, f9 FMADD f13, A2, B4, f13 LFDU A1, 4 * SIZE(AO) FMADD f2, A3, B1, f2 FMADD f6, A3, B2, f6 FMADD f10, A3, B3, f10 FMADD f14, A3, B4, f14 LFD A2, 1 * SIZE(AO) FMADD f3, A4, B1, f3 LFDU B1, 4 * SIZE(BO) FMADD f7, A4, B2, f7 LFD B2, 1 * SIZE(BO) FMADD f11, A4, B3, f11 LFD B3, 2 * SIZE(BO) FMADD f15, A4, B4, f15 LFD B4, 3 * SIZE(BO) bdnz .L16 .align 4 .LKERNEL_MainFinish: #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #ifndef TRMMKERNEL LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) #endif FSUB f8, f8, f13 FADD f9, f9, f12 FSUB f10, f10, f15 FADD f11, f11, f14 #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #ifndef TRMMKERNEL LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) #endif FADD f8, f8, f13 FSUB f9, f9, f12 FADD f10, f10, f15 FSUB f11, f11, f14 #else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #ifndef TRMMKERNEL LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) #endif FADD f8, f8, f13 FSUB f9, f12, f9 FADD f10, f10, f15 FSUB f11, f14, f11 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FMADD f17, f30, f1, f17 FMADD f18, f30, f2, f18 FMADD f19, f30, f3, f19 FMADD f20, f30, f8, f20 FMADD f21, f30, f9, f21 FMADD f22, f30, f10, f22 FMADD f23, f30, f11, f23 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 FMUL f20, f30, f8 FMUL f21, f30, f9 FMUL f22, f30, f10 FMUL f23, f30, f11 #endif FNMSUB f16, f31, f1, f16 FMADD f17, f31, f0, f17 FNMSUB f18, f31, f3, f18 FMADD f19, f31, f2, f19 FNMSUB f20, f31, f9, f20 FMADD f21, f31, f8, f21 FNMSUB f22, f31, f11, f22 FMADD f23, f31, f10, f23 #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC)|| defined(RR) */ #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FNMSUB f17, f30, f1, f17 FMADD f18, f30, f2, f18 FNMSUB f19, f30, f3, f19 FMADD f20, f30, f8, f20 FNMSUB f21, f30, f9, f21 FMADD f22, f30, f10, f22 FNMSUB f23, f30, f11, f23 FMADD f16, f31, f1, f16 FMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FMADD f19, f31, f2, f19 FMADD f20, f31, f9, f20 FMADD f21, f31, f8, f21 FMADD f22, f31, f11, f22 FMADD f23, f31, f10, f23 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 FMUL f20, f30, f8 FMUL f21, f30, f9 FMUL f22, f30, f10 FMUL f23, f30, f11 FMADD f16, f31, f1, f16 FNMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FNMADD f19, f31, f2, f19 FMADD f20, f31, f9, f20 FNMADD f21, f31, f8, f21 FMADD f22, f31, f11, f22 FNMADD f23, f31, f10, f23 #endif #endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) STFD f18, 2 * SIZE(CO1) STFD f19, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f20, 0 * 
SIZE(CO2) STFD f21, 1 * SIZE(CO2) STFD f22, 2 * SIZE(CO2) STFD f23, 3 * SIZE(CO2) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -2 #endif slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif addic. I, I, -1 bgt .L11 .align 4 .L20: andi. I, M, 1 ble .L29 #ifndef TRMMKERNEL LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, K, 2 mr BO, B mtspr CTR, r0 ble .L25 #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) mr BO, B #else slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP ble .L25 #endif .align 4 .L22: fmadd f0, f16, f20, f0 LFD f27, 7 * SIZE(BO) fmadd f1, f16, f21, f1 LFD f19, 3 * SIZE(AO) fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFD f16, 4 * SIZE(AO) fmadd f4, f17, f20, f4 LFD f20, 8 * SIZE(BO) fmadd f5, f17, f21, f5 LFD f21, 9 * SIZE(BO) fmadd f6, f17, f22, f6 LFD f22, 10 * SIZE(BO) fmadd f7, f17, f23, f7 LFD f23, 11 * SIZE(BO) fmadd f0, f18, f24, f0 LFD f17, 5 * SIZE(AO) fmadd f1, f18, f25, f1 nop fmadd f2, f18, f26, f2 nop fmadd f3, f18, f27, f3 LFD f18, 6 * SIZE(AO) fmadd f4, f19, f24, f4 LFD f24, 12 * SIZE(BO) fmadd f5, f19, f25, f5 LFD f25, 13 * SIZE(BO) fmadd f6, f19, f26, f6 LFD f26, 14 * SIZE(BO) fmadd f7, f19, f27, f7 LFD f27, 15 * SIZE(BO) fmadd f0, f16, f20, f0 LFD f19, 7 * SIZE(AO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFDU f16, 8 * SIZE(AO) fmadd f4, f17, f20, f4 LFDU f20, 16 * SIZE(BO) fmadd f5, f17, f21, f5 LFD f21, 1 * SIZE(BO) fmadd f6, f17, f22, f6 LFD f22, 2 * SIZE(BO) fmadd f7, f17, f23, f7 LFD f23, 3 * SIZE(BO) fmadd f0, f18, f24, f0 LFD f17, 1 * SIZE(AO) fmadd f1, f18, f25, f1 nop fmadd f2, f18, f26, f2 nop fmadd f3, f18, f27, f3 LFD f18, 2 * SIZE(AO) fmadd f4, f19, f24, f4 LFD f24, 4 * SIZE(BO) fmadd f5, f19, f25, f5 LFD f25, 5 * SIZE(BO) fmadd f6, f19, f26, f6 LFD f26, 6 * SIZE(BO) fmadd f7, f19, f27, f7 bdnz .L22 .align 4 .L25: #ifndef TRMMKERNEL andi. 
r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, r0 ble .L27 #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 2 #endif andi. TEMP, TEMP, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, TEMP ble .L27 #endif .align 4 .L26: fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 LFDU f16, 2 * SIZE(AO) fmadd f4, f17, f20, f4 LFDU f20, 4 * SIZE(BO) fmadd f5, f17, f21, f5 LFD f21, 1 * SIZE(BO) fmadd f6, f17, f22, f6 LFD f22, 2 * SIZE(BO) fmadd f7, f17, f23, f7 LFD f23, 3 * SIZE(BO) LFD f17, 1 * SIZE(AO) bdnz .L26 .align 4 .L27: #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #endif #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FMADD f17, f30, f1, f17 FMADD f18, f30, f2, f18 FMADD f19, f30, f3, f19 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 #endif FNMSUB f16, f31, f1, f16 FMADD f17, f31, f0, f17 FNMSUB f18, f31, f3, f18 FMADD f19, f31, f2, f19 #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC)|| defined(RR) */ #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FNMSUB f17, f30, f1, f17 FMADD f18, f30, f2, f18 FNMSUB f19, f30, f3, f19 FMADD f16, f31, f1, f16 FMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FMADD f19, f31, f2, f19 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 FMADD f16, f31, f1, f16 FNMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FNMADD f19, f31, f2, f19 #endif #endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) STFD f18, 0 * SIZE(CO2) STFD f19, 1 * SIZE(CO2) addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -1 #else addi TEMP, TEMP, -2 #endif slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 1 #endif #endif .align 4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addi KK, KK, 2 #endif mr B, BO addic. J, J, -1 lfs f0, FZERO bgt .L10 .align 4 .L30: andi. J, N, 1 ble .L999 #if defined(TRMMKERNEL) && defined(LEFT) mr KK, OFFSET #endif srawi. I, M, 1 mr CO1, C add C, C, LDC mr AO, A ble .L40 .align 4 .L31: #ifndef TRMMKERNEL LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
r0, K, 2 mr BO, B mtspr CTR, r0 ble .L35 #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 mr BO, B #else slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP ble .L35 #endif .align 4 .L32: fmadd f0, f16, f20, f0 LFD f27, 7 * SIZE(AO) fmadd f1, f16, f21, f1 LFD f19, 3 * SIZE(BO) fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFD f16, 4 * SIZE(BO) fmadd f4, f17, f20, f4 LFD f20, 8 * SIZE(AO) fmadd f5, f17, f21, f5 LFD f21, 9 * SIZE(AO) fmadd f6, f17, f22, f6 LFD f22, 10 * SIZE(AO) fmadd f7, f17, f23, f7 LFD f23, 11 * SIZE(AO) fmadd f0, f18, f24, f0 LFD f17, 5 * SIZE(BO) fmadd f1, f18, f25, f1 nop fmadd f2, f18, f26, f2 nop fmadd f3, f18, f27, f3 LFD f18, 6 * SIZE(BO) fmadd f4, f19, f24, f4 LFD f24, 12 * SIZE(AO) fmadd f5, f19, f25, f5 LFD f25, 13 * SIZE(AO) fmadd f6, f19, f26, f6 LFD f26, 14 * SIZE(AO) fmadd f7, f19, f27, f7 LFD f27, 15 * SIZE(AO) fmadd f0, f16, f20, f0 LFD f19, 7 * SIZE(BO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFDU f16, 8 * SIZE(BO) fmadd f4, f17, f20, f4 LFDU f20, 16 * SIZE(AO) fmadd f5, f17, f21, f5 LFD f21, 1 * SIZE(AO) fmadd f6, f17, f22, f6 LFD f22, 2 * SIZE(AO) fmadd f7, f17, f23, f7 LFD f23, 3 * SIZE(AO) fmadd f0, f18, f24, f0 LFD f17, 1 * SIZE(BO) fmadd f1, f18, f25, f1 nop fmadd f2, f18, f26, f2 nop fmadd f3, f18, f27, f3 LFD f18, 2 * SIZE(BO) fmadd f4, f19, f24, f4 LFD f24, 4 * SIZE(AO) fmadd f5, f19, f25, f5 LFD f25, 5 * SIZE(AO) fmadd f6, f19, f26, f6 LFD f26, 6 * SIZE(AO) fmadd f7, f19, f27, f7 bdnz .L32 .align 4 .L35: #ifndef TRMMKERNEL andi. r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, r0 ble .L37 #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 2 #else addi TEMP, KK, 1 #endif andi. 
TEMP, TEMP, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, TEMP ble .L37 #endif .align 4 .L36: fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 LFDU f16, 2 * SIZE(BO) fmadd f4, f17, f20, f4 LFDU f20, 4 * SIZE(AO) fmadd f5, f17, f21, f5 LFD f21, 1 * SIZE(AO) fmadd f6, f17, f22, f6 LFD f22, 2 * SIZE(AO) fmadd f7, f17, f23, f7 LFD f23, 3 * SIZE(AO) LFD f17, 1 * SIZE(BO) bdnz .L36 .align 4 .L37: #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FMADD f17, f30, f1, f17 FMADD f18, f30, f2, f18 FMADD f19, f30, f3, f19 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 #endif FNMSUB f16, f31, f1, f16 FMADD f17, f31, f0, f17 FNMSUB f18, f31, f3, f18 FMADD f19, f31, f2, f19 #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC)|| defined(RR) */ #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FNMSUB f17, f30, f1, f17 FMADD f18, f30, f2, f18 FNMSUB f19, f30, f3, f19 FMADD f16, f31, f1, f16 FMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FMADD f19, f31, f2, f19 #else FMUL f16, f30, f0 FMUL f17, f30, f1 FMUL f18, f30, f2 FMUL f19, f30, f3 FMADD f16, f31, f1, f16 FNMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FNMADD f19, f31, f2, f19 #endif #endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) STFD f18, 2 * SIZE(CO1) STFD f19, 3 * SIZE(CO1) addi CO1, CO1, 4 * SIZE #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub TEMP, K, KK #ifdef LEFT addi TEMP, TEMP, -2 #else addi TEMP, TEMP, -1 #endif slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LEFT addi KK, KK, 2 #endif #endif addic. I, I, -1 bgt .L31 .align 4 .L40: andi. I, M, 1 ble .L999 #ifndef TRMMKERNEL LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
r0, K, 2 mr BO, B mtspr CTR, r0 ble .L45 #else #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 mr BO, B #else slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, B, TEMP LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif srawi. TEMP, TEMP, 2 mtspr CTR, TEMP ble .L45 #endif .align 4 .L42: fmadd f0, f16, f20, f0 LFD f23, 3 * SIZE(BO) fmadd f3, f16, f21, f3 LFD f16, 4 * SIZE(AO) fmadd f2, f17, f20, f2 LFD f20, 4 * SIZE(BO) fmadd f1, f17, f21, f1 LFD f17, 5 * SIZE(AO) fmadd f4, f18, f22, f4 LFD f21, 5 * SIZE(BO) fmadd f7, f18, f23, f7 LFD f18, 6 * SIZE(AO) fmadd f6, f19, f22, f6 LFD f22, 6 * SIZE(BO) fmadd f5, f19, f23, f5 LFD f19, 7 * SIZE(AO) fmadd f0, f16, f20, f0 LFD f23, 7 * SIZE(BO) fmadd f3, f16, f21, f3 LFDU f16, 8 * SIZE(AO) fmadd f2, f17, f20, f2 LFDU f20, 8 * SIZE(BO) fmadd f1, f17, f21, f1 LFD f17, 1 * SIZE(AO) fmadd f4, f18, f22, f4 LFD f21, 1 * SIZE(BO) fmadd f7, f18, f23, f7 LFD f18, 2 * SIZE(AO) fmadd f6, f19, f22, f6 LFD f22, 2 * SIZE(BO) fmadd f5, f19, f23, f5 LFD f19, 3 * SIZE(AO) bdnz .L42 .align 4 .L45: fadd f0, f0, f4 fadd f1, f1, f5 fadd f2, f2, f6 fadd f3, f3, f7 #ifndef TRMMKERNEL andi. r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR,r0 ble .L47 #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub TEMP, K, KK #elif defined(LEFT) addi TEMP, KK, 1 #else addi TEMP, KK, 1 #endif andi. 
TEMP, TEMP, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR,TEMP ble .L47 #endif .align 4 .L46: fmadd f0, f16, f20, f0 fmadd f3, f16, f21, f3 LFDU f16, 2 * SIZE(AO) fmadd f2, f17, f20, f2 LFDU f20, 2 * SIZE(BO) fmadd f1, f17, f21, f1 LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) bdnz .L46 .align 4 .L47: #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) fsub f0, f0, f1 fadd f2, f2, f3 #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) fadd f0, f0, f1 fsub f2, f2, f3 #else fadd f0, f0, f1 fsub f2, f3, f2 #endif #ifndef TRMMKERNEL LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FMADD f17, f30, f2, f17 #else FMUL f16, f30, f0 FMUL f17, f30, f2 #endif FNMSUB f16, f31, f2, f16 FMADD f17, f31, f0, f17 #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC) || defined(RR) */ #ifndef TRMMKERNEL FMADD f16, f30, f0, f16 FNMSUB f17, f30, f2, f17 FMADD f16, f31, f2, f16 FMADD f17, f31, f0, f17 #else FMUL f16, f30, f0 FMUL f17, f30, f2 FMADD f16, f31, f2, f16 FNMADD f17, f31, f0, f17 #endif #endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) .align 4 .L999: addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) #ifdef TRMMKERNEL ld r23, 208(SP) ld r22, 216(SP) #endif #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) #ifdef TRMMKERNEL lwz r23, 176(SP) lwz r22, 180(SP) #endif #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/zgemm_logic_8x2_power8.S000066400000000000000000000316501313527062700222120ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/04/22 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ srawi. J, N, 1 ble ZGEMM_L2_END ZGEMM_L2_BEGIN: mr BO, B mr BBO, BBUFFER srawi. T1, K, 2 ble ZGEMM_L2_COPYB1 ZGEMM_L2_COPYB8: addi T2, PRE, 128 dcbt BO, PRE dcbtst BBO, PRE dcbtst BBO, T2 ZCOPYB_8x1 addic. T1, T1, -1 bgt ZGEMM_L2_COPYB8 ZGEMM_L2_COPYB1: andi. T1, K, 3 ble ZGEMM_L2_COPYB_END ZGEMM_L2_COPYB_LOOP: ZCOPYB_1x1 ZCOPYB_1x1 addic. T1, T1, -1 bgt ZGEMM_L2_COPYB_LOOP ZGEMM_L2_COPYB_END: mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 srawi. I, M, 3 ble ZGEMM_L2x8_END ZGEMM_L2x8_BEGIN: mr BO, BBUFFER srawi. L, K, 3 ble ZGEMM_L2x8_SUB0 cmpwi cr0, L, 1 ble ZGEMM_L2x8_SUB4 ZGEMM_L2x8_LOOP_START: dcbt AO, PRE dcbt BO, PRE LOAD2x8_1 dcbt AO, PRE KERNEL2x8_I1 dcbt AO, PRE dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE dcbt BO, PRE KERNEL2x8_2 addic. L, L, -2 ble ZGEMM_L2x8_LOOP_END .align 5 ZGEMM_L2x8_LOOP: dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE dcbt BO, PRE KERNEL2x8_2 addic. L, L, -1 bgt ZGEMM_L2x8_LOOP ZGEMM_L2x8_LOOP_END: dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 KERNEL2x8_E2 b ZGEMM_L2x8_SUB1 ZGEMM_L2x8_SUB4: dcbt AO, PRE KERNEL2x8_SUBI1 dcbt AO, PRE KERNEL2x8_SUB1 dcbt AO, PRE KERNEL2x8_SUB1 dcbt AO, PRE KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 b ZGEMM_L2x8_SUB1 ZGEMM_L2x8_SUB0: andi. L, K, 7 KERNEL2x8_SUBI1 addic. L, L, -1 ble ZGEMM_L2x8_SAVE b ZGEMM_L2x8_SUB2 ZGEMM_L2x8_SUB1: andi. L, K, 7 ble ZGEMM_L2x8_SAVE ZGEMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 bgt ZGEMM_L2x8_SUB2 ZGEMM_L2x8_SAVE: SAVE2x8 addic. I, I, -1 bgt ZGEMM_L2x8_BEGIN ZGEMM_L2x8_END: ZGEMM_L2x4_BEGIN: andi. T2, M, 7 ble ZGEMM_L2x1_END andi. T1, M, 4 ble ZGEMM_L2x4_END mr BO, BBUFFER srawi. L, K, 3 ble ZGEMM_L2x4_SUB0 cmpwi cr0, L, 1 ble ZGEMM_L2x4_SUB4 ZGEMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 addic. L, L, -2 ble ZGEMM_L2x4_LOOP_END .align 5 ZGEMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 addic. 
L, L, -1 bgt ZGEMM_L2x4_LOOP ZGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_E2 b ZGEMM_L2x4_SUB1 ZGEMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 b ZGEMM_L2x4_SUB1 ZGEMM_L2x4_SUB0: andi. L, K, 7 KERNEL2x4_SUBI1 addic. L, L, -1 ble ZGEMM_L2x4_SAVE b ZGEMM_L2x4_SUB2 ZGEMM_L2x4_SUB1: andi. L, K, 7 ble ZGEMM_L2x4_SAVE ZGEMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 bgt ZGEMM_L2x4_SUB2 ZGEMM_L2x4_SAVE: SAVE2x4 ZGEMM_L2x4_END: ZGEMM_L2x2_BEGIN: andi. T1, M, 2 ble ZGEMM_L2x2_END mr BO, BBUFFER srawi. L, K, 3 ble ZGEMM_L2x2_SUB0 cmpwi cr0, L, 1 ble ZGEMM_L2x2_SUB4 ZGEMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 addic. L, L, -2 ble ZGEMM_L2x2_LOOP_END .align 5 ZGEMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 addic. L, L, -1 bgt ZGEMM_L2x2_LOOP ZGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_E2 b ZGEMM_L2x2_SUB1 ZGEMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 b ZGEMM_L2x2_SUB1 ZGEMM_L2x2_SUB0: andi. L, K, 7 KERNEL2x2_SUBI1 addic. L, L, -1 ble ZGEMM_L2x2_SAVE b ZGEMM_L2x2_SUB2 ZGEMM_L2x2_SUB1: andi. L, K, 7 ble ZGEMM_L2x2_SAVE ZGEMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 bgt ZGEMM_L2x2_SUB2 ZGEMM_L2x2_SAVE: SAVE2x2 ZGEMM_L2x2_END: ZGEMM_L2x1_BEGIN: andi. T1, M, 1 ble ZGEMM_L2x1_END mr BO, BBUFFER srawi. L, K, 3 ble ZGEMM_L2x1_SUB0 cmpwi cr0, L, 1 ble ZGEMM_L2x1_SUB4 ZGEMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 addic. L, L, -2 ble ZGEMM_L2x1_LOOP_END .align 5 ZGEMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 addic. L, L, -1 bgt ZGEMM_L2x1_LOOP ZGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_E2 b ZGEMM_L2x1_SUB1 ZGEMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 b ZGEMM_L2x1_SUB1 ZGEMM_L2x1_SUB0: andi. L, K, 7 KERNEL2x1_SUBI1 addic. L, L, -1 ble ZGEMM_L2x1_SAVE b ZGEMM_L2x1_SUB2 ZGEMM_L2x1_SUB1: andi. L, K, 7 ble ZGEMM_L2x1_SAVE ZGEMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 bgt ZGEMM_L2x1_SUB2 ZGEMM_L2x1_SAVE: SAVE2x1 ZGEMM_L2x1_END: slwi T1, K, 5 add B, B, T1 addic. J, J, -1 bgt ZGEMM_L2_BEGIN andi. T2, N, 1 ble L999 ZGEMM_L2_END: b ZGEMM_L1_BEGIN L999_H1: b L999 ZGEMM_L1_BEGIN: mr BO, B mr BBO, BBUFFER slwi T1, K, 0 ZGEMM_L1_COPYB: dcbtst BBO, PRE lxvdsx vs4, o0, BO // b0_r lxvdsx vs5, o8, BO // b0_i addi BO, BO, 16 stxvd2x vs4, o0, BBO stxvd2x vs5, o16, BBO addic. T1, T1, -1 addi BBO, BBO, 32 bge ZGEMM_L1_COPYB andi. T1, N, 1 ble ZGEMM_L1_END mr CO, C mr AO, A srawi. I, M, 3 ble ZGEMM_L1x8_END ZGEMM_L1x8_BEGIN: mr BO, BBUFFER srawi. L, K, 3 ble ZGEMM_L1x8_SUB0 cmpwi cr0, L, 1 ble ZGEMM_L1x8_SUB4 ZGEMM_L1x8_LOOP_START: dcbt AO, PRE LOAD1x8_1 dcbt AO, PRE KERNEL1x8_I1 dcbt AO, PRE KERNEL1x8_2 dcbt AO, PRE KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 dcbt AO, PRE KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 dcbt AO, PRE KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 addic. 
L, L, -2 ble ZGEMM_L1x8_LOOP_END .align 5 ZGEMM_L1x8_LOOP: dcbt AO, PRE KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 dcbt AO, PRE KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 dcbt AO, PRE KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 dcbt AO, PRE KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 addic. L, L, -1 bgt ZGEMM_L1x8_LOOP ZGEMM_L1x8_LOOP_END: dcbt AO, PRE KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 dcbt AO, PRE KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 dcbt AO, PRE KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 dcbt AO, PRE KERNEL1x8_1 KERNEL1x8_E2 b ZGEMM_L1x8_SUB1 ZGEMM_L1x8_SUB4: dcbt AO, PRE KERNEL1x8_SUBI1 dcbt AO, PRE KERNEL1x8_SUB1 dcbt AO, PRE KERNEL1x8_SUB1 dcbt AO, PRE KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 b ZGEMM_L1x8_SUB1 ZGEMM_L1x8_SUB0: andi. L, K, 7 KERNEL1x8_SUBI1 addic. L, L, -1 ble ZGEMM_L1x8_SAVE b ZGEMM_L1x8_SUB2 ZGEMM_L1x8_SUB1: andi. L, K, 7 ble ZGEMM_L1x8_SAVE ZGEMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 bgt ZGEMM_L1x8_SUB2 ZGEMM_L1x8_SAVE: SAVE1x8 addic. I, I, -1 bgt ZGEMM_L1x8_BEGIN ZGEMM_L1x8_END: ZGEMM_L1x4_BEGIN: andi. T2, M, 7 ble ZGEMM_L1x1_END andi. T1, M, 4 ble ZGEMM_L1x4_END mr BO, BBUFFER srawi. L, K, 3 ble ZGEMM_L1x4_SUB0 cmpwi cr0, L, 1 ble ZGEMM_L1x4_SUB4 ZGEMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 addic. L, L, -2 ble ZGEMM_L1x4_LOOP_END .align 5 ZGEMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 addic. L, L, -1 bgt ZGEMM_L1x4_LOOP ZGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_E2 b ZGEMM_L1x4_SUB1 ZGEMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 b ZGEMM_L1x4_SUB1 ZGEMM_L1x4_SUB0: andi. L, K, 7 KERNEL1x4_SUBI1 addic. L, L, -1 ble ZGEMM_L1x4_SAVE b ZGEMM_L1x4_SUB2 ZGEMM_L1x4_SUB1: andi. L, K, 7 ble ZGEMM_L1x4_SAVE ZGEMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 bgt ZGEMM_L1x4_SUB2 ZGEMM_L1x4_SAVE: SAVE1x4 ZGEMM_L1x4_END: ZGEMM_L1x2_BEGIN: andi. T1, M, 2 ble ZGEMM_L1x2_END mr BO, BBUFFER srawi. L, K, 3 ble ZGEMM_L1x2_SUB0 cmpwi cr0, L, 1 ble ZGEMM_L1x2_SUB4 ZGEMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 addic. L, L, -2 ble ZGEMM_L1x2_LOOP_END .align 5 ZGEMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 addic. L, L, -1 bgt ZGEMM_L1x2_LOOP ZGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_E2 b ZGEMM_L1x2_SUB1 ZGEMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 b ZGEMM_L1x2_SUB1 ZGEMM_L1x2_SUB0: andi. L, K, 7 KERNEL1x2_SUBI1 addic. L, L, -1 ble ZGEMM_L1x2_SAVE b ZGEMM_L1x2_SUB2 ZGEMM_L1x2_SUB1: andi. L, K, 7 ble ZGEMM_L1x2_SAVE ZGEMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 bgt ZGEMM_L1x2_SUB2 ZGEMM_L1x2_SAVE: SAVE1x2 ZGEMM_L1x2_END: ZGEMM_L1x1_BEGIN: andi. T1, M, 1 ble ZGEMM_L1x1_END mr BO, BBUFFER srawi. L, K, 3 ble ZGEMM_L1x1_SUB0 cmpwi cr0, L, 1 ble ZGEMM_L1x1_SUB4 ZGEMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 addic. L, L, -2 ble ZGEMM_L1x1_LOOP_END .align 5 ZGEMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 addic. 
L, L, -1 bgt ZGEMM_L1x1_LOOP ZGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_E2 b ZGEMM_L1x1_SUB1 ZGEMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 b ZGEMM_L1x1_SUB1 ZGEMM_L1x1_SUB0: andi. L, K, 7 KERNEL1x1_SUBI1 addic. L, L, -1 ble ZGEMM_L1x1_SAVE b ZGEMM_L1x1_SUB2 ZGEMM_L1x1_SUB1: andi. L, K, 7 ble ZGEMM_L1x1_SAVE ZGEMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 bgt ZGEMM_L1x1_SUB2 ZGEMM_L1x1_SAVE: SAVE1x1 ZGEMM_L1x1_END: ZGEMM_L1_END: OpenBLAS-0.2.20/kernel/power/zgemm_macros_8x2_power8.S000066400000000000000000002713131313527062700224030ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/04/22 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define XSFADD_R1 xsadddp #define XSFADD_R2 xssubdp #define XSFADD_I1 xsadddp #define XSFADD_I2 xsadddp #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) #define XSFADD_R1 xsadddp #define XSFADD_R2 xsadddp #define XSFADD_I1 xssubdp #define XSFADD_I2 xsadddp #elif defined(NC) || defined(TC) || defined(NR) || defined(TR) #define XSFADD_R1 xsadddp #define XSFADD_R2 xsadddp #define XSFADD_I1 xsadddp #define XSFADD_I2 xssubdp #else // CC || CR || RC || RR #define XSFADD_R1 xsadddp #define XSFADD_R2 xssubdp #define XSFADD_I1 xssubdp #define XSFADD_I2 xssubdp #endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ .macro LOAD2x8_1 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B lxvd2x vs18, o32, BO // load real part from B lxvd2x vs19, o48, BO // load imag part from B addi BO, BO, 64 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs4, o0, AO // load real,imag from A lxvd2x vs5, o16, AO // load real,imag from A lxvd2x vs6, o32, AO // load real,imag from A lxvd2x vs7, o48, AO // load real,imag from A addi AO, AO, 64 .endm .macro KERNEL2x8_I1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A lxvd2x vs10, o32, AO // load real,imag from A lxvd2x vs11, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs12, o0, AO // load real,imag from A lxvd2x vs13, o16, AO // load real,imag from A lxvd2x vs14, o32, AO // load real,imag from A lxvd2x vs15, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs20, o0, BO // load real part from B lxvd2x vs21, o16, BO // load imag part from B lxvd2x vs22, o32, BO // load real part from B lxvd2x vs23, o48, BO // load imag part from B addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag xvmuldp vs36, vs2, vs16 // real*real, imag*real xvmuldp vs37, vs2, vs17 // real*imag, imag*imag xvmuldp vs38, vs3, vs16 // real*real, imag*real xvmuldp vs39, vs3, vs17 // real*imag, imag*imag xvmuldp vs40, vs4, vs16 // real*real, imag*real xvmuldp vs41, vs4, vs17 // real*imag, imag*imag xvmuldp vs42, vs5, vs16 // real*real, imag*real xvmuldp vs43, vs5, vs17 // real*imag, imag*imag xvmuldp vs44, vs6, vs16 // real*real, imag*real xvmuldp vs45, vs6, vs17 // real*imag, imag*imag xvmuldp vs46, vs7, vs16 // real*real, imag*real xvmuldp vs47, vs7, vs17 // real*imag, imag*imag xvmuldp vs48, vs0, vs18 // real*real, imag*real xvmuldp vs49, vs0, vs19 // real*imag, imag*imag xvmuldp vs50, vs1, vs18 // real*real, imag*real xvmuldp vs51, vs1, vs19 // real*imag, imag*imag xvmuldp vs52, vs2, vs18 // real*real, imag*real xvmuldp vs53, vs2, vs19 // real*imag, imag*imag xvmuldp vs54, vs3, vs18 // real*real, imag*real 
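/* Accumulator layout used by the 2x8 kernels (a sketch of the convention, inferred
 * from the inline comments in this macro): each vsN accumulator holds a doubleword
 * pair (a_r*b, a_i*b) for one element of A, where b is either the real or the
 * imaginary part of B.  Even-numbered accumulators multiply against the real part
 * of B (vs16/vs18), odd-numbered ones against the imaginary part (vs17/vs19);
 * vs32-vs47 belong to the first B column, vs48-vs63 to the second.  The pairs are
 * only folded into proper complex results later, in SAVE2x8. */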
xvmuldp vs55, vs3, vs19 // real*imag, imag*imag xvmuldp vs56, vs4, vs18 // real*real, imag*real xvmuldp vs57, vs4, vs19 // real*imag, imag*imag xvmuldp vs58, vs5, vs18 // real*real, imag*real xvmuldp vs59, vs5, vs19 // real*imag, imag*imag xvmuldp vs60, vs6, vs18 // real*real, imag*real xvmuldp vs61, vs6, vs19 // real*imag, imag*imag xvmuldp vs62, vs7, vs18 // real*real, imag*real xvmuldp vs63, vs7, vs19 // real*imag, imag*imag .endm .macro KERNEL2x8_1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A lxvd2x vs10, o32, AO // load real,imag from A lxvd2x vs11, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs12, o0, AO // load real,imag from A lxvd2x vs13, o16, AO // load real,imag from A lxvd2x vs14, o32, AO // load real,imag from A lxvd2x vs15, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs20, o0, BO // load real part from B lxvd2x vs21, o16, BO // load imag part from B lxvd2x vs22, o32, BO // load real part from B lxvd2x vs23, o48, BO // load imag part from B addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs38, vs3, vs16 // real*real, imag*real xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag xvmaddadp vs40, vs4, vs16 // real*real, imag*real xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag xvmaddadp vs42, vs5, vs16 // real*real, imag*real xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag xvmaddadp vs44, vs6, vs16 // real*real, imag*real xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag xvmaddadp vs46, vs7, vs16 // real*real, imag*real xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag xvmaddadp vs48, vs0, vs18 // real*real, imag*real xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag xvmaddadp vs50, vs1, vs18 // real*real, imag*real xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag xvmaddadp vs52, vs2, vs18 // real*real, imag*real xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag xvmaddadp vs54, vs3, vs18 // real*real, imag*real xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag xvmaddadp vs56, vs4, vs18 // real*real, imag*real xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag xvmaddadp vs58, vs5, vs18 // real*real, imag*real xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag xvmaddadp vs60, vs6, vs18 // real*real, imag*real xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag xvmaddadp vs62, vs7, vs18 // real*real, imag*real xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag .endm .macro KERNEL2x8_2 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs4, o0, AO // load real,imag from A lxvd2x vs5, o16, AO // load real,imag from A lxvd2x vs6, o32, AO // load real,imag from A lxvd2x vs7, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B lxvd2x vs18, o32, BO // load real part from B lxvd2x vs19, o48, BO // load imag part from B addi BO, BO, 64 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag xvmaddadp vs36, vs10, vs20 // real*real, imag*real xvmaddadp vs37, vs10, vs21 // 
real*imag, imag*imag xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag xvmaddadp vs40, vs12, vs20 // real*real, imag*real xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag xvmaddadp vs42, vs13, vs20 // real*real, imag*real xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag xvmaddadp vs44, vs14, vs20 // real*real, imag*real xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag xvmaddadp vs46, vs15, vs20 // real*real, imag*real xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag xvmaddadp vs48, vs8, vs22 // real*real, imag*real xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag xvmaddadp vs50, vs9, vs22 // real*real, imag*real xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag xvmaddadp vs52, vs10, vs22 // real*real, imag*real xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag xvmaddadp vs54, vs11, vs22 // real*real, imag*real xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag xvmaddadp vs56, vs12, vs22 // real*real, imag*real xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag xvmaddadp vs58, vs13, vs22 // real*real, imag*real xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag xvmaddadp vs60, vs14, vs22 // real*real, imag*real xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag xvmaddadp vs62, vs15, vs22 // real*real, imag*real xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag .endm .macro KERNEL2x8_E2 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag xvmaddadp vs36, vs10, vs20 // real*real, imag*real xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag xvmaddadp vs40, vs12, vs20 // real*real, imag*real xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag xvmaddadp vs42, vs13, vs20 // real*real, imag*real xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag xvmaddadp vs44, vs14, vs20 // real*real, imag*real xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag xvmaddadp vs46, vs15, vs20 // real*real, imag*real xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag xvmaddadp vs48, vs8, vs22 // real*real, imag*real xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag xvmaddadp vs50, vs9, vs22 // real*real, imag*real xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag xvmaddadp vs52, vs10, vs22 // real*real, imag*real xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag xvmaddadp vs54, vs11, vs22 // real*real, imag*real xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag xvmaddadp vs56, vs12, vs22 // real*real, imag*real xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag xvmaddadp vs58, vs13, vs22 // real*real, imag*real xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag xvmaddadp vs60, vs14, vs22 // real*real, imag*real xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag xvmaddadp vs62, vs15, vs22 // real*real, imag*real xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag .endm .macro KERNEL2x8_SUBI1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs4, o0, AO // load real,imag from A lxvd2x vs5, o16, AO // load real,imag from A lxvd2x vs6, o32, AO // load real,imag from A lxvd2x vs7, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B lxvd2x vs18, o32, BO // load real part from B 
lxvd2x vs19, o48, BO // load imag part from B addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag xvmuldp vs36, vs2, vs16 // real*real, imag*real xvmuldp vs37, vs2, vs17 // real*imag, imag*imag xvmuldp vs38, vs3, vs16 // real*real, imag*real xvmuldp vs39, vs3, vs17 // real*imag, imag*imag xvmuldp vs40, vs4, vs16 // real*real, imag*real xvmuldp vs41, vs4, vs17 // real*imag, imag*imag xvmuldp vs42, vs5, vs16 // real*real, imag*real xvmuldp vs43, vs5, vs17 // real*imag, imag*imag xvmuldp vs44, vs6, vs16 // real*real, imag*real xvmuldp vs45, vs6, vs17 // real*imag, imag*imag xvmuldp vs46, vs7, vs16 // real*real, imag*real xvmuldp vs47, vs7, vs17 // real*imag, imag*imag xvmuldp vs48, vs0, vs18 // real*real, imag*real xvmuldp vs49, vs0, vs19 // real*imag, imag*imag xvmuldp vs50, vs1, vs18 // real*real, imag*real xvmuldp vs51, vs1, vs19 // real*imag, imag*imag xvmuldp vs52, vs2, vs18 // real*real, imag*real xvmuldp vs53, vs2, vs19 // real*imag, imag*imag xvmuldp vs54, vs3, vs18 // real*real, imag*real xvmuldp vs55, vs3, vs19 // real*imag, imag*imag xvmuldp vs56, vs4, vs18 // real*real, imag*real xvmuldp vs57, vs4, vs19 // real*imag, imag*imag xvmuldp vs58, vs5, vs18 // real*real, imag*real xvmuldp vs59, vs5, vs19 // real*imag, imag*imag xvmuldp vs60, vs6, vs18 // real*real, imag*real xvmuldp vs61, vs6, vs19 // real*imag, imag*imag xvmuldp vs62, vs7, vs18 // real*real, imag*real xvmuldp vs63, vs7, vs19 // real*imag, imag*imag .endm .macro KERNEL2x8_SUB1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs4, o0, AO // load real,imag from A lxvd2x vs5, o16, AO // load real,imag from A lxvd2x vs6, o32, AO // load real,imag from A lxvd2x vs7, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B lxvd2x vs18, o32, BO // load real part from B lxvd2x vs19, o48, BO // load imag part from B addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs38, vs3, vs16 // real*real, imag*real xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag xvmaddadp vs40, vs4, vs16 // real*real, imag*real xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag xvmaddadp vs42, vs5, vs16 // real*real, imag*real xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag xvmaddadp vs44, vs6, vs16 // real*real, imag*real xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag xvmaddadp vs46, vs7, vs16 // real*real, imag*real xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag xvmaddadp vs48, vs0, vs18 // real*real, imag*real xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag xvmaddadp vs50, vs1, vs18 // real*real, imag*real xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag xvmaddadp vs52, vs2, vs18 // real*real, imag*real xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag xvmaddadp vs54, vs3, vs18 // real*real, imag*real xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag xvmaddadp vs56, vs4, vs18 // real*real, imag*real xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag xvmaddadp vs58, vs5, vs18 // real*real, 
imag*real xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag xvmaddadp vs60, vs6, vs18 // real*real, imag*real xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag xvmaddadp vs62, vs7, vs18 // real*real, imag*real xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag .endm .macro SAVE2x8 mr T1, CO addi T2, T1, 64 #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 lxvd2x vs17, o16, T1 lxvd2x vs18, o32, T1 lxvd2x vs19, o48, T1 lxvd2x vs20, o0, T2 lxvd2x vs21, o16, T2 lxvd2x vs22, o32, T2 lxvd2x vs23, o48, T2 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs9, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs10, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs11, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, 
vs1, vs1 xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs12, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs13, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs14, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs15, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 xvadddp vs9, vs9, vs17 xvadddp vs10, vs10, vs18 xvadddp vs11, vs11, vs19 xvadddp vs12, vs12, vs20 xvadddp vs13, vs13, vs21 xvadddp vs14, vs14, vs22 xvadddp vs15, vs15, vs23 #endif stxvd2x vs8, o0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 stxvd2x vs12, o0, T2 stxvd2x vs13, o16, T2 stxvd2x vs14, o32, T2 stxvd2x vs15, o48, T2 add T1, T1, LDC add T2, T2, LDC #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 lxvd2x vs17, o16, T1 lxvd2x vs18, o32, T1 lxvd2x vs19, o48, T1 lxvd2x vs20, o0, T2 lxvd2x vs21, 
o16, T2 lxvd2x vs22, o32, T2 lxvd2x vs23, o48, T2 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs48 // realA*realB XSFADD_R2 vs0, vs0, vs49 // imagA*imagB xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs48 // realA*imagB XSFADD_I2 vs1, vs1, vs49 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs50 // realA*realB XSFADD_R2 vs0, vs0, vs51 // imagA*imagB xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs50 // realA*imagB XSFADD_I2 vs1, vs1, vs51 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs9, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs52 // realA*realB XSFADD_R2 vs0, vs0, vs53 // imagA*imagB xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs52 // realA*imagB XSFADD_I2 vs1, vs1, vs53 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs10, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs54 // realA*realB XSFADD_R2 vs0, vs0, vs55 // imagA*imagB xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs54 // realA*imagB XSFADD_I2 vs1, vs1, vs55 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs11, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs57, vs57 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs56 // realA*realB XSFADD_R2 vs0, vs0, vs57 // imagA*imagB xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs56 // realA*imagB XSFADD_I2 vs1, vs1, vs57 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r 
xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs12, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs58 // realA*realB XSFADD_R2 vs0, vs0, vs59 // imagA*imagB xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs58 // realA*imagB XSFADD_I2 vs1, vs1, vs59 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs13, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs60 // realA*realB XSFADD_R2 vs0, vs0, vs61 // imagA*imagB xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs60 // realA*imagB XSFADD_I2 vs1, vs1, vs61 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs14, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs62 // realA*realB XSFADD_R2 vs0, vs0, vs63 // imagA*imagB xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs62 // realA*imagB XSFADD_I2 vs1, vs1, vs63 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs15, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 xvadddp vs9, vs9, vs17 xvadddp vs10, vs10, vs18 xvadddp vs11, vs11, vs19 xvadddp vs12, vs12, vs20 xvadddp vs13, vs13, vs21 xvadddp vs14, vs14, vs22 xvadddp vs15, vs15, vs23 #endif stxvd2x vs8, o0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 stxvd2x vs12, o0, T2 stxvd2x vs13, o16, T2 stxvd2x vs14, o32, T2 stxvd2x vs15, o48, T2 add T1, T1, LDC add T2, T2, LDC addi CO, CO, 128 .endm /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ .macro LOAD2x4_1 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B lxvd2x vs18, o32, BO // load real part from B lxvd2x vs19, o48, BO // load imag part from B addi BO, BO, 64 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load 
real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 .endm .macro KERNEL2x4_I1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A lxvd2x vs10, o32, AO // load real,imag from A lxvd2x vs11, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs20, o0, BO // load real part from B lxvd2x vs21, o16, BO // load imag part from B lxvd2x vs22, o32, BO // load real part from B lxvd2x vs23, o48, BO // load imag part from B addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag xvmuldp vs36, vs2, vs16 // real*real, imag*real xvmuldp vs37, vs2, vs17 // real*imag, imag*imag xvmuldp vs38, vs3, vs16 // real*real, imag*real xvmuldp vs39, vs3, vs17 // real*imag, imag*imag xvmuldp vs40, vs0, vs18 // real*real, imag*real xvmuldp vs41, vs0, vs19 // real*imag, imag*imag xvmuldp vs42, vs1, vs18 // real*real, imag*real xvmuldp vs43, vs1, vs19 // real*imag, imag*imag xvmuldp vs44, vs2, vs18 // real*real, imag*real xvmuldp vs45, vs2, vs19 // real*imag, imag*imag xvmuldp vs46, vs3, vs18 // real*real, imag*real xvmuldp vs47, vs3, vs19 // real*imag, imag*imag .endm .macro KERNEL2x4_1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A lxvd2x vs10, o32, AO // load real,imag from A lxvd2x vs11, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs20, o0, BO // load real part from B lxvd2x vs21, o16, BO // load imag part from B lxvd2x vs22, o32, BO // load real part from B lxvd2x vs23, o48, BO // load imag part from B addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs38, vs3, vs16 // real*real, imag*real xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag xvmaddadp vs40, vs0, vs18 // real*real, imag*real xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag xvmaddadp vs42, vs1, vs18 // real*real, imag*real xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag xvmaddadp vs44, vs2, vs18 // real*real, imag*real xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag xvmaddadp vs46, vs3, vs18 // real*real, imag*real xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag .endm .macro KERNEL2x4_2 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B lxvd2x vs18, o32, BO // load real part from B lxvd2x vs19, o48, BO // load imag part from B addi BO, BO, 64 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag xvmaddadp vs36, vs10, vs20 // real*real, imag*real xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag xvmaddadp vs40, vs8, vs22 // real*real, imag*real xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag xvmaddadp vs42, vs9, vs22 // real*real, imag*real xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag xvmaddadp vs44, vs10, vs22 // real*real, 
imag*real xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag xvmaddadp vs46, vs11, vs22 // real*real, imag*real xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag .endm .macro KERNEL2x4_E2 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag xvmaddadp vs36, vs10, vs20 // real*real, imag*real xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag xvmaddadp vs40, vs8, vs22 // real*real, imag*real xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag xvmaddadp vs42, vs9, vs22 // real*real, imag*real xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag xvmaddadp vs44, vs10, vs22 // real*real, imag*real xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag xvmaddadp vs46, vs11, vs22 // real*real, imag*real xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag .endm .macro KERNEL2x4_SUBI1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B lxvd2x vs18, o32, BO // load real part from B lxvd2x vs19, o48, BO // load imag part from B addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag xvmuldp vs36, vs2, vs16 // real*real, imag*real xvmuldp vs37, vs2, vs17 // real*imag, imag*imag xvmuldp vs38, vs3, vs16 // real*real, imag*real xvmuldp vs39, vs3, vs17 // real*imag, imag*imag xvmuldp vs40, vs0, vs18 // real*real, imag*real xvmuldp vs41, vs0, vs19 // real*imag, imag*imag xvmuldp vs42, vs1, vs18 // real*real, imag*real xvmuldp vs43, vs1, vs19 // real*imag, imag*imag xvmuldp vs44, vs2, vs18 // real*real, imag*real xvmuldp vs45, vs2, vs19 // real*imag, imag*imag xvmuldp vs46, vs3, vs18 // real*real, imag*real xvmuldp vs47, vs3, vs19 // real*imag, imag*imag .endm .macro KERNEL2x4_SUB1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B lxvd2x vs18, o32, BO // load real part from B lxvd2x vs19, o48, BO // load imag part from B addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs38, vs3, vs16 // real*real, imag*real xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag xvmaddadp vs40, vs0, vs18 // real*real, imag*real xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag xvmaddadp vs42, vs1, vs18 // real*real, imag*real xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag xvmaddadp vs44, vs2, vs18 // real*real, imag*real xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag xvmaddadp vs46, vs3, vs18 // real*real, imag*real xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag .endm .macro SAVE2x4 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 lxvd2x vs17, o16, T1 lxvd2x vs18, o32, T1 lxvd2x vs19, o48, T1 #endif 
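/* The code below reduces the paired accumulators into scalar real/imag parts,
 * applies the complex alpha scaling, and (in the non-TRMM case) adds the C values
 * loaded above before storing.  A minimal sketch for one output element, assuming
 * the NN variant (XSFADD_R2 = xssubdp, XSFADD_I1/I2 = xsadddp) and the
 * hypothetical names acc_rr = sum(a_r*b_r), acc_ir = sum(a_i*b_r),
 * acc_ri = sum(a_r*b_i), acc_ii = sum(a_i*b_i):
 *
 *   res_r = acc_rr - acc_ii                    // XSFADD_R1 / XSFADD_R2
 *   res_i = acc_ri + acc_ir                    // XSFADD_I1 / XSFADD_I2
 *   c_r   = res_r*alpha_r - res_i*alpha_i      // plus old C_r unless TRMMKERNEL
 *   c_i   = res_r*alpha_i + res_i*alpha_r      // plus old C_i unless TRMMKERNEL
 *
 * The conjugated variants only change which of the four partial sums are added
 * or subtracted, via the XSFADD_* defines near the top of this file. */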
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs9, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs10, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs11, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 xvadddp vs9, vs9, vs17 xvadddp vs10, vs10, vs18 xvadddp vs11, vs11, vs19 #endif stxvd2x vs8, o0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 lxvd2x vs17, o16, T1 lxvd2x vs18, o32, T1 lxvd2x vs19, o48, T1 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB 
xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs9, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs10, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs11, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 xvadddp vs9, vs9, vs17 xvadddp vs10, vs10, vs18 xvadddp vs11, vs11, vs19 #endif stxvd2x vs8, o0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 add T1, T1, LDC addi CO, CO, 64 .endm /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ .macro LOAD2x2_1 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B lxvd2x vs18, o32, BO // load real part from B lxvd2x vs19, o48, BO // load imag part from B addi BO, BO, 64 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x 
vs1, o16, AO // load real,imag from A addi AO, AO, 32 .endm .macro KERNEL2x2_I1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A addi AO, AO, 32 lxvd2x vs20, o0, BO // load real part from B lxvd2x vs21, o16, BO // load imag part from B lxvd2x vs22, o32, BO // load real part from B lxvd2x vs23, o48, BO // load imag part from B addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag xvmuldp vs36, vs0, vs18 // real*real, imag*real xvmuldp vs37, vs0, vs19 // real*imag, imag*imag xvmuldp vs38, vs1, vs18 // real*real, imag*real xvmuldp vs39, vs1, vs19 // real*imag, imag*imag .endm .macro KERNEL2x2_1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A addi AO, AO, 32 lxvd2x vs20, o0, BO // load real part from B lxvd2x vs21, o16, BO // load imag part from B lxvd2x vs22, o32, BO // load real part from B lxvd2x vs23, o48, BO // load imag part from B addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag xvmaddadp vs36, vs0, vs18 // real*real, imag*real xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag xvmaddadp vs38, vs1, vs18 // real*real, imag*real xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag .endm .macro KERNEL2x2_2 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A addi AO, AO, 32 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B lxvd2x vs18, o32, BO // load real part from B lxvd2x vs19, o48, BO // load imag part from B addi BO, BO, 64 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag xvmaddadp vs36, vs8, vs22 // real*real, imag*real xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag xvmaddadp vs38, vs9, vs22 // real*real, imag*real xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag .endm .macro KERNEL2x2_E2 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag xvmaddadp vs36, vs8, vs22 // real*real, imag*real xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag xvmaddadp vs38, vs9, vs22 // real*real, imag*real xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag .endm .macro KERNEL2x2_SUBI1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A addi AO, AO, 32 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B lxvd2x vs18, o32, BO // load real part from B lxvd2x vs19, o48, BO // load imag part from B addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag xvmuldp vs36, vs0, vs18 // real*real, imag*real xvmuldp vs37, vs0, vs19 // real*imag, imag*imag xvmuldp vs38, vs1, vs18 // real*real, imag*real xvmuldp vs39, vs1, vs19 // real*imag, imag*imag .endm .macro KERNEL2x2_SUB1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A addi AO, AO, 32 lxvd2x vs16, o0, BO // load real part from B lxvd2x 
vs17, o16, BO // load imag part from B lxvd2x vs18, o32, BO // load real part from B lxvd2x vs19, o48, BO // load imag part from B addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag xvmaddadp vs36, vs0, vs18 // real*real, imag*real xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag xvmaddadp vs38, vs1, vs18 // real*real, imag*real xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag .endm .macro SAVE2x2 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 lxvd2x vs17, o16, T1 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs9, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 xvadddp vs9, vs9, vs17 #endif stxvd2x vs8, o0, T1 stxvd2x vs9, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 lxvd2x vs17, o16, T1 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB xsmuldp vs4, vs0, 
alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs9, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 xvadddp vs9, vs9, vs17 #endif stxvd2x vs8, o0, T1 stxvd2x vs9, o16, T1 add T1, T1, LDC addi CO, CO, 32 .endm /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ .macro LOAD2x1_1 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B lxvd2x vs18, o32, BO // load real part from B lxvd2x vs19, o48, BO // load imag part from B addi BO, BO, 64 lxvd2x vs0, o0, AO // load real,imag from A addi AO, AO, 16 .endm .macro KERNEL2x1_I1 lxvd2x vs8, o0, AO // load real,imag from A addi AO, AO, 16 lxvd2x vs20, o0, BO // load real part from B lxvd2x vs21, o16, BO // load imag part from B lxvd2x vs22, o32, BO // load real part from B lxvd2x vs23, o48, BO // load imag part from B addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs0, vs18 // real*real, imag*real xvmuldp vs35, vs0, vs19 // real*imag, imag*imag .endm .macro KERNEL2x1_1 lxvd2x vs8, o0, AO // load real,imag from A addi AO, AO, 16 lxvd2x vs20, o0, BO // load real part from B lxvd2x vs21, o16, BO // load imag part from B lxvd2x vs22, o32, BO // load real part from B lxvd2x vs23, o48, BO // load imag part from B addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs0, vs18 // real*real, imag*real xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag .endm .macro KERNEL2x1_2 lxvd2x vs0, o0, AO // load real,imag from A addi AO, AO, 16 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B lxvd2x vs18, o32, BO // load real part from B lxvd2x vs19, o48, BO // load imag part from B addi BO, BO, 64 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs8, vs22 // real*real, imag*real xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag .endm .macro KERNEL2x1_E2 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs8, vs22 // real*real, imag*real xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag .endm .macro KERNEL2x1_SUBI1 lxvd2x vs0, o0, AO // load real,imag from A addi AO, AO, 16 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B lxvd2x vs18, o32, BO // load real part from B lxvd2x vs19, o48, BO // load imag part from B addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs0, vs18 // real*real, imag*real xvmuldp vs35, vs0, vs19 // real*imag, imag*imag .endm .macro KERNEL2x1_SUB1 lxvd2x vs0, o0, AO // load real,imag from A addi AO, AO, 16 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B lxvd2x vs18, o32, BO // load real part from B lxvd2x vs19, o48, BO // load imag part from B addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs0, vs18 // real*real, imag*real xvmaddadp 
vs35, vs0, vs19 // real*imag, imag*imag .endm .macro SAVE2x1 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 #endif stxvd2x vs8, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 #endif stxvd2x vs8, o0, T1 add T1, T1, LDC addi CO, CO, 16 .endm /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ .macro LOAD1x8_1 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs4, o0, AO // load real,imag from A lxvd2x vs5, o16, AO // load real,imag from A lxvd2x vs6, o32, AO // load real,imag from A lxvd2x vs7, o48, AO // load real,imag from A addi AO, AO, 64 .endm .macro KERNEL1x8_I1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A lxvd2x vs10, o32, AO // load real,imag from A lxvd2x vs11, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs12, o0, AO // load real,imag from A lxvd2x vs13, o16, AO // load real,imag from A lxvd2x vs14, o32, AO // load real,imag from A lxvd2x vs15, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs20, o0, BO // load real part from B lxvd2x vs21, o16, BO // load imag part from B addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag xvmuldp vs36, vs2, vs16 // real*real, imag*real xvmuldp vs37, vs2, vs17 // real*imag, imag*imag xvmuldp vs38, vs3, vs16 // real*real, imag*real xvmuldp vs39, vs3, vs17 // real*imag, imag*imag xvmuldp vs40, vs4, vs16 // real*real, 
imag*real xvmuldp vs41, vs4, vs17 // real*imag, imag*imag xvmuldp vs42, vs5, vs16 // real*real, imag*real xvmuldp vs43, vs5, vs17 // real*imag, imag*imag xvmuldp vs44, vs6, vs16 // real*real, imag*real xvmuldp vs45, vs6, vs17 // real*imag, imag*imag xvmuldp vs46, vs7, vs16 // real*real, imag*real xvmuldp vs47, vs7, vs17 // real*imag, imag*imag .endm .macro KERNEL1x8_1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A lxvd2x vs10, o32, AO // load real,imag from A lxvd2x vs11, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs12, o0, AO // load real,imag from A lxvd2x vs13, o16, AO // load real,imag from A lxvd2x vs14, o32, AO // load real,imag from A lxvd2x vs15, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs20, o0, BO // load real part from B lxvd2x vs21, o16, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs38, vs3, vs16 // real*real, imag*real xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag xvmaddadp vs40, vs4, vs16 // real*real, imag*real xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag xvmaddadp vs42, vs5, vs16 // real*real, imag*real xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag xvmaddadp vs44, vs6, vs16 // real*real, imag*real xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag xvmaddadp vs46, vs7, vs16 // real*real, imag*real xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag .endm .macro KERNEL1x8_2 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs4, o0, AO // load real,imag from A lxvd2x vs5, o16, AO // load real,imag from A lxvd2x vs6, o32, AO // load real,imag from A lxvd2x vs7, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag xvmaddadp vs36, vs10, vs20 // real*real, imag*real xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag xvmaddadp vs40, vs12, vs20 // real*real, imag*real xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag xvmaddadp vs42, vs13, vs20 // real*real, imag*real xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag xvmaddadp vs44, vs14, vs20 // real*real, imag*real xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag xvmaddadp vs46, vs15, vs20 // real*real, imag*real xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag .endm .macro KERNEL1x8_E2 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag xvmaddadp vs36, vs10, vs20 // real*real, imag*real xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag xvmaddadp vs40, vs12, vs20 // real*real, imag*real xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag xvmaddadp 
vs42, vs13, vs20 // real*real, imag*real xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag xvmaddadp vs44, vs14, vs20 // real*real, imag*real xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag xvmaddadp vs46, vs15, vs20 // real*real, imag*real xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag .endm .macro KERNEL1x8_SUBI1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs4, o0, AO // load real,imag from A lxvd2x vs5, o16, AO // load real,imag from A lxvd2x vs6, o32, AO // load real,imag from A lxvd2x vs7, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag xvmuldp vs36, vs2, vs16 // real*real, imag*real xvmuldp vs37, vs2, vs17 // real*imag, imag*imag xvmuldp vs38, vs3, vs16 // real*real, imag*real xvmuldp vs39, vs3, vs17 // real*imag, imag*imag xvmuldp vs40, vs4, vs16 // real*real, imag*real xvmuldp vs41, vs4, vs17 // real*imag, imag*imag xvmuldp vs42, vs5, vs16 // real*real, imag*real xvmuldp vs43, vs5, vs17 // real*imag, imag*imag xvmuldp vs44, vs6, vs16 // real*real, imag*real xvmuldp vs45, vs6, vs17 // real*imag, imag*imag xvmuldp vs46, vs7, vs16 // real*real, imag*real xvmuldp vs47, vs7, vs17 // real*imag, imag*imag .endm .macro KERNEL1x8_SUB1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs4, o0, AO // load real,imag from A lxvd2x vs5, o16, AO // load real,imag from A lxvd2x vs6, o32, AO // load real,imag from A lxvd2x vs7, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs38, vs3, vs16 // real*real, imag*real xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag xvmaddadp vs40, vs4, vs16 // real*real, imag*real xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag xvmaddadp vs42, vs5, vs16 // real*real, imag*real xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag xvmaddadp vs44, vs6, vs16 // real*real, imag*real xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag xvmaddadp vs46, vs7, vs16 // real*real, imag*real xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag .endm .macro SAVE1x8 mr T1, CO addi T2, T1, 64 #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 lxvd2x vs17, o16, T1 lxvd2x vs18, o32, T1 lxvd2x vs19, o48, T1 lxvd2x vs20, o0, T2 lxvd2x vs21, o16, T2 lxvd2x vs22, o32, T2 lxvd2x vs23, o48, T2 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 
vs1, vs1, vs33 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs9, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs10, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs11, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs12, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, 
vs0, vs43 // imagA*imagB xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs13, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs14, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs15, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 xvadddp vs9, vs9, vs17 xvadddp vs10, vs10, vs18 xvadddp vs11, vs11, vs19 xvadddp vs12, vs12, vs20 xvadddp vs13, vs13, vs21 xvadddp vs14, vs14, vs22 xvadddp vs15, vs15, vs23 #endif stxvd2x vs8, o0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 stxvd2x vs12, o0, T2 stxvd2x vs13, o16, T2 stxvd2x vs14, o32, T2 stxvd2x vs15, o48, T2 add T1, T1, LDC add T2, T2, LDC addi CO, CO, 128 .endm /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ .macro LOAD1x4_1 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 .endm .macro KERNEL1x4_I1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A lxvd2x vs10, o32, AO // load real,imag from A lxvd2x vs11, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs20, o0, BO // load real part from B lxvd2x vs21, o16, BO // load imag part from B addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real 
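// Layout note: each of the vs0..vs15 registers holds one complex element
// of A as a (real,imag) doubleword pair, while vs16..vs23 hold B values
// whose real and imaginary parts have each been duplicated across both
// lanes (presumably by the ZCOPYB_* copy macros further below).
// An even/odd accumulator pair therefore collects (a_r*b_r, a_i*b_r)
// and (a_r*b_i, a_i*b_i); the SAVE* macros reduce these pairs to the
// final complex result.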
xvmuldp vs35, vs1, vs17 // real*imag, imag*imag xvmuldp vs36, vs2, vs16 // real*real, imag*real xvmuldp vs37, vs2, vs17 // real*imag, imag*imag xvmuldp vs38, vs3, vs16 // real*real, imag*real xvmuldp vs39, vs3, vs17 // real*imag, imag*imag .endm .macro KERNEL1x4_1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A lxvd2x vs10, o32, AO // load real,imag from A lxvd2x vs11, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs20, o0, BO // load real part from B lxvd2x vs21, o16, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs38, vs3, vs16 // real*real, imag*real xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag .endm .macro KERNEL1x4_2 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag xvmaddadp vs36, vs10, vs20 // real*real, imag*real xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag .endm .macro KERNEL1x4_E2 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag xvmaddadp vs36, vs10, vs20 // real*real, imag*real xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag .endm .macro KERNEL1x4_SUBI1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag xvmuldp vs36, vs2, vs16 // real*real, imag*real xvmuldp vs37, vs2, vs17 // real*imag, imag*imag xvmuldp vs38, vs3, vs16 // real*real, imag*real xvmuldp vs39, vs3, vs17 // real*imag, imag*imag .endm .macro KERNEL1x4_SUB1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs38, vs3, vs16 // real*real, imag*real xvmaddadp vs39, vs3, vs17 // 
real*imag, imag*imag .endm .macro SAVE1x4 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 lxvd2x vs17, o16, T1 lxvd2x vs18, o32, T1 lxvd2x vs19, o48, T1 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs9, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs10, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs11, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 xvadddp vs9, vs9, vs17 xvadddp vs10, vs10, vs18 xvadddp vs11, vs11, vs19 #endif stxvd2x vs8, o0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 add T1, T1, LDC addi CO, CO, 64 .endm /********************************************************************************************** * Macros for N=1 and M=2 
**********************************************************************************************/ .macro LOAD1x2_1 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A addi AO, AO, 32 .endm .macro KERNEL1x2_I1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A addi AO, AO, 32 lxvd2x vs20, o0, BO // load real part from B lxvd2x vs21, o16, BO // load imag part from B addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag .endm .macro KERNEL1x2_1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A addi AO, AO, 32 lxvd2x vs20, o0, BO // load real part from B lxvd2x vs21, o16, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag .endm .macro KERNEL1x2_2 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A addi AO, AO, 32 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag .endm .macro KERNEL1x2_E2 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag .endm .macro KERNEL1x2_SUBI1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A addi AO, AO, 32 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag .endm .macro KERNEL1x2_SUB1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A addi AO, AO, 32 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag .endm .macro SAVE1x2 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 lxvd2x vs17, o16, T1 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 
// merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs9, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 xvadddp vs9, vs9, vs17 #endif stxvd2x vs8, o0, T1 stxvd2x vs9, o16, T1 add T1, T1, LDC addi CO, CO, 32 .endm /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ .macro LOAD1x1_1 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A addi AO, AO, 16 .endm .macro KERNEL1x1_I1 lxvd2x vs8, o0, AO // load real,imag from A addi AO, AO, 16 lxvd2x vs20, o0, BO // load real part from B lxvd2x vs21, o16, BO // load imag part from B addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag .endm .macro KERNEL1x1_1 lxvd2x vs8, o0, AO // load real,imag from A addi AO, AO, 16 lxvd2x vs20, o0, BO // load real part from B lxvd2x vs21, o16, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag .endm .macro KERNEL1x1_2 lxvd2x vs0, o0, AO // load real,imag from A addi AO, AO, 16 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag .endm .macro KERNEL1x1_E2 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag .endm .macro KERNEL1x1_SUBI1 lxvd2x vs0, o0, AO // load real,imag from A addi AO, AO, 16 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag .endm .macro KERNEL1x1_SUB1 lxvd2x vs0, o0, AO // load real,imag from A addi AO, AO, 16 lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag .endm .macro SAVE1x1 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i 
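// Complex alpha scaling: with the product p = (p_r, p_i) already reduced
// into vs0/vs1, the surrounding instructions compute
//   c_r = p_r*alpha_r - p_i*alpha_i
//   c_i = p_r*alpha_i + p_i*alpha_r
// and xxpermdi packs (c_r, c_i) back into a single vector before the
// optional accumulation with C (non-TRMM path) and the store.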
xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 #endif stxvd2x vs8, o0, T1 add T1, T1, LDC addi CO, CO, 16 .endm .macro ZCOPYB_1x1 lxvdsx vs4, o0, BO // b0_r lxvdsx vs5, o8, BO // b0_i addi BO, BO, 16 stxvd2x vs4, o0, BBO stxvd2x vs5, o16, BBO addi BBO, BBO, 32 .endm .macro ZCOPYB_8x1 lxvd2x vs32, o0, BO lxvd2x vs33, o16, BO lxvd2x vs34, o32, BO lxvd2x vs35, o48, BO addi BO, BO, 64 lxvd2x vs36, o0, BO lxvd2x vs37, o16, BO lxvd2x vs38, o32, BO lxvd2x vs39, o48, BO addi BO, BO, 64 xxspltd vs40, vs32, 0 xxspltd vs41, vs32, 1 xxspltd vs42, vs33, 0 xxspltd vs43, vs33, 1 xxspltd vs44, vs34, 0 xxspltd vs45, vs34, 1 xxspltd vs46, vs35, 0 xxspltd vs47, vs35, 1 xxspltd vs48, vs36, 0 xxspltd vs49, vs36, 1 xxspltd vs50, vs37, 0 xxspltd vs51, vs37, 1 xxspltd vs52, vs38, 0 xxspltd vs53, vs38, 1 xxspltd vs54, vs39, 0 xxspltd vs55, vs39, 1 stxvd2x vs40, o0, BBO stxvd2x vs41, o16, BBO stxvd2x vs42, o32, BBO stxvd2x vs43, o48, BBO addi BBO, BBO, 64 stxvd2x vs44, o0, BBO stxvd2x vs45, o16, BBO stxvd2x vs46, o32, BBO stxvd2x vs47, o48, BBO addi BBO, BBO, 64 stxvd2x vs48, o0, BBO stxvd2x vs49, o16, BBO stxvd2x vs50, o32, BBO stxvd2x vs51, o48, BBO addi BBO, BBO, 64 stxvd2x vs52, o0, BBO stxvd2x vs53, o16, BBO stxvd2x vs54, o32, BBO stxvd2x vs55, o48, BBO addi BBO, BBO, 64 .endm OpenBLAS-0.2.20/kernel/power/zgemm_ncopy_hummer_2.S000066400000000000000000000211141313527062700220300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M r3 #define N r4 #define A r5 #define LDA r6 #define B r7 #define AO1 r8 #define AO2 r9 #define J r12 #define INC r30 #define INC2 r31 #define c01 f0 #define c02 f1 #define c03 f2 #define c04 f3 #define c05 f4 #define c06 f5 #define c07 f6 #define c08 f7 #define c09 f8 #define c10 f9 #define c11 f10 #define c12 f11 #define c13 f12 #define c14 f13 #define c15 f14 #define c16 f15 PROLOGUE PROFCODE li r0, -16 stfpdux f14, SP, r0 stfpdux f15, SP, r0 stwu r31, -4(SP) stwu r30, -4(SP) slwi LDA, LDA, ZBASE_SHIFT cmpwi cr0, M, 0 ble- LL(99) cmpwi cr0, N, 0 ble- LL(99) li INC, 1 * SIZE li INC2, 2 * SIZE subi B, B, 2 * SIZE andi. r0, A, 2 * SIZE - 1 bne LL(100) subi A, A, 2 * SIZE srawi. J, N, 1 ble LL(20) .align 4 LL(11): mr AO1, A add AO2, A, LDA add A, AO2, LDA srawi. r0, M, 3 mtspr CTR, r0 ble LL(15) .align 4 LL(12): LFPDUX c01, AO1, INC2 LFPDUX c02, AO2, INC2 LFPDUX c03, AO1, INC2 LFPDUX c04, AO2, INC2 LFPDUX c05, AO1, INC2 LFPDUX c06, AO2, INC2 LFPDUX c07, AO1, INC2 LFPDUX c08, AO2, INC2 LFPDUX c09, AO1, INC2 LFPDUX c10, AO2, INC2 LFPDUX c11, AO1, INC2 LFPDUX c12, AO2, INC2 LFPDUX c13, AO1, INC2 LFPDUX c14, AO2, INC2 LFPDUX c15, AO1, INC2 LFPDUX c16, AO2, INC2 STFPDUX c01, B, INC2 STFPDUX c02, B, INC2 STFPDUX c03, B, INC2 STFPDUX c04, B, INC2 STFPDUX c05, B, INC2 STFPDUX c06, B, INC2 STFPDUX c07, B, INC2 STFPDUX c08, B, INC2 STFPDUX c09, B, INC2 STFPDUX c10, B, INC2 STFPDUX c11, B, INC2 STFPDUX c12, B, INC2 STFPDUX c13, B, INC2 STFPDUX c14, B, INC2 STFPDUX c15, B, INC2 STFPDUX c16, B, INC2 bdnz LL(12) .align 4 LL(15): andi. r0, M, 7 ble LL(19) andi. r0, M, 4 beq LL(16) LFPDUX c01, AO1, INC2 LFPDUX c02, AO2, INC2 LFPDUX c03, AO1, INC2 LFPDUX c04, AO2, INC2 LFPDUX c05, AO1, INC2 LFPDUX c06, AO2, INC2 LFPDUX c07, AO1, INC2 LFPDUX c08, AO2, INC2 STFPDUX c01, B, INC2 STFPDUX c02, B, INC2 STFPDUX c03, B, INC2 STFPDUX c04, B, INC2 STFPDUX c05, B, INC2 STFPDUX c06, B, INC2 STFPDUX c07, B, INC2 STFPDUX c08, B, INC2 .align 4 LL(16): andi. r0, M, 2 beq LL(17) LFPDUX c01, AO1, INC2 LFPDUX c02, AO2, INC2 LFPDUX c03, AO1, INC2 LFPDUX c04, AO2, INC2 STFPDUX c01, B, INC2 STFPDUX c02, B, INC2 STFPDUX c03, B, INC2 STFPDUX c04, B, INC2 .align 4 LL(17): andi. r0, M, 1 beq LL(19) LFPDUX c01, AO1, INC2 LFPDUX c02, AO2, INC2 STFPDUX c01, B, INC2 STFPDUX c02, B, INC2 .align 4 LL(19): addic. J, J, -1 bgt LL(11) .align 4 LL(20): andi. J, N, 1 ble LL(99) mr AO1, A srawi. r0, M, 2 mtspr CTR, r0 ble LL(25) .align 4 LL(22): LFPDUX c01, AO1, INC2 LFPDUX c03, AO1, INC2 LFPDUX c05, AO1, INC2 LFPDUX c07, AO1, INC2 STFPDUX c01, B, INC2 STFPDUX c03, B, INC2 STFPDUX c05, B, INC2 STFPDUX c07, B, INC2 bdnz LL(22) .align 4 LL(25): andi. r0, M, 3 ble LL(99) andi. r0, M, 2 beq LL(27) LFPDUX c01, AO1, INC2 LFPDUX c03, AO1, INC2 STFPDUX c01, B, INC2 STFPDUX c03, B, INC2 .align 4 LL(27): andi. r0, M, 1 beq LL(99) LFPDUX c01, AO1, INC2 STFPDUX c01, B, INC2 .align 4 LL(99): addi SP, SP, -4 lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr .align 4 LL(100): subi A, A, 1 * SIZE srawi. J, N, 1 ble LL(120) .align 4 LL(111): mr AO1, A add AO2, A, LDA add A, AO2, LDA srawi. 
r0, M, 2 mtspr CTR, r0 ble LL(115) .align 4 LL(112): LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO2, INC LFDUX c04, AO2, INC LFDUX c05, AO1, INC LFDUX c06, AO1, INC LFDUX c07, AO2, INC LFDUX c08, AO2, INC LFDUX c09, AO1, INC LFDUX c10, AO1, INC LFDUX c11, AO2, INC LFDUX c12, AO2, INC fsmfp c01, c02 LFDUX c13, AO1, INC fsmfp c03, c04 LFDUX c14, AO1, INC fsmfp c05, c06 LFDUX c15, AO2, INC fsmfp c07, c08 LFDUX c16, AO2, INC fsmfp c09, c10 STFPDUX c01, B, INC2 fsmfp c11, c12 STFPDUX c03, B, INC2 fsmfp c13, c14 STFPDUX c05, B, INC2 fsmfp c15, c16 STFPDUX c07, B, INC2 STFPDUX c09, B, INC2 STFPDUX c11, B, INC2 STFPDUX c13, B, INC2 STFPDUX c15, B, INC2 bdnz LL(112) .align 4 LL(115): andi. r0, M, 3 ble LL(119) andi. r0, M, 2 beq LL(117) LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO2, INC LFDUX c04, AO2, INC LFDUX c05, AO1, INC LFDUX c06, AO1, INC LFDUX c07, AO2, INC LFDUX c08, AO2, INC fsmfp c01, c02 fsmfp c03, c04 fsmfp c05, c06 fsmfp c07, c08 STFPDUX c01, B, INC2 STFPDUX c03, B, INC2 STFPDUX c05, B, INC2 STFPDUX c07, B, INC2 .align 4 LL(117): andi. r0, M, 1 beq LL(119) LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO2, INC LFDUX c04, AO2, INC fsmfp c01, c02 fsmfp c03, c04 STFPDUX c01, B, INC2 STFPDUX c03, B, INC2 .align 4 LL(119): addic. J, J, -1 bgt LL(111) .align 4 LL(120): andi. J, N, 1 ble LL(999) mr AO1, A srawi. r0, M, 2 mtspr CTR, r0 ble LL(125) .align 4 LL(122): LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFDUX c05, AO1, INC LFDUX c06, AO1, INC LFDUX c07, AO1, INC LFDUX c08, AO1, INC fsmfp c01, c02 fsmfp c03, c04 fsmfp c05, c06 fsmfp c07, c08 STFPDUX c01, B, INC2 STFPDUX c03, B, INC2 STFPDUX c05, B, INC2 STFPDUX c07, B, INC2 bdnz LL(122) .align 4 LL(125): andi. r0, M, 3 ble LL(999) andi. r0, M, 2 beq LL(127) LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC fsmfp c01, c02 fsmfp c03, c04 STFPDUX c01, B, INC2 STFPDUX c03, B, INC2 .align 4 LL(127): andi. r0, M, 1 beq LL(999) LFDUX c01, AO1, INC LFDUX c02, AO1, INC fsmfp c01, c02 STFPDUX c01, B, INC2 .align 4 LL(999): addi SP, SP, -4 lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zgemm_ncopy_hummer_4.S000066400000000000000000000277441313527062700220510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M r3 #define N r4 #define A r5 #define LDA r6 #define B r7 #define AO1 r8 #define AO2 r9 #define AO3 r10 #define AO4 r11 #define J r12 #define INC r30 #define INC2 r31 #define c01 f0 #define c02 f1 #define c03 f2 #define c04 f3 #define c05 f4 #define c06 f5 #define c07 f6 #define c08 f7 #define c09 f8 #define c10 f9 #define c11 f10 #define c12 f11 #define c13 f12 #define c14 f13 #define c15 f14 #define c16 f15 PROLOGUE PROFCODE li r0, -16 stfpdux f14, SP, r0 stfpdux f15, SP, r0 stwu r31, -4(SP) stwu r30, -4(SP) slwi LDA, LDA, ZBASE_SHIFT cmpwi cr0, M, 0 ble- LL(99) cmpwi cr0, N, 0 ble- LL(99) li INC, 1 * SIZE li INC2, 2 * SIZE subi B, B, 2 * SIZE andi. r0, A, 2 * SIZE - 1 bne LL(100) subi A, A, 2 * SIZE srawi. J, N, 2 ble LL(20) .align 4 LL(11): mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA srawi. r0, M, 2 mtspr CTR, r0 ble LL(15) .align 4 LL(12): LFPDUX c01, AO1, INC2 LFPDUX c05, AO2, INC2 LFPDUX c09, AO3, INC2 LFPDUX c13, AO4, INC2 LFPDUX c02, AO1, INC2 LFPDUX c06, AO2, INC2 LFPDUX c10, AO3, INC2 LFPDUX c14, AO4, INC2 LFPDUX c03, AO1, INC2 LFPDUX c07, AO2, INC2 LFPDUX c11, AO3, INC2 LFPDUX c15, AO4, INC2 LFPDUX c04, AO1, INC2 LFPDUX c08, AO2, INC2 LFPDUX c12, AO3, INC2 LFPDUX c16, AO4, INC2 STFPDUX c01, B, INC2 STFPDUX c05, B, INC2 STFPDUX c09, B, INC2 STFPDUX c13, B, INC2 STFPDUX c02, B, INC2 STFPDUX c06, B, INC2 STFPDUX c10, B, INC2 STFPDUX c14, B, INC2 STFPDUX c03, B, INC2 STFPDUX c07, B, INC2 STFPDUX c11, B, INC2 STFPDUX c15, B, INC2 STFPDUX c04, B, INC2 STFPDUX c08, B, INC2 STFPDUX c12, B, INC2 STFPDUX c16, B, INC2 bdnz LL(12) .align 4 LL(15): andi. r0, M, 3 ble LL(19) andi. r0, M, 2 beq LL(17) LFPDUX c01, AO1, INC2 LFPDUX c05, AO2, INC2 LFPDUX c09, AO3, INC2 LFPDUX c13, AO4, INC2 LFPDUX c02, AO1, INC2 LFPDUX c06, AO2, INC2 LFPDUX c10, AO3, INC2 LFPDUX c14, AO4, INC2 STFPDUX c01, B, INC2 STFPDUX c05, B, INC2 STFPDUX c09, B, INC2 STFPDUX c13, B, INC2 STFPDUX c02, B, INC2 STFPDUX c06, B, INC2 STFPDUX c10, B, INC2 STFPDUX c14, B, INC2 .align 4 LL(17): andi. r0, M, 1 beq LL(19) LFPDUX c01, AO1, INC2 LFPDUX c05, AO2, INC2 LFPDUX c09, AO3, INC2 LFPDUX c13, AO4, INC2 STFPDUX c01, B, INC2 STFPDUX c05, B, INC2 STFPDUX c09, B, INC2 STFPDUX c13, B, INC2 .align 4 LL(19): addic. J, J, -1 bgt LL(11) .align 4 LL(20): andi. J, N, 2 ble LL(30) mr AO1, A add AO2, A, LDA add A, AO2, LDA srawi. 
r0, M, 2 mtspr CTR, r0 ble LL(25) .align 4 LL(22): LFPDUX c01, AO1, INC2 LFPDUX c05, AO2, INC2 LFPDUX c02, AO1, INC2 LFPDUX c06, AO2, INC2 LFPDUX c03, AO1, INC2 LFPDUX c07, AO2, INC2 LFPDUX c04, AO1, INC2 LFPDUX c08, AO2, INC2 STFPDUX c01, B, INC2 STFPDUX c05, B, INC2 STFPDUX c02, B, INC2 STFPDUX c06, B, INC2 STFPDUX c03, B, INC2 STFPDUX c07, B, INC2 STFPDUX c04, B, INC2 STFPDUX c08, B, INC2 bdnz LL(22) .align 4 LL(25): andi. r0, M, 3 ble LL(30) andi. r0, M, 2 beq LL(27) LFPDUX c01, AO1, INC2 LFPDUX c05, AO2, INC2 LFPDUX c02, AO1, INC2 LFPDUX c06, AO2, INC2 STFPDUX c01, B, INC2 STFPDUX c05, B, INC2 STFPDUX c02, B, INC2 STFPDUX c06, B, INC2 .align 4 LL(27): andi. r0, M, 1 beq LL(30) LFPDUX c01, AO1, INC2 LFPDUX c05, AO2, INC2 STFPDUX c01, B, INC2 STFPDUX c05, B, INC2 .align 4 LL(30): andi. J, N, 1 ble LL(99) mr AO1, A srawi. r0, M, 2 mtspr CTR, r0 ble LL(35) .align 4 LL(32): LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO1, INC2 LFPDUX c04, AO1, INC2 STFPDUX c01, B, INC2 STFPDUX c02, B, INC2 STFPDUX c03, B, INC2 STFPDUX c04, B, INC2 bdnz LL(32) .align 4 LL(35): andi. r0, M, 3 ble LL(99) andi. r0, M, 2 beq LL(37) LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 STFPDUX c01, B, INC2 STFPDUX c02, B, INC2 .align 4 LL(37): andi. r0, M, 1 beq LL(99) LFPDUX c01, AO1, INC2 STFPDUX c01, B, INC2 .align 4 LL(99): addi SP, SP, -4 lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr .align 4 LL(100): subi A, A, 1 * SIZE srawi. J, N, 2 ble LL(120) .align 4 LL(111): mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA srawi. r0, M, 2 mtspr CTR, r0 ble LL(115) .align 4 LL(112): LFDUX c01, AO1, INC LFDUX c05, AO2, INC LFDUX c09, AO3, INC LFDUX c13, AO4, INC LFSDUX c01, AO1, INC LFSDUX c05, AO2, INC LFSDUX c09, AO3, INC LFSDUX c13, AO4, INC LFDUX c02, AO1, INC LFDUX c06, AO2, INC LFDUX c10, AO3, INC LFDUX c14, AO4, INC LFSDUX c02, AO1, INC LFSDUX c06, AO2, INC LFSDUX c10, AO3, INC LFSDUX c14, AO4, INC LFDUX c03, AO1, INC LFDUX c07, AO2, INC LFDUX c11, AO3, INC LFDUX c15, AO4, INC LFSDUX c03, AO1, INC LFSDUX c07, AO2, INC LFSDUX c11, AO3, INC LFSDUX c15, AO4, INC LFDUX c04, AO1, INC LFDUX c08, AO2, INC LFDUX c12, AO3, INC LFDUX c16, AO4, INC LFSDUX c04, AO1, INC LFSDUX c08, AO2, INC LFSDUX c12, AO3, INC LFSDUX c16, AO4, INC STFPDUX c01, B, INC2 STFPDUX c05, B, INC2 STFPDUX c09, B, INC2 STFPDUX c13, B, INC2 STFPDUX c02, B, INC2 STFPDUX c06, B, INC2 STFPDUX c10, B, INC2 STFPDUX c14, B, INC2 STFPDUX c03, B, INC2 STFPDUX c07, B, INC2 STFPDUX c11, B, INC2 STFPDUX c15, B, INC2 STFPDUX c04, B, INC2 STFPDUX c08, B, INC2 STFPDUX c12, B, INC2 STFPDUX c16, B, INC2 bdnz LL(112) .align 4 LL(115): andi. r0, M, 3 ble LL(119) andi. r0, M, 2 beq LL(117) LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c05, AO2, INC LFDUX c06, AO2, INC LFDUX c09, AO3, INC LFDUX c10, AO3, INC LFDUX c13, AO4, INC LFDUX c14, AO4, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFDUX c07, AO2, INC LFDUX c08, AO2, INC fsmfp c01, c02 LFDUX c11, AO3, INC fsmfp c05, c06 LFDUX c12, AO3, INC fsmfp c09, c10 LFDUX c15, AO4, INC fsmfp c13, c14 LFDUX c16, AO4, INC fsmfp c03, c04 STFPDUX c01, B, INC2 fsmfp c07, c08 STFPDUX c05, B, INC2 fsmfp c11, c12 STFPDUX c09, B, INC2 fsmfp c15, c16 STFPDUX c13, B, INC2 STFPDUX c03, B, INC2 STFPDUX c07, B, INC2 STFPDUX c11, B, INC2 STFPDUX c15, B, INC2 .align 4 LL(117): andi. 
r0, M, 1 beq LL(119) LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO2, INC LFDUX c04, AO2, INC LFDUX c05, AO3, INC LFDUX c06, AO3, INC LFDUX c07, AO4, INC LFDUX c08, AO4, INC fsmfp c01, c02 fsmfp c03, c04 fsmfp c05, c06 fsmfp c07, c08 STFPDUX c01, B, INC2 STFPDUX c03, B, INC2 STFPDUX c05, B, INC2 STFPDUX c07, B, INC2 .align 4 LL(119): addic. J, J, -1 bgt LL(111) .align 4 LL(120): andi. J, N, 2 ble LL(130) mr AO1, A add AO2, A, LDA add A, AO2, LDA srawi. r0, M, 2 mtspr CTR, r0 ble LL(125) .align 4 LL(122): LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c09, AO2, INC LFDUX c10, AO2, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFDUX c11, AO2, INC LFDUX c12, AO2, INC LFDUX c05, AO1, INC LFDUX c06, AO1, INC LFDUX c13, AO2, INC LFDUX c14, AO2, INC fsmfp c01, c02 LFDUX c07, AO1, INC fsmfp c09, c10 LFDUX c08, AO1, INC fsmfp c03, c04 LFDUX c15, AO2, INC fsmfp c11, c12 LFDUX c16, AO2, INC fsmfp c05, c06 STFPDUX c01, B, INC2 fsmfp c13, c14 STFPDUX c09, B, INC2 fsmfp c07, c08 STFPDUX c03, B, INC2 fsmfp c15, c16 STFPDUX c11, B, INC2 STFPDUX c05, B, INC2 STFPDUX c13, B, INC2 STFPDUX c07, B, INC2 STFPDUX c15, B, INC2 bdnz LL(122) .align 4 LL(125): andi. r0, M, 3 ble LL(130) andi. r0, M, 2 beq LL(127) LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO2, INC LFDUX c04, AO2, INC LFDUX c05, AO1, INC LFDUX c06, AO1, INC LFDUX c07, AO2, INC LFDUX c08, AO2, INC fsmfp c01, c02 fsmfp c03, c04 fsmfp c05, c06 fsmfp c07, c08 STFPDUX c01, B, INC2 STFPDUX c03, B, INC2 STFPDUX c05, B, INC2 STFPDUX c07, B, INC2 .align 4 LL(127): andi. r0, M, 1 beq LL(130) LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO2, INC LFDUX c04, AO2, INC fsmfp c01, c02 fsmfp c03, c04 STFPDUX c01, B, INC2 STFPDUX c03, B, INC2 .align 4 LL(130): andi. J, N, 1 ble LL(999) mr AO1, A srawi. r0, M, 2 mtspr CTR, r0 ble LL(135) .align 4 LL(132): LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFDUX c05, AO1, INC LFDUX c06, AO1, INC LFDUX c07, AO1, INC LFDUX c08, AO1, INC fsmfp c01, c02 fsmfp c03, c04 fsmfp c05, c06 fsmfp c07, c08 STFPDUX c01, B, INC2 STFPDUX c03, B, INC2 STFPDUX c05, B, INC2 STFPDUX c07, B, INC2 bdnz LL(132) .align 4 LL(135): andi. r0, M, 3 ble LL(999) andi. r0, M, 2 beq LL(137) LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC fsmfp c01, c02 fsmfp c03, c04 STFPDUX c01, B, INC2 STFPDUX c03, B, INC2 .align 4 LL(137): andi. r0, M, 1 beq LL(999) LFDUX c01, AO1, INC LFDUX c02, AO1, INC fsmfp c01, c02 STFPDUX c01, B, INC2 .align 4 LL(999): addi SP, SP, -4 lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr .align 4 EPILOGUE OpenBLAS-0.2.20/kernel/power/zgemm_tcopy_8_power8.S000066400000000000000000000153011313527062700217740ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/04/22 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
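   The kernel that follows packs an m x n block of a double-complex matrix A
   (column-major, leading dimension lda counted in complex elements) into the
   contiguous blocked buffer B consumed by the POWER8 zgemm microkernel. As a
   rough sketch of the byte strides its PROLOGUE sets up, assuming double
   complex so that ZBASE_SHIFT is 4 (lda_bytes and m8_bytes are illustrative
   names, not symbols from the source):

       // one complex element is 2 * sizeof(double) = 16 bytes
       size_t lda_bytes = (size_t)lda << 4;        // slwi LDA, LDA, ZBASE_SHIFT
       size_t m8_bytes  = (size_t)m   << (3 + 4);  // slwi M8, M, 3 + ZBASE_SHIFT
                                                   // stride between successive
                                                   // 8-column chunks in B
       // B4 = B + (n & ~7) * m, B2 = B + (n & ~3) * m, B1 = B + (n & ~1) * m
       // (in complex elements): where the 4-, 2- and 1-column tail sections
       // of the packed buffer begin.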
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "def_vsx.h" #define M r3 #define N r4 #define A r5 #define LDA r6 #define B r7 #define A0 r8 #define A1 r9 #define A2 r10 #define A3 r11 #define J r12 #define PREA r14 #define PREB r15 #define BO r16 #define B8 r17 #define B4 r18 #define B2 r19 #define B1 r20 #define NOTUS1 r21 #define T2 r22 #define I r23 #define o16 r24 #define o32 r25 #define o48 r26 #define NOTUS2 r27 #define M8 r30 #define T1 r31 #define o0 0 #include "zgemm_tcopy_macros_8_power8.S" #define STACKSIZE 384 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) std r14, 280(SP) cmpwi cr0, M, 0 ble- L999 cmpwi cr0, N, 0 ble- L999 slwi LDA, LDA, ZBASE_SHIFT slwi M8, M, 3 + ZBASE_SHIFT li T2, -8 li PREA, -4 li PREB, -2 and B4, N, T2 and B2, N, PREA and B1, N, PREB mullw B4, B4, M mullw B2, B2, M mullw B1, B1, M slwi B4, B4, ZBASE_SHIFT slwi B2, B2, ZBASE_SHIFT slwi B1, B1, ZBASE_SHIFT add B4, B4, B add B2, B2, B add B1, B1, B li PREA, 384 addi PREB, M8, 128 li o16, 16 li o32, 32 li o48, 48 #include "zgemm_tcopy_logic_8_power8.S" L999: li r3, 0 ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zgemm_tcopy_hummer_2.S000066400000000000000000000137571313527062700220540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M r3 #define N r4 #define A r5 #define LDA r6 #define B r7 #define AO1 r8 #define AO2 r9 #define J r10 #define B1 r11 #define B2 r28 #define M4 r29 #define INC r30 #define INC2 r31 #define c01 f0 #define c02 f1 #define c03 f2 #define c04 f3 #define c05 f4 #define c06 f5 #define c07 f6 #define c08 f7 PROLOGUE PROFCODE stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) stwu r28, -4(SP) slwi LDA, LDA, ZBASE_SHIFT slwi M4, M, 1 + ZBASE_SHIFT li r9, -2 and B2, N, r9 mullw B2, B2, M slwi B2, B2, ZBASE_SHIFT add B2, B2, B cmpwi cr0, M, 0 ble- LL(99) cmpwi cr0, N, 0 ble- LL(99) subi B2, B2, 2 * SIZE subi M4, M4, 6 * SIZE li INC, 1 * SIZE li INC2, 2 * SIZE andi. r0, A, 2 * SIZE - 1 bne LL(100) subi A, A, 2 * SIZE srawi. J, M, 1 ble LL(20) .align 4 LL(10): mr AO1, A add AO2, A, LDA add A, AO2, LDA sub B1, B, M4 addi B, B, 8 * SIZE srawi. r0, N, 1 mtspr CTR, r0 ble LL(15) .align 4 LL(12): LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO2, INC2 LFPDUX c04, AO2, INC2 STFPDUX c01, B1, M4 STFPDUX c02, B1, INC2 STFPDUX c03, B1, INC2 STFPDUX c04, B1, INC2 bdnz LL(12) .align 4 LL(15): andi. r0, N, 1 ble LL(19) LFPDUX c01, AO1, INC2 LFPDUX c02, AO2, INC2 STFPDUX c01, B2, INC2 STFPDUX c02, B2, INC2 .align 4 LL(19): addic. J, J, -1 bgt LL(10) .align 4 LL(20): andi. J, M, 1 addi M4, M4, 4 * SIZE ble LL(99) mr AO1, A sub B1, B, M4 srawi. r0, N, 1 mtspr CTR, r0 ble LL(23) .align 4 LL(22): LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 STFPDUX c01, B1, M4 STFPDUX c02, B1, INC2 bdnz LL(22) .align 4 LL(23): andi. r0, N, 1 ble LL(99) LFPDUX c01, AO1, INC2 STFPDUX c01, B2, INC2 .align 4 LL(99): addi SP, SP, -4 lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) addi SP, SP, 4 blr .align 4 LL(100): subi A, A, SIZE srawi. J, M, 1 ble LL(120) .align 4 LL(110): mr AO1, A add AO2, A, LDA add A, AO2, LDA sub B1, B, M4 addi B, B, 8 * SIZE srawi. r0, N, 1 mtspr CTR, r0 ble LL(115) .align 4 LL(112): LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFDUX c05, AO2, INC fsmfp c01, c02 LFDUX c06, AO2, INC fsmfp c03, c04 LFDUX c07, AO2, INC fsmfp c05, c06 LFDUX c08, AO2, INC fsmfp c07, c08 STFPDUX c01, B1, M4 STFPDUX c03, B1, INC2 STFPDUX c05, B1, INC2 STFPDUX c07, B1, INC2 bdnz LL(112) .align 4 LL(115): andi. r0, N, 1 ble LL(119) LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO2, INC LFDUX c04, AO2, INC fsmfp c01, c02 fsmfp c03, c04 STFPDUX c01, B2, INC2 STFPDUX c03, B2, INC2 .align 4 LL(119): addic. J, J, -1 bgt LL(110) .align 4 LL(120): andi. J, M, 1 addi M4, M4, 4 * SIZE ble LL(999) mr AO1, A sub B1, B, M4 srawi. r0, N, 1 mtspr CTR, r0 ble LL(123) .align 4 LL(122): LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC fsmfp c01, c02 fsmfp c03, c04 STFPDUX c01, B1, M4 STFPDUX c03, B1, INC2 bdnz LL(122) .align 4 LL(123): andi. 
r0, N, 1 ble LL(999) LFDUX c01, AO1, INC LFDUX c02, AO1, INC fsmfp c01, c02 STFPDUX c01, B2, INC2 .align 4 LL(999): addi SP, SP, -4 lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) addi SP, SP, 4 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zgemm_tcopy_hummer_4.S000066400000000000000000000314251313527062700220460ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M r3 #define N r4 #define A r5 #define LDA r6 #define B r7 #define AO1 r8 #define AO2 r9 #define AO3 r10 #define AO4 r11 #define J r25 #define B1 r26 #define B2 r27 #define B3 r28 #define M4 r29 #define INC r30 #define INC2 r31 #define c01 f0 #define c02 f1 #define c03 f2 #define c04 f3 #define c05 f4 #define c06 f5 #define c07 f6 #define c08 f7 #define c09 f8 #define c10 f9 #define c11 f10 #define c12 f11 #define c13 f12 #define c14 f13 #define c15 f14 #define c16 f15 PROLOGUE PROFCODE li r0, -16 stfpdux f14, SP, r0 stfpdux f15, SP, r0 stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) stwu r28, -4(SP) stwu r27, -4(SP) stwu r26, -4(SP) stwu r25, -4(SP) slwi LDA, LDA, ZBASE_SHIFT slwi M4, M, 2 + ZBASE_SHIFT li r8, -4 li r9, -2 and B2, N, r8 and B3, N, r9 mullw B2, B2, M mullw B3, B3, M slwi B2, B2, ZBASE_SHIFT slwi B3, B3, ZBASE_SHIFT add B2, B2, B add B3, B3, B cmpwi cr0, M, 0 ble- LL(99) cmpwi cr0, N, 0 ble- LL(99) subi B2, B2, 2 * SIZE subi B3, B3, 2 * SIZE subi M4, M4, 30 * SIZE li INC, 1 * SIZE li INC2, 2 * SIZE andi. r0, A, 2 * SIZE - 1 bne LL(100) subi A, A, 2 * SIZE srawi. 
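   Like the 2-wide kernel above, this 4-wide variant targets the Blue Gene
   double FPU ("double hummer", PPC440 FP2): when A is 16-byte aligned it
   moves whole complex elements with the paired-double LFPDUX/STFPDUX
   instructions, otherwise it falls back to scalar LFDUX loads and merges the
   halves with fsmfp before storing. A hedged sketch of that dispatch
   (pack_with_paired_loads and pack_with_scalar_loads are hypothetical names
   for the two code paths, not functions in the source):

       // the PROLOGUE tests the low bits of A: 2 * SIZE - 1 with SIZE = 8
       // masks the low four bits, i.e. a 16-byte alignment check
       if (((uintptr_t)a & (2 * sizeof(double) - 1)) == 0)
           pack_with_paired_loads();   // LL(10) and friends: LFPDUX, STFPDUX
       else
           pack_with_scalar_loads();   // LL(100) and friends: LFDUX + fsmfp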
J, M, 2 ble LL(20) .align 4 LL(10): mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA sub B1, B, M4 addi B, B, 32 * SIZE srawi. r0, N, 2 mtspr CTR, r0 ble LL(15) .align 4 LL(12): LFPDUX c01, AO1, INC2 LFPDUX c05, AO2, INC2 LFPDUX c09, AO3, INC2 LFPDUX c13, AO4, INC2 LFPDUX c02, AO1, INC2 LFPDUX c06, AO2, INC2 LFPDUX c10, AO3, INC2 LFPDUX c14, AO4, INC2 LFPDUX c03, AO1, INC2 LFPDUX c07, AO2, INC2 LFPDUX c11, AO3, INC2 LFPDUX c15, AO4, INC2 LFPDUX c04, AO1, INC2 LFPDUX c08, AO2, INC2 LFPDUX c12, AO3, INC2 LFPDUX c16, AO4, INC2 STFPDUX c01, B1, M4 STFPDUX c02, B1, INC2 STFPDUX c03, B1, INC2 STFPDUX c04, B1, INC2 STFPDUX c05, B1, INC2 STFPDUX c06, B1, INC2 STFPDUX c07, B1, INC2 STFPDUX c08, B1, INC2 STFPDUX c09, B1, INC2 STFPDUX c10, B1, INC2 STFPDUX c11, B1, INC2 STFPDUX c12, B1, INC2 STFPDUX c13, B1, INC2 STFPDUX c14, B1, INC2 STFPDUX c15, B1, INC2 STFPDUX c16, B1, INC2 bdnz LL(12) .align 4 LL(15): andi. r0, N, 3 ble LL(19) andi. r0, N, 2 ble LL(17) LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO2, INC2 LFPDUX c04, AO2, INC2 LFPDUX c05, AO3, INC2 LFPDUX c06, AO3, INC2 LFPDUX c07, AO4, INC2 LFPDUX c08, AO4, INC2 STFPDUX c01, B2, INC2 STFPDUX c02, B2, INC2 STFPDUX c03, B2, INC2 STFPDUX c04, B2, INC2 STFPDUX c05, B2, INC2 STFPDUX c06, B2, INC2 STFPDUX c07, B2, INC2 STFPDUX c08, B2, INC2 .align 4 LL(17): andi. r0, N, 1 ble LL(19) LFPDUX c01, AO1, INC2 LFPDUX c02, AO2, INC2 LFPDUX c03, AO3, INC2 LFPDUX c04, AO4, INC2 STFPDUX c01, B3, INC2 STFPDUX c02, B3, INC2 STFPDUX c03, B3, INC2 STFPDUX c04, B3, INC2 .align 4 LL(19): addic. J, J, -1 bgt LL(10) .align 4 LL(20): andi. J, M, 2 addi M4, M4, 16 * SIZE ble LL(30) mr AO1, A add AO2, A, LDA add A, AO2, LDA sub B1, B, M4 addi B, B, 16 * SIZE srawi. r0, N, 2 mtspr CTR, r0 ble LL(23) .align 4 LL(22): LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO1, INC2 LFPDUX c04, AO1, INC2 LFPDUX c05, AO2, INC2 LFPDUX c06, AO2, INC2 LFPDUX c07, AO2, INC2 LFPDUX c08, AO2, INC2 STFPDUX c01, B1, M4 STFPDUX c02, B1, INC2 STFPDUX c03, B1, INC2 STFPDUX c04, B1, INC2 STFPDUX c05, B1, INC2 STFPDUX c06, B1, INC2 STFPDUX c07, B1, INC2 STFPDUX c08, B1, INC2 bdnz LL(22) .align 4 LL(23): andi. r0, N, 2 ble LL(24) LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO2, INC2 LFPDUX c04, AO2, INC2 STFPDUX c01, B2, INC2 STFPDUX c02, B2, INC2 STFPDUX c03, B2, INC2 STFPDUX c04, B2, INC2 .align 4 LL(24): andi. r0, N, 1 ble LL(30) LFPDUX c01, AO1, INC2 LFPDUX c02, AO2, INC2 STFPDUX c01, B3, INC2 STFPDUX c02, B3, INC2 .align 4 LL(30): andi. J, M, 1 addi M4, M4, 8 * SIZE ble LL(99) mr AO1, A sub B1, B, M4 srawi. r0, N, 2 mtspr CTR, r0 ble LL(33) .align 4 LL(32): LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 LFPDUX c03, AO1, INC2 LFPDUX c04, AO1, INC2 STFPDUX c01, B1, M4 STFPDUX c02, B1, INC2 STFPDUX c03, B1, INC2 STFPDUX c04, B1, INC2 bdnz LL(32) .align 4 LL(33): andi. r0, N, 2 ble LL(34) LFPDUX c01, AO1, INC2 LFPDUX c02, AO1, INC2 STFPDUX c01, B2, INC2 STFPDUX c02, B2, INC2 .align 4 LL(34): andi. r0, N, 1 ble LL(99) LFPDUX c01, AO1, INC2 STFPDX c01, B3, INC2 .align 4 LL(99): addi SP, SP, -4 lwzu r25, 4(SP) lwzu r26, 4(SP) lwzu r27, 4(SP) lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr .align 4 LL(100): subi A, A, SIZE srawi. J, M, 2 ble LL(120) .align 4 LL(110): mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA sub B1, B, M4 addi B, B, 32 * SIZE srawi. 
r0, N, 2 mtspr CTR, r0 ble LL(115) .align 4 LL(112): LFDUX c01, AO1, INC LFDUX c05, AO2, INC LFDUX c09, AO3, INC LFDUX c13, AO4, INC LFSDUX c01, AO1, INC LFSDUX c05, AO2, INC LFSDUX c09, AO3, INC LFSDUX c13, AO4, INC LFDUX c02, AO1, INC LFDUX c06, AO2, INC LFDUX c10, AO3, INC LFDUX c14, AO4, INC LFSDUX c02, AO1, INC LFSDUX c06, AO2, INC LFSDUX c10, AO3, INC LFSDUX c14, AO4, INC LFDUX c03, AO1, INC LFDUX c07, AO2, INC LFDUX c11, AO3, INC LFDUX c15, AO4, INC LFSDUX c03, AO1, INC LFSDUX c07, AO2, INC LFSDUX c11, AO3, INC LFSDUX c15, AO4, INC LFDUX c04, AO1, INC LFDUX c08, AO2, INC LFDUX c12, AO3, INC LFDUX c16, AO4, INC LFSDUX c04, AO1, INC LFSDUX c08, AO2, INC LFSDUX c12, AO3, INC LFSDUX c16, AO4, INC STFPDUX c01, B1, M4 STFPDUX c02, B1, INC2 STFPDUX c03, B1, INC2 STFPDUX c04, B1, INC2 STFPDUX c05, B1, INC2 STFPDUX c06, B1, INC2 STFPDUX c07, B1, INC2 STFPDUX c08, B1, INC2 STFPDUX c09, B1, INC2 STFPDUX c10, B1, INC2 STFPDUX c11, B1, INC2 STFPDUX c12, B1, INC2 STFPDUX c13, B1, INC2 STFPDUX c14, B1, INC2 STFPDUX c15, B1, INC2 STFPDUX c16, B1, INC2 bdnz LL(112) .align 4 LL(115): andi. r0, N, 3 ble LL(119) andi. r0, N, 2 ble LL(117) LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFDUX c05, AO2, INC LFDUX c06, AO2, INC LFDUX c07, AO2, INC LFDUX c08, AO2, INC LFDUX c09, AO3, INC LFDUX c10, AO3, INC LFDUX c11, AO3, INC LFDUX c12, AO3, INC fsmfp c01, c02 LFDUX c13, AO4, INC fsmfp c03, c04 LFDUX c14, AO4, INC fsmfp c05, c06 LFDUX c15, AO4, INC fsmfp c07, c08 LFDUX c16, AO4, INC fsmfp c09, c10 STFPDUX c01, B2, INC2 fsmfp c11, c12 STFPDUX c03, B2, INC2 fsmfp c13, c14 STFPDUX c05, B2, INC2 fsmfp c15, c16 STFPDUX c07, B2, INC2 STFPDUX c09, B2, INC2 STFPDUX c11, B2, INC2 STFPDUX c13, B2, INC2 STFPDUX c15, B2, INC2 .align 4 LL(117): andi. r0, N, 1 ble LL(119) LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO2, INC LFDUX c04, AO2, INC LFDUX c05, AO3, INC fsmfp c01, c02 LFDUX c06, AO3, INC fsmfp c03, c04 LFDUX c07, AO4, INC fsmfp c05, c06 LFDUX c08, AO4, INC fsmfp c07, c08 STFPDUX c01, B3, INC2 STFPDUX c03, B3, INC2 STFPDUX c05, B3, INC2 STFPDUX c07, B3, INC2 .align 4 LL(119): addic. J, J, -1 bgt LL(110) .align 4 LL(120): andi. J, M, 2 addi M4, M4, 16 * SIZE ble LL(130) mr AO1, A add AO2, A, LDA add A, AO2, LDA sub B1, B, M4 addi B, B, 16 * SIZE srawi. r0, N, 2 mtspr CTR, r0 ble LL(123) .align 4 LL(122): LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFDUX c05, AO1, INC LFDUX c06, AO1, INC LFDUX c07, AO1, INC LFDUX c08, AO1, INC LFDUX c09, AO2, INC LFDUX c10, AO2, INC LFDUX c11, AO2, INC LFDUX c12, AO2, INC fsmfp c01, c02 LFDUX c13, AO2, INC fsmfp c03, c04 LFDUX c14, AO2, INC fsmfp c05, c06 LFDUX c15, AO2, INC fsmfp c07, c08 LFDUX c16, AO2, INC fsmfp c09, c10 STFPDUX c01, B1, M4 fsmfp c11, c12 STFPDUX c03, B1, INC2 fsmfp c13, c14 STFPDUX c05, B1, INC2 fsmfp c15, c16 STFPDUX c07, B1, INC2 STFPDUX c09, B1, INC2 STFPDUX c11, B1, INC2 STFPDUX c13, B1, INC2 STFPDUX c15, B1, INC2 bdnz LL(122) .align 4 LL(123): andi. r0, N, 2 ble LL(124) LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFDUX c05, AO2, INC fsmfp c01, c02 LFDUX c06, AO2, INC fsmfp c03, c04 LFDUX c07, AO2, INC fsmfp c05, c06 LFDUX c08, AO2, INC fsmfp c07, c08 STFPDUX c01, B2, INC2 STFPDUX c03, B2, INC2 STFPDUX c05, B2, INC2 STFPDUX c07, B2, INC2 .align 4 LL(124): andi. 
r0, N, 1 ble LL(130) LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO2, INC LFDUX c04, AO2, INC fsmfp c01, c02 fsmfp c03, c04 STFPDUX c01, B3, INC2 STFPDUX c03, B3, INC2 .align 4 LL(130): andi. J, M, 1 addi M4, M4, 8 * SIZE ble LL(999) mr AO1, A sub B1, B, M4 srawi. r0, N, 2 mtspr CTR, r0 ble LL(133) .align 4 LL(132): LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC LFDUX c05, AO1, INC fsmfp c01, c02 LFDUX c06, AO1, INC fsmfp c03, c04 LFDUX c07, AO1, INC fsmfp c05, c06 LFDUX c08, AO1, INC fsmfp c07, c08 STFPDUX c01, B1, M4 STFPDUX c03, B1, INC2 STFPDUX c05, B1, INC2 STFPDUX c07, B1, INC2 bdnz LL(132) .align 4 LL(133): andi. r0, N, 2 ble LL(134) LFDUX c01, AO1, INC LFDUX c02, AO1, INC LFDUX c03, AO1, INC LFDUX c04, AO1, INC fsmfp c01, c02 fsmfp c03, c04 STFPDUX c01, B2, INC2 STFPDUX c03, B2, INC2 .align 4 LL(134): andi. r0, N, 1 ble LL(999) LFDUX c01, AO1, INC LFDUX c02, AO1, INC fsmfp c01, c02 STFPDX c01, B3, INC2 .align 4 LL(999): addi SP, SP, -4 lwzu r25, 4(SP) lwzu r26, 4(SP) lwzu r27, 4(SP) lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zgemm_tcopy_logic_8_power8.S000066400000000000000000000101251313527062700231500ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/04/22 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ srawi. I, M, 2 ble ZCOPYT_L2_BEGIN ZCOPYT_L4_BEGIN: mr A0, A add A1, A0, LDA add A2, A1, LDA add A3, A2, LDA add A, A3, LDA mr B8, B addi B, B, 64*SIZE sradi. 
J, N, 3 ble ZCOPYT_L4x4_BEGIN mr BO, B8 .align 5 ZCOPYT_L4x8_LOOP: addi T1, PREB, 128 addi T2, PREB, 256 dcbt A0, PREA dcbt A1, PREA dcbt A2, PREA dcbt A3, PREA dcbtst BO, M8 dcbtst BO, PREB dcbtst BO, T1 dcbtst BO, T2 COPY_4x8 add BO, BO, M8 addic. J, J, -1 bgt ZCOPYT_L4x8_LOOP ZCOPYT_L4x4_BEGIN: andi. T1, N, 4 ble ZCOPYT_L4x2_BEGIN mr BO, B4 COPY_4x4 addi B4, B4, 32*SIZE ZCOPYT_L4x2_BEGIN: andi. T1, N, 2 ble ZCOPYT_L4x1_BEGIN mr BO, B2 COPY_4x2 addi B2, B2, 16*SIZE ZCOPYT_L4x1_BEGIN: andi. T1, N, 1 ble ZCOPYT_L4_END mr BO, B1 COPY_4x1 addi B1, B1, 8*SIZE ZCOPYT_L4_END: addic. I, I, -1 bgt ZCOPYT_L4_BEGIN ZCOPYT_L2_BEGIN: andi. T1, M, 2 ble ZCOPYT_L1_BEGIN mr A0, A add A1, A0, LDA add A, A1, LDA mr B8, B addi B, B, 32*SIZE sradi. J, N, 3 ble ZCOPYT_L2x4_BEGIN mr BO, B8 ZCOPYT_L2x8_LOOP: COPY_2x8 add BO, BO, M8 addic. J, J, -1 bgt ZCOPYT_L2x8_LOOP ZCOPYT_L2x4_BEGIN: andi. T1, N, 4 ble ZCOPYT_L2x2_BEGIN mr BO, B4 COPY_2x4 addi B4, B4, 16*SIZE ZCOPYT_L2x2_BEGIN: andi. T1, N, 2 ble ZCOPYT_L2x1_BEGIN mr BO, B2 COPY_2x2 addi B2, B2, 8*SIZE ZCOPYT_L2x1_BEGIN: andi. T1, N, 1 ble ZCOPYT_L2_END mr BO, B1 COPY_2x1 addi B1, B1, 4*SIZE ZCOPYT_L2_END: ZCOPYT_L1_BEGIN: andi. T1, M, 1 ble L999 mr A0, A add A, A0, LDA mr B8, B addi B, B, 16*SIZE sradi. J, N, 3 ble ZCOPYT_L1x4_BEGIN mr BO, B8 ZCOPYT_L1x8_LOOP: COPY_1x8 add BO, BO, M8 addic. J, J, -1 bgt ZCOPYT_L1x8_LOOP ZCOPYT_L1x4_BEGIN: andi. T1, N, 4 ble ZCOPYT_L1x2_BEGIN mr BO, B4 COPY_1x4 addi B4, B4, 8*SIZE ZCOPYT_L1x2_BEGIN: andi. T1, N, 2 ble ZCOPYT_L1x1_BEGIN mr BO, B2 COPY_1x2 addi B2, B2, 4*SIZE ZCOPYT_L1x1_BEGIN: andi. T1, N, 1 ble ZCOPYT_L1_END mr BO, B1 COPY_1x1 addi B1, B1, 2*SIZE ZCOPYT_L1_END: OpenBLAS-0.2.20/kernel/power/zgemm_tcopy_macros_8_power8.S000066400000000000000000000252471313527062700233520ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
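   The macros defined below are the load/store bodies used by
   zgemm_tcopy_8_power8.S: each COPY_ macro reads a fixed number of 16-byte
   double-complex elements from the pointers A0..A3 with lxvd2x and writes
   them back to back at BO with stxvd2x, leaving BO itself for the caller to
   advance. A rough C sketch of one COPY_4x8 invocation (bo and a_src are
   illustrative names, with double _Complex standing in for the 16-byte
   element; this is not part of the source):

       // eight complex elements from each of A0..A3, stored group after group
       for (int p = 0; p < 4; p++) {          // A0, A1, A2, A3
           for (int k = 0; k < 8; k++)
               bo[8 * p + k] = a_src[p][k];   // one lxvd2x + stxvd2x pair
           a_src[p] += 8;                     // two addi Ax, Ax, 64 per pointer
       }
       // bo is not advanced here; the caller adds M8 after the macro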
*****************************************************************************/ /************************************************************************************** * 2016/04/22 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ .macro COPY_4x8 lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 lxvd2x vs34, o32, A0 lxvd2x vs35, o48, A0 addi A0, A0, 64 lxvd2x vs36, o0, A0 lxvd2x vs37, o16, A0 lxvd2x vs38, o32, A0 lxvd2x vs39, o48, A0 addi A0, A0, 64 lxvd2x vs40, o0, A1 lxvd2x vs41, o16, A1 lxvd2x vs42, o32, A1 lxvd2x vs43, o48, A1 addi A1, A1, 64 lxvd2x vs44, o0, A1 lxvd2x vs45, o16, A1 lxvd2x vs46, o32, A1 lxvd2x vs47, o48, A1 addi A1, A1, 64 lxvd2x vs48, o0, A2 lxvd2x vs49, o16, A2 lxvd2x vs50, o32, A2 lxvd2x vs51, o48, A2 addi A2, A2, 64 lxvd2x vs52, o0, A2 lxvd2x vs53, o16, A2 lxvd2x vs54, o32, A2 lxvd2x vs55, o48, A2 addi A2, A2, 64 lxvd2x vs56, o0, A3 lxvd2x vs57, o16, A3 lxvd2x vs58, o32, A3 lxvd2x vs59, o48, A3 addi A3, A3, 64 lxvd2x vs60, o0, A3 lxvd2x vs61, o16, A3 lxvd2x vs62, o32, A3 lxvd2x vs63, o48, A3 addi A3, A3, 64 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 addi T1, T1, 64 stxvd2x vs36, o0, T1 stxvd2x vs37, o16, T1 stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 addi T1, T1, 64 stxvd2x vs40, o0, T1 stxvd2x vs41, o16, T1 stxvd2x vs42, o32, T1 stxvd2x vs43, o48, T1 addi T1, T1, 64 stxvd2x vs44, o0, T1 stxvd2x vs45, o16, T1 stxvd2x vs46, o32, T1 stxvd2x vs47, o48, T1 addi T1, T1, 64 stxvd2x vs48, o0, T1 stxvd2x vs49, o16, T1 stxvd2x vs50, o32, T1 stxvd2x vs51, o48, T1 addi T1, T1, 64 stxvd2x vs52, o0, T1 stxvd2x vs53, o16, T1 stxvd2x vs54, o32, T1 stxvd2x vs55, o48, T1 addi T1, T1, 64 stxvd2x vs56, o0, T1 stxvd2x vs57, o16, T1 stxvd2x vs58, o32, T1 stxvd2x vs59, o48, T1 addi T1, T1, 64 stxvd2x vs60, o0, T1 stxvd2x vs61, o16, T1 stxvd2x vs62, o32, T1 stxvd2x vs63, o48, T1 .endm /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ .macro COPY_4x4 lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 lxvd2x vs34, o32, A0 lxvd2x vs35, o48, A0 addi A0, A0, 64 lxvd2x vs36, o0, A1 lxvd2x vs37, o16, A1 lxvd2x vs38, o32, A1 lxvd2x vs39, o48, A1 addi A1, A1, 64 lxvd2x vs40, o0, A2 lxvd2x vs41, o16, A2 lxvd2x vs42, o32, A2 lxvd2x vs43, o48, A2 addi A2, A2, 64 lxvd2x vs44, o0, A3 lxvd2x vs45, o16, A3 lxvd2x vs46, o32, A3 lxvd2x vs47, o48, A3 addi A3, A3, 64 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 addi T1, T1, 64 stxvd2x vs36, o0, T1 stxvd2x vs37, o16, T1 stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 addi T1, T1, 64 stxvd2x vs40, o0, T1 stxvd2x vs41, o16, T1 stxvd2x vs42, o32, T1 stxvd2x vs43, o48, T1 addi T1, T1, 64 stxvd2x vs44, o0, T1 stxvd2x vs45, o16, T1 stxvd2x vs46, o32, T1 stxvd2x vs47, o48, T1 .endm /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ .macro COPY_4x2 lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 addi A0, A0, 32 lxvd2x 
vs34, o0, A1 lxvd2x vs35, o16, A1 addi A1, A1, 32 lxvd2x vs36, o0, A2 lxvd2x vs37, o16, A2 addi A2, A2, 32 lxvd2x vs38, o0, A3 lxvd2x vs39, o16, A3 addi A3, A3, 32 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 addi T1, T1, 64 stxvd2x vs36, o0, T1 stxvd2x vs37, o16, T1 stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 .endm /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ .macro COPY_4x1 lxvd2x vs32, o0, A0 addi A0, A0, 16 lxvd2x vs33, o0, A1 addi A1, A1, 16 lxvd2x vs34, o0, A2 addi A2, A2, 16 lxvd2x vs35, o0, A3 addi A3, A3, 16 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 .endm /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ .macro COPY_2x8 lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 lxvd2x vs34, o32, A0 lxvd2x vs35, o48, A0 addi A0, A0, 64 lxvd2x vs36, o0, A0 lxvd2x vs37, o16, A0 lxvd2x vs38, o32, A0 lxvd2x vs39, o48, A0 addi A0, A0, 64 lxvd2x vs40, o0, A1 lxvd2x vs41, o16, A1 lxvd2x vs42, o32, A1 lxvd2x vs43, o48, A1 addi A1, A1, 64 lxvd2x vs44, o0, A1 lxvd2x vs45, o16, A1 lxvd2x vs46, o32, A1 lxvd2x vs47, o48, A1 addi A1, A1, 64 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 addi T1, T1, 64 stxvd2x vs36, o0, T1 stxvd2x vs37, o16, T1 stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 addi T1, T1, 64 stxvd2x vs40, o0, T1 stxvd2x vs41, o16, T1 stxvd2x vs42, o32, T1 stxvd2x vs43, o48, T1 addi T1, T1, 64 stxvd2x vs44, o0, T1 stxvd2x vs45, o16, T1 stxvd2x vs46, o32, T1 stxvd2x vs47, o48, T1 .endm /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ .macro COPY_2x4 lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 lxvd2x vs34, o32, A0 lxvd2x vs35, o48, A0 addi A0, A0, 64 lxvd2x vs36, o0, A1 lxvd2x vs37, o16, A1 lxvd2x vs38, o32, A1 lxvd2x vs39, o48, A1 addi A1, A1, 64 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 addi T1, T1, 64 stxvd2x vs36, o0, T1 stxvd2x vs37, o16, T1 stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 .endm /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ .macro COPY_2x2 lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 addi A0, A0, 32 lxvd2x vs34, o0, A1 lxvd2x vs35, o16, A1 addi A1, A1, 32 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 .endm /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ .macro COPY_2x1 lxvd2x vs32, o0, A0 addi A0, A0, 16 lxvd2x vs33, o0, A1 addi A1, A1, 16 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 .endm /********************************************************************************************** * Macros for N=1 and M=8 
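 COPY_1x8 is the single-pointer case of the same pattern: eight consecutive
 double-complex elements are read from A0 and stored at BO. A minimal sketch
 (a0 and bo are illustrative names, not symbols from the source):

     for (int k = 0; k < 8; k++)   // eight 16-byte elements; A0 advances 128 bytes
         bo[k] = a0[k];
     a0 += 8;                      // BO is again left for the caller to advance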
**********************************************************************************************/ .macro COPY_1x8 lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 lxvd2x vs34, o32, A0 lxvd2x vs35, o48, A0 addi A0, A0, 64 lxvd2x vs36, o0, A0 lxvd2x vs37, o16, A0 lxvd2x vs38, o32, A0 lxvd2x vs39, o48, A0 addi A0, A0, 64 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 addi T1, T1, 64 stxvd2x vs36, o0, T1 stxvd2x vs37, o16, T1 stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 .endm /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ .macro COPY_1x4 lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 lxvd2x vs34, o32, A0 lxvd2x vs35, o48, A0 addi A0, A0, 64 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 .endm /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ .macro COPY_1x2 lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 addi A0, A0, 32 mr T1, BO stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 .endm /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ .macro COPY_1x1 lxvd2x vs32, o0, A0 addi A0, A0, 16 mr T1, BO stxvd2x vs32, o0, T1 .endm OpenBLAS-0.2.20/kernel/power/zgemv_n.S000066400000000000000000002510341313527062700173560ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
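   The routine below is the double-complex GEMV kernel for the non-transposed
   case: it adds alpha * A * x into y for a column-major A, working on four
   columns of A per outer pass and eight complex elements of y per inner
   iteration, with the FMADDR/FMSUBR and FMADDX/FMSUBX macro pairs selecting
   the XCONJ/CONJ variants. A scalar sketch of the plain, unit-stride case
   (the loop variables and the interleaved real/imaginary layout are
   illustrative assumptions, not the kernel's calling convention):

       for (int j = 0; j < n; j++) {
           double tr = alpha_r * x[2*j] - alpha_i * x[2*j + 1];  // alpha * x[j]
           double ti = alpha_i * x[2*j] + alpha_r * x[2*j + 1];
           for (int i = 0; i < m; i++) {
               double ar = a[2*(i + j*lda)], ai = a[2*(i + j*lda) + 1];
               y[2*i]     += tr * ar - ti * ai;   // real part of y[i]
               y[2*i + 1] += ti * ar + tr * ai;   // imaginary part of y[i]
           }
       }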
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 #define A r6 #define LDA r7 #define X r8 #define INCX r9 #define Y r10 #define INCY r5 #else #define M r3 #define N r4 #define A r8 #define LDA r9 #define X r10 #define INCX r5 #define Y r6 #define INCY r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define M r3 #define N r4 #define A r10 #define LDA r5 #define X r6 #define INCX r7 #define Y r8 #define INCY r9 #else #define M r3 #define N r4 #define A r8 #define LDA r9 #define X r10 #define INCX r5 #define Y r6 #define INCY r7 #endif #endif #define I r11 #define J r12 #define AO1 r14 #define AO2 r15 #define AO3 r16 #define AO4 r17 #define LDA4 r18 #define Y1 r19 #define Y2 r20 #define PREA r21 #define PREC r22 #define y01 f0 #define y02 f1 #define y03 f2 #define y04 f3 #define y05 f4 #define y06 f5 #define y07 f6 #define y08 f7 #define y09 f8 #define y10 f9 #define y11 f10 #define y12 f11 #define y13 f12 #define y14 f13 #define y15 f14 #define y16 f15 #define alpha1r f16 #define alpha1i f17 #define alpha2r f18 #define alpha2i f19 #define alpha3r f20 #define alpha3i f21 #define alpha4r f22 #define alpha4i f23 #define a1 f24 #define a2 f25 #define a3 f26 #define a4 f27 #define a5 f28 #define a6 f29 #define a7 f30 #define a8 f31 #define alpha_r f14 #define alpha_i f15 #if defined(PPCG4) #define PREFETCHSIZE_A 34 #define PREFETCHSIZE_C 16 #endif #if defined(PPC440) || defined(PPC440FP2) #define PREFETCHSIZE_A 34 #define PREFETCHSIZE_C 16 #endif #ifdef PPC970 #define PREFETCHSIZE_A 56 #define PREFETCHSIZE_C 16 #endif #ifdef CELL #define PREFETCHSIZE_A 56 #define PREFETCHSIZE_C 16 #endif #ifdef POWER4 #define PREFETCHSIZE_A 34 #define PREFETCHSIZE_C 16 #endif #ifdef POWER5 #define PREFETCHSIZE_A 40 #define PREFETCHSIZE_C 24 #endif #ifdef POWER6 #define PREFETCHSIZE_A 24 #define PREFETCHSIZE_C 24 #endif #ifdef POWER8 #define PREFETCHSIZE_A 24 #define PREFETCHSIZE_C 24 #endif #ifndef XCONJ #define FMADDR FMADD #define FMSUBR FNMSUB #else #define FMADDR FNMSUB #define FMSUBR FMADD #endif #ifndef CONJ #define FMADDX FMADD #define FMSUBX FNMSUB #else #define FMADDX FNMSUB #define FMSUBX FMADD #endif #ifndef NEEDPARAM #ifndef __64BIT__ #define STACKSIZE 224 #define ALPHA_R 208(SP) #define ALPHA_I 216(SP) #else #define STACKSIZE 280 #define ALPHA_R 256(SP) #define ALPHA_I 264(SP) #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r14, 144(SP) std r15, 152(SP) std r16, 160(SP) std r17, 168(SP) std r18, 176(SP) std r19, 184(SP) std r20, 192(SP) std r21, 200(SP) std r22, 208(SP) #else stw r14, 144(SP) stw r15, 148(SP) stw r16, 152(SP) stw r17, 156(SP) stw r18, 160(SP) stw r19, 164(SP) stw r20, 168(SP) stw r21, 172(SP) stw r22, 176(SP) #endif #ifdef linux #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) #else ld INCX, FRAMESLOT(0) + STACKSIZE(SP) ld Y, FRAMESLOT(1) + STACKSIZE(SP) ld INCY, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) lwz X, FRAMESLOT(1) + STACKSIZE(SP) lwz INCX, FRAMESLOT(2) + 
STACKSIZE(SP) lwz Y, FRAMESLOT(3) + STACKSIZE(SP) lwz INCY, FRAMESLOT(4) + STACKSIZE(SP) #else lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) lwz Y, FRAMESLOT(1) + STACKSIZE(SP) lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) #endif #else ld INCX, FRAMESLOT(0) + STACKSIZE(SP) ld Y, FRAMESLOT(1) + STACKSIZE(SP) ld INCY, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif stfd f1, ALPHA_R stfd f2, ALPHA_I slwi LDA4, LDA, ZBASE_SHIFT + 2 slwi LDA, LDA, ZBASE_SHIFT slwi INCX, INCX, ZBASE_SHIFT slwi INCY, INCY, ZBASE_SHIFT li PREA, PREFETCHSIZE_A * SIZE li PREC, PREFETCHSIZE_C * SIZE cmpwi cr0, M, 0 ble- LL(999) cmpwi cr0, N, 0 ble- LL(999) cmpi cr0, 0, INCY, 2 * SIZE bne LL(100) srawi. J, N, 2 ble LL(20) .align 4 LL(11): lfd alpha_r, ALPHA_R lfd alpha_i, ALPHA_I LFD a1, 0 * SIZE(X) LFD a2, 1 * SIZE(X) add X, X, INCX LFD a3, 0 * SIZE(X) LFD a4, 1 * SIZE(X) add X, X, INCX LFD a5, 0 * SIZE(X) LFD a6, 1 * SIZE(X) add X, X, INCX LFD a7, 0 * SIZE(X) LFD a8, 1 * SIZE(X) add X, X, INCX FMUL alpha1r, alpha_r, a1 FMUL alpha1i, alpha_i, a1 FMUL alpha2r, alpha_r, a3 FMUL alpha2i, alpha_i, a3 FMUL alpha3r, alpha_r, a5 FMUL alpha3i, alpha_i, a5 FMUL alpha4r, alpha_r, a7 FMUL alpha4i, alpha_i, a7 FMSUBR alpha1r, alpha_i, a2, alpha1r FMADDR alpha1i, alpha_r, a2, alpha1i FMSUBR alpha2r, alpha_i, a4, alpha2r FMADDR alpha2i, alpha_r, a4, alpha2i FMSUBR alpha3r, alpha_i, a6, alpha3r FMADDR alpha3i, alpha_r, a6, alpha3i FMSUBR alpha4r, alpha_i, a8, alpha4r FMADDR alpha4i, alpha_r, a8, alpha4i mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA mr Y1, Y mr Y2, Y srawi. r0, M, 3 mtspr CTR, r0 ble LL(15) .align 4 LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD y09, 8 * SIZE(Y1) LFD y10, 9 * SIZE(Y1) LFD y11, 10 * SIZE(Y1) LFD y12, 11 * SIZE(Y1) LFD y13, 12 * SIZE(Y1) LFD y14, 13 * SIZE(Y1) LFD y15, 14 * SIZE(Y1) LFD y16, 15 * SIZE(Y1) addi Y1, Y1, 16 * SIZE bdz LL(13) .align 4 LL(12): FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 8 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 9 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) addi AO1, AO1, 16 * SIZE nop DCBT(AO1, PREA) nop FMADD y09, alpha1r, a1, y09 FMADD y10, alpha1i, a1, y10 FMADD y11, alpha1r, a3, y11 FMADD y12, alpha1i, a3, y12 FMADD y13, alpha1r, a5, y13 FMADD y14, alpha1i, a5, y14 FMADD y15, alpha1r, a7, y15 FMADD y16, alpha1i, a7, y16 LFD a1, 0 * SIZE(AO2) LFD a3, 2 * SIZE(AO2) LFD a5, 4 * SIZE(AO2) LFD a7, 6 * SIZE(AO2) FMSUBX y09, alpha1i, a2, y09 FMADDX y10, alpha1r, a2, y10 FMSUBX y11, alpha1i, a4, y11 FMADDX y12, alpha1r, a4, y12 FMSUBX y13, alpha1i, a6, y13 FMADDX y14, alpha1r, a6, y14 FMSUBX y15, alpha1i, a8, y15 FMADDX y16, alpha1r, a8, y16 LFD a2, 1 * SIZE(AO2) LFD a4, 3 * SIZE(AO2) LFD a6, 5 * SIZE(AO2) LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2r, a1, y01 FMADD 
y02, alpha2i, a1, y02 FMADD y03, alpha2r, a3, y03 FMADD y04, alpha2i, a3, y04 FMADD y05, alpha2r, a5, y05 FMADD y06, alpha2i, a5, y06 FMADD y07, alpha2r, a7, y07 FMADD y08, alpha2i, a7, y08 LFD a1, 8 * SIZE(AO2) LFD a3, 10 * SIZE(AO2) LFD a5, 12 * SIZE(AO2) LFD a7, 14 * SIZE(AO2) FMSUBX y01, alpha2i, a2, y01 FMADDX y02, alpha2r, a2, y02 FMSUBX y03, alpha2i, a4, y03 FMADDX y04, alpha2r, a4, y04 FMSUBX y05, alpha2i, a6, y05 FMADDX y06, alpha2r, a6, y06 FMSUBX y07, alpha2i, a8, y07 FMADDX y08, alpha2r, a8, y08 LFD a2, 9 * SIZE(AO2) LFD a4, 11 * SIZE(AO2) LFD a6, 13 * SIZE(AO2) LFD a8, 15 * SIZE(AO2) addi AO2, AO2, 16 * SIZE nop DCBT(AO2, PREA) nop FMADD y09, alpha2r, a1, y09 FMADD y10, alpha2i, a1, y10 FMADD y11, alpha2r, a3, y11 FMADD y12, alpha2i, a3, y12 FMADD y13, alpha2r, a5, y13 FMADD y14, alpha2i, a5, y14 FMADD y15, alpha2r, a7, y15 FMADD y16, alpha2i, a7, y16 LFD a1, 0 * SIZE(AO3) LFD a3, 2 * SIZE(AO3) LFD a5, 4 * SIZE(AO3) LFD a7, 6 * SIZE(AO3) FMSUBX y09, alpha2i, a2, y09 FMADDX y10, alpha2r, a2, y10 FMSUBX y11, alpha2i, a4, y11 FMADDX y12, alpha2r, a4, y12 FMSUBX y13, alpha2i, a6, y13 FMADDX y14, alpha2r, a6, y14 FMSUBX y15, alpha2i, a8, y15 FMADDX y16, alpha2r, a8, y16 LFD a2, 1 * SIZE(AO3) LFD a4, 3 * SIZE(AO3) LFD a6, 5 * SIZE(AO3) LFD a8, 7 * SIZE(AO3) FMADD y01, alpha3r, a1, y01 FMADD y02, alpha3i, a1, y02 FMADD y03, alpha3r, a3, y03 FMADD y04, alpha3i, a3, y04 FMADD y05, alpha3r, a5, y05 FMADD y06, alpha3i, a5, y06 FMADD y07, alpha3r, a7, y07 FMADD y08, alpha3i, a7, y08 LFD a1, 8 * SIZE(AO3) LFD a3, 10 * SIZE(AO3) LFD a5, 12 * SIZE(AO3) LFD a7, 14 * SIZE(AO3) FMSUBX y01, alpha3i, a2, y01 FMADDX y02, alpha3r, a2, y02 FMSUBX y03, alpha3i, a4, y03 FMADDX y04, alpha3r, a4, y04 FMSUBX y05, alpha3i, a6, y05 FMADDX y06, alpha3r, a6, y06 FMSUBX y07, alpha3i, a8, y07 FMADDX y08, alpha3r, a8, y08 LFD a2, 9 * SIZE(AO3) LFD a4, 11 * SIZE(AO3) LFD a6, 13 * SIZE(AO3) LFD a8, 15 * SIZE(AO3) addi AO3, AO3, 16 * SIZE nop DCBT(AO3, PREA) nop FMADD y09, alpha3r, a1, y09 FMADD y10, alpha3i, a1, y10 FMADD y11, alpha3r, a3, y11 FMADD y12, alpha3i, a3, y12 FMADD y13, alpha3r, a5, y13 FMADD y14, alpha3i, a5, y14 FMADD y15, alpha3r, a7, y15 FMADD y16, alpha3i, a7, y16 LFD a1, 0 * SIZE(AO4) LFD a3, 2 * SIZE(AO4) LFD a5, 4 * SIZE(AO4) LFD a7, 6 * SIZE(AO4) FMSUBX y09, alpha3i, a2, y09 FMADDX y10, alpha3r, a2, y10 FMSUBX y11, alpha3i, a4, y11 FMADDX y12, alpha3r, a4, y12 FMSUBX y13, alpha3i, a6, y13 FMADDX y14, alpha3r, a6, y14 FMSUBX y15, alpha3i, a8, y15 FMADDX y16, alpha3r, a8, y16 LFD a2, 1 * SIZE(AO4) LFD a4, 3 * SIZE(AO4) LFD a6, 5 * SIZE(AO4) LFD a8, 7 * SIZE(AO4) FMADD y01, alpha4r, a1, y01 FMADD y02, alpha4i, a1, y02 FMADD y03, alpha4r, a3, y03 FMADD y04, alpha4i, a3, y04 FMADD y05, alpha4r, a5, y05 FMADD y06, alpha4i, a5, y06 FMADD y07, alpha4r, a7, y07 FMADD y08, alpha4i, a7, y08 LFD a1, 8 * SIZE(AO4) LFD a3, 10 * SIZE(AO4) LFD a5, 12 * SIZE(AO4) LFD a7, 14 * SIZE(AO4) FMSUBX y01, alpha4i, a2, y01 FMADDX y02, alpha4r, a2, y02 FMSUBX y03, alpha4i, a4, y03 FMADDX y04, alpha4r, a4, y04 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) STFD y03, 2 * SIZE(Y2) STFD y04, 3 * SIZE(Y2) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) FMSUBX y05, alpha4i, a6, y05 FMADDX y06, alpha4r, a6, y06 FMSUBX y07, alpha4i, a8, y07 FMADDX y08, alpha4r, a8, y08 LFD a2, 9 * SIZE(AO4) LFD a4, 11 * SIZE(AO4) LFD a6, 13 * SIZE(AO4) LFD a8, 15 * SIZE(AO4) addi AO4, AO4, 16 * SIZE nop DCBT(AO4, PREA) nop STFD y05, 4 * SIZE(Y2) STFD y06, 5 * SIZE(Y2) STFD y07, 6 * SIZE(Y2) STFD y08, 7 * 
SIZE(Y2) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) FMADD y09, alpha4r, a1, y09 FMADD y10, alpha4i, a1, y10 FMADD y11, alpha4r, a3, y11 FMADD y12, alpha4i, a3, y12 FMADD y13, alpha4r, a5, y13 FMADD y14, alpha4i, a5, y14 FMADD y15, alpha4r, a7, y15 FMADD y16, alpha4i, a7, y16 LFD a1, 0 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) FMSUBX y09, alpha4i, a2, y09 FMADDX y10, alpha4r, a2, y10 FMSUBX y11, alpha4i, a4, y11 FMADDX y12, alpha4r, a4, y12 STFD y09, 8 * SIZE(Y2) STFD y10, 9 * SIZE(Y2) STFD y11, 10 * SIZE(Y2) STFD y12, 11 * SIZE(Y2) LFD y09, 8 * SIZE(Y1) LFD y10, 9 * SIZE(Y1) LFD y11, 10 * SIZE(Y1) LFD y12, 11 * SIZE(Y1) FMSUBX y13, alpha4i, a6, y13 FMADDX y14, alpha4r, a6, y14 FMSUBX y15, alpha4i, a8, y15 FMADDX y16, alpha4r, a8, y16 LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) STFD y13, 12 * SIZE(Y2) STFD y14, 13 * SIZE(Y2) STFD y15, 14 * SIZE(Y2) STFD y16, 15 * SIZE(Y2) LFD y13, 12 * SIZE(Y1) LFD y14, 13 * SIZE(Y1) LFD y15, 14 * SIZE(Y1) LFD y16, 15 * SIZE(Y1) addi Y2, Y2, 16 * SIZE addi Y1, Y1, 16 * SIZE DCBT(Y1, PREC) bdnz LL(12) .align 4 LL(13): FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 8 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 9 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) FMADD y09, alpha1r, a1, y09 FMADD y10, alpha1i, a1, y10 FMADD y11, alpha1r, a3, y11 FMADD y12, alpha1i, a3, y12 FMADD y13, alpha1r, a5, y13 FMADD y14, alpha1i, a5, y14 FMADD y15, alpha1r, a7, y15 FMADD y16, alpha1i, a7, y16 LFD a1, 0 * SIZE(AO2) LFD a3, 2 * SIZE(AO2) LFD a5, 4 * SIZE(AO2) LFD a7, 6 * SIZE(AO2) FMSUBX y09, alpha1i, a2, y09 FMADDX y10, alpha1r, a2, y10 FMSUBX y11, alpha1i, a4, y11 FMADDX y12, alpha1r, a4, y12 FMSUBX y13, alpha1i, a6, y13 FMADDX y14, alpha1r, a6, y14 FMSUBX y15, alpha1i, a8, y15 FMADDX y16, alpha1r, a8, y16 LFD a2, 1 * SIZE(AO2) LFD a4, 3 * SIZE(AO2) LFD a6, 5 * SIZE(AO2) LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2r, a1, y01 FMADD y02, alpha2i, a1, y02 FMADD y03, alpha2r, a3, y03 FMADD y04, alpha2i, a3, y04 FMADD y05, alpha2r, a5, y05 FMADD y06, alpha2i, a5, y06 FMADD y07, alpha2r, a7, y07 FMADD y08, alpha2i, a7, y08 LFD a1, 8 * SIZE(AO2) LFD a3, 10 * SIZE(AO2) LFD a5, 12 * SIZE(AO2) LFD a7, 14 * SIZE(AO2) FMSUBX y01, alpha2i, a2, y01 FMADDX y02, alpha2r, a2, y02 FMSUBX y03, alpha2i, a4, y03 FMADDX y04, alpha2r, a4, y04 FMSUBX y05, alpha2i, a6, y05 FMADDX y06, alpha2r, a6, y06 FMSUBX y07, alpha2i, a8, y07 FMADDX y08, alpha2r, a8, y08 LFD a2, 9 * SIZE(AO2) LFD a4, 11 * SIZE(AO2) LFD a6, 13 * SIZE(AO2) LFD a8, 15 * SIZE(AO2) FMADD y09, alpha2r, a1, y09 FMADD y10, alpha2i, a1, y10 FMADD y11, alpha2r, a3, y11 FMADD y12, alpha2i, a3, y12 FMADD y13, alpha2r, a5, y13 FMADD y14, alpha2i, a5, y14 FMADD y15, alpha2r, a7, y15 FMADD y16, alpha2i, a7, y16 LFD a1, 0 * SIZE(AO3) LFD a3, 2 * SIZE(AO3) LFD a5, 4 * SIZE(AO3) LFD a7, 6 * SIZE(AO3) FMSUBX y09, alpha2i, a2, y09 FMADDX y10, alpha2r, a2, y10 FMSUBX y11, alpha2i, a4, y11 FMADDX y12, alpha2r, a4, y12 FMSUBX y13, alpha2i, a6, 
y13 FMADDX y14, alpha2r, a6, y14 FMSUBX y15, alpha2i, a8, y15 FMADDX y16, alpha2r, a8, y16 LFD a2, 1 * SIZE(AO3) LFD a4, 3 * SIZE(AO3) LFD a6, 5 * SIZE(AO3) LFD a8, 7 * SIZE(AO3) FMADD y01, alpha3r, a1, y01 FMADD y02, alpha3i, a1, y02 FMADD y03, alpha3r, a3, y03 FMADD y04, alpha3i, a3, y04 FMADD y05, alpha3r, a5, y05 FMADD y06, alpha3i, a5, y06 FMADD y07, alpha3r, a7, y07 FMADD y08, alpha3i, a7, y08 LFD a1, 8 * SIZE(AO3) LFD a3, 10 * SIZE(AO3) LFD a5, 12 * SIZE(AO3) LFD a7, 14 * SIZE(AO3) FMSUBX y01, alpha3i, a2, y01 FMADDX y02, alpha3r, a2, y02 FMSUBX y03, alpha3i, a4, y03 FMADDX y04, alpha3r, a4, y04 FMSUBX y05, alpha3i, a6, y05 FMADDX y06, alpha3r, a6, y06 FMSUBX y07, alpha3i, a8, y07 FMADDX y08, alpha3r, a8, y08 LFD a2, 9 * SIZE(AO3) LFD a4, 11 * SIZE(AO3) LFD a6, 13 * SIZE(AO3) LFD a8, 15 * SIZE(AO3) FMADD y09, alpha3r, a1, y09 FMADD y10, alpha3i, a1, y10 FMADD y11, alpha3r, a3, y11 FMADD y12, alpha3i, a3, y12 FMADD y13, alpha3r, a5, y13 FMADD y14, alpha3i, a5, y14 FMADD y15, alpha3r, a7, y15 FMADD y16, alpha3i, a7, y16 LFD a1, 0 * SIZE(AO4) LFD a3, 2 * SIZE(AO4) LFD a5, 4 * SIZE(AO4) LFD a7, 6 * SIZE(AO4) FMSUBX y09, alpha3i, a2, y09 FMADDX y10, alpha3r, a2, y10 FMSUBX y11, alpha3i, a4, y11 FMADDX y12, alpha3r, a4, y12 FMSUBX y13, alpha3i, a6, y13 FMADDX y14, alpha3r, a6, y14 FMSUBX y15, alpha3i, a8, y15 FMADDX y16, alpha3r, a8, y16 LFD a2, 1 * SIZE(AO4) LFD a4, 3 * SIZE(AO4) LFD a6, 5 * SIZE(AO4) LFD a8, 7 * SIZE(AO4) FMADD y01, alpha4r, a1, y01 FMADD y02, alpha4i, a1, y02 FMADD y03, alpha4r, a3, y03 FMADD y04, alpha4i, a3, y04 FMADD y05, alpha4r, a5, y05 FMADD y06, alpha4i, a5, y06 FMADD y07, alpha4r, a7, y07 FMADD y08, alpha4i, a7, y08 LFD a1, 8 * SIZE(AO4) LFD a3, 10 * SIZE(AO4) LFD a5, 12 * SIZE(AO4) LFD a7, 14 * SIZE(AO4) FMSUBX y01, alpha4i, a2, y01 FMADDX y02, alpha4r, a2, y02 FMSUBX y03, alpha4i, a4, y03 FMADDX y04, alpha4r, a4, y04 FMSUBX y05, alpha4i, a6, y05 FMADDX y06, alpha4r, a6, y06 FMSUBX y07, alpha4i, a8, y07 FMADDX y08, alpha4r, a8, y08 LFD a2, 9 * SIZE(AO4) LFD a4, 11 * SIZE(AO4) LFD a6, 13 * SIZE(AO4) LFD a8, 15 * SIZE(AO4) FMADD y09, alpha4r, a1, y09 FMADD y10, alpha4i, a1, y10 FMADD y11, alpha4r, a3, y11 FMADD y12, alpha4i, a3, y12 FMADD y13, alpha4r, a5, y13 FMADD y14, alpha4i, a5, y14 FMADD y15, alpha4r, a7, y15 FMADD y16, alpha4i, a7, y16 LFD a1, 16 * SIZE(AO1) LFD a3, 18 * SIZE(AO1) LFD a5, 20 * SIZE(AO1) LFD a7, 22 * SIZE(AO1) FMSUBX y09, alpha4i, a2, y09 FMADDX y10, alpha4r, a2, y10 FMSUBX y11, alpha4i, a4, y11 FMADDX y12, alpha4r, a4, y12 FMSUBX y13, alpha4i, a6, y13 FMADDX y14, alpha4r, a6, y14 FMSUBX y15, alpha4i, a8, y15 FMADDX y16, alpha4r, a8, y16 LFD a2, 17 * SIZE(AO1) LFD a4, 19 * SIZE(AO1) LFD a6, 21 * SIZE(AO1) LFD a8, 23 * SIZE(AO1) addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi AO3, AO3, 16 * SIZE addi AO4, AO4, 16 * SIZE STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) STFD y03, 2 * SIZE(Y2) STFD y04, 3 * SIZE(Y2) STFD y05, 4 * SIZE(Y2) STFD y06, 5 * SIZE(Y2) STFD y07, 6 * SIZE(Y2) STFD y08, 7 * SIZE(Y2) STFD y09, 8 * SIZE(Y2) STFD y10, 9 * SIZE(Y2) STFD y11, 10 * SIZE(Y2) STFD y12, 11 * SIZE(Y2) STFD y13, 12 * SIZE(Y2) STFD y14, 13 * SIZE(Y2) STFD y15, 14 * SIZE(Y2) STFD y16, 15 * SIZE(Y2) addi Y2, Y2, 16 * SIZE .align 4 LL(15): andi. r0, M, 7 ble LL(19) andi. 
r0, M, 4 ble LL(16) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 0 * SIZE(AO2) LFD a3, 2 * SIZE(AO2) LFD a5, 4 * SIZE(AO2) LFD a7, 6 * SIZE(AO2) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 1 * SIZE(AO2) LFD a4, 3 * SIZE(AO2) LFD a6, 5 * SIZE(AO2) LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2r, a1, y01 FMADD y02, alpha2i, a1, y02 FMADD y03, alpha2r, a3, y03 FMADD y04, alpha2i, a3, y04 FMADD y05, alpha2r, a5, y05 FMADD y06, alpha2i, a5, y06 FMADD y07, alpha2r, a7, y07 FMADD y08, alpha2i, a7, y08 LFD a1, 0 * SIZE(AO3) LFD a3, 2 * SIZE(AO3) LFD a5, 4 * SIZE(AO3) LFD a7, 6 * SIZE(AO3) FMSUBX y01, alpha2i, a2, y01 FMADDX y02, alpha2r, a2, y02 FMSUBX y03, alpha2i, a4, y03 FMADDX y04, alpha2r, a4, y04 FMSUBX y05, alpha2i, a6, y05 FMADDX y06, alpha2r, a6, y06 FMSUBX y07, alpha2i, a8, y07 FMADDX y08, alpha2r, a8, y08 LFD a2, 1 * SIZE(AO3) LFD a4, 3 * SIZE(AO3) LFD a6, 5 * SIZE(AO3) LFD a8, 7 * SIZE(AO3) FMADD y01, alpha3r, a1, y01 FMADD y02, alpha3i, a1, y02 FMADD y03, alpha3r, a3, y03 FMADD y04, alpha3i, a3, y04 FMADD y05, alpha3r, a5, y05 FMADD y06, alpha3i, a5, y06 FMADD y07, alpha3r, a7, y07 FMADD y08, alpha3i, a7, y08 LFD a1, 0 * SIZE(AO4) LFD a3, 2 * SIZE(AO4) LFD a5, 4 * SIZE(AO4) LFD a7, 6 * SIZE(AO4) FMSUBX y01, alpha3i, a2, y01 FMADDX y02, alpha3r, a2, y02 FMSUBX y03, alpha3i, a4, y03 FMADDX y04, alpha3r, a4, y04 FMSUBX y05, alpha3i, a6, y05 FMADDX y06, alpha3r, a6, y06 FMSUBX y07, alpha3i, a8, y07 FMADDX y08, alpha3r, a8, y08 LFD a2, 1 * SIZE(AO4) LFD a4, 3 * SIZE(AO4) LFD a6, 5 * SIZE(AO4) LFD a8, 7 * SIZE(AO4) FMADD y01, alpha4r, a1, y01 FMADD y02, alpha4i, a1, y02 FMADD y03, alpha4r, a3, y03 FMADD y04, alpha4i, a3, y04 FMADD y05, alpha4r, a5, y05 FMADD y06, alpha4i, a5, y06 FMADD y07, alpha4r, a7, y07 FMADD y08, alpha4i, a7, y08 FMSUBX y01, alpha4i, a2, y01 FMADDX y02, alpha4r, a2, y02 FMSUBX y03, alpha4i, a4, y03 FMADDX y04, alpha4r, a4, y04 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) STFD y03, 2 * SIZE(Y2) STFD y04, 3 * SIZE(Y2) FMSUBX y05, alpha4i, a6, y05 FMADDX y06, alpha4r, a6, y06 FMSUBX y07, alpha4i, a8, y07 FMADDX y08, alpha4r, a8, y08 STFD y05, 4 * SIZE(Y2) STFD y06, 5 * SIZE(Y2) STFD y07, 6 * SIZE(Y2) STFD y08, 7 * SIZE(Y2) addi AO1, AO1, 8 * SIZE addi AO2, AO2, 8 * SIZE addi AO3, AO3, 8 * SIZE addi AO4, AO4, 8 * SIZE addi Y1, Y1, 8 * SIZE addi Y2, Y2, 8 * SIZE .align 4 LL(16): andi. 
r0, M, 2 nop nop ble LL(17) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a5, 0 * SIZE(AO2) LFD a6, 1 * SIZE(AO2) LFD a7, 2 * SIZE(AO2) LFD a8, 3 * SIZE(AO2) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 LFD a1, 0 * SIZE(AO3) LFD a2, 1 * SIZE(AO3) LFD a3, 2 * SIZE(AO3) LFD a4, 3 * SIZE(AO3) FMADD y01, alpha2r, a5, y01 FMADD y02, alpha2i, a5, y02 FMADD y03, alpha2r, a7, y03 FMADD y04, alpha2i, a7, y04 FMSUBX y01, alpha2i, a6, y01 FMADDX y02, alpha2r, a6, y02 FMSUBX y03, alpha2i, a8, y03 FMADDX y04, alpha2r, a8, y04 LFD a5, 0 * SIZE(AO4) LFD a6, 1 * SIZE(AO4) LFD a7, 2 * SIZE(AO4) LFD a8, 3 * SIZE(AO4) FMADD y01, alpha3r, a1, y01 FMADD y02, alpha3i, a1, y02 FMADD y03, alpha3r, a3, y03 FMADD y04, alpha3i, a3, y04 FMSUBX y01, alpha3i, a2, y01 FMADDX y02, alpha3r, a2, y02 FMSUBX y03, alpha3i, a4, y03 FMADDX y04, alpha3r, a4, y04 FMADD y01, alpha4r, a5, y01 FMADD y02, alpha4i, a5, y02 FMADD y03, alpha4r, a7, y03 FMADD y04, alpha4i, a7, y04 FMSUBX y01, alpha4i, a6, y01 FMADDX y02, alpha4r, a6, y02 FMSUBX y03, alpha4i, a8, y03 FMADDX y04, alpha4r, a8, y04 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) STFD y03, 2 * SIZE(Y2) STFD y04, 3 * SIZE(Y2) addi AO1, AO1, 4 * SIZE addi AO2, AO2, 4 * SIZE addi AO3, AO3, 4 * SIZE addi AO4, AO4, 4 * SIZE addi Y1, Y1, 4 * SIZE addi Y2, Y2, 4 * SIZE .align 4 LL(17): andi. r0, M, 1 ble LL(19) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 0 * SIZE(AO2) LFD a4, 1 * SIZE(AO2) LFD a5, 0 * SIZE(AO3) LFD a6, 1 * SIZE(AO3) LFD a7, 0 * SIZE(AO4) LFD a8, 1 * SIZE(AO4) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMADD y01, alpha2r, a3, y01 FMADD y02, alpha2i, a3, y02 FMSUBX y01, alpha2i, a4, y01 FMADDX y02, alpha2r, a4, y02 FMADD y01, alpha3r, a5, y01 FMADD y02, alpha3i, a5, y02 FMSUBX y01, alpha3i, a6, y01 FMADDX y02, alpha3r, a6, y02 FMADD y01, alpha4r, a7, y01 FMADD y02, alpha4i, a7, y02 FMSUBX y01, alpha4i, a8, y01 FMADDX y02, alpha4r, a8, y02 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) add Y1, Y1, INCY add Y2, Y2, INCY .align 4 LL(19): addi J, J, -1 cmpi cr0, 0, J, 0 bgt LL(11) .align 4 LL(20): andi. J, N, 2 ble LL(30) .align 4 LL(21): lfd alpha_r, ALPHA_R lfd alpha_i, ALPHA_I LFD a1, 0 * SIZE(X) LFD a2, 1 * SIZE(X) add X, X, INCX LFD a3, 0 * SIZE(X) LFD a4, 1 * SIZE(X) add X, X, INCX FMUL alpha1r, alpha_r, a1 FMUL alpha1i, alpha_i, a1 FMUL alpha2r, alpha_r, a3 FMUL alpha2i, alpha_i, a3 FMSUBR alpha1r, alpha_i, a2, alpha1r FMADDR alpha1i, alpha_r, a2, alpha1i FMSUBR alpha2r, alpha_i, a4, alpha2r FMADDR alpha2i, alpha_r, a4, alpha2i mr AO1, A add AO2, A, LDA add A, AO2, LDA mr Y1, Y mr Y2, Y srawi. 
r0, M, 3 mtspr CTR, r0 ble LL(25) .align 4 LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD y09, 8 * SIZE(Y1) LFD y10, 9 * SIZE(Y1) LFD y11, 10 * SIZE(Y1) LFD y12, 11 * SIZE(Y1) LFD y13, 12 * SIZE(Y1) LFD y14, 13 * SIZE(Y1) LFD y15, 14 * SIZE(Y1) LFD y16, 15 * SIZE(Y1) addi Y1, Y1, 16 * SIZE bdz LL(23) .align 4 LL(22): FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 8 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 9 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) addi AO1, AO1, 16 * SIZE nop DCBT(AO1, PREA) nop FMADD y09, alpha1r, a1, y09 FMADD y10, alpha1i, a1, y10 FMADD y11, alpha1r, a3, y11 FMADD y12, alpha1i, a3, y12 FMADD y13, alpha1r, a5, y13 FMADD y14, alpha1i, a5, y14 FMADD y15, alpha1r, a7, y15 FMADD y16, alpha1i, a7, y16 LFD a1, 0 * SIZE(AO2) LFD a3, 2 * SIZE(AO2) LFD a5, 4 * SIZE(AO2) LFD a7, 6 * SIZE(AO2) FMSUBX y09, alpha1i, a2, y09 FMADDX y10, alpha1r, a2, y10 FMSUBX y11, alpha1i, a4, y11 FMADDX y12, alpha1r, a4, y12 FMSUBX y13, alpha1i, a6, y13 FMADDX y14, alpha1r, a6, y14 FMSUBX y15, alpha1i, a8, y15 FMADDX y16, alpha1r, a8, y16 LFD a2, 1 * SIZE(AO2) LFD a4, 3 * SIZE(AO2) LFD a6, 5 * SIZE(AO2) LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2r, a1, y01 FMADD y02, alpha2i, a1, y02 FMADD y03, alpha2r, a3, y03 FMADD y04, alpha2i, a3, y04 FMADD y05, alpha2r, a5, y05 FMADD y06, alpha2i, a5, y06 FMADD y07, alpha2r, a7, y07 FMADD y08, alpha2i, a7, y08 LFD a1, 8 * SIZE(AO2) LFD a3, 10 * SIZE(AO2) LFD a5, 12 * SIZE(AO2) LFD a7, 14 * SIZE(AO2) FMSUBX y01, alpha2i, a2, y01 FMADDX y02, alpha2r, a2, y02 FMSUBX y03, alpha2i, a4, y03 FMADDX y04, alpha2r, a4, y04 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) STFD y03, 2 * SIZE(Y2) STFD y04, 3 * SIZE(Y2) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) FMSUBX y05, alpha2i, a6, y05 FMADDX y06, alpha2r, a6, y06 FMSUBX y07, alpha2i, a8, y07 FMADDX y08, alpha2r, a8, y08 LFD a2, 9 * SIZE(AO2) LFD a4, 11 * SIZE(AO2) LFD a6, 13 * SIZE(AO2) LFD a8, 15 * SIZE(AO2) STFD y05, 4 * SIZE(Y2) STFD y06, 5 * SIZE(Y2) STFD y07, 6 * SIZE(Y2) STFD y08, 7 * SIZE(Y2) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) addi AO2, AO2, 16 * SIZE nop DCBT(AO2, PREA) nop FMADD y09, alpha2r, a1, y09 FMADD y10, alpha2i, a1, y10 FMADD y11, alpha2r, a3, y11 FMADD y12, alpha2i, a3, y12 FMADD y13, alpha2r, a5, y13 FMADD y14, alpha2i, a5, y14 FMADD y15, alpha2r, a7, y15 FMADD y16, alpha2i, a7, y16 LFD a1, 0 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) FMSUBX y09, alpha2i, a2, y09 FMADDX y10, alpha2r, a2, y10 FMSUBX y11, alpha2i, a4, y11 FMADDX y12, alpha2r, a4, y12 STFD y09, 8 * SIZE(Y2) STFD y10, 9 * SIZE(Y2) STFD y11, 10 * SIZE(Y2) STFD y12, 11 * SIZE(Y2) LFD y09, 8 * SIZE(Y1) LFD y10, 9 * 
SIZE(Y1) LFD y11, 10 * SIZE(Y1) LFD y12, 11 * SIZE(Y1) FMSUBX y13, alpha2i, a6, y13 FMADDX y14, alpha2r, a6, y14 FMSUBX y15, alpha2i, a8, y15 FMADDX y16, alpha2r, a8, y16 LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) STFD y13, 12 * SIZE(Y2) STFD y14, 13 * SIZE(Y2) STFD y15, 14 * SIZE(Y2) STFD y16, 15 * SIZE(Y2) LFD y13, 12 * SIZE(Y1) LFD y14, 13 * SIZE(Y1) LFD y15, 14 * SIZE(Y1) LFD y16, 15 * SIZE(Y1) addi Y2, Y2, 16 * SIZE addi Y1, Y1, 16 * SIZE DCBT(Y1, PREC) bdnz LL(22) .align 4 LL(23): FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 8 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 9 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) FMADD y09, alpha1r, a1, y09 FMADD y10, alpha1i, a1, y10 FMADD y11, alpha1r, a3, y11 FMADD y12, alpha1i, a3, y12 FMADD y13, alpha1r, a5, y13 FMADD y14, alpha1i, a5, y14 FMADD y15, alpha1r, a7, y15 FMADD y16, alpha1i, a7, y16 LFD a1, 0 * SIZE(AO2) LFD a3, 2 * SIZE(AO2) LFD a5, 4 * SIZE(AO2) LFD a7, 6 * SIZE(AO2) FMSUBX y09, alpha1i, a2, y09 FMADDX y10, alpha1r, a2, y10 FMSUBX y11, alpha1i, a4, y11 FMADDX y12, alpha1r, a4, y12 FMSUBX y13, alpha1i, a6, y13 FMADDX y14, alpha1r, a6, y14 FMSUBX y15, alpha1i, a8, y15 FMADDX y16, alpha1r, a8, y16 LFD a2, 1 * SIZE(AO2) LFD a4, 3 * SIZE(AO2) LFD a6, 5 * SIZE(AO2) LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2r, a1, y01 FMADD y02, alpha2i, a1, y02 FMADD y03, alpha2r, a3, y03 FMADD y04, alpha2i, a3, y04 FMADD y05, alpha2r, a5, y05 FMADD y06, alpha2i, a5, y06 FMADD y07, alpha2r, a7, y07 FMADD y08, alpha2i, a7, y08 LFD a1, 8 * SIZE(AO2) LFD a3, 10 * SIZE(AO2) LFD a5, 12 * SIZE(AO2) LFD a7, 14 * SIZE(AO2) FMSUBX y01, alpha2i, a2, y01 FMADDX y02, alpha2r, a2, y02 FMSUBX y03, alpha2i, a4, y03 FMADDX y04, alpha2r, a4, y04 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) STFD y03, 2 * SIZE(Y2) STFD y04, 3 * SIZE(Y2) FMSUBX y05, alpha2i, a6, y05 FMADDX y06, alpha2r, a6, y06 FMSUBX y07, alpha2i, a8, y07 FMADDX y08, alpha2r, a8, y08 LFD a2, 9 * SIZE(AO2) LFD a4, 11 * SIZE(AO2) LFD a6, 13 * SIZE(AO2) LFD a8, 15 * SIZE(AO2) STFD y05, 4 * SIZE(Y2) STFD y06, 5 * SIZE(Y2) STFD y07, 6 * SIZE(Y2) STFD y08, 7 * SIZE(Y2) FMADD y09, alpha2r, a1, y09 FMADD y10, alpha2i, a1, y10 FMADD y11, alpha2r, a3, y11 FMADD y12, alpha2i, a3, y12 FMADD y13, alpha2r, a5, y13 FMADD y14, alpha2i, a5, y14 FMADD y15, alpha2r, a7, y15 FMADD y16, alpha2i, a7, y16 FMSUBX y09, alpha2i, a2, y09 FMADDX y10, alpha2r, a2, y10 FMSUBX y11, alpha2i, a4, y11 FMADDX y12, alpha2r, a4, y12 FMSUBX y13, alpha2i, a6, y13 FMADDX y14, alpha2r, a6, y14 FMSUBX y15, alpha2i, a8, y15 FMADDX y16, alpha2r, a8, y16 STFD y09, 8 * SIZE(Y2) STFD y10, 9 * SIZE(Y2) STFD y11, 10 * SIZE(Y2) STFD y12, 11 * SIZE(Y2) STFD y13, 12 * SIZE(Y2) STFD y14, 13 * SIZE(Y2) STFD y15, 14 * SIZE(Y2) STFD y16, 15 * SIZE(Y2) addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi Y2, Y2, 16 * SIZE .align 4 LL(25): andi. r0, M, 7 ble LL(30) andi. 
r0, M, 4 ble LL(26) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 0 * SIZE(AO2) LFD a3, 2 * SIZE(AO2) LFD a5, 4 * SIZE(AO2) LFD a7, 6 * SIZE(AO2) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 1 * SIZE(AO2) LFD a4, 3 * SIZE(AO2) LFD a6, 5 * SIZE(AO2) LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2r, a1, y01 FMADD y02, alpha2i, a1, y02 FMADD y03, alpha2r, a3, y03 FMADD y04, alpha2i, a3, y04 FMADD y05, alpha2r, a5, y05 FMADD y06, alpha2i, a5, y06 FMADD y07, alpha2r, a7, y07 FMADD y08, alpha2i, a7, y08 FMSUBX y01, alpha2i, a2, y01 FMADDX y02, alpha2r, a2, y02 FMSUBX y03, alpha2i, a4, y03 FMADDX y04, alpha2r, a4, y04 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) STFD y03, 2 * SIZE(Y2) STFD y04, 3 * SIZE(Y2) FMSUBX y05, alpha2i, a6, y05 FMADDX y06, alpha2r, a6, y06 FMSUBX y07, alpha2i, a8, y07 FMADDX y08, alpha2r, a8, y08 STFD y05, 4 * SIZE(Y2) STFD y06, 5 * SIZE(Y2) STFD y07, 6 * SIZE(Y2) STFD y08, 7 * SIZE(Y2) addi AO1, AO1, 8 * SIZE addi AO2, AO2, 8 * SIZE addi Y1, Y1, 8 * SIZE addi Y2, Y2, 8 * SIZE .align 4 LL(26): andi. r0, M, 2 ble LL(27) LFD a1, 0 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a5, 0 * SIZE(AO2) LFD a7, 2 * SIZE(AO2) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 1 * SIZE(AO2) LFD a8, 3 * SIZE(AO2) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMADD y01, alpha2r, a5, y01 FMADD y02, alpha2i, a5, y02 FMADD y03, alpha2r, a7, y03 FMADD y04, alpha2i, a7, y04 FMSUBX y01, alpha2i, a6, y01 FMADDX y02, alpha2r, a6, y02 FMSUBX y03, alpha2i, a8, y03 FMADDX y04, alpha2r, a8, y04 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) STFD y03, 2 * SIZE(Y2) STFD y04, 3 * SIZE(Y2) addi AO1, AO1, 4 * SIZE addi AO2, AO2, 4 * SIZE addi Y1, Y1, 4 * SIZE addi Y2, Y2, 4 * SIZE .align 4 LL(27): andi. r0, M, 1 ble LL(30) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 0 * SIZE(AO2) LFD a4, 1 * SIZE(AO2) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMADD y01, alpha2r, a3, y01 FMADD y02, alpha2i, a3, y02 FMSUBX y01, alpha2i, a4, y01 FMADDX y02, alpha2r, a4, y02 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) add Y1, Y1, INCY add Y2, Y2, INCY .align 4 LL(30): andi. J, N, 1 ble LL(999) .align 4 LL(31): lfd alpha_r, ALPHA_R lfd alpha_i, ALPHA_I LFD a1, 0 * SIZE(X) LFD a2, 1 * SIZE(X) add X, X, INCX FMUL alpha1r, alpha_r, a1 FMUL alpha1i, alpha_i, a1 FMSUBR alpha1r, alpha_i, a2, alpha1r FMADDR alpha1i, alpha_r, a2, alpha1i mr AO1, A add A, AO1, LDA mr Y1, Y mr Y2, Y srawi. 
r0, M, 3 mtspr CTR, r0 ble LL(35) .align 4 LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD y09, 8 * SIZE(Y1) LFD y10, 9 * SIZE(Y1) LFD y11, 10 * SIZE(Y1) LFD y12, 11 * SIZE(Y1) LFD y13, 12 * SIZE(Y1) LFD y14, 13 * SIZE(Y1) LFD y15, 14 * SIZE(Y1) LFD y16, 15 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) addi Y1, Y1, 16 * SIZE bdz LL(33) .align 4 LL(32): FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 8 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) STFD y03, 2 * SIZE(Y2) STFD y04, 3 * SIZE(Y2) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 9 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) addi AO1, AO1, 16 * SIZE nop DCBT(AO1, PREA) nop STFD y05, 4 * SIZE(Y2) STFD y06, 5 * SIZE(Y2) STFD y07, 6 * SIZE(Y2) STFD y08, 7 * SIZE(Y2) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) FMADD y09, alpha1r, a1, y09 FMADD y10, alpha1i, a1, y10 FMADD y11, alpha1r, a3, y11 FMADD y12, alpha1i, a3, y12 FMADD y13, alpha1r, a5, y13 FMADD y14, alpha1i, a5, y14 FMADD y15, alpha1r, a7, y15 FMADD y16, alpha1i, a7, y16 LFD a1, 0 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) FMSUBX y09, alpha1i, a2, y09 FMADDX y10, alpha1r, a2, y10 FMSUBX y11, alpha1i, a4, y11 FMADDX y12, alpha1r, a4, y12 STFD y09, 8 * SIZE(Y2) STFD y10, 9 * SIZE(Y2) STFD y11, 10 * SIZE(Y2) STFD y12, 11 * SIZE(Y2) LFD y09, 8 * SIZE(Y1) LFD y10, 9 * SIZE(Y1) LFD y11, 10 * SIZE(Y1) LFD y12, 11 * SIZE(Y1) FMSUBX y13, alpha1i, a6, y13 FMADDX y14, alpha1r, a6, y14 FMSUBX y15, alpha1i, a8, y15 FMADDX y16, alpha1r, a8, y16 LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) STFD y13, 12 * SIZE(Y2) STFD y14, 13 * SIZE(Y2) STFD y15, 14 * SIZE(Y2) STFD y16, 15 * SIZE(Y2) LFD y13, 12 * SIZE(Y1) LFD y14, 13 * SIZE(Y1) LFD y15, 14 * SIZE(Y1) LFD y16, 15 * SIZE(Y1) addi Y1, Y1, 16 * SIZE addi Y2, Y2, 16 * SIZE DCBT(Y1, PREC) bdnz LL(32) .align 4 LL(33): FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 8 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) STFD y03, 2 * SIZE(Y2) STFD y04, 3 * SIZE(Y2) FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 9 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) STFD y05, 4 * SIZE(Y2) STFD y06, 5 * SIZE(Y2) STFD y07, 
6 * SIZE(Y2) STFD y08, 7 * SIZE(Y2) FMADD y09, alpha1r, a1, y09 FMADD y10, alpha1i, a1, y10 FMADD y11, alpha1r, a3, y11 FMADD y12, alpha1i, a3, y12 FMADD y13, alpha1r, a5, y13 FMADD y14, alpha1i, a5, y14 FMADD y15, alpha1r, a7, y15 FMADD y16, alpha1i, a7, y16 FMSUBX y09, alpha1i, a2, y09 FMADDX y10, alpha1r, a2, y10 FMSUBX y11, alpha1i, a4, y11 FMADDX y12, alpha1r, a4, y12 STFD y09, 8 * SIZE(Y2) STFD y10, 9 * SIZE(Y2) STFD y11, 10 * SIZE(Y2) STFD y12, 11 * SIZE(Y2) FMSUBX y13, alpha1i, a6, y13 FMADDX y14, alpha1r, a6, y14 FMSUBX y15, alpha1i, a8, y15 FMADDX y16, alpha1r, a8, y16 STFD y13, 12 * SIZE(Y2) STFD y14, 13 * SIZE(Y2) STFD y15, 14 * SIZE(Y2) STFD y16, 15 * SIZE(Y2) addi AO1, AO1, 16 * SIZE addi Y2, Y2, 16 * SIZE .align 4 LL(35): andi. r0, M, 7 ble LL(999) andi. r0, M, 4 ble LL(36) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) STFD y03, 2 * SIZE(Y2) STFD y04, 3 * SIZE(Y2) STFD y05, 4 * SIZE(Y2) STFD y06, 5 * SIZE(Y2) STFD y07, 6 * SIZE(Y2) STFD y08, 7 * SIZE(Y2) addi AO1, AO1, 8 * SIZE addi Y1, Y1, 8 * SIZE addi Y2, Y2, 8 * SIZE .align 4 LL(36): andi. r0, M, 2 ble LL(37) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) STFD y03, 2 * SIZE(Y2) STFD y04, 3 * SIZE(Y2) addi AO1, AO1, 4 * SIZE addi Y1, Y1, 4 * SIZE addi Y2, Y2, 4 * SIZE .align 4 LL(37): andi. r0, M, 1 ble LL(999) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) add Y1, Y1, INCY add Y2, Y2, INCY b LL(999) .align 4 LL(100): srawi. 
J, N, 2 ble LL(120) .align 4 LL(111): lfd alpha_r, ALPHA_R lfd alpha_i, ALPHA_I LFD a1, 0 * SIZE(X) LFD a2, 1 * SIZE(X) add X, X, INCX LFD a3, 0 * SIZE(X) LFD a4, 1 * SIZE(X) add X, X, INCX LFD a5, 0 * SIZE(X) LFD a6, 1 * SIZE(X) add X, X, INCX LFD a7, 0 * SIZE(X) LFD a8, 1 * SIZE(X) add X, X, INCX FMUL alpha1r, alpha_r, a1 FMUL alpha1i, alpha_i, a1 FMUL alpha2r, alpha_r, a3 FMUL alpha2i, alpha_i, a3 FMUL alpha3r, alpha_r, a5 FMUL alpha3i, alpha_i, a5 FMUL alpha4r, alpha_r, a7 FMUL alpha4i, alpha_i, a7 FMSUBR alpha1r, alpha_i, a2, alpha1r FMADDR alpha1i, alpha_r, a2, alpha1i FMSUBR alpha2r, alpha_i, a4, alpha2r FMADDR alpha2i, alpha_r, a4, alpha2i FMSUBR alpha3r, alpha_i, a6, alpha3r FMADDR alpha3i, alpha_r, a6, alpha3i FMSUBR alpha4r, alpha_i, a8, alpha4r FMADDR alpha4i, alpha_r, a8, alpha4i mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA mr Y1, Y mr Y2, Y srawi. r0, M, 3 mtspr CTR, r0 ble LL(115) .align 4 LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y03, 0 * SIZE(Y1) LFD y04, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y05, 0 * SIZE(Y1) LFD y06, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y07, 0 * SIZE(Y1) LFD y08, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y09, 0 * SIZE(Y1) LFD y10, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y11, 0 * SIZE(Y1) LFD y12, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y13, 0 * SIZE(Y1) LFD y14, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y15, 0 * SIZE(Y1) LFD y16, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) bdz LL(113) .align 4 LL(112): FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 8 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 9 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) addi AO1, AO1, 16 * SIZE nop DCBT(AO1, PREA) nop FMADD y09, alpha1r, a1, y09 FMADD y10, alpha1i, a1, y10 FMADD y11, alpha1r, a3, y11 FMADD y12, alpha1i, a3, y12 FMADD y13, alpha1r, a5, y13 FMADD y14, alpha1i, a5, y14 FMADD y15, alpha1r, a7, y15 FMADD y16, alpha1i, a7, y16 LFD a1, 0 * SIZE(AO2) LFD a3, 2 * SIZE(AO2) LFD a5, 4 * SIZE(AO2) LFD a7, 6 * SIZE(AO2) FMSUBX y09, alpha1i, a2, y09 FMADDX y10, alpha1r, a2, y10 FMSUBX y11, alpha1i, a4, y11 FMADDX y12, alpha1r, a4, y12 FMSUBX y13, alpha1i, a6, y13 FMADDX y14, alpha1r, a6, y14 FMSUBX y15, alpha1i, a8, y15 FMADDX y16, alpha1r, a8, y16 LFD a2, 1 * SIZE(AO2) LFD a4, 3 * SIZE(AO2) LFD a6, 5 * SIZE(AO2) LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2r, a1, y01 FMADD y02, alpha2i, a1, y02 FMADD y03, alpha2r, a3, y03 FMADD y04, alpha2i, a3, y04 FMADD y05, alpha2r, a5, y05 FMADD y06, alpha2i, a5, y06 FMADD y07, alpha2r, a7, y07 FMADD y08, alpha2i, a7, y08 LFD a1, 8 * SIZE(AO2) LFD a3, 10 * SIZE(AO2) LFD a5, 12 * SIZE(AO2) LFD a7, 14 * SIZE(AO2) FMSUBX y01, alpha2i, a2, y01 FMADDX y02, alpha2r, a2, y02 FMSUBX y03, alpha2i, a4, y03 FMADDX y04, alpha2r, a4, y04 FMSUBX y05, alpha2i, a6, y05 FMADDX y06, alpha2r, a6, y06 FMSUBX y07, alpha2i, a8, y07 FMADDX y08, alpha2r, a8, y08 LFD a2, 9 * SIZE(AO2) LFD a4, 11 * 
SIZE(AO2) LFD a6, 13 * SIZE(AO2) LFD a8, 15 * SIZE(AO2) addi AO2, AO2, 16 * SIZE nop DCBT(AO2, PREA) nop FMADD y09, alpha2r, a1, y09 FMADD y10, alpha2i, a1, y10 FMADD y11, alpha2r, a3, y11 FMADD y12, alpha2i, a3, y12 FMADD y13, alpha2r, a5, y13 FMADD y14, alpha2i, a5, y14 FMADD y15, alpha2r, a7, y15 FMADD y16, alpha2i, a7, y16 LFD a1, 0 * SIZE(AO3) LFD a3, 2 * SIZE(AO3) LFD a5, 4 * SIZE(AO3) LFD a7, 6 * SIZE(AO3) FMSUBX y09, alpha2i, a2, y09 FMADDX y10, alpha2r, a2, y10 FMSUBX y11, alpha2i, a4, y11 FMADDX y12, alpha2r, a4, y12 FMSUBX y13, alpha2i, a6, y13 FMADDX y14, alpha2r, a6, y14 FMSUBX y15, alpha2i, a8, y15 FMADDX y16, alpha2r, a8, y16 LFD a2, 1 * SIZE(AO3) LFD a4, 3 * SIZE(AO3) LFD a6, 5 * SIZE(AO3) LFD a8, 7 * SIZE(AO3) FMADD y01, alpha3r, a1, y01 FMADD y02, alpha3i, a1, y02 FMADD y03, alpha3r, a3, y03 FMADD y04, alpha3i, a3, y04 FMADD y05, alpha3r, a5, y05 FMADD y06, alpha3i, a5, y06 FMADD y07, alpha3r, a7, y07 FMADD y08, alpha3i, a7, y08 LFD a1, 8 * SIZE(AO3) LFD a3, 10 * SIZE(AO3) LFD a5, 12 * SIZE(AO3) LFD a7, 14 * SIZE(AO3) FMSUBX y01, alpha3i, a2, y01 FMADDX y02, alpha3r, a2, y02 FMSUBX y03, alpha3i, a4, y03 FMADDX y04, alpha3r, a4, y04 FMSUBX y05, alpha3i, a6, y05 FMADDX y06, alpha3r, a6, y06 FMSUBX y07, alpha3i, a8, y07 FMADDX y08, alpha3r, a8, y08 LFD a2, 9 * SIZE(AO3) LFD a4, 11 * SIZE(AO3) LFD a6, 13 * SIZE(AO3) LFD a8, 15 * SIZE(AO3) addi AO3, AO3, 16 * SIZE nop DCBT(AO3, PREA) nop FMADD y09, alpha3r, a1, y09 FMADD y10, alpha3i, a1, y10 FMADD y11, alpha3r, a3, y11 FMADD y12, alpha3i, a3, y12 FMADD y13, alpha3r, a5, y13 FMADD y14, alpha3i, a5, y14 FMADD y15, alpha3r, a7, y15 FMADD y16, alpha3i, a7, y16 LFD a1, 0 * SIZE(AO4) LFD a3, 2 * SIZE(AO4) LFD a5, 4 * SIZE(AO4) LFD a7, 6 * SIZE(AO4) FMSUBX y09, alpha3i, a2, y09 FMADDX y10, alpha3r, a2, y10 FMSUBX y11, alpha3i, a4, y11 FMADDX y12, alpha3r, a4, y12 FMSUBX y13, alpha3i, a6, y13 FMADDX y14, alpha3r, a6, y14 FMSUBX y15, alpha3i, a8, y15 FMADDX y16, alpha3r, a8, y16 LFD a2, 1 * SIZE(AO4) LFD a4, 3 * SIZE(AO4) LFD a6, 5 * SIZE(AO4) LFD a8, 7 * SIZE(AO4) FMADD y01, alpha4r, a1, y01 FMADD y02, alpha4i, a1, y02 FMADD y03, alpha4r, a3, y03 FMADD y04, alpha4i, a3, y04 FMADD y05, alpha4r, a5, y05 FMADD y06, alpha4i, a5, y06 FMADD y07, alpha4r, a7, y07 FMADD y08, alpha4i, a7, y08 LFD a1, 8 * SIZE(AO4) LFD a3, 10 * SIZE(AO4) LFD a5, 12 * SIZE(AO4) LFD a7, 14 * SIZE(AO4) FMSUBX y01, alpha4i, a2, y01 FMADDX y02, alpha4r, a2, y02 FMSUBX y03, alpha4i, a4, y03 FMADDX y04, alpha4r, a4, y04 STFD y01, 0 * SIZE(Y2) nop STFD y02, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y01, 0 * SIZE(Y1) nop LFD y02, 1 * SIZE(Y1) add Y1, Y1, INCY STFD y03, 0 * SIZE(Y2) nop STFD y04, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y03, 0 * SIZE(Y1) nop LFD y04, 1 * SIZE(Y1) add Y1, Y1, INCY FMSUBX y05, alpha4i, a6, y05 FMADDX y06, alpha4r, a6, y06 FMSUBX y07, alpha4i, a8, y07 FMADDX y08, alpha4r, a8, y08 LFD a2, 9 * SIZE(AO4) LFD a4, 11 * SIZE(AO4) LFD a6, 13 * SIZE(AO4) LFD a8, 15 * SIZE(AO4) addi AO4, AO4, 16 * SIZE nop DCBT(AO4, PREA) nop STFD y05, 0 * SIZE(Y2) nop STFD y06, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y05, 0 * SIZE(Y1) nop LFD y06, 1 * SIZE(Y1) add Y1, Y1, INCY STFD y07, 0 * SIZE(Y2) nop STFD y08, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y07, 0 * SIZE(Y1) nop LFD y08, 1 * SIZE(Y1) add Y1, Y1, INCY FMADD y09, alpha4r, a1, y09 FMADD y10, alpha4i, a1, y10 FMADD y11, alpha4r, a3, y11 FMADD y12, alpha4i, a3, y12 FMADD y13, alpha4r, a5, y13 FMADD y14, alpha4i, a5, y14 FMADD y15, alpha4r, a7, y15 FMADD y16, alpha4i, a7, y16 LFD a1, 0 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a5, 4 * 
SIZE(AO1) LFD a7, 6 * SIZE(AO1) FMSUBX y09, alpha4i, a2, y09 FMADDX y10, alpha4r, a2, y10 FMSUBX y11, alpha4i, a4, y11 FMADDX y12, alpha4r, a4, y12 STFD y09, 0 * SIZE(Y2) nop STFD y10, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y09, 0 * SIZE(Y1) nop LFD y10, 1 * SIZE(Y1) add Y1, Y1, INCY STFD y11, 0 * SIZE(Y2) nop STFD y12, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y11, 0 * SIZE(Y1) nop LFD y12, 1 * SIZE(Y1) add Y1, Y1, INCY FMSUBX y13, alpha4i, a6, y13 FMADDX y14, alpha4r, a6, y14 FMSUBX y15, alpha4i, a8, y15 FMADDX y16, alpha4r, a8, y16 LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) STFD y13, 0 * SIZE(Y2) nop STFD y14, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y13, 0 * SIZE(Y1) nop LFD y14, 1 * SIZE(Y1) add Y1, Y1, INCY STFD y15, 0 * SIZE(Y2) nop STFD y16, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y15, 0 * SIZE(Y1) nop LFD y16, 1 * SIZE(Y1) add Y1, Y1, INCY DCBT(Y1, PREC) bdnz LL(112) .align 4 LL(113): FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 8 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 9 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) FMADD y09, alpha1r, a1, y09 FMADD y10, alpha1i, a1, y10 FMADD y11, alpha1r, a3, y11 FMADD y12, alpha1i, a3, y12 FMADD y13, alpha1r, a5, y13 FMADD y14, alpha1i, a5, y14 FMADD y15, alpha1r, a7, y15 FMADD y16, alpha1i, a7, y16 LFD a1, 0 * SIZE(AO2) LFD a3, 2 * SIZE(AO2) LFD a5, 4 * SIZE(AO2) LFD a7, 6 * SIZE(AO2) FMSUBX y09, alpha1i, a2, y09 FMADDX y10, alpha1r, a2, y10 FMSUBX y11, alpha1i, a4, y11 FMADDX y12, alpha1r, a4, y12 FMSUBX y13, alpha1i, a6, y13 FMADDX y14, alpha1r, a6, y14 FMSUBX y15, alpha1i, a8, y15 FMADDX y16, alpha1r, a8, y16 LFD a2, 1 * SIZE(AO2) LFD a4, 3 * SIZE(AO2) LFD a6, 5 * SIZE(AO2) LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2r, a1, y01 FMADD y02, alpha2i, a1, y02 FMADD y03, alpha2r, a3, y03 FMADD y04, alpha2i, a3, y04 FMADD y05, alpha2r, a5, y05 FMADD y06, alpha2i, a5, y06 FMADD y07, alpha2r, a7, y07 FMADD y08, alpha2i, a7, y08 LFD a1, 8 * SIZE(AO2) LFD a3, 10 * SIZE(AO2) LFD a5, 12 * SIZE(AO2) LFD a7, 14 * SIZE(AO2) FMSUBX y01, alpha2i, a2, y01 FMADDX y02, alpha2r, a2, y02 FMSUBX y03, alpha2i, a4, y03 FMADDX y04, alpha2r, a4, y04 FMSUBX y05, alpha2i, a6, y05 FMADDX y06, alpha2r, a6, y06 FMSUBX y07, alpha2i, a8, y07 FMADDX y08, alpha2r, a8, y08 LFD a2, 9 * SIZE(AO2) LFD a4, 11 * SIZE(AO2) LFD a6, 13 * SIZE(AO2) LFD a8, 15 * SIZE(AO2) FMADD y09, alpha2r, a1, y09 FMADD y10, alpha2i, a1, y10 FMADD y11, alpha2r, a3, y11 FMADD y12, alpha2i, a3, y12 FMADD y13, alpha2r, a5, y13 FMADD y14, alpha2i, a5, y14 FMADD y15, alpha2r, a7, y15 FMADD y16, alpha2i, a7, y16 LFD a1, 0 * SIZE(AO3) LFD a3, 2 * SIZE(AO3) LFD a5, 4 * SIZE(AO3) LFD a7, 6 * SIZE(AO3) FMSUBX y09, alpha2i, a2, y09 FMADDX y10, alpha2r, a2, y10 FMSUBX y11, alpha2i, a4, y11 FMADDX y12, alpha2r, a4, y12 FMSUBX y13, alpha2i, a6, y13 FMADDX y14, alpha2r, a6, y14 FMSUBX y15, alpha2i, a8, y15 FMADDX y16, alpha2r, a8, y16 LFD a2, 1 * SIZE(AO3) LFD a4, 3 * SIZE(AO3) LFD a6, 5 * SIZE(AO3) LFD a8, 7 * SIZE(AO3) FMADD y01, alpha3r, a1, y01 FMADD y02, alpha3i, a1, y02 FMADD y03, alpha3r, a3, y03 
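/*  The complex update in these blocks follows the pattern                 */
/*      y_re += alpha_kr * a_re - alpha_ki * a_im                          */
/*      y_im += alpha_ki * a_re + alpha_kr * a_im                          */
/*  implemented with FMADD plus the FMSUBX/FMADDX macros, so a CONJ build  */
/*  only flips the sign applied to the imaginary part of A.                */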
FMADD y04, alpha3i, a3, y04 FMADD y05, alpha3r, a5, y05 FMADD y06, alpha3i, a5, y06 FMADD y07, alpha3r, a7, y07 FMADD y08, alpha3i, a7, y08 LFD a1, 8 * SIZE(AO3) LFD a3, 10 * SIZE(AO3) LFD a5, 12 * SIZE(AO3) LFD a7, 14 * SIZE(AO3) FMSUBX y01, alpha3i, a2, y01 FMADDX y02, alpha3r, a2, y02 FMSUBX y03, alpha3i, a4, y03 FMADDX y04, alpha3r, a4, y04 FMSUBX y05, alpha3i, a6, y05 FMADDX y06, alpha3r, a6, y06 FMSUBX y07, alpha3i, a8, y07 FMADDX y08, alpha3r, a8, y08 LFD a2, 9 * SIZE(AO3) LFD a4, 11 * SIZE(AO3) LFD a6, 13 * SIZE(AO3) LFD a8, 15 * SIZE(AO3) FMADD y09, alpha3r, a1, y09 FMADD y10, alpha3i, a1, y10 FMADD y11, alpha3r, a3, y11 FMADD y12, alpha3i, a3, y12 FMADD y13, alpha3r, a5, y13 FMADD y14, alpha3i, a5, y14 FMADD y15, alpha3r, a7, y15 FMADD y16, alpha3i, a7, y16 LFD a1, 0 * SIZE(AO4) LFD a3, 2 * SIZE(AO4) LFD a5, 4 * SIZE(AO4) LFD a7, 6 * SIZE(AO4) FMSUBX y09, alpha3i, a2, y09 FMADDX y10, alpha3r, a2, y10 FMSUBX y11, alpha3i, a4, y11 FMADDX y12, alpha3r, a4, y12 FMSUBX y13, alpha3i, a6, y13 FMADDX y14, alpha3r, a6, y14 FMSUBX y15, alpha3i, a8, y15 FMADDX y16, alpha3r, a8, y16 LFD a2, 1 * SIZE(AO4) LFD a4, 3 * SIZE(AO4) LFD a6, 5 * SIZE(AO4) LFD a8, 7 * SIZE(AO4) FMADD y01, alpha4r, a1, y01 FMADD y02, alpha4i, a1, y02 FMADD y03, alpha4r, a3, y03 FMADD y04, alpha4i, a3, y04 FMADD y05, alpha4r, a5, y05 FMADD y06, alpha4i, a5, y06 FMADD y07, alpha4r, a7, y07 FMADD y08, alpha4i, a7, y08 LFD a1, 8 * SIZE(AO4) LFD a3, 10 * SIZE(AO4) LFD a5, 12 * SIZE(AO4) LFD a7, 14 * SIZE(AO4) FMSUBX y01, alpha4i, a2, y01 FMADDX y02, alpha4r, a2, y02 FMSUBX y03, alpha4i, a4, y03 FMADDX y04, alpha4r, a4, y04 STFD y01, 0 * SIZE(Y2) nop STFD y02, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y03, 0 * SIZE(Y2) nop STFD y04, 1 * SIZE(Y2) add Y2, Y2, INCY FMSUBX y05, alpha4i, a6, y05 FMADDX y06, alpha4r, a6, y06 FMSUBX y07, alpha4i, a8, y07 FMADDX y08, alpha4r, a8, y08 LFD a2, 9 * SIZE(AO4) LFD a4, 11 * SIZE(AO4) LFD a6, 13 * SIZE(AO4) LFD a8, 15 * SIZE(AO4) STFD y05, 0 * SIZE(Y2) nop STFD y06, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y07, 0 * SIZE(Y2) nop STFD y08, 1 * SIZE(Y2) add Y2, Y2, INCY FMADD y09, alpha4r, a1, y09 FMADD y10, alpha4i, a1, y10 FMADD y11, alpha4r, a3, y11 FMADD y12, alpha4i, a3, y12 FMADD y13, alpha4r, a5, y13 FMADD y14, alpha4i, a5, y14 FMADD y15, alpha4r, a7, y15 FMADD y16, alpha4i, a7, y16 FMSUBX y09, alpha4i, a2, y09 FMADDX y10, alpha4r, a2, y10 FMSUBX y11, alpha4i, a4, y11 FMADDX y12, alpha4r, a4, y12 STFD y09, 0 * SIZE(Y2) nop STFD y10, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y11, 0 * SIZE(Y2) nop STFD y12, 1 * SIZE(Y2) add Y2, Y2, INCY FMSUBX y13, alpha4i, a6, y13 FMADDX y14, alpha4r, a6, y14 FMSUBX y15, alpha4i, a8, y15 FMADDX y16, alpha4r, a8, y16 STFD y13, 0 * SIZE(Y2) nop STFD y14, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y15, 0 * SIZE(Y2) nop STFD y16, 1 * SIZE(Y2) add Y2, Y2, INCY addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi AO3, AO3, 16 * SIZE addi AO4, AO4, 16 * SIZE .align 4 LL(115): andi. r0, M, 7 ble LL(119) andi. 
r0, M, 4 ble LL(116) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y03, 0 * SIZE(Y1) LFD y04, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a1, 0 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD y05, 0 * SIZE(Y1) LFD y06, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y07, 0 * SIZE(Y1) LFD y08, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 0 * SIZE(AO2) LFD a3, 2 * SIZE(AO2) LFD a5, 4 * SIZE(AO2) LFD a7, 6 * SIZE(AO2) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 1 * SIZE(AO2) LFD a4, 3 * SIZE(AO2) LFD a6, 5 * SIZE(AO2) LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2r, a1, y01 FMADD y02, alpha2i, a1, y02 FMADD y03, alpha2r, a3, y03 FMADD y04, alpha2i, a3, y04 FMADD y05, alpha2r, a5, y05 FMADD y06, alpha2i, a5, y06 FMADD y07, alpha2r, a7, y07 FMADD y08, alpha2i, a7, y08 LFD a1, 0 * SIZE(AO3) LFD a3, 2 * SIZE(AO3) LFD a5, 4 * SIZE(AO3) LFD a7, 6 * SIZE(AO3) FMSUBX y01, alpha2i, a2, y01 FMADDX y02, alpha2r, a2, y02 FMSUBX y03, alpha2i, a4, y03 FMADDX y04, alpha2r, a4, y04 FMSUBX y05, alpha2i, a6, y05 FMADDX y06, alpha2r, a6, y06 FMSUBX y07, alpha2i, a8, y07 FMADDX y08, alpha2r, a8, y08 LFD a2, 1 * SIZE(AO3) LFD a4, 3 * SIZE(AO3) LFD a6, 5 * SIZE(AO3) LFD a8, 7 * SIZE(AO3) FMADD y01, alpha3r, a1, y01 FMADD y02, alpha3i, a1, y02 FMADD y03, alpha3r, a3, y03 FMADD y04, alpha3i, a3, y04 FMADD y05, alpha3r, a5, y05 FMADD y06, alpha3i, a5, y06 FMADD y07, alpha3r, a7, y07 FMADD y08, alpha3i, a7, y08 LFD a1, 0 * SIZE(AO4) LFD a3, 2 * SIZE(AO4) LFD a5, 4 * SIZE(AO4) LFD a7, 6 * SIZE(AO4) FMSUBX y01, alpha3i, a2, y01 FMADDX y02, alpha3r, a2, y02 FMSUBX y03, alpha3i, a4, y03 FMADDX y04, alpha3r, a4, y04 FMSUBX y05, alpha3i, a6, y05 FMADDX y06, alpha3r, a6, y06 FMSUBX y07, alpha3i, a8, y07 FMADDX y08, alpha3r, a8, y08 LFD a2, 1 * SIZE(AO4) LFD a4, 3 * SIZE(AO4) LFD a6, 5 * SIZE(AO4) LFD a8, 7 * SIZE(AO4) FMADD y01, alpha4r, a1, y01 FMADD y02, alpha4i, a1, y02 FMADD y03, alpha4r, a3, y03 FMADD y04, alpha4i, a3, y04 FMADD y05, alpha4r, a5, y05 FMADD y06, alpha4i, a5, y06 FMADD y07, alpha4r, a7, y07 FMADD y08, alpha4i, a7, y08 FMSUBX y01, alpha4i, a2, y01 FMADDX y02, alpha4r, a2, y02 FMSUBX y03, alpha4i, a4, y03 FMADDX y04, alpha4r, a4, y04 FMSUBX y05, alpha4i, a6, y05 FMADDX y06, alpha4r, a6, y06 FMSUBX y07, alpha4i, a8, y07 FMADDX y08, alpha4r, a8, y08 STFD y01, 0 * SIZE(Y2) addi AO1, AO1, 8 * SIZE STFD y02, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y03, 0 * SIZE(Y2) addi AO2, AO2, 8 * SIZE STFD y04, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y05, 0 * SIZE(Y2) addi AO3, AO3, 8 * SIZE STFD y06, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y07, 0 * SIZE(Y2) addi AO4, AO4, 8 * SIZE STFD y08, 1 * SIZE(Y2) add Y2, Y2, INCY .align 4 LL(116): andi. 
r0, M, 2 ble LL(117) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y03, 0 * SIZE(Y1) LFD y04, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a5, 0 * SIZE(AO2) LFD a6, 1 * SIZE(AO2) LFD a7, 2 * SIZE(AO2) LFD a8, 3 * SIZE(AO2) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 LFD a1, 0 * SIZE(AO3) LFD a2, 1 * SIZE(AO3) LFD a3, 2 * SIZE(AO3) LFD a4, 3 * SIZE(AO3) FMADD y01, alpha2r, a5, y01 FMADD y02, alpha2i, a5, y02 FMADD y03, alpha2r, a7, y03 FMADD y04, alpha2i, a7, y04 FMSUBX y01, alpha2i, a6, y01 FMADDX y02, alpha2r, a6, y02 FMSUBX y03, alpha2i, a8, y03 FMADDX y04, alpha2r, a8, y04 LFD a5, 0 * SIZE(AO4) LFD a6, 1 * SIZE(AO4) LFD a7, 2 * SIZE(AO4) LFD a8, 3 * SIZE(AO4) FMADD y01, alpha3r, a1, y01 FMADD y02, alpha3i, a1, y02 FMADD y03, alpha3r, a3, y03 FMADD y04, alpha3i, a3, y04 FMSUBX y01, alpha3i, a2, y01 FMADDX y02, alpha3r, a2, y02 FMSUBX y03, alpha3i, a4, y03 FMADDX y04, alpha3r, a4, y04 FMADD y01, alpha4r, a5, y01 FMADD y02, alpha4i, a5, y02 FMADD y03, alpha4r, a7, y03 FMADD y04, alpha4i, a7, y04 FMSUBX y01, alpha4i, a6, y01 FMADDX y02, alpha4r, a6, y02 FMSUBX y03, alpha4i, a8, y03 FMADDX y04, alpha4r, a8, y04 STFD y01, 0 * SIZE(Y2) addi AO1, AO1, 4 * SIZE STFD y02, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y03, 0 * SIZE(Y2) addi AO2, AO2, 4 * SIZE STFD y04, 1 * SIZE(Y2) add Y2, Y2, INCY addi AO3, AO3, 4 * SIZE addi AO4, AO4, 4 * SIZE .align 4 LL(117): andi. r0, M, 1 ble LL(119) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 0 * SIZE(AO2) LFD a4, 1 * SIZE(AO2) LFD a5, 0 * SIZE(AO3) LFD a6, 1 * SIZE(AO3) LFD a7, 0 * SIZE(AO4) LFD a8, 1 * SIZE(AO4) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMADD y01, alpha2r, a3, y01 FMADD y02, alpha2i, a3, y02 FMSUBX y01, alpha2i, a4, y01 FMADDX y02, alpha2r, a4, y02 FMADD y01, alpha3r, a5, y01 FMADD y02, alpha3i, a5, y02 FMSUBX y01, alpha3i, a6, y01 FMADDX y02, alpha3r, a6, y02 FMADD y01, alpha4r, a7, y01 FMADD y02, alpha4i, a7, y02 FMSUBX y01, alpha4i, a8, y01 FMADDX y02, alpha4r, a8, y02 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) add Y2, Y2, INCY .align 4 LL(119): addi J, J, -1 cmpi cr0, 0, J, 0 bgt LL(111) .align 4 LL(120): andi. J, N, 2 ble LL(130) .align 4 LL(121): lfd alpha_r, ALPHA_R lfd alpha_i, ALPHA_I LFD a1, 0 * SIZE(X) LFD a2, 1 * SIZE(X) add X, X, INCX LFD a3, 0 * SIZE(X) LFD a4, 1 * SIZE(X) add X, X, INCX FMUL alpha1r, alpha_r, a1 FMUL alpha1i, alpha_i, a1 FMUL alpha2r, alpha_r, a3 FMUL alpha2i, alpha_i, a3 FMSUBR alpha1r, alpha_i, a2, alpha1r FMADDR alpha1i, alpha_r, a2, alpha1i FMSUBR alpha2r, alpha_i, a4, alpha2r FMADDR alpha2i, alpha_r, a4, alpha2i mr AO1, A add AO2, A, LDA add A, AO2, LDA mr Y1, Y mr Y2, Y srawi. 
r0, M, 3 mtspr CTR, r0 ble LL(125) .align 4 LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y03, 0 * SIZE(Y1) LFD y04, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a1, 0 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD y05, 0 * SIZE(Y1) LFD y06, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y07, 0 * SIZE(Y1) LFD y08, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) LFD y09, 0 * SIZE(Y1) LFD y10, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y11, 0 * SIZE(Y1) LFD y12, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y13, 0 * SIZE(Y1) LFD y14, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y15, 0 * SIZE(Y1) LFD y16, 1 * SIZE(Y1) add Y1, Y1, INCY bdz LL(123) .align 4 LL(122): FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 8 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 9 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) addi AO1, AO1, 16 * SIZE nop DCBT(AO1, PREA) nop FMADD y09, alpha1r, a1, y09 FMADD y10, alpha1i, a1, y10 FMADD y11, alpha1r, a3, y11 FMADD y12, alpha1i, a3, y12 FMADD y13, alpha1r, a5, y13 FMADD y14, alpha1i, a5, y14 FMADD y15, alpha1r, a7, y15 FMADD y16, alpha1i, a7, y16 LFD a1, 0 * SIZE(AO2) LFD a3, 2 * SIZE(AO2) LFD a5, 4 * SIZE(AO2) LFD a7, 6 * SIZE(AO2) FMSUBX y09, alpha1i, a2, y09 FMADDX y10, alpha1r, a2, y10 FMSUBX y11, alpha1i, a4, y11 FMADDX y12, alpha1r, a4, y12 FMSUBX y13, alpha1i, a6, y13 FMADDX y14, alpha1r, a6, y14 FMSUBX y15, alpha1i, a8, y15 FMADDX y16, alpha1r, a8, y16 LFD a2, 1 * SIZE(AO2) LFD a4, 3 * SIZE(AO2) LFD a6, 5 * SIZE(AO2) LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2r, a1, y01 FMADD y02, alpha2i, a1, y02 FMADD y03, alpha2r, a3, y03 FMADD y04, alpha2i, a3, y04 FMADD y05, alpha2r, a5, y05 FMADD y06, alpha2i, a5, y06 FMADD y07, alpha2r, a7, y07 FMADD y08, alpha2i, a7, y08 LFD a1, 8 * SIZE(AO2) LFD a3, 10 * SIZE(AO2) LFD a5, 12 * SIZE(AO2) LFD a7, 14 * SIZE(AO2) FMSUBX y01, alpha2i, a2, y01 FMADDX y02, alpha2r, a2, y02 FMSUBX y03, alpha2i, a4, y03 FMADDX y04, alpha2r, a4, y04 STFD y01, 0 * SIZE(Y2) nop STFD y02, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y01, 0 * SIZE(Y1) nop LFD y02, 1 * SIZE(Y1) add Y1, Y1, INCY STFD y03, 0 * SIZE(Y2) nop STFD y04, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y03, 0 * SIZE(Y1) nop LFD y04, 1 * SIZE(Y1) add Y1, Y1, INCY FMSUBX y05, alpha2i, a6, y05 FMADDX y06, alpha2r, a6, y06 FMSUBX y07, alpha2i, a8, y07 FMADDX y08, alpha2r, a8, y08 LFD a2, 9 * SIZE(AO2) LFD a4, 11 * SIZE(AO2) LFD a6, 13 * SIZE(AO2) LFD a8, 15 * SIZE(AO2) addi AO2, AO2, 16 * SIZE nop DCBT(AO2, PREA) nop STFD y05, 0 * SIZE(Y2) nop STFD y06, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y05, 0 * SIZE(Y1) nop LFD y06, 1 * SIZE(Y1) add Y1, Y1, INCY STFD y07, 0 * SIZE(Y2) nop STFD y08, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y07, 0 * SIZE(Y1) nop LFD y08, 1 * SIZE(Y1) add Y1, Y1, INCY FMADD y09, alpha2r, a1, y09 FMADD y10, alpha2i, a1, y10 FMADD y11, alpha2r, a3, y11 FMADD y12, alpha2i, a3, y12 FMADD y13, alpha2r, a5, y13 FMADD y14, alpha2i, a5, y14 FMADD y15, alpha2r, a7, y15 FMADD y16, alpha2i, a7, y16 LFD a1, 0 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a5, 4 * 
SIZE(AO1) LFD a7, 6 * SIZE(AO1) FMSUBX y09, alpha2i, a2, y09 FMADDX y10, alpha2r, a2, y10 FMSUBX y11, alpha2i, a4, y11 FMADDX y12, alpha2r, a4, y12 STFD y09, 0 * SIZE(Y2) nop STFD y10, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y09, 0 * SIZE(Y1) nop LFD y10, 1 * SIZE(Y1) add Y1, Y1, INCY STFD y11, 0 * SIZE(Y2) nop STFD y12, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y11, 0 * SIZE(Y1) nop LFD y12, 1 * SIZE(Y1) add Y1, Y1, INCY FMSUBX y13, alpha2i, a6, y13 FMADDX y14, alpha2r, a6, y14 FMSUBX y15, alpha2i, a8, y15 FMADDX y16, alpha2r, a8, y16 LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) STFD y13, 0 * SIZE(Y2) nop STFD y14, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y15, 0 * SIZE(Y2) nop STFD y16, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y13, 0 * SIZE(Y1) nop LFD y14, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y15, 0 * SIZE(Y1) nop LFD y16, 1 * SIZE(Y1) add Y1, Y1, INCY DCBT(Y1, PREC) bdnz LL(122) .align 4 LL(123): FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 8 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 9 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) FMADD y09, alpha1r, a1, y09 FMADD y10, alpha1i, a1, y10 FMADD y11, alpha1r, a3, y11 FMADD y12, alpha1i, a3, y12 FMADD y13, alpha1r, a5, y13 FMADD y14, alpha1i, a5, y14 FMADD y15, alpha1r, a7, y15 FMADD y16, alpha1i, a7, y16 LFD a1, 0 * SIZE(AO2) LFD a3, 2 * SIZE(AO2) LFD a5, 4 * SIZE(AO2) LFD a7, 6 * SIZE(AO2) FMSUBX y09, alpha1i, a2, y09 FMADDX y10, alpha1r, a2, y10 FMSUBX y11, alpha1i, a4, y11 FMADDX y12, alpha1r, a4, y12 FMSUBX y13, alpha1i, a6, y13 FMADDX y14, alpha1r, a6, y14 FMSUBX y15, alpha1i, a8, y15 FMADDX y16, alpha1r, a8, y16 LFD a2, 1 * SIZE(AO2) LFD a4, 3 * SIZE(AO2) LFD a6, 5 * SIZE(AO2) LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2r, a1, y01 FMADD y02, alpha2i, a1, y02 FMADD y03, alpha2r, a3, y03 FMADD y04, alpha2i, a3, y04 FMADD y05, alpha2r, a5, y05 FMADD y06, alpha2i, a5, y06 FMADD y07, alpha2r, a7, y07 FMADD y08, alpha2i, a7, y08 LFD a1, 8 * SIZE(AO2) LFD a3, 10 * SIZE(AO2) LFD a5, 12 * SIZE(AO2) LFD a7, 14 * SIZE(AO2) FMSUBX y01, alpha2i, a2, y01 FMADDX y02, alpha2r, a2, y02 FMSUBX y03, alpha2i, a4, y03 FMADDX y04, alpha2r, a4, y04 STFD y01, 0 * SIZE(Y2) addi AO1, AO1, 16 * SIZE STFD y02, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y03, 0 * SIZE(Y2) nop STFD y04, 1 * SIZE(Y2) add Y2, Y2, INCY FMSUBX y05, alpha2i, a6, y05 FMADDX y06, alpha2r, a6, y06 FMSUBX y07, alpha2i, a8, y07 FMADDX y08, alpha2r, a8, y08 LFD a2, 9 * SIZE(AO2) LFD a4, 11 * SIZE(AO2) LFD a6, 13 * SIZE(AO2) LFD a8, 15 * SIZE(AO2) STFD y05, 0 * SIZE(Y2) addi AO2, AO2, 16 * SIZE STFD y06, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y07, 0 * SIZE(Y2) nop STFD y08, 1 * SIZE(Y2) add Y2, Y2, INCY FMADD y09, alpha2r, a1, y09 FMADD y10, alpha2i, a1, y10 FMADD y11, alpha2r, a3, y11 FMADD y12, alpha2i, a3, y12 FMADD y13, alpha2r, a5, y13 FMADD y14, alpha2i, a5, y14 FMADD y15, alpha2r, a7, y15 FMADD y16, alpha2i, a7, y16 FMSUBX y09, alpha2i, a2, y09 FMADDX y10, alpha2r, a2, y10 FMSUBX y11, alpha2i, a4, y11 FMADDX y12, alpha2r, a4, y12 STFD y09, 0 * SIZE(Y2) nop STFD y10, 1 * SIZE(Y2) add Y2, Y2, INCY 
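/*  Strided-Y path: each y element is stored as its real part at           */
/*  0 * SIZE(Y2) and its imaginary part at 1 * SIZE(Y2), after which Y2    */
/*  is advanced by INCY to the next output element.                        */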
STFD y11, 0 * SIZE(Y2) nop STFD y12, 1 * SIZE(Y2) add Y2, Y2, INCY FMSUBX y13, alpha2i, a6, y13 FMADDX y14, alpha2r, a6, y14 FMSUBX y15, alpha2i, a8, y15 FMADDX y16, alpha2r, a8, y16 STFD y13, 0 * SIZE(Y2) nop STFD y14, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y15, 0 * SIZE(Y2) nop STFD y16, 1 * SIZE(Y2) add Y2, Y2, INCY .align 4 LL(125): andi. r0, M, 7 ble LL(130) andi. r0, M, 4 ble LL(126) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y03, 0 * SIZE(Y1) LFD y04, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a1, 0 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD y05, 0 * SIZE(Y1) LFD y06, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y07, 0 * SIZE(Y1) LFD y08, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 0 * SIZE(AO2) LFD a3, 2 * SIZE(AO2) LFD a5, 4 * SIZE(AO2) LFD a7, 6 * SIZE(AO2) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 1 * SIZE(AO2) LFD a4, 3 * SIZE(AO2) LFD a6, 5 * SIZE(AO2) LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2r, a1, y01 FMADD y02, alpha2i, a1, y02 FMADD y03, alpha2r, a3, y03 FMADD y04, alpha2i, a3, y04 FMADD y05, alpha2r, a5, y05 FMADD y06, alpha2i, a5, y06 FMADD y07, alpha2r, a7, y07 FMADD y08, alpha2i, a7, y08 FMSUBX y01, alpha2i, a2, y01 FMADDX y02, alpha2r, a2, y02 FMSUBX y03, alpha2i, a4, y03 FMADDX y04, alpha2r, a4, y04 STFD y01, 0 * SIZE(Y2) addi AO1, AO1, 8 * SIZE STFD y02, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y03, 0 * SIZE(Y2) addi AO2, AO2, 8 * SIZE STFD y04, 1 * SIZE(Y2) add Y2, Y2, INCY FMSUBX y05, alpha2i, a6, y05 FMADDX y06, alpha2r, a6, y06 FMSUBX y07, alpha2i, a8, y07 FMADDX y08, alpha2r, a8, y08 STFD y05, 0 * SIZE(Y2) nop STFD y06, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y07, 0 * SIZE(Y2) nop STFD y08, 1 * SIZE(Y2) add Y2, Y2, INCY .align 4 LL(126): andi. r0, M, 2 ble LL(127) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y03, 0 * SIZE(Y1) LFD y04, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a5, 0 * SIZE(AO2) LFD a6, 1 * SIZE(AO2) LFD a7, 2 * SIZE(AO2) LFD a8, 3 * SIZE(AO2) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMADD y01, alpha2r, a5, y01 FMADD y02, alpha2i, a5, y02 FMADD y03, alpha2r, a7, y03 FMADD y04, alpha2i, a7, y04 FMSUBX y01, alpha2i, a6, y01 FMADDX y02, alpha2r, a6, y02 FMSUBX y03, alpha2i, a8, y03 FMADDX y04, alpha2r, a8, y04 STFD y01, 0 * SIZE(Y2) addi AO1, AO1, 4 * SIZE STFD y02, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y03, 0 * SIZE(Y2) addi AO2, AO2, 4 * SIZE STFD y04, 1 * SIZE(Y2) add Y2, Y2, INCY .align 4 LL(127): andi. 
r0, M, 1 ble LL(130) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 0 * SIZE(AO2) LFD a4, 1 * SIZE(AO2) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMADD y01, alpha2r, a3, y01 FMADD y02, alpha2i, a3, y02 FMSUBX y01, alpha2i, a4, y01 FMADDX y02, alpha2r, a4, y02 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) add Y2, Y2, INCY .align 4 LL(130): andi. J, N, 1 ble LL(999) .align 4 LL(131): lfd alpha_r, ALPHA_R lfd alpha_i, ALPHA_I LFD a1, 0 * SIZE(X) LFD a2, 1 * SIZE(X) add X, X, INCX FMUL alpha1r, alpha_r, a1 FMUL alpha1i, alpha_i, a1 FMSUBR alpha1r, alpha_i, a2, alpha1r FMADDR alpha1i, alpha_r, a2, alpha1i mr AO1, A add A, AO1, LDA mr Y1, Y mr Y2, Y srawi. r0, M, 3 mtspr CTR, r0 ble LL(135) .align 4 LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y03, 0 * SIZE(Y1) LFD y04, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a1, 0 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD y05, 0 * SIZE(Y1) LFD y06, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y07, 0 * SIZE(Y1) LFD y08, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) LFD y09, 0 * SIZE(Y1) LFD y10, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y11, 0 * SIZE(Y1) LFD y12, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y13, 0 * SIZE(Y1) LFD y14, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y15, 0 * SIZE(Y1) LFD y16, 1 * SIZE(Y1) add Y1, Y1, INCY bdz LL(133) .align 4 LL(132): FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 8 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 STFD y01, 0 * SIZE(Y2) nop STFD y02, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y01, 0 * SIZE(Y1) nop LFD y02, 1 * SIZE(Y1) add Y1, Y1, INCY STFD y03, 0 * SIZE(Y2) nop STFD y04, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y03, 0 * SIZE(Y1) nop LFD y04, 1 * SIZE(Y1) add Y1, Y1, INCY FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 9 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) addi AO1, AO1, 16 * SIZE nop DCBT(AO1, PREA) nop STFD y05, 0 * SIZE(Y2) nop STFD y06, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y05, 0 * SIZE(Y1) nop LFD y06, 1 * SIZE(Y1) add Y1, Y1, INCY STFD y07, 0 * SIZE(Y2) nop STFD y08, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y07, 0 * SIZE(Y1) nop LFD y08, 1 * SIZE(Y1) add Y1, Y1, INCY FMADD y09, alpha1r, a1, y09 FMADD y10, alpha1i, a1, y10 FMADD y11, alpha1r, a3, y11 FMADD y12, alpha1i, a3, y12 FMADD y13, alpha1r, a5, y13 FMADD y14, alpha1i, a5, y14 FMADD y15, alpha1r, a7, y15 FMADD y16, alpha1i, a7, y16 LFD a1, 0 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) FMSUBX y09, alpha1i, a2, y09 FMADDX y10, alpha1r, a2, y10 FMSUBX y11, alpha1i, a4, y11 FMADDX y12, alpha1r, a4, y12 STFD y09, 0 * SIZE(Y2) nop STFD y10, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y09, 0 * SIZE(Y1) nop LFD y10, 1 * SIZE(Y1) add Y1, Y1, INCY STFD y11, 0 * SIZE(Y2) nop STFD y12, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y11, 0 * SIZE(Y1) nop LFD y12, 1 * SIZE(Y1) add Y1, Y1, INCY FMSUBX y13, alpha1i, a6, y13 FMADDX y14, alpha1r, a6, y14 FMSUBX y15, alpha1i, a8, y15 FMADDX y16, alpha1r, a8, 
y16 LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) STFD y13, 0 * SIZE(Y2) nop STFD y14, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y15, 0 * SIZE(Y2) nop STFD y16, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y13, 0 * SIZE(Y1) nop LFD y14, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y15, 0 * SIZE(Y1) nop LFD y16, 1 * SIZE(Y1) add Y1, Y1, INCY DCBT(Y1, PREC) bdnz LL(132) .align 4 LL(133): FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 8 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 9 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) FMADD y09, alpha1r, a1, y09 FMADD y10, alpha1i, a1, y10 FMADD y11, alpha1r, a3, y11 FMADD y12, alpha1i, a3, y12 FMADD y13, alpha1r, a5, y13 FMADD y14, alpha1i, a5, y14 FMADD y15, alpha1r, a7, y15 FMADD y16, alpha1i, a7, y16 FMSUBX y09, alpha1i, a2, y09 FMADDX y10, alpha1r, a2, y10 FMSUBX y11, alpha1i, a4, y11 FMADDX y12, alpha1r, a4, y12 FMSUBX y13, alpha1i, a6, y13 FMADDX y14, alpha1r, a6, y14 FMSUBX y15, alpha1i, a8, y15 FMADDX y16, alpha1r, a8, y16 STFD y01, 0 * SIZE(Y2) addi AO1, AO1, 16 * SIZE STFD y02, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y03, 0 * SIZE(Y2) nop STFD y04, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y05, 0 * SIZE(Y2) nop STFD y06, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y07, 0 * SIZE(Y2) nop STFD y08, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y09, 0 * SIZE(Y2) nop STFD y10, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y11, 0 * SIZE(Y2) nop STFD y12, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y13, 0 * SIZE(Y2) nop STFD y14, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y15, 0 * SIZE(Y2) nop STFD y16, 1 * SIZE(Y2) add Y2, Y2, INCY .align 4 LL(135): andi. r0, M, 7 ble LL(999) andi. r0, M, 4 ble LL(136) LFD y01, 0 * SIZE(Y1) nop LFD y02, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y03, 0 * SIZE(Y1) nop LFD y04, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y05, 0 * SIZE(Y1) nop LFD y06, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y07, 0 * SIZE(Y1) nop LFD y08, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a1, 0 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 STFD y01, 0 * SIZE(Y2) addi AO1, AO1, 8 * SIZE STFD y02, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y03, 0 * SIZE(Y2) nop STFD y04, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y05, 0 * SIZE(Y2) nop STFD y06, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y07, 0 * SIZE(Y2) nop STFD y08, 1 * SIZE(Y2) add Y2, Y2, INCY .align 4 LL(136): andi. 
r0, M, 2 ble LL(137) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD y01, 0 * SIZE(Y1) nop LFD y02, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y03, 0 * SIZE(Y1) nop LFD y04, 1 * SIZE(Y1) add Y1, Y1, INCY FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 STFD y01, 0 * SIZE(Y2) addi AO1, AO1, 4 * SIZE STFD y02, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y03, 0 * SIZE(Y2) nop STFD y04, 1 * SIZE(Y2) add Y2, Y2, INCY .align 4 LL(137): andi. r0, M, 1 ble LL(999) LFD y01, 0 * SIZE(Y1) nop LFD y02, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 STFD y01, 0 * SIZE(Y2) nop STFD y02, 1 * SIZE(Y2) add Y2, Y2, INCY .align 4 LL(999): li r3, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r14, 144(SP) ld r15, 152(SP) ld r16, 160(SP) ld r17, 168(SP) ld r18, 176(SP) ld r19, 184(SP) ld r20, 192(SP) ld r21, 200(SP) ld r22, 208(SP) #else lwz r14, 144(SP) lwz r15, 148(SP) lwz r16, 152(SP) lwz r17, 156(SP) lwz r18, 160(SP) lwz r19, 164(SP) lwz r20, 168(SP) lwz r21, 172(SP) lwz r22, 176(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/zgemv_n_ppc440.S000066400000000000000000000712321313527062700204500ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 #define A r6 #define LDA r7 #define X r8 #define INCX r9 #define Y r10 #define INCY r5 #else #define M r3 #define N r4 #define A r8 #define LDA r9 #define X r10 #define INCX r5 #define Y r6 #define INCY r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define M r3 #define N r4 #define A r10 #define LDA r5 #define X r6 #define INCX r7 #define Y r8 #define INCY r9 #else #define M r3 #define N r4 #define A r8 #define LDA r9 #define X r10 #define INCX r5 #define Y r6 #define INCY r7 #endif #endif #define I r11 #define J r12 #define AO1 r14 #define AO2 r15 #define AO3 r16 #define AO4 r17 #define Y1 r18 #define Y2 r19 #define PREA r20 #define YY r21 #define BUFFER r22 #define y01 f0 #define y02 f1 #define y03 f2 #define y04 f3 #define y05 f4 #define y06 f5 #define y07 f6 #define y08 f7 #define y09 f8 #define y10 f9 #define y11 f10 #define y12 f11 #define y13 f12 #define y14 f13 #define y15 f14 #define y16 f15 #define alpha1r f16 #define alpha1i f17 #define alpha2r f18 #define alpha2i f19 #define alpha3r f20 #define alpha3i f21 #define alpha4r f22 #define alpha4i f23 #define a1 f24 #define a2 f25 #define a3 f26 #define a4 f27 #define a5 f28 #define a6 f29 #define a7 f30 #define a8 f31 #define alpha_r f14 #define alpha_i f15 #if defined(PPCG4) #define PREFETCHSIZE_A (3 * 4) #endif #if defined(POWER6) #define PREFETCHSIZE_A (3 * 4) #endif #ifndef XCONJ #define FMADDR FMADD #define FMSUBR FNMSUB #else #define FMADDR FNMSUB #define FMSUBR FMADD #endif #ifndef CONJ #define FMADDX FMADD #define FMSUBX FNMSUB #else #define FMADDX FNMSUB #define FMSUBX FMADD #endif #ifndef NEEDPARAM #ifndef __64BIT__ #define STACKSIZE 232 #define ALPHA_R 208(SP) #define ALPHA_I 216(SP) #define FZERO 224(SP) #else #define STACKSIZE 280 #define ALPHA_R 256(SP) #define ALPHA_I 264(SP) #define FZERO 272(SP) #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r0, FZERO std r14, 144(SP) std r15, 152(SP) std r16, 160(SP) std r17, 168(SP) std r18, 176(SP) std r19, 184(SP) std r20, 192(SP) std r21, 200(SP) std r22, 208(SP) #else stw r0, 0 + FZERO stw r0, 4 + FZERO stw r14, 144(SP) stw r15, 148(SP) stw r16, 152(SP) stw r17, 156(SP) stw r18, 160(SP) stw r19, 164(SP) stw r20, 168(SP) stw r21, 172(SP) stw r22, 176(SP) #endif #ifdef linux #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #else ld INCX, FRAMESLOT(0) + STACKSIZE(SP) ld Y, FRAMESLOT(1) + STACKSIZE(SP) ld INCY, FRAMESLOT(2) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) lwz X, FRAMESLOT(1) + STACKSIZE(SP) lwz INCX, FRAMESLOT(2) + STACKSIZE(SP) lwz Y, FRAMESLOT(3) + STACKSIZE(SP) lwz INCY, 
FRAMESLOT(4) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(5) + STACKSIZE(SP) #else lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) lwz Y, FRAMESLOT(1) + STACKSIZE(SP) lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #else ld INCX, FRAMESLOT(0) + STACKSIZE(SP) ld Y, FRAMESLOT(1) + STACKSIZE(SP) ld INCY, FRAMESLOT(2) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #endif stfd f1, ALPHA_R stfd f2, ALPHA_I slwi LDA, LDA, ZBASE_SHIFT slwi INCX, INCX, ZBASE_SHIFT slwi INCY, INCY, ZBASE_SHIFT addi INCX, INCX, -SIZE addi INCY, INCY, -SIZE addi A, A, -SIZE cmpwi cr0, M, 0 ble- LL(999) sub X, X, INCX cmpwi cr0, N, 0 sub Y, Y, INCY ble- LL(999) li PREA, PREFETCHSIZE_A * SIZE mr YY, Y lfd f0, FZERO cmpi cr0, 0, INCY, SIZE beq LL(10) addi YY, BUFFER, -SIZE addi Y1, BUFFER, -SIZE addi r0, M, 3 srawi. r0, r0, 2 mtspr CTR, r0 .align 4 LL(02): STFDU f0, 1 * SIZE(Y1) STFDU f0, 1 * SIZE(Y1) STFDU f0, 1 * SIZE(Y1) STFDU f0, 1 * SIZE(Y1) STFDU f0, 1 * SIZE(Y1) STFDU f0, 1 * SIZE(Y1) STFDU f0, 1 * SIZE(Y1) STFDU f0, 1 * SIZE(Y1) bdnz LL(02) .align 4 LL(10): srawi. J, N, 2 ble LL(20) .align 4 LL(11): lfd alpha_r, ALPHA_R lfd alpha_i, ALPHA_I LFDUX a1, X, INCX LFDU a2, 1 * SIZE(X) LFDUX a3, X, INCX LFDU a4, 1 * SIZE(X) LFDUX a5, X, INCX LFDU a6, 1 * SIZE(X) LFDUX a7, X, INCX LFDU a8, 1 * SIZE(X) FMUL alpha1r, alpha_r, a1 FMUL alpha1i, alpha_i, a1 FMUL alpha2r, alpha_r, a3 FMUL alpha2i, alpha_i, a3 FMUL alpha3r, alpha_r, a5 mr Y1, YY FMUL alpha3i, alpha_i, a5 mr Y2, YY FMUL alpha4r, alpha_r, a7 mr AO1, A FMUL alpha4i, alpha_i, a7 add AO2, A, LDA FMSUBR alpha1r, alpha_i, a2, alpha1r add AO3, AO2, LDA FMADDR alpha1i, alpha_r, a2, alpha1i add AO4, AO3, LDA FMSUBR alpha2r, alpha_i, a4, alpha2r add A, AO4, LDA FMADDR alpha2i, alpha_r, a4, alpha2i FMSUBR alpha3r, alpha_i, a6, alpha3r srawi. 
r0, M, 2 FMADDR alpha3i, alpha_r, a6, alpha3i FMSUBR alpha4r, alpha_i, a8, alpha4r mtspr CTR, r0 FMADDR alpha4i, alpha_r, a8, alpha4i ble LL(15) .align 4 LFDU a1, 1 * SIZE(AO1) LFDU y01, 1 * SIZE(Y1) LFDU a2, 1 * SIZE(AO1) LFDU y02, 1 * SIZE(Y1) LFDU a3, 1 * SIZE(AO1) LFDU y03, 1 * SIZE(Y1) LFDU a4, 1 * SIZE(AO1) LFDU y04, 1 * SIZE(Y1) LFDU a5, 1 * SIZE(AO1) LFDU y05, 1 * SIZE(Y1) LFDU a6, 1 * SIZE(AO1) LFDU y06, 1 * SIZE(Y1) LFDU a7, 1 * SIZE(AO1) LFDU y07, 1 * SIZE(Y1) LFDU a8, 1 * SIZE(AO1) LFDU y08, 1 * SIZE(Y1) FMADD y09, alpha1r, a1, y01 FMADD y10, alpha1i, a1, y02 FMADD y11, alpha1r, a3, y03 FMADD y12, alpha1i, a3, y04 FMADD y13, alpha1r, a5, y05 FMADD y14, alpha1i, a5, y06 FMADD y15, alpha1r, a7, y07 FMADD y16, alpha1i, a7, y08 bdz LL(13) .align 4 LL(12): FMSUBX y09, alpha1i, a2, y09 LFDU a1, 1 * SIZE(AO2) FMADDX y10, alpha1r, a2, y10 LFDU a2, 1 * SIZE(AO2) FMSUBX y11, alpha1i, a4, y11 LFDU a3, 1 * SIZE(AO2) FMADDX y12, alpha1r, a4, y12 LFDU a4, 1 * SIZE(AO2) #ifdef PPCG4 dcbt AO2, PREA #endif FMSUBX y13, alpha1i, a6, y13 LFDU a5, 1 * SIZE(AO2) FMADDX y14, alpha1r, a6, y14 LFDU a6, 1 * SIZE(AO2) FMSUBX y15, alpha1i, a8, y15 LFDU a7, 1 * SIZE(AO2) FMADDX y16, alpha1r, a8, y16 LFDU a8, 1 * SIZE(AO2) #if defined(PPCG4) && defined(DOUBLE) dcbt AO2, PREA #endif FMADD y09, alpha2r, a1, y09 LFDU y01, 1 * SIZE(Y1) FMADD y10, alpha2i, a1, y10 LFDU y02, 1 * SIZE(Y1) FMADD y11, alpha2r, a3, y11 LFDU y03, 1 * SIZE(Y1) FMADD y12, alpha2i, a3, y12 LFDU y04, 1 * SIZE(Y1) #ifdef PPCG4 dcbtst Y1, PREA #endif FMADD y13, alpha2r, a5, y13 FMADD y14, alpha2i, a5, y14 FMADD y15, alpha2r, a7, y15 FMADD y16, alpha2i, a7, y16 FMSUBX y09, alpha2i, a2, y09 LFDU a1, 1 * SIZE(AO3) FMADDX y10, alpha2r, a2, y10 LFDU a2, 1 * SIZE(AO3) FMSUBX y11, alpha2i, a4, y11 LFDU a3, 1 * SIZE(AO3) FMADDX y12, alpha2r, a4, y12 LFDU a4, 1 * SIZE(AO3) #ifdef PPCG4 dcbt AO3, PREA #endif FMSUBX y13, alpha2i, a6, y13 LFDU a5, 1 * SIZE(AO3) FMADDX y14, alpha2r, a6, y14 LFDU a6, 1 * SIZE(AO3) FMSUBX y15, alpha2i, a8, y15 LFDU a7, 1 * SIZE(AO3) FMADDX y16, alpha2r, a8, y16 LFDU a8, 1 * SIZE(AO3) #if defined(PPCG4) && defined(DOUBLE) dcbt AO3, PREA #endif FMADD y09, alpha3r, a1, y09 LFDU y05, 1 * SIZE(Y1) FMADD y10, alpha3i, a1, y10 LFDU y06, 1 * SIZE(Y1) FMADD y11, alpha3r, a3, y11 LFDU y07, 1 * SIZE(Y1) FMADD y12, alpha3i, a3, y12 LFDU y08, 1 * SIZE(Y1) #if defined(PPCG4) && defined(DOUBLE) dcbtst Y1, PREA #endif FMADD y13, alpha3r, a5, y13 FMADD y14, alpha3i, a5, y14 FMADD y15, alpha3r, a7, y15 FMADD y16, alpha3i, a7, y16 FMSUBX y09, alpha3i, a2, y09 LFDU a1, 1 * SIZE(AO4) FMADDX y10, alpha3r, a2, y10 LFDU a2, 1 * SIZE(AO4) FMSUBX y11, alpha3i, a4, y11 LFDU a3, 1 * SIZE(AO4) FMADDX y12, alpha3r, a4, y12 LFDU a4, 1 * SIZE(AO4) #ifdef PPCG4 dcbt AO4, PREA #endif FMSUBX y13, alpha3i, a6, y13 LFDU a5, 1 * SIZE(AO4) FMADDX y14, alpha3r, a6, y14 LFDU a6, 1 * SIZE(AO4) FMSUBX y15, alpha3i, a8, y15 LFDU a7, 1 * SIZE(AO4) FMADDX y16, alpha3r, a8, y16 LFDU a8, 1 * SIZE(AO4) #if defined(PPCG4) && defined(DOUBLE) dcbt AO4, PREA #endif FMADD y09, alpha4r, a1, y09 FMADD y10, alpha4i, a1, y10 FMADD y11, alpha4r, a3, y11 FMADD y12, alpha4i, a3, y12 FMADD y13, alpha4r, a5, y13 FMADD y14, alpha4i, a5, y14 FMADD y15, alpha4r, a7, y15 FMADD y16, alpha4i, a7, y16 FMSUBX y09, alpha4i, a2, y09 LFDU a1, 1 * SIZE(AO1) FMADDX y10, alpha4r, a2, y10 LFDU a2, 1 * SIZE(AO1) FMSUBX y11, alpha4i, a4, y11 LFDU a3, 1 * SIZE(AO1) FMADDX y12, alpha4r, a4, y12 LFDU a4, 1 * SIZE(AO1) #ifdef PPCG4 dcbt AO1, PREA #endif FMSUBX y13, alpha4i, a6, y13 LFDU a5, 1 * SIZE(AO1) 
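/* Editor's note (descriptive comment added for readability; inferred from    */
/* the surrounding code, not part of the original source): LL(12) is the      */
/* software-pipelined main loop of this ZGEMV "N" kernel. Each iteration      */
/* updates four complex elements of y (y09..y16) with contributions from      */
/* four matrix columns (AO1..AO4). alpha1{r,i}..alpha4{r,i} already hold      */
/* alpha * x[j] for those columns, so each FMADD/FMSUBX/FMADDX group is one   */
/* complex multiply-accumulate; the X-suffixed macros resolve to FMADD or     */
/* FNMSUB depending on CONJ (see the #define block near the top of the file). */
/* The dcbt/dcbtst hints prefetch the next A columns and y block on PPCG4.    */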
FMADDX y14, alpha4r, a6, y14 LFDU a6, 1 * SIZE(AO1) FMSUBX y15, alpha4i, a8, y15 LFDU a7, 1 * SIZE(AO1) FMADDX y16, alpha4r, a8, y16 LFDU a8, 1 * SIZE(AO1) #if defined(PPCG4) && defined(DOUBLE) dcbt AO1, PREA #endif STFDU y09, 1 * SIZE(Y2) FMADD y09, alpha1r, a1, y01 STFDU y10, 1 * SIZE(Y2) FMADD y10, alpha1i, a1, y02 STFDU y11, 1 * SIZE(Y2) FMADD y11, alpha1r, a3, y03 STFDU y12, 1 * SIZE(Y2) FMADD y12, alpha1i, a3, y04 STFDU y13, 1 * SIZE(Y2) FMADD y13, alpha1r, a5, y05 STFDU y14, 1 * SIZE(Y2) FMADD y14, alpha1i, a5, y06 STFDU y15, 1 * SIZE(Y2) FMADD y15, alpha1r, a7, y07 STFDU y16, 1 * SIZE(Y2) FMADD y16, alpha1i, a7, y08 bdnz LL(12) .align 4 LL(13): FMSUBX y09, alpha1i, a2, y09 LFDU a1, 1 * SIZE(AO2) FMADDX y10, alpha1r, a2, y10 LFDU a2, 1 * SIZE(AO2) FMSUBX y11, alpha1i, a4, y11 LFDU a3, 1 * SIZE(AO2) FMADDX y12, alpha1r, a4, y12 LFDU a4, 1 * SIZE(AO2) FMSUBX y13, alpha1i, a6, y13 LFDU a5, 1 * SIZE(AO2) FMADDX y14, alpha1r, a6, y14 LFDU a6, 1 * SIZE(AO2) FMSUBX y15, alpha1i, a8, y15 LFDU a7, 1 * SIZE(AO2) FMADDX y16, alpha1r, a8, y16 LFDU a8, 1 * SIZE(AO2) FMADD y09, alpha2r, a1, y09 FMADD y10, alpha2i, a1, y10 FMADD y11, alpha2r, a3, y11 FMADD y12, alpha2i, a3, y12 FMADD y13, alpha2r, a5, y13 FMADD y14, alpha2i, a5, y14 FMADD y15, alpha2r, a7, y15 FMADD y16, alpha2i, a7, y16 FMSUBX y09, alpha2i, a2, y09 LFDU a1, 1 * SIZE(AO3) FMADDX y10, alpha2r, a2, y10 LFDU a2, 1 * SIZE(AO3) FMSUBX y11, alpha2i, a4, y11 LFDU a3, 1 * SIZE(AO3) FMADDX y12, alpha2r, a4, y12 LFDU a4, 1 * SIZE(AO3) FMSUBX y13, alpha2i, a6, y13 LFDU a5, 1 * SIZE(AO3) FMADDX y14, alpha2r, a6, y14 LFDU a6, 1 * SIZE(AO3) FMSUBX y15, alpha2i, a8, y15 LFDU a7, 1 * SIZE(AO3) FMADDX y16, alpha2r, a8, y16 LFDU a8, 1 * SIZE(AO3) FMADD y09, alpha3r, a1, y09 FMADD y10, alpha3i, a1, y10 FMADD y11, alpha3r, a3, y11 FMADD y12, alpha3i, a3, y12 FMADD y13, alpha3r, a5, y13 FMADD y14, alpha3i, a5, y14 FMADD y15, alpha3r, a7, y15 FMADD y16, alpha3i, a7, y16 FMSUBX y09, alpha3i, a2, y09 LFDU a1, 1 * SIZE(AO4) FMADDX y10, alpha3r, a2, y10 LFDU a2, 1 * SIZE(AO4) FMSUBX y11, alpha3i, a4, y11 LFDU a3, 1 * SIZE(AO4) FMADDX y12, alpha3r, a4, y12 LFDU a4, 1 * SIZE(AO4) FMSUBX y13, alpha3i, a6, y13 LFDU a5, 1 * SIZE(AO4) FMADDX y14, alpha3r, a6, y14 LFDU a6, 1 * SIZE(AO4) FMSUBX y15, alpha3i, a8, y15 LFDU a7, 1 * SIZE(AO4) FMADDX y16, alpha3r, a8, y16 LFDU a8, 1 * SIZE(AO4) FMADD y09, alpha4r, a1, y09 FMADD y10, alpha4i, a1, y10 FMADD y11, alpha4r, a3, y11 FMADD y12, alpha4i, a3, y12 FMADD y13, alpha4r, a5, y13 FMADD y14, alpha4i, a5, y14 FMADD y15, alpha4r, a7, y15 FMADD y16, alpha4i, a7, y16 FMSUBX y09, alpha4i, a2, y09 FMADDX y10, alpha4r, a2, y10 FMSUBX y11, alpha4i, a4, y11 FMADDX y12, alpha4r, a4, y12 FMSUBX y13, alpha4i, a6, y13 STFDU y09, 1 * SIZE(Y2) FMADDX y14, alpha4r, a6, y14 STFDU y10, 1 * SIZE(Y2) FMSUBX y15, alpha4i, a8, y15 STFDU y11, 1 * SIZE(Y2) FMADDX y16, alpha4r, a8, y16 STFDU y12, 1 * SIZE(Y2) STFDU y13, 1 * SIZE(Y2) STFDU y14, 1 * SIZE(Y2) STFDU y15, 1 * SIZE(Y2) STFDU y16, 1 * SIZE(Y2) .align 4 LL(15): andi. 
r0, M, 2 ble LL(17) LFDU a1, 1 * SIZE(AO1) LFDU y01, 1 * SIZE(Y1) LFDU a2, 1 * SIZE(AO1) LFDU y02, 1 * SIZE(Y1) LFDU a3, 1 * SIZE(AO1) LFDU y03, 1 * SIZE(Y1) LFDU a4, 1 * SIZE(AO1) LFDU y04, 1 * SIZE(Y1) FMADD y01, alpha1r, a1, y01 LFDU a5, 1 * SIZE(AO2) FMADD y02, alpha1i, a1, y02 LFDU a6, 1 * SIZE(AO2) FMADD y03, alpha1r, a3, y03 LFDU a7, 1 * SIZE(AO2) FMADD y04, alpha1i, a3, y04 LFDU a8, 1 * SIZE(AO2) FMSUBX y01, alpha1i, a2, y01 LFDU a1, 1 * SIZE(AO3) FMADDX y02, alpha1r, a2, y02 LFDU a2, 1 * SIZE(AO3) FMSUBX y03, alpha1i, a4, y03 LFDU a3, 1 * SIZE(AO3) FMADDX y04, alpha1r, a4, y04 LFDU a4, 1 * SIZE(AO3) FMADD y01, alpha2r, a5, y01 FMADD y02, alpha2i, a5, y02 FMADD y03, alpha2r, a7, y03 FMADD y04, alpha2i, a7, y04 FMSUBX y01, alpha2i, a6, y01 LFDU a5, 1 * SIZE(AO4) FMADDX y02, alpha2r, a6, y02 LFDU a6, 1 * SIZE(AO4) FMSUBX y03, alpha2i, a8, y03 LFDU a7, 1 * SIZE(AO4) FMADDX y04, alpha2r, a8, y04 LFDU a8, 1 * SIZE(AO4) FMADD y01, alpha3r, a1, y01 FMADD y02, alpha3i, a1, y02 FMADD y03, alpha3r, a3, y03 FMADD y04, alpha3i, a3, y04 FMSUBX y01, alpha3i, a2, y01 FMADDX y02, alpha3r, a2, y02 FMSUBX y03, alpha3i, a4, y03 FMADDX y04, alpha3r, a4, y04 FMADD y01, alpha4r, a5, y01 FMADD y02, alpha4i, a5, y02 FMADD y03, alpha4r, a7, y03 FMADD y04, alpha4i, a7, y04 FMSUBX y01, alpha4i, a6, y01 FMADDX y02, alpha4r, a6, y02 FMSUBX y03, alpha4i, a8, y03 FMADDX y04, alpha4r, a8, y04 STFDU y01, 1 * SIZE(Y2) STFDU y02, 1 * SIZE(Y2) STFDU y03, 1 * SIZE(Y2) STFDU y04, 1 * SIZE(Y2) .align 4 LL(17): andi. r0, M, 1 ble LL(19) LFDU y01, 1 * SIZE(Y1) LFDU y02, 1 * SIZE(Y1) LFDU a1, 1 * SIZE(AO1) LFDU a2, 1 * SIZE(AO1) LFDU a3, 1 * SIZE(AO2) LFDU a4, 1 * SIZE(AO2) FMADD y01, alpha1r, a1, y01 LFDU a5, 1 * SIZE(AO3) FMADD y02, alpha1i, a1, y02 LFDU a6, 1 * SIZE(AO3) FMSUBX y01, alpha1i, a2, y01 LFDU a7, 1 * SIZE(AO4) FMADDX y02, alpha1r, a2, y02 LFDU a8, 1 * SIZE(AO4) FMADD y01, alpha2r, a3, y01 FMADD y02, alpha2i, a3, y02 FMSUBX y01, alpha2i, a4, y01 FMADDX y02, alpha2r, a4, y02 FMADD y01, alpha3r, a5, y01 FMADD y02, alpha3i, a5, y02 FMSUBX y01, alpha3i, a6, y01 FMADDX y02, alpha3r, a6, y02 FMADD y01, alpha4r, a7, y01 FMADD y02, alpha4i, a7, y02 FMSUBX y01, alpha4i, a8, y01 FMADDX y02, alpha4r, a8, y02 STFDU y01, 1 * SIZE(Y2) STFDU y02, 1 * SIZE(Y2) .align 4 LL(19): addi J, J, -1 cmpi cr0, 0, J, 0 bgt LL(11) .align 4 LL(20): andi. J, N, 2 ble LL(30) lfd alpha_r, ALPHA_R lfd alpha_i, ALPHA_I LFDUX a1, X, INCX LFDU a2, 1 * SIZE(X) LFDUX a3, X, INCX LFDU a4, 1 * SIZE(X) FMUL alpha1r, alpha_r, a1 mr Y1, YY FMUL alpha1i, alpha_i, a1 mr Y2, YY FMUL alpha2r, alpha_r, a3 mr AO1, A FMUL alpha2i, alpha_i, a3 add AO2, A, LDA FMSUBR alpha1r, alpha_i, a2, alpha1r add A, AO2, LDA FMADDR alpha1i, alpha_r, a2, alpha1i srawi. 
r0, M, 2 FMSUBR alpha2r, alpha_i, a4, alpha2r mtspr CTR, r0 FMADDR alpha2i, alpha_r, a4, alpha2i ble LL(25) .align 4 LFDU a1, 1 * SIZE(AO1) LFDU y01, 1 * SIZE(Y1) LFDU a2, 1 * SIZE(AO1) LFDU y02, 1 * SIZE(Y1) LFDU a3, 1 * SIZE(AO1) LFDU y03, 1 * SIZE(Y1) LFDU a4, 1 * SIZE(AO1) LFDU y04, 1 * SIZE(Y1) LFDU a5, 1 * SIZE(AO1) LFDU y05, 1 * SIZE(Y1) LFDU a6, 1 * SIZE(AO1) LFDU y06, 1 * SIZE(Y1) LFDU a7, 1 * SIZE(AO1) LFDU y07, 1 * SIZE(Y1) LFDU a8, 1 * SIZE(AO1) LFDU y08, 1 * SIZE(Y1) FMADD y09, alpha1r, a1, y01 FMADD y10, alpha1i, a1, y02 FMADD y11, alpha1r, a3, y03 FMADD y12, alpha1i, a3, y04 FMADD y13, alpha1r, a5, y05 FMADD y14, alpha1i, a5, y06 FMADD y15, alpha1r, a7, y07 FMADD y16, alpha1i, a7, y08 bdz LL(23) .align 4 LL(22): FMSUBX y09, alpha1i, a2, y09 LFDU a1, 1 * SIZE(AO2) FMADDX y10, alpha1r, a2, y10 LFDU a2, 1 * SIZE(AO2) FMSUBX y11, alpha1i, a4, y11 LFDU a3, 1 * SIZE(AO2) FMADDX y12, alpha1r, a4, y12 LFDU a4, 1 * SIZE(AO2) #ifdef PPCG4 dcbt AO2, PREA #endif FMSUBX y13, alpha1i, a6, y13 LFDU a5, 1 * SIZE(AO2) FMADDX y14, alpha1r, a6, y14 LFDU a6, 1 * SIZE(AO2) FMSUBX y15, alpha1i, a8, y15 LFDU a7, 1 * SIZE(AO2) FMADDX y16, alpha1r, a8, y16 LFDU a8, 1 * SIZE(AO2) #if defined(PPCG4) && defined(DOUBLE) dcbt AO2, PREA #endif FMADD y09, alpha2r, a1, y09 LFDU y01, 1 * SIZE(Y1) FMADD y10, alpha2i, a1, y10 LFDU y02, 1 * SIZE(Y1) FMADD y11, alpha2r, a3, y11 LFDU y03, 1 * SIZE(Y1) FMADD y12, alpha2i, a3, y12 LFDU y04, 1 * SIZE(Y1) #ifdef PPCG4 dcbtst Y1, PREA #endif FMADD y13, alpha2r, a5, y13 LFDU y05, 1 * SIZE(Y1) FMADD y14, alpha2i, a5, y14 LFDU y06, 1 * SIZE(Y1) FMADD y15, alpha2r, a7, y15 LFDU y07, 1 * SIZE(Y1) FMADD y16, alpha2i, a7, y16 LFDU y08, 1 * SIZE(Y1) #if defined(PPCG4) && defined(DOUBLE) dcbtst Y1, PREA #endif FMSUBX y09, alpha2i, a2, y09 LFDU a1, 1 * SIZE(AO1) FMADDX y10, alpha2r, a2, y10 LFDU a2, 1 * SIZE(AO1) FMSUBX y11, alpha2i, a4, y11 LFDU a3, 1 * SIZE(AO1) FMADDX y12, alpha2r, a4, y12 LFDU a4, 1 * SIZE(AO1) #ifdef PPCG4 dcbt AO1, PREA #endif FMSUBX y13, alpha2i, a6, y13 LFDU a5, 1 * SIZE(AO1) FMADDX y14, alpha2r, a6, y14 LFDU a6, 1 * SIZE(AO1) FMSUBX y15, alpha2i, a8, y15 LFDU a7, 1 * SIZE(AO1) FMADDX y16, alpha2r, a8, y16 LFDU a8, 1 * SIZE(AO1) #if defined(PPCG4) && defined(DOUBLE) dcbt AO1, PREA #endif STFDU y09, 1 * SIZE(Y2) FMADD y09, alpha1r, a1, y01 STFDU y10, 1 * SIZE(Y2) FMADD y10, alpha1i, a1, y02 STFDU y11, 1 * SIZE(Y2) FMADD y11, alpha1r, a3, y03 STFDU y12, 1 * SIZE(Y2) FMADD y12, alpha1i, a3, y04 STFDU y13, 1 * SIZE(Y2) FMADD y13, alpha1r, a5, y05 STFDU y14, 1 * SIZE(Y2) FMADD y14, alpha1i, a5, y06 STFDU y15, 1 * SIZE(Y2) FMADD y15, alpha1r, a7, y07 STFDU y16, 1 * SIZE(Y2) FMADD y16, alpha1i, a7, y08 bdnz LL(22) .align 4 LL(23): FMSUBX y09, alpha1i, a2, y09 LFDU a1, 1 * SIZE(AO2) FMADDX y10, alpha1r, a2, y10 LFDU a2, 1 * SIZE(AO2) FMSUBX y11, alpha1i, a4, y11 LFDU a3, 1 * SIZE(AO2) FMADDX y12, alpha1r, a4, y12 LFDU a4, 1 * SIZE(AO2) FMSUBX y13, alpha1i, a6, y13 LFDU a5, 1 * SIZE(AO2) FMADDX y14, alpha1r, a6, y14 LFDU a6, 1 * SIZE(AO2) FMSUBX y15, alpha1i, a8, y15 LFDU a7, 1 * SIZE(AO2) FMADDX y16, alpha1r, a8, y16 LFDU a8, 1 * SIZE(AO2) FMADD y09, alpha2r, a1, y09 FMADD y10, alpha2i, a1, y10 FMADD y11, alpha2r, a3, y11 FMADD y12, alpha2i, a3, y12 FMADD y13, alpha2r, a5, y13 FMADD y14, alpha2i, a5, y14 FMADD y15, alpha2r, a7, y15 FMADD y16, alpha2i, a7, y16 FMSUBX y09, alpha2i, a2, y09 FMADDX y10, alpha2r, a2, y10 FMSUBX y11, alpha2i, a4, y11 FMADDX y12, alpha2r, a4, y12 FMSUBX y13, alpha2i, a6, y13 STFDU y09, 1 * SIZE(Y2) FMADDX y14, alpha2r, a6, y14 STFDU 
y10, 1 * SIZE(Y2) FMSUBX y15, alpha2i, a8, y15 STFDU y11, 1 * SIZE(Y2) FMADDX y16, alpha2r, a8, y16 STFDU y12, 1 * SIZE(Y2) STFDU y13, 1 * SIZE(Y2) STFDU y14, 1 * SIZE(Y2) STFDU y15, 1 * SIZE(Y2) STFDU y16, 1 * SIZE(Y2) .align 4 LL(25): andi. r0, M, 2 ble LL(27) LFDU a1, 1 * SIZE(AO1) LFDU y01, 1 * SIZE(Y1) LFDU a2, 1 * SIZE(AO1) LFDU y02, 1 * SIZE(Y1) LFDU a3, 1 * SIZE(AO1) LFDU y03, 1 * SIZE(Y1) LFDU a4, 1 * SIZE(AO1) LFDU y04, 1 * SIZE(Y1) FMADD y01, alpha1r, a1, y01 LFDU a5, 1 * SIZE(AO2) FMADD y02, alpha1i, a1, y02 LFDU a6, 1 * SIZE(AO2) FMADD y03, alpha1r, a3, y03 LFDU a7, 1 * SIZE(AO2) FMADD y04, alpha1i, a3, y04 LFDU a8, 1 * SIZE(AO2) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMADD y01, alpha2r, a5, y01 FMADD y02, alpha2i, a5, y02 FMADD y03, alpha2r, a7, y03 FMADD y04, alpha2i, a7, y04 FMSUBX y01, alpha2i, a6, y01 FMADDX y02, alpha2r, a6, y02 FMSUBX y03, alpha2i, a8, y03 FMADDX y04, alpha2r, a8, y04 STFDU y01, 1 * SIZE(Y2) STFDU y02, 1 * SIZE(Y2) STFDU y03, 1 * SIZE(Y2) STFDU y04, 1 * SIZE(Y2) .align 4 LL(27): andi. r0, M, 1 ble LL(30) LFDU y01, 1 * SIZE(Y1) LFDU y02, 1 * SIZE(Y1) LFDU a1, 1 * SIZE(AO1) LFDU a2, 1 * SIZE(AO1) LFDU a3, 1 * SIZE(AO2) LFDU a4, 1 * SIZE(AO2) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMADD y01, alpha2r, a3, y01 FMADD y02, alpha2i, a3, y02 FMSUBX y01, alpha2i, a4, y01 FMADDX y02, alpha2r, a4, y02 STFDU y01, 1 * SIZE(Y2) STFDU y02, 1 * SIZE(Y2) .align 4 LL(30): andi. J, N, 1 ble LL(990) .align 4 lfd alpha_r, ALPHA_R lfd alpha_i, ALPHA_I LFDUX a1, X, INCX LFDU a2, 1 * SIZE(X) FMUL alpha1r, alpha_r, a1 mr Y1, YY mr Y2, YY FMUL alpha1i, alpha_i, a1 mr AO1, A add A, A, LDA FMSUBR alpha1r, alpha_i, a2, alpha1r srawi. 
r0, M, 2 mtspr CTR, r0 FMADDR alpha1i, alpha_r, a2, alpha1i ble LL(35) .align 4 LFDU a1, 1 * SIZE(AO1) LFDU y01, 1 * SIZE(Y1) LFDU a2, 1 * SIZE(AO1) LFDU y02, 1 * SIZE(Y1) LFDU a3, 1 * SIZE(AO1) LFDU y03, 1 * SIZE(Y1) LFDU a4, 1 * SIZE(AO1) LFDU y04, 1 * SIZE(Y1) LFDU a5, 1 * SIZE(AO1) LFDU y05, 1 * SIZE(Y1) LFDU a6, 1 * SIZE(AO1) LFDU y06, 1 * SIZE(Y1) LFDU a7, 1 * SIZE(AO1) LFDU y07, 1 * SIZE(Y1) LFDU a8, 1 * SIZE(AO1) LFDU y08, 1 * SIZE(Y1) FMADD y09, alpha1r, a1, y01 FMADD y10, alpha1i, a1, y02 FMADD y11, alpha1r, a3, y03 FMADD y12, alpha1i, a3, y04 FMADD y13, alpha1r, a5, y05 FMADD y14, alpha1i, a5, y06 FMADD y15, alpha1r, a7, y07 FMADD y16, alpha1i, a7, y08 bdz LL(33) .align 4 LL(32): FMSUBX y09, alpha1i, a2, y09 LFDU a1, 1 * SIZE(AO1) FMADDX y10, alpha1r, a2, y10 LFDU a2, 1 * SIZE(AO1) FMSUBX y11, alpha1i, a4, y11 LFDU a3, 1 * SIZE(AO1) FMADDX y12, alpha1r, a4, y12 LFDU a4, 1 * SIZE(AO1) #ifdef PPCG4 dcbt AO1, PREA #endif LFDU y01, 1 * SIZE(Y1) LFDU y02, 1 * SIZE(Y1) LFDU y03, 1 * SIZE(Y1) LFDU y04, 1 * SIZE(Y1) #ifdef PPCG4 dcbtst Y1, PREA #endif FMSUBX y13, alpha1i, a6, y13 LFDU a5, 1 * SIZE(AO1) FMADDX y14, alpha1r, a6, y14 LFDU a6, 1 * SIZE(AO1) FMSUBX y15, alpha1i, a8, y15 LFDU a7, 1 * SIZE(AO1) FMADDX y16, alpha1r, a8, y16 LFDU a8, 1 * SIZE(AO1) #if defined(PPCG4) && defined(DOUBLE) dcbt AO1, PREA #endif LFDU y05, 1 * SIZE(Y1) LFDU y06, 1 * SIZE(Y1) LFDU y07, 1 * SIZE(Y1) LFDU y08, 1 * SIZE(Y1) #if defined(PPCG4) && defined(DOUBLE) dcbtst Y1, PREA #endif STFDU y09, 1 * SIZE(Y2) FMADD y09, alpha1r, a1, y01 STFDU y10, 1 * SIZE(Y2) FMADD y10, alpha1i, a1, y02 STFDU y11, 1 * SIZE(Y2) FMADD y11, alpha1r, a3, y03 STFDU y12, 1 * SIZE(Y2) FMADD y12, alpha1i, a3, y04 STFDU y13, 1 * SIZE(Y2) FMADD y13, alpha1r, a5, y05 STFDU y14, 1 * SIZE(Y2) FMADD y14, alpha1i, a5, y06 STFDU y15, 1 * SIZE(Y2) FMADD y15, alpha1r, a7, y07 STFDU y16, 1 * SIZE(Y2) FMADD y16, alpha1i, a7, y08 bdnz LL(32) .align 4 LL(33): FMSUBX y09, alpha1i, a2, y09 FMADDX y10, alpha1r, a2, y10 FMSUBX y11, alpha1i, a4, y11 FMADDX y12, alpha1r, a4, y12 FMSUBX y13, alpha1i, a6, y13 STFDU y09, 1 * SIZE(Y2) FMADDX y14, alpha1r, a6, y14 STFDU y10, 1 * SIZE(Y2) FMSUBX y15, alpha1i, a8, y15 STFDU y11, 1 * SIZE(Y2) FMADDX y16, alpha1r, a8, y16 STFDU y12, 1 * SIZE(Y2) STFDU y13, 1 * SIZE(Y2) STFDU y14, 1 * SIZE(Y2) STFDU y15, 1 * SIZE(Y2) STFDU y16, 1 * SIZE(Y2) .align 4 LL(35): andi. r0, M, 2 ble LL(37) LFDU a1, 1 * SIZE(AO1) LFDU y01, 1 * SIZE(Y1) LFDU a2, 1 * SIZE(AO1) LFDU y02, 1 * SIZE(Y1) LFDU a3, 1 * SIZE(AO1) LFDU y03, 1 * SIZE(Y1) LFDU a4, 1 * SIZE(AO1) LFDU y04, 1 * SIZE(Y1) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 STFDU y01, 1 * SIZE(Y2) STFDU y02, 1 * SIZE(Y2) STFDU y03, 1 * SIZE(Y2) STFDU y04, 1 * SIZE(Y2) .align 4 LL(37): andi. r0, M, 1 ble LL(990) LFDU y01, 1 * SIZE(Y1) LFDU a1, 1 * SIZE(AO1) LFDU y02, 1 * SIZE(Y1) LFDU a2, 1 * SIZE(AO1) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 STFDU y01, 1 * SIZE(Y2) STFDU y02, 1 * SIZE(Y2) .align 4 LL(990): cmpi cr0, 0, INCY, SIZE beq LL(999) addi YY, BUFFER, -SIZE mr Y1, Y srawi. 
r0, M, 2 mtspr CTR, r0 ble LL(995) .align 4 LL(991): LFDUX f0, Y, INCY LFDU f1, 1 * SIZE(Y) LFDUX f2, Y, INCY LFDU f3, 1 * SIZE(Y) LFDUX f4, Y, INCY LFDU f5, 1 * SIZE(Y) LFDUX f6, Y, INCY LFDU f7, 1 * SIZE(Y) LFDU f8, 1 * SIZE(YY) LFDU f9, 1 * SIZE(YY) LFDU f10, 1 * SIZE(YY) LFDU f11, 1 * SIZE(YY) LFDU f12, 1 * SIZE(YY) LFDU f13, 1 * SIZE(YY) LFDU f14, 1 * SIZE(YY) LFDU f15, 1 * SIZE(YY) FADD f8, f8, f0 FADD f9, f9, f1 FADD f10, f10, f2 FADD f11, f11, f3 FADD f12, f12, f4 FADD f13, f13, f5 FADD f14, f14, f6 FADD f15, f15, f7 STFDUX f8, Y1, INCY STFDU f9, 1 * SIZE(Y1) STFDUX f10, Y1, INCY STFDU f11, 1 * SIZE(Y1) STFDUX f12, Y1, INCY STFDU f13, 1 * SIZE(Y1) STFDUX f14, Y1, INCY STFDU f15, 1 * SIZE(Y1) bdnz LL(991) .align 4 LL(995): andi. J, M, 2 ble LL(996) LFDUX f0, Y, INCY LFDU f1, 1 * SIZE(Y) LFDUX f2, Y, INCY LFDU f3, 1 * SIZE(Y) LFDU f8, 1 * SIZE(YY) LFDU f9, 1 * SIZE(YY) LFDU f10, 1 * SIZE(YY) LFDU f11, 1 * SIZE(YY) FADD f8, f8, f0 FADD f9, f9, f1 FADD f10, f10, f2 FADD f11, f11, f3 STFDUX f8, Y1, INCY STFDU f9, 1 * SIZE(Y1) STFDUX f10, Y1, INCY STFDU f11, 1 * SIZE(Y1) .align 4 LL(996): andi. J, M, 1 ble LL(999) LFDUX f0, Y, INCY LFDU f1, 1 * SIZE(Y) LFDU f8, 1 * SIZE(YY) LFDU f9, 1 * SIZE(YY) FADD f8, f8, f0 FADD f9, f9, f1 STFDUX f8, Y1, INCY STFDU f9, 1 * SIZE(Y1) .align 4 LL(999): li r3, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r14, 144(SP) ld r15, 152(SP) ld r16, 160(SP) ld r17, 168(SP) ld r18, 176(SP) ld r19, 184(SP) ld r20, 192(SP) ld r21, 200(SP) ld r22, 208(SP) #else lwz r14, 144(SP) lwz r15, 148(SP) lwz r16, 152(SP) lwz r17, 156(SP) lwz r18, 160(SP) lwz r19, 164(SP) lwz r20, 168(SP) lwz r21, 172(SP) lwz r22, 176(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/zgemv_t.S000066400000000000000000000736461313527062700173770ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define P 2048 #ifndef __64BIT__ #define STACKSIZE 224 #else #define STACKSIZE 304 #endif #ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 #define A r6 #define LDA r7 #define X r8 #define INCX r9 #define Y r10 #define INCY r5 #else #define M r3 #define N r4 #define A r8 #define LDA r9 #define X r10 #define INCX r5 #define Y r6 #define INCY r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define M r3 #define N r4 #define A r10 #define LDA r5 #define X r6 #define INCX r7 #define Y r8 #define INCY r9 #else #define M r3 #define N r4 #define A r8 #define LDA r9 #define X r10 #define INCX r5 #define Y r6 #define INCY r7 #endif #endif #define BUFFER r11 #define XP r12 #define MIN_N r14 #define J r15 #define CO r16 #define BO r17 #define PLDA_M r18 #define AO1 r19 #define AO2 r20 #define AO3 r21 #define AO4 r22 #define IS r23 #define PREA r24 #define PREC r25 #define Y1 r23 /* dummy; should be same as gemv_n.S */ #define Y2 r24 /* dummy; should be same as gemv_n.S */ #if defined(PPCG4) #define PREFETCHSIZE_A 34 #define PREFETCHSIZE_C 16 #endif #if defined(PPC440) || defined(PPC440FP2) #define PREFETCHSIZE_A 34 #define PREFETCHSIZE_C 16 #endif #ifdef PPC970 #define PREFETCHSIZE_A 56 #define PREFETCHSIZE_C 16 #endif #ifdef CELL #define PREFETCHSIZE_A 56 #define PREFETCHSIZE_C 16 #endif #ifdef POWER4 #define PREFETCHSIZE_A 34 #define PREFETCHSIZE_C 16 #endif #ifdef POWER5 #define PREFETCHSIZE_A 40 #define PREFETCHSIZE_C 8 #endif #ifdef POWER6 #define PREFETCHSIZE_A 24 #define PREFETCHSIZE_C 8 #endif #ifdef POWER8 #define PREFETCHSIZE_A 24 #define PREFETCHSIZE_C 8 #endif #if !(defined(CONJ) && defined(XCONJ)) #define FMADDR FMADD #define FMSUBR FNMSUB #else #define FMADDR FNMSUB #define FMSUBR FMADD #endif #ifndef NEEDPARAM #ifndef __64BIT__ #define FZERO 200(SP) #define ALPHA_R 208(SP) #define ALPHA_I 216(SP) #else #define FZERO 256(SP) #define ALPHA_R 264(SP) #define ALPHA_I 272(SP) #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r14, 144(SP) std r15, 152(SP) std r16, 160(SP) std r17, 168(SP) std r18, 176(SP) std r19, 184(SP) std r20, 192(SP) std r21, 200(SP) std r22, 208(SP) std r23, 216(SP) std r24, 224(SP) std r25, 232(SP) std r0, FZERO #else stw r14, 144(SP) stw r15, 
148(SP) stw r16, 152(SP) stw r17, 156(SP) stw r18, 160(SP) stw r19, 164(SP) stw r20, 168(SP) stw r21, 172(SP) stw r22, 176(SP) stw r23, 180(SP) stw r24, 184(SP) stw r25, 188(SP) stw r0, FZERO stw r0, 4 + FZERO #endif #ifdef linux #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #else ld INCX, FRAMESLOT(0) + STACKSIZE(SP) ld Y, FRAMESLOT(1) + STACKSIZE(SP) ld INCY, FRAMESLOT(2) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) lwz X, FRAMESLOT(1) + STACKSIZE(SP) lwz INCX, FRAMESLOT(2) + STACKSIZE(SP) lwz Y, FRAMESLOT(3) + STACKSIZE(SP) lwz INCY, FRAMESLOT(4) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(5) + STACKSIZE(SP) #else lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) lwz Y, FRAMESLOT(1) + STACKSIZE(SP) lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #else ld INCX, FRAMESLOT(0) + STACKSIZE(SP) ld Y, FRAMESLOT(1) + STACKSIZE(SP) ld INCY, FRAMESLOT(2) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #endif stfd f1, ALPHA_R stfd f2, ALPHA_I mullw PLDA_M, LDA, N li XP, P subf PLDA_M, XP, PLDA_M slwi PLDA_M, PLDA_M, ZBASE_SHIFT slwi LDA, LDA, ZBASE_SHIFT slwi INCX, INCX, ZBASE_SHIFT slwi INCY, INCY, ZBASE_SHIFT li IS, 0 li PREA, PREFETCHSIZE_A * SIZE li PREC, PREFETCHSIZE_C * SIZE cmpwi cr0, M, 0 ble LL(End) cmpwi cr0, N, 0 ble LL(End) .align 4 LL(ISLoop): subf MIN_N, IS, M slwi r0, IS, ZBASE_SHIFT cmpi cr0, 0, MIN_N, P ble+ LL(min_nP) li MIN_N, P LL(min_nP): add XP, X, r0 cmpwi cr0, INCX, 2 * SIZE beq LL(Main) mr XP, BUFFER addi CO, BUFFER, -SIZE srawi. r0, MIN_N, 2 mtspr CTR, r0 ble LL(CopyRemain) .align 4 LL(CopyKernel): LFD f0, 0 * SIZE(X) LFD f1, 1 * SIZE(X) add X, X, INCX LFD f2, 0 * SIZE(X) LFD f3, 1 * SIZE(X) add X, X, INCX LFD f4, 0 * SIZE(X) LFD f5, 1 * SIZE(X) add X, X, INCX LFD f6, 0 * SIZE(X) LFD f7, 1 * SIZE(X) add X, X, INCX STFD f0, 1 * SIZE(CO) STFD f1, 2 * SIZE(CO) STFD f2, 3 * SIZE(CO) STFD f3, 4 * SIZE(CO) STFD f4, 5 * SIZE(CO) STFD f5, 6 * SIZE(CO) STFD f6, 7 * SIZE(CO) STFDU f7, 8 * SIZE(CO) bdnz LL(CopyKernel) .align 4 LL(CopyRemain): andi. r0, MIN_N, 3 mtspr CTR, r0 ble LL(Main) .align 4 LL(CopySub): LFD f0, 0 * SIZE(X) LFD f1, 1 * SIZE(X) add X, X, INCX STFD f0, 1 * SIZE(CO) STFDU f1, 2 * SIZE(CO) bdnz LL(CopySub) .align 4 LL(Main): mr CO, Y addi XP, XP, -SIZE srawi. J, N, 2 ble LL(Remain) .align 4 LL(MainHead): mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA mr BO, XP lfd f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 dcbtst PREC, CO srawi. 
r0, MIN_N, 3 mtspr CTR, r0 ble LL(MainN3) LFD f16, 0 * SIZE(AO1) LFD f17, 1 * SIZE(AO1) LFD f18, 0 * SIZE(AO2) LFD f19, 1 * SIZE(AO2) LFD f20, 0 * SIZE(AO3) LFD f21, 1 * SIZE(AO3) LFD f22, 0 * SIZE(AO4) LFD f23, 1 * SIZE(AO4) LFD f24, 1 * SIZE(BO) LFD f25, 2 * SIZE(BO) LFD f26, 3 * SIZE(BO) LFD f27, 4 * SIZE(BO) LFD f28, 5 * SIZE(BO) LFD f29, 6 * SIZE(BO) LFD f30, 7 * SIZE(BO) LFD f31, 8 * SIZE(BO) bdz LL(MainKernelSkip) .align 5 LL(MainKernel): FMADD f0, f16, f24, f0 FMADD f1, f16, f25, f1 FMADD f2, f17, f24, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f24, f4 FMADD f5, f18, f25, f5 FMADD f6, f19, f24, f6 FMADD f7, f19, f25, f7 LFD f16, 2 * SIZE(AO1) LFD f17, 3 * SIZE(AO1) LFD f18, 2 * SIZE(AO2) LFD f19, 3 * SIZE(AO2) FMADD f8, f20, f24, f8 FMADD f9, f20, f25, f9 FMADD f10, f21, f24, f10 FMADD f11, f21, f25, f11 FMADD f12, f22, f24, f12 FMADD f13, f22, f25, f13 FMADD f14, f23, f24, f14 FMADD f15, f23, f25, f15 LFD f20, 2 * SIZE(AO3) LFD f21, 3 * SIZE(AO3) LFD f22, 2 * SIZE(AO4) LFD f23, 3 * SIZE(AO4) FMADD f0, f16, f26, f0 FMADD f1, f16, f27, f1 FMADD f2, f17, f26, f2 FMADD f3, f17, f27, f3 FMADD f4, f18, f26, f4 FMADD f5, f18, f27, f5 FMADD f6, f19, f26, f6 FMADD f7, f19, f27, f7 LFD f16, 4 * SIZE(AO1) LFD f17, 5 * SIZE(AO1) LFD f18, 4 * SIZE(AO2) LFD f19, 5 * SIZE(AO2) FMADD f8, f20, f26, f8 FMADD f9, f20, f27, f9 FMADD f10, f21, f26, f10 FMADD f11, f21, f27, f11 FMADD f12, f22, f26, f12 FMADD f13, f22, f27, f13 FMADD f14, f23, f26, f14 FMADD f15, f23, f27, f15 LFD f20, 4 * SIZE(AO3) LFD f21, 5 * SIZE(AO3) LFD f22, 4 * SIZE(AO4) LFD f23, 5 * SIZE(AO4) LFD f24, 9 * SIZE(BO) LFD f25, 10 * SIZE(BO) LFD f26, 11 * SIZE(BO) LFD f27, 12 * SIZE(BO) FMADD f0, f16, f28, f0 FMADD f1, f16, f29, f1 FMADD f2, f17, f28, f2 FMADD f3, f17, f29, f3 FMADD f4, f18, f28, f4 FMADD f5, f18, f29, f5 FMADD f6, f19, f28, f6 FMADD f7, f19, f29, f7 LFD f16, 6 * SIZE(AO1) LFD f17, 7 * SIZE(AO1) LFD f18, 6 * SIZE(AO2) LFD f19, 7 * SIZE(AO2) FMADD f8, f20, f28, f8 FMADD f9, f20, f29, f9 FMADD f10, f21, f28, f10 FMADD f11, f21, f29, f11 FMADD f12, f22, f28, f12 FMADD f13, f22, f29, f13 FMADD f14, f23, f28, f14 FMADD f15, f23, f29, f15 LFD f20, 6 * SIZE(AO3) LFD f21, 7 * SIZE(AO3) LFD f22, 6 * SIZE(AO4) LFD f23, 7 * SIZE(AO4) FMADD f0, f16, f30, f0 FMADD f1, f16, f31, f1 FMADD f2, f17, f30, f2 FMADD f3, f17, f31, f3 FMADD f4, f18, f30, f4 FMADD f5, f18, f31, f5 FMADD f6, f19, f30, f6 FMADD f7, f19, f31, f7 LFD f16, 8 * SIZE(AO1) LFD f17, 9 * SIZE(AO1) LFD f18, 8 * SIZE(AO2) LFD f19, 9 * SIZE(AO2) FMADD f8, f20, f30, f8 FMADD f9, f20, f31, f9 FMADD f10, f21, f30, f10 FMADD f11, f21, f31, f11 FMADD f12, f22, f30, f12 FMADD f13, f22, f31, f13 FMADD f14, f23, f30, f14 FMADD f15, f23, f31, f15 LFD f20, 8 * SIZE(AO3) LFD f21, 9 * SIZE(AO3) LFD f22, 8 * SIZE(AO4) LFD f23, 9 * SIZE(AO4) LFD f28, 13 * SIZE(BO) LFD f29, 14 * SIZE(BO) LFD f30, 15 * SIZE(BO) LFD f31, 16 * SIZE(BO) FMADD f0, f16, f24, f0 FMADD f1, f16, f25, f1 FMADD f2, f17, f24, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f24, f4 FMADD f5, f18, f25, f5 FMADD f6, f19, f24, f6 FMADD f7, f19, f25, f7 LFD f16, 10 * SIZE(AO1) LFD f17, 11 * SIZE(AO1) LFD f18, 10 * SIZE(AO2) LFD f19, 11 * SIZE(AO2) FMADD f8, f20, f24, f8 FMADD f9, f20, f25, f9 FMADD f10, f21, f24, f10 FMADD f11, f21, f25, f11 FMADD f12, f22, f24, f12 FMADD f13, f22, f25, f13 FMADD f14, f23, f24, f14 FMADD f15, f23, f25, f15 LFD f20, 10 * SIZE(AO3) LFD f21, 11 * SIZE(AO3) LFD f22, 10 * SIZE(AO4) LFD f23, 11 * SIZE(AO4) FMADD f0, f16, f26, f0 FMADD f1, f16, f27, f1 FMADD f2, f17, f26, f2 FMADD f3, f17, f27, f3 
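/* Editor's note (descriptive comment, inferred from the surrounding code;    */
/* not part of the original source): LL(MainKernel) is the unrolled inner     */
/* loop of this ZGEMV "T" kernel, consuming eight complex elements per pass.  */
/* f0..f15 accumulate the four partial complex dot products of columns        */
/* AO1..AO4 with the (possibly buffered) x vector addressed through BO: per   */
/* column, the four accumulators hold sum(a_re*x_re), sum(a_re*x_im),         */
/* sum(a_im*x_re) and sum(a_im*x_im). The real/imaginary parts are combined   */
/* (FADD/FSUB according to CONJ/XCONJ), scaled by alpha and added into y at   */
/* LL(MainFinish) below.                                                      */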
FMADD f4, f18, f26, f4 FMADD f5, f18, f27, f5 FMADD f6, f19, f26, f6 FMADD f7, f19, f27, f7 LFD f16, 12 * SIZE(AO1) LFD f17, 13 * SIZE(AO1) LFD f18, 12 * SIZE(AO2) LFD f19, 13 * SIZE(AO2) FMADD f8, f20, f26, f8 FMADD f9, f20, f27, f9 FMADD f10, f21, f26, f10 FMADD f11, f21, f27, f11 FMADD f12, f22, f26, f12 FMADD f13, f22, f27, f13 FMADD f14, f23, f26, f14 FMADD f15, f23, f27, f15 LFD f20, 12 * SIZE(AO3) LFD f21, 13 * SIZE(AO3) LFD f22, 12 * SIZE(AO4) LFD f23, 13 * SIZE(AO4) LFD f24, 17 * SIZE(BO) LFD f25, 18 * SIZE(BO) LFD f26, 19 * SIZE(BO) LFD f27, 20 * SIZE(BO) FMADD f0, f16, f28, f0 FMADD f1, f16, f29, f1 FMADD f2, f17, f28, f2 FMADD f3, f17, f29, f3 FMADD f4, f18, f28, f4 FMADD f5, f18, f29, f5 FMADD f6, f19, f28, f6 FMADD f7, f19, f29, f7 LFD f16, 14 * SIZE(AO1) LFD f17, 15 * SIZE(AO1) LFD f18, 14 * SIZE(AO2) LFD f19, 15 * SIZE(AO2) FMADD f8, f20, f28, f8 FMADD f9, f20, f29, f9 FMADD f10, f21, f28, f10 FMADD f11, f21, f29, f11 FMADD f12, f22, f28, f12 FMADD f13, f22, f29, f13 FMADD f14, f23, f28, f14 FMADD f15, f23, f29, f15 LFD f20, 14 * SIZE(AO3) LFD f21, 15 * SIZE(AO3) LFD f22, 14 * SIZE(AO4) LFD f23, 15 * SIZE(AO4) FMADD f0, f16, f30, f0 FMADD f1, f16, f31, f1 FMADD f2, f17, f30, f2 FMADD f3, f17, f31, f3 FMADD f4, f18, f30, f4 FMADD f5, f18, f31, f5 FMADD f6, f19, f30, f6 FMADD f7, f19, f31, f7 LFD f16, 16 * SIZE(AO1) LFD f17, 17 * SIZE(AO1) LFD f18, 16 * SIZE(AO2) LFD f19, 17 * SIZE(AO2) addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE DCBT(AO1, PREA) DCBT(AO2, PREA) FMADD f8, f20, f30, f8 FMADD f9, f20, f31, f9 FMADD f10, f21, f30, f10 FMADD f11, f21, f31, f11 FMADD f12, f22, f30, f12 FMADD f13, f22, f31, f13 FMADD f14, f23, f30, f14 FMADD f15, f23, f31, f15 LFD f20, 16 * SIZE(AO3) LFD f21, 17 * SIZE(AO3) LFD f22, 16 * SIZE(AO4) LFD f23, 17 * SIZE(AO4) LFD f28, 21 * SIZE(BO) LFD f29, 22 * SIZE(BO) LFD f30, 23 * SIZE(BO) LFD f31, 24 * SIZE(BO) addi AO3, AO3, 16 * SIZE addi AO4, AO4, 16 * SIZE DCBT(AO3, PREA) DCBT(AO4, PREA) addi BO, BO, 16 * SIZE bdnz LL(MainKernel) .align 4 LL(MainKernelSkip): FMADD f0, f16, f24, f0 FMADD f1, f16, f25, f1 FMADD f2, f17, f24, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f24, f4 FMADD f5, f18, f25, f5 FMADD f6, f19, f24, f6 FMADD f7, f19, f25, f7 LFD f16, 2 * SIZE(AO1) LFD f17, 3 * SIZE(AO1) LFD f18, 2 * SIZE(AO2) LFD f19, 3 * SIZE(AO2) FMADD f8, f20, f24, f8 FMADD f9, f20, f25, f9 FMADD f10, f21, f24, f10 FMADD f11, f21, f25, f11 FMADD f12, f22, f24, f12 FMADD f13, f22, f25, f13 FMADD f14, f23, f24, f14 FMADD f15, f23, f25, f15 LFD f20, 2 * SIZE(AO3) LFD f21, 3 * SIZE(AO3) LFD f22, 2 * SIZE(AO4) LFD f23, 3 * SIZE(AO4) FMADD f0, f16, f26, f0 FMADD f1, f16, f27, f1 FMADD f2, f17, f26, f2 FMADD f3, f17, f27, f3 FMADD f4, f18, f26, f4 FMADD f5, f18, f27, f5 FMADD f6, f19, f26, f6 FMADD f7, f19, f27, f7 LFD f16, 4 * SIZE(AO1) LFD f17, 5 * SIZE(AO1) LFD f18, 4 * SIZE(AO2) LFD f19, 5 * SIZE(AO2) FMADD f8, f20, f26, f8 FMADD f9, f20, f27, f9 FMADD f10, f21, f26, f10 FMADD f11, f21, f27, f11 FMADD f12, f22, f26, f12 FMADD f13, f22, f27, f13 FMADD f14, f23, f26, f14 FMADD f15, f23, f27, f15 LFD f20, 4 * SIZE(AO3) LFD f21, 5 * SIZE(AO3) LFD f22, 4 * SIZE(AO4) LFD f23, 5 * SIZE(AO4) FMADD f0, f16, f28, f0 FMADD f1, f16, f29, f1 FMADD f2, f17, f28, f2 FMADD f3, f17, f29, f3 FMADD f4, f18, f28, f4 FMADD f5, f18, f29, f5 FMADD f6, f19, f28, f6 FMADD f7, f19, f29, f7 LFD f16, 6 * SIZE(AO1) LFD f17, 7 * SIZE(AO1) LFD f18, 6 * SIZE(AO2) LFD f19, 7 * SIZE(AO2) FMADD f8, f20, f28, f8 FMADD f9, f20, f29, f9 FMADD f10, f21, f28, f10 FMADD f11, f21, f29, f11 FMADD f12, 
f22, f28, f12 FMADD f13, f22, f29, f13 FMADD f14, f23, f28, f14 FMADD f15, f23, f29, f15 LFD f20, 6 * SIZE(AO3) LFD f21, 7 * SIZE(AO3) LFD f22, 6 * SIZE(AO4) LFD f23, 7 * SIZE(AO4) FMADD f0, f16, f30, f0 FMADD f1, f16, f31, f1 FMADD f2, f17, f30, f2 FMADD f3, f17, f31, f3 FMADD f4, f18, f30, f4 FMADD f5, f18, f31, f5 FMADD f6, f19, f30, f6 FMADD f7, f19, f31, f7 LFD f16, 8 * SIZE(AO1) LFD f17, 9 * SIZE(AO1) LFD f18, 8 * SIZE(AO2) LFD f19, 9 * SIZE(AO2) FMADD f8, f20, f30, f8 FMADD f9, f20, f31, f9 FMADD f10, f21, f30, f10 FMADD f11, f21, f31, f11 FMADD f12, f22, f30, f12 FMADD f13, f22, f31, f13 FMADD f14, f23, f30, f14 FMADD f15, f23, f31, f15 LFD f20, 8 * SIZE(AO3) LFD f21, 9 * SIZE(AO3) LFD f22, 8 * SIZE(AO4) LFD f23, 9 * SIZE(AO4) LFD f24, 9 * SIZE(BO) LFD f25, 10 * SIZE(BO) LFD f26, 11 * SIZE(BO) LFD f27, 12 * SIZE(BO) LFD f28, 13 * SIZE(BO) LFD f29, 14 * SIZE(BO) LFD f30, 15 * SIZE(BO) LFDU f31, 16 * SIZE(BO) FMADD f0, f16, f24, f0 FMADD f1, f16, f25, f1 FMADD f2, f17, f24, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f24, f4 FMADD f5, f18, f25, f5 FMADD f6, f19, f24, f6 FMADD f7, f19, f25, f7 LFD f16, 10 * SIZE(AO1) LFD f17, 11 * SIZE(AO1) LFD f18, 10 * SIZE(AO2) LFD f19, 11 * SIZE(AO2) FMADD f8, f20, f24, f8 FMADD f9, f20, f25, f9 FMADD f10, f21, f24, f10 FMADD f11, f21, f25, f11 FMADD f12, f22, f24, f12 FMADD f13, f22, f25, f13 FMADD f14, f23, f24, f14 FMADD f15, f23, f25, f15 LFD f20, 10 * SIZE(AO3) LFD f21, 11 * SIZE(AO3) LFD f22, 10 * SIZE(AO4) LFD f23, 11 * SIZE(AO4) FMADD f0, f16, f26, f0 FMADD f1, f16, f27, f1 FMADD f2, f17, f26, f2 FMADD f3, f17, f27, f3 FMADD f4, f18, f26, f4 FMADD f5, f18, f27, f5 FMADD f6, f19, f26, f6 FMADD f7, f19, f27, f7 LFD f16, 12 * SIZE(AO1) LFD f17, 13 * SIZE(AO1) LFD f18, 12 * SIZE(AO2) LFD f19, 13 * SIZE(AO2) FMADD f8, f20, f26, f8 FMADD f9, f20, f27, f9 FMADD f10, f21, f26, f10 FMADD f11, f21, f27, f11 FMADD f12, f22, f26, f12 FMADD f13, f22, f27, f13 FMADD f14, f23, f26, f14 FMADD f15, f23, f27, f15 LFD f20, 12 * SIZE(AO3) LFD f21, 13 * SIZE(AO3) LFD f22, 12 * SIZE(AO4) LFD f23, 13 * SIZE(AO4) FMADD f0, f16, f28, f0 FMADD f1, f16, f29, f1 FMADD f2, f17, f28, f2 FMADD f3, f17, f29, f3 FMADD f4, f18, f28, f4 FMADD f5, f18, f29, f5 FMADD f6, f19, f28, f6 FMADD f7, f19, f29, f7 LFD f16, 14 * SIZE(AO1) LFD f17, 15 * SIZE(AO1) LFD f18, 14 * SIZE(AO2) LFD f19, 15 * SIZE(AO2) FMADD f8, f20, f28, f8 FMADD f9, f20, f29, f9 FMADD f10, f21, f28, f10 FMADD f11, f21, f29, f11 FMADD f12, f22, f28, f12 FMADD f13, f22, f29, f13 FMADD f14, f23, f28, f14 FMADD f15, f23, f29, f15 LFD f20, 14 * SIZE(AO3) LFD f21, 15 * SIZE(AO3) LFD f22, 14 * SIZE(AO4) LFD f23, 15 * SIZE(AO4) addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi AO3, AO3, 16 * SIZE addi AO4, AO4, 16 * SIZE FMADD f0, f16, f30, f0 FMADD f1, f16, f31, f1 FMADD f2, f17, f30, f2 FMADD f3, f17, f31, f3 FMADD f4, f18, f30, f4 FMADD f5, f18, f31, f5 FMADD f6, f19, f30, f6 FMADD f7, f19, f31, f7 FMADD f8, f20, f30, f8 FMADD f9, f20, f31, f9 FMADD f10, f21, f30, f10 FMADD f11, f21, f31, f11 FMADD f12, f22, f30, f12 FMADD f13, f22, f31, f13 FMADD f14, f23, f30, f14 FMADD f15, f23, f31, f15 .align 4 LL(MainN3): andi. 
r0, MIN_N, 7 mtspr CTR, r0 ble LL(MainFinish) .align 4 LFD f16, 0 * SIZE(AO1) LFD f17, 1 * SIZE(AO1) LFD f18, 0 * SIZE(AO2) LFD f19, 1 * SIZE(AO2) LFD f20, 0 * SIZE(AO3) LFD f21, 1 * SIZE(AO3) LFD f22, 0 * SIZE(AO4) LFD f23, 1 * SIZE(AO4) LFD f24, 1 * SIZE(BO) LFDU f25, 2 * SIZE(BO) addi AO1, AO1, 2 * SIZE addi AO2, AO2, 2 * SIZE addi AO3, AO3, 2 * SIZE addi AO4, AO4, 2 * SIZE bdz LL(MainN3KernelSkip) .align 4 LL(MainN3Kernel): FMADD f0, f16, f24, f0 FMADD f1, f16, f25, f1 FMADD f2, f17, f24, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f24, f4 FMADD f5, f18, f25, f5 FMADD f6, f19, f24, f6 FMADD f7, f19, f25, f7 LFD f16, 0 * SIZE(AO1) LFD f17, 1 * SIZE(AO1) LFD f18, 0 * SIZE(AO2) LFD f19, 1 * SIZE(AO2) FMADD f8, f20, f24, f8 FMADD f9, f20, f25, f9 FMADD f10, f21, f24, f10 FMADD f11, f21, f25, f11 FMADD f12, f22, f24, f12 FMADD f13, f22, f25, f13 FMADD f14, f23, f24, f14 FMADD f15, f23, f25, f15 LFD f20, 0 * SIZE(AO3) LFD f21, 1 * SIZE(AO3) LFD f22, 0 * SIZE(AO4) LFD f23, 1 * SIZE(AO4) LFD f24, 1 * SIZE(BO) LFDU f25, 2 * SIZE(BO) addi AO1, AO1, 2 * SIZE addi AO2, AO2, 2 * SIZE addi AO3, AO3, 2 * SIZE addi AO4, AO4, 2 * SIZE bdnz LL(MainN3Kernel) .align 4 LL(MainN3KernelSkip): FMADD f0, f16, f24, f0 FMADD f1, f16, f25, f1 FMADD f2, f17, f24, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f24, f4 FMADD f5, f18, f25, f5 FMADD f6, f19, f24, f6 FMADD f7, f19, f25, f7 FMADD f8, f20, f24, f8 FMADD f9, f20, f25, f9 FMADD f10, f21, f24, f10 FMADD f11, f21, f25, f11 FMADD f12, f22, f24, f12 FMADD f13, f22, f25, f13 FMADD f14, f23, f24, f14 FMADD f15, f23, f25, f15 .align 4 LL(MainFinish): lfd f30, ALPHA_R lfd f31, ALPHA_I #ifndef XCONJ #ifndef CONJ FSUB f0, f0, f3 FADD f1, f1, f2 FSUB f4, f4, f7 FADD f5, f5, f6 FSUB f8, f8, f11 FADD f9, f9, f10 FSUB f12, f12, f15 FADD f13, f13, f14 #else FADD f0, f0, f3 FSUB f1, f1, f2 FADD f4, f4, f7 FSUB f5, f5, f6 FADD f8, f8, f11 FSUB f9, f9, f10 FADD f12, f12, f15 FSUB f13, f13, f14 #endif #else #ifndef CONJ FADD f0, f0, f3 FSUB f1, f2, f1 FADD f4, f4, f7 FSUB f5, f6, f5 FADD f8, f8, f11 FSUB f9, f10, f9 FADD f12, f12, f15 FSUB f13, f14, f13 #else FSUB f0, f0, f3 FADD f1, f1, f2 FSUB f4, f4, f7 FADD f5, f5, f6 FSUB f8, f8, f11 FADD f9, f9, f10 FSUB f12, f12, f15 FADD f13, f13, f14 #endif #endif mr BO, CO cmpwi cr0, INCY, 2 * SIZE bne LL(FinishN1) LFD f16, 0 * SIZE(CO) LFD f17, 1 * SIZE(CO) LFD f18, 2 * SIZE(CO) LFD f19, 3 * SIZE(CO) LFD f20, 4 * SIZE(CO) LFD f21, 5 * SIZE(CO) LFD f22, 6 * SIZE(CO) LFD f23, 7 * SIZE(CO) FMADD f16, f30, f0, f16 FMADDR f17, f30, f1, f17 FMADD f18, f30, f4, f18 FMADDR f19, f30, f5, f19 FMADD f20, f30, f8, f20 FMADDR f21, f30, f9, f21 FMADD f22, f30, f12, f22 FMADDR f23, f30, f13, f23 FMSUBR f16, f31, f1, f16 FMADD f17, f31, f0, f17 FMSUBR f18, f31, f5, f18 FMADD f19, f31, f4, f19 FMSUBR f20, f31, f9, f20 FMADD f21, f31, f8, f21 FMSUBR f22, f31, f13, f22 FMADD f23, f31, f12, f23 STFD f16, 0 * SIZE(CO) STFD f17, 1 * SIZE(CO) STFD f18, 2 * SIZE(CO) STFD f19, 3 * SIZE(CO) STFD f20, 4 * SIZE(CO) STFD f21, 5 * SIZE(CO) STFD f22, 6 * SIZE(CO) STFD f23, 7 * SIZE(CO) addi CO, CO, 8 * SIZE addi J, J, -1 cmpwi cr0, J, 0 bgt LL(MainHead) b LL(Remain) .align 4 LL(FinishN1): LFD f16, 0 * SIZE(CO) LFD f17, 1 * SIZE(CO) add CO, CO, INCY LFD f18, 0 * SIZE(CO) LFD f19, 1 * SIZE(CO) add CO, CO, INCY LFD f20, 0 * SIZE(CO) LFD f21, 1 * SIZE(CO) add CO, CO, INCY LFD f22, 0 * SIZE(CO) LFD f23, 1 * SIZE(CO) add CO, CO, INCY FMADD f16, f30, f0, f16 FMADDR f17, f30, f1, f17 FMADD f18, f30, f4, f18 FMADDR f19, f30, f5, f19 FMADD f20, f30, f8, f20 FMADDR f21, f30, f9, 
f21 FMADD f22, f30, f12, f22 FMADDR f23, f30, f13, f23 FMSUBR f16, f31, f1, f16 FMADD f17, f31, f0, f17 FMSUBR f18, f31, f5, f18 FMADD f19, f31, f4, f19 FMSUBR f20, f31, f9, f20 FMADD f21, f31, f8, f21 FMSUBR f22, f31, f13, f22 FMADD f23, f31, f12, f23 STFD f16, 0 * SIZE(BO) STFD f17, 1 * SIZE(BO) add BO, BO, INCY STFD f18, 0 * SIZE(BO) STFD f19, 1 * SIZE(BO) add BO, BO, INCY STFD f20, 0 * SIZE(BO) STFD f21, 1 * SIZE(BO) add BO, BO, INCY STFD f22, 0 * SIZE(BO) STFD f23, 1 * SIZE(BO) addi J, J, -1 cmpwi cr0, J, 0 bgt LL(MainHead) .align 4 LL(Remain): andi. J, N, 3 ble LL(ISEnd) .align 4 LL(RemainHead): mr AO1, A add A, A, LDA mr BO, XP lfd f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. r0 , MIN_N, 3 mtspr CTR, r0 ble LL(RemainN3) LFD f16, 0 * SIZE(AO1) LFD f17, 1 * SIZE(AO1) LFD f18, 2 * SIZE(AO1) LFD f19, 3 * SIZE(AO1) LFD f20, 4 * SIZE(AO1) LFD f21, 5 * SIZE(AO1) LFD f22, 6 * SIZE(AO1) LFD f23, 7 * SIZE(AO1) LFD f24, 1 * SIZE(BO) LFD f25, 2 * SIZE(BO) LFD f26, 3 * SIZE(BO) LFD f27, 4 * SIZE(BO) LFD f28, 5 * SIZE(BO) LFD f29, 6 * SIZE(BO) LFD f30, 7 * SIZE(BO) LFD f31, 8 * SIZE(BO) bdz LL(RemainKernelSkip) .align 4 LL(RemainKernel): FMADD f0, f16, f24, f0 FMADD f1, f16, f25, f1 FMADD f2, f17, f24, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f26, f4 FMADD f5, f18, f27, f5 FMADD f6, f19, f26, f6 FMADD f7, f19, f27, f7 LFD f16, 8 * SIZE(AO1) LFD f17, 9 * SIZE(AO1) LFD f18, 10 * SIZE(AO1) LFD f19, 11 * SIZE(AO1) LFD f24, 9 * SIZE(BO) LFD f25, 10 * SIZE(BO) LFD f26, 11 * SIZE(BO) LFD f27, 12 * SIZE(BO) FMADD f8, f20, f28, f8 FMADD f9, f20, f29, f9 FMADD f10, f21, f28, f10 FMADD f11, f21, f29, f11 FMADD f12, f22, f30, f12 FMADD f13, f22, f31, f13 FMADD f14, f23, f30, f14 FMADD f15, f23, f31, f15 LFD f20, 12 * SIZE(AO1) LFD f21, 13 * SIZE(AO1) LFD f22, 14 * SIZE(AO1) LFD f23, 15 * SIZE(AO1) LFD f28, 13 * SIZE(BO) LFD f29, 14 * SIZE(BO) LFD f30, 15 * SIZE(BO) LFD f31, 16 * SIZE(BO) FMADD f0, f16, f24, f0 FMADD f1, f16, f25, f1 FMADD f2, f17, f24, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f26, f4 FMADD f5, f18, f27, f5 FMADD f6, f19, f26, f6 FMADD f7, f19, f27, f7 LFD f16, 16 * SIZE(AO1) LFD f17, 17 * SIZE(AO1) LFD f18, 18 * SIZE(AO1) LFD f19, 19 * SIZE(AO1) LFD f24, 17 * SIZE(BO) LFD f25, 18 * SIZE(BO) LFD f26, 19 * SIZE(BO) LFD f27, 20 * SIZE(BO) FMADD f8, f20, f28, f8 FMADD f9, f20, f29, f9 FMADD f10, f21, f28, f10 FMADD f11, f21, f29, f11 FMADD f12, f22, f30, f12 FMADD f13, f22, f31, f13 FMADD f14, f23, f30, f14 FMADD f15, f23, f31, f15 LFD f20, 20 * SIZE(AO1) LFD f21, 21 * SIZE(AO1) LFD f22, 22 * SIZE(AO1) LFD f23, 23 * SIZE(AO1) LFD f28, 21 * SIZE(BO) LFD f29, 22 * SIZE(BO) LFD f30, 23 * SIZE(BO) LFD f31, 24 * SIZE(BO) addi AO1, AO1, 16 * SIZE addi BO, BO, 16 * SIZE DCBT(AO1, PREA) bdnz LL(RemainKernel) .align 4 LL(RemainKernelSkip): FMADD f0, f16, f24, f0 FMADD f1, f16, f25, f1 FMADD f2, f17, f24, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f26, f4 FMADD f5, f18, f27, f5 FMADD f6, f19, f26, f6 FMADD f7, f19, f27, f7 LFD f16, 8 * SIZE(AO1) LFD f17, 9 * SIZE(AO1) LFD f18, 10 * SIZE(AO1) LFD f19, 11 * SIZE(AO1) LFD f24, 9 * SIZE(BO) LFD f25, 10 * SIZE(BO) LFD f26, 11 * SIZE(BO) LFD f27, 12 * SIZE(BO) FMADD f8, f20, f28, f8 FMADD f9, f20, f29, f9 FMADD f10, f21, f28, f10 FMADD f11, f21, f29, f11 FMADD f12, f22, f30, f12 FMADD f13, f22, f31, f13 FMADD f14, f23, f30, f14 FMADD f15, f23, f31, f15 LFD f20, 12 * SIZE(AO1) LFD f21, 13 * SIZE(AO1) LFD f22, 14 * SIZE(AO1) 
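/* Editor's note (descriptive comment, inferred from the surrounding code;    */
/* not part of the original source): this is the tail of the single-column    */
/* remainder path (LL(RemainHead) onward), which handles the N mod 4          */
/* leftover columns one at a time. Each loop pass consumes eight complex      */
/* elements of the column, spread over the accumulator groups f0-f3, f4-f7,   */
/* f8-f11 and f12-f15; LL(RemainFinish) reduces them to a single complex dot  */
/* product, applies alpha and updates y.                                      */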
LFD f23, 15 * SIZE(AO1) LFD f28, 13 * SIZE(BO) LFD f29, 14 * SIZE(BO) LFD f30, 15 * SIZE(BO) LFDU f31, 16 * SIZE(BO) FMADD f0, f16, f24, f0 FMADD f1, f16, f25, f1 FMADD f2, f17, f24, f2 FMADD f3, f17, f25, f3 FMADD f4, f18, f26, f4 FMADD f5, f18, f27, f5 FMADD f6, f19, f26, f6 FMADD f7, f19, f27, f7 FMADD f8, f20, f28, f8 FMADD f9, f20, f29, f9 FMADD f10, f21, f28, f10 FMADD f11, f21, f29, f11 FMADD f12, f22, f30, f12 FMADD f13, f22, f31, f13 FMADD f14, f23, f30, f14 FMADD f15, f23, f31, f15 addi AO1, AO1, 16 * SIZE .align 4 LL(RemainN3): andi. r0, MIN_N, 7 mtspr CTR, r0 ble LL(RemainFinish) .align 4 LFD f16, 0 * SIZE(AO1) LFD f17, 1 * SIZE(AO1) LFD f24, 1 * SIZE(BO) LFDU f25, 2 * SIZE(BO) addi AO1, AO1, 2 * SIZE bdz LL(RemainN3KernelSkip) .align 4 LL(RemainN3Kernel): FMADD f0, f16, f24, f0 FMADD f1, f16, f25, f1 FMADD f2, f17, f24, f2 FMADD f3, f17, f25, f3 LFD f16, 0 * SIZE(AO1) LFD f17, 1 * SIZE(AO1) LFD f24, 1 * SIZE(BO) LFDU f25, 2 * SIZE(BO) addi AO1, AO1, 2 * SIZE bdnz LL(RemainN3Kernel) .align 4 LL(RemainN3KernelSkip): FMADD f0, f16, f24, f0 FMADD f1, f16, f25, f1 FMADD f2, f17, f24, f2 FMADD f3, f17, f25, f3 .align 4 LL(RemainFinish): lfd f30, ALPHA_R lfd f31, ALPHA_I LFD f16, 0 * SIZE(CO) LFD f17, 1 * SIZE(CO) FADD f0, f0, f4 FADD f1, f1, f5 FADD f2, f2, f6 FADD f3, f3, f7 FADD f8, f8, f12 FADD f9, f9, f13 FADD f10, f10, f14 FADD f11, f11, f15 FADD f0, f0, f8 FADD f1, f1, f9 FADD f2, f2, f10 FADD f3, f3, f11 #ifndef XCONJ #ifndef CONJ FSUB f0, f0, f3 FADD f1, f1, f2 #else FADD f0, f0, f3 FSUB f1, f1, f2 #endif #else #ifndef CONJ FADD f0, f0, f3 FSUB f1, f2, f1 #else FSUB f0, f0, f3 FADD f1, f1, f2 #endif #endif FMADD f16, f30, f0, f16 FMADDR f17, f30, f1, f17 FMSUBR f16, f31, f1, f16 FMADD f17, f31, f0, f17 STFD f16, 0 * SIZE(CO) STFD f17, 1 * SIZE(CO) add CO, CO, INCY addi J, J, -1 cmpi cr0, 0, J, 0 bgt LL(RemainHead) .align 4 LL(ISEnd): subf A, PLDA_M, A addi IS, IS, P cmp cr0, 0, IS, M blt LL(ISLoop) .align 4 LL(End): li r3, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r14, 144(SP) ld r15, 152(SP) ld r16, 160(SP) ld r17, 168(SP) ld r18, 176(SP) ld r19, 184(SP) ld r20, 192(SP) ld r21, 200(SP) ld r22, 208(SP) ld r23, 216(SP) ld r24, 224(SP) ld r25, 232(SP) #else lwz r14, 144(SP) lwz r15, 148(SP) lwz r16, 152(SP) lwz r17, 156(SP) lwz r18, 160(SP) lwz r19, 164(SP) lwz r20, 168(SP) lwz r21, 172(SP) lwz r22, 176(SP) lwz r23, 180(SP) lwz r24, 184(SP) lwz r25, 188(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/zgemv_t_ppc440.S000066400000000000000000000615021313527062700204550ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define P 1024 #ifndef __64BIT__ #define STACKSIZE 224 #else #define STACKSIZE 304 #endif #ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 #define A r6 #define LDA r7 #define X r8 #define INCX r9 #define Y r10 #define INCY r5 #else #define M r3 #define N r4 #define A r8 #define LDA r9 #define X r10 #define INCX r5 #define Y r6 #define INCY r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define M r3 #define N r4 #define A r10 #define LDA r5 #define X r6 #define INCX r7 #define Y r8 #define INCY r9 #else #define M r3 #define N r4 #define A r8 #define LDA r9 #define X r10 #define INCX r5 #define Y r6 #define INCY r7 #endif #endif #define BUFFER r11 #define XP r12 #define X1 r14 #define J r15 #define AO1 r16 #define AO2 r17 #define AO3 r18 #define AO4 r19 #define PREA r20 #define PREC r21 #define YY r22 #if defined(PPCG4) #define PREFETCHSIZE_A (3 * 8) #define PREFETCHSIZE_C 7 #endif #if defined(POWER6) #define PREFETCHSIZE_A (3 * 8) #define PREFETCHSIZE_C 7 #endif #if !(defined(CONJ) && defined(XCONJ)) #define FMADDR FMADD #define FMSUBR FNMSUB #else #define FMADDR FNMSUB #define FMSUBR FMADD #endif #ifndef NEEDPARAM #ifndef __64BIT__ #define FZERO 200(SP) #else #define FZERO 256(SP) #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r14, 144(SP) std r15, 152(SP) std r16, 160(SP) std r17, 168(SP) std r18, 176(SP) std r19, 184(SP) std r20, 192(SP) std r21, 200(SP) std r22, 208(SP) std r0, FZERO #else stw r14, 144(SP) stw r15, 148(SP) stw r16, 152(SP) stw r17, 156(SP) stw r18, 160(SP) stw r19, 164(SP) stw r20, 168(SP) stw r21, 172(SP) stw r22, 176(SP) stw r0, FZERO stw r0, 4 + FZERO #endif #ifdef linux #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + 
STACKSIZE(SP) #else ld INCX, FRAMESLOT(0) + STACKSIZE(SP) ld Y, FRAMESLOT(1) + STACKSIZE(SP) ld INCY, FRAMESLOT(2) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) lwz X, FRAMESLOT(1) + STACKSIZE(SP) lwz INCX, FRAMESLOT(2) + STACKSIZE(SP) lwz Y, FRAMESLOT(3) + STACKSIZE(SP) lwz INCY, FRAMESLOT(4) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(5) + STACKSIZE(SP) #else lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) lwz Y, FRAMESLOT(1) + STACKSIZE(SP) lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #else ld INCX, FRAMESLOT(0) + STACKSIZE(SP) ld Y, FRAMESLOT(1) + STACKSIZE(SP) ld INCY, FRAMESLOT(2) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #endif #ifndef XCONJ #ifndef CONJ #define FMADD1 FMADD #define FMADD2 FMADD #define FMADD3 FNMSUB #define FMADD4 FMADD #else #define FMADD1 FMADD #define FMADD2 FMADD #define FMADD3 FMADD #define FMADD4 FNMSUB #endif #else #ifndef CONJ #define FMADD1 FMADD #define FMADD2 FNMSUB #define FMADD3 FMADD #define FMADD4 FMADD #else #define FMADD1 FMADD #define FMADD2 FMADD #define FMADD3 FNMSUB #define FMADD4 FMADD #endif #endif #define y1 f0 #define y2 f1 #define y3 f2 #define y4 f3 #define y5 f4 #define y6 f5 #define y7 f6 #define y8 f7 #define a1 f8 #define a2 f9 #define a3 f10 #define a4 f11 #define a5 f12 #define a6 f13 #define a7 f14 #define a8 f15 #define b1 f16 #define b2 f17 #define b3 f18 #define b4 f19 #define b5 f20 #define b6 f21 #define b7 f22 #define b8 f23 #define alpha_r f24 #define alpha_i f25 fmr alpha_r, f1 fmr alpha_i, f2 slwi LDA, LDA, ZBASE_SHIFT slwi INCX, INCX, ZBASE_SHIFT slwi INCY, INCY, ZBASE_SHIFT li PREA, PREFETCHSIZE_A * SIZE li PREC, PREFETCHSIZE_C * SIZE addi A, A, -SIZE addi INCX, INCX, -SIZE addi INCY, INCY, -SIZE sub X, X, INCX sub Y, Y, INCY mr YY, Y cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) mr XP, X cmpwi cr0, INCX, SIZE beq LL(10) addi XP, BUFFER, -SIZE addi X1, BUFFER, -SIZE srawi. r0, M, 2 mtspr CTR, r0 ble LL(05) .align 4 LL(02): LFDUX f0, X, INCX LFDU f1, 1 * SIZE(X) LFDUX f2, X, INCX LFDU f3, 1 * SIZE(X) LFDUX f4, X, INCX LFDU f5, 1 * SIZE(X) LFDUX f6, X, INCX LFDU f7, 1 * SIZE(X) STFDU f0, 1 * SIZE(X1) STFDU f1, 1 * SIZE(X1) STFDU f2, 1 * SIZE(X1) STFDU f3, 1 * SIZE(X1) STFDU f4, 1 * SIZE(X1) STFDU f5, 1 * SIZE(X1) STFDU f6, 1 * SIZE(X1) STFDU f7, 1 * SIZE(X1) bdnz LL(02) .align 4 LL(05): andi. r0, M, 3 mtspr CTR, r0 ble LL(10) .align 4 LL(06): LFDUX f0, X, INCX LFDU f1, 1 * SIZE(X) STFDU f0, 1 * SIZE(X1) STFDU f1, 1 * SIZE(X1) bdnz LL(06) .align 4 LL(10): srawi. J, N, 2 ble LL(20) .align 4 LL(11): lfd y1, FZERO mr AO1, A fmr y2, y1 mr X1, XP fmr y3, y1 add AO2, A, LDA fmr y4, y1 add AO3, AO2, LDA fmr y5, y1 add AO4, AO3, LDA fmr y6, y1 add A, AO4, LDA fmr y7, y1 dcbtst PREC, Y fmr y8, y1 srawi. 
r0, M, 2 mtspr CTR, r0 ble LL(15) LFDU a1, 1 * SIZE(AO1) LFDU b1, 1 * SIZE(X1) LFDU a2, 1 * SIZE(AO1) LFDU b2, 1 * SIZE(X1) LFDU a3, 1 * SIZE(AO2) LFDU a4, 1 * SIZE(AO2) LFDU a5, 1 * SIZE(AO3) LFDU a6, 1 * SIZE(AO3) LFDU a7, 1 * SIZE(AO4) bdz LL(13) .align 5 LL(12): FMADD1 y1, a1, b1, y1 LFDU a8, 1 * SIZE(AO4) FMADD2 y2, a1, b2, y2 LFDU b3, 1 * SIZE(X1) FMADD1 y3, a3, b1, y3 LFDU b4, 1 * SIZE(X1) FMADD2 y4, a3, b2, y4 #ifdef PPCG4 dcbt AO1, PREA #endif FMADD3 y1, a2, b2, y1 LFDU a1, 1 * SIZE(AO1) FMADD4 y2, a2, b1, y2 LFDU a2, 1 * SIZE(AO1) FMADD3 y3, a4, b2, y3 LFDU a3, 1 * SIZE(AO2) FMADD4 y4, a4, b1, y4 LFDU a4, 1 * SIZE(AO2) #ifdef PPCG4 dcbt X1, PREA #endif FMADD1 y5, a5, b1, y5 FMADD2 y6, a5, b2, y6 FMADD1 y7, a7, b1, y7 FMADD2 y8, a7, b2, y8 #ifdef PPCG4 dcbt AO2, PREA #endif FMADD3 y5, a6, b2, y5 LFDU a5, 1 * SIZE(AO3) FMADD4 y6, a6, b1, y6 LFDU a6, 1 * SIZE(AO3) FMADD3 y7, a8, b2, y7 LFDU a7, 1 * SIZE(AO4) FMADD4 y8, a8, b1, y8 LFDU a8, 1 * SIZE(AO4) FMADD1 y1, a1, b3, y1 LFDU b1, 1 * SIZE(X1) FMADD2 y2, a1, b4, y2 LFDU b2, 1 * SIZE(X1) FMADD1 y3, a3, b3, y3 FMADD2 y4, a3, b4, y4 #ifdef PPCG4 dcbt AO3, PREA #endif FMADD3 y1, a2, b4, y1 LFDU a1, 1 * SIZE(AO1) FMADD4 y2, a2, b3, y2 LFDU a2, 1 * SIZE(AO1) FMADD3 y3, a4, b4, y3 LFDU a3, 1 * SIZE(AO2) FMADD4 y4, a4, b3, y4 LFDU a4, 1 * SIZE(AO2) FMADD1 y5, a5, b3, y5 FMADD2 y6, a5, b4, y6 FMADD1 y7, a7, b3, y7 FMADD2 y8, a7, b4, y8 #ifdef PPCG4 dcbt AO4, PREA #endif FMADD3 y5, a6, b4, y5 LFDU a5, 1 * SIZE(AO3) FMADD4 y6, a6, b3, y6 LFDU a6, 1 * SIZE(AO3) FMADD3 y7, a8, b4, y7 LFDU a7, 1 * SIZE(AO4) FMADD4 y8, a8, b3, y8 LFDU a8, 1 * SIZE(AO4) FMADD1 y1, a1, b1, y1 LFDU b3, 1 * SIZE(X1) FMADD2 y2, a1, b2, y2 LFDU b4, 1 * SIZE(X1) FMADD1 y3, a3, b1, y3 FMADD2 y4, a3, b2, y4 #if defined(PPCG4) && defined(DOUBLE) dcbt AO1, PREA #endif FMADD3 y1, a2, b2, y1 LFDU a1, 1 * SIZE(AO1) FMADD4 y2, a2, b1, y2 LFDU a2, 1 * SIZE(AO1) FMADD3 y3, a4, b2, y3 LFDU a3, 1 * SIZE(AO2) FMADD4 y4, a4, b1, y4 LFDU a4, 1 * SIZE(AO2) #if defined(PPCG4) && defined(DOUBLE) dcbt X1, PREA #endif FMADD1 y5, a5, b1, y5 FMADD2 y6, a5, b2, y6 FMADD1 y7, a7, b1, y7 FMADD2 y8, a7, b2, y8 #if defined(PPCG4) && defined(DOUBLE) dcbt AO2, PREA #endif FMADD3 y5, a6, b2, y5 LFDU a5, 1 * SIZE(AO3) FMADD4 y6, a6, b1, y6 LFDU a6, 1 * SIZE(AO3) FMADD3 y7, a8, b2, y7 LFDU a7, 1 * SIZE(AO4) FMADD4 y8, a8, b1, y8 LFDU a8, 1 * SIZE(AO4) FMADD1 y1, a1, b3, y1 FMADD2 y2, a1, b4, y2 FMADD1 y3, a3, b3, y3 FMADD2 y4, a3, b4, y4 #if defined(PPCG4) && defined(DOUBLE) dcbt AO3, PREA #endif FMADD3 y1, a2, b4, y1 LFDU a1, 1 * SIZE(AO1) FMADD4 y2, a2, b3, y2 LFDU a2, 1 * SIZE(AO1) FMADD3 y3, a4, b4, y3 LFDU a3, 1 * SIZE(AO2) FMADD4 y4, a4, b3, y4 LFDU a4, 1 * SIZE(AO2) FMADD1 y5, a5, b3, y5 LFDU b1, 1 * SIZE(X1) FMADD2 y6, a5, b4, y6 LFDU b2, 1 * SIZE(X1) FMADD1 y7, a7, b3, y7 FMADD2 y8, a7, b4, y8 #if defined(PPCG4) && defined(DOUBLE) dcbt AO4, PREA #endif FMADD3 y5, a6, b4, y5 LFDU a5, 1 * SIZE(AO3) FMADD4 y6, a6, b3, y6 LFDU a6, 1 * SIZE(AO3) FMADD3 y7, a8, b4, y7 LFDU a7, 1 * SIZE(AO4) FMADD4 y8, a8, b3, y8 bdnz LL(12) .align 4 LL(13): FMADD1 y1, a1, b1, y1 LFDU a8, 1 * SIZE(AO4) FMADD2 y2, a1, b2, y2 LFDU b3, 1 * SIZE(X1) FMADD1 y3, a3, b1, y3 LFDU b4, 1 * SIZE(X1) FMADD2 y4, a3, b2, y4 FMADD3 y1, a2, b2, y1 LFDU a1, 1 * SIZE(AO1) FMADD4 y2, a2, b1, y2 LFDU a2, 1 * SIZE(AO1) FMADD3 y3, a4, b2, y3 LFDU a3, 1 * SIZE(AO2) FMADD4 y4, a4, b1, y4 LFDU a4, 1 * SIZE(AO2) FMADD1 y5, a5, b1, y5 FMADD2 y6, a5, b2, y6 FMADD1 y7, a7, b1, y7 FMADD2 y8, a7, b2, y8 FMADD3 y5, a6, b2, y5 LFDU a5, 1 * 
SIZE(AO3) FMADD4 y6, a6, b1, y6 LFDU a6, 1 * SIZE(AO3) FMADD3 y7, a8, b2, y7 LFDU a7, 1 * SIZE(AO4) FMADD4 y8, a8, b1, y8 LFDU a8, 1 * SIZE(AO4) FMADD1 y1, a1, b3, y1 LFDU b1, 1 * SIZE(X1) FMADD2 y2, a1, b4, y2 LFDU b2, 1 * SIZE(X1) FMADD1 y3, a3, b3, y3 FMADD2 y4, a3, b4, y4 FMADD3 y1, a2, b4, y1 LFDU a1, 1 * SIZE(AO1) FMADD4 y2, a2, b3, y2 LFDU a2, 1 * SIZE(AO1) FMADD3 y3, a4, b4, y3 LFDU a3, 1 * SIZE(AO2) FMADD4 y4, a4, b3, y4 LFDU a4, 1 * SIZE(AO2) FMADD1 y5, a5, b3, y5 FMADD2 y6, a5, b4, y6 FMADD1 y7, a7, b3, y7 FMADD2 y8, a7, b4, y8 FMADD3 y5, a6, b4, y5 LFDU a5, 1 * SIZE(AO3) FMADD4 y6, a6, b3, y6 LFDU a6, 1 * SIZE(AO3) FMADD3 y7, a8, b4, y7 LFDU a7, 1 * SIZE(AO4) FMADD4 y8, a8, b3, y8 LFDU a8, 1 * SIZE(AO4) FMADD1 y1, a1, b1, y1 LFDU b3, 1 * SIZE(X1) FMADD2 y2, a1, b2, y2 LFDU b4, 1 * SIZE(X1) FMADD1 y3, a3, b1, y3 FMADD2 y4, a3, b2, y4 FMADD3 y1, a2, b2, y1 LFDU a1, 1 * SIZE(AO1) FMADD4 y2, a2, b1, y2 LFDU a2, 1 * SIZE(AO1) FMADD3 y3, a4, b2, y3 LFDU a3, 1 * SIZE(AO2) FMADD4 y4, a4, b1, y4 LFDU a4, 1 * SIZE(AO2) FMADD1 y5, a5, b1, y5 FMADD2 y6, a5, b2, y6 FMADD1 y7, a7, b1, y7 FMADD2 y8, a7, b2, y8 FMADD3 y5, a6, b2, y5 LFDU a5, 1 * SIZE(AO3) FMADD4 y6, a6, b1, y6 LFDU a6, 1 * SIZE(AO3) FMADD3 y7, a8, b2, y7 LFDU a7, 1 * SIZE(AO4) FMADD4 y8, a8, b1, y8 LFDU a8, 1 * SIZE(AO4) FMADD1 y1, a1, b3, y1 FMADD2 y2, a1, b4, y2 FMADD1 y3, a3, b3, y3 FMADD2 y4, a3, b4, y4 FMADD3 y1, a2, b4, y1 FMADD4 y2, a2, b3, y2 FMADD3 y3, a4, b4, y3 FMADD4 y4, a4, b3, y4 FMADD1 y5, a5, b3, y5 FMADD2 y6, a5, b4, y6 FMADD1 y7, a7, b3, y7 FMADD2 y8, a7, b4, y8 FMADD3 y5, a6, b4, y5 FMADD4 y6, a6, b3, y6 FMADD3 y7, a8, b4, y7 FMADD4 y8, a8, b3, y8 .align 4 LL(15): andi. r0, M, 2 ble LL(17) LFDU a1, 1 * SIZE(AO1) LFDU b1, 1 * SIZE(X1) LFDU a2, 1 * SIZE(AO1) LFDU b2, 1 * SIZE(X1) LFDU a3, 1 * SIZE(AO2) LFDU b3, 1 * SIZE(X1) LFDU a4, 1 * SIZE(AO2) LFDU b4, 1 * SIZE(X1) FMADD1 y1, a1, b1, y1 LFDU a5, 1 * SIZE(AO3) FMADD2 y2, a1, b2, y2 LFDU a6, 1 * SIZE(AO3) FMADD1 y3, a3, b1, y3 LFDU a7, 1 * SIZE(AO4) FMADD2 y4, a3, b2, y4 LFDU a8, 1 * SIZE(AO4) FMADD3 y1, a2, b2, y1 LFDU a1, 1 * SIZE(AO1) FMADD4 y2, a2, b1, y2 LFDU a2, 1 * SIZE(AO1) FMADD3 y3, a4, b2, y3 LFDU a3, 1 * SIZE(AO2) FMADD4 y4, a4, b1, y4 LFDU a4, 1 * SIZE(AO2) FMADD1 y5, a5, b1, y5 FMADD2 y6, a5, b2, y6 FMADD1 y7, a7, b1, y7 FMADD2 y8, a7, b2, y8 FMADD3 y5, a6, b2, y5 LFDU a5, 1 * SIZE(AO3) FMADD4 y6, a6, b1, y6 LFDU a6, 1 * SIZE(AO3) FMADD3 y7, a8, b2, y7 LFDU a7, 1 * SIZE(AO4) FMADD4 y8, a8, b1, y8 LFDU a8, 1 * SIZE(AO4) FMADD1 y1, a1, b3, y1 FMADD2 y2, a1, b4, y2 FMADD1 y3, a3, b3, y3 FMADD2 y4, a3, b4, y4 FMADD3 y1, a2, b4, y1 FMADD4 y2, a2, b3, y2 FMADD3 y3, a4, b4, y3 FMADD4 y4, a4, b3, y4 FMADD1 y5, a5, b3, y5 FMADD2 y6, a5, b4, y6 FMADD1 y7, a7, b3, y7 FMADD2 y8, a7, b4, y8 FMADD3 y5, a6, b4, y5 FMADD4 y6, a6, b3, y6 FMADD3 y7, a8, b4, y7 FMADD4 y8, a8, b3, y8 .align 4 LL(17): andi. 
r0, M, 1 ble LL(19) LFDU a1, 1 * SIZE(AO1) LFDU a2, 1 * SIZE(AO1) LFDU a3, 1 * SIZE(AO2) LFDU a4, 1 * SIZE(AO2) LFDU a5, 1 * SIZE(AO3) LFDU a6, 1 * SIZE(AO3) LFDU a7, 1 * SIZE(AO4) LFDU a8, 1 * SIZE(AO4) LFDU b1, 1 * SIZE(X1) LFDU b2, 1 * SIZE(X1) FMADD1 y1, a1, b1, y1 FMADD2 y2, a1, b2, y2 FMADD1 y3, a3, b1, y3 FMADD2 y4, a3, b2, y4 FMADD3 y1, a2, b2, y1 FMADD4 y2, a2, b1, y2 FMADD3 y3, a4, b2, y3 FMADD4 y4, a4, b1, y4 FMADD1 y5, a5, b1, y5 FMADD2 y6, a5, b2, y6 FMADD1 y7, a7, b1, y7 FMADD2 y8, a7, b2, y8 FMADD3 y5, a6, b2, y5 FMADD4 y6, a6, b1, y6 FMADD3 y7, a8, b2, y7 FMADD4 y8, a8, b1, y8 .align 4 LL(19): LFDUX b1, Y, INCY LFDU b2, 1 * SIZE(Y) LFDUX b3, Y, INCY LFDU b4, 1 * SIZE(Y) LFDUX b5, Y, INCY LFDU b6, 1 * SIZE(Y) LFDUX b7, Y, INCY LFDU b8, 1 * SIZE(Y) FMADD b1, alpha_r, y1, b1 FMADDR b2, alpha_r, y2, b2 FMADD b3, alpha_r, y3, b3 FMADDR b4, alpha_r, y4, b4 FMADD b5, alpha_r, y5, b5 FMADDR b6, alpha_r, y6, b6 FMADD b7, alpha_r, y7, b7 FMADDR b8, alpha_r, y8, b8 FMSUBR b1, alpha_i, y2, b1 FMADD b2, alpha_i, y1, b2 FMSUBR b3, alpha_i, y4, b3 FMADD b4, alpha_i, y3, b4 FMSUBR b5, alpha_i, y6, b5 FMADD b6, alpha_i, y5, b6 FMSUBR b7, alpha_i, y8, b7 FMADD b8, alpha_i, y7, b8 STFDUX b1, YY, INCY STFDU b2, 1 * SIZE(YY) STFDUX b3, YY, INCY STFDU b4, 1 * SIZE(YY) STFDUX b5, YY, INCY STFDU b6, 1 * SIZE(YY) STFDUX b7, YY, INCY STFDU b8, 1 * SIZE(YY) addi J, J, -1 cmpwi cr0, J, 0 bgt LL(11) .align 4 LL(20): andi. J, N, 2 ble LL(30) lfd y1, FZERO mr AO1, A fmr y2, y1 mr X1, XP fmr y3, y1 add AO2, A, LDA fmr y4, y1 add A, AO2, LDA srawi. r0, M, 2 mtspr CTR, r0 ble LL(25) LFDU a1, 1 * SIZE(AO1) LFDU b1, 1 * SIZE(X1) LFDU a2, 1 * SIZE(AO1) LFDU b2, 1 * SIZE(X1) LFDU a3, 1 * SIZE(AO2) bdz LL(23) .align 5 LL(22): FMADD1 y1, a1, b1, y1 LFDU a4, 1 * SIZE(AO2) FMADD2 y2, a1, b2, y2 LFDU b3, 1 * SIZE(X1) FMADD1 y3, a3, b1, y3 LFDU b4, 1 * SIZE(X1) FMADD2 y4, a3, b2, y4 #ifdef PPCG4 dcbt AO1, PREA #endif FMADD3 y1, a2, b2, y1 LFDU a1, 1 * SIZE(AO1) FMADD4 y2, a2, b1, y2 LFDU a2, 1 * SIZE(AO1) FMADD3 y3, a4, b2, y3 LFDU a3, 1 * SIZE(AO2) FMADD4 y4, a4, b1, y4 LFDU a4, 1 * SIZE(AO2) #ifdef PPCG4 dcbt AO2, PREA #endif FMADD1 y1, a1, b3, y1 LFDU b1, 1 * SIZE(X1) FMADD2 y2, a1, b4, y2 LFDU b2, 1 * SIZE(X1) FMADD1 y3, a3, b3, y3 FMADD2 y4, a3, b4, y4 #ifdef PPCG4 dcbt X1, PREA #endif FMADD3 y1, a2, b4, y1 LFDU a1, 1 * SIZE(AO1) FMADD4 y2, a2, b3, y2 LFDU a2, 1 * SIZE(AO1) FMADD3 y3, a4, b4, y3 LFDU a3, 1 * SIZE(AO2) FMADD4 y4, a4, b3, y4 LFDU a4, 1 * SIZE(AO2) FMADD1 y1, a1, b1, y1 LFDU b3, 1 * SIZE(X1) FMADD2 y2, a1, b2, y2 LFDU b4, 1 * SIZE(X1) FMADD1 y3, a3, b1, y3 FMADD2 y4, a3, b2, y4 #if defined(PPCG4) && defined(DOUBLE) dcbt AO1, PREA #endif FMADD3 y1, a2, b2, y1 LFDU a1, 1 * SIZE(AO1) FMADD4 y2, a2, b1, y2 LFDU a2, 1 * SIZE(AO1) FMADD3 y3, a4, b2, y3 LFDU a3, 1 * SIZE(AO2) FMADD4 y4, a4, b1, y4 LFDU a4, 1 * SIZE(AO2) #if defined(PPCG4) && defined(DOUBLE) dcbt AO2, PREA #endif FMADD1 y1, a1, b3, y1 LFDU b1, 1 * SIZE(X1) FMADD2 y2, a1, b4, y2 LFDU b2, 1 * SIZE(X1) FMADD1 y3, a3, b3, y3 FMADD2 y4, a3, b4, y4 #if defined(PPCG4) && defined(DOUBLE) dcbt X1, PREA #endif FMADD3 y1, a2, b4, y1 LFDU a1, 1 * SIZE(AO1) FMADD4 y2, a2, b3, y2 LFDU a2, 1 * SIZE(AO1) FMADD3 y3, a4, b4, y3 LFDU a3, 1 * SIZE(AO2) FMADD4 y4, a4, b3, y4 bdnz LL(22) .align 4 LL(23): FMADD1 y1, a1, b1, y1 LFDU a4, 1 * SIZE(AO2) FMADD2 y2, a1, b2, y2 LFDU b3, 1 * SIZE(X1) FMADD1 y3, a3, b1, y3 LFDU b4, 1 * SIZE(X1) FMADD2 y4, a3, b2, y4 FMADD3 y1, a2, b2, y1 LFDU a1, 1 * SIZE(AO1) FMADD4 y2, a2, b1, y2 LFDU a2, 1 * SIZE(AO1) FMADD3 y3, a4, 
b2, y3 LFDU a3, 1 * SIZE(AO2) FMADD4 y4, a4, b1, y4 LFDU a4, 1 * SIZE(AO2) FMADD1 y1, a1, b3, y1 LFDU b1, 1 * SIZE(X1) FMADD2 y2, a1, b4, y2 LFDU b2, 1 * SIZE(X1) FMADD1 y3, a3, b3, y3 FMADD2 y4, a3, b4, y4 FMADD3 y1, a2, b4, y1 LFDU a1, 1 * SIZE(AO1) FMADD4 y2, a2, b3, y2 LFDU a2, 1 * SIZE(AO1) FMADD3 y3, a4, b4, y3 LFDU a3, 1 * SIZE(AO2) FMADD4 y4, a4, b3, y4 LFDU a4, 1 * SIZE(AO2) FMADD1 y1, a1, b1, y1 LFDU b3, 1 * SIZE(X1) FMADD2 y2, a1, b2, y2 LFDU b4, 1 * SIZE(X1) FMADD1 y3, a3, b1, y3 FMADD2 y4, a3, b2, y4 FMADD3 y1, a2, b2, y1 LFDU a1, 1 * SIZE(AO1) FMADD4 y2, a2, b1, y2 LFDU a2, 1 * SIZE(AO1) FMADD3 y3, a4, b2, y3 LFDU a3, 1 * SIZE(AO2) FMADD4 y4, a4, b1, y4 LFDU a4, 1 * SIZE(AO2) FMADD1 y1, a1, b3, y1 FMADD2 y2, a1, b4, y2 FMADD1 y3, a3, b3, y3 FMADD2 y4, a3, b4, y4 FMADD3 y1, a2, b4, y1 FMADD4 y2, a2, b3, y2 FMADD3 y3, a4, b4, y3 FMADD4 y4, a4, b3, y4 .align 4 LL(25): andi. r0, M, 2 ble LL(27) LFDU a1, 1 * SIZE(AO1) LFDU b1, 1 * SIZE(X1) LFDU a2, 1 * SIZE(AO1) LFDU b2, 1 * SIZE(X1) LFDU a3, 1 * SIZE(AO2) LFDU b3, 1 * SIZE(X1) LFDU a4, 1 * SIZE(AO2) LFDU b4, 1 * SIZE(X1) FMADD1 y1, a1, b1, y1 FMADD2 y2, a1, b2, y2 FMADD1 y3, a3, b1, y3 FMADD2 y4, a3, b2, y4 FMADD3 y1, a2, b2, y1 LFDU a1, 1 * SIZE(AO1) FMADD4 y2, a2, b1, y2 LFDU a2, 1 * SIZE(AO1) FMADD3 y3, a4, b2, y3 LFDU a3, 1 * SIZE(AO2) FMADD4 y4, a4, b1, y4 LFDU a4, 1 * SIZE(AO2) FMADD1 y1, a1, b3, y1 FMADD2 y2, a1, b4, y2 FMADD1 y3, a3, b3, y3 FMADD2 y4, a3, b4, y4 FMADD3 y1, a2, b4, y1 FMADD4 y2, a2, b3, y2 FMADD3 y3, a4, b4, y3 FMADD4 y4, a4, b3, y4 .align 4 LL(27): andi. r0, M, 1 ble LL(29) LFDU a1, 1 * SIZE(AO1) LFDU a2, 1 * SIZE(AO1) LFDU a3, 1 * SIZE(AO2) LFDU a4, 1 * SIZE(AO2) LFDU b1, 1 * SIZE(X1) LFDU b2, 1 * SIZE(X1) FMADD1 y1, a1, b1, y1 FMADD2 y2, a1, b2, y2 FMADD1 y3, a3, b1, y3 FMADD2 y4, a3, b2, y4 FMADD3 y1, a2, b2, y1 FMADD4 y2, a2, b1, y2 FMADD3 y3, a4, b2, y3 FMADD4 y4, a4, b1, y4 .align 4 LL(29): LFDUX b1, Y, INCY LFDU b2, 1 * SIZE(Y) LFDUX b3, Y, INCY LFDU b4, 1 * SIZE(Y) FMADD b1, alpha_r, y1, b1 FMADDR b2, alpha_r, y2, b2 FMADD b3, alpha_r, y3, b3 FMADDR b4, alpha_r, y4, b4 FMSUBR b1, alpha_i, y2, b1 FMADD b2, alpha_i, y1, b2 FMSUBR b3, alpha_i, y4, b3 FMADD b4, alpha_i, y3, b4 STFDUX b1, YY, INCY STFDU b2, 1 * SIZE(YY) STFDUX b3, YY, INCY STFDU b4, 1 * SIZE(YY) .align 4 LL(30): andi. J, N, 1 ble LL(999) lfd y1, FZERO mr AO1, A fmr y2, y1 mr X1, XP fmr y3, y1 fmr y4, y1 add A, A, LDA srawi. 
r0, M, 2 mtspr CTR, r0 ble LL(35) LFDU a1, 1 * SIZE(AO1) LFDU b1, 1 * SIZE(X1) LFDU a2, 1 * SIZE(AO1) LFDU b2, 1 * SIZE(X1) bdz LL(33) .align 5 LL(32): FMADD1 y1, a1, b1, y1 LFDU b3, 1 * SIZE(X1) FMADD2 y2, a1, b2, y2 LFDU b4, 1 * SIZE(X1) #ifdef PPCG4 dcbt AO1, PREA #endif FMADD3 y3, a2, b2, y3 LFDU a1, 1 * SIZE(AO1) FMADD4 y4, a2, b1, y4 LFDU a2, 1 * SIZE(AO1) FMADD1 y1, a1, b3, y1 LFDU b1, 1 * SIZE(X1) FMADD2 y2, a1, b4, y2 LFDU b2, 1 * SIZE(X1) #ifdef PPCG4 dcbt X1, PREA #endif FMADD3 y3, a2, b4, y3 LFDU a1, 1 * SIZE(AO1) FMADD4 y4, a2, b3, y4 LFDU a2, 1 * SIZE(AO1) FMADD1 y1, a1, b1, y1 LFDU b3, 1 * SIZE(X1) FMADD2 y2, a1, b2, y2 LFDU b4, 1 * SIZE(X1) #if defined(PPCG4) && defined(DOUBLE) dcbt AO1, PREA #endif FMADD3 y3, a2, b2, y3 LFDU a1, 1 * SIZE(AO1) FMADD4 y4, a2, b1, y4 LFDU a2, 1 * SIZE(AO1) FMADD1 y1, a1, b3, y1 LFDU b1, 1 * SIZE(X1) FMADD2 y2, a1, b4, y2 LFDU b2, 1 * SIZE(X1) #if defined(PPCG4) && defined(DOUBLE) dcbt X1, PREA #endif FMADD3 y3, a2, b4, y3 LFDU a1, 1 * SIZE(AO1) FMADD4 y4, a2, b3, y4 LFDU a2, 1 * SIZE(AO1) bdnz LL(32) .align 4 LL(33): FMADD1 y1, a1, b1, y1 LFDU b3, 1 * SIZE(X1) FMADD2 y2, a1, b2, y2 LFDU b4, 1 * SIZE(X1) FMADD3 y3, a2, b2, y3 LFDU a1, 1 * SIZE(AO1) FMADD4 y4, a2, b1, y4 LFDU a2, 1 * SIZE(AO1) FMADD1 y1, a1, b3, y1 LFDU b1, 1 * SIZE(X1) FMADD2 y2, a1, b4, y2 LFDU b2, 1 * SIZE(X1) FMADD3 y3, a2, b4, y3 LFDU a1, 1 * SIZE(AO1) FMADD4 y4, a2, b3, y4 LFDU a2, 1 * SIZE(AO1) FMADD1 y1, a1, b1, y1 LFDU b3, 1 * SIZE(X1) FMADD2 y2, a1, b2, y2 LFDU b4, 1 * SIZE(X1) FMADD3 y3, a2, b2, y3 LFDU a1, 1 * SIZE(AO1) FMADD4 y4, a2, b1, y4 LFDU a2, 1 * SIZE(AO1) FMADD1 y1, a1, b3, y1 FMADD2 y2, a1, b4, y2 FMADD3 y3, a2, b4, y3 FMADD4 y4, a2, b3, y4 .align 4 LL(35): andi. r0, M, 2 ble LL(37) LFDU a1, 1 * SIZE(AO1) LFDU b1, 1 * SIZE(X1) LFDU a2, 1 * SIZE(AO1) LFDU b2, 1 * SIZE(X1) FMADD1 y1, a1, b1, y1 LFDU b3, 1 * SIZE(X1) FMADD2 y2, a1, b2, y2 LFDU a3, 1 * SIZE(AO1) FMADD3 y3, a2, b2, y3 LFDU b4, 1 * SIZE(X1) FMADD4 y4, a2, b1, y4 LFDU a4, 1 * SIZE(AO1) FMADD1 y1, a3, b3, y1 FMADD2 y2, a3, b4, y2 FMADD3 y3, a4, b4, y3 FMADD4 y4, a4, b3, y4 .align 4 LL(37): andi. r0, M, 1 ble LL(39) LFDU a1, 1 * SIZE(AO1) LFDU b1, 1 * SIZE(X1) LFDU a2, 1 * SIZE(AO1) LFDU b2, 1 * SIZE(X1) FMADD1 y1, a1, b1, y1 FMADD2 y2, a1, b2, y2 FMADD3 y3, a2, b2, y3 FMADD4 y4, a2, b1, y4 .align 4 LL(39): LFDUX b1, Y, INCY LFDU b2, 1 * SIZE(Y) FADD y1, y1, y3 FADD y2, y2, y4 FMADD b1, alpha_r, y1, b1 FMADDR b2, alpha_r, y2, b2 FMSUBR b1, alpha_i, y2, b1 FMADD b2, alpha_i, y1, b2 STFDUX b1, YY, INCY STFDU b2, 1 * SIZE(YY) .align 4 LL(999): li r3, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r14, 144(SP) ld r15, 152(SP) ld r16, 160(SP) ld r17, 168(SP) ld r18, 176(SP) ld r19, 184(SP) ld r20, 192(SP) ld r21, 200(SP) ld r22, 208(SP) #else lwz r14, 144(SP) lwz r15, 148(SP) lwz r16, 152(SP) lwz r17, 156(SP) lwz r18, 160(SP) lwz r19, 164(SP) lwz r20, 168(SP) lwz r21, 172(SP) lwz r22, 176(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/zger.S000066400000000000000000000700021313527062700166520ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
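[Editor's note, not part of the original header; inferred from the code below in this file (kernel/power/zger.S): this is the complex rank-1 update kernel (GERU/GERC style, A := alpha*x*y^T + A, with the conjugated variant selected by CONJ through the FMA1/FMA2 macros). Columns of A are processed two at a time (J = N >> 1), eight complex elements per unrolled pass (M >> 3), and when INCX is not 2*SIZE the vector x is first packed contiguously into BUFFER.]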
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef NEEDPARAM #ifndef DOUBLE #include "cparam.h" #else #include "zparam.h" #endif #endif #ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 #define X r6 #define INCX r7 #define Y r8 #define INCY r9 #define A r10 #define LDA r5 #else #define M r3 #define N r4 #define X r8 #define INCX r9 #define Y r10 #define INCY r5 #define A r6 #define LDA r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define M r3 #define N r4 #define X r10 #define INCX r5 #define Y r6 #define INCY r7 #define A r8 #define LDA r9 #else #define M r3 #define N r4 #define X r8 #define INCX r9 #define Y r10 #define INCY r5 #define A r6 #define LDA r7 #endif #endif #define I r11 #define J r12 #define AO1 r14 #define AO2 r15 #define AO3 r16 #define AO4 r17 #define AO5 r18 #define AO6 r19 #define AO7 r20 #define AO8 r21 #define X1 r22 #define PREA r23 #define PREC r24 #define XX r25 #define BUFFER r26 #define y01 f0 #define y02 f1 #define y03 f2 #define y04 f3 #define y05 f4 #define y06 f5 #define y07 f6 #define y08 f7 #define alpha1_r f8 #define alpha1_i f9 #define alpha2_r f10 #define alpha2_i f11 #define a1 f12 #define a2 f13 #define a3 f14 #define a4 f15 #define a5 f16 #define a6 f17 #define a7 f18 #define a8 f19 #define a9 f20 #define a10 f21 #define a11 f22 #define a12 f23 #define a13 f24 #define a14 f25 #define a15 f26 #define a16 f27 #define alpha_r f30 #define alpha_i f31 #ifndef CONJ #define FMA1 FNMSUB #define FMA2 FMADD #else #define FMA1 FMADD #define FMA2 FNMSUB #endif #if defined(PPC440) || defined(PPC440FP2) #define PREFETCHSIZE_A 24 #define PREFETCHSIZE_C 16 #endif #ifdef PPC970 #define PREFETCHSIZE_A 16 #define PREFETCHSIZE_C 16 #endif #ifdef POWER4 #define PREFETCHSIZE_A 16 #define PREFETCHSIZE_C 16 
#endif #ifdef POWER5 #define PREFETCHSIZE_A 16 #define PREFETCHSIZE_C 16 #endif #ifndef NEEDPARAM #ifndef __64BIT__ #define STACKSIZE 224 #else #define STACKSIZE 280 #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r14, 144(SP) std r15, 152(SP) std r16, 160(SP) std r17, 168(SP) std r18, 176(SP) std r19, 184(SP) std r20, 192(SP) std r21, 200(SP) std r22, 208(SP) std r23, 216(SP) std r24, 224(SP) std r25, 232(SP) std r26, 240(SP) std r27, 248(SP) #else stw r14, 144(SP) stw r15, 148(SP) stw r16, 152(SP) stw r17, 156(SP) stw r18, 160(SP) stw r19, 164(SP) stw r20, 168(SP) stw r21, 172(SP) stw r22, 176(SP) stw r23, 180(SP) stw r24, 184(SP) stw r25, 188(SP) stw r26, 192(SP) stw r27, 196(SP) #endif #ifdef linux #ifndef __64BIT__ lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) #else ld INCY, FRAMESLOT(0) + STACKSIZE(SP) ld A, FRAMESLOT(1) + STACKSIZE(SP) ld LDA, FRAMESLOT(2) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) lwz Y, FRAMESLOT(1) + STACKSIZE(SP) lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) lwz A, FRAMESLOT(3) + STACKSIZE(SP) lwz LDA, FRAMESLOT(4) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(5) + STACKSIZE(SP) #else lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz A, FRAMESLOT(1) + STACKSIZE(SP) lwz LDA, FRAMESLOT(2) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #else ld INCY, FRAMESLOT(0) + STACKSIZE(SP) ld A, FRAMESLOT(1) + STACKSIZE(SP) ld LDA, FRAMESLOT(2) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP) #endif #endif fmr alpha_r, f1 fmr alpha_i, f2 slwi LDA, LDA, ZBASE_SHIFT slwi INCX, INCX, ZBASE_SHIFT slwi INCY, INCY, ZBASE_SHIFT li PREA, PREFETCHSIZE_A * SIZE li PREC, PREFETCHSIZE_C * SIZE cmpwi cr0, M, 0 ble- LL(999) cmpwi cr0, N, 0 ble- LL(999) mr XX, X cmpi cr0, 0, INCX, 2 * SIZE beq LL(10) mr XX, BUFFER mr X1, BUFFER srawi. r0, M, 2 mtspr CTR, r0 ble LL(05) .align 4 LL(01): LFD a1, 0 * SIZE(X) LFD a2, 1 * SIZE(X) add X, X, INCX LFD a3, 0 * SIZE(X) LFD a4, 1 * SIZE(X) add X, X, INCX LFD a5, 0 * SIZE(X) LFD a6, 1 * SIZE(X) add X, X, INCX LFD a7, 0 * SIZE(X) LFD a8, 1 * SIZE(X) add X, X, INCX STFD a1, 0 * SIZE(X1) STFD a2, 1 * SIZE(X1) STFD a3, 2 * SIZE(X1) STFD a4, 3 * SIZE(X1) STFD a5, 4 * SIZE(X1) STFD a6, 5 * SIZE(X1) STFD a7, 6 * SIZE(X1) STFD a8, 7 * SIZE(X1) addi X1, X1, 8 * SIZE bdnz+ LL(01) .align 4 LL(05): andi. r0, M, 7 mtspr CTR, r0 ble LL(10) .align 4 LL(06): LFD a1, 0 * SIZE(X) LFD a2, 1 * SIZE(X) STFD a1, 0 * SIZE(X1) STFD a2, 1 * SIZE(X1) add X, X, INCX addi X1, X1, 2 * SIZE bdnz+ LL(06) .align 4 LL(10): srawi. J, N, 1 ble LL(20) .align 4 LL(11): LFD alpha1_r, 0 * SIZE(Y) LFD alpha1_i, 1 * SIZE(Y) add Y, Y, INCY LFD alpha2_r, 0 * SIZE(Y) LFD alpha2_i, 1 * SIZE(Y) add Y, Y, INCY FMUL a1, alpha_r, alpha1_r FMUL a2, alpha_i, alpha1_r FMUL a3, alpha_r, alpha2_r FMUL a4, alpha_i, alpha2_r FMA1 alpha1_r, alpha_i, alpha1_i, a1 FMA2 alpha1_i, alpha_r, alpha1_i, a2 FMA1 alpha2_r, alpha_i, alpha2_i, a3 FMA2 alpha2_i, alpha_r, alpha2_i, a4 mr AO1, A add AO2, A, LDA add A, AO2, LDA mr X1, XX srawi. 
r0, M, 3 mtspr CTR, r0 ble LL(15) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) LFD y01, 0 * SIZE(X1) LFD y02, 1 * SIZE(X1) LFD y03, 2 * SIZE(X1) LFD y04, 3 * SIZE(X1) LFD y05, 4 * SIZE(X1) LFD y06, 5 * SIZE(X1) LFD y07, 6 * SIZE(X1) LFD y08, 7 * SIZE(X1) LFD a9, 0 * SIZE(AO2) LFD a10, 1 * SIZE(AO2) LFD a11, 2 * SIZE(AO2) LFD a12, 3 * SIZE(AO2) LFD a13, 4 * SIZE(AO2) LFD a14, 5 * SIZE(AO2) LFD a15, 6 * SIZE(AO2) LFD a16, 7 * SIZE(AO2) bdz LL(13) .align 4 LL(12): FMADD a1, alpha1_r, y01, a1 FMADD a2, alpha1_r, y02, a2 FMADD a3, alpha1_r, y03, a3 FMADD a4, alpha1_r, y04, a4 FMADD a5, alpha1_r, y05, a5 FMADD a6, alpha1_r, y06, a6 FMADD a7, alpha1_r, y07, a7 FMADD a8, alpha1_r, y08, a8 FMADD a9, alpha2_r, y01, a9 FMADD a10, alpha2_r, y02, a10 FMADD a11, alpha2_r, y03, a11 FMADD a12, alpha2_r, y04, a12 FMADD a13, alpha2_r, y05, a13 FMADD a14, alpha2_r, y06, a14 FMADD a15, alpha2_r, y07, a15 FMADD a16, alpha2_r, y08, a16 FNMSUB a1, alpha1_i, y02, a1 FMADD a2, alpha1_i, y01, a2 FNMSUB a3, alpha1_i, y04, a3 FMADD a4, alpha1_i, y03, a4 STFD a1, 0 * SIZE(AO1) STFD a2, 1 * SIZE(AO1) STFD a3, 2 * SIZE(AO1) STFD a4, 3 * SIZE(AO1) LFD a1, 8 * SIZE(AO1) LFD a2, 9 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) FNMSUB a5, alpha1_i, y06, a5 FMADD a6, alpha1_i, y05, a6 FNMSUB a7, alpha1_i, y08, a7 FMADD a8, alpha1_i, y07, a8 STFD a5, 4 * SIZE(AO1) STFD a6, 5 * SIZE(AO1) STFD a7, 6 * SIZE(AO1) STFD a8, 7 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) FNMSUB a9, alpha2_i, y02, a9 FMADD a10, alpha2_i, y01, a10 FNMSUB a11, alpha2_i, y04, a11 FMADD a12, alpha2_i, y03, a12 LFD y01, 8 * SIZE(X1) LFD y02, 9 * SIZE(X1) LFD y03, 10 * SIZE(X1) LFD y04, 11 * SIZE(X1) STFD a9, 0 * SIZE(AO2) STFD a10, 1 * SIZE(AO2) STFD a11, 2 * SIZE(AO2) STFD a12, 3 * SIZE(AO2) LFD a9, 8 * SIZE(AO2) LFD a10, 9 * SIZE(AO2) LFD a11, 10 * SIZE(AO2) LFD a12, 11 * SIZE(AO2) FNMSUB a13, alpha2_i, y06, a13 FMADD a14, alpha2_i, y05, a14 FNMSUB a15, alpha2_i, y08, a15 FMADD a16, alpha2_i, y07, a16 LFD y05, 12 * SIZE(X1) LFD y06, 13 * SIZE(X1) LFD y07, 14 * SIZE(X1) LFD y08, 15 * SIZE(X1) STFD a13, 4 * SIZE(AO2) STFD a14, 5 * SIZE(AO2) STFD a15, 6 * SIZE(AO2) STFD a16, 7 * SIZE(AO2) LFD a13, 12 * SIZE(AO2) LFD a14, 13 * SIZE(AO2) LFD a15, 14 * SIZE(AO2) LFD a16, 15 * SIZE(AO2) FMADD a1, alpha1_r, y01, a1 FMADD a2, alpha1_r, y02, a2 FMADD a3, alpha1_r, y03, a3 FMADD a4, alpha1_r, y04, a4 FMADD a5, alpha1_r, y05, a5 FMADD a6, alpha1_r, y06, a6 FMADD a7, alpha1_r, y07, a7 FMADD a8, alpha1_r, y08, a8 FMADD a9, alpha2_r, y01, a9 FMADD a10, alpha2_r, y02, a10 FMADD a11, alpha2_r, y03, a11 FMADD a12, alpha2_r, y04, a12 FMADD a13, alpha2_r, y05, a13 FMADD a14, alpha2_r, y06, a14 FMADD a15, alpha2_r, y07, a15 FMADD a16, alpha2_r, y08, a16 FNMSUB a1, alpha1_i, y02, a1 FMADD a2, alpha1_i, y01, a2 FNMSUB a3, alpha1_i, y04, a3 FMADD a4, alpha1_i, y03, a4 STFD a1, 8 * SIZE(AO1) STFD a2, 9 * SIZE(AO1) STFD a3, 10 * SIZE(AO1) STFD a4, 11 * SIZE(AO1) LFD a1, 16 * SIZE(AO1) LFD a2, 17 * SIZE(AO1) LFD a3, 18 * SIZE(AO1) LFD a4, 19 * SIZE(AO1) FNMSUB a5, alpha1_i, y06, a5 FMADD a6, alpha1_i, y05, a6 FNMSUB a7, alpha1_i, y08, a7 FMADD a8, alpha1_i, y07, a8 STFD a5, 12 * SIZE(AO1) STFD a6, 13 * SIZE(AO1) STFD a7, 14 * SIZE(AO1) STFD a8, 15 * SIZE(AO1) LFD a5, 20 * SIZE(AO1) LFD a6, 21 * SIZE(AO1) LFD a7, 22 * SIZE(AO1) LFD a8, 23 * SIZE(AO1) FNMSUB a9, alpha2_i, y02, a9 
FMADD a10, alpha2_i, y01, a10 FNMSUB a11, alpha2_i, y04, a11 FMADD a12, alpha2_i, y03, a12 LFD y01, 16 * SIZE(X1) LFD y02, 17 * SIZE(X1) LFD y03, 18 * SIZE(X1) LFD y04, 19 * SIZE(X1) STFD a9, 8 * SIZE(AO2) STFD a10, 9 * SIZE(AO2) STFD a11, 10 * SIZE(AO2) STFD a12, 11 * SIZE(AO2) LFD a9, 16 * SIZE(AO2) LFD a10, 17 * SIZE(AO2) LFD a11, 18 * SIZE(AO2) LFD a12, 19 * SIZE(AO2) FNMSUB a13, alpha2_i, y06, a13 FMADD a14, alpha2_i, y05, a14 FNMSUB a15, alpha2_i, y08, a15 FMADD a16, alpha2_i, y07, a16 LFD y05, 20 * SIZE(X1) LFD y06, 21 * SIZE(X1) LFD y07, 22 * SIZE(X1) LFD y08, 23 * SIZE(X1) STFD a13, 12 * SIZE(AO2) STFD a14, 13 * SIZE(AO2) STFD a15, 14 * SIZE(AO2) STFD a16, 15 * SIZE(AO2) LFD a13, 20 * SIZE(AO2) LFD a14, 21 * SIZE(AO2) LFD a15, 22 * SIZE(AO2) LFD a16, 23 * SIZE(AO2) addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi X1, X1, 16 * SIZE DCBT(AO1, PREA) DCBT(AO2, PREA) DCBT(Y1, PREY) bdnz+ LL(12) .align 4 LL(13): FMADD a1, alpha1_r, y01, a1 FMADD a2, alpha1_r, y02, a2 FMADD a3, alpha1_r, y03, a3 FMADD a4, alpha1_r, y04, a4 FMADD a5, alpha1_r, y05, a5 FMADD a6, alpha1_r, y06, a6 FMADD a7, alpha1_r, y07, a7 FMADD a8, alpha1_r, y08, a8 FMADD a9, alpha2_r, y01, a9 FMADD a10, alpha2_r, y02, a10 FMADD a11, alpha2_r, y03, a11 FMADD a12, alpha2_r, y04, a12 FMADD a13, alpha2_r, y05, a13 FMADD a14, alpha2_r, y06, a14 FMADD a15, alpha2_r, y07, a15 FMADD a16, alpha2_r, y08, a16 FNMSUB a1, alpha1_i, y02, a1 FMADD a2, alpha1_i, y01, a2 FNMSUB a3, alpha1_i, y04, a3 FMADD a4, alpha1_i, y03, a4 STFD a1, 0 * SIZE(AO1) STFD a2, 1 * SIZE(AO1) STFD a3, 2 * SIZE(AO1) STFD a4, 3 * SIZE(AO1) LFD a1, 8 * SIZE(AO1) LFD a2, 9 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) FNMSUB a5, alpha1_i, y06, a5 FMADD a6, alpha1_i, y05, a6 FNMSUB a7, alpha1_i, y08, a7 FMADD a8, alpha1_i, y07, a8 STFD a5, 4 * SIZE(AO1) STFD a6, 5 * SIZE(AO1) STFD a7, 6 * SIZE(AO1) STFD a8, 7 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) FNMSUB a9, alpha2_i, y02, a9 FMADD a10, alpha2_i, y01, a10 FNMSUB a11, alpha2_i, y04, a11 FMADD a12, alpha2_i, y03, a12 LFD y01, 8 * SIZE(X1) LFD y02, 9 * SIZE(X1) LFD y03, 10 * SIZE(X1) LFD y04, 11 * SIZE(X1) STFD a9, 0 * SIZE(AO2) STFD a10, 1 * SIZE(AO2) STFD a11, 2 * SIZE(AO2) STFD a12, 3 * SIZE(AO2) LFD a9, 8 * SIZE(AO2) LFD a10, 9 * SIZE(AO2) LFD a11, 10 * SIZE(AO2) LFD a12, 11 * SIZE(AO2) FNMSUB a13, alpha2_i, y06, a13 FMADD a14, alpha2_i, y05, a14 FNMSUB a15, alpha2_i, y08, a15 FMADD a16, alpha2_i, y07, a16 LFD y05, 12 * SIZE(X1) LFD y06, 13 * SIZE(X1) LFD y07, 14 * SIZE(X1) LFD y08, 15 * SIZE(X1) STFD a13, 4 * SIZE(AO2) STFD a14, 5 * SIZE(AO2) STFD a15, 6 * SIZE(AO2) STFD a16, 7 * SIZE(AO2) LFD a13, 12 * SIZE(AO2) LFD a14, 13 * SIZE(AO2) LFD a15, 14 * SIZE(AO2) LFD a16, 15 * SIZE(AO2) FMADD a1, alpha1_r, y01, a1 FMADD a2, alpha1_r, y02, a2 FMADD a3, alpha1_r, y03, a3 FMADD a4, alpha1_r, y04, a4 FMADD a5, alpha1_r, y05, a5 FMADD a6, alpha1_r, y06, a6 FMADD a7, alpha1_r, y07, a7 FMADD a8, alpha1_r, y08, a8 FMADD a9, alpha2_r, y01, a9 FMADD a10, alpha2_r, y02, a10 FMADD a11, alpha2_r, y03, a11 FMADD a12, alpha2_r, y04, a12 FMADD a13, alpha2_r, y05, a13 FMADD a14, alpha2_r, y06, a14 FMADD a15, alpha2_r, y07, a15 FMADD a16, alpha2_r, y08, a16 FNMSUB a1, alpha1_i, y02, a1 FMADD a2, alpha1_i, y01, a2 FNMSUB a3, alpha1_i, y04, a3 FMADD a4, alpha1_i, y03, a4 STFD a1, 8 * SIZE(AO1) STFD a2, 9 * SIZE(AO1) STFD a3, 10 * SIZE(AO1) STFD a4, 11 * SIZE(AO1) FNMSUB a5, alpha1_i, y06, a5 FMADD a6, alpha1_i, y05, a6 FNMSUB a7, alpha1_i, y08, a7 
FMADD a8, alpha1_i, y07, a8 STFD a5, 12 * SIZE(AO1) STFD a6, 13 * SIZE(AO1) STFD a7, 14 * SIZE(AO1) STFD a8, 15 * SIZE(AO1) FNMSUB a9, alpha2_i, y02, a9 FMADD a10, alpha2_i, y01, a10 FNMSUB a11, alpha2_i, y04, a11 FMADD a12, alpha2_i, y03, a12 STFD a9, 8 * SIZE(AO2) STFD a10, 9 * SIZE(AO2) STFD a11, 10 * SIZE(AO2) STFD a12, 11 * SIZE(AO2) FNMSUB a13, alpha2_i, y06, a13 FMADD a14, alpha2_i, y05, a14 FNMSUB a15, alpha2_i, y08, a15 FMADD a16, alpha2_i, y07, a16 STFD a13, 12 * SIZE(AO2) STFD a14, 13 * SIZE(AO2) STFD a15, 14 * SIZE(AO2) STFD a16, 15 * SIZE(AO2) addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi X1, X1, 16 * SIZE .align 4 LL(15): andi. r0, M, 7 ble LL(19) andi. r0, M, 4 ble LL(17) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) LFD y01, 0 * SIZE(X1) LFD y02, 1 * SIZE(X1) LFD y03, 2 * SIZE(X1) LFD y04, 3 * SIZE(X1) LFD y05, 4 * SIZE(X1) LFD y06, 5 * SIZE(X1) LFD y07, 6 * SIZE(X1) LFD y08, 7 * SIZE(X1) LFD a9, 0 * SIZE(AO2) LFD a10, 1 * SIZE(AO2) LFD a11, 2 * SIZE(AO2) LFD a12, 3 * SIZE(AO2) LFD a13, 4 * SIZE(AO2) LFD a14, 5 * SIZE(AO2) LFD a15, 6 * SIZE(AO2) LFD a16, 7 * SIZE(AO2) FMADD a1, alpha1_r, y01, a1 FMADD a2, alpha1_r, y02, a2 FMADD a3, alpha1_r, y03, a3 FMADD a4, alpha1_r, y04, a4 FMADD a5, alpha1_r, y05, a5 FMADD a6, alpha1_r, y06, a6 FMADD a7, alpha1_r, y07, a7 FMADD a8, alpha1_r, y08, a8 FMADD a9, alpha2_r, y01, a9 FMADD a10, alpha2_r, y02, a10 FMADD a11, alpha2_r, y03, a11 FMADD a12, alpha2_r, y04, a12 FMADD a13, alpha2_r, y05, a13 FMADD a14, alpha2_r, y06, a14 FMADD a15, alpha2_r, y07, a15 FMADD a16, alpha2_r, y08, a16 FNMSUB a1, alpha1_i, y02, a1 FMADD a2, alpha1_i, y01, a2 FNMSUB a3, alpha1_i, y04, a3 FMADD a4, alpha1_i, y03, a4 FNMSUB a5, alpha1_i, y06, a5 FMADD a6, alpha1_i, y05, a6 FNMSUB a7, alpha1_i, y08, a7 FMADD a8, alpha1_i, y07, a8 FNMSUB a9, alpha2_i, y02, a9 FMADD a10, alpha2_i, y01, a10 FNMSUB a11, alpha2_i, y04, a11 FMADD a12, alpha2_i, y03, a12 FNMSUB a13, alpha2_i, y06, a13 FMADD a14, alpha2_i, y05, a14 FNMSUB a15, alpha2_i, y08, a15 FMADD a16, alpha2_i, y07, a16 STFD a1, 0 * SIZE(AO1) STFD a2, 1 * SIZE(AO1) STFD a3, 2 * SIZE(AO1) STFD a4, 3 * SIZE(AO1) STFD a5, 4 * SIZE(AO1) STFD a6, 5 * SIZE(AO1) STFD a7, 6 * SIZE(AO1) STFD a8, 7 * SIZE(AO1) STFD a9, 0 * SIZE(AO2) STFD a10, 1 * SIZE(AO2) STFD a11, 2 * SIZE(AO2) STFD a12, 3 * SIZE(AO2) STFD a13, 4 * SIZE(AO2) STFD a14, 5 * SIZE(AO2) STFD a15, 6 * SIZE(AO2) STFD a16, 7 * SIZE(AO2) addi AO1, AO1, 8 * SIZE addi AO2, AO2, 8 * SIZE addi X1, X1, 8 * SIZE .align 4 LL(17): andi. 
r0, M, 2 ble LL(18) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD y01, 0 * SIZE(X1) LFD y02, 1 * SIZE(X1) LFD y03, 2 * SIZE(X1) LFD y04, 3 * SIZE(X1) LFD a5, 0 * SIZE(AO2) LFD a6, 1 * SIZE(AO2) LFD a7, 2 * SIZE(AO2) LFD a8, 3 * SIZE(AO2) FMADD a1, alpha1_r, y01, a1 FMADD a2, alpha1_r, y02, a2 FMADD a3, alpha1_r, y03, a3 FMADD a4, alpha1_r, y04, a4 FMADD a5, alpha2_r, y01, a5 FMADD a6, alpha2_r, y02, a6 FMADD a7, alpha2_r, y03, a7 FMADD a8, alpha2_r, y04, a8 FNMSUB a1, alpha1_i, y02, a1 FMADD a2, alpha1_i, y01, a2 FNMSUB a3, alpha1_i, y04, a3 FMADD a4, alpha1_i, y03, a4 FNMSUB a5, alpha2_i, y02, a5 FMADD a6, alpha2_i, y01, a6 FNMSUB a7, alpha2_i, y04, a7 FMADD a8, alpha2_i, y03, a8 STFD a1, 0 * SIZE(AO1) STFD a2, 1 * SIZE(AO1) STFD a3, 2 * SIZE(AO1) STFD a4, 3 * SIZE(AO1) STFD a5, 0 * SIZE(AO2) STFD a6, 1 * SIZE(AO2) STFD a7, 2 * SIZE(AO2) STFD a8, 3 * SIZE(AO2) addi AO1, AO1, 4 * SIZE addi AO2, AO2, 4 * SIZE addi X1, X1, 4 * SIZE .align 4 LL(18): andi. r0, M, 1 ble LL(19) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 0 * SIZE(AO2) LFD a4, 1 * SIZE(AO2) LFD y01, 0 * SIZE(X1) LFD y02, 1 * SIZE(X1) FMADD a1, alpha1_r, y01, a1 FMADD a2, alpha1_r, y02, a2 FMADD a3, alpha2_r, y01, a3 FMADD a4, alpha2_r, y02, a4 FNMSUB a1, alpha1_i, y02, a1 FMADD a2, alpha1_i, y01, a2 FNMSUB a3, alpha2_i, y02, a3 FMADD a4, alpha2_i, y01, a4 STFD a1, 0 * SIZE(AO1) STFD a2, 1 * SIZE(AO1) STFD a3, 0 * SIZE(AO2) STFD a4, 1 * SIZE(AO2) .align 4 LL(19): addi J, J, -1 cmpi cr0, 0, J, 0 bgt LL(11) .align 4 LL(20): andi. J, N, 1 ble LL(999) LFD alpha1_r, 0 * SIZE(Y) LFD alpha1_i, 1 * SIZE(Y) FMUL a1, alpha_r, alpha1_r FMUL a2, alpha_i, alpha1_r FMA1 alpha1_r, alpha_i, alpha1_i, a1 FMA2 alpha1_i, alpha_r, alpha1_i, a2 mr AO1, A mr X1, XX srawi. 
r0, M, 3 mtspr CTR, r0 ble LL(25) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) LFD y01, 0 * SIZE(X1) LFD y02, 1 * SIZE(X1) LFD y03, 2 * SIZE(X1) LFD y04, 3 * SIZE(X1) LFD y05, 4 * SIZE(X1) LFD y06, 5 * SIZE(X1) LFD y07, 6 * SIZE(X1) LFD y08, 7 * SIZE(X1) bdz LL(23) .align 4 LL(22): FMADD a1, alpha1_r, y01, a1 FMADD a2, alpha1_r, y02, a2 FMADD a3, alpha1_r, y03, a3 FMADD a4, alpha1_r, y04, a4 FMADD a5, alpha1_r, y05, a5 FMADD a6, alpha1_r, y06, a6 FMADD a7, alpha1_r, y07, a7 FMADD a8, alpha1_r, y08, a8 FNMSUB a1, alpha1_i, y02, a1 FMADD a2, alpha1_i, y01, a2 FNMSUB a3, alpha1_i, y04, a3 FMADD a4, alpha1_i, y03, a4 STFD a1, 0 * SIZE(AO1) STFD a2, 1 * SIZE(AO1) STFD a3, 2 * SIZE(AO1) STFD a4, 3 * SIZE(AO1) LFD a1, 8 * SIZE(AO1) LFD a2, 9 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) FNMSUB a5, alpha1_i, y06, a5 FMADD a6, alpha1_i, y05, a6 FNMSUB a7, alpha1_i, y08, a7 FMADD a8, alpha1_i, y07, a8 STFD a5, 4 * SIZE(AO1) STFD a6, 5 * SIZE(AO1) STFD a7, 6 * SIZE(AO1) STFD a8, 7 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) LFD y01, 8 * SIZE(X1) LFD y02, 9 * SIZE(X1) LFD y03, 10 * SIZE(X1) LFD y04, 11 * SIZE(X1) LFD y05, 12 * SIZE(X1) LFD y06, 13 * SIZE(X1) LFD y07, 14 * SIZE(X1) LFD y08, 15 * SIZE(X1) FMADD a1, alpha1_r, y01, a1 FMADD a2, alpha1_r, y02, a2 FMADD a3, alpha1_r, y03, a3 FMADD a4, alpha1_r, y04, a4 FMADD a5, alpha1_r, y05, a5 FMADD a6, alpha1_r, y06, a6 FMADD a7, alpha1_r, y07, a7 FMADD a8, alpha1_r, y08, a8 FMADD a9, alpha2_r, y01, a9 FMADD a10, alpha2_r, y02, a10 FMADD a11, alpha2_r, y03, a11 FMADD a12, alpha2_r, y04, a12 FMADD a13, alpha2_r, y05, a13 FMADD a14, alpha2_r, y06, a14 FMADD a15, alpha2_r, y07, a15 FMADD a16, alpha2_r, y08, a16 FNMSUB a1, alpha1_i, y02, a1 FMADD a2, alpha1_i, y01, a2 FNMSUB a3, alpha1_i, y04, a3 FMADD a4, alpha1_i, y03, a4 STFD a1, 8 * SIZE(AO1) STFD a2, 9 * SIZE(AO1) STFD a3, 10 * SIZE(AO1) STFD a4, 11 * SIZE(AO1) LFD a1, 16 * SIZE(AO1) LFD a2, 17 * SIZE(AO1) LFD a3, 18 * SIZE(AO1) LFD a4, 19 * SIZE(AO1) FNMSUB a5, alpha1_i, y06, a5 FMADD a6, alpha1_i, y05, a6 FNMSUB a7, alpha1_i, y08, a7 FMADD a8, alpha1_i, y07, a8 STFD a5, 12 * SIZE(AO1) STFD a6, 13 * SIZE(AO1) STFD a7, 14 * SIZE(AO1) STFD a8, 15 * SIZE(AO1) LFD a5, 20 * SIZE(AO1) LFD a6, 21 * SIZE(AO1) LFD a7, 22 * SIZE(AO1) LFD a8, 23 * SIZE(AO1) LFD y01, 16 * SIZE(X1) LFD y02, 17 * SIZE(X1) LFD y03, 18 * SIZE(X1) LFD y04, 19 * SIZE(X1) LFD y05, 20 * SIZE(X1) LFD y06, 21 * SIZE(X1) LFD y07, 22 * SIZE(X1) LFD y08, 23 * SIZE(X1) addi AO1, AO1, 16 * SIZE addi X1, X1, 16 * SIZE DCBT(AO1, PREA) DCBT(Y1, PREY) bdnz+ LL(22) .align 4 LL(23): FMADD a1, alpha1_r, y01, a1 FMADD a2, alpha1_r, y02, a2 FMADD a3, alpha1_r, y03, a3 FMADD a4, alpha1_r, y04, a4 FMADD a5, alpha1_r, y05, a5 FMADD a6, alpha1_r, y06, a6 FMADD a7, alpha1_r, y07, a7 FMADD a8, alpha1_r, y08, a8 FNMSUB a1, alpha1_i, y02, a1 FMADD a2, alpha1_i, y01, a2 FNMSUB a3, alpha1_i, y04, a3 FMADD a4, alpha1_i, y03, a4 STFD a1, 0 * SIZE(AO1) STFD a2, 1 * SIZE(AO1) STFD a3, 2 * SIZE(AO1) STFD a4, 3 * SIZE(AO1) LFD a1, 8 * SIZE(AO1) LFD a2, 9 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) FNMSUB a5, alpha1_i, y06, a5 FMADD a6, alpha1_i, y05, a6 FNMSUB a7, alpha1_i, y08, a7 FMADD a8, alpha1_i, y07, a8 STFD a5, 4 * SIZE(AO1) STFD a6, 5 * SIZE(AO1) STFD a7, 6 * SIZE(AO1) STFD a8, 7 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a7, 14 
* SIZE(AO1) LFD a8, 15 * SIZE(AO1) LFD y01, 8 * SIZE(X1) LFD y02, 9 * SIZE(X1) LFD y03, 10 * SIZE(X1) LFD y04, 11 * SIZE(X1) LFD y05, 12 * SIZE(X1) LFD y06, 13 * SIZE(X1) LFD y07, 14 * SIZE(X1) LFD y08, 15 * SIZE(X1) FMADD a1, alpha1_r, y01, a1 FMADD a2, alpha1_r, y02, a2 FMADD a3, alpha1_r, y03, a3 FMADD a4, alpha1_r, y04, a4 FMADD a5, alpha1_r, y05, a5 FMADD a6, alpha1_r, y06, a6 FMADD a7, alpha1_r, y07, a7 FMADD a8, alpha1_r, y08, a8 FNMSUB a1, alpha1_i, y02, a1 FMADD a2, alpha1_i, y01, a2 FNMSUB a3, alpha1_i, y04, a3 FMADD a4, alpha1_i, y03, a4 STFD a1, 8 * SIZE(AO1) STFD a2, 9 * SIZE(AO1) STFD a3, 10 * SIZE(AO1) STFD a4, 11 * SIZE(AO1) FNMSUB a5, alpha1_i, y06, a5 FMADD a6, alpha1_i, y05, a6 FNMSUB a7, alpha1_i, y08, a7 FMADD a8, alpha1_i, y07, a8 STFD a5, 12 * SIZE(AO1) STFD a6, 13 * SIZE(AO1) STFD a7, 14 * SIZE(AO1) STFD a8, 15 * SIZE(AO1) addi AO1, AO1, 16 * SIZE addi X1, X1, 16 * SIZE .align 4 LL(25): andi. r0, M, 7 ble LL(999) andi. r0, M, 4 ble LL(27) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) LFD y01, 0 * SIZE(X1) LFD y02, 1 * SIZE(X1) LFD y03, 2 * SIZE(X1) LFD y04, 3 * SIZE(X1) LFD y05, 4 * SIZE(X1) LFD y06, 5 * SIZE(X1) LFD y07, 6 * SIZE(X1) LFD y08, 7 * SIZE(X1) FMADD a1, alpha1_r, y01, a1 FMADD a2, alpha1_r, y02, a2 FMADD a3, alpha1_r, y03, a3 FMADD a4, alpha1_r, y04, a4 FMADD a5, alpha1_r, y05, a5 FMADD a6, alpha1_r, y06, a6 FMADD a7, alpha1_r, y07, a7 FMADD a8, alpha1_r, y08, a8 FNMSUB a1, alpha1_i, y02, a1 FMADD a2, alpha1_i, y01, a2 FNMSUB a3, alpha1_i, y04, a3 FMADD a4, alpha1_i, y03, a4 FNMSUB a5, alpha1_i, y06, a5 FMADD a6, alpha1_i, y05, a6 FNMSUB a7, alpha1_i, y08, a7 FMADD a8, alpha1_i, y07, a8 STFD a1, 0 * SIZE(AO1) STFD a2, 1 * SIZE(AO1) STFD a3, 2 * SIZE(AO1) STFD a4, 3 * SIZE(AO1) STFD a5, 4 * SIZE(AO1) STFD a6, 5 * SIZE(AO1) STFD a7, 6 * SIZE(AO1) STFD a8, 7 * SIZE(AO1) addi AO1, AO1, 8 * SIZE addi X1, X1, 8 * SIZE .align 4 LL(27): andi. r0, M, 2 ble LL(28) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD y01, 0 * SIZE(X1) LFD y02, 1 * SIZE(X1) LFD y03, 2 * SIZE(X1) LFD y04, 3 * SIZE(X1) FMADD a1, alpha1_r, y01, a1 FMADD a2, alpha1_r, y02, a2 FMADD a3, alpha1_r, y03, a3 FMADD a4, alpha1_r, y04, a4 FNMSUB a1, alpha1_i, y02, a1 FMADD a2, alpha1_i, y01, a2 FNMSUB a3, alpha1_i, y04, a3 FMADD a4, alpha1_i, y03, a4 STFD a1, 0 * SIZE(AO1) STFD a2, 1 * SIZE(AO1) STFD a3, 2 * SIZE(AO1) STFD a4, 3 * SIZE(AO1) addi AO1, AO1, 4 * SIZE addi X1, X1, 4 * SIZE .align 4 LL(28): andi. 
r0, M, 1 ble LL(999) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD y01, 0 * SIZE(X1) LFD y02, 1 * SIZE(X1) FMADD a1, alpha1_r, y01, a1 FMADD a2, alpha1_r, y02, a2 FNMSUB a1, alpha1_i, y02, a1 FMADD a2, alpha1_i, y01, a2 STFD a1, 0 * SIZE(AO1) STFD a2, 1 * SIZE(AO1) .align 4 LL(999): li r3, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r14, 144(SP) ld r15, 152(SP) ld r16, 160(SP) ld r17, 168(SP) ld r18, 176(SP) ld r19, 184(SP) ld r20, 192(SP) ld r21, 200(SP) ld r22, 208(SP) ld r23, 216(SP) ld r24, 224(SP) ld r25, 232(SP) ld r26, 240(SP) ld r27, 248(SP) #else lwz r14, 144(SP) lwz r15, 148(SP) lwz r16, 152(SP) lwz r17, 156(SP) lwz r18, 160(SP) lwz r19, 164(SP) lwz r20, 168(SP) lwz r21, 172(SP) lwz r22, 176(SP) lwz r23, 180(SP) lwz r24, 184(SP) lwz r25, 188(SP) lwz r26, 192(SP) lwz r27, 196(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/znrm2.S000066400000000000000000000415621313527062700167640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
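[Editor's note, not part of the original header; summary of the code below in this file (kernel/power/znrm2.S): the kernel computes the Euclidean norm of a complex vector in two passes over its 2*N real and imaginary scalars. The first pass finds the largest absolute value with the fsub/fsel maximum idiom; the second pass accumulates the sum of squares of the elements scaled by the reciprocal of that maximum, and the result is max * sqrt(sum), which keeps the squaring free of overflow and underflow. Separate unrolled paths handle contiguous input (INCX == 2*SIZE) and strided input, and a zero maximum returns early.]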
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define NN r6 #define XX r7 #define PREA r8 #define INCXM1 r9 #define FZERO 144(SP) #define FONE 148(SP) #define FMAX 152(SP) #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r10, 0 lis r11, 0x3f80 lis r12, 0x5fe0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r10, FZERO stw r11, FONE stw r12, FMAX stw r10, 4 + FMAX lfs f1, FZERO #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, ZBASE_SHIFT subi INCXM1, INCX, SIZE li PREA, L1_PREFETCHSIZE cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 ble- LL(9999) mr NN, N mr XX, X LFD f0, 0 * SIZE(X) LFD f1, 1 * SIZE(X) add X, X, INCX fabs f2, f0 fabs f3, f1 fabs f4, f0 fabs f5, f1 fabs f6, f0 fabs f7, f1 fabs f0, f0 fabs f1, f1 subi N, N, 1 cmpwi cr0, INCX, 2 * SIZE bne- cr0, LL(1000) srawi. r0, N, 3 mtspr CTR, r0 beq- cr0, LL(50) LFD f24, 0 * SIZE(X) LFD f25, 1 * SIZE(X) LFD f26, 2 * SIZE(X) LFD f27, 3 * SIZE(X) LFD f28, 4 * SIZE(X) LFD f29, 5 * SIZE(X) LFD f30, 6 * SIZE(X) LFD f31, 7 * SIZE(X) fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFD f24, 8 * SIZE(X) LFD f25, 9 * SIZE(X) LFD f26, 10 * SIZE(X) LFD f27, 11 * SIZE(X) fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFD f28, 12 * SIZE(X) LFD f29, 13 * SIZE(X) LFD f30, 14 * SIZE(X) LFD f31, 15 * SIZE(X) bdz LL(20) .align 4 LL(10): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 LFD f24, 16 * SIZE(X) LFD f25, 17 * SIZE(X) LFD f26, 18 * SIZE(X) LFD f27, 19 * SIZE(X) fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 LFD f28, 20 * SIZE(X) LFD f29, 21 * SIZE(X) LFD f30, 22 * SIZE(X) LFD f31, 23 * SIZE(X) fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 LFD f24, 24 * SIZE(X) LFD f25, 25 * SIZE(X) LFD f26, 26 * SIZE(X) LFD f27, 27 * SIZE(X) fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 LFD f28, 28 * SIZE(X) LFD f29, 29 * SIZE(X) LFD f30, 30 * SIZE(X) LFD f31, 31 * SIZE(X) #ifndef POWER6 L1_PREFETCH X, PREA #endif addi X, X, 16 * SIZE #ifdef POWER6 L1_PREFETCH X, PREA #endif bdnz LL(10) .align 4 LL(20): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, 
f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 fsel f6, f22, f6, f14 fsel f7, f23, f7, f15 addi X, X, 16 * SIZE .align 4 LL(50): andi. r0, N, 7 mtspr CTR, r0 beq LL(100) .align 4 LL(60): LFD f8, 0 * SIZE(X) LFD f9, 1 * SIZE(X) addi X, X, 2 * SIZE fabs f8, f8 fabs f9, f9 fsub f16, f0, f8 fsub f17, f1, f9 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 bdnz LL(60) .align 4 LL(100): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f0, f1 fsel f2, f9, f2, f3 fsel f4, f10, f4, f5 fsel f6, f11, f6, f7 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f0, f2 fsel f4, f9, f4, f6 fsub f8, f0, f4 fsel f31, f8, f0, f4 lfs f1, FZERO lfs f0, FONE fcmpu cr0, f1, f31 beq- cr0, LL(9999) fdiv f30, f0, f31 fmr f0, f1 fmr f2, f1 fmr f3, f1 fmr f4, f1 fmr f5, f1 fmr f6, f1 fmr f7, f1 srawi. r0, NN, 3 mtspr CTR, r0 beq- cr0, LL(150) LFD f8, 0 * SIZE(XX) LFD f9, 1 * SIZE(XX) LFD f10, 2 * SIZE(XX) LFD f11, 3 * SIZE(XX) LFD f12, 4 * SIZE(XX) LFD f13, 5 * SIZE(XX) LFD f14, 6 * SIZE(XX) LFD f15, 7 * SIZE(XX) fmul f16, f30, f8 fmul f17, f30, f9 fmul f18, f30, f10 fmul f19, f30, f11 LFD f8, 8 * SIZE(XX) LFD f9, 9 * SIZE(XX) LFD f10, 10 * SIZE(XX) LFD f11, 11 * SIZE(XX) fmul f20, f30, f12 fmul f21, f30, f13 fmul f22, f30, f14 fmul f23, f30, f15 LFD f12, 12 * SIZE(XX) LFD f13, 13 * SIZE(XX) LFD f14, 14 * SIZE(XX) LFD f15, 15 * SIZE(XX) bdz LL(120) .align 4 LL(110): fmadd f0, f16, f16, f0 fmul f16, f30, f8 fmadd f1, f17, f17, f1 fmul f17, f30, f9 fmadd f2, f18, f18, f2 fmul f18, f30, f10 fmadd f3, f19, f19, f3 fmul f19, f30, f11 LFD f8, 16 * SIZE(XX) LFD f9, 17 * SIZE(XX) LFD f10, 18 * SIZE(XX) LFD f11, 19 * SIZE(XX) fmadd f4, f20, f20, f4 fmul f20, f30, f12 fmadd f5, f21, f21, f5 fmul f21, f30, f13 fmadd f6, f22, f22, f6 fmul f22, f30, f14 fmadd f7, f23, f23, f7 fmul f23, f30, f15 LFD f12, 20 * SIZE(XX) LFD f13, 21 * SIZE(XX) LFD f14, 22 * SIZE(XX) LFD f15, 23 * SIZE(XX) fmadd f0, f16, f16, f0 fmul f16, f30, f8 fmadd f1, f17, f17, f1 fmul f17, f30, f9 fmadd f2, f18, f18, f2 fmul f18, f30, f10 fmadd f3, f19, f19, f3 fmul f19, f30, f11 LFD f8, 24 * SIZE(XX) LFD f9, 25 * SIZE(XX) LFD f10, 26 * SIZE(XX) LFD f11, 27 * SIZE(XX) fmadd f4, f20, f20, f4 fmul f20, f30, f12 fmadd f5, f21, f21, f5 fmul f21, f30, f13 fmadd f6, f22, f22, f6 fmul f22, f30, f14 fmadd f7, f23, f23, f7 fmul f23, f30, f15 LFD f12, 28 * SIZE(XX) LFD f13, 29 * SIZE(XX) LFD f14, 30 * SIZE(XX) LFD f15, 31 * SIZE(XX) #ifndef POWER6 L1_PREFETCH XX, PREA #endif addi XX, XX, 16 * SIZE #ifdef POWER6 L1_PREFETCH XX, PREA #endif bdnz LL(110) .align 4 LL(120): fmadd f0, f16, f16, f0 fmul f16, f30, f8 fmadd f1, f17, f17, f1 fmul f17, f30, f9 fmadd f2, f18, f18, f2 fmul f18, f30, f10 fmadd f3, f19, f19, f3 fmul f19, f30, f11 fmadd f4, f20, f20, f4 fmul f20, f30, f12 fmadd f5, f21, f21, f5 fmul f21, f30, f13 fmadd f6, f22, f22, f6 fmul f22, f30, f14 fmadd f7, f23, f23, f7 fmul f23, f30, f15 fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 fmadd f2, f18, f18, f2 fmadd f3, f19, f19, f3 fmadd f4, f20, f20, f4 fmadd f5, f21, f21, f5 fmadd f6, f22, f22, f6 fmadd f7, f23, f23, f7 addi XX, XX, 16 * SIZE .align 4 LL(150): andi. 
r0, NN, 7 mtspr CTR, r0 beq- cr0, LL(170) .align 4 LL(160): LFD f8, 0 * SIZE(XX) LFD f9, 1 * SIZE(XX) addi XX, XX, 2 * SIZE fmul f16, f30, f8 fmul f17, f30, f9 fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 bdnz LL(160) .align 4 LL(170): fadd f0, f0, f1 fadd f2, f2, f3 fadd f4, f4, f5 fadd f6, f6, f7 fadd f0, f0, f2 fadd f4, f4, f6 fadd f0, f0, f4 fsqrt f0, f0 fmul f1, f31, f0 b LL(9999) .align 4 LL(1000): sub X, X, INCXM1 srawi. r0, N, 3 mtspr CTR, r0 beq- LL(1050) LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX fabs f8, f24 fabs f9, f25 fabs f10, f26 fabs f11, f27 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX fabs f12, f28 fabs f13, f29 fabs f14, f30 fabs f15, f31 LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX bdz LL(1020) .align 4 LL(1010): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 LFDX f24, X, INCXM1 LFDUX f25, X, INCX LFDX f26, X, INCXM1 LFDUX f27, X, INCX fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 LFDX f28, X, INCXM1 LFDUX f29, X, INCX LFDX f30, X, INCXM1 LFDUX f31, X, INCX bdnz LL(1010) .align 4 LL(1020): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 fsel f4, f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 fsel f6, f22, f6, f14 fsel f7, f23, f7, f15 .align 4 LL(1050): andi. 
r0, N, 7 mtspr CTR, r0 beq LL(1999) .align 4 LL(1060): LFDX f8, X, INCXM1 LFDUX f9, X, INCX fabs f8, f8 fabs f9, f9 fsub f16, f0, f8 fsub f17, f1, f9 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 bdnz LL(1060) .align 4 LL(1999): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f0, f1 fsel f2, f9, f2, f3 fsel f4, f10, f4, f5 fsel f6, f11, f6, f7 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f0, f2 fsel f4, f9, f4, f6 fsub f8, f0, f4 fsel f31, f8, f0, f4 lfs f1, FZERO lfs f0, FONE fcmpu cr0, f1, f31 beq- cr0, LL(9999) fdiv f30, f0, f31 fmr f0, f1 fmr f2, f1 fmr f3, f1 fmr f4, f1 fmr f5, f1 fmr f6, f1 fmr f7, f1 sub XX, XX, INCXM1 srawi. r0, NN, 3 mtspr CTR, r0 beq- cr0, LL(1150) LFDX f8, XX, INCXM1 LFDUX f9, XX, INCX LFDX f10, XX, INCXM1 LFDUX f11, XX, INCX LFDX f12, XX, INCXM1 LFDUX f13, XX, INCX LFDX f14, XX, INCXM1 LFDUX f15, XX, INCX fmul f16, f30, f8 fmul f17, f30, f9 fmul f18, f30, f10 fmul f19, f30, f11 LFDX f8, XX, INCXM1 LFDUX f9, XX, INCX LFDX f10, XX, INCXM1 LFDUX f11, XX, INCX fmul f20, f30, f12 fmul f21, f30, f13 fmul f22, f30, f14 fmul f23, f30, f15 LFDX f12, XX, INCXM1 LFDUX f13, XX, INCX LFDX f14, XX, INCXM1 LFDUX f15, XX, INCX bdz LL(1120) .align 4 LL(1110): fmadd f0, f16, f16, f0 fmul f16, f30, f8 fmadd f1, f17, f17, f1 fmul f17, f30, f9 fmadd f2, f18, f18, f2 fmul f18, f30, f10 fmadd f3, f19, f19, f3 fmul f19, f30, f11 LFDX f8, XX, INCXM1 LFDUX f9, XX, INCX LFDX f10, XX, INCXM1 LFDUX f11, XX, INCX fmadd f4, f20, f20, f4 fmul f20, f30, f12 fmadd f5, f21, f21, f5 fmul f21, f30, f13 fmadd f6, f22, f22, f6 fmul f22, f30, f14 fmadd f7, f23, f23, f7 fmul f23, f30, f15 LFDX f12, XX, INCXM1 LFDUX f13, XX, INCX LFDX f14, XX, INCXM1 LFDUX f15, XX, INCX fmadd f0, f16, f16, f0 fmul f16, f30, f8 fmadd f1, f17, f17, f1 fmul f17, f30, f9 fmadd f2, f18, f18, f2 fmul f18, f30, f10 fmadd f3, f19, f19, f3 fmul f19, f30, f11 LFDX f8, XX, INCXM1 LFDUX f9, XX, INCX LFDX f10, XX, INCXM1 LFDUX f11, XX, INCX fmadd f4, f20, f20, f4 fmul f20, f30, f12 fmadd f5, f21, f21, f5 fmul f21, f30, f13 fmadd f6, f22, f22, f6 fmul f22, f30, f14 fmadd f7, f23, f23, f7 fmul f23, f30, f15 LFDX f12, XX, INCXM1 LFDUX f13, XX, INCX LFDX f14, XX, INCXM1 LFDUX f15, XX, INCX bdnz LL(1110) .align 4 LL(1120): fmadd f0, f16, f16, f0 fmul f16, f30, f8 fmadd f1, f17, f17, f1 fmul f17, f30, f9 fmadd f2, f18, f18, f2 fmul f18, f30, f10 fmadd f3, f19, f19, f3 fmul f19, f30, f11 fmadd f4, f20, f20, f4 fmul f20, f30, f12 fmadd f5, f21, f21, f5 fmul f21, f30, f13 fmadd f6, f22, f22, f6 fmul f22, f30, f14 fmadd f7, f23, f23, f7 fmul f23, f30, f15 fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 fmadd f2, f18, f18, f2 fmadd f3, f19, f19, f3 fmadd f4, f20, f20, f4 fmadd f5, f21, f21, f5 fmadd f6, f22, f22, f6 fmadd f7, f23, f23, f7 .align 4 LL(1150): andi. 
r0, NN, 7 mtspr CTR, r0 beq- cr0, LL(1170) .align 4 LL(1160): LFDX f8, XX, INCXM1 LFDUX f9, XX, INCX fmul f16, f30, f8 fmul f17, f30, f9 fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 bdnz LL(1160) .align 4 LL(1170): fadd f0, f0, f1 fadd f2, f2, f3 fadd f4, f4, f5 fadd f6, f6, f7 fadd f0, f0, f2 fadd f4, f4, f6 fadd f0, f0, f4 fsqrt f0, f0 fmul f1, f31, f0 .align 4 LL(9999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/znrm2_hummer.S000066400000000000000000000431421313527062700203350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define INCX2 r6 #define X2 r7 #define XX r8 #define C1 f1 #define C2 f0 #define C3 f2 #define C4 f3 #define ALPHA f4 #define ALPHA_R f5 #define A1 f6 #define A2 f7 #define A3 f8 #define A4 f9 #define A5 f10 #define A6 f11 #define A7 f12 #define A8 f13 #define F1 f14 #define F2 f15 #define F3 f16 #define F4 f17 #define F5 f18 #define F6 f19 #define F7 f20 #define F8 f21 #define T1 f22 #define T2 f23 #define T3 f24 #define T4 f25 #define T5 f26 #define T6 f27 #define T7 f28 #define T8 f29 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 stfpdux f19, SP, r10 stfpdux f20, SP, r10 stfpdux f21, SP, r10 stfpdux f22, SP, r10 stfpdux f23, SP, r10 stfpdux f24, SP, r10 stfpdux f25, SP, r10 stfpdux f26, SP, r10 stfpdux f27, SP, r10 stfpdux f28, SP, r10 stfpdux f29, SP, r10 li r10, 0 lis r11, 0x3f80 stwu r11, -4(SP) stwu r11, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif lfpsx C1, SP, r10 # Zero clear slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX fpmr C2, C1 fpmr C3, C1 fpmr C4, C1 cmpwi cr0, N, 0 ble LL(99) cmpwi cr0, INCX, 0 ble LL(99) mr XX, X andi. r0, X, 2 * SIZE - 1 bne LL(100) /* aligned */ sub X, X, INCX2 srawi. r0, N, 3 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 fpabs T1, A1 LFPDUX A6, X, INCX2 fpabs T2, A2 LFPDUX A7, X, INCX2 fpabs T3, A3 LFPDUX A8, X, INCX2 fpabs T4, A4 bdz LL(13) .align 4 LL(12): fpsub F1, C1, T1 LFPDUX A1, X, INCX2 fpsub F2, C2, T2 LFPDUX A2, X, INCX2 fpsub F3, C3, T3 LFPDUX A3, X, INCX2 fpsub F4, C4, T4 LFPDUX A4, X, INCX2 fpabs T5, A5 fpabs T6, A6 fpabs T7, A7 fpabs T8, A8 fpsel C1, F1, C1, T1 LFPDUX A5, X, INCX2 fpsel C2, F2, C2, T2 LFPDUX A6, X, INCX2 fpsel C3, F3, C3, T3 LFPDUX A7, X, INCX2 fpsel C4, F4, C4, T4 LFPDUX A8, X, INCX2 fpsub F5, C1, T5 fpsub F6, C2, T6 fpsub F7, C3, T7 fpsub F8, C4, T8 fpabs T1, A1 fpabs T2, A2 fpabs T3, A3 fpabs T4, A4 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 bdnz LL(12) .align 4 LL(13): fpabs T5, A5 fpabs T6, A6 fpabs T7, A7 fpabs T8, A8 fpsub F1, C1, T1 fpsub F2, C2, T2 fpsub F3, C3, T3 fpsub F4, C4, T4 fpsel C1, F1, C1, T1 fpsel C2, F2, C2, T2 fpsel C3, F3, C3, T3 fpsel C4, F4, C4, T4 fpsub F5, C1, T5 fpsub F6, C2, T6 fpsub F7, C3, T7 fpsub F8, C4, T8 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 .align 4 LL(15): andi. r0, N, 7 beq LL(20) andi. r0, N, 4 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 fpabs A1, A1 fpabs A2, A2 fpabs A3, A3 fpabs A4, A4 fpsub F1, C1, A1 fpsub F2, C2, A2 fpsub F3, C3, A3 fpsub F4, C4, A4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 .align 4 LL(16): andi. r0, N, 2 beq LL(17) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 fpabs A1, A1 fpabs A2, A2 fpsub F1, C1, A1 fpsub F2, C2, A2 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 .align 4 LL(17): andi. 
r0, N, 1 beq LL(20) LFPDUX A1, X, INCX2 fpabs A1, A1 fpsub F1, C1, A1 fpsel C1, F1, C1, A1 .align 4 LL(20): fpsub F1, C1, C2 fpsub F2, C3, C4 fpsel C1, F1, C1, C2 fpsel C3, F2, C3, C4 fpsub F1, C1, C3 fpsel C1, F1, C1, C3 fsmtp C2, C1 fsub F1, C1, C2 fsel ALPHA, F1, C1, C2 li r10, 0 lfs ALPHA_R, 8(SP) # load 1.0 fdiv ALPHA_R, ALPHA_R, ALPHA lfpsx C1, SP, r10 # Zero clear fpmr C2, C1 fpmr C3, C1 fpmr C4, C1 fsmfp ALPHA_R, ALPHA_R andi. r0, XX, 2 * SIZE - 1 beq LL(21) LFD C1, 0 * SIZE(XX) add XX, XX, INCX cmpwi cr0, N, 0 fmul C1, ALPHA_R, C1 fmul C1, C1, C1 ble LL(98) .align 4 LL(21): sub XX, XX, INCX2 srawi. r0, N, 3 mtspr CTR, r0 beq- LL(25) LFPDUX A1, XX, INCX2 LFPDUX A2, XX, INCX2 LFPDUX A3, XX, INCX2 LFPDUX A4, XX, INCX2 LFPDUX A5, XX, INCX2 LFPDUX A6, XX, INCX2 LFPDUX A7, XX, INCX2 LFPDUX A8, XX, INCX2 fpmul T1, ALPHA_R, A1 fpmul T2, ALPHA_R, A2 fpmul T3, ALPHA_R, A3 fpmul T4, ALPHA_R, A4 bdz LL(23) .align 4 LL(22): fpmadd C1, T1, T1, C1 LFPDUX A1, XX, INCX2 fpmul T1, ALPHA_R, A5 LFPDUX A2, XX, INCX2 fpmadd C2, T2, T2, C2 LFPDUX A3, XX, INCX2 fpmul T2, ALPHA_R, A6 LFPDUX A4, XX, INCX2 fpmadd C3, T3, T3, C3 fpmul T3, ALPHA_R, A7 fpmadd C4, T4, T4, C4 fpmul T4, ALPHA_R, A8 fpmadd C1, T1, T1, C1 LFPDUX A5, XX, INCX2 fpmul T1, ALPHA_R, A1 LFPDUX A6, XX, INCX2 fpmadd C2, T2, T2, C2 LFPDUX A7, XX, INCX2 fpmul T2, ALPHA_R, A2 LFPDUX A8, XX, INCX2 fpmadd C3, T3, T3, C3 fpmul T3, ALPHA_R, A3 fpmadd C4, T4, T4, C4 fpmul T4, ALPHA_R, A4 bdnz LL(22) .align 4 LL(23): fpmadd C1, T1, T1, C1 fpmul T1, ALPHA_R, A5 fpmadd C2, T2, T2, C2 fpmul T2, ALPHA_R, A6 fpmadd C3, T3, T3, C3 fpmul T3, ALPHA_R, A7 fpmadd C4, T4, T4, C4 fpmul T4, ALPHA_R, A8 fpmadd C1, T1, T1, C1 fpmadd C2, T2, T2, C2 fpmadd C3, T3, T3, C3 fpmadd C4, T4, T4, C4 .align 4 LL(25): andi. r0, N, 7 beq LL(98) andi. r0, N, 4 beq LL(26) LFPDUX A1, XX, INCX2 LFPDUX A2, XX, INCX2 LFPDUX A3, XX, INCX2 LFPDUX A4, XX, INCX2 fpmul A1, ALPHA_R, A1 fpmul A2, ALPHA_R, A2 fpmul A3, ALPHA_R, A3 fpmul A4, ALPHA_R, A4 fpmadd C1, A1, A1, C1 fpmadd C2, A2, A2, C2 fpmadd C3, A3, A3, C3 fpmadd C4, A4, A4, C4 .align 4 LL(26): andi. r0, N, 2 beq LL(27) LFPDUX A1, XX, INCX2 LFPDUX A2, XX, INCX2 fpmul A1, ALPHA_R, A1 fpmul A2, ALPHA_R, A2 fpmadd C1, A1, A1, C1 fpmadd C2, A2, A2, C2 .align 4 LL(27): andi. 
r0, N, 1 beq LL(98) LFPDUX A1, XX, INCX2 fpmul A1, ALPHA_R, A1 fpmadd C1, A1, A1, C1 .align 4 LL(98): fpadd C1, C1, C2 lis r3, 0x3f00 fpadd C3, C3, C4 lis r4, 0x4040 stw r3, 4(SP) stw r4, 8(SP) fpadd C1, C1, C3 lfs f10, 0(SP) fsmtp C2, C1 lfs f11, 4(SP) fadd C1, C2, C1 lfs f12, 8(SP) fcmpu cr0, f10, C1 beq cr0, LL(99) #ifndef HUMMER_EMULATOR frsqrte f9, C1 li r10, 16 fmul f2, f1, f9 lfpdux f29, SP, r10 fmul f3, f9, f11 lfpdux f28, SP, r10 fnmsub f7, f2, f9, f12 lfpdux f27, SP, r10 fmul f9, f3, f7 lfpdux f26, SP, r10 fadd f13, f11, f11 lfpdux f25, SP, r10 fmul f12, f1, f9 lfpdux f24, SP, r10 fmul f11, f12, f11 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 fnmsub f1, f12, f9, f13 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 fmadd f1, f11, f1, f12 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 fmul C1, ALPHA, C1 blr #else fsqrt C1, C1 li r10, 16 lfpdux f29, SP, r10 lfpdux f28, SP, r10 lfpdux f27, SP, r10 lfpdux f26, SP, r10 lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 fmul C1, ALPHA, C1 addi SP, SP, 16 blr #endif .align 4 LL(99): li r10, 16 lfpdux f29, SP, r10 lfpdux f28, SP, r10 lfpdux f27, SP, r10 lfpdux f26, SP, r10 lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr .align 4 LL(100): sub X, X, INCX2 addi X2, X, SIZE srawi. r0, N, 3 mtspr CTR, r0 beq- LL(105) LFDUX A1, X, INCX2 LFDUX A2, X2, INCX2 LFDUX A3, X, INCX2 LFDUX A4, X2, INCX2 LFSDUX A1, X, INCX2 LFSDUX A2, X2, INCX2 LFSDUX A3, X, INCX2 LFSDUX A4, X2, INCX2 LFDUX A5, X, INCX2 LFDUX A6, X2, INCX2 LFDUX A7, X, INCX2 LFDUX A8, X2, INCX2 LFSDUX A5, X, INCX2 fpabs T1, A1 LFSDUX A6, X2, INCX2 fpabs T2, A2 LFSDUX A7, X, INCX2 fpabs T3, A3 LFSDUX A8, X2, INCX2 fpabs T4, A4 bdz LL(103) .align 4 LL(102): fpsub F1, C1, T1 LFDUX A1, X, INCX2 fpsub F2, C2, T2 LFDUX A2, X2, INCX2 fpsub F3, C3, T3 LFDUX A3, X, INCX2 fpsub F4, C4, T4 LFDUX A4, X2, INCX2 fpabs T5, A5 LFSDUX A1, X, INCX2 fpabs T6, A6 LFSDUX A2, X2, INCX2 fpabs T7, A7 LFSDUX A3, X, INCX2 fpabs T8, A8 LFSDUX A4, X2, INCX2 fpsel C1, F1, C1, T1 LFDUX A5, X, INCX2 fpsel C2, F2, C2, T2 LFDUX A6, X2, INCX2 fpsel C3, F3, C3, T3 LFDUX A7, X, INCX2 fpsel C4, F4, C4, T4 LFDUX A8, X2, INCX2 fpsub F5, C1, T5 LFSDUX A5, X, INCX2 fpsub F6, C2, T6 LFSDUX A6, X2, INCX2 fpsub F7, C3, T7 LFSDUX A7, X, INCX2 fpsub F8, C4, T8 LFSDUX A8, X2, INCX2 fpabs T1, A1 fpabs T2, A2 fpabs T3, A3 fpabs T4, A4 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 bdnz LL(102) .align 4 LL(103): fpabs T5, A5 fpabs T6, A6 fpabs T7, A7 fpabs T8, A8 fpsub F1, C1, T1 fpsub F2, C2, T2 fpsub F3, C3, T3 fpsub F4, C4, T4 fpsel C1, F1, C1, T1 fpsel C2, F2, C2, T2 fpsel C3, F3, C3, T3 fpsel C4, F4, C4, T4 fpsub F5, C1, T5 fpsub F6, C2, T6 fpsub F7, C3, T7 fpsub F8, C4, T8 fpsel C1, F5, C1, T5 fpsel C2, F6, C2, T6 fpsel C3, F7, C3, T7 fpsel C4, F8, C4, T8 .align 4 LL(105): andi. r0, N, 7 beq LL(120) andi. 
r0, N, 4 beq LL(106) LFDUX A1, X, INCX2 LFDUX A2, X2, INCX2 LFDUX A3, X, INCX2 LFDUX A4, X2, INCX2 LFSDUX A1, X, INCX2 LFSDUX A2, X2, INCX2 LFSDUX A3, X, INCX2 LFSDUX A4, X2, INCX2 fpabs A1, A1 fpabs A2, A2 fpabs A3, A3 fpabs A4, A4 fpsub F1, C1, A1 fpsub F2, C2, A2 fpsub F3, C3, A3 fpsub F4, C4, A4 fpsel C1, F1, C1, A1 fpsel C2, F2, C2, A2 fpsel C3, F3, C3, A3 fpsel C4, F4, C4, A4 .align 4 LL(106): andi. r0, N, 2 beq LL(107) LFDUX A1, X, INCX2 LFDUX A2, X2, INCX2 LFDUX A3, X, INCX2 LFDUX A4, X2, INCX2 fabs A1, A1 fabs A2, A2 fabs A3, A3 fabs A4, A4 fsub F1, C1, A1 fsub F2, C2, A2 fsub F3, C3, A3 fsub F4, C4, A4 fsel C1, F1, C1, A1 fsel C2, F2, C2, A2 fsel C3, F3, C3, A3 fsel C4, F4, C4, A4 .align 4 LL(107): andi. r0, N, 1 beq LL(120) LFDUX A1, X, INCX2 LFDUX A2, X2, INCX2 fabs A1, A1 fabs A2, A2 fsub F1, C1, A1 fsub F2, C2, A2 fsel C1, F1, C1, A1 fsel C2, F2, C2, A2 .align 4 LL(120): fpsub F1, C1, C2 fpsub F2, C3, C4 fpsel C1, F1, C1, C2 fpsel C3, F2, C3, C4 fpsub F1, C1, C3 fpsel C1, F1, C1, C3 fsmtp C2, C1 fsub F1, C1, C2 fsel ALPHA, F1, C1, C2 li r10, 0 lfs ALPHA_R, 8(SP) # load 1.0 fdiv ALPHA_R, ALPHA_R, ALPHA lfpsx C1, SP, r10 # Zero clear fpmr C2, C1 fpmr C3, C1 fpmr C4, C1 fsmfp ALPHA_R, ALPHA_R sub XX, XX, INCX2 addi X2, XX, SIZE srawi. r0, N, 3 mtspr CTR, r0 beq- LL(125) LFDUX A1, XX, INCX2 LFDUX A2, X2, INCX2 LFDUX A3, XX, INCX2 LFDUX A4, X2, INCX2 LFSDUX A1, XX, INCX2 LFSDUX A2, X2, INCX2 LFSDUX A3, XX, INCX2 LFSDUX A4, X2, INCX2 LFDUX A5, XX, INCX2 LFDUX A6, X2, INCX2 LFDUX A7, XX, INCX2 LFDUX A8, X2, INCX2 LFSDUX A5, XX, INCX2 fpmul T1, ALPHA_R, A1 LFSDUX A6, X2, INCX2 fpmul T2, ALPHA_R, A2 LFSDUX A7, XX, INCX2 fpmul T3, ALPHA_R, A3 LFSDUX A8, X2, INCX2 fpmul T4, ALPHA_R, A4 bdz LL(123) .align 4 LL(122): fpmadd C1, T1, T1, C1 LFDUX A1, XX, INCX2 fpmul T1, ALPHA_R, A5 LFDUX A2, X2, INCX2 fpmadd C2, T2, T2, C2 LFDUX A3, XX, INCX2 fpmul T2, ALPHA_R, A6 LFDUX A4, X2, INCX2 fpmadd C3, T3, T3, C3 LFSDUX A1, XX, INCX2 fpmul T3, ALPHA_R, A7 LFSDUX A2, X2, INCX2 fpmadd C4, T4, T4, C4 LFSDUX A3, XX, INCX2 fpmul T4, ALPHA_R, A8 LFSDUX A4, X2, INCX2 fpmadd C1, T1, T1, C1 LFDUX A5, XX, INCX2 fpmul T1, ALPHA_R, A1 LFDUX A6, X2, INCX2 fpmadd C2, T2, T2, C2 LFDUX A7, XX, INCX2 fpmul T2, ALPHA_R, A2 LFDUX A8, X2, INCX2 fpmadd C3, T3, T3, C3 LFSDUX A5, XX, INCX2 fpmul T3, ALPHA_R, A3 LFSDUX A6, X2, INCX2 fpmadd C4, T4, T4, C4 LFSDUX A7, XX, INCX2 fpmul T4, ALPHA_R, A4 LFSDUX A8, X2, INCX2 bdnz LL(122) .align 4 LL(123): fpmadd C1, T1, T1, C1 fpmul T1, ALPHA_R, A5 fpmadd C2, T2, T2, C2 fpmul T2, ALPHA_R, A6 fpmadd C3, T3, T3, C3 fpmul T3, ALPHA_R, A7 fpmadd C4, T4, T4, C4 fpmul T4, ALPHA_R, A8 fpmadd C1, T1, T1, C1 fpmadd C2, T2, T2, C2 fpmadd C3, T3, T3, C3 fpmadd C4, T4, T4, C4 .align 4 LL(125): andi. r0, N, 7 beq LL(998) andi. r0, N, 4 beq LL(126) LFDUX A1, XX, INCX2 LFDUX A2, X2, INCX2 LFDUX A3, XX, INCX2 LFDUX A4, X2, INCX2 LFSDUX A1, XX, INCX2 LFSDUX A2, X2, INCX2 LFSDUX A3, XX, INCX2 LFSDUX A4, X2, INCX2 fpmul A1, ALPHA_R, A1 fpmul A2, ALPHA_R, A2 fpmul A3, ALPHA_R, A3 fpmul A4, ALPHA_R, A4 fpmadd C1, A1, A1, C1 fpmadd C2, A2, A2, C2 fpmadd C3, A3, A3, C3 fpmadd C4, A4, A4, C4 .align 4 LL(126): andi. r0, N, 2 beq LL(127) LFDUX A1, XX, INCX2 LFDUX A2, X2, INCX2 LFDUX A3, XX, INCX2 LFDUX A4, X2, INCX2 fmul A1, ALPHA_R, A1 fmul A2, ALPHA_R, A2 fmul A3, ALPHA_R, A3 fmul A4, ALPHA_R, A4 fmadd C1, A1, A1, C1 fmadd C2, A2, A2, C2 fmadd C3, A3, A3, C3 fmadd C4, A4, A4, C4 .align 4 LL(127): andi. 
r0, N, 1 beq LL(998) LFDUX A1, XX, INCX2 LFDUX A2, X2, INCX2 fmul A1, ALPHA_R, A1 fmul A2, ALPHA_R, A2 fmadd C1, A1, A1, C1 fmadd C2, A2, A2, C2 .align 4 LL(998): fpadd C1, C1, C2 lis r3, 0x3f00 fpadd C3, C3, C4 lis r4, 0x4040 stw r3, 4(SP) stw r4, 8(SP) fpadd C1, C1, C3 lfs f10, 0(SP) fsmtp C2, C1 lfs f11, 4(SP) fadd C1, C2, C1 lfs f12, 8(SP) fcmpu cr0, f10, C1 beq cr0, LL(99) #ifndef HUMMER_EMULATOR frsqrte f9, C1 li r10, 16 fmul f2, f1, f9 lfpdux f29, SP, r10 fmul f3, f9, f11 lfpdux f28, SP, r10 fnmsub f7, f2, f9, f12 lfpdux f27, SP, r10 fmul f9, f3, f7 lfpdux f26, SP, r10 fadd f13, f11, f11 lfpdux f25, SP, r10 fmul f12, f1, f9 lfpdux f24, SP, r10 fmul f11, f12, f11 lfpdux f23, SP, r10 lfpdux f22, SP, r10 fnmsub f1, f12, f9, f13 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 fmadd f1, f11, f1, f12 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 fmul C1, ALPHA, C1 blr #else fsqrt C1, C1 li r10, 16 lfpdux f29, SP, r10 lfpdux f28, SP, r10 lfpdux f27, SP, r10 lfpdux f26, SP, r10 lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 fmul C1, ALPHA, C1 addi SP, SP, 16 blr #endif .align 4 LL(999): li r10, 16 lfpdux f29, SP, r10 lfpdux f28, SP, r10 lfpdux f27, SP, r10 lfpdux f26, SP, r10 lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 lfpdux f22, SP, r10 lfpdux f21, SP, r10 lfpdux f20, SP, r10 lfpdux f19, SP, r10 lfpdux f18, SP, r10 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/znrm2_ppc440.S000066400000000000000000000255311313527062700200540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define NN r6 #define XX r7 #define INC1 r9 #define PRE r10 #define FZERO 144(SP) #define FONE 148(SP) #define FMAX 152(SP) #define C1 156(SP) #define C2 160(SP) #define STACKSIZE 168 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r10, 0 lis r11, 0x3f80 lis r12, 0x5fe0 lis r6, 0x3f00 lis r7, 0x4040 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) stw r10, FZERO stw r11, FONE stw r12, FMAX stw r10, 4 + FMAX stw r6, C1 stw r7, C2 lfs f1, FZERO #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) #endif slwi INCX, INCX, ZBASE_SHIFT sub X, X, INCX li INC1, SIZE li PRE, 3 * 16 * SIZE cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 ble- LL(999) mr NN, N mr XX, X LFDUX f0, X, INCX LFDX f1, X, INC1 fabs f2, f0 fabs f3, f1 fabs f4, f0 fabs f5, f1 fabs f6, f0 fabs f7, f1 fabs f0, f0 fabs f1, f1 subi N, N, 1 srawi. r0, N, 3 mtspr CTR, r0 beq- LL(50) LFDUX f24, X, INCX LFDX f25, X, INC1 LFDUX f26, X, INCX LFDX f27, X, INC1 LFDUX f28, X, INCX LFDX f29, X, INC1 LFDUX f30, X, INCX LFDX f31, X, INC1 fabs f8, f24 LFDUX f24, X, INCX fabs f9, f25 LFDX f25, X, INC1 fabs f10, f26 LFDUX f26, X, INCX fabs f11, f27 LFDX f27, X, INC1 fabs f12, f28 LFDUX f28, X, INCX fabs f13, f29 LFDX f29, X, INC1 fabs f14, f30 LFDUX f30, X, INCX fabs f15, f31 LFDX f31, X, INC1 bdz LL(20) .align 4 LL(10): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 #ifdef PPCG4 dcbt X, PRE #endif fabs f8, f24 LFDUX f24, X, INCX fsel f1, f17, f1, f9 fabs f9, f25 LFDX f25, X, INC1 fsel f2, f18, f2, f10 fabs f10, f26 LFDUX f26, X, INCX fsel f3, f19, f3, f11 fabs f11, f27 LFDX f27, X, INC1 fsel f4, f20, f4, f12 #ifdef PPCG4 dcbt X, PRE #endif fabs f12, f28 LFDUX f28, X, INCX fsel f5, f21, f5, f13 fabs f13, f29 LFDX f29, X, INC1 fsel f6, f22, f6, f14 fabs f14, f30 LFDUX f30, X, INCX fsel f7, f23, f7, f15 fabs f15, f31 LFDX f31, X, INC1 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 #ifdef PPCG4 dcbt X, PRE #endif fabs f8, f24 LFDUX f24, X, INCX fsel f1, f17, f1, f9 fabs f9, f25 LFDX f25, X, INC1 fsel f2, f18, f2, f10 fabs f10, f26 LFDUX f26, X, INCX fsel f3, f19, f3, f11 fabs f11, f27 LFDX f27, X, INC1 fsel f4, f20, f4, f12 #ifdef PPCG4 dcbt X, PRE #endif fabs f12, f28 LFDUX f28, X, INCX fsel f5, f21, f5, f13 fabs f13, f29 LFDX f29, X, INC1 fsel f6, f22, f6, f14 fabs f14, f30 LFDUX f30, X, INCX fsel f7, f23, f7, f15 fabs f15, f31 LFDX f31, X, INC1 bdnz LL(10) .align 4 LL(20): fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fabs f8, f24 fsel f1, f17, f1, f9 fabs f9, f25 fsel f2, f18, f2, f10 fabs f10, f26 fsel f3, f19, f3, f11 fabs f11, f27 fsel f4, 
f20, f4, f12 fabs f12, f28 fsel f5, f21, f5, f13 fabs f13, f29 fsel f6, f22, f6, f14 fabs f14, f30 fsel f7, f23, f7, f15 fabs f15, f31 fsub f16, f0, f8 fsub f17, f1, f9 fsub f18, f2, f10 fsub f19, f3, f11 fsub f20, f4, f12 fsub f21, f5, f13 fsub f22, f6, f14 fsub f23, f7, f15 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 fsel f2, f18, f2, f10 fsel f3, f19, f3, f11 fsel f4, f20, f4, f12 fsel f5, f21, f5, f13 fsel f6, f22, f6, f14 fsel f7, f23, f7, f15 .align 4 LL(50): andi. r0, N, 7 mtspr CTR, r0 beq LL(99) .align 4 LL(60): LFDUX f8, X, INCX LFDX f9, X, INC1 fabs f8, f8 fabs f9, f9 fsub f16, f0, f8 fsub f17, f1, f9 fsel f0, f16, f0, f8 fsel f1, f17, f1, f9 bdnz LL(60) .align 4 LL(99): fsub f8, f0, f1 fsub f9, f2, f3 fsub f10, f4, f5 fsub f11, f6, f7 fsel f0, f8, f0, f1 fsel f2, f9, f2, f3 fsel f4, f10, f4, f5 fsel f6, f11, f6, f7 fsub f8, f0, f2 fsub f9, f4, f6 fsel f0, f8, f0, f2 fsel f4, f9, f4, f6 fsub f8, f0, f4 fsel f31, f8, f0, f4 lfs f1, FZERO lfs f0, FONE fcmpu cr0, f1, f31 beq- cr0, LL(999) fdiv f30, f0, f31 fmr f0, f1 fmr f2, f1 fmr f3, f1 fmr f4, f1 fmr f5, f1 fmr f6, f1 fmr f7, f1 srawi. r0, NN, 3 mtspr CTR, r0 beq- cr0, LL(150) LFDUX f8, XX, INCX LFDX f9, XX, INC1 LFDUX f10, XX, INCX LFDX f11, XX, INC1 LFDUX f12, XX, INCX LFDX f13, XX, INC1 LFDUX f14, XX, INCX LFDX f15, XX, INC1 fmul f16, f30, f8 LFDUX f8, XX, INCX fmul f17, f30, f9 LFDX f9, XX, INC1 fmul f18, f30, f10 LFDUX f10, XX, INCX fmul f19, f30, f11 LFDX f11, XX, INC1 fmul f20, f30, f12 LFDUX f12, XX, INCX fmul f21, f30, f13 LFDX f13, XX, INC1 fmul f22, f30, f14 LFDUX f14, XX, INCX fmul f23, f30, f15 LFDX f15, XX, INC1 bdz LL(120) .align 4 LL(110): fmadd f0, f16, f16, f0 #ifdef PPCG4 dcbt XX, PRE #endif fmul f16, f30, f8 LFDUX f8, XX, INCX fmadd f1, f17, f17, f1 fmul f17, f30, f9 LFDX f9, XX, INC1 fmadd f2, f18, f18, f2 fmul f18, f30, f10 LFDUX f10, XX, INCX fmadd f3, f19, f19, f3 fmul f19, f30, f11 LFDX f11, XX, INC1 fmadd f4, f20, f20, f4 #ifdef PPCG4 dcbt XX, PRE #endif fmul f20, f30, f12 LFDUX f12, XX, INCX fmadd f5, f21, f21, f5 fmul f21, f30, f13 LFDX f13, XX, INC1 fmadd f6, f22, f22, f6 fmul f22, f30, f14 LFDUX f14, XX, INCX fmadd f7, f23, f23, f7 fmul f23, f30, f15 LFDX f15, XX, INC1 fmadd f0, f16, f16, f0 #ifdef PPCG4 dcbt XX, PRE #endif fmul f16, f30, f8 LFDUX f8, XX, INCX fmadd f1, f17, f17, f1 fmul f17, f30, f9 LFDX f9, XX, INC1 fmadd f2, f18, f18, f2 fmul f18, f30, f10 LFDUX f10, XX, INCX fmadd f3, f19, f19, f3 fmul f19, f30, f11 LFDX f11, XX, INC1 fmadd f4, f20, f20, f4 #ifdef PPCG4 dcbt XX, PRE #endif fmul f20, f30, f12 LFDUX f12, XX, INCX fmadd f5, f21, f21, f5 fmul f21, f30, f13 LFDX f13, XX, INC1 fmadd f6, f22, f22, f6 fmul f22, f30, f14 LFDUX f14, XX, INCX fmadd f7, f23, f23, f7 fmul f23, f30, f15 LFDX f15, XX, INC1 bdnz LL(110) .align 4 LL(120): fmadd f0, f16, f16, f0 fmul f16, f30, f8 fmadd f1, f17, f17, f1 fmul f17, f30, f9 fmadd f2, f18, f18, f2 fmul f18, f30, f10 fmadd f3, f19, f19, f3 fmul f19, f30, f11 fmadd f4, f20, f20, f4 fmul f20, f30, f12 fmadd f5, f21, f21, f5 fmul f21, f30, f13 fmadd f6, f22, f22, f6 fmul f22, f30, f14 fmadd f7, f23, f23, f7 fmul f23, f30, f15 fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 fmadd f2, f18, f18, f2 fmadd f3, f19, f19, f3 fmadd f4, f20, f20, f4 fmadd f5, f21, f21, f5 fmadd f6, f22, f22, f6 fmadd f7, f23, f23, f7 .align 4 LL(150): andi. 
r0, NN, 7 mtspr CTR, r0 beq- cr0, LL(170) .align 4 LL(160): LFDUX f8, XX, INCX LFDX f9, XX, INC1 fmul f16, f30, f8 fmul f17, f30, f9 fmadd f0, f16, f16, f0 fmadd f1, f17, f17, f1 bdnz LL(160) .align 4 LL(170): fadd f0, f0, f1 fadd f2, f2, f3 fadd f4, f4, f5 fadd f6, f6, f7 fadd f0, f0, f2 fadd f4, f4, f6 fadd f1, f0, f4 frsqrte f0, f1 lfs f8, C1 lfs f9, C2 fmul f2, f1, f0 fadd f7, f8, f8 fmul f3, f0, f8 fnmsub f4, f2, f0, f9 fmul f0, f3, f4 fmul f2, f1, f0 fmul f3, f0, f8 fnmsub f4, f2, f0, f9 fmul f0, f3, f4 fmul f2, f1, f0 fmul f3, f0, f8 fnmsub f4, f2, f0, f9 fmul f0, f3, f4 fmul f5, f1, f0 fmul f2, f5, f8 fnmsub f3, f5, f0, f7 fmadd f1, f2, f3, f5 fmul f1, f31, f1 .align 4 LL(999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zrot.S000066400000000000000000000274331313527062700167130ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define Y r6 #define INCY r7 #define PREA r8 #define XX r9 #define YY r10 #define INCXM1 r11 #define INCYM1 r12 #define C f1 #define S f2 #define STACKSIZE 32 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) slwi INCX, INCX, ZBASE_SHIFT slwi INCY, INCY, ZBASE_SHIFT subi INCXM1, INCX, SIZE subi INCYM1, INCY, SIZE li PREA, L1_PREFETCHSIZE cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 2 * SIZE bne- cr0, LL(100) cmpwi cr0, INCY, 2 * SIZE bne- cr0, LL(100) srawi. r0, N, 3 mtspr CTR, r0 beq- cr0, LL(50) LFD f0, 0 * SIZE(X) LFD f4, 1 * SIZE(X) LFD f6, 2 * SIZE(X) LFD f8, 3 * SIZE(X) LFD f3, 0 * SIZE(Y) LFD f5, 1 * SIZE(Y) LFD f7, 2 * SIZE(Y) LFD f9, 3 * SIZE(Y) bdz LL(12) .align 4 LL(10): FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMUL f14, C, f6 FMUL f15, C, f7 FMUL f16, C, f8 FMUL f17, C, f9 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMADD f14, S, f7, f14 FNMSUB f15, S, f6, f15 FMADD f16, S, f9, f16 FNMSUB f17, S, f8, f17 LFD f0, 4 * SIZE(X) LFD f4, 5 * SIZE(X) LFD f6, 6 * SIZE(X) LFD f8, 7 * SIZE(X) LFD f3, 4 * SIZE(Y) LFD f5, 5 * SIZE(Y) LFD f7, 6 * SIZE(Y) LFD f9, 7 * SIZE(Y) STFD f10, 0 * SIZE(X) STFD f12, 1 * SIZE(X) STFD f14, 2 * SIZE(X) STFD f16, 3 * SIZE(X) STFD f11, 0 * SIZE(Y) STFD f13, 1 * SIZE(Y) STFD f15, 2 * SIZE(Y) STFD f17, 3 * SIZE(Y) FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMUL f14, C, f6 FMUL f15, C, f7 FMUL f16, C, f8 FMUL f17, C, f9 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMADD f14, S, f7, f14 FNMSUB f15, S, f6, f15 FMADD f16, S, f9, f16 FNMSUB f17, S, f8, f17 LFD f0, 8 * SIZE(X) LFD f4, 9 * SIZE(X) LFD f6, 10 * SIZE(X) LFD f8, 11 * SIZE(X) LFD f3, 8 * SIZE(Y) LFD f5, 9 * SIZE(Y) LFD f7, 10 * SIZE(Y) LFD f9, 11 * SIZE(Y) STFD f10, 4 * SIZE(X) STFD f12, 5 * SIZE(X) STFD f14, 6 * SIZE(X) STFD f16, 7 * SIZE(X) STFD f11, 4 * SIZE(Y) STFD f13, 5 * SIZE(Y) STFD f15, 6 * SIZE(Y) STFD f17, 7 * SIZE(Y) FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMUL f14, C, f6 FMUL f15, C, f7 FMUL f16, C, f8 FMUL f17, C, f9 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMADD f14, S, f7, f14 FNMSUB f15, S, f6, f15 FMADD f16, S, f9, f16 FNMSUB f17, S, f8, f17 LFD f0, 12 * SIZE(X) LFD f4, 13 * SIZE(X) LFD f6, 14 * SIZE(X) LFD f8, 15 * SIZE(X) LFD f3, 12 * SIZE(Y) LFD f5, 13 * SIZE(Y) LFD f7, 14 * SIZE(Y) LFD f9, 15 * SIZE(Y) STFD f10, 8 * SIZE(X) STFD f12, 9 * SIZE(X) STFD f14, 10 * SIZE(X) STFD f16, 11 * SIZE(X) STFD f11, 8 * SIZE(Y) STFD f13, 9 * SIZE(Y) STFD f15, 10 * SIZE(Y) STFD f17, 11 * SIZE(Y) FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMUL f14, C, f6 FMUL f15, C, f7 FMUL f16, C, f8 FMUL f17, C, f9 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMADD f14, S, f7, f14 FNMSUB f15, S, f6, f15 FMADD f16, S, f9, f16 FNMSUB f17, S, f8, f17 LFD f0, 16 * SIZE(X) LFD f4, 17 * SIZE(X) LFD f6, 18 * SIZE(X) LFD f8, 19 * SIZE(X) LFD f3, 16 * SIZE(Y) LFD f5, 17 * SIZE(Y) LFD f7, 18 * SIZE(Y) LFD f9, 19 * SIZE(Y) STFD f10, 12 * SIZE(X) STFD f12, 13 * SIZE(X) STFD f14, 14 * SIZE(X) STFD f16, 15 * SIZE(X) STFD f11, 12 * SIZE(Y) STFD f13, 13 * SIZE(Y) STFD f15, 14 * SIZE(Y) STFD f17, 15 * SIZE(Y) #ifndef POWER6 
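/* Prefetch for the next unrolled pass: PREA was loaded with L1_PREFETCHSIZE in the  */
/* prologue, so dcbtst X, PREA touches the cache block PREA bytes ahead of X with    */
/* intent to store, making the upcoming LFD/STFD accesses to that region more likely */
/* to hit L1. On POWER6 the same hint is instead issued twice after the pointer      */
/* updates (see the #ifdef POWER6 block just below).                                 */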
dcbtst X, PREA #endif addi X, X, 16 * SIZE addi Y, Y, 16 * SIZE #ifdef POWER6 dcbtst X, PREA dcbtst X, PREA #endif bdnz LL(10) .align 4 LL(12): FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMUL f14, C, f6 FMUL f15, C, f7 FMUL f16, C, f8 FMUL f17, C, f9 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMADD f14, S, f7, f14 FNMSUB f15, S, f6, f15 FMADD f16, S, f9, f16 FNMSUB f17, S, f8, f17 STFD f10, 0 * SIZE(X) STFD f12, 1 * SIZE(X) STFD f14, 2 * SIZE(X) STFD f16, 3 * SIZE(X) STFD f11, 0 * SIZE(Y) STFD f13, 1 * SIZE(Y) STFD f15, 2 * SIZE(Y) STFD f17, 3 * SIZE(Y) LFD f0, 4 * SIZE(X) LFD f4, 5 * SIZE(X) LFD f6, 6 * SIZE(X) LFD f8, 7 * SIZE(X) LFD f3, 4 * SIZE(Y) LFD f5, 5 * SIZE(Y) LFD f7, 6 * SIZE(Y) LFD f9, 7 * SIZE(Y) FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMUL f14, C, f6 FMUL f15, C, f7 FMUL f16, C, f8 FMUL f17, C, f9 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMADD f14, S, f7, f14 FNMSUB f15, S, f6, f15 FMADD f16, S, f9, f16 FNMSUB f17, S, f8, f17 STFD f10, 4 * SIZE(X) STFD f12, 5 * SIZE(X) STFD f14, 6 * SIZE(X) STFD f16, 7 * SIZE(X) STFD f11, 4 * SIZE(Y) STFD f13, 5 * SIZE(Y) STFD f15, 6 * SIZE(Y) STFD f17, 7 * SIZE(Y) LFD f0, 8 * SIZE(X) LFD f4, 9 * SIZE(X) LFD f6, 10 * SIZE(X) LFD f8, 11 * SIZE(X) LFD f3, 8 * SIZE(Y) LFD f5, 9 * SIZE(Y) LFD f7, 10 * SIZE(Y) LFD f9, 11 * SIZE(Y) FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMUL f14, C, f6 FMUL f15, C, f7 FMUL f16, C, f8 FMUL f17, C, f9 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMADD f14, S, f7, f14 FNMSUB f15, S, f6, f15 FMADD f16, S, f9, f16 FNMSUB f17, S, f8, f17 STFD f10, 8 * SIZE(X) STFD f12, 9 * SIZE(X) STFD f14, 10 * SIZE(X) STFD f16, 11 * SIZE(X) STFD f11, 8 * SIZE(Y) STFD f13, 9 * SIZE(Y) STFD f15, 10 * SIZE(Y) STFD f17, 11 * SIZE(Y) LFD f0, 12 * SIZE(X) LFD f4, 13 * SIZE(X) LFD f6, 14 * SIZE(X) LFD f8, 15 * SIZE(X) LFD f3, 12 * SIZE(Y) LFD f5, 13 * SIZE(Y) LFD f7, 14 * SIZE(Y) LFD f9, 15 * SIZE(Y) FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMUL f14, C, f6 FMUL f15, C, f7 FMUL f16, C, f8 FMUL f17, C, f9 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMADD f14, S, f7, f14 FNMSUB f15, S, f6, f15 FMADD f16, S, f9, f16 FNMSUB f17, S, f8, f17 STFD f10, 12 * SIZE(X) STFD f12, 13 * SIZE(X) STFD f14, 14 * SIZE(X) STFD f16, 15 * SIZE(X) STFD f11, 12 * SIZE(Y) STFD f13, 13 * SIZE(Y) STFD f15, 14 * SIZE(Y) STFD f17, 15 * SIZE(Y) addi X, X, 16 * SIZE addi Y, Y, 16 * SIZE .align 4 LL(50): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f3, 0 * SIZE(X) LFD f4, 0 * SIZE(Y) LFD f5, 1 * SIZE(X) LFD f6, 1 * SIZE(Y) FMUL f10, C, f3 FMUL f11, C, f4 FMUL f12, C, f5 FMUL f13, C, f6 FMADD f10, S, f4, f10 FNMSUB f11, S, f3, f11 FMADD f12, S, f6, f12 FNMSUB f13, S, f5, f13 STFD f10, 0 * SIZE(X) STFD f11, 0 * SIZE(Y) STFD f12, 1 * SIZE(X) STFD f13, 1 * SIZE(Y) addi X, X, 2 * SIZE addi Y, Y, 2 * SIZE bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCXM1 sub Y, Y, INCYM1 mr XX, X mr YY, Y srawi. 
r0, N, 2 mtspr CTR, r0 beq- LL(150) .align 4 LL(110): LFDX f0, X, INCXM1 LFDX f3, Y, INCYM1 LFDUX f4, X, INCX LFDUX f5, Y, INCY LFDX f6, X, INCXM1 LFDX f7, Y, INCYM1 LFDUX f8, X, INCX LFDUX f9, Y, INCY FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMUL f14, C, f6 FMUL f15, C, f7 FMUL f16, C, f8 FMUL f17, C, f9 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMADD f14, S, f7, f14 FNMSUB f15, S, f6, f15 FMADD f16, S, f9, f16 FNMSUB f17, S, f8, f17 STFDX f10, XX, INCXM1 STFDX f11, YY, INCYM1 STFDUX f12, XX, INCX STFDUX f13, YY, INCY STFDX f14, XX, INCXM1 STFDX f15, YY, INCYM1 STFDUX f16, XX, INCX STFDUX f17, YY, INCY LFDX f0, X, INCXM1 LFDX f3, Y, INCYM1 LFDUX f4, X, INCX LFDUX f5, Y, INCY LFDX f6, X, INCXM1 LFDX f7, Y, INCYM1 LFDUX f8, X, INCX LFDUX f9, Y, INCY FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMUL f14, C, f6 FMUL f15, C, f7 FMUL f16, C, f8 FMUL f17, C, f9 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMADD f14, S, f7, f14 FNMSUB f15, S, f6, f15 FMADD f16, S, f9, f16 FNMSUB f17, S, f8, f17 STFDX f10, XX, INCXM1 STFDX f11, YY, INCYM1 STFDUX f12, XX, INCX STFDUX f13, YY, INCY STFDX f14, XX, INCXM1 STFDX f15, YY, INCYM1 STFDUX f16, XX, INCX STFDUX f17, YY, INCY bdnz LL(110) .align 4 LL(150): andi. r0, N, 3 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDX f0, X, INCXM1 LFDX f3, Y, INCYM1 LFDUX f4, X, INCX LFDUX f5, Y, INCY FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 STFDX f10, XX, INCXM1 STFDX f11, YY, INCYM1 STFDUX f12, XX, INCX STFDUX f13, YY, INCY bdnz LL(160) .align 4 LL(999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/zrot_ppc440.S000066400000000000000000000156241313527062700200040ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r4 #define INCX r5 #define Y r6 #define INCY r7 #define PRE r8 #define XX r9 #define YY r10 #define INCXM1 r11 #define INCYM1 r12 #define C f1 #define S f2 #define STACKSIZE 32 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) slwi INCX, INCX, ZBASE_SHIFT slwi INCY, INCY, ZBASE_SHIFT subi INCXM1, INCX, SIZE subi INCYM1, INCY, SIZE li PRE, 2 * 16 * SIZE cmpwi cr0, N, 0 ble- LL(999) sub X, X, INCXM1 sub Y, Y, INCYM1 mr XX, X mr YY, Y srawi. r0, N, 2 mtspr CTR, r0 beq- LL(150) LFDX f0, X, INCXM1 LFDX f3, Y, INCYM1 LFDUX f4, X, INCX FMUL f10, C, f0 LFDUX f5, Y, INCY FMUL f11, C, f3 LFDX f6, X, INCXM1 FMUL f12, C, f4 LFDX f7, Y, INCYM1 FMUL f13, C, f5 LFDUX f8, X, INCX FMADD f10, S, f3, f10 LFDUX f9, Y, INCY FNMSUB f11, S, f0, f11 LFDX f0, X, INCXM1 FMADD f12, S, f5, f12 LFDX f3, Y, INCYM1 FNMSUB f13, S, f4, f13 LFDUX f4, X, INCX bdz LL(111) .align 4 LL(110): FMUL f14, C, f6 LFDUX f5, Y, INCY FMUL f15, C, f7 STFDX f10, XX, INCXM1 FMUL f16, C, f8 STFDX f11, YY, INCYM1 FMUL f17, C, f9 STFDUX f12, XX, INCX #ifdef PPCG4 dcbtst X, PRE #endif FMADD f14, S, f7, f14 STFDUX f13, YY, INCY FNMSUB f15, S, f6, f15 LFDX f6, X, INCXM1 FMADD f16, S, f9, f16 LFDX f7, Y, INCYM1 FNMSUB f17, S, f8, f17 LFDUX f8, X, INCX FMUL f10, C, f0 LFDUX f9, Y, INCY FMUL f11, C, f3 STFDX f14, XX, INCXM1 FMUL f12, C, f4 STFDX f15, YY, INCYM1 FMUL f13, C, f5 STFDUX f16, XX, INCX #ifdef PPCG4 dcbtst Y, PRE #endif FMADD f10, S, f3, f10 STFDUX f17, YY, INCY FNMSUB f11, S, f0, f11 LFDX f0, X, INCXM1 FMADD f12, S, f5, f12 LFDX f3, Y, INCYM1 FNMSUB f13, S, f4, f13 LFDUX f4, X, INCX FMUL f14, C, f6 LFDUX f5, Y, INCY FMUL f15, C, f7 STFDX f10, XX, INCXM1 FMUL f16, C, f8 STFDX f11, YY, INCYM1 FMUL f17, C, f9 STFDUX f12, XX, INCX #if defined(PPCG4) && defined(DOUBLE) dcbt X, PRE #endif FMADD f14, S, f7, f14 STFDUX f13, YY, INCY FNMSUB f15, S, f6, f15 LFDX f6, X, INCXM1 FMADD f16, S, f9, f16 LFDX f7, Y, INCYM1 FNMSUB f17, S, f8, f17 LFDUX f8, X, INCX FMUL f10, C, f0 STFDX f14, XX, INCXM1 FMUL f11, C, f3 STFDX f15, YY, INCYM1 FMUL f12, C, f4 STFDUX f16, XX, INCX FMUL f13, C, f5 STFDUX f17, YY, INCY #if defined(PPCG4) && defined(DOUBLE) dcbtst Y, PRE #endif FMADD f10, S, f3, f10 LFDUX f9, Y, INCY FNMSUB f11, S, f0, f11 LFDX f0, X, INCXM1 FMADD f12, S, f5, f12 LFDX f3, Y, INCYM1 FNMSUB f13, S, f4, f13 LFDUX f4, X, INCX bdnz LL(110) .align 4 LL(111): FMUL f14, C, f6 LFDUX f5, Y, INCY FMUL f15, C, f7 STFDX f10, XX, INCXM1 FMUL f16, C, f8 STFDX f11, YY, INCYM1 FMUL f17, C, f9 STFDUX f12, XX, INCX FMADD f14, S, f7, f14 STFDUX f13, YY, INCY FNMSUB f15, S, f6, f15 LFDX f6, X, INCXM1 FMADD f16, S, f9, f16 LFDX f7, Y, INCYM1 FNMSUB f17, S, f8, f17 LFDUX f8, X, INCX FMUL f10, C, f0 LFDUX f9, Y, INCY FMUL f11, C, f3 STFDX f14, XX, INCXM1 FMUL f12, C, f4 STFDX f15, YY, INCYM1 FMUL f13, C, f5 STFDUX f16, XX, INCX FMADD f10, S, f3, f10 STFDUX f17, YY, INCY FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 FMUL f14, C, f6 STFDX f10, XX, INCXM1 FMUL f15, C, f7 STFDX f11, YY, INCYM1 FMUL f16, C, f8 STFDUX f12, XX, INCX FMUL f17, C, f9 STFDUX f13, YY, INCY FMADD f14, S, f7, f14 FNMSUB f15, 
S, f6, f15 FMADD f16, S, f9, f16 FNMSUB f17, S, f8, f17 STFDX f14, XX, INCXM1 STFDX f15, YY, INCYM1 STFDUX f16, XX, INCX STFDUX f17, YY, INCY .align 4 LL(150): andi. r0, N, 3 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDX f0, X, INCXM1 LFDX f3, Y, INCYM1 LFDUX f4, X, INCX LFDUX f5, Y, INCY FMUL f10, C, f0 FMUL f11, C, f3 FMUL f12, C, f4 FMUL f13, C, f5 FMADD f10, S, f3, f10 FNMSUB f11, S, f0, f11 FMADD f12, S, f5, f12 FNMSUB f13, S, f4, f13 STFDX f10, XX, INCXM1 STFDX f11, YY, INCYM1 STFDUX f12, XX, INCX STFDUX f13, YY, INCY bdnz LL(160) .align 4 LL(999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zscal.S000066400000000000000000000210301313527062700170140ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define XX r4 #define PREA r5 #ifdef linux #ifndef __64BIT__ #define X r6 #define INCX r7 #else #define X r8 #define INCX r9 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define X r10 #define INCX r8 #else #define X r8 #define INCX r9 #endif #endif #define FZERO f0 #define ALPHA_R f1 #define ALPHA_I f2 PROLOGUE PROFCODE addi SP, SP, -8 li r0, 0 stw r0, 0(SP) lfs FZERO, 0(SP) addi SP, SP, 8 #if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) lwz INCX, 56(SP) #endif slwi INCX, INCX, ZBASE_SHIFT li PREA, L1_PREFETCHSIZE cmpwi cr0, N, 0 blelr- cr0 fcmpu cr0, FZERO, ALPHA_R bne- cr0, LL(A1I1) fcmpu cr0, FZERO, ALPHA_I bne- cr0, LL(A1I1) cmpwi cr0, INCX, 2 * SIZE bne- cr0, LL(A0IN) srawi. 
r0, N, 3 mtspr CTR, r0 beq- cr0, LL(A0I1_Remain) .align 4 LL(A0I1_kernel): STFD FZERO, 0 * SIZE(X) STFD FZERO, 1 * SIZE(X) STFD FZERO, 2 * SIZE(X) STFD FZERO, 3 * SIZE(X) STFD FZERO, 4 * SIZE(X) STFD FZERO, 5 * SIZE(X) STFD FZERO, 6 * SIZE(X) STFD FZERO, 7 * SIZE(X) STFD FZERO, 8 * SIZE(X) STFD FZERO, 9 * SIZE(X) STFD FZERO, 10 * SIZE(X) STFD FZERO, 11 * SIZE(X) STFD FZERO, 12 * SIZE(X) STFD FZERO, 13 * SIZE(X) STFD FZERO, 14 * SIZE(X) STFD FZERO, 15 * SIZE(X) addi X, X, 16 * SIZE bdnz LL(A0I1_kernel) .align 4 LL(A0I1_Remain): andi. r0, N, 7 mtspr CTR, r0 beqlr+ .align 4 LL(A0I1_RemainKernel): STFD FZERO, 0 * SIZE(X) STFD FZERO, 1 * SIZE(X) addi X, X, 2 * SIZE bdnz LL(A0I1_RemainKernel) blr .align 4 LL(A0IN): srawi. r0, N, 3 mtspr CTR, r0 beq- LL(A0IN_Remain) .align 4 LL(A0IN_Kernel): dcbtst X, PREA STFD FZERO, 0 * SIZE(X) STFD FZERO, 1 * SIZE(X) add X, X, INCX STFD FZERO, 0 * SIZE(X) STFD FZERO, 1 * SIZE(X) add X, X, INCX STFD FZERO, 0 * SIZE(X) STFD FZERO, 1 * SIZE(X) add X, X, INCX STFD FZERO, 0 * SIZE(X) STFD FZERO, 1 * SIZE(X) add X, X, INCX STFD FZERO, 0 * SIZE(X) STFD FZERO, 1 * SIZE(X) add X, X, INCX STFD FZERO, 0 * SIZE(X) STFD FZERO, 1 * SIZE(X) add X, X, INCX STFD FZERO, 0 * SIZE(X) STFD FZERO, 1 * SIZE(X) add X, X, INCX STFD FZERO, 0 * SIZE(X) STFD FZERO, 1 * SIZE(X) add X, X, INCX bdnz LL(A0IN_Kernel) .align 4 LL(A0IN_Remain): andi. r0, N, 7 mtspr CTR, r0 beqlr+ .align 4 LL(A0IN_RemainKernel): STFD FZERO, 0 * SIZE(X) STFD FZERO, 1 * SIZE(X) add X, X, INCX bdnz LL(A0IN_RemainKernel) blr .align 4 LL(A1I1): cmpwi cr0, INCX, 2 * SIZE bne- LL(A1IN) mr XX, X srawi. r0, N, 3 mtspr CTR, r0 beq+ LL(A1I1_Remain) .align 4 LL(A1I1_kernel): LFD f3, 0 * SIZE(X) LFD f4, 1 * SIZE(X) LFD f5, 2 * SIZE(X) LFD f6, 3 * SIZE(X) LFD f7, 4 * SIZE(X) LFD f8, 5 * SIZE(X) LFD f9, 6 * SIZE(X) LFD f10, 7 * SIZE(X) FMUL f0, ALPHA_I, f4 FMUL f4, ALPHA_R, f4 FMUL f11, ALPHA_I, f6 FMUL f6, ALPHA_R, f6 FMUL f12, ALPHA_I, f8 FMUL f8, ALPHA_R, f8 FMUL f13, ALPHA_I, f10 FMUL f10, ALPHA_R, f10 FMADD f4, ALPHA_I, f3, f4 FMSUB f3, ALPHA_R, f3, f0 FMADD f6, ALPHA_I, f5, f6 FMSUB f5, ALPHA_R, f5, f11 FMADD f8, ALPHA_I, f7, f8 FMSUB f7, ALPHA_R, f7, f12 FMADD f10, ALPHA_I, f9, f10 FMSUB f9, ALPHA_R, f9, f13 STFD f3, 0 * SIZE(X) STFD f4, 1 * SIZE(X) STFD f5, 2 * SIZE(X) STFD f6, 3 * SIZE(X) STFD f7, 4 * SIZE(X) STFD f8, 5 * SIZE(X) STFD f9, 6 * SIZE(X) STFD f10, 7 * SIZE(X) LFD f3, 8 * SIZE(X) LFD f4, 9 * SIZE(X) LFD f5, 10 * SIZE(X) LFD f6, 11 * SIZE(X) LFD f7, 12 * SIZE(X) LFD f8, 13 * SIZE(X) LFD f9, 14 * SIZE(X) LFD f10,15 * SIZE(X) FMUL f0, ALPHA_I, f4 FMUL f4, ALPHA_R, f4 FMUL f11, ALPHA_I, f6 FMUL f6, ALPHA_R, f6 FMUL f12, ALPHA_I, f8 FMUL f8, ALPHA_R, f8 FMUL f13, ALPHA_I, f10 FMUL f10, ALPHA_R, f10 FMADD f4, ALPHA_I, f3, f4 FMSUB f3, ALPHA_R, f3, f0 FMADD f6, ALPHA_I, f5, f6 FMSUB f5, ALPHA_R, f5, f11 FMADD f8, ALPHA_I, f7, f8 FMSUB f7, ALPHA_R, f7, f12 FMADD f10, ALPHA_I, f9, f10 FMSUB f9, ALPHA_R, f9, f13 STFD f3, 8 * SIZE(X) STFD f4, 9 * SIZE(X) STFD f5, 10 * SIZE(X) STFD f6, 11 * SIZE(X) STFD f7, 12 * SIZE(X) STFD f8, 13 * SIZE(X) STFD f9, 14 * SIZE(X) STFD f10,15 * SIZE(X) addi X, X, 16 * SIZE dcbtst X, PREA bdnz LL(A1I1_kernel) .align 4 LL(A1I1_Remain): andi. r0, N, 7 mtspr CTR, r0 beqlr+ .align 4 LL(A1I1_RemainKernel): LFD f3, 0 * SIZE(X) LFD f4, 1 * SIZE(X) FMUL f5, ALPHA_I, f4 FMUL f4, ALPHA_R, f4 FMADD f4, ALPHA_I, f3, f4 FMSUB f3, ALPHA_R, f3, f5 STFD f3, 0 * SIZE(X) STFD f4, 1 * SIZE(X) addi X, X, 2 * SIZE bdnz LL(A1I1_RemainKernel) blr .align 4 LL(A1IN): mr XX, X srawi. 
r0, N, 2 mtspr CTR, r0 beq- LL(A1IN_Remain) .align 4 LL(A1IN_Kernel): LFD f3, 0 * SIZE(XX) LFD f4, 1 * SIZE(XX) add XX, XX, INCX LFD f5, 0 * SIZE(XX) LFD f6, 1 * SIZE(XX) add XX, XX, INCX LFD f7, 0 * SIZE(XX) LFD f8, 1 * SIZE(XX) add XX, XX, INCX LFD f9, 0 * SIZE(XX) LFD f10, 1 * SIZE(XX) add XX, XX, INCX FMUL f0, ALPHA_I, f4 FMUL f4, ALPHA_R, f4 FMUL f11, ALPHA_I, f6 FMUL f6, ALPHA_R, f6 FMUL f12, ALPHA_I, f8 FMUL f8, ALPHA_R, f8 FMUL f13, ALPHA_I, f10 FMUL f10, ALPHA_R, f10 FMADD f4, ALPHA_I, f3, f4 FMSUB f3, ALPHA_R, f3, f0 FMADD f6, ALPHA_I, f5, f6 FMSUB f5, ALPHA_R, f5, f11 FMADD f8, ALPHA_I, f7, f8 FMSUB f7, ALPHA_R, f7, f12 FMADD f10, ALPHA_I, f9, f10 FMSUB f9, ALPHA_R, f9, f13 STFD f3, 0 * SIZE(X) STFD f4, 1 * SIZE(X) add X, X, INCX STFD f5, 0 * SIZE(X) STFD f6, 1 * SIZE(X) add X, X, INCX STFD f7, 0 * SIZE(X) STFD f8, 1 * SIZE(X) add X, X, INCX STFD f9, 0 * SIZE(X) STFD f10, 1 * SIZE(X) add X, X, INCX dcbtst X, PREA bdnz LL(A1IN_Kernel) .align 4 LL(A1IN_Remain): andi. r0, N, 3 mtspr CTR, r0 beqlr+ .align 4 LL(A1IN_RemainKernel): LFD f3, 0 * SIZE(XX) LFD f4, 1 * SIZE(XX) add XX, XX, INCX FMUL f5, ALPHA_I, f4 FMUL f4, ALPHA_R, f4 FMADD f4, ALPHA_I, f3, f4 FMSUB f3, ALPHA_R, f3, f5 STFD f3, 0 * SIZE(X) STFD f4, 1 * SIZE(X) add X, X, INCX bdnz LL(A1IN_RemainKernel) blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zscal.c000066400000000000000000000103751313527062700170460ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/03/27 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #pragma GCC optimize "O1" #if defined(POWER8) #if defined(DOUBLE) #include "zscal_microk_power8.c" #endif #endif #ifndef HAVE_KERNEL_8 static void zscal_kernel_8(BLASLONG n, FLOAT *x, FLOAT da_r, FLOAT da_i) { BLASLONG i=0; FLOAT *x1=x; FLOAT alpha_r1=da_r; FLOAT alpha_r2=da_r; FLOAT alpha_i1=-da_i; FLOAT alpha_i2=da_i; FLOAT temp00, temp01, temp10, temp11, temp20, temp21, temp30, temp31; FLOAT x0_r, x0_i, x1_r, x1_i, x2_r, x2_i, x3_r, x3_i; while ( i 0 ) { zscal_kernel_8(n1, x, da_r, da_i); i=n1; ip = n1 * 2; } while ( i < n ) { temp = da_r * x[ip] - da_i * x[ip+1] ; x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; x[ip] = temp; ip += 2; i++; } } else { inc_x2 = 2 * inc_x; while ( i < n ) { temp = da_r * x[ip] - da_i * x[ip+1] ; x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; x[ip] = temp; ip += inc_x2; i++; } } return(0); } OpenBLAS-0.2.20/kernel/power/zscal_hummer.S000066400000000000000000000372761313527062700204140ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r6 #define INCX r7 #define INCX2 r4 #define XX r5 #define Y r8 #define YY r9 #define ALPHA f1 #define ALPHA_I f2 #define A1 f0 #define A2 f16 #define A3 f17 #define A4 f3 #define A5 f4 #define A6 f5 #define A7 f6 #define A8 f7 #define B1 f8 #define B2 f9 #define B3 f10 #define B4 f11 #define B5 f12 #define B6 f13 #define B7 f14 #define B8 f15 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 li r10, 0 stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) stwu r10, -4(SP) lfpdx A1, SP, r10 # Zero clear fsmfp ALPHA, ALPHA_I slwi INCX, INCX, BASE_SHIFT add INCX2, INCX, INCX cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, INCX, SIZE bne LL(100) fcmpu cr7, ALPHA, A1 bne cr7, LL(50) fscmp cr7, ALPHA, A1 bne cr7, LL(50) andi. r0, X, 2 * SIZE - 1 bne LL(20) sub X, X, INCX2 srawi. r0, N, 2 mtspr CTR, r0 beq- LL(15) .align 4 LL(12): STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 bdnz LL(12) .align 4 LL(15): andi. r0, N, 3 beq LL(999) andi. r0, N, 2 beq LL(17) STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 .align 4 LL(17): andi. r0, N, 1 beq LL(999) STFPDUX A1, X, INCX2 b LL(999) .align 4 LL(20): sub X, X, INCX2 STFDX A1, X, INCX2 addi X, X, SIZE addi N, N, -1 cmpwi cr0, N, 0 ble LL(29) srawi. r0, N, 2 mtspr CTR, r0 beq- LL(25) .align 4 LL(22): STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 bdnz LL(22) .align 4 LL(25): andi. r0, N, 3 beq LL(29) andi. r0, N, 2 beq LL(27) STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 .align 4 LL(27): andi. r0, N, 1 beq LL(29) STFPDUX A1, X, INCX2 .align 4 LL(29): STFDX A1, X, INCX2 b LL(999) .align 4 LL(50): sub Y, X, INCX2 sub X, X, INCX2 andi. r0, X, 2 * SIZE - 1 bne LL(60) srawi. r0, N, 3 mtspr CTR, r0 beq- LL(55) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 fxpmul B1, ALPHA, A1 LFPDUX A6, X, INCX2 fxpmul B2, ALPHA, A2 LFPDUX A7, X, INCX2 fxpmul B3, ALPHA, A3 LFPDUX A8, X, INCX2 fxpmul B4, ALPHA, A4 fxpmul B5, ALPHA, A5 fxcxnpma B1, ALPHA, A1, B1 fxcxnpma B2, ALPHA, A2, B2 bdz LL(53) .align 4 LL(52): fxcxnpma B3, ALPHA, A3, B3 LFPDUX A1, X, INCX2 fxpmul B6, ALPHA, A6 STFPDUX B1, Y, INCX2 fxcxnpma B4, ALPHA, A4, B4 LFPDUX A2, X, INCX2 fxpmul B7, ALPHA, A7 STFPDUX B2, Y, INCX2 fxcxnpma B5, ALPHA, A5, B5 LFPDUX A3, X, INCX2 fxpmul B8, ALPHA, A8 STFPDUX B3, Y, INCX2 fxcxnpma B6, ALPHA, A6, B6 LFPDUX A4, X, INCX2 fxpmul B1, ALPHA, A1 STFPDUX B4, Y, INCX2 fxcxnpma B7, ALPHA, A7, B7 LFPDUX A5, X, INCX2 fxpmul B2, ALPHA, A2 STFPDUX B5, Y, INCX2 fxcxnpma B8, ALPHA, A8, B8 LFPDUX A6, X, INCX2 fxpmul B3, ALPHA, A3 STFPDUX B6, Y, INCX2 fxcxnpma B1, ALPHA, A1, B1 LFPDUX A7, X, INCX2 fxpmul B4, ALPHA, A4 STFPDUX B7, Y, INCX2 fxcxnpma B2, ALPHA, A2, B2 LFPDUX A8, X, INCX2 fxpmul B5, ALPHA, A5 STFPDUX B8, Y, INCX2 bdnz LL(52) .align 4 LL(53): fxcxnpma B3, ALPHA, A3, B3 fxpmul B6, ALPHA, A6 STFPDUX B1, Y, INCX2 fxcxnpma B4, ALPHA, A4, B4 fxpmul B7, ALPHA, A7 STFPDUX B2, Y, INCX2 fxcxnpma B5, ALPHA, A5, B5 fxpmul B8, ALPHA, A8 STFPDUX B3, Y, INCX2 fxcxnpma B6, ALPHA, A6, B6 STFPDUX B4, Y, INCX2 fxcxnpma B7, ALPHA, A7, B7 STFPDUX B5, Y, INCX2 fxcxnpma B8, ALPHA, A8, B8 STFPDUX B6, Y, INCX2 STFPDUX B7, Y, INCX2 STFPDUX B8, Y, INCX2 .align 4 LL(55): andi. r0, N, 7 beq LL(999) andi. 
r0, N, 4 beq LL(56) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 fxpmul B1, ALPHA, A1 fxpmul B2, ALPHA, A2 fxpmul B3, ALPHA, A3 fxpmul B4, ALPHA, A4 fxcxnpma B1, ALPHA, A1, B1 fxcxnpma B2, ALPHA, A2, B2 fxcxnpma B3, ALPHA, A3, B3 fxcxnpma B4, ALPHA, A4, B4 STFPDUX B1, Y, INCX2 STFPDUX B2, Y, INCX2 STFPDUX B3, Y, INCX2 STFPDUX B4, Y, INCX2 .align 4 LL(56): andi. r0, N, 2 beq LL(57) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 fxpmul B1, ALPHA, A1 fxpmul B2, ALPHA, A2 fxcxnpma B1, ALPHA, A1, B1 fxcxnpma B2, ALPHA, A2, B2 STFPDUX B1, Y, INCX2 STFPDUX B2, Y, INCX2 .align 4 LL(57): andi. r0, N, 1 beq LL(999) LFPDUX A1, X, INCX2 fxpmul B1, ALPHA, A1 fxcxnpma B1, ALPHA, A1, B1 STFPDUX B1, Y, INCX2 b LL(999) .align 4 LL(60): addi XX, X, SIZE addi YY, Y, SIZE srawi. r0, N, 2 mtspr CTR, r0 beq- LL(65) LFDUX A1, X, INCX2 LFDUX A2, XX, INCX2 LFDUX A3, X, INCX2 LFDUX A4, XX, INCX2 LFDUX A5, X, INCX2 fmul B1, ALPHA, A1 LFDUX A6, XX, INCX2 fmul B2, ALPHA_I, A1 LFDUX A7, X, INCX2 fmul B3, ALPHA, A3 LFDUX A8, XX, INCX2 fmul B4, ALPHA_I, A3 fmul B5, ALPHA, A5 fnmsub B1, ALPHA_I, A2, B1 fmadd B2, ALPHA , A2, B2 bdz LL(63) .align 4 LL(62): fnmsub B3, ALPHA_I, A4, B3 LFDUX A1, X, INCX2 fmul B6, ALPHA_I, A5 STFDUX B1, Y, INCX2 fmadd B4, ALPHA , A4, B4 LFDUX A2, XX, INCX2 fmul B7, ALPHA, A7 STFDUX B2, YY, INCX2 fnmsub B5, ALPHA_I, A6, B5 LFDUX A3, X, INCX2 fmul B8, ALPHA_I, A7 STFDUX B3, Y, INCX2 fmadd B6, ALPHA , A6, B6 LFDUX A4, XX, INCX2 fmul B1, ALPHA, A1 STFDUX B4, YY, INCX2 fnmsub B7, ALPHA_I, A8, B7 LFDUX A5, X, INCX2 fmul B2, ALPHA_I, A1 STFDUX B5, Y, INCX2 fmadd B8, ALPHA , A8, B8 LFDUX A6, XX, INCX2 fmul B3, ALPHA, A3 STFDUX B6, YY, INCX2 fnmsub B1, ALPHA_I, A2, B1 LFDUX A7, X, INCX2 fmul B4, ALPHA_I, A3 STFDUX B7, Y, INCX2 fmadd B2, ALPHA , A2, B2 LFDUX A8, XX, INCX2 fmul B5, ALPHA, A5 STFDUX B8, YY, INCX2 bdnz LL(62) .align 4 LL(63): fnmsub B3, ALPHA_I, A4, B3 fmul B6, ALPHA_I, A5 STFDUX B1, Y, INCX2 fmadd B4, ALPHA , A4, B4 fmul B7, ALPHA, A7 STFDUX B2, YY, INCX2 fnmsub B5, ALPHA_I, A6, B5 fmul B8, ALPHA_I, A7 STFDUX B3, Y, INCX2 fmadd B6, ALPHA , A6, B6 STFDUX B4, YY, INCX2 fnmsub B7, ALPHA_I, A8, B7 STFDUX B5, Y, INCX2 fmadd B8, ALPHA , A8, B8 STFDUX B6, YY, INCX2 STFDUX B7, Y, INCX2 STFDUX B8, YY, INCX2 .align 4 LL(65): andi. r0, N, 3 beq LL(999) andi. r0, N, 2 beq LL(67) LFDUX A1, X, INCX2 LFDUX A2, XX, INCX2 LFDUX A3, X, INCX2 LFDUX A4, XX, INCX2 fmul B1, ALPHA, A1 fmul B2, ALPHA, A2 fmul B3, ALPHA, A3 fmul B4, ALPHA, A4 fnmsub B1, ALPHA_I, A2, B1 fmadd B2, ALPHA_I, A1, B2 fnmsub B3, ALPHA_I, A4, B3 fmadd B4, ALPHA_I, A3, B4 STFDUX B1, Y, INCX2 STFDUX B2, YY, INCX2 STFDUX B3, Y, INCX2 STFDUX B4, YY, INCX2 .align 4 LL(67): andi. r0, N, 1 beq LL(999) LFDUX A1, X, INCX2 LFDUX A2, XX, INCX2 fmul B1, ALPHA, A1 fmul B2, ALPHA, A2 fnmsub B1, ALPHA_I, A2, B1 fmadd B2, ALPHA_I, A1, B2 STFDUX B1, Y, INCX2 STFDUX B2, YY, INCX2 b LL(999) .align 4 LL(100): fcmpu cr7, ALPHA, A1 bne cr7, LL(150) fscmp cr7, ALPHA, A1 bne cr7, LL(150) andi. r0, X, 2 * SIZE - 1 bne LL(120) sub X, X, INCX2 srawi. r0, N, 2 mtspr CTR, r0 beq- LL(115) .align 4 LL(112): STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 bdnz LL(112) .align 4 LL(115): andi. r0, N, 3 beq LL(999) andi. r0, N, 2 beq LL(117) STFPDUX A1, X, INCX2 STFPDUX A1, X, INCX2 .align 4 LL(117): andi. r0, N, 1 beq LL(999) STFPDUX A1, X, INCX2 b LL(999) .align 4 LL(120): subi INCX2, INCX2, SIZE li INCX, SIZE sub X, X, INCX2 srawi. 
r0, N, 2 mtspr CTR, r0 beq- LL(125) .align 4 LL(122): STFDUX A1, X, INCX2 STFDUX A1, X, INCX STFDUX A1, X, INCX2 STFDUX A1, X, INCX STFDUX A1, X, INCX2 STFDUX A1, X, INCX STFDUX A1, X, INCX2 STFDUX A1, X, INCX bdnz LL(122) .align 4 LL(125): andi. r0, N, 3 beq LL(999) andi. r0, N, 2 beq LL(127) STFDUX A1, X, INCX2 STFDUX A1, X, INCX STFDUX A1, X, INCX2 STFDUX A1, X, INCX .align 4 LL(127): andi. r0, N, 1 beq LL(999) STFDUX A1, X, INCX2 STFDUX A1, X, INCX b LL(999) .align 4 LL(150): sub Y, X, INCX2 sub X, X, INCX2 andi. r0, X, 2 * SIZE - 1 bne LL(160) srawi. r0, N, 3 mtspr CTR, r0 beq- LL(155) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 LFPDUX A5, X, INCX2 fxpmul B1, ALPHA, A1 LFPDUX A6, X, INCX2 fxpmul B2, ALPHA, A2 LFPDUX A7, X, INCX2 fxpmul B3, ALPHA, A3 LFPDUX A8, X, INCX2 fxpmul B4, ALPHA, A4 fxpmul B5, ALPHA, A5 fxcxnpma B1, ALPHA, A1, B1 fxcxnpma B2, ALPHA, A2, B2 bdz LL(153) .align 4 LL(152): fxcxnpma B3, ALPHA, A3, B3 LFPDUX A1, X, INCX2 fxpmul B6, ALPHA, A6 STFPDUX B1, Y, INCX2 fxcxnpma B4, ALPHA, A4, B4 LFPDUX A2, X, INCX2 fxpmul B7, ALPHA, A7 STFPDUX B2, Y, INCX2 fxcxnpma B5, ALPHA, A5, B5 LFPDUX A3, X, INCX2 fxpmul B8, ALPHA, A8 STFPDUX B3, Y, INCX2 fxcxnpma B6, ALPHA, A6, B6 LFPDUX A4, X, INCX2 fxpmul B1, ALPHA, A1 STFPDUX B4, Y, INCX2 fxcxnpma B7, ALPHA, A7, B7 LFPDUX A5, X, INCX2 fxpmul B2, ALPHA, A2 STFPDUX B5, Y, INCX2 fxcxnpma B8, ALPHA, A8, B8 LFPDUX A6, X, INCX2 fxpmul B3, ALPHA, A3 STFPDUX B6, Y, INCX2 fxcxnpma B1, ALPHA, A1, B1 LFPDUX A7, X, INCX2 fxpmul B4, ALPHA, A4 STFPDUX B7, Y, INCX2 fxcxnpma B2, ALPHA, A2, B2 LFPDUX A8, X, INCX2 fxpmul B5, ALPHA, A5 STFPDUX B8, Y, INCX2 bdnz LL(152) .align 4 LL(153): fxcxnpma B3, ALPHA, A3, B3 fxpmul B6, ALPHA, A6 STFPDUX B1, Y, INCX2 fxcxnpma B4, ALPHA, A4, B4 fxpmul B7, ALPHA, A7 STFPDUX B2, Y, INCX2 fxcxnpma B5, ALPHA, A5, B5 fxpmul B8, ALPHA, A8 STFPDUX B3, Y, INCX2 fxcxnpma B6, ALPHA, A6, B6 STFPDUX B4, Y, INCX2 fxcxnpma B7, ALPHA, A7, B7 STFPDUX B5, Y, INCX2 fxcxnpma B8, ALPHA, A8, B8 STFPDUX B6, Y, INCX2 STFPDUX B7, Y, INCX2 STFPDUX B8, Y, INCX2 .align 4 LL(155): andi. r0, N, 7 beq LL(999) andi. r0, N, 4 beq LL(156) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 LFPDUX A3, X, INCX2 LFPDUX A4, X, INCX2 fxpmul B1, ALPHA, A1 fxpmul B2, ALPHA, A2 fxpmul B3, ALPHA, A3 fxpmul B4, ALPHA, A4 fxcxnpma B1, ALPHA, A1, B1 fxcxnpma B2, ALPHA, A2, B2 fxcxnpma B3, ALPHA, A3, B3 fxcxnpma B4, ALPHA, A4, B4 STFPDUX B1, Y, INCX2 STFPDUX B2, Y, INCX2 STFPDUX B3, Y, INCX2 STFPDUX B4, Y, INCX2 .align 4 LL(156): andi. r0, N, 2 beq LL(157) LFPDUX A1, X, INCX2 LFPDUX A2, X, INCX2 fxpmul B1, ALPHA, A1 fxpmul B2, ALPHA, A2 fxcxnpma B1, ALPHA, A1, B1 fxcxnpma B2, ALPHA, A2, B2 STFPDUX B1, Y, INCX2 STFPDUX B2, Y, INCX2 .align 4 LL(157): andi. r0, N, 1 beq LL(999) LFPDUX A1, X, INCX2 fxpmul B1, ALPHA, A1 fxcxnpma B1, ALPHA, A1, B1 STFPDUX B1, Y, INCX2 b LL(999) .align 4 LL(160): addi XX, X, SIZE addi YY, Y, SIZE srawi. 
r0, N, 2 mtspr CTR, r0 beq- LL(165) LFDUX A1, X, INCX2 LFDUX A2, XX, INCX2 LFDUX A3, X, INCX2 LFDUX A4, XX, INCX2 LFDUX A5, X, INCX2 fmul B1, ALPHA, A1 LFDUX A6, XX, INCX2 fmul B2, ALPHA_I, A1 LFDUX A7, X, INCX2 fmul B3, ALPHA, A3 LFDUX A8, XX, INCX2 fmul B4, ALPHA_I, A3 fmul B5, ALPHA, A5 fnmsub B1, ALPHA_I, A2, B1 fmadd B2, ALPHA , A2, B2 bdz LL(163) .align 4 LL(162): fnmsub B3, ALPHA_I, A4, B3 LFDUX A1, X, INCX2 fmul B6, ALPHA_I, A5 STFDUX B1, Y, INCX2 fmadd B4, ALPHA , A4, B4 LFDUX A2, XX, INCX2 fmul B7, ALPHA, A7 STFDUX B2, YY, INCX2 fnmsub B5, ALPHA_I, A6, B5 LFDUX A3, X, INCX2 fmul B8, ALPHA_I, A7 STFDUX B3, Y, INCX2 fmadd B6, ALPHA , A6, B6 LFDUX A4, XX, INCX2 fmul B1, ALPHA, A1 STFDUX B4, YY, INCX2 fnmsub B7, ALPHA_I, A8, B7 LFDUX A5, X, INCX2 fmul B2, ALPHA_I, A1 STFDUX B5, Y, INCX2 fmadd B8, ALPHA , A8, B8 LFDUX A6, XX, INCX2 fmul B3, ALPHA, A3 STFDUX B6, YY, INCX2 fnmsub B1, ALPHA_I, A2, B1 LFDUX A7, X, INCX2 fmul B4, ALPHA_I, A3 STFDUX B7, Y, INCX2 fmadd B2, ALPHA , A2, B2 LFDUX A8, XX, INCX2 fmul B5, ALPHA, A5 STFDUX B8, YY, INCX2 bdnz LL(162) .align 4 LL(163): fnmsub B3, ALPHA_I, A4, B3 fmul B6, ALPHA_I, A5 STFDUX B1, Y, INCX2 fmadd B4, ALPHA , A4, B4 fmul B7, ALPHA, A7 STFDUX B2, YY, INCX2 fnmsub B5, ALPHA_I, A6, B5 fmul B8, ALPHA_I, A7 STFDUX B3, Y, INCX2 fmadd B6, ALPHA , A6, B6 STFDUX B4, YY, INCX2 fnmsub B7, ALPHA_I, A8, B7 STFDUX B5, Y, INCX2 fmadd B8, ALPHA , A8, B8 STFDUX B6, YY, INCX2 STFDUX B7, Y, INCX2 STFDUX B8, YY, INCX2 .align 4 LL(165): andi. r0, N, 3 beq LL(999) andi. r0, N, 2 beq LL(167) LFDUX A1, X, INCX2 LFDUX A2, XX, INCX2 LFDUX A3, X, INCX2 LFDUX A4, XX, INCX2 fmul B1, ALPHA, A1 fmul B2, ALPHA, A2 fmul B3, ALPHA, A3 fmul B4, ALPHA, A4 fnmsub B1, ALPHA_I, A2, B1 fmadd B2, ALPHA_I, A1, B2 fnmsub B3, ALPHA_I, A4, B3 fmadd B4, ALPHA_I, A3, B4 STFDUX B1, Y, INCX2 STFDUX B2, YY, INCX2 STFDUX B3, Y, INCX2 STFDUX B4, YY, INCX2 .align 4 LL(167): andi. r0, N, 1 beq LL(999) LFDUX A1, X, INCX2 LFDUX A2, XX, INCX2 fmul B1, ALPHA, A1 fmul B2, ALPHA, A2 fnmsub B1, ALPHA_I, A2, B1 fmadd B2, ALPHA_I, A1, B2 STFDUX B1, Y, INCX2 STFDUX B2, YY, INCX2 .align 4 LL(999): li r10, 16 lfpdux f17, SP, r10 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zscal_microk_power8.c000066400000000000000000000163741313527062700217230ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
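Before its main loops, zscal_hummer.S above branches on two conditions: whether alpha is exactly zero (fcmpu and fscmp against a zero-cleared register), in which case it only streams zeros into x, and whether x is aligned for paired 16-byte accesses. A hedged C sketch of that dispatch follows; zscal_hummer_sketch is an illustrative name, the unit-stride case is assumed, and the paired-register realignment of the real kernel is not reproduced.

/* Hedged sketch of the dispatch in zscal_hummer.S (unit stride assumed). */
static void zscal_hummer_sketch(long n, double alpha_r, double alpha_i,
                                double *x)
{
    if (alpha_r == 0.0 && alpha_i == 0.0) {
        /* LL(10)..LL(29): alpha == 0, just store zeros, unrolled by 4 */
        for (long i = 0; i < 2 * n; i++)
            x[i] = 0.0;
        return;
    }

    /* LL(50)/LL(60): general scaling; the assembly chooses paired
       (fxpmul/fxcxnpma) or scalar (fmul/fnmsub/fmadd) instructions
       depending on 16-byte alignment, but the math is the same. */
    for (long i = 0; i < n; i++) {
        double re = x[2 * i], im = x[2 * i + 1];
        x[2 * i]     = alpha_r * re - alpha_i * im;
        x[2 * i + 1] = alpha_r * im + alpha_i * re;
    }
}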
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/25 Werner Saar (wernsaar@googlemail.com) * * I don't use fused multipy-add ( lapack precision problems ) * * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_8 1 static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) { __vector double t0; __vector double t1; __vector double t2; __vector double t3; __vector double t4; __vector double t5; __vector double t6; __vector double t7; __vector double t8; __vector double t9; __vector double t10; __vector double t11; __asm__ ( "dcbt 0, %2 \n\t" "xsnegdp 33, %x16 \n\t" // -alpha_i "xxspltd 32, %x15, 0 \n\t" // alpha_r , alpha_r "xxmrghd 33, 33, %x16 \n\t" // -alpha_i , alpha_i "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i "lxvd2x 41, %17, %2 \n\t" "lxvd2x 42, %18, %2 \n\t" "lxvd2x 43, %19, %2 \n\t" "lxvd2x 44, %20, %2 \n\t" "lxvd2x 45, %21, %2 \n\t" "lxvd2x 46, %22, %2 \n\t" "lxvd2x 47, %23, %2 \n\t" "addi %2, %2, 128 \n\t" "addic. %1, %1, -8 \n\t" "ble 2f \n\t" ".p2align 5 \n" "1: \n\t" "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r "xvmuldp 49, 41, 32 \n\t" "xvmuldp 50, 42, 32 \n\t" "xvmuldp 51, 43, 32 \n\t" "xvmuldp %x3, 44, 32 \n\t" "xvmuldp %x4, 45, 32 \n\t" "xvmuldp %x5, 46, 32 \n\t" "xvmuldp %x6, 47, 32 \n\t" "xxswapd %x7, 40 \n\t" "xxswapd %x8, 41 \n\t" "xxswapd %x9, 42 \n\t" "xxswapd %x10, 43 \n\t" "xxswapd %x11, 44 \n\t" "xxswapd %x12, 45 \n\t" "xxswapd %x13, 46 \n\t" "xxswapd %x14, 47 \n\t" "xvmuldp %x7, %x7, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i "xvmuldp %x8, %x8, 33 \n\t" "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i "lxvd2x 41, %17, %2 \n\t" "xvmuldp %x9, %x9, 33 \n\t" "xvmuldp %x10, %x10, 33 \n\t" "lxvd2x 42, %18, %2 \n\t" "lxvd2x 43, %19, %2 \n\t" "xvmuldp %x11, %x11, 33 \n\t" "xvmuldp %x12, %x12, 33 \n\t" "lxvd2x 44, %20, %2 \n\t" "lxvd2x 45, %21, %2 \n\t" "xvmuldp %x13, %x13, 33 \n\t" "xvmuldp %x14, %x14, 33 \n\t" "lxvd2x 46, %22, %2 \n\t" "lxvd2x 47, %23, %2 \n\t" "addi %2, %2, -128 \n\t" "xvadddp 48, 48, %x7 \n\t" "xvadddp 49, 49, %x8 \n\t" "xvadddp 50, 50, %x9 \n\t" "xvadddp 51, 51, %x10 \n\t" "stxvd2x 48, 0, %2 \n\t" "stxvd2x 49, %17, %2 \n\t" "xvadddp %x3, %x3, %x11 \n\t" "xvadddp %x4, %x4, %x12 \n\t" "stxvd2x 50, %18, %2 \n\t" "stxvd2x 51, %19, %2 \n\t" "xvadddp %x5, %x5, %x13 \n\t" "xvadddp %x6, %x6, %x14 \n\t" "stxvd2x %x3, %20, %2 \n\t" "stxvd2x %x4, %21, %2 \n\t" "stxvd2x %x5, %22, %2 \n\t" "stxvd2x %x6, %23, %2 \n\t" "addi %2, %2, 256 \n\t" "addic. 
%1, %1, -8 \n\t" "bgt 1b \n" "2: \n\t" "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r "xvmuldp 49, 41, 32 \n\t" "xvmuldp 50, 42, 32 \n\t" "xvmuldp 51, 43, 32 \n\t" "xvmuldp %x3, 44, 32 \n\t" "xvmuldp %x4, 45, 32 \n\t" "xvmuldp %x5, 46, 32 \n\t" "xvmuldp %x6, 47, 32 \n\t" "xxswapd %x7, 40 \n\t" "xxswapd %x8, 41 \n\t" "xxswapd %x9, 42 \n\t" "xxswapd %x10, 43 \n\t" "xxswapd %x11, 44 \n\t" "xxswapd %x12, 45 \n\t" "xxswapd %x13, 46 \n\t" "xxswapd %x14, 47 \n\t" "addi %2, %2, -128 \n\t" "xvmuldp %x7, %x7, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i "xvmuldp %x8, %x8, 33 \n\t" "xvmuldp %x9, %x9, 33 \n\t" "xvmuldp %x10, %x10, 33 \n\t" "xvmuldp %x11, %x11, 33 \n\t" "xvmuldp %x12, %x12, 33 \n\t" "xvmuldp %x13, %x13, 33 \n\t" "xvmuldp %x14, %x14, 33 \n\t" "xvadddp 48, 48, %x7 \n\t" "xvadddp 49, 49, %x8 \n\t" "xvadddp 50, 50, %x9 \n\t" "xvadddp 51, 51, %x10 \n\t" "stxvd2x 48, 0, %2 \n\t" "stxvd2x 49, %17, %2 \n\t" "xvadddp %x3, %x3, %x11 \n\t" "xvadddp %x4, %x4, %x12 \n\t" "stxvd2x 50, %18, %2 \n\t" "stxvd2x 51, %19, %2 \n\t" "xvadddp %x5, %x5, %x13 \n\t" "xvadddp %x6, %x6, %x14 \n\t" "stxvd2x %x3, %20, %2 \n\t" "stxvd2x %x4, %21, %2 \n\t" "stxvd2x %x5, %22, %2 \n\t" "stxvd2x %x6, %23, %2 \n" "#n=%1 x=%0=%2 alpha=(%15,%16) o16=%17 o32=%18 o48=%19 o64=%20 o80=%21 o96=%22 o112=%23\n" "#t0=%x3 t1=%x4 t2=%x5 t3=%x6 t4=%x7 t5=%x8 t6=%x9 t7=%x10 t8=%x11 t9=%x12 t10=%x13 t11=%x14" : "+m" (*x), "+r" (n), // 1 "+b" (x), // 2 "=wa" (t0), // 3 "=wa" (t1), // 4 "=wa" (t2), // 5 "=wa" (t3), // 6 "=wa" (t4), // 7 "=wa" (t5), // 8 "=wa" (t6), // 9 "=wa" (t7), // 10 "=wa" (t8), // 11 "=wa" (t9), // 12 "=wa" (t10), // 13 "=wa" (t11) // 14 : "d" (alpha_r), // 15 "d" (alpha_i), // 16 "b" (16), // 17 "b" (32), // 18 "b" (48), // 19 "b" (64), // 20 "b" (80), // 21 "b" (96), // 22 "b" (112) // 23 : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48","vs49","vs50","vs51" ); } OpenBLAS-0.2.20/kernel/power/zscal_ppc440.S000066400000000000000000000145601313527062700201200ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
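Per the note in its header, the POWER8 microkernel above avoids fused multiply-add: each VSX vector is multiplied once by (alpha_r, alpha_r) and once, with halves swapped, by (-alpha_i, alpha_i), and the two products are combined with a separate xvadddp. A hedged per-element restatement in C follows; zscal8_element_sketch is an illustrative name, and the real kernel applies this to 2-double vectors, eight complex elements per loop iteration.

/* Hedged sketch of one element of zscal_kernel_8 (POWER8 path):
   products kept separate and summed, mirroring xvmuldp + xvadddp. */
static void zscal8_element_sketch(double *re, double *im,
                                  double alpha_r, double alpha_i)
{
    double t_r = *re *  alpha_r;   /* x       * (alpha_r, alpha_r)   */
    double t_i = *im *  alpha_r;
    double u_r = *im * -alpha_i;   /* swap(x) * (-alpha_i, alpha_i)  */
    double u_i = *re *  alpha_i;
    *re = t_r + u_r;               /* plain add, no FMA              */
    *im = t_i + u_i;
}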
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define XX r4 #define PRE r5 #ifdef linux #ifndef __64BIT__ #define X r6 #define INCX r7 #else #define X r8 #define INCX r9 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define X r10 #define INCX r8 #else #define X r8 #define INCX r9 #endif #endif #define INC1 r11 #define FZERO f0 #define ALPHA_R f1 #define ALPHA_I f2 PROLOGUE PROFCODE addi SP, SP, -8 li r0, 0 stw r0, 0(SP) lfs FZERO, 0(SP) addi SP, SP, 8 #if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) lwz INCX, 56(SP) #endif slwi INCX, INCX, ZBASE_SHIFT li INC1, SIZE sub X, X, INCX li PRE, 3 * 16 * SIZE cmpwi cr0, N, 0 blelr- cr0 fcmpu cr0, FZERO, ALPHA_R bne- cr0, LL(A1I1) fcmpu cr0, FZERO, ALPHA_I bne- cr0, LL(A1I1) LL(A0IN): srawi. r0, N, 3 mtspr CTR, r0 beq- LL(A0IN_Remain) .align 4 LL(A0IN_Kernel): #ifdef PPCG4 dcbtst X, PRE #endif STFDUX FZERO, X, INCX STFDX FZERO, X, INC1 STFDUX FZERO, X, INCX STFDX FZERO, X, INC1 #if defined(PPCG4) && defined(DOUBLE) dcbtst X, PRE #endif STFDUX FZERO, X, INCX STFDX FZERO, X, INC1 STFDUX FZERO, X, INCX STFDX FZERO, X, INC1 #ifdef PPCG4 dcbtst X, PRE #endif STFDUX FZERO, X, INCX STFDX FZERO, X, INC1 STFDUX FZERO, X, INCX STFDX FZERO, X, INC1 #if defined(PPCG4) && defined(DOUBLE) dcbtst X, PRE #endif STFDUX FZERO, X, INCX STFDX FZERO, X, INC1 STFDUX FZERO, X, INCX STFDX FZERO, X, INC1 bdnz LL(A0IN_Kernel) .align 4 LL(A0IN_Remain): andi. r0, N, 7 mtspr CTR, r0 beqlr+ .align 4 LL(A0IN_RemainKernel): STFDUX FZERO, X, INCX STFDX FZERO, X, INC1 bdnz LL(A0IN_RemainKernel) blr .align 4 LL(A1I1): mr XX, X srawi. 
r0, N, 2 mtspr CTR, r0 beq- LL(15) LFDUX f0, X, INCX LFDX f3, X, INC1 LFDUX f4, X, INCX LFDX f5, X, INC1 LFDUX f6, X, INCX FMUL f10, ALPHA_R, f0 LFDX f7, X, INC1 FMUL f11, ALPHA_R, f3 LFDUX f8, X, INCX FMUL f12, ALPHA_R, f4 FMUL f13, ALPHA_R, f5 bdz LL(13) .align 4 LL(12): #ifdef PPCG4 dcbtst X, PRE #endif FNMSUB f10, ALPHA_I, f3, f10 LFDX f9, X, INC1 FMADD f11, ALPHA_I, f0, f11 LFDUX f0, X, INCX FNMSUB f12, ALPHA_I, f5, f12 LFDX f3, X, INC1 FMADD f13, ALPHA_I, f4, f13 LFDUX f4, X, INCX #if defined(PPCG4) && defined(DOUBLE) dcbtst X, PRE #endif STFDUX f10, XX, INCX FMUL f10, ALPHA_R, f6 STFDX f11, XX, INC1 FMUL f11, ALPHA_R, f7 STFDUX f12, XX, INCX FMUL f12, ALPHA_R, f8 STFDX f13, XX, INC1 FMUL f13, ALPHA_R, f9 #ifdef PPCG4 dcbtst X, PRE #endif FNMSUB f10, ALPHA_I, f7, f10 LFDX f5, X, INC1 FMADD f11, ALPHA_I, f6, f11 LFDUX f6, X, INCX FNMSUB f12, ALPHA_I, f9, f12 LFDX f7, X, INC1 FMADD f13, ALPHA_I, f8, f13 LFDUX f8, X, INCX #if defined(PPCG4) && defined(DOUBLE) dcbtst X, PRE #endif STFDUX f10, XX, INCX FMUL f10, ALPHA_R, f0 STFDX f11, XX, INC1 FMUL f11, ALPHA_R, f3 STFDUX f12, XX, INCX FMUL f12, ALPHA_R, f4 STFDX f13, XX, INC1 FMUL f13, ALPHA_R, f5 bdnz LL(12) .align 4 LL(13): FNMSUB f10, ALPHA_I, f3, f10 LFDX f9, X, INC1 FMADD f11, ALPHA_I, f0, f11 FNMSUB f12, ALPHA_I, f5, f12 FMADD f13, ALPHA_I, f4, f13 STFDUX f10, XX, INCX FMUL f10, ALPHA_R, f6 STFDX f11, XX, INC1 FMUL f11, ALPHA_R, f7 STFDUX f12, XX, INCX FMUL f12, ALPHA_R, f8 STFDX f13, XX, INC1 FMUL f13, ALPHA_R, f9 FNMSUB f10, ALPHA_I, f7, f10 FMADD f11, ALPHA_I, f6, f11 FNMSUB f12, ALPHA_I, f9, f12 FMADD f13, ALPHA_I, f8, f13 STFDUX f10, XX, INCX STFDX f11, XX, INC1 STFDUX f12, XX, INCX STFDX f13, XX, INC1 .align 4 LL(15): andi. r0, N, 3 mtspr CTR, r0 beqlr+ .align 4 LL(A1IN_RemainKernel): LFDUX f3, X, INCX LFDX f4, X, INC1 FMUL f5, ALPHA_R, f3 FMUL f6, ALPHA_R, f4 FNMSUB f5, ALPHA_I, f4, f5 FMADD f6, ALPHA_I, f3, f6 STFDUX f5, XX, INCX STFDX f6, XX, INC1 bdnz LL(A1IN_RemainKernel) blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zswap.S000066400000000000000000000214611313527062700170540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
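zscal_ppc440.S above issues dcbtst (prefetch for store) a fixed distance ahead of the stream it rewrites; PRE is set to 3 * 16 * SIZE bytes, i.e. 384 bytes in double precision, and the hint is emitted once per unrolled group on the G4 paths. A hedged C sketch of the same idea follows, using the GCC-style __builtin_prefetch; the per-element placement here is illustrative only.

/* Hedged sketch of store-stream prefetching as in zscal_ppc440.S. */
#define PREFETCH_AHEAD 384   /* bytes; matches 3 * 16 * SIZE for double */

static void zscal_prefetch_sketch(long n, double alpha_r, double alpha_i,
                                  double *x)
{
    for (long i = 0; i < n; i++) {
        /* hint that this cache line will be written soon (rw = 1) */
        __builtin_prefetch((char *)&x[2 * i] + PREFETCH_AHEAD, 1, 0);
        double re = x[2 * i], im = x[2 * i + 1];
        x[2 * i]     = alpha_r * re - alpha_i * im;
        x[2 * i + 1] = alpha_r * im + alpha_i * re;
    }
}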
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef linux #ifndef __64BIT__ #define N r3 #define X r6 #define INCX r7 #define Y r8 #define INCY r9 #define PREA r4 #define XX r5 #define YY r10 #else #define N r3 #define X r8 #define INCX r9 #define Y r10 #define INCY r4 #define PREA r5 #define XX r6 #define YY r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define N r3 #define X r10 #define INCX r4 #define Y r5 #define INCY r6 #define PREA r7 #define XX r8 #define YY r9 #else #define N r3 #define X r8 #define INCX r9 #define Y r10 #define INCY r4 #define PREA r5 #define XX r6 #define YY r7 #endif #endif #define INCXM1 r11 #define INCYM1 r12 #define STACKSIZE 160 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #if defined(linux) && defined(__64BIT__) ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz INCX, FRAMESLOT(0) + STACKSIZE(SP) lwz Y, FRAMESLOT(1) + STACKSIZE(SP) lwz INCY, FRAMESLOT(2) + STACKSIZE(SP) #else lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif slwi INCX, INCX, ZBASE_SHIFT slwi INCY, INCY, ZBASE_SHIFT subi INCXM1, INCX, SIZE subi INCYM1, INCY, SIZE #ifdef L1_DUALFETCH li PREA, (L1_PREFETCHSIZE) / 2 #else li PREA, (L1_PREFETCHSIZE) #endif cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 2 * SIZE bne- cr0, LL(100) cmpwi cr0, INCY, 2 * SIZE bne- cr0, LL(100) srawi. 
r0, N, 3 mtspr CTR, r0 beq- cr0, LL(50) .align 4 LL(10): LFD f0, 0 * SIZE(X) LFD f1, 1 * SIZE(X) LFD f2, 2 * SIZE(X) LFD f3, 3 * SIZE(X) LFD f16, 0 * SIZE(Y) LFD f17, 1 * SIZE(Y) LFD f18, 2 * SIZE(Y) LFD f19, 3 * SIZE(Y) LFD f4, 4 * SIZE(X) LFD f5, 5 * SIZE(X) LFD f6, 6 * SIZE(X) LFD f7, 7 * SIZE(X) LFD f20, 4 * SIZE(Y) LFD f21, 5 * SIZE(Y) LFD f22, 6 * SIZE(Y) LFD f23, 7 * SIZE(Y) LFD f8, 8 * SIZE(X) LFD f9, 9 * SIZE(X) LFD f10, 10 * SIZE(X) LFD f11, 11 * SIZE(X) LFD f24, 8 * SIZE(Y) LFD f25, 9 * SIZE(Y) LFD f26, 10 * SIZE(Y) LFD f27, 11 * SIZE(Y) LFD f12, 12 * SIZE(X) LFD f13, 13 * SIZE(X) LFD f14, 14 * SIZE(X) LFD f15, 15 * SIZE(X) LFD f28, 12 * SIZE(Y) LFD f29, 13 * SIZE(Y) LFD f30, 14 * SIZE(Y) LFD f31, 15 * SIZE(Y) STFD f16, 0 * SIZE(X) STFD f17, 1 * SIZE(X) STFD f18, 2 * SIZE(X) STFD f19, 3 * SIZE(X) STFD f0, 0 * SIZE(Y) STFD f1, 1 * SIZE(Y) STFD f2, 2 * SIZE(Y) STFD f3, 3 * SIZE(Y) STFD f20, 4 * SIZE(X) STFD f21, 5 * SIZE(X) STFD f22, 6 * SIZE(X) STFD f23, 7 * SIZE(X) STFD f4, 4 * SIZE(Y) STFD f5, 5 * SIZE(Y) STFD f6, 6 * SIZE(Y) STFD f7, 7 * SIZE(Y) STFD f24, 8 * SIZE(X) STFD f25, 9 * SIZE(X) STFD f26, 10 * SIZE(X) STFD f27, 11 * SIZE(X) STFD f8, 8 * SIZE(Y) STFD f9, 9 * SIZE(Y) STFD f10, 10 * SIZE(Y) STFD f11, 11 * SIZE(Y) STFD f28, 12 * SIZE(X) STFD f29, 13 * SIZE(X) STFD f30, 14 * SIZE(X) STFD f31, 15 * SIZE(X) STFD f12, 12 * SIZE(Y) STFD f13, 13 * SIZE(Y) STFD f14, 14 * SIZE(Y) STFD f15, 15 * SIZE(Y) addi X, X, 16 * SIZE addi Y, Y, 16 * SIZE dcbtst X, PREA #ifdef L1_DUALFETCH dcbtst Y, PREA #endif bdnz LL(10) .align 4 LL(50): andi. r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(60): LFD f0, 0 * SIZE(X) LFD f1, 1 * SIZE(X) LFD f2, 0 * SIZE(Y) LFD f3, 1 * SIZE(Y) STFD f2, 0 * SIZE(X) STFD f3, 1 * SIZE(X) STFD f0, 0 * SIZE(Y) STFD f1, 1 * SIZE(Y) addi X, X, 2 * SIZE addi Y, Y, 2 * SIZE bdnz LL(60) b LL(999) .align 4 LL(100): sub X, X, INCXM1 sub Y, Y, INCYM1 mr XX, X mr YY, Y srawi. r0, N, 3 mtspr CTR, r0 beq- LL(150) .align 4 LL(110): LFDX f0, X, INCXM1 LFDUX f1, X, INCX LFDX f2, X, INCXM1 LFDUX f3, X, INCX LFDX f16, Y, INCYM1 LFDUX f17, Y, INCY LFDX f18, Y, INCYM1 LFDUX f19, Y, INCY LFDX f4, X, INCXM1 LFDUX f5, X, INCX LFDX f6, X, INCXM1 LFDUX f7, X, INCX LFDX f20, Y, INCYM1 LFDUX f21, Y, INCY LFDX f22, Y, INCYM1 LFDUX f23, Y, INCY LFDX f8, X, INCXM1 LFDUX f9, X, INCX LFDX f10, X, INCXM1 LFDUX f11, X, INCX LFDX f24, Y, INCYM1 LFDUX f25, Y, INCY LFDX f26, Y, INCYM1 LFDUX f27, Y, INCY LFDX f12, X, INCXM1 LFDUX f13, X, INCX LFDX f14, X, INCXM1 LFDUX f15, X, INCX LFDX f28, Y, INCYM1 LFDUX f29, Y, INCY LFDX f30, Y, INCYM1 LFDUX f31, Y, INCY STFDX f16, XX, INCXM1 STFDUX f17, XX, INCX STFDX f18, XX, INCXM1 STFDUX f19, XX, INCX STFDX f0, YY, INCYM1 STFDUX f1, YY, INCY STFDX f2, YY, INCYM1 STFDUX f3, YY, INCY STFDX f20, XX, INCXM1 STFDUX f21, XX, INCX STFDX f22, XX, INCXM1 STFDUX f23, XX, INCX STFDX f4, YY, INCYM1 STFDUX f5, YY, INCY STFDX f6, YY, INCYM1 STFDUX f7, YY, INCY STFDX f24, XX, INCXM1 STFDUX f25, XX, INCX STFDX f26, XX, INCXM1 STFDUX f27, XX, INCX STFDX f8, YY, INCYM1 STFDUX f9, YY, INCY STFDX f10, YY, INCYM1 STFDUX f11, YY, INCY STFDX f28, XX, INCXM1 STFDUX f29, XX, INCX STFDX f30, XX, INCXM1 STFDUX f31, XX, INCX STFDX f12, YY, INCYM1 STFDUX f13, YY, INCY STFDX f14, YY, INCYM1 STFDUX f15, YY, INCY bdnz LL(110) .align 4 LL(150): andi. 
r0, N, 7 mtspr CTR, r0 beq LL(999) .align 4 LL(160): LFDX f0, X, INCXM1 LFDUX f1, X, INCX LFDX f2, Y, INCYM1 LFDUX f3, Y, INCY STFDX f2, XX, INCXM1 STFDUX f3, XX, INCX STFDX f0, YY, INCYM1 STFDUX f1, YY, INCY bdnz LL(160) .align 4 LL(999): lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zswap.c000066400000000000000000000076151313527062700171010ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** * 2016/03/27 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #include "common.h" #if defined(POWER8) #include "zswap_microk_power8.c" #endif #ifndef HAVE_KERNEL_16 static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { BLASLONG i=0; FLOAT f0, f1, f2, f3, f4, f5, f6, f7; FLOAT g0, g1, g2, g3, g4, g5, g6, g7; FLOAT *x1=x; FLOAT *y1=y; while ( i 0 ) { zswap_kernel_16(n1, x, y); i=n1; ix = 2* n1; iy = 2* n1; } while(i < n) { temp[0] = x[ix] ; temp[1] = x[ix+1] ; x[ix] = y[iy] ; x[ix+1] = y[iy+1] ; y[iy] = temp[0] ; y[iy+1] = temp[1] ; ix += 2 ; iy += 2 ; i++ ; } } else { inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; while(i < n) { temp[0] = x[ix] ; temp[1] = x[ix+1] ; x[ix] = y[iy] ; x[ix+1] = y[iy+1] ; y[iy] = temp[0] ; y[iy+1] = temp[1] ; ix += inc_x2 ; iy += inc_y2 ; i++ ; } } return(0); } OpenBLAS-0.2.20/kernel/power/zswap_hummer.S000066400000000000000000000303241313527062700204270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
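The zswap.c driver above hands the leading block of a unit-stride vector pair to zswap_kernel_16 and finishes with a scalar loop; for strided vectors it swaps element by element from the start. A hedged C sketch of that shape follows; zswap_sketch is an illustrative name, and taking n1 as the largest multiple of 16 is an assumption inferred from the kernel's 16-element block size.

/* Hedged sketch of the zswap driver above (interleaved re/im pairs). */
static void zswap_sketch(long n, double *x, long inc_x,
                         double *y, long inc_y)
{
    long i = 0, ix = 0, iy = 0;

    if (n <= 0)
        return;

    if (inc_x == 1 && inc_y == 1) {
        long n1 = n & -16;             /* assumed vector-kernel prefix  */
        if (n1 > 0) {
            /* zswap_kernel_16(n1, x, y);   bulk swap, 16 complex/iter  */
            i = n1; ix = 2 * n1; iy = 2 * n1;
        }
    }

    long inc_x2 = 2 * inc_x, inc_y2 = 2 * inc_y;
    while (i < n) {                    /* scalar loop, as in the file   */
        double t0 = x[ix], t1 = x[ix + 1];
        x[ix] = y[iy];  x[ix + 1] = y[iy + 1];
        y[iy] = t0;     y[iy + 1] = t1;
        ix += inc_x2;   iy += inc_y2;
        i++;
    }
}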
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N r3 #define X r6 #define INCX r7 #define Y r8 #define INCY r9 #define INCX2 r4 #define INCY2 r5 #define X2 r10 #define Y2 r11 #define A1 f0 #define A2 f1 #define A3 f2 #define A4 f3 #define A5 f4 #define B1 f5 #define B2 f6 #define B3 f7 #define B4 f8 #define B5 f9 #define T1 f10 #define T2 f11 #define T3 f12 #define T4 f13 #define T5 f14 #define T6 f15 #define T7 f16 PROLOGUE PROFCODE li r10, -16 stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT add INCX2, INCX, INCX add INCY2, INCY, INCY cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, INCX, SIZE bne LL(100) cmpwi cr0, INCY, SIZE bne LL(100) sub X, X, INCX2 sub Y, Y, INCY2 mr X2, X mr Y2, Y andi. r0, X, 2 * SIZE - 1 bne LL(30) andi. r0, Y, 2 * SIZE - 1 bne LL(20) .align 4 LL(10): /* X : aligned Y : aligned */ srawi. r0, N, 2 mtspr CTR, r0 beq- LL(15) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 LFPDUX A3, X, INCX2 LFPDUX B3, Y, INCY2 LFPDUX A4, X, INCX2 LFPDUX B4, Y, INCY2 bdz LL(13) .align 4 LL(12): STFPDUX B1, X2, INCY2 LFPDUX B1, Y, INCY2 STFPDUX A1, Y2, INCY2 LFPDUX A1, X, INCX2 STFPDUX B2, X2, INCY2 LFPDUX B2, Y, INCY2 STFPDUX A2, Y2, INCY2 LFPDUX A2, X, INCX2 STFPDUX B3, X2, INCY2 LFPDUX B3, Y, INCY2 STFPDUX A3, Y2, INCY2 LFPDUX A3, X, INCX2 STFPDUX B4, X2, INCY2 LFPDUX B4, Y, INCY2 STFPDUX A4, Y2, INCY2 LFPDUX A4, X, INCX2 bdnz LL(12) .align 4 LL(13): STFPDUX B1, X2, INCY2 STFPDUX A1, Y2, INCY2 STFPDUX B2, X2, INCY2 STFPDUX A2, Y2, INCY2 STFPDUX B3, X2, INCY2 STFPDUX A3, Y2, INCY2 STFPDUX B4, X2, INCY2 STFPDUX A4, Y2, INCY2 .align 4 LL(15): andi. r0, N, 3 beq LL(999) andi. r0, N, 2 beq LL(16) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 STFPDUX B1, X2, INCY2 STFPDUX A1, Y2, INCY2 STFPDUX B2, X2, INCY2 STFPDUX A2, Y2, INCY2 .align 4 LL(16): andi. r0, N, 1 beq LL(999) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 STFPDUX B1, X2, INCY2 STFPDUX A1, Y2, INCY2 b LL(999) .align 4 LL(20): /* X : aligned Y : unaligned */ LFXDUX A1, X, INCX2 LFDX B1, Y, INCY2 STFSDX A1, Y2, INCY2 add Y, Y, INCY add Y2, Y2, INCY addi N, N, -1 cmpwi cr0, N, 0 ble LL(29) .align 4 srawi. r0, N, 2 mtspr CTR, r0 beq- LL(25) LFXDUX T1, X, INCX2 LFXDUX T2, Y, INCY2 LFXDUX T3, X, INCX2 LFXDUX T4, Y, INCY2 LFPDUX A4, X, INCX2 fsmr A1, T1 LFPDUX B4, Y, INCY2 fsmr B1, T2 LFPDUX A5, X, INCX2 fsmr T1, T3 LFPDUX B5, Y, INCY2 fsmr T2, T4 bdz LL(23) .align 4 LL(22): fxmr T5, A4 STFPDUX A1, Y2, INCY2 fxmr T6, B4 STFPDUX B1, X2, INCX2 fxmr A1, A5 STFPDUX T1, Y2, INCY2 fxmr B1, B5 STFPDUX T2, X2, INCX2 fsmr T3, T5 LFPDUX A2, X, INCX2 fsmr T4, T6 LFPDUX B2, Y, INCY2 fsmr T5, A1 LFPDUX A3, X, INCX2 fsmr T6, B1 LFPDUX B3, Y, INCY2 fxmr T1, A2 STFPDUX T3, Y2, INCY2 fxmr T2, B2 STFPDUX T4, X2, INCX2 fxmr T3, A3 STFPDUX T5, Y2, INCY2 fxmr T4, B3 STFPDUX T6, X2, INCX2 fsmr A1, T1 LFPDUX A4, X, INCX2 fsmr B1, T2 LFPDUX B4, Y, INCY2 fsmr T1, T3 LFPDUX A5, X, INCX2 fsmr T2, T4 LFPDUX B5, Y, INCY2 bdnz LL(22) .align 4 LL(23): fxmr T5, A4 STFPDUX A1, Y2, INCY2 fxmr T6, B4 STFPDUX B1, X2, INCX2 fxmr A1, A5 STFPDUX T1, Y2, INCY2 fxmr B1, B5 STFPDUX T2, X2, INCX2 fsmr T3, T5 fsmr T4, T6 fsmr T5, A1 fsmr T6, B1 STFPDUX T3, Y2, INCY2 STFPDUX T4, X2, INCX2 STFPDUX T5, Y2, INCY2 STFPDUX T6, X2, INCX2 .align 4 LL(25): andi. r0, N, 3 beq LL(29) andi. 
r0, N, 2 beq LL(27) LFXDUX A2, X, INCX2 LFXDUX B2, Y, INCY2 LFXDUX A3, X, INCX2 LFXDUX B3, Y, INCY2 fsmr A1, A2 fsmr B1, B2 fsmr A2, A3 fsmr B2, B3 STFPDUX A1, Y2, INCY2 STFPDUX B1, X2, INCX2 STFPDUX A2, Y2, INCY2 fpmr A1, A3 STFPDUX B2, X2, INCX2 fpmr B1, B3 .align 4 LL(27): andi. r0, N, 1 beq LL(29) LFXDUX A2, X, INCX2 LFXDUX B2, Y, INCY2 fsmr A1, A2 fsmr B1, B2 STFPDUX A1, Y2, INCY2 fpmr A1, A2 STFPDUX B1, X2, INCX2 fpmr B1, B2 .align 4 LL(29): LFSDX B1, Y, INCY2 STFDX A1, Y2, INCY2 STFPDX B1, X2, INCX2 b LL(999) .align 4 LL(30): /* X : unaligned Y : aligned */ andi. r0, Y, 2 * SIZE - 1 bne LL(40) LFXDUX A1, Y, INCY2 LFDX B1, X, INCX2 STFSDX A1, X2, INCX2 add X, X, INCX add X2, X2, INCX addi N, N, -1 cmpwi cr0, N, 0 ble LL(39) .align 4 srawi. r0, N, 2 mtspr CTR, r0 beq- LL(35) LFXDUX T1, Y, INCY2 LFXDUX T2, X, INCX2 LFXDUX T3, Y, INCY2 LFXDUX T4, X, INCX2 LFPDUX A4, Y, INCY2 fsmr A1, T1 LFPDUX B4, X, INCX2 fsmr B1, T2 LFPDUX A5, Y, INCY2 fsmr T1, T3 LFPDUX B5, X, INCX2 fsmr T2, T4 bdz LL(33) .align 4 LL(32): fxmr T5, A4 STFPDUX A1, X2, INCX2 fxmr T6, B4 STFPDUX B1, Y2, INCY2 fxmr A1, A5 STFPDUX T1, X2, INCX2 fxmr B1, B5 STFPDUX T2, Y2, INCY2 fsmr T3, T5 LFPDUX A2, Y, INCY2 fsmr T4, T6 LFPDUX B2, X, INCX2 fsmr T5, A1 LFPDUX A3, Y, INCY2 fsmr T6, B1 LFPDUX B3, X, INCX2 fxmr T1, A2 STFPDUX T3, X2, INCX2 fxmr T2, B2 STFPDUX T4, Y2, INCY2 fxmr T3, A3 STFPDUX T5, X2, INCX2 fxmr T4, B3 STFPDUX T6, Y2, INCY2 fsmr A1, T1 LFPDUX A4, Y, INCY2 fsmr B1, T2 LFPDUX B4, X, INCX2 fsmr T1, T3 LFPDUX A5, Y, INCY2 fsmr T2, T4 LFPDUX B5, X, INCX2 bdnz LL(32) .align 4 LL(33): fxmr T5, A4 STFPDUX A1, X2, INCX2 fxmr T6, B4 STFPDUX B1, Y2, INCY2 fxmr A1, A5 STFPDUX T1, X2, INCX2 fxmr B1, B5 STFPDUX T2, Y2, INCY2 fsmr T3, T5 fsmr T4, T6 fsmr T5, A1 fsmr T6, B1 STFPDUX T3, X2, INCX2 STFPDUX T4, Y2, INCY2 STFPDUX T5, X2, INCX2 STFPDUX T6, Y2, INCY2 .align 4 LL(35): andi. r0, N, 3 beq LL(39) andi. r0, N, 2 beq LL(37) LFXDUX A2, Y, INCY2 LFXDUX B2, X, INCX2 LFXDUX A3, Y, INCY2 LFXDUX B3, X, INCX2 fsmr A1, A2 fsmr B1, B2 fsmr A2, A3 fsmr B2, B3 STFPDUX A1, X2, INCX2 STFPDUX B1, Y2, INCY2 STFPDUX A2, X2, INCX2 fpmr A1, A3 STFPDUX B2, Y2, INCY2 fpmr B1, B3 .align 4 LL(37): andi. r0, N, 1 beq LL(39) LFXDUX A2, Y, INCY2 LFXDUX B2, X, INCX2 fsmr A1, A2 fsmr B1, B2 STFPDUX A1, X2, INCX2 fpmr A1, A2 STFPDUX B1, Y2, INCY2 fpmr B1, B2 .align 4 LL(39): LFSDX B1, X, INCX2 STFDX A1, X2, INCX2 STFPDX B1, Y2, INCY2 b LL(999) .align 4 LL(40): /* X : unaligned Y : unaligned */ LFDX A1, Y, INCY2 LFDX B1, X, INCX2 add X, X, INCX add Y, Y, INCY addi N, N, -1 cmpwi cr0, N, 0 STFDX A1, X2, INCX2 STFDX B1, Y2, INCY2 add X2, X2, INCX add Y2, Y2, INCY ble LL(49) srawi. r0, N, 2 mtspr CTR, r0 beq- LL(45) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 LFPDUX A3, X, INCX2 LFPDUX B3, Y, INCY2 LFPDUX A4, X, INCX2 LFPDUX B4, Y, INCY2 bdz LL(43) .align 4 LL(42): STFPDUX B1, X2, INCY2 LFPDUX B1, Y, INCY2 STFPDUX A1, Y2, INCY2 LFPDUX A1, X, INCX2 STFPDUX B2, X2, INCY2 LFPDUX B2, Y, INCY2 STFPDUX A2, Y2, INCY2 LFPDUX A2, X, INCX2 STFPDUX B3, X2, INCY2 LFPDUX B3, Y, INCY2 STFPDUX A3, Y2, INCY2 LFPDUX A3, X, INCX2 STFPDUX B4, X2, INCY2 LFPDUX B4, Y, INCY2 STFPDUX A4, Y2, INCY2 LFPDUX A4, X, INCX2 bdnz LL(42) .align 4 LL(43): STFPDUX B1, X2, INCY2 STFPDUX A1, Y2, INCY2 STFPDUX B2, X2, INCY2 STFPDUX A2, Y2, INCY2 STFPDUX B3, X2, INCY2 STFPDUX A3, Y2, INCY2 STFPDUX B4, X2, INCY2 STFPDUX A4, Y2, INCY2 .align 4 LL(45): andi. r0, N, 3 beq LL(49) andi. 
r0, N, 2 beq LL(46) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 LFPDUX A2, X, INCX2 LFPDUX B2, Y, INCY2 STFPDUX B1, X2, INCY2 STFPDUX A1, Y2, INCY2 STFPDUX B2, X2, INCY2 STFPDUX A2, Y2, INCY2 .align 4 LL(46): andi. r0, N, 1 beq LL(49) LFPDUX A1, X, INCX2 LFPDUX B1, Y, INCY2 STFPDUX B1, X2, INCY2 STFPDUX A1, Y2, INCY2 .align 4 LL(49): LFDX A1, Y, INCY2 LFDX B1, X, INCX2 STFDX A1, X2, INCX2 STFDX B1, Y2, INCY2 b LL(999) .align 4 LL(100): subi INCX2, INCX2, SIZE subi INCY2, INCY2, SIZE li INCX, SIZE li INCY, SIZE sub X, X, INCX2 sub Y, Y, INCY2 mr X2, X mr Y2, Y srawi. r0, N, 1 mtspr CTR, r0 beq- LL(115) LFDUX A1, X, INCX2 LFDUX B1, Y, INCY2 LFDUX A2, X, INCX LFDUX B2, Y, INCY LFDUX A3, X, INCX2 LFDUX B3, Y, INCY2 LFDUX A4, X, INCX LFDUX B4, Y, INCY bdz LL(113) .align 4 LL(112): STFDUX B1, X2, INCX2 LFDUX B1, Y, INCY2 STFDUX A1, Y2, INCY2 LFDUX A1, X, INCX2 STFDUX B2, X2, INCX LFDUX B2, Y, INCY STFDUX A2, Y2, INCY LFDUX A2, X, INCX STFDUX B3, X2, INCX2 LFDUX B3, Y, INCY2 STFDUX A3, Y2, INCY2 LFDUX A3, X, INCX2 STFDUX B4, X2, INCX LFDUX B4, Y, INCY STFDUX A4, Y2, INCY LFDUX A4, X, INCX bdnz LL(112) .align 4 LL(113): STFDUX B1, X2, INCX2 STFDUX A1, Y2, INCY2 STFDUX B2, X2, INCX STFDUX A2, Y2, INCY STFDUX B3, X2, INCX2 STFDUX A3, Y2, INCY2 STFDUX B4, X2, INCX STFDUX A4, Y2, INCY .align 4 LL(115): andi. r0, N, 1 beq LL(999) LFDUX A1, X, INCX2 LFDUX A2, X, INCX LFDUX B1, Y, INCY2 LFDUX B2, Y, INCY STFDUX B1, X2, INCX2 STFDUX B2, X2, INCX STFDUX A1, Y2, INCY2 STFDUX A2, Y2, INCY .align 4 LL(999): li r10, 16 addi SP, SP, -16 lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 addi SP, SP, 16 blr EPILOGUE OpenBLAS-0.2.20/kernel/power/zswap_microk_power8.c000066400000000000000000000121011313527062700217330ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
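zswap_hummer.S above selects one of four inner loops based on whether x and y are aligned for paired 16-byte accesses; the misaligned paths peel one double and then merge register halves (fsmr/fxmr) so the body can still use paired load/store. A hedged sketch of the dispatch follows; paired_aligned and zswap_hummer_dispatch_sketch are illustrative names, and the half-merging itself is not reproduced because the observable effect of every branch is the same plain swap.

/* Hedged sketch of the alignment dispatch in zswap_hummer.S. */
static int paired_aligned(const double *p)
{
    return (((unsigned long)(const void *)p) & (2 * sizeof(double) - 1)) == 0;
}

static void zswap_hummer_dispatch_sketch(long n, double *x, double *y)
{
    int xa = paired_aligned(x), ya = paired_aligned(y);

    if (xa && ya) {
        /* LL(10): paired load/store on both streams, unrolled by 4    */
    } else if (xa || ya) {
        /* LL(20)/LL(30): peel one double on the unaligned side, then
           merge register halves so paired accesses line up            */
    } else {
        /* LL(40): peel one double on both sides, then the paired loop */
    }

    for (long i = 0; i < 2 * n; i++) { /* net effect of every branch   */
        double t = x[i]; x[i] = y[i]; y[i] = t;
    }
}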
*****************************************************************************/ /************************************************************************************** * 2016/03/27 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #define HAVE_KERNEL_16 1 static void zswap_kernel_16 (long n, double *x, double *y) { __asm__ ( ".p2align 5 \n" "1: \n\t" "lxvd2x 32, 0, %4 \n\t" "lxvd2x 33, %5, %4 \n\t" "lxvd2x 34, %6, %4 \n\t" "lxvd2x 35, %7, %4 \n\t" "lxvd2x 36, %8, %4 \n\t" "lxvd2x 37, %9, %4 \n\t" "lxvd2x 38, %10, %4 \n\t" "lxvd2x 39, %11, %4 \n\t" "addi %4, %4, 128 \n\t" "lxvd2x 40, 0, %4 \n\t" "lxvd2x 41, %5, %4 \n\t" "lxvd2x 42, %6, %4 \n\t" "lxvd2x 43, %7, %4 \n\t" "lxvd2x 44, %8, %4 \n\t" "lxvd2x 45, %9, %4 \n\t" "lxvd2x 46, %10, %4 \n\t" "lxvd2x 47, %11, %4 \n\t" "addi %4, %4, -128 \n\t" "lxvd2x 48, 0, %3 \n\t" "lxvd2x 49, %5, %3 \n\t" "lxvd2x 50, %6, %3 \n\t" "lxvd2x 51, %7, %3 \n\t" "lxvd2x 0, %8, %3 \n\t" "lxvd2x 1, %9, %3 \n\t" "lxvd2x 2, %10, %3 \n\t" "lxvd2x 3, %11, %3 \n\t" "addi %3, %3, 128 \n\t" "lxvd2x 4, 0, %3 \n\t" "lxvd2x 5, %5, %3 \n\t" "lxvd2x 6, %6, %3 \n\t" "lxvd2x 7, %7, %3 \n\t" "lxvd2x 8, %8, %3 \n\t" "lxvd2x 9, %9, %3 \n\t" "lxvd2x 10, %10, %3 \n\t" "lxvd2x 11, %11, %3 \n\t" "addi %3, %3, -128 \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" "stxvd2x 34, %6, %3 \n\t" "stxvd2x 35, %7, %3 \n\t" "stxvd2x 36, %8, %3 \n\t" "stxvd2x 37, %9, %3 \n\t" "stxvd2x 38, %10, %3 \n\t" "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" "stxvd2x 40, 0, %3 \n\t" "stxvd2x 41, %5, %3 \n\t" "stxvd2x 42, %6, %3 \n\t" "stxvd2x 43, %7, %3 \n\t" "stxvd2x 44, %8, %3 \n\t" "stxvd2x 45, %9, %3 \n\t" "stxvd2x 46, %10, %3 \n\t" "stxvd2x 47, %11, %3 \n\t" "addi %3, %3, 128 \n\t" "stxvd2x 48, 0, %4 \n\t" "stxvd2x 49, %5, %4 \n\t" "stxvd2x 50, %6, %4 \n\t" "stxvd2x 51, %7, %4 \n\t" "stxvd2x 0, %8, %4 \n\t" "stxvd2x 1, %9, %4 \n\t" "stxvd2x 2, %10, %4 \n\t" "stxvd2x 3, %11, %4 \n\t" "addi %4, %4, 128 \n\t" "stxvd2x 4, 0, %4 \n\t" "stxvd2x 5, %5, %4 \n\t" "stxvd2x 6, %6, %4 \n\t" "stxvd2x 7, %7, %4 \n\t" "stxvd2x 8, %8, %4 \n\t" "stxvd2x 9, %9, %4 \n\t" "stxvd2x 10, %10, %4 \n\t" "stxvd2x 11, %11, %4 \n\t" "addi %4, %4, 128 \n\t" "addic. %2, %2, -16 \n\t" "bgt 1b \n" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : "+m" (*x), "+m" (*y), "+r" (n), // 2 "+b" (x), // 3 "+b" (y) // 4 : "b" (16), // 5 "b" (32), // 6 "b" (48), // 7 "b" (64), // 8 "b" (80), // 9 "b" (96), // 10 "b" (112) // 11 : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48","vs49","vs50","vs51","vs0","vs1","vs2","vs3", "vs4","vs5","vs6","vs7","vs8","vs9","vs10","vs11" ); } OpenBLAS-0.2.20/kernel/power/zsymv_L.S000066400000000000000000001006201313527062700173460ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 #define A r5 #define LDA r6 #define X r7 #define INCX r8 #define Y r9 #define INCY r10 #define BUFFER r14 #else #define M r3 #define N r4 #define A r7 #define LDA r8 #define X r9 #define INCX r10 #define Y r5 #define INCY r6 #define BUFFER r14 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define M r3 #define N r4 #define A r9 #define LDA r10 #define X r5 #define INCX r6 #define Y r7 #define INCY r8 #define BUFFER r14 #else #define M r3 #define N r4 #define A r7 #define LDA r8 #define X r9 #define INCX r10 #define Y r5 #define INCY r6 #define BUFFER r14 #endif #endif #define I r11 #define J r12 #define AO1 r15 #define AO2 r16 #define AO3 r17 #define AO4 r18 #define XX r19 #define YY r20 #define NEW_Y r21 #define TEMP r22 #define PREA r24 #define IS r25 #define y01 f0 #define y02 f1 #define y03 f2 #define y04 f3 #define y05 f4 #define y06 f5 #define y07 f6 #define y08 f7 #define xtemp1 f8 #define xtemp2 f9 #define xtemp3 f10 #define xtemp4 f11 #define xtemp5 f12 #define xtemp6 f13 #define xtemp7 f14 #define xtemp8 f15 #define atemp1 f16 #define atemp2 f17 #define atemp3 f18 #define atemp4 f19 #define xsum1 f20 #define xsum2 f21 #define xsum3 f22 #define xsum4 f23 #define a1 f24 #define a2 f25 #define a3 f26 #define a4 f27 #define a5 f28 #define a6 f29 #define a7 f30 #define a8 f31 #define alpha_r f1 #define alpha_i f2 #if defined(PPCG4) #define PREFETCHSIZE_A 24 #endif #if defined(PPC440) || defined(PPC440FP2) #define PREFETCHSIZE_A 24 #endif #ifdef PPC970 #define PREFETCHSIZE_A 32 #endif #ifdef CELL #define PREFETCHSIZE_A 72 #endif #ifdef POWER4 #define PREFETCHSIZE_A 16 #endif #ifdef POWER5 #define PREFETCHSIZE_A 96 #endif #ifdef POWER6 #define PREFETCHSIZE_A 112 #endif #if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) #define NOP1 #define NOP2 #else #define NOP1 mr LDA, LDA #define NOP2 mr INCX, INCX #endif #ifndef NEEDPARAM #ifndef __64BIT__ #define STACKSIZE 224 #define ALPHA_R 200(SP) #define ALPHA_I 208(SP) #define FZERO 216(SP) 
#else #define STACKSIZE 280 #define ALPHA_R 256(SP) #define ALPHA_I 264(SP) #define FZERO 272(SP) #endif #ifndef HEMV #define FMADD1 FNMSUB #define FMADD2 FMADD #else #define FMADD1 FMADD #define FMADD2 FNMSUB #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r0, FZERO std r14, 144(SP) std r15, 152(SP) std r16, 160(SP) std r17, 168(SP) std r18, 176(SP) std r19, 184(SP) std r20, 192(SP) std r21, 200(SP) std r22, 208(SP) std r23, 216(SP) std r24, 224(SP) std r25, 232(SP) std r26, 240(SP) std r27, 248(SP) #else stw r0, 0 + FZERO stw r0, 4 + FZERO stw r14, 144(SP) stw r15, 148(SP) stw r16, 152(SP) stw r17, 156(SP) stw r18, 160(SP) stw r19, 164(SP) stw r20, 168(SP) stw r21, 172(SP) stw r22, 176(SP) stw r23, 180(SP) stw r24, 184(SP) stw r25, 188(SP) stw r26, 192(SP) stw r27, 196(SP) #endif #ifdef linux #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else ld Y, FRAMESLOT(0) + STACKSIZE(SP) ld INCY, FRAMESLOT(1) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE lwz X, FRAMESLOT(0) + STACKSIZE(SP) lwz INCX, FRAMESLOT(1) + STACKSIZE(SP) lwz Y, FRAMESLOT(2) + STACKSIZE(SP) lwz INCY, FRAMESLOT(3) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(4) + STACKSIZE(SP) #else lwz Y, FRAMESLOT(0) + STACKSIZE(SP) lwz INCY, FRAMESLOT(1) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #else ld Y, FRAMESLOT(0) + STACKSIZE(SP) ld INCY, FRAMESLOT(1) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif STFD alpha_r, ALPHA_R STFD alpha_i, ALPHA_I slwi LDA, LDA, ZBASE_SHIFT slwi INCX, INCX, ZBASE_SHIFT slwi INCY, INCY, ZBASE_SHIFT li PREA, PREFETCHSIZE_A * SIZE cmpwi cr0, M, 0 ble- LL(999) cmpwi cr0, INCX, 2 * SIZE beq LL(05) mr XX, X mr X, BUFFER srawi. r0, M, 2 mtspr CTR, r0 ble LL(03) .align 4 LL(01): LFD a1, 0 * SIZE(XX) LFD a2, 1 * SIZE(XX) add XX, XX, INCX LFD a3, 0 * SIZE(XX) LFD a4, 1 * SIZE(XX) add XX, XX, INCX LFD a5, 0 * SIZE(XX) LFD a6, 1 * SIZE(XX) add XX, XX, INCX LFD a7, 0 * SIZE(XX) LFD a8, 1 * SIZE(XX) add XX, XX, INCX dcbt XX, PREA dcbtst BUFFER, PREA STFD a1, 0 * SIZE(BUFFER) STFD a2, 1 * SIZE(BUFFER) STFD a3, 2 * SIZE(BUFFER) STFD a4, 3 * SIZE(BUFFER) STFD a5, 4 * SIZE(BUFFER) STFD a6, 5 * SIZE(BUFFER) STFD a7, 6 * SIZE(BUFFER) STFD a8, 7 * SIZE(BUFFER) addi BUFFER, BUFFER, 8 * SIZE bdnz LL(01) .align 4 LL(03): andi. r0, M, 3 mtspr CTR, r0 ble LL(05) .align 4 LL(04): LFD a1, 0 * SIZE(XX) LFD a2, 1 * SIZE(XX) add XX, XX, INCX STFD a1, 0 * SIZE(BUFFER) STFD a2, 1 * SIZE(BUFFER) addi BUFFER, BUFFER, 2 * SIZE bdnz LL(04) .align 4 LL(05): mr NEW_Y, Y lfd f0, FZERO cmpwi cr0, INCY, 2 * SIZE beq LL(10) mr NEW_Y, BUFFER addi r0, M, 3 srawi. 
r0, r0, 2 mtspr CTR, r0 .align 4 LL(06): STFD f0, 0 * SIZE(BUFFER) STFD f0, 1 * SIZE(BUFFER) STFD f0, 2 * SIZE(BUFFER) STFD f0, 3 * SIZE(BUFFER) STFD f0, 4 * SIZE(BUFFER) STFD f0, 5 * SIZE(BUFFER) STFD f0, 6 * SIZE(BUFFER) STFD f0, 7 * SIZE(BUFFER) addi BUFFER, BUFFER, 8 * SIZE bdnz LL(06) .align 4 LL(10): li IS, 0 cmpwi cr0, N, 2 blt LL(20) .align 4 LL(11): mr AO1, A slwi TEMP, IS, ZBASE_SHIFT add AO2, A, LDA add XX, X, TEMP add A, AO2, LDA add YY, NEW_Y, TEMP addi A, A, 4 * SIZE NOP2 LFD y05, ALPHA_R LFD y06, ALPHA_I LFD atemp1, 0 * SIZE(XX) LFD atemp2, 1 * SIZE(XX) LFD atemp3, 2 * SIZE(XX) LFD atemp4, 3 * SIZE(XX) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a7, 2 * SIZE(AO2) LFD a8, 3 * SIZE(AO2) FMUL xsum1, atemp1, a1 addi AO2, AO2, 4 * SIZE FMUL xsum2, atemp2, a1 LFD a1, 4 * SIZE(AO1) FMUL xsum3, atemp1, a3 addi AO1, AO1, 4 * SIZE FMUL xsum4, atemp2, a3 LFD a5, 0 * SIZE(AO2) #ifndef HEMV FNMSUB xsum1, atemp2, a2, xsum1 #endif addi XX, XX, 4 * SIZE #ifndef HEMV FMADD xsum2, atemp1, a2, xsum2 #endif LFD a2, 1 * SIZE(AO1) FNMSUB xsum3, atemp2, a4, xsum3 addi YY, YY, 4 * SIZE FMADD xsum4, atemp1, a4, xsum4 LFD a6, 1 * SIZE(AO2) FMADD xsum1, atemp3, a3, xsum1 sub TEMP, M, IS FMADD xsum2, atemp4, a3, xsum2 LFD a3, 2 * SIZE(AO1) FMADD xsum3, atemp3, a7, xsum3 addi TEMP, TEMP, -2 FMADD xsum4, atemp4, a7, xsum4 LFD a7, 2 * SIZE(AO2) FMADD1 xsum1, atemp4, a4, xsum1 srawi. r0, TEMP, 3 FMADD2 xsum2, atemp3, a4, xsum2 LFD a4, 3 * SIZE(AO1) #ifndef HEMV FMADD1 xsum3, atemp4, a8, xsum3 #endif mtspr CTR, r0 #ifndef HEMV FMADD2 xsum4, atemp3, a8, xsum4 #endif LFD a8, 3 * SIZE(AO2) FMUL xtemp1, y05, atemp1 LFD y01, 0 * SIZE(YY) FMUL xtemp2, y06, atemp1 LFD y02, 1 * SIZE(YY) FMUL xtemp3, y05, atemp3 LFD y03, 2 * SIZE(YY) FMUL xtemp4, y06, atemp3 LFD y04, 3 * SIZE(YY) FNMSUB atemp1, y06, atemp2, xtemp1 LFD xtemp1, 0 * SIZE(XX) FMADD atemp2, y05, atemp2, xtemp2 LFD xtemp2, 1 * SIZE(XX) FNMSUB atemp3, y06, atemp4, xtemp3 LFD xtemp3, 2 * SIZE(XX) FMADD atemp4, y05, atemp4, xtemp4 LFD xtemp4, 3 * SIZE(XX) NOP1 ble LL(15) FMADD xsum1, xtemp1, a1, xsum1 DCBT(AO1, PREA) FMADD y01, atemp1, a1, y01 NOP2 FMADD xsum2, xtemp2, a1, xsum2 NOP1 FMADD y02, atemp2, a1, y02 LFD a1, 4 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 NOP1 FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp2, a5, xsum4 NOP1 FMADD y04, atemp2, a3, y04 NOP2 FMADD1 xsum1, xtemp2, a2, xsum1 LFD y05, 4 * SIZE(YY) FNMSUB y01, atemp2, a2, y01 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y06, 5 * SIZE(YY) FMADD y02, atemp1, a2, y02 LFD a2, 5 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 5 * SIZE(XX) FNMSUB y03, atemp2, a4, y03 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 4 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD y07, 6 * SIZE(YY) FMADD y01, atemp3, a5, y01 NOP2 FMADD xsum2, xtemp4, a3, xsum2 LFD a3, 6 * SIZE(AO1) FMADD y02, atemp4, a5, y02 LFD a5, 4 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 LFD y08, 7 * SIZE(YY) FMADD y03, atemp3, a7, y03 NOP2 FMADD xsum4, xtemp4, a7, xsum4 NOP1 FMADD y04, atemp4, a7, y04 LFD a7, 6 * SIZE(AO2) FMADD1 xsum1, xtemp4, a4, xsum1 NOP1 FNMSUB y01, atemp4, a6, y01 # DCBT(X, PREX) NOP2 FMADD2 xsum2, xtemp3, a4, xsum2 LFD a4, 7 * SIZE(AO1) FMADD y02, atemp3, a6, y02 LFD a6, 5 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, xsum3 LFD xtemp4, 7 * SIZE(XX) FNMSUB y03, atemp4, a8, y03 NOP2 FMADD2 xsum4, xtemp3, a8, xsum4 LFD xtemp3, 6 * SIZE(XX) FMADD y04, atemp3, a8, y04 LFD a8, 7 * SIZE(AO2) FMADD xsum1, xtemp1, a1, xsum1 STFD y01, 0 * SIZE(YY) 
FMADD y05, atemp1, a1, y05 NOP2 FMADD xsum2, xtemp2, a1, xsum2 STFD y02, 1 * SIZE(YY) FMADD y06, atemp2, a1, y06 LFD a1, 8 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 STFD y03, 2 * SIZE(YY) FMADD y07, atemp1, a3, y07 NOP2 FMADD xsum4, xtemp2, a5, xsum4 STFD y04, 3 * SIZE(YY) FMADD y08, atemp2, a3, y08 NOP2 FMADD1 xsum1, xtemp2, a2, xsum1 LFD y01, 8 * SIZE(YY) FNMSUB y05, atemp2, a2, y05 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y02, 9 * SIZE(YY) FMADD y06, atemp1, a2, y06 LFD a2, 9 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 9 * SIZE(XX) FNMSUB y07, atemp2, a4, y07 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 8 * SIZE(XX) FMADD y08, atemp1, a4, y08 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD y03, 10 * SIZE(YY) FMADD y05, atemp3, a5, y05 NOP2 FMADD xsum2, xtemp4, a3, xsum2 LFD a3, 10 * SIZE(AO1) FMADD y06, atemp4, a5, y06 LFD a5, 8 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 LFD y04, 11 * SIZE(YY) FMADD y07, atemp3, a7, y07 NOP2 FMADD xsum4, xtemp4, a7, xsum4 NOP1 FMADD y08, atemp4, a7, y08 LFD a7, 10 * SIZE(AO2) FMADD1 xsum1, xtemp4, a4, xsum1 NOP1 FNMSUB y05, atemp4, a6, y05 NOP2 FMADD2 xsum2, xtemp3, a4, xsum2 LFD a4, 11 * SIZE(AO1) FMADD y06, atemp3, a6, y06 LFD a6, 9 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, xsum3 LFD xtemp4, 11 * SIZE(XX) FNMSUB y07, atemp4, a8, y07 bdz LL(13) .align 4 LL(12): FMADD2 xsum4, xtemp3, a8, xsum4 LFD xtemp3, 10 * SIZE(XX) FMADD y08, atemp3, a8, y08 LFD a8, 11 * SIZE(AO2) FMADD xsum1, xtemp1, a1, xsum1 STFD y05, 4 * SIZE(YY) FMADD y01, atemp1, a1, y01 DCBT(AO2, PREA) FMADD xsum2, xtemp2, a1, xsum2 STFD y06, 5 * SIZE(YY) FMADD y02, atemp2, a1, y02 LFD a1, 12 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 STFD y07, 6 * SIZE(YY) FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp2, a5, xsum4 STFD y08, 7 * SIZE(YY) FMADD y04, atemp2, a3, y04 NOP2 FMADD1 xsum1, xtemp2, a2, xsum1 LFD y05, 12 * SIZE(YY) FNMSUB y01, atemp2, a2, y01 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y06, 13 * SIZE(YY) FMADD y02, atemp1, a2, y02 LFD a2, 13 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 13 * SIZE(XX) FNMSUB y03, atemp2, a4, y03 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 12 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD y07, 14 * SIZE(YY) FMADD y01, atemp3, a5, y01 NOP2 FMADD xsum2, xtemp4, a3, xsum2 LFD a3, 14 * SIZE(AO1) FMADD y02, atemp4, a5, y02 LFD a5, 12 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 LFD y08, 15 * SIZE(YY) FMADD y03, atemp3, a7, y03 NOP2 FMADD xsum4, xtemp4, a7, xsum4 NOP1 FMADD y04, atemp4, a7, y04 LFD a7, 14 * SIZE(AO2) FMADD1 xsum1, xtemp4, a4, xsum1 NOP1 FNMSUB y01, atemp4, a6, y01 # DCBT(Y1, PREY) NOP2 FMADD2 xsum2, xtemp3, a4, xsum2 LFD a4, 15 * SIZE(AO1) FMADD y02, atemp3, a6, y02 LFD a6, 13 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, xsum3 LFD xtemp4, 15 * SIZE(XX) FNMSUB y03, atemp4, a8, y03 NOP2 FMADD2 xsum4, xtemp3, a8, xsum4 LFD xtemp3, 14 * SIZE(XX) FMADD y04, atemp3, a8, y04 LFD a8, 15 * SIZE(AO2) FMADD xsum1, xtemp1, a1, xsum1 STFD y01, 8 * SIZE(YY) FMADD y05, atemp1, a1, y05 NOP2 FMADD xsum2, xtemp2, a1, xsum2 STFD y02, 9 * SIZE(YY) FMADD y06, atemp2, a1, y06 LFD a1, 16 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 STFD y03, 10 * SIZE(YY) FMADD y07, atemp1, a3, y07 NOP2 FMADD xsum4, xtemp2, a5, xsum4 STFD y04, 11 * SIZE(YY) FMADD y08, atemp2, a3, y08 NOP2 FMADD1 xsum1, xtemp2, a2, xsum1 LFD y01, 16 * SIZE(YY) FNMSUB y05, atemp2, a2, y05 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y02, 17 * SIZE(YY) FMADD y06, atemp1, a2, y06 LFD a2, 17 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 17 * SIZE(XX) FNMSUB 
y07, atemp2, a4, y07 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 16 * SIZE(XX) FMADD y08, atemp1, a4, y08 addi AO2, AO2, 16 * SIZE FMADD xsum1, xtemp3, a3, xsum1 LFD y03, 18 * SIZE(YY) FMADD y05, atemp3, a5, y05 addi XX, XX, 16 * SIZE FMADD xsum2, xtemp4, a3, xsum2 LFD a3, 18 * SIZE(AO1) FMADD y06, atemp4, a5, y06 LFD a5, 0 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 LFD y04, 19 * SIZE(YY) FMADD y07, atemp3, a7, y07 NOP2 FMADD xsum4, xtemp4, a7, xsum4 addi AO1, AO1, 16 * SIZE FMADD y08, atemp4, a7, y08 LFD a7, 2 * SIZE(AO2) FMADD1 xsum1, xtemp4, a4, xsum1 addi YY, YY, 16 * SIZE FNMSUB y05, atemp4, a6, y05 NOP2 FMADD2 xsum2, xtemp3, a4, xsum2 LFD a4, 3 * SIZE(AO1) FMADD y06, atemp3, a6, y06 LFD a6, 1 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, xsum3 LFD xtemp4, 3 * SIZE(XX) FNMSUB y07, atemp4, a8, y07 NOP2 FMADD2 xsum4, xtemp3, a8, xsum4 LFD xtemp3, 2 * SIZE(XX) FMADD y08, atemp3, a8, y08 LFD a8, 3 * SIZE(AO2) FMADD xsum1, xtemp1, a1, xsum1 STFD y05, -4 * SIZE(YY) FMADD y01, atemp1, a1, y01 DCBT(AO1, PREA) FMADD xsum2, xtemp2, a1, xsum2 STFD y06, -3 * SIZE(YY) FMADD y02, atemp2, a1, y02 LFD a1, 4 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 STFD y07, -2 * SIZE(YY) FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp2, a5, xsum4 STFD y08, -1 * SIZE(YY) FMADD y04, atemp2, a3, y04 NOP2 FMADD1 xsum1, xtemp2, a2, xsum1 LFD y05, 4 * SIZE(YY) FNMSUB y01, atemp2, a2, y01 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y06, 5 * SIZE(YY) FMADD y02, atemp1, a2, y02 LFD a2, 5 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 5 * SIZE(XX) FNMSUB y03, atemp2, a4, y03 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 4 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD y07, 6 * SIZE(YY) FMADD y01, atemp3, a5, y01 NOP2 FMADD xsum2, xtemp4, a3, xsum2 LFD a3, 6 * SIZE(AO1) FMADD y02, atemp4, a5, y02 LFD a5, 4 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 LFD y08, 7 * SIZE(YY) FMADD y03, atemp3, a7, y03 NOP2 FMADD xsum4, xtemp4, a7, xsum4 NOP1 FMADD y04, atemp4, a7, y04 LFD a7, 6 * SIZE(AO2) FMADD1 xsum1, xtemp4, a4, xsum1 NOP1 FNMSUB y01, atemp4, a6, y01 # DCBT(X, PREX) NOP2 FMADD2 xsum2, xtemp3, a4, xsum2 LFD a4, 7 * SIZE(AO1) FMADD y02, atemp3, a6, y02 LFD a6, 5 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, xsum3 LFD xtemp4, 7 * SIZE(XX) FNMSUB y03, atemp4, a8, y03 NOP2 FMADD2 xsum4, xtemp3, a8, xsum4 LFD xtemp3, 6 * SIZE(XX) FMADD y04, atemp3, a8, y04 LFD a8, 7 * SIZE(AO2) FMADD xsum1, xtemp1, a1, xsum1 STFD y01, 0 * SIZE(YY) FMADD y05, atemp1, a1, y05 NOP2 FMADD xsum2, xtemp2, a1, xsum2 STFD y02, 1 * SIZE(YY) FMADD y06, atemp2, a1, y06 LFD a1, 8 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 STFD y03, 2 * SIZE(YY) FMADD y07, atemp1, a3, y07 NOP2 FMADD xsum4, xtemp2, a5, xsum4 STFD y04, 3 * SIZE(YY) FMADD y08, atemp2, a3, y08 NOP2 FMADD1 xsum1, xtemp2, a2, xsum1 LFD y01, 8 * SIZE(YY) FNMSUB y05, atemp2, a2, y05 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y02, 9 * SIZE(YY) FMADD y06, atemp1, a2, y06 LFD a2, 9 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 9 * SIZE(XX) FNMSUB y07, atemp2, a4, y07 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 8 * SIZE(XX) FMADD y08, atemp1, a4, y08 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD y03, 10 * SIZE(YY) FMADD y05, atemp3, a5, y05 NOP2 FMADD xsum2, xtemp4, a3, xsum2 LFD a3, 10 * SIZE(AO1) FMADD y06, atemp4, a5, y06 LFD a5, 8 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 LFD y04, 11 * SIZE(YY) FMADD y07, atemp3, a7, y07 NOP2 FMADD xsum4, xtemp4, a7, xsum4 NOP1 FMADD y08, atemp4, a7, y08 LFD a7, 10 * SIZE(AO2) FMADD1 xsum1, xtemp4, a4, xsum1 NOP1 FNMSUB y05, 
atemp4, a6, y05 NOP2 FMADD2 xsum2, xtemp3, a4, xsum2 LFD a4, 11 * SIZE(AO1) FMADD y06, atemp3, a6, y06 LFD a6, 9 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, xsum3 LFD xtemp4, 11 * SIZE(XX) FNMSUB y07, atemp4, a8, y07 bdnz LL(12) .align 4 LL(13): FMADD2 xsum4, xtemp3, a8, xsum4 LFD xtemp3, 10 * SIZE(XX) FMADD y08, atemp3, a8, y08 LFD a8, 11 * SIZE(AO2) FMADD xsum1, xtemp1, a1, xsum1 STFD y05, 4 * SIZE(YY) FMADD y01, atemp1, a1, y01 NOP2 FMADD xsum2, xtemp2, a1, xsum2 STFD y06, 5 * SIZE(YY) FMADD y02, atemp2, a1, y02 LFD a1, 12 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 STFD y07, 6 * SIZE(YY) FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp2, a5, xsum4 STFD y08, 7 * SIZE(YY) FMADD y04, atemp2, a3, y04 NOP2 FMADD1 xsum1, xtemp2, a2, xsum1 LFD y05, 12 * SIZE(YY) FNMSUB y01, atemp2, a2, y01 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y06, 13 * SIZE(YY) FMADD y02, atemp1, a2, y02 LFD a2, 13 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 13 * SIZE(XX) FNMSUB y03, atemp2, a4, y03 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 12 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD y07, 14 * SIZE(YY) FMADD y01, atemp3, a5, y01 NOP2 FMADD xsum2, xtemp4, a3, xsum2 LFD a3, 14 * SIZE(AO1) FMADD y02, atemp4, a5, y02 LFD a5, 12 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 LFD y08, 15 * SIZE(YY) FMADD y03, atemp3, a7, y03 NOP2 FMADD xsum4, xtemp4, a7, xsum4 NOP1 FMADD y04, atemp4, a7, y04 LFD a7, 14 * SIZE(AO2) FMADD1 xsum1, xtemp4, a4, xsum1 NOP1 FNMSUB y01, atemp4, a6, y01 NOP2 FMADD2 xsum2, xtemp3, a4, xsum2 LFD a4, 15 * SIZE(AO1) FMADD y02, atemp3, a6, y02 LFD a6, 13 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, xsum3 LFD xtemp4, 15 * SIZE(XX) FNMSUB y03, atemp4, a8, y03 NOP2 FMADD2 xsum4, xtemp3, a8, xsum4 LFD xtemp3, 14 * SIZE(XX) FMADD y04, atemp3, a8, y04 LFD a8, 15 * SIZE(AO2) FMADD xsum1, xtemp1, a1, xsum1 STFD y01, 8 * SIZE(YY) FMADD y05, atemp1, a1, y05 NOP2 FMADD xsum2, xtemp2, a1, xsum2 STFD y02, 9 * SIZE(YY) FMADD y06, atemp2, a1, y06 LFD a1, 16 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 STFD y03, 10 * SIZE(YY) FMADD y07, atemp1, a3, y07 NOP2 FMADD xsum4, xtemp2, a5, xsum4 STFD y04, 11 * SIZE(YY) FMADD y08, atemp2, a3, y08 NOP2 FMADD1 xsum1, xtemp2, a2, xsum1 LFD y01, 16 * SIZE(YY) FNMSUB y05, atemp2, a2, y05 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y02, 17 * SIZE(YY) FMADD y06, atemp1, a2, y06 LFD a2, 17 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 17 * SIZE(XX) FNMSUB y07, atemp2, a4, y07 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 16 * SIZE(XX) FMADD y08, atemp1, a4, y08 addi AO2, AO2, 16 * SIZE FMADD xsum1, xtemp3, a3, xsum1 LFD y03, 18 * SIZE(YY) FMADD y05, atemp3, a5, y05 addi XX, XX, 16 * SIZE FMADD xsum2, xtemp4, a3, xsum2 LFD a3, 18 * SIZE(AO1) FMADD y06, atemp4, a5, y06 LFD a5, 0 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 LFD y04, 19 * SIZE(YY) FMADD y07, atemp3, a7, y07 NOP2 FMADD xsum4, xtemp4, a7, xsum4 addi AO1, AO1, 16 * SIZE FMADD y08, atemp4, a7, y08 LFD a7, 2 * SIZE(AO2) FMADD1 xsum1, xtemp4, a4, xsum1 addi YY, YY, 16 * SIZE FNMSUB y05, atemp4, a6, y05 NOP2 FMADD2 xsum2, xtemp3, a4, xsum2 LFD a4, 3 * SIZE(AO1) FMADD y06, atemp3, a6, y06 LFD a6, 1 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, xsum3 LFD xtemp4, 3 * SIZE(XX) FNMSUB y07, atemp4, a8, y07 NOP2 FMADD2 xsum4, xtemp3, a8, xsum4 LFD xtemp3, 2 * SIZE(XX) FMADD y08, atemp3, a8, y08 LFD a8, 3 * SIZE(AO2) STFD y05, -4 * SIZE(YY) STFD y06, -3 * SIZE(YY) STFD y07, -2 * SIZE(YY) STFD y08, -1 * SIZE(YY) .align 4 LL(15): andi. 
r0, TEMP, 4 ble LL(16) FMADD xsum1, xtemp1, a1, xsum1 NOP1 FMADD y01, atemp1, a1, y01 NOP2 FMADD xsum2, xtemp2, a1, xsum2 NOP1 FMADD y02, atemp2, a1, y02 LFD a1, 4 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 NOP1 FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp2, a5, xsum4 NOP1 FMADD y04, atemp2, a3, y04 NOP2 FMADD1 xsum1, xtemp2, a2, xsum1 LFD y05, 4 * SIZE(YY) FNMSUB y01, atemp2, a2, y01 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y06, 5 * SIZE(YY) FMADD y02, atemp1, a2, y02 LFD a2, 5 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 5 * SIZE(XX) FNMSUB y03, atemp2, a4, y03 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 4 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD y07, 6 * SIZE(YY) FMADD y01, atemp3, a5, y01 NOP2 FMADD xsum2, xtemp4, a3, xsum2 LFD a3, 6 * SIZE(AO1) FMADD y02, atemp4, a5, y02 LFD a5, 4 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 LFD y08, 7 * SIZE(YY) FMADD y03, atemp3, a7, y03 NOP2 FMADD xsum4, xtemp4, a7, xsum4 NOP1 FMADD y04, atemp4, a7, y04 LFD a7, 6 * SIZE(AO2) FMADD1 xsum1, xtemp4, a4, xsum1 NOP1 FNMSUB y01, atemp4, a6, y01 NOP2 FMADD2 xsum2, xtemp3, a4, xsum2 LFD a4, 7 * SIZE(AO1) FMADD y02, atemp3, a6, y02 LFD a6, 5 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, xsum3 LFD xtemp4, 7 * SIZE(XX) FNMSUB y03, atemp4, a8, y03 NOP2 FMADD2 xsum4, xtemp3, a8, xsum4 LFD xtemp3, 6 * SIZE(XX) FMADD y04, atemp3, a8, y04 LFD a8, 7 * SIZE(AO2) FMADD xsum1, xtemp1, a1, xsum1 STFD y01, 0 * SIZE(YY) FMADD y05, atemp1, a1, y05 NOP2 FMADD xsum2, xtemp2, a1, xsum2 STFD y02, 1 * SIZE(YY) FMADD y06, atemp2, a1, y06 LFD a1, 8 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 STFD y03, 2 * SIZE(YY) FMADD y07, atemp1, a3, y07 NOP2 FMADD xsum4, xtemp2, a5, xsum4 STFD y04, 3 * SIZE(YY) FMADD y08, atemp2, a3, y08 NOP2 FMADD1 xsum1, xtemp2, a2, xsum1 LFD y01, 8 * SIZE(YY) FNMSUB y05, atemp2, a2, y05 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y02, 9 * SIZE(YY) FMADD y06, atemp1, a2, y06 LFD a2, 9 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 9 * SIZE(XX) FNMSUB y07, atemp2, a4, y07 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 8 * SIZE(XX) FMADD y08, atemp1, a4, y08 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD y03, 10 * SIZE(YY) FMADD y05, atemp3, a5, y05 NOP2 FMADD xsum2, xtemp4, a3, xsum2 LFD a3, 10 * SIZE(AO1) FMADD y06, atemp4, a5, y06 LFD a5, 8 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 LFD y04, 11 * SIZE(YY) FMADD y07, atemp3, a7, y07 NOP2 FMADD xsum4, xtemp4, a7, xsum4 addi YY, YY, 8 * SIZE FMADD y08, atemp4, a7, y08 LFD a7, 10 * SIZE(AO2) FMADD1 xsum1, xtemp4, a4, xsum1 addi AO2, AO2, 8 * SIZE FNMSUB y05, atemp4, a6, y05 addi XX, XX, 8 * SIZE FMADD2 xsum2, xtemp3, a4, xsum2 LFD a4, 11 * SIZE(AO1) FMADD y06, atemp3, a6, y06 LFD a6, 1 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, xsum3 LFD xtemp4, 3 * SIZE(XX) FNMSUB y07, atemp4, a8, y07 addi AO1, AO1, 8 * SIZE FMADD2 xsum4, xtemp3, a8, xsum4 LFD xtemp3, 2 * SIZE(XX) FMADD y08, atemp3, a8, y08 LFD a8, 3 * SIZE(AO2) STFD y05, -4 * SIZE(YY) STFD y06, -3 * SIZE(YY) STFD y07, -2 * SIZE(YY) STFD y08, -1 * SIZE(YY) .align 4 LL(16): andi. 
r0, TEMP, 2 ble LL(17) FMADD xsum1, xtemp1, a1, xsum1 NOP1 FMADD y01, atemp1, a1, y01 NOP2 FMADD xsum2, xtemp2, a1, xsum2 NOP1 FMADD y02, atemp2, a1, y02 LFD a1, 4 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 FMADD y03, atemp1, a3, y03 FMADD xsum4, xtemp2, a5, xsum4 FMADD y04, atemp2, a3, y04 FMADD1 xsum1, xtemp2, a2, xsum1 NOP1 FNMSUB y01, atemp2, a2, y01 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 NOP1 FMADD y02, atemp1, a2, y02 LFD a2, 5 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 5 * SIZE(XX) FNMSUB y03, atemp2, a4, y03 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 4 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 NOP1 FMADD y01, atemp3, a5, y01 NOP2 FMADD xsum2, xtemp4, a3, xsum2 NOP1 FMADD y02, atemp4, a5, y02 LFD a5, 4 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 FMADD y03, atemp3, a7, y03 FMADD xsum4, xtemp4, a7, xsum4 FMADD y04, atemp4, a7, y04 FMADD1 xsum1, xtemp4, a4, xsum1 NOP1 FNMSUB y01, atemp4, a6, y01 NOP2 FMADD2 xsum2, xtemp3, a4, xsum2 NOP1 FMADD y02, atemp3, a6, y02 LFD a6, 5 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, xsum3 addi AO1, AO1, 4 * SIZE FNMSUB y03, atemp4, a8, y03 addi AO2, AO2, 4 * SIZE FMADD2 xsum4, xtemp3, a8, xsum4 addi YY, YY, 4 * SIZE FMADD y04, atemp3, a8, y04 NOP2 STFD y01, -4 * SIZE(YY) LFD y01, 0 * SIZE(YY) STFD y02, -3 * SIZE(YY) LFD y02, 1 * SIZE(YY) STFD y03, -2 * SIZE(YY) STFD y04, -1 * SIZE(YY) .align 4 LL(17): andi. r0, M, 1 ble LL(18) FMADD xsum1, xtemp1, a1, xsum1 FMADD y01, atemp1, a1, y01 FMADD xsum2, xtemp2, a1, xsum2 FMADD y02, atemp2, a1, y02 FMADD xsum3, xtemp1, a5, xsum3 FNMSUB y01, atemp2, a2, y01 FMADD xsum4, xtemp2, a5, xsum4 FMADD y02, atemp1, a2, y02 FMADD1 xsum1, xtemp2, a2, xsum1 FMADD y01, atemp3, a5, y01 FMADD2 xsum2, xtemp1, a2, xsum2 FMADD y02, atemp4, a5, y02 FMADD1 xsum3, xtemp2, a6, xsum3 FNMSUB y01, atemp4, a6, y01 FMADD2 xsum4, xtemp1, a6, xsum4 FMADD y02, atemp3, a6, y02 STFD y01, 0 * SIZE(YY) STFD y02, 1 * SIZE(YY) STFD y03, 2 * SIZE(YY) STFD y04, 3 * SIZE(YY) .align 4 LL(18): LFD y05, ALPHA_R LFD y06, ALPHA_I slwi TEMP, IS, ZBASE_SHIFT add YY, NEW_Y, TEMP LFD y01, 0 * SIZE(YY) LFD y02, 1 * SIZE(YY) LFD y03, 2 * SIZE(YY) LFD y04, 3 * SIZE(YY) FMUL xtemp1, y05, xsum1 FMUL xtemp2, y06, xsum1 FMUL xtemp3, y05, xsum3 FMUL xtemp4, y06, xsum3 FNMSUB xsum1, y06, xsum2, xtemp1 FMADD xsum2, y05, xsum2, xtemp2 FNMSUB xsum3, y06, xsum4, xtemp3 FMADD xsum4, y05, xsum4, xtemp4 FADD y01, y01, xsum1 FADD y02, y02, xsum2 FADD y03, y03, xsum3 FADD y04, y04, xsum4 STFD y01, 0 * SIZE(YY) addi TEMP, IS, 4 STFD y02, 1 * SIZE(YY) addi IS, IS, 2 STFD y03, 2 * SIZE(YY) cmpw cr0, TEMP, N STFD y04, 3 * SIZE(YY) ble LL(11) .align 4 LL(20): andi. TEMP, N, 1 ble LL(990) slwi TEMP, IS, ZBASE_SHIFT add XX, X, TEMP add YY, NEW_Y, TEMP LFD y05, ALPHA_R LFD y06, ALPHA_I LFD atemp1, 0 * SIZE(XX) LFD atemp2, 1 * SIZE(XX) LFD a1, 0 * SIZE(A) LFD a2, 1 * SIZE(A) FMUL xsum1, atemp1, a1 FMUL xsum2, atemp2, a1 #ifndef HEMV FNMSUB xsum1, atemp2, a2, xsum1 FMADD xsum2, atemp1, a2, xsum2 #endif FMUL xtemp1, y05, atemp1 FMUL xtemp2, y06, atemp1 FNMSUB atemp1, y06, atemp2, xtemp1 FMADD atemp2, y05, atemp2, xtemp2 LFD y05, ALPHA_R LFD y06, ALPHA_I LFD y01, 0 * SIZE(YY) LFD y02, 1 * SIZE(YY) FMUL xtemp1, y05, xsum1 FMUL xtemp2, y06, xsum1 FNMSUB xsum1, y06, xsum2, xtemp1 FMADD xsum2, y05, xsum2, xtemp2 FADD y01, y01, xsum1 FADD y02, y02, xsum2 STFD y01, 0 * SIZE(YY) STFD y02, 1 * SIZE(YY) .align 4 LL(990): cmpwi cr0, INCY, 2 * SIZE beq LL(999) mr YY, Y srawi. 
r0, M, 2 mtspr CTR, r0 ble LL(995) .align 4 LL(991): LFD f0, 0 * SIZE(Y) LFD f1, 1 * SIZE(Y) add Y, Y, INCY LFD f2, 0 * SIZE(Y) LFD f3, 1 * SIZE(Y) add Y, Y, INCY LFD f4, 0 * SIZE(Y) LFD f5, 1 * SIZE(Y) add Y, Y, INCY LFD f6, 0 * SIZE(Y) LFD f7, 1 * SIZE(Y) add Y, Y, INCY LFD f8, 0 * SIZE(NEW_Y) LFD f9, 1 * SIZE(NEW_Y) LFD f10, 2 * SIZE(NEW_Y) LFD f11, 3 * SIZE(NEW_Y) LFD f12, 4 * SIZE(NEW_Y) LFD f13, 5 * SIZE(NEW_Y) LFD f14, 6 * SIZE(NEW_Y) LFD f15, 7 * SIZE(NEW_Y) addi NEW_Y, NEW_Y, 8 * SIZE FADD f8, f8, f0 FADD f9, f9, f1 FADD f10, f10, f2 FADD f11, f11, f3 FADD f12, f12, f4 FADD f13, f13, f5 FADD f14, f14, f6 FADD f15, f15, f7 STFD f8, 0 * SIZE(YY) STFD f9, 1 * SIZE(YY) add YY, YY, INCY STFD f10, 0 * SIZE(YY) STFD f11, 1 * SIZE(YY) add YY, YY, INCY STFD f12, 0 * SIZE(YY) STFD f13, 1 * SIZE(YY) add YY, YY, INCY STFD f14, 0 * SIZE(YY) STFD f15, 1 * SIZE(YY) add YY, YY, INCY bdnz LL(991) .align 4 LL(995): andi. J, M, 2 ble LL(996) LFD f0, 0 * SIZE(Y) LFD f1, 1 * SIZE(Y) add Y, Y, INCY LFD f2, 0 * SIZE(Y) LFD f3, 1 * SIZE(Y) add Y, Y, INCY LFD f8, 0 * SIZE(NEW_Y) LFD f9, 1 * SIZE(NEW_Y) LFD f10, 2 * SIZE(NEW_Y) LFD f11, 3 * SIZE(NEW_Y) addi NEW_Y, NEW_Y, 4 * SIZE FADD f8, f8, f0 FADD f9, f9, f1 FADD f10, f10, f2 FADD f11, f11, f3 STFD f8, 0 * SIZE(YY) STFD f9, 1 * SIZE(YY) add YY, YY, INCY STFD f10, 0 * SIZE(YY) STFD f11, 1 * SIZE(YY) add YY, YY, INCY .align 4 LL(996): andi. J, M, 1 ble LL(999) LFD f0, 0 * SIZE(Y) LFD f1, 1 * SIZE(Y) LFD f8, 0 * SIZE(NEW_Y) LFD f9, 1 * SIZE(NEW_Y) FADD f8, f8, f0 FADD f9, f9, f1 STFD f8, 0 * SIZE(YY) STFD f9, 1 * SIZE(YY) .align 4 LL(999): li r3, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r14, 144(SP) ld r15, 152(SP) ld r16, 160(SP) ld r17, 168(SP) ld r18, 176(SP) ld r19, 184(SP) ld r20, 192(SP) ld r21, 200(SP) ld r22, 208(SP) ld r23, 216(SP) ld r24, 224(SP) ld r25, 232(SP) ld r26, 240(SP) ld r27, 248(SP) #else lwz r14, 144(SP) lwz r15, 148(SP) lwz r16, 152(SP) lwz r17, 156(SP) lwz r18, 160(SP) lwz r19, 164(SP) lwz r20, 168(SP) lwz r21, 172(SP) lwz r22, 176(SP) lwz r23, 180(SP) lwz r24, 184(SP) lwz r25, 188(SP) lwz r26, 192(SP) lwz r27, 196(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/zsymv_U.S000066400000000000000000001000471313527062700173620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef linux #ifndef __64BIT__ #define M r3 #define IS r4 #define A r5 #define LDA r6 #define X r7 #define INCX r8 #define Y r9 #define INCY r10 #define BUFFER r14 #else #define M r3 #define IS r4 #define A r7 #define LDA r8 #define X r9 #define INCX r10 #define Y r5 #define INCY r6 #define BUFFER r14 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define M r3 #define IS r4 #define A r9 #define LDA r10 #define X r5 #define INCX r6 #define Y r7 #define INCY r8 #define BUFFER r14 #else #define M r3 #define IS r4 #define A r7 #define LDA r8 #define X r9 #define INCX r10 #define Y r5 #define INCY r6 #define BUFFER r14 #endif #endif #define I r11 #define J r12 #define AO1 r15 #define AO2 r16 #define XX r19 #define YY r20 #define NEW_Y r21 #define TEMP r22 #define PREA r24 #define y01 f0 #define y02 f1 #define y03 f2 #define y04 f3 #define y05 f4 #define y06 f5 #define y07 f6 #define y08 f7 #define xtemp1 f8 #define xtemp2 f9 #define xtemp3 f10 #define xtemp4 f11 #define xtemp5 f12 #define xtemp6 f13 #define xtemp7 f14 #define xtemp8 f15 #define atemp1 f16 #define atemp2 f17 #define atemp3 f18 #define atemp4 f19 #define xsum1 f20 #define xsum2 f21 #define xsum3 f22 #define xsum4 f23 #define a1 f24 #define a2 f25 #define a3 f26 #define a4 f27 #define a5 f28 #define a6 f29 #define a7 f30 #define a8 f31 #define alpha_r f1 #define alpha_i f2 #if defined(PPCG4) #define PREFETCHSIZE_A 24 #endif #if defined(PPC440) || defined(PPC440FP2) #define PREFETCHSIZE_A 24 #endif #ifdef PPC970 #define PREFETCHSIZE_A 32 #endif #ifdef CELL #define PREFETCHSIZE_A 72 #endif #ifdef POWER4 #define PREFETCHSIZE_A 16 #endif #ifdef POWER5 #define PREFETCHSIZE_A 96 #endif #ifdef POWER6 #define PREFETCHSIZE_A 112 #endif #if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) #define NOP1 #define NOP2 #else #define NOP1 mr LDA, LDA #define NOP2 mr INCX, INCX #endif #ifndef NEEDPARAM #ifndef __64BIT__ #define STACKSIZE 224 #define ALPHA_R 200(SP) #define ALPHA_I 208(SP) #define FZERO 216(SP) #else #define STACKSIZE 280 #define ALPHA_R 256(SP) #define ALPHA_I 264(SP) #define FZERO 272(SP) #endif #ifndef HEMV #define FMADD1 FNMSUB #define FMADD2 FMADD #else #define FMADD1 FMADD #define FMADD2 FNMSUB #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) 
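	/* frame layout: f14-f31 are saved at offsets 0..136 of the new
	   STACKSIZE-byte frame, the callee-saved GPRs r14-r27 follow at 144
	   upward, and ALPHA_R, ALPHA_I and FZERO occupy the top of the frame */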
stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r0, FZERO std r14, 144(SP) std r15, 152(SP) std r16, 160(SP) std r17, 168(SP) std r18, 176(SP) std r19, 184(SP) std r20, 192(SP) std r21, 200(SP) std r22, 208(SP) std r23, 216(SP) std r24, 224(SP) std r25, 232(SP) std r26, 240(SP) std r27, 248(SP) #else stw r0, 0 + FZERO stw r0, 4 + FZERO stw r14, 144(SP) stw r15, 148(SP) stw r16, 152(SP) stw r17, 156(SP) stw r18, 160(SP) stw r19, 164(SP) stw r20, 168(SP) stw r21, 172(SP) stw r22, 176(SP) stw r23, 180(SP) stw r24, 184(SP) stw r25, 188(SP) stw r26, 192(SP) stw r27, 196(SP) #endif #ifdef linux #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else ld Y, FRAMESLOT(0) + STACKSIZE(SP) ld INCY, FRAMESLOT(1) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifndef __64BIT__ #ifdef DOUBLE lwz X, FRAMESLOT(0) + STACKSIZE(SP) lwz INCX, FRAMESLOT(1) + STACKSIZE(SP) lwz Y, FRAMESLOT(2) + STACKSIZE(SP) lwz INCY, FRAMESLOT(3) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(4) + STACKSIZE(SP) #else lwz Y, FRAMESLOT(0) + STACKSIZE(SP) lwz INCY, FRAMESLOT(1) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #else ld Y, FRAMESLOT(0) + STACKSIZE(SP) ld INCY, FRAMESLOT(1) + STACKSIZE(SP) ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP) #endif #endif STFD alpha_r, ALPHA_R STFD alpha_i, ALPHA_I slwi LDA, LDA, ZBASE_SHIFT slwi INCX, INCX, ZBASE_SHIFT slwi INCY, INCY, ZBASE_SHIFT li PREA, PREFETCHSIZE_A * SIZE sub IS, M, IS cmpwi cr0, M, 0 ble- LL(999) mullw TEMP, IS, LDA add A, A, TEMP cmpwi cr0, INCX, 2 * SIZE beq LL(05) mr XX, X mr X, BUFFER srawi. r0, M, 2 mtspr CTR, r0 ble LL(03) .align 4 LL(01): LFD a1, 0 * SIZE(XX) LFD a2, 1 * SIZE(XX) add XX, XX, INCX LFD a3, 0 * SIZE(XX) LFD a4, 1 * SIZE(XX) add XX, XX, INCX LFD a5, 0 * SIZE(XX) LFD a6, 1 * SIZE(XX) add XX, XX, INCX LFD a7, 0 * SIZE(XX) LFD a8, 1 * SIZE(XX) add XX, XX, INCX dcbt XX, PREA dcbtst BUFFER, PREA STFD a1, 0 * SIZE(BUFFER) STFD a2, 1 * SIZE(BUFFER) STFD a3, 2 * SIZE(BUFFER) STFD a4, 3 * SIZE(BUFFER) STFD a5, 4 * SIZE(BUFFER) STFD a6, 5 * SIZE(BUFFER) STFD a7, 6 * SIZE(BUFFER) STFD a8, 7 * SIZE(BUFFER) addi BUFFER, BUFFER, 8 * SIZE bdnz LL(01) .align 4 LL(03): andi. r0, M, 3 mtspr CTR, r0 ble LL(05) .align 4 LL(04): LFD a1, 0 * SIZE(XX) LFD a2, 1 * SIZE(XX) add XX, XX, INCX STFD a1, 0 * SIZE(BUFFER) STFD a2, 1 * SIZE(BUFFER) addi BUFFER, BUFFER, 2 * SIZE bdnz LL(04) .align 4 LL(05): mr NEW_Y, Y lfd f0, FZERO cmpwi cr0, INCY, 2 * SIZE beq LL(10) mr NEW_Y, BUFFER addi r0, M, 3 srawi. 
r0, r0, 2 mtspr CTR, r0 .align 4 LL(06): STFD f0, 0 * SIZE(BUFFER) STFD f0, 1 * SIZE(BUFFER) STFD f0, 2 * SIZE(BUFFER) STFD f0, 3 * SIZE(BUFFER) STFD f0, 4 * SIZE(BUFFER) STFD f0, 5 * SIZE(BUFFER) STFD f0, 6 * SIZE(BUFFER) STFD f0, 7 * SIZE(BUFFER) addi BUFFER, BUFFER, 8 * SIZE bdnz LL(06) .align 4 LL(10): addi TEMP, IS, 2 cmpw cr0, TEMP, M bgt LL(20) .align 4 LL(11): mr AO1, A add AO2, A, LDA add A, AO2, LDA slwi TEMP, IS, ZBASE_SHIFT add TEMP, X, TEMP LFD y05, ALPHA_R LFD y06, ALPHA_I LFD xtemp1, 0 * SIZE(TEMP) LFD xtemp2, 1 * SIZE(TEMP) LFD xtemp3, 2 * SIZE(TEMP) LFD xtemp4, 3 * SIZE(TEMP) FMUL atemp1, y05, xtemp1 FMUL atemp2, y06, xtemp1 FMUL atemp3, y05, xtemp3 FMUL atemp4, y06, xtemp3 FNMSUB atemp1, y06, xtemp2, atemp1 FMADD atemp2, y05, xtemp2, atemp2 FNMSUB atemp3, y06, xtemp4, atemp3 FMADD atemp4, y05, xtemp4, atemp4 lfd xsum1, FZERO fmr xsum2, xsum1 fmr xsum3, xsum1 fmr xsum4, xsum1 mr XX, X mr YY, NEW_Y LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 0 * SIZE(AO2) LFD a6, 1 * SIZE(AO2) LFD a7, 2 * SIZE(AO2) LFD a8, 3 * SIZE(AO2) LFD xtemp1, 0 * SIZE(XX) LFD xtemp2, 1 * SIZE(XX) LFD xtemp3, 2 * SIZE(XX) LFD xtemp4, 3 * SIZE(XX) LFD y01, 0 * SIZE(YY) LFD y02, 1 * SIZE(YY) LFD y03, 2 * SIZE(YY) LFD y04, 3 * SIZE(YY) srawi. r0, IS, 3 mtspr CTR, r0 ble LL(15) FMADD xsum1, xtemp1, a1, xsum1 DCBT(AO1, PREA) FMADD y01, atemp1, a1, y01 NOP2 FMADD xsum2, xtemp2, a1, xsum2 NOP1 FMADD y02, atemp2, a1, y02 LFD a1, 4 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 NOP1 FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp2, a5, xsum4 NOP1 FMADD y04, atemp2, a3, y04 NOP2 FMADD1 xsum1, xtemp2, a2, xsum1 LFD y05, 4 * SIZE(YY) FNMSUB y01, atemp2, a2, y01 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y06, 5 * SIZE(YY) FMADD y02, atemp1, a2, y02 LFD a2, 5 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 5 * SIZE(XX) FNMSUB y03, atemp2, a4, y03 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 4 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD y07, 6 * SIZE(YY) FMADD y01, atemp3, a5, y01 NOP2 FMADD xsum2, xtemp4, a3, xsum2 LFD a3, 6 * SIZE(AO1) FMADD y02, atemp4, a5, y02 LFD a5, 4 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 LFD y08, 7 * SIZE(YY) FMADD y03, atemp3, a7, y03 NOP2 FMADD xsum4, xtemp4, a7, xsum4 NOP1 FMADD y04, atemp4, a7, y04 LFD a7, 6 * SIZE(AO2) FMADD1 xsum1, xtemp4, a4, xsum1 NOP1 FNMSUB y01, atemp4, a6, y01 # DCBT(X, PREX) NOP2 FMADD2 xsum2, xtemp3, a4, xsum2 LFD a4, 7 * SIZE(AO1) FMADD y02, atemp3, a6, y02 LFD a6, 5 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, xsum3 LFD xtemp4, 7 * SIZE(XX) FNMSUB y03, atemp4, a8, y03 NOP2 FMADD2 xsum4, xtemp3, a8, xsum4 LFD xtemp3, 6 * SIZE(XX) FMADD y04, atemp3, a8, y04 LFD a8, 7 * SIZE(AO2) FMADD xsum1, xtemp1, a1, xsum1 STFD y01, 0 * SIZE(YY) FMADD y05, atemp1, a1, y05 NOP2 FMADD xsum2, xtemp2, a1, xsum2 STFD y02, 1 * SIZE(YY) FMADD y06, atemp2, a1, y06 LFD a1, 8 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 STFD y03, 2 * SIZE(YY) FMADD y07, atemp1, a3, y07 NOP2 FMADD xsum4, xtemp2, a5, xsum4 STFD y04, 3 * SIZE(YY) FMADD y08, atemp2, a3, y08 NOP2 FMADD1 xsum1, xtemp2, a2, xsum1 LFD y01, 8 * SIZE(YY) FNMSUB y05, atemp2, a2, y05 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y02, 9 * SIZE(YY) FMADD y06, atemp1, a2, y06 LFD a2, 9 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 9 * SIZE(XX) FNMSUB y07, atemp2, a4, y07 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 8 * SIZE(XX) FMADD y08, atemp1, a4, y08 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD y03, 10 * SIZE(YY) FMADD y05, atemp3, a5, 
y05 NOP2 FMADD xsum2, xtemp4, a3, xsum2 LFD a3, 10 * SIZE(AO1) FMADD y06, atemp4, a5, y06 LFD a5, 8 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 LFD y04, 11 * SIZE(YY) FMADD y07, atemp3, a7, y07 NOP2 FMADD xsum4, xtemp4, a7, xsum4 NOP1 FMADD y08, atemp4, a7, y08 LFD a7, 10 * SIZE(AO2) FMADD1 xsum1, xtemp4, a4, xsum1 NOP1 FNMSUB y05, atemp4, a6, y05 NOP2 FMADD2 xsum2, xtemp3, a4, xsum2 LFD a4, 11 * SIZE(AO1) FMADD y06, atemp3, a6, y06 LFD a6, 9 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, xsum3 LFD xtemp4, 11 * SIZE(XX) FNMSUB y07, atemp4, a8, y07 bdz LL(13) .align 4 LL(12): FMADD2 xsum4, xtemp3, a8, xsum4 LFD xtemp3, 10 * SIZE(XX) FMADD y08, atemp3, a8, y08 LFD a8, 11 * SIZE(AO2) FMADD xsum1, xtemp1, a1, xsum1 STFD y05, 4 * SIZE(YY) FMADD y01, atemp1, a1, y01 DCBT(AO2, PREA) FMADD xsum2, xtemp2, a1, xsum2 STFD y06, 5 * SIZE(YY) FMADD y02, atemp2, a1, y02 LFD a1, 12 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 STFD y07, 6 * SIZE(YY) FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp2, a5, xsum4 STFD y08, 7 * SIZE(YY) FMADD y04, atemp2, a3, y04 NOP2 FMADD1 xsum1, xtemp2, a2, xsum1 LFD y05, 12 * SIZE(YY) FNMSUB y01, atemp2, a2, y01 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y06, 13 * SIZE(YY) FMADD y02, atemp1, a2, y02 LFD a2, 13 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 13 * SIZE(XX) FNMSUB y03, atemp2, a4, y03 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 12 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD y07, 14 * SIZE(YY) FMADD y01, atemp3, a5, y01 NOP2 FMADD xsum2, xtemp4, a3, xsum2 LFD a3, 14 * SIZE(AO1) FMADD y02, atemp4, a5, y02 LFD a5, 12 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 LFD y08, 15 * SIZE(YY) FMADD y03, atemp3, a7, y03 NOP2 FMADD xsum4, xtemp4, a7, xsum4 NOP1 FMADD y04, atemp4, a7, y04 LFD a7, 14 * SIZE(AO2) FMADD1 xsum1, xtemp4, a4, xsum1 NOP1 FNMSUB y01, atemp4, a6, y01 # DCBT(Y1, PREY) NOP2 FMADD2 xsum2, xtemp3, a4, xsum2 LFD a4, 15 * SIZE(AO1) FMADD y02, atemp3, a6, y02 LFD a6, 13 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, xsum3 LFD xtemp4, 15 * SIZE(XX) FNMSUB y03, atemp4, a8, y03 NOP2 FMADD2 xsum4, xtemp3, a8, xsum4 LFD xtemp3, 14 * SIZE(XX) FMADD y04, atemp3, a8, y04 LFD a8, 15 * SIZE(AO2) FMADD xsum1, xtemp1, a1, xsum1 STFD y01, 8 * SIZE(YY) FMADD y05, atemp1, a1, y05 NOP2 FMADD xsum2, xtemp2, a1, xsum2 STFD y02, 9 * SIZE(YY) FMADD y06, atemp2, a1, y06 LFD a1, 16 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 STFD y03, 10 * SIZE(YY) FMADD y07, atemp1, a3, y07 NOP2 FMADD xsum4, xtemp2, a5, xsum4 STFD y04, 11 * SIZE(YY) FMADD y08, atemp2, a3, y08 NOP2 FMADD1 xsum1, xtemp2, a2, xsum1 LFD y01, 16 * SIZE(YY) FNMSUB y05, atemp2, a2, y05 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y02, 17 * SIZE(YY) FMADD y06, atemp1, a2, y06 LFD a2, 17 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 17 * SIZE(XX) FNMSUB y07, atemp2, a4, y07 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 16 * SIZE(XX) FMADD y08, atemp1, a4, y08 addi AO2, AO2, 16 * SIZE FMADD xsum1, xtemp3, a3, xsum1 LFD y03, 18 * SIZE(YY) FMADD y05, atemp3, a5, y05 addi XX, XX, 16 * SIZE FMADD xsum2, xtemp4, a3, xsum2 LFD a3, 18 * SIZE(AO1) FMADD y06, atemp4, a5, y06 LFD a5, 0 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 LFD y04, 19 * SIZE(YY) FMADD y07, atemp3, a7, y07 NOP2 FMADD xsum4, xtemp4, a7, xsum4 addi AO1, AO1, 16 * SIZE FMADD y08, atemp4, a7, y08 LFD a7, 2 * SIZE(AO2) FMADD1 xsum1, xtemp4, a4, xsum1 addi YY, YY, 16 * SIZE FNMSUB y05, atemp4, a6, y05 NOP2 FMADD2 xsum2, xtemp3, a4, xsum2 LFD a4, 3 * SIZE(AO1) FMADD y06, atemp3, a6, y06 LFD a6, 1 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, 
xsum3 LFD xtemp4, 3 * SIZE(XX) FNMSUB y07, atemp4, a8, y07 NOP2 FMADD2 xsum4, xtemp3, a8, xsum4 LFD xtemp3, 2 * SIZE(XX) FMADD y08, atemp3, a8, y08 LFD a8, 3 * SIZE(AO2) FMADD xsum1, xtemp1, a1, xsum1 STFD y05, -4 * SIZE(YY) FMADD y01, atemp1, a1, y01 DCBT(AO1, PREA) FMADD xsum2, xtemp2, a1, xsum2 STFD y06, -3 * SIZE(YY) FMADD y02, atemp2, a1, y02 LFD a1, 4 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 STFD y07, -2 * SIZE(YY) FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp2, a5, xsum4 STFD y08, -1 * SIZE(YY) FMADD y04, atemp2, a3, y04 NOP2 FMADD1 xsum1, xtemp2, a2, xsum1 LFD y05, 4 * SIZE(YY) FNMSUB y01, atemp2, a2, y01 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y06, 5 * SIZE(YY) FMADD y02, atemp1, a2, y02 LFD a2, 5 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 5 * SIZE(XX) FNMSUB y03, atemp2, a4, y03 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 4 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD y07, 6 * SIZE(YY) FMADD y01, atemp3, a5, y01 NOP2 FMADD xsum2, xtemp4, a3, xsum2 LFD a3, 6 * SIZE(AO1) FMADD y02, atemp4, a5, y02 LFD a5, 4 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 LFD y08, 7 * SIZE(YY) FMADD y03, atemp3, a7, y03 NOP2 FMADD xsum4, xtemp4, a7, xsum4 NOP1 FMADD y04, atemp4, a7, y04 LFD a7, 6 * SIZE(AO2) FMADD1 xsum1, xtemp4, a4, xsum1 NOP1 FNMSUB y01, atemp4, a6, y01 # DCBT(X, PREX) NOP2 FMADD2 xsum2, xtemp3, a4, xsum2 LFD a4, 7 * SIZE(AO1) FMADD y02, atemp3, a6, y02 LFD a6, 5 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, xsum3 LFD xtemp4, 7 * SIZE(XX) FNMSUB y03, atemp4, a8, y03 NOP2 FMADD2 xsum4, xtemp3, a8, xsum4 LFD xtemp3, 6 * SIZE(XX) FMADD y04, atemp3, a8, y04 LFD a8, 7 * SIZE(AO2) FMADD xsum1, xtemp1, a1, xsum1 STFD y01, 0 * SIZE(YY) FMADD y05, atemp1, a1, y05 NOP2 FMADD xsum2, xtemp2, a1, xsum2 STFD y02, 1 * SIZE(YY) FMADD y06, atemp2, a1, y06 LFD a1, 8 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 STFD y03, 2 * SIZE(YY) FMADD y07, atemp1, a3, y07 NOP2 FMADD xsum4, xtemp2, a5, xsum4 STFD y04, 3 * SIZE(YY) FMADD y08, atemp2, a3, y08 NOP2 FMADD1 xsum1, xtemp2, a2, xsum1 LFD y01, 8 * SIZE(YY) FNMSUB y05, atemp2, a2, y05 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y02, 9 * SIZE(YY) FMADD y06, atemp1, a2, y06 LFD a2, 9 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 9 * SIZE(XX) FNMSUB y07, atemp2, a4, y07 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 8 * SIZE(XX) FMADD y08, atemp1, a4, y08 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD y03, 10 * SIZE(YY) FMADD y05, atemp3, a5, y05 NOP2 FMADD xsum2, xtemp4, a3, xsum2 LFD a3, 10 * SIZE(AO1) FMADD y06, atemp4, a5, y06 LFD a5, 8 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 LFD y04, 11 * SIZE(YY) FMADD y07, atemp3, a7, y07 NOP2 FMADD xsum4, xtemp4, a7, xsum4 NOP1 FMADD y08, atemp4, a7, y08 LFD a7, 10 * SIZE(AO2) FMADD1 xsum1, xtemp4, a4, xsum1 NOP1 FNMSUB y05, atemp4, a6, y05 NOP2 FMADD2 xsum2, xtemp3, a4, xsum2 LFD a4, 11 * SIZE(AO1) FMADD y06, atemp3, a6, y06 LFD a6, 9 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, xsum3 LFD xtemp4, 11 * SIZE(XX) FNMSUB y07, atemp4, a8, y07 bdnz LL(12) .align 4 LL(13): FMADD2 xsum4, xtemp3, a8, xsum4 LFD xtemp3, 10 * SIZE(XX) FMADD y08, atemp3, a8, y08 LFD a8, 11 * SIZE(AO2) FMADD xsum1, xtemp1, a1, xsum1 STFD y05, 4 * SIZE(YY) FMADD y01, atemp1, a1, y01 NOP2 FMADD xsum2, xtemp2, a1, xsum2 STFD y06, 5 * SIZE(YY) FMADD y02, atemp2, a1, y02 LFD a1, 12 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 STFD y07, 6 * SIZE(YY) FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp2, a5, xsum4 STFD y08, 7 * SIZE(YY) FMADD y04, atemp2, a3, y04 NOP2 FMADD1 xsum1, xtemp2, a2, xsum1 LFD y05, 12 * 
SIZE(YY) FNMSUB y01, atemp2, a2, y01 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y06, 13 * SIZE(YY) FMADD y02, atemp1, a2, y02 LFD a2, 13 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 13 * SIZE(XX) FNMSUB y03, atemp2, a4, y03 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 12 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD y07, 14 * SIZE(YY) FMADD y01, atemp3, a5, y01 NOP2 FMADD xsum2, xtemp4, a3, xsum2 LFD a3, 14 * SIZE(AO1) FMADD y02, atemp4, a5, y02 LFD a5, 12 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 LFD y08, 15 * SIZE(YY) FMADD y03, atemp3, a7, y03 NOP2 FMADD xsum4, xtemp4, a7, xsum4 NOP1 FMADD y04, atemp4, a7, y04 LFD a7, 14 * SIZE(AO2) FMADD1 xsum1, xtemp4, a4, xsum1 NOP1 FNMSUB y01, atemp4, a6, y01 NOP2 FMADD2 xsum2, xtemp3, a4, xsum2 LFD a4, 15 * SIZE(AO1) FMADD y02, atemp3, a6, y02 LFD a6, 13 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, xsum3 LFD xtemp4, 15 * SIZE(XX) FNMSUB y03, atemp4, a8, y03 NOP2 FMADD2 xsum4, xtemp3, a8, xsum4 LFD xtemp3, 14 * SIZE(XX) FMADD y04, atemp3, a8, y04 LFD a8, 15 * SIZE(AO2) FMADD xsum1, xtemp1, a1, xsum1 STFD y01, 8 * SIZE(YY) FMADD y05, atemp1, a1, y05 NOP2 FMADD xsum2, xtemp2, a1, xsum2 STFD y02, 9 * SIZE(YY) FMADD y06, atemp2, a1, y06 LFD a1, 16 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 STFD y03, 10 * SIZE(YY) FMADD y07, atemp1, a3, y07 NOP2 FMADD xsum4, xtemp2, a5, xsum4 STFD y04, 11 * SIZE(YY) FMADD y08, atemp2, a3, y08 NOP2 FMADD1 xsum1, xtemp2, a2, xsum1 LFD y01, 16 * SIZE(YY) FNMSUB y05, atemp2, a2, y05 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y02, 17 * SIZE(YY) FMADD y06, atemp1, a2, y06 LFD a2, 17 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 17 * SIZE(XX) FNMSUB y07, atemp2, a4, y07 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 16 * SIZE(XX) FMADD y08, atemp1, a4, y08 addi AO2, AO2, 16 * SIZE FMADD xsum1, xtemp3, a3, xsum1 LFD y03, 18 * SIZE(YY) FMADD y05, atemp3, a5, y05 addi XX, XX, 16 * SIZE FMADD xsum2, xtemp4, a3, xsum2 LFD a3, 18 * SIZE(AO1) FMADD y06, atemp4, a5, y06 LFD a5, 0 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 LFD y04, 19 * SIZE(YY) FMADD y07, atemp3, a7, y07 NOP2 FMADD xsum4, xtemp4, a7, xsum4 addi AO1, AO1, 16 * SIZE FMADD y08, atemp4, a7, y08 LFD a7, 2 * SIZE(AO2) FMADD1 xsum1, xtemp4, a4, xsum1 addi YY, YY, 16 * SIZE FNMSUB y05, atemp4, a6, y05 NOP2 FMADD2 xsum2, xtemp3, a4, xsum2 LFD a4, 3 * SIZE(AO1) FMADD y06, atemp3, a6, y06 LFD a6, 1 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, xsum3 LFD xtemp4, 3 * SIZE(XX) FNMSUB y07, atemp4, a8, y07 NOP2 FMADD2 xsum4, xtemp3, a8, xsum4 LFD xtemp3, 2 * SIZE(XX) FMADD y08, atemp3, a8, y08 LFD a8, 3 * SIZE(AO2) STFD y05, -4 * SIZE(YY) STFD y06, -3 * SIZE(YY) STFD y07, -2 * SIZE(YY) STFD y08, -1 * SIZE(YY) .align 4 LL(15): andi. 
r0, IS, 4 ble LL(16) FMADD xsum1, xtemp1, a1, xsum1 NOP1 FMADD y01, atemp1, a1, y01 NOP2 FMADD xsum2, xtemp2, a1, xsum2 NOP1 FMADD y02, atemp2, a1, y02 LFD a1, 4 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 NOP1 FMADD y03, atemp1, a3, y03 NOP2 FMADD xsum4, xtemp2, a5, xsum4 NOP1 FMADD y04, atemp2, a3, y04 NOP2 FMADD1 xsum1, xtemp2, a2, xsum1 LFD y05, 4 * SIZE(YY) FNMSUB y01, atemp2, a2, y01 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y06, 5 * SIZE(YY) FMADD y02, atemp1, a2, y02 LFD a2, 5 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 5 * SIZE(XX) FNMSUB y03, atemp2, a4, y03 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 4 * SIZE(XX) FMADD y04, atemp1, a4, y04 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD y07, 6 * SIZE(YY) FMADD y01, atemp3, a5, y01 NOP2 FMADD xsum2, xtemp4, a3, xsum2 LFD a3, 6 * SIZE(AO1) FMADD y02, atemp4, a5, y02 LFD a5, 4 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 LFD y08, 7 * SIZE(YY) FMADD y03, atemp3, a7, y03 NOP2 FMADD xsum4, xtemp4, a7, xsum4 NOP1 FMADD y04, atemp4, a7, y04 LFD a7, 6 * SIZE(AO2) FMADD1 xsum1, xtemp4, a4, xsum1 NOP1 FNMSUB y01, atemp4, a6, y01 NOP2 FMADD2 xsum2, xtemp3, a4, xsum2 LFD a4, 7 * SIZE(AO1) FMADD y02, atemp3, a6, y02 LFD a6, 5 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, xsum3 LFD xtemp4, 7 * SIZE(XX) FNMSUB y03, atemp4, a8, y03 NOP2 FMADD2 xsum4, xtemp3, a8, xsum4 LFD xtemp3, 6 * SIZE(XX) FMADD y04, atemp3, a8, y04 LFD a8, 7 * SIZE(AO2) FMADD xsum1, xtemp1, a1, xsum1 STFD y01, 0 * SIZE(YY) FMADD y05, atemp1, a1, y05 NOP2 FMADD xsum2, xtemp2, a1, xsum2 STFD y02, 1 * SIZE(YY) FMADD y06, atemp2, a1, y06 LFD a1, 8 * SIZE(AO1) FMADD xsum3, xtemp1, a5, xsum3 STFD y03, 2 * SIZE(YY) FMADD y07, atemp1, a3, y07 NOP2 FMADD xsum4, xtemp2, a5, xsum4 STFD y04, 3 * SIZE(YY) FMADD y08, atemp2, a3, y08 NOP2 FMADD1 xsum1, xtemp2, a2, xsum1 LFD y01, 8 * SIZE(YY) FNMSUB y05, atemp2, a2, y05 NOP2 FMADD2 xsum2, xtemp1, a2, xsum2 LFD y02, 9 * SIZE(YY) FMADD y06, atemp1, a2, y06 LFD a2, 9 * SIZE(AO1) FMADD1 xsum3, xtemp2, a6, xsum3 LFD xtemp2, 9 * SIZE(XX) FNMSUB y07, atemp2, a4, y07 NOP2 FMADD2 xsum4, xtemp1, a6, xsum4 LFD xtemp1, 8 * SIZE(XX) FMADD y08, atemp1, a4, y08 NOP2 FMADD xsum1, xtemp3, a3, xsum1 LFD y03, 10 * SIZE(YY) FMADD y05, atemp3, a5, y05 NOP2 FMADD xsum2, xtemp4, a3, xsum2 LFD a3, 10 * SIZE(AO1) FMADD y06, atemp4, a5, y06 LFD a5, 8 * SIZE(AO2) FMADD xsum3, xtemp3, a7, xsum3 LFD y04, 11 * SIZE(YY) FMADD y07, atemp3, a7, y07 NOP2 FMADD xsum4, xtemp4, a7, xsum4 NOP1 FMADD y08, atemp4, a7, y08 LFD a7, 10 * SIZE(AO2) FMADD1 xsum1, xtemp4, a4, xsum1 NOP1 FNMSUB y05, atemp4, a6, y05 NOP2 FMADD2 xsum2, xtemp3, a4, xsum2 LFD a4, 11 * SIZE(AO1) FMADD y06, atemp3, a6, y06 LFD a6, 9 * SIZE(AO2) FMADD1 xsum3, xtemp4, a8, xsum3 LFD xtemp4, 11 * SIZE(XX) FNMSUB y07, atemp4, a8, y07 FMADD2 xsum4, xtemp3, a8, xsum4 LFD xtemp3, 10 * SIZE(XX) FMADD y08, atemp3, a8, y08 LFD a8, 11 * SIZE(AO2) STFD y05, 4 * SIZE(YY) STFD y06, 5 * SIZE(YY) STFD y07, 6 * SIZE(YY) STFD y08, 7 * SIZE(YY) addi AO1, AO1, 8 * SIZE addi AO2, AO2, 8 * SIZE addi XX, XX, 8 * SIZE addi YY, YY, 8 * SIZE .align 4 LL(16): andi. 
r0, IS, 2 ble LL(18) FMADD xsum1, xtemp1, a1, xsum1 FMADD y01, atemp1, a1, y01 FMADD xsum2, xtemp2, a1, xsum2 FMADD y02, atemp2, a1, y02 FMADD xsum3, xtemp1, a5, xsum3 FMADD y03, atemp1, a3, y03 FMADD xsum4, xtemp2, a5, xsum4 FMADD y04, atemp2, a3, y04 FMADD1 xsum1, xtemp2, a2, xsum1 FNMSUB y01, atemp2, a2, y01 FMADD2 xsum2, xtemp1, a2, xsum2 FMADD y02, atemp1, a2, y02 FMADD1 xsum3, xtemp2, a6, xsum3 FNMSUB y03, atemp2, a4, y03 FMADD2 xsum4, xtemp1, a6, xsum4 FMADD y04, atemp1, a4, y04 FMADD xsum1, xtemp3, a3, xsum1 FMADD y01, atemp3, a5, y01 FMADD xsum2, xtemp4, a3, xsum2 FMADD y02, atemp4, a5, y02 FMADD xsum3, xtemp3, a7, xsum3 FMADD y03, atemp3, a7, y03 FMADD xsum4, xtemp4, a7, xsum4 FMADD y04, atemp4, a7, y04 FMADD1 xsum1, xtemp4, a4, xsum1 FNMSUB y01, atemp4, a6, y01 FMADD2 xsum2, xtemp3, a4, xsum2 FMADD y02, atemp3, a6, y02 FMADD1 xsum3, xtemp4, a8, xsum3 FNMSUB y03, atemp4, a8, y03 FMADD2 xsum4, xtemp3, a8, xsum4 FMADD y04, atemp3, a8, y04 STFD y01, 0 * SIZE(YY) STFD y02, 1 * SIZE(YY) STFD y03, 2 * SIZE(YY) STFD y04, 3 * SIZE(YY) LFD a1, 4 * SIZE(AO1) LFD a2, 5 * SIZE(AO1) LFD a5, 4 * SIZE(AO2) LFD a6, 5 * SIZE(AO2) LFD a7, 6 * SIZE(AO2) LFD a8, 7 * SIZE(AO2) LFD y01, 4 * SIZE(YY) LFD y02, 5 * SIZE(YY) LFD y03, 6 * SIZE(YY) LFD y04, 7 * SIZE(YY) addi YY, YY, 4 * SIZE .align 4 LL(18): LFD y05, ALPHA_R LFD y06, ALPHA_I FMUL xtemp1, y05, xsum1 FMUL xtemp2, y06, xsum1 FMUL xtemp3, y05, xsum3 FMUL xtemp4, y06, xsum3 FNMSUB xsum1, y06, xsum2, xtemp1 FMADD xsum2, y05, xsum2, xtemp2 FNMSUB xsum3, y06, xsum4, xtemp3 FMADD xsum4, y05, xsum4, xtemp4 FMADD xsum1, atemp1, a1, xsum1 FMADD xsum2, atemp2, a1, xsum2 FMADD xsum3, atemp1, a5, xsum3 FMADD xsum4, atemp2, a5, xsum4 #ifndef HEMV FMADD1 xsum1, atemp2, a2, xsum1 FMADD2 xsum2, atemp1, a2, xsum2 #endif FMADD1 xsum3, atemp2, a6, xsum3 FMADD2 xsum4, atemp1, a6, xsum4 FMADD xsum1, atemp3, a5, xsum1 FMADD xsum2, atemp4, a5, xsum2 FMADD xsum3, atemp3, a7, xsum3 FMADD xsum4, atemp4, a7, xsum4 FNMSUB xsum1, atemp4, a6, xsum1 FMADD xsum2, atemp3, a6, xsum2 #ifndef HEMV FNMSUB xsum3, atemp4, a8, xsum3 FMADD xsum4, atemp3, a8, xsum4 #endif FADD y01, y01, xsum1 FADD y02, y02, xsum2 FADD y03, y03, xsum3 FADD y04, y04, xsum4 STFD y01, 0 * SIZE(YY) addi TEMP, IS, 4 STFD y02, 1 * SIZE(YY) addi IS, IS, 2 STFD y03, 2 * SIZE(YY) cmpw cr0, TEMP, M STFD y04, 3 * SIZE(YY) ble LL(11) .align 4 LL(20): andi. 
TEMP, M, 1 ble LL(990) mr AO1, A slwi TEMP, IS, ZBASE_SHIFT add TEMP, X, TEMP LFD y05, ALPHA_R LFD y06, ALPHA_I LFD xtemp1, 0 * SIZE(TEMP) LFD xtemp2, 1 * SIZE(TEMP) FMUL atemp1, y05, xtemp1 FMUL atemp2, y06, xtemp1 FNMSUB atemp1, y06, xtemp2, atemp1 FMADD atemp2, y05, xtemp2, atemp2 lfd xsum1, FZERO fmr xsum2, xsum1 mr XX, X mr YY, NEW_Y LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD xtemp1, 0 * SIZE(XX) LFD xtemp2, 1 * SIZE(XX) LFD y01, 0 * SIZE(YY) LFD y02, 1 * SIZE(YY) mtspr CTR, IS cmpwi cr0, IS, 0 ble LL(28) .align 4 LL(22): FMADD xsum1, xtemp1, a1, xsum1 FMADD y01, atemp1, a1, y01 FMADD xsum2, xtemp2, a1, xsum2 FMADD y02, atemp2, a1, y02 LFD a1, 2 * SIZE(AO1) FMADD1 xsum1, xtemp2, a2, xsum1 LFD xtemp2, 3 * SIZE(XX) FNMSUB y01, atemp2, a2, y01 FMADD2 xsum2, xtemp1, a2, xsum2 LFD xtemp1, 2 * SIZE(XX) FMADD y02, atemp1, a2, y02 LFD a2, 3 * SIZE(AO1) addi AO1, AO1, 2 * SIZE addi XX, XX, 2 * SIZE addi YY, YY, 2 * SIZE STFD y01, -2 * SIZE(YY) LFD y01, 0 * SIZE(YY) STFD y02, -1 * SIZE(YY) LFD y02, 1 * SIZE(YY) bdnz LL(22) .align 4 LL(28): LFD y05, ALPHA_R LFD y06, ALPHA_I FMUL xtemp1, y05, xsum1 FMUL xtemp2, y06, xsum1 FNMSUB xsum1, y06, xsum2, xtemp1 FMADD xsum2, y05, xsum2, xtemp2 FMADD xsum1, atemp1, a1, xsum1 FMADD xsum2, atemp2, a1, xsum2 #ifndef HEMV FNMSUB xsum1, atemp2, a2, xsum1 FMADD xsum2, atemp1, a2, xsum2 #endif FADD y01, y01, xsum1 FADD y02, y02, xsum2 STFD y01, 0 * SIZE(YY) STFD y02, 1 * SIZE(YY) .align 4 LL(990): cmpwi cr0, INCY, 2 * SIZE beq LL(999) mr YY, Y srawi. r0, M, 2 mtspr CTR, r0 ble LL(995) .align 4 LL(991): LFD f0, 0 * SIZE(Y) LFD f1, 1 * SIZE(Y) add Y, Y, INCY LFD f2, 0 * SIZE(Y) LFD f3, 1 * SIZE(Y) add Y, Y, INCY LFD f4, 0 * SIZE(Y) LFD f5, 1 * SIZE(Y) add Y, Y, INCY LFD f6, 0 * SIZE(Y) LFD f7, 1 * SIZE(Y) add Y, Y, INCY LFD f8, 0 * SIZE(NEW_Y) LFD f9, 1 * SIZE(NEW_Y) LFD f10, 2 * SIZE(NEW_Y) LFD f11, 3 * SIZE(NEW_Y) LFD f12, 4 * SIZE(NEW_Y) LFD f13, 5 * SIZE(NEW_Y) LFD f14, 6 * SIZE(NEW_Y) LFD f15, 7 * SIZE(NEW_Y) addi NEW_Y, NEW_Y, 8 * SIZE FADD f8, f8, f0 FADD f9, f9, f1 FADD f10, f10, f2 FADD f11, f11, f3 FADD f12, f12, f4 FADD f13, f13, f5 FADD f14, f14, f6 FADD f15, f15, f7 STFD f8, 0 * SIZE(YY) STFD f9, 1 * SIZE(YY) add YY, YY, INCY STFD f10, 0 * SIZE(YY) STFD f11, 1 * SIZE(YY) add YY, YY, INCY STFD f12, 0 * SIZE(YY) STFD f13, 1 * SIZE(YY) add YY, YY, INCY STFD f14, 0 * SIZE(YY) STFD f15, 1 * SIZE(YY) add YY, YY, INCY bdnz LL(991) .align 4 LL(995): andi. J, M, 2 ble LL(996) LFD f0, 0 * SIZE(Y) LFD f1, 1 * SIZE(Y) add Y, Y, INCY LFD f2, 0 * SIZE(Y) LFD f3, 1 * SIZE(Y) add Y, Y, INCY LFD f8, 0 * SIZE(NEW_Y) LFD f9, 1 * SIZE(NEW_Y) LFD f10, 2 * SIZE(NEW_Y) LFD f11, 3 * SIZE(NEW_Y) addi NEW_Y, NEW_Y, 4 * SIZE FADD f8, f8, f0 FADD f9, f9, f1 FADD f10, f10, f2 FADD f11, f11, f3 STFD f8, 0 * SIZE(YY) STFD f9, 1 * SIZE(YY) add YY, YY, INCY STFD f10, 0 * SIZE(YY) STFD f11, 1 * SIZE(YY) add YY, YY, INCY .align 4 LL(996): andi. 
J, M, 1 ble LL(999) LFD f0, 0 * SIZE(Y) LFD f1, 1 * SIZE(Y) LFD f8, 0 * SIZE(NEW_Y) LFD f9, 1 * SIZE(NEW_Y) FADD f8, f8, f0 FADD f9, f9, f1 STFD f8, 0 * SIZE(YY) STFD f9, 1 * SIZE(YY) .align 4 LL(999): li r3, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r14, 144(SP) ld r15, 152(SP) ld r16, 160(SP) ld r17, 168(SP) ld r18, 176(SP) ld r19, 184(SP) ld r20, 192(SP) ld r21, 200(SP) ld r22, 208(SP) ld r23, 216(SP) ld r24, 224(SP) ld r25, 232(SP) ld r26, 240(SP) ld r27, 248(SP) #else lwz r14, 144(SP) lwz r15, 148(SP) lwz r16, 152(SP) lwz r17, 156(SP) lwz r18, 160(SP) lwz r19, 164(SP) lwz r20, 168(SP) lwz r21, 172(SP) lwz r22, 176(SP) lwz r23, 180(SP) lwz r24, 184(SP) lwz r25, 188(SP) lwz r26, 192(SP) lwz r27, 196(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/ztrmm_kernel_8x2_power8.S000066400000000000000000000222651313527062700224310ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/05 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "def_vsx.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA_R_SP 296(SP) #define ALPHA_I_SP 304(SP) #define FZERO 312(SP) #else #define STACKSIZE 256 #define ALPHA_R_SP 224(SP) #define ALPHA_I_SP 232(SP) #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #define OFFSET r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #define o0 0 #define alpha_r vs30 #define alpha_i vs31 #define KKK r13 #define K1 r14 #define L r15 #define ALPHA r16 #define o24 r17 #define T2 r19 #define KK r20 #define o8 r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define CO r26 #define o16 r27 #define o32 r28 #define o48 r29 #define PRE r30 #define T1 r31 #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) std r18, 248(SP) std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) std r14, 280(SP) std r13, 288(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) 
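	/* 32-bit prologue path: the remaining callee-saved GPRs (r25 down to
	   r13) are spilled next, then alpha_r/alpha_i and the FZERO constant
	   are written into the frame before the kernel parameters are fetched */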
stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) stw r18, 196(SP) stw r17, 200(SP) stw r16, 204(SP) stw r15, 208(SP) stw r14, 212(SP) stw r13, 216(SP) #endif stfd f1, ALPHA_R_SP stfd f2, ALPHA_I_SP stw r0, FZERO #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #ifdef TRMMKERNEL #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg KK, OFFSET #endif #endif #include "ztrmm_macros_8x2_power8.S" cmpwi cr0, M, 0 ble .L999 cmpwi cr0, N, 0 ble .L999 cmpwi cr0, K, 0 ble .L999 slwi LDC, LDC, ZBASE_SHIFT li PRE, 256 li o8 , 8 li o16 , 16 li o24 , 24 li o32 , 32 li o48 , 48 #ifdef __64BIT__ addi ALPHA, SP, 296 #else addi ALPHA, SP, 224 #endif lxsdx alpha_r, 0, ALPHA lxsdx alpha_i, o8, ALPHA .align 4 #include "ztrmm_logic_8x2_power8.S" .L999: addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) ld r18, 248(SP) ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) ld r13, 288(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) lwz r18, 196(SP) lwz r17, 200(SP) lwz r16, 204(SP) lwz r15, 208(SP) lwz r14, 212(SP) lwz r13, 216(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/ztrmm_logic_8x2_power8.S000066400000000000000000000540671313527062700222530ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/05 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ srawi. J, N, 1 ble .LZTRMM_L2_END .LZTRMM_L2_BEGIN: mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 #if defined(LEFT) mr KK, OFFSET // OFFSET -> KK #endif srawi. I, M, 3 ble .LZTRMM_L2x8_END .LZTRMM_L2x8_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 5 // Number of values in B shifted slwi T2, KK, 7 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble .LZTRMM_L2x8_SUB0 cmpwi cr0, L, 1 ble .LZTRMM_L2x8_SUB4 .LZTRMM_L2x8_LOOP_START: dcbt AO, PRE LOAD2x8_1 dcbt AO, PRE KERNEL2x8_I1 dcbt AO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 addic. L, L, -2 ble .LZTRMM_L2x8_LOOP_END .align 5 .LZTRMM_L2x8_LOOP: dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 addic. L, L, -1 bgt .LZTRMM_L2x8_LOOP .LZTRMM_L2x8_LOOP_END: dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 KERNEL2x8_E2 b .LZTRMM_L2x8_SUB1 .LZTRMM_L2x8_SUB4: dcbt AO, PRE KERNEL2x8_SUBI1 dcbt AO, PRE KERNEL2x8_SUB1 dcbt AO, PRE KERNEL2x8_SUB1 dcbt AO, PRE KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 KERNEL2x8_SUB1 b .LZTRMM_L2x8_SUB1 .LZTRMM_L2x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x8_SUBI1 addic. L, L, -1 ble .LZTRMM_L2x8_SAVE b .LZTRMM_L2x8_SUB2 .LZTRMM_L2x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LZTRMM_L2x8_SAVE .LZTRMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. 
L, L, -1 bgt .LZTRMM_L2x8_SUB2 .LZTRMM_L2x8_SAVE: SAVE2x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 8 // KK += Number of values in A #endif addic. I, I, -1 bgt .LZTRMM_L2x8_BEGIN .LZTRMM_L2x8_END: .LZTRMM_L2x4_BEGIN: andi. T2, M, 7 ble .LZTRMM_L2x1_END andi. T1, M, 4 ble .LZTRMM_L2x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 5 // Number of values in B shifted slwi T2, KK, 6 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble .LZTRMM_L2x4_SUB0 cmpwi cr0, L, 1 ble .LZTRMM_L2x4_SUB4 .LZTRMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 addic. L, L, -2 ble .LZTRMM_L2x4_LOOP_END .align 5 .LZTRMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 addic. L, L, -1 bgt .LZTRMM_L2x4_LOOP .LZTRMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_2 KERNEL2x4_1 KERNEL2x4_E2 b .LZTRMM_L2x4_SUB1 .LZTRMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 KERNEL2x4_SUB1 b .LZTRMM_L2x4_SUB1 .LZTRMM_L2x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x4_SUBI1 addic. L, L, -1 ble .LZTRMM_L2x4_SAVE b .LZTRMM_L2x4_SUB2 .LZTRMM_L2x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LZTRMM_L2x4_SAVE .LZTRMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 bgt .LZTRMM_L2x4_SUB2 .LZTRMM_L2x4_SAVE: SAVE2x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 4 // KK += Number of values in A #endif .LZTRMM_L2x4_END: .LZTRMM_L2x2_BEGIN: andi. T1, M, 2 ble .LZTRMM_L2x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 5 // Number of values in B shifted slwi T2, KK, 5 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. 
L, K1, 3 // KTEMP / 8 -> L ble .LZTRMM_L2x2_SUB0 cmpwi cr0, L, 1 ble .LZTRMM_L2x2_SUB4 .LZTRMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 addic. L, L, -2 ble .LZTRMM_L2x2_LOOP_END .align 5 .LZTRMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 addic. L, L, -1 bgt .LZTRMM_L2x2_LOOP .LZTRMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_1 KERNEL2x2_E2 b .LZTRMM_L2x2_SUB1 .LZTRMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 KERNEL2x2_SUB1 b .LZTRMM_L2x2_SUB1 .LZTRMM_L2x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x2_SUBI1 addic. L, L, -1 ble .LZTRMM_L2x2_SAVE b .LZTRMM_L2x2_SUB2 .LZTRMM_L2x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LZTRMM_L2x2_SAVE .LZTRMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 bgt .LZTRMM_L2x2_SUB2 .LZTRMM_L2x2_SAVE: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 2 // KK += Number of values in A #endif .LZTRMM_L2x2_END: .LZTRMM_L2x1_BEGIN: andi. T1, M, 1 ble .LZTRMM_L2x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 5 // Number of values in B shifted slwi T2, KK, 4 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble .LZTRMM_L2x1_SUB0 cmpwi cr0, L, 1 ble .LZTRMM_L2x1_SUB4 .LZTRMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 addic. L, L, -2 ble .LZTRMM_L2x1_LOOP_END .align 5 .LZTRMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 addic. L, L, -1 bgt .LZTRMM_L2x1_LOOP .LZTRMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_1 KERNEL2x1_E2 b .LZTRMM_L2x1_SUB1 .LZTRMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 KERNEL2x1_SUB1 b .LZTRMM_L2x1_SUB1 .LZTRMM_L2x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x1_SUBI1 addic. L, L, -1 ble .LZTRMM_L2x1_SAVE b .LZTRMM_L2x1_SUB2 .LZTRMM_L2x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LZTRMM_L2x1_SAVE .LZTRMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. 
L, L, -1 bgt .LZTRMM_L2x1_SUB2 .LZTRMM_L2x1_SAVE: SAVE2x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 1 // KK += Number of values in A #endif .LZTRMM_L2x1_END: slwi T1, K, 5 add B, B, T1 #if !defined(LEFT) addi KK, KK, 2 // KK += Number of values in B #endif addic. J, J, -1 bgt .LZTRMM_L2_BEGIN andi. T2, N, 1 ble .L999 .LZTRMM_L2_END: b .LZTRMM_L1_BEGIN .L999_H1: b .L999 .LZTRMM_L1_BEGIN: andi. T1, N, 1 ble .LZTRMM_L1_END mr CO, C mr AO, A #if defined(LEFT) mr KK, OFFSET // OFFSET -> KK #endif srawi. I, M, 3 ble .LZTRMM_L1x8_END .LZTRMM_L1x8_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 4 // Number of values in B shifted slwi T2, KK, 7 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble .LZTRMM_L1x8_SUB0 cmpwi cr0, L, 1 ble .LZTRMM_L1x8_SUB4 .LZTRMM_L1x8_LOOP_START: dcbt AO, PRE LOAD1x8_1 dcbt AO, PRE KERNEL1x8_I1 dcbt AO, PRE KERNEL1x8_2 dcbt AO, PRE KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 dcbt AO, PRE KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 dcbt AO, PRE KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 addic. L, L, -2 ble .LZTRMM_L1x8_LOOP_END .align 5 .LZTRMM_L1x8_LOOP: dcbt AO, PRE KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 dcbt AO, PRE KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 dcbt AO, PRE KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 dcbt AO, PRE KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 addic. L, L, -1 bgt .LZTRMM_L1x8_LOOP .LZTRMM_L1x8_LOOP_END: dcbt AO, PRE KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 dcbt AO, PRE KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 dcbt AO, PRE KERNEL1x8_1 dcbt AO, PRE KERNEL1x8_2 dcbt AO, PRE KERNEL1x8_1 KERNEL1x8_E2 b .LZTRMM_L1x8_SUB1 .LZTRMM_L1x8_SUB4: dcbt AO, PRE KERNEL1x8_SUBI1 dcbt AO, PRE KERNEL1x8_SUB1 dcbt AO, PRE KERNEL1x8_SUB1 dcbt AO, PRE KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 KERNEL1x8_SUB1 b .LZTRMM_L1x8_SUB1 .LZTRMM_L1x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x8_SUBI1 addic. L, L, -1 ble .LZTRMM_L1x8_SAVE b .LZTRMM_L1x8_SUB2 .LZTRMM_L1x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LZTRMM_L1x8_SAVE .LZTRMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 bgt .LZTRMM_L1x8_SUB2 .LZTRMM_L1x8_SAVE: SAVE1x8 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 8 // KK += Number of values in A #endif addic. I, I, -1 bgt .LZTRMM_L1x8_BEGIN .LZTRMM_L1x8_END: .LZTRMM_L1x4_BEGIN: andi. T2, M, 7 ble .LZTRMM_L1x1_END andi. 
T1, M, 4 ble .LZTRMM_L1x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 4 // Number of values in B shifted slwi T2, KK, 6 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble .LZTRMM_L1x4_SUB0 cmpwi cr0, L, 1 ble .LZTRMM_L1x4_SUB4 .LZTRMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 addic. L, L, -2 ble .LZTRMM_L1x4_LOOP_END .align 5 .LZTRMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 addic. L, L, -1 bgt .LZTRMM_L1x4_LOOP .LZTRMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_2 KERNEL1x4_1 KERNEL1x4_E2 b .LZTRMM_L1x4_SUB1 .LZTRMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 KERNEL1x4_SUB1 b .LZTRMM_L1x4_SUB1 .LZTRMM_L1x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x4_SUBI1 addic. L, L, -1 ble .LZTRMM_L1x4_SAVE b .LZTRMM_L1x4_SUB2 .LZTRMM_L1x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LZTRMM_L1x4_SAVE .LZTRMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 bgt .LZTRMM_L1x4_SUB2 .LZTRMM_L1x4_SAVE: SAVE1x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 4 // KK += Number of values in A #endif .LZTRMM_L1x4_END: .LZTRMM_L1x2_BEGIN: andi. T1, M, 2 ble .LZTRMM_L1x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 4 // Number of values in B shifted slwi T2, KK, 5 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble .LZTRMM_L1x2_SUB0 cmpwi cr0, L, 1 ble .LZTRMM_L1x2_SUB4 .LZTRMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 addic. L, L, -2 ble .LZTRMM_L1x2_LOOP_END .align 5 .LZTRMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 addic. L, L, -1 bgt .LZTRMM_L1x2_LOOP .LZTRMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_1 KERNEL1x2_E2 b .LZTRMM_L1x2_SUB1 .LZTRMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 KERNEL1x2_SUB1 b .LZTRMM_L1x2_SUB1 .LZTRMM_L1x2_SUB0: andi. 
L, K1, 7 // K1 & 7 -> L KERNEL1x2_SUBI1 addic. L, L, -1 ble .LZTRMM_L1x2_SAVE b .LZTRMM_L1x2_SUB2 .LZTRMM_L1x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LZTRMM_L1x2_SAVE .LZTRMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 bgt .LZTRMM_L1x2_SUB2 .LZTRMM_L1x2_SAVE: SAVE1x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 2 // KK += Number of values in A #endif .LZTRMM_L1x2_END: .LZTRMM_L1x1_BEGIN: andi. T1, M, 1 ble .LZTRMM_L1x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO #else mr BO, B // B -> BO slwi T1, KK, 4 // Number of values in B shifted slwi T2, KK, 4 // Number of values in A shifted add BO, BO, T1 // Add values to BO add AO, AO, T2 // Add values to AO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub T1, K, KK // K - KK -> TEMP1 #else mr T1, KK // KK -> KTEMP #ifdef LEFT addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP #else addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP #endif #endif mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L ble .LZTRMM_L1x1_SUB0 cmpwi cr0, L, 1 ble .LZTRMM_L1x1_SUB4 .LZTRMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 addic. L, L, -2 ble .LZTRMM_L1x1_LOOP_END .align 5 .LZTRMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 addic. L, L, -1 bgt .LZTRMM_L1x1_LOOP .LZTRMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_1 KERNEL1x1_E2 b .LZTRMM_L1x1_SUB1 .LZTRMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 KERNEL1x1_SUB1 b .LZTRMM_L1x1_SUB1 .LZTRMM_L1x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x1_SUBI1 addic. L, L, -1 ble .LZTRMM_L1x1_SAVE b .LZTRMM_L1x1_SUB2 .LZTRMM_L1x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L ble .LZTRMM_L1x1_SAVE .LZTRMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 bgt .LZTRMM_L1x1_SUB2 .LZTRMM_L1x1_SAVE: SAVE1x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub T1, K, KKK // K - KKK -> TEMP1 slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 add BO, BO, T2 // BO += TEMP2 * number of values in B shifted add AO, AO, T1 // AO += TEMP1 * number of values in A shifted #endif #if defined(LEFT) addi KK, KK, 1 // KK += Number of values in A #endif .LZTRMM_L1x1_END: #if !defined(LEFT) addi KK, KK, 1 // KK += Number of values in B #endif .LZTRMM_L1_END: OpenBLAS-0.2.20/kernel/power/ztrmm_macros_8x2_power8.S000066400000000000000000002652241313527062700224410ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2016/03/05 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK * LAPACK-TEST : OK **************************************************************************************/ #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define XSFADD_R1 xsadddp #define XSFADD_R2 xssubdp #define XSFADD_I1 xsadddp #define XSFADD_I2 xsadddp #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) #define XSFADD_R1 xsadddp #define XSFADD_R2 xsadddp #define XSFADD_I1 xssubdp #define XSFADD_I2 xsadddp #elif defined(NC) || defined(TC) || defined(NR) || defined(TR) #define XSFADD_R1 xsadddp #define XSFADD_R2 xsadddp #define XSFADD_I1 xsadddp #define XSFADD_I2 xssubdp #else // CC || CR || RC || RR #define XSFADD_R1 xsadddp #define XSFADD_R2 xssubdp #define XSFADD_I1 xssubdp #define XSFADD_I2 xssubdp #endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ .macro LOAD2x8_1 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B lxvdsx vs18, o16, BO // load real part from B lxvdsx vs19, o24, BO // load imag part from B addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs4, o0, AO // load real,imag from A lxvd2x vs5, o16, AO // load real,imag from A lxvd2x vs6, o32, AO // load real,imag from A lxvd2x vs7, o48, AO // load real,imag from A addi AO, AO, 64 .endm .macro KERNEL2x8_I1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A lxvd2x vs10, o32, AO // load real,imag from A lxvd2x vs11, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs12, o0, AO // load real,imag from A lxvd2x vs13, o16, AO // load real,imag from A lxvd2x vs14, o32, AO // load real,imag from A lxvd2x vs15, o48, AO // load real,imag from A addi AO, AO, 64 lxvdsx vs20, o0, BO // load real part from B lxvdsx vs21, o8, BO // load imag part from B lxvdsx vs22, o16, BO // load real part from B lxvdsx vs23, o24, 
BO // load imag part from B addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag xvmuldp vs36, vs2, vs16 // real*real, imag*real xvmuldp vs37, vs2, vs17 // real*imag, imag*imag xvmuldp vs38, vs3, vs16 // real*real, imag*real xvmuldp vs39, vs3, vs17 // real*imag, imag*imag xvmuldp vs40, vs4, vs16 // real*real, imag*real xvmuldp vs41, vs4, vs17 // real*imag, imag*imag xvmuldp vs42, vs5, vs16 // real*real, imag*real xvmuldp vs43, vs5, vs17 // real*imag, imag*imag xvmuldp vs44, vs6, vs16 // real*real, imag*real xvmuldp vs45, vs6, vs17 // real*imag, imag*imag xvmuldp vs46, vs7, vs16 // real*real, imag*real xvmuldp vs47, vs7, vs17 // real*imag, imag*imag xvmuldp vs48, vs0, vs18 // real*real, imag*real xvmuldp vs49, vs0, vs19 // real*imag, imag*imag xvmuldp vs50, vs1, vs18 // real*real, imag*real xvmuldp vs51, vs1, vs19 // real*imag, imag*imag xvmuldp vs52, vs2, vs18 // real*real, imag*real xvmuldp vs53, vs2, vs19 // real*imag, imag*imag xvmuldp vs54, vs3, vs18 // real*real, imag*real xvmuldp vs55, vs3, vs19 // real*imag, imag*imag xvmuldp vs56, vs4, vs18 // real*real, imag*real xvmuldp vs57, vs4, vs19 // real*imag, imag*imag xvmuldp vs58, vs5, vs18 // real*real, imag*real xvmuldp vs59, vs5, vs19 // real*imag, imag*imag xvmuldp vs60, vs6, vs18 // real*real, imag*real xvmuldp vs61, vs6, vs19 // real*imag, imag*imag xvmuldp vs62, vs7, vs18 // real*real, imag*real xvmuldp vs63, vs7, vs19 // real*imag, imag*imag .endm .macro KERNEL2x8_1 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag lxvdsx vs22, o16, BO // load real part from B lxvdsx vs23, o24, BO // load imag part from B xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs38, vs3, vs16 // real*real, imag*real xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A xvmaddadp vs40, vs4, vs16 // real*real, imag*real xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag xvmaddadp vs42, vs5, vs16 // real*real, imag*real xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag lxvd2x vs10, o32, AO // load real,imag from A lxvd2x vs11, o48, AO // load real,imag from A xvmaddadp vs44, vs6, vs16 // real*real, imag*real xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag addi AO, AO, 64 xvmaddadp vs46, vs7, vs16 // real*real, imag*real xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag xvmaddadp vs48, vs0, vs18 // real*real, imag*real xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag xvmaddadp vs50, vs1, vs18 // real*real, imag*real xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag lxvd2x vs12, o0, AO // load real,imag from A lxvd2x vs13, o16, AO // load real,imag from A xvmaddadp vs52, vs2, vs18 // real*real, imag*real xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag xvmaddadp vs54, vs3, vs18 // real*real, imag*real xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag lxvd2x vs14, o32, AO // load real,imag from A lxvd2x vs15, o48, AO // load real,imag from A xvmaddadp vs56, vs4, vs18 // real*real, imag*real xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag xvmaddadp vs58, vs5, vs18 // real*real, imag*real xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag lxvdsx vs20, o0, BO // load real part from B lxvdsx vs21, o8, BO // load imag 
part from B xvmaddadp vs60, vs6, vs18 // real*real, imag*real xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag xvmaddadp vs62, vs7, vs18 // real*real, imag*real xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag addi AO, AO, 64 addi BO, BO, 32 .endm .macro KERNEL2x8_2 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B xvmaddadp vs36, vs10, vs20 // real*real, imag*real xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A xvmaddadp vs40, vs12, vs20 // real*real, imag*real xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag xvmaddadp vs42, vs13, vs20 // real*real, imag*real xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A xvmaddadp vs44, vs14, vs20 // real*real, imag*real xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag xvmaddadp vs46, vs15, vs20 // real*real, imag*real xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag addi AO, AO, 64 xvmaddadp vs48, vs8, vs22 // real*real, imag*real xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag xvmaddadp vs50, vs9, vs22 // real*real, imag*real xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag lxvd2x vs4, o0, AO // load real,imag from A lxvd2x vs5, o16, AO // load real,imag from A xvmaddadp vs52, vs10, vs22 // real*real, imag*real xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag xvmaddadp vs54, vs11, vs22 // real*real, imag*real xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag lxvd2x vs6, o32, AO // load real,imag from A lxvd2x vs7, o48, AO // load real,imag from A xvmaddadp vs56, vs12, vs22 // real*real, imag*real xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag xvmaddadp vs58, vs13, vs22 // real*real, imag*real xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag lxvdsx vs18, o16, BO // load real part from B lxvdsx vs19, o24, BO // load imag part from B xvmaddadp vs60, vs14, vs22 // real*real, imag*real xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag xvmaddadp vs62, vs15, vs22 // real*real, imag*real xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag addi AO, AO, 64 addi BO, BO, 32 .endm .macro KERNEL2x8_E2 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag xvmaddadp vs36, vs10, vs20 // real*real, imag*real xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag xvmaddadp vs40, vs12, vs20 // real*real, imag*real xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag xvmaddadp vs42, vs13, vs20 // real*real, imag*real xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag xvmaddadp vs44, vs14, vs20 // real*real, imag*real xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag xvmaddadp vs46, vs15, vs20 // real*real, imag*real xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag xvmaddadp vs48, vs8, vs22 // real*real, imag*real xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag xvmaddadp vs50, vs9, vs22 // real*real, imag*real xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag xvmaddadp vs52, vs10, vs22 // 
real*real, imag*real xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag xvmaddadp vs54, vs11, vs22 // real*real, imag*real xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag xvmaddadp vs56, vs12, vs22 // real*real, imag*real xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag xvmaddadp vs58, vs13, vs22 // real*real, imag*real xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag xvmaddadp vs60, vs14, vs22 // real*real, imag*real xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag xvmaddadp vs62, vs15, vs22 // real*real, imag*real xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag .endm .macro KERNEL2x8_SUBI1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs4, o0, AO // load real,imag from A lxvd2x vs5, o16, AO // load real,imag from A lxvd2x vs6, o32, AO // load real,imag from A lxvd2x vs7, o48, AO // load real,imag from A addi AO, AO, 64 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B lxvdsx vs18, o16, BO // load real part from B lxvdsx vs19, o24, BO // load imag part from B addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag xvmuldp vs36, vs2, vs16 // real*real, imag*real xvmuldp vs37, vs2, vs17 // real*imag, imag*imag xvmuldp vs38, vs3, vs16 // real*real, imag*real xvmuldp vs39, vs3, vs17 // real*imag, imag*imag xvmuldp vs40, vs4, vs16 // real*real, imag*real xvmuldp vs41, vs4, vs17 // real*imag, imag*imag xvmuldp vs42, vs5, vs16 // real*real, imag*real xvmuldp vs43, vs5, vs17 // real*imag, imag*imag xvmuldp vs44, vs6, vs16 // real*real, imag*real xvmuldp vs45, vs6, vs17 // real*imag, imag*imag xvmuldp vs46, vs7, vs16 // real*real, imag*real xvmuldp vs47, vs7, vs17 // real*imag, imag*imag xvmuldp vs48, vs0, vs18 // real*real, imag*real xvmuldp vs49, vs0, vs19 // real*imag, imag*imag xvmuldp vs50, vs1, vs18 // real*real, imag*real xvmuldp vs51, vs1, vs19 // real*imag, imag*imag xvmuldp vs52, vs2, vs18 // real*real, imag*real xvmuldp vs53, vs2, vs19 // real*imag, imag*imag xvmuldp vs54, vs3, vs18 // real*real, imag*real xvmuldp vs55, vs3, vs19 // real*imag, imag*imag xvmuldp vs56, vs4, vs18 // real*real, imag*real xvmuldp vs57, vs4, vs19 // real*imag, imag*imag xvmuldp vs58, vs5, vs18 // real*real, imag*real xvmuldp vs59, vs5, vs19 // real*imag, imag*imag xvmuldp vs60, vs6, vs18 // real*real, imag*real xvmuldp vs61, vs6, vs19 // real*imag, imag*imag xvmuldp vs62, vs7, vs18 // real*real, imag*real xvmuldp vs63, vs7, vs19 // real*imag, imag*imag .endm .macro KERNEL2x8_SUB1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs4, o0, AO // load real,imag from A lxvd2x vs5, o16, AO // load real,imag from A lxvd2x vs6, o32, AO // load real,imag from A lxvd2x vs7, o48, AO // load real,imag from A addi AO, AO, 64 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B lxvdsx vs18, o16, BO // load real part from B lxvdsx vs19, o24, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag 
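// Register layout of this 2x8 micro-kernel, as read from the surrounding macros:
//   vs0..vs7   - eight complex elements of A, one {real,imag} pair per vector register
//   vs16,vs17  - splatted real and imag parts of B, column 0 (vs20..vs23 in the ping-pong copies)
//   vs18,vs19  - splatted real and imag parts of B, column 1
//   vs32..vs47 - accumulators for column 0: even registers collect A*B_real terms,
//                odd registers collect A*B_imag terms
//   vs48..vs63 - accumulators for column 1, laid out the same way
// SAVE2x8 later folds each even/odd accumulator pair into one complex value using the
// XSFADD_* operations chosen above for the conjugation variant, then scales by alpha.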
xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs38, vs3, vs16 // real*real, imag*real xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag xvmaddadp vs40, vs4, vs16 // real*real, imag*real xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag xvmaddadp vs42, vs5, vs16 // real*real, imag*real xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag xvmaddadp vs44, vs6, vs16 // real*real, imag*real xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag xvmaddadp vs46, vs7, vs16 // real*real, imag*real xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag xvmaddadp vs48, vs0, vs18 // real*real, imag*real xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag xvmaddadp vs50, vs1, vs18 // real*real, imag*real xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag xvmaddadp vs52, vs2, vs18 // real*real, imag*real xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag xvmaddadp vs54, vs3, vs18 // real*real, imag*real xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag xvmaddadp vs56, vs4, vs18 // real*real, imag*real xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag xvmaddadp vs58, vs5, vs18 // real*real, imag*real xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag xvmaddadp vs60, vs6, vs18 // real*real, imag*real xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag xvmaddadp vs62, vs7, vs18 // real*real, imag*real xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag .endm .macro SAVE2x8 mr T1, CO addi T2, T1, 64 #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 lxvd2x vs17, o16, T1 lxvd2x vs18, o32, T1 lxvd2x vs19, o48, T1 lxvd2x vs20, o0, T2 lxvd2x vs21, o16, T2 lxvd2x vs22, o32, T2 lxvd2x vs23, o48, T2 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs9, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, 
vs1, vs37 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs10, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs11, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs12, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs13, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs14, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, 
vs47 // imagA*imagB xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs15, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 xvadddp vs9, vs9, vs17 xvadddp vs10, vs10, vs18 xvadddp vs11, vs11, vs19 xvadddp vs12, vs12, vs20 xvadddp vs13, vs13, vs21 xvadddp vs14, vs14, vs22 xvadddp vs15, vs15, vs23 #endif stxvd2x vs8, o0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 stxvd2x vs12, o0, T2 stxvd2x vs13, o16, T2 stxvd2x vs14, o32, T2 stxvd2x vs15, o48, T2 add T1, T1, LDC add T2, T2, LDC #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 lxvd2x vs17, o16, T1 lxvd2x vs18, o32, T1 lxvd2x vs19, o48, T1 lxvd2x vs20, o0, T2 lxvd2x vs21, o16, T2 lxvd2x vs22, o32, T2 lxvd2x vs23, o48, T2 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs48 // realA*realB XSFADD_R2 vs0, vs0, vs49 // imagA*imagB xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs48 // realA*imagB XSFADD_I2 vs1, vs1, vs49 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs50 // realA*realB XSFADD_R2 vs0, vs0, vs51 // imagA*imagB xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs50 // realA*imagB XSFADD_I2 vs1, vs1, vs51 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs9, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs52 // realA*realB XSFADD_R2 vs0, vs0, vs53 // imagA*imagB xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs52 // realA*imagB XSFADD_I2 vs1, vs1, vs53 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs10, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> 
imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs54 // realA*realB XSFADD_R2 vs0, vs0, vs55 // imagA*imagB xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs54 // realA*imagB XSFADD_I2 vs1, vs1, vs55 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs11, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs57, vs57 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs56 // realA*realB XSFADD_R2 vs0, vs0, vs57 // imagA*imagB xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs56 // realA*imagB XSFADD_I2 vs1, vs1, vs57 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs12, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs58 // realA*realB XSFADD_R2 vs0, vs0, vs59 // imagA*imagB xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs58 // realA*imagB XSFADD_I2 vs1, vs1, vs59 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs13, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs60 // realA*realB XSFADD_R2 vs0, vs0, vs61 // imagA*imagB xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs60 // realA*imagB XSFADD_I2 vs1, vs1, vs61 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs14, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs62 // realA*realB XSFADD_R2 vs0, vs0, vs63 // imagA*imagB xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs62 // realA*imagB XSFADD_I2 vs1, vs1, vs63 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // 
real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs15, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 xvadddp vs9, vs9, vs17 xvadddp vs10, vs10, vs18 xvadddp vs11, vs11, vs19 xvadddp vs12, vs12, vs20 xvadddp vs13, vs13, vs21 xvadddp vs14, vs14, vs22 xvadddp vs15, vs15, vs23 #endif stxvd2x vs8, o0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 stxvd2x vs12, o0, T2 stxvd2x vs13, o16, T2 stxvd2x vs14, o32, T2 stxvd2x vs15, o48, T2 add T1, T1, LDC add T2, T2, LDC addi CO, CO, 128 .endm /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ .macro LOAD2x4_1 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B lxvdsx vs18, o16, BO // load real part from B lxvdsx vs19, o24, BO // load imag part from B addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 .endm .macro KERNEL2x4_I1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A lxvd2x vs10, o32, AO // load real,imag from A lxvd2x vs11, o48, AO // load real,imag from A addi AO, AO, 64 lxvdsx vs20, o0, BO // load real part from B lxvdsx vs21, o8, BO // load imag part from B lxvdsx vs22, o16, BO // load real part from B lxvdsx vs23, o24, BO // load imag part from B addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag xvmuldp vs36, vs2, vs16 // real*real, imag*real xvmuldp vs37, vs2, vs17 // real*imag, imag*imag xvmuldp vs38, vs3, vs16 // real*real, imag*real xvmuldp vs39, vs3, vs17 // real*imag, imag*imag xvmuldp vs40, vs0, vs18 // real*real, imag*real xvmuldp vs41, vs0, vs19 // real*imag, imag*imag xvmuldp vs42, vs1, vs18 // real*real, imag*real xvmuldp vs43, vs1, vs19 // real*imag, imag*imag xvmuldp vs44, vs2, vs18 // real*real, imag*real xvmuldp vs45, vs2, vs19 // real*imag, imag*imag xvmuldp vs46, vs3, vs18 // real*real, imag*real xvmuldp vs47, vs3, vs19 // real*imag, imag*imag .endm .macro KERNEL2x4_1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A lxvd2x vs10, o32, AO // load real,imag from A lxvd2x vs11, o48, AO // load real,imag from A addi AO, AO, 64 lxvdsx vs20, o0, BO // load real part from B lxvdsx vs21, o8, BO // load imag part from B lxvdsx vs22, o16, BO // load real part from B lxvdsx vs23, o24, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs38, vs3, vs16 // real*real, imag*real xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag xvmaddadp vs40, vs0, vs18 // real*real, imag*real xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag xvmaddadp vs42, vs1, vs18 // real*real, imag*real xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag xvmaddadp vs44, vs2, vs18 // real*real, imag*real xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag xvmaddadp vs46, vs3, vs18 // 
real*real, imag*real xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag .endm .macro KERNEL2x4_2 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B lxvdsx vs18, o16, BO // load real part from B lxvdsx vs19, o24, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag xvmaddadp vs36, vs10, vs20 // real*real, imag*real xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag xvmaddadp vs40, vs8, vs22 // real*real, imag*real xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag xvmaddadp vs42, vs9, vs22 // real*real, imag*real xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag xvmaddadp vs44, vs10, vs22 // real*real, imag*real xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag xvmaddadp vs46, vs11, vs22 // real*real, imag*real xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag .endm .macro KERNEL2x4_E2 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag xvmaddadp vs36, vs10, vs20 // real*real, imag*real xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag xvmaddadp vs40, vs8, vs22 // real*real, imag*real xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag xvmaddadp vs42, vs9, vs22 // real*real, imag*real xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag xvmaddadp vs44, vs10, vs22 // real*real, imag*real xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag xvmaddadp vs46, vs11, vs22 // real*real, imag*real xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag .endm .macro KERNEL2x4_SUBI1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B lxvdsx vs18, o16, BO // load real part from B lxvdsx vs19, o24, BO // load imag part from B addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag xvmuldp vs36, vs2, vs16 // real*real, imag*real xvmuldp vs37, vs2, vs17 // real*imag, imag*imag xvmuldp vs38, vs3, vs16 // real*real, imag*real xvmuldp vs39, vs3, vs17 // real*imag, imag*imag xvmuldp vs40, vs0, vs18 // real*real, imag*real xvmuldp vs41, vs0, vs19 // real*imag, imag*imag xvmuldp vs42, vs1, vs18 // real*real, imag*real xvmuldp vs43, vs1, vs19 // real*imag, imag*imag xvmuldp vs44, vs2, vs18 // real*real, imag*real xvmuldp vs45, vs2, vs19 // real*imag, imag*imag xvmuldp vs46, vs3, vs18 // real*real, imag*real xvmuldp vs47, vs3, vs19 // real*imag, imag*imag .endm .macro KERNEL2x4_SUB1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvdsx 
vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B lxvdsx vs18, o16, BO // load real part from B lxvdsx vs19, o24, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs38, vs3, vs16 // real*real, imag*real xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag xvmaddadp vs40, vs0, vs18 // real*real, imag*real xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag xvmaddadp vs42, vs1, vs18 // real*real, imag*real xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag xvmaddadp vs44, vs2, vs18 // real*real, imag*real xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag xvmaddadp vs46, vs3, vs18 // real*real, imag*real xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag .endm .macro SAVE2x4 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 lxvd2x vs17, o16, T1 lxvd2x vs18, o32, T1 lxvd2x vs19, o48, T1 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs9, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs10, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB 
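// Per-element reduction pattern repeated throughout the SAVE macros (descriptive summary):
//   vs0 <- realA*realB combined with imagA*imagB   (real part; sign chosen by XSFADD_R1/R2)
//   vs1 <- realA*imagB combined with imagA*realB   (imag part; sign chosen by XSFADD_I1/I2)
// followed by complex scaling with alpha:
//   result_r = vs0*alpha_r - vs1*alpha_i
//   result_i = vs0*alpha_i + vs1*alpha_r
// and an xxpermdi to pack {result_r, result_i} back into one vector register before the store.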
XSFADD_R2 vs0, vs0, vs39 // imagA*imagB xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs11, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 xvadddp vs9, vs9, vs17 xvadddp vs10, vs10, vs18 xvadddp vs11, vs11, vs19 #endif stxvd2x vs8, o0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 lxvd2x vs17, o16, T1 lxvd2x vs18, o32, T1 lxvd2x vs19, o48, T1 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs9, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs10, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 
vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs11, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 xvadddp vs9, vs9, vs17 xvadddp vs10, vs10, vs18 xvadddp vs11, vs11, vs19 #endif stxvd2x vs8, o0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 add T1, T1, LDC addi CO, CO, 64 .endm /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ .macro LOAD2x2_1 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B lxvdsx vs18, o16, BO // load real part from B lxvdsx vs19, o24, BO // load imag part from B addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A addi AO, AO, 32 .endm .macro KERNEL2x2_I1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A addi AO, AO, 32 lxvdsx vs20, o0, BO // load real part from B lxvdsx vs21, o8, BO // load imag part from B lxvdsx vs22, o16, BO // load real part from B lxvdsx vs23, o24, BO // load imag part from B addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag xvmuldp vs36, vs0, vs18 // real*real, imag*real xvmuldp vs37, vs0, vs19 // real*imag, imag*imag xvmuldp vs38, vs1, vs18 // real*real, imag*real xvmuldp vs39, vs1, vs19 // real*imag, imag*imag .endm .macro KERNEL2x2_1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A addi AO, AO, 32 lxvdsx vs20, o0, BO // load real part from B lxvdsx vs21, o8, BO // load imag part from B lxvdsx vs22, o16, BO // load real part from B lxvdsx vs23, o24, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag xvmaddadp vs36, vs0, vs18 // real*real, imag*real xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag xvmaddadp vs38, vs1, vs18 // real*real, imag*real xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag .endm .macro KERNEL2x2_2 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A addi AO, AO, 32 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B lxvdsx vs18, o16, BO // load real part from B lxvdsx vs19, o24, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag xvmaddadp vs36, vs8, vs22 // real*real, imag*real xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag xvmaddadp vs38, vs9, vs22 // real*real, imag*real xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag .endm .macro KERNEL2x2_E2 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // 
real*imag, imag*imag xvmaddadp vs36, vs8, vs22 // real*real, imag*real xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag xvmaddadp vs38, vs9, vs22 // real*real, imag*real xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag .endm .macro KERNEL2x2_SUBI1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A addi AO, AO, 32 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B lxvdsx vs18, o16, BO // load real part from B lxvdsx vs19, o24, BO // load imag part from B addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag xvmuldp vs36, vs0, vs18 // real*real, imag*real xvmuldp vs37, vs0, vs19 // real*imag, imag*imag xvmuldp vs38, vs1, vs18 // real*real, imag*real xvmuldp vs39, vs1, vs19 // real*imag, imag*imag .endm .macro KERNEL2x2_SUB1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A addi AO, AO, 32 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B lxvdsx vs18, o16, BO // load real part from B lxvdsx vs19, o24, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag xvmaddadp vs36, vs0, vs18 // real*real, imag*real xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag xvmaddadp vs38, vs1, vs18 // real*real, imag*real xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag .endm .macro SAVE2x2 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 lxvd2x vs17, o16, T1 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs9, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 xvadddp vs9, vs9, vs17 #endif stxvd2x vs8, o0, T1 stxvd2x vs9, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 lxvd2x vs17, o16, T1 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs37, vs37 // realA*imagB, 
imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs9, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 xvadddp vs9, vs9, vs17 #endif stxvd2x vs8, o0, T1 stxvd2x vs9, o16, T1 add T1, T1, LDC addi CO, CO, 32 .endm /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ .macro LOAD2x1_1 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B lxvdsx vs18, o16, BO // load real part from B lxvdsx vs19, o24, BO // load imag part from B addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A addi AO, AO, 16 .endm .macro KERNEL2x1_I1 lxvd2x vs8, o0, AO // load real,imag from A addi AO, AO, 16 lxvdsx vs20, o0, BO // load real part from B lxvdsx vs21, o8, BO // load imag part from B lxvdsx vs22, o16, BO // load real part from B lxvdsx vs23, o24, BO // load imag part from B addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs0, vs18 // real*real, imag*real xvmuldp vs35, vs0, vs19 // real*imag, imag*imag .endm .macro KERNEL2x1_1 lxvd2x vs8, o0, AO // load real,imag from A addi AO, AO, 16 lxvdsx vs20, o0, BO // load real part from B lxvdsx vs21, o8, BO // load imag part from B lxvdsx vs22, o16, BO // load real part from B lxvdsx vs23, o24, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs0, vs18 // real*real, imag*real xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag .endm .macro KERNEL2x1_2 lxvd2x vs0, o0, AO // load real,imag from A addi AO, AO, 16 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B lxvdsx vs18, o16, BO // load real part from B lxvdsx vs19, o24, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs8, vs22 // real*real, imag*real xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag .endm .macro KERNEL2x1_E2 xvmaddadp 
vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs8, vs22 // real*real, imag*real xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag .endm .macro KERNEL2x1_SUBI1 lxvd2x vs0, o0, AO // load real,imag from A addi AO, AO, 16 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B lxvdsx vs18, o16, BO // load real part from B lxvdsx vs19, o24, BO // load imag part from B addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs0, vs18 // real*real, imag*real xvmuldp vs35, vs0, vs19 // real*imag, imag*imag .endm .macro KERNEL2x1_SUB1 lxvd2x vs0, o0, AO // load real,imag from A addi AO, AO, 16 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B lxvdsx vs18, o16, BO // load real part from B lxvdsx vs19, o24, BO // load imag part from B addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs0, vs18 // real*real, imag*real xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag .endm .macro SAVE2x1 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 #endif stxvd2x vs8, o0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 #endif stxvd2x vs8, o0, T1 add T1, T1, LDC addi CO, CO, 16 .endm /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ .macro LOAD1x8_1 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B addi BO, BO, 16 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs4, o0, AO // load real,imag 
from A lxvd2x vs5, o16, AO // load real,imag from A lxvd2x vs6, o32, AO // load real,imag from A lxvd2x vs7, o48, AO // load real,imag from A addi AO, AO, 64 .endm .macro KERNEL1x8_I1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A lxvd2x vs10, o32, AO // load real,imag from A lxvd2x vs11, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs12, o0, AO // load real,imag from A lxvd2x vs13, o16, AO // load real,imag from A lxvd2x vs14, o32, AO // load real,imag from A lxvd2x vs15, o48, AO // load real,imag from A addi AO, AO, 64 lxvdsx vs20, o0, BO // load real part from B lxvdsx vs21, o8, BO // load imag part from B addi BO, BO, 16 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag xvmuldp vs36, vs2, vs16 // real*real, imag*real xvmuldp vs37, vs2, vs17 // real*imag, imag*imag xvmuldp vs38, vs3, vs16 // real*real, imag*real xvmuldp vs39, vs3, vs17 // real*imag, imag*imag xvmuldp vs40, vs4, vs16 // real*real, imag*real xvmuldp vs41, vs4, vs17 // real*imag, imag*imag xvmuldp vs42, vs5, vs16 // real*real, imag*real xvmuldp vs43, vs5, vs17 // real*imag, imag*imag xvmuldp vs44, vs6, vs16 // real*real, imag*real xvmuldp vs45, vs6, vs17 // real*imag, imag*imag xvmuldp vs46, vs7, vs16 // real*real, imag*real xvmuldp vs47, vs7, vs17 // real*imag, imag*imag .endm .macro KERNEL1x8_1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A lxvd2x vs10, o32, AO // load real,imag from A lxvd2x vs11, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs12, o0, AO // load real,imag from A lxvd2x vs13, o16, AO // load real,imag from A lxvd2x vs14, o32, AO // load real,imag from A lxvd2x vs15, o48, AO // load real,imag from A addi AO, AO, 64 lxvdsx vs20, o0, BO // load real part from B lxvdsx vs21, o8, BO // load imag part from B addi BO, BO, 16 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs38, vs3, vs16 // real*real, imag*real xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag xvmaddadp vs40, vs4, vs16 // real*real, imag*real xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag xvmaddadp vs42, vs5, vs16 // real*real, imag*real xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag xvmaddadp vs44, vs6, vs16 // real*real, imag*real xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag xvmaddadp vs46, vs7, vs16 // real*real, imag*real xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag .endm .macro KERNEL1x8_2 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs4, o0, AO // load real,imag from A lxvd2x vs5, o16, AO // load real,imag from A lxvd2x vs6, o32, AO // load real,imag from A lxvd2x vs7, o48, AO // load real,imag from A addi AO, AO, 64 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B addi BO, BO, 16 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag xvmaddadp vs36, vs10, vs20 // real*real, imag*real 
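/*
 * KERNEL1x8_1 and KERNEL1x8_2 form a two-stage software pipeline:
 * the _1 step runs the multiply-adds on the A values in vs0-vs7 (with B
 * splatted into vs16/vs17) while loading the next A/B set into vs8-vs15
 * and vs20/vs21; this _2 step does the opposite, consuming vs8-vs15 and
 * vs20/vs21 while refilling vs0-vs7 and vs16/vs17.  KERNEL1x8_E2 drains
 * the last vs8-vs15 set without issuing further loads, and the _SUBI1 /
 * _SUB1 variants (xvmuldp to initialise, xvmaddadp to accumulate) serve
 * the iterations that do not go through the pipelined path.
 */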
xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag xvmaddadp vs40, vs12, vs20 // real*real, imag*real xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag xvmaddadp vs42, vs13, vs20 // real*real, imag*real xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag xvmaddadp vs44, vs14, vs20 // real*real, imag*real xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag xvmaddadp vs46, vs15, vs20 // real*real, imag*real xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag .endm .macro KERNEL1x8_E2 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag xvmaddadp vs36, vs10, vs20 // real*real, imag*real xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag xvmaddadp vs40, vs12, vs20 // real*real, imag*real xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag xvmaddadp vs42, vs13, vs20 // real*real, imag*real xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag xvmaddadp vs44, vs14, vs20 // real*real, imag*real xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag xvmaddadp vs46, vs15, vs20 // real*real, imag*real xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag .endm .macro KERNEL1x8_SUBI1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs4, o0, AO // load real,imag from A lxvd2x vs5, o16, AO // load real,imag from A lxvd2x vs6, o32, AO // load real,imag from A lxvd2x vs7, o48, AO // load real,imag from A addi AO, AO, 64 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B addi BO, BO, 16 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag xvmuldp vs36, vs2, vs16 // real*real, imag*real xvmuldp vs37, vs2, vs17 // real*imag, imag*imag xvmuldp vs38, vs3, vs16 // real*real, imag*real xvmuldp vs39, vs3, vs17 // real*imag, imag*imag xvmuldp vs40, vs4, vs16 // real*real, imag*real xvmuldp vs41, vs4, vs17 // real*imag, imag*imag xvmuldp vs42, vs5, vs16 // real*real, imag*real xvmuldp vs43, vs5, vs17 // real*imag, imag*imag xvmuldp vs44, vs6, vs16 // real*real, imag*real xvmuldp vs45, vs6, vs17 // real*imag, imag*imag xvmuldp vs46, vs7, vs16 // real*real, imag*real xvmuldp vs47, vs7, vs17 // real*imag, imag*imag .endm .macro KERNEL1x8_SUB1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvd2x vs4, o0, AO // load real,imag from A lxvd2x vs5, o16, AO // load real,imag from A lxvd2x vs6, o32, AO // load real,imag from A lxvd2x vs7, o48, AO // load real,imag from A addi AO, AO, 64 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B addi BO, BO, 16 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs38, vs3, vs16 // 
real*real, imag*real xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag xvmaddadp vs40, vs4, vs16 // real*real, imag*real xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag xvmaddadp vs42, vs5, vs16 // real*real, imag*real xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag xvmaddadp vs44, vs6, vs16 // real*real, imag*real xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag xvmaddadp vs46, vs7, vs16 // real*real, imag*real xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag .endm .macro SAVE1x8 mr T1, CO addi T2, T1, 64 #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 lxvd2x vs17, o16, T1 lxvd2x vs18, o32, T1 lxvd2x vs19, o48, T1 lxvd2x vs20, o0, T2 lxvd2x vs21, o16, T2 lxvd2x vs22, o32, T2 lxvd2x vs23, o48, T2 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs9, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs10, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r 
// imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs11, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs12, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs13, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs14, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs15, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 xvadddp vs9, vs9, vs17 xvadddp vs10, vs10, vs18 xvadddp vs11, vs11, vs19 xvadddp vs12, vs12, vs20 xvadddp vs13, vs13, vs21 xvadddp vs14, vs14, vs22 xvadddp vs15, vs15, vs23 #endif stxvd2x vs8, o0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 stxvd2x vs12, o0, T2 stxvd2x vs13, o16, T2 
stxvd2x vs14, o32, T2 stxvd2x vs15, o48, T2 add T1, T1, LDC add T2, T2, LDC addi CO, CO, 128 .endm /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ .macro LOAD1x4_1 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B addi BO, BO, 16 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 .endm .macro KERNEL1x4_I1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A lxvd2x vs10, o32, AO // load real,imag from A lxvd2x vs11, o48, AO // load real,imag from A addi AO, AO, 64 lxvdsx vs20, o0, BO // load real part from B lxvdsx vs21, o8, BO // load imag part from B addi BO, BO, 16 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag xvmuldp vs36, vs2, vs16 // real*real, imag*real xvmuldp vs37, vs2, vs17 // real*imag, imag*imag xvmuldp vs38, vs3, vs16 // real*real, imag*real xvmuldp vs39, vs3, vs17 // real*imag, imag*imag .endm .macro KERNEL1x4_1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A lxvd2x vs10, o32, AO // load real,imag from A lxvd2x vs11, o48, AO // load real,imag from A addi AO, AO, 64 lxvdsx vs20, o0, BO // load real part from B lxvdsx vs21, o8, BO // load imag part from B addi BO, BO, 16 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs38, vs3, vs16 // real*real, imag*real xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag .endm .macro KERNEL1x4_2 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B addi BO, BO, 16 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag xvmaddadp vs36, vs10, vs20 // real*real, imag*real xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag .endm .macro KERNEL1x4_E2 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag xvmaddadp vs36, vs10, vs20 // real*real, imag*real xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag .endm .macro KERNEL1x4_SUBI1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B addi 
BO, BO, 16 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag xvmuldp vs36, vs2, vs16 // real*real, imag*real xvmuldp vs37, vs2, vs17 // real*imag, imag*imag xvmuldp vs38, vs3, vs16 // real*real, imag*real xvmuldp vs39, vs3, vs17 // real*imag, imag*imag .endm .macro KERNEL1x4_SUB1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A lxvd2x vs2, o32, AO // load real,imag from A lxvd2x vs3, o48, AO // load real,imag from A addi AO, AO, 64 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B addi BO, BO, 16 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs38, vs3, vs16 // real*real, imag*real xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag .endm .macro SAVE1x4 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 lxvd2x vs17, o16, T1 lxvd2x vs18, o32, T1 lxvd2x vs19, o48, T1 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs9, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs10, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 
xxlxor vs1, vs1, vs1 xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs11, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 xvadddp vs9, vs9, vs17 xvadddp vs10, vs10, vs18 xvadddp vs11, vs11, vs19 #endif stxvd2x vs8, o0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 add T1, T1, LDC addi CO, CO, 64 .endm /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ .macro LOAD1x2_1 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B addi BO, BO, 16 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A addi AO, AO, 32 .endm .macro KERNEL1x2_I1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A addi AO, AO, 32 lxvdsx vs20, o0, BO // load real part from B lxvdsx vs21, o8, BO // load imag part from B addi BO, BO, 16 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag .endm .macro KERNEL1x2_1 lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A addi AO, AO, 32 lxvdsx vs20, o0, BO // load real part from B lxvdsx vs21, o8, BO // load imag part from B addi BO, BO, 16 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag .endm .macro KERNEL1x2_2 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A addi AO, AO, 32 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B addi BO, BO, 16 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag .endm .macro KERNEL1x2_E2 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag .endm .macro KERNEL1x2_SUBI1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A addi AO, AO, 32 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B addi BO, BO, 16 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag xvmuldp vs34, vs1, vs16 // real*real, imag*real xvmuldp vs35, vs1, vs17 // real*imag, imag*imag .endm .macro KERNEL1x2_SUB1 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A addi AO, AO, 32 lxvdsx vs16, o0, BO // load real part 
from B lxvdsx vs17, o8, BO // load imag part from B addi BO, BO, 16 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag .endm .macro SAVE1x2 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 lxvd2x vs17, o16, T1 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs9, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 xvadddp vs9, vs9, vs17 #endif stxvd2x vs8, o0, T1 stxvd2x vs9, o16, T1 add T1, T1, LDC addi CO, CO, 32 .endm /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ .macro LOAD1x1_1 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B addi BO, BO, 16 lxvd2x vs0, o0, AO // load real,imag from A addi AO, AO, 16 .endm .macro KERNEL1x1_I1 lxvd2x vs8, o0, AO // load real,imag from A addi AO, AO, 16 lxvdsx vs20, o0, BO // load real part from B lxvdsx vs21, o8, BO // load imag part from B addi BO, BO, 16 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag .endm .macro KERNEL1x1_1 lxvd2x vs8, o0, AO // load real,imag from A addi AO, AO, 16 lxvdsx vs20, o0, BO // load real part from B lxvdsx vs21, o8, BO // load imag part from B addi BO, BO, 16 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag .endm .macro KERNEL1x1_2 lxvd2x vs0, o0, AO // load real,imag from A addi AO, AO, 16 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B addi BO, BO, 16 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag .endm .macro KERNEL1x1_E2 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag .endm .macro KERNEL1x1_SUBI1 lxvd2x vs0, o0, AO // load real,imag from A addi AO, AO, 16 lxvdsx vs16, o0, BO // load real part from 
B lxvdsx vs17, o8, BO // load imag part from B addi BO, BO, 16 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag .endm .macro KERNEL1x1_SUB1 lxvd2x vs0, o0, AO // load real,imag from A addi AO, AO, 16 lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B addi BO, BO, 16 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag .endm .macro SAVE1x1 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs16, o0, T1 #endif xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB xsmuldp vs4, vs0, alpha_r // real*alpha_r xsmuldp vs5, vs1, alpha_i // imag*alpha_i xsmuldp vs6, vs0, alpha_i // real*alpha_i xsmuldp vs7, vs1, alpha_r // imag*alpha_r xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r xxpermdi vs8, vs2, vs3, 0 // merge real and imag part #ifndef TRMMKERNEL xvadddp vs8, vs8, vs16 #endif stxvd2x vs8, o0, T1 add T1, T1, LDC addi CO, CO, 16 .endm OpenBLAS-0.2.20/kernel/power/ztrsm_kernel_LN.S000066400000000000000000001151351313527062700210220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA_R 296(SP) #define ALPHA_I 304(SP) #define FZERO 312(SP) #else #define STACKSIZE 256 #define ALPHA_R 224(SP) #define ALPHA_I 232(SP) #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #define OFFSET r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #define AORIG r21 #define TEMP r22 #define KK r23 #define I r24 #define J r25 #define AO r26 #define BO r27 #define CO1 r28 #define CO2 r29 #define PREA r30 #define PREC r31 #define PREB PREA #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) #endif stw r0, FZERO #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif slwi LDC, LDC, ZBASE_SHIFT #ifdef LN mullw r0, M, K slwi r0, r0, ZBASE_SHIFT add A, A, r0 slwi r0, M, ZBASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, ZBASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) #ifndef PREFETCHTEST #ifdef LN li PREC, -4 * SIZE #else li PREC, 4 * SIZE #endif #else #ifdef linux #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else ld PREA, FRAMESLOT(3) + STACKSIZE(SP) ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld PREA, FRAMESLOT(3) + STACKSIZE(SP) ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #else #ifdef DOUBLE lwz PREA, FRAMESLOT(4) + STACKSIZE(SP) lwz PREC, FRAMESLOT(5) + STACKSIZE(SP) #else lwz PREA, FRAMESLOT(3) + STACKSIZE(SP) lwz PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #endif #endif 
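/*
 * PREA, PREB and PREC are prefetch distances (byte offsets) for the dcbt
 * hints: PREC is applied to the C pointers at the start of each tile and
 * is negative for LN, presumably because that path steps through C
 * backwards, while PREA/PREB are the look-ahead distances used on the A
 * and B panels inside the inner kernels.  When PREFETCHTEST is defined
 * the distances come from extra stack arguments (loaded above); otherwise
 * they are fixed per core (PPC970 / POWER4 / POWER5) and differ when huge
 * pages are in use (ALLOC_HUGETLB).
 */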
#ifndef PREFETCHTEST #ifdef PPC970 #ifdef ALLOC_HUGETLB li PREA, (16 * 5 * SIZE + 16) #else li PREA, (16 * 9 * SIZE + 16) #endif #endif #ifdef POWER4 #ifdef ALLOC_HUGETLB li PREA, (16 * 1 * SIZE + 16) #else li PREA, (16 * 2 * SIZE + 16) #endif #endif #ifdef POWER5 #ifdef ALLOC_HUGETLB li PREA, (16 * 7 * SIZE | 1) li PREB, (16 * 7 * SIZE | 3) #else li PREA, (16 * 12 * SIZE | 1) li PREB, (16 * 6 * SIZE | 3) #endif #endif #endif srawi. J, N, 1 ble LL(30) .align 4 LL(10): #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif LL(20): andi. I, M, 1 ble LL(09) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(25) .align 4 LL(22): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi BO, BO, 16 * SIZE addi AO, AO, 8 * SIZE bdnz LL(22) .align 4 LL(25): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(27) .align 4 LL(26): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE bdnz LL(26) .align 4 LL(27): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + ZBASE_SHIFT slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f20, f2 FADD f3, f21, f3 #endif #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f3 FMUL f13, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f12 FMADD f3, f20, f3, f13 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f12 FMSUB f3, f20, f3, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f3 FMUL f13, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f9 FMUL f13, f17, f8 #ifndef CONJ FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 
* SIZE(CO2) #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 LL(09): srawi. I, M, 1 ble LL(29) .align 4 LL(11): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 dcbt CO1, PREC dcbt CO2, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 dcbt CO1, PREC dcbt CO2, PREC srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(15) .align 4 LL(12): fmadd f0, f16, f20, f0 fmadd f5, f17, f21, f5 fmadd f10, f18, f22, f10 fmadd f15, f19, f23, f15 LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) LFD f31, 7 * SIZE(BO) fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 fmadd f4, f16, f21, f4 LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) fmadd f6, f18, f21, f6 fmadd f7, f19, f21, f7 fmadd f8, f16, f22, f8 fmadd f9, f17, f22, f9 fmadd f11, f19, f22, f11 fmadd f12, f16, f23, f12 fmadd f13, f17, f23, f13 fmadd f14, f18, f23, f14 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) fmadd f0, f24, f28, f0 fmadd f5, f25, f29, f5 fmadd f10, f26, f30, f10 fmadd f15, f27, f31, f15 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) fmadd f1, f25, f28, f1 fmadd f2, f26, f28, f2 fmadd f3, f27, f28, f3 fmadd f4, f24, f29, f4 fmadd f6, f26, f29, f6 fmadd f7, f27, f29, f7 fmadd f8, f24, f30, f8 fmadd f9, f25, f30, f9 fmadd f11, f27, f30, f11 fmadd f12, f24, f31, f12 fmadd f13, f25, f31, f13 fmadd f14, f26, f31, f14 LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f5, f17, f21, f5 fmadd f10, f18, f22, f10 fmadd f15, f19, f23, f15 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 fmadd f4, f16, f21, f4 fmadd f6, f18, f21, f6 fmadd f7, f19, f21, f7 fmadd f8, f16, f22, f8 fmadd f9, f17, f22, f9 fmadd f11, f19, f22, f11 fmadd f12, f16, f23, f12 fmadd f13, f17, f23, f13 fmadd f14, f18, f23, f14 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) fmadd f0, f24, f28, f0 fmadd f5, f25, f29, f5 fmadd f10, f26, f30, f10 fmadd f15, f27, f31, f15 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) fmadd f1, f25, f28, f1 fmadd f2, f26, f28, f2 fmadd f3, f27, f28, f3 fmadd f4, f24, f29, f4 fmadd f6, 
f26, f29, f6 fmadd f7, f27, f29, f7 fmadd f8, f24, f30, f8 fmadd f9, f25, f30, f9 fmadd f11, f27, f30, f11 fmadd f12, f24, f31, f12 fmadd f13, f25, f31, f13 fmadd f14, f26, f31, f14 addi AO, AO, 16 * SIZE addi BO, BO, 16 * SIZE #ifdef PPC970 #ifndef ALLOC_HUGETLB DCBT(AO, PREA) #endif DCBT(BO, PREB) #endif #ifdef POWER4 #ifndef ALLOC_HUGETLB DCBT(AO, PREA) #endif DCBT(BO, PREB) #endif #ifdef POWER5 DCBT(AO, PREA) DCBT(BO, PREB) #endif bdnz LL(12) .align 4 LL(15): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(KERNEL_MainFinish) .align 4 LL(16): fmadd f0, f16, f20, f0 fmadd f5, f17, f21, f5 fmadd f10, f18, f22, f10 fmadd f15, f19, f23, f15 fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 fmadd f4, f16, f21, f4 fmadd f6, f18, f21, f6 fmadd f7, f19, f21, f7 fmadd f8, f16, f22, f8 fmadd f9, f17, f22, f9 fmadd f11, f19, f22, f11 fmadd f12, f16, f23, f12 fmadd f13, f17, f23, f13 fmadd f14, f18, f23, f14 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(16) .align 4 LL(KERNEL_MainFinish): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 FSUB f8, f8, f13 FADD f9, f9, f12 FSUB f10, f10, f15 FADD f11, f11, f14 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 FADD f8, f8, f13 FSUB f9, f12, f9 FADD f10, f10, f15 FSUB f11, f14, f11 #endif #if defined(LN) || defined(RT) subi r0, KK, 2 slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f8, f18, f8 FSUB f9, f19, f9 FSUB f2, f20, f2 FSUB f3, f21, f3 FSUB f10, f22, f10 FSUB f11, f23, f11 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f8, f20, f8 FSUB f9, f21, f9 FSUB f10, f22, f10 FSUB f11, f23, f11 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 FSUB f8, f20, f8 FADD f9, f21, f9 FSUB f10, f22, f10 FADD f11, f23, f11 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FMADD f8, f19, f11, f8 FNMSUB f9, f19, f10, f9 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FNMSUB f8, f18, f10, f8 FNMSUB f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f8, f20, f8, f12 FMADD f9, f20, f9, f13 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FMSUB f8, f19, f11, f8 FNMADD f9, f19, f10, f9 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FNMADD f8, f18, f10, f8 FNMADD f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMADD f0, f20, f0, f4 FMSUB 
f1, f20, f1, f5 FMADD f8, f20, f8, f12 FMSUB f9, f20, f9, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f9 FMUL f13, f17, f8 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FMADD f10, f19, f9, f10 FNMSUB f11, f19, f8, f11 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FNMSUB f10, f18, f8, f10 FNMSUB f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 FMSUB f10, f20, f10, f12 FMADD f11, f20, f11, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FMSUB f10, f19, f9, f10 FNMADD f11, f19, f8, f11 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FNMADD f10, f18, f8, f10 FNMADD f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 FMADD f10, f20, f10, f12 FMSUB f11, f20, f11, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f8, f19, f1, f8 FNMSUB f9, f19, f0, f9 FMADD f10, f19, f3, f10 FNMSUB f11, f19, f2, f11 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMSUB f8, f20, f8, f4 FMADD f9, f20, f9, f5 FMSUB f10, f20, f10, f6 FMADD f11, f20, f11, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f8, f19, f1, f8 FNMADD f9, f19, f0, f9 FMSUB f10, f19, f3, f10 FNMADD f11, f19, f2, f11 FNMADD f8, f18, f0, f8 FNMADD f9, f18, f1, f9 FNMADD f10, f18, f2, f10 FNMADD f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMADD f8, f20, f8, f4 FMSUB f9, f20, f9, f5 FMADD f10, f20, f10, f6 FMSUB f11, f20, f11, f7 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f9 FMUL f13, f17, f8 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f9, f0 FNMSUB f1, f19, f8, f1 FMADD f2, f19, f11, f2 FNMSUB f3, f19, f10, f3 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f9, f0 FNMADD f1, f19, f8, f1 FMSUB f2, f19, f11, f2 FNMADD f3, f19, f10, f3 FNMADD f0, f18, f8, f0 FNMADD f1, f18, f9, f1 FNMADD f2, f18, f10, f2 FNMADD f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || 
defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f9, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f3, 5 * SIZE(BO) STFD f10, 6 * SIZE(BO) STFD f11, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f10, 6 * SIZE(AO) STFD f11, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f8, 0 * SIZE(CO2) STFD f9, 1 * SIZE(CO2) STFD f10, 2 * SIZE(CO2) STFD f11, 3 * SIZE(CO2) #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt LL(11) .align 4 LL(29): #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif addic. J, J, -1 bgt LL(10) .align 4 LL(30): andi. J, N, 1 ble LL(999) #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, C, LDC #endif andi. I, M, 1 ble LL(40) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(45) .align 4 LL(42): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) fmadd f4, f18, f22, f4 fmadd f5, f19, f23, f5 fmadd f6, f19, f22, f6 fmadd f7, f18, f23, f7 LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) fmadd f4, f18, f22, f4 fmadd f5, f19, f23, f5 fmadd f6, f19, f22, f6 fmadd f7, f18, f23, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(42) .align 4 LL(45): fadd f0, f0, f4 fadd f1, f1, f5 fadd f2, f2, f6 fadd f3, f3, f7 #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR,r0 ble LL(47) .align 4 LL(46): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 2 * SIZE bdnz LL(46) .align 4 LL(47): #ifndef CONJ FSUB f0, f0, f1 FADD f1, f2, f3 #else FADD f0, f0, f1 FSUB f1, f3, f2 #endif #if defined(LN) || defined(RT) subi r0, KK, 1 slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 #else FSUB f0, f16, f0 FADD f1, f17, f1 #endif #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 LL(40): srawi. I, M, 1 ble LL(49) .align 4 LL(31): #if defined(LT) || defined(RN) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 dcbt CO1, PREC srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(35) .align 4 LL(32): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 8 * SIZE(AO) LFD f21, 9 * SIZE(AO) LFD f22, 10 * SIZE(AO) LFD f23, 11 * SIZE(AO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) LFD f16, 4 * SIZE(BO) LFD f17, 5 * SIZE(BO) LFD f18, 6 * SIZE(BO) LFD f19, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 16 * SIZE(AO) LFD f21, 17 * SIZE(AO) LFD f22, 18 * SIZE(AO) LFD f23, 19 * SIZE(AO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 20 * SIZE(AO) LFD f25, 21 * SIZE(AO) LFD f26, 22 * SIZE(AO) LFD f27, 23 * SIZE(AO) LFD f16, 8 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 10 * SIZE(BO) LFD f19, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE dcbt PREA, AO dcbt PREA, BO bdnz LL(32) .align 4 LL(35): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(37) .align 4 LL(36): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f16, 2 * SIZE(BO) LFD f17, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(36) .align 4 LL(37): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + ZBASE_SHIFT slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 
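/*
   The #ifndef CONJ / #else pair that follows performs one substitution
   step on the packed 2x2 triangular block in complex arithmetic.  A
   minimal sketch, assuming (as is usual for these trsm kernels) that the
   packed diagonal entries already hold their reciprocals:

       x0 = x0 * d0                  d0 = (f16, f17), x0 = (f0, f1)
       x1 = (x1 - t * x0) * d1       t  = (f18, f19)  off-diagonal entry,
                                     d1 = (f20, f21), x1 = (f2, f3)

   Each complex product is decomposed into fmadd/fmsub pairs,
       re = ar*xr - ai*xi,   im = ar*xi + ai*xr,
   with f4 = f17*f1 and f5 = f17*f0 (computed just above) holding the
   cross terms of the first product.  The CONJ branch flips the signs of
   the imaginary contributions, i.e. it multiplies by the conjugated
   coefficients instead.
*/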
#ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) #ifndef LN addi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt LL(31) .align 4 LL(49): #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/ztrsm_kernel_LT.S000066400000000000000000001153211313527062700210250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA_R 296(SP) #define ALPHA_I 304(SP) #define FZERO 312(SP) #else #define STACKSIZE 256 #define ALPHA_R 224(SP) #define ALPHA_I 232(SP) #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #define OFFSET r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #define AORIG r21 #define TEMP r22 #define KK r23 #define I r24 #define J r25 #define AO r26 #define BO r27 #define CO1 r28 #define CO2 r29 #define PREA r30 #define PREC r31 #define PREB PREA #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) #endif stw r0, FZERO #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif 
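/*
   Depending on the OS / ABI combination selected above, some of the
   trailing arguments (LDC here, OFFSET in the block just below) are not
   passed in registers and are reloaded from the caller's frame at
   FRAMESLOT(n) + STACKSIZE(SP).  Presumably this is because the complex
   alpha occupies parameter slots ahead of them; the slot numbers differ
   between the 32-bit and 64-bit conventions, hence the nested
   conditionals.
*/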
#endif #endif #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif slwi LDC, LDC, ZBASE_SHIFT #ifdef LN mullw r0, M, K slwi r0, r0, ZBASE_SHIFT add A, A, r0 slwi r0, M, ZBASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, ZBASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) #ifndef PREFETCHTEST #ifdef PPC970 li PREC, 4 * SIZE #endif #ifdef POWER4 li PREC, 4 * SIZE /* is 12 best? */ #endif #ifdef POWER5 li PREC, 4 * SIZE /* is 12 best? */ #endif #else #ifdef linux #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else ld PREA, FRAMESLOT(3) + STACKSIZE(SP) ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld PREA, FRAMESLOT(3) + STACKSIZE(SP) ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #else #ifdef DOUBLE lwz PREA, FRAMESLOT(4) + STACKSIZE(SP) lwz PREC, FRAMESLOT(5) + STACKSIZE(SP) #else lwz PREA, FRAMESLOT(3) + STACKSIZE(SP) lwz PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #endif #endif #ifndef PREFETCHTEST #ifdef PPC970 #ifdef ALLOC_HUGETLB li PREA, (16 * 5 * SIZE + 16) #else li PREA, (16 * 9 * SIZE + 16) #endif #endif #ifdef POWER4 #ifdef ALLOC_HUGETLB li PREA, (16 * 1 * SIZE + 16) #else li PREA, (16 * 2 * SIZE + 16) #endif #endif #ifdef POWER5 #ifdef ALLOC_HUGETLB li PREA, (16 * 7 * SIZE | 1) li PREB, (16 * 7 * SIZE | 3) #else li PREA, (16 * 12 * SIZE | 1) li PREB, (16 * 6 * SIZE | 3) #endif #endif #endif srawi. J, N, 1 ble LL(30) .align 4 LL(10): #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif srawi. I, M, 1 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif ble LL(20) .align 4 LL(11): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 dcbt CO1, PREC dcbt CO2, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 dcbt CO1, PREC dcbt CO2, PREC srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(15) .align 4 LL(12): fmadd f0, f16, f20, f0 fmadd f5, f17, f21, f5 fmadd f10, f18, f22, f10 fmadd f15, f19, f23, f15 LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) LFD f31, 7 * SIZE(BO) fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 fmadd f4, f16, f21, f4 LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) fmadd f6, f18, f21, f6 fmadd f7, f19, f21, f7 fmadd f8, f16, f22, f8 fmadd f9, f17, f22, f9 fmadd f11, f19, f22, f11 fmadd f12, f16, f23, f12 fmadd f13, f17, f23, f13 fmadd f14, f18, f23, f14 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) fmadd f0, f24, f28, f0 fmadd f5, f25, f29, f5 fmadd f10, f26, f30, f10 fmadd f15, f27, f31, f15 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) fmadd f1, f25, f28, f1 fmadd f2, f26, f28, f2 fmadd f3, f27, f28, f3 fmadd f4, f24, f29, f4 fmadd f6, f26, f29, f6 fmadd f7, f27, f29, f7 fmadd f8, f24, f30, f8 fmadd f9, f25, f30, f9 fmadd f11, f27, f30, f11 fmadd f12, f24, f31, f12 fmadd f13, f25, f31, f13 fmadd f14, f26, f31, f14 LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f5, f17, f21, f5 fmadd f10, f18, f22, f10 fmadd f15, f19, f23, f15 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 fmadd f4, f16, f21, f4 fmadd f6, f18, f21, f6 fmadd f7, f19, f21, f7 fmadd f8, f16, f22, f8 fmadd f9, f17, f22, f9 fmadd f11, f19, f22, f11 fmadd f12, f16, f23, f12 fmadd f13, f17, f23, f13 fmadd f14, f18, f23, f14 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) fmadd f0, f24, f28, f0 fmadd f5, f25, f29, f5 fmadd f10, f26, f30, f10 fmadd f15, f27, f31, f15 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) fmadd f1, f25, f28, f1 fmadd f2, f26, f28, f2 fmadd f3, f27, f28, f3 fmadd f4, f24, f29, f4 fmadd f6, f26, f29, f6 fmadd f7, f27, f29, f7 fmadd f8, f24, f30, f8 fmadd f9, f25, f30, f9 fmadd f11, f27, f30, f11 fmadd f12, f24, f31, f12 fmadd f13, f25, f31, f13 fmadd f14, f26, f31, f14 addi AO, AO, 16 * SIZE addi BO, BO, 16 * SIZE #ifdef PPC970 #ifndef ALLOC_HUGETLB DCBT(AO, PREA) #endif DCBT(BO, PREB) #endif #ifdef POWER4 #ifndef ALLOC_HUGETLB DCBT(AO, PREA) #endif DCBT(BO, PREB) #endif #ifdef POWER5 DCBT(AO, PREA) DCBT(BO, PREB) #endif bdnz LL(12) .align 4 LL(15): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(KERNEL_MainFinish) .align 4 LL(16): fmadd f0, f16, f20, f0 fmadd f5, f17, f21, f5 fmadd f10, f18, f22, f10 fmadd f15, f19, f23, f15 fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 fmadd f4, f16, f21, f4 fmadd f6, f18, f21, f6 fmadd f7, f19, f21, f7 fmadd f8, f16, f22, f8 fmadd f9, f17, f22, f9 fmadd f11, f19, f22, f11 fmadd f12, f16, f23, f12 fmadd f13, f17, f23, f13 fmadd f14, f18, f23, f14 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(16) .align 4 LL(KERNEL_MainFinish): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 FSUB f8, f8, f13 FADD f9, f9, f12 FSUB f10, f10, f15 FADD f11, f11, f14 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 FADD f8, f8, f13 FSUB f9, f12, f9 FADD f10, f10, f15 FSUB f11, f14, f11 #endif #if defined(LN) || defined(RT) subi r0, KK, 2 slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f8, f18, f8 FSUB f9, f19, f9 FSUB f2, f20, f2 FSUB f3, f21, f3 FSUB f10, f22, f10 FSUB f11, f23, f11 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f8, f20, f8 FSUB f9, f21, f9 FSUB f10, f22, f10 FSUB f11, f23, f11 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 FSUB f8, f20, f8 FADD f9, f21, f9 FSUB f10, f22, f10 FADD f11, f23, f11 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FMADD f8, f19, f11, f8 FNMSUB f9, f19, f10, f9 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FNMSUB f8, f18, f10, f8 FNMSUB f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f8, f20, f8, f12 FMADD f9, f20, f9, f13 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FMSUB f8, f19, f11, f8 FNMADD f9, f19, f10, f9 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FNMADD f8, f18, f10, f8 FNMADD f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f8, f20, f8, f12 FMSUB f9, f20, f9, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f9 FMUL f13, f17, f8 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FMADD f10, f19, f9, f10 FNMSUB f11, f19, f8, f11 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, 
f3 FNMSUB f10, f18, f8, f10 FNMSUB f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 FMSUB f10, f20, f10, f12 FMADD f11, f20, f11, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FMSUB f10, f19, f9, f10 FNMADD f11, f19, f8, f11 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FNMADD f10, f18, f8, f10 FNMADD f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 FMADD f10, f20, f10, f12 FMSUB f11, f20, f11, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f8, f19, f1, f8 FNMSUB f9, f19, f0, f9 FMADD f10, f19, f3, f10 FNMSUB f11, f19, f2, f11 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMSUB f8, f20, f8, f4 FMADD f9, f20, f9, f5 FMSUB f10, f20, f10, f6 FMADD f11, f20, f11, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f8, f19, f1, f8 FNMADD f9, f19, f0, f9 FMSUB f10, f19, f3, f10 FNMADD f11, f19, f2, f11 FNMADD f8, f18, f0, f8 FNMADD f9, f18, f1, f9 FNMADD f10, f18, f2, f10 FNMADD f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMADD f8, f20, f8, f4 FMSUB f9, f20, f9, f5 FMADD f10, f20, f10, f6 FMSUB f11, f20, f11, f7 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f9 FMUL f13, f17, f8 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f9, f0 FNMSUB f1, f19, f8, f1 FMADD f2, f19, f11, f2 FNMSUB f3, f19, f10, f3 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f9, f0 FNMADD f1, f19, f8, f1 FMSUB f2, f19, f11, f2 FNMADD f3, f19, f10, f3 FNMADD f0, f18, f8, f0 FNMADD f1, f18, f9, f1 FNMADD f2, f18, f10, f2 FNMADD f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f9, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f3, 5 * SIZE(BO) STFD f10, 6 * SIZE(BO) STFD f11, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f10, 6 * SIZE(AO) STFD f11, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f8, 0 * SIZE(CO2) STFD f9, 1 * SIZE(CO2) STFD f10, 2 * 
SIZE(CO2) STFD f11, 3 * SIZE(CO2) #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt LL(11) .align 4 LL(20): andi. I, M, 1 ble LL(29) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(25) .align 4 LL(22): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi BO, BO, 16 * SIZE addi AO, AO, 8 * SIZE bdnz LL(22) .align 4 LL(25): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(27) .align 4 LL(26): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE bdnz LL(26) .align 4 LL(27): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else #if defined(LN) || defined(LT) FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + ZBASE_SHIFT slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f3 FMUL f13, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f12 FMADD f3, f20, f3, f13 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f12 FMSUB f3, f20, f3, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f3 FMUL f13, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f3 FMUL f13, f17, f2 #ifndef CONJ FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * 
SIZE(CO2) STFD f3, 1 * SIZE(CO2) #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 LL(29): #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif addic. J, J, -1 bgt LL(10) .align 4 LL(30): andi. J, N, 1 ble LL(999) #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif srawi. I, M, 1 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, C, LDC #endif ble LL(40) .align 4 LL(31): #if defined(LT) || defined(RN) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 dcbt CO1, PREC srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(35) .align 4 LL(32): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 8 * SIZE(AO) LFD f21, 9 * SIZE(AO) LFD f22, 10 * SIZE(AO) LFD f23, 11 * SIZE(AO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) LFD f16, 4 * SIZE(BO) LFD f17, 5 * SIZE(BO) LFD f18, 6 * SIZE(BO) LFD f19, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 16 * SIZE(AO) LFD f21, 17 * SIZE(AO) LFD f22, 18 * SIZE(AO) LFD f23, 19 * SIZE(AO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 20 * SIZE(AO) LFD f25, 21 * SIZE(AO) LFD f26, 22 * SIZE(AO) LFD f27, 23 * SIZE(AO) LFD f16, 8 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 10 * SIZE(BO) LFD f19, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE dcbt PREA, AO dcbt PREA, BO bdnz LL(32) .align 4 LL(35): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(37) .align 4 LL(36): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f16, 2 * SIZE(BO) LFD f17, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(36) .align 4 LL(37): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + ZBASE_SHIFT slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) #ifndef LN addi CO1, CO1, 4 * 
SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt LL(31) .align 4 LL(40): andi. I, M, 1 ble LL(49) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(45) .align 4 LL(42): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) fmadd f4, f18, f22, f4 fmadd f5, f19, f23, f5 fmadd f6, f19, f22, f6 fmadd f7, f18, f23, f7 LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) fmadd f4, f18, f22, f4 fmadd f5, f19, f23, f5 fmadd f6, f19, f22, f6 fmadd f7, f18, f23, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(42) .align 4 LL(45): fadd f0, f0, f4 fadd f1, f1, f5 fadd f2, f2, f6 fadd f3, f3, f7 #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR,r0 ble LL(47) .align 4 LL(46): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 2 * SIZE bdnz LL(46) .align 4 LL(47): #ifndef CONJ FSUB f0, f0, f1 FADD f1, f2, f3 #else FADD f0, f0, f1 FSUB f1, f3, f2 #endif #if defined(LN) || defined(RT) subi r0, KK, 1 slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 #else FSUB f0, f16, f0 FADD f1, f17, f1 #endif #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 LL(49): #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/ztrsm_kernel_RT.S000066400000000000000000001153221313527062700210340ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA_R 296(SP) #define ALPHA_I 304(SP) #define FZERO 312(SP) #else #define STACKSIZE 256 #define ALPHA_R 224(SP) #define ALPHA_I 232(SP) #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #define OFFSET r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #define AORIG r21 #define TEMP r22 #define KK r23 #define I r24 #define J r25 #define AO r26 #define BO r27 #define CO1 r28 #define CO2 r29 #define PREA r30 #define PREC r31 #define PREB PREA #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) #endif stw r0, FZERO #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, 
FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif slwi LDC, LDC, ZBASE_SHIFT #ifdef LN mullw r0, M, K slwi r0, r0, ZBASE_SHIFT add A, A, r0 slwi r0, M, ZBASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, ZBASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) #ifndef PREFETCHTEST #ifdef PPC970 li PREC, 4 * SIZE #endif #ifdef POWER4 li PREC, 4 * SIZE /* is 12 best? */ #endif #ifdef POWER5 li PREC, 4 * SIZE /* is 12 best? */ #endif #else #ifdef linux #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else ld PREA, FRAMESLOT(3) + STACKSIZE(SP) ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld PREA, FRAMESLOT(3) + STACKSIZE(SP) ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #else #ifdef DOUBLE lwz PREA, FRAMESLOT(4) + STACKSIZE(SP) lwz PREC, FRAMESLOT(5) + STACKSIZE(SP) #else lwz PREA, FRAMESLOT(3) + STACKSIZE(SP) lwz PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #endif #endif #ifndef PREFETCHTEST #ifdef PPC970 #ifdef ALLOC_HUGETLB li PREA, (16 * 5 * SIZE + 16) #else li PREA, (16 * 9 * SIZE + 16) #endif #endif #ifdef POWER4 #ifdef ALLOC_HUGETLB li PREA, (16 * 1 * SIZE + 16) #else li PREA, (16 * 2 * SIZE + 16) #endif #endif #ifdef POWER5 #ifdef ALLOC_HUGETLB li PREA, (16 * 7 * SIZE | 1) li PREB, (16 * 7 * SIZE | 3) #else li PREA, (16 * 12 * SIZE | 1) li PREB, (16 * 6 * SIZE | 3) #endif #endif #endif andi. J, N, 1 ble LL(30) #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif srawi. I, M, 1 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, C, LDC #endif ble LL(40) .align 4 LL(31): #if defined(LT) || defined(RN) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 dcbt CO1, PREC srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(35) .align 4 LL(32): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 8 * SIZE(AO) LFD f21, 9 * SIZE(AO) LFD f22, 10 * SIZE(AO) LFD f23, 11 * SIZE(AO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) LFD f16, 4 * SIZE(BO) LFD f17, 5 * SIZE(BO) LFD f18, 6 * SIZE(BO) LFD f19, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 16 * SIZE(AO) LFD f21, 17 * SIZE(AO) LFD f22, 18 * SIZE(AO) LFD f23, 19 * SIZE(AO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 20 * SIZE(AO) LFD f25, 21 * SIZE(AO) LFD f26, 22 * SIZE(AO) LFD f27, 23 * SIZE(AO) LFD f16, 8 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 10 * SIZE(BO) LFD f19, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE dcbt PREA, AO dcbt PREA, BO bdnz LL(32) .align 4 LL(35): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(37) .align 4 LL(36): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f16, 2 * SIZE(BO) LFD f17, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(36) .align 4 LL(37): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + ZBASE_SHIFT slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 
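/* [annotation added by the editor; not part of the upstream file]        */
/* The FMUL pair above and the FMSUB/FMADD pair that follows form one     */
/* complex multiply: x_re := f16*x_re - f17*x_im and                      */
/* x_im := f16*x_im + f17*x_re, i.e. x := (f16 + i*f17) * x, where        */
/* (f16, f17) is the diagonal element of the packed triangular block.     */
/* Assuming the usual OpenBLAS convention that the packing step           */
/* pre-inverts the diagonal, this is the divide step of the solve done    */
/* as a multiply; the CONJ build swaps FMSUB/FMADD so that the            */
/* conjugate of (f16 + i*f17) is applied instead.                         */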
#ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) #ifndef LN addi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt LL(31) .align 4 LL(40): andi. I, M, 1 ble LL(49) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(45) .align 4 LL(42): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) fmadd f4, f18, f22, f4 fmadd f5, f19, f23, f5 fmadd f6, f19, f22, f6 fmadd f7, f18, f23, f7 LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) fmadd f4, f18, f22, f4 fmadd f5, f19, f23, f5 fmadd f6, f19, f22, f6 fmadd f7, f18, f23, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(42) .align 4 LL(45): fadd f0, f0, f4 fadd f1, f1, f5 fadd f2, f2, f6 fadd f3, f3, f7 #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR,r0 ble LL(47) .align 4 LL(46): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 2 * SIZE bdnz LL(46) .align 4 LL(47): #ifndef CONJ FSUB f0, f0, f1 FADD f1, f2, f3 #else FADD f0, f0, f1 FSUB f1, f3, f2 #endif #if defined(LN) || defined(RT) subi r0, KK, 1 slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 #else FSUB f0, f16, f0 FADD f1, f17, f1 #endif #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 LL(49): #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 LL(30): srawi. J, N, 1 ble LL(999) .align 4 LL(10): #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif srawi. 
I, M, 1 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif ble LL(20) .align 4 LL(11): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 dcbt CO1, PREC dcbt CO2, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 dcbt CO1, PREC dcbt CO2, PREC srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(15) .align 4 LL(12): fmadd f0, f16, f20, f0 fmadd f5, f17, f21, f5 fmadd f10, f18, f22, f10 fmadd f15, f19, f23, f15 LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) LFD f31, 7 * SIZE(BO) fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 fmadd f4, f16, f21, f4 LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) fmadd f6, f18, f21, f6 fmadd f7, f19, f21, f7 fmadd f8, f16, f22, f8 fmadd f9, f17, f22, f9 fmadd f11, f19, f22, f11 fmadd f12, f16, f23, f12 fmadd f13, f17, f23, f13 fmadd f14, f18, f23, f14 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) fmadd f0, f24, f28, f0 fmadd f5, f25, f29, f5 fmadd f10, f26, f30, f10 fmadd f15, f27, f31, f15 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) fmadd f1, f25, f28, f1 fmadd f2, f26, f28, f2 fmadd f3, f27, f28, f3 fmadd f4, f24, f29, f4 fmadd f6, f26, f29, f6 fmadd f7, f27, f29, f7 fmadd f8, f24, f30, f8 fmadd f9, f25, f30, f9 fmadd f11, f27, f30, f11 fmadd f12, f24, f31, f12 fmadd f13, f25, f31, f13 fmadd f14, f26, f31, f14 LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f5, f17, f21, f5 fmadd f10, f18, f22, f10 fmadd f15, f19, f23, f15 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 fmadd f4, f16, f21, f4 fmadd f6, f18, f21, f6 fmadd f7, f19, f21, f7 fmadd f8, f16, f22, f8 fmadd f9, f17, f22, f9 fmadd f11, f19, f22, f11 fmadd f12, f16, f23, f12 fmadd f13, f17, f23, f13 fmadd f14, f18, f23, f14 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) fmadd f0, f24, f28, f0 fmadd f5, f25, f29, f5 fmadd f10, f26, f30, f10 fmadd f15, f27, f31, f15 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) fmadd f1, f25, f28, f1 fmadd f2, f26, f28, f2 fmadd f3, f27, f28, f3 fmadd f4, f24, f29, f4 fmadd f6, f26, f29, f6 fmadd f7, f27, f29, f7 fmadd f8, f24, f30, f8 fmadd f9, f25, f30, f9 fmadd f11, f27, f30, f11 fmadd f12, f24, f31, f12 fmadd f13, f25, f31, f13 fmadd f14, f26, f31, f14 addi AO, AO, 16 * SIZE addi BO, BO, 16 * SIZE #ifdef PPC970 #ifndef ALLOC_HUGETLB DCBT(AO, PREA) #endif 
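/* [annotation added by the editor; not part of the upstream file]        */
/* DCBT is assumed to be the usual OpenBLAS wrapper around the PowerPC    */
/* dcbt (data cache block touch) prefetch hint.  PREC, PREA and PREB      */
/* (PREB is #defined to PREA in this file) hold prefetch distances set    */
/* in the prologue, with separate values for PPC970, POWER4 and POWER5    */
/* and for ALLOC_HUGETLB builds, so the touch below prefetches ahead in   */
/* the packed B panel while the floating-point pipeline keeps working.    */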
DCBT(BO, PREB) #endif #ifdef POWER4 #ifndef ALLOC_HUGETLB DCBT(AO, PREA) #endif DCBT(BO, PREB) #endif #ifdef POWER5 DCBT(AO, PREA) DCBT(BO, PREB) #endif bdnz LL(12) .align 4 LL(15): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(KERNEL_MainFinish) .align 4 LL(16): fmadd f0, f16, f20, f0 fmadd f5, f17, f21, f5 fmadd f10, f18, f22, f10 fmadd f15, f19, f23, f15 fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 fmadd f4, f16, f21, f4 fmadd f6, f18, f21, f6 fmadd f7, f19, f21, f7 fmadd f8, f16, f22, f8 fmadd f9, f17, f22, f9 fmadd f11, f19, f22, f11 fmadd f12, f16, f23, f12 fmadd f13, f17, f23, f13 fmadd f14, f18, f23, f14 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(16) .align 4 LL(KERNEL_MainFinish): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 FSUB f8, f8, f13 FADD f9, f9, f12 FSUB f10, f10, f15 FADD f11, f11, f14 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 FADD f8, f8, f13 FSUB f9, f12, f9 FADD f10, f10, f15 FSUB f11, f14, f11 #endif #if defined(LN) || defined(RT) subi r0, KK, 2 slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f8, f18, f8 FSUB f9, f19, f9 FSUB f2, f20, f2 FSUB f3, f21, f3 FSUB f10, f22, f10 FSUB f11, f23, f11 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f8, f20, f8 FSUB f9, f21, f9 FSUB f10, f22, f10 FSUB f11, f23, f11 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 FSUB f8, f20, f8 FADD f9, f21, f9 FSUB f10, f22, f10 FADD f11, f23, f11 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FMADD f8, f19, f11, f8 FNMSUB f9, f19, f10, f9 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FNMSUB f8, f18, f10, f8 FNMSUB f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f8, f20, f8, f12 FMADD f9, f20, f9, f13 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FMSUB f8, f19, f11, f8 FNMADD f9, f19, f10, f9 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FNMADD f8, f18, f10, f8 FNMADD f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f8, f20, f8, f12 FMSUB f9, f20, f9, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f9 FMUL f13, f17, f8 
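/* [annotation added by the editor; not part of the upstream file]        */
/* At this point the 2x2 complex micro-tile holds the first               */
/* right-hand-side column in f0..f3 (row 1 re/im, row 2 re/im) and the    */
/* second column in f8..f11.  The LT back-substitution loads the packed   */
/* 2x2 triangular factor into f16..f21 and applies it to both columns     */
/* at once, which is why every multiply in this solve appears twice,      */
/* once for f0..f3 and once for f8..f11.                                  */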
#ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FMADD f10, f19, f9, f10 FNMSUB f11, f19, f8, f11 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FNMSUB f10, f18, f8, f10 FNMSUB f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 FMSUB f10, f20, f10, f12 FMADD f11, f20, f11, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FMSUB f10, f19, f9, f10 FNMADD f11, f19, f8, f11 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FNMADD f10, f18, f8, f10 FNMADD f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 FMADD f10, f20, f10, f12 FMSUB f11, f20, f11, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f8, f19, f1, f8 FNMSUB f9, f19, f0, f9 FMADD f10, f19, f3, f10 FNMSUB f11, f19, f2, f11 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMSUB f8, f20, f8, f4 FMADD f9, f20, f9, f5 FMSUB f10, f20, f10, f6 FMADD f11, f20, f11, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f8, f19, f1, f8 FNMADD f9, f19, f0, f9 FMSUB f10, f19, f3, f10 FNMADD f11, f19, f2, f11 FNMADD f8, f18, f0, f8 FNMADD f9, f18, f1, f9 FNMADD f10, f18, f2, f10 FNMADD f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMADD f8, f20, f8, f4 FMSUB f9, f20, f9, f5 FMADD f10, f20, f10, f6 FMSUB f11, f20, f11, f7 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f9 FMUL f13, f17, f8 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f9, f0 FNMSUB f1, f19, f8, f1 FMADD f2, f19, f11, f2 FNMSUB f3, f19, f10, f3 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f9, f0 FNMADD f1, f19, f8, f1 FMSUB f2, f19, f11, f2 FNMADD f3, f19, f10, f3 FNMADD f0, f18, f8, f0 FNMADD f1, f18, f9, f1 FNMADD f2, f18, f10, f2 FNMADD f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f9, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f3, 5 * SIZE(BO) STFD f10, 6 * SIZE(BO) STFD f11, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f8, 4 * 
SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f10, 6 * SIZE(AO) STFD f11, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f8, 0 * SIZE(CO2) STFD f9, 1 * SIZE(CO2) STFD f10, 2 * SIZE(CO2) STFD f11, 3 * SIZE(CO2) #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt LL(11) .align 4 LL(20): andi. I, M, 1 ble LL(29) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(25) .align 4 LL(22): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi BO, BO, 16 * SIZE addi AO, AO, 8 * SIZE bdnz LL(22) .align 4 LL(25): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(27) .align 4 LL(26): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE bdnz LL(26) .align 4 LL(27): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else #if defined(LN) || defined(LT) FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + ZBASE_SHIFT slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f3 FMUL f13, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f12 FMADD f3, f20, f3, f13 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f12 FMSUB f3, f20, f3, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f3 FMUL f13, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f3 FMUL f13, f17, f2 #ifndef CONJ FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * 
SIZE(CO2) STFD f3, 1 * SIZE(CO2) #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 LL(29): #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif addic. J, J, -1 bgt LL(10) .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/ztrsm_kernel_cell_LN.S000066400000000000000000001134521313527062700220210ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA_R 296(SP) #define ALPHA_I 304(SP) #define FZERO 312(SP) #else #define STACKSIZE 256 #define ALPHA_R 224(SP) #define ALPHA_I 232(SP) #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #define OFFSET r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #define AORIG r21 #define TEMP r22 #define KK r23 #define I r24 #define J r25 #define AO r26 #define BO r27 #define CO1 r28 #define CO2 r29 #define PREA r30 #define PREC r31 #define PREB PREA #ifndef NEEDPARAM #ifndef DOUBLE #include "cparam.h" #else #include "zparam.h" #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) #endif stw r0, FZERO #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif slwi LDC, LDC, ZBASE_SHIFT #ifdef LN mullw r0, M, K slwi r0, r0, ZBASE_SHIFT add A, A, r0 slwi r0, M, ZBASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, ZBASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) li PREC, -4 * SIZE li PREA, 16 * 12 * SIZE srawi. J, N, 1 ble LL(30) .align 4 LL(10): #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif LL(20): andi. 
I, M, 1 ble LL(09) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(25) .align 4 LL(22): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi BO, BO, 16 * SIZE addi AO, AO, 8 * SIZE bdnz LL(22) .align 4 LL(25): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(27) .align 4 LL(26): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE bdnz LL(26) .align 4 LL(27): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + ZBASE_SHIFT slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f20, f2 FADD f3, f21, f3 #endif #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f3 FMUL f13, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f12 FMADD f3, f20, f3, f13 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f12 FMSUB f3, f20, f3, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f3 FMUL f13, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f9 FMUL f13, f17, f8 #ifndef CONJ FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 
* SIZE(CO2) #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 LL(09): srawi. I, M, 1 ble LL(29) .align 4 LL(11): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 dcbtst CO1, PREC dcbtst CO2, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 dcbtst CO1, PREC dcbtst CO2, PREC srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(15) .align 4 #define NOP1 mr r18, r18 #define NOP2 mr r19, r19 LL(12): FMADD f0, f16, f20, f0 dcbt AO, PREA FMADD f4, f16, f21, f4 dcbt BO, PREB FMADD f8, f16, f22, f8 LFD f31, 7 * SIZE(BO) FMADD f12, f16, f23, f12 LFD f27, 7 * SIZE(AO) FMADD f1, f17, f20, f1 LFD f16, 8 * SIZE(AO) FMADD f5, f17, f21, f5 NOP2 FMADD f9, f17, f22, f9 NOP1 FMADD f13, f17, f23, f13 LFD f17, 9 * SIZE(AO) FMADD f2, f18, f20, f2 NOP1 FMADD f6, f18, f21, f6 NOP2 FMADD f10, f18, f22, f10 NOP1 FMADD f14, f18, f23, f14 LFD f18, 10 * SIZE(AO) FMADD f3, f19, f20, f3 LFD f20, 8 * SIZE(BO) FMADD f7, f19, f21, f7 LFD f21, 9 * SIZE(BO) FMADD f11, f19, f22, f11 LFD f22, 10 * SIZE(BO) FMADD f15, f19, f23, f15 LFD f19, 11 * SIZE(AO) FMADD f0, f24, f28, f0 LFD f23, 11 * SIZE(BO) FMADD f4, f24, f29, f4 NOP2 FMADD f8, f24, f30, f8 NOP1 FMADD f12, f24, f31, f12 LFD f24, 12 * SIZE(AO) FMADD f1, f25, f28, f1 NOP1 FMADD f5, f25, f29, f5 NOP2 FMADD f9, f25, f30, f9 NOP1 FMADD f13, f25, f31, f13 LFD f25, 13 * SIZE(AO) FMADD f2, f26, f28, f2 NOP1 FMADD f6, f26, f29, f6 NOP2 FMADD f10, f26, f30, f10 NOP1 FMADD f14, f26, f31, f14 LFD f26, 14 * SIZE(AO) FMADD f3, f27, f28, f3 LFD f28, 12 * SIZE(BO) FMADD f7, f27, f29, f7 LFD f29, 13 * SIZE(BO) FMADD f11, f27, f30, f11 LFD f30, 14 * SIZE(BO) FMADD f15, f27, f31, f15 LFD f27, 15 * SIZE(AO) FMADD f0, f16, f20, f0 LFD f31, 15 * SIZE(BO) FMADD f4, f16, f21, f4 NOP2 FMADD f8, f16, f22, f8 NOP1 FMADD f12, f16, f23, f12 LFD f16, 16 * SIZE(AO) FMADD f1, f17, f20, f1 NOP1 FMADD f5, f17, f21, f5 NOP2 FMADD f9, f17, f22, f9 NOP1 FMADD f13, f17, f23, f13 LFD f17, 17 * SIZE(AO) FMADD f2, f18, f20, f2 NOP1 FMADD f6, f18, f21, f6 NOP2 FMADD f10, f18, f22, f10 NOP1 FMADD f14, f18, f23, f14 LFD f18, 18 * SIZE(AO) FMADD f3, f19, f20, f3 LFD f20, 16 * SIZE(BO) FMADD f7, f19, f21, f7 LFD f21, 17 * SIZE(BO) FMADD f11, f19, f22, f11 LFD f22, 18 * SIZE(BO) FMADD f15, f19, f23, f15 LFD f19, 19 * SIZE(AO) FMADD f0, f24, f28, f0 LFD f23, 
19 * SIZE(BO) FMADD f4, f24, f29, f4 NOP2 FMADD f8, f24, f30, f8 NOP1 FMADD f12, f24, f31, f12 LFD f24, 20 * SIZE(AO) FMADD f1, f25, f28, f1 NOP1 FMADD f5, f25, f29, f5 NOP2 FMADD f9, f25, f30, f9 NOP1 FMADD f13, f25, f31, f13 LFD f25, 21 * SIZE(AO) FMADD f2, f26, f28, f2 NOP1 FMADD f6, f26, f29, f6 NOP2 FMADD f10, f26, f30, f10 NOP1 FMADD f14, f26, f31, f14 LFD f26, 22 * SIZE(AO) FMADD f3, f27, f28, f3 LFD f28, 20 * SIZE(BO) FMADD f7, f27, f29, f7 LFD f29, 21 * SIZE(BO) FMADD f11, f27, f30, f11 LFD f30, 22 * SIZE(BO) FMADD f15, f27, f31, f15 addi AO, AO, 16 * SIZE addi BO, BO, 16 * SIZE bdnz LL(12) .align 4 LL(15): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(KERNEL_MainFinish) .align 4 LL(16): fmadd f0, f16, f20, f0 fmadd f5, f17, f21, f5 fmadd f10, f18, f22, f10 fmadd f15, f19, f23, f15 fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 fmadd f4, f16, f21, f4 fmadd f6, f18, f21, f6 fmadd f7, f19, f21, f7 fmadd f8, f16, f22, f8 fmadd f9, f17, f22, f9 fmadd f11, f19, f22, f11 fmadd f12, f16, f23, f12 fmadd f13, f17, f23, f13 fmadd f14, f18, f23, f14 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(16) .align 4 LL(KERNEL_MainFinish): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 FSUB f8, f8, f13 FADD f9, f9, f12 FSUB f10, f10, f15 FADD f11, f11, f14 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 FADD f8, f8, f13 FSUB f9, f12, f9 FADD f10, f10, f15 FSUB f11, f14, f11 #endif #if defined(LN) || defined(RT) subi r0, KK, 2 slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f8, f18, f8 FSUB f9, f19, f9 FSUB f2, f20, f2 FSUB f3, f21, f3 FSUB f10, f22, f10 FSUB f11, f23, f11 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f8, f20, f8 FSUB f9, f21, f9 FSUB f10, f22, f10 FSUB f11, f23, f11 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 FSUB f8, f20, f8 FADD f9, f21, f9 FSUB f10, f22, f10 FADD f11, f23, f11 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FMADD f8, f19, f11, f8 FNMSUB f9, f19, f10, f9 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FNMSUB f8, f18, f10, f8 FNMSUB f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f8, f20, f8, f12 FMADD f9, f20, f9, f13 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FMSUB f8, f19, f11, f8 FNMADD f9, f19, f10, f9 FNMADD f0, f18, f2, f0 FNMADD f1, f18, 
f3, f1 FNMADD f8, f18, f10, f8 FNMADD f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f8, f20, f8, f12 FMSUB f9, f20, f9, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f9 FMUL f13, f17, f8 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FMADD f10, f19, f9, f10 FNMSUB f11, f19, f8, f11 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FNMSUB f10, f18, f8, f10 FNMSUB f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 FMSUB f10, f20, f10, f12 FMADD f11, f20, f11, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FMSUB f10, f19, f9, f10 FNMADD f11, f19, f8, f11 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FNMADD f10, f18, f8, f10 FNMADD f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 FMADD f10, f20, f10, f12 FMSUB f11, f20, f11, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f8, f19, f1, f8 FNMSUB f9, f19, f0, f9 FMADD f10, f19, f3, f10 FNMSUB f11, f19, f2, f11 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMSUB f8, f20, f8, f4 FMADD f9, f20, f9, f5 FMSUB f10, f20, f10, f6 FMADD f11, f20, f11, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f8, f19, f1, f8 FNMADD f9, f19, f0, f9 FMSUB f10, f19, f3, f10 FNMADD f11, f19, f2, f11 FNMADD f8, f18, f0, f8 FNMADD f9, f18, f1, f9 FNMADD f10, f18, f2, f10 FNMADD f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMADD f8, f20, f8, f4 FMSUB f9, f20, f9, f5 FMADD f10, f20, f10, f6 FMSUB f11, f20, f11, f7 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f9 FMUL f13, f17, f8 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f9, f0 FNMSUB f1, f19, f8, f1 FMADD f2, f19, f11, f2 FNMSUB f3, f19, f10, f3 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f9, f0 FNMADD f1, f19, f8, f1 FMSUB f2, f19, f11, f2 FNMADD f3, f19, f10, f3 FNMADD f0, f18, f8, f0 FNMADD f1, f18, f9, f1 FNMADD f2, f18, f10, f2 FNMADD f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, 
f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f9, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f3, 5 * SIZE(BO) STFD f10, 6 * SIZE(BO) STFD f11, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f10, 6 * SIZE(AO) STFD f11, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f8, 0 * SIZE(CO2) STFD f9, 1 * SIZE(CO2) STFD f10, 2 * SIZE(CO2) STFD f11, 3 * SIZE(CO2) #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt LL(11) .align 4 LL(29): #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif addic. J, J, -1 bgt LL(10) .align 4 LL(30): andi. J, N, 1 ble LL(999) #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, C, LDC #endif andi. I, M, 1 ble LL(40) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(45) .align 4 LL(42): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) fmadd f4, f18, f22, f4 fmadd f5, f19, f23, f5 fmadd f6, f19, f22, f6 fmadd f7, f18, f23, f7 LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) fmadd f4, f18, f22, f4 fmadd f5, f19, f23, f5 fmadd f6, f19, f22, f6 fmadd f7, f18, f23, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(42) .align 4 LL(45): fadd f0, f0, f4 fadd f1, f1, f5 fadd f2, f2, f6 fadd f3, f3, f7 #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR,r0 ble LL(47) .align 4 LL(46): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 2 * SIZE bdnz LL(46) .align 4 LL(47): #ifndef CONJ FSUB f0, f0, f1 FADD f1, f2, f3 #else FADD f0, f0, f1 FSUB f1, f3, f2 #endif #if defined(LN) || defined(RT) subi r0, KK, 1 slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 #else FSUB f0, f16, f0 FADD f1, f17, f1 #endif #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 LL(40): srawi. I, M, 1 ble LL(49) .align 4 LL(31): #if defined(LT) || defined(RN) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 dcbtst CO1, PREC srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(35) .align 4 LL(32): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 8 * SIZE(AO) LFD f21, 9 * SIZE(AO) LFD f22, 10 * SIZE(AO) LFD f23, 11 * SIZE(AO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) LFD f16, 4 * SIZE(BO) LFD f17, 5 * SIZE(BO) LFD f18, 6 * SIZE(BO) LFD f19, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 16 * SIZE(AO) LFD f21, 17 * SIZE(AO) LFD f22, 18 * SIZE(AO) LFD f23, 19 * SIZE(AO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 20 * SIZE(AO) LFD f25, 21 * SIZE(AO) LFD f26, 22 * SIZE(AO) LFD f27, 23 * SIZE(AO) LFD f16, 8 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 10 * SIZE(BO) LFD f19, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE dcbt PREA, AO dcbt PREA, BO bdnz LL(32) .align 4 LL(35): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(37) .align 4 LL(36): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f16, 2 * SIZE(BO) LFD f17, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(36) .align 4 LL(37): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + ZBASE_SHIFT slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 
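/* Added annotation (not in the original source): triangular-solve tail for this
   2x1 complex block, #ifdef LT path of LL(37).  (f16,f17) and (f20,f21) are the
   re/im parts of the two diagonal factors loaded from AO (OpenBLAS's trsm packing
   appears to store these pre-inverted, so the "divide" is just a complex multiply),
   and (f18,f19) is the off-diagonal entry.  Below, the FMSUB/FMADD pair forms
   ar*x_re - ai*x_im and ar*x_im + ai*x_re for the first unknown, the FNMSUB group
   subtracts a_off times that unknown from the second row, and the final pair scales
   the second row by (f20,f21); the #else branch applies the conjugated factors for
   the CONJ build. */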
#ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) #ifndef LN addi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt LL(31) .align 4 LL(49): #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/ztrsm_kernel_cell_LT.S000066400000000000000000001146011313527062700220240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA_R 296(SP) #define ALPHA_I 304(SP) #define FZERO 312(SP) #else #define STACKSIZE 256 #define ALPHA_R 224(SP) #define ALPHA_I 232(SP) #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #define OFFSET r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #define AORIG r21 #define TEMP r22 #define KK r23 #define I r24 #define J r25 #define AO r26 #define BO r27 #define CO1 r28 #define CO2 r29 #define PREA r30 #define PREC r31 #define PREB PREA #ifndef NEEDPARAM #ifndef DOUBLE #include "cparam.h" #else #include "zparam.h" #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) #endif stw r0, FZERO #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) 
+ STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif slwi LDC, LDC, ZBASE_SHIFT #ifdef LN mullw r0, M, K slwi r0, r0, ZBASE_SHIFT add A, A, r0 slwi r0, M, ZBASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, ZBASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) #ifndef PREFETCHTEST li PREC, 3 * SIZE li PREA, 16 * 12 * SIZE #else #ifdef linux #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) #else ld PREA, FRAMESLOT(3) + STACKSIZE(SP) ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld PREA, FRAMESLOT(3) + STACKSIZE(SP) ld PREC, FRAMESLOT(4) + STACKSIZE(SP) #else #ifdef DOUBLE lwz PREA, FRAMESLOT(4) + STACKSIZE(SP) lwz PREC, FRAMESLOT(5) + STACKSIZE(SP) #else lwz PREA, FRAMESLOT(3) + STACKSIZE(SP) lwz PREC, FRAMESLOT(4) + STACKSIZE(SP) #endif #endif #endif #endif srawi. J, N, 1 ble LL(30) .align 4 LL(10): #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif srawi. I, M, 1 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif ble LL(20) .align 4 LL(11): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f28, 4 * SIZE(B) LFD f29, 5 * SIZE(B) LFD f30, 6 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 dcbt CO1, PREC dcbt CO2, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 dcbt CO1, PREC dcbt CO2, PREC srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(15) .align 4 #define NOP1 mr r18, r18 #define NOP2 mr r19, r19 LL(12): FMADD f0, f16, f20, f0 dcbt AO, PREA FMADD f4, f16, f21, f4 dcbt BO, PREB FMADD f8, f16, f22, f8 LFD f31, 7 * SIZE(BO) FMADD f12, f16, f23, f12 LFD f27, 7 * SIZE(AO) FMADD f1, f17, f20, f1 LFD f16, 8 * SIZE(AO) FMADD f5, f17, f21, f5 NOP2 FMADD f9, f17, f22, f9 NOP1 FMADD f13, f17, f23, f13 LFD f17, 9 * SIZE(AO) FMADD f2, f18, f20, f2 NOP1 FMADD f6, f18, f21, f6 NOP2 FMADD f10, f18, f22, f10 NOP1 FMADD f14, f18, f23, f14 LFD f18, 10 * SIZE(AO) FMADD f3, f19, f20, f3 LFD f20, 8 * SIZE(BO) FMADD f7, f19, f21, f7 LFD f21, 9 * SIZE(BO) FMADD f11, f19, f22, f11 LFD f22, 10 * SIZE(BO) FMADD f15, f19, f23, f15 LFD f19, 11 * SIZE(AO) FMADD f0, f24, f28, f0 LFD f23, 11 * SIZE(BO) FMADD f4, f24, f29, f4 NOP2 FMADD f8, f24, f30, f8 NOP1 FMADD f12, f24, f31, f12 LFD f24, 12 * SIZE(AO) FMADD f1, f25, f28, f1 NOP1 FMADD f5, f25, f29, f5 NOP2 FMADD f9, f25, f30, f9 NOP1 FMADD f13, f25, f31, f13 LFD f25, 13 * SIZE(AO) FMADD f2, f26, f28, f2 NOP1 FMADD f6, f26, f29, f6 NOP2 FMADD f10, f26, f30, f10 NOP1 FMADD f14, f26, f31, f14 LFD f26, 14 * SIZE(AO) FMADD f3, f27, f28, f3 LFD f28, 12 * SIZE(BO) FMADD f7, f27, f29, f7 LFD f29, 13 * SIZE(BO) FMADD f11, f27, f30, f11 LFD f30, 14 * SIZE(BO) FMADD f15, f27, f31, f15 LFD f27, 15 * SIZE(AO) FMADD f0, f16, f20, f0 LFD f31, 15 * SIZE(BO) FMADD f4, f16, f21, f4 NOP2 FMADD f8, f16, f22, f8 NOP1 FMADD f12, f16, f23, f12 LFD f16, 16 * SIZE(AO) FMADD f1, f17, f20, f1 NOP1 FMADD f5, f17, f21, f5 NOP2 FMADD f9, f17, f22, f9 NOP1 FMADD f13, f17, f23, f13 LFD f17, 17 * SIZE(AO) FMADD f2, f18, f20, f2 NOP1 FMADD f6, f18, f21, f6 NOP2 FMADD f10, f18, f22, f10 NOP1 FMADD f14, f18, f23, f14 LFD f18, 18 * SIZE(AO) FMADD f3, f19, f20, f3 LFD f20, 16 * SIZE(BO) FMADD f7, f19, f21, f7 LFD f21, 17 * SIZE(BO) FMADD f11, f19, f22, f11 LFD f22, 18 * SIZE(BO) FMADD f15, f19, f23, f15 LFD f19, 19 * SIZE(AO) FMADD f0, f24, f28, f0 LFD f23, 19 * SIZE(BO) FMADD f4, f24, f29, f4 NOP2 FMADD f8, f24, f30, f8 NOP1 FMADD f12, f24, f31, f12 LFD f24, 20 * SIZE(AO) FMADD f1, f25, f28, f1 NOP1 FMADD f5, f25, f29, f5 NOP2 FMADD f9, f25, f30, f9 NOP1 FMADD f13, f25, f31, f13 LFD f25, 21 * SIZE(AO) FMADD f2, f26, f28, f2 NOP1 FMADD f6, f26, f29, f6 NOP2 FMADD f10, f26, f30, f10 NOP1 FMADD f14, f26, f31, f14 LFD f26, 22 * SIZE(AO) FMADD f3, f27, f28, f3 LFD f28, 20 * SIZE(BO) FMADD f7, f27, f29, f7 LFD f29, 21 * SIZE(BO) FMADD f11, f27, f30, f11 LFD f30, 22 * SIZE(BO) FMADD f15, f27, f31, f15 addi AO, AO, 16 * SIZE addi BO, BO, 16 * SIZE bdnz LL(12) .align 4 LL(15): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(KERNEL_MainFinish) .align 4 LL(16): fmadd f0, f16, f20, f0 fmadd f5, f17, f21, f5 fmadd f10, f18, f22, f10 fmadd f15, f19, f23, f15 fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 fmadd f4, f16, f21, f4 fmadd f6, f18, f21, f6 fmadd f7, f19, f21, f7 fmadd f8, f16, f22, f8 fmadd f9, f17, f22, f9 fmadd f11, f19, f22, f11 fmadd f12, f16, f23, f12 fmadd f13, f17, f23, f13 fmadd f14, f18, f23, f14 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(16) .align 4 LL(KERNEL_MainFinish): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 FSUB f8, f8, f13 FADD f9, f9, f12 FSUB f10, f10, f15 FADD f11, f11, f14 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 FADD f8, f8, f13 FSUB f9, f12, f9 FADD f10, f10, f15 FSUB f11, f14, f11 #endif #if defined(LN) || defined(RT) subi r0, KK, 2 slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f8, f18, f8 FSUB f9, f19, f9 FSUB f2, f20, f2 FSUB f3, f21, f3 FSUB f10, f22, f10 FSUB f11, f23, f11 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f8, f20, f8 FSUB f9, f21, f9 FSUB f10, f22, f10 FSUB f11, f23, f11 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 FSUB f8, f20, f8 FADD f9, f21, f9 FSUB f10, f22, f10 FADD f11, f23, f11 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FMADD f8, f19, f11, f8 FNMSUB f9, f19, f10, f9 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FNMSUB f8, f18, f10, f8 FNMSUB f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f8, f20, f8, f12 FMADD f9, f20, f9, f13 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FMSUB f8, f19, f11, f8 FNMADD f9, f19, f10, f9 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FNMADD f8, f18, f10, f8 FNMADD f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f8, f20, f8, f12 FMSUB f9, f20, f9, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f9 FMUL f13, f17, f8 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FMADD f10, f19, f9, f10 FNMSUB f11, f19, f8, f11 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, 
f3 FNMSUB f10, f18, f8, f10 FNMSUB f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 FMSUB f10, f20, f10, f12 FMADD f11, f20, f11, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FMSUB f10, f19, f9, f10 FNMADD f11, f19, f8, f11 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FNMADD f10, f18, f8, f10 FNMADD f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 FMADD f10, f20, f10, f12 FMSUB f11, f20, f11, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f8, f19, f1, f8 FNMSUB f9, f19, f0, f9 FMADD f10, f19, f3, f10 FNMSUB f11, f19, f2, f11 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMSUB f8, f20, f8, f4 FMADD f9, f20, f9, f5 FMSUB f10, f20, f10, f6 FMADD f11, f20, f11, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f8, f19, f1, f8 FNMADD f9, f19, f0, f9 FMSUB f10, f19, f3, f10 FNMADD f11, f19, f2, f11 FNMADD f8, f18, f0, f8 FNMADD f9, f18, f1, f9 FNMADD f10, f18, f2, f10 FNMADD f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMADD f8, f20, f8, f4 FMSUB f9, f20, f9, f5 FMADD f10, f20, f10, f6 FMSUB f11, f20, f11, f7 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f9 FMUL f13, f17, f8 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f9, f0 FNMSUB f1, f19, f8, f1 FMADD f2, f19, f11, f2 FNMSUB f3, f19, f10, f3 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f9, f0 FNMADD f1, f19, f8, f1 FMSUB f2, f19, f11, f2 FNMADD f3, f19, f10, f3 FNMADD f0, f18, f8, f0 FNMADD f1, f18, f9, f1 FNMADD f2, f18, f10, f2 FNMADD f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f9, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f3, 5 * SIZE(BO) STFD f10, 6 * SIZE(BO) STFD f11, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f10, 6 * SIZE(AO) STFD f11, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f8, 0 * SIZE(CO2) STFD f9, 1 * SIZE(CO2) STFD f10, 2 * 
SIZE(CO2) STFD f11, 3 * SIZE(CO2) #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt LL(11) .align 4 LL(20): andi. I, M, 1 ble LL(29) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(25) .align 4 LL(22): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi BO, BO, 16 * SIZE addi AO, AO, 8 * SIZE bdnz LL(22) .align 4 LL(25): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(27) .align 4 LL(26): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE bdnz LL(26) .align 4 LL(27): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else #if defined(LN) || defined(LT) FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + ZBASE_SHIFT slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f3 FMUL f13, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f12 FMADD f3, f20, f3, f13 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f12 FMSUB f3, f20, f3, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f3 FMUL f13, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f3 FMUL f13, f17, f2 #ifndef CONJ FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * 
SIZE(CO2) STFD f3, 1 * SIZE(CO2) #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 LL(29): #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif addic. J, J, -1 bgt LL(10) .align 4 LL(30): andi. J, N, 1 ble LL(999) #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif srawi. I, M, 1 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, C, LDC #endif ble LL(40) .align 4 LL(31): #if defined(LT) || defined(RN) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 dcbt CO1, PREC srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(35) .align 4 LL(32): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 8 * SIZE(AO) LFD f21, 9 * SIZE(AO) LFD f22, 10 * SIZE(AO) LFD f23, 11 * SIZE(AO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) LFD f16, 4 * SIZE(BO) LFD f17, 5 * SIZE(BO) LFD f18, 6 * SIZE(BO) LFD f19, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 16 * SIZE(AO) LFD f21, 17 * SIZE(AO) LFD f22, 18 * SIZE(AO) LFD f23, 19 * SIZE(AO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 20 * SIZE(AO) LFD f25, 21 * SIZE(AO) LFD f26, 22 * SIZE(AO) LFD f27, 23 * SIZE(AO) LFD f16, 8 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 10 * SIZE(BO) LFD f19, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE dcbt PREA, AO dcbt PREA, BO bdnz LL(32) .align 4 LL(35): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(37) .align 4 LL(36): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f16, 2 * SIZE(BO) LFD f17, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(36) .align 4 LL(37): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + ZBASE_SHIFT slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) #ifndef LN addi CO1, CO1, 4 * 
SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt LL(31) .align 4 LL(40): andi. I, M, 1 ble LL(49) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(45) .align 4 LL(42): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) fmadd f4, f18, f22, f4 fmadd f5, f19, f23, f5 fmadd f6, f19, f22, f6 fmadd f7, f18, f23, f7 LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) fmadd f4, f18, f22, f4 fmadd f5, f19, f23, f5 fmadd f6, f19, f22, f6 fmadd f7, f18, f23, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(42) .align 4 LL(45): fadd f0, f0, f4 fadd f1, f1, f5 fadd f2, f2, f6 fadd f3, f3, f7 #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR,r0 ble LL(47) .align 4 LL(46): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 2 * SIZE bdnz LL(46) .align 4 LL(47): #ifndef CONJ FSUB f0, f0, f1 FADD f1, f2, f3 #else FADD f0, f0, f1 FSUB f1, f3, f2 #endif #if defined(LN) || defined(RT) subi r0, KK, 1 slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 #else FSUB f0, f16, f0 FADD f1, f17, f1 #endif #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 LL(49): #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/ztrsm_kernel_cell_RT.S000066400000000000000000001134571313527062700220420ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA_R 296(SP) #define ALPHA_I 304(SP) #define FZERO 312(SP) #else #define STACKSIZE 256 #define ALPHA_R 224(SP) #define ALPHA_I 232(SP) #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #define OFFSET r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #define AORIG r21 #define TEMP r22 #define KK r23 #define I r24 #define J r25 #define AO r26 #define BO r27 #define CO1 r28 #define CO2 r29 #define PREA r30 #define PREC r31 #define PREB PREA #ifndef NEEDPARAM #ifndef DOUBLE #include "cparam.h" #else #include "zparam.h" #endif PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) #endif stw r0, FZERO #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef 
__64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif slwi LDC, LDC, ZBASE_SHIFT #ifdef LN mullw r0, M, K slwi r0, r0, ZBASE_SHIFT add A, A, r0 slwi r0, M, ZBASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, ZBASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) li PREC, 3 * SIZE li PREA, 16 * 12 * SIZE andi. J, N, 1 ble LL(30) #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif srawi. I, M, 1 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, C, LDC #endif ble LL(40) .align 4 LL(31): #if defined(LT) || defined(RN) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 dcbt CO1, PREC srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(35) .align 4 LL(32): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 8 * SIZE(AO) LFD f21, 9 * SIZE(AO) LFD f22, 10 * SIZE(AO) LFD f23, 11 * SIZE(AO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) LFD f16, 4 * SIZE(BO) LFD f17, 5 * SIZE(BO) LFD f18, 6 * SIZE(BO) LFD f19, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 16 * SIZE(AO) LFD f21, 17 * SIZE(AO) LFD f22, 18 * SIZE(AO) LFD f23, 19 * SIZE(AO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 20 * SIZE(AO) LFD f25, 21 * SIZE(AO) LFD f26, 22 * SIZE(AO) LFD f27, 23 * SIZE(AO) LFD f16, 8 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 10 * SIZE(BO) LFD f19, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE dcbt PREA, AO dcbt PREA, BO bdnz LL(32) .align 4 LL(35): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(37) .align 4 LL(36): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f16, 2 * SIZE(BO) LFD f17, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(36) .align 4 LL(37): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + ZBASE_SHIFT slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 
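/* Added annotation (not in the original source): same 2x1 complex solve as in the
   sibling ztrsm kernels, here at LL(37) of the RT variant.  f4 = ai*x_im and
   f5 = ai*x_re are the cross terms of a multiply by the factor (ar,ai) = (f16,f17);
   the #ifndef CONJ pair computes ar*x_re - ai*x_im and ar*x_im + ai*x_re, after
   which the off-diagonal entry in (f18,f19) is eliminated from the second row and
   that row is scaled by (f20,f21).  The CONJ branch differs only in the sign
   handling of the FMADD/FMSUB pairs. */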
#ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) #ifndef LN addi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt LL(31) .align 4 LL(40): andi. I, M, 1 ble LL(49) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(45) .align 4 LL(42): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) fmadd f4, f18, f22, f4 fmadd f5, f19, f23, f5 fmadd f6, f19, f22, f6 fmadd f7, f18, f23, f7 LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) fmadd f4, f18, f22, f4 fmadd f5, f19, f23, f5 fmadd f6, f19, f22, f6 fmadd f7, f18, f23, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(42) .align 4 LL(45): fadd f0, f0, f4 fadd f1, f1, f5 fadd f2, f2, f6 fadd f3, f3, f7 #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR,r0 ble LL(47) .align 4 LL(46): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 2 * SIZE bdnz LL(46) .align 4 LL(47): #ifndef CONJ FSUB f0, f0, f1 FADD f1, f2, f3 #else FADD f0, f0, f1 FSUB f1, f3, f2 #endif #if defined(LN) || defined(RT) subi r0, KK, 1 slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 #else FSUB f0, f16, f0 FADD f1, f17, f1 #endif #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 LL(49): #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 LL(30): srawi. J, N, 1 ble LL(999) .align 4 LL(10): #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif srawi. 
I, M, 1 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif ble LL(20) .align 4 LL(11): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 dcbt CO1, PREC dcbt CO2, PREC srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 dcbt CO1, PREC dcbt CO2, PREC srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(15) .align 4 #define NOP1 mr r18, r18 #define NOP2 mr r19, r19 LL(12): FMADD f0, f16, f20, f0 dcbt AO, PREA FMADD f4, f16, f21, f4 dcbt BO, PREB FMADD f8, f16, f22, f8 LFD f31, 7 * SIZE(BO) FMADD f12, f16, f23, f12 LFD f27, 7 * SIZE(AO) FMADD f1, f17, f20, f1 LFD f16, 8 * SIZE(AO) FMADD f5, f17, f21, f5 NOP2 FMADD f9, f17, f22, f9 NOP1 FMADD f13, f17, f23, f13 LFD f17, 9 * SIZE(AO) FMADD f2, f18, f20, f2 NOP1 FMADD f6, f18, f21, f6 NOP2 FMADD f10, f18, f22, f10 NOP1 FMADD f14, f18, f23, f14 LFD f18, 10 * SIZE(AO) FMADD f3, f19, f20, f3 LFD f20, 8 * SIZE(BO) FMADD f7, f19, f21, f7 LFD f21, 9 * SIZE(BO) FMADD f11, f19, f22, f11 LFD f22, 10 * SIZE(BO) FMADD f15, f19, f23, f15 LFD f19, 11 * SIZE(AO) FMADD f0, f24, f28, f0 LFD f23, 11 * SIZE(BO) FMADD f4, f24, f29, f4 NOP2 FMADD f8, f24, f30, f8 NOP1 FMADD f12, f24, f31, f12 LFD f24, 12 * SIZE(AO) FMADD f1, f25, f28, f1 NOP1 FMADD f5, f25, f29, f5 NOP2 FMADD f9, f25, f30, f9 NOP1 FMADD f13, f25, f31, f13 LFD f25, 13 * SIZE(AO) FMADD f2, f26, f28, f2 NOP1 FMADD f6, f26, f29, f6 NOP2 FMADD f10, f26, f30, f10 NOP1 FMADD f14, f26, f31, f14 LFD f26, 14 * SIZE(AO) FMADD f3, f27, f28, f3 LFD f28, 12 * SIZE(BO) FMADD f7, f27, f29, f7 LFD f29, 13 * SIZE(BO) FMADD f11, f27, f30, f11 LFD f30, 14 * SIZE(BO) FMADD f15, f27, f31, f15 LFD f27, 15 * SIZE(AO) FMADD f0, f16, f20, f0 LFD f31, 15 * SIZE(BO) FMADD f4, f16, f21, f4 NOP2 FMADD f8, f16, f22, f8 NOP1 FMADD f12, f16, f23, f12 LFD f16, 16 * SIZE(AO) FMADD f1, f17, f20, f1 NOP1 FMADD f5, f17, f21, f5 NOP2 FMADD f9, f17, f22, f9 NOP1 FMADD f13, f17, f23, f13 LFD f17, 17 * SIZE(AO) FMADD f2, f18, f20, f2 NOP1 FMADD f6, f18, f21, f6 NOP2 FMADD f10, f18, f22, f10 NOP1 FMADD f14, f18, f23, f14 LFD f18, 18 * SIZE(AO) FMADD f3, f19, f20, f3 LFD f20, 16 * SIZE(BO) FMADD f7, f19, f21, f7 LFD f21, 17 * SIZE(BO) FMADD f11, f19, f22, f11 LFD f22, 18 * SIZE(BO) FMADD f15, f19, f23, f15 LFD f19, 19 * SIZE(AO) FMADD f0, f24, f28, f0 LFD f23, 19 * SIZE(BO) FMADD f4, f24, f29, f4 NOP2 FMADD f8, f24, f30, f8 NOP1 FMADD f12, f24, f31, f12 LFD f24, 20 * SIZE(AO) FMADD f1, f25, f28, f1 NOP1 FMADD f5, f25, f29, f5 NOP2 FMADD f9, f25, f30, f9 NOP1 FMADD f13, f25, f31, f13 LFD f25, 21 * SIZE(AO) FMADD f2, f26, f28, f2 NOP1 FMADD f6, 
f26, f29, f6 NOP2 FMADD f10, f26, f30, f10 NOP1 FMADD f14, f26, f31, f14 LFD f26, 22 * SIZE(AO) FMADD f3, f27, f28, f3 LFD f28, 20 * SIZE(BO) FMADD f7, f27, f29, f7 LFD f29, 21 * SIZE(BO) FMADD f11, f27, f30, f11 LFD f30, 22 * SIZE(BO) FMADD f15, f27, f31, f15 addi AO, AO, 16 * SIZE addi BO, BO, 16 * SIZE bdnz LL(12) .align 4 LL(15): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(KERNEL_MainFinish) .align 4 LL(16): fmadd f0, f16, f20, f0 fmadd f5, f17, f21, f5 fmadd f10, f18, f22, f10 fmadd f15, f19, f23, f15 fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 fmadd f4, f16, f21, f4 fmadd f6, f18, f21, f6 fmadd f7, f19, f21, f7 fmadd f8, f16, f22, f8 fmadd f9, f17, f22, f9 fmadd f11, f19, f22, f11 fmadd f12, f16, f23, f12 fmadd f13, f17, f23, f13 fmadd f14, f18, f23, f14 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(16) .align 4 LL(KERNEL_MainFinish): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 FSUB f8, f8, f13 FADD f9, f9, f12 FSUB f10, f10, f15 FADD f11, f11, f14 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 FADD f8, f8, f13 FSUB f9, f12, f9 FADD f10, f10, f15 FSUB f11, f14, f11 #endif #if defined(LN) || defined(RT) subi r0, KK, 2 slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f8, f18, f8 FSUB f9, f19, f9 FSUB f2, f20, f2 FSUB f3, f21, f3 FSUB f10, f22, f10 FSUB f11, f23, f11 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f8, f20, f8 FSUB f9, f21, f9 FSUB f10, f22, f10 FSUB f11, f23, f11 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 FSUB f8, f20, f8 FADD f9, f21, f9 FSUB f10, f22, f10 FADD f11, f23, f11 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FMADD f8, f19, f11, f8 FNMSUB f9, f19, f10, f9 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FNMSUB f8, f18, f10, f8 FNMSUB f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f8, f20, f8, f12 FMADD f9, f20, f9, f13 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FMSUB f8, f19, f11, f8 FNMADD f9, f19, f10, f9 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FNMADD f8, f18, f10, f8 FNMADD f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f8, f20, f8, f12 FMSUB f9, f20, f9, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD 
f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f9 FMUL f13, f17, f8 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FMADD f10, f19, f9, f10 FNMSUB f11, f19, f8, f11 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FNMSUB f10, f18, f8, f10 FNMSUB f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 FMSUB f10, f20, f10, f12 FMADD f11, f20, f11, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FMSUB f10, f19, f9, f10 FNMADD f11, f19, f8, f11 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FNMADD f10, f18, f8, f10 FNMADD f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 FMADD f10, f20, f10, f12 FMSUB f11, f20, f11, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f8, f19, f1, f8 FNMSUB f9, f19, f0, f9 FMADD f10, f19, f3, f10 FNMSUB f11, f19, f2, f11 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMSUB f8, f20, f8, f4 FMADD f9, f20, f9, f5 FMSUB f10, f20, f10, f6 FMADD f11, f20, f11, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f8, f19, f1, f8 FNMADD f9, f19, f0, f9 FMSUB f10, f19, f3, f10 FNMADD f11, f19, f2, f11 FNMADD f8, f18, f0, f8 FNMADD f9, f18, f1, f9 FNMADD f10, f18, f2, f10 FNMADD f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMADD f8, f20, f8, f4 FMSUB f9, f20, f9, f5 FMADD f10, f20, f10, f6 FMSUB f11, f20, f11, f7 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f9 FMUL f13, f17, f8 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f9, f0 FNMSUB f1, f19, f8, f1 FMADD f2, f19, f11, f2 FNMSUB f3, f19, f10, f3 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f9, f0 FNMADD f1, f19, f8, f1 FMSUB f2, f19, f11, f2 FNMADD f3, f19, f10, f3 FNMADD f0, f18, f8, f0 FNMADD f1, f18, f9, f1 FNMADD f2, f18, f10, f2 FNMADD f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f9, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f3, 5 * SIZE(BO) 
STFD f10, 6 * SIZE(BO) STFD f11, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f10, 6 * SIZE(AO) STFD f11, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f8, 0 * SIZE(CO2) STFD f9, 1 * SIZE(CO2) STFD f10, 2 * SIZE(CO2) STFD f11, 3 * SIZE(CO2) #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt LL(11) .align 4 LL(20): andi. I, M, 1 ble LL(29) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(25) .align 4 LL(22): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi BO, BO, 16 * SIZE addi AO, AO, 8 * SIZE bdnz LL(22) .align 4 LL(25): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(27) .align 4 LL(26): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE bdnz LL(26) .align 4 LL(27): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else #if defined(LN) || defined(LT) FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + ZBASE_SHIFT slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f3 FMUL f13, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f12 FMADD f3, f20, f3, f13 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f12 FMSUB f3, f20, f3, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f3 FMUL f13, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f3 FMUL f13, f17, f2 #ifndef CONJ FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * 
SIZE(CO2) STFD f3, 1 * SIZE(CO2) #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 LL(29): #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif addic. J, J, -1 bgt LL(10) .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/ztrsm_kernel_hummer_LN.S000066400000000000000000001433161313527062700224010ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #undef ZERO #define ALPHA 0 #define FZERO 16 #define M r3 #define N r4 #define K r5 #ifdef linux #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #endif #define TEMP r11 #define AORIG r12 #define KK r14 #define INCM1 r15 #define INCM3 r16 #define INCM5 r17 #define INCM7 r18 #define INC2 r19 #define INC r20 #define INC4 r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define AO2 r26 #define BO2 r27 #define CO1 r28 #define CO2 r29 #define ZERO r31 #ifndef NEEDPARAM #define A1 f16 #define A2 f17 #define A3 f18 #define A4 f19 #define A5 f20 #define A6 f21 #define A7 f22 #define A8 f23 #define A9 f24 #define A10 f25 #define B1 f26 #define B2 f27 #define B3 f28 #define B4 f29 #define B5 f30 #define B6 f31 #define AP B6 #ifndef CONJ #define FXCPMADD fxcpmadd #define FXCSMADD fxcxnpma #else #if defined(LN) || defined(LT) #define FXCPMADD fxcpnsma #define FXCSMADD fxcxma #else #define FXCPMADD fxcpmadd #define FXCSMADD fxcxnsma #endif #endif #ifndef CONJ #define FXCXNPMA fxcxnpma #define FXCXNSMA fxcxnsma #else #define FXCXNPMA fxcxnsma #define FXCXNSMA fxcxnpma #endif PROLOGUE PROFCODE li r0, -16 stfpdux f14, SP, r0 stfpdux f15, SP, r0 stfpdux f16, SP, r0 stfpdux f17, SP, r0 stfpdux f18, SP, r0 stfpdux f19, SP, r0 stfpdux f20, SP, r0 stfpdux f21, SP, r0 stfpdux f22, SP, r0 stfpdux f23, SP, r0 stfpdux f24, SP, r0 stfpdux f25, SP, r0 stfpdux f26, SP, r0 stfpdux f27, SP, r0 stfpdux f28, SP, r0 stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) stwu r28, -4(SP) stwu r27, -4(SP) stwu r26, -4(SP) stwu r25, -4(SP) stwu r24, -4(SP) stwu r23, -4(SP) stwu r22, -4(SP) stwu r21, -4(SP) stwu r20, -4(SP) stwu r19, -4(SP) stwu r18, -4(SP) stwu r17, -4(SP) stwu r16, -4(SP) stwu r15, -4(SP) stwu r14, -4(SP) li r0, 0 stwu r0, -4(SP) stwu r0, -4(SP) stfdu f2, -8(SP) stfdu f1, -8(SP) slwi LDC, LDC, ZBASE_SHIFT cmpwi cr0, M, 0 ble .L999 cmpwi cr0, N, 0 ble .L999 cmpwi cr0, K, 0 ble .L999 li INC, 1 * SIZE li INC2, 2 * SIZE li INC4, 4 * SIZE li INCM1, -1 * SIZE li INCM3, -3 * SIZE li INCM5, -5 * SIZE li INCM7, -7 * SIZE addi C, C, - 1 * SIZE #ifdef LN mullw r0, M, K slwi r0, r0, ZBASE_SHIFT add A, A, r0 slwi r0, M, ZBASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, ZBASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif srawi. J, N, 1 ble .L50 .align 4 .L10: #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) addi AORIG, A, -4 * SIZE #else addi AO, A, -4 * SIZE #endif #ifndef RT add C, CO2, LDC #endif li r0, FZERO lfpsx f0, SP, r0 andi. I, M, 1 beq .L20 #if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 srawi. r0, KK, 2 mtspr CTR, r0 ble .L34 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, BO, - 4 * SIZE fpmr f2, f0 addi BO2, BO, 2 * SIZE fpmr f3, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 ble .L34 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 LFPDUX A2, AO2, INC4 LFPDUX B3, BO, INC4 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A5, BO, INC4 LFPDUX A6, BO2, INC4 LFPDUX A4, AO2, INC4 LFPDUX A7, BO, INC4 LFPDUX A8, BO2, INC4 bdz- .L33 .align 4 .L32: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX B1, BO, INC4 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 LFPDUX B2, BO2, INC4 LFPDUX A1, AO, INC4 FXCPMADD f0, B3, A2, f0 FXCSMADD f1, B3, A2, f1 LFPDUX B3, BO, INC4 FXCPMADD f2, B4, A2, f2 FXCSMADD f3, B4, A2, f3 LFPDUX B4, BO2, INC4 LFPDUX A2, AO2, INC4 FXCPMADD f0, A5, A3, f0 FXCSMADD f1, A5, A3, f1 LFPDUX A5, BO, INC4 FXCPMADD f2, A6, A3, f2 FXCSMADD f3, A6, A3, f3 LFPDUX A6, BO2, INC4 LFPDUX A3, AO, INC4 FXCPMADD f0, A7, A4, f0 FXCSMADD f1, A7, A4, f1 LFPDUX A7, BO, INC4 FXCPMADD f2, A8, A4, f2 FXCSMADD f3, A8, A4, f3 LFPDUX A8, BO2, INC4 LFPDUX A4, AO2, INC4 bdnz+ .L32 .align 4 .L33: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 FXCPMADD f0, B3, A2, f0 FXCSMADD f1, B3, A2, f1 FXCPMADD f2, B4, A2, f2 FXCSMADD f3, B4, A2, f3 FXCPMADD f0, A5, A3, f0 FXCSMADD f1, A5, A3, f1 FXCPMADD f2, A6, A3, f2 FXCSMADD f3, A6, A3, f3 FXCPMADD f0, A7, A4, f0 FXCSMADD f1, A7, A4, f1 FXCPMADD f2, A8, A4, f2 FXCSMADD f3, A8, A4, f3 .align 4 .L34: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L38 #else andi. r0, TEMP, 3 mtspr CTR, r0 ble+ .L38 #endif LFPDX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdz- .L37 .align 4 .L36: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX B1, BO, INC4 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 LFPDX A1, AO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdnz+ .L36 .align 4 .L37: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 .align 4 .L38: fpadd f0, f0, f1 fpadd f2, f2, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + ZBASE_SHIFT slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 4 * SIZE #endif addi AO2, AO, 2 * SIZE addi BO2, BO, 2 * SIZE #if defined(LN) || defined(LT) LFPDX f16, BO, INC4 LFPDX f17, BO2, INC4 #else LFPDX f16, AO, INC4 LFPDX f17, AO2, INC4 #endif fpsub f0, f16, f0 fpsub f2, f17, f2 #ifdef LN LFPDX A1, AO, INC4 fxpmul f4, A1, f0 fxpmul f5, A1, f2 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f2, A1, f2, f5 #endif #ifdef LT LFPDX A1, AO, INC4 fxpmul f4, A1, f0 fxpmul f5, A1, f2 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f2, A1, f2, f5 #endif #ifdef RN LFPDUX A1, BO, INC4 LFPDUX A2, BO2, INC4 add BO, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 fxcpnmsub f2, A2, f0, f2 FXCXNSMA f2, A2, f0, f2 fxpmul f4, A3, f2 FXCXNPMA f2, A3, f2, f4 #endif #ifdef RT LFPDUX A1, BO, INC4 add BO2, BO2, INC4 LFPDUX A2, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A3, f2 FXCXNPMA f2, A3, f2, f4 fxcpnmsub f0, A2, f2, f0 FXCXNSMA f0, A2, f2, f0 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFPDX f0, BO, INC4 STFPDX f2, BO2, INC4 #else STFPDX f0, AO, INC4 STFPDX f2, AO2, INC4 #endif STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f2, CO2, INC STFSDUX f2, CO2, INC #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif 
#if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L20: andi. I, M, 2 beq .L30 #if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 srawi. r0, KK, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, r0 fpmr f13, f0 ble .L24 #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 sub TEMP, K, KK addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 fpmr f13, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L24 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX B3, BO, INC4 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 LFPDUX A5, AO, INC4 LFPDUX B5, BO, INC4 LFPDUX A6, AO2, INC4 LFPDUX B6, BO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A9, BO, INC4 LFPDUX A10, BO2, INC4 bdz- .L23 .align 4 .L22: FXCPMADD f0, B1, A1, f0 nop FXCSMADD f4, B1, A1, f4 LFPDUX A8, AO2, INC4 FXCPMADD f8, B2, A1, f8 nop FXCSMADD f12, B2, A1, f12 LFPDUX A1, AO, INC4 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX B1, BO, INC4 FXCPMADD f9, B2, A2, f9 nop FXCSMADD f13, B2, A2, f13 LFPDUX B2, BO2, INC4 FXCPMADD f0, B3, A3, f0 nop FXCSMADD f4, B3, A3, f4 LFPDUX A2, AO2, INC4 FXCPMADD f8, B4, A3, f8 nop FXCSMADD f12, B4, A3, f12 LFPDUX A3, AO, INC4 FXCPMADD f1, B3, A4, f1 nop FXCSMADD f5, B3, A4, f5 LFPDUX B3, BO, INC4 FXCPMADD f9, B4, A4, f9 nop FXCSMADD f13, B4, A4, f13 LFPDUX B4, BO2, INC4 FXCPMADD f0, B5, A5, f0 nop FXCSMADD f4, B5, A5, f4 LFPDUX A4, AO2, INC4 FXCPMADD f8, B6, A5, f8 nop FXCSMADD f12, B6, A5, f12 LFPDUX A5, AO, INC4 FXCPMADD f1, B5, A6, f1 nop FXCSMADD f5, B5, A6, f5 LFPDUX B5, BO, INC4 FXCPMADD f9, B6, A6, f9 nop FXCSMADD f13, B6, A6, f13 LFPDUX B6, BO2, INC4 FXCPMADD f0, A9, A7, f0 nop FXCSMADD f4, A9, A7, f4 LFPDUX A6, AO2, INC4 FXCPMADD f8, A10, A7, f8 nop FXCSMADD f12, A10, A7, f12 LFPDUX A7, AO, INC4 FXCPMADD f1, A9, A8, f1 nop FXCSMADD f5, A9, A8, f5 LFPDUX A9, BO, INC4 FXCPMADD f9, A10, A8, f9 nop FXCSMADD f13, A10, A8, f13 LFPDUX A10, BO2, INC4 bdnz+ .L22 .align 4 .L23: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 LFPDUX A8, AO2, INC4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 FXCPMADD f0, B3, A3, f0 FXCSMADD f4, B3, A3, f4 FXCPMADD f8, B4, A3, f8 FXCSMADD f12, B4, A3, f12 FXCPMADD f1, B3, A4, f1 FXCSMADD f5, B3, A4, f5 FXCPMADD f9, B4, A4, f9 FXCSMADD f13, B4, A4, f13 FXCPMADD f0, B5, A5, f0 FXCSMADD f4, B5, A5, f4 FXCPMADD f8, B6, A5, f8 FXCSMADD f12, B6, A5, f12 FXCPMADD f1, B5, A6, f1 FXCSMADD f5, B5, A6, f5 FXCPMADD f9, B6, A6, f9 FXCSMADD f13, B6, A6, f13 FXCPMADD f0, A9, A7, f0 FXCSMADD f4, A9, A7, f4 FXCPMADD f8, A10, A7, f8 FXCSMADD f12, A10, A7, f12 FXCPMADD f1, A9, A8, f1 FXCSMADD f5, A9, A8, f5 FXCPMADD f9, A10, A8, f9 FXCSMADD f13, A10, A8, f13 .align 4 .L24: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L28 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L28 #endif LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 bdz- .L27 .align 4 .L26: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 LFPDUX A1, AO, INC4 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 LFPDUX B1, BO, INC4 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 bdnz+ .L26 .align 4 .L27: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 .align 4 .L28: fpadd f0, f0, f4 fpadd f8, f8, f12 fpadd f1, f1, f5 fpadd f9, f9, f13 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 2 #endif slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 addi AO2, AO, 2 * SIZE addi BO, BO, - 4 * SIZE addi BO2, BO, 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDUX f16, BO, INC4 LFPDUX f18, BO2, INC4 LFPDUX f17, BO, INC4 LFPDUX f19, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE #else LFPDUX f16, AO, INC4 LFPDUX f17, AO2, INC4 LFPDUX f18, AO, INC4 LFPDUX f19, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE #endif fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f8, f18, f8 fpsub f9, f19, f9 #ifdef LN LFPDUX A1, AO, INC4 add AO2, AO2, INC4 LFPDUX A2, AO, INC4 LFPDUX A3, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE fxpmul f4, A3, f1 fxpmul f5, A3, f9 FXCXNPMA f1, A3, f1, f4 FXCXNPMA f9, A3, f9, f5 fxcpnmsub f0, A2, f1, f0 fxcpnmsub f8, A2, f9, f8 FXCXNSMA f0, A2, f1, f0 FXCXNSMA f8, A2, f9, f8 fxpmul f4, A1, f0 fxpmul f5, A1, f8 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f8, A1, f8, f5 #endif #ifdef LT LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 add AO, AO, INC4 LFPDUX A3, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE fxpmul f4, A1, f0 fxpmul f5, A1, f8 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f8, A1, f8, f5 fxcpnmsub f1, A2, f0, f1 fxcpnmsub f9, A2, f8, f9 FXCXNSMA f1, A2, f0, f1 FXCXNSMA f9, A2, f8, f9 fxpmul f6, A3, f1 fxpmul f7, A3, f9 FXCXNPMA f1, A3, f1, f6 FXCXNPMA f9, A3, f9, f7 #endif #ifdef RN LFPDUX A1, BO, INC4 LFPDUX A2, BO2, INC4 add BO, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A1, f0 fxpmul f5, A1, f1 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 fxcpnmsub f8, A2, f0, f8 fxcpnmsub f9, A2, f1, f9 FXCXNSMA f8, A2, f0, f8 FXCXNSMA f9, A2, f1, f9 fxpmul f4, A3, f8 fxpmul f5, A3, f9 FXCXNPMA f8, A3, f8, f4 FXCXNPMA f9, A3, f9, f5 #endif #ifdef RT LFPDUX A1, BO, INC4 add BO2, BO2, INC4 LFPDUX A2, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A3, f8 fxpmul f5, A3, f9 FXCXNPMA f8, A3, f8, f4 FXCXNPMA f9, A3, f9, f5 fxcpnmsub f0, A2, f8, f0 fxcpnmsub f1, A2, f9, f1 FXCXNSMA f0, A2, f8, f0 FXCXNSMA f1, A2, f9, f1 fxpmul f4, A1, f0 fxpmul f5, A1, f1 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC4 STFPDUX f8, BO2, INC4 STFPDUX f1, BO, INC4 STFPDUX f9, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE #else STFPDUX f0, AO, INC4 STFPDUX f1, AO2, INC4 STFPDUX f8, AO, INC4 STFPDUX f9, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE #endif STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f8, CO2, INC STFSDUX f8, CO2, INC STFDUX f9, CO2, INC STFSDUX f9, CO2, INC #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, 
CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, r0 #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L30: srawi. I, M, 2 ble .L49 .align 4 .L11: #if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 fpmr f5, f0 fpmr f9, f0 fpmr f13, f0 fpmr f2, f0 fpmr f6, f0 fpmr f10, f0 fpmr f14, f0 fpmr f3, f0 fpmr f7, f0 fpmr f11, f0 fpmr f15, f0 srawi. r0, KK, 2 fpmr f1, f0 mtspr CTR, r0 ble .L14 #else #ifdef LN slwi r0, K, 2 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 2 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK fpmr f5, f0 fpmr f9, f0 fpmr f13, f0 fpmr f2, f0 fpmr f6, f0 fpmr f10, f0 fpmr f14, f0 fpmr f3, f0 fpmr f7, f0 fpmr f11, f0 fpmr f15, f0 addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 srawi. r0, TEMP, 2 fpmr f1, f0 mtspr CTR, r0 ble .L14 #endif LFPDUX A1, AO, INC4 fpmr f5, f0 LFPDUX A3, AO, INC4 fpmr f9, f0 LFPDUX B1, BO, INC4 fpmr f13, f0 LFPDUX A5, AO, INC4 fpmr f2, f0 LFPDUX A6, AO, INC4 fpmr f6, f0 LFPDUX B3, BO, INC4 fpmr f10, f0 LFPDUX A7, AO, INC4 fpmr f14, f0 LFPDUX A8, AO, INC4 fpmr f3, f0 LFPDUX B5, BO, INC4 fpmr f7, f0 LFPDUX A9, AO, INC4 fpmr f11, f0 LFPDUX A2, AO2, INC4 fpmr f15, f0 LFPDUX B2, BO2, INC4 bdz- .L13 .align 4 .L12: ## 1 ## FXCPMADD f0, B1, A1, f0 nop FXCSMADD f4, B1, A1, f4 nop FXCPMADD f8, B2, A1, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A1, f12 LFPDUX B6, BO, INC4 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 LFPDUX A10, AO, INC4 FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 nop FXCPMADD f10, B2, A3, f10 nop FXCSMADD f14, B2, A3, f14 nop FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 LFPDUX A1, AO, INC4 FXCSMADD f15, B2, A4, f15 nop ## 2 ## FXCPMADD f0, B3, A5, f0 nop FXCSMADD f4, B3, A5, f4 nop FXCPMADD f8, B4, A5, f8 LFPDUX B2, BO2, INC4 FXCSMADD f12, B4, A5, f12 LFPDUX B1, BO, INC4 FXCPMADD f1, B3, A2, f1 nop FXCSMADD f5, B3, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 LFPDUX A3, AO, INC4 FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B3, A6, f2 nop FXCSMADD f6, B3, A6, f6 nop FXCPMADD f10, B4, A6, f10 nop FXCSMADD f14, B4, A6, f14 nop FXCPMADD f3, B3, A4, f3 nop FXCSMADD f7, B3, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B4, A4, f11 LFPDUX A5, AO, INC4 FXCSMADD f15, B4, A4, f15 nop ## 3 ## FXCPMADD f0, B5, A7, f0 nop FXCSMADD f4, B5, A7, f4 nop FXCPMADD f8, B2, A7, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A7, f12 LFPDUX B3, BO, INC4 FXCPMADD f1, B5, A2, f1 nop FXCSMADD f5, B5, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 LFPDUX A6, AO, INC4 FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B5, A8, f2 nop FXCSMADD f6, B5, A8, f6 nop FXCPMADD f10, B2, A8, f10 nop FXCSMADD f14, B2, A8, f14 nop FXCPMADD f3, B5, A4, f3 nop FXCSMADD f7, B5, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 LFPDUX A7, AO, INC4 FXCSMADD f15, B2, A4, f15 nop ## 4 ## FXCPMADD f0, B6, A9, f0 nop FXCSMADD f4, B6, A9, f4 nop FXCPMADD f8, B4, A9, f8 LFPDUX B2, BO2, INC4 FXCSMADD f12, B4, A9, f12 LFPDUX B5, BO, INC4 FXCPMADD f1, B6, A2, f1 nop FXCSMADD f5, B6, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 LFPDUX A8, AO, INC4 FXCSMADD 
f13, B4, A2, f13 nop FXCPMADD f2, B6, A10, f2 nop FXCSMADD f6, B6, A10, f6 nop FXCPMADD f10, B4, A10, f10 nop FXCSMADD f14, B4, A10, f14 nop FXCPMADD f3, B6, A4, f3 LFPDUX A2, AO2, INC4 FXCSMADD f7, B6, A4, f7 LFPDUX A9, AO, INC4 FXCPMADD f11, B4, A4, f11 nop FXCSMADD f15, B4, A4, f15 bdnz+ .L12 .align 4 .L13: ## 1 ## FXCPMADD f0, B1, A1, f0 nop FXCSMADD f4, B1, A1, f4 nop FXCPMADD f8, B2, A1, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A1, f12 LFPDUX B6, BO, INC4 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 LFPDUX A10, AO, INC4 FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 nop FXCPMADD f10, B2, A3, f10 nop FXCSMADD f14, B2, A3, f14 nop FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 nop FXCSMADD f15, B2, A4, f15 nop ## 2 ## FXCPMADD f0, B3, A5, f0 nop FXCSMADD f4, B3, A5, f4 nop FXCPMADD f8, B4, A5, f8 LFPDUX B2, BO2, INC4 FXCSMADD f12, B4, A5, f12 nop FXCPMADD f1, B3, A2, f1 nop FXCSMADD f5, B3, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 nop FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B3, A6, f2 nop FXCSMADD f6, B3, A6, f6 nop FXCPMADD f10, B4, A6, f10 nop FXCSMADD f14, B4, A6, f14 nop FXCPMADD f3, B3, A4, f3 nop FXCSMADD f7, B3, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B4, A4, f11 nop FXCSMADD f15, B4, A4, f15 nop ## 3 ## FXCPMADD f0, B5, A7, f0 nop FXCSMADD f4, B5, A7, f4 nop FXCPMADD f8, B2, A7, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A7, f12 nop FXCPMADD f1, B5, A2, f1 nop FXCSMADD f5, B5, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 nop FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B5, A8, f2 nop FXCSMADD f6, B5, A8, f6 nop FXCPMADD f10, B2, A8, f10 nop FXCSMADD f14, B2, A8, f14 nop FXCPMADD f3, B5, A4, f3 nop FXCSMADD f7, B5, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 nop FXCSMADD f15, B2, A4, f15 nop ## 4 ## FXCPMADD f0, B6, A9, f0 nop FXCSMADD f4, B6, A9, f4 nop FXCPMADD f8, B4, A9, f8 nop FXCSMADD f12, B4, A9, f12 nop FXCPMADD f1, B6, A2, f1 nop FXCSMADD f5, B6, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 nop FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B6, A10, f2 nop FXCSMADD f6, B6, A10, f6 nop FXCPMADD f10, B4, A10, f10 nop FXCSMADD f14, B4, A10, f14 nop FXCPMADD f3, B6, A4, f3 nop FXCSMADD f7, B6, A4, f7 nop FXCPMADD f11, B4, A4, f11 nop FXCSMADD f15, B4, A4, f15 nop .align 4 .L14: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L18 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L18 #endif .L15: LFPDUX A2, AO, INC4 LFPDUX A4, AO2, INC4 LFPDUX A10, BO, INC4 LFPDUX B4, BO2, INC4 bdz- .L17 .align 4 .L16: FXCPMADD f0, A10, A2, f0 FXCSMADD f4, A10, A2, f4 FXCPMADD f8, B4, A2, f8 FXCSMADD f12, B4, A2, f12 LFPDUX A2, AO, INC4 FXCPMADD f1, A10, A4, f1 FXCSMADD f5, A10, A4, f5 FXCPMADD f9, B4, A4, f9 FXCSMADD f13, B4, A4, f13 LFPDUX A4, AO2, INC4 FXCPMADD f2, A10, A2, f2 FXCSMADD f6, A10, A2, f6 FXCPMADD f10, B4, A2, f10 FXCSMADD f14, B4, A2, f14 LFPDUX A2, AO, INC4 FXCPMADD f3, A10, A4, f3 FXCSMADD f7, A10, A4, f7 LFPDUX A10, BO, INC4 FXCPMADD f11, B4, A4, f11 FXCSMADD f15, B4, A4, f15 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 bdnz+ .L16 .align 4 .L17: FXCPMADD f0, A10, A2, f0 FXCSMADD f4, A10, A2, f4 FXCPMADD f8, B4, A2, f8 FXCSMADD f12, B4, A2, f12 LFPDUX A2, AO, INC4 FXCPMADD f1, A10, A4, f1 FXCSMADD f5, A10, A4, f5 FXCPMADD f9, B4, A4, f9 FXCSMADD f13, B4, A4, f13 LFPDUX A4, AO2, INC4 FXCPMADD f2, A10, A2, f2 FXCSMADD f6, A10, A2, f6 FXCPMADD f10, B4, A2, f10 FXCSMADD f14, B4, A2, f14 FXCPMADD f3, A10, A4, f3 FXCSMADD f7, A10, A4, f7 FXCPMADD f11, B4, A4, f11 FXCSMADD f15, B4, A4, f15 .align 4 .L18: fpadd f0, f0, f4 fpadd f8, f8, f12 fpadd f1, f1, f5 fpadd f9, f9, f13 fpadd f2, f2, f6 fpadd f10, f10, f14 fpadd f3, f3, f7 fpadd f11, f11, f15 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 2 #endif slwi TEMP, r0, 2 + ZBASE_SHIFT slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE addi BO, BO, - 4 * SIZE addi BO2, BO, 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDUX f16, BO, INC4 LFPDUX f20, BO2, INC4 LFPDUX f17, BO, INC4 LFPDUX f21, BO2, INC4 LFPDUX f18, BO, INC4 LFPDUX f22, BO2, INC4 LFPDUX f19, BO, INC4 LFPDUX f23, BO2, INC4 subi BO, BO, 16 * SIZE subi BO2, BO2, 16 * SIZE #else LFPDUX f16, AO, INC4 LFPDUX f17, AO2, INC4 LFPDUX f18, AO, INC4 LFPDUX f19, AO2, INC4 LFPDUX f20, AO, INC4 LFPDUX f21, AO2, INC4 LFPDUX f22, AO, INC4 LFPDUX f23, AO2, INC4 subi AO, AO, 16 * SIZE subi AO2, AO2, 16 * SIZE #endif fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f2, f18, f2 fpsub f3, f19, f3 fpsub f8, f20, f8 fpsub f9, f21, f9 fpsub f10, f22, f10 fpsub f11, f23, f11 #ifdef LN LFPDUX A1, AO, INC4 add AO2, AO2, INC4 add AO, AO, INC4 add AO2, AO2, INC4 LFPDUX A2, AO, INC4 LFPDUX A3, AO2, INC4 add AO, AO, INC4 add AO2, AO2, INC4 LFPDUX A4, AO, INC4 LFPDUX A5, AO2, INC4 LFPDUX A6, AO, INC4 add AO2, AO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A8, AO2, INC4 LFPDUX A9, AO, INC4 LFPDUX A10, AO2, INC4 subi AO, AO, 32 * SIZE subi AO2, AO2, 32 * SIZE fxpmul f4, A10, f3 fxpmul f5, A10, f11 FXCXNPMA f3, A10, f3, f4 FXCXNPMA f11, A10, f11, f5 fxcpnmsub f2, A9, f3, f2 fxcpnmsub f10, A9, f11, f10 FXCXNSMA f2, A9, f3, f2 FXCXNSMA f10, A9, f11, f10 fxcpnmsub f1, A8, f3, f1 fxcpnmsub f9, A8, f11, f9 FXCXNSMA f1, A8, f3, f1 FXCXNSMA f9, A8, f11, f9 fxcpnmsub f0, A7, f3, f0 fxcpnmsub f8, A7, f11, f8 FXCXNSMA f0, A7, f3, f0 FXCXNSMA f8, A7, f11, f8 fxpmul f4, A6, f2 fxpmul f5, A6, f10 FXCXNPMA f2, A6, f2, f4 FXCXNPMA f10, A6, f10, f5 fxcpnmsub f1, A5, f2, f1 fxcpnmsub f9, A5, f10, f9 FXCXNSMA f1, A5, f2, f1 FXCXNSMA f9, A5, f10, f9 fxcpnmsub f0, A4, f2, f0 fxcpnmsub f8, A4, f10, f8 FXCXNSMA f0, A4, f2, f0 FXCXNSMA f8, A4, f10, f8 fxpmul f4, A3, f1 fxpmul f5, A3, f9 FXCXNPMA f1, A3, f1, f4 FXCXNPMA f9, A3, f9, f5 fxcpnmsub f0, A2, f1, f0 fxcpnmsub f8, A2, f9, f8 FXCXNSMA f0, A2, f1, f0 FXCXNSMA f8, A2, f9, f8 fxpmul f4, A1, f0 fxpmul f5, A1, f8 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f8, A1, f8, f5 #endif #ifdef LT LFPDUX A1, AO, 
INC4 LFPDUX A2, AO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A4, AO2, INC4 add AO, AO, INC4 LFPDUX A5, AO2, INC4 LFPDUX A6, AO, INC4 LFPDUX A7, AO2, INC4 add AO, AO, INC4 add AO2, AO2, INC4 LFPDUX A8, AO, INC4 LFPDUX A9, AO2, INC4 add AO, AO, INC4 add AO2, AO2, INC4 add AO, AO, INC4 LFPDUX A10, AO2, INC4 subi AO, AO, 32 * SIZE subi AO2, AO2, 32 * SIZE fxpmul f4, A1, f0 fxpmul f5, A1, f8 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f8, A1, f8, f5 fxcpnmsub f1, A2, f0, f1 fxcpnmsub f9, A2, f8, f9 FXCXNSMA f1, A2, f0, f1 FXCXNSMA f9, A2, f8, f9 fxcpnmsub f2, A3, f0, f2 fxcpnmsub f10, A3, f8, f10 FXCXNSMA f2, A3, f0, f2 FXCXNSMA f10, A3, f8, f10 fxcpnmsub f3, A4, f0, f3 fxcpnmsub f11, A4, f8, f11 FXCXNSMA f3, A4, f0, f3 FXCXNSMA f11, A4, f8, f11 fxpmul f6, A5, f1 fxpmul f7, A5, f9 FXCXNPMA f1, A5, f1, f6 FXCXNPMA f9, A5, f9, f7 fxcpnmsub f2, A6, f1, f2 fxcpnmsub f10, A6, f9, f10 FXCXNSMA f2, A6, f1, f2 FXCXNSMA f10, A6, f9, f10 fxcpnmsub f3, A7, f1, f3 fxcpnmsub f11, A7, f9, f11 FXCXNSMA f3, A7, f1, f3 FXCXNSMA f11, A7, f9, f11 fxpmul f4, A8, f2 fxpmul f5, A8, f10 FXCXNPMA f2, A8, f2, f4 FXCXNPMA f10, A8, f10, f5 fxcpnmsub f3, A9, f2, f3 fxcpnmsub f11, A9, f10, f11 FXCXNSMA f3, A9, f2, f3 FXCXNSMA f11, A9, f10, f11 fxpmul f6, A10, f3 fxpmul f7, A10, f11 FXCXNPMA f3, A10, f3, f6 FXCXNPMA f11, A10, f11, f7 #endif #ifdef RN LFPDUX A1, BO, INC4 LFPDUX A2, BO2, INC4 add BO, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A1, f0 fxpmul f5, A1, f1 fxpmul f6, A1, f2 fxpmul f7, A1, f3 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 FXCXNPMA f2, A1, f2, f6 FXCXNPMA f3, A1, f3, f7 fxcpnmsub f8, A2, f0, f8 fxcpnmsub f9, A2, f1, f9 fxcpnmsub f10, A2, f2, f10 fxcpnmsub f11, A2, f3, f11 FXCXNSMA f8, A2, f0, f8 FXCXNSMA f9, A2, f1, f9 FXCXNSMA f10, A2, f2, f10 FXCXNSMA f11, A2, f3, f11 fxpmul f4, A3, f8 fxpmul f5, A3, f9 fxpmul f6, A3, f10 fxpmul f7, A3, f11 FXCXNPMA f8, A3, f8, f4 FXCXNPMA f9, A3, f9, f5 FXCXNPMA f10, A3, f10, f6 FXCXNPMA f11, A3, f11, f7 #endif #ifdef RT LFPDUX A1, BO, INC4 add BO2, BO2, INC4 LFPDUX A2, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A3, f8 fxpmul f5, A3, f9 fxpmul f6, A3, f10 fxpmul f7, A3, f11 FXCXNPMA f8, A3, f8, f4 FXCXNPMA f9, A3, f9, f5 FXCXNPMA f10, A3, f10, f6 FXCXNPMA f11, A3, f11, f7 fxcpnmsub f0, A2, f8, f0 fxcpnmsub f1, A2, f9, f1 fxcpnmsub f2, A2, f10, f2 fxcpnmsub f3, A2, f11, f3 FXCXNSMA f0, A2, f8, f0 FXCXNSMA f1, A2, f9, f1 FXCXNSMA f2, A2, f10, f2 FXCXNSMA f3, A2, f11, f3 fxpmul f4, A1, f0 fxpmul f5, A1, f1 fxpmul f6, A1, f2 fxpmul f7, A1, f3 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 FXCXNPMA f2, A1, f2, f6 FXCXNPMA f3, A1, f3, f7 #endif #ifdef LN subi CO1, CO1, 8 * SIZE subi CO2, CO2, 8 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC4 STFPDUX f8, BO2, INC4 STFPDUX f1, BO, INC4 STFPDUX f9, BO2, INC4 STFPDUX f2, BO, INC4 STFPDUX f10, BO2, INC4 STFPDUX f3, BO, INC4 STFPDUX f11, BO2, INC4 subi BO, BO, 16 * SIZE subi BO2, BO2, 16 * SIZE #else STFPDUX f0, AO, INC4 STFPDUX f1, AO2, INC4 STFPDUX f2, AO, INC4 STFPDUX f3, AO2, INC4 STFPDUX f8, AO, INC4 STFPDUX f9, AO2, INC4 STFPDUX f10, AO, INC4 STFPDUX f11, AO2, INC4 subi AO, AO, 16 * SIZE subi AO2, AO2, 16 * SIZE #endif STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC STFDUX f8, CO2, INC STFSDUX f8, CO2, INC STFDUX f9, CO2, INC STFSDUX f9, CO2, INC STFDUX f10, CO2, INC STFSDUX f10, CO2, INC STFDUX f11, CO2, INC STFSDUX 
f11, CO2, INC #ifdef LN subi CO1, CO1, 8 * SIZE subi CO2, CO2, 8 * SIZE #endif #ifdef RT slwi r0, K, 2 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L11 .align 4 .L49: #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) addi B, BO, 4 * SIZE #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif addic. J, J, -1 bgt+ .L10 .align 4 .L50: andi. J, N, 1 beq .L999 #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) addi AORIG, A, -2 * SIZE #else addi AO, A, -2 * SIZE #endif #ifndef RT add C, CO2, LDC #endif li r0, FZERO lfpsx f0, SP, r0 andi. I, M, 1 beq .L60 #if defined(LT) || defined(RN) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 3 mtspr CTR, r0 ble .L74 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, TEMP, 3 mtspr CTR, r0 ble .L74 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdz- .L73 .align 4 .L72: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 FXCPMADD f2, B2, A2, f2 FXCSMADD f3, B2, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 FXCPMADD f0, B3, A3, f0 FXCSMADD f1, B3, A3, f1 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 FXCPMADD f2, B4, A4, f2 FXCSMADD f3, B4, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 FXCPMADD f0, B5, A5, f0 FXCSMADD f1, B5, A5, f1 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 FXCPMADD f2, B6, A6, f2 FXCSMADD f3, B6, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 FXCPMADD f0, A9, A7, f0 FXCSMADD f1, A9, A7, f1 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 FXCPMADD f2, A10, A8, f2 FXCSMADD f3, A10, A8, f3 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdnz+ .L72 .align 4 .L73: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A2, f2 FXCSMADD f3, B2, A2, f3 FXCPMADD f0, B3, A3, f0 FXCSMADD f1, B3, A3, f1 FXCPMADD f2, B4, A4, f2 FXCSMADD f3, B4, A4, f3 FXCPMADD f0, B5, A5, f0 FXCSMADD f1, B5, A5, f1 FXCPMADD f2, B6, A6, f2 FXCSMADD f3, B6, A6, f3 FXCPMADD f0, A9, A7, f0 FXCSMADD f1, A9, A7, f1 FXCPMADD f2, A10, A8, f2 FXCSMADD f3, A10, A8, f3 .align 4 .L74: #if defined(LT) || defined(RN) andi. r0, KK, 7 mtspr CTR, r0 ble+ .L78 #else andi. 
r0, TEMP, 7 mtspr CTR, r0 ble+ .L78 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdz- .L77 .align 4 .L76: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L76 .align 4 .L77: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 .align 4 .L78: fpadd f0, f0, f2 fpadd f1, f1, f3 fpadd f0, f0, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 1 #endif slwi TEMP, r0, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDX f16, BO, INC2 #else LFPDX f16, AO, INC2 #endif fpsub f0, f16, f0 #ifdef LN LFPDX A1, AO, INC2 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 #endif #ifdef LT LFPDX A1, AO, INC2 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 #endif #ifdef RN LFPDX A1, BO, INC2 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 #endif #ifdef RT LFPDX A1, BO, INC2 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFPDX f0, BO, INC2 #else STFPDX f0, AO, INC2 #endif STFDUX f0, CO1, INC STFSDUX f0, CO1, INC #ifdef LN subi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L60: andi. I, M, 2 beq .L70 #if defined(LT) || defined(RN) fpmr f1, f0 addi BO, B, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 2 mtspr CTR, r0 ble .L64 #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK fpmr f1, f0 addi BO, BO, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L64 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L63 .align 4 .L62: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 FXCPMADD f0, B2, A3, f0 FXCSMADD f2, B2, A3, f2 LFPDUX A3, AO, INC2 FXCPMADD f1, B2, A4, f1 FXCSMADD f3, B2, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 FXCPMADD f0, B3, A5, f0 FXCSMADD f2, B3, A5, f2 LFPDUX A5, AO, INC2 FXCPMADD f1, B3, A6, f1 FXCSMADD f3, B3, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 FXCPMADD f0, B4, A7, f0 FXCSMADD f2, B4, A7, f2 LFPDUX A7, AO, INC2 FXCPMADD f1, B4, A8, f1 FXCSMADD f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L62 .align 4 .L63: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 FXCPMADD f0, B2, A3, f0 FXCSMADD f2, B2, A3, f2 FXCPMADD f1, B2, A4, f1 FXCSMADD f3, B2, A4, f3 FXCPMADD f0, B3, A5, f0 FXCSMADD f2, B3, A5, f2 FXCPMADD f1, B3, A6, f1 FXCSMADD f3, B3, A6, f3 FXCPMADD f0, B4, A7, f0 FXCSMADD f2, B4, A7, f2 FXCPMADD f1, B4, A8, f1 FXCSMADD f3, B4, A8, f3 .align 4 .L64: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L68 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L68 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdz- .L67 .align 4 .L66: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdnz+ .L66 .align 4 .L67: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 .align 4 .L68: fpadd f0, f0, f2 fpadd f1, f1, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + ZBASE_SHIFT slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 subi BO, BO, 4 * SIZE #else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 subi AO, AO, 4 * SIZE #endif fpsub f0, f16, f0 fpsub f1, f17, f1 #ifdef LN LFPDUX A1, AO, INC2 add AO, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 subi AO, AO, 8 * SIZE fxpmul f4, A3, f1 FXCXNPMA f1, A3, f1, f4 fxcpnmsub f0, A2, f1, f0 FXCXNSMA f0, A2, f1, f0 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 #endif #ifdef LT LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 add AO, AO, INC2 LFPDUX A3, AO, INC2 subi AO, AO, 8 * SIZE fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 fxcpnmsub f1, A2, f0, f1 FXCXNSMA f1, A2, f0, f1 fxpmul f6, A3, f1 FXCXNPMA f1, A3, f1, f6 #endif #ifdef RN LFPDX A1, BO, INC2 fxpmul f4, A1, f0 fxpmul f5, A1, f1 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 #endif #ifdef RT LFPDX A1, BO, INC2 fxpmul f4, A1, f0 fxpmul f5, A1, f1 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f1, BO, INC2 subi BO, BO, 4 * SIZE #else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 subi AO, AO, 4 * SIZE #endif STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC #ifdef LN subi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L70: srawi. I, M, 2 ble .L89 .align 4 .L51: #if defined(LT) || defined(RN) fpmr f4, f0 addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 fpmr f3, f0 fpmr f7, f0 srawi. r0, KK, 2 mtspr CTR, r0 ble .L54 #else #ifdef LN slwi r0, K, 2 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 2 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK fpmr f4, f0 addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 fpmr f3, f0 fpmr f7, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 ble .L54 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L53 .align 4 .L52: FXCPMADD f0, B1, A1, f0 LFPDUX B4, BO, INC2 FXCSMADD f4, B1, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A4, AO, INC2 FXCPMADD f0, B2, A5, f0 LFPDUX B1, BO, INC2 FXCSMADD f4, B2, A5, f4 LFPDUX A5, AO, INC2 FXCPMADD f1, B2, A6, f1 nop FXCSMADD f5, B2, A6, f5 LFPDUX A6, AO, INC2 FXCPMADD f2, B2, A7, f2 nop FXCSMADD f6, B2, A7, f6 LFPDUX A7, AO, INC2 FXCPMADD f3, B2, A8, f3 nop FXCSMADD f7, B2, A8, f7 LFPDUX A8, AO, INC2 FXCPMADD f0, B3, A1, f0 LFPDUX B2, BO, INC2 FXCSMADD f4, B3, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B3, A2, f1 nop FXCSMADD f5, B3, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B3, A3, f2 nop FXCSMADD f6, B3, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B3, A4, f3 nop FXCSMADD f7, B3, A4, f7 LFPDUX A4, AO, INC2 FXCPMADD f0, B4, A5, f0 LFPDUX B3, BO, INC2 FXCSMADD f4, B4, A5, f4 LFPDUX A5, AO, INC2 FXCPMADD f1, B4, A6, f1 nop FXCSMADD f5, B4, A6, f5 LFPDUX A6, AO, INC2 FXCPMADD f2, B4, A7, f2 nop FXCSMADD f6, B4, A7, f6 LFPDUX A7, AO, INC2 FXCPMADD f3, B4, A8, f3 nop FXCSMADD f7, B4, A8, f7 LFPDUX A8, AO, INC2 bdnz+ .L52 .align 4 .L53: FXCPMADD f0, B1, A1, f0 LFPDUX B4, BO, INC2 FXCSMADD f4, B1, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A4, AO, INC2 FXCPMADD f0, B2, A5, f0 nop FXCSMADD f4, B2, A5, f4 LFPDUX A5, AO, INC2 FXCPMADD f1, B2, A6, f1 nop FXCSMADD f5, B2, A6, f5 LFPDUX A6, AO, INC2 FXCPMADD f2, B2, A7, f2 nop FXCSMADD f6, B2, A7, f6 LFPDUX A7, AO, INC2 FXCPMADD f3, B2, A8, f3 nop FXCSMADD f7, B2, A8, f7 LFPDUX A8, AO, INC2 FXCPMADD f0, B3, A1, f0 FXCSMADD f4, B3, A1, f4 FXCPMADD f1, B3, A2, f1 FXCSMADD f5, B3, A2, f5 FXCPMADD f2, B3, A3, f2 FXCSMADD f6, B3, A3, f6 FXCPMADD f3, B3, A4, f3 FXCSMADD f7, B3, A4, f7 FXCPMADD f0, B4, A5, f0 FXCSMADD f4, B4, A5, f4 FXCPMADD f1, B4, A6, f1 FXCSMADD f5, B4, A6, f5 FXCPMADD f2, B4, A7, f2 FXCSMADD f6, B4, A7, f6 FXCPMADD f3, B4, A8, f3 FXCSMADD f7, B4, A8, f7 .align 4 .L54: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L58 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L58 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 bdz- .L57 .align 4 .L56: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B1, A3, f2 FXCSMADD f6, B1, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B1, A4, f3 FXCSMADD f7, B1, A4, f7 LFPDUX A4, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L56 .align 4 .L57: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f2, B1, A3, f2 FXCSMADD f6, B1, A3, f6 FXCPMADD f3, B1, A4, f3 FXCSMADD f7, B1, A4, f7 .align 4 .L58: fpadd f0, f0, f4 fpadd f1, f1, f5 fpadd f2, f2, f6 fpadd f3, f3, f7 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 1 #endif slwi TEMP, r0, 2 + ZBASE_SHIFT slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 LFPDUX f18, BO, INC2 LFPDUX f19, BO, INC2 subi BO, BO, 8 * SIZE #else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 LFPDUX f18, AO, INC2 LFPDUX f19, AO, INC2 subi AO, AO, 8 * SIZE #endif fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f2, f18, f2 fpsub f3, f19, f3 #ifdef LN LFPDUX A1, AO, INC2 add AO, AO, INC2 add AO, AO, INC2 add AO, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 add AO, AO, INC2 add AO, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 add AO, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 LFPDUX A9, AO, INC2 LFPDUX A10, AO, INC2 subi AO, AO, 32 * SIZE fxpmul f4, A10, f3 FXCXNPMA f3, A10, f3, f4 fxcpnmsub f2, A9, f3, f2 FXCXNSMA f2, A9, f3, f2 fxcpnmsub f1, A8, f3, f1 FXCXNSMA f1, A8, f3, f1 fxcpnmsub f0, A7, f3, f0 FXCXNSMA f0, A7, f3, f0 fxpmul f4, A6, f2 FXCXNPMA f2, A6, f2, f4 fxcpnmsub f1, A5, f2, f1 FXCXNSMA f1, A5, f2, f1 fxcpnmsub f0, A4, f2, f0 FXCXNSMA f0, A4, f2, f0 fxpmul f4, A3, f1 FXCXNPMA f1, A3, f1, f4 fxcpnmsub f0, A2, f1, f0 FXCXNSMA f0, A2, f1, f0 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 #endif #ifdef LT LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 add AO, AO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 add AO, AO, INC2 add AO, AO, INC2 LFPDUX A8, AO, INC2 LFPDUX A9, AO, INC2 add AO, AO, INC2 add AO, AO, INC2 add AO, AO, INC2 LFPDUX A10, AO, INC2 subi AO, AO, 32 * SIZE fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 fxcpnmsub f1, A2, f0, f1 FXCXNSMA f1, A2, f0, f1 fxcpnmsub f2, A3, f0, f2 FXCXNSMA f2, A3, f0, f2 fxcpnmsub f3, A4, f0, f3 FXCXNSMA f3, A4, f0, f3 fxpmul f6, A5, f1 FXCXNPMA f1, A5, f1, f6 fxcpnmsub f2, A6, f1, f2 FXCXNSMA f2, A6, f1, f2 fxcpnmsub f3, A7, f1, f3 FXCXNSMA f3, A7, f1, f3 fxpmul f4, A8, f2 FXCXNPMA f2, A8, f2, f4 fxcpnmsub f3, A9, f2, f3 FXCXNSMA f3, A9, f2, f3 fxpmul f6, A10, f3 FXCXNPMA f3, A10, f3, f6 #endif #ifdef RN LFPDX A1, BO, INC2 fxpmul f4, A1, f0 fxpmul f5, A1, f1 fxpmul f6, A1, f2 fxpmul f7, A1, f3 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 FXCXNPMA f2, A1, f2, f6 FXCXNPMA f3, A1, f3, f7 #endif #ifdef RT LFPDX A1, BO, INC2 fxpmul f4, A1, f0 fxpmul f5, A1, f1 fxpmul f6, A1, f2 fxpmul f7, A1, f3 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 FXCXNPMA f2, A1, f2, f6 FXCXNPMA f3, A1, f3, f7 #endif #ifdef LN subi CO1, CO1, 8 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f1, BO, INC2 STFPDUX f2, BO, INC2 STFPDUX f3, BO, INC2 subi BO, BO, 8 * SIZE #else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 STFPDUX 
f2, AO, INC2 STFPDUX f3, AO, INC2 subi AO, AO, 8 * SIZE #endif STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC #ifdef LN subi CO1, CO1, 8 * SIZE #endif #ifdef RT slwi r0, K, 2 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L51 .align 4 .L89: #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) addi B, BO, 2 * SIZE #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 .L999: addi SP, SP, 20 lwzu r14, 4(SP) lwzu r15, 4(SP) lwzu r16, 4(SP) lwzu r17, 4(SP) lwzu r18, 4(SP) lwzu r19, 4(SP) lwzu r20, 4(SP) lwzu r21, 4(SP) lwzu r22, 4(SP) lwzu r23, 4(SP) lwzu r24, 4(SP) lwzu r25, 4(SP) lwzu r26, 4(SP) lwzu r27, 4(SP) lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f31, SP, r0 lfpdux f30, SP, r0 lfpdux f29, SP, r0 lfpdux f28, SP, r0 lfpdux f27, SP, r0 lfpdux f26, SP, r0 lfpdux f25, SP, r0 lfpdux f24, SP, r0 lfpdux f23, SP, r0 lfpdux f22, SP, r0 lfpdux f21, SP, r0 lfpdux f20, SP, r0 lfpdux f19, SP, r0 lfpdux f18, SP, r0 lfpdux f17, SP, r0 lfpdux f16, SP, r0 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr .align 4 EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/ztrsm_kernel_hummer_LT.S000066400000000000000000001433151313527062700224060ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #undef ZERO #define ALPHA 0 #define FZERO 16 #define M r3 #define N r4 #define K r5 #ifdef linux #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #endif #define TEMP r11 #define AORIG r12 #define KK r14 #define INCM1 r15 #define INCM3 r16 #define INCM5 r17 #define INCM7 r18 #define INC2 r19 #define INC r20 #define INC4 r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define AO2 r26 #define BO2 r27 #define CO1 r28 #define CO2 r29 #define ZERO r31 #ifndef NEEDPARAM #define A1 f16 #define A2 f17 #define A3 f18 #define A4 f19 #define A5 f20 #define A6 f21 #define A7 f22 #define A8 f23 #define A9 f24 #define A10 f25 #define B1 f26 #define B2 f27 #define B3 f28 #define B4 f29 #define B5 f30 #define B6 f31 #define AP B6 #ifndef CONJ #define FXCPMADD fxcpmadd #define FXCSMADD fxcxnpma #else #if defined(LN) || defined(LT) #define FXCPMADD fxcpnsma #define FXCSMADD fxcxma #else #define FXCPMADD fxcpmadd #define FXCSMADD fxcxnsma #endif #endif #ifndef CONJ #define FXCXNPMA fxcxnpma #define FXCXNSMA fxcxnsma #else #define FXCXNPMA fxcxnsma #define FXCXNSMA fxcxnpma #endif PROLOGUE PROFCODE li r0, -16 stfpdux f14, SP, r0 stfpdux f15, SP, r0 stfpdux f16, SP, r0 stfpdux f17, SP, r0 stfpdux f18, SP, r0 stfpdux f19, SP, r0 stfpdux f20, SP, r0 stfpdux f21, SP, r0 stfpdux f22, SP, r0 stfpdux f23, SP, r0 stfpdux f24, SP, r0 stfpdux f25, SP, r0 stfpdux f26, SP, r0 stfpdux f27, SP, r0 stfpdux f28, SP, r0 stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) stwu r28, -4(SP) stwu r27, -4(SP) stwu r26, -4(SP) stwu r25, -4(SP) stwu r24, -4(SP) stwu r23, -4(SP) stwu r22, -4(SP) stwu r21, -4(SP) stwu r20, -4(SP) stwu r19, -4(SP) stwu r18, -4(SP) stwu r17, -4(SP) stwu r16, -4(SP) stwu r15, -4(SP) stwu r14, -4(SP) li r0, 0 stwu r0, -4(SP) stwu r0, -4(SP) stfdu f2, -8(SP) stfdu f1, -8(SP) slwi LDC, LDC, ZBASE_SHIFT cmpwi cr0, M, 0 ble .L999 cmpwi cr0, N, 0 ble .L999 cmpwi cr0, K, 0 ble .L999 li INC, 1 * SIZE li INC2, 2 * SIZE li INC4, 4 * SIZE li INCM1, -1 * SIZE li INCM3, -3 * SIZE li INCM5, -5 * SIZE li INCM7, -7 * SIZE addi C, C, - 1 * SIZE #ifdef LN mullw r0, M, K slwi r0, r0, ZBASE_SHIFT add A, A, r0 slwi r0, M, ZBASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, ZBASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif srawi. J, N, 1 ble .L50 .align 4 .L10: #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) addi AORIG, A, -4 * SIZE #else addi AO, A, -4 * SIZE #endif #ifndef RT add C, CO2, LDC #endif li r0, FZERO lfpsx f0, SP, r0 srawi. I, M, 2 ble .L20 .align 4 .L11: #if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 fpmr f5, f0 fpmr f9, f0 fpmr f13, f0 fpmr f2, f0 fpmr f6, f0 fpmr f10, f0 fpmr f14, f0 fpmr f3, f0 fpmr f7, f0 fpmr f11, f0 fpmr f15, f0 srawi. 
r0, KK, 2 fpmr f1, f0 mtspr CTR, r0 ble .L14 #else #ifdef LN slwi r0, K, 2 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 2 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK fpmr f5, f0 fpmr f9, f0 fpmr f13, f0 fpmr f2, f0 fpmr f6, f0 fpmr f10, f0 fpmr f14, f0 fpmr f3, f0 fpmr f7, f0 fpmr f11, f0 fpmr f15, f0 addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 srawi. r0, TEMP, 2 fpmr f1, f0 mtspr CTR, r0 ble .L14 #endif LFPDUX A1, AO, INC4 fpmr f5, f0 LFPDUX A3, AO, INC4 fpmr f9, f0 LFPDUX B1, BO, INC4 fpmr f13, f0 LFPDUX A5, AO, INC4 fpmr f2, f0 LFPDUX A6, AO, INC4 fpmr f6, f0 LFPDUX B3, BO, INC4 fpmr f10, f0 LFPDUX A7, AO, INC4 fpmr f14, f0 LFPDUX A8, AO, INC4 fpmr f3, f0 LFPDUX B5, BO, INC4 fpmr f7, f0 LFPDUX A9, AO, INC4 fpmr f11, f0 LFPDUX A2, AO2, INC4 fpmr f15, f0 LFPDUX B2, BO2, INC4 bdz- .L13 .align 4 .L12: ## 1 ## FXCPMADD f0, B1, A1, f0 nop FXCSMADD f4, B1, A1, f4 nop FXCPMADD f8, B2, A1, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A1, f12 LFPDUX B6, BO, INC4 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 LFPDUX A10, AO, INC4 FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 nop FXCPMADD f10, B2, A3, f10 nop FXCSMADD f14, B2, A3, f14 nop FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 LFPDUX A1, AO, INC4 FXCSMADD f15, B2, A4, f15 nop ## 2 ## FXCPMADD f0, B3, A5, f0 nop FXCSMADD f4, B3, A5, f4 nop FXCPMADD f8, B4, A5, f8 LFPDUX B2, BO2, INC4 FXCSMADD f12, B4, A5, f12 LFPDUX B1, BO, INC4 FXCPMADD f1, B3, A2, f1 nop FXCSMADD f5, B3, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 LFPDUX A3, AO, INC4 FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B3, A6, f2 nop FXCSMADD f6, B3, A6, f6 nop FXCPMADD f10, B4, A6, f10 nop FXCSMADD f14, B4, A6, f14 nop FXCPMADD f3, B3, A4, f3 nop FXCSMADD f7, B3, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B4, A4, f11 LFPDUX A5, AO, INC4 FXCSMADD f15, B4, A4, f15 nop ## 3 ## FXCPMADD f0, B5, A7, f0 nop FXCSMADD f4, B5, A7, f4 nop FXCPMADD f8, B2, A7, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A7, f12 LFPDUX B3, BO, INC4 FXCPMADD f1, B5, A2, f1 nop FXCSMADD f5, B5, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 LFPDUX A6, AO, INC4 FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B5, A8, f2 nop FXCSMADD f6, B5, A8, f6 nop FXCPMADD f10, B2, A8, f10 nop FXCSMADD f14, B2, A8, f14 nop FXCPMADD f3, B5, A4, f3 nop FXCSMADD f7, B5, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 LFPDUX A7, AO, INC4 FXCSMADD f15, B2, A4, f15 nop ## 4 ## FXCPMADD f0, B6, A9, f0 nop FXCSMADD f4, B6, A9, f4 nop FXCPMADD f8, B4, A9, f8 LFPDUX B2, BO2, INC4 FXCSMADD f12, B4, A9, f12 LFPDUX B5, BO, INC4 FXCPMADD f1, B6, A2, f1 nop FXCSMADD f5, B6, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 LFPDUX A8, AO, INC4 FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B6, A10, f2 nop FXCSMADD f6, B6, A10, f6 nop FXCPMADD f10, B4, A10, f10 nop FXCSMADD f14, B4, A10, f14 nop FXCPMADD f3, B6, A4, f3 LFPDUX A2, AO2, INC4 FXCSMADD f7, B6, A4, f7 LFPDUX A9, AO, INC4 FXCPMADD f11, B4, A4, f11 nop FXCSMADD f15, B4, A4, f15 bdnz+ .L12 .align 4 .L13: ## 1 ## FXCPMADD f0, B1, A1, f0 nop FXCSMADD f4, B1, A1, f4 nop FXCPMADD f8, B2, A1, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A1, f12 LFPDUX B6, BO, INC4 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 LFPDUX A10, AO, INC4 FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B1, 
A3, f2 nop FXCSMADD f6, B1, A3, f6 nop FXCPMADD f10, B2, A3, f10 nop FXCSMADD f14, B2, A3, f14 nop FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 nop FXCSMADD f15, B2, A4, f15 nop ## 2 ## FXCPMADD f0, B3, A5, f0 nop FXCSMADD f4, B3, A5, f4 nop FXCPMADD f8, B4, A5, f8 LFPDUX B2, BO2, INC4 FXCSMADD f12, B4, A5, f12 nop FXCPMADD f1, B3, A2, f1 nop FXCSMADD f5, B3, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 nop FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B3, A6, f2 nop FXCSMADD f6, B3, A6, f6 nop FXCPMADD f10, B4, A6, f10 nop FXCSMADD f14, B4, A6, f14 nop FXCPMADD f3, B3, A4, f3 nop FXCSMADD f7, B3, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B4, A4, f11 nop FXCSMADD f15, B4, A4, f15 nop ## 3 ## FXCPMADD f0, B5, A7, f0 nop FXCSMADD f4, B5, A7, f4 nop FXCPMADD f8, B2, A7, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A7, f12 nop FXCPMADD f1, B5, A2, f1 nop FXCSMADD f5, B5, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 nop FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B5, A8, f2 nop FXCSMADD f6, B5, A8, f6 nop FXCPMADD f10, B2, A8, f10 nop FXCSMADD f14, B2, A8, f14 nop FXCPMADD f3, B5, A4, f3 nop FXCSMADD f7, B5, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 nop FXCSMADD f15, B2, A4, f15 nop ## 4 ## FXCPMADD f0, B6, A9, f0 nop FXCSMADD f4, B6, A9, f4 nop FXCPMADD f8, B4, A9, f8 nop FXCSMADD f12, B4, A9, f12 nop FXCPMADD f1, B6, A2, f1 nop FXCSMADD f5, B6, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 nop FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B6, A10, f2 nop FXCSMADD f6, B6, A10, f6 nop FXCPMADD f10, B4, A10, f10 nop FXCSMADD f14, B4, A10, f14 nop FXCPMADD f3, B6, A4, f3 nop FXCSMADD f7, B6, A4, f7 nop FXCPMADD f11, B4, A4, f11 nop FXCSMADD f15, B4, A4, f15 nop .align 4 .L14: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L18 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L18 #endif .L15: LFPDUX A2, AO, INC4 LFPDUX A4, AO2, INC4 LFPDUX A10, BO, INC4 LFPDUX B4, BO2, INC4 bdz- .L17 .align 4 .L16: FXCPMADD f0, A10, A2, f0 FXCSMADD f4, A10, A2, f4 FXCPMADD f8, B4, A2, f8 FXCSMADD f12, B4, A2, f12 LFPDUX A2, AO, INC4 FXCPMADD f1, A10, A4, f1 FXCSMADD f5, A10, A4, f5 FXCPMADD f9, B4, A4, f9 FXCSMADD f13, B4, A4, f13 LFPDUX A4, AO2, INC4 FXCPMADD f2, A10, A2, f2 FXCSMADD f6, A10, A2, f6 FXCPMADD f10, B4, A2, f10 FXCSMADD f14, B4, A2, f14 LFPDUX A2, AO, INC4 FXCPMADD f3, A10, A4, f3 FXCSMADD f7, A10, A4, f7 LFPDUX A10, BO, INC4 FXCPMADD f11, B4, A4, f11 FXCSMADD f15, B4, A4, f15 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 bdnz+ .L16 .align 4 .L17: FXCPMADD f0, A10, A2, f0 FXCSMADD f4, A10, A2, f4 FXCPMADD f8, B4, A2, f8 FXCSMADD f12, B4, A2, f12 LFPDUX A2, AO, INC4 FXCPMADD f1, A10, A4, f1 FXCSMADD f5, A10, A4, f5 FXCPMADD f9, B4, A4, f9 FXCSMADD f13, B4, A4, f13 LFPDUX A4, AO2, INC4 FXCPMADD f2, A10, A2, f2 FXCSMADD f6, A10, A2, f6 FXCPMADD f10, B4, A2, f10 FXCSMADD f14, B4, A2, f14 FXCPMADD f3, A10, A4, f3 FXCSMADD f7, A10, A4, f7 FXCPMADD f11, B4, A4, f11 FXCSMADD f15, B4, A4, f15 .align 4 .L18: fpadd f0, f0, f4 fpadd f8, f8, f12 fpadd f1, f1, f5 fpadd f9, f9, f13 fpadd f2, f2, f6 fpadd f10, f10, f14 fpadd f3, f3, f7 fpadd f11, f11, f15 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 2 #endif slwi TEMP, r0, 2 + ZBASE_SHIFT slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE addi BO, BO, - 4 * SIZE addi BO2, BO, 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDUX f16, BO, INC4 LFPDUX f20, BO2, INC4 LFPDUX f17, BO, INC4 LFPDUX f21, BO2, INC4 LFPDUX f18, BO, INC4 LFPDUX f22, BO2, INC4 LFPDUX f19, BO, INC4 LFPDUX f23, BO2, INC4 subi BO, BO, 16 * SIZE subi BO2, BO2, 16 * SIZE #else LFPDUX f16, AO, INC4 LFPDUX f17, AO2, INC4 LFPDUX f18, AO, INC4 LFPDUX f19, AO2, INC4 LFPDUX f20, AO, INC4 LFPDUX f21, AO2, INC4 LFPDUX f22, AO, INC4 LFPDUX f23, AO2, INC4 subi AO, AO, 16 * SIZE subi AO2, AO2, 16 * SIZE #endif fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f2, f18, f2 fpsub f3, f19, f3 fpsub f8, f20, f8 fpsub f9, f21, f9 fpsub f10, f22, f10 fpsub f11, f23, f11 #ifdef LN LFPDUX A1, AO, INC4 add AO2, AO2, INC4 add AO, AO, INC4 add AO2, AO2, INC4 LFPDUX A2, AO, INC4 LFPDUX A3, AO2, INC4 add AO, AO, INC4 add AO2, AO2, INC4 LFPDUX A4, AO, INC4 LFPDUX A5, AO2, INC4 LFPDUX A6, AO, INC4 add AO2, AO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A8, AO2, INC4 LFPDUX A9, AO, INC4 LFPDUX A10, AO2, INC4 subi AO, AO, 32 * SIZE subi AO2, AO2, 32 * SIZE fxpmul f4, A10, f3 fxpmul f5, A10, f11 FXCXNPMA f3, A10, f3, f4 FXCXNPMA f11, A10, f11, f5 fxcpnmsub f2, A9, f3, f2 fxcpnmsub f10, A9, f11, f10 FXCXNSMA f2, A9, f3, f2 FXCXNSMA f10, A9, f11, f10 fxcpnmsub f1, A8, f3, f1 fxcpnmsub f9, A8, f11, f9 FXCXNSMA f1, A8, f3, f1 FXCXNSMA f9, A8, f11, f9 fxcpnmsub f0, A7, f3, f0 fxcpnmsub f8, A7, f11, f8 FXCXNSMA f0, A7, f3, f0 FXCXNSMA f8, A7, f11, f8 fxpmul f4, A6, f2 fxpmul f5, A6, f10 FXCXNPMA f2, A6, f2, f4 FXCXNPMA f10, A6, f10, f5 fxcpnmsub f1, A5, f2, f1 fxcpnmsub f9, A5, f10, f9 FXCXNSMA f1, A5, f2, f1 FXCXNSMA f9, A5, f10, f9 fxcpnmsub f0, A4, f2, f0 fxcpnmsub f8, A4, f10, f8 FXCXNSMA f0, A4, f2, f0 FXCXNSMA f8, A4, f10, f8 fxpmul f4, A3, f1 fxpmul f5, A3, f9 FXCXNPMA f1, A3, f1, f4 FXCXNPMA f9, A3, f9, f5 fxcpnmsub f0, A2, f1, f0 fxcpnmsub f8, A2, f9, f8 FXCXNSMA f0, A2, f1, f0 FXCXNSMA f8, A2, f9, f8 fxpmul f4, A1, f0 fxpmul f5, A1, f8 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f8, A1, f8, f5 #endif #ifdef LT LFPDUX A1, AO, 
INC4 LFPDUX A2, AO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A4, AO2, INC4 add AO, AO, INC4 LFPDUX A5, AO2, INC4 LFPDUX A6, AO, INC4 LFPDUX A7, AO2, INC4 add AO, AO, INC4 add AO2, AO2, INC4 LFPDUX A8, AO, INC4 LFPDUX A9, AO2, INC4 add AO, AO, INC4 add AO2, AO2, INC4 add AO, AO, INC4 LFPDUX A10, AO2, INC4 subi AO, AO, 32 * SIZE subi AO2, AO2, 32 * SIZE fxpmul f4, A1, f0 fxpmul f5, A1, f8 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f8, A1, f8, f5 fxcpnmsub f1, A2, f0, f1 fxcpnmsub f9, A2, f8, f9 FXCXNSMA f1, A2, f0, f1 FXCXNSMA f9, A2, f8, f9 fxcpnmsub f2, A3, f0, f2 fxcpnmsub f10, A3, f8, f10 FXCXNSMA f2, A3, f0, f2 FXCXNSMA f10, A3, f8, f10 fxcpnmsub f3, A4, f0, f3 fxcpnmsub f11, A4, f8, f11 FXCXNSMA f3, A4, f0, f3 FXCXNSMA f11, A4, f8, f11 fxpmul f6, A5, f1 fxpmul f7, A5, f9 FXCXNPMA f1, A5, f1, f6 FXCXNPMA f9, A5, f9, f7 fxcpnmsub f2, A6, f1, f2 fxcpnmsub f10, A6, f9, f10 FXCXNSMA f2, A6, f1, f2 FXCXNSMA f10, A6, f9, f10 fxcpnmsub f3, A7, f1, f3 fxcpnmsub f11, A7, f9, f11 FXCXNSMA f3, A7, f1, f3 FXCXNSMA f11, A7, f9, f11 fxpmul f4, A8, f2 fxpmul f5, A8, f10 FXCXNPMA f2, A8, f2, f4 FXCXNPMA f10, A8, f10, f5 fxcpnmsub f3, A9, f2, f3 fxcpnmsub f11, A9, f10, f11 FXCXNSMA f3, A9, f2, f3 FXCXNSMA f11, A9, f10, f11 fxpmul f6, A10, f3 fxpmul f7, A10, f11 FXCXNPMA f3, A10, f3, f6 FXCXNPMA f11, A10, f11, f7 #endif #ifdef RN LFPDUX A1, BO, INC4 LFPDUX A2, BO2, INC4 add BO, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A1, f0 fxpmul f5, A1, f1 fxpmul f6, A1, f2 fxpmul f7, A1, f3 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 FXCXNPMA f2, A1, f2, f6 FXCXNPMA f3, A1, f3, f7 fxcpnmsub f8, A2, f0, f8 fxcpnmsub f9, A2, f1, f9 fxcpnmsub f10, A2, f2, f10 fxcpnmsub f11, A2, f3, f11 FXCXNSMA f8, A2, f0, f8 FXCXNSMA f9, A2, f1, f9 FXCXNSMA f10, A2, f2, f10 FXCXNSMA f11, A2, f3, f11 fxpmul f4, A3, f8 fxpmul f5, A3, f9 fxpmul f6, A3, f10 fxpmul f7, A3, f11 FXCXNPMA f8, A3, f8, f4 FXCXNPMA f9, A3, f9, f5 FXCXNPMA f10, A3, f10, f6 FXCXNPMA f11, A3, f11, f7 #endif #ifdef RT LFPDUX A1, BO, INC4 add BO2, BO2, INC4 LFPDUX A2, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A3, f8 fxpmul f5, A3, f9 fxpmul f6, A3, f10 fxpmul f7, A3, f11 FXCXNPMA f8, A3, f8, f4 FXCXNPMA f9, A3, f9, f5 FXCXNPMA f10, A3, f10, f6 FXCXNPMA f11, A3, f11, f7 fxcpnmsub f0, A2, f8, f0 fxcpnmsub f1, A2, f9, f1 fxcpnmsub f2, A2, f10, f2 fxcpnmsub f3, A2, f11, f3 FXCXNSMA f0, A2, f8, f0 FXCXNSMA f1, A2, f9, f1 FXCXNSMA f2, A2, f10, f2 FXCXNSMA f3, A2, f11, f3 fxpmul f4, A1, f0 fxpmul f5, A1, f1 fxpmul f6, A1, f2 fxpmul f7, A1, f3 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 FXCXNPMA f2, A1, f2, f6 FXCXNPMA f3, A1, f3, f7 #endif #ifdef LN subi CO1, CO1, 8 * SIZE subi CO2, CO2, 8 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC4 STFPDUX f8, BO2, INC4 STFPDUX f1, BO, INC4 STFPDUX f9, BO2, INC4 STFPDUX f2, BO, INC4 STFPDUX f10, BO2, INC4 STFPDUX f3, BO, INC4 STFPDUX f11, BO2, INC4 subi BO, BO, 16 * SIZE subi BO2, BO2, 16 * SIZE #else STFPDUX f0, AO, INC4 STFPDUX f1, AO2, INC4 STFPDUX f2, AO, INC4 STFPDUX f3, AO2, INC4 STFPDUX f8, AO, INC4 STFPDUX f9, AO2, INC4 STFPDUX f10, AO, INC4 STFPDUX f11, AO2, INC4 subi AO, AO, 16 * SIZE subi AO2, AO2, 16 * SIZE #endif STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC STFDUX f8, CO2, INC STFSDUX f8, CO2, INC STFDUX f9, CO2, INC STFSDUX f9, CO2, INC STFDUX f10, CO2, INC STFSDUX f10, CO2, INC STFDUX f11, CO2, INC STFSDUX 
f11, CO2, INC #ifdef LN subi CO1, CO1, 8 * SIZE subi CO2, CO2, 8 * SIZE #endif #ifdef RT slwi r0, K, 2 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L11 .align 4 .L20: andi. I, M, 2 beq .L30 #if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 srawi. r0, KK, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, r0 fpmr f13, f0 ble .L24 #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 sub TEMP, K, KK addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 fpmr f13, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L24 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX B3, BO, INC4 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 LFPDUX A5, AO, INC4 LFPDUX B5, BO, INC4 LFPDUX A6, AO2, INC4 LFPDUX B6, BO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A9, BO, INC4 LFPDUX A10, BO2, INC4 bdz- .L23 .align 4 .L22: FXCPMADD f0, B1, A1, f0 nop FXCSMADD f4, B1, A1, f4 LFPDUX A8, AO2, INC4 FXCPMADD f8, B2, A1, f8 nop FXCSMADD f12, B2, A1, f12 LFPDUX A1, AO, INC4 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX B1, BO, INC4 FXCPMADD f9, B2, A2, f9 nop FXCSMADD f13, B2, A2, f13 LFPDUX B2, BO2, INC4 FXCPMADD f0, B3, A3, f0 nop FXCSMADD f4, B3, A3, f4 LFPDUX A2, AO2, INC4 FXCPMADD f8, B4, A3, f8 nop FXCSMADD f12, B4, A3, f12 LFPDUX A3, AO, INC4 FXCPMADD f1, B3, A4, f1 nop FXCSMADD f5, B3, A4, f5 LFPDUX B3, BO, INC4 FXCPMADD f9, B4, A4, f9 nop FXCSMADD f13, B4, A4, f13 LFPDUX B4, BO2, INC4 FXCPMADD f0, B5, A5, f0 nop FXCSMADD f4, B5, A5, f4 LFPDUX A4, AO2, INC4 FXCPMADD f8, B6, A5, f8 nop FXCSMADD f12, B6, A5, f12 LFPDUX A5, AO, INC4 FXCPMADD f1, B5, A6, f1 nop FXCSMADD f5, B5, A6, f5 LFPDUX B5, BO, INC4 FXCPMADD f9, B6, A6, f9 nop FXCSMADD f13, B6, A6, f13 LFPDUX B6, BO2, INC4 FXCPMADD f0, A9, A7, f0 nop FXCSMADD f4, A9, A7, f4 LFPDUX A6, AO2, INC4 FXCPMADD f8, A10, A7, f8 nop FXCSMADD f12, A10, A7, f12 LFPDUX A7, AO, INC4 FXCPMADD f1, A9, A8, f1 nop FXCSMADD f5, A9, A8, f5 LFPDUX A9, BO, INC4 FXCPMADD f9, A10, A8, f9 nop FXCSMADD f13, A10, A8, f13 LFPDUX A10, BO2, INC4 bdnz+ .L22 .align 4 .L23: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 LFPDUX A8, AO2, INC4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 FXCPMADD f0, B3, A3, f0 FXCSMADD f4, B3, A3, f4 FXCPMADD f8, B4, A3, f8 FXCSMADD f12, B4, A3, f12 FXCPMADD f1, B3, A4, f1 FXCSMADD f5, B3, A4, f5 FXCPMADD f9, B4, A4, f9 FXCSMADD f13, B4, A4, f13 FXCPMADD f0, B5, A5, f0 FXCSMADD f4, B5, A5, f4 FXCPMADD f8, B6, A5, f8 FXCSMADD f12, B6, A5, f12 FXCPMADD f1, B5, A6, f1 FXCSMADD f5, B5, A6, f5 FXCPMADD f9, B6, A6, f9 FXCSMADD f13, B6, A6, f13 FXCPMADD f0, A9, A7, f0 FXCSMADD f4, A9, A7, f4 FXCPMADD f8, A10, A7, f8 FXCSMADD f12, A10, A7, f12 FXCPMADD f1, A9, A8, f1 FXCSMADD f5, A9, A8, f5 FXCPMADD f9, A10, A8, f9 FXCSMADD f13, A10, A8, f13 .align 4 .L24: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L28 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L28 #endif LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 bdz- .L27 .align 4 .L26: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 LFPDUX A1, AO, INC4 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 LFPDUX B1, BO, INC4 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 bdnz+ .L26 .align 4 .L27: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 .align 4 .L28: fpadd f0, f0, f4 fpadd f8, f8, f12 fpadd f1, f1, f5 fpadd f9, f9, f13 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 2 #endif slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 addi AO2, AO, 2 * SIZE addi BO, BO, - 4 * SIZE addi BO2, BO, 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDUX f16, BO, INC4 LFPDUX f18, BO2, INC4 LFPDUX f17, BO, INC4 LFPDUX f19, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE #else LFPDUX f16, AO, INC4 LFPDUX f17, AO2, INC4 LFPDUX f18, AO, INC4 LFPDUX f19, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE #endif fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f8, f18, f8 fpsub f9, f19, f9 #ifdef LN LFPDUX A1, AO, INC4 add AO2, AO2, INC4 LFPDUX A2, AO, INC4 LFPDUX A3, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE fxpmul f4, A3, f1 fxpmul f5, A3, f9 FXCXNPMA f1, A3, f1, f4 FXCXNPMA f9, A3, f9, f5 fxcpnmsub f0, A2, f1, f0 fxcpnmsub f8, A2, f9, f8 FXCXNSMA f0, A2, f1, f0 FXCXNSMA f8, A2, f9, f8 fxpmul f4, A1, f0 fxpmul f5, A1, f8 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f8, A1, f8, f5 #endif #ifdef LT LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 add AO, AO, INC4 LFPDUX A3, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE fxpmul f4, A1, f0 fxpmul f5, A1, f8 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f8, A1, f8, f5 fxcpnmsub f1, A2, f0, f1 fxcpnmsub f9, A2, f8, f9 FXCXNSMA f1, A2, f0, f1 FXCXNSMA f9, A2, f8, f9 fxpmul f6, A3, f1 fxpmul f7, A3, f9 FXCXNPMA f1, A3, f1, f6 FXCXNPMA f9, A3, f9, f7 #endif #ifdef RN LFPDUX A1, BO, INC4 LFPDUX A2, BO2, INC4 add BO, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A1, f0 fxpmul f5, A1, f1 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 fxcpnmsub f8, A2, f0, f8 fxcpnmsub f9, A2, f1, f9 FXCXNSMA f8, A2, f0, f8 FXCXNSMA f9, A2, f1, f9 fxpmul f4, A3, f8 fxpmul f5, A3, f9 FXCXNPMA f8, A3, f8, f4 FXCXNPMA f9, A3, f9, f5 #endif #ifdef RT LFPDUX A1, BO, INC4 add BO2, BO2, INC4 LFPDUX A2, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A3, f8 fxpmul f5, A3, f9 FXCXNPMA f8, A3, f8, f4 FXCXNPMA f9, A3, f9, f5 fxcpnmsub f0, A2, f8, f0 fxcpnmsub f1, A2, f9, f1 FXCXNSMA f0, A2, f8, f0 FXCXNSMA f1, A2, f9, f1 fxpmul f4, A1, f0 fxpmul f5, A1, f1 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC4 STFPDUX f8, BO2, INC4 STFPDUX f1, BO, INC4 STFPDUX f9, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE #else STFPDUX f0, AO, INC4 STFPDUX f1, AO2, INC4 STFPDUX f8, AO, INC4 STFPDUX f9, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE #endif STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f8, CO2, INC STFSDUX f8, CO2, INC STFDUX f9, CO2, INC STFSDUX f9, CO2, INC #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, 
CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, r0 #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L30: andi. I, M, 1 beq .L49 #if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 srawi. r0, KK, 2 mtspr CTR, r0 ble .L34 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, BO, - 4 * SIZE fpmr f2, f0 addi BO2, BO, 2 * SIZE fpmr f3, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L34 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 LFPDUX A2, AO2, INC4 LFPDUX B3, BO, INC4 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A5, BO, INC4 LFPDUX A6, BO2, INC4 LFPDUX A4, AO2, INC4 LFPDUX A7, BO, INC4 LFPDUX A8, BO2, INC4 bdz- .L33 .align 4 .L32: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX B1, BO, INC4 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 LFPDUX B2, BO2, INC4 LFPDUX A1, AO, INC4 FXCPMADD f0, B3, A2, f0 FXCSMADD f1, B3, A2, f1 LFPDUX B3, BO, INC4 FXCPMADD f2, B4, A2, f2 FXCSMADD f3, B4, A2, f3 LFPDUX B4, BO2, INC4 LFPDUX A2, AO2, INC4 FXCPMADD f0, A5, A3, f0 FXCSMADD f1, A5, A3, f1 LFPDUX A5, BO, INC4 FXCPMADD f2, A6, A3, f2 FXCSMADD f3, A6, A3, f3 LFPDUX A6, BO2, INC4 LFPDUX A3, AO, INC4 FXCPMADD f0, A7, A4, f0 FXCSMADD f1, A7, A4, f1 LFPDUX A7, BO, INC4 FXCPMADD f2, A8, A4, f2 FXCSMADD f3, A8, A4, f3 LFPDUX A8, BO2, INC4 LFPDUX A4, AO2, INC4 bdnz+ .L32 .align 4 .L33: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 FXCPMADD f0, B3, A2, f0 FXCSMADD f1, B3, A2, f1 FXCPMADD f2, B4, A2, f2 FXCSMADD f3, B4, A2, f3 FXCPMADD f0, A5, A3, f0 FXCSMADD f1, A5, A3, f1 FXCPMADD f2, A6, A3, f2 FXCSMADD f3, A6, A3, f3 FXCPMADD f0, A7, A4, f0 FXCSMADD f1, A7, A4, f1 FXCPMADD f2, A8, A4, f2 FXCSMADD f3, A8, A4, f3 .align 4 .L34: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L38 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L38 #endif LFPDX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdz- .L37 .align 4 .L36: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX B1, BO, INC4 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 LFPDX A1, AO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdnz+ .L36 .align 4 .L37: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 .align 4 .L38: fpadd f0, f0, f1 fpadd f2, f2, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + ZBASE_SHIFT slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 4 * SIZE #endif addi AO2, AO, 2 * SIZE addi BO2, BO, 2 * SIZE #if defined(LN) || defined(LT) LFPDX f16, BO, INC4 LFPDX f17, BO2, INC4 #else LFPDX f16, AO, INC4 LFPDX f17, AO2, INC4 #endif fpsub f0, f16, f0 fpsub f2, f17, f2 #ifdef LN LFPDX A1, AO, INC4 fxpmul f4, A1, f0 fxpmul f5, A1, f2 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f2, A1, f2, f5 #endif #ifdef LT LFPDX A1, AO, INC4 fxpmul f4, A1, f0 fxpmul f5, A1, f2 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f2, A1, f2, f5 #endif #ifdef RN LFPDUX A1, BO, INC4 LFPDUX A2, BO2, INC4 add BO, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 fxcpnmsub f2, A2, f0, f2 FXCXNSMA f2, A2, f0, f2 fxpmul f4, A3, f2 FXCXNPMA f2, A3, f2, f4 #endif #ifdef RT LFPDUX A1, BO, INC4 add BO2, BO2, INC4 LFPDUX A2, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A3, f2 FXCXNPMA f2, A3, f2, f4 fxcpnmsub f0, A2, f2, f0 FXCXNSMA f0, A2, f2, f0 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFPDX f0, BO, INC4 STFPDX f2, BO2, INC4 #else STFPDX f0, AO, INC4 STFPDX f2, AO2, INC4 #endif STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f2, CO2, INC STFSDUX f2, CO2, INC #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L49: #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) addi B, BO, 4 * SIZE #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif addic. J, J, -1 bgt+ .L10 .align 4 .L50: andi. J, N, 1 beq .L999 #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) addi AORIG, A, -2 * SIZE #else addi AO, A, -2 * SIZE #endif #ifndef RT add C, CO2, LDC #endif li r0, FZERO lfpsx f0, SP, r0 srawi. I, M, 2 ble .L60 .align 4 .L51: #if defined(LT) || defined(RN) fpmr f4, f0 addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 fpmr f3, f0 fpmr f7, f0 srawi. r0, KK, 2 mtspr CTR, r0 ble .L54 #else #ifdef LN slwi r0, K, 2 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 2 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK fpmr f4, f0 addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 fpmr f3, f0 fpmr f7, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 ble .L54 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L53 .align 4 .L52: FXCPMADD f0, B1, A1, f0 LFPDUX B4, BO, INC2 FXCSMADD f4, B1, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A4, AO, INC2 FXCPMADD f0, B2, A5, f0 LFPDUX B1, BO, INC2 FXCSMADD f4, B2, A5, f4 LFPDUX A5, AO, INC2 FXCPMADD f1, B2, A6, f1 nop FXCSMADD f5, B2, A6, f5 LFPDUX A6, AO, INC2 FXCPMADD f2, B2, A7, f2 nop FXCSMADD f6, B2, A7, f6 LFPDUX A7, AO, INC2 FXCPMADD f3, B2, A8, f3 nop FXCSMADD f7, B2, A8, f7 LFPDUX A8, AO, INC2 FXCPMADD f0, B3, A1, f0 LFPDUX B2, BO, INC2 FXCSMADD f4, B3, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B3, A2, f1 nop FXCSMADD f5, B3, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B3, A3, f2 nop FXCSMADD f6, B3, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B3, A4, f3 nop FXCSMADD f7, B3, A4, f7 LFPDUX A4, AO, INC2 FXCPMADD f0, B4, A5, f0 LFPDUX B3, BO, INC2 FXCSMADD f4, B4, A5, f4 LFPDUX A5, AO, INC2 FXCPMADD f1, B4, A6, f1 nop FXCSMADD f5, B4, A6, f5 LFPDUX A6, AO, INC2 FXCPMADD f2, B4, A7, f2 nop FXCSMADD f6, B4, A7, f6 LFPDUX A7, AO, INC2 FXCPMADD f3, B4, A8, f3 nop FXCSMADD f7, B4, A8, f7 LFPDUX A8, AO, INC2 bdnz+ .L52 .align 4 .L53: FXCPMADD f0, B1, A1, f0 LFPDUX B4, BO, INC2 FXCSMADD f4, B1, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A4, AO, INC2 FXCPMADD f0, B2, A5, f0 nop FXCSMADD f4, B2, A5, f4 LFPDUX A5, AO, INC2 FXCPMADD f1, B2, A6, f1 nop FXCSMADD f5, B2, A6, f5 LFPDUX A6, AO, INC2 FXCPMADD f2, B2, A7, f2 nop FXCSMADD f6, B2, A7, f6 LFPDUX A7, AO, INC2 FXCPMADD f3, B2, A8, f3 nop FXCSMADD f7, B2, A8, f7 LFPDUX A8, AO, INC2 FXCPMADD f0, B3, A1, f0 FXCSMADD f4, B3, A1, f4 FXCPMADD f1, B3, A2, f1 FXCSMADD f5, B3, A2, f5 FXCPMADD f2, B3, A3, f2 FXCSMADD f6, B3, A3, f6 FXCPMADD f3, B3, A4, f3 FXCSMADD f7, B3, A4, f7 FXCPMADD f0, B4, A5, f0 FXCSMADD f4, B4, A5, f4 FXCPMADD f1, B4, A6, f1 FXCSMADD f5, B4, A6, f5 FXCPMADD f2, B4, A7, f2 FXCSMADD f6, B4, A7, f6 FXCPMADD f3, B4, A8, f3 FXCSMADD f7, B4, A8, f7 .align 4 .L54: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L58 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L58 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 bdz- .L57 .align 4 .L56: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B1, A3, f2 FXCSMADD f6, B1, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B1, A4, f3 FXCSMADD f7, B1, A4, f7 LFPDUX A4, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L56 .align 4 .L57: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f2, B1, A3, f2 FXCSMADD f6, B1, A3, f6 FXCPMADD f3, B1, A4, f3 FXCSMADD f7, B1, A4, f7 .align 4 .L58: fpadd f0, f0, f4 fpadd f1, f1, f5 fpadd f2, f2, f6 fpadd f3, f3, f7 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 1 #endif slwi TEMP, r0, 2 + ZBASE_SHIFT slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 LFPDUX f18, BO, INC2 LFPDUX f19, BO, INC2 subi BO, BO, 8 * SIZE #else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 LFPDUX f18, AO, INC2 LFPDUX f19, AO, INC2 subi AO, AO, 8 * SIZE #endif fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f2, f18, f2 fpsub f3, f19, f3 #ifdef LN LFPDUX A1, AO, INC2 add AO, AO, INC2 add AO, AO, INC2 add AO, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 add AO, AO, INC2 add AO, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 add AO, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 LFPDUX A9, AO, INC2 LFPDUX A10, AO, INC2 subi AO, AO, 32 * SIZE fxpmul f4, A10, f3 FXCXNPMA f3, A10, f3, f4 fxcpnmsub f2, A9, f3, f2 FXCXNSMA f2, A9, f3, f2 fxcpnmsub f1, A8, f3, f1 FXCXNSMA f1, A8, f3, f1 fxcpnmsub f0, A7, f3, f0 FXCXNSMA f0, A7, f3, f0 fxpmul f4, A6, f2 FXCXNPMA f2, A6, f2, f4 fxcpnmsub f1, A5, f2, f1 FXCXNSMA f1, A5, f2, f1 fxcpnmsub f0, A4, f2, f0 FXCXNSMA f0, A4, f2, f0 fxpmul f4, A3, f1 FXCXNPMA f1, A3, f1, f4 fxcpnmsub f0, A2, f1, f0 FXCXNSMA f0, A2, f1, f0 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 #endif #ifdef LT LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 add AO, AO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 add AO, AO, INC2 add AO, AO, INC2 LFPDUX A8, AO, INC2 LFPDUX A9, AO, INC2 add AO, AO, INC2 add AO, AO, INC2 add AO, AO, INC2 LFPDUX A10, AO, INC2 subi AO, AO, 32 * SIZE fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 fxcpnmsub f1, A2, f0, f1 FXCXNSMA f1, A2, f0, f1 fxcpnmsub f2, A3, f0, f2 FXCXNSMA f2, A3, f0, f2 fxcpnmsub f3, A4, f0, f3 FXCXNSMA f3, A4, f0, f3 fxpmul f6, A5, f1 FXCXNPMA f1, A5, f1, f6 fxcpnmsub f2, A6, f1, f2 FXCXNSMA f2, A6, f1, f2 fxcpnmsub f3, A7, f1, f3 FXCXNSMA f3, A7, f1, f3 fxpmul f4, A8, f2 FXCXNPMA f2, A8, f2, f4 fxcpnmsub f3, A9, f2, f3 FXCXNSMA f3, A9, f2, f3 fxpmul f6, A10, f3 FXCXNPMA f3, A10, f3, f6 #endif #ifdef RN LFPDX A1, BO, INC2 fxpmul f4, A1, f0 fxpmul f5, A1, f1 fxpmul f6, A1, f2 fxpmul f7, A1, f3 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 FXCXNPMA f2, A1, f2, f6 FXCXNPMA f3, A1, f3, f7 #endif #ifdef RT LFPDX A1, BO, INC2 fxpmul f4, A1, f0 fxpmul f5, A1, f1 fxpmul f6, A1, f2 fxpmul f7, A1, f3 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 FXCXNPMA f2, A1, f2, f6 FXCXNPMA f3, A1, f3, f7 #endif #ifdef LN subi CO1, CO1, 8 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f1, BO, INC2 STFPDUX f2, BO, INC2 STFPDUX f3, BO, INC2 subi BO, BO, 8 * SIZE #else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 STFPDUX 
f2, AO, INC2 STFPDUX f3, AO, INC2 subi AO, AO, 8 * SIZE #endif STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC #ifdef LN subi CO1, CO1, 8 * SIZE #endif #ifdef RT slwi r0, K, 2 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L51 .align 4 .L60: andi. I, M, 2 beq .L70 #if defined(LT) || defined(RN) fpmr f1, f0 addi BO, B, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 2 mtspr CTR, r0 ble .L64 #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK fpmr f1, f0 addi BO, BO, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L64 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L63 .align 4 .L62: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 FXCPMADD f0, B2, A3, f0 FXCSMADD f2, B2, A3, f2 LFPDUX A3, AO, INC2 FXCPMADD f1, B2, A4, f1 FXCSMADD f3, B2, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 FXCPMADD f0, B3, A5, f0 FXCSMADD f2, B3, A5, f2 LFPDUX A5, AO, INC2 FXCPMADD f1, B3, A6, f1 FXCSMADD f3, B3, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 FXCPMADD f0, B4, A7, f0 FXCSMADD f2, B4, A7, f2 LFPDUX A7, AO, INC2 FXCPMADD f1, B4, A8, f1 FXCSMADD f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L62 .align 4 .L63: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 FXCPMADD f0, B2, A3, f0 FXCSMADD f2, B2, A3, f2 FXCPMADD f1, B2, A4, f1 FXCSMADD f3, B2, A4, f3 FXCPMADD f0, B3, A5, f0 FXCSMADD f2, B3, A5, f2 FXCPMADD f1, B3, A6, f1 FXCSMADD f3, B3, A6, f3 FXCPMADD f0, B4, A7, f0 FXCSMADD f2, B4, A7, f2 FXCPMADD f1, B4, A8, f1 FXCSMADD f3, B4, A8, f3 .align 4 .L64: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L68 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L68 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdz- .L67 .align 4 .L66: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdnz+ .L66 .align 4 .L67: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 .align 4 .L68: fpadd f0, f0, f2 fpadd f1, f1, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + ZBASE_SHIFT slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 subi BO, BO, 4 * SIZE #else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 subi AO, AO, 4 * SIZE #endif fpsub f0, f16, f0 fpsub f1, f17, f1 #ifdef LN LFPDUX A1, AO, INC2 add AO, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 subi AO, AO, 8 * SIZE fxpmul f4, A3, f1 FXCXNPMA f1, A3, f1, f4 fxcpnmsub f0, A2, f1, f0 FXCXNSMA f0, A2, f1, f0 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 #endif #ifdef LT LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 add AO, AO, INC2 LFPDUX A3, AO, INC2 subi AO, AO, 8 * SIZE fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 fxcpnmsub f1, A2, f0, f1 FXCXNSMA f1, A2, f0, f1 fxpmul f6, A3, f1 FXCXNPMA f1, A3, f1, f6 #endif #ifdef RN LFPDX A1, BO, INC2 fxpmul f4, A1, f0 fxpmul f5, A1, f1 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 #endif #ifdef RT LFPDX A1, BO, INC2 fxpmul f4, A1, f0 fxpmul f5, A1, f1 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f1, BO, INC2 subi BO, BO, 4 * SIZE #else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 subi AO, AO, 4 * SIZE #endif STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC #ifdef LN subi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L70: andi. I, M, 1 beq .L89 #if defined(LT) || defined(RN) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 3 mtspr CTR, r0 ble .L74 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. 
r0, TEMP, 3 mtspr CTR, r0 ble .L74 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdz- .L73 .align 4 .L72: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 FXCPMADD f2, B2, A2, f2 FXCSMADD f3, B2, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 FXCPMADD f0, B3, A3, f0 FXCSMADD f1, B3, A3, f1 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 FXCPMADD f2, B4, A4, f2 FXCSMADD f3, B4, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 FXCPMADD f0, B5, A5, f0 FXCSMADD f1, B5, A5, f1 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 FXCPMADD f2, B6, A6, f2 FXCSMADD f3, B6, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 FXCPMADD f0, A9, A7, f0 FXCSMADD f1, A9, A7, f1 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 FXCPMADD f2, A10, A8, f2 FXCSMADD f3, A10, A8, f3 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdnz+ .L72 .align 4 .L73: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A2, f2 FXCSMADD f3, B2, A2, f3 FXCPMADD f0, B3, A3, f0 FXCSMADD f1, B3, A3, f1 FXCPMADD f2, B4, A4, f2 FXCSMADD f3, B4, A4, f3 FXCPMADD f0, B5, A5, f0 FXCSMADD f1, B5, A5, f1 FXCPMADD f2, B6, A6, f2 FXCSMADD f3, B6, A6, f3 FXCPMADD f0, A9, A7, f0 FXCSMADD f1, A9, A7, f1 FXCPMADD f2, A10, A8, f2 FXCSMADD f3, A10, A8, f3 .align 4 .L74: #if defined(LT) || defined(RN) andi. r0, KK, 7 mtspr CTR, r0 ble+ .L78 #else andi. r0, TEMP, 7 mtspr CTR, r0 ble+ .L78 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdz- .L77 .align 4 .L76: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L76 .align 4 .L77: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 .align 4 .L78: fpadd f0, f0, f2 fpadd f1, f1, f3 fpadd f0, f0, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 1 #endif slwi TEMP, r0, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDX f16, BO, INC2 #else LFPDX f16, AO, INC2 #endif fpsub f0, f16, f0 #ifdef LN LFPDX A1, AO, INC2 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 #endif #ifdef LT LFPDX A1, AO, INC2 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 #endif #ifdef RN LFPDX A1, BO, INC2 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 #endif #ifdef RT LFPDX A1, BO, INC2 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFPDX f0, BO, INC2 #else STFPDX f0, AO, INC2 #endif STFDUX f0, CO1, INC STFSDUX f0, CO1, INC #ifdef LN subi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L89: #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) addi B, BO, 2 * SIZE #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 .L999: addi SP, SP, 20 lwzu r14, 4(SP) lwzu r15, 4(SP) lwzu r16, 4(SP) lwzu r17, 4(SP) lwzu r18, 4(SP) lwzu r19, 4(SP) lwzu r20, 4(SP) lwzu r21, 4(SP) lwzu r22, 4(SP) lwzu r23, 4(SP) lwzu r24, 4(SP) lwzu r25, 4(SP) lwzu r26, 4(SP) lwzu r27, 4(SP) lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 
12 li r0, 16 lfpdux f31, SP, r0 lfpdux f30, SP, r0 lfpdux f29, SP, r0 lfpdux f28, SP, r0 lfpdux f27, SP, r0 lfpdux f26, SP, r0 lfpdux f25, SP, r0 lfpdux f24, SP, r0 lfpdux f23, SP, r0 lfpdux f22, SP, r0 lfpdux f21, SP, r0 lfpdux f20, SP, r0 lfpdux f19, SP, r0 lfpdux f18, SP, r0 lfpdux f17, SP, r0 lfpdux f16, SP, r0 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr .align 4 EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/ztrsm_kernel_hummer_RT.S000066400000000000000000001433151313527062700224140ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #undef ZERO #define ALPHA 0 #define FZERO 16 #define M r3 #define N r4 #define K r5 #ifdef linux #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #endif #define TEMP r11 #define AORIG r12 #define KK r14 #define INCM1 r15 #define INCM3 r16 #define INCM5 r17 #define INCM7 r18 #define INC2 r19 #define INC r20 #define INC4 r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define AO2 r26 #define BO2 r27 #define CO1 r28 #define CO2 r29 #define ZERO r31 #ifndef NEEDPARAM #define A1 f16 #define A2 f17 #define A3 f18 #define A4 f19 #define A5 f20 #define A6 f21 #define A7 f22 #define A8 f23 #define A9 f24 #define A10 f25 #define B1 f26 #define B2 f27 #define B3 f28 #define B4 f29 #define B5 f30 #define B6 f31 #define AP B6 #ifndef CONJ #define FXCPMADD fxcpmadd #define FXCSMADD fxcxnpma #else #if defined(LN) || defined(LT) #define FXCPMADD fxcpnsma #define FXCSMADD fxcxma #else #define FXCPMADD fxcpmadd #define FXCSMADD fxcxnsma #endif #endif #ifndef CONJ #define FXCXNPMA fxcxnpma #define FXCXNSMA fxcxnsma #else #define FXCXNPMA fxcxnsma #define FXCXNSMA fxcxnpma #endif PROLOGUE PROFCODE li r0, -16 stfpdux f14, SP, r0 stfpdux f15, SP, r0 stfpdux f16, SP, r0 stfpdux f17, SP, r0 stfpdux f18, SP, r0 stfpdux f19, SP, r0 stfpdux f20, SP, r0 stfpdux f21, SP, r0 stfpdux f22, SP, r0 stfpdux f23, SP, r0 stfpdux f24, SP, r0 stfpdux f25, SP, r0 stfpdux f26, SP, r0 stfpdux f27, SP, r0 stfpdux f28, SP, r0 stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) stwu r28, -4(SP) stwu r27, -4(SP) stwu r26, -4(SP) stwu r25, -4(SP) stwu r24, -4(SP) stwu r23, -4(SP) stwu r22, -4(SP) stwu r21, -4(SP) stwu r20, -4(SP) stwu r19, -4(SP) stwu r18, -4(SP) stwu r17, -4(SP) stwu r16, -4(SP) stwu r15, -4(SP) stwu r14, -4(SP) li r0, 0 stwu r0, -4(SP) stwu r0, -4(SP) stfdu f2, -8(SP) stfdu f1, -8(SP) slwi LDC, LDC, ZBASE_SHIFT cmpwi cr0, M, 0 ble .L999 cmpwi cr0, N, 0 ble .L999 cmpwi cr0, K, 0 ble .L999 li INC, 1 * SIZE li INC2, 2 * SIZE li INC4, 4 * SIZE li INCM1, -1 * SIZE li INCM3, -3 * SIZE li INCM5, -5 * SIZE li INCM7, -7 * SIZE addi C, C, - 1 * SIZE #ifdef LN mullw r0, M, K slwi r0, r0, ZBASE_SHIFT add A, A, r0 slwi r0, M, ZBASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, ZBASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif andi. J, N, 1 beq .L50 #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) addi AORIG, A, -2 * SIZE #else addi AO, A, -2 * SIZE #endif #ifndef RT add C, CO2, LDC #endif li r0, FZERO lfpsx f0, SP, r0 srawi. I, M, 2 ble .L60 .align 4 .L51: #if defined(LT) || defined(RN) fpmr f4, f0 addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 fpmr f3, f0 fpmr f7, f0 srawi. r0, KK, 2 mtspr CTR, r0 ble .L54 #else #ifdef LN slwi r0, K, 2 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 2 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK fpmr f4, f0 addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f5, f0 fpmr f2, f0 fpmr f6, f0 fpmr f3, f0 fpmr f7, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 ble .L54 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L53 .align 4 .L52: FXCPMADD f0, B1, A1, f0 LFPDUX B4, BO, INC2 FXCSMADD f4, B1, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A4, AO, INC2 FXCPMADD f0, B2, A5, f0 LFPDUX B1, BO, INC2 FXCSMADD f4, B2, A5, f4 LFPDUX A5, AO, INC2 FXCPMADD f1, B2, A6, f1 nop FXCSMADD f5, B2, A6, f5 LFPDUX A6, AO, INC2 FXCPMADD f2, B2, A7, f2 nop FXCSMADD f6, B2, A7, f6 LFPDUX A7, AO, INC2 FXCPMADD f3, B2, A8, f3 nop FXCSMADD f7, B2, A8, f7 LFPDUX A8, AO, INC2 FXCPMADD f0, B3, A1, f0 LFPDUX B2, BO, INC2 FXCSMADD f4, B3, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B3, A2, f1 nop FXCSMADD f5, B3, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B3, A3, f2 nop FXCSMADD f6, B3, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B3, A4, f3 nop FXCSMADD f7, B3, A4, f7 LFPDUX A4, AO, INC2 FXCPMADD f0, B4, A5, f0 LFPDUX B3, BO, INC2 FXCSMADD f4, B4, A5, f4 LFPDUX A5, AO, INC2 FXCPMADD f1, B4, A6, f1 nop FXCSMADD f5, B4, A6, f5 LFPDUX A6, AO, INC2 FXCPMADD f2, B4, A7, f2 nop FXCSMADD f6, B4, A7, f6 LFPDUX A7, AO, INC2 FXCPMADD f3, B4, A8, f3 nop FXCSMADD f7, B4, A8, f7 LFPDUX A8, AO, INC2 bdnz+ .L52 .align 4 .L53: FXCPMADD f0, B1, A1, f0 LFPDUX B4, BO, INC2 FXCSMADD f4, B1, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A4, AO, INC2 FXCPMADD f0, B2, A5, f0 nop FXCSMADD f4, B2, A5, f4 LFPDUX A5, AO, INC2 FXCPMADD f1, B2, A6, f1 nop FXCSMADD f5, B2, A6, f5 LFPDUX A6, AO, INC2 FXCPMADD f2, B2, A7, f2 nop FXCSMADD f6, B2, A7, f6 LFPDUX A7, AO, INC2 FXCPMADD f3, B2, A8, f3 nop FXCSMADD f7, B2, A8, f7 LFPDUX A8, AO, INC2 FXCPMADD f0, B3, A1, f0 FXCSMADD f4, B3, A1, f4 FXCPMADD f1, B3, A2, f1 FXCSMADD f5, B3, A2, f5 FXCPMADD f2, B3, A3, f2 FXCSMADD f6, B3, A3, f6 FXCPMADD f3, B3, A4, f3 FXCSMADD f7, B3, A4, f7 FXCPMADD f0, B4, A5, f0 FXCSMADD f4, B4, A5, f4 FXCPMADD f1, B4, A6, f1 FXCSMADD f5, B4, A6, f5 FXCPMADD f2, B4, A7, f2 FXCSMADD f6, B4, A7, f6 FXCPMADD f3, B4, A8, f3 FXCSMADD f7, B4, A8, f7 .align 4 .L54: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L58 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L58 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 bdz- .L57 .align 4 .L56: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 LFPDUX A2, AO, INC2 FXCPMADD f2, B1, A3, f2 FXCSMADD f6, B1, A3, f6 LFPDUX A3, AO, INC2 FXCPMADD f3, B1, A4, f3 FXCSMADD f7, B1, A4, f7 LFPDUX A4, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L56 .align 4 .L57: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f2, B1, A3, f2 FXCSMADD f6, B1, A3, f6 FXCPMADD f3, B1, A4, f3 FXCSMADD f7, B1, A4, f7 .align 4 .L58: fpadd f0, f0, f4 fpadd f1, f1, f5 fpadd f2, f2, f6 fpadd f3, f3, f7 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 1 #endif slwi TEMP, r0, 2 + ZBASE_SHIFT slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 LFPDUX f18, BO, INC2 LFPDUX f19, BO, INC2 subi BO, BO, 8 * SIZE #else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 LFPDUX f18, AO, INC2 LFPDUX f19, AO, INC2 subi AO, AO, 8 * SIZE #endif fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f2, f18, f2 fpsub f3, f19, f3 #ifdef LN LFPDUX A1, AO, INC2 add AO, AO, INC2 add AO, AO, INC2 add AO, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 add AO, AO, INC2 add AO, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 add AO, AO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 LFPDUX A9, AO, INC2 LFPDUX A10, AO, INC2 subi AO, AO, 32 * SIZE fxpmul f4, A10, f3 FXCXNPMA f3, A10, f3, f4 fxcpnmsub f2, A9, f3, f2 FXCXNSMA f2, A9, f3, f2 fxcpnmsub f1, A8, f3, f1 FXCXNSMA f1, A8, f3, f1 fxcpnmsub f0, A7, f3, f0 FXCXNSMA f0, A7, f3, f0 fxpmul f4, A6, f2 FXCXNPMA f2, A6, f2, f4 fxcpnmsub f1, A5, f2, f1 FXCXNSMA f1, A5, f2, f1 fxcpnmsub f0, A4, f2, f0 FXCXNSMA f0, A4, f2, f0 fxpmul f4, A3, f1 FXCXNPMA f1, A3, f1, f4 fxcpnmsub f0, A2, f1, f0 FXCXNSMA f0, A2, f1, f0 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 #endif #ifdef LT LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 add AO, AO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX A7, AO, INC2 add AO, AO, INC2 add AO, AO, INC2 LFPDUX A8, AO, INC2 LFPDUX A9, AO, INC2 add AO, AO, INC2 add AO, AO, INC2 add AO, AO, INC2 LFPDUX A10, AO, INC2 subi AO, AO, 32 * SIZE fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 fxcpnmsub f1, A2, f0, f1 FXCXNSMA f1, A2, f0, f1 fxcpnmsub f2, A3, f0, f2 FXCXNSMA f2, A3, f0, f2 fxcpnmsub f3, A4, f0, f3 FXCXNSMA f3, A4, f0, f3 fxpmul f6, A5, f1 FXCXNPMA f1, A5, f1, f6 fxcpnmsub f2, A6, f1, f2 FXCXNSMA f2, A6, f1, f2 fxcpnmsub f3, A7, f1, f3 FXCXNSMA f3, A7, f1, f3 fxpmul f4, A8, f2 FXCXNPMA f2, A8, f2, f4 fxcpnmsub f3, A9, f2, f3 FXCXNSMA f3, A9, f2, f3 fxpmul f6, A10, f3 FXCXNPMA f3, A10, f3, f6 #endif #ifdef RN LFPDX A1, BO, INC2 fxpmul f4, A1, f0 fxpmul f5, A1, f1 fxpmul f6, A1, f2 fxpmul f7, A1, f3 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 FXCXNPMA f2, A1, f2, f6 FXCXNPMA f3, A1, f3, f7 #endif #ifdef RT LFPDX A1, BO, INC2 fxpmul f4, A1, f0 fxpmul f5, A1, f1 fxpmul f6, A1, f2 fxpmul f7, A1, f3 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 FXCXNPMA f2, A1, f2, f6 FXCXNPMA f3, A1, f3, f7 #endif #ifdef LN subi CO1, CO1, 8 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f1, BO, INC2 STFPDUX f2, BO, INC2 STFPDUX f3, BO, INC2 subi BO, BO, 8 * SIZE #else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 STFPDUX 
f2, AO, INC2 STFPDUX f3, AO, INC2 subi AO, AO, 8 * SIZE #endif STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC #ifdef LN subi CO1, CO1, 8 * SIZE #endif #ifdef RT slwi r0, K, 2 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L51 .align 4 .L60: andi. I, M, 2 beq .L70 #if defined(LT) || defined(RN) fpmr f1, f0 addi BO, B, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 2 mtspr CTR, r0 ble .L64 #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK fpmr f1, f0 addi BO, BO, - 2 * SIZE fpmr f2, f0 fpmr f3, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L64 #endif LFPDUX B1, BO, INC2 LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX A4, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX A6, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A8, AO, INC2 bdz- .L63 .align 4 .L62: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 FXCPMADD f0, B2, A3, f0 FXCSMADD f2, B2, A3, f2 LFPDUX A3, AO, INC2 FXCPMADD f1, B2, A4, f1 FXCSMADD f3, B2, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B2, BO, INC2 FXCPMADD f0, B3, A5, f0 FXCSMADD f2, B3, A5, f2 LFPDUX A5, AO, INC2 FXCPMADD f1, B3, A6, f1 FXCSMADD f3, B3, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B3, BO, INC2 FXCPMADD f0, B4, A7, f0 FXCSMADD f2, B4, A7, f2 LFPDUX A7, AO, INC2 FXCPMADD f1, B4, A8, f1 FXCSMADD f3, B4, A8, f3 LFPDUX A8, AO, INC2 LFPDUX B4, BO, INC2 bdnz+ .L62 .align 4 .L63: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 FXCPMADD f0, B2, A3, f0 FXCSMADD f2, B2, A3, f2 FXCPMADD f1, B2, A4, f1 FXCSMADD f3, B2, A4, f3 FXCPMADD f0, B3, A5, f0 FXCSMADD f2, B3, A5, f2 FXCPMADD f1, B3, A6, f1 FXCSMADD f3, B3, A6, f3 FXCPMADD f0, B4, A7, f0 FXCSMADD f2, B4, A7, f2 FXCPMADD f1, B4, A8, f1 FXCSMADD f3, B4, A8, f3 .align 4 .L64: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L68 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L68 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdz- .L67 .align 4 .L66: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 LFPDUX A1, AO, INC2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 bdnz+ .L66 .align 4 .L67: FXCPMADD f0, B1, A1, f0 FXCSMADD f2, B1, A1, f2 FXCPMADD f1, B1, A2, f1 FXCSMADD f3, B1, A2, f3 .align 4 .L68: fpadd f0, f0, f2 fpadd f1, f1, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + ZBASE_SHIFT slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDUX f16, BO, INC2 LFPDUX f17, BO, INC2 subi BO, BO, 4 * SIZE #else LFPDUX f16, AO, INC2 LFPDUX f17, AO, INC2 subi AO, AO, 4 * SIZE #endif fpsub f0, f16, f0 fpsub f1, f17, f1 #ifdef LN LFPDUX A1, AO, INC2 add AO, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX A3, AO, INC2 subi AO, AO, 8 * SIZE fxpmul f4, A3, f1 FXCXNPMA f1, A3, f1, f4 fxcpnmsub f0, A2, f1, f0 FXCXNSMA f0, A2, f1, f0 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 #endif #ifdef LT LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 add AO, AO, INC2 LFPDUX A3, AO, INC2 subi AO, AO, 8 * SIZE fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 fxcpnmsub f1, A2, f0, f1 FXCXNSMA f1, A2, f0, f1 fxpmul f6, A3, f1 FXCXNPMA f1, A3, f1, f6 #endif #ifdef RN LFPDX A1, BO, INC2 fxpmul f4, A1, f0 fxpmul f5, A1, f1 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 #endif #ifdef RT LFPDX A1, BO, INC2 fxpmul f4, A1, f0 fxpmul f5, A1, f1 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC2 STFPDUX f1, BO, INC2 subi BO, BO, 4 * SIZE #else STFPDUX f0, AO, INC2 STFPDUX f1, AO, INC2 subi AO, AO, 4 * SIZE #endif STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC #ifdef LN subi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L70: andi. I, M, 1 beq .L89 #if defined(LT) || defined(RN) addi BO, B, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. r0, KK, 3 mtspr CTR, r0 ble .L74 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK addi BO, BO, - 2 * SIZE fpmr f1, f0 fpmr f2, f0 fpmr f3, f0 srawi. 
r0, TEMP, 3 mtspr CTR, r0 ble .L74 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdz- .L73 .align 4 .L72: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 FXCPMADD f2, B2, A2, f2 FXCSMADD f3, B2, A2, f3 LFPDUX A2, AO, INC2 LFPDUX B2, BO, INC2 FXCPMADD f0, B3, A3, f0 FXCSMADD f1, B3, A3, f1 LFPDUX A3, AO, INC2 LFPDUX B3, BO, INC2 FXCPMADD f2, B4, A4, f2 FXCSMADD f3, B4, A4, f3 LFPDUX A4, AO, INC2 LFPDUX B4, BO, INC2 FXCPMADD f0, B5, A5, f0 FXCSMADD f1, B5, A5, f1 LFPDUX A5, AO, INC2 LFPDUX B5, BO, INC2 FXCPMADD f2, B6, A6, f2 FXCSMADD f3, B6, A6, f3 LFPDUX A6, AO, INC2 LFPDUX B6, BO, INC2 FXCPMADD f0, A9, A7, f0 FXCSMADD f1, A9, A7, f1 LFPDUX A7, AO, INC2 LFPDUX A9, BO, INC2 FXCPMADD f2, A10, A8, f2 FXCSMADD f3, A10, A8, f3 LFPDUX A8, AO, INC2 LFPDUX A10, BO, INC2 bdnz+ .L72 .align 4 .L73: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A2, f2 FXCSMADD f3, B2, A2, f3 FXCPMADD f0, B3, A3, f0 FXCSMADD f1, B3, A3, f1 FXCPMADD f2, B4, A4, f2 FXCSMADD f3, B4, A4, f3 FXCPMADD f0, B5, A5, f0 FXCSMADD f1, B5, A5, f1 FXCPMADD f2, B6, A6, f2 FXCSMADD f3, B6, A6, f3 FXCPMADD f0, A9, A7, f0 FXCSMADD f1, A9, A7, f1 FXCPMADD f2, A10, A8, f2 FXCSMADD f3, A10, A8, f3 .align 4 .L74: #if defined(LT) || defined(RN) andi. r0, KK, 7 mtspr CTR, r0 ble+ .L78 #else andi. r0, TEMP, 7 mtspr CTR, r0 ble+ .L78 #endif LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdz- .L77 .align 4 .L76: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX A1, AO, INC2 LFPDUX B1, BO, INC2 bdnz+ .L76 .align 4 .L77: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 .align 4 .L78: fpadd f0, f0, f2 fpadd f1, f1, f3 fpadd f0, f0, f1 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 1 #endif slwi TEMP, r0, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP addi BO, BO, - 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDX f16, BO, INC2 #else LFPDX f16, AO, INC2 #endif fpsub f0, f16, f0 #ifdef LN LFPDX A1, AO, INC2 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 #endif #ifdef LT LFPDX A1, AO, INC2 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 #endif #ifdef RN LFPDX A1, BO, INC2 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 #endif #ifdef RT LFPDX A1, BO, INC2 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFPDX f0, BO, INC2 #else STFPDX f0, AO, INC2 #endif STFDUX f0, CO1, INC STFSDUX f0, CO1, INC #ifdef LN subi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L89: #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) addi B, BO, 2 * SIZE #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 .L50: srawi. 
J, N, 1 ble .L999 .align 4 .L10: #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) addi AORIG, A, -4 * SIZE #else addi AO, A, -4 * SIZE #endif #ifndef RT add C, CO2, LDC #endif li r0, FZERO lfpsx f0, SP, r0 srawi. I, M, 2 ble .L20 .align 4 .L11: #if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 fpmr f5, f0 fpmr f9, f0 fpmr f13, f0 fpmr f2, f0 fpmr f6, f0 fpmr f10, f0 fpmr f14, f0 fpmr f3, f0 fpmr f7, f0 fpmr f11, f0 fpmr f15, f0 srawi. r0, KK, 2 fpmr f1, f0 mtspr CTR, r0 ble .L14 #else #ifdef LN slwi r0, K, 2 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 2 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK fpmr f5, f0 fpmr f9, f0 fpmr f13, f0 fpmr f2, f0 fpmr f6, f0 fpmr f10, f0 fpmr f14, f0 fpmr f3, f0 fpmr f7, f0 fpmr f11, f0 fpmr f15, f0 addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 srawi. r0, TEMP, 2 fpmr f1, f0 mtspr CTR, r0 ble .L14 #endif LFPDUX A1, AO, INC4 fpmr f5, f0 LFPDUX A3, AO, INC4 fpmr f9, f0 LFPDUX B1, BO, INC4 fpmr f13, f0 LFPDUX A5, AO, INC4 fpmr f2, f0 LFPDUX A6, AO, INC4 fpmr f6, f0 LFPDUX B3, BO, INC4 fpmr f10, f0 LFPDUX A7, AO, INC4 fpmr f14, f0 LFPDUX A8, AO, INC4 fpmr f3, f0 LFPDUX B5, BO, INC4 fpmr f7, f0 LFPDUX A9, AO, INC4 fpmr f11, f0 LFPDUX A2, AO2, INC4 fpmr f15, f0 LFPDUX B2, BO2, INC4 bdz- .L13 .align 4 .L12: ## 1 ## FXCPMADD f0, B1, A1, f0 nop FXCSMADD f4, B1, A1, f4 nop FXCPMADD f8, B2, A1, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A1, f12 LFPDUX B6, BO, INC4 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 LFPDUX A10, AO, INC4 FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 nop FXCPMADD f10, B2, A3, f10 nop FXCSMADD f14, B2, A3, f14 nop FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 LFPDUX A1, AO, INC4 FXCSMADD f15, B2, A4, f15 nop ## 2 ## FXCPMADD f0, B3, A5, f0 nop FXCSMADD f4, B3, A5, f4 nop FXCPMADD f8, B4, A5, f8 LFPDUX B2, BO2, INC4 FXCSMADD f12, B4, A5, f12 LFPDUX B1, BO, INC4 FXCPMADD f1, B3, A2, f1 nop FXCSMADD f5, B3, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 LFPDUX A3, AO, INC4 FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B3, A6, f2 nop FXCSMADD f6, B3, A6, f6 nop FXCPMADD f10, B4, A6, f10 nop FXCSMADD f14, B4, A6, f14 nop FXCPMADD f3, B3, A4, f3 nop FXCSMADD f7, B3, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B4, A4, f11 LFPDUX A5, AO, INC4 FXCSMADD f15, B4, A4, f15 nop ## 3 ## FXCPMADD f0, B5, A7, f0 nop FXCSMADD f4, B5, A7, f4 nop FXCPMADD f8, B2, A7, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A7, f12 LFPDUX B3, BO, INC4 FXCPMADD f1, B5, A2, f1 nop FXCSMADD f5, B5, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 LFPDUX A6, AO, INC4 FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B5, A8, f2 nop FXCSMADD f6, B5, A8, f6 nop FXCPMADD f10, B2, A8, f10 nop FXCSMADD f14, B2, A8, f14 nop FXCPMADD f3, B5, A4, f3 nop FXCSMADD f7, B5, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 LFPDUX A7, AO, INC4 FXCSMADD f15, B2, A4, f15 nop ## 4 ## FXCPMADD f0, B6, A9, f0 nop FXCSMADD f4, B6, A9, f4 nop FXCPMADD f8, B4, A9, f8 LFPDUX B2, BO2, INC4 FXCSMADD f12, B4, A9, f12 LFPDUX B5, BO, INC4 FXCPMADD f1, B6, A2, f1 nop FXCSMADD f5, B6, A2, f5 LFPDUX A4, AO2, INC4 
FXCPMADD f9, B4, A2, f9 LFPDUX A8, AO, INC4 FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B6, A10, f2 nop FXCSMADD f6, B6, A10, f6 nop FXCPMADD f10, B4, A10, f10 nop FXCSMADD f14, B4, A10, f14 nop FXCPMADD f3, B6, A4, f3 LFPDUX A2, AO2, INC4 FXCSMADD f7, B6, A4, f7 LFPDUX A9, AO, INC4 FXCPMADD f11, B4, A4, f11 nop FXCSMADD f15, B4, A4, f15 bdnz+ .L12 .align 4 .L13: ## 1 ## FXCPMADD f0, B1, A1, f0 nop FXCSMADD f4, B1, A1, f4 nop FXCPMADD f8, B2, A1, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A1, f12 LFPDUX B6, BO, INC4 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 LFPDUX A10, AO, INC4 FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B1, A3, f2 nop FXCSMADD f6, B1, A3, f6 nop FXCPMADD f10, B2, A3, f10 nop FXCSMADD f14, B2, A3, f14 nop FXCPMADD f3, B1, A4, f3 nop FXCSMADD f7, B1, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 nop FXCSMADD f15, B2, A4, f15 nop ## 2 ## FXCPMADD f0, B3, A5, f0 nop FXCSMADD f4, B3, A5, f4 nop FXCPMADD f8, B4, A5, f8 LFPDUX B2, BO2, INC4 FXCSMADD f12, B4, A5, f12 nop FXCPMADD f1, B3, A2, f1 nop FXCSMADD f5, B3, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 nop FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B3, A6, f2 nop FXCSMADD f6, B3, A6, f6 nop FXCPMADD f10, B4, A6, f10 nop FXCSMADD f14, B4, A6, f14 nop FXCPMADD f3, B3, A4, f3 nop FXCSMADD f7, B3, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B4, A4, f11 nop FXCSMADD f15, B4, A4, f15 nop ## 3 ## FXCPMADD f0, B5, A7, f0 nop FXCSMADD f4, B5, A7, f4 nop FXCPMADD f8, B2, A7, f8 LFPDUX B4, BO2, INC4 FXCSMADD f12, B2, A7, f12 nop FXCPMADD f1, B5, A2, f1 nop FXCSMADD f5, B5, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B2, A2, f9 nop FXCSMADD f13, B2, A2, f13 nop FXCPMADD f2, B5, A8, f2 nop FXCSMADD f6, B5, A8, f6 nop FXCPMADD f10, B2, A8, f10 nop FXCSMADD f14, B2, A8, f14 nop FXCPMADD f3, B5, A4, f3 nop FXCSMADD f7, B5, A4, f7 LFPDUX A2, AO2, INC4 FXCPMADD f11, B2, A4, f11 nop FXCSMADD f15, B2, A4, f15 nop ## 4 ## FXCPMADD f0, B6, A9, f0 nop FXCSMADD f4, B6, A9, f4 nop FXCPMADD f8, B4, A9, f8 nop FXCSMADD f12, B4, A9, f12 nop FXCPMADD f1, B6, A2, f1 nop FXCSMADD f5, B6, A2, f5 LFPDUX A4, AO2, INC4 FXCPMADD f9, B4, A2, f9 nop FXCSMADD f13, B4, A2, f13 nop FXCPMADD f2, B6, A10, f2 nop FXCSMADD f6, B6, A10, f6 nop FXCPMADD f10, B4, A10, f10 nop FXCSMADD f14, B4, A10, f14 nop FXCPMADD f3, B6, A4, f3 nop FXCSMADD f7, B6, A4, f7 nop FXCPMADD f11, B4, A4, f11 nop FXCSMADD f15, B4, A4, f15 nop .align 4 .L14: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L18 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L18 #endif .L15: LFPDUX A2, AO, INC4 LFPDUX A4, AO2, INC4 LFPDUX A10, BO, INC4 LFPDUX B4, BO2, INC4 bdz- .L17 .align 4 .L16: FXCPMADD f0, A10, A2, f0 FXCSMADD f4, A10, A2, f4 FXCPMADD f8, B4, A2, f8 FXCSMADD f12, B4, A2, f12 LFPDUX A2, AO, INC4 FXCPMADD f1, A10, A4, f1 FXCSMADD f5, A10, A4, f5 FXCPMADD f9, B4, A4, f9 FXCSMADD f13, B4, A4, f13 LFPDUX A4, AO2, INC4 FXCPMADD f2, A10, A2, f2 FXCSMADD f6, A10, A2, f6 FXCPMADD f10, B4, A2, f10 FXCSMADD f14, B4, A2, f14 LFPDUX A2, AO, INC4 FXCPMADD f3, A10, A4, f3 FXCSMADD f7, A10, A4, f7 LFPDUX A10, BO, INC4 FXCPMADD f11, B4, A4, f11 FXCSMADD f15, B4, A4, f15 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 bdnz+ .L16 .align 4 .L17: FXCPMADD f0, A10, A2, f0 FXCSMADD f4, A10, A2, f4 FXCPMADD f8, B4, A2, f8 FXCSMADD f12, B4, A2, f12 LFPDUX A2, AO, INC4 FXCPMADD f1, A10, A4, f1 FXCSMADD f5, A10, A4, f5 FXCPMADD f9, B4, A4, f9 FXCSMADD f13, B4, A4, f13 LFPDUX A4, AO2, INC4 FXCPMADD f2, A10, A2, f2 FXCSMADD f6, A10, A2, f6 FXCPMADD f10, B4, A2, f10 FXCSMADD f14, B4, A2, f14 FXCPMADD f3, A10, A4, f3 FXCSMADD f7, A10, A4, f7 FXCPMADD f11, B4, A4, f11 FXCSMADD f15, B4, A4, f15 .align 4 .L18: fpadd f0, f0, f4 fpadd f8, f8, f12 fpadd f1, f1, f5 fpadd f9, f9, f13 fpadd f2, f2, f6 fpadd f10, f10, f14 fpadd f3, f3, f7 fpadd f11, f11, f15 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 4 #else subi r0, KK, 2 #endif slwi TEMP, r0, 2 + ZBASE_SHIFT slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi AO2, AO, 2 * SIZE addi BO, BO, - 4 * SIZE addi BO2, BO, 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDUX f16, BO, INC4 LFPDUX f20, BO2, INC4 LFPDUX f17, BO, INC4 LFPDUX f21, BO2, INC4 LFPDUX f18, BO, INC4 LFPDUX f22, BO2, INC4 LFPDUX f19, BO, INC4 LFPDUX f23, BO2, INC4 subi BO, BO, 16 * SIZE subi BO2, BO2, 16 * SIZE #else LFPDUX f16, AO, INC4 LFPDUX f17, AO2, INC4 LFPDUX f18, AO, INC4 LFPDUX f19, AO2, INC4 LFPDUX f20, AO, INC4 LFPDUX f21, AO2, INC4 LFPDUX f22, AO, INC4 LFPDUX f23, AO2, INC4 subi AO, AO, 16 * SIZE subi AO2, AO2, 16 * SIZE #endif fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f2, f18, f2 fpsub f3, f19, f3 fpsub f8, f20, f8 fpsub f9, f21, f9 fpsub f10, f22, f10 fpsub f11, f23, f11 #ifdef LN LFPDUX A1, AO, INC4 add AO2, AO2, INC4 add AO, AO, INC4 add AO2, AO2, INC4 LFPDUX A2, AO, INC4 LFPDUX A3, AO2, INC4 add AO, AO, INC4 add AO2, AO2, INC4 LFPDUX A4, AO, INC4 LFPDUX A5, AO2, INC4 LFPDUX A6, AO, INC4 add AO2, AO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A8, AO2, INC4 LFPDUX A9, AO, INC4 LFPDUX A10, AO2, INC4 subi AO, AO, 32 * SIZE subi AO2, AO2, 32 * SIZE fxpmul f4, A10, f3 fxpmul f5, A10, f11 FXCXNPMA f3, A10, f3, f4 FXCXNPMA f11, A10, f11, f5 fxcpnmsub f2, A9, f3, f2 fxcpnmsub f10, A9, f11, f10 FXCXNSMA f2, A9, f3, f2 FXCXNSMA f10, A9, f11, f10 fxcpnmsub f1, A8, f3, f1 fxcpnmsub f9, A8, f11, f9 FXCXNSMA f1, A8, f3, f1 FXCXNSMA f9, A8, f11, f9 fxcpnmsub f0, A7, f3, f0 fxcpnmsub f8, A7, f11, f8 FXCXNSMA f0, A7, f3, f0 FXCXNSMA f8, A7, f11, f8 fxpmul f4, A6, f2 fxpmul f5, A6, f10 FXCXNPMA f2, A6, f2, f4 FXCXNPMA f10, A6, f10, f5 fxcpnmsub f1, A5, f2, f1 fxcpnmsub f9, A5, f10, f9 FXCXNSMA f1, A5, f2, f1 FXCXNSMA f9, A5, f10, f9 fxcpnmsub f0, A4, f2, f0 fxcpnmsub f8, A4, f10, f8 FXCXNSMA f0, A4, f2, f0 FXCXNSMA f8, A4, f10, f8 fxpmul f4, A3, f1 fxpmul f5, A3, f9 FXCXNPMA f1, A3, f1, f4 FXCXNPMA f9, A3, f9, f5 fxcpnmsub f0, A2, f1, f0 fxcpnmsub f8, A2, f9, f8 FXCXNSMA f0, A2, f1, f0 FXCXNSMA f8, A2, f9, f8 fxpmul f4, A1, f0 fxpmul f5, A1, f8 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f8, A1, f8, f5 #endif #ifdef LT LFPDUX A1, AO, 
INC4 LFPDUX A2, AO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A4, AO2, INC4 add AO, AO, INC4 LFPDUX A5, AO2, INC4 LFPDUX A6, AO, INC4 LFPDUX A7, AO2, INC4 add AO, AO, INC4 add AO2, AO2, INC4 LFPDUX A8, AO, INC4 LFPDUX A9, AO2, INC4 add AO, AO, INC4 add AO2, AO2, INC4 add AO, AO, INC4 LFPDUX A10, AO2, INC4 subi AO, AO, 32 * SIZE subi AO2, AO2, 32 * SIZE fxpmul f4, A1, f0 fxpmul f5, A1, f8 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f8, A1, f8, f5 fxcpnmsub f1, A2, f0, f1 fxcpnmsub f9, A2, f8, f9 FXCXNSMA f1, A2, f0, f1 FXCXNSMA f9, A2, f8, f9 fxcpnmsub f2, A3, f0, f2 fxcpnmsub f10, A3, f8, f10 FXCXNSMA f2, A3, f0, f2 FXCXNSMA f10, A3, f8, f10 fxcpnmsub f3, A4, f0, f3 fxcpnmsub f11, A4, f8, f11 FXCXNSMA f3, A4, f0, f3 FXCXNSMA f11, A4, f8, f11 fxpmul f6, A5, f1 fxpmul f7, A5, f9 FXCXNPMA f1, A5, f1, f6 FXCXNPMA f9, A5, f9, f7 fxcpnmsub f2, A6, f1, f2 fxcpnmsub f10, A6, f9, f10 FXCXNSMA f2, A6, f1, f2 FXCXNSMA f10, A6, f9, f10 fxcpnmsub f3, A7, f1, f3 fxcpnmsub f11, A7, f9, f11 FXCXNSMA f3, A7, f1, f3 FXCXNSMA f11, A7, f9, f11 fxpmul f4, A8, f2 fxpmul f5, A8, f10 FXCXNPMA f2, A8, f2, f4 FXCXNPMA f10, A8, f10, f5 fxcpnmsub f3, A9, f2, f3 fxcpnmsub f11, A9, f10, f11 FXCXNSMA f3, A9, f2, f3 FXCXNSMA f11, A9, f10, f11 fxpmul f6, A10, f3 fxpmul f7, A10, f11 FXCXNPMA f3, A10, f3, f6 FXCXNPMA f11, A10, f11, f7 #endif #ifdef RN LFPDUX A1, BO, INC4 LFPDUX A2, BO2, INC4 add BO, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A1, f0 fxpmul f5, A1, f1 fxpmul f6, A1, f2 fxpmul f7, A1, f3 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 FXCXNPMA f2, A1, f2, f6 FXCXNPMA f3, A1, f3, f7 fxcpnmsub f8, A2, f0, f8 fxcpnmsub f9, A2, f1, f9 fxcpnmsub f10, A2, f2, f10 fxcpnmsub f11, A2, f3, f11 FXCXNSMA f8, A2, f0, f8 FXCXNSMA f9, A2, f1, f9 FXCXNSMA f10, A2, f2, f10 FXCXNSMA f11, A2, f3, f11 fxpmul f4, A3, f8 fxpmul f5, A3, f9 fxpmul f6, A3, f10 fxpmul f7, A3, f11 FXCXNPMA f8, A3, f8, f4 FXCXNPMA f9, A3, f9, f5 FXCXNPMA f10, A3, f10, f6 FXCXNPMA f11, A3, f11, f7 #endif #ifdef RT LFPDUX A1, BO, INC4 add BO2, BO2, INC4 LFPDUX A2, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A3, f8 fxpmul f5, A3, f9 fxpmul f6, A3, f10 fxpmul f7, A3, f11 FXCXNPMA f8, A3, f8, f4 FXCXNPMA f9, A3, f9, f5 FXCXNPMA f10, A3, f10, f6 FXCXNPMA f11, A3, f11, f7 fxcpnmsub f0, A2, f8, f0 fxcpnmsub f1, A2, f9, f1 fxcpnmsub f2, A2, f10, f2 fxcpnmsub f3, A2, f11, f3 FXCXNSMA f0, A2, f8, f0 FXCXNSMA f1, A2, f9, f1 FXCXNSMA f2, A2, f10, f2 FXCXNSMA f3, A2, f11, f3 fxpmul f4, A1, f0 fxpmul f5, A1, f1 fxpmul f6, A1, f2 fxpmul f7, A1, f3 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 FXCXNPMA f2, A1, f2, f6 FXCXNPMA f3, A1, f3, f7 #endif #ifdef LN subi CO1, CO1, 8 * SIZE subi CO2, CO2, 8 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC4 STFPDUX f8, BO2, INC4 STFPDUX f1, BO, INC4 STFPDUX f9, BO2, INC4 STFPDUX f2, BO, INC4 STFPDUX f10, BO2, INC4 STFPDUX f3, BO, INC4 STFPDUX f11, BO2, INC4 subi BO, BO, 16 * SIZE subi BO2, BO2, 16 * SIZE #else STFPDUX f0, AO, INC4 STFPDUX f1, AO2, INC4 STFPDUX f2, AO, INC4 STFPDUX f3, AO2, INC4 STFPDUX f8, AO, INC4 STFPDUX f9, AO2, INC4 STFPDUX f10, AO, INC4 STFPDUX f11, AO2, INC4 subi AO, AO, 16 * SIZE subi AO2, AO2, 16 * SIZE #endif STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f2, CO1, INC STFSDUX f2, CO1, INC STFDUX f3, CO1, INC STFSDUX f3, CO1, INC STFDUX f8, CO2, INC STFSDUX f8, CO2, INC STFDUX f9, CO2, INC STFSDUX f9, CO2, INC STFDUX f10, CO2, INC STFSDUX f10, CO2, INC STFDUX f11, CO2, INC STFSDUX 
f11, CO2, INC #ifdef LN subi CO1, CO1, 8 * SIZE subi CO2, CO2, 8 * SIZE #endif #ifdef RT slwi r0, K, 2 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 2 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 4 #endif #ifdef LN subi KK, KK, 4 #endif addic. I, I, -1 li r0, FZERO lfpsx f0, SP, r0 bgt+ .L11 .align 4 .L20: andi. I, M, 2 beq .L30 #if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, B, - 4 * SIZE fpmr f8, f0 addi BO2, B, - 2 * SIZE fpmr f12, f0 srawi. r0, KK, 2 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 mtspr CTR, r0 fpmr f13, f0 ble .L24 #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 sub TEMP, K, KK addi AO2, AO, 2 * SIZE fpmr f4, f0 addi BO, BO, - 4 * SIZE fpmr f8, f0 addi BO2, BO, 2 * SIZE fpmr f12, f0 fpmr f1, f0 fpmr f5, f0 fpmr f9, f0 fpmr f13, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L24 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX B3, BO, INC4 LFPDUX A4, AO2, INC4 LFPDUX B4, BO2, INC4 LFPDUX A5, AO, INC4 LFPDUX B5, BO, INC4 LFPDUX A6, AO2, INC4 LFPDUX B6, BO2, INC4 LFPDUX A7, AO, INC4 LFPDUX A9, BO, INC4 LFPDUX A10, BO2, INC4 bdz- .L23 .align 4 .L22: FXCPMADD f0, B1, A1, f0 nop FXCSMADD f4, B1, A1, f4 LFPDUX A8, AO2, INC4 FXCPMADD f8, B2, A1, f8 nop FXCSMADD f12, B2, A1, f12 LFPDUX A1, AO, INC4 FXCPMADD f1, B1, A2, f1 nop FXCSMADD f5, B1, A2, f5 LFPDUX B1, BO, INC4 FXCPMADD f9, B2, A2, f9 nop FXCSMADD f13, B2, A2, f13 LFPDUX B2, BO2, INC4 FXCPMADD f0, B3, A3, f0 nop FXCSMADD f4, B3, A3, f4 LFPDUX A2, AO2, INC4 FXCPMADD f8, B4, A3, f8 nop FXCSMADD f12, B4, A3, f12 LFPDUX A3, AO, INC4 FXCPMADD f1, B3, A4, f1 nop FXCSMADD f5, B3, A4, f5 LFPDUX B3, BO, INC4 FXCPMADD f9, B4, A4, f9 nop FXCSMADD f13, B4, A4, f13 LFPDUX B4, BO2, INC4 FXCPMADD f0, B5, A5, f0 nop FXCSMADD f4, B5, A5, f4 LFPDUX A4, AO2, INC4 FXCPMADD f8, B6, A5, f8 nop FXCSMADD f12, B6, A5, f12 LFPDUX A5, AO, INC4 FXCPMADD f1, B5, A6, f1 nop FXCSMADD f5, B5, A6, f5 LFPDUX B5, BO, INC4 FXCPMADD f9, B6, A6, f9 nop FXCSMADD f13, B6, A6, f13 LFPDUX B6, BO2, INC4 FXCPMADD f0, A9, A7, f0 nop FXCSMADD f4, A9, A7, f4 LFPDUX A6, AO2, INC4 FXCPMADD f8, A10, A7, f8 nop FXCSMADD f12, A10, A7, f12 LFPDUX A7, AO, INC4 FXCPMADD f1, A9, A8, f1 nop FXCSMADD f5, A9, A8, f5 LFPDUX A9, BO, INC4 FXCPMADD f9, A10, A8, f9 nop FXCSMADD f13, A10, A8, f13 LFPDUX A10, BO2, INC4 bdnz+ .L22 .align 4 .L23: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 LFPDUX A8, AO2, INC4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 FXCPMADD f0, B3, A3, f0 FXCSMADD f4, B3, A3, f4 FXCPMADD f8, B4, A3, f8 FXCSMADD f12, B4, A3, f12 FXCPMADD f1, B3, A4, f1 FXCSMADD f5, B3, A4, f5 FXCPMADD f9, B4, A4, f9 FXCSMADD f13, B4, A4, f13 FXCPMADD f0, B5, A5, f0 FXCSMADD f4, B5, A5, f4 FXCPMADD f8, B6, A5, f8 FXCSMADD f12, B6, A5, f12 FXCPMADD f1, B5, A6, f1 FXCSMADD f5, B5, A6, f5 FXCPMADD f9, B6, A6, f9 FXCSMADD f13, B6, A6, f13 FXCPMADD f0, A9, A7, f0 FXCSMADD f4, A9, A7, f4 FXCPMADD f8, A10, A7, f8 FXCSMADD f12, A10, A7, f12 FXCPMADD f1, A9, A8, f1 FXCSMADD f5, A9, A8, f5 FXCPMADD f9, A10, A8, f9 FXCSMADD f13, A10, A8, f13 .align 4 .L24: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L28 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L28 #endif LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 bdz- .L27 .align 4 .L26: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 LFPDUX A1, AO, INC4 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 LFPDUX B1, BO, INC4 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 LFPDUX A2, AO2, INC4 LFPDUX B2, BO2, INC4 bdnz+ .L26 .align 4 .L27: FXCPMADD f0, B1, A1, f0 FXCSMADD f4, B1, A1, f4 FXCPMADD f8, B2, A1, f8 FXCSMADD f12, B2, A1, f12 FXCPMADD f1, B1, A2, f1 FXCSMADD f5, B1, A2, f5 FXCPMADD f9, B2, A2, f9 FXCSMADD f13, B2, A2, f13 .align 4 .L28: fpadd f0, f0, f4 fpadd f8, f8, f12 fpadd f1, f1, f5 fpadd f9, f9, f13 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 2 #endif slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 addi AO2, AO, 2 * SIZE addi BO, BO, - 4 * SIZE addi BO2, BO, 2 * SIZE #endif #if defined(LN) || defined(LT) LFPDUX f16, BO, INC4 LFPDUX f18, BO2, INC4 LFPDUX f17, BO, INC4 LFPDUX f19, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE #else LFPDUX f16, AO, INC4 LFPDUX f17, AO2, INC4 LFPDUX f18, AO, INC4 LFPDUX f19, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE #endif fpsub f0, f16, f0 fpsub f1, f17, f1 fpsub f8, f18, f8 fpsub f9, f19, f9 #ifdef LN LFPDUX A1, AO, INC4 add AO2, AO2, INC4 LFPDUX A2, AO, INC4 LFPDUX A3, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE fxpmul f4, A3, f1 fxpmul f5, A3, f9 FXCXNPMA f1, A3, f1, f4 FXCXNPMA f9, A3, f9, f5 fxcpnmsub f0, A2, f1, f0 fxcpnmsub f8, A2, f9, f8 FXCXNSMA f0, A2, f1, f0 FXCXNSMA f8, A2, f9, f8 fxpmul f4, A1, f0 fxpmul f5, A1, f8 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f8, A1, f8, f5 #endif #ifdef LT LFPDUX A1, AO, INC4 LFPDUX A2, AO2, INC4 add AO, AO, INC4 LFPDUX A3, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE fxpmul f4, A1, f0 fxpmul f5, A1, f8 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f8, A1, f8, f5 fxcpnmsub f1, A2, f0, f1 fxcpnmsub f9, A2, f8, f9 FXCXNSMA f1, A2, f0, f1 FXCXNSMA f9, A2, f8, f9 fxpmul f6, A3, f1 fxpmul f7, A3, f9 FXCXNPMA f1, A3, f1, f6 FXCXNPMA f9, A3, f9, f7 #endif #ifdef RN LFPDUX A1, BO, INC4 LFPDUX A2, BO2, INC4 add BO, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A1, f0 fxpmul f5, A1, f1 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 fxcpnmsub f8, A2, f0, f8 fxcpnmsub f9, A2, f1, f9 FXCXNSMA f8, A2, f0, f8 FXCXNSMA f9, A2, f1, f9 fxpmul f4, A3, f8 fxpmul f5, A3, f9 FXCXNPMA f8, A3, f8, f4 FXCXNPMA f9, A3, f9, f5 #endif #ifdef RT LFPDUX A1, BO, INC4 add BO2, BO2, INC4 LFPDUX A2, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A3, f8 fxpmul f5, A3, f9 FXCXNPMA f8, A3, f8, f4 FXCXNPMA f9, A3, f9, f5 fxcpnmsub f0, A2, f8, f0 fxcpnmsub f1, A2, f9, f1 FXCXNSMA f0, A2, f8, f0 FXCXNSMA f1, A2, f9, f1 fxpmul f4, A1, f0 fxpmul f5, A1, f1 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f1, A1, f1, f5 #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFPDUX f0, BO, INC4 STFPDUX f8, BO2, INC4 STFPDUX f1, BO, INC4 STFPDUX f9, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE #else STFPDUX f0, AO, INC4 STFPDUX f1, AO2, INC4 STFPDUX f8, AO, INC4 STFPDUX f9, AO2, INC4 subi AO, AO, 8 * SIZE subi AO2, AO2, 8 * SIZE #endif STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f1, CO1, INC STFSDUX f1, CO1, INC STFDUX f8, CO2, INC STFSDUX f8, CO2, INC STFDUX f9, CO2, INC STFSDUX f9, CO2, INC #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, 
CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, r0 #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L30: andi. I, M, 1 beq .L49 #if defined(LT) || defined(RN) addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, B, - 4 * SIZE fpmr f2, f0 addi BO2, B, - 2 * SIZE fpmr f3, f0 srawi. r0, KK, 2 mtspr CTR, r0 ble .L34 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0 , KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK addi AO2, AO, 2 * SIZE fpmr f1, f0 addi BO, BO, - 4 * SIZE fpmr f2, f0 addi BO2, BO, 2 * SIZE fpmr f3, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 ble .L34 #endif LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 LFPDUX A2, AO2, INC4 LFPDUX B3, BO, INC4 LFPDUX B4, BO2, INC4 LFPDUX A3, AO, INC4 LFPDUX A5, BO, INC4 LFPDUX A6, BO2, INC4 LFPDUX A4, AO2, INC4 LFPDUX A7, BO, INC4 LFPDUX A8, BO2, INC4 bdz- .L33 .align 4 .L32: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX B1, BO, INC4 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 LFPDUX B2, BO2, INC4 LFPDUX A1, AO, INC4 FXCPMADD f0, B3, A2, f0 FXCSMADD f1, B3, A2, f1 LFPDUX B3, BO, INC4 FXCPMADD f2, B4, A2, f2 FXCSMADD f3, B4, A2, f3 LFPDUX B4, BO2, INC4 LFPDUX A2, AO2, INC4 FXCPMADD f0, A5, A3, f0 FXCSMADD f1, A5, A3, f1 LFPDUX A5, BO, INC4 FXCPMADD f2, A6, A3, f2 FXCSMADD f3, A6, A3, f3 LFPDUX A6, BO2, INC4 LFPDUX A3, AO, INC4 FXCPMADD f0, A7, A4, f0 FXCSMADD f1, A7, A4, f1 LFPDUX A7, BO, INC4 FXCPMADD f2, A8, A4, f2 FXCSMADD f3, A8, A4, f3 LFPDUX A8, BO2, INC4 LFPDUX A4, AO2, INC4 bdnz+ .L32 .align 4 .L33: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 FXCPMADD f0, B3, A2, f0 FXCSMADD f1, B3, A2, f1 FXCPMADD f2, B4, A2, f2 FXCSMADD f3, B4, A2, f3 FXCPMADD f0, A5, A3, f0 FXCSMADD f1, A5, A3, f1 FXCPMADD f2, A6, A3, f2 FXCSMADD f3, A6, A3, f3 FXCPMADD f0, A7, A4, f0 FXCSMADD f1, A7, A4, f1 FXCPMADD f2, A8, A4, f2 FXCSMADD f3, A8, A4, f3 .align 4 .L34: #if defined(LT) || defined(RN) andi. r0, KK, 3 mtspr CTR, r0 ble+ .L38 #else andi. 
r0, TEMP, 3 mtspr CTR, r0 ble+ .L38 #endif LFPDX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdz- .L37 .align 4 .L36: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 LFPDUX B1, BO, INC4 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 LFPDX A1, AO, INC4 LFPDUX B2, BO2, INC4 add AO, AO, INC2 bdnz+ .L36 .align 4 .L37: FXCPMADD f0, B1, A1, f0 FXCSMADD f1, B1, A1, f1 FXCPMADD f2, B2, A1, f2 FXCSMADD f3, B2, A1, f3 .align 4 .L38: fpadd f0, f0, f1 fpadd f2, f2, f3 #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + ZBASE_SHIFT slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 addi BO, BO, - 4 * SIZE #endif addi AO2, AO, 2 * SIZE addi BO2, BO, 2 * SIZE #if defined(LN) || defined(LT) LFPDX f16, BO, INC4 LFPDX f17, BO2, INC4 #else LFPDX f16, AO, INC4 LFPDX f17, AO2, INC4 #endif fpsub f0, f16, f0 fpsub f2, f17, f2 #ifdef LN LFPDX A1, AO, INC4 fxpmul f4, A1, f0 fxpmul f5, A1, f2 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f2, A1, f2, f5 #endif #ifdef LT LFPDX A1, AO, INC4 fxpmul f4, A1, f0 fxpmul f5, A1, f2 FXCXNPMA f0, A1, f0, f4 FXCXNPMA f2, A1, f2, f5 #endif #ifdef RN LFPDUX A1, BO, INC4 LFPDUX A2, BO2, INC4 add BO, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 fxcpnmsub f2, A2, f0, f2 FXCXNSMA f2, A2, f0, f2 fxpmul f4, A3, f2 FXCXNPMA f2, A3, f2, f4 #endif #ifdef RT LFPDUX A1, BO, INC4 add BO2, BO2, INC4 LFPDUX A2, BO, INC4 LFPDUX A3, BO2, INC4 subi BO, BO, 8 * SIZE subi BO2, BO2, 8 * SIZE fxpmul f4, A3, f2 FXCXNPMA f2, A3, f2, f4 fxcpnmsub f0, A2, f2, f0 FXCXNSMA f0, A2, f2, f0 fxpmul f4, A1, f0 FXCXNPMA f0, A1, f0, f4 #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFPDX f0, BO, INC4 STFPDX f2, BO2, INC4 #else STFPDX f0, AO, INC4 STFPDX f2, AO2, INC4 #endif STFDUX f0, CO1, INC STFSDUX f0, CO1, INC STFDUX f2, CO2, INC STFSDUX f2, CO2, INC #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif li r0, FZERO lfpsx f0, SP, r0 .align 4 .L49: #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) addi B, BO, 4 * SIZE #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif addic. 
J, J, -1 bgt+ .L10 .align 4 .L999: addi SP, SP, 20 lwzu r14, 4(SP) lwzu r15, 4(SP) lwzu r16, 4(SP) lwzu r17, 4(SP) lwzu r18, 4(SP) lwzu r19, 4(SP) lwzu r20, 4(SP) lwzu r21, 4(SP) lwzu r22, 4(SP) lwzu r23, 4(SP) lwzu r24, 4(SP) lwzu r25, 4(SP) lwzu r26, 4(SP) lwzu r27, 4(SP) lwzu r28, 4(SP) lwzu r29, 4(SP) lwzu r30, 4(SP) lwzu r31, 4(SP) subi SP, SP, 12 li r0, 16 lfpdux f31, SP, r0 lfpdux f30, SP, r0 lfpdux f29, SP, r0 lfpdux f28, SP, r0 lfpdux f27, SP, r0 lfpdux f26, SP, r0 lfpdux f25, SP, r0 lfpdux f24, SP, r0 lfpdux f23, SP, r0 lfpdux f22, SP, r0 lfpdux f21, SP, r0 lfpdux f20, SP, r0 lfpdux f19, SP, r0 lfpdux f18, SP, r0 lfpdux f17, SP, r0 lfpdux f16, SP, r0 lfpdux f15, SP, r0 lfpdux f14, SP, r0 addi SP, SP, 16 blr .align 4 EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/ztrsm_kernel_power6_LN.S000066400000000000000000002453041313527062700223260ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define FZERO 312(SP) #else #define STACKSIZE 256 #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #define OFFSET r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #define AORIG r19 #define TEMP r20 #define KK r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define CO1 r26 #define CO2 r27 #define CO3 r28 #define CO4 r29 #define PREA r30 #define PREC r31 #ifndef CONJ #define FMA1 FMADD #define FMA2 FMADD #define FMA3 FNMSUB #define FMA4 FMADD #elif defined(LN) || defined(LT) #define FMA1 FMADD #define FMA2 FMADD #define FMA3 FMADD #define FMA4 FNMSUB #else #define FMA1 FMADD #define FMA2 FNMSUB #define FMA3 FMADD #define FMA4 FMADD #endif #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) #endif stw r0, FZERO #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif slwi LDC, LDC, ZBASE_SHIFT #ifdef LN mullw r0, M, K slwi r0, r0, ZBASE_SHIFT add A, A, r0 slwi r0, M, ZBASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, ZBASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) li PREA, 48 * SIZE li PREC, -4 * SIZE srawi. 
J, N, 2 ble LL(30) .align 4 LL(10): #ifdef RT slwi r0, K, 2 + ZBASE_SHIFT sub B, B, r0 slwi r0, LDC, 2 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO4, LDC #endif andi. I, M, 1 ble LL(20) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 2 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(25) .align 4 LL(22): FMA1 f0, f16, f20, f0 FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 FMA3 f2, f17, f21, f2 LFD f28, 4 * SIZE(AO) LFD f29, 5 * SIZE(AO) LFD f30, 6 * SIZE(AO) LFD f31, 7 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 FMA3 f6, f17, f23, f6 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMA1 f8, f16, f24, f8 FMA4 f11, f17, f24, f11 FMA2 f9, f16, f25, f9 FMA3 f10, f17, f25, f10 FMA1 f12, f16, f26, f12 FMA4 f15, f17, f26, f15 FMA2 f13, f16, f27, f13 FMA3 f14, f17, f27, f14 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMA1 f0, f18, f20, f0 FMA4 f3, f19, f20, f3 FMA2 f1, f18, f21, f1 FMA3 f2, f19, f21, f2 FMA1 f4, f18, f22, f4 FMA4 f7, f19, f22, f7 FMA2 f5, f18, f23, f5 FMA3 f6, f19, f23, f6 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMA1 f8, f18, f24, f8 FMA4 f11, f19, f24, f11 FMA2 f9, f18, f25, f9 FMA3 f10, f19, f25, f10 FMA1 f12, f18, f26, f12 FMA4 f15, f19, f26, f15 FMA2 f13, f18, f27, f13 FMA3 f14, f19, f27, f14 LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) FMA1 f0, f28, f20, f0 FMA4 f3, f29, f20, f3 FMA2 f1, f28, f21, f1 FMA3 f2, f29, f21, f2 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMA1 f4, f28, f22, f4 FMA4 f7, f29, f22, f7 FMA2 f5, f28, f23, f5 FMA3 f6, f29, f23, f6 LFD f20, 24 * SIZE(BO) LFD f21, 25 * SIZE(BO) LFD f22, 26 * SIZE(BO) LFD f23, 27 * SIZE(BO) FMA1 f8, f28, f24, f8 FMA4 f11, f29, f24, f11 FMA2 f9, f28, f25, f9 FMA3 f10, f29, f25, f10 FMA1 f12, f28, f26, f12 FMA4 f15, f29, f26, f15 FMA2 f13, f28, f27, f13 FMA3 f14, f29, f27, f14 LFD f24, 28 * SIZE(BO) LFD f25, 29 * SIZE(BO) LFD f26, 30 * SIZE(BO) LFD f27, 31 * SIZE(BO) FMA1 f0, f30, f20, f0 FMA4 f3, f31, f20, f3 FMA2 f1, f30, f21, f1 FMA3 f2, f31, f21, f2 FMA1 f4, f30, f22, f4 FMA4 f7, f31, f22, f7 FMA2 f5, f30, f23, f5 FMA3 f6, f31, f23, f6 LFD f20, 32 * SIZE(BO) LFD f21, 33 * SIZE(BO) LFD f22, 34 * SIZE(BO) LFD f23, 35 * SIZE(BO) FMA1 f8, f30, f24, f8 FMA4 f11, f31, f24, f11 FMA2 f9, f30, f25, 
f9 FMA3 f10, f31, f25, f10 FMA1 f12, f30, f26, f12 FMA4 f15, f31, f26, f15 FMA2 f13, f30, f27, f13 FMA3 f14, f31, f27, f14 LFD f24, 36 * SIZE(BO) LFD f25, 37 * SIZE(BO) LFD f26, 38 * SIZE(BO) LFD f27, 39 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 32 * SIZE bdnz LL(22) .align 4 LL(25): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(27) .align 4 LL(26): FMA1 f0, f16, f20, f0 FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 FMA3 f2, f17, f21, f2 FMA1 f4, f16, f22, f4 FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 FMA3 f6, f17, f23, f6 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMA1 f8, f16, f24, f8 FMA4 f11, f17, f24, f11 FMA2 f9, f16, f25, f9 FMA3 f10, f17, f25, f10 FMA1 f12, f16, f26, f12 FMA4 f15, f17, f26, f15 FMA2 f13, f16, f27, f13 FMA3 f14, f17, f27, f14 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 8 * SIZE bdnz LL(26) .align 4 LL(27): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 4 #endif slwi TEMP, r0, 0 + ZBASE_SHIFT slwi r0, r0, 2 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif FADD f0, f0, f2 FADD f1, f1, f3 FADD f4, f4, f6 FADD f5, f5, f7 FADD f8, f8, f10 FADD f9, f9, f11 FADD f12, f12, f14 FADD f13, f13, f15 #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f18, f4 FSUB f5, f19, f5 LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f8, f20, f8 FSUB f9, f21, f9 FSUB f12, f22, f12 FSUB f13, f23, f13 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f20, f4 FSUB f5, f21, f5 LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f28, 6 * SIZE(AO) LFD f29, 7 * SIZE(AO) FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f12, f28, f12 FSUB f13, f29, f13 #endif #ifdef LN LFD f28, 0 * SIZE(AO) LFD f29, 1 * SIZE(AO) FMUL f16, f29, f1 FMUL f17, f29, f0 FMUL f18, f29, f5 FMUL f19, f29, f4 FMUL f20, f29, f9 FMUL f21, f29, f8 FMUL f22, f29, f13 FMUL f23, f29, f12 #ifndef CONJ FMSUB f0, f28, f0, f16 FMADD f1, f28, f1, f17 FMSUB f4, f28, f4, f18 FMADD f5, f28, f5, f19 FMSUB f8, f28, f8, f20 FMADD f9, f28, f9, f21 FMSUB f12, f28, f12, f22 FMADD f13, f28, f13, f23 #else FMADD f0, f28, f0, f16 FMSUB f1, f28, f1, f17 FMADD f4, f28, f4, f18 FMSUB f5, f28, f5, f19 FMADD f8, f28, f8, f20 FMSUB f9, f28, f9, f21 FMADD f12, f28, f12, f22 FMSUB f13, f28, f13, f23 #endif #endif #ifdef LT LFD f24, 0 * SIZE(AO) LFD f25, 1 * SIZE(AO) FMUL f16, f25, f1 FMUL f17, f25, f0 FMUL f18, f25, f5 FMUL f19, f25, f4 FMUL f20, f25, f9 FMUL f21, f25, f8 FMUL f22, f25, f13 FMUL f23, f25, f12 #ifndef CONJ FMSUB f0, f24, f0, f16 FMADD f1, f24, f1, f17 FMSUB f4, f24, f4, f18 FMADD f5, f24, f5, f19 FMSUB f8, f24, f8, f20 FMADD f9, f24, f9, f21 FMSUB f12, f24, f12, f22 FMADD f13, f24, f13, f23 #else FMADD f0, f24, f0, f16 FMSUB f1, f24, f1, f17 FMADD f4, f24, f4, f18 FMSUB f5, f24, f5, f19 FMADD f8, f24, f8, f20 FMSUB f9, f24, f9, f21 FMADD f12, f24, f12, f22 FMSUB f13, f24, f13, f23 #endif #endif #ifdef RN LFD f24, 0 * SIZE(BO) LFD f25, 1 * SIZE(BO) LFD f26, 2 * SIZE(BO) LFD f27, 3 * SIZE(BO) LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) LFD f31, 7 * SIZE(BO) FMUL f16, f25, f1 FMUL f17, f25, f0 #ifndef CONJ FMSUB f0, f24, f0, f16 FMADD f1, 
f24, f1, f17 FMADD f4, f27, f1, f4 FNMSUB f5, f27, f0, f5 FNMSUB f4, f26, f0, f4 FNMSUB f5, f26, f1, f5 FMADD f8, f29, f1, f8 FNMSUB f9, f29, f0, f9 FNMSUB f8, f28, f0, f8 FNMSUB f9, f28, f1, f9 FMADD f12, f31, f1, f12 FNMSUB f13, f31, f0, f13 FNMSUB f12, f30, f0, f12 FNMSUB f13, f30, f1, f13 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMSUB f4, f26, f4, f16 FMADD f5, f26, f5, f17 FMADD f8, f29, f5, f8 FNMSUB f9, f29, f4, f9 FNMSUB f8, f28, f4, f8 FNMSUB f9, f28, f5, f9 FMADD f12, f31, f5, f12 FNMSUB f13, f31, f4, f13 FNMSUB f12, f30, f4, f12 FNMSUB f13, f30, f5, f13 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 22 * SIZE(BO) LFD f29, 23 * SIZE(BO) LFD f30, 30 * SIZE(BO) LFD f31, 31 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMSUB f8, f26, f8, f16 FMADD f9, f26, f9, f17 FMADD f12, f29, f9, f12 FNMSUB f13, f29, f8, f13 FNMSUB f12, f28, f8, f12 FNMSUB f13, f28, f9, f13 FMUL f16, f31, f13 FMUL f17, f31, f12 FMSUB f12, f30, f12, f16 FMADD f13, f30, f13, f17 #else FMADD f0, f24, f0, f16 FMSUB f1, f24, f1, f17 FMSUB f4, f27, f1, f4 FNMADD f5, f27, f0, f5 FNMADD f4, f26, f0, f4 FNMADD f5, f26, f1, f5 FMSUB f8, f29, f1, f8 FNMADD f9, f29, f0, f9 FNMADD f8, f28, f0, f8 FNMADD f9, f28, f1, f9 FMSUB f12, f31, f1, f12 FNMADD f13, f31, f0, f13 FNMADD f12, f30, f0, f12 FNMADD f13, f30, f1, f13 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMADD f4, f26, f4, f16 FMSUB f5, f26, f5, f17 FMSUB f8, f29, f5, f8 FNMADD f9, f29, f4, f9 FNMADD f8, f28, f4, f8 FNMADD f9, f28, f5, f9 FMSUB f12, f31, f5, f12 FNMADD f13, f31, f4, f13 FNMADD f12, f30, f4, f12 FNMADD f13, f30, f5, f13 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 22 * SIZE(BO) LFD f29, 23 * SIZE(BO) LFD f30, 30 * SIZE(BO) LFD f31, 31 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMADD f8, f26, f8, f16 FMSUB f9, f26, f9, f17 FMSUB f12, f29, f9, f12 FNMADD f13, f29, f8, f13 FNMADD f12, f28, f8, f12 FNMADD f13, f28, f9, f13 FMUL f16, f31, f13 FMUL f17, f31, f12 FMADD f12, f30, f12, f16 FMSUB f13, f30, f13, f17 #endif #endif #ifdef RT LFD f24, 30 * SIZE(BO) LFD f25, 31 * SIZE(BO) LFD f26, 28 * SIZE(BO) LFD f27, 29 * SIZE(BO) LFD f28, 26 * SIZE(BO) LFD f29, 27 * SIZE(BO) LFD f30, 24 * SIZE(BO) LFD f31, 25 * SIZE(BO) FMUL f16, f25, f13 FMUL f17, f25, f12 #ifndef CONJ FMSUB f12, f24, f12, f16 FMADD f13, f24, f13, f17 FMADD f8, f27, f13, f8 FNMSUB f9, f27, f12, f9 FNMSUB f8, f26, f12, f8 FNMSUB f9, f26, f13, f9 FMADD f4, f29, f13, f4 FNMSUB f5, f29, f12, f5 FNMSUB f4, f28, f12, f4 FNMSUB f5, f28, f13, f5 FMADD f0, f31, f13, f0 FNMSUB f1, f31, f12, f1 FNMSUB f0, f30, f12, f0 FNMSUB f1, f30, f13, f1 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 18 * SIZE(BO) LFD f29, 19 * SIZE(BO) LFD f30, 16 * SIZE(BO) LFD f31, 17 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMSUB f8, f26, f8, f16 FMADD f9, f26, f9, f17 FMADD f4, f29, f9, f4 FNMSUB f5, f29, f8, f5 FNMSUB f4, f28, f8, f4 FNMSUB f5, f28, f9, f5 FMADD f0, f31, f9, f0 FNMSUB f1, f31, f8, f1 FNMSUB f0, f30, f8, f0 FNMSUB f1, f30, f9, f1 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 8 * SIZE(BO) LFD f29, 9 * SIZE(BO) LFD f30, 0 * SIZE(BO) LFD f31, 1 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMSUB f4, f26, f4, f16 FMADD f5, f26, f5, f17 FMADD f0, f29, f5, f0 FNMSUB f1, f29, f4, f1 FNMSUB f0, f28, f4, f0 FNMSUB f1, 
f28, f5, f1 FMUL f16, f31, f1 FMUL f17, f31, f0 FMSUB f0, f30, f0, f16 FMADD f1, f30, f1, f17 #else FMADD f12, f24, f12, f16 FMSUB f13, f24, f13, f17 FMSUB f8, f27, f13, f8 FNMADD f9, f27, f12, f9 FNMADD f8, f26, f12, f8 FNMADD f9, f26, f13, f9 FMSUB f4, f29, f13, f4 FNMADD f5, f29, f12, f5 FNMADD f4, f28, f12, f4 FNMADD f5, f28, f13, f5 FMSUB f0, f31, f13, f0 FNMADD f1, f31, f12, f1 FNMADD f0, f30, f12, f0 FNMADD f1, f30, f13, f1 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 18 * SIZE(BO) LFD f29, 19 * SIZE(BO) LFD f30, 16 * SIZE(BO) LFD f31, 17 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMADD f8, f26, f8, f16 FMSUB f9, f26, f9, f17 FMSUB f4, f29, f9, f4 FNMADD f5, f29, f8, f5 FNMADD f4, f28, f8, f4 FNMADD f5, f28, f9, f5 FMSUB f0, f31, f9, f0 FNMADD f1, f31, f8, f1 FNMADD f0, f30, f8, f0 FNMADD f1, f30, f9, f1 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 8 * SIZE(BO) LFD f29, 9 * SIZE(BO) LFD f30, 0 * SIZE(BO) LFD f31, 1 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMADD f4, f26, f4, f16 FMSUB f5, f26, f5, f17 FMSUB f0, f29, f5, f0 FNMADD f1, f29, f4, f1 FNMADD f0, f28, f4, f0 FNMADD f1, f28, f5, f1 FMUL f16, f31, f1 FMUL f17, f31, f0 FMADD f0, f30, f0, f16 FMSUB f1, f30, f1, f17 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE subi CO3, CO3, 2 * SIZE subi CO4, CO4, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f4, 2 * SIZE(BO) STFD f5, 3 * SIZE(BO) STFD f8, 4 * SIZE(BO) STFD f9, 5 * SIZE(BO) STFD f12, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f4, 2 * SIZE(AO) STFD f5, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f12, 6 * SIZE(AO) STFD f13, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 2 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 LL(20): srawi. I, M, 1 ble LL(29) .align 4 LL(11): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(B) LFD f18, 2 * SIZE(AO) LFD f22, 2 * SIZE(B) LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC srawi. r0, KK, 3 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 2 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) LFD f18, 2 * SIZE(AO) LFD f22, 2 * SIZE(BO) LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC srawi. 
r0, TEMP, 3 mtspr CTR, r0 #endif ble LL(15) .align 4 LL(12): dcbt AO, PREA dcbtst BO, PREA FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 LFD f28, 4 * SIZE(AO) LFD f29, 5 * SIZE(AO) LFD f30, 6 * SIZE(AO) LFD f31, 7 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA1 f8, f16, f24, f8 FMA1 f10, f18, f24, f10 FMA2 f9, f16, f25, f9 FMA2 f11, f18, f25, f11 FMA1 f12, f16, f26, f12 FMA1 f14, f18, f26, f14 FMA2 f13, f16, f27, f13 FMA2 f15, f18, f27, f15 FMA4 f1, f17, f20, f1 FMA4 f3, f19, f20, f3 FMA3 f0, f17, f21, f0 FMA3 f2, f19, f21, f2 FMA4 f5, f17, f22, f5 FMA4 f7, f19, f22, f7 FMA3 f4, f17, f23, f4 FMA3 f6, f19, f23, f6 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMA4 f9, f17, f24, f9 FMA4 f11, f19, f24, f11 FMA3 f8, f17, f25, f8 FMA3 f10, f19, f25, f10 FMA4 f13, f17, f26, f13 FMA4 f15, f19, f26, f15 FMA3 f12, f17, f27, f12 FMA3 f14, f19, f27, f14 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMA1 f0, f28, f20, f0 FMA1 f2, f30, f20, f2 FMA2 f1, f28, f21, f1 FMA2 f3, f30, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMA1 f4, f28, f22, f4 FMA1 f6, f30, f22, f6 FMA2 f5, f28, f23, f5 FMA2 f7, f30, f23, f7 FMA1 f8, f28, f24, f8 FMA1 f10, f30, f24, f10 FMA2 f9, f28, f25, f9 FMA2 f11, f30, f25, f11 FMA1 f12, f28, f26, f12 FMA1 f14, f30, f26, f14 FMA2 f13, f28, f27, f13 FMA2 f15, f30, f27, f15 FMA4 f1, f29, f20, f1 FMA4 f3, f31, f20, f3 FMA3 f0, f29, f21, f0 FMA3 f2, f31, f21, f2 FMA4 f5, f29, f22, f5 FMA4 f7, f31, f22, f7 FMA3 f4, f29, f23, f4 FMA3 f6, f31, f23, f6 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMA4 f9, f29, f24, f9 FMA4 f11, f31, f24, f11 FMA3 f8, f29, f25, f8 FMA3 f10, f31, f25, f10 FMA4 f13, f29, f26, f13 FMA4 f15, f31, f26, f15 FMA3 f12, f29, f27, f12 FMA3 f14, f31, f27, f14 LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 LFD f28, 12 * SIZE(AO) LFD f29, 13 * SIZE(AO) LFD f30, 14 * SIZE(AO) LFD f31, 15 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA1 f8, f16, f24, f8 FMA1 f10, f18, f24, f10 FMA2 f9, f16, f25, f9 FMA2 f11, f18, f25, f11 FMA1 f12, f16, f26, f12 FMA1 f14, f18, f26, f14 FMA2 f13, f16, f27, f13 FMA2 f15, f18, f27, f15 FMA4 f1, f17, f20, f1 FMA4 f3, f19, f20, f3 FMA3 f0, f17, f21, f0 FMA3 f2, f19, f21, f2 FMA4 f5, f17, f22, f5 FMA4 f7, f19, f22, f7 FMA3 f4, f17, f23, f4 FMA3 f6, f19, f23, f6 LFD f20, 24 * SIZE(BO) LFD f21, 25 * SIZE(BO) LFD f22, 26 * SIZE(BO) LFD f23, 27 * SIZE(BO) FMA4 f9, f17, f24, f9 FMA4 f11, f19, f24, f11 FMA3 f8, f17, f25, f8 FMA3 f10, f19, f25, f10 FMA4 f13, f17, f26, f13 FMA4 f15, f19, f26, f15 FMA3 f12, f17, f27, f12 FMA3 f14, f19, f27, f14 LFD f24, 28 * SIZE(BO) LFD f25, 29 * SIZE(BO) LFD f26, 30 * SIZE(BO) LFD f27, 31 * SIZE(BO) FMA1 f0, f28, f20, f0 FMA1 f2, f30, f20, f2 FMA2 f1, f28, f21, f1 FMA2 f3, f30, f21, f3 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) FMA1 f4, f28, f22, f4 FMA1 f6, f30, f22, f6 FMA2 f5, f28, f23, f5 FMA2 f7, f30, f23, f7 FMA1 f8, f28, f24, f8 FMA1 f10, f30, f24, f10 FMA2 f9, f28, f25, f9 FMA2 f11, f30, f25, f11 FMA1 f12, f28, f26, f12 FMA1 f14, f30, f26, f14 FMA2 f13, f28, f27, f13 FMA2 f15, f30, f27, f15 
FMA4 f1, f29, f20, f1 FMA4 f3, f31, f20, f3 FMA3 f0, f29, f21, f0 FMA3 f2, f31, f21, f2 FMA4 f5, f29, f22, f5 FMA4 f7, f31, f22, f7 FMA3 f4, f29, f23, f4 FMA3 f6, f31, f23, f6 LFD f20, 32 * SIZE(BO) LFD f21, 33 * SIZE(BO) LFD f22, 34 * SIZE(BO) LFD f23, 35 * SIZE(BO) FMA4 f9, f29, f24, f9 FMA4 f11, f31, f24, f11 FMA3 f8, f29, f25, f8 FMA3 f10, f31, f25, f10 FMA4 f13, f29, f26, f13 FMA4 f15, f31, f26, f15 FMA3 f12, f29, f27, f12 FMA3 f14, f31, f27, f14 LFD f24, 36 * SIZE(BO) LFD f25, 37 * SIZE(BO) LFD f26, 38 * SIZE(BO) LFD f27, 39 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 LFD f28, 20 * SIZE(AO) LFD f29, 21 * SIZE(AO) LFD f30, 22 * SIZE(AO) LFD f31, 23 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA1 f8, f16, f24, f8 FMA1 f10, f18, f24, f10 FMA2 f9, f16, f25, f9 FMA2 f11, f18, f25, f11 FMA1 f12, f16, f26, f12 FMA1 f14, f18, f26, f14 FMA2 f13, f16, f27, f13 FMA2 f15, f18, f27, f15 FMA4 f1, f17, f20, f1 FMA4 f3, f19, f20, f3 FMA3 f0, f17, f21, f0 FMA3 f2, f19, f21, f2 FMA4 f5, f17, f22, f5 FMA4 f7, f19, f22, f7 FMA3 f4, f17, f23, f4 FMA3 f6, f19, f23, f6 LFD f20, 40 * SIZE(BO) LFD f21, 41 * SIZE(BO) LFD f22, 42 * SIZE(BO) LFD f23, 43 * SIZE(BO) FMA4 f9, f17, f24, f9 FMA4 f11, f19, f24, f11 FMA3 f8, f17, f25, f8 FMA3 f10, f19, f25, f10 FMA4 f13, f17, f26, f13 FMA4 f15, f19, f26, f15 FMA3 f12, f17, f27, f12 FMA3 f14, f19, f27, f14 LFD f24, 44 * SIZE(BO) LFD f25, 45 * SIZE(BO) LFD f26, 46 * SIZE(BO) LFD f27, 47 * SIZE(BO) FMA1 f0, f28, f20, f0 FMA1 f2, f30, f20, f2 FMA2 f1, f28, f21, f1 FMA2 f3, f30, f21, f3 LFD f16, 24 * SIZE(AO) LFD f17, 25 * SIZE(AO) LFD f18, 26 * SIZE(AO) LFD f19, 27 * SIZE(AO) FMA1 f4, f28, f22, f4 FMA1 f6, f30, f22, f6 FMA2 f5, f28, f23, f5 FMA2 f7, f30, f23, f7 FMA1 f8, f28, f24, f8 FMA1 f10, f30, f24, f10 FMA2 f9, f28, f25, f9 FMA2 f11, f30, f25, f11 FMA1 f12, f28, f26, f12 FMA1 f14, f30, f26, f14 FMA2 f13, f28, f27, f13 FMA2 f15, f30, f27, f15 FMA4 f1, f29, f20, f1 FMA4 f3, f31, f20, f3 FMA3 f0, f29, f21, f0 FMA3 f2, f31, f21, f2 FMA4 f5, f29, f22, f5 FMA4 f7, f31, f22, f7 FMA3 f4, f29, f23, f4 FMA3 f6, f31, f23, f6 LFD f20, 48 * SIZE(BO) LFD f21, 49 * SIZE(BO) LFD f22, 50 * SIZE(BO) LFD f23, 51 * SIZE(BO) FMA4 f9, f29, f24, f9 FMA4 f11, f31, f24, f11 FMA3 f8, f29, f25, f8 FMA3 f10, f31, f25, f10 FMA4 f13, f29, f26, f13 FMA4 f15, f31, f26, f15 FMA3 f12, f29, f27, f12 FMA3 f14, f31, f27, f14 LFD f24, 52 * SIZE(BO) LFD f25, 53 * SIZE(BO) LFD f26, 54 * SIZE(BO) LFD f27, 55 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 LFD f28, 28 * SIZE(AO) LFD f29, 29 * SIZE(AO) LFD f30, 30 * SIZE(AO) LFD f31, 31 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA1 f8, f16, f24, f8 FMA1 f10, f18, f24, f10 FMA2 f9, f16, f25, f9 FMA2 f11, f18, f25, f11 FMA1 f12, f16, f26, f12 FMA1 f14, f18, f26, f14 FMA2 f13, f16, f27, f13 FMA2 f15, f18, f27, f15 FMA4 f1, f17, f20, f1 FMA4 f3, f19, f20, f3 FMA3 f0, f17, f21, f0 FMA3 f2, f19, f21, f2 FMA4 f5, f17, f22, f5 FMA4 f7, f19, f22, f7 FMA3 f4, f17, f23, f4 FMA3 f6, f19, f23, f6 LFD f20, 56 * SIZE(BO) LFD f21, 57 * SIZE(BO) LFD f22, 58 * SIZE(BO) LFD f23, 59 * SIZE(BO) FMA4 f9, f17, f24, f9 FMA4 f11, f19, f24, f11 FMA3 f8, f17, f25, f8 FMA3 f10, f19, f25, f10 FMA4 f13, f17, f26, f13 FMA4 f15, f19, f26, f15 FMA3 f12, f17, f27, f12 FMA3 f14, f19, f27, f14 LFD f24, 60 * SIZE(BO) LFD f25, 61 * SIZE(BO) LFD f26, 62 * SIZE(BO) LFD f27, 63 * 
SIZE(BO) FMA1 f0, f28, f20, f0 FMA1 f2, f30, f20, f2 FMA2 f1, f28, f21, f1 FMA2 f3, f30, f21, f3 LFD f16, 32 * SIZE(AO) LFD f17, 33 * SIZE(AO) LFD f18, 34 * SIZE(AO) LFD f19, 35 * SIZE(AO) FMA1 f4, f28, f22, f4 FMA1 f6, f30, f22, f6 FMA2 f5, f28, f23, f5 FMA2 f7, f30, f23, f7 FMA1 f8, f28, f24, f8 FMA1 f10, f30, f24, f10 FMA2 f9, f28, f25, f9 FMA2 f11, f30, f25, f11 FMA1 f12, f28, f26, f12 FMA1 f14, f30, f26, f14 FMA2 f13, f28, f27, f13 FMA2 f15, f30, f27, f15 FMA4 f1, f29, f20, f1 FMA4 f3, f31, f20, f3 FMA3 f0, f29, f21, f0 FMA3 f2, f31, f21, f2 FMA4 f5, f29, f22, f5 FMA4 f7, f31, f22, f7 FMA3 f4, f29, f23, f4 FMA3 f6, f31, f23, f6 LFD f20, 64 * SIZE(BO) LFD f21, 65 * SIZE(BO) LFD f22, 66 * SIZE(BO) LFD f23, 67 * SIZE(BO) FMA4 f9, f29, f24, f9 FMA4 f11, f31, f24, f11 FMA3 f8, f29, f25, f8 FMA3 f10, f31, f25, f10 FMA4 f13, f29, f26, f13 FMA4 f15, f31, f26, f15 FMA3 f12, f29, f27, f12 FMA3 f14, f31, f27, f14 LFD f24, 68 * SIZE(BO) LFD f25, 69 * SIZE(BO) LFD f26, 70 * SIZE(BO) LFD f27, 71 * SIZE(BO) addi AO, AO, 32 * SIZE addi BO, BO, 64 * SIZE bdnz LL(12) .align 4 LL(15): #if defined(LT) || defined(RN) andi. r0, KK, 7 #else andi. r0, TEMP, 7 #endif mtspr CTR, r0 ble LL(18) .align 4 LL(16): FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA1 f8, f16, f24, f8 FMA1 f10, f18, f24, f10 FMA2 f9, f16, f25, f9 FMA2 f11, f18, f25, f11 FMA1 f12, f16, f26, f12 FMA1 f14, f18, f26, f14 FMA2 f13, f16, f27, f13 FMA2 f15, f18, f27, f15 FMA4 f1, f17, f20, f1 FMA4 f3, f19, f20, f3 FMA3 f0, f17, f21, f0 FMA3 f2, f19, f21, f2 FMA4 f5, f17, f22, f5 FMA4 f7, f19, f22, f7 FMA3 f4, f17, f23, f4 FMA3 f6, f19, f23, f6 FMA4 f9, f17, f24, f9 FMA4 f11, f19, f24, f11 FMA3 f8, f17, f25, f8 FMA3 f10, f19, f25, f10 FMA4 f13, f17, f26, f13 FMA4 f15, f19, f26, f15 FMA3 f12, f17, f27, f12 FMA3 f14, f19, f27, f14 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(16) .align 4 LL(18): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 4 #endif slwi TEMP, r0, 1 + ZBASE_SHIFT slwi r0, r0, 2 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f18, f4 FSUB f5, f19, f5 LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f8, f20, f8 FSUB f9, f21, f9 FSUB f12, f22, f12 FSUB f13, f23, f13 LFD f24, 8 * SIZE(BO) LFD f25, 9 * SIZE(BO) LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) FSUB f2, f24, f2 FSUB f3, f25, f3 FSUB f6, f26, f6 FSUB f7, f27, f7 LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FSUB f10, f28, f10 FSUB f11, f29, f11 FSUB f14, f30, f14 FSUB f15, f31, f15 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 LFD f24, 8 * SIZE(AO) LFD f25, 9 * SIZE(AO) LFD f26, 10 * SIZE(AO) LFD f27, 11 * SIZE(AO) FSUB f8, 
f24, f8 FSUB f9, f25, f9 FSUB f10, f26, f10 FSUB f11, f27, f11 LFD f28, 12 * SIZE(AO) LFD f29, 13 * SIZE(AO) LFD f30, 14 * SIZE(AO) LFD f31, 15 * SIZE(AO) FSUB f12, f28, f12 FSUB f13, f29, f13 FSUB f14, f30, f14 FSUB f15, f31, f15 #endif #ifdef LN LFD f24, 6 * SIZE(AO) LFD f25, 7 * SIZE(AO) LFD f26, 4 * SIZE(AO) LFD f27, 5 * SIZE(AO) LFD f28, 0 * SIZE(AO) LFD f29, 1 * SIZE(AO) FMUL f16, f25, f3 FMUL f17, f25, f2 FMUL f18, f25, f7 FMUL f19, f25, f6 FMUL f20, f25, f11 FMUL f21, f25, f10 FMUL f22, f25, f15 FMUL f23, f25, f14 #ifndef CONJ FMSUB f2, f24, f2, f16 FMADD f3, f24, f3, f17 FMSUB f6, f24, f6, f18 FMADD f7, f24, f7, f19 FMSUB f10, f24, f10, f20 FMADD f11, f24, f11, f21 FMSUB f14, f24, f14, f22 FMADD f15, f24, f15, f23 FMADD f0, f27, f3, f0 FNMSUB f1, f27, f2, f1 FMADD f4, f27, f7, f4 FNMSUB f5, f27, f6, f5 FMADD f8, f27, f11, f8 FNMSUB f9, f27, f10, f9 FMADD f12, f27, f15, f12 FNMSUB f13, f27, f14, f13 FNMSUB f0, f26, f2, f0 FNMSUB f1, f26, f3, f1 FNMSUB f4, f26, f6, f4 FNMSUB f5, f26, f7, f5 FNMSUB f8, f26, f10, f8 FNMSUB f9, f26, f11, f9 FNMSUB f12, f26, f14, f12 FNMSUB f13, f26, f15, f13 FMUL f16, f29, f1 FMUL f17, f29, f0 FMUL f18, f29, f5 FMUL f19, f29, f4 FMUL f20, f29, f9 FMUL f21, f29, f8 FMUL f22, f29, f13 FMUL f23, f29, f12 FMSUB f0, f28, f0, f16 FMADD f1, f28, f1, f17 FMSUB f4, f28, f4, f18 FMADD f5, f28, f5, f19 FMSUB f8, f28, f8, f20 FMADD f9, f28, f9, f21 FMSUB f12, f28, f12, f22 FMADD f13, f28, f13, f23 #else FMADD f2, f24, f2, f16 FMSUB f3, f24, f3, f17 FMADD f6, f24, f6, f18 FMSUB f7, f24, f7, f19 FMADD f10, f24, f10, f20 FMSUB f11, f24, f11, f21 FMADD f14, f24, f14, f22 FMSUB f15, f24, f15, f23 FMSUB f0, f27, f3, f0 FNMADD f1, f27, f2, f1 FMSUB f4, f27, f7, f4 FNMADD f5, f27, f6, f5 FMSUB f8, f27, f11, f8 FNMADD f9, f27, f10, f9 FMSUB f12, f27, f15, f12 FNMADD f13, f27, f14, f13 FNMADD f0, f26, f2, f0 FNMADD f1, f26, f3, f1 FNMADD f4, f26, f6, f4 FNMADD f5, f26, f7, f5 FNMADD f8, f26, f10, f8 FNMADD f9, f26, f11, f9 FNMADD f12, f26, f14, f12 FNMADD f13, f26, f15, f13 FMUL f16, f29, f1 FMUL f17, f29, f0 FMUL f18, f29, f5 FMUL f19, f29, f4 FMUL f20, f29, f9 FMUL f21, f29, f8 FMUL f22, f29, f13 FMUL f23, f29, f12 FMADD f0, f28, f0, f16 FMSUB f1, f28, f1, f17 FMADD f4, f28, f4, f18 FMSUB f5, f28, f5, f19 FMADD f8, f28, f8, f20 FMSUB f9, f28, f9, f21 FMADD f12, f28, f12, f22 FMSUB f13, f28, f13, f23 #endif #endif #ifdef LT LFD f24, 0 * SIZE(AO) LFD f25, 1 * SIZE(AO) LFD f26, 2 * SIZE(AO) LFD f27, 3 * SIZE(AO) LFD f28, 6 * SIZE(AO) LFD f29, 7 * SIZE(AO) FMUL f16, f25, f1 FMUL f17, f25, f0 FMUL f18, f25, f5 FMUL f19, f25, f4 FMUL f20, f25, f9 FMUL f21, f25, f8 FMUL f22, f25, f13 FMUL f23, f25, f12 #ifndef CONJ FMSUB f0, f24, f0, f16 FMADD f1, f24, f1, f17 FMSUB f4, f24, f4, f18 FMADD f5, f24, f5, f19 FMSUB f8, f24, f8, f20 FMADD f9, f24, f9, f21 FMSUB f12, f24, f12, f22 FMADD f13, f24, f13, f23 FMADD f2, f27, f1, f2 FNMSUB f3, f27, f0, f3 FMADD f6, f27, f5, f6 FNMSUB f7, f27, f4, f7 FMADD f10, f27, f9, f10 FNMSUB f11, f27, f8, f11 FMADD f14, f27, f13, f14 FNMSUB f15, f27, f12, f15 FNMSUB f2, f26, f0, f2 FNMSUB f3, f26, f1, f3 FNMSUB f6, f26, f4, f6 FNMSUB f7, f26, f5, f7 FNMSUB f10, f26, f8, f10 FNMSUB f11, f26, f9, f11 FNMSUB f14, f26, f12, f14 FNMSUB f15, f26, f13, f15 FMUL f16, f29, f3 FMUL f17, f29, f2 FMUL f18, f29, f7 FMUL f19, f29, f6 FMUL f20, f29, f11 FMUL f21, f29, f10 FMUL f22, f29, f15 FMUL f23, f29, f14 FMSUB f2, f28, f2, f16 FMADD f3, f28, f3, f17 FMSUB f6, f28, f6, f18 FMADD f7, f28, f7, f19 FMSUB f10, f28, f10, f20 FMADD f11, f28, f11, f21 FMSUB f14, f28, 
f14, f22 FMADD f15, f28, f15, f23 #else FMADD f0, f24, f0, f16 FMSUB f1, f24, f1, f17 FMADD f4, f24, f4, f18 FMSUB f5, f24, f5, f19 FMADD f8, f24, f8, f20 FMSUB f9, f24, f9, f21 FMADD f12, f24, f12, f22 FMSUB f13, f24, f13, f23 FMSUB f2, f27, f1, f2 FNMADD f3, f27, f0, f3 FMSUB f6, f27, f5, f6 FNMADD f7, f27, f4, f7 FMSUB f10, f27, f9, f10 FNMADD f11, f27, f8, f11 FMSUB f14, f27, f13, f14 FNMADD f15, f27, f12, f15 FNMADD f2, f26, f0, f2 FNMADD f3, f26, f1, f3 FNMADD f6, f26, f4, f6 FNMADD f7, f26, f5, f7 FNMADD f10, f26, f8, f10 FNMADD f11, f26, f9, f11 FNMADD f14, f26, f12, f14 FNMADD f15, f26, f13, f15 FMUL f16, f29, f3 FMUL f17, f29, f2 FMUL f18, f29, f7 FMUL f19, f29, f6 FMUL f20, f29, f11 FMUL f21, f29, f10 FMUL f22, f29, f15 FMUL f23, f29, f14 FMADD f2, f28, f2, f16 FMSUB f3, f28, f3, f17 FMADD f6, f28, f6, f18 FMSUB f7, f28, f7, f19 FMADD f10, f28, f10, f20 FMSUB f11, f28, f11, f21 FMADD f14, f28, f14, f22 FMSUB f15, f28, f15, f23 #endif #endif #ifdef RN LFD f24, 0 * SIZE(BO) LFD f25, 1 * SIZE(BO) LFD f26, 2 * SIZE(BO) LFD f27, 3 * SIZE(BO) LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) LFD f31, 7 * SIZE(BO) FMUL f16, f25, f1 FMUL f17, f25, f0 FMUL f18, f25, f3 FMUL f19, f25, f2 #ifndef CONJ FMSUB f0, f24, f0, f16 FMADD f1, f24, f1, f17 FMSUB f2, f24, f2, f18 FMADD f3, f24, f3, f19 FMADD f4, f27, f1, f4 FNMSUB f5, f27, f0, f5 FMADD f6, f27, f3, f6 FNMSUB f7, f27, f2, f7 FNMSUB f4, f26, f0, f4 FNMSUB f5, f26, f1, f5 FNMSUB f6, f26, f2, f6 FNMSUB f7, f26, f3, f7 FMADD f8, f29, f1, f8 FNMSUB f9, f29, f0, f9 FMADD f10, f29, f3, f10 FNMSUB f11, f29, f2, f11 FNMSUB f8, f28, f0, f8 FNMSUB f9, f28, f1, f9 FNMSUB f10, f28, f2, f10 FNMSUB f11, f28, f3, f11 FMADD f12, f31, f1, f12 FNMSUB f13, f31, f0, f13 FMADD f14, f31, f3, f14 FNMSUB f15, f31, f2, f15 FNMSUB f12, f30, f0, f12 FNMSUB f13, f30, f1, f13 FNMSUB f14, f30, f2, f14 FNMSUB f15, f30, f3, f15 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMUL f18, f27, f7 FMUL f19, f27, f6 FMSUB f4, f26, f4, f16 FMADD f5, f26, f5, f17 FMSUB f6, f26, f6, f18 FMADD f7, f26, f7, f19 FMADD f8, f29, f5, f8 FNMSUB f9, f29, f4, f9 FMADD f10, f29, f7, f10 FNMSUB f11, f29, f6, f11 FNMSUB f8, f28, f4, f8 FNMSUB f9, f28, f5, f9 FNMSUB f10, f28, f6, f10 FNMSUB f11, f28, f7, f11 FMADD f12, f31, f5, f12 FNMSUB f13, f31, f4, f13 FMADD f14, f31, f7, f14 FNMSUB f15, f31, f6, f15 FNMSUB f12, f30, f4, f12 FNMSUB f13, f30, f5, f13 FNMSUB f14, f30, f6, f14 FNMSUB f15, f30, f7, f15 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 22 * SIZE(BO) LFD f29, 23 * SIZE(BO) LFD f30, 30 * SIZE(BO) LFD f31, 31 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMUL f18, f27, f11 FMUL f19, f27, f10 FMSUB f8, f26, f8, f16 FMADD f9, f26, f9, f17 FMSUB f10, f26, f10, f18 FMADD f11, f26, f11, f19 FMADD f12, f29, f9, f12 FNMSUB f13, f29, f8, f13 FMADD f14, f29, f11, f14 FNMSUB f15, f29, f10, f15 FNMSUB f12, f28, f8, f12 FNMSUB f13, f28, f9, f13 FNMSUB f14, f28, f10, f14 FNMSUB f15, f28, f11, f15 FMUL f16, f31, f13 FMUL f17, f31, f12 FMUL f18, f31, f15 FMUL f19, f31, f14 FMSUB f12, f30, f12, f16 FMADD f13, f30, f13, f17 FMSUB f14, f30, f14, f18 FMADD f15, f30, f15, f19 #else FMADD f0, f24, f0, f16 FMSUB f1, f24, f1, f17 FMADD f2, f24, f2, f18 FMSUB f3, f24, f3, f19 FMSUB f4, f27, f1, f4 FNMADD f5, f27, f0, f5 FMSUB f6, f27, f3, f6 FNMADD f7, f27, f2, f7 FNMADD f4, f26, f0, f4 FNMADD f5, f26, f1, f5 FNMADD f6, f26, f2, f6 FNMADD f7, f26, f3, f7 
FMSUB f8, f29, f1, f8 FNMADD f9, f29, f0, f9 FMSUB f10, f29, f3, f10 FNMADD f11, f29, f2, f11 FNMADD f8, f28, f0, f8 FNMADD f9, f28, f1, f9 FNMADD f10, f28, f2, f10 FNMADD f11, f28, f3, f11 FMSUB f12, f31, f1, f12 FNMADD f13, f31, f0, f13 FMSUB f14, f31, f3, f14 FNMADD f15, f31, f2, f15 FNMADD f12, f30, f0, f12 FNMADD f13, f30, f1, f13 FNMADD f14, f30, f2, f14 FNMADD f15, f30, f3, f15 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMUL f18, f27, f7 FMUL f19, f27, f6 FMADD f4, f26, f4, f16 FMSUB f5, f26, f5, f17 FMADD f6, f26, f6, f18 FMSUB f7, f26, f7, f19 FMSUB f8, f29, f5, f8 FNMADD f9, f29, f4, f9 FMSUB f10, f29, f7, f10 FNMADD f11, f29, f6, f11 FNMADD f8, f28, f4, f8 FNMADD f9, f28, f5, f9 FNMADD f10, f28, f6, f10 FNMADD f11, f28, f7, f11 FMSUB f12, f31, f5, f12 FNMADD f13, f31, f4, f13 FMSUB f14, f31, f7, f14 FNMADD f15, f31, f6, f15 FNMADD f12, f30, f4, f12 FNMADD f13, f30, f5, f13 FNMADD f14, f30, f6, f14 FNMADD f15, f30, f7, f15 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 22 * SIZE(BO) LFD f29, 23 * SIZE(BO) LFD f30, 30 * SIZE(BO) LFD f31, 31 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMUL f18, f27, f11 FMUL f19, f27, f10 FMADD f8, f26, f8, f16 FMSUB f9, f26, f9, f17 FMADD f10, f26, f10, f18 FMSUB f11, f26, f11, f19 FMSUB f12, f29, f9, f12 FNMADD f13, f29, f8, f13 FMSUB f14, f29, f11, f14 FNMADD f15, f29, f10, f15 FNMADD f12, f28, f8, f12 FNMADD f13, f28, f9, f13 FNMADD f14, f28, f10, f14 FNMADD f15, f28, f11, f15 FMUL f16, f31, f13 FMUL f17, f31, f12 FMUL f18, f31, f15 FMUL f19, f31, f14 FMADD f12, f30, f12, f16 FMSUB f13, f30, f13, f17 FMADD f14, f30, f14, f18 FMSUB f15, f30, f15, f19 #endif #endif #ifdef RT LFD f24, 30 * SIZE(BO) LFD f25, 31 * SIZE(BO) LFD f26, 28 * SIZE(BO) LFD f27, 29 * SIZE(BO) LFD f28, 26 * SIZE(BO) LFD f29, 27 * SIZE(BO) LFD f30, 24 * SIZE(BO) LFD f31, 25 * SIZE(BO) FMUL f16, f25, f13 FMUL f17, f25, f12 FMUL f18, f25, f15 FMUL f19, f25, f14 #ifndef CONJ FMSUB f12, f24, f12, f16 FMADD f13, f24, f13, f17 FMSUB f14, f24, f14, f18 FMADD f15, f24, f15, f19 FMADD f8, f27, f13, f8 FNMSUB f9, f27, f12, f9 FMADD f10, f27, f15, f10 FNMSUB f11, f27, f14, f11 FNMSUB f8, f26, f12, f8 FNMSUB f9, f26, f13, f9 FNMSUB f10, f26, f14, f10 FNMSUB f11, f26, f15, f11 FMADD f4, f29, f13, f4 FNMSUB f5, f29, f12, f5 FMADD f6, f29, f15, f6 FNMSUB f7, f29, f14, f7 FNMSUB f4, f28, f12, f4 FNMSUB f5, f28, f13, f5 FNMSUB f6, f28, f14, f6 FNMSUB f7, f28, f15, f7 FMADD f0, f31, f13, f0 FNMSUB f1, f31, f12, f1 FMADD f2, f31, f15, f2 FNMSUB f3, f31, f14, f3 FNMSUB f0, f30, f12, f0 FNMSUB f1, f30, f13, f1 FNMSUB f2, f30, f14, f2 FNMSUB f3, f30, f15, f3 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 18 * SIZE(BO) LFD f29, 19 * SIZE(BO) LFD f30, 16 * SIZE(BO) LFD f31, 17 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMUL f18, f27, f11 FMUL f19, f27, f10 FMSUB f8, f26, f8, f16 FMADD f9, f26, f9, f17 FMSUB f10, f26, f10, f18 FMADD f11, f26, f11, f19 FMADD f4, f29, f9, f4 FNMSUB f5, f29, f8, f5 FMADD f6, f29, f11, f6 FNMSUB f7, f29, f10, f7 FNMSUB f4, f28, f8, f4 FNMSUB f5, f28, f9, f5 FNMSUB f6, f28, f10, f6 FNMSUB f7, f28, f11, f7 FMADD f0, f31, f9, f0 FNMSUB f1, f31, f8, f1 FMADD f2, f31, f11, f2 FNMSUB f3, f31, f10, f3 FNMSUB f0, f30, f8, f0 FNMSUB f1, f30, f9, f1 FNMSUB f2, f30, f10, f2 FNMSUB f3, f30, f11, f3 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 8 * SIZE(BO) LFD f29, 9 * SIZE(BO) LFD f30, 0 * SIZE(BO) LFD f31, 1 * SIZE(BO) 
FMUL f16, f27, f5 FMUL f17, f27, f4 FMUL f18, f27, f7 FMUL f19, f27, f6 FMSUB f4, f26, f4, f16 FMADD f5, f26, f5, f17 FMSUB f6, f26, f6, f18 FMADD f7, f26, f7, f19 FMADD f0, f29, f5, f0 FNMSUB f1, f29, f4, f1 FMADD f2, f29, f7, f2 FNMSUB f3, f29, f6, f3 FNMSUB f0, f28, f4, f0 FNMSUB f1, f28, f5, f1 FNMSUB f2, f28, f6, f2 FNMSUB f3, f28, f7, f3 FMUL f16, f31, f1 FMUL f17, f31, f0 FMUL f18, f31, f3 FMUL f19, f31, f2 FMSUB f0, f30, f0, f16 FMADD f1, f30, f1, f17 FMSUB f2, f30, f2, f18 FMADD f3, f30, f3, f19 #else FMADD f12, f24, f12, f16 FMSUB f13, f24, f13, f17 FMADD f14, f24, f14, f18 FMSUB f15, f24, f15, f19 FMSUB f8, f27, f13, f8 FNMADD f9, f27, f12, f9 FMSUB f10, f27, f15, f10 FNMADD f11, f27, f14, f11 FNMADD f8, f26, f12, f8 FNMADD f9, f26, f13, f9 FNMADD f10, f26, f14, f10 FNMADD f11, f26, f15, f11 FMSUB f4, f29, f13, f4 FNMADD f5, f29, f12, f5 FMSUB f6, f29, f15, f6 FNMADD f7, f29, f14, f7 FNMADD f4, f28, f12, f4 FNMADD f5, f28, f13, f5 FNMADD f6, f28, f14, f6 FNMADD f7, f28, f15, f7 FMSUB f0, f31, f13, f0 FNMADD f1, f31, f12, f1 FMSUB f2, f31, f15, f2 FNMADD f3, f31, f14, f3 FNMADD f0, f30, f12, f0 FNMADD f1, f30, f13, f1 FNMADD f2, f30, f14, f2 FNMADD f3, f30, f15, f3 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 18 * SIZE(BO) LFD f29, 19 * SIZE(BO) LFD f30, 16 * SIZE(BO) LFD f31, 17 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMUL f18, f27, f11 FMUL f19, f27, f10 FMADD f8, f26, f8, f16 FMSUB f9, f26, f9, f17 FMADD f10, f26, f10, f18 FMSUB f11, f26, f11, f19 FMSUB f4, f29, f9, f4 FNMADD f5, f29, f8, f5 FMSUB f6, f29, f11, f6 FNMADD f7, f29, f10, f7 FNMADD f4, f28, f8, f4 FNMADD f5, f28, f9, f5 FNMADD f6, f28, f10, f6 FNMADD f7, f28, f11, f7 FMSUB f0, f31, f9, f0 FNMADD f1, f31, f8, f1 FMSUB f2, f31, f11, f2 FNMADD f3, f31, f10, f3 FNMADD f0, f30, f8, f0 FNMADD f1, f30, f9, f1 FNMADD f2, f30, f10, f2 FNMADD f3, f30, f11, f3 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 8 * SIZE(BO) LFD f29, 9 * SIZE(BO) LFD f30, 0 * SIZE(BO) LFD f31, 1 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMUL f18, f27, f7 FMUL f19, f27, f6 FMADD f4, f26, f4, f16 FMSUB f5, f26, f5, f17 FMADD f6, f26, f6, f18 FMSUB f7, f26, f7, f19 FMSUB f0, f29, f5, f0 FNMADD f1, f29, f4, f1 FMSUB f2, f29, f7, f2 FNMADD f3, f29, f6, f3 FNMADD f0, f28, f4, f0 FNMADD f1, f28, f5, f1 FNMADD f2, f28, f6, f2 FNMADD f3, f28, f7, f3 FMUL f16, f31, f1 FMUL f17, f31, f0 FMUL f18, f31, f3 FMUL f19, f31, f2 FMADD f0, f30, f0, f16 FMSUB f1, f30, f1, f17 FMADD f2, f30, f2, f18 FMSUB f3, f30, f3, f19 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE subi CO3, CO3, 4 * SIZE subi CO4, CO4, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f4, 2 * SIZE(BO) STFD f5, 3 * SIZE(BO) STFD f8, 4 * SIZE(BO) STFD f9, 5 * SIZE(BO) STFD f12, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) STFD f2, 8 * SIZE(BO) STFD f3, 9 * SIZE(BO) STFD f6, 10 * SIZE(BO) STFD f7, 11 * SIZE(BO) STFD f10, 12 * SIZE(BO) STFD f11, 13 * SIZE(BO) STFD f14, 14 * SIZE(BO) STFD f15, 15 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) STFD f8, 8 * SIZE(AO) STFD f9, 9 * SIZE(AO) STFD f10, 10 * SIZE(AO) STFD f11, 11 * SIZE(AO) STFD f12, 12 * SIZE(AO) STFD f13, 13 * SIZE(AO) STFD f14, 14 * SIZE(AO) STFD f15, 15 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, 
f0 STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f10, 2 * SIZE(CO3) STFD f11, 3 * SIZE(CO3) fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) STFD f14, 2 * SIZE(CO4) STFD f15, 3 * SIZE(CO4) fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 2 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt LL(11) .align 4 LL(29): #ifdef LN slwi r0, K, 2 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 4 #endif #ifdef RT subi KK, KK, 4 #endif addic. J, J, -1 bgt LL(10) .align 4 LL(30): andi. J, N, 2 ble LL(50) .align 4 #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif andi. I, M, 1 ble LL(40) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(45) .align 4 LL(42): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f16, f22, f2 FMADD f3, f16, f23, f3 FMADD f4, f17, f20, f4 FMADD f5, f17, f21, f5 FMADD f6, f17, f22, f6 FMADD f7, f17, f23, f7 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f18, f26, f2 FMADD f3, f18, f27, f3 FMADD f4, f19, f24, f4 FMADD f5, f19, f25, f5 FMADD f6, f19, f26, f6 FMADD f7, f19, f27, f7 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f16, f22, f2 FMADD f3, f16, f23, f3 FMADD f4, f17, f20, f4 FMADD f5, f17, f21, f5 FMADD f6, f17, f22, f6 FMADD f7, f17, f23, f7 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f18, f26, f2 FMADD f3, f18, f27, f3 FMADD f4, f19, f24, f4 FMADD f5, f19, f25, f5 FMADD f6, f19, f26, f6 FMADD f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi BO, BO, 16 * SIZE addi AO, AO, 8 * SIZE bdnz LL(42) .align 4 LL(45): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(47) .align 4 LL(46): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f16, f22, f2 FMADD f3, f16, f23, f3 FMADD f4, f17, f20, f4 FMADD f5, f17, f21, f5 FMADD f6, f17, f22, f6 FMADD f7, f17, f23, f7 LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE bdnz LL(46) .align 4 LL(47): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else #if defined(LN) || defined(LT) FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + ZBASE_SHIFT slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f3 FMUL f13, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f12 FMADD f3, f20, f3, f13 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f12 FMSUB f3, f20, f3, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f3 FMUL f13, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, 
f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f3 FMUL f13, f17, f2 #ifndef CONJ FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 LL(40): srawi. I, M, 1 ble LL(49) .align 4 LL(31): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 dcbtst CO1, PREC dcbtst CO2, PREC srawi. r0, KK, 3 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 dcbtst CO1, PREC dcbtst CO2, PREC srawi. 
r0, TEMP, 3 mtspr CTR, r0 #endif ble LL(35) .align 4 LL(32): dcbt AO, PREA dcbtst BO, PREA FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 4 * SIZE(AO) LFD f28, 4 * SIZE(BO) LFD f25, 5 * SIZE(AO) LFD f29, 5 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 6 * SIZE(AO) LFD f30, 6 * SIZE(BO) LFD f27, 7 * SIZE(AO) LFD f31, 7 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 8 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f17, 9 * SIZE(AO) LFD f21, 9 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 10 * SIZE(AO) LFD f22, 10 * SIZE(BO) LFD f19, 11 * SIZE(AO) LFD f23, 11 * SIZE(BO) FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 12 * SIZE(AO) LFD f28, 12 * SIZE(BO) LFD f25, 13 * SIZE(AO) LFD f29, 13 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 14 * SIZE(AO) LFD f30, 14 * SIZE(BO) LFD f27, 15 * SIZE(AO) LFD f31, 15 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 16 * SIZE(AO) LFD f20, 16 * SIZE(BO) LFD f17, 17 * SIZE(AO) LFD f21, 17 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 18 * SIZE(AO) LFD f22, 18 * SIZE(BO) LFD f19, 19 * SIZE(AO) LFD f23, 19 * SIZE(BO) FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 20 * SIZE(AO) LFD f28, 20 * SIZE(BO) LFD f25, 21 * SIZE(AO) LFD f29, 21 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 22 * SIZE(AO) LFD f30, 22 * SIZE(BO) LFD f27, 23 * SIZE(AO) LFD f31, 23 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 24 * SIZE(AO) LFD f20, 24 * SIZE(BO) LFD f17, 25 * SIZE(AO) LFD f21, 25 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 26 * SIZE(AO) LFD f22, 26 * SIZE(BO) LFD f19, 27 * SIZE(AO) LFD f23, 27 * SIZE(BO) FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, 
f12 LFD f24, 28 * SIZE(AO) LFD f28, 28 * SIZE(BO) LFD f25, 29 * SIZE(AO) LFD f29, 29 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 30 * SIZE(AO) LFD f30, 30 * SIZE(BO) LFD f27, 31 * SIZE(AO) LFD f31, 31 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 32 * SIZE(AO) LFD f20, 32 * SIZE(BO) LFD f17, 33 * SIZE(AO) LFD f21, 33 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 34 * SIZE(AO) LFD f22, 34 * SIZE(BO) LFD f19, 35 * SIZE(AO) LFD f23, 35 * SIZE(BO) addi AO, AO, 32 * SIZE addi BO, BO, 32 * SIZE FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 bdnz LL(32) .align 4 LL(35): #if defined(LT) || defined(RN) andi. r0, KK, 7 #else andi. r0, TEMP, 7 #endif mtspr CTR, r0 ble LL(38) .align 4 LL(36): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(36) .align 4 LL(38): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 FSUB f8, f8, f13 FADD f9, f9, f12 FSUB f10, f10, f15 FADD f11, f11, f14 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 FADD f8, f8, f13 FSUB f9, f12, f9 FADD f10, f10, f15 FSUB f11, f14, f11 #endif #if defined(LN) || defined(RT) subi r0, KK, 2 slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f8, f18, f8 FSUB f9, f19, f9 FSUB f2, f20, f2 FSUB f3, f21, f3 FSUB f10, f22, f10 FSUB f11, f23, f11 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f8, f20, f8 FSUB f9, f21, f9 FSUB f10, f22, f10 FSUB f11, f23, f11 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 FSUB f8, f20, f8 FADD f9, f21, f9 FSUB f10, f22, f10 FADD f11, f23, f11 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FMADD f8, f19, f11, f8 FNMSUB f9, f19, f10, f9 FNMSUB f0, f18, f2, f0 
FNMSUB f1, f18, f3, f1 FNMSUB f8, f18, f10, f8 FNMSUB f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f8, f20, f8, f12 FMADD f9, f20, f9, f13 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FMSUB f8, f19, f11, f8 FNMADD f9, f19, f10, f9 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FNMADD f8, f18, f10, f8 FNMADD f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f8, f20, f8, f12 FMSUB f9, f20, f9, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f9 FMUL f13, f17, f8 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FMADD f10, f19, f9, f10 FNMSUB f11, f19, f8, f11 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FNMSUB f10, f18, f8, f10 FNMSUB f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 FMSUB f10, f20, f10, f12 FMADD f11, f20, f11, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FMSUB f10, f19, f9, f10 FNMADD f11, f19, f8, f11 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FNMADD f10, f18, f8, f10 FNMADD f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 FMADD f10, f20, f10, f12 FMSUB f11, f20, f11, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f8, f19, f1, f8 FNMSUB f9, f19, f0, f9 FMADD f10, f19, f3, f10 FNMSUB f11, f19, f2, f11 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMSUB f8, f20, f8, f4 FMADD f9, f20, f9, f5 FMSUB f10, f20, f10, f6 FMADD f11, f20, f11, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f8, f19, f1, f8 FNMADD f9, f19, f0, f9 FMSUB f10, f19, f3, f10 FNMADD f11, f19, f2, f11 FNMADD f8, f18, f0, f8 FNMADD f9, f18, f1, f9 FNMADD f10, f18, f2, f10 FNMADD f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMADD f8, f20, f8, f4 FMSUB f9, f20, f9, f5 FMADD f10, f20, f10, f6 FMSUB f11, f20, f11, f7 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f9 FMUL f13, f17, f8 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f9, f0 FNMSUB f1, f19, f8, f1 FMADD f2, f19, f11, f2 FNMSUB f3, f19, f10, f3 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMSUB f0, f20, f0, f4 
FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f9, f0 FNMADD f1, f19, f8, f1 FMSUB f2, f19, f11, f2 FNMADD f3, f19, f10, f3 FNMADD f0, f18, f8, f0 FNMADD f1, f18, f9, f1 FNMADD f2, f18, f10, f2 FNMADD f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f9, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f3, 5 * SIZE(BO) STFD f10, 6 * SIZE(BO) STFD f11, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f10, 6 * SIZE(AO) STFD f11, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f8, 0 * SIZE(CO2) STFD f9, 1 * SIZE(CO2) STFD f10, 2 * SIZE(CO2) STFD f11, 3 * SIZE(CO2) #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt LL(31) .align 4 LL(49): #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif .align 4 LL(50): andi. J, N, 1 ble LL(999) #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, C, LDC #endif andi. I, M, 1 ble LL(60) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(65) .align 4 LL(62): FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f17, f20, f2 FMADD f3, f16, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) FMADD f4, f18, f22, f4 FMADD f5, f19, f23, f5 FMADD f6, f19, f22, f6 FMADD f7, f18, f23, f7 LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f17, f20, f2 FMADD f3, f16, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) FMADD f4, f18, f22, f4 FMADD f5, f19, f23, f5 FMADD f6, f19, f22, f6 FMADD f7, f18, f23, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(62) .align 4 LL(65): fadd f0, f0, f4 fadd f1, f1, f5 fadd f2, f2, f6 fadd f3, f3, f7 #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR,r0 ble LL(67) .align 4 LL(66): FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f17, f20, f2 FMADD f3, f16, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 2 * SIZE bdnz LL(66) .align 4 LL(67): #ifndef CONJ FSUB f0, f0, f1 FADD f1, f2, f3 #else FADD f0, f0, f1 FSUB f1, f3, f2 #endif #if defined(LN) || defined(RT) subi r0, KK, 1 slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 #else FSUB f0, f16, f0 FADD f1, f17, f1 #endif #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 LL(60): srawi. 
I, M, 1 ble LL(69) .align 4 LL(51): #if defined(LT) || defined(RN) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 dcbt CO1, PREC srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(55) .align 4 LL(52): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f16, f22, f2 FMADD f3, f16, f23, f3 FMADD f4, f17, f20, f4 FMADD f5, f17, f21, f5 FMADD f6, f17, f22, f6 FMADD f7, f17, f23, f7 LFD f20, 8 * SIZE(AO) LFD f21, 9 * SIZE(AO) LFD f22, 10 * SIZE(AO) LFD f23, 11 * SIZE(AO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f18, f26, f2 FMADD f3, f18, f27, f3 FMADD f4, f19, f24, f4 FMADD f5, f19, f25, f5 FMADD f6, f19, f26, f6 FMADD f7, f19, f27, f7 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) LFD f16, 4 * SIZE(BO) LFD f17, 5 * SIZE(BO) LFD f18, 6 * SIZE(BO) LFD f19, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f16, f22, f2 FMADD f3, f16, f23, f3 FMADD f4, f17, f20, f4 FMADD f5, f17, f21, f5 FMADD f6, f17, f22, f6 FMADD f7, f17, f23, f7 LFD f20, 16 * SIZE(AO) LFD f21, 17 * SIZE(AO) LFD f22, 18 * SIZE(AO) LFD f23, 19 * SIZE(AO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f18, f26, f2 FMADD f3, f18, f27, f3 FMADD f4, f19, f24, f4 FMADD f5, f19, f25, f5 FMADD f6, f19, f26, f6 FMADD f7, f19, f27, f7 LFD f24, 20 * SIZE(AO) LFD f25, 21 * SIZE(AO) LFD f26, 22 * SIZE(AO) LFD f27, 23 * SIZE(AO) LFD f16, 8 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 10 * SIZE(BO) LFD f19, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE dcbt PREA, AO dcbt PREA, BO bdnz LL(52) .align 4 LL(55): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(57) .align 4 LL(56): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f16, f22, f2 FMADD f3, f16, f23, f3 FMADD f4, f17, f20, f4 FMADD f5, f17, f21, f5 FMADD f6, f17, f22, f6 FMADD f7, f17, f23, f7 LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f16, 2 * SIZE(BO) LFD f17, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(56) .align 4 LL(57): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + ZBASE_SHIFT slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) #ifndef LN addi CO1, CO1, 4 * 
SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt LL(51) .align 4 LL(69): #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/ztrsm_kernel_power6_LT.S000066400000000000000000002447511313527062700223410ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define FZERO 312(SP) #else #define STACKSIZE 256 #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #define OFFSET r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #define AORIG r19 #define TEMP r20 #define KK r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define CO1 r26 #define CO2 r27 #define CO3 r28 #define CO4 r29 #define PREA r30 #define PREC r31 #ifndef CONJ #define FMA1 FMADD #define FMA2 FMADD #define FMA3 FNMSUB #define FMA4 FMADD #elif defined(LN) || defined(LT) #define FMA1 FMADD #define FMA2 FMADD #define FMA3 FMADD #define FMA4 FNMSUB #else #define FMA1 FMADD #define FMA2 FNMSUB #define FMA3 FMADD #define FMA4 FMADD #endif #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) #endif stw r0, FZERO #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif slwi LDC, LDC, ZBASE_SHIFT #ifdef LN mullw r0, M, K slwi r0, r0, ZBASE_SHIFT add A, A, r0 slwi r0, M, ZBASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, ZBASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) li PREA, 48 * SIZE li PREC, 4 * SIZE srawi. 
J, N, 2 ble LL(30) .align 4 LL(10): #ifdef RT slwi r0, K, 2 + ZBASE_SHIFT sub B, B, r0 slwi r0, LDC, 2 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. I, M, 1 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO4, LDC #endif ble LL(20) .align 4 LL(11): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(B) LFD f18, 2 * SIZE(AO) LFD f22, 2 * SIZE(B) LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC srawi. r0, KK, 3 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 2 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) LFD f18, 2 * SIZE(AO) LFD f22, 2 * SIZE(BO) LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC srawi. r0, TEMP, 3 mtspr CTR, r0 #endif ble LL(15) .align 4 LL(12): dcbt AO, PREA dcbtst BO, PREA FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 LFD f28, 4 * SIZE(AO) LFD f29, 5 * SIZE(AO) LFD f30, 6 * SIZE(AO) LFD f31, 7 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA1 f8, f16, f24, f8 FMA1 f10, f18, f24, f10 FMA2 f9, f16, f25, f9 FMA2 f11, f18, f25, f11 FMA1 f12, f16, f26, f12 FMA1 f14, f18, f26, f14 FMA2 f13, f16, f27, f13 FMA2 f15, f18, f27, f15 FMA4 f1, f17, f20, f1 FMA4 f3, f19, f20, f3 FMA3 f0, f17, f21, f0 FMA3 f2, f19, f21, f2 FMA4 f5, f17, f22, f5 FMA4 f7, f19, f22, f7 FMA3 f4, f17, f23, f4 FMA3 f6, f19, f23, f6 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMA4 f9, f17, f24, f9 FMA4 f11, f19, f24, f11 FMA3 f8, f17, f25, f8 FMA3 f10, f19, f25, f10 FMA4 f13, f17, f26, f13 FMA4 f15, f19, f26, f15 FMA3 f12, f17, f27, f12 FMA3 f14, f19, f27, f14 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMA1 f0, f28, f20, f0 FMA1 f2, f30, f20, f2 FMA2 f1, f28, f21, f1 FMA2 f3, f30, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMA1 f4, f28, f22, f4 FMA1 f6, f30, f22, f6 FMA2 f5, f28, f23, f5 FMA2 f7, f30, f23, f7 FMA1 f8, f28, f24, f8 FMA1 f10, f30, f24, f10 FMA2 f9, f28, f25, f9 FMA2 f11, f30, f25, f11 FMA1 f12, f28, f26, f12 FMA1 f14, f30, f26, f14 FMA2 f13, f28, f27, f13 FMA2 f15, f30, f27, f15 FMA4 f1, f29, f20, f1 FMA4 f3, f31, f20, f3 FMA3 f0, f29, f21, f0 FMA3 f2, f31, f21, f2 FMA4 f5, f29, f22, f5 FMA4 f7, f31, f22, f7 FMA3 f4, f29, f23, f4 FMA3 f6, f31, f23, f6 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMA4 f9, f29, f24, f9 FMA4 f11, f31, f24, f11 FMA3 f8, f29, f25, f8 FMA3 f10, f31, f25, f10 FMA4 f13, f29, f26, f13 FMA4 f15, f31, f26, f15 FMA3 f12, f29, f27, f12 FMA3 f14, f31, f27, f14 LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * 
SIZE(BO) LFD f27, 23 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 LFD f28, 12 * SIZE(AO) LFD f29, 13 * SIZE(AO) LFD f30, 14 * SIZE(AO) LFD f31, 15 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA1 f8, f16, f24, f8 FMA1 f10, f18, f24, f10 FMA2 f9, f16, f25, f9 FMA2 f11, f18, f25, f11 FMA1 f12, f16, f26, f12 FMA1 f14, f18, f26, f14 FMA2 f13, f16, f27, f13 FMA2 f15, f18, f27, f15 FMA4 f1, f17, f20, f1 FMA4 f3, f19, f20, f3 FMA3 f0, f17, f21, f0 FMA3 f2, f19, f21, f2 FMA4 f5, f17, f22, f5 FMA4 f7, f19, f22, f7 FMA3 f4, f17, f23, f4 FMA3 f6, f19, f23, f6 LFD f20, 24 * SIZE(BO) LFD f21, 25 * SIZE(BO) LFD f22, 26 * SIZE(BO) LFD f23, 27 * SIZE(BO) FMA4 f9, f17, f24, f9 FMA4 f11, f19, f24, f11 FMA3 f8, f17, f25, f8 FMA3 f10, f19, f25, f10 FMA4 f13, f17, f26, f13 FMA4 f15, f19, f26, f15 FMA3 f12, f17, f27, f12 FMA3 f14, f19, f27, f14 LFD f24, 28 * SIZE(BO) LFD f25, 29 * SIZE(BO) LFD f26, 30 * SIZE(BO) LFD f27, 31 * SIZE(BO) FMA1 f0, f28, f20, f0 FMA1 f2, f30, f20, f2 FMA2 f1, f28, f21, f1 FMA2 f3, f30, f21, f3 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) FMA1 f4, f28, f22, f4 FMA1 f6, f30, f22, f6 FMA2 f5, f28, f23, f5 FMA2 f7, f30, f23, f7 FMA1 f8, f28, f24, f8 FMA1 f10, f30, f24, f10 FMA2 f9, f28, f25, f9 FMA2 f11, f30, f25, f11 FMA1 f12, f28, f26, f12 FMA1 f14, f30, f26, f14 FMA2 f13, f28, f27, f13 FMA2 f15, f30, f27, f15 FMA4 f1, f29, f20, f1 FMA4 f3, f31, f20, f3 FMA3 f0, f29, f21, f0 FMA3 f2, f31, f21, f2 FMA4 f5, f29, f22, f5 FMA4 f7, f31, f22, f7 FMA3 f4, f29, f23, f4 FMA3 f6, f31, f23, f6 LFD f20, 32 * SIZE(BO) LFD f21, 33 * SIZE(BO) LFD f22, 34 * SIZE(BO) LFD f23, 35 * SIZE(BO) FMA4 f9, f29, f24, f9 FMA4 f11, f31, f24, f11 FMA3 f8, f29, f25, f8 FMA3 f10, f31, f25, f10 FMA4 f13, f29, f26, f13 FMA4 f15, f31, f26, f15 FMA3 f12, f29, f27, f12 FMA3 f14, f31, f27, f14 LFD f24, 36 * SIZE(BO) LFD f25, 37 * SIZE(BO) LFD f26, 38 * SIZE(BO) LFD f27, 39 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 LFD f28, 20 * SIZE(AO) LFD f29, 21 * SIZE(AO) LFD f30, 22 * SIZE(AO) LFD f31, 23 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA1 f8, f16, f24, f8 FMA1 f10, f18, f24, f10 FMA2 f9, f16, f25, f9 FMA2 f11, f18, f25, f11 FMA1 f12, f16, f26, f12 FMA1 f14, f18, f26, f14 FMA2 f13, f16, f27, f13 FMA2 f15, f18, f27, f15 FMA4 f1, f17, f20, f1 FMA4 f3, f19, f20, f3 FMA3 f0, f17, f21, f0 FMA3 f2, f19, f21, f2 FMA4 f5, f17, f22, f5 FMA4 f7, f19, f22, f7 FMA3 f4, f17, f23, f4 FMA3 f6, f19, f23, f6 LFD f20, 40 * SIZE(BO) LFD f21, 41 * SIZE(BO) LFD f22, 42 * SIZE(BO) LFD f23, 43 * SIZE(BO) FMA4 f9, f17, f24, f9 FMA4 f11, f19, f24, f11 FMA3 f8, f17, f25, f8 FMA3 f10, f19, f25, f10 FMA4 f13, f17, f26, f13 FMA4 f15, f19, f26, f15 FMA3 f12, f17, f27, f12 FMA3 f14, f19, f27, f14 LFD f24, 44 * SIZE(BO) LFD f25, 45 * SIZE(BO) LFD f26, 46 * SIZE(BO) LFD f27, 47 * SIZE(BO) FMA1 f0, f28, f20, f0 FMA1 f2, f30, f20, f2 FMA2 f1, f28, f21, f1 FMA2 f3, f30, f21, f3 LFD f16, 24 * SIZE(AO) LFD f17, 25 * SIZE(AO) LFD f18, 26 * SIZE(AO) LFD f19, 27 * SIZE(AO) FMA1 f4, f28, f22, f4 FMA1 f6, f30, f22, f6 FMA2 f5, f28, f23, f5 FMA2 f7, f30, f23, f7 FMA1 f8, f28, f24, f8 FMA1 f10, f30, f24, f10 FMA2 f9, f28, f25, f9 FMA2 f11, f30, f25, f11 FMA1 f12, f28, f26, f12 FMA1 f14, f30, f26, f14 FMA2 f13, f28, f27, f13 FMA2 f15, f30, f27, f15 FMA4 f1, f29, f20, f1 FMA4 f3, f31, f20, f3 FMA3 f0, 
f29, f21, f0 FMA3 f2, f31, f21, f2 FMA4 f5, f29, f22, f5 FMA4 f7, f31, f22, f7 FMA3 f4, f29, f23, f4 FMA3 f6, f31, f23, f6 LFD f20, 48 * SIZE(BO) LFD f21, 49 * SIZE(BO) LFD f22, 50 * SIZE(BO) LFD f23, 51 * SIZE(BO) FMA4 f9, f29, f24, f9 FMA4 f11, f31, f24, f11 FMA3 f8, f29, f25, f8 FMA3 f10, f31, f25, f10 FMA4 f13, f29, f26, f13 FMA4 f15, f31, f26, f15 FMA3 f12, f29, f27, f12 FMA3 f14, f31, f27, f14 LFD f24, 52 * SIZE(BO) LFD f25, 53 * SIZE(BO) LFD f26, 54 * SIZE(BO) LFD f27, 55 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 LFD f28, 28 * SIZE(AO) LFD f29, 29 * SIZE(AO) LFD f30, 30 * SIZE(AO) LFD f31, 31 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA1 f8, f16, f24, f8 FMA1 f10, f18, f24, f10 FMA2 f9, f16, f25, f9 FMA2 f11, f18, f25, f11 FMA1 f12, f16, f26, f12 FMA1 f14, f18, f26, f14 FMA2 f13, f16, f27, f13 FMA2 f15, f18, f27, f15 FMA4 f1, f17, f20, f1 FMA4 f3, f19, f20, f3 FMA3 f0, f17, f21, f0 FMA3 f2, f19, f21, f2 FMA4 f5, f17, f22, f5 FMA4 f7, f19, f22, f7 FMA3 f4, f17, f23, f4 FMA3 f6, f19, f23, f6 LFD f20, 56 * SIZE(BO) LFD f21, 57 * SIZE(BO) LFD f22, 58 * SIZE(BO) LFD f23, 59 * SIZE(BO) FMA4 f9, f17, f24, f9 FMA4 f11, f19, f24, f11 FMA3 f8, f17, f25, f8 FMA3 f10, f19, f25, f10 FMA4 f13, f17, f26, f13 FMA4 f15, f19, f26, f15 FMA3 f12, f17, f27, f12 FMA3 f14, f19, f27, f14 LFD f24, 60 * SIZE(BO) LFD f25, 61 * SIZE(BO) LFD f26, 62 * SIZE(BO) LFD f27, 63 * SIZE(BO) FMA1 f0, f28, f20, f0 FMA1 f2, f30, f20, f2 FMA2 f1, f28, f21, f1 FMA2 f3, f30, f21, f3 LFD f16, 32 * SIZE(AO) LFD f17, 33 * SIZE(AO) LFD f18, 34 * SIZE(AO) LFD f19, 35 * SIZE(AO) FMA1 f4, f28, f22, f4 FMA1 f6, f30, f22, f6 FMA2 f5, f28, f23, f5 FMA2 f7, f30, f23, f7 FMA1 f8, f28, f24, f8 FMA1 f10, f30, f24, f10 FMA2 f9, f28, f25, f9 FMA2 f11, f30, f25, f11 FMA1 f12, f28, f26, f12 FMA1 f14, f30, f26, f14 FMA2 f13, f28, f27, f13 FMA2 f15, f30, f27, f15 FMA4 f1, f29, f20, f1 FMA4 f3, f31, f20, f3 FMA3 f0, f29, f21, f0 FMA3 f2, f31, f21, f2 FMA4 f5, f29, f22, f5 FMA4 f7, f31, f22, f7 FMA3 f4, f29, f23, f4 FMA3 f6, f31, f23, f6 LFD f20, 64 * SIZE(BO) LFD f21, 65 * SIZE(BO) LFD f22, 66 * SIZE(BO) LFD f23, 67 * SIZE(BO) FMA4 f9, f29, f24, f9 FMA4 f11, f31, f24, f11 FMA3 f8, f29, f25, f8 FMA3 f10, f31, f25, f10 FMA4 f13, f29, f26, f13 FMA4 f15, f31, f26, f15 FMA3 f12, f29, f27, f12 FMA3 f14, f31, f27, f14 LFD f24, 68 * SIZE(BO) LFD f25, 69 * SIZE(BO) LFD f26, 70 * SIZE(BO) LFD f27, 71 * SIZE(BO) addi AO, AO, 32 * SIZE addi BO, BO, 64 * SIZE bdnz LL(12) .align 4 LL(15): #if defined(LT) || defined(RN) andi. r0, KK, 7 #else andi. 
r0, TEMP, 7 #endif mtspr CTR, r0 ble LL(18) .align 4 LL(16): FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA1 f8, f16, f24, f8 FMA1 f10, f18, f24, f10 FMA2 f9, f16, f25, f9 FMA2 f11, f18, f25, f11 FMA1 f12, f16, f26, f12 FMA1 f14, f18, f26, f14 FMA2 f13, f16, f27, f13 FMA2 f15, f18, f27, f15 FMA4 f1, f17, f20, f1 FMA4 f3, f19, f20, f3 FMA3 f0, f17, f21, f0 FMA3 f2, f19, f21, f2 FMA4 f5, f17, f22, f5 FMA4 f7, f19, f22, f7 FMA3 f4, f17, f23, f4 FMA3 f6, f19, f23, f6 FMA4 f9, f17, f24, f9 FMA4 f11, f19, f24, f11 FMA3 f8, f17, f25, f8 FMA3 f10, f19, f25, f10 FMA4 f13, f17, f26, f13 FMA4 f15, f19, f26, f15 FMA3 f12, f17, f27, f12 FMA3 f14, f19, f27, f14 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(16) .align 4 LL(18): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 4 #endif slwi TEMP, r0, 1 + ZBASE_SHIFT slwi r0, r0, 2 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f18, f4 FSUB f5, f19, f5 LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f8, f20, f8 FSUB f9, f21, f9 FSUB f12, f22, f12 FSUB f13, f23, f13 LFD f24, 8 * SIZE(BO) LFD f25, 9 * SIZE(BO) LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) FSUB f2, f24, f2 FSUB f3, f25, f3 FSUB f6, f26, f6 FSUB f7, f27, f7 LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FSUB f10, f28, f10 FSUB f11, f29, f11 FSUB f14, f30, f14 FSUB f15, f31, f15 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 LFD f24, 8 * SIZE(AO) LFD f25, 9 * SIZE(AO) LFD f26, 10 * SIZE(AO) LFD f27, 11 * SIZE(AO) FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f10, f26, f10 FSUB f11, f27, f11 LFD f28, 12 * SIZE(AO) LFD f29, 13 * SIZE(AO) LFD f30, 14 * SIZE(AO) LFD f31, 15 * SIZE(AO) FSUB f12, f28, f12 FSUB f13, f29, f13 FSUB f14, f30, f14 FSUB f15, f31, f15 #endif #ifdef LN LFD f24, 6 * SIZE(AO) LFD f25, 7 * SIZE(AO) LFD f26, 4 * SIZE(AO) LFD f27, 5 * SIZE(AO) LFD f28, 0 * SIZE(AO) LFD f29, 1 * SIZE(AO) FMUL f16, f25, f3 FMUL f17, f25, f2 FMUL f18, f25, f7 FMUL f19, f25, f6 FMUL f20, f25, f11 FMUL f21, f25, f10 FMUL f22, f25, f15 FMUL f23, f25, f14 #ifndef CONJ FMSUB f2, f24, f2, f16 FMADD f3, f24, f3, f17 FMSUB f6, f24, f6, f18 FMADD f7, f24, f7, f19 FMSUB f10, f24, f10, f20 FMADD f11, f24, f11, f21 FMSUB f14, f24, f14, f22 FMADD f15, f24, f15, f23 FMADD f0, f27, f3, f0 FNMSUB f1, f27, f2, f1 FMADD f4, f27, f7, f4 FNMSUB f5, f27, f6, f5 FMADD f8, f27, f11, f8 FNMSUB f9, f27, f10, f9 FMADD f12, f27, f15, f12 FNMSUB f13, f27, f14, f13 FNMSUB f0, f26, f2, f0 FNMSUB f1, f26, f3, f1 FNMSUB f4, f26, f6, f4 FNMSUB f5, f26, f7, f5 FNMSUB f8, f26, f10, f8 FNMSUB f9, f26, f11, f9 FNMSUB f12, f26, f14, f12 FNMSUB f13, f26, f15, f13 FMUL f16, f29, f1 FMUL f17, f29, 
f0 FMUL f18, f29, f5 FMUL f19, f29, f4 FMUL f20, f29, f9 FMUL f21, f29, f8 FMUL f22, f29, f13 FMUL f23, f29, f12 FMSUB f0, f28, f0, f16 FMADD f1, f28, f1, f17 FMSUB f4, f28, f4, f18 FMADD f5, f28, f5, f19 FMSUB f8, f28, f8, f20 FMADD f9, f28, f9, f21 FMSUB f12, f28, f12, f22 FMADD f13, f28, f13, f23 #else FMADD f2, f24, f2, f16 FMSUB f3, f24, f3, f17 FMADD f6, f24, f6, f18 FMSUB f7, f24, f7, f19 FMADD f10, f24, f10, f20 FMSUB f11, f24, f11, f21 FMADD f14, f24, f14, f22 FMSUB f15, f24, f15, f23 FMSUB f0, f27, f3, f0 FNMADD f1, f27, f2, f1 FMSUB f4, f27, f7, f4 FNMADD f5, f27, f6, f5 FMSUB f8, f27, f11, f8 FNMADD f9, f27, f10, f9 FMSUB f12, f27, f15, f12 FNMADD f13, f27, f14, f13 FNMADD f0, f26, f2, f0 FNMADD f1, f26, f3, f1 FNMADD f4, f26, f6, f4 FNMADD f5, f26, f7, f5 FNMADD f8, f26, f10, f8 FNMADD f9, f26, f11, f9 FNMADD f12, f26, f14, f12 FNMADD f13, f26, f15, f13 FMUL f16, f29, f1 FMUL f17, f29, f0 FMUL f18, f29, f5 FMUL f19, f29, f4 FMUL f20, f29, f9 FMUL f21, f29, f8 FMUL f22, f29, f13 FMUL f23, f29, f12 FMADD f0, f28, f0, f16 FMSUB f1, f28, f1, f17 FMADD f4, f28, f4, f18 FMSUB f5, f28, f5, f19 FMADD f8, f28, f8, f20 FMSUB f9, f28, f9, f21 FMADD f12, f28, f12, f22 FMSUB f13, f28, f13, f23 #endif #endif #ifdef LT LFD f24, 0 * SIZE(AO) LFD f25, 1 * SIZE(AO) LFD f26, 2 * SIZE(AO) LFD f27, 3 * SIZE(AO) LFD f28, 6 * SIZE(AO) LFD f29, 7 * SIZE(AO) FMUL f16, f25, f1 FMUL f17, f25, f0 FMUL f18, f25, f5 FMUL f19, f25, f4 FMUL f20, f25, f9 FMUL f21, f25, f8 FMUL f22, f25, f13 FMUL f23, f25, f12 #ifndef CONJ FMSUB f0, f24, f0, f16 FMADD f1, f24, f1, f17 FMSUB f4, f24, f4, f18 FMADD f5, f24, f5, f19 FMSUB f8, f24, f8, f20 FMADD f9, f24, f9, f21 FMSUB f12, f24, f12, f22 FMADD f13, f24, f13, f23 FMADD f2, f27, f1, f2 FNMSUB f3, f27, f0, f3 FMADD f6, f27, f5, f6 FNMSUB f7, f27, f4, f7 FMADD f10, f27, f9, f10 FNMSUB f11, f27, f8, f11 FMADD f14, f27, f13, f14 FNMSUB f15, f27, f12, f15 FNMSUB f2, f26, f0, f2 FNMSUB f3, f26, f1, f3 FNMSUB f6, f26, f4, f6 FNMSUB f7, f26, f5, f7 FNMSUB f10, f26, f8, f10 FNMSUB f11, f26, f9, f11 FNMSUB f14, f26, f12, f14 FNMSUB f15, f26, f13, f15 FMUL f16, f29, f3 FMUL f17, f29, f2 FMUL f18, f29, f7 FMUL f19, f29, f6 FMUL f20, f29, f11 FMUL f21, f29, f10 FMUL f22, f29, f15 FMUL f23, f29, f14 FMSUB f2, f28, f2, f16 FMADD f3, f28, f3, f17 FMSUB f6, f28, f6, f18 FMADD f7, f28, f7, f19 FMSUB f10, f28, f10, f20 FMADD f11, f28, f11, f21 FMSUB f14, f28, f14, f22 FMADD f15, f28, f15, f23 #else FMADD f0, f24, f0, f16 FMSUB f1, f24, f1, f17 FMADD f4, f24, f4, f18 FMSUB f5, f24, f5, f19 FMADD f8, f24, f8, f20 FMSUB f9, f24, f9, f21 FMADD f12, f24, f12, f22 FMSUB f13, f24, f13, f23 FMSUB f2, f27, f1, f2 FNMADD f3, f27, f0, f3 FMSUB f6, f27, f5, f6 FNMADD f7, f27, f4, f7 FMSUB f10, f27, f9, f10 FNMADD f11, f27, f8, f11 FMSUB f14, f27, f13, f14 FNMADD f15, f27, f12, f15 FNMADD f2, f26, f0, f2 FNMADD f3, f26, f1, f3 FNMADD f6, f26, f4, f6 FNMADD f7, f26, f5, f7 FNMADD f10, f26, f8, f10 FNMADD f11, f26, f9, f11 FNMADD f14, f26, f12, f14 FNMADD f15, f26, f13, f15 FMUL f16, f29, f3 FMUL f17, f29, f2 FMUL f18, f29, f7 FMUL f19, f29, f6 FMUL f20, f29, f11 FMUL f21, f29, f10 FMUL f22, f29, f15 FMUL f23, f29, f14 FMADD f2, f28, f2, f16 FMSUB f3, f28, f3, f17 FMADD f6, f28, f6, f18 FMSUB f7, f28, f7, f19 FMADD f10, f28, f10, f20 FMSUB f11, f28, f11, f21 FMADD f14, f28, f14, f22 FMSUB f15, f28, f15, f23 #endif #endif #ifdef RN LFD f24, 0 * SIZE(BO) LFD f25, 1 * SIZE(BO) LFD f26, 2 * SIZE(BO) LFD f27, 3 * SIZE(BO) LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) LFD f31, 7 * 
SIZE(BO) FMUL f16, f25, f1 FMUL f17, f25, f0 FMUL f18, f25, f3 FMUL f19, f25, f2 #ifndef CONJ FMSUB f0, f24, f0, f16 FMADD f1, f24, f1, f17 FMSUB f2, f24, f2, f18 FMADD f3, f24, f3, f19 FMADD f4, f27, f1, f4 FNMSUB f5, f27, f0, f5 FMADD f6, f27, f3, f6 FNMSUB f7, f27, f2, f7 FNMSUB f4, f26, f0, f4 FNMSUB f5, f26, f1, f5 FNMSUB f6, f26, f2, f6 FNMSUB f7, f26, f3, f7 FMADD f8, f29, f1, f8 FNMSUB f9, f29, f0, f9 FMADD f10, f29, f3, f10 FNMSUB f11, f29, f2, f11 FNMSUB f8, f28, f0, f8 FNMSUB f9, f28, f1, f9 FNMSUB f10, f28, f2, f10 FNMSUB f11, f28, f3, f11 FMADD f12, f31, f1, f12 FNMSUB f13, f31, f0, f13 FMADD f14, f31, f3, f14 FNMSUB f15, f31, f2, f15 FNMSUB f12, f30, f0, f12 FNMSUB f13, f30, f1, f13 FNMSUB f14, f30, f2, f14 FNMSUB f15, f30, f3, f15 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMUL f18, f27, f7 FMUL f19, f27, f6 FMSUB f4, f26, f4, f16 FMADD f5, f26, f5, f17 FMSUB f6, f26, f6, f18 FMADD f7, f26, f7, f19 FMADD f8, f29, f5, f8 FNMSUB f9, f29, f4, f9 FMADD f10, f29, f7, f10 FNMSUB f11, f29, f6, f11 FNMSUB f8, f28, f4, f8 FNMSUB f9, f28, f5, f9 FNMSUB f10, f28, f6, f10 FNMSUB f11, f28, f7, f11 FMADD f12, f31, f5, f12 FNMSUB f13, f31, f4, f13 FMADD f14, f31, f7, f14 FNMSUB f15, f31, f6, f15 FNMSUB f12, f30, f4, f12 FNMSUB f13, f30, f5, f13 FNMSUB f14, f30, f6, f14 FNMSUB f15, f30, f7, f15 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 22 * SIZE(BO) LFD f29, 23 * SIZE(BO) LFD f30, 30 * SIZE(BO) LFD f31, 31 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMUL f18, f27, f11 FMUL f19, f27, f10 FMSUB f8, f26, f8, f16 FMADD f9, f26, f9, f17 FMSUB f10, f26, f10, f18 FMADD f11, f26, f11, f19 FMADD f12, f29, f9, f12 FNMSUB f13, f29, f8, f13 FMADD f14, f29, f11, f14 FNMSUB f15, f29, f10, f15 FNMSUB f12, f28, f8, f12 FNMSUB f13, f28, f9, f13 FNMSUB f14, f28, f10, f14 FNMSUB f15, f28, f11, f15 FMUL f16, f31, f13 FMUL f17, f31, f12 FMUL f18, f31, f15 FMUL f19, f31, f14 FMSUB f12, f30, f12, f16 FMADD f13, f30, f13, f17 FMSUB f14, f30, f14, f18 FMADD f15, f30, f15, f19 #else FMADD f0, f24, f0, f16 FMSUB f1, f24, f1, f17 FMADD f2, f24, f2, f18 FMSUB f3, f24, f3, f19 FMSUB f4, f27, f1, f4 FNMADD f5, f27, f0, f5 FMSUB f6, f27, f3, f6 FNMADD f7, f27, f2, f7 FNMADD f4, f26, f0, f4 FNMADD f5, f26, f1, f5 FNMADD f6, f26, f2, f6 FNMADD f7, f26, f3, f7 FMSUB f8, f29, f1, f8 FNMADD f9, f29, f0, f9 FMSUB f10, f29, f3, f10 FNMADD f11, f29, f2, f11 FNMADD f8, f28, f0, f8 FNMADD f9, f28, f1, f9 FNMADD f10, f28, f2, f10 FNMADD f11, f28, f3, f11 FMSUB f12, f31, f1, f12 FNMADD f13, f31, f0, f13 FMSUB f14, f31, f3, f14 FNMADD f15, f31, f2, f15 FNMADD f12, f30, f0, f12 FNMADD f13, f30, f1, f13 FNMADD f14, f30, f2, f14 FNMADD f15, f30, f3, f15 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMUL f18, f27, f7 FMUL f19, f27, f6 FMADD f4, f26, f4, f16 FMSUB f5, f26, f5, f17 FMADD f6, f26, f6, f18 FMSUB f7, f26, f7, f19 FMSUB f8, f29, f5, f8 FNMADD f9, f29, f4, f9 FMSUB f10, f29, f7, f10 FNMADD f11, f29, f6, f11 FNMADD f8, f28, f4, f8 FNMADD f9, f28, f5, f9 FNMADD f10, f28, f6, f10 FNMADD f11, f28, f7, f11 FMSUB f12, f31, f5, f12 FNMADD f13, f31, f4, f13 FMSUB f14, f31, f7, f14 FNMADD f15, f31, f6, f15 FNMADD f12, f30, f4, f12 FNMADD f13, f30, f5, f13 FNMADD f14, f30, f6, f14 FNMADD f15, f30, f7, f15 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 22 * 
SIZE(BO) LFD f29, 23 * SIZE(BO) LFD f30, 30 * SIZE(BO) LFD f31, 31 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMUL f18, f27, f11 FMUL f19, f27, f10 FMADD f8, f26, f8, f16 FMSUB f9, f26, f9, f17 FMADD f10, f26, f10, f18 FMSUB f11, f26, f11, f19 FMSUB f12, f29, f9, f12 FNMADD f13, f29, f8, f13 FMSUB f14, f29, f11, f14 FNMADD f15, f29, f10, f15 FNMADD f12, f28, f8, f12 FNMADD f13, f28, f9, f13 FNMADD f14, f28, f10, f14 FNMADD f15, f28, f11, f15 FMUL f16, f31, f13 FMUL f17, f31, f12 FMUL f18, f31, f15 FMUL f19, f31, f14 FMADD f12, f30, f12, f16 FMSUB f13, f30, f13, f17 FMADD f14, f30, f14, f18 FMSUB f15, f30, f15, f19 #endif #endif #ifdef RT LFD f24, 30 * SIZE(BO) LFD f25, 31 * SIZE(BO) LFD f26, 28 * SIZE(BO) LFD f27, 29 * SIZE(BO) LFD f28, 26 * SIZE(BO) LFD f29, 27 * SIZE(BO) LFD f30, 24 * SIZE(BO) LFD f31, 25 * SIZE(BO) FMUL f16, f25, f13 FMUL f17, f25, f12 FMUL f18, f25, f15 FMUL f19, f25, f14 #ifndef CONJ FMSUB f12, f24, f12, f16 FMADD f13, f24, f13, f17 FMSUB f14, f24, f14, f18 FMADD f15, f24, f15, f19 FMADD f8, f27, f13, f8 FNMSUB f9, f27, f12, f9 FMADD f10, f27, f15, f10 FNMSUB f11, f27, f14, f11 FNMSUB f8, f26, f12, f8 FNMSUB f9, f26, f13, f9 FNMSUB f10, f26, f14, f10 FNMSUB f11, f26, f15, f11 FMADD f4, f29, f13, f4 FNMSUB f5, f29, f12, f5 FMADD f6, f29, f15, f6 FNMSUB f7, f29, f14, f7 FNMSUB f4, f28, f12, f4 FNMSUB f5, f28, f13, f5 FNMSUB f6, f28, f14, f6 FNMSUB f7, f28, f15, f7 FMADD f0, f31, f13, f0 FNMSUB f1, f31, f12, f1 FMADD f2, f31, f15, f2 FNMSUB f3, f31, f14, f3 FNMSUB f0, f30, f12, f0 FNMSUB f1, f30, f13, f1 FNMSUB f2, f30, f14, f2 FNMSUB f3, f30, f15, f3 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 18 * SIZE(BO) LFD f29, 19 * SIZE(BO) LFD f30, 16 * SIZE(BO) LFD f31, 17 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMUL f18, f27, f11 FMUL f19, f27, f10 FMSUB f8, f26, f8, f16 FMADD f9, f26, f9, f17 FMSUB f10, f26, f10, f18 FMADD f11, f26, f11, f19 FMADD f4, f29, f9, f4 FNMSUB f5, f29, f8, f5 FMADD f6, f29, f11, f6 FNMSUB f7, f29, f10, f7 FNMSUB f4, f28, f8, f4 FNMSUB f5, f28, f9, f5 FNMSUB f6, f28, f10, f6 FNMSUB f7, f28, f11, f7 FMADD f0, f31, f9, f0 FNMSUB f1, f31, f8, f1 FMADD f2, f31, f11, f2 FNMSUB f3, f31, f10, f3 FNMSUB f0, f30, f8, f0 FNMSUB f1, f30, f9, f1 FNMSUB f2, f30, f10, f2 FNMSUB f3, f30, f11, f3 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 8 * SIZE(BO) LFD f29, 9 * SIZE(BO) LFD f30, 0 * SIZE(BO) LFD f31, 1 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMUL f18, f27, f7 FMUL f19, f27, f6 FMSUB f4, f26, f4, f16 FMADD f5, f26, f5, f17 FMSUB f6, f26, f6, f18 FMADD f7, f26, f7, f19 FMADD f0, f29, f5, f0 FNMSUB f1, f29, f4, f1 FMADD f2, f29, f7, f2 FNMSUB f3, f29, f6, f3 FNMSUB f0, f28, f4, f0 FNMSUB f1, f28, f5, f1 FNMSUB f2, f28, f6, f2 FNMSUB f3, f28, f7, f3 FMUL f16, f31, f1 FMUL f17, f31, f0 FMUL f18, f31, f3 FMUL f19, f31, f2 FMSUB f0, f30, f0, f16 FMADD f1, f30, f1, f17 FMSUB f2, f30, f2, f18 FMADD f3, f30, f3, f19 #else FMADD f12, f24, f12, f16 FMSUB f13, f24, f13, f17 FMADD f14, f24, f14, f18 FMSUB f15, f24, f15, f19 FMSUB f8, f27, f13, f8 FNMADD f9, f27, f12, f9 FMSUB f10, f27, f15, f10 FNMADD f11, f27, f14, f11 FNMADD f8, f26, f12, f8 FNMADD f9, f26, f13, f9 FNMADD f10, f26, f14, f10 FNMADD f11, f26, f15, f11 FMSUB f4, f29, f13, f4 FNMADD f5, f29, f12, f5 FMSUB f6, f29, f15, f6 FNMADD f7, f29, f14, f7 FNMADD f4, f28, f12, f4 FNMADD f5, f28, f13, f5 FNMADD f6, f28, f14, f6 FNMADD f7, f28, f15, f7 FMSUB f0, f31, f13, f0 FNMADD f1, f31, f12, f1 FMSUB f2, f31, f15, f2 FNMADD f3, f31, f14, f3 FNMADD f0, f30, f12, f0 FNMADD f1, f30, 
f13, f1 FNMADD f2, f30, f14, f2 FNMADD f3, f30, f15, f3 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 18 * SIZE(BO) LFD f29, 19 * SIZE(BO) LFD f30, 16 * SIZE(BO) LFD f31, 17 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMUL f18, f27, f11 FMUL f19, f27, f10 FMADD f8, f26, f8, f16 FMSUB f9, f26, f9, f17 FMADD f10, f26, f10, f18 FMSUB f11, f26, f11, f19 FMSUB f4, f29, f9, f4 FNMADD f5, f29, f8, f5 FMSUB f6, f29, f11, f6 FNMADD f7, f29, f10, f7 FNMADD f4, f28, f8, f4 FNMADD f5, f28, f9, f5 FNMADD f6, f28, f10, f6 FNMADD f7, f28, f11, f7 FMSUB f0, f31, f9, f0 FNMADD f1, f31, f8, f1 FMSUB f2, f31, f11, f2 FNMADD f3, f31, f10, f3 FNMADD f0, f30, f8, f0 FNMADD f1, f30, f9, f1 FNMADD f2, f30, f10, f2 FNMADD f3, f30, f11, f3 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 8 * SIZE(BO) LFD f29, 9 * SIZE(BO) LFD f30, 0 * SIZE(BO) LFD f31, 1 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMUL f18, f27, f7 FMUL f19, f27, f6 FMADD f4, f26, f4, f16 FMSUB f5, f26, f5, f17 FMADD f6, f26, f6, f18 FMSUB f7, f26, f7, f19 FMSUB f0, f29, f5, f0 FNMADD f1, f29, f4, f1 FMSUB f2, f29, f7, f2 FNMADD f3, f29, f6, f3 FNMADD f0, f28, f4, f0 FNMADD f1, f28, f5, f1 FNMADD f2, f28, f6, f2 FNMADD f3, f28, f7, f3 FMUL f16, f31, f1 FMUL f17, f31, f0 FMUL f18, f31, f3 FMUL f19, f31, f2 FMADD f0, f30, f0, f16 FMSUB f1, f30, f1, f17 FMADD f2, f30, f2, f18 FMSUB f3, f30, f3, f19 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE subi CO3, CO3, 4 * SIZE subi CO4, CO4, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f4, 2 * SIZE(BO) STFD f5, 3 * SIZE(BO) STFD f8, 4 * SIZE(BO) STFD f9, 5 * SIZE(BO) STFD f12, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) STFD f2, 8 * SIZE(BO) STFD f3, 9 * SIZE(BO) STFD f6, 10 * SIZE(BO) STFD f7, 11 * SIZE(BO) STFD f10, 12 * SIZE(BO) STFD f11, 13 * SIZE(BO) STFD f14, 14 * SIZE(BO) STFD f15, 15 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) STFD f8, 8 * SIZE(AO) STFD f9, 9 * SIZE(AO) STFD f10, 10 * SIZE(AO) STFD f11, 11 * SIZE(AO) STFD f12, 12 * SIZE(AO) STFD f13, 13 * SIZE(AO) STFD f14, 14 * SIZE(AO) STFD f15, 15 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f10, 2 * SIZE(CO3) STFD f11, 3 * SIZE(CO3) fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) STFD f14, 2 * SIZE(CO4) STFD f15, 3 * SIZE(CO4) fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 2 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt LL(11) .align 4 LL(20): andi. 
I, M, 1 ble LL(29) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 2 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(25) .align 4 LL(22): FMA1 f0, f16, f20, f0 FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 FMA3 f2, f17, f21, f2 LFD f28, 4 * SIZE(AO) LFD f29, 5 * SIZE(AO) LFD f30, 6 * SIZE(AO) LFD f31, 7 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 FMA3 f6, f17, f23, f6 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMA1 f8, f16, f24, f8 FMA4 f11, f17, f24, f11 FMA2 f9, f16, f25, f9 FMA3 f10, f17, f25, f10 FMA1 f12, f16, f26, f12 FMA4 f15, f17, f26, f15 FMA2 f13, f16, f27, f13 FMA3 f14, f17, f27, f14 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMA1 f0, f18, f20, f0 FMA4 f3, f19, f20, f3 FMA2 f1, f18, f21, f1 FMA3 f2, f19, f21, f2 FMA1 f4, f18, f22, f4 FMA4 f7, f19, f22, f7 FMA2 f5, f18, f23, f5 FMA3 f6, f19, f23, f6 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMA1 f8, f18, f24, f8 FMA4 f11, f19, f24, f11 FMA2 f9, f18, f25, f9 FMA3 f10, f19, f25, f10 FMA1 f12, f18, f26, f12 FMA4 f15, f19, f26, f15 FMA2 f13, f18, f27, f13 FMA3 f14, f19, f27, f14 LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) FMA1 f0, f28, f20, f0 FMA4 f3, f29, f20, f3 FMA2 f1, f28, f21, f1 FMA3 f2, f29, f21, f2 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMA1 f4, f28, f22, f4 FMA4 f7, f29, f22, f7 FMA2 f5, f28, f23, f5 FMA3 f6, f29, f23, f6 LFD f20, 24 * SIZE(BO) LFD f21, 25 * SIZE(BO) LFD f22, 26 * SIZE(BO) LFD f23, 27 * SIZE(BO) FMA1 f8, f28, f24, f8 FMA4 f11, f29, f24, f11 FMA2 f9, f28, f25, f9 FMA3 f10, f29, f25, f10 FMA1 f12, f28, f26, f12 FMA4 f15, f29, f26, f15 FMA2 f13, f28, f27, f13 FMA3 f14, f29, f27, f14 LFD f24, 28 * SIZE(BO) LFD f25, 29 * SIZE(BO) LFD f26, 30 * SIZE(BO) LFD f27, 31 * SIZE(BO) FMA1 f0, f30, f20, f0 FMA4 f3, f31, f20, f3 FMA2 f1, f30, f21, f1 FMA3 f2, f31, f21, f2 FMA1 f4, f30, f22, f4 FMA4 f7, f31, f22, f7 FMA2 f5, f30, f23, f5 FMA3 f6, f31, f23, f6 LFD f20, 32 * SIZE(BO) LFD f21, 33 * SIZE(BO) LFD f22, 34 * SIZE(BO) LFD f23, 35 * SIZE(BO) FMA1 f8, f30, f24, f8 FMA4 f11, f31, f24, f11 FMA2 f9, f30, f25, f9 FMA3 f10, f31, f25, f10 FMA1 f12, f30, f26, f12 FMA4 f15, f31, f26, f15 FMA2 f13, f30, f27, f13 FMA3 f14, f31, f27, f14 LFD f24, 36 * SIZE(BO) LFD f25, 37 * SIZE(BO) LFD f26, 38 * SIZE(BO) LFD f27, 39 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 32 * SIZE bdnz LL(22) .align 4 LL(25): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(27) .align 4 LL(26): FMA1 f0, f16, f20, f0 FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 FMA3 f2, f17, f21, f2 FMA1 f4, f16, f22, f4 FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 FMA3 f6, f17, f23, f6 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMA1 f8, f16, f24, f8 FMA4 f11, f17, f24, f11 FMA2 f9, f16, f25, f9 FMA3 f10, f17, f25, f10 FMA1 f12, f16, f26, f12 FMA4 f15, f17, f26, f15 FMA2 f13, f16, f27, f13 FMA3 f14, f17, f27, f14 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 8 * SIZE bdnz LL(26) .align 4 LL(27): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 4 #endif slwi TEMP, r0, 0 + ZBASE_SHIFT slwi r0, r0, 2 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif FADD f0, f0, f2 FADD f1, f1, f3 FADD f4, f4, f6 FADD f5, f5, f7 FADD f8, f8, f10 FADD f9, f9, f11 FADD f12, f12, f14 FADD f13, f13, f15 #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f18, f4 FSUB f5, f19, f5 LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f8, f20, f8 FSUB f9, f21, f9 FSUB f12, f22, f12 FSUB f13, f23, f13 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f20, f4 FSUB f5, f21, f5 LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f28, 6 * SIZE(AO) LFD f29, 7 * SIZE(AO) FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f12, f28, f12 FSUB f13, f29, f13 #endif #ifdef LN LFD f28, 0 * SIZE(AO) LFD f29, 1 * SIZE(AO) FMUL f16, f29, f1 FMUL f17, f29, f0 FMUL f18, f29, f5 FMUL f19, f29, f4 FMUL f20, f29, f9 FMUL f21, f29, f8 FMUL f22, f29, f13 FMUL f23, f29, f12 #ifndef CONJ FMSUB f0, f28, f0, f16 FMADD f1, f28, f1, f17 FMSUB f4, f28, f4, f18 FMADD f5, f28, f5, f19 FMSUB f8, f28, f8, f20 FMADD f9, f28, f9, f21 FMSUB f12, f28, f12, f22 FMADD f13, f28, f13, f23 #else FMADD f0, f28, f0, f16 FMSUB f1, f28, f1, f17 FMADD f4, f28, f4, f18 FMSUB f5, f28, f5, f19 FMADD f8, f28, f8, f20 FMSUB f9, f28, f9, f21 FMADD f12, f28, f12, f22 FMSUB f13, f28, f13, f23 #endif #endif #ifdef LT LFD f24, 0 * SIZE(AO) LFD f25, 1 * SIZE(AO) FMUL f16, f25, f1 FMUL f17, f25, f0 FMUL f18, f25, f5 FMUL f19, f25, f4 FMUL f20, f25, f9 FMUL f21, f25, f8 FMUL f22, f25, f13 FMUL f23, f25, f12 #ifndef CONJ FMSUB f0, f24, f0, f16 FMADD f1, f24, f1, f17 FMSUB f4, f24, f4, f18 FMADD f5, f24, f5, f19 FMSUB f8, f24, f8, f20 FMADD f9, f24, f9, f21 FMSUB f12, f24, f12, f22 FMADD f13, f24, f13, f23 #else FMADD f0, f24, f0, f16 FMSUB f1, f24, f1, f17 FMADD f4, f24, f4, f18 FMSUB f5, f24, f5, f19 FMADD f8, f24, f8, f20 FMSUB f9, f24, f9, f21 FMADD f12, f24, f12, f22 FMSUB f13, f24, f13, f23 #endif #endif #ifdef RN LFD f24, 0 * SIZE(BO) LFD f25, 1 * SIZE(BO) LFD f26, 2 * SIZE(BO) LFD f27, 3 * SIZE(BO) LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) LFD f31, 7 * SIZE(BO) FMUL f16, f25, f1 FMUL f17, f25, f0 #ifndef CONJ FMSUB f0, f24, f0, f16 FMADD f1, f24, f1, f17 FMADD f4, f27, f1, f4 FNMSUB f5, f27, f0, f5 FNMSUB f4, f26, f0, f4 FNMSUB f5, f26, f1, f5 FMADD f8, f29, f1, f8 FNMSUB f9, f29, f0, f9 FNMSUB f8, f28, f0, f8 FNMSUB f9, f28, f1, f9 FMADD f12, f31, f1, f12 FNMSUB f13, f31, f0, f13 FNMSUB f12, f30, f0, f12 FNMSUB f13, f30, f1, f13 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD 
f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMSUB f4, f26, f4, f16 FMADD f5, f26, f5, f17 FMADD f8, f29, f5, f8 FNMSUB f9, f29, f4, f9 FNMSUB f8, f28, f4, f8 FNMSUB f9, f28, f5, f9 FMADD f12, f31, f5, f12 FNMSUB f13, f31, f4, f13 FNMSUB f12, f30, f4, f12 FNMSUB f13, f30, f5, f13 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 22 * SIZE(BO) LFD f29, 23 * SIZE(BO) LFD f30, 30 * SIZE(BO) LFD f31, 31 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMSUB f8, f26, f8, f16 FMADD f9, f26, f9, f17 FMADD f12, f29, f9, f12 FNMSUB f13, f29, f8, f13 FNMSUB f12, f28, f8, f12 FNMSUB f13, f28, f9, f13 FMUL f16, f31, f13 FMUL f17, f31, f12 FMSUB f12, f30, f12, f16 FMADD f13, f30, f13, f17 #else FMADD f0, f24, f0, f16 FMSUB f1, f24, f1, f17 FMSUB f4, f27, f1, f4 FNMADD f5, f27, f0, f5 FNMADD f4, f26, f0, f4 FNMADD f5, f26, f1, f5 FMSUB f8, f29, f1, f8 FNMADD f9, f29, f0, f9 FNMADD f8, f28, f0, f8 FNMADD f9, f28, f1, f9 FMSUB f12, f31, f1, f12 FNMADD f13, f31, f0, f13 FNMADD f12, f30, f0, f12 FNMADD f13, f30, f1, f13 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMADD f4, f26, f4, f16 FMSUB f5, f26, f5, f17 FMSUB f8, f29, f5, f8 FNMADD f9, f29, f4, f9 FNMADD f8, f28, f4, f8 FNMADD f9, f28, f5, f9 FMSUB f12, f31, f5, f12 FNMADD f13, f31, f4, f13 FNMADD f12, f30, f4, f12 FNMADD f13, f30, f5, f13 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 22 * SIZE(BO) LFD f29, 23 * SIZE(BO) LFD f30, 30 * SIZE(BO) LFD f31, 31 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMADD f8, f26, f8, f16 FMSUB f9, f26, f9, f17 FMSUB f12, f29, f9, f12 FNMADD f13, f29, f8, f13 FNMADD f12, f28, f8, f12 FNMADD f13, f28, f9, f13 FMUL f16, f31, f13 FMUL f17, f31, f12 FMADD f12, f30, f12, f16 FMSUB f13, f30, f13, f17 #endif #endif #ifdef RT LFD f24, 30 * SIZE(BO) LFD f25, 31 * SIZE(BO) LFD f26, 28 * SIZE(BO) LFD f27, 29 * SIZE(BO) LFD f28, 26 * SIZE(BO) LFD f29, 27 * SIZE(BO) LFD f30, 24 * SIZE(BO) LFD f31, 25 * SIZE(BO) FMUL f16, f25, f13 FMUL f17, f25, f12 #ifndef CONJ FMSUB f12, f24, f12, f16 FMADD f13, f24, f13, f17 FMADD f8, f27, f13, f8 FNMSUB f9, f27, f12, f9 FNMSUB f8, f26, f12, f8 FNMSUB f9, f26, f13, f9 FMADD f4, f29, f13, f4 FNMSUB f5, f29, f12, f5 FNMSUB f4, f28, f12, f4 FNMSUB f5, f28, f13, f5 FMADD f0, f31, f13, f0 FNMSUB f1, f31, f12, f1 FNMSUB f0, f30, f12, f0 FNMSUB f1, f30, f13, f1 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 18 * SIZE(BO) LFD f29, 19 * SIZE(BO) LFD f30, 16 * SIZE(BO) LFD f31, 17 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMSUB f8, f26, f8, f16 FMADD f9, f26, f9, f17 FMADD f4, f29, f9, f4 FNMSUB f5, f29, f8, f5 FNMSUB f4, f28, f8, f4 FNMSUB f5, f28, f9, f5 FMADD f0, f31, f9, f0 FNMSUB f1, f31, f8, f1 FNMSUB f0, f30, f8, f0 FNMSUB f1, f30, f9, f1 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 8 * SIZE(BO) LFD f29, 9 * SIZE(BO) LFD f30, 0 * SIZE(BO) LFD f31, 1 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMSUB f4, f26, f4, f16 FMADD f5, f26, f5, f17 FMADD f0, f29, f5, f0 FNMSUB f1, f29, f4, f1 FNMSUB f0, f28, f4, f0 FNMSUB f1, f28, f5, f1 FMUL f16, f31, f1 FMUL f17, f31, f0 FMSUB f0, f30, f0, f16 FMADD f1, f30, f1, f17 #else FMADD f12, f24, f12, f16 FMSUB f13, f24, f13, f17 FMSUB f8, f27, f13, f8 FNMADD f9, f27, f12, f9 FNMADD f8, f26, f12, f8 FNMADD f9, f26, f13, f9 FMSUB f4, f29, f13, f4 FNMADD f5, f29, f12, f5 FNMADD f4, f28, f12, f4 FNMADD f5, f28, f13, f5 FMSUB 
f0, f31, f13, f0 FNMADD f1, f31, f12, f1 FNMADD f0, f30, f12, f0 FNMADD f1, f30, f13, f1 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 18 * SIZE(BO) LFD f29, 19 * SIZE(BO) LFD f30, 16 * SIZE(BO) LFD f31, 17 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMADD f8, f26, f8, f16 FMSUB f9, f26, f9, f17 FMSUB f4, f29, f9, f4 FNMADD f5, f29, f8, f5 FNMADD f4, f28, f8, f4 FNMADD f5, f28, f9, f5 FMSUB f0, f31, f9, f0 FNMADD f1, f31, f8, f1 FNMADD f0, f30, f8, f0 FNMADD f1, f30, f9, f1 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 8 * SIZE(BO) LFD f29, 9 * SIZE(BO) LFD f30, 0 * SIZE(BO) LFD f31, 1 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMADD f4, f26, f4, f16 FMSUB f5, f26, f5, f17 FMSUB f0, f29, f5, f0 FNMADD f1, f29, f4, f1 FNMADD f0, f28, f4, f0 FNMADD f1, f28, f5, f1 FMUL f16, f31, f1 FMUL f17, f31, f0 FMADD f0, f30, f0, f16 FMSUB f1, f30, f1, f17 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE subi CO3, CO3, 2 * SIZE subi CO4, CO4, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f4, 2 * SIZE(BO) STFD f5, 3 * SIZE(BO) STFD f8, 4 * SIZE(BO) STFD f9, 5 * SIZE(BO) STFD f12, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f4, 2 * SIZE(AO) STFD f5, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f12, 6 * SIZE(AO) STFD f13, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 2 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 LL(29): #ifdef LN slwi r0, K, 2 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 4 #endif #ifdef RT subi KK, KK, 4 #endif addic. J, J, -1 bgt LL(10) .align 4 LL(30): andi. J, N, 2 ble LL(50) .align 4 #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif srawi. I, M, 1 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif ble LL(40) .align 4 LL(31): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 dcbtst CO1, PREC dcbtst CO2, PREC srawi. 
r0, KK, 3 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 dcbtst CO1, PREC dcbtst CO2, PREC srawi. r0, TEMP, 3 mtspr CTR, r0 #endif ble LL(35) .align 4 LL(32): dcbt AO, PREA dcbtst BO, PREA FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 4 * SIZE(AO) LFD f28, 4 * SIZE(BO) LFD f25, 5 * SIZE(AO) LFD f29, 5 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 6 * SIZE(AO) LFD f30, 6 * SIZE(BO) LFD f27, 7 * SIZE(AO) LFD f31, 7 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 8 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f17, 9 * SIZE(AO) LFD f21, 9 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 10 * SIZE(AO) LFD f22, 10 * SIZE(BO) LFD f19, 11 * SIZE(AO) LFD f23, 11 * SIZE(BO) FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 12 * SIZE(AO) LFD f28, 12 * SIZE(BO) LFD f25, 13 * SIZE(AO) LFD f29, 13 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 14 * SIZE(AO) LFD f30, 14 * SIZE(BO) LFD f27, 15 * SIZE(AO) LFD f31, 15 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 16 * SIZE(AO) LFD f20, 16 * SIZE(BO) LFD f17, 17 * SIZE(AO) LFD f21, 17 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 18 * SIZE(AO) LFD f22, 18 * SIZE(BO) LFD f19, 19 * SIZE(AO) LFD f23, 19 * SIZE(BO) FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 20 * SIZE(AO) LFD f28, 20 * SIZE(BO) LFD f25, 21 * SIZE(AO) LFD f29, 21 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 22 * SIZE(AO) LFD f30, 22 * SIZE(BO) LFD f27, 23 * SIZE(AO) LFD f31, 23 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, 
f8 FMADD f12, f24, f31, f12 LFD f16, 24 * SIZE(AO) LFD f20, 24 * SIZE(BO) LFD f17, 25 * SIZE(AO) LFD f21, 25 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 26 * SIZE(AO) LFD f22, 26 * SIZE(BO) LFD f19, 27 * SIZE(AO) LFD f23, 27 * SIZE(BO) FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 28 * SIZE(AO) LFD f28, 28 * SIZE(BO) LFD f25, 29 * SIZE(AO) LFD f29, 29 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 30 * SIZE(AO) LFD f30, 30 * SIZE(BO) LFD f27, 31 * SIZE(AO) LFD f31, 31 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 32 * SIZE(AO) LFD f20, 32 * SIZE(BO) LFD f17, 33 * SIZE(AO) LFD f21, 33 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 34 * SIZE(AO) LFD f22, 34 * SIZE(BO) LFD f19, 35 * SIZE(AO) LFD f23, 35 * SIZE(BO) addi AO, AO, 32 * SIZE addi BO, BO, 32 * SIZE FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 bdnz LL(32) .align 4 LL(35): #if defined(LT) || defined(RN) andi. r0, KK, 7 #else andi. r0, TEMP, 7 #endif mtspr CTR, r0 ble LL(38) .align 4 LL(36): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(36) .align 4 LL(38): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 FSUB f8, f8, f13 FADD f9, f9, f12 FSUB f10, f10, f15 FADD f11, f11, f14 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 FADD f8, f8, f13 FSUB f9, f12, f9 FADD f10, f10, f15 FSUB f11, f14, f11 #endif #if defined(LN) || defined(RT) subi r0, KK, 2 slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f8, f18, f8 FSUB f9, f19, f9 FSUB f2, f20, f2 FSUB f3, f21, f3 FSUB f10, f22, f10 FSUB f11, f23, f11 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f8, f20, f8 FSUB f9, f21, f9 FSUB f10, f22, f10 FSUB f11, f23, f11 #else FSUB 
f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 FSUB f8, f20, f8 FADD f9, f21, f9 FSUB f10, f22, f10 FADD f11, f23, f11 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FMADD f8, f19, f11, f8 FNMSUB f9, f19, f10, f9 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FNMSUB f8, f18, f10, f8 FNMSUB f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f8, f20, f8, f12 FMADD f9, f20, f9, f13 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FMSUB f8, f19, f11, f8 FNMADD f9, f19, f10, f9 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FNMADD f8, f18, f10, f8 FNMADD f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f8, f20, f8, f12 FMSUB f9, f20, f9, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f9 FMUL f13, f17, f8 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FMADD f10, f19, f9, f10 FNMSUB f11, f19, f8, f11 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FNMSUB f10, f18, f8, f10 FNMSUB f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 FMSUB f10, f20, f10, f12 FMADD f11, f20, f11, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FMSUB f10, f19, f9, f10 FNMADD f11, f19, f8, f11 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FNMADD f10, f18, f8, f10 FNMADD f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 FMADD f10, f20, f10, f12 FMSUB f11, f20, f11, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f8, f19, f1, f8 FNMSUB f9, f19, f0, f9 FMADD f10, f19, f3, f10 FNMSUB f11, f19, f2, f11 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMSUB f8, f20, f8, f4 FMADD f9, f20, f9, f5 FMSUB f10, f20, f10, f6 FMADD f11, f20, f11, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f8, f19, f1, f8 FNMADD f9, f19, f0, f9 FMSUB f10, f19, f3, f10 FNMADD f11, f19, f2, f11 FNMADD f8, f18, f0, f8 FNMADD f9, f18, f1, f9 FNMADD f10, f18, f2, f10 FNMADD f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMADD f8, f20, f8, f4 FMSUB f9, f20, f9, f5 FMADD f10, f20, f10, f6 FMSUB f11, f20, f11, f7 #endif #endif #ifdef RT LFD f16, 
6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f9 FMUL f13, f17, f8 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f9, f0 FNMSUB f1, f19, f8, f1 FMADD f2, f19, f11, f2 FNMSUB f3, f19, f10, f3 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f9, f0 FNMADD f1, f19, f8, f1 FMSUB f2, f19, f11, f2 FNMADD f3, f19, f10, f3 FNMADD f0, f18, f8, f0 FNMADD f1, f18, f9, f1 FNMADD f2, f18, f10, f2 FNMADD f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f9, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f3, 5 * SIZE(BO) STFD f10, 6 * SIZE(BO) STFD f11, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f10, 6 * SIZE(AO) STFD f11, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f8, 0 * SIZE(CO2) STFD f9, 1 * SIZE(CO2) STFD f10, 2 * SIZE(CO2) STFD f11, 3 * SIZE(CO2) #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt LL(31) .align 4 LL(40): andi. I, M, 1 ble LL(49) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(45) .align 4 LL(42): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f16, f22, f2 FMADD f3, f16, f23, f3 FMADD f4, f17, f20, f4 FMADD f5, f17, f21, f5 FMADD f6, f17, f22, f6 FMADD f7, f17, f23, f7 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f18, f26, f2 FMADD f3, f18, f27, f3 FMADD f4, f19, f24, f4 FMADD f5, f19, f25, f5 FMADD f6, f19, f26, f6 FMADD f7, f19, f27, f7 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f16, f22, f2 FMADD f3, f16, f23, f3 FMADD f4, f17, f20, f4 FMADD f5, f17, f21, f5 FMADD f6, f17, f22, f6 FMADD f7, f17, f23, f7 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f18, f26, f2 FMADD f3, f18, f27, f3 FMADD f4, f19, f24, f4 FMADD f5, f19, f25, f5 FMADD f6, f19, f26, f6 FMADD f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi BO, BO, 16 * SIZE addi AO, AO, 8 * SIZE bdnz LL(42) .align 4 LL(45): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(47) .align 4 LL(46): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f16, f22, f2 FMADD f3, f16, f23, f3 FMADD f4, f17, f20, f4 FMADD f5, f17, f21, f5 FMADD f6, f17, f22, f6 FMADD f7, f17, f23, f7 LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE bdnz LL(46) .align 4 LL(47): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else #if defined(LN) || defined(LT) FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + ZBASE_SHIFT slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f3 FMUL f13, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f12 FMADD f3, f20, f3, f13 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f12 FMSUB f3, f20, f3, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f3 FMUL f13, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, 
f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f3 FMUL f13, f17, f2 #ifndef CONJ FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 LL(49): #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif .align 4 LL(50): andi. J, N, 1 ble LL(999) #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif srawi. I, M, 1 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, C, LDC #endif ble LL(60) .align 4 LL(51): #if defined(LT) || defined(RN) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 dcbt CO1, PREC srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(55) .align 4 LL(52): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f16, f22, f2 FMADD f3, f16, f23, f3 FMADD f4, f17, f20, f4 FMADD f5, f17, f21, f5 FMADD f6, f17, f22, f6 FMADD f7, f17, f23, f7 LFD f20, 8 * SIZE(AO) LFD f21, 9 * SIZE(AO) LFD f22, 10 * SIZE(AO) LFD f23, 11 * SIZE(AO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f18, f26, f2 FMADD f3, f18, f27, f3 FMADD f4, f19, f24, f4 FMADD f5, f19, f25, f5 FMADD f6, f19, f26, f6 FMADD f7, f19, f27, f7 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) LFD f16, 4 * SIZE(BO) LFD f17, 5 * SIZE(BO) LFD f18, 6 * SIZE(BO) LFD f19, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f16, f22, f2 FMADD f3, f16, f23, f3 FMADD f4, f17, f20, f4 FMADD f5, f17, f21, f5 FMADD f6, f17, f22, f6 FMADD f7, f17, f23, f7 LFD f20, 16 * SIZE(AO) LFD f21, 17 * SIZE(AO) LFD f22, 18 * SIZE(AO) LFD f23, 19 * SIZE(AO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f18, f26, f2 FMADD f3, f18, f27, f3 FMADD f4, f19, f24, f4 FMADD f5, f19, f25, f5 FMADD f6, f19, f26, f6 FMADD f7, f19, f27, f7 LFD f24, 20 * SIZE(AO) LFD f25, 21 * SIZE(AO) LFD f26, 22 * SIZE(AO) LFD f27, 23 * SIZE(AO) LFD f16, 8 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 10 * SIZE(BO) LFD f19, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE dcbt PREA, AO dcbt PREA, BO bdnz LL(52) .align 4 LL(55): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(57) .align 4 LL(56): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f16, f22, f2 FMADD f3, f16, f23, f3 FMADD f4, f17, f20, f4 FMADD f5, f17, f21, f5 FMADD f6, f17, f22, f6 FMADD f7, f17, f23, f7 LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f16, 2 * SIZE(BO) LFD f17, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(56) .align 4 LL(57): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + ZBASE_SHIFT slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 
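/* (editor's comment, not in the upstream file) 2x2 complex triangular solve for the LT path: f16/f17 and f20/f21 hold the real and imaginary parts of the two diagonal entries of the packed A panel, f18/f19 the off-diagonal entry; the #ifndef CONJ / #else branches below select the non-conjugated or conjugated multiply-accumulate forms. */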
#ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) #ifndef LN addi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt LL(51) .align 4 LL(60): andi. I, M, 1 ble LL(69) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(65) .align 4 LL(62): FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f17, f20, f2 FMADD f3, f16, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) FMADD f4, f18, f22, f4 FMADD f5, f19, f23, f5 FMADD f6, f19, f22, f6 FMADD f7, f18, f23, f7 LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f17, f20, f2 FMADD f3, f16, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) FMADD f4, f18, f22, f4 FMADD f5, f19, f23, f5 FMADD f6, f19, f22, f6 FMADD f7, f18, f23, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(62) .align 4 LL(65): fadd f0, f0, f4 fadd f1, f1, f5 fadd f2, f2, f6 fadd f3, f3, f7 #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR,r0 ble LL(67) .align 4 LL(66): FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f17, f20, f2 FMADD f3, f16, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 2 * SIZE bdnz LL(66) .align 4 LL(67): #ifndef CONJ FSUB f0, f0, f1 FADD f1, f2, f3 #else FADD f0, f0, f1 FSUB f1, f3, f2 #endif #if defined(LN) || defined(RT) subi r0, KK, 1 slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 #else FSUB f0, f16, f0 FADD f1, f17, f1 #endif #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 LL(69): #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd 
f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/ztrsm_kernel_power6_RT.S000066400000000000000000002447371313527062700223530ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define FZERO 312(SP) #else #define STACKSIZE 256 #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #define OFFSET r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #define AORIG r19 #define TEMP r20 #define KK r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define CO1 r26 #define CO2 r27 #define CO3 r28 #define CO4 r29 #define PREA r30 #define PREC r31 #ifndef CONJ #define FMA1 FMADD #define FMA2 FMADD #define FMA3 FNMSUB #define FMA4 FMADD #elif defined(LN) || defined(LT) #define FMA1 FMADD #define FMA2 FMADD #define FMA3 FMADD #define FMA4 FNMSUB #else #define FMA1 FMADD #define FMA2 FNMSUB #define FMA3 FMADD #define FMA4 FMADD #endif #ifndef NEEDPARAM PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) std r20, 232(SP) std r19, 240(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) stw r20, 188(SP) stw r19, 192(SP) #endif stw r0, FZERO #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif slwi LDC, LDC, ZBASE_SHIFT #ifdef LN mullw r0, M, K slwi r0, r0, ZBASE_SHIFT add A, A, r0 slwi r0, M, ZBASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, ZBASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble LL(999) cmpwi cr0, N, 0 ble LL(999) cmpwi cr0, K, 0 ble LL(999) li PREA, 48 * SIZE li PREC, 4 * SIZE andi. J, N, 1 ble LL(30) #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif srawi. 
I, M, 1 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, C, LDC #endif ble LL(60) .align 4 LL(51): #if defined(LT) || defined(RN) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 dcbt CO1, PREC srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(55) .align 4 LL(52): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f16, f22, f2 FMADD f3, f16, f23, f3 FMADD f4, f17, f20, f4 FMADD f5, f17, f21, f5 FMADD f6, f17, f22, f6 FMADD f7, f17, f23, f7 LFD f20, 8 * SIZE(AO) LFD f21, 9 * SIZE(AO) LFD f22, 10 * SIZE(AO) LFD f23, 11 * SIZE(AO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f18, f26, f2 FMADD f3, f18, f27, f3 FMADD f4, f19, f24, f4 FMADD f5, f19, f25, f5 FMADD f6, f19, f26, f6 FMADD f7, f19, f27, f7 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) LFD f16, 4 * SIZE(BO) LFD f17, 5 * SIZE(BO) LFD f18, 6 * SIZE(BO) LFD f19, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f16, f22, f2 FMADD f3, f16, f23, f3 FMADD f4, f17, f20, f4 FMADD f5, f17, f21, f5 FMADD f6, f17, f22, f6 FMADD f7, f17, f23, f7 LFD f20, 16 * SIZE(AO) LFD f21, 17 * SIZE(AO) LFD f22, 18 * SIZE(AO) LFD f23, 19 * SIZE(AO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f18, f26, f2 FMADD f3, f18, f27, f3 FMADD f4, f19, f24, f4 FMADD f5, f19, f25, f5 FMADD f6, f19, f26, f6 FMADD f7, f19, f27, f7 LFD f24, 20 * SIZE(AO) LFD f25, 21 * SIZE(AO) LFD f26, 22 * SIZE(AO) LFD f27, 23 * SIZE(AO) LFD f16, 8 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 10 * SIZE(BO) LFD f19, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE dcbt PREA, AO dcbt PREA, BO bdnz LL(52) .align 4 LL(55): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(57) .align 4 LL(56): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f16, f22, f2 FMADD f3, f16, f23, f3 FMADD f4, f17, f20, f4 FMADD f5, f17, f21, f5 FMADD f6, f17, f22, f6 FMADD f7, f17, f23, f7 LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f16, 2 * SIZE(BO) LFD f17, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(56) .align 4 LL(57): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + ZBASE_SHIFT slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) #ifndef LN addi CO1, CO1, 4 * 
SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt LL(51) .align 4 LL(60): andi. I, M, 1 ble LL(69) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(65) .align 4 LL(62): FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f17, f20, f2 FMADD f3, f16, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) FMADD f4, f18, f22, f4 FMADD f5, f19, f23, f5 FMADD f6, f19, f22, f6 FMADD f7, f18, f23, f7 LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f17, f20, f2 FMADD f3, f16, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) FMADD f4, f18, f22, f4 FMADD f5, f19, f23, f5 FMADD f6, f19, f22, f6 FMADD f7, f18, f23, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(62) .align 4 LL(65): fadd f0, f0, f4 fadd f1, f1, f5 fadd f2, f2, f6 fadd f3, f3, f7 #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR,r0 ble LL(67) .align 4 LL(66): FMADD f0, f16, f20, f0 FMADD f1, f17, f21, f1 FMADD f2, f17, f20, f2 FMADD f3, f16, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 2 * SIZE bdnz LL(66) .align 4 LL(67): #ifndef CONJ FSUB f0, f0, f1 FADD f1, f2, f3 #else FADD f0, f0, f1 FSUB f1, f3, f2 #endif #if defined(LN) || defined(RT) subi r0, KK, 1 slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 #else FSUB f0, f16, f0 FADD f1, f17, f1 #endif #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 LL(69): #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 LL(30): andi. J, N, 2 ble LL(50) #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif srawi. I, M, 1 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif ble LL(40) .align 4 LL(31): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 dcbtst CO1, PREC dcbtst CO2, PREC srawi. 
r0, KK, 3 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 dcbtst CO1, PREC dcbtst CO2, PREC srawi. r0, TEMP, 3 mtspr CTR, r0 #endif ble LL(35) .align 4 LL(32): dcbt AO, PREA dcbtst BO, PREA FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 4 * SIZE(AO) LFD f28, 4 * SIZE(BO) LFD f25, 5 * SIZE(AO) LFD f29, 5 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 6 * SIZE(AO) LFD f30, 6 * SIZE(BO) LFD f27, 7 * SIZE(AO) LFD f31, 7 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 8 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f17, 9 * SIZE(AO) LFD f21, 9 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 10 * SIZE(AO) LFD f22, 10 * SIZE(BO) LFD f19, 11 * SIZE(AO) LFD f23, 11 * SIZE(BO) FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 12 * SIZE(AO) LFD f28, 12 * SIZE(BO) LFD f25, 13 * SIZE(AO) LFD f29, 13 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 14 * SIZE(AO) LFD f30, 14 * SIZE(BO) LFD f27, 15 * SIZE(AO) LFD f31, 15 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 16 * SIZE(AO) LFD f20, 16 * SIZE(BO) LFD f17, 17 * SIZE(AO) LFD f21, 17 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 18 * SIZE(AO) LFD f22, 18 * SIZE(BO) LFD f19, 19 * SIZE(AO) LFD f23, 19 * SIZE(BO) FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 20 * SIZE(AO) LFD f28, 20 * SIZE(BO) LFD f25, 21 * SIZE(AO) LFD f29, 21 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 22 * SIZE(AO) LFD f30, 22 * SIZE(BO) LFD f27, 23 * SIZE(AO) LFD f31, 23 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, 
f8 FMADD f12, f24, f31, f12 LFD f16, 24 * SIZE(AO) LFD f20, 24 * SIZE(BO) LFD f17, 25 * SIZE(AO) LFD f21, 25 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 26 * SIZE(AO) LFD f22, 26 * SIZE(BO) LFD f19, 27 * SIZE(AO) LFD f23, 27 * SIZE(BO) FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 LFD f24, 28 * SIZE(AO) LFD f28, 28 * SIZE(BO) LFD f25, 29 * SIZE(AO) LFD f29, 29 * SIZE(BO) FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 LFD f26, 30 * SIZE(AO) LFD f30, 30 * SIZE(BO) LFD f27, 31 * SIZE(AO) LFD f31, 31 * SIZE(BO) FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 FMADD f0, f24, f28, f0 FMADD f4, f24, f29, f4 FMADD f8, f24, f30, f8 FMADD f12, f24, f31, f12 LFD f16, 32 * SIZE(AO) LFD f20, 32 * SIZE(BO) LFD f17, 33 * SIZE(AO) LFD f21, 33 * SIZE(BO) FMADD f1, f25, f28, f1 FMADD f5, f25, f29, f5 FMADD f9, f25, f30, f9 FMADD f13, f25, f31, f13 FMADD f2, f26, f28, f2 FMADD f6, f26, f29, f6 FMADD f10, f26, f30, f10 FMADD f14, f26, f31, f14 LFD f18, 34 * SIZE(AO) LFD f22, 34 * SIZE(BO) LFD f19, 35 * SIZE(AO) LFD f23, 35 * SIZE(BO) addi AO, AO, 32 * SIZE addi BO, BO, 32 * SIZE FMADD f3, f27, f28, f3 FMADD f7, f27, f29, f7 FMADD f11, f27, f30, f11 FMADD f15, f27, f31, f15 bdnz LL(32) .align 4 LL(35): #if defined(LT) || defined(RN) andi. r0, KK, 7 #else andi. r0, TEMP, 7 #endif mtspr CTR, r0 ble LL(38) .align 4 LL(36): FMADD f0, f16, f20, f0 FMADD f4, f16, f21, f4 FMADD f8, f16, f22, f8 FMADD f12, f16, f23, f12 FMADD f1, f17, f20, f1 FMADD f5, f17, f21, f5 FMADD f9, f17, f22, f9 FMADD f13, f17, f23, f13 FMADD f2, f18, f20, f2 FMADD f6, f18, f21, f6 FMADD f10, f18, f22, f10 FMADD f14, f18, f23, f14 FMADD f3, f19, f20, f3 FMADD f7, f19, f21, f7 FMADD f11, f19, f22, f11 FMADD f15, f19, f23, f15 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi BO, BO, 4 * SIZE addi AO, AO, 4 * SIZE bdnz LL(36) .align 4 LL(38): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 FSUB f8, f8, f13 FADD f9, f9, f12 FSUB f10, f10, f15 FADD f11, f11, f14 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 FADD f8, f8, f13 FSUB f9, f12, f9 FADD f10, f10, f15 FSUB f11, f14, f11 #endif #if defined(LN) || defined(RT) subi r0, KK, 2 slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f8, f18, f8 FSUB f9, f19, f9 FSUB f2, f20, f2 FSUB f3, f21, f3 FSUB f10, f22, f10 FSUB f11, f23, f11 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f8, f20, f8 FSUB f9, f21, f9 FSUB f10, f22, f10 FSUB f11, f23, f11 #else FSUB 
f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 FSUB f8, f20, f8 FADD f9, f21, f9 FSUB f10, f22, f10 FADD f11, f23, f11 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FMADD f8, f19, f11, f8 FNMSUB f9, f19, f10, f9 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FNMSUB f8, f18, f10, f8 FNMSUB f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f8, f20, f8, f12 FMADD f9, f20, f9, f13 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FMSUB f8, f19, f11, f8 FNMADD f9, f19, f10, f9 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FNMADD f8, f18, f10, f8 FNMADD f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f8, f20, f8, f12 FMSUB f9, f20, f9, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f9 FMUL f13, f17, f8 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FMADD f10, f19, f9, f10 FNMSUB f11, f19, f8, f11 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FNMSUB f10, f18, f8, f10 FNMSUB f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 FMSUB f10, f20, f10, f12 FMADD f11, f20, f11, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FMSUB f10, f19, f9, f10 FNMADD f11, f19, f8, f11 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FNMADD f10, f18, f8, f10 FNMADD f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 FMADD f10, f20, f10, f12 FMSUB f11, f20, f11, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f8, f19, f1, f8 FNMSUB f9, f19, f0, f9 FMADD f10, f19, f3, f10 FNMSUB f11, f19, f2, f11 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMSUB f8, f20, f8, f4 FMADD f9, f20, f9, f5 FMSUB f10, f20, f10, f6 FMADD f11, f20, f11, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f8, f19, f1, f8 FNMADD f9, f19, f0, f9 FMSUB f10, f19, f3, f10 FNMADD f11, f19, f2, f11 FNMADD f8, f18, f0, f8 FNMADD f9, f18, f1, f9 FNMADD f10, f18, f2, f10 FNMADD f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMADD f8, f20, f8, f4 FMSUB f9, f20, f9, f5 FMADD f10, f20, f10, f6 FMSUB f11, f20, f11, f7 #endif #endif #ifdef RT LFD f16, 
6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f9 FMUL f13, f17, f8 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f9, f0 FNMSUB f1, f19, f8, f1 FMADD f2, f19, f11, f2 FNMSUB f3, f19, f10, f3 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f9, f0 FNMADD f1, f19, f8, f1 FMSUB f2, f19, f11, f2 FNMADD f3, f19, f10, f3 FNMADD f0, f18, f8, f0 FNMADD f1, f18, f9, f1 FNMADD f2, f18, f10, f2 FNMADD f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f9, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f3, 5 * SIZE(BO) STFD f10, 6 * SIZE(BO) STFD f11, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f10, 6 * SIZE(AO) STFD f11, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f8, 0 * SIZE(CO2) STFD f9, 1 * SIZE(CO2) STFD f10, 2 * SIZE(CO2) STFD f11, 3 * SIZE(CO2) #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt LL(31) .align 4 LL(40): andi. I, M, 1 ble LL(49) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(45) .align 4 LL(42): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f16, f22, f2 FMADD f3, f16, f23, f3 FMADD f4, f17, f20, f4 FMADD f5, f17, f21, f5 FMADD f6, f17, f22, f6 FMADD f7, f17, f23, f7 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f18, f26, f2 FMADD f3, f18, f27, f3 FMADD f4, f19, f24, f4 FMADD f5, f19, f25, f5 FMADD f6, f19, f26, f6 FMADD f7, f19, f27, f7 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f16, f22, f2 FMADD f3, f16, f23, f3 FMADD f4, f17, f20, f4 FMADD f5, f17, f21, f5 FMADD f6, f17, f22, f6 FMADD f7, f17, f23, f7 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMADD f0, f18, f24, f0 FMADD f1, f18, f25, f1 FMADD f2, f18, f26, f2 FMADD f3, f18, f27, f3 FMADD f4, f19, f24, f4 FMADD f5, f19, f25, f5 FMADD f6, f19, f26, f6 FMADD f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi BO, BO, 16 * SIZE addi AO, AO, 8 * SIZE bdnz LL(42) .align 4 LL(45): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(47) .align 4 LL(46): FMADD f0, f16, f20, f0 FMADD f1, f16, f21, f1 FMADD f2, f16, f22, f2 FMADD f3, f16, f23, f3 FMADD f4, f17, f20, f4 FMADD f5, f17, f21, f5 FMADD f6, f17, f22, f6 FMADD f7, f17, f23, f7 LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE bdnz LL(46) .align 4 LL(47): #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else #if defined(LN) || defined(LT) FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + ZBASE_SHIFT slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f3 FMUL f13, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f12 FMADD f3, f20, f3, f13 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f12 FMSUB f3, f20, f3, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f3 FMUL f13, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, 
f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f3 FMUL f13, f17, f2 #ifndef CONJ FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 LL(49): #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif .align 4 LL(50): srawi. J, N, 2 ble LL(999) .align 4 LL(10): #ifdef RT slwi r0, K, 2 + ZBASE_SHIFT sub B, B, r0 slwi r0, LDC, 2 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC add CO3, CO2, LDC add CO4, CO3, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. I, M, 1 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO4, LDC #endif ble LL(20) .align 4 LL(11): #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(B) LFD f18, 2 * SIZE(AO) LFD f22, 2 * SIZE(B) LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC srawi. r0, KK, 3 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 2 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f17, 1 * SIZE(AO) LFD f21, 1 * SIZE(BO) LFD f18, 2 * SIZE(AO) LFD f22, 2 * SIZE(BO) LFD f19, 3 * SIZE(AO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) dcbtst CO1, PREC dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC srawi. 
r0, TEMP, 3 mtspr CTR, r0 #endif ble LL(15) .align 4 LL(12): dcbt AO, PREA dcbtst BO, PREA FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 LFD f28, 4 * SIZE(AO) LFD f29, 5 * SIZE(AO) LFD f30, 6 * SIZE(AO) LFD f31, 7 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA1 f8, f16, f24, f8 FMA1 f10, f18, f24, f10 FMA2 f9, f16, f25, f9 FMA2 f11, f18, f25, f11 FMA1 f12, f16, f26, f12 FMA1 f14, f18, f26, f14 FMA2 f13, f16, f27, f13 FMA2 f15, f18, f27, f15 FMA4 f1, f17, f20, f1 FMA4 f3, f19, f20, f3 FMA3 f0, f17, f21, f0 FMA3 f2, f19, f21, f2 FMA4 f5, f17, f22, f5 FMA4 f7, f19, f22, f7 FMA3 f4, f17, f23, f4 FMA3 f6, f19, f23, f6 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMA4 f9, f17, f24, f9 FMA4 f11, f19, f24, f11 FMA3 f8, f17, f25, f8 FMA3 f10, f19, f25, f10 FMA4 f13, f17, f26, f13 FMA4 f15, f19, f26, f15 FMA3 f12, f17, f27, f12 FMA3 f14, f19, f27, f14 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMA1 f0, f28, f20, f0 FMA1 f2, f30, f20, f2 FMA2 f1, f28, f21, f1 FMA2 f3, f30, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMA1 f4, f28, f22, f4 FMA1 f6, f30, f22, f6 FMA2 f5, f28, f23, f5 FMA2 f7, f30, f23, f7 FMA1 f8, f28, f24, f8 FMA1 f10, f30, f24, f10 FMA2 f9, f28, f25, f9 FMA2 f11, f30, f25, f11 FMA1 f12, f28, f26, f12 FMA1 f14, f30, f26, f14 FMA2 f13, f28, f27, f13 FMA2 f15, f30, f27, f15 FMA4 f1, f29, f20, f1 FMA4 f3, f31, f20, f3 FMA3 f0, f29, f21, f0 FMA3 f2, f31, f21, f2 FMA4 f5, f29, f22, f5 FMA4 f7, f31, f22, f7 FMA3 f4, f29, f23, f4 FMA3 f6, f31, f23, f6 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMA4 f9, f29, f24, f9 FMA4 f11, f31, f24, f11 FMA3 f8, f29, f25, f8 FMA3 f10, f31, f25, f10 FMA4 f13, f29, f26, f13 FMA4 f15, f31, f26, f15 FMA3 f12, f29, f27, f12 FMA3 f14, f31, f27, f14 LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 LFD f28, 12 * SIZE(AO) LFD f29, 13 * SIZE(AO) LFD f30, 14 * SIZE(AO) LFD f31, 15 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA1 f8, f16, f24, f8 FMA1 f10, f18, f24, f10 FMA2 f9, f16, f25, f9 FMA2 f11, f18, f25, f11 FMA1 f12, f16, f26, f12 FMA1 f14, f18, f26, f14 FMA2 f13, f16, f27, f13 FMA2 f15, f18, f27, f15 FMA4 f1, f17, f20, f1 FMA4 f3, f19, f20, f3 FMA3 f0, f17, f21, f0 FMA3 f2, f19, f21, f2 FMA4 f5, f17, f22, f5 FMA4 f7, f19, f22, f7 FMA3 f4, f17, f23, f4 FMA3 f6, f19, f23, f6 LFD f20, 24 * SIZE(BO) LFD f21, 25 * SIZE(BO) LFD f22, 26 * SIZE(BO) LFD f23, 27 * SIZE(BO) FMA4 f9, f17, f24, f9 FMA4 f11, f19, f24, f11 FMA3 f8, f17, f25, f8 FMA3 f10, f19, f25, f10 FMA4 f13, f17, f26, f13 FMA4 f15, f19, f26, f15 FMA3 f12, f17, f27, f12 FMA3 f14, f19, f27, f14 LFD f24, 28 * SIZE(BO) LFD f25, 29 * SIZE(BO) LFD f26, 30 * SIZE(BO) LFD f27, 31 * SIZE(BO) FMA1 f0, f28, f20, f0 FMA1 f2, f30, f20, f2 FMA2 f1, f28, f21, f1 FMA2 f3, f30, f21, f3 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) FMA1 f4, f28, f22, f4 FMA1 f6, f30, f22, f6 FMA2 f5, f28, f23, f5 FMA2 f7, f30, f23, f7 FMA1 f8, f28, f24, f8 FMA1 f10, f30, f24, f10 FMA2 f9, f28, f25, f9 FMA2 f11, f30, f25, f11 FMA1 f12, f28, f26, f12 FMA1 f14, f30, f26, f14 FMA2 f13, f28, f27, f13 FMA2 f15, f30, f27, f15 
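/* The body of LL(12), the main accumulation loop for the 2x4 complex
   block, continues below.  The loop is unrolled 8x over K (the CTR value
   set above is KK>>3 or (K-KK)>>3 depending on the variant) and advances
   AO by 32*SIZE and BO by 64*SIZE per trip.  FMA1..FMA4 are macros
   defined elsewhere in this kernel's sources; they accumulate the four
   real/imaginary partial products of each complex multiply into f0..f15
   while the LFDs interleave loads of the next A and B operands.  The
   K%8 remainder is handled in LL(16). */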
FMA4 f1, f29, f20, f1 FMA4 f3, f31, f20, f3 FMA3 f0, f29, f21, f0 FMA3 f2, f31, f21, f2 FMA4 f5, f29, f22, f5 FMA4 f7, f31, f22, f7 FMA3 f4, f29, f23, f4 FMA3 f6, f31, f23, f6 LFD f20, 32 * SIZE(BO) LFD f21, 33 * SIZE(BO) LFD f22, 34 * SIZE(BO) LFD f23, 35 * SIZE(BO) FMA4 f9, f29, f24, f9 FMA4 f11, f31, f24, f11 FMA3 f8, f29, f25, f8 FMA3 f10, f31, f25, f10 FMA4 f13, f29, f26, f13 FMA4 f15, f31, f26, f15 FMA3 f12, f29, f27, f12 FMA3 f14, f31, f27, f14 LFD f24, 36 * SIZE(BO) LFD f25, 37 * SIZE(BO) LFD f26, 38 * SIZE(BO) LFD f27, 39 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 LFD f28, 20 * SIZE(AO) LFD f29, 21 * SIZE(AO) LFD f30, 22 * SIZE(AO) LFD f31, 23 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA1 f8, f16, f24, f8 FMA1 f10, f18, f24, f10 FMA2 f9, f16, f25, f9 FMA2 f11, f18, f25, f11 FMA1 f12, f16, f26, f12 FMA1 f14, f18, f26, f14 FMA2 f13, f16, f27, f13 FMA2 f15, f18, f27, f15 FMA4 f1, f17, f20, f1 FMA4 f3, f19, f20, f3 FMA3 f0, f17, f21, f0 FMA3 f2, f19, f21, f2 FMA4 f5, f17, f22, f5 FMA4 f7, f19, f22, f7 FMA3 f4, f17, f23, f4 FMA3 f6, f19, f23, f6 LFD f20, 40 * SIZE(BO) LFD f21, 41 * SIZE(BO) LFD f22, 42 * SIZE(BO) LFD f23, 43 * SIZE(BO) FMA4 f9, f17, f24, f9 FMA4 f11, f19, f24, f11 FMA3 f8, f17, f25, f8 FMA3 f10, f19, f25, f10 FMA4 f13, f17, f26, f13 FMA4 f15, f19, f26, f15 FMA3 f12, f17, f27, f12 FMA3 f14, f19, f27, f14 LFD f24, 44 * SIZE(BO) LFD f25, 45 * SIZE(BO) LFD f26, 46 * SIZE(BO) LFD f27, 47 * SIZE(BO) FMA1 f0, f28, f20, f0 FMA1 f2, f30, f20, f2 FMA2 f1, f28, f21, f1 FMA2 f3, f30, f21, f3 LFD f16, 24 * SIZE(AO) LFD f17, 25 * SIZE(AO) LFD f18, 26 * SIZE(AO) LFD f19, 27 * SIZE(AO) FMA1 f4, f28, f22, f4 FMA1 f6, f30, f22, f6 FMA2 f5, f28, f23, f5 FMA2 f7, f30, f23, f7 FMA1 f8, f28, f24, f8 FMA1 f10, f30, f24, f10 FMA2 f9, f28, f25, f9 FMA2 f11, f30, f25, f11 FMA1 f12, f28, f26, f12 FMA1 f14, f30, f26, f14 FMA2 f13, f28, f27, f13 FMA2 f15, f30, f27, f15 FMA4 f1, f29, f20, f1 FMA4 f3, f31, f20, f3 FMA3 f0, f29, f21, f0 FMA3 f2, f31, f21, f2 FMA4 f5, f29, f22, f5 FMA4 f7, f31, f22, f7 FMA3 f4, f29, f23, f4 FMA3 f6, f31, f23, f6 LFD f20, 48 * SIZE(BO) LFD f21, 49 * SIZE(BO) LFD f22, 50 * SIZE(BO) LFD f23, 51 * SIZE(BO) FMA4 f9, f29, f24, f9 FMA4 f11, f31, f24, f11 FMA3 f8, f29, f25, f8 FMA3 f10, f31, f25, f10 FMA4 f13, f29, f26, f13 FMA4 f15, f31, f26, f15 FMA3 f12, f29, f27, f12 FMA3 f14, f31, f27, f14 LFD f24, 52 * SIZE(BO) LFD f25, 53 * SIZE(BO) LFD f26, 54 * SIZE(BO) LFD f27, 55 * SIZE(BO) FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 LFD f28, 28 * SIZE(AO) LFD f29, 29 * SIZE(AO) LFD f30, 30 * SIZE(AO) LFD f31, 31 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA1 f8, f16, f24, f8 FMA1 f10, f18, f24, f10 FMA2 f9, f16, f25, f9 FMA2 f11, f18, f25, f11 FMA1 f12, f16, f26, f12 FMA1 f14, f18, f26, f14 FMA2 f13, f16, f27, f13 FMA2 f15, f18, f27, f15 FMA4 f1, f17, f20, f1 FMA4 f3, f19, f20, f3 FMA3 f0, f17, f21, f0 FMA3 f2, f19, f21, f2 FMA4 f5, f17, f22, f5 FMA4 f7, f19, f22, f7 FMA3 f4, f17, f23, f4 FMA3 f6, f19, f23, f6 LFD f20, 56 * SIZE(BO) LFD f21, 57 * SIZE(BO) LFD f22, 58 * SIZE(BO) LFD f23, 59 * SIZE(BO) FMA4 f9, f17, f24, f9 FMA4 f11, f19, f24, f11 FMA3 f8, f17, f25, f8 FMA3 f10, f19, f25, f10 FMA4 f13, f17, f26, f13 FMA4 f15, f19, f26, f15 FMA3 f12, f17, f27, f12 FMA3 f14, f19, f27, f14 LFD f24, 60 * SIZE(BO) LFD f25, 61 * SIZE(BO) LFD f26, 62 * SIZE(BO) LFD f27, 63 * 
SIZE(BO) FMA1 f0, f28, f20, f0 FMA1 f2, f30, f20, f2 FMA2 f1, f28, f21, f1 FMA2 f3, f30, f21, f3 LFD f16, 32 * SIZE(AO) LFD f17, 33 * SIZE(AO) LFD f18, 34 * SIZE(AO) LFD f19, 35 * SIZE(AO) FMA1 f4, f28, f22, f4 FMA1 f6, f30, f22, f6 FMA2 f5, f28, f23, f5 FMA2 f7, f30, f23, f7 FMA1 f8, f28, f24, f8 FMA1 f10, f30, f24, f10 FMA2 f9, f28, f25, f9 FMA2 f11, f30, f25, f11 FMA1 f12, f28, f26, f12 FMA1 f14, f30, f26, f14 FMA2 f13, f28, f27, f13 FMA2 f15, f30, f27, f15 FMA4 f1, f29, f20, f1 FMA4 f3, f31, f20, f3 FMA3 f0, f29, f21, f0 FMA3 f2, f31, f21, f2 FMA4 f5, f29, f22, f5 FMA4 f7, f31, f22, f7 FMA3 f4, f29, f23, f4 FMA3 f6, f31, f23, f6 LFD f20, 64 * SIZE(BO) LFD f21, 65 * SIZE(BO) LFD f22, 66 * SIZE(BO) LFD f23, 67 * SIZE(BO) FMA4 f9, f29, f24, f9 FMA4 f11, f31, f24, f11 FMA3 f8, f29, f25, f8 FMA3 f10, f31, f25, f10 FMA4 f13, f29, f26, f13 FMA4 f15, f31, f26, f15 FMA3 f12, f29, f27, f12 FMA3 f14, f31, f27, f14 LFD f24, 68 * SIZE(BO) LFD f25, 69 * SIZE(BO) LFD f26, 70 * SIZE(BO) LFD f27, 71 * SIZE(BO) addi AO, AO, 32 * SIZE addi BO, BO, 64 * SIZE bdnz LL(12) .align 4 LL(15): #if defined(LT) || defined(RN) andi. r0, KK, 7 #else andi. r0, TEMP, 7 #endif mtspr CTR, r0 ble LL(18) .align 4 LL(16): FMA1 f0, f16, f20, f0 FMA1 f2, f18, f20, f2 FMA2 f1, f16, f21, f1 FMA2 f3, f18, f21, f3 FMA1 f4, f16, f22, f4 FMA1 f6, f18, f22, f6 FMA2 f5, f16, f23, f5 FMA2 f7, f18, f23, f7 FMA1 f8, f16, f24, f8 FMA1 f10, f18, f24, f10 FMA2 f9, f16, f25, f9 FMA2 f11, f18, f25, f11 FMA1 f12, f16, f26, f12 FMA1 f14, f18, f26, f14 FMA2 f13, f16, f27, f13 FMA2 f15, f18, f27, f15 FMA4 f1, f17, f20, f1 FMA4 f3, f19, f20, f3 FMA3 f0, f17, f21, f0 FMA3 f2, f19, f21, f2 FMA4 f5, f17, f22, f5 FMA4 f7, f19, f22, f7 FMA3 f4, f17, f23, f4 FMA3 f6, f19, f23, f6 FMA4 f9, f17, f24, f9 FMA4 f11, f19, f24, f11 FMA3 f8, f17, f25, f8 FMA3 f10, f19, f25, f10 FMA4 f13, f17, f26, f13 FMA4 f15, f19, f26, f15 FMA3 f12, f17, f27, f12 FMA3 f14, f19, f27, f14 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(16) .align 4 LL(18): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 4 #endif slwi TEMP, r0, 1 + ZBASE_SHIFT slwi r0, r0, 2 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f18, f4 FSUB f5, f19, f5 LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f8, f20, f8 FSUB f9, f21, f9 FSUB f12, f22, f12 FSUB f13, f23, f13 LFD f24, 8 * SIZE(BO) LFD f25, 9 * SIZE(BO) LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) FSUB f2, f24, f2 FSUB f3, f25, f3 FSUB f6, f26, f6 FSUB f7, f27, f7 LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FSUB f10, f28, f10 FSUB f11, f29, f11 FSUB f14, f30, f14 FSUB f15, f31, f15 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) FSUB f4, f20, f4 FSUB f5, f21, f5 FSUB f6, f22, f6 FSUB f7, f23, f7 LFD f24, 8 * SIZE(AO) LFD f25, 9 * SIZE(AO) LFD f26, 10 * SIZE(AO) LFD f27, 11 * SIZE(AO) FSUB f8, 
f24, f8 FSUB f9, f25, f9 FSUB f10, f26, f10 FSUB f11, f27, f11 LFD f28, 12 * SIZE(AO) LFD f29, 13 * SIZE(AO) LFD f30, 14 * SIZE(AO) LFD f31, 15 * SIZE(AO) FSUB f12, f28, f12 FSUB f13, f29, f13 FSUB f14, f30, f14 FSUB f15, f31, f15 #endif #ifdef LN LFD f24, 6 * SIZE(AO) LFD f25, 7 * SIZE(AO) LFD f26, 4 * SIZE(AO) LFD f27, 5 * SIZE(AO) LFD f28, 0 * SIZE(AO) LFD f29, 1 * SIZE(AO) FMUL f16, f25, f3 FMUL f17, f25, f2 FMUL f18, f25, f7 FMUL f19, f25, f6 FMUL f20, f25, f11 FMUL f21, f25, f10 FMUL f22, f25, f15 FMUL f23, f25, f14 #ifndef CONJ FMSUB f2, f24, f2, f16 FMADD f3, f24, f3, f17 FMSUB f6, f24, f6, f18 FMADD f7, f24, f7, f19 FMSUB f10, f24, f10, f20 FMADD f11, f24, f11, f21 FMSUB f14, f24, f14, f22 FMADD f15, f24, f15, f23 FMADD f0, f27, f3, f0 FNMSUB f1, f27, f2, f1 FMADD f4, f27, f7, f4 FNMSUB f5, f27, f6, f5 FMADD f8, f27, f11, f8 FNMSUB f9, f27, f10, f9 FMADD f12, f27, f15, f12 FNMSUB f13, f27, f14, f13 FNMSUB f0, f26, f2, f0 FNMSUB f1, f26, f3, f1 FNMSUB f4, f26, f6, f4 FNMSUB f5, f26, f7, f5 FNMSUB f8, f26, f10, f8 FNMSUB f9, f26, f11, f9 FNMSUB f12, f26, f14, f12 FNMSUB f13, f26, f15, f13 FMUL f16, f29, f1 FMUL f17, f29, f0 FMUL f18, f29, f5 FMUL f19, f29, f4 FMUL f20, f29, f9 FMUL f21, f29, f8 FMUL f22, f29, f13 FMUL f23, f29, f12 FMSUB f0, f28, f0, f16 FMADD f1, f28, f1, f17 FMSUB f4, f28, f4, f18 FMADD f5, f28, f5, f19 FMSUB f8, f28, f8, f20 FMADD f9, f28, f9, f21 FMSUB f12, f28, f12, f22 FMADD f13, f28, f13, f23 #else FMADD f2, f24, f2, f16 FMSUB f3, f24, f3, f17 FMADD f6, f24, f6, f18 FMSUB f7, f24, f7, f19 FMADD f10, f24, f10, f20 FMSUB f11, f24, f11, f21 FMADD f14, f24, f14, f22 FMSUB f15, f24, f15, f23 FMSUB f0, f27, f3, f0 FNMADD f1, f27, f2, f1 FMSUB f4, f27, f7, f4 FNMADD f5, f27, f6, f5 FMSUB f8, f27, f11, f8 FNMADD f9, f27, f10, f9 FMSUB f12, f27, f15, f12 FNMADD f13, f27, f14, f13 FNMADD f0, f26, f2, f0 FNMADD f1, f26, f3, f1 FNMADD f4, f26, f6, f4 FNMADD f5, f26, f7, f5 FNMADD f8, f26, f10, f8 FNMADD f9, f26, f11, f9 FNMADD f12, f26, f14, f12 FNMADD f13, f26, f15, f13 FMUL f16, f29, f1 FMUL f17, f29, f0 FMUL f18, f29, f5 FMUL f19, f29, f4 FMUL f20, f29, f9 FMUL f21, f29, f8 FMUL f22, f29, f13 FMUL f23, f29, f12 FMADD f0, f28, f0, f16 FMSUB f1, f28, f1, f17 FMADD f4, f28, f4, f18 FMSUB f5, f28, f5, f19 FMADD f8, f28, f8, f20 FMSUB f9, f28, f9, f21 FMADD f12, f28, f12, f22 FMSUB f13, f28, f13, f23 #endif #endif #ifdef LT LFD f24, 0 * SIZE(AO) LFD f25, 1 * SIZE(AO) LFD f26, 2 * SIZE(AO) LFD f27, 3 * SIZE(AO) LFD f28, 6 * SIZE(AO) LFD f29, 7 * SIZE(AO) FMUL f16, f25, f1 FMUL f17, f25, f0 FMUL f18, f25, f5 FMUL f19, f25, f4 FMUL f20, f25, f9 FMUL f21, f25, f8 FMUL f22, f25, f13 FMUL f23, f25, f12 #ifndef CONJ FMSUB f0, f24, f0, f16 FMADD f1, f24, f1, f17 FMSUB f4, f24, f4, f18 FMADD f5, f24, f5, f19 FMSUB f8, f24, f8, f20 FMADD f9, f24, f9, f21 FMSUB f12, f24, f12, f22 FMADD f13, f24, f13, f23 FMADD f2, f27, f1, f2 FNMSUB f3, f27, f0, f3 FMADD f6, f27, f5, f6 FNMSUB f7, f27, f4, f7 FMADD f10, f27, f9, f10 FNMSUB f11, f27, f8, f11 FMADD f14, f27, f13, f14 FNMSUB f15, f27, f12, f15 FNMSUB f2, f26, f0, f2 FNMSUB f3, f26, f1, f3 FNMSUB f6, f26, f4, f6 FNMSUB f7, f26, f5, f7 FNMSUB f10, f26, f8, f10 FNMSUB f11, f26, f9, f11 FNMSUB f14, f26, f12, f14 FNMSUB f15, f26, f13, f15 FMUL f16, f29, f3 FMUL f17, f29, f2 FMUL f18, f29, f7 FMUL f19, f29, f6 FMUL f20, f29, f11 FMUL f21, f29, f10 FMUL f22, f29, f15 FMUL f23, f29, f14 FMSUB f2, f28, f2, f16 FMADD f3, f28, f3, f17 FMSUB f6, f28, f6, f18 FMADD f7, f28, f7, f19 FMSUB f10, f28, f10, f20 FMADD f11, f28, f11, f21 FMSUB f14, f28, 
f14, f22 FMADD f15, f28, f15, f23 #else FMADD f0, f24, f0, f16 FMSUB f1, f24, f1, f17 FMADD f4, f24, f4, f18 FMSUB f5, f24, f5, f19 FMADD f8, f24, f8, f20 FMSUB f9, f24, f9, f21 FMADD f12, f24, f12, f22 FMSUB f13, f24, f13, f23 FMSUB f2, f27, f1, f2 FNMADD f3, f27, f0, f3 FMSUB f6, f27, f5, f6 FNMADD f7, f27, f4, f7 FMSUB f10, f27, f9, f10 FNMADD f11, f27, f8, f11 FMSUB f14, f27, f13, f14 FNMADD f15, f27, f12, f15 FNMADD f2, f26, f0, f2 FNMADD f3, f26, f1, f3 FNMADD f6, f26, f4, f6 FNMADD f7, f26, f5, f7 FNMADD f10, f26, f8, f10 FNMADD f11, f26, f9, f11 FNMADD f14, f26, f12, f14 FNMADD f15, f26, f13, f15 FMUL f16, f29, f3 FMUL f17, f29, f2 FMUL f18, f29, f7 FMUL f19, f29, f6 FMUL f20, f29, f11 FMUL f21, f29, f10 FMUL f22, f29, f15 FMUL f23, f29, f14 FMADD f2, f28, f2, f16 FMSUB f3, f28, f3, f17 FMADD f6, f28, f6, f18 FMSUB f7, f28, f7, f19 FMADD f10, f28, f10, f20 FMSUB f11, f28, f11, f21 FMADD f14, f28, f14, f22 FMSUB f15, f28, f15, f23 #endif #endif #ifdef RN LFD f24, 0 * SIZE(BO) LFD f25, 1 * SIZE(BO) LFD f26, 2 * SIZE(BO) LFD f27, 3 * SIZE(BO) LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) LFD f31, 7 * SIZE(BO) FMUL f16, f25, f1 FMUL f17, f25, f0 FMUL f18, f25, f3 FMUL f19, f25, f2 #ifndef CONJ FMSUB f0, f24, f0, f16 FMADD f1, f24, f1, f17 FMSUB f2, f24, f2, f18 FMADD f3, f24, f3, f19 FMADD f4, f27, f1, f4 FNMSUB f5, f27, f0, f5 FMADD f6, f27, f3, f6 FNMSUB f7, f27, f2, f7 FNMSUB f4, f26, f0, f4 FNMSUB f5, f26, f1, f5 FNMSUB f6, f26, f2, f6 FNMSUB f7, f26, f3, f7 FMADD f8, f29, f1, f8 FNMSUB f9, f29, f0, f9 FMADD f10, f29, f3, f10 FNMSUB f11, f29, f2, f11 FNMSUB f8, f28, f0, f8 FNMSUB f9, f28, f1, f9 FNMSUB f10, f28, f2, f10 FNMSUB f11, f28, f3, f11 FMADD f12, f31, f1, f12 FNMSUB f13, f31, f0, f13 FMADD f14, f31, f3, f14 FNMSUB f15, f31, f2, f15 FNMSUB f12, f30, f0, f12 FNMSUB f13, f30, f1, f13 FNMSUB f14, f30, f2, f14 FNMSUB f15, f30, f3, f15 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMUL f18, f27, f7 FMUL f19, f27, f6 FMSUB f4, f26, f4, f16 FMADD f5, f26, f5, f17 FMSUB f6, f26, f6, f18 FMADD f7, f26, f7, f19 FMADD f8, f29, f5, f8 FNMSUB f9, f29, f4, f9 FMADD f10, f29, f7, f10 FNMSUB f11, f29, f6, f11 FNMSUB f8, f28, f4, f8 FNMSUB f9, f28, f5, f9 FNMSUB f10, f28, f6, f10 FNMSUB f11, f28, f7, f11 FMADD f12, f31, f5, f12 FNMSUB f13, f31, f4, f13 FMADD f14, f31, f7, f14 FNMSUB f15, f31, f6, f15 FNMSUB f12, f30, f4, f12 FNMSUB f13, f30, f5, f13 FNMSUB f14, f30, f6, f14 FNMSUB f15, f30, f7, f15 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 22 * SIZE(BO) LFD f29, 23 * SIZE(BO) LFD f30, 30 * SIZE(BO) LFD f31, 31 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMUL f18, f27, f11 FMUL f19, f27, f10 FMSUB f8, f26, f8, f16 FMADD f9, f26, f9, f17 FMSUB f10, f26, f10, f18 FMADD f11, f26, f11, f19 FMADD f12, f29, f9, f12 FNMSUB f13, f29, f8, f13 FMADD f14, f29, f11, f14 FNMSUB f15, f29, f10, f15 FNMSUB f12, f28, f8, f12 FNMSUB f13, f28, f9, f13 FNMSUB f14, f28, f10, f14 FNMSUB f15, f28, f11, f15 FMUL f16, f31, f13 FMUL f17, f31, f12 FMUL f18, f31, f15 FMUL f19, f31, f14 FMSUB f12, f30, f12, f16 FMADD f13, f30, f13, f17 FMSUB f14, f30, f14, f18 FMADD f15, f30, f15, f19 #else FMADD f0, f24, f0, f16 FMSUB f1, f24, f1, f17 FMADD f2, f24, f2, f18 FMSUB f3, f24, f3, f19 FMSUB f4, f27, f1, f4 FNMADD f5, f27, f0, f5 FMSUB f6, f27, f3, f6 FNMADD f7, f27, f2, f7 FNMADD f4, f26, f0, f4 FNMADD f5, f26, f1, f5 FNMADD f6, f26, f2, f6 FNMADD f7, f26, f3, f7 
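/* Forward substitution (RN, conjugated branch) continues: column 0 of
   the solution (f0..f3) has just been solved against the first diagonal
   entry and eliminated from column 1 (f4..f7).  The same column-0 values
   are eliminated from columns 2 (f8..f11) and 3 (f12..f15) next, after
   which columns 1..3 are solved in turn against diagonal entries
   reloaded from BO. */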
FMSUB f8, f29, f1, f8 FNMADD f9, f29, f0, f9 FMSUB f10, f29, f3, f10 FNMADD f11, f29, f2, f11 FNMADD f8, f28, f0, f8 FNMADD f9, f28, f1, f9 FNMADD f10, f28, f2, f10 FNMADD f11, f28, f3, f11 FMSUB f12, f31, f1, f12 FNMADD f13, f31, f0, f13 FMSUB f14, f31, f3, f14 FNMADD f15, f31, f2, f15 FNMADD f12, f30, f0, f12 FNMADD f13, f30, f1, f13 FNMADD f14, f30, f2, f14 FNMADD f15, f30, f3, f15 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMUL f18, f27, f7 FMUL f19, f27, f6 FMADD f4, f26, f4, f16 FMSUB f5, f26, f5, f17 FMADD f6, f26, f6, f18 FMSUB f7, f26, f7, f19 FMSUB f8, f29, f5, f8 FNMADD f9, f29, f4, f9 FMSUB f10, f29, f7, f10 FNMADD f11, f29, f6, f11 FNMADD f8, f28, f4, f8 FNMADD f9, f28, f5, f9 FNMADD f10, f28, f6, f10 FNMADD f11, f28, f7, f11 FMSUB f12, f31, f5, f12 FNMADD f13, f31, f4, f13 FMSUB f14, f31, f7, f14 FNMADD f15, f31, f6, f15 FNMADD f12, f30, f4, f12 FNMADD f13, f30, f5, f13 FNMADD f14, f30, f6, f14 FNMADD f15, f30, f7, f15 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 22 * SIZE(BO) LFD f29, 23 * SIZE(BO) LFD f30, 30 * SIZE(BO) LFD f31, 31 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMUL f18, f27, f11 FMUL f19, f27, f10 FMADD f8, f26, f8, f16 FMSUB f9, f26, f9, f17 FMADD f10, f26, f10, f18 FMSUB f11, f26, f11, f19 FMSUB f12, f29, f9, f12 FNMADD f13, f29, f8, f13 FMSUB f14, f29, f11, f14 FNMADD f15, f29, f10, f15 FNMADD f12, f28, f8, f12 FNMADD f13, f28, f9, f13 FNMADD f14, f28, f10, f14 FNMADD f15, f28, f11, f15 FMUL f16, f31, f13 FMUL f17, f31, f12 FMUL f18, f31, f15 FMUL f19, f31, f14 FMADD f12, f30, f12, f16 FMSUB f13, f30, f13, f17 FMADD f14, f30, f14, f18 FMSUB f15, f30, f15, f19 #endif #endif #ifdef RT LFD f24, 30 * SIZE(BO) LFD f25, 31 * SIZE(BO) LFD f26, 28 * SIZE(BO) LFD f27, 29 * SIZE(BO) LFD f28, 26 * SIZE(BO) LFD f29, 27 * SIZE(BO) LFD f30, 24 * SIZE(BO) LFD f31, 25 * SIZE(BO) FMUL f16, f25, f13 FMUL f17, f25, f12 FMUL f18, f25, f15 FMUL f19, f25, f14 #ifndef CONJ FMSUB f12, f24, f12, f16 FMADD f13, f24, f13, f17 FMSUB f14, f24, f14, f18 FMADD f15, f24, f15, f19 FMADD f8, f27, f13, f8 FNMSUB f9, f27, f12, f9 FMADD f10, f27, f15, f10 FNMSUB f11, f27, f14, f11 FNMSUB f8, f26, f12, f8 FNMSUB f9, f26, f13, f9 FNMSUB f10, f26, f14, f10 FNMSUB f11, f26, f15, f11 FMADD f4, f29, f13, f4 FNMSUB f5, f29, f12, f5 FMADD f6, f29, f15, f6 FNMSUB f7, f29, f14, f7 FNMSUB f4, f28, f12, f4 FNMSUB f5, f28, f13, f5 FNMSUB f6, f28, f14, f6 FNMSUB f7, f28, f15, f7 FMADD f0, f31, f13, f0 FNMSUB f1, f31, f12, f1 FMADD f2, f31, f15, f2 FNMSUB f3, f31, f14, f3 FNMSUB f0, f30, f12, f0 FNMSUB f1, f30, f13, f1 FNMSUB f2, f30, f14, f2 FNMSUB f3, f30, f15, f3 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 18 * SIZE(BO) LFD f29, 19 * SIZE(BO) LFD f30, 16 * SIZE(BO) LFD f31, 17 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMUL f18, f27, f11 FMUL f19, f27, f10 FMSUB f8, f26, f8, f16 FMADD f9, f26, f9, f17 FMSUB f10, f26, f10, f18 FMADD f11, f26, f11, f19 FMADD f4, f29, f9, f4 FNMSUB f5, f29, f8, f5 FMADD f6, f29, f11, f6 FNMSUB f7, f29, f10, f7 FNMSUB f4, f28, f8, f4 FNMSUB f5, f28, f9, f5 FNMSUB f6, f28, f10, f6 FNMSUB f7, f28, f11, f7 FMADD f0, f31, f9, f0 FNMSUB f1, f31, f8, f1 FMADD f2, f31, f11, f2 FNMSUB f3, f31, f10, f3 FNMSUB f0, f30, f8, f0 FNMSUB f1, f30, f9, f1 FNMSUB f2, f30, f10, f2 FNMSUB f3, f30, f11, f3 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 8 * SIZE(BO) LFD f29, 9 * SIZE(BO) LFD f30, 0 * SIZE(BO) LFD f31, 1 * SIZE(BO) 
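/* Backward substitution (RT, non-conjugated branch): columns 3 and 2 of
   the solution have been handled above.  With the column-1 diagonal
   entry now in f26/f27, its coupling to column 0 in f28/f29 and the
   column-0 diagonal in f30/f31, the code below solves column 1
   (f4..f7), eliminates it from column 0 (f0..f3) and finally solves
   column 0. */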
FMUL f16, f27, f5 FMUL f17, f27, f4 FMUL f18, f27, f7 FMUL f19, f27, f6 FMSUB f4, f26, f4, f16 FMADD f5, f26, f5, f17 FMSUB f6, f26, f6, f18 FMADD f7, f26, f7, f19 FMADD f0, f29, f5, f0 FNMSUB f1, f29, f4, f1 FMADD f2, f29, f7, f2 FNMSUB f3, f29, f6, f3 FNMSUB f0, f28, f4, f0 FNMSUB f1, f28, f5, f1 FNMSUB f2, f28, f6, f2 FNMSUB f3, f28, f7, f3 FMUL f16, f31, f1 FMUL f17, f31, f0 FMUL f18, f31, f3 FMUL f19, f31, f2 FMSUB f0, f30, f0, f16 FMADD f1, f30, f1, f17 FMSUB f2, f30, f2, f18 FMADD f3, f30, f3, f19 #else FMADD f12, f24, f12, f16 FMSUB f13, f24, f13, f17 FMADD f14, f24, f14, f18 FMSUB f15, f24, f15, f19 FMSUB f8, f27, f13, f8 FNMADD f9, f27, f12, f9 FMSUB f10, f27, f15, f10 FNMADD f11, f27, f14, f11 FNMADD f8, f26, f12, f8 FNMADD f9, f26, f13, f9 FNMADD f10, f26, f14, f10 FNMADD f11, f26, f15, f11 FMSUB f4, f29, f13, f4 FNMADD f5, f29, f12, f5 FMSUB f6, f29, f15, f6 FNMADD f7, f29, f14, f7 FNMADD f4, f28, f12, f4 FNMADD f5, f28, f13, f5 FNMADD f6, f28, f14, f6 FNMADD f7, f28, f15, f7 FMSUB f0, f31, f13, f0 FNMADD f1, f31, f12, f1 FMSUB f2, f31, f15, f2 FNMADD f3, f31, f14, f3 FNMADD f0, f30, f12, f0 FNMADD f1, f30, f13, f1 FNMADD f2, f30, f14, f2 FNMADD f3, f30, f15, f3 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 18 * SIZE(BO) LFD f29, 19 * SIZE(BO) LFD f30, 16 * SIZE(BO) LFD f31, 17 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMUL f18, f27, f11 FMUL f19, f27, f10 FMADD f8, f26, f8, f16 FMSUB f9, f26, f9, f17 FMADD f10, f26, f10, f18 FMSUB f11, f26, f11, f19 FMSUB f4, f29, f9, f4 FNMADD f5, f29, f8, f5 FMSUB f6, f29, f11, f6 FNMADD f7, f29, f10, f7 FNMADD f4, f28, f8, f4 FNMADD f5, f28, f9, f5 FNMADD f6, f28, f10, f6 FNMADD f7, f28, f11, f7 FMSUB f0, f31, f9, f0 FNMADD f1, f31, f8, f1 FMSUB f2, f31, f11, f2 FNMADD f3, f31, f10, f3 FNMADD f0, f30, f8, f0 FNMADD f1, f30, f9, f1 FNMADD f2, f30, f10, f2 FNMADD f3, f30, f11, f3 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 8 * SIZE(BO) LFD f29, 9 * SIZE(BO) LFD f30, 0 * SIZE(BO) LFD f31, 1 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMUL f18, f27, f7 FMUL f19, f27, f6 FMADD f4, f26, f4, f16 FMSUB f5, f26, f5, f17 FMADD f6, f26, f6, f18 FMSUB f7, f26, f7, f19 FMSUB f0, f29, f5, f0 FNMADD f1, f29, f4, f1 FMSUB f2, f29, f7, f2 FNMADD f3, f29, f6, f3 FNMADD f0, f28, f4, f0 FNMADD f1, f28, f5, f1 FNMADD f2, f28, f6, f2 FNMADD f3, f28, f7, f3 FMUL f16, f31, f1 FMUL f17, f31, f0 FMUL f18, f31, f3 FMUL f19, f31, f2 FMADD f0, f30, f0, f16 FMSUB f1, f30, f1, f17 FMADD f2, f30, f2, f18 FMSUB f3, f30, f3, f19 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE subi CO3, CO3, 4 * SIZE subi CO4, CO4, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f4, 2 * SIZE(BO) STFD f5, 3 * SIZE(BO) STFD f8, 4 * SIZE(BO) STFD f9, 5 * SIZE(BO) STFD f12, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) STFD f2, 8 * SIZE(BO) STFD f3, 9 * SIZE(BO) STFD f6, 10 * SIZE(BO) STFD f7, 11 * SIZE(BO) STFD f10, 12 * SIZE(BO) STFD f11, 13 * SIZE(BO) STFD f14, 14 * SIZE(BO) STFD f15, 15 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f4, 4 * SIZE(AO) STFD f5, 5 * SIZE(AO) STFD f6, 6 * SIZE(AO) STFD f7, 7 * SIZE(AO) STFD f8, 8 * SIZE(AO) STFD f9, 9 * SIZE(AO) STFD f10, 10 * SIZE(AO) STFD f11, 11 * SIZE(AO) STFD f12, 12 * SIZE(AO) STFD f13, 13 * SIZE(AO) STFD f14, 14 * SIZE(AO) STFD f15, 15 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, 
f0 STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f10, 2 * SIZE(CO3) STFD f11, 3 * SIZE(CO3) fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) STFD f14, 2 * SIZE(CO4) STFD f15, 3 * SIZE(CO4) fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 2 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt LL(11) .align 4 LL(20): andi. I, M, 1 ble LL(29) #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 2 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble LL(25) .align 4 LL(22): FMA1 f0, f16, f20, f0 FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 FMA3 f2, f17, f21, f2 LFD f28, 4 * SIZE(AO) LFD f29, 5 * SIZE(AO) LFD f30, 6 * SIZE(AO) LFD f31, 7 * SIZE(AO) FMA1 f4, f16, f22, f4 FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 FMA3 f6, f17, f23, f6 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMA1 f8, f16, f24, f8 FMA4 f11, f17, f24, f11 FMA2 f9, f16, f25, f9 FMA3 f10, f17, f25, f10 FMA1 f12, f16, f26, f12 FMA4 f15, f17, f26, f15 FMA2 f13, f16, f27, f13 FMA3 f14, f17, f27, f14 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) FMA1 f0, f18, f20, f0 FMA4 f3, f19, f20, f3 FMA2 f1, f18, f21, f1 FMA3 f2, f19, f21, f2 FMA1 f4, f18, f22, f4 FMA4 f7, f19, f22, f7 FMA2 f5, f18, f23, f5 FMA3 f6, f19, f23, f6 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) FMA1 f8, f18, f24, f8 FMA4 f11, f19, f24, f11 FMA2 f9, f18, f25, f9 FMA3 f10, f19, f25, f10 FMA1 f12, f18, f26, f12 FMA4 f15, f19, f26, f15 FMA2 f13, f18, f27, f13 FMA3 f14, f19, f27, f14 LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) FMA1 f0, f28, f20, f0 FMA4 f3, f29, f20, f3 FMA2 f1, f28, f21, f1 FMA3 f2, f29, f21, f2 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) FMA1 f4, f28, f22, f4 FMA4 f7, f29, f22, f7 FMA2 f5, f28, f23, f5 FMA3 f6, f29, f23, f6 LFD f20, 24 * SIZE(BO) LFD f21, 25 * SIZE(BO) LFD f22, 26 * SIZE(BO) LFD f23, 27 * SIZE(BO) FMA1 f8, f28, f24, f8 FMA4 f11, f29, f24, f11 FMA2 f9, f28, f25, f9 FMA3 f10, f29, f25, f10 FMA1 f12, f28, f26, f12 FMA4 f15, f29, f26, f15 FMA2 f13, f28, f27, f13 FMA3 f14, f29, f27, f14 LFD f24, 28 * SIZE(BO) LFD f25, 29 * SIZE(BO) LFD f26, 30 * SIZE(BO) LFD f27, 31 * SIZE(BO) FMA1 
f0, f30, f20, f0 FMA4 f3, f31, f20, f3 FMA2 f1, f30, f21, f1 FMA3 f2, f31, f21, f2 FMA1 f4, f30, f22, f4 FMA4 f7, f31, f22, f7 FMA2 f5, f30, f23, f5 FMA3 f6, f31, f23, f6 LFD f20, 32 * SIZE(BO) LFD f21, 33 * SIZE(BO) LFD f22, 34 * SIZE(BO) LFD f23, 35 * SIZE(BO) FMA1 f8, f30, f24, f8 FMA4 f11, f31, f24, f11 FMA2 f9, f30, f25, f9 FMA3 f10, f31, f25, f10 FMA1 f12, f30, f26, f12 FMA4 f15, f31, f26, f15 FMA2 f13, f30, f27, f13 FMA3 f14, f31, f27, f14 LFD f24, 36 * SIZE(BO) LFD f25, 37 * SIZE(BO) LFD f26, 38 * SIZE(BO) LFD f27, 39 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 32 * SIZE bdnz LL(22) .align 4 LL(25): #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble LL(27) .align 4 LL(26): FMA1 f0, f16, f20, f0 FMA4 f3, f17, f20, f3 FMA2 f1, f16, f21, f1 FMA3 f2, f17, f21, f2 FMA1 f4, f16, f22, f4 FMA4 f7, f17, f22, f7 FMA2 f5, f16, f23, f5 FMA3 f6, f17, f23, f6 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) FMA1 f8, f16, f24, f8 FMA4 f11, f17, f24, f11 FMA2 f9, f16, f25, f9 FMA3 f10, f17, f25, f10 FMA1 f12, f16, f26, f12 FMA4 f15, f17, f26, f15 FMA2 f13, f16, f27, f13 FMA3 f14, f17, f27, f14 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 8 * SIZE bdnz LL(26) .align 4 LL(27): #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 4 #endif slwi TEMP, r0, 0 + ZBASE_SHIFT slwi r0, r0, 2 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif FADD f0, f0, f2 FADD f1, f1, f3 FADD f4, f4, f6 FADD f5, f5, f7 FADD f8, f8, f10 FADD f9, f9, f11 FADD f12, f12, f14 FADD f13, f13, f15 #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f18, f4 FSUB f5, f19, f5 LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f8, f20, f8 FSUB f9, f21, f9 FSUB f12, f22, f12 FSUB f13, f23, f13 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f20, f4 FSUB f5, f21, f5 LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f28, 6 * SIZE(AO) LFD f29, 7 * SIZE(AO) FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f12, f28, f12 FSUB f13, f29, f13 #endif #ifdef LN LFD f28, 0 * SIZE(AO) LFD f29, 1 * SIZE(AO) FMUL f16, f29, f1 FMUL f17, f29, f0 FMUL f18, f29, f5 FMUL f19, f29, f4 FMUL f20, f29, f9 FMUL f21, f29, f8 FMUL f22, f29, f13 FMUL f23, f29, f12 #ifndef CONJ FMSUB f0, f28, f0, f16 FMADD f1, f28, f1, f17 FMSUB f4, f28, f4, f18 FMADD f5, f28, f5, f19 FMSUB f8, f28, f8, f20 FMADD f9, f28, f9, f21 FMSUB f12, f28, f12, f22 FMADD f13, f28, f13, f23 #else FMADD f0, f28, f0, f16 FMSUB f1, f28, f1, f17 FMADD f4, f28, f4, f18 FMSUB f5, f28, f5, f19 FMADD f8, f28, f8, f20 FMSUB f9, f28, f9, f21 FMADD f12, f28, f12, f22 FMSUB f13, f28, f13, f23 #endif #endif #ifdef LT LFD f24, 0 * SIZE(AO) LFD f25, 1 * SIZE(AO) FMUL f16, f25, f1 FMUL f17, f25, f0 FMUL f18, f25, f5 FMUL f19, f25, f4 FMUL f20, f25, f9 FMUL f21, f25, f8 FMUL f22, f25, f13 FMUL f23, f25, f12 #ifndef CONJ FMSUB f0, f24, f0, f16 FMADD f1, f24, f1, f17 FMSUB f4, f24, f4, f18 FMADD f5, f24, f5, f19 FMSUB f8, f24, f8, f20 FMADD f9, f24, f9, f21 FMSUB f12, f24, f12, f22 FMADD f13, f24, f13, f23 #else FMADD f0, f24, f0, f16 FMSUB f1, f24, f1, f17 FMADD f4, f24, f4, f18 FMSUB f5, f24, f5, f19 FMADD f8, f24, f8, f20 FMSUB f9, f24, f9, f21 
FMADD f12, f24, f12, f22 FMSUB f13, f24, f13, f23 #endif #endif #ifdef RN LFD f24, 0 * SIZE(BO) LFD f25, 1 * SIZE(BO) LFD f26, 2 * SIZE(BO) LFD f27, 3 * SIZE(BO) LFD f28, 4 * SIZE(BO) LFD f29, 5 * SIZE(BO) LFD f30, 6 * SIZE(BO) LFD f31, 7 * SIZE(BO) FMUL f16, f25, f1 FMUL f17, f25, f0 #ifndef CONJ FMSUB f0, f24, f0, f16 FMADD f1, f24, f1, f17 FMADD f4, f27, f1, f4 FNMSUB f5, f27, f0, f5 FNMSUB f4, f26, f0, f4 FNMSUB f5, f26, f1, f5 FMADD f8, f29, f1, f8 FNMSUB f9, f29, f0, f9 FNMSUB f8, f28, f0, f8 FNMSUB f9, f28, f1, f9 FMADD f12, f31, f1, f12 FNMSUB f13, f31, f0, f13 FNMSUB f12, f30, f0, f12 FNMSUB f13, f30, f1, f13 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMSUB f4, f26, f4, f16 FMADD f5, f26, f5, f17 FMADD f8, f29, f5, f8 FNMSUB f9, f29, f4, f9 FNMSUB f8, f28, f4, f8 FNMSUB f9, f28, f5, f9 FMADD f12, f31, f5, f12 FNMSUB f13, f31, f4, f13 FNMSUB f12, f30, f4, f12 FNMSUB f13, f30, f5, f13 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 22 * SIZE(BO) LFD f29, 23 * SIZE(BO) LFD f30, 30 * SIZE(BO) LFD f31, 31 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMSUB f8, f26, f8, f16 FMADD f9, f26, f9, f17 FMADD f12, f29, f9, f12 FNMSUB f13, f29, f8, f13 FNMSUB f12, f28, f8, f12 FNMSUB f13, f28, f9, f13 FMUL f16, f31, f13 FMUL f17, f31, f12 FMSUB f12, f30, f12, f16 FMADD f13, f30, f13, f17 #else FMADD f0, f24, f0, f16 FMSUB f1, f24, f1, f17 FMSUB f4, f27, f1, f4 FNMADD f5, f27, f0, f5 FNMADD f4, f26, f0, f4 FNMADD f5, f26, f1, f5 FMSUB f8, f29, f1, f8 FNMADD f9, f29, f0, f9 FNMADD f8, f28, f0, f8 FNMADD f9, f28, f1, f9 FMSUB f12, f31, f1, f12 FNMADD f13, f31, f0, f13 FNMADD f12, f30, f0, f12 FNMADD f13, f30, f1, f13 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 12 * SIZE(BO) LFD f29, 13 * SIZE(BO) LFD f30, 14 * SIZE(BO) LFD f31, 15 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMADD f4, f26, f4, f16 FMSUB f5, f26, f5, f17 FMSUB f8, f29, f5, f8 FNMADD f9, f29, f4, f9 FNMADD f8, f28, f4, f8 FNMADD f9, f28, f5, f9 FMSUB f12, f31, f5, f12 FNMADD f13, f31, f4, f13 FNMADD f12, f30, f4, f12 FNMADD f13, f30, f5, f13 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 22 * SIZE(BO) LFD f29, 23 * SIZE(BO) LFD f30, 30 * SIZE(BO) LFD f31, 31 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMADD f8, f26, f8, f16 FMSUB f9, f26, f9, f17 FMSUB f12, f29, f9, f12 FNMADD f13, f29, f8, f13 FNMADD f12, f28, f8, f12 FNMADD f13, f28, f9, f13 FMUL f16, f31, f13 FMUL f17, f31, f12 FMADD f12, f30, f12, f16 FMSUB f13, f30, f13, f17 #endif #endif #ifdef RT LFD f24, 30 * SIZE(BO) LFD f25, 31 * SIZE(BO) LFD f26, 28 * SIZE(BO) LFD f27, 29 * SIZE(BO) LFD f28, 26 * SIZE(BO) LFD f29, 27 * SIZE(BO) LFD f30, 24 * SIZE(BO) LFD f31, 25 * SIZE(BO) FMUL f16, f25, f13 FMUL f17, f25, f12 #ifndef CONJ FMSUB f12, f24, f12, f16 FMADD f13, f24, f13, f17 FMADD f8, f27, f13, f8 FNMSUB f9, f27, f12, f9 FNMSUB f8, f26, f12, f8 FNMSUB f9, f26, f13, f9 FMADD f4, f29, f13, f4 FNMSUB f5, f29, f12, f5 FNMSUB f4, f28, f12, f4 FNMSUB f5, f28, f13, f5 FMADD f0, f31, f13, f0 FNMSUB f1, f31, f12, f1 FNMSUB f0, f30, f12, f0 FNMSUB f1, f30, f13, f1 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 18 * SIZE(BO) LFD f29, 19 * SIZE(BO) LFD f30, 16 * SIZE(BO) LFD f31, 17 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMSUB f8, f26, f8, f16 FMADD f9, f26, f9, f17 FMADD f4, f29, f9, f4 FNMSUB f5, f29, f8, f5 FNMSUB f4, f28, f8, f4 FNMSUB f5, f28, f9, f5 FMADD f0, f31, f9, f0 FNMSUB f1, f31, f8, f1 FNMSUB f0, 
f30, f8, f0 FNMSUB f1, f30, f9, f1 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 8 * SIZE(BO) LFD f29, 9 * SIZE(BO) LFD f30, 0 * SIZE(BO) LFD f31, 1 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMSUB f4, f26, f4, f16 FMADD f5, f26, f5, f17 FMADD f0, f29, f5, f0 FNMSUB f1, f29, f4, f1 FNMSUB f0, f28, f4, f0 FNMSUB f1, f28, f5, f1 FMUL f16, f31, f1 FMUL f17, f31, f0 FMSUB f0, f30, f0, f16 FMADD f1, f30, f1, f17 #else FMADD f12, f24, f12, f16 FMSUB f13, f24, f13, f17 FMSUB f8, f27, f13, f8 FNMADD f9, f27, f12, f9 FNMADD f8, f26, f12, f8 FNMADD f9, f26, f13, f9 FMSUB f4, f29, f13, f4 FNMADD f5, f29, f12, f5 FNMADD f4, f28, f12, f4 FNMADD f5, f28, f13, f5 FMSUB f0, f31, f13, f0 FNMADD f1, f31, f12, f1 FNMADD f0, f30, f12, f0 FNMADD f1, f30, f13, f1 LFD f26, 20 * SIZE(BO) LFD f27, 21 * SIZE(BO) LFD f28, 18 * SIZE(BO) LFD f29, 19 * SIZE(BO) LFD f30, 16 * SIZE(BO) LFD f31, 17 * SIZE(BO) FMUL f16, f27, f9 FMUL f17, f27, f8 FMADD f8, f26, f8, f16 FMSUB f9, f26, f9, f17 FMSUB f4, f29, f9, f4 FNMADD f5, f29, f8, f5 FNMADD f4, f28, f8, f4 FNMADD f5, f28, f9, f5 FMSUB f0, f31, f9, f0 FNMADD f1, f31, f8, f1 FNMADD f0, f30, f8, f0 FNMADD f1, f30, f9, f1 LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) LFD f28, 8 * SIZE(BO) LFD f29, 9 * SIZE(BO) LFD f30, 0 * SIZE(BO) LFD f31, 1 * SIZE(BO) FMUL f16, f27, f5 FMUL f17, f27, f4 FMADD f4, f26, f4, f16 FMSUB f5, f26, f5, f17 FMSUB f0, f29, f5, f0 FNMADD f1, f29, f4, f1 FNMADD f0, f28, f4, f0 FNMADD f1, f28, f5, f1 FMUL f16, f31, f1 FMUL f17, f31, f0 FMADD f0, f30, f0, f16 FMSUB f1, f30, f1, f17 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE subi CO3, CO3, 2 * SIZE subi CO4, CO4, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f4, 2 * SIZE(BO) STFD f5, 3 * SIZE(BO) STFD f8, 4 * SIZE(BO) STFD f9, 5 * SIZE(BO) STFD f12, 6 * SIZE(BO) STFD f13, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f4, 2 * SIZE(AO) STFD f5, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f12, 6 * SIZE(AO) STFD f13, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f9, 1 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) STFD f13, 1 * SIZE(CO4) #ifndef LN addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 2 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 LL(29): #ifdef LN slwi r0, K, 2 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 4 #endif #ifdef RT subi KK, KK, 4 #endif addic. 
J, J, -1 bgt LL(10) .align 4 LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) ld r20, 232(SP) ld r19, 240(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) lwz r20, 188(SP) lwz r19, 192(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE #endif OpenBLAS-0.2.20/kernel/power/ztrsm_kernel_ppc440_LN.S000066400000000000000000001134331313527062700221130ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA_R 296(SP) #define ALPHA_I 304(SP) #define FZERO 312(SP) #else #define STACKSIZE 256 #define ALPHA_R 224(SP) #define ALPHA_I 232(SP) #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #define OFFSET r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #define AORIG r21 #define TEMP r22 #define KK r23 #define I r24 #define J r25 #define AO r26 #define BO r27 #define CO1 r28 #define CO2 r29 #define A1 f16 #define A2 f17 #define A3 f18 #define A4 f19 #define A5 f20 #define A6 f21 #define B1 f22 #define B2 f23 #define B3 f24 #define B4 f25 #define B5 f26 #define B6 f27 #define B7 f28 #define B8 f29 #define B9 f30 #define B10 f31 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) #endif stw r0, FZERO #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif slwi LDC, LDC, ZBASE_SHIFT #ifdef LN mullw r0, M, K slwi r0, r0, ZBASE_SHIFT add A, A, r0 slwi r0, M, ZBASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, ZBASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble .L999 cmpwi cr0, N, 0 ble .L999 cmpwi cr0, K, 0 ble .L999 srawi. J, N, 1 ble .L30 .align 4 .L10: #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif .L20: andi. 
I, M, 1 ble .L09 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L25 .align 4 .L22: fmadd f0, f16, f20, f0 LFD f19, 3 * SIZE(AO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFD f16, 4 * SIZE(AO) fmadd f4, f17, f20, f4 LFD f20, 8 * SIZE(BO) fmadd f5, f17, f21, f5 LFD f21, 9 * SIZE(BO) fmadd f6, f17, f22, f6 LFD f22, 10 * SIZE(BO) fmadd f7, f17, f23, f7 LFD f23, 11 * SIZE(BO) fmadd f0, f18, f24, f0 LFD f17, 5 * SIZE(AO) fmadd f1, f18, f25, f1 nop fmadd f2, f18, f26, f2 nop fmadd f3, f18, f27, f3 LFD f18, 6 * SIZE(AO) fmadd f4, f19, f24, f4 LFD f24, 12 * SIZE(BO) fmadd f5, f19, f25, f5 LFD f25, 13 * SIZE(BO) fmadd f6, f19, f26, f6 LFD f26, 14 * SIZE(BO) fmadd f7, f19, f27, f7 LFD f27, 15 * SIZE(BO) fmadd f0, f16, f20, f0 LFD f19, 7 * SIZE(AO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFDU f16, 8 * SIZE(AO) fmadd f4, f17, f20, f4 LFDU f20, 16 * SIZE(BO) fmadd f5, f17, f21, f5 LFD f21, 1 * SIZE(BO) fmadd f6, f17, f22, f6 LFD f22, 2 * SIZE(BO) fmadd f7, f17, f23, f7 LFD f23, 3 * SIZE(BO) fmadd f0, f18, f24, f0 LFD f17, 1 * SIZE(AO) fmadd f1, f18, f25, f1 nop fmadd f2, f18, f26, f2 nop fmadd f3, f18, f27, f3 LFD f18, 2 * SIZE(AO) fmadd f4, f19, f24, f4 LFD f24, 4 * SIZE(BO) fmadd f5, f19, f25, f5 LFD f25, 5 * SIZE(BO) fmadd f6, f19, f26, f6 LFD f26, 6 * SIZE(BO) fmadd f7, f19, f27, f7 LFD f27, 7 * SIZE(BO) bdnz .L22 .align 4 .L25: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble .L27 .align 4 .L26: fmadd f0, f16, f20, f0 LFD f17, 1 * SIZE(AO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFDU f16, 2 * SIZE(AO) fmadd f4, f17, f20, f4 LFDU f20, 4 * SIZE(BO) fmadd f5, f17, f21, f5 LFD f21, 1 * SIZE(BO) fmadd f6, f17, f22, f6 LFD f22, 2 * SIZE(BO) fmadd f7, f17, f23, f7 LFD f23, 3 * SIZE(BO) bdnz .L26 .align 4 .L27: #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + ZBASE_SHIFT slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f20, f2 FADD f3, f21, f3 #endif #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f3 FMUL f13, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f12 FMADD f3, f20, f3, f13 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f12 FMSUB f3, f20, f3, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f3 FMUL f13, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f9 FMUL f13, f17, f8 #ifndef CONJ FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) #ifndef LN addi CO1, CO1, 2 * 
SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 .L09: srawi. I, M, 1 ble .L29 .align 4 .L11: #if defined(LT) || defined(RN) LFD A1, 0 * SIZE(AO) LFD A2, 1 * SIZE(AO) LFD A4, 4 * SIZE(AO) LFD A5, 8 * SIZE(AO) LFD B1, 0 * SIZE(B) LFD B2, 1 * SIZE(B) LFD B3, 2 * SIZE(B) LFD B4, 3 * SIZE(B) LFD B5, 4 * SIZE(B) LFD B6, 8 * SIZE(B) LFD B7, 12 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD A1, 0 * SIZE(AO) LFD A2, 1 * SIZE(AO) LFD A4, 4 * SIZE(AO) LFD A5, 8 * SIZE(AO) LFD B1, 0 * SIZE(BO) LFD B2, 1 * SIZE(BO) LFD B3, 2 * SIZE(BO) LFD B4, 3 * SIZE(BO) LFD B5, 4 * SIZE(BO) LFD B6, 8 * SIZE(BO) LFD B7, 12 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L15 .align 4 .L12: FMADD f0, A1, B1, f0 LFD A3, 2 * SIZE(AO) FMADD f4, A1, B2, f4 LFD A6, 12 * SIZE(AO) FMADD f8, A1, B3, f8 nop FMADD f12, A1, B4, f12 nop FMADD f1, A2, B1, f1 LFD A1, 3 * SIZE(AO) FMADD f5, A2, B2, f5 nop FMADD f9, A2, B3, f9 nop FMADD f13, A2, B4, f13 nop FMADD f2, A3, B1, f2 nop FMADD f6, A3, B2, f6 LFD B8, 5 * SIZE(BO) FMADD f10, A3, B3, f10 LFD B9, 6 * SIZE(BO) FMADD f14, A3, B4, f14 LFD B10, 7 * SIZE(BO) FMADD f3, A1, B1, f3 LFD A2, 5 * SIZE(AO) FMADD f7, A1, B2, f7 LFD B1, 16 * SIZE(BO) FMADD f11, A1, B3, f11 nop FMADD f15, A1, B4, f15 nop FMADD f0, A4, B5, f0 LFD A3, 6 * SIZE(AO) FMADD f4, A4, B8, f4 LFD A1, 16 * SIZE(AO) FMADD f8, A4, B9, f8 nop FMADD f12, A4, B10, f12 nop FMADD f1, A2, B5, f1 LFD A4, 7 * SIZE(AO) FMADD f5, A2, B8, f5 nop FMADD f9, A2, B9, f9 nop FMADD f13, A2, B10, f13 nop FMADD f2, A3, B5, f2 nop FMADD f6, A3, B8, f6 LFD B2, 9 * SIZE(BO) FMADD f10, A3, B9, f10 LFD B3, 10 * SIZE(BO) FMADD f14, A3, B10, f14 LFD B4, 11 * SIZE(BO) FMADD f3, A4, B5, f3 LFD A2, 9 * SIZE(AO) FMADD f7, A4, B8, f7 LFD B5, 20 * SIZE(BO) FMADD f11, A4, B9, f11 nop FMADD f15, A4, B10, f15 nop FMADD f0, A5, B6, f0 LFD A3, 10 * SIZE(AO) FMADD f4, A5, B2, f4 LFD A4, 20 * SIZE(AO) FMADD f8, A5, B3, f8 nop FMADD f12, A5, B4, f12 nop FMADD f1, A2, B6, f1 LFD A5, 11 * SIZE(AO) FMADD f5, A2, B2, f5 nop FMADD f9, A2, B3, f9 nop FMADD f13, A2, B4, f13 nop FMADD f2, A3, B6, f2 nop FMADD f6, A3, B2, f6 LFD B8, 13 * SIZE(BO) FMADD f10, A3, B3, f10 LFD B9, 14 * SIZE(BO) FMADD f14, A3, B4, f14 LFD B10,15 * SIZE(BO) FMADD f3, A5, B6, f3 LFD A2, 13 * SIZE(AO) FMADD f7, A5, B2, f7 LFD B6, 24 * SIZE(BO) FMADD f11, A5, B3, f11 nop FMADD f15, A5, B4, f15 nop FMADD f0, A6, B7, f0 LFD A3, 14 * SIZE(AO) FMADD f4, A6, B8, f4 LFD A5, 24 * SIZE(AO) FMADD f8, A6, B9, f8 nop FMADD f12, A6, B10, f12 nop FMADD f1, A2, B7, f1 LFD A6, 15 * SIZE(AO) FMADD f5, A2, B8, f5 nop FMADD f9, A2, B9, f9 nop FMADD f13, A2, B10, f13 nop FMADD f2, A3, B7, f2 addi AO, AO, 16 * SIZE FMADD f6, A3, B8, f6 LFD B2, 17 * SIZE(BO) FMADD f10, A3, B9, f10 LFD B3, 18 * SIZE(BO) FMADD f14, 
A3, B10, f14 LFD B4, 19 * SIZE(BO) FMADD f3, A6, B7, f3 LFD A2, 1 * SIZE(AO) FMADD f7, A6, B8, f7 LFD B7, 28 * SIZE(BO) FMADD f11, A6, B9, f11 addi BO, BO, 16 * SIZE FMADD f15, A6, B10, f15 bdnz .L12 .align 4 .L15: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble .LKERNEL_MainFinish .align 4 .L16: FMADD f0, A1, B1, f0 LFD A3, 2 * SIZE(AO) FMADD f4, A1, B2, f4 FMADD f8, A1, B3, f8 FMADD f12, A1, B4, f12 LFD A4, 3 * SIZE(AO) FMADD f1, A2, B1, f1 FMADD f5, A2, B2, f5 FMADD f9, A2, B3, f9 FMADD f13, A2, B4, f13 LFDU A1, 4 * SIZE(AO) FMADD f2, A3, B1, f2 FMADD f6, A3, B2, f6 FMADD f10, A3, B3, f10 FMADD f14, A3, B4, f14 LFD A2, 1 * SIZE(AO) FMADD f3, A4, B1, f3 LFDU B1, 4 * SIZE(BO) FMADD f7, A4, B2, f7 LFD B2, 1 * SIZE(BO) FMADD f11, A4, B3, f11 LFD B3, 2 * SIZE(BO) FMADD f15, A4, B4, f15 LFD B4, 3 * SIZE(BO) bdnz .L16 .align 4 .LKERNEL_MainFinish: #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 FSUB f8, f8, f13 FADD f9, f9, f12 FSUB f10, f10, f15 FADD f11, f11, f14 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 FADD f8, f8, f13 FSUB f9, f12, f9 FADD f10, f10, f15 FSUB f11, f14, f11 #endif #if defined(LN) || defined(RT) subi r0, KK, 2 slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f8, f18, f8 FSUB f9, f19, f9 FSUB f2, f20, f2 FSUB f3, f21, f3 FSUB f10, f22, f10 FSUB f11, f23, f11 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f8, f20, f8 FSUB f9, f21, f9 FSUB f10, f22, f10 FSUB f11, f23, f11 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 FSUB f8, f20, f8 FADD f9, f21, f9 FSUB f10, f22, f10 FADD f11, f23, f11 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FMADD f8, f19, f11, f8 FNMSUB f9, f19, f10, f9 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FNMSUB f8, f18, f10, f8 FNMSUB f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f8, f20, f8, f12 FMADD f9, f20, f9, f13 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FMSUB f8, f19, f11, f8 FNMADD f9, f19, f10, f9 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FNMADD f8, f18, f10, f8 FNMADD f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f8, f20, f8, f12 FMSUB f9, f20, f9, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f9 FMUL f13, f17, f8 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 
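/* LT forward substitution for the 2x2 complex block, non-conjugated
   branch: the first unknown pair (f0,f1) has just been solved with the
   diagonal entry in f16/f17.  The same entry now solves the second
   right-hand side (f8,f9); the off-diagonal entry in f18/f19 then
   eliminates the first unknown from the second row (f2,f3 and f10,f11),
   which is finally solved with the second diagonal entry in f20/f21. */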
FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FMADD f10, f19, f9, f10 FNMSUB f11, f19, f8, f11 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FNMSUB f10, f18, f8, f10 FNMSUB f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 FMSUB f10, f20, f10, f12 FMADD f11, f20, f11, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FMSUB f10, f19, f9, f10 FNMADD f11, f19, f8, f11 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FNMADD f10, f18, f8, f10 FNMADD f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 FMADD f10, f20, f10, f12 FMSUB f11, f20, f11, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f8, f19, f1, f8 FNMSUB f9, f19, f0, f9 FMADD f10, f19, f3, f10 FNMSUB f11, f19, f2, f11 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMSUB f8, f20, f8, f4 FMADD f9, f20, f9, f5 FMSUB f10, f20, f10, f6 FMADD f11, f20, f11, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f8, f19, f1, f8 FNMADD f9, f19, f0, f9 FMSUB f10, f19, f3, f10 FNMADD f11, f19, f2, f11 FNMADD f8, f18, f0, f8 FNMADD f9, f18, f1, f9 FNMADD f10, f18, f2, f10 FNMADD f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMADD f8, f20, f8, f4 FMSUB f9, f20, f9, f5 FMADD f10, f20, f10, f6 FMSUB f11, f20, f11, f7 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f9 FMUL f13, f17, f8 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f9, f0 FNMSUB f1, f19, f8, f1 FMADD f2, f19, f11, f2 FNMSUB f3, f19, f10, f3 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f9, f0 FNMADD f1, f19, f8, f1 FMSUB f2, f19, f11, f2 FNMADD f3, f19, f10, f3 FNMADD f0, f18, f8, f0 FNMADD f1, f18, f9, f1 FNMADD f2, f18, f10, f2 FNMADD f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f9, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f3, 5 * SIZE(BO) STFD f10, 6 * SIZE(BO) STFD f11, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f10, 6 * SIZE(AO) 
STFD f11, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f8, 0 * SIZE(CO2) STFD f9, 1 * SIZE(CO2) STFD f10, 2 * SIZE(CO2) STFD f11, 3 * SIZE(CO2) #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt .L11 .align 4 .L29: #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif addic. J, J, -1 bgt .L10 .align 4 .L30: andi. J, N, 1 ble .L999 #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, C, LDC #endif andi. I, M, 1 ble .L40 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L45 .align 4 .L42: FMADD f0, f16, f20, f0 LFD f23, 3 * SIZE(BO) FMADD f1, f17, f20, f1 nop FMADD f2, f18, f20, f2 nop FMADD f3, f19, f20, f3 LFD f20, 4 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f16, 4 * SIZE(AO) FMADD f5, f17, f21, f5 LFD f17, 5 * SIZE(AO) FMADD f6, f18, f21, f6 LFD f18, 6 * SIZE(AO) FMADD f7, f19, f21, f7 LFD f19, 7 * SIZE(AO) FMADD f0, f16, f22, f0 LFD f21, 5 * SIZE(BO) FMADD f1, f17, f22, f1 nop FMADD f2, f18, f22, f2 nop FMADD f3, f19, f22, f3 LFD f22, 6 * SIZE(BO) FMADD f4, f16, f23, f4 LFD f16, 8 * SIZE(AO) FMADD f5, f17, f23, f5 LFD f17, 9 * SIZE(AO) FMADD f6, f18, f23, f6 LFD f18, 10 * SIZE(AO) FMADD f7, f19, f23, f7 LFD f19, 11 * SIZE(AO) FMADD f0, f16, f20, f0 LFD f23, 7 * SIZE(BO) FMADD f1, f17, f20, f1 nop FMADD f2, f18, f20, f2 nop FMADD f3, f19, f20, f3 LFDU f20, 8 * SIZE(BO) FMADD f4, f16, f21, f4 LFD f16, 12 * SIZE(AO) FMADD f5, f17, f21, f5 LFD f17, 13 * SIZE(AO) FMADD f6, f18, f21, f6 LFD f18, 14 * SIZE(AO) FMADD f7, f19, f21, f7 LFD f19, 15 * SIZE(AO) FMADD f0, f16, f22, f0 LFD f21, 1 * SIZE(BO) FMADD f1, f17, f22, f1 nop FMADD f2, f18, f22, f2 nop FMADD f3, f19, f22, f3 LFD f22, 2 * SIZE(BO) FMADD f4, f16, f23, f4 LFDU f16, 16 * SIZE(AO) FMADD f5, f17, f23, f5 LFD f17, 1 * SIZE(AO) FMADD f6, f18, f23, f6 LFD f18, 2 * SIZE(AO) FMADD f7, f19, f23, f7 LFD f19, 3 * SIZE(AO) bdnz .L42 .align 4 .L45: fadd f0, f0, f4 fadd f1, f1, f5 fadd f2, f2, f6 fadd f3, f3, f7 #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR,r0 ble .L47 .align 4 .L46: FMADD f0, f16, f20, f0 LFD f21, 1 * SIZE(BO) FMADD f1, f17, f20, f1 nop FMADD f2, f18, f20, f2 nop FMADD f3, f19, f20, f3 LFDU f20, 2 * SIZE(BO) FMADD f4, f16, f21, f4 LFDU f16, 4 * SIZE(AO) FMADD f5, f17, f21, f5 LFD f17, 1 * SIZE(AO) FMADD f6, f18, f21, f6 LFD f18, 2 * SIZE(AO) FMADD f7, f19, f21, f7 LFD f19, 3 * SIZE(AO) bdnz .L46 .align 4 .L47: #ifndef CONJ FSUB f0, f0, f1 FADD f1, f2, f3 #else FADD f0, f0, f1 FSUB f1, f3, f2 #endif #if defined(LN) || defined(RT) subi r0, KK, 1 slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 #else FSUB f0, f16, f0 FADD f1, f17, f1 #endif #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 .L40: srawi. I, M, 1 ble .L49 .align 4 .L31: #if defined(LT) || defined(RN) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble .L35 .align 4 .L32: fmadd f0, f16, f20, f0 LFD f19, 3 * SIZE(BO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFD f16, 4 * SIZE(BO) fmadd f4, f17, f20, f4 LFD f20, 8 * SIZE(AO) fmadd f5, f17, f21, f5 LFD f21, 9 * SIZE(AO) fmadd f6, f17, f22, f6 LFD f22, 10 * SIZE(AO) fmadd f7, f17, f23, f7 LFD f23, 11 * SIZE(AO) fmadd f0, f18, f24, f0 LFD f17, 5 * SIZE(BO) fmadd f1, f18, f25, f1 nop fmadd f2, f18, f26, f2 nop fmadd f3, f18, f27, f3 LFD f18, 6 * SIZE(BO) fmadd f4, f19, f24, f4 LFD f24, 12 * SIZE(AO) fmadd f5, f19, f25, f5 LFD f25, 13 * SIZE(AO) fmadd f6, f19, f26, f6 LFD f26, 14 * SIZE(AO) fmadd f7, f19, f27, f7 LFD f27, 15 * SIZE(AO) fmadd f0, f16, f20, f0 LFD f19, 7 * SIZE(BO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFDU f16, 8 * SIZE(BO) fmadd f4, f17, f20, f4 LFDU f20, 16 * SIZE(AO) fmadd f5, f17, f21, f5 LFD f21, 1 * SIZE(AO) fmadd f6, f17, f22, f6 LFD f22, 2 * SIZE(AO) fmadd f7, f17, f23, f7 LFD f23, 3 * SIZE(AO) fmadd f0, f18, f24, f0 LFD f17, 1 * SIZE(BO) fmadd f1, f18, f25, f1 nop fmadd f2, f18, f26, f2 nop fmadd f3, f18, f27, f3 LFD f18, 2 * SIZE(BO) fmadd f4, f19, f24, f4 LFD f24, 4 * SIZE(AO) fmadd f5, f19, f25, f5 LFD f25, 5 * SIZE(AO) fmadd f6, f19, f26, f6 LFD f26, 6 * SIZE(AO) fmadd f7, f19, f27, f7 LFD f27, 7 * SIZE(AO) bdnz .L32 .align 4 .L35: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble .L37 .align 4 .L36: fmadd f0, f16, f20, f0 LFD f17, 1 * SIZE(BO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFDU f16, 2 * SIZE(BO) fmadd f4, f17, f20, f4 LFDU f20, 4 * SIZE(AO) fmadd f5, f17, f21, f5 LFD f21, 1 * SIZE(AO) fmadd f6, f17, f22, f6 LFD f22, 2 * SIZE(AO) fmadd f7, f17, f23, f7 LFD f23, 3 * SIZE(AO) bdnz .L36 .align 4 .L37: #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + ZBASE_SHIFT slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, 
f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) #ifndef LN addi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt .L31 .align 4 .L49: #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 .L999: addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/ztrsm_kernel_ppc440_LT.S000066400000000000000000001120661313527062700221220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA_R 296(SP) #define ALPHA_I 304(SP) #define FZERO 312(SP) #else #define STACKSIZE 256 #define ALPHA_R 224(SP) #define ALPHA_I 232(SP) #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #define OFFSET r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #define AORIG r21 #define TEMP r22 #define KK r23 #define I r24 #define J r25 #define AO r26 #define BO r27 #define CO1 r28 #define CO2 r29 #define A1 f16 #define A2 f17 #define A3 f18 #define A4 f19 #define A5 f20 #define A6 f21 #define B1 f22 #define B2 f23 #define B3 f24 #define B4 f25 #define B5 f26 #define B6 f27 #define B7 f28 #define B8 f29 #define B9 f30 #define B10 f31 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) #endif stw r0, FZERO #ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && 
defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif slwi LDC, LDC, ZBASE_SHIFT #ifdef LN mullw r0, M, K slwi r0, r0, ZBASE_SHIFT add A, A, r0 slwi r0, M, ZBASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, ZBASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble .L999 cmpwi cr0, N, 0 ble .L999 cmpwi cr0, K, 0 ble .L999 srawi. J, N, 1 ble .L30 .align 4 .L10: #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif srawi. I, M, 1 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif ble .L20 .align 4 .L11: #if defined(LT) || defined(RN) LFD A1, 0 * SIZE(AO) LFD A2, 1 * SIZE(AO) LFD A4, 4 * SIZE(AO) LFD A5, 8 * SIZE(AO) LFD B1, 0 * SIZE(B) LFD B2, 1 * SIZE(B) LFD B3, 2 * SIZE(B) LFD B4, 3 * SIZE(B) LFD B5, 4 * SIZE(B) LFD B6, 8 * SIZE(B) LFD B7, 12 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD A1, 0 * SIZE(AO) LFD A2, 1 * SIZE(AO) LFD A4, 4 * SIZE(AO) LFD A5, 8 * SIZE(AO) LFD B1, 0 * SIZE(BO) LFD B2, 1 * SIZE(BO) LFD B3, 2 * SIZE(BO) LFD B4, 3 * SIZE(BO) LFD B5, 4 * SIZE(BO) LFD B6, 8 * SIZE(BO) LFD B7, 12 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble .L15 .align 4 .L12: FMADD f0, A1, B1, f0 LFD A3, 2 * SIZE(AO) FMADD f4, A1, B2, f4 LFD A6, 12 * SIZE(AO) FMADD f8, A1, B3, f8 nop FMADD f12, A1, B4, f12 nop FMADD f1, A2, B1, f1 LFD A1, 3 * SIZE(AO) FMADD f5, A2, B2, f5 nop FMADD f9, A2, B3, f9 nop FMADD f13, A2, B4, f13 nop FMADD f2, A3, B1, f2 nop FMADD f6, A3, B2, f6 LFD B8, 5 * SIZE(BO) FMADD f10, A3, B3, f10 LFD B9, 6 * SIZE(BO) FMADD f14, A3, B4, f14 LFD B10, 7 * SIZE(BO) FMADD f3, A1, B1, f3 LFD A2, 5 * SIZE(AO) FMADD f7, A1, B2, f7 LFD B1, 16 * SIZE(BO) FMADD f11, A1, B3, f11 nop FMADD f15, A1, B4, f15 nop FMADD f0, A4, B5, f0 LFD A3, 6 * SIZE(AO) FMADD f4, A4, B8, f4 LFD A1, 16 * SIZE(AO) FMADD f8, A4, B9, f8 nop FMADD f12, A4, B10, f12 nop FMADD f1, A2, B5, f1 LFD A4, 7 * SIZE(AO) FMADD f5, A2, B8, f5 nop FMADD f9, A2, B9, f9 nop FMADD f13, A2, B10, f13 nop FMADD f2, A3, B5, f2 nop FMADD f6, A3, B8, f6 LFD B2, 9 * SIZE(BO) FMADD f10, A3, B9, f10 LFD B3, 10 * SIZE(BO) FMADD f14, A3, B10, f14 LFD B4, 11 * SIZE(BO) FMADD f3, A4, B5, f3 LFD A2, 9 * SIZE(AO) FMADD f7, A4, B8, f7 LFD B5, 20 * SIZE(BO) FMADD f11, A4, B9, f11 nop FMADD f15, A4, B10, f15 nop FMADD f0, A5, B6, f0 LFD A3, 10 * SIZE(AO) FMADD f4, A5, B2, f4 LFD A4, 20 * SIZE(AO) FMADD f8, A5, B3, f8 nop FMADD f12, A5, B4, f12 nop FMADD f1, A2, B6, f1 LFD A5, 11 * SIZE(AO) FMADD f5, A2, B2, f5 nop FMADD f9, A2, B3, f9 nop FMADD f13, A2, B4, f13 nop FMADD f2, A3, B6, f2 nop FMADD f6, A3, B2, f6 LFD B8, 13 * SIZE(BO) FMADD f10, A3, B3, f10 LFD B9, 14 * SIZE(BO) FMADD f14, A3, B4, f14 LFD B10,15 * SIZE(BO) FMADD f3, A5, B6, f3 LFD A2, 13 * SIZE(AO) FMADD f7, A5, B2, f7 LFD B6, 24 * SIZE(BO) FMADD f11, A5, B3, f11 nop FMADD f15, A5, B4, f15 nop FMADD f0, A6, B7, f0 LFD A3, 14 * SIZE(AO) FMADD f4, A6, B8, f4 LFD A5, 24 * SIZE(AO) FMADD f8, A6, B9, f8 nop FMADD f12, A6, B10, f12 nop FMADD f1, A2, B7, f1 LFD A6, 15 * SIZE(AO) FMADD f5, A2, B8, f5 nop FMADD f9, A2, B9, f9 nop FMADD f13, A2, B10, f13 nop FMADD f2, A3, B7, f2 addi AO, AO, 16 * SIZE FMADD f6, A3, B8, f6 LFD B2, 17 * SIZE(BO) FMADD f10, A3, B9, f10 LFD B3, 18 * SIZE(BO) FMADD f14, A3, B10, f14 LFD B4, 19 * SIZE(BO) FMADD f3, A6, B7, f3 LFD A2, 1 * SIZE(AO) FMADD f7, A6, B8, f7 LFD B7, 28 * SIZE(BO) FMADD f11, A6, B9, f11 addi BO, BO, 16 * SIZE FMADD f15, A6, B10, f15 bdnz .L12 .align 4 .L15: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble .LKERNEL_MainFinish .align 4 .L16: FMADD f0, A1, B1, f0 LFD A3, 2 * SIZE(AO) FMADD f4, A1, B2, f4 FMADD f8, A1, B3, f8 FMADD f12, A1, B4, f12 LFD A4, 3 * SIZE(AO) FMADD f1, A2, B1, f1 FMADD f5, A2, B2, f5 FMADD f9, A2, B3, f9 FMADD f13, A2, B4, f13 LFDU A1, 4 * SIZE(AO) FMADD f2, A3, B1, f2 FMADD f6, A3, B2, f6 FMADD f10, A3, B3, f10 FMADD f14, A3, B4, f14 LFD A2, 1 * SIZE(AO) FMADD f3, A4, B1, f3 LFDU B1, 4 * SIZE(BO) FMADD f7, A4, B2, f7 LFD B2, 1 * SIZE(BO) FMADD f11, A4, B3, f11 LFD B3, 2 * SIZE(BO) FMADD f15, A4, B4, f15 LFD B4, 3 * SIZE(BO) bdnz .L16 .align 4 .LKERNEL_MainFinish: #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 FSUB f8, f8, f13 FADD f9, f9, f12 FSUB f10, f10, f15 FADD f11, f11, f14 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 FADD f8, f8, f13 FSUB f9, f12, f9 FADD f10, f10, f15 FSUB f11, f14, f11 #endif #if defined(LN) || defined(RT) subi r0, KK, 2 slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f8, f18, f8 FSUB f9, f19, f9 FSUB f2, f20, f2 FSUB f3, f21, f3 FSUB f10, f22, f10 FSUB f11, f23, f11 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f8, f20, f8 FSUB f9, f21, f9 FSUB f10, f22, f10 FSUB f11, f23, f11 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 FSUB f8, f20, f8 FADD f9, f21, f9 FSUB f10, f22, f10 FADD f11, f23, f11 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FMADD f8, f19, f11, f8 FNMSUB f9, f19, f10, f9 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FNMSUB f8, f18, f10, f8 FNMSUB f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f8, f20, f8, f12 FMADD f9, f20, f9, f13 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FMSUB f8, f19, f11, f8 FNMADD f9, f19, f10, f9 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FNMADD f8, f18, f10, f8 FNMADD f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f8, f20, f8, f12 FMSUB f9, f20, f9, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f9 FMUL f13, f17, f8 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FMADD f10, f19, f9, f10 FNMSUB f11, f19, f8, f11 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FNMSUB f10, f18, f8, f10 FNMSUB f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL 
f12, f21, f11 FMUL f13, f21, f10 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 FMSUB f10, f20, f10, f12 FMADD f11, f20, f11, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FMSUB f10, f19, f9, f10 FNMADD f11, f19, f8, f11 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FNMADD f10, f18, f8, f10 FNMADD f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 FMADD f10, f20, f10, f12 FMSUB f11, f20, f11, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f8, f19, f1, f8 FNMSUB f9, f19, f0, f9 FMADD f10, f19, f3, f10 FNMSUB f11, f19, f2, f11 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMSUB f8, f20, f8, f4 FMADD f9, f20, f9, f5 FMSUB f10, f20, f10, f6 FMADD f11, f20, f11, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f8, f19, f1, f8 FNMADD f9, f19, f0, f9 FMSUB f10, f19, f3, f10 FNMADD f11, f19, f2, f11 FNMADD f8, f18, f0, f8 FNMADD f9, f18, f1, f9 FNMADD f10, f18, f2, f10 FNMADD f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMADD f8, f20, f8, f4 FMSUB f9, f20, f9, f5 FMADD f10, f20, f10, f6 FMSUB f11, f20, f11, f7 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f9 FMUL f13, f17, f8 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f9, f0 FNMSUB f1, f19, f8, f1 FMADD f2, f19, f11, f2 FNMSUB f3, f19, f10, f3 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f9, f0 FNMADD f1, f19, f8, f1 FMSUB f2, f19, f11, f2 FNMADD f3, f19, f10, f3 FNMADD f0, f18, f8, f0 FNMADD f1, f18, f9, f1 FNMADD f2, f18, f10, f2 FNMADD f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f9, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f3, 5 * SIZE(BO) STFD f10, 6 * SIZE(BO) STFD f11, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f10, 6 * SIZE(AO) STFD f11, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f8, 0 * SIZE(CO2) STFD f9, 1 * SIZE(CO2) STFD f10, 2 * SIZE(CO2) STFD f11, 3 * SIZE(CO2) #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE 
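/* Descriptive note on the end-of-tile bookkeeping for this 2x2 complex
   tile: CO1/CO2 were rewound under LN before the stores and are advanced
   just above otherwise; under RT, AORIG is stepped past the full packed
   A panel (K elements, shifted by 1 + ZBASE_SHIFT, i.e. two complex
   values per k); under LT/RN, AO/BO are moved past the unconsumed
   K - KK portion of the packed panels; finally KK is adjusted by the
   tile width of 2 before the addic./bgt loop back to .L11. */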
#endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt .L11 .align 4 .L20: andi. I, M, 1 ble .L29 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L25 .align 4 .L22: fmadd f0, f16, f20, f0 LFD f19, 3 * SIZE(AO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFD f16, 4 * SIZE(AO) fmadd f4, f17, f20, f4 LFD f20, 8 * SIZE(BO) fmadd f5, f17, f21, f5 LFD f21, 9 * SIZE(BO) fmadd f6, f17, f22, f6 LFD f22, 10 * SIZE(BO) fmadd f7, f17, f23, f7 LFD f23, 11 * SIZE(BO) fmadd f0, f18, f24, f0 LFD f17, 5 * SIZE(AO) fmadd f1, f18, f25, f1 nop fmadd f2, f18, f26, f2 nop fmadd f3, f18, f27, f3 LFD f18, 6 * SIZE(AO) fmadd f4, f19, f24, f4 LFD f24, 12 * SIZE(BO) fmadd f5, f19, f25, f5 LFD f25, 13 * SIZE(BO) fmadd f6, f19, f26, f6 LFD f26, 14 * SIZE(BO) fmadd f7, f19, f27, f7 LFD f27, 15 * SIZE(BO) fmadd f0, f16, f20, f0 LFD f19, 7 * SIZE(AO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFDU f16, 8 * SIZE(AO) fmadd f4, f17, f20, f4 LFDU f20, 16 * SIZE(BO) fmadd f5, f17, f21, f5 LFD f21, 1 * SIZE(BO) fmadd f6, f17, f22, f6 LFD f22, 2 * SIZE(BO) fmadd f7, f17, f23, f7 LFD f23, 3 * SIZE(BO) fmadd f0, f18, f24, f0 LFD f17, 1 * SIZE(AO) fmadd f1, f18, f25, f1 nop fmadd f2, f18, f26, f2 nop fmadd f3, f18, f27, f3 LFD f18, 2 * SIZE(AO) fmadd f4, f19, f24, f4 LFD f24, 4 * SIZE(BO) fmadd f5, f19, f25, f5 LFD f25, 5 * SIZE(BO) fmadd f6, f19, f26, f6 LFD f26, 6 * SIZE(BO) fmadd f7, f19, f27, f7 LFD f27, 7 * SIZE(BO) bdnz .L22 .align 4 .L25: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble .L27 .align 4 .L26: fmadd f0, f16, f20, f0 LFD f17, 1 * SIZE(AO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFDU f16, 2 * SIZE(AO) fmadd f4, f17, f20, f4 LFDU f20, 4 * SIZE(BO) fmadd f5, f17, f21, f5 LFD f21, 1 * SIZE(BO) fmadd f6, f17, f22, f6 LFD f22, 2 * SIZE(BO) fmadd f7, f17, f23, f7 LFD f23, 3 * SIZE(BO) bdnz .L26 .align 4 .L27: #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else #if defined(LN) || defined(LT) FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + ZBASE_SHIFT slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f3 FMUL f13, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f12 FMADD f3, f20, f3, f13 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f12 FMSUB f3, f20, f3, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f3 FMUL f13, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f3 FMUL f13, f17, f2 #ifndef CONJ FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) #ifndef LN addi 
CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 .L29: #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif addic. J, J, -1 bgt .L10 .align 4 .L30: andi. J, N, 1 ble .L999 #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif srawi. I, M, 1 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, C, LDC #endif ble .L40 .align 4 .L31: #if defined(LT) || defined(RN) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L35 .align 4 .L32: fmadd f0, f16, f20, f0 LFD f19, 3 * SIZE(BO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFD f16, 4 * SIZE(BO) fmadd f4, f17, f20, f4 LFD f20, 8 * SIZE(AO) fmadd f5, f17, f21, f5 LFD f21, 9 * SIZE(AO) fmadd f6, f17, f22, f6 LFD f22, 10 * SIZE(AO) fmadd f7, f17, f23, f7 LFD f23, 11 * SIZE(AO) fmadd f0, f18, f24, f0 LFD f17, 5 * SIZE(BO) fmadd f1, f18, f25, f1 nop fmadd f2, f18, f26, f2 nop fmadd f3, f18, f27, f3 LFD f18, 6 * SIZE(BO) fmadd f4, f19, f24, f4 LFD f24, 12 * SIZE(AO) fmadd f5, f19, f25, f5 LFD f25, 13 * SIZE(AO) fmadd f6, f19, f26, f6 LFD f26, 14 * SIZE(AO) fmadd f7, f19, f27, f7 LFD f27, 15 * SIZE(AO) fmadd f0, f16, f20, f0 LFD f19, 7 * SIZE(BO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFDU f16, 8 * SIZE(BO) fmadd f4, f17, f20, f4 LFDU f20, 16 * SIZE(AO) fmadd f5, f17, f21, f5 LFD f21, 1 * SIZE(AO) fmadd f6, f17, f22, f6 LFD f22, 2 * SIZE(AO) fmadd f7, f17, f23, f7 LFD f23, 3 * SIZE(AO) fmadd f0, f18, f24, f0 LFD f17, 1 * SIZE(BO) fmadd f1, f18, f25, f1 nop fmadd f2, f18, f26, f2 nop fmadd f3, f18, f27, f3 LFD f18, 2 * SIZE(BO) fmadd f4, f19, f24, f4 LFD f24, 4 * SIZE(AO) fmadd f5, f19, f25, f5 LFD f25, 5 * SIZE(AO) fmadd f6, f19, f26, f6 LFD f26, 6 * SIZE(AO) fmadd f7, f19, f27, f7 LFD f27, 7 * SIZE(AO) bdnz .L32 .align 4 .L35: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble .L37 .align 4 .L36: fmadd f0, f16, f20, f0 LFD f17, 1 * SIZE(BO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFDU f16, 2 * SIZE(BO) fmadd f4, f17, f20, f4 LFDU f20, 4 * SIZE(AO) fmadd f5, f17, f21, f5 LFD f21, 1 * SIZE(AO) fmadd f6, f17, f22, f6 LFD f22, 2 * SIZE(AO) fmadd f7, f17, f23, f7 LFD f23, 3 * SIZE(AO) bdnz .L36 .align 4 .L37: #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + ZBASE_SHIFT slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) #ifndef LN addi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + 
ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt .L31 .align 4 .L40: andi. I, M, 1 ble .L49 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L45 .align 4 .L42: fmadd f0, f16, f20, f0 LFD f23, 3 * SIZE(BO) fmadd f3, f16, f21, f3 LFD f16, 4 * SIZE(AO) fmadd f2, f17, f20, f2 LFD f20, 4 * SIZE(BO) fmadd f1, f17, f21, f1 LFD f17, 5 * SIZE(AO) fmadd f4, f18, f22, f4 LFD f21, 5 * SIZE(BO) fmadd f7, f18, f23, f7 LFD f18, 6 * SIZE(AO) fmadd f6, f19, f22, f6 LFD f22, 6 * SIZE(BO) fmadd f5, f19, f23, f5 LFD f19, 7 * SIZE(AO) fmadd f0, f16, f20, f0 LFD f23, 7 * SIZE(BO) fmadd f3, f16, f21, f3 LFDU f16, 8 * SIZE(AO) fmadd f2, f17, f20, f2 LFDU f20, 8 * SIZE(BO) fmadd f1, f17, f21, f1 LFD f17, 1 * SIZE(AO) fmadd f4, f18, f22, f4 LFD f21, 1 * SIZE(BO) fmadd f7, f18, f23, f7 LFD f18, 2 * SIZE(AO) fmadd f6, f19, f22, f6 LFD f22, 2 * SIZE(BO) fmadd f5, f19, f23, f5 LFD f19, 3 * SIZE(AO) bdnz .L42 .align 4 .L45: fadd f0, f0, f4 fadd f1, f1, f5 fadd f2, f2, f6 fadd f3, f3, f7 #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR,r0 ble .L47 .align 4 .L46: fmadd f0, f16, f20, f0 LFD f21, 1 * SIZE(BO) fmadd f3, f16, f21, f3 LFDU f16, 2 * SIZE(AO) fmadd f2, f17, f20, f2 LFDU f20, 2 * SIZE(BO) fmadd f1, f17, f21, f1 LFD f17, 1 * SIZE(AO) bdnz .L46 .align 4 .L47: #ifndef CONJ FSUB f0, f0, f1 FADD f1, f2, f3 #else FADD f0, f0, f1 FSUB f1, f3, f2 #endif #if defined(LN) || defined(RT) subi r0, KK, 1 slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 #else FSUB f0, f16, f0 FADD f1, f17, f1 #endif #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 .L49: #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 .L999: addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/power/ztrsm_kernel_ppc440_RT.S000066400000000000000000001120671313527062700221310ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef __64BIT__ #define LOAD lwz #else #define LOAD ld #endif #ifdef __64BIT__ #define STACKSIZE 320 #define ALPHA_R 296(SP) #define ALPHA_I 304(SP) #define FZERO 312(SP) #else #define STACKSIZE 256 #define ALPHA_R 224(SP) #define ALPHA_I 232(SP) #define FZERO 240(SP) #endif #define M r3 #define N r4 #define K r5 #ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 #define C r8 #define LDC r9 #define OFFSET r10 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) #define A r10 #define B r6 #define C r7 #define LDC r8 #define OFFSET r9 #else #define A r8 #define B r9 #define C r10 #define LDC r6 #define OFFSET r7 #endif #endif #define AORIG r21 #define TEMP r22 #define KK r23 #define I r24 #define J r25 #define AO r26 #define BO r27 #define CO1 r28 #define CO2 r29 #define A1 f16 #define A2 f17 #define A3 f18 #define A4 f19 #define A5 f20 #define A6 f21 #define B1 f22 #define B2 f23 #define B3 f24 #define B4 f25 #define B5 f26 #define B6 f27 #define B7 f28 #define B8 f29 #define B9 f30 #define B10 f31 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) stfd f17, 24(SP) stfd f18, 32(SP) stfd f19, 40(SP) stfd f20, 48(SP) stfd f21, 56(SP) stfd f22, 64(SP) stfd f23, 72(SP) stfd f24, 80(SP) stfd f25, 88(SP) stfd f26, 96(SP) stfd f27, 104(SP) stfd f28, 112(SP) stfd f29, 120(SP) stfd f30, 128(SP) stfd f31, 136(SP) #ifdef __64BIT__ std r31, 144(SP) std r30, 152(SP) std r29, 160(SP) std r28, 168(SP) std r27, 176(SP) std r26, 184(SP) std r25, 192(SP) std r24, 200(SP) std r23, 208(SP) std r22, 216(SP) std r21, 224(SP) #else stw r31, 144(SP) stw r30, 148(SP) stw r29, 152(SP) stw r28, 156(SP) stw r27, 160(SP) stw r26, 164(SP) stw r25, 168(SP) stw r24, 172(SP) stw r23, 176(SP) stw r22, 180(SP) stw r21, 184(SP) #endif stw r0, FZERO #ifdef linux #ifdef __64BIT__ ld 
LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #else #ifdef DOUBLE lwz B, FRAMESLOT(0) + STACKSIZE(SP) lwz C, FRAMESLOT(1) + STACKSIZE(SP) lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) #else lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif #endif #endif #if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #else #ifdef DOUBLE lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) #else lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif #endif #endif slwi LDC, LDC, ZBASE_SHIFT #ifdef LN mullw r0, M, K slwi r0, r0, ZBASE_SHIFT add A, A, r0 slwi r0, M, ZBASE_SHIFT add C, C, r0 #endif #ifdef RN neg KK, OFFSET #endif #ifdef RT mullw r0, N, K slwi r0, r0, ZBASE_SHIFT add B, B, r0 mullw r0, N, LDC add C, C, r0 sub KK, N, OFFSET #endif cmpwi cr0, M, 0 ble .L999 cmpwi cr0, N, 0 ble .L999 cmpwi cr0, K, 0 ble .L999 andi. J, N, 1 ble .L30 #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT sub B, B, r0 sub C, C, LDC #endif mr CO1, C #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif srawi. I, M, 1 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, C, LDC #endif ble .L40 .align 4 .L31: #if defined(LT) || defined(RN) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 1 + ZBASE_SHIFT slwi TEMP, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. 
r0, TEMP, 2 mtspr CTR, r0 #endif ble .L35 .align 4 .L32: fmadd f0, f16, f20, f0 LFD f19, 3 * SIZE(BO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFD f16, 4 * SIZE(BO) fmadd f4, f17, f20, f4 LFD f20, 8 * SIZE(AO) fmadd f5, f17, f21, f5 LFD f21, 9 * SIZE(AO) fmadd f6, f17, f22, f6 LFD f22, 10 * SIZE(AO) fmadd f7, f17, f23, f7 LFD f23, 11 * SIZE(AO) fmadd f0, f18, f24, f0 LFD f17, 5 * SIZE(BO) fmadd f1, f18, f25, f1 nop fmadd f2, f18, f26, f2 nop fmadd f3, f18, f27, f3 LFD f18, 6 * SIZE(BO) fmadd f4, f19, f24, f4 LFD f24, 12 * SIZE(AO) fmadd f5, f19, f25, f5 LFD f25, 13 * SIZE(AO) fmadd f6, f19, f26, f6 LFD f26, 14 * SIZE(AO) fmadd f7, f19, f27, f7 LFD f27, 15 * SIZE(AO) fmadd f0, f16, f20, f0 LFD f19, 7 * SIZE(BO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFDU f16, 8 * SIZE(BO) fmadd f4, f17, f20, f4 LFDU f20, 16 * SIZE(AO) fmadd f5, f17, f21, f5 LFD f21, 1 * SIZE(AO) fmadd f6, f17, f22, f6 LFD f22, 2 * SIZE(AO) fmadd f7, f17, f23, f7 LFD f23, 3 * SIZE(AO) fmadd f0, f18, f24, f0 LFD f17, 1 * SIZE(BO) fmadd f1, f18, f25, f1 nop fmadd f2, f18, f26, f2 nop fmadd f3, f18, f27, f3 LFD f18, 2 * SIZE(BO) fmadd f4, f19, f24, f4 LFD f24, 4 * SIZE(AO) fmadd f5, f19, f25, f5 LFD f25, 5 * SIZE(AO) fmadd f6, f19, f26, f6 LFD f26, 6 * SIZE(AO) fmadd f7, f19, f27, f7 LFD f27, 7 * SIZE(AO) bdnz .L32 .align 4 .L35: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. r0, TEMP, 3 #endif mtspr CTR, r0 ble .L37 .align 4 .L36: fmadd f0, f16, f20, f0 LFD f17, 1 * SIZE(BO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFDU f16, 2 * SIZE(BO) fmadd f4, f17, f20, f4 LFDU f20, 4 * SIZE(AO) fmadd f5, f17, f21, f5 LFD f21, 1 * SIZE(AO) fmadd f6, f17, f22, f6 LFD f22, 2 * SIZE(AO) fmadd f7, f17, f23, f7 LFD f23, 3 * SIZE(AO) bdnz .L36 .align 4 .L37: #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 2 #else subi r0, KK, 1 #endif slwi TEMP, r0, 1 + ZBASE_SHIFT slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, 
f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) #ifndef LN addi CO1, CO1, 4 * SIZE #endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 1 + ZBASE_SHIFT slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt .L31 .align 4 .L40: andi. I, M, 1 ble .L49 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L45 .align 4 .L42: fmadd f0, f16, f20, f0 LFD f23, 3 * SIZE(BO) fmadd f3, f16, f21, f3 LFD f16, 4 * SIZE(AO) fmadd f2, f17, f20, f2 LFD f20, 4 * SIZE(BO) fmadd f1, f17, f21, f1 LFD f17, 5 * SIZE(AO) fmadd f4, f18, f22, f4 LFD f21, 5 * SIZE(BO) fmadd f7, f18, f23, f7 LFD f18, 6 * SIZE(AO) fmadd f6, f19, f22, f6 LFD f22, 6 * SIZE(BO) fmadd f5, f19, f23, f5 LFD f19, 7 * SIZE(AO) fmadd f0, f16, f20, f0 LFD f23, 7 * SIZE(BO) fmadd f3, f16, f21, f3 LFDU f16, 8 * SIZE(AO) fmadd f2, f17, f20, f2 LFDU f20, 8 * SIZE(BO) fmadd f1, f17, f21, f1 LFD f17, 1 * SIZE(AO) fmadd f4, f18, f22, f4 LFD f21, 1 * SIZE(BO) fmadd f7, f18, f23, f7 LFD f18, 2 * SIZE(AO) fmadd f6, f19, f22, f6 LFD f22, 2 * SIZE(BO) fmadd f5, f19, f23, f5 LFD f19, 3 * SIZE(AO) bdnz .L42 .align 4 .L45: fadd f0, f0, f4 fadd f1, f1, f5 fadd f2, f2, f6 fadd f3, f3, f7 #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR,r0 ble .L47 .align 4 .L46: fmadd f0, f16, f20, f0 LFD f21, 1 * SIZE(BO) fmadd f3, f16, f21, f3 LFDU f16, 2 * SIZE(AO) fmadd f2, f17, f20, f2 LFDU f20, 2 * SIZE(BO) fmadd f1, f17, f21, f1 LFD f17, 1 * SIZE(AO) bdnz .L46 .align 4 .L47: #ifndef CONJ FSUB f0, f0, f1 FADD f1, f2, f3 #else FADD f0, f0, f1 FSUB f1, f3, f2 #endif #if defined(LN) || defined(RT) subi r0, KK, 1 slwi r0, r0, 0 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 #else FSUB f0, f16, f0 FADD f1, f17, f1 #endif #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 #endif #endif #ifdef RT LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f4, f21, f1 FMUL f5, f21, f0 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) #ifndef LN addi CO1, CO1, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 0 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 .L49: #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 1 #endif #ifdef RT subi KK, KK, 1 #endif .align 4 .L30: srawi. J, N, 1 ble .L999 .align 4 .L10: #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT sub B, B, r0 slwi r0, LDC, 1 sub C, C, r0 #endif mr CO1, C add CO2, C, LDC #ifdef LN add KK, M, OFFSET #endif #ifdef LT mr KK, OFFSET #endif srawi. I, M, 1 #if defined(LN) || defined(RT) mr AORIG, A #else mr AO, A #endif #ifndef RT add C, CO2, LDC #endif ble .L20 .align 4 .L11: #if defined(LT) || defined(RN) LFD A1, 0 * SIZE(AO) LFD A2, 1 * SIZE(AO) LFD A4, 4 * SIZE(AO) LFD A5, 8 * SIZE(AO) LFD B1, 0 * SIZE(B) LFD B2, 1 * SIZE(B) LFD B3, 2 * SIZE(B) LFD B4, 3 * SIZE(B) LFD B5, 4 * SIZE(B) LFD B6, 8 * SIZE(B) LFD B7, 12 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. 
r0, KK, 2 mtspr CTR, r0 mr BO, B #else #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, TEMP sub TEMP, K, KK LFD A1, 0 * SIZE(AO) LFD A2, 1 * SIZE(AO) LFD A4, 4 * SIZE(AO) LFD A5, 8 * SIZE(AO) LFD B1, 0 * SIZE(BO) LFD B2, 1 * SIZE(BO) LFD B3, 2 * SIZE(BO) LFD B4, 3 * SIZE(BO) LFD B5, 4 * SIZE(BO) LFD B6, 8 * SIZE(BO) LFD B7, 12 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 fmr f8, f0 fmr f9, f0 fmr f10, f0 fmr f11, f0 fmr f12, f0 fmr f13, f0 fmr f14, f0 fmr f15, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L15 .align 4 .L12: FMADD f0, A1, B1, f0 LFD A3, 2 * SIZE(AO) FMADD f4, A1, B2, f4 LFD A6, 12 * SIZE(AO) FMADD f8, A1, B3, f8 nop FMADD f12, A1, B4, f12 nop FMADD f1, A2, B1, f1 LFD A1, 3 * SIZE(AO) FMADD f5, A2, B2, f5 nop FMADD f9, A2, B3, f9 nop FMADD f13, A2, B4, f13 nop FMADD f2, A3, B1, f2 nop FMADD f6, A3, B2, f6 LFD B8, 5 * SIZE(BO) FMADD f10, A3, B3, f10 LFD B9, 6 * SIZE(BO) FMADD f14, A3, B4, f14 LFD B10, 7 * SIZE(BO) FMADD f3, A1, B1, f3 LFD A2, 5 * SIZE(AO) FMADD f7, A1, B2, f7 LFD B1, 16 * SIZE(BO) FMADD f11, A1, B3, f11 nop FMADD f15, A1, B4, f15 nop FMADD f0, A4, B5, f0 LFD A3, 6 * SIZE(AO) FMADD f4, A4, B8, f4 LFD A1, 16 * SIZE(AO) FMADD f8, A4, B9, f8 nop FMADD f12, A4, B10, f12 nop FMADD f1, A2, B5, f1 LFD A4, 7 * SIZE(AO) FMADD f5, A2, B8, f5 nop FMADD f9, A2, B9, f9 nop FMADD f13, A2, B10, f13 nop FMADD f2, A3, B5, f2 nop FMADD f6, A3, B8, f6 LFD B2, 9 * SIZE(BO) FMADD f10, A3, B9, f10 LFD B3, 10 * SIZE(BO) FMADD f14, A3, B10, f14 LFD B4, 11 * SIZE(BO) FMADD f3, A4, B5, f3 LFD A2, 9 * SIZE(AO) FMADD f7, A4, B8, f7 LFD B5, 20 * SIZE(BO) FMADD f11, A4, B9, f11 nop FMADD f15, A4, B10, f15 nop FMADD f0, A5, B6, f0 LFD A3, 10 * SIZE(AO) FMADD f4, A5, B2, f4 LFD A4, 20 * SIZE(AO) FMADD f8, A5, B3, f8 nop FMADD f12, A5, B4, f12 nop FMADD f1, A2, B6, f1 LFD A5, 11 * SIZE(AO) FMADD f5, A2, B2, f5 nop FMADD f9, A2, B3, f9 nop FMADD f13, A2, B4, f13 nop FMADD f2, A3, B6, f2 nop FMADD f6, A3, B2, f6 LFD B8, 13 * SIZE(BO) FMADD f10, A3, B3, f10 LFD B9, 14 * SIZE(BO) FMADD f14, A3, B4, f14 LFD B10,15 * SIZE(BO) FMADD f3, A5, B6, f3 LFD A2, 13 * SIZE(AO) FMADD f7, A5, B2, f7 LFD B6, 24 * SIZE(BO) FMADD f11, A5, B3, f11 nop FMADD f15, A5, B4, f15 nop FMADD f0, A6, B7, f0 LFD A3, 14 * SIZE(AO) FMADD f4, A6, B8, f4 LFD A5, 24 * SIZE(AO) FMADD f8, A6, B9, f8 nop FMADD f12, A6, B10, f12 nop FMADD f1, A2, B7, f1 LFD A6, 15 * SIZE(AO) FMADD f5, A2, B8, f5 nop FMADD f9, A2, B9, f9 nop FMADD f13, A2, B10, f13 nop FMADD f2, A3, B7, f2 addi AO, AO, 16 * SIZE FMADD f6, A3, B8, f6 LFD B2, 17 * SIZE(BO) FMADD f10, A3, B9, f10 LFD B3, 18 * SIZE(BO) FMADD f14, A3, B10, f14 LFD B4, 19 * SIZE(BO) FMADD f3, A6, B7, f3 LFD A2, 1 * SIZE(AO) FMADD f7, A6, B8, f7 LFD B7, 28 * SIZE(BO) FMADD f11, A6, B9, f11 addi BO, BO, 16 * SIZE FMADD f15, A6, B10, f15 bdnz .L12 .align 4 .L15: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble .LKERNEL_MainFinish .align 4 .L16: FMADD f0, A1, B1, f0 LFD A3, 2 * SIZE(AO) FMADD f4, A1, B2, f4 FMADD f8, A1, B3, f8 FMADD f12, A1, B4, f12 LFD A4, 3 * SIZE(AO) FMADD f1, A2, B1, f1 FMADD f5, A2, B2, f5 FMADD f9, A2, B3, f9 FMADD f13, A2, B4, f13 LFDU A1, 4 * SIZE(AO) FMADD f2, A3, B1, f2 FMADD f6, A3, B2, f6 FMADD f10, A3, B3, f10 FMADD f14, A3, B4, f14 LFD A2, 1 * SIZE(AO) FMADD f3, A4, B1, f3 LFDU B1, 4 * SIZE(BO) FMADD f7, A4, B2, f7 LFD B2, 1 * SIZE(BO) FMADD f11, A4, B3, f11 LFD B3, 2 * SIZE(BO) FMADD f15, A4, B4, f15 LFD B4, 3 * SIZE(BO) bdnz .L16 .align 4 .LKERNEL_MainFinish: #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 FSUB f8, f8, f13 FADD f9, f9, f12 FSUB f10, f10, f15 FADD f11, f11, f14 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 FADD f8, f8, f13 FSUB f9, f12, f9 FADD f10, f10, f15 FSUB f11, f14, f11 #endif #if defined(LN) || defined(RT) subi r0, KK, 2 slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f8, f18, f8 FSUB f9, f19, f9 FSUB f2, f20, f2 FSUB f3, f21, f3 FSUB f10, f22, f10 FSUB f11, f23, f11 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) #ifndef CONJ FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 FSUB f8, f20, f8 FSUB f9, f21, f9 FSUB f10, f22, f10 FSUB f11, f23, f11 #else FSUB f0, f16, f0 FADD f1, f17, f1 FSUB f2, f18, f2 FADD f3, f19, f3 FSUB f8, f20, f8 FADD f9, f21, f9 FSUB f10, f22, f10 FADD f11, f23, f11 #endif #endif #ifdef LN LFD f16, 6 * SIZE(AO) LFD f17, 7 * SIZE(AO) LFD f18, 4 * SIZE(AO) LFD f19, 5 * SIZE(AO) LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f6, f17, f3 FMUL f7, f17, f2 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FMADD f8, f19, f11, f8 FNMSUB f9, f19, f10, f9 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FNMSUB f8, f18, f10, f8 FNMSUB f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f8, f20, f8, f12 FMADD f9, f20, f9, f13 #else FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FMSUB f8, f19, f11, f8 FNMADD f9, f19, f10, f9 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FNMADD f8, f18, f10, f8 FNMADD f9, f18, f11, f9 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f9 FMUL f13, f21, f8 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f8, f20, f8, f12 FMSUB f9, f20, f9, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 6 * SIZE(AO) LFD f21, 7 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f9 FMUL f13, f17, f8 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FMADD f10, f19, f9, f10 FNMSUB f11, f19, f8, f11 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FNMSUB f10, f18, f8, f10 FNMSUB f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL 
f12, f21, f11 FMUL f13, f21, f10 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 FMSUB f10, f20, f10, f12 FMADD f11, f20, f11, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FMSUB f10, f19, f9, f10 FNMADD f11, f19, f8, f11 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FNMADD f10, f18, f8, f10 FNMADD f11, f18, f9, f11 FMUL f4, f21, f3 FMUL f5, f21, f2 FMUL f12, f21, f11 FMUL f13, f21, f10 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 FMADD f10, f20, f10, f12 FMSUB f11, f20, f11, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f6, f17, f3 FMUL f7, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f6 FMADD f3, f16, f3, f7 FMADD f8, f19, f1, f8 FNMSUB f9, f19, f0, f9 FMADD f10, f19, f3, f10 FNMSUB f11, f19, f2, f11 FNMSUB f8, f18, f0, f8 FNMSUB f9, f18, f1, f9 FNMSUB f10, f18, f2, f10 FNMSUB f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMSUB f8, f20, f8, f4 FMADD f9, f20, f9, f5 FMSUB f10, f20, f10, f6 FMADD f11, f20, f11, f7 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f6 FMSUB f3, f16, f3, f7 FMSUB f8, f19, f1, f8 FNMADD f9, f19, f0, f9 FMSUB f10, f19, f3, f10 FNMADD f11, f19, f2, f11 FNMADD f8, f18, f0, f8 FNMADD f9, f18, f1, f9 FNMADD f10, f18, f2, f10 FNMADD f11, f18, f3, f11 FMUL f4, f21, f9 FMUL f5, f21, f8 FMUL f6, f21, f11 FMUL f7, f21, f10 FMADD f8, f20, f8, f4 FMSUB f9, f20, f9, f5 FMADD f10, f20, f10, f6 FMSUB f11, f20, f11, f7 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f9 FMUL f13, f17, f8 FMUL f14, f17, f11 FMUL f15, f17, f10 #ifndef CONJ FMSUB f8, f16, f8, f12 FMADD f9, f16, f9, f13 FMSUB f10, f16, f10, f14 FMADD f11, f16, f11, f15 FMADD f0, f19, f9, f0 FNMSUB f1, f19, f8, f1 FMADD f2, f19, f11, f2 FNMSUB f3, f19, f10, f3 FNMSUB f0, f18, f8, f0 FNMSUB f1, f18, f9, f1 FNMSUB f2, f18, f10, f2 FNMSUB f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f6 FMADD f3, f20, f3, f7 #else FMADD f8, f16, f8, f12 FMSUB f9, f16, f9, f13 FMADD f10, f16, f10, f14 FMSUB f11, f16, f11, f15 FMSUB f0, f19, f9, f0 FNMADD f1, f19, f8, f1 FMSUB f2, f19, f11, f2 FNMADD f3, f19, f10, f3 FNMADD f0, f18, f8, f0 FNMADD f1, f18, f9, f1 FNMADD f2, f18, f10, f2 FNMADD f3, f18, f11, f3 FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f6, f21, f3 FMUL f7, f21, f2 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f6 FMSUB f3, f20, f3, f7 #endif #endif #ifdef LN subi CO1, CO1, 4 * SIZE subi CO2, CO2, 4 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f8, 2 * SIZE(BO) STFD f9, 3 * SIZE(BO) STFD f2, 4 * SIZE(BO) STFD f3, 5 * SIZE(BO) STFD f10, 6 * SIZE(BO) STFD f11, 7 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) STFD f8, 4 * SIZE(AO) STFD f9, 5 * SIZE(AO) STFD f10, 6 * SIZE(AO) STFD f11, 7 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) STFD f8, 0 * SIZE(CO2) STFD f9, 1 * SIZE(CO2) STFD f10, 2 * SIZE(CO2) STFD f11, 3 * SIZE(CO2) #ifndef LN addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE 
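/* Note (added annotation, not from the original kernel source): this is the
   tail of the 2x2 complex TRSM micro-kernel.  The solved panel (f0..f3 for the
   first column, f8..f11 for the second) has just been stored back to the packed
   buffer (BO or AO, depending on the LN/LT vs. RN/RT case) and to C via
   CO1/CO2.  The bookkeeping that follows rewinds or advances AORIG/AO/BO by
   element counts scaled through ZBASE_SHIFT (log2 of the size of one complex
   element) and adjusts KK so the next panel lines up with the correct diagonal
   block for whichever of the LN/LT/RN/RT variants this file was built as. */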
#endif #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, TEMP add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 2 #endif #ifdef LN subi KK, KK, 2 #endif addic. I, I, -1 bgt .L11 .align 4 .L20: andi. I, M, 1 ble .L29 #if defined(LT) || defined(RN) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, KK, 2 mr BO, B mtspr CTR, r0 #else #ifdef LN slwi r0, K, 0 + ZBASE_SHIFT sub AORIG, AORIG, r0 #endif slwi r0, KK, 0 + ZBASE_SHIFT slwi TEMP, KK, 1 + ZBASE_SHIFT add AO, AORIG, r0 add BO, B, TEMP sub TEMP, K, KK LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) LFD f22, 2 * SIZE(BO) LFD f23, 3 * SIZE(BO) LFD f24, 4 * SIZE(BO) LFD f25, 5 * SIZE(BO) LFD f26, 6 * SIZE(BO) LFD f27, 7 * SIZE(BO) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, TEMP, 2 mtspr CTR, r0 #endif ble .L25 .align 4 .L22: fmadd f0, f16, f20, f0 LFD f19, 3 * SIZE(AO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFD f16, 4 * SIZE(AO) fmadd f4, f17, f20, f4 LFD f20, 8 * SIZE(BO) fmadd f5, f17, f21, f5 LFD f21, 9 * SIZE(BO) fmadd f6, f17, f22, f6 LFD f22, 10 * SIZE(BO) fmadd f7, f17, f23, f7 LFD f23, 11 * SIZE(BO) fmadd f0, f18, f24, f0 LFD f17, 5 * SIZE(AO) fmadd f1, f18, f25, f1 nop fmadd f2, f18, f26, f2 nop fmadd f3, f18, f27, f3 LFD f18, 6 * SIZE(AO) fmadd f4, f19, f24, f4 LFD f24, 12 * SIZE(BO) fmadd f5, f19, f25, f5 LFD f25, 13 * SIZE(BO) fmadd f6, f19, f26, f6 LFD f26, 14 * SIZE(BO) fmadd f7, f19, f27, f7 LFD f27, 15 * SIZE(BO) fmadd f0, f16, f20, f0 LFD f19, 7 * SIZE(AO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFDU f16, 8 * SIZE(AO) fmadd f4, f17, f20, f4 LFDU f20, 16 * SIZE(BO) fmadd f5, f17, f21, f5 LFD f21, 1 * SIZE(BO) fmadd f6, f17, f22, f6 LFD f22, 2 * SIZE(BO) fmadd f7, f17, f23, f7 LFD f23, 3 * SIZE(BO) fmadd f0, f18, f24, f0 LFD f17, 1 * SIZE(AO) fmadd f1, f18, f25, f1 nop fmadd f2, f18, f26, f2 nop fmadd f3, f18, f27, f3 LFD f18, 2 * SIZE(AO) fmadd f4, f19, f24, f4 LFD f24, 4 * SIZE(BO) fmadd f5, f19, f25, f5 LFD f25, 5 * SIZE(BO) fmadd f6, f19, f26, f6 LFD f26, 6 * SIZE(BO) fmadd f7, f19, f27, f7 LFD f27, 7 * SIZE(BO) bdnz .L22 .align 4 .L25: #if defined(LT) || defined(RN) andi. r0, KK, 3 #else andi. 
r0, TEMP, 3 #endif mtspr CTR, r0 ble .L27 .align 4 .L26: fmadd f0, f16, f20, f0 LFD f17, 1 * SIZE(AO) fmadd f1, f16, f21, f1 nop fmadd f2, f16, f22, f2 nop fmadd f3, f16, f23, f3 LFDU f16, 2 * SIZE(AO) fmadd f4, f17, f20, f4 LFDU f20, 4 * SIZE(BO) fmadd f5, f17, f21, f5 LFD f21, 1 * SIZE(BO) fmadd f6, f17, f22, f6 LFD f22, 2 * SIZE(BO) fmadd f7, f17, f23, f7 LFD f23, 3 * SIZE(BO) bdnz .L26 .align 4 .L27: #ifndef CONJ FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6 #else #if defined(LN) || defined(LT) FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6 #else FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3 #endif #endif #if defined(LN) || defined(RT) #ifdef LN subi r0, KK, 1 #else subi r0, KK, 2 #endif slwi TEMP, r0, 0 + ZBASE_SHIFT slwi r0, r0, 1 + ZBASE_SHIFT add AO, AORIG, TEMP add BO, B, r0 #endif #if defined(LN) || defined(LT) LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 #else LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f20, 2 * SIZE(AO) LFD f21, 3 * SIZE(AO) FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f20, f2 FSUB f3, f21, f3 #endif #ifdef LN LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) FMUL f4, f21, f1 FMUL f5, f21, f0 FMUL f12, f21, f3 FMUL f13, f21, f2 #ifndef CONJ FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 FMSUB f2, f20, f2, f12 FMADD f3, f20, f3, f13 #else FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 FMADD f2, f20, f2, f12 FMSUB f3, f20, f3, f13 #endif #endif #ifdef LT LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) FMUL f4, f17, f1 FMUL f5, f17, f0 FMUL f12, f17, f3 FMUL f13, f17, f2 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 #endif #endif #ifdef RN LFD f16, 0 * SIZE(BO) LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) LFD f20, 6 * SIZE(BO) LFD f21, 7 * SIZE(BO) FMUL f4, f17, f1 FMUL f5, f17, f0 #ifndef CONJ FMSUB f0, f16, f0, f4 FMADD f1, f16, f1, f5 FMADD f2, f19, f1, f2 FNMSUB f3, f19, f0, f3 FNMSUB f2, f18, f0, f2 FNMSUB f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMSUB f2, f20, f2, f4 FMADD f3, f20, f3, f5 #else FMADD f0, f16, f0, f4 FMSUB f1, f16, f1, f5 FMSUB f2, f19, f1, f2 FNMADD f3, f19, f0, f3 FNMADD f2, f18, f0, f2 FNMADD f3, f18, f1, f3 FMUL f4, f21, f3 FMUL f5, f21, f2 FMADD f2, f20, f2, f4 FMSUB f3, f20, f3, f5 #endif #endif #ifdef RT LFD f16, 6 * SIZE(BO) LFD f17, 7 * SIZE(BO) LFD f18, 4 * SIZE(BO) LFD f19, 5 * SIZE(BO) LFD f20, 0 * SIZE(BO) LFD f21, 1 * SIZE(BO) FMUL f12, f17, f3 FMUL f13, f17, f2 #ifndef CONJ FMSUB f2, f16, f2, f12 FMADD f3, f16, f3, f13 FMADD f0, f19, f3, f0 FNMSUB f1, f19, f2, f1 FNMSUB f0, f18, f2, f0 FNMSUB f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMSUB f0, f20, f0, f4 FMADD f1, f20, f1, f5 #else FMADD f2, f16, f2, f12 FMSUB f3, f16, f3, f13 FMSUB f0, f19, f3, f0 FNMADD f1, f19, f2, f1 FNMADD f0, f18, f2, f0 FNMADD f1, f18, f3, f1 FMUL f4, f21, f1 FMUL f5, f21, f0 FMADD f0, f20, f0, f4 FMSUB f1, f20, f1, f5 #endif #endif #ifdef LN subi CO1, CO1, 2 * SIZE subi CO2, CO2, 2 * SIZE #endif #if defined(LN) || defined(LT) STFD f0, 0 * SIZE(BO) STFD f1, 1 * SIZE(BO) STFD f2, 2 * SIZE(BO) STFD f3, 3 * SIZE(BO) #else STFD f0, 0 * SIZE(AO) STFD f1, 1 * SIZE(AO) STFD f2, 2 * SIZE(AO) STFD f3, 3 * SIZE(AO) #endif STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) #ifndef LN addi 
CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE #endif #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 #endif #if defined(LT) || defined(RN) sub TEMP, K, KK slwi r0, TEMP, 0 + ZBASE_SHIFT slwi TEMP, TEMP, 1 + ZBASE_SHIFT add AO, AO, r0 add BO, BO, TEMP #endif #ifdef LT addi KK, KK, 1 #endif #ifdef LN subi KK, KK, 1 #endif .align 4 .L29: #ifdef LN slwi r0, K, 1 + ZBASE_SHIFT add B, B, r0 #endif #if defined(LT) || defined(RN) mr B, BO #endif #ifdef RN addi KK, KK, 2 #endif #ifdef RT subi KK, KK, 2 #endif addic. J, J, -1 bgt .L10 .align 4 .L999: addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP) #ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP) #endif addi SP, SP, STACKSIZE blr EPILOGUE OpenBLAS-0.2.20/kernel/setparam-ref.c000066400000000000000000001030631313527062700171610ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include #include "common.h" #ifdef BUILD_KERNEL #include "kernelTS.h" #endif #undef DEBUG static void init_parameter(void); gotoblas_t TABLE_NAME = { DTB_DEFAULT_ENTRIES , GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, 0, 0, 0, SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, #ifdef SGEMM_DEFAULT_UNROLL_MN SGEMM_DEFAULT_UNROLL_MN, #else MAX(SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N), #endif #ifdef HAVE_EXCLUSIVE_CACHE 1, #else 0, #endif samax_kTS, samin_kTS, smax_kTS, smin_kTS, isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, snrm2_kTS, sasum_kTS, scopy_kTS, sdot_kTS, dsdot_kTS, srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, sgemv_nTS, sgemv_tTS, sger_kTS, ssymv_LTS, ssymv_UTS, sgemm_kernelTS, sgemm_betaTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N sgemm_incopyTS, sgemm_itcopyTS, #else sgemm_oncopyTS, sgemm_otcopyTS, #endif sgemm_oncopyTS, sgemm_otcopyTS, strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N strsm_iunucopyTS, strsm_iunncopyTS, strsm_iutucopyTS, strsm_iutncopyTS, strsm_ilnucopyTS, strsm_ilnncopyTS, strsm_iltucopyTS, strsm_iltncopyTS, #else strsm_ounucopyTS, strsm_ounncopyTS, strsm_outucopyTS, strsm_outncopyTS, strsm_olnucopyTS, strsm_olnncopyTS, strsm_oltucopyTS, strsm_oltncopyTS, #endif strsm_ounucopyTS, strsm_ounncopyTS, strsm_outucopyTS, strsm_outncopyTS, strsm_olnucopyTS, strsm_olnncopyTS, strsm_oltucopyTS, strsm_oltncopyTS, strmm_kernel_RNTS, strmm_kernel_RTTS, strmm_kernel_LNTS, strmm_kernel_LTTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N strmm_iunucopyTS, strmm_iunncopyTS, strmm_iutucopyTS, strmm_iutncopyTS, strmm_ilnucopyTS, strmm_ilnncopyTS, strmm_iltucopyTS, strmm_iltncopyTS, #else strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS, strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS, #endif strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS, strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N ssymm_iutcopyTS, ssymm_iltcopyTS, #else ssymm_outcopyTS, ssymm_oltcopyTS, #endif ssymm_outcopyTS, ssymm_oltcopyTS, #ifndef NO_LAPACK sneg_tcopyTS, slaswp_ncopyTS, #else NULL,NULL, #endif 0, 0, 0, DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, #ifdef DGEMM_DEFAULT_UNROLL_MN DGEMM_DEFAULT_UNROLL_MN, #else MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N), #endif damax_kTS, damin_kTS, dmax_kTS, dmin_kTS, idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS, dnrm2_kTS, dasum_kTS, dcopy_kTS, ddot_kTS, drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS, dgemv_nTS, dgemv_tTS, dger_kTS, dsymv_LTS, dsymv_UTS, dgemm_kernelTS, dgemm_betaTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N dgemm_incopyTS, dgemm_itcopyTS, #else dgemm_oncopyTS, dgemm_otcopyTS, #endif dgemm_oncopyTS, dgemm_otcopyTS, dtrsm_kernel_LNTS, dtrsm_kernel_LTTS, dtrsm_kernel_RNTS, dtrsm_kernel_RTTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N dtrsm_iunucopyTS, dtrsm_iunncopyTS, dtrsm_iutucopyTS, dtrsm_iutncopyTS, dtrsm_ilnucopyTS, dtrsm_ilnncopyTS, dtrsm_iltucopyTS, dtrsm_iltncopyTS, #else dtrsm_ounucopyTS, dtrsm_ounncopyTS, dtrsm_outucopyTS, dtrsm_outncopyTS, dtrsm_olnucopyTS, dtrsm_olnncopyTS, dtrsm_oltucopyTS, dtrsm_oltncopyTS, #endif dtrsm_ounucopyTS, dtrsm_ounncopyTS, dtrsm_outucopyTS, dtrsm_outncopyTS, dtrsm_olnucopyTS, dtrsm_olnncopyTS, dtrsm_oltucopyTS, dtrsm_oltncopyTS, 
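/* Note (added annotation): TABLE_NAME is a gotoblas_t structure holding one
   function pointer (plus blocking-parameter slots) per BLAS kernel and per
   precision.  With DYNAMIC_ARCH builds this reference file is compiled once per
   supported core type with the TS suffix replaced by a core-specific suffix,
   and the runtime selects the matching table.  A minimal sketch of how such a
   table is consumed is given below; the dispatch call itself is illustrative
   and not copied from the library sources:

       extern gotoblas_t *gotoblas;     // points at the table picked at runtime
       // (gotoblas->dgemm_kernel)(m, n, k, alpha, sa, sb, c, ldc);
*/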
dtrmm_kernel_RNTS, dtrmm_kernel_RTTS, dtrmm_kernel_LNTS, dtrmm_kernel_LTTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N dtrmm_iunucopyTS, dtrmm_iunncopyTS, dtrmm_iutucopyTS, dtrmm_iutncopyTS, dtrmm_ilnucopyTS, dtrmm_ilnncopyTS, dtrmm_iltucopyTS, dtrmm_iltncopyTS, #else dtrmm_ounucopyTS, dtrmm_ounncopyTS, dtrmm_outucopyTS, dtrmm_outncopyTS, dtrmm_olnucopyTS, dtrmm_olnncopyTS, dtrmm_oltucopyTS, dtrmm_oltncopyTS, #endif dtrmm_ounucopyTS, dtrmm_ounncopyTS, dtrmm_outucopyTS, dtrmm_outncopyTS, dtrmm_olnucopyTS, dtrmm_olnncopyTS, dtrmm_oltucopyTS, dtrmm_oltncopyTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N dsymm_iutcopyTS, dsymm_iltcopyTS, #else dsymm_outcopyTS, dsymm_oltcopyTS, #endif dsymm_outcopyTS, dsymm_oltcopyTS, #ifndef NO_LAPACK dneg_tcopyTS, dlaswp_ncopyTS, #else NULL, NULL, #endif #ifdef EXPRECISION 0, 0, 0, QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N, MAX(QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N), qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS, iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS, qnrm2_kTS, qasum_kTS, qcopy_kTS, qdot_kTS, qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS, qgemv_nTS, qgemv_tTS, qger_kTS, qsymv_LTS, qsymv_UTS, qgemm_kernelTS, qgemm_betaTS, #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N qgemm_incopyTS, qgemm_itcopyTS, #else qgemm_oncopyTS, qgemm_otcopyTS, #endif qgemm_oncopyTS, qgemm_otcopyTS, qtrsm_kernel_LNTS, qtrsm_kernel_LTTS, qtrsm_kernel_RNTS, qtrsm_kernel_RTTS, #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N qtrsm_iunucopyTS, qtrsm_iunncopyTS, qtrsm_iutucopyTS, qtrsm_iutncopyTS, qtrsm_ilnucopyTS, qtrsm_ilnncopyTS, qtrsm_iltucopyTS, qtrsm_iltncopyTS, #else qtrsm_ounucopyTS, qtrsm_ounncopyTS, qtrsm_outucopyTS, qtrsm_outncopyTS, qtrsm_olnucopyTS, qtrsm_olnncopyTS, qtrsm_oltucopyTS, qtrsm_oltncopyTS, #endif qtrsm_ounucopyTS, qtrsm_ounncopyTS, qtrsm_outucopyTS, qtrsm_outncopyTS, qtrsm_olnucopyTS, qtrsm_olnncopyTS, qtrsm_oltucopyTS, qtrsm_oltncopyTS, qtrmm_kernel_RNTS, qtrmm_kernel_RTTS, qtrmm_kernel_LNTS, qtrmm_kernel_LTTS, #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N qtrmm_iunucopyTS, qtrmm_iunncopyTS, qtrmm_iutucopyTS, qtrmm_iutncopyTS, qtrmm_ilnucopyTS, qtrmm_ilnncopyTS, qtrmm_iltucopyTS, qtrmm_iltncopyTS, #else qtrmm_ounucopyTS, qtrmm_ounncopyTS, qtrmm_outucopyTS, qtrmm_outncopyTS, qtrmm_olnucopyTS, qtrmm_olnncopyTS, qtrmm_oltucopyTS, qtrmm_oltncopyTS, #endif qtrmm_ounucopyTS, qtrmm_ounncopyTS, qtrmm_outucopyTS, qtrmm_outncopyTS, qtrmm_olnucopyTS, qtrmm_olnncopyTS, qtrmm_oltucopyTS, qtrmm_oltncopyTS, #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N qsymm_iutcopyTS, qsymm_iltcopyTS, #else qsymm_outcopyTS, qsymm_oltcopyTS, #endif qsymm_outcopyTS, qsymm_oltcopyTS, #ifndef NO_LAPACK qneg_tcopyTS, qlaswp_ncopyTS, #else NULL, NULL, #endif #endif 0, 0, 0, CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N, #ifdef CGEMM_DEFAULT_UNROLL_MN CGEMM_DEFAULT_UNROLL_MN, #else MAX(CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N), #endif camax_kTS, camin_kTS, icamax_kTS, icamin_kTS, cnrm2_kTS, casum_kTS, ccopy_kTS, cdotu_kTS, cdotc_kTS, csrot_kTS, caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS, cgemv_nTS, cgemv_tTS, cgemv_rTS, cgemv_cTS, cgemv_oTS, cgemv_uTS, cgemv_sTS, cgemv_dTS, cgeru_kTS, cgerc_kTS, cgerv_kTS, cgerd_kTS, csymv_LTS, csymv_UTS, chemv_LTS, chemv_UTS, chemv_MTS, chemv_VTS, cgemm_kernel_nTS, cgemm_kernel_lTS, cgemm_kernel_rTS, cgemm_kernel_bTS, cgemm_betaTS, #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N cgemm_incopyTS, cgemm_itcopyTS, #else cgemm_oncopyTS, cgemm_otcopyTS, #endif cgemm_oncopyTS, cgemm_otcopyTS, 
ctrsm_kernel_LNTS, ctrsm_kernel_LTTS, ctrsm_kernel_LRTS, ctrsm_kernel_LCTS, ctrsm_kernel_RNTS, ctrsm_kernel_RTTS, ctrsm_kernel_RRTS, ctrsm_kernel_RCTS, #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N ctrsm_iunucopyTS, ctrsm_iunncopyTS, ctrsm_iutucopyTS, ctrsm_iutncopyTS, ctrsm_ilnucopyTS, ctrsm_ilnncopyTS, ctrsm_iltucopyTS, ctrsm_iltncopyTS, #else ctrsm_ounucopyTS, ctrsm_ounncopyTS, ctrsm_outucopyTS, ctrsm_outncopyTS, ctrsm_olnucopyTS, ctrsm_olnncopyTS, ctrsm_oltucopyTS, ctrsm_oltncopyTS, #endif ctrsm_ounucopyTS, ctrsm_ounncopyTS, ctrsm_outucopyTS, ctrsm_outncopyTS, ctrsm_olnucopyTS, ctrsm_olnncopyTS, ctrsm_oltucopyTS, ctrsm_oltncopyTS, ctrmm_kernel_RNTS, ctrmm_kernel_RTTS, ctrmm_kernel_RRTS, ctrmm_kernel_RCTS, ctrmm_kernel_LNTS, ctrmm_kernel_LTTS, ctrmm_kernel_LRTS, ctrmm_kernel_LCTS, #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N ctrmm_iunucopyTS, ctrmm_iunncopyTS, ctrmm_iutucopyTS, ctrmm_iutncopyTS, ctrmm_ilnucopyTS, ctrmm_ilnncopyTS, ctrmm_iltucopyTS, ctrmm_iltncopyTS, #else ctrmm_ounucopyTS, ctrmm_ounncopyTS, ctrmm_outucopyTS, ctrmm_outncopyTS, ctrmm_olnucopyTS, ctrmm_olnncopyTS, ctrmm_oltucopyTS, ctrmm_oltncopyTS, #endif ctrmm_ounucopyTS, ctrmm_ounncopyTS, ctrmm_outucopyTS, ctrmm_outncopyTS, ctrmm_olnucopyTS, ctrmm_olnncopyTS, ctrmm_oltucopyTS, ctrmm_oltncopyTS, #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N csymm_iutcopyTS, csymm_iltcopyTS, #else csymm_outcopyTS, csymm_oltcopyTS, #endif csymm_outcopyTS, csymm_oltcopyTS, #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N chemm_iutcopyTS, chemm_iltcopyTS, #else chemm_outcopyTS, chemm_oltcopyTS, #endif chemm_outcopyTS, chemm_oltcopyTS, 0, 0, 0, #ifdef CGEMM3M_DEFAULT_UNROLL_M CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N, MAX(CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N), #else SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, MAX(SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N), #endif cgemm3m_kernelTS, cgemm3m_incopybTS, cgemm3m_incopyrTS, cgemm3m_incopyiTS, cgemm3m_itcopybTS, cgemm3m_itcopyrTS, cgemm3m_itcopyiTS, cgemm3m_oncopybTS, cgemm3m_oncopyrTS, cgemm3m_oncopyiTS, cgemm3m_otcopybTS, cgemm3m_otcopyrTS, cgemm3m_otcopyiTS, csymm3m_iucopybTS, csymm3m_ilcopybTS, csymm3m_iucopyrTS, csymm3m_ilcopyrTS, csymm3m_iucopyiTS, csymm3m_ilcopyiTS, csymm3m_oucopybTS, csymm3m_olcopybTS, csymm3m_oucopyrTS, csymm3m_olcopyrTS, csymm3m_oucopyiTS, csymm3m_olcopyiTS, chemm3m_iucopybTS, chemm3m_ilcopybTS, chemm3m_iucopyrTS, chemm3m_ilcopyrTS, chemm3m_iucopyiTS, chemm3m_ilcopyiTS, chemm3m_oucopybTS, chemm3m_olcopybTS, chemm3m_oucopyrTS, chemm3m_olcopyrTS, chemm3m_oucopyiTS, chemm3m_olcopyiTS, #ifndef NO_LAPACK cneg_tcopyTS, claswp_ncopyTS, #else NULL, NULL, #endif 0, 0, 0, ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N, #ifdef ZGEMM_DEFAULT_UNROLL_MN ZGEMM_DEFAULT_UNROLL_MN, #else MAX(ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N), #endif zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS, znrm2_kTS, zasum_kTS, zcopy_kTS, zdotu_kTS, zdotc_kTS, zdrot_kTS, zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS, zgemv_nTS, zgemv_tTS, zgemv_rTS, zgemv_cTS, zgemv_oTS, zgemv_uTS, zgemv_sTS, zgemv_dTS, zgeru_kTS, zgerc_kTS, zgerv_kTS, zgerd_kTS, zsymv_LTS, zsymv_UTS, zhemv_LTS, zhemv_UTS, zhemv_MTS, zhemv_VTS, zgemm_kernel_nTS, zgemm_kernel_lTS, zgemm_kernel_rTS, zgemm_kernel_bTS, zgemm_betaTS, #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N zgemm_incopyTS, zgemm_itcopyTS, #else zgemm_oncopyTS, zgemm_otcopyTS, #endif zgemm_oncopyTS, zgemm_otcopyTS, ztrsm_kernel_LNTS, ztrsm_kernel_LTTS, ztrsm_kernel_LRTS, ztrsm_kernel_LCTS, ztrsm_kernel_RNTS, 
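/* Note (added annotation): the complex TRSM/TRMM entries come in eight
   variants.  The first letter gives the side of the triangular matrix (L or R);
   the second encodes its transpose/conjugation state -- N (no transpose),
   T (transpose), R (conjugate, no transpose), C (conjugate transpose).  The
   cgemm_kernel_n/l/r/b entries follow a similar idea for which operand is
   conjugated.  This naming summary is inferred from the kernel sources rather
   than stated explicitly here. */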
ztrsm_kernel_RTTS, ztrsm_kernel_RRTS, ztrsm_kernel_RCTS, #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N ztrsm_iunucopyTS, ztrsm_iunncopyTS, ztrsm_iutucopyTS, ztrsm_iutncopyTS, ztrsm_ilnucopyTS, ztrsm_ilnncopyTS, ztrsm_iltucopyTS, ztrsm_iltncopyTS, #else ztrsm_ounucopyTS, ztrsm_ounncopyTS, ztrsm_outucopyTS, ztrsm_outncopyTS, ztrsm_olnucopyTS, ztrsm_olnncopyTS, ztrsm_oltucopyTS, ztrsm_oltncopyTS, #endif ztrsm_ounucopyTS, ztrsm_ounncopyTS, ztrsm_outucopyTS, ztrsm_outncopyTS, ztrsm_olnucopyTS, ztrsm_olnncopyTS, ztrsm_oltucopyTS, ztrsm_oltncopyTS, ztrmm_kernel_RNTS, ztrmm_kernel_RTTS, ztrmm_kernel_RRTS, ztrmm_kernel_RCTS, ztrmm_kernel_LNTS, ztrmm_kernel_LTTS, ztrmm_kernel_LRTS, ztrmm_kernel_LCTS, #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N ztrmm_iunucopyTS, ztrmm_iunncopyTS, ztrmm_iutucopyTS, ztrmm_iutncopyTS, ztrmm_ilnucopyTS, ztrmm_ilnncopyTS, ztrmm_iltucopyTS, ztrmm_iltncopyTS, #else ztrmm_ounucopyTS, ztrmm_ounncopyTS, ztrmm_outucopyTS, ztrmm_outncopyTS, ztrmm_olnucopyTS, ztrmm_olnncopyTS, ztrmm_oltucopyTS, ztrmm_oltncopyTS, #endif ztrmm_ounucopyTS, ztrmm_ounncopyTS, ztrmm_outucopyTS, ztrmm_outncopyTS, ztrmm_olnucopyTS, ztrmm_olnncopyTS, ztrmm_oltucopyTS, ztrmm_oltncopyTS, #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N zsymm_iutcopyTS, zsymm_iltcopyTS, #else zsymm_outcopyTS, zsymm_oltcopyTS, #endif zsymm_outcopyTS, zsymm_oltcopyTS, #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N zhemm_iutcopyTS, zhemm_iltcopyTS, #else zhemm_outcopyTS, zhemm_oltcopyTS, #endif zhemm_outcopyTS, zhemm_oltcopyTS, 0, 0, 0, #ifdef ZGEMM3M_DEFAULT_UNROLL_M ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N, MAX(ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N), #else DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N), #endif zgemm3m_kernelTS, zgemm3m_incopybTS, zgemm3m_incopyrTS, zgemm3m_incopyiTS, zgemm3m_itcopybTS, zgemm3m_itcopyrTS, zgemm3m_itcopyiTS, zgemm3m_oncopybTS, zgemm3m_oncopyrTS, zgemm3m_oncopyiTS, zgemm3m_otcopybTS, zgemm3m_otcopyrTS, zgemm3m_otcopyiTS, zsymm3m_iucopybTS, zsymm3m_ilcopybTS, zsymm3m_iucopyrTS, zsymm3m_ilcopyrTS, zsymm3m_iucopyiTS, zsymm3m_ilcopyiTS, zsymm3m_oucopybTS, zsymm3m_olcopybTS, zsymm3m_oucopyrTS, zsymm3m_olcopyrTS, zsymm3m_oucopyiTS, zsymm3m_olcopyiTS, zhemm3m_iucopybTS, zhemm3m_ilcopybTS, zhemm3m_iucopyrTS, zhemm3m_ilcopyrTS, zhemm3m_iucopyiTS, zhemm3m_ilcopyiTS, zhemm3m_oucopybTS, zhemm3m_olcopybTS, zhemm3m_oucopyrTS, zhemm3m_olcopyrTS, zhemm3m_oucopyiTS, zhemm3m_olcopyiTS, #ifndef NO_LAPACK zneg_tcopyTS, zlaswp_ncopyTS, #else NULL, NULL, #endif #ifdef EXPRECISION 0, 0, 0, XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N, MAX(XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N), xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS, xnrm2_kTS, xasum_kTS, xcopy_kTS, xdotu_kTS, xdotc_kTS, xqrot_kTS, xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS, xgemv_nTS, xgemv_tTS, xgemv_rTS, xgemv_cTS, xgemv_oTS, xgemv_uTS, xgemv_sTS, xgemv_dTS, xgeru_kTS, xgerc_kTS, xgerv_kTS, xgerd_kTS, xsymv_LTS, xsymv_UTS, xhemv_LTS, xhemv_UTS, xhemv_MTS, xhemv_VTS, xgemm_kernel_nTS, xgemm_kernel_lTS, xgemm_kernel_rTS, xgemm_kernel_bTS, xgemm_betaTS, #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N xgemm_incopyTS, xgemm_itcopyTS, #else xgemm_oncopyTS, xgemm_otcopyTS, #endif xgemm_oncopyTS, xgemm_otcopyTS, xtrsm_kernel_LNTS, xtrsm_kernel_LTTS, xtrsm_kernel_LRTS, xtrsm_kernel_LCTS, xtrsm_kernel_RNTS, xtrsm_kernel_RTTS, xtrsm_kernel_RRTS, xtrsm_kernel_RCTS, #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N xtrsm_iunucopyTS, xtrsm_iunncopyTS, 
xtrsm_iutucopyTS, xtrsm_iutncopyTS, xtrsm_ilnucopyTS, xtrsm_ilnncopyTS, xtrsm_iltucopyTS, xtrsm_iltncopyTS, #else xtrsm_ounucopyTS, xtrsm_ounncopyTS, xtrsm_outucopyTS, xtrsm_outncopyTS, xtrsm_olnucopyTS, xtrsm_olnncopyTS, xtrsm_oltucopyTS, xtrsm_oltncopyTS, #endif xtrsm_ounucopyTS, xtrsm_ounncopyTS, xtrsm_outucopyTS, xtrsm_outncopyTS, xtrsm_olnucopyTS, xtrsm_olnncopyTS, xtrsm_oltucopyTS, xtrsm_oltncopyTS, xtrmm_kernel_RNTS, xtrmm_kernel_RTTS, xtrmm_kernel_RRTS, xtrmm_kernel_RCTS, xtrmm_kernel_LNTS, xtrmm_kernel_LTTS, xtrmm_kernel_LRTS, xtrmm_kernel_LCTS, #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N xtrmm_iunucopyTS, xtrmm_iunncopyTS, xtrmm_iutucopyTS, xtrmm_iutncopyTS, xtrmm_ilnucopyTS, xtrmm_ilnncopyTS, xtrmm_iltucopyTS, xtrmm_iltncopyTS, #else xtrmm_ounucopyTS, xtrmm_ounncopyTS, xtrmm_outucopyTS, xtrmm_outncopyTS, xtrmm_olnucopyTS, xtrmm_olnncopyTS, xtrmm_oltucopyTS, xtrmm_oltncopyTS, #endif xtrmm_ounucopyTS, xtrmm_ounncopyTS, xtrmm_outucopyTS, xtrmm_outncopyTS, xtrmm_olnucopyTS, xtrmm_olnncopyTS, xtrmm_oltucopyTS, xtrmm_oltncopyTS, #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N xsymm_iutcopyTS, xsymm_iltcopyTS, #else xsymm_outcopyTS, xsymm_oltcopyTS, #endif xsymm_outcopyTS, xsymm_oltcopyTS, #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N xhemm_iutcopyTS, xhemm_iltcopyTS, #else xhemm_outcopyTS, xhemm_oltcopyTS, #endif xhemm_outcopyTS, xhemm_oltcopyTS, 0, 0, 0, QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N, MAX(QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N), xgemm3m_kernelTS, xgemm3m_incopybTS, xgemm3m_incopyrTS, xgemm3m_incopyiTS, xgemm3m_itcopybTS, xgemm3m_itcopyrTS, xgemm3m_itcopyiTS, xgemm3m_oncopybTS, xgemm3m_oncopyrTS, xgemm3m_oncopyiTS, xgemm3m_otcopybTS, xgemm3m_otcopyrTS, xgemm3m_otcopyiTS, xsymm3m_iucopybTS, xsymm3m_ilcopybTS, xsymm3m_iucopyrTS, xsymm3m_ilcopyrTS, xsymm3m_iucopyiTS, xsymm3m_ilcopyiTS, xsymm3m_oucopybTS, xsymm3m_olcopybTS, xsymm3m_oucopyrTS, xsymm3m_olcopyrTS, xsymm3m_oucopyiTS, xsymm3m_olcopyiTS, xhemm3m_iucopybTS, xhemm3m_ilcopybTS, xhemm3m_iucopyrTS, xhemm3m_ilcopyrTS, xhemm3m_iucopyiTS, xhemm3m_ilcopyiTS, xhemm3m_oucopybTS, xhemm3m_olcopybTS, xhemm3m_oucopyrTS, xhemm3m_olcopyrTS, xhemm3m_oucopyiTS, xhemm3m_olcopyiTS, #ifndef NO_LAPACK xneg_tcopyTS, xlaswp_ncopyTS, #else NULL, NULL, #endif #endif init_parameter, SNUMOPT, DNUMOPT, QNUMOPT, saxpby_kTS, daxpby_kTS, caxpby_kTS, zaxpby_kTS, somatcopy_k_cnTS, somatcopy_k_ctTS, somatcopy_k_rnTS, somatcopy_k_rtTS, domatcopy_k_cnTS, domatcopy_k_ctTS, domatcopy_k_rnTS, domatcopy_k_rtTS, comatcopy_k_cnTS, comatcopy_k_ctTS, comatcopy_k_rnTS, comatcopy_k_rtTS, comatcopy_k_cncTS, comatcopy_k_ctcTS, comatcopy_k_rncTS, comatcopy_k_rtcTS, zomatcopy_k_cnTS, zomatcopy_k_ctTS, zomatcopy_k_rnTS, zomatcopy_k_rtTS, zomatcopy_k_cncTS, zomatcopy_k_ctcTS, zomatcopy_k_rncTS, zomatcopy_k_rtcTS, simatcopy_k_cnTS, simatcopy_k_ctTS, simatcopy_k_rnTS, simatcopy_k_rtTS, dimatcopy_k_cnTS, dimatcopy_k_ctTS, dimatcopy_k_rnTS, dimatcopy_k_rtTS, cimatcopy_k_cnTS, cimatcopy_k_ctTS, cimatcopy_k_rnTS, cimatcopy_k_rtTS, cimatcopy_k_cncTS, cimatcopy_k_ctcTS, cimatcopy_k_rncTS, cimatcopy_k_rtcTS, zimatcopy_k_cnTS, zimatcopy_k_ctTS, zimatcopy_k_rnTS, zimatcopy_k_rtTS, zimatcopy_k_cncTS, zimatcopy_k_ctcTS, zimatcopy_k_rncTS, zimatcopy_k_rtcTS, sgeadd_kTS, dgeadd_kTS, cgeadd_kTS, zgeadd_kTS }; #ifdef ARCH_X86 static int get_l2_size_old(void){ int i, eax, ebx, ecx, edx, cpuid_level; int info[15]; cpuid(2, &eax, &ebx, &ecx, &edx); info[ 0] = BITMASK(eax, 8, 0xff); info[ 1] = BITMASK(eax, 16, 0xff); info[ 2] = BITMASK(eax, 24, 0xff); 
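/* Note (added annotation): CPUID leaf 2 packs up to 15 one-byte cache
   "descriptors" into EAX..EDX; each byte collected into info[] is looked up in
   the switch below to recover the L2 size in KB on older x86 parts that do not
   implement leaf 0x80000006.  BITMASK(reg, shift, mask) is assumed to expand to
   ((reg >> shift) & mask), so each info[i] holds one descriptor byte. */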
info[ 3] = BITMASK(ebx, 0, 0xff); info[ 4] = BITMASK(ebx, 8, 0xff); info[ 5] = BITMASK(ebx, 16, 0xff); info[ 6] = BITMASK(ebx, 24, 0xff); info[ 7] = BITMASK(ecx, 0, 0xff); info[ 8] = BITMASK(ecx, 8, 0xff); info[ 9] = BITMASK(ecx, 16, 0xff); info[10] = BITMASK(ecx, 24, 0xff); info[11] = BITMASK(edx, 0, 0xff); info[12] = BITMASK(edx, 8, 0xff); info[13] = BITMASK(edx, 16, 0xff); info[14] = BITMASK(edx, 24, 0xff); for (i = 0; i < 15; i++){ switch (info[i]){ /* This table is from http://www.sandpile.org/ia32/cpuid.htm */ case 0x1a : return 96; case 0x39 : case 0x3b : case 0x41 : case 0x79 : case 0x81 : return 128; case 0x3a : return 192; case 0x21 : case 0x3c : case 0x42 : case 0x7a : case 0x7e : case 0x82 : return 256; case 0x3d : return 384; case 0x3e : case 0x43 : case 0x7b : case 0x7f : case 0x83 : case 0x86 : return 512; case 0x44 : case 0x78 : case 0x7c : case 0x84 : case 0x87 : return 1024; case 0x45 : case 0x7d : case 0x85 : return 2048; case 0x48 : return 3184; case 0x49 : return 4096; case 0x4e : return 6144; } } return 0; } #endif static __inline__ int get_l2_size(void){ int eax, ebx, ecx, edx, l2; cpuid(0x80000006, &eax, &ebx, &ecx, &edx); l2 = BITMASK(ecx, 16, 0xffff); #ifndef ARCH_X86 return l2; #else if (l2 > 0) return l2; return get_l2_size_old(); #endif } static __inline__ int get_l3_size(void){ int eax, ebx, ecx, edx; cpuid(0x80000006, &eax, &ebx, &ecx, &edx); return BITMASK(edx, 18, 0x3fff) * 512; } static void init_parameter(void) { int l2 = get_l2_size(); TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; #ifdef CGEMM3M_DEFAULT_Q TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q; #else TABLE_NAME.cgemm3m_q = SGEMM_DEFAULT_Q; #endif #ifdef ZGEMM3M_DEFAULT_Q TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q; #else TABLE_NAME.zgemm3m_q = DGEMM_DEFAULT_Q; #endif #ifdef EXPRECISION TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q; TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; TABLE_NAME.xgemm3m_q = QGEMM_DEFAULT_Q; #endif #if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) || defined(CORE_ATHLON) #ifdef DEBUG fprintf(stderr, "Katmai, Coppermine, Banias, Athlon\n"); #endif TABLE_NAME.sgemm_p = 64 * (l2 >> 7); TABLE_NAME.dgemm_p = 32 * (l2 >> 7); TABLE_NAME.cgemm_p = 32 * (l2 >> 7); TABLE_NAME.zgemm_p = 16 * (l2 >> 7); #ifdef EXPRECISION TABLE_NAME.qgemm_p = 16 * (l2 >> 7); TABLE_NAME.xgemm_p = 8 * (l2 >> 7); #endif #endif #ifdef CORE_NORTHWOOD #ifdef DEBUG fprintf(stderr, "Northwood\n"); #endif TABLE_NAME.sgemm_p = 96 * (l2 >> 7); TABLE_NAME.dgemm_p = 48 * (l2 >> 7); TABLE_NAME.cgemm_p = 48 * (l2 >> 7); TABLE_NAME.zgemm_p = 24 * (l2 >> 7); #ifdef EXPRECISION TABLE_NAME.qgemm_p = 24 * (l2 >> 7); TABLE_NAME.xgemm_p = 12 * (l2 >> 7); #endif #endif #ifdef ATOM #ifdef DEBUG fprintf(stderr, "Atom\n"); #endif TABLE_NAME.sgemm_p = 256; TABLE_NAME.dgemm_p = 128; TABLE_NAME.cgemm_p = 128; TABLE_NAME.zgemm_p = 64; #ifdef EXPRECISION TABLE_NAME.qgemm_p = 64; TABLE_NAME.xgemm_p = 32; #endif #endif #ifdef CORE_PRESCOTT #ifdef DEBUG fprintf(stderr, "Prescott\n"); #endif TABLE_NAME.sgemm_p = 56 * (l2 >> 7); TABLE_NAME.dgemm_p = 28 * (l2 >> 7); TABLE_NAME.cgemm_p = 28 * (l2 >> 7); TABLE_NAME.zgemm_p = 14 * (l2 >> 7); #ifdef EXPRECISION TABLE_NAME.qgemm_p = 14 * (l2 >> 7); TABLE_NAME.xgemm_p = 7 * (l2 >> 7); #endif #endif #ifdef CORE2 #ifdef DEBUG fprintf(stderr, "Core2\n"); #endif TABLE_NAME.sgemm_p = 92 * (l2 >> 9) + 8; TABLE_NAME.dgemm_p = 46 * (l2 >> 9) + 8; TABLE_NAME.cgemm_p = 46 * 
(l2 >> 9) + 4; TABLE_NAME.zgemm_p = 23 * (l2 >> 9) + 4; #ifdef EXPRECISION TABLE_NAME.qgemm_p = 92 * (l2 >> 9) + 8; TABLE_NAME.xgemm_p = 46 * (l2 >> 9) + 4; #endif #endif #ifdef PENRYN #ifdef DEBUG fprintf(stderr, "Penryn\n"); #endif TABLE_NAME.sgemm_p = 42 * (l2 >> 9) + 8; TABLE_NAME.dgemm_p = 42 * (l2 >> 9) + 8; TABLE_NAME.cgemm_p = 21 * (l2 >> 9) + 4; TABLE_NAME.zgemm_p = 21 * (l2 >> 9) + 4; #ifdef EXPRECISION TABLE_NAME.qgemm_p = 42 * (l2 >> 9) + 8; TABLE_NAME.xgemm_p = 21 * (l2 >> 9) + 4; #endif #endif #ifdef DUNNINGTON #ifdef DEBUG fprintf(stderr, "Dunnington\n"); #endif TABLE_NAME.sgemm_p = 42 * (l2 >> 9) + 8; TABLE_NAME.dgemm_p = 42 * (l2 >> 9) + 8; TABLE_NAME.cgemm_p = 21 * (l2 >> 9) + 4; TABLE_NAME.zgemm_p = 21 * (l2 >> 9) + 4; #ifdef EXPRECISION TABLE_NAME.qgemm_p = 42 * (l2 >> 9) + 8; TABLE_NAME.xgemm_p = 21 * (l2 >> 9) + 4; #endif #endif #ifdef NEHALEM #ifdef DEBUG fprintf(stderr, "Nehalem\n"); #endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; #endif #endif #ifdef SANDYBRIDGE #ifdef DEBUG fprintf(stderr, "Sandybridge\n"); #endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; #endif #endif #ifdef HASWELL #ifdef DEBUG fprintf(stderr, "Haswell\n"); #endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; #endif #endif #ifdef OPTERON #ifdef DEBUG fprintf(stderr, "Opteron\n"); #endif TABLE_NAME.sgemm_p = 224 + 56 * (l2 >> 7); TABLE_NAME.dgemm_p = 112 + 28 * (l2 >> 7); TABLE_NAME.cgemm_p = 112 + 28 * (l2 >> 7); TABLE_NAME.zgemm_p = 56 + 14 * (l2 >> 7); #ifdef EXPRECISION TABLE_NAME.qgemm_p = 56 + 14 * (l2 >> 7); TABLE_NAME.xgemm_p = 28 + 7 * (l2 >> 7); #endif #endif #ifdef BARCELONA #ifdef DEBUG fprintf(stderr, "Barcelona\n"); #endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; #endif #endif #ifdef BOBCAT #ifdef DEBUG fprintf(stderr, "Bobcate\n"); #endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; #endif #endif #ifdef BULLDOZER #ifdef DEBUG fprintf(stderr, "Bulldozer\n"); #endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; #endif #endif #ifdef EXCAVATOR #ifdef DEBUG fprintf(stderr, "Excavator\n"); #endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; #endif #endif #ifdef PILEDRIVER #ifdef DEBUG 
fprintf(stderr, "Piledriver\n"); #endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; #endif #endif #ifdef STEAMROLLER #ifdef DEBUG fprintf(stderr, "Steamroller\n"); #endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; #endif #endif #ifdef ZEN #ifdef DEBUG fprintf(stderr, "Zen\n"); #endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; #endif #endif #ifdef NANO #ifdef DEBUG fprintf(stderr, "NANO\n"); #endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; #endif #endif #ifdef CGEMM3M_DEFAULT_P TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; #else TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p; #endif #ifdef ZGEMM3M_DEFAULT_P TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P; #else TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p; #endif #ifdef EXPRECISION TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p; #endif TABLE_NAME.sgemm_p = ((TABLE_NAME.sgemm_p + SGEMM_DEFAULT_UNROLL_M - 1)/SGEMM_DEFAULT_UNROLL_M) * SGEMM_DEFAULT_UNROLL_M; TABLE_NAME.dgemm_p = ((TABLE_NAME.dgemm_p + DGEMM_DEFAULT_UNROLL_M - 1)/DGEMM_DEFAULT_UNROLL_M) * DGEMM_DEFAULT_UNROLL_M; TABLE_NAME.cgemm_p = ((TABLE_NAME.cgemm_p + CGEMM_DEFAULT_UNROLL_M - 1)/CGEMM_DEFAULT_UNROLL_M) * CGEMM_DEFAULT_UNROLL_M; TABLE_NAME.zgemm_p = ((TABLE_NAME.zgemm_p + ZGEMM_DEFAULT_UNROLL_M - 1)/ZGEMM_DEFAULT_UNROLL_M) * ZGEMM_DEFAULT_UNROLL_M; #ifdef CGEMM3M_DEFAULT_UNROLL_M TABLE_NAME.cgemm3m_p = ((TABLE_NAME.cgemm3m_p + CGEMM3M_DEFAULT_UNROLL_M - 1)/CGEMM3M_DEFAULT_UNROLL_M) * CGEMM3M_DEFAULT_UNROLL_M; #else TABLE_NAME.cgemm3m_p = ((TABLE_NAME.cgemm3m_p + SGEMM_DEFAULT_UNROLL_M - 1)/SGEMM_DEFAULT_UNROLL_M) * SGEMM_DEFAULT_UNROLL_M; #endif #ifdef ZGEMM3M_DEFAULT_UNROLL_M TABLE_NAME.zgemm3m_p = ((TABLE_NAME.zgemm3m_p + ZGEMM3M_DEFAULT_UNROLL_M - 1)/ZGEMM3M_DEFAULT_UNROLL_M) * ZGEMM3M_DEFAULT_UNROLL_M; #else TABLE_NAME.zgemm3m_p = ((TABLE_NAME.zgemm3m_p + DGEMM_DEFAULT_UNROLL_M - 1)/DGEMM_DEFAULT_UNROLL_M) * DGEMM_DEFAULT_UNROLL_M; #endif #ifdef QUAD_PRECISION TABLE_NAME.qgemm_p = ((TABLE_NAME.qgemm_p + QGEMM_DEFAULT_UNROLL_M - 1)/QGEMM_DEFAULT_UNROLL_M) * QGEMM_DEFAULT_UNROLL_M; TABLE_NAME.xgemm_p = ((TABLE_NAME.xgemm_p + XGEMM_DEFAULT_UNROLL_M - 1)/XGEMM_DEFAULT_UNROLL_M) * XGEMM_DEFAULT_UNROLL_M; TABLE_NAME.xgemm3m_p = ((TABLE_NAME.xgemm3m_p + QGEMM_DEFAULT_UNROLL_M - 1)/QGEMM_DEFAULT_UNROLL_M) * QGEMM_DEFAULT_UNROLL_M; #endif #ifdef DEBUG fprintf(stderr, "L2 = %8d DGEMM_P .. 
%d\n", l2, TABLE_NAME.dgemm_p); #endif TABLE_NAME.sgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.sgemm_q * 4) - 15) & ~15); TABLE_NAME.dgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.dgemm_p * TABLE_NAME.dgemm_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.dgemm_q * 8) - 15) & ~15); #ifdef EXPRECISION TABLE_NAME.qgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.qgemm_p * TABLE_NAME.qgemm_q * 16 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.qgemm_q * 16) - 15) & ~15); #endif TABLE_NAME.cgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.cgemm_p * TABLE_NAME.cgemm_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.cgemm_q * 8) - 15) & ~15); TABLE_NAME.zgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.zgemm_p * TABLE_NAME.zgemm_q * 16 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.zgemm_q * 16) - 15) & ~15); TABLE_NAME.cgemm3m_r = (((BUFFER_SIZE - ((TABLE_NAME.cgemm3m_p * TABLE_NAME.cgemm3m_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.cgemm3m_q * 8) - 15) & ~15); TABLE_NAME.zgemm3m_r = (((BUFFER_SIZE - ((TABLE_NAME.zgemm3m_p * TABLE_NAME.zgemm3m_q * 16 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.zgemm3m_q * 16) - 15) & ~15); #ifdef EXPRECISION TABLE_NAME.xgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.xgemm_p * TABLE_NAME.xgemm_q * 32 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.xgemm_q * 32) - 15) & ~15); TABLE_NAME.xgemm3m_r = (((BUFFER_SIZE - ((TABLE_NAME.xgemm3m_p * TABLE_NAME.xgemm3m_q * 32 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.xgemm3m_q * 32) - 15) & ~15); #endif } OpenBLAS-0.2.20/kernel/sparc/000077500000000000000000000000001313527062700155345ustar00rootroot00000000000000OpenBLAS-0.2.20/kernel/sparc/KERNEL000066400000000000000000000016231313527062700164410ustar00rootroot00000000000000ifndef SAMINKERNEL SAMINKERNEL = amax.S endif ifndef DAMINKERNEL DAMINKERNEL = amax.S endif ifndef CAMINKERNEL CAMINKERNEL = zamax.S endif ifndef ZAMINKERNEL ZAMINKERNEL = zamax.S endif ifndef SMINKERNEL SMINKERNEL = max.S endif ifndef DMINKERNEL DMINKERNEL = max.S endif ifndef ISAMINKERNEL ISAMINKERNEL = iamax.S endif ifndef IDAMINKERNEL IDAMINKERNEL = iamax.S endif ifndef ICAMINKERNEL ICAMINKERNEL = izamax.S endif ifndef IZAMINKERNEL IZAMINKERNEL = izamax.S endif ifndef ISMINKERNEL ISMINKERNEL = iamax.S endif ifndef IDMINKERNEL IDMINKERNEL = iamax.S endif ifndef SNRM2KERNEL SNRM2KERNEL = snrm2.S endif ifndef DNRM2KERNEL DNRM2KERNEL = dnrm2.S endif ifndef CNRM2KERNEL CNRM2KERNEL = cnrm2.S endif ifndef ZNRM2KERNEL ZNRM2KERNEL = znrm2.S endif SGEMM_BETA = ../generic/gemm_beta.c DGEMM_BETA = ../generic/gemm_beta.c CGEMM_BETA = ../generic/zgemm_beta.c ZGEMM_BETA = ../generic/zgemm_beta.c OpenBLAS-0.2.20/kernel/sparc/KERNEL.sparc000066400000000000000000000030201313527062700175410ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel.S SGEMMINCOPY = SGEMMITCOPY = SGEMMONCOPY = gemm_ncopy.S SGEMMOTCOPY = gemm_tcopy.S SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX) DGEMMKERNEL = gemm_kernel.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = gemm_ncopy.S DGEMMOTCOPY = gemm_tcopy.S DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy.$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy.$(SUFFIX) CGEMMKERNEL = zgemm_kernel.S 
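# Note (added annotation): each KERNEL.<arch> file selects the source file that
# implements a given kernel for that target, e.g. ?GEMMKERNEL for the inner
# GEMM kernel and ?GEMMONCOPY/?GEMMOTCOPY for the packing routines.  Empty
# assignments such as CGEMMINCOPY below presumably mean the shared "o" copy
# routines are reused, which is the usual arrangement when the M and N unroll
# factors are equal.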
CGEMMINCOPY = CGEMMITCOPY = CGEMMONCOPY = zgemm_ncopy.S CGEMMOTCOPY = zgemm_tcopy.S CGEMMINCOPYOBJ = CGEMMITCOPYOBJ = CGEMMONCOPYOBJ = cgemm_oncopy.$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy.$(SUFFIX) ZGEMMKERNEL = zgemm_kernel.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = zgemm_ncopy.S ZGEMMOTCOPY = zgemm_tcopy.S ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy.$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy.$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN.S STRSMKERNEL_LT = trsm_kernel_LT.S STRSMKERNEL_RN = trsm_kernel_LT.S STRSMKERNEL_RT = trsm_kernel_RT.S DTRSMKERNEL_LN = trsm_kernel_LN.S DTRSMKERNEL_LT = trsm_kernel_LT.S DTRSMKERNEL_RN = trsm_kernel_LT.S DTRSMKERNEL_RT = trsm_kernel_RT.S CTRSMKERNEL_LN = ztrsm_kernel_LN.S CTRSMKERNEL_LT = ztrsm_kernel_LT.S CTRSMKERNEL_RN = ztrsm_kernel_LT.S CTRSMKERNEL_RT = ztrsm_kernel_RT.S ZTRSMKERNEL_LN = ztrsm_kernel_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_RT.S OpenBLAS-0.2.20/kernel/sparc/KERNEL.sparcv7000066400000000000000000000040371313527062700200270ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_2x8.S SGEMMINCOPY = gemm_ncopy_2.S SGEMMITCOPY = gemm_tcopy_2.S SGEMMONCOPY = gemm_ncopy_8.S SGEMMOTCOPY = ../generic/gemm_tcopy_8.c SGEMMINCOPYOBJ = sgemm_incopy.$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy.$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x8.S DGEMMINCOPY = gemm_ncopy_2.S DGEMMITCOPY = gemm_tcopy_2.S DGEMMONCOPY = gemm_ncopy_8.S DGEMMOTCOPY = ../generic/gemm_tcopy_8.c DGEMMINCOPYOBJ = dgemm_incopy.$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy.$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy.$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy.$(SUFFIX) CGEMMKERNEL = zgemm_kernel_1x4.S CGEMMINCOPY = ../generic/zgemm_ncopy_1.c CGEMMITCOPY = ../generic/zgemm_tcopy_1.c CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMINCOPYOBJ = cgemm_incopy.$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy.$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy.$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy.$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x4.S ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c ZGEMMINCOPYOBJ = zgemm_incopy.$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy.$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy.$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy.$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_2x8.S STRSMKERNEL_LT = trsm_kernel_LT_2x8.S STRSMKERNEL_RN = trsm_kernel_LT_2x8.S STRSMKERNEL_RT = trsm_kernel_RT_2x8.S DTRSMKERNEL_LN = trsm_kernel_LN_2x8.S DTRSMKERNEL_LT = trsm_kernel_LT_2x8.S DTRSMKERNEL_RN = trsm_kernel_LT_2x8.S DTRSMKERNEL_RT = trsm_kernel_RT_2x8.S CTRSMKERNEL_LN = ztrsm_kernel_LT_1x4.S CTRSMKERNEL_LT = ztrsm_kernel_LT_1x4.S CTRSMKERNEL_RN = ztrsm_kernel_LT_1x4.S CTRSMKERNEL_RT = ztrsm_kernel_RT_1x4.S ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4.S ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4.S OpenBLAS-0.2.20/kernel/sparc/Makefile000066400000000000000000000000121313527062700171650ustar00rootroot00000000000000clean :: OpenBLAS-0.2.20/kernel/sparc/amax.S000066400000000000000000000166471313527062700166240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N %i0 #define X %i1 #define INCX %i2 #define I %i3 #ifdef DOUBLE #define c1 %f0 #define c2 %f2 #define c3 %f4 #define c4 %f6 #define t1 %f8 #define t2 %f10 #define t3 %f12 #define t4 %f14 #define a1 %f16 #define a2 %f18 #define a3 %f20 #define a4 %f22 #define a5 %f24 #define a6 %f26 #define a7 %f28 #define a8 %f30 #else #define c1 %f0 #define c2 %f1 #define c3 %f2 #define c4 %f3 #define t1 %f4 #define t2 %f5 #define t3 %f6 #define t4 %f7 #define a1 %f8 #define a2 %f9 #define a3 %f10 #define a4 %f11 #define a5 %f12 #define a6 %f13 #define a7 %f14 #define a8 %f15 #endif #ifndef USE_MIN #define FCMOV FMOVG #else #define FCMOV FMOVL #endif PROLOGUE SAVESP FCLR(0) cmp N, 0 ble .LL20 nop cmp INCX, 0 ble .LL20 sll INCX, BASE_SHIFT, INCX add N, -1, N LDF [X], c4 add X, INCX, X cmp N, 0 ble .LL20 FABS c4, c1 FABS c4, c2 FABS c4, c3 FABS c4, c4 cmp INCX, SIZE bne .LL50 nop sra N, 3, I cmp I, 0 ble,pn %icc, .LL15 nop LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 LDF [X + 2 * SIZE], a3 LDF [X + 3 * SIZE], a4 LDF [X + 4 * SIZE], a5 add I, -1, I LDF [X + 5 * SIZE], a6 cmp I, 0 LDF [X + 6 * SIZE], a7 LDF [X + 7 * SIZE], a8 ble,pt %icc, .LL12 add X, 8 * SIZE, X #define PREFETCHSIZE 40 .LL11: prefetch [X + PREFETCHSIZE * SIZE], 0 FABS a1, t1 LDF [X + 0 * SIZE], a1 FABS a2, t2 LDF [X + 1 * SIZE], a2 FABS a3, t3 LDF [X + 2 * SIZE], a3 FABS a4, t4 LDF [X + 3 * SIZE], a4 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FCMOV %fcc0, t1, c1 FCMOV %fcc1, t2, c2 FCMOV %fcc2, t3, c3 FCMOV %fcc3, t4, c4 FABS a5, t1 LDF [X + 4 * SIZE], a5 FABS a6, t2 LDF [X + 5 * SIZE], a6 FABS a7, t3 LDF [X + 6 * SIZE], a7 FABS a8, t4 LDF [X + 7 * SIZE], a8 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FCMOV %fcc0, t1, c1 
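/* Branchless running extremum: each FCMP sets one of the %fcc condition-code
   fields and the matching FCMOV (FMOVG here, or FMOVL when USE_MIN is defined,
   per the macro at the top of this file) conditionally replaces the
   accumulator, so four partial results c1..c4 are carried through the unrolled
   loop without any branches. */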
add I, -1, I FCMOV %fcc1, t2, c2 cmp I, 0 FCMOV %fcc2, t3, c3 FCMOV %fcc3, t4, c4 bg,pt %icc, .LL11 add X, 8 * SIZE, X .LL12: FABS a1, t1 FABS a2, t2 FABS a3, t3 FABS a4, t4 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FCMOV %fcc0, t1, c1 FCMOV %fcc1, t2, c2 FCMOV %fcc2, t3, c3 FCMOV %fcc3, t4, c4 FABS a5, t1 FABS a6, t2 FABS a7, t3 FABS a8, t4 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FCMOV %fcc0, t1, c1 FCMOV %fcc1, t2, c2 FCMOV %fcc2, t3, c3 FCMOV %fcc3, t4, c4 .LL15: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: LDF [X + 0 * SIZE], a1 FABS a1, t1 FCMP %fcc0, t1, c1 FCMOV %fcc0, t1, c1 add I, -1, I cmp I, 0 bg,pt %icc, .LL16 add X, 1 * SIZE, X .LL19: FCMP %fcc0, c2, c1 FCMP %fcc1, c4, c3 FCMOV %fcc0, c2, c1 FCMOV %fcc1, c4, c3 FCMP %fcc0, c3, c1 FCMOV %fcc0, c3, c1 .LL20: return %i7 + 8 clr %g0 .LL50: sra N, 3, I cmp I, 0 ble,pn %icc, .LL55 nop LDF [X + 0 * SIZE], a1 add X, INCX, X LDF [X + 0 * SIZE], a2 add X, INCX, X LDF [X + 0 * SIZE], a3 add X, INCX, X LDF [X + 0 * SIZE], a4 add X, INCX, X LDF [X + 0 * SIZE], a5 add X, INCX, X LDF [X + 0 * SIZE], a6 add X, INCX, X add I, -1, I LDF [X + 0 * SIZE], a7 cmp I, 0 add X, INCX, X LDF [X + 0 * SIZE], a8 ble,pt %icc, .LL52 add X, INCX, X .LL51: FABS a1, t1 LDF [X + 0 * SIZE], a1 add X, INCX, X FABS a2, t2 LDF [X + 0 * SIZE], a2 add X, INCX, X FABS a3, t3 LDF [X + 0 * SIZE], a3 add X, INCX, X FABS a4, t4 LDF [X + 0 * SIZE], a4 add X, INCX, X FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FCMOV %fcc0, t1, c1 FCMOV %fcc1, t2, c2 FCMOV %fcc2, t3, c3 FCMOV %fcc3, t4, c4 FABS a5, t1 LDF [X + 0 * SIZE], a5 add X, INCX, X FABS a6, t2 LDF [X + 0 * SIZE], a6 add X, INCX, X FABS a7, t3 LDF [X + 0 * SIZE], a7 add X, INCX, X FABS a8, t4 LDF [X + 0 * SIZE], a8 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FCMOV %fcc0, t1, c1 add I, -1, I FCMOV %fcc1, t2, c2 cmp I, 0 FCMOV %fcc2, t3, c3 FCMOV %fcc3, t4, c4 bg,pt %icc, .LL51 add X, INCX, X .LL52: FABS a1, t1 FABS a2, t2 FABS a3, t3 FABS a4, t4 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FCMOV %fcc0, t1, c1 FCMOV %fcc1, t2, c2 FCMOV %fcc2, t3, c3 FCMOV %fcc3, t4, c4 FABS a5, t1 FABS a6, t2 FABS a7, t3 FABS a8, t4 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FCMOV %fcc0, t1, c1 FCMOV %fcc1, t2, c2 FCMOV %fcc2, t3, c3 FCMOV %fcc3, t4, c4 .LL55: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL59 nop .LL56: LDF [X + 0 * SIZE], a1 FABS a1, t1 FCMP %fcc0, t1, c1 FCMOV %fcc0, t1, c1 add I, -1, I cmp I, 0 bg,pt %icc, .LL56 add X, INCX, X .LL59: FCMP %fcc0, c2, c1 FCMP %fcc1, c4, c3 FCMOV %fcc0, c2, c1 FCMOV %fcc1, c4, c3 FCMP %fcc0, c3, c1 FCMOV %fcc0, c3, c1 return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/asum.S000066400000000000000000000145361313527062700166360ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N %i0 #define X %i1 #define INCX %i2 #define I %i3 #ifdef DOUBLE #define c1 %f0 #define c2 %f2 #define t1 %f8 #define t2 %f10 #define t3 %f12 #define t4 %f14 #define a1 %f16 #define a2 %f18 #define a3 %f20 #define a4 %f22 #define a5 %f24 #define a6 %f26 #define a7 %f28 #define a8 %f30 #else #define c1 %f0 #define c2 %f1 #define t1 %f4 #define t2 %f5 #define t3 %f6 #define t4 %f7 #define a1 %f8 #define a2 %f9 #define a3 %f10 #define a4 %f11 #define a5 %f12 #define a6 %f13 #define a7 %f14 #define a8 %f15 #endif PROLOGUE SAVESP FCLR(0) sll INCX, BASE_SHIFT, INCX FMOV c1, c2 FMOV c1, t1 FMOV c1, t2 FMOV c1, t3 FMOV c1, t4 cmp INCX, 0 ble .LL19 cmp INCX, SIZE bne .LL50 sra N, 3, I cmp I, 0 ble,pn %icc, .LL15 nop LDF [X + 0 * SIZE], a1 add I, -1, I LDF [X + 1 * SIZE], a2 cmp I, 0 LDF [X + 2 * SIZE], a3 LDF [X + 3 * SIZE], a4 LDF [X + 4 * SIZE], a5 LDF [X + 5 * SIZE], a6 LDF [X + 6 * SIZE], a7 LDF [X + 7 * SIZE], a8 ble,pt %icc, .LL12 add X, 8 * SIZE, X #define PREFETCHSIZE 128 .LL11: FADD c1, t1, c1 prefetch [X + PREFETCHSIZE * SIZE], 0 FABS a1, t1 LDF [X + 0 * SIZE], a1 FADD c2, t2, c2 add I, -1, I FABS a2, t2 LDF [X + 1 * SIZE], a2 FADD c1, t3, c1 cmp I, 0 FABS a3, t3 LDF [X + 2 * SIZE], a3 FADD c2, t4, c2 nop FABS a4, t4 LDF [X + 3 * SIZE], a4 FADD c1, t1, c1 nop FABS a5, t1 LDF [X + 4 * SIZE], a5 FADD c2, t2, c2 nop FABS a6, t2 LDF [X + 5 * SIZE], a6 FADD c1, t3, c1 FABS a7, t3 LDF [X + 6 * SIZE], a7 add X, 8 * SIZE, X FADD c2, t4, c2 FABS a8, t4 bg,pt %icc, .LL11 LDF [X - 1 * SIZE], a8 .LL12: FADD c1, t1, c1 FABS a1, t1 FADD c2, t2, c2 FABS a2, t2 FADD c1, t3, c1 FABS a3, t3 FADD c2, t4, c2 FABS a4, t4 FADD c1, t1, c1 FABS a5, t1 FADD c2, t2, c2 FABS a6, t2 FADD c1, t3, c1 FABS a7, t3 FADD c2, t4, c2 FABS a8, t4 .LL15: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: LDF [X + 0 * SIZE], a1 add I, -1, I cmp I, 0 FADD c1, t1, c1 FABS a1, t1 bg,pt %icc, .LL16 add X, 1 * SIZE, X .LL19: FADD c1, t1, c1 FADD c2, t2, c2 FADD c1, t3, c1 FADD c2, t4, c2 FADD c1, c2, c1 return %i7 + 8 clr %g0 .LL50: sra N, 3, I cmp I, 0 ble,pn %icc, .LL55 nop LDF [X + 0 * SIZE], a1 add 
X, INCX, X LDF [X + 0 * SIZE], a2 add X, INCX, X LDF [X + 0 * SIZE], a3 add X, INCX, X LDF [X + 0 * SIZE], a4 add X, INCX, X LDF [X + 0 * SIZE], a5 add X, INCX, X LDF [X + 0 * SIZE], a6 add X, INCX, X add I, -1, I LDF [X + 0 * SIZE], a7 cmp I, 0 add X, INCX, X LDF [X + 0 * SIZE], a8 ble,pt %icc, .LL52 add X, INCX, X .LL51: FADD c1, t1, c1 add I, -1, I FABS a1, t1 LDF [X + 0 * SIZE], a1 add X, INCX, X FADD c2, t2, c2 cmp I, 0 FABS a2, t2 LDF [X + 0 * SIZE], a2 add X, INCX, X FADD c1, t3, c1 FABS a3, t3 LDF [X + 0 * SIZE], a3 add X, INCX, X FADD c2, t4, c2 FABS a4, t4 LDF [X + 0 * SIZE], a4 add X, INCX, X FADD c1, t1, c1 FABS a5, t1 LDF [X + 0 * SIZE], a5 add X, INCX, X FADD c2, t2, c2 FABS a6, t2 LDF [X + 0 * SIZE], a6 add X, INCX, X FADD c1, t3, c1 FABS a7, t3 LDF [X + 0 * SIZE], a7 add X, INCX, X FADD c2, t4, c2 FABS a8, t4 LDF [X + 0 * SIZE], a8 bg,pt %icc, .LL51 add X, INCX, X .LL52: FADD c1, t1, c1 FABS a1, t1 FADD c2, t2, c2 FABS a2, t2 FADD c1, t3, c1 FABS a3, t3 FADD c2, t4, c2 FABS a4, t4 FADD c1, t1, c1 FABS a5, t1 FADD c2, t2, c2 FABS a6, t2 FADD c1, t3, c1 FABS a7, t3 FADD c2, t4, c2 FABS a8, t4 .LL55: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL59 nop .LL56: LDF [X + 0 * SIZE], a1 FADD c1, t1, c1 add I, -1, I FABS a1, t1 cmp I, 0 bg,pt %icc, .LL56 add X, INCX, X .LL59: FADD c1, t1, c1 FADD c2, t2, c2 FADD c1, t3, c1 FADD c2, t4, c2 FADD c1, c2, c1 return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/axpy.S000066400000000000000000000236211313527062700166450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if defined(DOUBLE) && !defined(__64BIT__) #define N %i0 #define X %i5 #define INCX %i1 #define Y %i2 #define INCY %i3 #define I %i4 #else #define N %i0 #define X %i4 #define INCX %i5 #define Y %i1 #define INCY %i2 #define I %i3 #endif #define YY %l1 #ifdef DOUBLE #define a1 %f0 #define a2 %f2 #define a3 %f4 #define a4 %f6 #define a5 %f8 #define a6 %f10 #define a7 %f12 #define a8 %f14 #define b1 %f16 #define b2 %f18 #define b3 %f20 #define b4 %f22 #define b5 %f24 #define b6 %f26 #define b7 %f28 #define b8 %f30 #define t1 %f32 #define t2 %f34 #define t3 %f36 #define t4 %f38 #define c1 %f40 #define c2 %f42 #define c3 %f44 #define c4 %f46 #define c5 %f48 #define c6 %f50 #define c7 %f52 #define c8 %f54 #define ALPHA %f62 #else #define a1 %f0 #define a2 %f1 #define a3 %f2 #define a4 %f3 #define a5 %f4 #define a6 %f5 #define a7 %f6 #define a8 %f7 #define b1 %f8 #define b2 %f9 #define b3 %f10 #define b4 %f11 #define b5 %f12 #define b6 %f13 #define b7 %f14 #define b8 %f15 #define t1 %f16 #define t2 %f17 #define t3 %f18 #define t4 %f19 #define c1 %f20 #define c2 %f21 #define c3 %f22 #define c4 %f23 #define c5 %f24 #define c6 %f25 #define c7 %f26 #define c8 %f27 #define ALPHA %f31 #endif PROLOGUE SAVESP #ifndef __64BIT__ #ifdef DOUBLE st %i3, [%sp + STACK_START + 16] st %i4, [%sp + STACK_START + 20] ld [%sp + STACK_START + 28], INCX ld [%sp + STACK_START + 32], Y ld [%sp + STACK_START + 36], INCY #else st %i3, [%sp + STACK_START + 16] ld [%sp + STACK_START + 28], Y ld [%sp + STACK_START + 32], INCY #endif LDF [%sp + STACK_START + 16], ALPHA #else ldx [%sp + STACK_START + 56], Y ldx [%sp + STACK_START + 64], INCY #ifdef DOUBLE FMOV %f6, ALPHA #else FMOV %f7, ALPHA #endif #endif sll INCX, BASE_SHIFT, INCX sll INCY, BASE_SHIFT, INCY cmp INCX, SIZE bne .LL50 nop cmp INCY, SIZE bne .LL50 nop sra N, 3, I cmp I, 0 ble,pn %icc, .LL15 nop LDF [X + 0 * SIZE], a1 LDF [Y + 0 * SIZE], b1 LDF [X + 1 * SIZE], a2 LDF [Y + 1 * SIZE], b2 LDF [X + 2 * SIZE], a3 LDF [Y + 2 * SIZE], b3 LDF [X + 3 * SIZE], a4 LDF [Y + 3 * SIZE], b4 LDF [X + 4 * SIZE], a5 LDF [Y + 4 * SIZE], b5 LDF [X + 5 * SIZE], a6 LDF [Y + 5 * SIZE], b6 LDF [X + 6 * SIZE], a7 LDF [Y + 6 * SIZE], b7 LDF [X + 7 * SIZE], a8 LDF [Y + 7 * SIZE], b8 FMUL ALPHA, a1, t1 FMUL ALPHA, a2, t2 FMUL ALPHA, a3, t3 FMUL ALPHA, a4, t4 FADD b1, t1, c1 FMUL ALPHA, a5, t1 FADD b2, t2, c2 FMUL ALPHA, a6, t2 add I, -1, I cmp I, 0 ble,pt %icc, .LL12 nop #ifdef DOUBLE #define PREFETCHSIZE 54 #else #define PREFETCHSIZE 108 #endif .LL11: prefetch [Y + PREFETCHSIZE * SIZE], 0 LDF [X + 8 * SIZE], a1 LDF [X + 9 * SIZE], a2 LDF [X + 10 * SIZE], a3 LDF [X + 11 * SIZE], a4 FADD b3, t3, c3 STF c1, [Y + 0 * SIZE] FMUL ALPHA, a7, t3 FADD b4, t4, c4 STF c2, [Y + 1 * SIZE] FMUL ALPHA, a8, t4 LDF [Y + 8 * SIZE], b1 LDF [Y + 9 * SIZE], b2 LDF [Y + 10 * SIZE], b3 LDF [Y + 11 * SIZE], b4 FADD b5, t1, c5 STF c3, [Y + 2 * SIZE] FMUL ALPHA, a1, t1 FADD b6, t2, c6 STF c4, [Y + 3 * SIZE] FMUL ALPHA, a2, t2 prefetch [X + PREFETCHSIZE * SIZE], 0 LDF [X + 12 * SIZE], a5 LDF [X + 13 * SIZE], a6 LDF [X + 14 * SIZE], a7 LDF [X + 15 * SIZE], a8 FADD b7, t3, c7 STF c5, [Y + 4 * SIZE] FMUL ALPHA, a3, t3 FADD b8, t4, c8 STF c6, [Y + 5 * SIZE] FMUL ALPHA, a4, t4 LDF [Y + 12 * SIZE], b5 LDF [Y + 13 * SIZE], b6 LDF [Y + 14 * SIZE], b7 LDF [Y + 15 * SIZE], b8 FADD b1, t1, c1 STF c7, [Y + 6 * SIZE] FMUL ALPHA, a5, t1 deccc I FADD b2, t2, c2 STF c8, [Y + 7 * SIZE] FMUL ALPHA, a6, t2 add Y, 8 * SIZE, Y 
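/* The "add X, 8 * SIZE, X" that follows the bg,pt below sits in the SPARC
   branch delay slot, so it still executes on every iteration before control
   returns to .LL11. */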
bg,pt %icc, .LL11 add X, 8 * SIZE, X .LL12: FADD b3, t3, c3 FMUL ALPHA, a7, t3 FADD b4, t4, c4 FMUL ALPHA, a8, t4 FADD b5, t1, c5 FADD b6, t2, c6 FADD b7, t3, c7 FADD b8, t4, c8 STF c1, [Y + 0 * SIZE] STF c2, [Y + 1 * SIZE] STF c3, [Y + 2 * SIZE] STF c4, [Y + 3 * SIZE] STF c5, [Y + 4 * SIZE] STF c6, [Y + 5 * SIZE] STF c7, [Y + 6 * SIZE] STF c8, [Y + 7 * SIZE] add Y, 8 * SIZE, Y add X, 8 * SIZE, X .LL15: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: LDF [X + 0 * SIZE], a1 LDF [Y + 0 * SIZE], b1 FMUL ALPHA, a1, t1 FADD b1, t1, c1 add I, -1, I cmp I, 0 STF c1, [Y + 0 * SIZE] add Y, 1 * SIZE, Y bg,pt %icc, .LL16 add X, 1 * SIZE, X .LL19: return %i7 + 8 clr %g0 .LL50: sra N, 3, I cmp I, 0 ble,pn %icc, .LL55 mov Y, YY LDF [X + 0 * SIZE], a1 add I, -1, I add X, INCX, X LDF [Y + 0 * SIZE], b1 cmp I, 0 add Y, INCY, Y LDF [X + 0 * SIZE], a2 add X, INCX, X LDF [Y + 0 * SIZE], b2 add Y, INCY, Y LDF [X + 0 * SIZE], a3 add X, INCX, X LDF [Y + 0 * SIZE], b3 add Y, INCY, Y LDF [X + 0 * SIZE], a4 add X, INCX, X LDF [Y + 0 * SIZE], b4 add Y, INCY, Y LDF [X + 0 * SIZE], a5 add X, INCX, X LDF [Y + 0 * SIZE], b5 add Y, INCY, Y LDF [X + 0 * SIZE], a6 add X, INCX, X LDF [Y + 0 * SIZE], b6 add Y, INCY, Y LDF [X + 0 * SIZE], a7 add X, INCX, X LDF [Y + 0 * SIZE], b7 add Y, INCY, Y LDF [X + 0 * SIZE], a8 add X, INCX, X LDF [Y + 0 * SIZE], b8 ble,pt %icc, .LL52 add Y, INCY, Y .LL51: FMUL ALPHA, a1, t1 LDF [X + 0 * SIZE], a1 add X, INCX, X FMUL ALPHA, a2, t2 LDF [X + 0 * SIZE], a2 add X, INCX, X FMUL ALPHA, a3, t3 LDF [X + 0 * SIZE], a3 add X, INCX, X FMUL ALPHA, a4, t4 LDF [X + 0 * SIZE], a4 add X, INCX, X FADD b1, t1, c1 LDF [Y + 0 * SIZE], b1 add Y, INCY, Y FMUL ALPHA, a5, t1 LDF [X + 0 * SIZE], a5 add X, INCX, X FADD b2, t2, c2 LDF [Y + 0 * SIZE], b2 add Y, INCY, Y FMUL ALPHA, a6, t2 LDF [X + 0 * SIZE], a6 add X, INCX, X FADD b3, t3, c3 LDF [Y + 0 * SIZE], b3 add Y, INCY, Y FMUL ALPHA, a7, t3 LDF [X + 0 * SIZE], a7 add X, INCX, X FADD b4, t4, c4 LDF [Y + 0 * SIZE], b4 add Y, INCY, Y FMUL ALPHA, a8, t4 LDF [X + 0 * SIZE], a8 add X, INCX, X STF c1, [YY + 0 * SIZE] add YY, INCY, YY FADD b5, t1, c1 STF c2, [YY + 0 * SIZE] add YY, INCY, YY FADD b6, t2, c2 STF c3, [YY + 0 * SIZE] add YY, INCY, YY FADD b7, t3, c3 STF c4, [YY + 0 * SIZE] add YY, INCY, YY FADD b8, t4, c4 LDF [Y + 0 * SIZE], b5 add I, -1, I add Y, INCY, Y LDF [Y + 0 * SIZE], b6 cmp I, 0 add Y, INCY, Y LDF [Y + 0 * SIZE], b7 add Y, INCY, Y LDF [Y + 0 * SIZE], b8 add Y, INCY, Y STF c1, [YY + 0 * SIZE] add YY, INCY, YY STF c2, [YY + 0 * SIZE] add YY, INCY, YY STF c3, [YY + 0 * SIZE] add YY, INCY, YY STF c4, [YY + 0 * SIZE] bg,pt %icc, .LL51 add YY, INCY, YY .LL52: FMUL ALPHA, a1, t1 FMUL ALPHA, a2, t2 FMUL ALPHA, a3, t3 FMUL ALPHA, a4, t4 FADD b1, t1, c1 FMUL ALPHA, a5, t1 FADD b2, t2, c2 FMUL ALPHA, a6, t2 FADD b3, t3, c3 FMUL ALPHA, a7, t3 FADD b4, t4, c4 FMUL ALPHA, a8, t4 STF c1, [YY + 0 * SIZE] add YY, INCY, YY FADD b5, t1, c1 STF c2, [YY + 0 * SIZE] add YY, INCY, YY FADD b6, t2, c2 STF c3, [YY + 0 * SIZE] add YY, INCY, YY FADD b7, t3, c3 STF c4, [YY + 0 * SIZE] add YY, INCY, YY FADD b8, t4, c4 STF c1, [YY + 0 * SIZE] add YY, INCY, YY STF c2, [YY + 0 * SIZE] add YY, INCY, YY STF c3, [YY + 0 * SIZE] add YY, INCY, YY STF c4, [YY + 0 * SIZE] add YY, INCY, YY .LL55: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL59 nop .LL56: LDF [X + 0 * SIZE], a1 LDF [Y + 0 * SIZE], b1 FMUL ALPHA, a1, t1 FADD b1, t1, c1 add I, -1, I cmp I, 0 STF c1, [Y + 0 * SIZE] add Y, INCY, Y bg,pt %icc, .LL56 add X, INCX, X .LL59: return %i7 + 8 clr %o0 EPILOGUE 
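/* Plain-C sketch of the operation the unrolled code above implements (variable
   names mirror the register labels; a reference description only, not part of
   the optimized path):

       for (i = 0; i < n; i++)
           y[i * incy] += alpha * x[i * incx];

   The .LL11 loop is the 8-way unrolled unit-stride case; the .LL51 loop
   handles arbitrary strides. */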
OpenBLAS-0.2.20/kernel/sparc/cabs.S000066400000000000000000000055761313527062700166050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" PROLOGUE add %sp, -128, %sp LDF [%o0 + 0 * SIZE], %f0 LDF [%o0 + 1 * SIZE], %f8 FABS %f8, %f8 FABS %f0, %f0 FADD %f0, %f8, %f0 #if !defined(DOUBLE) && defined(F2CCONV) fstod %f0, %f0 #endif retl sub %sp, -128, %sp EPILOGUE OpenBLAS-0.2.20/kernel/sparc/cnrm2.S000066400000000000000000000152231313527062700167040ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N %i0 #define X %i1 #define INCX %i2 #define I %i3 #define c1 %f0 #define c2 %f2 #define c3 %f4 #define c4 %f6 #define t1 %f8 #define t2 %f10 #define t3 %f12 #define t4 %f14 #define a1 %f16 #define a2 %f18 #define a3 %f20 #define a4 %f22 #define a5 %f24 #define a6 %f26 #define a7 %f28 #define a8 %f30 PROLOGUE SAVESP FCLR(0) FMOV c1, c2 FMOV c1, c3 FMOV c1, c4 FMOV c1, t1 FMOV c1, t2 FMOV c1, t3 FMOV c1, t4 cmp INCX, 0 ble .LL20 sll INCX, ZBASE_SHIFT, INCX cmp N, 0 ble .LL20 nop cmp INCX, 2 * SIZE bne .LL50 nop sra N, 2, I cmp I, 0 ble,pn %icc, .LL15 nop ld [X + 0 * SIZE], a1 add I, -1, I ld [X + 1 * SIZE], a2 cmp I, 0 ld [X + 2 * SIZE], a3 ld [X + 3 * SIZE], a4 ld [X + 4 * SIZE], a5 ld [X + 5 * SIZE], a6 ld [X + 6 * SIZE], a7 ld [X + 7 * SIZE], a8 ble,pt %icc, .LL12 add X, 8 * SIZE, X #define PREFETCHSIZE 40 .LL11: faddd c1, t1, c1 fsmuld a1, a1, t1 prefetch [X + PREFETCHSIZE * SIZE], 0 faddd c2, t2, c2 add I, -1, I fsmuld a2, a2, t2 ld [X + 0 * SIZE], a1 faddd c3, t3, c3 cmp I, 0 fsmuld a3, a3, t3 ld [X + 1 * SIZE], a2 faddd c4, t4, c4 fsmuld a4, a4, t4 ld [X + 2 * SIZE], a3 faddd c1, t1, c1 fsmuld a5, a5, t1 ld [X + 3 * SIZE], a4 faddd c2, t2, c2 fsmuld a6, a6, t2 ld [X + 4 * SIZE], a5 faddd c3, t3, c3 fsmuld a7, a7, t3 ld [X + 5 * SIZE], a6 faddd c4, t4, c4 ld [X + 6 * SIZE], a7 fsmuld a8, a8, t4 add X, 8 * SIZE, X bg,pt %icc, .LL11 ld [X - 1 * SIZE], a8 .LL12: faddd c1, t1, c1 fsmuld a1, a1, t1 faddd c2, t2, c2 fsmuld a2, a2, t2 faddd c3, t3, c3 fsmuld a3, a3, t3 faddd c4, t4, c4 fsmuld a4, a4, t4 faddd c1, t1, c1 fsmuld a5, a5, t1 faddd c2, t2, c2 fsmuld a6, a6, t2 faddd c3, t3, c3 fsmuld a7, a7, t3 faddd c4, t4, c4 fsmuld a8, a8, t4 .LL15: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: ld [X + 0 * SIZE], a1 add I, -1, I ld [X + 1 * SIZE], a2 cmp I, 0 faddd c1, t1, c1 faddd c2, t2, c2 fsmuld a1, a1, t1 fsmuld a2, a2, t2 bg,pt %icc, .LL16 add X, 2 * SIZE, X .LL19: faddd c1, t1, c1 faddd c2, t2, c2 faddd c3, t3, c3 faddd c4, t4, c4 faddd c1, c2, c1 faddd c3, c4, c3 faddd c1, c3, c1 fsqrtd c1, c1 #if !defined(NEED_F2CCONV) || !defined(F_INTERFACE_F2C) fdtos c1, c1 #endif .LL20: return %i7 + 8 clr %g0 .LL50: sra N, 2, I cmp I, 0 ble,pn %icc, .LL55 nop ld [X + 0 * SIZE], a1 ld [X + 1 * SIZE], a2 add X, INCX, X ld [X + 0 * SIZE], a3 ld [X + 1 * SIZE], a4 add X, INCX, X ld [X + 0 * SIZE], a5 ld [X + 1 * SIZE], a6 add X, INCX, X add I, -1, I ld [X + 0 * SIZE], a7 cmp I, 0 ld [X + 1 * SIZE], a8 ble,pt %icc, .LL52 add X, INCX, X .LL51: faddd c1, t1, c1 add I, -1, I fsmuld a1, a1, t1 ld [X + 0 * SIZE], a1 faddd c2, t2, c2 cmp I, 0 fsmuld a2, a2, t2 
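/* fsmuld multiplies two single-precision operands into a double-precision
   product, so the sum of squares is accumulated entirely in double precision;
   that is presumably why this single-precision complex nrm2 gets away without
   the scale-by-maximum pass used by the double-precision dnrm2 kernel. */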
ld [X + 1 * SIZE], a2 add X, INCX, X faddd c3, t3, c3 fsmuld a3, a3, t3 ld [X + 0 * SIZE], a3 faddd c4, t4, c4 fsmuld a4, a4, t4 ld [X + 1 * SIZE], a4 add X, INCX, X faddd c1, t1, c1 fsmuld a5, a5, t1 ld [X + 0 * SIZE], a5 faddd c2, t2, c2 fsmuld a6, a6, t2 ld [X + 1 * SIZE], a6 add X, INCX, X faddd c3, t3, c3 fsmuld a7, a7, t3 ld [X + 0 * SIZE], a7 faddd c4, t4, c4 fsmuld a8, a8, t4 ld [X + 1 * SIZE], a8 bg,pt %icc, .LL51 add X, INCX, X .LL52: faddd c1, t1, c1 fsmuld a1, a1, t1 faddd c2, t2, c2 fsmuld a2, a2, t2 faddd c3, t3, c3 fsmuld a3, a3, t3 faddd c4, t4, c4 fsmuld a4, a4, t4 faddd c1, t1, c1 fsmuld a5, a5, t1 faddd c2, t2, c2 fsmuld a6, a6, t2 faddd c3, t3, c3 fsmuld a7, a7, t3 faddd c4, t4, c4 fsmuld a8, a8, t4 .LL55: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL59 nop .LL56: ld [X + 0 * SIZE], a1 add I, -1, I ld [X + 1 * SIZE], a2 cmp I, 0 faddd c1, t1, c1 faddd c2, t2, c2 fsmuld a1, a1, t1 fsmuld a2, a2, t2 bg,pt %icc, .LL56 add X, INCX, X .LL59: faddd c1, t1, c1 faddd c2, t2, c2 faddd c3, t3, c3 faddd c4, t4, c4 faddd c1, c2, c1 faddd c3, c4, c3 faddd c1, c3, c1 fsqrtd c1, c1 #if !defined(NEED_F2CCONV) || !defined(F_INTERFACE_F2C) fdtos c1, c1 #endif return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/copy.S000066400000000000000000000123651313527062700166410ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N %i0 #define X %i1 #define INCX %i2 #define Y %i3 #define INCY %i4 #define I %i5 #ifdef DOUBLE #define a1 %f0 #define a2 %f2 #define a3 %f4 #define a4 %f6 #define a5 %f8 #define a6 %f10 #define a7 %f12 #define a8 %f14 #define a9 %f16 #define a10 %f18 #define a11 %f20 #define a12 %f22 #define a13 %f24 #define a14 %f26 #define a15 %f28 #define a16 %f30 #else #define a1 %f0 #define a2 %f1 #define a3 %f2 #define a4 %f3 #define a5 %f4 #define a6 %f5 #define a7 %f6 #define a8 %f7 #define a9 %f8 #define a10 %f9 #define a11 %f10 #define a12 %f11 #define a13 %f12 #define a14 %f13 #define a15 %f14 #define a16 %f15 #endif PROLOGUE SAVESP sll INCX, BASE_SHIFT, INCX sll INCY, BASE_SHIFT, INCY cmp INCX, SIZE bne .LL50 nop cmp INCY, SIZE bne .LL50 nop sra N, 3, I cmp I, 0 ble,pn %icc, .LL15 nop #define PREFETCHSIZE 32 .LL11: LDF [X + 0 * SIZE], a1 prefetch [X + PREFETCHSIZE * SIZE], 0 LDF [X + 1 * SIZE], a2 LDF [X + 2 * SIZE], a3 LDF [X + 3 * SIZE], a4 LDF [X + 4 * SIZE], a5 LDF [X + 5 * SIZE], a6 LDF [X + 6 * SIZE], a7 LDF [X + 7 * SIZE], a8 STF a1, [Y + 0 * SIZE] prefetch [Y + PREFETCHSIZE * SIZE], 0 STF a2, [Y + 1 * SIZE] STF a3, [Y + 2 * SIZE] STF a4, [Y + 3 * SIZE] STF a5, [Y + 4 * SIZE] STF a6, [Y + 5 * SIZE] STF a7, [Y + 6 * SIZE] STF a8, [Y + 7 * SIZE] add I, -1, I cmp I, 0 add Y, 8 * SIZE, Y add X, 8 * SIZE, X bg,pt %icc, .LL11 nop .LL15: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: LDF [X + 0 * SIZE], a1 add I, -1, I cmp I, 0 add X, 1 * SIZE, X STF a1, [Y + 0 * SIZE] bg,pt %icc, .LL16 add Y, 1 * SIZE, Y .LL19: return %i7 + 8 clr %g0 .LL50: sra N, 3, I cmp I, 0 ble,pn %icc, .LL55 nop .LL51: LDF [X + 0 * SIZE], a1 add X, INCX, X LDF [X + 0 * SIZE], a2 add X, INCX, X LDF [X + 0 * SIZE], a3 add X, INCX, X LDF [X + 0 * SIZE], a4 add X, INCX, X LDF [X + 0 * SIZE], a5 add X, INCX, X LDF [X + 0 * SIZE], a6 add X, INCX, X LDF [X + 0 * SIZE], a7 add X, INCX, X LDF [X + 0 * SIZE], a8 add X, INCX, X STF a1, [Y + 0 * SIZE] add Y, INCY, Y add I, -1, I STF a2, [Y + 0 * SIZE] add Y, INCY, Y cmp I, 0 STF a3, [Y + 0 * SIZE] add Y, INCY, Y STF a4, [Y + 0 * SIZE] add Y, INCY, Y STF a5, [Y + 0 * SIZE] add Y, INCY, Y STF a6, [Y + 0 * SIZE] add Y, INCY, Y STF a7, [Y + 0 * SIZE] add Y, INCY, Y STF a8, [Y + 0 * SIZE] bg,pt %icc, .LL51 add Y, INCY, Y .LL55: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL59 nop .LL56: LDF [X + 0 * SIZE], a1 add I, -1, I cmp I, 0 add X, INCX, X STF a1, [Y + 0 * SIZE] bg,pt %icc, .LL56 add Y, INCY, Y .LL59: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/dnrm2.S000066400000000000000000000273301313527062700167070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N %i0 #define X %i1 #define INCX %i2 #define I %i3 #define XX %i4 #ifdef DOUBLE #define c1 %f0 #define c2 %f2 #define c3 %f4 #define c4 %f6 #define t1 %f8 #define t2 %f10 #define t3 %f12 #define t4 %f14 #define a1 %f16 #define a2 %f18 #define a3 %f20 #define a4 %f22 #define a5 %f24 #define a6 %f26 #define a7 %f28 #define a8 %f30 #define fmax %f32 #define fzero %f34 #define fone %f36 #else #define c1 %f0 #define c2 %f1 #define c3 %f2 #define c4 %f3 #define t1 %f4 #define t2 %f5 #define t3 %f6 #define t4 %f7 #define a1 %f8 #define a2 %f9 #define a3 %f10 #define a4 %f11 #define a5 %f12 #define a6 %f13 #define a7 %f14 #define a8 %f15 #define fmax %f16 #define fzero %f17 #define fone %f18 #endif PROLOGUE SAVESP #ifdef DOUBLE FCLR(3) #else FCLR(17) #endif mov X, XX mov 0x3ff, %g1 sll %g1, 20, %g1 cmp N, 0 ble .LL99 FMOV fzero, c1 cmp INCX, 0 ble .LL99 sll INCX, BASE_SHIFT, INCX add %sp, -8, %sp st %g1, [%sp + STACK_START + 0] st %g0, [%sp + STACK_START + 4] add N, -1, N LDF [X], c4 add X, INCX, X LDF [%sp + STACK_START], fone add %sp, 8, %sp FABS c4, c1 FABS c4, c2 FABS c4, c3 FABS c4, c4 cmp INCX, SIZE bne .LL100 nop sra N, 3, I cmp I, 0 ble,pn %icc, .LL15 nop LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 LDF [X + 2 * SIZE], a3 LDF [X + 3 * SIZE], a4 LDF [X + 4 * SIZE], a5 add I, -1, I LDF [X + 5 * SIZE], a6 cmp I, 0 LDF [X + 6 * SIZE], a7 LDF [X + 7 * SIZE], a8 ble,pt %icc, .LL12 add X, 8 * SIZE, X #define PREFETCHSIZE 40 .LL11: FABS a1, t1 prefetch [X + PREFETCHSIZE * SIZE], 0 FABS a2, t2 LDF [X + 0 * SIZE], a1 FABS a3, t3 LDF [X + 1 * SIZE], a2 FABS a4, t4 LDF [X + 2 * SIZE], a3 FCMP %fcc0, t1, c1 LDF [X + 3 * SIZE], a4 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FMOVG %fcc0, t1, c1 FMOVG %fcc1, t2, c2 FMOVG %fcc2, t3, c3 FMOVG %fcc3, t4, c4 FABS a5, t1 LDF [X + 4 * SIZE], a5 FABS a6, t2 LDF [X + 5 * SIZE], a6 FABS a7, t3 LDF [X + 6 * SIZE], a7 FABS a8, t4 LDF [X + 7 * SIZE], a8 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FMOVG %fcc0, t1, c1 add I, -1, I FMOVG %fcc1, t2, c2 cmp I, 0 FMOVG %fcc2, t3, c3 FMOVG %fcc3, t4, c4 bg,pt %icc, .LL11 add X, 8 * SIZE, X .LL12: FABS a1, t1 FABS a2, t2 FABS a3, t3 FABS a4, t4 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FMOVG %fcc0, t1, c1 FMOVG %fcc1, t2, c2 FMOVG 
%fcc2, t3, c3 FMOVG %fcc3, t4, c4 FABS a5, t1 FABS a6, t2 FABS a7, t3 FABS a8, t4 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FMOVG %fcc0, t1, c1 FMOVG %fcc1, t2, c2 FMOVG %fcc2, t3, c3 FMOVG %fcc3, t4, c4 .LL15: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: LDF [X + 0 * SIZE], a1 FABS a1, t1 FCMP %fcc0, t1, c1 FMOVG %fcc0, t1, c1 add I, -1, I cmp I, 0 bg,pt %icc, .LL16 add X, 1 * SIZE, X .LL19: FCMP %fcc0, c2, c1 FCMP %fcc1, c4, c3 mov XX, X FMOVG %fcc0, c2, c1 FMOVG %fcc1, c4, c3 FCMP %fcc0, c3, c1 FMOVG %fcc0, c3, c1 FCMP c1, fzero fbe .LL99 nop FMOV c1, fmax add N, 1, N FDIV fone, c1, fone FMOV fzero, c1 FMOV fzero, c2 FMOV fzero, c3 FMOV fzero, c4 sra N, 3, I cmp I, 0 ble,pn %icc, .LL35 nop LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 LDF [X + 2 * SIZE], a3 LDF [X + 3 * SIZE], a4 LDF [X + 4 * SIZE], a5 add I, -1, I LDF [X + 5 * SIZE], a6 cmp I, 0 LDF [X + 6 * SIZE], a7 LDF [X + 7 * SIZE], a8 ble,pt %icc, .LL32 add X, 8 * SIZE, X .LL31: FMUL fone, a1, t1 prefetch [X + PREFETCHSIZE * SIZE], 0 FMUL fone, a2, t2 LDF [X + 0 * SIZE], a1 FMUL fone, a3, t3 LDF [X + 1 * SIZE], a2 FMUL fone, a4, t4 LDF [X + 2 * SIZE], a3 FMUL t1, t1, t1 LDF [X + 3 * SIZE], a4 FMUL t2, t2, t2 FMUL t3, t3, t3 FMUL t4, t4, t4 FADD c1, t1, c1 FMUL fone, a5, t1 LDF [X + 4 * SIZE], a5 FADD c2, t2, c2 FMUL fone, a6, t2 LDF [X + 5 * SIZE], a6 FADD c3, t3, c3 FMUL fone, a7, t3 LDF [X + 6 * SIZE], a7 FADD c4, t4, c4 FMUL fone, a8, t4 LDF [X + 7 * SIZE], a8 FMUL t1, t1, t1 FMUL t2, t2, t2 FMUL t3, t3, t3 FMUL t4, t4, t4 FADD c1, t1, c1 add I, -1, I FADD c2, t2, c2 cmp I, 0 FADD c3, t3, c3 FADD c4, t4, c4 bg,pt %icc, .LL31 add X, 8 * SIZE, X .LL32: FMUL fone, a1, t1 FMUL fone, a2, t2 FMUL fone, a3, t3 FMUL fone, a4, t4 FMUL t1, t1, t1 FMUL t2, t2, t2 FMUL t3, t3, t3 FMUL t4, t4, t4 FADD c1, t1, c1 FMUL fone, a5, t1 FADD c2, t2, c2 FMUL fone, a6, t2 FADD c3, t3, c3 FMUL fone, a7, t3 FADD c4, t4, c4 FMUL fone, a8, t4 FMUL t1, t1, t1 FMUL t2, t2, t2 FMUL t3, t3, t3 FMUL t4, t4, t4 FADD c1, t1, c1 FADD c2, t2, c2 FADD c3, t3, c3 FADD c4, t4, c4 .LL35: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL39 nop .LL36: LDF [X + 0 * SIZE], a1 FMUL fone, a1, t1 FMUL t1, t1, t1 FADD c1, t1, c1 add I, -1, I cmp I, 0 bg,pt %icc, .LL36 add X, 1 * SIZE, X .LL39: FADD c1, c2, c1 FADD c3, c4, c3 FADD c1, c3, c1 FSQRT c1, c1 FMUL fmax, c1, c1 .LL99: return %i7 + 8 clr %g0 .LL100: sra N, 3, I cmp I, 0 ble,pn %icc, .LL105 nop LDF [X + 0 * SIZE], a1 add X, INCX, X LDF [X + 0 * SIZE], a2 add X, INCX, X LDF [X + 0 * SIZE], a3 add X, INCX, X LDF [X + 0 * SIZE], a4 add X, INCX, X LDF [X + 0 * SIZE], a5 add X, INCX, X LDF [X + 0 * SIZE], a6 add X, INCX, X add I, -1, I LDF [X + 0 * SIZE], a7 cmp I, 0 add X, INCX, X LDF [X + 0 * SIZE], a8 ble,pt %icc, .LL102 add X, INCX, X .LL101: FABS a1, t1 LDF [X + 0 * SIZE], a1 add X, INCX, X FABS a2, t2 LDF [X + 0 * SIZE], a2 add X, INCX, X FABS a3, t3 LDF [X + 0 * SIZE], a3 add X, INCX, X FABS a4, t4 LDF [X + 0 * SIZE], a4 add X, INCX, X FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FMOVG %fcc0, t1, c1 FMOVG %fcc1, t2, c2 FMOVG %fcc2, t3, c3 FMOVG %fcc3, t4, c4 FABS a5, t1 LDF [X + 0 * SIZE], a5 add X, INCX, X FABS a6, t2 LDF [X + 0 * SIZE], a6 add X, INCX, X FABS a7, t3 LDF [X + 0 * SIZE], a7 add X, INCX, X FABS a8, t4 LDF [X + 0 * SIZE], a8 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FMOVG %fcc0, t1, c1 add I, -1, I FMOVG %fcc1, t2, c2 cmp I, 0 FMOVG %fcc2, t3, c3 FMOVG %fcc3, t4, c4 bg,pt %icc, .LL101 add X, INCX, X 
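/* Same two-pass scheme as the unit-stride path above: the first pass keeps the
   largest |x[i]| (fmax), fone is then set to 1/fmax, and the second pass
   accumulates sum((x[i] * fone)^2); the final FSQRT/FMUL pair returns
   fmax * sqrt(sum), which keeps the intermediate squares from overflowing or
   underflowing. */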
.LL102: FABS a1, t1 FABS a2, t2 FABS a3, t3 FABS a4, t4 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FMOVG %fcc0, t1, c1 FMOVG %fcc1, t2, c2 FMOVG %fcc2, t3, c3 FMOVG %fcc3, t4, c4 FABS a5, t1 FABS a6, t2 FABS a7, t3 FABS a8, t4 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FMOVG %fcc0, t1, c1 FMOVG %fcc1, t2, c2 FMOVG %fcc2, t3, c3 FMOVG %fcc3, t4, c4 .LL105: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL109 nop .LL106: LDF [X + 0 * SIZE], a1 FABS a1, t1 FCMP %fcc0, t1, c1 FMOVG %fcc0, t1, c1 add I, -1, I cmp I, 0 bg,pt %icc, .LL106 add X, INCX, X .LL109: FCMP %fcc0, c2, c1 FCMP %fcc1, c4, c3 mov XX, X FMOVG %fcc0, c2, c1 FMOVG %fcc1, c4, c3 FCMP %fcc0, c3, c1 FMOVG %fcc0, c3, c1 FCMP c1, fzero fbe .LL99 nop FMOV c1, fmax FDIV fone, c1, fone FMOV fzero, c1 FMOV fzero, c2 FMOV fzero, c3 FMOV fzero, c4 add N, 1, N sra N, 3, I cmp I, 0 ble,pn %icc, .LL135 nop LDF [X + 0 * SIZE], a1 add X, INCX, X LDF [X + 0 * SIZE], a2 add X, INCX, X LDF [X + 0 * SIZE], a3 add X, INCX, X LDF [X + 0 * SIZE], a4 add X, INCX, X LDF [X + 0 * SIZE], a5 add X, INCX, X add I, -1, I LDF [X + 0 * SIZE], a6 add X, INCX, X cmp I, 0 LDF [X + 0 * SIZE], a7 add X, INCX, X LDF [X + 0 * SIZE], a8 ble,pt %icc, .LL132 add X, INCX, X .LL131: FMUL fone, a1, t1 prefetch [X + PREFETCHSIZE * SIZE], 0 FMUL fone, a2, t2 LDF [X + 0 * SIZE], a1 add X, INCX, X FMUL fone, a3, t3 LDF [X + 0 * SIZE], a2 add X, INCX, X FMUL fone, a4, t4 LDF [X + 0 * SIZE], a3 add X, INCX, X FMUL t1, t1, t1 LDF [X + 0 * SIZE], a4 add X, INCX, X FMUL t2, t2, t2 FMUL t3, t3, t3 FMUL t4, t4, t4 FADD c1, t1, c1 FMUL fone, a5, t1 LDF [X + 0 * SIZE], a5 add X, INCX, X FADD c2, t2, c2 FMUL fone, a6, t2 LDF [X + 0 * SIZE], a6 add X, INCX, X FADD c3, t3, c3 FMUL fone, a7, t3 LDF [X + 0 * SIZE], a7 add X, INCX, X FADD c4, t4, c4 FMUL fone, a8, t4 LDF [X + 0 * SIZE], a8 FMUL t1, t1, t1 FMUL t2, t2, t2 FMUL t3, t3, t3 FMUL t4, t4, t4 FADD c1, t1, c1 add I, -1, I FADD c2, t2, c2 cmp I, 0 FADD c3, t3, c3 FADD c4, t4, c4 bg,pt %icc, .LL131 add X, INCX, X .LL132: FMUL fone, a1, t1 FMUL fone, a2, t2 FMUL fone, a3, t3 FMUL fone, a4, t4 FMUL t1, t1, t1 FMUL t2, t2, t2 FMUL t3, t3, t3 FMUL t4, t4, t4 FADD c1, t1, c1 FMUL fone, a5, t1 FADD c2, t2, c2 FMUL fone, a6, t2 FADD c3, t3, c3 FMUL fone, a7, t3 FADD c4, t4, c4 FMUL fone, a8, t4 FMUL t1, t1, t1 FMUL t2, t2, t2 FMUL t3, t3, t3 FMUL t4, t4, t4 FADD c1, t1, c1 FADD c2, t2, c2 FADD c3, t3, c3 FADD c4, t4, c4 .LL135: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL139 nop .LL136: LDF [X + 0 * SIZE], a1 FMUL fone, a1, t1 FMUL t1, t1, t1 FADD c1, t1, c1 add I, -1, I cmp I, 0 bg,pt %icc, .LL136 add X, INCX, X .LL139: FADD c1, c2, c1 FADD c3, c4, c3 FADD c1, c3, c1 FSQRT c1, c1 FMUL fmax, c1, c1 return %i7 + 8 clr %g0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/dot.S000066400000000000000000000176421313527062700164600ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N %i0 #define X %i1 #define INCX %i2 #define Y %i3 #define INCY %i4 #define I %i5 #ifdef DOUBLE #define c1 %f0 #define c2 %f2 #define t1 %f4 #define t2 %f6 #define a1 %f16 #define a2 %f18 #define a3 %f20 #define a4 %f22 #define a5 %f24 #define a6 %f26 #define a7 %f28 #define a8 %f30 #define b1 %f32 #define b2 %f34 #define b3 %f36 #define b4 %f38 #define b5 %f40 #define b6 %f42 #define b7 %f44 #define b8 %f46 #else #define c1 %f0 #define c2 %f1 #define t1 %f4 #define t2 %f5 #define a1 %f8 #define a2 %f9 #define a3 %f10 #define a4 %f11 #define a5 %f12 #define a6 %f13 #define a7 %f14 #define a8 %f15 #define b1 %f16 #define b2 %f17 #define b3 %f18 #define b4 %f19 #define b5 %f20 #define b6 %f21 #define b7 %f22 #define b8 %f23 #endif PROLOGUE SAVESP #ifdef DOUBLE FCLR(0) FCLR(2) FCLR(4) FCLR(6) #else FCLR(0) FCLR(1) FCLR(4) FCLR(5) #endif cmp N, 0 ble .LL19 nop sll INCX, BASE_SHIFT, INCX sll INCY, BASE_SHIFT, INCY cmp INCX, SIZE bne .LL50 nop cmp INCY, SIZE bne .LL50 nop sra N, 3, I cmp I, 0 ble,pn %icc, .LL15 nop LDF [X + 0 * SIZE], a1 LDF [Y + 0 * SIZE], b1 LDF [X + 1 * SIZE], a2 LDF [Y + 1 * SIZE], b2 LDF [X + 2 * SIZE], a3 LDF [Y + 2 * SIZE], b3 LDF [X + 3 * SIZE], a4 LDF [Y + 3 * SIZE], b4 LDF [X + 4 * SIZE], a5 LDF [Y + 4 * SIZE], b5 LDF [X + 5 * SIZE], a6 LDF [Y + 5 * SIZE], b6 LDF [X + 6 * SIZE], a7 add I, -1, I LDF [Y + 6 * SIZE], b7 cmp I, 0 LDF [X + 7 * SIZE], a8 add X, 8 * SIZE, X LDF [Y + 7 * SIZE], b8 add Y, 8 * SIZE, Y ble,pt %icc, .LL12 nop #define PREFETCHSIZE 40 .LL11: prefetch [X + PREFETCHSIZE * SIZE], 0 FADD c1, t1, c1 prefetch [Y + PREFETCHSIZE * SIZE], 0 FMUL a1, b1, t1 LDF [X + 0 * SIZE], a1 FADD c2, t2, c2 FMUL a2, b2, t2 LDF [Y + 0 * SIZE], b1 add I, -1, I LDF [X + 1 * SIZE], a2 FADD c1, t1, c1 FMUL a3, b3, t1 LDF [Y + 1 * SIZE], b2 cmp I, 0 LDF [X + 2 * SIZE], a3 FADD c2, t2, c2 FMUL a4, b4, t2 LDF [Y + 2 * SIZE], b3 LDF [X + 3 * SIZE], a4 FADD c1, t1, c1 FMUL a5, b5, t1 LDF [Y + 3 * SIZE], b4 LDF [X + 4 * SIZE], a5 FADD c2, t2, c2 FMUL a6, b6, t2 LDF [Y + 4 * SIZE], b5 LDF [X + 5 * SIZE], a6 FADD c1, t1, c1 FMUL a7, b7, t1 LDF [Y + 5 * 
SIZE], b6 LDF [X + 6 * SIZE], a7 FADD c2, t2, c2 FMUL a8, b8, t2 LDF [Y + 6 * SIZE], b7 add Y, 8 * SIZE, Y LDF [X + 7 * SIZE], a8 add X, 8 * SIZE, X bg,pt %icc, .LL11 LDF [Y - 1 * SIZE], b8 .LL12: FADD c1, t1, c1 FMUL a1, b1, t1 FADD c2, t2, c2 FMUL a2, b2, t2 FADD c1, t1, c1 FMUL a3, b3, t1 FADD c2, t2, c2 FMUL a4, b4, t2 FADD c1, t1, c1 FMUL a5, b5, t1 FADD c2, t2, c2 FMUL a6, b6, t2 FADD c1, t1, c1 FMUL a7, b7, t1 FADD c2, t2, c2 FMUL a8, b8, t2 .LL15: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: LDF [X + 0 * SIZE], a1 add I, -1, I LDF [Y + 0 * SIZE], b1 cmp I, 0 add X, 1 * SIZE, X FADD c1, t1, c1 FMUL a1, b1, t1 bg,pt %icc, .LL16 add Y, 1 * SIZE, Y .LL19: FADD c1, t1, c1 FADD c2, t2, c2 FADD c1, c2, c1 return %i7 + 8 nop .LL50: sra N, 3, I cmp I, 0 ble,pn %icc, .LL55 nop LDF [X + 0 * SIZE], a1 add X, INCX, X LDF [Y + 0 * SIZE], b1 add Y, INCY, Y LDF [X + 0 * SIZE], a2 add X, INCX, X LDF [Y + 0 * SIZE], b2 add Y, INCY, Y LDF [X + 0 * SIZE], a3 add X, INCX, X LDF [Y + 0 * SIZE], b3 add Y, INCY, Y LDF [X + 0 * SIZE], a4 add X, INCX, X LDF [Y + 0 * SIZE], b4 add Y, INCY, Y LDF [X + 0 * SIZE], a5 add X, INCX, X LDF [Y + 0 * SIZE], b5 add Y, INCY, Y LDF [X + 0 * SIZE], a6 add X, INCX, X LDF [Y + 0 * SIZE], b6 add Y, INCY, Y LDF [X + 0 * SIZE], a7 add X, INCX, X LDF [Y + 0 * SIZE], b7 add Y, INCY, Y LDF [X + 0 * SIZE], a8 add X, INCX, X LDF [Y + 0 * SIZE], b8 add Y, INCY, Y add I, -1, I cmp I, 0 ble,pt %icc, .LL52 nop .LL51: FADD c1, t1, c1 FMUL a1, b1, t1 LDF [X + 0 * SIZE], a1 FADD c2, t2, c2 add X, INCX, X FMUL a2, b2, t2 LDF [Y + 0 * SIZE], b1 add Y, INCY, Y LDF [X + 0 * SIZE], a2 FADD c1, t1, c1 add X, INCX, X FMUL a3, b3, t1 LDF [Y + 0 * SIZE], b2 add Y, INCY, Y add I, -1, I LDF [X + 0 * SIZE], a3 add X, INCX, X FADD c2, t2, c2 FMUL a4, b4, t2 LDF [Y + 0 * SIZE], b3 add Y, INCY, Y cmp I, 0 LDF [X + 0 * SIZE], a4 add X, INCX, X FADD c1, t1, c1 FMUL a5, b5, t1 LDF [Y + 0 * SIZE], b4 add Y, INCY, Y LDF [X + 0 * SIZE], a5 add X, INCX, X FADD c2, t2, c2 FMUL a6, b6, t2 LDF [Y + 0 * SIZE], b5 add Y, INCY, Y LDF [X + 0 * SIZE], a6 add X, INCX, X FADD c1, t1, c1 FMUL a7, b7, t1 LDF [Y + 0 * SIZE], b6 add Y, INCY, Y LDF [X + 0 * SIZE], a7 add X, INCX, X FADD c2, t2, c2 FMUL a8, b8, t2 LDF [Y + 0 * SIZE], b7 add Y, INCY, Y LDF [X + 0 * SIZE], a8 add X, INCX, X LDF [Y + 0 * SIZE], b8 bg,pt %icc, .LL51 add Y, INCY, Y .LL52: FADD c1, t1, c1 FMUL a1, b1, t1 FADD c2, t2, c2 FMUL a2, b2, t2 FADD c1, t1, c1 FMUL a3, b3, t1 FADD c2, t2, c2 FMUL a4, b4, t2 FADD c1, t1, c1 FMUL a5, b5, t1 FADD c2, t2, c2 FMUL a6, b6, t2 FADD c1, t1, c1 FMUL a7, b7, t1 FADD c2, t2, c2 FMUL a8, b8, t2 .LL55: and N, 7, I cmp I, 0 ble %icc, .LL59 nop .LL56: LDF [X + 0 * SIZE], a1 LDF [Y + 0 * SIZE], b1 add X, INCX, X add Y, INCY, Y FADD c1, t1, c1 FMUL a1, b1, t1 addcc I, -1, I bg %icc, .LL56 nop .LL59: FADD c1, t1, c1 FADD c2, t2, c2 FADD c1, c2, c1 return %i7 + 8 nop EPILOGUE OpenBLAS-0.2.20/kernel/sparc/gemm_kernel.S000066400000000000000000001407251313527062700201560ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %i0 #define N %i1 #define K %i2 #if defined(DOUBLE) && !defined(__64BIT__) #define A %i5 #define B %i4 #else #define A %i4 #define B %i5 #endif #define C %o4 #define LDC %o5 #define AO %l0 #define BO %l1 #define I %l2 #define J %l3 #define L %l4 #define C1 %o0 #define C2 %o1 #define C3 %o2 #define C4 %o3 #define OFFSET %l5 #define KK %l6 #define TEMP1 %l7 #define TEMP2 %i3 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #define t1 %f32 #define t2 %f34 #define t3 %f36 #define t4 %f38 #define a1 %f40 #define a2 %f42 #define a3 %f44 #define a4 %f46 #define a5 %f58 #define b1 %f48 #define b2 %f50 #define b3 %f52 #define b4 %f54 #define b5 %f56 #define FZERO %f60 #define ALPHA %f62 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #define t1 %f16 #define t2 %f17 #define t3 %f18 #define t4 %f19 #define a1 %f20 #define a2 %f21 #define a3 %f22 #define a4 %f23 #define a5 %f31 #define b1 %f24 #define b2 %f25 #define b3 %f26 #define b4 %f27 #define b5 %f28 #define FZERO %f29 #define ALPHA %f30 #endif PROLOGUE SAVESP nop #ifndef __64BIT__ #ifdef DOUBLE st %i3, [%sp + STACK_START + 16] /* ALPHA */ st %i4, [%sp + STACK_START + 20] ld [%sp + STACK_START + 28], B ld [%sp + STACK_START + 32], C ld [%sp + STACK_START + 36], LDC #ifdef TRMMKERNEL ld [%sp + STACK_START + 40], OFFSET #endif #else st %i3, [%sp + STACK_START + 16] /* ALPHA */ ld [%sp + STACK_START + 28], C ld [%sp + STACK_START + 32], LDC #ifdef TRMMKERNEL ld [%sp + STACK_START + 36], OFFSET #endif #endif LDF [%sp + STACK_START + 16], ALPHA #else ldx [%sp+ STACK_START + 56], C ldx [%sp+ STACK_START + 64], LDC #ifdef TRMMKERNEL ldx [%sp+ STACK_START + 72], OFFSET #endif 
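/* 64-bit entry: C, LDC and (for TRMMKERNEL builds) OFFSET are picked up from
   the stack relative to STACK_START, while alpha already arrives in a
   floating-point argument register and is only copied into the ALPHA working
   register below (%f6 for the double kernel, %f7 for the single one). */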
#ifdef DOUBLE FMOV %f6, ALPHA #else FMOV %f7, ALPHA #endif #endif FCLR(29) #if defined(TRMMKERNEL) && !defined(LEFT) neg OFFSET, KK #endif sra N, 2, J cmp J, 0 ble,pn %icc, .LL100 sll LDC, BASE_SHIFT, LDC .LL11: add C, LDC, C2 FMOV FZERO, t1 nop mov C, C1 add C2, LDC, C3 FMOV FZERO, t2 sra K, 2, L mov A, AO sra M, 2, I add C3, LDC, C4 FMOV FZERO, t3 #if defined(TRMMKERNEL) && defined(LEFT) mov OFFSET, KK #endif cmp I, 0 add C4, LDC, C FMOV FZERO, t4 ble,pn %icc, .LL50 FMOV FZERO, c01 .LL21: #if !defined(TRMMKERNEL) FMOV FZERO, c02 mov B, BO FMOV FZERO, c03 cmp L, 0 #else FMOV FZERO, c02 FMOV FZERO, c03 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov B, BO #else sll KK, 2 + BASE_SHIFT, TEMP1 add AO, TEMP1, AO add B, TEMP1, BO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 4, L #else add KK, 4, L #endif sra L, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c04 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c05 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c06 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c07 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c08 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c09 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c10 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c11 LDF [BO + 4 * SIZE], b5 /* ***** */ LDF [AO + 4 * SIZE], a5 /* ***** */ prefetch [C1 + 3 * SIZE], 3 FMOV FZERO, c12 prefetch [C2 + 3 * SIZE], 3 FMOV FZERO, c13 prefetch [C3 + 3 * SIZE], 3 FMOV FZERO, c14 prefetch [C4 + 3 * SIZE], 3 FMOV FZERO, c15 ble,pn %icc, .LL25 FMOV FZERO, c16 #define APREFETCHSIZE 40 #define BPREFETCHSIZE 40 #define APREFETCH_CATEGORY 0 #define BPREFETCH_CATEGORY 0 .LL22: FADD c04, t1, c04 prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY FMUL a1, b1, t1 nop FADD c08, t2, c08 prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY FMUL a1, b2, t2 add AO, 16 * SIZE, AO FADD c12, t3, c12 LDF [AO - 13 * SIZE], a4 FMUL a1, b3, t3 add BO, 16 * SIZE, BO FADD c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO - 8 * SIZE], a1 FADD c01, t1, c01 nop FMUL a2, b1, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 add L, -1, L FMUL a2, b4, t4 LDF [AO - 11 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b1, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 10 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO - 8 * SIZE], b1 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 11 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 10 * SIZE], b3 FADD c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 9 * SIZE], b4 FADD c04, t1, c04 nop FMUL a5, b5, t1 LDF [AO - 9 * SIZE], a4 FADD c08, t2, c08 nop FMUL a5, b2, t2 nop FADD c12, t3, c12 nop FMUL a5, b3, t3 nop FADD c16, t4, c16 nop FMUL a5, b4, t4 LDF [AO - 4 * SIZE], a5 FADD c01, t1, c01 nop FMUL a2, b5, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO - 7 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b5, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 6 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b5, t1 LDF [BO - 4 * SIZE], b5 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 7 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 6 * SIZE], b3 FADD c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 5 * SIZE], b4 FADD c04, t1, c04 nop FMUL a1, b1, t1 LDF [AO - 5 * SIZE], 
a4 FADD c08, t2, c08 nop FMUL a1, b2, t2 nop FADD c12, t3, c12 nop FMUL a1, b3, t3 nop FADD c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO - 0 * SIZE], a1 FADD c01, t1, c01 nop FMUL a2, b1, t1 nop #ifdef DOUBLE prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY #else nop #endif FADD c05, t2, c05 nop FMUL a2, b2, t2 FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 nop FADD c02, t1, c02 nop FMUL a3, b1, t1 LDF [AO - 3 * SIZE], a2 FADD c06, t2, c06 #ifdef DOUBLE prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY #else nop #endif FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 2 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO - 0 * SIZE], b1 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 3 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 1 * SIZE], b4 FADD c04, t1, c04 nop FMUL a5, b5, t1 LDF [AO - 1 * SIZE], a4 FADD c08, t2, c08 FMUL a5, b2, t2 FADD c12, t3, c12 FMUL a5, b3, t3 FADD c16, t4, c16 nop FMUL a5, b4, t4 LDF [AO + 4 * SIZE], a5 FADD c01, t1, c01 nop FMUL a2, b5, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b5, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD c03, t1, c03 cmp L, 0 FMUL a4, b5, t1 LDF [BO + 4 * SIZE], b5 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c15, t4, c15 FMUL a4, b4, t4 bg,pt %icc, .LL22 LDF [BO + 3 * SIZE], b4 .LL25: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 4, L #else add KK, 4, L #endif and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL29 nop .LL26: FADD c04, t1, c04 LDF [AO + 3 * SIZE], a4 FMUL a1, b1, t1 add AO, 4 * SIZE, AO FADD c08, t2, c08 add BO, 4 * SIZE, BO FMUL a1, b2, t2 add L, -1, L FADD c12, t3, c12 nop FMUL a1, b3, t3 cmp L, 0 FADD c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD c01, t1, c01 nop FMUL a2, b1, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b1, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO + 0 * SIZE], b1 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c15, t4, c15 FMUL a4, b4, t4 bg,pt %icc, .LL26 LDF [BO + 3 * SIZE], b4 .LL29: #ifndef TRMMKERNEL FADD c04, t1, c04 add I, -1, I FMUL c01, ALPHA, c01 LDF [C1 + 0 * SIZE], a1 FADD c08, t2, c08 cmp I, 0 FMUL c02, ALPHA, c02 LDF [C1 + 1 * SIZE], a2 FADD c12, t3, c12 nop FMUL c03, ALPHA, c03 LDF [C1 + 2 * SIZE], a3 FADD c16, t4, c16 nop FMUL c04, ALPHA, c04 LDF [C1 + 3 * SIZE], a4 FMUL c05, ALPHA, c05 LDF [C2 + 0 * SIZE], b1 FMUL c06, ALPHA, c06 LDF [C2 + 1 * SIZE], b2 FMUL c07, ALPHA, c07 LDF [C2 + 2 * SIZE], b3 FMUL c08, ALPHA, c08 LDF [C2 + 3 * SIZE], b4 FMUL c09, ALPHA, c09 LDF [C3 + 0 * SIZE], t1 FMUL c10, ALPHA, c10 LDF [C3 + 1 * SIZE], t2 FMUL c11, ALPHA, c11 LDF [C3 + 2 * 
SIZE], t3 FMUL c12, ALPHA, c12 LDF [C3 + 3 * SIZE], t4 FMUL c13, ALPHA, c13 add C1, 4 * SIZE, C1 FADD c01, a1, c01 LDF [C4 + 0 * SIZE], a1 FMUL c14, ALPHA, c14 add C2, 4 * SIZE, C2 FADD c02, a2, c02 LDF [C4 + 1 * SIZE], a2 FMUL c15, ALPHA, c15 add C3, 4 * SIZE, C3 FADD c03, a3, c03 LDF [C4 + 2 * SIZE], a3 FMUL c16, ALPHA, c16 nop FADD c04, a4, c04 LDF [C4 + 3 * SIZE], a4 STF c01, [C1 - 4 * SIZE] FADD c05, b1, c05 STF c02, [C1 - 3 * SIZE] FADD c06, b2, c06 STF c03, [C1 - 2 * SIZE] FADD c07, b3, c07 STF c04, [C1 - 1 * SIZE] FADD c08, b4, c08 STF c05, [C2 - 4 * SIZE] FADD c09, t1, c09 STF c06, [C2 - 3 * SIZE] FADD c10, t2, c10 STF c07, [C2 - 2 * SIZE] FADD c11, t3, c11 STF c08, [C2 - 1 * SIZE] FADD c12, t4, c12 STF c09, [C3 - 4 * SIZE] FADD c13, a1, c13 STF c10, [C3 - 3 * SIZE] FADD c14, a2, c14 STF c11, [C3 - 2 * SIZE] FADD c15, a3, c15 STF c12, [C3 - 1 * SIZE] FADD c16, a4, c16 STF c13, [C4 + 0 * SIZE] FMOV FZERO, t1 STF c14, [C4 + 1 * SIZE] FMOV FZERO, t2 STF c15, [C4 + 2 * SIZE] FMOV FZERO, t3 STF c16, [C4 + 3 * SIZE] FMOV FZERO, t4 add C4, 4 * SIZE, C4 #else FADD c04, t1, c04 FMUL c01, ALPHA, c01 FADD c08, t2, c08 FMUL c02, ALPHA, c02 FADD c12, t3, c12 FMUL c03, ALPHA, c03 FADD c16, t4, c16 FMUL c04, ALPHA, c04 STF c01, [C1 + 0 * SIZE] FMUL c05, ALPHA, c05 STF c02, [C1 + 1 * SIZE] FMUL c06, ALPHA, c06 STF c03, [C1 + 2 * SIZE] FMUL c07, ALPHA, c07 STF c04, [C1 + 3 * SIZE] FMUL c08, ALPHA, c08 STF c05, [C2 + 0 * SIZE] FMUL c09, ALPHA, c09 STF c06, [C2 + 1 * SIZE] FMUL c10, ALPHA, c10 STF c07, [C2 + 2 * SIZE] FMUL c11, ALPHA, c11 STF c08, [C2 + 3 * SIZE] FMUL c12, ALPHA, c12 STF c09, [C3 + 0 * SIZE] FMUL c13, ALPHA, c13 STF c10, [C3 + 1 * SIZE] FMUL c14, ALPHA, c14 STF c11, [C3 + 2 * SIZE] FMUL c15, ALPHA, c15 STF c12, [C3 + 3 * SIZE] FMUL c16, ALPHA, c16 STF c13, [C4 + 0 * SIZE] STF c14, [C4 + 1 * SIZE] STF c15, [C4 + 2 * SIZE] STF c16, [C4 + 3 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 add C1, 4 * SIZE, C1 add C2, 4 * SIZE, C2 add C3, 4 * SIZE, C3 add C4, 4 * SIZE, C4 #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -4, TEMP1 #else add TEMP1, -4, TEMP1 #endif sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AO, TEMP1, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 4, KK #endif add I, -1, I cmp I, 0 #endif sra K, 2, L bg,pt %icc, .LL21 FMOV FZERO, c01 .LL50: and M, 2, I FMOV FZERO, c02 cmp I, 0 FMOV FZERO, t1 ble,pn %icc, .LL70 FMOV FZERO, c04 #if !defined(TRMMKERNEL) LDF [AO + 0 * SIZE], a1 sra K, 2, L FMOV FZERO, t2 LDF [B + 0 * SIZE], b1 mov B, BO FMOV FZERO, c06 LDF [AO + 1 * SIZE], a2 cmp L, 0 FMOV FZERO, t3 LDF [B + 1 * SIZE], b2 FMOV FZERO, c08 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t4 LDF [B + 2 * SIZE], b3 FMOV FZERO, c01 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c03 LDF [B + 3 * SIZE], b4 FMOV FZERO, c05 #else #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov B, BO #else sll KK, 1 + BASE_SHIFT, TEMP1 sll KK, 2 + BASE_SHIFT, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 2, L #else add KK, 4, L #endif sra L, 2, L cmp L, 0 LDF [AO + 0 * SIZE], a1 FMOV FZERO, t2 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c06 LDF [AO + 1 * SIZE], a2 FMOV FZERO, t3 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c08 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t4 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c01 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c03 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c05 #endif ble,pn %icc, 
.LL55 FMOV FZERO, c07 .LL52: FADD c02, t1, c02 add AO, 8 * SIZE, AO prefetch [AO + APREFETCHSIZE * SIZE], 0 FMUL a1, b1, t1 add BO, 16 * SIZE, BO FADD c04, t2, c04 add L, -1, L FMUL a1, b2, t2 FADD c06, t3, c06 cmp L, 0 FMUL a1, b3, t3 FADD c08, t4, c08 FMUL a1, b4, t4 LDF [AO - 4 * SIZE], a1 FADD c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 12 * SIZE], b1 FADD c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 11 * SIZE], b2 FADD c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 10 * SIZE], b3 FADD c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 9 * SIZE], b4 FADD c02, t1, c02 FMUL a3, b1, t1 LDF [AO - 3 * SIZE], a2 FADD c04, t2, c04 FMUL a3, b2, t2 FADD c06, t3, c06 FMUL a3, b3, t3 FADD c08, t4, c08 FMUL a3, b4, t4 LDF [AO - 2 * SIZE], a3 FADD c01, t1, c01 FMUL a4, b1, t1 LDF [BO - 8 * SIZE], b1 FADD c03, t2, c03 FMUL a4, b2, t2 LDF [BO - 7 * SIZE], b2 FADD c05, t3, c05 FMUL a4, b3, t3 LDF [BO - 6 * SIZE], b3 FADD c07, t4, c07 FMUL a4, b4, t4 LDF [BO - 5 * SIZE], b4 FADD c02, t1, c02 FMUL a1, b1, t1 LDF [AO - 1 * SIZE], a4 FADD c04, t2, c04 FMUL a1, b2, t2 FADD c06, t3, c06 FMUL a1, b3, t3 FADD c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 4 * SIZE], b1 FADD c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 3 * SIZE], b2 FADD c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 1 * SIZE], b4 FADD c02, t1, c02 FMUL a3, b1, t1 LDF [AO + 1 * SIZE], a2 FADD c04, t2, c04 FMUL a3, b2, t2 FADD c06, t3, c06 FMUL a3, b3, t3 FADD c08, t4, c08 FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD c01, t1, c01 FMUL a4, b1, t1 LDF [BO + 0 * SIZE], b1 FADD c03, t2, c03 FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c05, t3, c05 FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c07, t4, c07 FMUL a4, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL52 LDF [AO + 3 * SIZE], a4 .LL55: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 2, L #else add KK, 4, L #endif and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL59 nop .LL56: FADD c02, t1, c02 add AO, 2 * SIZE, AO FMUL a1, b1, t1 add L, -1, L add BO, 4 * SIZE, BO FADD c04, t2, c04 cmp L, 0 FMUL a1, b2, t2 FADD c06, t3, c06 FMUL a1, b3, t3 FADD c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD c01, t1, c01 FMUL a2, b1, t1 LDF [BO + 0 * SIZE], b1 FADD c03, t2, c03 FMUL a2, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c05, t3, c05 FMUL a2, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c07, t4, c07 FMUL a2, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL56 LDF [AO + 1 * SIZE], a2 .LL59: #ifndef TRMMKERNEL FADD c02, t1, c02 FMUL c01, ALPHA, c01 LDF [C1 + 0 * SIZE], a1 FADD c04, t2, c04 FMUL c03, ALPHA, c03 LDF [C1 + 1 * SIZE], a2 FADD c06, t3, c06 FMUL c05, ALPHA, c05 LDF [C2 + 0 * SIZE], a3 FADD c08, t4, c08 FMUL c07, ALPHA, c07 LDF [C2 + 1 * SIZE], a4 FMUL c02, ALPHA, c02 FADD c01, a1, c01 LDF [C3 + 0 * SIZE], b1 FMUL c04, ALPHA, c04 FADD c02, a2, c02 LDF [C3 + 1 * SIZE], b2 FMUL c06, ALPHA, c06 FADD c03, a3, c03 LDF [C4 + 0 * SIZE], b3 FMUL c08, ALPHA, c08 FADD c04, a4, c04 LDF [C4 + 1 * SIZE], b4 STF c01, [C1 + 0 * SIZE] FADD c05, b1, c05 STF c02, [C1 + 1 * SIZE] FADD c06, b2, c06 add C1, 2 * SIZE, C1 STF c03, [C2 + 0 * SIZE] FADD c07, b3, c07 STF c04, [C2 + 1 * SIZE] FADD c08, b4, c08 add C2, 2 * SIZE, C2 STF c05, [C3 + 0 * SIZE] STF c06, [C3 + 1 * SIZE] add C3, 2 * SIZE, C3 STF c07, [C4 + 0 * SIZE] STF c08, [C4 + 1 * SIZE] add C4, 2 * SIZE, C4 #else FADD c02, t1, c02 FADD c04, t2, c04 FADD c06, t3, c06 FADD c08, t4, c08 FMUL c01, ALPHA, 
c01 FMUL c03, ALPHA, c03 FMUL c05, ALPHA, c05 FMUL c07, ALPHA, c07 FMUL c02, ALPHA, c02 FMUL c04, ALPHA, c04 FMUL c06, ALPHA, c06 FMUL c08, ALPHA, c08 STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C2 + 0 * SIZE] STF c04, [C2 + 1 * SIZE] STF c05, [C3 + 0 * SIZE] STF c06, [C3 + 1 * SIZE] STF c07, [C4 + 0 * SIZE] STF c08, [C4 + 1 * SIZE] add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 add C3, 2 * SIZE, C3 add C4, 2 * SIZE, C4 #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -2, TEMP1 #else add TEMP1, -4, TEMP1 #endif sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 2, KK #endif #endif .LL70: and M, 1, I cmp I, 0 ble,pn %icc, .LL99 nop .LL71: #if !defined(TRMMKERNEL) LDF [AO + 0 * SIZE], a1 sra K, 2, L FMOV FZERO, c01 LDF [B + 0 * SIZE], b1 mov B, BO FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 cmp L, 0 FMOV FZERO, c02 LDF [B + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [B + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [B + 3 * SIZE], b4 FMOV FZERO, t4 #else #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov B, BO #else sll KK, 0 + BASE_SHIFT, TEMP1 sll KK, 2 + BASE_SHIFT, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 4, L #endif sra L, 2, L cmp L, 0 LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 #endif ble,pn %icc, .LL75 nop .LL72: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 LDF [BO + 4 * SIZE], b1 FADD c02, t2, c02 cmp L, 0 FMUL a1, b2, t2 LDF [BO + 5 * SIZE], b2 FADD c03, t3, c03 FMUL a1, b3, t3 LDF [BO + 6 * SIZE], b3 FADD c04, t4, c04 FMUL a1, b4, t4 LDF [BO + 7 * SIZE], b4 LDF [AO + 4 * SIZE], a1 FADD c01, t1, c01 add AO, 4 * SIZE, AO FMUL a2, b1, t1 LDF [BO + 8 * SIZE], b1 FADD c02, t2, c02 FMUL a2, b2, t2 LDF [BO + 9 * SIZE], b2 FADD c03, t3, c03 FMUL a2, b3, t3 LDF [BO + 10 * SIZE], b3 FADD c04, t4, c04 FMUL a2, b4, t4 LDF [BO + 11 * SIZE], b4 LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b1, t1 LDF [BO + 12 * SIZE], b1 FADD c02, t2, c02 FMUL a3, b2, t2 LDF [BO + 13 * SIZE], b2 FADD c03, t3, c03 FMUL a3, b3, t3 LDF [BO + 14 * SIZE], b3 FADD c04, t4, c04 FMUL a3, b4, t4 LDF [BO + 15 * SIZE], b4 LDF [AO + 2 * SIZE], a3 FADD c01, t1, c01 FMUL a4, b1, t1 LDF [BO + 16 * SIZE], b1 FADD c02, t2, c02 FMUL a4, b2, t2 LDF [BO + 17 * SIZE], b2 FADD c03, t3, c03 FMUL a4, b3, t3 LDF [BO + 18 * SIZE], b3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [BO + 19 * SIZE], b4 add BO, 16 * SIZE, BO bg,pt %icc, .LL72 LDF [AO + 3 * SIZE], a4 .LL75: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 4, L #endif and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL79 nop .LL76: FADD c01, t1, c01 add AO, 1 * SIZE, AO FMUL a1, b1, t1 LDF [BO + 4 * SIZE], b1 FADD c02, t2, c02 add L, -1, L FMUL a1, b2, t2 LDF [BO + 5 * SIZE], b2 FADD c03, t3, c03 cmp L, 0 FMUL a1, b3, t3 LDF [BO + 6 * SIZE], b3 FADD c04, t4, c04 add BO, 4 * SIZE, BO FMUL a1, b4, t4 LDF 
[AO + 0 * SIZE], a1 bg,pt %icc, .LL76 LDF [BO + 3 * SIZE], b4 .LL79: #ifndef TRMMKERNEL FADD c01, t1, c01 LDF [C1 + 0 * SIZE], a1 FADD c02, t2, c02 LDF [C2 + 0 * SIZE], a2 FADD c03, t3, c03 LDF [C3 + 0 * SIZE], a3 FADD c04, t4, c04 LDF [C4 + 0 * SIZE], a4 FMUL c01, ALPHA, c01 FMUL c02, ALPHA, c02 FMUL c03, ALPHA, c03 FMUL c04, ALPHA, c04 FADD c01, a1, c01 FADD c02, a2, c02 FADD c03, a3, c03 FADD c04, a4, c04 STF c01, [C1 + 0 * SIZE] STF c02, [C2 + 0 * SIZE] STF c03, [C3 + 0 * SIZE] STF c04, [C4 + 0 * SIZE] #else FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FMUL c01, ALPHA, c01 FMUL c02, ALPHA, c02 FMUL c03, ALPHA, c03 FMUL c04, ALPHA, c04 STF c01, [C1 + 0 * SIZE] STF c02, [C2 + 0 * SIZE] STF c03, [C3 + 0 * SIZE] STF c04, [C4 + 0 * SIZE] #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -1, TEMP1 #else add TEMP1, -4, TEMP1 #endif sll TEMP1, 0 + BASE_SHIFT, TEMP2 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 1, KK #endif #endif .LL99: add J, -1, J mov BO, B cmp J, 0 bg,pt %icc, .LL11 #if defined(TRMMKERNEL) && !defined(LEFT) add KK, 4, KK #else nop #endif .LL100: /* n & 2 */ sra M, 2, I and N, 2, J cmp J, 0 add C, LDC, C2 ble,pn %icc, .LL200 mov A, AO #if defined(TRMMKERNEL) && defined(LEFT) mov OFFSET, KK #endif mov C, C1 add C2, LDC, C cmp I, 0 ble,pn %icc, .LL150 FMOV FZERO, c03 .LL121: #if !defined(TRMMKERNEL) LDF [AO + 0 * SIZE], a1 sra K, 2, L FMOV FZERO, t1 LDF [B + 0 * SIZE], b1 mov B, BO FMOV FZERO, c07 LDF [AO + 1 * SIZE], a2 cmp L, 0 FMOV FZERO, t2 LDF [B + 1 * SIZE], b2 FMOV FZERO, c04 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t3 LDF [B + 2 * SIZE], b3 FMOV FZERO, c08 LDF [AO + 3 * SIZE], a4 FMOV FZERO, t4 LDF [B + 3 * SIZE], b4 FMOV FZERO, c01 prefetch [C1 + 3 * SIZE], 2 FMOV FZERO, c05 prefetch [C2 + 3 * SIZE], 2 FMOV FZERO, c02 #else #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov B, BO #else sll KK, 2 + BASE_SHIFT, TEMP1 sll KK, 1 + BASE_SHIFT, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 4, L #else add KK, 2, L #endif sra L, 2, L cmp L, 0 LDF [AO + 0 * SIZE], a1 FMOV FZERO, t1 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c07 LDF [AO + 1 * SIZE], a2 FMOV FZERO, t2 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c04 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t3 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c08 LDF [AO + 3 * SIZE], a4 FMOV FZERO, t4 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c01 prefetch [C1 + 3 * SIZE], 2 FMOV FZERO, c05 prefetch [C2 + 3 * SIZE], 2 FMOV FZERO, c02 #endif ble,pn %icc, .LL125 FMOV FZERO, c06 .LL122: FADD c03, t1, c03 add L, -1, L FMUL a1, b1, t1 prefetch [AO + APREFETCHSIZE * SIZE], 0 FADD c07, t2, c07 add BO, 8 * SIZE, BO FMUL a1, b2, t2 LDF [AO + 4 * SIZE], a1 FADD c04, t3, c04 add AO, 16 * SIZE, AO FMUL a2, b1, t3 cmp L, 0 FADD c08, t4, c08 nop FMUL a2, b2, t4 LDF [AO - 11 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b1, t1 nop FADD c05, t2, c05 nop FMUL a3, b2, t2 LDF [AO - 10 * SIZE], a3 FADD c02, t3, c02 nop FMUL a4, b1, t3 LDF [BO - 4 * SIZE], b1 FADD c06, t4, c06 nop FMUL a4, b2, t4 LDF [BO - 3 * SIZE], b2 FADD c03, t1, c03 nop FMUL a1, b3, t1 LDF [AO - 9 * SIZE], a4 FADD c07, t2, c07 nop FMUL a1, b4, t2 LDF [AO - 8 * SIZE], a1 FADD c04, t3, c04 nop FMUL a2, b3, t3 nop FADD c08, t4, c08 nop FMUL a2, b4, t4 LDF [AO - 7 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b3, t1 nop FADD c05, t2, c05 
nop FMUL a3, b4, t2 LDF [AO - 6 * SIZE], a3 FADD c02, t3, c02 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c06, t4, c06 nop FMUL a4, b4, t4 LDF [BO - 1 * SIZE], b4 FADD c03, t1, c03 nop FMUL a1, b1, t1 LDF [AO - 5 * SIZE], a4 FADD c07, t2, c07 nop FMUL a1, b2, t2 LDF [AO - 4 * SIZE], a1 FADD c04, t3, c04 nop FMUL a2, b1, t3 nop FADD c08, t4, c08 nop FMUL a2, b2, t4 LDF [AO - 3 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b1, t1 nop FADD c05, t2, c05 nop FMUL a3, b2, t2 LDF [AO - 2 * SIZE], a3 FADD c02, t3, c02 nop FMUL a4, b1, t3 LDF [BO + 0 * SIZE], b1 FADD c06, t4, c06 nop FMUL a4, b2, t4 LDF [BO + 1 * SIZE], b2 FADD c03, t1, c03 nop FMUL a1, b3, t1 LDF [AO - 1 * SIZE], a4 FADD c07, t2, c07 nop FMUL a1, b4, t2 LDF [AO + 0 * SIZE], a1 FADD c04, t3, c04 nop FMUL a2, b3, t3 nop FADD c08, t4, c08 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b3, t1 nop FADD c05, t2, c05 nop FMUL a3, b4, t2 LDF [AO + 2 * SIZE], a3 FADD c02, t3, c02 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c06, t4, c06 FMUL a4, b4, t4 LDF [AO + 3 * SIZE], a4 bg,pt %icc, .LL122 LDF [BO + 3 * SIZE], b4 .LL125: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 4, L #else add KK, 2, L #endif and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL129 nop .LL126: FADD c03, t1, c03 add AO, 4 * SIZE, AO FMUL a1, b1, t1 add BO, 2 * SIZE, BO FADD c07, t2, c07 add L, -1, L FMUL a1, b2, t2 LDF [AO + 0 * SIZE], a1 FADD c04, t3, c04 cmp L, 0 FMUL a2, b1, t3 FADD c08, t4, c08 FMUL a2, b2, t4 LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b1, t1 FADD c05, t2, c05 FMUL a3, b2, t2 LDF [AO + 2 * SIZE], a3 FADD c02, t3, c02 FMUL a4, b1, t3 LDF [BO + 0 * SIZE], b1 FADD c06, t4, c06 FMUL a4, b2, t4 LDF [BO + 1 * SIZE], b2 bg,pt %icc, .LL126 LDF [AO + 3 * SIZE], a4 .LL129: #ifndef TRMMKERNEL FADD c03, t1, c03 add I, -1, I LDF [C1 + 0 * SIZE], a1 FADD c07, t2, c07 cmp I, 0 LDF [C1 + 1 * SIZE], a2 FADD c04, t3, c04 LDF [C1 + 2 * SIZE], a3 FADD c08, t4, c08 LDF [C1 + 3 * SIZE], a4 LDF [C2 + 0 * SIZE], b1 FMUL c01, ALPHA, c01 LDF [C2 + 1 * SIZE], b2 FMUL c02, ALPHA, c02 LDF [C2 + 2 * SIZE], b3 FMUL c03, ALPHA, c03 LDF [C2 + 3 * SIZE], b4 FMUL c04, ALPHA, c04 FMUL c05, ALPHA, c05 FADD c01, a1, c01 FMUL c06, ALPHA, c06 FADD c02, a2, c02 FMUL c07, ALPHA, c07 FADD c03, a3, c03 FMUL c08, ALPHA, c08 FADD c04, a4, c04 STF c01, [C1 + 0 * SIZE] FADD c05, b1, c05 STF c02, [C1 + 1 * SIZE] FADD c06, b2, c06 STF c03, [C1 + 2 * SIZE] FADD c07, b3, c07 STF c04, [C1 + 3 * SIZE] add C1, 4 * SIZE, C1 FADD c08, b4, c08 STF c05, [C2 + 0 * SIZE] STF c06, [C2 + 1 * SIZE] STF c07, [C2 + 2 * SIZE] STF c08, [C2 + 3 * SIZE] add C2, 4 * SIZE, C2 #else FADD c03, t1, c03 FADD c07, t2, c07 FADD c04, t3, c04 FADD c08, t4, c08 FMUL c01, ALPHA, c01 FMUL c02, ALPHA, c02 FMUL c03, ALPHA, c03 FMUL c04, ALPHA, c04 FMUL c05, ALPHA, c05 FMUL c06, ALPHA, c06 FMUL c07, ALPHA, c07 FMUL c08, ALPHA, c08 STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C1 + 2 * SIZE] STF c04, [C1 + 3 * SIZE] STF c05, [C2 + 0 * SIZE] STF c06, [C2 + 1 * SIZE] STF c07, [C2 + 2 * SIZE] STF c08, [C2 + 3 * SIZE] add C1, 4 * SIZE, C1 add C2, 4 * SIZE, C2 #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -4, TEMP1 #else add TEMP1, -2, TEMP1 #endif sll TEMP1, 2 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 4, KK #endif add I, -1, I 
cmp I, 0 #endif bg,pt %icc, .LL121 FMOV FZERO, c03 .LL150: and M, 2, I cmp I, 0 ble,pn %icc, .LL170 nop .LL151: #if !defined(TRMMKERNEL) LDF [AO + 0 * SIZE], a1 sra K, 2, L FMOV FZERO, c01 LDF [B + 0 * SIZE], b1 mov B, BO FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 cmp L, 0 FMOV FZERO, c02 LDF [B + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [B + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [B + 3 * SIZE], b4 FMOV FZERO, t4 #else #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov B, BO #else sll KK, 1 + BASE_SHIFT, TEMP1 sll KK, 1 + BASE_SHIFT, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 2, L #else add KK, 2, L #endif sra L, 2, L cmp L, 0 LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 #endif ble,pn %icc, .LL155 nop .LL152: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 prefetch [AO + APREFETCHSIZE * SIZE], 0 FADD c02, t2, c02 add BO, 8 * SIZE, BO FMUL a1, b2, t2 LDF [AO + 4 * SIZE], a1 FADD c03, t3, c03 cmp L, 0 FMUL a2, b1, t3 LDF [BO - 4 * SIZE], b1 FADD c04, t4, c04 nop FMUL a2, b2, t4 LDF [AO + 5 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b3, t1 LDF [BO - 3 * SIZE], b2 FADD c02, t2, c02 nop FMUL a3, b4, t2 LDF [AO + 6 * SIZE], a3 FADD c03, t3, c03 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c04, t4, c04 nop FMUL a4, b4, t4 LDF [AO + 7 * SIZE], a4 FADD c01, t1, c01 nop FMUL a1, b1, t1 LDF [BO - 1 * SIZE], b4 FADD c02, t2, c02 FMUL a1, b2, t2 LDF [AO + 8 * SIZE], a1 FADD c03, t3, c03 FMUL a2, b1, t3 LDF [BO + 0 * SIZE], b1 FADD c04, t4, c04 FMUL a2, b2, t4 LDF [AO + 9 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b3, t1 LDF [BO + 1 * SIZE], b2 FADD c02, t2, c02 FMUL a3, b4, t2 LDF [AO + 10 * SIZE], a3 FADD c03, t3, c03 FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO bg,pt %icc, .LL152 LDF [BO + 3 * SIZE], b4 .LL155: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 2, L #else add KK, 2, L #endif and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL159 nop .LL156: LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FMUL a1, b1, t1 FMUL a1, b2, t2 FMUL a2, b1, t3 FMUL a2, b2, t4 add AO, 2 * SIZE, AO add BO, 2 * SIZE, BO add L, -1, L cmp L, 0 bg,pt %icc, .LL156 nop .LL159: #ifndef TRMMKERNEL LDF [C1 + 0 * SIZE], a1 LDF [C2 + 0 * SIZE], a2 LDF [C1 + 1 * SIZE], a3 LDF [C2 + 1 * SIZE], a4 FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FMUL c01, ALPHA, c01 FMUL c02, ALPHA, c02 FMUL c03, ALPHA, c03 FMUL c04, ALPHA, c04 FADD c01, a1, c01 FADD c02, a2, c02 FADD c03, a3, c03 FADD c04, a4, c04 STF c01, [C1 + 0 * SIZE] STF c02, [C2 + 0 * SIZE] STF c03, [C1 + 1 * SIZE] add C1, 2 * SIZE, C1 STF c04, [C2 + 1 * SIZE] add C2, 2 * SIZE, C2 #else FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FMUL c01, ALPHA, c01 FMUL c02, ALPHA, c02 FMUL c03, ALPHA, c03 FMUL c04, ALPHA, c04 STF c01, [C1 + 0 * SIZE] STF c02, 
[C2 + 0 * SIZE] STF c03, [C1 + 1 * SIZE] STF c04, [C2 + 1 * SIZE] add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -2, TEMP1 #else add TEMP1, -2, TEMP1 #endif sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 2, KK #endif #endif .LL170: and M, 1, I cmp I, 0 ble,pn %icc, .LL199 nop .LL171: #if !defined(TRMMKERNEL) LDF [AO + 0 * SIZE], a1 sra K, 2, L FMOV FZERO, c01 LDF [B + 0 * SIZE], b1 mov B, BO FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 cmp L, 0 FMOV FZERO, c02 LDF [B + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [B + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [B + 3 * SIZE], b4 FMOV FZERO, t4 #else #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov B, BO #else sll KK, 0 + BASE_SHIFT, TEMP1 sll KK, 1 + BASE_SHIFT, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 2, L #endif sra L, 2, L cmp L, 0 LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 #endif ble,pn %icc, .LL175 nop .LL172: FADD c01, t1, c01 add AO, 4 * SIZE, AO FMUL a1, b1, t1 LDF [BO + 4 * SIZE], b1 FADD c02, t2, c02 FMUL a1, b2, t2 LDF [BO + 5 * SIZE], b2 add L, -1, L LDF [AO + 0 * SIZE], a1 FADD c03, t3, c03 cmp L, 0 FMUL a2, b3, t3 LDF [BO + 6 * SIZE], b3 FADD c04, t4, c04 FMUL a2, b4, t4 LDF [BO + 7 * SIZE], b4 LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b1, t1 LDF [BO + 8 * SIZE], b1 FADD c02, t2, c02 FMUL a3, b2, t2 LDF [BO + 9 * SIZE], b2 LDF [AO + 2 * SIZE], a3 FADD c03, t3, c03 FMUL a4, b3, t3 LDF [BO + 10 * SIZE], b3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [BO + 11 * SIZE], b4 add BO, 8 * SIZE, BO bg,pt %icc, .LL172 LDF [AO + 3 * SIZE], a4 .LL175: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 2, L #endif and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL179 nop .LL176: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 add AO, 1 * SIZE, AO LDF [BO + 2 * SIZE], b1 FADD c02, t2, c02 cmp L, 0 FMUL a1, b2, t2 LDF [BO + 3 * SIZE], b2 add BO, 2 * SIZE, BO bg,pt %icc, .LL176 LDF [AO + 0 * SIZE], a1 .LL179: #ifndef TRMMKERNEL FADD c01, t1, c01 LDF [C1 + 0 * SIZE], a1 FADD c02, t2, c02 LDF [C2 + 0 * SIZE], a2 FADD c03, t3, c03 FADD c04, t4, c04 FADD c01, c03, c01 FADD c02, c04, c02 FMUL c01, ALPHA, c01 FMUL c02, ALPHA, c02 FADD c01, a1, c01 FADD c02, a2, c02 STF c01, [C1 + 0 * SIZE] STF c02, [C2 + 0 * SIZE] #else FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FADD c01, c03, c01 FADD c02, c04, c02 FMUL c01, ALPHA, c01 FMUL c02, ALPHA, c02 STF c01, [C1 + 0 * SIZE] STF c02, [C2 + 0 * SIZE] #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -1, TEMP1 #else add TEMP1, -2, TEMP1 #endif sll TEMP1, 0 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 1, KK #endif #endif .LL199: 
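/* Wrap up the N & 2 block: advance B past the panel just consumed and, for TRMM
   kernels built without LEFT, bump KK by 2 before falling through to the single
   remaining column handled at .LL200. */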
mov BO, B #if defined(TRMMKERNEL) && !defined(LEFT) add KK, 2, KK #else nop #endif .LL200: and N, 1, J sra M, 2, I cmp J, 0 ble,pn %icc, .LL999 mov A, AO #if defined(TRMMKERNEL) && defined(LEFT) mov OFFSET, KK #endif cmp I, 0 ble,pn %icc, .LL250 mov C, C1 .LL221: #if !defined(TRMMKERNEL) LDF [AO + 0 * SIZE], a1 sra K, 2, L FMOV FZERO, c01 LDF [B + 0 * SIZE], b1 mov B, BO FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 cmp L, 0 FMOV FZERO, c02 LDF [B + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [B + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [B + 3 * SIZE], b4 FMOV FZERO, t4 #else #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov B, BO #else sll KK, 2 + BASE_SHIFT, TEMP1 sll KK, 0 + BASE_SHIFT, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 4, L #else add KK, 1, L #endif sra L, 2, L cmp L, 0 LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 #endif ble,pn %icc, .LL225 prefetch [C1 + 4 * SIZE], 2 .LL222: FADD c01, t1, c01 add BO, 4 * SIZE, BO FMUL a1, b1, t1 LDF [AO + 4 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b1, t2 LDF [AO + 5 * SIZE], a2 FADD c03, t3, c03 add L, -1, L FMUL a3, b1, t3 LDF [AO + 6 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b1, t4 LDF [AO + 7 * SIZE], a4 LDF [BO + 0 * SIZE], b1 FADD c01, t1, c01 cmp L, 0 FMUL a1, b2, t1 LDF [AO + 8 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b2, t2 LDF [AO + 9 * SIZE], a2 FADD c03, t3, c03 FMUL a3, b2, t3 LDF [AO + 10 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b2, t4 LDF [AO + 11 * SIZE], a4 LDF [BO + 1 * SIZE], b2 FADD c01, t1, c01 FMUL a1, b3, t1 LDF [AO + 12 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b3, t2 LDF [AO + 13 * SIZE], a2 FADD c03, t3, c03 FMUL a3, b3, t3 LDF [AO + 14 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b3, t4 LDF [AO + 15 * SIZE], a4 LDF [BO + 2 * SIZE], b3 FADD c01, t1, c01 FMUL a1, b4, t1 LDF [AO + 16 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b4, t2 LDF [AO + 17 * SIZE], a2 FADD c03, t3, c03 FMUL a3, b4, t3 LDF [AO + 18 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [AO + 19 * SIZE], a4 add AO, 16 * SIZE, AO bg,pt %icc, .LL222 LDF [BO + 3 * SIZE], b4 .LL225: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 4, L #else add KK, 1, L #endif and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL229 nop .LL226: FADD c01, t1, c01 add BO, 1 * SIZE, BO FMUL a1, b1, t1 LDF [AO + 4 * SIZE], a1 FADD c02, t2, c02 add L, -1, L FMUL a2, b1, t2 LDF [AO + 5 * SIZE], a2 FADD c03, t3, c03 cmp L, 0 FMUL a3, b1, t3 LDF [AO + 6 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b1, t4 LDF [AO + 7 * SIZE], a4 add AO, 4 * SIZE, AO bg,pt %icc, .LL226 LDF [BO + 0 * SIZE], b1 .LL229: #ifndef TRMMKERNEL FADD c01, t1, c01 add I, -1, I FADD c02, t2, c02 cmp I, 0 FADD c03, t3, c03 FADD c04, t4, c04 FMUL c01, ALPHA, c01 FMUL c02, ALPHA, c02 FMUL c03, ALPHA, c03 FMUL c04, ALPHA, c04 LDF [C1 + 0 * SIZE], a1 LDF [C1 + 1 * SIZE], a2 LDF [C1 + 2 * SIZE], a3 LDF [C1 + 3 * SIZE], a4 FADD c01, a1, c01 FADD c02, a2, c02 FADD c03, a3, c03 FADD c04, a4, c04 STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C1 + 2 * SIZE] STF 
c04, [C1 + 3 * SIZE] add C1, 4 * SIZE, C1 #else FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FMUL c01, ALPHA, c01 FMUL c02, ALPHA, c02 FMUL c03, ALPHA, c03 FMUL c04, ALPHA, c04 STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C1 + 2 * SIZE] STF c04, [C1 + 3 * SIZE] add C1, 4 * SIZE, C1 #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -4, TEMP1 #else add TEMP1, -1, TEMP1 #endif sll TEMP1, 2 + BASE_SHIFT, TEMP2 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 4, KK #endif add I, -1, I cmp I, 0 #endif bg,pt %icc, .LL221 nop .LL250: and M, 2, I cmp I, 0 ble,pn %icc, .LL270 nop .LL251: #if !defined(TRMMKERNEL) LDF [AO + 0 * SIZE], a1 sra K, 2, L FMOV FZERO, c01 LDF [B + 0 * SIZE], b1 mov B, BO FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 cmp L, 0 FMOV FZERO, c02 LDF [B + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [B + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [B + 3 * SIZE], b4 FMOV FZERO, t4 #else #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov B, BO #else sll KK, 1 + BASE_SHIFT, TEMP1 sll KK, 0 + BASE_SHIFT, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 2, L #else add KK, 1, L #endif sra L, 2, L cmp L, 0 LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 #endif ble,pn %icc, .LL255 nop .LL252: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 LDF [AO + 4 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b1, t2 LDF [AO + 5 * SIZE], a2 LDF [BO + 4 * SIZE], b1 FADD c03, t3, c03 cmp L, 0 FMUL a3, b2, t3 LDF [AO + 6 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b2, t4 LDF [AO + 7 * SIZE], a4 LDF [BO + 5 * SIZE], b2 FADD c01, t1, c01 FMUL a1, b3, t1 LDF [AO + 8 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b3, t2 LDF [AO + 9 * SIZE], a2 LDF [BO + 6 * SIZE], b3 FADD c03, t3, c03 FMUL a3, b4, t3 LDF [AO + 10 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO LDF [BO + 7 * SIZE], b4 bg,pt %icc, .LL252 add BO, 4 * SIZE, BO .LL255: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 2, L #else add KK, 1, L #endif and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL259 nop .LL256: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 LDF [AO + 2 * SIZE], a1 FADD c02, t2, c02 cmp L, 0 FMUL a2, b1, t2 LDF [AO + 3 * SIZE], a2 LDF [BO + 1 * SIZE], b1 add AO, 2 * SIZE, AO bg,pt %icc, .LL256 add BO, 1 * SIZE, BO .LL259: #ifndef TRMMKERNEL FADD c01, t1, c01 LDF [C1 + 0 * SIZE], a1 FADD c02, t2, c02 LDF [C1 + 1 * SIZE], a2 FADD c03, t3, c03 FADD c04, t4, c04 FADD c01, c03, c01 FADD c02, c04, c02 FMUL c01, ALPHA, c01 FMUL c02, ALPHA, c02 FADD c01, a1, c01 FADD c02, a2, c02 STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] add C1, 2 * SIZE, C1 #else FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FADD c01, c03, c01 FADD c02, c04, c02 FMUL c01, ALPHA, c01 FMUL c02, ALPHA, c02 STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] add C1, 2 * SIZE, C1 #if ( defined(LEFT) 
&& defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -2, TEMP1 #else add TEMP1, -1, TEMP1 #endif sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 2, KK #endif #endif .LL270: and M, 1, I cmp I, 0 ble,pn %icc, .LL999 nop .LL271: #if !defined(TRMMKERNEL) LDF [AO + 0 * SIZE], a1 sra K, 2, L FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 mov B, BO FMOV FZERO, c01 LDF [AO + 2 * SIZE], a3 cmp L, 0 FMOV FZERO, t2 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c02 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t3 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t4 LDF [BO + 2 * SIZE], b3 #else #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov B, BO #else sll KK, 0 + BASE_SHIFT, TEMP1 sll KK, 0 + BASE_SHIFT, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 1, L #endif sra L, 2, L cmp L, 0 LDF [AO + 0 * SIZE], a1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c01 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t2 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c02 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t3 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t4 LDF [BO + 2 * SIZE], b3 #endif ble,pn %icc, .LL275 LDF [BO + 3 * SIZE], b4 .LL272: FADD c01, t1, c01 add L, -1, L add AO, 4 * SIZE, AO FMUL a1, b1, t1 add BO, 4 * SIZE, BO LDF [AO + 0 * SIZE], a1 FADD c02, t2, c02 cmp L, 0 LDF [BO + 0 * SIZE], b1 FMUL a2, b2, t2 LDF [AO + 1 * SIZE], a2 FADD c01, t3, c01 LDF [BO + 1 * SIZE], b2 FMUL a3, b3, t3 LDF [AO + 2 * SIZE], a3 FADD c02, t4, c02 LDF [BO + 2 * SIZE], b3 FMUL a4, b4, t4 LDF [AO + 3 * SIZE], a4 bg,pt %icc, .LL272 LDF [BO + 3 * SIZE], b4 .LL275: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 1, L #endif and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL279 nop .LL276: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 LDF [AO + 1 * SIZE], a1 LDF [BO + 1 * SIZE], b1 add BO, 1 * SIZE, BO cmp L, 0 bg,pt %icc, .LL276 add AO, 1 * SIZE, AO .LL279: #ifndef TRMMKERNEL FADD c01, t1, c01 LDF [C1 + 0 * SIZE], a1 FADD c02, t2, c02 FADD c01, t3, c01 FADD c02, t4, c02 FADD c01, c02, c01 FMUL c01, ALPHA, c01 FADD c01, a1, c01 STF c01, [C1 + 0 * SIZE] #else FADD c01, t1, c01 FADD c02, t2, c02 FADD c01, t3, c01 FADD c02, t4, c02 FADD c01, c02, c01 FMUL c01, ALPHA, c01 STF c01, [C1 + 0 * SIZE] #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -1, TEMP1 #else add TEMP1, -1, TEMP1 #endif sll TEMP1, 0 + BASE_SHIFT, TEMP2 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 1, KK #endif #endif .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/gemm_kernel_2x8.S000066400000000000000000001426721313527062700206620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2005-2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define APREFETCHSIZE 24 #define APREFETCH_CATEGORY 0 #define M %i0 #define N %i1 #define K %i2 #if defined(DOUBLE) && !defined(__64BIT__) #define A %i5 #define B %i4 #else #define A %i4 #define B %i5 #endif #define C %o4 #define LDC %o5 #define AO %l0 #define BO %l1 #define I %l2 #define J %l3 #define L %l4 #define BB %o7 #define C1 %o0 #define C2 %o1 #define C3 %o2 #define C4 %o3 #define C5 %l5 #define C6 %l6 #define C7 %l7 #define C8 %i3 #define OFFSET %g1 #define KK %g2 #define TEMP1 %g3 #define TEMP2 %g4 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #define a1 %f32 #define a2 %f34 #define a3 %f36 #define a4 %f38 #define a5 %f40 #define b1 %f42 #define b2 %f44 #define b3 %f46 #define b4 %f48 #define b5 %f50 #define b6 %f52 #define b7 %f54 #define b8 %f56 #define b9 %f58 #define ALPHA %f62 #define cc01 0 #define cc02 2 #define cc03 4 #define cc04 6 #define cc05 8 #define cc06 10 #define cc07 12 #define cc08 14 #define cc09 16 #define cc10 18 #define cc11 20 #define cc12 22 #define cc13 24 #define cc14 26 #define cc15 28 #define cc16 30 #define aa1 1 #define aa2 3 #define aa3 5 #define aa4 7 #define aa5 9 #define bb1 11 #define bb2 13 #define bb3 15 #define bb4 17 #define bb5 19 #define bb6 21 #define bb7 23 #define bb8 25 #define bb9 27 #define alpha 31 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #define a1 %f16 #define a2 %f17 #define a3 %f18 #define a4 %f19 #define a5 %f20 #define b1 %f21 #define b2 %f22 #define b3 %f23 #define b4 %f24 #define b5 %f25 #define b6 %f26 #define b7 %f27 #define b8 %f28 #define b9 %f29 #define ALPHA %f31 #define cc01 0 #define cc02 1 #define cc03 2 #define cc04 3 #define cc05 4 #define 
cc06 5 #define cc07 6 #define cc08 7 #define cc09 8 #define cc10 9 #define cc11 10 #define cc12 11 #define cc13 12 #define cc14 13 #define cc15 14 #define cc16 15 #define aa1 16 #define aa2 17 #define aa3 18 #define aa4 19 #define aa5 20 #define bb1 21 #define bb2 22 #define bb3 23 #define bb4 24 #define bb5 25 #define bb6 26 #define bb7 27 #define bb8 28 #define bb9 29 #define alpha 31 #endif .register %g2, #scratch .register %g3, #scratch PROLOGUE SAVESP nop #ifndef __64BIT__ #ifdef DOUBLE st %i3, [%sp + STACK_START + 16] st %i4, [%sp + STACK_START + 20] ld [%sp + STACK_START + 28], B ld [%sp + STACK_START + 32], C ld [%sp + STACK_START + 36], LDC #ifdef TRMMKERNEL ld [%sp + STACK_START + 40], OFFSET #endif #else st %i3, [%sp + STACK_START + 16] ld [%sp + STACK_START + 28], C ld [%sp + STACK_START + 32], LDC #ifdef TRMMKERNEL ld [%sp + STACK_START + 36], OFFSET #endif #endif LDF [%sp + STACK_START + 16], ALPHA #ifdef TRMMKERNEL st %g1, [%sp + STACK_START + 8] st %g2, [%sp + STACK_START + 12] st %g3, [%sp + STACK_START + 16] st %g4, [%sp + STACK_START + 20] #endif #else ldx [%sp+ STACK_START + 56], C ldx [%sp+ STACK_START + 64], LDC #ifdef TRMMKERNEL ldx [%sp+ STACK_START + 72], OFFSET #endif #ifdef DOUBLE FMOV %f6, ALPHA #else FMOV %f7, ALPHA #endif #ifdef TRMMKERNEL stx %g1, [%sp + STACK_START + 32] stx %g2, [%sp + STACK_START + 40] stx %g3, [%sp + STACK_START + 48] stx %g4, [%sp + STACK_START + 56] #endif #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg OFFSET, KK #endif sra N, 3, J cmp J, 0 ble,pn %icc, .LL30 sll LDC, BASE_SHIFT, LDC .LL11: mov C, C1 add C, LDC, C2 add C2, LDC, C3 add C3, LDC, C4 add C4, LDC, C5 add C5, LDC, C6 add C6, LDC, C7 add C7, LDC, C8 add C8, LDC, C sll K, BASE_SHIFT + 3, BB #if defined(TRMMKERNEL) && defined(LEFT) mov OFFSET, KK #endif mov A, AO sra M, 1, I cmp I, 0 ble,pn %icc, .LL20 add B, BB, BB .align 4 .LL12: prefetch [BB + 0 * SIZE], 1 #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) mov B, BO #else sll KK, BASE_SHIFT + 1, TEMP1 sll KK, BASE_SHIFT + 3, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 8 * SIZE], a5 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FCLR (cc01) LDF [BO + 2 * SIZE], b3 FCLR (cc05) LDF [BO + 3 * SIZE], b4 FCLR (cc09) LDF [BO + 4 * SIZE], b5 FCLR (cc13) LDF [BO + 5 * SIZE], b6 FCLR (cc02) LDF [BO + 6 * SIZE], b7 FCLR (cc06) LDF [BO + 7 * SIZE], b8 FCLR (cc10) LDF [BO + 8 * SIZE], b9 FCLR (cc14) prefetch [C1 + 1 * SIZE], 3 FCLR (cc03) prefetch [C2 + 2 * SIZE], 3 FCLR (cc07) prefetch [C3 + 1 * SIZE], 3 FCLR (cc11) prefetch [C4 + 2 * SIZE], 3 FCLR (cc15) prefetch [C5 + 1 * SIZE], 3 FCLR (cc04) prefetch [C6 + 2 * SIZE], 3 FCLR (cc08) prefetch [C7 + 1 * SIZE], 3 FCLR (cc12) prefetch [C8 + 2 * SIZE], 3 FCLR (cc16) #ifndef TRMMKERNEL sra K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 2, L #else add KK, 8, L #endif sra L, 3, L #endif cmp L, 0 ble,pn %icc, .LL15 add BB, 32 * SIZE, BB .align 4 .LL13: FMADD (aa1, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa1, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa1, bb3, cc05, cc05) LDF [BO + 16 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [AO + 2 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO 
+ 3 * SIZE], a4 FMADD (aa1, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa1, bb7, cc13, cc13) LDF [BO + 12 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 14 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 15 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 24 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 17 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 18 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 19 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 4 * SIZE], a1 FMADD (aa4, bb5, cc10, cc10) LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) add L, -1, L FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 20 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 21 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 22 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 23 * SIZE], b8 FMADD (aa1, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa1, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa1, bb3, cc05, cc05) LDF [BO + 32 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 25 * SIZE], b2 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 26 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 27 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [AO + 6 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 7 * SIZE], a4 FMADD (aa1, bb6, cc11, cc11) nop FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa1, bb7, cc13, cc13) LDF [BO + 28 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 29 * SIZE], b6 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 30 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 31 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 40 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 33 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 34 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 35 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 16 * SIZE], a1 /****/ FMADD (aa4, bb5, cc10, cc10) LDF [AO + 9 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) nop FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 36 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 37 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 38 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 39 * SIZE], b8 FMADD (aa5, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa5, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa5, bb3, cc05, cc05) LDF [BO + 48 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 41 * SIZE], b2 FMADD (aa5, bb4, cc07, cc07) LDF [BO + 42 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 43 * SIZE], b4 FMADD (aa5, bb5, cc09, cc09) LDF [AO + 10 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 11 * SIZE], a4 FMADD (aa5, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa5, bb7, cc13, cc13) LDF [BO + 44 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 45 * SIZE], b6 FMADD (aa5, bb8, cc15, cc15) LDF [BO + 46 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 47 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 56 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 49 * SIZE], b2 FMADD (aa3, 
bb4, cc07, cc07) LDF [BO + 50 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 51 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 12 * SIZE], a5 FMADD (aa4, bb5, cc10, cc10) LDF [AO + 13 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) cmp L, 0 FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 52 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 53 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 54 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 55 * SIZE], b8 FMADD (aa5, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa5, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa5, bb3, cc05, cc05) LDF [BO + 64 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 57 * SIZE], b2 FMADD (aa5, bb4, cc07, cc07) LDF [BO + 58 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 59 * SIZE], b4 FMADD (aa5, bb5, cc09, cc09) LDF [AO + 14 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 15 * SIZE], a4 FMADD (aa5, bb6, cc11, cc11) add BO, 64 * SIZE, BO FMADD (aa2, bb6, cc12, cc12) add AO, 16 * SIZE, AO FMADD (aa5, bb7, cc13, cc13) LDF [BO - 4 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO - 3 * SIZE], b6 FMADD (aa5, bb8, cc15, cc15) LDF [BO - 2 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO - 1 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 8 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 1 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 2 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 8 * SIZE], a5 /****/ FMADD (aa4, bb5, cc10, cc10) LDF [AO + 1 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) FMADD (aa4, bb6, cc12, cc12) FMADD (aa3, bb7, cc13, cc13) LDF [BO + 4 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 5 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 6 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) ble,pn %icc, .LL15 LDF [BO + 7 * SIZE], b8 FMADD (aa1, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa1, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa1, bb3, cc05, cc05) LDF [BO + 16 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [AO + 2 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 3 * SIZE], a4 FMADD (aa1, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa1, bb7, cc13, cc13) LDF [BO + 12 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 14 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 15 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 24 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 17 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 18 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 19 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 4 * SIZE], a1 FMADD (aa4, bb5, cc10, cc10) LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) add L, -1, L FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 20 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 21 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 22 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 23 * SIZE], b8 FMADD (aa1, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa1, 
bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa1, bb3, cc05, cc05) LDF [BO + 32 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 25 * SIZE], b2 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 26 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 27 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [AO + 6 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 7 * SIZE], a4 FMADD (aa1, bb6, cc11, cc11) nop FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa1, bb7, cc13, cc13) LDF [BO + 28 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 29 * SIZE], b6 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 30 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 31 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 40 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 33 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 34 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 35 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 16 * SIZE], a1 /****/ FMADD (aa4, bb5, cc10, cc10) LDF [AO + 9 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) nop FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 36 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 37 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 38 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 39 * SIZE], b8 FMADD (aa5, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa5, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa5, bb3, cc05, cc05) LDF [BO + 48 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 41 * SIZE], b2 FMADD (aa5, bb4, cc07, cc07) LDF [BO + 42 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 43 * SIZE], b4 FMADD (aa5, bb5, cc09, cc09) LDF [AO + 10 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 11 * SIZE], a4 FMADD (aa5, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa5, bb7, cc13, cc13) LDF [BO + 44 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 45 * SIZE], b6 FMADD (aa5, bb8, cc15, cc15) LDF [BO + 46 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 47 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 56 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 49 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 50 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 51 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 12 * SIZE], a5 FMADD (aa4, bb5, cc10, cc10) LDF [AO + 13 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) cmp L, 0 FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 52 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 53 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 54 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 55 * SIZE], b8 FMADD (aa5, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa5, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa5, bb3, cc05, cc05) LDF [BO + 64 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 57 * SIZE], b2 FMADD (aa5, bb4, cc07, cc07) LDF [BO + 58 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 59 * SIZE], b4 FMADD (aa5, bb5, cc09, cc09) LDF [AO + 14 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 15 * SIZE], a4 FMADD (aa5, bb6, cc11, cc11) add BO, 64 * SIZE, BO FMADD (aa2, bb6, cc12, cc12) add AO, 16 * SIZE, AO FMADD (aa5, bb7, cc13, cc13) LDF [BO - 4 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO - 3 * SIZE], b6 FMADD (aa5, bb8, 
cc15, cc15) LDF [BO - 2 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO - 1 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 8 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 1 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 2 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 8 * SIZE], a5 /****/ FMADD (aa4, bb5, cc10, cc10) LDF [AO + 1 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) FMADD (aa4, bb6, cc12, cc12) FMADD (aa3, bb7, cc13, cc13) LDF [BO + 4 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 5 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 6 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) bg,pt %icc, .LL13 LDF [BO + 7 * SIZE], b8 .align 4 .LL15: #ifndef TRMMKERNEL and K, 7, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 2, L #else add KK, 8, L #endif and L, 7, L #endif cmp L, 0 ble,a,pn %icc, .LL18 nop .align 4 .LL17: FMADD (aa1, bb1, cc01, cc01) add L, -1, L FMADD (aa2, bb1, cc02, cc02) nop FMADD (aa1, bb2, cc03, cc03) LDF [BO + 8 * SIZE], b1 FMADD (aa2, bb2, cc04, cc04) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) cmp L, 0 FMADD (aa2, bb3, cc06, cc06) nop FMADD (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) nop FMADD (aa2, bb5, cc10, cc10) nop FMADD (aa1, bb6, cc11, cc11) LDF [BO + 12 * SIZE], b5 FMADD (aa2, bb6, cc12, cc12) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb7, cc13, cc13) add AO, 2 * SIZE, AO FMADD (aa2, bb7, cc14, cc14) add BO, 8 * SIZE, BO FMADD (aa1, bb8, cc15, cc15) LDF [AO + 0 * SIZE], a1 FMADD (aa2, bb8, cc16, cc16) LDF [AO + 1 * SIZE], a2 LDF [BO + 6 * SIZE], b7 bg,pt %icc, .LL17 LDF [BO + 7 * SIZE], b8 nop .align 4 .LL18: #ifndef TRMMKERNEL LDF [C1 + 0 * SIZE], a1 LDF [C1 + 1 * SIZE], a2 LDF [C2 + 0 * SIZE], a3 LDF [C2 + 1 * SIZE], a4 LDF [C3 + 0 * SIZE], b1 LDF [C3 + 1 * SIZE], b2 LDF [C4 + 0 * SIZE], b3 LDF [C4 + 1 * SIZE], b4 FMADD (alpha, cc01, aa1, cc01) LDF [C5 + 0 * SIZE], a1 FMADD (alpha, cc02, aa2, cc02) LDF [C5 + 1 * SIZE], a2 FMADD (alpha, cc03, aa3, cc03) LDF [C6 + 0 * SIZE], a3 FMADD (alpha, cc04, aa4, cc04) LDF [C6 + 1 * SIZE], a4 FMADD (alpha, cc05, bb1, cc05) LDF [C7 + 0 * SIZE], b1 FMADD (alpha, cc06, bb2, cc06) LDF [C7 + 1 * SIZE], b2 FMADD (alpha, cc07, bb3, cc07) LDF [C8 + 0 * SIZE], b3 FMADD (alpha, cc08, bb4, cc08) LDF [C8 + 1 * SIZE], b4 FMADD (alpha, cc09, aa1, cc09) STF c01, [C1 + 0 * SIZE] FMADD (alpha, cc10, aa2, cc10) STF c02, [C1 + 1 * SIZE] FMADD (alpha, cc11, aa3, cc11) STF c03, [C2 + 0 * SIZE] FMADD (alpha, cc12, aa4, cc12) STF c04, [C2 + 1 * SIZE] FMADD (alpha, cc13, bb1, cc13) STF c05, [C3 + 0 * SIZE] FMADD (alpha, cc14, bb2, cc14) STF c06, [C3 + 1 * SIZE] FMADD (alpha, cc15, bb3, cc15) STF c07, [C4 + 0 * SIZE] FMADD (alpha, cc16, bb4, cc16) STF c08, [C4 + 1 * SIZE] #else FMUL ALPHA, c01, c01 FMUL ALPHA, c02, c02 FMUL ALPHA, c03, c03 FMUL ALPHA, c04, c04 FMUL ALPHA, c05, c05 FMUL ALPHA, c06, c06 FMUL ALPHA, c07, c07 FMUL ALPHA, c08, c08 FMUL ALPHA, c09, c09 STF c01, [C1 + 0 * SIZE] FMUL ALPHA, c10, c10 STF c02, [C1 + 1 * SIZE] FMUL ALPHA, c11, c11 STF c03, [C2 + 0 * SIZE] FMUL ALPHA, c12, c12 STF c04, [C2 + 1 * SIZE] FMUL ALPHA, c13, c13 STF c05, [C3 + 0 * SIZE] FMUL ALPHA, c14, c14 STF c06, [C3 + 1 * SIZE] FMUL ALPHA, c15, c15 STF c07, [C4 + 0 * SIZE] FMUL ALPHA, c16, c16 STF c08, [C4 + 1 * 
SIZE] #endif STF c09, [C5 + 0 * SIZE] add C1, 2 * SIZE, C1 STF c10, [C5 + 1 * SIZE] add C2, 2 * SIZE, C2 STF c11, [C6 + 0 * SIZE] add C3, 2 * SIZE, C3 STF c12, [C6 + 1 * SIZE] add C4, 2 * SIZE, C4 STF c13, [C7 + 0 * SIZE] add C5, 2 * SIZE, C5 STF c14, [C7 + 1 * SIZE] add C6, 2 * SIZE, C6 STF c15, [C8 + 0 * SIZE] add C7, 2 * SIZE, C7 STF c16, [C8 + 1 * SIZE] add C8, 2 * SIZE, C8 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -2, TEMP1 #else add TEMP1, -8, TEMP1 #endif sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 3, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 2, KK #endif #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL12 nop .align 4 .LL20: and M, 1, I cmp I, 0 ble,pn %icc, .LL29 nop #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) mov B, BO #else sll KK, BASE_SHIFT + 0, TEMP1 sll KK, BASE_SHIFT + 3, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 FCLR (cc01) LDF [BO + 1 * SIZE], b2 FCLR (cc03) LDF [BO + 2 * SIZE], b3 FCLR (cc05) LDF [BO + 3 * SIZE], b4 FCLR (cc07) LDF [BO + 4 * SIZE], b5 FCLR (cc09) LDF [BO + 5 * SIZE], b6 FCLR (cc11) LDF [BO + 6 * SIZE], b7 FCLR (cc13) LDF [BO + 7 * SIZE], b8 FCLR (cc15) #ifndef TRMMKERNEL sra K, 2, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 8, L #endif sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL25 LDF [BO + 8 * SIZE], b9 .align 4 .LL23: prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY add L, -1, L FMADD (aa1, bb1, cc01, cc01) LDF [BO + 16 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) LDF [BO + 10 * SIZE], b3 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [BO + 12 * SIZE], b5 FMADD (aa1, bb6, cc11, cc11) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb7, cc13, cc13) LDF [BO + 14 * SIZE], b7 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 15 * SIZE], b8 FMADD (aa2, bb9, cc01, cc01) LDF [BO + 24 * SIZE], b9 FMADD (aa2, bb2, cc03, cc03) LDF [BO + 17 * SIZE], b2 FMADD (aa2, bb3, cc05, cc05) LDF [BO + 18 * SIZE], b3 FMADD (aa2, bb4, cc07, cc07) LDF [BO + 19 * SIZE], b4 FMADD (aa2, bb5, cc09, cc09) LDF [BO + 20 * SIZE], b5 FMADD (aa2, bb6, cc11, cc11) LDF [BO + 21 * SIZE], b6 FMADD (aa2, bb7, cc13, cc13) LDF [BO + 22 * SIZE], b7 FMADD (aa2, bb8, cc15, cc15) LDF [BO + 23 * SIZE], b8 LDF [AO + 4 * SIZE], a1 LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb1, cc01, cc01) LDF [BO + 32 * SIZE], b1 FMADD (aa3, bb2, cc03, cc03) LDF [BO + 25 * SIZE], b2 FMADD (aa3, bb3, cc05, cc05) LDF [BO + 26 * SIZE], b3 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 27 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [BO + 28 * SIZE], b5 FMADD (aa3, bb6, cc11, cc11) LDF [BO + 29 * SIZE], b6 FMADD (aa3, bb7, cc13, cc13) LDF [BO + 30 * SIZE], b7 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 31 * SIZE], b8 FMADD (aa4, bb9, cc01, cc01) LDF [BO + 40 * SIZE], b9 FMADD (aa4, bb2, cc03, cc03) LDF [BO + 33 * SIZE], b2 FMADD (aa4, bb3, cc05, cc05) LDF [BO + 34 * SIZE], b3 FMADD (aa4, bb4, cc07, cc07) LDF [BO + 35 * SIZE], b4 FMADD (aa4, bb5, cc09, cc09) LDF [BO + 36 * SIZE], b5 FMADD (aa4, bb6, cc11, cc11) LDF [BO + 37 * SIZE], b6 FMADD (aa4, bb7, cc13, cc13) LDF [BO + 38 * SIZE], b7 FMADD (aa4, bb8, cc15, cc15) LDF 
[BO + 39 * SIZE], b8 LDF [AO + 6 * SIZE], a3 LDF [AO + 7 * SIZE], a4 add AO, 4 * SIZE, AO cmp L, 0 bg,pt %icc, .LL23 add BO, 32 * SIZE, BO .align 4 .LL25: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 8, L #endif and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL28 nop .align 4 .LL27: FMADD (aa1, bb1, cc01, cc01) LDF [BO + 8 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) LDF [BO + 10 * SIZE], b3 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [BO + 12 * SIZE], b5 FMADD (aa1, bb6, cc11, cc11) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb7, cc13, cc13) LDF [BO + 14 * SIZE], b7 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 15 * SIZE], b8 LDF [AO + 1 * SIZE], a1 add AO, 1 * SIZE, AO add L, -1, L cmp L, 0 bg,pt %icc, .LL27 add BO, 8 * SIZE, BO .align 4 .LL28: #ifndef TRMMKERNEL LDF [C1 + 0 * SIZE], a1 LDF [C2 + 0 * SIZE], a2 LDF [C3 + 0 * SIZE], a3 LDF [C4 + 0 * SIZE], a4 FMADD (alpha, cc01, aa1, cc01) LDF [C5 + 0 * SIZE], b1 FMADD (alpha, cc03, aa2, cc03) LDF [C6 + 0 * SIZE], b2 FMADD (alpha, cc05, aa3, cc05) LDF [C7 + 0 * SIZE], b3 FMADD (alpha, cc07, aa4, cc07) LDF [C8 + 0 * SIZE], b4 FMADD (alpha, cc09, bb1, cc09) STF c01, [C1 + 0 * SIZE] FMADD (alpha, cc11, bb2, cc11) STF c03, [C2 + 0 * SIZE] FMADD (alpha, cc13, bb3, cc13) STF c05, [C3 + 0 * SIZE] FMADD (alpha, cc15, bb4, cc15) STF c07, [C4 + 0 * SIZE] #else FMUL ALPHA, c01, c01 FMUL ALPHA, c03, c03 FMUL ALPHA, c05, c05 FMUL ALPHA, c07, c07 FMUL ALPHA, c09, c09 STF c01, [C1 + 0 * SIZE] FMUL ALPHA, c11, c11 STF c03, [C2 + 0 * SIZE] FMUL ALPHA, c13, c13 STF c05, [C3 + 0 * SIZE] FMUL ALPHA, c15, c15 STF c07, [C4 + 0 * SIZE] #endif STF c09, [C5 + 0 * SIZE] STF c11, [C6 + 0 * SIZE] STF c13, [C7 + 0 * SIZE] STF c15, [C8 + 0 * SIZE] #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -1, TEMP1 #else add TEMP1, -8, TEMP1 #endif sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 3, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 1, KK #endif #endif .align 4 .LL29: #if defined(TRMMKERNEL) && !defined(LEFT) add KK, 8, KK #endif add J, -1, J cmp J, 0 bg,pt %icc, .LL11 mov BO, B .align 4 .LL30: and N, 4, J cmp J, 0 ble,pn %icc, .LL50 mov C, C1 add C, LDC, C2 add C2, LDC, C3 add C3, LDC, C4 add C4, LDC, C #if defined(TRMMKERNEL) && defined(LEFT) mov OFFSET, KK #endif sra M, 1, I cmp I, 0 ble,pn %icc, .LL40 mov A, AO .align 4 .LL32: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) mov B, BO #else sll KK, BASE_SHIFT + 1, TEMP1 sll KK, BASE_SHIFT + 2, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 LDF [BO + 3 * SIZE], b4 LDF [BO + 4 * SIZE], b5 LDF [BO + 5 * SIZE], b6 FCLR (cc01) LDF [BO + 6 * SIZE], b7 FCLR (cc02) LDF [BO + 7 * SIZE], b8 FCLR (cc03) LDF [BO + 8 * SIZE], b9 FCLR (cc04) prefetch [C1 + 2 * SIZE], 3 FCLR (cc05) prefetch [C2 + 2 * SIZE], 3 FCLR (cc06) prefetch [C3 + 2 * SIZE], 3 FCLR (cc07) prefetch [C4 + 2 * SIZE], 3 FCLR (cc08) #ifndef TRMMKERNEL sra K, 2, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 2, L #else add KK, 4, L #endif sra L, 2, L #endif cmp 
L, 0 ble,pn %icc, .LL35 nop .align 4 .LL33: FMADD (aa1, bb1, cc01, cc01) LDF [AO + 2 * SIZE], a3 FMADD (aa2, bb1, cc02, cc02) LDF [AO + 3 * SIZE], a4 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 16 * SIZE], b1 FMADD (aa2, bb2, cc04, cc04) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb3, cc06, cc06) add L, -1, L FMADD (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD (aa3, bb5, cc01, cc01) LDF [AO + 4 * SIZE], a1 FMADD (aa4, bb5, cc02, cc02) LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb6, cc03, cc03) LDF [BO + 12 * SIZE], b5 FMADD (aa4, bb6, cc04, cc04) LDF [BO + 13 * SIZE], b6 FMADD (aa3, bb7, cc05, cc05) cmp L, 0 FMADD (aa4, bb7, cc06, cc06) add AO, 8 * SIZE, AO FMADD (aa3, bb8, cc07, cc07) LDF [BO + 14 * SIZE], b7 FMADD (aa4, bb8, cc08, cc08) LDF [BO + 15 * SIZE], b8 FMADD (aa1, bb9, cc01, cc01) LDF [AO - 2 * SIZE], a3 FMADD (aa2, bb9, cc02, cc02) LDF [AO - 1 * SIZE], a4 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 24 * SIZE], b9 FMADD (aa2, bb2, cc04, cc04) LDF [BO + 17 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) add BO, 16 * SIZE, BO FMADD (aa2, bb3, cc06, cc06) nop FMADD (aa1, bb4, cc07, cc07) LDF [BO + 2 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD (aa3, bb5, cc01, cc01) LDF [AO + 0 * SIZE], a1 FMADD (aa4, bb5, cc02, cc02) LDF [AO + 1 * SIZE], a2 FMADD (aa3, bb6, cc03, cc03) LDF [BO + 4 * SIZE], b5 FMADD (aa4, bb6, cc04, cc04) LDF [BO + 5 * SIZE], b6 FMADD (aa3, bb7, cc05, cc05) nop FMADD (aa4, bb7, cc06, cc06) LDF [BO + 6 * SIZE], b7 FMADD (aa3, bb8, cc07, cc07) FMADD (aa4, bb8, cc08, cc08) bg,pt %icc, .LL33 LDF [BO + 7 * SIZE], b8 .align 4 .LL35: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 2, L #else add KK, 4, L #endif and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL38 nop .align 4 .LL37: FMADD (aa1, bb1, cc01, cc01) add L, -1, L FMADD (aa2, bb1, cc02, cc02) LDF [BO + 4 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) add AO, 2 * SIZE, AO FMADD (aa2, bb2, cc04, cc04) LDF [BO + 5 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) cmp L, 0 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 6 * SIZE], b3 FMADD (aa1, bb4, cc07, cc07) LDF [AO + 0 * SIZE], a1 FMADD (aa2, bb4, cc08, cc08) LDF [AO + 1 * SIZE], a2 LDF [BO + 7 * SIZE], b4 bg,pt %icc, .LL37 add BO, 4 * SIZE, BO .align 4 .LL38: #ifndef TRMMKERNEL LDF [C1 + 0 * SIZE], a1 LDF [C1 + 1 * SIZE], a2 LDF [C2 + 0 * SIZE], a3 LDF [C2 + 1 * SIZE], a4 FMADD (alpha, cc01, aa1, cc01) LDF [C3 + 0 * SIZE], b1 FMADD (alpha, cc02, aa2, cc02) LDF [C3 + 1 * SIZE], b2 FMADD (alpha, cc03, aa3, cc03) LDF [C4 + 0 * SIZE], b3 FMADD (alpha, cc04, aa4, cc04) LDF [C4 + 1 * SIZE], b4 FMADD (alpha, cc05, bb1, cc05) STF c01, [C1 + 0 * SIZE] FMADD (alpha, cc06, bb2, cc06) STF c02, [C1 + 1 * SIZE] FMADD (alpha, cc07, bb3, cc07) STF c03, [C2 + 0 * SIZE] FMADD (alpha, cc08, bb4, cc08) STF c04, [C2 + 1 * SIZE] #else FMUL ALPHA, c01, c01 FMUL ALPHA, c02, c02 FMUL ALPHA, c03, c03 FMUL ALPHA, c04, c04 FMUL ALPHA, c05, c05 STF c01, [C1 + 0 * SIZE] FMUL ALPHA, c06, c06 STF c02, [C1 + 1 * SIZE] FMUL ALPHA, c07, c07 STF c03, [C2 + 0 * SIZE] FMUL ALPHA, c08, c08 STF c04, [C2 + 1 * SIZE] #endif STF c05, [C3 + 0 * SIZE] add C1, 2 * SIZE, C1 STF c06, [C3 + 1 * SIZE] add C2, 2 * SIZE, C2 STF c07, [C4 + 0 * SIZE] add C3, 2 * SIZE, C3 STF c08, [C4 + 1 * SIZE] add C4, 2 * SIZE, C4 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ 
(!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -2, TEMP1 #else add TEMP1, -4, TEMP1 #endif sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 2, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 2, KK #endif #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL32 nop .LL40: and M, 1, I cmp I, 0 ble,pn %icc, .LL49 nop #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) mov B, BO #else sll KK, BASE_SHIFT + 0, TEMP1 sll KK, BASE_SHIFT + 2, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 LDF [BO + 3 * SIZE], b4 LDF [BO + 4 * SIZE], b5 LDF [BO + 5 * SIZE], b6 FCLR (cc01) LDF [BO + 6 * SIZE], b7 FCLR (cc03) LDF [BO + 7 * SIZE], b8 FCLR (cc05) LDF [BO + 8 * SIZE], b9 FCLR (cc07) #ifndef TRMMKERNEL sra K, 2, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 4, L #endif sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL45 nop .LL43: prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY add L, -1, L FMADD (aa1, bb1, cc01, cc01) LDF [BO + 16 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) LDF [BO + 10 * SIZE], b3 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 11 * SIZE], b4 LDF [AO + 4 * SIZE], a1 cmp L, 0 FMADD (aa2, bb5, cc01, cc01) LDF [BO + 12 * SIZE], b5 FMADD (aa2, bb6, cc03, cc03) LDF [BO + 13 * SIZE], b6 FMADD (aa2, bb7, cc05, cc05) LDF [BO + 14 * SIZE], b7 FMADD (aa2, bb8, cc07, cc07) LDF [BO + 15 * SIZE], b8 LDF [AO + 5 * SIZE], a2 add AO, 4 * SIZE, AO FMADD (aa3, bb9, cc01, cc01) LDF [BO + 24 * SIZE], b9 FMADD (aa3, bb2, cc03, cc03) LDF [BO + 17 * SIZE], b2 FMADD (aa3, bb3, cc05, cc05) LDF [BO + 18 * SIZE], b3 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 19 * SIZE], b4 LDF [AO + 2 * SIZE], a3 add BO, 16 * SIZE, BO FMADD (aa4, bb5, cc01, cc01) LDF [BO + 4 * SIZE], b5 FMADD (aa4, bb6, cc03, cc03) LDF [BO + 5 * SIZE], b6 FMADD (aa4, bb7, cc05, cc05) LDF [BO + 6 * SIZE], b7 FMADD (aa4, bb8, cc07, cc07) LDF [BO + 7 * SIZE], b8 bg,pt %icc, .LL43 LDF [AO + 3 * SIZE], a4 .align 4 .LL45: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 4, L #endif and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL48 nop .align 4 .LL47: FMADD (aa1, bb1, cc01, cc01) LDF [BO + 4 * SIZE], b1 add L, -1, L FMADD (aa1, bb2, cc03, cc03) LDF [BO + 5 * SIZE], b2 add AO, 1 * SIZE, AO FMADD (aa1, bb3, cc05, cc05) LDF [BO + 6 * SIZE], b3 cmp L, 0 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 7 * SIZE], b4 add BO, 4 * SIZE, BO bg,pt %icc, .LL47 LDF [AO + 0 * SIZE], a1 .align 4 .LL48: #ifndef TRMMKERNEL LDF [C1 + 0 * SIZE], a1 LDF [C2 + 0 * SIZE], a2 LDF [C3 + 0 * SIZE], a3 LDF [C4 + 0 * SIZE], a4 FMADD (alpha, cc01, aa1, cc01) FMADD (alpha, cc03, aa2, cc03) FMADD (alpha, cc05, aa3, cc05) FMADD (alpha, cc07, aa4, cc07) #else FMUL ALPHA, c01, c01 FMUL ALPHA, c03, c03 FMUL ALPHA, c05, c05 FMUL ALPHA, c07, c07 #endif STF c01, [C1 + 0 * SIZE] STF c03, [C2 + 0 * SIZE] STF c05, [C3 + 0 * SIZE] STF c07, [C4 + 0 * SIZE] #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -1, TEMP1 #else add TEMP1, -4, TEMP1 #endif sll 
TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 2, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 1, KK #endif #endif .align 4 .LL49: #if defined(TRMMKERNEL) && !defined(LEFT) add KK, 4, KK #endif mov BO, B .align 4 .LL50: and N, 2, J cmp J, 0 ble,pn %icc, .LL70 mov C, C1 add C, LDC, C2 add C2, LDC, C #if defined(TRMMKERNEL) && defined(LEFT) mov OFFSET, KK #endif sra M, 1, I cmp I, 0 ble,pn %icc, .LL60 mov A, AO .align 4 .LL52: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) mov B, BO #else sll KK, BASE_SHIFT + 1, TEMP1 sll KK, BASE_SHIFT + 1, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 FCLR (cc01) LDF [BO + 3 * SIZE], b4 FCLR (cc02) LDF [BO + 4 * SIZE], b5 FCLR (cc03) LDF [BO + 5 * SIZE], b6 FCLR (cc04) LDF [BO + 6 * SIZE], b7 FCLR (cc05) LDF [BO + 7 * SIZE], b8 FCLR (cc06) prefetch [C1 + 2 * SIZE], 3 FCLR (cc07) prefetch [C2 + 2 * SIZE], 3 FCLR (cc08) #ifndef TRMMKERNEL sra K, 2, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 2, L #else add KK, 2, L #endif sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL55 nop .align 4 .LL53: FMADD (aa1, bb1, cc01, cc01) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb1, cc02, cc02) LDF [BO + 8 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [AO + 4 * SIZE], a1 FMADD (aa2, bb2, cc04, cc04) LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb3, cc01, cc01) LDF [BO + 9 * SIZE], b2 FMADD (aa4, bb3, cc02, cc02) LDF [BO + 10 * SIZE], b3 FMADD (aa3, bb4, cc03, cc03) LDF [AO + 6 * SIZE], a3 FMADD (aa4, bb4, cc04, cc04) LDF [AO + 7 * SIZE], a4 FMADD (aa1, bb5, cc01, cc01) LDF [BO + 11 * SIZE], b4 FMADD (aa2, bb5, cc02, cc02) LDF [BO + 12 * SIZE], b5 FMADD (aa1, bb6, cc03, cc03) LDF [AO + 8 * SIZE], a1 FMADD (aa2, bb6, cc04, cc04) LDF [AO + 9 * SIZE], a2 FMADD (aa3, bb7, cc01, cc01) LDF [BO + 13 * SIZE], b6 FMADD (aa4, bb7, cc02, cc02) LDF [BO + 14 * SIZE], b7 FMADD (aa3, bb8, cc03, cc03) LDF [AO + 10 * SIZE], a3 FMADD (aa4, bb8, cc04, cc04) LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO add L, -1, L add BO, 8 * SIZE, BO cmp L, 0 bg,pt %icc, .LL53 LDF [BO + 7 * SIZE], b8 .align 4 .LL55: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 2, L #else add KK, 2, L #endif and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL58 nop .align 4 .LL57: FMADD (aa1, bb1, cc01, cc01) add L, -1, L FMADD (aa2, bb1, cc02, cc02) LDF [BO + 2 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [AO + 2 * SIZE], a1 FMADD (aa2, bb2, cc04, cc04) LDF [AO + 3 * SIZE], a2 add AO, 2 * SIZE, AO cmp L, 0 add BO, 2 * SIZE, BO bg,pt %icc, .LL57 LDF [BO + 1 * SIZE], b2 .align 4 .LL58: #ifndef TRMMKERNEL LDF [C1 + 0 * SIZE], a1 LDF [C1 + 1 * SIZE], a2 LDF [C2 + 0 * SIZE], a3 LDF [C2 + 1 * SIZE], a4 FMADD (alpha, cc01, aa1, cc01) FMADD (alpha, cc02, aa2, cc02) FMADD (alpha, cc03, aa3, cc03) FMADD (alpha, cc04, aa4, cc04) #else FMUL ALPHA, c01, c01 FMUL ALPHA, c02, c02 FMUL ALPHA, c03, c03 FMUL ALPHA, c04, c04 #endif STF c01, [C1 + 0 * SIZE] add I, -1, I STF c02, [C1 + 1 * SIZE] add C1, 2 * SIZE, C1 STF c03, [C2 + 0 * SIZE] cmp I, 0 STF c04, [C2 + 1 * SIZE] add C2, 2 * SIZE, C2 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ 
(!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -2, TEMP1 #else add TEMP1, -2, TEMP1 #endif sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 1, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 2, KK #endif #endif bg,pt %icc, .LL52 nop .align 4 .LL60: and M, 1, I cmp I, 0 ble,pn %icc, .LL69 nop #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) mov B, BO #else sll KK, BASE_SHIFT + 0, TEMP1 sll KK, BASE_SHIFT + 1, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 LDF [BO + 3 * SIZE], b4 LDF [BO + 4 * SIZE], b5 LDF [BO + 5 * SIZE], b6 LDF [BO + 6 * SIZE], b7 FCLR (cc01) LDF [BO + 7 * SIZE], b8 FCLR (cc03) #ifndef TRMMKERNEL sra K, 2, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 2, L #endif sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL65 nop .align 4 .LL63: prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY add L, -1, L FMADD (aa1, bb1, cc01, cc01) LDF [BO + 8 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 9 * SIZE], b2 LDF [AO + 4 * SIZE], a1 cmp L, 0 FMADD (aa2, bb3, cc01, cc01) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc03, cc03) LDF [BO + 11 * SIZE], b4 LDF [AO + 5 * SIZE], a2 add AO, 4 * SIZE, AO FMADD (aa3, bb5, cc01, cc01) LDF [BO + 12 * SIZE], b5 FMADD (aa3, bb6, cc03, cc03) LDF [BO + 13 * SIZE], b6 LDF [AO + 2 * SIZE], a3 add BO, 8 * SIZE, BO FMADD (aa4, bb7, cc01, cc01) LDF [BO + 6 * SIZE], b7 FMADD (aa4, bb8, cc03, cc03) LDF [BO + 7 * SIZE], b8 bg,pt %icc, .LL63 LDF [AO + 3 * SIZE], a4 .align 4 .LL65: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 2, L #endif and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL68 nop .align 4 .LL67: FMADD (aa1, bb1, cc01, cc01) LDF [BO + 2 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 3 * SIZE], b2 LDF [AO + 1 * SIZE], a1 add L, -1, L add AO, 1 * SIZE, AO cmp L, 0 bg,pt %icc, .LL67 add BO, 2 * SIZE, BO .align 4 .LL68: #ifndef TRMMKERNEL LDF [C1 + 0 * SIZE], a1 LDF [C2 + 0 * SIZE], a2 FMADD (alpha, cc01, aa1, cc01) FMADD (alpha, cc03, aa2, cc03) #else FMUL ALPHA, c01, c01 FMUL ALPHA, c03, c03 #endif STF c01, [C1 + 0 * SIZE] STF c03, [C2 + 0 * SIZE] #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -1, TEMP1 #else add TEMP1, -2, TEMP1 #endif sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 1, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 1, KK #endif #endif .align 4 .LL69: #if defined(TRMMKERNEL) && !defined(LEFT) add KK, 2, KK #endif mov BO, B .align 4 .LL70: and N, 1, J cmp J, 0 ble,pn %icc, .LL999 mov C, C1 #if defined(TRMMKERNEL) && defined(LEFT) mov OFFSET, KK #endif sra M, 1, I cmp I, 0 ble,pn %icc, .LL80 mov A, AO .align 4 .LL72: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) mov B, BO #else sll KK, BASE_SHIFT + 1, TEMP1 sll KK, BASE_SHIFT + 0, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], 
b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 FCLR (cc01) LDF [BO + 3 * SIZE], b4 FCLR (cc02) prefetch [C1 + 2 * SIZE], 3 #ifndef TRMMKERNEL sra K, 2, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 2, L #else add KK, 1, L #endif sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL75 nop .LL73: prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY add L, -1, L FMADD (aa1, bb1, cc01, cc01) LDF [AO + 4 * SIZE], a1 FMADD (aa2, bb1, cc02, cc02) LDF [AO + 5 * SIZE], a2 LDF [BO + 4 * SIZE], b1 cmp L, 0 FMADD (aa3, bb2, cc01, cc01) LDF [AO + 6 * SIZE], a3 FMADD (aa4, bb2, cc02, cc02) LDF [AO + 7 * SIZE], a4 LDF [BO + 5 * SIZE], b2 add BO, 4 * SIZE, BO FMADD (aa1, bb3, cc01, cc01) LDF [AO + 8 * SIZE], a1 FMADD (aa2, bb3, cc02, cc02) LDF [AO + 9 * SIZE], a2 LDF [BO + 2 * SIZE], b3 add AO, 8 * SIZE, AO FMADD (aa3, bb4, cc01, cc01) LDF [AO + 2 * SIZE], a3 FMADD (aa4, bb4, cc02, cc02) LDF [AO + 3 * SIZE], a4 bg,pt %icc, .LL73 LDF [BO + 3 * SIZE], b4 .align 4 .LL75: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 2, L #else add KK, 1, L #endif and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL78 nop .align 4 .LL77: FMADD (aa1, bb1, cc01, cc01) LDF [AO + 2 * SIZE], a1 FMADD (aa2, bb1, cc02, cc02) LDF [AO + 3 * SIZE], a2 LDF [BO + 1 * SIZE], b1 add L, -1, L add AO, 2 * SIZE, AO cmp L, 0 bg,pt %icc, .LL77 add BO, 1 * SIZE, BO .align 4 .LL78: #ifndef TRMMKERNEL LDF [C1 + 0 * SIZE], a1 LDF [C1 + 1 * SIZE], a2 FMADD (alpha, cc01, aa1, cc01) FMADD (alpha, cc02, aa2, cc02) #else FMUL ALPHA, c01, c01 FMUL ALPHA, c02, c02 #endif STF c01, [C1 + 0 * SIZE] add I, -1, I STF c02, [C1 + 1 * SIZE] cmp I, 0 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -2, TEMP1 #else add TEMP1, -1, TEMP1 #endif sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 0, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 2, KK #endif #endif bg,pt %icc, .LL72 add C1, 2 * SIZE, C1 .align 4 .LL80: and M, 1, I cmp I, 0 ble,pn %icc, .LL999 nop #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) mov B, BO #else sll KK, BASE_SHIFT + 0, TEMP1 sll KK, BASE_SHIFT + 0, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [BO + 0 * SIZE], b1 LDF [AO + 1 * SIZE], a2 LDF [BO + 1 * SIZE], b2 LDF [AO + 2 * SIZE], a3 LDF [BO + 2 * SIZE], b3 LDF [AO + 3 * SIZE], a4 LDF [BO + 3 * SIZE], b4 #ifndef TRMMKERNEL sra K, 2, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 1, L #endif sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL85 FCLR (cc01) .align 4 .LL83: prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY add L, -1, L FMADD (aa1, bb1, cc01, cc01) LDF [AO + 4 * SIZE], a1 LDF [BO + 4 * SIZE], b1 FMADD (aa2, bb2, cc01, cc01) LDF [AO + 5 * SIZE], a2 LDF [BO + 5 * SIZE], b2 FMADD (aa3, bb3, cc01, cc01) LDF [AO + 6 * SIZE], a3 LDF [BO + 6 * SIZE], b3 FMADD (aa4, bb4, cc01, cc01) LDF [AO + 7 * SIZE], a4 LDF [BO + 7 * SIZE], b4 add AO, 4 * SIZE, AO cmp L, 0 bg,pt %icc, .LL83 add BO, 4 * SIZE, BO .align 4 .LL85: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add 
KK, 1, L #else add KK, 1, L #endif and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL88 nop .align 4 .LL87: FMADD (aa1, bb1, cc01, cc01) LDF [AO + 1 * SIZE], a1 LDF [BO + 1 * SIZE], b1 add AO, 1 * SIZE, AO add L, -1, L cmp L, 0 bg,pt %icc, .LL87 add BO, 1 * SIZE, BO .align 4 .LL88: #ifndef TRMMKERNEL LDF [C1 + 0 * SIZE], a1 FMADD (alpha, cc01, aa1, cc01) #else FMUL ALPHA, c01, c01 #endif STF c01, [C1 + 0 * SIZE] .align 4 .LL999: #ifdef TRMMKERNEL #ifndef __64BIT__ ld [%sp + STACK_START + 8], %g1 ld [%sp + STACK_START + 12], %g2 ld [%sp + STACK_START + 16], %g3 ld [%sp + STACK_START + 20], %g4 #else ldx [%sp + STACK_START + 32], %g1 ldx [%sp + STACK_START + 40], %g2 ldx [%sp + STACK_START + 48], %g3 ldx [%sp + STACK_START + 56], %g4 #endif #endif return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/gemm_ncopy.S000066400000000000000000000160131313527062700200160ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %i0 #define N %i1 #define A %i2 #define LDA %i3 #define B %i4 #define A1 %l0 #define A2 %l1 #define A3 %l2 #define A4 %l3 #define I %l4 #define J %l5 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #endif PROLOGUE SAVESP sra N, 2, J cmp J, 0 ble,pn %icc, .LL100 sll LDA, BASE_SHIFT, LDA .LL11: add A, LDA, A2 mov A, A1 add A2, LDA, A3 sra M, 2, I add A3, LDA, A4 cmp I, 0 ble,pn %icc, .LL15 add A4, LDA, A #define PREFETCHSIZE 36 #define WPREFETCHSIZE 20 .LL12: prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 0 * SIZE], c01 LDF [A2 + 0 * SIZE], c05 LDF [A3 + 0 * SIZE], c09 LDF [A4 + 0 * SIZE], c13 prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 1 * SIZE], c02 LDF [A2 + 1 * SIZE], c06 LDF [A3 + 1 * SIZE], c10 LDF [A4 + 1 * SIZE], c14 prefetch [A3 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 2 * SIZE], c03 LDF [A2 + 2 * SIZE], c07 LDF [A3 + 2 * SIZE], c11 LDF [A4 + 2 * SIZE], c15 prefetch [A4 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 3 * SIZE], c04 LDF [A2 + 3 * SIZE], c08 LDF [A3 + 3 * SIZE], c12 LDF [A4 + 3 * SIZE], c16 prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 STF c01, [B + 0 * SIZE] add A1, 4 * SIZE, A1 STF c05, [B + 1 * SIZE] add A2, 4 * SIZE, A2 STF c09, [B + 2 * SIZE] add A3, 4 * SIZE, A3 STF c13, [B + 3 * SIZE] add A4, 4 * SIZE, A4 STF c02, [B + 4 * SIZE] add I, -1, I STF c06, [B + 5 * SIZE] cmp I, 0 STF c10, [B + 6 * SIZE] STF c14, [B + 7 * SIZE] #ifdef DOUBLE prefetch [B + (WPREFETCHSIZE + 8) * SIZE], 2 #endif STF c03, [B + 8 * SIZE] STF c07, [B + 9 * SIZE] STF c11, [B + 10 * SIZE] STF c15, [B + 11 * SIZE] STF c04, [B + 12 * SIZE] STF c08, [B + 13 * SIZE] STF c12, [B + 14 * SIZE] STF c16, [B + 15 * SIZE] bg,pt %icc, .LL12 add B, 16 * SIZE, B .LL15: and M, 3, I cmp I, 0 ble,pn %icc, .LL99 nop .LL16: LDF [A1 + 0 * SIZE], c01 add A1, 1 * SIZE, A1 LDF [A2 + 0 * SIZE], c05 add A2, 1 * SIZE, A2 LDF [A3 + 0 * SIZE], c09 add A3, 1 * SIZE, A3 LDF [A4 + 0 * SIZE], c13 add A4, 1 * SIZE, A4 STF c01, [B + 0 * SIZE] add I, -1, I STF c05, [B + 1 * SIZE] cmp I, 0 STF c09, [B + 2 * SIZE] STF c13, [B + 3 * SIZE] bg,pt %icc, .LL16 add B, 4 * SIZE, B .LL99: add J, -1, J cmp J, 0 bg,pt %icc, .LL11 nop .LL100: and N, 2, J cmp J, 0 ble,pn %icc, .LL200 nop .LL111: sra M, 2, I add A, LDA, A2 cmp I, 0 mov A, A1 ble,pn %icc, .LL115 add A2, LDA, A .LL112: LDF [A1 + 0 * SIZE], c01 LDF [A2 + 0 * SIZE], c05 LDF [A1 + 1 * SIZE], c02 LDF [A2 + 1 * SIZE], c06 LDF [A1 + 2 * SIZE], c03 LDF [A2 + 2 * SIZE], c07 LDF [A1 + 3 * SIZE], c04 LDF [A2 + 3 * SIZE], c08 STF c01, [B + 0 * SIZE] add A1, 4 * SIZE, A1 STF c05, [B + 1 * SIZE] add A2, 4 * SIZE, A2 STF c02, [B + 2 * SIZE] add I, -1, I STF c06, [B + 3 * SIZE] cmp I, 0 STF c03, [B + 4 * SIZE] STF c07, [B + 5 * SIZE] STF c04, [B + 6 * SIZE] STF c08, [B + 7 * SIZE] bg,pt %icc, .LL112 add B, 8 * SIZE, B .LL115: and M, 3, I cmp I, 0 ble,pn %icc, .LL200 nop .LL116: LDF [A1 + 0 * SIZE], c01 add A1, 1 * SIZE, A1 add I, -1, I LDF [A2 + 0 * SIZE], c05 add 
A2, 1 * SIZE, A2 cmp I, 0 STF c01, [B + 0 * SIZE] STF c05, [B + 1 * SIZE] bg,pt %icc, .LL116 add B, 2 * SIZE, B .LL200: and N, 1, J cmp J, 0 ble,pn %icc, .LL999 nop .LL211: sra M, 2, I cmp I, 0 ble,pn %icc, .LL215 mov A, A1 .LL212: LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 LDF [A1 + 2 * SIZE], c03 LDF [A1 + 3 * SIZE], c04 STF c01, [B + 0 * SIZE] add I, -1, I STF c02, [B + 1 * SIZE] cmp I, 0 STF c03, [B + 2 * SIZE] add A1, 4 * SIZE, A1 STF c04, [B + 3 * SIZE] bg,pt %icc, .LL212 add B, 4 * SIZE, B .LL215: and M, 3, I cmp I, 0 ble,pn %icc, .LL999 nop .LL216: LDF [A1 + 0 * SIZE], c01 add A1, 1 * SIZE, A1 add I, -1, I cmp I, 0 STF c01, [B + 0 * SIZE] bg,pt %icc, .LL216 add B, 1 * SIZE, B .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/gemm_ncopy_2.S000066400000000000000000000133231313527062700202400ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2005-2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCHSIZE 72 #define WPREFETCHSIZE 20 #define M %i0 #define N %i1 #define A %i2 #define LDA %i3 #define B %i4 #define A1 %l0 #define A2 %l1 #define A3 %l2 #define A4 %l3 #define I %l4 #define J %l5 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #endif PROLOGUE SAVESP sra N, 1, J cmp J, 0 ble,pn %icc, .LL100 sll LDA, BASE_SHIFT, LDA .LL11: add A, LDA, A2 mov A, A1 sra M, 3, I cmp I, 0 ble,pn %icc, .LL15 add A2, LDA, A .LL12: prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 0 * SIZE], c01 LDF [A2 + 0 * SIZE], c02 LDF [A1 + 1 * SIZE], c03 LDF [A2 + 1 * SIZE], c04 LDF [A1 + 2 * SIZE], c05 LDF [A2 + 2 * SIZE], c06 LDF [A1 + 3 * SIZE], c07 LDF [A2 + 3 * SIZE], c08 prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 4 * SIZE], c09 LDF [A2 + 4 * SIZE], c10 LDF [A1 + 5 * SIZE], c11 LDF [A2 + 5 * SIZE], c12 LDF [A1 + 6 * SIZE], c13 LDF [A2 + 6 * SIZE], c14 LDF [A1 + 7 * SIZE], c15 LDF [A2 + 7 * SIZE], c16 add A1, 8 * SIZE, A1 add I, -1, I add A2, 8 * SIZE, A2 cmp I, 0 prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 STF c01, [B + 0 * SIZE] STF c02, [B + 1 * SIZE] STF c03, [B + 2 * SIZE] STF c04, [B + 3 * SIZE] STF c05, [B + 4 * SIZE] STF c06, [B + 5 * SIZE] STF c07, [B + 6 * SIZE] STF c08, [B + 7 * SIZE] prefetch [B + (WPREFETCHSIZE + 8) * SIZE], 2 STF c09, [B + 8 * SIZE] STF c10, [B + 9 * SIZE] STF c11, [B + 10 * SIZE] STF c12, [B + 11 * SIZE] STF c13, [B + 12 * SIZE] STF c14, [B + 13 * SIZE] STF c15, [B + 14 * SIZE] STF c16, [B + 15 * SIZE] bg,pt %icc, .LL12 add B, 16 * SIZE, B .LL15: and M, 7, I cmp I, 0 ble,pn %icc, .LL99 nop .LL16: LDF [A1 + 0 * SIZE], c01 add A1, 1 * SIZE, A1 LDF [A2 + 0 * SIZE], c02 add A2, 1 * SIZE, A2 STF c01, [B + 0 * SIZE] add I, -1, I STF c02, [B + 1 * SIZE] cmp I, 0 bg,pt %icc, .LL16 add B, 2 * SIZE, B .LL99: add J, -1, J cmp J, 0 bg,pt %icc, .LL11 nop .LL100: and N, 1, J cmp J, 0 ble,pn %icc, .LL999 nop .LL111: sra M, 2, I cmp I, 0 ble,pn %icc, .LL115 mov A, A1 .LL112: LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 LDF [A1 + 2 * SIZE], c03 LDF [A1 + 3 * SIZE], c04 STF c01, [B + 0 * SIZE] add I, -1, I STF c02, [B + 1 * SIZE] cmp I, 0 STF c03, [B + 2 * SIZE] add A1, 4 * SIZE, A1 STF c04, [B + 3 * SIZE] bg,pt %icc, .LL112 add B, 4 * SIZE, B .LL115: and M, 3, I cmp I, 0 ble,pn %icc, .LL999 nop .LL116: LDF [A1 + 0 * SIZE], c01 add A1, 1 * SIZE, A1 add I, -1, I cmp I, 0 STF c01, [B + 0 * SIZE] bg,pt %icc, .LL116 add B, 1 * SIZE, B .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/gemm_ncopy_8.S000066400000000000000000000473571313527062700202640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2005-2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCHSIZE 42 #define WPREFETCHSIZE 20 #define M %i0 #define N %i1 #define A %i2 #define LDA %i3 #define B %i4 #define A1 %l0 #define A2 %l1 #define A3 %l2 #define A4 %l3 #define A5 %o0 #define A6 %o1 #define A7 %o2 #define A8 %o3 #define I %l4 #define J %l5 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #endif PROLOGUE SAVESP sra N, 3, J cmp J, 0 ble,pn %icc, .LL20 sll LDA, BASE_SHIFT, LDA .LL11: add A, LDA, A2 mov A, A1 add A2, LDA, A3 sra M, 3, I add A3, LDA, A4 cmp I, 0 add A4, LDA, A5 add A5, LDA, A6 add A6, LDA, A7 add A7, LDA, A8 ble,pn %icc, .LL13 add A8, LDA, A .align 4 .LL12: prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 0 * SIZE], c01 LDF [A2 + 0 * SIZE], c02 LDF [A3 + 0 * SIZE], c03 LDF [A4 + 0 * SIZE], c04 LDF [A5 + 0 * SIZE], c05 LDF [A6 + 0 * SIZE], c06 LDF [A7 + 0 * SIZE], c07 LDF [A8 + 0 * SIZE], c08 prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 1 * SIZE], c09 LDF [A2 + 1 * SIZE], c10 LDF [A3 + 1 * SIZE], c11 LDF [A4 + 1 * SIZE], c12 LDF [A5 + 1 * SIZE], c13 LDF [A6 + 1 * SIZE], c14 LDF [A7 + 1 * SIZE], c15 LDF [A8 + 1 * SIZE], c16 prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 STF c01, [B + 0 * SIZE] STF c02, [B + 1 * SIZE] STF c03, [B + 2 * SIZE] STF c04, [B + 3 * SIZE] STF c05, [B + 4 * SIZE] STF c06, [B + 5 * SIZE] STF c07, [B + 6 * SIZE] STF c08, [B + 7 * SIZE] prefetch [B + (WPREFETCHSIZE + 8) * SIZE], 2 STF c09, [B + 8 * SIZE] STF c10, [B + 9 * SIZE] STF c11, [B + 10 * SIZE] STF 
c12, [B + 11 * SIZE] STF c13, [B + 12 * SIZE] STF c14, [B + 13 * SIZE] STF c15, [B + 14 * SIZE] STF c16, [B + 15 * SIZE] prefetch [A3 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 2 * SIZE], c01 LDF [A2 + 2 * SIZE], c02 LDF [A3 + 2 * SIZE], c03 LDF [A4 + 2 * SIZE], c04 LDF [A5 + 2 * SIZE], c05 LDF [A6 + 2 * SIZE], c06 LDF [A7 + 2 * SIZE], c07 LDF [A8 + 2 * SIZE], c08 prefetch [A4 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 3 * SIZE], c09 LDF [A2 + 3 * SIZE], c10 LDF [A3 + 3 * SIZE], c11 LDF [A4 + 3 * SIZE], c12 LDF [A5 + 3 * SIZE], c13 LDF [A6 + 3 * SIZE], c14 LDF [A7 + 3 * SIZE], c15 LDF [A8 + 3 * SIZE], c16 prefetch [B + (WPREFETCHSIZE + 16) * SIZE], 2 STF c01, [B + 16 * SIZE] STF c02, [B + 17 * SIZE] STF c03, [B + 18 * SIZE] STF c04, [B + 19 * SIZE] STF c05, [B + 20 * SIZE] STF c06, [B + 21 * SIZE] STF c07, [B + 22 * SIZE] STF c08, [B + 23 * SIZE] prefetch [B + (WPREFETCHSIZE + 24) * SIZE], 2 STF c09, [B + 24 * SIZE] STF c10, [B + 25 * SIZE] STF c11, [B + 26 * SIZE] STF c12, [B + 27 * SIZE] STF c13, [B + 28 * SIZE] STF c14, [B + 29 * SIZE] STF c15, [B + 30 * SIZE] STF c16, [B + 31 * SIZE] prefetch [A5 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 4 * SIZE], c01 LDF [A2 + 4 * SIZE], c02 LDF [A3 + 4 * SIZE], c03 LDF [A4 + 4 * SIZE], c04 LDF [A5 + 4 * SIZE], c05 LDF [A6 + 4 * SIZE], c06 LDF [A7 + 4 * SIZE], c07 LDF [A8 + 4 * SIZE], c08 prefetch [A6 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 5 * SIZE], c09 LDF [A2 + 5 * SIZE], c10 LDF [A3 + 5 * SIZE], c11 LDF [A4 + 5 * SIZE], c12 LDF [A5 + 5 * SIZE], c13 LDF [A6 + 5 * SIZE], c14 LDF [A7 + 5 * SIZE], c15 LDF [A8 + 5 * SIZE], c16 prefetch [B + (WPREFETCHSIZE + 32) * SIZE], 2 STF c01, [B + 32 * SIZE] STF c02, [B + 33 * SIZE] STF c03, [B + 34 * SIZE] STF c04, [B + 35 * SIZE] STF c05, [B + 36 * SIZE] STF c06, [B + 37 * SIZE] STF c07, [B + 38 * SIZE] STF c08, [B + 39 * SIZE] prefetch [B + (WPREFETCHSIZE + 40) * SIZE], 2 STF c09, [B + 40 * SIZE] STF c10, [B + 41 * SIZE] STF c11, [B + 42 * SIZE] STF c12, [B + 43 * SIZE] STF c13, [B + 44 * SIZE] STF c14, [B + 45 * SIZE] STF c15, [B + 46 * SIZE] STF c16, [B + 47 * SIZE] prefetch [A7 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 6 * SIZE], c01 LDF [A2 + 6 * SIZE], c02 LDF [A3 + 6 * SIZE], c03 LDF [A4 + 6 * SIZE], c04 LDF [A5 + 6 * SIZE], c05 LDF [A6 + 6 * SIZE], c06 LDF [A7 + 6 * SIZE], c07 LDF [A8 + 6 * SIZE], c08 prefetch [A8 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 7 * SIZE], c09 LDF [A2 + 7 * SIZE], c10 LDF [A3 + 7 * SIZE], c11 LDF [A4 + 7 * SIZE], c12 LDF [A5 + 7 * SIZE], c13 LDF [A6 + 7 * SIZE], c14 LDF [A7 + 7 * SIZE], c15 LDF [A8 + 7 * SIZE], c16 add A1, 8 * SIZE, A1 add A2, 8 * SIZE, A2 add A3, 8 * SIZE, A3 add A4, 8 * SIZE, A4 prefetch [B + (WPREFETCHSIZE + 48) * SIZE], 2 STF c01, [B + 48 * SIZE] STF c02, [B + 49 * SIZE] STF c03, [B + 50 * SIZE] STF c04, [B + 51 * SIZE] STF c05, [B + 52 * SIZE] STF c06, [B + 53 * SIZE] STF c07, [B + 54 * SIZE] STF c08, [B + 55 * SIZE] add A5, 8 * SIZE, A5 add A6, 8 * SIZE, A6 add A7, 8 * SIZE, A7 add A8, 8 * SIZE, A8 prefetch [B + (WPREFETCHSIZE + 56) * SIZE], 2 STF c09, [B + 56 * SIZE] STF c10, [B + 57 * SIZE] STF c11, [B + 58 * SIZE] STF c12, [B + 59 * SIZE] STF c13, [B + 60 * SIZE] STF c14, [B + 61 * SIZE] STF c15, [B + 62 * SIZE] STF c16, [B + 63 * SIZE] add I, -1, I cmp I, 0 bg,pt %icc, .LL12 add B, 64 * SIZE, B .align 4 .LL13: and M, 4, I cmp I, 0 ble,pn %icc, .LL14 nop LDF [A1 + 0 * SIZE], c01 LDF [A2 + 0 * SIZE], c02 LDF [A3 + 0 * SIZE], c03 LDF [A4 + 0 * SIZE], c04 LDF [A5 + 0 * SIZE], c05 LDF [A6 + 0 * SIZE], c06 LDF [A7 + 0 * SIZE], c07 LDF [A8 + 0 * SIZE], c08 
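/* Note on this copy kernel (illustrative commentary; the C sketch below uses
   ad hoc names that are not part of the original source): the main loop at
   .LL12 packs an 8-column slab of the column-major matrix A (columns A1..A8,
   a stride of LDA elements apart) into the contiguous buffer B, interleaving
   the eight columns row by row so the GEMM micro-kernel can stream B with
   unit stride.  Ignoring prefetching and unrolling, one 8-column slab
   (all m rows, remainder blocks included) behaves roughly like the
   following; double is used for concreteness, the SINGLE build uses float:

       // Pack one 8-column slab: b[i*8 + j] = a[i + j*lda]
       static void ncopy8_slab(long m, long lda, const double *a, double *b)
       {
           for (long i = 0; i < m; i++)        // row within the slab
               for (long j = 0; j < 8; j++)    // column within the slab
                   *b++ = a[i + j * lda];
       }

   The code around this point (.LL13, .LL14, .LL15) finishes the M & 4,
   M & 2 and M & 1 row remainders of the same slab before moving on to the
   narrower N-tail cases. */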
LDF [A1 + 1 * SIZE], c09 LDF [A2 + 1 * SIZE], c10 LDF [A3 + 1 * SIZE], c11 LDF [A4 + 1 * SIZE], c12 LDF [A5 + 1 * SIZE], c13 LDF [A6 + 1 * SIZE], c14 LDF [A7 + 1 * SIZE], c15 LDF [A8 + 1 * SIZE], c16 STF c01, [B + 0 * SIZE] STF c02, [B + 1 * SIZE] STF c03, [B + 2 * SIZE] STF c04, [B + 3 * SIZE] STF c05, [B + 4 * SIZE] STF c06, [B + 5 * SIZE] STF c07, [B + 6 * SIZE] STF c08, [B + 7 * SIZE] STF c09, [B + 8 * SIZE] STF c10, [B + 9 * SIZE] STF c11, [B + 10 * SIZE] STF c12, [B + 11 * SIZE] STF c13, [B + 12 * SIZE] STF c14, [B + 13 * SIZE] STF c15, [B + 14 * SIZE] STF c16, [B + 15 * SIZE] LDF [A1 + 2 * SIZE], c01 LDF [A2 + 2 * SIZE], c02 LDF [A3 + 2 * SIZE], c03 LDF [A4 + 2 * SIZE], c04 LDF [A5 + 2 * SIZE], c05 LDF [A6 + 2 * SIZE], c06 LDF [A7 + 2 * SIZE], c07 LDF [A8 + 2 * SIZE], c08 LDF [A1 + 3 * SIZE], c09 LDF [A2 + 3 * SIZE], c10 LDF [A3 + 3 * SIZE], c11 LDF [A4 + 3 * SIZE], c12 LDF [A5 + 3 * SIZE], c13 LDF [A6 + 3 * SIZE], c14 LDF [A7 + 3 * SIZE], c15 LDF [A8 + 3 * SIZE], c16 STF c01, [B + 16 * SIZE] STF c02, [B + 17 * SIZE] STF c03, [B + 18 * SIZE] STF c04, [B + 19 * SIZE] STF c05, [B + 20 * SIZE] STF c06, [B + 21 * SIZE] STF c07, [B + 22 * SIZE] STF c08, [B + 23 * SIZE] STF c09, [B + 24 * SIZE] STF c10, [B + 25 * SIZE] STF c11, [B + 26 * SIZE] STF c12, [B + 27 * SIZE] STF c13, [B + 28 * SIZE] STF c14, [B + 29 * SIZE] STF c15, [B + 30 * SIZE] STF c16, [B + 31 * SIZE] add A1, 4 * SIZE, A1 add A2, 4 * SIZE, A2 add A3, 4 * SIZE, A3 add A4, 4 * SIZE, A4 add A5, 4 * SIZE, A5 add A6, 4 * SIZE, A6 add A7, 4 * SIZE, A7 add A8, 4 * SIZE, A8 add B, 32 * SIZE, B .align 4 .LL14: and M, 2, I cmp I, 0 ble,pn %icc, .LL15 nop LDF [A1 + 0 * SIZE], c01 LDF [A2 + 0 * SIZE], c02 LDF [A3 + 0 * SIZE], c03 LDF [A4 + 0 * SIZE], c04 LDF [A5 + 0 * SIZE], c05 LDF [A6 + 0 * SIZE], c06 LDF [A7 + 0 * SIZE], c07 LDF [A8 + 0 * SIZE], c08 LDF [A1 + 1 * SIZE], c09 LDF [A2 + 1 * SIZE], c10 LDF [A3 + 1 * SIZE], c11 LDF [A4 + 1 * SIZE], c12 LDF [A5 + 1 * SIZE], c13 LDF [A6 + 1 * SIZE], c14 LDF [A7 + 1 * SIZE], c15 LDF [A8 + 1 * SIZE], c16 STF c01, [B + 0 * SIZE] STF c02, [B + 1 * SIZE] STF c03, [B + 2 * SIZE] STF c04, [B + 3 * SIZE] STF c05, [B + 4 * SIZE] STF c06, [B + 5 * SIZE] STF c07, [B + 6 * SIZE] STF c08, [B + 7 * SIZE] STF c09, [B + 8 * SIZE] STF c10, [B + 9 * SIZE] STF c11, [B + 10 * SIZE] STF c12, [B + 11 * SIZE] STF c13, [B + 12 * SIZE] STF c14, [B + 13 * SIZE] STF c15, [B + 14 * SIZE] STF c16, [B + 15 * SIZE] add A1, 2 * SIZE, A1 add A2, 2 * SIZE, A2 add A3, 2 * SIZE, A3 add A4, 2 * SIZE, A4 add A5, 2 * SIZE, A5 add A6, 2 * SIZE, A6 add A7, 2 * SIZE, A7 add A8, 2 * SIZE, A8 add B, 16 * SIZE, B .align 4 .LL15: and M, 1, I cmp I, 0 ble,pn %icc, .LL19 nop LDF [A1 + 0 * SIZE], c01 LDF [A2 + 0 * SIZE], c02 LDF [A3 + 0 * SIZE], c03 LDF [A4 + 0 * SIZE], c04 LDF [A5 + 0 * SIZE], c05 LDF [A6 + 0 * SIZE], c06 LDF [A7 + 0 * SIZE], c07 LDF [A8 + 0 * SIZE], c08 STF c01, [B + 0 * SIZE] STF c02, [B + 1 * SIZE] STF c03, [B + 2 * SIZE] STF c04, [B + 3 * SIZE] STF c05, [B + 4 * SIZE] STF c06, [B + 5 * SIZE] STF c07, [B + 6 * SIZE] STF c08, [B + 7 * SIZE] add B, 8 * SIZE, B .align 4 .LL19: add J, -1, J cmp J, 0 bg,pt %icc, .LL11 nop .align 4 .LL20: and N, 4, J cmp J, 0 ble,pn %icc, .LL30 nop add A, LDA, A2 mov A, A1 add A2, LDA, A3 sra M, 3, I add A3, LDA, A4 cmp I, 0 ble,pn %icc, .LL23 add A4, LDA, A .align 4 .LL22: prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 0 * SIZE], c01 LDF [A2 + 0 * SIZE], c02 LDF [A3 + 0 * SIZE], c03 LDF [A4 + 0 * SIZE], c04 LDF [A1 + 1 * SIZE], c05 LDF [A2 + 1 * SIZE], c06 LDF [A3 + 1 * SIZE], c07 
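/* .LL20 through .LL25 apply the same packing scheme to a 4-column slab when
   N has the 4 bit set: each row contributes four consecutive values to B,
   roughly b[i*4 + j] = a[i + j*lda] (illustrative notation only). */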
LDF [A4 + 1 * SIZE], c08 prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 2 * SIZE], c09 LDF [A2 + 2 * SIZE], c10 LDF [A3 + 2 * SIZE], c11 LDF [A4 + 2 * SIZE], c12 LDF [A1 + 3 * SIZE], c13 LDF [A2 + 3 * SIZE], c14 LDF [A3 + 3 * SIZE], c15 LDF [A4 + 3 * SIZE], c16 prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 STF c01, [B + 0 * SIZE] STF c02, [B + 1 * SIZE] STF c03, [B + 2 * SIZE] STF c04, [B + 3 * SIZE] STF c05, [B + 4 * SIZE] STF c06, [B + 5 * SIZE] STF c07, [B + 6 * SIZE] STF c08, [B + 7 * SIZE] prefetch [B + (WPREFETCHSIZE + 8) * SIZE], 2 STF c09, [B + 8 * SIZE] STF c10, [B + 9 * SIZE] STF c11, [B + 10 * SIZE] STF c12, [B + 11 * SIZE] STF c13, [B + 12 * SIZE] STF c14, [B + 13 * SIZE] STF c15, [B + 14 * SIZE] STF c16, [B + 15 * SIZE] prefetch [A3 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 4 * SIZE], c01 LDF [A2 + 4 * SIZE], c02 LDF [A3 + 4 * SIZE], c03 LDF [A4 + 4 * SIZE], c04 LDF [A1 + 5 * SIZE], c05 LDF [A2 + 5 * SIZE], c06 LDF [A3 + 5 * SIZE], c07 LDF [A4 + 5 * SIZE], c08 prefetch [A4 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 6 * SIZE], c09 LDF [A2 + 6 * SIZE], c10 LDF [A3 + 6 * SIZE], c11 LDF [A4 + 6 * SIZE], c12 LDF [A1 + 7 * SIZE], c13 LDF [A2 + 7 * SIZE], c14 LDF [A3 + 7 * SIZE], c15 LDF [A4 + 7 * SIZE], c16 prefetch [B + (WPREFETCHSIZE + 16) * SIZE], 2 STF c01, [B + 16 * SIZE] STF c02, [B + 17 * SIZE] STF c03, [B + 18 * SIZE] STF c04, [B + 19 * SIZE] STF c05, [B + 20 * SIZE] STF c06, [B + 21 * SIZE] STF c07, [B + 22 * SIZE] STF c08, [B + 23 * SIZE] prefetch [B + (WPREFETCHSIZE + 24) * SIZE], 2 STF c09, [B + 24 * SIZE] STF c10, [B + 25 * SIZE] STF c11, [B + 26 * SIZE] STF c12, [B + 27 * SIZE] STF c13, [B + 28 * SIZE] STF c14, [B + 29 * SIZE] STF c15, [B + 30 * SIZE] STF c16, [B + 31 * SIZE] add A1, 8 * SIZE, A1 add A2, 8 * SIZE, A2 add A3, 8 * SIZE, A3 add A4, 8 * SIZE, A4 add I, -1, I cmp I, 0 bg,pt %icc, .LL22 add B, 32 * SIZE, B .align 4 .LL23: and M, 4, I cmp I, 0 ble,pn %icc, .LL24 nop LDF [A1 + 0 * SIZE], c01 LDF [A2 + 0 * SIZE], c02 LDF [A3 + 0 * SIZE], c03 LDF [A4 + 0 * SIZE], c04 LDF [A1 + 1 * SIZE], c05 LDF [A2 + 1 * SIZE], c06 LDF [A3 + 1 * SIZE], c07 LDF [A4 + 1 * SIZE], c08 LDF [A1 + 2 * SIZE], c09 LDF [A2 + 2 * SIZE], c10 LDF [A3 + 2 * SIZE], c11 LDF [A4 + 2 * SIZE], c12 LDF [A1 + 3 * SIZE], c13 LDF [A2 + 3 * SIZE], c14 LDF [A3 + 3 * SIZE], c15 LDF [A4 + 3 * SIZE], c16 STF c01, [B + 0 * SIZE] STF c02, [B + 1 * SIZE] STF c03, [B + 2 * SIZE] STF c04, [B + 3 * SIZE] STF c05, [B + 4 * SIZE] STF c06, [B + 5 * SIZE] STF c07, [B + 6 * SIZE] STF c08, [B + 7 * SIZE] STF c09, [B + 8 * SIZE] STF c10, [B + 9 * SIZE] STF c11, [B + 10 * SIZE] STF c12, [B + 11 * SIZE] STF c13, [B + 12 * SIZE] STF c14, [B + 13 * SIZE] STF c15, [B + 14 * SIZE] STF c16, [B + 15 * SIZE] add A1, 4 * SIZE, A1 add A2, 4 * SIZE, A2 add A3, 4 * SIZE, A3 add A4, 4 * SIZE, A4 add B, 16 * SIZE, B .align 4 .LL24: and M, 2, I cmp I, 0 ble,pn %icc, .LL25 nop LDF [A1 + 0 * SIZE], c01 LDF [A2 + 0 * SIZE], c02 LDF [A3 + 0 * SIZE], c03 LDF [A4 + 0 * SIZE], c04 LDF [A1 + 1 * SIZE], c05 LDF [A2 + 1 * SIZE], c06 LDF [A3 + 1 * SIZE], c07 LDF [A4 + 1 * SIZE], c08 STF c01, [B + 0 * SIZE] STF c02, [B + 1 * SIZE] STF c03, [B + 2 * SIZE] STF c04, [B + 3 * SIZE] STF c05, [B + 4 * SIZE] STF c06, [B + 5 * SIZE] STF c07, [B + 6 * SIZE] STF c08, [B + 7 * SIZE] add A1, 2 * SIZE, A1 add A2, 2 * SIZE, A2 add A3, 2 * SIZE, A3 add A4, 2 * SIZE, A4 add B, 8 * SIZE, B .align 4 .LL25: and M, 1, I cmp I, 0 ble,pn %icc, .LL30 nop LDF [A1 + 0 * SIZE], c01 LDF [A2 + 0 * SIZE], c02 LDF [A3 + 0 * SIZE], c03 LDF [A4 + 0 * SIZE], c04 STF c01, 
[B + 0 * SIZE] STF c02, [B + 1 * SIZE] STF c03, [B + 2 * SIZE] STF c04, [B + 3 * SIZE] add B, 4 * SIZE, B .align 4 .LL30: and N, 2, J cmp J, 0 ble,pn %icc, .LL40 nop add A, LDA, A2 mov A, A1 sra M, 3, I cmp I, 0 ble,pn %icc, .LL33 add A2, LDA, A .align 4 .LL32: prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 0 * SIZE], c01 LDF [A2 + 0 * SIZE], c02 LDF [A1 + 1 * SIZE], c03 LDF [A2 + 1 * SIZE], c04 LDF [A1 + 2 * SIZE], c05 LDF [A2 + 2 * SIZE], c06 LDF [A1 + 3 * SIZE], c07 LDF [A2 + 3 * SIZE], c08 prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 4 * SIZE], c09 LDF [A2 + 4 * SIZE], c10 LDF [A1 + 5 * SIZE], c11 LDF [A2 + 5 * SIZE], c12 LDF [A1 + 6 * SIZE], c13 LDF [A2 + 6 * SIZE], c14 LDF [A1 + 7 * SIZE], c15 LDF [A2 + 7 * SIZE], c16 prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 STF c01, [B + 0 * SIZE] STF c02, [B + 1 * SIZE] STF c03, [B + 2 * SIZE] STF c04, [B + 3 * SIZE] STF c05, [B + 4 * SIZE] STF c06, [B + 5 * SIZE] STF c07, [B + 6 * SIZE] STF c08, [B + 7 * SIZE] prefetch [B + (WPREFETCHSIZE + 8) * SIZE], 2 STF c09, [B + 8 * SIZE] STF c10, [B + 9 * SIZE] STF c11, [B + 10 * SIZE] STF c12, [B + 11 * SIZE] STF c13, [B + 12 * SIZE] STF c14, [B + 13 * SIZE] STF c15, [B + 14 * SIZE] STF c16, [B + 15 * SIZE] add A1, 8 * SIZE, A1 add A2, 8 * SIZE, A2 add I, -1, I cmp I, 0 bg,pt %icc, .LL32 add B, 16 * SIZE, B .align 4 .LL33: and M, 4, I cmp I, 0 ble,pn %icc, .LL34 nop LDF [A1 + 0 * SIZE], c01 LDF [A2 + 0 * SIZE], c02 LDF [A1 + 1 * SIZE], c03 LDF [A2 + 1 * SIZE], c04 LDF [A1 + 2 * SIZE], c05 LDF [A2 + 2 * SIZE], c06 LDF [A1 + 3 * SIZE], c07 LDF [A2 + 3 * SIZE], c08 STF c01, [B + 0 * SIZE] STF c02, [B + 1 * SIZE] STF c03, [B + 2 * SIZE] STF c04, [B + 3 * SIZE] STF c05, [B + 4 * SIZE] STF c06, [B + 5 * SIZE] STF c07, [B + 6 * SIZE] STF c08, [B + 7 * SIZE] add A1, 4 * SIZE, A1 add A2, 4 * SIZE, A2 add B, 8 * SIZE, B .align 4 .LL34: and M, 2, I cmp I, 0 ble,pn %icc, .LL35 nop LDF [A1 + 0 * SIZE], c01 LDF [A2 + 0 * SIZE], c02 LDF [A1 + 1 * SIZE], c03 LDF [A2 + 1 * SIZE], c04 STF c01, [B + 0 * SIZE] STF c02, [B + 1 * SIZE] STF c03, [B + 2 * SIZE] STF c04, [B + 3 * SIZE] add A1, 2 * SIZE, A1 add A2, 2 * SIZE, A2 add B, 4 * SIZE, B .align 4 .LL35: and M, 1, I cmp I, 0 ble,pn %icc, .LL40 nop LDF [A1 + 0 * SIZE], c01 LDF [A2 + 0 * SIZE], c02 STF c01, [B + 0 * SIZE] STF c02, [B + 1 * SIZE] add B, 2 * SIZE, B .align 4 .LL40: and N, 1, J cmp J, 0 ble,pn %icc, .LL999 nop sra M, 3, I cmp I, 0 ble,pn %icc, .LL43 mov A, A1 .align 4 .LL42: prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 LDF [A1 + 2 * SIZE], c03 LDF [A1 + 3 * SIZE], c04 LDF [A1 + 4 * SIZE], c05 LDF [A1 + 5 * SIZE], c06 LDF [A1 + 6 * SIZE], c07 LDF [A1 + 7 * SIZE], c08 prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 STF c01, [B + 0 * SIZE] STF c02, [B + 1 * SIZE] STF c03, [B + 2 * SIZE] STF c04, [B + 3 * SIZE] STF c05, [B + 4 * SIZE] STF c06, [B + 5 * SIZE] STF c07, [B + 6 * SIZE] STF c08, [B + 7 * SIZE] add A1, 8 * SIZE, A1 add I, -1, I cmp I, 0 bg,pt %icc, .LL42 add B, 8 * SIZE, B .align 4 .LL43: and M, 4, I cmp I, 0 ble,pn %icc, .LL44 nop LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 LDF [A1 + 2 * SIZE], c03 LDF [A1 + 3 * SIZE], c04 STF c01, [B + 0 * SIZE] STF c02, [B + 1 * SIZE] STF c03, [B + 2 * SIZE] STF c04, [B + 3 * SIZE] add A1, 4 * SIZE, A1 add B, 4 * SIZE, B .align 4 .LL44: and M, 2, I cmp I, 0 ble,pn %icc, .LL45 nop LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 STF c01, [B + 0 * SIZE] STF c02, [B + 1 * SIZE] add A1, 2 * SIZE, A1 add B, 2 * SIZE, B .align 4 .LL45: and M, 1, I 
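/* .LL40 onward handles a final odd column when N is odd: the column is
   copied straight through to B, first eight elements per loop iteration
   (.LL42), then the four-, two- and one-element row tails; in effect,
   for (long i = 0; i < m; i++) b[i] = a[i]; (illustrative notation only). */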
cmp I, 0 ble,pn %icc, .LL999 nop LDF [A1 + 0 * SIZE], c01 STF c01, [B + 0 * SIZE] .align 4 .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/gemm_tcopy.S000066400000000000000000000201311313527062700200200ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %i0 #define N %i1 #define A %i2 #define LDA %i3 #define B %i4 #define A1 %l0 #define A2 %l1 #define A3 %l2 #define A4 %l3 #define I %l4 #define J %l5 #define B1 %o0 #define B2 %o1 #define B3 %o3 #define M4 %o4 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #endif PROLOGUE SAVESP sll M, BASE_SHIFT + 2, M4 and N, -4, B2 and N, -2, B3 sll M, BASE_SHIFT, B1 smul B1, B2, B2 smul B1, B3, B3 add B, B2, B2 add B, B3, B3 sra M, 2, J cmp J, 0 ble,pn %icc, .LL100 sll LDA, BASE_SHIFT, LDA .LL11: add A, LDA, A2 mov A, A1 add A2, LDA, A3 sra N, 2, I add A3, LDA, A4 cmp I, 0 mov B, B1 add B, 16 * SIZE, B ble,pn %icc, .LL15 add A4, LDA, A #define PREFETCHSIZE 8 .LL12: prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 LDF [A1 + 2 * SIZE], c03 LDF [A1 + 3 * SIZE], c04 prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A2 + 0 * SIZE], c05 LDF [A2 + 1 * SIZE], c06 LDF [A2 + 2 * SIZE], c07 LDF [A2 + 3 * SIZE], c08 prefetch [A3 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A3 + 0 * SIZE], c09 LDF [A3 + 1 * SIZE], c10 LDF [A3 + 2 * SIZE], c11 LDF [A3 + 3 * SIZE], c12 prefetch [A4 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A4 + 0 * SIZE], c13 LDF [A4 + 1 * SIZE], c14 LDF [A4 + 2 * SIZE], c15 LDF [A4 + 3 * SIZE], c16 prefetch [B1 + (PREFETCHSIZE + 0) * SIZE], 0 STF c01, [B1 + 0 * SIZE] add A1, 4 * SIZE, A1 STF c02, [B1 + 1 * SIZE] add A2, 4 * SIZE, A2 STF c03, [B1 + 2 * SIZE] add A3, 4 * SIZE, A3 STF c04, [B1 + 3 * SIZE] add A4, 4 * SIZE, A4 STF c05, [B1 + 4 * SIZE] add I, -1, I STF c06, [B1 + 5 * SIZE] cmp I, 0 STF c07, [B1 + 6 * SIZE] STF c08, [B1 + 7 * SIZE] #ifdef DOUBLE prefetch [B1 + (PREFETCHSIZE + 8) * SIZE], 0 #endif STF c09, [B1 + 8 * SIZE] STF c10, [B1 + 9 * SIZE] STF c11, [B1 + 10 * SIZE] STF c12, [B1 + 11 * SIZE] STF c13, [B1 + 12 * SIZE] STF c14, [B1 + 13 * SIZE] STF c15, [B1 + 14 * SIZE] STF c16, [B1 + 15 * SIZE] bg,pt %icc, .LL12 add B1, M4, B1 .LL15: and N, 2, I cmp I, 0 ble,pn %icc, .LL17 nop LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 LDF [A2 + 0 * SIZE], c03 LDF [A2 + 1 * SIZE], c04 LDF [A3 + 0 * SIZE], c05 LDF [A3 + 1 * SIZE], c06 LDF [A4 + 0 * SIZE], c07 LDF [A4 + 1 * SIZE], c08 STF c01, [B2 + 0 * SIZE] add A1, 2 * SIZE, A1 STF c02, [B2 + 1 * SIZE] add A2, 2 * SIZE, A2 STF c03, [B2 + 2 * SIZE] add A3, 2 * SIZE, A3 STF c04, [B2 + 3 * SIZE] add A4, 2 * SIZE, A4 STF c05, [B2 + 4 * SIZE] STF c06, [B2 + 5 * SIZE] STF c07, [B2 + 6 * SIZE] STF c08, [B2 + 7 * SIZE] add B2, 8 * SIZE, B2 .LL17: and N, 1, I cmp I, 0 ble,pn %icc, .LL99 nop LDF [A1 + 0 * SIZE], c01 LDF [A2 + 0 * SIZE], c02 LDF [A3 + 0 * SIZE], c03 LDF [A4 + 0 * SIZE], c04 STF c01, [B3 + 0 * SIZE] STF c02, [B3 + 1 * SIZE] STF c03, [B3 + 2 * SIZE] STF c04, [B3 + 3 * SIZE] add B3, 4 * SIZE, B3 .LL99: add J, -1, J cmp J, 0 bg,pt %icc, .LL11 nop .LL100: and M, 2, J cmp J, 0 ble,pn %icc, .LL200 nop .LL111: sra N, 2, I add A, LDA, A2 cmp I, 0 mov A, A1 mov B, B1 add B, 8 * SIZE, B ble,pn %icc, 
.LL115 add A2, LDA, A .LL112: LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 LDF [A1 + 2 * SIZE], c03 LDF [A1 + 3 * SIZE], c04 LDF [A2 + 0 * SIZE], c05 LDF [A2 + 1 * SIZE], c06 LDF [A2 + 2 * SIZE], c07 LDF [A2 + 3 * SIZE], c08 STF c01, [B1 + 0 * SIZE] add A1, 4 * SIZE, A1 STF c02, [B1 + 1 * SIZE] add A2, 4 * SIZE, A2 STF c03, [B1 + 2 * SIZE] add I, -1, I STF c04, [B1 + 3 * SIZE] cmp I, 0 STF c05, [B1 + 4 * SIZE] STF c06, [B1 + 5 * SIZE] STF c07, [B1 + 6 * SIZE] STF c08, [B1 + 7 * SIZE] bg,pt %icc, .LL112 add B1, M4, B1 .LL115: and N, 2, I cmp I, 0 ble,pn %icc, .LL117 nop LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 LDF [A2 + 0 * SIZE], c03 LDF [A2 + 1 * SIZE], c04 STF c01, [B2 + 0 * SIZE] add A1, 2 * SIZE, A1 STF c02, [B2 + 1 * SIZE] add A2, 2 * SIZE, A2 STF c03, [B2 + 2 * SIZE] add I, -1, I STF c04, [B2 + 3 * SIZE] cmp I, 0 add B2, 4 * SIZE, B2 .LL117: and N, 1, I cmp I, 0 ble,pn %icc, .LL200 nop LDF [A1 + 0 * SIZE], c01 LDF [A2 + 0 * SIZE], c02 STF c01, [B3 + 0 * SIZE] STF c02, [B3 + 1 * SIZE] add B3, 2 * SIZE, B3 .LL200: and M, 1, J cmp J, 0 ble,pn %icc, .LL999 nop .LL211: sra N, 2, I cmp I, 0 mov B, B1 ble,pn %icc, .LL215 mov A, A1 .LL212: LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 LDF [A1 + 2 * SIZE], c03 LDF [A1 + 3 * SIZE], c04 STF c01, [B + 0 * SIZE] add I, -1, I STF c02, [B + 1 * SIZE] cmp I, 0 STF c03, [B + 2 * SIZE] add A1, 4 * SIZE, A1 STF c04, [B + 3 * SIZE] bg,pt %icc, .LL212 add B, M4, B .LL215: and N, 2, I cmp I, 0 ble,pn %icc, .LL217 nop LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 STF c01, [B2 + 0 * SIZE] STF c02, [B2 + 1 * SIZE] add A1, 2 * SIZE, A1 .LL217: and N, 1, I cmp I, 0 ble,pn %icc, .LL999 nop LDF [A1 + 0 * SIZE], c01 STF c01, [B3 + 0 * SIZE] .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/gemm_tcopy_2.S000066400000000000000000000152711313527062700202520ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2005-2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCHSIZE 72 #define WPREFETCHSIZE 16 #define M %i0 #define N %i1 #define A %i2 #define LDA %i3 #define B %i4 #define A1 %l0 #define A2 %l1 #define A3 %l2 #define A4 %l3 #define I %l4 #define J %l5 #define B1 %o0 #define B2 %o1 #define B3 %o3 #define M2 %o4 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #endif PROLOGUE SAVESP sll M, BASE_SHIFT + 1, M2 and N, -2, B2 sll M, BASE_SHIFT, B1 smul B1, B2, B2 add B, B2, B2 sra M, 1, J cmp J, 0 ble,pn %icc, .LL100 sll LDA, BASE_SHIFT, LDA .LL11: add A, LDA, A2 mov A, A1 sra N, 3, I cmp I, 0 mov B, B1 add B, 4 * SIZE, B ble,pn %icc, .LL13 add A2, LDA, A .align 4 .LL12: prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 LDF [A2 + 0 * SIZE], c09 LDF [A2 + 1 * SIZE], c10 LDF [A1 + 2 * SIZE], c03 LDF [A1 + 3 * SIZE], c04 LDF [A2 + 2 * SIZE], c11 LDF [A2 + 3 * SIZE], c12 prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 4 * SIZE], c05 LDF [A1 + 5 * SIZE], c06 LDF [A2 + 4 * SIZE], c13 LDF [A2 + 5 * SIZE], c14 LDF [A1 + 6 * SIZE], c07 LDF [A1 + 7 * SIZE], c08 LDF [A2 + 6 * SIZE], c15 LDF [A2 + 7 * SIZE], c16 add A1, 8 * SIZE, A1 add A2, 8 * SIZE, A2 add I, -1, I cmp I, 0 prefetch [B1 + (WPREFETCHSIZE + 0) * SIZE], 0 STF c01, [B1 + 0 * SIZE] STF c02, [B1 + 1 * SIZE] STF c09, [B1 + 2 * SIZE] STF c10, [B1 + 3 * SIZE] add B1, M2, B1 prefetch [B1 + (WPREFETCHSIZE + 0) * SIZE], 0 STF c03, [B1 + 0 * SIZE] STF c04, [B1 + 1 * SIZE] STF c11, [B1 + 2 * SIZE] STF c12, [B1 + 3 * SIZE] add B1, M2, B1 prefetch [B1 + (WPREFETCHSIZE + 0) * SIZE], 0 STF c05, [B1 + 0 * SIZE] STF c06, [B1 + 1 * SIZE] STF c13, [B1 + 2 * SIZE] STF c14, [B1 + 3 * SIZE] add B1, M2, B1 prefetch [B1 + (WPREFETCHSIZE + 0) * SIZE], 0 STF c07, [B1 + 0 * SIZE] STF c08, [B1 + 1 * SIZE] STF c15, [B1 + 2 * SIZE] STF c16, [B1 + 3 * SIZE] bg,pt %icc, .LL12 add B1, M2, B1 .LL13: and N, 4, I cmp I, 0 ble,pn %icc, .LL14 nop LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 LDF [A2 + 0 * SIZE], c03 LDF [A2 + 1 * SIZE], c04 LDF [A1 + 2 * SIZE], c05 LDF [A1 + 3 * SIZE], c06 LDF [A2 + 2 * SIZE], c07 LDF [A2 + 3 * SIZE], c08 add A1, 4 * SIZE, A1 add A2, 4 * SIZE, A2 STF c01, [B1 + 0 * SIZE] STF c02, [B1 + 1 * SIZE] STF c03, [B1 + 2 * SIZE] STF c04, [B1 + 3 * SIZE] add B1, M2, B1 STF c05, [B1 + 0 * SIZE] STF c06, [B1 + 1 * SIZE] STF c07, [B1 + 2 * SIZE] STF c08, [B1 + 3 * SIZE] add B1, M2, B1 .align 4 .LL14: and N, 2, I cmp I, 0 ble,pn %icc, .LL15 nop LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 LDF [A2 + 0 * SIZE], c03 LDF [A2 + 1 * SIZE], c04 add A1, 2 * SIZE, A1 add A2, 2 * SIZE, A2 STF c01, [B1 + 0 * SIZE] STF c02, [B1 + 1 * SIZE] STF c03, [B1 + 2 * SIZE] STF c04, [B1 + 3 * SIZE] add B1, M2, 
B1 .align 4 .LL15: and N, 1, I cmp I, 0 ble,pn %icc, .LL99 nop LDF [A1 + 0 * SIZE], c01 LDF [A2 + 0 * SIZE], c02 STF c01, [B2 + 0 * SIZE] STF c02, [B2 + 1 * SIZE] add B2, 2 * SIZE, B2 .LL99: add J, -1, J cmp J, 0 bg,pt %icc, .LL11 nop .LL100: and M, 1, J cmp J, 0 ble,pn %icc, .LL999 nop .LL211: sra N, 1, I cmp I, 0 mov B, B1 ble,pn %icc, .LL215 mov A, A1 .LL212: LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 add A1, 2 * SIZE, A1 add I, -1, I cmp I, 0 STF c01, [B + 0 * SIZE] STF c02, [B + 1 * SIZE] bg,pt %icc, .LL212 add B, M2, B .LL215: and N, 1, I cmp I, 0 ble,pn %icc, .LL999 nop LDF [A1 + 0 * SIZE], c01 STF c01, [B2 + 0 * SIZE] .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/gemv_n.S000066400000000000000000000603041313527062700171360ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %i0 #define N %i1 #if defined(DOUBLE) && !defined(__64BIT__) #define A %i5 #define LDA %i2 #define X %i3 #define INCX %i4 #else #define A %i4 #define LDA %i5 #define X %i2 #define INCX %i3 #endif #define Y %l0 #define INCY %l1 #define BUFFER %l2 #define I %l3 #define J %l5 #define A1 %o0 #define A2 %o1 #define A3 %o2 #define A4 %o3 #define Y1 %l4 #define YY %l6 #ifdef DOUBLE #define t1 %f0 #define t2 %f2 #define t3 %f4 #define t4 %f6 #define y1 %f8 #define y2 %f10 #define y3 %f12 #define y4 %f14 #define y5 %f16 #define y6 %f18 #define y7 %f20 #define y8 %f22 #define a1 %f24 #define a2 %f26 #define a3 %f28 #define a4 %f30 #define a5 %f32 #define a6 %f34 #define a7 %f36 #define a8 %f38 #define a9 %f40 #define a10 %f42 #define a11 %f44 #define a12 %f46 #define a13 %f48 #define a14 %f50 #define a15 %f52 #define a16 %f54 #define x1 %f56 #define x2 %f58 #define x3 %f60 #define x4 %f62 #define FZERO %f52 #define ALPHA %f54 #else #define t1 %f0 #define t2 %f1 #define t3 %f2 #define t4 %f3 #define y1 %f4 #define y2 %f5 #define y3 %f6 #define y4 %f7 #define y5 %f8 #define y6 %f9 #define y7 %f10 #define y8 %f11 #define a1 %f12 #define a2 %f13 #define a3 %f14 #define a4 %f15 #define a5 %f16 #define a6 %f17 #define a7 %f18 #define a8 %f19 #define a9 %f20 #define a10 %f21 #define a11 %f22 #define a12 %f23 #define a13 %f24 #define a14 %f25 #define a15 %f26 #define a16 %f27 #define x1 %f28 #define x2 %f29 #define x3 %f30 #define x4 %f31 #define FZERO %f26 #define ALPHA %f27 #endif #ifndef __64BIT__ #define STACK_FZERO [%sp + STACK_START + 8] #define STACK_ALPHA [%sp + STACK_START + 16] #else #define STACK_FZERO [%sp + STACK_START + 32] #define STACK_ALPHA [%sp + STACK_START + 40] #endif PROLOGUE SAVESP #ifndef __64BIT__ #ifdef DOUBLE st %i3, [%sp + STACK_START + 16] /* ALPHA */ st %i4, [%sp + STACK_START + 20] ld [%sp + STACK_START + 28], LDA ld [%sp + STACK_START + 32], X ld [%sp + STACK_START + 36], INCX ld [%sp + STACK_START + 40], Y ld [%sp + STACK_START + 44], INCY ld [%sp + STACK_START + 48], BUFFER #else st %i3, [%sp + STACK_START + 16] /* ALPHA */ ld [%sp + STACK_START + 28], X ld [%sp + STACK_START + 32], INCX ld [%sp + STACK_START + 36], Y ld [%sp + STACK_START + 40], INCY ld [%sp + STACK_START + 44], BUFFER #endif LDF [%sp + STACK_START + 16], ALPHA #else ldx [%sp + STACK_START + 56], X ldx [%sp + STACK_START + 64], INCX ldx [%sp + STACK_START + 72], Y ldx [%sp + STACK_START + 80], INCY ldx [%sp + STACK_START + 88], BUFFER #ifdef DOUBLE FMOV %f6, ALPHA STF %f6, STACK_ALPHA #else FMOV %f7, ALPHA STF %f7, STACK_ALPHA #endif #endif sll LDA, BASE_SHIFT, LDA cmp M, 0 ble %icc, .LL999 sll INCX, BASE_SHIFT, INCX cmp N, 0 ble %icc, .LL999 sll INCY, BASE_SHIFT, INCY #ifdef DOUBLE FCLR(21) #else FCLR(26) #endif cmp INCY, SIZE be %icc, .LL10 mov Y, YY add M, 7, J sra J, 3, J mov BUFFER, YY mov BUFFER, Y1 .LL01: STF FZERO, [Y1 + 0 * SIZE] STF FZERO, [Y1 + 1 * SIZE] STF FZERO, [Y1 + 2 * SIZE] STF FZERO, [Y1 + 3 * SIZE] STF FZERO, [Y1 + 4 * SIZE] STF FZERO, [Y1 + 5 * SIZE] STF FZERO, [Y1 + 6 * SIZE] deccc J STF FZERO, [Y1 + 7 * SIZE] bg,pn %icc, .LL01 add Y1, 8 * SIZE, Y1 .LL10: sra N, 2, J cmp J, 0 ble,pn %icc, .LL20 nop .LL11: mov YY, Y1 mov A, A1 add A, LDA, A2 add A2, LDA, A3 add A3, LDA, A4 add A4, LDA, A LDF STACK_ALPHA, ALPHA LDF [X], x1 add X, INCX, X LDF [X], x2 add X, INCX, X LDF [X], x3 add X, INCX, X LDF [X], x4 add X, INCX, X FMUL ALPHA, x1, x1 FMUL ALPHA, x2, x2 FMUL ALPHA, 
x3, x3 FMUL ALPHA, x4, x4 sra M, 3, I cmp I, 0 ble,pn %icc, .LL16 nop LDF [A1 + 0 * SIZE], a1 LDF [A1 + 1 * SIZE], a2 LDF [A1 + 2 * SIZE], a3 LDF [A1 + 3 * SIZE], a4 LDF [A1 + 4 * SIZE], a5 LDF [A1 + 5 * SIZE], a6 LDF [A1 + 6 * SIZE], a7 LDF [A1 + 7 * SIZE], a8 LDF [A2 + 0 * SIZE], a9 LDF [A2 + 1 * SIZE], a10 LDF [A2 + 2 * SIZE], a11 LDF [A2 + 3 * SIZE], a12 LDF [A2 + 4 * SIZE], a13 LDF [A2 + 5 * SIZE], a14 LDF [A2 + 6 * SIZE], a15 LDF [A2 + 7 * SIZE], a16 FMUL a1, x1, t1 LDF [A3 + 0 * SIZE], a1 FMUL a2, x1, t2 LDF [A3 + 1 * SIZE], a2 FMUL a3, x1, t3 LDF [A3 + 2 * SIZE], a3 FMUL a4, x1, t4 LDF [A3 + 3 * SIZE], a4 deccc I ble,pn %icc, .LL13 nop nop nop nop #ifdef DOUBLE #define PREFETCHSIZE 20 #else #define PREFETCHSIZE 40 #endif .LL12: LDF [Y1 + 0 * SIZE], y1 LDF [Y1 + 1 * SIZE], y2 LDF [Y1 + 2 * SIZE], y3 LDF [Y1 + 3 * SIZE], y4 LDF [Y1 + 4 * SIZE], y5 LDF [Y1 + 5 * SIZE], y6 LDF [Y1 + 6 * SIZE], y7 LDF [Y1 + 7 * SIZE], y8 FADD y1, t1, y1 prefetch [A1 + PREFETCHSIZE * SIZE], 1 FMUL a5, x1, t1 LDF [A3 + 4 * SIZE], a5 FADD y2, t2, y2 nop FMUL a6, x1, t2 LDF [A3 + 5 * SIZE], a6 FADD y3, t3, y3 nop FMUL a7, x1, t3 LDF [A3 + 6 * SIZE], a7 FADD y4, t4, y4 nop FMUL a8, x1, t4 LDF [A3 + 7 * SIZE], a8 FADD y5, t1, y5 nop FMUL a9, x2, t1 LDF [A4 + 0 * SIZE], a9 FADD y6, t2, y6 nop FMUL a10, x2, t2 LDF [A4 + 1 * SIZE], a10 FADD y7, t3, y7 nop FMUL a11, x2, t3 LDF [A4 + 2 * SIZE], a11 FADD y8, t4, y8 nop FMUL a12, x2, t4 LDF [A4 + 3 * SIZE], a12 FADD y1, t1, y1 prefetch [A2 + PREFETCHSIZE * SIZE], 1 FMUL a13, x2, t1 LDF [A4 + 4 * SIZE], a13 FADD y2, t2, y2 nop FMUL a14, x2, t2 LDF [A4 + 5 * SIZE], a14 FADD y3, t3, y3 nop FMUL a15, x2, t3 LDF [A4 + 6 * SIZE], a15 FADD y4, t4, y4 nop FMUL a16, x2, t4 LDF [A4 + 7 * SIZE], a16 FADD y5, t1, y5 nop FMUL a1, x3, t1 LDF [A1 + 8 * SIZE], a1 FADD y6, t2, y6 nop FMUL a2, x3, t2 LDF [A1 + 9 * SIZE], a2 FADD y7, t3, y7 nop FMUL a3, x3, t3 LDF [A1 + 10 * SIZE], a3 FADD y8, t4, y8 nop FMUL a4, x3, t4 LDF [A1 + 11 * SIZE], a4 FADD y1, t1, y1 prefetch [A3 + PREFETCHSIZE * SIZE], 1 FMUL a5, x3, t1 LDF [A1 + 12 * SIZE], a5 FADD y2, t2, y2 nop FMUL a6, x3, t2 LDF [A1 + 13 * SIZE], a6 FADD y3, t3, y3 nop FMUL a7, x3, t3 LDF [A1 + 14 * SIZE], a7 FADD y4, t4, y4 nop FMUL a8, x3, t4 LDF [A1 + 15 * SIZE], a8 FADD y5, t1, y5 nop FMUL a9, x4, t1 LDF [A2 + 8 * SIZE], a9 FADD y6, t2, y6 nop FMUL a10, x4, t2 LDF [A2 + 9 * SIZE], a10 FADD y7, t3, y7 nop FMUL a11, x4, t3 LDF [A2 + 10 * SIZE], a11 FADD y8, t4, y8 nop FMUL a12, x4, t4 LDF [A2 + 11 * SIZE], a12 FADD y1, t1, y1 prefetch [A4 + PREFETCHSIZE * SIZE], 1 FMUL a13, x4, t1 LDF [A2 + 12 * SIZE], a13 FADD y2, t2, y2 add A3, 8 * SIZE, A3 FMUL a14, x4, t2 LDF [A2 + 13 * SIZE], a14 FADD y3, t3, y3 add Y1, 8 * SIZE, Y1 FMUL a15, x4, t3 LDF [A2 + 14 * SIZE], a15 FADD y4, t4, y4 deccc I FMUL a16, x4, t4 LDF [A2 + 15 * SIZE], a16 FADD y5, t1, y5 add A1, 8 * SIZE, A1 FMUL a1, x1, t1 LDF [A3 + 0 * SIZE], a1 FADD y6, t2, y6 add A2, 8 * SIZE, A2 FMUL a2, x1, t2 LDF [A3 + 1 * SIZE], a2 FADD y7, t3, y7 add A4, 8 * SIZE, A4 FMUL a3, x1, t3 LDF [A3 + 2 * SIZE], a3 FADD y8, t4, y8 nop FMUL a4, x1, t4 LDF [A3 + 3 * SIZE], a4 STF y1, [Y1 - 8 * SIZE] STF y2, [Y1 - 7 * SIZE] STF y3, [Y1 - 6 * SIZE] STF y4, [Y1 - 5 * SIZE] STF y5, [Y1 - 4 * SIZE] STF y6, [Y1 - 3 * SIZE] STF y7, [Y1 - 2 * SIZE] bg,pn %icc, .LL12 STF y8, [Y1 - 1 * SIZE] .LL13: LDF [Y1 + 0 * SIZE], y1 LDF [Y1 + 1 * SIZE], y2 LDF [Y1 + 2 * SIZE], y3 LDF [Y1 + 3 * SIZE], y4 LDF [Y1 + 4 * SIZE], y5 LDF [Y1 + 5 * SIZE], y6 LDF [Y1 + 6 * SIZE], y7 LDF [Y1 + 7 * SIZE], y8 FADD y1, t1, y1 
FMUL a5, x1, t1 LDF [A3 + 0 * SIZE], a1 FADD y2, t2, y2 FMUL a6, x1, t2 LDF [A3 + 1 * SIZE], a2 FADD y3, t3, y3 FMUL a7, x1, t3 LDF [A3 + 2 * SIZE], a3 FADD y4, t4, y4 FMUL a8, x1, t4 LDF [A3 + 3 * SIZE], a4 FADD y5, t1, y5 FMUL a9, x2, t1 LDF [A3 + 4 * SIZE], a5 FADD y6, t2, y6 FMUL a10, x2, t2 LDF [A3 + 5 * SIZE], a6 FADD y7, t3, y7 FMUL a11, x2, t3 LDF [A3 + 6 * SIZE], a7 FADD y8, t4, y8 FMUL a12, x2, t4 LDF [A3 + 7 * SIZE], a8 FADD y1, t1, y1 FMUL a13, x2, t1 LDF [A4 + 0 * SIZE], a9 FADD y2, t2, y2 FMUL a14, x2, t2 LDF [A4 + 1 * SIZE], a10 FADD y3, t3, y3 FMUL a15, x2, t3 LDF [A4 + 2 * SIZE], a11 FADD y4, t4, y4 FMUL a16, x2, t4 LDF [A4 + 3 * SIZE], a12 FADD y5, t1, y5 FMUL a1, x3, t1 LDF [A4 + 4 * SIZE], a13 FADD y6, t2, y6 FMUL a2, x3, t2 LDF [A4 + 5 * SIZE], a14 FADD y7, t3, y7 FMUL a3, x3, t3 LDF [A4 + 6 * SIZE], a15 FADD y8, t4, y8 FMUL a4, x3, t4 LDF [A4 + 7 * SIZE], a16 FADD y1, t1, y1 FMUL a5, x3, t1 FADD y2, t2, y2 FMUL a6, x3, t2 FADD y3, t3, y3 FMUL a7, x3, t3 FADD y4, t4, y4 FMUL a8, x3, t4 FADD y5, t1, y5 FMUL a9, x4, t1 FADD y6, t2, y6 FMUL a10, x4, t2 FADD y7, t3, y7 FMUL a11, x4, t3 FADD y8, t4, y8 FMUL a12, x4, t4 FADD y1, t1, y1 FMUL a13, x4, t1 FADD y2, t2, y2 FMUL a14, x4, t2 FADD y3, t3, y3 FMUL a15, x4, t3 FADD y4, t4, y4 FMUL a16, x4, t4 add A4, 8 * SIZE, A4 STF y1, [Y1 + 0 * SIZE] FADD y5, t1, y5 STF y2, [Y1 + 1 * SIZE] FADD y6, t2, y6 STF y3, [Y1 + 2 * SIZE] FADD y7, t3, y7 STF y4, [Y1 + 3 * SIZE] FADD y8, t4, y8 STF y5, [Y1 + 4 * SIZE] add A1, 8 * SIZE, A1 STF y6, [Y1 + 5 * SIZE] add A2, 8 * SIZE, A2 STF y7, [Y1 + 6 * SIZE] add A3, 8 * SIZE, A3 STF y8, [Y1 + 7 * SIZE] add Y1, 8 * SIZE, Y1 .LL16: andcc M, 4, I ble,pn %icc, .LL17 nop LDF [A1 + 0 * SIZE], a1 LDF [A1 + 1 * SIZE], a2 LDF [A1 + 2 * SIZE], a3 LDF [A1 + 3 * SIZE], a4 LDF [A2 + 0 * SIZE], a5 LDF [A2 + 1 * SIZE], a6 LDF [A2 + 2 * SIZE], a7 LDF [A2 + 3 * SIZE], a8 LDF [A3 + 0 * SIZE], a9 LDF [A3 + 1 * SIZE], a10 LDF [A3 + 2 * SIZE], a11 LDF [A3 + 3 * SIZE], a12 LDF [A4 + 0 * SIZE], a13 LDF [A4 + 1 * SIZE], a14 LDF [A4 + 2 * SIZE], a15 LDF [A4 + 3 * SIZE], a16 LDF [Y1 + 0 * SIZE], y1 add A1, 4 * SIZE, A1 LDF [Y1 + 1 * SIZE], y2 add A2, 4 * SIZE, A2 LDF [Y1 + 2 * SIZE], y3 add A3, 4 * SIZE, A3 LDF [Y1 + 3 * SIZE], y4 add A4, 4 * SIZE, A4 FMUL a1, x1, t1 FMUL a2, x1, t2 FMUL a3, x1, t3 FMUL a4, x1, t4 FADD y1, t1, y1 FMUL a5, x2, t1 FADD y2, t2, y2 FMUL a6, x2, t2 FADD y3, t3, y3 FMUL a7, x2, t3 FADD y4, t4, y4 FMUL a8, x2, t4 FADD y1, t1, y1 FMUL a9, x3, t1 FADD y2, t2, y2 FMUL a10, x3, t2 FADD y3, t3, y3 FMUL a11, x3, t3 FADD y4, t4, y4 FMUL a12, x3, t4 FADD y1, t1, y1 FMUL a13, x4, t1 FADD y2, t2, y2 FMUL a14, x4, t2 FADD y3, t3, y3 FMUL a15, x4, t3 FADD y4, t4, y4 FMUL a16, x4, t4 FADD y1, t1, y1 FADD y2, t2, y2 FADD y3, t3, y3 FADD y4, t4, y4 STF y1, [Y1 + 0 * SIZE] STF y2, [Y1 + 1 * SIZE] STF y3, [Y1 + 2 * SIZE] STF y4, [Y1 + 3 * SIZE] add Y1, 4 * SIZE, Y1 .LL17: andcc M, 2, I ble,pn %icc, .LL18 nop LDF [A1 + 0 * SIZE], a1 LDF [A2 + 0 * SIZE], a2 LDF [A3 + 0 * SIZE], a3 LDF [A4 + 0 * SIZE], a4 LDF [Y1 + 0 * SIZE], y1 LDF [A1 + 1 * SIZE], a5 LDF [A2 + 1 * SIZE], a6 LDF [A3 + 1 * SIZE], a7 LDF [A4 + 1 * SIZE], a8 LDF [Y1 + 1 * SIZE], y2 add A1, 2 * SIZE, A1 add A2, 2 * SIZE, A2 add A3, 2 * SIZE, A3 add A4, 2 * SIZE, A4 FMUL a1, x1, t1 FMUL a2, x2, t2 FMUL a3, x3, t3 FMUL a4, x4, t4 FADD y1, t1, y1 FMUL a5, x1, t1 FADD y1, t2, y1 FMUL a6, x2, t2 FADD y1, t3, y1 FMUL a7, x3, t3 FADD y1, t4, y1 FMUL a8, x4, t4 FADD y2, t1, y2 FADD y2, t2, y2 FADD y2, t3, y2 FADD y2, t4, y2 STF y1, [Y1 + 0 * SIZE] STF y2, 
[Y1 + 1 * SIZE] add Y1, 2 * SIZE, Y1 .LL18: andcc M, 1, I ble,pn %icc, .LL19 nop LDF [A1 + 0 * SIZE], a1 LDF [A2 + 0 * SIZE], a2 LDF [A3 + 0 * SIZE], a3 LDF [A4 + 0 * SIZE], a4 LDF [Y1 + 0 * SIZE], y1 FMUL a1, x1, t1 FMUL a2, x2, t2 FMUL a3, x3, t3 FMUL a4, x4, t4 FADD y1, t1, y1 FADD y1, t2, y1 FADD y1, t3, y1 FADD y1, t4, y1 STF y1, [Y1] .LL19: deccc J bg %icc, .LL11 nop .LL20: andcc N, 2, J ble,pn %icc, .LL30 nop .LL21: mov YY, Y1 mov A, A1 add A, LDA, A2 add A2, LDA, A LDF STACK_ALPHA, ALPHA LDF [X], x1 add X, INCX, X LDF [X], x2 add X, INCX, X FMUL ALPHA, x1, x1 FMUL ALPHA, x2, x2 sra M, 3, I cmp I, 0 ble,pn %icc, .LL26 nop LDF [Y1 + 0 * SIZE], y1 LDF [Y1 + 1 * SIZE], y2 LDF [Y1 + 2 * SIZE], y3 LDF [Y1 + 3 * SIZE], y4 LDF [Y1 + 4 * SIZE], y5 LDF [Y1 + 5 * SIZE], y6 LDF [Y1 + 6 * SIZE], y7 LDF [Y1 + 7 * SIZE], y8 LDF [A1 + 0 * SIZE], a1 LDF [A1 + 1 * SIZE], a2 LDF [A1 + 2 * SIZE], a3 LDF [A1 + 3 * SIZE], a4 LDF [A1 + 4 * SIZE], a5 LDF [A1 + 5 * SIZE], a6 LDF [A1 + 6 * SIZE], a7 LDF [A1 + 7 * SIZE], a8 LDF [A2 + 0 * SIZE], a9 LDF [A2 + 1 * SIZE], a10 LDF [A2 + 2 * SIZE], a11 LDF [A2 + 3 * SIZE], a12 LDF [A2 + 4 * SIZE], a13 LDF [A2 + 5 * SIZE], a14 LDF [A2 + 6 * SIZE], a15 LDF [A2 + 7 * SIZE], a16 FMUL a1, x1, t1 deccc I LDF [A1 + 8 * SIZE], a1 FMUL a2, x1, t2 LDF [A1 + 9 * SIZE], a2 FMUL a3, x1, t3 LDF [A1 + 10 * SIZE], a3 FMUL a4, x1, t4 ble,pn %icc, .LL23 LDF [A1 + 11 * SIZE], a4 .LL22: FADD y1, t1, y1 prefetch [A1 + PREFETCHSIZE * SIZE], 1 FMUL a5, x1, t1 LDF [A1 + 12 * SIZE], a5 FADD y2, t2, y2 FMUL a6, x1, t2 LDF [A1 + 13 * SIZE], a6 FADD y3, t3, y3 FMUL a7, x1, t3 LDF [A1 + 14 * SIZE], a7 FADD y4, t4, y4 FMUL a8, x1, t4 LDF [A1 + 15 * SIZE], a8 FADD y5, t1, y5 FMUL a9, x2, t1 LDF [A2 + 8 * SIZE], a9 FADD y6, t2, y6 FMUL a10, x2, t2 LDF [A2 + 9 * SIZE], a10 FADD y7, t3, y7 FMUL a11, x2, t3 LDF [A2 + 10 * SIZE], a11 FADD y8, t4, y8 FMUL a12, x2, t4 LDF [A2 + 11 * SIZE], a12 FADD y1, t1, y1 prefetch [A2 + PREFETCHSIZE * SIZE], 1 FMUL a13, x2, t1 LDF [A2 + 12 * SIZE], a13 FADD y2, t2, y2 FMUL a14, x2, t2 LDF [A2 + 13 * SIZE], a14 FADD y3, t3, y3 FMUL a15, x2, t3 LDF [A2 + 14 * SIZE], a15 FADD y4, t4, y4 FMUL a16, x2, t4 LDF [A2 + 15 * SIZE], a16 FADD y5, t1, y5 FMUL a1, x1, t1 LDF [A1 + 16 * SIZE], a1 FADD y6, t2, y6 FMUL a2, x1, t2 LDF [A1 + 17 * SIZE], a2 FADD y7, t3, y7 FMUL a3, x1, t3 LDF [A1 + 18 * SIZE], a3 FADD y8, t4, y8 FMUL a4, x1, t4 LDF [A1 + 19 * SIZE], a4 STF y1, [Y1 + 0 * SIZE] STF y2, [Y1 + 1 * SIZE] STF y3, [Y1 + 2 * SIZE] STF y4, [Y1 + 3 * SIZE] STF y5, [Y1 + 4 * SIZE] STF y6, [Y1 + 5 * SIZE] STF y7, [Y1 + 6 * SIZE] STF y8, [Y1 + 7 * SIZE] LDF [Y1 + 8 * SIZE], y1 add A1, 8 * SIZE, A1 LDF [Y1 + 9 * SIZE], y2 add A2, 8 * SIZE, A2 LDF [Y1 + 10 * SIZE], y3 deccc I LDF [Y1 + 11 * SIZE], y4 LDF [Y1 + 12 * SIZE], y5 LDF [Y1 + 13 * SIZE], y6 LDF [Y1 + 14 * SIZE], y7 LDF [Y1 + 15 * SIZE], y8 bg,pn %icc, .LL22 add Y1, 8 * SIZE, Y1 .LL23: FADD y1, t1, y1 FMUL a5, x1, t1 FADD y2, t2, y2 FMUL a6, x1, t2 FADD y3, t3, y3 FMUL a7, x1, t3 FADD y4, t4, y4 FMUL a8, x1, t4 FADD y5, t1, y5 FMUL a9, x2, t1 FADD y6, t2, y6 FMUL a10, x2, t2 FADD y7, t3, y7 FMUL a11, x2, t3 FADD y8, t4, y8 FMUL a12, x2, t4 FADD y1, t1, y1 FMUL a13, x2, t1 FADD y2, t2, y2 FMUL a14, x2, t2 FADD y3, t3, y3 FMUL a15, x2, t3 FADD y4, t4, y4 FMUL a16, x2, t4 STF y1, [Y1 + 0 * SIZE] FADD y5, t1, y5 STF y2, [Y1 + 1 * SIZE] FADD y6, t2, y6 STF y3, [Y1 + 2 * SIZE] FADD y7, t3, y7 STF y4, [Y1 + 3 * SIZE] FADD y8, t4, y8 STF y5, [Y1 + 4 * SIZE] add A1, 8 * SIZE, A1 STF y6, [Y1 + 5 * SIZE] add A2, 8 * SIZE, A2 STF y7, 
[Y1 + 6 * SIZE] nop STF y8, [Y1 + 7 * SIZE] add Y1, 8 * SIZE, Y1 .LL26: andcc M, 4, I ble,pn %icc, .LL27 nop LDF [A1 + 0 * SIZE], a1 LDF [A1 + 1 * SIZE], a2 LDF [A1 + 2 * SIZE], a3 LDF [A1 + 3 * SIZE], a4 LDF [A2 + 0 * SIZE], a5 LDF [A2 + 1 * SIZE], a6 LDF [A2 + 2 * SIZE], a7 LDF [A2 + 3 * SIZE], a8 LDF [Y1 + 0 * SIZE], y1 add A1, 4 * SIZE, A1 LDF [Y1 + 1 * SIZE], y2 add A2, 4 * SIZE, A2 LDF [Y1 + 2 * SIZE], y3 LDF [Y1 + 3 * SIZE], y4 FMUL a1, x1, t1 FMUL a2, x1, t2 FMUL a3, x1, t3 FMUL a4, x1, t4 FADD y1, t1, y1 FMUL a5, x2, t1 FADD y2, t2, y2 FMUL a6, x2, t2 FADD y3, t3, y3 FMUL a7, x2, t3 FADD y4, t4, y4 FMUL a8, x2, t4 FADD y1, t1, y1 FADD y2, t2, y2 FADD y3, t3, y3 FADD y4, t4, y4 STF y1, [Y1 + 0 * SIZE] STF y2, [Y1 + 1 * SIZE] STF y3, [Y1 + 2 * SIZE] STF y4, [Y1 + 3 * SIZE] add Y1, 4 * SIZE, Y1 .LL27: andcc M, 2, I ble,pn %icc, .LL28 nop LDF [A1 + 0 * SIZE], a1 LDF [A2 + 0 * SIZE], a2 LDF [Y1 + 0 * SIZE], y1 LDF [A1 + 1 * SIZE], a5 LDF [A2 + 1 * SIZE], a6 add A1, 2 * SIZE, A1 LDF [Y1 + 1 * SIZE], y2 add A2, 2 * SIZE, A2 FMUL a1, x1, t1 FMUL a2, x2, t2 FADD y1, t1, y1 FMUL a5, x1, t1 FADD y1, t2, y1 FMUL a6, x2, t2 FADD y2, t1, y2 FADD y2, t2, y2 STF y1, [Y1 + 0 * SIZE] STF y2, [Y1 + 1 * SIZE] add Y1, 2 * SIZE, Y1 .LL28: andcc M, 1, I ble,pn %icc, .LL30 nop LDF [A1 + 0 * SIZE], a1 LDF [A2 + 0 * SIZE], a2 LDF [Y1 + 0 * SIZE], y1 FMUL a1, x1, t1 FMUL a2, x2, t2 FADD y1, t1, y1 FADD y1, t2, y1 STF y1, [Y1] .LL30: andcc N, 1, J ble,pn %icc, .LL990 nop .LL31: mov YY, Y1 mov A, A1 LDF STACK_ALPHA, ALPHA LDF [X], x1 add X, INCX, X FMUL ALPHA, x1, x1 sra M, 3, I cmp I, 0 ble,pn %icc, .LL36 nop LDF [Y1 + 0 * SIZE], y1 LDF [Y1 + 1 * SIZE], y2 LDF [Y1 + 2 * SIZE], y3 LDF [Y1 + 3 * SIZE], y4 LDF [Y1 + 4 * SIZE], y5 LDF [Y1 + 5 * SIZE], y6 LDF [Y1 + 6 * SIZE], y7 LDF [Y1 + 7 * SIZE], y8 LDF [A1 + 0 * SIZE], a1 LDF [A1 + 1 * SIZE], a2 LDF [A1 + 2 * SIZE], a3 LDF [A1 + 3 * SIZE], a4 LDF [A1 + 4 * SIZE], a5 LDF [A1 + 5 * SIZE], a6 LDF [A1 + 6 * SIZE], a7 LDF [A1 + 7 * SIZE], a8 FMUL a1, x1, t1 deccc I LDF [A1 + 8 * SIZE], a1 FMUL a2, x1, t2 LDF [A1 + 9 * SIZE], a2 FMUL a3, x1, t3 LDF [A1 + 10 * SIZE], a3 FMUL a4, x1, t4 ble,pn %icc, .LL33 LDF [A1 + 11 * SIZE], a4 .LL32: FADD y1, t1, y1 prefetch [A1 + PREFETCHSIZE * SIZE], 1 FMUL a5, x1, t1 LDF [A1 + 12 * SIZE], a5 FADD y2, t2, y2 FMUL a6, x1, t2 LDF [A1 + 13 * SIZE], a6 FADD y3, t3, y3 FMUL a7, x1, t3 LDF [A1 + 14 * SIZE], a7 FADD y4, t4, y4 FMUL a8, x1, t4 LDF [A1 + 15 * SIZE], a8 FADD y5, t1, y5 FMUL a1, x1, t1 LDF [A1 + 16 * SIZE], a1 FADD y6, t2, y6 FMUL a2, x1, t2 LDF [A1 + 17 * SIZE], a2 FADD y7, t3, y7 FMUL a3, x1, t3 LDF [A1 + 18 * SIZE], a3 FADD y8, t4, y8 FMUL a4, x1, t4 LDF [A1 + 19 * SIZE], a4 STF y1, [Y1 + 0 * SIZE] STF y2, [Y1 + 1 * SIZE] STF y3, [Y1 + 2 * SIZE] STF y4, [Y1 + 3 * SIZE] STF y5, [Y1 + 4 * SIZE] STF y6, [Y1 + 5 * SIZE] STF y7, [Y1 + 6 * SIZE] STF y8, [Y1 + 7 * SIZE] LDF [Y1 + 8 * SIZE], y1 LDF [Y1 + 9 * SIZE], y2 LDF [Y1 + 10 * SIZE], y3 LDF [Y1 + 11 * SIZE], y4 LDF [Y1 + 12 * SIZE], y5 deccc I LDF [Y1 + 13 * SIZE], y6 add A1, 8 * SIZE, A1 LDF [Y1 + 14 * SIZE], y7 add Y1, 8 * SIZE, Y1 bg,pn %icc, .LL32 LDF [Y1 + 7 * SIZE], y8 .LL33: FADD y1, t1, y1 FMUL a5, x1, t1 FADD y2, t2, y2 FMUL a6, x1, t2 FADD y3, t3, y3 FMUL a7, x1, t3 FADD y4, t4, y4 FMUL a8, x1, t4 STF y1, [Y1 + 0 * SIZE] FADD y5, t1, y5 STF y2, [Y1 + 1 * SIZE] FADD y6, t2, y6 STF y3, [Y1 + 2 * SIZE] FADD y7, t3, y7 STF y4, [Y1 + 3 * SIZE] FADD y8, t4, y8 STF y5, [Y1 + 4 * SIZE] STF y6, [Y1 + 5 * SIZE] STF y7, [Y1 + 6 * SIZE] add A1, 8 * SIZE, A1 STF y8, [Y1 + 7 
* SIZE] add Y1, 8 * SIZE, Y1 .LL36: andcc M, 4, I ble,pn %icc, .LL37 nop LDF [A1 + 0 * SIZE], a1 LDF [A1 + 1 * SIZE], a2 LDF [A1 + 2 * SIZE], a3 LDF [A1 + 3 * SIZE], a4 LDF [Y1 + 0 * SIZE], y1 add A1, 4 * SIZE, A1 LDF [Y1 + 1 * SIZE], y2 LDF [Y1 + 2 * SIZE], y3 LDF [Y1 + 3 * SIZE], y4 FMUL a1, x1, t1 FMUL a2, x1, t2 FMUL a3, x1, t3 FMUL a4, x1, t4 FADD y1, t1, y1 FADD y2, t2, y2 FADD y3, t3, y3 FADD y4, t4, y4 STF y1, [Y1 + 0 * SIZE] STF y2, [Y1 + 1 * SIZE] STF y3, [Y1 + 2 * SIZE] STF y4, [Y1 + 3 * SIZE] add Y1, 4 * SIZE, Y1 .LL37: andcc M, 2, I ble,pn %icc, .LL38 nop LDF [A1 + 0 * SIZE], a1 LDF [Y1 + 0 * SIZE], y1 LDF [A1 + 1 * SIZE], a5 LDF [Y1 + 1 * SIZE], y2 add A1, 2 * SIZE, A1 FMUL a1, x1, t1 FADD y1, t1, y1 FMUL a5, x1, t1 FADD y2, t1, y2 STF y1, [Y1 + 0 * SIZE] STF y2, [Y1 + 1 * SIZE] add Y1, 2 * SIZE, Y1 .LL38: andcc M, 1, I ble,pn %icc, .LL990 nop LDF [A1 + 0 * SIZE], a1 LDF [Y1 + 0 * SIZE], y1 FMUL a1, x1, t1 FADD y1, t1, y1 STF y1, [Y1] .LL990: cmp INCY, SIZE be %icc, .LL999 mov Y, Y1 sra M, 3, I cmp I, 0 ble,pn %icc, .LL995 nop .LL991: LDF [BUFFER + 0 * SIZE], a1 LDF [Y], y1 add Y, INCY, Y LDF [BUFFER + 1 * SIZE], a2 LDF [Y], y2 add Y, INCY, Y LDF [BUFFER + 2 * SIZE], a3 LDF [Y], y3 add Y, INCY, Y LDF [BUFFER + 3 * SIZE], a4 LDF [Y], y4 add Y, INCY, Y LDF [BUFFER + 4 * SIZE], a5 FADD y1, a1, y1 LDF [Y], y5 add Y, INCY, Y LDF [BUFFER + 5 * SIZE], a6 FADD y2, a2, y2 LDF [Y], y6 add Y, INCY, Y LDF [BUFFER + 6 * SIZE], a7 FADD y3, a3, y3 LDF [Y], y7 add Y, INCY, Y LDF [BUFFER + 7 * SIZE], a8 FADD y4, a4, y4 LDF [Y], y8 add Y, INCY, Y STF y1, [Y1] FADD y5, a5, y5 add Y1, INCY, Y1 STF y2, [Y1] FADD y6, a6, y6 add Y1, INCY, Y1 STF y3, [Y1] FADD y7, a7, y7 add Y1, INCY, Y1 STF y4, [Y1] FADD y8, a8, y8 add Y1, INCY, Y1 STF y5, [Y1] add Y1, INCY, Y1 STF y6, [Y1] add Y1, INCY, Y1 STF y7, [Y1] add Y1, INCY, Y1 STF y8, [Y1] add Y1, INCY, Y1 deccc I bg,pn %icc, .LL991 add BUFFER, 8 * SIZE, BUFFER .LL995: andcc M, 7, I ble,pn %icc, .LL999 nop andcc M, 4, I ble,pn %icc, .LL996 nop LDF [BUFFER + 0 * SIZE], a1 LDF [BUFFER + 1 * SIZE], a2 LDF [BUFFER + 2 * SIZE], a3 LDF [BUFFER + 3 * SIZE], a4 add BUFFER, 4 * SIZE, BUFFER LDF [Y], y1 add Y, INCY, Y LDF [Y], y2 add Y, INCY, Y LDF [Y], y3 add Y, INCY, Y LDF [Y], y4 add Y, INCY, Y FADD y1, a1, y1 FADD y2, a2, y2 FADD y3, a3, y3 FADD y4, a4, y4 STF y1, [Y1] add Y1, INCY, Y1 STF y2, [Y1] add Y1, INCY, Y1 STF y3, [Y1] add Y1, INCY, Y1 STF y4, [Y1] add Y1, INCY, Y1 .LL996: andcc M, 2, I ble,pn %icc, .LL997 nop LDF [BUFFER + 0 * SIZE], a1 LDF [BUFFER + 1 * SIZE], a2 add BUFFER, 2 * SIZE, BUFFER LDF [Y], y1 add Y, INCY, Y LDF [Y], y2 add Y, INCY, Y FADD y1, a1, y1 FADD y2, a2, y2 STF y1, [Y1] add Y1, INCY, Y1 STF y2, [Y1] add Y1, INCY, Y1 .LL997: andcc M, 1, I ble,pn %icc, .LL999 nop LDF [BUFFER + 0 * SIZE], a1 LDF [Y], y1 FADD y1, a1, y1 STF y1, [Y1] .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/gemv_t.S000066400000000000000000000313351313527062700171460ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define P 1020 #define M %i0 #define N %i1 #if defined(DOUBLE) && !defined(__64BIT__) #define A %i5 #define LDA %i2 #define X %i3 #define INCX %i4 #else #define A %i4 #define LDA %i5 #define X %i2 #define INCX %i3 #endif #define Y %l0 #define INCY %l1 #define BUFFER %l2 #define I %l3 #define IS %l4 #define J %l5 #define MIN_M %l6 #define XP %l7 #define A1 %o0 #define A2 %o1 #define A3 %o2 #define A4 %o3 #define X1 %o4 #define Y1 %o5 #define PNLDA %g1 #define Y2 %o7 /* Danger? 
*/ #ifdef DOUBLE #define t1 %f0 #define t2 %f2 #define t3 %f4 #define t4 %f6 #define c1 %f8 #define c2 %f10 #define c3 %f12 #define c4 %f14 #define a1 %f16 #define a2 %f18 #define a3 %f20 #define a4 %f22 #define a5 %f24 #define a6 %f26 #define a7 %f28 #define a8 %f30 #define a9 %f32 #define a10 %f34 #define a11 %f36 #define a12 %f38 #define a13 %f40 #define a14 %f42 #define a15 %f44 #define a16 %f46 #define b1 %f48 #define b2 %f50 #define b3 %f52 #define b4 %f54 #define b5 %f56 #define b6 %f58 #define b7 %f60 #define b8 %f62 #define FZERO %f60 #define ALPHA %f62 #else #define t1 %f0 #define t2 %f1 #define t3 %f2 #define t4 %f3 #define c1 %f4 #define c2 %f5 #define c3 %f6 #define c4 %f7 #define a1 %f8 #define a2 %f9 #define a3 %f10 #define a4 %f11 #define a5 %f12 #define a6 %f13 #define a7 %f14 #define a8 %f15 #define a9 %f16 #define a10 %f17 #define a11 %f18 #define a12 %f19 #define a13 %f20 #define a14 %f21 #define a15 %f22 #define a16 %f23 #define b1 %f24 #define b2 %f25 #define b3 %f26 #define b4 %f27 #define b5 %f28 #define b6 %f29 #define b7 %f30 #define b8 %f31 #define FZERO %f30 #define ALPHA %f31 #endif #ifndef __64BIT__ #define STACK_FZERO [%sp + STACK_START + 8] #define STACK_ALPHA [%sp + STACK_START + 16] #else #define STACK_FZERO [%sp + STACK_START + 32] #define STACK_ALPHA [%sp + STACK_START + 40] #endif #ifdef DOUBLE #define PREFETCHSIZE 36 #else #define PREFETCHSIZE 72 #endif PROLOGUE SAVESP nop #ifndef __64BIT__ #ifdef DOUBLE st %i3, [%sp + STACK_START + 16] /* ALPHA */ st %i4, [%sp + STACK_START + 20] ld [%sp + STACK_START + 28], LDA ld [%sp + STACK_START + 32], X ld [%sp + STACK_START + 36], INCX ld [%sp + STACK_START + 40], Y ld [%sp + STACK_START + 44], INCY ld [%sp + STACK_START + 48], BUFFER #else st %i3, [%sp + STACK_START + 16] /* ALPHA */ ld [%sp + STACK_START + 28], X ld [%sp + STACK_START + 32], INCX ld [%sp + STACK_START + 36], Y ld [%sp + STACK_START + 40], INCY ld [%sp + STACK_START + 44], BUFFER #endif LDF [%sp + STACK_START + 16], ALPHA #else ldx [%sp+ STACK_START + 56], X ldx [%sp+ STACK_START + 64], INCX ldx [%sp+ STACK_START + 72], Y ldx [%sp+ STACK_START + 80], INCY ldx [%sp+ STACK_START + 88], BUFFER #ifdef DOUBLE FMOV %f6, ALPHA STF %f6, STACK_ALPHA #else FMOV %f7, ALPHA STF %f7, STACK_ALPHA #endif #endif #ifdef DOUBLE FCLR(29) #else FCLR(30) #endif clr IS mov P, I sll LDA, BASE_SHIFT, LDA sll I, BASE_SHIFT, I smul LDA, N, PNLDA sll INCX, BASE_SHIFT, INCX sll INCY, BASE_SHIFT, INCY sub I, PNLDA, PNLDA .LL10: sll IS, BASE_SHIFT, I sub M, IS, MIN_M cmp MIN_M, P nop movg %icc, P, MIN_M nop cmp INCX, SIZE beq .LL100 add X, I, XP sra MIN_M, 2, I mov BUFFER, XP cmp I, 0 ble,pn %icc, .LL15 mov BUFFER, Y1 .LL11: LDF [X], a1 add X, INCX, X LDF [X], a2 add X, INCX, X LDF [X], a3 add X, INCX, X LDF [X], a4 add X, INCX, X STF a1, [Y1 + 0 * SIZE] add I, -1, I STF a2, [Y1 + 1 * SIZE] cmp I, 0 STF a3, [Y1 + 2 * SIZE] STF a4, [Y1 + 3 * SIZE] bg,pn %icc, .LL11 add Y1, 4 * SIZE, Y1 .LL15: and MIN_M, 3, I cmp I, 0 ble,pn %icc, .LL100 nop .LL16: LDF [X], a1 add X, INCX, X add I, -1, I cmp I, 0 nop STF a1, [Y1] bg,pn %icc, .LL16 add Y1, 1 * SIZE, Y1 .LL100: sra N, 1, J cmp J, 0 ble %icc, .LL200 mov Y, Y1 .LL110: #ifdef DOUBLE FCLR(29) #else FCLR(30) #endif FMOV FZERO, c1 FMOV FZERO, c2 FMOV FZERO, c3 FMOV FZERO, c4 FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 mov A, A1 add A, LDA, A2 add A2, LDA, A mov XP, X1 sra MIN_M, 3, I cmp I, 0 ble %icc, .LL115 prefetch [Y1 + 2 * SIZE], 0 LDF [A1 + 0 * SIZE], a1 deccc I LDF [A1 + 1 * SIZE], a2 LDF [A1 + 2 * SIZE], a3 
LDF [A1 + 3 * SIZE], a4 LDF [A1 + 4 * SIZE], a5 LDF [A1 + 5 * SIZE], a6 LDF [A1 + 6 * SIZE], a7 LDF [A1 + 7 * SIZE], a8 LDF [A2 + 0 * SIZE], a9 LDF [A2 + 1 * SIZE], a10 LDF [A2 + 2 * SIZE], a11 LDF [A2 + 3 * SIZE], a12 LDF [A2 + 4 * SIZE], a13 LDF [A2 + 5 * SIZE], a14 LDF [A2 + 6 * SIZE], a15 LDF [A2 + 7 * SIZE], a16 LDF [X1 + 0 * SIZE], b1 LDF [X1 + 1 * SIZE], b2 LDF [X1 + 2 * SIZE], b3 LDF [X1 + 3 * SIZE], b4 LDF [X1 + 4 * SIZE], b5 LDF [X1 + 5 * SIZE], b6 ble %icc, .LL112 LDF [X1 + 6 * SIZE], b7 .LL111: FADD c1, t1, c1 prefetch [A1 + PREFETCHSIZE * SIZE], 1 FMUL a1, b1, t1 LDF [A1 + 8 * SIZE], a1 FADD c2, t2, c2 LDF [X1 + 7 * SIZE], b8 FMUL a9, b1, t2 LDF [A2 + 8 * SIZE], a9 FADD c3, t3, c3 LDF [X1 + 8 * SIZE], b1 FMUL a2, b2, t3 LDF [A1 + 9 * SIZE], a2 FADD c4, t4, c4 deccc I FMUL a10, b2, t4 LDF [A2 + 9 * SIZE], a10 FADD c1, t1, c1 LDF [X1 + 9 * SIZE], b2 FMUL a3, b3, t1 LDF [A1 + 10 * SIZE], a3 FADD c2, t2, c2 nop FMUL a11, b3, t2 LDF [A2 + 10 * SIZE], a11 FADD c3, t3, c3 LDF [X1 + 10 * SIZE], b3 FMUL a4, b4, t3 LDF [A1 + 11 * SIZE], a4 FADD c4, t4, c4 nop FMUL a12, b4, t4 LDF [A2 + 11 * SIZE], a12 FADD c1, t1, c1 LDF [X1 + 11 * SIZE], b4 FMUL a5, b5, t1 LDF [A1 + 12 * SIZE], a5 FADD c2, t2, c2 prefetch [A2 + (PREFETCHSIZE + 4) * SIZE], 1 FMUL a13, b5, t2 LDF [A2 + 12 * SIZE], a13 FADD c3, t3, c3 LDF [X1 + 12 * SIZE], b5 FMUL a6, b6, t3 LDF [A1 + 13 * SIZE], a6 FADD c4, t4, c4 FMUL a14, b6, t4 LDF [A2 + 13 * SIZE], a14 FADD c1, t1, c1 LDF [X1 + 13 * SIZE], b6 FMUL a7, b7, t1 LDF [A1 + 14 * SIZE], a7 FADD c2, t2, c2 add X1, 8 * SIZE, X1 FMUL a15, b7, t2 LDF [A2 + 14 * SIZE], a15 FADD c3, t3, c3 LDF [X1 + 6 * SIZE], b7 FMUL a8, b8, t3 LDF [A1 + 15 * SIZE], a8 FADD c4, t4, c4 add A1, 8 * SIZE, A1 FMUL a16, b8, t4 LDF [A2 + 15 * SIZE], a16 bg,pn %icc, .LL111 add A2, 8 * SIZE, A2 .LL112: FADD c1, t1, c1 LDF [X1 + 7 * SIZE], b8 FMUL a1, b1, t1 add A1, 8 * SIZE, A1 FADD c2, t2, c2 add A2, 8 * SIZE, A2 FMUL a9, b1, t2 add X1, 8 * SIZE, X1 FADD c3, t3, c3 FMUL a2, b2, t3 FADD c4, t4, c4 FMUL a10, b2, t4 FADD c1, t1, c1 FMUL a3, b3, t1 FADD c2, t2, c2 FMUL a11, b3, t2 FADD c3, t3, c3 FMUL a4, b4, t3 FADD c4, t4, c4 FMUL a12, b4, t4 FADD c1, t1, c1 FMUL a5, b5, t1 FADD c2, t2, c2 FMUL a13, b5, t2 FADD c3, t3, c3 FMUL a6, b6, t3 FADD c4, t4, c4 FMUL a14, b6, t4 FADD c1, t1, c1 FMUL a7, b7, t1 FADD c2, t2, c2 FMUL a15, b7, t2 FADD c3, t3, c3 FMUL a8, b8, t3 FADD c4, t4, c4 FMUL a16, b8, t4 .LL115: andcc MIN_M, 7, I ble %icc, .LL119 mov Y1, Y2 LDF [X1 + 0 * SIZE], b1 deccc I LDF [A1 + 0 * SIZE], a1 ble %icc, .LL117 LDF [A2 + 0 * SIZE], a2 .LL116: FADD c1, t1, c1 add X1, 1 * SIZE, X1 FMUL a1, b1, t1 LDF [A1 + 1 * SIZE], a1 FADD c2, t2, c2 add A1, 1 * SIZE, A1 FMUL a2, b1, t2 LDF [X1 + 0 * SIZE], b1 add A2, 1 * SIZE, A2 deccc I bg,pn %icc, .LL116 LDF [A2 + 0 * SIZE], a2 .LL117: FADD c1, t1, c1 add X1, 1 * SIZE, X1 FADD c2, t2, c2 add A1, 1 * SIZE, A1 FMUL a1, b1, t1 add A2, 1 * SIZE, A2 FMUL a2, b1, t2 nop .LL119: FADD c1, t1, c1 FADD c2, t2, c2 FADD c3, t3, c3 FADD c4, t4, c4 FADD c1, c3, c1 FADD c2, c4, c2 LDF [Y1], a1 LDF [Y1 + INCY], a2 add Y1, INCY, Y1 add Y1, INCY, Y1 LDF STACK_ALPHA, ALPHA FMUL ALPHA, c1, c1 FMUL ALPHA, c2, c2 FADD a1, c1, a1 FADD a2, c2, a2 STF a1, [Y2] STF a2, [Y2 + INCY] deccc J bg %icc, .LL110 #ifdef DOUBLE FCLR(29) #else FCLR(30) #endif .LL200: andcc N, 1, J nop ble %icc, .LL400 FMOV FZERO, c1 .LL310: FMOV FZERO, t1 sra MIN_M, 3, I FMOV FZERO, c2 mov A, A1 FMOV FZERO, t2 add A, LDA, A FMOV FZERO, t3 cmp I, 0 FMOV FZERO, t4 ble %icc, .LL315 mov XP, X1 LDF [A1 + 0 * SIZE], 
a1 LDF [A1 + 1 * SIZE], a2 LDF [A1 + 2 * SIZE], a3 LDF [A1 + 3 * SIZE], a4 LDF [A1 + 4 * SIZE], a5 LDF [A1 + 5 * SIZE], a6 LDF [A1 + 6 * SIZE], a7 LDF [A1 + 7 * SIZE], a8 add A1, 8 * SIZE, A1 LDF [X1 + 0 * SIZE], a9 add I, -1, I LDF [X1 + 1 * SIZE], a10 cmp I, 0 LDF [X1 + 2 * SIZE], a11 LDF [X1 + 3 * SIZE], a12 LDF [X1 + 4 * SIZE], a13 LDF [X1 + 5 * SIZE], a14 LDF [X1 + 6 * SIZE], a15 LDF [X1 + 7 * SIZE], a16 ble %icc, .LL312 add X1, 8 * SIZE, X1 .LL311: prefetch [A1 + PREFETCHSIZE * SIZE], 1 FADD c1, t1, c1 FMUL a1, a9, t1 LDF [A1 + 0 * SIZE], a1 LDF [X1 + 0 * SIZE], a9 FADD c2, t2, c2 FMUL a2, a10, t2 LDF [A1 + 1 * SIZE], a2 LDF [X1 + 1 * SIZE], a10 FADD c1, t3, c1 add I, -1, I FMUL a3, a11, t3 LDF [A1 + 2 * SIZE], a3 LDF [X1 + 2 * SIZE], a11 FADD c2, t4, c2 cmp I, 0 FMUL a4, a12, t4 LDF [A1 + 3 * SIZE], a4 LDF [X1 + 3 * SIZE], a12 FADD c1, t1, c1 nop FMUL a5, a13, t1 LDF [A1 + 4 * SIZE], a5 LDF [X1 + 4 * SIZE], a13 FADD c2, t2, c2 nop FMUL a6, a14, t2 LDF [A1 + 5 * SIZE], a6 LDF [X1 + 5 * SIZE], a14 FADD c1, t3, c1 FMUL a7, a15, t3 LDF [A1 + 6 * SIZE], a7 LDF [X1 + 6 * SIZE], a15 FADD c2, t4, c2 add X1, 8 * SIZE, X1 FMUL a8, a16, t4 LDF [A1 + 7 * SIZE], a8 add A1, 8 * SIZE, A1 bg,pn %icc, .LL311 LDF [X1 - 1 * SIZE], a16 .LL312: FADD c1, t1, c1 FMUL a1, a9, t1 FADD c2, t2, c2 FMUL a2, a10, t2 FADD c1, t3, c1 FMUL a3, a11, t3 FADD c2, t4, c2 FMUL a4, a12, t4 FADD c1, t1, c1 FMUL a5, a13, t1 FADD c2, t2, c2 FMUL a6, a14, t2 FADD c1, t3, c1 FMUL a7, a15, t3 FADD c2, t4, c2 FMUL a8, a16, t4 .LL315: and MIN_M, 7, I cmp I, 0 ble %icc, .LL319 nop .LL316: LDF [A1 + 0 * SIZE], a1 add A1, 1 * SIZE, A1 LDF [X1 + 0 * SIZE], b1 nop FADD c1, t1, c1 nop add I, -1, I FMUL a1, b1, t1 nop cmp I, 0 bg,pn %icc, .LL316 add X1, 1 * SIZE, X1 .LL319: FADD c1, t1, c1 nop FADD c2, t2, c2 nop FADD c1, t3, c1 FADD c2, t4, c2 FADD c1, c2, c1 FMUL ALPHA, c1, c1 LDF [Y1 + 0 * SIZE], a1 FADD a1, c1, a1 STF a1, [Y1 + 0 * SIZE] add Y1, INCY, Y1 .LL400: add IS, P, IS cmp IS, M bl %icc, .LL10 add A, PNLDA, A .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/ger.S000066400000000000000000000220711313527062700164370ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %i0 #define N %i1 #if defined(DOUBLE) && !defined(__64BIT__) #define X %i5 #define INCX %i2 #define Y %i3 #define INCY %i4 #else #define X %i4 #define INCX %i5 #define Y %i2 #define INCY %i3 #endif #define A %l0 #define LDA %l1 #define BUFFER %l2 #define I %l3 #define J %l4 #define A1 %o0 #define X1 %o2 #define XX %o3 #ifdef DOUBLE #define t1 %f0 #define t2 %f2 #define t3 %f4 #define t4 %f6 #define x1 %f8 #define x2 %f10 #define x3 %f12 #define x4 %f14 #define x5 %f16 #define x6 %f18 #define x7 %f20 #define x8 %f22 #define a1 %f24 #define a2 %f26 #define a3 %f28 #define a4 %f30 #define a5 %f32 #define a6 %f34 #define a7 %f36 #define a8 %f38 #define a9 %f40 #define a10 %f42 #define a11 %f44 #define a12 %f46 #define a13 %f48 #define a14 %f50 #define a15 %f52 #define a16 %f54 #define y1 %f56 #define y2 %f58 #define ALPHA %f60 #else #define t1 %f0 #define t2 %f1 #define t3 %f2 #define t4 %f3 #define x1 %f4 #define x2 %f5 #define x3 %f6 #define x4 %f7 #define x5 %f8 #define x6 %f9 #define x7 %f10 #define x8 %f11 #define a1 %f12 #define a2 %f13 #define a3 %f14 #define a4 %f15 #define a5 %f16 #define a6 %f17 #define a7 %f18 #define a8 %f19 #define a9 %f20 #define a10 %f21 #define a11 %f22 #define a12 %f23 #define a13 %f24 #define a14 %f25 #define a15 %f26 #define a16 %f27 #define y1 %f28 #define y2 %f29 #define ALPHA %f30 #endif #define PREFETCHSIZE 60 PROLOGUE SAVESP nop #ifndef __64BIT__ #ifdef DOUBLE st %i3, [%sp + STACK_START + 16] st %i4, [%sp + STACK_START + 20] ld [%sp + STACK_START + 28], INCX ld [%sp + STACK_START + 32], Y ld [%sp + STACK_START + 36], INCY ld [%sp + STACK_START + 40], A ld [%sp + STACK_START + 44], LDA ld [%sp + STACK_START + 48], BUFFER #else st %i3, [%sp + STACK_START + 16] ld [%sp + STACK_START + 28], Y ld [%sp + STACK_START + 32], INCY ld [%sp + STACK_START + 36], A ld [%sp + STACK_START + 40], LDA ld [%sp + STACK_START + 44], BUFFER #endif LDF [%sp + STACK_START + 16], ALPHA #else ldx [%sp + STACK_START + 56], Y ldx [%sp + STACK_START + 64], INCY ldx [%sp + STACK_START + 72], A ldx [%sp + STACK_START + 80], LDA ldx [%sp + STACK_START + 88], BUFFER #ifdef DOUBLE FMOV %f6, ALPHA #else FMOV %f7, ALPHA #endif #endif sll LDA, BASE_SHIFT, LDA cmp M, 0 ble %icc, .LL999 sll INCX, BASE_SHIFT, INCX cmp N, 0 ble %icc, .LL999 sll INCY, BASE_SHIFT, INCY cmp INCX, SIZE be %icc, .LL10 mov X, XX mov BUFFER, XX mov BUFFER, X1 sra M, 3, J cmp J, 0 ble,pn %icc, .LL05 nop .LL01: LDF [X], a1 add X, INCX, X LDF [X], a2 add X, INCX, X LDF [X], a3 add X, INCX, X LDF [X], a4 add X, INCX, X LDF [X], a5 add X, INCX, X LDF [X], a6 add X, INCX, X LDF 
[X], a7 add X, INCX, X LDF [X], a8 add X, INCX, X STF a1, [X1 + 0 * SIZE] STF a2, [X1 + 1 * SIZE] STF a3, [X1 + 2 * SIZE] STF a4, [X1 + 3 * SIZE] STF a5, [X1 + 4 * SIZE] STF a6, [X1 + 5 * SIZE] STF a7, [X1 + 6 * SIZE] STF a8, [X1 + 7 * SIZE] add X1, 8 * SIZE, X1 deccc J bg,pn %icc, .LL01 nop .LL05: andcc M, 7, J ble,pn %icc, .LL10 nop .LL06: LDF [X], a1 add X, INCX, X STF a1, [X1 + 0 * SIZE] add X1, 1 * SIZE, X1 deccc J bg,pn %icc, .LL06 nop .LL10: mov N, J cmp N, 0 ble,pn %icc, .LL999 nop .LL11: mov XX, X1 mov A, A1 add A, LDA, A LDF [Y], y1 add Y, INCY, Y FMUL ALPHA, y1, y1 sra M, 3, I cmp I, 0 ble,pn %icc, .LL15 nop LDF [X1 + 0 * SIZE], x1 LDF [A1 + 0 * SIZE], a1 LDF [X1 + 1 * SIZE], x2 LDF [A1 + 1 * SIZE], a2 LDF [X1 + 2 * SIZE], x3 LDF [A1 + 2 * SIZE], a3 LDF [X1 + 3 * SIZE], x4 LDF [A1 + 3 * SIZE], a4 LDF [X1 + 4 * SIZE], x5 LDF [A1 + 4 * SIZE], a5 LDF [X1 + 5 * SIZE], x6 LDF [A1 + 5 * SIZE], a6 LDF [X1 + 6 * SIZE], x7 LDF [A1 + 6 * SIZE], a7 LDF [X1 + 7 * SIZE], x8 LDF [A1 + 7 * SIZE], a8 FMUL x1, y1, t1 FMUL x2, y1, t2 FMUL x3, y1, t3 FMUL x4, y1, t4 FADD a1, t1, a1 FMUL x5, y1, t1 FADD a2, t2, a2 FMUL x6, y1, t2 deccc I ble,pn %icc, .LL13 nop .LL12: prefetch [A1 + PREFETCHSIZE * SIZE], 0 FADD a3, t3, a3 LDF [X1 + 8 * SIZE], x1 FMUL x7, y1, t3 LDF [X1 + 9 * SIZE], x2 FADD a4, t4, a4 LDF [X1 + 10 * SIZE], x3 FMUL x8, y1, t4 LDF [X1 + 11 * SIZE], x4 FADD a5, t1, a5 STF a1, [A1 + 0 * SIZE] LDF [A1 + 8 * SIZE], a1 FMUL x1, y1, t1 STF a2, [A1 + 1 * SIZE] LDF [A1 + 9 * SIZE], a2 FADD a6, t2, a6 STF a3, [A1 + 2 * SIZE] LDF [A1 + 10 * SIZE], a3 FMUL x2, y1, t2 STF a4, [A1 + 3 * SIZE] LDF [A1 + 11 * SIZE], a4 FADD a7, t3, a7 LDF [X1 + 12 * SIZE], x5 FMUL x3, y1, t3 LDF [X1 + 13 * SIZE], x6 FADD a8, t4, a8 LDF [X1 + 14 * SIZE], x7 FMUL x4, y1, t4 LDF [X1 + 15 * SIZE], x8 FADD a1, t1, a1 STF a5, [A1 + 4 * SIZE] deccc I LDF [A1 + 12 * SIZE], a5 FMUL x5, y1, t1 STF a6, [A1 + 5 * SIZE] LDF [A1 + 13 * SIZE], a6 FADD a2, t2, a2 STF a7, [A1 + 6 * SIZE] LDF [A1 + 14 * SIZE], a7 FMUL x6, y1, t2 STF a8, [A1 + 7 * SIZE] LDF [A1 + 15 * SIZE], a8 add A1, 8 * SIZE, A1 bg,pn %icc, .LL12 add X1, 8 * SIZE, X1 .LL13: FADD a3, t3, a3 FMUL x7, y1, t3 FADD a4, t4, a4 FMUL x8, y1, t4 FADD a5, t1, a5 FADD a6, t2, a6 FADD a7, t3, a7 FADD a8, t4, a8 STF a1, [A1 + 0 * SIZE] STF a2, [A1 + 1 * SIZE] STF a3, [A1 + 2 * SIZE] STF a4, [A1 + 3 * SIZE] STF a5, [A1 + 4 * SIZE] STF a6, [A1 + 5 * SIZE] STF a7, [A1 + 6 * SIZE] STF a8, [A1 + 7 * SIZE] add A1, 8 * SIZE, A1 add X1, 8 * SIZE, X1 .LL15: andcc M, 4, I ble,pn %icc, .LL16 nop LDF [X1 + 0 * SIZE], x1 LDF [A1 + 0 * SIZE], a1 LDF [X1 + 1 * SIZE], x2 LDF [A1 + 1 * SIZE], a2 LDF [X1 + 2 * SIZE], x3 LDF [A1 + 2 * SIZE], a3 LDF [X1 + 3 * SIZE], x4 LDF [A1 + 3 * SIZE], a4 FMUL x1, y1, t1 FMUL x2, y1, t2 FMUL x3, y1, t3 FMUL x4, y1, t4 FADD a1, t1, a1 FADD a2, t2, a2 FADD a3, t3, a3 FADD a4, t4, a4 STF a1, [A1 + 0 * SIZE] STF a2, [A1 + 1 * SIZE] STF a3, [A1 + 2 * SIZE] add X1, 4 * SIZE, X1 STF a4, [A1 + 3 * SIZE] add A1, 4 * SIZE, A1 .LL16: andcc M, 2, I ble,pn %icc, .LL17 nop LDF [X1 + 0 * SIZE], x1 LDF [X1 + 1 * SIZE], x2 LDF [A1 + 0 * SIZE], a1 LDF [A1 + 1 * SIZE], a2 FMUL x1, y1, t1 FMUL x2, y1, t2 FADD a1, t1, a1 FADD a2, t2, a2 STF a1, [A1 + 0 * SIZE] add X1, 2 * SIZE, X1 STF a2, [A1 + 1 * SIZE] add A1, 2 * SIZE, A1 .LL17: andcc M, 1, I ble,pn %icc, .LL19 nop LDF [X1 + 0 * SIZE], x1 add X1, 1 * SIZE, X1 LDF [A1 + 0 * SIZE], a1 FMUL x1, y1, t1 FADD a1, t1, a1 STF a1, [A1 + 0 * SIZE] add A1, 1 * SIZE, A1 .LL19: deccc J bg %icc, .LL11 nop .LL999: return %i7 + 8 clr %o0 
EPILOGUE OpenBLAS-0.2.20/kernel/sparc/iamax.S000066400000000000000000000214001313527062700167540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N %i0 #define X %i1 #define INCX %i2 #define I %i3 #define v1 %o0 #define v2 %o1 #define v3 %o2 #define v4 %o3 #define count %o4 #ifdef DOUBLE #define c1 %f0 #define c2 %f2 #define c3 %f4 #define c4 %f6 #define t1 %f8 #define t2 %f10 #define t3 %f12 #define t4 %f14 #define a1 %f16 #define a2 %f18 #define a3 %f20 #define a4 %f22 #define a5 %f24 #define a6 %f26 #define a7 %f28 #define a8 %f30 #else #define c1 %f0 #define c2 %f1 #define c3 %f2 #define c4 %f3 #define t1 %f4 #define t2 %f5 #define t3 %f6 #define t4 %f7 #define a1 %f8 #define a2 %f9 #define a3 %f10 #define a4 %f11 #define a5 %f12 #define a6 %f13 #define a7 %f14 #define a8 %f15 #endif #ifndef USE_MIN #define FCMOV FMOVG #define CMOV movg #else #define FCMOV FMOVL #define CMOV movl #endif PROLOGUE SAVESP FCLR(0) cmp N, 0 ble .LL20 clr v1 cmp INCX, 0 ble .LL20 sll INCX, BASE_SHIFT, INCX mov 1, v1 add N, -1, N LDF [X], c4 add X, INCX, X cmp N, 0 ble .LL20 FABS c4, c1 FABS c4, c2 mov 1, v2 FABS c4, c3 mov 1, v3 FABS c4, c4 mov 1, v4 mov 2, count cmp INCX, SIZE bne .LL50 nop sra N, 3, I cmp I, 0 ble,pn %icc, .LL15 nop LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 LDF [X + 2 * SIZE], a3 LDF [X + 3 * SIZE], a4 LDF [X + 4 * SIZE], a5 add I, -1, I LDF [X + 5 * SIZE], a6 cmp I, 0 LDF [X + 6 * SIZE], a7 LDF [X + 7 * SIZE], a8 ble,pt %icc, .LL12 add X, 8 * SIZE, X #define PREFETCHSIZE 40 .LL11: FABS a1, t1 prefetch [X + PREFETCHSIZE * SIZE], 0 FABS a2, t2 LDF [X + 0 * SIZE], a1 FABS a3, t3 LDF [X + 1 * SIZE], a2 FABS a4, t4 LDF [X + 2 * SIZE], a3 FCMP %fcc0, t1, c1 LDF [X + 3 * SIZE], a4 FCMP %fcc1, t2, c2 nop FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FCMOV %fcc0, t1, c1 CMOV %fcc0, count, v1 FCMOV %fcc1, t2, c2 CMOV %fcc1, count, v2 FCMOV %fcc2, t3, c3 CMOV %fcc2, count, v3 FCMOV %fcc3, t4, c4 CMOV %fcc3, count, v4 add count, 4, count FABS a5, t1 LDF [X + 4 * SIZE], a5 FABS a6, t2 LDF [X + 5 * SIZE], a6 FABS a7, t3 LDF [X + 6 * SIZE], a7 FABS a8, t4 LDF [X + 7 * SIZE], a8 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FCMOV %fcc0, t1, c1 nop CMOV %fcc0, count, v1 add I, -1, I FCMOV %fcc1, t2, c2 cmp I, 0 CMOV %fcc1, count, v2 add X, 8 * SIZE, X FCMOV %fcc2, t3, c3 CMOV %fcc2, count, v3 FCMOV %fcc3, t4, c4 CMOV %fcc3, count, v4 bg,pt %icc, .LL11 add count, 4, count .LL12: FABS a1, t1 FABS a2, t2 FABS a3, t3 FABS a4, t4 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FCMOV %fcc0, t1, c1 CMOV %fcc0, count, v1 FCMOV %fcc1, t2, c2 CMOV %fcc1, count, v2 FCMOV %fcc2, t3, c3 CMOV %fcc2, count, v3 FCMOV %fcc3, t4, c4 CMOV %fcc3, count, v4 add count, 4, count FABS a5, t1 FABS a6, t2 FABS a7, t3 FABS a8, t4 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FCMOV %fcc0, t1, c1 CMOV %fcc0, count, v1 FCMOV %fcc1, t2, c2 CMOV %fcc1, count, v2 FCMOV %fcc2, t3, c3 CMOV %fcc2, count, v3 FCMOV %fcc3, t4, c4 CMOV %fcc3, count, v4 add count, 4, count .LL15: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: LDF [X + 0 * SIZE], a1 FABS a1, t1 FCMP %fcc0, t1, c1 FCMOV %fcc0, t1, c1 CMOV %fcc0, count, v1 add I, -1, I add count, 1, count cmp I, 0 bg,pt %icc, .LL16 add X, 1 * SIZE, X .LL19: FCMP %fcc0, c2, c1 add v2, 1, v2 FCMP %fcc1, c4, c3 add v3, 2, v3 add v4, 3, v4 FCMOV %fcc0, c2, c1 CMOV %fcc0, v2, v1 FCMOV %fcc1, c4, c3 CMOV %fcc1, v4, v3 FCMP %fcc0, c3, c1 CMOV %fcc0, v3, v1 .LL20: mov v1, %i0 return %i7 + 8 nop .LL50: sra N, 3, I cmp I, 0 ble,pn %icc, .LL55 
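/* .LL50: general-stride path (INCX != 1) of the i?amax/i?amin search.
   As in the unit-stride code above, four running extrema c1-c4 and their
   indices v1-v4 are kept in parallel (FCMOV/CMOV select on %fcc0-%fcc3,
   with USE_MIN flipping the comparison from greater to less); the four
   candidates are reduced to the final 1-based index at .LL59. */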
nop LDF [X + 0 * SIZE], a1 add X, INCX, X LDF [X + 0 * SIZE], a2 add X, INCX, X LDF [X + 0 * SIZE], a3 add X, INCX, X LDF [X + 0 * SIZE], a4 add X, INCX, X LDF [X + 0 * SIZE], a5 add X, INCX, X LDF [X + 0 * SIZE], a6 add X, INCX, X add I, -1, I LDF [X + 0 * SIZE], a7 cmp I, 0 add X, INCX, X LDF [X + 0 * SIZE], a8 ble,pt %icc, .LL52 add X, INCX, X .LL51: FABS a1, t1 LDF [X + 0 * SIZE], a1 add X, INCX, X FABS a2, t2 LDF [X + 0 * SIZE], a2 add X, INCX, X FABS a3, t3 LDF [X + 0 * SIZE], a3 add X, INCX, X FABS a4, t4 LDF [X + 0 * SIZE], a4 add X, INCX, X FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FCMOV %fcc0, t1, c1 CMOV %fcc0, count, v1 FCMOV %fcc1, t2, c2 CMOV %fcc1, count, v2 FCMOV %fcc2, t3, c3 CMOV %fcc2, count, v3 FCMOV %fcc3, t4, c4 CMOV %fcc3, count, v4 add count, 4, count FABS a5, t1 LDF [X + 0 * SIZE], a5 add X, INCX, X FABS a6, t2 LDF [X + 0 * SIZE], a6 add X, INCX, X FABS a7, t3 LDF [X + 0 * SIZE], a7 add X, INCX, X FABS a8, t4 LDF [X + 0 * SIZE], a8 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FCMOV %fcc0, t1, c1 CMOV %fcc0, count, v1 add I, -1, I FCMOV %fcc1, t2, c2 CMOV %fcc1, count, v2 cmp I, 0 FCMOV %fcc2, t3, c3 CMOV %fcc2, count, v3 FCMOV %fcc3, t4, c4 CMOV %fcc3, count, v4 add count, 4, count bg,pt %icc, .LL51 add X, INCX, X .LL52: FABS a1, t1 FABS a2, t2 FABS a3, t3 FABS a4, t4 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FCMOV %fcc0, t1, c1 CMOV %fcc0, count, v1 FCMOV %fcc1, t2, c2 CMOV %fcc1, count, v2 FCMOV %fcc2, t3, c3 CMOV %fcc2, count, v3 FCMOV %fcc3, t4, c4 CMOV %fcc3, count, v4 add count, 4, count FABS a5, t1 FABS a6, t2 FABS a7, t3 FABS a8, t4 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FCMOV %fcc0, t1, c1 CMOV %fcc0, count, v1 FCMOV %fcc1, t2, c2 CMOV %fcc1, count, v2 FCMOV %fcc2, t3, c3 CMOV %fcc2, count, v3 FCMOV %fcc3, t4, c4 CMOV %fcc3, count, v4 add count, 4, count .LL55: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL59 nop .LL56: LDF [X + 0 * SIZE], a1 FABS a1, t1 FCMP %fcc0, t1, c1 FCMOV %fcc0, t1, c1 CMOV %fcc0, count, v1 add I, -1, I add count, 1, count cmp I, 0 bg,pt %icc, .LL56 add X, INCX, X .LL59: FCMP %fcc0, c2, c1 add v2, 1, v2 FCMP %fcc1, c4, c3 add v3, 2, v3 add v4, 3, v4 FCMOV %fcc0, c2, c1 CMOV %fcc0, v2, v1 FCMOV %fcc1, c4, c3 CMOV %fcc1, v4, v3 FCMP %fcc0, c3, c1 CMOV %fcc0, v3, v1 mov v1, %i0 return %i7 + 8 nop EPILOGUE OpenBLAS-0.2.20/kernel/sparc/imax.S000066400000000000000000000204211313527062700166150ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N %i0 #define X %i1 #define INCX %i2 #define I %i3 #define v1 %o0 #define v2 %o1 #define v3 %o2 #define v4 %o3 #define count %o4 #ifdef DOUBLE #define c1 %f0 #define c2 %f2 #define c3 %f4 #define c4 %f6 #define t1 %f8 #define t2 %f10 #define t3 %f12 #define t4 %f14 #define a1 %f16 #define a2 %f18 #define a3 %f20 #define a4 %f22 #define a5 %f24 #define a6 %f26 #define a7 %f28 #define a8 %f30 #else #define c1 %f0 #define c2 %f1 #define c3 %f2 #define c4 %f3 #define t1 %f4 #define t2 %f5 #define t3 %f6 #define t4 %f7 #define a1 %f8 #define a2 %f9 #define a3 %f10 #define a4 %f11 #define a5 %f12 #define a6 %f13 #define a7 %f14 #define a8 %f15 #endif #ifndef USE_MIN #define FCMOV FMOVG #define CMOV movg #else #define FCMOV FMOVL #define CMOV movl #endif PROLOGUE SAVESP FCLR(0) cmp N, 0 ble .LL20 clr v1 cmp INCX, 0 ble .LL20 sll INCX, BASE_SHIFT, INCX mov 1, v1 add N, -1, N LDF [X], c1 add X, INCX, X cmp N, 0 ble .LL20 nop FMOV c1, c2 mov 1, v2 FMOV c1, c3 mov 1, v3 FMOV c1, c4 mov 1, v4 mov 2, count cmp INCX, SIZE bne .LL50 nop sra N, 3, I cmp I, 0 ble,pn %icc, .LL15 nop LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 LDF [X + 2 * SIZE], a3 LDF [X + 3 * SIZE], a4 LDF [X + 4 * SIZE], a5 LDF [X + 5 * SIZE], a6 LDF [X + 6 * SIZE], a7 LDF [X + 7 * SIZE], a8 add X, 8 * SIZE, X add I, -1, I cmp I, 0 ble,pt %icc, .LL12 nop #define PREFETCHSIZE 40 .LL11: FCMP %fcc0, a1, c1 FCMP %fcc1, a2, c2 FCMP %fcc2, a3, c3 FCMP %fcc3, a4, c4 FCMOV %fcc0, a1, c1 CMOV %fcc0, count, v1 LDF [X + 0 * SIZE], a1 FCMOV %fcc1, a2, c2 CMOV %fcc1, count, v2 LDF [X + 1 * SIZE], a2 FCMOV %fcc2, a3, c3 CMOV %fcc2, count, v3 LDF [X + 2 * SIZE], a3 FCMOV %fcc3, a4, c4 CMOV %fcc3, count, v4 LDF [X + 3 * SIZE], a4 add count, 4, count FCMP %fcc0, a5, c1 FCMP %fcc1, a6, c2 FCMP %fcc2, a7, c3 FCMP %fcc3, a8, c4 FCMOV %fcc0, a5, c1 CMOV %fcc0, count, v1 LDF [X + 4 * SIZE], a5 add I, -1, I FCMOV %fcc1, a6, c2 CMOV %fcc1, count, v2 LDF [X + 5 * SIZE], a6 cmp I, 0 FCMOV %fcc2, a7, c3 CMOV %fcc2, count, v3 LDF [X + 6 * SIZE], a7 FCMOV %fcc3, a8, c4 CMOV %fcc3, count, v4 LDF [X + 7 * SIZE], a8 add count, 4, count bg,pt %icc, .LL11 add X, 8 * SIZE, X .LL12: FCMP %fcc0, a1, c1 FCMP %fcc1, a2, c2 FCMP %fcc2, a3, c3 FCMP %fcc3, a4, c4 FCMOV %fcc0, a1, c1 CMOV %fcc0, count, v1 FCMOV %fcc1, a2, c2 CMOV %fcc1, count, v2 FCMOV %fcc2, a3, c3 CMOV %fcc2, count, v3 FCMOV %fcc3, a4, c4 CMOV %fcc3, count, v4 add count, 4, count FCMP %fcc0, a5, c1 FCMP %fcc1, a6, c2 FCMP %fcc2, a7, c3 FCMP %fcc3, a8, c4 FCMOV %fcc0, a5, c1 CMOV %fcc0, count, v1 FCMOV %fcc1, a6, c2 CMOV %fcc1, count, v2 FCMOV %fcc2, a7, c3 CMOV %fcc2, 
count, v3 FCMOV %fcc3, a8, c4 CMOV %fcc3, count, v4 add count, 4, count .LL15: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: LDF [X + 0 * SIZE], a1 FCMP %fcc0, a1, c1 FCMOV %fcc0, a1, c1 CMOV %fcc0, count, v1 add I, -1, I cmp I, 0 add count, 1, count bg,pt %icc, .LL16 add X, 1 * SIZE, X .LL19: FCMP %fcc0, c2, c1 add v2, 1, v2 FCMP %fcc1, c4, c3 add v3, 2, v3 add v4, 3, v4 FCMOV %fcc0, c2, c1 CMOV %fcc0, v2, v1 FCMOV %fcc1, c4, c3 CMOV %fcc1, v4, v3 FCMP %fcc0, c3, c1 CMOV %fcc0, v3, v1 .LL20: mov v1, %i0 return %i7 + 8 nop .LL50: sra N, 3, I cmp I, 0 ble,pn %icc, .LL55 nop LDF [X + 0 * SIZE], a1 add X, INCX, X LDF [X + 0 * SIZE], a2 add X, INCX, X LDF [X + 0 * SIZE], a3 add X, INCX, X LDF [X + 0 * SIZE], a4 add X, INCX, X LDF [X + 0 * SIZE], a5 add X, INCX, X LDF [X + 0 * SIZE], a6 add X, INCX, X add I, -1, I LDF [X + 0 * SIZE], a7 cmp I, 0 add X, INCX, X LDF [X + 0 * SIZE], a8 ble,pt %icc, .LL52 add X, INCX, X .LL51: FCMP %fcc0, a1, c1 FCMP %fcc1, a2, c2 FCMP %fcc2, a3, c3 FCMP %fcc3, a4, c4 FCMOV %fcc0, a1, c1 CMOV %fcc0, count, v1 LDF [X + 0 * SIZE], a1 add X, INCX, X FCMOV %fcc1, a2, c2 CMOV %fcc1, count, v2 LDF [X + 0 * SIZE], a2 add X, INCX, X FCMOV %fcc2, a3, c3 CMOV %fcc2, count, v3 LDF [X + 0 * SIZE], a3 add X, INCX, X FCMOV %fcc3, a4, c4 CMOV %fcc3, count, v4 LDF [X + 0 * SIZE], a4 add X, INCX, X add count, 4, count FCMP %fcc0, a5, c1 FCMP %fcc1, a6, c2 FCMP %fcc2, a7, c3 FCMP %fcc3, a8, c4 FCMOV %fcc0, a5, c1 CMOV %fcc0, count, v1 LDF [X + 0 * SIZE], a5 add X, INCX, X FCMOV %fcc1, a6, c2 add I, -1, I CMOV %fcc1, count, v2 LDF [X + 0 * SIZE], a6 add X, INCX, X FCMOV %fcc2, a7, c3 CMOV %fcc2, count, v3 LDF [X + 0 * SIZE], a7 add X, INCX, X cmp I, 0 FCMOV %fcc3, a8, c4 CMOV %fcc3, count, v4 LDF [X + 0 * SIZE], a8 add count, 4, count bg,pt %icc, .LL51 add X, INCX, X .LL52: FCMP %fcc0, a1, c1 FCMP %fcc1, a2, c2 FCMP %fcc2, a3, c3 FCMP %fcc3, a4, c4 FCMOV %fcc0, a1, c1 CMOV %fcc0, count, v1 FCMOV %fcc1, a2, c2 CMOV %fcc1, count, v2 FCMOV %fcc2, a3, c3 CMOV %fcc2, count, v3 FCMOV %fcc3, a4, c4 CMOV %fcc3, count, v4 add count, 4, count FCMP %fcc0, a5, c1 FCMP %fcc1, a6, c2 FCMP %fcc2, a7, c3 FCMP %fcc3, a8, c4 FCMOV %fcc0, a5, c1 CMOV %fcc0, count, v1 FCMOV %fcc1, a6, c2 CMOV %fcc1, count, v2 FCMOV %fcc2, a7, c3 CMOV %fcc2, count, v3 FCMOV %fcc3, a8, c4 CMOV %fcc3, count, v4 add count, 4, count .LL55: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL59 nop .LL56: LDF [X + 0 * SIZE], a1 FCMP %fcc0, a1, c1 FCMOV %fcc0, a1, c1 CMOV %fcc0, count, v1 add I, -1, I add count, 1, count cmp I, 0 bg,pt %icc, .LL56 add X, INCX, X .LL59: FCMP %fcc0, c2, c1 add v2, 1, v2 FCMP %fcc1, c4, c3 add v3, 2, v3 add v4, 3, v4 FCMOV %fcc0, c2, c1 CMOV %fcc0, v2, v1 FCMOV %fcc1, c4, c3 CMOV %fcc1, v4, v3 FCMP %fcc0, c3, c1 CMOV %fcc0, v3, v1 mov v1, %i0 return %i7 + 8 nop EPILOGUE OpenBLAS-0.2.20/kernel/sparc/izamax.S000066400000000000000000000201441313527062700171520ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N %i0 #define X %i1 #define INCX %i2 #define I %i3 #define v1 %o0 #define v2 %o1 #define v3 %o2 #define v4 %o3 #define count %o4 #ifdef DOUBLE #define c1 %f0 #define c2 %f2 #define c3 %f4 #define c4 %f6 #define t1 %f8 #define t2 %f10 #define t3 %f12 #define t4 %f14 #define t5 %f16 #define t6 %f18 #define t7 %f20 #define t8 %f22 #define a1 %f24 #define a2 %f26 #define a3 %f28 #define a4 %f30 #define a5 %f32 #define a6 %f34 #define a7 %f36 #define a8 %f38 #else #define c1 %f0 #define c2 %f1 #define c3 %f2 #define c4 %f3 #define t1 %f4 #define t2 %f5 #define t3 %f6 #define t4 %f7 #define t5 %f8 #define t6 %f9 #define t7 %f10 #define t8 %f11 #define a1 %f12 #define a2 %f13 #define a3 %f14 #define a4 %f15 #define a5 %f16 #define a6 %f17 #define a7 %f18 #define a8 %f19 #endif #ifndef USE_MIN #define FCMOV FMOVG #define CMOV movg #else #define FCMOV FMOVL #define CMOV movl #endif PROLOGUE SAVESP FCLR(0) cmp N, 0 ble .LL20 clr v1 cmp INCX, 0 ble .LL20 sll INCX, ZBASE_SHIFT, INCX mov 1, v1 LDF [X + 0 * SIZE], c1 LDF [X + 1 * SIZE], c2 add N, -1, N FABS c1, c1 add X, INCX, X FABS c2, c2 cmp N, 0 ble .LL20 FADD c1, c2, c1 FMOV c1, c2 mov 1, v2 FMOV c1, c3 mov 1, v3 FMOV c1, c4 mov 1, v4 mov 2, count cmp INCX, 2 * SIZE bne .LL50 nop sra N, 2, I cmp I, 0 ble,pn %icc, .LL15 nop LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 LDF [X + 2 * SIZE], a3 LDF [X + 3 * SIZE], a4 LDF [X + 4 * SIZE], a5 add I, -1, I LDF [X + 5 * SIZE], a6 cmp I, 0 LDF [X + 6 * SIZE], a7 LDF [X + 7 * SIZE], a8 ble,pt %icc, .LL12 add X, 8 * SIZE, X #define PREFETCHSIZE 32 .LL11: prefetch [X + PREFETCHSIZE * SIZE], 0 FABS a1, t1 LDF [X + 0 * SIZE], a1 FABS a2, t2 LDF [X + 1 * SIZE], a2 FABS a3, t3 LDF [X + 2 * SIZE], a3 FABS a4, t4 LDF [X + 3 * SIZE], a4 FABS a5, t5 LDF [X + 4 * SIZE], a5 FABS a6, t6 LDF [X + 5 * SIZE], a6 FABS a7, t7 LDF [X + 6 * SIZE], a7 FABS a8, t8 LDF [X + 7 * SIZE], a8 FADD t1, t2, t1 FADD t3, t4, t3 FADD t5, t6, t5 FADD t7, t8, t7 FCMP %fcc0, t1, c1 FCMP %fcc1, t3, c2 FCMP %fcc2, t5, c3 FCMP %fcc3, t7, c4 FCMOV %fcc0, t1, c1 CMOV %fcc0, count, v1 add I, -1, I FCMOV %fcc1, t3, c2 CMOV 
%fcc1, count, v2 cmp I, 0 FCMOV %fcc2, t5, c3 CMOV %fcc2, count, v3 FCMOV %fcc3, t7, c4 CMOV %fcc3, count, v4 add count, 4, count bg,pt %icc, .LL11 add X, 8 * SIZE, X .LL12: FABS a1, t1 FABS a2, t2 FABS a3, t3 FABS a4, t4 FABS a5, t5 FABS a6, t6 FABS a7, t7 FABS a8, t8 FADD t1, t2, t1 FADD t3, t4, t3 FADD t5, t6, t5 FADD t7, t8, t7 FCMP %fcc0, t1, c1 FCMP %fcc1, t3, c2 FCMP %fcc2, t5, c3 FCMP %fcc3, t7, c4 FCMOV %fcc0, t1, c1 CMOV %fcc0, count, v1 FCMOV %fcc1, t3, c2 CMOV %fcc1, count, v2 FCMOV %fcc2, t5, c3 CMOV %fcc2, count, v3 FCMOV %fcc3, t7, c4 CMOV %fcc3, count, v4 add count, 4, count .LL15: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 FABS a1, t1 FABS a2, t2 FADD t1, t2, t1 FCMP %fcc0, t1, c1 FCMOV %fcc0, t1, c1 CMOV %fcc0, count, v1 add count, 1, count add I, -1, I cmp I, 0 bg,pt %icc, .LL16 add X, 2 * SIZE, X .LL19: FCMP %fcc0, c2, c1 add v2, 1, v2 FCMP %fcc1, c4, c3 add v3, 2, v3 add v4, 3, v4 FCMOV %fcc0, c2, c1 CMOV %fcc0, v2, v1 FCMOV %fcc1, c4, c3 CMOV %fcc1, v4, v3 FCMP %fcc0, c3, c1 CMOV %fcc0, v3, v1 .LL20: mov v1, %i0 return %i7 + 8 nop .LL50: sra N, 2, I cmp I, 0 ble,pn %icc, .LL55 nop LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 add X, INCX, X LDF [X + 0 * SIZE], a3 LDF [X + 1 * SIZE], a4 add X, INCX, X LDF [X + 0 * SIZE], a5 LDF [X + 1 * SIZE], a6 add X, INCX, X add I, -1, I LDF [X + 0 * SIZE], a7 cmp I, 0 LDF [X + 1 * SIZE], a8 ble,pt %icc, .LL52 add X, INCX, X .LL51: FABS a1, t1 LDF [X + 0 * SIZE], a1 FABS a2, t2 LDF [X + 1 * SIZE], a2 add X, INCX, X FABS a3, t3 LDF [X + 0 * SIZE], a3 FABS a4, t4 LDF [X + 1 * SIZE], a4 add X, INCX, X FABS a5, t5 LDF [X + 0 * SIZE], a5 FABS a6, t6 LDF [X + 1 * SIZE], a6 add X, INCX, X FABS a7, t7 LDF [X + 0 * SIZE], a7 FABS a8, t8 LDF [X + 1 * SIZE], a8 FADD t1, t2, t1 FADD t3, t4, t3 FADD t5, t6, t5 FADD t7, t8, t7 FCMP %fcc0, t1, c1 FCMP %fcc1, t3, c2 FCMP %fcc2, t5, c3 FCMP %fcc3, t7, c4 FCMOV %fcc0, t1, c1 CMOV %fcc0, count, v1 add I, -1, I FCMOV %fcc1, t3, c2 CMOV %fcc1, count, v2 cmp I, 0 FCMOV %fcc2, t5, c3 CMOV %fcc2, count, v3 FCMOV %fcc3, t7, c4 CMOV %fcc3, count, v4 add count, 4, count bg,pt %icc, .LL51 add X, INCX, X .LL52: FABS a1, t1 FABS a2, t2 FABS a3, t3 FABS a4, t4 FABS a5, t5 FABS a6, t6 FABS a7, t7 FABS a8, t8 FADD t1, t2, t1 FADD t3, t4, t3 FADD t5, t6, t5 FADD t7, t8, t7 FCMP %fcc0, t1, c1 FCMP %fcc1, t3, c2 FCMP %fcc2, t5, c3 FCMP %fcc3, t7, c4 FCMOV %fcc0, t1, c1 CMOV %fcc0, count, v1 FCMOV %fcc1, t3, c2 CMOV %fcc1, count, v2 FCMOV %fcc2, t5, c3 CMOV %fcc2, count, v3 FCMOV %fcc3, t7, c4 CMOV %fcc3, count, v4 add count, 4, count .LL55: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL59 nop .LL56: LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 FABS a1, t1 add I, -1, I FABS a2, t2 cmp I, 0 FADD t1, t2, t1 FCMP %fcc0, t1, c1 FCMOV %fcc0, t1, c1 CMOV %fcc0, count, v1 add count, 1, count bg,pt %icc, .LL56 add X, INCX, X .LL59: FCMP %fcc0, c2, c1 add v2, 1, v2 FCMP %fcc1, c4, c3 add v3, 2, v3 add v4, 3, v4 FCMOV %fcc0, c2, c1 CMOV %fcc0, v2, v1 FCMOV %fcc1, c4, c3 CMOV %fcc1, v4, v3 FCMP %fcc0, c3, c1 CMOV %fcc0, v3, v1 mov v1, %i0 return %i7 + 8 nop EPILOGUE OpenBLAS-0.2.20/kernel/sparc/lsame.S000066400000000000000000000056241313527062700167700ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define A %o0 #define B %o1 #define AA %o4 #define BB %o3 PROLOGUE ldub [A], A ldub [B], B add A, -32, AA add B, -32, BB cmp A, 96 movge %icc, AA, A cmp B, 96 movge %icc, BB, B clr %g1 cmp A, B move %icc, 1, %g1 retl mov %g1, %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/max.S000066400000000000000000000156741313527062700164620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N %i0 #define X %i1 #define INCX %i2 #define I %i3 #ifdef DOUBLE #define c1 %f0 #define c2 %f2 #define c3 %f4 #define c4 %f6 #define t1 %f8 #define t2 %f10 #define t3 %f12 #define t4 %f14 #define a1 %f16 #define a2 %f18 #define a3 %f20 #define a4 %f22 #define a5 %f24 #define a6 %f26 #define a7 %f28 #define a8 %f30 #else #define c1 %f0 #define c2 %f1 #define c3 %f2 #define c4 %f3 #define t1 %f4 #define t2 %f5 #define t3 %f6 #define t4 %f7 #define a1 %f8 #define a2 %f9 #define a3 %f10 #define a4 %f11 #define a5 %f12 #define a6 %f13 #define a7 %f14 #define a8 %f15 #endif #ifndef USE_MIN #define FCMOV FMOVG #else #define FCMOV FMOVL #endif PROLOGUE SAVESP FCLR(0) cmp N, 0 ble .LL20 nop cmp INCX, 0 ble .LL20 sll INCX, BASE_SHIFT, INCX add N, -1, N LDF [X], c1 add X, INCX, X cmp N, 0 ble .LL20 nop FMOV c1, c2 FMOV c1, c3 FMOV c1, c4 cmp INCX, SIZE bne .LL50 nop sra N, 3, I cmp I, 0 ble,pn %icc, .LL15 nop LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 LDF [X + 2 * SIZE], a3 LDF [X + 3 * SIZE], a4 LDF [X + 4 * SIZE], a5 LDF [X + 5 * SIZE], a6 LDF [X + 6 * SIZE], a7 LDF [X + 7 * SIZE], a8 add X, 8 * SIZE, X add I, -1, I cmp I, 0 ble,pt %icc, .LL12 nop #define PREFETCHSIZE 40 .LL11: FCMP %fcc0, a1, c1 FCMP %fcc1, a2, c2 FCMP %fcc2, a3, c3 FCMP %fcc3, a4, c4 FCMOV %fcc0, a1, c1 LDF [X + 0 * SIZE], a1 FCMOV %fcc1, a2, c2 LDF [X + 1 * SIZE], a2 FCMOV %fcc2, a3, c3 LDF [X + 2 * SIZE], a3 FCMOV %fcc3, a4, c4 LDF [X + 3 * SIZE], a4 FCMP %fcc0, a5, c1 FCMP %fcc1, a6, c2 FCMP %fcc2, a7, c3 FCMP %fcc3, a8, c4 FCMOV %fcc0, a5, c1 LDF [X + 4 * SIZE], a5 add I, -1, I FCMOV %fcc1, a6, c2 LDF [X + 5 * SIZE], a6 cmp I, 0 FCMOV %fcc2, a7, c3 LDF [X + 6 * SIZE], a7 FCMOV %fcc3, a8, c4 LDF [X + 7 * SIZE], a8 bg,pt %icc, .LL11 add X, 8 * SIZE, X .LL12: FCMP %fcc0, a1, c1 FCMP %fcc1, a2, c2 FCMP %fcc2, a3, c3 FCMP %fcc3, a4, c4 FCMOV %fcc0, a1, c1 FCMOV %fcc1, a2, c2 FCMOV %fcc2, a3, c3 FCMOV %fcc3, a4, c4 FCMP %fcc0, a5, c1 FCMP %fcc1, a6, c2 FCMP %fcc2, a7, c3 FCMP %fcc3, a8, c4 FCMOV %fcc0, a5, c1 FCMOV %fcc1, a6, c2 FCMOV %fcc2, a7, c3 FCMOV %fcc3, a8, c4 .LL15: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: LDF [X + 0 * SIZE], a1 FCMP %fcc0, a1, c1 FCMOV %fcc0, a1, c1 add I, -1, I cmp I, 0 bg,pt %icc, .LL16 add X, 1 * SIZE, X .LL19: FCMP %fcc0, c2, c1 FCMP %fcc1, c4, c3 FCMOV %fcc0, c2, c1 FCMOV %fcc1, c4, c3 FCMP %fcc0, c3, c1 FCMOV %fcc0, c3, c1 .LL20: return %i7 + 8 clr %g0 .LL50: sra N, 3, I cmp I, 0 ble,pn %icc, .LL55 nop LDF [X + 0 * SIZE], a1 add X, INCX, X LDF [X + 0 * SIZE], a2 add X, INCX, X LDF [X + 0 * SIZE], a3 add X, INCX, X LDF [X + 0 * SIZE], a4 add X, 
INCX, X LDF [X + 0 * SIZE], a5 add X, INCX, X LDF [X + 0 * SIZE], a6 add X, INCX, X add I, -1, I LDF [X + 0 * SIZE], a7 cmp I, 0 add X, INCX, X LDF [X + 0 * SIZE], a8 ble,pt %icc, .LL52 add X, INCX, X .LL51: FCMP %fcc0, a1, c1 FCMP %fcc1, a2, c2 FCMP %fcc2, a3, c3 FCMP %fcc3, a4, c4 FCMOV %fcc0, a1, c1 LDF [X + 0 * SIZE], a1 add X, INCX, X FCMOV %fcc1, a2, c2 LDF [X + 0 * SIZE], a2 add X, INCX, X FCMOV %fcc2, a3, c3 LDF [X + 0 * SIZE], a3 add X, INCX, X FCMOV %fcc3, a4, c4 LDF [X + 0 * SIZE], a4 add X, INCX, X FCMP %fcc0, a5, c1 add I, -1, I FCMP %fcc1, a6, c2 cmp I, 0 FCMP %fcc2, a7, c3 FCMP %fcc3, a8, c4 FCMOV %fcc0, a5, c1 LDF [X + 0 * SIZE], a5 add X, INCX, X FCMOV %fcc1, a6, c2 LDF [X + 0 * SIZE], a6 add X, INCX, X FCMOV %fcc2, a7, c3 LDF [X + 0 * SIZE], a7 add X, INCX, X FCMOV %fcc3, a8, c4 LDF [X + 0 * SIZE], a8 bg,pt %icc, .LL51 add X, INCX, X .LL52: FCMP %fcc0, a1, c1 FCMP %fcc1, a2, c2 FCMP %fcc2, a3, c3 FCMP %fcc3, a4, c4 FCMOV %fcc0, a1, c1 FCMOV %fcc1, a2, c2 FCMOV %fcc2, a3, c3 FCMOV %fcc3, a4, c4 FCMP %fcc0, a5, c1 FCMP %fcc1, a6, c2 FCMP %fcc2, a7, c3 FCMP %fcc3, a8, c4 FCMOV %fcc0, a5, c1 FCMOV %fcc1, a6, c2 FCMOV %fcc2, a7, c3 FCMOV %fcc3, a8, c4 .LL55: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL59 nop .LL56: LDF [X + 0 * SIZE], a1 FCMP %fcc0, a1, c1 FCMOV %fcc0, a1, c1 add I, -1, I cmp I, 0 bg,pt %icc, .LL56 add X, INCX, X .LL59: FCMP %fcc0, c2, c1 FCMP %fcc1, c4, c3 FCMOV %fcc0, c2, c1 FCMOV %fcc1, c4, c3 FCMP %fcc0, c3, c1 FCMOV %fcc0, c3, c1 return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/rot.S000066400000000000000000000257161313527062700164770ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N %i0 #define X %i1 #define INCX %i2 #define Y %i3 #define INCY %i4 #define I %i5 #define XX %l0 #define YY %l1 #ifdef DOUBLE #define a1 %f4 #define a2 %f6 #define a3 %f8 #define a4 %f10 #define a5 %f12 #define a6 %f14 #define a7 %f16 #define a8 %f18 #define b1 %f20 #define b2 %f22 #define b3 %f24 #define b4 %f26 #define b5 %f28 #define b6 %f30 #define b7 %f32 #define b8 %f34 #define c1 %f36 #define c2 %f38 #define c3 %f40 #define c4 %f42 #define c5 %f44 #define c6 %f46 #define c7 %f48 #define c8 %f50 #define t1 %f52 #define t2 %f54 #define t3 %f56 #define t4 %f58 #else #define a1 %f2 #define a2 %f3 #define a3 %f4 #define a4 %f5 #define a5 %f6 #define a6 %f7 #define a7 %f8 #define a8 %f9 #define b1 %f10 #define b2 %f11 #define b3 %f12 #define b4 %f13 #define b5 %f14 #define b6 %f15 #define b7 %f16 #define b8 %f17 #define c1 %f18 #define c2 %f19 #define c3 %f20 #define c4 %f21 #define c5 %f22 #define c6 %f23 #define c7 %f24 #define c8 %f25 #define t1 %f26 #define t2 %f27 #define t3 %f28 #define t4 %f29 #endif #ifdef DOUBLE #define C %f0 #define S %f2 #else #define C %f0 #define S %f1 #endif PROLOGUE SAVESP #ifndef __64BIT__ #ifdef DOUBLE st %i5, [%sp + STACK_START + 24] LDF [%sp + STACK_START + 24], C LDF [%sp + STACK_START + 32], S #else st %i5, [%sp + STACK_START + 24] LDF [%sp + STACK_START + 24], C LDF [%sp + STACK_START + 28], S #endif #else #ifdef DOUBLE FMOV %f10, C FMOV %f12, S #else FMOV %f11, C FMOV %f13, S #endif #endif cmp N, 0 ble .LL19 nop sll INCX, BASE_SHIFT, INCX sll INCY, BASE_SHIFT, INCY cmp INCX, SIZE bne .LL50 nop cmp INCY, SIZE bne .LL50 nop sra N, 3, I cmp I, 0 ble,pn %icc, .LL15 nop LDF [X + 0 * SIZE], a1 LDF [Y + 0 * SIZE], b1 LDF [X + 1 * SIZE], a2 LDF [Y + 1 * SIZE], b2 LDF [X + 2 * SIZE], a3 LDF [Y + 2 * SIZE], b3 LDF [X + 3 * SIZE], a4 LDF [Y + 3 * SIZE], b4 LDF [X + 4 * SIZE], a5 LDF [Y + 4 * SIZE], b5 LDF [X + 5 * SIZE], a6 LDF [Y + 5 * SIZE], b6 LDF [X + 6 * SIZE], a7 LDF [Y + 6 * SIZE], b7 LDF [X + 7 * SIZE], a8 LDF [Y + 7 * SIZE], b8 FMUL C, a1, c1 FMUL S, b1, c2 FMUL C, b1, c3 LDF [Y + 8 * SIZE], b1 FMUL S, a1, c4 LDF [X + 8 * SIZE], a1 FMUL C, a2, c5 FMUL S, b2, c6 FADD c1, c2, t1 FMUL C, b2, c7 LDF [Y + 9 * SIZE], b2 FMUL S, a2, c8 LDF [X + 9 * SIZE], a2 FSUB c3, c4, t2 addcc I, -1, I ble,pt %icc, .LL12 nop #define PREFETCHSIZE 64 .LL11: FMUL C, a3, c1 nop prefetch [Y + PREFETCHSIZE * SIZE], 1 nop FMUL S, b3, c2 STF t1, [X + 0 * SIZE] FADD c5, c6, t3 nop FMUL C, b3, c3 LDF [Y + 10 * SIZE], b3 nop nop FMUL S, a3, c4 STF t2, [Y + 0 * SIZE] FSUB c7, c8, t4 nop FMUL C, a4, c5 LDF [X + 10 * SIZE], a3 nop nop FMUL S, b4, c6 STF t3, [X + 1 * SIZE] FADD c1, c2, t1 nop FMUL C, b4, c7 LDF [Y + 11 * SIZE], b4 nop nop FMUL S, a4, c8 STF t4, [Y + 1 * SIZE] FSUB c3, c4, t2 nop FMUL C, a5, c1 LDF [X + 11 * SIZE], a4 nop nop FMUL S, b5, c2 STF t1, [X + 2 * SIZE] FADD c5, c6, t3 nop FMUL C, b5, c3 LDF [Y + 12 * SIZE], b5 nop nop FMUL S, a5, c4 STF t2, [Y + 2 * SIZE] FSUB c7, c8, t4 nop FMUL C, a6, c5 LDF [X + 12 * SIZE], a5 nop nop FMUL S, b6, c6 STF t3, [X + 3 * SIZE] FADD c1, c2, t1 nop FMUL C, b6, c7 LDF [Y + 13 * SIZE], b6 nop nop FMUL S, a6, c8 STF t4, [Y + 3 * SIZE] FSUB c3, c4, t2 nop FMUL C, a7, c1 LDF [X + 13 * SIZE], a6 nop nop FMUL S, b7, c2 STF t1, [X + 4 * SIZE] FADD c5, c6, t3 nop FMUL C, b7, c3 LDF [Y + 14 * SIZE], b7 nop nop FMUL S, a7, c4 STF t2, [Y + 4 * SIZE] FSUB c7, c8, t4 nop FMUL C, a8, c5 LDF [X + 14 * SIZE], a7 nop nop FMUL S, 
b8, c6 STF t3, [X + 5 * SIZE] FADD c1, c2, t1 nop FMUL C, b8, c7 LDF [Y + 15 * SIZE], b8 nop nop FMUL S, a8, c8 STF t4, [Y + 5 * SIZE] FSUB c3, c4, t2 nop FMUL C, a1, c1 LDF [X + 15 * SIZE], a8 addcc I, -1, I nop FMUL S, b1, c2 STF t1, [X + 6 * SIZE] FADD c5, c6, t3 nop FMUL C, b1, c3 LDF [Y + 16 * SIZE], b1 nop nop FMUL S, a1, c4 STF t2, [Y + 6 * SIZE] FSUB c7, c8, t4 nop FMUL C, a2, c5 LDF [X + 16 * SIZE], a1 add Y, 8 * SIZE, Y nop FMUL S, b2, c6 STF t3, [X + 7 * SIZE] FADD c1, c2, t1 nop FMUL C, b2, c7 LDF [Y + 9 * SIZE], b2 add X, 8 * SIZE, X nop FMUL S, a2, c8 STF t4, [Y - 1 * SIZE] FSUB c3, c4, t2 nop bg,pt %icc, .LL11 LDF [X + 9 * SIZE], a2 .LL12: FMUL C, a3, c1 FMUL S, b3, c2 STF t1, [X + 0 * SIZE] FADD c5, c6, t3 FMUL C, b3, c3 FMUL S, a3, c4 STF t2, [Y + 0 * SIZE] FSUB c7, c8, t4 FMUL C, a4, c5 FMUL S, b4, c6 STF t3, [X + 1 * SIZE] FADD c1, c2, t1 FMUL C, b4, c7 FMUL S, a4, c8 STF t4, [Y + 1 * SIZE] FSUB c3, c4, t2 FMUL C, a5, c1 FMUL S, b5, c2 STF t1, [X + 2 * SIZE] FADD c5, c6, t3 FMUL C, b5, c3 FMUL S, a5, c4 STF t2, [Y + 2 * SIZE] FSUB c7, c8, t4 FMUL C, a6, c5 FMUL S, b6, c6 STF t3, [X + 3 * SIZE] FADD c1, c2, t1 FMUL C, b6, c7 FMUL S, a6, c8 STF t4, [Y + 3 * SIZE] FSUB c3, c4, t2 FMUL C, a7, c1 FMUL S, b7, c2 STF t1, [X + 4 * SIZE] FADD c5, c6, t3 FMUL C, b7, c3 FMUL S, a7, c4 STF t2, [Y + 4 * SIZE] FSUB c7, c8, t4 FMUL C, a8, c5 FMUL S, b8, c6 STF t3, [X + 5 * SIZE] FADD c1, c2, t1 FMUL C, b8, c7 FMUL S, a8, c8 STF t4, [Y + 5 * SIZE] FSUB c3, c4, t2 FADD c5, c6, t3 STF t1, [X + 6 * SIZE] FSUB c7, c8, t4 STF t2, [Y + 6 * SIZE] STF t3, [X + 7 * SIZE] STF t4, [Y + 7 * SIZE] add X, 8 * SIZE, X add Y, 8 * SIZE, Y .LL15: andcc N, 7, I nop ble,a,pn %icc, .LL19 nop .LL16: LDF [X + 0 * SIZE], a1 add X, 1 * SIZE, X LDF [Y + 0 * SIZE], b1 add Y, 1 * SIZE, Y FMUL C, a1, c1 FMUL S, b1, c2 FMUL C, b1, c3 FMUL S, a1, c4 FADD c1, c2, c2 addcc I, -1, I FSUB c3, c4, c4 nop STF c2, [X - 1 * SIZE] STF c4, [Y - 1 * SIZE] bg,pt %icc, .LL16 nop .LL19: return %i7 + 8 nop .LL50: mov X, XX mov Y, YY sra N, 3, I cmp I, 0 ble,pn %icc, .LL55 nop .LL51: LDF [X + 0 * SIZE], a1 add X, INCX, X LDF [Y + 0 * SIZE], b1 add Y, INCY, Y LDF [X + 0 * SIZE], a2 add X, INCX, X LDF [Y + 0 * SIZE], b2 add Y, INCY, Y LDF [X + 0 * SIZE], a3 add X, INCX, X LDF [Y + 0 * SIZE], b3 add Y, INCY, Y LDF [X + 0 * SIZE], a4 add X, INCX, X LDF [Y + 0 * SIZE], b4 add Y, INCY, Y LDF [X + 0 * SIZE], a5 add X, INCX, X LDF [Y + 0 * SIZE], b5 add Y, INCY, Y LDF [X + 0 * SIZE], a6 add X, INCX, X LDF [Y + 0 * SIZE], b6 add Y, INCY, Y LDF [X + 0 * SIZE], a7 add X, INCX, X LDF [Y + 0 * SIZE], b7 add Y, INCY, Y LDF [X + 0 * SIZE], a8 add X, INCX, X LDF [Y + 0 * SIZE], b8 add Y, INCY, Y FMUL C, a1, c1 FMUL S, b1, c2 FMUL C, b1, c3 FMUL S, a1, c4 FADD c1, c2, t1 FSUB c3, c4, t2 STF t1, [XX + 0 * SIZE] add XX, INCX, XX STF t2, [YY + 0 * SIZE] add YY, INCY, YY FMUL C, a2, c5 FMUL S, b2, c6 FMUL C, b2, c7 FMUL S, a2, c8 FADD c5, c6, t3 FSUB c7, c8, t4 STF t3, [XX + 0 * SIZE] add XX, INCX, XX STF t4, [YY + 0 * SIZE] add YY, INCY, YY FMUL C, a3, c1 FMUL S, b3, c2 FMUL C, b3, c3 FMUL S, a3, c4 FADD c1, c2, t1 FSUB c3, c4, t2 STF t1, [XX + 0 * SIZE] add XX, INCX, XX STF t2, [YY + 0 * SIZE] add YY, INCY, YY FMUL C, a4, c5 FMUL S, b4, c6 FMUL C, b4, c7 FMUL S, a4, c8 FADD c5, c6, t3 FSUB c7, c8, t4 STF t3, [XX + 0 * SIZE] add XX, INCX, XX STF t4, [YY + 0 * SIZE] add YY, INCY, YY FMUL C, a5, c1 FMUL S, b5, c2 FMUL C, b5, c3 FMUL S, a5, c4 FADD c1, c2, t1 FSUB c3, c4, t2 STF t1, [XX + 0 * SIZE] add XX, INCX, XX STF t2, [YY + 0 * SIZE] add YY, INCY, YY 
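/* Remainder of the 8-way unrolled strided iteration: each (x, y) pair is
   rotated as x' = c*x + s*y, y' = c*y - s*x and written back through the
   trailing XX/YY store pointers. */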
FMUL C, a6, c5 FMUL S, b6, c6 FMUL C, b6, c7 FMUL S, a6, c8 FADD c5, c6, t3 FSUB c7, c8, t4 STF t3, [XX + 0 * SIZE] add XX, INCX, XX STF t4, [YY + 0 * SIZE] add YY, INCY, YY FMUL C, a7, c1 FMUL S, b7, c2 FMUL C, b7, c3 FMUL S, a7, c4 FADD c1, c2, t1 FSUB c3, c4, t2 STF t1, [XX + 0 * SIZE] add XX, INCX, XX STF t2, [YY + 0 * SIZE] add YY, INCY, YY FMUL C, a8, c5 FMUL S, b8, c6 FMUL C, b8, c7 FMUL S, a8, c8 FADD c5, c6, t3 FSUB c7, c8, t4 STF t3, [XX + 0 * SIZE] add XX, INCX, XX STF t4, [YY + 0 * SIZE] add YY, INCY, YY addcc I, -1, I bg,pt %icc, .LL51 nop .LL55: andcc N, 7, I nop ble %icc, .LL59 nop .LL56: LDF [X + 0 * SIZE], a1 LDF [Y + 0 * SIZE], b1 FMUL C, a1, c1 FMUL S, b1, c2 FMUL C, b1, c3 FMUL S, a1, c4 FADD c1, c2, c2 FSUB c3, c4, c4 STF c2, [X + 0 * SIZE] add X, INCX, X STF c4, [Y + 0 * SIZE] addcc I, -1, I bg %icc, .LL56 add Y, INCY, Y .LL59: return %i7 + 8 nop EPILOGUE OpenBLAS-0.2.20/kernel/sparc/scal.S000066400000000000000000000175661313527062700166210ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N %i0 #if defined(DOUBLE) && !defined(__64BIT__) #define X %i5 #define INCX %i1 #else #define X %i4 #define INCX %i5 #endif #define I %i2 #define XX %i3 #ifdef DOUBLE #define c1 %f0 #define c2 %f2 #define c3 %f4 #define c4 %f6 #define c5 %f8 #define c6 %f10 #define c7 %f12 #define c8 %f14 #define t1 %f16 #define t2 %f18 #define t3 %f20 #define t4 %f22 #define t5 %f24 #define t6 %f26 #define t7 %f28 #define t8 %f30 #define FZERO %f60 #define ALPHA %f62 #else #define c1 %f0 #define c2 %f1 #define c3 %f2 #define c4 %f3 #define c5 %f4 #define c6 %f5 #define c7 %f6 #define c8 %f7 #define t1 %f8 #define t2 %f9 #define t3 %f10 #define t4 %f11 #define t5 %f12 #define t6 %f13 #define t7 %f14 #define t8 %f15 #define FZERO %f29 #define ALPHA %f30 #endif #define PREFETCHSIZE 168 PROLOGUE SAVESP #ifndef __64BIT__ #ifdef DOUBLE st %i3, [%sp + STACK_START + 16] st %i4, [%sp + STACK_START + 20] ld [%sp + STACK_START + 28], INCX #else st %i3, [%sp + STACK_START + 16] #endif LDF [%sp + STACK_START + 16], ALPHA #else #ifdef DOUBLE FMOV %f6, ALPHA #else FMOV %f7, ALPHA #endif #endif FCLR(29) FCMP ALPHA, FZERO fbne .LL100 sll INCX, BASE_SHIFT, INCX cmp INCX, SIZE bne .LL50 nop sra N, 3, I cmp I, 0 ble,pn %icc, .LL15 nop .LL11: prefetch [X + PREFETCHSIZE * SIZE], 0 STF FZERO, [X + 0 * SIZE] add I, -1, I STF FZERO, [X + 1 * SIZE] cmp I, 0 STF FZERO, [X + 2 * SIZE] STF FZERO, [X + 3 * SIZE] STF FZERO, [X + 4 * SIZE] STF FZERO, [X + 5 * SIZE] add X, 8 * SIZE, X STF FZERO, [X - 2 * SIZE] bg,pt %icc, .LL11 STF FZERO, [X - 1 * SIZE] .LL15: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: STF FZERO, [X + 0 * SIZE] add I, -1, I cmp I, 0 bg,pt %icc, .LL16 add X, 1 * SIZE, X .LL19: return %i7 + 8 clr %o0 .LL50: sra N, 3, I cmp I, 0 ble,pn %icc, .LL55 nop .LL51: STF FZERO, [X + 0 * SIZE] add X, INCX, X add I, -1, I STF FZERO, [X + 0 * SIZE] add X, INCX, X cmp I, 0 STF FZERO, [X + 0 * SIZE] add X, INCX, X STF FZERO, [X + 0 * SIZE] add X, INCX, X STF FZERO, [X + 0 * SIZE] add X, INCX, X STF FZERO, [X + 0 * SIZE] add X, INCX, X STF FZERO, [X + 0 * SIZE] add X, INCX, X STF FZERO, [X + 0 * SIZE] bg,pt %icc, .LL51 add X, INCX, X .LL55: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL59 nop .LL56: STF FZERO, [X + 0 * SIZE] add I, -1, I cmp I, 0 bg,pt %icc, .LL56 add X, INCX, X .LL59: return %i7 + 8 clr %o0 .LL100: cmp INCX, SIZE bne .LL150 sra N, 3, I cmp I, 0 ble,pn %icc, .LL115 nop LDF [X + 0 * SIZE], c1 LDF [X + 1 * SIZE], c2 LDF [X + 2 * SIZE], c3 LDF [X + 3 * SIZE], c4 LDF [X + 4 * SIZE], c5 LDF [X + 5 * SIZE], c6 LDF [X + 6 * SIZE], c7 LDF [X + 7 * SIZE], c8 FMUL ALPHA, c1, t1 LDF [X + 8 * SIZE], c1 FMUL ALPHA, c2, t2 LDF [X + 9 * SIZE], c2 deccc I ble,pt %icc, .LL112 nop .LL111: prefetch [X + PREFETCHSIZE * SIZE], 0 deccc I FMUL ALPHA, c3, t3 LDF [X + 10 * SIZE], c3 nop STF t1, [X + 0 * SIZE] FMUL ALPHA, c4, t4 LDF [X + 11 * SIZE], c4 nop STF t2, [X + 1 * SIZE] FMUL ALPHA, c5, t5 LDF [X + 12 * SIZE], c5 nop STF t3, [X + 2 * SIZE] FMUL ALPHA, c6, t6 LDF [X + 13 * SIZE], c6 nop STF t4, [X + 3 * SIZE] FMUL ALPHA, c7, t7 LDF [X + 14 * SIZE], c7 nop STF t5, [X + 4 * SIZE] FMUL ALPHA, c8, t8 LDF [X + 15 * SIZE], c8 nop STF t6, [X + 5 * SIZE] FMUL ALPHA, c1, t1 STF t7, [X + 6 * SIZE] nop LDF [X + 16 * SIZE], c1 FMUL ALPHA, c2, t2 STF t8, [X + 7 * SIZE] nop LDF [X + 17 * SIZE], c2 bg,pt %icc, .LL111 add X, 8 * SIZE, X .LL112: FMUL ALPHA, c3, t3 STF t1, [X + 0 * SIZE] FMUL ALPHA, c4, t4 STF t2, [X + 1 * SIZE] FMUL 
ALPHA, c5, t5 STF t3, [X + 2 * SIZE] FMUL ALPHA, c6, t6 STF t4, [X + 3 * SIZE] FMUL ALPHA, c7, t7 STF t5, [X + 4 * SIZE] FMUL ALPHA, c8, t8 STF t6, [X + 5 * SIZE] STF t7, [X + 6 * SIZE] STF t8, [X + 7 * SIZE] add X, 8 * SIZE, X .LL115: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL119 nop .LL116: LDF [X + 0 * SIZE], c1 add I, -1, I FMUL ALPHA, c1, c1 cmp I, 0 STF c1, [X + 0 * SIZE] bg,pt %icc, .LL116 add X, 1 * SIZE, X .LL119: return %i7 + 8 clr %o0 .LL150: sra N, 3, I cmp I, 0 ble,pn %icc, .LL155 mov X, XX .LL151: LDF [X + 0 * SIZE], c1 add X, INCX, X LDF [X + 0 * SIZE], c2 add X, INCX, X LDF [X + 0 * SIZE], c3 add X, INCX, X LDF [X + 0 * SIZE], c4 add X, INCX, X LDF [X + 0 * SIZE], c5 FMUL ALPHA, c1, c1 add X, INCX, X LDF [X + 0 * SIZE], c6 FMUL ALPHA, c2, c2 add X, INCX, X LDF [X + 0 * SIZE], c7 FMUL ALPHA, c3, c3 add X, INCX, X LDF [X + 0 * SIZE], c8 FMUL ALPHA, c4, c4 add X, INCX, X STF c1, [XX + 0 * SIZE] FMUL ALPHA, c5, c5 add XX, INCX, XX STF c2, [XX + 0 * SIZE] FMUL ALPHA, c6, c6 add XX, INCX, XX STF c3, [XX + 0 * SIZE] FMUL ALPHA, c7, c7 add XX, INCX, XX STF c4, [XX + 0 * SIZE] FMUL ALPHA, c8, c8 add XX, INCX, XX STF c5, [XX + 0 * SIZE] add XX, INCX, XX add I, -1, I STF c6, [XX + 0 * SIZE] add XX, INCX, XX cmp I, 0 STF c7, [XX + 0 * SIZE] add XX, INCX, XX STF c8, [XX + 0 * SIZE] bg,pt %icc, .LL151 add XX, INCX, XX .LL155: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL159 nop .LL156: LDF [X + 0 * SIZE], c1 add I, -1, I FMUL ALPHA, c1, c1 cmp I, 0 STF c1, [X + 0 * SIZE] bg,pt %icc, .LL156 add X, INCX, X .LL159: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/snrm2.S000066400000000000000000000152271313527062700167300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N %i0 #define X %i1 #define INCX %i2 #define I %i3 #define c1 %f0 #define c2 %f2 #define c3 %f4 #define c4 %f6 #define t1 %f8 #define t2 %f10 #define t3 %f12 #define t4 %f14 #define a1 %f16 #define a2 %f18 #define a3 %f20 #define a4 %f22 #define a5 %f24 #define a6 %f26 #define a7 %f28 #define a8 %f30 PROLOGUE SAVESP FCLR(0) FMOV c1, c2 FMOV c1, c3 FMOV c1, c4 FMOV c1, t1 FMOV c1, t2 FMOV c1, t3 FMOV c1, t4 cmp INCX, 0 ble .LL20 sll INCX, BASE_SHIFT, INCX cmp N, 0 ble .LL20 nop cmp INCX, SIZE bne .LL50 nop sra N, 3, I cmp I, 0 ble,pn %icc, .LL15 nop ld [X + 0 * SIZE], a1 add I, -1, I ld [X + 1 * SIZE], a2 cmp I, 0 ld [X + 2 * SIZE], a3 ld [X + 3 * SIZE], a4 ld [X + 4 * SIZE], a5 ld [X + 5 * SIZE], a6 ld [X + 6 * SIZE], a7 ld [X + 7 * SIZE], a8 ble,pt %icc, .LL12 add X, 8 * SIZE, X #define PREFETCHSIZE 40 .LL11: faddd c1, t1, c1 fsmuld a1, a1, t1 prefetch [X + PREFETCHSIZE * SIZE], 0 faddd c2, t2, c2 add I, -1, I fsmuld a2, a2, t2 ld [X + 0 * SIZE], a1 faddd c3, t3, c3 cmp I, 0 fsmuld a3, a3, t3 ld [X + 1 * SIZE], a2 faddd c4, t4, c4 fsmuld a4, a4, t4 ld [X + 2 * SIZE], a3 faddd c1, t1, c1 fsmuld a5, a5, t1 ld [X + 3 * SIZE], a4 faddd c2, t2, c2 fsmuld a6, a6, t2 ld [X + 4 * SIZE], a5 faddd c3, t3, c3 fsmuld a7, a7, t3 ld [X + 5 * SIZE], a6 faddd c4, t4, c4 ld [X + 6 * SIZE], a7 fsmuld a8, a8, t4 add X, 8 * SIZE, X bg,pt %icc, .LL11 ld [X - 1 * SIZE], a8 .LL12: faddd c1, t1, c1 fsmuld a1, a1, t1 faddd c2, t2, c2 fsmuld a2, a2, t2 faddd c3, t3, c3 fsmuld a3, a3, t3 faddd c4, t4, c4 fsmuld a4, a4, t4 faddd c1, t1, c1 fsmuld a5, a5, t1 faddd c2, t2, c2 fsmuld a6, a6, t2 faddd c3, t3, c3 fsmuld a7, a7, t3 faddd c4, t4, c4 fsmuld a8, a8, t4 .LL15: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: ld [X + 0 * SIZE], a1 add I, -1, I cmp I, 0 faddd c1, t1, c1 fsmuld a1, a1, t1 bg,pt %icc, .LL16 add X, 1 * SIZE, X .LL19: faddd c1, t1, c1 faddd c2, t2, c2 faddd c3, t3, c3 faddd c4, t4, c4 faddd c1, c2, c1 faddd c3, c4, c3 faddd c1, c3, c1 fsqrtd c1, c1 #if !defined(NEED_F2CCONV) || !defined(F_INTERFACE_F2C) fdtos c1, c1 #endif .LL20: return %i7 + 8 clr %g0 .LL50: sra N, 3, I cmp I, 0 ble,pn %icc, .LL55 nop ld [X + 0 * SIZE], a1 add X, INCX, X ld [X + 0 * SIZE], a2 add X, INCX, X ld [X + 0 * SIZE], a3 add X, INCX, X ld [X + 0 * SIZE], a4 add X, INCX, X ld [X + 0 * SIZE], a5 add X, INCX, X ld [X + 0 * SIZE], a6 add X, INCX, X add I, -1, I ld [X + 0 * SIZE], a7 cmp I, 0 add X, INCX, X ld [X + 0 * SIZE], a8 ble,pt %icc, .LL52 add X, INCX, X .LL51: faddd c1, t1, c1 add I, -1, I fsmuld a1, a1, t1 ld [X + 0 * SIZE], a1 add X, INCX, X faddd c2, t2, c2 cmp I, 0 fsmuld a2, a2, t2 ld [X + 0 * SIZE], a2 add X, INCX, X faddd c3, t3, c3 fsmuld a3, a3, t3 ld [X + 0 * SIZE], a3 add X, INCX, X faddd c4, t4, c4 fsmuld a4, a4, t4 ld [X + 0 * SIZE], a4 add X, INCX, X faddd c1, t1, c1 fsmuld a5, a5, t1 ld [X + 0 * SIZE], a5 add X, INCX, X faddd c2, t2, c2 fsmuld a6, a6, t2 ld [X + 0 * SIZE], a6 add X, INCX, X faddd c3, t3, c3 fsmuld a7, a7, t3 ld [X + 0 * SIZE], a7 add X, INCX, X faddd c4, t4, c4 fsmuld a8, a8, t4 ld [X + 0 * SIZE], a8 bg,pt %icc, .LL51 add X, INCX, X .LL52: faddd c1, t1, c1 fsmuld a1, a1, t1 faddd c2, t2, c2 fsmuld a2, a2, t2 faddd c3, t3, c3 fsmuld a3, a3, t3 faddd c4, t4, c4 fsmuld a4, a4, t4 faddd c1, t1, c1 fsmuld a5, a5, t1 faddd c2, t2, c2 fsmuld a6, a6, t2 faddd c3, t3, c3 fsmuld a7, a7, t3 faddd c4, t4, c4 fsmuld a8, a8, t4 .LL55: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL59 
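/* .LL56: scalar tail of the strided path.  Each remaining element is
   squared with fsmuld (single-by-single multiply with a double result),
   the four double-precision partial sums are combined at .LL59, and the
   result is finished with fsqrtd (then converted back to single precision
   with fdtos unless the f2c-style double return convention is selected). */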
nop .LL56: ld [X + 0 * SIZE], a1 add I, -1, I cmp I, 0 faddd c1, t1, c1 fsmuld a1, a1, t1 bg,pt %icc, .LL56 add X, INCX, X .LL59: faddd c1, t1, c1 faddd c2, t2, c2 faddd c3, t3, c3 faddd c4, t4, c4 faddd c1, c2, c1 faddd c3, c4, c3 faddd c1, c3, c1 fsqrtd c1, c1 #if !defined(NEED_F2CCONV) || !defined(F_INTERFACE_F2C) fdtos c1, c1 #endif return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/staticbuffer.S000066400000000000000000000053541313527062700203500ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef ALLOC_STATIC .align 256 .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 4096 #endif OpenBLAS-0.2.20/kernel/sparc/swap.S000066400000000000000000000174361313527062700166450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if defined(DOUBLE) && !defined(__64BIT__) #define N %i0 #define X %i5 #define INCX %i1 #define Y %i2 #define INCY %i3 #define I %i4 #else #define N %i0 #define X %i4 #define INCX %i5 #define Y %i1 #define INCY %i2 #define I %i3 #endif #define XX %l0 #define YY %l1 #ifdef DOUBLE #define a1 %f0 #define a2 %f2 #define a3 %f4 #define a4 %f6 #define a5 %f8 #define a6 %f10 #define a7 %f12 #define a8 %f14 #define b1 %f16 #define b2 %f18 #define b3 %f20 #define b4 %f22 #define b5 %f24 #define b6 %f26 #define b7 %f28 #define b8 %f30 #else #define a1 %f0 #define a2 %f1 #define a3 %f2 #define a4 %f3 #define a5 %f4 #define a6 %f5 #define a7 %f6 #define a8 %f7 #define b1 %f8 #define b2 %f9 #define b3 %f10 #define b4 %f11 #define b5 %f12 #define b6 %f13 #define b7 %f14 #define b8 %f15 #endif #ifdef DOUBLE #define PREFETCHSIZE 128 #else #define PREFETCHSIZE 256 #endif PROLOGUE SAVESP #ifndef __64BIT__ #ifdef DOUBLE ld [%sp + STACK_START + 28], INCX ld [%sp + STACK_START + 32], Y ld [%sp + STACK_START + 36], INCY #else ld [%sp+ STACK_START + 28], Y ld [%sp+ STACK_START + 32], INCY #endif #else ldx [%sp+ STACK_START + 56], Y ldx [%sp+ STACK_START + 64], INCY #endif sll INCX, BASE_SHIFT, INCX sll INCY, BASE_SHIFT, INCY cmp INCX, SIZE bne .LL50 nop cmp INCY, SIZE bne .LL50 nop sra N, 3, I cmp I, 0 ble,pn %icc, .LL15 nop LDF [X + 0 * SIZE], a1 LDF [Y + 0 * SIZE], b1 LDF [X + 1 * SIZE], a2 LDF [Y + 1 * SIZE], b2 LDF [X + 2 * SIZE], a3 LDF [Y + 2 * SIZE], b3 LDF [X + 3 * SIZE], a4 LDF [Y + 3 * SIZE], b4 LDF [X + 4 * SIZE], a5 LDF [Y + 4 * SIZE], b5 LDF [X + 5 * SIZE], a6 LDF [Y + 5 * SIZE], b6 LDF [X + 6 * SIZE], a7 LDF [Y + 6 * SIZE], b7 LDF [X + 7 * SIZE], a8 LDF [Y + 7 * SIZE], b8 deccc I ble,pn %icc, .LL12 nop .LL11: prefetch [X + PREFETCHSIZE * SIZE], 0 deccc I STF a1, [Y + 0 * SIZE] LDF [X + 8 * SIZE], a1 STF b1, [X + 0 * SIZE] LDF [Y + 8 * SIZE], b1 STF a2, [Y + 1 * SIZE] LDF [X + 9 * SIZE], a2 STF b2, [X + 1 * SIZE] LDF [Y + 9 * SIZE], b2 STF a3, [Y + 2 * SIZE] LDF [X + 10 * SIZE], a3 STF b3, [X + 2 * SIZE] LDF [Y + 10 * SIZE], b3 STF a4, [Y + 3 * SIZE] LDF [X + 11 * SIZE], a4 STF b4, [X + 3 * SIZE] LDF [Y + 11 * SIZE], b4 prefetch [Y + PREFETCHSIZE * SIZE], 0 add X, 8 * SIZE, X STF a5, [Y + 4 * SIZE] LDF [X + 4 * SIZE], a5 STF b5, [X - 4 * SIZE] LDF [Y + 12 * SIZE], b5 STF a6, [Y + 5 * SIZE] LDF [X 
+ 5 * SIZE], a6 STF b6, [X - 3 * SIZE] LDF [Y + 13 * SIZE], b6 STF a7, [Y + 6 * SIZE] LDF [X + 6 * SIZE], a7 STF b7, [X - 2 * SIZE] LDF [Y + 14 * SIZE], b7 STF a8, [Y + 7 * SIZE] LDF [X + 7 * SIZE], a8 STF b8, [X - 1 * SIZE] LDF [Y + 15 * SIZE], b8 bg,pt %icc, .LL11 add Y, 8 * SIZE, Y .LL12: STF a1, [Y + 0 * SIZE] STF b1, [X + 0 * SIZE] STF a2, [Y + 1 * SIZE] STF b2, [X + 1 * SIZE] STF a3, [Y + 2 * SIZE] STF b3, [X + 2 * SIZE] STF a4, [Y + 3 * SIZE] STF b4, [X + 3 * SIZE] STF a5, [Y + 4 * SIZE] STF b5, [X + 4 * SIZE] STF a6, [Y + 5 * SIZE] STF b6, [X + 5 * SIZE] STF a7, [Y + 6 * SIZE] STF b7, [X + 6 * SIZE] STF a8, [Y + 7 * SIZE] STF b8, [X + 7 * SIZE] add X, 8 * SIZE, X add Y, 8 * SIZE, Y .LL15: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: LDF [X + 0 * SIZE], a1 add I, -1, I LDF [Y + 0 * SIZE], b1 cmp I, 0 STF a1, [Y + 0 * SIZE] add Y, 1 * SIZE, Y STF b1, [X + 0 * SIZE] bg,pt %icc, .LL16 add X, 1 * SIZE, X .LL19: return %i7 + 8 clr %g0 .LL50: sra N, 3, I mov X, XX cmp I, 0 ble,pn %icc, .LL55 mov Y, YY .LL51: LDF [X + 0 * SIZE], a1 add X, INCX, X LDF [Y + 0 * SIZE], b1 add Y, INCY, Y LDF [X + 0 * SIZE], a2 add X, INCX, X LDF [Y + 0 * SIZE], b2 add Y, INCY, Y LDF [X + 0 * SIZE], a3 add X, INCX, X LDF [Y + 0 * SIZE], b3 add Y, INCY, Y LDF [X + 0 * SIZE], a4 add X, INCX, X LDF [Y + 0 * SIZE], b4 add Y, INCY, Y LDF [X + 0 * SIZE], a5 add X, INCX, X LDF [Y + 0 * SIZE], b5 add Y, INCY, Y LDF [X + 0 * SIZE], a6 add X, INCX, X LDF [Y + 0 * SIZE], b6 add Y, INCY, Y LDF [X + 0 * SIZE], a7 add X, INCX, X LDF [Y + 0 * SIZE], b7 add Y, INCY, Y LDF [X + 0 * SIZE], a8 add X, INCX, X LDF [Y + 0 * SIZE], b8 add Y, INCY, Y STF a1, [YY + 0 * SIZE] add I, -1, I add YY, INCY, YY STF b1, [XX + 0 * SIZE] cmp I, 0 add XX, INCX, XX STF a2, [YY + 0 * SIZE] add YY, INCY, YY STF b2, [XX + 0 * SIZE] add XX, INCX, XX STF a3, [YY + 0 * SIZE] add YY, INCY, YY STF b3, [XX + 0 * SIZE] add XX, INCX, XX STF a4, [YY + 0 * SIZE] add YY, INCY, YY STF b4, [XX + 0 * SIZE] add XX, INCX, XX STF a5, [YY + 0 * SIZE] add YY, INCY, YY STF b5, [XX + 0 * SIZE] add XX, INCX, XX STF a6, [YY + 0 * SIZE] add YY, INCY, YY STF b6, [XX + 0 * SIZE] add XX, INCX, XX STF a7, [YY + 0 * SIZE] add YY, INCY, YY STF b7, [XX + 0 * SIZE] add XX, INCX, XX STF a8, [YY + 0 * SIZE] add YY, INCY, YY STF b8, [XX + 0 * SIZE] bg,pt %icc, .LL51 add XX, INCX, XX .LL55: and N, 7, I cmp I, 0 ble,a,pn %icc, .LL59 nop .LL56: LDF [X + 0 * SIZE], a1 LDF [Y + 0 * SIZE], b1 add I, -1, I cmp I, 0 STF b1, [X + 0 * SIZE] add X, INCX, X STF a1, [Y + 0 * SIZE] bg,pt %icc, .LL56 add Y, INCY, Y .LL59: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/trsm_kernel_LN.S000066400000000000000000001777321313527062700206170ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %i0 #define N %i1 #define K %i2 #if defined(DOUBLE) && !defined(__64BIT__) #define A %i5 #define B %i4 #else #define A %i4 #define B %i5 #endif #define C %o4 #define LDC %o5 #define AO %l0 #define BO %l1 #define I %l2 #define J %l3 #define L %l4 #define C1 %o0 #define C2 %o1 #define C3 %o2 #define C4 %o3 #define OFFSET %l5 #define KK %l6 #define TEMP1 %l7 #define TEMP2 %i3 #define AORIG %g1 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #define t1 %f32 #define t2 %f34 #define t3 %f36 #define t4 %f38 #define a1 %f40 #define a2 %f42 #define a3 %f44 #define a4 %f46 #define a5 %f58 #define b1 %f48 #define b2 %f50 #define b3 %f52 #define b4 %f54 #define b5 %f56 #define FZERO %f60 #define ALPHA %f62 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #define t1 %f16 #define t2 %f17 #define t3 %f18 #define t4 %f19 #define a1 %f20 #define a2 %f21 #define a3 %f22 #define a4 %f23 #define a5 %f31 #define b1 %f24 #define b2 %f25 #define b3 %f26 #define b4 %f27 #define b5 %f28 #define FZERO %f29 #define ALPHA %f30 #endif #define APREFETCHSIZE 40 #define BPREFETCHSIZE 40 #define APREFETCH_CATEGORY 0 #define BPREFETCH_CATEGORY 0 PROLOGUE SAVESP nop #ifndef __64BIT__ #ifdef DOUBLE ld [%sp + STACK_START + 28], B ld [%sp + STACK_START + 32], C ld [%sp + STACK_START + 36], LDC ld [%sp + STACK_START + 40], OFFSET #else ld [%sp + STACK_START + 28], C ld [%sp + STACK_START + 32], LDC ld [%sp + STACK_START + 36], OFFSET #endif #else ldx [%sp+ STACK_START + 56], C ldx [%sp+ STACK_START + 64], LDC ldx [%sp+ STACK_START + 72], OFFSET #endif FCLR(29) sll LDC, BASE_SHIFT, LDC #ifdef LN smul M, K, TEMP1 sll TEMP1, BASE_SHIFT, TEMP1 add A, TEMP1, A sll M, BASE_SHIFT, TEMP1 add C, TEMP1, C #endif #ifdef RN neg OFFSET, KK #endif #ifdef RT smul N, K, TEMP1 sll TEMP1, BASE_SHIFT, TEMP1 add B, TEMP1, B smul N, LDC, TEMP1 add C, TEMP1, C sub N, OFFSET, KK #endif sra N, 2, 
J cmp J, 0 ble,pn %icc, .LL100 nop .LL11: #ifdef RT sll K, 2 + BASE_SHIFT, TEMP1 sub B, TEMP1, B sll LDC, 2, TEMP1 sub C, TEMP1, C #endif mov C, C1 add C, LDC, C2 add C2, LDC, C3 add C3, LDC, C4 #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif #ifndef RT add C4, LDC, C #endif and M, 1, I cmp I, 0 ble,pn %icc, .LL50 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 0 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 0 + BASE_SHIFT, TEMP1 sll KK, 2 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 ble,pn %icc, .LL75 nop .LL72: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 LDF [BO + 4 * SIZE], b1 FADD c02, t2, c02 cmp L, 0 FMUL a1, b2, t2 LDF [BO + 5 * SIZE], b2 FADD c03, t3, c03 FMUL a1, b3, t3 LDF [BO + 6 * SIZE], b3 FADD c04, t4, c04 FMUL a1, b4, t4 LDF [BO + 7 * SIZE], b4 LDF [AO + 4 * SIZE], a1 FADD c01, t1, c01 add AO, 4 * SIZE, AO FMUL a2, b1, t1 LDF [BO + 8 * SIZE], b1 FADD c02, t2, c02 FMUL a2, b2, t2 LDF [BO + 9 * SIZE], b2 FADD c03, t3, c03 FMUL a2, b3, t3 LDF [BO + 10 * SIZE], b3 FADD c04, t4, c04 FMUL a2, b4, t4 LDF [BO + 11 * SIZE], b4 LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b1, t1 LDF [BO + 12 * SIZE], b1 FADD c02, t2, c02 FMUL a3, b2, t2 LDF [BO + 13 * SIZE], b2 FADD c03, t3, c03 FMUL a3, b3, t3 LDF [BO + 14 * SIZE], b3 FADD c04, t4, c04 FMUL a3, b4, t4 LDF [BO + 15 * SIZE], b4 LDF [AO + 2 * SIZE], a3 FADD c01, t1, c01 FMUL a4, b1, t1 LDF [BO + 16 * SIZE], b1 FADD c02, t2, c02 FMUL a4, b2, t2 LDF [BO + 17 * SIZE], b2 FADD c03, t3, c03 FMUL a4, b3, t3 LDF [BO + 18 * SIZE], b3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [BO + 19 * SIZE], b4 add BO, 16 * SIZE, BO bg,pt %icc, .LL72 LDF [AO + 3 * SIZE], a4 .LL75: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL79 nop .LL76: FADD c01, t1, c01 add AO, 1 * SIZE, AO FMUL a1, b1, t1 LDF [BO + 4 * SIZE], b1 FADD c02, t2, c02 add L, -1, L FMUL a1, b2, t2 LDF [BO + 5 * SIZE], b2 FADD c03, t3, c03 cmp L, 0 FMUL a1, b3, t3 LDF [BO + 6 * SIZE], b3 FADD c04, t4, c04 add BO, 4 * SIZE, BO FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 bg,pt %icc, .LL76 LDF [BO + 3 * SIZE], b4 .LL79: FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 4, TEMP1 #endif sll TEMP1, 0 + BASE_SHIFT, TEMP2 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #endif #ifdef LN LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], 
a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a2, c01, t1 FSUB c02, t1, c02 FMUL a3, c01, t1 FSUB c03, t1, c03 FMUL a4, c01, t1 FSUB c04, t1, c04 LDF [BO + 5 * SIZE], a1 LDF [BO + 6 * SIZE], a2 LDF [BO + 7 * SIZE], a3 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c03, t1, c03 FMUL a3, c02, t1 FSUB c04, t1, c04 LDF [BO + 10 * SIZE], a1 LDF [BO + 11 * SIZE], a2 FMUL a1, c03, c03 FMUL a2, c03, t1 FSUB c04, t1, c04 LDF [BO + 15 * SIZE], a1 FMUL a1, c04, c04 #endif #ifdef RT LDF [BO + 15 * SIZE], a1 LDF [BO + 14 * SIZE], a2 LDF [BO + 13 * SIZE], a3 LDF [BO + 12 * SIZE], a4 FMUL a1, c04, c04 FMUL a2, c04, t1 FSUB c03, t1, c03 FMUL a3, c04, t1 FSUB c02, t1, c02 FMUL a4, c04, t1 FSUB c01, t1, c01 LDF [BO + 10 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 8 * SIZE], a3 FMUL a1, c03, c03 FMUL a2, c03, t1 FSUB c02, t1, c02 FMUL a3, c03, t1 FSUB c01, t1, c01 LDF [BO + 5 * SIZE], a1 LDF [BO + 4 * SIZE], a2 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c01, t1, c01 LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LN add C1, -1 * SIZE, C1 add C2, -1 * SIZE, C2 add C3, -1 * SIZE, C3 add C4, -1 * SIZE, C4 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c03, [BO + 2 * SIZE] STF c04, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C2 + 0 * SIZE] STF c03, [C3 + 0 * SIZE] STF c04, [C4 + 0 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 1 * SIZE, C1 add C2, 1 * SIZE, C2 add C3, 1 * SIZE, C3 add C4, 1 * SIZE, C4 #endif #ifdef RT sll K, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 0 + BASE_SHIFT, TEMP2 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .LL50: and M, 2, I cmp I, 0 ble,pn %icc, .LL70 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 1 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 1 + BASE_SHIFT, TEMP1 sll KK, 2 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif FMOV FZERO, c02 FMOV FZERO, t1 FMOV FZERO, c04 LDF [AO + 0 * SIZE], a1 FMOV FZERO, t2 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c06 LDF [AO + 1 * SIZE], a2 FMOV FZERO, t3 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c08 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t4 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c01 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c03 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c05 ble,pn %icc, .LL55 FMOV FZERO, c07 .LL52: FADD c02, t1, c02 add AO, 8 * SIZE, AO prefetch [AO + APREFETCHSIZE * SIZE], 0 FMUL a1, b1, t1 add BO, 16 * SIZE, BO FADD c04, t2, c04 add L, -1, L FMUL a1, b2, t2 FADD c06, t3, c06 cmp L, 0 FMUL a1, b3, t3 FADD c08, t4, c08 FMUL a1, b4, t4 LDF [AO - 4 * SIZE], a1 FADD c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 12 * SIZE], b1 FADD c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 11 * SIZE], b2 FADD c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 10 * SIZE], b3 FADD c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 9 * SIZE], b4 FADD c02, t1, c02 FMUL a3, b1, t1 LDF [AO - 3 * SIZE], a2 FADD c04, t2, c04 FMUL a3, b2, t2 FADD c06, t3, c06 FMUL a3, b3, t3 FADD c08, t4, c08 FMUL a3, b4, t4 LDF [AO - 2 * SIZE], a3 FADD c01, t1, c01 FMUL a4, b1, t1 LDF [BO - 8 * SIZE], b1 FADD c03, t2, c03 FMUL a4, b2, t2 LDF [BO - 7 * SIZE], b2 FADD c05, t3, c05 FMUL a4, b3, t3 LDF [BO - 6 * SIZE], b3 FADD c07, t4, c07 FMUL 
a4, b4, t4 LDF [BO - 5 * SIZE], b4 FADD c02, t1, c02 FMUL a1, b1, t1 LDF [AO - 1 * SIZE], a4 FADD c04, t2, c04 FMUL a1, b2, t2 FADD c06, t3, c06 FMUL a1, b3, t3 FADD c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 4 * SIZE], b1 FADD c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 3 * SIZE], b2 FADD c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 1 * SIZE], b4 FADD c02, t1, c02 FMUL a3, b1, t1 LDF [AO + 1 * SIZE], a2 FADD c04, t2, c04 FMUL a3, b2, t2 FADD c06, t3, c06 FMUL a3, b3, t3 FADD c08, t4, c08 FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD c01, t1, c01 FMUL a4, b1, t1 LDF [BO + 0 * SIZE], b1 FADD c03, t2, c03 FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c05, t3, c05 FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c07, t4, c07 FMUL a4, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL52 LDF [AO + 3 * SIZE], a4 .LL55: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL59 nop .LL56: FADD c02, t1, c02 add AO, 2 * SIZE, AO FMUL a1, b1, t1 add L, -1, L add BO, 4 * SIZE, BO FADD c04, t2, c04 cmp L, 0 FMUL a1, b2, t2 FADD c06, t3, c06 FMUL a1, b3, t3 FADD c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD c01, t1, c01 FMUL a2, b1, t1 LDF [BO + 0 * SIZE], b1 FADD c03, t2, c03 FMUL a2, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c05, t3, c05 FMUL a2, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c07, t4, c07 FMUL a2, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL56 LDF [AO + 1 * SIZE], a2 .LL59: #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 4, TEMP1 #endif sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif FADD c02, t1, c02 FADD c04, t2, c04 FADD c06, t3, c06 FADD c08, t4, c08 #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 FSUB b1, c02, c02 FSUB b2, c04, c04 FSUB b3, c06, c06 FSUB b4, c08, c08 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c05, c05 FSUB b2, c06, c06 FSUB b3, c07, c07 FSUB b4, c08, c08 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c04, c04 FMUL a1, c06, c06 FMUL a1, c08, c08 FMUL a2, c02, t1 FMUL a2, c04, t2 FMUL a2, c06, t3 FMUL a2, c08, t4 FSUB c01, t1, c01 FSUB c03, t2, c03 FSUB c05, t3, c05 FSUB c07, t4, c07 FMUL a3, c01, c01 FMUL a3, c03, c03 FMUL a3, c05, c05 FMUL a3, c07, c07 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c03, c03 FMUL a1, c05, c05 FMUL a1, c07, c07 FMUL a2, c01, t1 FMUL a2, c03, t2 FMUL a2, c05, t3 FMUL a2, c07, t4 FSUB c02, t1, c02 FSUB c04, t2, c04 FSUB c06, t3, c06 FSUB c08, t4, c08 FMUL a3, c02, c02 FMUL a3, c04, c04 FMUL a3, c06, c06 FMUL a3, c08, c08 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a2, c01, t1 FMUL a2, c02, t2 FSUB c03, t1, c03 FSUB c04, t2, c04 FMUL a3, c01, t1 FMUL a3, c02, t2 FSUB c05, t1, c05 FSUB c06, t2, c06 FMUL 
a4, c01, t1 FMUL a4, c02, t2 FSUB c07, t1, c07 FSUB c08, t2, c08 LDF [BO + 5 * SIZE], a1 LDF [BO + 6 * SIZE], a2 LDF [BO + 7 * SIZE], a3 FMUL a1, c03, c03 FMUL a1, c04, c04 FMUL a2, c03, t1 FMUL a2, c04, t2 FSUB c05, t1, c05 FSUB c06, t2, c06 FMUL a3, c03, t1 FMUL a3, c04, t2 FSUB c07, t1, c07 FSUB c08, t2, c08 LDF [BO + 10 * SIZE], a1 LDF [BO + 11 * SIZE], a2 FMUL a1, c05, c05 FMUL a1, c06, c06 FMUL a2, c05, t1 FMUL a2, c06, t2 FSUB c07, t1, c07 FSUB c08, t2, c08 LDF [BO + 15 * SIZE], a1 FMUL a1, c07, c07 FMUL a1, c08, c08 #endif #ifdef RT LDF [BO + 15 * SIZE], a1 LDF [BO + 14 * SIZE], a2 LDF [BO + 13 * SIZE], a3 LDF [BO + 12 * SIZE], a4 FMUL a1, c07, c07 FMUL a1, c08, c08 FMUL a2, c07, t1 FMUL a2, c08, t2 FSUB c05, t1, c05 FSUB c06, t2, c06 FMUL a3, c07, t1 FMUL a3, c08, t2 FSUB c03, t1, c03 FSUB c04, t2, c04 FMUL a4, c07, t1 FMUL a4, c08, t2 FSUB c01, t1, c01 FSUB c02, t2, c02 LDF [BO + 10 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 8 * SIZE], a3 FMUL a1, c05, c05 FMUL a1, c06, c06 FMUL a2, c05, t1 FMUL a2, c06, t2 FSUB c03, t1, c03 FSUB c04, t2, c04 FMUL a3, c05, t1 FMUL a3, c06, t2 FSUB c01, t1, c01 FSUB c02, t2, c02 LDF [BO + 5 * SIZE], a1 LDF [BO + 4 * SIZE], a2 FMUL a1, c03, c03 FMUL a1, c04, c04 FMUL a2, c03, t1 FMUL a2, c04, t2 FSUB c01, t1, c01 FSUB c02, t2, c02 LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 #endif #ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 add C3, -2 * SIZE, C3 add C4, -2 * SIZE, C4 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c07, [BO + 3 * SIZE] STF c02, [BO + 4 * SIZE] STF c04, [BO + 5 * SIZE] STF c06, [BO + 6 * SIZE] STF c08, [BO + 7 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] STF c05, [AO + 4 * SIZE] STF c06, [AO + 5 * SIZE] STF c07, [AO + 6 * SIZE] STF c08, [AO + 7 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C2 + 0 * SIZE] STF c04, [C2 + 1 * SIZE] STF c05, [C3 + 0 * SIZE] STF c06, [C3 + 1 * SIZE] STF c07, [C4 + 0 * SIZE] STF c08, [C4 + 1 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 add C3, 2 * SIZE, C3 add C4, 2 * SIZE, C4 #endif #ifdef RT sll K, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif .LL70: sra M, 2, I cmp I, 0 ble,pn %icc, .LL99 nop .LL21: FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 FMOV FZERO, c01 FMOV FZERO, c02 FMOV FZERO, c03 #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 2 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c04 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c05 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c06 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c07 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c08 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c09 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c10 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c11 LDF [BO + 4 * SIZE], b5 /* ***** */ LDF [AO + 4 * SIZE], a5 /* ***** */ #ifdef LN prefetch [C1 + 3 * SIZE], 3 FMOV FZERO, c12 prefetch [C2 + 3 * SIZE], 3 FMOV FZERO, c13 prefetch [C3 + 3 * SIZE], 3 FMOV FZERO, c14 prefetch [C4 + 3 * SIZE], 3 FMOV 
FZERO, c15 #else prefetch [C1 - 3 * SIZE], 3 FMOV FZERO, c12 prefetch [C2 - 3 * SIZE], 3 FMOV FZERO, c13 prefetch [C3 - 3 * SIZE], 3 FMOV FZERO, c14 prefetch [C4 - 3 * SIZE], 3 FMOV FZERO, c15 #endif ble,pn %icc, .LL25 FMOV FZERO, c16 .LL22: FADD c04, t1, c04 prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY FMUL a1, b1, t1 nop FADD c08, t2, c08 prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY FMUL a1, b2, t2 add AO, 16 * SIZE, AO FADD c12, t3, c12 LDF [AO - 13 * SIZE], a4 FMUL a1, b3, t3 add BO, 16 * SIZE, BO FADD c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO - 8 * SIZE], a1 FADD c01, t1, c01 nop FMUL a2, b1, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 add L, -1, L FMUL a2, b4, t4 LDF [AO - 11 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b1, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 10 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO - 8 * SIZE], b1 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 11 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 10 * SIZE], b3 FADD c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 9 * SIZE], b4 FADD c04, t1, c04 nop FMUL a5, b5, t1 LDF [AO - 9 * SIZE], a4 FADD c08, t2, c08 nop FMUL a5, b2, t2 nop FADD c12, t3, c12 nop FMUL a5, b3, t3 nop FADD c16, t4, c16 nop FMUL a5, b4, t4 LDF [AO - 4 * SIZE], a5 FADD c01, t1, c01 nop FMUL a2, b5, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO - 7 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b5, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 6 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b5, t1 LDF [BO - 4 * SIZE], b5 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 7 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 6 * SIZE], b3 FADD c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 5 * SIZE], b4 FADD c04, t1, c04 nop FMUL a1, b1, t1 LDF [AO - 5 * SIZE], a4 FADD c08, t2, c08 nop FMUL a1, b2, t2 nop FADD c12, t3, c12 nop FMUL a1, b3, t3 nop FADD c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO - 0 * SIZE], a1 FADD c01, t1, c01 nop FMUL a2, b1, t1 nop #ifdef DOUBLE prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY #else nop #endif FADD c05, t2, c05 nop FMUL a2, b2, t2 FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 nop FADD c02, t1, c02 nop FMUL a3, b1, t1 LDF [AO - 3 * SIZE], a2 FADD c06, t2, c06 #ifdef DOUBLE prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY #else nop #endif FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 2 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO - 0 * SIZE], b1 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 3 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 1 * SIZE], b4 FADD c04, t1, c04 nop FMUL a5, b5, t1 LDF [AO - 1 * SIZE], a4 FADD c08, t2, c08 FMUL a5, b2, t2 FADD c12, t3, c12 FMUL a5, b3, t3 FADD c16, t4, c16 nop FMUL a5, b4, t4 LDF [AO + 4 * SIZE], a5 FADD c01, t1, c01 nop FMUL a2, b5, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b5, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, 
c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD c03, t1, c03 cmp L, 0 FMUL a4, b5, t1 LDF [BO + 4 * SIZE], b5 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c15, t4, c15 FMUL a4, b4, t4 bg,pt %icc, .LL22 LDF [BO + 3 * SIZE], b4 .LL25: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL29 nop .LL26: FADD c04, t1, c04 LDF [AO + 3 * SIZE], a4 FMUL a1, b1, t1 add AO, 4 * SIZE, AO FADD c08, t2, c08 add BO, 4 * SIZE, BO FMUL a1, b2, t2 add L, -1, L FADD c12, t3, c12 nop FMUL a1, b3, t3 cmp L, 0 FADD c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD c01, t1, c01 nop FMUL a2, b1, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b1, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO + 0 * SIZE], b1 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c15, t4, c15 FMUL a4, b4, t4 bg,pt %icc, .LL26 LDF [BO + 3 * SIZE], b4 .LL29: #if defined(LN) || defined(RT) sub KK, 4, TEMP1 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO #endif FADD c04, t1, c04 FADD c08, t2, c08 FADD c12, t3, c12 FADD c16, t4, c16 #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c05, c05 FSUB a3, c09, c09 FSUB a4, c13, c13 FSUB b1, c02, c02 FSUB b2, c06, c06 FSUB b3, c10, c10 FSUB b4, c14, c14 LDF [BO + 8 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 10 * SIZE], a3 LDF [BO + 11 * SIZE], a4 LDF [BO + 12 * SIZE], b1 LDF [BO + 13 * SIZE], b2 LDF [BO + 14 * SIZE], b3 LDF [BO + 15 * SIZE], b4 FSUB a1, c03, c03 FSUB a2, c07, c07 FSUB a3, c11, c11 FSUB a4, c15, c15 FSUB b1, c04, c04 FSUB b2, c08, c08 FSUB b3, c12, c12 FSUB b4, c16, c16 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c05, c05 FSUB b2, c06, c06 FSUB b3, c07, c07 FSUB b4, c08, c08 LDF [AO + 8 * SIZE], a1 LDF [AO + 9 * SIZE], a2 LDF [AO + 10 * SIZE], a3 LDF [AO + 11 * SIZE], a4 LDF [AO + 12 * SIZE], b1 LDF [AO + 13 * SIZE], b2 LDF [AO + 14 * SIZE], b3 LDF [AO + 15 * SIZE], b4 FSUB a1, c09, c09 FSUB a2, c10, c10 FSUB a3, c11, c11 FSUB a4, c12, c12 FSUB b1, c13, c13 FSUB b2, c14, c14 FSUB b3, c15, c15 FSUB b4, c16, c16 #endif #ifdef LN LDF [AO + 15 * SIZE], a1 LDF [AO + 14 * SIZE], a2 LDF [AO + 13 * SIZE], a3 LDF [AO + 12 * SIZE], a4 FMUL a1, c04, c04 FMUL a1, c08, c08 FMUL a1, c12, c12 FMUL a1, c16, c16 FMUL a2, c04, t1 FMUL a2, c08, t2 FMUL a2, c12, t3 FMUL a2, c16, t4 FSUB c03, t1, c03 FSUB c07, t2, c07 FSUB c11, t3, c11 FSUB c15, t4, c15 FMUL a3, c04, t1 FMUL a3, c08, t2 FMUL a3, c12, t3 FMUL a3, c16, t4 FSUB c02, t1, c02 FSUB c06, t2, c06 FSUB c10, t3, c10 FSUB c14, t4, c14 FMUL a4, c04, t1 FMUL a4, c08, t2 FMUL a4, c12, t3 FMUL a4, c16, t4 FSUB c01, t1, c01 FSUB c05, t2, c05 FSUB c09, 
t3, c09 FSUB c13, t4, c13 LDF [AO + 10 * SIZE], a1 LDF [AO + 9 * SIZE], a2 LDF [AO + 8 * SIZE], a3 FMUL a1, c03, c03 FMUL a1, c07, c07 FMUL a1, c11, c11 FMUL a1, c15, c15 FMUL a2, c03, t1 FMUL a2, c07, t2 FMUL a2, c11, t3 FMUL a2, c15, t4 FSUB c02, t1, c02 FSUB c06, t2, c06 FSUB c10, t3, c10 FSUB c14, t4, c14 FMUL a3, c03, t1 FMUL a3, c07, t2 FMUL a3, c11, t3 FMUL a3, c15, t4 FSUB c01, t1, c01 FSUB c05, t2, c05 FSUB c09, t3, c09 FSUB c13, t4, c13 LDF [AO + 5 * SIZE], a1 LDF [AO + 4 * SIZE], a2 FMUL a1, c02, c02 FMUL a1, c06, c06 FMUL a1, c10, c10 FMUL a1, c14, c14 FMUL a2, c02, t1 FMUL a2, c06, t2 FMUL a2, c10, t3 FMUL a2, c14, t4 FSUB c01, t1, c01 FSUB c05, t2, c05 FSUB c09, t3, c09 FSUB c13, t4, c13 LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c05, c05 FMUL a1, c09, c09 FMUL a1, c13, c13 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a1, c05, c05 FMUL a1, c09, c09 FMUL a1, c13, c13 FMUL a2, c01, t1 FMUL a2, c05, t2 FMUL a2, c09, t3 FMUL a2, c13, t4 FSUB c02, t1, c02 FSUB c06, t2, c06 FSUB c10, t3, c10 FSUB c14, t4, c14 FMUL a3, c01, t1 FMUL a3, c05, t2 FMUL a3, c09, t3 FMUL a3, c13, t4 FSUB c03, t1, c03 FSUB c07, t2, c07 FSUB c11, t3, c11 FSUB c15, t4, c15 FMUL a4, c01, t1 FMUL a4, c05, t2 FMUL a4, c09, t3 FMUL a4, c13, t4 FSUB c04, t1, c04 FSUB c08, t2, c08 FSUB c12, t3, c12 FSUB c16, t4, c16 LDF [AO + 5 * SIZE], a1 LDF [AO + 6 * SIZE], a2 LDF [AO + 7 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c06, c06 FMUL a1, c10, c10 FMUL a1, c14, c14 FMUL a2, c02, t1 FMUL a2, c06, t2 FMUL a2, c10, t3 FMUL a2, c14, t4 FSUB c03, t1, c03 FSUB c07, t2, c07 FSUB c11, t3, c11 FSUB c15, t4, c15 FMUL a3, c02, t1 FMUL a3, c06, t2 FMUL a3, c10, t3 FMUL a3, c14, t4 FSUB c04, t1, c04 FSUB c08, t2, c08 FSUB c12, t3, c12 FSUB c16, t4, c16 LDF [AO + 10 * SIZE], a1 LDF [AO + 11 * SIZE], a2 FMUL a1, c03, c03 FMUL a1, c07, c07 FMUL a1, c11, c11 FMUL a1, c15, c15 FMUL a2, c03, t1 FMUL a2, c07, t2 FMUL a2, c11, t3 FMUL a2, c15, t4 FSUB c04, t1, c04 FSUB c08, t2, c08 FSUB c12, t3, c12 FSUB c16, t4, c16 LDF [AO + 15 * SIZE], a1 FMUL a1, c04, c04 FMUL a1, c08, c08 FMUL a1, c12, c12 FMUL a1, c16, c16 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 FMUL a2, c01, t1 FMUL a2, c02, t2 FMUL a2, c03, t3 FMUL a2, c04, t4 FSUB c05, t1, c05 FSUB c06, t2, c06 FSUB c07, t3, c07 FSUB c08, t4, c08 FMUL a3, c01, t1 FMUL a3, c02, t2 FMUL a3, c03, t3 FMUL a3, c04, t4 FSUB c09, t1, c09 FSUB c10, t2, c10 FSUB c11, t3, c11 FSUB c12, t4, c12 FMUL a4, c01, t1 FMUL a4, c02, t2 FMUL a4, c03, t3 FMUL a4, c04, t4 FSUB c13, t1, c13 FSUB c14, t2, c14 FSUB c15, t3, c15 FSUB c16, t4, c16 LDF [BO + 5 * SIZE], a1 LDF [BO + 6 * SIZE], a2 LDF [BO + 7 * SIZE], a3 FMUL a1, c05, c05 FMUL a1, c06, c06 FMUL a1, c07, c07 FMUL a1, c08, c08 FMUL a2, c05, t1 FMUL a2, c06, t2 FMUL a2, c07, t3 FMUL a2, c08, t4 FSUB c09, t1, c09 FSUB c10, t2, c10 FSUB c11, t3, c11 FSUB c12, t4, c12 FMUL a3, c05, t1 FMUL a3, c06, t2 FMUL a3, c07, t3 FMUL a3, c08, t4 FSUB c13, t1, c13 FSUB c14, t2, c14 FSUB c15, t3, c15 FSUB c16, t4, c16 LDF [BO + 10 * SIZE], a1 LDF [BO + 11 * SIZE], a2 FMUL a1, c09, c09 FMUL a1, c10, c10 FMUL a1, c11, c11 FMUL a1, c12, c12 FMUL a2, c09, t1 FMUL a2, c10, t2 FMUL a2, c11, t3 FMUL a2, c12, t4 FSUB c13, t1, c13 FSUB c14, t2, c14 FSUB c15, t3, c15 FSUB c16, t4, c16 LDF [BO + 15 * SIZE], a1 FMUL a1, c13, c13 FMUL a1, c14, c14 FMUL 
a1, c15, c15 FMUL a1, c16, c16 #endif #ifdef RT LDF [BO + 15 * SIZE], a1 LDF [BO + 14 * SIZE], a2 LDF [BO + 13 * SIZE], a3 LDF [BO + 12 * SIZE], a4 FMUL a1, c13, c13 FMUL a1, c14, c14 FMUL a1, c15, c15 FMUL a1, c16, c16 FMUL a2, c13, t1 FMUL a2, c14, t2 FMUL a2, c15, t3 FMUL a2, c16, t4 FSUB c09, t1, c09 FSUB c10, t2, c10 FSUB c11, t3, c11 FSUB c12, t4, c12 FMUL a3, c13, t1 FMUL a3, c14, t2 FMUL a3, c15, t3 FMUL a3, c16, t4 FSUB c05, t1, c05 FSUB c06, t2, c06 FSUB c07, t3, c07 FSUB c08, t4, c08 FMUL a4, c13, t1 FMUL a4, c14, t2 FMUL a4, c15, t3 FMUL a4, c16, t4 FSUB c01, t1, c01 FSUB c02, t2, c02 FSUB c03, t3, c03 FSUB c04, t4, c04 LDF [BO + 10 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 8 * SIZE], a3 FMUL a1, c09, c09 FMUL a1, c10, c10 FMUL a1, c11, c11 FMUL a1, c12, c12 FMUL a2, c09, t1 FMUL a2, c10, t2 FMUL a2, c11, t3 FMUL a2, c12, t4 FSUB c05, t1, c05 FSUB c06, t2, c06 FSUB c07, t3, c07 FSUB c08, t4, c08 FMUL a3, c09, t1 FMUL a3, c10, t2 FMUL a3, c11, t3 FMUL a3, c12, t4 FSUB c01, t1, c01 FSUB c02, t2, c02 FSUB c03, t3, c03 FSUB c04, t4, c04 LDF [BO + 5 * SIZE], a1 LDF [BO + 4 * SIZE], a2 FMUL a1, c05, c05 FMUL a1, c06, c06 FMUL a1, c07, c07 FMUL a1, c08, c08 FMUL a2, c05, t1 FMUL a2, c06, t2 FMUL a2, c07, t3 FMUL a2, c08, t4 FSUB c01, t1, c01 FSUB c02, t2, c02 FSUB c03, t3, c03 FSUB c04, t4, c04 LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 #endif #ifdef LN add C1, -4 * SIZE, C1 add C2, -4 * SIZE, C2 add C3, -4 * SIZE, C3 add C4, -4 * SIZE, C4 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c05, [BO + 1 * SIZE] STF c09, [BO + 2 * SIZE] STF c13, [BO + 3 * SIZE] STF c02, [BO + 4 * SIZE] STF c06, [BO + 5 * SIZE] STF c10, [BO + 6 * SIZE] STF c14, [BO + 7 * SIZE] STF c03, [BO + 8 * SIZE] STF c07, [BO + 9 * SIZE] STF c11, [BO + 10 * SIZE] STF c15, [BO + 11 * SIZE] STF c04, [BO + 12 * SIZE] STF c08, [BO + 13 * SIZE] STF c12, [BO + 14 * SIZE] STF c16, [BO + 15 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] STF c05, [AO + 4 * SIZE] STF c06, [AO + 5 * SIZE] STF c07, [AO + 6 * SIZE] STF c08, [AO + 7 * SIZE] STF c09, [AO + 8 * SIZE] STF c10, [AO + 9 * SIZE] STF c11, [AO + 10 * SIZE] STF c12, [AO + 11 * SIZE] STF c13, [AO + 12 * SIZE] STF c14, [AO + 13 * SIZE] STF c15, [AO + 14 * SIZE] STF c16, [AO + 15 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C1 + 2 * SIZE] STF c04, [C1 + 3 * SIZE] STF c05, [C2 + 0 * SIZE] STF c06, [C2 + 1 * SIZE] STF c07, [C2 + 2 * SIZE] STF c08, [C2 + 3 * SIZE] STF c09, [C3 + 0 * SIZE] STF c10, [C3 + 1 * SIZE] STF c11, [C3 + 2 * SIZE] STF c12, [C3 + 3 * SIZE] STF c13, [C4 + 0 * SIZE] STF c14, [C4 + 1 * SIZE] STF c15, [C4 + 2 * SIZE] STF c16, [C4 + 3 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 4 * SIZE, C1 add C2, 4 * SIZE, C2 add C3, 4 * SIZE, C3 add C4, 4 * SIZE, C4 #endif #ifdef RT sll K, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AO, TEMP1, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 4, KK #endif #ifdef LN sub KK, 4, KK #endif add I, -1, I cmp I, 0 sra K, 2, L bg,pt %icc, .LL21 FMOV FZERO, c01 .LL99: #ifdef LN sll K, 2 + BASE_SHIFT, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 4, KK #endif #ifdef RT sub KK, 4, KK #endif add J, -1, J cmp J, 0 bg,pt %icc, .LL11 nop .LL100: /* n & 2 */ and N, 2, J cmp J, 0 ble,pn %icc, 
.LL200 nop #ifdef RT sll K, 1 + BASE_SHIFT, TEMP1 sub B, TEMP1, B sll LDC, 1, TEMP1 sub C, TEMP1, C #endif mov C, C1 add C, LDC, C2 #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif #ifndef RT add C2, LDC, C #endif and M, 1, I cmp I, 0 ble,pn %icc, .LL150 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 0 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 0 + BASE_SHIFT, TEMP1 sll KK, 1 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 ble,pn %icc, .LL175 nop .LL172: FADD c01, t1, c01 add AO, 4 * SIZE, AO FMUL a1, b1, t1 LDF [BO + 4 * SIZE], b1 FADD c02, t2, c02 FMUL a1, b2, t2 LDF [BO + 5 * SIZE], b2 add L, -1, L LDF [AO + 0 * SIZE], a1 FADD c03, t3, c03 cmp L, 0 FMUL a2, b3, t3 LDF [BO + 6 * SIZE], b3 FADD c04, t4, c04 FMUL a2, b4, t4 LDF [BO + 7 * SIZE], b4 LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b1, t1 LDF [BO + 8 * SIZE], b1 FADD c02, t2, c02 FMUL a3, b2, t2 LDF [BO + 9 * SIZE], b2 LDF [AO + 2 * SIZE], a3 FADD c03, t3, c03 FMUL a4, b3, t3 LDF [BO + 10 * SIZE], b3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [BO + 11 * SIZE], b4 add BO, 8 * SIZE, BO bg,pt %icc, .LL172 LDF [AO + 3 * SIZE], a4 .LL175: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL179 nop .LL176: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 add AO, 1 * SIZE, AO LDF [BO + 2 * SIZE], b1 FADD c02, t2, c02 cmp L, 0 FMUL a1, b2, t2 LDF [BO + 3 * SIZE], b2 add BO, 2 * SIZE, BO bg,pt %icc, .LL176 LDF [AO + 0 * SIZE], a1 .LL179: FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FADD c01, c03, c01 FADD c02, c04, c02 #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 2, TEMP1 #endif sll TEMP1, 0 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #endif #ifdef LN LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a2, c01, t1 FSUB c02, t1, c02 FMUL a3, c02, c02 #endif #ifdef RT LDF [BO + 3 * SIZE], a1 LDF [BO + 2 * SIZE], a2 LDF [BO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c01, t1, c01 FMUL a3, c01, c01 #endif #ifdef LN add C1, -1 * SIZE, C1 add C2, -1 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C2 + 0 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 1 * SIZE, C1 add C2, 1 * SIZE, C2 #endif #ifdef RT sll K, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 0 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add 
BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .LL150: and M, 2, I cmp I, 0 ble,pn %icc, .LL170 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 1 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 1 + BASE_SHIFT, TEMP1 sll KK, 1 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 cmp L, 0 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 ble,pn %icc, .LL155 nop .LL152: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 prefetch [AO + APREFETCHSIZE * SIZE], 0 FADD c02, t2, c02 add BO, 8 * SIZE, BO FMUL a1, b2, t2 LDF [AO + 4 * SIZE], a1 FADD c03, t3, c03 cmp L, 0 FMUL a2, b1, t3 LDF [BO - 4 * SIZE], b1 FADD c04, t4, c04 nop FMUL a2, b2, t4 LDF [AO + 5 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b3, t1 LDF [BO - 3 * SIZE], b2 FADD c02, t2, c02 nop FMUL a3, b4, t2 LDF [AO + 6 * SIZE], a3 FADD c03, t3, c03 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c04, t4, c04 nop FMUL a4, b4, t4 LDF [AO + 7 * SIZE], a4 FADD c01, t1, c01 nop FMUL a1, b1, t1 LDF [BO - 1 * SIZE], b4 FADD c02, t2, c02 FMUL a1, b2, t2 LDF [AO + 8 * SIZE], a1 FADD c03, t3, c03 FMUL a2, b1, t3 LDF [BO + 0 * SIZE], b1 FADD c04, t4, c04 FMUL a2, b2, t4 LDF [AO + 9 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b3, t1 LDF [BO + 1 * SIZE], b2 FADD c02, t2, c02 FMUL a3, b4, t2 LDF [AO + 10 * SIZE], a3 FADD c03, t3, c03 FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO bg,pt %icc, .LL152 LDF [BO + 3 * SIZE], b4 .LL155: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL159 nop .LL156: LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FMUL a1, b1, t1 FMUL a1, b2, t2 FMUL a2, b1, t3 FMUL a2, b2, t4 add AO, 2 * SIZE, AO add BO, 2 * SIZE, BO add L, -1, L cmp L, 0 bg,pt %icc, .LL156 nop .LL159: FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 2, TEMP1 #endif sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c02, c02 FSUB a4, c04, c04 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c03, c03 FMUL a1, c04, c04 FMUL a2, c03, t1 FMUL a2, c04, t2 FSUB c01, t1, c01 FSUB c02, t2, c02 FMUL a3, c01, c01 FMUL a3, c02, c02 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a2, c01, t1 FMUL a2, c02, t2 FSUB c03, t1, c03 FSUB c04, t2, c04 FMUL a3, c03, c03 FMUL a3, c04, c04 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c03, c03 FMUL a2, c01, t1 
FMUL a2, c03, t2 FSUB c02, t1, c02 FSUB c04, t2, c04 FMUL a3, c02, c02 FMUL a3, c04, c04 #endif #ifdef RT LDF [BO + 3 * SIZE], a1 LDF [BO + 2 * SIZE], a2 LDF [BO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c04, c04 FMUL a2, c02, t1 FMUL a2, c04, t2 FSUB c01, t1, c01 FSUB c03, t2, c03 FMUL a3, c01, c01 FMUL a3, c03, c03 #endif #ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c03, [BO + 2 * SIZE] STF c04, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c03, [AO + 1 * SIZE] STF c02, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c03, [C1 + 1 * SIZE] STF c02, [C2 + 0 * SIZE] STF c04, [C2 + 1 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 #endif #ifdef RT sll K, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif .LL170: sra M, 2, I cmp I, 0 ble,pn %icc, .LL199 FMOV FZERO, c03 .LL121: #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 2 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 2 + BASE_SHIFT, TEMP1 sll KK, 1 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, t1 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c07 LDF [AO + 1 * SIZE], a2 FMOV FZERO, t2 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c04 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t3 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c08 LDF [AO + 3 * SIZE], a4 FMOV FZERO, t4 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c01 #ifdef LN prefetch [C1 - 3 * SIZE], 2 FMOV FZERO, c05 prefetch [C2 - 3 * SIZE], 2 FMOV FZERO, c02 #else prefetch [C1 + 3 * SIZE], 2 FMOV FZERO, c05 prefetch [C2 + 3 * SIZE], 2 FMOV FZERO, c02 #endif ble,pn %icc, .LL125 FMOV FZERO, c06 .LL122: FADD c03, t1, c03 add L, -1, L FMUL a1, b1, t1 prefetch [AO + APREFETCHSIZE * SIZE], 0 FADD c07, t2, c07 add BO, 8 * SIZE, BO FMUL a1, b2, t2 LDF [AO + 4 * SIZE], a1 FADD c04, t3, c04 add AO, 16 * SIZE, AO FMUL a2, b1, t3 cmp L, 0 FADD c08, t4, c08 nop FMUL a2, b2, t4 LDF [AO - 11 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b1, t1 nop FADD c05, t2, c05 nop FMUL a3, b2, t2 LDF [AO - 10 * SIZE], a3 FADD c02, t3, c02 nop FMUL a4, b1, t3 LDF [BO - 4 * SIZE], b1 FADD c06, t4, c06 nop FMUL a4, b2, t4 LDF [BO - 3 * SIZE], b2 FADD c03, t1, c03 nop FMUL a1, b3, t1 LDF [AO - 9 * SIZE], a4 FADD c07, t2, c07 nop FMUL a1, b4, t2 LDF [AO - 8 * SIZE], a1 FADD c04, t3, c04 nop FMUL a2, b3, t3 nop FADD c08, t4, c08 nop FMUL a2, b4, t4 LDF [AO - 7 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b3, t1 nop FADD c05, t2, c05 nop FMUL a3, b4, t2 LDF [AO - 6 * SIZE], a3 FADD c02, t3, c02 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c06, t4, c06 nop FMUL a4, b4, t4 LDF [BO - 1 * SIZE], b4 FADD c03, t1, c03 nop FMUL a1, b1, t1 LDF [AO - 5 * SIZE], a4 FADD c07, t2, c07 nop FMUL a1, b2, t2 LDF [AO - 4 * SIZE], a1 FADD c04, t3, c04 nop FMUL a2, b1, t3 nop FADD c08, t4, c08 nop FMUL a2, b2, t4 LDF [AO - 3 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b1, t1 nop FADD c05, t2, c05 nop FMUL a3, b2, t2 LDF [AO - 2 * SIZE], a3 FADD c02, t3, c02 nop FMUL a4, b1, t3 LDF [BO + 0 * SIZE], b1 FADD c06, t4, c06 nop FMUL a4, b2, t4 LDF [BO + 1 * SIZE], b2 FADD c03, t1, c03 nop FMUL a1, b3, t1 LDF [AO - 1 * SIZE], 
a4 FADD c07, t2, c07 nop FMUL a1, b4, t2 LDF [AO + 0 * SIZE], a1 FADD c04, t3, c04 nop FMUL a2, b3, t3 nop FADD c08, t4, c08 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b3, t1 nop FADD c05, t2, c05 nop FMUL a3, b4, t2 LDF [AO + 2 * SIZE], a3 FADD c02, t3, c02 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c06, t4, c06 FMUL a4, b4, t4 LDF [AO + 3 * SIZE], a4 bg,pt %icc, .LL122 LDF [BO + 3 * SIZE], b4 .LL125: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL129 nop .LL126: FADD c03, t1, c03 add AO, 4 * SIZE, AO FMUL a1, b1, t1 add BO, 2 * SIZE, BO FADD c07, t2, c07 add L, -1, L FMUL a1, b2, t2 LDF [AO + 0 * SIZE], a1 FADD c04, t3, c04 cmp L, 0 FMUL a2, b1, t3 FADD c08, t4, c08 FMUL a2, b2, t4 LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b1, t1 FADD c05, t2, c05 FMUL a3, b2, t2 LDF [AO + 2 * SIZE], a3 FADD c02, t3, c02 FMUL a4, b1, t3 LDF [BO + 0 * SIZE], b1 FADD c06, t4, c06 FMUL a4, b2, t4 LDF [BO + 1 * SIZE], b2 bg,pt %icc, .LL126 LDF [AO + 3 * SIZE], a4 .LL129: FADD c03, t1, c03 FADD c07, t2, c07 FADD c04, t3, c04 FADD c08, t4, c08 #if defined(LN) || defined(RT) #ifdef LN sub KK, 4, TEMP1 #else sub KK, 2, TEMP1 #endif sll TEMP1, 2 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c05, c05 FSUB a3, c02, c02 FSUB a4, c06, c06 FSUB b1, c03, c03 FSUB b2, c07, c07 FSUB b3, c04, c04 FSUB b4, c08, c08 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c05, c05 FSUB b2, c06, c06 FSUB b3, c07, c07 FSUB b4, c08, c08 #endif #ifdef LN LDF [AO + 15 * SIZE], a1 LDF [AO + 14 * SIZE], a2 LDF [AO + 13 * SIZE], a3 LDF [AO + 12 * SIZE], a4 FMUL a1, c04, c04 FMUL a1, c08, c08 FMUL a2, c04, t1 FMUL a2, c08, t2 FSUB c03, t1, c03 FSUB c07, t2, c07 FMUL a3, c04, t1 FMUL a3, c08, t2 FSUB c02, t1, c02 FSUB c06, t2, c06 FMUL a4, c04, t1 FMUL a4, c08, t2 FSUB c01, t1, c01 FSUB c05, t2, c05 LDF [AO + 10 * SIZE], a1 LDF [AO + 9 * SIZE], a2 LDF [AO + 8 * SIZE], a3 FMUL a1, c03, c03 FMUL a1, c07, c07 FMUL a2, c03, t1 FMUL a2, c07, t2 FSUB c02, t1, c02 FSUB c06, t2, c06 FMUL a3, c03, t1 FMUL a3, c07, t2 FSUB c01, t1, c01 FSUB c05, t2, c05 LDF [AO + 5 * SIZE], a1 LDF [AO + 4 * SIZE], a2 FMUL a1, c02, c02 FMUL a1, c06, c06 FMUL a2, c02, t1 FMUL a2, c06, t2 FSUB c01, t1, c01 FSUB c05, t2, c05 LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c05, c05 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a1, c05, c05 FMUL a2, c01, t1 FMUL a2, c05, t2 FSUB c02, t1, c02 FSUB c06, t2, c06 FMUL a3, c01, t1 FMUL a3, c05, t2 FSUB c03, t1, c03 FSUB c07, t2, c07 FMUL a4, c01, t1 FMUL a4, c05, t2 FSUB c04, t1, c04 FSUB c08, t2, c08 LDF [AO + 5 * SIZE], a1 LDF [AO + 6 * SIZE], a2 LDF [AO + 7 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c06, c06 FMUL a2, c02, t1 FMUL a2, c06, t2 FSUB c03, t1, c03 FSUB c07, t2, c07 FMUL a3, c02, t1 FMUL a3, c06, t2 FSUB c04, t1, c04 FSUB c08, t2, c08 LDF [AO + 10 * SIZE], a1 LDF [AO + 11 * SIZE], a2 FMUL a1, c03, c03 FMUL 
a1, c07, c07 FMUL a2, c03, t1 FMUL a2, c07, t2 FSUB c04, t1, c04 FSUB c08, t2, c08 LDF [AO + 15 * SIZE], a1 FMUL a1, c04, c04 FMUL a1, c08, c08 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 FMUL a2, c01, t1 FMUL a2, c02, t2 FMUL a2, c03, t3 FMUL a2, c04, t4 FSUB c05, t1, c05 FSUB c06, t2, c06 FSUB c07, t3, c07 FSUB c08, t4, c08 FMUL a3, c05, c05 FMUL a3, c06, c06 FMUL a3, c07, c07 FMUL a3, c08, c08 #endif #ifdef RT LDF [BO + 3 * SIZE], a1 LDF [BO + 2 * SIZE], a2 LDF [BO + 0 * SIZE], a3 FMUL a1, c05, c05 FMUL a1, c06, c06 FMUL a1, c07, c07 FMUL a1, c08, c08 FMUL a2, c05, t1 FMUL a2, c06, t2 FMUL a2, c07, t3 FMUL a2, c08, t4 FSUB c01, t1, c01 FSUB c02, t2, c02 FSUB c03, t3, c03 FSUB c04, t4, c04 FMUL a3, c01, c01 FMUL a3, c02, c02 FMUL a3, c03, c03 FMUL a3, c04, c04 #endif #ifdef LN add C1, -4 * SIZE, C1 add C2, -4 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c05, [BO + 1 * SIZE] STF c02, [BO + 2 * SIZE] STF c06, [BO + 3 * SIZE] STF c03, [BO + 4 * SIZE] STF c07, [BO + 5 * SIZE] STF c04, [BO + 6 * SIZE] STF c08, [BO + 7 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] STF c05, [AO + 4 * SIZE] STF c06, [AO + 5 * SIZE] STF c07, [AO + 6 * SIZE] STF c08, [AO + 7 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C1 + 2 * SIZE] STF c04, [C1 + 3 * SIZE] STF c05, [C2 + 0 * SIZE] STF c06, [C2 + 1 * SIZE] STF c07, [C2 + 2 * SIZE] STF c08, [C2 + 3 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 4 * SIZE, C1 add C2, 4 * SIZE, C2 #endif #ifdef RT sll K, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 2 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 4, KK #endif #ifdef LN sub KK, 4, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL121 FMOV FZERO, c03 .LL199: #ifdef LN sll K, 1 + BASE_SHIFT, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 2, KK #endif #ifdef RT sub KK, 2, KK #endif .LL200: and N, 1, J cmp J, 0 ble,pn %icc, .LL999 nop #ifdef RT sll K, 0 + BASE_SHIFT, TEMP1 sub B, TEMP1, B sub C, LDC, C #endif mov C, C1 #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif #ifndef RT add C, LDC, C #endif and M, 1, I cmp I, 0 ble,pn %icc, .LL250 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 0 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c01 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t2 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c02 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t3 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t4 LDF [BO + 2 * SIZE], b3 ble,pn %icc, .LL275 LDF [BO + 3 * SIZE], b4 .LL272: FADD c01, t1, c01 add L, -1, L add AO, 4 * SIZE, AO FMUL a1, b1, t1 add BO, 4 * SIZE, BO LDF [AO + 0 * SIZE], a1 FADD c02, t2, c02 cmp L, 0 LDF [BO + 0 * SIZE], b1 FMUL a2, b2, t2 LDF [AO + 1 * SIZE], a2 FADD c01, t3, c01 LDF [BO + 1 * SIZE], b2 FMUL a3, b3, t3 LDF [AO + 2 * SIZE], a3 FADD c02, t4, c02 LDF [BO + 2 * SIZE], b3 FMUL a4, b4, t4 LDF [AO + 3 * SIZE], a4 bg,pt %icc, .LL272 
LDF [BO + 3 * SIZE], b4 .LL275: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL279 nop .LL276: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 LDF [AO + 1 * SIZE], a1 LDF [BO + 1 * SIZE], b1 add BO, 1 * SIZE, BO cmp L, 0 bg,pt %icc, .LL276 add AO, 1 * SIZE, AO .LL279: FADD c01, t1, c01 FADD c02, t2, c02 FADD c01, t3, c01 FADD c02, t4, c02 FADD c01, c02, c01 #if defined(LN) || defined(RT) sub KK, 1, TEMP1 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 FSUB a1, c01, c01 #else LDF [AO + 0 * SIZE], a1 FSUB a1, c01, c01 #endif #ifdef LN LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef RT LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LN add C1, -1 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] #else STF c01, [AO + 0 * SIZE] #endif STF c01, [C1 + 0 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 1 * SIZE, C1 #endif #ifdef RT sll K, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AO, TEMP1, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .LL250: and M, 2, I cmp I, 0 ble,pn %icc, .LL270 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 1 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 1 + BASE_SHIFT, TEMP1 sll KK, 0 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 ble,pn %icc, .LL255 nop .LL252: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 LDF [AO + 4 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b1, t2 LDF [AO + 5 * SIZE], a2 LDF [BO + 4 * SIZE], b1 FADD c03, t3, c03 cmp L, 0 FMUL a3, b2, t3 LDF [AO + 6 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b2, t4 LDF [AO + 7 * SIZE], a4 LDF [BO + 5 * SIZE], b2 FADD c01, t1, c01 FMUL a1, b3, t1 LDF [AO + 8 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b3, t2 LDF [AO + 9 * SIZE], a2 LDF [BO + 6 * SIZE], b3 FADD c03, t3, c03 FMUL a3, b4, t3 LDF [AO + 10 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO LDF [BO + 7 * SIZE], b4 bg,pt %icc, .LL252 add BO, 4 * SIZE, BO .LL255: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL259 nop .LL256: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 LDF [AO + 2 * SIZE], a1 FADD c02, t2, c02 cmp L, 0 FMUL a2, b1, t2 LDF [AO + 3 * SIZE], a2 LDF [BO + 1 * SIZE], b1 add AO, 2 * SIZE, AO bg,pt %icc, .LL256 add BO, 1 * SIZE, BO .LL259: FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FADD c01, c03, c01 FADD c02, c04, c02 #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 1, TEMP1 #endif sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #else LDF [AO + 0 * SIZE], 
a1 LDF [AO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c01, t1, c01 FMUL a3, c01, c01 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a2, c01, t1 FSUB c02, t1, c02 FMUL a3, c02, c02 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 #endif #ifdef RT LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 #endif #ifdef LN add C1, -2 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 2 * SIZE, C1 #endif #ifdef RT sll K, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif .LL270: sra M, 2, I cmp I, 0 ble,pn %icc, .LL299 nop .LL221: #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 2 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 2 + BASE_SHIFT, TEMP1 sll KK, 0 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 #ifdef LN prefetch [C1 - 3 * SIZE], 2 #else prefetch [C1 + 3 * SIZE], 2 #endif ble,pn %icc, .LL225 prefetch [C1 + 4 * SIZE], 2 .LL222: FADD c01, t1, c01 add BO, 4 * SIZE, BO FMUL a1, b1, t1 LDF [AO + 4 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b1, t2 LDF [AO + 5 * SIZE], a2 FADD c03, t3, c03 add L, -1, L FMUL a3, b1, t3 LDF [AO + 6 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b1, t4 LDF [AO + 7 * SIZE], a4 LDF [BO + 0 * SIZE], b1 FADD c01, t1, c01 cmp L, 0 FMUL a1, b2, t1 LDF [AO + 8 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b2, t2 LDF [AO + 9 * SIZE], a2 FADD c03, t3, c03 FMUL a3, b2, t3 LDF [AO + 10 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b2, t4 LDF [AO + 11 * SIZE], a4 LDF [BO + 1 * SIZE], b2 FADD c01, t1, c01 FMUL a1, b3, t1 LDF [AO + 12 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b3, t2 LDF [AO + 13 * SIZE], a2 FADD c03, t3, c03 FMUL a3, b3, t3 LDF [AO + 14 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b3, t4 LDF [AO + 15 * SIZE], a4 LDF [BO + 2 * SIZE], b3 FADD c01, t1, c01 FMUL a1, b4, t1 LDF [AO + 16 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b4, t2 LDF [AO + 17 * SIZE], a2 FADD c03, t3, c03 FMUL a3, b4, t3 LDF [AO + 18 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [AO + 19 * SIZE], a4 add AO, 16 * SIZE, AO bg,pt %icc, .LL222 LDF [BO + 3 * SIZE], b4 .LL225: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL229 nop .LL226: FADD c01, t1, c01 add BO, 1 * SIZE, BO FMUL a1, b1, t1 LDF [AO + 4 * SIZE], a1 FADD c02, t2, c02 add L, -1, L FMUL a2, b1, t2 LDF [AO + 5 * SIZE], a2 FADD c03, t3, c03 cmp L, 0 FMUL a3, b1, t3 LDF [AO + 6 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b1, t4 LDF [AO + 7 * SIZE], a4 add AO, 4 * SIZE, AO bg,pt %icc, .LL226 LDF [BO + 0 * SIZE], b1 .LL229: 
FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 #if defined(LN) || defined(RT) #ifdef LN sub KK, 4, TEMP1 #else sub KK, 1, TEMP1 #endif sll TEMP1, 2 + BASE_SHIFT, TEMP2 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #endif #ifdef LN LDF [AO + 15 * SIZE], a1 LDF [AO + 14 * SIZE], a2 LDF [AO + 13 * SIZE], a3 LDF [AO + 12 * SIZE], a4 FMUL a1, c04, c04 FMUL a2, c04, t1 FSUB c03, t1, c03 FMUL a3, c04, t1 FSUB c02, t1, c02 FMUL a4, c04, t1 FSUB c01, t1, c01 LDF [AO + 10 * SIZE], a1 LDF [AO + 9 * SIZE], a2 LDF [AO + 8 * SIZE], a3 FMUL a1, c03, c03 FMUL a2, c03, t1 FSUB c02, t1, c02 FMUL a3, c03, t1 FSUB c01, t1, c01 LDF [AO + 5 * SIZE], a1 LDF [AO + 4 * SIZE], a2 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c01, t1, c01 LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a2, c01, t1 FSUB c02, t1, c02 FMUL a3, c01, t1 FSUB c03, t1, c03 FMUL a4, c01, t1 FSUB c04, t1, c04 LDF [AO + 5 * SIZE], a1 LDF [AO + 6 * SIZE], a2 LDF [AO + 7 * SIZE], a3 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c03, t1, c03 FMUL a3, c02, t1 FSUB c04, t1, c04 LDF [AO + 10 * SIZE], a1 LDF [AO + 11 * SIZE], a2 FMUL a1, c03, c03 FMUL a2, c03, t1 FSUB c04, t1, c04 LDF [AO + 15 * SIZE], a1 FMUL a1, c04, c04 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 #endif #ifdef RT LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 #endif #ifdef LN add C1, -4 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c03, [BO + 2 * SIZE] STF c04, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C1 + 2 * SIZE] STF c04, [C1 + 3 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 4 * SIZE, C1 #endif #ifdef RT sll K, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 2 + BASE_SHIFT, TEMP2 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 4, KK #endif #ifdef LN sub KK, 4, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL221 nop .LL299: #ifdef LN sll K, 0 + BASE_SHIFT, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 1, KK #endif #ifdef RT sub KK, 1, KK #endif .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/trsm_kernel_LN_2x8.S000066400000000000000000002157761313527062700213210ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2005-2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define APREFETCHSIZE 24 #define APREFETCH_CATEGORY 0 #define M %i0 #define N %i1 #define K %i2 #if defined(DOUBLE) && !defined(__64BIT__) #define A %i5 #define B %i4 #else #define A %i4 #define B %i5 #endif #define C %o4 #define LDC %o5 #define AO %l0 #define BO %l1 #define I %l2 #define J %l3 #define L %l4 #define C1 %o0 #define C2 %o1 #define C3 %o2 #define C4 %o3 #define C5 %l5 #define C6 %l6 #define C7 %l7 #define C8 %i3 #define OFFSET %g1 #define KK %g2 #define TEMP1 %g3 #define TEMP2 %g4 #define AORIG %o7 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #define a1 %f32 #define a2 %f34 #define a3 %f36 #define a4 %f38 #define a5 %f40 #define b1 %f42 #define b2 %f44 #define b3 %f46 #define b4 %f48 #define b5 %f50 #define b6 %f52 #define b7 %f54 #define b8 %f56 #define b9 %f58 #define cc01 0 #define cc02 2 #define cc03 4 #define cc04 6 #define cc05 8 #define cc06 10 #define cc07 12 #define cc08 14 #define cc09 16 #define cc10 18 #define cc11 20 #define cc12 22 #define cc13 24 #define cc14 26 #define cc15 28 #define cc16 30 #define aa1 1 #define aa2 3 #define aa3 5 #define aa4 7 #define aa5 9 #define bb1 11 #define bb2 13 #define bb3 15 #define bb4 17 #define bb5 19 #define bb6 21 #define bb7 23 #define bb8 25 #define bb9 27 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #define a1 %f16 #define a2 %f17 #define a3 %f18 #define a4 %f19 #define a5 %f20 #define b1 %f21 #define b2 %f22 #define b3 %f23 #define b4 %f24 #define b5 %f25 #define b6 %f26 #define b7 %f27 #define b8 %f28 
#define b9 %f29 #define cc01 0 #define cc02 1 #define cc03 2 #define cc04 3 #define cc05 4 #define cc06 5 #define cc07 6 #define cc08 7 #define cc09 8 #define cc10 9 #define cc11 10 #define cc12 11 #define cc13 12 #define cc14 13 #define cc15 14 #define cc16 15 #define aa1 16 #define aa2 17 #define aa3 18 #define aa4 19 #define aa5 20 #define bb1 21 #define bb2 22 #define bb3 23 #define bb4 24 #define bb5 25 #define bb6 26 #define bb7 27 #define bb8 28 #define bb9 29 #endif .register %g2, #scratch .register %g3, #scratch PROLOGUE SAVESP nop #ifndef __64BIT__ #ifdef DOUBLE ld [%sp + STACK_START + 28], B ld [%sp + STACK_START + 32], C ld [%sp + STACK_START + 36], LDC ld [%sp + STACK_START + 40], OFFSET #else ld [%sp + STACK_START + 28], C ld [%sp + STACK_START + 32], LDC ld [%sp + STACK_START + 36], OFFSET #endif st %g1, [%sp + STACK_START + 8] st %g2, [%sp + STACK_START + 12] st %g3, [%sp + STACK_START + 16] st %g4, [%sp + STACK_START + 20] #else ldx [%sp+ STACK_START + 56], C ldx [%sp+ STACK_START + 64], LDC ldx [%sp+ STACK_START + 72], OFFSET stx %g1, [%sp + STACK_START + 32] stx %g2, [%sp + STACK_START + 40] stx %g3, [%sp + STACK_START + 48] stx %g4, [%sp + STACK_START + 56] #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg OFFSET, KK #endif sll LDC, BASE_SHIFT, LDC #ifdef LN smul M, K, TEMP1 sll TEMP1, BASE_SHIFT, TEMP1 add A, TEMP1, A sll M, BASE_SHIFT, TEMP1 add C, TEMP1, C #endif #ifdef RN neg OFFSET, KK #endif #ifdef RT smul N, K, TEMP1 sll TEMP1, BASE_SHIFT, TEMP1 add B, TEMP1, B smul N, LDC, TEMP1 add C, TEMP1, C sub N, OFFSET, KK #endif sra N, 3, J cmp J, 0 ble,pn %icc, .LL30 nop .align 4 .LL11: #ifdef RT sll K, BASE_SHIFT + 3, TEMP1 sub B, TEMP1, B #endif #ifndef RT mov C, C1 add C, LDC, C2 add C2, LDC, C3 add C3, LDC, C4 add C4, LDC, C5 add C5, LDC, C6 add C6, LDC, C7 add C7, LDC, C8 add C8, LDC, C #else sub C, LDC, C8 sub C8, LDC, C7 sub C7, LDC, C6 sub C6, LDC, C5 sub C5, LDC, C4 sub C4, LDC, C3 sub C3, LDC, C2 sub C2, LDC, C1 sub C2, LDC, C #endif #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif and M, 1, I cmp I, 0 ble,pn %icc, .LL20 nop #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 0, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TEMP1 sll KK, BASE_SHIFT + 3, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 FCLR (cc01) LDF [BO + 1 * SIZE], b2 FCLR (cc03) LDF [BO + 2 * SIZE], b3 FCLR (cc05) LDF [BO + 3 * SIZE], b4 FCLR (cc07) LDF [BO + 4 * SIZE], b5 FCLR (cc09) LDF [BO + 5 * SIZE], b6 FCLR (cc11) LDF [BO + 6 * SIZE], b7 FCLR (cc13) LDF [BO + 7 * SIZE], b8 FCLR (cc15) #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL25 LDF [BO + 8 * SIZE], b9 .align 4 .LL23: prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY add L, -1, L FMADD (aa1, bb1, cc01, cc01) LDF [BO + 16 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) LDF [BO + 10 * SIZE], b3 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [BO + 12 * SIZE], b5 FMADD (aa1, bb6, cc11, cc11) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb7, cc13, cc13) LDF [BO + 14 * SIZE], b7 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 15 * SIZE], b8 FMADD (aa2, bb9, cc01, cc01) LDF [BO + 24 * SIZE], b9 FMADD (aa2, bb2, cc03, cc03) LDF [BO + 17 * SIZE], b2 FMADD 
(aa2, bb3, cc05, cc05) LDF [BO + 18 * SIZE], b3 FMADD (aa2, bb4, cc07, cc07) LDF [BO + 19 * SIZE], b4 FMADD (aa2, bb5, cc09, cc09) LDF [BO + 20 * SIZE], b5 FMADD (aa2, bb6, cc11, cc11) LDF [BO + 21 * SIZE], b6 FMADD (aa2, bb7, cc13, cc13) LDF [BO + 22 * SIZE], b7 FMADD (aa2, bb8, cc15, cc15) LDF [BO + 23 * SIZE], b8 LDF [AO + 4 * SIZE], a1 LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb1, cc01, cc01) LDF [BO + 32 * SIZE], b1 FMADD (aa3, bb2, cc03, cc03) LDF [BO + 25 * SIZE], b2 FMADD (aa3, bb3, cc05, cc05) LDF [BO + 26 * SIZE], b3 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 27 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [BO + 28 * SIZE], b5 FMADD (aa3, bb6, cc11, cc11) LDF [BO + 29 * SIZE], b6 FMADD (aa3, bb7, cc13, cc13) LDF [BO + 30 * SIZE], b7 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 31 * SIZE], b8 FMADD (aa4, bb9, cc01, cc01) LDF [BO + 40 * SIZE], b9 FMADD (aa4, bb2, cc03, cc03) LDF [BO + 33 * SIZE], b2 FMADD (aa4, bb3, cc05, cc05) LDF [BO + 34 * SIZE], b3 FMADD (aa4, bb4, cc07, cc07) LDF [BO + 35 * SIZE], b4 FMADD (aa4, bb5, cc09, cc09) LDF [BO + 36 * SIZE], b5 FMADD (aa4, bb6, cc11, cc11) LDF [BO + 37 * SIZE], b6 FMADD (aa4, bb7, cc13, cc13) LDF [BO + 38 * SIZE], b7 FMADD (aa4, bb8, cc15, cc15) LDF [BO + 39 * SIZE], b8 LDF [AO + 6 * SIZE], a3 LDF [AO + 7 * SIZE], a4 add AO, 4 * SIZE, AO cmp L, 0 bg,pt %icc, .LL23 add BO, 32 * SIZE, BO .align 4 .LL25: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL28 nop .align 4 .LL27: FMADD (aa1, bb1, cc01, cc01) LDF [BO + 8 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) LDF [BO + 10 * SIZE], b3 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [BO + 12 * SIZE], b5 FMADD (aa1, bb6, cc11, cc11) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb7, cc13, cc13) LDF [BO + 14 * SIZE], b7 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 15 * SIZE], b8 LDF [AO + 1 * SIZE], a1 add AO, 1 * SIZE, AO add L, -1, L cmp L, 0 bg,pt %icc, .LL27 add BO, 8 * SIZE, BO .align 4 .LL28: #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 8, TEMP1 #endif sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 3, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 FSUB b1, c09, c09 FSUB b2, c11, c11 FSUB b3, c13, c13 FSUB b4, c15, c15 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 FSUB b1, c09, c09 FSUB b2, c11, c11 FSUB b3, c13, c13 FSUB b4, c15, c15 #endif #if defined(LN) || defined(LT) LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c03, c03 FMUL a1, c05, c05 FMUL a1, c07, c07 FMUL a1, c09, c09 FMUL a1, c11, c11 FMUL a1, c13, c13 FMUL a1, c15, c15 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FMUL a1, c01, c01 FNMSUB (aa2, cc01, cc03, cc03) FNMSUB (aa3, cc01, cc05, cc05) FNMSUB (aa4, cc01, cc07, cc07) FNMSUB (bb1, cc01, cc09, cc09) FNMSUB (bb2, cc01, cc11, cc11) FNMSUB 
(bb3, cc01, cc13, cc13) FNMSUB (bb4, cc01, cc15, cc15) LDF [BO + 9 * SIZE], a1 LDF [BO + 10 * SIZE], a2 LDF [BO + 11 * SIZE], a3 LDF [BO + 12 * SIZE], a4 LDF [BO + 13 * SIZE], b1 LDF [BO + 14 * SIZE], b2 LDF [BO + 15 * SIZE], b3 FMUL a1, c03, c03 FNMSUB (aa2, cc03, cc05, cc05) FNMSUB (aa3, cc03, cc07, cc07) FNMSUB (aa4, cc03, cc09, cc09) FNMSUB (bb1, cc03, cc11, cc11) FNMSUB (bb2, cc03, cc13, cc13) FNMSUB (bb3, cc03, cc15, cc15) LDF [BO + 18 * SIZE], a1 LDF [BO + 19 * SIZE], a2 LDF [BO + 20 * SIZE], a3 LDF [BO + 21 * SIZE], a4 LDF [BO + 22 * SIZE], b1 LDF [BO + 23 * SIZE], b2 FMUL a1, c05, c05 FNMSUB (aa2, cc05, cc07, cc07) FNMSUB (aa3, cc05, cc09, cc09) FNMSUB (aa4, cc05, cc11, cc11) FNMSUB (bb1, cc05, cc13, cc13) FNMSUB (bb2, cc05, cc15, cc15) LDF [BO + 27 * SIZE], a1 LDF [BO + 28 * SIZE], a2 LDF [BO + 29 * SIZE], a3 LDF [BO + 30 * SIZE], a4 LDF [BO + 31 * SIZE], b1 FMUL a1, c07, c07 FNMSUB (aa2, cc07, cc09, cc09) FNMSUB (aa3, cc07, cc11, cc11) FNMSUB (aa4, cc07, cc13, cc13) FNMSUB (bb1, cc07, cc15, cc15) LDF [BO + 36 * SIZE], a1 LDF [BO + 37 * SIZE], a2 LDF [BO + 38 * SIZE], a3 LDF [BO + 39 * SIZE], a4 FMUL a1, c09, c09 FNMSUB (aa2, cc09, cc11, cc11) FNMSUB (aa3, cc09, cc13, cc13) FNMSUB (aa4, cc09, cc15, cc15) LDF [BO + 45 * SIZE], a1 LDF [BO + 46 * SIZE], a2 LDF [BO + 47 * SIZE], a3 FMUL a1, c11, c11 FNMSUB (aa2, cc11, cc13, cc13) FNMSUB (aa3, cc11, cc15, cc15) LDF [BO + 54 * SIZE], a1 LDF [BO + 55 * SIZE], a2 FMUL a1, c13, c13 FNMSUB (aa2, cc13, cc15, cc15) LDF [BO + 63 * SIZE], a1 FMUL a1, c15, c15 #endif #ifdef RT LDF [BO + 63 * SIZE], a1 LDF [BO + 62 * SIZE], a2 LDF [BO + 61 * SIZE], a3 LDF [BO + 60 * SIZE], a4 LDF [BO + 59 * SIZE], b1 LDF [BO + 58 * SIZE], b2 LDF [BO + 57 * SIZE], b3 LDF [BO + 56 * SIZE], b4 FMUL a1, c15, c15 FNMSUB (aa2, cc15, cc13, cc13) FNMSUB (aa3, cc15, cc11, cc11) FNMSUB (aa4, cc15, cc09, cc09) FNMSUB (bb1, cc15, cc07, cc07) FNMSUB (bb2, cc15, cc05, cc05) FNMSUB (bb3, cc15, cc03, cc03) FNMSUB (bb4, cc15, cc01, cc01) LDF [BO + 54 * SIZE], a1 LDF [BO + 53 * SIZE], a2 LDF [BO + 52 * SIZE], a3 LDF [BO + 51 * SIZE], a4 LDF [BO + 50 * SIZE], b1 LDF [BO + 49 * SIZE], b2 LDF [BO + 48 * SIZE], b3 FMUL a1, c13, c13 FNMSUB (aa2, cc13, cc11, cc11) FNMSUB (aa3, cc13, cc09, cc09) FNMSUB (aa4, cc13, cc07, cc07) FNMSUB (bb1, cc13, cc05, cc05) FNMSUB (bb2, cc13, cc03, cc03) FNMSUB (bb3, cc13, cc01, cc01) LDF [BO + 45 * SIZE], a1 LDF [BO + 44 * SIZE], a2 LDF [BO + 43 * SIZE], a3 LDF [BO + 42 * SIZE], a4 LDF [BO + 41 * SIZE], b1 LDF [BO + 40 * SIZE], b2 FMUL a1, c11, c11 FNMSUB (aa2, cc11, cc09, cc09) FNMSUB (aa3, cc11, cc07, cc07) FNMSUB (aa4, cc11, cc05, cc05) FNMSUB (bb1, cc11, cc03, cc03) FNMSUB (bb2, cc11, cc01, cc01) LDF [BO + 36 * SIZE], a1 LDF [BO + 35 * SIZE], a2 LDF [BO + 34 * SIZE], a3 LDF [BO + 33 * SIZE], a4 LDF [BO + 32 * SIZE], b1 FMUL a1, c09, c09 FNMSUB (aa2, cc09, cc07, cc07) FNMSUB (aa3, cc09, cc05, cc05) FNMSUB (aa4, cc09, cc03, cc03) FNMSUB (bb1, cc09, cc01, cc01) LDF [BO + 27 * SIZE], a1 LDF [BO + 26 * SIZE], a2 LDF [BO + 25 * SIZE], a3 LDF [BO + 24 * SIZE], a4 FMUL a1, c07, c07 FNMSUB (aa2, cc07, cc05, cc05) FNMSUB (aa3, cc07, cc03, cc03) FNMSUB (aa4, cc07, cc01, cc01) LDF [BO + 18 * SIZE], a1 LDF [BO + 17 * SIZE], a2 LDF [BO + 16 * SIZE], a3 FMUL a1, c05, c05 FNMSUB (aa2, cc05, cc03, cc03) FNMSUB (aa3, cc05, cc01, cc01) LDF [BO + 9 * SIZE], a1 LDF [BO + 8 * SIZE], a2 FMUL a1, c03, c03 FNMSUB (aa2, cc03, cc01, cc01) LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LN add C1, -1 * SIZE, C1 add C2, -1 * SIZE, C2 add C3, -1 * SIZE, C3 add C4, -1 * 
SIZE, C4 add C5, -1 * SIZE, C5 add C6, -1 * SIZE, C6 add C7, -1 * SIZE, C7 add C8, -1 * SIZE, C8 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c07, [BO + 3 * SIZE] STF c09, [BO + 4 * SIZE] STF c11, [BO + 5 * SIZE] STF c13, [BO + 6 * SIZE] STF c15, [BO + 7 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c03, [AO + 1 * SIZE] STF c05, [AO + 2 * SIZE] STF c07, [AO + 3 * SIZE] STF c09, [AO + 4 * SIZE] STF c11, [AO + 5 * SIZE] STF c13, [AO + 6 * SIZE] STF c15, [AO + 7 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c03, [C2 + 0 * SIZE] STF c05, [C3 + 0 * SIZE] STF c07, [C4 + 0 * SIZE] STF c09, [C5 + 0 * SIZE] STF c11, [C6 + 0 * SIZE] STF c13, [C7 + 0 * SIZE] STF c15, [C8 + 0 * SIZE] #ifdef RT sll K, BASE_SHIFT + 0, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 3, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .align 4 .LL20: sra M, 1, I cmp I, 0 ble,pn %icc, .LL29 nop .align 4 .LL12: #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 1, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TEMP1 sll KK, BASE_SHIFT + 3, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 8 * SIZE], a5 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FCLR (cc01) LDF [BO + 2 * SIZE], b3 FCLR (cc05) LDF [BO + 3 * SIZE], b4 FCLR (cc09) LDF [BO + 4 * SIZE], b5 FCLR (cc13) LDF [BO + 5 * SIZE], b6 FCLR (cc02) LDF [BO + 6 * SIZE], b7 FCLR (cc06) LDF [BO + 7 * SIZE], b8 FCLR (cc10) LDF [BO + 8 * SIZE], b9 FCLR (cc14) prefetch [C1 + 1 * SIZE], 3 FCLR (cc03) prefetch [C2 + 2 * SIZE], 3 FCLR (cc07) prefetch [C3 + 1 * SIZE], 3 FCLR (cc11) prefetch [C4 + 2 * SIZE], 3 FCLR (cc15) prefetch [C5 + 1 * SIZE], 3 FCLR (cc04) prefetch [C6 + 2 * SIZE], 3 FCLR (cc08) prefetch [C7 + 1 * SIZE], 3 FCLR (cc12) prefetch [C8 + 2 * SIZE], 3 FCLR (cc16) #if defined(LT) || defined(RN) sra KK, 3, L #else sub K, KK, L sra L, 3, L #endif cmp L, 0 ble,pn %icc, .LL15 nop .align 4 .LL13: FMADD (aa1, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa1, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa1, bb3, cc05, cc05) LDF [BO + 16 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [AO + 2 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 3 * SIZE], a4 FMADD (aa1, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa1, bb7, cc13, cc13) LDF [BO + 12 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 14 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 15 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 24 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 17 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 18 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 19 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 4 * SIZE], a1 FMADD (aa4, bb5, cc10, cc10) LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) add L, -1, L FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 20 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 
21 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 22 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 23 * SIZE], b8 FMADD (aa1, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa1, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa1, bb3, cc05, cc05) LDF [BO + 32 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 25 * SIZE], b2 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 26 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 27 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [AO + 6 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 7 * SIZE], a4 FMADD (aa1, bb6, cc11, cc11) nop FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa1, bb7, cc13, cc13) LDF [BO + 28 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 29 * SIZE], b6 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 30 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 31 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 40 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 33 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 34 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 35 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 16 * SIZE], a1 /****/ FMADD (aa4, bb5, cc10, cc10) LDF [AO + 9 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) nop FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 36 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 37 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 38 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 39 * SIZE], b8 FMADD (aa5, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa5, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa5, bb3, cc05, cc05) LDF [BO + 48 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 41 * SIZE], b2 FMADD (aa5, bb4, cc07, cc07) LDF [BO + 42 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 43 * SIZE], b4 FMADD (aa5, bb5, cc09, cc09) LDF [AO + 10 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 11 * SIZE], a4 FMADD (aa5, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa5, bb7, cc13, cc13) LDF [BO + 44 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 45 * SIZE], b6 FMADD (aa5, bb8, cc15, cc15) LDF [BO + 46 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 47 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 56 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 49 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 50 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 51 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 12 * SIZE], a5 FMADD (aa4, bb5, cc10, cc10) LDF [AO + 13 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) cmp L, 0 FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 52 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 53 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 54 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 55 * SIZE], b8 FMADD (aa5, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa5, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa5, bb3, cc05, cc05) LDF [BO + 64 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 57 * SIZE], b2 FMADD (aa5, bb4, cc07, cc07) LDF [BO + 58 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 59 * SIZE], b4 FMADD (aa5, bb5, cc09, cc09) LDF [AO + 14 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 15 * SIZE], a4 FMADD (aa5, bb6, cc11, cc11) add 
BO, 64 * SIZE, BO FMADD (aa2, bb6, cc12, cc12) add AO, 16 * SIZE, AO FMADD (aa5, bb7, cc13, cc13) LDF [BO - 4 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO - 3 * SIZE], b6 FMADD (aa5, bb8, cc15, cc15) LDF [BO - 2 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO - 1 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 8 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 1 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 2 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 8 * SIZE], a5 /****/ FMADD (aa4, bb5, cc10, cc10) LDF [AO + 1 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) FMADD (aa4, bb6, cc12, cc12) FMADD (aa3, bb7, cc13, cc13) LDF [BO + 4 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 5 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 6 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) ble,pn %icc, .LL15 LDF [BO + 7 * SIZE], b8 FMADD (aa1, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa1, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa1, bb3, cc05, cc05) LDF [BO + 16 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [AO + 2 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 3 * SIZE], a4 FMADD (aa1, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa1, bb7, cc13, cc13) LDF [BO + 12 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 14 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 15 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 24 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 17 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 18 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 19 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 4 * SIZE], a1 FMADD (aa4, bb5, cc10, cc10) LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) add L, -1, L FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 20 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 21 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 22 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 23 * SIZE], b8 FMADD (aa1, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa1, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa1, bb3, cc05, cc05) LDF [BO + 32 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 25 * SIZE], b2 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 26 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 27 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [AO + 6 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 7 * SIZE], a4 FMADD (aa1, bb6, cc11, cc11) nop FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa1, bb7, cc13, cc13) LDF [BO + 28 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 29 * SIZE], b6 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 30 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 31 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 40 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 33 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 34 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 
35 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 16 * SIZE], a1 /****/ FMADD (aa4, bb5, cc10, cc10) LDF [AO + 9 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) nop FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 36 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 37 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 38 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 39 * SIZE], b8 FMADD (aa5, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa5, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa5, bb3, cc05, cc05) LDF [BO + 48 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 41 * SIZE], b2 FMADD (aa5, bb4, cc07, cc07) LDF [BO + 42 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 43 * SIZE], b4 FMADD (aa5, bb5, cc09, cc09) LDF [AO + 10 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 11 * SIZE], a4 FMADD (aa5, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa5, bb7, cc13, cc13) LDF [BO + 44 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 45 * SIZE], b6 FMADD (aa5, bb8, cc15, cc15) LDF [BO + 46 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 47 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 56 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 49 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 50 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 51 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 12 * SIZE], a5 FMADD (aa4, bb5, cc10, cc10) LDF [AO + 13 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) cmp L, 0 FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 52 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 53 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 54 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 55 * SIZE], b8 FMADD (aa5, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa5, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa5, bb3, cc05, cc05) LDF [BO + 64 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 57 * SIZE], b2 FMADD (aa5, bb4, cc07, cc07) LDF [BO + 58 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 59 * SIZE], b4 FMADD (aa5, bb5, cc09, cc09) LDF [AO + 14 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 15 * SIZE], a4 FMADD (aa5, bb6, cc11, cc11) add BO, 64 * SIZE, BO FMADD (aa2, bb6, cc12, cc12) add AO, 16 * SIZE, AO FMADD (aa5, bb7, cc13, cc13) LDF [BO - 4 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO - 3 * SIZE], b6 FMADD (aa5, bb8, cc15, cc15) LDF [BO - 2 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO - 1 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 8 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 1 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 2 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 8 * SIZE], a5 /****/ FMADD (aa4, bb5, cc10, cc10) LDF [AO + 1 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) FMADD (aa4, bb6, cc12, cc12) FMADD (aa3, bb7, cc13, cc13) LDF [BO + 4 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 5 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 6 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) bg,pt %icc, .LL13 LDF [BO + 7 * SIZE], b8 .align 4 .LL15: #if defined(LT) || defined(RN) and KK, 7, L #else sub K, KK, L and L, 7, L #endif cmp L, 0 ble,a,pn %icc, .LL18 nop .align 4 .LL17: 
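/* .LL17 (descriptive comment, not in the original source): K-remainder loop
   of the 2x8 block, executed (K mod 8) times.  Each pass performs one
   rank-1 update -- a1/a2 multiplied by b1..b8 and accumulated into
   c01..c16 via FMADD -- while advancing AO by 2*SIZE and BO by 8*SIZE. */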
FMADD (aa1, bb1, cc01, cc01) add L, -1, L FMADD (aa2, bb1, cc02, cc02) nop FMADD (aa1, bb2, cc03, cc03) LDF [BO + 8 * SIZE], b1 FMADD (aa2, bb2, cc04, cc04) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) cmp L, 0 FMADD (aa2, bb3, cc06, cc06) nop FMADD (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) nop FMADD (aa2, bb5, cc10, cc10) nop FMADD (aa1, bb6, cc11, cc11) LDF [BO + 12 * SIZE], b5 FMADD (aa2, bb6, cc12, cc12) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb7, cc13, cc13) add AO, 2 * SIZE, AO FMADD (aa2, bb7, cc14, cc14) add BO, 8 * SIZE, BO FMADD (aa1, bb8, cc15, cc15) LDF [AO + 0 * SIZE], a1 FMADD (aa2, bb8, cc16, cc16) LDF [AO + 1 * SIZE], a2 LDF [BO + 6 * SIZE], b7 bg,pt %icc, .LL17 LDF [BO + 7 * SIZE], b8 nop .align 4 .LL18: #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 8, TEMP1 #endif sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 3, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 FSUB b1, c09, c09 FSUB b2, c11, c11 FSUB b3, c13, c13 FSUB b4, c15, c15 LDF [BO + 8 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 10 * SIZE], a3 LDF [BO + 11 * SIZE], a4 LDF [BO + 12 * SIZE], b1 LDF [BO + 13 * SIZE], b2 LDF [BO + 14 * SIZE], b3 LDF [BO + 15 * SIZE], b4 FSUB a1, c02, c02 FSUB a2, c04, c04 FSUB a3, c06, c06 FSUB a4, c08, c08 FSUB b1, c10, c10 FSUB b2, c12, c12 FSUB b3, c14, c14 FSUB b4, c16, c16 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c05, c05 FSUB b2, c06, c06 FSUB b3, c07, c07 FSUB b4, c08, c08 LDF [AO + 8 * SIZE], a1 LDF [AO + 9 * SIZE], a2 LDF [AO + 10 * SIZE], a3 LDF [AO + 11 * SIZE], a4 LDF [AO + 12 * SIZE], b1 LDF [AO + 13 * SIZE], b2 LDF [AO + 14 * SIZE], b3 LDF [AO + 15 * SIZE], b4 FSUB a1, c09, c09 FSUB a2, c10, c10 FSUB a3, c11, c11 FSUB a4, c12, c12 FSUB b1, c13, c13 FSUB b2, c14, c14 FSUB b3, c15, c15 FSUB b4, c16, c16 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c04, c04 FMUL a1, c06, c06 FMUL a1, c08, c08 FMUL a1, c10, c10 FMUL a1, c12, c12 FMUL a1, c14, c14 FMUL a1, c16, c16 FNMSUB (aa2, cc02, cc01, cc01) FNMSUB (aa2, cc04, cc03, cc03) FNMSUB (aa2, cc06, cc05, cc05) FNMSUB (aa2, cc08, cc07, cc07) FNMSUB (aa2, cc10, cc09, cc09) FNMSUB (aa2, cc12, cc11, cc11) FNMSUB (aa2, cc14, cc13, cc13) FNMSUB (aa2, cc16, cc15, cc15) FMUL a3, c01, c01 FMUL a3, c03, c03 FMUL a3, c05, c05 FMUL a3, c07, c07 FMUL a3, c09, c09 FMUL a3, c11, c11 FMUL a3, c13, c13 FMUL a3, c15, c15 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c03, c03 FMUL a1, c05, c05 FMUL a1, c07, c07 FMUL a1, c09, c09 FMUL a1, c11, c11 FMUL a1, c13, c13 FMUL a1, c15, c15 FNMSUB (aa2, cc01, cc02, cc02) FNMSUB (aa2, cc03, cc04, cc04) FNMSUB (aa2, cc05, cc06, cc06) FNMSUB (aa2, cc07, cc08, cc08) FNMSUB (aa2, cc09, cc10, cc10) FNMSUB (aa2, cc11, cc12, cc12) FNMSUB (aa2, cc13, cc14, cc14) FNMSUB (aa2, cc15, cc16, cc16) FMUL a3, c02, c02 FMUL a3, 
c04, c04 FMUL a3, c06, c06 FMUL a3, c08, c08 FMUL a3, c10, c10 FMUL a3, c12, c12 FMUL a3, c14, c14 FMUL a3, c16, c16 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FMUL a1, c01, c01 FMUL a1, c02, c02 FNMSUB (aa2, cc01, cc03, cc03) FNMSUB (aa2, cc02, cc04, cc04) FNMSUB (aa3, cc01, cc05, cc05) FNMSUB (aa3, cc02, cc06, cc06) FNMSUB (aa4, cc01, cc07, cc07) FNMSUB (aa4, cc02, cc08, cc08) FNMSUB (bb1, cc01, cc09, cc09) FNMSUB (bb1, cc02, cc10, cc10) FNMSUB (bb2, cc01, cc11, cc11) FNMSUB (bb2, cc02, cc12, cc12) FNMSUB (bb3, cc01, cc13, cc13) FNMSUB (bb3, cc02, cc14, cc14) FNMSUB (bb4, cc01, cc15, cc15) FNMSUB (bb4, cc02, cc16, cc16) LDF [BO + 9 * SIZE], a1 LDF [BO + 10 * SIZE], a2 LDF [BO + 11 * SIZE], a3 LDF [BO + 12 * SIZE], a4 LDF [BO + 13 * SIZE], b1 LDF [BO + 14 * SIZE], b2 LDF [BO + 15 * SIZE], b3 FMUL a1, c03, c03 FMUL a1, c04, c04 FNMSUB (aa2, cc03, cc05, cc05) FNMSUB (aa2, cc04, cc06, cc06) FNMSUB (aa3, cc03, cc07, cc07) FNMSUB (aa3, cc04, cc08, cc08) FNMSUB (aa4, cc03, cc09, cc09) FNMSUB (aa4, cc04, cc10, cc10) FNMSUB (bb1, cc03, cc11, cc11) FNMSUB (bb1, cc04, cc12, cc12) FNMSUB (bb2, cc03, cc13, cc13) FNMSUB (bb2, cc04, cc14, cc14) FNMSUB (bb3, cc03, cc15, cc15) FNMSUB (bb3, cc04, cc16, cc16) LDF [BO + 18 * SIZE], a1 LDF [BO + 19 * SIZE], a2 LDF [BO + 20 * SIZE], a3 LDF [BO + 21 * SIZE], a4 LDF [BO + 22 * SIZE], b1 LDF [BO + 23 * SIZE], b2 FMUL a1, c05, c05 FMUL a1, c06, c06 FNMSUB (aa2, cc05, cc07, cc07) FNMSUB (aa2, cc06, cc08, cc08) FNMSUB (aa3, cc05, cc09, cc09) FNMSUB (aa3, cc06, cc10, cc10) FNMSUB (aa4, cc05, cc11, cc11) FNMSUB (aa4, cc06, cc12, cc12) FNMSUB (bb1, cc05, cc13, cc13) FNMSUB (bb1, cc06, cc14, cc14) FNMSUB (bb2, cc05, cc15, cc15) FNMSUB (bb2, cc06, cc16, cc16) LDF [BO + 27 * SIZE], a1 LDF [BO + 28 * SIZE], a2 LDF [BO + 29 * SIZE], a3 LDF [BO + 30 * SIZE], a4 LDF [BO + 31 * SIZE], b1 FMUL a1, c07, c07 FMUL a1, c08, c08 FNMSUB (aa2, cc07, cc09, cc09) FNMSUB (aa2, cc08, cc10, cc10) FNMSUB (aa3, cc07, cc11, cc11) FNMSUB (aa3, cc08, cc12, cc12) FNMSUB (aa4, cc07, cc13, cc13) FNMSUB (aa4, cc08, cc14, cc14) FNMSUB (bb1, cc07, cc15, cc15) FNMSUB (bb1, cc08, cc16, cc16) LDF [BO + 36 * SIZE], a1 LDF [BO + 37 * SIZE], a2 LDF [BO + 38 * SIZE], a3 LDF [BO + 39 * SIZE], a4 FMUL a1, c09, c09 FMUL a1, c10, c10 FNMSUB (aa2, cc09, cc11, cc11) FNMSUB (aa2, cc10, cc12, cc12) FNMSUB (aa3, cc09, cc13, cc13) FNMSUB (aa3, cc10, cc14, cc14) FNMSUB (aa4, cc09, cc15, cc15) FNMSUB (aa4, cc10, cc16, cc16) LDF [BO + 45 * SIZE], a1 LDF [BO + 46 * SIZE], a2 LDF [BO + 47 * SIZE], a3 FMUL a1, c11, c11 FMUL a1, c12, c12 FNMSUB (aa2, cc11, cc13, cc13) FNMSUB (aa2, cc12, cc14, cc14) FNMSUB (aa3, cc11, cc15, cc15) FNMSUB (aa3, cc12, cc16, cc16) LDF [BO + 54 * SIZE], a1 LDF [BO + 55 * SIZE], a2 FMUL a1, c13, c13 FMUL a1, c14, c14 FNMSUB (aa2, cc13, cc15, cc15) FNMSUB (aa2, cc14, cc16, cc16) LDF [BO + 63 * SIZE], a1 FMUL a1, c15, c15 FMUL a1, c16, c16 #endif #ifdef RT LDF [BO + 63 * SIZE], a1 LDF [BO + 62 * SIZE], a2 LDF [BO + 61 * SIZE], a3 LDF [BO + 60 * SIZE], a4 LDF [BO + 59 * SIZE], b1 LDF [BO + 58 * SIZE], b2 LDF [BO + 57 * SIZE], b3 LDF [BO + 56 * SIZE], b4 FMUL a1, c16, c16 FMUL a1, c15, c15 FNMSUB (aa2, cc16, cc14, cc14) FNMSUB (aa2, cc15, cc13, cc13) FNMSUB (aa3, cc16, cc12, cc12) FNMSUB (aa3, cc15, cc11, cc11) FNMSUB (aa4, cc16, cc10, cc10) FNMSUB (aa4, cc15, cc09, cc09) FNMSUB (bb1, cc16, cc08, cc08) FNMSUB (bb1, cc15, cc07, cc07) 
FNMSUB (bb2, cc16, cc06, cc06) FNMSUB (bb2, cc15, cc05, cc05) FNMSUB (bb3, cc16, cc04, cc04) FNMSUB (bb3, cc15, cc03, cc03) FNMSUB (bb4, cc16, cc02, cc02) FNMSUB (bb4, cc15, cc01, cc01) LDF [BO + 54 * SIZE], a1 LDF [BO + 53 * SIZE], a2 LDF [BO + 52 * SIZE], a3 LDF [BO + 51 * SIZE], a4 LDF [BO + 50 * SIZE], b1 LDF [BO + 49 * SIZE], b2 LDF [BO + 48 * SIZE], b3 FMUL a1, c14, c14 FMUL a1, c13, c13 FNMSUB (aa2, cc14, cc12, cc12) FNMSUB (aa2, cc13, cc11, cc11) FNMSUB (aa3, cc14, cc10, cc10) FNMSUB (aa3, cc13, cc09, cc09) FNMSUB (aa4, cc14, cc08, cc08) FNMSUB (aa4, cc13, cc07, cc07) FNMSUB (bb1, cc14, cc06, cc06) FNMSUB (bb1, cc13, cc05, cc05) FNMSUB (bb2, cc14, cc04, cc04) FNMSUB (bb2, cc13, cc03, cc03) FNMSUB (bb3, cc14, cc02, cc02) FNMSUB (bb3, cc13, cc01, cc01) LDF [BO + 45 * SIZE], a1 LDF [BO + 44 * SIZE], a2 LDF [BO + 43 * SIZE], a3 LDF [BO + 42 * SIZE], a4 LDF [BO + 41 * SIZE], b1 LDF [BO + 40 * SIZE], b2 FMUL a1, c12, c12 FMUL a1, c11, c11 FNMSUB (aa2, cc12, cc10, cc10) FNMSUB (aa2, cc11, cc09, cc09) FNMSUB (aa3, cc12, cc08, cc08) FNMSUB (aa3, cc11, cc07, cc07) FNMSUB (aa4, cc12, cc06, cc06) FNMSUB (aa4, cc11, cc05, cc05) FNMSUB (bb1, cc12, cc04, cc04) FNMSUB (bb1, cc11, cc03, cc03) FNMSUB (bb2, cc12, cc02, cc02) FNMSUB (bb2, cc11, cc01, cc01) LDF [BO + 36 * SIZE], a1 LDF [BO + 35 * SIZE], a2 LDF [BO + 34 * SIZE], a3 LDF [BO + 33 * SIZE], a4 LDF [BO + 32 * SIZE], b1 FMUL a1, c10, c10 FMUL a1, c09, c09 FNMSUB (aa2, cc10, cc08, cc08) FNMSUB (aa2, cc09, cc07, cc07) FNMSUB (aa3, cc10, cc06, cc06) FNMSUB (aa3, cc09, cc05, cc05) FNMSUB (aa4, cc10, cc04, cc04) FNMSUB (aa4, cc09, cc03, cc03) FNMSUB (bb1, cc10, cc02, cc02) FNMSUB (bb1, cc09, cc01, cc01) LDF [BO + 27 * SIZE], a1 LDF [BO + 26 * SIZE], a2 LDF [BO + 25 * SIZE], a3 LDF [BO + 24 * SIZE], a4 FMUL a1, c08, c08 FMUL a1, c07, c07 FNMSUB (aa2, cc08, cc06, cc06) FNMSUB (aa2, cc07, cc05, cc05) FNMSUB (aa3, cc08, cc04, cc04) FNMSUB (aa3, cc07, cc03, cc03) FNMSUB (aa4, cc08, cc02, cc02) FNMSUB (aa4, cc07, cc01, cc01) LDF [BO + 18 * SIZE], a1 LDF [BO + 17 * SIZE], a2 LDF [BO + 16 * SIZE], a3 FMUL a1, c06, c06 FMUL a1, c05, c05 FNMSUB (aa2, cc06, cc04, cc04) FNMSUB (aa2, cc05, cc03, cc03) FNMSUB (aa3, cc06, cc02, cc02) FNMSUB (aa3, cc05, cc01, cc01) LDF [BO + 9 * SIZE], a1 LDF [BO + 8 * SIZE], a2 FMUL a1, c04, c04 FMUL a1, c03, c03 FNMSUB (aa2, cc04, cc02, cc02) FNMSUB (aa2, cc03, cc01, cc01) LDF [BO + 0 * SIZE], a1 FMUL a1, c02, c02 FMUL a1, c01, c01 #endif #ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 add C3, -2 * SIZE, C3 add C4, -2 * SIZE, C4 add C5, -2 * SIZE, C5 add C6, -2 * SIZE, C6 add C7, -2 * SIZE, C7 add C8, -2 * SIZE, C8 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c07, [BO + 3 * SIZE] STF c09, [BO + 4 * SIZE] STF c11, [BO + 5 * SIZE] STF c13, [BO + 6 * SIZE] STF c15, [BO + 7 * SIZE] STF c02, [BO + 8 * SIZE] STF c04, [BO + 9 * SIZE] STF c06, [BO + 10 * SIZE] STF c08, [BO + 11 * SIZE] STF c10, [BO + 12 * SIZE] STF c12, [BO + 13 * SIZE] STF c14, [BO + 14 * SIZE] STF c16, [BO + 15 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] STF c05, [AO + 4 * SIZE] STF c06, [AO + 5 * SIZE] STF c07, [AO + 6 * SIZE] STF c08, [AO + 7 * SIZE] STF c09, [AO + 8 * SIZE] STF c10, [AO + 9 * SIZE] STF c11, [AO + 10 * SIZE] STF c12, [AO + 11 * SIZE] STF c13, [AO + 12 * SIZE] STF c14, [AO + 13 * SIZE] STF c15, [AO + 14 * SIZE] STF c16, [AO + 15 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, 
[C2 + 0 * SIZE] STF c04, [C2 + 1 * SIZE] STF c05, [C3 + 0 * SIZE] STF c06, [C3 + 1 * SIZE] STF c07, [C4 + 0 * SIZE] STF c08, [C4 + 1 * SIZE] STF c09, [C5 + 0 * SIZE] STF c10, [C5 + 1 * SIZE] STF c11, [C6 + 0 * SIZE] STF c12, [C6 + 1 * SIZE] STF c13, [C7 + 0 * SIZE] STF c14, [C7 + 1 * SIZE] STF c15, [C8 + 0 * SIZE] STF c16, [C8 + 1 * SIZE] #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 add C3, 2 * SIZE, C3 add C4, 2 * SIZE, C4 add C5, 2 * SIZE, C5 add C6, 2 * SIZE, C6 add C7, 2 * SIZE, C7 add C8, 2 * SIZE, C8 #endif #ifdef RT sll K, BASE_SHIFT + 1, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 3, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL12 nop .align 4 .LL29: #ifdef LN sll K, BASE_SHIFT + 3, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 8, KK #endif #ifdef RT sub KK, 8, KK #endif add J, -1, J cmp J, 0 bg,pt %icc, .LL11 nop .align 4 .LL30: and N, 4, J cmp J, 0 ble,pn %icc, .LL50 nop #ifdef RT sll K, BASE_SHIFT + 2, TEMP1 sub B, TEMP1, B #endif #ifndef RT mov C, C1 add C, LDC, C2 add C2, LDC, C3 add C3, LDC, C4 add C4, LDC, C #else sub C, LDC, C4 sub C4, LDC, C3 sub C3, LDC, C2 sub C2, LDC, C1 sub C2, LDC, C #endif #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif and M, 1, I cmp I, 0 ble,pn %icc, .LL40 nop #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 0, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TEMP1 sll KK, BASE_SHIFT + 2, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 LDF [BO + 3 * SIZE], b4 LDF [BO + 4 * SIZE], b5 LDF [BO + 5 * SIZE], b6 FCLR (cc01) LDF [BO + 6 * SIZE], b7 FCLR (cc03) LDF [BO + 7 * SIZE], b8 FCLR (cc05) LDF [BO + 8 * SIZE], b9 FCLR (cc07) #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL45 nop .LL43: prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY add L, -1, L FMADD (aa1, bb1, cc01, cc01) LDF [BO + 16 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) LDF [BO + 10 * SIZE], b3 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 11 * SIZE], b4 LDF [AO + 4 * SIZE], a1 cmp L, 0 FMADD (aa2, bb5, cc01, cc01) LDF [BO + 12 * SIZE], b5 FMADD (aa2, bb6, cc03, cc03) LDF [BO + 13 * SIZE], b6 FMADD (aa2, bb7, cc05, cc05) LDF [BO + 14 * SIZE], b7 FMADD (aa2, bb8, cc07, cc07) LDF [BO + 15 * SIZE], b8 LDF [AO + 5 * SIZE], a2 add AO, 4 * SIZE, AO FMADD (aa3, bb9, cc01, cc01) LDF [BO + 24 * SIZE], b9 FMADD (aa3, bb2, cc03, cc03) LDF [BO + 17 * SIZE], b2 FMADD (aa3, bb3, cc05, cc05) LDF [BO + 18 * SIZE], b3 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 19 * SIZE], b4 LDF [AO + 2 * SIZE], a3 add BO, 16 * SIZE, BO FMADD (aa4, bb5, cc01, cc01) LDF [BO + 4 * SIZE], b5 FMADD (aa4, bb6, cc03, cc03) LDF [BO + 5 * SIZE], b6 FMADD (aa4, bb7, cc05, cc05) LDF [BO + 6 * SIZE], b7 FMADD (aa4, bb8, cc07, cc07) LDF [BO + 7 * SIZE], b8 bg,pt %icc, .LL43 LDF [AO + 3 * SIZE], a4 .align 4 .LL45: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL48 nop .align 4 .LL47: FMADD (aa1, bb1, cc01, cc01) LDF [BO 
+ 4 * SIZE], b1 add L, -1, L FMADD (aa1, bb2, cc03, cc03) LDF [BO + 5 * SIZE], b2 add AO, 1 * SIZE, AO FMADD (aa1, bb3, cc05, cc05) LDF [BO + 6 * SIZE], b3 cmp L, 0 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 7 * SIZE], b4 add BO, 4 * SIZE, BO bg,pt %icc, .LL47 LDF [AO + 0 * SIZE], a1 .align 4 .LL48: #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 4, TEMP1 #endif sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 2, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 #endif #if defined(LN) || defined(LT) LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c03, c03 FMUL a1, c05, c05 FMUL a1, c07, c07 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FMUL a1, c01, c01 FNMSUB (aa2, cc01, cc03, cc03) FNMSUB (aa3, cc01, cc05, cc05) FNMSUB (aa4, cc01, cc07, cc07) LDF [BO + 5 * SIZE], a1 LDF [BO + 6 * SIZE], a2 LDF [BO + 7 * SIZE], a3 FMUL a1, c03, c03 FNMSUB (aa2, cc03, cc05, cc05) FNMSUB (aa3, cc03, cc07, cc07) LDF [BO + 10 * SIZE], a1 LDF [BO + 11 * SIZE], a2 FMUL a1, c05, c05 FNMSUB (aa2, cc05, cc07, cc07) LDF [BO + 15 * SIZE], a1 FMUL a1, c07, c07 #endif #ifdef RT LDF [BO + 15 * SIZE], a1 LDF [BO + 14 * SIZE], a2 LDF [BO + 13 * SIZE], a3 LDF [BO + 12 * SIZE], a4 FMUL a1, c07, c07 FNMSUB (aa2, cc07, cc05, cc05) FNMSUB (aa3, cc07, cc03, cc03) FNMSUB (aa4, cc07, cc01, cc01) LDF [BO + 10 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 8 * SIZE], a3 FMUL a1, c05, c05 FNMSUB (aa2, cc05, cc03, cc03) FNMSUB (aa3, cc05, cc01, cc01) LDF [BO + 5 * SIZE], a1 LDF [BO + 4 * SIZE], a2 FMUL a1, c03, c03 FNMSUB (aa2, cc03, cc01, cc01) LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LN add C1, -1 * SIZE, C1 add C2, -1 * SIZE, C2 add C3, -1 * SIZE, C3 add C4, -1 * SIZE, C4 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c07, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c03, [AO + 1 * SIZE] STF c05, [AO + 2 * SIZE] STF c07, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c03, [C2 + 0 * SIZE] STF c05, [C3 + 0 * SIZE] STF c07, [C4 + 0 * SIZE] #ifdef RT sll K, BASE_SHIFT + 0, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 2, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .align 4 .LL40: sra M, 1, I cmp I, 0 ble,pn %icc, .LL49 nop .align 4 .LL32: #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 1, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TEMP1 sll KK, BASE_SHIFT + 2, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 LDF [BO + 3 * SIZE], b4 LDF [BO + 4 * SIZE], b5 LDF [BO + 5 * SIZE], b6 FCLR (cc01) LDF [BO + 6 * SIZE], b7 FCLR (cc02) LDF [BO + 7 * SIZE], b8 FCLR (cc03) LDF [BO + 8 * SIZE], b9 FCLR (cc04) prefetch [C1 + 2 * SIZE], 3 FCLR (cc05) prefetch [C2 + 2 * SIZE], 3 FCLR (cc06) prefetch [C3 + 2 * SIZE], 3 FCLR (cc07) prefetch [C4 + 2 * SIZE], 3 FCLR 
(cc08) #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL35 nop .align 4 .LL33: FMADD (aa1, bb1, cc01, cc01) LDF [AO + 2 * SIZE], a3 FMADD (aa2, bb1, cc02, cc02) LDF [AO + 3 * SIZE], a4 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 16 * SIZE], b1 FMADD (aa2, bb2, cc04, cc04) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb3, cc06, cc06) add L, -1, L FMADD (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD (aa3, bb5, cc01, cc01) LDF [AO + 4 * SIZE], a1 FMADD (aa4, bb5, cc02, cc02) LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb6, cc03, cc03) LDF [BO + 12 * SIZE], b5 FMADD (aa4, bb6, cc04, cc04) LDF [BO + 13 * SIZE], b6 FMADD (aa3, bb7, cc05, cc05) cmp L, 0 FMADD (aa4, bb7, cc06, cc06) add AO, 8 * SIZE, AO FMADD (aa3, bb8, cc07, cc07) LDF [BO + 14 * SIZE], b7 FMADD (aa4, bb8, cc08, cc08) LDF [BO + 15 * SIZE], b8 FMADD (aa1, bb9, cc01, cc01) LDF [AO - 2 * SIZE], a3 FMADD (aa2, bb9, cc02, cc02) LDF [AO - 1 * SIZE], a4 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 24 * SIZE], b9 FMADD (aa2, bb2, cc04, cc04) LDF [BO + 17 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) add BO, 16 * SIZE, BO FMADD (aa2, bb3, cc06, cc06) nop FMADD (aa1, bb4, cc07, cc07) LDF [BO + 2 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD (aa3, bb5, cc01, cc01) LDF [AO + 0 * SIZE], a1 FMADD (aa4, bb5, cc02, cc02) LDF [AO + 1 * SIZE], a2 FMADD (aa3, bb6, cc03, cc03) LDF [BO + 4 * SIZE], b5 FMADD (aa4, bb6, cc04, cc04) LDF [BO + 5 * SIZE], b6 FMADD (aa3, bb7, cc05, cc05) nop FMADD (aa4, bb7, cc06, cc06) LDF [BO + 6 * SIZE], b7 FMADD (aa3, bb8, cc07, cc07) FMADD (aa4, bb8, cc08, cc08) bg,pt %icc, .LL33 LDF [BO + 7 * SIZE], b8 .align 4 .LL35: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL38 nop .align 4 .LL37: FMADD (aa1, bb1, cc01, cc01) add L, -1, L FMADD (aa2, bb1, cc02, cc02) LDF [BO + 4 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) add AO, 2 * SIZE, AO FMADD (aa2, bb2, cc04, cc04) LDF [BO + 5 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) cmp L, 0 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 6 * SIZE], b3 FMADD (aa1, bb4, cc07, cc07) LDF [AO + 0 * SIZE], a1 FMADD (aa2, bb4, cc08, cc08) LDF [AO + 1 * SIZE], a2 LDF [BO + 7 * SIZE], b4 bg,pt %icc, .LL37 add BO, 4 * SIZE, BO .align 4 .LL38: #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 4, TEMP1 #endif sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 2, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 FSUB b1, c02, c02 FSUB b2, c04, c04 FSUB b3, c06, c06 FSUB b4, c08, c08 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c05, c05 FSUB b2, c06, c06 FSUB b3, c07, c07 FSUB b4, c08, c08 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c04, c04 FMUL a1, c06, c06 FMUL a1, c08, c08 FNMSUB (aa2, cc02, cc01, cc01) FNMSUB (aa2, cc04, cc03, 
cc03) FNMSUB (aa2, cc06, cc05, cc05) FNMSUB (aa2, cc08, cc07, cc07) FMUL a3, c01, c01 FMUL a3, c03, c03 FMUL a3, c05, c05 FMUL a3, c07, c07 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c03, c03 FMUL a1, c05, c05 FMUL a1, c07, c07 FNMSUB (aa2, cc01, cc02, cc02) FNMSUB (aa2, cc03, cc04, cc04) FNMSUB (aa2, cc05, cc06, cc06) FNMSUB (aa2, cc07, cc08, cc08) FMUL a3, c02, c02 FMUL a3, c04, c04 FMUL a3, c06, c06 FMUL a3, c08, c08 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a1, c02, c02 FNMSUB (aa2, cc01, cc03, cc03) FNMSUB (aa2, cc02, cc04, cc04) FNMSUB (aa3, cc01, cc05, cc05) FNMSUB (aa3, cc02, cc06, cc06) FNMSUB (aa4, cc01, cc07, cc07) FNMSUB (aa4, cc02, cc08, cc08) LDF [BO + 5 * SIZE], a1 LDF [BO + 6 * SIZE], a2 LDF [BO + 7 * SIZE], a3 FMUL a1, c03, c03 FMUL a1, c04, c04 FNMSUB (aa2, cc03, cc05, cc05) FNMSUB (aa2, cc04, cc06, cc06) FNMSUB (aa3, cc03, cc07, cc07) FNMSUB (aa3, cc04, cc08, cc08) LDF [BO + 10 * SIZE], a1 LDF [BO + 11 * SIZE], a2 FMUL a1, c05, c05 FMUL a1, c06, c06 FNMSUB (aa2, cc05, cc07, cc07) FNMSUB (aa2, cc06, cc08, cc08) LDF [BO + 15 * SIZE], a1 FMUL a1, c07, c07 FMUL a1, c08, c08 #endif #ifdef RT LDF [BO + 15 * SIZE], a1 LDF [BO + 14 * SIZE], a2 LDF [BO + 13 * SIZE], a3 LDF [BO + 12 * SIZE], a4 FMUL a1, c08, c08 FMUL a1, c07, c07 FNMSUB (aa2, cc08, cc06, cc06) FNMSUB (aa2, cc07, cc05, cc05) FNMSUB (aa3, cc08, cc04, cc04) FNMSUB (aa3, cc07, cc03, cc03) FNMSUB (aa4, cc08, cc02, cc02) FNMSUB (aa4, cc07, cc01, cc01) LDF [BO + 10 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 8 * SIZE], a3 FMUL a1, c06, c06 FMUL a1, c05, c05 FNMSUB (aa2, cc06, cc04, cc04) FNMSUB (aa2, cc05, cc03, cc03) FNMSUB (aa3, cc06, cc02, cc02) FNMSUB (aa3, cc05, cc01, cc01) LDF [BO + 5 * SIZE], a1 LDF [BO + 4 * SIZE], a2 FMUL a1, c04, c04 FMUL a1, c03, c03 FNMSUB (aa2, cc04, cc02, cc02) FNMSUB (aa2, cc03, cc01, cc01) LDF [BO + 0 * SIZE], a1 FMUL a1, c02, c02 FMUL a1, c01, c01 #endif #ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 add C3, -2 * SIZE, C3 add C4, -2 * SIZE, C4 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c07, [BO + 3 * SIZE] STF c02, [BO + 4 * SIZE] STF c04, [BO + 5 * SIZE] STF c06, [BO + 6 * SIZE] STF c08, [BO + 7 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] STF c05, [AO + 4 * SIZE] STF c06, [AO + 5 * SIZE] STF c07, [AO + 6 * SIZE] STF c08, [AO + 7 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C2 + 0 * SIZE] STF c04, [C2 + 1 * SIZE] STF c05, [C3 + 0 * SIZE] STF c06, [C3 + 1 * SIZE] STF c07, [C4 + 0 * SIZE] STF c08, [C4 + 1 * SIZE] #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 add C3, 2 * SIZE, C3 add C4, 2 * SIZE, C4 #endif #ifdef RT sll K, BASE_SHIFT + 1, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 2, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL32 nop .LL49: #ifdef LN sll K, BASE_SHIFT + 2, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 4, KK #endif #ifdef RT sub KK, 4, KK #endif .align 4 .LL50: and N, 2, J cmp J, 0 ble,pn %icc, .LL70 nop #ifdef RT sll K, BASE_SHIFT + 1, TEMP1 sub B, TEMP1, B #endif #ifndef RT 
mov C, C1 add C, LDC, C2 add C2, LDC, C #else sub C, LDC, C2 sub C2, LDC, C1 sub C2, LDC, C #endif #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif and M, 1, I cmp I, 0 ble,pn %icc, .LL60 nop #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 0, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TEMP1 sll KK, BASE_SHIFT + 1, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 LDF [BO + 3 * SIZE], b4 LDF [BO + 4 * SIZE], b5 LDF [BO + 5 * SIZE], b6 LDF [BO + 6 * SIZE], b7 FCLR (cc01) LDF [BO + 7 * SIZE], b8 FCLR (cc03) #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL65 nop .align 4 .LL63: prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY add L, -1, L FMADD (aa1, bb1, cc01, cc01) LDF [BO + 8 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 9 * SIZE], b2 LDF [AO + 4 * SIZE], a1 cmp L, 0 FMADD (aa2, bb3, cc01, cc01) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc03, cc03) LDF [BO + 11 * SIZE], b4 LDF [AO + 5 * SIZE], a2 add AO, 4 * SIZE, AO FMADD (aa3, bb5, cc01, cc01) LDF [BO + 12 * SIZE], b5 FMADD (aa3, bb6, cc03, cc03) LDF [BO + 13 * SIZE], b6 LDF [AO + 2 * SIZE], a3 add BO, 8 * SIZE, BO FMADD (aa4, bb7, cc01, cc01) LDF [BO + 6 * SIZE], b7 FMADD (aa4, bb8, cc03, cc03) LDF [BO + 7 * SIZE], b8 bg,pt %icc, .LL63 LDF [AO + 3 * SIZE], a4 .align 4 .LL65: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL68 nop .align 4 .LL67: FMADD (aa1, bb1, cc01, cc01) LDF [BO + 2 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 3 * SIZE], b2 LDF [AO + 1 * SIZE], a1 add L, -1, L add AO, 1 * SIZE, AO cmp L, 0 bg,pt %icc, .LL67 add BO, 2 * SIZE, BO .align 4 .LL68: #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 2, TEMP1 #endif sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 1, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c03, c03 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c03, c03 #endif #if defined(LN) || defined(LT) LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c03, c03 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FMUL a1, c01, c01 FNMSUB (aa2, cc01, cc03, cc03) LDF [BO + 3 * SIZE], a1 FMUL a1, c03, c03 #endif #ifdef RT LDF [BO + 3 * SIZE], a1 LDF [BO + 2 * SIZE], a2 FMUL a1, c03, c03 FNMSUB (aa2, cc03, cc01, cc01) LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LN add C1, -1 * SIZE, C1 add C2, -1 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c03, [AO + 1 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c03, [C2 + 0 * SIZE] #ifdef RT sll K, BASE_SHIFT + 0, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 1, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .align 4 .LL60: sra M, 1, I cmp I, 0 ble,pn %icc, .LL69 nop .align 4 .LL52: #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 1, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 
BASE_SHIFT + 1, TEMP1 sll KK, BASE_SHIFT + 1, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 FCLR (cc01) LDF [BO + 3 * SIZE], b4 FCLR (cc02) LDF [BO + 4 * SIZE], b5 FCLR (cc03) LDF [BO + 5 * SIZE], b6 FCLR (cc04) LDF [BO + 6 * SIZE], b7 FCLR (cc05) LDF [BO + 7 * SIZE], b8 FCLR (cc06) prefetch [C1 + 2 * SIZE], 3 FCLR (cc07) prefetch [C2 + 2 * SIZE], 3 FCLR (cc08) #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL55 nop .align 4 .LL53: FMADD (aa1, bb1, cc01, cc01) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb1, cc02, cc02) LDF [BO + 8 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [AO + 4 * SIZE], a1 FMADD (aa2, bb2, cc04, cc04) LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb3, cc01, cc01) LDF [BO + 9 * SIZE], b2 FMADD (aa4, bb3, cc02, cc02) LDF [BO + 10 * SIZE], b3 FMADD (aa3, bb4, cc03, cc03) LDF [AO + 6 * SIZE], a3 FMADD (aa4, bb4, cc04, cc04) LDF [AO + 7 * SIZE], a4 FMADD (aa1, bb5, cc01, cc01) LDF [BO + 11 * SIZE], b4 FMADD (aa2, bb5, cc02, cc02) LDF [BO + 12 * SIZE], b5 FMADD (aa1, bb6, cc03, cc03) LDF [AO + 8 * SIZE], a1 FMADD (aa2, bb6, cc04, cc04) LDF [AO + 9 * SIZE], a2 FMADD (aa3, bb7, cc01, cc01) LDF [BO + 13 * SIZE], b6 FMADD (aa4, bb7, cc02, cc02) LDF [BO + 14 * SIZE], b7 FMADD (aa3, bb8, cc03, cc03) LDF [AO + 10 * SIZE], a3 FMADD (aa4, bb8, cc04, cc04) LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO add L, -1, L add BO, 8 * SIZE, BO cmp L, 0 bg,pt %icc, .LL53 LDF [BO + 7 * SIZE], b8 .align 4 .LL55: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL58 nop .align 4 .LL57: FMADD (aa1, bb1, cc01, cc01) add L, -1, L FMADD (aa2, bb1, cc02, cc02) LDF [BO + 2 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [AO + 2 * SIZE], a1 FMADD (aa2, bb2, cc04, cc04) LDF [AO + 3 * SIZE], a2 add AO, 2 * SIZE, AO cmp L, 0 add BO, 2 * SIZE, BO bg,pt %icc, .LL57 LDF [BO + 1 * SIZE], b2 .align 4 .LL58: #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 2, TEMP1 #endif sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 1, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c02, c02 FSUB a4, c04, c04 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c04, c04 FNMSUB (aa2, cc02, cc01, cc01) FNMSUB (aa2, cc04, cc03, cc03) FMUL a3, c01, c01 FMUL a3, c03, c03 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c03, c03 FNMSUB (aa2, cc01, cc02, cc02) FNMSUB (aa2, cc03, cc04, cc04) FMUL a3, c02, c02 FMUL a3, c04, c04 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FMUL a1, c01, c01 FMUL a1, c02, c02 FNMSUB (aa2, cc01, cc03, cc03) FNMSUB (aa2, cc02, cc04, cc04) LDF [BO + 3 * SIZE], a1 FMUL a1, c03, c03 FMUL a1, c04, c04 #endif #ifdef RT LDF [BO + 3 * SIZE], a1 LDF [BO + 2 * SIZE], a2 FMUL a1, c04, c04 FMUL a1, c03, c03 FNMSUB (aa2, cc04, cc02, cc02) FNMSUB (aa2, cc03, cc01, cc01) LDF [BO + 0 * SIZE], a1 
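/* RT: last step of the backward 2x2 solve; a1 now holds the remaining diagonal entry of the packed B block (apparently stored pre-inverted, hence FMUL rather than a divide) and finishes c01/c02 */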
FMUL a1, c02, c02 FMUL a1, c01, c01 #endif #ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] STF c02, [BO + 2 * SIZE] STF c04, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C2 + 0 * SIZE] STF c04, [C2 + 1 * SIZE] #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 #endif #ifdef RT sll K, BASE_SHIFT + 1, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 1, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL52 nop .align 4 .LL69: #ifdef LN sll K, BASE_SHIFT + 1, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 2, KK #endif #ifdef RT sub KK, 2, KK #endif .align 4 .LL70: and N, 1, J cmp J, 0 ble,pn %icc, .LL999 nop #ifdef RT sll K, BASE_SHIFT, TEMP1 sub B, TEMP1, B #endif #ifndef RT mov C, C1 add C1, LDC, C #else sub C, LDC, C1 sub C, LDC, C #endif #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif and M, 1, I cmp I, 0 ble,pn %icc, .LL80 nop #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 0, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TEMP1 sll KK, BASE_SHIFT + 0, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [BO + 0 * SIZE], b1 LDF [AO + 1 * SIZE], a2 LDF [BO + 1 * SIZE], b2 LDF [AO + 2 * SIZE], a3 LDF [BO + 2 * SIZE], b3 LDF [AO + 3 * SIZE], a4 LDF [BO + 3 * SIZE], b4 #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL85 FCLR (cc01) .align 4 .LL83: prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY add L, -1, L FMADD (aa1, bb1, cc01, cc01) LDF [AO + 4 * SIZE], a1 LDF [BO + 4 * SIZE], b1 FMADD (aa2, bb2, cc01, cc01) LDF [AO + 5 * SIZE], a2 LDF [BO + 5 * SIZE], b2 FMADD (aa3, bb3, cc01, cc01) LDF [AO + 6 * SIZE], a3 LDF [BO + 6 * SIZE], b3 FMADD (aa4, bb4, cc01, cc01) LDF [AO + 7 * SIZE], a4 LDF [BO + 7 * SIZE], b4 add AO, 4 * SIZE, AO cmp L, 0 bg,pt %icc, .LL83 add BO, 4 * SIZE, BO .align 4 .LL85: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL88 nop .align 4 .LL87: FMADD (aa1, bb1, cc01, cc01) LDF [AO + 1 * SIZE], a1 LDF [BO + 1 * SIZE], b1 add AO, 1 * SIZE, AO add L, -1, L cmp L, 0 bg,pt %icc, .LL87 add BO, 1 * SIZE, BO .align 4 .LL88: #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 1, TEMP1 #endif sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 0, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 FSUB a1, c01, c01 #else LDF [AO + 0 * SIZE], a1 FSUB a1, c01, c01 #endif #if defined(LN) || defined(LT) LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #if defined(RN) || defined(RT) LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LN add C1, -1 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] #else STF c01, [AO + 0 * SIZE] #endif STF c01, [C1 + 0 * SIZE] #ifdef RT sll K, BASE_SHIFT + 0, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 0, 
TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .align 4 .LL80: sra M, 1, I cmp I, 0 ble,pn %icc, .LL89 nop .align 4 .LL72: #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 1, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TEMP1 sll KK, BASE_SHIFT + 0, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 FCLR (cc01) LDF [BO + 3 * SIZE], b4 FCLR (cc02) prefetch [C1 + 2 * SIZE], 3 #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL75 nop .LL73: prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY add L, -1, L FMADD (aa1, bb1, cc01, cc01) LDF [AO + 4 * SIZE], a1 FMADD (aa2, bb1, cc02, cc02) LDF [AO + 5 * SIZE], a2 LDF [BO + 4 * SIZE], b1 cmp L, 0 FMADD (aa3, bb2, cc01, cc01) LDF [AO + 6 * SIZE], a3 FMADD (aa4, bb2, cc02, cc02) LDF [AO + 7 * SIZE], a4 LDF [BO + 5 * SIZE], b2 add BO, 4 * SIZE, BO FMADD (aa1, bb3, cc01, cc01) LDF [AO + 8 * SIZE], a1 FMADD (aa2, bb3, cc02, cc02) LDF [AO + 9 * SIZE], a2 LDF [BO + 2 * SIZE], b3 add AO, 8 * SIZE, AO FMADD (aa3, bb4, cc01, cc01) LDF [AO + 2 * SIZE], a3 FMADD (aa4, bb4, cc02, cc02) LDF [AO + 3 * SIZE], a4 bg,pt %icc, .LL73 LDF [BO + 3 * SIZE], b4 .align 4 .LL75: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL78 nop .align 4 .LL77: FMADD (aa1, bb1, cc01, cc01) LDF [AO + 2 * SIZE], a1 FMADD (aa2, bb1, cc02, cc02) LDF [AO + 3 * SIZE], a2 LDF [BO + 1 * SIZE], b1 add L, -1, L add AO, 2 * SIZE, AO cmp L, 0 bg,pt %icc, .LL77 add BO, 1 * SIZE, BO .align 4 .LL78: #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 1, TEMP1 #endif sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 0, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c02, c02 FNMSUB (aa2, cc02, cc01, cc01) FMUL a3, c01, c01 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FNMSUB (aa2, cc01, cc02, cc02) FMUL a3, c02, c02 #endif #if defined(RN) || defined(RT) LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 #endif #ifdef LN add C1, -2 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] #ifndef LN add C1, 2 * SIZE, C1 #endif #ifdef RT sll K, BASE_SHIFT + 1, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 0, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL72 nop .align 4 .LL89: #ifdef LN sll K, BASE_SHIFT, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 1, KK #endif #ifdef RT sub KK, 1, KK #endif .align 4 .LL999: #ifdef TRMMKERNEL #ifndef __64BIT__ ld [%sp + STACK_START + 8], %g1 ld [%sp + STACK_START + 12], %g2 ld [%sp + 
STACK_START + 16], %g3 ld [%sp + STACK_START + 20], %g4 #else ldx [%sp + STACK_START + 32], %g1 ldx [%sp + STACK_START + 40], %g2 ldx [%sp + STACK_START + 48], %g3 ldx [%sp + STACK_START + 56], %g4 #endif #endif return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/trsm_kernel_LT.S000066400000000000000000001770211313527062700206140ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %i0 #define N %i1 #define K %i2 #if defined(DOUBLE) && !defined(__64BIT__) #define A %i5 #define B %i4 #else #define A %i4 #define B %i5 #endif #define C %o4 #define LDC %o5 #define AO %l0 #define BO %l1 #define I %l2 #define J %l3 #define L %l4 #define C1 %o0 #define C2 %o1 #define C3 %o2 #define C4 %o3 #define OFFSET %l5 #define KK %l6 #define TEMP1 %l7 #define TEMP2 %i3 #define AORIG %g1 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #define t1 %f32 #define t2 %f34 #define t3 %f36 #define t4 %f38 #define a1 %f40 #define a2 %f42 #define a3 %f44 #define a4 %f46 #define a5 %f58 #define b1 %f48 #define b2 %f50 #define b3 %f52 #define b4 %f54 #define b5 %f56 #define FZERO %f60 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #define t1 %f16 #define t2 %f17 #define t3 %f18 #define t4 %f19 #define a1 %f20 #define a2 %f21 #define a3 %f22 #define a4 %f23 #define a5 %f31 #define b1 %f24 #define b2 %f25 #define b3 %f26 #define b4 %f27 #define b5 %f28 #define FZERO %f29 #endif PROLOGUE SAVESP nop #ifndef __64BIT__ #ifdef DOUBLE ld [%sp + STACK_START + 28], B ld [%sp + STACK_START + 32], C ld [%sp + STACK_START + 36], LDC ld [%sp + STACK_START + 40], OFFSET #else ld [%sp + STACK_START + 28], C ld [%sp + STACK_START + 32], LDC ld [%sp + STACK_START + 36], OFFSET #endif #else ldx [%sp+ STACK_START + 56], C ldx [%sp+ STACK_START + 64], LDC ldx [%sp+ STACK_START + 72], OFFSET #endif FCLR(29) sll LDC, BASE_SHIFT, LDC #ifdef LN smul M, K, TEMP1 sll TEMP1, BASE_SHIFT, TEMP1 add A, TEMP1, A sll M, BASE_SHIFT, TEMP1 add C, TEMP1, C #endif #ifdef RN neg OFFSET, KK #endif #ifdef RT smul N, K, TEMP1 sll TEMP1, BASE_SHIFT, TEMP1 add B, TEMP1, B smul N, LDC, TEMP1 add C, TEMP1, C sub N, OFFSET, KK #endif sra N, 2, J cmp J, 0 ble,pn %icc, .LL100 nop .LL11: #ifdef RT sll K, 2 + BASE_SHIFT, TEMP1 sub B, TEMP1, B sll LDC, 2, TEMP1 sub C, TEMP1, C #endif add C, LDC, C2 FMOV FZERO, t1 nop mov C, C1 add C2, LDC, C3 FMOV FZERO, t2 sra M, 2, I add C3, LDC, C4 FMOV FZERO, t3 #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif cmp I, 0 #ifndef RT add C4, LDC, C #endif FMOV FZERO, t4 ble,pn %icc, .LL50 FMOV FZERO, c01 .LL21: FMOV FZERO, c02 FMOV FZERO, c03 #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 2 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c04 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c05 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c06 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c07 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c08 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c09 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c10 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c11 LDF [BO + 4 * SIZE], b5 /* ***** */ LDF [AO + 4 * SIZE], a5 /* ***** */ prefetch [C1 + 3 * SIZE], 3 FMOV FZERO, c12 prefetch [C2 + 3 * SIZE], 3 FMOV FZERO, c13 
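/* zero the rest of the 4x4 accumulator tile (c14..c16) and prefetch through C3/C4 before entering the unrolled loop */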
prefetch [C3 + 3 * SIZE], 3 FMOV FZERO, c14 prefetch [C4 + 3 * SIZE], 3 FMOV FZERO, c15 ble,pn %icc, .LL25 FMOV FZERO, c16 #define APREFETCHSIZE 40 #define BPREFETCHSIZE 40 #define APREFETCH_CATEGORY 0 #define BPREFETCH_CATEGORY 0 .LL22: FADD c04, t1, c04 prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY FMUL a1, b1, t1 nop FADD c08, t2, c08 prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY FMUL a1, b2, t2 add AO, 16 * SIZE, AO FADD c12, t3, c12 LDF [AO - 13 * SIZE], a4 FMUL a1, b3, t3 add BO, 16 * SIZE, BO FADD c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO - 8 * SIZE], a1 FADD c01, t1, c01 nop FMUL a2, b1, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 add L, -1, L FMUL a2, b4, t4 LDF [AO - 11 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b1, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 10 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO - 8 * SIZE], b1 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 11 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 10 * SIZE], b3 FADD c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 9 * SIZE], b4 FADD c04, t1, c04 nop FMUL a5, b5, t1 LDF [AO - 9 * SIZE], a4 FADD c08, t2, c08 nop FMUL a5, b2, t2 nop FADD c12, t3, c12 nop FMUL a5, b3, t3 nop FADD c16, t4, c16 nop FMUL a5, b4, t4 LDF [AO - 4 * SIZE], a5 FADD c01, t1, c01 nop FMUL a2, b5, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO - 7 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b5, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 6 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b5, t1 LDF [BO - 4 * SIZE], b5 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 7 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 6 * SIZE], b3 FADD c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 5 * SIZE], b4 FADD c04, t1, c04 nop FMUL a1, b1, t1 LDF [AO - 5 * SIZE], a4 FADD c08, t2, c08 nop FMUL a1, b2, t2 nop FADD c12, t3, c12 nop FMUL a1, b3, t3 nop FADD c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO - 0 * SIZE], a1 FADD c01, t1, c01 nop FMUL a2, b1, t1 nop #ifdef DOUBLE prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY #else nop #endif FADD c05, t2, c05 nop FMUL a2, b2, t2 FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 nop FADD c02, t1, c02 nop FMUL a3, b1, t1 LDF [AO - 3 * SIZE], a2 FADD c06, t2, c06 #ifdef DOUBLE prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY #else nop #endif FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 2 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO - 0 * SIZE], b1 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 3 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 1 * SIZE], b4 FADD c04, t1, c04 nop FMUL a5, b5, t1 LDF [AO - 1 * SIZE], a4 FADD c08, t2, c08 FMUL a5, b2, t2 FADD c12, t3, c12 FMUL a5, b3, t3 FADD c16, t4, c16 nop FMUL a5, b4, t4 LDF [AO + 4 * SIZE], a5 FADD c01, t1, c01 nop FMUL a2, b5, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b5, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 
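/* still inside the software-pipelined .LL22 body: each FADD retires a product from the previous slot while the paired FMUL and LDF start the next one */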
nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD c03, t1, c03 cmp L, 0 FMUL a4, b5, t1 LDF [BO + 4 * SIZE], b5 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c15, t4, c15 FMUL a4, b4, t4 bg,pt %icc, .LL22 LDF [BO + 3 * SIZE], b4 .LL25: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL29 nop .LL26: FADD c04, t1, c04 LDF [AO + 3 * SIZE], a4 FMUL a1, b1, t1 add AO, 4 * SIZE, AO FADD c08, t2, c08 add BO, 4 * SIZE, BO FMUL a1, b2, t2 add L, -1, L FADD c12, t3, c12 nop FMUL a1, b3, t3 cmp L, 0 FADD c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD c01, t1, c01 nop FMUL a2, b1, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b1, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO + 0 * SIZE], b1 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c15, t4, c15 FMUL a4, b4, t4 bg,pt %icc, .LL26 LDF [BO + 3 * SIZE], b4 .LL29: #if defined(LN) || defined(RT) sub KK, 4, TEMP1 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO #endif FADD c04, t1, c04 FADD c08, t2, c08 FADD c12, t3, c12 FADD c16, t4, c16 #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c05, c05 FSUB a3, c09, c09 FSUB a4, c13, c13 FSUB b1, c02, c02 FSUB b2, c06, c06 FSUB b3, c10, c10 FSUB b4, c14, c14 LDF [BO + 8 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 10 * SIZE], a3 LDF [BO + 11 * SIZE], a4 LDF [BO + 12 * SIZE], b1 LDF [BO + 13 * SIZE], b2 LDF [BO + 14 * SIZE], b3 LDF [BO + 15 * SIZE], b4 FSUB a1, c03, c03 FSUB a2, c07, c07 FSUB a3, c11, c11 FSUB a4, c15, c15 FSUB b1, c04, c04 FSUB b2, c08, c08 FSUB b3, c12, c12 FSUB b4, c16, c16 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c05, c05 FSUB b2, c06, c06 FSUB b3, c07, c07 FSUB b4, c08, c08 LDF [AO + 8 * SIZE], a1 LDF [AO + 9 * SIZE], a2 LDF [AO + 10 * SIZE], a3 LDF [AO + 11 * SIZE], a4 LDF [AO + 12 * SIZE], b1 LDF [AO + 13 * SIZE], b2 LDF [AO + 14 * SIZE], b3 LDF [AO + 15 * SIZE], b4 FSUB a1, c09, c09 FSUB a2, c10, c10 FSUB a3, c11, c11 FSUB a4, c12, c12 FSUB b1, c13, c13 FSUB b2, c14, c14 FSUB b3, c15, c15 FSUB b4, c16, c16 #endif #ifdef LN LDF [AO + 15 * SIZE], a1 LDF [AO + 14 * SIZE], a2 LDF [AO + 13 * SIZE], a3 LDF [AO + 12 * SIZE], a4 FMUL a1, c04, c04 FMUL a1, c08, c08 FMUL a1, c12, c12 FMUL a1, c16, c16 FMUL a2, c04, t1 FMUL a2, c08, t2 FMUL a2, c12, t3 FMUL a2, c16, t4 FSUB c03, t1, c03 FSUB c07, t2, c07 FSUB c11, t3, c11 FSUB c15, t4, c15 FMUL a3, c04, t1 FMUL a3, c08, t2 FMUL a3, c12, t3 FMUL a3, c16, t4 FSUB c02, t1, c02 FSUB c06, t2, c06 FSUB c10, t3, c10 FSUB c14, t4, c14 FMUL a4, c04, t1 FMUL a4, c08, t2 FMUL a4, c12, t3 FMUL a4, c16, t4 FSUB c01, t1, c01 FSUB c05, t2, c05 FSUB c09, t3, 
c09 FSUB c13, t4, c13 LDF [AO + 10 * SIZE], a1 LDF [AO + 9 * SIZE], a2 LDF [AO + 8 * SIZE], a3 FMUL a1, c03, c03 FMUL a1, c07, c07 FMUL a1, c11, c11 FMUL a1, c15, c15 FMUL a2, c03, t1 FMUL a2, c07, t2 FMUL a2, c11, t3 FMUL a2, c15, t4 FSUB c02, t1, c02 FSUB c06, t2, c06 FSUB c10, t3, c10 FSUB c14, t4, c14 FMUL a3, c03, t1 FMUL a3, c07, t2 FMUL a3, c11, t3 FMUL a3, c15, t4 FSUB c01, t1, c01 FSUB c05, t2, c05 FSUB c09, t3, c09 FSUB c13, t4, c13 LDF [AO + 5 * SIZE], a1 LDF [AO + 4 * SIZE], a2 FMUL a1, c02, c02 FMUL a1, c06, c06 FMUL a1, c10, c10 FMUL a1, c14, c14 FMUL a2, c02, t1 FMUL a2, c06, t2 FMUL a2, c10, t3 FMUL a2, c14, t4 FSUB c01, t1, c01 FSUB c05, t2, c05 FSUB c09, t3, c09 FSUB c13, t4, c13 LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c05, c05 FMUL a1, c09, c09 FMUL a1, c13, c13 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a1, c05, c05 FMUL a1, c09, c09 FMUL a1, c13, c13 FMUL a2, c01, t1 FMUL a2, c05, t2 FMUL a2, c09, t3 FMUL a2, c13, t4 FSUB c02, t1, c02 FSUB c06, t2, c06 FSUB c10, t3, c10 FSUB c14, t4, c14 FMUL a3, c01, t1 FMUL a3, c05, t2 FMUL a3, c09, t3 FMUL a3, c13, t4 FSUB c03, t1, c03 FSUB c07, t2, c07 FSUB c11, t3, c11 FSUB c15, t4, c15 FMUL a4, c01, t1 FMUL a4, c05, t2 FMUL a4, c09, t3 FMUL a4, c13, t4 FSUB c04, t1, c04 FSUB c08, t2, c08 FSUB c12, t3, c12 FSUB c16, t4, c16 LDF [AO + 5 * SIZE], a1 LDF [AO + 6 * SIZE], a2 LDF [AO + 7 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c06, c06 FMUL a1, c10, c10 FMUL a1, c14, c14 FMUL a2, c02, t1 FMUL a2, c06, t2 FMUL a2, c10, t3 FMUL a2, c14, t4 FSUB c03, t1, c03 FSUB c07, t2, c07 FSUB c11, t3, c11 FSUB c15, t4, c15 FMUL a3, c02, t1 FMUL a3, c06, t2 FMUL a3, c10, t3 FMUL a3, c14, t4 FSUB c04, t1, c04 FSUB c08, t2, c08 FSUB c12, t3, c12 FSUB c16, t4, c16 LDF [AO + 10 * SIZE], a1 LDF [AO + 11 * SIZE], a2 FMUL a1, c03, c03 FMUL a1, c07, c07 FMUL a1, c11, c11 FMUL a1, c15, c15 FMUL a2, c03, t1 FMUL a2, c07, t2 FMUL a2, c11, t3 FMUL a2, c15, t4 FSUB c04, t1, c04 FSUB c08, t2, c08 FSUB c12, t3, c12 FSUB c16, t4, c16 LDF [AO + 15 * SIZE], a1 FMUL a1, c04, c04 FMUL a1, c08, c08 FMUL a1, c12, c12 FMUL a1, c16, c16 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 FMUL a2, c01, t1 FMUL a2, c02, t2 FMUL a2, c03, t3 FMUL a2, c04, t4 FSUB c05, t1, c05 FSUB c06, t2, c06 FSUB c07, t3, c07 FSUB c08, t4, c08 FMUL a3, c01, t1 FMUL a3, c02, t2 FMUL a3, c03, t3 FMUL a3, c04, t4 FSUB c09, t1, c09 FSUB c10, t2, c10 FSUB c11, t3, c11 FSUB c12, t4, c12 FMUL a4, c01, t1 FMUL a4, c02, t2 FMUL a4, c03, t3 FMUL a4, c04, t4 FSUB c13, t1, c13 FSUB c14, t2, c14 FSUB c15, t3, c15 FSUB c16, t4, c16 LDF [BO + 5 * SIZE], a1 LDF [BO + 6 * SIZE], a2 LDF [BO + 7 * SIZE], a3 FMUL a1, c05, c05 FMUL a1, c06, c06 FMUL a1, c07, c07 FMUL a1, c08, c08 FMUL a2, c05, t1 FMUL a2, c06, t2 FMUL a2, c07, t3 FMUL a2, c08, t4 FSUB c09, t1, c09 FSUB c10, t2, c10 FSUB c11, t3, c11 FSUB c12, t4, c12 FMUL a3, c05, t1 FMUL a3, c06, t2 FMUL a3, c07, t3 FMUL a3, c08, t4 FSUB c13, t1, c13 FSUB c14, t2, c14 FSUB c15, t3, c15 FSUB c16, t4, c16 LDF [BO + 10 * SIZE], a1 LDF [BO + 11 * SIZE], a2 FMUL a1, c09, c09 FMUL a1, c10, c10 FMUL a1, c11, c11 FMUL a1, c12, c12 FMUL a2, c09, t1 FMUL a2, c10, t2 FMUL a2, c11, t3 FMUL a2, c12, t4 FSUB c13, t1, c13 FSUB c14, t2, c14 FSUB c15, t3, c15 FSUB c16, t4, c16 LDF [BO + 15 * SIZE], a1 FMUL a1, c13, c13 FMUL a1, c14, c14 FMUL a1, 
c15, c15 FMUL a1, c16, c16 #endif #ifdef RT LDF [BO + 15 * SIZE], a1 LDF [BO + 14 * SIZE], a2 LDF [BO + 13 * SIZE], a3 LDF [BO + 12 * SIZE], a4 FMUL a1, c13, c13 FMUL a1, c14, c14 FMUL a1, c15, c15 FMUL a1, c16, c16 FMUL a2, c13, t1 FMUL a2, c14, t2 FMUL a2, c15, t3 FMUL a2, c16, t4 FSUB c09, t1, c09 FSUB c10, t2, c10 FSUB c11, t3, c11 FSUB c12, t4, c12 FMUL a3, c13, t1 FMUL a3, c14, t2 FMUL a3, c15, t3 FMUL a3, c16, t4 FSUB c05, t1, c05 FSUB c06, t2, c06 FSUB c07, t3, c07 FSUB c08, t4, c08 FMUL a4, c13, t1 FMUL a4, c14, t2 FMUL a4, c15, t3 FMUL a4, c16, t4 FSUB c01, t1, c01 FSUB c02, t2, c02 FSUB c03, t3, c03 FSUB c04, t4, c04 LDF [BO + 10 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 8 * SIZE], a3 FMUL a1, c09, c09 FMUL a1, c10, c10 FMUL a1, c11, c11 FMUL a1, c12, c12 FMUL a2, c09, t1 FMUL a2, c10, t2 FMUL a2, c11, t3 FMUL a2, c12, t4 FSUB c05, t1, c05 FSUB c06, t2, c06 FSUB c07, t3, c07 FSUB c08, t4, c08 FMUL a3, c09, t1 FMUL a3, c10, t2 FMUL a3, c11, t3 FMUL a3, c12, t4 FSUB c01, t1, c01 FSUB c02, t2, c02 FSUB c03, t3, c03 FSUB c04, t4, c04 LDF [BO + 5 * SIZE], a1 LDF [BO + 4 * SIZE], a2 FMUL a1, c05, c05 FMUL a1, c06, c06 FMUL a1, c07, c07 FMUL a1, c08, c08 FMUL a2, c05, t1 FMUL a2, c06, t2 FMUL a2, c07, t3 FMUL a2, c08, t4 FSUB c01, t1, c01 FSUB c02, t2, c02 FSUB c03, t3, c03 FSUB c04, t4, c04 LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 #endif #ifdef LN add C1, -4 * SIZE, C1 add C2, -4 * SIZE, C2 add C3, -4 * SIZE, C3 add C4, -4 * SIZE, C4 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c05, [BO + 1 * SIZE] STF c09, [BO + 2 * SIZE] STF c13, [BO + 3 * SIZE] STF c02, [BO + 4 * SIZE] STF c06, [BO + 5 * SIZE] STF c10, [BO + 6 * SIZE] STF c14, [BO + 7 * SIZE] STF c03, [BO + 8 * SIZE] STF c07, [BO + 9 * SIZE] STF c11, [BO + 10 * SIZE] STF c15, [BO + 11 * SIZE] STF c04, [BO + 12 * SIZE] STF c08, [BO + 13 * SIZE] STF c12, [BO + 14 * SIZE] STF c16, [BO + 15 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] STF c05, [AO + 4 * SIZE] STF c06, [AO + 5 * SIZE] STF c07, [AO + 6 * SIZE] STF c08, [AO + 7 * SIZE] STF c09, [AO + 8 * SIZE] STF c10, [AO + 9 * SIZE] STF c11, [AO + 10 * SIZE] STF c12, [AO + 11 * SIZE] STF c13, [AO + 12 * SIZE] STF c14, [AO + 13 * SIZE] STF c15, [AO + 14 * SIZE] STF c16, [AO + 15 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C1 + 2 * SIZE] STF c04, [C1 + 3 * SIZE] STF c05, [C2 + 0 * SIZE] STF c06, [C2 + 1 * SIZE] STF c07, [C2 + 2 * SIZE] STF c08, [C2 + 3 * SIZE] STF c09, [C3 + 0 * SIZE] STF c10, [C3 + 1 * SIZE] STF c11, [C3 + 2 * SIZE] STF c12, [C3 + 3 * SIZE] STF c13, [C4 + 0 * SIZE] STF c14, [C4 + 1 * SIZE] STF c15, [C4 + 2 * SIZE] STF c16, [C4 + 3 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 4 * SIZE, C1 add C2, 4 * SIZE, C2 add C3, 4 * SIZE, C3 add C4, 4 * SIZE, C4 #endif #ifdef RT sll K, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AO, TEMP1, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 4, KK #endif #ifdef LN sub KK, 4, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL21 FMOV FZERO, c01 .LL50: and M, 2, I cmp I, 0 ble,pn %icc, .LL70 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 1 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 1 + BASE_SHIFT, TEMP1 sll KK, 2 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, 
KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif FMOV FZERO, c02 FMOV FZERO, t1 FMOV FZERO, c04 LDF [AO + 0 * SIZE], a1 FMOV FZERO, t2 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c06 LDF [AO + 1 * SIZE], a2 FMOV FZERO, t3 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c08 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t4 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c01 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c03 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c05 ble,pn %icc, .LL55 FMOV FZERO, c07 .LL52: FADD c02, t1, c02 add AO, 8 * SIZE, AO prefetch [AO + APREFETCHSIZE * SIZE], 0 FMUL a1, b1, t1 add BO, 16 * SIZE, BO FADD c04, t2, c04 add L, -1, L FMUL a1, b2, t2 FADD c06, t3, c06 cmp L, 0 FMUL a1, b3, t3 FADD c08, t4, c08 FMUL a1, b4, t4 LDF [AO - 4 * SIZE], a1 FADD c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 12 * SIZE], b1 FADD c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 11 * SIZE], b2 FADD c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 10 * SIZE], b3 FADD c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 9 * SIZE], b4 FADD c02, t1, c02 FMUL a3, b1, t1 LDF [AO - 3 * SIZE], a2 FADD c04, t2, c04 FMUL a3, b2, t2 FADD c06, t3, c06 FMUL a3, b3, t3 FADD c08, t4, c08 FMUL a3, b4, t4 LDF [AO - 2 * SIZE], a3 FADD c01, t1, c01 FMUL a4, b1, t1 LDF [BO - 8 * SIZE], b1 FADD c03, t2, c03 FMUL a4, b2, t2 LDF [BO - 7 * SIZE], b2 FADD c05, t3, c05 FMUL a4, b3, t3 LDF [BO - 6 * SIZE], b3 FADD c07, t4, c07 FMUL a4, b4, t4 LDF [BO - 5 * SIZE], b4 FADD c02, t1, c02 FMUL a1, b1, t1 LDF [AO - 1 * SIZE], a4 FADD c04, t2, c04 FMUL a1, b2, t2 FADD c06, t3, c06 FMUL a1, b3, t3 FADD c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 4 * SIZE], b1 FADD c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 3 * SIZE], b2 FADD c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 1 * SIZE], b4 FADD c02, t1, c02 FMUL a3, b1, t1 LDF [AO + 1 * SIZE], a2 FADD c04, t2, c04 FMUL a3, b2, t2 FADD c06, t3, c06 FMUL a3, b3, t3 FADD c08, t4, c08 FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD c01, t1, c01 FMUL a4, b1, t1 LDF [BO + 0 * SIZE], b1 FADD c03, t2, c03 FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c05, t3, c05 FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c07, t4, c07 FMUL a4, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL52 LDF [AO + 3 * SIZE], a4 .LL55: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL59 nop .LL56: FADD c02, t1, c02 add AO, 2 * SIZE, AO FMUL a1, b1, t1 add L, -1, L add BO, 4 * SIZE, BO FADD c04, t2, c04 cmp L, 0 FMUL a1, b2, t2 FADD c06, t3, c06 FMUL a1, b3, t3 FADD c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD c01, t1, c01 FMUL a2, b1, t1 LDF [BO + 0 * SIZE], b1 FADD c03, t2, c03 FMUL a2, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c05, t3, c05 FMUL a2, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c07, t4, c07 FMUL a2, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL56 LDF [AO + 1 * SIZE], a2 .LL59: #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 4, TEMP1 #endif sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif FADD c02, t1, c02 FADD c04, t2, c04 FADD c06, t3, c06 FADD c08, t4, c08 #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 FSUB b1, c02, c02 FSUB b2, c04, c04 FSUB b3, c06, c06 FSUB b4, c08, c08 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], 
a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c05, c05 FSUB b2, c06, c06 FSUB b3, c07, c07 FSUB b4, c08, c08 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c04, c04 FMUL a1, c06, c06 FMUL a1, c08, c08 FMUL a2, c02, t1 FMUL a2, c04, t2 FMUL a2, c06, t3 FMUL a2, c08, t4 FSUB c01, t1, c01 FSUB c03, t2, c03 FSUB c05, t3, c05 FSUB c07, t4, c07 FMUL a3, c01, c01 FMUL a3, c03, c03 FMUL a3, c05, c05 FMUL a3, c07, c07 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c03, c03 FMUL a1, c05, c05 FMUL a1, c07, c07 FMUL a2, c01, t1 FMUL a2, c03, t2 FMUL a2, c05, t3 FMUL a2, c07, t4 FSUB c02, t1, c02 FSUB c04, t2, c04 FSUB c06, t3, c06 FSUB c08, t4, c08 FMUL a3, c02, c02 FMUL a3, c04, c04 FMUL a3, c06, c06 FMUL a3, c08, c08 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a2, c01, t1 FMUL a2, c02, t2 FSUB c03, t1, c03 FSUB c04, t2, c04 FMUL a3, c01, t1 FMUL a3, c02, t2 FSUB c05, t1, c05 FSUB c06, t2, c06 FMUL a4, c01, t1 FMUL a4, c02, t2 FSUB c07, t1, c07 FSUB c08, t2, c08 LDF [BO + 5 * SIZE], a1 LDF [BO + 6 * SIZE], a2 LDF [BO + 7 * SIZE], a3 FMUL a1, c03, c03 FMUL a1, c04, c04 FMUL a2, c03, t1 FMUL a2, c04, t2 FSUB c05, t1, c05 FSUB c06, t2, c06 FMUL a3, c03, t1 FMUL a3, c04, t2 FSUB c07, t1, c07 FSUB c08, t2, c08 LDF [BO + 10 * SIZE], a1 LDF [BO + 11 * SIZE], a2 FMUL a1, c05, c05 FMUL a1, c06, c06 FMUL a2, c05, t1 FMUL a2, c06, t2 FSUB c07, t1, c07 FSUB c08, t2, c08 LDF [BO + 15 * SIZE], a1 FMUL a1, c07, c07 FMUL a1, c08, c08 #endif #ifdef RT LDF [BO + 15 * SIZE], a1 LDF [BO + 14 * SIZE], a2 LDF [BO + 13 * SIZE], a3 LDF [BO + 12 * SIZE], a4 FMUL a1, c07, c07 FMUL a1, c08, c08 FMUL a2, c07, t1 FMUL a2, c08, t2 FSUB c05, t1, c05 FSUB c06, t2, c06 FMUL a3, c07, t1 FMUL a3, c08, t2 FSUB c03, t1, c03 FSUB c04, t2, c04 FMUL a4, c07, t1 FMUL a4, c08, t2 FSUB c01, t1, c01 FSUB c02, t2, c02 LDF [BO + 10 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 8 * SIZE], a3 FMUL a1, c05, c05 FMUL a1, c06, c06 FMUL a2, c05, t1 FMUL a2, c06, t2 FSUB c03, t1, c03 FSUB c04, t2, c04 FMUL a3, c05, t1 FMUL a3, c06, t2 FSUB c01, t1, c01 FSUB c02, t2, c02 LDF [BO + 5 * SIZE], a1 LDF [BO + 4 * SIZE], a2 FMUL a1, c03, c03 FMUL a1, c04, c04 FMUL a2, c03, t1 FMUL a2, c04, t2 FSUB c01, t1, c01 FSUB c02, t2, c02 LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 #endif #ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 add C3, -2 * SIZE, C3 add C4, -2 * SIZE, C4 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c07, [BO + 3 * SIZE] STF c02, [BO + 4 * SIZE] STF c04, [BO + 5 * SIZE] STF c06, [BO + 6 * SIZE] STF c08, [BO + 7 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] STF c05, [AO + 4 * SIZE] STF c06, [AO + 5 * SIZE] STF c07, [AO + 6 * SIZE] STF c08, [AO + 7 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C2 + 0 * SIZE] STF c04, [C2 + 1 * SIZE] STF c05, [C3 + 0 * SIZE] STF c06, [C3 + 1 * SIZE] STF c07, [C4 + 0 * SIZE] STF c08, [C4 + 1 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, 
C2 add C3, 2 * SIZE, C3 add C4, 2 * SIZE, C4 #endif #ifdef RT sll K, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif .LL70: and M, 1, I cmp I, 0 ble,pn %icc, .LL99 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 0 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 0 + BASE_SHIFT, TEMP1 sll KK, 2 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 ble,pn %icc, .LL75 nop .LL72: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 LDF [BO + 4 * SIZE], b1 FADD c02, t2, c02 cmp L, 0 FMUL a1, b2, t2 LDF [BO + 5 * SIZE], b2 FADD c03, t3, c03 FMUL a1, b3, t3 LDF [BO + 6 * SIZE], b3 FADD c04, t4, c04 FMUL a1, b4, t4 LDF [BO + 7 * SIZE], b4 LDF [AO + 4 * SIZE], a1 FADD c01, t1, c01 add AO, 4 * SIZE, AO FMUL a2, b1, t1 LDF [BO + 8 * SIZE], b1 FADD c02, t2, c02 FMUL a2, b2, t2 LDF [BO + 9 * SIZE], b2 FADD c03, t3, c03 FMUL a2, b3, t3 LDF [BO + 10 * SIZE], b3 FADD c04, t4, c04 FMUL a2, b4, t4 LDF [BO + 11 * SIZE], b4 LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b1, t1 LDF [BO + 12 * SIZE], b1 FADD c02, t2, c02 FMUL a3, b2, t2 LDF [BO + 13 * SIZE], b2 FADD c03, t3, c03 FMUL a3, b3, t3 LDF [BO + 14 * SIZE], b3 FADD c04, t4, c04 FMUL a3, b4, t4 LDF [BO + 15 * SIZE], b4 LDF [AO + 2 * SIZE], a3 FADD c01, t1, c01 FMUL a4, b1, t1 LDF [BO + 16 * SIZE], b1 FADD c02, t2, c02 FMUL a4, b2, t2 LDF [BO + 17 * SIZE], b2 FADD c03, t3, c03 FMUL a4, b3, t3 LDF [BO + 18 * SIZE], b3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [BO + 19 * SIZE], b4 add BO, 16 * SIZE, BO bg,pt %icc, .LL72 LDF [AO + 3 * SIZE], a4 .LL75: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL79 nop .LL76: FADD c01, t1, c01 add AO, 1 * SIZE, AO FMUL a1, b1, t1 LDF [BO + 4 * SIZE], b1 FADD c02, t2, c02 add L, -1, L FMUL a1, b2, t2 LDF [BO + 5 * SIZE], b2 FADD c03, t3, c03 cmp L, 0 FMUL a1, b3, t3 LDF [BO + 6 * SIZE], b3 FADD c04, t4, c04 add BO, 4 * SIZE, BO FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 bg,pt %icc, .LL76 LDF [BO + 3 * SIZE], b4 .LL79: FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 4, TEMP1 #endif sll TEMP1, 0 + BASE_SHIFT, TEMP2 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #endif #ifdef LN LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 
* SIZE], a3 LDF [BO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a2, c01, t1 FSUB c02, t1, c02 FMUL a3, c01, t1 FSUB c03, t1, c03 FMUL a4, c01, t1 FSUB c04, t1, c04 LDF [BO + 5 * SIZE], a1 LDF [BO + 6 * SIZE], a2 LDF [BO + 7 * SIZE], a3 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c03, t1, c03 FMUL a3, c02, t1 FSUB c04, t1, c04 LDF [BO + 10 * SIZE], a1 LDF [BO + 11 * SIZE], a2 FMUL a1, c03, c03 FMUL a2, c03, t1 FSUB c04, t1, c04 LDF [BO + 15 * SIZE], a1 FMUL a1, c04, c04 #endif #ifdef RT LDF [BO + 15 * SIZE], a1 LDF [BO + 14 * SIZE], a2 LDF [BO + 13 * SIZE], a3 LDF [BO + 12 * SIZE], a4 FMUL a1, c04, c04 FMUL a2, c04, t1 FSUB c03, t1, c03 FMUL a3, c04, t1 FSUB c02, t1, c02 FMUL a4, c04, t1 FSUB c01, t1, c01 LDF [BO + 10 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 8 * SIZE], a3 FMUL a1, c03, c03 FMUL a2, c03, t1 FSUB c02, t1, c02 FMUL a3, c03, t1 FSUB c01, t1, c01 LDF [BO + 5 * SIZE], a1 LDF [BO + 4 * SIZE], a2 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c01, t1, c01 LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LN add C1, -1 * SIZE, C1 add C2, -1 * SIZE, C2 add C3, -1 * SIZE, C3 add C4, -1 * SIZE, C4 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c03, [BO + 2 * SIZE] STF c04, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C2 + 0 * SIZE] STF c03, [C3 + 0 * SIZE] STF c04, [C4 + 0 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 1 * SIZE, C1 add C2, 1 * SIZE, C2 add C3, 1 * SIZE, C3 add C4, 1 * SIZE, C4 #endif #ifdef RT sll K, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 0 + BASE_SHIFT, TEMP2 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .LL99: #ifdef LN sll K, 2 + BASE_SHIFT, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 4, KK #endif #ifdef RT sub KK, 4, KK #endif add J, -1, J cmp J, 0 bg,pt %icc, .LL11 nop .LL100: /* n & 2 */ and N, 2, J cmp J, 0 ble,pn %icc, .LL200 nop #ifdef RT sll K, 1 + BASE_SHIFT, TEMP1 sub B, TEMP1, B sll LDC, 1, TEMP1 sub C, TEMP1, C #endif mov C, C1 add C, LDC, C2 #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif #ifndef RT add C2, LDC, C #endif sra M, 2, I cmp I, 0 ble,pn %icc, .LL150 FMOV FZERO, c03 .LL121: #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 2 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 2 + BASE_SHIFT, TEMP1 sll KK, 1 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, t1 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c07 LDF [AO + 1 * SIZE], a2 FMOV FZERO, t2 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c04 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t3 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c08 LDF [AO + 3 * SIZE], a4 FMOV FZERO, t4 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c01 prefetch [C1 + 3 * SIZE], 2 FMOV FZERO, c05 prefetch [C2 + 3 * SIZE], 2 FMOV FZERO, c02 ble,pn %icc, .LL125 FMOV FZERO, c06 .LL122: FADD c03, t1, c03 add L, -1, L FMUL a1, b1, t1 prefetch [AO + APREFETCHSIZE * SIZE], 0 FADD c07, t2, c07 add BO, 8 * SIZE, BO FMUL a1, b2, t2 LDF [AO + 4 * SIZE], a1 FADD c04, t3, c04 add AO, 16 * SIZE, AO FMUL a2, b1, t3 cmp L, 0 FADD c08, t4, c08 nop FMUL a2, 
b2, t4 LDF [AO - 11 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b1, t1 nop FADD c05, t2, c05 nop FMUL a3, b2, t2 LDF [AO - 10 * SIZE], a3 FADD c02, t3, c02 nop FMUL a4, b1, t3 LDF [BO - 4 * SIZE], b1 FADD c06, t4, c06 nop FMUL a4, b2, t4 LDF [BO - 3 * SIZE], b2 FADD c03, t1, c03 nop FMUL a1, b3, t1 LDF [AO - 9 * SIZE], a4 FADD c07, t2, c07 nop FMUL a1, b4, t2 LDF [AO - 8 * SIZE], a1 FADD c04, t3, c04 nop FMUL a2, b3, t3 nop FADD c08, t4, c08 nop FMUL a2, b4, t4 LDF [AO - 7 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b3, t1 nop FADD c05, t2, c05 nop FMUL a3, b4, t2 LDF [AO - 6 * SIZE], a3 FADD c02, t3, c02 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c06, t4, c06 nop FMUL a4, b4, t4 LDF [BO - 1 * SIZE], b4 FADD c03, t1, c03 nop FMUL a1, b1, t1 LDF [AO - 5 * SIZE], a4 FADD c07, t2, c07 nop FMUL a1, b2, t2 LDF [AO - 4 * SIZE], a1 FADD c04, t3, c04 nop FMUL a2, b1, t3 nop FADD c08, t4, c08 nop FMUL a2, b2, t4 LDF [AO - 3 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b1, t1 nop FADD c05, t2, c05 nop FMUL a3, b2, t2 LDF [AO - 2 * SIZE], a3 FADD c02, t3, c02 nop FMUL a4, b1, t3 LDF [BO + 0 * SIZE], b1 FADD c06, t4, c06 nop FMUL a4, b2, t4 LDF [BO + 1 * SIZE], b2 FADD c03, t1, c03 nop FMUL a1, b3, t1 LDF [AO - 1 * SIZE], a4 FADD c07, t2, c07 nop FMUL a1, b4, t2 LDF [AO + 0 * SIZE], a1 FADD c04, t3, c04 nop FMUL a2, b3, t3 nop FADD c08, t4, c08 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b3, t1 nop FADD c05, t2, c05 nop FMUL a3, b4, t2 LDF [AO + 2 * SIZE], a3 FADD c02, t3, c02 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c06, t4, c06 FMUL a4, b4, t4 LDF [AO + 3 * SIZE], a4 bg,pt %icc, .LL122 LDF [BO + 3 * SIZE], b4 .LL125: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL129 nop .LL126: FADD c03, t1, c03 add AO, 4 * SIZE, AO FMUL a1, b1, t1 add BO, 2 * SIZE, BO FADD c07, t2, c07 add L, -1, L FMUL a1, b2, t2 LDF [AO + 0 * SIZE], a1 FADD c04, t3, c04 cmp L, 0 FMUL a2, b1, t3 FADD c08, t4, c08 FMUL a2, b2, t4 LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b1, t1 FADD c05, t2, c05 FMUL a3, b2, t2 LDF [AO + 2 * SIZE], a3 FADD c02, t3, c02 FMUL a4, b1, t3 LDF [BO + 0 * SIZE], b1 FADD c06, t4, c06 FMUL a4, b2, t4 LDF [BO + 1 * SIZE], b2 bg,pt %icc, .LL126 LDF [AO + 3 * SIZE], a4 .LL129: FADD c03, t1, c03 FADD c07, t2, c07 FADD c04, t3, c04 FADD c08, t4, c08 #if defined(LN) || defined(RT) #ifdef LN sub KK, 4, TEMP1 #else sub KK, 2, TEMP1 #endif sll TEMP1, 2 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c05, c05 FSUB a3, c02, c02 FSUB a4, c06, c06 FSUB b1, c03, c03 FSUB b2, c07, c07 FSUB b3, c04, c04 FSUB b4, c08, c08 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c05, c05 FSUB b2, c06, c06 FSUB b3, c07, c07 FSUB b4, c08, c08 #endif #ifdef LN LDF [AO + 15 * SIZE], a1 LDF [AO + 14 * SIZE], a2 LDF [AO + 13 * SIZE], a3 LDF [AO + 12 * SIZE], a4 FMUL a1, c04, c04 FMUL a1, c08, c08 FMUL a2, c04, t1 FMUL a2, c08, t2 FSUB c03, t1, c03 FSUB c07, t2, c07 FMUL a3, c04, t1 FMUL a3, c08, t2 FSUB c02, t1, c02 
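/* t2 already carries a3*c08, so the same elimination lands in the second column (c06) before the last off-diagonal factor a4 is applied */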
FSUB c06, t2, c06 FMUL a4, c04, t1 FMUL a4, c08, t2 FSUB c01, t1, c01 FSUB c05, t2, c05 LDF [AO + 10 * SIZE], a1 LDF [AO + 9 * SIZE], a2 LDF [AO + 8 * SIZE], a3 FMUL a1, c03, c03 FMUL a1, c07, c07 FMUL a2, c03, t1 FMUL a2, c07, t2 FSUB c02, t1, c02 FSUB c06, t2, c06 FMUL a3, c03, t1 FMUL a3, c07, t2 FSUB c01, t1, c01 FSUB c05, t2, c05 LDF [AO + 5 * SIZE], a1 LDF [AO + 4 * SIZE], a2 FMUL a1, c02, c02 FMUL a1, c06, c06 FMUL a2, c02, t1 FMUL a2, c06, t2 FSUB c01, t1, c01 FSUB c05, t2, c05 LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c05, c05 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a1, c05, c05 FMUL a2, c01, t1 FMUL a2, c05, t2 FSUB c02, t1, c02 FSUB c06, t2, c06 FMUL a3, c01, t1 FMUL a3, c05, t2 FSUB c03, t1, c03 FSUB c07, t2, c07 FMUL a4, c01, t1 FMUL a4, c05, t2 FSUB c04, t1, c04 FSUB c08, t2, c08 LDF [AO + 5 * SIZE], a1 LDF [AO + 6 * SIZE], a2 LDF [AO + 7 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c06, c06 FMUL a2, c02, t1 FMUL a2, c06, t2 FSUB c03, t1, c03 FSUB c07, t2, c07 FMUL a3, c02, t1 FMUL a3, c06, t2 FSUB c04, t1, c04 FSUB c08, t2, c08 LDF [AO + 10 * SIZE], a1 LDF [AO + 11 * SIZE], a2 FMUL a1, c03, c03 FMUL a1, c07, c07 FMUL a2, c03, t1 FMUL a2, c07, t2 FSUB c04, t1, c04 FSUB c08, t2, c08 LDF [AO + 15 * SIZE], a1 FMUL a1, c04, c04 FMUL a1, c08, c08 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 FMUL a2, c01, t1 FMUL a2, c02, t2 FMUL a2, c03, t3 FMUL a2, c04, t4 FSUB c05, t1, c05 FSUB c06, t2, c06 FSUB c07, t3, c07 FSUB c08, t4, c08 FMUL a3, c05, c05 FMUL a3, c06, c06 FMUL a3, c07, c07 FMUL a3, c08, c08 #endif #ifdef RT LDF [BO + 3 * SIZE], a1 LDF [BO + 2 * SIZE], a2 LDF [BO + 0 * SIZE], a3 FMUL a1, c05, c05 FMUL a1, c06, c06 FMUL a1, c07, c07 FMUL a1, c08, c08 FMUL a2, c05, t1 FMUL a2, c06, t2 FMUL a2, c07, t3 FMUL a2, c08, t4 FSUB c01, t1, c01 FSUB c02, t2, c02 FSUB c03, t3, c03 FSUB c04, t4, c04 FMUL a3, c01, c01 FMUL a3, c02, c02 FMUL a3, c03, c03 FMUL a3, c04, c04 #endif #ifdef LN add C1, -4 * SIZE, C1 add C2, -4 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c05, [BO + 1 * SIZE] STF c02, [BO + 2 * SIZE] STF c06, [BO + 3 * SIZE] STF c03, [BO + 4 * SIZE] STF c07, [BO + 5 * SIZE] STF c04, [BO + 6 * SIZE] STF c08, [BO + 7 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] STF c05, [AO + 4 * SIZE] STF c06, [AO + 5 * SIZE] STF c07, [AO + 6 * SIZE] STF c08, [AO + 7 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C1 + 2 * SIZE] STF c04, [C1 + 3 * SIZE] STF c05, [C2 + 0 * SIZE] STF c06, [C2 + 1 * SIZE] STF c07, [C2 + 2 * SIZE] STF c08, [C2 + 3 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 4 * SIZE, C1 add C2, 4 * SIZE, C2 #endif #ifdef RT sll K, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 2 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 4, KK #endif #ifdef LN sub KK, 4, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL121 FMOV FZERO, c03 .LL150: and M, 2, I cmp I, 0 ble,pn %icc, .LL170 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 1 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 1 + BASE_SHIFT, TEMP1 sll KK, 1 + BASE_SHIFT, TEMP2 add 
AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 cmp L, 0 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 ble,pn %icc, .LL155 nop .LL152: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 prefetch [AO + APREFETCHSIZE * SIZE], 0 FADD c02, t2, c02 add BO, 8 * SIZE, BO FMUL a1, b2, t2 LDF [AO + 4 * SIZE], a1 FADD c03, t3, c03 cmp L, 0 FMUL a2, b1, t3 LDF [BO - 4 * SIZE], b1 FADD c04, t4, c04 nop FMUL a2, b2, t4 LDF [AO + 5 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b3, t1 LDF [BO - 3 * SIZE], b2 FADD c02, t2, c02 nop FMUL a3, b4, t2 LDF [AO + 6 * SIZE], a3 FADD c03, t3, c03 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c04, t4, c04 nop FMUL a4, b4, t4 LDF [AO + 7 * SIZE], a4 FADD c01, t1, c01 nop FMUL a1, b1, t1 LDF [BO - 1 * SIZE], b4 FADD c02, t2, c02 FMUL a1, b2, t2 LDF [AO + 8 * SIZE], a1 FADD c03, t3, c03 FMUL a2, b1, t3 LDF [BO + 0 * SIZE], b1 FADD c04, t4, c04 FMUL a2, b2, t4 LDF [AO + 9 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b3, t1 LDF [BO + 1 * SIZE], b2 FADD c02, t2, c02 FMUL a3, b4, t2 LDF [AO + 10 * SIZE], a3 FADD c03, t3, c03 FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO bg,pt %icc, .LL152 LDF [BO + 3 * SIZE], b4 .LL155: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL159 nop .LL156: LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FMUL a1, b1, t1 FMUL a1, b2, t2 FMUL a2, b1, t3 FMUL a2, b2, t4 add AO, 2 * SIZE, AO add BO, 2 * SIZE, BO add L, -1, L cmp L, 0 bg,pt %icc, .LL156 nop .LL159: FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 2, TEMP1 #endif sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c02, c02 FSUB a4, c04, c04 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c03, c03 FMUL a1, c04, c04 FMUL a2, c03, t1 FMUL a2, c04, t2 FSUB c01, t1, c01 FSUB c02, t2, c02 FMUL a3, c01, c01 FMUL a3, c02, c02 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a2, c01, t1 FMUL a2, c02, t2 FSUB c03, t1, c03 FSUB c04, t2, c04 FMUL a3, c03, c03 FMUL a3, c04, c04 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c03, c03 FMUL a2, c01, t1 FMUL a2, c03, t2 FSUB c02, t1, c02 FSUB c04, t2, c04 FMUL a3, c02, c02 FMUL a3, c04, c04 #endif #ifdef RT LDF [BO + 3 * SIZE], a1 LDF [BO + 2 * SIZE], a2 LDF [BO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c04, c04 FMUL a2, c02, t1 FMUL a2, c04, t2 FSUB c01, t1, c01 FSUB c03, t2, c03 FMUL a3, c01, c01 FMUL a3, c03, c03 #endif #ifdef LN 
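/* LN walks the M dimension backwards, so step both C pointers back by the 2-element block before the stores */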
add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c03, [BO + 2 * SIZE] STF c04, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c03, [AO + 1 * SIZE] STF c02, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c03, [C1 + 1 * SIZE] STF c02, [C2 + 0 * SIZE] STF c04, [C2 + 1 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 #endif #ifdef RT sll K, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif .LL170: and M, 1, I cmp I, 0 ble,pn %icc, .LL199 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 0 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 0 + BASE_SHIFT, TEMP1 sll KK, 1 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 ble,pn %icc, .LL175 nop .LL172: FADD c01, t1, c01 add AO, 4 * SIZE, AO FMUL a1, b1, t1 LDF [BO + 4 * SIZE], b1 FADD c02, t2, c02 FMUL a1, b2, t2 LDF [BO + 5 * SIZE], b2 add L, -1, L LDF [AO + 0 * SIZE], a1 FADD c03, t3, c03 cmp L, 0 FMUL a2, b3, t3 LDF [BO + 6 * SIZE], b3 FADD c04, t4, c04 FMUL a2, b4, t4 LDF [BO + 7 * SIZE], b4 LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b1, t1 LDF [BO + 8 * SIZE], b1 FADD c02, t2, c02 FMUL a3, b2, t2 LDF [BO + 9 * SIZE], b2 LDF [AO + 2 * SIZE], a3 FADD c03, t3, c03 FMUL a4, b3, t3 LDF [BO + 10 * SIZE], b3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [BO + 11 * SIZE], b4 add BO, 8 * SIZE, BO bg,pt %icc, .LL172 LDF [AO + 3 * SIZE], a4 .LL175: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL179 nop .LL176: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 add AO, 1 * SIZE, AO LDF [BO + 2 * SIZE], b1 FADD c02, t2, c02 cmp L, 0 FMUL a1, b2, t2 LDF [BO + 3 * SIZE], b2 add BO, 2 * SIZE, BO bg,pt %icc, .LL176 LDF [AO + 0 * SIZE], a1 .LL179: FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FADD c01, c03, c01 FADD c02, c04, c02 #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 2, TEMP1 #endif sll TEMP1, 0 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #endif #ifdef LN LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a2, c01, t1 FSUB c02, t1, c02 FMUL a3, c02, c02 #endif #ifdef RT LDF [BO + 3 * SIZE], a1 LDF [BO + 2 * SIZE], a2 LDF [BO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c01, t1, c01 FMUL a3, c01, c01 #endif #ifdef LN add C1, -1 * SIZE, C1 add C2, -1 * SIZE, C2 #endif #if 
defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C2 + 0 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 1 * SIZE, C1 add C2, 1 * SIZE, C2 #endif #ifdef RT sll K, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 0 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .LL199: #ifdef LN sll K, 1 + BASE_SHIFT, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 2, KK #endif #ifdef RT sub KK, 2, KK #endif .LL200: and N, 1, J cmp J, 0 ble,pn %icc, .LL999 nop #ifdef RT sll K, 0 + BASE_SHIFT, TEMP1 sub B, TEMP1, B sub C, LDC, C #endif mov C, C1 #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif #ifndef RT add C, LDC, C #endif sra M, 2, I cmp I, 0 ble,pn %icc, .LL250 nop .LL221: #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 2 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 2 + BASE_SHIFT, TEMP1 sll KK, 0 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 ble,pn %icc, .LL225 prefetch [C1 + 4 * SIZE], 2 .LL222: FADD c01, t1, c01 add BO, 4 * SIZE, BO FMUL a1, b1, t1 LDF [AO + 4 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b1, t2 LDF [AO + 5 * SIZE], a2 FADD c03, t3, c03 add L, -1, L FMUL a3, b1, t3 LDF [AO + 6 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b1, t4 LDF [AO + 7 * SIZE], a4 LDF [BO + 0 * SIZE], b1 FADD c01, t1, c01 cmp L, 0 FMUL a1, b2, t1 LDF [AO + 8 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b2, t2 LDF [AO + 9 * SIZE], a2 FADD c03, t3, c03 FMUL a3, b2, t3 LDF [AO + 10 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b2, t4 LDF [AO + 11 * SIZE], a4 LDF [BO + 1 * SIZE], b2 FADD c01, t1, c01 FMUL a1, b3, t1 LDF [AO + 12 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b3, t2 LDF [AO + 13 * SIZE], a2 FADD c03, t3, c03 FMUL a3, b3, t3 LDF [AO + 14 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b3, t4 LDF [AO + 15 * SIZE], a4 LDF [BO + 2 * SIZE], b3 FADD c01, t1, c01 FMUL a1, b4, t1 LDF [AO + 16 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b4, t2 LDF [AO + 17 * SIZE], a2 FADD c03, t3, c03 FMUL a3, b4, t3 LDF [AO + 18 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [AO + 19 * SIZE], a4 add AO, 16 * SIZE, AO bg,pt %icc, .LL222 LDF [BO + 3 * SIZE], b4 .LL225: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL229 nop .LL226: FADD c01, t1, c01 add BO, 1 * SIZE, BO FMUL a1, b1, t1 LDF [AO + 4 * SIZE], a1 FADD c02, t2, c02 add L, -1, L FMUL a2, b1, t2 LDF [AO + 5 * SIZE], a2 FADD c03, t3, c03 cmp L, 0 FMUL a3, b1, t3 LDF [AO + 6 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b1, t4 LDF [AO + 7 * SIZE], a4 add AO, 4 * SIZE, AO bg,pt %icc, .LL226 LDF [BO + 0 * SIZE], b1 .LL229: FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 #if defined(LN) || defined(RT) #ifdef LN sub KK, 4, TEMP1 #else sub KK, 1, TEMP1 #endif sll TEMP1, 
2 + BASE_SHIFT, TEMP2 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #endif #ifdef LN LDF [AO + 15 * SIZE], a1 LDF [AO + 14 * SIZE], a2 LDF [AO + 13 * SIZE], a3 LDF [AO + 12 * SIZE], a4 FMUL a1, c04, c04 FMUL a2, c04, t1 FSUB c03, t1, c03 FMUL a3, c04, t1 FSUB c02, t1, c02 FMUL a4, c04, t1 FSUB c01, t1, c01 LDF [AO + 10 * SIZE], a1 LDF [AO + 9 * SIZE], a2 LDF [AO + 8 * SIZE], a3 FMUL a1, c03, c03 FMUL a2, c03, t1 FSUB c02, t1, c02 FMUL a3, c03, t1 FSUB c01, t1, c01 LDF [AO + 5 * SIZE], a1 LDF [AO + 4 * SIZE], a2 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c01, t1, c01 LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a2, c01, t1 FSUB c02, t1, c02 FMUL a3, c01, t1 FSUB c03, t1, c03 FMUL a4, c01, t1 FSUB c04, t1, c04 LDF [AO + 5 * SIZE], a1 LDF [AO + 6 * SIZE], a2 LDF [AO + 7 * SIZE], a3 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c03, t1, c03 FMUL a3, c02, t1 FSUB c04, t1, c04 LDF [AO + 10 * SIZE], a1 LDF [AO + 11 * SIZE], a2 FMUL a1, c03, c03 FMUL a2, c03, t1 FSUB c04, t1, c04 LDF [AO + 15 * SIZE], a1 FMUL a1, c04, c04 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 #endif #ifdef RT LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 #endif #ifdef LN add C1, -4 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c03, [BO + 2 * SIZE] STF c04, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C1 + 2 * SIZE] STF c04, [C1 + 3 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 4 * SIZE, C1 #endif #ifdef RT sll K, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 2 + BASE_SHIFT, TEMP2 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 4, KK #endif #ifdef LN sub KK, 4, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL221 nop .LL250: and M, 2, I cmp I, 0 ble,pn %icc, .LL270 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 1 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 1 + BASE_SHIFT, TEMP1 sll KK, 0 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 ble,pn %icc, .LL255 nop .LL252: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 LDF [AO + 4 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b1, t2 LDF [AO + 5 * SIZE], a2 LDF [BO + 4 * SIZE], b1 FADD c03, t3, c03 cmp L, 0 FMUL a3, b2, t3 LDF [AO + 6 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b2, t4 LDF 
[AO + 7 * SIZE], a4 LDF [BO + 5 * SIZE], b2 FADD c01, t1, c01 FMUL a1, b3, t1 LDF [AO + 8 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b3, t2 LDF [AO + 9 * SIZE], a2 LDF [BO + 6 * SIZE], b3 FADD c03, t3, c03 FMUL a3, b4, t3 LDF [AO + 10 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO LDF [BO + 7 * SIZE], b4 bg,pt %icc, .LL252 add BO, 4 * SIZE, BO .LL255: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL259 nop .LL256: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 LDF [AO + 2 * SIZE], a1 FADD c02, t2, c02 cmp L, 0 FMUL a2, b1, t2 LDF [AO + 3 * SIZE], a2 LDF [BO + 1 * SIZE], b1 add AO, 2 * SIZE, AO bg,pt %icc, .LL256 add BO, 1 * SIZE, BO .LL259: FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FADD c01, c03, c01 FADD c02, c04, c02 #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 1, TEMP1 #endif sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c01, t1, c01 FMUL a3, c01, c01 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a2, c01, t1 FSUB c02, t1, c02 FMUL a3, c02, c02 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 #endif #ifdef RT LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 #endif #ifdef LN add C1, -2 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 2 * SIZE, C1 #endif #ifdef RT sll K, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif .LL270: and M, 1, I cmp I, 0 ble,pn %icc, .LL299 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 0 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c01 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t2 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c02 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t3 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t4 LDF [BO + 2 * SIZE], b3 ble,pn %icc, .LL275 LDF [BO + 3 * SIZE], b4 .LL272: FADD c01, t1, c01 add L, -1, L add AO, 4 * SIZE, AO FMUL a1, b1, t1 add BO, 4 * SIZE, BO LDF [AO + 0 * SIZE], a1 FADD c02, t2, c02 cmp L, 0 LDF [BO + 0 * SIZE], b1 FMUL a2, b2, t2 LDF [AO + 1 * SIZE], a2 FADD c01, t3, c01 LDF [BO + 1 * SIZE], b2 FMUL a3, b3, t3 LDF [AO + 2 * SIZE], a3 FADD c02, t4, c02 LDF [BO + 2 * SIZE], b3 FMUL a4, b4, t4 LDF [AO + 3 * SIZE], a4 bg,pt %icc, .LL272 LDF [BO + 3 * SIZE], b4 .LL275: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL279 nop .LL276: FADD c01, t1, c01 add L, -1, L FMUL a1, 
b1, t1 LDF [AO + 1 * SIZE], a1 LDF [BO + 1 * SIZE], b1 add BO, 1 * SIZE, BO cmp L, 0 bg,pt %icc, .LL276 add AO, 1 * SIZE, AO .LL279: FADD c01, t1, c01 FADD c02, t2, c02 FADD c01, t3, c01 FADD c02, t4, c02 FADD c01, c02, c01 #if defined(LN) || defined(RT) sub KK, 1, TEMP1 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 FSUB a1, c01, c01 #else LDF [AO + 0 * SIZE], a1 FSUB a1, c01, c01 #endif #ifdef LN LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef RT LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LN add C1, -1 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] #else STF c01, [AO + 0 * SIZE] #endif STF c01, [C1 + 0 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 1 * SIZE, C1 #endif #ifdef RT sll K, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AO, TEMP1, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .LL299: #ifdef LN sll K, 0 + BASE_SHIFT, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 1, KK #endif #ifdef RT sub KK, 1, KK #endif .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/trsm_kernel_LT_2x8.S000066400000000000000000002157751313527062700213260ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2005-2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define APREFETCHSIZE 24 #define APREFETCH_CATEGORY 0 #define M %i0 #define N %i1 #define K %i2 #if defined(DOUBLE) && !defined(__64BIT__) #define A %i5 #define B %i4 #else #define A %i4 #define B %i5 #endif #define C %o4 #define LDC %o5 #define AO %l0 #define BO %l1 #define I %l2 #define J %l3 #define L %l4 #define C1 %o0 #define C2 %o1 #define C3 %o2 #define C4 %o3 #define C5 %l5 #define C6 %l6 #define C7 %l7 #define C8 %i3 #define OFFSET %g1 #define KK %g2 #define TEMP1 %g3 #define TEMP2 %g4 #define AORIG %o7 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #define a1 %f32 #define a2 %f34 #define a3 %f36 #define a4 %f38 #define a5 %f40 #define b1 %f42 #define b2 %f44 #define b3 %f46 #define b4 %f48 #define b5 %f50 #define b6 %f52 #define b7 %f54 #define b8 %f56 #define b9 %f58 #define cc01 0 #define cc02 2 #define cc03 4 #define cc04 6 #define cc05 8 #define cc06 10 #define cc07 12 #define cc08 14 #define cc09 16 #define cc10 18 #define cc11 20 #define cc12 22 #define cc13 24 #define cc14 26 #define cc15 28 #define cc16 30 #define aa1 1 #define aa2 3 #define aa3 5 #define aa4 7 #define aa5 9 #define bb1 11 #define bb2 13 #define bb3 15 #define bb4 17 #define bb5 19 #define bb6 21 #define bb7 23 #define bb8 25 #define bb9 27 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #define a1 %f16 #define a2 %f17 #define a3 %f18 #define a4 %f19 #define a5 %f20 #define b1 %f21 #define b2 %f22 #define b3 %f23 #define b4 %f24 #define b5 %f25 #define b6 %f26 #define b7 %f27 #define b8 %f28 #define b9 %f29 #define cc01 0 #define cc02 1 #define cc03 2 #define cc04 3 #define cc05 4 #define cc06 5 #define cc07 6 #define cc08 7 #define cc09 8 #define cc10 9 #define cc11 10 #define cc12 11 #define cc13 12 #define cc14 13 #define cc15 14 #define cc16 15 #define aa1 16 #define aa2 17 #define aa3 18 #define aa4 19 #define aa5 20 #define bb1 21 #define bb2 22 #define bb3 23 #define bb4 24 #define bb5 25 #define bb6 26 #define bb7 27 #define bb8 28 #define bb9 29 #endif .register %g2, #scratch .register %g3, #scratch PROLOGUE SAVESP nop #ifndef __64BIT__ #ifdef DOUBLE ld [%sp + STACK_START + 28], B ld [%sp + STACK_START + 32], C ld [%sp + STACK_START + 36], LDC ld [%sp + STACK_START + 40], OFFSET #else ld [%sp + STACK_START + 28], C ld [%sp + STACK_START + 32], LDC ld [%sp + STACK_START + 36], OFFSET #endif st %g1, [%sp + STACK_START + 8] st %g2, [%sp + STACK_START + 12] st %g3, [%sp + STACK_START + 16] st %g4, [%sp + STACK_START + 20] #else ldx [%sp+ STACK_START + 56], C ldx [%sp+ STACK_START + 64], LDC ldx [%sp+ STACK_START + 72], OFFSET stx %g1, [%sp + STACK_START + 32] stx %g2, [%sp + STACK_START + 40] stx %g3, [%sp + STACK_START + 48] stx %g4, [%sp + STACK_START + 56] #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg OFFSET, KK #endif sll LDC, BASE_SHIFT, LDC #ifdef LN smul M, K, TEMP1 sll TEMP1, BASE_SHIFT, TEMP1 add A, TEMP1, A sll M, BASE_SHIFT, TEMP1 add C, TEMP1, C #endif #ifdef RN neg OFFSET, KK #endif #ifdef RT smul N, K, TEMP1 sll TEMP1, 
BASE_SHIFT, TEMP1 add B, TEMP1, B smul N, LDC, TEMP1 add C, TEMP1, C sub N, OFFSET, KK #endif sra N, 3, J cmp J, 0 ble,pn %icc, .LL30 nop .align 4 .LL11: #ifdef RT sll K, BASE_SHIFT + 3, TEMP1 sub B, TEMP1, B #endif #ifndef RT mov C, C1 add C, LDC, C2 add C2, LDC, C3 add C3, LDC, C4 add C4, LDC, C5 add C5, LDC, C6 add C6, LDC, C7 add C7, LDC, C8 add C8, LDC, C #else sub C, LDC, C8 sub C8, LDC, C7 sub C7, LDC, C6 sub C6, LDC, C5 sub C5, LDC, C4 sub C4, LDC, C3 sub C3, LDC, C2 sub C2, LDC, C1 sub C2, LDC, C #endif #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif sra M, 1, I cmp I, 0 ble,pn %icc, .LL20 nop .align 4 .LL12: #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 1, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TEMP1 sll KK, BASE_SHIFT + 3, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 8 * SIZE], a5 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FCLR (cc01) LDF [BO + 2 * SIZE], b3 FCLR (cc05) LDF [BO + 3 * SIZE], b4 FCLR (cc09) LDF [BO + 4 * SIZE], b5 FCLR (cc13) LDF [BO + 5 * SIZE], b6 FCLR (cc02) LDF [BO + 6 * SIZE], b7 FCLR (cc06) LDF [BO + 7 * SIZE], b8 FCLR (cc10) LDF [BO + 8 * SIZE], b9 FCLR (cc14) prefetch [C1 + 1 * SIZE], 3 FCLR (cc03) prefetch [C2 + 2 * SIZE], 3 FCLR (cc07) prefetch [C3 + 1 * SIZE], 3 FCLR (cc11) prefetch [C4 + 2 * SIZE], 3 FCLR (cc15) prefetch [C5 + 1 * SIZE], 3 FCLR (cc04) prefetch [C6 + 2 * SIZE], 3 FCLR (cc08) prefetch [C7 + 1 * SIZE], 3 FCLR (cc12) prefetch [C8 + 2 * SIZE], 3 FCLR (cc16) #if defined(LT) || defined(RN) sra KK, 3, L #else sub K, KK, L sra L, 3, L #endif cmp L, 0 ble,pn %icc, .LL15 nop .align 4 .LL13: FMADD (aa1, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa1, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa1, bb3, cc05, cc05) LDF [BO + 16 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [AO + 2 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 3 * SIZE], a4 FMADD (aa1, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa1, bb7, cc13, cc13) LDF [BO + 12 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 14 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 15 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 24 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 17 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 18 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 19 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 4 * SIZE], a1 FMADD (aa4, bb5, cc10, cc10) LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) add L, -1, L FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 20 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 21 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 22 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 23 * SIZE], b8 FMADD (aa1, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa1, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa1, bb3, cc05, cc05) LDF [BO + 32 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 25 * SIZE], b2 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 26 * SIZE], b3 
FMADD (aa2, bb4, cc08, cc08) LDF [BO + 27 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [AO + 6 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 7 * SIZE], a4 FMADD (aa1, bb6, cc11, cc11) nop FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa1, bb7, cc13, cc13) LDF [BO + 28 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 29 * SIZE], b6 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 30 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 31 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 40 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 33 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 34 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 35 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 16 * SIZE], a1 /****/ FMADD (aa4, bb5, cc10, cc10) LDF [AO + 9 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) nop FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 36 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 37 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 38 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 39 * SIZE], b8 FMADD (aa5, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa5, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa5, bb3, cc05, cc05) LDF [BO + 48 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 41 * SIZE], b2 FMADD (aa5, bb4, cc07, cc07) LDF [BO + 42 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 43 * SIZE], b4 FMADD (aa5, bb5, cc09, cc09) LDF [AO + 10 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 11 * SIZE], a4 FMADD (aa5, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa5, bb7, cc13, cc13) LDF [BO + 44 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 45 * SIZE], b6 FMADD (aa5, bb8, cc15, cc15) LDF [BO + 46 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 47 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 56 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 49 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 50 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 51 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 12 * SIZE], a5 FMADD (aa4, bb5, cc10, cc10) LDF [AO + 13 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) cmp L, 0 FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 52 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 53 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 54 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 55 * SIZE], b8 FMADD (aa5, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa5, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa5, bb3, cc05, cc05) LDF [BO + 64 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 57 * SIZE], b2 FMADD (aa5, bb4, cc07, cc07) LDF [BO + 58 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 59 * SIZE], b4 FMADD (aa5, bb5, cc09, cc09) LDF [AO + 14 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 15 * SIZE], a4 FMADD (aa5, bb6, cc11, cc11) add BO, 64 * SIZE, BO FMADD (aa2, bb6, cc12, cc12) add AO, 16 * SIZE, AO FMADD (aa5, bb7, cc13, cc13) LDF [BO - 4 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO - 3 * SIZE], b6 FMADD (aa5, bb8, cc15, cc15) LDF [BO - 2 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO - 1 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD 
(aa3, bb3, cc05, cc05) LDF [BO + 8 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 1 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 2 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 8 * SIZE], a5 /****/ FMADD (aa4, bb5, cc10, cc10) LDF [AO + 1 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) FMADD (aa4, bb6, cc12, cc12) FMADD (aa3, bb7, cc13, cc13) LDF [BO + 4 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 5 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 6 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) ble,pn %icc, .LL15 LDF [BO + 7 * SIZE], b8 FMADD (aa1, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa1, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa1, bb3, cc05, cc05) LDF [BO + 16 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [AO + 2 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 3 * SIZE], a4 FMADD (aa1, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa1, bb7, cc13, cc13) LDF [BO + 12 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 14 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 15 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 24 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 17 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 18 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 19 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 4 * SIZE], a1 FMADD (aa4, bb5, cc10, cc10) LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) add L, -1, L FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 20 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 21 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 22 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 23 * SIZE], b8 FMADD (aa1, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa1, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa1, bb3, cc05, cc05) LDF [BO + 32 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 25 * SIZE], b2 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 26 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 27 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [AO + 6 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 7 * SIZE], a4 FMADD (aa1, bb6, cc11, cc11) nop FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa1, bb7, cc13, cc13) LDF [BO + 28 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 29 * SIZE], b6 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 30 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 31 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 40 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 33 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 34 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 35 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 16 * SIZE], a1 /****/ FMADD (aa4, bb5, cc10, cc10) LDF [AO + 9 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) nop FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 36 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 37 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 38 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 39 * 
SIZE], b8 FMADD (aa5, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa5, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa5, bb3, cc05, cc05) LDF [BO + 48 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 41 * SIZE], b2 FMADD (aa5, bb4, cc07, cc07) LDF [BO + 42 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 43 * SIZE], b4 FMADD (aa5, bb5, cc09, cc09) LDF [AO + 10 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 11 * SIZE], a4 FMADD (aa5, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa5, bb7, cc13, cc13) LDF [BO + 44 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 45 * SIZE], b6 FMADD (aa5, bb8, cc15, cc15) LDF [BO + 46 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 47 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 56 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 49 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 50 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 51 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 12 * SIZE], a5 FMADD (aa4, bb5, cc10, cc10) LDF [AO + 13 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) cmp L, 0 FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 52 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 53 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 54 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 55 * SIZE], b8 FMADD (aa5, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa5, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa5, bb3, cc05, cc05) LDF [BO + 64 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 57 * SIZE], b2 FMADD (aa5, bb4, cc07, cc07) LDF [BO + 58 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 59 * SIZE], b4 FMADD (aa5, bb5, cc09, cc09) LDF [AO + 14 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 15 * SIZE], a4 FMADD (aa5, bb6, cc11, cc11) add BO, 64 * SIZE, BO FMADD (aa2, bb6, cc12, cc12) add AO, 16 * SIZE, AO FMADD (aa5, bb7, cc13, cc13) LDF [BO - 4 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO - 3 * SIZE], b6 FMADD (aa5, bb8, cc15, cc15) LDF [BO - 2 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO - 1 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 8 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 1 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 2 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 8 * SIZE], a5 /****/ FMADD (aa4, bb5, cc10, cc10) LDF [AO + 1 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) FMADD (aa4, bb6, cc12, cc12) FMADD (aa3, bb7, cc13, cc13) LDF [BO + 4 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 5 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 6 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) bg,pt %icc, .LL13 LDF [BO + 7 * SIZE], b8 .align 4 .LL15: #if defined(LT) || defined(RN) and KK, 7, L #else sub K, KK, L and L, 7, L #endif cmp L, 0 ble,a,pn %icc, .LL18 nop .align 4 .LL17: FMADD (aa1, bb1, cc01, cc01) add L, -1, L FMADD (aa2, bb1, cc02, cc02) nop FMADD (aa1, bb2, cc03, cc03) LDF [BO + 8 * SIZE], b1 FMADD (aa2, bb2, cc04, cc04) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) cmp L, 0 FMADD (aa2, bb3, cc06, cc06) nop FMADD (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) nop FMADD (aa2, 
bb5, cc10, cc10) nop FMADD (aa1, bb6, cc11, cc11) LDF [BO + 12 * SIZE], b5 FMADD (aa2, bb6, cc12, cc12) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb7, cc13, cc13) add AO, 2 * SIZE, AO FMADD (aa2, bb7, cc14, cc14) add BO, 8 * SIZE, BO FMADD (aa1, bb8, cc15, cc15) LDF [AO + 0 * SIZE], a1 FMADD (aa2, bb8, cc16, cc16) LDF [AO + 1 * SIZE], a2 LDF [BO + 6 * SIZE], b7 bg,pt %icc, .LL17 LDF [BO + 7 * SIZE], b8 nop .align 4 .LL18: #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 8, TEMP1 #endif sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 3, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 FSUB b1, c09, c09 FSUB b2, c11, c11 FSUB b3, c13, c13 FSUB b4, c15, c15 LDF [BO + 8 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 10 * SIZE], a3 LDF [BO + 11 * SIZE], a4 LDF [BO + 12 * SIZE], b1 LDF [BO + 13 * SIZE], b2 LDF [BO + 14 * SIZE], b3 LDF [BO + 15 * SIZE], b4 FSUB a1, c02, c02 FSUB a2, c04, c04 FSUB a3, c06, c06 FSUB a4, c08, c08 FSUB b1, c10, c10 FSUB b2, c12, c12 FSUB b3, c14, c14 FSUB b4, c16, c16 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c05, c05 FSUB b2, c06, c06 FSUB b3, c07, c07 FSUB b4, c08, c08 LDF [AO + 8 * SIZE], a1 LDF [AO + 9 * SIZE], a2 LDF [AO + 10 * SIZE], a3 LDF [AO + 11 * SIZE], a4 LDF [AO + 12 * SIZE], b1 LDF [AO + 13 * SIZE], b2 LDF [AO + 14 * SIZE], b3 LDF [AO + 15 * SIZE], b4 FSUB a1, c09, c09 FSUB a2, c10, c10 FSUB a3, c11, c11 FSUB a4, c12, c12 FSUB b1, c13, c13 FSUB b2, c14, c14 FSUB b3, c15, c15 FSUB b4, c16, c16 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c04, c04 FMUL a1, c06, c06 FMUL a1, c08, c08 FMUL a1, c10, c10 FMUL a1, c12, c12 FMUL a1, c14, c14 FMUL a1, c16, c16 FNMSUB (aa2, cc02, cc01, cc01) FNMSUB (aa2, cc04, cc03, cc03) FNMSUB (aa2, cc06, cc05, cc05) FNMSUB (aa2, cc08, cc07, cc07) FNMSUB (aa2, cc10, cc09, cc09) FNMSUB (aa2, cc12, cc11, cc11) FNMSUB (aa2, cc14, cc13, cc13) FNMSUB (aa2, cc16, cc15, cc15) FMUL a3, c01, c01 FMUL a3, c03, c03 FMUL a3, c05, c05 FMUL a3, c07, c07 FMUL a3, c09, c09 FMUL a3, c11, c11 FMUL a3, c13, c13 FMUL a3, c15, c15 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c03, c03 FMUL a1, c05, c05 FMUL a1, c07, c07 FMUL a1, c09, c09 FMUL a1, c11, c11 FMUL a1, c13, c13 FMUL a1, c15, c15 FNMSUB (aa2, cc01, cc02, cc02) FNMSUB (aa2, cc03, cc04, cc04) FNMSUB (aa2, cc05, cc06, cc06) FNMSUB (aa2, cc07, cc08, cc08) FNMSUB (aa2, cc09, cc10, cc10) FNMSUB (aa2, cc11, cc12, cc12) FNMSUB (aa2, cc13, cc14, cc14) FNMSUB (aa2, cc15, cc16, cc16) FMUL a3, c02, c02 FMUL a3, c04, c04 FMUL a3, c06, c06 FMUL a3, c08, c08 FMUL a3, c10, c10 FMUL a3, c12, c12 FMUL a3, c14, c14 FMUL a3, c16, c16 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FMUL a1, c01, c01 FMUL a1, c02, c02 FNMSUB (aa2, cc01, cc03, cc03) FNMSUB (aa2, 
cc02, cc04, cc04) FNMSUB (aa3, cc01, cc05, cc05) FNMSUB (aa3, cc02, cc06, cc06) FNMSUB (aa4, cc01, cc07, cc07) FNMSUB (aa4, cc02, cc08, cc08) FNMSUB (bb1, cc01, cc09, cc09) FNMSUB (bb1, cc02, cc10, cc10) FNMSUB (bb2, cc01, cc11, cc11) FNMSUB (bb2, cc02, cc12, cc12) FNMSUB (bb3, cc01, cc13, cc13) FNMSUB (bb3, cc02, cc14, cc14) FNMSUB (bb4, cc01, cc15, cc15) FNMSUB (bb4, cc02, cc16, cc16) LDF [BO + 9 * SIZE], a1 LDF [BO + 10 * SIZE], a2 LDF [BO + 11 * SIZE], a3 LDF [BO + 12 * SIZE], a4 LDF [BO + 13 * SIZE], b1 LDF [BO + 14 * SIZE], b2 LDF [BO + 15 * SIZE], b3 FMUL a1, c03, c03 FMUL a1, c04, c04 FNMSUB (aa2, cc03, cc05, cc05) FNMSUB (aa2, cc04, cc06, cc06) FNMSUB (aa3, cc03, cc07, cc07) FNMSUB (aa3, cc04, cc08, cc08) FNMSUB (aa4, cc03, cc09, cc09) FNMSUB (aa4, cc04, cc10, cc10) FNMSUB (bb1, cc03, cc11, cc11) FNMSUB (bb1, cc04, cc12, cc12) FNMSUB (bb2, cc03, cc13, cc13) FNMSUB (bb2, cc04, cc14, cc14) FNMSUB (bb3, cc03, cc15, cc15) FNMSUB (bb3, cc04, cc16, cc16) LDF [BO + 18 * SIZE], a1 LDF [BO + 19 * SIZE], a2 LDF [BO + 20 * SIZE], a3 LDF [BO + 21 * SIZE], a4 LDF [BO + 22 * SIZE], b1 LDF [BO + 23 * SIZE], b2 FMUL a1, c05, c05 FMUL a1, c06, c06 FNMSUB (aa2, cc05, cc07, cc07) FNMSUB (aa2, cc06, cc08, cc08) FNMSUB (aa3, cc05, cc09, cc09) FNMSUB (aa3, cc06, cc10, cc10) FNMSUB (aa4, cc05, cc11, cc11) FNMSUB (aa4, cc06, cc12, cc12) FNMSUB (bb1, cc05, cc13, cc13) FNMSUB (bb1, cc06, cc14, cc14) FNMSUB (bb2, cc05, cc15, cc15) FNMSUB (bb2, cc06, cc16, cc16) LDF [BO + 27 * SIZE], a1 LDF [BO + 28 * SIZE], a2 LDF [BO + 29 * SIZE], a3 LDF [BO + 30 * SIZE], a4 LDF [BO + 31 * SIZE], b1 FMUL a1, c07, c07 FMUL a1, c08, c08 FNMSUB (aa2, cc07, cc09, cc09) FNMSUB (aa2, cc08, cc10, cc10) FNMSUB (aa3, cc07, cc11, cc11) FNMSUB (aa3, cc08, cc12, cc12) FNMSUB (aa4, cc07, cc13, cc13) FNMSUB (aa4, cc08, cc14, cc14) FNMSUB (bb1, cc07, cc15, cc15) FNMSUB (bb1, cc08, cc16, cc16) LDF [BO + 36 * SIZE], a1 LDF [BO + 37 * SIZE], a2 LDF [BO + 38 * SIZE], a3 LDF [BO + 39 * SIZE], a4 FMUL a1, c09, c09 FMUL a1, c10, c10 FNMSUB (aa2, cc09, cc11, cc11) FNMSUB (aa2, cc10, cc12, cc12) FNMSUB (aa3, cc09, cc13, cc13) FNMSUB (aa3, cc10, cc14, cc14) FNMSUB (aa4, cc09, cc15, cc15) FNMSUB (aa4, cc10, cc16, cc16) LDF [BO + 45 * SIZE], a1 LDF [BO + 46 * SIZE], a2 LDF [BO + 47 * SIZE], a3 FMUL a1, c11, c11 FMUL a1, c12, c12 FNMSUB (aa2, cc11, cc13, cc13) FNMSUB (aa2, cc12, cc14, cc14) FNMSUB (aa3, cc11, cc15, cc15) FNMSUB (aa3, cc12, cc16, cc16) LDF [BO + 54 * SIZE], a1 LDF [BO + 55 * SIZE], a2 FMUL a1, c13, c13 FMUL a1, c14, c14 FNMSUB (aa2, cc13, cc15, cc15) FNMSUB (aa2, cc14, cc16, cc16) LDF [BO + 63 * SIZE], a1 FMUL a1, c15, c15 FMUL a1, c16, c16 #endif #ifdef RT LDF [BO + 63 * SIZE], a1 LDF [BO + 62 * SIZE], a2 LDF [BO + 61 * SIZE], a3 LDF [BO + 60 * SIZE], a4 LDF [BO + 59 * SIZE], b1 LDF [BO + 58 * SIZE], b2 LDF [BO + 57 * SIZE], b3 LDF [BO + 56 * SIZE], b4 FMUL a1, c16, c16 FMUL a1, c15, c15 FNMSUB (aa2, cc16, cc14, cc14) FNMSUB (aa2, cc15, cc13, cc13) FNMSUB (aa3, cc16, cc12, cc12) FNMSUB (aa3, cc15, cc11, cc11) FNMSUB (aa4, cc16, cc10, cc10) FNMSUB (aa4, cc15, cc09, cc09) FNMSUB (bb1, cc16, cc08, cc08) FNMSUB (bb1, cc15, cc07, cc07) FNMSUB (bb2, cc16, cc06, cc06) FNMSUB (bb2, cc15, cc05, cc05) FNMSUB (bb3, cc16, cc04, cc04) FNMSUB (bb3, cc15, cc03, cc03) FNMSUB (bb4, cc16, cc02, cc02) FNMSUB (bb4, cc15, cc01, cc01) LDF [BO + 54 * SIZE], a1 LDF [BO + 53 * SIZE], a2 LDF [BO + 52 * SIZE], a3 LDF [BO + 51 * SIZE], a4 LDF [BO + 50 * SIZE], b1 LDF [BO + 49 * SIZE], b2 LDF [BO + 48 * SIZE], b3 FMUL a1, c14, c14 FMUL a1, c13, c13 FNMSUB (aa2, 
cc14, cc12, cc12) FNMSUB (aa2, cc13, cc11, cc11) FNMSUB (aa3, cc14, cc10, cc10) FNMSUB (aa3, cc13, cc09, cc09) FNMSUB (aa4, cc14, cc08, cc08) FNMSUB (aa4, cc13, cc07, cc07) FNMSUB (bb1, cc14, cc06, cc06) FNMSUB (bb1, cc13, cc05, cc05) FNMSUB (bb2, cc14, cc04, cc04) FNMSUB (bb2, cc13, cc03, cc03) FNMSUB (bb3, cc14, cc02, cc02) FNMSUB (bb3, cc13, cc01, cc01) LDF [BO + 45 * SIZE], a1 LDF [BO + 44 * SIZE], a2 LDF [BO + 43 * SIZE], a3 LDF [BO + 42 * SIZE], a4 LDF [BO + 41 * SIZE], b1 LDF [BO + 40 * SIZE], b2 FMUL a1, c12, c12 FMUL a1, c11, c11 FNMSUB (aa2, cc12, cc10, cc10) FNMSUB (aa2, cc11, cc09, cc09) FNMSUB (aa3, cc12, cc08, cc08) FNMSUB (aa3, cc11, cc07, cc07) FNMSUB (aa4, cc12, cc06, cc06) FNMSUB (aa4, cc11, cc05, cc05) FNMSUB (bb1, cc12, cc04, cc04) FNMSUB (bb1, cc11, cc03, cc03) FNMSUB (bb2, cc12, cc02, cc02) FNMSUB (bb2, cc11, cc01, cc01) LDF [BO + 36 * SIZE], a1 LDF [BO + 35 * SIZE], a2 LDF [BO + 34 * SIZE], a3 LDF [BO + 33 * SIZE], a4 LDF [BO + 32 * SIZE], b1 FMUL a1, c10, c10 FMUL a1, c09, c09 FNMSUB (aa2, cc10, cc08, cc08) FNMSUB (aa2, cc09, cc07, cc07) FNMSUB (aa3, cc10, cc06, cc06) FNMSUB (aa3, cc09, cc05, cc05) FNMSUB (aa4, cc10, cc04, cc04) FNMSUB (aa4, cc09, cc03, cc03) FNMSUB (bb1, cc10, cc02, cc02) FNMSUB (bb1, cc09, cc01, cc01) LDF [BO + 27 * SIZE], a1 LDF [BO + 26 * SIZE], a2 LDF [BO + 25 * SIZE], a3 LDF [BO + 24 * SIZE], a4 FMUL a1, c08, c08 FMUL a1, c07, c07 FNMSUB (aa2, cc08, cc06, cc06) FNMSUB (aa2, cc07, cc05, cc05) FNMSUB (aa3, cc08, cc04, cc04) FNMSUB (aa3, cc07, cc03, cc03) FNMSUB (aa4, cc08, cc02, cc02) FNMSUB (aa4, cc07, cc01, cc01) LDF [BO + 18 * SIZE], a1 LDF [BO + 17 * SIZE], a2 LDF [BO + 16 * SIZE], a3 FMUL a1, c06, c06 FMUL a1, c05, c05 FNMSUB (aa2, cc06, cc04, cc04) FNMSUB (aa2, cc05, cc03, cc03) FNMSUB (aa3, cc06, cc02, cc02) FNMSUB (aa3, cc05, cc01, cc01) LDF [BO + 9 * SIZE], a1 LDF [BO + 8 * SIZE], a2 FMUL a1, c04, c04 FMUL a1, c03, c03 FNMSUB (aa2, cc04, cc02, cc02) FNMSUB (aa2, cc03, cc01, cc01) LDF [BO + 0 * SIZE], a1 FMUL a1, c02, c02 FMUL a1, c01, c01 #endif #ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 add C3, -2 * SIZE, C3 add C4, -2 * SIZE, C4 add C5, -2 * SIZE, C5 add C6, -2 * SIZE, C6 add C7, -2 * SIZE, C7 add C8, -2 * SIZE, C8 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c07, [BO + 3 * SIZE] STF c09, [BO + 4 * SIZE] STF c11, [BO + 5 * SIZE] STF c13, [BO + 6 * SIZE] STF c15, [BO + 7 * SIZE] STF c02, [BO + 8 * SIZE] STF c04, [BO + 9 * SIZE] STF c06, [BO + 10 * SIZE] STF c08, [BO + 11 * SIZE] STF c10, [BO + 12 * SIZE] STF c12, [BO + 13 * SIZE] STF c14, [BO + 14 * SIZE] STF c16, [BO + 15 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] STF c05, [AO + 4 * SIZE] STF c06, [AO + 5 * SIZE] STF c07, [AO + 6 * SIZE] STF c08, [AO + 7 * SIZE] STF c09, [AO + 8 * SIZE] STF c10, [AO + 9 * SIZE] STF c11, [AO + 10 * SIZE] STF c12, [AO + 11 * SIZE] STF c13, [AO + 12 * SIZE] STF c14, [AO + 13 * SIZE] STF c15, [AO + 14 * SIZE] STF c16, [AO + 15 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C2 + 0 * SIZE] STF c04, [C2 + 1 * SIZE] STF c05, [C3 + 0 * SIZE] STF c06, [C3 + 1 * SIZE] STF c07, [C4 + 0 * SIZE] STF c08, [C4 + 1 * SIZE] STF c09, [C5 + 0 * SIZE] STF c10, [C5 + 1 * SIZE] STF c11, [C6 + 0 * SIZE] STF c12, [C6 + 1 * SIZE] STF c13, [C7 + 0 * SIZE] STF c14, [C7 + 1 * SIZE] STF c15, [C8 + 0 * SIZE] STF c16, [C8 + 1 * SIZE] #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 add C3, 2 * SIZE, 
C3 add C4, 2 * SIZE, C4 add C5, 2 * SIZE, C5 add C6, 2 * SIZE, C6 add C7, 2 * SIZE, C7 add C8, 2 * SIZE, C8 #endif #ifdef RT sll K, BASE_SHIFT + 1, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 3, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL12 nop .align 4 .LL20: and M, 1, I cmp I, 0 ble,pn %icc, .LL29 nop #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 0, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TEMP1 sll KK, BASE_SHIFT + 3, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 FCLR (cc01) LDF [BO + 1 * SIZE], b2 FCLR (cc03) LDF [BO + 2 * SIZE], b3 FCLR (cc05) LDF [BO + 3 * SIZE], b4 FCLR (cc07) LDF [BO + 4 * SIZE], b5 FCLR (cc09) LDF [BO + 5 * SIZE], b6 FCLR (cc11) LDF [BO + 6 * SIZE], b7 FCLR (cc13) LDF [BO + 7 * SIZE], b8 FCLR (cc15) #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL25 LDF [BO + 8 * SIZE], b9 .align 4 .LL23: prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY add L, -1, L FMADD (aa1, bb1, cc01, cc01) LDF [BO + 16 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) LDF [BO + 10 * SIZE], b3 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [BO + 12 * SIZE], b5 FMADD (aa1, bb6, cc11, cc11) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb7, cc13, cc13) LDF [BO + 14 * SIZE], b7 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 15 * SIZE], b8 FMADD (aa2, bb9, cc01, cc01) LDF [BO + 24 * SIZE], b9 FMADD (aa2, bb2, cc03, cc03) LDF [BO + 17 * SIZE], b2 FMADD (aa2, bb3, cc05, cc05) LDF [BO + 18 * SIZE], b3 FMADD (aa2, bb4, cc07, cc07) LDF [BO + 19 * SIZE], b4 FMADD (aa2, bb5, cc09, cc09) LDF [BO + 20 * SIZE], b5 FMADD (aa2, bb6, cc11, cc11) LDF [BO + 21 * SIZE], b6 FMADD (aa2, bb7, cc13, cc13) LDF [BO + 22 * SIZE], b7 FMADD (aa2, bb8, cc15, cc15) LDF [BO + 23 * SIZE], b8 LDF [AO + 4 * SIZE], a1 LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb1, cc01, cc01) LDF [BO + 32 * SIZE], b1 FMADD (aa3, bb2, cc03, cc03) LDF [BO + 25 * SIZE], b2 FMADD (aa3, bb3, cc05, cc05) LDF [BO + 26 * SIZE], b3 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 27 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [BO + 28 * SIZE], b5 FMADD (aa3, bb6, cc11, cc11) LDF [BO + 29 * SIZE], b6 FMADD (aa3, bb7, cc13, cc13) LDF [BO + 30 * SIZE], b7 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 31 * SIZE], b8 FMADD (aa4, bb9, cc01, cc01) LDF [BO + 40 * SIZE], b9 FMADD (aa4, bb2, cc03, cc03) LDF [BO + 33 * SIZE], b2 FMADD (aa4, bb3, cc05, cc05) LDF [BO + 34 * SIZE], b3 FMADD (aa4, bb4, cc07, cc07) LDF [BO + 35 * SIZE], b4 FMADD (aa4, bb5, cc09, cc09) LDF [BO + 36 * SIZE], b5 FMADD (aa4, bb6, cc11, cc11) LDF [BO + 37 * SIZE], b6 FMADD (aa4, bb7, cc13, cc13) LDF [BO + 38 * SIZE], b7 FMADD (aa4, bb8, cc15, cc15) LDF [BO + 39 * SIZE], b8 LDF [AO + 6 * SIZE], a3 LDF [AO + 7 * SIZE], a4 add AO, 4 * SIZE, AO cmp L, 0 bg,pt %icc, .LL23 add BO, 32 * SIZE, BO .align 4 .LL25: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL28 nop .align 4 .LL27: FMADD (aa1, bb1, cc01, cc01) LDF [BO + 8 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) LDF [BO + 10 * SIZE], b3 FMADD (aa1, 
bb4, cc07, cc07) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [BO + 12 * SIZE], b5 FMADD (aa1, bb6, cc11, cc11) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb7, cc13, cc13) LDF [BO + 14 * SIZE], b7 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 15 * SIZE], b8 LDF [AO + 1 * SIZE], a1 add AO, 1 * SIZE, AO add L, -1, L cmp L, 0 bg,pt %icc, .LL27 add BO, 8 * SIZE, BO .align 4 .LL28: #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 8, TEMP1 #endif sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 3, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 FSUB b1, c09, c09 FSUB b2, c11, c11 FSUB b3, c13, c13 FSUB b4, c15, c15 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 FSUB b1, c09, c09 FSUB b2, c11, c11 FSUB b3, c13, c13 FSUB b4, c15, c15 #endif #if defined(LN) || defined(LT) LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c03, c03 FMUL a1, c05, c05 FMUL a1, c07, c07 FMUL a1, c09, c09 FMUL a1, c11, c11 FMUL a1, c13, c13 FMUL a1, c15, c15 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FMUL a1, c01, c01 FNMSUB (aa2, cc01, cc03, cc03) FNMSUB (aa3, cc01, cc05, cc05) FNMSUB (aa4, cc01, cc07, cc07) FNMSUB (bb1, cc01, cc09, cc09) FNMSUB (bb2, cc01, cc11, cc11) FNMSUB (bb3, cc01, cc13, cc13) FNMSUB (bb4, cc01, cc15, cc15) LDF [BO + 9 * SIZE], a1 LDF [BO + 10 * SIZE], a2 LDF [BO + 11 * SIZE], a3 LDF [BO + 12 * SIZE], a4 LDF [BO + 13 * SIZE], b1 LDF [BO + 14 * SIZE], b2 LDF [BO + 15 * SIZE], b3 FMUL a1, c03, c03 FNMSUB (aa2, cc03, cc05, cc05) FNMSUB (aa3, cc03, cc07, cc07) FNMSUB (aa4, cc03, cc09, cc09) FNMSUB (bb1, cc03, cc11, cc11) FNMSUB (bb2, cc03, cc13, cc13) FNMSUB (bb3, cc03, cc15, cc15) LDF [BO + 18 * SIZE], a1 LDF [BO + 19 * SIZE], a2 LDF [BO + 20 * SIZE], a3 LDF [BO + 21 * SIZE], a4 LDF [BO + 22 * SIZE], b1 LDF [BO + 23 * SIZE], b2 FMUL a1, c05, c05 FNMSUB (aa2, cc05, cc07, cc07) FNMSUB (aa3, cc05, cc09, cc09) FNMSUB (aa4, cc05, cc11, cc11) FNMSUB (bb1, cc05, cc13, cc13) FNMSUB (bb2, cc05, cc15, cc15) LDF [BO + 27 * SIZE], a1 LDF [BO + 28 * SIZE], a2 LDF [BO + 29 * SIZE], a3 LDF [BO + 30 * SIZE], a4 LDF [BO + 31 * SIZE], b1 FMUL a1, c07, c07 FNMSUB (aa2, cc07, cc09, cc09) FNMSUB (aa3, cc07, cc11, cc11) FNMSUB (aa4, cc07, cc13, cc13) FNMSUB (bb1, cc07, cc15, cc15) LDF [BO + 36 * SIZE], a1 LDF [BO + 37 * SIZE], a2 LDF [BO + 38 * SIZE], a3 LDF [BO + 39 * SIZE], a4 FMUL a1, c09, c09 FNMSUB (aa2, cc09, cc11, cc11) FNMSUB (aa3, cc09, cc13, cc13) FNMSUB (aa4, cc09, cc15, cc15) LDF [BO + 45 * SIZE], a1 LDF [BO + 46 * SIZE], a2 LDF [BO + 47 * SIZE], a3 FMUL a1, c11, c11 FNMSUB (aa2, cc11, cc13, cc13) FNMSUB (aa3, cc11, cc15, cc15) LDF [BO + 54 * SIZE], a1 LDF [BO + 55 * SIZE], a2 FMUL a1, c13, c13 FNMSUB (aa2, cc13, cc15, cc15) LDF [BO + 63 * SIZE], a1 FMUL a1, c15, c15 #endif #ifdef RT LDF [BO + 63 * SIZE], a1 LDF [BO + 62 * SIZE], a2 LDF [BO + 61 * SIZE], a3 LDF [BO + 60 * SIZE], a4 LDF [BO + 59 * SIZE], 
b1 LDF [BO + 58 * SIZE], b2 LDF [BO + 57 * SIZE], b3 LDF [BO + 56 * SIZE], b4 FMUL a1, c15, c15 FNMSUB (aa2, cc15, cc13, cc13) FNMSUB (aa3, cc15, cc11, cc11) FNMSUB (aa4, cc15, cc09, cc09) FNMSUB (bb1, cc15, cc07, cc07) FNMSUB (bb2, cc15, cc05, cc05) FNMSUB (bb3, cc15, cc03, cc03) FNMSUB (bb4, cc15, cc01, cc01) LDF [BO + 54 * SIZE], a1 LDF [BO + 53 * SIZE], a2 LDF [BO + 52 * SIZE], a3 LDF [BO + 51 * SIZE], a4 LDF [BO + 50 * SIZE], b1 LDF [BO + 49 * SIZE], b2 LDF [BO + 48 * SIZE], b3 FMUL a1, c13, c13 FNMSUB (aa2, cc13, cc11, cc11) FNMSUB (aa3, cc13, cc09, cc09) FNMSUB (aa4, cc13, cc07, cc07) FNMSUB (bb1, cc13, cc05, cc05) FNMSUB (bb2, cc13, cc03, cc03) FNMSUB (bb3, cc13, cc01, cc01) LDF [BO + 45 * SIZE], a1 LDF [BO + 44 * SIZE], a2 LDF [BO + 43 * SIZE], a3 LDF [BO + 42 * SIZE], a4 LDF [BO + 41 * SIZE], b1 LDF [BO + 40 * SIZE], b2 FMUL a1, c11, c11 FNMSUB (aa2, cc11, cc09, cc09) FNMSUB (aa3, cc11, cc07, cc07) FNMSUB (aa4, cc11, cc05, cc05) FNMSUB (bb1, cc11, cc03, cc03) FNMSUB (bb2, cc11, cc01, cc01) LDF [BO + 36 * SIZE], a1 LDF [BO + 35 * SIZE], a2 LDF [BO + 34 * SIZE], a3 LDF [BO + 33 * SIZE], a4 LDF [BO + 32 * SIZE], b1 FMUL a1, c09, c09 FNMSUB (aa2, cc09, cc07, cc07) FNMSUB (aa3, cc09, cc05, cc05) FNMSUB (aa4, cc09, cc03, cc03) FNMSUB (bb1, cc09, cc01, cc01) LDF [BO + 27 * SIZE], a1 LDF [BO + 26 * SIZE], a2 LDF [BO + 25 * SIZE], a3 LDF [BO + 24 * SIZE], a4 FMUL a1, c07, c07 FNMSUB (aa2, cc07, cc05, cc05) FNMSUB (aa3, cc07, cc03, cc03) FNMSUB (aa4, cc07, cc01, cc01) LDF [BO + 18 * SIZE], a1 LDF [BO + 17 * SIZE], a2 LDF [BO + 16 * SIZE], a3 FMUL a1, c05, c05 FNMSUB (aa2, cc05, cc03, cc03) FNMSUB (aa3, cc05, cc01, cc01) LDF [BO + 9 * SIZE], a1 LDF [BO + 8 * SIZE], a2 FMUL a1, c03, c03 FNMSUB (aa2, cc03, cc01, cc01) LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LN add C1, -1 * SIZE, C1 add C2, -1 * SIZE, C2 add C3, -1 * SIZE, C3 add C4, -1 * SIZE, C4 add C5, -1 * SIZE, C5 add C6, -1 * SIZE, C6 add C7, -1 * SIZE, C7 add C8, -1 * SIZE, C8 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c07, [BO + 3 * SIZE] STF c09, [BO + 4 * SIZE] STF c11, [BO + 5 * SIZE] STF c13, [BO + 6 * SIZE] STF c15, [BO + 7 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c03, [AO + 1 * SIZE] STF c05, [AO + 2 * SIZE] STF c07, [AO + 3 * SIZE] STF c09, [AO + 4 * SIZE] STF c11, [AO + 5 * SIZE] STF c13, [AO + 6 * SIZE] STF c15, [AO + 7 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c03, [C2 + 0 * SIZE] STF c05, [C3 + 0 * SIZE] STF c07, [C4 + 0 * SIZE] STF c09, [C5 + 0 * SIZE] STF c11, [C6 + 0 * SIZE] STF c13, [C7 + 0 * SIZE] STF c15, [C8 + 0 * SIZE] #ifdef RT sll K, BASE_SHIFT + 0, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 3, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .align 4 .LL29: #ifdef LN sll K, BASE_SHIFT + 3, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 8, KK #endif #ifdef RT sub KK, 8, KK #endif add J, -1, J cmp J, 0 bg,pt %icc, .LL11 nop .align 4 .LL30: and N, 4, J cmp J, 0 ble,pn %icc, .LL50 nop #ifdef RT sll K, BASE_SHIFT + 2, TEMP1 sub B, TEMP1, B #endif #ifndef RT mov C, C1 add C, LDC, C2 add C2, LDC, C3 add C3, LDC, C4 add C4, LDC, C #else sub C, LDC, C4 sub C4, LDC, C3 sub C3, LDC, C2 sub C2, LDC, C1 sub C2, LDC, C #endif #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, 
AORIG #else mov A, AO #endif sra M, 1, I cmp I, 0 ble,pn %icc, .LL40 nop .align 4 .LL32: #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 1, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TEMP1 sll KK, BASE_SHIFT + 2, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 LDF [BO + 3 * SIZE], b4 LDF [BO + 4 * SIZE], b5 LDF [BO + 5 * SIZE], b6 FCLR (cc01) LDF [BO + 6 * SIZE], b7 FCLR (cc02) LDF [BO + 7 * SIZE], b8 FCLR (cc03) LDF [BO + 8 * SIZE], b9 FCLR (cc04) prefetch [C1 + 2 * SIZE], 3 FCLR (cc05) prefetch [C2 + 2 * SIZE], 3 FCLR (cc06) prefetch [C3 + 2 * SIZE], 3 FCLR (cc07) prefetch [C4 + 2 * SIZE], 3 FCLR (cc08) #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL35 nop .align 4 .LL33: FMADD (aa1, bb1, cc01, cc01) LDF [AO + 2 * SIZE], a3 FMADD (aa2, bb1, cc02, cc02) LDF [AO + 3 * SIZE], a4 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 16 * SIZE], b1 FMADD (aa2, bb2, cc04, cc04) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb3, cc06, cc06) add L, -1, L FMADD (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD (aa3, bb5, cc01, cc01) LDF [AO + 4 * SIZE], a1 FMADD (aa4, bb5, cc02, cc02) LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb6, cc03, cc03) LDF [BO + 12 * SIZE], b5 FMADD (aa4, bb6, cc04, cc04) LDF [BO + 13 * SIZE], b6 FMADD (aa3, bb7, cc05, cc05) cmp L, 0 FMADD (aa4, bb7, cc06, cc06) add AO, 8 * SIZE, AO FMADD (aa3, bb8, cc07, cc07) LDF [BO + 14 * SIZE], b7 FMADD (aa4, bb8, cc08, cc08) LDF [BO + 15 * SIZE], b8 FMADD (aa1, bb9, cc01, cc01) LDF [AO - 2 * SIZE], a3 FMADD (aa2, bb9, cc02, cc02) LDF [AO - 1 * SIZE], a4 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 24 * SIZE], b9 FMADD (aa2, bb2, cc04, cc04) LDF [BO + 17 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) add BO, 16 * SIZE, BO FMADD (aa2, bb3, cc06, cc06) nop FMADD (aa1, bb4, cc07, cc07) LDF [BO + 2 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD (aa3, bb5, cc01, cc01) LDF [AO + 0 * SIZE], a1 FMADD (aa4, bb5, cc02, cc02) LDF [AO + 1 * SIZE], a2 FMADD (aa3, bb6, cc03, cc03) LDF [BO + 4 * SIZE], b5 FMADD (aa4, bb6, cc04, cc04) LDF [BO + 5 * SIZE], b6 FMADD (aa3, bb7, cc05, cc05) nop FMADD (aa4, bb7, cc06, cc06) LDF [BO + 6 * SIZE], b7 FMADD (aa3, bb8, cc07, cc07) FMADD (aa4, bb8, cc08, cc08) bg,pt %icc, .LL33 LDF [BO + 7 * SIZE], b8 .align 4 .LL35: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL38 nop .align 4 .LL37: FMADD (aa1, bb1, cc01, cc01) add L, -1, L FMADD (aa2, bb1, cc02, cc02) LDF [BO + 4 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) add AO, 2 * SIZE, AO FMADD (aa2, bb2, cc04, cc04) LDF [BO + 5 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) cmp L, 0 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 6 * SIZE], b3 FMADD (aa1, bb4, cc07, cc07) LDF [AO + 0 * SIZE], a1 FMADD (aa2, bb4, cc08, cc08) LDF [AO + 1 * SIZE], a2 LDF [BO + 7 * SIZE], b4 bg,pt %icc, .LL37 add BO, 4 * SIZE, BO .align 4 .LL38: #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 4, TEMP1 #endif sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 2, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 
LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 FSUB b1, c02, c02 FSUB b2, c04, c04 FSUB b3, c06, c06 FSUB b4, c08, c08 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c05, c05 FSUB b2, c06, c06 FSUB b3, c07, c07 FSUB b4, c08, c08 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c04, c04 FMUL a1, c06, c06 FMUL a1, c08, c08 FNMSUB (aa2, cc02, cc01, cc01) FNMSUB (aa2, cc04, cc03, cc03) FNMSUB (aa2, cc06, cc05, cc05) FNMSUB (aa2, cc08, cc07, cc07) FMUL a3, c01, c01 FMUL a3, c03, c03 FMUL a3, c05, c05 FMUL a3, c07, c07 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c03, c03 FMUL a1, c05, c05 FMUL a1, c07, c07 FNMSUB (aa2, cc01, cc02, cc02) FNMSUB (aa2, cc03, cc04, cc04) FNMSUB (aa2, cc05, cc06, cc06) FNMSUB (aa2, cc07, cc08, cc08) FMUL a3, c02, c02 FMUL a3, c04, c04 FMUL a3, c06, c06 FMUL a3, c08, c08 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a1, c02, c02 FNMSUB (aa2, cc01, cc03, cc03) FNMSUB (aa2, cc02, cc04, cc04) FNMSUB (aa3, cc01, cc05, cc05) FNMSUB (aa3, cc02, cc06, cc06) FNMSUB (aa4, cc01, cc07, cc07) FNMSUB (aa4, cc02, cc08, cc08) LDF [BO + 5 * SIZE], a1 LDF [BO + 6 * SIZE], a2 LDF [BO + 7 * SIZE], a3 FMUL a1, c03, c03 FMUL a1, c04, c04 FNMSUB (aa2, cc03, cc05, cc05) FNMSUB (aa2, cc04, cc06, cc06) FNMSUB (aa3, cc03, cc07, cc07) FNMSUB (aa3, cc04, cc08, cc08) LDF [BO + 10 * SIZE], a1 LDF [BO + 11 * SIZE], a2 FMUL a1, c05, c05 FMUL a1, c06, c06 FNMSUB (aa2, cc05, cc07, cc07) FNMSUB (aa2, cc06, cc08, cc08) LDF [BO + 15 * SIZE], a1 FMUL a1, c07, c07 FMUL a1, c08, c08 #endif #ifdef RT LDF [BO + 15 * SIZE], a1 LDF [BO + 14 * SIZE], a2 LDF [BO + 13 * SIZE], a3 LDF [BO + 12 * SIZE], a4 FMUL a1, c08, c08 FMUL a1, c07, c07 FNMSUB (aa2, cc08, cc06, cc06) FNMSUB (aa2, cc07, cc05, cc05) FNMSUB (aa3, cc08, cc04, cc04) FNMSUB (aa3, cc07, cc03, cc03) FNMSUB (aa4, cc08, cc02, cc02) FNMSUB (aa4, cc07, cc01, cc01) LDF [BO + 10 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 8 * SIZE], a3 FMUL a1, c06, c06 FMUL a1, c05, c05 FNMSUB (aa2, cc06, cc04, cc04) FNMSUB (aa2, cc05, cc03, cc03) FNMSUB (aa3, cc06, cc02, cc02) FNMSUB (aa3, cc05, cc01, cc01) LDF [BO + 5 * SIZE], a1 LDF [BO + 4 * SIZE], a2 FMUL a1, c04, c04 FMUL a1, c03, c03 FNMSUB (aa2, cc04, cc02, cc02) FNMSUB (aa2, cc03, cc01, cc01) LDF [BO + 0 * SIZE], a1 FMUL a1, c02, c02 FMUL a1, c01, c01 #endif #ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 add C3, -2 * SIZE, C3 add C4, -2 * SIZE, C4 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c07, [BO + 3 * SIZE] STF c02, [BO + 4 * SIZE] STF c04, [BO + 5 * SIZE] STF c06, [BO + 6 * SIZE] STF c08, [BO + 7 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] STF c05, [AO + 4 * SIZE] STF c06, [AO + 5 * SIZE] STF c07, [AO + 6 * SIZE] STF c08, [AO + 7 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C2 + 0 * SIZE] STF c04, [C2 + 1 * SIZE] STF c05, [C3 + 0 * SIZE] STF c06, [C3 + 1 * SIZE] STF c07, [C4 + 0 * 
SIZE] STF c08, [C4 + 1 * SIZE] #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 add C3, 2 * SIZE, C3 add C4, 2 * SIZE, C4 #endif #ifdef RT sll K, BASE_SHIFT + 1, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 2, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL32 nop .LL40: and M, 1, I cmp I, 0 ble,pn %icc, .LL49 nop #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 0, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TEMP1 sll KK, BASE_SHIFT + 2, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 LDF [BO + 3 * SIZE], b4 LDF [BO + 4 * SIZE], b5 LDF [BO + 5 * SIZE], b6 FCLR (cc01) LDF [BO + 6 * SIZE], b7 FCLR (cc03) LDF [BO + 7 * SIZE], b8 FCLR (cc05) LDF [BO + 8 * SIZE], b9 FCLR (cc07) #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL45 nop .LL43: prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY add L, -1, L FMADD (aa1, bb1, cc01, cc01) LDF [BO + 16 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) LDF [BO + 10 * SIZE], b3 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 11 * SIZE], b4 LDF [AO + 4 * SIZE], a1 cmp L, 0 FMADD (aa2, bb5, cc01, cc01) LDF [BO + 12 * SIZE], b5 FMADD (aa2, bb6, cc03, cc03) LDF [BO + 13 * SIZE], b6 FMADD (aa2, bb7, cc05, cc05) LDF [BO + 14 * SIZE], b7 FMADD (aa2, bb8, cc07, cc07) LDF [BO + 15 * SIZE], b8 LDF [AO + 5 * SIZE], a2 add AO, 4 * SIZE, AO FMADD (aa3, bb9, cc01, cc01) LDF [BO + 24 * SIZE], b9 FMADD (aa3, bb2, cc03, cc03) LDF [BO + 17 * SIZE], b2 FMADD (aa3, bb3, cc05, cc05) LDF [BO + 18 * SIZE], b3 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 19 * SIZE], b4 LDF [AO + 2 * SIZE], a3 add BO, 16 * SIZE, BO FMADD (aa4, bb5, cc01, cc01) LDF [BO + 4 * SIZE], b5 FMADD (aa4, bb6, cc03, cc03) LDF [BO + 5 * SIZE], b6 FMADD (aa4, bb7, cc05, cc05) LDF [BO + 6 * SIZE], b7 FMADD (aa4, bb8, cc07, cc07) LDF [BO + 7 * SIZE], b8 bg,pt %icc, .LL43 LDF [AO + 3 * SIZE], a4 .align 4 .LL45: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL48 nop .align 4 .LL47: FMADD (aa1, bb1, cc01, cc01) LDF [BO + 4 * SIZE], b1 add L, -1, L FMADD (aa1, bb2, cc03, cc03) LDF [BO + 5 * SIZE], b2 add AO, 1 * SIZE, AO FMADD (aa1, bb3, cc05, cc05) LDF [BO + 6 * SIZE], b3 cmp L, 0 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 7 * SIZE], b4 add BO, 4 * SIZE, BO bg,pt %icc, .LL47 LDF [AO + 0 * SIZE], a1 .align 4 .LL48: #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 4, TEMP1 #endif sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 2, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 #endif #if defined(LN) || defined(LT) LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c03, c03 FMUL a1, c05, c05 FMUL a1, c07, c07 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF 
[BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FMUL a1, c01, c01 FNMSUB (aa2, cc01, cc03, cc03) FNMSUB (aa3, cc01, cc05, cc05) FNMSUB (aa4, cc01, cc07, cc07) LDF [BO + 5 * SIZE], a1 LDF [BO + 6 * SIZE], a2 LDF [BO + 7 * SIZE], a3 FMUL a1, c03, c03 FNMSUB (aa2, cc03, cc05, cc05) FNMSUB (aa3, cc03, cc07, cc07) LDF [BO + 10 * SIZE], a1 LDF [BO + 11 * SIZE], a2 FMUL a1, c05, c05 FNMSUB (aa2, cc05, cc07, cc07) LDF [BO + 15 * SIZE], a1 FMUL a1, c07, c07 #endif #ifdef RT LDF [BO + 15 * SIZE], a1 LDF [BO + 14 * SIZE], a2 LDF [BO + 13 * SIZE], a3 LDF [BO + 12 * SIZE], a4 FMUL a1, c07, c07 FNMSUB (aa2, cc07, cc05, cc05) FNMSUB (aa3, cc07, cc03, cc03) FNMSUB (aa4, cc07, cc01, cc01) LDF [BO + 10 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 8 * SIZE], a3 FMUL a1, c05, c05 FNMSUB (aa2, cc05, cc03, cc03) FNMSUB (aa3, cc05, cc01, cc01) LDF [BO + 5 * SIZE], a1 LDF [BO + 4 * SIZE], a2 FMUL a1, c03, c03 FNMSUB (aa2, cc03, cc01, cc01) LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LN add C1, -1 * SIZE, C1 add C2, -1 * SIZE, C2 add C3, -1 * SIZE, C3 add C4, -1 * SIZE, C4 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c07, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c03, [AO + 1 * SIZE] STF c05, [AO + 2 * SIZE] STF c07, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c03, [C2 + 0 * SIZE] STF c05, [C3 + 0 * SIZE] STF c07, [C4 + 0 * SIZE] #ifdef RT sll K, BASE_SHIFT + 0, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 2, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .align 4 .LL49: #ifdef LN sll K, BASE_SHIFT + 2, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 4, KK #endif #ifdef RT sub KK, 4, KK #endif .align 4 .LL50: and N, 2, J cmp J, 0 ble,pn %icc, .LL70 nop #ifdef RT sll K, BASE_SHIFT + 1, TEMP1 sub B, TEMP1, B #endif #ifndef RT mov C, C1 add C, LDC, C2 add C2, LDC, C #else sub C, LDC, C2 sub C2, LDC, C1 sub C2, LDC, C #endif #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif sra M, 1, I cmp I, 0 ble,pn %icc, .LL60 nop .align 4 .LL52: #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 1, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TEMP1 sll KK, BASE_SHIFT + 1, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 FCLR (cc01) LDF [BO + 3 * SIZE], b4 FCLR (cc02) LDF [BO + 4 * SIZE], b5 FCLR (cc03) LDF [BO + 5 * SIZE], b6 FCLR (cc04) LDF [BO + 6 * SIZE], b7 FCLR (cc05) LDF [BO + 7 * SIZE], b8 FCLR (cc06) prefetch [C1 + 2 * SIZE], 3 FCLR (cc07) prefetch [C2 + 2 * SIZE], 3 FCLR (cc08) #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL55 nop .align 4 .LL53: FMADD (aa1, bb1, cc01, cc01) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb1, cc02, cc02) LDF [BO + 8 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [AO + 4 * SIZE], a1 FMADD (aa2, bb2, cc04, cc04) LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb3, cc01, cc01) LDF [BO + 9 * SIZE], b2 FMADD (aa4, bb3, cc02, cc02) LDF [BO + 10 * SIZE], b3 FMADD (aa3, bb4, cc03, cc03) LDF [AO + 6 * SIZE], a3 
FMADD (aa4, bb4, cc04, cc04) LDF [AO + 7 * SIZE], a4 FMADD (aa1, bb5, cc01, cc01) LDF [BO + 11 * SIZE], b4 FMADD (aa2, bb5, cc02, cc02) LDF [BO + 12 * SIZE], b5 FMADD (aa1, bb6, cc03, cc03) LDF [AO + 8 * SIZE], a1 FMADD (aa2, bb6, cc04, cc04) LDF [AO + 9 * SIZE], a2 FMADD (aa3, bb7, cc01, cc01) LDF [BO + 13 * SIZE], b6 FMADD (aa4, bb7, cc02, cc02) LDF [BO + 14 * SIZE], b7 FMADD (aa3, bb8, cc03, cc03) LDF [AO + 10 * SIZE], a3 FMADD (aa4, bb8, cc04, cc04) LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO add L, -1, L add BO, 8 * SIZE, BO cmp L, 0 bg,pt %icc, .LL53 LDF [BO + 7 * SIZE], b8 .align 4 .LL55: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL58 nop .align 4 .LL57: FMADD (aa1, bb1, cc01, cc01) add L, -1, L FMADD (aa2, bb1, cc02, cc02) LDF [BO + 2 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [AO + 2 * SIZE], a1 FMADD (aa2, bb2, cc04, cc04) LDF [AO + 3 * SIZE], a2 add AO, 2 * SIZE, AO cmp L, 0 add BO, 2 * SIZE, BO bg,pt %icc, .LL57 LDF [BO + 1 * SIZE], b2 .align 4 .LL58: #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 2, TEMP1 #endif sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 1, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c02, c02 FSUB a4, c04, c04 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c04, c04 FNMSUB (aa2, cc02, cc01, cc01) FNMSUB (aa2, cc04, cc03, cc03) FMUL a3, c01, c01 FMUL a3, c03, c03 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c03, c03 FNMSUB (aa2, cc01, cc02, cc02) FNMSUB (aa2, cc03, cc04, cc04) FMUL a3, c02, c02 FMUL a3, c04, c04 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FMUL a1, c01, c01 FMUL a1, c02, c02 FNMSUB (aa2, cc01, cc03, cc03) FNMSUB (aa2, cc02, cc04, cc04) LDF [BO + 3 * SIZE], a1 FMUL a1, c03, c03 FMUL a1, c04, c04 #endif #ifdef RT LDF [BO + 3 * SIZE], a1 LDF [BO + 2 * SIZE], a2 FMUL a1, c04, c04 FMUL a1, c03, c03 FNMSUB (aa2, cc04, cc02, cc02) FNMSUB (aa2, cc03, cc01, cc01) LDF [BO + 0 * SIZE], a1 FMUL a1, c02, c02 FMUL a1, c01, c01 #endif #ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] STF c02, [BO + 2 * SIZE] STF c04, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C2 + 0 * SIZE] STF c04, [C2 + 1 * SIZE] #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 #endif #ifdef RT sll K, BASE_SHIFT + 1, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 1, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL52 nop .align 4 .LL60: and M, 1, I cmp I, 0 ble,pn %icc, .LL69 nop #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 0, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TEMP1 sll KK, BASE_SHIFT + 1, TEMP2 
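/* Descriptive note (added): TEMP1 = KK * SIZE and TEMP2 = KK * 2 * SIZE here;
   the adds that follow offset AO and BO by those amounts, i.e. they skip the
   first KK steps of the 1-row A panel and the 2-column B panel. */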
add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 LDF [BO + 3 * SIZE], b4 LDF [BO + 4 * SIZE], b5 LDF [BO + 5 * SIZE], b6 LDF [BO + 6 * SIZE], b7 FCLR (cc01) LDF [BO + 7 * SIZE], b8 FCLR (cc03) #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL65 nop .align 4 .LL63: prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY add L, -1, L FMADD (aa1, bb1, cc01, cc01) LDF [BO + 8 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 9 * SIZE], b2 LDF [AO + 4 * SIZE], a1 cmp L, 0 FMADD (aa2, bb3, cc01, cc01) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc03, cc03) LDF [BO + 11 * SIZE], b4 LDF [AO + 5 * SIZE], a2 add AO, 4 * SIZE, AO FMADD (aa3, bb5, cc01, cc01) LDF [BO + 12 * SIZE], b5 FMADD (aa3, bb6, cc03, cc03) LDF [BO + 13 * SIZE], b6 LDF [AO + 2 * SIZE], a3 add BO, 8 * SIZE, BO FMADD (aa4, bb7, cc01, cc01) LDF [BO + 6 * SIZE], b7 FMADD (aa4, bb8, cc03, cc03) LDF [BO + 7 * SIZE], b8 bg,pt %icc, .LL63 LDF [AO + 3 * SIZE], a4 .align 4 .LL65: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL68 nop .align 4 .LL67: FMADD (aa1, bb1, cc01, cc01) LDF [BO + 2 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 3 * SIZE], b2 LDF [AO + 1 * SIZE], a1 add L, -1, L add AO, 1 * SIZE, AO cmp L, 0 bg,pt %icc, .LL67 add BO, 2 * SIZE, BO .align 4 .LL68: #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 2, TEMP1 #endif sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 1, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c03, c03 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c03, c03 #endif #if defined(LN) || defined(LT) LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c03, c03 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FMUL a1, c01, c01 FNMSUB (aa2, cc01, cc03, cc03) LDF [BO + 3 * SIZE], a1 FMUL a1, c03, c03 #endif #ifdef RT LDF [BO + 3 * SIZE], a1 LDF [BO + 2 * SIZE], a2 FMUL a1, c03, c03 FNMSUB (aa2, cc03, cc01, cc01) LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LN add C1, -1 * SIZE, C1 add C2, -1 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c03, [AO + 1 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c03, [C2 + 0 * SIZE] #ifdef RT sll K, BASE_SHIFT + 0, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 1, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .align 4 .LL69: #ifdef LN sll K, BASE_SHIFT + 1, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 2, KK #endif #ifdef RT sub KK, 2, KK #endif .align 4 .LL70: and N, 1, J cmp J, 0 ble,pn %icc, .LL999 nop #ifdef RT sll K, BASE_SHIFT, TEMP1 sub B, TEMP1, B #endif #ifndef RT mov C, C1 add C1, LDC, C #else sub C, LDC, C1 sub C, LDC, C #endif #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif sra M, 1, I cmp I, 0 ble,pn %icc, .LL80 nop .align 4 .LL72: #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 
1, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TEMP1 sll KK, BASE_SHIFT + 0, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 FCLR (cc01) LDF [BO + 3 * SIZE], b4 FCLR (cc02) prefetch [C1 + 2 * SIZE], 3 #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL75 nop .LL73: prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY add L, -1, L FMADD (aa1, bb1, cc01, cc01) LDF [AO + 4 * SIZE], a1 FMADD (aa2, bb1, cc02, cc02) LDF [AO + 5 * SIZE], a2 LDF [BO + 4 * SIZE], b1 cmp L, 0 FMADD (aa3, bb2, cc01, cc01) LDF [AO + 6 * SIZE], a3 FMADD (aa4, bb2, cc02, cc02) LDF [AO + 7 * SIZE], a4 LDF [BO + 5 * SIZE], b2 add BO, 4 * SIZE, BO FMADD (aa1, bb3, cc01, cc01) LDF [AO + 8 * SIZE], a1 FMADD (aa2, bb3, cc02, cc02) LDF [AO + 9 * SIZE], a2 LDF [BO + 2 * SIZE], b3 add AO, 8 * SIZE, AO FMADD (aa3, bb4, cc01, cc01) LDF [AO + 2 * SIZE], a3 FMADD (aa4, bb4, cc02, cc02) LDF [AO + 3 * SIZE], a4 bg,pt %icc, .LL73 LDF [BO + 3 * SIZE], b4 .align 4 .LL75: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL78 nop .align 4 .LL77: FMADD (aa1, bb1, cc01, cc01) LDF [AO + 2 * SIZE], a1 FMADD (aa2, bb1, cc02, cc02) LDF [AO + 3 * SIZE], a2 LDF [BO + 1 * SIZE], b1 add L, -1, L add AO, 2 * SIZE, AO cmp L, 0 bg,pt %icc, .LL77 add BO, 1 * SIZE, BO .align 4 .LL78: #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 1, TEMP1 #endif sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 0, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c02, c02 FNMSUB (aa2, cc02, cc01, cc01) FMUL a3, c01, c01 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FNMSUB (aa2, cc01, cc02, cc02) FMUL a3, c02, c02 #endif #if defined(RN) || defined(RT) LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 #endif #ifdef LN add C1, -2 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] #ifndef LN add C1, 2 * SIZE, C1 #endif #ifdef RT sll K, BASE_SHIFT + 1, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 0, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL72 nop .align 4 .LL80: and M, 1, I cmp I, 0 ble,pn %icc, .LL89 nop #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 0, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TEMP1 sll KK, BASE_SHIFT + 0, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [BO + 0 * SIZE], b1 LDF [AO + 1 * SIZE], a2 LDF [BO + 1 * SIZE], b2 LDF [AO + 2 * SIZE], a3 LDF [BO + 2 * SIZE], b3 LDF [AO + 3 * SIZE], a4 LDF [BO + 3 * SIZE], b4 #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, 
.LL85 FCLR (cc01) .align 4 .LL83: prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY add L, -1, L FMADD (aa1, bb1, cc01, cc01) LDF [AO + 4 * SIZE], a1 LDF [BO + 4 * SIZE], b1 FMADD (aa2, bb2, cc01, cc01) LDF [AO + 5 * SIZE], a2 LDF [BO + 5 * SIZE], b2 FMADD (aa3, bb3, cc01, cc01) LDF [AO + 6 * SIZE], a3 LDF [BO + 6 * SIZE], b3 FMADD (aa4, bb4, cc01, cc01) LDF [AO + 7 * SIZE], a4 LDF [BO + 7 * SIZE], b4 add AO, 4 * SIZE, AO cmp L, 0 bg,pt %icc, .LL83 add BO, 4 * SIZE, BO .align 4 .LL85: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL88 nop .align 4 .LL87: FMADD (aa1, bb1, cc01, cc01) LDF [AO + 1 * SIZE], a1 LDF [BO + 1 * SIZE], b1 add AO, 1 * SIZE, AO add L, -1, L cmp L, 0 bg,pt %icc, .LL87 add BO, 1 * SIZE, BO .align 4 .LL88: #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 1, TEMP1 #endif sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 0, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 FSUB a1, c01, c01 #else LDF [AO + 0 * SIZE], a1 FSUB a1, c01, c01 #endif #if defined(LN) || defined(LT) LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #if defined(RN) || defined(RT) LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LN add C1, -1 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] #else STF c01, [AO + 0 * SIZE] #endif STF c01, [C1 + 0 * SIZE] #ifdef RT sll K, BASE_SHIFT + 0, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 0, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .align 4 .LL89: #ifdef LN sll K, BASE_SHIFT, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 1, KK #endif #ifdef RT sub KK, 1, KK #endif .align 4 .LL999: #ifdef TRMMKERNEL #ifndef __64BIT__ ld [%sp + STACK_START + 8], %g1 ld [%sp + STACK_START + 12], %g2 ld [%sp + STACK_START + 16], %g3 ld [%sp + STACK_START + 20], %g4 #else ldx [%sp + STACK_START + 32], %g1 ldx [%sp + STACK_START + 40], %g2 ldx [%sp + STACK_START + 48], %g3 ldx [%sp + STACK_START + 56], %g4 #endif #endif return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/trsm_kernel_RT.S000066400000000000000000001771171313527062700206300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %i0 #define N %i1 #define K %i2 #if defined(DOUBLE) && !defined(__64BIT__) #define A %i5 #define B %i4 #else #define A %i4 #define B %i5 #endif #define C %o4 #define LDC %o5 #define AO %l0 #define BO %l1 #define I %l2 #define J %l3 #define L %l4 #define C1 %o0 #define C2 %o1 #define C3 %o2 #define C4 %o3 #define OFFSET %l5 #define KK %l6 #define TEMP1 %l7 #define TEMP2 %i3 #define AORIG %g1 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #define t1 %f32 #define t2 %f34 #define t3 %f36 #define t4 %f38 #define a1 %f40 #define a2 %f42 #define a3 %f44 #define a4 %f46 #define a5 %f58 #define b1 %f48 #define b2 %f50 #define b3 %f52 #define b4 %f54 #define b5 %f56 #define FZERO %f60 #define ALPHA %f62 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #define t1 %f16 #define t2 %f17 #define t3 %f18 #define t4 %f19 #define a1 %f20 #define a2 %f21 #define a3 %f22 #define a4 %f23 #define a5 %f31 #define b1 %f24 #define b2 %f25 #define b3 %f26 #define b4 %f27 #define b5 %f28 #define FZERO %f29 #define ALPHA %f30 #endif #define APREFETCHSIZE 40 #define BPREFETCHSIZE 40 #define APREFETCH_CATEGORY 0 #define BPREFETCH_CATEGORY 0 PROLOGUE SAVESP nop #ifndef __64BIT__ #ifdef DOUBLE ld [%sp + STACK_START + 28], B ld [%sp + STACK_START + 32], C ld [%sp + STACK_START + 36], LDC ld [%sp + STACK_START + 40], OFFSET #else ld [%sp + STACK_START + 28], C ld [%sp + STACK_START + 32], LDC ld [%sp + STACK_START + 36], OFFSET #endif #else ldx [%sp+ STACK_START + 56], C ldx [%sp+ STACK_START + 64], LDC ldx [%sp+ STACK_START + 72], OFFSET #endif FCLR(29) sll LDC, BASE_SHIFT, LDC #ifdef LN smul M, K, TEMP1 sll TEMP1, BASE_SHIFT, TEMP1 add A, TEMP1, A sll M, BASE_SHIFT, TEMP1 add C, TEMP1, C #endif #ifdef RN neg OFFSET, KK #endif #ifdef RT smul N, K, TEMP1 sll TEMP1, BASE_SHIFT, TEMP1 add B, TEMP1, B smul N, LDC, TEMP1 add C, TEMP1, C sub N, OFFSET, KK #endif and N, 1, J cmp J, 0 ble,pn %icc, .LL100 nop #ifdef RT sll K, 0 + BASE_SHIFT, TEMP1 sub B, TEMP1, B sub C, LDC, C #endif mov C, C1 #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif #ifndef RT add C, 
LDC, C #endif sra M, 2, I cmp I, 0 ble,pn %icc, .LL250 nop .LL221: #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 2 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 2 + BASE_SHIFT, TEMP1 sll KK, 0 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 ble,pn %icc, .LL225 prefetch [C1 + 4 * SIZE], 2 .LL222: FADD c01, t1, c01 add BO, 4 * SIZE, BO FMUL a1, b1, t1 LDF [AO + 4 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b1, t2 LDF [AO + 5 * SIZE], a2 FADD c03, t3, c03 add L, -1, L FMUL a3, b1, t3 LDF [AO + 6 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b1, t4 LDF [AO + 7 * SIZE], a4 LDF [BO + 0 * SIZE], b1 FADD c01, t1, c01 cmp L, 0 FMUL a1, b2, t1 LDF [AO + 8 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b2, t2 LDF [AO + 9 * SIZE], a2 FADD c03, t3, c03 FMUL a3, b2, t3 LDF [AO + 10 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b2, t4 LDF [AO + 11 * SIZE], a4 LDF [BO + 1 * SIZE], b2 FADD c01, t1, c01 FMUL a1, b3, t1 LDF [AO + 12 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b3, t2 LDF [AO + 13 * SIZE], a2 FADD c03, t3, c03 FMUL a3, b3, t3 LDF [AO + 14 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b3, t4 LDF [AO + 15 * SIZE], a4 LDF [BO + 2 * SIZE], b3 FADD c01, t1, c01 FMUL a1, b4, t1 LDF [AO + 16 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b4, t2 LDF [AO + 17 * SIZE], a2 FADD c03, t3, c03 FMUL a3, b4, t3 LDF [AO + 18 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [AO + 19 * SIZE], a4 add AO, 16 * SIZE, AO bg,pt %icc, .LL222 LDF [BO + 3 * SIZE], b4 .LL225: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL229 nop .LL226: FADD c01, t1, c01 add BO, 1 * SIZE, BO FMUL a1, b1, t1 LDF [AO + 4 * SIZE], a1 FADD c02, t2, c02 add L, -1, L FMUL a2, b1, t2 LDF [AO + 5 * SIZE], a2 FADD c03, t3, c03 cmp L, 0 FMUL a3, b1, t3 LDF [AO + 6 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b1, t4 LDF [AO + 7 * SIZE], a4 add AO, 4 * SIZE, AO bg,pt %icc, .LL226 LDF [BO + 0 * SIZE], b1 .LL229: FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 #if defined(LN) || defined(RT) #ifdef LN sub KK, 4, TEMP1 #else sub KK, 1, TEMP1 #endif sll TEMP1, 2 + BASE_SHIFT, TEMP2 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #endif #ifdef LN LDF [AO + 15 * SIZE], a1 LDF [AO + 14 * SIZE], a2 LDF [AO + 13 * SIZE], a3 LDF [AO + 12 * SIZE], a4 FMUL a1, c04, c04 FMUL a2, c04, t1 FSUB c03, t1, c03 FMUL a3, c04, t1 FSUB c02, t1, c02 FMUL a4, c04, t1 FSUB c01, t1, c01 LDF [AO + 10 * SIZE], a1 LDF [AO + 9 * SIZE], a2 LDF [AO + 8 * SIZE], a3 FMUL a1, c03, c03 FMUL a2, c03, t1 FSUB c02, t1, c02 FMUL a3, c03, t1 FSUB c01, t1, c01 LDF [AO + 5 * SIZE], a1 LDF [AO + 4 * SIZE], a2 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c01, t1, c01 LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 
* SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a2, c01, t1 FSUB c02, t1, c02 FMUL a3, c01, t1 FSUB c03, t1, c03 FMUL a4, c01, t1 FSUB c04, t1, c04 LDF [AO + 5 * SIZE], a1 LDF [AO + 6 * SIZE], a2 LDF [AO + 7 * SIZE], a3 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c03, t1, c03 FMUL a3, c02, t1 FSUB c04, t1, c04 LDF [AO + 10 * SIZE], a1 LDF [AO + 11 * SIZE], a2 FMUL a1, c03, c03 FMUL a2, c03, t1 FSUB c04, t1, c04 LDF [AO + 15 * SIZE], a1 FMUL a1, c04, c04 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 #endif #ifdef RT LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 #endif #ifdef LN add C1, -4 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c03, [BO + 2 * SIZE] STF c04, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C1 + 2 * SIZE] STF c04, [C1 + 3 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 4 * SIZE, C1 #endif #ifdef RT sll K, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 2 + BASE_SHIFT, TEMP2 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 4, KK #endif #ifdef LN sub KK, 4, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL221 nop .LL250: and M, 2, I cmp I, 0 ble,pn %icc, .LL270 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 1 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 1 + BASE_SHIFT, TEMP1 sll KK, 0 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 ble,pn %icc, .LL255 nop .LL252: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 LDF [AO + 4 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b1, t2 LDF [AO + 5 * SIZE], a2 LDF [BO + 4 * SIZE], b1 FADD c03, t3, c03 cmp L, 0 FMUL a3, b2, t3 LDF [AO + 6 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b2, t4 LDF [AO + 7 * SIZE], a4 LDF [BO + 5 * SIZE], b2 FADD c01, t1, c01 FMUL a1, b3, t1 LDF [AO + 8 * SIZE], a1 FADD c02, t2, c02 FMUL a2, b3, t2 LDF [AO + 9 * SIZE], a2 LDF [BO + 6 * SIZE], b3 FADD c03, t3, c03 FMUL a3, b4, t3 LDF [AO + 10 * SIZE], a3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO LDF [BO + 7 * SIZE], b4 bg,pt %icc, .LL252 add BO, 4 * SIZE, BO .LL255: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL259 nop .LL256: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 LDF [AO + 2 * SIZE], a1 FADD c02, t2, c02 cmp L, 0 FMUL a2, b1, t2 LDF [AO + 3 * SIZE], a2 LDF [BO + 1 * SIZE], b1 add AO, 2 * SIZE, AO bg,pt %icc, .LL256 add BO, 1 * SIZE, BO .LL259: FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FADD c01, c03, c01 FADD c02, c04, c02 #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 1, TEMP1 #endif sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || 
defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c01, t1, c01 FMUL a3, c01, c01 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a2, c01, t1 FSUB c02, t1, c02 FMUL a3, c02, c02 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 #endif #ifdef RT LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 #endif #ifdef LN add C1, -2 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 2 * SIZE, C1 #endif #ifdef RT sll K, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif .LL270: and M, 1, I cmp I, 0 ble,pn %icc, .LL299 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 0 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c01 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t2 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c02 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t3 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t4 LDF [BO + 2 * SIZE], b3 ble,pn %icc, .LL275 LDF [BO + 3 * SIZE], b4 .LL272: FADD c01, t1, c01 add L, -1, L add AO, 4 * SIZE, AO FMUL a1, b1, t1 add BO, 4 * SIZE, BO LDF [AO + 0 * SIZE], a1 FADD c02, t2, c02 cmp L, 0 LDF [BO + 0 * SIZE], b1 FMUL a2, b2, t2 LDF [AO + 1 * SIZE], a2 FADD c01, t3, c01 LDF [BO + 1 * SIZE], b2 FMUL a3, b3, t3 LDF [AO + 2 * SIZE], a3 FADD c02, t4, c02 LDF [BO + 2 * SIZE], b3 FMUL a4, b4, t4 LDF [AO + 3 * SIZE], a4 bg,pt %icc, .LL272 LDF [BO + 3 * SIZE], b4 .LL275: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL279 nop .LL276: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 LDF [AO + 1 * SIZE], a1 LDF [BO + 1 * SIZE], b1 add BO, 1 * SIZE, BO cmp L, 0 bg,pt %icc, .LL276 add AO, 1 * SIZE, AO .LL279: FADD c01, t1, c01 FADD c02, t2, c02 FADD c01, t3, c01 FADD c02, t4, c02 FADD c01, c02, c01 #if defined(LN) || defined(RT) sub KK, 1, TEMP1 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 FSUB a1, c01, c01 #else LDF [AO + 0 * SIZE], a1 FSUB a1, c01, c01 #endif #ifdef LN LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef RT LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LN add C1, -1 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] #else STF c01, [AO + 0 * SIZE] #endif STF c01, [C1 + 0 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 1 * SIZE, C1 #endif #ifdef RT sll K, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || 
defined(RN) sub K, KK, TEMP1 sll TEMP1, 0 + BASE_SHIFT, TEMP1 add AO, TEMP1, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .LL299: #ifdef LN sll K, 0 + BASE_SHIFT, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 1, KK #endif #ifdef RT sub KK, 1, KK #endif .LL100: /* n & 2 */ and N, 2, J cmp J, 0 ble,pn %icc, .LL200 nop #ifdef RT sll K, 1 + BASE_SHIFT, TEMP1 sub B, TEMP1, B sll LDC, 1, TEMP1 sub C, TEMP1, C #endif mov C, C1 add C, LDC, C2 #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif #ifndef RT add C2, LDC, C #endif sra M, 2, I cmp I, 0 ble,pn %icc, .LL150 FMOV FZERO, c03 .LL121: #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 2 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 2 + BASE_SHIFT, TEMP1 sll KK, 1 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, t1 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c07 LDF [AO + 1 * SIZE], a2 FMOV FZERO, t2 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c04 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t3 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c08 LDF [AO + 3 * SIZE], a4 FMOV FZERO, t4 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c01 prefetch [C1 + 3 * SIZE], 2 FMOV FZERO, c05 prefetch [C2 + 3 * SIZE], 2 FMOV FZERO, c02 ble,pn %icc, .LL125 FMOV FZERO, c06 .LL122: FADD c03, t1, c03 add L, -1, L FMUL a1, b1, t1 prefetch [AO + APREFETCHSIZE * SIZE], 0 FADD c07, t2, c07 add BO, 8 * SIZE, BO FMUL a1, b2, t2 LDF [AO + 4 * SIZE], a1 FADD c04, t3, c04 add AO, 16 * SIZE, AO FMUL a2, b1, t3 cmp L, 0 FADD c08, t4, c08 nop FMUL a2, b2, t4 LDF [AO - 11 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b1, t1 nop FADD c05, t2, c05 nop FMUL a3, b2, t2 LDF [AO - 10 * SIZE], a3 FADD c02, t3, c02 nop FMUL a4, b1, t3 LDF [BO - 4 * SIZE], b1 FADD c06, t4, c06 nop FMUL a4, b2, t4 LDF [BO - 3 * SIZE], b2 FADD c03, t1, c03 nop FMUL a1, b3, t1 LDF [AO - 9 * SIZE], a4 FADD c07, t2, c07 nop FMUL a1, b4, t2 LDF [AO - 8 * SIZE], a1 FADD c04, t3, c04 nop FMUL a2, b3, t3 nop FADD c08, t4, c08 nop FMUL a2, b4, t4 LDF [AO - 7 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b3, t1 nop FADD c05, t2, c05 nop FMUL a3, b4, t2 LDF [AO - 6 * SIZE], a3 FADD c02, t3, c02 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c06, t4, c06 nop FMUL a4, b4, t4 LDF [BO - 1 * SIZE], b4 FADD c03, t1, c03 nop FMUL a1, b1, t1 LDF [AO - 5 * SIZE], a4 FADD c07, t2, c07 nop FMUL a1, b2, t2 LDF [AO - 4 * SIZE], a1 FADD c04, t3, c04 nop FMUL a2, b1, t3 nop FADD c08, t4, c08 nop FMUL a2, b2, t4 LDF [AO - 3 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b1, t1 nop FADD c05, t2, c05 nop FMUL a3, b2, t2 LDF [AO - 2 * SIZE], a3 FADD c02, t3, c02 nop FMUL a4, b1, t3 LDF [BO + 0 * SIZE], b1 FADD c06, t4, c06 nop FMUL a4, b2, t4 LDF [BO + 1 * SIZE], b2 FADD c03, t1, c03 nop FMUL a1, b3, t1 LDF [AO - 1 * SIZE], a4 FADD c07, t2, c07 nop FMUL a1, b4, t2 LDF [AO + 0 * SIZE], a1 FADD c04, t3, c04 nop FMUL a2, b3, t3 nop FADD c08, t4, c08 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b3, t1 nop FADD c05, t2, c05 nop FMUL a3, b4, t2 LDF [AO + 2 * SIZE], a3 FADD c02, t3, c02 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c06, t4, c06 FMUL a4, b4, t4 LDF [AO + 3 * SIZE], a4 bg,pt %icc, .LL122 LDF [BO + 3 * SIZE], b4 .LL125: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL129 nop .LL126: 
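/* Descriptive note (added): .LL126 is the K mod 4 remainder loop for this
   4x2 tile -- each pass multiplies a1..a4 by b1/b2 into the eight accumulators
   and advances AO by 4 elements and BO by 2 elements. */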
FADD c03, t1, c03 add AO, 4 * SIZE, AO FMUL a1, b1, t1 add BO, 2 * SIZE, BO FADD c07, t2, c07 add L, -1, L FMUL a1, b2, t2 LDF [AO + 0 * SIZE], a1 FADD c04, t3, c04 cmp L, 0 FMUL a2, b1, t3 FADD c08, t4, c08 FMUL a2, b2, t4 LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b1, t1 FADD c05, t2, c05 FMUL a3, b2, t2 LDF [AO + 2 * SIZE], a3 FADD c02, t3, c02 FMUL a4, b1, t3 LDF [BO + 0 * SIZE], b1 FADD c06, t4, c06 FMUL a4, b2, t4 LDF [BO + 1 * SIZE], b2 bg,pt %icc, .LL126 LDF [AO + 3 * SIZE], a4 .LL129: FADD c03, t1, c03 FADD c07, t2, c07 FADD c04, t3, c04 FADD c08, t4, c08 #if defined(LN) || defined(RT) #ifdef LN sub KK, 4, TEMP1 #else sub KK, 2, TEMP1 #endif sll TEMP1, 2 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c05, c05 FSUB a3, c02, c02 FSUB a4, c06, c06 FSUB b1, c03, c03 FSUB b2, c07, c07 FSUB b3, c04, c04 FSUB b4, c08, c08 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c05, c05 FSUB b2, c06, c06 FSUB b3, c07, c07 FSUB b4, c08, c08 #endif #ifdef LN LDF [AO + 15 * SIZE], a1 LDF [AO + 14 * SIZE], a2 LDF [AO + 13 * SIZE], a3 LDF [AO + 12 * SIZE], a4 FMUL a1, c04, c04 FMUL a1, c08, c08 FMUL a2, c04, t1 FMUL a2, c08, t2 FSUB c03, t1, c03 FSUB c07, t2, c07 FMUL a3, c04, t1 FMUL a3, c08, t2 FSUB c02, t1, c02 FSUB c06, t2, c06 FMUL a4, c04, t1 FMUL a4, c08, t2 FSUB c01, t1, c01 FSUB c05, t2, c05 LDF [AO + 10 * SIZE], a1 LDF [AO + 9 * SIZE], a2 LDF [AO + 8 * SIZE], a3 FMUL a1, c03, c03 FMUL a1, c07, c07 FMUL a2, c03, t1 FMUL a2, c07, t2 FSUB c02, t1, c02 FSUB c06, t2, c06 FMUL a3, c03, t1 FMUL a3, c07, t2 FSUB c01, t1, c01 FSUB c05, t2, c05 LDF [AO + 5 * SIZE], a1 LDF [AO + 4 * SIZE], a2 FMUL a1, c02, c02 FMUL a1, c06, c06 FMUL a2, c02, t1 FMUL a2, c06, t2 FSUB c01, t1, c01 FSUB c05, t2, c05 LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c05, c05 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a1, c05, c05 FMUL a2, c01, t1 FMUL a2, c05, t2 FSUB c02, t1, c02 FSUB c06, t2, c06 FMUL a3, c01, t1 FMUL a3, c05, t2 FSUB c03, t1, c03 FSUB c07, t2, c07 FMUL a4, c01, t1 FMUL a4, c05, t2 FSUB c04, t1, c04 FSUB c08, t2, c08 LDF [AO + 5 * SIZE], a1 LDF [AO + 6 * SIZE], a2 LDF [AO + 7 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c06, c06 FMUL a2, c02, t1 FMUL a2, c06, t2 FSUB c03, t1, c03 FSUB c07, t2, c07 FMUL a3, c02, t1 FMUL a3, c06, t2 FSUB c04, t1, c04 FSUB c08, t2, c08 LDF [AO + 10 * SIZE], a1 LDF [AO + 11 * SIZE], a2 FMUL a1, c03, c03 FMUL a1, c07, c07 FMUL a2, c03, t1 FMUL a2, c07, t2 FSUB c04, t1, c04 FSUB c08, t2, c08 LDF [AO + 15 * SIZE], a1 FMUL a1, c04, c04 FMUL a1, c08, c08 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 FMUL a2, c01, t1 FMUL a2, c02, t2 FMUL a2, c03, t3 FMUL a2, c04, t4 FSUB c05, t1, c05 FSUB c06, t2, c06 FSUB c07, t3, c07 FSUB c08, t4, c08 FMUL a3, c05, c05 FMUL a3, c06, c06 FMUL a3, c07, c07 FMUL a3, c08, c08 #endif #ifdef RT LDF [BO + 3 * SIZE], a1 LDF 
[BO + 2 * SIZE], a2 LDF [BO + 0 * SIZE], a3 FMUL a1, c05, c05 FMUL a1, c06, c06 FMUL a1, c07, c07 FMUL a1, c08, c08 FMUL a2, c05, t1 FMUL a2, c06, t2 FMUL a2, c07, t3 FMUL a2, c08, t4 FSUB c01, t1, c01 FSUB c02, t2, c02 FSUB c03, t3, c03 FSUB c04, t4, c04 FMUL a3, c01, c01 FMUL a3, c02, c02 FMUL a3, c03, c03 FMUL a3, c04, c04 #endif #ifdef LN add C1, -4 * SIZE, C1 add C2, -4 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c05, [BO + 1 * SIZE] STF c02, [BO + 2 * SIZE] STF c06, [BO + 3 * SIZE] STF c03, [BO + 4 * SIZE] STF c07, [BO + 5 * SIZE] STF c04, [BO + 6 * SIZE] STF c08, [BO + 7 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] STF c05, [AO + 4 * SIZE] STF c06, [AO + 5 * SIZE] STF c07, [AO + 6 * SIZE] STF c08, [AO + 7 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C1 + 2 * SIZE] STF c04, [C1 + 3 * SIZE] STF c05, [C2 + 0 * SIZE] STF c06, [C2 + 1 * SIZE] STF c07, [C2 + 2 * SIZE] STF c08, [C2 + 3 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 4 * SIZE, C1 add C2, 4 * SIZE, C2 #endif #ifdef RT sll K, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 2 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 4, KK #endif #ifdef LN sub KK, 4, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL121 FMOV FZERO, c03 .LL150: and M, 2, I cmp I, 0 ble,pn %icc, .LL170 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 1 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 1 + BASE_SHIFT, TEMP1 sll KK, 1 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 cmp L, 0 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 ble,pn %icc, .LL155 nop .LL152: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 prefetch [AO + APREFETCHSIZE * SIZE], 0 FADD c02, t2, c02 add BO, 8 * SIZE, BO FMUL a1, b2, t2 LDF [AO + 4 * SIZE], a1 FADD c03, t3, c03 cmp L, 0 FMUL a2, b1, t3 LDF [BO - 4 * SIZE], b1 FADD c04, t4, c04 nop FMUL a2, b2, t4 LDF [AO + 5 * SIZE], a2 FADD c01, t1, c01 nop FMUL a3, b3, t1 LDF [BO - 3 * SIZE], b2 FADD c02, t2, c02 nop FMUL a3, b4, t2 LDF [AO + 6 * SIZE], a3 FADD c03, t3, c03 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c04, t4, c04 nop FMUL a4, b4, t4 LDF [AO + 7 * SIZE], a4 FADD c01, t1, c01 nop FMUL a1, b1, t1 LDF [BO - 1 * SIZE], b4 FADD c02, t2, c02 FMUL a1, b2, t2 LDF [AO + 8 * SIZE], a1 FADD c03, t3, c03 FMUL a2, b1, t3 LDF [BO + 0 * SIZE], b1 FADD c04, t4, c04 FMUL a2, b2, t4 LDF [AO + 9 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b3, t1 LDF [BO + 1 * SIZE], b2 FADD c02, t2, c02 FMUL a3, b4, t2 LDF [AO + 10 * SIZE], a3 FADD c03, t3, c03 FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO bg,pt %icc, .LL152 LDF [BO + 3 * SIZE], b4 .LL155: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL159 nop .LL156: LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 
FADD c04, t4, c04 FMUL a1, b1, t1 FMUL a1, b2, t2 FMUL a2, b1, t3 FMUL a2, b2, t4 add AO, 2 * SIZE, AO add BO, 2 * SIZE, BO add L, -1, L cmp L, 0 bg,pt %icc, .LL156 nop .LL159: FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 2, TEMP1 #endif sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c02, c02 FSUB a4, c04, c04 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c03, c03 FMUL a1, c04, c04 FMUL a2, c03, t1 FMUL a2, c04, t2 FSUB c01, t1, c01 FSUB c02, t2, c02 FMUL a3, c01, c01 FMUL a3, c02, c02 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a2, c01, t1 FMUL a2, c02, t2 FSUB c03, t1, c03 FSUB c04, t2, c04 FMUL a3, c03, c03 FMUL a3, c04, c04 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c03, c03 FMUL a2, c01, t1 FMUL a2, c03, t2 FSUB c02, t1, c02 FSUB c04, t2, c04 FMUL a3, c02, c02 FMUL a3, c04, c04 #endif #ifdef RT LDF [BO + 3 * SIZE], a1 LDF [BO + 2 * SIZE], a2 LDF [BO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c04, c04 FMUL a2, c02, t1 FMUL a2, c04, t2 FSUB c01, t1, c01 FSUB c03, t2, c03 FMUL a3, c01, c01 FMUL a3, c03, c03 #endif #ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c03, [BO + 2 * SIZE] STF c04, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c03, [AO + 1 * SIZE] STF c02, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c03, [C1 + 1 * SIZE] STF c02, [C2 + 0 * SIZE] STF c04, [C2 + 1 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 #endif #ifdef RT sll K, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif .LL170: and M, 1, I cmp I, 0 ble,pn %icc, .LL199 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 0 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 0 + BASE_SHIFT, TEMP1 sll KK, 1 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 ble,pn %icc, .LL175 nop .LL172: FADD c01, t1, c01 add AO, 4 * SIZE, AO FMUL a1, b1, t1 LDF [BO + 4 * SIZE], b1 FADD c02, t2, c02 FMUL a1, b2, t2 LDF [BO + 5 * SIZE], b2 add L, -1, L LDF [AO + 0 * SIZE], a1 FADD c03, t3, c03 cmp L, 0 FMUL a2, b3, t3 LDF [BO + 6 * SIZE], b3 FADD c04, t4, c04 FMUL a2, b4, t4 LDF [BO + 7 * SIZE], b4 
LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b1, t1 LDF [BO + 8 * SIZE], b1 FADD c02, t2, c02 FMUL a3, b2, t2 LDF [BO + 9 * SIZE], b2 LDF [AO + 2 * SIZE], a3 FADD c03, t3, c03 FMUL a4, b3, t3 LDF [BO + 10 * SIZE], b3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [BO + 11 * SIZE], b4 add BO, 8 * SIZE, BO bg,pt %icc, .LL172 LDF [AO + 3 * SIZE], a4 .LL175: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL179 nop .LL176: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 add AO, 1 * SIZE, AO LDF [BO + 2 * SIZE], b1 FADD c02, t2, c02 cmp L, 0 FMUL a1, b2, t2 LDF [BO + 3 * SIZE], b2 add BO, 2 * SIZE, BO bg,pt %icc, .LL176 LDF [AO + 0 * SIZE], a1 .LL179: FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 FADD c01, c03, c01 FADD c02, c04, c02 #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 2, TEMP1 #endif sll TEMP1, 0 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #endif #ifdef LN LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a2, c01, t1 FSUB c02, t1, c02 FMUL a3, c02, c02 #endif #ifdef RT LDF [BO + 3 * SIZE], a1 LDF [BO + 2 * SIZE], a2 LDF [BO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c01, t1, c01 FMUL a3, c01, c01 #endif #ifdef LN add C1, -1 * SIZE, C1 add C2, -1 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C2 + 0 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 1 * SIZE, C1 add C2, 1 * SIZE, C2 #endif #ifdef RT sll K, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 0 + BASE_SHIFT, TEMP2 sll TEMP1, 1 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .LL199: #ifdef LN sll K, 1 + BASE_SHIFT, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 2, KK #endif #ifdef RT sub KK, 2, KK #endif .LL200: sra N, 2, J cmp J, 0 ble,pn %icc, .LL999 nop .LL11: #ifdef RT sll K, 2 + BASE_SHIFT, TEMP1 sub B, TEMP1, B sll LDC, 2, TEMP1 sub C, TEMP1, C #endif add C, LDC, C2 FMOV FZERO, t1 nop mov C, C1 add C2, LDC, C3 FMOV FZERO, t2 nop mov A, AO sra M, 2, I add C3, LDC, C4 FMOV FZERO, t3 #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif cmp I, 0 #ifndef RT add C4, LDC, C #endif FMOV FZERO, t4 ble,pn %icc, .LL50 FMOV FZERO, c01 .LL21: FMOV FZERO, c02 FMOV FZERO, c03 #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 2 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c04 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c05 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c06 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c07 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c08 LDF 
[BO + 2 * SIZE], b3 FMOV FZERO, c09 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c10 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c11 LDF [BO + 4 * SIZE], b5 /* ***** */ LDF [AO + 4 * SIZE], a5 /* ***** */ prefetch [C1 + 3 * SIZE], 3 FMOV FZERO, c12 prefetch [C2 + 3 * SIZE], 3 FMOV FZERO, c13 prefetch [C3 + 3 * SIZE], 3 FMOV FZERO, c14 prefetch [C4 + 3 * SIZE], 3 FMOV FZERO, c15 ble,pn %icc, .LL25 FMOV FZERO, c16 .LL22: FADD c04, t1, c04 prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY FMUL a1, b1, t1 nop FADD c08, t2, c08 prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY FMUL a1, b2, t2 add AO, 16 * SIZE, AO FADD c12, t3, c12 LDF [AO - 13 * SIZE], a4 FMUL a1, b3, t3 add BO, 16 * SIZE, BO FADD c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO - 8 * SIZE], a1 FADD c01, t1, c01 nop FMUL a2, b1, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 add L, -1, L FMUL a2, b4, t4 LDF [AO - 11 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b1, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 10 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO - 8 * SIZE], b1 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 11 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 10 * SIZE], b3 FADD c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 9 * SIZE], b4 FADD c04, t1, c04 nop FMUL a5, b5, t1 LDF [AO - 9 * SIZE], a4 FADD c08, t2, c08 nop FMUL a5, b2, t2 nop FADD c12, t3, c12 nop FMUL a5, b3, t3 nop FADD c16, t4, c16 nop FMUL a5, b4, t4 LDF [AO - 4 * SIZE], a5 FADD c01, t1, c01 nop FMUL a2, b5, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO - 7 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b5, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 6 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b5, t1 LDF [BO - 4 * SIZE], b5 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 7 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 6 * SIZE], b3 FADD c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 5 * SIZE], b4 FADD c04, t1, c04 nop FMUL a1, b1, t1 LDF [AO - 5 * SIZE], a4 FADD c08, t2, c08 nop FMUL a1, b2, t2 nop FADD c12, t3, c12 nop FMUL a1, b3, t3 nop FADD c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO - 0 * SIZE], a1 FADD c01, t1, c01 nop FMUL a2, b1, t1 nop #ifdef DOUBLE prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY #else nop #endif FADD c05, t2, c05 nop FMUL a2, b2, t2 FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 nop FADD c02, t1, c02 nop FMUL a3, b1, t1 LDF [AO - 3 * SIZE], a2 FADD c06, t2, c06 #ifdef DOUBLE prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY #else nop #endif FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 2 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO - 0 * SIZE], b1 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 3 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 1 * SIZE], b4 FADD c04, t1, c04 nop FMUL a5, b5, t1 LDF [AO - 1 * SIZE], a4 FADD c08, t2, c08 FMUL a5, b2, t2 FADD c12, t3, c12 FMUL a5, b3, t3 FADD c16, t4, c16 nop FMUL a5, b4, t4 LDF [AO + 4 * SIZE], a5 FADD c01, t1, c01 nop FMUL a2, b5, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 
nop FADD c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b5, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD c03, t1, c03 cmp L, 0 FMUL a4, b5, t1 LDF [BO + 4 * SIZE], b5 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c15, t4, c15 FMUL a4, b4, t4 bg,pt %icc, .LL22 LDF [BO + 3 * SIZE], b4 .LL25: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL29 nop .LL26: FADD c04, t1, c04 LDF [AO + 3 * SIZE], a4 FMUL a1, b1, t1 add AO, 4 * SIZE, AO FADD c08, t2, c08 add BO, 4 * SIZE, BO FMUL a1, b2, t2 add L, -1, L FADD c12, t3, c12 nop FMUL a1, b3, t3 cmp L, 0 FADD c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD c01, t1, c01 nop FMUL a2, b1, t1 nop FADD c05, t2, c05 nop FMUL a2, b2, t2 nop FADD c09, t3, c09 nop FMUL a2, b3, t3 nop FADD c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD c02, t1, c02 nop FMUL a3, b1, t1 nop FADD c06, t2, c06 nop FMUL a3, b2, t2 nop FADD c10, t3, c10 nop FMUL a3, b3, t3 nop FADD c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO + 0 * SIZE], b1 FADD c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c15, t4, c15 FMUL a4, b4, t4 bg,pt %icc, .LL26 LDF [BO + 3 * SIZE], b4 .LL29: #if defined(LN) || defined(RT) sub KK, 4, TEMP1 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO #endif FADD c04, t1, c04 FADD c08, t2, c08 FADD c12, t3, c12 FADD c16, t4, c16 #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c05, c05 FSUB a3, c09, c09 FSUB a4, c13, c13 FSUB b1, c02, c02 FSUB b2, c06, c06 FSUB b3, c10, c10 FSUB b4, c14, c14 LDF [BO + 8 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 10 * SIZE], a3 LDF [BO + 11 * SIZE], a4 LDF [BO + 12 * SIZE], b1 LDF [BO + 13 * SIZE], b2 LDF [BO + 14 * SIZE], b3 LDF [BO + 15 * SIZE], b4 FSUB a1, c03, c03 FSUB a2, c07, c07 FSUB a3, c11, c11 FSUB a4, c15, c15 FSUB b1, c04, c04 FSUB b2, c08, c08 FSUB b3, c12, c12 FSUB b4, c16, c16 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c05, c05 FSUB b2, c06, c06 FSUB b3, c07, c07 FSUB b4, c08, c08 LDF [AO + 8 * SIZE], a1 LDF [AO + 9 * SIZE], a2 LDF [AO + 10 * SIZE], a3 LDF [AO + 11 * SIZE], a4 LDF [AO + 12 * SIZE], b1 LDF [AO + 13 * SIZE], b2 LDF [AO + 14 * SIZE], b3 LDF [AO + 15 * SIZE], b4 FSUB a1, c09, c09 FSUB a2, c10, c10 FSUB a3, c11, c11 FSUB a4, c12, c12 FSUB b1, c13, c13 FSUB b2, c14, c14 FSUB b3, c15, c15 FSUB b4, c16, c16 #endif #ifdef LN LDF [AO + 15 * SIZE], a1 LDF [AO + 14 * SIZE], a2 LDF [AO + 13 * SIZE], a3 LDF [AO + 12 * SIZE], a4 FMUL a1, c04, c04 FMUL a1, c08, c08 FMUL a1, c12, c12 FMUL a1, c16, c16 FMUL a2, c04, t1 FMUL a2, c08, t2 FMUL a2, c12, t3 FMUL a2, c16, t4 FSUB c03, t1, c03 FSUB c07, t2, c07 FSUB c11, t3, c11 FSUB c15, t4, c15 FMUL a3, c04, t1 FMUL a3, c08, t2 FMUL a3, c12, t3 FMUL a3, c16, t4 FSUB c02, t1, c02 FSUB 
c06, t2, c06 FSUB c10, t3, c10 FSUB c14, t4, c14 FMUL a4, c04, t1 FMUL a4, c08, t2 FMUL a4, c12, t3 FMUL a4, c16, t4 FSUB c01, t1, c01 FSUB c05, t2, c05 FSUB c09, t3, c09 FSUB c13, t4, c13 LDF [AO + 10 * SIZE], a1 LDF [AO + 9 * SIZE], a2 LDF [AO + 8 * SIZE], a3 FMUL a1, c03, c03 FMUL a1, c07, c07 FMUL a1, c11, c11 FMUL a1, c15, c15 FMUL a2, c03, t1 FMUL a2, c07, t2 FMUL a2, c11, t3 FMUL a2, c15, t4 FSUB c02, t1, c02 FSUB c06, t2, c06 FSUB c10, t3, c10 FSUB c14, t4, c14 FMUL a3, c03, t1 FMUL a3, c07, t2 FMUL a3, c11, t3 FMUL a3, c15, t4 FSUB c01, t1, c01 FSUB c05, t2, c05 FSUB c09, t3, c09 FSUB c13, t4, c13 LDF [AO + 5 * SIZE], a1 LDF [AO + 4 * SIZE], a2 FMUL a1, c02, c02 FMUL a1, c06, c06 FMUL a1, c10, c10 FMUL a1, c14, c14 FMUL a2, c02, t1 FMUL a2, c06, t2 FMUL a2, c10, t3 FMUL a2, c14, t4 FSUB c01, t1, c01 FSUB c05, t2, c05 FSUB c09, t3, c09 FSUB c13, t4, c13 LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c05, c05 FMUL a1, c09, c09 FMUL a1, c13, c13 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a1, c05, c05 FMUL a1, c09, c09 FMUL a1, c13, c13 FMUL a2, c01, t1 FMUL a2, c05, t2 FMUL a2, c09, t3 FMUL a2, c13, t4 FSUB c02, t1, c02 FSUB c06, t2, c06 FSUB c10, t3, c10 FSUB c14, t4, c14 FMUL a3, c01, t1 FMUL a3, c05, t2 FMUL a3, c09, t3 FMUL a3, c13, t4 FSUB c03, t1, c03 FSUB c07, t2, c07 FSUB c11, t3, c11 FSUB c15, t4, c15 FMUL a4, c01, t1 FMUL a4, c05, t2 FMUL a4, c09, t3 FMUL a4, c13, t4 FSUB c04, t1, c04 FSUB c08, t2, c08 FSUB c12, t3, c12 FSUB c16, t4, c16 LDF [AO + 5 * SIZE], a1 LDF [AO + 6 * SIZE], a2 LDF [AO + 7 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c06, c06 FMUL a1, c10, c10 FMUL a1, c14, c14 FMUL a2, c02, t1 FMUL a2, c06, t2 FMUL a2, c10, t3 FMUL a2, c14, t4 FSUB c03, t1, c03 FSUB c07, t2, c07 FSUB c11, t3, c11 FSUB c15, t4, c15 FMUL a3, c02, t1 FMUL a3, c06, t2 FMUL a3, c10, t3 FMUL a3, c14, t4 FSUB c04, t1, c04 FSUB c08, t2, c08 FSUB c12, t3, c12 FSUB c16, t4, c16 LDF [AO + 10 * SIZE], a1 LDF [AO + 11 * SIZE], a2 FMUL a1, c03, c03 FMUL a1, c07, c07 FMUL a1, c11, c11 FMUL a1, c15, c15 FMUL a2, c03, t1 FMUL a2, c07, t2 FMUL a2, c11, t3 FMUL a2, c15, t4 FSUB c04, t1, c04 FSUB c08, t2, c08 FSUB c12, t3, c12 FSUB c16, t4, c16 LDF [AO + 15 * SIZE], a1 FMUL a1, c04, c04 FMUL a1, c08, c08 FMUL a1, c12, c12 FMUL a1, c16, c16 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 FMUL a2, c01, t1 FMUL a2, c02, t2 FMUL a2, c03, t3 FMUL a2, c04, t4 FSUB c05, t1, c05 FSUB c06, t2, c06 FSUB c07, t3, c07 FSUB c08, t4, c08 FMUL a3, c01, t1 FMUL a3, c02, t2 FMUL a3, c03, t3 FMUL a3, c04, t4 FSUB c09, t1, c09 FSUB c10, t2, c10 FSUB c11, t3, c11 FSUB c12, t4, c12 FMUL a4, c01, t1 FMUL a4, c02, t2 FMUL a4, c03, t3 FMUL a4, c04, t4 FSUB c13, t1, c13 FSUB c14, t2, c14 FSUB c15, t3, c15 FSUB c16, t4, c16 LDF [BO + 5 * SIZE], a1 LDF [BO + 6 * SIZE], a2 LDF [BO + 7 * SIZE], a3 FMUL a1, c05, c05 FMUL a1, c06, c06 FMUL a1, c07, c07 FMUL a1, c08, c08 FMUL a2, c05, t1 FMUL a2, c06, t2 FMUL a2, c07, t3 FMUL a2, c08, t4 FSUB c09, t1, c09 FSUB c10, t2, c10 FSUB c11, t3, c11 FSUB c12, t4, c12 FMUL a3, c05, t1 FMUL a3, c06, t2 FMUL a3, c07, t3 FMUL a3, c08, t4 FSUB c13, t1, c13 FSUB c14, t2, c14 FSUB c15, t3, c15 FSUB c16, t4, c16 LDF [BO + 10 * SIZE], a1 LDF [BO + 11 * SIZE], a2 FMUL a1, c09, c09 FMUL a1, c10, c10 FMUL a1, c11, c11 FMUL a1, c12, c12 FMUL a2, c09, t1 FMUL a2, c10, t2 FMUL a2, 
c11, t3 FMUL a2, c12, t4 FSUB c13, t1, c13 FSUB c14, t2, c14 FSUB c15, t3, c15 FSUB c16, t4, c16 LDF [BO + 15 * SIZE], a1 FMUL a1, c13, c13 FMUL a1, c14, c14 FMUL a1, c15, c15 FMUL a1, c16, c16 #endif #ifdef RT LDF [BO + 15 * SIZE], a1 LDF [BO + 14 * SIZE], a2 LDF [BO + 13 * SIZE], a3 LDF [BO + 12 * SIZE], a4 FMUL a1, c13, c13 FMUL a1, c14, c14 FMUL a1, c15, c15 FMUL a1, c16, c16 FMUL a2, c13, t1 FMUL a2, c14, t2 FMUL a2, c15, t3 FMUL a2, c16, t4 FSUB c09, t1, c09 FSUB c10, t2, c10 FSUB c11, t3, c11 FSUB c12, t4, c12 FMUL a3, c13, t1 FMUL a3, c14, t2 FMUL a3, c15, t3 FMUL a3, c16, t4 FSUB c05, t1, c05 FSUB c06, t2, c06 FSUB c07, t3, c07 FSUB c08, t4, c08 FMUL a4, c13, t1 FMUL a4, c14, t2 FMUL a4, c15, t3 FMUL a4, c16, t4 FSUB c01, t1, c01 FSUB c02, t2, c02 FSUB c03, t3, c03 FSUB c04, t4, c04 LDF [BO + 10 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 8 * SIZE], a3 FMUL a1, c09, c09 FMUL a1, c10, c10 FMUL a1, c11, c11 FMUL a1, c12, c12 FMUL a2, c09, t1 FMUL a2, c10, t2 FMUL a2, c11, t3 FMUL a2, c12, t4 FSUB c05, t1, c05 FSUB c06, t2, c06 FSUB c07, t3, c07 FSUB c08, t4, c08 FMUL a3, c09, t1 FMUL a3, c10, t2 FMUL a3, c11, t3 FMUL a3, c12, t4 FSUB c01, t1, c01 FSUB c02, t2, c02 FSUB c03, t3, c03 FSUB c04, t4, c04 LDF [BO + 5 * SIZE], a1 LDF [BO + 4 * SIZE], a2 FMUL a1, c05, c05 FMUL a1, c06, c06 FMUL a1, c07, c07 FMUL a1, c08, c08 FMUL a2, c05, t1 FMUL a2, c06, t2 FMUL a2, c07, t3 FMUL a2, c08, t4 FSUB c01, t1, c01 FSUB c02, t2, c02 FSUB c03, t3, c03 FSUB c04, t4, c04 LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 #endif #ifdef LN add C1, -4 * SIZE, C1 add C2, -4 * SIZE, C2 add C3, -4 * SIZE, C3 add C4, -4 * SIZE, C4 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c05, [BO + 1 * SIZE] STF c09, [BO + 2 * SIZE] STF c13, [BO + 3 * SIZE] STF c02, [BO + 4 * SIZE] STF c06, [BO + 5 * SIZE] STF c10, [BO + 6 * SIZE] STF c14, [BO + 7 * SIZE] STF c03, [BO + 8 * SIZE] STF c07, [BO + 9 * SIZE] STF c11, [BO + 10 * SIZE] STF c15, [BO + 11 * SIZE] STF c04, [BO + 12 * SIZE] STF c08, [BO + 13 * SIZE] STF c12, [BO + 14 * SIZE] STF c16, [BO + 15 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] STF c05, [AO + 4 * SIZE] STF c06, [AO + 5 * SIZE] STF c07, [AO + 6 * SIZE] STF c08, [AO + 7 * SIZE] STF c09, [AO + 8 * SIZE] STF c10, [AO + 9 * SIZE] STF c11, [AO + 10 * SIZE] STF c12, [AO + 11 * SIZE] STF c13, [AO + 12 * SIZE] STF c14, [AO + 13 * SIZE] STF c15, [AO + 14 * SIZE] STF c16, [AO + 15 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C1 + 2 * SIZE] STF c04, [C1 + 3 * SIZE] STF c05, [C2 + 0 * SIZE] STF c06, [C2 + 1 * SIZE] STF c07, [C2 + 2 * SIZE] STF c08, [C2 + 3 * SIZE] STF c09, [C3 + 0 * SIZE] STF c10, [C3 + 1 * SIZE] STF c11, [C3 + 2 * SIZE] STF c12, [C3 + 3 * SIZE] STF c13, [C4 + 0 * SIZE] STF c14, [C4 + 1 * SIZE] STF c15, [C4 + 2 * SIZE] STF c16, [C4 + 3 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 4 * SIZE, C1 add C2, 4 * SIZE, C2 add C3, 4 * SIZE, C3 add C4, 4 * SIZE, C4 #endif #ifdef RT sll K, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AO, TEMP1, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 4, KK #endif #ifdef LN sub KK, 4, KK #endif add I, -1, I cmp I, 0 sra K, 2, L bg,pt %icc, .LL21 FMOV FZERO, c01 .LL50: and M, 2, I cmp I, 0 ble,pn %icc, .LL70 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 
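/* Added annotation (not in the original source): in the LT/RN case above, BO starts at the current B panel and the unrolled-loop count is taken from KK; the LN/RT case below instead rebuilds AO and BO from AORIG, B and KK and derives the count from K - KK. */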
#else #ifdef LN sll K, 1 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 1 + BASE_SHIFT, TEMP1 sll KK, 2 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif FMOV FZERO, c02 FMOV FZERO, t1 FMOV FZERO, c04 LDF [AO + 0 * SIZE], a1 FMOV FZERO, t2 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c06 LDF [AO + 1 * SIZE], a2 FMOV FZERO, t3 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c08 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t4 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c01 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c03 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c05 ble,pn %icc, .LL55 FMOV FZERO, c07 .LL52: FADD c02, t1, c02 add AO, 8 * SIZE, AO prefetch [AO + APREFETCHSIZE * SIZE], 0 FMUL a1, b1, t1 add BO, 16 * SIZE, BO FADD c04, t2, c04 add L, -1, L FMUL a1, b2, t2 FADD c06, t3, c06 cmp L, 0 FMUL a1, b3, t3 FADD c08, t4, c08 FMUL a1, b4, t4 LDF [AO - 4 * SIZE], a1 FADD c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 12 * SIZE], b1 FADD c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 11 * SIZE], b2 FADD c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 10 * SIZE], b3 FADD c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 9 * SIZE], b4 FADD c02, t1, c02 FMUL a3, b1, t1 LDF [AO - 3 * SIZE], a2 FADD c04, t2, c04 FMUL a3, b2, t2 FADD c06, t3, c06 FMUL a3, b3, t3 FADD c08, t4, c08 FMUL a3, b4, t4 LDF [AO - 2 * SIZE], a3 FADD c01, t1, c01 FMUL a4, b1, t1 LDF [BO - 8 * SIZE], b1 FADD c03, t2, c03 FMUL a4, b2, t2 LDF [BO - 7 * SIZE], b2 FADD c05, t3, c05 FMUL a4, b3, t3 LDF [BO - 6 * SIZE], b3 FADD c07, t4, c07 FMUL a4, b4, t4 LDF [BO - 5 * SIZE], b4 FADD c02, t1, c02 FMUL a1, b1, t1 LDF [AO - 1 * SIZE], a4 FADD c04, t2, c04 FMUL a1, b2, t2 FADD c06, t3, c06 FMUL a1, b3, t3 FADD c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 4 * SIZE], b1 FADD c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 3 * SIZE], b2 FADD c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 2 * SIZE], b3 FADD c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 1 * SIZE], b4 FADD c02, t1, c02 FMUL a3, b1, t1 LDF [AO + 1 * SIZE], a2 FADD c04, t2, c04 FMUL a3, b2, t2 FADD c06, t3, c06 FMUL a3, b3, t3 FADD c08, t4, c08 FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD c01, t1, c01 FMUL a4, b1, t1 LDF [BO + 0 * SIZE], b1 FADD c03, t2, c03 FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c05, t3, c05 FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c07, t4, c07 FMUL a4, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL52 LDF [AO + 3 * SIZE], a4 .LL55: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL59 nop .LL56: FADD c02, t1, c02 add AO, 2 * SIZE, AO FMUL a1, b1, t1 add L, -1, L add BO, 4 * SIZE, BO FADD c04, t2, c04 cmp L, 0 FMUL a1, b2, t2 FADD c06, t3, c06 FMUL a1, b3, t3 FADD c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD c01, t1, c01 FMUL a2, b1, t1 LDF [BO + 0 * SIZE], b1 FADD c03, t2, c03 FMUL a2, b2, t2 LDF [BO + 1 * SIZE], b2 FADD c05, t3, c05 FMUL a2, b3, t3 LDF [BO + 2 * SIZE], b3 FADD c07, t4, c07 FMUL a2, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL56 LDF [AO + 1 * SIZE], a2 .LL59: #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 4, TEMP1 #endif sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif FADD c02, t1, c02 FADD c04, t2, c04 FADD c06, t3, c06 FADD c08, t4, c08 #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, 
c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 FSUB b1, c02, c02 FSUB b2, c04, c04 FSUB b3, c06, c06 FSUB b4, c08, c08 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c05, c05 FSUB b2, c06, c06 FSUB b3, c07, c07 FSUB b4, c08, c08 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c04, c04 FMUL a1, c06, c06 FMUL a1, c08, c08 FMUL a2, c02, t1 FMUL a2, c04, t2 FMUL a2, c06, t3 FMUL a2, c08, t4 FSUB c01, t1, c01 FSUB c03, t2, c03 FSUB c05, t3, c05 FSUB c07, t4, c07 FMUL a3, c01, c01 FMUL a3, c03, c03 FMUL a3, c05, c05 FMUL a3, c07, c07 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c03, c03 FMUL a1, c05, c05 FMUL a1, c07, c07 FMUL a2, c01, t1 FMUL a2, c03, t2 FMUL a2, c05, t3 FMUL a2, c07, t4 FSUB c02, t1, c02 FSUB c04, t2, c04 FSUB c06, t3, c06 FSUB c08, t4, c08 FMUL a3, c02, c02 FMUL a3, c04, c04 FMUL a3, c06, c06 FMUL a3, c08, c08 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a2, c01, t1 FMUL a2, c02, t2 FSUB c03, t1, c03 FSUB c04, t2, c04 FMUL a3, c01, t1 FMUL a3, c02, t2 FSUB c05, t1, c05 FSUB c06, t2, c06 FMUL a4, c01, t1 FMUL a4, c02, t2 FSUB c07, t1, c07 FSUB c08, t2, c08 LDF [BO + 5 * SIZE], a1 LDF [BO + 6 * SIZE], a2 LDF [BO + 7 * SIZE], a3 FMUL a1, c03, c03 FMUL a1, c04, c04 FMUL a2, c03, t1 FMUL a2, c04, t2 FSUB c05, t1, c05 FSUB c06, t2, c06 FMUL a3, c03, t1 FMUL a3, c04, t2 FSUB c07, t1, c07 FSUB c08, t2, c08 LDF [BO + 10 * SIZE], a1 LDF [BO + 11 * SIZE], a2 FMUL a1, c05, c05 FMUL a1, c06, c06 FMUL a2, c05, t1 FMUL a2, c06, t2 FSUB c07, t1, c07 FSUB c08, t2, c08 LDF [BO + 15 * SIZE], a1 FMUL a1, c07, c07 FMUL a1, c08, c08 #endif #ifdef RT LDF [BO + 15 * SIZE], a1 LDF [BO + 14 * SIZE], a2 LDF [BO + 13 * SIZE], a3 LDF [BO + 12 * SIZE], a4 FMUL a1, c07, c07 FMUL a1, c08, c08 FMUL a2, c07, t1 FMUL a2, c08, t2 FSUB c05, t1, c05 FSUB c06, t2, c06 FMUL a3, c07, t1 FMUL a3, c08, t2 FSUB c03, t1, c03 FSUB c04, t2, c04 FMUL a4, c07, t1 FMUL a4, c08, t2 FSUB c01, t1, c01 FSUB c02, t2, c02 LDF [BO + 10 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 8 * SIZE], a3 FMUL a1, c05, c05 FMUL a1, c06, c06 FMUL a2, c05, t1 FMUL a2, c06, t2 FSUB c03, t1, c03 FSUB c04, t2, c04 FMUL a3, c05, t1 FMUL a3, c06, t2 FSUB c01, t1, c01 FSUB c02, t2, c02 LDF [BO + 5 * SIZE], a1 LDF [BO + 4 * SIZE], a2 FMUL a1, c03, c03 FMUL a1, c04, c04 FMUL a2, c03, t1 FMUL a2, c04, t2 FSUB c01, t1, c01 FSUB c02, t2, c02 LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 #endif #ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 add C3, -2 * SIZE, C3 add C4, -2 * SIZE, C4 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c07, [BO + 3 * SIZE] STF c02, [BO + 4 * SIZE] STF c04, [BO + 5 * SIZE] STF c06, [BO + 6 * SIZE] STF c08, [BO + 7 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] STF c05, [AO + 4 * SIZE] STF c06, [AO + 5 * SIZE] STF c07, [AO + 6 * SIZE] STF c08, [AO + 7 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C2 + 0 * SIZE] STF c04, [C2 + 1 * SIZE] STF c05, [C3 + 0 * SIZE] STF 
c06, [C3 + 1 * SIZE] STF c07, [C4 + 0 * SIZE] STF c08, [C4 + 1 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 add C3, 2 * SIZE, C3 add C4, 2 * SIZE, C4 #endif #ifdef RT sll K, 1 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 1 + BASE_SHIFT, TEMP2 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif .LL70: and M, 1, I cmp I, 0 ble,pn %icc, .LL99 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 0 + BASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 0 + BASE_SHIFT, TEMP1 sll KK, 2 + BASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 ble,pn %icc, .LL75 nop .LL72: FADD c01, t1, c01 add L, -1, L FMUL a1, b1, t1 LDF [BO + 4 * SIZE], b1 FADD c02, t2, c02 cmp L, 0 FMUL a1, b2, t2 LDF [BO + 5 * SIZE], b2 FADD c03, t3, c03 FMUL a1, b3, t3 LDF [BO + 6 * SIZE], b3 FADD c04, t4, c04 FMUL a1, b4, t4 LDF [BO + 7 * SIZE], b4 LDF [AO + 4 * SIZE], a1 FADD c01, t1, c01 add AO, 4 * SIZE, AO FMUL a2, b1, t1 LDF [BO + 8 * SIZE], b1 FADD c02, t2, c02 FMUL a2, b2, t2 LDF [BO + 9 * SIZE], b2 FADD c03, t3, c03 FMUL a2, b3, t3 LDF [BO + 10 * SIZE], b3 FADD c04, t4, c04 FMUL a2, b4, t4 LDF [BO + 11 * SIZE], b4 LDF [AO + 1 * SIZE], a2 FADD c01, t1, c01 FMUL a3, b1, t1 LDF [BO + 12 * SIZE], b1 FADD c02, t2, c02 FMUL a3, b2, t2 LDF [BO + 13 * SIZE], b2 FADD c03, t3, c03 FMUL a3, b3, t3 LDF [BO + 14 * SIZE], b3 FADD c04, t4, c04 FMUL a3, b4, t4 LDF [BO + 15 * SIZE], b4 LDF [AO + 2 * SIZE], a3 FADD c01, t1, c01 FMUL a4, b1, t1 LDF [BO + 16 * SIZE], b1 FADD c02, t2, c02 FMUL a4, b2, t2 LDF [BO + 17 * SIZE], b2 FADD c03, t3, c03 FMUL a4, b3, t3 LDF [BO + 18 * SIZE], b3 FADD c04, t4, c04 FMUL a4, b4, t4 LDF [BO + 19 * SIZE], b4 add BO, 16 * SIZE, BO bg,pt %icc, .LL72 LDF [AO + 3 * SIZE], a4 .LL75: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL79 nop .LL76: FADD c01, t1, c01 add AO, 1 * SIZE, AO FMUL a1, b1, t1 LDF [BO + 4 * SIZE], b1 FADD c02, t2, c02 add L, -1, L FMUL a1, b2, t2 LDF [BO + 5 * SIZE], b2 FADD c03, t3, c03 cmp L, 0 FMUL a1, b3, t3 LDF [BO + 6 * SIZE], b3 FADD c04, t4, c04 add BO, 4 * SIZE, BO FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 bg,pt %icc, .LL76 LDF [BO + 3 * SIZE], b4 .LL79: FADD c01, t1, c01 FADD c02, t2, c02 FADD c03, t3, c03 FADD c04, t4, c04 #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 4, TEMP1 #endif sll TEMP1, 0 + BASE_SHIFT, TEMP2 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #endif #ifdef LN LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 #endif 
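/* Added annotation (not in the original source): for the M & 1 tail the packed A block is a single element, so the LN solve above and the LT solve below reduce to the same scaling of c01..c04 by that element. */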
#ifdef LT LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 FMUL a1, c03, c03 FMUL a1, c04, c04 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a2, c01, t1 FSUB c02, t1, c02 FMUL a3, c01, t1 FSUB c03, t1, c03 FMUL a4, c01, t1 FSUB c04, t1, c04 LDF [BO + 5 * SIZE], a1 LDF [BO + 6 * SIZE], a2 LDF [BO + 7 * SIZE], a3 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c03, t1, c03 FMUL a3, c02, t1 FSUB c04, t1, c04 LDF [BO + 10 * SIZE], a1 LDF [BO + 11 * SIZE], a2 FMUL a1, c03, c03 FMUL a2, c03, t1 FSUB c04, t1, c04 LDF [BO + 15 * SIZE], a1 FMUL a1, c04, c04 #endif #ifdef RT LDF [BO + 15 * SIZE], a1 LDF [BO + 14 * SIZE], a2 LDF [BO + 13 * SIZE], a3 LDF [BO + 12 * SIZE], a4 FMUL a1, c04, c04 FMUL a2, c04, t1 FSUB c03, t1, c03 FMUL a3, c04, t1 FSUB c02, t1, c02 FMUL a4, c04, t1 FSUB c01, t1, c01 LDF [BO + 10 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 8 * SIZE], a3 FMUL a1, c03, c03 FMUL a2, c03, t1 FSUB c02, t1, c02 FMUL a3, c03, t1 FSUB c01, t1, c01 LDF [BO + 5 * SIZE], a1 LDF [BO + 4 * SIZE], a2 FMUL a1, c02, c02 FMUL a2, c02, t1 FSUB c01, t1, c01 LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LN add C1, -1 * SIZE, C1 add C2, -1 * SIZE, C2 add C3, -1 * SIZE, C3 add C4, -1 * SIZE, C4 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c03, [BO + 2 * SIZE] STF c04, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C2 + 0 * SIZE] STF c03, [C3 + 0 * SIZE] STF c04, [C4 + 0 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 1 * SIZE, C1 add C2, 1 * SIZE, C2 add C3, 1 * SIZE, C3 add C4, 1 * SIZE, C4 #endif #ifdef RT sll K, 0 + BASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 0 + BASE_SHIFT, TEMP2 sll TEMP1, 2 + BASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .LL99: #ifdef LN sll K, 2 + BASE_SHIFT, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 4, KK #endif #ifdef RT sub KK, 4, KK #endif add J, -1, J cmp J, 0 bg,pt %icc, .LL11 nop .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/trsm_kernel_RT_2x8.S000066400000000000000000002157751313527062700213340ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2005-2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define APREFETCHSIZE 24 #define APREFETCH_CATEGORY 0 #define M %i0 #define N %i1 #define K %i2 #if defined(DOUBLE) && !defined(__64BIT__) #define A %i5 #define B %i4 #else #define A %i4 #define B %i5 #endif #define C %o4 #define LDC %o5 #define AO %l0 #define BO %l1 #define I %l2 #define J %l3 #define L %l4 #define C1 %o0 #define C2 %o1 #define C3 %o2 #define C4 %o3 #define C5 %l5 #define C6 %l6 #define C7 %l7 #define C8 %i3 #define OFFSET %g1 #define KK %g2 #define TEMP1 %g3 #define TEMP2 %g4 #define AORIG %o7 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #define a1 %f32 #define a2 %f34 #define a3 %f36 #define a4 %f38 #define a5 %f40 #define b1 %f42 #define b2 %f44 #define b3 %f46 #define b4 %f48 #define b5 %f50 #define b6 %f52 #define b7 %f54 #define b8 %f56 #define b9 %f58 #define cc01 0 #define cc02 2 #define cc03 4 #define cc04 6 #define cc05 8 #define cc06 10 #define cc07 12 #define cc08 14 #define cc09 16 #define cc10 18 #define cc11 20 #define cc12 22 #define cc13 24 #define cc14 26 #define cc15 28 #define cc16 30 #define aa1 1 #define aa2 3 #define aa3 5 #define aa4 7 #define aa5 9 #define bb1 11 #define bb2 13 #define bb3 15 #define bb4 17 #define bb5 19 #define bb6 21 #define bb7 23 #define bb8 25 #define bb9 27 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #define a1 %f16 #define a2 %f17 #define a3 %f18 #define a4 %f19 #define a5 %f20 #define b1 %f21 #define b2 %f22 #define b3 %f23 #define b4 %f24 #define b5 %f25 #define b6 %f26 #define b7 %f27 #define b8 %f28 #define b9 %f29 #define cc01 0 #define cc02 1 #define cc03 2 #define cc04 3 #define cc05 4 #define cc06 5 #define cc07 6 #define cc08 7 #define cc09 8 #define cc10 9 #define cc11 10 #define cc12 11 #define cc13 12 #define cc14 13 #define cc15 14 #define cc16 15 #define aa1 16 #define aa2 17 #define aa3 18 #define aa4 19 #define aa5 20 #define bb1 21 #define bb2 22 #define bb3 23 #define bb4 24 #define bb5 25 #define bb6 26 #define bb7 27 #define bb8 28 #define bb9 29 #endif .register %g2, #scratch .register %g3, #scratch PROLOGUE SAVESP nop #ifndef __64BIT__ #ifdef DOUBLE ld [%sp + STACK_START + 28], B ld [%sp + STACK_START + 32], C ld 
[%sp + STACK_START + 36], LDC ld [%sp + STACK_START + 40], OFFSET #else ld [%sp + STACK_START + 28], C ld [%sp + STACK_START + 32], LDC ld [%sp + STACK_START + 36], OFFSET #endif st %g1, [%sp + STACK_START + 8] st %g2, [%sp + STACK_START + 12] st %g3, [%sp + STACK_START + 16] st %g4, [%sp + STACK_START + 20] #else ldx [%sp+ STACK_START + 56], C ldx [%sp+ STACK_START + 64], LDC ldx [%sp+ STACK_START + 72], OFFSET stx %g1, [%sp + STACK_START + 32] stx %g2, [%sp + STACK_START + 40] stx %g3, [%sp + STACK_START + 48] stx %g4, [%sp + STACK_START + 56] #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg OFFSET, KK #endif sll LDC, BASE_SHIFT, LDC #ifdef LN smul M, K, TEMP1 sll TEMP1, BASE_SHIFT, TEMP1 add A, TEMP1, A sll M, BASE_SHIFT, TEMP1 add C, TEMP1, C #endif #ifdef RN neg OFFSET, KK #endif #ifdef RT smul N, K, TEMP1 sll TEMP1, BASE_SHIFT, TEMP1 add B, TEMP1, B smul N, LDC, TEMP1 add C, TEMP1, C sub N, OFFSET, KK #endif and N, 1, J cmp J, 0 ble,pn %icc, .LL50 nop #ifdef RT sll K, BASE_SHIFT, TEMP1 sub B, TEMP1, B #endif #ifndef RT mov C, C1 add C1, LDC, C #else sub C, LDC, C1 sub C, LDC, C #endif #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif sra M, 1, I cmp I, 0 ble,pn %icc, .LL80 nop .align 4 .LL72: #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 1, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TEMP1 sll KK, BASE_SHIFT + 0, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 FCLR (cc01) LDF [BO + 3 * SIZE], b4 FCLR (cc02) prefetch [C1 + 2 * SIZE], 3 #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL75 nop .LL73: prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY add L, -1, L FMADD (aa1, bb1, cc01, cc01) LDF [AO + 4 * SIZE], a1 FMADD (aa2, bb1, cc02, cc02) LDF [AO + 5 * SIZE], a2 LDF [BO + 4 * SIZE], b1 cmp L, 0 FMADD (aa3, bb2, cc01, cc01) LDF [AO + 6 * SIZE], a3 FMADD (aa4, bb2, cc02, cc02) LDF [AO + 7 * SIZE], a4 LDF [BO + 5 * SIZE], b2 add BO, 4 * SIZE, BO FMADD (aa1, bb3, cc01, cc01) LDF [AO + 8 * SIZE], a1 FMADD (aa2, bb3, cc02, cc02) LDF [AO + 9 * SIZE], a2 LDF [BO + 2 * SIZE], b3 add AO, 8 * SIZE, AO FMADD (aa3, bb4, cc01, cc01) LDF [AO + 2 * SIZE], a3 FMADD (aa4, bb4, cc02, cc02) LDF [AO + 3 * SIZE], a4 bg,pt %icc, .LL73 LDF [BO + 3 * SIZE], b4 .align 4 .LL75: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL78 nop .align 4 .LL77: FMADD (aa1, bb1, cc01, cc01) LDF [AO + 2 * SIZE], a1 FMADD (aa2, bb1, cc02, cc02) LDF [AO + 3 * SIZE], a2 LDF [BO + 1 * SIZE], b1 add L, -1, L add AO, 2 * SIZE, AO cmp L, 0 bg,pt %icc, .LL77 add BO, 1 * SIZE, BO .align 4 .LL78: #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 1, TEMP1 #endif sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 0, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c02, c02 FNMSUB (aa2, cc02, cc01, cc01) FMUL a3, c01, c01 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * 
SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FNMSUB (aa2, cc01, cc02, cc02) FMUL a3, c02, c02 #endif #if defined(RN) || defined(RT) LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c02, c02 #endif #ifdef LN add C1, -2 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] #ifndef LN add C1, 2 * SIZE, C1 #endif #ifdef RT sll K, BASE_SHIFT + 1, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 0, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL72 nop .align 4 .LL80: and M, 1, I cmp I, 0 ble,pn %icc, .LL89 nop #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 0, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TEMP1 sll KK, BASE_SHIFT + 0, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [BO + 0 * SIZE], b1 LDF [AO + 1 * SIZE], a2 LDF [BO + 1 * SIZE], b2 LDF [AO + 2 * SIZE], a3 LDF [BO + 2 * SIZE], b3 LDF [AO + 3 * SIZE], a4 LDF [BO + 3 * SIZE], b4 #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL85 FCLR (cc01) .align 4 .LL83: prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY add L, -1, L FMADD (aa1, bb1, cc01, cc01) LDF [AO + 4 * SIZE], a1 LDF [BO + 4 * SIZE], b1 FMADD (aa2, bb2, cc01, cc01) LDF [AO + 5 * SIZE], a2 LDF [BO + 5 * SIZE], b2 FMADD (aa3, bb3, cc01, cc01) LDF [AO + 6 * SIZE], a3 LDF [BO + 6 * SIZE], b3 FMADD (aa4, bb4, cc01, cc01) LDF [AO + 7 * SIZE], a4 LDF [BO + 7 * SIZE], b4 add AO, 4 * SIZE, AO cmp L, 0 bg,pt %icc, .LL83 add BO, 4 * SIZE, BO .align 4 .LL85: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL88 nop .align 4 .LL87: FMADD (aa1, bb1, cc01, cc01) LDF [AO + 1 * SIZE], a1 LDF [BO + 1 * SIZE], b1 add AO, 1 * SIZE, AO add L, -1, L cmp L, 0 bg,pt %icc, .LL87 add BO, 1 * SIZE, BO .align 4 .LL88: #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 1, TEMP1 #endif sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 0, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 FSUB a1, c01, c01 #else LDF [AO + 0 * SIZE], a1 FSUB a1, c01, c01 #endif #if defined(LN) || defined(LT) LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #if defined(RN) || defined(RT) LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LN add C1, -1 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] #else STF c01, [AO + 0 * SIZE] #endif STF c01, [C1 + 0 * SIZE] #ifdef RT sll K, BASE_SHIFT + 0, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 0, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .align 4 .LL89: #ifdef LN sll K, BASE_SHIFT, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 1, KK #endif #ifdef RT sub KK, 1, KK #endif .align 4 .LL50: and N, 2, J cmp J, 0 ble,pn %icc, .LL30 nop #ifdef RT sll K, BASE_SHIFT + 1, TEMP1 sub B, TEMP1, B #endif #ifndef RT mov C, C1 add C, LDC, C2 add C2, LDC, C #else sub C, LDC, C2 sub C2, LDC, C1 sub C2, LDC, C #endif 
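/* Added annotation (not in the original source): .LL50 (N & 2) - B has been stepped back by 2*K elements in the RT case and C1/C2 now address the two output columns; the code below picks the starting KK and the A pointer for the selected offset mode before entering the M loop. */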
#ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif sra M, 1, I cmp I, 0 ble,pn %icc, .LL60 nop .align 4 .LL52: #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 1, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TEMP1 sll KK, BASE_SHIFT + 1, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 FCLR (cc01) LDF [BO + 3 * SIZE], b4 FCLR (cc02) LDF [BO + 4 * SIZE], b5 FCLR (cc03) LDF [BO + 5 * SIZE], b6 FCLR (cc04) LDF [BO + 6 * SIZE], b7 FCLR (cc05) LDF [BO + 7 * SIZE], b8 FCLR (cc06) prefetch [C1 + 2 * SIZE], 3 FCLR (cc07) prefetch [C2 + 2 * SIZE], 3 FCLR (cc08) #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL55 nop .align 4 .LL53: FMADD (aa1, bb1, cc01, cc01) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb1, cc02, cc02) LDF [BO + 8 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [AO + 4 * SIZE], a1 FMADD (aa2, bb2, cc04, cc04) LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb3, cc01, cc01) LDF [BO + 9 * SIZE], b2 FMADD (aa4, bb3, cc02, cc02) LDF [BO + 10 * SIZE], b3 FMADD (aa3, bb4, cc03, cc03) LDF [AO + 6 * SIZE], a3 FMADD (aa4, bb4, cc04, cc04) LDF [AO + 7 * SIZE], a4 FMADD (aa1, bb5, cc01, cc01) LDF [BO + 11 * SIZE], b4 FMADD (aa2, bb5, cc02, cc02) LDF [BO + 12 * SIZE], b5 FMADD (aa1, bb6, cc03, cc03) LDF [AO + 8 * SIZE], a1 FMADD (aa2, bb6, cc04, cc04) LDF [AO + 9 * SIZE], a2 FMADD (aa3, bb7, cc01, cc01) LDF [BO + 13 * SIZE], b6 FMADD (aa4, bb7, cc02, cc02) LDF [BO + 14 * SIZE], b7 FMADD (aa3, bb8, cc03, cc03) LDF [AO + 10 * SIZE], a3 FMADD (aa4, bb8, cc04, cc04) LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO add L, -1, L add BO, 8 * SIZE, BO cmp L, 0 bg,pt %icc, .LL53 LDF [BO + 7 * SIZE], b8 .align 4 .LL55: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL58 nop .align 4 .LL57: FMADD (aa1, bb1, cc01, cc01) add L, -1, L FMADD (aa2, bb1, cc02, cc02) LDF [BO + 2 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [AO + 2 * SIZE], a1 FMADD (aa2, bb2, cc04, cc04) LDF [AO + 3 * SIZE], a2 add AO, 2 * SIZE, AO cmp L, 0 add BO, 2 * SIZE, BO bg,pt %icc, .LL57 LDF [BO + 1 * SIZE], b2 .align 4 .LL58: #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 2, TEMP1 #endif sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 1, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c02, c02 FSUB a4, c04, c04 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c04, c04 FNMSUB (aa2, cc02, cc01, cc01) FNMSUB (aa2, cc04, cc03, cc03) FMUL a3, c01, c01 FMUL a3, c03, c03 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c03, c03 FNMSUB (aa2, cc01, cc02, cc02) FNMSUB (aa2, cc03, cc04, cc04) FMUL a3, c02, c02 FMUL a3, c04, c04 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FMUL a1, c01, c01 FMUL a1, 
c02, c02 FNMSUB (aa2, cc01, cc03, cc03) FNMSUB (aa2, cc02, cc04, cc04) LDF [BO + 3 * SIZE], a1 FMUL a1, c03, c03 FMUL a1, c04, c04 #endif #ifdef RT LDF [BO + 3 * SIZE], a1 LDF [BO + 2 * SIZE], a2 FMUL a1, c04, c04 FMUL a1, c03, c03 FNMSUB (aa2, cc04, cc02, cc02) FNMSUB (aa2, cc03, cc01, cc01) LDF [BO + 0 * SIZE], a1 FMUL a1, c02, c02 FMUL a1, c01, c01 #endif #ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] STF c02, [BO + 2 * SIZE] STF c04, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C2 + 0 * SIZE] STF c04, [C2 + 1 * SIZE] #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 #endif #ifdef RT sll K, BASE_SHIFT + 1, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 1, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL52 nop .align 4 .LL60: and M, 1, I cmp I, 0 ble,pn %icc, .LL69 nop #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 0, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TEMP1 sll KK, BASE_SHIFT + 1, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 LDF [BO + 3 * SIZE], b4 LDF [BO + 4 * SIZE], b5 LDF [BO + 5 * SIZE], b6 LDF [BO + 6 * SIZE], b7 FCLR (cc01) LDF [BO + 7 * SIZE], b8 FCLR (cc03) #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL65 nop .align 4 .LL63: prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY add L, -1, L FMADD (aa1, bb1, cc01, cc01) LDF [BO + 8 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 9 * SIZE], b2 LDF [AO + 4 * SIZE], a1 cmp L, 0 FMADD (aa2, bb3, cc01, cc01) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc03, cc03) LDF [BO + 11 * SIZE], b4 LDF [AO + 5 * SIZE], a2 add AO, 4 * SIZE, AO FMADD (aa3, bb5, cc01, cc01) LDF [BO + 12 * SIZE], b5 FMADD (aa3, bb6, cc03, cc03) LDF [BO + 13 * SIZE], b6 LDF [AO + 2 * SIZE], a3 add BO, 8 * SIZE, BO FMADD (aa4, bb7, cc01, cc01) LDF [BO + 6 * SIZE], b7 FMADD (aa4, bb8, cc03, cc03) LDF [BO + 7 * SIZE], b8 bg,pt %icc, .LL63 LDF [AO + 3 * SIZE], a4 .align 4 .LL65: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL68 nop .align 4 .LL67: FMADD (aa1, bb1, cc01, cc01) LDF [BO + 2 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 3 * SIZE], b2 LDF [AO + 1 * SIZE], a1 add L, -1, L add AO, 1 * SIZE, AO cmp L, 0 bg,pt %icc, .LL67 add BO, 2 * SIZE, BO .align 4 .LL68: #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 2, TEMP1 #endif sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 1, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c03, c03 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c03, c03 #endif #if defined(LN) || defined(LT) LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c03, c03 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FMUL a1, c01, c01 FNMSUB (aa2, cc01, cc03, cc03) LDF [BO + 
3 * SIZE], a1 FMUL a1, c03, c03 #endif #ifdef RT LDF [BO + 3 * SIZE], a1 LDF [BO + 2 * SIZE], a2 FMUL a1, c03, c03 FNMSUB (aa2, cc03, cc01, cc01) LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LN add C1, -1 * SIZE, C1 add C2, -1 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c03, [AO + 1 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c03, [C2 + 0 * SIZE] #ifdef RT sll K, BASE_SHIFT + 0, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 1, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .align 4 .LL69: #ifdef LN sll K, BASE_SHIFT + 1, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 2, KK #endif #ifdef RT sub KK, 2, KK #endif .align 4 .LL30: and N, 4, J cmp J, 0 ble,pn %icc, .LL10 nop #ifdef RT sll K, BASE_SHIFT + 2, TEMP1 sub B, TEMP1, B #endif #ifndef RT mov C, C1 add C, LDC, C2 add C2, LDC, C3 add C3, LDC, C4 add C4, LDC, C #else sub C, LDC, C4 sub C4, LDC, C3 sub C3, LDC, C2 sub C2, LDC, C1 sub C2, LDC, C #endif #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif sra M, 1, I cmp I, 0 ble,pn %icc, .LL40 nop .align 4 .LL32: #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 1, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TEMP1 sll KK, BASE_SHIFT + 2, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 LDF [BO + 3 * SIZE], b4 LDF [BO + 4 * SIZE], b5 LDF [BO + 5 * SIZE], b6 FCLR (cc01) LDF [BO + 6 * SIZE], b7 FCLR (cc02) LDF [BO + 7 * SIZE], b8 FCLR (cc03) LDF [BO + 8 * SIZE], b9 FCLR (cc04) prefetch [C1 + 2 * SIZE], 3 FCLR (cc05) prefetch [C2 + 2 * SIZE], 3 FCLR (cc06) prefetch [C3 + 2 * SIZE], 3 FCLR (cc07) prefetch [C4 + 2 * SIZE], 3 FCLR (cc08) #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL35 nop .align 4 .LL33: FMADD (aa1, bb1, cc01, cc01) LDF [AO + 2 * SIZE], a3 FMADD (aa2, bb1, cc02, cc02) LDF [AO + 3 * SIZE], a4 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 16 * SIZE], b1 FMADD (aa2, bb2, cc04, cc04) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb3, cc06, cc06) add L, -1, L FMADD (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD (aa3, bb5, cc01, cc01) LDF [AO + 4 * SIZE], a1 FMADD (aa4, bb5, cc02, cc02) LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb6, cc03, cc03) LDF [BO + 12 * SIZE], b5 FMADD (aa4, bb6, cc04, cc04) LDF [BO + 13 * SIZE], b6 FMADD (aa3, bb7, cc05, cc05) cmp L, 0 FMADD (aa4, bb7, cc06, cc06) add AO, 8 * SIZE, AO FMADD (aa3, bb8, cc07, cc07) LDF [BO + 14 * SIZE], b7 FMADD (aa4, bb8, cc08, cc08) LDF [BO + 15 * SIZE], b8 FMADD (aa1, bb9, cc01, cc01) LDF [AO - 2 * SIZE], a3 FMADD (aa2, bb9, cc02, cc02) LDF [AO - 1 * SIZE], a4 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 24 * SIZE], b9 FMADD (aa2, bb2, cc04, cc04) LDF [BO + 17 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) add BO, 16 * SIZE, BO FMADD (aa2, bb3, cc06, cc06) nop FMADD (aa1, bb4, cc07, cc07) LDF [BO + 2 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD (aa3, bb5, cc01, cc01) LDF [AO + 0 * 
SIZE], a1 FMADD (aa4, bb5, cc02, cc02) LDF [AO + 1 * SIZE], a2 FMADD (aa3, bb6, cc03, cc03) LDF [BO + 4 * SIZE], b5 FMADD (aa4, bb6, cc04, cc04) LDF [BO + 5 * SIZE], b6 FMADD (aa3, bb7, cc05, cc05) nop FMADD (aa4, bb7, cc06, cc06) LDF [BO + 6 * SIZE], b7 FMADD (aa3, bb8, cc07, cc07) FMADD (aa4, bb8, cc08, cc08) bg,pt %icc, .LL33 LDF [BO + 7 * SIZE], b8 .align 4 .LL35: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL38 nop .align 4 .LL37: FMADD (aa1, bb1, cc01, cc01) add L, -1, L FMADD (aa2, bb1, cc02, cc02) LDF [BO + 4 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) add AO, 2 * SIZE, AO FMADD (aa2, bb2, cc04, cc04) LDF [BO + 5 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) cmp L, 0 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 6 * SIZE], b3 FMADD (aa1, bb4, cc07, cc07) LDF [AO + 0 * SIZE], a1 FMADD (aa2, bb4, cc08, cc08) LDF [AO + 1 * SIZE], a2 LDF [BO + 7 * SIZE], b4 bg,pt %icc, .LL37 add BO, 4 * SIZE, BO .align 4 .LL38: #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 4, TEMP1 #endif sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 2, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 FSUB b1, c02, c02 FSUB b2, c04, c04 FSUB b3, c06, c06 FSUB b4, c08, c08 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c05, c05 FSUB b2, c06, c06 FSUB b3, c07, c07 FSUB b4, c08, c08 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c04, c04 FMUL a1, c06, c06 FMUL a1, c08, c08 FNMSUB (aa2, cc02, cc01, cc01) FNMSUB (aa2, cc04, cc03, cc03) FNMSUB (aa2, cc06, cc05, cc05) FNMSUB (aa2, cc08, cc07, cc07) FMUL a3, c01, c01 FMUL a3, c03, c03 FMUL a3, c05, c05 FMUL a3, c07, c07 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c03, c03 FMUL a1, c05, c05 FMUL a1, c07, c07 FNMSUB (aa2, cc01, cc02, cc02) FNMSUB (aa2, cc03, cc04, cc04) FNMSUB (aa2, cc05, cc06, cc06) FNMSUB (aa2, cc07, cc08, cc08) FMUL a3, c02, c02 FMUL a3, c04, c04 FMUL a3, c06, c06 FMUL a3, c08, c08 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FMUL a1, c01, c01 FMUL a1, c02, c02 FNMSUB (aa2, cc01, cc03, cc03) FNMSUB (aa2, cc02, cc04, cc04) FNMSUB (aa3, cc01, cc05, cc05) FNMSUB (aa3, cc02, cc06, cc06) FNMSUB (aa4, cc01, cc07, cc07) FNMSUB (aa4, cc02, cc08, cc08) LDF [BO + 5 * SIZE], a1 LDF [BO + 6 * SIZE], a2 LDF [BO + 7 * SIZE], a3 FMUL a1, c03, c03 FMUL a1, c04, c04 FNMSUB (aa2, cc03, cc05, cc05) FNMSUB (aa2, cc04, cc06, cc06) FNMSUB (aa3, cc03, cc07, cc07) FNMSUB (aa3, cc04, cc08, cc08) LDF [BO + 10 * SIZE], a1 LDF [BO + 11 * SIZE], a2 FMUL a1, c05, c05 FMUL a1, c06, c06 FNMSUB (aa2, cc05, cc07, cc07) FNMSUB (aa2, cc06, cc08, cc08) LDF [BO + 15 * SIZE], a1 FMUL a1, c07, c07 FMUL a1, c08, c08 #endif #ifdef RT LDF [BO + 15 * SIZE], a1 LDF [BO + 14 * SIZE], a2 LDF [BO + 13 * SIZE], a3 LDF [BO + 12 * SIZE], a4 FMUL a1, c08, c08 FMUL a1, c07, c07 FNMSUB (aa2, cc08, cc06, 
cc06) FNMSUB (aa2, cc07, cc05, cc05) FNMSUB (aa3, cc08, cc04, cc04) FNMSUB (aa3, cc07, cc03, cc03) FNMSUB (aa4, cc08, cc02, cc02) FNMSUB (aa4, cc07, cc01, cc01) LDF [BO + 10 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 8 * SIZE], a3 FMUL a1, c06, c06 FMUL a1, c05, c05 FNMSUB (aa2, cc06, cc04, cc04) FNMSUB (aa2, cc05, cc03, cc03) FNMSUB (aa3, cc06, cc02, cc02) FNMSUB (aa3, cc05, cc01, cc01) LDF [BO + 5 * SIZE], a1 LDF [BO + 4 * SIZE], a2 FMUL a1, c04, c04 FMUL a1, c03, c03 FNMSUB (aa2, cc04, cc02, cc02) FNMSUB (aa2, cc03, cc01, cc01) LDF [BO + 0 * SIZE], a1 FMUL a1, c02, c02 FMUL a1, c01, c01 #endif #ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 add C3, -2 * SIZE, C3 add C4, -2 * SIZE, C4 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c07, [BO + 3 * SIZE] STF c02, [BO + 4 * SIZE] STF c04, [BO + 5 * SIZE] STF c06, [BO + 6 * SIZE] STF c08, [BO + 7 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] STF c05, [AO + 4 * SIZE] STF c06, [AO + 5 * SIZE] STF c07, [AO + 6 * SIZE] STF c08, [AO + 7 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C2 + 0 * SIZE] STF c04, [C2 + 1 * SIZE] STF c05, [C3 + 0 * SIZE] STF c06, [C3 + 1 * SIZE] STF c07, [C4 + 0 * SIZE] STF c08, [C4 + 1 * SIZE] #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 add C3, 2 * SIZE, C3 add C4, 2 * SIZE, C4 #endif #ifdef RT sll K, BASE_SHIFT + 1, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 2, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL32 nop .LL40: and M, 1, I cmp I, 0 ble,pn %icc, .LL49 nop #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 0, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TEMP1 sll KK, BASE_SHIFT + 2, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 LDF [BO + 3 * SIZE], b4 LDF [BO + 4 * SIZE], b5 LDF [BO + 5 * SIZE], b6 FCLR (cc01) LDF [BO + 6 * SIZE], b7 FCLR (cc03) LDF [BO + 7 * SIZE], b8 FCLR (cc05) LDF [BO + 8 * SIZE], b9 FCLR (cc07) #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL45 nop .LL43: prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY add L, -1, L FMADD (aa1, bb1, cc01, cc01) LDF [BO + 16 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) LDF [BO + 10 * SIZE], b3 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 11 * SIZE], b4 LDF [AO + 4 * SIZE], a1 cmp L, 0 FMADD (aa2, bb5, cc01, cc01) LDF [BO + 12 * SIZE], b5 FMADD (aa2, bb6, cc03, cc03) LDF [BO + 13 * SIZE], b6 FMADD (aa2, bb7, cc05, cc05) LDF [BO + 14 * SIZE], b7 FMADD (aa2, bb8, cc07, cc07) LDF [BO + 15 * SIZE], b8 LDF [AO + 5 * SIZE], a2 add AO, 4 * SIZE, AO FMADD (aa3, bb9, cc01, cc01) LDF [BO + 24 * SIZE], b9 FMADD (aa3, bb2, cc03, cc03) LDF [BO + 17 * SIZE], b2 FMADD (aa3, bb3, cc05, cc05) LDF [BO + 18 * SIZE], b3 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 19 * SIZE], b4 LDF [AO + 2 * SIZE], a3 add BO, 16 * SIZE, BO FMADD (aa4, bb5, cc01, cc01) LDF [BO + 4 * SIZE], b5 FMADD (aa4, bb6, cc03, cc03) LDF [BO + 5 * SIZE], b6 FMADD (aa4, bb7, cc05, cc05) LDF [BO + 6 * SIZE], b7 
FMADD (aa4, bb8, cc07, cc07) LDF [BO + 7 * SIZE], b8 bg,pt %icc, .LL43 LDF [AO + 3 * SIZE], a4 .align 4 .LL45: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL48 nop .align 4 .LL47: FMADD (aa1, bb1, cc01, cc01) LDF [BO + 4 * SIZE], b1 add L, -1, L FMADD (aa1, bb2, cc03, cc03) LDF [BO + 5 * SIZE], b2 add AO, 1 * SIZE, AO FMADD (aa1, bb3, cc05, cc05) LDF [BO + 6 * SIZE], b3 cmp L, 0 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 7 * SIZE], b4 add BO, 4 * SIZE, BO bg,pt %icc, .LL47 LDF [AO + 0 * SIZE], a1 .align 4 .LL48: #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 4, TEMP1 #endif sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 2, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 #endif #if defined(LN) || defined(LT) LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c03, c03 FMUL a1, c05, c05 FMUL a1, c07, c07 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FMUL a1, c01, c01 FNMSUB (aa2, cc01, cc03, cc03) FNMSUB (aa3, cc01, cc05, cc05) FNMSUB (aa4, cc01, cc07, cc07) LDF [BO + 5 * SIZE], a1 LDF [BO + 6 * SIZE], a2 LDF [BO + 7 * SIZE], a3 FMUL a1, c03, c03 FNMSUB (aa2, cc03, cc05, cc05) FNMSUB (aa3, cc03, cc07, cc07) LDF [BO + 10 * SIZE], a1 LDF [BO + 11 * SIZE], a2 FMUL a1, c05, c05 FNMSUB (aa2, cc05, cc07, cc07) LDF [BO + 15 * SIZE], a1 FMUL a1, c07, c07 #endif #ifdef RT LDF [BO + 15 * SIZE], a1 LDF [BO + 14 * SIZE], a2 LDF [BO + 13 * SIZE], a3 LDF [BO + 12 * SIZE], a4 FMUL a1, c07, c07 FNMSUB (aa2, cc07, cc05, cc05) FNMSUB (aa3, cc07, cc03, cc03) FNMSUB (aa4, cc07, cc01, cc01) LDF [BO + 10 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 8 * SIZE], a3 FMUL a1, c05, c05 FNMSUB (aa2, cc05, cc03, cc03) FNMSUB (aa3, cc05, cc01, cc01) LDF [BO + 5 * SIZE], a1 LDF [BO + 4 * SIZE], a2 FMUL a1, c03, c03 FNMSUB (aa2, cc03, cc01, cc01) LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LN add C1, -1 * SIZE, C1 add C2, -1 * SIZE, C2 add C3, -1 * SIZE, C3 add C4, -1 * SIZE, C4 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c07, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c03, [AO + 1 * SIZE] STF c05, [AO + 2 * SIZE] STF c07, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c03, [C2 + 0 * SIZE] STF c05, [C3 + 0 * SIZE] STF c07, [C4 + 0 * SIZE] #ifdef RT sll K, BASE_SHIFT + 0, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 2, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .align 4 .LL49: #ifdef LN sll K, BASE_SHIFT + 2, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 4, KK #endif #ifdef RT sub KK, 4, KK #endif .align 4 .LL10: sra N, 3, J cmp J, 0 ble,pn %icc, .LL999 nop .align 4 .LL11: #ifdef RT sll K, BASE_SHIFT + 3, TEMP1 sub B, TEMP1, B #endif #ifndef RT mov C, C1 add C, LDC, C2 add C2, LDC, C3 add C3, LDC, C4 add C4, LDC, C5 add C5, LDC, C6 add C6, LDC, C7 add C7, LDC, C8 add C8, LDC, C #else sub C, 
LDC, C8 sub C8, LDC, C7 sub C7, LDC, C6 sub C6, LDC, C5 sub C5, LDC, C4 sub C4, LDC, C3 sub C3, LDC, C2 sub C2, LDC, C1 sub C2, LDC, C #endif #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif sra M, 1, I cmp I, 0 ble,pn %icc, .LL20 nop .align 4 .LL12: #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 1, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TEMP1 sll KK, BASE_SHIFT + 3, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 8 * SIZE], a5 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FCLR (cc01) LDF [BO + 2 * SIZE], b3 FCLR (cc05) LDF [BO + 3 * SIZE], b4 FCLR (cc09) LDF [BO + 4 * SIZE], b5 FCLR (cc13) LDF [BO + 5 * SIZE], b6 FCLR (cc02) LDF [BO + 6 * SIZE], b7 FCLR (cc06) LDF [BO + 7 * SIZE], b8 FCLR (cc10) LDF [BO + 8 * SIZE], b9 FCLR (cc14) prefetch [C1 + 1 * SIZE], 3 FCLR (cc03) prefetch [C2 + 2 * SIZE], 3 FCLR (cc07) prefetch [C3 + 1 * SIZE], 3 FCLR (cc11) prefetch [C4 + 2 * SIZE], 3 FCLR (cc15) prefetch [C5 + 1 * SIZE], 3 FCLR (cc04) prefetch [C6 + 2 * SIZE], 3 FCLR (cc08) prefetch [C7 + 1 * SIZE], 3 FCLR (cc12) prefetch [C8 + 2 * SIZE], 3 FCLR (cc16) #if defined(LT) || defined(RN) sra KK, 3, L #else sub K, KK, L sra L, 3, L #endif cmp L, 0 ble,pn %icc, .LL15 nop .align 4 .LL13: FMADD (aa1, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa1, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa1, bb3, cc05, cc05) LDF [BO + 16 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [AO + 2 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 3 * SIZE], a4 FMADD (aa1, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa1, bb7, cc13, cc13) LDF [BO + 12 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 14 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 15 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 24 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 17 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 18 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 19 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 4 * SIZE], a1 FMADD (aa4, bb5, cc10, cc10) LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) add L, -1, L FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 20 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 21 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 22 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 23 * SIZE], b8 FMADD (aa1, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa1, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa1, bb3, cc05, cc05) LDF [BO + 32 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 25 * SIZE], b2 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 26 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 27 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [AO + 6 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 7 * SIZE], a4 FMADD (aa1, bb6, cc11, cc11) nop FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa1, bb7, cc13, cc13) LDF [BO + 28 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 29 * SIZE], b6 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 30 * 
SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 31 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 40 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 33 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 34 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 35 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 16 * SIZE], a1 /****/ FMADD (aa4, bb5, cc10, cc10) LDF [AO + 9 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) nop FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 36 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 37 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 38 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 39 * SIZE], b8 FMADD (aa5, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa5, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa5, bb3, cc05, cc05) LDF [BO + 48 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 41 * SIZE], b2 FMADD (aa5, bb4, cc07, cc07) LDF [BO + 42 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 43 * SIZE], b4 FMADD (aa5, bb5, cc09, cc09) LDF [AO + 10 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 11 * SIZE], a4 FMADD (aa5, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa5, bb7, cc13, cc13) LDF [BO + 44 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 45 * SIZE], b6 FMADD (aa5, bb8, cc15, cc15) LDF [BO + 46 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 47 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 56 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 49 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 50 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 51 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 12 * SIZE], a5 FMADD (aa4, bb5, cc10, cc10) LDF [AO + 13 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) cmp L, 0 FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 52 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 53 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 54 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 55 * SIZE], b8 FMADD (aa5, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa5, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa5, bb3, cc05, cc05) LDF [BO + 64 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 57 * SIZE], b2 FMADD (aa5, bb4, cc07, cc07) LDF [BO + 58 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 59 * SIZE], b4 FMADD (aa5, bb5, cc09, cc09) LDF [AO + 14 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 15 * SIZE], a4 FMADD (aa5, bb6, cc11, cc11) add BO, 64 * SIZE, BO FMADD (aa2, bb6, cc12, cc12) add AO, 16 * SIZE, AO FMADD (aa5, bb7, cc13, cc13) LDF [BO - 4 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO - 3 * SIZE], b6 FMADD (aa5, bb8, cc15, cc15) LDF [BO - 2 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO - 1 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 8 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 1 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 2 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 8 * SIZE], a5 /****/ FMADD (aa4, bb5, cc10, cc10) LDF [AO + 1 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) FMADD (aa4, bb6, cc12, cc12) 
FMADD (aa3, bb7, cc13, cc13) LDF [BO + 4 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 5 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 6 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) ble,pn %icc, .LL15 LDF [BO + 7 * SIZE], b8 FMADD (aa1, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa1, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa1, bb3, cc05, cc05) LDF [BO + 16 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [AO + 2 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 3 * SIZE], a4 FMADD (aa1, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa1, bb7, cc13, cc13) LDF [BO + 12 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 14 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 15 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 24 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 17 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 18 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 19 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 4 * SIZE], a1 FMADD (aa4, bb5, cc10, cc10) LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) add L, -1, L FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 20 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 21 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 22 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 23 * SIZE], b8 FMADD (aa1, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa1, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa1, bb3, cc05, cc05) LDF [BO + 32 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 25 * SIZE], b2 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 26 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 27 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [AO + 6 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 7 * SIZE], a4 FMADD (aa1, bb6, cc11, cc11) nop FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa1, bb7, cc13, cc13) LDF [BO + 28 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 29 * SIZE], b6 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 30 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 31 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 40 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 33 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 34 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 35 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 16 * SIZE], a1 /****/ FMADD (aa4, bb5, cc10, cc10) LDF [AO + 9 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) nop FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 36 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 37 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 38 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 39 * SIZE], b8 FMADD (aa5, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa5, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa5, bb3, cc05, cc05) LDF [BO + 48 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 41 * SIZE], b2 FMADD (aa5, bb4, cc07, cc07) LDF [BO + 42 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 43 * SIZE], b4 FMADD (aa5, bb5, cc09, cc09) LDF [AO 
+ 10 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 11 * SIZE], a4 FMADD (aa5, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa5, bb7, cc13, cc13) LDF [BO + 44 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 45 * SIZE], b6 FMADD (aa5, bb8, cc15, cc15) LDF [BO + 46 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 47 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 56 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 49 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 50 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 51 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 12 * SIZE], a5 FMADD (aa4, bb5, cc10, cc10) LDF [AO + 13 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) cmp L, 0 FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 52 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 53 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 54 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 55 * SIZE], b8 FMADD (aa5, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa5, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa5, bb3, cc05, cc05) LDF [BO + 64 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 57 * SIZE], b2 FMADD (aa5, bb4, cc07, cc07) LDF [BO + 58 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 59 * SIZE], b4 FMADD (aa5, bb5, cc09, cc09) LDF [AO + 14 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 15 * SIZE], a4 FMADD (aa5, bb6, cc11, cc11) add BO, 64 * SIZE, BO FMADD (aa2, bb6, cc12, cc12) add AO, 16 * SIZE, AO FMADD (aa5, bb7, cc13, cc13) LDF [BO - 4 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO - 3 * SIZE], b6 FMADD (aa5, bb8, cc15, cc15) LDF [BO - 2 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO - 1 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 8 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 1 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 2 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 8 * SIZE], a5 /****/ FMADD (aa4, bb5, cc10, cc10) LDF [AO + 1 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) FMADD (aa4, bb6, cc12, cc12) FMADD (aa3, bb7, cc13, cc13) LDF [BO + 4 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 5 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 6 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) bg,pt %icc, .LL13 LDF [BO + 7 * SIZE], b8 .align 4 .LL15: #if defined(LT) || defined(RN) and KK, 7, L #else sub K, KK, L and L, 7, L #endif cmp L, 0 ble,a,pn %icc, .LL18 nop .align 4 .LL17: FMADD (aa1, bb1, cc01, cc01) add L, -1, L FMADD (aa2, bb1, cc02, cc02) nop FMADD (aa1, bb2, cc03, cc03) LDF [BO + 8 * SIZE], b1 FMADD (aa2, bb2, cc04, cc04) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) cmp L, 0 FMADD (aa2, bb3, cc06, cc06) nop FMADD (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) nop FMADD (aa2, bb5, cc10, cc10) nop FMADD (aa1, bb6, cc11, cc11) LDF [BO + 12 * SIZE], b5 FMADD (aa2, bb6, cc12, cc12) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb7, cc13, cc13) add AO, 2 * SIZE, AO FMADD (aa2, bb7, cc14, cc14) add BO, 8 * SIZE, BO FMADD (aa1, bb8, cc15, cc15) LDF [AO + 0 * SIZE], a1 FMADD (aa2, bb8, cc16, cc16) LDF [AO + 1 * SIZE], a2 LDF [BO + 6 * SIZE], b7 bg,pt %icc, .LL17 
LDF [BO + 7 * SIZE], b8 nop .align 4 .LL18: #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 8, TEMP1 #endif sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 3, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 FSUB b1, c09, c09 FSUB b2, c11, c11 FSUB b3, c13, c13 FSUB b4, c15, c15 LDF [BO + 8 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 10 * SIZE], a3 LDF [BO + 11 * SIZE], a4 LDF [BO + 12 * SIZE], b1 LDF [BO + 13 * SIZE], b2 LDF [BO + 14 * SIZE], b3 LDF [BO + 15 * SIZE], b4 FSUB a1, c02, c02 FSUB a2, c04, c04 FSUB a3, c06, c06 FSUB a4, c08, c08 FSUB b1, c10, c10 FSUB b2, c12, c12 FSUB b3, c14, c14 FSUB b4, c16, c16 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c05, c05 FSUB b2, c06, c06 FSUB b3, c07, c07 FSUB b4, c08, c08 LDF [AO + 8 * SIZE], a1 LDF [AO + 9 * SIZE], a2 LDF [AO + 10 * SIZE], a3 LDF [AO + 11 * SIZE], a4 LDF [AO + 12 * SIZE], b1 LDF [AO + 13 * SIZE], b2 LDF [AO + 14 * SIZE], b3 LDF [AO + 15 * SIZE], b4 FSUB a1, c09, c09 FSUB a2, c10, c10 FSUB a3, c11, c11 FSUB a4, c12, c12 FSUB b1, c13, c13 FSUB b2, c14, c14 FSUB b3, c15, c15 FSUB b4, c16, c16 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c04, c04 FMUL a1, c06, c06 FMUL a1, c08, c08 FMUL a1, c10, c10 FMUL a1, c12, c12 FMUL a1, c14, c14 FMUL a1, c16, c16 FNMSUB (aa2, cc02, cc01, cc01) FNMSUB (aa2, cc04, cc03, cc03) FNMSUB (aa2, cc06, cc05, cc05) FNMSUB (aa2, cc08, cc07, cc07) FNMSUB (aa2, cc10, cc09, cc09) FNMSUB (aa2, cc12, cc11, cc11) FNMSUB (aa2, cc14, cc13, cc13) FNMSUB (aa2, cc16, cc15, cc15) FMUL a3, c01, c01 FMUL a3, c03, c03 FMUL a3, c05, c05 FMUL a3, c07, c07 FMUL a3, c09, c09 FMUL a3, c11, c11 FMUL a3, c13, c13 FMUL a3, c15, c15 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c03, c03 FMUL a1, c05, c05 FMUL a1, c07, c07 FMUL a1, c09, c09 FMUL a1, c11, c11 FMUL a1, c13, c13 FMUL a1, c15, c15 FNMSUB (aa2, cc01, cc02, cc02) FNMSUB (aa2, cc03, cc04, cc04) FNMSUB (aa2, cc05, cc06, cc06) FNMSUB (aa2, cc07, cc08, cc08) FNMSUB (aa2, cc09, cc10, cc10) FNMSUB (aa2, cc11, cc12, cc12) FNMSUB (aa2, cc13, cc14, cc14) FNMSUB (aa2, cc15, cc16, cc16) FMUL a3, c02, c02 FMUL a3, c04, c04 FMUL a3, c06, c06 FMUL a3, c08, c08 FMUL a3, c10, c10 FMUL a3, c12, c12 FMUL a3, c14, c14 FMUL a3, c16, c16 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FMUL a1, c01, c01 FMUL a1, c02, c02 FNMSUB (aa2, cc01, cc03, cc03) FNMSUB (aa2, cc02, cc04, cc04) FNMSUB (aa3, cc01, cc05, cc05) FNMSUB (aa3, cc02, cc06, cc06) FNMSUB (aa4, cc01, cc07, cc07) FNMSUB (aa4, cc02, cc08, cc08) FNMSUB (bb1, cc01, cc09, cc09) FNMSUB (bb1, cc02, cc10, cc10) FNMSUB (bb2, cc01, cc11, cc11) FNMSUB (bb2, cc02, cc12, cc12) FNMSUB (bb3, cc01, cc13, cc13) FNMSUB (bb3, cc02, cc14, cc14) FNMSUB (bb4, cc01, cc15, cc15) FNMSUB (bb4, cc02, 
cc16, cc16) LDF [BO + 9 * SIZE], a1 LDF [BO + 10 * SIZE], a2 LDF [BO + 11 * SIZE], a3 LDF [BO + 12 * SIZE], a4 LDF [BO + 13 * SIZE], b1 LDF [BO + 14 * SIZE], b2 LDF [BO + 15 * SIZE], b3 FMUL a1, c03, c03 FMUL a1, c04, c04 FNMSUB (aa2, cc03, cc05, cc05) FNMSUB (aa2, cc04, cc06, cc06) FNMSUB (aa3, cc03, cc07, cc07) FNMSUB (aa3, cc04, cc08, cc08) FNMSUB (aa4, cc03, cc09, cc09) FNMSUB (aa4, cc04, cc10, cc10) FNMSUB (bb1, cc03, cc11, cc11) FNMSUB (bb1, cc04, cc12, cc12) FNMSUB (bb2, cc03, cc13, cc13) FNMSUB (bb2, cc04, cc14, cc14) FNMSUB (bb3, cc03, cc15, cc15) FNMSUB (bb3, cc04, cc16, cc16) LDF [BO + 18 * SIZE], a1 LDF [BO + 19 * SIZE], a2 LDF [BO + 20 * SIZE], a3 LDF [BO + 21 * SIZE], a4 LDF [BO + 22 * SIZE], b1 LDF [BO + 23 * SIZE], b2 FMUL a1, c05, c05 FMUL a1, c06, c06 FNMSUB (aa2, cc05, cc07, cc07) FNMSUB (aa2, cc06, cc08, cc08) FNMSUB (aa3, cc05, cc09, cc09) FNMSUB (aa3, cc06, cc10, cc10) FNMSUB (aa4, cc05, cc11, cc11) FNMSUB (aa4, cc06, cc12, cc12) FNMSUB (bb1, cc05, cc13, cc13) FNMSUB (bb1, cc06, cc14, cc14) FNMSUB (bb2, cc05, cc15, cc15) FNMSUB (bb2, cc06, cc16, cc16) LDF [BO + 27 * SIZE], a1 LDF [BO + 28 * SIZE], a2 LDF [BO + 29 * SIZE], a3 LDF [BO + 30 * SIZE], a4 LDF [BO + 31 * SIZE], b1 FMUL a1, c07, c07 FMUL a1, c08, c08 FNMSUB (aa2, cc07, cc09, cc09) FNMSUB (aa2, cc08, cc10, cc10) FNMSUB (aa3, cc07, cc11, cc11) FNMSUB (aa3, cc08, cc12, cc12) FNMSUB (aa4, cc07, cc13, cc13) FNMSUB (aa4, cc08, cc14, cc14) FNMSUB (bb1, cc07, cc15, cc15) FNMSUB (bb1, cc08, cc16, cc16) LDF [BO + 36 * SIZE], a1 LDF [BO + 37 * SIZE], a2 LDF [BO + 38 * SIZE], a3 LDF [BO + 39 * SIZE], a4 FMUL a1, c09, c09 FMUL a1, c10, c10 FNMSUB (aa2, cc09, cc11, cc11) FNMSUB (aa2, cc10, cc12, cc12) FNMSUB (aa3, cc09, cc13, cc13) FNMSUB (aa3, cc10, cc14, cc14) FNMSUB (aa4, cc09, cc15, cc15) FNMSUB (aa4, cc10, cc16, cc16) LDF [BO + 45 * SIZE], a1 LDF [BO + 46 * SIZE], a2 LDF [BO + 47 * SIZE], a3 FMUL a1, c11, c11 FMUL a1, c12, c12 FNMSUB (aa2, cc11, cc13, cc13) FNMSUB (aa2, cc12, cc14, cc14) FNMSUB (aa3, cc11, cc15, cc15) FNMSUB (aa3, cc12, cc16, cc16) LDF [BO + 54 * SIZE], a1 LDF [BO + 55 * SIZE], a2 FMUL a1, c13, c13 FMUL a1, c14, c14 FNMSUB (aa2, cc13, cc15, cc15) FNMSUB (aa2, cc14, cc16, cc16) LDF [BO + 63 * SIZE], a1 FMUL a1, c15, c15 FMUL a1, c16, c16 #endif #ifdef RT LDF [BO + 63 * SIZE], a1 LDF [BO + 62 * SIZE], a2 LDF [BO + 61 * SIZE], a3 LDF [BO + 60 * SIZE], a4 LDF [BO + 59 * SIZE], b1 LDF [BO + 58 * SIZE], b2 LDF [BO + 57 * SIZE], b3 LDF [BO + 56 * SIZE], b4 FMUL a1, c16, c16 FMUL a1, c15, c15 FNMSUB (aa2, cc16, cc14, cc14) FNMSUB (aa2, cc15, cc13, cc13) FNMSUB (aa3, cc16, cc12, cc12) FNMSUB (aa3, cc15, cc11, cc11) FNMSUB (aa4, cc16, cc10, cc10) FNMSUB (aa4, cc15, cc09, cc09) FNMSUB (bb1, cc16, cc08, cc08) FNMSUB (bb1, cc15, cc07, cc07) FNMSUB (bb2, cc16, cc06, cc06) FNMSUB (bb2, cc15, cc05, cc05) FNMSUB (bb3, cc16, cc04, cc04) FNMSUB (bb3, cc15, cc03, cc03) FNMSUB (bb4, cc16, cc02, cc02) FNMSUB (bb4, cc15, cc01, cc01) LDF [BO + 54 * SIZE], a1 LDF [BO + 53 * SIZE], a2 LDF [BO + 52 * SIZE], a3 LDF [BO + 51 * SIZE], a4 LDF [BO + 50 * SIZE], b1 LDF [BO + 49 * SIZE], b2 LDF [BO + 48 * SIZE], b3 FMUL a1, c14, c14 FMUL a1, c13, c13 FNMSUB (aa2, cc14, cc12, cc12) FNMSUB (aa2, cc13, cc11, cc11) FNMSUB (aa3, cc14, cc10, cc10) FNMSUB (aa3, cc13, cc09, cc09) FNMSUB (aa4, cc14, cc08, cc08) FNMSUB (aa4, cc13, cc07, cc07) FNMSUB (bb1, cc14, cc06, cc06) FNMSUB (bb1, cc13, cc05, cc05) FNMSUB (bb2, cc14, cc04, cc04) FNMSUB (bb2, cc13, cc03, cc03) FNMSUB (bb3, cc14, cc02, cc02) FNMSUB (bb3, cc13, cc01, cc01) LDF [BO + 45 * 
SIZE], a1 LDF [BO + 44 * SIZE], a2 LDF [BO + 43 * SIZE], a3 LDF [BO + 42 * SIZE], a4 LDF [BO + 41 * SIZE], b1 LDF [BO + 40 * SIZE], b2 FMUL a1, c12, c12 FMUL a1, c11, c11 FNMSUB (aa2, cc12, cc10, cc10) FNMSUB (aa2, cc11, cc09, cc09) FNMSUB (aa3, cc12, cc08, cc08) FNMSUB (aa3, cc11, cc07, cc07) FNMSUB (aa4, cc12, cc06, cc06) FNMSUB (aa4, cc11, cc05, cc05) FNMSUB (bb1, cc12, cc04, cc04) FNMSUB (bb1, cc11, cc03, cc03) FNMSUB (bb2, cc12, cc02, cc02) FNMSUB (bb2, cc11, cc01, cc01) LDF [BO + 36 * SIZE], a1 LDF [BO + 35 * SIZE], a2 LDF [BO + 34 * SIZE], a3 LDF [BO + 33 * SIZE], a4 LDF [BO + 32 * SIZE], b1 FMUL a1, c10, c10 FMUL a1, c09, c09 FNMSUB (aa2, cc10, cc08, cc08) FNMSUB (aa2, cc09, cc07, cc07) FNMSUB (aa3, cc10, cc06, cc06) FNMSUB (aa3, cc09, cc05, cc05) FNMSUB (aa4, cc10, cc04, cc04) FNMSUB (aa4, cc09, cc03, cc03) FNMSUB (bb1, cc10, cc02, cc02) FNMSUB (bb1, cc09, cc01, cc01) LDF [BO + 27 * SIZE], a1 LDF [BO + 26 * SIZE], a2 LDF [BO + 25 * SIZE], a3 LDF [BO + 24 * SIZE], a4 FMUL a1, c08, c08 FMUL a1, c07, c07 FNMSUB (aa2, cc08, cc06, cc06) FNMSUB (aa2, cc07, cc05, cc05) FNMSUB (aa3, cc08, cc04, cc04) FNMSUB (aa3, cc07, cc03, cc03) FNMSUB (aa4, cc08, cc02, cc02) FNMSUB (aa4, cc07, cc01, cc01) LDF [BO + 18 * SIZE], a1 LDF [BO + 17 * SIZE], a2 LDF [BO + 16 * SIZE], a3 FMUL a1, c06, c06 FMUL a1, c05, c05 FNMSUB (aa2, cc06, cc04, cc04) FNMSUB (aa2, cc05, cc03, cc03) FNMSUB (aa3, cc06, cc02, cc02) FNMSUB (aa3, cc05, cc01, cc01) LDF [BO + 9 * SIZE], a1 LDF [BO + 8 * SIZE], a2 FMUL a1, c04, c04 FMUL a1, c03, c03 FNMSUB (aa2, cc04, cc02, cc02) FNMSUB (aa2, cc03, cc01, cc01) LDF [BO + 0 * SIZE], a1 FMUL a1, c02, c02 FMUL a1, c01, c01 #endif #ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 add C3, -2 * SIZE, C3 add C4, -2 * SIZE, C4 add C5, -2 * SIZE, C5 add C6, -2 * SIZE, C6 add C7, -2 * SIZE, C7 add C8, -2 * SIZE, C8 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c07, [BO + 3 * SIZE] STF c09, [BO + 4 * SIZE] STF c11, [BO + 5 * SIZE] STF c13, [BO + 6 * SIZE] STF c15, [BO + 7 * SIZE] STF c02, [BO + 8 * SIZE] STF c04, [BO + 9 * SIZE] STF c06, [BO + 10 * SIZE] STF c08, [BO + 11 * SIZE] STF c10, [BO + 12 * SIZE] STF c12, [BO + 13 * SIZE] STF c14, [BO + 14 * SIZE] STF c16, [BO + 15 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] STF c05, [AO + 4 * SIZE] STF c06, [AO + 5 * SIZE] STF c07, [AO + 6 * SIZE] STF c08, [AO + 7 * SIZE] STF c09, [AO + 8 * SIZE] STF c10, [AO + 9 * SIZE] STF c11, [AO + 10 * SIZE] STF c12, [AO + 11 * SIZE] STF c13, [AO + 12 * SIZE] STF c14, [AO + 13 * SIZE] STF c15, [AO + 14 * SIZE] STF c16, [AO + 15 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C2 + 0 * SIZE] STF c04, [C2 + 1 * SIZE] STF c05, [C3 + 0 * SIZE] STF c06, [C3 + 1 * SIZE] STF c07, [C4 + 0 * SIZE] STF c08, [C4 + 1 * SIZE] STF c09, [C5 + 0 * SIZE] STF c10, [C5 + 1 * SIZE] STF c11, [C6 + 0 * SIZE] STF c12, [C6 + 1 * SIZE] STF c13, [C7 + 0 * SIZE] STF c14, [C7 + 1 * SIZE] STF c15, [C8 + 0 * SIZE] STF c16, [C8 + 1 * SIZE] #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 add C3, 2 * SIZE, C3 add C4, 2 * SIZE, C4 add C5, 2 * SIZE, C5 add C6, 2 * SIZE, C6 add C7, 2 * SIZE, C7 add C8, 2 * SIZE, C8 #endif #ifdef RT sll K, BASE_SHIFT + 1, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 3, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif 
#ifdef LN sub KK, 2, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL12 nop .align 4 .LL20: and M, 1, I cmp I, 0 ble,pn %icc, .LL29 nop #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 0, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 0, TEMP1 sll KK, BASE_SHIFT + 3, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 FCLR (cc01) LDF [BO + 1 * SIZE], b2 FCLR (cc03) LDF [BO + 2 * SIZE], b3 FCLR (cc05) LDF [BO + 3 * SIZE], b4 FCLR (cc07) LDF [BO + 4 * SIZE], b5 FCLR (cc09) LDF [BO + 5 * SIZE], b6 FCLR (cc11) LDF [BO + 6 * SIZE], b7 FCLR (cc13) LDF [BO + 7 * SIZE], b8 FCLR (cc15) #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL25 LDF [BO + 8 * SIZE], b9 .align 4 .LL23: prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY add L, -1, L FMADD (aa1, bb1, cc01, cc01) LDF [BO + 16 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) LDF [BO + 10 * SIZE], b3 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [BO + 12 * SIZE], b5 FMADD (aa1, bb6, cc11, cc11) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb7, cc13, cc13) LDF [BO + 14 * SIZE], b7 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 15 * SIZE], b8 FMADD (aa2, bb9, cc01, cc01) LDF [BO + 24 * SIZE], b9 FMADD (aa2, bb2, cc03, cc03) LDF [BO + 17 * SIZE], b2 FMADD (aa2, bb3, cc05, cc05) LDF [BO + 18 * SIZE], b3 FMADD (aa2, bb4, cc07, cc07) LDF [BO + 19 * SIZE], b4 FMADD (aa2, bb5, cc09, cc09) LDF [BO + 20 * SIZE], b5 FMADD (aa2, bb6, cc11, cc11) LDF [BO + 21 * SIZE], b6 FMADD (aa2, bb7, cc13, cc13) LDF [BO + 22 * SIZE], b7 FMADD (aa2, bb8, cc15, cc15) LDF [BO + 23 * SIZE], b8 LDF [AO + 4 * SIZE], a1 LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb1, cc01, cc01) LDF [BO + 32 * SIZE], b1 FMADD (aa3, bb2, cc03, cc03) LDF [BO + 25 * SIZE], b2 FMADD (aa3, bb3, cc05, cc05) LDF [BO + 26 * SIZE], b3 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 27 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [BO + 28 * SIZE], b5 FMADD (aa3, bb6, cc11, cc11) LDF [BO + 29 * SIZE], b6 FMADD (aa3, bb7, cc13, cc13) LDF [BO + 30 * SIZE], b7 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 31 * SIZE], b8 FMADD (aa4, bb9, cc01, cc01) LDF [BO + 40 * SIZE], b9 FMADD (aa4, bb2, cc03, cc03) LDF [BO + 33 * SIZE], b2 FMADD (aa4, bb3, cc05, cc05) LDF [BO + 34 * SIZE], b3 FMADD (aa4, bb4, cc07, cc07) LDF [BO + 35 * SIZE], b4 FMADD (aa4, bb5, cc09, cc09) LDF [BO + 36 * SIZE], b5 FMADD (aa4, bb6, cc11, cc11) LDF [BO + 37 * SIZE], b6 FMADD (aa4, bb7, cc13, cc13) LDF [BO + 38 * SIZE], b7 FMADD (aa4, bb8, cc15, cc15) LDF [BO + 39 * SIZE], b8 LDF [AO + 6 * SIZE], a3 LDF [AO + 7 * SIZE], a4 add AO, 4 * SIZE, AO cmp L, 0 bg,pt %icc, .LL23 add BO, 32 * SIZE, BO .align 4 .LL25: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL28 nop .align 4 .LL27: FMADD (aa1, bb1, cc01, cc01) LDF [BO + 8 * SIZE], b1 FMADD (aa1, bb2, cc03, cc03) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) LDF [BO + 10 * SIZE], b3 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [BO + 12 * SIZE], b5 FMADD (aa1, bb6, cc11, cc11) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb7, cc13, cc13) LDF [BO + 14 * SIZE], b7 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 15 * SIZE], b8 LDF [AO + 1 * SIZE], a1 add AO, 1 * SIZE, AO add L, -1, L cmp L, 0 bg,pt %icc, .LL27 add BO, 8 * SIZE, BO .align 4 
.LL28: #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 8, TEMP1 #endif sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 3, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 FSUB b1, c09, c09 FSUB b2, c11, c11 FSUB b3, c13, c13 FSUB b4, c15, c15 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 FSUB b1, c09, c09 FSUB b2, c11, c11 FSUB b3, c13, c13 FSUB b4, c15, c15 #endif #if defined(LN) || defined(LT) LDF [AO + 0 * SIZE], a1 FMUL a1, c01, c01 FMUL a1, c03, c03 FMUL a1, c05, c05 FMUL a1, c07, c07 FMUL a1, c09, c09 FMUL a1, c11, c11 FMUL a1, c13, c13 FMUL a1, c15, c15 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FMUL a1, c01, c01 FNMSUB (aa2, cc01, cc03, cc03) FNMSUB (aa3, cc01, cc05, cc05) FNMSUB (aa4, cc01, cc07, cc07) FNMSUB (bb1, cc01, cc09, cc09) FNMSUB (bb2, cc01, cc11, cc11) FNMSUB (bb3, cc01, cc13, cc13) FNMSUB (bb4, cc01, cc15, cc15) LDF [BO + 9 * SIZE], a1 LDF [BO + 10 * SIZE], a2 LDF [BO + 11 * SIZE], a3 LDF [BO + 12 * SIZE], a4 LDF [BO + 13 * SIZE], b1 LDF [BO + 14 * SIZE], b2 LDF [BO + 15 * SIZE], b3 FMUL a1, c03, c03 FNMSUB (aa2, cc03, cc05, cc05) FNMSUB (aa3, cc03, cc07, cc07) FNMSUB (aa4, cc03, cc09, cc09) FNMSUB (bb1, cc03, cc11, cc11) FNMSUB (bb2, cc03, cc13, cc13) FNMSUB (bb3, cc03, cc15, cc15) LDF [BO + 18 * SIZE], a1 LDF [BO + 19 * SIZE], a2 LDF [BO + 20 * SIZE], a3 LDF [BO + 21 * SIZE], a4 LDF [BO + 22 * SIZE], b1 LDF [BO + 23 * SIZE], b2 FMUL a1, c05, c05 FNMSUB (aa2, cc05, cc07, cc07) FNMSUB (aa3, cc05, cc09, cc09) FNMSUB (aa4, cc05, cc11, cc11) FNMSUB (bb1, cc05, cc13, cc13) FNMSUB (bb2, cc05, cc15, cc15) LDF [BO + 27 * SIZE], a1 LDF [BO + 28 * SIZE], a2 LDF [BO + 29 * SIZE], a3 LDF [BO + 30 * SIZE], a4 LDF [BO + 31 * SIZE], b1 FMUL a1, c07, c07 FNMSUB (aa2, cc07, cc09, cc09) FNMSUB (aa3, cc07, cc11, cc11) FNMSUB (aa4, cc07, cc13, cc13) FNMSUB (bb1, cc07, cc15, cc15) LDF [BO + 36 * SIZE], a1 LDF [BO + 37 * SIZE], a2 LDF [BO + 38 * SIZE], a3 LDF [BO + 39 * SIZE], a4 FMUL a1, c09, c09 FNMSUB (aa2, cc09, cc11, cc11) FNMSUB (aa3, cc09, cc13, cc13) FNMSUB (aa4, cc09, cc15, cc15) LDF [BO + 45 * SIZE], a1 LDF [BO + 46 * SIZE], a2 LDF [BO + 47 * SIZE], a3 FMUL a1, c11, c11 FNMSUB (aa2, cc11, cc13, cc13) FNMSUB (aa3, cc11, cc15, cc15) LDF [BO + 54 * SIZE], a1 LDF [BO + 55 * SIZE], a2 FMUL a1, c13, c13 FNMSUB (aa2, cc13, cc15, cc15) LDF [BO + 63 * SIZE], a1 FMUL a1, c15, c15 #endif #ifdef RT LDF [BO + 63 * SIZE], a1 LDF [BO + 62 * SIZE], a2 LDF [BO + 61 * SIZE], a3 LDF [BO + 60 * SIZE], a4 LDF [BO + 59 * SIZE], b1 LDF [BO + 58 * SIZE], b2 LDF [BO + 57 * SIZE], b3 LDF [BO + 56 * SIZE], b4 FMUL a1, c15, c15 FNMSUB (aa2, cc15, cc13, cc13) FNMSUB (aa3, cc15, cc11, cc11) FNMSUB (aa4, cc15, cc09, cc09) FNMSUB (bb1, cc15, cc07, cc07) FNMSUB (bb2, cc15, cc05, cc05) FNMSUB (bb3, cc15, cc03, cc03) FNMSUB (bb4, cc15, cc01, cc01) LDF [BO + 54 * SIZE], a1 LDF [BO + 53 * SIZE], a2 LDF [BO + 
52 * SIZE], a3 LDF [BO + 51 * SIZE], a4 LDF [BO + 50 * SIZE], b1 LDF [BO + 49 * SIZE], b2 LDF [BO + 48 * SIZE], b3 FMUL a1, c13, c13 FNMSUB (aa2, cc13, cc11, cc11) FNMSUB (aa3, cc13, cc09, cc09) FNMSUB (aa4, cc13, cc07, cc07) FNMSUB (bb1, cc13, cc05, cc05) FNMSUB (bb2, cc13, cc03, cc03) FNMSUB (bb3, cc13, cc01, cc01) LDF [BO + 45 * SIZE], a1 LDF [BO + 44 * SIZE], a2 LDF [BO + 43 * SIZE], a3 LDF [BO + 42 * SIZE], a4 LDF [BO + 41 * SIZE], b1 LDF [BO + 40 * SIZE], b2 FMUL a1, c11, c11 FNMSUB (aa2, cc11, cc09, cc09) FNMSUB (aa3, cc11, cc07, cc07) FNMSUB (aa4, cc11, cc05, cc05) FNMSUB (bb1, cc11, cc03, cc03) FNMSUB (bb2, cc11, cc01, cc01) LDF [BO + 36 * SIZE], a1 LDF [BO + 35 * SIZE], a2 LDF [BO + 34 * SIZE], a3 LDF [BO + 33 * SIZE], a4 LDF [BO + 32 * SIZE], b1 FMUL a1, c09, c09 FNMSUB (aa2, cc09, cc07, cc07) FNMSUB (aa3, cc09, cc05, cc05) FNMSUB (aa4, cc09, cc03, cc03) FNMSUB (bb1, cc09, cc01, cc01) LDF [BO + 27 * SIZE], a1 LDF [BO + 26 * SIZE], a2 LDF [BO + 25 * SIZE], a3 LDF [BO + 24 * SIZE], a4 FMUL a1, c07, c07 FNMSUB (aa2, cc07, cc05, cc05) FNMSUB (aa3, cc07, cc03, cc03) FNMSUB (aa4, cc07, cc01, cc01) LDF [BO + 18 * SIZE], a1 LDF [BO + 17 * SIZE], a2 LDF [BO + 16 * SIZE], a3 FMUL a1, c05, c05 FNMSUB (aa2, cc05, cc03, cc03) FNMSUB (aa3, cc05, cc01, cc01) LDF [BO + 9 * SIZE], a1 LDF [BO + 8 * SIZE], a2 FMUL a1, c03, c03 FNMSUB (aa2, cc03, cc01, cc01) LDF [BO + 0 * SIZE], a1 FMUL a1, c01, c01 #endif #ifdef LN add C1, -1 * SIZE, C1 add C2, -1 * SIZE, C2 add C3, -1 * SIZE, C3 add C4, -1 * SIZE, C4 add C5, -1 * SIZE, C5 add C6, -1 * SIZE, C6 add C7, -1 * SIZE, C7 add C8, -1 * SIZE, C8 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c07, [BO + 3 * SIZE] STF c09, [BO + 4 * SIZE] STF c11, [BO + 5 * SIZE] STF c13, [BO + 6 * SIZE] STF c15, [BO + 7 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c03, [AO + 1 * SIZE] STF c05, [AO + 2 * SIZE] STF c07, [AO + 3 * SIZE] STF c09, [AO + 4 * SIZE] STF c11, [AO + 5 * SIZE] STF c13, [AO + 6 * SIZE] STF c15, [AO + 7 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c03, [C2 + 0 * SIZE] STF c05, [C3 + 0 * SIZE] STF c07, [C4 + 0 * SIZE] STF c09, [C5 + 0 * SIZE] STF c11, [C6 + 0 * SIZE] STF c13, [C7 + 0 * SIZE] STF c15, [C8 + 0 * SIZE] #ifdef RT sll K, BASE_SHIFT + 0, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, BASE_SHIFT + 0, TEMP2 sll TEMP1, BASE_SHIFT + 3, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .align 4 .LL29: #ifdef LN sll K, BASE_SHIFT + 3, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 8, KK #endif #ifdef RT sub KK, 8, KK #endif add J, -1, J cmp J, 0 bg,pt %icc, .LL11 nop .align 4 .LL999: #ifdef TRMMKERNEL #ifndef __64BIT__ ld [%sp + STACK_START + 8], %g1 ld [%sp + STACK_START + 12], %g2 ld [%sp + STACK_START + 16], %g3 ld [%sp + STACK_START + 20], %g4 #else ldx [%sp + STACK_START + 32], %g1 ldx [%sp + STACK_START + 40], %g2 ldx [%sp + STACK_START + 48], %g3 ldx [%sp + STACK_START + 56], %g4 #endif #endif return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/zamax.S000066400000000000000000000163341313527062700170070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
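The inner loop at .LL13 above is the hot path of this kernel (the accumulate-then-back-substitute structure under the LN/LT/RN/RT conditionals suggests one of the SPARC triangular-solve kernels, though the file name is not visible in this fragment). It keeps a 2-row by 8-column tile in the sixteen accumulators cc01..cc16 and issues one fused multiply-add per accumulator per k step, consuming 2 packed A values and 8 packed B values per step (hence the +16*SIZE / +64*SIZE pointer bumps per 8-step unroll). A minimal C sketch of just that accumulation, assuming the DOUBLE build; the function name and array layout are illustrative, not part of the source:

#include <stddef.h>

/* Reference arithmetic for the 2x8 accumulation loop (.LL13).
   a: packed 2 values per k step, b: packed 8 values per k step,
   c: sixteen scalar accumulators, as in cc01..cc16.           */
static void acc_2x8(size_t k, const double *a, const double *b,
                    double c[2][8])
{
    for (size_t l = 0; l < k; l++)
        for (int j = 0; j < 8; j++) {
            c[0][j] += a[2 * l + 0] * b[8 * l + j];
            c[1][j] += a[2 * l + 1] * b[8 * l + j];
        }
}

The #ifdef LN/LT/RN/RT blocks that follow the loop then substitute the accumulated tile against the packed triangular factor with FMUL/FNMSUB and store the result both to the packed buffer and to C1..C8.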
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N %i0 #define X %i1 #define INCX %i2 #define I %i3 #ifdef DOUBLE #define c1 %f0 #define c2 %f2 #define c3 %f4 #define c4 %f6 #define t1 %f8 #define t2 %f10 #define t3 %f12 #define t4 %f14 #define t5 %f16 #define t6 %f18 #define t7 %f20 #define t8 %f22 #define a1 %f24 #define a2 %f26 #define a3 %f28 #define a4 %f30 #define a5 %f32 #define a6 %f34 #define a7 %f36 #define a8 %f38 #else #define c1 %f0 #define c2 %f1 #define c3 %f2 #define c4 %f3 #define t1 %f4 #define t2 %f5 #define t3 %f6 #define t4 %f7 #define t5 %f8 #define t6 %f9 #define t7 %f10 #define t8 %f11 #define a1 %f12 #define a2 %f13 #define a3 %f14 #define a4 %f15 #define a5 %f16 #define a6 %f17 #define a7 %f18 #define a8 %f19 #endif #ifndef USE_MIN #define FCMOV FMOVG #else #define FCMOV FMOVL #endif PROLOGUE SAVESP FCLR(0) cmp N, 0 ble .LL20 nop cmp INCX, 0 ble .LL20 sll INCX, ZBASE_SHIFT, INCX LDF [X + 0 * SIZE], c1 LDF [X + 1 * SIZE], c2 add N, -1, N FABS c1, c1 add X, INCX, X FABS c2, c2 cmp N, 0 ble .LL20 FADD c1, c2, c1 FMOV c1, c2 FMOV c1, c3 FMOV c1, c4 cmp INCX, 2 * SIZE bne .LL50 nop sra N, 2, I cmp I, 0 ble,pn %icc, .LL15 nop LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 LDF [X + 2 * SIZE], a3 LDF [X + 3 * SIZE], a4 LDF [X + 4 * SIZE], a5 add I, -1, I LDF [X + 5 * SIZE], a6 cmp I, 0 LDF [X + 6 * SIZE], a7 LDF [X + 7 * SIZE], a8 ble,pt %icc, .LL12 add X, 8 * SIZE, X #define PREFETCHSIZE 40 .LL11: prefetch [X + PREFETCHSIZE * SIZE], 0 FABS a1, t1 LDF [X + 0 * SIZE], a1 FABS a2, t2 LDF [X + 1 * SIZE], a2 FABS a3, t3 LDF [X + 2 * SIZE], a3 FABS a4, t4 LDF [X + 3 * SIZE], a4 FABS a5, t5 LDF [X + 4 * SIZE], a5 FABS a6, t6 LDF [X + 5 * SIZE], a6 FABS a7, t7 LDF [X + 6 * SIZE], a7 FABS a8, t8 LDF [X + 7 * SIZE], a8 FADD t1, t2, t1 FADD t3, t4, t3 FADD t5, t6, t5 FADD t7, 
t8, t7 FCMP %fcc0, t1, c1 FCMP %fcc1, t3, c2 FCMP %fcc2, t5, c3 FCMP %fcc3, t7, c4 FCMOV %fcc0, t1, c1 add I, -1, I FCMOV %fcc1, t3, c2 cmp I, 0 FCMOV %fcc2, t5, c3 FCMOV %fcc3, t7, c4 bg,pt %icc, .LL11 add X, 8 * SIZE, X .LL12: FABS a1, t1 FABS a2, t2 FABS a3, t3 FABS a4, t4 FABS a5, t5 FABS a6, t6 FABS a7, t7 FABS a8, t8 FADD t1, t2, t1 FADD t3, t4, t3 FADD t5, t6, t5 FADD t7, t8, t7 FCMP %fcc0, t1, c1 FCMP %fcc1, t3, c2 FCMP %fcc2, t5, c3 FCMP %fcc3, t7, c4 FCMOV %fcc0, t1, c1 FCMOV %fcc1, t3, c2 FCMOV %fcc2, t5, c3 FCMOV %fcc3, t7, c4 .LL15: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 FABS a1, t1 FABS a2, t2 FADD t1, t2, t1 FCMP %fcc0, t1, c1 FCMOV %fcc0, t1, c1 add I, -1, I cmp I, 0 bg,pt %icc, .LL16 add X, 2 * SIZE, X .LL19: FCMP %fcc0, c2, c1 FCMP %fcc1, c4, c3 FCMOV %fcc0, c2, c1 FCMOV %fcc1, c4, c3 FCMP %fcc0, c3, c1 FCMOV %fcc0, c3, c1 .LL20: return %i7 + 8 clr %g0 .LL50: sra N, 2, I cmp I, 0 ble,pn %icc, .LL55 nop LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 add X, INCX, X LDF [X + 0 * SIZE], a3 LDF [X + 1 * SIZE], a4 add X, INCX, X LDF [X + 0 * SIZE], a5 LDF [X + 1 * SIZE], a6 add X, INCX, X add I, -1, I LDF [X + 0 * SIZE], a7 cmp I, 0 LDF [X + 1 * SIZE], a8 ble,pt %icc, .LL52 add X, INCX, X .LL51: FABS a1, t1 LDF [X + 0 * SIZE], a1 FABS a2, t2 LDF [X + 1 * SIZE], a2 add X, INCX, X FABS a3, t3 LDF [X + 0 * SIZE], a3 FABS a4, t4 LDF [X + 1 * SIZE], a4 add X, INCX, X FABS a5, t5 LDF [X + 0 * SIZE], a5 FABS a6, t6 LDF [X + 1 * SIZE], a6 add X, INCX, X FABS a7, t7 LDF [X + 0 * SIZE], a7 FABS a8, t8 LDF [X + 1 * SIZE], a8 FADD t1, t2, t1 FADD t3, t4, t3 FADD t5, t6, t5 FADD t7, t8, t7 FCMP %fcc0, t1, c1 FCMP %fcc1, t3, c2 FCMP %fcc2, t5, c3 FCMP %fcc3, t7, c4 FCMOV %fcc0, t1, c1 add I, -1, I FCMOV %fcc1, t3, c2 cmp I, 0 FCMOV %fcc2, t5, c3 FCMOV %fcc3, t7, c4 bg,pt %icc, .LL51 add X, INCX, X .LL52: FABS a1, t1 FABS a2, t2 FABS a3, t3 FABS a4, t4 FABS a5, t5 FABS a6, t6 FABS a7, t7 FABS a8, t8 FADD t1, t2, t1 FADD t3, t4, t3 FADD t5, t6, t5 FADD t7, t8, t7 FCMP %fcc0, t1, c1 FCMP %fcc1, t3, c2 FCMP %fcc2, t5, c3 FCMP %fcc3, t7, c4 FCMOV %fcc0, t1, c1 FCMOV %fcc1, t3, c2 FCMOV %fcc2, t5, c3 FCMOV %fcc3, t7, c4 .LL55: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL59 nop .LL56: LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 FABS a1, t1 add I, -1, I FABS a2, t2 cmp I, 0 FADD t1, t2, t1 FCMP %fcc0, t1, c1 FCMOV %fcc0, t1, c1 bg,pt %icc, .LL56 add X, INCX, X .LL59: FCMP %fcc0, c2, c1 FCMP %fcc1, c4, c3 FCMOV %fcc0, c2, c1 FCMOV %fcc1, c4, c3 FCMP %fcc0, c3, c1 FCMOV %fcc0, c3, c1 return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/zasum.S000066400000000000000000000145341313527062700170260ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
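zamax.S above returns the largest (or, with USE_MIN, the smallest) value of |Re(x_i)| + |Im(x_i)| over the vector — the "absolute value" the BLAS CAMAX/ZAMAX family uses, not the Euclidean modulus. The unit-stride path handles four complex elements per iteration with four running candidates (c1..c4) that are folded by FCMP/FCMOV at .LL19. A minimal scalar sketch of the same reduction, assuming the DOUBLE build and a positive stride given in complex elements; the function name is illustrative:

#include <math.h>
#include <stddef.h>

/* Reference for the reduction above: max (or min with USE_MIN) of
   |re| + |im| over n complex elements stored as interleaved pairs. */
static double zamax_ref(size_t n, const double *x, ptrdiff_t incx)
{
    if (n == 0 || incx <= 0)
        return 0.0;                    /* kernel returns 0 in these cases */
    double best = fabs(x[0]) + fabs(x[1]);
    for (size_t i = 1; i < n; i++) {
        const double *p = x + 2 * i * incx;
        double v = fabs(p[0]) + fabs(p[1]);
        if (v > best)                  /* FCMOV keeps the smaller value when USE_MIN is set */
            best = v;
    }
    return best;
}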
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N %i0 #define X %i1 #define INCX %i2 #define I %i3 #ifdef DOUBLE #define c1 %f0 #define c2 %f2 #define t1 %f8 #define t2 %f10 #define t3 %f12 #define t4 %f14 #define a1 %f16 #define a2 %f18 #define a3 %f20 #define a4 %f22 #define a5 %f24 #define a6 %f26 #define a7 %f28 #define a8 %f30 #else #define c1 %f0 #define c2 %f1 #define t1 %f4 #define t2 %f5 #define t3 %f6 #define t4 %f7 #define a1 %f8 #define a2 %f9 #define a3 %f10 #define a4 %f11 #define a5 %f12 #define a6 %f13 #define a7 %f14 #define a8 %f15 #endif PROLOGUE SAVESP FCLR(0) sll INCX, ZBASE_SHIFT, INCX FMOV c1, c2 FMOV c1, t1 FMOV c1, t2 FMOV c1, t3 FMOV c1, t4 cmp INCX, 0 ble .LL19 nop cmp INCX, 2 * SIZE bne .LL50 nop sra N, 2, I cmp I, 0 ble,pn %icc, .LL15 nop LDF [X + 0 * SIZE], a1 add I, -1, I LDF [X + 1 * SIZE], a2 cmp I, 0 LDF [X + 2 * SIZE], a3 LDF [X + 3 * SIZE], a4 LDF [X + 4 * SIZE], a5 LDF [X + 5 * SIZE], a6 LDF [X + 6 * SIZE], a7 LDF [X + 7 * SIZE], a8 ble,pt %icc, .LL12 add X, 8 * SIZE, X #define PREFETCHSIZE 32 .LL11: FADD c1, t1, c1 prefetch [X + PREFETCHSIZE * SIZE], 0 FABS a1, t1 LDF [X + 0 * SIZE], a1 FADD c2, t2, c2 add I, -1, I FABS a2, t2 LDF [X + 1 * SIZE], a2 FADD c1, t3, c1 cmp I, 0 FABS a3, t3 LDF [X + 2 * SIZE], a3 FADD c2, t4, c2 nop FABS a4, t4 LDF [X + 3 * SIZE], a4 FADD c1, t1, c1 nop FABS a5, t1 LDF [X + 4 * SIZE], a5 FADD c2, t2, c2 nop FABS a6, t2 LDF [X + 5 * SIZE], a6 FADD c1, t3, c1 FABS a7, t3 LDF [X + 6 * SIZE], a7 add X, 8 * SIZE, X FADD c2, t4, c2 FABS a8, t4 bg,pt %icc, .LL11 LDF [X - 1 * SIZE], a8 .LL12: FADD c1, t1, c1 FABS a1, t1 FADD c2, t2, c2 FABS a2, t2 FADD c1, t3, c1 FABS a3, t3 FADD c2, t4, c2 FABS a4, t4 FADD c1, t1, c1 FABS a5, t1 FADD c2, t2, c2 FABS a6, t2 FADD c1, t3, c1 FABS a7, t3 FADD c2, t4, c2 FABS a8, t4 .LL15: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 add I, -1, I cmp I, 0 FADD c1, t1, c1 FADD c2, t2, c2 FABS a1, t1 FABS a2, t2 bg,pt %icc, .LL16 add X, 2 * SIZE, X .LL19: FADD c1, t1, c1 FADD c2, t2, c2 FADD c1, t3, c1 FADD c2, t4, c2 FADD c1, c2, c1 return %i7 + 8 clr %g0 .LL50: sra N, 2, I cmp I, 0 ble,pn %icc, .LL55 nop LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 add X, INCX, X LDF [X + 0 * SIZE], a3 LDF [X + 1 * SIZE], a4 add X, INCX, X LDF [X + 0 * SIZE], a5 LDF [X + 1 * SIZE], a6 add X, INCX, X 
add I, -1, I LDF [X + 0 * SIZE], a7 cmp I, 0 LDF [X + 1 * SIZE], a8 ble,pt %icc, .LL52 add X, INCX, X .LL51: FADD c1, t1, c1 add I, -1, I FABS a1, t1 LDF [X + 0 * SIZE], a1 FADD c2, t2, c2 cmp I, 0 FABS a2, t2 LDF [X + 1 * SIZE], a2 add X, INCX, X FADD c1, t3, c1 FABS a3, t3 LDF [X + 0 * SIZE], a3 FADD c2, t4, c2 FABS a4, t4 LDF [X + 1 * SIZE], a4 add X, INCX, X FADD c1, t1, c1 FABS a5, t1 LDF [X + 0 * SIZE], a5 FADD c2, t2, c2 FABS a6, t2 LDF [X + 1 * SIZE], a6 add X, INCX, X FADD c1, t3, c1 FABS a7, t3 LDF [X + 0 * SIZE], a7 FADD c2, t4, c2 FABS a8, t4 LDF [X + 1 * SIZE], a8 bg,pt %icc, .LL51 add X, INCX, X .LL52: FADD c1, t1, c1 FABS a1, t1 FADD c2, t2, c2 FABS a2, t2 FADD c1, t3, c1 FABS a3, t3 FADD c2, t4, c2 FABS a4, t4 FADD c1, t1, c1 FABS a5, t1 FADD c2, t2, c2 FABS a6, t2 FADD c1, t3, c1 FABS a7, t3 FADD c2, t4, c2 FABS a8, t4 .LL55: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL59 nop .LL56: LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 FADD c1, t1, c1 FADD c2, t2, c2 add I, -1, I FABS a1, t1 FABS a2, t2 cmp I, 0 bg,pt %icc, .LL56 add X, INCX, X .LL59: FADD c1, t1, c1 FADD c2, t2, c2 FADD c1, t3, c1 FADD c2, t4, c2 FADD c1, c2, c1 return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/zaxpy.S000066400000000000000000000272051313527062700170410ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
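zasum.S computes the BLAS-style 1-norm of a complex vector: the sum of |Re(x_i)| + |Im(x_i)| over all elements. It keeps two interleaved partial sums (c1, c2) plus four temporaries so the FABS/FADD chains overlap, and folds everything into c1 just before returning. A minimal scalar reference, assuming the DOUBLE build and a stride given in complex elements; the function name is illustrative:

#include <math.h>
#include <stddef.h>

/* Reference for the kernel above: sum of |re| + |im|. */
static double zasum_ref(size_t n, const double *x, ptrdiff_t incx)
{
    if (incx <= 0)
        return 0.0;                    /* kernel bails out for non-positive stride */
    double s = 0.0;
    for (size_t i = 0; i < n; i++) {
        const double *p = x + 2 * i * incx;
        s += fabs(p[0]) + fabs(p[1]);
    }
    return s;
}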
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if defined(DOUBLE) && !defined(__64BIT__) #define N %i0 #define X %i1 #define INCX %i2 #define Y %i3 #define INCY %i4 #define I %i5 #else #define N %i0 #define X %i5 #define INCX %i1 #define Y %i2 #define INCY %i3 #define I %i4 #endif #define YY %l1 #ifdef DOUBLE #define a1 %f0 #define a2 %f2 #define a3 %f4 #define a4 %f6 #define a5 %f8 #define a6 %f10 #define a7 %f12 #define a8 %f14 #define b1 %f16 #define b2 %f18 #define b3 %f20 #define b4 %f22 #define b5 %f24 #define b6 %f26 #define b7 %f28 #define b8 %f30 #define t1 %f32 #define t2 %f34 #define t3 %f36 #define t4 %f38 #define c1 %f40 #define c2 %f42 #define c3 %f44 #define c4 %f46 #define c5 %f48 #define c6 %f50 #define c7 %f52 #define c8 %f54 #define ALPHA_R %f60 #define ALPHA_I %f62 #else #define a1 %f0 #define a2 %f1 #define a3 %f2 #define a4 %f3 #define a5 %f4 #define a6 %f5 #define a7 %f6 #define a8 %f7 #define b1 %f8 #define b2 %f9 #define b3 %f10 #define b4 %f11 #define b5 %f12 #define b6 %f13 #define b7 %f14 #define b8 %f15 #define t1 %f16 #define t2 %f17 #define t3 %f18 #define t4 %f19 #define c1 %f20 #define c2 %f21 #define c3 %f22 #define c4 %f23 #define c5 %f24 #define c6 %f25 #define c7 %f26 #define c8 %f27 #define ALPHA_R %f30 #define ALPHA_I %f31 #endif #ifndef CONJ #define ADD1 FSUB #define ADD2 FADD #else #define ADD1 FADD #define ADD2 FSUB #endif PROLOGUE SAVESP #ifndef __64BIT__ #ifdef DOUBLE st %i3, [%sp + STACK_START + 16] st %i4, [%sp + STACK_START + 20] st %i5, [%sp + STACK_START + 24] ld [%sp+ STACK_START + 32], X ld [%sp+ STACK_START + 36], INCX ld [%sp+ STACK_START + 40], Y ld [%sp+ STACK_START + 44], INCY ldd [%sp + STACK_START + 16], ALPHA_R ldd [%sp + STACK_START + 24], ALPHA_I #else st %i3, [%sp + STACK_START + 16] st %i4, [%sp + STACK_START + 20] ld [%sp+ STACK_START + 28], INCX ld [%sp+ STACK_START + 32], Y ld [%sp+ STACK_START + 36], INCY ld [%sp + STACK_START + 16], ALPHA_R ld [%sp + STACK_START + 20], ALPHA_I #endif #else ldx [%sp + STACK_START + 56], INCX ldx [%sp + STACK_START + 64], Y ldx [%sp + STACK_START + 72], INCY #ifdef DOUBLE FMOV %f6, ALPHA_R FMOV %f8, ALPHA_I #else FMOV %f7, ALPHA_R FMOV %f9, ALPHA_I #endif #endif sll INCX, ZBASE_SHIFT, INCX sll INCY, ZBASE_SHIFT, INCY cmp INCX, 2 * SIZE bne .LL50 nop cmp INCY, 2 * SIZE bne .LL50 nop sra N, 2, I cmp I, 0 ble,pn %icc, .LL15 nop LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 LDF [Y + 0 * SIZE], b1 LDF [Y + 1 * SIZE], b2 LDF [X + 2 * SIZE], a3 LDF [X + 3 * SIZE], a4 LDF [Y + 2 * SIZE], b3 LDF [Y + 3 * SIZE], b4 LDF [X + 4 * SIZE], a5 LDF [X + 5 * SIZE], a6 LDF [Y + 4 * SIZE], b5 LDF [Y + 5 * SIZE], b6 LDF [X + 6 * SIZE], a7 LDF [X + 7 * SIZE], a8 LDF [Y + 6 * SIZE], b7 LDF [Y + 7 * SIZE], b8 FMUL ALPHA_R, a1, t1 FMUL ALPHA_R, a2, t2 FMUL ALPHA_R, a3, t3 FMUL ALPHA_R, a4, t4 FADD b1, t1, c1 FMUL ALPHA_I, a2, t1 ADD2 b2, t2, c2 FMUL ALPHA_I, a1, t2 deccc I ble,pt %icc, .LL12 nop #ifdef DOUBLE #define PREFETCHSIZE 54 #else #define PREFETCHSIZE 108 #endif .LL11: FADD b3, t3, c3 prefetch [Y + PREFETCHSIZE * SIZE], 0 FMUL ALPHA_I, a4, t3 prefetch [X + PREFETCHSIZE * SIZE], 0 ADD2 b4, t4, c4 LDF [Y + 8 * SIZE], b1 FMUL ALPHA_I, a3, t4 LDF [X + 9 * SIZE], a2 ADD1 c1, t1, c1 LDF [Y + 9 * SIZE], b2 FMUL ALPHA_R, a5, t1 LDF [X + 8 * SIZE], a1 FADD c2, t2, c2 LDF [Y + 10 * SIZE], b3 FMUL ALPHA_R, a6, t2 LDF [X + 11 * SIZE], a4 ADD1 c3, t3, c3 STF c1, [Y + 0 * SIZE] FMUL ALPHA_R, a7, t3 LDF [Y + 11 * SIZE], b4 FADD c4, t4, c4 STF c2, [Y + 1 * 
SIZE] FMUL ALPHA_R, a8, t4 LDF [X + 10 * SIZE], a3 FADD b5, t1, c5 STF c3, [Y + 2 * SIZE] FMUL ALPHA_I, a6, t1 ADD2 b6, t2, c6 STF c4, [Y + 3 * SIZE] FMUL ALPHA_I, a5, t2 FADD b7, t3, c7 LDF [Y + 12 * SIZE], b5 FMUL ALPHA_I, a8, t3 LDF [X + 13 * SIZE], a6 ADD2 b8, t4, c8 LDF [Y + 13 * SIZE], b6 FMUL ALPHA_I, a7, t4 LDF [X + 12 * SIZE], a5 ADD1 c5, t1, c5 LDF [Y + 14 * SIZE], b7 FMUL ALPHA_R, a1, t1 LDF [X + 15 * SIZE], a8 FADD c6, t2, c6 LDF [Y + 15 * SIZE], b8 FMUL ALPHA_R, a2, t2 LDF [X + 14 * SIZE], a7 ADD1 c7, t3, c7 STF c5, [Y + 4 * SIZE] FMUL ALPHA_R, a3, t3 add X, 8 * SIZE, X FADD c8, t4, c8 STF c6, [Y + 5 * SIZE] FMUL ALPHA_R, a4, t4 deccc I FADD b1, t1, c1 STF c7, [Y + 6 * SIZE] FMUL ALPHA_I, a2, t1 ADD2 b2, t2, c2 STF c8, [Y + 7 * SIZE] FMUL ALPHA_I, a1, t2 bg,pt %icc, .LL11 add Y, 8 * SIZE, Y .LL12: FADD b3, t3, c3 FMUL ALPHA_I, a4, t3 ADD2 b4, t4, c4 FMUL ALPHA_I, a3, t4 ADD1 c1, t1, c1 FMUL ALPHA_R, a5, t1 FADD c2, t2, c2 FMUL ALPHA_R, a6, t2 ADD1 c3, t3, c3 FMUL ALPHA_R, a7, t3 FADD c4, t4, c4 FMUL ALPHA_R, a8, t4 FADD b5, t1, c5 FMUL ALPHA_I, a6, t1 ADD2 b6, t2, c6 FMUL ALPHA_I, a5, t2 FADD b7, t3, c7 FMUL ALPHA_I, a8, t3 ADD2 b8, t4, c8 FMUL ALPHA_I, a7, t4 ADD1 c5, t1, c5 FADD c6, t2, c6 ADD1 c7, t3, c7 FADD c8, t4, c8 STF c1, [Y + 0 * SIZE] STF c2, [Y + 1 * SIZE] STF c3, [Y + 2 * SIZE] STF c4, [Y + 3 * SIZE] STF c5, [Y + 4 * SIZE] STF c6, [Y + 5 * SIZE] STF c7, [Y + 6 * SIZE] STF c8, [Y + 7 * SIZE] add X, 8 * SIZE, X add Y, 8 * SIZE, Y .LL15: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 LDF [Y + 0 * SIZE], b1 LDF [Y + 1 * SIZE], b2 FMUL ALPHA_R, a1, t1 FMUL ALPHA_R, a2, t2 FMUL ALPHA_I, a2, t3 FMUL ALPHA_I, a1, t4 FADD b1, t1, b1 add I, -1, I ADD2 b2, t2, b2 cmp I, 0 ADD1 b1, t3, c1 FADD b2, t4, c2 STF c1, [Y + 0 * SIZE] STF c2, [Y + 1 * SIZE] add Y, 2 * SIZE, Y bg,pt %icc, .LL16 add X, 2 * SIZE, X .LL19: return %i7 + 8 clr %g0 .LL50: sra N, 2, I cmp I, 0 ble,pn %icc, .LL55 mov Y, YY LDF [X + 0 * SIZE], a1 LDF [Y + 0 * SIZE], b1 LDF [X + 1 * SIZE], a2 add X, INCX, X LDF [Y + 1 * SIZE], b2 add Y, INCY, Y LDF [X + 0 * SIZE], a3 LDF [Y + 0 * SIZE], b3 LDF [X + 1 * SIZE], a4 add X, INCX, X LDF [Y + 1 * SIZE], b4 add Y, INCY, Y LDF [X + 0 * SIZE], a5 add I, -1, I LDF [Y + 0 * SIZE], b5 LDF [X + 1 * SIZE], a6 cmp I, 0 add X, INCX, X LDF [Y + 1 * SIZE], b6 add Y, INCY, Y LDF [X + 0 * SIZE], a7 FMUL ALPHA_R, a1, t1 LDF [Y + 0 * SIZE], b7 FMUL ALPHA_R, a2, t2 LDF [X + 1 * SIZE], a8 FMUL ALPHA_R, a3, t3 add X, INCX, X LDF [Y + 1 * SIZE], b8 FMUL ALPHA_R, a4, t4 ble,pt %icc, .LL52 add Y, INCY, Y .LL51: FADD b1, t1, c1 LDF [Y + 0 * SIZE], b1 FMUL ALPHA_I, a2, t1 LDF [X + 1 * SIZE], a2 ADD2 b2, t2, c2 LDF [Y + 1 * SIZE], b2 add Y, INCY, Y FMUL ALPHA_I, a1, t2 LDF [X + 0 * SIZE], a1 add X, INCX, X FADD b3, t3, c3 LDF [Y + 0 * SIZE], b3 FMUL ALPHA_I, a4, t3 LDF [X + 1 * SIZE], a4 ADD2 b4, t4, c4 LDF [Y + 1 * SIZE], b4 add Y, INCY, Y FMUL ALPHA_I, a3, t4 LDF [X + 0 * SIZE], a3 add X, INCX, X ADD1 c1, t1, c1 FMUL ALPHA_R, a5, t1 FADD c2, t2, c2 FMUL ALPHA_R, a6, t2 ADD1 c3, t3, c3 FMUL ALPHA_R, a7, t3 FADD c4, t4, c4 FMUL ALPHA_R, a8, t4 STF c1, [YY + 0 * SIZE] FADD b5, t1, c1 FMUL ALPHA_I, a6, t1 STF c2, [YY + 1 * SIZE] ADD2 b6, t2, c2 FMUL ALPHA_I, a5, t2 add YY, INCY, YY STF c3, [YY + 0 * SIZE] FADD b7, t3, c3 FMUL ALPHA_I, a8, t3 STF c4, [YY + 1 * SIZE] ADD2 b8, t4, c4 FMUL ALPHA_I, a7, t4 add YY, INCY, YY LDF [X + 0 * SIZE], a5 ADD1 c1, t1, c1 LDF [Y + 0 * SIZE], b5 FMUL ALPHA_R, a1, t1 LDF [X + 1 * SIZE], a6 add X, INCX, X FADD 
c2, t2, c2 LDF [Y + 1 * SIZE], b6 add Y, INCY, Y FMUL ALPHA_R, a2, t2 LDF [X + 0 * SIZE], a7 ADD1 c3, t3, c3 LDF [Y + 0 * SIZE], b7 FMUL ALPHA_R, a3, t3 LDF [X + 1 * SIZE], a8 add X, INCX, X FADD c4, t4, c4 LDF [Y + 1 * SIZE], b8 add Y, INCY, Y FMUL ALPHA_R, a4, t4 STF c1, [YY + 0 * SIZE] add I, -1, I STF c2, [YY + 1 * SIZE] add YY, INCY, YY STF c3, [YY + 0 * SIZE] cmp I, 0 STF c4, [YY + 1 * SIZE] bg,pt %icc, .LL51 add YY, INCY, YY .LL52: FADD b1, t1, c1 FMUL ALPHA_I, a2, t1 ADD2 b2, t2, c2 FMUL ALPHA_I, a1, t2 FADD b3, t3, c3 FMUL ALPHA_I, a4, t3 ADD2 b4, t4, c4 FMUL ALPHA_I, a3, t4 ADD1 c1, t1, c1 FMUL ALPHA_R, a5, t1 FADD c2, t2, c2 FMUL ALPHA_R, a6, t2 ADD1 c3, t3, c3 FMUL ALPHA_R, a7, t3 FADD c4, t4, c4 FMUL ALPHA_R, a8, t4 STF c1, [YY + 0 * SIZE] STF c2, [YY + 1 * SIZE] add YY, INCY, YY STF c3, [YY + 0 * SIZE] STF c4, [YY + 1 * SIZE] add YY, INCY, YY FADD b5, t1, c1 FMUL ALPHA_I, a6, t1 ADD2 b6, t2, c2 FMUL ALPHA_I, a5, t2 FADD b7, t3, c3 FMUL ALPHA_I, a8, t3 ADD2 b8, t4, c4 FMUL ALPHA_I, a7, t4 ADD1 c1, t1, c1 FADD c2, t2, c2 ADD1 c3, t3, c3 FADD c4, t4, c4 STF c1, [YY + 0 * SIZE] STF c2, [YY + 1 * SIZE] add YY, INCY, YY STF c3, [YY + 0 * SIZE] STF c4, [YY + 1 * SIZE] add YY, INCY, YY .LL55: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL59 nop .LL56: LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 LDF [Y + 0 * SIZE], b1 LDF [Y + 1 * SIZE], b2 FMUL ALPHA_R, a1, t1 FMUL ALPHA_R, a2, t2 FMUL ALPHA_I, a2, t3 FMUL ALPHA_I, a1, t4 FADD b1, t1, b1 ADD2 b2, t2, b2 ADD1 b1, t3, c1 FADD b2, t4, c2 add I, -1, I cmp I, 0 STF c1, [Y + 0 * SIZE] STF c2, [Y + 1 * SIZE] add Y, INCY, Y bg,pt %icc, .LL56 add X, INCX, X .LL59: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/zcopy.S000066400000000000000000000117161313527062700170320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
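zaxpy.S updates y += alpha * x on interleaved (re, im) pairs. The ADD1/ADD2 macros swap the two sign-sensitive additions, so the CONJ build computes y += alpha * conj(x) instead (this reading assumes the usual macro operand order, result = op1 +/- op2). A minimal scalar sketch of both builds, assuming DOUBLE precision and positive strides given in complex elements; the function name is illustrative:

#include <stddef.h>

/* Reference for the kernel above: complex axpy, optionally with x conjugated. */
static void zaxpy_ref(size_t n, double alpha_r, double alpha_i,
                      const double *x, ptrdiff_t incx,
                      double *y, ptrdiff_t incy)
{
    for (size_t i = 0; i < n; i++) {
        const double *xp = x + 2 * i * incx;
        double *yp = y + 2 * i * incy;
        double xr = xp[0], xi = xp[1];
#ifndef CONJ
        yp[0] += alpha_r * xr - alpha_i * xi;   /* y += alpha * x        */
        yp[1] += alpha_r * xi + alpha_i * xr;
#else
        yp[0] += alpha_r * xr + alpha_i * xi;   /* y += alpha * conj(x)  */
        yp[1] += alpha_i * xr - alpha_r * xi;
#endif
    }
}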
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N %i0 #define X %i1 #define INCX %i2 #define Y %i3 #define INCY %i4 #define I %i5 #ifdef DOUBLE #define a1 %f0 #define a2 %f2 #define a3 %f4 #define a4 %f6 #define a5 %f8 #define a6 %f10 #define a7 %f12 #define a8 %f14 #else #define a1 %f0 #define a2 %f1 #define a3 %f2 #define a4 %f3 #define a5 %f4 #define a6 %f5 #define a7 %f6 #define a8 %f7 #endif PROLOGUE SAVESP sll INCX, ZBASE_SHIFT, INCX sll INCY, ZBASE_SHIFT, INCY cmp INCX, 2 * SIZE bne .LL50 nop cmp INCY, 2 * SIZE bne .LL50 nop sra N, 2, I cmp I, 0 ble,pn %icc, .LL15 nop #define PREFETCHSIZE 32 .LL11: prefetch [X + PREFETCHSIZE * SIZE], 0 prefetch [Y + PREFETCHSIZE * SIZE], 0 LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 LDF [X + 2 * SIZE], a3 LDF [X + 3 * SIZE], a4 LDF [X + 4 * SIZE], a5 LDF [X + 5 * SIZE], a6 LDF [X + 6 * SIZE], a7 LDF [X + 7 * SIZE], a8 STF a1, [Y + 0 * SIZE] add I, -1, I STF a2, [Y + 1 * SIZE] cmp I, 0 STF a3, [Y + 2 * SIZE] add X, 8 * SIZE, X STF a4, [Y + 3 * SIZE] STF a5, [Y + 4 * SIZE] STF a6, [Y + 5 * SIZE] STF a7, [Y + 6 * SIZE] STF a8, [Y + 7 * SIZE] bg,pt %icc, .LL11 add Y, 8 * SIZE, Y .LL15: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 add I, -1, I cmp I, 0 STF a1, [Y + 0 * SIZE] add X, 2 * SIZE, X STF a2, [Y + 1 * SIZE] bg,pt %icc, .LL16 add Y, 2 * SIZE, Y .LL19: return %i7 + 8 clr %g0 .LL50: sra N, 2, I cmp I, 0 ble,pn %icc, .LL55 nop .LL51: LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 add X, INCX, X LDF [X + 0 * SIZE], a3 LDF [X + 1 * SIZE], a4 add X, INCX, X LDF [X + 0 * SIZE], a5 LDF [X + 1 * SIZE], a6 add X, INCX, X LDF [X + 0 * SIZE], a7 LDF [X + 1 * SIZE], a8 add X, INCX, X STF a1, [Y + 0 * SIZE] add I, -1, I STF a2, [Y + 1 * SIZE] add Y, INCY, Y cmp I, 0 STF a3, [Y + 0 * SIZE] STF a4, [Y + 1 * SIZE] add Y, INCY, Y STF a5, [Y + 0 * SIZE] STF a6, [Y + 1 * SIZE] add Y, INCY, Y STF a7, [Y + 0 * SIZE] STF a8, [Y + 1 * SIZE] bg,pt %icc, .LL51 add Y, INCY, Y .LL55: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL59 nop .LL56: LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 add I, -1, I cmp I, 0 add X, INCX, X STF a1, [Y + 0 * SIZE] STF a2, [Y + 1 * SIZE] bg,pt %icc, .LL56 add Y, INCY, Y .LL59: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/zdot.S000066400000000000000000000241461313527062700166470ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
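zcopy.S is a plain complex copy: the unit-stride path moves four complex (eight scalar) values per iteration with software prefetch, and the strided path falls back to one element per step. The whole kernel reduces to the following reference loop (DOUBLE build assumed, strides in complex elements, function name illustrative):

#include <stddef.h>

/* Reference for the kernel above: copy n interleaved (re, im) pairs. */
static void zcopy_ref(size_t n, const double *x, ptrdiff_t incx,
                      double *y, ptrdiff_t incy)
{
    for (size_t i = 0; i < n; i++) {
        y[2 * i * incy + 0] = x[2 * i * incx + 0];
        y[2 * i * incy + 1] = x[2 * i * incx + 1];
    }
}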
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) #define OUT %i0 #define N %i1 #define X %i2 #define INCX %i3 #define Y %i4 #define INCY %i5 #else #define N %i0 #define X %i1 #define INCX %i2 #define Y %i3 #define INCY %i4 #endif #define I %l0 #ifdef DOUBLE #define c1 %f0 #define c2 %f2 #define c3 %f4 #define c4 %f6 #define t1 %f8 #define t2 %f10 #define t3 %f12 #define t4 %f14 #define a1 %f16 #define a2 %f18 #define a3 %f20 #define a4 %f22 #define a5 %f24 #define a6 %f26 #define a7 %f28 #define a8 %f30 #define b1 %f32 #define b2 %f34 #define b3 %f36 #define b4 %f38 #define b5 %f40 #define b6 %f42 #define b7 %f44 #define b8 %f46 #else #define c1 %f0 #define c2 %f1 #define c3 %f2 #define c4 %f3 #define t1 %f4 #define t2 %f5 #define t3 %f6 #define t4 %f7 #define a1 %f8 #define a2 %f9 #define a3 %f10 #define a4 %f11 #define a5 %f12 #define a6 %f13 #define a7 %f14 #define a8 %f15 #define b1 %f16 #define b2 %f17 #define b3 %f18 #define b4 %f19 #define b5 %f20 #define b6 %f21 #define b7 %f22 #define b8 %f23 #endif PROLOGUE SAVESP #ifdef DOUBLE FCLR(0) FCLR(2) FCLR(4) FCLR(6) #else FCLR(0) FCLR(1) FCLR(2) FCLR(3) #endif FMOV c1, c4 FMOV c1, t1 sll INCX, ZBASE_SHIFT, INCX FMOV c1, t2 sll INCY, ZBASE_SHIFT, INCY FMOV c1, t3 FMOV c1, t4 cmp INCX, 2 * SIZE bne .LL50 nop cmp INCY, 2 * SIZE bne .LL50 nop sra N, 2, I cmp I, 0 ble,pn %icc, .LL15 nop LDF [X + 0 * SIZE], a1 add I, -1, I LDF [Y + 0 * SIZE], b1 cmp I, 0 LDF [X + 1 * SIZE], a2 LDF [Y + 1 * SIZE], b2 LDF [X + 2 * SIZE], a3 LDF [Y + 2 * SIZE], b3 LDF [X + 3 * SIZE], a4 LDF [Y + 3 * SIZE], b4 LDF [X + 4 * SIZE], a5 LDF [Y + 4 * SIZE], b5 LDF [X + 5 * SIZE], a6 LDF [Y + 5 * SIZE], b6 LDF [X + 6 * SIZE], a7 LDF [Y + 6 * SIZE], b7 LDF [X + 7 * SIZE], a8 add X, 8 * SIZE, X LDF [Y + 7 * SIZE], b8 ble,pt %icc, .LL12 add Y, 8 * SIZE, Y #define PREFETCHSIZE 40 .LL11: prefetch [X + PREFETCHSIZE * SIZE], 0 FADD c1, t1, c1 prefetch [Y + PREFETCHSIZE * SIZE], 0 FMUL a1, b1, t1 FADD c2, t2, c2 FMUL a2, b1, t2 LDF [Y + 0 * SIZE], b1 FADD c3, t3, c3 FMUL a1, b2, t3 LDF [X + 0 * SIZE], a1 FADD c4, t4, c4 FMUL a2, b2, t4 LDF [X + 1 * SIZE], a2 FADD c1, t1, c1 FMUL a3, b3, t1 LDF [Y + 1 * SIZE], b2 FADD c2, t2, c2 FMUL a4, b3, t2 LDF [Y + 2 * SIZE], b3 FADD c3, t3, c3 FMUL a3, b4, t3 LDF [X + 2 * SIZE], a3 FADD c4, t4, c4 FMUL a4, b4, t4 LDF [X + 3 * SIZE], 
a4 FADD c1, t1, c1 FMUL a5, b5, t1 LDF [Y + 3 * SIZE], b4 FADD c2, t2, c2 FMUL a6, b5, t2 LDF [Y + 4 * SIZE], b5 FADD c3, t3, c3 FMUL a5, b6, t3 LDF [X + 4 * SIZE], a5 FADD c4, t4, c4 FMUL a6, b6, t4 LDF [X + 5 * SIZE], a6 FADD c1, t1, c1 add I, -1, I FMUL a7, b7, t1 LDF [Y + 5 * SIZE], b6 FADD c2, t2, c2 cmp I, 0 FMUL a8, b7, t2 LDF [Y + 6 * SIZE], b7 FADD c3, t3, c3 add Y, 8 * SIZE, Y FMUL a7, b8, t3 LDF [X + 6 * SIZE], a7 FADD c4, t4, c4 FMUL a8, b8, t4 LDF [X + 7 * SIZE], a8 add X, 8 * SIZE, X bg,pt %icc, .LL11 LDF [Y - 1 * SIZE], b8 .LL12: FADD c1, t1, c1 FMUL a1, b1, t1 FADD c2, t2, c2 FMUL a2, b1, t2 FADD c3, t3, c3 FMUL a1, b2, t3 FADD c4, t4, c4 FMUL a2, b2, t4 FADD c1, t1, c1 FMUL a3, b3, t1 FADD c2, t2, c2 FMUL a4, b3, t2 FADD c3, t3, c3 FMUL a3, b4, t3 FADD c4, t4, c4 FMUL a4, b4, t4 FADD c1, t1, c1 FMUL a5, b5, t1 FADD c2, t2, c2 FMUL a6, b5, t2 FADD c3, t3, c3 FMUL a5, b6, t3 FADD c4, t4, c4 FMUL a6, b6, t4 FADD c1, t1, c1 FMUL a7, b7, t1 FADD c2, t2, c2 FMUL a8, b7, t2 FADD c3, t3, c3 FMUL a7, b8, t3 FADD c4, t4, c4 FMUL a8, b8, t4 .LL15: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 add X, 2 * SIZE, X LDF [Y + 0 * SIZE], b1 LDF [Y + 1 * SIZE], b2 add Y, 2 * SIZE, Y FADD c1, t1, c1 FMUL a1, b1, t1 FADD c2, t2, c2 FMUL a2, b1, t2 FADD c3, t3, c3 FMUL a1, b2, t3 FADD c4, t4, c4 FMUL a2, b2, t4 add I, -1, I cmp I, 0 bg,pt %icc, .LL16 nop .LL19: FADD c1, t1, c1 FADD c2, t2, c2 FADD c3, t3, c3 FADD c4, t4, c4 #ifndef CONJ FSUB c1, c4, c1 FADD c2, c3, c2 #else FADD c1, c4, c1 FSUB c3, c2, c2 #endif #if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) STF c1, [OUT + 0 * SIZE] STF c2, [OUT + 1 * SIZE] #endif return %i7 + 8 clr %g0 .LL50: #ifdef F_INTERFACE cmp INCX, 0 bge .LL41 sub N, 1, I smul I, INCX, I sub X, I, X .LL41: cmp INCY, 0 bge .LL42 sub N, 1, I smul I, INCY, I sub Y, I, Y .LL42: #endif sra N, 2, I cmp I, 0 ble,pn %icc, .LL55 nop LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 add X, INCX, X LDF [X + 0 * SIZE], a3 LDF [X + 1 * SIZE], a4 add X, INCX, X LDF [X + 0 * SIZE], a5 LDF [X + 1 * SIZE], a6 add X, INCX, X LDF [X + 0 * SIZE], a7 LDF [X + 1 * SIZE], a8 add X, INCX, X LDF [Y + 0 * SIZE], b1 LDF [Y + 1 * SIZE], b2 add Y, INCY, Y LDF [Y + 0 * SIZE], b3 LDF [Y + 1 * SIZE], b4 add Y, INCY, Y LDF [Y + 0 * SIZE], b5 LDF [Y + 1 * SIZE], b6 add Y, INCY, Y LDF [Y + 0 * SIZE], b7 LDF [Y + 1 * SIZE], b8 add Y, INCY, Y add I, -1, I cmp I, 0 ble,pt %icc, .LL52 .LL51: FADD c1, t1, c1 prefetch [X + PREFETCHSIZE * SIZE], 0 add I, -1, I FMUL a1, b1, t1 prefetch [Y + PREFETCHSIZE * SIZE], 0 FADD c2, t2, c2 cmp I, 0 FMUL a2, b1, t2 LDF [Y + 0 * SIZE], b1 FADD c3, t3, c3 FMUL a1, b2, t3 LDF [X + 0 * SIZE], a1 FADD c4, t4, c4 FMUL a2, b2, t4 LDF [X + 1 * SIZE], a2 add X, INCX, X FADD c1, t1, c1 FMUL a3, b3, t1 LDF [Y + 1 * SIZE], b2 add Y, INCY, Y FADD c2, t2, c2 FMUL a4, b3, t2 LDF [Y + 0 * SIZE], b3 FADD c3, t3, c3 FMUL a3, b4, t3 LDF [X + 0 * SIZE], a3 FADD c4, t4, c4 FMUL a4, b4, t4 LDF [X + 1 * SIZE], a4 add X, INCX, X FADD c1, t1, c1 FMUL a5, b5, t1 LDF [Y + 1 * SIZE], b4 add Y, INCY, Y FADD c2, t2, c2 FMUL a6, b5, t2 LDF [Y + 0 * SIZE], b5 FADD c3, t3, c3 FMUL a5, b6, t3 LDF [X + 0 * SIZE], a5 FADD c4, t4, c4 FMUL a6, b6, t4 LDF [X + 1 * SIZE], a6 add X, INCX, X FADD c1, t1, c1 FMUL a7, b7, t1 LDF [Y + 1 * SIZE], b6 add Y, INCY, Y FADD c2, t2, c2 FMUL a8, b7, t2 LDF [Y + 0 * SIZE], b7 FADD c3, t3, c3 FMUL a7, b8, t3 LDF [X + 0 * SIZE], a7 FADD c4, t4, c4 FMUL a8, b8, t4 LDF [X + 1 * SIZE], a8 add X, INCX, X LDF [Y + 1 * SIZE], b8 
bg,pt %icc, .LL51 add Y, INCY, Y .LL52: FADD c1, t1, c1 FMUL a1, b1, t1 FADD c2, t2, c2 FMUL a2, b1, t2 FADD c3, t3, c3 FMUL a1, b2, t3 FADD c4, t4, c4 FMUL a2, b2, t4 FADD c1, t1, c1 FMUL a3, b3, t1 FADD c2, t2, c2 FMUL a4, b3, t2 FADD c3, t3, c3 FMUL a3, b4, t3 FADD c4, t4, c4 FMUL a4, b4, t4 FADD c1, t1, c1 FMUL a5, b5, t1 FADD c2, t2, c2 FMUL a6, b5, t2 FADD c3, t3, c3 FMUL a5, b6, t3 FADD c4, t4, c4 FMUL a6, b6, t4 FADD c1, t1, c1 FMUL a7, b7, t1 FADD c2, t2, c2 FMUL a8, b7, t2 FADD c3, t3, c3 FMUL a7, b8, t3 FADD c4, t4, c4 FMUL a8, b8, t4 .LL55: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL59 nop .LL56: LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 add X, INCX, X LDF [Y + 0 * SIZE], b1 LDF [Y + 1 * SIZE], b2 add Y, INCY, Y FADD c1, t1, c1 FMUL a1, b1, t1 FADD c2, t2, c2 FMUL a2, b1, t2 FADD c3, t3, c3 FMUL a1, b2, t3 FADD c4, t4, c4 FMUL a2, b2, t4 add I, -1, I cmp I, 0 bg,pt %icc, .LL56 nop .LL59: FADD c1, t1, c1 FADD c2, t2, c2 FADD c3, t3, c3 FADD c4, t4, c4 #ifndef CONJ FSUB c1, c4, c1 FADD c2, c3, c2 #else FADD c1, c4, c1 FSUB c3, c2, c2 #endif #if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) STF c1, [OUT + 0 * SIZE] STF c2, [OUT + 1 * SIZE] #endif return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/zgemm_kernel.S000066400000000000000000000754551313527062700203570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %i0 #define N %i1 #define K %i2 #define A %i5 #define B %i3 #define C %i4 #define LDC %o0 #define AO %o1 #define BO %o2 #define I %o3 #define J %o4 #define L %o5 #define C1 %l0 #define C2 %l1 #define OFFSET %l2 #define KK %l3 #define TEMP1 %l4 #define TEMP2 %l5 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #define t1 %f32 #define t2 %f34 #define t3 %f36 #define t4 %f38 #define a1 %f40 #define a2 %f42 #define a3 %f44 #define a4 %f46 #define a5 %f62 #define b1 %f48 #define b2 %f50 #define b3 %f52 #define b4 %f54 #define b5 %f56 #define FZERO %f58 #define ALPHA_R %f60 #define ALPHA_I %f62 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #define t1 %f16 #define t2 %f17 #define t3 %f18 #define t4 %f19 #define a1 %f20 #define a2 %f21 #define a3 %f22 #define a4 %f23 #define a5 %f31 #define b1 %f24 #define b2 %f25 #define b3 %f26 #define b4 %f27 #define b5 %f28 #define FZERO %f29 #define ALPHA_R %f30 #define ALPHA_I %f31 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define FADD1 FADD #define FADD2 FADD #define FADD3 FADD #define FADD4 FSUB #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define FADD1 FADD #define FADD2 FADD #define FADD3 FSUB #define FADD4 FADD #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define FADD1 FADD #define FADD2 FSUB #define FADD3 FADD #define FADD4 FADD #else #define FADD1 FADD #define FADD2 FSUB #define FADD3 FSUB #define FADD4 FSUB #endif #define APREFETCHSIZE 40 #define BPREFETCHSIZE 40 #define APREFETCH_CATEGORY 0 #define BPREFETCH_CATEGORY 0 PROLOGUE SAVESP #ifndef __64BIT__ #ifdef DOUBLE #define STACK_ALPHA [%sp + STACK_START + 24] #else #define STACK_ALPHA [%sp + STACK_START + 20] #endif #else #define STACK_ALPHA [%sp + STACK_START + 40] #endif #ifndef __64BIT__ #ifdef DOUBLE st %i3, [%sp + STACK_START + 16] st %i4, [%sp + STACK_START + 20] st %i5, [%sp + STACK_START + 24] ld [%sp + STACK_START + 32], A ld [%sp + STACK_START + 36], B ld [%sp + STACK_START + 40], C ld [%sp + STACK_START + 44], LDC #ifdef TRMMKERNEL ld [%sp + STACK_START + 48], OFFSET #endif ldd [%sp + STACK_START + 16], ALPHA_R ldd [%sp + STACK_START + 24], ALPHA_I #else st %i3, [%sp + STACK_START + 16] st %i4, [%sp + STACK_START + 20] ld [%sp + STACK_START + 28], B ld [%sp + STACK_START + 32], C ld [%sp + STACK_START + 36], LDC #ifdef TRMMKERNEL ld [%sp + STACK_START + 40], OFFSET #endif ld [%sp + STACK_START + 16], ALPHA_R ld [%sp + STACK_START + 20], ALPHA_I #endif #else #ifdef DOUBLE FMOV %f6, ALPHA_R FMOV %f8, ALPHA_I STF %f8, STACK_ALPHA #else FMOV %f7, ALPHA_R FMOV %f9, ALPHA_I STF %f9, STACK_ALPHA #endif ldx [%sp+ STACK_START + 56], B nop ldx [%sp+ STACK_START + 64], C nop ldx [%sp+ STACK_START + 72], LDC #ifdef TRMMKERNEL ldx [%sp+ STACK_START + 80], OFFSET #endif LDF [%sp + STACK_START + 32], FZERO #endif #ifdef DOUBLE FCLR(27) #else FCLR(29) #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg OFFSET, KK #endif sra N, 1, J cmp J, 0 ble,pn %icc, .LL100 sll LDC, 
ZBASE_SHIFT, LDC .LL11: sra M, 1, I FMOV FZERO, t1 add C, LDC, C2 FMOV FZERO, t2 mov C, C1 FMOV FZERO, t3 cmp I, 0 #if defined(TRMMKERNEL) && defined(LEFT) mov OFFSET, KK #endif mov A, AO add C2, LDC, C nop ble,pn %icc, .LL50 FMOV FZERO, t4 .LL21: #if !defined(TRMMKERNEL) sra K, 2, L FMOV FZERO, c01 cmp L, 0 FMOV FZERO, c02 LDF [AO + 0 * SIZE], a1 FMOV FZERO, c03 LDF [B + 0 * SIZE], b1 FMOV FZERO, c04 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c05 LDF [B + 1 * SIZE], b2 FMOV FZERO, c06 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c07 LDF [B + 2 * SIZE], b3 FMOV FZERO, c08 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c09 LDF [B + 3 * SIZE], b4 FMOV FZERO, c10 LDF [B + 4 * SIZE], b5 FMOV FZERO, c11 LDF [AO + 4 * SIZE], a5 FMOV FZERO, c12 prefetch [C1 + 3 * SIZE], 3 FMOV FZERO, c13 prefetch [C2 + 3 * SIZE], 3 FMOV FZERO, c14 mov B, BO #else #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov B, BO #else sll KK, 1 + ZBASE_SHIFT, TEMP1 add AO, TEMP1, AO add B, TEMP1, BO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 2, L #else add KK, 2, L #endif sra L, 2, L cmp L, 0 FMOV FZERO, c01 FMOV FZERO, c02 LDF [AO + 0 * SIZE], a1 FMOV FZERO, c03 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c04 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c05 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c06 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c07 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c08 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c09 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c10 LDF [BO + 4 * SIZE], b5 FMOV FZERO, c11 LDF [AO + 4 * SIZE], a5 FMOV FZERO, c12 prefetch [C1 + 3 * SIZE], 3 FMOV FZERO, c13 prefetch [C2 + 3 * SIZE], 3 FMOV FZERO, c14 #endif FMOV FZERO, c15 ble,pn %icc, .LL25 FMOV FZERO, c16 .LL22: FADD2 c04, t1, c04 prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY FMUL a1, b1, t1 nop FADD4 c08, t2, c08 prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY FMUL a1, b2, t2 add AO, 16 * SIZE, AO FADD2 c12, t3, c12 LDF [AO - 13 * SIZE], a4 FMUL a1, b3, t3 add BO, 16 * SIZE, BO FADD4 c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO - 8 * SIZE], a1 FADD1 c01, t1, c01 nop FMUL a2, b1, t1 nop FADD3 c05, t2, c05 nop FMUL a2, b2, t2 nop FADD1 c09, t3, c09 nop FMUL a2, b3, t3 nop FADD3 c13, t4, c13 add L, -1, L FMUL a2, b4, t4 LDF [AO - 11 * SIZE], a2 FADD2 c02, t1, c02 nop FMUL a3, b1, t1 nop FADD4 c06, t2, c06 nop FMUL a3, b2, t2 nop FADD2 c10, t3, c10 nop FMUL a3, b3, t3 nop FADD4 c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 10 * SIZE], a3 FADD1 c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO - 8 * SIZE], b1 FADD3 c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 11 * SIZE], b2 FADD1 c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 10 * SIZE], b3 FADD3 c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 9 * SIZE], b4 FADD2 c04, t1, c04 nop FMUL a5, b5, t1 LDF [AO - 9 * SIZE], a4 FADD4 c08, t2, c08 nop FMUL a5, b2, t2 nop FADD2 c12, t3, c12 nop FMUL a5, b3, t3 nop FADD4 c16, t4, c16 nop FMUL a5, b4, t4 LDF [AO - 4 * SIZE], a5 FADD1 c01, t1, c01 nop FMUL a2, b5, t1 nop FADD3 c05, t2, c05 nop FMUL a2, b2, t2 nop FADD1 c09, t3, c09 nop FMUL a2, b3, t3 nop FADD3 c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO - 7 * SIZE], a2 FADD2 c02, t1, c02 nop FMUL a3, b5, t1 nop FADD4 c06, t2, c06 nop FMUL a3, b2, t2 nop FADD2 c10, t3, c10 nop FMUL a3, b3, t3 nop FADD4 c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 6 * SIZE], a3 FADD1 c03, t1, c03 nop FMUL a4, b5, t1 LDF [BO - 4 * SIZE], b5 FADD3 c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 7 * SIZE], b2 FADD1 c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 6 * SIZE], b3 FADD3 c15, t4, 
c15 nop FMUL a4, b4, t4 LDF [BO - 5 * SIZE], b4 FADD2 c04, t1, c04 nop FMUL a1, b1, t1 LDF [AO - 5 * SIZE], a4 FADD4 c08, t2, c08 nop FMUL a1, b2, t2 nop FADD2 c12, t3, c12 nop FMUL a1, b3, t3 nop FADD4 c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO - 0 * SIZE], a1 FADD1 c01, t1, c01 nop FMUL a2, b1, t1 nop #ifdef DOUBLE prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY #else nop #endif FADD3 c05, t2, c05 nop FMUL a2, b2, t2 FADD1 c09, t3, c09 nop FMUL a2, b3, t3 nop FADD3 c13, t4, c13 nop FMUL a2, b4, t4 nop FADD2 c02, t1, c02 nop FMUL a3, b1, t1 LDF [AO - 3 * SIZE], a2 FADD4 c06, t2, c06 #ifdef DOUBLE prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY #else nop #endif FMUL a3, b2, t2 nop FADD2 c10, t3, c10 nop FMUL a3, b3, t3 nop FADD4 c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 2 * SIZE], a3 FADD1 c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO - 0 * SIZE], b1 FADD3 c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 3 * SIZE], b2 FADD1 c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD3 c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 1 * SIZE], b4 FADD2 c04, t1, c04 nop FMUL a5, b5, t1 LDF [AO - 1 * SIZE], a4 FADD4 c08, t2, c08 FMUL a5, b2, t2 FADD2 c12, t3, c12 FMUL a5, b3, t3 FADD4 c16, t4, c16 nop FMUL a5, b4, t4 LDF [AO + 4 * SIZE], a5 FADD1 c01, t1, c01 nop FMUL a2, b5, t1 nop FADD3 c05, t2, c05 nop FMUL a2, b2, t2 nop FADD1 c09, t3, c09 nop FMUL a2, b3, t3 nop FADD3 c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD2 c02, t1, c02 nop FMUL a3, b5, t1 nop FADD4 c06, t2, c06 nop FMUL a3, b2, t2 nop FADD2 c10, t3, c10 nop FMUL a3, b3, t3 nop FADD4 c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD1 c03, t1, c03 cmp L, 0 FMUL a4, b5, t1 LDF [BO + 4 * SIZE], b5 FADD3 c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD1 c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD3 c15, t4, c15 FMUL a4, b4, t4 bg,pt %icc, .LL22 LDF [BO + 3 * SIZE], b4 .LL25: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 2, L #else add KK, 2, L #endif and L, 3, L #endif cmp L, 0 ble,pn %icc, .LL29 LDF STACK_ALPHA, ALPHA_I .LL26: FADD2 c04, t1, c04 LDF [AO + 3 * SIZE], a4 FMUL a1, b1, t1 add AO, 4 * SIZE, AO FADD4 c08, t2, c08 add BO, 4 * SIZE, BO FMUL a1, b2, t2 add L, -1, L FADD2 c12, t3, c12 nop FMUL a1, b3, t3 cmp L, 0 FADD4 c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD1 c01, t1, c01 nop FMUL a2, b1, t1 nop FADD3 c05, t2, c05 nop FMUL a2, b2, t2 nop FADD1 c09, t3, c09 nop FMUL a2, b3, t3 nop FADD3 c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD2 c02, t1, c02 nop FMUL a3, b1, t1 nop FADD4 c06, t2, c06 nop FMUL a3, b2, t2 nop FADD2 c10, t3, c10 nop FMUL a3, b3, t3 nop FADD4 c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD1 c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO + 0 * SIZE], b1 FADD3 c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD1 c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD3 c15, t4, c15 FMUL a4, b4, t4 bg,pt %icc, .LL26 LDF [BO + 3 * SIZE], b4 .LL29: #ifndef TRMMKERNEL FADD2 c04, t1, c04 LDF [C1 + 0 * SIZE], a1 FADD4 c08, t2, c08 LDF [C1 + 1 * SIZE], a2 FADD2 c12, t3, c12 LDF [C1 + 2 * SIZE], a3 FADD4 c16, t4, c16 LDF [C1 + 3 * SIZE], a4 FADD c01, c06, c01 LDF [C2 + 0 * SIZE], b1 FADD c02, c05, c02 LDF [C2 + 1 * SIZE], b2 FADD c03, c08, c03 LDF [C2 + 2 * SIZE], b3 FADD c04, c07, c04 LDF [C2 + 3 * SIZE], b4 FADD c09, c14, c09 FMUL ALPHA_R, c01, t1 FADD c10, c13, c10 FMUL ALPHA_R, 
c02, t2 FADD c11, c16, c11 FMUL ALPHA_R, c03, t3 FADD c12, c15, c12 FMUL ALPHA_R, c04, t4 FADD a1, t1, a1 FMUL ALPHA_I, c02, t1 FADD a2, t2, a2 FMUL ALPHA_I, c01, t2 FADD a3, t3, a3 FMUL ALPHA_I, c04, t3 FADD a4, t4, a4 FMUL ALPHA_I, c03, t4 FSUB a1, t1, a1 FMUL ALPHA_R, c09, t1 FADD a2, t2, a2 FMUL ALPHA_R, c10, t2 FSUB a3, t3, a3 FMUL ALPHA_R, c11, t3 FADD a4, t4, a4 FMUL ALPHA_R, c12, t4 FADD b1, t1, b1 FMUL ALPHA_I, c10, t1 FADD b2, t2, b2 FMUL ALPHA_I, c09, t2 FADD b3, t3, b3 FMUL ALPHA_I, c12, t3 FADD b4, t4, b4 FMUL ALPHA_I, c11, t4 STF a1, [C1 + 0 * SIZE] FSUB b1, t1, b1 STF a2, [C1 + 1 * SIZE] FADD b2, t2, b2 STF a3, [C1 + 2 * SIZE] FSUB b3, t3, b3 STF a4, [C1 + 3 * SIZE] FADD b4, t4, b4 STF b1, [C2 + 0 * SIZE] FMOV FZERO, t1 STF b2, [C2 + 1 * SIZE] FMOV FZERO, t2 STF b3, [C2 + 2 * SIZE] FMOV FZERO, t3 STF b4, [C2 + 3 * SIZE] FMOV FZERO, t4 #else FADD2 c04, t1, c04 FADD4 c08, t2, c08 FADD2 c12, t3, c12 FADD4 c16, t4, c16 FADD c01, c06, c01 FADD c02, c05, c02 FADD c03, c08, c03 FADD c04, c07, c04 STF c01, [C1 + 0 * SIZE] FADD c09, c14, c09 STF c02, [C1 + 1 * SIZE] FADD c10, c13, c10 STF c03, [C1 + 2 * SIZE] FADD c11, c16, c11 STF c04, [C1 + 3 * SIZE] FADD c12, c15, c12 STF c09, [C2 + 0 * SIZE] FMOV FZERO, t1 STF c10, [C2 + 1 * SIZE] FMOV FZERO, t2 STF c11, [C2 + 2 * SIZE] FMOV FZERO, t3 STF c12, [C2 + 3 * SIZE] FMOV FZERO, t4 #endif add C1, 4 * SIZE, C1 add C2, 4 * SIZE, C2 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -2, TEMP1 #else add TEMP1, -2, TEMP1 #endif sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 add AO, TEMP1, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 2, KK #endif #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL21 FMOV FZERO, c01 .LL50: and M, 1, I FMOV FZERO, c02 cmp I, 0 FMOV FZERO, t1 ble,pn %icc, .LL99 FMOV FZERO, c04 #if !defined(TRMMKERNEL) LDF [AO + 0 * SIZE], a1 sra K, 2, L FMOV FZERO, t2 LDF [B + 0 * SIZE], b1 mov B, BO FMOV FZERO, c06 LDF [AO + 1 * SIZE], a2 cmp L, 0 FMOV FZERO, t3 LDF [B + 1 * SIZE], b2 FMOV FZERO, c08 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t4 LDF [B + 2 * SIZE], b3 FMOV FZERO, c01 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c03 LDF [B + 3 * SIZE], b4 FMOV FZERO, c05 #else #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov B, BO #else sll KK, 0 + ZBASE_SHIFT, TEMP1 sll KK, 1 + ZBASE_SHIFT, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 2, L #endif sra L, 2, L cmp L, 0 LDF [AO + 0 * SIZE], a1 FMOV FZERO, t2 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c06 LDF [AO + 1 * SIZE], a2 FMOV FZERO, t3 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c08 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t4 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c01 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c03 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c05 #endif ble,pn %icc, .LL55 FMOV FZERO, c07 .LL52: FADD2 c02, t1, c02 add AO, 8 * SIZE, AO prefetch [AO + APREFETCHSIZE * SIZE], 0 FMUL a1, b1, t1 add BO, 16 * SIZE, BO FADD4 c04, t2, c04 add L, -1, L FMUL a1, b2, t2 FADD2 c06, t3, c06 cmp L, 0 FMUL a1, b3, t3 FADD4 c08, t4, c08 FMUL a1, b4, t4 LDF [AO - 4 * SIZE], a1 FADD1 c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 12 * SIZE], b1 FADD3 c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 11 * SIZE], b2 FADD1 c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 10 * SIZE], b3 FADD3 c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 9 * SIZE], b4 FADD2 c02, t1, c02 FMUL a3, b1, t1 LDF [AO - 3 * SIZE], a2 FADD4 c04, t2, c04 
FMUL a3, b2, t2 FADD2 c06, t3, c06 FMUL a3, b3, t3 FADD4 c08, t4, c08 FMUL a3, b4, t4 LDF [AO - 2 * SIZE], a3 FADD1 c01, t1, c01 FMUL a4, b1, t1 LDF [BO - 8 * SIZE], b1 FADD3 c03, t2, c03 FMUL a4, b2, t2 LDF [BO - 7 * SIZE], b2 FADD1 c05, t3, c05 FMUL a4, b3, t3 LDF [BO - 6 * SIZE], b3 FADD3 c07, t4, c07 FMUL a4, b4, t4 LDF [BO - 5 * SIZE], b4 FADD2 c02, t1, c02 FMUL a1, b1, t1 LDF [AO - 1 * SIZE], a4 FADD4 c04, t2, c04 FMUL a1, b2, t2 FADD2 c06, t3, c06 FMUL a1, b3, t3 FADD4 c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD1 c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 4 * SIZE], b1 FADD3 c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 3 * SIZE], b2 FADD1 c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 2 * SIZE], b3 FADD3 c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 1 * SIZE], b4 FADD2 c02, t1, c02 FMUL a3, b1, t1 LDF [AO + 1 * SIZE], a2 FADD4 c04, t2, c04 FMUL a3, b2, t2 FADD2 c06, t3, c06 FMUL a3, b3, t3 FADD4 c08, t4, c08 FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD1 c01, t1, c01 FMUL a4, b1, t1 LDF [BO + 0 * SIZE], b1 FADD3 c03, t2, c03 FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD1 c05, t3, c05 FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD3 c07, t4, c07 FMUL a4, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL52 LDF [AO + 3 * SIZE], a4 .LL55: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 2, L #endif and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL59 nop .LL56: FADD2 c02, t1, c02 add AO, 2 * SIZE, AO FMUL a1, b1, t1 add L, -1, L add BO, 4 * SIZE, BO FADD4 c04, t2, c04 cmp L, 0 FMUL a1, b2, t2 FADD2 c06, t3, c06 FMUL a1, b3, t3 FADD4 c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD1 c01, t1, c01 FMUL a2, b1, t1 LDF [BO + 0 * SIZE], b1 FADD3 c03, t2, c03 FMUL a2, b2, t2 LDF [BO + 1 * SIZE], b2 FADD1 c05, t3, c05 FMUL a2, b3, t3 LDF [BO + 2 * SIZE], b3 FADD3 c07, t4, c07 FMUL a2, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL56 LDF [AO + 1 * SIZE], a2 .LL59: #ifndef TRMMKERNEL FADD2 c02, t1, c02 LDF [C1 + 0 * SIZE], a1 FADD4 c04, t2, c04 LDF [C1 + 1 * SIZE], a2 FADD2 c06, t3, c06 LDF [C2 + 0 * SIZE], a3 FADD4 c08, t4, c08 LDF [C2 + 1 * SIZE], a4 FADD c01, c04, c01 FMUL ALPHA_R, c01, t1 FADD c02, c03, c02 FMUL ALPHA_R, c02, t2 FADD c05, c08, c05 FMUL ALPHA_R, c05, t3 FADD c06, c07, c06 FMUL ALPHA_R, c06, t4 FADD a1, t1, a1 FMUL ALPHA_I, c02, t1 FADD a2, t2, a2 FMUL ALPHA_I, c01, t2 FADD a3, t3, a3 FMUL ALPHA_I, c06, t3 FADD a4, t4, a4 FMUL ALPHA_I, c05, t4 FSUB a1, t1, a1 FADD a2, t2, a2 FSUB a3, t3, a3 FADD a4, t4, a4 STF a1, [C1 + 0 * SIZE] FMOV FZERO, t1 STF a2, [C1 + 1 * SIZE] FMOV FZERO, t2 STF a3, [C2 + 0 * SIZE] FMOV FZERO, t3 STF a4, [C2 + 1 * SIZE] FMOV FZERO, t4 #else FADD2 c02, t1, c02 FADD4 c04, t2, c04 FADD2 c06, t3, c06 FADD4 c08, t4, c08 FADD c01, c04, c01 FADD c02, c03, c02 FADD c05, c08, c05 FADD c06, c07, c06 STF c01, [C1 + 0 * SIZE] FMOV FZERO, t1 STF c02, [C1 + 1 * SIZE] FMOV FZERO, t2 STF c05, [C2 + 0 * SIZE] FMOV FZERO, t3 STF c06, [C2 + 1 * SIZE] FMOV FZERO, t4 #endif add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -1, TEMP1 #else add TEMP1, -2, TEMP1 #endif sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 1, KK #endif #endif .LL99: add J, -1, J mov BO, B cmp J, 0 bg,pt %icc, .LL11 #if defined(TRMMKERNEL) && !defined(LEFT) add KK, 2, KK #else 
nop #endif .LL100: sra M, 1, I and N, 1, J cmp J, 0 ble,pn %icc, .LL999 mov A, AO mov C, C1 add C, LDC, C #if defined(TRMMKERNEL) && defined(LEFT) mov OFFSET, KK #endif cmp I, 0 ble,pn %icc, .LL150 FMOV FZERO, c03 .LL121: #if !defined(TRMMKERNEL) LDF [AO + 0 * SIZE], a1 sra K, 2, L FMOV FZERO, t1 LDF [B + 0 * SIZE], b1 mov B, BO FMOV FZERO, c07 LDF [AO + 1 * SIZE], a2 cmp L, 0 FMOV FZERO, t2 LDF [B + 1 * SIZE], b2 FMOV FZERO, c04 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t3 LDF [B + 2 * SIZE], b3 FMOV FZERO, c08 LDF [AO + 3 * SIZE], a4 FMOV FZERO, t4 LDF [B + 3 * SIZE], b4 FMOV FZERO, c01 prefetch [C1 + 3 * SIZE], 3 FMOV FZERO, c05 FMOV FZERO, c02 #else #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov B, BO #else sll KK, 1 + ZBASE_SHIFT, TEMP1 sll KK, 0 + ZBASE_SHIFT, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 2, L #else add KK, 1, L #endif sra L, 2, L cmp L, 0 LDF [AO + 0 * SIZE], a1 FMOV FZERO, t1 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c07 LDF [AO + 1 * SIZE], a2 FMOV FZERO, t2 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c04 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t3 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c08 LDF [AO + 3 * SIZE], a4 FMOV FZERO, t4 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c01 prefetch [C1 + 3 * SIZE], 3 FMOV FZERO, c05 FMOV FZERO, c02 #endif ble,pn %icc, .LL125 FMOV FZERO, c06 .LL122: FADD1 c03, t1, c03 add L, -1, L FMUL a1, b1, t1 prefetch [AO + APREFETCHSIZE * SIZE], 0 FADD3 c07, t2, c07 add BO, 8 * SIZE, BO FMUL a1, b2, t2 LDF [AO + 4 * SIZE], a1 FADD2 c04, t3, c04 add AO, 16 * SIZE, AO FMUL a2, b1, t3 cmp L, 0 FADD4 c08, t4, c08 nop FMUL a2, b2, t4 LDF [AO - 11 * SIZE], a2 FADD1 c01, t1, c01 nop FMUL a3, b1, t1 nop FADD3 c05, t2, c05 nop FMUL a3, b2, t2 LDF [AO - 10 * SIZE], a3 FADD2 c02, t3, c02 nop FMUL a4, b1, t3 LDF [BO - 4 * SIZE], b1 FADD4 c06, t4, c06 nop FMUL a4, b2, t4 LDF [BO - 3 * SIZE], b2 FADD1 c03, t1, c03 nop FMUL a1, b3, t1 LDF [AO - 9 * SIZE], a4 FADD3 c07, t2, c07 nop FMUL a1, b4, t2 LDF [AO - 8 * SIZE], a1 FADD2 c04, t3, c04 nop FMUL a2, b3, t3 nop FADD4 c08, t4, c08 nop FMUL a2, b4, t4 LDF [AO - 7 * SIZE], a2 FADD1 c01, t1, c01 nop FMUL a3, b3, t1 nop FADD3 c05, t2, c05 nop FMUL a3, b4, t2 LDF [AO - 6 * SIZE], a3 FADD2 c02, t3, c02 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD4 c06, t4, c06 nop FMUL a4, b4, t4 LDF [BO - 1 * SIZE], b4 FADD1 c03, t1, c03 nop FMUL a1, b1, t1 LDF [AO - 5 * SIZE], a4 FADD3 c07, t2, c07 nop FMUL a1, b2, t2 LDF [AO - 4 * SIZE], a1 FADD2 c04, t3, c04 nop FMUL a2, b1, t3 nop FADD4 c08, t4, c08 nop FMUL a2, b2, t4 LDF [AO - 3 * SIZE], a2 FADD1 c01, t1, c01 nop FMUL a3, b1, t1 nop FADD3 c05, t2, c05 nop FMUL a3, b2, t2 LDF [AO - 2 * SIZE], a3 FADD2 c02, t3, c02 nop FMUL a4, b1, t3 LDF [BO + 0 * SIZE], b1 FADD4 c06, t4, c06 nop FMUL a4, b2, t4 LDF [BO + 1 * SIZE], b2 FADD1 c03, t1, c03 nop FMUL a1, b3, t1 LDF [AO - 1 * SIZE], a4 FADD3 c07, t2, c07 nop FMUL a1, b4, t2 LDF [AO + 0 * SIZE], a1 FADD2 c04, t3, c04 nop FMUL a2, b3, t3 nop FADD4 c08, t4, c08 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD1 c01, t1, c01 nop FMUL a3, b3, t1 nop FADD3 c05, t2, c05 nop FMUL a3, b4, t2 LDF [AO + 2 * SIZE], a3 FADD2 c02, t3, c02 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD4 c06, t4, c06 FMUL a4, b4, t4 LDF [AO + 3 * SIZE], a4 bg,pt %icc, .LL122 LDF [BO + 3 * SIZE], b4 .LL125: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L 
#elif defined(LEFT) add KK, 2, L #else add KK, 1, L #endif and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL129 nop .LL126: FADD1 c03, t1, c03 add AO, 4 * SIZE, AO FMUL a1, b1, t1 add BO, 2 * SIZE, BO FADD3 c07, t2, c07 add L, -1, L FMUL a1, b2, t2 LDF [AO + 0 * SIZE], a1 FADD2 c04, t3, c04 cmp L, 0 FMUL a2, b1, t3 FADD4 c08, t4, c08 FMUL a2, b2, t4 LDF [AO + 1 * SIZE], a2 FADD1 c01, t1, c01 FMUL a3, b1, t1 FADD3 c05, t2, c05 FMUL a3, b2, t2 LDF [AO + 2 * SIZE], a3 FADD2 c02, t3, c02 FMUL a4, b1, t3 LDF [BO + 0 * SIZE], b1 FADD4 c06, t4, c06 FMUL a4, b2, t4 LDF [BO + 1 * SIZE], b2 bg,pt %icc, .LL126 LDF [AO + 3 * SIZE], a4 .LL129: #ifndef TRMMKERNEL FADD1 c03, t1, c03 LDF [C1 + 0 * SIZE], a1 FADD3 c07, t2, c07 LDF [C1 + 1 * SIZE], a2 FADD2 c04, t3, c04 LDF [C1 + 2 * SIZE], a3 FADD4 c08, t4, c08 LDF [C1 + 3 * SIZE], a4 FADD c01, c06, c01 FMUL ALPHA_R, c01, t1 FADD c02, c05, c02 FMUL ALPHA_R, c02, t2 FADD c03, c08, c03 FMUL ALPHA_R, c03, t3 FADD c04, c07, c04 FMUL ALPHA_R, c04, t4 FADD a1, t1, a1 FMUL ALPHA_I, c02, t1 FADD a2, t2, a2 FMUL ALPHA_I, c01, t2 FADD a3, t3, a3 FMUL ALPHA_I, c04, t3 FADD a4, t4, a4 FMUL ALPHA_I, c03, t4 FSUB a1, t1, a1 FADD a2, t2, a2 FSUB a3, t3, a3 FADD a4, t4, a4 STF a1, [C1 + 0 * SIZE] FMOV FZERO, t1 STF a2, [C1 + 1 * SIZE] FMOV FZERO, t2 STF a3, [C1 + 2 * SIZE] FMOV FZERO, t3 STF a4, [C1 + 3 * SIZE] FMOV FZERO, t4 #else FADD1 c03, t1, c03 FADD3 c07, t2, c07 FADD2 c04, t3, c04 FADD4 c08, t4, c08 FADD c01, c06, c01 FADD c02, c05, c02 FADD c03, c08, c03 FADD c04, c07, c04 STF c01, [C1 + 0 * SIZE] FMOV FZERO, t1 STF c02, [C1 + 1 * SIZE] FMOV FZERO, t2 STF c03, [C1 + 2 * SIZE] FMOV FZERO, t3 STF c04, [C1 + 3 * SIZE] FMOV FZERO, t4 #endif add C1, 4 * SIZE, C1 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -2, TEMP1 #else add TEMP1, -1, TEMP1 #endif sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 2, KK #endif #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL121 FMOV FZERO, c03 .LL150: and M, 1, I cmp I, 0 ble,pn %icc, .LL999 nop #if !defined(TRMMKERNEL) LDF [AO + 0 * SIZE], a1 sra K, 2, L FMOV FZERO, c01 LDF [B + 0 * SIZE], b1 mov B, BO FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 cmp L, 0 FMOV FZERO, c02 LDF [B + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [B + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [B + 3 * SIZE], b4 FMOV FZERO, t4 #else #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov B, BO #else sll KK, 0 + ZBASE_SHIFT, TEMP1 sll KK, 0 + ZBASE_SHIFT, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 1, L #endif sra L, 2, L cmp L, 0 LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 #endif ble,pn %icc, .LL155 nop .LL152: FADD1 c01, t1, c01 add L, -1, L FMUL a1, b1, t1 prefetch [AO + APREFETCHSIZE * SIZE], 0 FADD3 c02, t2, c02 add BO, 8 * SIZE, BO FMUL a1, b2, t2 LDF [AO + 4 * SIZE], a1 FADD2 c03, t3, c03 cmp L, 0 FMUL a2, b1, t3 LDF [BO - 4 * SIZE], b1 FADD4 c04, t4, c04 nop FMUL a2, b2, t4 LDF [AO + 5 * SIZE], a2 
FADD1 c01, t1, c01 nop FMUL a3, b3, t1 LDF [BO - 3 * SIZE], b2 FADD3 c02, t2, c02 nop FMUL a3, b4, t2 LDF [AO + 6 * SIZE], a3 FADD2 c03, t3, c03 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD4 c04, t4, c04 nop FMUL a4, b4, t4 LDF [AO + 7 * SIZE], a4 FADD1 c01, t1, c01 nop FMUL a1, b1, t1 LDF [BO - 1 * SIZE], b4 FADD3 c02, t2, c02 FMUL a1, b2, t2 LDF [AO + 8 * SIZE], a1 FADD2 c03, t3, c03 FMUL a2, b1, t3 LDF [BO + 0 * SIZE], b1 FADD4 c04, t4, c04 FMUL a2, b2, t4 LDF [AO + 9 * SIZE], a2 FADD1 c01, t1, c01 FMUL a3, b3, t1 LDF [BO + 1 * SIZE], b2 FADD3 c02, t2, c02 FMUL a3, b4, t2 LDF [AO + 10 * SIZE], a3 FADD2 c03, t3, c03 FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD4 c04, t4, c04 FMUL a4, b4, t4 LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO bg,pt %icc, .LL152 LDF [BO + 3 * SIZE], b4 .LL155: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 1, L #endif and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL159 nop .LL156: FADD1 c01, t1, c01 add AO, 2 * SIZE, AO FMUL a1, b1, t1 add BO, 2 * SIZE, BO FADD3 c02, t2, c02 add L, -1, L FMUL a1, b2, t2 LDF [AO + 0 * SIZE], a1 FADD2 c03, t3, c03 FMUL a2, b1, t3 LDF [BO + 0 * SIZE], b1 cmp L, 0 FADD4 c04, t4, c04 FMUL a2, b2, t4 LDF [BO + 1 * SIZE], b2 bg,pt %icc, .LL156 LDF [AO + 1 * SIZE], a2 .LL159: #ifndef TRMMKERNEL FADD1 c01, t1, c01 FADD3 c02, t2, c02 FADD2 c03, t3, c03 FADD4 c04, t4, c04 LDF [C1 + 0 * SIZE], a1 LDF [C1 + 1 * SIZE], a2 FADD c01, c04, c01 FADD c02, c03, c02 FMUL ALPHA_R, c01, t1 FMUL ALPHA_R, c02, t2 FMUL ALPHA_I, c02, t3 FMUL ALPHA_I, c01, t4 FADD a1, t1, a1 FADD a2, t2, a2 FSUB a1, t3, a1 FADD a2, t4, a2 STF a1, [C1 + 0 * SIZE] STF a2, [C1 + 1 * SIZE] #else FADD1 c01, t1, c01 FADD3 c02, t2, c02 FADD2 c03, t3, c03 FADD4 c04, t4, c04 FADD c01, c04, c01 FADD c02, c03, c02 STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] #endif add C1, 2 * SIZE, C1 #ifndef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -1, TEMP1 #else add TEMP1, -1, TEMP1 #endif sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 1, KK #endif #endif .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/zgemm_kernel_1x4.S000066400000000000000000001007641313527062700210430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2005-2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define APREFETCHSIZE 24 #define APREFETCH_CATEGORY 0 #define M %i0 #define N %i1 #define K %i2 #define A %i5 #define B %i3 #define C %i4 #define LDC %o0 #define AO %o1 #define BO %o2 #define I %o3 #define J %o4 #define L %o5 #define BB %o7 #define C1 %l0 #define C2 %l1 #define C3 %l2 #define C4 %l3 #define OFFSET %l4 #define KK %l5 #define TEMP1 %l6 #define TEMP2 %l7 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #define a1 %f32 #define a2 %f34 #define a3 %f36 #define a4 %f38 #define a5 %f40 #define b1 %f42 #define b2 %f44 #define b3 %f46 #define b4 %f48 #define b5 %f50 #define b6 %f52 #define b7 %f54 #define b8 %f56 #define b9 %f58 #define ALPHA_R %f60 #define ALPHA_I %f62 #define cc01 0 #define cc02 2 #define cc03 4 #define cc04 6 #define cc05 8 #define cc06 10 #define cc07 12 #define cc08 14 #define cc09 16 #define cc10 18 #define cc11 20 #define cc12 22 #define cc13 24 #define cc14 26 #define cc15 28 #define cc16 30 #define aa1 1 #define aa2 3 #define aa3 5 #define aa4 7 #define aa5 9 #define bb1 11 #define bb2 13 #define bb3 15 #define bb4 17 #define bb5 19 #define bb6 21 #define bb7 23 #define bb8 25 #define bb9 27 #define alpha_r 29 #define alpha_i 31 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #define a1 %f16 #define a2 %f17 #define a3 %f18 #define a4 %f19 #define a5 %f20 #define b1 %f21 #define b2 %f22 #define b3 %f23 #define b4 %f24 #define b5 %f25 #define b6 %f26 #define b7 %f27 #define b8 %f28 #define b9 %f29 #define ALPHA_R %f30 #define ALPHA_I %f31 #define cc01 0 #define cc02 1 #define cc03 2 #define cc04 3 #define cc05 4 #define cc06 5 #define cc07 6 #define cc08 7 #define cc09 8 #define cc10 9 #define cc11 10 #define cc12 11 #define cc13 12 #define cc14 13 #define cc15 14 #define cc16 15 #define aa1 16 #define aa2 17 #define aa3 18 #define aa4 19 #define aa5 20 #define bb1 21 #define bb2 22 #define bb3 23 #define bb4 24 #define bb5 25 #define bb6 26 #define bb7 27 #define bb8 28 #define bb9 29 #define alpha_r 30 #define alpha_i 31 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define FMADD1 FMADD #define FMADD2 FMADD #define FMADD3 FMADD #define FMADD4 FNMSUB #elif 
defined(NR) || defined(NC) || defined(TR) || defined(TC) #define FMADD1 FMADD #define FMADD2 FMADD #define FMADD3 FNMSUB #define FMADD4 FMADD #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define FMADD1 FMADD #define FMADD2 FNMSUB #define FMADD3 FMADD #define FMADD4 FMADD #else #define FMADD1 FMADD #define FMADD2 FNMSUB #define FMADD3 FNMSUB #define FMADD4 FNMSUB #endif .register %g2, #scratch .register %g3, #scratch PROLOGUE SAVESP #ifndef __64BIT__ #ifdef DOUBLE st %i3, [%sp + STACK_START + 16] st %i4, [%sp + STACK_START + 20] st %i5, [%sp + STACK_START + 24] ld [%sp + STACK_START + 32], A ld [%sp + STACK_START + 36], B ld [%sp + STACK_START + 40], C ld [%sp + STACK_START + 44], LDC #ifdef TRMMKERNEL ld [%sp + STACK_START + 48], OFFSET #endif ldd [%sp + STACK_START + 16], ALPHA_R ldd [%sp + STACK_START + 24], ALPHA_I #else st %i3, [%sp + STACK_START + 16] st %i4, [%sp + STACK_START + 20] ld [%sp + STACK_START + 28], B ld [%sp + STACK_START + 32], C ld [%sp + STACK_START + 36], LDC #ifdef TRMMKERNEL ld [%sp + STACK_START + 40], OFFSET #endif ld [%sp + STACK_START + 16], ALPHA_R ld [%sp + STACK_START + 20], ALPHA_I #endif #else ldx [%sp + STACK_START + 56], B ldx [%sp + STACK_START + 64], C ldx [%sp + STACK_START + 72], LDC #ifdef TRMMKERNEL ldx [%sp + STACK_START + 80], OFFSET #endif #ifdef DOUBLE FMOV %f6, ALPHA_R FMOV %f8, ALPHA_I #else FMOV %f7, ALPHA_R FMOV %f9, ALPHA_I #endif #endif #if defined(TRMMKERNEL) && !defined(LEFT) neg OFFSET, KK #endif cmp M, 0 ble,pn %icc, .LL999 nop sra N, 2, J cmp J, 0 ble,pn %icc, .LL20 sll LDC, ZBASE_SHIFT, LDC .LL11: mov C, C1 add C, LDC, C2 add C2, LDC, C3 add C3, LDC, C4 add C4, LDC, C sll K, ZBASE_SHIFT + 2, BB #if defined(TRMMKERNEL) && defined(LEFT) mov OFFSET, KK #endif mov A, AO mov M, I add B, BB, BB .align 4 .LL12: prefetch [BB + 0 * SIZE], 1 #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) mov B, BO #else sll KK, ZBASE_SHIFT + 0, TEMP1 sll KK, ZBASE_SHIFT + 2, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 FCLR (cc01) LDF [AO + 1 * SIZE], a2 FCLR (cc05) LDF [AO + 8 * SIZE], a5 FCLR (cc09) LDF [BO + 0 * SIZE], b1 FCLR (cc13) LDF [BO + 1 * SIZE], b2 FCLR (cc02) LDF [BO + 2 * SIZE], b3 FCLR (cc06) LDF [BO + 3 * SIZE], b4 FCLR (cc10) LDF [BO + 4 * SIZE], b5 FCLR (cc14) LDF [BO + 5 * SIZE], b6 FCLR (cc03) LDF [BO + 6 * SIZE], b7 FCLR (cc07) LDF [BO + 7 * SIZE], b8 FCLR (cc11) LDF [BO + 8 * SIZE], b9 FCLR (cc15) prefetch [C1 + 1 * SIZE], 3 FCLR (cc04) prefetch [C2 + 2 * SIZE], 3 FCLR (cc08) prefetch [C3 + 1 * SIZE], 3 FCLR (cc12) prefetch [C4 + 2 * SIZE], 3 FCLR (cc16) #ifndef TRMMKERNEL sra K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 4, L #endif sra L, 3, L #endif cmp L, 0 ble,pn %icc, .LL15 add BB, 32 * SIZE, BB .align 4 .LL13: FMADD1 (aa1, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa1, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa1, bb3, cc05, cc05) LDF [BO + 16 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 9 * SIZE], b2 FMADD3 (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD1 (aa1, bb5, cc09, cc09) LDF [AO + 2 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 3 * SIZE], a4 FMADD3 (aa1, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD4 (aa2, bb6, cc12, cc12) nop FMADD1 (aa1, bb7, cc13, 
cc13) LDF [BO + 12 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO + 13 * SIZE], b6 FMADD3 (aa1, bb8, cc15, cc15) LDF [BO + 14 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO + 15 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 24 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 17 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 18 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 19 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 4 * SIZE], a1 FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 5 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) add L, -1, L FMADD4 (aa4, bb6, cc12, cc12) nop FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 20 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 21 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 22 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) LDF [BO + 23 * SIZE], b8 FMADD1 (aa1, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa1, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa1, bb3, cc05, cc05) LDF [BO + 32 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 25 * SIZE], b2 FMADD3 (aa1, bb4, cc07, cc07) LDF [BO + 26 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 27 * SIZE], b4 FMADD1 (aa1, bb5, cc09, cc09) LDF [AO + 6 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 7 * SIZE], a4 FMADD3 (aa1, bb6, cc11, cc11) nop FMADD4 (aa2, bb6, cc12, cc12) nop FMADD1 (aa1, bb7, cc13, cc13) LDF [BO + 28 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO + 29 * SIZE], b6 FMADD3 (aa1, bb8, cc15, cc15) LDF [BO + 30 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO + 31 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 40 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 33 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 34 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 35 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 16 * SIZE], a1 /****/ FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 9 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) nop FMADD4 (aa4, bb6, cc12, cc12) nop FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 36 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 37 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 38 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) LDF [BO + 39 * SIZE], b8 FMADD1 (aa5, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa5, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa5, bb3, cc05, cc05) LDF [BO + 48 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 41 * SIZE], b2 FMADD3 (aa5, bb4, cc07, cc07) LDF [BO + 42 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 43 * SIZE], b4 FMADD1 (aa5, bb5, cc09, cc09) LDF [AO + 10 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 11 * SIZE], a4 FMADD3 (aa5, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY FMADD4 (aa2, bb6, cc12, cc12) nop FMADD1 (aa5, bb7, cc13, cc13) LDF [BO + 44 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO + 45 * SIZE], b6 FMADD3 (aa5, bb8, cc15, cc15) LDF [BO + 46 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO + 47 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 56 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 49 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 50 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 51 * SIZE], 
b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 12 * SIZE], a5 FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 13 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) cmp L, 0 FMADD4 (aa4, bb6, cc12, cc12) nop FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 52 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 53 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 54 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) LDF [BO + 55 * SIZE], b8 FMADD1 (aa5, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa5, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa5, bb3, cc05, cc05) LDF [BO + 64 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 57 * SIZE], b2 FMADD3 (aa5, bb4, cc07, cc07) LDF [BO + 58 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 59 * SIZE], b4 FMADD1 (aa5, bb5, cc09, cc09) LDF [AO + 14 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 15 * SIZE], a4 FMADD3 (aa5, bb6, cc11, cc11) add BO, 64 * SIZE, BO FMADD4 (aa2, bb6, cc12, cc12) add AO, 16 * SIZE, AO FMADD1 (aa5, bb7, cc13, cc13) LDF [BO - 4 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO - 3 * SIZE], b6 FMADD3 (aa5, bb8, cc15, cc15) LDF [BO - 2 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO - 1 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 8 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 1 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 2 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 8 * SIZE], a5 /****/ FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 1 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) FMADD4 (aa4, bb6, cc12, cc12) FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 4 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 5 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 6 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) ble,pn %icc, .LL15 LDF [BO + 7 * SIZE], b8 FMADD1 (aa1, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa1, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa1, bb3, cc05, cc05) LDF [BO + 16 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 9 * SIZE], b2 FMADD3 (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD1 (aa1, bb5, cc09, cc09) LDF [AO + 2 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 3 * SIZE], a4 FMADD3 (aa1, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD4 (aa2, bb6, cc12, cc12) nop FMADD1 (aa1, bb7, cc13, cc13) LDF [BO + 12 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO + 13 * SIZE], b6 FMADD3 (aa1, bb8, cc15, cc15) LDF [BO + 14 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO + 15 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 24 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 17 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 18 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 19 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 4 * SIZE], a1 FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 5 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) add L, -1, L FMADD4 (aa4, bb6, cc12, cc12) nop FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 20 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 21 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 22 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) LDF [BO + 23 * SIZE], b8 FMADD1 (aa1, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa1, bb2, cc03, cc03) 
FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa1, bb3, cc05, cc05) LDF [BO + 32 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 25 * SIZE], b2 FMADD3 (aa1, bb4, cc07, cc07) LDF [BO + 26 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 27 * SIZE], b4 FMADD1 (aa1, bb5, cc09, cc09) LDF [AO + 6 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 7 * SIZE], a4 FMADD3 (aa1, bb6, cc11, cc11) nop FMADD4 (aa2, bb6, cc12, cc12) nop FMADD1 (aa1, bb7, cc13, cc13) LDF [BO + 28 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO + 29 * SIZE], b6 FMADD3 (aa1, bb8, cc15, cc15) LDF [BO + 30 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO + 31 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 40 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 33 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 34 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 35 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 16 * SIZE], a1 /****/ FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 9 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) nop FMADD4 (aa4, bb6, cc12, cc12) nop FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 36 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 37 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 38 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) LDF [BO + 39 * SIZE], b8 FMADD1 (aa5, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa5, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa5, bb3, cc05, cc05) LDF [BO + 48 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 41 * SIZE], b2 FMADD3 (aa5, bb4, cc07, cc07) LDF [BO + 42 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 43 * SIZE], b4 FMADD1 (aa5, bb5, cc09, cc09) LDF [AO + 10 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 11 * SIZE], a4 FMADD3 (aa5, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY FMADD4 (aa2, bb6, cc12, cc12) nop FMADD1 (aa5, bb7, cc13, cc13) LDF [BO + 44 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO + 45 * SIZE], b6 FMADD3 (aa5, bb8, cc15, cc15) LDF [BO + 46 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO + 47 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 56 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 49 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 50 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 51 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 12 * SIZE], a5 FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 13 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) cmp L, 0 FMADD4 (aa4, bb6, cc12, cc12) nop FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 52 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 53 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 54 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) LDF [BO + 55 * SIZE], b8 FMADD1 (aa5, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa5, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa5, bb3, cc05, cc05) LDF [BO + 64 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 57 * SIZE], b2 FMADD3 (aa5, bb4, cc07, cc07) LDF [BO + 58 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 59 * SIZE], b4 FMADD1 (aa5, bb5, cc09, cc09) LDF [AO + 14 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 15 * SIZE], a4 FMADD3 (aa5, bb6, cc11, cc11) add BO, 64 * SIZE, BO FMADD4 (aa2, bb6, cc12, cc12) add AO, 16 * SIZE, AO FMADD1 (aa5, bb7, cc13, cc13) LDF [BO - 4 * SIZE], b5 FMADD2 (aa2, 
bb7, cc14, cc14) LDF [BO - 3 * SIZE], b6 FMADD3 (aa5, bb8, cc15, cc15) LDF [BO - 2 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO - 1 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 8 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 1 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 2 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 8 * SIZE], a5 /****/ FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 1 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) FMADD4 (aa4, bb6, cc12, cc12) FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 4 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 5 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 6 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) bg,pt %icc, .LL13 LDF [BO + 7 * SIZE], b8 .align 4 .LL15: #ifndef TRMMKERNEL and K, 7, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 4, L #endif and L, 7, L #endif cmp L, 0 ble,a,pn %icc, .LL18 nop .align 4 .LL17: FMADD1 (aa1, bb1, cc01, cc01) add L, -1, L FMADD2 (aa2, bb1, cc02, cc02) nop FMADD3 (aa1, bb2, cc03, cc03) LDF [BO + 8 * SIZE], b1 FMADD4 (aa2, bb2, cc04, cc04) LDF [BO + 9 * SIZE], b2 FMADD1 (aa1, bb3, cc05, cc05) cmp L, 0 FMADD2 (aa2, bb3, cc06, cc06) nop FMADD3 (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD1 (aa1, bb5, cc09, cc09) nop FMADD2 (aa2, bb5, cc10, cc10) nop FMADD3 (aa1, bb6, cc11, cc11) LDF [BO + 12 * SIZE], b5 FMADD4 (aa2, bb6, cc12, cc12) LDF [BO + 13 * SIZE], b6 FMADD1 (aa1, bb7, cc13, cc13) add AO, 2 * SIZE, AO FMADD2 (aa2, bb7, cc14, cc14) add BO, 8 * SIZE, BO FMADD3 (aa1, bb8, cc15, cc15) LDF [AO + 0 * SIZE], a1 FMADD4 (aa2, bb8, cc16, cc16) LDF [AO + 1 * SIZE], a2 LDF [BO + 6 * SIZE], b7 bg,pt %icc, .LL17 LDF [BO + 7 * SIZE], b8 nop .align 4 .LL18: #ifndef TRMMKERNEL LDF [C1 + 0 * SIZE], a1 FADD c01, c04, c01 LDF [C1 + 1 * SIZE], a2 FADD c02, c03, c02 LDF [C2 + 0 * SIZE], a3 FADD c05, c08, c05 LDF [C2 + 1 * SIZE], a4 FADD c06, c07, c06 LDF [C3 + 0 * SIZE], b1 FADD c09, c12, c09 LDF [C3 + 1 * SIZE], b2 FADD c10, c11, c10 LDF [C4 + 0 * SIZE], b3 FADD c13, c16, c13 LDF [C4 + 1 * SIZE], b4 FADD c14, c15, c14 FMADD (alpha_r, cc01, aa1, aa1) FMADD (alpha_r, cc02, aa2, aa2) FMADD (alpha_r, cc05, aa3, aa3) FMADD (alpha_r, cc06, aa4, aa4) FMADD (alpha_r, cc09, bb1, bb1) FMADD (alpha_r, cc10, bb2, bb2) FMADD (alpha_r, cc13, bb3, bb3) FMADD (alpha_r, cc14, bb4, bb4) #else FADD c01, c04, c01 FADD c02, c03, c02 FADD c05, c08, c05 FADD c06, c07, c06 FADD c09, c12, c09 FADD c10, c11, c10 FADD c13, c16, c13 FADD c14, c15, c14 FMUL ALPHA_R, c01, a1 FMUL ALPHA_R, c02, a2 FMUL ALPHA_R, c05, a3 FMUL ALPHA_R, c06, a4 FMUL ALPHA_R, c09, b1 FMUL ALPHA_R, c10, b2 FMUL ALPHA_R, c13, b3 FMUL ALPHA_R, c14, b4 #endif FNMSUB (alpha_i, cc02, aa1, aa1) FMADD (alpha_i, cc01, aa2, aa2) FNMSUB (alpha_i, cc06, aa3, aa3) FMADD (alpha_i, cc05, aa4, aa4) FNMSUB (alpha_i, cc10, bb1, bb1) STF a1, [C1 + 0 * SIZE] FMADD (alpha_i, cc09, bb2, bb2) STF a2, [C1 + 1 * SIZE] FNMSUB (alpha_i, cc14, bb3, bb3) STF a3, [C2 + 0 * SIZE] FMADD (alpha_i, cc13, bb4, bb4) STF a4, [C2 + 1 * SIZE] STF b1, [C3 + 0 * SIZE] add C1, 2 * SIZE, C1 STF b2, [C3 + 1 * SIZE] add C2, 2 * SIZE, C2 STF b3, [C4 + 0 * SIZE] add C3, 2 * SIZE, C3 STF b4, [C4 + 1 * SIZE] add C4, 2 * SIZE, C4 #ifdef TRMMKERNEL #if ( defined(LEFT) && 
defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -1, TEMP1 #else add TEMP1, -4, TEMP1 #endif sll TEMP1, ZBASE_SHIFT + 0, TEMP2 sll TEMP1, ZBASE_SHIFT + 2, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 1, KK #endif #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL12 nop #if defined(TRMMKERNEL) && !defined(LEFT) add KK, 4, KK #endif add J, -1, J cmp J, 0 bg,pt %icc, .LL11 mov BO, B .align 4 .LL20: and N, 2, J cmp J, 0 ble,pn %icc, .LL30 mov C, C1 add C, LDC, C2 add C2, LDC, C #if defined(TRMMKERNEL) && defined(LEFT) mov OFFSET, KK #endif mov M, I mov A, AO .align 4 .LL22: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) mov B, BO #else sll KK, ZBASE_SHIFT + 0, TEMP1 sll KK, ZBASE_SHIFT + 1, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 LDF [BO + 3 * SIZE], b4 LDF [BO + 4 * SIZE], b5 FCLR (cc01) LDF [BO + 5 * SIZE], b6 FCLR (cc02) LDF [BO + 6 * SIZE], b7 FCLR (cc03) LDF [BO + 7 * SIZE], b8 FCLR (cc04) LDF [BO + 8 * SIZE], b9 FCLR (cc05) prefetch [C1 + 2 * SIZE], 3 FCLR (cc06) prefetch [C2 + 2 * SIZE], 3 FCLR (cc07) #ifndef TRMMKERNEL sra K, 2, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 2, L #endif sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL25 FCLR (cc08) .align 4 .LL23: FMADD1 (aa1, bb1, cc01, cc01) LDF [AO + 2 * SIZE], a3 FMADD2 (aa2, bb1, cc02, cc02) LDF [AO + 3 * SIZE], a4 FMADD3 (aa1, bb2, cc03, cc03) LDF [BO + 16 * SIZE], b1 FMADD4 (aa2, bb2, cc04, cc04) LDF [BO + 9 * SIZE], b2 FMADD1 (aa1, bb3, cc05, cc05) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD2 (aa2, bb3, cc06, cc06) add L, -1, L FMADD3 (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD1 (aa3, bb5, cc01, cc01) LDF [AO + 4 * SIZE], a1 FMADD2 (aa4, bb5, cc02, cc02) LDF [AO + 5 * SIZE], a2 FMADD3 (aa3, bb6, cc03, cc03) LDF [BO + 12 * SIZE], b5 FMADD4 (aa4, bb6, cc04, cc04) LDF [BO + 13 * SIZE], b6 FMADD1 (aa3, bb7, cc05, cc05) cmp L, 0 FMADD2 (aa4, bb7, cc06, cc06) add AO, 8 * SIZE, AO FMADD3 (aa3, bb8, cc07, cc07) LDF [BO + 14 * SIZE], b7 FMADD4 (aa4, bb8, cc08, cc08) LDF [BO + 15 * SIZE], b8 FMADD1 (aa1, bb9, cc01, cc01) LDF [AO - 2 * SIZE], a3 FMADD2 (aa2, bb9, cc02, cc02) LDF [AO - 1 * SIZE], a4 FMADD3 (aa1, bb2, cc03, cc03) LDF [BO + 24 * SIZE], b9 FMADD4 (aa2, bb2, cc04, cc04) LDF [BO + 17 * SIZE], b2 FMADD1 (aa1, bb3, cc05, cc05) add BO, 16 * SIZE, BO FMADD2 (aa2, bb3, cc06, cc06) nop FMADD3 (aa1, bb4, cc07, cc07) LDF [BO + 2 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD1 (aa3, bb5, cc01, cc01) LDF [AO + 0 * SIZE], a1 FMADD2 (aa4, bb5, cc02, cc02) LDF [AO + 1 * SIZE], a2 FMADD3 (aa3, bb6, cc03, cc03) LDF [BO + 4 * SIZE], b5 FMADD4 (aa4, bb6, cc04, cc04) LDF [BO + 5 * SIZE], b6 FMADD1 (aa3, bb7, cc05, cc05) nop FMADD2 (aa4, bb7, cc06, cc06) LDF [BO + 6 * SIZE], b7 FMADD3 (aa3, bb8, cc07, cc07) FMADD4 (aa4, bb8, cc08, cc08) bg,pt %icc, .LL23 LDF [BO + 7 * SIZE], b8 .align 4 .LL25: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 2, L #endif and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL28 nop .align 4 .LL27: FMADD1 (aa1, bb1, 
cc01, cc01) add L, -1, L FMADD2 (aa2, bb1, cc02, cc02) LDF [BO + 4 * SIZE], b1 FMADD3 (aa1, bb2, cc03, cc03) add AO, 2 * SIZE, AO FMADD4 (aa2, bb2, cc04, cc04) LDF [BO + 5 * SIZE], b2 FMADD1 (aa1, bb3, cc05, cc05) cmp L, 0 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 6 * SIZE], b3 FMADD3 (aa1, bb4, cc07, cc07) LDF [AO + 0 * SIZE], a1 FMADD4 (aa2, bb4, cc08, cc08) LDF [AO + 1 * SIZE], a2 LDF [BO + 7 * SIZE], b4 bg,pt %icc, .LL27 add BO, 4 * SIZE, BO .align 4 .LL28: #ifndef TRMMKERNEL LDF [C1 + 0 * SIZE], a1 FADD c01, c04, c01 LDF [C1 + 1 * SIZE], a2 FADD c02, c03, c02 LDF [C2 + 0 * SIZE], a3 FADD c05, c08, c05 LDF [C2 + 1 * SIZE], a4 FADD c06, c07, c06 FMADD (alpha_r, cc01, aa1, aa1) FMADD (alpha_r, cc02, aa2, aa2) FMADD (alpha_r, cc05, aa3, aa3) FMADD (alpha_r, cc06, aa4, aa4) #else FADD c01, c04, c01 FADD c02, c03, c02 FADD c05, c08, c05 FADD c06, c07, c06 FMUL ALPHA_R, c01, a1 FMUL ALPHA_R, c02, a2 FMUL ALPHA_R, c05, a3 FMUL ALPHA_R, c06, a4 #endif FNMSUB (alpha_i, cc02, aa1, aa1) FMADD (alpha_i, cc01, aa2, aa2) FNMSUB (alpha_i, cc06, aa3, aa3) FMADD (alpha_i, cc05, aa4, aa4) STF a1, [C1 + 0 * SIZE] add I, -1, I STF a2, [C1 + 1 * SIZE] cmp I, 0 STF a3, [C2 + 0 * SIZE] add C1, 2 * SIZE, C1 STF a4, [C2 + 1 * SIZE] add C2, 2 * SIZE, C2 #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -1, TEMP1 #else add TEMP1, -2, TEMP1 #endif sll TEMP1, ZBASE_SHIFT + 0, TEMP2 sll TEMP1, ZBASE_SHIFT + 1, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 1, KK #endif #endif bg,pt %icc, .LL22 nop #if defined(TRMMKERNEL) && !defined(LEFT) add KK, 2, KK #endif mov BO, B .align 4 .LL30: and N, 1, J cmp J, 0 ble,pn %icc, .LL999 mov C, C1 #if defined(TRMMKERNEL) && defined(LEFT) mov OFFSET, KK #endif mov M, I mov A, AO .align 4 .LL32: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) mov B, BO #else sll KK, ZBASE_SHIFT + 0, TEMP1 sll KK, ZBASE_SHIFT + 0, TEMP2 add AO, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 FCLR (cc01) LDF [BO + 3 * SIZE], b4 FCLR (cc02) LDF [BO + 4 * SIZE], b5 FCLR (cc03) LDF [BO + 5 * SIZE], b6 FCLR (cc04) LDF [BO + 6 * SIZE], b7 FCLR (cc05) LDF [BO + 7 * SIZE], b8 FCLR (cc06) prefetch [C1 + 2 * SIZE], 3 FCLR (cc07) #ifndef TRMMKERNEL sra K, 2, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 1, L #endif sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL35 FCLR (cc08) .align 4 .LL33: FMADD1 (aa1, bb1, cc01, cc01) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD2 (aa2, bb1, cc02, cc02) LDF [BO + 8 * SIZE], b1 FMADD3 (aa1, bb2, cc03, cc03) LDF [AO + 4 * SIZE], a1 FMADD4 (aa2, bb2, cc04, cc04) LDF [AO + 5 * SIZE], a2 FMADD1 (aa3, bb3, cc01, cc01) LDF [BO + 9 * SIZE], b2 FMADD2 (aa4, bb3, cc02, cc02) LDF [BO + 10 * SIZE], b3 FMADD3 (aa3, bb4, cc03, cc03) LDF [AO + 6 * SIZE], a3 FMADD4 (aa4, bb4, cc04, cc04) LDF [AO + 7 * SIZE], a4 FMADD1 (aa1, bb5, cc01, cc01) LDF [BO + 11 * SIZE], b4 FMADD2 (aa2, bb5, cc02, cc02) LDF [BO + 12 * SIZE], b5 FMADD3 (aa1, bb6, cc03, cc03) LDF [AO + 8 * SIZE], a1 FMADD4 (aa2, bb6, cc04, cc04) LDF [AO + 9 * SIZE], a2 FMADD1 (aa3, bb7, cc01, cc01) LDF [BO + 13 * SIZE], b6 FMADD2 (aa4, bb7, cc02, cc02) LDF [BO + 14 * SIZE], 
b7 FMADD3 (aa3, bb8, cc03, cc03) LDF [AO + 10 * SIZE], a3 FMADD4 (aa4, bb8, cc04, cc04) LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO add L, -1, L add BO, 8 * SIZE, BO cmp L, 0 bg,pt %icc, .LL33 LDF [BO + 7 * SIZE], b8 .align 4 .LL35: #ifndef TRMMKERNEL and K, 3, L #else #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub K, KK, L #elif defined(LEFT) add KK, 1, L #else add KK, 1, L #endif and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL38 nop .align 4 .LL37: FMADD1 (aa1, bb1, cc01, cc01) add L, -1, L FMADD2 (aa2, bb1, cc02, cc02) LDF [BO + 2 * SIZE], b1 FMADD3 (aa1, bb2, cc03, cc03) LDF [AO + 2 * SIZE], a1 FMADD4 (aa2, bb2, cc04, cc04) LDF [AO + 3 * SIZE], a2 add AO, 2 * SIZE, AO cmp L, 0 add BO, 2 * SIZE, BO bg,pt %icc, .LL37 LDF [BO + 1 * SIZE], b2 .align 4 .LL38: #ifndef TRMMKERNEL LDF [C1 + 0 * SIZE], a1 FADD c01, c04, c01 LDF [C1 + 1 * SIZE], a2 FADD c02, c03, c02 FMADD (alpha_r, cc01, aa1, aa1) FMADD (alpha_r, cc02, aa2, aa2) #else FADD c01, c04, c01 FADD c02, c03, c02 FMUL ALPHA_R, c01, a1 FMUL ALPHA_R, c02, a2 #endif FNMSUB (alpha_i, cc02, aa1, aa1) FMADD (alpha_i, cc01, aa2, aa2) STF a1, [C1 + 0 * SIZE] STF a2, [C1 + 1 * SIZE] #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) sub K, KK, TEMP1 #ifdef LEFT add TEMP1, -1, TEMP1 #else add TEMP1, -1, TEMP1 #endif sll TEMP1, ZBASE_SHIFT + 0, TEMP2 sll TEMP1, ZBASE_SHIFT + 0, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LEFT add KK, 1, KK #endif #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL32 add C1, 2 * SIZE, C1 .align 4 .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/zgemm_ncopy.S000066400000000000000000000141001313527062700202030ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
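For reference, the complex GEMM/TRMM micro-kernel whose epilogue appears just above accumulates every complex product as four independent partial sums (the FMADD1 through FMADD4 macros feeding the cc01..cc16 registers), merges them once per output element after the K loop, and only then applies the complex alpha before storing back to C. Keeping the four sums separate lets the loop issue back-to-back multiply-adds without waiting on the cross additions. The C fragment below is only a minimal model of that arithmetic for a single output element in the plain, non-conjugated case; the function name and the flat (re,im) array layout are illustrative assumptions, and the real kernel folds the conjugation signs into whichever FMADD macro variants are selected at build time.

#include <stddef.h>

/* Illustrative model of the accumulation pattern (non-conjugated case):
 * four partial sums per output element, combined once after the K loop
 * and scaled by the complex alpha, i.e. C += alpha * sum_k A(k)*B(k). */
static void zgemm_1x1_ref(size_t k,
                          const double *a,   /* k complex values, (re,im) */
                          const double *b,   /* k complex values, (re,im) */
                          double alpha_r, double alpha_i,
                          double *c)         /* one complex value, (re,im) */
{
    double c1 = 0.0, c2 = 0.0, c3 = 0.0, c4 = 0.0;

    for (size_t i = 0; i < k; i++) {
        double ar = a[2 * i], ai = a[2 * i + 1];
        double br = b[2 * i], bi = b[2 * i + 1];
        c1 += ar * br;          /* role of FMADD1 */
        c2 += ai * br;          /* role of FMADD2 */
        c3 += ar * bi;          /* role of FMADD3 */
        c4 += ai * bi;          /* role of FMADD4 */
    }

    /* Combine once per element; the kernel reaches the same signs by
     * choosing the FMADD macro definitions, so plain FADDs suffice there. */
    double re = c1 - c4;
    double im = c2 + c3;

    c[0] += alpha_r * re - alpha_i * im;   /* alpha applied at the end */
    c[1] += alpha_r * im + alpha_i * re;
}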
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %i0 #define N %i1 #define A %i2 #define LDA %i3 #define B %i4 #define A1 %l0 #define A2 %l1 #define I %l4 #define J %l5 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #endif PROLOGUE SAVESP sra N, 1, J cmp J, 0 ble,pn %icc, .LL100 sll LDA, ZBASE_SHIFT, LDA .LL11: add A, LDA, A2 mov A, A1 sra M, 2, I cmp I, 0 ble,pn %icc, .LL15 add A2, LDA, A #define PREFETCHSIZE 36 #define WPREFETCHSIZE 20 .LL12: prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 LDF [A2 + 0 * SIZE], c03 LDF [A2 + 1 * SIZE], c04 LDF [A1 + 2 * SIZE], c05 LDF [A1 + 3 * SIZE], c06 LDF [A2 + 2 * SIZE], c07 LDF [A2 + 3 * SIZE], c08 prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 4 * SIZE], c09 LDF [A1 + 5 * SIZE], c10 LDF [A2 + 4 * SIZE], c11 LDF [A2 + 5 * SIZE], c12 LDF [A1 + 6 * SIZE], c13 LDF [A1 + 7 * SIZE], c14 LDF [A2 + 6 * SIZE], c15 LDF [A2 + 7 * SIZE], c16 prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 STF c01, [B + 0 * SIZE] add A1, 8 * SIZE, A1 STF c02, [B + 1 * SIZE] add A2, 8 * SIZE, A2 STF c03, [B + 2 * SIZE] add I, -1, I STF c04, [B + 3 * SIZE] cmp I, 0 STF c05, [B + 4 * SIZE] STF c06, [B + 5 * SIZE] STF c07, [B + 6 * SIZE] STF c08, [B + 7 * SIZE] #ifdef DOUBLE prefetch [B + (WPREFETCHSIZE + 8) * SIZE], 2 #endif STF c09, [B + 8 * SIZE] STF c10, [B + 9 * SIZE] STF c11, [B + 10 * SIZE] STF c12, [B + 11 * SIZE] STF c13, [B + 12 * SIZE] STF c14, [B + 13 * SIZE] STF c15, [B + 14 * SIZE] STF c16, [B + 15 * SIZE] bg,pt %icc, .LL12 add B, 16 * SIZE, B .LL15: and M, 3, I cmp I, 0 ble,pn %icc, .LL99 nop .LL16: LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 add A1, 2 * SIZE, A1 LDF [A2 + 0 * SIZE], c03 LDF [A2 + 1 * SIZE], c04 add A2, 2 * SIZE, A2 STF c01, [B + 0 * SIZE] add I, -1, I STF c02, [B + 1 * SIZE] cmp I, 0 STF c03, [B + 2 * SIZE] STF c04, [B + 3 * SIZE] bg,pt %icc, .LL16 add B, 4 * SIZE, B .LL99: add J, -1, J cmp J, 0 bg,pt %icc, .LL11 nop .LL100: and N, 1, J cmp J, 0 ble,pn %icc, .LL999 nop .LL111: sra M, 2, I cmp I, 0 ble,pn %icc, .LL115 mov A, A1 .LL112: LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 LDF [A1 + 2 * SIZE], c03 LDF [A1 + 3 * SIZE], c04 LDF [A1 + 4 * SIZE], c05 LDF [A1 + 5 * SIZE], c06 LDF [A1 + 6 * SIZE], c07 LDF [A1 + 7 * SIZE], c08 add A1, 8 * SIZE, A1 STF c01, [B + 0 * SIZE] add I, -1, I STF c02, [B + 1 * SIZE] cmp I, 0 STF c03, [B + 2 * SIZE] STF c04, [B + 3 * SIZE] STF c05, [B + 4 * SIZE] STF c06, [B + 5 * SIZE] STF c07, [B + 6 * SIZE] STF c08, [B + 7 * SIZE] bg,pt %icc, .LL112 add B, 8 * SIZE, B .LL115: and M, 3, I cmp I, 0 ble,pn %icc, .LL999 nop .LL116: LDF [A1 + 0 * SIZE], c01 add I, -1, I LDF [A1 + 1 * SIZE], c02 add A1, 2 * SIZE, A1 cmp I, 0 STF c01, [B + 0 * SIZE] 
STF c02, [B + 1 * SIZE] bg,pt %icc, .LL116 add B, 2 * SIZE, B .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/zgemm_tcopy.S000066400000000000000000000157271313527062700202310ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
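zgemm_ncopy.S, which ends here, packs two columns of the complex matrix A at a time into the contiguous work buffer B, interleaving the rows of the column pair and appending any leftover odd column; zgemm_tcopy.S, which follows, emits the transposed tiling, stepping through B with the M4 stride and spilling the odd column into a separate tail area (B2). A hypothetical C reference for the ncopy ordering is sketched below; the function name and argument conventions are illustrative, with lda counted in complex elements and both arrays stored as interleaved (re,im) doubles.

#include <stddef.h>

/* Hypothetical reference loop for the zgemm_ncopy packing order:
 * column-major complex A with leading dimension lda (complex elements),
 * packed two columns at a time with the rows of the pair interleaved,
 * then the leftover column, matching the store sequence above. */
static void zgemm_ncopy_ref(size_t m, size_t n,
                            const double *a, size_t lda, double *b)
{
    size_t j = 0;
    for (; j + 1 < n; j += 2) {
        const double *a0 = a + 2 * j * lda;        /* column j   */
        const double *a1 = a + 2 * (j + 1) * lda;  /* column j+1 */
        for (size_t i = 0; i < m; i++) {
            *b++ = a0[2 * i]; *b++ = a0[2 * i + 1]; /* A(i, j)   */
            *b++ = a1[2 * i]; *b++ = a1[2 * i + 1]; /* A(i, j+1) */
        }
    }
    if (n & 1) {                                    /* odd trailing column */
        const double *a0 = a + 2 * j * lda;
        for (size_t i = 0; i < m; i++) {
            *b++ = a0[2 * i]; *b++ = a0[2 * i + 1];
        }
    }
}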
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %i0 #define N %i1 #define A %i2 #define LDA %i3 #define B %i4 #define A1 %l0 #define A2 %l1 #define I %l4 #define J %l5 #define B1 %o0 #define B2 %o1 #define M4 %o4 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #endif PROLOGUE SAVESP sll M, BASE_SHIFT + 2, M4 and N, -2, B2 sll M, ZBASE_SHIFT, B1 smul B1, B2, B2 add B, B2, B2 sra M, 1, J cmp J, 0 ble,pn %icc, .LL100 sll LDA, ZBASE_SHIFT, LDA .LL11: add A, LDA, A2 mov A, A1 sra N, 2, I cmp I, 0 mov B, B1 add B, 8 * SIZE, B ble,pn %icc, .LL15 add A2, LDA, A #define PREFETCHSIZE 16 .LL12: prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 LDF [A1 + 2 * SIZE], c03 LDF [A1 + 3 * SIZE], c04 LDF [A1 + 4 * SIZE], c05 LDF [A1 + 5 * SIZE], c06 LDF [A1 + 6 * SIZE], c07 LDF [A1 + 7 * SIZE], c08 prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 LDF [A2 + 0 * SIZE], c09 LDF [A2 + 1 * SIZE], c10 LDF [A2 + 2 * SIZE], c11 LDF [A2 + 3 * SIZE], c12 LDF [A2 + 4 * SIZE], c13 LDF [A2 + 5 * SIZE], c14 LDF [A2 + 6 * SIZE], c15 LDF [A2 + 7 * SIZE], c16 prefetch [B1 + (PREFETCHSIZE + 0) * SIZE], 2 STF c01, [B1 + 0 * SIZE] add A1, 8 * SIZE, A1 STF c02, [B1 + 1 * SIZE] add A2, 8 * SIZE, A2 STF c03, [B1 + 2 * SIZE] STF c04, [B1 + 3 * SIZE] STF c09, [B1 + 4 * SIZE] add I, -1, I STF c10, [B1 + 5 * SIZE] cmp I, 0 STF c11, [B1 + 6 * SIZE] STF c12, [B1 + 7 * SIZE] add B1, M4, B1 #ifdef DOUBLE prefetch [B1 + (PREFETCHSIZE + 8) * SIZE], 2 #endif STF c05, [B1 + 0 * SIZE] STF c06, [B1 + 1 * SIZE] STF c07, [B1 + 2 * SIZE] STF c08, [B1 + 3 * SIZE] STF c13, [B1 + 4 * SIZE] STF c14, [B1 + 5 * SIZE] STF c15, [B1 + 6 * SIZE] STF c16, [B1 + 7 * SIZE] bg,pt %icc, .LL12 add B1, M4, B1 .LL15: and N, 2, I cmp I, 0 ble,pn %icc, .LL17 nop LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 LDF [A1 + 2 * SIZE], c03 LDF [A1 + 3 * SIZE], c04 LDF [A2 + 0 * SIZE], c05 LDF [A2 + 1 * SIZE], c06 LDF [A2 + 2 * SIZE], c07 LDF [A2 + 3 * SIZE], c08 STF c01, [B1 + 0 * SIZE] add A1, 4 * SIZE, A1 STF c02, [B1 + 1 * SIZE] add A2, 4 * SIZE, A2 STF c03, [B1 + 2 * SIZE] STF c04, [B1 + 3 * SIZE] STF c05, [B1 + 4 * SIZE] STF c06, [B1 + 5 * SIZE] STF c07, [B1 + 6 * SIZE] STF c08, [B1 + 7 * SIZE] add B1, M4, B1 .LL17: and N, 1, I cmp I, 0 ble,pn %icc, .LL99 nop LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 LDF [A2 + 0 * SIZE], c03 LDF [A2 + 1 * SIZE], c04 STF c01, [B2 + 0 * SIZE] STF c02, [B2 + 1 * SIZE] STF c03, [B2 + 2 * SIZE] STF c04, [B2 + 3 * SIZE] add B2, 4 * SIZE, B2 .LL99: add J, -1, J cmp J, 0 bg,pt %icc, .LL11 nop .LL100: and M, 1, J cmp J, 0 ble,pn %icc, .LL999 nop .LL111: sra N, 2, I cmp I, 0 mov A, A1 ble,pn %icc, .LL115 mov B, B1 .LL112: LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 LDF [A1 + 2 * SIZE], c03 LDF [A1 + 3 * SIZE], c04 LDF [A1 + 4 * SIZE], c05 LDF [A1 + 5 * SIZE], c06 LDF [A1 + 6 * SIZE], c07 LDF [A1 + 7 * SIZE], c08 STF c01, [B1 + 0 * SIZE] add A1, 8 * SIZE, A1 STF c02, [B1 + 1 * SIZE] add I, -1, I STF c03, [B1 + 2 * SIZE] cmp 
I, 0 STF c04, [B1 + 3 * SIZE] add B1, M4, B1 STF c05, [B1 + 0 * SIZE] STF c06, [B1 + 1 * SIZE] STF c07, [B1 + 2 * SIZE] STF c08, [B1 + 3 * SIZE] bg,pt %icc, .LL112 add B1, M4, B1 .LL115: and N, 2, I cmp I, 0 ble,pn %icc, .LL117 nop LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 LDF [A1 + 2 * SIZE], c03 LDF [A1 + 3 * SIZE], c04 STF c01, [B1 + 0 * SIZE] add A1, 4 * SIZE, A1 STF c02, [B1 + 1 * SIZE] add I, -1, I STF c03, [B1 + 2 * SIZE] cmp I, 0 STF c04, [B1 + 3 * SIZE] add B1, M4, B1 .LL117: and N, 1, I cmp I, 0 ble,pn %icc, .LL999 nop LDF [A1 + 0 * SIZE], c01 LDF [A1 + 1 * SIZE], c02 STF c01, [B2 + 0 * SIZE] STF c02, [B2 + 1 * SIZE] .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/zgemv_n.S000066400000000000000000000511231313527062700173270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
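zgemv_n.S, whose code starts below, computes y := y + alpha * A * x for an interleaved complex matrix, walking two columns per pass and, when the output stride is not the contiguous complex stride, accumulating into the caller-provided BUFFER before adding the result back into y. A simplified reference form of the computation, ignoring the unrolling, the buffering, and the XCONJ/CONJ sign variants, might look like the sketch below; unit strides are assumed and the names are illustrative.

#include <stddef.h>

/* Reference form of y += alpha * A * x for interleaved complex data
 * (no blocking, no conjugation); lda is counted in complex elements. */
static void zgemv_n_ref(size_t m, size_t n,
                        double alpha_r, double alpha_i,
                        const double *a, size_t lda,
                        const double *x, double *y)
{
    for (size_t j = 0; j < n; j++) {
        /* t = alpha * x[j]; the kernel precomputes this once per column */
        double xr = x[2 * j], xi = x[2 * j + 1];
        double tr = alpha_r * xr - alpha_i * xi;
        double ti = alpha_r * xi + alpha_i * xr;
        const double *col = a + 2 * j * lda;
        for (size_t i = 0; i < m; i++) {
            double ar = col[2 * i], ai = col[2 * i + 1];
            y[2 * i]     += ar * tr - ai * ti;
            y[2 * i + 1] += ar * ti + ai * tr;
        }
    }
}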
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef DOUBLE #define PREFETCHSIZE 44 #else #define PREFETCHSIZE 88 #endif #define M %i0 #define N %i1 #define A %i5 #define LDA %i2 #define X %i3 #define INCX %i4 #define Y %l0 #define INCY %l1 #define BUFFER %l2 #define I %l3 #define J %l5 #define A1 %o0 #define A2 %o1 #define A3 %o2 #define A4 %o3 #define Y1 %l4 #define YY %l6 #ifdef DOUBLE #define t1 %f0 #define t2 %f2 #define t3 %f4 #define t4 %f6 #define y1 %f8 #define y2 %f10 #define y3 %f12 #define y4 %f14 #define y5 %f16 #define y6 %f18 #define y7 %f20 #define y8 %f22 #define a1 %f24 #define a2 %f26 #define a3 %f28 #define a4 %f30 #define a5 %f32 #define a6 %f34 #define a7 %f36 #define a8 %f38 #define a9 %f40 #define a10 %f42 #define a11 %f44 #define a12 %f46 #define a13 %f48 #define a14 %f50 #define a15 %f52 #define a16 %f54 #define x1 %f56 #define x2 %f58 #define x3 %f60 #define x4 %f62 #define FZERO %f50 #define ALPHA_R %f52 #define ALPHA_I %f54 #else #define t1 %f0 #define t2 %f1 #define t3 %f2 #define t4 %f3 #define y1 %f4 #define y2 %f5 #define y3 %f6 #define y4 %f7 #define y5 %f8 #define y6 %f9 #define y7 %f10 #define y8 %f11 #define a1 %f12 #define a2 %f13 #define a3 %f14 #define a4 %f15 #define a5 %f16 #define a6 %f17 #define a7 %f18 #define a8 %f19 #define a9 %f20 #define a10 %f21 #define a11 %f22 #define a12 %f23 #define a13 %f24 #define a14 %f25 #define a15 %f26 #define a16 %f27 #define x1 %f28 #define x2 %f29 #define x3 %f30 #define x4 %f31 #define FZERO %f25 #define ALPHA_R %f26 #define ALPHA_I %f27 #endif #ifndef __64BIT__ #define STACK_ALPHA_R [%sp + STACK_START + 16] #ifndef DOUBLE #define STACK_ALPHA_I [%sp + STACK_START + 20] #else #define STACK_ALPHA_I [%sp + STACK_START + 24] #endif #else #define STACK_ALPHA_R [%sp + STACK_START + 32] #define STACK_ALPHA_I [%sp + STACK_START + 40] #endif #ifndef CONJ #define FSUBX FSUB #define FADDX FADD #else #define FSUBX FADD #define FADDX FSUB #endif PROLOGUE SAVESP #ifndef __64BIT__ #ifdef DOUBLE st %i3, [%sp + STACK_START + 16] /* ALPHA_R */ st %i4, [%sp + STACK_START + 20] st %i5, [%sp + STACK_START + 24] /* ALPHA_I */ ld [%sp + STACK_START + 32], A ld [%sp + STACK_START + 36], LDA ld [%sp + STACK_START + 40], X ld [%sp + STACK_START + 44], INCX ld [%sp + STACK_START + 48], Y ld [%sp + STACK_START + 52], INCY ld [%sp + STACK_START + 56], BUFFER #else st %i3, [%sp + STACK_START + 16] /* ALPHA_R */ st %i4, [%sp + STACK_START + 20] /* ALPHA_I */ ld [%sp + STACK_START + 28], LDA ld [%sp + STACK_START + 32], X ld [%sp + STACK_START + 36], INCX ld [%sp + STACK_START + 40], Y ld [%sp + STACK_START + 44], INCY ld [%sp + STACK_START + 48], BUFFER #endif #else ldx [%sp + STACK_START + 56], LDA ldx [%sp + STACK_START + 64], X ldx [%sp + STACK_START + 72], INCX ldx [%sp + STACK_START + 80], Y ldx [%sp + STACK_START + 88], INCY ldx [%sp + STACK_START + 96], BUFFER #ifdef DOUBLE std %f6, STACK_ALPHA_R std %f8, STACK_ALPHA_I #else st %f7, STACK_ALPHA_R st %f9, STACK_ALPHA_I #endif #endif sll LDA, ZBASE_SHIFT, LDA cmp M, 0 ble %icc, .LL999 sll INCX, ZBASE_SHIFT, INCX cmp N, 0 ble %icc, .LL999 sll INCY, ZBASE_SHIFT, INCY cmp INCY, 2 * SIZE be %icc, .LL20 mov Y, YY #ifdef DOUBLE FCLR(19) #else FCLR(25) #endif add M, 3, J sra J, 2, J mov BUFFER, YY mov BUFFER, Y1 .LL01: STF FZERO, [Y1 + 0 * SIZE] nop STF FZERO, [Y1 + 1 * SIZE] STF FZERO, [Y1 + 2 * SIZE] STF FZERO, [Y1 + 3 * SIZE] STF FZERO, [Y1 + 4 * SIZE] nop STF FZERO, [Y1 + 5 * SIZE] deccc J STF FZERO, [Y1 + 6 * SIZE] nop 
STF FZERO, [Y1 + 7 * SIZE] bg,pn %icc, .LL01 add Y1, 8 * SIZE, Y1 .LL20: sra N, 1, J cmp J, 0 ble,pn %icc, .LL30 nop .LL21: mov YY, Y1 mov A, A1 LDF STACK_ALPHA_R, ALPHA_R LDF STACK_ALPHA_I, ALPHA_I add A, LDA, A2 add A2, LDA, A LDF [X + 0 * SIZE], x1 LDF [X + 1 * SIZE], x2 add X, INCX, X LDF [X + 0 * SIZE], x3 LDF [X + 1 * SIZE], x4 add X, INCX, X FMUL ALPHA_R, x1, a1 FMUL ALPHA_I, x2, a4 FMUL ALPHA_I, x1, a2 FMUL ALPHA_R, x2, a3 FMUL ALPHA_R, x3, a5 FMUL ALPHA_I, x4, a8 FMUL ALPHA_I, x3, a6 FMUL ALPHA_R, x4, a7 #ifndef XCONJ FSUB a1, a4, x1 FADD a2, a3, x2 FSUB a5, a8, x3 FADD a6, a7, x4 #else FADD a1, a4, x1 FSUB a2, a3, x2 FADD a5, a8, x3 FSUB a6, a7, x4 #endif sra M, 2, I cmp I, 0 ble,pn %icc, .LL27 nop LDF [A1 + 0 * SIZE], a1 LDF [A1 + 1 * SIZE], a2 LDF [A1 + 2 * SIZE], a3 LDF [A1 + 3 * SIZE], a4 LDF [A1 + 4 * SIZE], a9 LDF [A1 + 5 * SIZE], a10 LDF [A1 + 6 * SIZE], a11 LDF [A1 + 7 * SIZE], a12 LDF [A2 + 0 * SIZE], a5 LDF [A2 + 1 * SIZE], a6 LDF [A2 + 2 * SIZE], a7 LDF [A2 + 3 * SIZE], a8 LDF [A2 + 4 * SIZE], a13 LDF [A2 + 5 * SIZE], a14 LDF [A2 + 6 * SIZE], a15 LDF [A2 + 7 * SIZE], a16 LDF [Y1 + 0 * SIZE], y1 LDF [Y1 + 1 * SIZE], y2 LDF [Y1 + 2 * SIZE], y3 FMUL a1, x1, t1 deccc I FMUL a1, x2, t2 LDF [A1 + 8 * SIZE], a1 FMUL a3, x1, t3 FMUL a3, x2, t4 ble,pn %icc, .LL26 LDF [A1 + 10 * SIZE], a3 FADD y1, t1, y1 LDF [Y1 + 3 * SIZE], y4 FMUL a2, x2, t1 FADD y2, t2, y2 FMUL a2, x1, t2 LDF [A1 + 9 * SIZE], a2 FADD y3, t3, y3 LDF [Y1 + 4 * SIZE], y5 FMUL a4, x2, t3 FADD y4, t4, y4 FMUL a4, x1, t4 LDF [A1 + 11 * SIZE], a4 FSUBX y1, t1, y1 LDF [Y1 + 5 * SIZE], y6 FMUL a5, x3, t1 FADDX y2, t2, y2 FMUL a5, x4, t2 LDF [A2 + 8 * SIZE], a5 FSUBX y3, t3, y3 LDF [Y1 + 6 * SIZE], y7 FMUL a7, x3, t3 FADDX y4, t4, y4 FMUL a7, x4, t4 LDF [A2 + 10 * SIZE], a7 FADD y1, t1, y1 LDF [Y1 + 7 * SIZE], y8 FMUL a6, x4, t1 FADD y2, t2, y2 FMUL a6, x3, t2 LDF [A2 + 9 * SIZE], a6 FADD y3, t3, y3 FMUL a8, x4, t3 FADD y4, t4, y4 FMUL a8, x3, t4 LDF [A2 + 11 * SIZE], a8 FSUBX y1, t1, y1 FMUL a9, x1, t1 FADDX y2, t2, y2 FMUL a9, x2, t2 LDF [A1 + 12 * SIZE], a9 FSUBX y3, t3, y3 deccc I FMUL a11, x1, t3 FADDX y4, t4, y4 FMUL a11, x2, t4 ble,pn %icc, .LL23 LDF [A1 + 14 * SIZE], a11 .LL22: FADD y5, t1, y5 prefetch [A1 + PREFETCHSIZE * SIZE], 1 FMUL a10, x2, t1 LDF [Y1 + 7 * SIZE], y8 FADD y6, t2, y6 FMUL a10, x1, t2 LDF [A1 + 13 * SIZE], a10 FADD y7, t3, y7 FMUL a12, x2, t3 STF y1, [Y1 + 0 * SIZE] FADD y8, t4, y8 FMUL a12, x1, t4 LDF [A1 + 15 * SIZE], a12 FSUBX y5, t1, y5 FMUL a13, x3, t1 STF y2, [Y1 + 1 * SIZE] FADDX y6, t2, y6 FMUL a13, x4, t2 LDF [A2 + 12 * SIZE], a13 FSUBX y7, t3, y7 FMUL a15, x3, t3 STF y3, [Y1 + 2 * SIZE] FADDX y8, t4, y8 FMUL a15, x4, t4 LDF [A2 + 14 * SIZE], a15 FADD y5, t1, y5 FMUL a14, x4, t1 STF y4, [Y1 + 3 * SIZE] FADD y6, t2, y6 FMUL a14, x3, t2 LDF [A2 + 13 * SIZE], a14 FADD y7, t3, y7 FMUL a16, x4, t3 LDF [Y1 + 8 * SIZE], y1 FADD y8, t4, y8 FMUL a16, x3, t4 LDF [A2 + 15 * SIZE], a16 FSUBX y5, t1, y5 FMUL a1, x1, t1 LDF [Y1 + 9 * SIZE], y2 FADDX y6, t2, y6 FMUL a1, x2, t2 LDF [A1 + 16 * SIZE], a1 FSUBX y7, t3, y7 FMUL a3, x1, t3 LDF [Y1 + 10 * SIZE], y3 FADDX y8, t4, y8 FMUL a3, x2, t4 LDF [A1 + 18 * SIZE], a3 FADD y1, t1, y1 prefetch [A2 + PREFETCHSIZE * SIZE], 1 FMUL a2, x2, t1 LDF [Y1 + 11 * SIZE], y4 FADD y2, t2, y2 FMUL a2, x1, t2 LDF [A1 + 17 * SIZE], a2 FADD y3, t3, y3 FMUL a4, x2, t3 STF y5, [Y1 + 4 * SIZE] FADD y4, t4, y4 FMUL a4, x1, t4 LDF [A1 + 19 * SIZE], a4 FSUBX y1, t1, y1 FMUL a5, x3, t1 STF y6, [Y1 + 5 * SIZE] FADDX y2, t2, y2 FMUL a5, x4, t2 LDF [A2 + 16 * SIZE], a5 FSUBX 
y3, t3, y3 FMUL a7, x3, t3 STF y7, [Y1 + 6 * SIZE] FADDX y4, t4, y4 deccc I FMUL a7, x4, t4 LDF [A2 + 18 * SIZE], a7 FADD y1, t1, y1 FMUL a6, x4, t1 STF y8, [Y1 + 7 * SIZE] FADD y2, t2, y2 FMUL a6, x3, t2 LDF [A2 + 17 * SIZE], a6 FADD y3, t3, y3 add A1, 8 * SIZE, A1 FMUL a8, x4, t3 LDF [Y1 + 12 * SIZE], y5 FADD y4, t4, y4 FMUL a8, x3, t4 LDF [A2 + 19 * SIZE], a8 FSUBX y1, t1, y1 add A2, 8 * SIZE, A2 FMUL a9, x1, t1 LDF [Y1 + 13 * SIZE], y6 FADDX y2, t2, y2 add Y1, 8 * SIZE, Y1 FMUL a9, x2, t2 LDF [A1 + 12 * SIZE], a9 FSUBX y3, t3, y3 FMUL a11, x1, t3 LDF [Y1 + 6 * SIZE], y7 FADDX y4, t4, y4 FMUL a11, x2, t4 bg,pn %icc, .LL22 LDF [A1 + 14 * SIZE], a11 .LL23: FADD y5, t1, y5 FMUL a10, x2, t1 LDF [Y1 + 7 * SIZE], y8 FADD y6, t2, y6 FMUL a10, x1, t2 LDF [A1 + 13 * SIZE], a10 FADD y7, t3, y7 FMUL a12, x2, t3 STF y1, [Y1 + 0 * SIZE] FADD y8, t4, y8 FMUL a12, x1, t4 LDF [A1 + 15 * SIZE], a12 FSUBX y5, t1, y5 FMUL a13, x3, t1 STF y2, [Y1 + 1 * SIZE] FADDX y6, t2, y6 FMUL a13, x4, t2 LDF [A2 + 12 * SIZE], a13 FSUBX y7, t3, y7 FMUL a15, x3, t3 STF y3, [Y1 + 2 * SIZE] FADDX y8, t4, y8 FMUL a15, x4, t4 LDF [A2 + 14 * SIZE], a15 FADD y5, t1, y5 FMUL a14, x4, t1 STF y4, [Y1 + 3 * SIZE] FADD y6, t2, y6 FMUL a14, x3, t2 LDF [A2 + 13 * SIZE], a14 FADD y7, t3, y7 FMUL a16, x4, t3 LDF [Y1 + 8 * SIZE], y1 FADD y8, t4, y8 FMUL a16, x3, t4 LDF [A2 + 15 * SIZE], a16 FSUBX y5, t1, y5 add A1, 8 * SIZE, A1 FMUL a1, x1, t1 LDF [Y1 + 9 * SIZE], y2 FADDX y6, t2, y6 add A2, 8 * SIZE, A2 FMUL a1, x2, t2 LDF [A1 + 8 * SIZE], a1 FSUBX y7, t3, y7 FMUL a3, x1, t3 LDF [Y1 + 10 * SIZE], y3 FADDX y8, t4, y8 add Y1, 8 * SIZE, Y1 FMUL a3, x2, t4 LDF [A1 + 10 * SIZE], a3 STF y5, [Y1 - 4 * SIZE] STF y6, [Y1 - 3 * SIZE] STF y7, [Y1 - 2 * SIZE] STF y8, [Y1 - 1 * SIZE] .LL26: FADD y1, t1, y1 LDF [Y1 + 3 * SIZE], y4 FMUL a2, x2, t1 FADD y2, t2, y2 FMUL a2, x1, t2 FADD y3, t3, y3 LDF [Y1 + 4 * SIZE], y5 FMUL a4, x2, t3 FADD y4, t4, y4 FMUL a4, x1, t4 FSUBX y1, t1, y1 LDF [Y1 + 5 * SIZE], y6 FMUL a5, x3, t1 FADDX y2, t2, y2 FMUL a5, x4, t2 FSUBX y3, t3, y3 LDF [Y1 + 6 * SIZE], y7 FADDX y4, t4, y4 FMUL a7, x4, t4 FADD y1, t1, y1 LDF [Y1 + 7 * SIZE], y8 FMUL a7, x3, t3 FMUL a6, x4, t1 FADD y2, t2, y2 FMUL a6, x3, t2 FADD y3, t3, y3 FMUL a8, x4, t3 FADD y4, t4, y4 FMUL a8, x3, t4 FSUBX y1, t1, y1 FMUL a9, x1, t1 FADDX y2, t2, y2 FMUL a9, x2, t2 FSUBX y3, t3, y3 FMUL a11, x1, t3 FADDX y4, t4, y4 FMUL a11, x2, t4 FADD y5, t1, y5 FMUL a10, x2, t1 FADD y6, t2, y6 FMUL a10, x1, t2 FADD y7, t3, y7 FMUL a12, x2, t3 FADD y8, t4, y8 FMUL a12, x1, t4 FSUBX y5, t1, y5 FMUL a13, x3, t1 FADDX y6, t2, y6 FMUL a13, x4, t2 FSUBX y7, t3, y7 FMUL a15, x3, t3 FADDX y8, t4, y8 FMUL a15, x4, t4 FADD y5, t1, y5 FMUL a14, x4, t1 FADD y6, t2, y6 FMUL a14, x3, t2 FADD y7, t3, y7 FMUL a16, x4, t3 FADD y8, t4, y8 FMUL a16, x3, t4 STF y1, [Y1 + 0 * SIZE] FSUBX y5, t1, y5 STF y2, [Y1 + 1 * SIZE] FADDX y6, t2, y6 STF y3, [Y1 + 2 * SIZE] FSUBX y7, t3, y7 STF y4, [Y1 + 3 * SIZE] FADDX y8, t4, y8 STF y5, [Y1 + 4 * SIZE] add A1, 8 * SIZE, A1 STF y6, [Y1 + 5 * SIZE] add A2, 8 * SIZE, A2 STF y7, [Y1 + 6 * SIZE] STF y8, [Y1 + 7 * SIZE] add Y1, 8 * SIZE, Y1 .LL27: andcc M, 2, I ble,pn %icc, .LL28 nop LDF [A1 + 0 * SIZE], a1 LDF [A1 + 1 * SIZE], a2 LDF [A1 + 2 * SIZE], a3 LDF [A1 + 3 * SIZE], a4 LDF [Y1 + 0 * SIZE], y1 LDF [Y1 + 1 * SIZE], y2 LDF [Y1 + 2 * SIZE], y3 LDF [Y1 + 3 * SIZE], y4 FMUL a1, x1, t1 LDF [A2 + 0 * SIZE], a5 FMUL a1, x2, t2 LDF [A2 + 1 * SIZE], a6 FMUL a3, x1, t3 LDF [A2 + 2 * SIZE], a7 FMUL a3, x2, t4 LDF [A2 + 3 * SIZE], a8 FADD y1, t1, y1 FMUL a2, x2, t1 
FADD y2, t2, y2 FMUL a2, x1, t2 FADD y3, t3, y3 FMUL a4, x2, t3 FADD y4, t4, y4 FMUL a4, x1, t4 FSUBX y1, t1, y1 FMUL a5, x3, t1 FADDX y2, t2, y2 FMUL a5, x4, t2 FSUBX y3, t3, y3 FMUL a7, x3, t3 FADDX y4, t4, y4 FMUL a7, x4, t4 FADD y1, t1, y1 FMUL a6, x4, t1 FADD y2, t2, y2 FMUL a6, x3, t2 FADD y3, t3, y3 FMUL a8, x4, t3 FADD y4, t4, y4 FMUL a8, x3, t4 FSUBX y1, t1, y1 FADDX y2, t2, y2 FSUBX y3, t3, y3 FADDX y4, t4, y4 STF y1, [Y1 + 0 * SIZE] add A1, 4 * SIZE, A1 STF y2, [Y1 + 1 * SIZE] add A2, 4 * SIZE, A2 STF y3, [Y1 + 2 * SIZE] nop STF y4, [Y1 + 3 * SIZE] add Y1, 4 * SIZE, Y1 .LL28: andcc M, 1, I ble,pn %icc, .LL29 nop LDF [A1 + 0 * SIZE], a1 LDF [A1 + 1 * SIZE], a2 LDF [A2 + 0 * SIZE], a3 LDF [A2 + 1 * SIZE], a4 LDF [Y1 + 0 * SIZE], y1 LDF [Y1 + 1 * SIZE], y2 FMUL a1, x1, t1 FMUL a1, x2, t2 FMUL a2, x2, t3 FMUL a2, x1, t4 FADD y1, t1, y1 FMUL a3, x3, t1 FADD y2, t2, y2 FMUL a3, x4, t2 FSUBX y1, t3, y1 FMUL a4, x4, t3 FADDX y2, t4, y2 FMUL a4, x3, t4 FADD y1, t1, y1 FADD y2, t2, y2 FSUBX y1, t3, y1 FADDX y2, t4, y2 STF y1, [Y1 + 0 * SIZE] STF y2, [Y1 + 1 * SIZE] .LL29: deccc J bg %icc, .LL21 nop .LL30: andcc N, 1, J ble,pn %icc, .LL990 nop .LL31: mov YY, Y1 mov A, A1 LDF STACK_ALPHA_R, ALPHA_R LDF STACK_ALPHA_I, ALPHA_I LDF [X + 0 * SIZE], x1 LDF [X + 1 * SIZE], x2 FMUL ALPHA_R, x1, a1 /* AC */ FMUL ALPHA_I, x1, a2 /* AD */ FMUL ALPHA_R, x2, a3 /* BC */ FMUL ALPHA_I, x2, a4 /* BD */ #ifndef XCONJ FSUB a1, a4, x1 FADD a2, a3, x2 #else FADD a1, a4, x1 FSUB a2, a3, x2 #endif sra M, 2, I cmp I, 0 ble,pn %icc, .LL37 nop LDF [A1 + 0 * SIZE], a1 LDF [A1 + 1 * SIZE], a2 LDF [A1 + 2 * SIZE], a3 LDF [A1 + 3 * SIZE], a4 LDF [A1 + 4 * SIZE], a9 LDF [A1 + 5 * SIZE], a10 LDF [A1 + 6 * SIZE], a11 LDF [A1 + 7 * SIZE], a12 LDF [Y1 + 0 * SIZE], y1 LDF [Y1 + 1 * SIZE], y2 LDF [Y1 + 2 * SIZE], y3 LDF [Y1 + 3 * SIZE], y4 LDF [Y1 + 4 * SIZE], y5 LDF [Y1 + 5 * SIZE], y6 LDF [Y1 + 6 * SIZE], y7 LDF [Y1 + 7 * SIZE], y8 FMUL a1, x1, t1 deccc I FMUL a1, x2, t2 LDF [A1 + 8 * SIZE], a1 FMUL a3, x1, t3 FMUL a3, x2, t4 ble,pn %icc, .LL33 LDF [A1 + 10 * SIZE], a3 .LL32: FADD y1, t1, y1 prefetch [A1 + PREFETCHSIZE * SIZE], 1 FMUL a2, x2, t1 FADD y2, t2, y2 FMUL a2, x1, t2 LDF [A1 + 9 * SIZE], a2 FADD y3, t3, y3 FMUL a4, x2, t3 FADD y4, t4, y4 FMUL a4, x1, t4 LDF [A1 + 11 * SIZE], a4 FSUBX y1, t1, y1 FMUL a9, x1, t1 FADDX y2, t2, y2 FMUL a9, x2, t2 LDF [A1 + 12 * SIZE], a9 FSUBX y3, t3, y3 FMUL a11, x1, t3 FADDX y4, t4, y4 FMUL a11, x2, t4 LDF [A1 + 14 * SIZE], a11 STF y1, [Y1 + 0 * SIZE] STF y2, [Y1 + 1 * SIZE] STF y3, [Y1 + 2 * SIZE] STF y4, [Y1 + 3 * SIZE] FADD y5, t1, y5 FMUL a10, x2, t1 LDF [Y1 + 8 * SIZE], y1 FADD y6, t2, y6 FMUL a10, x1, t2 LDF [A1 + 13 * SIZE], a10 FADD y7, t3, y7 deccc I FMUL a12, x2, t3 LDF [Y1 + 9 * SIZE], y2 FADD y8, t4, y8 FMUL a12, x1, t4 LDF [A1 + 15 * SIZE], a12 FSUBX y5, t1, y5 add A1, 8 * SIZE, A1 FMUL a1, x1, t1 LDF [Y1 + 10 * SIZE], y3 FADDX y6, t2, y6 FMUL a1, x2, t2 LDF [A1 + 8 * SIZE], a1 FSUBX y7, t3, y7 FMUL a3, x1, t3 LDF [Y1 + 11 * SIZE], y4 FADDX y8, t4, y8 FMUL a3, x2, t4 LDF [A1 + 10 * SIZE], a3 STF y5, [Y1 + 4 * SIZE] STF y6, [Y1 + 5 * SIZE] STF y7, [Y1 + 6 * SIZE] STF y8, [Y1 + 7 * SIZE] LDF [Y1 + 12 * SIZE], y5 LDF [Y1 + 13 * SIZE], y6 LDF [Y1 + 14 * SIZE], y7 add Y1, 8 * SIZE, Y1 bg,pn %icc, .LL32 LDF [Y1 + 7 * SIZE], y8 .LL33: FADD y1, t1, y1 FMUL a2, x2, t1 FADD y2, t2, y2 FMUL a2, x1, t2 FADD y3, t3, y3 FMUL a4, x2, t3 FADD y4, t4, y4 FMUL a4, x1, t4 FSUBX y1, t1, y1 FMUL a9, x1, t1 FADDX y2, t2, y2 FMUL a9, x2, t2 FSUBX y3, t3, y3 FMUL a11, x1, t3 FADDX y4, t4, y4 
FMUL a11, x2, t4 FADD y5, t1, y5 FMUL a10, x2, t1 FADD y6, t2, y6 FMUL a10, x1, t2 FADD y7, t3, y7 FMUL a12, x2, t3 FADD y8, t4, y8 FMUL a12, x1, t4 FSUBX y5, t1, y5 FADDX y6, t2, y6 FSUBX y7, t3, y7 FADDX y8, t4, y8 STF y1, [Y1 + 0 * SIZE] STF y2, [Y1 + 1 * SIZE] STF y3, [Y1 + 2 * SIZE] STF y4, [Y1 + 3 * SIZE] STF y5, [Y1 + 4 * SIZE] STF y6, [Y1 + 5 * SIZE] STF y7, [Y1 + 6 * SIZE] STF y8, [Y1 + 7 * SIZE] add A1, 8 * SIZE, A1 add Y1, 8 * SIZE, Y1 .LL37: andcc M, 2, I ble,pn %icc, .LL38 nop LDF [A1 + 0 * SIZE], a1 LDF [A1 + 1 * SIZE], a2 LDF [A1 + 2 * SIZE], a3 LDF [A1 + 3 * SIZE], a4 LDF [Y1 + 0 * SIZE], y1 FMUL a1, x1, t1 LDF [Y1 + 1 * SIZE], y2 FMUL a1, x2, t2 LDF [Y1 + 2 * SIZE], y3 FMUL a3, x1, t3 LDF [Y1 + 3 * SIZE], y4 FMUL a3, x2, t4 FADD y1, t1, y1 FMUL a2, x2, t1 FADD y2, t2, y2 FMUL a2, x1, t2 FADD y3, t3, y3 FMUL a4, x2, t3 FADD y4, t4, y4 FMUL a4, x1, t4 FSUBX y1, t1, y1 FADDX y2, t2, y2 FSUBX y3, t3, y3 FADDX y4, t4, y4 STF y1, [Y1 + 0 * SIZE] STF y2, [Y1 + 1 * SIZE] STF y3, [Y1 + 2 * SIZE] STF y4, [Y1 + 3 * SIZE] add A1, 4 * SIZE, A1 add Y1, 4 * SIZE, Y1 .LL38: andcc M, 1, I ble,pn %icc, .LL990 nop LDF [A1 + 0 * SIZE], a1 LDF [A1 + 1 * SIZE], a2 LDF [Y1 + 0 * SIZE], y1 LDF [Y1 + 1 * SIZE], y2 FMUL a1, x1, t1 FMUL a1, x2, t2 FMUL a2, x2, t3 FMUL a2, x1, t4 FADD y1, t1, y1 FADD y2, t2, y2 FSUBX y1, t3, y1 FADDX y2, t4, y2 STF y1, [Y1 + 0 * SIZE] STF y2, [Y1 + 1 * SIZE] .LL990: cmp INCY, 2 * SIZE be %icc, .LL999 mov Y, Y1 sra M, 2, I cmp I, 0 ble,pn %icc, .LL995 nop .LL991: LDF [BUFFER + 0 * SIZE], a1 LDF [BUFFER + 1 * SIZE], a2 LDF [Y + 0 * SIZE], y1 LDF [Y + 1 * SIZE], y2 add Y, INCY, Y LDF [BUFFER + 2 * SIZE], a3 LDF [BUFFER + 3 * SIZE], a4 LDF [Y + 0 * SIZE], y3 LDF [Y + 1 * SIZE], y4 add Y, INCY, Y LDF [BUFFER + 4 * SIZE], a5 LDF [BUFFER + 5 * SIZE], a6 LDF [Y + 0 * SIZE], y5 LDF [Y + 1 * SIZE], y6 add Y, INCY, Y LDF [BUFFER + 6 * SIZE], a7 LDF [BUFFER + 7 * SIZE], a8 LDF [Y + 0 * SIZE], y7 LDF [Y + 1 * SIZE], y8 add Y, INCY, Y FADD y1, a1, y1 FADD y2, a2, y2 FADD y3, a3, y3 FADD y4, a4, y4 FADD y5, a5, y5 FADD y6, a6, y6 FADD y7, a7, y7 FADD y8, a8, y8 STF y1, [Y1 + 0 * SIZE] STF y2, [Y1 + 1 * SIZE] add Y1, INCY, Y1 STF y3, [Y1 + 0 * SIZE] STF y4, [Y1 + 1 * SIZE] add Y1, INCY, Y1 STF y5, [Y1 + 0 * SIZE] STF y6, [Y1 + 1 * SIZE] add Y1, INCY, Y1 STF y7, [Y1 + 0 * SIZE] STF y8, [Y1 + 1 * SIZE] add Y1, INCY, Y1 deccc I bg,pn %icc, .LL991 add BUFFER, 8 * SIZE, BUFFER .LL995: andcc M, 2, I ble,pn %icc, .LL996 nop LDF [BUFFER + 0 * SIZE], a1 LDF [BUFFER + 1 * SIZE], a2 LDF [Y + 0 * SIZE], y1 LDF [Y + 1 * SIZE], y2 add Y, INCY, Y LDF [BUFFER + 2 * SIZE], a3 LDF [BUFFER + 3 * SIZE], a4 LDF [Y + 0 * SIZE], y3 LDF [Y + 1 * SIZE], y4 add Y, INCY, Y FADD y1, a1, y1 FADD y2, a2, y2 FADD y3, a3, y3 FADD y4, a4, y4 STF y1, [Y1 + 0 * SIZE] STF y2, [Y1 + 1 * SIZE] add Y1, INCY, Y1 STF y3, [Y1 + 0 * SIZE] STF y4, [Y1 + 1 * SIZE] add Y1, INCY, Y1 add BUFFER, 4 * SIZE, BUFFER .LL996: andcc M, 1, I ble,pn %icc, .LL999 nop LDF [BUFFER + 0 * SIZE], a1 LDF [BUFFER + 1 * SIZE], a2 LDF [Y + 0 * SIZE], y1 LDF [Y + 1 * SIZE], y2 FADD y1, a1, y1 FADD y2, a2, y2 STF y1, [Y1 + 0 * SIZE] STF y2, [Y1 + 1 * SIZE] .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/zgemv_t.S000066400000000000000000000670621313527062700173460ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
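zgemv_t.S, which begins here, computes y := y + alpha * A^T * x (the conjugated variants are selected through the CONJ and XCONJ macros) as a dot product of each column of A against x. It blocks the rows in chunks of P = 4000 and, for a non-contiguous x, first stages the current block of x into BUFFER so the inner loops can stream it. Ignoring that blocking and the conjugation variants, the computation reduces to the sketch below; unit strides are assumed and the names are illustrative.

#include <stddef.h>

/* Reference form of y += alpha * A^T * x for interleaved complex data
 * (plain transpose, no conjugation, no row blocking); lda is counted
 * in complex elements. */
static void zgemv_t_ref(size_t m, size_t n,
                        double alpha_r, double alpha_i,
                        const double *a, size_t lda,
                        const double *x, double *y)
{
    for (size_t j = 0; j < n; j++) {
        const double *col = a + 2 * j * lda;
        double sr = 0.0, si = 0.0;             /* s = sum_i A(i,j) * x[i] */
        for (size_t i = 0; i < m; i++) {
            double ar = col[2 * i], ai = col[2 * i + 1];
            double xr = x[2 * i],  xi = x[2 * i + 1];
            sr += ar * xr - ai * xi;
            si += ar * xi + ai * xr;
        }
        y[2 * j]     += alpha_r * sr - alpha_i * si;   /* y[j] += alpha * s */
        y[2 * j + 1] += alpha_r * si + alpha_i * sr;
    }
}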
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define P 4000 #define M %i0 #define N %i1 #define A %i5 #define LDA %i2 #define X %i3 #define INCX %i4 #define Y %l0 #define INCY %l1 #define BUFFER %l2 #define I %l3 #define IS %l4 #define J %l5 #define MIN_M %l6 #define XP %l7 #define A1 %o0 #define A2 %o1 #define A3 %o2 #define A4 %o3 #define X1 %o4 #define Y1 %o5 #define PNLDA %g1 #define Y2 %o7 /* Danger? 
*/ #ifdef DOUBLE #define t1 %f0 #define t2 %f2 #define t3 %f4 #define t4 %f6 #define c1 %f8 #define c2 %f10 #define c3 %f12 #define c4 %f14 #define c5 %f16 #define c6 %f18 #define c7 %f20 #define c8 %f22 #define c9 %f24 #define c10 %f26 #define c11 %f28 #define c12 %f30 #define c13 %f32 #define c14 %f34 #define c15 %f36 #define c16 %f38 #define a1 %f40 #define a2 %f42 #define a3 %f44 #define a4 %f46 #define a5 %f48 #define a6 %f50 #define a7 %f52 #define a8 %f54 #define b1 %f56 #define b2 %f58 #define b3 %f60 #define b4 %f62 #else #define t1 %f0 #define t2 %f1 #define t3 %f2 #define t4 %f3 #define c1 %f4 #define c2 %f5 #define c3 %f6 #define c4 %f7 #define c5 %f8 #define c6 %f9 #define c7 %f10 #define c8 %f11 #define c9 %f12 #define c10 %f13 #define c11 %f14 #define c12 %f15 #define c13 %f16 #define c14 %f17 #define c15 %f18 #define c16 %f19 #define a1 %f20 #define a2 %f21 #define a3 %f22 #define a4 %f23 #define a5 %f24 #define a6 %f25 #define a7 %f26 #define a8 %f27 #define b1 %f28 #define b2 %f29 #define b3 %f30 #define b4 %f31 #endif #ifndef __64BIT__ #define ALPHA_R [%sp + STACK_START + 16] #ifndef DOUBLE #define ALPHA_I [%sp + STACK_START + 20] #else #define ALPHA_I [%sp + STACK_START + 24] #endif #else #define ALPHA_R [%sp + STACK_START + 32] #define ALPHA_I [%sp + STACK_START + 40] #endif #ifdef DOUBLE #define PREFETCHSIZE 18 #else #define PREFETCHSIZE 36 #endif PROLOGUE SAVESP nop #ifndef __64BIT__ #ifdef DOUBLE st %i3, [%sp + STACK_START + 16] /* ALPHA_R */ st %i4, [%sp + STACK_START + 20] st %i5, [%sp + STACK_START + 24] /* ALPHA_I */ ld [%sp + STACK_START + 32], A ld [%sp + STACK_START + 36], LDA ld [%sp + STACK_START + 40], X ld [%sp + STACK_START + 44], INCX ld [%sp + STACK_START + 48], Y ld [%sp + STACK_START + 52], INCY ld [%sp + STACK_START + 56], BUFFER #else st %i3, [%sp + STACK_START + 16] /* ALPHA_R */ st %i4, [%sp + STACK_START + 20] /* ALPHA_I */ ld [%sp + STACK_START + 28], LDA ld [%sp + STACK_START + 32], X ld [%sp + STACK_START + 36], INCX ld [%sp + STACK_START + 40], Y ld [%sp + STACK_START + 44], INCY ld [%sp + STACK_START + 48], BUFFER #endif #else ldx [%sp + STACK_START + 56], LDA ldx [%sp + STACK_START + 64], X ldx [%sp + STACK_START + 72], INCX ldx [%sp + STACK_START + 80], Y ldx [%sp + STACK_START + 88], INCY ldx [%sp + STACK_START + 96], BUFFER #ifdef DOUBLE std %f6, ALPHA_R std %f8, ALPHA_I #else st %f7, ALPHA_R st %f9, ALPHA_I #endif #endif clr IS mov P, I sll LDA, ZBASE_SHIFT, LDA sll I, ZBASE_SHIFT, I smul LDA, N, PNLDA sll INCX, ZBASE_SHIFT, INCX sll INCY, ZBASE_SHIFT, INCY sub I, PNLDA, PNLDA .LL10: sll IS, ZBASE_SHIFT, I sub M, IS, MIN_M mov P, J cmp MIN_M, J nop movg %icc, J, MIN_M nop cmp INCX, 2 * SIZE beq .LL100 add X, I, XP sra MIN_M, 2, I mov BUFFER, XP cmp I, 0 ble,pn %icc, .LL15 mov BUFFER, Y1 .LL11: LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 add X, INCX, X LDF [X + 0 * SIZE], a3 LDF [X + 1 * SIZE], a4 add X, INCX, X LDF [X + 0 * SIZE], a5 LDF [X + 1 * SIZE], a6 add X, INCX, X LDF [X + 0 * SIZE], a7 LDF [X + 1 * SIZE], a8 add X, INCX, X STF a1, [Y1 + 0 * SIZE] add I, -1, I STF a2, [Y1 + 1 * SIZE] cmp I, 0 STF a3, [Y1 + 2 * SIZE] STF a4, [Y1 + 3 * SIZE] STF a5, [Y1 + 4 * SIZE] STF a6, [Y1 + 5 * SIZE] STF a7, [Y1 + 6 * SIZE] STF a8, [Y1 + 7 * SIZE] bg,pn %icc, .LL11 add Y1, 8 * SIZE, Y1 .LL15: and MIN_M, 3, I cmp I, 0 ble,pn %icc, .LL100 nop .LL16: LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 add X, INCX, X add I, -1, I cmp I, 0 nop STF a1, [Y1 + 0 * SIZE] STF a2, [Y1 + 1 * SIZE] bg,pn %icc, .LL16 add Y1, 2 * SIZE, Y1 .LL100: sra N, 2, J 
cmp J, 0 ble %icc, .LL200 mov Y, Y1 .LL110: FCLR(0) FMOV t1, c1 sra MIN_M, 2, I FMOV t1, c2 add A, LDA, A2 FMOV t1, c3 mov A, A1 FMOV t1, c4 add A2, LDA, A3 FMOV t1, c5 FMOV t1, c6 FMOV t1, c7 FMOV t1, c8 FMOV t1, c9 FMOV t1, c10 FMOV t1, c11 FMOV t1, c12 FMOV t1, c13 FMOV t1, c14 FMOV t1, c15 FMOV t1, c16 add A3, LDA, A4 FMOV t1, t2 mov XP, X1 FMOV t1, t3 add A4, LDA, A cmp I, 0 ble %icc, .LL115 FMOV t1, t4 LDF [A1 + 0 * SIZE], a1 nop LDF [A1 + 1 * SIZE], a2 add A1, 2 * SIZE, A1 LDF [A2 + 0 * SIZE], a3 LDF [A2 + 1 * SIZE], a4 add A2, 2 * SIZE, A2 LDF [A3 + 0 * SIZE], a5 LDF [A3 + 1 * SIZE], a6 add A3, 2 * SIZE, A3 LDF [A4 + 0 * SIZE], a7 LDF [A4 + 1 * SIZE], a8 add A4, 2 * SIZE, A4 LDF [X1 + 0 * SIZE], b1 nop LDF [X1 + 1 * SIZE], b2 nop LDF [X1 + 2 * SIZE], b3 add X1, 4 * SIZE, X1 deccc I ble .LL112 prefetch [Y1 + 7 * SIZE], 2 #ifndef XCONJ #define FADDX FADD #else #define FADDX FSUB #endif .LL111: FADD c13, t1, c13 prefetch [A1 + PREFETCHSIZE * SIZE], 1 FMUL a1, b1, t1 nop FADDX c14, t2, c14 nop FMUL a1, b2, t2 LDF [A1 + 0 * SIZE], a1 FADD c15, t3, c15 nop FMUL a2, b1, t3 LDF [X1 - 1 * SIZE], b4 FADD c16, t4, c16 nop FMUL a2, b2, t4 LDF [A1 + 1 * SIZE], a2 FADD c1, t1, c1 nop FMUL a3, b1, t1 nop FADDX c2, t2, c2 nop FMUL a3, b2, t2 LDF [A2 + 0 * SIZE], a3 FADD c3, t3, c3 nop FMUL a4, b1, t3 nop FADD c4, t4, c4 nop FMUL a4, b2, t4 LDF [A2 + 1 * SIZE], a4 FADD c5, t1, c5 nop FMUL a5, b1, t1 nop FADDX c6, t2, c6 nop FMUL a5, b2, t2 LDF [A3 + 0 * SIZE], a5 FADD c7, t3, c7 nop FMUL a6, b1, t3 nop FADD c8, t4, c8 nop FMUL a6, b2, t4 LDF [A3 + 1 * SIZE], a6 FADD c9, t1, c9 nop FMUL a7, b1, t1 nop FADDX c10, t2, c10 nop FMUL a7, b2, t2 LDF [A4 + 0 * SIZE], a7 FADD c11, t3, c11 nop FMUL a8, b1, t3 LDF [X1 + 0 * SIZE], b1 FADD c12, t4, c12 nop FMUL a8, b2, t4 LDF [A4 + 1 * SIZE], a8 FADD c13, t1, c13 nop FMUL a1, b3, t1 prefetch [A2 + PREFETCHSIZE * SIZE], 1 FADDX c14, t2, c14 nop FMUL a1, b4, t2 LDF [A1 + 2 * SIZE], a1 FADD c15, t3, c15 nop FMUL a2, b3, t3 LDF [X1 + 1 * SIZE], b2 FADD c16, t4, c16 nop FMUL a2, b4, t4 LDF [A1 + 3 * SIZE], a2 FADD c1, t1, c1 nop FMUL a3, b3, t1 nop FADDX c2, t2, c2 nop FMUL a3, b4, t2 LDF [A2 + 2 * SIZE], a3 FADD c3, t3, c3 nop FMUL a4, b3, t3 nop FADD c4, t4, c4 nop FMUL a4, b4, t4 LDF [A2 + 3 * SIZE], a4 FADD c5, t1, c5 nop FMUL a5, b3, t1 nop FADDX c6, t2, c6 nop FMUL a5, b4, t2 LDF [A3 + 2 * SIZE], a5 FADD c7, t3, c7 nop FMUL a6, b3, t3 nop FADD c8, t4, c8 nop FMUL a6, b4, t4 LDF [A3 + 3 * SIZE], a6 FADD c9, t1, c9 nop FMUL a7, b3, t1 nop FADDX c10, t2, c10 nop FMUL a7, b4, t2 LDF [A4 + 2 * SIZE], a7 FADD c11, t3, c11 nop FMUL a8, b3, t3 LDF [X1 + 2 * SIZE], b3 FADD c12, t4, c12 nop FMUL a8, b4, t4 LDF [A4 + 3 * SIZE], a8 FADD c13, t1, c13 prefetch [A3 + PREFETCHSIZE * SIZE], 1 FMUL a1, b1, t1 nop FADDX c14, t2, c14 nop FMUL a1, b2, t2 LDF [A1 + 4 * SIZE], a1 FADD c15, t3, c15 nop FMUL a2, b1, t3 LDF [X1 + 3 * SIZE], b4 FADD c16, t4, c16 nop FMUL a2, b2, t4 LDF [A1 + 5 * SIZE], a2 FADD c1, t1, c1 nop FMUL a3, b1, t1 nop FADDX c2, t2, c2 nop FMUL a3, b2, t2 LDF [A2 + 4 * SIZE], a3 FADD c3, t3, c3 nop FMUL a4, b1, t3 nop FADD c4, t4, c4 nop FMUL a4, b2, t4 LDF [A2 + 5 * SIZE], a4 FADD c5, t1, c5 nop FMUL a5, b1, t1 nop FADDX c6, t2, c6 nop FMUL a5, b2, t2 LDF [A3 + 4 * SIZE], a5 FADD c7, t3, c7 deccc I FMUL a6, b1, t3 nop FADD c8, t4, c8 nop FMUL a6, b2, t4 LDF [A3 + 5 * SIZE], a6 FADD c9, t1, c9 nop FMUL a7, b1, t1 nop FADDX c10, t2, c10 nop FMUL a7, b2, t2 LDF [A4 + 4 * SIZE], a7 FADD c11, t3, c11 nop FMUL a8, b1, t3 LDF [X1 + 4 * SIZE], b1 FADD c12, t4, c12 
nop FMUL a8, b2, t4 LDF [A4 + 5 * SIZE], a8 FADD c13, t1, c13 prefetch [A4 + PREFETCHSIZE * SIZE], 1 FMUL a1, b3, t1 nop FADDX c14, t2, c14 nop FMUL a1, b4, t2 LDF [A1 + 6 * SIZE], a1 FADD c15, t3, c15 nop FMUL a2, b3, t3 LDF [X1 + 5 * SIZE], b2 FADD c16, t4, c16 nop FMUL a2, b4, t4 LDF [A1 + 7 * SIZE], a2 FADD c1, t1, c1 add A1, 8 * SIZE, A1 FMUL a3, b3, t1 nop FADDX c2, t2, c2 nop FMUL a3, b4, t2 LDF [A2 + 6 * SIZE], a3 FADD c3, t3, c3 nop FMUL a4, b3, t3 nop FADD c4, t4, c4 nop FMUL a4, b4, t4 LDF [A2 + 7 * SIZE], a4 FADD c5, t1, c5 add A2, 8 * SIZE, A2 FMUL a5, b3, t1 nop FADDX c6, t2, c6 nop FMUL a5, b4, t2 LDF [A3 + 6 * SIZE], a5 FADD c7, t3, c7 add A4, 8 * SIZE, A4 FMUL a6, b3, t3 nop FADD c8, t4, c8 nop FMUL a6, b4, t4 LDF [A3 + 7 * SIZE], a6 FADD c9, t1, c9 add A3, 8 * SIZE, A3 FMUL a7, b3, t1 nop FADDX c10, t2, c10 add X1, 8 * SIZE, X1 FMUL a7, b4, t2 LDF [A4 - 2 * SIZE], a7 FADD c11, t3, c11 nop FMUL a8, b3, t3 LDF [X1 - 2 * SIZE], b3 FADD c12, t4, c12 FMUL a8, b4, t4 bg,pn %icc, .LL111 LDF [A4 - 1 * SIZE], a8 .LL112: FADD c13, t1, c13 nop FMUL a1, b1, t1 LDF [X1 - 1 * SIZE], b4 FADDX c14, t2, c14 nop FMUL a1, b2, t2 LDF [A1 + 0 * SIZE], a1 FADD c15, t3, c15 nop FMUL a2, b1, t3 LDF [X1 - 1 * SIZE], b4 FADD c16, t4, c16 nop FMUL a2, b2, t4 LDF [A1 + 1 * SIZE], a2 FADD c1, t1, c1 nop FMUL a3, b1, t1 nop FADDX c2, t2, c2 nop FMUL a3, b2, t2 LDF [A2 + 0 * SIZE], a3 FADD c3, t3, c3 nop FMUL a4, b1, t3 nop FADD c4, t4, c4 nop FMUL a4, b2, t4 LDF [A2 + 1 * SIZE], a4 FADD c5, t1, c5 nop FMUL a5, b1, t1 nop FADDX c6, t2, c6 nop FMUL a5, b2, t2 LDF [A3 + 0 * SIZE], a5 FADD c7, t3, c7 nop FMUL a6, b1, t3 nop FADD c8, t4, c8 nop FMUL a6, b2, t4 LDF [A3 + 1 * SIZE], a6 FADD c9, t1, c9 nop FMUL a7, b1, t1 nop FADDX c10, t2, c10 nop FMUL a7, b2, t2 LDF [A4 + 0 * SIZE], a7 FADD c11, t3, c11 nop FMUL a8, b1, t3 LDF [X1 + 0 * SIZE], b1 FADD c12, t4, c12 nop FMUL a8, b2, t4 LDF [A4 + 1 * SIZE], a8 FADD c13, t1, c13 nop FMUL a1, b3, t1 LDF [X1 + 1 * SIZE], b2 FADDX c14, t2, c14 nop FMUL a1, b4, t2 LDF [A1 + 2 * SIZE], a1 FADD c15, t3, c15 nop FMUL a2, b3, t3 nop FADD c16, t4, c16 nop FMUL a2, b4, t4 LDF [A1 + 3 * SIZE], a2 FADD c1, t1, c1 nop FMUL a3, b3, t1 nop FADDX c2, t2, c2 nop FMUL a3, b4, t2 LDF [A2 + 2 * SIZE], a3 FADD c3, t3, c3 nop FMUL a4, b3, t3 nop FADD c4, t4, c4 nop FMUL a4, b4, t4 LDF [A2 + 3 * SIZE], a4 FADD c5, t1, c5 nop FMUL a5, b3, t1 nop FADDX c6, t2, c6 nop FMUL a5, b4, t2 LDF [A3 + 2 * SIZE], a5 FADD c7, t3, c7 nop FMUL a6, b3, t3 nop FADD c8, t4, c8 nop FMUL a6, b4, t4 LDF [A3 + 3 * SIZE], a6 FADD c9, t1, c9 nop FMUL a7, b3, t1 nop FADDX c10, t2, c10 nop FMUL a7, b4, t2 LDF [A4 + 2 * SIZE], a7 FADD c11, t3, c11 nop FMUL a8, b3, t3 LDF [X1 + 2 * SIZE], b3 FADD c12, t4, c12 nop FMUL a8, b4, t4 LDF [A4 + 3 * SIZE], a8 FADD c13, t1, c13 nop FMUL a1, b1, t1 LDF [X1 + 3 * SIZE], b4 FADDX c14, t2, c14 add X1, 4 * SIZE, X1 FMUL a1, b2, t2 LDF [A1 + 4 * SIZE], a1 FADD c15, t3, c15 nop FMUL a2, b1, t3 nop FADD c16, t4, c16 nop FMUL a2, b2, t4 LDF [A1 + 5 * SIZE], a2 FADD c1, t1, c1 add A1, 6 * SIZE, A1 FMUL a3, b1, t1 nop FADDX c2, t2, c2 nop FMUL a3, b2, t2 LDF [A2 + 4 * SIZE], a3 FADD c3, t3, c3 nop FMUL a4, b1, t3 nop FADD c4, t4, c4 nop FMUL a4, b2, t4 LDF [A2 + 5 * SIZE], a4 FADD c5, t1, c5 add A2, 6 * SIZE, A2 FMUL a5, b1, t1 nop FADDX c6, t2, c6 nop FMUL a5, b2, t2 LDF [A3 + 4 * SIZE], a5 FADD c7, t3, c7 nop FMUL a6, b1, t3 nop FADD c8, t4, c8 nop FMUL a6, b2, t4 LDF [A3 + 5 * SIZE], a6 FADD c9, t1, c9 add A3, 6 * SIZE, A3 FMUL a7, b1, t1 nop FADDX c10, t2, c10 nop FMUL a7, 
b2, t2 LDF [A4 + 4 * SIZE], a7 FADD c11, t3, c11 nop FMUL a8, b1, t3 nop FADD c12, t4, c12 nop FMUL a8, b2, t4 LDF [A4 + 5 * SIZE], a8 FADD c13, t1, c13 add A4, 6 * SIZE, A4 FMUL a1, b3, t1 nop FADDX c14, t2, c14 nop FMUL a1, b4, t2 nop FADD c15, t3, c15 FMUL a2, b3, t3 FADD c16, t4, c16 FMUL a2, b4, t4 FADD c1, t1, c1 FMUL a3, b3, t1 FADDX c2, t2, c2 FMUL a3, b4, t2 FADD c3, t3, c3 FMUL a4, b3, t3 FADD c4, t4, c4 FMUL a4, b4, t4 FADD c5, t1, c5 FMUL a5, b3, t1 FADDX c6, t2, c6 FMUL a5, b4, t2 FADD c7, t3, c7 FMUL a6, b3, t3 FADD c8, t4, c8 FMUL a6, b4, t4 FADD c9, t1, c9 FMUL a7, b3, t1 FADDX c10, t2, c10 FMUL a7, b4, t2 FADD c11, t3, c11 FMUL a8, b3, t3 FADD c12, t4, c12 FMUL a8, b4, t4 .LL115: andcc MIN_M, 3, I LDF ALPHA_R, b3 mov Y1, Y2 ble,pn %icc, .LL119 LDF ALPHA_I, b4 .L116: LDF [A1 + 0 * SIZE], a1 LDF [A1 + 1 * SIZE], a2 add A1, 2 * SIZE, A1 LDF [X1 + 0 * SIZE], b1 LDF [X1 + 1 * SIZE], b2 add X1, 2 * SIZE, X1 LDF [A2 + 0 * SIZE], a3 LDF [A2 + 1 * SIZE], a4 add A2, 2 * SIZE, A2 LDF [A3 + 0 * SIZE], a5 LDF [A3 + 1 * SIZE], a6 add A3, 2 * SIZE, A3 LDF [A4 + 0 * SIZE], a7 LDF [A4 + 1 * SIZE], a8 add A4, 2 * SIZE, A4 FADD c13, t1, c13 FMUL a1, b1, t1 FADDX c14, t2, c14 FMUL a1, b2, t2 FADD c15, t3, c15 FMUL a2, b1, t3 FADD c16, t4, c16 FMUL a2, b2, t4 FADD c1, t1, c1 FMUL a3, b1, t1 FADDX c2, t2, c2 FMUL a3, b2, t2 FADD c3, t3, c3 FMUL a4, b1, t3 FADD c4, t4, c4 FMUL a4, b2, t4 FADD c5, t1, c5 FMUL a5, b1, t1 FADDX c6, t2, c6 FMUL a5, b2, t2 FADD c7, t3, c7 FMUL a6, b1, t3 FADD c8, t4, c8 FMUL a6, b2, t4 FADD c9, t1, c9 FMUL a7, b1, t1 FADDX c10, t2, c10 FMUL a7, b2, t2 FADD c11, t3, c11 FMUL a8, b1, t3 FADD c12, t4, c12 FMUL a8, b2, t4 deccc I bg %icc, .L116 nop .LL119: FADD c13, t1, c13 LDF [Y1 + 0 * SIZE], a1 FADDX c14, t2, c14 LDF [Y1 + 1 * SIZE] ,a2 add Y1, INCY, Y1 FADD c15, t3, c15 LDF [Y1 + 0 * SIZE], a3 FADD c16, t4, c16 LDF [Y1 + 1 * SIZE] ,a4 add Y1, INCY, Y1 #if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ)) FSUB c1, c4, c1 LDF [Y1 + 0 * SIZE], a5 FSUB c5, c8, c5 LDF [Y1 + 1 * SIZE] ,a6 add Y1, INCY, Y1 FSUB c9, c12, c9 LDF [Y1 + 0 * SIZE], a7 FSUB c13, c16, c13 LDF [Y1 + 1 * SIZE] ,a8 add Y1, INCY, Y1 #else FADD c1, c4, c1 LDF [Y1 + 0 * SIZE], a5 FADD c5, c8, c5 LDF [Y1 + 1 * SIZE] ,a6 add Y1, INCY, Y1 FADD c9, c12, c9 LDF [Y1 + 0 * SIZE], a7 FADD c13, c16, c13 LDF [Y1 + 1 * SIZE] ,a8 add Y1, INCY, Y1 #endif #ifndef CONJ FADD c2, c3, c2 FCLR(0) FADD c6, c7, c6 FADD c10, c11, c10 FADD c14, c15, c14 #else FSUB c2, c3, c2 FCLR(0) FSUB c6, c7, c6 FSUB c10, c11, c10 FSUB c14, c15, c14 #endif FMUL b3, c1, c3 FMOV t1, t2 FMUL b4, c1, c4 FMOV t1, t3 FMUL b4, c2, c1 FMOV t1, t4 FMUL b3, c2, c2 FMUL b3, c5, c7 FMUL b4, c5, c8 FMUL b4, c6, c5 FMUL b3, c6, c6 FMUL b3, c9, c11 FMUL b4, c9, c12 FMUL b4, c10, c9 FMUL b3, c10, c10 FMUL b3, c13, c15 FSUB c3, c1, c1 FMUL b4, c13, c16 FADD c2, c4, c2 FMUL b4, c14, c13 FSUB c7, c5, c5 FMUL b3, c14, c14 FADD c6, c8, c6 FSUB c11, c9, c9 FADD c10, c12, c10 FSUB c15, c13, c13 FADD c14, c16, c14 FADD a1, c1, a1 FADD a2, c2, a2 FADD a3, c5, a3 FADD a4, c6, a4 STF a1, [Y2 + 0 * SIZE] FADD a5, c9, a5 STF a2, [Y2 + 1 * SIZE] FADD a6, c10, a6 add Y2, INCY, Y2 STF a3, [Y2 + 0 * SIZE] FADD a7, c13, a7 STF a4, [Y2 + 1 * SIZE] FADD a8, c14, a8 add Y2, INCY, Y2 STF a5, [Y2 + 0 * SIZE] FMOV t1, c1 add J, -1, J STF a6, [Y2 + 1 * SIZE] FMOV t1, c2 cmp J, 0 add Y2, INCY, Y2 STF a7, [Y2 + 0 * SIZE] FMOV t1, c3 STF a8, [Y2 + 1 * SIZE] FMOV t1, c4 add Y2, INCY, Y2 FMOV t1, c5 bg %icc, .LL110 FMOV t1, c6 .LL200: FCLR(0) and N, 2, J cmp J, 
0 FMOV t1, c1 ble %icc, .LL300 FMOV t1, c2 sra MIN_M, 2, I FMOV t1, t2 add A, LDA, A2 FMOV t1, c3 mov A, A1 FMOV t1, t3 cmp I, 0 FMOV t1, c4 FMOV t1, c5 FMOV t1, c6 FMOV t1, c7 FMOV t1, c8 add A2, LDA, A FMOV t1, t4 ble %icc, .LL215 mov XP, X1 LDF [A1 + 0 * SIZE], a1 LDF [A1 + 1 * SIZE], a2 LDF [A1 + 2 * SIZE], a5 LDF [A1 + 3 * SIZE], a6 add A1, 4 * SIZE, A1 LDF [A2 + 0 * SIZE], a3 LDF [A2 + 1 * SIZE], a4 LDF [A2 + 2 * SIZE], a7 LDF [A2 + 3 * SIZE], a8 add A2, 4 * SIZE, A2 LDF [X1 + 0 * SIZE], b1 add I, -1, I LDF [X1 + 1 * SIZE], b2 cmp I, 0 LDF [X1 + 2 * SIZE], b3 LDF [X1 + 3 * SIZE], b4 ble %icc, .LL212 add X1, 4 * SIZE, X1 .LL211: prefetch [A1 + PREFETCHSIZE * SIZE], 1 FADD c5, t1, c5 FMUL a1, b1, t1 FADDX c6, t2, c6 FMUL a1, b2, t2 LDF [A1 + 0 * SIZE], a1 FADD c7, t3, c7 FMUL a2, b1, t3 FADD c8, t4, c8 FMUL a2, b2, t4 LDF [A1 + 1 * SIZE], a2 FADD c1, t1, c1 FMUL a3, b1, t1 FADDX c2, t2, c2 FMUL a3, b2, t2 LDF [A2 + 0 * SIZE], a3 FADD c3, t3, c3 FMUL a4, b1, t3 LDF [X1 + 0 * SIZE], b1 FADD c4, t4, c4 FMUL a4, b2, t4 LDF [A2 + 1 * SIZE], a4 FADD c5, t1, c5 LDF [X1 + 1 * SIZE], b2 FMUL a5, b3, t1 FADDX c6, t2, c6 FMUL a5, b4, t2 LDF [A1 + 2 * SIZE], a5 FADD c7, t3, c7 add I, -1, I FMUL a6, b3, t3 FADD c8, t4, c8 cmp I, 0 FMUL a6, b4, t4 LDF [A1 + 3 * SIZE], a6 FADD c1, t1, c1 FMUL a7, b3, t1 FADDX c2, t2, c2 FMUL a7, b4, t2 LDF [A2 + 2 * SIZE], a7 FADD c3, t3, c3 FMUL a8, b3, t3 LDF [X1 + 2 * SIZE], b3 FADD c4, t4, c4 FMUL a8, b4, t4 LDF [A2 + 3 * SIZE], a8 prefetch [A2 + PREFETCHSIZE * SIZE], 1 FADD c5, t1, c5 LDF [X1 + 3 * SIZE], b4 FMUL a1, b1, t1 FADDX c6, t2, c6 FMUL a1, b2, t2 LDF [A1 + 4 * SIZE], a1 FADD c7, t3, c7 FMUL a2, b1, t3 FADD c8, t4, c8 FMUL a2, b2, t4 LDF [A1 + 5 * SIZE], a2 FADD c1, t1, c1 FMUL a3, b1, t1 FADDX c2, t2, c2 FMUL a3, b2, t2 LDF [A2 + 4 * SIZE], a3 FADD c3, t3, c3 FMUL a4, b1, t3 LDF [X1 + 4 * SIZE], b1 FADD c4, t4, c4 FMUL a4, b2, t4 LDF [A2 + 5 * SIZE], a4 FADD c5, t1, c5 LDF [X1 + 5 * SIZE], b2 FMUL a5, b3, t1 FADDX c6, t2, c6 FMUL a5, b4, t2 LDF [A1 + 6 * SIZE], a5 FADD c7, t3, c7 FMUL a6, b3, t3 FADD c8, t4, c8 FMUL a6, b4, t4 LDF [A1 + 7 * SIZE], a6 add A1, 8 * SIZE, A1 FADD c1, t1, c1 FMUL a7, b3, t1 FADDX c2, t2, c2 FMUL a7, b4, t2 LDF [A2 + 6 * SIZE], a7 FADD c3, t3, c3 FMUL a8, b3, t3 LDF [X1 + 6 * SIZE], b3 FADD c4, t4, c4 add X1, 8 * SIZE, X1 FMUL a8, b4, t4 LDF [A2 + 7 * SIZE], a8 add A2, 8 * SIZE, A2 bg,pn %icc, .LL211 LDF [X1 - 1 * SIZE], b4 .LL212: FADD c5, t1, c5 FMUL a1, b1, t1 FADDX c6, t2, c6 FMUL a1, b2, t2 LDF [A1 + 0 * SIZE], a1 FADD c7, t3, c7 FMUL a2, b1, t3 FADD c8, t4, c8 FMUL a2, b2, t4 LDF [A1 + 1 * SIZE], a2 FADD c1, t1, c1 FMUL a3, b1, t1 FADDX c2, t2, c2 FMUL a3, b2, t2 LDF [A2 + 0 * SIZE], a3 FADD c3, t3, c3 FMUL a4, b1, t3 LDF [X1 + 0 * SIZE], b1 FADD c4, t4, c4 FMUL a4, b2, t4 LDF [A2 + 1 * SIZE], a4 FADD c5, t1, c5 LDF [X1 + 1 * SIZE], b2 FMUL a5, b3, t1 FADDX c6, t2, c6 FMUL a5, b4, t2 LDF [A1 + 2 * SIZE], a5 FADD c7, t3, c7 FMUL a6, b3, t3 FADD c8, t4, c8 FMUL a6, b4, t4 LDF [A1 + 3 * SIZE], a6 add A1, 4 * SIZE, A1 FADD c1, t1, c1 FMUL a7, b3, t1 FADDX c2, t2, c2 FMUL a7, b4, t2 LDF [A2 + 2 * SIZE], a7 FADD c3, t3, c3 FMUL a8, b3, t3 LDF [X1 + 2 * SIZE], b3 FADD c4, t4, c4 FMUL a8, b4, t4 LDF [A2 + 3 * SIZE], a8 add A2, 4 * SIZE, A2 FADD c5, t1, c5 LDF [X1 + 3 * SIZE], b4 add X1, 4 * SIZE, X1 FMUL a1, b1, t1 FADDX c6, t2, c6 FMUL a1, b2, t2 FADD c7, t3, c7 FMUL a2, b1, t3 FADD c8, t4, c8 FMUL a2, b2, t4 FADD c1, t1, c1 FMUL a3, b1, t1 FADDX c2, t2, c2 FMUL a3, b2, t2 FADD c3, t3, c3 FMUL a4, b1, t3 FADD c4, t4, c4 FMUL 
a4, b2, t4 FADD c5, t1, c5 FMUL a5, b3, t1 FADDX c6, t2, c6 FMUL a5, b4, t2 FADD c7, t3, c7 FMUL a6, b3, t3 FADD c8, t4, c8 FMUL a6, b4, t4 FADD c1, t1, c1 FMUL a7, b3, t1 FADDX c2, t2, c2 FMUL a7, b4, t2 FADD c3, t3, c3 FMUL a8, b3, t3 FADD c4, t4, c4 FMUL a8, b4, t4 .LL215: andcc MIN_M, 3, I LDF ALPHA_R, b3 mov Y1, Y2 ble %icc, .LL219 LDF ALPHA_I, b4 LDF [A1 + 0 * SIZE], a1 add I, -1, I LDF [A1 + 1 * SIZE], a2 cmp I, 0 add A1, 2 * SIZE, A1 LDF [A2 + 0 * SIZE], a3 LDF [A2 + 1 * SIZE], a4 add A2, 2 * SIZE, A2 LDF [X1 + 0 * SIZE], b1 LDF [X1 + 1 * SIZE], b2 ble %icc, .LL217 add X1, 2 * SIZE, X1 .LL216: FADD c5, t1, c5 FMUL a1, b1, t1 FADDX c6, t2, c6 FMUL a1, b2, t2 LDF [A1 + 0 * SIZE], a1 FADD c7, t3, c7 add I, -1, I FMUL a2, b1, t3 FADD c8, t4, c8 cmp I, 0 FMUL a2, b2, t4 LDF [A1 + 1 * SIZE], a2 add A1, 2 * SIZE, A1 FADD c1, t1, c1 FMUL a3, b1, t1 FADDX c2, t2, c2 FMUL a3, b2, t2 LDF [A2 + 0 * SIZE], a3 FADD c3, t3, c3 FMUL a4, b1, t3 LDF [X1 + 0 * SIZE], b1 FADD c4, t4, c4 add X1, 2 * SIZE, X1 FMUL a4, b2, t4 LDF [A2 + 1 * SIZE], a4 add A2, 2 * SIZE, A2 bg,pn %icc, .LL216 LDF [X1 - 1 * SIZE], b2 .LL217: FADD c5, t1, c5 FMUL a1, b1, t1 FADDX c6, t2, c6 FMUL a1, b2, t2 FADD c7, t3, c7 FMUL a2, b1, t3 FADD c8, t4, c8 FMUL a2, b2, t4 FADD c1, t1, c1 FMUL a3, b1, t1 FADDX c2, t2, c2 FMUL a3, b2, t2 FADD c3, t3, c3 FMUL a4, b1, t3 FADD c4, t4, c4 FMUL a4, b2, t4 .LL219: FADD c5, t1, c5 LDF [Y1 + 0 * SIZE], a1 FADDX c6, t2, c6 LDF [Y1 + 1 * SIZE] ,a2 add Y1, INCY, Y1 FADD c7, t3, c7 LDF [Y1 + 0 * SIZE], a3 FADD c8, t4, c8 LDF [Y1 + 1 * SIZE] ,a4 add Y1, INCY, Y1 #if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ)) FSUB c1, c4, c1 FSUB c5, c8, c5 #else FADD c1, c4, c1 FADD c5, c8, c5 #endif #ifndef CONJ FADD c2, c3, c2 FADD c6, c7, c6 #else FSUB c2, c3, c2 FSUB c6, c7, c6 #endif FMUL b3, c1, c3 FMUL b4, c1, c4 FMUL b4, c2, c1 FMUL b3, c2, c2 FMUL b3, c5, c7 FMUL b4, c5, c8 FMUL b4, c6, c5 FMUL b3, c6, c6 FSUB c3, c1, c1 FADD c2, c4, c2 FSUB c7, c5, c5 FADD c6, c8, c6 FADD a1, c1, a1 FADD a2, c2, a2 FADD a3, c5, a3 FADD a4, c6, a4 STF a1, [Y2 + 0 * SIZE] STF a2, [Y2 + 1 * SIZE] add Y2, INCY, Y2 STF a3, [Y2 + 0 * SIZE] STF a4, [Y2 + 1 * SIZE] .LL300: andcc N, 1, J FCLR(0) ble %icc, .LL400 FMOV t1, c1 .LL310: sra MIN_M, 2, I FMOV t1, c2 FMOV t1, c3 FMOV t1, c4 mov A, A1 FMOV t1, t2 add A, LDA, A FMOV t1, t3 cmp I, 0 FMOV t1, t4 ble %icc, .LL315 mov XP, X1 LDF [A1 + 0 * SIZE], a1 LDF [A1 + 1 * SIZE], a2 LDF [A1 + 2 * SIZE], a3 LDF [A1 + 3 * SIZE], a4 LDF [A1 + 4 * SIZE], a5 LDF [A1 + 5 * SIZE], a6 LDF [A1 + 6 * SIZE], a7 LDF [A1 + 7 * SIZE], a8 add A1, 8 * SIZE, A1 LDF [X1 + 0 * SIZE], c9 add I, -1, I LDF [X1 + 1 * SIZE], c10 cmp I, 0 LDF [X1 + 2 * SIZE], c11 LDF [X1 + 3 * SIZE], c12 LDF [X1 + 4 * SIZE], c13 LDF [X1 + 5 * SIZE], c14 LDF [X1 + 6 * SIZE], c15 LDF [X1 + 7 * SIZE], c16 ble %icc, .LL312 add X1, 8 * SIZE, X1 .LL311: prefetch [A1 + PREFETCHSIZE * SIZE], 1 FADD c1, t1, c1 FMUL a1, c9, t1 FADDX c2, t2, c2 FMUL a1, c10, t2 LDF [A1 + 0 * SIZE], a1 FADD c3, t3, c3 FMUL a2, c9, t3 LDF [X1 + 0 * SIZE], c9 FADD c4, t4, c4 FMUL a2, c10, t4 LDF [A1 + 1 * SIZE], a2 LDF [X1 + 1 * SIZE], c10 FADD c1, t1, c1 FMUL a3, c11, t1 FADDX c2, t2, c2 FMUL a3, c12, t2 LDF [A1 + 2 * SIZE], a3 FADD c3, t3, c3 add I, -1, I FMUL a4, c11, t3 LDF [X1 + 2 * SIZE], c11 FADD c4, t4, c4 cmp I, 0 FMUL a4, c12, t4 LDF [A1 + 3 * SIZE], a4 LDF [X1 + 3 * SIZE], c12 FADD c1, t1, c1 FMUL a5, c13, t1 FADDX c2, t2, c2 FMUL a5, c14, t2 LDF [A1 + 4 * SIZE], a5 FADD c3, t3, c3 FMUL a6, c13, t3 LDF [X1 + 4 * 
SIZE], c13 FADD c4, t4, c4 FMUL a6, c14, t4 LDF [A1 + 5 * SIZE], a6 LDF [X1 + 5 * SIZE], c14 FADD c1, t1, c1 FMUL a7, c15, t1 FADDX c2, t2, c2 FMUL a7, c16, t2 LDF [A1 + 6 * SIZE], a7 FADD c3, t3, c3 FMUL a8, c15, t3 LDF [X1 + 6 * SIZE], c15 FADD c4, t4, c4 add X1, 8 * SIZE, X1 FMUL a8, c16, t4 LDF [A1 + 7 * SIZE], a8 add A1, 8 * SIZE, A1 bg,pn %icc, .LL311 LDF [X1 - 1 * SIZE], c16 .LL312: FADD c1, t1, c1 FMUL a1, c9, t1 FADDX c2, t2, c2 FMUL a1, c10, t2 FADD c3, t3, c3 FMUL a2, c9, t3 FADD c4, t4, c4 FMUL a2, c10, t4 FADD c1, t1, c1 FMUL a3, c11, t1 FADDX c2, t2, c2 FMUL a3, c12, t2 FADD c3, t3, c3 FMUL a4, c11, t3 FADD c4, t4, c4 FMUL a4, c12, t4 FADD c1, t1, c1 FMUL a5, c13, t1 FADDX c2, t2, c2 FMUL a5, c14, t2 FADD c3, t3, c3 FMUL a6, c13, t3 FADD c4, t4, c4 FMUL a6, c14, t4 FADD c1, t1, c1 FMUL a7, c15, t1 FADDX c2, t2, c2 FMUL a7, c16, t2 FADD c3, t3, c3 FMUL a8, c15, t3 FADD c4, t4, c4 FMUL a8, c16, t4 .LL315: andcc MIN_M, 3, I LDF ALPHA_R, b3 mov Y1, Y2 ble %icc, .LL319 LDF ALPHA_I, b4 LDF [A1 + 0 * SIZE], a1 add I, -1, I LDF [A1 + 1 * SIZE], a2 add A1, 2 * SIZE, A1 LDF [X1 + 0 * SIZE], b1 cmp I, 0 LDF [X1 + 1 * SIZE], b2 ble %icc, .LL317 add X1, 2 * SIZE, X1 .LL316: FADD c1, t1, c1 add I, -1, I FMUL a1, b1, t1 FADDX c2, t2, c2 FMUL a1, b2, t2 LDF [A1 + 0 * SIZE], a1 FADD c3, t3, c3 cmp I, 0 FMUL a2, b1, t3 LDF [X1 + 0 * SIZE], b1 FADD c4, t4, c4 add X1, 2 * SIZE, X1 FMUL a2, b2, t4 LDF [A1 + 1 * SIZE], a2 add A1, 2 * SIZE, A1 bg,pn %icc, .LL316 LDF [X1 - 1 * SIZE], b2 .LL317: FADD c1, t1, c1 FMUL a1, b1, t1 FADDX c2, t2, c2 FMUL a1, b2, t2 FADD c3, t3, c3 FMUL a2, b1, t3 FADD c4, t4, c4 FMUL a2, b2, t4 .LL319: FADD c1, t1, c1 LDF [Y1 + 0 * SIZE], a1 FADDX c2, t2, c2 LDF [Y1 + 1 * SIZE] ,a2 add Y1, INCY, Y1 FADD c3, t3, c3 FADD c4, t4, c4 #if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ)) FSUB c1, c4, c1 #else FADD c1, c4, c1 #endif #ifndef CONJ FADD c2, c3, c2 #else FSUB c2, c3, c2 #endif FMUL b3, c1, c3 FMUL b4, c1, c4 FMUL b4, c2, c1 FMUL b3, c2, c2 FSUB c3, c1, c1 FADD c2, c4, c2 FADD a1, c1, a1 FADD a2, c2, a2 STF a1, [Y2 + 0 * SIZE] STF a2, [Y2 + 1 * SIZE] .LL400: mov P, I add IS, I, IS cmp IS, M bl %icc, .LL10 add A, PNLDA, A .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/znrm2.S000066400000000000000000000273121313527062700167350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N %i0 #define X %i1 #define INCX %i2 #define I %i3 #define XX %i4 #ifdef DOUBLE #define c1 %f0 #define c2 %f2 #define c3 %f4 #define c4 %f6 #define t1 %f8 #define t2 %f10 #define t3 %f12 #define t4 %f14 #define a1 %f16 #define a2 %f18 #define a3 %f20 #define a4 %f22 #define a5 %f24 #define a6 %f26 #define a7 %f28 #define a8 %f30 #define fmax %f32 #define fzero %f34 #define fone %f36 #else #define c1 %f0 #define c2 %f1 #define c3 %f2 #define c4 %f3 #define t1 %f4 #define t2 %f5 #define t3 %f6 #define t4 %f7 #define a1 %f8 #define a2 %f9 #define a3 %f10 #define a4 %f11 #define a5 %f12 #define a6 %f13 #define a7 %f14 #define a8 %f15 #define fmax %f16 #define fzero %f17 #define fone %f18 #endif PROLOGUE SAVESP #ifdef DOUBLE FCLR(3) #else FCLR(17) #endif mov X, XX mov 0x3ff, %g1 sll %g1, 20, %g1 cmp N, 0 ble .LL99 FMOV fzero, c1 cmp INCX, 0 ble .LL99 sll INCX, ZBASE_SHIFT, INCX add %sp, -8, %sp st %g1, [%sp + STACK_START + 0] st %g0, [%sp + STACK_START + 4] LDF [%sp + STACK_START], fone add %sp, 8, %sp FMOV fzero, c2 FMOV fzero, c3 FMOV fzero, c4 cmp INCX, 2 * SIZE bne .LL100 nop sra N, 2, I cmp I, 0 ble,pn %icc, .LL15 nop LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 LDF [X + 2 * SIZE], a3 LDF [X + 3 * SIZE], a4 LDF [X + 4 * SIZE], a5 add I, -1, I LDF [X + 5 * SIZE], a6 cmp I, 0 LDF [X + 6 * SIZE], a7 LDF [X + 7 * SIZE], a8 ble,pt %icc, .LL12 add X, 8 * SIZE, X #define PREFETCHSIZE 40 .LL11: FABS a1, t1 prefetch [X + PREFETCHSIZE * SIZE], 0 FABS a2, t2 LDF [X + 0 * SIZE], a1 FABS a3, t3 LDF [X + 1 * SIZE], a2 FABS a4, t4 LDF [X + 2 * SIZE], a3 FCMP %fcc0, t1, c1 LDF [X + 3 * SIZE], a4 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FMOVG %fcc0, t1, c1 FMOVG %fcc1, t2, c2 FMOVG %fcc2, t3, c3 FMOVG %fcc3, t4, c4 FABS a5, t1 LDF [X + 4 * SIZE], a5 FABS a6, t2 LDF [X + 5 * SIZE], a6 FABS a7, t3 LDF [X + 6 * SIZE], a7 FABS a8, t4 LDF [X + 7 * SIZE], a8 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FMOVG %fcc0, t1, c1 add I, -1, I FMOVG %fcc1, t2, c2 cmp I, 0 FMOVG %fcc2, t3, c3 FMOVG %fcc3, t4, c4 bg,pt %icc, .LL11 add X, 8 * SIZE, X .LL12: FABS a1, t1 FABS a2, t2 FABS a3, t3 FABS a4, t4 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FMOVG %fcc0, t1, c1 FMOVG %fcc1, t2, c2 FMOVG %fcc2, t3, c3 FMOVG %fcc3, t4, c4 FABS a5, t1 FABS a6, t2 FABS a7, t3 FABS a8, t4 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FMOVG %fcc0, t1, c1 FMOVG %fcc1, t2, c2 FMOVG %fcc2, t3, c3 FMOVG %fcc3, t4, c4 .LL15: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: LDF [X + 0 * SIZE], 
a1 LDF [X + 1 * SIZE], a2 FABS a1, t1 FABS a2, t2 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FMOVG %fcc0, t1, c1 FMOVG %fcc1, t2, c2 add I, -1, I cmp I, 0 bg,pt %icc, .LL16 add X, 2 * SIZE, X .LL19: FCMP %fcc0, c2, c1 FCMP %fcc1, c4, c3 mov XX, X FMOVG %fcc0, c2, c1 FMOVG %fcc1, c4, c3 FCMP %fcc0, c3, c1 FMOVG %fcc0, c3, c1 FCMP c1, fzero fbe .LL99 nop FMOV c1, fmax FDIV fone, c1, fone FMOV fzero, c1 FMOV fzero, c2 FMOV fzero, c3 FMOV fzero, c4 sra N, 2, I cmp I, 0 ble,pn %icc, .LL35 nop LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 LDF [X + 2 * SIZE], a3 LDF [X + 3 * SIZE], a4 LDF [X + 4 * SIZE], a5 add I, -1, I LDF [X + 5 * SIZE], a6 cmp I, 0 LDF [X + 6 * SIZE], a7 LDF [X + 7 * SIZE], a8 ble,pt %icc, .LL32 add X, 8 * SIZE, X .LL31: FMUL fone, a1, t1 prefetch [X + PREFETCHSIZE * SIZE], 0 FMUL fone, a2, t2 LDF [X + 0 * SIZE], a1 FMUL fone, a3, t3 LDF [X + 1 * SIZE], a2 FMUL fone, a4, t4 LDF [X + 2 * SIZE], a3 FMUL t1, t1, t1 LDF [X + 3 * SIZE], a4 FMUL t2, t2, t2 FMUL t3, t3, t3 FMUL t4, t4, t4 FADD c1, t1, c1 FMUL fone, a5, t1 LDF [X + 4 * SIZE], a5 FADD c2, t2, c2 FMUL fone, a6, t2 LDF [X + 5 * SIZE], a6 FADD c3, t3, c3 FMUL fone, a7, t3 LDF [X + 6 * SIZE], a7 FADD c4, t4, c4 FMUL fone, a8, t4 LDF [X + 7 * SIZE], a8 FMUL t1, t1, t1 FMUL t2, t2, t2 FMUL t3, t3, t3 FMUL t4, t4, t4 FADD c1, t1, c1 add I, -1, I FADD c2, t2, c2 cmp I, 0 FADD c3, t3, c3 FADD c4, t4, c4 bg,pt %icc, .LL31 add X, 8 * SIZE, X .LL32: FMUL fone, a1, t1 FMUL fone, a2, t2 FMUL fone, a3, t3 FMUL fone, a4, t4 FMUL t1, t1, t1 FMUL t2, t2, t2 FMUL t3, t3, t3 FMUL t4, t4, t4 FADD c1, t1, c1 FMUL fone, a5, t1 FADD c2, t2, c2 FMUL fone, a6, t2 FADD c3, t3, c3 FMUL fone, a7, t3 FADD c4, t4, c4 FMUL fone, a8, t4 FMUL t1, t1, t1 FMUL t2, t2, t2 FMUL t3, t3, t3 FMUL t4, t4, t4 FADD c1, t1, c1 FADD c2, t2, c2 FADD c3, t3, c3 FADD c4, t4, c4 .LL35: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL39 nop .LL36: LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 FMUL fone, a1, t1 FMUL fone, a2, t2 FMUL t1, t1, t1 FMUL t2, t2, t2 FADD c1, t1, c1 FADD c2, t2, c2 add I, -1, I cmp I, 0 bg,pt %icc, .LL36 add X, 2 * SIZE, X .LL39: FADD c1, c2, c1 FADD c3, c4, c3 FADD c1, c3, c1 FSQRT c1, c1 FMUL fmax, c1, c1 .LL99: return %i7 + 8 clr %g0 .LL100: sra N, 2, I cmp I, 0 ble,pn %icc, .LL105 nop LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 add X, INCX, X LDF [X + 0 * SIZE], a3 LDF [X + 1 * SIZE], a4 add X, INCX, X LDF [X + 0 * SIZE], a5 LDF [X + 1 * SIZE], a6 add X, INCX, X add I, -1, I LDF [X + 0 * SIZE], a7 cmp I, 0 LDF [X + 1 * SIZE], a8 ble,pt %icc, .LL102 add X, INCX, X .LL101: FABS a1, t1 LDF [X + 0 * SIZE], a1 FABS a2, t2 LDF [X + 1 * SIZE], a2 add X, INCX, X FABS a3, t3 LDF [X + 0 * SIZE], a3 FABS a4, t4 LDF [X + 1 * SIZE], a4 add X, INCX, X FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FMOVG %fcc0, t1, c1 FMOVG %fcc1, t2, c2 FMOVG %fcc2, t3, c3 FMOVG %fcc3, t4, c4 FABS a5, t1 LDF [X + 0 * SIZE], a5 FABS a6, t2 LDF [X + 1 * SIZE], a6 add X, INCX, X FABS a7, t3 LDF [X + 0 * SIZE], a7 FABS a8, t4 LDF [X + 1 * SIZE], a8 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FMOVG %fcc0, t1, c1 add I, -1, I FMOVG %fcc1, t2, c2 cmp I, 0 FMOVG %fcc2, t3, c3 FMOVG %fcc3, t4, c4 bg,pt %icc, .LL101 add X, INCX, X .LL102: FABS a1, t1 FABS a2, t2 FABS a3, t3 FABS a4, t4 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP %fcc2, t3, c3 FCMP %fcc3, t4, c4 FMOVG %fcc0, t1, c1 FMOVG %fcc1, t2, c2 FMOVG %fcc2, t3, c3 FMOVG %fcc3, t4, c4 FABS a5, t1 FABS a6, t2 FABS a7, t3 FABS a8, t4 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FCMP 
%fcc2, t3, c3 FCMP %fcc3, t4, c4 FMOVG %fcc0, t1, c1 FMOVG %fcc1, t2, c2 FMOVG %fcc2, t3, c3 FMOVG %fcc3, t4, c4 .LL105: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL109 nop .LL106: LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 FABS a1, t1 FABS a2, t2 FCMP %fcc0, t1, c1 FCMP %fcc1, t2, c2 FMOVG %fcc0, t1, c1 FMOVG %fcc1, t2, c2 add I, -1, I cmp I, 0 bg,pt %icc, .LL106 add X, INCX, X .LL109: FCMP %fcc0, c2, c1 FCMP %fcc1, c4, c3 mov XX, X FMOVG %fcc0, c2, c1 FMOVG %fcc1, c4, c3 FCMP %fcc0, c3, c1 FMOVG %fcc0, c3, c1 FCMP c1, fzero fbe .LL99 nop FMOV c1, fmax FDIV fone, c1, fone FMOV fzero, c1 FMOV fzero, c2 FMOV fzero, c3 FMOV fzero, c4 sra N, 2, I cmp I, 0 ble,pn %icc, .LL135 nop LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 add X, INCX, X LDF [X + 0 * SIZE], a3 LDF [X + 1 * SIZE], a4 add X, INCX, X LDF [X + 0 * SIZE], a5 add I, -1, I LDF [X + 1 * SIZE], a6 add X, INCX, X cmp I, 0 LDF [X + 0 * SIZE], a7 LDF [X + 1 * SIZE], a8 ble,pt %icc, .LL132 add X, INCX, X .LL131: FMUL fone, a1, t1 prefetch [X + PREFETCHSIZE * SIZE], 0 FMUL fone, a2, t2 LDF [X + 0 * SIZE], a1 FMUL fone, a3, t3 LDF [X + 1 * SIZE], a2 add X, INCX, X FMUL fone, a4, t4 LDF [X + 0 * SIZE], a3 FMUL t1, t1, t1 LDF [X + 1 * SIZE], a4 add X, INCX, X FMUL t2, t2, t2 FMUL t3, t3, t3 FMUL t4, t4, t4 FADD c1, t1, c1 FMUL fone, a5, t1 LDF [X + 0 * SIZE], a5 FADD c2, t2, c2 FMUL fone, a6, t2 LDF [X + 1 * SIZE], a6 add X, INCX, X FADD c3, t3, c3 FMUL fone, a7, t3 LDF [X + 0 * SIZE], a7 FADD c4, t4, c4 FMUL fone, a8, t4 LDF [X + 1 * SIZE], a8 FMUL t1, t1, t1 FMUL t2, t2, t2 FMUL t3, t3, t3 FMUL t4, t4, t4 FADD c1, t1, c1 add I, -1, I FADD c2, t2, c2 cmp I, 0 FADD c3, t3, c3 FADD c4, t4, c4 bg,pt %icc, .LL131 add X, INCX, X .LL132: FMUL fone, a1, t1 FMUL fone, a2, t2 FMUL fone, a3, t3 FMUL fone, a4, t4 FMUL t1, t1, t1 FMUL t2, t2, t2 FMUL t3, t3, t3 FMUL t4, t4, t4 FADD c1, t1, c1 FMUL fone, a5, t1 FADD c2, t2, c2 FMUL fone, a6, t2 FADD c3, t3, c3 FMUL fone, a7, t3 FADD c4, t4, c4 FMUL fone, a8, t4 FMUL t1, t1, t1 FMUL t2, t2, t2 FMUL t3, t3, t3 FMUL t4, t4, t4 FADD c1, t1, c1 FADD c2, t2, c2 FADD c3, t3, c3 FADD c4, t4, c4 .LL135: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL139 nop .LL136: LDF [X + 0 * SIZE], a1 LDF [X + 1 * SIZE], a2 FMUL fone, a1, t1 FMUL fone, a2, t2 FMUL t1, t1, t1 FMUL t2, t2, t2 FADD c1, t1, c1 FADD c2, t2, c2 add I, -1, I cmp I, 0 bg,pt %icc, .LL136 add X, INCX, X .LL139: FADD c1, c2, c1 FADD c3, c4, c3 FADD c1, c3, c1 FSQRT c1, c1 FMUL fmax, c1, c1 return %i7 + 8 clr %g0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/zrot.S000066400000000000000000000256611313527062700166700ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N %i0 #define X %i1 #define INCX %i2 #define Y %i3 #define INCY %i4 #define I %i5 #define XX %l0 #define YY %l1 #ifdef DOUBLE #define a1 %f4 #define a2 %f6 #define a3 %f8 #define a4 %f10 #define a5 %f12 #define a6 %f14 #define a7 %f16 #define a8 %f18 #define b1 %f20 #define b2 %f22 #define b3 %f24 #define b4 %f26 #define b5 %f28 #define b6 %f30 #define b7 %f32 #define b8 %f34 #define c1 %f36 #define c2 %f38 #define c3 %f40 #define c4 %f42 #define c5 %f44 #define c6 %f46 #define c7 %f48 #define c8 %f50 #define t1 %f52 #define t2 %f54 #define t3 %f56 #define t4 %f58 #else #define a1 %f2 #define a2 %f3 #define a3 %f4 #define a4 %f5 #define a5 %f6 #define a6 %f7 #define a7 %f8 #define a8 %f9 #define b1 %f10 #define b2 %f11 #define b3 %f12 #define b4 %f13 #define b5 %f14 #define b6 %f15 #define b7 %f16 #define b8 %f17 #define c1 %f18 #define c2 %f19 #define c3 %f20 #define c4 %f21 #define c5 %f22 #define c6 %f23 #define c7 %f24 #define c8 %f25 #define t1 %f26 #define t2 %f27 #define t3 %f28 #define t4 %f29 #endif #ifdef DOUBLE #define C %f0 #define S %f2 #else #define C %f0 #define S %f1 #endif PROLOGUE SAVESP #ifndef __64BIT__ #ifdef DOUBLE st %i5, [%sp + STACK_START + 24] LDF [%sp + STACK_START + 24], C LDF [%sp + STACK_START + 32], S #else st %i5, [%sp + STACK_START + 24] LDF [%sp + STACK_START + 24], C LDF [%sp + STACK_START + 28], S #endif #else #ifdef DOUBLE FMOV %f10, C FMOV %f12, S #else FMOV %f11, C FMOV %f13, S #endif #endif cmp N, 0 ble .LL19 nop sll INCX, ZBASE_SHIFT, INCX sll INCY, ZBASE_SHIFT, INCY cmp INCX, 2 * SIZE bne .LL50 nop cmp INCY, 2 * SIZE bne .LL50 nop sra N, 2, I cmp I, 0 ble,pn %icc, .LL15 nop LDF [X + 0 * SIZE], a1 LDF [Y + 0 * SIZE], b1 LDF [X + 1 * SIZE], a2 LDF [Y + 1 * SIZE], b2 LDF [X + 2 * SIZE], a3 LDF [Y + 2 * SIZE], b3 LDF [X + 3 * SIZE], a4 LDF [Y + 3 * SIZE], b4 LDF [X + 4 * SIZE], a5 LDF [Y + 4 * SIZE], b5 LDF [X + 5 * SIZE], a6 LDF [Y + 5 * SIZE], b6 LDF [X + 6 * SIZE], a7 LDF [Y + 6 * SIZE], b7 LDF [X + 7 * SIZE], a8 LDF [Y + 7 * SIZE], b8 FMUL C, a1, c1 FMUL S, b1, c2 FMUL C, b1, c3 LDF [Y + 8 * SIZE], b1 FMUL S, a1, c4 LDF [X + 8 * SIZE], a1 FMUL C, a2, c5 FMUL S, b2, c6 FADD c1, c2, t1 FMUL C, b2, c7 LDF [Y + 9 * SIZE], b2 FMUL S, a2, c8 LDF [X + 9 * SIZE], a2 FSUB c3, c4, t2 addcc I, -1, I ble,pt %icc, .LL12 nop #define PREFETCHSIZE 64 
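/* .LL11: main loop of the unit-stride (INCX == INCY == 1) path. Each iteration
   applies the plane rotation to four complex elements, forming
   x[i] = c*x[i] + s*y[i] and y[i] = c*y[i] - s*x[i] componentwise,
   with software prefetching ahead of the loads. */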
.LL11: FMUL C, a3, c1 nop prefetch [Y + PREFETCHSIZE * SIZE], 1 nop FMUL S, b3, c2 STF t1, [X + 0 * SIZE] FADD c5, c6, t3 nop FMUL C, b3, c3 LDF [Y + 10 * SIZE], b3 nop nop FMUL S, a3, c4 STF t2, [Y + 0 * SIZE] FSUB c7, c8, t4 nop FMUL C, a4, c5 LDF [X + 10 * SIZE], a3 nop nop FMUL S, b4, c6 STF t3, [X + 1 * SIZE] FADD c1, c2, t1 nop FMUL C, b4, c7 LDF [Y + 11 * SIZE], b4 nop nop FMUL S, a4, c8 STF t4, [Y + 1 * SIZE] FSUB c3, c4, t2 nop FMUL C, a5, c1 LDF [X + 11 * SIZE], a4 nop nop FMUL S, b5, c2 STF t1, [X + 2 * SIZE] FADD c5, c6, t3 nop FMUL C, b5, c3 LDF [Y + 12 * SIZE], b5 nop nop FMUL S, a5, c4 STF t2, [Y + 2 * SIZE] FSUB c7, c8, t4 nop FMUL C, a6, c5 LDF [X + 12 * SIZE], a5 nop nop FMUL S, b6, c6 STF t3, [X + 3 * SIZE] FADD c1, c2, t1 nop FMUL C, b6, c7 LDF [Y + 13 * SIZE], b6 nop nop FMUL S, a6, c8 STF t4, [Y + 3 * SIZE] FSUB c3, c4, t2 nop FMUL C, a7, c1 LDF [X + 13 * SIZE], a6 nop nop FMUL S, b7, c2 STF t1, [X + 4 * SIZE] FADD c5, c6, t3 nop FMUL C, b7, c3 LDF [Y + 14 * SIZE], b7 nop nop FMUL S, a7, c4 STF t2, [Y + 4 * SIZE] FSUB c7, c8, t4 nop FMUL C, a8, c5 LDF [X + 14 * SIZE], a7 nop nop FMUL S, b8, c6 STF t3, [X + 5 * SIZE] FADD c1, c2, t1 nop FMUL C, b8, c7 LDF [Y + 15 * SIZE], b8 nop nop FMUL S, a8, c8 STF t4, [Y + 5 * SIZE] FSUB c3, c4, t2 nop FMUL C, a1, c1 LDF [X + 15 * SIZE], a8 addcc I, -1, I nop FMUL S, b1, c2 STF t1, [X + 6 * SIZE] FADD c5, c6, t3 nop FMUL C, b1, c3 LDF [Y + 16 * SIZE], b1 nop nop FMUL S, a1, c4 STF t2, [Y + 6 * SIZE] FSUB c7, c8, t4 nop FMUL C, a2, c5 LDF [X + 16 * SIZE], a1 add Y, 8 * SIZE, Y nop FMUL S, b2, c6 STF t3, [X + 7 * SIZE] FADD c1, c2, t1 nop FMUL C, b2, c7 LDF [Y + 9 * SIZE], b2 add X, 8 * SIZE, X nop FMUL S, a2, c8 STF t4, [Y - 1 * SIZE] FSUB c3, c4, t2 nop bg,pt %icc, .LL11 LDF [X + 9 * SIZE], a2 .LL12: FMUL C, a3, c1 FMUL S, b3, c2 STF t1, [X + 0 * SIZE] FADD c5, c6, t3 FMUL C, b3, c3 FMUL S, a3, c4 STF t2, [Y + 0 * SIZE] FSUB c7, c8, t4 FMUL C, a4, c5 FMUL S, b4, c6 STF t3, [X + 1 * SIZE] FADD c1, c2, t1 FMUL C, b4, c7 FMUL S, a4, c8 STF t4, [Y + 1 * SIZE] FSUB c3, c4, t2 FMUL C, a5, c1 FMUL S, b5, c2 STF t1, [X + 2 * SIZE] FADD c5, c6, t3 FMUL C, b5, c3 FMUL S, a5, c4 STF t2, [Y + 2 * SIZE] FSUB c7, c8, t4 FMUL C, a6, c5 FMUL S, b6, c6 STF t3, [X + 3 * SIZE] FADD c1, c2, t1 FMUL C, b6, c7 FMUL S, a6, c8 STF t4, [Y + 3 * SIZE] FSUB c3, c4, t2 FMUL C, a7, c1 FMUL S, b7, c2 STF t1, [X + 4 * SIZE] FADD c5, c6, t3 FMUL C, b7, c3 FMUL S, a7, c4 STF t2, [Y + 4 * SIZE] FSUB c7, c8, t4 FMUL C, a8, c5 FMUL S, b8, c6 STF t3, [X + 5 * SIZE] FADD c1, c2, t1 FMUL C, b8, c7 FMUL S, a8, c8 STF t4, [Y + 5 * SIZE] FSUB c3, c4, t2 FADD c5, c6, t3 STF t1, [X + 6 * SIZE] FSUB c7, c8, t4 STF t2, [Y + 6 * SIZE] STF t3, [X + 7 * SIZE] STF t4, [Y + 7 * SIZE] add X, 8 * SIZE, X add Y, 8 * SIZE, Y .LL15: andcc N, 3, I nop ble,a,pn %icc, .LL19 nop .LL16: LDF [X + 0 * SIZE], a1 LDF [Y + 0 * SIZE], b1 LDF [X + 1 * SIZE], a2 LDF [Y + 1 * SIZE], b2 FMUL C, a1, c1 add X, 2 * SIZE, X FMUL S, b1, c2 add Y, 2 * SIZE, Y FMUL C, b1, c3 addcc I, -1, I FMUL S, a1, c4 nop FMUL C, a2, c5 FMUL S, b2, c6 FADD c1, c2, c2 FMUL C, b2, c7 FMUL S, a2, c8 FSUB c3, c4, c4 STF c2, [X - 2 * SIZE] FADD c5, c6, c6 STF c4, [Y - 2 * SIZE] FSUB c7, c8, c8 STF c6, [X - 1 * SIZE] bg,pt %icc, .LL16 STF c8, [Y - 1 * SIZE] .LL19: return %i7 + 8 nop .LL50: mov X, XX mov Y, YY sra N, 2, I cmp I, 0 ble,pn %icc, .LL55 nop .LL51: LDF [X + 0 * SIZE], a1 LDF [Y + 0 * SIZE], b1 LDF [X + 1 * SIZE], a2 LDF [Y + 1 * SIZE], b2 FMUL C, a1, c1 FMUL S, b1, c2 FMUL C, b1, c3 FMUL S, a1, c4 FMUL C, a2, c5 nop 
FMUL S, b2, c6 FADD c1, c2, c2 FMUL C, b2, c7 nop FMUL S, a2, c8 FSUB c3, c4, c4 STF c2, [X + 0 * SIZE] FADD c5, c6, c6 STF c4, [Y + 0 * SIZE] FSUB c7, c8, c8 STF c6, [X + 1 * SIZE] add X, INCX, X STF c8, [Y + 1 * SIZE] add Y, INCY, Y LDF [X + 0 * SIZE], a1 LDF [Y + 0 * SIZE], b1 LDF [X + 1 * SIZE], a2 LDF [Y + 1 * SIZE], b2 FMUL C, a1, c1 FMUL S, b1, c2 FMUL C, b1, c3 FMUL S, a1, c4 FMUL C, a2, c5 nop FMUL S, b2, c6 FADD c1, c2, c2 FMUL C, b2, c7 nop FMUL S, a2, c8 FSUB c3, c4, c4 STF c2, [X + 0 * SIZE] FADD c5, c6, c6 STF c4, [Y + 0 * SIZE] FSUB c7, c8, c8 STF c6, [X + 1 * SIZE] add X, INCX, X STF c8, [Y + 1 * SIZE] add Y, INCY, Y LDF [X + 0 * SIZE], a1 LDF [Y + 0 * SIZE], b1 LDF [X + 1 * SIZE], a2 LDF [Y + 1 * SIZE], b2 FMUL C, a1, c1 FMUL S, b1, c2 FMUL C, b1, c3 FMUL S, a1, c4 FMUL C, a2, c5 nop FMUL S, b2, c6 FADD c1, c2, c2 FMUL C, b2, c7 nop FMUL S, a2, c8 FSUB c3, c4, c4 STF c2, [X + 0 * SIZE] FADD c5, c6, c6 STF c4, [Y + 0 * SIZE] FSUB c7, c8, c8 STF c6, [X + 1 * SIZE] add X, INCX, X STF c8, [Y + 1 * SIZE] add Y, INCY, Y LDF [X + 0 * SIZE], a1 LDF [Y + 0 * SIZE], b1 LDF [X + 1 * SIZE], a2 LDF [Y + 1 * SIZE], b2 FMUL C, a1, c1 FMUL S, b1, c2 FMUL C, b1, c3 FMUL S, a1, c4 FMUL C, a2, c5 nop FMUL S, b2, c6 FADD c1, c2, c2 FMUL C, b2, c7 nop FMUL S, a2, c8 FSUB c3, c4, c4 STF c2, [X + 0 * SIZE] FADD c5, c6, c6 STF c4, [Y + 0 * SIZE] FSUB c7, c8, c8 STF c6, [X + 1 * SIZE] add X, INCX, X STF c8, [Y + 1 * SIZE] add Y, INCY, Y addcc I, -1, I bg,pt %icc, .LL51 nop .LL55: andcc N, 3, I nop ble %icc, .LL59 nop .LL56: LDF [X + 0 * SIZE], a1 LDF [Y + 0 * SIZE], b1 LDF [X + 1 * SIZE], a2 LDF [Y + 1 * SIZE], b2 FMUL C, a1, c1 FMUL S, b1, c2 FMUL C, b1, c3 FMUL S, a1, c4 FMUL C, a2, c5 addcc I, -1, I FMUL S, b2, c6 FADD c1, c2, c2 FMUL C, b2, c7 nop FMUL S, a2, c8 FSUB c3, c4, c4 STF c2, [X + 0 * SIZE] FADD c5, c6, c6 STF c4, [Y + 0 * SIZE] FSUB c7, c8, c8 STF c6, [X + 1 * SIZE] add X, INCX, X STF c8, [Y + 1 * SIZE] bg %icc, .LL56 add Y, INCY, Y .LL59: return %i7 + 8 nop EPILOGUE OpenBLAS-0.2.20/kernel/sparc/zscal.S000066400000000000000000000237251313527062700170050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N %i0 #if defined(DOUBLE) && !defined(__64BIT__) #define X %i3 #define INCX %i4 #else #define X %i5 #define INCX %i3 #endif #define I %i1 #define XX %i2 #ifdef DOUBLE #define c1 %f0 #define c2 %f2 #define c3 %f4 #define c4 %f6 #define c5 %f8 #define c6 %f10 #define c7 %f12 #define c8 %f14 #define t1 %f16 #define t2 %f18 #define t3 %f20 #define t4 %f22 #define t5 %f24 #define t6 %f26 #define t7 %f28 #define t8 %f30 #define c9 %f32 #define c10 %f34 #define c11 %f36 #define c12 %f38 #define c13 %f40 #define c14 %f42 #define c15 %f44 #define c16 %f46 #define s1 %f32 #define s2 %f34 #define s3 %f36 #define s4 %f38 #define s5 %f40 #define s6 %f42 #define s7 %f44 #define s8 %f46 #define FZERO %f48 #define ALPHA_R %f50 #define ALPHA_I %f52 #else #define c1 %f0 #define c2 %f1 #define c3 %f2 #define c4 %f3 #define c5 %f4 #define c6 %f5 #define c7 %f6 #define c8 %f7 #define c9 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #define s1 %f8 #define s2 %f9 #define s3 %f10 #define s4 %f11 #define s5 %f12 #define s6 %f13 #define s7 %f14 #define s8 %f15 #define t1 %f16 #define t2 %f17 #define t3 %f18 #define t4 %f19 #define t5 %f20 #define t6 %f21 #define t7 %f22 #define t8 %f23 #define FZERO %f24 #define ALPHA_R %f25 #define ALPHA_I %f26 #endif #define PREFETCHSIZE 128 PROLOGUE SAVESP #ifndef __64BIT__ #ifdef DOUBLE st %i3, [%sp + STACK_START + 16] st %i4, [%sp + STACK_START + 20] st %i5, [%sp + STACK_START + 24] ld [%sp+ STACK_START + 32], X ld [%sp+ STACK_START + 36], INCX #else st %i3, [%sp + STACK_START + 16] st %i4, [%sp + STACK_START + 24] ld [%sp+ STACK_START + 28], INCX #endif LDF [%sp + STACK_START + 16], ALPHA_R LDF [%sp + STACK_START + 24], ALPHA_I #else ldx [%sp + STACK_START + 56], INCX #ifdef DOUBLE FMOV %f6, ALPHA_R FMOV %f8, ALPHA_I #else FMOV %f7, ALPHA_R FMOV %f9, ALPHA_I #endif #endif #ifdef DOUBLE FCLR(17) #else FCLR(24) #endif FCMP ALPHA_R, FZERO fbne .LL100 sll INCX, ZBASE_SHIFT, INCX FCMP ALPHA_I, FZERO fbne .LL100 nop cmp INCX, 2 * SIZE bne .LL50 nop sra N, 2, I cmp I, 0 ble,pn %icc, .LL15 nop .LL11: prefetch [X + PREFETCHSIZE * SIZE], 0 STF FZERO, [X + 0 * SIZE] add I, -1, I STF FZERO, [X + 1 * SIZE] cmp I, 0 STF FZERO, [X + 2 * SIZE] STF FZERO, [X + 3 * SIZE] STF FZERO, [X + 4 * SIZE] STF FZERO, [X + 5 * SIZE] add X, 8 * SIZE, X STF FZERO, [X - 2 * SIZE] bg,pt %icc, .LL11 STF FZERO, [X - 1 * SIZE] .LL15: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: STF FZERO, [X + 0 * SIZE] STF FZERO, [X + 1 * SIZE] add I, -1, I cmp I, 0 bg,pt %icc, .LL16 add X, 2 * SIZE, X 
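/* .LL19: alpha == 0 with unit stride; the vector has been zeroed above, so return.
   The .LL50 block below performs the same zero-fill for a general (non-unit) stride. */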
.LL19: return %i7 + 8 clr %o0 .LL50: sra N, 2, I cmp I, 0 ble,pn %icc, .LL55 nop .LL51: STF FZERO, [X + 0 * SIZE] add I, -1, I STF FZERO, [X + 1 * SIZE] add X, INCX, X STF FZERO, [X + 0 * SIZE] cmp I, 0 STF FZERO, [X + 1 * SIZE] add X, INCX, X STF FZERO, [X + 0 * SIZE] STF FZERO, [X + 1 * SIZE] add X, INCX, X STF FZERO, [X + 0 * SIZE] STF FZERO, [X + 1 * SIZE] bg,pt %icc, .LL51 add X, INCX, X .LL55: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL59 nop .LL56: STF FZERO, [X + 0 * SIZE] add I, -1, I STF FZERO, [X + 1 * SIZE] cmp I, 0 bg,pt %icc, .LL56 add X, INCX, X .LL59: return %i7 + 8 clr %o0 .LL100: cmp INCX, 2 * SIZE bne .LL150 sra N, 2, I cmp I, 0 ble,pn %icc, .LL115 nop LDF [X + 0 * SIZE], c1 LDF [X + 1 * SIZE], c2 LDF [X + 2 * SIZE], c3 LDF [X + 3 * SIZE], c4 LDF [X + 4 * SIZE], c5 LDF [X + 5 * SIZE], c6 LDF [X + 6 * SIZE], c7 LDF [X + 7 * SIZE], c8 FMUL ALPHA_R, c1, t1 FMUL ALPHA_I, c2, t3 FMUL ALPHA_I, c1, t2 LDF [X + 8 * SIZE], c1 FMUL ALPHA_R, c2, t4 LDF [X + 9 * SIZE], c2 FMUL ALPHA_R, c3, t5 deccc I FMUL ALPHA_I, c4, t7 FSUB t1, t3, s1 FMUL ALPHA_I, c3, t6 LDF [X + 10 * SIZE], c3 FMUL ALPHA_R, c4, t8 LDF [X + 11 * SIZE], c4 FADD t4, t2, s2 ble,pn %icc, .LL112 nop .LL111: prefetch [X + PREFETCHSIZE * SIZE], 0 FMUL ALPHA_R, c5, t1 FMUL ALPHA_I, c6, t3 FSUB t5, t7, s3 STF s1, [X + 0 * SIZE] FMUL ALPHA_I, c5, t2 LDF [X + 12 * SIZE], c5 FMUL ALPHA_R, c6, t4 LDF [X + 13 * SIZE], c6 FADD t8, t6, s4 STF s2, [X + 1 * SIZE] FMUL ALPHA_R, c7, t5 FMUL ALPHA_I, c8, t7 FSUB t1, t3, s5 STF s3, [X + 2 * SIZE] FMUL ALPHA_I, c7, t6 LDF [X + 14 * SIZE], c7 FMUL ALPHA_R, c8, t8 LDF [X + 15 * SIZE], c8 FADD t4, t2, s6 STF s4, [X + 3 * SIZE] FMUL ALPHA_R, c1, t1 FMUL ALPHA_I, c2, t3 FSUB t5, t7, s7 STF s5, [X + 4 * SIZE] FMUL ALPHA_I, c1, t2 LDF [X + 16 * SIZE], c1 FMUL ALPHA_R, c2, t4 LDF [X + 17 * SIZE], c2 FADD t8, t6, s8 STF s6, [X + 5 * SIZE] FMUL ALPHA_R, c3, t5 deccc I FMUL ALPHA_I, c4, t7 FSUB t1, t3, s1 STF s7, [X + 6 * SIZE] FMUL ALPHA_I, c3, t6 LDF [X + 18 * SIZE], c3 FMUL ALPHA_R, c4, t8 LDF [X + 19 * SIZE], c4 FADD t4, t2, s2 STF s8, [X + 7 * SIZE] bg,pt %icc, .LL111 add X, 8 * SIZE, X .LL112: FMUL ALPHA_R, c5, t1 FMUL ALPHA_I, c6, t3 FSUB t5, t7, s3 STF s1, [X + 0 * SIZE] FMUL ALPHA_I, c5, t2 FMUL ALPHA_R, c6, t4 FADD t8, t6, s4 STF s2, [X + 1 * SIZE] FMUL ALPHA_R, c7, t5 FMUL ALPHA_I, c8, t7 FSUB t1, t3, s5 STF s3, [X + 2 * SIZE] FMUL ALPHA_I, c7, t6 FMUL ALPHA_R, c8, t8 FADD t4, t2, s6 STF s4, [X + 3 * SIZE] FSUB t5, t7, s7 FADD t8, t6, s8 STF s5, [X + 4 * SIZE] STF s6, [X + 5 * SIZE] STF s7, [X + 6 * SIZE] STF s8, [X + 7 * SIZE] add X, 8 * SIZE, X .LL115: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL119 nop .LL116: LDF [X + 0 * SIZE], c1 LDF [X + 1 * SIZE], c2 FMUL ALPHA_R, c1, c3 FMUL ALPHA_I, c1, c4 FMUL ALPHA_I, c2, c1 FMUL ALPHA_R, c2, c2 FSUB c3, c1, c1 FADD c2, c4, c2 STF c1, [X + 0 * SIZE] STF c2, [X + 1 * SIZE] add I, -1, I cmp I, 0 bg,pt %icc, .LL116 add X, 2 * SIZE, X .LL119: return %i7 + 8 clr %o0 .LL150: sra N, 2, I cmp I, 0 ble,pn %icc, .LL155 mov X, XX .LL151: LDF [X + 0 * SIZE], c1 LDF [X + 1 * SIZE], c2 add X, INCX, X LDF [X + 0 * SIZE], c3 FMUL ALPHA_R, c1, c9 LDF [X + 1 * SIZE], c4 FMUL ALPHA_I, c1, c10 add X, INCX, X LDF [X + 0 * SIZE], c5 FMUL ALPHA_I, c2, c1 LDF [X + 1 * SIZE], c6 FMUL ALPHA_R, c2, c2 add X, INCX, X LDF [X + 0 * SIZE], c7 FMUL ALPHA_R, c3, c11 LDF [X + 1 * SIZE], c8 FMUL ALPHA_I, c3, c12 add X, INCX, X FMUL ALPHA_I, c4, c3 FMUL ALPHA_R, c4, c4 FMUL ALPHA_R, c5, c13 FMUL ALPHA_I, c5, c14 FMUL ALPHA_I, c6, c5 FMUL ALPHA_R, c6, c6 FMUL ALPHA_R, c7, c15 FSUB c9, 
c1, c1 FMUL ALPHA_I, c7, c16 FADD c2, c10, c2 FMUL ALPHA_I, c8, c7 FSUB c11, c3, c3 FMUL ALPHA_R, c8, c8 FADD c4, c12, c4 STF c1, [XX + 0 * SIZE] FSUB c13, c5, c5 add I, -1, I STF c2, [XX + 1 * SIZE] FADD c6, c14, c6 add XX, INCX, XX STF c3, [XX + 0 * SIZE] FSUB c15, c7, c7 cmp I, 0 STF c4, [XX + 1 * SIZE] FADD c8, c16, c8 add XX, INCX, XX STF c5, [XX + 0 * SIZE] STF c6, [XX + 1 * SIZE] add XX, INCX, XX STF c7, [XX + 0 * SIZE] STF c8, [XX + 1 * SIZE] bg,pt %icc, .LL151 add XX, INCX, XX .LL155: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL159 nop .LL156: LDF [X + 0 * SIZE], c1 LDF [X + 1 * SIZE], c2 FMUL ALPHA_R, c1, c3 FMUL ALPHA_I, c1, c4 FMUL ALPHA_I, c2, c1 FMUL ALPHA_R, c2, c2 FSUB c3, c1, c1 FADD c2, c4, c2 STF c1, [X + 0 * SIZE] STF c2, [X + 1 * SIZE] add I, -1, I cmp I, 0 bg,pt %icc, .LL156 add X, INCX, X .LL159: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/zswap.S000066400000000000000000000175131313527062700170330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if defined(DOUBLE) && !defined(__64BIT__) #define N %i0 #define X %i1 #define INCX %i2 #define Y %i3 #define INCY %i4 #define I %i5 #else #define N %i0 #define X %i5 #define INCX %i1 #define Y %i2 #define INCY %i3 #define I %i4 #endif #define XX %l0 #define YY %l1 #ifdef DOUBLE #define a1 %f0 #define a2 %f2 #define a3 %f4 #define a4 %f6 #define a5 %f8 #define a6 %f10 #define a7 %f12 #define a8 %f14 #define b1 %f16 #define b2 %f18 #define b3 %f20 #define b4 %f22 #define b5 %f24 #define b6 %f26 #define b7 %f28 #define b8 %f30 #else #define a1 %f0 #define a2 %f1 #define a3 %f2 #define a4 %f3 #define a5 %f4 #define a6 %f5 #define a7 %f6 #define a8 %f7 #define b1 %f8 #define b2 %f9 #define b3 %f10 #define b4 %f11 #define b5 %f12 #define b6 %f13 #define b7 %f14 #define b8 %f15 #endif #ifdef DOUBLE #define PREFETCHSIZE 128 #else #define PREFETCHSIZE 256 #endif PROLOGUE SAVESP #ifndef __64BIT__ #ifdef DOUBLE ld [%sp + STACK_START + 32], X ld [%sp + STACK_START + 36], INCX ld [%sp + STACK_START + 40], Y ld [%sp + STACK_START + 44], INCY #else ld [%sp + STACK_START + 28], INCX ld [%sp + STACK_START + 32], Y ld [%sp + STACK_START + 36], INCY #endif #else ldx [%sp + STACK_START + 56], INCX ldx [%sp + STACK_START + 64], Y ldx [%sp + STACK_START + 72], INCY #endif sll INCX, ZBASE_SHIFT, INCX sll INCY, ZBASE_SHIFT, INCY cmp INCX, 2 * SIZE bne .LL50 nop cmp INCY, 2 * SIZE bne .LL50 nop sra N, 2, I cmp I, 0 ble,pn %icc, .LL15 nop LDF [X + 0 * SIZE], a1 LDF [Y + 0 * SIZE], b1 LDF [X + 1 * SIZE], a2 LDF [Y + 1 * SIZE], b2 LDF [X + 2 * SIZE], a3 LDF [Y + 2 * SIZE], b3 LDF [X + 3 * SIZE], a4 LDF [Y + 3 * SIZE], b4 LDF [X + 4 * SIZE], a5 LDF [Y + 4 * SIZE], b5 LDF [X + 5 * SIZE], a6 LDF [Y + 5 * SIZE], b6 LDF [X + 6 * SIZE], a7 LDF [Y + 6 * SIZE], b7 LDF [X + 7 * SIZE], a8 LDF [Y + 7 * SIZE], b8 deccc I ble,pn %icc, .LL12 nop .LL11: prefetch [X + PREFETCHSIZE * SIZE], 0 deccc I STF a1, [Y + 0 * SIZE] LDF [X + 8 * SIZE], a1 STF b1, [X + 0 * SIZE] LDF [Y + 8 * SIZE], b1 STF a2, [Y + 1 * SIZE] LDF [X + 9 * SIZE], a2 STF b2, [X + 1 * SIZE] LDF [Y + 9 * SIZE], b2 STF a3, [Y + 2 * SIZE] LDF [X + 10 * SIZE], a3 STF b3, [X + 2 * SIZE] LDF [Y + 10 * SIZE], b3 STF a4, [Y + 3 * SIZE] LDF [X + 11 * SIZE], a4 STF b4, [X + 3 * SIZE] LDF [Y + 11 * SIZE], b4 prefetch [Y + PREFETCHSIZE * SIZE], 0 add X, 8 * SIZE, X STF a5, [Y + 4 * SIZE] LDF [X + 4 * SIZE], a5 STF b5, [X - 4 * SIZE] LDF [Y + 12 * SIZE], b5 STF a6, [Y + 5 * SIZE] LDF [X + 5 * SIZE], a6 STF b6, [X - 3 * SIZE] LDF [Y + 13 * SIZE], b6 STF a7, [Y + 6 * SIZE] LDF [X + 6 * SIZE], a7 STF b7, [X - 2 * SIZE] LDF [Y + 14 * SIZE], b7 STF a8, [Y + 7 * SIZE] LDF [X + 7 * SIZE], a8 STF b8, [X - 1 * SIZE] LDF [Y + 15 * SIZE], b8 bg,pt %icc, .LL11 add Y, 8 * SIZE, Y .LL12: STF a1, [Y + 0 * SIZE] STF b1, [X + 0 * SIZE] STF a2, [Y + 1 * SIZE] STF b2, [X + 1 * SIZE] STF a3, [Y + 2 * SIZE] STF b3, [X + 2 * SIZE] STF a4, [Y + 3 * SIZE] STF b4, [X + 3 * SIZE] STF a5, [Y + 4 * SIZE] STF b5, [X + 4 * SIZE] STF a6, [Y + 5 * SIZE] STF b6, [X + 5 * SIZE] STF a7, [Y + 6 * SIZE] STF b7, [X + 6 * SIZE] STF a8, [Y + 7 * SIZE] STF b8, [X + 7 * SIZE] add X, 8 * SIZE, X add Y, 8 * SIZE, Y .LL15: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL19 nop .LL16: LDF [X + 0 * SIZE], a1 add I, -1, I LDF [X + 1 * SIZE], a2 LDF [Y + 0 * SIZE], b1 LDF [Y + 1 * SIZE], b2 cmp I, 0 STF a1, [Y + 0 * SIZE] STF a2, [Y + 1 * SIZE] add Y, 2 * SIZE, Y STF b1, [X + 0 * SIZE] STF b2, [X + 1 * SIZE] bg,pt %icc, 
.LL16 add X, 2 * SIZE, X .LL19: return %i7 + 8 clr %g0 .LL50: sra N, 2, I mov X, XX cmp I, 0 ble,pn %icc, .LL55 mov Y, YY .LL51: LDF [X + 0 * SIZE], a1 LDF [Y + 0 * SIZE], b1 LDF [X + 1 * SIZE], a2 add X, INCX, X LDF [Y + 1 * SIZE], b2 add Y, INCY, Y LDF [X + 0 * SIZE], a3 LDF [Y + 0 * SIZE], b3 LDF [X + 1 * SIZE], a4 add X, INCX, X LDF [Y + 1 * SIZE], b4 add Y, INCY, Y LDF [X + 0 * SIZE], a5 LDF [Y + 0 * SIZE], b5 LDF [X + 1 * SIZE], a6 add X, INCX, X LDF [Y + 1 * SIZE], b6 add Y, INCY, Y LDF [X + 0 * SIZE], a7 LDF [Y + 0 * SIZE], b7 LDF [X + 1 * SIZE], a8 add X, INCX, X LDF [Y + 1 * SIZE], b8 add Y, INCY, Y STF a1, [YY + 0 * SIZE] add I, -1, I STF b1, [XX + 0 * SIZE] cmp I, 0 STF a2, [YY + 1 * SIZE] add YY, INCY, YY STF b2, [XX + 1 * SIZE] add XX, INCX, XX STF a3, [YY + 0 * SIZE] STF b3, [XX + 0 * SIZE] STF a4, [YY + 1 * SIZE] add YY, INCY, YY STF b4, [XX + 1 * SIZE] add XX, INCX, XX STF a5, [YY + 0 * SIZE] STF b5, [XX + 0 * SIZE] STF a6, [YY + 1 * SIZE] add YY, INCY, YY STF b6, [XX + 1 * SIZE] add XX, INCX, XX STF a7, [YY + 0 * SIZE] STF b7, [XX + 0 * SIZE] STF a8, [YY + 1 * SIZE] add YY, INCY, YY STF b8, [XX + 1 * SIZE] bg,pt %icc, .LL51 add XX, INCX, XX .LL55: and N, 3, I cmp I, 0 ble,a,pn %icc, .LL59 nop .LL56: LDF [X + 0 * SIZE], a1 add I, -1, I LDF [X + 1 * SIZE], a2 LDF [Y + 0 * SIZE], b1 cmp I, 0 LDF [Y + 1 * SIZE], b2 STF b1, [X + 0 * SIZE] STF b2, [X + 1 * SIZE] add X, INCX, X STF a1, [Y + 0 * SIZE] STF a2, [Y + 1 * SIZE] bg,pt %icc, .LL56 add Y, INCY, Y .LL59: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/ztrsm_kernel_LN.S000066400000000000000000001101651313527062700207740ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %i0 #define N %i1 #define K %i2 #define A %i5 #define B %i3 #define C %i4 #define LDC %o0 #define AO %o1 #define BO %o2 #define I %o3 #define J %o4 #define L %o5 #define C1 %l0 #define C2 %l1 #define OFFSET %l2 #define KK %l3 #define TEMP1 %l4 #define TEMP2 %l5 #define AORIG %l6 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #define t1 %f32 #define t2 %f34 #define t3 %f36 #define t4 %f38 #define a1 %f40 #define a2 %f42 #define a3 %f44 #define a4 %f46 #define a5 %f62 #define b1 %f48 #define b2 %f50 #define b3 %f52 #define b4 %f54 #define b5 %f56 #define FZERO %f58 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #define t1 %f16 #define t2 %f17 #define t3 %f18 #define t4 %f19 #define a1 %f20 #define a2 %f21 #define a3 %f22 #define a4 %f23 #define a5 %f31 #define b1 %f24 #define b2 %f25 #define b3 %f26 #define b4 %f27 #define b5 %f28 #define FZERO %f29 #endif #define t5 c13 #define t6 c14 #define t7 c15 #define t8 c16 #ifndef CONJ #define FADD1 FADD #define FADD2 FADD #define FADD3 FADD #define FADD4 FSUB #else #if defined(LN) || defined(LT) #define FADD1 FADD #define FADD2 FSUB #define FADD3 FADD #define FADD4 FADD #endif #if defined(RN) || defined(RT) #define FADD1 FADD #define FADD2 FADD #define FADD3 FSUB #define FADD4 FADD #endif #endif #define APREFETCHSIZE 40 #define BPREFETCHSIZE 40 #define APREFETCH_CATEGORY 0 #define BPREFETCH_CATEGORY 0 PROLOGUE SAVESP #ifndef __64BIT__ #ifdef DOUBLE ld [%sp + STACK_START + 32], A ld [%sp + STACK_START + 36], B ld [%sp + STACK_START + 40], C ld [%sp + STACK_START + 44], LDC ld [%sp + STACK_START + 48], OFFSET #else ld [%sp + STACK_START + 28], B ld [%sp + STACK_START + 32], C ld [%sp + STACK_START + 36], LDC ld [%sp + STACK_START + 40], OFFSET #endif #else ldx [%sp+ STACK_START + 56], B ldx [%sp+ STACK_START + 64], C ldx [%sp+ STACK_START + 72], LDC ldx [%sp+ STACK_START + 80], OFFSET #endif #ifdef DOUBLE FCLR(27) #else FCLR(29) #endif sll LDC, ZBASE_SHIFT, LDC #ifdef LN smul M, K, TEMP1 sll TEMP1, ZBASE_SHIFT, TEMP1 add A, TEMP1, A sll M, ZBASE_SHIFT, TEMP1 add C, TEMP1, C #endif #ifdef RN neg OFFSET, KK #endif #ifdef RT smul N, K, TEMP1 sll TEMP1, ZBASE_SHIFT, TEMP1 add B, TEMP1, B smul N, LDC, TEMP1 add C, TEMP1, C sub N, OFFSET, KK #endif sra N, 1, J cmp J, 0 ble,pn %icc, .LL100 nop .LL11: #ifdef RT sll K, 1 + ZBASE_SHIFT, TEMP1 sub B, TEMP1, B add LDC, LDC, TEMP1 sub C, TEMP1, C #endif mov C, C1 add C, LDC, C2 #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif #ifndef RT add C2, LDC, C #endif and M, 1, I cmp I, 0 ble,pn %icc, .LL50 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 0 + ZBASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 0 + ZBASE_SHIFT, TEMP1 sll KK, 1 + ZBASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif FMOV FZERO, c02 FMOV FZERO, t1 FMOV FZERO, c04 LDF [AO + 0 * SIZE], a1 FMOV 
FZERO, t2 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c06 LDF [AO + 1 * SIZE], a2 FMOV FZERO, t3 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c08 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t4 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c01 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c03 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c05 ble,pn %icc, .LL55 FMOV FZERO, c07 .LL52: FADD2 c02, t1, c02 add AO, 8 * SIZE, AO prefetch [AO + APREFETCHSIZE * SIZE], 0 FMUL a1, b1, t1 add BO, 16 * SIZE, BO FADD4 c04, t2, c04 add L, -1, L FMUL a1, b2, t2 FADD2 c06, t3, c06 cmp L, 0 FMUL a1, b3, t3 FADD4 c08, t4, c08 FMUL a1, b4, t4 LDF [AO - 4 * SIZE], a1 FADD1 c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 12 * SIZE], b1 FADD3 c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 11 * SIZE], b2 FADD1 c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 10 * SIZE], b3 FADD3 c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 9 * SIZE], b4 FADD2 c02, t1, c02 FMUL a3, b1, t1 LDF [AO - 3 * SIZE], a2 FADD4 c04, t2, c04 FMUL a3, b2, t2 FADD2 c06, t3, c06 FMUL a3, b3, t3 FADD4 c08, t4, c08 FMUL a3, b4, t4 LDF [AO - 2 * SIZE], a3 FADD1 c01, t1, c01 FMUL a4, b1, t1 LDF [BO - 8 * SIZE], b1 FADD3 c03, t2, c03 FMUL a4, b2, t2 LDF [BO - 7 * SIZE], b2 FADD1 c05, t3, c05 FMUL a4, b3, t3 LDF [BO - 6 * SIZE], b3 FADD3 c07, t4, c07 FMUL a4, b4, t4 LDF [BO - 5 * SIZE], b4 FADD2 c02, t1, c02 FMUL a1, b1, t1 LDF [AO - 1 * SIZE], a4 FADD4 c04, t2, c04 FMUL a1, b2, t2 FADD2 c06, t3, c06 FMUL a1, b3, t3 FADD4 c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD1 c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 4 * SIZE], b1 FADD3 c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 3 * SIZE], b2 FADD1 c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 2 * SIZE], b3 FADD3 c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 1 * SIZE], b4 FADD2 c02, t1, c02 FMUL a3, b1, t1 LDF [AO + 1 * SIZE], a2 FADD4 c04, t2, c04 FMUL a3, b2, t2 FADD2 c06, t3, c06 FMUL a3, b3, t3 FADD4 c08, t4, c08 FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD1 c01, t1, c01 FMUL a4, b1, t1 LDF [BO + 0 * SIZE], b1 FADD3 c03, t2, c03 FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD1 c05, t3, c05 FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD3 c07, t4, c07 FMUL a4, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL52 LDF [AO + 3 * SIZE], a4 .LL55: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL59 nop .LL56: FADD2 c02, t1, c02 add AO, 2 * SIZE, AO FMUL a1, b1, t1 add L, -1, L add BO, 4 * SIZE, BO FADD4 c04, t2, c04 cmp L, 0 FMUL a1, b2, t2 FADD2 c06, t3, c06 FMUL a1, b3, t3 FADD4 c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD1 c01, t1, c01 FMUL a2, b1, t1 LDF [BO + 0 * SIZE], b1 FADD3 c03, t2, c03 FMUL a2, b2, t2 LDF [BO + 1 * SIZE], b2 FADD1 c05, t3, c05 FMUL a2, b3, t3 LDF [BO + 2 * SIZE], b3 FADD3 c07, t4, c07 FMUL a2, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL56 LDF [AO + 1 * SIZE], a2 .LL59: #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 2, TEMP1 #endif sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif FADD2 c02, t1, c02 FADD4 c04, t2, c04 FADD2 c06, t3, c06 FADD4 c08, t4, c08 FADD c01, c04, c01 FADD c02, c03, c02 FADD c05, c08, c05 FADD c06, c07, c06 #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c05, c05 FSUB a4, c06, c06 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c05, c05 FSUB a4, c06, c06 #endif #ifdef LN LDF [AO + 0 * SIZE], a1 
LDF [AO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FMUL a1, c05, t5 FMUL a2, c06, t6 FMUL a1, c06, t7 FMUL a2, c05, t8 FADD4 t1, t2, c01 FADD2 t3, t4, c02 FADD4 t5, t6, c05 FADD2 t7, t8, c06 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FMUL a1, c05, t5 FMUL a2, c06, t6 FMUL a1, c06, t7 FMUL a2, c05, t8 FADD4 t1, t2, c01 FADD2 t3, t4, c02 FADD4 t5, t6, c05 FADD2 t7, t8, c06 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 6 * SIZE], b1 LDF [BO + 7 * SIZE], b2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FADD4 t1, t2, c01 FADD3 t3, t4, c02 FMUL a3, c01, t1 FMUL a3, c02, t2 FMUL a4, c02, t3 FMUL a4, c01, t4 FSUB c05, t1, c05 FSUB c06, t2, c06 FADD3 c05, t3, c05 FADD4 c06, t4, c06 FMUL b1, c05, t1 FMUL b2, c06, t2 FMUL b1, c06, t3 FMUL b2, c05, t4 FADD4 t1, t2, c05 FADD3 t3, t4, c06 #endif #ifdef RT LDF [BO + 6 * SIZE], a1 LDF [BO + 7 * SIZE], a2 LDF [BO + 4 * SIZE], a3 LDF [BO + 5 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FMUL a1, c05, t1 FMUL a2, c06, t2 FMUL a1, c06, t3 FMUL a2, c05, t4 FADD4 t1, t2, c05 FADD3 t3, t4, c06 FMUL a3, c05, t1 FMUL a3, c06, t2 FMUL a4, c06, t3 FMUL a4, c05, t4 FSUB c01, t1, c01 FSUB c02, t2, c02 FADD3 c01, t3, c01 FADD4 c02, t4, c02 FMUL b1, c01, t1 FMUL b2, c02, t2 FMUL b1, c02, t3 FMUL b2, c01, t4 FADD4 t1, t2, c01 FADD3 t3, t4, c02 #endif #ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c06, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c05, [AO + 2 * SIZE] STF c06, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c05, [C2 + 0 * SIZE] STF c06, [C2 + 1 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 #endif #ifdef RT sll K, 0 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .LL50: sra M, 1, I cmp I, 0 ble,pn %icc, .LL99 nop .LL21: #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 1 + ZBASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 1 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 FMOV FZERO, c01 FMOV FZERO, c02 LDF [AO + 0 * SIZE], a1 FMOV FZERO, c03 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c04 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c05 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c06 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c07 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c08 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c09 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c10 LDF [BO + 4 * SIZE], b5 FMOV FZERO, c11 LDF [AO + 4 * SIZE], a5 FMOV FZERO, c12 #ifdef LN prefetch [C1 - 3 * SIZE], 3 FMOV FZERO, c13 prefetch [C2 - 3 * SIZE], 3 FMOV FZERO, c14 #else prefetch [C1 + 3 * SIZE], 3 FMOV FZERO, c13 prefetch [C2 + 3 * SIZE], 3 FMOV FZERO, c14 #endif FMOV FZERO, c15 ble,pn %icc, .LL25 FMOV FZERO, c16 .LL22: FADD2 c04, t1, c04 prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY FMUL a1, b1, t1 nop FADD4 c08, t2, c08 prefetch [BO + 
BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY FMUL a1, b2, t2 add AO, 16 * SIZE, AO FADD2 c12, t3, c12 LDF [AO - 13 * SIZE], a4 FMUL a1, b3, t3 add BO, 16 * SIZE, BO FADD4 c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO - 8 * SIZE], a1 FADD1 c01, t1, c01 nop FMUL a2, b1, t1 nop FADD3 c05, t2, c05 nop FMUL a2, b2, t2 nop FADD1 c09, t3, c09 nop FMUL a2, b3, t3 nop FADD3 c13, t4, c13 add L, -1, L FMUL a2, b4, t4 LDF [AO - 11 * SIZE], a2 FADD2 c02, t1, c02 nop FMUL a3, b1, t1 nop FADD4 c06, t2, c06 nop FMUL a3, b2, t2 nop FADD2 c10, t3, c10 nop FMUL a3, b3, t3 nop FADD4 c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 10 * SIZE], a3 FADD1 c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO - 8 * SIZE], b1 FADD3 c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 11 * SIZE], b2 FADD1 c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 10 * SIZE], b3 FADD3 c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 9 * SIZE], b4 FADD2 c04, t1, c04 nop FMUL a5, b5, t1 LDF [AO - 9 * SIZE], a4 FADD4 c08, t2, c08 nop FMUL a5, b2, t2 nop FADD2 c12, t3, c12 nop FMUL a5, b3, t3 nop FADD4 c16, t4, c16 nop FMUL a5, b4, t4 LDF [AO - 4 * SIZE], a5 FADD1 c01, t1, c01 nop FMUL a2, b5, t1 nop FADD3 c05, t2, c05 nop FMUL a2, b2, t2 nop FADD1 c09, t3, c09 nop FMUL a2, b3, t3 nop FADD3 c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO - 7 * SIZE], a2 FADD2 c02, t1, c02 nop FMUL a3, b5, t1 nop FADD4 c06, t2, c06 nop FMUL a3, b2, t2 nop FADD2 c10, t3, c10 nop FMUL a3, b3, t3 nop FADD4 c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 6 * SIZE], a3 FADD1 c03, t1, c03 nop FMUL a4, b5, t1 LDF [BO - 4 * SIZE], b5 FADD3 c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 7 * SIZE], b2 FADD1 c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 6 * SIZE], b3 FADD3 c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 5 * SIZE], b4 FADD2 c04, t1, c04 nop FMUL a1, b1, t1 LDF [AO - 5 * SIZE], a4 FADD4 c08, t2, c08 nop FMUL a1, b2, t2 nop FADD2 c12, t3, c12 nop FMUL a1, b3, t3 nop FADD4 c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO - 0 * SIZE], a1 FADD1 c01, t1, c01 nop FMUL a2, b1, t1 nop #ifdef DOUBLE prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY #else nop #endif FADD3 c05, t2, c05 nop FMUL a2, b2, t2 FADD1 c09, t3, c09 nop FMUL a2, b3, t3 nop FADD3 c13, t4, c13 nop FMUL a2, b4, t4 nop FADD2 c02, t1, c02 nop FMUL a3, b1, t1 LDF [AO - 3 * SIZE], a2 FADD4 c06, t2, c06 #ifdef DOUBLE prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY #else nop #endif FMUL a3, b2, t2 nop FADD2 c10, t3, c10 nop FMUL a3, b3, t3 nop FADD4 c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 2 * SIZE], a3 FADD1 c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO - 0 * SIZE], b1 FADD3 c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 3 * SIZE], b2 FADD1 c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD3 c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 1 * SIZE], b4 FADD2 c04, t1, c04 nop FMUL a5, b5, t1 LDF [AO - 1 * SIZE], a4 FADD4 c08, t2, c08 FMUL a5, b2, t2 FADD2 c12, t3, c12 FMUL a5, b3, t3 FADD4 c16, t4, c16 nop FMUL a5, b4, t4 LDF [AO + 4 * SIZE], a5 FADD1 c01, t1, c01 nop FMUL a2, b5, t1 nop FADD3 c05, t2, c05 nop FMUL a2, b2, t2 nop FADD1 c09, t3, c09 nop FMUL a2, b3, t3 nop FADD3 c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD2 c02, t1, c02 nop FMUL a3, b5, t1 nop FADD4 c06, t2, c06 nop FMUL a3, b2, t2 nop FADD2 c10, t3, c10 nop FMUL a3, b3, t3 nop FADD4 c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD1 c03, t1, c03 cmp L, 0 FMUL a4, b5, t1 LDF [BO + 4 * SIZE], b5 FADD3 c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD1 c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD3 c15, t4, c15 FMUL a4, 
b4, t4 bg,pt %icc, .LL22 LDF [BO + 3 * SIZE], b4 .LL25: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,pn %icc, .LL29 nop .LL26: FADD2 c04, t1, c04 LDF [AO + 3 * SIZE], a4 FMUL a1, b1, t1 add AO, 4 * SIZE, AO FADD4 c08, t2, c08 add BO, 4 * SIZE, BO FMUL a1, b2, t2 add L, -1, L FADD2 c12, t3, c12 nop FMUL a1, b3, t3 cmp L, 0 FADD4 c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD1 c01, t1, c01 nop FMUL a2, b1, t1 nop FADD3 c05, t2, c05 nop FMUL a2, b2, t2 nop FADD1 c09, t3, c09 nop FMUL a2, b3, t3 nop FADD3 c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD2 c02, t1, c02 nop FMUL a3, b1, t1 nop FADD4 c06, t2, c06 nop FMUL a3, b2, t2 nop FADD2 c10, t3, c10 nop FMUL a3, b3, t3 nop FADD4 c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD1 c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO + 0 * SIZE], b1 FADD3 c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD1 c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD3 c15, t4, c15 FMUL a4, b4, t4 bg,pt %icc, .LL26 LDF [BO + 3 * SIZE], b4 .LL29: #if defined(LN) || defined(RT) sub KK, 2, TEMP1 sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO #endif FADD2 c04, t1, c04 FADD4 c08, t2, c08 FADD2 c12, t3, c12 FADD4 c16, t4, c16 FADD c01, c06, c01 FADD c02, c05, c02 FADD c03, c08, c03 FADD c04, c07, c04 FADD c09, c14, c09 FADD c10, c13, c10 FADD c11, c16, c11 FADD c12, c15, c12 #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c09, c09 FSUB a4, c10, c10 FSUB b1, c03, c03 FSUB b2, c04, c04 FSUB b3, c11, c11 FSUB b4, c12, c12 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c09, c09 FSUB b2, c10, c10 FSUB b3, c11, c11 FSUB b4, c12, c12 #endif #ifdef LN LDF [AO + 6 * SIZE], a1 LDF [AO + 7 * SIZE], a2 LDF [AO + 4 * SIZE], a3 LDF [AO + 5 * SIZE], a4 LDF [AO + 0 * SIZE], b1 LDF [AO + 1 * SIZE], b2 FMUL a1, c03, t1 FMUL a2, c04, t2 FMUL a1, c04, t3 FMUL a2, c03, t4 FMUL a1, c11, t5 FMUL a2, c12, t6 FMUL a1, c12, t7 FMUL a2, c11, t8 FADD4 t1, t2, c03 FADD2 t3, t4, c04 FADD4 t5, t6, c11 FADD2 t7, t8, c12 FMUL a3, c03, t1 FMUL a3, c04, t2 FMUL a3, c11, t3 FMUL a3, c12, t4 FMUL a4, c04, t5 FMUL a4, c03, t6 FMUL a4, c12, t7 FMUL a4, c11, t8 FSUB c01, t1, c01 FSUB c02, t2, c02 FSUB c09, t3, c09 FSUB c10, t4, c10 FADD2 c01, t5, c01 FADD4 c02, t6, c02 FADD2 c09, t7, c09 FADD4 c10, t8, c10 FMUL b1, c01, t1 FMUL b2, c02, t2 FMUL b1, c02, t3 FMUL b2, c01, t4 FMUL b1, c09, t5 FMUL b2, c10, t6 FMUL b1, c10, t7 FMUL b2, c09, t8 FADD4 t1, t2, c01 FADD2 t3, t4, c02 FADD4 t5, t6, c09 FADD2 t7, t8, c10 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 6 * SIZE], b1 LDF [AO + 7 * SIZE], b2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FMUL a1, c09, t5 FMUL a2, c10, t6 FMUL a1, c10, t7 FMUL a2, c09, t8 FADD4 t1, t2, c01 FADD2 t3, t4, c02 FADD4 t5, t6, c09 FADD2 t7, t8, c10 FMUL a3, c01, t1 FMUL a3, c02, t2 FMUL a3, c09, t3 FMUL a3, c10, t4 FMUL a4, c02, t5 FMUL a4, c01, t6 FMUL a4, c10, t7 FMUL a4, c09, t8 FSUB c03, t1, c03 FSUB c04, t2, c04 
FSUB c11, t3, c11 FSUB c12, t4, c12 FADD2 c03, t5, c03 FADD4 c04, t6, c04 FADD2 c11, t7, c11 FADD4 c12, t8, c12 FMUL b1, c03, t1 FMUL b2, c04, t2 FMUL b1, c04, t3 FMUL b2, c03, t4 FMUL b1, c11, t5 FMUL b2, c12, t6 FMUL b1, c12, t7 FMUL b2, c11, t8 FADD4 t1, t2, c03 FADD2 t3, t4, c04 FADD4 t5, t6, c11 FADD2 t7, t8, c12 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 6 * SIZE], b1 LDF [BO + 7 * SIZE], b2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FMUL a1, c03, t5 FMUL a2, c04, t6 FMUL a1, c04, t7 FMUL a2, c03, t8 FADD4 t1, t2, c01 FADD3 t3, t4, c02 FADD4 t5, t6, c03 FADD3 t7, t8, c04 FMUL a3, c01, t1 FMUL a3, c02, t2 FMUL a3, c03, t3 FMUL a3, c04, t4 FMUL a4, c02, t5 FMUL a4, c01, t6 FMUL a4, c04, t7 FMUL a4, c03, t8 FSUB c09, t1, c09 FSUB c10, t2, c10 FSUB c11, t3, c11 FSUB c12, t4, c12 FADD3 c09, t5, c09 FADD4 c10, t6, c10 FADD3 c11, t7, c11 FADD4 c12, t8, c12 FMUL b1, c09, t1 FMUL b2, c10, t2 FMUL b1, c10, t3 FMUL b2, c09, t4 FMUL b1, c11, t5 FMUL b2, c12, t6 FMUL b1, c12, t7 FMUL b2, c11, t8 FADD4 t1, t2, c09 FADD3 t3, t4, c10 FADD4 t5, t6, c11 FADD3 t7, t8, c12 #endif #ifdef RT LDF [BO + 6 * SIZE], a1 LDF [BO + 7 * SIZE], a2 LDF [BO + 4 * SIZE], a3 LDF [BO + 5 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FMUL a1, c09, t1 FMUL a2, c10, t2 FMUL a1, c10, t3 FMUL a2, c09, t4 FMUL a1, c11, t5 FMUL a2, c12, t6 FMUL a1, c12, t7 FMUL a2, c11, t8 FADD4 t1, t2, c09 FADD3 t3, t4, c10 FADD4 t5, t6, c11 FADD3 t7, t8, c12 FMUL a3, c09, t1 FMUL a3, c10, t2 FMUL a3, c11, t3 FMUL a3, c12, t4 FMUL a4, c10, t5 FMUL a4, c09, t6 FMUL a4, c12, t7 FMUL a4, c11, t8 FSUB c01, t1, c01 FSUB c02, t2, c02 FSUB c03, t3, c03 FSUB c04, t4, c04 FADD3 c01, t5, c01 FADD4 c02, t6, c02 FADD3 c03, t7, c03 FADD4 c04, t8, c04 FMUL b1, c01, t1 FMUL b2, c02, t2 FMUL b1, c02, t3 FMUL b2, c01, t4 FMUL b1, c03, t5 FMUL b2, c04, t6 FMUL b1, c04, t7 FMUL b2, c03, t8 FADD4 t1, t2, c01 FADD3 t3, t4, c02 FADD4 t5, t6, c03 FADD3 t7, t8, c04 #endif #ifdef LN add C1, -4 * SIZE, C1 add C2, -4 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c09, [BO + 2 * SIZE] STF c10, [BO + 3 * SIZE] STF c03, [BO + 4 * SIZE] STF c04, [BO + 5 * SIZE] STF c11, [BO + 6 * SIZE] STF c12, [BO + 7 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] STF c09, [AO + 4 * SIZE] STF c10, [AO + 5 * SIZE] STF c11, [AO + 6 * SIZE] STF c12, [AO + 7 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C1 + 2 * SIZE] STF c04, [C1 + 3 * SIZE] STF c09, [C2 + 0 * SIZE] STF c10, [C2 + 1 * SIZE] STF c11, [C2 + 2 * SIZE] STF c12, [C2 + 3 * SIZE] #ifndef LN add C1, 4 * SIZE, C1 add C2, 4 * SIZE, C2 #endif #ifdef RT sll K, 1 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 add AO, TEMP1, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL21 nop .LL99: #ifdef LN sll K, 1 + ZBASE_SHIFT, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 2, KK #endif #ifdef RT sub KK, 2, KK #endif add J, -1, J cmp J, 0 bg,pt %icc, .LL11 nop .LL100: and N, 1, J cmp J, 0 ble,pn %icc, .LL999 nop #ifdef RT sll K, 0 + ZBASE_SHIFT, TEMP1 sub B, TEMP1, B sub C, LDC, C #endif mov C, C1 #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if 
defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif #ifndef RT add C, LDC, C #endif and M, 1, I cmp I, 0 ble,pn %icc, .LL150 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 0 + ZBASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 0 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 ble,pn %icc, .LL155 nop .LL152: FADD1 c01, t1, c01 add L, -1, L FMUL a1, b1, t1 prefetch [AO + APREFETCHSIZE * SIZE], 0 FADD3 c02, t2, c02 add BO, 8 * SIZE, BO FMUL a1, b2, t2 LDF [AO + 4 * SIZE], a1 FADD2 c03, t3, c03 cmp L, 0 FMUL a2, b1, t3 LDF [BO - 4 * SIZE], b1 FADD4 c04, t4, c04 nop FMUL a2, b2, t4 LDF [AO + 5 * SIZE], a2 FADD1 c01, t1, c01 nop FMUL a3, b3, t1 LDF [BO - 3 * SIZE], b2 FADD3 c02, t2, c02 nop FMUL a3, b4, t2 LDF [AO + 6 * SIZE], a3 FADD2 c03, t3, c03 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD4 c04, t4, c04 nop FMUL a4, b4, t4 LDF [AO + 7 * SIZE], a4 FADD1 c01, t1, c01 nop FMUL a1, b1, t1 LDF [BO - 1 * SIZE], b4 FADD3 c02, t2, c02 FMUL a1, b2, t2 LDF [AO + 8 * SIZE], a1 FADD2 c03, t3, c03 FMUL a2, b1, t3 LDF [BO + 0 * SIZE], b1 FADD4 c04, t4, c04 FMUL a2, b2, t4 LDF [AO + 9 * SIZE], a2 FADD1 c01, t1, c01 FMUL a3, b3, t1 LDF [BO + 1 * SIZE], b2 FADD3 c02, t2, c02 FMUL a3, b4, t2 LDF [AO + 10 * SIZE], a3 FADD2 c03, t3, c03 FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD4 c04, t4, c04 FMUL a4, b4, t4 LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO bg,pt %icc, .LL152 LDF [BO + 3 * SIZE], b4 .LL155: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL159 nop .LL156: FADD1 c01, t1, c01 add AO, 2 * SIZE, AO FMUL a1, b1, t1 add BO, 2 * SIZE, BO FADD3 c02, t2, c02 add L, -1, L FMUL a1, b2, t2 LDF [AO + 0 * SIZE], a1 FADD2 c03, t3, c03 FMUL a2, b1, t3 LDF [BO + 0 * SIZE], b1 cmp L, 0 FADD4 c04, t4, c04 FMUL a2, b2, t4 LDF [BO + 1 * SIZE], b2 bg,pt %icc, .LL156 LDF [AO + 1 * SIZE], a2 .LL159: FADD1 c01, t1, c01 FADD3 c02, t2, c02 FADD2 c03, t3, c03 FADD4 c04, t4, c04 FADD c01, c04, c01 FADD c02, c03, c02 #if defined(LN) || defined(RT) sub KK, 1, TEMP1 sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #endif #ifdef LN LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FADD4 t1, t2, c01 FADD2 t3, t4, c02 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FADD4 t1, t2, c01 FADD2 t3, t4, c02 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FADD4 t1, t2, c01 FADD3 t3, t4, c02 #endif #ifdef RT LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FADD4 t1, t2, c01 FADD3 t3, t4, c02 #endif #ifdef LN add C1, -2 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] #else STF c01, 
[AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 2 * SIZE, C1 #endif #ifdef RT sll K, 0 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 add AO, TEMP1, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .LL150: sra M, 1, I cmp I, 0 ble,pn %icc, .LL199 nop .LL121: #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 1 + ZBASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 1 + ZBASE_SHIFT, TEMP1 sll KK, 0 + ZBASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif FMOV FZERO, c03 LDF [AO + 0 * SIZE], a1 FMOV FZERO, t1 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c07 LDF [AO + 1 * SIZE], a2 FMOV FZERO, t2 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c04 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t3 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c08 LDF [AO + 3 * SIZE], a4 FMOV FZERO, t4 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c01 #ifdef LN prefetch [C1 - 3 * SIZE], 3 #else prefetch [C1 + 3 * SIZE], 3 #endif FMOV FZERO, c05 FMOV FZERO, c02 ble,pn %icc, .LL125 FMOV FZERO, c06 .LL122: FADD1 c03, t1, c03 add L, -1, L FMUL a1, b1, t1 prefetch [AO + APREFETCHSIZE * SIZE], 0 FADD3 c07, t2, c07 add BO, 8 * SIZE, BO FMUL a1, b2, t2 LDF [AO + 4 * SIZE], a1 FADD2 c04, t3, c04 add AO, 16 * SIZE, AO FMUL a2, b1, t3 cmp L, 0 FADD4 c08, t4, c08 nop FMUL a2, b2, t4 LDF [AO - 11 * SIZE], a2 FADD1 c01, t1, c01 nop FMUL a3, b1, t1 nop FADD3 c05, t2, c05 nop FMUL a3, b2, t2 LDF [AO - 10 * SIZE], a3 FADD2 c02, t3, c02 nop FMUL a4, b1, t3 LDF [BO - 4 * SIZE], b1 FADD4 c06, t4, c06 nop FMUL a4, b2, t4 LDF [BO - 3 * SIZE], b2 FADD1 c03, t1, c03 nop FMUL a1, b3, t1 LDF [AO - 9 * SIZE], a4 FADD3 c07, t2, c07 nop FMUL a1, b4, t2 LDF [AO - 8 * SIZE], a1 FADD2 c04, t3, c04 nop FMUL a2, b3, t3 nop FADD4 c08, t4, c08 nop FMUL a2, b4, t4 LDF [AO - 7 * SIZE], a2 FADD1 c01, t1, c01 nop FMUL a3, b3, t1 nop FADD3 c05, t2, c05 nop FMUL a3, b4, t2 LDF [AO - 6 * SIZE], a3 FADD2 c02, t3, c02 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD4 c06, t4, c06 nop FMUL a4, b4, t4 LDF [BO - 1 * SIZE], b4 FADD1 c03, t1, c03 nop FMUL a1, b1, t1 LDF [AO - 5 * SIZE], a4 FADD3 c07, t2, c07 nop FMUL a1, b2, t2 LDF [AO - 4 * SIZE], a1 FADD2 c04, t3, c04 nop FMUL a2, b1, t3 nop FADD4 c08, t4, c08 nop FMUL a2, b2, t4 LDF [AO - 3 * SIZE], a2 FADD1 c01, t1, c01 nop FMUL a3, b1, t1 nop FADD3 c05, t2, c05 nop FMUL a3, b2, t2 LDF [AO - 2 * SIZE], a3 FADD2 c02, t3, c02 nop FMUL a4, b1, t3 LDF [BO + 0 * SIZE], b1 FADD4 c06, t4, c06 nop FMUL a4, b2, t4 LDF [BO + 1 * SIZE], b2 FADD1 c03, t1, c03 nop FMUL a1, b3, t1 LDF [AO - 1 * SIZE], a4 FADD3 c07, t2, c07 nop FMUL a1, b4, t2 LDF [AO + 0 * SIZE], a1 FADD2 c04, t3, c04 nop FMUL a2, b3, t3 nop FADD4 c08, t4, c08 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD1 c01, t1, c01 nop FMUL a3, b3, t1 nop FADD3 c05, t2, c05 nop FMUL a3, b4, t2 LDF [AO + 2 * SIZE], a3 FADD2 c02, t3, c02 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD4 c06, t4, c06 FMUL a4, b4, t4 LDF [AO + 3 * SIZE], a4 bg,pt %icc, .LL122 LDF [BO + 3 * SIZE], b4 .LL125: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL129 nop .LL126: FADD1 c03, t1, c03 add AO, 4 * SIZE, AO FMUL a1, b1, t1 add BO, 2 * SIZE, BO FADD3 c07, t2, c07 add L, -1, L FMUL a1, b2, t2 LDF [AO + 0 * SIZE], a1 FADD2 c04, t3, c04 cmp L, 0 
FMUL a2, b1, t3 FADD4 c08, t4, c08 FMUL a2, b2, t4 LDF [AO + 1 * SIZE], a2 FADD1 c01, t1, c01 FMUL a3, b1, t1 FADD3 c05, t2, c05 FMUL a3, b2, t2 LDF [AO + 2 * SIZE], a3 FADD2 c02, t3, c02 FMUL a4, b1, t3 LDF [BO + 0 * SIZE], b1 FADD4 c06, t4, c06 FMUL a4, b2, t4 LDF [BO + 1 * SIZE], b2 bg,pt %icc, .LL126 LDF [AO + 3 * SIZE], a4 .LL129: FADD1 c03, t1, c03 FADD3 c07, t2, c07 FADD2 c04, t3, c04 FADD4 c08, t4, c08 FADD c01, c06, c01 FADD c02, c05, c02 FADD c03, c08, c03 FADD c04, c07, c04 #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 1, TEMP1 #endif sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #endif #ifdef LN LDF [AO + 6 * SIZE], a1 LDF [AO + 7 * SIZE], a2 LDF [AO + 4 * SIZE], a3 LDF [AO + 5 * SIZE], a4 LDF [AO + 0 * SIZE], b1 LDF [AO + 1 * SIZE], b2 FMUL a1, c03, t1 FMUL a2, c04, t2 FMUL a1, c04, t3 FMUL a2, c03, t4 FADD4 t1, t2, c03 FADD2 t3, t4, c04 FMUL a3, c03, t1 FMUL a3, c04, t2 FMUL a4, c04, t5 FMUL a4, c03, t6 FSUB c01, t1, c01 FSUB c02, t2, c02 FADD2 c01, t5, c01 FADD4 c02, t6, c02 FMUL b1, c01, t1 FMUL b2, c02, t2 FMUL b1, c02, t3 FMUL b2, c01, t4 FADD4 t1, t2, c01 FADD2 t3, t4, c02 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 6 * SIZE], b1 LDF [AO + 7 * SIZE], b2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FADD4 t1, t2, c01 FADD2 t3, t4, c02 FMUL a3, c01, t1 FMUL a3, c02, t2 FMUL a4, c02, t5 FMUL a4, c01, t6 FSUB c03, t1, c03 FSUB c04, t2, c04 FADD2 c03, t5, c03 FADD4 c04, t6, c04 FMUL b1, c03, t1 FMUL b2, c04, t2 FMUL b1, c04, t3 FMUL b2, c03, t4 FADD4 t1, t2, c03 FADD2 t3, t4, c04 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FMUL a1, c03, t5 FMUL a2, c04, t6 FMUL a1, c04, t7 FMUL a2, c03, t8 FADD4 t1, t2, c01 FADD3 t3, t4, c02 FADD4 t5, t6, c03 FADD3 t7, t8, c04 #endif #ifdef RT LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FMUL a1, c03, t5 FMUL a2, c04, t6 FMUL a1, c04, t7 FMUL a2, c03, t8 FADD4 t1, t2, c01 FADD3 t3, t4, c02 FADD4 t5, t6, c03 FADD3 t7, t8, c04 #endif #ifdef LN add C1, -4 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c03, [BO + 2 * SIZE] STF c04, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C1 + 2 * SIZE] STF c04, [C1 + 3 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 4 * SIZE, C1 #endif #ifdef RT sll K, 1 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL121 FMOV FZERO, c03 .LL199: #ifdef LN sll K, 0 + ZBASE_SHIFT, TEMP1 add B, TEMP1, B #endif #if 
defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 1, KK #endif #ifdef RT sub KK, 1, KK #endif .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/ztrsm_kernel_LT.S000066400000000000000000001101331313527062700207750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %i0 #define N %i1 #define K %i2 #define A %i5 #define B %i3 #define C %i4 #define LDC %o0 #define AO %o1 #define BO %o2 #define I %o3 #define J %o4 #define L %o5 #define C1 %l0 #define C2 %l1 #define OFFSET %l2 #define KK %l3 #define TEMP1 %l4 #define TEMP2 %l5 #define AORIG %l6 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #define t1 %f32 #define t2 %f34 #define t3 %f36 #define t4 %f38 #define a1 %f40 #define a2 %f42 #define a3 %f44 #define a4 %f46 #define a5 %f62 #define b1 %f48 #define b2 %f50 #define b3 %f52 #define b4 %f54 #define b5 %f56 #define FZERO %f58 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #define t1 %f16 #define t2 %f17 #define t3 %f18 #define t4 %f19 #define a1 %f20 #define a2 %f21 #define a3 %f22 #define a4 %f23 #define a5 %f31 #define b1 %f24 #define b2 %f25 #define b3 %f26 #define b4 %f27 #define b5 %f28 #define FZERO %f29 #endif #define t5 c13 #define t6 c14 #define t7 c15 #define t8 c16 #ifndef CONJ #define FADD1 FADD #define FADD2 FADD #define FADD3 FADD #define FADD4 FSUB #else #if defined(LN) || defined(LT) #define FADD1 FADD #define FADD2 FSUB #define FADD3 FADD #define FADD4 FADD #endif #if defined(RN) || defined(RT) #define FADD1 FADD #define FADD2 FADD #define FADD3 FSUB #define FADD4 FADD #endif #endif #define APREFETCHSIZE 40 #define BPREFETCHSIZE 40 #define APREFETCH_CATEGORY 0 #define BPREFETCH_CATEGORY 0 PROLOGUE SAVESP #ifndef __64BIT__ #ifdef DOUBLE ld [%sp + STACK_START + 32], A ld [%sp + STACK_START + 36], B ld [%sp + STACK_START + 40], C ld [%sp + STACK_START + 44], LDC ld [%sp + STACK_START + 48], OFFSET #else ld [%sp + STACK_START + 28], B ld [%sp + STACK_START + 32], C ld [%sp + STACK_START + 36], LDC ld [%sp + STACK_START + 40], OFFSET #endif #else ldx [%sp+ STACK_START + 56], B ldx [%sp+ STACK_START + 64], C ldx [%sp+ STACK_START + 72], LDC ldx [%sp+ STACK_START + 80], OFFSET #endif #ifdef DOUBLE FCLR(27) #else FCLR(29) #endif sll LDC, ZBASE_SHIFT, LDC #ifdef LN smul M, K, TEMP1 sll TEMP1, ZBASE_SHIFT, TEMP1 add A, TEMP1, A sll M, ZBASE_SHIFT, TEMP1 add C, TEMP1, C #endif #ifdef RN neg OFFSET, KK #endif #ifdef RT smul N, K, TEMP1 sll TEMP1, ZBASE_SHIFT, TEMP1 add B, TEMP1, B smul N, LDC, TEMP1 add C, TEMP1, C sub N, OFFSET, KK #endif sra N, 1, J cmp J, 0 ble,pn %icc, .LL100 nop .LL11: #ifdef RT sll K, 1 + ZBASE_SHIFT, TEMP1 sub B, TEMP1, B add LDC, LDC, TEMP1 sub C, TEMP1, C #endif FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 sra M, 1, I mov C, C1 add C, LDC, C2 #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif cmp I, 0 #ifndef RT add C2, LDC, C #endif ble,pn %icc, .LL50 FMOV FZERO, t4 .LL21: #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 1 + ZBASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 1 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 
FMOV FZERO, t4 FMOV FZERO, c01 FMOV FZERO, c02 LDF [AO + 0 * SIZE], a1 FMOV FZERO, c03 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c04 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c05 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c06 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c07 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c08 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c09 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c10 LDF [BO + 4 * SIZE], b5 FMOV FZERO, c11 LDF [AO + 4 * SIZE], a5 FMOV FZERO, c12 prefetch [C1 + 3 * SIZE], 3 FMOV FZERO, c13 prefetch [C2 + 3 * SIZE], 3 FMOV FZERO, c14 FMOV FZERO, c15 ble,pn %icc, .LL25 FMOV FZERO, c16 .LL22: FADD2 c04, t1, c04 prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY FMUL a1, b1, t1 nop FADD4 c08, t2, c08 prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY FMUL a1, b2, t2 add AO, 16 * SIZE, AO FADD2 c12, t3, c12 LDF [AO - 13 * SIZE], a4 FMUL a1, b3, t3 add BO, 16 * SIZE, BO FADD4 c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO - 8 * SIZE], a1 FADD1 c01, t1, c01 nop FMUL a2, b1, t1 nop FADD3 c05, t2, c05 nop FMUL a2, b2, t2 nop FADD1 c09, t3, c09 nop FMUL a2, b3, t3 nop FADD3 c13, t4, c13 add L, -1, L FMUL a2, b4, t4 LDF [AO - 11 * SIZE], a2 FADD2 c02, t1, c02 nop FMUL a3, b1, t1 nop FADD4 c06, t2, c06 nop FMUL a3, b2, t2 nop FADD2 c10, t3, c10 nop FMUL a3, b3, t3 nop FADD4 c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 10 * SIZE], a3 FADD1 c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO - 8 * SIZE], b1 FADD3 c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 11 * SIZE], b2 FADD1 c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 10 * SIZE], b3 FADD3 c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 9 * SIZE], b4 FADD2 c04, t1, c04 nop FMUL a5, b5, t1 LDF [AO - 9 * SIZE], a4 FADD4 c08, t2, c08 nop FMUL a5, b2, t2 nop FADD2 c12, t3, c12 nop FMUL a5, b3, t3 nop FADD4 c16, t4, c16 nop FMUL a5, b4, t4 LDF [AO - 4 * SIZE], a5 FADD1 c01, t1, c01 nop FMUL a2, b5, t1 nop FADD3 c05, t2, c05 nop FMUL a2, b2, t2 nop FADD1 c09, t3, c09 nop FMUL a2, b3, t3 nop FADD3 c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO - 7 * SIZE], a2 FADD2 c02, t1, c02 nop FMUL a3, b5, t1 nop FADD4 c06, t2, c06 nop FMUL a3, b2, t2 nop FADD2 c10, t3, c10 nop FMUL a3, b3, t3 nop FADD4 c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 6 * SIZE], a3 FADD1 c03, t1, c03 nop FMUL a4, b5, t1 LDF [BO - 4 * SIZE], b5 FADD3 c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 7 * SIZE], b2 FADD1 c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 6 * SIZE], b3 FADD3 c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 5 * SIZE], b4 FADD2 c04, t1, c04 nop FMUL a1, b1, t1 LDF [AO - 5 * SIZE], a4 FADD4 c08, t2, c08 nop FMUL a1, b2, t2 nop FADD2 c12, t3, c12 nop FMUL a1, b3, t3 nop FADD4 c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO - 0 * SIZE], a1 FADD1 c01, t1, c01 nop FMUL a2, b1, t1 nop #ifdef DOUBLE prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY #else nop #endif FADD3 c05, t2, c05 nop FMUL a2, b2, t2 FADD1 c09, t3, c09 nop FMUL a2, b3, t3 nop FADD3 c13, t4, c13 nop FMUL a2, b4, t4 nop FADD2 c02, t1, c02 nop FMUL a3, b1, t1 LDF [AO - 3 * SIZE], a2 FADD4 c06, t2, c06 #ifdef DOUBLE prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY #else nop #endif FMUL a3, b2, t2 nop FADD2 c10, t3, c10 nop FMUL a3, b3, t3 nop FADD4 c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 2 * SIZE], a3 FADD1 c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO - 0 * SIZE], b1 FADD3 c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 3 * SIZE], b2 FADD1 c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD3 c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 1 * SIZE], b4 FADD2 c04, t1, c04 nop FMUL a5, b5, t1 LDF [AO - 1 * SIZE], a4 FADD4 c08, t2, c08 
FMUL a5, b2, t2 FADD2 c12, t3, c12 FMUL a5, b3, t3 FADD4 c16, t4, c16 nop FMUL a5, b4, t4 LDF [AO + 4 * SIZE], a5 FADD1 c01, t1, c01 nop FMUL a2, b5, t1 nop FADD3 c05, t2, c05 nop FMUL a2, b2, t2 nop FADD1 c09, t3, c09 nop FMUL a2, b3, t3 nop FADD3 c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD2 c02, t1, c02 nop FMUL a3, b5, t1 nop FADD4 c06, t2, c06 nop FMUL a3, b2, t2 nop FADD2 c10, t3, c10 nop FMUL a3, b3, t3 nop FADD4 c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD1 c03, t1, c03 cmp L, 0 FMUL a4, b5, t1 LDF [BO + 4 * SIZE], b5 FADD3 c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD1 c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD3 c15, t4, c15 FMUL a4, b4, t4 bg,pt %icc, .LL22 LDF [BO + 3 * SIZE], b4 .LL25: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,pn %icc, .LL29 nop .LL26: FADD2 c04, t1, c04 LDF [AO + 3 * SIZE], a4 FMUL a1, b1, t1 add AO, 4 * SIZE, AO FADD4 c08, t2, c08 add BO, 4 * SIZE, BO FMUL a1, b2, t2 add L, -1, L FADD2 c12, t3, c12 nop FMUL a1, b3, t3 cmp L, 0 FADD4 c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD1 c01, t1, c01 nop FMUL a2, b1, t1 nop FADD3 c05, t2, c05 nop FMUL a2, b2, t2 nop FADD1 c09, t3, c09 nop FMUL a2, b3, t3 nop FADD3 c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD2 c02, t1, c02 nop FMUL a3, b1, t1 nop FADD4 c06, t2, c06 nop FMUL a3, b2, t2 nop FADD2 c10, t3, c10 nop FMUL a3, b3, t3 nop FADD4 c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD1 c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO + 0 * SIZE], b1 FADD3 c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD1 c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD3 c15, t4, c15 FMUL a4, b4, t4 bg,pt %icc, .LL26 LDF [BO + 3 * SIZE], b4 .LL29: #if defined(LN) || defined(RT) sub KK, 2, TEMP1 sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO #endif FADD2 c04, t1, c04 FADD4 c08, t2, c08 FADD2 c12, t3, c12 FADD4 c16, t4, c16 FADD c01, c06, c01 FADD c02, c05, c02 FADD c03, c08, c03 FADD c04, c07, c04 FADD c09, c14, c09 FADD c10, c13, c10 FADD c11, c16, c11 FADD c12, c15, c12 #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c09, c09 FSUB a4, c10, c10 FSUB b1, c03, c03 FSUB b2, c04, c04 FSUB b3, c11, c11 FSUB b4, c12, c12 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c09, c09 FSUB b2, c10, c10 FSUB b3, c11, c11 FSUB b4, c12, c12 #endif #ifdef LN LDF [AO + 6 * SIZE], a1 LDF [AO + 7 * SIZE], a2 LDF [AO + 4 * SIZE], a3 LDF [AO + 5 * SIZE], a4 LDF [AO + 0 * SIZE], b1 LDF [AO + 1 * SIZE], b2 FMUL a1, c03, t1 FMUL a2, c04, t2 FMUL a1, c04, t3 FMUL a2, c03, t4 FMUL a1, c11, t5 FMUL a2, c12, t6 FMUL a1, c12, t7 FMUL a2, c11, t8 FADD4 t1, t2, c03 FADD2 t3, t4, c04 FADD4 t5, t6, c11 FADD2 t7, t8, c12 FMUL a3, c03, t1 FMUL a3, c04, t2 FMUL a3, c11, t3 FMUL a3, c12, t4 FMUL a4, c04, t5 FMUL a4, c03, t6 FMUL a4, c12, t7 FMUL a4, c11, t8 FSUB c01, t1, c01 FSUB c02, t2, c02 FSUB c09, t3, c09 FSUB c10, t4, c10 FADD2 c01, t5, c01 FADD4 c02, t6, c02 FADD2 c09, t7, c09 FADD4 c10, t8, c10 FMUL b1, c01, t1 FMUL b2, c02, 
t2 FMUL b1, c02, t3 FMUL b2, c01, t4 FMUL b1, c09, t5 FMUL b2, c10, t6 FMUL b1, c10, t7 FMUL b2, c09, t8 FADD4 t1, t2, c01 FADD2 t3, t4, c02 FADD4 t5, t6, c09 FADD2 t7, t8, c10 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 6 * SIZE], b1 LDF [AO + 7 * SIZE], b2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FMUL a1, c09, t5 FMUL a2, c10, t6 FMUL a1, c10, t7 FMUL a2, c09, t8 FADD4 t1, t2, c01 FADD2 t3, t4, c02 FADD4 t5, t6, c09 FADD2 t7, t8, c10 FMUL a3, c01, t1 FMUL a3, c02, t2 FMUL a3, c09, t3 FMUL a3, c10, t4 FMUL a4, c02, t5 FMUL a4, c01, t6 FMUL a4, c10, t7 FMUL a4, c09, t8 FSUB c03, t1, c03 FSUB c04, t2, c04 FSUB c11, t3, c11 FSUB c12, t4, c12 FADD2 c03, t5, c03 FADD4 c04, t6, c04 FADD2 c11, t7, c11 FADD4 c12, t8, c12 FMUL b1, c03, t1 FMUL b2, c04, t2 FMUL b1, c04, t3 FMUL b2, c03, t4 FMUL b1, c11, t5 FMUL b2, c12, t6 FMUL b1, c12, t7 FMUL b2, c11, t8 FADD4 t1, t2, c03 FADD2 t3, t4, c04 FADD4 t5, t6, c11 FADD2 t7, t8, c12 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 6 * SIZE], b1 LDF [BO + 7 * SIZE], b2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FMUL a1, c03, t5 FMUL a2, c04, t6 FMUL a1, c04, t7 FMUL a2, c03, t8 FADD4 t1, t2, c01 FADD3 t3, t4, c02 FADD4 t5, t6, c03 FADD3 t7, t8, c04 FMUL a3, c01, t1 FMUL a3, c02, t2 FMUL a3, c03, t3 FMUL a3, c04, t4 FMUL a4, c02, t5 FMUL a4, c01, t6 FMUL a4, c04, t7 FMUL a4, c03, t8 FSUB c09, t1, c09 FSUB c10, t2, c10 FSUB c11, t3, c11 FSUB c12, t4, c12 FADD3 c09, t5, c09 FADD4 c10, t6, c10 FADD3 c11, t7, c11 FADD4 c12, t8, c12 FMUL b1, c09, t1 FMUL b2, c10, t2 FMUL b1, c10, t3 FMUL b2, c09, t4 FMUL b1, c11, t5 FMUL b2, c12, t6 FMUL b1, c12, t7 FMUL b2, c11, t8 FADD4 t1, t2, c09 FADD3 t3, t4, c10 FADD4 t5, t6, c11 FADD3 t7, t8, c12 #endif #ifdef RT LDF [BO + 6 * SIZE], a1 LDF [BO + 7 * SIZE], a2 LDF [BO + 4 * SIZE], a3 LDF [BO + 5 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FMUL a1, c09, t1 FMUL a2, c10, t2 FMUL a1, c10, t3 FMUL a2, c09, t4 FMUL a1, c11, t5 FMUL a2, c12, t6 FMUL a1, c12, t7 FMUL a2, c11, t8 FADD4 t1, t2, c09 FADD3 t3, t4, c10 FADD4 t5, t6, c11 FADD3 t7, t8, c12 FMUL a3, c09, t1 FMUL a3, c10, t2 FMUL a3, c11, t3 FMUL a3, c12, t4 FMUL a4, c10, t5 FMUL a4, c09, t6 FMUL a4, c12, t7 FMUL a4, c11, t8 FSUB c01, t1, c01 FSUB c02, t2, c02 FSUB c03, t3, c03 FSUB c04, t4, c04 FADD3 c01, t5, c01 FADD4 c02, t6, c02 FADD3 c03, t7, c03 FADD4 c04, t8, c04 FMUL b1, c01, t1 FMUL b2, c02, t2 FMUL b1, c02, t3 FMUL b2, c01, t4 FMUL b1, c03, t5 FMUL b2, c04, t6 FMUL b1, c04, t7 FMUL b2, c03, t8 FADD4 t1, t2, c01 FADD3 t3, t4, c02 FADD4 t5, t6, c03 FADD3 t7, t8, c04 #endif #ifdef LN add C1, -4 * SIZE, C1 add C2, -4 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c09, [BO + 2 * SIZE] STF c10, [BO + 3 * SIZE] STF c03, [BO + 4 * SIZE] STF c04, [BO + 5 * SIZE] STF c11, [BO + 6 * SIZE] STF c12, [BO + 7 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] STF c09, [AO + 4 * SIZE] STF c10, [AO + 5 * SIZE] STF c11, [AO + 6 * SIZE] STF c12, [AO + 7 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C1 + 2 * SIZE] STF c04, [C1 + 3 * SIZE] STF c09, [C2 + 0 * SIZE] STF c10, [C2 + 1 * SIZE] STF c11, [C2 + 2 * SIZE] STF c12, [C2 + 3 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 4 * 
SIZE, C1 add C2, 4 * SIZE, C2 #endif #ifdef RT sll K, 1 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 add AO, TEMP1, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL21 FMOV FZERO, c01 .LL50: and M, 1, I FMOV FZERO, c02 cmp I, 0 FMOV FZERO, t1 ble,pn %icc, .LL99 FMOV FZERO, c04 #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 0 + ZBASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 0 + ZBASE_SHIFT, TEMP1 sll KK, 1 + ZBASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, t2 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c06 LDF [AO + 1 * SIZE], a2 FMOV FZERO, t3 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c08 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t4 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c01 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c03 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c05 ble,pn %icc, .LL55 FMOV FZERO, c07 .LL52: FADD2 c02, t1, c02 add AO, 8 * SIZE, AO prefetch [AO + APREFETCHSIZE * SIZE], 0 FMUL a1, b1, t1 add BO, 16 * SIZE, BO FADD4 c04, t2, c04 add L, -1, L FMUL a1, b2, t2 FADD2 c06, t3, c06 cmp L, 0 FMUL a1, b3, t3 FADD4 c08, t4, c08 FMUL a1, b4, t4 LDF [AO - 4 * SIZE], a1 FADD1 c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 12 * SIZE], b1 FADD3 c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 11 * SIZE], b2 FADD1 c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 10 * SIZE], b3 FADD3 c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 9 * SIZE], b4 FADD2 c02, t1, c02 FMUL a3, b1, t1 LDF [AO - 3 * SIZE], a2 FADD4 c04, t2, c04 FMUL a3, b2, t2 FADD2 c06, t3, c06 FMUL a3, b3, t3 FADD4 c08, t4, c08 FMUL a3, b4, t4 LDF [AO - 2 * SIZE], a3 FADD1 c01, t1, c01 FMUL a4, b1, t1 LDF [BO - 8 * SIZE], b1 FADD3 c03, t2, c03 FMUL a4, b2, t2 LDF [BO - 7 * SIZE], b2 FADD1 c05, t3, c05 FMUL a4, b3, t3 LDF [BO - 6 * SIZE], b3 FADD3 c07, t4, c07 FMUL a4, b4, t4 LDF [BO - 5 * SIZE], b4 FADD2 c02, t1, c02 FMUL a1, b1, t1 LDF [AO - 1 * SIZE], a4 FADD4 c04, t2, c04 FMUL a1, b2, t2 FADD2 c06, t3, c06 FMUL a1, b3, t3 FADD4 c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD1 c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 4 * SIZE], b1 FADD3 c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 3 * SIZE], b2 FADD1 c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 2 * SIZE], b3 FADD3 c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 1 * SIZE], b4 FADD2 c02, t1, c02 FMUL a3, b1, t1 LDF [AO + 1 * SIZE], a2 FADD4 c04, t2, c04 FMUL a3, b2, t2 FADD2 c06, t3, c06 FMUL a3, b3, t3 FADD4 c08, t4, c08 FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD1 c01, t1, c01 FMUL a4, b1, t1 LDF [BO + 0 * SIZE], b1 FADD3 c03, t2, c03 FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD1 c05, t3, c05 FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD3 c07, t4, c07 FMUL a4, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL52 LDF [AO + 3 * SIZE], a4 .LL55: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL59 nop .LL56: FADD2 c02, t1, c02 add AO, 2 * SIZE, AO FMUL a1, b1, t1 add L, -1, L add BO, 4 * SIZE, BO FADD4 c04, t2, c04 cmp L, 0 FMUL a1, b2, t2 FADD2 c06, t3, c06 FMUL a1, b3, t3 FADD4 c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD1 c01, t1, c01 FMUL a2, b1, t1 LDF [BO + 0 * SIZE], b1 FADD3 c03, t2, c03 FMUL a2, b2, t2 LDF [BO + 1 * SIZE], b2 FADD1 c05, t3, c05 FMUL a2, b3, t3 LDF [BO + 2 * SIZE], b3 FADD3 c07, t4, c07 FMUL a2, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL56 LDF [AO + 1 * SIZE], a2 .LL59: #if 
defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 2, TEMP1 #endif sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif FADD2 c02, t1, c02 FADD4 c04, t2, c04 FADD2 c06, t3, c06 FADD4 c08, t4, c08 FADD c01, c04, c01 FADD c02, c03, c02 FADD c05, c08, c05 FADD c06, c07, c06 #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c05, c05 FSUB a4, c06, c06 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c05, c05 FSUB a4, c06, c06 #endif #ifdef LN LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FMUL a1, c05, t5 FMUL a2, c06, t6 FMUL a1, c06, t7 FMUL a2, c05, t8 FADD4 t1, t2, c01 FADD2 t3, t4, c02 FADD4 t5, t6, c05 FADD2 t7, t8, c06 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FMUL a1, c05, t5 FMUL a2, c06, t6 FMUL a1, c06, t7 FMUL a2, c05, t8 FADD4 t1, t2, c01 FADD2 t3, t4, c02 FADD4 t5, t6, c05 FADD2 t7, t8, c06 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 6 * SIZE], b1 LDF [BO + 7 * SIZE], b2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FADD4 t1, t2, c01 FADD3 t3, t4, c02 FMUL a3, c01, t1 FMUL a3, c02, t2 FMUL a4, c02, t3 FMUL a4, c01, t4 FSUB c05, t1, c05 FSUB c06, t2, c06 FADD3 c05, t3, c05 FADD4 c06, t4, c06 FMUL b1, c05, t1 FMUL b2, c06, t2 FMUL b1, c06, t3 FMUL b2, c05, t4 FADD4 t1, t2, c05 FADD3 t3, t4, c06 #endif #ifdef RT LDF [BO + 6 * SIZE], a1 LDF [BO + 7 * SIZE], a2 LDF [BO + 4 * SIZE], a3 LDF [BO + 5 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FMUL a1, c05, t1 FMUL a2, c06, t2 FMUL a1, c06, t3 FMUL a2, c05, t4 FADD4 t1, t2, c05 FADD3 t3, t4, c06 FMUL a3, c05, t1 FMUL a3, c06, t2 FMUL a4, c06, t3 FMUL a4, c05, t4 FSUB c01, t1, c01 FSUB c02, t2, c02 FADD3 c01, t3, c01 FADD4 c02, t4, c02 FMUL b1, c01, t1 FMUL b2, c02, t2 FMUL b1, c02, t3 FMUL b2, c01, t4 FADD4 t1, t2, c01 FADD3 t3, t4, c02 #endif #ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c06, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c05, [AO + 2 * SIZE] STF c06, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c05, [C2 + 0 * SIZE] STF c06, [C2 + 1 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 #endif #ifdef RT sll K, 0 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .LL99: #ifdef LN sll K, 1 + ZBASE_SHIFT, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 2, KK #endif #ifdef RT sub KK, 2, KK #endif add J, -1, J cmp J, 0 bg,pt %icc, .LL11 nop .LL100: and N, 1, J cmp J, 0 ble,pn %icc, .LL999 nop #ifdef RT sll K, 0 + ZBASE_SHIFT, TEMP1 sub B, TEMP1, B sub C, LDC, C #endif mov C, C1 #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if 
defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif #ifndef RT add C, LDC, C #endif sra M, 1, I cmp I, 0 ble,pn %icc, .LL150 FMOV FZERO, c03 .LL121: #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 1 + ZBASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 1 + ZBASE_SHIFT, TEMP1 sll KK, 0 + ZBASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif FMOV FZERO, c03 LDF [AO + 0 * SIZE], a1 FMOV FZERO, t1 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c07 LDF [AO + 1 * SIZE], a2 FMOV FZERO, t2 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c04 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t3 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c08 LDF [AO + 3 * SIZE], a4 FMOV FZERO, t4 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c01 prefetch [C1 + 3 * SIZE], 3 FMOV FZERO, c05 FMOV FZERO, c02 ble,pn %icc, .LL125 FMOV FZERO, c06 .LL122: FADD1 c03, t1, c03 add L, -1, L FMUL a1, b1, t1 prefetch [AO + APREFETCHSIZE * SIZE], 0 FADD3 c07, t2, c07 add BO, 8 * SIZE, BO FMUL a1, b2, t2 LDF [AO + 4 * SIZE], a1 FADD2 c04, t3, c04 add AO, 16 * SIZE, AO FMUL a2, b1, t3 cmp L, 0 FADD4 c08, t4, c08 nop FMUL a2, b2, t4 LDF [AO - 11 * SIZE], a2 FADD1 c01, t1, c01 nop FMUL a3, b1, t1 nop FADD3 c05, t2, c05 nop FMUL a3, b2, t2 LDF [AO - 10 * SIZE], a3 FADD2 c02, t3, c02 nop FMUL a4, b1, t3 LDF [BO - 4 * SIZE], b1 FADD4 c06, t4, c06 nop FMUL a4, b2, t4 LDF [BO - 3 * SIZE], b2 FADD1 c03, t1, c03 nop FMUL a1, b3, t1 LDF [AO - 9 * SIZE], a4 FADD3 c07, t2, c07 nop FMUL a1, b4, t2 LDF [AO - 8 * SIZE], a1 FADD2 c04, t3, c04 nop FMUL a2, b3, t3 nop FADD4 c08, t4, c08 nop FMUL a2, b4, t4 LDF [AO - 7 * SIZE], a2 FADD1 c01, t1, c01 nop FMUL a3, b3, t1 nop FADD3 c05, t2, c05 nop FMUL a3, b4, t2 LDF [AO - 6 * SIZE], a3 FADD2 c02, t3, c02 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD4 c06, t4, c06 nop FMUL a4, b4, t4 LDF [BO - 1 * SIZE], b4 FADD1 c03, t1, c03 nop FMUL a1, b1, t1 LDF [AO - 5 * SIZE], a4 FADD3 c07, t2, c07 nop FMUL a1, b2, t2 LDF [AO - 4 * SIZE], a1 FADD2 c04, t3, c04 nop FMUL a2, b1, t3 nop FADD4 c08, t4, c08 nop FMUL a2, b2, t4 LDF [AO - 3 * SIZE], a2 FADD1 c01, t1, c01 nop FMUL a3, b1, t1 nop FADD3 c05, t2, c05 nop FMUL a3, b2, t2 LDF [AO - 2 * SIZE], a3 FADD2 c02, t3, c02 nop FMUL a4, b1, t3 LDF [BO + 0 * SIZE], b1 FADD4 c06, t4, c06 nop FMUL a4, b2, t4 LDF [BO + 1 * SIZE], b2 FADD1 c03, t1, c03 nop FMUL a1, b3, t1 LDF [AO - 1 * SIZE], a4 FADD3 c07, t2, c07 nop FMUL a1, b4, t2 LDF [AO + 0 * SIZE], a1 FADD2 c04, t3, c04 nop FMUL a2, b3, t3 nop FADD4 c08, t4, c08 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD1 c01, t1, c01 nop FMUL a3, b3, t1 nop FADD3 c05, t2, c05 nop FMUL a3, b4, t2 LDF [AO + 2 * SIZE], a3 FADD2 c02, t3, c02 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD4 c06, t4, c06 FMUL a4, b4, t4 LDF [AO + 3 * SIZE], a4 bg,pt %icc, .LL122 LDF [BO + 3 * SIZE], b4 .LL125: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL129 nop .LL126: FADD1 c03, t1, c03 add AO, 4 * SIZE, AO FMUL a1, b1, t1 add BO, 2 * SIZE, BO FADD3 c07, t2, c07 add L, -1, L FMUL a1, b2, t2 LDF [AO + 0 * SIZE], a1 FADD2 c04, t3, c04 cmp L, 0 FMUL a2, b1, t3 FADD4 c08, t4, c08 FMUL a2, b2, t4 LDF [AO + 1 * SIZE], a2 FADD1 c01, t1, c01 FMUL a3, b1, t1 FADD3 c05, t2, c05 FMUL a3, b2, t2 LDF [AO + 2 * SIZE], a3 FADD2 c02, t3, c02 FMUL a4, b1, t3 LDF [BO + 0 * SIZE], b1 FADD4 c06, t4, c06 FMUL a4, b2, t4 LDF [BO + 1 * SIZE], b2 bg,pt %icc, .LL126 LDF [AO + 3 * SIZE], a4 .LL129: FADD1 c03, t1, c03 FADD3 c07, t2, c07 FADD2 c04, t3, c04 FADD4 c08, 
t4, c08 FADD c01, c06, c01 FADD c02, c05, c02 FADD c03, c08, c03 FADD c04, c07, c04 #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 1, TEMP1 #endif sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #endif #ifdef LN LDF [AO + 6 * SIZE], a1 LDF [AO + 7 * SIZE], a2 LDF [AO + 4 * SIZE], a3 LDF [AO + 5 * SIZE], a4 LDF [AO + 0 * SIZE], b1 LDF [AO + 1 * SIZE], b2 FMUL a1, c03, t1 FMUL a2, c04, t2 FMUL a1, c04, t3 FMUL a2, c03, t4 FADD4 t1, t2, c03 FADD2 t3, t4, c04 FMUL a3, c03, t1 FMUL a3, c04, t2 FMUL a4, c04, t5 FMUL a4, c03, t6 FSUB c01, t1, c01 FSUB c02, t2, c02 FADD2 c01, t5, c01 FADD4 c02, t6, c02 FMUL b1, c01, t1 FMUL b2, c02, t2 FMUL b1, c02, t3 FMUL b2, c01, t4 FADD4 t1, t2, c01 FADD2 t3, t4, c02 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 6 * SIZE], b1 LDF [AO + 7 * SIZE], b2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FADD4 t1, t2, c01 FADD2 t3, t4, c02 FMUL a3, c01, t1 FMUL a3, c02, t2 FMUL a4, c02, t5 FMUL a4, c01, t6 FSUB c03, t1, c03 FSUB c04, t2, c04 FADD2 c03, t5, c03 FADD4 c04, t6, c04 FMUL b1, c03, t1 FMUL b2, c04, t2 FMUL b1, c04, t3 FMUL b2, c03, t4 FADD4 t1, t2, c03 FADD2 t3, t4, c04 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FMUL a1, c03, t5 FMUL a2, c04, t6 FMUL a1, c04, t7 FMUL a2, c03, t8 FADD4 t1, t2, c01 FADD3 t3, t4, c02 FADD4 t5, t6, c03 FADD3 t7, t8, c04 #endif #ifdef RT LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FMUL a1, c03, t5 FMUL a2, c04, t6 FMUL a1, c04, t7 FMUL a2, c03, t8 FADD4 t1, t2, c01 FADD3 t3, t4, c02 FADD4 t5, t6, c03 FADD3 t7, t8, c04 #endif #ifdef LN add C1, -4 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c03, [BO + 2 * SIZE] STF c04, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C1 + 2 * SIZE] STF c04, [C1 + 3 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 4 * SIZE, C1 #endif #ifdef RT sll K, 1 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL121 FMOV FZERO, c03 .LL150: and M, 1, I cmp I, 0 ble,pn %icc, .LL199 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 0 + ZBASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 0 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], 
a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 ble,pn %icc, .LL155 nop .LL152: FADD1 c01, t1, c01 add L, -1, L FMUL a1, b1, t1 prefetch [AO + APREFETCHSIZE * SIZE], 0 FADD3 c02, t2, c02 add BO, 8 * SIZE, BO FMUL a1, b2, t2 LDF [AO + 4 * SIZE], a1 FADD2 c03, t3, c03 cmp L, 0 FMUL a2, b1, t3 LDF [BO - 4 * SIZE], b1 FADD4 c04, t4, c04 nop FMUL a2, b2, t4 LDF [AO + 5 * SIZE], a2 FADD1 c01, t1, c01 nop FMUL a3, b3, t1 LDF [BO - 3 * SIZE], b2 FADD3 c02, t2, c02 nop FMUL a3, b4, t2 LDF [AO + 6 * SIZE], a3 FADD2 c03, t3, c03 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD4 c04, t4, c04 nop FMUL a4, b4, t4 LDF [AO + 7 * SIZE], a4 FADD1 c01, t1, c01 nop FMUL a1, b1, t1 LDF [BO - 1 * SIZE], b4 FADD3 c02, t2, c02 FMUL a1, b2, t2 LDF [AO + 8 * SIZE], a1 FADD2 c03, t3, c03 FMUL a2, b1, t3 LDF [BO + 0 * SIZE], b1 FADD4 c04, t4, c04 FMUL a2, b2, t4 LDF [AO + 9 * SIZE], a2 FADD1 c01, t1, c01 FMUL a3, b3, t1 LDF [BO + 1 * SIZE], b2 FADD3 c02, t2, c02 FMUL a3, b4, t2 LDF [AO + 10 * SIZE], a3 FADD2 c03, t3, c03 FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD4 c04, t4, c04 FMUL a4, b4, t4 LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO bg,pt %icc, .LL152 LDF [BO + 3 * SIZE], b4 .LL155: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL159 nop .LL156: FADD1 c01, t1, c01 add AO, 2 * SIZE, AO FMUL a1, b1, t1 add BO, 2 * SIZE, BO FADD3 c02, t2, c02 add L, -1, L FMUL a1, b2, t2 LDF [AO + 0 * SIZE], a1 FADD2 c03, t3, c03 FMUL a2, b1, t3 LDF [BO + 0 * SIZE], b1 cmp L, 0 FADD4 c04, t4, c04 FMUL a2, b2, t4 LDF [BO + 1 * SIZE], b2 bg,pt %icc, .LL156 LDF [AO + 1 * SIZE], a2 .LL159: FADD1 c01, t1, c01 FADD3 c02, t2, c02 FADD2 c03, t3, c03 FADD4 c04, t4, c04 FADD c01, c04, c01 FADD c02, c03, c02 #if defined(LN) || defined(RT) sub KK, 1, TEMP1 sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #endif #ifdef LN LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FADD4 t1, t2, c01 FADD2 t3, t4, c02 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FADD4 t1, t2, c01 FADD2 t3, t4, c02 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FADD4 t1, t2, c01 FADD3 t3, t4, c02 #endif #ifdef RT LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FADD4 t1, t2, c01 FADD3 t3, t4, c02 #endif #ifdef LN add C1, -2 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 2 * SIZE, C1 #endif #ifdef RT sll K, 0 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 add AO, TEMP1, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .LL199: #ifdef LN sll K, 0 + ZBASE_SHIFT, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif 
#ifdef RN add KK, 1, KK #endif #ifdef RT sub KK, 1, KK #endif .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/ztrsm_kernel_LT_1x4.S000066400000000000000000001207631313527062700215030ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2005-2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define APREFETCHSIZE 24 #define APREFETCH_CATEGORY 0 #define M %i0 #define N %i1 #define K %i2 #define A %i5 #define B %i3 #define C %i4 #define LDC %o0 #define AO %o1 #define BO %o2 #define I %o3 #define J %o4 #define L %o5 #define C1 %l0 #define C2 %l1 #define C3 %l2 #define C4 %l3 #define OFFSET %l4 #define KK %l5 #define TEMP1 %l6 #define TEMP2 %l7 #define AORIG %o7 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #define a1 %f32 #define a2 %f34 #define a3 %f36 #define a4 %f38 #define a5 %f40 #define b1 %f42 #define b2 %f44 #define b3 %f46 #define b4 %f48 #define b5 %f50 #define b6 %f52 #define b7 %f54 #define b8 %f56 #define b9 %f58 #define cc01 0 #define cc02 2 #define cc03 4 #define cc04 6 #define cc05 8 #define cc06 10 #define cc07 12 #define cc08 14 #define cc09 16 #define cc10 18 #define cc11 20 #define cc12 22 #define cc13 24 #define cc14 26 #define cc15 28 #define cc16 30 #define aa1 1 #define aa2 3 #define aa3 5 #define aa4 7 #define aa5 9 #define bb1 11 #define bb2 13 #define bb3 15 #define bb4 17 #define bb5 19 #define bb6 21 #define bb7 23 #define bb8 25 #define bb9 27 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #define a1 %f16 #define a2 %f17 #define a3 %f18 #define a4 %f19 #define a5 %f20 #define b1 %f21 #define b2 %f22 #define b3 %f23 #define b4 %f24 #define b5 %f25 #define b6 %f26 #define b7 %f27 #define b8 %f28 #define b9 %f29 #define cc01 0 #define cc02 1 #define cc03 2 #define cc04 3 #define cc05 4 #define cc06 5 #define cc07 6 #define cc08 7 #define cc09 8 #define cc10 9 #define cc11 10 #define cc12 11 #define cc13 12 #define cc14 13 #define cc15 14 #define cc16 15 #define aa1 16 #define aa2 17 #define aa3 18 #define aa4 19 #define aa5 20 #define bb1 21 #define bb2 22 #define bb3 23 #define bb4 24 #define bb5 25 #define bb6 26 #define bb7 27 #define bb8 28 #define bb9 29 #endif #ifndef CONJ #define FMADD1 FMADD #define FMADD2 FMADD #define FMADD3 FMADD #define FMADD4 FNMSUB #else #if defined(LN) || defined(LT) #define FMADD1 FMADD #define FMADD2 FNMSUB #define FMADD3 FMADD #define FMADD4 FMADD #endif #if defined(RN) || defined(RT) #define FMADD1 FMADD #define FMADD2 FMADD #define FMADD3 FNMSUB #define FMADD4 FMADD #endif #endif .register %g2, #scratch .register %g3, #scratch PROLOGUE SAVESP #ifndef __64BIT__ #ifdef DOUBLE ld [%sp + STACK_START + 32], A ld [%sp + STACK_START + 36], B ld [%sp + STACK_START + 40], C ld [%sp + STACK_START + 44], LDC ld [%sp + STACK_START + 48], OFFSET #else ld [%sp + STACK_START + 28], B ld [%sp + STACK_START + 32], C ld [%sp + STACK_START + 36], LDC ld [%sp + STACK_START + 40], OFFSET #endif #else ldx [%sp + STACK_START + 56], B ldx [%sp + STACK_START + 64], C ldx [%sp + STACK_START + 72], LDC ldx [%sp + STACK_START + 80], OFFSET #endif cmp M, 0 ble,pn %icc, .LL999 nop sll LDC, ZBASE_SHIFT, LDC #ifdef LN smul M, K, TEMP1 sll TEMP1, ZBASE_SHIFT, TEMP1 add A, TEMP1, A sll M, ZBASE_SHIFT, TEMP1 add C, TEMP1, C #endif #ifdef RN neg OFFSET, KK #endif #ifdef RT smul N, K, TEMP1 sll 
TEMP1, ZBASE_SHIFT, TEMP1 add B, TEMP1, B smul N, LDC, TEMP1 add C, TEMP1, C sub N, OFFSET, KK #endif sra N, 2, J cmp J, 0 ble,pn %icc, .LL20 nop .align 4 .LL11: #ifdef RT sll K, ZBASE_SHIFT + 2, TEMP1 sub B, TEMP1, B #endif #ifndef RT mov C, C1 add C, LDC, C2 add C2, LDC, C3 add C3, LDC, C4 add C4, LDC, C #else sub C, LDC, C4 sub C4, LDC, C3 sub C3, LDC, C2 sub C2, LDC, C1 sub C2, LDC, C #endif #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif mov M, I .align 4 .LL12: #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, ZBASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, ZBASE_SHIFT + 0, TEMP1 sll KK, ZBASE_SHIFT + 2, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 FCLR (cc01) LDF [AO + 1 * SIZE], a2 FCLR (cc05) LDF [AO + 8 * SIZE], a5 FCLR (cc09) LDF [BO + 0 * SIZE], b1 FCLR (cc13) LDF [BO + 1 * SIZE], b2 FCLR (cc02) LDF [BO + 2 * SIZE], b3 FCLR (cc06) LDF [BO + 3 * SIZE], b4 FCLR (cc10) LDF [BO + 4 * SIZE], b5 FCLR (cc14) LDF [BO + 5 * SIZE], b6 FCLR (cc03) LDF [BO + 6 * SIZE], b7 FCLR (cc07) LDF [BO + 7 * SIZE], b8 FCLR (cc11) LDF [BO + 8 * SIZE], b9 FCLR (cc15) prefetch [C1 + 1 * SIZE], 3 FCLR (cc04) prefetch [C2 + 2 * SIZE], 3 FCLR (cc08) prefetch [C3 + 1 * SIZE], 3 FCLR (cc12) prefetch [C4 + 2 * SIZE], 3 FCLR (cc16) #if defined(LT) || defined(RN) sra KK, 3, L #else sub K, KK, L sra L, 3, L #endif cmp L, 0 ble,pn %icc, .LL15 nop .align 4 .LL13: FMADD1 (aa1, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa1, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa1, bb3, cc05, cc05) LDF [BO + 16 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 9 * SIZE], b2 FMADD3 (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD1 (aa1, bb5, cc09, cc09) LDF [AO + 2 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 3 * SIZE], a4 FMADD3 (aa1, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD4 (aa2, bb6, cc12, cc12) nop FMADD1 (aa1, bb7, cc13, cc13) LDF [BO + 12 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO + 13 * SIZE], b6 FMADD3 (aa1, bb8, cc15, cc15) LDF [BO + 14 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO + 15 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 24 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 17 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 18 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 19 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 4 * SIZE], a1 FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 5 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) add L, -1, L FMADD4 (aa4, bb6, cc12, cc12) nop FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 20 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 21 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 22 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) LDF [BO + 23 * SIZE], b8 FMADD1 (aa1, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa1, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa1, bb3, cc05, cc05) LDF [BO + 32 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 25 * SIZE], b2 FMADD3 (aa1, bb4, cc07, cc07) LDF [BO + 26 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 27 * SIZE], b4 FMADD1 (aa1, bb5, cc09, cc09) LDF [AO + 6 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 7 * SIZE], a4 FMADD3 (aa1, bb6, cc11, cc11) nop FMADD4 (aa2, bb6, cc12, cc12) 
nop FMADD1 (aa1, bb7, cc13, cc13) LDF [BO + 28 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO + 29 * SIZE], b6 FMADD3 (aa1, bb8, cc15, cc15) LDF [BO + 30 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO + 31 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 40 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 33 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 34 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 35 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 16 * SIZE], a1 /****/ FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 9 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) nop FMADD4 (aa4, bb6, cc12, cc12) nop FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 36 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 37 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 38 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) LDF [BO + 39 * SIZE], b8 FMADD1 (aa5, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa5, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa5, bb3, cc05, cc05) LDF [BO + 48 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 41 * SIZE], b2 FMADD3 (aa5, bb4, cc07, cc07) LDF [BO + 42 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 43 * SIZE], b4 FMADD1 (aa5, bb5, cc09, cc09) LDF [AO + 10 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 11 * SIZE], a4 FMADD3 (aa5, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY FMADD4 (aa2, bb6, cc12, cc12) nop FMADD1 (aa5, bb7, cc13, cc13) LDF [BO + 44 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO + 45 * SIZE], b6 FMADD3 (aa5, bb8, cc15, cc15) LDF [BO + 46 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO + 47 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 56 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 49 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 50 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 51 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 12 * SIZE], a5 FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 13 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) cmp L, 0 FMADD4 (aa4, bb6, cc12, cc12) nop FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 52 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 53 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 54 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) LDF [BO + 55 * SIZE], b8 FMADD1 (aa5, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa5, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa5, bb3, cc05, cc05) LDF [BO + 64 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 57 * SIZE], b2 FMADD3 (aa5, bb4, cc07, cc07) LDF [BO + 58 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 59 * SIZE], b4 FMADD1 (aa5, bb5, cc09, cc09) LDF [AO + 14 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 15 * SIZE], a4 FMADD3 (aa5, bb6, cc11, cc11) add BO, 64 * SIZE, BO FMADD4 (aa2, bb6, cc12, cc12) add AO, 16 * SIZE, AO FMADD1 (aa5, bb7, cc13, cc13) LDF [BO - 4 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO - 3 * SIZE], b6 FMADD3 (aa5, bb8, cc15, cc15) LDF [BO - 2 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO - 1 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 8 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 1 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 2 * 
SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 8 * SIZE], a5 /****/ FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 1 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) FMADD4 (aa4, bb6, cc12, cc12) FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 4 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 5 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 6 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) ble,pn %icc, .LL15 LDF [BO + 7 * SIZE], b8 FMADD1 (aa1, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa1, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa1, bb3, cc05, cc05) LDF [BO + 16 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 9 * SIZE], b2 FMADD3 (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD1 (aa1, bb5, cc09, cc09) LDF [AO + 2 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 3 * SIZE], a4 FMADD3 (aa1, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD4 (aa2, bb6, cc12, cc12) nop FMADD1 (aa1, bb7, cc13, cc13) LDF [BO + 12 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO + 13 * SIZE], b6 FMADD3 (aa1, bb8, cc15, cc15) LDF [BO + 14 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO + 15 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 24 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 17 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 18 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 19 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 4 * SIZE], a1 FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 5 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) add L, -1, L FMADD4 (aa4, bb6, cc12, cc12) nop FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 20 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 21 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 22 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) LDF [BO + 23 * SIZE], b8 FMADD1 (aa1, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa1, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa1, bb3, cc05, cc05) LDF [BO + 32 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 25 * SIZE], b2 FMADD3 (aa1, bb4, cc07, cc07) LDF [BO + 26 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 27 * SIZE], b4 FMADD1 (aa1, bb5, cc09, cc09) LDF [AO + 6 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 7 * SIZE], a4 FMADD3 (aa1, bb6, cc11, cc11) nop FMADD4 (aa2, bb6, cc12, cc12) nop FMADD1 (aa1, bb7, cc13, cc13) LDF [BO + 28 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO + 29 * SIZE], b6 FMADD3 (aa1, bb8, cc15, cc15) LDF [BO + 30 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO + 31 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 40 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 33 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 34 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 35 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 16 * SIZE], a1 /****/ FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 9 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) nop FMADD4 (aa4, bb6, cc12, cc12) nop FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 36 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 37 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 38 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) LDF [BO + 39 * SIZE], b8 FMADD1 (aa5, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) 
FMADD3 (aa5, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa5, bb3, cc05, cc05) LDF [BO + 48 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 41 * SIZE], b2 FMADD3 (aa5, bb4, cc07, cc07) LDF [BO + 42 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 43 * SIZE], b4 FMADD1 (aa5, bb5, cc09, cc09) LDF [AO + 10 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 11 * SIZE], a4 FMADD3 (aa5, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY FMADD4 (aa2, bb6, cc12, cc12) nop FMADD1 (aa5, bb7, cc13, cc13) LDF [BO + 44 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO + 45 * SIZE], b6 FMADD3 (aa5, bb8, cc15, cc15) LDF [BO + 46 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO + 47 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 56 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 49 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 50 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 51 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 12 * SIZE], a5 FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 13 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) cmp L, 0 FMADD4 (aa4, bb6, cc12, cc12) nop FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 52 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 53 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 54 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) LDF [BO + 55 * SIZE], b8 FMADD1 (aa5, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa5, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa5, bb3, cc05, cc05) LDF [BO + 64 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 57 * SIZE], b2 FMADD3 (aa5, bb4, cc07, cc07) LDF [BO + 58 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 59 * SIZE], b4 FMADD1 (aa5, bb5, cc09, cc09) LDF [AO + 14 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 15 * SIZE], a4 FMADD3 (aa5, bb6, cc11, cc11) add BO, 64 * SIZE, BO FMADD4 (aa2, bb6, cc12, cc12) add AO, 16 * SIZE, AO FMADD1 (aa5, bb7, cc13, cc13) LDF [BO - 4 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO - 3 * SIZE], b6 FMADD3 (aa5, bb8, cc15, cc15) LDF [BO - 2 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO - 1 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 8 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 1 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 2 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 8 * SIZE], a5 /****/ FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 1 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) FMADD4 (aa4, bb6, cc12, cc12) FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 4 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 5 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 6 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) bg,pt %icc, .LL13 LDF [BO + 7 * SIZE], b8 .align 4 .LL15: #if defined(LT) || defined(RN) and KK, 7, L #else sub K, KK, L and L, 7, L #endif cmp L, 0 ble,a,pn %icc, .LL18 nop .align 4 .LL17: FMADD1 (aa1, bb1, cc01, cc01) add L, -1, L FMADD2 (aa2, bb1, cc02, cc02) nop FMADD3 (aa1, bb2, cc03, cc03) LDF [BO + 8 * SIZE], b1 FMADD4 (aa2, bb2, cc04, cc04) LDF [BO + 9 * SIZE], b2 FMADD1 (aa1, bb3, cc05, cc05) cmp L, 0 FMADD2 (aa2, bb3, cc06, cc06) nop FMADD3 (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD1 (aa1, bb5, cc09, cc09) nop FMADD2 
(aa2, bb5, cc10, cc10) nop FMADD3 (aa1, bb6, cc11, cc11) LDF [BO + 12 * SIZE], b5 FMADD4 (aa2, bb6, cc12, cc12) LDF [BO + 13 * SIZE], b6 FMADD1 (aa1, bb7, cc13, cc13) add AO, 2 * SIZE, AO FMADD2 (aa2, bb7, cc14, cc14) add BO, 8 * SIZE, BO FMADD3 (aa1, bb8, cc15, cc15) LDF [AO + 0 * SIZE], a1 FMADD4 (aa2, bb8, cc16, cc16) LDF [AO + 1 * SIZE], a2 LDF [BO + 6 * SIZE], b7 bg,pt %icc, .LL17 LDF [BO + 7 * SIZE], b8 nop .align 4 .LL18: FADD c01, c04, c01 FADD c02, c03, c02 FADD c05, c08, c05 FADD c06, c07, c06 FADD c09, c12, c09 FADD c10, c11, c10 FADD c13, c16, c13 FADD c14, c15, c14 #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 4, TEMP1 #endif sll TEMP1, ZBASE_SHIFT + 0, TEMP2 sll TEMP1, ZBASE_SHIFT + 2, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 #endif FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c05, c05 FSUB a4, c06, c06 FSUB b1, c09, c09 FSUB b2, c10, c10 FSUB b3, c13, c13 FSUB b4, c14, c14 #if defined(LN) || defined(LT) LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FMUL a1, c01, b1 FMUL a2, c01, b2 FMUL a1, c05, b3 FMUL a2, c05, b4 FMUL a1, c09, b5 FMUL a2, c09, b6 FMUL a1, c13, b7 FMUL a2, c13, b8 #ifndef CONJ FNMSUB (aa2, cc02, bb1, cc01) FMADD (aa1, cc02, bb2, cc02) FNMSUB (aa2, cc06, bb3, cc05) FMADD (aa1, cc06, bb4, cc06) FNMSUB (aa2, cc10, bb5, cc09) FMADD (aa1, cc10, bb6, cc10) FNMSUB (aa2, cc14, bb7, cc13) FMADD (aa1, cc14, bb8, cc14) #else FMADD (aa2, cc02, bb1, cc01) FMSUB (aa1, cc02, bb2, cc02) FMADD (aa2, cc06, bb3, cc05) FMSUB (aa1, cc06, bb4, cc06) FMADD (aa2, cc10, bb5, cc09) FMSUB (aa1, cc10, bb6, cc10) FMADD (aa2, cc14, bb7, cc13) FMSUB (aa1, cc14, bb8, cc14) #endif #endif #ifdef RN LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 LDF [BO + 3 * SIZE], b4 LDF [BO + 4 * SIZE], b5 LDF [BO + 5 * SIZE], b6 LDF [BO + 6 * SIZE], b7 LDF [BO + 7 * SIZE], b8 FMUL b1, c01, a1 FMUL b2, c01, a2 #ifndef CONJ FNMSUB (bb2, cc02, aa1, cc01) FMADD (bb1, cc02, aa2, cc02) #else FMADD (bb2, cc02, aa1, cc01) FMSUB (bb1, cc02, aa2, cc02) #endif FNMSUB (bb3, cc01, cc05, cc05) FNMSUB (bb3, cc02, cc06, cc06) FNMSUB (bb5, cc01, cc09, cc09) FNMSUB (bb5, cc02, cc10, cc10) FNMSUB (bb7, cc01, cc13, cc13) FNMSUB (bb7, cc02, cc14, cc14) #ifndef CONJ FMADD (bb4, cc02, cc05, cc05) FNMSUB (bb4, cc01, cc06, cc06) FMADD (bb6, cc02, cc09, cc09) FNMSUB (bb6, cc01, cc10, cc10) FMADD (bb8, cc02, cc13, cc13) FNMSUB (bb8, cc01, cc14, cc14) #else FNMSUB (bb4, cc02, cc05, cc05) FMADD (bb4, cc01, cc06, cc06) FNMSUB (bb6, cc02, cc09, cc09) FMADD (bb6, cc01, cc10, cc10) FNMSUB (bb8, cc02, cc13, cc13) FMADD (bb8, cc01, cc14, cc14) #endif LDF [BO + 10 * SIZE], b1 LDF [BO + 11 * SIZE], b2 LDF [BO + 12 * SIZE], b3 LDF [BO + 13 * SIZE], b4 LDF [BO + 14 * SIZE], b5 LDF [BO + 15 * SIZE], b6 FMUL b1, c05, a1 FMUL b2, c05, a2 #ifndef CONJ FNMSUB (bb2, cc06, aa1, cc05) FMADD (bb1, cc06, aa2, cc06) #else FMADD (bb2, cc06, aa1, cc05) FMSUB (bb1, cc06, aa2, cc06) #endif FNMSUB (bb3, cc05, cc09, cc09) FNMSUB (bb3, cc06, cc10, cc10) FNMSUB (bb5, cc05, cc13, cc13) FNMSUB (bb5, cc06, cc14, cc14) #ifndef CONJ FMADD (bb4, cc06, cc09, cc09) FNMSUB (bb4, cc05, cc10, 
cc10) FMADD (bb6, cc06, cc13, cc13) FNMSUB (bb6, cc05, cc14, cc14) #else FNMSUB (bb4, cc06, cc09, cc09) FMADD (bb4, cc05, cc10, cc10) FNMSUB (bb6, cc06, cc13, cc13) FMADD (bb6, cc05, cc14, cc14) #endif LDF [BO + 20 * SIZE], b1 LDF [BO + 21 * SIZE], b2 LDF [BO + 22 * SIZE], b3 LDF [BO + 23 * SIZE], b4 FMUL b1, c09, a1 FMUL b2, c09, a2 #ifndef CONJ FNMSUB (bb2, cc10, aa1, cc09) FMADD (bb1, cc10, aa2, cc10) #else FMADD (bb2, cc10, aa1, cc09) FMSUB (bb1, cc10, aa2, cc10) #endif FNMSUB (bb3, cc09, cc13, cc13) FNMSUB (bb3, cc10, cc14, cc14) #ifndef CONJ FMADD (bb4, cc10, cc13, cc13) FNMSUB (bb4, cc09, cc14, cc14) #else FNMSUB (bb4, cc10, cc13, cc13) FMADD (bb4, cc09, cc14, cc14) #endif LDF [BO + 30 * SIZE], b1 LDF [BO + 31 * SIZE], b2 FMUL b1, c13, a1 FMUL b2, c13, a2 #ifndef CONJ FNMSUB (bb2, cc14, aa1, cc13) FMADD (bb1, cc14, aa2, cc14) #else FMADD (bb2, cc14, aa1, cc13) FMSUB (bb1, cc14, aa2, cc14) #endif #endif #ifdef RT LDF [BO + 30 * SIZE], b1 LDF [BO + 31 * SIZE], b2 LDF [BO + 28 * SIZE], b3 LDF [BO + 29 * SIZE], b4 LDF [BO + 26 * SIZE], b5 LDF [BO + 27 * SIZE], b6 LDF [BO + 24 * SIZE], b7 LDF [BO + 25 * SIZE], b8 FMUL b1, c13, a1 FMUL b2, c13, a2 #ifndef CONJ FNMSUB (bb2, cc14, aa1, cc13) FMADD (bb1, cc14, aa2, cc14) #else FMADD (bb2, cc14, aa1, cc13) FMSUB (bb1, cc14, aa2, cc14) #endif FNMSUB (bb3, cc13, cc09, cc09) FNMSUB (bb3, cc14, cc10, cc10) FNMSUB (bb5, cc13, cc05, cc05) FNMSUB (bb5, cc14, cc06, cc06) FNMSUB (bb7, cc13, cc01, cc01) FNMSUB (bb7, cc14, cc02, cc02) #ifndef CONJ FMADD (bb4, cc14, cc09, cc09) FNMSUB (bb4, cc13, cc10, cc10) FMADD (bb6, cc14, cc05, cc05) FNMSUB (bb6, cc13, cc06, cc06) FMADD (bb8, cc14, cc01, cc01) FNMSUB (bb8, cc13, cc02, cc02) #else FNMSUB (bb4, cc14, cc09, cc09) FMADD (bb4, cc13, cc10, cc10) FNMSUB (bb6, cc14, cc05, cc05) FMADD (bb6, cc13, cc06, cc06) FNMSUB (bb8, cc14, cc01, cc01) FMADD (bb8, cc13, cc02, cc02) #endif LDF [BO + 20 * SIZE], b1 LDF [BO + 21 * SIZE], b2 LDF [BO + 18 * SIZE], b3 LDF [BO + 19 * SIZE], b4 LDF [BO + 16 * SIZE], b5 LDF [BO + 17 * SIZE], b6 FMUL b1, c09, a1 FMUL b2, c09, a2 #ifndef CONJ FNMSUB (bb2, cc10, aa1, cc09) FMADD (bb1, cc10, aa2, cc10) #else FMADD (bb2, cc10, aa1, cc09) FMSUB (bb1, cc10, aa2, cc10) #endif FNMSUB (bb3, cc09, cc05, cc05) FNMSUB (bb3, cc10, cc06, cc06) FNMSUB (bb5, cc09, cc01, cc01) FNMSUB (bb5, cc10, cc02, cc02) #ifndef CONJ FMADD (bb4, cc10, cc05, cc05) FNMSUB (bb4, cc09, cc06, cc06) FMADD (bb6, cc10, cc01, cc01) FNMSUB (bb6, cc09, cc02, cc02) #else FNMSUB (bb4, cc10, cc05, cc05) FMADD (bb4, cc09, cc06, cc06) FNMSUB (bb6, cc10, cc01, cc01) FMADD (bb6, cc09, cc02, cc02) #endif LDF [BO + 10 * SIZE], b1 LDF [BO + 11 * SIZE], b2 LDF [BO + 8 * SIZE], b3 LDF [BO + 9 * SIZE], b4 FMUL b1, c05, a1 FMUL b2, c05, a2 #ifndef CONJ FNMSUB (bb2, cc06, aa1, cc05) FMADD (bb1, cc06, aa2, cc06) #else FMADD (bb2, cc06, aa1, cc05) FMSUB (bb1, cc06, aa2, cc06) #endif FNMSUB (bb3, cc05, cc01, cc01) FNMSUB (bb3, cc06, cc02, cc02) #ifndef CONJ FMADD (bb4, cc06, cc01, cc01) FNMSUB (bb4, cc05, cc02, cc02) #else FNMSUB (bb4, cc06, cc01, cc01) FMADD (bb4, cc05, cc02, cc02) #endif LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FMUL b1, c01, a1 FMUL b2, c01, a2 #ifndef CONJ FNMSUB (bb2, cc02, aa1, cc01) FMADD (bb1, cc02, aa2, cc02) #else FMADD (bb2, cc02, aa1, cc01) FMSUB (bb1, cc02, aa2, cc02) #endif #endif #ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 add C3, -2 * SIZE, C3 add C4, -2 * SIZE, C4 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c06, [BO + 3 
* SIZE] STF c09, [BO + 4 * SIZE] STF c10, [BO + 5 * SIZE] STF c13, [BO + 6 * SIZE] STF c14, [BO + 7 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c05, [AO + 2 * SIZE] STF c06, [AO + 3 * SIZE] STF c09, [AO + 4 * SIZE] STF c10, [AO + 5 * SIZE] STF c13, [AO + 6 * SIZE] STF c14, [AO + 7 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c05, [C2 + 0 * SIZE] STF c06, [C2 + 1 * SIZE] STF c09, [C3 + 0 * SIZE] STF c10, [C3 + 1 * SIZE] STF c13, [C4 + 0 * SIZE] STF c14, [C4 + 1 * SIZE] #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 add C3, 2 * SIZE, C3 add C4, 2 * SIZE, C4 #endif #ifdef RT sll K, ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, ZBASE_SHIFT + 0, TEMP2 sll TEMP1, ZBASE_SHIFT + 2, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL12 nop #ifdef LN sll K, ZBASE_SHIFT + 2, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 4, KK #endif #ifdef RT sub KK, 4, KK #endif add J, -1, J cmp J, 0 bg,pt %icc, .LL11 nop .align 4 .LL20: and N, 2, J cmp J, 0 ble,pn %icc, .LL30 nop #ifdef RT sll K, ZBASE_SHIFT + 1, TEMP1 sub B, TEMP1, B #endif #ifndef RT mov C, C1 add C, LDC, C2 add C2, LDC, C #else sub C, LDC, C2 sub C2, LDC, C1 sub C2, LDC, C #endif #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif mov M, I .align 4 .LL22: #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, ZBASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, ZBASE_SHIFT + 0, TEMP1 sll KK, ZBASE_SHIFT + 1, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 LDF [BO + 3 * SIZE], b4 LDF [BO + 4 * SIZE], b5 FCLR (cc01) LDF [BO + 5 * SIZE], b6 FCLR (cc02) LDF [BO + 6 * SIZE], b7 FCLR (cc03) LDF [BO + 7 * SIZE], b8 FCLR (cc04) LDF [BO + 8 * SIZE], b9 FCLR (cc05) prefetch [C1 + 2 * SIZE], 3 FCLR (cc06) prefetch [C2 + 2 * SIZE], 3 FCLR (cc07) #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL25 FCLR (cc08) .align 4 .LL23: FMADD1 (aa1, bb1, cc01, cc01) LDF [AO + 2 * SIZE], a3 FMADD2 (aa2, bb1, cc02, cc02) LDF [AO + 3 * SIZE], a4 FMADD3 (aa1, bb2, cc03, cc03) LDF [BO + 16 * SIZE], b1 FMADD4 (aa2, bb2, cc04, cc04) LDF [BO + 9 * SIZE], b2 FMADD1 (aa1, bb3, cc05, cc05) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD2 (aa2, bb3, cc06, cc06) add L, -1, L FMADD3 (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD1 (aa3, bb5, cc01, cc01) LDF [AO + 4 * SIZE], a1 FMADD2 (aa4, bb5, cc02, cc02) LDF [AO + 5 * SIZE], a2 FMADD3 (aa3, bb6, cc03, cc03) LDF [BO + 12 * SIZE], b5 FMADD4 (aa4, bb6, cc04, cc04) LDF [BO + 13 * SIZE], b6 FMADD1 (aa3, bb7, cc05, cc05) cmp L, 0 FMADD2 (aa4, bb7, cc06, cc06) add AO, 8 * SIZE, AO FMADD3 (aa3, bb8, cc07, cc07) LDF [BO + 14 * SIZE], b7 FMADD4 (aa4, bb8, cc08, cc08) LDF [BO + 15 * SIZE], b8 FMADD1 (aa1, bb9, cc01, cc01) LDF [AO - 2 * SIZE], a3 FMADD2 (aa2, bb9, cc02, cc02) LDF [AO - 1 * SIZE], a4 FMADD3 (aa1, bb2, cc03, cc03) LDF [BO + 24 * SIZE], b9 FMADD4 (aa2, bb2, cc04, cc04) LDF [BO + 17 * SIZE], b2 FMADD1 (aa1, bb3, cc05, cc05) add BO, 16 * SIZE, BO FMADD2 (aa2, bb3, cc06, cc06) nop FMADD3 (aa1, bb4, cc07, cc07) LDF [BO + 2 
* SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD1 (aa3, bb5, cc01, cc01) LDF [AO + 0 * SIZE], a1 FMADD2 (aa4, bb5, cc02, cc02) LDF [AO + 1 * SIZE], a2 FMADD3 (aa3, bb6, cc03, cc03) LDF [BO + 4 * SIZE], b5 FMADD4 (aa4, bb6, cc04, cc04) LDF [BO + 5 * SIZE], b6 FMADD1 (aa3, bb7, cc05, cc05) nop FMADD2 (aa4, bb7, cc06, cc06) LDF [BO + 6 * SIZE], b7 FMADD3 (aa3, bb8, cc07, cc07) FMADD4 (aa4, bb8, cc08, cc08) bg,pt %icc, .LL23 LDF [BO + 7 * SIZE], b8 .align 4 .LL25: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL28 nop .align 4 .LL27: FMADD1 (aa1, bb1, cc01, cc01) add L, -1, L FMADD2 (aa2, bb1, cc02, cc02) LDF [BO + 4 * SIZE], b1 FMADD3 (aa1, bb2, cc03, cc03) add AO, 2 * SIZE, AO FMADD4 (aa2, bb2, cc04, cc04) LDF [BO + 5 * SIZE], b2 FMADD1 (aa1, bb3, cc05, cc05) cmp L, 0 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 6 * SIZE], b3 FMADD3 (aa1, bb4, cc07, cc07) LDF [AO + 0 * SIZE], a1 FMADD4 (aa2, bb4, cc08, cc08) LDF [AO + 1 * SIZE], a2 LDF [BO + 7 * SIZE], b4 bg,pt %icc, .LL27 add BO, 4 * SIZE, BO .align 4 .LL28: FADD c01, c04, c01 FADD c02, c03, c02 FADD c05, c08, c05 FADD c06, c07, c06 #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 2, TEMP1 #endif sll TEMP1, ZBASE_SHIFT + 0, TEMP2 sll TEMP1, ZBASE_SHIFT + 1, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 #endif FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c05, c05 FSUB a4, c06, c06 #if defined(LN) || defined(LT) LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FMUL a1, c01, b1 FMUL a2, c01, b2 FMUL a1, c05, b3 FMUL a2, c05, b4 #ifndef CONJ FNMSUB (aa2, cc02, bb1, cc01) FMADD (aa1, cc02, bb2, cc02) FNMSUB (aa2, cc06, bb3, cc05) FMADD (aa1, cc06, bb4, cc06) #else FMADD (aa2, cc02, bb1, cc01) FMSUB (aa1, cc02, bb2, cc02) FMADD (aa2, cc06, bb3, cc05) FMSUB (aa1, cc06, bb4, cc06) #endif #endif #ifdef RN LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 LDF [BO + 3 * SIZE], b4 FMUL b1, c01, a1 FMUL b2, c01, a2 #ifndef CONJ FNMSUB (bb2, cc02, aa1, cc01) FMADD (bb1, cc02, aa2, cc02) #else FMADD (bb2, cc02, aa1, cc01) FMSUB (bb1, cc02, aa2, cc02) #endif FNMSUB (bb3, cc01, cc05, cc05) FNMSUB (bb3, cc02, cc06, cc06) #ifndef CONJ FMADD (bb4, cc02, cc05, cc05) FNMSUB (bb4, cc01, cc06, cc06) #else FNMSUB (bb4, cc02, cc05, cc05) FMADD (bb4, cc01, cc06, cc06) #endif LDF [BO + 6 * SIZE], b1 LDF [BO + 7 * SIZE], b2 FMUL b1, c05, a1 FMUL b2, c05, a2 #ifndef CONJ FNMSUB (bb2, cc06, aa1, cc05) FMADD (bb1, cc06, aa2, cc06) #else FMADD (bb2, cc06, aa1, cc05) FMSUB (bb1, cc06, aa2, cc06) #endif #endif #ifdef RT LDF [BO + 6 * SIZE], b1 LDF [BO + 7 * SIZE], b2 LDF [BO + 4 * SIZE], b3 LDF [BO + 5 * SIZE], b4 FMUL b1, c05, a1 FMUL b2, c05, a2 #ifndef CONJ FNMSUB (bb2, cc06, aa1, cc05) FMADD (bb1, cc06, aa2, cc06) #else FMADD (bb2, cc06, aa1, cc05) FMSUB (bb1, cc06, aa2, cc06) #endif FNMSUB (bb3, cc05, cc01, cc01) FNMSUB (bb3, cc06, cc02, cc02) #ifndef CONJ FMADD (bb4, cc06, cc01, cc01) FNMSUB (bb4, cc05, cc02, cc02) #else FNMSUB (bb4, cc06, cc01, cc01) FMADD (bb4, cc05, cc02, cc02) #endif LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FMUL b1, c01, a1 FMUL b2, c01, a2 #ifndef CONJ FNMSUB (bb2, cc02, aa1, cc01) FMADD (bb1, cc02, aa2, cc02) #else FMADD (bb2, cc02, aa1, cc01) FMSUB (bb1, cc02, aa2, cc02) #endif #endif 
#ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c06, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c05, [AO + 2 * SIZE] STF c06, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c05, [C2 + 0 * SIZE] STF c06, [C2 + 1 * SIZE] #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 #endif #ifdef RT sll K, ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, ZBASE_SHIFT + 0, TEMP2 sll TEMP1, ZBASE_SHIFT + 1, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL22 nop #ifdef LN sll K, ZBASE_SHIFT + 1, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 2, KK #endif #ifdef RT sub KK, 2, KK #endif .align 4 .LL30: and N, 1, J cmp J, 0 ble,pn %icc, .LL999 nop #ifdef RT sll K, ZBASE_SHIFT, TEMP1 sub B, TEMP1, B #endif #ifndef RT mov C, C1 add C, LDC, C #else sub C, LDC, C1 sub C, LDC, C #endif #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif mov M, I .align 4 .LL32: #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, ZBASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, ZBASE_SHIFT + 0, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 FCLR (cc01) LDF [BO + 3 * SIZE], b4 FCLR (cc02) LDF [BO + 4 * SIZE], b5 FCLR (cc03) LDF [BO + 5 * SIZE], b6 FCLR (cc04) LDF [BO + 6 * SIZE], b7 FCLR (cc05) LDF [BO + 7 * SIZE], b8 FCLR (cc06) prefetch [C1 + 2 * SIZE], 3 FCLR (cc07) #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL35 FCLR (cc08) .align 4 .LL33: FMADD1 (aa1, bb1, cc01, cc01) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD2 (aa2, bb1, cc02, cc02) LDF [BO + 8 * SIZE], b1 FMADD3 (aa1, bb2, cc03, cc03) LDF [AO + 4 * SIZE], a1 FMADD4 (aa2, bb2, cc04, cc04) LDF [AO + 5 * SIZE], a2 FMADD1 (aa3, bb3, cc01, cc01) LDF [BO + 9 * SIZE], b2 FMADD2 (aa4, bb3, cc02, cc02) LDF [BO + 10 * SIZE], b3 FMADD3 (aa3, bb4, cc03, cc03) LDF [AO + 6 * SIZE], a3 FMADD4 (aa4, bb4, cc04, cc04) LDF [AO + 7 * SIZE], a4 FMADD1 (aa1, bb5, cc01, cc01) LDF [BO + 11 * SIZE], b4 FMADD2 (aa2, bb5, cc02, cc02) LDF [BO + 12 * SIZE], b5 FMADD3 (aa1, bb6, cc03, cc03) LDF [AO + 8 * SIZE], a1 FMADD4 (aa2, bb6, cc04, cc04) LDF [AO + 9 * SIZE], a2 FMADD1 (aa3, bb7, cc01, cc01) LDF [BO + 13 * SIZE], b6 FMADD2 (aa4, bb7, cc02, cc02) LDF [BO + 14 * SIZE], b7 FMADD3 (aa3, bb8, cc03, cc03) LDF [AO + 10 * SIZE], a3 FMADD4 (aa4, bb8, cc04, cc04) LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO add L, -1, L add BO, 8 * SIZE, BO cmp L, 0 bg,pt %icc, .LL33 LDF [BO + 7 * SIZE], b8 .align 4 .LL35: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL38 nop .align 4 .LL37: FMADD1 (aa1, bb1, cc01, cc01) add L, -1, L FMADD2 (aa2, bb1, cc02, cc02) LDF [BO + 2 * SIZE], b1 FMADD3 (aa1, bb2, cc03, cc03) LDF [AO + 2 * SIZE], a1 FMADD4 (aa2, bb2, cc04, cc04) LDF [AO + 3 * SIZE], a2 add AO, 2 * SIZE, AO cmp L, 0 add BO, 2 * SIZE, BO bg,pt %icc, .LL37 LDF [BO + 1 * SIZE], b2 .align 4 .LL38: FADD c01, c04, c01 
FADD c02, c03, c02 #if defined(LN) || defined(RT) sub KK, 1, TEMP1 sll TEMP1, ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 #endif FSUB a1, c01, c01 FSUB a2, c02, c02 #if defined(LN) || defined(LT) LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 #else LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 #endif FMUL a1, c01, b1 FMUL a2, c01, b2 #ifndef CONJ FNMSUB (aa2, cc02, bb1, cc01) FMADD (aa1, cc02, bb2, cc02) #else FMADD (aa2, cc02, bb1, cc01) FMSUB (aa1, cc02, bb2, cc02) #endif #ifdef LN add C1, -2 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] #ifndef LN add C1, 2 * SIZE, C1 #endif #ifdef RT sll K, ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, ZBASE_SHIFT, TEMP1 add AO, TEMP1, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL32 nop #ifdef LN sll K, ZBASE_SHIFT, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 1, KK #endif #ifdef RT sub KK, 1, KK #endif .align 4 .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/ztrsm_kernel_RT.S000066400000000000000000001101331313527062700210030ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %i0 #define N %i1 #define K %i2 #define A %i5 #define B %i3 #define C %i4 #define LDC %o0 #define AO %o1 #define BO %o2 #define I %o3 #define J %o4 #define L %o5 #define C1 %l0 #define C2 %l1 #define OFFSET %l2 #define KK %l3 #define TEMP1 %l4 #define TEMP2 %l5 #define AORIG %l6 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #define t1 %f32 #define t2 %f34 #define t3 %f36 #define t4 %f38 #define a1 %f40 #define a2 %f42 #define a3 %f44 #define a4 %f46 #define a5 %f62 #define b1 %f48 #define b2 %f50 #define b3 %f52 #define b4 %f54 #define b5 %f56 #define FZERO %f58 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #define t1 %f16 #define t2 %f17 #define t3 %f18 #define t4 %f19 #define a1 %f20 #define a2 %f21 #define a3 %f22 #define a4 %f23 #define a5 %f31 #define b1 %f24 #define b2 %f25 #define b3 %f26 #define b4 %f27 #define b5 %f28 #define FZERO %f29 #endif #define t5 c13 #define t6 c14 #define t7 c15 #define t8 c16 #ifndef CONJ #define FADD1 FADD #define FADD2 FADD #define FADD3 FADD #define FADD4 FSUB #else #if defined(LN) || defined(LT) #define FADD1 FADD #define FADD2 FSUB #define FADD3 FADD #define FADD4 FADD #endif #if defined(RN) || defined(RT) #define FADD1 FADD #define FADD2 FADD #define FADD3 FSUB #define FADD4 FADD #endif #endif #define APREFETCHSIZE 40 #define BPREFETCHSIZE 40 #define APREFETCH_CATEGORY 0 #define BPREFETCH_CATEGORY 0 PROLOGUE SAVESP #ifndef __64BIT__ #ifdef DOUBLE ld [%sp + STACK_START + 32], A ld [%sp + STACK_START + 36], B ld [%sp + STACK_START + 40], C ld [%sp + STACK_START + 44], LDC ld [%sp + STACK_START + 48], OFFSET #else ld [%sp + STACK_START + 28], B ld [%sp + STACK_START + 32], C ld [%sp + STACK_START + 36], LDC ld [%sp + STACK_START + 40], OFFSET #endif #else ldx [%sp+ STACK_START + 56], B ldx [%sp+ STACK_START + 64], C ldx [%sp+ STACK_START + 72], LDC ldx [%sp+ STACK_START + 80], OFFSET #endif #ifdef DOUBLE FCLR(27) #else FCLR(29) #endif sll LDC, ZBASE_SHIFT, LDC #ifdef LN smul M, K, TEMP1 sll TEMP1, ZBASE_SHIFT, TEMP1 add A, TEMP1, A sll M, ZBASE_SHIFT, TEMP1 add C, TEMP1, C #endif #ifdef RN neg OFFSET, KK #endif #ifdef RT smul N, K, TEMP1 sll TEMP1, ZBASE_SHIFT, TEMP1 add B, TEMP1, B smul N, LDC, TEMP1 add C, TEMP1, C sub N, OFFSET, KK #endif and N, 1, J cmp J, 0 ble,pn %icc, .LL100 nop #ifdef RT sll K, 0 + ZBASE_SHIFT, TEMP1 sub B, TEMP1, B sub C, LDC, C #endif mov C, C1 #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif #ifndef RT add C, LDC, C #endif sra M, 1, I cmp I, 0 ble,pn %icc, .LL150 FMOV FZERO, c03 .LL121: #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 1 + ZBASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 1 + ZBASE_SHIFT, TEMP1 sll KK, 0 + ZBASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif FMOV FZERO, c03 LDF [AO + 0 * SIZE], a1 FMOV FZERO, t1 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c07 LDF [AO 
+ 1 * SIZE], a2 FMOV FZERO, t2 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c04 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t3 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c08 LDF [AO + 3 * SIZE], a4 FMOV FZERO, t4 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c01 prefetch [C1 + 3 * SIZE], 3 FMOV FZERO, c05 FMOV FZERO, c02 ble,pn %icc, .LL125 FMOV FZERO, c06 .LL122: FADD1 c03, t1, c03 add L, -1, L FMUL a1, b1, t1 prefetch [AO + APREFETCHSIZE * SIZE], 0 FADD3 c07, t2, c07 add BO, 8 * SIZE, BO FMUL a1, b2, t2 LDF [AO + 4 * SIZE], a1 FADD2 c04, t3, c04 add AO, 16 * SIZE, AO FMUL a2, b1, t3 cmp L, 0 FADD4 c08, t4, c08 nop FMUL a2, b2, t4 LDF [AO - 11 * SIZE], a2 FADD1 c01, t1, c01 nop FMUL a3, b1, t1 nop FADD3 c05, t2, c05 nop FMUL a3, b2, t2 LDF [AO - 10 * SIZE], a3 FADD2 c02, t3, c02 nop FMUL a4, b1, t3 LDF [BO - 4 * SIZE], b1 FADD4 c06, t4, c06 nop FMUL a4, b2, t4 LDF [BO - 3 * SIZE], b2 FADD1 c03, t1, c03 nop FMUL a1, b3, t1 LDF [AO - 9 * SIZE], a4 FADD3 c07, t2, c07 nop FMUL a1, b4, t2 LDF [AO - 8 * SIZE], a1 FADD2 c04, t3, c04 nop FMUL a2, b3, t3 nop FADD4 c08, t4, c08 nop FMUL a2, b4, t4 LDF [AO - 7 * SIZE], a2 FADD1 c01, t1, c01 nop FMUL a3, b3, t1 nop FADD3 c05, t2, c05 nop FMUL a3, b4, t2 LDF [AO - 6 * SIZE], a3 FADD2 c02, t3, c02 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD4 c06, t4, c06 nop FMUL a4, b4, t4 LDF [BO - 1 * SIZE], b4 FADD1 c03, t1, c03 nop FMUL a1, b1, t1 LDF [AO - 5 * SIZE], a4 FADD3 c07, t2, c07 nop FMUL a1, b2, t2 LDF [AO - 4 * SIZE], a1 FADD2 c04, t3, c04 nop FMUL a2, b1, t3 nop FADD4 c08, t4, c08 nop FMUL a2, b2, t4 LDF [AO - 3 * SIZE], a2 FADD1 c01, t1, c01 nop FMUL a3, b1, t1 nop FADD3 c05, t2, c05 nop FMUL a3, b2, t2 LDF [AO - 2 * SIZE], a3 FADD2 c02, t3, c02 nop FMUL a4, b1, t3 LDF [BO + 0 * SIZE], b1 FADD4 c06, t4, c06 nop FMUL a4, b2, t4 LDF [BO + 1 * SIZE], b2 FADD1 c03, t1, c03 nop FMUL a1, b3, t1 LDF [AO - 1 * SIZE], a4 FADD3 c07, t2, c07 nop FMUL a1, b4, t2 LDF [AO + 0 * SIZE], a1 FADD2 c04, t3, c04 nop FMUL a2, b3, t3 nop FADD4 c08, t4, c08 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD1 c01, t1, c01 nop FMUL a3, b3, t1 nop FADD3 c05, t2, c05 nop FMUL a3, b4, t2 LDF [AO + 2 * SIZE], a3 FADD2 c02, t3, c02 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD4 c06, t4, c06 FMUL a4, b4, t4 LDF [AO + 3 * SIZE], a4 bg,pt %icc, .LL122 LDF [BO + 3 * SIZE], b4 .LL125: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL129 nop .LL126: FADD1 c03, t1, c03 add AO, 4 * SIZE, AO FMUL a1, b1, t1 add BO, 2 * SIZE, BO FADD3 c07, t2, c07 add L, -1, L FMUL a1, b2, t2 LDF [AO + 0 * SIZE], a1 FADD2 c04, t3, c04 cmp L, 0 FMUL a2, b1, t3 FADD4 c08, t4, c08 FMUL a2, b2, t4 LDF [AO + 1 * SIZE], a2 FADD1 c01, t1, c01 FMUL a3, b1, t1 FADD3 c05, t2, c05 FMUL a3, b2, t2 LDF [AO + 2 * SIZE], a3 FADD2 c02, t3, c02 FMUL a4, b1, t3 LDF [BO + 0 * SIZE], b1 FADD4 c06, t4, c06 FMUL a4, b2, t4 LDF [BO + 1 * SIZE], b2 bg,pt %icc, .LL126 LDF [AO + 3 * SIZE], a4 .LL129: FADD1 c03, t1, c03 FADD3 c07, t2, c07 FADD2 c04, t3, c04 FADD4 c08, t4, c08 FADD c01, c06, c01 FADD c02, c05, c02 FADD c03, c08, c03 FADD c04, c07, c04 #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 1, TEMP1 #endif sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * 
SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 #endif #ifdef LN LDF [AO + 6 * SIZE], a1 LDF [AO + 7 * SIZE], a2 LDF [AO + 4 * SIZE], a3 LDF [AO + 5 * SIZE], a4 LDF [AO + 0 * SIZE], b1 LDF [AO + 1 * SIZE], b2 FMUL a1, c03, t1 FMUL a2, c04, t2 FMUL a1, c04, t3 FMUL a2, c03, t4 FADD4 t1, t2, c03 FADD2 t3, t4, c04 FMUL a3, c03, t1 FMUL a3, c04, t2 FMUL a4, c04, t5 FMUL a4, c03, t6 FSUB c01, t1, c01 FSUB c02, t2, c02 FADD2 c01, t5, c01 FADD4 c02, t6, c02 FMUL b1, c01, t1 FMUL b2, c02, t2 FMUL b1, c02, t3 FMUL b2, c01, t4 FADD4 t1, t2, c01 FADD2 t3, t4, c02 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 6 * SIZE], b1 LDF [AO + 7 * SIZE], b2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FADD4 t1, t2, c01 FADD2 t3, t4, c02 FMUL a3, c01, t1 FMUL a3, c02, t2 FMUL a4, c02, t5 FMUL a4, c01, t6 FSUB c03, t1, c03 FSUB c04, t2, c04 FADD2 c03, t5, c03 FADD4 c04, t6, c04 FMUL b1, c03, t1 FMUL b2, c04, t2 FMUL b1, c04, t3 FMUL b2, c03, t4 FADD4 t1, t2, c03 FADD2 t3, t4, c04 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FMUL a1, c03, t5 FMUL a2, c04, t6 FMUL a1, c04, t7 FMUL a2, c03, t8 FADD4 t1, t2, c01 FADD3 t3, t4, c02 FADD4 t5, t6, c03 FADD3 t7, t8, c04 #endif #ifdef RT LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FMUL a1, c03, t5 FMUL a2, c04, t6 FMUL a1, c04, t7 FMUL a2, c03, t8 FADD4 t1, t2, c01 FADD3 t3, t4, c02 FADD4 t5, t6, c03 FADD3 t7, t8, c04 #endif #ifdef LN add C1, -4 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c03, [BO + 2 * SIZE] STF c04, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C1 + 2 * SIZE] STF c04, [C1 + 3 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 4 * SIZE, C1 #endif #ifdef RT sll K, 1 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL121 FMOV FZERO, c03 .LL150: and M, 1, I cmp I, 0 ble,pn %icc, .LL199 nop #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 0 + ZBASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 0 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, c01 LDF [BO + 0 * SIZE], b1 FMOV FZERO, t1 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c02 LDF [BO + 1 * SIZE], b2 FMOV FZERO, t2 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c03 LDF [BO + 2 * SIZE], b3 FMOV FZERO, t3 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c04 LDF [BO + 3 * SIZE], b4 FMOV FZERO, t4 ble,pn %icc, .LL155 nop .LL152: FADD1 c01, t1, c01 add L, -1, L FMUL a1, b1, t1 prefetch [AO + APREFETCHSIZE * SIZE], 0 FADD3 c02, t2, c02 add BO, 8 * SIZE, BO FMUL a1, b2, t2 LDF [AO + 4 * SIZE], a1 FADD2 c03, t3, c03 cmp L, 0 FMUL a2, b1, t3 LDF [BO - 4 * SIZE], b1 FADD4 c04, t4, c04 nop FMUL a2, b2, t4 LDF [AO + 5 * SIZE], a2 FADD1 c01, t1, c01 nop FMUL a3, b3, t1 LDF [BO - 3 * SIZE], b2 FADD3 c02, t2, c02 
nop FMUL a3, b4, t2 LDF [AO + 6 * SIZE], a3 FADD2 c03, t3, c03 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD4 c04, t4, c04 nop FMUL a4, b4, t4 LDF [AO + 7 * SIZE], a4 FADD1 c01, t1, c01 nop FMUL a1, b1, t1 LDF [BO - 1 * SIZE], b4 FADD3 c02, t2, c02 FMUL a1, b2, t2 LDF [AO + 8 * SIZE], a1 FADD2 c03, t3, c03 FMUL a2, b1, t3 LDF [BO + 0 * SIZE], b1 FADD4 c04, t4, c04 FMUL a2, b2, t4 LDF [AO + 9 * SIZE], a2 FADD1 c01, t1, c01 FMUL a3, b3, t1 LDF [BO + 1 * SIZE], b2 FADD3 c02, t2, c02 FMUL a3, b4, t2 LDF [AO + 10 * SIZE], a3 FADD2 c03, t3, c03 FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD4 c04, t4, c04 FMUL a4, b4, t4 LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO bg,pt %icc, .LL152 LDF [BO + 3 * SIZE], b4 .LL155: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL159 nop .LL156: FADD1 c01, t1, c01 add AO, 2 * SIZE, AO FMUL a1, b1, t1 add BO, 2 * SIZE, BO FADD3 c02, t2, c02 add L, -1, L FMUL a1, b2, t2 LDF [AO + 0 * SIZE], a1 FADD2 c03, t3, c03 FMUL a2, b1, t3 LDF [BO + 0 * SIZE], b1 cmp L, 0 FADD4 c04, t4, c04 FMUL a2, b2, t4 LDF [BO + 1 * SIZE], b2 bg,pt %icc, .LL156 LDF [AO + 1 * SIZE], a2 .LL159: FADD1 c01, t1, c01 FADD3 c02, t2, c02 FADD2 c03, t3, c03 FADD4 c04, t4, c04 FADD c01, c04, c01 FADD c02, c03, c02 #if defined(LN) || defined(RT) sub KK, 1, TEMP1 sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FSUB a1, c01, c01 FSUB a2, c02, c02 #endif #ifdef LN LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FADD4 t1, t2, c01 FADD2 t3, t4, c02 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FADD4 t1, t2, c01 FADD2 t3, t4, c02 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FADD4 t1, t2, c01 FADD3 t3, t4, c02 #endif #ifdef RT LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FADD4 t1, t2, c01 FADD3 t3, t4, c02 #endif #ifdef LN add C1, -2 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 2 * SIZE, C1 #endif #ifdef RT sll K, 0 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 add AO, TEMP1, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .LL199: #ifdef LN sll K, 0 + ZBASE_SHIFT, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 1, KK #endif #ifdef RT sub KK, 1, KK #endif .LL100: sra N, 1, J cmp J, 0 ble,pn %icc, .LL999 nop .LL11: #ifdef RT sll K, 1 + ZBASE_SHIFT, TEMP1 sub B, TEMP1, B add LDC, LDC, TEMP1 sub C, TEMP1, C #endif FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 sra M, 1, I mov C, C1 add C, LDC, C2 #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif cmp I, 0 #ifndef RT add C2, LDC, C #endif ble,pn %icc, .LL50 FMOV FZERO, t4 .LL21: #if defined(LT) || defined(RN) 
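/* Descriptive note (derived from the surrounding code): .LL21 computes one
   2x2 complex tile of the ztrsm micro-kernel.  When LT or RN is defined,
   the accumulation loop covers the first KK steps of the K dimension with
   BO starting at B; the #else branch below instead advances AO and BO past
   the diagonal offset KK and loops over the remaining K - KK steps. */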
sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 1 + ZBASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 1 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 FMOV FZERO, c01 FMOV FZERO, c02 LDF [AO + 0 * SIZE], a1 FMOV FZERO, c03 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c04 LDF [AO + 1 * SIZE], a2 FMOV FZERO, c05 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c06 LDF [AO + 2 * SIZE], a3 FMOV FZERO, c07 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c08 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c09 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c10 LDF [BO + 4 * SIZE], b5 FMOV FZERO, c11 LDF [AO + 4 * SIZE], a5 FMOV FZERO, c12 prefetch [C1 + 3 * SIZE], 3 FMOV FZERO, c13 prefetch [C2 + 3 * SIZE], 3 FMOV FZERO, c14 FMOV FZERO, c15 ble,pn %icc, .LL25 FMOV FZERO, c16 .LL22: FADD2 c04, t1, c04 prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY FMUL a1, b1, t1 nop FADD4 c08, t2, c08 prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY FMUL a1, b2, t2 add AO, 16 * SIZE, AO FADD2 c12, t3, c12 LDF [AO - 13 * SIZE], a4 FMUL a1, b3, t3 add BO, 16 * SIZE, BO FADD4 c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO - 8 * SIZE], a1 FADD1 c01, t1, c01 nop FMUL a2, b1, t1 nop FADD3 c05, t2, c05 nop FMUL a2, b2, t2 nop FADD1 c09, t3, c09 nop FMUL a2, b3, t3 nop FADD3 c13, t4, c13 add L, -1, L FMUL a2, b4, t4 LDF [AO - 11 * SIZE], a2 FADD2 c02, t1, c02 nop FMUL a3, b1, t1 nop FADD4 c06, t2, c06 nop FMUL a3, b2, t2 nop FADD2 c10, t3, c10 nop FMUL a3, b3, t3 nop FADD4 c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 10 * SIZE], a3 FADD1 c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO - 8 * SIZE], b1 FADD3 c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 11 * SIZE], b2 FADD1 c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 10 * SIZE], b3 FADD3 c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 9 * SIZE], b4 FADD2 c04, t1, c04 nop FMUL a5, b5, t1 LDF [AO - 9 * SIZE], a4 FADD4 c08, t2, c08 nop FMUL a5, b2, t2 nop FADD2 c12, t3, c12 nop FMUL a5, b3, t3 nop FADD4 c16, t4, c16 nop FMUL a5, b4, t4 LDF [AO - 4 * SIZE], a5 FADD1 c01, t1, c01 nop FMUL a2, b5, t1 nop FADD3 c05, t2, c05 nop FMUL a2, b2, t2 nop FADD1 c09, t3, c09 nop FMUL a2, b3, t3 nop FADD3 c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO - 7 * SIZE], a2 FADD2 c02, t1, c02 nop FMUL a3, b5, t1 nop FADD4 c06, t2, c06 nop FMUL a3, b2, t2 nop FADD2 c10, t3, c10 nop FMUL a3, b3, t3 nop FADD4 c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 6 * SIZE], a3 FADD1 c03, t1, c03 nop FMUL a4, b5, t1 LDF [BO - 4 * SIZE], b5 FADD3 c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 7 * SIZE], b2 FADD1 c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 6 * SIZE], b3 FADD3 c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 5 * SIZE], b4 FADD2 c04, t1, c04 nop FMUL a1, b1, t1 LDF [AO - 5 * SIZE], a4 FADD4 c08, t2, c08 nop FMUL a1, b2, t2 nop FADD2 c12, t3, c12 nop FMUL a1, b3, t3 nop FADD4 c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO - 0 * SIZE], a1 FADD1 c01, t1, c01 nop FMUL a2, b1, t1 nop #ifdef DOUBLE prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY #else nop #endif FADD3 c05, t2, c05 nop FMUL a2, b2, t2 FADD1 c09, t3, c09 nop FMUL a2, b3, t3 nop FADD3 c13, t4, c13 nop FMUL a2, b4, t4 nop FADD2 c02, t1, c02 nop FMUL a3, b1, t1 LDF [AO - 3 * SIZE], a2 FADD4 c06, t2, c06 #ifdef DOUBLE prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY #else nop #endif FMUL a3, b2, t2 nop FADD2 c10, t3, c10 nop FMUL a3, b3, t3 nop FADD4 c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO - 2 * SIZE], a3 FADD1 c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO - 0 * SIZE], 
b1 FADD3 c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO - 3 * SIZE], b2 FADD1 c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO - 2 * SIZE], b3 FADD3 c15, t4, c15 nop FMUL a4, b4, t4 LDF [BO - 1 * SIZE], b4 FADD2 c04, t1, c04 nop FMUL a5, b5, t1 LDF [AO - 1 * SIZE], a4 FADD4 c08, t2, c08 FMUL a5, b2, t2 FADD2 c12, t3, c12 FMUL a5, b3, t3 FADD4 c16, t4, c16 nop FMUL a5, b4, t4 LDF [AO + 4 * SIZE], a5 FADD1 c01, t1, c01 nop FMUL a2, b5, t1 nop FADD3 c05, t2, c05 nop FMUL a2, b2, t2 nop FADD1 c09, t3, c09 nop FMUL a2, b3, t3 nop FADD3 c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD2 c02, t1, c02 nop FMUL a3, b5, t1 nop FADD4 c06, t2, c06 nop FMUL a3, b2, t2 nop FADD2 c10, t3, c10 nop FMUL a3, b3, t3 nop FADD4 c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD1 c03, t1, c03 cmp L, 0 FMUL a4, b5, t1 LDF [BO + 4 * SIZE], b5 FADD3 c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD1 c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD3 c15, t4, c15 FMUL a4, b4, t4 bg,pt %icc, .LL22 LDF [BO + 3 * SIZE], b4 .LL25: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,pn %icc, .LL29 nop .LL26: FADD2 c04, t1, c04 LDF [AO + 3 * SIZE], a4 FMUL a1, b1, t1 add AO, 4 * SIZE, AO FADD4 c08, t2, c08 add BO, 4 * SIZE, BO FMUL a1, b2, t2 add L, -1, L FADD2 c12, t3, c12 nop FMUL a1, b3, t3 cmp L, 0 FADD4 c16, t4, c16 nop FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD1 c01, t1, c01 nop FMUL a2, b1, t1 nop FADD3 c05, t2, c05 nop FMUL a2, b2, t2 nop FADD1 c09, t3, c09 nop FMUL a2, b3, t3 nop FADD3 c13, t4, c13 nop FMUL a2, b4, t4 LDF [AO + 1 * SIZE], a2 FADD2 c02, t1, c02 nop FMUL a3, b1, t1 nop FADD4 c06, t2, c06 nop FMUL a3, b2, t2 nop FADD2 c10, t3, c10 nop FMUL a3, b3, t3 nop FADD4 c14, t4, c14 nop FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD1 c03, t1, c03 nop FMUL a4, b1, t1 LDF [BO + 0 * SIZE], b1 FADD3 c07, t2, c07 nop FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD1 c11, t3, c11 nop FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD3 c15, t4, c15 FMUL a4, b4, t4 bg,pt %icc, .LL26 LDF [BO + 3 * SIZE], b4 .LL29: #if defined(LN) || defined(RT) sub KK, 2, TEMP1 sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO #endif FADD2 c04, t1, c04 FADD4 c08, t2, c08 FADD2 c12, t3, c12 FADD4 c16, t4, c16 FADD c01, c06, c01 FADD c02, c05, c02 FADD c03, c08, c03 FADD c04, c07, c04 FADD c09, c14, c09 FADD c10, c13, c10 FADD c11, c16, c11 FADD c12, c15, c12 #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c09, c09 FSUB a4, c10, c10 FSUB b1, c03, c03 FSUB b2, c04, c04 FSUB b3, c11, c11 FSUB b4, c12, c12 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c09, c09 FSUB b2, c10, c10 FSUB b3, c11, c11 FSUB b4, c12, c12 #endif #ifdef LN LDF [AO + 6 * SIZE], a1 LDF [AO + 7 * SIZE], a2 LDF [AO + 4 * SIZE], a3 LDF [AO + 5 * SIZE], a4 LDF [AO + 0 * SIZE], b1 LDF [AO + 1 * SIZE], b2 FMUL a1, c03, t1 FMUL a2, c04, t2 FMUL a1, c04, t3 FMUL a2, c03, t4 FMUL a1, c11, t5 FMUL a2, c12, t6 FMUL a1, c12, t7 FMUL a2, c11, t8 FADD4 t1, t2, c03 FADD2 t3, t4, c04 FADD4 t5, t6, c11 FADD2 t7, t8, c12 FMUL a3, c03, t1 FMUL a3, c04, t2 FMUL a3, 
c11, t3 FMUL a3, c12, t4 FMUL a4, c04, t5 FMUL a4, c03, t6 FMUL a4, c12, t7 FMUL a4, c11, t8 FSUB c01, t1, c01 FSUB c02, t2, c02 FSUB c09, t3, c09 FSUB c10, t4, c10 FADD2 c01, t5, c01 FADD4 c02, t6, c02 FADD2 c09, t7, c09 FADD4 c10, t8, c10 FMUL b1, c01, t1 FMUL b2, c02, t2 FMUL b1, c02, t3 FMUL b2, c01, t4 FMUL b1, c09, t5 FMUL b2, c10, t6 FMUL b1, c10, t7 FMUL b2, c09, t8 FADD4 t1, t2, c01 FADD2 t3, t4, c02 FADD4 t5, t6, c09 FADD2 t7, t8, c10 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 6 * SIZE], b1 LDF [AO + 7 * SIZE], b2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FMUL a1, c09, t5 FMUL a2, c10, t6 FMUL a1, c10, t7 FMUL a2, c09, t8 FADD4 t1, t2, c01 FADD2 t3, t4, c02 FADD4 t5, t6, c09 FADD2 t7, t8, c10 FMUL a3, c01, t1 FMUL a3, c02, t2 FMUL a3, c09, t3 FMUL a3, c10, t4 FMUL a4, c02, t5 FMUL a4, c01, t6 FMUL a4, c10, t7 FMUL a4, c09, t8 FSUB c03, t1, c03 FSUB c04, t2, c04 FSUB c11, t3, c11 FSUB c12, t4, c12 FADD2 c03, t5, c03 FADD4 c04, t6, c04 FADD2 c11, t7, c11 FADD4 c12, t8, c12 FMUL b1, c03, t1 FMUL b2, c04, t2 FMUL b1, c04, t3 FMUL b2, c03, t4 FMUL b1, c11, t5 FMUL b2, c12, t6 FMUL b1, c12, t7 FMUL b2, c11, t8 FADD4 t1, t2, c03 FADD2 t3, t4, c04 FADD4 t5, t6, c11 FADD2 t7, t8, c12 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 6 * SIZE], b1 LDF [BO + 7 * SIZE], b2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FMUL a1, c03, t5 FMUL a2, c04, t6 FMUL a1, c04, t7 FMUL a2, c03, t8 FADD4 t1, t2, c01 FADD3 t3, t4, c02 FADD4 t5, t6, c03 FADD3 t7, t8, c04 FMUL a3, c01, t1 FMUL a3, c02, t2 FMUL a3, c03, t3 FMUL a3, c04, t4 FMUL a4, c02, t5 FMUL a4, c01, t6 FMUL a4, c04, t7 FMUL a4, c03, t8 FSUB c09, t1, c09 FSUB c10, t2, c10 FSUB c11, t3, c11 FSUB c12, t4, c12 FADD3 c09, t5, c09 FADD4 c10, t6, c10 FADD3 c11, t7, c11 FADD4 c12, t8, c12 FMUL b1, c09, t1 FMUL b2, c10, t2 FMUL b1, c10, t3 FMUL b2, c09, t4 FMUL b1, c11, t5 FMUL b2, c12, t6 FMUL b1, c12, t7 FMUL b2, c11, t8 FADD4 t1, t2, c09 FADD3 t3, t4, c10 FADD4 t5, t6, c11 FADD3 t7, t8, c12 #endif #ifdef RT LDF [BO + 6 * SIZE], a1 LDF [BO + 7 * SIZE], a2 LDF [BO + 4 * SIZE], a3 LDF [BO + 5 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FMUL a1, c09, t1 FMUL a2, c10, t2 FMUL a1, c10, t3 FMUL a2, c09, t4 FMUL a1, c11, t5 FMUL a2, c12, t6 FMUL a1, c12, t7 FMUL a2, c11, t8 FADD4 t1, t2, c09 FADD3 t3, t4, c10 FADD4 t5, t6, c11 FADD3 t7, t8, c12 FMUL a3, c09, t1 FMUL a3, c10, t2 FMUL a3, c11, t3 FMUL a3, c12, t4 FMUL a4, c10, t5 FMUL a4, c09, t6 FMUL a4, c12, t7 FMUL a4, c11, t8 FSUB c01, t1, c01 FSUB c02, t2, c02 FSUB c03, t3, c03 FSUB c04, t4, c04 FADD3 c01, t5, c01 FADD4 c02, t6, c02 FADD3 c03, t7, c03 FADD4 c04, t8, c04 FMUL b1, c01, t1 FMUL b2, c02, t2 FMUL b1, c02, t3 FMUL b2, c01, t4 FMUL b1, c03, t5 FMUL b2, c04, t6 FMUL b1, c04, t7 FMUL b2, c03, t8 FADD4 t1, t2, c01 FADD3 t3, t4, c02 FADD4 t5, t6, c03 FADD3 t7, t8, c04 #endif #ifdef LN add C1, -4 * SIZE, C1 add C2, -4 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c09, [BO + 2 * SIZE] STF c10, [BO + 3 * SIZE] STF c03, [BO + 4 * SIZE] STF c04, [BO + 5 * SIZE] STF c11, [BO + 6 * SIZE] STF c12, [BO + 7 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] STF c09, [AO + 4 * SIZE] STF c10, [AO + 5 * SIZE] STF c11, [AO + 6 * SIZE] STF c12, [AO + 7 * SIZE] #endif STF c01, 
[C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c03, [C1 + 2 * SIZE] STF c04, [C1 + 3 * SIZE] STF c09, [C2 + 0 * SIZE] STF c10, [C2 + 1 * SIZE] STF c11, [C2 + 2 * SIZE] STF c12, [C2 + 3 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 4 * SIZE, C1 add C2, 4 * SIZE, C2 #endif #ifdef RT sll K, 1 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 add AO, TEMP1, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 2, KK #endif #ifdef LN sub KK, 2, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL21 FMOV FZERO, c01 .LL50: and M, 1, I FMOV FZERO, c02 cmp I, 0 FMOV FZERO, t1 ble,pn %icc, .LL99 FMOV FZERO, c04 #if defined(LT) || defined(RN) sra KK, 2, L mov B, BO cmp L, 0 #else #ifdef LN sll K, 0 + ZBASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, 0 + ZBASE_SHIFT, TEMP1 sll KK, 1 + ZBASE_SHIFT, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO sub K, KK, TEMP1 sra TEMP1, 2, L cmp L, 0 #endif LDF [AO + 0 * SIZE], a1 FMOV FZERO, t2 LDF [BO + 0 * SIZE], b1 FMOV FZERO, c06 LDF [AO + 1 * SIZE], a2 FMOV FZERO, t3 LDF [BO + 1 * SIZE], b2 FMOV FZERO, c08 LDF [AO + 2 * SIZE], a3 FMOV FZERO, t4 LDF [BO + 2 * SIZE], b3 FMOV FZERO, c01 LDF [AO + 3 * SIZE], a4 FMOV FZERO, c03 LDF [BO + 3 * SIZE], b4 FMOV FZERO, c05 ble,pn %icc, .LL55 FMOV FZERO, c07 .LL52: FADD2 c02, t1, c02 add AO, 8 * SIZE, AO prefetch [AO + APREFETCHSIZE * SIZE], 0 FMUL a1, b1, t1 add BO, 16 * SIZE, BO FADD4 c04, t2, c04 add L, -1, L FMUL a1, b2, t2 FADD2 c06, t3, c06 cmp L, 0 FMUL a1, b3, t3 FADD4 c08, t4, c08 FMUL a1, b4, t4 LDF [AO - 4 * SIZE], a1 FADD1 c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 12 * SIZE], b1 FADD3 c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 11 * SIZE], b2 FADD1 c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 10 * SIZE], b3 FADD3 c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 9 * SIZE], b4 FADD2 c02, t1, c02 FMUL a3, b1, t1 LDF [AO - 3 * SIZE], a2 FADD4 c04, t2, c04 FMUL a3, b2, t2 FADD2 c06, t3, c06 FMUL a3, b3, t3 FADD4 c08, t4, c08 FMUL a3, b4, t4 LDF [AO - 2 * SIZE], a3 FADD1 c01, t1, c01 FMUL a4, b1, t1 LDF [BO - 8 * SIZE], b1 FADD3 c03, t2, c03 FMUL a4, b2, t2 LDF [BO - 7 * SIZE], b2 FADD1 c05, t3, c05 FMUL a4, b3, t3 LDF [BO - 6 * SIZE], b3 FADD3 c07, t4, c07 FMUL a4, b4, t4 LDF [BO - 5 * SIZE], b4 FADD2 c02, t1, c02 FMUL a1, b1, t1 LDF [AO - 1 * SIZE], a4 FADD4 c04, t2, c04 FMUL a1, b2, t2 FADD2 c06, t3, c06 FMUL a1, b3, t3 FADD4 c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD1 c01, t1, c01 FMUL a2, b1, t1 LDF [BO - 4 * SIZE], b1 FADD3 c03, t2, c03 FMUL a2, b2, t2 LDF [BO - 3 * SIZE], b2 FADD1 c05, t3, c05 FMUL a2, b3, t3 LDF [BO - 2 * SIZE], b3 FADD3 c07, t4, c07 FMUL a2, b4, t4 LDF [BO - 1 * SIZE], b4 FADD2 c02, t1, c02 FMUL a3, b1, t1 LDF [AO + 1 * SIZE], a2 FADD4 c04, t2, c04 FMUL a3, b2, t2 FADD2 c06, t3, c06 FMUL a3, b3, t3 FADD4 c08, t4, c08 FMUL a3, b4, t4 LDF [AO + 2 * SIZE], a3 FADD1 c01, t1, c01 FMUL a4, b1, t1 LDF [BO + 0 * SIZE], b1 FADD3 c03, t2, c03 FMUL a4, b2, t2 LDF [BO + 1 * SIZE], b2 FADD1 c05, t3, c05 FMUL a4, b3, t3 LDF [BO + 2 * SIZE], b3 FADD3 c07, t4, c07 FMUL a4, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL52 LDF [AO + 3 * SIZE], a4 .LL55: #if defined(LT) || defined(RN) and KK, 3, L #else and TEMP1, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL59 nop .LL56: FADD2 c02, t1, c02 add AO, 2 * SIZE, AO FMUL a1, b1, t1 add L, -1, L add BO, 4 * SIZE, BO FADD4 c04, t2, c04 cmp L, 0 FMUL a1, b2, t2 FADD2 c06, t3, c06 FMUL a1, b3, t3 FADD4 c08, t4, c08 FMUL a1, b4, t4 LDF [AO + 0 * SIZE], a1 FADD1 c01, t1, 
c01 FMUL a2, b1, t1 LDF [BO + 0 * SIZE], b1 FADD3 c03, t2, c03 FMUL a2, b2, t2 LDF [BO + 1 * SIZE], b2 FADD1 c05, t3, c05 FMUL a2, b3, t3 LDF [BO + 2 * SIZE], b3 FADD3 c07, t4, c07 FMUL a2, b4, t4 LDF [BO + 3 * SIZE], b4 bg,pt %icc, .LL56 LDF [AO + 1 * SIZE], a2 .LL59: #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 2, TEMP1 #endif sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif FADD2 c02, t1, c02 FADD4 c04, t2, c04 FADD2 c06, t3, c06 FADD4 c08, t4, c08 FADD c01, c04, c01 FADD c02, c03, c02 FADD c05, c08, c05 FADD c06, c07, c06 #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c05, c05 FSUB a4, c06, c06 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c05, c05 FSUB a4, c06, c06 #endif #ifdef LN LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FMUL a1, c05, t5 FMUL a2, c06, t6 FMUL a1, c06, t7 FMUL a2, c05, t8 FADD4 t1, t2, c01 FADD2 t3, t4, c02 FADD4 t5, t6, c05 FADD2 t7, t8, c06 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FMUL a1, c05, t5 FMUL a2, c06, t6 FMUL a1, c06, t7 FMUL a2, c05, t8 FADD4 t1, t2, c01 FADD2 t3, t4, c02 FADD4 t5, t6, c05 FADD2 t7, t8, c06 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 6 * SIZE], b1 LDF [BO + 7 * SIZE], b2 FMUL a1, c01, t1 FMUL a2, c02, t2 FMUL a1, c02, t3 FMUL a2, c01, t4 FADD4 t1, t2, c01 FADD3 t3, t4, c02 FMUL a3, c01, t1 FMUL a3, c02, t2 FMUL a4, c02, t3 FMUL a4, c01, t4 FSUB c05, t1, c05 FSUB c06, t2, c06 FADD3 c05, t3, c05 FADD4 c06, t4, c06 FMUL b1, c05, t1 FMUL b2, c06, t2 FMUL b1, c06, t3 FMUL b2, c05, t4 FADD4 t1, t2, c05 FADD3 t3, t4, c06 #endif #ifdef RT LDF [BO + 6 * SIZE], a1 LDF [BO + 7 * SIZE], a2 LDF [BO + 4 * SIZE], a3 LDF [BO + 5 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FMUL a1, c05, t1 FMUL a2, c06, t2 FMUL a1, c06, t3 FMUL a2, c05, t4 FADD4 t1, t2, c05 FADD3 t3, t4, c06 FMUL a3, c05, t1 FMUL a3, c06, t2 FMUL a4, c06, t3 FMUL a4, c05, t4 FSUB c01, t1, c01 FSUB c02, t2, c02 FADD3 c01, t3, c01 FADD4 c02, t4, c02 FMUL b1, c01, t1 FMUL b2, c02, t2 FMUL b1, c02, t3 FMUL b2, c01, t4 FADD4 t1, t2, c01 FADD3 t3, t4, c02 #endif #ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c06, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c05, [AO + 2 * SIZE] STF c06, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c05, [C2 + 0 * SIZE] STF c06, [C2 + 1 * SIZE] FMOV FZERO, t1 FMOV FZERO, t2 FMOV FZERO, t3 FMOV FZERO, t4 #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 #endif #ifdef RT sll K, 0 + ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif .LL99: #ifdef LN sll K, 1 + ZBASE_SHIFT, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 2, KK #endif #ifdef RT sub KK, 
2, KK #endif add J, -1, J cmp J, 0 bg,pt %icc, .LL11 nop .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/sparc/ztrsm_kernel_RT_1x4.S000066400000000000000000001207641313527062700215120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2005-2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define APREFETCHSIZE 24 #define APREFETCH_CATEGORY 0 #define M %i0 #define N %i1 #define K %i2 #define A %i5 #define B %i3 #define C %i4 #define LDC %o0 #define AO %o1 #define BO %o2 #define I %o3 #define J %o4 #define L %o5 #define C1 %l0 #define C2 %l1 #define C3 %l2 #define C4 %l3 #define OFFSET %l4 #define KK %l5 #define TEMP1 %l6 #define TEMP2 %l7 #define AORIG %o7 #ifdef DOUBLE #define c01 %f0 #define c02 %f2 #define c03 %f4 #define c04 %f6 #define c05 %f8 #define c06 %f10 #define c07 %f12 #define c08 %f14 #define c09 %f16 #define c10 %f18 #define c11 %f20 #define c12 %f22 #define c13 %f24 #define c14 %f26 #define c15 %f28 #define c16 %f30 #define a1 %f32 #define a2 %f34 #define a3 %f36 #define a4 %f38 #define a5 %f40 #define b1 %f42 #define b2 %f44 #define b3 %f46 #define b4 %f48 #define b5 %f50 #define b6 %f52 #define b7 %f54 #define b8 %f56 #define b9 %f58 #define cc01 0 #define cc02 2 #define cc03 4 #define cc04 6 #define cc05 8 #define cc06 10 #define cc07 12 #define cc08 14 #define cc09 16 #define cc10 18 #define cc11 20 #define cc12 22 #define cc13 24 #define cc14 26 #define cc15 28 #define cc16 30 #define aa1 1 #define aa2 3 #define aa3 5 #define aa4 7 #define aa5 9 #define bb1 11 #define bb2 13 #define bb3 15 #define bb4 17 #define bb5 19 #define bb6 21 #define bb7 23 #define bb8 25 #define bb9 27 #else #define c01 %f0 #define c02 %f1 #define c03 %f2 #define c04 %f3 #define c05 %f4 #define c06 %f5 #define c07 %f6 #define c08 %f7 #define c09 %f8 #define c10 %f9 #define c11 %f10 #define c12 %f11 #define c13 %f12 #define c14 %f13 #define c15 %f14 #define c16 %f15 #define a1 %f16 #define a2 %f17 #define a3 %f18 #define a4 %f19 #define a5 %f20 #define b1 %f21 #define b2 %f22 #define b3 %f23 #define b4 %f24 #define b5 %f25 #define b6 %f26 #define b7 %f27 #define b8 %f28 #define b9 %f29 #define cc01 0 #define cc02 1 #define cc03 2 #define cc04 3 #define cc05 4 #define cc06 5 #define cc07 6 #define cc08 7 #define cc09 8 #define cc10 9 #define cc11 10 #define cc12 11 #define cc13 12 #define cc14 13 #define cc15 14 #define cc16 15 #define aa1 16 #define aa2 17 #define aa3 18 #define aa4 19 #define aa5 20 #define bb1 21 #define bb2 22 #define bb3 23 #define bb4 24 #define bb5 25 #define bb6 26 #define bb7 27 #define bb8 28 #define bb9 29 #endif #ifndef CONJ #define FMADD1 FMADD #define FMADD2 FMADD #define FMADD3 FMADD #define FMADD4 FNMSUB #else #if defined(LN) || defined(LT) #define FMADD1 FMADD #define FMADD2 FNMSUB #define FMADD3 FMADD #define FMADD4 FMADD #endif #if defined(RN) || defined(RT) #define FMADD1 FMADD #define FMADD2 FMADD #define FMADD3 FNMSUB #define FMADD4 FMADD #endif #endif .register %g2, #scratch .register %g3, #scratch PROLOGUE SAVESP #ifndef __64BIT__ #ifdef DOUBLE ld [%sp + STACK_START + 32], A ld [%sp + STACK_START + 36], B ld [%sp + STACK_START + 40], C ld [%sp + STACK_START + 44], LDC ld [%sp + STACK_START + 48], OFFSET #else ld [%sp + STACK_START + 28], B ld [%sp + STACK_START + 32], C ld [%sp + STACK_START + 36], LDC ld [%sp + STACK_START + 40], OFFSET #endif #else ldx [%sp + STACK_START + 56], B ldx [%sp + STACK_START + 64], C ldx [%sp + STACK_START + 72], LDC ldx [%sp + STACK_START + 80], OFFSET #endif cmp M, 0 ble,pn %icc, .LL999 nop sll LDC, ZBASE_SHIFT, LDC #ifdef LN smul M, K, TEMP1 sll TEMP1, ZBASE_SHIFT, TEMP1 add A, TEMP1, A sll M, ZBASE_SHIFT, TEMP1 add C, TEMP1, C #endif #ifdef RN neg OFFSET, KK #endif #ifdef RT smul N, K, TEMP1 sll 
TEMP1, ZBASE_SHIFT, TEMP1 add B, TEMP1, B smul N, LDC, TEMP1 add C, TEMP1, C sub N, OFFSET, KK #endif and N, 1, J cmp J, 0 ble,pn %icc, .LL20 nop #ifdef RT sll K, ZBASE_SHIFT, TEMP1 sub B, TEMP1, B #endif #ifndef RT mov C, C1 add C, LDC, C #else sub C, LDC, C1 sub C, LDC, C #endif #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif mov M, I .align 4 .LL32: #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, ZBASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, ZBASE_SHIFT + 0, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 FCLR (cc01) LDF [BO + 3 * SIZE], b4 FCLR (cc02) LDF [BO + 4 * SIZE], b5 FCLR (cc03) LDF [BO + 5 * SIZE], b6 FCLR (cc04) LDF [BO + 6 * SIZE], b7 FCLR (cc05) LDF [BO + 7 * SIZE], b8 FCLR (cc06) prefetch [C1 + 2 * SIZE], 3 FCLR (cc07) #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL35 FCLR (cc08) .align 4 .LL33: FMADD1 (aa1, bb1, cc01, cc01) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD2 (aa2, bb1, cc02, cc02) LDF [BO + 8 * SIZE], b1 FMADD3 (aa1, bb2, cc03, cc03) LDF [AO + 4 * SIZE], a1 FMADD4 (aa2, bb2, cc04, cc04) LDF [AO + 5 * SIZE], a2 FMADD1 (aa3, bb3, cc01, cc01) LDF [BO + 9 * SIZE], b2 FMADD2 (aa4, bb3, cc02, cc02) LDF [BO + 10 * SIZE], b3 FMADD3 (aa3, bb4, cc03, cc03) LDF [AO + 6 * SIZE], a3 FMADD4 (aa4, bb4, cc04, cc04) LDF [AO + 7 * SIZE], a4 FMADD1 (aa1, bb5, cc01, cc01) LDF [BO + 11 * SIZE], b4 FMADD2 (aa2, bb5, cc02, cc02) LDF [BO + 12 * SIZE], b5 FMADD3 (aa1, bb6, cc03, cc03) LDF [AO + 8 * SIZE], a1 FMADD4 (aa2, bb6, cc04, cc04) LDF [AO + 9 * SIZE], a2 FMADD1 (aa3, bb7, cc01, cc01) LDF [BO + 13 * SIZE], b6 FMADD2 (aa4, bb7, cc02, cc02) LDF [BO + 14 * SIZE], b7 FMADD3 (aa3, bb8, cc03, cc03) LDF [AO + 10 * SIZE], a3 FMADD4 (aa4, bb8, cc04, cc04) LDF [AO + 11 * SIZE], a4 add AO, 8 * SIZE, AO add L, -1, L add BO, 8 * SIZE, BO cmp L, 0 bg,pt %icc, .LL33 LDF [BO + 7 * SIZE], b8 .align 4 .LL35: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL38 nop .align 4 .LL37: FMADD1 (aa1, bb1, cc01, cc01) add L, -1, L FMADD2 (aa2, bb1, cc02, cc02) LDF [BO + 2 * SIZE], b1 FMADD3 (aa1, bb2, cc03, cc03) LDF [AO + 2 * SIZE], a1 FMADD4 (aa2, bb2, cc04, cc04) LDF [AO + 3 * SIZE], a2 add AO, 2 * SIZE, AO cmp L, 0 add BO, 2 * SIZE, BO bg,pt %icc, .LL37 LDF [BO + 1 * SIZE], b2 .align 4 .LL38: FADD c01, c04, c01 FADD c02, c03, c02 #if defined(LN) || defined(RT) sub KK, 1, TEMP1 sll TEMP1, ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 #endif FSUB a1, c01, c01 FSUB a2, c02, c02 #if defined(LN) || defined(LT) LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 #else LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 #endif FMUL a1, c01, b1 FMUL a2, c01, b2 #ifndef CONJ FNMSUB (aa2, cc02, bb1, cc01) FMADD (aa1, cc02, bb2, cc02) #else FMADD (aa2, cc02, bb1, cc01) FMSUB (aa1, cc02, bb2, cc02) #endif #ifdef LN add C1, -2 * SIZE, C1 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] #ifndef LN add C1, 2 
* SIZE, C1 #endif #ifdef RT sll K, ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, ZBASE_SHIFT, TEMP1 add AO, TEMP1, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL32 nop #ifdef LN sll K, ZBASE_SHIFT, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 1, KK #endif #ifdef RT sub KK, 1, KK #endif .align 4 .LL20: and N, 2, J cmp J, 0 ble,pn %icc, .LL30 nop #ifdef RT sll K, ZBASE_SHIFT + 1, TEMP1 sub B, TEMP1, B #endif #ifndef RT mov C, C1 add C, LDC, C2 add C2, LDC, C #else sub C, LDC, C2 sub C2, LDC, C1 sub C2, LDC, C #endif #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif mov M, I .align 4 .LL22: #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, ZBASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, ZBASE_SHIFT + 0, TEMP1 sll KK, ZBASE_SHIFT + 1, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 LDF [BO + 3 * SIZE], b4 LDF [BO + 4 * SIZE], b5 FCLR (cc01) LDF [BO + 5 * SIZE], b6 FCLR (cc02) LDF [BO + 6 * SIZE], b7 FCLR (cc03) LDF [BO + 7 * SIZE], b8 FCLR (cc04) LDF [BO + 8 * SIZE], b9 FCLR (cc05) prefetch [C1 + 2 * SIZE], 3 FCLR (cc06) prefetch [C2 + 2 * SIZE], 3 FCLR (cc07) #if defined(LT) || defined(RN) sra KK, 2, L #else sub K, KK, L sra L, 2, L #endif cmp L, 0 ble,pn %icc, .LL25 FCLR (cc08) .align 4 .LL23: FMADD1 (aa1, bb1, cc01, cc01) LDF [AO + 2 * SIZE], a3 FMADD2 (aa2, bb1, cc02, cc02) LDF [AO + 3 * SIZE], a4 FMADD3 (aa1, bb2, cc03, cc03) LDF [BO + 16 * SIZE], b1 FMADD4 (aa2, bb2, cc04, cc04) LDF [BO + 9 * SIZE], b2 FMADD1 (aa1, bb3, cc05, cc05) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD2 (aa2, bb3, cc06, cc06) add L, -1, L FMADD3 (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD1 (aa3, bb5, cc01, cc01) LDF [AO + 4 * SIZE], a1 FMADD2 (aa4, bb5, cc02, cc02) LDF [AO + 5 * SIZE], a2 FMADD3 (aa3, bb6, cc03, cc03) LDF [BO + 12 * SIZE], b5 FMADD4 (aa4, bb6, cc04, cc04) LDF [BO + 13 * SIZE], b6 FMADD1 (aa3, bb7, cc05, cc05) cmp L, 0 FMADD2 (aa4, bb7, cc06, cc06) add AO, 8 * SIZE, AO FMADD3 (aa3, bb8, cc07, cc07) LDF [BO + 14 * SIZE], b7 FMADD4 (aa4, bb8, cc08, cc08) LDF [BO + 15 * SIZE], b8 FMADD1 (aa1, bb9, cc01, cc01) LDF [AO - 2 * SIZE], a3 FMADD2 (aa2, bb9, cc02, cc02) LDF [AO - 1 * SIZE], a4 FMADD3 (aa1, bb2, cc03, cc03) LDF [BO + 24 * SIZE], b9 FMADD4 (aa2, bb2, cc04, cc04) LDF [BO + 17 * SIZE], b2 FMADD1 (aa1, bb3, cc05, cc05) add BO, 16 * SIZE, BO FMADD2 (aa2, bb3, cc06, cc06) nop FMADD3 (aa1, bb4, cc07, cc07) LDF [BO + 2 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD1 (aa3, bb5, cc01, cc01) LDF [AO + 0 * SIZE], a1 FMADD2 (aa4, bb5, cc02, cc02) LDF [AO + 1 * SIZE], a2 FMADD3 (aa3, bb6, cc03, cc03) LDF [BO + 4 * SIZE], b5 FMADD4 (aa4, bb6, cc04, cc04) LDF [BO + 5 * SIZE], b6 FMADD1 (aa3, bb7, cc05, cc05) nop FMADD2 (aa4, bb7, cc06, cc06) LDF [BO + 6 * SIZE], b7 FMADD3 (aa3, bb8, cc07, cc07) FMADD4 (aa4, bb8, cc08, cc08) bg,pt %icc, .LL23 LDF [BO + 7 * SIZE], b8 .align 4 .LL25: #if defined(LT) || defined(RN) and KK, 3, L #else sub K, KK, L and L, 3, L #endif cmp L, 0 ble,a,pn %icc, .LL28 nop .align 4 .LL27: FMADD1 (aa1, bb1, cc01, cc01) add L, -1, L FMADD2 (aa2, bb1, cc02, 
cc02) LDF [BO + 4 * SIZE], b1 FMADD3 (aa1, bb2, cc03, cc03) add AO, 2 * SIZE, AO FMADD4 (aa2, bb2, cc04, cc04) LDF [BO + 5 * SIZE], b2 FMADD1 (aa1, bb3, cc05, cc05) cmp L, 0 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 6 * SIZE], b3 FMADD3 (aa1, bb4, cc07, cc07) LDF [AO + 0 * SIZE], a1 FMADD4 (aa2, bb4, cc08, cc08) LDF [AO + 1 * SIZE], a2 LDF [BO + 7 * SIZE], b4 bg,pt %icc, .LL27 add BO, 4 * SIZE, BO .align 4 .LL28: FADD c01, c04, c01 FADD c02, c03, c02 FADD c05, c08, c05 FADD c06, c07, c06 #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 2, TEMP1 #endif sll TEMP1, ZBASE_SHIFT + 0, TEMP2 sll TEMP1, ZBASE_SHIFT + 1, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 #endif FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c05, c05 FSUB a4, c06, c06 #if defined(LN) || defined(LT) LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FMUL a1, c01, b1 FMUL a2, c01, b2 FMUL a1, c05, b3 FMUL a2, c05, b4 #ifndef CONJ FNMSUB (aa2, cc02, bb1, cc01) FMADD (aa1, cc02, bb2, cc02) FNMSUB (aa2, cc06, bb3, cc05) FMADD (aa1, cc06, bb4, cc06) #else FMADD (aa2, cc02, bb1, cc01) FMSUB (aa1, cc02, bb2, cc02) FMADD (aa2, cc06, bb3, cc05) FMSUB (aa1, cc06, bb4, cc06) #endif #endif #ifdef RN LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 LDF [BO + 3 * SIZE], b4 FMUL b1, c01, a1 FMUL b2, c01, a2 #ifndef CONJ FNMSUB (bb2, cc02, aa1, cc01) FMADD (bb1, cc02, aa2, cc02) #else FMADD (bb2, cc02, aa1, cc01) FMSUB (bb1, cc02, aa2, cc02) #endif FNMSUB (bb3, cc01, cc05, cc05) FNMSUB (bb3, cc02, cc06, cc06) #ifndef CONJ FMADD (bb4, cc02, cc05, cc05) FNMSUB (bb4, cc01, cc06, cc06) #else FNMSUB (bb4, cc02, cc05, cc05) FMADD (bb4, cc01, cc06, cc06) #endif LDF [BO + 6 * SIZE], b1 LDF [BO + 7 * SIZE], b2 FMUL b1, c05, a1 FMUL b2, c05, a2 #ifndef CONJ FNMSUB (bb2, cc06, aa1, cc05) FMADD (bb1, cc06, aa2, cc06) #else FMADD (bb2, cc06, aa1, cc05) FMSUB (bb1, cc06, aa2, cc06) #endif #endif #ifdef RT LDF [BO + 6 * SIZE], b1 LDF [BO + 7 * SIZE], b2 LDF [BO + 4 * SIZE], b3 LDF [BO + 5 * SIZE], b4 FMUL b1, c05, a1 FMUL b2, c05, a2 #ifndef CONJ FNMSUB (bb2, cc06, aa1, cc05) FMADD (bb1, cc06, aa2, cc06) #else FMADD (bb2, cc06, aa1, cc05) FMSUB (bb1, cc06, aa2, cc06) #endif FNMSUB (bb3, cc05, cc01, cc01) FNMSUB (bb3, cc06, cc02, cc02) #ifndef CONJ FMADD (bb4, cc06, cc01, cc01) FNMSUB (bb4, cc05, cc02, cc02) #else FNMSUB (bb4, cc06, cc01, cc01) FMADD (bb4, cc05, cc02, cc02) #endif LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FMUL b1, c01, a1 FMUL b2, c01, a2 #ifndef CONJ FNMSUB (bb2, cc02, aa1, cc01) FMADD (bb1, cc02, aa2, cc02) #else FMADD (bb2, cc02, aa1, cc01) FMSUB (bb1, cc02, aa2, cc02) #endif #endif #ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c06, [BO + 3 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c05, [AO + 2 * SIZE] STF c06, [AO + 3 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c05, [C2 + 0 * SIZE] STF c06, [C2 + 1 * SIZE] #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 #endif #ifdef RT sll K, ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, ZBASE_SHIFT + 0, TEMP2 sll TEMP1, ZBASE_SHIFT + 1, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif 
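/* KK appears to be the running diagonal offset of the triangular solve for
   this 1x2 block: the LT variant advances it and the LN variant rewinds it
   by the M-unroll (one complex element) after every pass through this M loop,
   while the RN/RT variants instead adjust KK by the N-unroll (two columns)
   only after the whole two-column block has been processed, just before .LL30. */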
#ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL22 nop #ifdef LN sll K, ZBASE_SHIFT + 1, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 2, KK #endif #ifdef RT sub KK, 2, KK #endif .align 4 .LL30: sra N, 2, J cmp J, 0 ble,pn %icc, .LL999 nop .align 4 .LL11: #ifdef RT sll K, ZBASE_SHIFT + 2, TEMP1 sub B, TEMP1, B #endif #ifndef RT mov C, C1 add C, LDC, C2 add C2, LDC, C3 add C3, LDC, C4 add C4, LDC, C #else sub C, LDC, C4 sub C4, LDC, C3 sub C3, LDC, C2 sub C2, LDC, C1 sub C2, LDC, C #endif #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif mov M, I .align 4 .LL12: #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, ZBASE_SHIFT, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, ZBASE_SHIFT + 0, TEMP1 sll KK, ZBASE_SHIFT + 2, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 FCLR (cc01) LDF [AO + 1 * SIZE], a2 FCLR (cc05) LDF [AO + 8 * SIZE], a5 FCLR (cc09) LDF [BO + 0 * SIZE], b1 FCLR (cc13) LDF [BO + 1 * SIZE], b2 FCLR (cc02) LDF [BO + 2 * SIZE], b3 FCLR (cc06) LDF [BO + 3 * SIZE], b4 FCLR (cc10) LDF [BO + 4 * SIZE], b5 FCLR (cc14) LDF [BO + 5 * SIZE], b6 FCLR (cc03) LDF [BO + 6 * SIZE], b7 FCLR (cc07) LDF [BO + 7 * SIZE], b8 FCLR (cc11) LDF [BO + 8 * SIZE], b9 FCLR (cc15) prefetch [C1 + 1 * SIZE], 3 FCLR (cc04) prefetch [C2 + 2 * SIZE], 3 FCLR (cc08) prefetch [C3 + 1 * SIZE], 3 FCLR (cc12) prefetch [C4 + 2 * SIZE], 3 FCLR (cc16) #if defined(LT) || defined(RN) sra KK, 3, L #else sub K, KK, L sra L, 3, L #endif cmp L, 0 ble,pn %icc, .LL15 nop .align 4 .LL13: FMADD1 (aa1, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa1, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa1, bb3, cc05, cc05) LDF [BO + 16 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 9 * SIZE], b2 FMADD3 (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD1 (aa1, bb5, cc09, cc09) LDF [AO + 2 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 3 * SIZE], a4 FMADD3 (aa1, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD4 (aa2, bb6, cc12, cc12) nop FMADD1 (aa1, bb7, cc13, cc13) LDF [BO + 12 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO + 13 * SIZE], b6 FMADD3 (aa1, bb8, cc15, cc15) LDF [BO + 14 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO + 15 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 24 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 17 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 18 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 19 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 4 * SIZE], a1 FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 5 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) add L, -1, L FMADD4 (aa4, bb6, cc12, cc12) nop FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 20 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 21 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 22 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) LDF [BO + 23 * SIZE], b8 FMADD1 (aa1, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa1, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa1, bb3, cc05, cc05) LDF [BO + 32 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 25 * SIZE], b2 FMADD3 (aa1, bb4, cc07, cc07) LDF [BO + 26 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF 
[BO + 27 * SIZE], b4 FMADD1 (aa1, bb5, cc09, cc09) LDF [AO + 6 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 7 * SIZE], a4 FMADD3 (aa1, bb6, cc11, cc11) nop FMADD4 (aa2, bb6, cc12, cc12) nop FMADD1 (aa1, bb7, cc13, cc13) LDF [BO + 28 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO + 29 * SIZE], b6 FMADD3 (aa1, bb8, cc15, cc15) LDF [BO + 30 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO + 31 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 40 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 33 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 34 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 35 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 16 * SIZE], a1 /****/ FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 9 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) nop FMADD4 (aa4, bb6, cc12, cc12) nop FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 36 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 37 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 38 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) LDF [BO + 39 * SIZE], b8 FMADD1 (aa5, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa5, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa5, bb3, cc05, cc05) LDF [BO + 48 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 41 * SIZE], b2 FMADD3 (aa5, bb4, cc07, cc07) LDF [BO + 42 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 43 * SIZE], b4 FMADD1 (aa5, bb5, cc09, cc09) LDF [AO + 10 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 11 * SIZE], a4 FMADD3 (aa5, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY FMADD4 (aa2, bb6, cc12, cc12) nop FMADD1 (aa5, bb7, cc13, cc13) LDF [BO + 44 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO + 45 * SIZE], b6 FMADD3 (aa5, bb8, cc15, cc15) LDF [BO + 46 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO + 47 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 56 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 49 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 50 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 51 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 12 * SIZE], a5 FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 13 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) cmp L, 0 FMADD4 (aa4, bb6, cc12, cc12) nop FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 52 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 53 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 54 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) LDF [BO + 55 * SIZE], b8 FMADD1 (aa5, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa5, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa5, bb3, cc05, cc05) LDF [BO + 64 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 57 * SIZE], b2 FMADD3 (aa5, bb4, cc07, cc07) LDF [BO + 58 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 59 * SIZE], b4 FMADD1 (aa5, bb5, cc09, cc09) LDF [AO + 14 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 15 * SIZE], a4 FMADD3 (aa5, bb6, cc11, cc11) add BO, 64 * SIZE, BO FMADD4 (aa2, bb6, cc12, cc12) add AO, 16 * SIZE, AO FMADD1 (aa5, bb7, cc13, cc13) LDF [BO - 4 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO - 3 * SIZE], b6 FMADD3 (aa5, bb8, cc15, cc15) LDF [BO - 2 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO - 1 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, 
cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 8 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 1 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 2 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 8 * SIZE], a5 /****/ FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 1 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) FMADD4 (aa4, bb6, cc12, cc12) FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 4 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 5 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 6 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) ble,pn %icc, .LL15 LDF [BO + 7 * SIZE], b8 FMADD1 (aa1, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa1, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa1, bb3, cc05, cc05) LDF [BO + 16 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 9 * SIZE], b2 FMADD3 (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD1 (aa1, bb5, cc09, cc09) LDF [AO + 2 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 3 * SIZE], a4 FMADD3 (aa1, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD4 (aa2, bb6, cc12, cc12) nop FMADD1 (aa1, bb7, cc13, cc13) LDF [BO + 12 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO + 13 * SIZE], b6 FMADD3 (aa1, bb8, cc15, cc15) LDF [BO + 14 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO + 15 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 24 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 17 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 18 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 19 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 4 * SIZE], a1 FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 5 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) add L, -1, L FMADD4 (aa4, bb6, cc12, cc12) nop FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 20 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 21 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 22 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) LDF [BO + 23 * SIZE], b8 FMADD1 (aa1, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa1, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa1, bb3, cc05, cc05) LDF [BO + 32 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 25 * SIZE], b2 FMADD3 (aa1, bb4, cc07, cc07) LDF [BO + 26 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 27 * SIZE], b4 FMADD1 (aa1, bb5, cc09, cc09) LDF [AO + 6 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 7 * SIZE], a4 FMADD3 (aa1, bb6, cc11, cc11) nop FMADD4 (aa2, bb6, cc12, cc12) nop FMADD1 (aa1, bb7, cc13, cc13) LDF [BO + 28 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO + 29 * SIZE], b6 FMADD3 (aa1, bb8, cc15, cc15) LDF [BO + 30 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO + 31 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 40 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 33 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 34 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 35 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 16 * SIZE], a1 /****/ FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 9 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) nop FMADD4 (aa4, bb6, cc12, cc12) nop FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 36 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO 
+ 37 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 38 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) LDF [BO + 39 * SIZE], b8 FMADD1 (aa5, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa5, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa5, bb3, cc05, cc05) LDF [BO + 48 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 41 * SIZE], b2 FMADD3 (aa5, bb4, cc07, cc07) LDF [BO + 42 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 43 * SIZE], b4 FMADD1 (aa5, bb5, cc09, cc09) LDF [AO + 10 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 11 * SIZE], a4 FMADD3 (aa5, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY FMADD4 (aa2, bb6, cc12, cc12) nop FMADD1 (aa5, bb7, cc13, cc13) LDF [BO + 44 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO + 45 * SIZE], b6 FMADD3 (aa5, bb8, cc15, cc15) LDF [BO + 46 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO + 47 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 56 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 49 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 50 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 51 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 12 * SIZE], a5 FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 13 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) cmp L, 0 FMADD4 (aa4, bb6, cc12, cc12) nop FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 52 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 53 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 54 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) LDF [BO + 55 * SIZE], b8 FMADD1 (aa5, bb1, cc01, cc01) FMADD2 (aa2, bb1, cc02, cc02) FMADD3 (aa5, bb2, cc03, cc03) FMADD4 (aa2, bb2, cc04, cc04) FMADD1 (aa5, bb3, cc05, cc05) LDF [BO + 64 * SIZE], b1 FMADD2 (aa2, bb3, cc06, cc06) LDF [BO + 57 * SIZE], b2 FMADD3 (aa5, bb4, cc07, cc07) LDF [BO + 58 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 59 * SIZE], b4 FMADD1 (aa5, bb5, cc09, cc09) LDF [AO + 14 * SIZE], a3 FMADD2 (aa2, bb5, cc10, cc10) LDF [AO + 15 * SIZE], a4 FMADD3 (aa5, bb6, cc11, cc11) add BO, 64 * SIZE, BO FMADD4 (aa2, bb6, cc12, cc12) add AO, 16 * SIZE, AO FMADD1 (aa5, bb7, cc13, cc13) LDF [BO - 4 * SIZE], b5 FMADD2 (aa2, bb7, cc14, cc14) LDF [BO - 3 * SIZE], b6 FMADD3 (aa5, bb8, cc15, cc15) LDF [BO - 2 * SIZE], b7 FMADD4 (aa2, bb8, cc16, cc16) LDF [BO - 1 * SIZE], b8 FMADD1 (aa3, bb9, cc01, cc01) FMADD2 (aa4, bb9, cc02, cc02) FMADD3 (aa3, bb2, cc03, cc03) FMADD4 (aa4, bb2, cc04, cc04) FMADD1 (aa3, bb3, cc05, cc05) LDF [BO + 8 * SIZE], b9 FMADD2 (aa4, bb3, cc06, cc06) LDF [BO + 1 * SIZE], b2 FMADD3 (aa3, bb4, cc07, cc07) LDF [BO + 2 * SIZE], b3 FMADD4 (aa4, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD1 (aa3, bb5, cc09, cc09) LDF [AO + 8 * SIZE], a5 /****/ FMADD2 (aa4, bb5, cc10, cc10) LDF [AO + 1 * SIZE], a2 FMADD3 (aa3, bb6, cc11, cc11) FMADD4 (aa4, bb6, cc12, cc12) FMADD1 (aa3, bb7, cc13, cc13) LDF [BO + 4 * SIZE], b5 FMADD2 (aa4, bb7, cc14, cc14) LDF [BO + 5 * SIZE], b6 FMADD3 (aa3, bb8, cc15, cc15) LDF [BO + 6 * SIZE], b7 FMADD4 (aa4, bb8, cc16, cc16) bg,pt %icc, .LL13 LDF [BO + 7 * SIZE], b8 .align 4 .LL15: #if defined(LT) || defined(RN) and KK, 7, L #else sub K, KK, L and L, 7, L #endif cmp L, 0 ble,a,pn %icc, .LL18 nop .align 4 .LL17: FMADD1 (aa1, bb1, cc01, cc01) add L, -1, L FMADD2 (aa2, bb1, cc02, cc02) nop FMADD3 (aa1, bb2, cc03, cc03) LDF [BO + 8 * SIZE], b1 FMADD4 (aa2, bb2, cc04, cc04) LDF [BO + 9 * SIZE], b2 FMADD1 (aa1, bb3, cc05, cc05) cmp L, 0 
FMADD2 (aa2, bb3, cc06, cc06) nop FMADD3 (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD4 (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD1 (aa1, bb5, cc09, cc09) nop FMADD2 (aa2, bb5, cc10, cc10) nop FMADD3 (aa1, bb6, cc11, cc11) LDF [BO + 12 * SIZE], b5 FMADD4 (aa2, bb6, cc12, cc12) LDF [BO + 13 * SIZE], b6 FMADD1 (aa1, bb7, cc13, cc13) add AO, 2 * SIZE, AO FMADD2 (aa2, bb7, cc14, cc14) add BO, 8 * SIZE, BO FMADD3 (aa1, bb8, cc15, cc15) LDF [AO + 0 * SIZE], a1 FMADD4 (aa2, bb8, cc16, cc16) LDF [AO + 1 * SIZE], a2 LDF [BO + 6 * SIZE], b7 bg,pt %icc, .LL17 LDF [BO + 7 * SIZE], b8 nop .align 4 .LL18: FADD c01, c04, c01 FADD c02, c03, c02 FADD c05, c08, c05 FADD c06, c07, c06 FADD c09, c12, c09 FADD c10, c11, c10 FADD c13, c16, c13 FADD c14, c15, c14 #if defined(LN) || defined(RT) #ifdef LN sub KK, 1, TEMP1 #else sub KK, 4, TEMP1 #endif sll TEMP1, ZBASE_SHIFT + 0, TEMP2 sll TEMP1, ZBASE_SHIFT + 2, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 #endif FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c05, c05 FSUB a4, c06, c06 FSUB b1, c09, c09 FSUB b2, c10, c10 FSUB b3, c13, c13 FSUB b4, c14, c14 #if defined(LN) || defined(LT) LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 FMUL a1, c01, b1 FMUL a2, c01, b2 FMUL a1, c05, b3 FMUL a2, c05, b4 FMUL a1, c09, b5 FMUL a2, c09, b6 FMUL a1, c13, b7 FMUL a2, c13, b8 #ifndef CONJ FNMSUB (aa2, cc02, bb1, cc01) FMADD (aa1, cc02, bb2, cc02) FNMSUB (aa2, cc06, bb3, cc05) FMADD (aa1, cc06, bb4, cc06) FNMSUB (aa2, cc10, bb5, cc09) FMADD (aa1, cc10, bb6, cc10) FNMSUB (aa2, cc14, bb7, cc13) FMADD (aa1, cc14, bb8, cc14) #else FMADD (aa2, cc02, bb1, cc01) FMSUB (aa1, cc02, bb2, cc02) FMADD (aa2, cc06, bb3, cc05) FMSUB (aa1, cc06, bb4, cc06) FMADD (aa2, cc10, bb5, cc09) FMSUB (aa1, cc10, bb6, cc10) FMADD (aa2, cc14, bb7, cc13) FMSUB (aa1, cc14, bb8, cc14) #endif #endif #ifdef RN LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 LDF [BO + 2 * SIZE], b3 LDF [BO + 3 * SIZE], b4 LDF [BO + 4 * SIZE], b5 LDF [BO + 5 * SIZE], b6 LDF [BO + 6 * SIZE], b7 LDF [BO + 7 * SIZE], b8 FMUL b1, c01, a1 FMUL b2, c01, a2 #ifndef CONJ FNMSUB (bb2, cc02, aa1, cc01) FMADD (bb1, cc02, aa2, cc02) #else FMADD (bb2, cc02, aa1, cc01) FMSUB (bb1, cc02, aa2, cc02) #endif FNMSUB (bb3, cc01, cc05, cc05) FNMSUB (bb3, cc02, cc06, cc06) FNMSUB (bb5, cc01, cc09, cc09) FNMSUB (bb5, cc02, cc10, cc10) FNMSUB (bb7, cc01, cc13, cc13) FNMSUB (bb7, cc02, cc14, cc14) #ifndef CONJ FMADD (bb4, cc02, cc05, cc05) FNMSUB (bb4, cc01, cc06, cc06) FMADD (bb6, cc02, cc09, cc09) FNMSUB (bb6, cc01, cc10, cc10) FMADD (bb8, cc02, cc13, cc13) FNMSUB (bb8, cc01, cc14, cc14) #else FNMSUB (bb4, cc02, cc05, cc05) FMADD (bb4, cc01, cc06, cc06) FNMSUB (bb6, cc02, cc09, cc09) FMADD (bb6, cc01, cc10, cc10) FNMSUB (bb8, cc02, cc13, cc13) FMADD (bb8, cc01, cc14, cc14) #endif LDF [BO + 10 * SIZE], b1 LDF [BO + 11 * SIZE], b2 LDF [BO + 12 * SIZE], b3 LDF [BO + 13 * SIZE], b4 LDF [BO + 14 * SIZE], b5 LDF [BO + 15 * SIZE], b6 FMUL b1, c05, a1 FMUL b2, c05, a2 #ifndef CONJ FNMSUB (bb2, cc06, aa1, cc05) FMADD (bb1, cc06, aa2, cc06) #else FMADD (bb2, cc06, aa1, cc05) FMSUB (bb1, cc06, aa2, cc06) #endif FNMSUB 
(bb3, cc05, cc09, cc09) FNMSUB (bb3, cc06, cc10, cc10) FNMSUB (bb5, cc05, cc13, cc13) FNMSUB (bb5, cc06, cc14, cc14) #ifndef CONJ FMADD (bb4, cc06, cc09, cc09) FNMSUB (bb4, cc05, cc10, cc10) FMADD (bb6, cc06, cc13, cc13) FNMSUB (bb6, cc05, cc14, cc14) #else FNMSUB (bb4, cc06, cc09, cc09) FMADD (bb4, cc05, cc10, cc10) FNMSUB (bb6, cc06, cc13, cc13) FMADD (bb6, cc05, cc14, cc14) #endif LDF [BO + 20 * SIZE], b1 LDF [BO + 21 * SIZE], b2 LDF [BO + 22 * SIZE], b3 LDF [BO + 23 * SIZE], b4 FMUL b1, c09, a1 FMUL b2, c09, a2 #ifndef CONJ FNMSUB (bb2, cc10, aa1, cc09) FMADD (bb1, cc10, aa2, cc10) #else FMADD (bb2, cc10, aa1, cc09) FMSUB (bb1, cc10, aa2, cc10) #endif FNMSUB (bb3, cc09, cc13, cc13) FNMSUB (bb3, cc10, cc14, cc14) #ifndef CONJ FMADD (bb4, cc10, cc13, cc13) FNMSUB (bb4, cc09, cc14, cc14) #else FNMSUB (bb4, cc10, cc13, cc13) FMADD (bb4, cc09, cc14, cc14) #endif LDF [BO + 30 * SIZE], b1 LDF [BO + 31 * SIZE], b2 FMUL b1, c13, a1 FMUL b2, c13, a2 #ifndef CONJ FNMSUB (bb2, cc14, aa1, cc13) FMADD (bb1, cc14, aa2, cc14) #else FMADD (bb2, cc14, aa1, cc13) FMSUB (bb1, cc14, aa2, cc14) #endif #endif #ifdef RT LDF [BO + 30 * SIZE], b1 LDF [BO + 31 * SIZE], b2 LDF [BO + 28 * SIZE], b3 LDF [BO + 29 * SIZE], b4 LDF [BO + 26 * SIZE], b5 LDF [BO + 27 * SIZE], b6 LDF [BO + 24 * SIZE], b7 LDF [BO + 25 * SIZE], b8 FMUL b1, c13, a1 FMUL b2, c13, a2 #ifndef CONJ FNMSUB (bb2, cc14, aa1, cc13) FMADD (bb1, cc14, aa2, cc14) #else FMADD (bb2, cc14, aa1, cc13) FMSUB (bb1, cc14, aa2, cc14) #endif FNMSUB (bb3, cc13, cc09, cc09) FNMSUB (bb3, cc14, cc10, cc10) FNMSUB (bb5, cc13, cc05, cc05) FNMSUB (bb5, cc14, cc06, cc06) FNMSUB (bb7, cc13, cc01, cc01) FNMSUB (bb7, cc14, cc02, cc02) #ifndef CONJ FMADD (bb4, cc14, cc09, cc09) FNMSUB (bb4, cc13, cc10, cc10) FMADD (bb6, cc14, cc05, cc05) FNMSUB (bb6, cc13, cc06, cc06) FMADD (bb8, cc14, cc01, cc01) FNMSUB (bb8, cc13, cc02, cc02) #else FNMSUB (bb4, cc14, cc09, cc09) FMADD (bb4, cc13, cc10, cc10) FNMSUB (bb6, cc14, cc05, cc05) FMADD (bb6, cc13, cc06, cc06) FNMSUB (bb8, cc14, cc01, cc01) FMADD (bb8, cc13, cc02, cc02) #endif LDF [BO + 20 * SIZE], b1 LDF [BO + 21 * SIZE], b2 LDF [BO + 18 * SIZE], b3 LDF [BO + 19 * SIZE], b4 LDF [BO + 16 * SIZE], b5 LDF [BO + 17 * SIZE], b6 FMUL b1, c09, a1 FMUL b2, c09, a2 #ifndef CONJ FNMSUB (bb2, cc10, aa1, cc09) FMADD (bb1, cc10, aa2, cc10) #else FMADD (bb2, cc10, aa1, cc09) FMSUB (bb1, cc10, aa2, cc10) #endif FNMSUB (bb3, cc09, cc05, cc05) FNMSUB (bb3, cc10, cc06, cc06) FNMSUB (bb5, cc09, cc01, cc01) FNMSUB (bb5, cc10, cc02, cc02) #ifndef CONJ FMADD (bb4, cc10, cc05, cc05) FNMSUB (bb4, cc09, cc06, cc06) FMADD (bb6, cc10, cc01, cc01) FNMSUB (bb6, cc09, cc02, cc02) #else FNMSUB (bb4, cc10, cc05, cc05) FMADD (bb4, cc09, cc06, cc06) FNMSUB (bb6, cc10, cc01, cc01) FMADD (bb6, cc09, cc02, cc02) #endif LDF [BO + 10 * SIZE], b1 LDF [BO + 11 * SIZE], b2 LDF [BO + 8 * SIZE], b3 LDF [BO + 9 * SIZE], b4 FMUL b1, c05, a1 FMUL b2, c05, a2 #ifndef CONJ FNMSUB (bb2, cc06, aa1, cc05) FMADD (bb1, cc06, aa2, cc06) #else FMADD (bb2, cc06, aa1, cc05) FMSUB (bb1, cc06, aa2, cc06) #endif FNMSUB (bb3, cc05, cc01, cc01) FNMSUB (bb3, cc06, cc02, cc02) #ifndef CONJ FMADD (bb4, cc06, cc01, cc01) FNMSUB (bb4, cc05, cc02, cc02) #else FNMSUB (bb4, cc06, cc01, cc01) FMADD (bb4, cc05, cc02, cc02) #endif LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FMUL b1, c01, a1 FMUL b2, c01, a2 #ifndef CONJ FNMSUB (bb2, cc02, aa1, cc01) FMADD (bb1, cc02, aa2, cc02) #else FMADD (bb2, cc02, aa1, cc01) FMSUB (bb1, cc02, aa2, cc02) #endif #endif #ifdef LN add C1, -2 * SIZE, C1 add C2, -2 
* SIZE, C2 add C3, -2 * SIZE, C3 add C4, -2 * SIZE, C4 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c02, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c06, [BO + 3 * SIZE] STF c09, [BO + 4 * SIZE] STF c10, [BO + 5 * SIZE] STF c13, [BO + 6 * SIZE] STF c14, [BO + 7 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c05, [AO + 2 * SIZE] STF c06, [AO + 3 * SIZE] STF c09, [AO + 4 * SIZE] STF c10, [AO + 5 * SIZE] STF c13, [AO + 6 * SIZE] STF c14, [AO + 7 * SIZE] #endif STF c01, [C1 + 0 * SIZE] STF c02, [C1 + 1 * SIZE] STF c05, [C2 + 0 * SIZE] STF c06, [C2 + 1 * SIZE] STF c09, [C3 + 0 * SIZE] STF c10, [C3 + 1 * SIZE] STF c13, [C4 + 0 * SIZE] STF c14, [C4 + 1 * SIZE] #ifndef LN add C1, 2 * SIZE, C1 add C2, 2 * SIZE, C2 add C3, 2 * SIZE, C3 add C4, 2 * SIZE, C4 #endif #ifdef RT sll K, ZBASE_SHIFT, TEMP1 add AORIG, TEMP1, AORIG #endif #if defined(LT) || defined(RN) sub K, KK, TEMP1 sll TEMP1, ZBASE_SHIFT + 0, TEMP2 sll TEMP1, ZBASE_SHIFT + 2, TEMP1 add AO, TEMP2, AO add BO, TEMP1, BO #endif #ifdef LT add KK, 1, KK #endif #ifdef LN sub KK, 1, KK #endif add I, -1, I cmp I, 0 bg,pt %icc, .LL12 nop #ifdef LN sll K, ZBASE_SHIFT + 2, TEMP1 add B, TEMP1, B #endif #if defined(LT) || defined(RN) mov BO, B #endif #ifdef RN add KK, 4, KK #endif #ifdef RT sub KK, 4, KK #endif add J, -1, J cmp J, 0 bg,pt %icc, .LL11 nop .align 4 .LL999: return %i7 + 8 clr %o0 EPILOGUE OpenBLAS-0.2.20/kernel/x86/000077500000000000000000000000001313527062700150515ustar00rootroot00000000000000OpenBLAS-0.2.20/kernel/x86/KERNEL000066400000000000000000000061131313527062700157550ustar00rootroot00000000000000ifndef SAMINKERNEL SAMINKERNEL = amax.S endif ifndef DAMINKERNEL DAMINKERNEL = amax.S endif ifndef QAMINKERNEL QAMINKERNEL = amax.S endif ifndef CAMINKERNEL CAMINKERNEL = zamax.S endif ifndef ZAMINKERNEL ZAMINKERNEL = zamax.S endif ifndef XAMINKERNEL XAMINKERNEL = zamax.S endif ifndef SMAXKERNEL SMAXKERNEL = amax.S endif ifndef DMAXKERNEL DMAXKERNEL = amax.S endif ifndef QMAXKERNEL QMAXKERNEL = amax.S endif ifndef SMINKERNEL SMINKERNEL = amax.S endif ifndef DMINKERNEL DMINKERNEL = amax.S endif ifndef QMINKERNEL QMINKERNEL = amax.S endif ifndef ISAMINKERNEL ISAMINKERNEL = iamax.S endif ifndef IDAMINKERNEL IDAMINKERNEL = iamax.S endif ifndef IQAMINKERNEL IQAMINKERNEL = iamax.S endif ifndef ICAMINKERNEL ICAMINKERNEL = izamax.S endif ifndef IZAMINKERNEL IZAMINKERNEL = izamax.S endif ifndef IXAMINKERNEL IXAMINKERNEL = izamax.S endif ifndef ISMINKERNEL ISMINKERNEL = iamax.S endif ifndef IDMINKERNEL IDMINKERNEL = iamax.S endif ifndef IQMINKERNEL IQMINKERNEL = iamax.S endif ifndef QDOTKERNEL QDOTKERNEL = qdot.S endif ifndef XDOTKERNEL XDOTKERNEL = xdot.S endif ifndef QAXPYKERNEL QAXPYKERNEL = qaxpy.S endif ifndef XAXPYKERNEL XAXPYKERNEL = xaxpy.S endif #Use C kernel for sgemv and dgemv ifndef SGEMVNKERNEL SGEMVNKERNEL = ../arm/gemv_n.c endif ifndef SGEMVTKERNEL SGEMVTKERNEL = ../arm/gemv_t.c endif ifndef DGEMVNKERNEL DGEMVNKERNEL = ../arm/gemv_n.c endif ifndef DGEMVTKERNEL DGEMVTKERNEL = ../arm/gemv_t.c endif ifndef QGEMVNKERNEL QGEMVNKERNEL = qgemv_n.S endif ifndef QGEMVTKERNEL QGEMVTKERNEL = qgemv_t.S endif ifndef XGEMVNKERNEL XGEMVNKERNEL = xgemv_n.S endif ifndef XGEMVTKERNEL XGEMVTKERNEL = xgemv_t.S endif QGEMMKERNEL = qgemm_kernel_2x2.S QGEMMINCOPY = QGEMMITCOPY = QGEMMONCOPY = ../generic/gemm_ncopy_2.c QGEMMOTCOPY = ../generic/gemm_tcopy_2.c QGEMMINCOPYOBJ = QGEMMITCOPYOBJ = QGEMMONCOPYOBJ = qgemm_oncopy$(TSUFFIX).$(SUFFIX) QGEMMOTCOPYOBJ = qgemm_otcopy$(TSUFFIX).$(SUFFIX) XGEMMKERNEL = 
xgemm_kernel_1x1.S XGEMMINCOPY = XGEMMITCOPY = XGEMMONCOPY = ../generic/zgemm_ncopy_1.c XGEMMOTCOPY = ../generic/zgemm_tcopy_1.c XGEMMINCOPYOBJ = XGEMMITCOPYOBJ = XGEMMONCOPYOBJ = xgemm_oncopy$(TSUFFIX).$(SUFFIX) XGEMMOTCOPYOBJ = xgemm_otcopy$(TSUFFIX).$(SUFFIX) QTRSMKERNEL_LN = qtrsm_kernel_LN_2x2.S QTRSMKERNEL_LT = qtrsm_kernel_LT_2x2.S QTRSMKERNEL_RN = qtrsm_kernel_LT_2x2.S QTRSMKERNEL_RT = qtrsm_kernel_RT_2x2.S XTRSMKERNEL_LN = xtrsm_kernel_LT_1x1.S XTRSMKERNEL_LT = xtrsm_kernel_LT_1x1.S XTRSMKERNEL_RN = xtrsm_kernel_LT_1x1.S XTRSMKERNEL_RT = xtrsm_kernel_LT_1x1.S XGEMM3MKERNEL = xgemm3m_kernel_2x2.S # bug in zdot assembler kernel ifndef ZDOTKERNEL ZDOTKERNEL = ../arm/zdot.c endif DSDOTKERNEL = ../arm/dot.c # Bug in znrm2 assembler kernel ifndef ZNRM2KERNEL ZNRM2KERNEL = ../arm/znrm2.c endif # Bug in zgemv_t assembler kernel ifndef ZGEMVTKERNEL ZGEMVTKERNEL = ../arm/zgemv_t.c endif SGEMM_BETA = ../generic/gemm_beta.c DGEMM_BETA = ../generic/gemm_beta.c CGEMM_BETA = ../generic/zgemm_beta.c ZGEMM_BETA = ../generic/zgemm_beta.c QGEMM_BETA = ../generic/gemm_beta.c XGEMM_BETA = ../generic/zgemm_beta.c OpenBLAS-0.2.20/kernel/x86/KERNEL.ATHLON000066400000000000000000000047661313527062700167550ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_2x4_3dnow.S SGEMMINCOPY = ../generic/gemm_ncopy_2.c SGEMMITCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_1x4.S DGEMMINCOPY = ../generic/gemm_ncopy_1.c DGEMMITCOPY = ../generic/gemm_tcopy_1.c DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_1x2_3dnow.S CGEMMINCOPY = ../generic/zgemm_ncopy_1.c CGEMMITCOPY = ../generic/zgemm_tcopy_1.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x2.S ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_LN = trsm_kernel_LT_1x4.S DTRSMKERNEL_LT = trsm_kernel_LT_1x4.S DTRSMKERNEL_RN = trsm_kernel_LT_1x4.S DTRSMKERNEL_RT = trsm_kernel_RT_1x4.S CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ifdef HAVE_SSE CGEMM3MKERNEL = 
zgemm3m_kernel_8x2_sse.S CGEMM3MKERNEL = zgemm3m_kernel_1x4_athlon.S endif ZGEMM3MKERNEL = zgemm3m_kernel_1x4_athlon.S OpenBLAS-0.2.20/kernel/x86/KERNEL.ATOM000066400000000000000000000037331313527062700165210ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_4x4_penryn.S SGEMMINCOPY = SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x2_atom.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = gemm_ncopy_2.S DGEMMOTCOPY = gemm_tcopy_2.S DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_2x2_penryn.S CGEMMINCOPY = CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = CGEMMITCOPYOBJ = CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x1_atom.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = ../generic/zgemm_ncopy_1.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_1.c ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_4x4_penryn.S STRSMKERNEL_LT = trsm_kernel_LT_4x4_penryn.S STRSMKERNEL_RN = trsm_kernel_LT_4x4_penryn.S STRSMKERNEL_RT = trsm_kernel_RT_4x4_penryn.S DTRSMKERNEL_LN = trsm_kernel_LN_2x2_atom.S DTRSMKERNEL_LT = trsm_kernel_LT_2x2_atom.S DTRSMKERNEL_RN = trsm_kernel_LT_2x2_atom.S DTRSMKERNEL_RT = trsm_kernel_RT_2x2_atom.S CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_penryn.S CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_penryn.S CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_penryn.S CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_penryn.S ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x1_atom.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x1_atom.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x1_atom.S ZTRSMKERNEL_RT = ztrsm_kernel_LT_1x1_atom.S CGEMM3MKERNEL = zgemm3m_kernel_4x4_penryn.S ZGEMM3MKERNEL = zgemm3m_kernel_2x2_atom.S OpenBLAS-0.2.20/kernel/x86/KERNEL.BANIAS000066400000000000000000000042321313527062700167110ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_8x2_sse.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_2.c SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x2.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_4x1_sse.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_1.c CGEMMOTCOPY = ../generic/zgemm_tcopy_1.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x1.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = ../generic/zgemm_ncopy_1.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_1.c ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = 
zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_8x2_sse.S STRSMKERNEL_LT = trsm_kernel_LT_8x2_sse.S STRSMKERNEL_RN = trsm_kernel_LT_8x2_sse.S STRSMKERNEL_RT = trsm_kernel_RT_8x2_sse.S DTRSMKERNEL_LN = trsm_kernel_LN_2x2.S DTRSMKERNEL_LT = trsm_kernel_LT_2x2.S DTRSMKERNEL_RN = trsm_kernel_LT_2x2.S DTRSMKERNEL_RT = trsm_kernel_RT_2x2.S CTRSMKERNEL_LN = ztrsm_kernel_LN_4x1_sse.S CTRSMKERNEL_LT = ztrsm_kernel_LT_4x1_sse.S CTRSMKERNEL_RN = ztrsm_kernel_LT_4x1_sse.S CTRSMKERNEL_RT = ztrsm_kernel_LT_4x1_sse.S ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x1.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x1.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x1.S ZTRSMKERNEL_RT = ztrsm_kernel_LT_1x1.S CGEMM3MKERNEL = zgemm3m_kernel_8x2_sse.S ZGEMM3MKERNEL = zgemm3m_kernel_2x2_coppermine.S OpenBLAS-0.2.20/kernel/x86/KERNEL.BARCELONA000066400000000000000000000043471313527062700172510ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_4x4_barcelona.S SGEMMINCOPY = SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x4_barcelona.S DGEMMINCOPY = ../generic/gemm_ncopy_2.c DGEMMITCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S CGEMMINCOPY = CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = CGEMMITCOPYOBJ = CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S OpenBLAS-0.2.20/kernel/x86/KERNEL.BOBCAT000066400000000000000000000043471313527062700167150ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_4x4_barcelona.S SGEMMINCOPY = SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x4_barcelona.S 
DGEMMINCOPY = ../generic/gemm_ncopy_2.c DGEMMITCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S CGEMMINCOPY = CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = CGEMMITCOPYOBJ = CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S OpenBLAS-0.2.20/kernel/x86/KERNEL.BULLDOZER000066400000000000000000000043471313527062700173250ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_4x4_barcelona.S SGEMMINCOPY = SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x4_barcelona.S DGEMMINCOPY = ../generic/gemm_ncopy_2.c DGEMMITCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S CGEMMINCOPY = CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = CGEMMITCOPYOBJ = CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S 
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S OpenBLAS-0.2.20/kernel/x86/KERNEL.COPPERMINE000066400000000000000000000042321313527062700174150ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_8x2_sse.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_2.c SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x2.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_4x1_sse.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_1.c CGEMMOTCOPY = ../generic/zgemm_tcopy_1.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x1.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = ../generic/zgemm_ncopy_1.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_1.c ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_8x2_sse.S STRSMKERNEL_LT = trsm_kernel_LT_8x2_sse.S STRSMKERNEL_RN = trsm_kernel_LT_8x2_sse.S STRSMKERNEL_RT = trsm_kernel_RT_8x2_sse.S DTRSMKERNEL_LN = trsm_kernel_LN_2x2.S DTRSMKERNEL_LT = trsm_kernel_LT_2x2.S DTRSMKERNEL_RN = trsm_kernel_LT_2x2.S DTRSMKERNEL_RT = trsm_kernel_RT_2x2.S CTRSMKERNEL_LN = ztrsm_kernel_LN_4x1_sse.S CTRSMKERNEL_LT = ztrsm_kernel_LT_4x1_sse.S CTRSMKERNEL_RN = ztrsm_kernel_LT_4x1_sse.S CTRSMKERNEL_RT = ztrsm_kernel_LT_4x1_sse.S ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x1.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x1.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x1.S ZTRSMKERNEL_RT = ztrsm_kernel_LT_1x1.S CGEMM3MKERNEL = zgemm3m_kernel_8x2_sse.S ZGEMM3MKERNEL = zgemm3m_kernel_2x2_coppermine.S OpenBLAS-0.2.20/kernel/x86/KERNEL.CORE2000066400000000000000000000046331313527062700165730ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_8x2_core2.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = gemm_ncopy_2.S SGEMMOTCOPY = gemm_tcopy_2.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x2_core2.S DGEMMINCOPY = gemm_ncopy_4_sse.S DGEMMITCOPY = gemm_tcopy_4_sse.S DGEMMONCOPY = gemm_ncopy_2_sse.S DGEMMOTCOPY = gemm_tcopy_2_sse.S DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) 
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_4x1_core2.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_1.c CGEMMOTCOPY = ../generic/zgemm_tcopy_1.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_2x1_core2.S ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_1.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_1.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_8x2_sse.S STRSMKERNEL_LT = trsm_kernel_LT_8x2_sse.S STRSMKERNEL_RN = trsm_kernel_LT_8x2_sse.S STRSMKERNEL_RT = trsm_kernel_RT_8x2_sse.S DTRSMKERNEL_LN = trsm_kernel_LN_4x2_core2.S DTRSMKERNEL_LT = trsm_kernel_LT_4x2_core2.S DTRSMKERNEL_RN = trsm_kernel_LT_4x2_core2.S DTRSMKERNEL_RT = trsm_kernel_RT_4x2_core2.S CTRSMKERNEL_LN = ztrsm_kernel_LN_4x1_sse.S CTRSMKERNEL_LT = ztrsm_kernel_LT_4x1_sse.S CTRSMKERNEL_RN = ztrsm_kernel_LT_4x1_sse.S CTRSMKERNEL_RT = ztrsm_kernel_LT_4x1_sse.S ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x1_core2.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x1_core2.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x1_core2.S ZTRSMKERNEL_RT = ztrsm_kernel_LT_2x1_core2.S CGEMM3MKERNEL = zgemm3m_kernel_8x2_core2.S ZGEMM3MKERNEL = zgemm3m_kernel_4x2_core2.S OpenBLAS-0.2.20/kernel/x86/KERNEL.DUNNINGTON000066400000000000000000000043471313527062700174460ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_4x4_penryn.S SGEMMINCOPY = SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x4_penryn.S DGEMMINCOPY = gemm_ncopy_2.S DGEMMITCOPY = gemm_tcopy_2.S DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_2x2_penryn.S CGEMMINCOPY = CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = CGEMMITCOPYOBJ = CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x2_penryn.S ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_4x4_penryn.S STRSMKERNEL_LT = trsm_kernel_LT_4x4_penryn.S STRSMKERNEL_RN = trsm_kernel_LT_4x4_penryn.S STRSMKERNEL_RT = trsm_kernel_RT_4x4_penryn.S DTRSMKERNEL_LN = trsm_kernel_LN_2x4_penryn.S DTRSMKERNEL_LT = trsm_kernel_LT_2x4_penryn.S DTRSMKERNEL_RN = trsm_kernel_LT_2x4_penryn.S DTRSMKERNEL_RT = trsm_kernel_RT_2x4_penryn.S CTRSMKERNEL_LN = 
ztrsm_kernel_LN_2x2_penryn.S CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_penryn.S CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_penryn.S CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_penryn.S ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_penryn.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_penryn.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_penryn.S ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_penryn.S CGEMM3MKERNEL = zgemm3m_kernel_4x4_penryn.S ZGEMM3MKERNEL = zgemm3m_kernel_2x4_penryn.S OpenBLAS-0.2.20/kernel/x86/KERNEL.HASWELL000066400000000000000000000000431313527062700170470ustar00rootroot00000000000000include $(KERNELDIR)/KERNEL.PENRYN OpenBLAS-0.2.20/kernel/x86/KERNEL.KATMAI000066400000000000000000000000471313527062700167220ustar00rootroot00000000000000include $(KERNELDIR)/KERNEL.COPPERMINE OpenBLAS-0.2.20/kernel/x86/KERNEL.NANO000066400000000000000000000000431313527062700165030ustar00rootroot00000000000000include $(KERNELDIR)/KERNEL.PENRYN OpenBLAS-0.2.20/kernel/x86/KERNEL.NEHALEM000066400000000000000000000000431313527062700170210ustar00rootroot00000000000000include $(KERNELDIR)/KERNEL.PENRYN OpenBLAS-0.2.20/kernel/x86/KERNEL.NORTHWOOD000066400000000000000000000046101313527062700173370ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_8x2_sse.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = gemm_ncopy_2.S SGEMMOTCOPY = gemm_tcopy_2.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x2_sse2.S DGEMMINCOPY = gemm_ncopy_4_sse.S DGEMMITCOPY = gemm_tcopy_4_sse.S DGEMMONCOPY = gemm_ncopy_2.S DGEMMOTCOPY = gemm_tcopy_2.S DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_4x1_sse.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_1.c CGEMMOTCOPY = ../generic/zgemm_tcopy_1.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_2x1_sse2.S ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_1.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_1.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_8x2_sse.S STRSMKERNEL_LT = trsm_kernel_LT_8x2_sse.S STRSMKERNEL_RN = trsm_kernel_LT_8x2_sse.S STRSMKERNEL_RT = trsm_kernel_RT_8x2_sse.S DTRSMKERNEL_LN = trsm_kernel_LN_4x2_sse2.S DTRSMKERNEL_LT = trsm_kernel_LT_4x2_sse2.S DTRSMKERNEL_RN = trsm_kernel_LT_4x2_sse2.S DTRSMKERNEL_RT = trsm_kernel_RT_4x2_sse2.S CTRSMKERNEL_LN = ztrsm_kernel_LN_4x1_sse.S CTRSMKERNEL_LT = ztrsm_kernel_LT_4x1_sse.S CTRSMKERNEL_RN = ztrsm_kernel_LT_4x1_sse.S CTRSMKERNEL_RT = ztrsm_kernel_LT_4x1_sse.S ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x1_sse2.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x1_sse2.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x1_sse2.S ZTRSMKERNEL_RT = ztrsm_kernel_LT_2x1_sse2.S CGEMM3MKERNEL = zgemm3m_kernel_8x2_sse.S ZGEMM3MKERNEL = zgemm3m_kernel_4x2_northwood.S 
OpenBLAS-0.2.20/kernel/x86/KERNEL.OPTERON000066400000000000000000000043151313527062700171040ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_4x4_sse.S SGEMMINCOPY = SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x4_sse2.S DGEMMINCOPY = ../generic/gemm_ncopy_2.c DGEMMITCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_2x2_sse.S CGEMMINCOPY = CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = CGEMMITCOPYOBJ = CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x2_sse2.S ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S CGEMM3MKERNEL = zgemm3m_kernel_4x4_opteron.S ZGEMM3MKERNEL = zgemm3m_kernel_2x4_opteron.S OpenBLAS-0.2.20/kernel/x86/KERNEL.OPTERON_SSE3000066400000000000000000000000441313527062700176740ustar00rootroot00000000000000include $(KERNELDIR)/KERNEL.OPTERON OpenBLAS-0.2.20/kernel/x86/KERNEL.P5000066400000000000000000000000401313527062700162310ustar00rootroot00000000000000include $(KERNELDIR)/KERNEL.P6 OpenBLAS-0.2.20/kernel/x86/KERNEL.P6000066400000000000000000000036041313527062700162430ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_2x2.S SGEMMINCOPY = SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_2.c SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x2.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_1x1.S CGEMMINCOPY = CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_1.c CGEMMOTCOPY = ../generic/zgemm_tcopy_1.c CGEMMINCOPYOBJ = CGEMMITCOPYOBJ = CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x1.S ZGEMMINCOPY = 
ZGEMMITCOPY = ZGEMMONCOPY = ../generic/zgemm_ncopy_1.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_1.c ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_2x2.S STRSMKERNEL_LT = trsm_kernel_LT_2x2.S STRSMKERNEL_RN = trsm_kernel_LT_2x2.S STRSMKERNEL_RT = trsm_kernel_RT_2x2.S DTRSMKERNEL_LN = trsm_kernel_LN_2x2.S DTRSMKERNEL_LT = trsm_kernel_LT_2x2.S DTRSMKERNEL_RN = trsm_kernel_LT_2x2.S DTRSMKERNEL_RT = trsm_kernel_RT_2x2.S CTRSMKERNEL_LN = ztrsm_kernel_LT_1x1.S CTRSMKERNEL_LT = ztrsm_kernel_LT_1x1.S CTRSMKERNEL_RN = ztrsm_kernel_LT_1x1.S CTRSMKERNEL_RT = ztrsm_kernel_LT_1x1.S ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x1.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x1.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x1.S ZTRSMKERNEL_RT = ztrsm_kernel_LT_1x1.S CGEMM3MKERNEL = zgemm3m_kernel_2x2_coppermine.S ZGEMM3MKERNEL = zgemm3m_kernel_2x2_coppermine.S OpenBLAS-0.2.20/kernel/x86/KERNEL.PENRYN000066400000000000000000000043471313527062700167760ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_4x4_penryn.S SGEMMINCOPY = SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x4_penryn.S DGEMMINCOPY = gemm_ncopy_2.S DGEMMITCOPY = gemm_tcopy_2.S DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_2x2_penryn.S CGEMMINCOPY = CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = CGEMMITCOPYOBJ = CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x2_penryn.S ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_4x4_penryn.S STRSMKERNEL_LT = trsm_kernel_LT_4x4_penryn.S STRSMKERNEL_RN = trsm_kernel_LT_4x4_penryn.S STRSMKERNEL_RT = trsm_kernel_RT_4x4_penryn.S DTRSMKERNEL_LN = trsm_kernel_LN_2x4_penryn.S DTRSMKERNEL_LT = trsm_kernel_LT_2x4_penryn.S DTRSMKERNEL_RN = trsm_kernel_LT_2x4_penryn.S DTRSMKERNEL_RT = trsm_kernel_RT_2x4_penryn.S CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_penryn.S CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_penryn.S CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_penryn.S CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_penryn.S ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_penryn.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_penryn.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_penryn.S ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_penryn.S CGEMM3MKERNEL = zgemm3m_kernel_4x4_penryn.S ZGEMM3MKERNEL = zgemm3m_kernel_2x4_penryn.S OpenBLAS-0.2.20/kernel/x86/KERNEL.PILEDRIVER000066400000000000000000000043471313527062700174300ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_4x4_barcelona.S SGEMMINCOPY = SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = 
sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x4_barcelona.S DGEMMINCOPY = ../generic/gemm_ncopy_2.c DGEMMITCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S CGEMMINCOPY = CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = CGEMMITCOPYOBJ = CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S OpenBLAS-0.2.20/kernel/x86/KERNEL.PRESCOTT000066400000000000000000000042731313527062700172240ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_4x4_sse3.S SGEMMINCOPY = SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x4_sse3.S DGEMMINCOPY = gemm_ncopy_2.S DGEMMITCOPY = gemm_tcopy_2.S DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_2x2_sse3.S CGEMMINCOPY = CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = CGEMMITCOPYOBJ = CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x2_sse3.S ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S 
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse3.S DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse3.S DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse3.S DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse3.S CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse3.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse3.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse3.S ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse3.S CGEMM3MKERNEL = zgemm3m_kernel_4x4_prescott.S ZGEMM3MKERNEL = zgemm3m_kernel_2x4_prescott.S OpenBLAS-0.2.20/kernel/x86/KERNEL.SANDYBRIDGE000066400000000000000000000000431313527062700175030ustar00rootroot00000000000000include $(KERNELDIR)/KERNEL.PENRYN OpenBLAS-0.2.20/kernel/x86/KERNEL.VIAC3000066400000000000000000000000431313527062700165550ustar00rootroot00000000000000include $(KERNELDIR)/KERNEL.ATHLON OpenBLAS-0.2.20/kernel/x86/KERNEL.YONAH000066400000000000000000000043211313527062700166310ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_4x4_sse3.S SGEMMINCOPY = SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x4_sse3.S DGEMMINCOPY = ../generic/gemm_ncopy_2.c DGEMMITCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_2x2_sse3.S CGEMMINCOPY = CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = CGEMMITCOPYOBJ = CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x2_sse3.S ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse3.S DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse3.S DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse3.S DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse3.S CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse3.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse3.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse3.S ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse3.S CGEMM3MKERNEL = zgemm3m_kernel_4x4_prescott.S ZGEMM3MKERNEL = zgemm3m_kernel_2x4_prescott.S OpenBLAS-0.2.20/kernel/x86/KERNEL.ZEN000066400000000000000000000000461313527062700164070ustar00rootroot00000000000000include $(KERNELDIR)/KERNEL.BARCELONA OpenBLAS-0.2.20/kernel/x86/KERNEL.generic000066400000000000000000000107321313527062700173720ustar00rootroot00000000000000SGEMM_BETA = ../generic/gemm_beta.c DGEMM_BETA = 
../generic/gemm_beta.c CGEMM_BETA = ../generic/zgemm_beta.c ZGEMM_BETA = ../generic/zgemm_beta.c STRMMKERNEL = ../generic/trmmkernel_2x2.c DTRMMKERNEL = ../generic/trmmkernel_2x2.c CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c SGEMMKERNEL = ../generic/gemmkernel_2x2.c SGEMMONCOPY = ../generic/gemm_ncopy_2.c SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = ../generic/gemmkernel_2x2.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = ../generic/zgemmkernel_2x2.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #Todo: CGEMM3MKERNEL should be 4x4 blocksizes. 
CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S #Pure C for other kernels SAMAXKERNEL = ../arm/amax.c DAMAXKERNEL = ../arm/amax.c CAMAXKERNEL = ../arm/zamax.c ZAMAXKERNEL = ../arm/zamax.c SAMINKERNEL = ../arm/amin.c DAMINKERNEL = ../arm/amin.c CAMINKERNEL = ../arm/zamin.c ZAMINKERNEL = ../arm/zamin.c SMAXKERNEL = ../arm/max.c DMAXKERNEL = ../arm/max.c SMINKERNEL = ../arm/min.c DMINKERNEL = ../arm/min.c ISAMAXKERNEL = ../arm/iamax.c IDAMAXKERNEL = ../arm/iamax.c ICAMAXKERNEL = ../arm/izamax.c IZAMAXKERNEL = ../arm/izamax.c ISAMINKERNEL = ../arm/iamin.c IDAMINKERNEL = ../arm/iamin.c ICAMINKERNEL = ../arm/izamin.c IZAMINKERNEL = ../arm/izamin.c ISMAXKERNEL = ../arm/imax.c IDMAXKERNEL = ../arm/imax.c ISMINKERNEL = ../arm/imin.c IDMINKERNEL = ../arm/imin.c SASUMKERNEL = ../arm/asum.c DASUMKERNEL = ../arm/asum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = ../arm/zasum.c SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c ZAXPYKERNEL = ../arm/zaxpy.c SCOPYKERNEL = ../arm/copy.c DCOPYKERNEL = ../arm/copy.c CCOPYKERNEL = ../arm/zcopy.c ZCOPYKERNEL = ../arm/zcopy.c SDOTKERNEL = ../arm/dot.c DDOTKERNEL = ../arm/dot.c CDOTKERNEL = ../arm/zdot.c ZDOTKERNEL = ../arm/zdot.c SNRM2KERNEL = ../arm/nrm2.c DNRM2KERNEL = ../arm/nrm2.c CNRM2KERNEL = ../arm/znrm2.c ZNRM2KERNEL = ../arm/znrm2.c SROTKERNEL = ../arm/rot.c DROTKERNEL = ../arm/rot.c CROTKERNEL = ../arm/zrot.c ZROTKERNEL = ../arm/zrot.c SSCALKERNEL = ../arm/scal.c DSCALKERNEL = ../arm/scal.c CSCALKERNEL = ../arm/zscal.c ZSCALKERNEL = ../arm/zscal.c SSWAPKERNEL = ../arm/swap.c DSWAPKERNEL = ../arm/swap.c CSWAPKERNEL = ../arm/zswap.c ZSWAPKERNEL = ../arm/zswap.c SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = ../arm/gemv_n.c CGEMVNKERNEL = ../arm/zgemv_n.c ZGEMVNKERNEL = ../arm/zgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c DGEMVTKERNEL = ../arm/gemv_t.c CGEMVTKERNEL = ../arm/zgemv_t.c ZGEMVTKERNEL = ../arm/zgemv_t.c SSYMV_U_KERNEL = ../generic/symv_k.c SSYMV_L_KERNEL = ../generic/symv_k.c DSYMV_U_KERNEL = ../generic/symv_k.c DSYMV_L_KERNEL = ../generic/symv_k.c QSYMV_U_KERNEL = ../generic/symv_k.c QSYMV_L_KERNEL = ../generic/symv_k.c CSYMV_U_KERNEL = ../generic/zsymv_k.c CSYMV_L_KERNEL = ../generic/zsymv_k.c ZSYMV_U_KERNEL = ../generic/zsymv_k.c ZSYMV_L_KERNEL = ../generic/zsymv_k.c XSYMV_U_KERNEL = ../generic/zsymv_k.c XSYMV_L_KERNEL = ../generic/zsymv_k.c ZHEMV_U_KERNEL = ../generic/zhemv_k.c ZHEMV_L_KERNEL = ../generic/zhemv_k.c CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c OpenBLAS-0.2.20/kernel/x86/Makefile000066400000000000000000000000121313527062700165020ustar00rootroot00000000000000clean :: OpenBLAS-0.2.20/kernel/x86/amax.S000066400000000000000000000133561313527062700161330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 8 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) PROLOGUE #define M %ebx #define INCX %esi #define X %ecx #define I %edx #ifndef USE_MIN #define FMOV fcmovbe #else #define FMOV fcmovnbe #endif #include "l1param.h" pushl %esi pushl %ebx PROFCODE movl STACK_M, M movl STACK_INCX, INCX movl STACK_X, X #ifdef F_INTERFACE movl (M), M movl (INCX), INCX #endif #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif sall $BASE_SHIFT, INCX fldz testl M, M jle .L999 testl INCX, INCX jle .L999 fstp %st(0) FLD (X) #ifdef USE_ABS fabs #endif addl INCX, X decl M jle .L999 cmpl $SIZE, INCX jne .L40 movl M, I sarl $3, I jle .L20 ALIGN_4 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 1 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 2 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 3 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 4 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 5 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 6 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 7 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) addl $8 * SIZE, X decl I jg .L10 ALIGN_4 .L20: movl M, I andl $7, I jle .L999 ALIGN_4 .L21: FLD 0 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) addl $1 * SIZE, X decl I jg .L21 jmp .L999 ALIGN_4 .L40: movl M, I sarl $3, I jle .L60 ALIGN_4 .L50: FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS fabs #endif 
fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) decl I jg .L50 ALIGN_4 .L60: movl M, I andl $7, I jle .L999 ALIGN_4 .L61: FLD 0 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) addl INCX, X decl I jg .L61 ALIGN_4 .L999: popl %ebx popl %esi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/amax_sse.S000066400000000000000000000213051313527062700167760ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define M %ebx #define X %ecx #define INCX %edx #define I %eax #ifdef USE_MIN #define maxps minps #define maxss minss #endif #include "l1param.h" PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX #ifdef F_INTERFACE movl (M), M movl (INCX), INCX #endif xorps %xmm0, %xmm0 leal (, INCX, SIZE), INCX testl M, M jle .L999 #ifdef USE_ABS #ifndef HAVE_SSE2 subl $8, %esp movl $0x7fffffff, (%esp) movss (%esp), %xmm3 shufps $0, %xmm3, %xmm3 addl $8, %esp #else pcmpeqb %xmm3, %xmm3 psrld $1, %xmm3 #endif #endif movss (X), %xmm0 shufps $0, %xmm0, %xmm0 #ifdef USE_ABS andps %xmm3, %xmm0 #endif movaps %xmm0, %xmm1 addl INCX, X decl M jle .L999 cmpl $SIZE, INCX jne .L40 subl $-32 * SIZE, X cmpl $3, M jle .L17 testl $SIZE, X je .L05 movss -32 * SIZE(X), %xmm4 addl $SIZE, X shufps $0, %xmm4, %xmm4 #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxps %xmm4, %xmm0 decl M ALIGN_3 .L05: testl $2 * SIZE, X je .L06 movsd -32 * SIZE(X), %xmm4 addl $2 * SIZE, X unpcklps %xmm4, %xmm4 #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxps %xmm4, %xmm1 subl $2, M ALIGN_3 .L06: movl M, I sarl $5, I jle .L15 movaps -32 * SIZE(X), %xmm4 movaps -28 * SIZE(X), %xmm5 movaps -24 * SIZE(X), %xmm6 movaps -20 * SIZE(X), %xmm7 decl I jle .L12 ALIGN_4 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxps %xmm4, %xmm0 movaps -16 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm3, %xmm5 #endif maxps %xmm5, %xmm1 movaps -12 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm3, %xmm6 #endif maxps %xmm6, %xmm0 movaps -8 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm3, %xmm7 #endif maxps %xmm7, %xmm1 movaps -4 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxps %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm3, %xmm5 #endif maxps %xmm5, %xmm1 movaps 4 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm3, %xmm6 #endif maxps %xmm6, %xmm0 movaps 8 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm3, %xmm7 #endif maxps %xmm7, %xmm1 movaps 12 * SIZE(X), %xmm7 subl $-32 * SIZE, X decl I jg .L11 ALIGN_4 .L12: #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxps %xmm4, %xmm0 movaps -16 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm3, %xmm5 #endif maxps %xmm5, %xmm1 movaps -12 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm3, %xmm6 #endif maxps %xmm6, %xmm0 movaps -8 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm3, %xmm7 #endif maxps %xmm7, %xmm1 movaps -4 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxps %xmm4, %xmm0 #ifdef USE_ABS andps %xmm3, %xmm5 #endif maxps %xmm5, %xmm1 #ifdef USE_ABS andps %xmm3, %xmm6 #endif maxps %xmm6, %xmm0 #ifdef USE_ABS andps %xmm3, %xmm7 #endif maxps %xmm7, %xmm1 subl $-32 * SIZE, X ALIGN_3 .L15: testl $16, M je .L16 movaps -32 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxps %xmm4, %xmm0 movaps -28 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm3, %xmm5 #endif maxps %xmm5, %xmm1 movaps -24 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm3, %xmm6 #endif maxps %xmm6, %xmm0 movaps -20 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm3, %xmm7 #endif maxps %xmm7, %xmm1 addl $16 * SIZE, X ALIGN_3 .L16: testl $8, M je .L17 movaps -32 * SIZE(X), %xmm4 #ifdef USE_ABS 
andps %xmm3, %xmm4 #endif maxps %xmm4, %xmm0 movaps -28 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm3, %xmm5 #endif maxps %xmm5, %xmm1 addl $8 * SIZE, X ALIGN_3 .L17: testl $4, M je .L18 movaps -32 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxps %xmm4, %xmm0 addl $4 * SIZE, X ALIGN_3 .L18: testl $2, M je .L19 movsd -32 * SIZE(X), %xmm4 unpcklps %xmm4, %xmm4 #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxps %xmm4, %xmm1 addl $2 * SIZE, X ALIGN_3 .L19: testl $1, M je .L998 movss -32 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxss %xmm4, %xmm0 jmp .L998 ALIGN_3 .L40: movl M, I sarl $3, I jle .L45 ALIGN_4 .L41: movss (X), %xmm4 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxss %xmm4, %xmm0 movss (X), %xmm5 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm5 #endif maxss %xmm5, %xmm1 movss (X), %xmm6 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm6 #endif maxss %xmm6, %xmm0 movss (X), %xmm7 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm7 #endif maxss %xmm7, %xmm1 movss (X), %xmm4 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxss %xmm4, %xmm0 movss (X), %xmm5 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm5 #endif maxss %xmm5, %xmm1 movss (X), %xmm6 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm6 #endif maxss %xmm6, %xmm0 movss (X), %xmm7 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm7 #endif maxss %xmm7, %xmm1 decl I jg .L41 ALIGN_4 .L45: testl $4, M je .L46 movss (X), %xmm4 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxss %xmm4, %xmm0 movss (X), %xmm5 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm5 #endif maxss %xmm5, %xmm1 movss (X), %xmm6 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm6 #endif maxss %xmm6, %xmm0 movss (X), %xmm7 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm7 #endif maxss %xmm7, %xmm1 ALIGN_3 .L46: testl $2, M je .L47 movss (X), %xmm4 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxss %xmm4, %xmm0 movss (X), %xmm5 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm5 #endif maxss %xmm5, %xmm1 ALIGN_3 .L47: testl $1, M je .L998 movss (X), %xmm4 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxss %xmm4, %xmm0 ALIGN_4 .L998: maxps %xmm1, %xmm0 movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 maxps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 maxss %xmm1, %xmm0 ALIGN_4 .L999: subl $8, %esp movss %xmm0, (%esp) flds (%esp) addl $8, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/amax_sse2.S000066400000000000000000000220261313527062700170610ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define M %ebx #define X %ecx #define INCX %edx #define I %eax #ifdef USE_MIN #define maxpd minpd #define maxsd minsd #endif #include "l1param.h" PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX xorps %xmm0, %xmm0 leal (, INCX, SIZE), INCX testl M, M jle .L999 #ifdef USE_ABS pcmpeqb %xmm3, %xmm3 psrlq $1, %xmm3 #endif movsd (X), %xmm0 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm0 #endif unpcklpd %xmm0, %xmm0 movaps %xmm0, %xmm1 decl M jle .L999 cmpl $SIZE, INCX jne .L40 subl $-16 * SIZE, X testl $SIZE, X je .L05 movsd -16 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm3, %xmm4 #endif unpcklpd %xmm4, %xmm4 maxpd %xmm4, %xmm0 addl $SIZE, X decl M jle .L998 ALIGN_3 .L05: movl M, I sarl $4, I jle .L15 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -12 * SIZE(X), %xmm6 movaps -10 * SIZE(X), %xmm7 decl I jle .L12 ALIGN_4 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxpd %xmm4, %xmm0 movaps -8 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm3, %xmm5 #endif maxpd %xmm5, %xmm1 movaps -6 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm3, %xmm6 #endif maxpd %xmm6, %xmm0 movaps -4 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm3, %xmm7 #endif maxpd %xmm7, %xmm1 movaps -2 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxpd %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm3, %xmm5 #endif maxpd %xmm5, %xmm1 movaps 2 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm3, %xmm6 #endif maxpd %xmm6, %xmm0 movaps 4 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm3, %xmm7 #endif maxpd %xmm7, %xmm1 movaps 6 * SIZE(X), %xmm7 subl $-16 * SIZE, X decl I jg .L11 ALIGN_4 .L12: #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxpd %xmm4, %xmm0 movaps -8 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm3, %xmm5 #endif maxpd %xmm5, %xmm1 movaps -6 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm3, %xmm6 #endif maxpd %xmm6, %xmm0 movaps -4 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm3, %xmm7 #endif maxpd %xmm7, %xmm1 movaps -2 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxpd %xmm4, %xmm0 #ifdef USE_ABS andps %xmm3, %xmm5 #endif maxpd %xmm5, %xmm1 #ifdef USE_ABS andps %xmm3, %xmm6 #endif maxpd %xmm6, %xmm0 #ifdef USE_ABS andps %xmm3, %xmm7 
#endif maxpd %xmm7, %xmm1 subl $-16 * SIZE, X ALIGN_4 .L15: testl $8, M jle .L16 movaps -16 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxpd %xmm4, %xmm0 movaps -14 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm3, %xmm5 #endif maxpd %xmm5, %xmm1 movaps -12 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm3, %xmm6 #endif maxpd %xmm6, %xmm0 movaps -10 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm3, %xmm7 #endif maxpd %xmm7, %xmm1 addl $8 * SIZE, X ALIGN_3 .L16: testl $4, M jle .L17 movaps -16 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxpd %xmm4, %xmm0 movaps -14 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm3, %xmm5 #endif maxpd %xmm5, %xmm1 addl $4 * SIZE, X ALIGN_3 .L17: testl $2, M jle .L18 movaps -16 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxpd %xmm4, %xmm0 addl $2 * SIZE, X ALIGN_3 .L18: testl $1, M jle .L998 movsd -16 * SIZE(X), %xmm4 unpcklpd %xmm4, %xmm4 #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxpd %xmm4, %xmm1 jmp .L998 ALIGN_3 .L40: movl M, I sarl $4, I jle .L45 ALIGN_4 .L41: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd (X), %xmm4 addl INCX, X movhps (X), %xmm4 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxpd %xmm4, %xmm0 movsd (X), %xmm5 addl INCX, X movhps (X), %xmm5 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm5 #endif maxpd %xmm5, %xmm1 movsd (X), %xmm6 addl INCX, X movhps (X), %xmm6 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm6 #endif maxpd %xmm6, %xmm0 movsd (X), %xmm7 addl INCX, X movhps (X), %xmm7 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm7 #endif maxpd %xmm7, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movsd (X), %xmm4 addl INCX, X movhps (X), %xmm4 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxpd %xmm4, %xmm0 movsd (X), %xmm5 addl INCX, X movhps (X), %xmm5 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm5 #endif maxpd %xmm5, %xmm1 movsd (X), %xmm6 addl INCX, X movhps (X), %xmm6 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm6 #endif maxpd %xmm6, %xmm0 movsd (X), %xmm7 addl INCX, X movhps (X), %xmm7 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm7 #endif maxpd %xmm7, %xmm1 decl I jg .L41 ALIGN_4 .L45: andl $15, M jle .L998 testl $8, M je .L46 movsd (X), %xmm4 addl INCX, X movhps (X), %xmm4 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxpd %xmm4, %xmm0 movsd (X), %xmm5 addl INCX, X movhps (X), %xmm5 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm5 #endif maxpd %xmm5, %xmm1 movsd (X), %xmm6 addl INCX, X movhps (X), %xmm6 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm6 #endif maxpd %xmm6, %xmm0 movsd (X), %xmm7 addl INCX, X movhps (X), %xmm7 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm7 #endif maxpd %xmm7, %xmm1 ALIGN_3 .L46: testl $4, M je .L47 movsd (X), %xmm4 addl INCX, X movhps (X), %xmm4 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm4 #endif maxpd %xmm4, %xmm0 movsd (X), %xmm5 addl INCX, X movhps (X), %xmm5 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm5 #endif maxpd %xmm5, %xmm1 ALIGN_3 .L47: testl $2, M je .L48 movsd (X), %xmm6 addl INCX, X movhps (X), %xmm6 addl INCX, X #ifdef USE_ABS andps %xmm3, %xmm6 #endif maxpd %xmm6, %xmm0 ALIGN_3 .L48: testl $1, M je .L998 movsd (X), %xmm7 unpcklpd %xmm7, %xmm7 #ifdef USE_ABS andps %xmm3, %xmm7 #endif maxpd %xmm7, %xmm1 ALIGN_4 .L998: maxpd %xmm1, %xmm0 movaps %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 maxsd %xmm1, %xmm0 ALIGN_4 .L999: subl $8, %esp movsd %xmm0, (%esp) fldl (%esp) addl $8, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE 
OpenBLAS-0.2.20/kernel/x86/asum.S000066400000000000000000000111341313527062700161420ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 8 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define M %edx #define X %ecx #define INCX %esi #define I %eax #include "l1param.h" PROLOGUE pushl %esi pushl %ebx PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX #ifdef F_INTERFACE movl (M), M movl (INCX), INCX #endif fldz testl M, M jle .L999 testl INCX, INCX jle .L999 sall $BASE_SHIFT, INCX fldz fldz fldz cmpl $SIZE, INCX jne .L40 movl M, I sarl $3, I jle .L20 ALIGN_4 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs FLD 2 * SIZE(X) fabs FLD 3 * SIZE(X) fabs faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) FLD 4 * SIZE(X) fabs FLD 5 * SIZE(X) fabs FLD 6 * SIZE(X) fabs FLD 7 * SIZE(X) fabs addl $8 * SIZE, X faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) decl I jg .L10 ALIGN_4 .L20: movl M, I andl $7, I jle .L998 ALIGN_4 .L21: FLD (X) fabs faddp %st,%st(1) addl $1 * SIZE, X decl I jg .L21 jmp .L998 ALIGN_4 .L40: movl M, I sarl $3, I jle .L60 ALIGN_4 .L50: FLD (X) addl INCX, X fabs FLD (X) addl INCX, X fabs FLD (X) addl INCX, X fabs FLD (X) addl INCX, X fabs faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) FLD (X) addl INCX, X fabs FLD (X) addl INCX, X fabs FLD (X) addl INCX, X fabs FLD (X) addl INCX, X fabs faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) decl I jg .L50 ALIGN_4 .L60: movl M, I andl $7, I jle .L998 ALIGN_4 .L61: FLD (X) addl INCX, X fabs faddp %st,%st(1) decl I jg .L61 ALIGN_4 .L998: faddp %st,%st(2) faddp %st,%st(1) faddp %st,%st(1) ALIGN_4 .L999: popl %ebx popl %esi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/asum_sse.S000066400000000000000000000157751313527062700170330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 8 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define I %eax #define M %ecx #define X %esi #define INCX %ebx #include "l1param.h" PROLOGUE PROFCODE pushl %esi pushl %ebx movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX xorps %xmm0, %xmm0 testl M, M jle .L999 testl INCX, INCX jle .L999 xorps %xmm1, %xmm1 #ifdef HAVE_SSE2 pcmpeqb %xmm3, %xmm3 psrld $1, %xmm3 #else movl $0x7fffffff, STACK_M movss STACK_M, %xmm3 shufps $0, %xmm3, %xmm3 #endif leal (, INCX, SIZE), INCX cmpl $SIZE, INCX jne .L100 subl $-32 * SIZE, X cmpl $3, M jle .L18 testl $4, X je .L05 movss -32 * SIZE(X), %xmm0 andps %xmm3, %xmm0 addl $SIZE, X decl M jle .L998 ALIGN_3 .L05: testl $8, X je .L10 movsd -32 * SIZE(X), %xmm1 andps %xmm3, %xmm1 addl $2 * SIZE, X subl $2, M jle .L998 ALIGN_3 .L10: movl M, I sarl $5, I jle .L14 movaps -32 * SIZE(X), %xmm4 movaps -28 * SIZE(X), %xmm5 movaps -24 * SIZE(X), %xmm6 movaps -20 * SIZE(X), %xmm7 decl I jle .L12 ALIGN_3 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif andps %xmm3, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(X), %xmm4 andps %xmm3, %xmm5 addps %xmm5, %xmm1 movaps -12 * SIZE(X), %xmm5 andps %xmm3, %xmm6 addps %xmm6, %xmm0 movaps -8 * SIZE(X), %xmm6 andps %xmm3, %xmm7 addps %xmm7, %xmm1 movaps -4 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif andps %xmm3, %xmm4 addps %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 andps %xmm3, %xmm5 addps %xmm5, %xmm1 movaps 4 * SIZE(X), %xmm5 andps %xmm3, %xmm6 addps %xmm6, %xmm0 movaps 8 * SIZE(X), %xmm6 andps %xmm3, %xmm7 addps %xmm7, %xmm1 movaps 12 * SIZE(X), %xmm7 subl $-32 * SIZE, X decl I jg .L11 ALIGN_3 .L12: andps %xmm3, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(X), %xmm4 andps %xmm3, %xmm5 addps %xmm5, %xmm1 movaps -12 * SIZE(X), %xmm5 andps %xmm3, %xmm6 addps %xmm6, %xmm0 movaps -8 * SIZE(X), %xmm6 andps %xmm3, %xmm7 addps %xmm7, %xmm1 movaps -4 * SIZE(X), %xmm7 andps %xmm3, %xmm4 addps %xmm4, %xmm0 andps %xmm3, %xmm5 addps %xmm5, %xmm1 andps %xmm3, %xmm6 addps %xmm6, %xmm0 andps %xmm3, %xmm7 addps %xmm7, %xmm1 subl $-32 * SIZE, X ALIGN_3 .L14: testl $16, M je .L16 movaps -32 * SIZE(X), %xmm4 andps %xmm3, %xmm4 addps %xmm4, %xmm0 movaps -28 * SIZE(X), %xmm5 andps %xmm3, %xmm5 addps %xmm5, %xmm1 movaps -24 * SIZE(X), %xmm6 andps %xmm3, %xmm6 addps %xmm6, %xmm0 movaps -20 * SIZE(X), %xmm7 andps %xmm3, %xmm7 addps %xmm7, %xmm1 addl $16 * SIZE, X ALIGN_3 .L16: testl $8, M je .L17 movaps -32 * SIZE(X), %xmm4 andps 
%xmm3, %xmm4 addps %xmm4, %xmm0 movaps -28 * SIZE(X), %xmm5 andps %xmm3, %xmm5 addps %xmm5, %xmm1 addl $8 * SIZE, X ALIGN_3 .L17: testl $4, M je .L18 movaps -32 * SIZE(X), %xmm4 andps %xmm3, %xmm4 addps %xmm4, %xmm0 addl $4 * SIZE, X ALIGN_3 .L18: testl $2, M je .L19 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 andps %xmm3, %xmm4 addps %xmm4, %xmm1 addl $2 * SIZE, X ALIGN_3 .L19: testl $1, M je .L998 movss -32 * SIZE(X), %xmm4 andps %xmm3, %xmm4 addps %xmm4, %xmm0 jmp .L998 ALIGN_4 .L100: movl M, I sarl $3, I jle .L105 ALIGN_4 .L101: movss (X), %xmm4 addl INCX, X andps %xmm3, %xmm4 addss %xmm4, %xmm0 movss (X), %xmm5 addl INCX, X andps %xmm3, %xmm5 addss %xmm5, %xmm1 movss (X), %xmm6 addl INCX, X andps %xmm3, %xmm6 addss %xmm6, %xmm0 movss (X), %xmm7 addl INCX, X andps %xmm3, %xmm7 addss %xmm7, %xmm1 movss (X), %xmm4 addl INCX, X andps %xmm3, %xmm4 addss %xmm4, %xmm0 movss (X), %xmm5 addl INCX, X andps %xmm3, %xmm5 addss %xmm5, %xmm1 movss (X), %xmm6 addl INCX, X andps %xmm3, %xmm6 addss %xmm6, %xmm0 movss (X), %xmm7 addl INCX, X andps %xmm3, %xmm7 addss %xmm7, %xmm1 decl I jg .L101 ALIGN_4 .L105: andl $7, M jle .L998 ALIGN_4 .L106: movss (X), %xmm4 andps %xmm3, %xmm4 addss %xmm4, %xmm0 addl INCX, X decl M jg .L106 ALIGN_4 .L998: addps %xmm1, %xmm0 #ifndef HAVE_SSE3 movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 addss %xmm1, %xmm0 #else haddps %xmm0, %xmm0 haddps %xmm0, %xmm0 #endif ALIGN_4 .L999: movss %xmm0, STACK_M flds STACK_M popl %ebx popl %esi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/asum_sse2.S000066400000000000000000000150261313527062700171020ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
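
   ------------------------------------------------------------------
   Descriptive note (added for readability; not part of the original
   sources): this file is the SSE2 double-precision ASUM kernel.  It
   takes a length m, a vector x and a stride incx from the stack
   (STACK_M / STACK_X / STACK_INCX below) and returns the sum of
   |x[i*incx]| for i = 0 .. m-1, clearing the sign bits with the
   pcmpeqb + psrlq all-ones-shifted mask instead of calling fabs,
   accumulating in two xmm registers, and handing the result back on
   the x87 stack as the i386 ABI expects.  A minimal scalar C sketch
   of the same contract -- the function name is illustrative only,
   not the exported kernel symbol:

       double asum_ref(long m, const double *x, long incx)
       {
           double s = 0.0;
           long i;
           if (m <= 0 || incx <= 0)     // mirrors the early .L999 exits
               return 0.0;
           for (i = 0; i < m; i++)
               s += x[i * incx] < 0.0 ? -x[i * incx] : x[i * incx];
           return s;
       }
   ------------------------------------------------------------------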
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 8 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define I %eax #define M %ecx #define X %esi #define INCX %ebx #include "l1param.h" PROLOGUE PROFCODE pushl %esi pushl %ebx movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 testl M, M jle .L999 testl INCX, INCX jle .L999 pcmpeqb %xmm3, %xmm3 psrlq $1, %xmm3 sall $BASE_SHIFT, INCX subl $-16 * SIZE, X cmpl $SIZE, INCX jne .L40 testl $SIZE, X je .L05 movsd -16 * SIZE(X), %xmm0 addl $SIZE, X andps %xmm3, %xmm0 subl $1, M jle .L999 ALIGN_3 .L05: movl M, I sarl $4, I jle .L20 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -12 * SIZE(X), %xmm6 movaps -10 * SIZE(X), %xmm7 decl I jle .L11 ALIGN_4 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif andps %xmm3, %xmm4 addpd %xmm4, %xmm0 movaps -8 * SIZE(X), %xmm4 andps %xmm3, %xmm5 addpd %xmm5, %xmm1 movaps -6 * SIZE(X), %xmm5 andps %xmm3, %xmm6 addpd %xmm6, %xmm0 movaps -4 * SIZE(X), %xmm6 andps %xmm3, %xmm7 addpd %xmm7, %xmm1 movaps -2 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif andps %xmm3, %xmm4 addpd %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 andps %xmm3, %xmm5 addpd %xmm5, %xmm1 movaps 2 * SIZE(X), %xmm5 andps %xmm3, %xmm6 addpd %xmm6, %xmm0 movaps 4 * SIZE(X), %xmm6 andps %xmm3, %xmm7 addpd %xmm7, %xmm1 movaps 6 * SIZE(X), %xmm7 subl $-16 * SIZE, X decl I jg .L10 ALIGN_4 .L11: andps %xmm3, %xmm4 addpd %xmm4, %xmm0 movaps -8 * SIZE(X), %xmm4 andps %xmm3, %xmm5 addpd %xmm5, %xmm1 movaps -6 * SIZE(X), %xmm5 andps %xmm3, %xmm6 addpd %xmm6, %xmm0 movaps -4 * SIZE(X), %xmm6 andps %xmm3, %xmm7 addpd %xmm7, %xmm1 movaps -2 * SIZE(X), %xmm7 andps %xmm3, %xmm4 addpd %xmm4, %xmm0 andps %xmm3, %xmm5 addpd %xmm5, %xmm1 andps %xmm3, %xmm6 addpd %xmm6, %xmm0 andps %xmm3, %xmm7 addpd %xmm7, %xmm1 subl $-16 * SIZE, X ALIGN_3 .L20: andl $15, M jle .L999 testl $8, M je .L21 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -12 * SIZE(X), %xmm6 movaps -10 * SIZE(X), %xmm7 andps %xmm3, %xmm4 addpd %xmm4, %xmm0 andps %xmm3, %xmm5 addpd %xmm5, %xmm1 andps %xmm3, %xmm6 addpd %xmm6, %xmm0 andps %xmm3, %xmm7 addpd %xmm7, %xmm1 addl $8 * SIZE, X ALIGN_3 .L21: testl $4, M je .L22 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 andps %xmm3, %xmm4 addpd %xmm4, %xmm0 andps %xmm3, %xmm5 addpd %xmm5, %xmm1 addl $4 * SIZE, X ALIGN_3 .L22: testl $2, M je .L23 movaps -16 * SIZE(X), %xmm4 andps %xmm3, %xmm4 addpd %xmm4, %xmm0 addl $2 * SIZE, X .L23: testl $1, M je .L999 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -16 * SIZE(X), %xmm4 andps %xmm3, %xmm4 addsd %xmm4, %xmm1 jmp .L999 ALIGN_3 .L40: movl M, I sarl $3, I jle .L60 ALIGN_4 .L50: movsd -16 * SIZE(X), %xmm4 addl INCX, X movhps -16 * SIZE(X), %xmm4 addl INCX, X andps %xmm3, %xmm4 addpd %xmm4, %xmm0 movsd -16 * SIZE(X), %xmm5 addl INCX, X movhps -16 * SIZE(X), %xmm5 addl INCX, X andps %xmm3, %xmm5 addpd %xmm5, %xmm1 movsd -16 * SIZE(X), %xmm6 addl INCX, X movhps -16 * SIZE(X), %xmm6 addl INCX, X andps %xmm3, %xmm6 addpd %xmm6, %xmm0 movsd -16 * SIZE(X), %xmm7 addl INCX, X movhps -16 * SIZE(X), %xmm7 addl INCX, X andps %xmm3, %xmm7 addpd %xmm7, %xmm1 decl I jg .L50 ALIGN_4 .L60: #ifdef movsd xorps %xmm4, %xmm4 #endif andl $7, M jle .L999 ALIGN_4 .L61: movsd -16 * SIZE(X), %xmm4 andps %xmm3, %xmm4 addsd 
%xmm4, %xmm0 addl INCX, X decl M jg .L61 ALIGN_4 .L999: addpd %xmm1, %xmm0 #ifndef HAVE_SSE3 movaps %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 addsd %xmm1, %xmm0 #else haddpd %xmm0, %xmm0 #endif movsd %xmm0, STACK_M fldl STACK_M popl %ebx popl %esi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/axpy.S000066400000000000000000000124101313527062700161540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
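
   ------------------------------------------------------------------
   Descriptive note (added for readability; not part of the original
   sources): this is the generic x87 AXPY kernel.  The FLD/FADD/FST
   macros and SIZE expand to single- or double-precision forms
   depending on whether DOUBLE is defined, so the one source covers
   both SAXPY and DAXPY.  A minimal C sketch of the operation -- the
   name is illustrative only, and the double type stands in for
   whichever precision the kernel is built for:

       void axpy_ref(long m, double alpha,
                     const double *x, long incx,
                     double *y, long incy)
       {
           long i;
           if (m <= 0)
               return;              // the kernel exits early on m <= 0
           for (i = 0; i < m; i++)
               y[i * incy] += alpha * x[i * incx];
       }

   The unit-stride path below unrolls this loop by eight, and the
   strided path by four, with optional prefetch hints.
   ------------------------------------------------------------------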
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_ALPHA 16 + STACK + ARGS(%esp) #ifdef DOUBLE #define STACK_X 24 + STACK + ARGS(%esp) #define STACK_INCX 28 + STACK + ARGS(%esp) #define STACK_Y 32 + STACK + ARGS(%esp) #define STACK_INCY 36 + STACK + ARGS(%esp) #else #define STACK_X 20 + STACK + ARGS(%esp) #define STACK_INCX 24 + STACK + ARGS(%esp) #define STACK_Y 28 + STACK + ARGS(%esp) #define STACK_INCY 32 + STACK + ARGS(%esp) #endif #define M %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx PROLOGUE pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif FLD STACK_ALPHA movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY leal (, INCX, SIZE), INCX leal (, INCY, SIZE), INCY testl M, M jle .L40 cmpl $SIZE, INCX jne .L14 cmpl $SIZE, INCY jne .L14 movl M, %eax sarl $3, %eax jle .L15 ALIGN_3 #define PRESIZE 33 .L16: #ifdef HAS_PREFETCH prefetcht0 PRESIZE * SIZE(X) #endif FLD 0 * SIZE(X) fmul %st(1),%st FADD 0 * SIZE(Y) FST 0 * SIZE(Y) FLD 1 * SIZE(X) fmul %st(1),%st FADD 1 * SIZE(Y) FST 1 * SIZE(Y) FLD 2 * SIZE(X) fmul %st(1),%st FADD 2 * SIZE(Y) FST 2 * SIZE(Y) FLD 3 * SIZE(X) fmul %st(1),%st FADD 3 * SIZE(Y) FST 3 * SIZE(Y) #ifdef HAS_PREFETCH prefetcht0 (4 + PRESIZE) * SIZE(X) #endif FLD 4 * SIZE(X) fmul %st(1),%st FADD 4 * SIZE(Y) FST 4 * SIZE(Y) FLD 5 * SIZE(X) fmul %st(1),%st FADD 5 * SIZE(Y) FST 5 * SIZE(Y) FLD 6 * SIZE(X) fmul %st(1),%st FADD 6 * SIZE(Y) FST 6 * SIZE(Y) FLD 7 * SIZE(X) fmul %st(1),%st FADD 7 * SIZE(Y) FST 7 * SIZE(Y) #ifdef HAVE_3DNOW prefetchw 24 * SIZE(Y) #endif addl $8 * SIZE, X addl $8 * SIZE, Y decl %eax jg .L16 ALIGN_3 .L15: movl M, %eax andl $7, %eax jle .L40 ALIGN_3 .L22: FLD 0 * SIZE(X) fmul %st(1),%st FADD 0 * SIZE(Y) FST 0 * SIZE(Y) addl $SIZE, X addl $SIZE, Y decl %eax jg .L22 jmp .L40 ALIGN_3 .L14: movl M, %eax sarl $2, %eax jle .L28 ALIGN_3 .L29: FLD (X) fmul %st(1),%st FADD (Y) FST (Y) addl INCX, X addl INCY, Y FLD (X) fmul %st(1),%st FADD (Y) FST (Y) addl INCX, X addl INCY, Y FLD (X) fmul %st(1),%st FADD (Y) FST (Y) addl INCX, X addl INCY, Y FLD (X) fmul %st(1),%st FADD (Y) FST (Y) addl INCX, X addl INCY, Y decl %eax jg .L29 ALIGN_3 .L28: movl M, %eax andl $3, %eax jle .L40 ALIGN_3 .L35: FLD (X) fmul %st(1),%st FADD (Y) FST (Y) addl INCX, X addl INCY, Y decl %eax jg .L35 ALIGN_3 .L40: ffreep %st(0) xorl %eax,%eax popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/axpy_sse.S000066400000000000000000000671661313527062700170500ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_ALPHA 16 + STACK + ARGS(%esp) #define STACK_X 20 + STACK + ARGS(%esp) #define STACK_INCX 24 + STACK + ARGS(%esp) #define STACK_Y 28 + STACK + ARGS(%esp) #define STACK_INCY 32 + STACK + ARGS(%esp) #define M %ebx #define X %esi #define Y %edi #define INCX %ecx #define INCY %edx #define YY %ebp #define ALPHA %xmm7 #include "l1param.h" PROLOGUE PROFCODE pushl %edi pushl %esi pushl %ebx pushl %ebp movl STACK_M, M movss STACK_ALPHA, ALPHA movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY shufps $0, ALPHA, ALPHA leal (, INCX, SIZE), INCX leal (, INCY, SIZE), INCY testl M, M jle .L19 cmpl $SIZE, INCX jne .L50 cmpl $SIZE, INCY jne .L50 subl $-32 * SIZE, X subl $-32 * SIZE, Y cmpl $3, M jle .L16 testl $SIZE, Y je .L00 movss -32 * SIZE(X), %xmm0 mulss ALPHA, %xmm0 addss -32 * SIZE(Y), %xmm0 movss %xmm0, -32 * SIZE(Y) addl $1 * SIZE, X addl $1 * SIZE, Y decl M jle .L19 ALIGN_3 .L00: testl $SIZE * 2, Y je .L10 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm4 mulps ALPHA, %xmm0 addps %xmm4, %xmm0 movsd %xmm0, -32 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y subl $2, M jle .L19 ALIGN_3 .L10: testl $SIZE * 3, X jne .L20 movl M, %eax sarl $5, %eax jle .L13 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 decl %eax jle .L12 ALIGN_4 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -16 * SIZE(X), %xmm0 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -12 * SIZE(X), %xmm1 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps -8 * SIZE(X), %xmm2 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps -4 * SIZE(X), %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif mulps ALPHA, %xmm0 addps -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps 0 * SIZE(X), %xmm0 mulps ALPHA, %xmm1 addps -12 * SIZE(Y), %xmm1 movaps %xmm1, -12 * SIZE(Y) movaps 4 * SIZE(X), %xmm1 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 
64) - PREOFFSET(Y) #endif mulps ALPHA, %xmm2 addps -8 * SIZE(Y), %xmm2 movaps %xmm2, -8 * SIZE(Y) movaps 8 * SIZE(X), %xmm2 mulps ALPHA, %xmm3 addps -4 * SIZE(Y), %xmm3 movaps %xmm3, -4 * SIZE(Y) movaps 12 * SIZE(X), %xmm3 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L11 ALIGN_3 .L12: mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -16 * SIZE(X), %xmm0 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -12 * SIZE(X), %xmm1 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps -8 * SIZE(X), %xmm2 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps -4 * SIZE(X), %xmm3 mulps ALPHA, %xmm0 addps -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) mulps ALPHA, %xmm1 addps -12 * SIZE(Y), %xmm1 movaps %xmm1, -12 * SIZE(Y) mulps ALPHA, %xmm2 addps -8 * SIZE(Y), %xmm2 movaps %xmm2, -8 * SIZE(Y) mulps ALPHA, %xmm3 addps -4 * SIZE(Y), %xmm3 movaps %xmm3, -4 * SIZE(Y) subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L13: movl M, %eax andl $16, %eax jle .L14 ALIGN_3 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, -24 * SIZE(Y) movaps %xmm3, -20 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L14: movl M, %eax andl $8, %eax jle .L15 ALIGN_3 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L15: movl M, %eax andl $4, %eax jle .L16 ALIGN_3 movaps -32 * SIZE(X), %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L16: movl M, %eax andl $2, %eax jle .L17 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm4 mulps ALPHA, %xmm0 addps %xmm4, %xmm0 movsd %xmm0, -32 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L17: movl M, %eax andl $1, %eax jle .L19 ALIGN_3 movss -32 * SIZE(X), %xmm0 mulss ALPHA, %xmm0 addss -32 * SIZE(Y), %xmm0 movss %xmm0, -32 * SIZE(Y) ALIGN_3 .L19: popl %ebp popl %ebx popl %esi popl %edi ret ALIGN_3 .L20: #ifdef ALIGNED_ACCESS testl $SIZE, X jne .L30 movhps -32 * SIZE(X), %xmm0 movl M, %eax sarl $5, %eax jle .L23 movaps -30 * SIZE(X), %xmm1 movaps -26 * SIZE(X), %xmm2 movaps -22 * SIZE(X), %xmm3 decl %eax jle .L22 ALIGN_4 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif SHUFPD_1 %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -18 * SIZE(X), %xmm0 SHUFPD_1 %xmm2, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -14 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif SHUFPD_1 %xmm3, %xmm2 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps -10 * SIZE(X), %xmm2 SHUFPD_1 %xmm0, %xmm3 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps -6 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif SHUFPD_1 %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -2 * SIZE(X), %xmm0 
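/* Added note: this is the middle of the .L21 main loop for the case
   where (under ALIGNED_ACCESS) X sits 8 bytes away from the 16-byte
   alignment that Y already has.  Each SHUFPD_1 splices the high half
   of the previous aligned load of X with the low half of the next
   one, so both the loads from X and the addps reads of Y stay on
   aligned 16-byte addresses while the loop streams 32 floats per
   iteration. */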
SHUFPD_1 %xmm2, %xmm1 mulps ALPHA, %xmm1 addps -12 * SIZE(Y), %xmm1 movaps %xmm1, -12 * SIZE(Y) movaps 2 * SIZE(X), %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif SHUFPD_1 %xmm3, %xmm2 mulps ALPHA, %xmm2 addps -8 * SIZE(Y), %xmm2 movaps %xmm2, -8 * SIZE(Y) movaps 6 * SIZE(X), %xmm2 SHUFPD_1 %xmm0, %xmm3 mulps ALPHA, %xmm3 addps -4 * SIZE(Y), %xmm3 movaps %xmm3, -4 * SIZE(Y) movaps 10 * SIZE(X), %xmm3 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L22: SHUFPD_1 %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -18 * SIZE(X), %xmm0 SHUFPD_1 %xmm2, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -14 * SIZE(X), %xmm1 SHUFPD_1 %xmm3, %xmm2 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps -10 * SIZE(X), %xmm2 SHUFPD_1 %xmm0, %xmm3 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps -6 * SIZE(X), %xmm3 SHUFPD_1 %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -2 * SIZE(X), %xmm0 SHUFPD_1 %xmm2, %xmm1 mulps ALPHA, %xmm1 addps -12 * SIZE(Y), %xmm1 movaps %xmm1, -12 * SIZE(Y) SHUFPD_1 %xmm3, %xmm2 mulps ALPHA, %xmm2 addps -8 * SIZE(Y), %xmm2 movaps %xmm2, -8 * SIZE(Y) SHUFPD_1 %xmm0, %xmm3 mulps ALPHA, %xmm3 addps -4 * SIZE(Y), %xmm3 movaps %xmm3, -4 * SIZE(Y) subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L23: movl M, %eax andl $16, %eax jle .L24 ALIGN_3 movaps -30 * SIZE(X), %xmm1 movaps -26 * SIZE(X), %xmm2 movaps -22 * SIZE(X), %xmm3 movaps -18 * SIZE(X), %xmm4 SHUFPD_1 %xmm1, %xmm0 SHUFPD_1 %xmm2, %xmm1 SHUFPD_1 %xmm3, %xmm2 SHUFPD_1 %xmm4, %xmm3 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, -24 * SIZE(Y) movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, %xmm0 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L24: movl M, %eax andl $8, %eax jle .L25 ALIGN_3 movaps -30 * SIZE(X), %xmm1 movaps -26 * SIZE(X), %xmm2 SHUFPD_1 %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 SHUFPD_1 %xmm2, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L25: movl M, %eax andl $4, %eax jle .L26 ALIGN_3 movaps -30 * SIZE(X), %xmm1 SHUFPD_1 %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L26: movl M, %eax andl $2, %eax jle .L27 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm4 mulps ALPHA, %xmm0 addps %xmm4, %xmm0 movsd %xmm0, -32 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L27: movl M, %eax andl $1, %eax jle .L29 ALIGN_3 movss -32 * SIZE(X), %xmm0 mulss ALPHA, %xmm0 addss -32 * SIZE(Y), %xmm0 movss %xmm0, -32 * SIZE(Y) addl $SIZE, Y ALIGN_3 .L29: popl %ebp popl %ebx popl %esi popl %edi ret ALIGN_3 .L30: testl $2 * SIZE, X jne .L40 movaps -33 * SIZE(X), %xmm0 movl M, %eax sarl $5, %eax jle .L33 movaps -29 * SIZE(X), %xmm1 movaps -25 * SIZE(X), %xmm2 movaps -21 * SIZE(X), %xmm3 decl %eax jle .L32 ALIGN_4 .L31: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -17 * SIZE(X), %xmm0 movss 
%xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -13 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps -9 * SIZE(X), %xmm2 movss %xmm0, %xmm3 SHUFPS_39 %xmm3, %xmm3 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps -5 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 mulps ALPHA, %xmm0 addps -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -1 * SIZE(X), %xmm0 movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 mulps ALPHA, %xmm1 addps -12 * SIZE(Y), %xmm1 movaps %xmm1, -12 * SIZE(Y) movaps 3 * SIZE(X), %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 mulps ALPHA, %xmm2 addps -8 * SIZE(Y), %xmm2 movaps %xmm2, -8 * SIZE(Y) movaps 7 * SIZE(X), %xmm2 movss %xmm0, %xmm3 SHUFPS_39 %xmm3, %xmm3 mulps ALPHA, %xmm3 addps -4 * SIZE(Y), %xmm3 movaps %xmm3, -4 * SIZE(Y) movaps 11 * SIZE(X), %xmm3 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L31 ALIGN_3 .L32: movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -17 * SIZE(X), %xmm0 movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -13 * SIZE(X), %xmm1 movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps -9 * SIZE(X), %xmm2 movss %xmm0, %xmm3 SHUFPS_39 %xmm3, %xmm3 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps -5 * SIZE(X), %xmm3 movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 mulps ALPHA, %xmm0 addps -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -1 * SIZE(X), %xmm0 movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 mulps ALPHA, %xmm1 addps -12 * SIZE(Y), %xmm1 movaps %xmm1, -12 * SIZE(Y) movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 mulps ALPHA, %xmm2 addps -8 * SIZE(Y), %xmm2 movaps %xmm2, -8 * SIZE(Y) movss %xmm0, %xmm3 SHUFPS_39 %xmm3, %xmm3 mulps ALPHA, %xmm3 addps -4 * SIZE(Y), %xmm3 movaps %xmm3, -4 * SIZE(Y) subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L33: movl M, %eax andl $16, %eax jle .L34 ALIGN_3 movaps -29 * SIZE(X), %xmm1 movaps -25 * SIZE(X), %xmm2 movaps -21 * SIZE(X), %xmm3 movaps -17 * SIZE(X), %xmm4 movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movss %xmm4, %xmm3 SHUFPS_39 %xmm3, %xmm3 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, -24 * SIZE(Y) movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, %xmm0 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L34: movl M, %eax andl $8, %eax jle .L35 ALIGN_3 movaps -29 * SIZE(X), %xmm1 movaps -25 * SIZE(X), %xmm2 movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L35: movl M, %eax andl $4, 
%eax jle .L36 ALIGN_3 movaps -29 * SIZE(X), %xmm1 movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L36: movl M, %eax andl $2, %eax jle .L37 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm4 mulps ALPHA, %xmm0 addps %xmm4, %xmm0 movsd %xmm0, -32 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L37: movl M, %eax andl $1, %eax jle .L39 ALIGN_3 movss -32 * SIZE(X), %xmm0 mulss ALPHA, %xmm0 addss -32 * SIZE(Y), %xmm0 movss %xmm0, -32 * SIZE(Y) addl $SIZE, Y ALIGN_3 .L39: popl %ebp popl %ebx popl %esi popl %edi ret ALIGN_3 .L40: movaps -35 * SIZE(X), %xmm0 movl M, %eax sarl $5, %eax jle .L43 movaps -31 * SIZE(X), %xmm1 movaps -27 * SIZE(X), %xmm2 movaps -23 * SIZE(X), %xmm3 decl %eax jle .L42 ALIGN_4 .L41: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -19 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -15 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps -11 * SIZE(X), %xmm2 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps -7 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -3 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 mulps ALPHA, %xmm1 addps -12 * SIZE(Y), %xmm1 movaps %xmm1, -12 * SIZE(Y) movaps 1 * SIZE(X), %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 mulps ALPHA, %xmm2 addps -8 * SIZE(Y), %xmm2 movaps %xmm2, -8 * SIZE(Y) movaps 5 * SIZE(X), %xmm2 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 mulps ALPHA, %xmm3 addps -4 * SIZE(Y), %xmm3 movaps %xmm3, -4 * SIZE(Y) movaps 9 * SIZE(X), %xmm3 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L41 ALIGN_3 .L42: movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -19 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -15 * SIZE(X), %xmm1 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps -11 * SIZE(X), %xmm2 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps -7 * SIZE(X), %xmm3 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -3 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 mulps ALPHA, %xmm1 addps -12 * SIZE(Y), %xmm1 movaps %xmm1, -12 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 mulps ALPHA, %xmm2 addps -8 * SIZE(Y), %xmm2 movaps %xmm2, -8 * SIZE(Y) movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 mulps ALPHA, %xmm3 addps -4 * SIZE(Y), %xmm3 movaps %xmm3, -4 * SIZE(Y) subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 
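/* Added note: .L43 .. .L47 below drain what the 32-element unrolled
   loop above left over for this misalignment case -- blocks of 16, 8
   and 4 elements still re-splice the shifted X stream with
   movss + shufps $0x93, while the last few elements fall back to
   plain movsd/movss code. */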
.L43: movl M, %eax andl $16, %eax jle .L44 ALIGN_3 movaps -31 * SIZE(X), %xmm1 movaps -27 * SIZE(X), %xmm2 movaps -23 * SIZE(X), %xmm3 movaps -19 * SIZE(X), %xmm4 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, -24 * SIZE(Y) movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, %xmm0 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L44: movl M, %eax andl $8, %eax jle .L45 ALIGN_3 movaps -31 * SIZE(X), %xmm1 movaps -27 * SIZE(X), %xmm2 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L45: movl M, %eax andl $4, %eax jle .L46 ALIGN_3 movaps -31 * SIZE(X), %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L46: movl M, %eax andl $2, %eax jle .L47 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm4 mulps ALPHA, %xmm0 addps %xmm4, %xmm0 movsd %xmm0, -32 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L47: movl M, %eax andl $1, %eax jle .L49 ALIGN_3 movss -32 * SIZE(X), %xmm0 mulss ALPHA, %xmm0 addss -32 * SIZE(Y), %xmm0 movss %xmm0, -32 * SIZE(Y) addl $SIZE, Y ALIGN_3 .L49: popl %ebp popl %ebx popl %esi popl %edi ret #else movl M, %eax sarl $5, %eax jle .L23 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 movsd -28 * SIZE(X), %xmm1 movhps -26 * SIZE(X), %xmm1 movsd -24 * SIZE(X), %xmm2 movhps -22 * SIZE(X), %xmm2 movsd -20 * SIZE(X), %xmm3 movhps -18 * SIZE(X), %xmm3 decl %eax jle .L22 ALIGN_4 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movsd -16 * SIZE(X), %xmm0 movhps -14 * SIZE(X), %xmm0 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movsd -12 * SIZE(X), %xmm1 movhps -10 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movsd -8 * SIZE(X), %xmm2 movhps -6 * SIZE(X), %xmm2 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movsd -4 * SIZE(X), %xmm3 movhps -2 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif mulps ALPHA, %xmm0 addps -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movsd 0 * SIZE(X), %xmm0 movhps 2 * SIZE(X), %xmm0 mulps ALPHA, %xmm1 addps -12 * SIZE(Y), %xmm1 movaps %xmm1, -12 * SIZE(Y) movsd 4 * SIZE(X), %xmm1 movhps 6 * SIZE(X), %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif mulps ALPHA, %xmm2 addps -8 * SIZE(Y), %xmm2 movaps %xmm2, -8 * SIZE(Y) movsd 8 * SIZE(X), %xmm2 movhps 10 * SIZE(X), %xmm2 mulps ALPHA, %xmm3 addps -4 * SIZE(Y), %xmm3 movaps %xmm3, -4 * SIZE(Y) movsd 12 * SIZE(X), %xmm3 movhps 14 * SIZE(X), %xmm3 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L22: mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps 
%xmm0, -32 * SIZE(Y) movsd -16 * SIZE(X), %xmm0 movhps -14 * SIZE(X), %xmm0 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movsd -12 * SIZE(X), %xmm1 movhps -10 * SIZE(X), %xmm1 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movsd -8 * SIZE(X), %xmm2 movhps -6 * SIZE(X), %xmm2 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movsd -4 * SIZE(X), %xmm3 movhps -2 * SIZE(X), %xmm3 mulps ALPHA, %xmm0 addps -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) mulps ALPHA, %xmm1 addps -12 * SIZE(Y), %xmm1 movaps %xmm1, -12 * SIZE(Y) mulps ALPHA, %xmm2 addps -8 * SIZE(Y), %xmm2 movaps %xmm2, -8 * SIZE(Y) mulps ALPHA, %xmm3 addps -4 * SIZE(Y), %xmm3 movaps %xmm3, -4 * SIZE(Y) subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L23: movl M, %eax andl $16, %eax jle .L24 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 movsd -28 * SIZE(X), %xmm1 movhps -26 * SIZE(X), %xmm1 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movsd -24 * SIZE(X), %xmm2 movhps -22 * SIZE(X), %xmm2 movsd -20 * SIZE(X), %xmm3 movhps -18 * SIZE(X), %xmm3 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L24: movl M, %eax andl $8, %eax jle .L25 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 movsd -28 * SIZE(X), %xmm1 movhps -26 * SIZE(X), %xmm1 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L25: movl M, %eax andl $4, %eax jle .L26 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L26: movl M, %eax andl $2, %eax jle .L27 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm4 mulps ALPHA, %xmm0 addps %xmm4, %xmm0 movsd %xmm0, -32 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L27: movl M, %eax andl $1, %eax jle .L29 ALIGN_3 movss -32 * SIZE(X), %xmm0 mulss ALPHA, %xmm0 addss -32 * SIZE(Y), %xmm0 movss %xmm0, -32 * SIZE(Y) addl $SIZE, Y ALIGN_3 .L29: popl %ebp popl %ebx popl %esi popl %edi ret #endif ALIGN_3 .L50: movl M, %eax movl Y, YY //If incx==0 || incy==0, avoid unloop. 
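// Added clarification: "unloop" above means the 8-element unrolled
// loop at .L51.  With a zero stride that loop buffers reads of Y
// through YY ahead of the matching stores through Y, so repeated
// updates to one and the same element could be lost; jumping to the
// one-element loop at .L56 keeps every read-modify-write of Y intact.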
cmpl $0, INCX je .L56 cmpl $0, INCY je .L56 sarl $3, %eax jle .L55 ALIGN_3 .L51: movss (X), %xmm0 addl INCX, X mulss ALPHA, %xmm0 movss (YY), %xmm6 addl INCY, YY addss %xmm6, %xmm0 movss (X), %xmm1 addl INCX, X mulss ALPHA, %xmm1 movss (YY), %xmm6 addl INCY, YY addss %xmm6, %xmm1 movss (X), %xmm2 addl INCX, X mulss ALPHA, %xmm2 movss (YY), %xmm6 addl INCY, YY addss %xmm6, %xmm2 movss (X), %xmm3 addl INCX, X mulss ALPHA, %xmm3 movss (YY), %xmm6 addl INCY, YY addss %xmm6, %xmm3 movss %xmm0, (Y) addl INCY, Y movss %xmm1, (Y) addl INCY, Y movss %xmm2, (Y) addl INCY, Y movss %xmm3, (Y) addl INCY, Y movss (X), %xmm0 addl INCX, X mulss ALPHA, %xmm0 movss (YY), %xmm6 addl INCY, YY addss %xmm6, %xmm0 movss (X), %xmm1 addl INCX, X mulss ALPHA, %xmm1 movss (YY), %xmm6 addl INCY, YY addss %xmm6, %xmm1 movss (X), %xmm2 addl INCX, X mulss ALPHA, %xmm2 movss (YY), %xmm6 addl INCY, YY addss %xmm6, %xmm2 movss (X), %xmm3 addl INCX, X mulss ALPHA, %xmm3 movss (YY), %xmm6 addl INCY, YY addss %xmm6, %xmm3 movss %xmm0, (Y) addl INCY, Y movss %xmm1, (Y) addl INCY, Y movss %xmm2, (Y) addl INCY, Y movss %xmm3, (Y) addl INCY, Y decl %eax jg .L51 ALIGN_3 .L55: movl M, %eax andl $7, %eax jle .L59 ALIGN_3 .L56: movss (X), %xmm0 addl INCX, X mulss ALPHA, %xmm0 movss (Y), %xmm6 addss %xmm6, %xmm0 movss %xmm0, (Y) addl INCY, Y decl %eax jg .L56 ALIGN_3 .L59: popl %ebp popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/axpy_sse2.S000066400000000000000000000360501313527062700171160ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
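
   ------------------------------------------------------------------
   Descriptive note (added for readability; not part of the original
   sources): SSE2 double-precision AXPY kernel with the same
   y[i*incy] += alpha * x[i*incx] contract as axpy.S above.  The
   unit-stride path first peels a single element if Y is not 16-byte
   aligned, then chooses between an all-movapd main loop (X shares the
   alignment) and a realigning one (SHUFPD_1 splicing, or movsd +
   movhps loads when ALIGNED_ACCESS is not defined), before handling
   the short remainder and the general strided case.
   ------------------------------------------------------------------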
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_ALPHA 16 + STACK + ARGS(%esp) #define STACK_X 24 + STACK + ARGS(%esp) #define STACK_INCX 28 + STACK + ARGS(%esp) #define STACK_Y 32 + STACK + ARGS(%esp) #define STACK_INCY 36 + STACK + ARGS(%esp) #define M %ebx #define X %esi #define Y %edi #define INCX %ecx #define INCY %edx #define YY %ebp #define ALPHA %xmm7 #include "l1param.h" PROLOGUE PROFCODE pushl %edi pushl %esi pushl %ebx pushl %ebp movl STACK_M, M movsd STACK_ALPHA, ALPHA movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY unpcklpd ALPHA, ALPHA leal (, INCX, SIZE), INCX leal (, INCY, SIZE), INCY testl M, M jle .L47 cmpl $SIZE, INCX jne .L40 cmpl $SIZE, INCY jne .L40 testl $SIZE, Y je .L10 movsd (X), %xmm0 mulsd ALPHA, %xmm0 addsd (Y), %xmm0 movsd %xmm0, (Y) addl $1 * SIZE, X addl $1 * SIZE, Y decl M jle .L19 ALIGN_4 .L10: subl $-16 * SIZE, X subl $-16 * SIZE, Y testl $SIZE, X jne .L20 movl M, %eax sarl $4, %eax jle .L13 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 decl %eax jle .L12 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -8 * SIZE(X), %xmm0 mulpd ALPHA, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movaps -6 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif mulpd ALPHA, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) movaps -4 * SIZE(X), %xmm2 mulpd ALPHA, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps -2 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif mulpd ALPHA, %xmm0 addpd -8 * SIZE(Y), %xmm0 movaps %xmm0, -8 * SIZE(Y) movaps 0 * SIZE(X), %xmm0 mulpd ALPHA, %xmm1 addpd -6 * SIZE(Y), %xmm1 movaps %xmm1, -6 * SIZE(Y) movaps 2 * SIZE(X), %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif mulpd ALPHA, %xmm2 addpd -4 * SIZE(Y), %xmm2 movaps %xmm2, -4 * SIZE(Y) movaps 4 * SIZE(X), %xmm2 mulpd ALPHA, %xmm3 addpd -2 * SIZE(Y), %xmm3 movaps %xmm3, -2 * SIZE(Y) movaps 6 * SIZE(X), %xmm3 subl $-16 * SIZE, Y subl $-16 * SIZE, X decl %eax jg .L11 ALIGN_3 .L12: mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -8 * SIZE(X), %xmm0 mulpd ALPHA, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movaps -6 * SIZE(X), %xmm1 mulpd ALPHA, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) movaps -4 * SIZE(X), %xmm2 mulpd ALPHA, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps -2 * SIZE(X), %xmm3 mulpd ALPHA, %xmm0 addpd -8 * SIZE(Y), %xmm0 movaps %xmm0, -8 * SIZE(Y) mulpd ALPHA, %xmm1 addpd -6 * SIZE(Y), %xmm1 movaps %xmm1, -6 * SIZE(Y) mulpd ALPHA, %xmm2 addpd -4 * SIZE(Y), %xmm2 movaps %xmm2, -4 * SIZE(Y) mulpd ALPHA, %xmm3 addpd -2 * SIZE(Y), %xmm3 movaps %xmm3, -2 * SIZE(Y) subl $-16 * SIZE, Y subl $-16 * SIZE, X ALIGN_3 .L13: movl M, %eax andl $8, %eax jle .L14 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 mulpd ALPHA, %xmm1 addpd -14 * SIZE(Y), %xmm1 mulpd ALPHA, %xmm2 addpd -12 * SIZE(Y), %xmm2 mulpd ALPHA, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm0, 
-16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) movaps %xmm2, -12 * SIZE(Y) movaps %xmm3, -10 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L14: movl M, %eax andl $4, %eax jle .L15 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 mulpd ALPHA, %xmm0 mulpd ALPHA, %xmm1 addpd -16 * SIZE(Y), %xmm0 addpd -14 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L15: movl M, %eax andl $2, %eax jle .L16 ALIGN_3 movaps -16 * SIZE(X), %xmm0 mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L16: movl M, %eax andl $1, %eax jle .L19 ALIGN_3 movsd -16 * SIZE(X), %xmm0 mulsd ALPHA, %xmm0 addsd -16 * SIZE(Y), %xmm0 movsd %xmm0, -16 * SIZE(Y) ALIGN_3 .L19: popl %ebp popl %ebx popl %esi popl %edi ret ALIGN_3 .L20: #ifdef ALIGNED_ACCESS movhps -16 * SIZE(X), %xmm0 movl M, %eax sarl $4, %eax jle .L23 movaps -15 * SIZE(X), %xmm1 movaps -13 * SIZE(X), %xmm2 movaps -11 * SIZE(X), %xmm3 decl %eax jle .L22 ALIGN_4 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif SHUFPD_1 %xmm1, %xmm0 mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -9 * SIZE(X), %xmm0 SHUFPD_1 %xmm2, %xmm1 mulpd ALPHA, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movaps -7 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif SHUFPD_1 %xmm3, %xmm2 mulpd ALPHA, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) movaps -5 * SIZE(X), %xmm2 SHUFPD_1 %xmm0, %xmm3 mulpd ALPHA, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps -3 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif SHUFPD_1 %xmm1, %xmm0 mulpd ALPHA, %xmm0 addpd -8 * SIZE(Y), %xmm0 movaps %xmm0, -8 * SIZE(Y) movaps -1 * SIZE(X), %xmm0 SHUFPD_1 %xmm2, %xmm1 mulpd ALPHA, %xmm1 addpd -6 * SIZE(Y), %xmm1 movaps %xmm1, -6 * SIZE(Y) movaps 1 * SIZE(X), %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif SHUFPD_1 %xmm3, %xmm2 mulpd ALPHA, %xmm2 addpd -4 * SIZE(Y), %xmm2 movaps %xmm2, -4 * SIZE(Y) movaps 3 * SIZE(X), %xmm2 SHUFPD_1 %xmm0, %xmm3 mulpd ALPHA, %xmm3 addpd -2 * SIZE(Y), %xmm3 movaps %xmm3, -2 * SIZE(Y) movaps 5 * SIZE(X), %xmm3 subl $-16 * SIZE, X subl $-16 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L22: SHUFPD_1 %xmm1, %xmm0 mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -9 * SIZE(X), %xmm0 SHUFPD_1 %xmm2, %xmm1 mulpd ALPHA, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movaps -7 * SIZE(X), %xmm1 SHUFPD_1 %xmm3, %xmm2 mulpd ALPHA, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) movaps -5 * SIZE(X), %xmm2 SHUFPD_1 %xmm0, %xmm3 mulpd ALPHA, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps -3 * SIZE(X), %xmm3 SHUFPD_1 %xmm1, %xmm0 mulpd ALPHA, %xmm0 addpd -8 * SIZE(Y), %xmm0 movaps %xmm0, -8 * SIZE(Y) movaps -1 * SIZE(X), %xmm0 SHUFPD_1 %xmm2, %xmm1 mulpd ALPHA, %xmm1 addpd -6 * SIZE(Y), %xmm1 movaps %xmm1, -6 * SIZE(Y) SHUFPD_1 %xmm3, %xmm2 mulpd ALPHA, %xmm2 addpd -4 * SIZE(Y), %xmm2 movaps %xmm2, -4 * SIZE(Y) SHUFPD_1 %xmm0, %xmm3 mulpd ALPHA, %xmm3 addpd -2 * SIZE(Y), %xmm3 movaps %xmm3, -2 * SIZE(Y) subl $-16 * SIZE, X subl $-16 * SIZE, Y ALIGN_3 .L23: movl M, %eax andl $8, %eax jle .L24 ALIGN_3 movaps -15 * SIZE(X), %xmm1 movaps -13 * SIZE(X), %xmm2 movaps -11 * SIZE(X), %xmm3 movaps -9 * SIZE(X), %xmm4 SHUFPD_1 %xmm1, 
%xmm0 mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm2, %xmm1 mulpd ALPHA, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) SHUFPD_1 %xmm3, %xmm2 mulpd ALPHA, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) SHUFPD_1 %xmm4, %xmm3 mulpd ALPHA, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps %xmm4, %xmm0 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L24: movl M, %eax andl $4, %eax jle .L25 ALIGN_3 movaps -15 * SIZE(X), %xmm1 movaps -13 * SIZE(X), %xmm2 SHUFPD_1 %xmm1, %xmm0 SHUFPD_1 %xmm2, %xmm1 mulpd ALPHA, %xmm0 mulpd ALPHA, %xmm1 addpd -16 * SIZE(Y), %xmm0 addpd -14 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) movaps %xmm2, %xmm0 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L25: movl M, %eax andl $2, %eax jle .L26 ALIGN_3 movaps -15 * SIZE(X), %xmm1 SHUFPD_1 %xmm1, %xmm0 mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L26: movl M, %eax andl $1, %eax jle .L29 ALIGN_3 movsd -16 * SIZE(X), %xmm0 mulsd ALPHA, %xmm0 addsd -16 * SIZE(Y), %xmm0 movsd %xmm0, -16 * SIZE(Y) ALIGN_3 .L29: popl %ebp popl %ebx popl %esi popl %edi ret ALIGN_3 #else movl M, %eax sarl $3, %eax jle .L23 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 movsd -14 * SIZE(X), %xmm1 movhps -13 * SIZE(X), %xmm1 movsd -12 * SIZE(X), %xmm2 movhps -11 * SIZE(X), %xmm2 movsd -10 * SIZE(X), %xmm3 movhps -9 * SIZE(X), %xmm3 decl %eax jle .L22 ALIGN_3 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movsd -8 * SIZE(X), %xmm0 movhps -7 * SIZE(X), %xmm0 mulpd ALPHA, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movsd -6 * SIZE(X), %xmm1 movhps -5 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif mulpd ALPHA, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) movsd -4 * SIZE(X), %xmm2 movhps -3 * SIZE(X), %xmm2 mulpd ALPHA, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) movsd -2 * SIZE(X), %xmm3 movhps -1 * SIZE(X), %xmm3 subl $-8 * SIZE, Y subl $-8 * SIZE, X decl %eax jg .L21 ALIGN_3 .L22: mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) mulpd ALPHA, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) mulpd ALPHA, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) mulpd ALPHA, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) subl $-8 * SIZE, Y subl $-8 * SIZE, X ALIGN_3 .L23: movl M, %eax andl $4, %eax jle .L25 ALIGN_3 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 movsd -14 * SIZE(X), %xmm1 movhps -13 * SIZE(X), %xmm1 mulpd ALPHA, %xmm0 mulpd ALPHA, %xmm1 addpd -16 * SIZE(Y), %xmm0 addpd -14 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L25: movl M, %eax andl $2, %eax jle .L26 ALIGN_3 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L26: movl M, %eax andl $1, %eax jle .L29 ALIGN_3 movsd -16 * SIZE(X), %xmm0 mulsd ALPHA, %xmm0 addsd -16 * SIZE(Y), %xmm0 movsd %xmm0, -16 * SIZE(Y) ALIGN_3 .L29: popl %ebp popl %ebx popl %esi popl %edi ret ALIGN_3 #endif .L40: movl Y, YY movl M, %eax //If incx==0 || incy==0, avoid unloop. 
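// Added clarification, same reasoning as in axpy_sse.S: "unloop" is
// the unrolled loop at .L41; with a zero INCX or INCY it is skipped in
// favour of the one-element loop at .L46 so no update of Y can be
// dropped.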
cmpl $0, INCX je .L46 cmpl $0, INCY je .L46 sarl $3, %eax jle .L45 ALIGN_3 .L41: movsd 0 * SIZE(X), %xmm0 addl INCX, X movhpd 0 * SIZE(X), %xmm0 addl INCX, X mulpd ALPHA, %xmm0 movsd 0 * SIZE(YY), %xmm6 addl INCY, YY movhpd 0 * SIZE(YY), %xmm6 addl INCY, YY addpd %xmm6, %xmm0 movsd 0 * SIZE(X), %xmm1 addl INCX, X movhpd 0 * SIZE(X), %xmm1 addl INCX, X mulpd ALPHA, %xmm1 movsd 0 * SIZE(YY), %xmm6 addl INCY, YY movhpd 0 * SIZE(YY), %xmm6 addl INCY, YY addpd %xmm6, %xmm1 movsd 0 * SIZE(X), %xmm2 addl INCX, X movhpd 0 * SIZE(X), %xmm2 addl INCX, X mulpd ALPHA, %xmm2 movsd 0 * SIZE(YY), %xmm6 addl INCY, YY movhpd 0 * SIZE(YY), %xmm6 addl INCY, YY addpd %xmm6, %xmm2 movsd 0 * SIZE(X), %xmm3 addl INCX, X movhpd 0 * SIZE(X), %xmm3 addl INCX, X mulpd ALPHA, %xmm3 movsd 0 * SIZE(YY), %xmm6 addl INCY, YY movhpd 0 * SIZE(YY), %xmm6 addl INCY, YY addpd %xmm6, %xmm3 movsd %xmm0, 0 * SIZE(Y) addl INCY, Y movhpd %xmm0, 0 * SIZE(Y) addl INCY, Y movsd %xmm1, 0 * SIZE(Y) addl INCY, Y movhpd %xmm1, 0 * SIZE(Y) addl INCY, Y movsd %xmm2, 0 * SIZE(Y) addl INCY, Y movhpd %xmm2, 0 * SIZE(Y) addl INCY, Y movsd %xmm3, 0 * SIZE(Y) addl INCY, Y movhpd %xmm3, 0 * SIZE(Y) addl INCY, Y decl %eax jg .L41 ALIGN_3 .L45: movl M, %eax andl $7, %eax jle .L47 ALIGN_3 .L46: movsd (X), %xmm0 addl INCX, X mulsd ALPHA, %xmm0 addsd (Y), %xmm0 movsd %xmm0, (Y) addl INCY, Y decl %eax jg .L46 ALIGN_3 .L47: popl %ebp popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/axpy_sse2_opteron.S000066400000000000000000000242161313527062700206650ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
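
   ------------------------------------------------------------------
   Descriptive note (added for readability; not part of the original
   sources): Opteron-tuned variant of the SSE2 double-precision AXPY
   kernel.  The contract is unchanged (y += alpha * x element-wise);
   what differs is the scheduling -- prefetch/prefetchw hints issued
   roughly 64 elements ahead and 16-element unrolled bodies for both
   the movapd (aligned) and movlpd + movhpd (unaligned) unit-stride
   paths, plus a plain strided fallback.
   ------------------------------------------------------------------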
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define STACK_X 24 + STACK + ARGS(%esp) #define STACK_INCX 28 + STACK + ARGS(%esp) #define STACK_Y 32 + STACK + ARGS(%esp) #define STACK_INCY 36 + STACK + ARGS(%esp) #define M %ebx #define X %esi #define Y %edi #define INCX %ecx #define INCY %edx #define PREFETCHSIZE 64 PROLOGUE pushl %edi pushl %esi pushl %ebx pushl %ebp PROFCODE movlpd ALPHA, %xmm7 unpcklpd %xmm7, %xmm7 movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY leal (, INCX, SIZE), INCX leal (, INCY, SIZE), INCY testl M, M jle .L999 cmpl $SIZE, INCX jne .L100 cmpl $SIZE, INCY jne .L100 testl $SIZE, Y je .L00 movlpd 0 * SIZE(X), %xmm0 mulsd %xmm7, %xmm0 addsd 0 * SIZE(Y), %xmm0 movlpd %xmm0, 0 * SIZE(Y) addl $1 * SIZE, X addl $1 * SIZE, Y decl M jle .L999 ALIGN_3 .L00: testl $SIZE, X jne .L20 movl M, %eax sarl $4, %eax jle .L15 ALIGN_3 .L11: prefetch (PREFETCHSIZE + 0) * SIZE(X) movapd 0 * SIZE(X), %xmm0 mulpd %xmm7, %xmm0 addpd 0 * SIZE(Y), %xmm0 movapd %xmm0, 0 * SIZE(Y) movapd 2 * SIZE(X), %xmm1 mulpd %xmm7, %xmm1 addpd 2 * SIZE(Y), %xmm1 movapd %xmm1, 2 * SIZE(Y) prefetchw (PREFETCHSIZE + 0) * SIZE(Y) movapd 4 * SIZE(X), %xmm2 mulpd %xmm7, %xmm2 addpd 4 * SIZE(Y), %xmm2 movapd %xmm2, 4 * SIZE(Y) movapd 6 * SIZE(X), %xmm3 mulpd %xmm7, %xmm3 addpd 6 * SIZE(Y), %xmm3 movapd %xmm3, 6 * SIZE(Y) prefetch (PREFETCHSIZE + 8) * SIZE(X) movapd 8 * SIZE(X), %xmm0 mulpd %xmm7, %xmm0 addpd 8 * SIZE(Y), %xmm0 movapd %xmm0, 8 * SIZE(Y) movapd 10 * SIZE(X), %xmm1 mulpd %xmm7, %xmm1 addpd 10 * SIZE(Y), %xmm1 movapd %xmm1, 10 * SIZE(Y) prefetchw (PREFETCHSIZE + 8) * SIZE(Y) movapd 12 * SIZE(X), %xmm2 mulpd %xmm7, %xmm2 addpd 12 * SIZE(Y), %xmm2 movapd %xmm2, 12 * SIZE(Y) movapd 14 * SIZE(X), %xmm3 mulpd %xmm7, %xmm3 addpd 14 * SIZE(Y), %xmm3 movapd %xmm3, 14 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y decl %eax jg .L11 ALIGN_3 .L15: movl M, %eax testl $8, %eax jle .L16 movapd 0 * SIZE(X), %xmm0 mulpd %xmm7, %xmm0 addpd 0 * SIZE(Y), %xmm0 movapd %xmm0, 0 * SIZE(Y) movapd 2 * SIZE(X), %xmm1 mulpd %xmm7, %xmm1 addpd 2 * SIZE(Y), %xmm1 movapd %xmm1, 2 * SIZE(Y) movapd 4 * SIZE(X), %xmm2 mulpd %xmm7, %xmm2 addpd 4 * SIZE(Y), %xmm2 movapd %xmm2, 4 * SIZE(Y) movapd 6 * SIZE(X), %xmm3 mulpd %xmm7, %xmm3 addpd 6 * SIZE(Y), %xmm3 movapd %xmm3, 6 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L16: testl $4, %eax jle .L17 movapd 0 * SIZE(X), %xmm0 mulpd %xmm7, %xmm0 addpd 0 * SIZE(Y), %xmm0 movapd %xmm0, 0 * SIZE(Y) movapd 2 * SIZE(X), %xmm1 mulpd %xmm7, %xmm1 addpd 2 * SIZE(Y), %xmm1 movapd %xmm1, 2 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L17: testl $2, %eax jle .L18 movapd 0 * SIZE(X), %xmm0 mulpd %xmm7, %xmm0 addpd 0 * SIZE(Y), %xmm0 movapd %xmm0, 0 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L18: testl $1, %eax jle .L99 movlpd 0 * SIZE(X), %xmm0 mulsd %xmm7, %xmm0 addsd 0 * SIZE(Y), %xmm0 movlpd %xmm0, 0 * SIZE(Y) jmp .L99 ALIGN_3 .L20: movl M, %eax sarl $4, %eax jle .L25 ALIGN_4 .L21: #ifdef OPTERON prefetcht0 (PREFETCHSIZE + 0) * SIZE(X) prefetchw (PREFETCHSIZE + 0) * SIZE(Y) #endif movlpd 0 * SIZE(X), %xmm0 movhpd 1 * SIZE(X), %xmm0 mulpd %xmm7, %xmm0 addpd 0 * SIZE(Y), %xmm0 movapd %xmm0, 0 * SIZE(Y) movlpd 2 * SIZE(X), %xmm1 movhpd 3 * SIZE(X), %xmm1 mulpd %xmm7, %xmm1 addpd 2 * SIZE(Y), %xmm1 movapd %xmm1, 2 * SIZE(Y) movlpd 4 * SIZE(X), %xmm2 movhpd 
5 * SIZE(X), %xmm2 mulpd %xmm7, %xmm2 addpd 4 * SIZE(Y), %xmm2 movapd %xmm2, 4 * SIZE(Y) movlpd 6 * SIZE(X), %xmm3 movhpd 7 * SIZE(X), %xmm3 mulpd %xmm7, %xmm3 addpd 6 * SIZE(Y), %xmm3 movapd %xmm3, 6 * SIZE(Y) #ifdef OPTERON prefetcht0 (PREFETCHSIZE + 8) * SIZE(X) prefetchw (PREFETCHSIZE + 8) * SIZE(Y) #endif movlpd 8 * SIZE(X), %xmm0 movhpd 9 * SIZE(X), %xmm0 mulpd %xmm7, %xmm0 addpd 8 * SIZE(Y), %xmm0 movapd %xmm0, 8 * SIZE(Y) movlpd 10 * SIZE(X), %xmm1 movhpd 11 * SIZE(X), %xmm1 mulpd %xmm7, %xmm1 addpd 10 * SIZE(Y), %xmm1 movapd %xmm1, 10 * SIZE(Y) movlpd 12 * SIZE(X), %xmm2 movhpd 13 * SIZE(X), %xmm2 mulpd %xmm7, %xmm2 addpd 12 * SIZE(Y), %xmm2 movapd %xmm2, 12 * SIZE(Y) movlpd 14 * SIZE(X), %xmm3 movhpd 15 * SIZE(X), %xmm3 mulpd %xmm7, %xmm3 addpd 14 * SIZE(Y), %xmm3 movapd %xmm3, 14 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L25: movl M, %eax testl $8, %eax jle .L26 movlpd 0 * SIZE(X), %xmm0 movhpd 1 * SIZE(X), %xmm0 mulpd %xmm7, %xmm0 addpd 0 * SIZE(Y), %xmm0 movapd %xmm0, 0 * SIZE(Y) movlpd 2 * SIZE(X), %xmm1 movhpd 3 * SIZE(X), %xmm1 mulpd %xmm7, %xmm1 addpd 2 * SIZE(Y), %xmm1 movapd %xmm1, 2 * SIZE(Y) movlpd 4 * SIZE(X), %xmm2 movhpd 5 * SIZE(X), %xmm2 mulpd %xmm7, %xmm2 addpd 4 * SIZE(Y), %xmm2 movapd %xmm2, 4 * SIZE(Y) movlpd 6 * SIZE(X), %xmm3 movhpd 7 * SIZE(X), %xmm3 mulpd %xmm7, %xmm3 addpd 6 * SIZE(Y), %xmm3 movapd %xmm3, 6 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L26: testl $4, %eax jle .L27 movlpd 0 * SIZE(X), %xmm0 movhpd 1 * SIZE(X), %xmm0 mulpd %xmm7, %xmm0 addpd 0 * SIZE(Y), %xmm0 movapd %xmm0, 0 * SIZE(Y) movlpd 2 * SIZE(X), %xmm1 movhpd 3 * SIZE(X), %xmm1 mulpd %xmm7, %xmm1 addpd 2 * SIZE(Y), %xmm1 movapd %xmm1, 2 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L27: testl $2, %eax jle .L28 movlpd 0 * SIZE(X), %xmm0 movhpd 1 * SIZE(X), %xmm0 mulpd %xmm7, %xmm0 addpd 0 * SIZE(Y), %xmm0 movapd %xmm0, 0 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L28: testl $1, %eax jle .L99 movlpd 0 * SIZE(X), %xmm0 mulsd %xmm7, %xmm0 addsd 0 * SIZE(Y), %xmm0 movlpd %xmm0, 0 * SIZE(Y) ALIGN_3 .L99: xorl %eax,%eax popl %ebp popl %ebx popl %esi popl %edi ret ALIGN_3 .L100: movl M, %eax movl Y, %ebp sarl $3, %eax jle .L114 ALIGN_3 .L110: movlpd 0 * SIZE(X), %xmm0 addl INCX, X movhpd 0 * SIZE(X), %xmm0 addl INCX, X mulpd %xmm7, %xmm0 movlpd 0 * SIZE(%ebp), %xmm6 addl INCY, %ebp movhpd 0 * SIZE(%ebp), %xmm6 addl INCY, %ebp addpd %xmm6, %xmm0 movlpd 0 * SIZE(X), %xmm1 addl INCX, X movhpd 0 * SIZE(X), %xmm1 addl INCX, X mulpd %xmm7, %xmm1 movlpd 0 * SIZE(%ebp), %xmm6 addl INCY, %ebp movhpd 0 * SIZE(%ebp), %xmm6 addl INCY, %ebp addpd %xmm6, %xmm1 movlpd 0 * SIZE(X), %xmm2 addl INCX, X movhpd 0 * SIZE(X), %xmm2 addl INCX, X mulpd %xmm7, %xmm2 movlpd 0 * SIZE(%ebp), %xmm6 addl INCY, %ebp movhpd 0 * SIZE(%ebp), %xmm6 addl INCY, %ebp addpd %xmm6, %xmm2 movlpd 0 * SIZE(X), %xmm3 addl INCX, X movhpd 0 * SIZE(X), %xmm3 addl INCX, X mulpd %xmm7, %xmm3 movlpd 0 * SIZE(%ebp), %xmm6 addl INCY, %ebp movhpd 0 * SIZE(%ebp), %xmm6 addl INCY, %ebp addpd %xmm6, %xmm3 movlpd %xmm0, 0 * SIZE(Y) addl INCY, Y movhpd %xmm0, 0 * SIZE(Y) addl INCY, Y movlpd %xmm1, 0 * SIZE(Y) addl INCY, Y movhpd %xmm1, 0 * SIZE(Y) addl INCY, Y movlpd %xmm2, 0 * SIZE(Y) addl INCY, Y movhpd %xmm2, 0 * SIZE(Y) addl INCY, Y movlpd %xmm3, 0 * SIZE(Y) addl INCY, Y movhpd %xmm3, 0 * SIZE(Y) addl INCY, Y decl %eax jg .L110 ALIGN_3 .L114: movl M, %eax andl $7, %eax jle .L999 ALIGN_3 .L115: movlpd (X), %xmm0 addl INCX, X mulsd %xmm7, %xmm0 addsd (Y), %xmm0 movlpd %xmm0, (Y) 
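/* Strided path (INCX or INCY != 1, labels .L100-.L115): same update, y(i) += alpha * x(i). */
/* The unrolled .L110 loop processes eight elements per iteration; this .L115 loop finishes */
/* the remaining (m & 7) elements one at a time before falling through to .L999. */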
addl INCY, Y decl %eax jg .L115 ALIGN_3 .L999: xorl %eax,%eax popl %ebp popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/cabs.S000066400000000000000000000055311313527062700161110ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" PROLOGUE PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif movl 4(%esp), %eax FLD 0 * SIZE(%eax) fabs FLD 1 * SIZE(%eax) fabs faddp %st, %st(1) ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/copy.S000066400000000000000000000115001313527062700161440ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define M 4 + STACK + ARGS(%esp) #define X 8 + STACK + ARGS(%esp) #define INCX 12 + STACK + ARGS(%esp) #define Y 16 + STACK + ARGS(%esp) #define INCY 20 + STACK + ARGS(%esp) PROLOGUE pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif movl M, %ebx movl X, %ecx movl INCX, %esi movl Y, %edx movl INCY, %edi testl %ebx, %ebx # if m == 0 goto End jle .L999 #if SIZE > 8 sall $BASE_SHIFT, %esi sall $BASE_SHIFT, %edi #else leal (, %esi, SIZE), %esi leal (, %edi, SIZE), %edi #endif cmpl $SIZE, %esi # if incx != 1 jne .L100 cmpl $SIZE, %edi # if incy != 1 jne .L100 movl %ebx, %eax # i = m sarl $3, %eax jle .L20 ALIGN_2 .L11: FLD 7 * SIZE(%ecx) FLD 6 * SIZE(%ecx) FLD 5 * SIZE(%ecx) FLD 4 * SIZE(%ecx) FLD 3 * SIZE(%ecx) FLD 2 * SIZE(%ecx) FLD 1 * SIZE(%ecx) FLD 0 * SIZE(%ecx) FST 0 * SIZE(%edx) FST 1 * SIZE(%edx) FST 2 * SIZE(%edx) FST 3 * SIZE(%edx) FST 4 * SIZE(%edx) FST 5 * SIZE(%edx) FST 6 * SIZE(%edx) FST 7 * SIZE(%edx) addl $8 * SIZE, %ecx addl $8 * SIZE, %edx decl %eax jg .L11 ALIGN_2 .L20: movl %ebx, %eax # i = m andl $7, %eax jle .L99 ALIGN_2 .L21: FLD (%ecx) FST (%edx) addl $SIZE, %ecx addl $SIZE, %edx decl %eax jg .L21 .L99: xorl %eax,%eax popl %ebx popl %esi popl %edi ret ALIGN_3 .L100: movl %ebx, %eax sarl $3, %eax jle .L120 ALIGN_2 .L111: FLD (%ecx) addl %esi, %ecx FLD (%ecx) addl %esi, %ecx FLD (%ecx) addl %esi, %ecx FLD (%ecx) addl %esi, %ecx FLD (%ecx) addl %esi, %ecx FLD (%ecx) addl %esi, %ecx FLD (%ecx) addl %esi, %ecx FLD (%ecx) addl %esi, %ecx fxch %st(7) FST (%edx) addl %edi, %edx fxch %st(5) FST (%edx) addl %edi, %edx fxch %st(3) FST (%edx) addl %edi, %edx fxch %st(1) FST (%edx) addl %edi, %edx FST (%edx) addl %edi, %edx FST (%edx) addl %edi, %edx FST (%edx) addl %edi, %edx FST (%edx) addl %edi, %edx decl %eax jg .L111 .L120: movl %ebx, %eax andl $7, %eax jle .L999 ALIGN_2 .L121: FLD (%ecx) FST (%edx) addl %esi, %ecx addl %edi, %edx decl %eax jg .L121 .L999: xorl %eax,%eax popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/copy_sse.S000066400000000000000000000424761313527062700170360ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) #define M %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx #include "l1param.h" #ifdef OPTERON #define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG #else #define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG #endif PROLOGUE PROFCODE pushl %edi pushl %esi pushl %ebx movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY leal (, INCX, SIZE), INCX leal (, INCY, SIZE), INCY cmpl $SIZE, INCX jne .L50 cmpl $SIZE, INCY jne .L50 cmpl $3, M jle .L55 subl $-32 * SIZE, X subl $-32 * SIZE, Y testl $SIZE, Y je .L05 movss -32 * SIZE(X), %xmm0 movss %xmm0, -32 * SIZE(Y) addl $1 * SIZE, X addl $1 * SIZE, Y decl M ALIGN_4 .L05: testl $2 * SIZE, Y je .L10 movsd -32 * SIZE(X), %xmm0 movlps %xmm0, -32 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y subl $2, M jle .L19 ALIGN_4 .L10: testl $3 * SIZE, X jne .L20 movl M, %eax sarl $5, %eax jle .L13 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 movaps -16 * SIZE(X), %xmm4 movaps -12 * SIZE(X), %xmm5 movaps -8 * SIZE(X), %xmm6 movaps -4 * SIZE(X), %xmm7 decl %eax jle .L12 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps %xmm0, -32 * SIZE(Y) LOAD( 0 * SIZE, X, %xmm0) movaps %xmm1, -28 * SIZE(Y) LOAD( 4 * SIZE, X, %xmm1) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm2, -24 * SIZE(Y) LOAD( 8 * SIZE, X, %xmm2) movaps %xmm3, -20 * SIZE(Y) LOAD(12 * SIZE, X, %xmm3) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps %xmm4,-16 * SIZE(Y) LOAD(16 * SIZE, X, %xmm4) movaps %xmm5,-12 * SIZE(Y) LOAD(20 * SIZE, X, 
%xmm5) #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm6, -8 * SIZE(Y) LOAD(24 * SIZE, X, %xmm6) movaps %xmm7, -4 * SIZE(Y) LOAD(28 * SIZE, X, %xmm7) subl $-32 * SIZE, Y subl $-32 * SIZE, X decl %eax jg .L11 ALIGN_3 .L12: movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, -24 * SIZE(Y) movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, -16 * SIZE(Y) movaps %xmm5, -12 * SIZE(Y) movaps %xmm6, -8 * SIZE(Y) movaps %xmm7, -4 * SIZE(Y) subl $-32 * SIZE, Y subl $-32 * SIZE, X ALIGN_3 .L13: testl $16, M jle .L14 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, -24 * SIZE(Y) movaps %xmm3, -20 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L14: testl $8, M jle .L15 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L15: testl $4, M jle .L16 movaps -32 * SIZE(X), %xmm0 movaps %xmm0, -32 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L16: testl $2, M jle .L17 movsd -32 * SIZE(X), %xmm0 movlps %xmm0, -32 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L17: testl $1, M jle .L19 movss -32 * SIZE(X), %xmm0 movss %xmm0, -32 * SIZE(Y) ALIGN_3 .L19: popl %ebx popl %esi popl %edi ret ALIGN_3 .L20: testl $SIZE, X jne .L30 movhps -32 * SIZE(X), %xmm0 movl M, %eax sarl $5, %eax jle .L23 movaps -30 * SIZE(X), %xmm1 movaps -26 * SIZE(X), %xmm2 movaps -22 * SIZE(X), %xmm3 movaps -18 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -10 * SIZE(X), %xmm6 movaps -6 * SIZE(X), %xmm7 decl %eax jle .L22 ALIGN_4 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif shufps $0x4e, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -2 * SIZE(X), %xmm0 shufps $0x4e, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps 2 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif shufps $0x4e, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps 6 * SIZE(X), %xmm2 shufps $0x4e, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps 10 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif shufps $0x4e, %xmm5, %xmm4 movaps %xmm4, -16 * SIZE(Y) movaps 14 * SIZE(X), %xmm4 shufps $0x4e, %xmm6, %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 18 * SIZE(X), %xmm5 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif shufps $0x4e, %xmm7, %xmm6 movaps %xmm6, -8 * SIZE(Y) movaps 22 * SIZE(X), %xmm6 shufps $0x4e, %xmm0, %xmm7 movaps %xmm7, -4 * SIZE(Y) movaps 26 * SIZE(X), %xmm7 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L22: shufps $0x4e, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -2 * SIZE(X), %xmm0 shufps $0x4e, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) shufps $0x4e, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) shufps $0x4e, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) shufps $0x4e, %xmm5, %xmm4 movaps %xmm4, -16 * SIZE(Y) shufps $0x4e, %xmm6, %xmm5 movaps %xmm5, -12 * SIZE(Y) shufps $0x4e, %xmm7, %xmm6 movaps %xmm6, -8 * SIZE(Y) shufps $0x4e, %xmm0, %xmm7 movaps %xmm7, -4 * SIZE(Y) subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L23: testl $16, M jle .L24 ALIGN_3 movaps -30 * SIZE(X), %xmm1 movaps -26 * SIZE(X), %xmm2 movaps -22 * SIZE(X), %xmm3 movaps -18 * SIZE(X), %xmm4 shufps $0x4e, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) shufps $0x4e, %xmm2, %xmm1 movaps %xmm1, -28 
* SIZE(Y) shufps $0x4e, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) shufps $0x4e, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, %xmm0 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L24: testl $8, M jle .L25 ALIGN_3 movaps -30 * SIZE(X), %xmm1 movaps -26 * SIZE(X), %xmm2 shufps $0x4e, %xmm1, %xmm0 shufps $0x4e, %xmm2, %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L25: testl $4, M jle .L26 ALIGN_3 movaps -30 * SIZE(X), %xmm1 shufps $0x4e, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L26: testl $2, M jle .L27 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd %xmm0, -32 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L27: testl $1, M jle .L29 ALIGN_3 movss -32 * SIZE(X), %xmm0 movss %xmm0, -32 * SIZE(Y) addl $SIZE, Y ALIGN_3 .L29: popl %ebx popl %esi popl %edi ret ALIGN_3 .L30: testl $2 * SIZE, X jne .L40 movaps -33 * SIZE(X), %xmm0 movl M, %eax sarl $5, %eax jle .L33 movaps -29 * SIZE(X), %xmm1 movaps -25 * SIZE(X), %xmm2 movaps -21 * SIZE(X), %xmm3 movaps -17 * SIZE(X), %xmm4 movaps -13 * SIZE(X), %xmm5 movaps -9 * SIZE(X), %xmm6 movaps -5 * SIZE(X), %xmm7 decl %eax jle .L32 ALIGN_4 .L31: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm1, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -1 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps 3 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm3, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps 7 * SIZE(X), %xmm2 movss %xmm4, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps 11 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 movaps %xmm4, -16 * SIZE(Y) movaps 15 * SIZE(X), %xmm4 movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 19 * SIZE(X), %xmm5 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm7, %xmm6 shufps $0x39, %xmm6, %xmm6 movaps %xmm6, -8 * SIZE(Y) movaps 23 * SIZE(X), %xmm6 movss %xmm0, %xmm7 shufps $0x39, %xmm7, %xmm7 movaps %xmm7, -4 * SIZE(Y) movaps 27 * SIZE(X), %xmm7 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L31 ALIGN_3 .L32: movss %xmm1, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -1 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -28 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -20 * SIZE(Y) movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 movaps %xmm4, -16 * SIZE(Y) movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 movaps %xmm5, -12 * SIZE(Y) movss %xmm7, %xmm6 shufps $0x39, %xmm6, %xmm6 movaps %xmm6, -8 * SIZE(Y) movss %xmm0, %xmm7 shufps $0x39, %xmm7, %xmm7 movaps %xmm7, -4 * SIZE(Y) subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L33: testl $16, M jle .L34 ALIGN_3 movaps -29 * SIZE(X), %xmm1 movaps -25 * SIZE(X), %xmm2 movaps -21 * SIZE(X), %xmm3 movaps -17 * SIZE(X), %xmm4 movss %xmm1, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -28 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, 
%xmm0 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L34: testl $8, M jle .L35 ALIGN_3 movaps -29 * SIZE(X), %xmm1 movaps -25 * SIZE(X), %xmm2 movss %xmm1, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L35: testl $4, M jle .L36 ALIGN_3 movaps -29 * SIZE(X), %xmm1 movss %xmm1, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L36: testl $2, M jle .L37 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd %xmm0, -32 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L37: testl $1, M jle .L39 ALIGN_3 movss -32 * SIZE(X), %xmm0 movss %xmm0, -32 * SIZE(Y) addl $SIZE, Y ALIGN_3 .L39: popl %ebx popl %esi popl %edi ret ALIGN_3 .L40: movaps -35 * SIZE(X), %xmm0 movl M, %eax sarl $5, %eax jle .L43 movaps -31 * SIZE(X), %xmm1 movaps -27 * SIZE(X), %xmm2 movaps -23 * SIZE(X), %xmm3 movaps -19 * SIZE(X), %xmm4 movaps -15 * SIZE(X), %xmm5 movaps -11 * SIZE(X), %xmm6 movaps -7 * SIZE(X), %xmm7 decl %eax jle .L42 ALIGN_4 .L41: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -3 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps 1 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps 5 * SIZE(X), %xmm2 movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps 9 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 movaps %xmm4, -16 * SIZE(Y) movaps 13 * SIZE(X), %xmm4 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 17 * SIZE(X), %xmm5 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 movaps %xmm6, -8 * SIZE(Y) movaps 21 * SIZE(X), %xmm6 movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 movaps %xmm7, -4 * SIZE(Y) movaps 25 * SIZE(X), %xmm7 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L41 ALIGN_3 .L42: movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -3 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 movaps %xmm4, -16 * SIZE(Y) movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 movaps %xmm5, -12 * SIZE(Y) movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 movaps %xmm6, -8 * SIZE(Y) movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 movaps %xmm7, -4 * SIZE(Y) subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L43: testl $16, M jle .L44 ALIGN_3 movaps -31 * SIZE(X), %xmm1 movaps -27 * SIZE(X), %xmm2 movaps -23 * SIZE(X), %xmm3 movaps -19 * SIZE(X), %xmm4 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, %xmm0 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L44: testl $8, M jle .L45 ALIGN_3 movaps -31 * 
SIZE(X), %xmm1 movaps -27 * SIZE(X), %xmm2 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L45: testl $4, M jle .L46 ALIGN_3 movaps -31 * SIZE(X), %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L46: testl $2, M jle .L47 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd %xmm0, -32 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L47: testl $1, M jle .L49 ALIGN_3 movss -32 * SIZE(X), %xmm0 movss %xmm0, -32 * SIZE(Y) addl $SIZE, Y ALIGN_3 .L49: popl %ebx popl %esi popl %edi ret ALIGN_4 .L50: movl M, %eax sarl $3, %eax jle .L55 ALIGN_3 .L51: movss (X), %xmm0 addl INCX, X movss (X), %xmm1 addl INCX, X movss (X), %xmm2 addl INCX, X movss (X), %xmm3 addl INCX, X movss (X), %xmm4 addl INCX, X movss (X), %xmm5 addl INCX, X movss (X), %xmm6 addl INCX, X movss (X), %xmm7 addl INCX, X movss %xmm0, (Y) addl INCY, Y movss %xmm1, (Y) addl INCY, Y movss %xmm2, (Y) addl INCY, Y movss %xmm3, (Y) addl INCY, Y movss %xmm4, (Y) addl INCY, Y movss %xmm5, (Y) addl INCY, Y movss %xmm6, (Y) addl INCY, Y movss %xmm7, (Y) addl INCY, Y decl %eax jg .L51 ALIGN_3 .L55: movl M, %eax andl $7, %eax jle .L57 ALIGN_3 .L56: movss (X), %xmm0 addl INCX, X movss %xmm0, (Y) addl INCY, Y decl %eax jg .L56 ALIGN_3 .L57: popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/copy_sse2.S000066400000000000000000000310411313527062700171020ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) #define M %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx #include "l1param.h" #ifdef OPTERON #define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG #else #define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG #endif PROLOGUE PROFCODE pushl %edi pushl %esi pushl %ebx movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY leal (, INCX, SIZE), INCX leal (, INCY, SIZE), INCY cmpl $SIZE, INCX jne .L40 cmpl $SIZE, INCY jne .L40 #ifdef ALIGNED_ACCESS testl $SIZE, Y #else testl $SIZE, X #endif je .L10 movsd (X), %xmm0 movsd %xmm0, (Y) addl $1 * SIZE, X addl $1 * SIZE, Y decl M jle .L19 ALIGN_4 .L10: subl $-16 * SIZE, X subl $-16 * SIZE, Y #ifdef ALIGNED_ACCESS testl $SIZE, X #else testl $SIZE, Y #endif jne .L20 movl M, %eax sarl $4, %eax jle .L13 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 movaps -8 * SIZE(X), %xmm4 movaps -6 * SIZE(X), %xmm5 movaps -4 * SIZE(X), %xmm6 movaps -2 * SIZE(X), %xmm7 decl %eax jle .L12 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps %xmm0, -16 * SIZE(Y) LOAD( 0 * SIZE, X, %xmm0) movaps %xmm1, -14 * SIZE(Y) LOAD( 2 * SIZE, X, %xmm1) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm2, -12 * SIZE(Y) LOAD( 4 * SIZE, X, %xmm2) movaps %xmm3, -10 * SIZE(Y) LOAD( 6 * SIZE, X, %xmm3) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps %xmm4, -8 * SIZE(Y) LOAD( 8 * SIZE, X, %xmm4) movaps %xmm5, -6 * SIZE(Y) LOAD(10 * SIZE, X, %xmm5) #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm6, -4 * SIZE(Y) LOAD(12 * SIZE, X, %xmm6) movaps %xmm7, -2 * SIZE(Y) LOAD(14 * SIZE, X, %xmm7) subl $-16 * SIZE, Y subl $-16 * SIZE, X decl %eax jg .L11 ALIGN_3 .L12: movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) movaps %xmm2, -12 * SIZE(Y) movaps %xmm3, -10 * SIZE(Y) movaps %xmm4, -8 * SIZE(Y) movaps %xmm5, -6 * SIZE(Y) movaps %xmm6, -4 * SIZE(Y) movaps %xmm7, -2 * SIZE(Y) subl $-16 * SIZE, Y subl $-16 * SIZE, X ALIGN_3 .L13: testl $8, M jle .L14 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) movaps %xmm2, -12 * SIZE(Y) movaps %xmm3, -10 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L14: testl $4, M jle .L15 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L15: testl $2, M jle .L16 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movaps %xmm0, -16 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L16: testl $1, M jle .L19 ALIGN_3 movsd -16 * SIZE(X), %xmm0 movsd %xmm0, -16 * SIZE(Y) ALIGN_3 .L19: popl %ebx popl %esi popl %edi ret ALIGN_3 .L20: #ifdef ALIGNED_ACCESS movhps -16 * SIZE(X), %xmm0 movl M, %eax sarl $4, %eax jle .L23 movaps -15 * SIZE(X), %xmm1 movaps -13 * SIZE(X), %xmm2 movaps -11 * SIZE(X), %xmm3 movaps -9 * SIZE(X), %xmm4 movaps -7 * SIZE(X), %xmm5 movaps -5 * SIZE(X), 
%xmm6 movaps -3 * SIZE(X), %xmm7 decl %eax jle .L22 ALIGN_4 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif SHUFPD_1 %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(Y) LOAD(-1 * SIZE, X, %xmm0) SHUFPD_1 %xmm2, %xmm1 movaps %xmm1, -14 * SIZE(Y) LOAD( 1 * SIZE, X, %xmm1) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif SHUFPD_1 %xmm3, %xmm2 movaps %xmm2, -12 * SIZE(Y) LOAD( 3 * SIZE, X, %xmm2) SHUFPD_1 %xmm4, %xmm3 movaps %xmm3, -10 * SIZE(Y) LOAD( 5 * SIZE, X, %xmm3) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif SHUFPD_1 %xmm5, %xmm4 movaps %xmm4, -8 * SIZE(Y) LOAD( 7 * SIZE, X, %xmm4) SHUFPD_1 %xmm6, %xmm5 movaps %xmm5, -6 * SIZE(Y) LOAD( 9 * SIZE, X, %xmm5) #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif SHUFPD_1 %xmm7, %xmm6 movaps %xmm6, -4 * SIZE(Y) LOAD(11 * SIZE, X, %xmm6) SHUFPD_1 %xmm0, %xmm7 movaps %xmm7, -2 * SIZE(Y) LOAD(13 * SIZE, X, %xmm7) subl $-16 * SIZE, X subl $-16 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L22: SHUFPD_1 %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(Y) LOAD(-1 * SIZE, X, %xmm0) SHUFPD_1 %xmm2, %xmm1 movaps %xmm1, -14 * SIZE(Y) SHUFPD_1 %xmm3, %xmm2 movaps %xmm2, -12 * SIZE(Y) SHUFPD_1 %xmm4, %xmm3 movaps %xmm3, -10 * SIZE(Y) SHUFPD_1 %xmm5, %xmm4 movaps %xmm4, -8 * SIZE(Y) SHUFPD_1 %xmm6, %xmm5 movaps %xmm5, -6 * SIZE(Y) SHUFPD_1 %xmm7, %xmm6 movaps %xmm6, -4 * SIZE(Y) SHUFPD_1 %xmm0, %xmm7 movaps %xmm7, -2 * SIZE(Y) subl $-16 * SIZE, X subl $-16 * SIZE, Y ALIGN_3 .L23: testl $8, M jle .L24 ALIGN_3 movaps -15 * SIZE(X), %xmm1 movaps -13 * SIZE(X), %xmm2 movaps -11 * SIZE(X), %xmm3 movaps -9 * SIZE(X), %xmm4 SHUFPD_1 %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm2, %xmm1 movaps %xmm1, -14 * SIZE(Y) SHUFPD_1 %xmm3, %xmm2 movaps %xmm2, -12 * SIZE(Y) SHUFPD_1 %xmm4, %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps %xmm4, %xmm0 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L24: testl $4, M jle .L25 ALIGN_3 movaps -15 * SIZE(X), %xmm1 movaps -13 * SIZE(X), %xmm2 SHUFPD_1 %xmm1, %xmm0 SHUFPD_1 %xmm2, %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) movaps %xmm2, %xmm0 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L25: testl $2, M jle .L26 ALIGN_3 movaps -15 * SIZE(X), %xmm1 SHUFPD_1 %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L26: testl $1, M jle .L29 ALIGN_3 movsd -16 * SIZE(X), %xmm0 movsd %xmm0, -16 * SIZE(Y) ALIGN_3 .L29: popl %ebx popl %esi popl %edi ret ALIGN_3 #else movl M, %eax sarl $4, %eax jle .L23 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 movaps -8 * SIZE(X), %xmm4 movaps -6 * SIZE(X), %xmm5 movaps -4 * SIZE(X), %xmm6 movaps -2 * SIZE(X), %xmm7 decl %eax jle .L22 ALIGN_3 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) LOAD( 0 * SIZE, X, %xmm0) movlps %xmm1, -14 * SIZE(Y) movhps %xmm1, -13 * SIZE(Y) LOAD( 2 * SIZE, X, %xmm1) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movlps %xmm2, -12 * SIZE(Y) movhps %xmm2, -11 * SIZE(Y) LOAD( 4 * SIZE, X, %xmm2) movlps %xmm3, -10 * SIZE(Y) movhps %xmm3, -9 * SIZE(Y) LOAD( 6 * SIZE, X, %xmm3) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movlps %xmm4, -8 * SIZE(Y) movhps %xmm4, -7 * SIZE(Y) LOAD( 8 * SIZE, X, %xmm4) movlps %xmm5, -6 * SIZE(Y) movhps %xmm5, -5 * SIZE(Y) LOAD(10 * SIZE, X, %xmm5) #if defined(PREFETCH) && 
!defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movlps %xmm6, -4 * SIZE(Y) movhps %xmm6, -3 * SIZE(Y) LOAD(12 * SIZE, X, %xmm6) movlps %xmm7, -2 * SIZE(Y) movhps %xmm7, -1 * SIZE(Y) LOAD(14 * SIZE, X, %xmm7) subl $-16 * SIZE, Y subl $-16 * SIZE, X decl %eax jg .L21 ALIGN_3 .L22: movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) movlps %xmm1, -14 * SIZE(Y) movhps %xmm1, -13 * SIZE(Y) movlps %xmm2, -12 * SIZE(Y) movhps %xmm2, -11 * SIZE(Y) movlps %xmm3, -10 * SIZE(Y) movhps %xmm3, -9 * SIZE(Y) movlps %xmm4, -8 * SIZE(Y) movhps %xmm4, -7 * SIZE(Y) movlps %xmm5, -6 * SIZE(Y) movhps %xmm5, -5 * SIZE(Y) movlps %xmm6, -4 * SIZE(Y) movhps %xmm6, -3 * SIZE(Y) movlps %xmm7, -2 * SIZE(Y) movhps %xmm7, -1 * SIZE(Y) subl $-16 * SIZE, Y subl $-16 * SIZE, X ALIGN_3 .L23: testl $8, M jle .L24 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) movaps -14 * SIZE(X), %xmm1 movlps %xmm1, -14 * SIZE(Y) movhps %xmm1, -13 * SIZE(Y) movaps -12 * SIZE(X), %xmm2 movlps %xmm2, -12 * SIZE(Y) movhps %xmm2, -11 * SIZE(Y) movaps -10 * SIZE(X), %xmm3 movlps %xmm3, -10 * SIZE(Y) movhps %xmm3, -9 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L24: testl $4, M jle .L25 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) movaps -14 * SIZE(X), %xmm1 movlps %xmm1, -14 * SIZE(Y) movhps %xmm1, -13 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L25: testl $2, M jle .L26 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L26: testl $1, M jle .L29 ALIGN_3 movsd -16 * SIZE(X), %xmm0 movsd %xmm0, -16 * SIZE(Y) ALIGN_3 .L29: popl %ebx popl %esi popl %edi ret ALIGN_3 #endif .L40: movl M, %eax sarl $3, %eax jle .L45 ALIGN_3 .L41: movsd (X), %xmm0 addl INCX, X movhps (X), %xmm0 addl INCX, X movsd (X), %xmm1 addl INCX, X movhps (X), %xmm1 addl INCX, X movsd (X), %xmm2 addl INCX, X movhps (X), %xmm2 addl INCX, X movsd (X), %xmm3 addl INCX, X movhps (X), %xmm3 addl INCX, X movlps %xmm0, (Y) addl INCY, Y movhps %xmm0, (Y) addl INCY, Y movlps %xmm1, (Y) addl INCY, Y movhps %xmm1, (Y) addl INCY, Y movlps %xmm2, (Y) addl INCY, Y movhps %xmm2, (Y) addl INCY, Y movlps %xmm3, (Y) addl INCY, Y movhps %xmm3, (Y) addl INCY, Y decl %eax jg .L41 ALIGN_3 .L45: movl M, %eax andl $7, %eax jle .L47 ALIGN_3 .L46: movsd (X), %xmm0 addl INCX, X movlps %xmm0, (Y) addl INCY, Y decl %eax jg .L46 ALIGN_3 .L47: popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/cpuid.S000066400000000000000000000056421313527062700163100ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" PROLOGUE PROFCODE pushl %esi pushl %ebx movl 12(%esp), %eax cpuid movl 16(%esp), %esi movl %eax, (%esi) movl 20(%esp), %esi movl %ebx, (%esi) movl 24(%esp), %esi movl %ecx, (%esi) movl 28(%esp), %esi movl %edx, (%esi) popl %ebx popl %esi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/cpuid_win.c000066400000000000000000000035701313527062700172030ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #if defined(_MSC_VER) && !defined(__clang__) #include <intrin.h> void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx) { int cpuInfo[4] = {-1}; __cpuid(cpuInfo, op); *eax = cpuInfo[0]; *ebx = cpuInfo[1]; *ecx = cpuInfo[2]; *edx = cpuInfo[3]; } #endif OpenBLAS-0.2.20/kernel/x86/dot.S000066400000000000000000000113651313527062700157710ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin.
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) #define N %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx PROLOGUE pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif movl STACK_N, N movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY #ifdef F_INTERFACE movl (N),N movl (INCX),INCX movl (INCY),INCY #endif leal (, INCX, SIZE), INCX leal (, INCY, SIZE), INCY fldz fldz fldz fldz cmpl $SIZE, INCX jne .L14 cmpl $SIZE, INCY jne .L14 movl N, %eax sarl $2, %eax jle .L15 ALIGN_3 .L16: FLD 0 * SIZE(X) FMUL 0 * SIZE(Y) faddp %st,%st(1) FLD 1 * SIZE(X) FMUL 1 * SIZE(Y) faddp %st,%st(2) FLD 2 * SIZE(X) FMUL 2 * SIZE(Y) faddp %st,%st(3) FLD 3 * SIZE(X) FMUL 3 * SIZE(Y) faddp %st,%st(4) addl $4 * SIZE, X addl $4 * SIZE, Y decl %eax jg .L16 ALIGN_3 .L15: movl N, %eax andl $3, %eax jle .L27 ALIGN_3 .L22: FLD (X) addl $SIZE, X FMUL (Y) addl $SIZE, Y faddp %st,%st(1) decl %eax jg .L22 jmp .L27 ALIGN_3 .L14: #ifdef F_INTERFACE testl INCX, INCX jge .L28 movl N, %eax decl %eax imull INCX, %eax subl %eax, X ALIGN_3 .L28: testl INCY, INCY jge .L29 movl N, %eax decl %eax imull INCY, %eax subl %eax, Y ALIGN_3 .L29: #endif movl N, %eax sarl $2, %eax jle .L30 ALIGN_3 .L31: FLD (X) addl INCX, X FMUL (Y) addl INCY, Y faddp %st,%st(1) FLD (X) addl INCX, X FMUL (Y) addl INCY, Y faddp %st,%st(2) FLD (X) addl INCX, X FMUL (Y) addl INCY, Y faddp %st,%st(3) FLD (X) addl INCX, X FMUL (Y) addl INCY, Y faddp %st,%st(4) decl %eax jg .L31 ALIGN_3 .L30: movl N, %eax andl $3, %eax jle .L27 ALIGN_3 .L37: FLD (X) addl INCX, X FMUL (Y) addl INCY, Y faddp %st, %st(1) decl %eax jg .L37 ALIGN_3 .L27: faddp %st,%st(2) faddp %st,%st(2) faddp %st,%st(1) popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/dot_amd.S000066400000000000000000000120011313527062700165760ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) PROLOGUE pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif #define N %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx movl STACK_N, N movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY #ifdef F_INTERFACE movl (N),N movl (INCX),INCX movl (INCY),INCY #endif leal (, INCX, SIZE), INCX leal (, INCY, SIZE), INCY fldz fldz fldz fldz cmpl $SIZE, INCX jne .L14 cmpl $SIZE, INCY jne .L14 movl N, %eax sarl $3, %eax jle .L15 FLD 0 * SIZE(X) ALIGN_4 .L16: FLD 1 * SIZE(X) FMUL 1 * SIZE(Y) faddp %st,%st(2) FMUL 0 * SIZE(Y) faddp %st,%st(2) FLD 2 * SIZE(X) FLD 3 * SIZE(X) FMUL 3 * SIZE(Y) faddp %st,%st(4) FMUL 2 * SIZE(Y) faddp %st,%st(4) FLD 4 * SIZE(X) FLD 5 * SIZE(X) FMUL 5 * SIZE(Y) faddp %st,%st(2) FMUL 4 * SIZE(Y) faddp %st,%st(2) FLD 6 * SIZE(X) FLD 7 * SIZE(X) FMUL 7 * SIZE(Y) faddp %st,%st(4) FMUL 6 * SIZE(Y) faddp %st,%st(4) FLD 8 * SIZE(X) prefetch 16 * SIZE(X) addl $8 * SIZE, X addl $8 * SIZE, Y decl %eax jg .L16 ffreep %st(0) ALIGN_3 .L15: movl N, %eax andl $7, %eax jle .L27 ALIGN_3 .L22: FLD (X) addl $SIZE, X FMUL (Y) addl $SIZE, Y faddp %st,%st(1) decl %eax jg .L22 jmp .L27 ALIGN_3 .L14: #ifdef F_INTERFACE testl INCX, INCX jge .L28 movl N, %eax decl %eax imull INCX, %eax subl %eax, X ALIGN_3 .L28: testl INCY, INCY jge .L29 movl N, %eax decl %eax imull INCY, %eax subl %eax, Y ALIGN_3 .L29: #endif movl N, %eax sarl $2, %eax jle .L30 ALIGN_3 .L31: FLD (X) addl INCX, X FMUL (Y) addl INCY, Y faddp %st,%st(1) FLD (X) addl INCX, X FMUL (Y) addl INCY, Y faddp %st,%st(2) FLD (X) addl INCX, X FMUL (Y) addl INCY, Y faddp %st,%st(3) FLD (X) addl INCX, X FMUL (Y) addl INCY, Y faddp %st,%st(4) decl %eax jg .L31 ALIGN_3 .L30: movl N, %eax andl $3, %eax jle .L27 ALIGN_3 .L37: FLD (X) addl INCX, X FMUL (Y) addl INCY, Y faddp %st, %st(1) decl %eax jg .L37 ALIGN_3 .L27: faddp %st,%st(2) faddp %st,%st(2) faddp %st,%st(1) popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/dot_sse.S000066400000000000000000000566341313527062700166530ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) #define N %ecx #define X %esi #define INCX %ebx #define Y %edi #define INCY %edx #include "l1param.h" PROLOGUE PROFCODE pushl %edi pushl %esi pushl %ebx movl STACK_N, N movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY #ifdef F_INTERFACE movl (N), N # N movl (INCX),INCX # INCX movl (INCY),INCY # INCY #endif leal (, INCX, SIZE), INCX leal (, INCY, SIZE), INCY xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 cmpl $0, N jle .L999 cmpl $SIZE, INCX jne .L50 cmpl $SIZE, INCY jne .L50 subl $-32 * SIZE, X subl $-32 * SIZE, Y cmpl $3, N jle .L17 testl $SIZE, Y je .L05 movss -32 * SIZE(X), %xmm0 mulss -32 * SIZE(Y), %xmm0 addl $1 * SIZE, X addl $1 * SIZE, Y decl N ALIGN_2 .L05: testl $2 * SIZE, Y je .L10 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(Y), %xmm1 mulps %xmm4, %xmm1 addl $2 * SIZE, X addl $2 * SIZE, Y subl $2, N jle .L999 ALIGN_2 .L10: #ifdef ALIGNED_ACCESS testl $2 * SIZE, X jne .L30 testl $SIZE, X jne .L20 #else testl $3 * SIZE, X jne .L20 #endif movl N, %eax sarl $5, %eax jle .L14 movaps -32 * SIZE(X), %xmm4 movaps -28 * SIZE(X), %xmm5 movaps -24 * SIZE(X), %xmm6 movaps -20 * SIZE(X), %xmm7 decl %eax jle .L12 ALIGN_3 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(X), %xmm4 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps -12 * SIZE(X), %xmm5 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - 
PREOFFSET(Y) #endif mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movaps -8 * SIZE(X), %xmm6 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movaps -4 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif mulps -16 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 mulps -12 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps 4 * SIZE(X), %xmm5 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif mulps -8 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movaps 8 * SIZE(X), %xmm6 mulps -4 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movaps 12 * SIZE(X), %xmm7 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L11 ALIGN_3 .L12: mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(X), %xmm4 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps -12 * SIZE(X), %xmm5 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movaps -8 * SIZE(X), %xmm6 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movaps -4 * SIZE(X), %xmm7 mulps -16 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 mulps -12 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 mulps -8 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 mulps -4 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L14: testl $31, N jle .L999 testl $16, N jle .L15 movaps -32 * SIZE(X), %xmm4 movaps -28 * SIZE(X), %xmm5 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps -24 * SIZE(X), %xmm6 movaps -20 * SIZE(X), %xmm7 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L15: testl $8, N jle .L16 movaps -32 * SIZE(X), %xmm4 movaps -28 * SIZE(X), %xmm5 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L16: testl $4, N jle .L17 movaps -32 * SIZE(X), %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm2 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L17: testl $2, N jle .L18 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 #ifdef movsd xorps %xmm6, %xmm6 #endif movsd -32 * SIZE(Y), %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm3 addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L18: testl $1, N jle .L999 movss -32 * SIZE(X), %xmm4 mulss -32 * SIZE(Y), %xmm4 addss %xmm4, %xmm0 jmp .L999 ALIGN_3 #ifdef ALIGNED_ACCESS .L20: movaps -33 * SIZE(X), %xmm4 addl $3 * SIZE, X movl N, %eax sarl $5, %eax jle .L24 movaps -32 * SIZE(X), %xmm5 movaps -28 * SIZE(X), %xmm6 movaps -24 * SIZE(X), %xmm7 decl %eax jle .L22 ALIGN_3 .L21: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm5, %xmm4 PSHUFD1($0x39, %xmm4) mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(X), %xmm4 movss %xmm6, %xmm5 PSHUFD1($0x39, %xmm5) mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps -16 * SIZE(X), %xmm5 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm7, %xmm6 PSHUFD1($0x39, %xmm6) mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movaps -12 * SIZE(X), %xmm6 movss %xmm4, %xmm7 PSHUFD1($0x39, %xmm7) mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movaps -8 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm5, %xmm4 PSHUFD1($0x39, %xmm4) mulps -16 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -4 * SIZE(X), %xmm4 movss %xmm6, %xmm5 PSHUFD1($0x39, %xmm5) mulps -12 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps 0 * SIZE(X), %xmm5 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH 
(PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm7, %xmm6 PSHUFD1($0x39, %xmm6) mulps -8 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movaps 4 * SIZE(X), %xmm6 movss %xmm4, %xmm7 PSHUFD1($0x39, %xmm7) mulps -4 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movaps 8 * SIZE(X), %xmm7 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L22: movss %xmm5, %xmm4 PSHUFD1($0x39, %xmm4) mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(X), %xmm4 movss %xmm6, %xmm5 PSHUFD1($0x39, %xmm5) mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps -16 * SIZE(X), %xmm5 movss %xmm7, %xmm6 PSHUFD1($0x39, %xmm6) mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movaps -12 * SIZE(X), %xmm6 movss %xmm4, %xmm7 PSHUFD1($0x39, %xmm7) mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movaps -8 * SIZE(X), %xmm7 movss %xmm5, %xmm4 PSHUFD1($0x39, %xmm4) mulps -16 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -4 * SIZE(X), %xmm4 movss %xmm6, %xmm5 PSHUFD1($0x39, %xmm5) mulps -12 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movss %xmm7, %xmm6 PSHUFD1($0x39, %xmm6) mulps -8 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movss %xmm4, %xmm7 PSHUFD1($0x39, %xmm7) mulps -4 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L24: testl $31, N jle .L999 testl $16, N jle .L25 movaps -32 * SIZE(X), %xmm5 movaps -28 * SIZE(X), %xmm6 movaps -24 * SIZE(X), %xmm7 movss %xmm5, %xmm4 PSHUFD1($0x39, %xmm4) mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(X), %xmm4 movss %xmm6, %xmm5 PSHUFD1($0x39, %xmm5) mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movss %xmm7, %xmm6 PSHUFD1($0x39, %xmm6) mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movss %xmm4, %xmm7 PSHUFD1($0x39, %xmm7) mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L25: testl $8, N jle .L26 movaps -32 * SIZE(X), %xmm5 movaps -28 * SIZE(X), %xmm6 movss %xmm5, %xmm4 PSHUFD1($0x39, %xmm4) mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movss %xmm6, %xmm5 PSHUFD1($0x39, %xmm5) mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps %xmm6, %xmm4 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L26: testl $4, N jle .L27 movaps -32 * SIZE(X), %xmm5 movss %xmm5, %xmm4 PSHUFD1($0x39, %xmm4) mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm2 movaps %xmm5, %xmm4 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L27: testl $2, N jle .L28 #ifdef movsd xorps %xmm6, %xmm6 #endif movsd -32 * SIZE(Y), %xmm6 PSHUFD2($0x39, %xmm4, %xmm5) mulps %xmm6, %xmm5 addps %xmm5, %xmm3 movhlps %xmm4, %xmm4 addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L28: testl $1, N jle .L999 PSHUFD1($0x39, %xmm4) mulss -32 * SIZE(Y), %xmm4 addss %xmm4, %xmm0 jmp .L999 ALIGN_3 .L30: testl $SIZE, X jne .L40 movhps -32 * SIZE(X), %xmm4 addl $2 * SIZE, X movl N, %eax sarl $5, %eax jle .L34 movaps -32 * SIZE(X), %xmm5 movaps -28 * SIZE(X), %xmm6 movaps -24 * SIZE(X), %xmm7 decl %eax jle .L32 ALIGN_3 .L31: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif SHUFPD_1 %xmm5, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(X), %xmm4 SHUFPD_1 %xmm6, %xmm5 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps -16 * SIZE(X), %xmm5 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif SHUFPD_1 %xmm7, %xmm6 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movaps -12 * SIZE(X), %xmm6 SHUFPD_1 %xmm4, %xmm7 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movaps -8 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif SHUFPD_1 %xmm5, %xmm4 mulps -16 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps 
-4 * SIZE(X), %xmm4 SHUFPD_1 %xmm6, %xmm5 mulps -12 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps 0 * SIZE(X), %xmm5 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif SHUFPD_1 %xmm7, %xmm6 mulps -8 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movaps 4 * SIZE(X), %xmm6 SHUFPD_1 %xmm4, %xmm7 mulps -4 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movaps 8 * SIZE(X), %xmm7 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L31 ALIGN_3 .L32: SHUFPD_1 %xmm5, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(X), %xmm4 SHUFPD_1 %xmm6, %xmm5 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps -16 * SIZE(X), %xmm5 SHUFPD_1 %xmm7, %xmm6 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movaps -12 * SIZE(X), %xmm6 SHUFPD_1 %xmm4, %xmm7 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movaps -8 * SIZE(X), %xmm7 SHUFPD_1 %xmm5, %xmm4 mulps -16 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -4 * SIZE(X), %xmm4 SHUFPD_1 %xmm6, %xmm5 mulps -12 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 SHUFPD_1 %xmm7, %xmm6 mulps -8 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 SHUFPD_1 %xmm4, %xmm7 mulps -4 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L34: testl $31, N jle .L999 testl $16, N jle .L35 movaps -32 * SIZE(X), %xmm5 movaps -28 * SIZE(X), %xmm6 movaps -24 * SIZE(X), %xmm7 SHUFPD_1 %xmm5, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(X), %xmm4 SHUFPD_1 %xmm6, %xmm5 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 SHUFPD_1 %xmm7, %xmm6 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 SHUFPD_1 %xmm4, %xmm7 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L35: testl $8, N jle .L36 movaps -32 * SIZE(X), %xmm5 movaps -28 * SIZE(X), %xmm6 SHUFPD_1 %xmm5, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 SHUFPD_1 %xmm6, %xmm5 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps %xmm6, %xmm4 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L36: testl $4, N jle .L37 movaps -32 * SIZE(X), %xmm5 SHUFPD_1 %xmm5, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps %xmm5, %xmm4 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L37: testl $2, N jle .L38 xorps %xmm5, %xmm5 movhlps %xmm4, %xmm5 mulps -32 * SIZE(Y), %xmm5 addps %xmm5, %xmm0 addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L38: testl $1, N jle .L999 movss -34 * SIZE(X), %xmm4 mulss -32 * SIZE(Y), %xmm4 addss %xmm4, %xmm0 jmp .L999 ALIGN_3 .L40: movaps -35 * SIZE(X), %xmm4 addl $SIZE, X movl N, %eax sarl $5, %eax jle .L44 movaps -32 * SIZE(X), %xmm5 movaps -28 * SIZE(X), %xmm6 movaps -24 * SIZE(X), %xmm7 decl %eax jle .L42 ALIGN_3 .L41: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(X), %xmm4 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps -16 * SIZE(X), %xmm5 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movaps -12 * SIZE(X), %xmm6 movss %xmm4, %xmm7 shufps $0x93, %xmm4, %xmm7 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movaps -8 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps -16 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -4 * SIZE(X), %xmm4 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 mulps -12 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps 0 * 
SIZE(X), %xmm5 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 mulps -8 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movaps 4 * SIZE(X), %xmm6 movss %xmm4, %xmm7 shufps $0x93, %xmm4, %xmm7 mulps -4 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movaps 8 * SIZE(X), %xmm7 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L41 ALIGN_3 .L42: movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(X), %xmm4 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps -16 * SIZE(X), %xmm5 movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movaps -12 * SIZE(X), %xmm6 movss %xmm4, %xmm7 shufps $0x93, %xmm4, %xmm7 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movaps -8 * SIZE(X), %xmm7 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps -16 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -4 * SIZE(X), %xmm4 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 mulps -12 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 mulps -8 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movss %xmm4, %xmm7 shufps $0x93, %xmm4, %xmm7 mulps -4 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L44: testl $31, N jle .L999 testl $16, N jle .L45 movaps -32 * SIZE(X), %xmm5 movaps -28 * SIZE(X), %xmm6 movaps -24 * SIZE(X), %xmm7 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(X), %xmm4 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movss %xmm4, %xmm7 shufps $0x93, %xmm4, %xmm7 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L45: testl $8, N jle .L46 movaps -32 * SIZE(X), %xmm5 movaps -28 * SIZE(X), %xmm6 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps %xmm6, %xmm4 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L46: testl $4, N jle .L47 movaps -32 * SIZE(X), %xmm5 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm2 movaps %xmm5, %xmm4 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L47: testl $2, N jle .L48 movaps -32 * SIZE(X), %xmm5 #ifdef movsd xorps %xmm7, %xmm7 #endif movsd -32 * SIZE(Y), %xmm7 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps %xmm7, %xmm4 addps %xmm4, %xmm3 movlhps %xmm5, %xmm4 addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L48: testl $1, N jle .L999 PSHUFD1($0x93, %xmm4) mulss -32 * SIZE(Y), %xmm4 addss %xmm4, %xmm0 jmp .L999 ALIGN_4 #else .L20: movl N, %eax sarl $5, %eax jle .L24 movlps -32 * SIZE(X), %xmm4 movhps -30 * SIZE(X), %xmm4 movlps -28 * SIZE(X), %xmm5 movhps -26 * SIZE(X), %xmm5 movlps -24 * SIZE(X), %xmm6 movhps -22 * SIZE(X), %xmm6 movlps -20 * SIZE(X), %xmm7 movhps -18 * SIZE(X), %xmm7 decl %eax jle .L22 ALIGN_3 .L21: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movlps -16 * SIZE(X), %xmm4 movhps -14 * SIZE(X), %xmm4 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movlps -12 * SIZE(X), %xmm5 movhps -10 * SIZE(X), %xmm5 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movlps -8 * SIZE(X), %xmm6 
movhps -6 * SIZE(X), %xmm6 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movlps -4 * SIZE(X), %xmm7 movhps -2 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif mulps -16 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movlps 0 * SIZE(X), %xmm4 movhps 2 * SIZE(X), %xmm4 mulps -12 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movlps 4 * SIZE(X), %xmm5 movhps 6 * SIZE(X), %xmm5 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif mulps -8 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movlps 8 * SIZE(X), %xmm6 movhps 10 * SIZE(X), %xmm6 mulps -4 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movlps 12 * SIZE(X), %xmm7 movhps 14 * SIZE(X), %xmm7 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L22: mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movlps -16 * SIZE(X), %xmm4 movhps -14 * SIZE(X), %xmm4 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movlps -12 * SIZE(X), %xmm5 movhps -10 * SIZE(X), %xmm5 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movlps -8 * SIZE(X), %xmm6 movhps -6 * SIZE(X), %xmm6 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movlps -4 * SIZE(X), %xmm7 movhps -2 * SIZE(X), %xmm7 mulps -16 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 mulps -12 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 mulps -8 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 mulps -4 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L24: testl $31, N jle .L999 testl $16, N jle .L25 movlps -32 * SIZE(X), %xmm4 movhps -30 * SIZE(X), %xmm4 movlps -28 * SIZE(X), %xmm5 movhps -26 * SIZE(X), %xmm5 movlps -24 * SIZE(X), %xmm6 movhps -22 * SIZE(X), %xmm6 movlps -20 * SIZE(X), %xmm7 movhps -18 * SIZE(X), %xmm7 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L25: testl $8, N jle .L26 movlps -32 * SIZE(X), %xmm4 movhps -30 * SIZE(X), %xmm4 movlps -28 * SIZE(X), %xmm5 movhps -26 * SIZE(X), %xmm5 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L26: testl $4, N jle .L27 movlps -32 * SIZE(X), %xmm4 movhps -30 * SIZE(X), %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm2 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L27: testl $2, N jle .L28 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 #ifdef movsd xorps %xmm6, %xmm6 #endif movsd -32 * SIZE(Y), %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm3 addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L28: testl $1, N jle .L999 movss -32 * SIZE(X), %xmm4 mulss -32 * SIZE(Y), %xmm4 addss %xmm4, %xmm0 jmp .L999 ALIGN_3 #endif .L50: movl N, %eax sarl $2, %eax jle .L55 ALIGN_3 .L53: movss 0 * SIZE(X), %xmm4 addl INCX, X mulss 0 * SIZE(Y), %xmm4 addl INCY, Y movss 0 * SIZE(X), %xmm5 addl INCX, X mulss 0 * SIZE(Y), %xmm5 addl INCY, Y movss 0 * SIZE(X), %xmm6 addl INCX, X mulss 0 * SIZE(Y), %xmm6 addl INCY, Y movss 0 * SIZE(X), %xmm7 addl INCX, X mulss 0 * SIZE(Y), %xmm7 addl INCY, Y addss %xmm4, %xmm0 addss %xmm5, %xmm1 addss %xmm6, %xmm2 addss %xmm7, %xmm3 decl %eax jg .L53 ALIGN_3 .L55: movl N, %eax andl $3, %eax jle .L999 ALIGN_3 .L56: movss 0 * SIZE(X), %xmm4 addl INCX, X mulss 0 * SIZE(Y), %xmm4 addl INCY, Y addss %xmm4, %xmm0 decl %eax jg .L56 ALIGN_3 .L999: addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm2, %xmm0 #if defined(HAVE_SSE3) && !defined(__INTERIX) haddps %xmm0, %xmm0 haddps %xmm0, %xmm0 #elif defined(HAVE_SSE2) 
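/* Descriptive note (editor-added, not in upstream source): SSE2-only reduction path.  */
/* Fold the upper two lanes of %xmm0 onto the lower two (movhlps + addps), then add    */
/* the remaining pair (PSHUFD2 + addss) to collapse the four partial sums into the     */
/* scalar single-precision dot product returned in %xmm0.                              */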
movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 PSHUFD2($1, %xmm0, %xmm1) addss %xmm1, %xmm0 #else movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 addss %xmm1, %xmm0 #endif movss %xmm0, STACK_N flds STACK_N popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/dot_sse2.S000066400000000000000000000334661313527062700167330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) #define N %ecx #define X %esi #define INCX %ebx #define Y %edi #define INCY %edx #include "l1param.h" PROLOGUE PROFCODE pushl %edi pushl %esi pushl %ebx movl STACK_N, N movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY leal (, INCX, SIZE), INCX leal (, INCY, SIZE), INCY xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 cmpl $0, N jle .L999 cmpl $SIZE, INCX jne .L50 cmpl $SIZE, INCY jne .L50 subl $-16 * SIZE, X subl $-16 * SIZE, Y testl $SIZE, Y je .L10 movsd -16 * SIZE(X), %xmm0 mulsd -16 * SIZE(Y), %xmm0 addl $1 * SIZE, X addl $1 * SIZE, Y decl N ALIGN_2 .L10: testl $SIZE, X jne .L20 movl N, %eax sarl $4, %eax jle .L14 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -12 * SIZE(X), %xmm6 movaps -10 * SIZE(X), %xmm7 decl %eax jle .L12 ALIGN_3 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 movaps -8 * SIZE(X), %xmm4 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 movaps -6 * SIZE(X), %xmm5 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif mulpd -12 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 movaps -4 * SIZE(X), %xmm6 mulpd -10 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 movaps -2 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif mulpd -8 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 mulpd -6 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 movaps 2 * SIZE(X), %xmm5 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif mulpd -4 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 movaps 4 * SIZE(X), %xmm6 mulpd -2 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 movaps 6 * SIZE(X), %xmm7 subl $-16 * SIZE, X subl $-16 * SIZE, Y decl %eax jg .L11 ALIGN_3 .L12: mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 movaps -8 * SIZE(X), %xmm4 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 movaps -6 * SIZE(X), %xmm5 mulpd -12 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 movaps -4 * SIZE(X), %xmm6 mulpd -10 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 movaps -2 * SIZE(X), %xmm7 mulpd -8 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 mulpd -6 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 mulpd -4 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 mulpd -2 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 subl $-16 * SIZE, X subl $-16 * SIZE, Y ALIGN_3 .L14: testl $15, N jle .L999 testl $8, N jle .L15 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -12 * SIZE(X), %xmm6 movaps -10 * SIZE(X), %xmm7 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 mulpd -12 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 mulpd -10 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L15: testl $4, N jle .L16 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L16: testl $2, N jle .L17 movaps -16 * SIZE(X), %xmm4 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L17: testl $1, N jle .L999 movsd -16 * SIZE(X), %xmm4 mulsd -16 * SIZE(Y), %xmm4 addsd %xmm4, %xmm0 jmp .L999 ALIGN_3 .L20: #ifdef 
ALIGNED_ACCESS movhps -16 * SIZE(X), %xmm4 addl $SIZE, X movl N, %eax sarl $4, %eax jle .L24 movaps -16 * SIZE(X), %xmm5 movaps -14 * SIZE(X), %xmm6 movaps -12 * SIZE(X), %xmm7 decl %eax jle .L22 ALIGN_3 .L21: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif SHUFPD_1 %xmm5, %xmm4 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 movaps -10 * SIZE(X), %xmm4 SHUFPD_1 %xmm6, %xmm5 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 movaps -8 * SIZE(X), %xmm5 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif SHUFPD_1 %xmm7, %xmm6 mulpd -12 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 movaps -6 * SIZE(X), %xmm6 SHUFPD_1 %xmm4, %xmm7 mulpd -10 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 movaps -4 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif SHUFPD_1 %xmm5, %xmm4 mulpd -8 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 movaps -2 * SIZE(X), %xmm4 SHUFPD_1 %xmm6, %xmm5 mulpd -6 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 movaps 0 * SIZE(X), %xmm5 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif SHUFPD_1 %xmm7, %xmm6 mulpd -4 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 movaps 2 * SIZE(X), %xmm6 SHUFPD_1 %xmm4, %xmm7 mulpd -2 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 movaps 4 * SIZE(X), %xmm7 subl $-16 * SIZE, X subl $-16 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L22: SHUFPD_1 %xmm5, %xmm4 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 movaps -10 * SIZE(X), %xmm4 SHUFPD_1 %xmm6, %xmm5 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 movaps -8 * SIZE(X), %xmm5 SHUFPD_1 %xmm7, %xmm6 mulpd -12 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 movaps -6 * SIZE(X), %xmm6 SHUFPD_1 %xmm4, %xmm7 mulpd -10 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 movaps -4 * SIZE(X), %xmm7 SHUFPD_1 %xmm5, %xmm4 mulpd -8 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 movaps -2 * SIZE(X), %xmm4 SHUFPD_1 %xmm6, %xmm5 mulpd -6 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 SHUFPD_1 %xmm7, %xmm6 mulpd -4 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 SHUFPD_1 %xmm4, %xmm7 mulpd -2 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 subl $-16 * SIZE, X subl $-16 * SIZE, Y ALIGN_3 .L24: testl $15, N jle .L999 testl $8, N jle .L25 movaps -16 * SIZE(X), %xmm5 movaps -14 * SIZE(X), %xmm6 movaps -12 * SIZE(X), %xmm7 SHUFPD_1 %xmm5, %xmm4 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 movaps -10 * SIZE(X), %xmm4 SHUFPD_1 %xmm6, %xmm5 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 SHUFPD_1 %xmm7, %xmm6 mulpd -12 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 SHUFPD_1 %xmm4, %xmm7 mulpd -10 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L25: testl $4, N jle .L26 movaps -16 * SIZE(X), %xmm5 movaps -14 * SIZE(X), %xmm6 SHUFPD_1 %xmm5, %xmm4 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 SHUFPD_1 %xmm6, %xmm5 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 movapd %xmm6, %xmm4 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L26: testl $2, N jle .L27 movaps -16 * SIZE(X), %xmm5 SHUFPD_1 %xmm5, %xmm4 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 movapd %xmm5, %xmm4 addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L27: testl $1, N jle .L999 SHUFPD_1 %xmm4, %xmm4 mulsd -16 * SIZE(Y), %xmm4 addsd %xmm4, %xmm0 jmp .L999 ALIGN_3 #else movl N, %eax sarl $4, %eax jle .L24 movlps -16 * SIZE(X), %xmm4 movhps -15 * SIZE(X), %xmm4 movlps -14 * SIZE(X), %xmm5 movhps -13 * SIZE(X), %xmm5 movlps -12 * SIZE(X), %xmm6 movhps -11 * SIZE(X), %xmm6 movlps -10 * SIZE(X), %xmm7 movhps -9 * SIZE(X), %xmm7 decl %eax jle .L22 ALIGN_3 .L21: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 
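/* Descriptive note (editor-added, not in upstream source): on this path X is not      */
/* 16-byte aligned, so each 128-bit vector of X is assembled from two 64-bit halves    */
/* (movlps/movhps); Y was aligned by the one-element scalar peel above, so it can be   */
/* used directly as an aligned mulpd memory operand.                                   */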
movlps -8 * SIZE(X), %xmm4 movhps -7 * SIZE(X), %xmm4 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 movlps -6 * SIZE(X), %xmm5 movhps -5 * SIZE(X), %xmm5 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif mulpd -12 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 movlps -4 * SIZE(X), %xmm6 movhps -3 * SIZE(X), %xmm6 mulpd -10 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 movlps -2 * SIZE(X), %xmm7 movhps -1 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif mulpd -8 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 movlps 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 mulpd -6 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 movlps 2 * SIZE(X), %xmm5 movhps 3 * SIZE(X), %xmm5 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif mulpd -4 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 movlps 4 * SIZE(X), %xmm6 movhps 5 * SIZE(X), %xmm6 mulpd -2 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 movlps 6 * SIZE(X), %xmm7 movhps 7 * SIZE(X), %xmm7 subl $-16 * SIZE, X subl $-16 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L22: mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 movlps -8 * SIZE(X), %xmm4 movhps -7 * SIZE(X), %xmm4 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 movlps -6 * SIZE(X), %xmm5 movhps -5 * SIZE(X), %xmm5 mulpd -12 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 movlps -4 * SIZE(X), %xmm6 movhps -3 * SIZE(X), %xmm6 mulpd -10 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 movlps -2 * SIZE(X), %xmm7 movhps -1 * SIZE(X), %xmm7 mulpd -8 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 mulpd -6 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 mulpd -4 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 mulpd -2 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 subl $-16 * SIZE, X subl $-16 * SIZE, Y ALIGN_3 .L24: testl $15, N jle .L999 testl $8, N jle .L25 movlps -16 * SIZE(X), %xmm4 movhps -15 * SIZE(X), %xmm4 movlps -14 * SIZE(X), %xmm5 movhps -13 * SIZE(X), %xmm5 movlps -12 * SIZE(X), %xmm6 movhps -11 * SIZE(X), %xmm6 movlps -10 * SIZE(X), %xmm7 movhps -9 * SIZE(X), %xmm7 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 mulpd -12 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 mulpd -10 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L25: testl $4, N jle .L26 movlps -16 * SIZE(X), %xmm4 movhps -15 * SIZE(X), %xmm4 movlps -14 * SIZE(X), %xmm5 movhps -13 * SIZE(X), %xmm5 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L26: testl $2, N jle .L27 movlps -16 * SIZE(X), %xmm4 movhps -15 * SIZE(X), %xmm4 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L27: testl $1, N jle .L999 movsd -16 * SIZE(X), %xmm4 mulsd -16 * SIZE(Y), %xmm4 addsd %xmm4, %xmm0 jmp .L999 ALIGN_3 #endif .L50: movl N, %eax sarl $2, %eax jle .L55 ALIGN_3 .L53: movsd (X), %xmm4 addl INCX, X mulsd (Y), %xmm4 addl INCY, Y movsd (X), %xmm5 addl INCX, X mulsd (Y), %xmm5 addl INCY, Y movsd (X), %xmm6 addl INCX, X mulsd (Y), %xmm6 addl INCY, Y movsd (X), %xmm7 addl INCX, X mulsd (Y), %xmm7 addl INCY, Y addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 addsd %xmm6, %xmm2 addsd %xmm7, %xmm3 decl %eax jg .L53 ALIGN_3 .L55: movl N, %eax andl $3, %eax jle .L999 ALIGN_3 .L56: movsd (X), %xmm4 addl INCX, X mulsd (Y), %xmm4 addl INCY, Y addsd %xmm4, %xmm0 decl %eax jg .L56 ALIGN_3 .L999: addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm2, %xmm0 #ifndef HAVE_SSE3 pshufd $0xe, %xmm0, %xmm1 addsd %xmm1, %xmm0 #else haddpd %xmm0, %xmm0 #endif movlps %xmm0, STACK_N fldl STACK_N popl %ebx popl %esi popl 
%edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/dot_sse2_opteron.S000066400000000000000000000174541313527062700205000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) #define N %ecx #define X %esi #define INCX %ebx #define Y %edi #define INCY %edx #define PREFETCHSIZE 84 PROLOGUE pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_N, N movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY #ifdef F_INTERFACE movl (N), N # N movl (INCX),INCX # INCX movl (INCY),INCY # INCY #endif leal (, INCX, SIZE), INCX leal (, INCY, SIZE), INCY pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 cmpl $0, N jle .L999 cmpl $SIZE, INCX jne .L50 cmpl $SIZE, INCY jne .L50 testl $SIZE, Y je .L10 movsd 0 * SIZE(X), %xmm0 mulsd 0 * SIZE(Y), %xmm0 addl $1 * SIZE, X addl $1 * SIZE, Y decl N ALIGN_2 .L10: movl N, %eax sarl $4, %eax jle .L24 movlpd 0 * SIZE(X), %xmm4 movhpd 1 * SIZE(X), %xmm4 movlpd 2 * SIZE(X), %xmm5 movhpd 3 * SIZE(X), %xmm5 movlpd 4 * SIZE(X), %xmm6 movhpd 5 * SIZE(X), %xmm6 movlpd 6 * SIZE(X), %xmm7 movhpd 7 * SIZE(X), %xmm7 mulpd 0 * SIZE(Y), %xmm4 mulpd 2 * SIZE(Y), %xmm5 mulpd 4 * SIZE(Y), %xmm6 mulpd 6 * SIZE(Y), %xmm7 decl %eax jle .L22 ALIGN_3 .L21: prefetch (PREFETCHSIZE + 0) * SIZE(Y) addpd %xmm4, %xmm0 movlpd 8 * SIZE(X), %xmm4 movhpd 9 * SIZE(X), %xmm4 addpd %xmm5, %xmm1 movlpd 10 * SIZE(X), %xmm5 movhpd 11 * SIZE(X), %xmm5 addpd %xmm6, %xmm2 movlpd 12 * SIZE(X), %xmm6 movhpd 13 * SIZE(X), %xmm6 addpd %xmm7, %xmm3 movlpd 14 * SIZE(X), %xmm7 movhpd 15 * SIZE(X), %xmm7 mulpd 8 * SIZE(Y), %xmm4 mulpd 10 * SIZE(Y), %xmm5 mulpd 12 * SIZE(Y), %xmm6 mulpd 14 * SIZE(Y), %xmm7 prefetch (PREFETCHSIZE + 8) * SIZE(Y) addpd %xmm4, %xmm0 movlpd 16 * SIZE(X), %xmm4 movhpd 17 * SIZE(X), %xmm4 addpd %xmm5, %xmm1 movlpd 18 * SIZE(X), %xmm5 movhpd 19 * SIZE(X), %xmm5 addpd %xmm6, %xmm2 movlpd 20 * SIZE(X), %xmm6 movhpd 21 * SIZE(X), %xmm6 addpd %xmm7, %xmm3 movlpd 22 * SIZE(X), %xmm7 movhpd 23 * SIZE(X), %xmm7 mulpd 16 * SIZE(Y), %xmm4 mulpd 18 * SIZE(Y), %xmm5 mulpd 20 * SIZE(Y), %xmm6 mulpd 22 * SIZE(Y), %xmm7 addl $16 * SIZE, X addl $16 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L22: addpd %xmm4, %xmm0 movlpd 8 * SIZE(X), %xmm4 movhpd 9 * SIZE(X), %xmm4 addpd %xmm5, %xmm1 movlpd 10 * SIZE(X), %xmm5 movhpd 11 * SIZE(X), %xmm5 addpd %xmm6, %xmm2 movlpd 12 * SIZE(X), %xmm6 movhpd 13 * SIZE(X), %xmm6 addpd %xmm7, %xmm3 movlpd 14 * SIZE(X), %xmm7 movhpd 15 * SIZE(X), %xmm7 mulpd 8 * SIZE(Y), %xmm4 mulpd 10 * SIZE(Y), %xmm5 mulpd 12 * SIZE(Y), %xmm6 mulpd 14 * SIZE(Y), %xmm7 addpd %xmm4, %xmm0 addpd %xmm5, %xmm1 addpd %xmm6, %xmm2 addpd %xmm7, %xmm3 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L24: testl $15, N jle .L999 testl $8, N jle .L25 movlpd 0 * SIZE(X), %xmm4 movhpd 1 * SIZE(X), %xmm4 movlpd 2 * SIZE(X), %xmm5 movhpd 3 * SIZE(X), %xmm5 movlpd 4 * SIZE(X), %xmm6 movhpd 5 * SIZE(X), %xmm6 movlpd 6 * SIZE(X), %xmm7 movhpd 7 * SIZE(X), %xmm7 mulpd 0 * SIZE(Y), %xmm4 mulpd 2 * SIZE(Y), %xmm5 mulpd 4 * SIZE(Y), %xmm6 mulpd 6 * SIZE(Y), %xmm7 addpd %xmm4, %xmm0 addpd %xmm5, %xmm1 addpd %xmm6, %xmm2 addpd %xmm7, %xmm3 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L25: testl $4, N jle .L26 movlpd 0 * SIZE(X), %xmm4 movhpd 1 * SIZE(X), %xmm4 movlpd 2 * SIZE(X), %xmm5 movhpd 3 * SIZE(X), %xmm5 mulpd 0 * SIZE(Y), %xmm4 mulpd 2 * SIZE(Y), %xmm5 addpd %xmm4, %xmm0 addpd %xmm5, %xmm1 addl $4 * SIZE, 
X addl $4 * SIZE, Y ALIGN_3 .L26: testl $2, N jle .L27 movlpd 0 * SIZE(X), %xmm4 movhpd 1 * SIZE(X), %xmm4 mulpd 0 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L27: testl $1, N jle .L999 movsd 0 * SIZE(X), %xmm4 mulsd 0 * SIZE(Y), %xmm4 addsd %xmm4, %xmm0 jmp .L999 ALIGN_3 .L50: #ifdef F_INTERFACE testl INCX, INCX jge .L51 movl N, %eax decl %eax imull INCX, %eax subl %eax, X ALIGN_3 .L51: testl INCY, INCY jge .L52 movl N, %eax decl %eax imull INCY, %eax subl %eax, Y ALIGN_3 .L52: #endif movl N, %eax sarl $2, %eax jle .L55 ALIGN_3 .L53: movsd 0 * SIZE(X), %xmm4 addl INCX, X mulsd 0 * SIZE(Y), %xmm4 addl INCY, Y movsd 0 * SIZE(X), %xmm5 addl INCX, X mulsd 0 * SIZE(Y), %xmm5 addl INCY, Y movsd 0 * SIZE(X), %xmm6 addl INCX, X mulsd 0 * SIZE(Y), %xmm6 addl INCY, Y movsd 0 * SIZE(X), %xmm7 addl INCX, X mulsd 0 * SIZE(Y), %xmm7 addl INCY, Y addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 addsd %xmm6, %xmm2 addsd %xmm7, %xmm3 decl %eax jg .L53 ALIGN_3 .L55: movl N, %eax andl $3, %eax jle .L999 ALIGN_3 .L56: movsd 0 * SIZE(X), %xmm4 addl INCX, X mulsd 0 * SIZE(Y), %xmm4 addl INCY, Y addsd %xmm4, %xmm0 decl %eax jg .L56 ALIGN_3 .L999: addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm2, %xmm0 #if !defined(HAVE_SSE3) || defined(__INTERIX) movapd %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 addsd %xmm1, %xmm0 #else haddpd %xmm0, %xmm0 #endif movsd %xmm0, STACK_N fldl STACK_N popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/dot_sse_opteron.S000066400000000000000000000205411313527062700204050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) #define N %ecx #define X %esi #define INCX %ebx #define Y %edi #define INCY %edx #define PREFETCHSIZE 84 PROLOGUE pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_N, N movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY #ifdef F_INTERFACE movl (N), N # N movl (INCX),INCX # INCX movl (INCY),INCY # INCY #endif leal (, INCX, SIZE), INCX leal (, INCY, SIZE), INCY pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 cmpl $0, N jle .L999 cmpl $SIZE, INCX jne .L50 cmpl $SIZE, INCY jne .L50 cmpl $3, N jle .L27 testl $SIZE, Y je .L05 movss 0 * SIZE(X), %xmm0 mulss 0 * SIZE(Y), %xmm0 addl $1 * SIZE, X addl $1 * SIZE, Y decl N ALIGN_2 .L05: testl $2 * SIZE, Y je .L10 movss 0 * SIZE(X), %xmm4 movss 1 * SIZE(X), %xmm5 mulss 0 * SIZE(Y), %xmm4 mulss 1 * SIZE(Y), %xmm5 addss %xmm4, %xmm1 addss %xmm5, %xmm2 addl $2 * SIZE, X addl $2 * SIZE, Y subl $2, N jle .L999 ALIGN_2 .L10: movl N, %eax sarl $5, %eax jle .L24 movlps 0 * SIZE(X), %xmm4 movhps 2 * SIZE(X), %xmm4 movlps 4 * SIZE(X), %xmm5 movhps 6 * SIZE(X), %xmm5 movlps 8 * SIZE(X), %xmm6 movhps 10 * SIZE(X), %xmm6 movlps 12 * SIZE(X), %xmm7 movhps 14 * SIZE(X), %xmm7 mulps 0 * SIZE(Y), %xmm4 mulps 4 * SIZE(Y), %xmm5 mulps 8 * SIZE(Y), %xmm6 mulps 12 * SIZE(Y), %xmm7 decl %eax jle .L22 ALIGN_3 .L21: prefetch (PREFETCHSIZE + 0) * SIZE(Y) addps %xmm4, %xmm0 movlps 16 * SIZE(X), %xmm4 movhps 18 * SIZE(X), %xmm4 addps %xmm5, %xmm1 movlps 20 * SIZE(X), %xmm5 movhps 22 * SIZE(X), %xmm5 addps %xmm6, %xmm2 movlps 24 * SIZE(X), %xmm6 movhps 26 * SIZE(X), %xmm6 addps %xmm7, %xmm3 movlps 28 * SIZE(X), %xmm7 movhps 30 * SIZE(X), %xmm7 mulps 16 * SIZE(Y), %xmm4 mulps 20 * SIZE(Y), %xmm5 mulps 24 * SIZE(Y), %xmm6 mulps 28 * SIZE(Y), %xmm7 prefetch (PREFETCHSIZE + 16) * SIZE(Y) addps %xmm4, %xmm0 movlps 32 * SIZE(X), %xmm4 movhps 34 * SIZE(X), %xmm4 addps %xmm5, %xmm1 movlps 36 * SIZE(X), %xmm5 movhps 38 * SIZE(X), %xmm5 addps %xmm6, %xmm2 movlps 40 * SIZE(X), %xmm6 movhps 42 * SIZE(X), %xmm6 addps %xmm7, %xmm3 movlps 44 * SIZE(X), %xmm7 movhps 46 * SIZE(X), %xmm7 mulps 32 * SIZE(Y), %xmm4 mulps 36 * SIZE(Y), %xmm5 mulps 40 * SIZE(Y), %xmm6 mulps 44 * SIZE(Y), %xmm7 addl $32 * SIZE, X addl $32 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L22: addps %xmm4, %xmm0 movlps 16 * SIZE(X), %xmm4 movhps 18 * SIZE(X), %xmm4 addps %xmm5, %xmm1 movlps 20 * SIZE(X), %xmm5 movhps 22 * SIZE(X), %xmm5 addps %xmm6, %xmm2 movlps 24 * SIZE(X), %xmm6 movhps 26 * SIZE(X), %xmm6 addps %xmm7, %xmm3 movlps 28 * SIZE(X), %xmm7 movhps 30 * SIZE(X), %xmm7 mulps 16 * SIZE(Y), %xmm4 mulps 20 * SIZE(Y), %xmm5 mulps 24 * SIZE(Y), %xmm6 mulps 28 * SIZE(Y), %xmm7 addps %xmm4, %xmm0 addps %xmm5, %xmm1 addps %xmm6, %xmm2 addps %xmm7, %xmm3 addl $32 * SIZE, X addl $32 * SIZE, Y ALIGN_3 .L24: testl $31, N jle .L999 testl $16, N jle .L25 movlps 0 * SIZE(X), %xmm4 movhps 2 * SIZE(X), %xmm4 movlps 4 * SIZE(X), %xmm5 movhps 6 * SIZE(X), %xmm5 movlps 8 * SIZE(X), %xmm6 movhps 10 * SIZE(X), %xmm6 movlps 12 * SIZE(X), %xmm7 movhps 14 * SIZE(X), %xmm7 mulps 0 * SIZE(Y), %xmm4 mulps 4 * SIZE(Y), %xmm5 mulps 8 * SIZE(Y), %xmm6 mulps 12 * SIZE(Y), %xmm7 addps %xmm4, %xmm0 addps %xmm5, %xmm1 addps %xmm6, %xmm2 addps %xmm7, %xmm3 addl 
$16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L25: testl $8, N jle .L26 movlps 0 * SIZE(X), %xmm4 movhps 2 * SIZE(X), %xmm4 movlps 4 * SIZE(X), %xmm5 movhps 6 * SIZE(X), %xmm5 mulps 0 * SIZE(Y), %xmm4 mulps 4 * SIZE(Y), %xmm5 addps %xmm4, %xmm0 addps %xmm5, %xmm1 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L26: testl $4, N jle .L27 movlps 0 * SIZE(X), %xmm4 movhps 2 * SIZE(X), %xmm4 mulps 0 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L27: testl $2, N jle .L28 movss 0 * SIZE(X), %xmm4 movss 1 * SIZE(X), %xmm5 mulss 0 * SIZE(Y), %xmm4 mulss 1 * SIZE(Y), %xmm5 addss %xmm4, %xmm0 addss %xmm5, %xmm1 addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L28: testl $1, N jle .L999 movss 0 * SIZE(X), %xmm4 mulss 0 * SIZE(Y), %xmm4 addss %xmm4, %xmm0 jmp .L999 ALIGN_3 .L50: #ifdef F_INTERFACE testl INCX, INCX jge .L51 movl N, %eax decl %eax imull INCX, %eax subl %eax, X ALIGN_3 .L51: testl INCY, INCY jge .L52 movl N, %eax decl %eax imull INCY, %eax subl %eax, Y ALIGN_3 .L52: #endif movl N, %eax sarl $2, %eax jle .L55 ALIGN_3 .L53: movss 0 * SIZE(X), %xmm4 addl INCX, X mulss 0 * SIZE(Y), %xmm4 addl INCY, Y movss 0 * SIZE(X), %xmm5 addl INCX, X mulss 0 * SIZE(Y), %xmm5 addl INCY, Y movss 0 * SIZE(X), %xmm6 addl INCX, X mulss 0 * SIZE(Y), %xmm6 addl INCY, Y movss 0 * SIZE(X), %xmm7 addl INCX, X mulss 0 * SIZE(Y), %xmm7 addl INCY, Y addss %xmm4, %xmm0 addss %xmm5, %xmm1 addss %xmm6, %xmm2 addss %xmm7, %xmm3 decl %eax jg .L53 ALIGN_3 .L55: movl N, %eax andl $3, %eax jle .L999 ALIGN_3 .L56: movss 0 * SIZE(X), %xmm4 addl INCX, X mulss 0 * SIZE(Y), %xmm4 addl INCY, Y addss %xmm4, %xmm0 decl %eax jg .L56 ALIGN_3 .L999: addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm2, %xmm0 #if !defined(HAVE_SSE3) || defined(__INTERIX) movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 pshufd $1, %xmm0, %xmm1 addss %xmm1, %xmm0 #else haddps %xmm0, %xmm0 haddps %xmm0, %xmm0 #endif movss %xmm0, STACK_N flds STACK_N popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_beta.S000066400000000000000000000121741313527062700171220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #ifdef DOUBLE #define BETA 16 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define LDC 44 + STACK + ARGS(%esp) #else #define BETA 16 + STACK + ARGS(%esp) #define C 36 + STACK + ARGS(%esp) #define LDC 40 + STACK + ARGS(%esp) #endif PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl M, %esi # m movl N, %ecx # n FLD BETA # beta movl C, %edi # C movl LDC, %ebp # ldc testl %esi, %esi # if n <= 0 goto End jle .L999 testl %ecx, %ecx # if m <= 0 goto End jle .L999 ftst fnstsw %ax andb $68, %ah je .L201 ALIGN_4 .L101: movl %edi, %eax # c_offset = c leal (%edi, %ebp, SIZE), %edi # c += ldc movl %esi, %edx sarl $3, %edx jle .L103 ALIGN_4 .L102: #ifdef HAS_PREFETCH #ifndef ATHLON prefetchnta 12 * SIZE(%eax) prefetchnta 16 * SIZE(%eax) #else prefetchw 32 * SIZE(%eax) #endif #endif FSTU 0 * SIZE(%eax) FSTU 1 * SIZE(%eax) FSTU 2 * SIZE(%eax) FSTU 3 * SIZE(%eax) FSTU 4 * SIZE(%eax) FSTU 5 * SIZE(%eax) FSTU 6 * SIZE(%eax) FSTU 7 * SIZE(%eax) addl $8 * SIZE, %eax decl %edx jg .L102 ALIGN_4 .L103: movl %esi, %edx andl $7, %edx jle .L105 ALIGN_4 .L104: FSTU 0 * SIZE(%eax) addl $SIZE, %eax decl %edx jg .L104 ALIGN_4 .L105: decl %ecx jg .L101 jmp .L999 ALIGN_3 .L201: movl %edi, %eax # c_offset = c leal (%edi, %ebp, SIZE), %edi # c += ldc movl %esi, %edx sarl $3, %edx jle .L203 ALIGN_4 .L202: #ifdef HAS_PREFETCH #ifndef ATHLON prefetchnta 16 * SIZE(%eax) prefetchnta 20 * SIZE(%eax) #else prefetchw 32 * SIZE(%eax) #endif #endif FLD 0 * SIZE(%eax) fmul %st(1),%st FST 0 * SIZE(%eax) FLD 1 * SIZE(%eax) fmul %st(1),%st FST 1 * SIZE(%eax) FLD 2 * SIZE(%eax) fmul %st(1),%st FST 2 * SIZE(%eax) FLD 3 * SIZE(%eax) fmul %st(1),%st FST 3 * SIZE(%eax) FLD 4 * SIZE(%eax) fmul %st(1),%st FST 4 * SIZE(%eax) FLD 5 * SIZE(%eax) fmul %st(1),%st FST 5 * SIZE(%eax) FLD 6 * SIZE(%eax) fmul %st(1),%st FST 6 * SIZE(%eax) FLD 7 * SIZE(%eax) fmul %st(1),%st FST 7 * SIZE(%eax) addl $8 * SIZE, %eax decl %edx jg .L202 ALIGN_4 .L203: movl %esi, %edx andl $7, %edx jle .L205 ALIGN_4 .L204: FLD 0 * SIZE(%eax) fmul %st(1), %st FST 0 * SIZE(%eax) addl $SIZE, %eax decl %edx jg .L204 ALIGN_4 .L205: decl %ecx jg .L201 ALIGN_3 .L999: #ifndef C_SUN ffreep %st(0) #else .byte 0xdf .byte 0xc0 #endif xorl %eax,%eax popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_kernel_1x4.S000066400000000000000000000443421313527062700201650ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of 
Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define J 0 + STACK(%esp) #define I 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #ifdef DOUBLE #define STACK_A 24 + STACK + ARGS(%esp) #define STACK_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define STACK_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #else #define STACK_A 20 + STACK + ARGS(%esp) #define STACK_B 24 + STACK + ARGS(%esp) #define C 28 + STACK + ARGS(%esp) #define STACK_LDC 32 + STACK + ARGS(%esp) #define OFFSET 36 + STACK + ARGS(%esp) #endif #define A %edx #define B %ecx #define BB %ebx #define LDC %ebp #define BX %esi #define PREFETCHSIZE (8 * 5 + 4) #define AOFFSET 1 #define BOFFSET -7 #ifdef HAVE_3DNOW #define PREFETCH prefetch #else #define PREFETCH prefetcht0 #endif #define KERNEL \ PREFETCH PREFETCHSIZE * SIZE + AOFFSET(A, %eax, 1);\ fmul %st(1), %st;\ faddp %st, %st(4);\ FLD -15 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(5);\ FLD -14 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(6);\ FMUL -13 * SIZE + BOFFSET(B, %eax, 4);\ faddp %st, %st(6);\ FLD -15 * SIZE + AOFFSET(A, %eax, 1);\ FLD -12 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(4);\ FLD -11 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(5);\ FLD -10 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(6);\ FMUL -9 * SIZE + BOFFSET(B, %eax, 4);\ faddp %st, %st(6);\ FLD -14 * SIZE + AOFFSET(A, %eax, 1);\ FLD -8 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(4);\ FLD -7 * SIZE + BOFFSET(B, 
%eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(5);\ FLD -6 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(6);\ FMUL -5 * SIZE + BOFFSET(B, %eax, 4);\ faddp %st, %st(6);\ FLD -13 * SIZE + AOFFSET(A, %eax, 1);\ FLD -4 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(4);\ FLD -3 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(5);\ FLD -2 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(6);\ FMUL -1 * SIZE + BOFFSET(B, %eax, 4);\ faddp %st, %st(6);\ FLD -12 * SIZE + AOFFSET(A, %eax, 1);\ FLD 0 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(4);\ FLD 1 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(5);\ FLD 2 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(6);\ FMUL 3 * SIZE + BOFFSET(B, %eax, 4);\ faddp %st, %st(6);\ FLD -11 * SIZE + AOFFSET(A, %eax, 1);\ FLD 4 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(4);\ FLD 5 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(5);\ FLD 6 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(6);\ FMUL 7 * SIZE + BOFFSET(B, %eax, 4);\ faddp %st, %st(6);\ FLD -10 * SIZE + AOFFSET(A, %eax, 1);\ FLD 8 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(4);\ FLD 9 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(5);\ FLD 10 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(6);\ FMUL 11 * SIZE + BOFFSET(B, %eax, 4);\ faddp %st, %st(6);\ FLD -9 * SIZE + AOFFSET(A, %eax, 1);\ FLD 12 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(4);\ FLD 13 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(5);\ FLD 14 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(6);\ FMUL 15 * SIZE + BOFFSET(B, %eax, 4);\ faddp %st, %st(6);\ FLD 8 * SIZE + AOFFSET(A, %eax, 1);\ fxch %st(1);\ FLD 16 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(4);\ FLD -15 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ PREFETCH (PREFETCHSIZE + 8) * SIZE + AOFFSET(A, %eax, 1);\ faddp %st, %st(5);\ FLD -14 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(6);\ FMUL -13 * SIZE + BOFFSET(BB, %eax, 4);\ faddp %st, %st(6);\ FLD -7 * SIZE + AOFFSET(A, %eax, 1);\ FLD -12 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(4);\ FLD -11 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(5);\ FLD -10 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(6);\ FMUL -9 * SIZE + BOFFSET(BB, %eax, 4);\ faddp %st, %st(6);\ FLD -6 * SIZE + AOFFSET(A, %eax, 1);\ FLD -8 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(4);\ FLD -7 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(5);\ FLD -6 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(6);\ FMUL -5 * SIZE + BOFFSET(BB, %eax, 4);\ faddp %st, %st(6);\ FLD -5 * SIZE + AOFFSET(A, %eax, 1);\ FLD -4 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(4);\ FLD -3 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(5);\ FLD -2 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(6);\ FMUL -1 * SIZE + BOFFSET(BB, %eax, 4);\ faddp %st, %st(6);\ FLD -4 * SIZE + AOFFSET(A, %eax, 1);\ FLD 0 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(4);\ FLD 1 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(5);\ FLD 2 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(6);\ FMUL 3 * SIZE + BOFFSET(BB, %eax, 4);\ faddp %st, %st(6);\ 
FLD -3 * SIZE + AOFFSET(A, %eax, 1);\ FLD 4 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(4);\ FLD 5 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(5);\ FLD 6 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(6);\ FMUL 7 * SIZE + BOFFSET(BB, %eax, 4);\ faddp %st, %st(6);\ FLD -2 * SIZE + AOFFSET(A, %eax, 1);\ FLD 8 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(4);\ FLD 9 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(5);\ FLD 10 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(6);\ FMUL 11 * SIZE + BOFFSET(BB, %eax, 4);\ faddp %st, %st(6);\ FLD -1 * SIZE + AOFFSET(A, %eax, 1);\ FLD 12 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(4);\ FLD 13 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(5);\ FLD 14 * SIZE + BOFFSET(BB, %eax, 4);\ fmul %st(1), %st;\ faddp %st, %st(6);\ FMUL 15 * SIZE + BOFFSET(BB, %eax, 4);\ faddp %st, %st(6);\ FLD 16 * SIZE + AOFFSET(A, %eax, 1);\ fxch %st(2);\ FLD 16 * SIZE + BOFFSET(BB, %eax, 4);\ subl $-16 * SIZE, %eax /* A hint of scheduling is received from following URL http://www.netlib.org/atlas/atlas-comm/msg00260.html */ PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(TRMMKERNEL) && !defined(LEFT) movl OFFSET, %eax negl %eax movl %eax, KK #endif movl STACK_LDC, LDC leal (, LDC, SIZE), LDC subl $(AOFFSET - 16 * SIZE), STACK_A subl $(BOFFSET - 16 * SIZE), STACK_B movl M, %eax testl %eax, %eax jle .L999 movl N, %eax testl %eax, %eax jle .L999 movl K, %eax testl %eax, %eax jle .L999 movl N, %eax sarl $2, %eax movl %eax, J je .L20 ALIGN_3 .L11: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl STACK_A, A movl STACK_B, B movl C, %edi movl K, BX sall $BASE_SHIFT + 2, BX addl B, BX movl M, %eax movl %eax, I ALIGN_3 .L14: prefetchnta -16 * SIZE + BOFFSET(BX) subl $-8 * SIZE, BX movl STACK_B, B #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #else movl KK, %eax leal (, %eax, SIZE), %eax leal (A, %eax, 1), A leal (B, %eax, 4), B #endif leal (%edi, LDC, 2), %eax fldz fldz fldz fldz FLD 0 * SIZE + AOFFSET(A) FLD -8 * SIZE + AOFFSET(A) FLD -16 * SIZE + AOFFSET(A) FLD -16 * SIZE + BOFFSET(B) #ifdef HAVE_3DNOW prefetchw 1 * SIZE(%edi) prefetchw 2 * SIZE(%edi, LDC) prefetchw 1 * SIZE(%eax) prefetchw 2 * SIZE(%eax, LDC) #elif defined(HAVE_SSE) prefetcht0 1 * SIZE(%edi) prefetcht0 2 * SIZE(%edi, LDC) prefetcht0 1 * SIZE(%eax) prefetcht0 2 * SIZE(%eax, LDC) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $4, %eax #endif movl %eax, KKK #endif andl $-16, %eax leal (, %eax, SIZE), %eax leal (A, %eax, 1), A leal 32 * SIZE(B, %eax, 4), BB leal (B, %eax, 4), B negl %eax NOBRANCH je .L16 ALIGN_4 .L15: KERNEL jge .L16 KERNEL jge .L16 KERNEL jge .L16 KERNEL jl .L15 ALIGN_4 .L16: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif and $15, %eax je .L19 ALIGN_4 .L17: fmul %st(1), %st faddp %st, %st(4) FLD -15 * SIZE + BOFFSET(B) fmul %st(1), %st faddp %st, %st(5) FLD -14 * SIZE + BOFFSET(B) fmul %st(1), %st faddp %st, %st(6) FMUL -13 * SIZE + BOFFSET(B) faddp %st, %st(6) FLD -15 * SIZE + AOFFSET(A) FLD -12 * SIZE + BOFFSET(B) addl $1 * SIZE,A addl $4 * SIZE,B decl 
%eax jne .L17 ALIGN_4 .L19: ffreep %st(0) ffreep %st(0) ffreep %st(0) ffreep %st(0) FLD ALPHA fmul %st, %st(1) fmul %st, %st(2) fmul %st, %st(3) fmulp %st, %st(4) leal (%edi, LDC, 2), %eax #ifndef TRMMKERNEL FADD (%edi) FST (%edi) FADD (%edi,LDC) FST (%edi,LDC) FADD (%eax) FST (%eax) FADD (%eax,LDC) FST (%eax,LDC) #else FST (%edi) FST (%edi,LDC) FST (%eax) FST (%eax,LDC) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (A, %eax, 1), A leal (B, %eax, 4), B #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif addl $1 * SIZE, %edi decl I jne .L14 #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leal (, LDC, 4), %eax addl %eax, C movl B, STACK_B decl J jne .L11 ALIGN_4 .L20: movl N, %eax andl $2, %eax je .L30 ALIGN_3 .L21: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl STACK_A, A movl STACK_B, B movl C, %edi movl M, %eax movl %eax, I ALIGN_3 .L24: movl STACK_B, B #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #else movl KK, %eax leal (, %eax, SIZE), %eax leal (A, %eax, 1), A leal (B, %eax, 2), B #endif fldz fldz fldz fldz FLD -16 * SIZE + AOFFSET(A) FLD -16 * SIZE + BOFFSET(B) prefetchw 1 * SIZE(%edi) prefetchw 1 * SIZE(%edi, LDC) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L26 ALIGN_3 .L25: fmul %st(1), %st faddp %st, %st(2) FMUL -15 * SIZE + BOFFSET(B) faddp %st, %st(2) FLD -15 * SIZE + AOFFSET(A) FLD -14 * SIZE + BOFFSET(B) fmul %st(1), %st faddp %st, %st(4) FMUL -13 * SIZE + BOFFSET(B) faddp %st, %st(4) FLD -14 * SIZE + AOFFSET(A) FLD -12 * SIZE + BOFFSET(B) fmul %st(1), %st faddp %st, %st(2) FMUL -11 * SIZE + BOFFSET(B) faddp %st, %st(2) FLD -13 * SIZE + AOFFSET(A) FLD -10 * SIZE + BOFFSET(B) fmul %st(1), %st faddp %st, %st(4) FMUL -9 * SIZE + BOFFSET(B) faddp %st, %st(4) FLD -12 * SIZE + AOFFSET(A) FLD -8 * SIZE + BOFFSET(B) fmul %st(1), %st faddp %st, %st(2) FMUL -7 * SIZE + BOFFSET(B) faddp %st, %st(2) FLD -11 * SIZE + AOFFSET(A) FLD -6 * SIZE + BOFFSET(B) fmul %st(1), %st faddp %st, %st(4) FMUL -5 * SIZE + BOFFSET(B) faddp %st, %st(4) FLD -10 * SIZE + AOFFSET(A) FLD -4 * SIZE + BOFFSET(B) fmul %st(1), %st faddp %st, %st(2) FMUL -3 * SIZE + BOFFSET(B) faddp %st, %st(2) FLD -9 * SIZE + AOFFSET(A) FLD -2 * SIZE + BOFFSET(B) fmul %st(1), %st faddp %st, %st(4) FMUL -1 * SIZE + BOFFSET(B) faddp %st, %st(4) FLD -8 * SIZE + AOFFSET(A) FLD 0 * SIZE + BOFFSET(B) addl $ 8 * SIZE, A subl $-16 * SIZE, B decl %eax jne .L25 ALIGN_4 .L26: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif and $7, %eax je .L29 ALIGN_4 .L27: fmul %st(1), %st faddp %st, %st(2) FMUL -15 * SIZE + BOFFSET(B) faddp %st, %st(2) FLD -15 * SIZE + AOFFSET(A) FLD -14 * SIZE + BOFFSET(B) addl $1 * SIZE,A addl $2 * SIZE,B decl %eax jne .L27 ALIGN_4 .L29: ffreep %st(0) ffreep %st(0) faddp %st, %st(2) faddp %st, %st(2) FLD ALPHA fmul %st, %st(1) fmulp %st, %st(2) #ifndef TRMMKERNEL FADD (%edi) FST (%edi) FADD (%edi,LDC) FST (%edi,LDC) #else FST (%edi) FST (%edi,LDC) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 
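/* Descriptive note (editor-added, not in upstream source): TRMM-only bookkeeping for  */
/* this 1x2 tile. The code below advances A by (K - KKK) elements and B by             */
/* 2*(K - KKK) elements, resynchronizing the packed-panel pointers before the next     */
/* tile is processed.                                                                  */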
movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (A, %eax, 1), A leal (B, %eax, 2), B #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif addl $1 * SIZE, %edi decl I jne .L24 #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax addl %eax, C movl B, STACK_B ALIGN_4 .L30: movl N, %eax andl $1, %eax je .L999 ALIGN_3 .L31: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl STACK_A, A movl STACK_B, B movl C, %edi movl M, %eax movl %eax, I ALIGN_3 .L34: movl STACK_B, B #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #else movl KK, %eax leal (, %eax, SIZE), %eax leal (A, %eax, 1), A leal (B, %eax, 1), B #endif fldz fldz fldz fldz prefetchw 1 * SIZE(%edi) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L36 ALIGN_3 .L35: FLD -16 * SIZE + AOFFSET(A) FMUL -16 * SIZE + BOFFSET(B) faddp %st, %st(1) FLD -15 * SIZE + AOFFSET(A) FMUL -15 * SIZE + BOFFSET(B) faddp %st, %st(2) FLD -14 * SIZE + AOFFSET(A) FMUL -14 * SIZE + BOFFSET(B) faddp %st, %st(3) FLD -13 * SIZE + AOFFSET(A) FMUL -13 * SIZE + BOFFSET(B) faddp %st, %st(4) FLD -12 * SIZE + AOFFSET(A) FMUL -12 * SIZE + BOFFSET(B) faddp %st, %st(1) FLD -11 * SIZE + AOFFSET(A) FMUL -11 * SIZE + BOFFSET(B) faddp %st, %st(2) FLD -10 * SIZE + AOFFSET(A) FMUL -10 * SIZE + BOFFSET(B) faddp %st, %st(3) FLD -9 * SIZE + AOFFSET(A) FMUL -9 * SIZE + BOFFSET(B) faddp %st, %st(4) addl $8 * SIZE, A addl $8 * SIZE, B decl %eax jne .L35 ALIGN_4 .L36: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif and $7, %eax je .L39 ALIGN_4 .L37: FLD -16 * SIZE + AOFFSET(A) FMUL -16 * SIZE + BOFFSET(B) faddp %st, %st(1) addl $1 * SIZE,A addl $1 * SIZE,B decl %eax jne .L37 ALIGN_4 .L39: faddp %st, %st(2) faddp %st, %st(2) faddp %st, %st(1) FMUL ALPHA #ifndef TRMMKERNEL FADD (%edi) FST (%edi) #else FST (%edi) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (A, %eax, 1), A leal (B, %eax, 1), B #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif addl $1 * SIZE, %edi decl I jne .L34 #if defined(TRMMKERNEL) && !defined(LEFT) addl $1, KK #endif addl LDC, C movl B, STACK_B ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_kernel_2x2.S000066400000000000000000000335001313527062700201560ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define J 0 + STACK(%esp) #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #ifdef DOUBLE #define A 24 + STACK + ARGS(%esp) #define B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #else #define A 20 + STACK + ARGS(%esp) #define B 24 + STACK + ARGS(%esp) #define C 28 + STACK + ARGS(%esp) #define LDC 32 + STACK + ARGS(%esp) #define OFFSET 36 + STACK + ARGS(%esp) #endif #define PREFETCH_OFFSET 48 #if defined(PENTIUM3) || defined(PENTIUMM) #define REP rep #else #define REP rep #endif PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(TRMMKERNEL) && !defined(LEFT) movl OFFSET, %eax negl %eax movl %eax, KK #endif movl N, %eax # j = (n >> 1) # MEMORY movl LDC, %ebp # ldc # MEMORY movl B, %ebx sarl $1, %eax leal (, %ebp, SIZE), %ebp leal 0(%ecx) , %ecx # NOP movl %eax, J # j = (n >> 1) # MEMORY test %eax, %eax je .L8 # if !(n >> 1) goto .L8 ALIGN_4 .L34: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl %ebx, BX movl M, %esi # m # MEMORY movl A, %edx # a # MEMORY movl C, %edi # C # MEMORY sarl $1, %esi # i = (m >> 1) je .L12 ALIGN_4 .MainHead: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl %ebx, %ecx #else movl KK, %eax leal (, %eax, SIZE), %eax leal (%edx, %eax, 2), %edx leal (%ebx, %eax, 2), %ecx #endif #ifdef HAVE_SSE movl BX, %eax prefetcht2 0 * SIZE(%eax) prefetcht2 4 * SIZE(%eax) #if L2_SIZE > 262144 subl $-8 * SIZE, BX #elif L2_SIZE > 131072 prefetcht2 8 * SIZE(%eax) prefetcht2 12 * SIZE(%eax) subl $-16 * SIZE, BX #else prefetcht2 16 * SIZE(%eax) prefetcht2 20 * SIZE(%eax) prefetcht2 24 * SIZE(%eax) prefetcht2 28 * SIZE(%eax) subl $-32 * SIZE, BX #endif #endif fldz fldz #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl 
%eax, KKK #endif fldz fldz FLD 4 * SIZE(%ecx) # b5 FLD 4 * SIZE(%edx) # a5 FLD 0 * SIZE(%ecx) # b1 FLD 0 * SIZE(%edx) # a1 #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(%edi) prefetchw 2 * SIZE(%edi, %ebp, 1) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(%edi) prefetchnta 2 * SIZE(%edi, %ebp, 1) #endif sarl $2, %eax je .L16 ALIGN_4 .MainLoop: #if defined(HAVE_3DNOW) prefetch (PREFETCH_OFFSET) * SIZE(%ecx) nop #elif defined(HAVE_SSE) prefetchnta (PREFETCH_OFFSET) * SIZE(%ecx) #ifdef CORE_KATMAI prefetcht0 (PREFETCH_OFFSET) * SIZE(%edx) #endif #endif fmul %st, %st(1) FMUL 1 * SIZE(%ecx) fxch %st(1) faddp %st, %st(4) FLD 0 * SIZE(%ecx) fxch %st(1) faddp %st, %st(5) FLD 1 * SIZE(%edx) fmul %st, %st(1) FMUL 1 * SIZE(%ecx) fxch %st(1) faddp %st, %st(6) FLD 2 * SIZE(%ecx) fxch %st(1) faddp %st, %st(7) FLD 2 * SIZE(%edx) fmul %st, %st(1) FMUL 3 * SIZE(%ecx) fxch %st(1) faddp %st, %st(4) FLD 2 * SIZE(%ecx) fxch %st(1) faddp %st, %st(5) FLD 3 * SIZE(%edx) fmul %st, %st(1) FMUL 3 * SIZE(%ecx) fxch %st(1) faddp %st, %st(6) FLD 8 * SIZE(%ecx) fxch %st(1) faddp %st, %st(7) FLD 8 * SIZE(%edx) fxch %st(2) #if !defined(HAVE_3DNOW) && defined(HAVE_SSE) && defined(DOUBLE) prefetchnta (PREFETCH_OFFSET + 4) * SIZE(%ecx) #ifdef CORE_KATMAI prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(%edx) #endif #endif fmul %st, %st(3) FMUL 5 * SIZE(%ecx) fxch %st(3) faddp %st, %st(4) FLD 4 * SIZE(%ecx) fxch %st(3) faddp %st, %st(5) FLD 5 * SIZE(%edx) fmul %st, %st(3) FMUL 5 * SIZE(%ecx) fxch %st(3) faddp %st, %st(6) FLD 6 * SIZE(%ecx) fxch %st(3) faddp %st, %st(7) FLD 6 * SIZE(%edx) fmul %st, %st(3) FMUL 7 * SIZE(%ecx) fxch %st(3) faddp %st, %st(4) FLD 6 * SIZE(%ecx) fxch %st(3) faddp %st, %st(5) FLD 7 * SIZE(%edx) fmul %st, %st(3) FMUL 7 * SIZE(%ecx) fxch %st(3) faddp %st, %st(6) FLD 12 * SIZE(%ecx) fxch %st(3) faddp %st, %st(7) FLD 12 * SIZE(%edx) fxch %st(2) subl $-8 * SIZE, %ecx subl $-8 * SIZE, %edx decl %eax # l -- jne .MainLoop ALIGN_4 .L16: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif and $3, %eax je .L21 ALIGN_4 .SubLoop: fmul %st, %st(1) FMUL 1 * SIZE(%ecx) fxch %st(1) faddp %st, %st(4) FLD 0 * SIZE(%ecx) fxch %st(1) faddp %st, %st(5) FLD 1 * SIZE(%edx) fmul %st, %st(1) FMUL 1 * SIZE(%ecx) fxch %st(1) faddp %st, %st(6) FLD 2 * SIZE(%ecx) fxch %st(1) faddp %st, %st(7) FLD 2 * SIZE(%edx) addl $2 * SIZE,%ecx addl $2 * SIZE,%edx decl %eax jne .SubLoop ALIGN_4 .L21: ffreep %st(0) ffreep %st(0) ffreep %st(0) ffreep %st(0) FLD ALPHA fmul %st, %st(4) fmul %st, %st(1) fmul %st, %st(2) fmulp %st, %st(3) #ifndef TRMMKERNEL FADD 0 * SIZE(%edi) FST 0 * SIZE(%edi) FADD 0 * SIZE(%edi,%ebp) FST 0 * SIZE(%edi,%ebp) FADD 1 * SIZE(%edi) FST 1 * SIZE(%edi) FADD 1 * SIZE(%edi,%ebp) FST 1 * SIZE(%edi,%ebp) #else FST 0 * SIZE(%edi) FST 0 * SIZE(%edi,%ebp) FST 1 * SIZE(%edi) FST 1 * SIZE(%edi,%ebp) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (%edx, %eax, 2), %edx leal (%ecx, %eax, 2), %ecx #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, %edi rep decl %esi # i -- rep jne .MainHead ALIGN_4 .L12: movl M, %eax # m # MEMORY andl $1, %eax je .L27 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl %ebx, %ecx #else movl KK, %eax leal (, %eax, SIZE), %eax leal (%edx, %eax, 1), %edx leal (%ebx, %eax, 2), %ecx #endif fldz fldz FLD 0 * SIZE(%edx) # temp1 = 
*(aoffset + 0) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $1,%eax # k >> 1 # MEMORY je .L54 ALIGN_4 .L55: FLD 0 * SIZE(%ecx) # temp2 = *(boffset + 0) rep fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(%ecx) # temp2 = *(boffset + 0) faddp %st, %st(2) FLD 1 * SIZE(%edx) # temp1 = *(aoffset + 0) FLD 2 * SIZE(%ecx) # temp2 = *(boffset + 0) rep fmul %st(1), %st faddp %st, %st(2) FMUL 3 * SIZE(%ecx) # temp2 = *(boffset + 0) faddp %st, %st(2) FLD 2 * SIZE(%edx) # temp1 = *(aoffset + 0) addl $2 * SIZE, %edx addl $4 * SIZE, %ecx decl %eax jne .L55 ALIGN_4 .L54: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $1,%eax # k & 1 je .L33 ALIGN_4 FLD 0 * SIZE(%ecx) # temp2 = *(boffset + 0) rep fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(%ecx) # temp2 = *(boffset + 0) faddp %st, %st(2) FLD 1 * SIZE(%edx) # temp1 = *(aoffset + 0) addl $1 * SIZE, %edx addl $2 * SIZE, %ecx ALIGN_4 .L33: ffreep %st(0) FLD ALPHA fmul %st, %st(2) fmulp %st, %st(1) #ifndef TRMMKERNEL FADD (%edi) FST (%edi) FADD (%edi,%ebp) FST (%edi,%ebp) #else FST (%edi) FST (%edi,%ebp) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (%edx, %eax, 1), %edx leal (%ecx, %eax, 2), %ecx #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L27: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif lea (, %ebp, 2), %eax addl %eax, C # C + 2 * ldc # MEMORY movl %ecx, %ebx # b # MEMORY decl J # j-- # MEMORY jne .L34 ALIGN_4 .L8: movl N, %eax # n # MEMORY andl $1, %eax je .End #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, %edi # c # MEMORY movl A, %edx # a # MEMORY movl M, %esi # m # MEMORY sarl $1, %esi # m >> 1 je .L36 ALIGN_4 .L46: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl %ebx, %ecx #else movl KK, %eax leal (, %eax, SIZE), %eax leal (%edx, %eax, 2), %edx leal (%ebx, %eax, 1), %ecx #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif fldz sarl $1, %eax fldz FLD 0 * SIZE(%ecx) # temp1 = *(boffset + 0) je .L56 ALIGN_4 .L57: FLD 0 * SIZE(%edx) # temp2 = *(aoffset + 0) fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(%edx) # temp2 = *(aoffset + 0) faddp %st, %st(2) FLD 1 * SIZE(%ecx) # temp1 = *(boffset + 0) FLD 2 * SIZE(%edx) # temp2 = *(aoffset + 0) fmul %st(1), %st faddp %st, %st(2) FMUL 3 * SIZE(%edx) # temp2 = *(aoffset + 0) faddp %st, %st(2) FLD 2 * SIZE(%ecx) # temp1 = *(boffset + 0) addl $4 * SIZE,%edx addl $2 * SIZE,%ecx dec %eax jne .L57 ALIGN_4 .L56: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $1, %eax je .L45 ALIGN_4 FLD 0 * SIZE(%edx) # temp2 = *(aoffset + 0) fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(%edx) # temp2 = *(aoffset + 0) faddp %st, %st(2) FLD 3 * SIZE(%ecx) # temp1 = *(boffset + 0) addl $2 * SIZE,%edx addl $1 * SIZE,%ecx ALIGN_4 .L45: ffreep %st(0) FLD ALPHA fmul %st, %st(1) fmulp %st, %st(2) #ifndef TRMMKERNEL FADD 0 * SIZE(%edi) FST 0 * SIZE(%edi) 
FADD 1 * SIZE(%edi) FST 1 * SIZE(%edi) #else FST 0 * SIZE(%edi) FST 1 * SIZE(%edi) #endif addl $2 * SIZE, %edi #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (%edx, %eax, 2), %edx leal (%ecx, %eax, 1), %ecx #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif decl %esi # i -- jne .L46 ALIGN_4 .L36: movl M, %eax # m # MEMORY andl $1, %eax # m & 1 je .End #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl %ebx, %ecx #else movl KK, %eax leal (, %eax, SIZE), %eax leal (%edx, %eax, 1), %edx leal (%ebx, %eax, 1), %ecx #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif fldz ALIGN_3 .L51: FLD (%edx) FMUL (%ecx) addl $1 * SIZE,%edx addl $1 * SIZE,%ecx faddp %st,%st(1) decl %eax jne .L51 FMUL ALPHA #ifndef TRMMKERNEL FADD (%edi) FST (%edi) #else FST (%edi) #endif ALIGN_4 .End: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_kernel_2x2_atom.S000066400000000000000000000340331313527062700212000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define ARG_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define ARG_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #define PREFETCH prefetcht0 #define PREFETCHSIZE 84 #define AA %edx #define BB %ecx #define CO1 %esi #define LDC %ebp #define B %edi PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC #ifdef TRMMKERNEL movl OFFSET, %eax #ifndef LEFT negl %eax #endif movl %eax, KK #endif leal (, LDC, SIZE), LDC movl N, %eax sarl $1, %eax movl %eax, J jle .L30 ALIGN_2 .L10: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sall $BASE_SHIFT + 1, %eax leal (B, %eax), %eax movl %eax, BX movl C, CO1 # coffset = c leal (, LDC, 2), %eax addl %eax, C movl A, AA # aoffset = a movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB #endif movl BX, %eax prefetcht0 0 * SIZE(%eax) subl $-8 * SIZE, BX movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 prefetcht0 3 * SIZE(CO1) xorps %xmm5, %xmm5 prefetcht0 3 * SIZE(CO1, LDC) xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $2, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addsd %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 0 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 1 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 0 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 1 * SIZE(BB), %xmm3 addsd %xmm2, %xmm6 movsd 3 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 3 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 4 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 2 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 3 * SIZE(BB), %xmm3 addsd %xmm2, %xmm6 movsd 5 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 4 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 5 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 6 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 4 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 5 * SIZE(BB), %xmm3 addsd %xmm2, %xmm6 movsd 7 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 7 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 8 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 6 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 7 * SIZE(BB), %xmm3 addl $8 * SIZE, BB addl $8 * SIZE, AA decl %eax jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $3, %eax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addsd %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 0 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 1 * SIZE(BB), %xmm1 
addsd %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 0 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 1 * SIZE(BB), %xmm3 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: movsd ALPHA, %xmm0 addsd %xmm2, %xmm6 addsd %xmm3, %xmm7 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 mulsd %xmm0, %xmm6 mulsd %xmm0, %xmm7 #ifndef TRMMKERNEL addsd 0 * SIZE(CO1), %xmm4 addsd 1 * SIZE(CO1), %xmm6 addsd 0 * SIZE(CO1, LDC), %xmm5 addsd 1 * SIZE(CO1, LDC), %xmm7 #endif movsd %xmm4, 0 * SIZE(CO1) movsd %xmm6, 1 * SIZE(CO1) movsd %xmm5, 0 * SIZE(CO1, LDC) movsd %xmm7, 1 * SIZE(CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, CO1 decl %ebx jg .L11 ALIGN_4 .L20: movl M, %ebx testl $1, %ebx jle .L29 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), BB #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $2, %eax je .L25 ALIGN_4 .L22: addsd %xmm2, %xmm4 movsd 0 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 1 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulsd %xmm0, %xmm3 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 2 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 3 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 movsd 2 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 5 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 movsd 3 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 6 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 7 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $3, %eax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: addsd %xmm2, %xmm4 movsd 0 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 1 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 movsd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: movsd ALPHA, %xmm0 addsd %xmm2, %xmm4 addsd %xmm3, %xmm5 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 #ifndef TRMMKERNEL addsd 0 * SIZE(CO1), %xmm4 addsd 0 * SIZE(CO1, LDC), %xmm5 #endif movsd %xmm4, 0 * SIZE(CO1) movsd %xmm5, 0 * SIZE(CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif addl $1 * SIZE, CO1 ALIGN_4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif movl BB, B decl J jg .L10 ALIGN_4 .L30: testl $1, N je .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, CO1 addl LDC, C movl A, AA movl M, %ebx sarl $1, %ebx jle .L40 ALIGN_4 .L31: 
#if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB #endif movsd 0 * SIZE(BB), %xmm1 xorps %xmm0, %xmm0 prefetcht0 3 * SIZE(CO1) xorps %xmm2, %xmm2 xorps %xmm4, %xmm4 xorps %xmm6, %xmm6 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $2, %eax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addsd %xmm0, %xmm4 movsd 0 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 1 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 3 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 2 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 4 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 5 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 3 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 6 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 7 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 4 * SIZE(BB), %xmm1 addl $8 * SIZE, AA addl $4 * SIZE, BB decl %eax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $3, %eax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: addsd %xmm0, %xmm4 movsd 0 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 1 * SIZE(BB), %xmm1 addl $2 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: movsd ALPHA, %xmm3 addsd %xmm0, %xmm4 addsd %xmm2, %xmm6 mulsd %xmm3, %xmm4 mulsd %xmm3, %xmm6 #ifndef TRMMKERNEL addsd 0 * SIZE(CO1), %xmm4 addsd 1 * SIZE(CO1), %xmm6 #endif movsd %xmm4, 0 * SIZE(CO1) movsd %xmm6, 1 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 1), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, CO1 decl %ebx jg .L31 ALIGN_4 .L40: movl M, %ebx testl $1, %ebx jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), BB #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movsd 0 * SIZE(BB), %xmm2 xorps %xmm5, %xmm5 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $2, %eax je .L45 ALIGN_4 .L42: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 1 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addsd %xmm2, %xmm5 movsd 2 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 movsd 3 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 3 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addsd %xmm2, %xmm5 movsd 4 * SIZE(BB), %xmm2 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $3, %eax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulsd %xmm0, 
%xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 1 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: movsd ALPHA, %xmm0 addsd %xmm5, %xmm4 mulsd %xmm0, %xmm4 #ifndef TRMMKERNEL addsd 0 * SIZE(CO1), %xmm4 #endif movsd %xmm4, 0 * SIZE(CO1) ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_kernel_2x4_3dnow.S000066400000000000000000001250601313527062700212750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define OLD_M 4 + STACK + ARGS(%esi) #define OLD_N 8 + STACK + ARGS(%esi) #define OLD_K 12 + STACK + ARGS(%esi) #define OLD_ALPHA 16 + STACK + ARGS(%esi) #define OLD_A 20 + STACK + ARGS(%esi) #define OLD_B 24 + STACK + ARGS(%esi) #define OLD_C 28 + STACK + ARGS(%esi) #define OLD_LDC 32 + STACK + ARGS(%esi) #define OLD_OFFSET 36 + STACK + ARGS(%esi) #define ALPHA 0(%esp) #define K 8(%esp) #define N 12(%esp) #define M 16(%esp) #define A 20(%esp) #define C 24(%esp) #define J 28(%esp) #define OLD_STACK 32(%esp) #define OFFSET 36(%esp) #define KK 40(%esp) #define KKK 44(%esp) #define BUFFER 64(%esp) #define AA %edx #define BB %ecx #define PREFETCHSIZE (16 * 2 + 6) #define AOFFSET -32 #define BOFFSET 128 /* A hint of scheduling is received from following URL https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=flat&viewmonth=200309&viewday=11 */ PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE, %esp movl OLD_M, %ebx andl $-1024, %esp # align stack STACK_TOUCHING movl OLD_N, %eax movl OLD_K, %ecx movl OLD_A, %edx movd OLD_ALPHA, %mm3 movl %ebx, M movl %eax, N movl %ecx, K subl $AOFFSET * SIZE, %edx movl %edx, A movl %esi, OLD_STACK movl OLD_B, %edi movl OLD_C, %ebx punpckldq %mm3, %mm3 movq %mm3, ALPHA movl %ebx, C movl OLD_LDC, %ebp leal (, %ebp, SIZE), %ebp #ifdef TRMMKERNEL movl OLD_OFFSET, %eax movl %eax, OFFSET #ifndef LEFT negl %eax movl %eax, KK #endif #endif movl N, %eax sarl $2, %eax movl %eax, J jle .L30 ALIGN_3 .L01: /* Copying to Sub Buffer */ leal BUFFER, %ecx #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sarl $2, %eax jle .L03 ALIGN_3 .L02: movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 movd 2 * SIZE(%edi), %mm2 movd 3 * SIZE(%edi), %mm3 movd 4 * SIZE(%edi), %mm4 movd 5 * SIZE(%edi), %mm5 movd 6 * SIZE(%edi), %mm6 movd 7 * SIZE(%edi), %mm7 prefetchnta 72 * SIZE(%edi) punpckldq %mm0, %mm0 punpckldq %mm1, %mm1 punpckldq %mm2, %mm2 punpckldq %mm3, %mm3 punpckldq %mm4, %mm4 punpckldq %mm5, %mm5 punpckldq %mm6, %mm6 punpckldq %mm7, %mm7 movq %mm0, 0 * SIZE(%ecx) movq %mm1, 2 * SIZE(%ecx) movq %mm2, 4 * SIZE(%ecx) movq %mm3, 6 * SIZE(%ecx) movq %mm4, 8 * SIZE(%ecx) movq %mm5, 10 * SIZE(%ecx) movq %mm6, 12 * SIZE(%ecx) movq %mm7, 14 * SIZE(%ecx) movd 8 * SIZE(%edi), %mm0 movd 9 * SIZE(%edi), %mm1 movd 10 * SIZE(%edi), %mm2 movd 11 * SIZE(%edi), %mm3 movd 12 * SIZE(%edi), %mm4 movd 13 * SIZE(%edi), %mm5 movd 14 * SIZE(%edi), %mm6 movd 15 * SIZE(%edi), %mm7 punpckldq %mm0, %mm0 punpckldq %mm1, %mm1 punpckldq %mm2, %mm2 punpckldq %mm3, %mm3 punpckldq %mm4, %mm4 punpckldq %mm5, %mm5 punpckldq %mm6, %mm6 punpckldq %mm7, %mm7 movq %mm0, 16 * SIZE(%ecx) movq %mm1, 18 * SIZE(%ecx) movq %mm2, 20 * SIZE(%ecx) movq %mm3, 22 * SIZE(%ecx) movq %mm4, 24 * SIZE(%ecx) movq %mm5, 26 * SIZE(%ecx) movq %mm6, 28 * SIZE(%ecx) movq %mm7, 30 * SIZE(%ecx) addl $16 * SIZE, %edi addl $32 * SIZE, %ecx decl %eax jne .L02 .L03: movl K, %eax andl $3, %eax BRANCH jle .L10 ALIGN_2 .L04: movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 movd 2 * SIZE(%edi), %mm2 movd 3 * SIZE(%edi), %mm3 punpckldq %mm0, %mm0 punpckldq %mm1, %mm1 punpckldq %mm2, %mm2 punpckldq %mm3, %mm3 movq %mm0, 0 * SIZE(%ecx) movq %mm1, 2 * SIZE(%ecx) movq %mm2, 4 * SIZE(%ecx) movq %mm3, 6 * SIZE(%ecx) addl $4 * SIZE, %edi addl $8 * SIZE, %ecx decl %eax 
jne .L04 ALIGN_4 .L10: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: leal - BOFFSET * SIZE + BUFFER, BB #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB #endif movq ( 0 + AOFFSET) * SIZE(AA), %mm0 pxor %mm4, %mm4 movq ( 16 + AOFFSET) * SIZE(AA), %mm1 pxor %mm5, %mm5 PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 pxor %mm6, %mm6 PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3 pxor %mm7, %mm7 leal (%ebp, %ebp, 2), %eax prefetchw 2 * SIZE(%esi) prefetchw 2 * SIZE(%esi, %ebp) prefetchw 2 * SIZE(%esi, %ebp, 2) prefetchw 2 * SIZE(%esi, %eax) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $4, %eax je .L15 ALIGN_4 .L12: pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movq ( 2 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm5 PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm6 PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 2 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movq ( 10 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm5 PADDING movq ( 12 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm6 PADDING movq ( 32 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 4 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm4 PADDING movq ( 18 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm5 PADDING movq ( 20 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm6 PADDING movq ( 24 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 6 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm4 PADDING movq ( 26 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm5 PADDING movq ( 28 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm6 PADDING movq ( 48 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 8 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movq ( 34 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm5 PADDING movq ( 36 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm6 PADDING movq ( 40 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 38 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 10 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movq ( 42 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm5 PADDING movq ( 44 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm6 PADDING movq ( 64 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 46 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 12 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm4 PADDING movq ( 50 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm5 PADDING movq ( 52 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm6 PADDING movq ( 56 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 54 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 14 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm4 PADDING movq ( 58 + BOFFSET) * SIZE(BB), %mm3 
pfmul %mm0, %mm3 pfadd %mm3, %mm5 PADDING movq ( 60 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm6 PADDING movq ( 80 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 62 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 32 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm1, %mm2 pfadd %mm2, %mm4 PADDING movq ( 66 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm5 PADDING movq ( 68 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm6 PADDING movq ( 72 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 70 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 18 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm2 pfadd %mm2, %mm4 PADDING movq ( 74 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm5 PADDING movq ( 76 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm6 PADDING movq ( 96 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 78 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 20 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm4 PADDING movq ( 82 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm5 PADDING movq ( 84 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm6 PADDING movq ( 88 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 86 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 22 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm4 PADDING movq ( 90 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm5 PADDING movq ( 92 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm6 PADDING movq (112 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 94 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 24 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm2 pfadd %mm2, %mm4 PADDING movq ( 98 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm5 PADDING movq (100 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm6 PADDING movq (104 + BOFFSET) * SIZE(BB), %mm2 pfmul (102 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 26 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm2 pfadd %mm2, %mm4 PADDING movq (106 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm5 PADDING movq (108 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm6 PADDING movq (128 + BOFFSET) * SIZE(BB), %mm2 pfmul (110 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 28 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm4 PADDING movq (114 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm5 PADDING movq (116 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm6 PADDING movq (120 + BOFFSET) * SIZE(BB), %mm3 pfmul (118 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 30 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm4 PADDING movq (122 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm5 PADDING movq (124 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm6 PADDING movq (144 + BOFFSET) * SIZE(BB), %mm3 pfmul (126 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 48 + AOFFSET) * SIZE(AA), %mm1 subl $-32 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L12 ALIGN_3 .L15: movq ALPHA, %mm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $15, %eax BRANCH je .L18 ALIGN_3 .L16: pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movq ( 2 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm5 PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm6 PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 2 + AOFFSET) * SIZE(AA), %mm0 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L16 ALIGN_3 .L18: leal (%ebp, %ebp, 2), %eax 
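/* .L18 epilogue of the 2x4 block: %mm4..%mm7 each hold one packed column pair,
   %mm3 holds alpha and %eax = 3 * ldc.  Scale by alpha and, unless built as the
   TRMM kernel, accumulate the existing C values at esi, esi+ldc, esi+2*ldc and
   esi+3*ldc before storing. */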
#ifndef TRMMKERNEL pfmul %mm3, %mm4 pfadd 0 * SIZE(%esi), %mm4 pfmul %mm3, %mm5 pfadd 0 * SIZE(%esi, %ebp, 1), %mm5 pfmul %mm3, %mm6 pfadd 0 * SIZE(%esi, %ebp, 2), %mm6 pfmul %mm3, %mm7 pfadd 0 * SIZE(%esi, %eax, 1), %mm7 #else pfmul %mm3, %mm4 pfmul %mm3, %mm5 pfmul %mm3, %mm6 pfmul %mm3, %mm7 #endif movq %mm4, 0 * SIZE(%esi) movq %mm5, 0 * SIZE(%esi, %ebp, 1) movq %mm6, 0 * SIZE(%esi, %ebp, 2) movq %mm7, 0 * SIZE(%esi, %eax, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L11 ALIGN_4 .L20: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L29 ALIGN_4 .L21: leal - BOFFSET * SIZE + BUFFER, BB #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif movq ( 0 + AOFFSET) * SIZE(AA), %mm0 pxor %mm4, %mm4 movq ( 8 + AOFFSET) * SIZE(AA), %mm1 pxor %mm5, %mm5 PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 pxor %mm6, %mm6 PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3 pxor %mm7, %mm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $4, %eax je .L25 ALIGN_4 .L22: pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movd ( 2 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm5 PADDING movd ( 4 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm6 PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) PADDING movd ( 8 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movd ( 1 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movd ( 10 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm5 PADDING movd ( 12 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm6 PADDING movd ( 32 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movd ( 2 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm4 PADDING movd ( 18 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm5 PADDING movd ( 20 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm6 PADDING movd ( 24 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movd ( 3 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm4 PADDING movd ( 26 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm5 PADDING movd ( 28 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm6 PADDING movd ( 48 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movd ( 4 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movd ( 34 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm5 PADDING movd ( 36 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm6 PADDING movd ( 40 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 38 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movd ( 5 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movd ( 42 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm5 PADDING movd ( 44 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 
pfadd %mm2, %mm6 PADDING movd ( 64 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 46 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movd ( 6 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm4 PADDING movd ( 50 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm5 PADDING movd ( 52 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm6 PADDING movd ( 56 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 54 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movd ( 7 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm4 PADDING movd ( 58 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm5 PADDING movd ( 60 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm6 PADDING movd ( 80 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 62 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movd ( 16 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm1, %mm2 pfadd %mm2, %mm4 PADDING movd ( 66 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm5 PADDING movd ( 68 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm6 PADDING movd ( 72 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 70 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movd ( 9 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm2 pfadd %mm2, %mm4 PADDING movd ( 74 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm5 PADDING movd ( 76 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm6 PADDING movd ( 96 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 78 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movd ( 10 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm4 PADDING movd ( 82 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm5 PADDING movd ( 84 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm6 PADDING movd ( 88 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 86 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movd ( 11 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm4 PADDING movd ( 90 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm5 PADDING movd ( 92 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm6 PADDING movd (112 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 94 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movd ( 12 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm2 pfadd %mm2, %mm4 PADDING movd ( 98 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm5 PADDING movd (100 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm6 PADDING movd (104 + BOFFSET) * SIZE(BB), %mm2 pfmul (102 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movd ( 13 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm2 pfadd %mm2, %mm4 PADDING movd (106 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm5 PADDING movd (108 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm6 PADDING movd (128 + BOFFSET) * SIZE(BB), %mm2 pfmul (110 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movd ( 14 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm4 PADDING movd (114 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm5 PADDING movd (116 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm6 PADDING movd (120 + BOFFSET) * SIZE(BB), %mm3 pfmul (118 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movd ( 15 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm4 PADDING movd (122 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm5 PADDING movd (124 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm6 PADDING movd (144 + BOFFSET) * SIZE(BB), %mm3 pfmul (126 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movd ( 24 + AOFFSET) * SIZE(AA), %mm1 subl $-16 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L22 
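/* End of the 16x-unrolled K loop for the single-row (M & 1) by 4 tail.
   .L25/.L26 below run the remaining K & 15 iterations one at a time
   (one A element against four buffered B values per pass), then .L28
   applies alpha and the optional C accumulation. */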
ALIGN_3 .L25: movd ALPHA, %mm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $15, %eax BRANCH je .L28 ALIGN_3 .L26: pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movd ( 2 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm5 PADDING movd ( 4 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm6 PADDING movd ( 8 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movd ( 1 + AOFFSET) * SIZE(AA), %mm0 addl $1 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L26 ALIGN_3 .L28: leal (%ebp, %ebp, 2), %eax pfmul %mm3, %mm4 pfmul %mm3, %mm5 pfmul %mm3, %mm6 pfmul %mm3, %mm7 #ifndef TRMMKERNEL movd 0 * SIZE(%esi) , %mm0 movd 0 * SIZE(%esi, %ebp, 1), %mm1 movd 0 * SIZE(%esi, %ebp, 2), %mm2 movd 0 * SIZE(%esi, %eax, 1), %mm3 pfadd %mm0, %mm4 pfadd %mm1, %mm5 pfadd %mm2, %mm6 pfadd %mm3, %mm7 #endif movd %mm4, 0 * SIZE(%esi) movd %mm5, 0 * SIZE(%esi, %ebp, 1) movd %mm6, 0 * SIZE(%esi, %ebp, 2) movd %mm7, 0 * SIZE(%esi, %eax, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leal (, %ebp, 4), %eax addl %eax, C # c += 4 * ldc decl J # j -- jg .L01 ALIGN_4 .L30: movl N, %eax testl $2, %eax jle .L60 ALIGN_3 .L31: /* Copying to Sub Buffer */ leal BUFFER, %ecx #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sarl $2, %eax jle .L33 ALIGN_3 .L32: movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 movd 2 * SIZE(%edi), %mm2 movd 3 * SIZE(%edi), %mm3 movd 4 * SIZE(%edi), %mm4 movd 5 * SIZE(%edi), %mm5 movd 6 * SIZE(%edi), %mm6 movd 7 * SIZE(%edi), %mm7 prefetchnta 72 * SIZE(%edi) punpckldq %mm0, %mm0 punpckldq %mm1, %mm1 punpckldq %mm2, %mm2 punpckldq %mm3, %mm3 punpckldq %mm4, %mm4 punpckldq %mm5, %mm5 punpckldq %mm6, %mm6 punpckldq %mm7, %mm7 movq %mm0, 0 * SIZE(%ecx) movq %mm1, 2 * SIZE(%ecx) movq %mm2, 4 * SIZE(%ecx) movq %mm3, 6 * SIZE(%ecx) movq %mm4, 8 * SIZE(%ecx) movq %mm5, 10 * SIZE(%ecx) movq %mm6, 12 * SIZE(%ecx) movq %mm7, 14 * SIZE(%ecx) addl $ 8 * SIZE, %edi addl $16 * SIZE, %ecx decl %eax jne .L32 .L33: movl K, %eax andl $3, %eax BRANCH jle .L40 ALIGN_2 .L34: movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 punpckldq %mm0, %mm0 punpckldq %mm1, %mm1 movq %mm0, 0 * SIZE(%ecx) movq %mm1, 2 * SIZE(%ecx) addl $2 * SIZE, %edi addl $4 * SIZE, %ecx decl %eax jne .L34 ALIGN_4 .L40: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L50 ALIGN_4 .L41: leal - BOFFSET * SIZE + BUFFER, BB #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif movq ( 0 + AOFFSET) * SIZE(AA), %mm0 pxor %mm4, %mm4 movq ( 16 + AOFFSET) * SIZE(AA), %mm1 pxor %mm5, %mm5 PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 pxor %mm6, %mm6 PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3 pxor %mm7, %mm7 prefetchw 2 * SIZE(%esi) prefetchw 2 * SIZE(%esi, %ebp) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl 
%eax, KKK #endif sarl $4, %eax je .L45 ALIGN_4 .L42: pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm5 movq ( 2 + AOFFSET) * SIZE(AA), %mm0 PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) pfmul %mm0, %mm2 pfadd %mm2, %mm6 PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 4 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movq ( 12 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 10 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm5 movq ( 6 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm2 pfadd %mm2, %mm6 PADDING movq ( 32 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 8 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm4 PADDING movq ( 20 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 18 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm5 movq ( 10 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm6 PADDING movq ( 24 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 12 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm4 PADDING movq ( 28 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 26 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm5 movq ( 14 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm6 PADDING movq ( 48 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 32 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm1, %mm2 pfadd %mm2, %mm4 PADDING movq ( 36 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 34 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm5 movq ( 18 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm2 pfadd %mm2, %mm6 PADDING movq ( 40 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 38 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 20 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm2 pfadd %mm2, %mm4 PADDING movq ( 44 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 42 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm5 movq ( 22 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm2 pfadd %mm2, %mm6 PADDING movq ( 64 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 46 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 24 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm4 PADDING movq ( 52 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 50 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm5 movq ( 26 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm6 PADDING movq ( 56 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 54 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 28 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm4 PADDING movq ( 60 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 58 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm5 movq ( 30 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm6 PADDING movq ( 80 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 62 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 48 + AOFFSET) * SIZE(AA), %mm1 subl $-32 * SIZE, AA addl $ 64 * SIZE, BB decl %eax jne .L42 ALIGN_3 .L45: movq ALPHA, %mm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $15, %eax BRANCH je .L48 ALIGN_3 .L46: pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm5 movq ( 2 + AOFFSET) * SIZE(AA), %mm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L46 ALIGN_3 .L48: pfadd %mm6, %mm4 pfadd %mm7, %mm5 pfmul %mm3, %mm4 pfmul %mm3, %mm5 #ifndef TRMMKERNEL pfadd 0 * SIZE(%esi), %mm4 pfadd 0 * SIZE(%esi, %ebp, 1), %mm5 #endif movq %mm4, 0 * SIZE(%esi) movq %mm5, 0 * SIZE(%esi, %ebp, 1) #if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L41 ALIGN_4 .L50: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L59 ALIGN_4 .L51: leal - BOFFSET * SIZE + BUFFER, BB #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif movq ( 0 + AOFFSET) * SIZE(AA), %mm0 pxor %mm4, %mm4 movq ( 8 + AOFFSET) * SIZE(AA), %mm1 pxor %mm5, %mm5 PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 pxor %mm6, %mm6 PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3 pxor %mm7, %mm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $4, %eax je .L55 ALIGN_4 .L52: pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movd ( 4 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm5 movd ( 1 + AOFFSET) * SIZE(AA), %mm0 PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) pfmul %mm0, %mm2 pfadd %mm2, %mm6 PADDING movd ( 8 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movd ( 2 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movd ( 12 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 10 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm5 movd ( 3 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm2 pfadd %mm2, %mm6 PADDING movd ( 32 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movd ( 4 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm4 PADDING movd ( 20 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 18 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm5 movd ( 5 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm6 PADDING movd ( 24 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movd ( 6 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm4 PADDING movd ( 28 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 26 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm5 movd ( 7 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm6 PADDING movd ( 48 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movd ( 16 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm1, %mm2 pfadd %mm2, %mm4 PADDING movd ( 36 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 34 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm5 movd ( 9 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm2 pfadd %mm2, %mm6 PADDING movd ( 40 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 38 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movd ( 10 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm2 pfadd %mm2, %mm4 PADDING movd ( 44 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 42 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm5 movd ( 11 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm2 pfadd %mm2, %mm6 PADDING movd ( 64 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 46 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movd ( 12 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm4 PADDING movd ( 52 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 50 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm5 movd ( 13 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd 
%mm3, %mm6 PADDING movd ( 56 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 54 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movd ( 14 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm4 PADDING movd ( 60 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 58 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm5 movd ( 15 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm6 PADDING movd ( 80 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 62 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movd ( 24 + AOFFSET) * SIZE(AA), %mm1 subl $-16 * SIZE, AA addl $ 64 * SIZE, BB decl %eax jne .L52 ALIGN_3 .L55: movd ALPHA, %mm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $15, %eax BRANCH je .L58 ALIGN_3 .L56: pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movd ( 4 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm5 movd ( 1 + AOFFSET) * SIZE(AA), %mm0 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L56 ALIGN_3 .L58: pfadd %mm6, %mm4 pfadd %mm7, %mm5 pfmul %mm3, %mm4 pfmul %mm3, %mm5 #ifndef TRMMKERNEL movd 0 * SIZE(%esi) , %mm0 movd 0 * SIZE(%esi, %ebp, 1), %mm1 pfadd %mm0, %mm4 pfadd %mm1, %mm5 #endif movd %mm4, 0 * SIZE(%esi) movd %mm5, 0 * SIZE(%esi, %ebp, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L59: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, %ebp, 2), %eax addl %eax, C # c += 4 * ldc ALIGN_4 .L60: movl N, %eax testl $1, %eax jle .L999 ALIGN_3 .L61: /* Copying to Sub Buffer */ leal BUFFER, %ecx #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sarl $3, %eax jle .L63 ALIGN_3 .L62: movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 movd 2 * SIZE(%edi), %mm2 movd 3 * SIZE(%edi), %mm3 movd 4 * SIZE(%edi), %mm4 movd 5 * SIZE(%edi), %mm5 movd 6 * SIZE(%edi), %mm6 movd 7 * SIZE(%edi), %mm7 prefetchnta 72 * SIZE(%edi) punpckldq %mm0, %mm0 punpckldq %mm1, %mm1 punpckldq %mm2, %mm2 punpckldq %mm3, %mm3 punpckldq %mm4, %mm4 punpckldq %mm5, %mm5 punpckldq %mm6, %mm6 punpckldq %mm7, %mm7 movq %mm0, 0 * SIZE(%ecx) movq %mm1, 2 * SIZE(%ecx) movq %mm2, 4 * SIZE(%ecx) movq %mm3, 6 * SIZE(%ecx) movq %mm4, 8 * SIZE(%ecx) movq %mm5, 10 * SIZE(%ecx) movq %mm6, 12 * SIZE(%ecx) movq %mm7, 14 * SIZE(%ecx) addl $ 8 * SIZE, %edi addl $16 * SIZE, %ecx decl %eax jne .L62 .L63: movl K, %eax andl $7, %eax BRANCH jle .L70 ALIGN_2 .L64: movd 0 * SIZE(%edi), %mm0 punpckldq %mm0, %mm0 movq %mm0, 0 * SIZE(%ecx) addl $1 * SIZE, %edi addl $2 * SIZE, %ecx decl %eax jne .L64 ALIGN_4 .L70: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L90 ALIGN_4 .L71: leal - BOFFSET * SIZE + BUFFER, BB #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif movq ( 0 + AOFFSET) * SIZE(AA), %mm0 pxor %mm4, %mm4 movq ( 16 + AOFFSET) * SIZE(AA), %mm1 pxor %mm5, %mm5 PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 pxor %mm6, %mm6 pxor %mm7, %mm7 prefetchw 2 * SIZE(%esi) prefetchw 2 * SIZE(%esi, %ebp) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, 
%eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $4, %eax je .L75 ALIGN_4 .L72: pfmul ( 0 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm4 movq ( 2 + AOFFSET) * SIZE(AA), %mm0 PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm5 movq ( 4 + AOFFSET) * SIZE(AA), %mm0 pfmul ( 4 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm6 movq ( 6 + AOFFSET) * SIZE(AA), %mm0 pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 8 + AOFFSET) * SIZE(AA), %mm0 pfmul ( 8 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm4 movq ( 10 + AOFFSET) * SIZE(AA), %mm0 pfmul ( 10 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm5 movq ( 12 + AOFFSET) * SIZE(AA), %mm0 pfmul ( 12 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm6 movq ( 14 + AOFFSET) * SIZE(AA), %mm0 pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 32 + AOFFSET) * SIZE(AA), %mm0 pfmul ( 16 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm4 movq ( 18 + AOFFSET) * SIZE(AA), %mm1 pfmul ( 18 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm5 movq ( 20 + AOFFSET) * SIZE(AA), %mm1 pfmul ( 20 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm6 movq ( 22 + AOFFSET) * SIZE(AA), %mm1 pfmul ( 22 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 24 + AOFFSET) * SIZE(AA), %mm1 pfmul ( 24 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm4 movq ( 26 + AOFFSET) * SIZE(AA), %mm1 pfmul ( 26 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm5 movq ( 28 + AOFFSET) * SIZE(AA), %mm1 pfmul ( 28 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm6 movq ( 30 + AOFFSET) * SIZE(AA), %mm1 pfmul ( 30 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 48 + AOFFSET) * SIZE(AA), %mm1 subl $-32 * SIZE, AA addl $ 32 * SIZE, BB decl %eax jne .L72 ALIGN_3 .L75: movq ALPHA, %mm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $15, %eax BRANCH je .L78 ALIGN_3 .L76: pfmul ( 0 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm4 movq ( 2 + AOFFSET) * SIZE(AA), %mm0 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L76 ALIGN_3 .L78: pfadd %mm5, %mm4 pfadd %mm7, %mm6 pfadd %mm6, %mm4 pfmul %mm3, %mm4 #ifndef TRMMKERNEL pfadd 0 * SIZE(%esi), %mm4 #endif movq %mm4, 0 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L71 ALIGN_4 .L90: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L999 ALIGN_4 .L91: leal - BOFFSET * SIZE + BUFFER, BB #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif movq ( 0 + AOFFSET) * SIZE(AA), %mm0 pxor %mm4, %mm4 movq ( 8 + AOFFSET) * SIZE(AA), %mm1 pxor %mm5, %mm5 PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 pxor %mm6, %mm6 pxor %mm7, %mm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $4, %eax je .L95 ALIGN_4 .L92: PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) pfmul ( 0 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm4 movd ( 1 + AOFFSET) * SIZE(AA), %mm0 pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm5 movd ( 
2 + AOFFSET) * SIZE(AA), %mm0 pfmul ( 4 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm6 movd ( 3 + AOFFSET) * SIZE(AA), %mm0 pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movd ( 4 + AOFFSET) * SIZE(AA), %mm0 pfmul ( 8 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm4 movd ( 5 + AOFFSET) * SIZE(AA), %mm0 pfmul ( 10 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm5 movd ( 6 + AOFFSET) * SIZE(AA), %mm0 pfmul ( 12 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm6 movd ( 7 + AOFFSET) * SIZE(AA), %mm0 pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movd ( 16 + AOFFSET) * SIZE(AA), %mm0 pfmul ( 16 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm4 movd ( 9 + AOFFSET) * SIZE(AA), %mm1 pfmul ( 18 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm5 movd ( 10 + AOFFSET) * SIZE(AA), %mm1 pfmul ( 20 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm6 movd ( 11 + AOFFSET) * SIZE(AA), %mm1 pfmul ( 22 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movd ( 12 + AOFFSET) * SIZE(AA), %mm1 pfmul ( 24 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm4 movd ( 13 + AOFFSET) * SIZE(AA), %mm1 pfmul ( 26 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm5 movd ( 14 + AOFFSET) * SIZE(AA), %mm1 pfmul ( 28 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm6 movd ( 15 + AOFFSET) * SIZE(AA), %mm1 pfmul ( 30 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movd ( 24 + AOFFSET) * SIZE(AA), %mm1 subl $-16 * SIZE, AA addl $ 32 * SIZE, BB decl %eax jne .L92 ALIGN_3 .L95: movd ALPHA, %mm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $15, %eax BRANCH je .L98 ALIGN_3 .L96: pfmul ( 0 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm4 movd ( 1 + AOFFSET) * SIZE(AA), %mm0 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L96 ALIGN_3 .L98: #ifndef TRMMKERNEL movd 0 * SIZE(%esi), %mm0 #endif pfadd %mm5, %mm4 pfadd %mm7, %mm6 pfadd %mm6, %mm4 pfmul %mm3, %mm4 pfmul %mm3, %mm5 #ifndef TRMMKERNEL pfadd %mm0, %mm4 #endif movd %mm4, 0 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L999: EMMS movl OLD_STACK, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_kernel_2x4_barcelona.S000066400000000000000000000616431313527062700221770ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define OLD_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define OLD_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #define B %edi #define LDC %ebp #define AO %edx #define BO %ecx #define CO %esi #define I %ebx #define movsd movlps #define movapd movups #define movlpd movlps #define movhpd movhps #define PREFETCH prefetch #define PREFETCHSIZE (8 * 7 + 0) #define KERNEL1(address) \ mulpd %xmm0, %xmm1; \ mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \ addpd %xmm1, %xmm4; \ movapd -12 * SIZE(BO, %eax, 4), %xmm1; \ addpd %xmm0, %xmm5; \ movddup -15 * SIZE(AO, %eax, 2), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \ addpd %xmm0, %xmm7; \ movddup -14 * SIZE(AO, %eax, 2), %xmm0 #define KERNEL2(address) \ addpd %xmm2, %xmm6; \ movapd %xmm1, %xmm2; \ mulpd %xmm0, %xmm1; \ mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \ addpd %xmm1, %xmm4; \ movapd -8 * SIZE(BO, %eax, 4), %xmm1; \ addpd %xmm0, %xmm5; \ movddup -13 * SIZE(AO, %eax, 2), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \ addpd %xmm0, %xmm7; \ movddup -12 * SIZE(AO, %eax, 2), %xmm0 #define KERNEL3(address) \ addpd %xmm2, %xmm6; \ movapd %xmm1, %xmm2; \ mulpd %xmm0, %xmm1; \ mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \ addpd %xmm1, %xmm4; \ movapd -4 * SIZE(BO, %eax, 4), %xmm1; \ addpd %xmm0, %xmm5; \ movddup -11 * SIZE(AO, %eax, 2), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \ addpd %xmm0, %xmm7; \ movddup -10 * SIZE(AO, %eax, 2), %xmm0 #define KERNEL4(address) \ addpd %xmm2, %xmm6; \ movapd %xmm1, %xmm2; \ mulpd %xmm0, %xmm1; \ mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \ addpd %xmm1, %xmm4; \ movapd (BO, %eax, 4), %xmm1; \ addpd %xmm0, %xmm5; \ movddup -9 * SIZE(AO, %eax, 2), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \ addpd %xmm0, %xmm7; \ movddup (AO, %eax, 2), %xmm0 #define KERNEL5(address) \ addpd %xmm2, %xmm6; \ movapd %xmm1, %xmm2; \ mulpd %xmm3, %xmm1; \ mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \ addpd %xmm1, %xmm4; \ movapd 4 * SIZE(BO, %eax, 4), %xmm1; \ addpd %xmm3, %xmm5; \ movddup -7 * SIZE(AO, %eax, 2), %xmm3; \ mulpd %xmm3, %xmm2; \ mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \ addpd %xmm3, %xmm7; \ movddup -6 * SIZE(AO, %eax, 2), %xmm3 #define KERNEL6(address) \ addpd %xmm2, %xmm6; \ movapd 
%xmm1, %xmm2; \ mulpd %xmm3, %xmm1; \ mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \ addpd %xmm1, %xmm4; \ movapd 8 * SIZE(BO, %eax, 4), %xmm1; \ addpd %xmm3, %xmm5; \ movddup -5 * SIZE(AO, %eax, 2), %xmm3; \ mulpd %xmm3, %xmm2; \ mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \ addpd %xmm3, %xmm7; \ movddup -4 * SIZE(AO, %eax, 2), %xmm3 #define KERNEL7(address) \ addpd %xmm2, %xmm6; \ movapd %xmm1, %xmm2; \ mulpd %xmm3, %xmm1; \ mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \ addpd %xmm1, %xmm4; \ movapd 12 * SIZE(BO, %eax, 4), %xmm1; \ addpd %xmm3, %xmm5; \ movddup -3 * SIZE(AO, %eax, 2), %xmm3; \ mulpd %xmm3, %xmm2; \ mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \ addpd %xmm3, %xmm7; \ movddup -2 * SIZE(AO, %eax, 2), %xmm3 #define KERNEL8(address) \ addpd %xmm2, %xmm6; \ movapd %xmm1, %xmm2; \ mulpd %xmm3, %xmm1; \ mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \ addpd %xmm1, %xmm4; \ movapd 16 * SIZE(BO, %eax, 4), %xmm1; \ addpd %xmm3, %xmm5; \ movddup -1 * SIZE(AO, %eax, 2), %xmm3; \ mulpd %xmm3, %xmm2; \ mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \ addpd %xmm3, %xmm7; \ movddup 8 * SIZE(AO, %eax, 2), %xmm3; \ addpd %xmm2, %xmm6; \ movapd %xmm1, %xmm2 PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl OLD_B, B movl OLD_LDC, LDC #ifdef TRMMKERNEL movl OFFSET, %eax #ifndef LEFT negl %eax #endif movl %eax, KK #endif subl $-16 * SIZE, A subl $-16 * SIZE, B leal (, LDC, SIZE), LDC movl N, %eax sarl $2, %eax movl %eax, J jle .L30 ALIGN_2 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif leal GEMM_DEFAULT_Q * GEMM_DEFAULT_UNROLL_N * SIZE(B), %eax movl %eax, BX movl C, CO # coffset = c movl A, AO # aoffset = a movl M, I sarl $1, I # i = (m >> 2) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (B, %eax, 4), BO #endif movddup -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm1 pxor %xmm4, %xmm4 movddup -8 * SIZE(AO), %xmm3 leal (LDC, LDC, 2), %eax prefetchw 1 * SIZE(CO) pxor %xmm5, %xmm5 prefetchw 3 * SIZE(CO, LDC) pxor %xmm6, %xmm6 prefetchw 1 * SIZE(CO, LDC, 2) pxor %xmm7, %xmm7 prefetchw 3 * SIZE(CO, %eax) movapd %xmm1, %xmm2 movl BX, %eax prefetch -16 * SIZE(%eax) addl $8 * SIZE, %eax movl %eax, BX #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $4, %eax #endif movl %eax, KKK #endif andl $-8, %eax leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (BO, %eax, 4), BO negl %eax NOBRANCH je .L15 ALIGN_3 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax 
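/* Note on the unrolled main loop around this point: each KERNEL1..KERNEL8
   macro consumes one k-iteration of the packed panels -- two doubles of A and
   four doubles of B -- and accumulates into %xmm4..%xmm7, which together hold
   one 2x4 block of C.  The loop counter in %eax is pre-negated and advanced
   by 8*SIZE per macro group, so each addl/je pair re-checks for loop exit
   after every eight k-iterations.  A rough C-level sketch of the effect of
   one 2x4 micro-tile (illustrative only; plain GEMM path, i.e. without
   TRMMKERNEL, with acc[][] standing for the register accumulators):

       for (k = 0; k < K; k++)
         for (j = 0; j < 4; j++)
           for (i = 0; i < 2; i++)
             acc[i][j] += a[2 * k + i] * b[4 * k + j];

       for (j = 0; j < 4; j++)
         for (i = 0; i < 2; i++)
           c[i + j * ldc] += alpha * acc[i][j];
*/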
NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax BRANCH jl .L12 ALIGN_3 .L15: movddup ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax # if (k & 1) je .L18 leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (BO, %eax, 4), BO negl %eax ALIGN_3 .L17: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BO, %eax, 4), %xmm0 addpd %xmm1, %xmm4 movapd -12 * SIZE(BO, %eax, 4), %xmm1 addpd %xmm0, %xmm5 movddup -15 * SIZE(AO, %eax, 2), %xmm0 mulpd %xmm0, %xmm2 mulpd -14 * SIZE(BO, %eax, 4), %xmm0 addpd %xmm0, %xmm7 movddup -14 * SIZE(AO, %eax, 2), %xmm0 addpd %xmm2, %xmm6 movapd %xmm1, %xmm2 addl $SIZE, %eax jl .L17 ALIGN_4 .L18: leal (CO, LDC, 2), %eax mulpd %xmm3, %xmm4 mulpd %xmm3, %xmm5 mulpd %xmm3, %xmm6 mulpd %xmm3, %xmm7 #ifndef TRMMKERNEL movsd 0 * SIZE(CO ), %xmm0 movhpd 0 * SIZE(CO, LDC), %xmm0 movsd 0 * SIZE(%eax ), %xmm1 movhpd 0 * SIZE(%eax, LDC), %xmm1 movsd 1 * SIZE(CO ), %xmm2 movhpd 1 * SIZE(CO, LDC), %xmm2 movsd 1 * SIZE(%eax ), %xmm3 movhpd 1 * SIZE(%eax, LDC), %xmm3 addpd %xmm0, %xmm4 addpd %xmm1, %xmm5 addpd %xmm2, %xmm6 addpd %xmm3, %xmm7 #endif movsd %xmm4, 0 * SIZE(CO) movsd %xmm6, 1 * SIZE(CO) movhpd %xmm4, 0 * SIZE(CO, LDC) movhpd %xmm6, 1 * SIZE(CO, LDC) movsd %xmm5, 0 * SIZE(%eax) movsd %xmm7, 1 * SIZE(%eax) movhpd %xmm5, 0 * SIZE(%eax, LDC) movhpd %xmm7, 1 * SIZE(%eax, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AO, %eax, 2), AO leal (BO, %eax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, CO # coffset += 2 decl I # i -- jg .L11 ALIGN_4 .L20: movl M, I testl $1, I # i = (m >> 2) jle .L29 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AO, %eax, 1), AO leal (B, %eax, 4), BO #endif movddup -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm1 movddup -8 * SIZE(AO), %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: mulpd %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd -14 * SIZE(BO), %xmm0 addpd %xmm1, %xmm4 movapd -12 * SIZE(BO), %xmm1 addpd %xmm0, %xmm5 movddup -15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd -10 * SIZE(BO), %xmm0 addpd %xmm1, %xmm6 movapd -8 * SIZE(BO), %xmm1 addpd %xmm0, %xmm7 movddup -14 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd -6 * SIZE(BO), %xmm0 addpd %xmm1, %xmm4 movapd -4 * SIZE(BO), %xmm1 addpd %xmm0, %xmm5 movddup -13 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd -2 * SIZE(BO), %xmm0 addpd %xmm1, %xmm6 movapd (BO), %xmm1 addpd %xmm0, %xmm7 movddup -12 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd 2 * SIZE(BO), %xmm0 
addpd %xmm1, %xmm4 movapd 4 * SIZE(BO), %xmm1 addpd %xmm0, %xmm5 movddup -11 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd 6 * SIZE(BO), %xmm0 addpd %xmm1, %xmm6 movapd 8 * SIZE(BO), %xmm1 addpd %xmm0, %xmm7 movddup -10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd 10 * SIZE(BO), %xmm0 addpd %xmm1, %xmm4 movapd 12 * SIZE(BO), %xmm1 addpd %xmm0, %xmm5 movddup -9 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd 14 * SIZE(BO), %xmm0 addpd %xmm1, %xmm6 movapd 16 * SIZE(BO), %xmm1 addpd %xmm0, %xmm7 movddup -8 * SIZE(AO), %xmm0 subl $ -8 * SIZE, AO subl $-32 * SIZE, BO decl %eax jne .L22 ALIGN_4 .L25: movddup ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L28 .L26: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BO), %xmm0 addpd %xmm1, %xmm4 movapd -12 * SIZE(BO), %xmm1 addpd %xmm0, %xmm5 movddup -15 * SIZE(AO), %xmm0 addl $1 * SIZE, AO addl $4 * SIZE, BO decl %eax jg .L26 ALIGN_4 .L28: leal (CO, LDC, 2), %eax addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm3, %xmm5 #ifndef TRMMKERNEL movsd 0 * SIZE(CO ), %xmm0 movhpd 0 * SIZE(CO, LDC), %xmm0 movsd 0 * SIZE(%eax ), %xmm1 movhpd 0 * SIZE(%eax, LDC), %xmm1 addpd %xmm0, %xmm4 addpd %xmm1, %xmm5 #endif movsd %xmm4, 0 * SIZE(CO ) movhpd %xmm4, 0 * SIZE(CO, LDC) movsd %xmm5, 0 * SIZE(%eax ) movhpd %xmm5, 0 * SIZE(%eax, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AO, %eax, 1), AO leal (BO, %eax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif movl BO, B leal (, LDC, 4), %eax addl %eax, C # c += 4 * ldc decl J # j -- jg .L01 ALIGN_4 .L30: testl $2, N je .L60 ALIGN_2 .L31: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, CO # coffset = c movl A, AO # aoffset = a movl M, I sarl $1, I # i = (m >> 2) jle .L50 ALIGN_4 .L41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (B, %eax, 2), BO #endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm4, %xmm4 prefetchw 1 * SIZE(CO) pxor %xmm5, %xmm5 prefetchw 1 * SIZE(CO, LDC) pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L45 ALIGN_4 .L42: prefetcht0 (PREFETCHSIZE + 0) * SIZE(AO) mulpd -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(AO), %xmm1 addpd %xmm0, %xmm4 mulpd -16 * SIZE(BO), %xmm1 movddup -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm5 mulpd -14 * SIZE(BO), %xmm0 movddup -13 * SIZE(AO), %xmm1 addpd %xmm0, %xmm6 mulpd -14 * SIZE(BO), %xmm1 movddup -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm7 mulpd -12 * SIZE(BO), %xmm0 movddup -11 * SIZE(AO), %xmm1 addpd %xmm0, %xmm4 mulpd -12 * SIZE(BO), %xmm1 movddup -10 * SIZE(AO), %xmm0 addpd %xmm1, %xmm5 mulpd -10 * SIZE(BO), %xmm0 movddup -9 * SIZE(AO), %xmm1 addpd %xmm0, %xmm6 mulpd -10 * SIZE(BO), %xmm1 movddup -8 * SIZE(AO), %xmm0 addpd %xmm1, %xmm7 prefetcht0 (PREFETCHSIZE + 8) * SIZE(AO) mulpd -8 * SIZE(BO), %xmm0 movddup -7 * SIZE(AO), %xmm1 addpd %xmm0, %xmm4 mulpd -8 * SIZE(BO), %xmm1 movddup -6 * SIZE(AO), 
%xmm0 addpd %xmm1, %xmm5 mulpd -6 * SIZE(BO), %xmm0 movddup -5 * SIZE(AO), %xmm1 addpd %xmm0, %xmm6 mulpd -6 * SIZE(BO), %xmm1 movddup -4 * SIZE(AO), %xmm0 addpd %xmm1, %xmm7 mulpd -4 * SIZE(BO), %xmm0 movddup -3 * SIZE(AO), %xmm1 addpd %xmm0, %xmm4 mulpd -4 * SIZE(BO), %xmm1 movddup -2 * SIZE(AO), %xmm0 addpd %xmm1, %xmm5 mulpd -2 * SIZE(BO), %xmm0 movddup -1 * SIZE(AO), %xmm1 addpd %xmm0, %xmm6 mulpd -2 * SIZE(BO), %xmm1 movddup 0 * SIZE(AO), %xmm0 addpd %xmm1, %xmm7 subl $-16 * SIZE, AO subl $-16 * SIZE, BO decl %eax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movddup ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulpd -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(AO), %xmm1 addpd %xmm0, %xmm4 mulpd -16 * SIZE(BO), %xmm1 movddup -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm5 addl $2 * SIZE, AO addl $2 * SIZE, BO decl %eax jg .L46 ALIGN_4 .L48: #ifndef TRMMKERNEL movsd 0 * SIZE(CO), %xmm0 movhpd 0 * SIZE(CO, LDC), %xmm0 movsd 1 * SIZE(CO), %xmm1 movhpd 1 * SIZE(CO, LDC), %xmm1 #endif addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm3, %xmm5 #ifndef TRMMKERNEL addpd %xmm0, %xmm4 addpd %xmm1, %xmm5 #endif movlpd %xmm4, 0 * SIZE(CO) movlpd %xmm5, 1 * SIZE(CO) movhpd %xmm4, 0 * SIZE(CO, LDC) movhpd %xmm5, 1 * SIZE(CO, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AO, %eax, 2), AO leal (BO, %eax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, CO # coffset += 2 decl I # i -- jg .L41 ALIGN_4 .L50: movl M, I testl $1, I # i = (m >> 2) jle .L59 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AO, %eax, 1), AO leal (B, %eax, 2), BO #endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L55 ALIGN_4 .L52: mulpd -16 * SIZE(BO), %xmm0 addpd %xmm0, %xmm4 movddup -15 * SIZE(AO), %xmm0 mulpd -14 * SIZE(BO), %xmm0 addpd %xmm0, %xmm4 movddup -14 * SIZE(AO), %xmm0 mulpd -12 * SIZE(BO), %xmm0 addpd %xmm0, %xmm4 movddup -13 * SIZE(AO), %xmm0 mulpd -10 * SIZE(BO), %xmm0 addpd %xmm0, %xmm4 movddup -12 * SIZE(AO), %xmm0 mulpd -8 * SIZE(BO), %xmm0 addpd %xmm0, %xmm4 movddup -11 * SIZE(AO), %xmm0 mulpd -6 * SIZE(BO), %xmm0 addpd %xmm0, %xmm4 movddup -10 * SIZE(AO), %xmm0 mulpd -4 * SIZE(BO), %xmm0 addpd %xmm0, %xmm4 movddup -9 * SIZE(AO), %xmm0 mulpd -2 * SIZE(BO), %xmm0 addpd %xmm0, %xmm4 movddup -8 * SIZE(AO), %xmm0 subl $ -8 * SIZE, AO subl $-16 * SIZE, BO decl %eax jne .L52 ALIGN_4 .L55: movddup ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L58 .L56: mulpd -16 * SIZE(BO), %xmm0 addpd %xmm0, %xmm4 movddup -15 * SIZE(AO), %xmm0 subl $-1 * SIZE, AO subl $-2 * SIZE, BO decl %eax jg .L56 ALIGN_4 .L58: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 addpd %xmm5, %xmm4 mulpd %xmm3, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(CO), %xmm0 movhpd 0 * SIZE(CO, LDC), %xmm0 addpd %xmm0, %xmm4 #endif movlpd %xmm4, 0 * SIZE(CO) movhpd %xmm4, 0 
* SIZE(CO, LDC, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AO, %eax, 1), AO leal (BO, %eax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L59: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif movl BO, B leal (, LDC, 2), %eax addl %eax, C # c += 4 * ldc ALIGN_4 .L60: testl $1, N je .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, CO # coffset = c movl A, AO # aoffset = a movl M, I sarl $1, I # i = (m >> 2) jle .L80 ALIGN_4 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (B, %eax, 1), BO #endif movddup -16 * SIZE(BO), %xmm0 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 prefetchw 1 * SIZE(CO) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd -16 * SIZE(AO), %xmm0 addpd %xmm0, %xmm4 movddup -15 * SIZE(BO), %xmm0 mulpd -14 * SIZE(AO), %xmm0 addpd %xmm0, %xmm4 movddup -14 * SIZE(BO), %xmm0 mulpd -12 * SIZE(AO), %xmm0 addpd %xmm0, %xmm4 movddup -13 * SIZE(BO), %xmm0 mulpd -10 * SIZE(AO), %xmm0 addpd %xmm0, %xmm4 movddup -12 * SIZE(BO), %xmm0 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd -8 * SIZE(AO), %xmm0 addpd %xmm0, %xmm4 movddup -11 * SIZE(BO), %xmm0 mulpd -6 * SIZE(AO), %xmm0 addpd %xmm0, %xmm4 movddup -10 * SIZE(BO), %xmm0 mulpd -4 * SIZE(AO), %xmm0 addpd %xmm0, %xmm4 movddup -9 * SIZE(BO), %xmm0 mulpd -2 * SIZE(AO), %xmm0 addpd %xmm0, %xmm4 movddup -8 * SIZE(BO), %xmm0 subl $-16 * SIZE, AO subl $ -8 * SIZE, BO decl %eax jne .L72 ALIGN_4 .L75: movddup ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: mulpd -16 * SIZE(AO), %xmm0 addpd %xmm0, %xmm4 movddup -15 * SIZE(BO), %xmm0 addl $2 * SIZE, AO addl $1 * SIZE, BO decl %eax jg .L76 ALIGN_4 .L78: mulpd %xmm3, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(CO), %xmm0 movhpd 1 * SIZE(CO), %xmm0 addpd %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(CO) movhpd %xmm4, 1 * SIZE(CO) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AO, %eax, 2), AO leal (BO, %eax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, CO # coffset += 2 decl I # i -- jg .L71 ALIGN_4 .L80: movl M, I testl $1, I # i = (m >> 2) jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AO, %eax, 1), AO leal (B, %eax, 1), BO #endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, 
%eax #endif movl %eax, KKK #endif sarl $3, %eax je .L85 ALIGN_4 .L82: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd -16 * SIZE(BO), %xmm0 addpd %xmm0, %xmm4 movapd -14 * SIZE(AO), %xmm0 mulpd -14 * SIZE(BO), %xmm0 addpd %xmm0, %xmm5 movapd -12 * SIZE(AO), %xmm0 mulpd -12 * SIZE(BO), %xmm0 addpd %xmm0, %xmm6 movapd -10 * SIZE(AO), %xmm0 mulpd -10 * SIZE(BO), %xmm0 addpd %xmm0, %xmm7 movapd -8 * SIZE(AO), %xmm0 subl $-8 * SIZE, AO subl $-8 * SIZE, BO decl %eax jne .L82 ALIGN_4 .L85: movddup ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L88 .L86: mulsd -16 * SIZE(BO), %xmm0 addsd %xmm0, %xmm4 movsd -15 * SIZE(AO), %xmm0 addl $1 * SIZE, AO addl $1 * SIZE, BO decl %eax jg .L86 ALIGN_4 .L88: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 addpd %xmm6, %xmm4 haddpd %xmm4, %xmm4 mulsd %xmm3, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(CO), %xmm0 addsd %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(CO) ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_kernel_2x4_core2.S000066400000000000000000000600461313527062700212570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define ARG_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define ARG_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #define PREFETCH_R (8 * 4) #define PREFETCHSIZE (8 * 21 + 4) #define PREFETCH prefetcht0 #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define C1 %esi #define I %ebx PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC #ifdef TRMMKERNEL movl OFFSET, %eax #ifndef LEFT negl %eax #endif movl %eax, KK #endif subl $-16 * SIZE, A subl $-16 * SIZE, B leal (, LDC, SIZE), LDC movl N, %eax sarl $2, %eax movl %eax, J jle .L30 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl B, BX movl C, C1 movl A, AA movl M, I sarl $1, I jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif movl BX, %eax prefetcht2 (PREFETCH_R + 0) * SIZE(%eax) prefetcht2 (PREFETCH_R + 8) * SIZE(%eax) subl $-8 * SIZE, BX leal (C1, LDC, 2), %eax movaps -16 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -16 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 prefetcht0 1 * SIZE(C1) pxor %xmm5, %xmm5 prefetcht0 1 * SIZE(C1, LDC) pxor %xmm6, %xmm6 prefetcht0 1 * SIZE(%eax) pxor %xmm7, %xmm7 prefetcht0 1 * SIZE(%eax, LDC) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addpd %xmm2, %xmm6 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movaps -14 * SIZE(BB), %xmm1 addpd %xmm3, %xmm7 movapd %xmm1, %xmm3 mulpd %xmm0, %xmm1 // SHUFPD_1 %xmm0, %xmm0 pshufd $0x4e, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movaps -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movaps -10 * SIZE(BB), %xmm1 addpd %xmm3, %xmm7 movapd %xmm1, %xmm3 mulpd %xmm0, %xmm1 // SHUFPD_1 %xmm0, %xmm0 pshufd $0x4e, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movaps -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -8 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movaps -6 * SIZE(BB), %xmm1 addpd %xmm3, %xmm7 movapd %xmm1, %xmm3 mulpd %xmm0, %xmm1 // SHUFPD_1 %xmm0, %xmm0 pshufd $0x4e, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movaps -10 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movaps -2 * SIZE(BB), %xmm1 addpd %xmm3, %xmm7 movapd %xmm1, %xmm3 mulpd %xmm0, %xmm1 // SHUFPD_1 %xmm0, %xmm0 pshufd $0x4e, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movaps -8 * SIZE(AA), %xmm0 
addpd %xmm1, %xmm5 movaps 0 * SIZE(BB), %xmm1 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) addpd %xmm2, %xmm6 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movaps 2 * SIZE(BB), %xmm1 addpd %xmm3, %xmm7 movapd %xmm1, %xmm3 mulpd %xmm0, %xmm1 // SHUFPD_1 %xmm0, %xmm0 pshufd $0x4e, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movaps -6 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps 4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movaps 6 * SIZE(BB), %xmm1 addpd %xmm3, %xmm7 movapd %xmm1, %xmm3 mulpd %xmm0, %xmm1 // SHUFPD_1 %xmm0, %xmm0 pshufd $0x4e, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movaps -4 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps 8 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movaps 10 * SIZE(BB), %xmm1 addpd %xmm3, %xmm7 movapd %xmm1, %xmm3 mulpd %xmm0, %xmm1 // SHUFPD_1 %xmm0, %xmm0 pshufd $0x4e, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movaps -2 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps 12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movaps 14 * SIZE(BB), %xmm1 addpd %xmm3, %xmm7 movapd %xmm1, %xmm3 mulpd %xmm0, %xmm1 // SHUFPD_1 %xmm0, %xmm0 pshufd $0x4e, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movaps 0 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps 16 * SIZE(BB), %xmm1 subl $-32 * SIZE, BB subl $-16 * SIZE, AA subl $1, %eax BRANCH jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L18 ALIGN_4 .L16: addpd %xmm2, %xmm6 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movaps -14 * SIZE(BB), %xmm1 addpd %xmm3, %xmm7 movapd %xmm1, %xmm3 mulpd %xmm0, %xmm1 SHUFPD_1 %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movaps -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -12 * SIZE(BB), %xmm1 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: addpd %xmm2, %xmm6 addpd %xmm3, %xmm7 movddup ALPHA, %xmm3 movaps %xmm4, %xmm0 unpcklpd %xmm6, %xmm4 unpckhpd %xmm0, %xmm6 movaps %xmm5, %xmm1 unpcklpd %xmm7, %xmm5 unpckhpd %xmm1, %xmm7 mulpd %xmm3, %xmm4 mulpd %xmm3, %xmm5 mulpd %xmm3, %xmm6 mulpd %xmm3, %xmm7 leal (C1, LDC, 2), %eax #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhpd 1 * SIZE(C1), %xmm0 movsd 0 * SIZE(C1, LDC), %xmm1 movhpd 1 * SIZE(C1, LDC), %xmm1 movsd 0 * SIZE(%eax), %xmm2 movhpd 1 * SIZE(%eax), %xmm2 movsd 0 * SIZE(%eax, LDC), %xmm3 movhpd 1 * SIZE(%eax, LDC), %xmm3 addpd %xmm0, %xmm4 addpd %xmm1, %xmm6 addpd %xmm2, %xmm5 addpd %xmm3, %xmm7 #endif movsd %xmm4, 0 * SIZE(C1) movhpd %xmm4, 1 * SIZE(C1) movsd %xmm6, 0 * SIZE(C1, LDC) movhpd %xmm6, 1 * SIZE(C1, LDC) movsd %xmm5, 0 * SIZE(%eax) movhpd %xmm5, 1 * SIZE(%eax) movsd %xmm7, 0 * SIZE(%eax, LDC) movhpd %xmm7, 1 * SIZE(%eax, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, C1 decl I jg .L11 ALIGN_4 .L20: movl M, I testl $1, I jle .L29 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax addl %eax, AA leal (BB, %eax, 4), BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm2 pxor %xmm5, %xmm5 
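/* This .L20/.L22 path is the odd-M remainder inside the current four-column
   block: a single row of A is multiplied against the same packed 4-wide B
   panel.  Each k-iteration broadcasts one A value (pshufd $0x44 for the low
   half, $0xee for the high half) and accumulates a 1x4 strip of C in
   %xmm4..%xmm7, which the epilogue combines, scales by alpha and stores with
   stride LDC.  Roughly, in C terms (illustrative only, non-TRMMKERNEL path):

       for (k = 0; k < K; k++)
         for (j = 0; j < 4; j++)
           acc[j] += a[k] * b[4 * k + j];

       for (j = 0; j < 4; j++)
         c[j * ldc] += alpha * acc[j];
*/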
movaps -14 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps -12 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps -10 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -14 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps -8 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps -6 * SIZE(BB), %xmm3 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps -4 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps -2 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -12 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps 0 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps 2 * SIZE(BB), %xmm3 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps 6 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -10 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps 8 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps 10 * SIZE(BB), %xmm3 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps 12 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps 14 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -8 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps 18 * SIZE(BB), %xmm3 subl $ -8 * SIZE, AA subl $-32 * SIZE, BB subl $1, %eax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L28 ALIGN_4 .L26: pshufd $0x44, %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps -12 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps -10 * SIZE(BB), %xmm3 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: movddup ALPHA, %xmm3 addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 leal (C1, LDC, 2), %eax #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhpd 0 * SIZE(C1, LDC), %xmm0 movsd 0 * SIZE(%eax), %xmm1 movhpd 0 * SIZE(%eax, LDC), %xmm1 #endif mulpd %xmm3, %xmm4 mulpd %xmm3, %xmm5 #ifndef TRMMKERNEL addpd %xmm0, %xmm4 addpd %xmm1, %xmm5 #endif movsd %xmm4, 0 * SIZE(C1) movhpd %xmm4, 0 * SIZE(C1, LDC) movsd %xmm5, 0 * SIZE(%eax) movhpd %xmm5, 0 * SIZE(%eax, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax addl %eax, AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif movl BB, B leal (, LDC, 4), %eax addl %eax, C decl J jg .L01 ALIGN_4 .L30: movl N, %eax testl $2, %eax jle .L50 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, C1 movl A, AA movl M, I sarl $1, I jle .L40 ALIGN_4 .L31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB 
#endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 prefetcht0 1 * SIZE(C1) pxor %xmm6, %xmm6 prefetcht0 1 * SIZE(C1, LDC) pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -14 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps -12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -10 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps -8 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -6 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps -4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -2 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps 0 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 subl $-16 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L38 ALIGN_4 .L36: pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -14 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: movddup ALPHA, %xmm3 addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 movaps %xmm4, %xmm0 movsd %xmm5, %xmm4 mulpd %xmm3, %xmm4 movsd %xmm0, %xmm5 mulpd %xmm3, %xmm5 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhpd 1 * SIZE(C1), %xmm0 movsd 0 * SIZE(C1, LDC), %xmm1 movhpd 1 * SIZE(C1, LDC), %xmm1 addpd %xmm0, %xmm4 addpd %xmm1, %xmm5 #endif movsd %xmm4, 0 * SIZE(C1) movhpd %xmm4, 1 * SIZE(C1) movsd %xmm5, 0 * SIZE(C1, LDC) movhpd %xmm5, 1 * SIZE(C1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, C1 decl I jg .L31 ALIGN_4 .L40: movl M, I testl $1, I jle .L49 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax addl %eax, AA leal (BB, %eax, 2), BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm2 pxor %xmm5, %xmm5 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax 
subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -14 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -14 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps -12 * SIZE(BB), %xmm2 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -10 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -12 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps -8 * SIZE(BB), %xmm2 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -6 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -10 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps -4 * SIZE(BB), %xmm2 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -2 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -8 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps 0 * SIZE(BB), %xmm2 subl $ -8 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L48 ALIGN_4 .L46: pshufd $0x44, %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -14 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: movddup ALPHA, %xmm3 addpd %xmm5, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhpd 0 * SIZE(C1, LDC), %xmm0 #endif mulpd %xmm3, %xmm4 #ifndef TRMMKERNEL addpd %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(C1) movhpd %xmm4, 0 * SIZE(C1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax addl %eax, AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif movl BB, B leal (, LDC, 2), %eax addl %eax, C ALIGN_4 .L50: movl N, %eax testl $1, %eax jle .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, C1 movl A, AA movl M, I sarl $1, I jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA addl %eax, BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 prefetcht0 1 * SIZE(C1) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L55 ALIGN_4 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -14 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -12 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 addpd %xmm2, 
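/* Trip-count selection for the inner loop: with plain GEMM the loop simply
   runs over the full K.  For TRMMKERNEL builds, KKK holds the number of
   k-iterations that actually contribute for this tile -- in the cases visible
   here either K - KK, or KK plus the tile dimension (the #ifdef LEFT branch
   adds the row count, the #else branch the column count) -- and KK is the
   running diagonal offset, advanced as the tile indices move so that only the
   relevant triangular portion of the packed panels is multiplied. */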
%xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -10 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -8 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 subl $-16 * SIZE, AA subl $ -8 * SIZE, BB subl $1, %eax jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L58 ALIGN_4 .L56: pshufd $0x44, %xmm1, %xmm2 movsd -15 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 addl $2 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: movddup ALPHA, %xmm3 addpd %xmm5, %xmm4 mulpd %xmm3, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhpd 1 * SIZE(C1), %xmm0 addpd %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(C1) movhpd %xmm4, 1 * SIZE(C1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA addl %eax, BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, C1 decl I jg .L51 ALIGN_4 .L60: movl M, I testl $1, I jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax addl %eax, AA addl %eax, BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm2 pxor %xmm5, %xmm5 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L65 ALIGN_4 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 movaps -14 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movaps -12 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 movaps -10 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movaps -8 * SIZE(BB), %xmm2 subl $-8 * SIZE, AA subl $-8 * SIZE, BB subl $1, %eax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L68 ALIGN_4 .L66: mulsd %xmm0, %xmm2 movsd -15 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd -15 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L66 ALIGN_4 .L68: movddup ALPHA, %xmm3 addpd %xmm5, %xmm4 haddpd %xmm4, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 #endif mulsd %xmm3, %xmm4 #ifndef TRMMKERNEL addsd %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(C1) ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_kernel_2x4_penryn.S000066400000000000000000000613311313527062700215560ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define ARG_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define ARG_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define C1 %esi #define I %ebx #ifdef NANO #define PREFETCHSIZE (8 * 3 + 4) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 #endif #ifdef NEHALEM #define PREFETCHSIZE (8 * 1 - 4) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 #endif #ifdef SANDYBRIDGE #define PREFETCHSIZE (8 * 1 - 4) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 #endif #ifndef PREFETCH #define PREFETCH prefetcht0 #endif #ifndef PREFETCHW #define PREFETCHW prefetcht0 #endif #ifndef PREFETCHB #define PREFETCHB prefetcht0 #endif #ifndef PREFETCHSIZE #define PREFETCHSIZE (8 * 13 + 4) #endif PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC #ifdef TRMMKERNEL movl OFFSET, %eax #ifndef LEFT negl %eax #endif movl %eax, KK #endif subl $-16 * SIZE, A subl $-16 * SIZE, B leal (, LDC, SIZE), LDC movl N, %eax sarl $2, %eax movl %eax, J jle .L30 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sall $BASE_SHIFT + 2, %eax leal (B, %eax), %eax movl %eax, BX movl C, C1 movl A, AA movl M, I sarl $1, I jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), 
BB #endif movl BX, %eax PREFETCHB -16 * SIZE(%eax) subl $-8 * SIZE, %eax movl %eax, BX leal (C1, LDC, 2), %eax movaps -16 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 movaps -16 * SIZE(BB), %xmm1 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 PREFETCHW 1 * SIZE(C1) xorps %xmm5, %xmm5 PREFETCHW 3 * SIZE(C1, LDC) xorps %xmm6, %xmm6 PREFETCHW 1 * SIZE(%eax) xorps %xmm7, %xmm7 PREFETCHW 3 * SIZE(%eax, LDC) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addpd %xmm3, %xmm7 movaps -14 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps -12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps -10 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps -8 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps -6 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps -4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps -2 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 0 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) movaps 2 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps 6 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 8 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps 10 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps 14 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 16 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 subl $-32 * SIZE, BB pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 subl $-16 * SIZE, AA subl $1, %eax BRANCH jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L18 ALIGN_4 .L16: addpd %xmm3, %xmm7 movaps -14 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps -12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 
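/* (k & 7) tail loop: advance A by 2 elements and B by 4 elements per leftover  */
/* k iteration. .L18 below scales the 2x4 accumulators by ALPHA and stores them */
/* to C, taking the movaps path when C1 and LDC are 16-byte aligned and the     */
/* movups path at .L18x otherwise.                                              */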
addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: addpd %xmm2, %xmm6 addpd %xmm3, %xmm7 movddup ALPHA, %xmm3 movaps %xmm4, %xmm0 movsd %xmm5, %xmm4 mulpd %xmm3, %xmm4 movsd %xmm0, %xmm5 mulpd %xmm3, %xmm5 movaps %xmm6, %xmm0 movsd %xmm7, %xmm6 mulpd %xmm3, %xmm6 movsd %xmm0, %xmm7 mulpd %xmm3, %xmm7 movl C1, %eax orl LDC, %eax testl $15, %eax NOBRANCH jne .L18x leal (C1, LDC, 2), %eax #ifndef TRMMKERNEL movaps (C1), %xmm0 movaps (C1, LDC), %xmm1 movaps (%eax), %xmm2 movaps (%eax, LDC), %xmm3 addpd %xmm0, %xmm4 addpd %xmm1, %xmm5 addpd %xmm2, %xmm6 addpd %xmm3, %xmm7 #endif movaps %xmm4, (C1) movaps %xmm5, (C1, LDC) movaps %xmm6, (%eax) movaps %xmm7, (%eax, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, C1 decl I jg .L11 jmp .L20 ALIGN_4 .L18x: leal (C1, LDC, 2), %eax #ifndef TRMMKERNEL movups (C1), %xmm0 movups (C1, LDC), %xmm1 movups (%eax), %xmm2 movups (%eax, LDC), %xmm3 addpd %xmm0, %xmm4 addpd %xmm1, %xmm5 addpd %xmm2, %xmm6 addpd %xmm3, %xmm7 #endif movups %xmm4, (C1) movups %xmm5, (C1, LDC) movups %xmm6, (%eax) movups %xmm7, (%eax, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, C1 decl I jg .L11 ALIGN_4 .L20: movl M, I testl $1, I jle .L29 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax addl %eax, AA leal (BB, %eax, 4), BB #endif movaps -16 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm2 xorps %xmm5, %xmm5 movaps -14 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps -12 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps -10 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -14 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps -8 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps -6 * SIZE(BB), %xmm3 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps -4 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps -2 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -12 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps 0 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps 2 * SIZE(BB), %xmm3 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps 6 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -10 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps 8 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps 10 * SIZE(BB), %xmm3 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, 
%xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps 12 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps 14 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -8 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps 18 * SIZE(BB), %xmm3 subl $ -8 * SIZE, AA subl $-32 * SIZE, BB subl $1, %eax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L28 ALIGN_4 .L26: pshufd $0x44, %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps -12 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps -10 * SIZE(BB), %xmm3 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: movddup ALPHA, %xmm3 addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 leal (C1, LDC, 2), %eax #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhpd 0 * SIZE(C1, LDC), %xmm0 movsd 0 * SIZE(%eax), %xmm1 movhpd 0 * SIZE(%eax, LDC), %xmm1 #endif mulpd %xmm3, %xmm4 mulpd %xmm3, %xmm5 #ifndef TRMMKERNEL addpd %xmm0, %xmm4 addpd %xmm1, %xmm5 #endif movsd %xmm4, 0 * SIZE(C1) movhpd %xmm4, 0 * SIZE(C1, LDC) movsd %xmm5, 0 * SIZE(%eax) movhpd %xmm5, 0 * SIZE(%eax, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax addl %eax, AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif movl BB, B leal (, LDC, 4), %eax addl %eax, C decl J jg .L01 ALIGN_4 .L30: movl N, %eax testl $2, %eax jle .L50 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, C1 movl A, AA movl M, I sarl $1, I jle .L40 ALIGN_4 .L31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif movaps -16 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm1 xorps %xmm5, %xmm5 PREFETCHW 1 * SIZE(C1) xorps %xmm6, %xmm6 PREFETCHW 1 * SIZE(C1, LDC) xorps %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -14 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps -12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -10 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps -8 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -6 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 
movaps -4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -2 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps 0 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 subl $-16 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L38 ALIGN_4 .L36: pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -14 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: movddup ALPHA, %xmm3 addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 movaps %xmm4, %xmm0 movsd %xmm5, %xmm4 mulpd %xmm3, %xmm4 movsd %xmm0, %xmm5 mulpd %xmm3, %xmm5 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhpd 1 * SIZE(C1), %xmm0 movsd 0 * SIZE(C1, LDC), %xmm1 movhpd 1 * SIZE(C1, LDC), %xmm1 addpd %xmm0, %xmm4 addpd %xmm1, %xmm5 #endif movsd %xmm4, 0 * SIZE(C1) movhpd %xmm4, 1 * SIZE(C1) movsd %xmm5, 0 * SIZE(C1, LDC) movhpd %xmm5, 1 * SIZE(C1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, C1 decl I jg .L31 ALIGN_4 .L40: movl M, I testl $1, I jle .L49 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax addl %eax, AA leal (BB, %eax, 2), BB #endif movaps -16 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm2 xorps %xmm5, %xmm5 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -14 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -14 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps -12 * SIZE(BB), %xmm2 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -10 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -12 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps -8 * SIZE(BB), %xmm2 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -6 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -10 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps -4 * SIZE(BB), %xmm2 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -2 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -8 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps 0 * SIZE(BB), %xmm2 subl $ -8 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L48 ALIGN_4 .L46: pshufd $0x44, %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -14 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: movddup ALPHA, %xmm3 addpd %xmm5, 
%xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhpd 0 * SIZE(C1, LDC), %xmm0 #endif mulpd %xmm3, %xmm4 #ifndef TRMMKERNEL addpd %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(C1) movhpd %xmm4, 0 * SIZE(C1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax addl %eax, AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif movl BB, B leal (, LDC, 2), %eax addl %eax, C ALIGN_4 .L50: movl N, %eax testl $1, %eax jle .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, C1 movl A, AA movl M, I sarl $1, I jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA addl %eax, BB #endif movaps -16 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm1 xorps %xmm5, %xmm5 PREFETCHW 1 * SIZE(C1) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L55 ALIGN_4 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -14 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -12 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -10 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -8 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 subl $-16 * SIZE, AA subl $ -8 * SIZE, BB subl $1, %eax jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L58 ALIGN_4 .L56: pshufd $0x44, %xmm1, %xmm2 movsd -15 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 addl $2 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: movddup ALPHA, %xmm3 addpd %xmm5, %xmm4 mulpd %xmm3, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhpd 1 * SIZE(C1), %xmm0 addpd %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(C1) movhpd %xmm4, 1 * SIZE(C1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA addl %eax, BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, C1 decl I jg .L51 ALIGN_4 .L60: movl M, I testl $1, I jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB 
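/* TRMM path for the final 1x1 block: offset the single-row A panel and the     */
/* single-column B panel by KK elements each.                                   */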
movl KK, %eax leal (, %eax, SIZE), %eax addl %eax, AA addl %eax, BB #endif movaps -16 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm2 xorps %xmm5, %xmm5 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L65 ALIGN_4 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 movaps -14 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movaps -12 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 movaps -10 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movaps -8 * SIZE(BB), %xmm2 subl $-8 * SIZE, AA subl $-8 * SIZE, BB subl $1, %eax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L68 ALIGN_4 .L66: mulsd %xmm0, %xmm2 movsd -15 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd -15 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L66 ALIGN_4 .L68: movddup ALPHA, %xmm3 addpd %xmm5, %xmm4 haddpd %xmm4, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 #endif mulsd %xmm3, %xmm4 #ifndef TRMMKERNEL addsd %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(C1) ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_kernel_2x4_sse2.S000066400000000000000000001112431313527062700211150ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define OLD_M 4 + STACK + ARGS(%esi) #define OLD_N 8 + STACK + ARGS(%esi) #define OLD_K 12 + STACK + ARGS(%esi) #define OLD_ALPHA 16 + STACK + ARGS(%esi) #define OLD_A 24 + STACK + ARGS(%esi) #define OLD_B 28 + STACK + ARGS(%esi) #define OLD_C 32 + STACK + ARGS(%esi) #define OLD_LDC 36 + STACK + ARGS(%esi) #define OLD_OFFT 40 + STACK + ARGS(%esi) #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define BX 40(%esp) #define OLD_STACK 44(%esp) #define OFFSET 48(%esp) #define KK 52(%esp) #define KKK 56(%esp) #define BUFFER 128(%esp) #if defined(OPTERON) || defined(BARCELONA) #define movsd movlpd #endif #if defined(OPTERON) || defined(BARCELONA) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif #define AA %edx #define BB %ecx #define LDC %ebp #define KERNEL1(address) \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm4; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm3, %xmm6; \ movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm0, %xmm7; \ movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm3, %xmm6; \ movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm0, %xmm7; \ movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm2, %xmm6; \ movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm1, %xmm7; \ movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 44 * SIZE + (address) 
* 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm2, %xmm6; \ movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm1, %xmm7; \ movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE, %esp andl $-1024, %esp # align stack STACK_TOUCHING movl OLD_M, %ebx movl OLD_N, %eax movl OLD_K, %ecx movl OLD_A, %edx movsd OLD_ALPHA, %xmm3 movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK #ifdef TRMMKERNEL movss OLD_OFFT, %xmm4 #endif unpcklpd %xmm3, %xmm3 movl OLD_B, %edi movl OLD_C, %ebx movapd %xmm3, ALPHA movl %ebx, C movl OLD_LDC, LDC #ifdef TRMMKERNEL movss %xmm4, OFFSET movss %xmm4, KK #ifndef LEFT negl KK #endif #endif leal (, LDC, SIZE), LDC sarl $2, %eax movl %eax, J jle .L30 ALIGN_2 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ movl K, %eax leal BUFFER, %ecx sarl $1, %eax jle .L05 ALIGN_4 .L02: #define COPYPREFETCH 40 prefetchnta (COPYPREFETCH) * SIZE(%edi) movq 0 * SIZE(%edi), %mm0 movq 1 * SIZE(%edi), %mm1 movq 2 * SIZE(%edi), %mm2 movq 3 * SIZE(%edi), %mm3 movq 4 * SIZE(%edi), %mm4 movq 5 * SIZE(%edi), %mm5 movq 6 * SIZE(%edi), %mm6 movq 7 * SIZE(%edi), %mm7 movq %mm0, 0 * SIZE(%ecx) movq %mm0, 1 * SIZE(%ecx) movq %mm1, 2 * SIZE(%ecx) movq %mm1, 3 * SIZE(%ecx) movq %mm2, 4 * SIZE(%ecx) movq %mm2, 5 * SIZE(%ecx) movq %mm3, 6 * SIZE(%ecx) movq %mm3, 7 * SIZE(%ecx) movq %mm4, 8 * SIZE(%ecx) movq %mm4, 9 * SIZE(%ecx) movq %mm5, 10 * SIZE(%ecx) movq %mm5, 11 * SIZE(%ecx) movq %mm6, 12 * SIZE(%ecx) movq %mm6, 13 * SIZE(%ecx) movq %mm7, 14 * SIZE(%ecx) movq %mm7, 15 * SIZE(%ecx) addl $ 8 * SIZE, %edi addl $16 * SIZE, %ecx decl %eax jne .L02 ALIGN_2 .L05: movl K, %eax andl $1, %eax BRANCH jle .L10 movq 0 * SIZE(%edi), %mm0 movq 1 * SIZE(%edi), %mm1 movq 2 * SIZE(%edi), %mm2 movq 3 * SIZE(%edi), %mm3 movq %mm0, 0 * SIZE(%ecx) movq %mm0, 1 * SIZE(%ecx) movq %mm1, 2 * SIZE(%ecx) movq %mm1, 3 * SIZE(%ecx) movq %mm2, 4 * SIZE(%ecx) movq %mm2, 5 * SIZE(%ecx) movq %mm3, 6 * SIZE(%ecx) movq %mm3, 7 * SIZE(%ecx) addl $4 * SIZE, %edi ALIGN_4 .L10: movl %edi, BX movl C, %esi # coffset = c movl A, AA # aoffset = a movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB 
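/* (TRMM) skip the first KK k-iterations: AA advances by 2*KK elements, BB by   */
/* 8*KK buffer entries; the copy loop above stores every B value twice, so one  */
/* k step of the 4-wide B panel occupies 8 entries in BUFFER.                   */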
#endif movl BX, %eax prefetchnta 0 * SIZE(%eax) prefetchnta 8 * SIZE(%eax) subl $-8 * SIZE, BX pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movapd 0 * SIZE(AA), %xmm0 movapd 8 * SIZE(AA), %xmm1 movapd 0 * SIZE(BB), %xmm2 movapd 8 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax prefetchw 1 * SIZE(%esi) prefetchw 1 * SIZE(%esi, LDC) prefetchw 1 * SIZE(%esi, LDC, 2) prefetchw 1 * SIZE(%esi, %eax) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $4, %eax #endif movl %eax, KKK #endif #if 1 andl $-8, %eax sall $4, %eax je .L15 .L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) cmpl $128 * 1, %eax jle .L12 KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) cmpl $128 * 2, %eax jle .L12 KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) cmpl $128 * 3, %eax jle .L12 KERNEL1(16 * 3) KERNEL2(16 * 3) KERNEL3(16 * 3) KERNEL4(16 * 3) KERNEL5(16 * 3) KERNEL6(16 * 3) KERNEL7(16 * 3) KERNEL8(16 * 3) cmpl $128 * 4, %eax jle .L12 KERNEL1(16 * 4) KERNEL2(16 * 4) KERNEL3(16 * 4) KERNEL4(16 * 4) KERNEL5(16 * 4) KERNEL6(16 * 4) KERNEL7(16 * 4) KERNEL8(16 * 4) cmpl $128 * 5, %eax jle .L12 KERNEL1(16 * 5) KERNEL2(16 * 5) KERNEL3(16 * 5) KERNEL4(16 * 5) KERNEL5(16 * 5) KERNEL6(16 * 5) KERNEL7(16 * 5) KERNEL8(16 * 5) cmpl $128 * 6, %eax jle .L12 KERNEL1(16 * 6) KERNEL2(16 * 6) KERNEL3(16 * 6) KERNEL4(16 * 6) KERNEL5(16 * 6) KERNEL6(16 * 6) KERNEL7(16 * 6) KERNEL8(16 * 6) cmpl $128 * 7, %eax jle .L12 KERNEL1(16 * 7) KERNEL2(16 * 7) KERNEL3(16 * 7) KERNEL4(16 * 7) KERNEL5(16 * 7) KERNEL6(16 * 7) KERNEL7(16 * 7) KERNEL8(16 * 7) addl $128 * 4 * SIZE, BB addl $128 * 1 * SIZE, AA subl $128 * 8, %eax jg .L1X jmp .L15 .L12: leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB ALIGN_4 #else sarl $3, %eax je .L15 ALIGN_4 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $64 * SIZE, BB addl $16 * SIZE, AA decl %eax jne .L12 ALIGN_4 #endif .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movapd ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movapd 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 8 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL mulpd %xmm3, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhpd 1 * SIZE(%esi), %xmm0 mulpd %xmm3, %xmm5 movsd 0 * SIZE(%esi, LDC, 1), %xmm1 movhpd 1 * SIZE(%esi, LDC, 1), %xmm1 mulpd %xmm3, %xmm6 movsd 0 * SIZE(%esi, LDC, 2), %xmm2 movhpd 1 * SIZE(%esi, LDC, 2), %xmm2 mulpd %xmm3, %xmm7 movsd 0 * SIZE(%esi, %eax, 1), %xmm3 movhpd 1 * SIZE(%esi, %eax, 1), %xmm3 addpd %xmm0, %xmm4 addpd %xmm1, %xmm5 addpd %xmm2, %xmm6 addpd %xmm3, %xmm7 #else mulpd %xmm3, %xmm4 mulpd %xmm3, %xmm5 mulpd %xmm3, %xmm6 mulpd %xmm3, %xmm7 #endif movsd %xmm4, 0 * SIZE(%esi) movhpd %xmm4, 1 * SIZE(%esi) movsd %xmm5, 0 * SIZE(%esi, LDC, 1) movhpd %xmm5, 1 * SIZE(%esi, LDC, 1) movsd %xmm6, 0 * SIZE(%esi, LDC, 2) movhpd 
%xmm6, 1 * SIZE(%esi, LDC, 2) movsd %xmm7, 0 * SIZE(%esi, %eax, 1) movhpd %xmm7, 1 * SIZE(%esi, %eax, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L11 ALIGN_4 .L20: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L29 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 leal (LDC, LDC, 2), %eax movsd 0 * SIZE(AA), %xmm0 movsd 4 * SIZE(AA), %xmm1 movsd 0 * SIZE(BB), %xmm2 movsd 8 * SIZE(BB), %xmm3 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 #if defined(OPTERON) || defined(BARCELONA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm5 movsd 4 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm2, %xmm6 movsd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm7 movsd 1 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm4 movsd 10 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm5 movsd 12 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm3 mulsd 14 * SIZE(BB), %xmm0 addsd %xmm3, %xmm6 movsd 24 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movsd 2 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 movsd 18 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm5 movsd 20 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 mulsd 22 * SIZE(BB), %xmm0 addsd %xmm2, %xmm6 movsd 32 * SIZE(BB), %xmm2 addsd %xmm0, %xmm7 movsd 3 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm4 movsd 26 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm5 movsd 28 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm3 mulsd 30 * SIZE(BB), %xmm0 addsd %xmm3, %xmm6 movsd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movsd 8 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 addsd %xmm2, %xmm4 movsd 34 * SIZE(BB), %xmm2 mulsd %xmm1, %xmm2 addsd %xmm2, %xmm5 movsd 36 * SIZE(BB), %xmm2 mulsd %xmm1, %xmm2 mulsd 38 * SIZE(BB), %xmm1 addsd %xmm2, %xmm6 movsd 48 * SIZE(BB), %xmm2 addsd %xmm1, %xmm7 movsd 5 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 addsd %xmm3, %xmm4 movsd 42 * SIZE(BB), %xmm3 mulsd %xmm1, %xmm3 addsd %xmm3, %xmm5 movsd 44 * SIZE(BB), %xmm3 mulsd %xmm1, %xmm3 mulsd 46 * SIZE(BB), %xmm1 addsd %xmm3, %xmm6 movsd 56 * SIZE(BB), %xmm3 addsd %xmm1, %xmm7 movsd 6 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm2 addsd %xmm2, %xmm4 movsd 50 * SIZE(BB), %xmm2 mulsd %xmm1, %xmm2 addsd %xmm2, %xmm5 movsd 52 * SIZE(BB), %xmm2 mulsd %xmm1, %xmm2 mulsd 54 * SIZE(BB), %xmm1 addsd %xmm2, %xmm6 movsd 64 * SIZE(BB), %xmm2 addsd %xmm1, %xmm7 movsd 7 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 addsd %xmm3, %xmm4 movsd 58 * SIZE(BB), %xmm3 mulsd %xmm1, %xmm3 addsd %xmm3, %xmm5 movsd 60 * SIZE(BB), %xmm3 mulsd %xmm1, %xmm3 mulsd 62 * SIZE(BB), %xmm1 addsd %xmm3, %xmm6 movsd 72 * 
SIZE(BB), %xmm3 addl $64 * SIZE, BB addsd %xmm1, %xmm7 movsd 12 * SIZE(AA), %xmm1 addl $8 * SIZE, AA decl %eax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movsd ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L28 .L26: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 movsd 2 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm5 movsd 4 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm2, %xmm6 movsd 8 * SIZE(BB), %xmm2 addsd %xmm0, %xmm7 movsd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL mulsd %xmm3, %xmm4 movsd 0 * SIZE(%esi), %xmm0 mulsd %xmm3, %xmm5 movsd 0 * SIZE(%esi, LDC, 1), %xmm1 mulsd %xmm3, %xmm6 movsd 0 * SIZE(%esi, LDC, 2), %xmm2 mulsd %xmm3, %xmm7 movsd 0 * SIZE(%esi, %eax, 1), %xmm3 addsd %xmm0, %xmm4 addsd %xmm1, %xmm5 addsd %xmm2, %xmm6 addsd %xmm3, %xmm7 #else mulsd %xmm3, %xmm4 mulsd %xmm3, %xmm5 mulsd %xmm3, %xmm6 mulsd %xmm3, %xmm7 #endif movsd %xmm4, 0 * SIZE(%esi) movsd %xmm5, 0 * SIZE(%esi, LDC, 1) movsd %xmm6, 0 * SIZE(%esi, LDC, 2) movsd %xmm7, 0 * SIZE(%esi, %eax, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leal (, LDC, 4), %eax addl %eax, C # c += 4 * ldc decl J # j -- jg .L01 ALIGN_4 .L30: testl $2, N je .L60 ALIGN_2 .L31: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ movl K, %eax leal BUFFER, %ecx sarl $2, %eax jle .L35 ALIGN_4 .L32: #ifdef PENTIUM4 #ifdef HAVE_SSE3 movddup 0 * SIZE(%edi), %xmm0 movddup 1 * SIZE(%edi), %xmm1 movddup 2 * SIZE(%edi), %xmm2 movddup 3 * SIZE(%edi), %xmm3 movddup 4 * SIZE(%edi), %xmm4 movddup 5 * SIZE(%edi), %xmm5 movddup 6 * SIZE(%edi), %xmm6 movddup 7 * SIZE(%edi), %xmm7 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) movapd %xmm2, 4 * SIZE(%ecx) movapd %xmm3, 6 * SIZE(%ecx) movapd %xmm4, 8 * SIZE(%ecx) movapd %xmm5, 10 * SIZE(%ecx) movapd %xmm6, 12 * SIZE(%ecx) movapd %xmm7, 14 * SIZE(%ecx) #else movsd 0 * SIZE(%edi), %xmm0 movsd 1 * SIZE(%edi), %xmm1 movsd 2 * SIZE(%edi), %xmm2 movsd 3 * SIZE(%edi), %xmm3 movsd 4 * SIZE(%edi), %xmm4 movsd 5 * SIZE(%edi), %xmm5 movsd 6 * SIZE(%edi), %xmm6 movsd 7 * SIZE(%edi), %xmm7 unpcklpd %xmm0, %xmm0 unpckhpd %xmm1, %xmm1 unpcklpd %xmm2, %xmm2 unpckhpd %xmm3, %xmm3 unpcklpd %xmm4, %xmm4 unpckhpd %xmm5, %xmm5 unpcklpd %xmm6, %xmm6 unpckhpd %xmm7, %xmm7 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) movapd %xmm2, 4 * SIZE(%ecx) movapd %xmm3, 6 * SIZE(%ecx) movapd %xmm4, 8 * SIZE(%ecx) movapd %xmm5, 10 * SIZE(%ecx) movapd %xmm6, 12 * SIZE(%ecx) movapd %xmm7, 14 * SIZE(%ecx) #endif prefetcht0 80 * SIZE(%edi) prefetcht1 112 * SIZE(%ecx) #endif #if defined(OPTERON) || defined(BARCELONA) #define COPYPREFETCH 40 prefetchnta (COPYPREFETCH) * SIZE(%edi) movq 0 * SIZE(%edi), %mm0 movq 1 * SIZE(%edi), %mm1 movq 2 * SIZE(%edi), %mm2 movq 3 * SIZE(%edi), %mm3 movq 4 * SIZE(%edi), %mm4 movq 5 * SIZE(%edi), %mm5 movq 6 * SIZE(%edi), %mm6 movq 7 * SIZE(%edi), %mm7 movq %mm0, 0 * SIZE(%ecx) movq %mm0, 1 * SIZE(%ecx) movq %mm1, 2 * SIZE(%ecx) movq %mm1, 3 * SIZE(%ecx) movq %mm2, 4 * SIZE(%ecx) movq %mm2, 5 * SIZE(%ecx) movq %mm3, 6 * SIZE(%ecx) movq %mm3, 7 * 
SIZE(%ecx) movq %mm4, 8 * SIZE(%ecx) movq %mm4, 9 * SIZE(%ecx) movq %mm5, 10 * SIZE(%ecx) movq %mm5, 11 * SIZE(%ecx) movq %mm6, 12 * SIZE(%ecx) movq %mm6, 13 * SIZE(%ecx) movq %mm7, 14 * SIZE(%ecx) movq %mm7, 15 * SIZE(%ecx) #endif addl $ 8 * SIZE, %edi addl $16 * SIZE, %ecx decl %eax jne .L32 ALIGN_2 .L35: movl K, %eax andl $3, %eax BRANCH jle .L40 ALIGN_2 .L36: #ifdef PENTIUM4 #ifdef HAVE_SSE3 movddup 0 * SIZE(%edi), %xmm0 movddup 1 * SIZE(%edi), %xmm1 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) #else movsd 0 * SIZE(%edi), %xmm0 movsd 1 * SIZE(%edi), %xmm1 unpcklpd %xmm0, %xmm0 unpckhpd %xmm1, %xmm1 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) #endif #endif #if defined(OPTERON) || defined(BARCELONA) movq 0 * SIZE(%edi), %mm0 movq 1 * SIZE(%edi), %mm1 movq %mm0, 0 * SIZE(%ecx) movq %mm0, 1 * SIZE(%ecx) movq %mm1, 2 * SIZE(%ecx) movq %mm1, 3 * SIZE(%ecx) #endif addl $2 * SIZE, %edi addl $4 * SIZE, %ecx decl %eax jne .L36 ALIGN_4 .L40: movl C, %esi # coffset = c movl A, AA # aoffset = a movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L50 ALIGN_4 .L41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movapd 0 * SIZE(AA), %xmm0 movapd 8 * SIZE(AA), %xmm1 movapd 0 * SIZE(BB), %xmm2 movapd 8 * SIZE(BB), %xmm3 #ifdef HAVE_3DNOW prefetchw 2 * SIZE(%esi) prefetchw 2 * SIZE(%esi, LDC) #endif #ifdef PENTIUM4 prefetchnta 4 * SIZE(%esi) prefetchnta 4 * SIZE(%esi, LDC) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L45 ALIGN_4 .L42: mulpd %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 10 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 mulpd 18 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 20 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movapd 10 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 22 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 32 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movapd 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 26 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 28 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 14 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 30 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 40 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movapd ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 
addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhpd 1 * SIZE(%esi), %xmm0 movsd 0 * SIZE(%esi, LDC, 1), %xmm1 movhpd 1 * SIZE(%esi, LDC, 1), %xmm1 #endif addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm3, %xmm5 #ifndef TRMMKERNEL addpd %xmm0, %xmm4 addpd %xmm1, %xmm5 #endif movsd %xmm4, 0 * SIZE(%esi) movhpd %xmm4, 1 * SIZE(%esi) movsd %xmm5, 0 * SIZE(%esi, LDC, 1) movhpd %xmm5, 1 * SIZE(%esi, LDC, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L41 ALIGN_4 .L50: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L59 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 leal (LDC, LDC, 2), %eax movsd 0 * SIZE(AA), %xmm0 movsd 4 * SIZE(AA), %xmm1 movsd 0 * SIZE(BB), %xmm2 movsd 8 * SIZE(BB), %xmm3 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L55 ALIGN_4 .L52: mulsd %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movsd 1 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm2 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm2, %xmm6 movsd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm7 movsd 2 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd 10 * SIZE(BB), %xmm0 addsd %xmm3, %xmm4 movsd 12 * SIZE(BB), %xmm3 addsd %xmm0, %xmm5 movsd 3 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd 14 * SIZE(BB), %xmm0 addsd %xmm3, %xmm6 movsd 24 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movsd 8 * SIZE(AA), %xmm0 mulsd %xmm1, %xmm2 mulsd 18 * SIZE(BB), %xmm1 addsd %xmm2, %xmm4 movsd 20 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 movsd 5 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm2 mulsd 22 * SIZE(BB), %xmm1 addsd %xmm2, %xmm6 movsd 32 * SIZE(BB), %xmm2 addsd %xmm1, %xmm7 movsd 6 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 mulsd 26 * SIZE(BB), %xmm1 addsd %xmm3, %xmm4 movsd 28 * SIZE(BB), %xmm3 addsd %xmm1, %xmm5 movsd 7 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 mulsd 30 * SIZE(BB), %xmm1 addsd %xmm3, %xmm6 movsd 40 * SIZE(BB), %xmm3 addsd %xmm1, %xmm7 movsd 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movsd ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L58 .L56: mulsd %xmm0, %xmm2 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movsd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: addsd %xmm6, %xmm4 addsd %xmm7, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm3, %xmm5 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movsd 0 * SIZE(%esi, LDC, 1), %xmm1 addsd %xmm0, %xmm4 addsd 
%xmm1, %xmm5 #endif movsd %xmm4, 0 * SIZE(%esi) movsd %xmm5, 0 * SIZE(%esi, LDC, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L59: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax addl %eax, C # c += 4 * ldc ALIGN_4 .L60: testl $1, N je .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax leal BUFFER, %ecx sarl $3, %eax jle .L65 ALIGN_4 .L62: #ifdef PENTIUM4 #ifdef HAVE_SSE3 movddup 0 * SIZE(%edi), %xmm0 movddup 1 * SIZE(%edi), %xmm1 movddup 2 * SIZE(%edi), %xmm2 movddup 3 * SIZE(%edi), %xmm3 movddup 4 * SIZE(%edi), %xmm4 movddup 5 * SIZE(%edi), %xmm5 movddup 6 * SIZE(%edi), %xmm6 movddup 7 * SIZE(%edi), %xmm7 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) movapd %xmm2, 4 * SIZE(%ecx) movapd %xmm3, 6 * SIZE(%ecx) movapd %xmm4, 8 * SIZE(%ecx) movapd %xmm5, 10 * SIZE(%ecx) movapd %xmm6, 12 * SIZE(%ecx) movapd %xmm7, 14 * SIZE(%ecx) #else movsd 0 * SIZE(%edi), %xmm0 movsd 1 * SIZE(%edi), %xmm1 movsd 2 * SIZE(%edi), %xmm2 movsd 3 * SIZE(%edi), %xmm3 movsd 4 * SIZE(%edi), %xmm4 movsd 5 * SIZE(%edi), %xmm5 movsd 6 * SIZE(%edi), %xmm6 movsd 7 * SIZE(%edi), %xmm7 unpcklpd %xmm0, %xmm0 unpckhpd %xmm1, %xmm1 unpcklpd %xmm2, %xmm2 unpckhpd %xmm3, %xmm3 unpcklpd %xmm4, %xmm4 unpckhpd %xmm5, %xmm5 unpcklpd %xmm6, %xmm6 unpckhpd %xmm7, %xmm7 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) movapd %xmm2, 4 * SIZE(%ecx) movapd %xmm3, 6 * SIZE(%ecx) movapd %xmm4, 8 * SIZE(%ecx) movapd %xmm5, 10 * SIZE(%ecx) movapd %xmm6, 12 * SIZE(%ecx) movapd %xmm7, 14 * SIZE(%ecx) #endif prefetcht1 80 * SIZE(%edi) prefetcht0 112 * SIZE(%ecx) #endif #if defined(OPTERON) || defined(BARCELONA) #define COPYPREFETCH 40 prefetchnta (COPYPREFETCH) * SIZE(%edi) movq 0 * SIZE(%edi), %mm0 movq 1 * SIZE(%edi), %mm1 movq 2 * SIZE(%edi), %mm2 movq 3 * SIZE(%edi), %mm3 movq 4 * SIZE(%edi), %mm4 movq 5 * SIZE(%edi), %mm5 movq 6 * SIZE(%edi), %mm6 movq 7 * SIZE(%edi), %mm7 movq %mm0, 0 * SIZE(%ecx) movq %mm0, 1 * SIZE(%ecx) movq %mm1, 2 * SIZE(%ecx) movq %mm1, 3 * SIZE(%ecx) movq %mm2, 4 * SIZE(%ecx) movq %mm2, 5 * SIZE(%ecx) movq %mm3, 6 * SIZE(%ecx) movq %mm3, 7 * SIZE(%ecx) movq %mm4, 8 * SIZE(%ecx) movq %mm4, 9 * SIZE(%ecx) movq %mm5, 10 * SIZE(%ecx) movq %mm5, 11 * SIZE(%ecx) movq %mm6, 12 * SIZE(%ecx) movq %mm6, 13 * SIZE(%ecx) movq %mm7, 14 * SIZE(%ecx) movq %mm7, 15 * SIZE(%ecx) #endif addl $ 8 * SIZE, %edi addl $16 * SIZE, %ecx decl %eax jne .L62 ALIGN_2 .L65: movl K, %eax andl $7, %eax BRANCH jle .L70 ALIGN_2 .L66: #ifdef PENTIUM4 #ifdef HAVE_SSE3 movddup 0 * SIZE(%edi), %xmm0 movapd %xmm0, 0 * SIZE(%ecx) #else movsd 0 * SIZE(%edi), %xmm0 unpcklpd %xmm0, %xmm0 movapd %xmm0, 0 * SIZE(%ecx) #endif #endif #if defined(OPTERON) || defined(BARCELONA) movq 0 * SIZE(%edi), %mm0 movq %mm0, 0 * SIZE(%ecx) movq %mm0, 1 * SIZE(%ecx) #endif addl $1 * SIZE, %edi addl $2 * SIZE, %ecx decl %eax jne .L66 ALIGN_4 .L70: movl C, %esi # coffset = c movl A, AA # aoffset = a movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L80 ALIGN_4 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal 
(BB, %eax, 2), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movapd 0 * SIZE(AA), %xmm0 movapd 8 * SIZE(AA), %xmm1 movapd 0 * SIZE(BB), %xmm2 movapd 8 * SIZE(BB), %xmm3 #ifdef HAVE_3DNOW prefetchw 2 * SIZE(%esi) #endif #ifdef PENTIUM4 prefetchnta 2 * SIZE(%esi) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L75 ALIGN_4 .L72: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movapd 16 * SIZE(BB), %xmm2 movapd 2 * SIZE(AA), %xmm0 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm0, %xmm4 movapd 4 * SIZE(AA), %xmm0 mulpd 4 * SIZE(BB), %xmm0 addpd %xmm0, %xmm4 movapd 6 * SIZE(AA), %xmm0 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm0, %xmm4 movapd 16 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movapd 24 * SIZE(BB), %xmm3 movapd 10 * SIZE(AA), %xmm1 mulpd 10 * SIZE(BB), %xmm1 addpd %xmm1, %xmm4 movapd 12 * SIZE(AA), %xmm1 mulpd 12 * SIZE(BB), %xmm1 addpd %xmm1, %xmm4 movapd 14 * SIZE(AA), %xmm1 mulpd 14 * SIZE(BB), %xmm1 addpd %xmm1, %xmm4 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movapd ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd 2 * SIZE(AA), %xmm0 movapd 2 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: mulpd %xmm3, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhpd 1 * SIZE(%esi), %xmm0 addpd %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(%esi) movhpd %xmm4, 1 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L71 ALIGN_4 .L80: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 leal (LDC, LDC, 2), %eax movsd 0 * SIZE(AA), %xmm0 movsd 4 * SIZE(AA), %xmm1 movsd 0 * SIZE(BB), %xmm2 movsd 8 * SIZE(BB), %xmm3 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L85 ALIGN_4 .L82: mulsd %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 1 * SIZE(AA), %xmm0 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movsd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movsd 2 * SIZE(AA), %xmm0 mulsd 4 * SIZE(BB), %xmm0 addsd %xmm0, %xmm6 movsd 3 * SIZE(AA), %xmm0 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm0, %xmm7 movsd 8 * SIZE(AA), %xmm0 mulsd %xmm1, %xmm3 
movsd 5 * SIZE(AA), %xmm1 mulsd 10 * SIZE(BB), %xmm1 addsd %xmm3, %xmm4 movsd 24 * SIZE(BB), %xmm3 addsd %xmm1, %xmm5 movsd 6 * SIZE(AA), %xmm1 mulsd 12 * SIZE(BB), %xmm1 addsd %xmm1, %xmm6 movsd 7 * SIZE(AA), %xmm1 mulsd 14 * SIZE(BB), %xmm1 addsd %xmm1, %xmm7 movsd 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L82 ALIGN_4 .L85: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movsd ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L88 .L86: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 movsd 2 * SIZE(BB), %xmm2 movsd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L86 ALIGN_4 .L88: addsd %xmm5, %xmm4 addsd %xmm7, %xmm6 addsd %xmm6, %xmm4 mulsd %xmm3, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 addsd %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(%esi) ALIGN_4 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_kernel_2x4_sse3.S000066400000000000000000001041471313527062700211230ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define ARG_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define ARG_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #ifdef PENTIUM4 #define PREFETCH_R (8 * 4) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #ifdef PENTIUMM #define PREFETCH_R (8 * 4) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define KERNEL1(address) \ mulpd %xmm0, %xmm2; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ addpd %xmm2, %xmm4; \ movddup 1 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm5; \ movddup 2 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm6; \ movddup 3 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ addpd %xmm2, %xmm7; \ movddup 4 * SIZE + (address) * 2 * SIZE(BB), %xmm2 #define KERNEL2(address) \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm4; \ movddup 5 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm5; \ movddup 6 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm6; \ movddup 7 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ addpd %xmm2, %xmm7; \ movddup 16 * SIZE + (address) * 2 * SIZE(BB), %xmm2 #define KERNEL3(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movddup 9 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm5; \ movddup 10 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm6; \ movddup 11 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ addpd %xmm3, %xmm7; \ movddup 12 * SIZE + (address) * 2 * SIZE(BB), %xmm3 #define KERNEL4(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movddup 13 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm5; \ movddup 14 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm6; \ movddup 15 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ addpd %xmm3, %xmm7; \ movddup 24 * SIZE + (address) * 2 * SIZE(BB), %xmm3 #define KERNEL5(address) \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movddup 17 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm5; \ movddup 18 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm6; \ movddup 19 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ addpd %xmm2, %xmm7 #define KERNEL6(address) \ movddup 20 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movddup 21 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm5; \ movddup 22 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm6; \ movddup 23 * 
SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ addpd %xmm2, %xmm7; \ movddup 32 * SIZE + (address) * 2 * SIZE(BB), %xmm2 #define KERNEL7(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movddup 25 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm5; \ movddup 26 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm6; \ movddup 27 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ addpd %xmm3, %xmm7; \ movddup 28 * SIZE + (address) * 2 * SIZE(BB), %xmm3 #define KERNEL8(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movddup 29 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm5; \ movddup 30 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm6; \ movddup 31 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ addpd %xmm3, %xmm7; \ movddup 40 * SIZE + (address) * 2 * SIZE(BB), %xmm3 PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC #ifdef TRMMKERNEL movl OFFSET, %eax #ifndef LEFT negl %eax #endif movl %eax, KK #endif leal (, LDC, SIZE), LDC movl N, %eax sarl $2, %eax movl %eax, J jle .L30 ALIGN_2 .L10: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sall $BASE_SHIFT + 2, %eax leal (B, %eax), %eax movl %eax, BX movl C, %esi # coffset = c movl A, AA # aoffset = a movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 4), BB #endif movl BX, %eax prefetcht2 0 * SIZE(%eax) subl $-4 * SIZE, BX movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movddup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movddup 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 leal (LDC, LDC, 2), %eax #ifdef PENTIUM4 prefetchnta 3 * SIZE(%esi) prefetchnta 3 * SIZE(%esi, LDC, 1) prefetchnta 3 * SIZE(%esi, LDC, 2) prefetchnta 3 * SIZE(%esi, %eax, 1) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $4, %eax #endif movl %eax, KKK #endif #ifdef CORE_PRESCOTT andl $-8, %eax sall $4, %eax je .L15 .L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) cmpl $128 * 1, %eax jle .L12 KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) cmpl $128 * 2, %eax jle .L12 KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) cmpl $128 * 3, %eax jle .L12 KERNEL1(16 * 3) KERNEL2(16 * 3) KERNEL3(16 * 3) KERNEL4(16 * 3) KERNEL5(16 * 3) KERNEL6(16 * 3) KERNEL7(16 * 3) KERNEL8(16 * 3) cmpl $128 * 4, %eax jle .L12 KERNEL1(16 * 4) KERNEL2(16 * 4) KERNEL3(16 * 4) KERNEL4(16 * 4) KERNEL5(16 * 4) KERNEL6(16 * 4) KERNEL7(16 * 4) KERNEL8(16 * 4) cmpl $128 * 5, %eax jle .L12 KERNEL1(16 * 5) KERNEL2(16 * 5) KERNEL3(16 * 5) KERNEL4(16 * 5) 
KERNEL5(16 * 5) KERNEL6(16 * 5) KERNEL7(16 * 5) KERNEL8(16 * 5) cmpl $128 * 6, %eax jle .L12 KERNEL1(16 * 6) KERNEL2(16 * 6) KERNEL3(16 * 6) KERNEL4(16 * 6) KERNEL5(16 * 6) KERNEL6(16 * 6) KERNEL7(16 * 6) KERNEL8(16 * 6) cmpl $128 * 7, %eax jle .L12 KERNEL1(16 * 7) KERNEL2(16 * 7) KERNEL3(16 * 7) KERNEL4(16 * 7) KERNEL5(16 * 7) KERNEL6(16 * 7) KERNEL7(16 * 7) KERNEL8(16 * 7) #if 1 cmpl $128 * 8, %eax jle .L12 KERNEL1(16 * 8) KERNEL2(16 * 8) KERNEL3(16 * 8) KERNEL4(16 * 8) KERNEL5(16 * 8) KERNEL6(16 * 8) KERNEL7(16 * 8) KERNEL8(16 * 8) cmpl $128 * 9, %eax jle .L12 KERNEL1(16 * 9) KERNEL2(16 * 9) KERNEL3(16 * 9) KERNEL4(16 * 9) KERNEL5(16 * 9) KERNEL6(16 * 9) KERNEL7(16 * 9) KERNEL8(16 * 9) cmpl $128 * 10, %eax jle .L12 KERNEL1(16 * 10) KERNEL2(16 * 10) KERNEL3(16 * 10) KERNEL4(16 * 10) KERNEL5(16 * 10) KERNEL6(16 * 10) KERNEL7(16 * 10) KERNEL8(16 * 10) cmpl $128 * 11, %eax jle .L12 KERNEL1(16 * 11) KERNEL2(16 * 11) KERNEL3(16 * 11) KERNEL4(16 * 11) KERNEL5(16 * 11) KERNEL6(16 * 11) KERNEL7(16 * 11) KERNEL8(16 * 11) cmpl $128 * 12, %eax jle .L12 KERNEL1(16 * 12) KERNEL2(16 * 12) KERNEL3(16 * 12) KERNEL4(16 * 12) KERNEL5(16 * 12) KERNEL6(16 * 12) KERNEL7(16 * 12) KERNEL8(16 * 12) cmpl $128 * 13, %eax jle .L12 KERNEL1(16 * 13) KERNEL2(16 * 13) KERNEL3(16 * 13) KERNEL4(16 * 13) KERNEL5(16 * 13) KERNEL6(16 * 13) KERNEL7(16 * 13) KERNEL8(16 * 13) cmpl $128 * 14, %eax jle .L12 KERNEL1(16 * 14) KERNEL2(16 * 14) KERNEL3(16 * 14) KERNEL4(16 * 14) KERNEL5(16 * 14) KERNEL6(16 * 14) KERNEL7(16 * 14) KERNEL8(16 * 14) cmpl $128 * 15, %eax jle .L12 KERNEL1(16 * 15) KERNEL2(16 * 15) KERNEL3(16 * 15) KERNEL4(16 * 15) KERNEL5(16 * 15) KERNEL6(16 * 15) KERNEL7(16 * 15) KERNEL8(16 * 15) #else addl $32 * 4 * SIZE, AA addl $32 * 8 * SIZE, BB subl $128 * 8, %eax jg .L1X #endif .L12: leal (AA, %eax, 1), AA # * 16 leal (BB, %eax, 2), BB # * 64 #else sarl $3, %eax je .L15 ALIGN_4 .L12: mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 5 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movddup 6 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 7 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 16 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm4 movddup 9 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm5 movddup 10 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm6 movddup 11 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 movapd 6 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movddup 12 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm4 movddup 13 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm5 movddup 14 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm6 movddup 15 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 movapd 16 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movddup 24 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movddup 17 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movddup 18 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm6 movddup 19 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 movapd 10 * SIZE(AA), %xmm1 addpd %xmm2, %xmm7 movddup 20 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movddup 21 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movddup 22 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 
addpd %xmm2, %xmm6 movddup 23 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 movapd 12 * SIZE(AA), %xmm1 addpd %xmm2, %xmm7 movddup 32 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 25 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movddup 26 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 27 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 14 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movddup 28 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 29 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movddup 30 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 31 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 24 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movddup 40 * SIZE(BB), %xmm3 addl $32 * SIZE, BB addl $16 * SIZE, AA decl %eax jne .L12 ALIGN_4 #endif .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movddup ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: SHUFPD_2 %xmm0, %xmm0 SHUFPD_2 %xmm1, %xmm1 SHUFPD_2 %xmm2, %xmm2 SHUFPD_2 %xmm3, %xmm3 mulpd %xmm3, %xmm4 mulpd %xmm3, %xmm5 mulpd %xmm3, %xmm6 mulpd %xmm3, %xmm7 movl %esi, %eax orl LDC, %eax testl $15, %eax NOBRANCH jne .L18x leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movapd 0 * SIZE(%esi), %xmm0 movapd 0 * SIZE(%esi, LDC, 1), %xmm1 movapd 0 * SIZE(%esi, LDC, 2), %xmm2 movapd 0 * SIZE(%esi, %eax, 1), %xmm3 addpd %xmm0, %xmm4 addpd %xmm1, %xmm5 addpd %xmm2, %xmm6 addpd %xmm3, %xmm7 #endif movapd %xmm4, 0 * SIZE(%esi) movapd %xmm5, 0 * SIZE(%esi, LDC, 1) movapd %xmm6, 0 * SIZE(%esi, LDC, 2) movapd %xmm7, 0 * SIZE(%esi, %eax, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L11 jmp .L20 ALIGN_4 .L18x: leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhpd 1 * SIZE(%esi), %xmm0 movsd 0 * SIZE(%esi, LDC, 1), %xmm1 movhpd 1 * SIZE(%esi, LDC, 1), %xmm1 movsd 0 * SIZE(%esi, LDC, 2), %xmm2 movhpd 1 * SIZE(%esi, LDC, 2), %xmm2 movsd 0 * SIZE(%esi, %eax, 1), %xmm3 movhpd 1 * SIZE(%esi, %eax, 1), %xmm3 addpd %xmm0, %xmm4 addpd %xmm1, %xmm5 addpd %xmm2, %xmm6 addpd %xmm3, %xmm7 #endif movsd %xmm4, 0 * SIZE(%esi) movhpd %xmm4, 1 * SIZE(%esi) movsd %xmm5, 0 * SIZE(%esi, LDC, 1) movhpd %xmm5, 1 * SIZE(%esi, LDC, 1) movsd %xmm6, 0 * SIZE(%esi, LDC, 2) movhpd %xmm6, 1 * SIZE(%esi, LDC, 2) movsd %xmm7, 0 * SIZE(%esi, %eax, 1) movhpd %xmm7, 1 * SIZE(%esi, %eax, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L11 ALIGN_3 .L20: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L29 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 4), BB #endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movddup 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movapd 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movapd 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $4, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movddup 1 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movddup 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 10 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movddup 3 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movddup 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 18 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 20 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movddup 5 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 22 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 32 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movddup 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 26 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 28 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movddup 7 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 30 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 40 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movddup 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd 34 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 36 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movddup 9 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 38 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 48 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movddup 10 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 42 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 44 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movddup 11 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 46 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 56 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movddup 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 50 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 52 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movddup 13 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 54 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 64 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movddup 14 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 58 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 60 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movddup 15 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 62 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 72 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movddup 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movddup ALPHA, %xmm3 andl $15, %eax # if (k & 1) BRANCH je .L28 .L26: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movddup 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: leal (%esi, LDC, 1), %eax addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm3, %xmm5 #ifndef TRMMKERNEL #ifdef PENTIUM4 SHUFPD_2 %xmm0, %xmm0 SHUFPD_2 %xmm1, %xmm1 #endif movsd 0 * SIZE(%esi), %xmm0 movhpd 0 * SIZE(%eax), %xmm0 
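/* 1x4 tail, non-TRMM path: xmm4 holds the alpha-scaled partial sums for
   columns 0-1 of this row and xmm5 those for columns 2-3. The existing C
   values are gathered pairwise across the LDC stride (through %esi and
   %eax = %esi + LDC), accumulated, and scattered back by the stores below. */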
movsd 0 * SIZE(%esi, LDC, 2), %xmm1 movhpd 0 * SIZE(%eax, LDC, 2), %xmm1 addpd %xmm0, %xmm4 addpd %xmm1, %xmm5 #endif movsd %xmm4, 0 * SIZE(%esi) movhpd %xmm4, 0 * SIZE(%eax) movsd %xmm5, 0 * SIZE(%esi, LDC, 2) movhpd %xmm5, 0 * SIZE(%eax, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leal (, LDC, 4), %eax movl BB, B addl %eax, C # c += 4 * ldc decl J # j -- jg .L10 ALIGN_4 .L30: testl $2, N je .L60 movl C, %esi # coffset = c movl A, AA # aoffset = a #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L50 ALIGN_4 .L41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movddup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movddup 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifdef HAVE_3DNOW prefetchw 2 * SIZE(%esi) prefetchw 2 * SIZE(%esi, LDC) #endif #ifdef PENTIUM4 prefetchnta 3 * SIZE(%esi) prefetchnta 3 * SIZE(%esi, LDC) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 5 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 6 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movddup 6 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 7 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 16 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 16 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 9 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 10 * SIZE(AA), %xmm1 addpd %xmm3, %xmm5 movddup 10 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 11 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 12 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movddup 12 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 13 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 14 * SIZE(AA), %xmm1 addpd %xmm3, %xmm5 movddup 14 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 15 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 24 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movddup 24 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movddup ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $2 * SIZE, BB 
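/* k-remainder loop for the 2x2 tile: each pass consumes one 2-element slice
   of the packed A panel and the two B values for that k, performing a single
   rank-1 update into the xmm4/xmm5 accumulators before the counter test. */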
decl %eax jg .L46 ALIGN_4 .L48: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm3, %xmm5 #ifndef TRMMKERNEL #ifdef PENTIUM4 SHUFPD_2 %xmm0, %xmm0 SHUFPD_2 %xmm1, %xmm1 #endif movsd 0 * SIZE(%esi), %xmm0 movhpd 1 * SIZE(%esi), %xmm0 movsd 0 * SIZE(%esi, LDC, 1), %xmm1 movhpd 1 * SIZE(%esi, LDC, 1), %xmm1 addpd %xmm0, %xmm4 addpd %xmm1, %xmm5 #endif movsd %xmm4, 0 * SIZE(%esi) movhpd %xmm4, 1 * SIZE(%esi) movsd %xmm5, 0 * SIZE(%esi, LDC, 1) movhpd %xmm5, 1 * SIZE(%esi, LDC, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L41 ALIGN_4 .L50: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L59 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), BB #endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movddup 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movapd 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movapd 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $4, %eax je .L55 ALIGN_4 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 movddup 1 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 mulpd 2 * SIZE(BB), %xmm0 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movddup 2 * SIZE(AA), %xmm0 mulpd 4 * SIZE(BB), %xmm0 addpd %xmm0, %xmm6 movddup 3 * SIZE(AA), %xmm0 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm0, %xmm7 movddup 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movddup 5 * SIZE(AA), %xmm0 addpd %xmm3, %xmm4 mulpd 10 * SIZE(BB), %xmm0 movapd 24 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movddup 6 * SIZE(AA), %xmm0 mulpd 12 * SIZE(BB), %xmm0 addpd %xmm0, %xmm6 movddup 7 * SIZE(AA), %xmm0 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm0, %xmm7 movddup 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 movddup 9 * SIZE(AA), %xmm1 addpd %xmm2, %xmm4 mulpd 18 * SIZE(BB), %xmm1 movapd 32 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movddup 10 * SIZE(AA), %xmm1 mulpd 20 * SIZE(BB), %xmm1 addpd %xmm1, %xmm6 movddup 11 * SIZE(AA), %xmm1 mulpd 22 * SIZE(BB), %xmm1 addpd %xmm1, %xmm7 movddup 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 movddup 13 * SIZE(AA), %xmm1 addpd %xmm3, %xmm4 mulpd 26 * SIZE(BB), %xmm1 movapd 40 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movddup 14 * SIZE(AA), %xmm1 mulpd 28 * SIZE(BB), %xmm1 addpd %xmm1, %xmm6 movddup 15 * SIZE(AA), %xmm1 mulpd 30 * SIZE(BB), %xmm1 addpd %xmm1, %xmm7 movddup 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movddup ALPHA, %xmm3 andl $15, %eax # if (k & 1) BRANCH je .L58 .L56: mulpd %xmm0, %xmm2 movddup 1 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 movapd 2 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 addpd %xmm6, %xmm4 mulpd %xmm3, %xmm4 #ifndef TRMMKERNEL #ifdef PENTIUM4 SHUFPD_2 %xmm0, %xmm0 #endif movsd 0 * SIZE(%esi), %xmm0 movhpd 0 * SIZE(%esi, LDC, 1), %xmm0 addpd %xmm0, %xmm4 #endif 
movsd %xmm4, 0 * SIZE(%esi) movhpd %xmm4, 0 * SIZE(%esi, LDC, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L59: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax movl BB, B addl %eax, C # c += 4 * ldc ALIGN_4 .L60: testl $1, N je .L999 movl C, %esi # coffset = c movl A, AA # aoffset = a #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L80 ALIGN_4 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movddup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movddup 4 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifdef PENTIUM4 prefetchnta 3 * SIZE(%esi) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm2, %xmm0 movddup 1 * SIZE(BB), %xmm2 addpd %xmm0, %xmm4 movapd 16 * SIZE(AA), %xmm0 mulpd 2 * SIZE(AA), %xmm2 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd 4 * SIZE(AA), %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd 6 * SIZE(AA), %xmm2 addpd %xmm2, %xmm7 movddup 8 * SIZE(BB), %xmm2 mulpd %xmm3, %xmm1 movddup 5 * SIZE(BB), %xmm3 addpd %xmm1, %xmm4 movapd 24 * SIZE(AA), %xmm1 mulpd 10 * SIZE(AA), %xmm3 addpd %xmm3, %xmm5 movddup 6 * SIZE(BB), %xmm3 mulpd 12 * SIZE(AA), %xmm3 addpd %xmm3, %xmm6 movddup 7 * SIZE(BB), %xmm3 mulpd 14 * SIZE(AA), %xmm3 addpd %xmm3, %xmm7 movddup 12 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $ 8 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movddup ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: mulpd %xmm2, %xmm0 movddup 1 * SIZE(BB), %xmm2 addpd %xmm0, %xmm4 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 addpd %xmm6, %xmm4 mulpd %xmm3, %xmm4 #ifndef TRMMKERNEL #ifdef PENTIUM4 SHUFPD_2 %xmm0, %xmm0 #endif movsd 0 * SIZE(%esi), %xmm0 movhpd 1 * SIZE(%esi), %xmm0 addpd %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(%esi) movhpd %xmm4, 1 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 1), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L71 ALIGN_4 .L80: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, 
%xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movapd 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movapd 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $4, %eax je .L85 ALIGN_4 .L82: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 mulpd 2 * SIZE(BB), %xmm0 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 4 * SIZE(AA), %xmm0 mulpd 4 * SIZE(BB), %xmm0 addpd %xmm0, %xmm6 movapd 6 * SIZE(AA), %xmm0 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm3 movapd 10 * SIZE(AA), %xmm1 addpd %xmm3, %xmm4 mulpd 10 * SIZE(BB), %xmm1 movapd 24 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 12 * SIZE(AA), %xmm1 mulpd 12 * SIZE(BB), %xmm1 addpd %xmm1, %xmm6 movapd 14 * SIZE(AA), %xmm1 mulpd 14 * SIZE(BB), %xmm1 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L82 ALIGN_4 .L85: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movddup ALPHA, %xmm3 andl $15, %eax # if (k & 1) BRANCH je .L88 .L86: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 1 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L86 ALIGN_4 .L88: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 addpd %xmm6, %xmm4 haddpd %xmm4, %xmm4 mulsd %xmm3, %xmm4 #ifndef TRMMKERNEL #ifdef PENTIUM4 SHUFPD_2 %xmm0, %xmm0 #endif movsd 0 * SIZE(%esi), %xmm0 addsd %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(%esi) ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_kernel_4x2_core2.S000066400000000000000000000626321313527062700212620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define OLD_M 4 + STACK + ARGS(%esi) #define OLD_N 8 + STACK + ARGS(%esi) #define OLD_K 12 + STACK + ARGS(%esi) #define OLD_ALPHA 16 + STACK + ARGS(%esi) #define OLD_A 24 + STACK + ARGS(%esi) #define OLD_B 28 + STACK + ARGS(%esi) #define OLD_C 32 + STACK + ARGS(%esi) #define OLD_LDC 36 + STACK + ARGS(%esi) #define OLD_OFFT 40 + STACK + ARGS(%esi) #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define BX 40(%esp) #define OLD_STACK 44(%esp) #define OFFSET 48(%esp) #define KK 52(%esp) #define KKK 56(%esp) #define BUFFER 256(%esp) #define PREFETCH_R (8 * 16 + 0) #define PREFETCH_W (PREFETCH_R * 2) #define PREFETCHSIZE (8 * 7 + 4) #define PREFETCH prefetcht0 #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define C1 %esi #define I %ebx PROLOGUE PROFCODE pushl %ebp pushl %edi pushl %esi pushl %ebx movl %esp, %esi # save old stack subl $512 + LOCAL_BUFFER_SIZE, %esp andl $-4096, %esp # align stack STACK_TOUCHING movl OLD_M, %ebx movl OLD_N, %eax movl OLD_K, %ecx movl OLD_A, %edx movsd OLD_ALPHA, %xmm3 #ifdef TRMMKERNEL movd OLD_OFFT, %mm4 #endif movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK unpcklpd %xmm3, %xmm3 movl OLD_B, B movl OLD_C, %ebx movapd %xmm3, ALPHA movl %ebx, C movl OLD_LDC, LDC #ifdef TRMMKERNEL movd %mm4, OFFSET movd %mm4, KK #ifndef LEFT negl KK #endif #endif subl $-16 * SIZE, A subl $-16 * SIZE, B leal (, LDC, SIZE), LDC sarl $1, %eax movl %eax, J jle .L40 ALIGN_4 .L01: leal 16 * SIZE + BUFFER, BB #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sarl $2, %eax jle .L05 ALIGN_4 .L02: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -12 * SIZE(B), %xmm4 movddup -11 * SIZE(B), %xmm5 movddup -10 * SIZE(B), %xmm6 movddup -9 * SIZE(B), %xmm7 prefetcht0 (PREFETCH_R + 0) * SIZE(B) movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) movapd %xmm2, -12 * SIZE(BB) movapd %xmm3, -10 * SIZE(BB) movapd %xmm4, -8 * SIZE(BB) movapd %xmm5, -6 * SIZE(BB) movapd %xmm6, -4 * SIZE(BB) movapd %xmm7, -2 * SIZE(BB) addl $ 8 * SIZE, B addl $16 * SIZE, BB decl %eax jne .L02 ALIGN_4 .L05: movl K, %eax andl $3, %eax BRANCH jle .L10 ALIGN_4 .L06: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) addl $2 * SIZE, B addl $4 * SIZE, BB decl %eax jne .L06 ALIGN_4 .L10: movl B, BX movl C, C1 movl A, AA movl M, I sarl $2, I jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 16 * SIZE + BUFFER, BB #else leal 16 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB /* because it's doubled */ #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm3 pxor %xmm6, %xmm6 prefetcht0 3 * SIZE(C1) pxor %xmm7, %xmm7 prefetcht0 7 * SIZE(C1, LDC) movapd %xmm1, %xmm2 
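/* Core2 4x2 micro-tile: xmm4/xmm6 accumulate C column 0 (rows 0-1 and 2-3)
   and xmm5/xmm7 column 1; B was pre-duplicated into BUFFER above so the
   inner loop can use plain movapd loads. A rough C sketch of what the k loop
   below computes (identifiers are illustrative, not taken from this file;
   kc stands for K, or KKK in the TRMM case):

       for (k = 0; k < kc; k++)
         for (j = 0; j < 2; j++)
           for (i = 0; i < 4; i++)
             acc[j][i] += a[4*k + i] * b[2*k + j];

   The scaling by ALPHA and the read-modify-write of C happen after the
   loop, at .L18. */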
movl BX, %eax prefetcht0 (%eax) subl $-8 * SIZE, %eax movl %eax, BX #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L15 ALIGN_4 .L12: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 addpd %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movapd -12 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm1 movapd -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 PADDING; movapd %xmm2, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd -10 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm0 addpd %xmm0, %xmm5 movapd -10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm6 movapd -8 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 PADDING; movapd 0 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 PADDING; movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 addpd %xmm1, %xmm4 movapd -6 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movapd -6 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm6 movapd -4 * SIZE(BB), %xmm2 mulpd %xmm3, %xmm1 movapd -4 * SIZE(AA), %xmm3 addpd %xmm1, %xmm7 PADDING; movapd %xmm2, %xmm1 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm4 movapd -2 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm5 movapd -2 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm1 addpd %xmm1, %xmm6 PADDING; movapd 0 * SIZE(BB), %xmm1 mulpd %xmm3, %xmm2 movapd 8 * SIZE(AA), %xmm3 addpd %xmm2, %xmm7 PADDING; movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd 2 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movapd 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm1 movapd 4 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 PADDING; movapd %xmm2, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd 6 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm0 addpd %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm6 movapd 8 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movapd 16 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 PADDING; movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 addpd %xmm1, %xmm4 movapd 10 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movapd 10 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm6 movapd 12 * SIZE(BB), %xmm2 mulpd %xmm3, %xmm1 movapd 12 * SIZE(AA), %xmm3 addpd %xmm1, %xmm7 PADDING; movapd %xmm2, %xmm1 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm4 movapd 14 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm3 subl $-32 * SIZE, BB addpd %xmm3, %xmm5 movapd 14 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm1 addpd %xmm1, %xmm6 movapd -16 * SIZE(BB), %xmm1 mulpd %xmm3, %xmm2 movapd 24 * SIZE(AA), %xmm3 addpd %xmm2, %xmm7 PADDING; movapd %xmm1, %xmm2 subl $-32 * SIZE, AA decl %eax BRANCH jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movapd ALPHA, %xmm3 andl $7, %eax BRANCH je .L18 ALIGN_4 .L16: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 addpd %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd -12 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd -12 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: mulpd %xmm3, %xmm4 mulpd %xmm3, %xmm5 mulpd %xmm3, %xmm6 mulpd %xmm3, %xmm7 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhpd 1 * SIZE(C1), %xmm0 movsd 2 * SIZE(C1), %xmm2 movhpd 3 * SIZE(C1), %xmm2 movsd 0 * SIZE(C1, LDC), %xmm1 movhpd 1 * SIZE(C1, LDC), %xmm1 movsd 2 * 
SIZE(C1, LDC), %xmm3 movhpd 3 * SIZE(C1, LDC), %xmm3 addpd %xmm0, %xmm4 addpd %xmm1, %xmm5 addpd %xmm2, %xmm6 addpd %xmm3, %xmm7 #endif movsd %xmm4, 0 * SIZE(C1) movhpd %xmm4, 1 * SIZE(C1) movsd %xmm6, 2 * SIZE(C1) movhpd %xmm6, 3 * SIZE(C1) movsd %xmm5, 0 * SIZE(C1, LDC) movhpd %xmm5, 1 * SIZE(C1, LDC) movsd %xmm7, 2 * SIZE(C1, LDC) movhpd %xmm7, 3 * SIZE(C1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $4 * SIZE, C1 decl I jg .L11 ALIGN_4 .L20: movl M, I testl $2, I jle .L30 .L21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 16 * SIZE + BUFFER, BB #else leal 16 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB /* because it's doubled */ #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movapd -8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax addl $2, %eax movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BB), %xmm0 addpd %xmm1, %xmm4 movapd -12 * SIZE(BB), %xmm1 addpd %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm1 mulpd -10 * SIZE(BB), %xmm0 addpd %xmm1, %xmm6 movapd 0 * SIZE(BB), %xmm1 addpd %xmm0, %xmm7 movapd -12 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd -6 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd -4 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movapd -10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd -2 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 8 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movapd 0 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm1 mulpd 2 * SIZE(BB), %xmm2 addpd %xmm1, %xmm4 movapd 4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm5 movapd -6 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm1 mulpd 6 * SIZE(BB), %xmm2 addpd %xmm1, %xmm6 movapd 16 * SIZE(BB), %xmm1 addpd %xmm2, %xmm7 movapd -4 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm3 mulpd 10 * SIZE(BB), %xmm2 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm2, %xmm5 movapd -2 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm3 mulpd 14 * SIZE(BB), %xmm2 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm2, %xmm7 movapd 8 * SIZE(AA), %xmm2 subl $-16 * SIZE, AA addl $ 32 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: movapd ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L28 ALIGN_4 .L26: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BB), %xmm0 addpd %xmm1, %xmm4 movapd -12 * SIZE(BB), %xmm1 addpd %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm3, %xmm5 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhpd 1 * SIZE(C1), %xmm0 movsd 0 * SIZE(C1, LDC), %xmm1 movhpd 1 * SIZE(C1, LDC), %xmm1 addpd %xmm0, %xmm4 addpd %xmm1, %xmm5 #endif movsd %xmm4, 0 * SIZE(C1) movhpd %xmm4, 1 * SIZE(C1) movsd %xmm5, 0 * SIZE(C1, LDC) movhpd %xmm5, 1 * SIZE(C1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, 
%eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, C1 ALIGN_4 .L30: movl M, I testl $1, I jle .L39 .L31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 16 * SIZE + BUFFER, BB #else leal 16 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB /* because it's doubled */ #endif movsd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movsd -12 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movsd -8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L35 ALIGN_4 .L32: mulsd %xmm0, %xmm1 mulsd -14 * SIZE(BB), %xmm0 addsd %xmm1, %xmm4 movsd -12 * SIZE(BB), %xmm1 addsd %xmm0, %xmm5 movsd -15 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm1 mulsd -10 * SIZE(BB), %xmm0 addsd %xmm1, %xmm6 movsd 0 * SIZE(BB), %xmm1 addsd %xmm0, %xmm7 movsd -14 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd -6 * SIZE(BB), %xmm0 addsd %xmm3, %xmm4 movsd -4 * SIZE(BB), %xmm3 addsd %xmm0, %xmm5 movsd -13 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd -2 * SIZE(BB), %xmm0 addsd %xmm3, %xmm6 movsd 8 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movsd -8 * SIZE(AA), %xmm0 mulsd %xmm2, %xmm1 mulsd 2 * SIZE(BB), %xmm2 addsd %xmm1, %xmm4 movsd 4 * SIZE(BB), %xmm1 addsd %xmm2, %xmm5 movsd -11 * SIZE(AA), %xmm2 mulsd %xmm2, %xmm1 mulsd 6 * SIZE(BB), %xmm2 addsd %xmm1, %xmm6 movsd 16 * SIZE(BB), %xmm1 addsd %xmm2, %xmm7 movsd -10 * SIZE(AA), %xmm2 mulsd %xmm2, %xmm3 mulsd 10 * SIZE(BB), %xmm2 addsd %xmm3, %xmm4 movsd 12 * SIZE(BB), %xmm3 addsd %xmm2, %xmm5 movsd -9 * SIZE(AA), %xmm2 mulsd %xmm2, %xmm3 mulsd 14 * SIZE(BB), %xmm2 addsd %xmm3, %xmm6 movsd 24 * SIZE(BB), %xmm3 addsd %xmm2, %xmm7 movsd -4 * SIZE(AA), %xmm2 subl $-8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L32 ALIGN_4 .L35: movsd ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L38 ALIGN_4 .L36: mulsd %xmm0, %xmm1 mulsd -14 * SIZE(BB), %xmm0 addsd %xmm1, %xmm4 movsd -12 * SIZE(BB), %xmm1 addsd %xmm0, %xmm5 movsd -15 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: addsd %xmm6, %xmm4 addsd %xmm7, %xmm5 mulsd %xmm3, %xmm4 mulsd %xmm3, %xmm5 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movsd 0 * SIZE(C1, LDC), %xmm1 addsd %xmm0, %xmm4 addsd %xmm1, %xmm5 #endif movsd %xmm4, 0 * SIZE(C1) movsd %xmm5, 0 * SIZE(C1, LDC) #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax addl %eax, C decl J jg .L01 ALIGN_4 .L40: movl N, %eax testl $1, %eax jle .L999 ALIGN_4 .L41: leal 16 * SIZE + BUFFER, BB #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sarl $3, %eax jle .L45 ALIGN_4 .L42: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -12 * SIZE(B), %xmm4 movddup -11 * SIZE(B), %xmm5 movddup -10 * SIZE(B), %xmm6 movddup -9 * SIZE(B), %xmm7 movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) movapd %xmm2, -12 * SIZE(BB) movapd %xmm3, -10 * SIZE(BB) 
movapd %xmm4, -8 * SIZE(BB) movapd %xmm5, -6 * SIZE(BB) movapd %xmm6, -4 * SIZE(BB) movapd %xmm7, -2 * SIZE(BB) addl $ 8 * SIZE, B addl $16 * SIZE, BB decl %eax jne .L42 ALIGN_4 .L45: movl K, %eax andl $7, %eax BRANCH jle .L50 ALIGN_4 .L46: movddup -16 * SIZE(B), %xmm0 movapd %xmm0, -16 * SIZE(BB) addl $1 * SIZE, B addl $2 * SIZE, BB decl %eax jne .L46 ALIGN_4 .L50: movl C, C1 movl A, AA movl M, I sarl $2, I jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 16 * SIZE + BUFFER, BB #else leal 16 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movapd -8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 prefetcht0 3 * SIZE(C1) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L55 ALIGN_4 .L52: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AA), %xmm1 addpd %xmm0, %xmm4 movapd -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm6 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 mulpd -10 * SIZE(AA), %xmm1 addpd %xmm0, %xmm5 movapd 0 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movapd -12 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm2 mulpd -6 * SIZE(AA), %xmm1 addpd %xmm2, %xmm4 movapd -4 * SIZE(AA), %xmm2 addpd %xmm1, %xmm6 movapd -10 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm2 mulpd -2 * SIZE(AA), %xmm1 addpd %xmm2, %xmm5 movapd 8 * SIZE(AA), %xmm2 addpd %xmm1, %xmm7 movapd 0 * SIZE(BB), %xmm1 mulpd %xmm3, %xmm0 mulpd 2 * SIZE(AA), %xmm3 addpd %xmm0, %xmm4 movapd 4 * SIZE(AA), %xmm0 addpd %xmm3, %xmm6 movapd -6 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm0 mulpd 6 * SIZE(AA), %xmm3 addpd %xmm0, %xmm5 movapd 16 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movapd -4 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm2 mulpd 10 * SIZE(AA), %xmm3 addpd %xmm2, %xmm4 movapd 12 * SIZE(AA), %xmm2 addpd %xmm3, %xmm6 movapd -2 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm2 mulpd 14 * SIZE(AA), %xmm3 addpd %xmm2, %xmm5 movapd 24 * SIZE(AA), %xmm2 addpd %xmm3, %xmm7 movapd 8 * SIZE(BB), %xmm3 addl $ 32 * SIZE, AA subl $-16 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: movapd ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L58 ALIGN_4 .L56: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AA), %xmm1 addpd %xmm0, %xmm4 movapd -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm6 movapd -14 * SIZE(BB), %xmm1 addl $4 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 mulpd %xmm3, %xmm4 mulpd %xmm3, %xmm6 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhpd 1 * SIZE(C1), %xmm0 movsd 2 * SIZE(C1), %xmm2 movhpd 3 * SIZE(C1), %xmm2 addpd %xmm0, %xmm4 addpd %xmm2, %xmm6 #endif movsd %xmm4, 0 * SIZE(C1) movhpd %xmm4, 1 * SIZE(C1) movsd %xmm6, 2 * SIZE(C1) movhpd %xmm6, 3 * SIZE(C1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $4 * SIZE, C1 decl I jg .L51 ALIGN_4 .L60: movl M, I testl $2, I jle .L70 .L61: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && 
defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 16 * SIZE + BUFFER, BB #else leal 16 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm2 movapd -8 * SIZE(BB), %xmm3 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L65 ALIGN_4 .L62: mulpd %xmm0, %xmm1 movapd -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm1 movapd -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movapd -12 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm1 movapd -10 * SIZE(AA), %xmm0 addpd %xmm1, %xmm4 movapd -10 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm1 movapd 0 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movapd 0 * SIZE(BB), %xmm1 mulpd %xmm2, %xmm3 movapd -6 * SIZE(AA), %xmm2 addpd %xmm3, %xmm4 movapd -6 * SIZE(BB), %xmm3 mulpd %xmm2, %xmm3 movapd -4 * SIZE(AA), %xmm2 addpd %xmm3, %xmm5 movapd -4 * SIZE(BB), %xmm3 mulpd %xmm2, %xmm3 movapd -2 * SIZE(AA), %xmm2 addpd %xmm3, %xmm4 movapd -2 * SIZE(BB), %xmm3 mulpd %xmm2, %xmm3 movapd 8 * SIZE(AA), %xmm2 addpd %xmm3, %xmm5 movapd 8 * SIZE(BB), %xmm3 subl $-16 * SIZE, AA subl $-16 * SIZE, BB decl %eax jne .L62 ALIGN_4 .L65: movapd ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L68 ALIGN_4 .L66: mulpd %xmm0, %xmm1 movapd -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L66 ALIGN_4 .L68: addpd %xmm5, %xmm4 mulpd %xmm3, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhpd 1 * SIZE(C1), %xmm0 addpd %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(C1) movhpd %xmm4, 1 * SIZE(C1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, C1 ALIGN_4 .L70: movl M, I testl $1, I jle .L79 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 16 * SIZE + BUFFER, BB #else leal 16 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif movsd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movsd -8 * SIZE(BB), %xmm3 movsd -12 * SIZE(AA), %xmm2 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax addl $1, %eax movl %eax, KKK #endif sarl $3, %eax je .L75 ALIGN_4 .L72: mulsd %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 addsd %xmm1, %xmm4 movsd -14 * SIZE(BB), %xmm1 mulsd %xmm0, %xmm1 movsd -14 * SIZE(AA), %xmm0 addsd %xmm1, %xmm5 movsd -12 * SIZE(BB), %xmm1 mulsd %xmm0, %xmm1 movsd -13 * SIZE(AA), %xmm0 addsd %xmm1, %xmm4 movsd -10 * SIZE(BB), %xmm1 mulsd %xmm0, %xmm1 movsd -8 * SIZE(AA), %xmm0 addsd %xmm1, %xmm5 movsd -0 * SIZE(BB), %xmm1 mulsd %xmm2, %xmm3 movsd -11 * SIZE(AA), %xmm2 addsd %xmm3, %xmm4 movsd -6 * SIZE(BB), %xmm3 mulsd %xmm2, %xmm3 movsd 
-10 * SIZE(AA), %xmm2 addsd %xmm3, %xmm5 movsd -4 * SIZE(BB), %xmm3 mulsd %xmm2, %xmm3 movsd -9 * SIZE(AA), %xmm2 addsd %xmm3, %xmm4 movsd -2 * SIZE(BB), %xmm3 mulsd %xmm2, %xmm3 movsd -4 * SIZE(AA), %xmm2 addsd %xmm3, %xmm5 movsd 8 * SIZE(BB), %xmm3 subl $ -8 * SIZE, AA subl $-16 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: movsd ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L78 ALIGN_4 .L76: mulsd %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 addsd %xmm1, %xmm4 movsd -14 * SIZE(BB), %xmm1 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: addsd %xmm5, %xmm4 mulsd %xmm3, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 addsd %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(C1) ALIGN_4 .L79: addl LDC, C ALIGN_4 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_kernel_4x2_sse2.S000066400000000000000000000765271313527062700211340ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCHSIZE (8 * 4) #if !defined(HAVE_SSE2) || !defined(HAVE_MMX) #error You have to check your configuration. 
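/* HAVE_SSE2 / HAVE_MMX are normally supplied by the generated configuration
   header that common.h pulls in; if either is missing, the build was
   configured for a CPU this SSE2/MMX kernel cannot target. */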
#endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA 16 + STACK + ARGS(%esi) #define STACK_A 24 + STACK + ARGS(%esi) #define STACK_B 28 + STACK + ARGS(%esi) #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define BX 40(%esp) #define OLD_STACK 44(%esp) #define OFFSET 48(%esp) #define KK 52(%esp) #define KKK 56(%esp) #define BUFFER 128(%esp) #define B %edi #define LDC %ebp #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #define AA %edx #define BB %ecx #define KERNEL1(address) \ mulpd %xmm0, %xmm2; \ mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ addpd %xmm0, %xmm5; \ movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL3(address) \ movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm1, %xmm3; \ mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL4(address) \ mulpd %xmm1, %xmm3; \ mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL5(address) \ mulpd %xmm0, %xmm2; \ mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ addpd %xmm0, %xmm5; \ movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL6(address) \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; 
\ addpd %xmm2, %xmm6; \ movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL7(address) \ movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm1, %xmm3; \ mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulpd %xmm1, %xmm3; \ mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp addl $STACK_OFFSET, %esp STACK_TOUCHING movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 movd STACK_A, %mm2 movq STACK_ALPHA, %mm7 movl STACK_B, B movd STACK_C, %mm3 movl STACK_LDC, LDC #ifdef TRMMKERNEL movd STACK_OFFT, %mm4 #endif movq %mm7, 0 * SIZE + ALPHA movq %mm7, 1 * SIZE + ALPHA movd %mm1, K movl %eax, N movd %mm0, M movd %mm2, A movd %mm3, C movl %esi, OLD_STACK #ifdef TRMMKERNEL movd %mm4, OFFSET movd %mm4, KK #ifndef LEFT negl KK #endif #endif sall $BASE_SHIFT, LDC sarl $1, %eax # j = (n >> 1) movl %eax, J jle .L100 ALIGN_2 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ leal BUFFER, %ecx movl K, %eax sarl $2, %eax jle .L03 ALIGN_2 .L02: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 unpcklpd %xmm2, %xmm2 unpcklpd %xmm3, %xmm3 unpcklpd %xmm4, %xmm4 unpcklpd %xmm5, %xmm5 unpcklpd %xmm6, %xmm6 unpcklpd %xmm7, %xmm7 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) movapd %xmm2, 4 * SIZE(%ecx) movapd %xmm3, 6 * SIZE(%ecx) movapd %xmm4, 8 * SIZE(%ecx) movapd %xmm5, 10 * SIZE(%ecx) movapd %xmm6, 12 * SIZE(%ecx) movapd %xmm7, 14 * SIZE(%ecx) prefetcht0 104 * SIZE(B) addl $ 8 * SIZE, B subl $-16 * SIZE, %ecx decl %eax BRANCH jne .L02 ALIGN_2 .L03: movl K, %eax andl $3, %eax BRANCH jle .L05 ALIGN_4 .L04: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) addl $2 * SIZE, B addl $4 * SIZE, %ecx decl %eax BRANCH jne .L04 ALIGN_4 .L05: movl B, BX movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) NOBRANCH jle .L30 ALIGN_4 .L10: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movapd 0 * SIZE + BUFFER, %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE + BUFFER, %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #else leal BUFFER, BB movl 
KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB /* because it's doubled */ movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #endif prefetchnta 3 * SIZE(%esi) prefetchnta 3 * SIZE(%esi, LDC) movl BX, %eax prefetcht2 0 * SIZE(%eax) subl $-8 * SIZE, %eax movl %eax, BX #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $2, %eax #endif movl %eax, KKK #endif #ifdef PENTIUM4 andl $-8, %eax NOBRANCH je .L12 sall $3, %eax .align 8 .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) cmpl $64 * 1, %eax NOBRANCH jle .L11 KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) cmpl $64 * 2, %eax NOBRANCH jle .L11 KERNEL1(32 * 2) KERNEL2(32 * 2) KERNEL3(32 * 2) KERNEL4(32 * 2) KERNEL5(32 * 2) KERNEL6(32 * 2) KERNEL7(32 * 2) KERNEL8(32 * 2) cmpl $64 * 3, %eax NOBRANCH jle .L11 KERNEL1(32 * 3) KERNEL2(32 * 3) KERNEL3(32 * 3) KERNEL4(32 * 3) KERNEL5(32 * 3) KERNEL6(32 * 3) KERNEL7(32 * 3) KERNEL8(32 * 3) cmpl $64 * 4, %eax NOBRANCH jle .L11 KERNEL1(32 * 4) KERNEL2(32 * 4) KERNEL3(32 * 4) KERNEL4(32 * 4) KERNEL5(32 * 4) KERNEL6(32 * 4) KERNEL7(32 * 4) KERNEL8(32 * 4) cmpl $64 * 5, %eax NOBRANCH jle .L11 KERNEL1(32 * 5) KERNEL2(32 * 5) KERNEL3(32 * 5) KERNEL4(32 * 5) KERNEL5(32 * 5) KERNEL6(32 * 5) KERNEL7(32 * 5) KERNEL8(32 * 5) cmpl $64 * 6, %eax NOBRANCH jle .L11 KERNEL1(32 * 6) KERNEL2(32 * 6) KERNEL3(32 * 6) KERNEL4(32 * 6) KERNEL5(32 * 6) KERNEL6(32 * 6) KERNEL7(32 * 6) KERNEL8(32 * 6) cmpl $64 * 7, %eax NOBRANCH jle .L11 KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) addl $64 * 4 * SIZE, AA addl $64 * 4 * SIZE, BB subl $64 * 8, %eax BRANCH jg .L1X .L11: leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #else sarl $3, %eax je .L12 .L11: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) addl $32 * SIZE, %ecx addl $32 * SIZE, %edx decl %eax jne .L11 #endif .L12: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movapd ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L14 .L13: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 0 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA # aoffset += 8 addl $4 * SIZE, BB # boffset1 += 8 subl $1, %eax jg .L13 ALIGN_4 .L14: mulpd %xmm3, %xmm4 mulpd %xmm3, %xmm5 mulpd %xmm3, %xmm6 mulpd %xmm3, %xmm7 movl %esi, %eax orl LDC, %eax testl $15, %eax NOBRANCH jne .L18x #ifndef TRMMKERNEL movapd 0 * SIZE(%esi), %xmm0 movapd 2 * SIZE(%esi), %xmm1 movapd 0 * SIZE(%esi, LDC), %xmm2 movapd 2 * SIZE(%esi, LDC), %xmm3 addpd %xmm0, %xmm4 addpd %xmm1, %xmm6 addpd %xmm2, %xmm5 addpd %xmm3, %xmm7 #endif movapd %xmm4, 0 * SIZE(%esi) movapd %xmm6, 2 * SIZE(%esi) movapd %xmm5, 0 * SIZE(%esi, LDC) movapd %xmm7, 2 * SIZE(%esi, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax 
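/* TRMM tail fix-up: %eax becomes K - KKK, the k-iterations this tile did not consume; AA and BB are then advanced past them (4 elements of packed A and 4 elements of the duplicated B buffer per k) so they point at the start of the next tile. */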
subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $4 * SIZE, %esi # coffset += 4 decl %ebx # i -- BRANCH jg .L10 jmp .L30 ALIGN_2 .L18x: #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhpd 1 * SIZE(%esi), %xmm0 movsd 2 * SIZE(%esi), %xmm1 movhpd 3 * SIZE(%esi), %xmm1 movsd 0 * SIZE(%esi, LDC), %xmm2 movhpd 1 * SIZE(%esi, LDC), %xmm2 movsd 2 * SIZE(%esi, LDC), %xmm3 movhpd 3 * SIZE(%esi, LDC), %xmm3 addpd %xmm0, %xmm4 addpd %xmm1, %xmm6 addpd %xmm2, %xmm5 addpd %xmm3, %xmm7 #endif movsd %xmm4, 0 * SIZE(%esi) movhpd %xmm4, 1 * SIZE(%esi) movsd %xmm6, 2 * SIZE(%esi) movhpd %xmm6, 3 * SIZE(%esi) movsd %xmm5, 0 * SIZE(%esi, LDC) movhpd %xmm5, 1 * SIZE(%esi, LDC) movsd %xmm7, 2 * SIZE(%esi, LDC) movhpd %xmm7, 3 * SIZE(%esi, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $4 * SIZE, %esi # coffset += 4 decl %ebx # i -- BRANCH jg .L10 ALIGN_2 .L30: movl M, %ebx testl $2, %ebx jle .L50 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, %ecx movapd 0 * SIZE + BUFFER, %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE + BUFFER, %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB /* because it's doubled */ movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax addl $2, %eax movl %eax, KKK #endif sarl $3, %eax je .L32 .L31: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 10 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd 18 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 20 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movapd 10 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 22 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 32 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movapd 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 26 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 28 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 14 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 30 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 40 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $32 * SIZE, BB BRANCH decl %eax jne .L31 .L32: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movapd ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L34 .L33: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 
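/* k-remainder loop for the 2x2 tile: xmm4 accumulates the first column of C and xmm5 the second, two rows at a time. */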
movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA # aoffset += 8 addl $4 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L33 ALIGN_4 .L34: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm3, %xmm5 #ifndef TRMMKERNEL SHUFPD_1 %xmm0, %xmm0 movsd 0 * SIZE(%esi), %xmm0 movhpd 1 * SIZE(%esi), %xmm0 SHUFPD_1 %xmm2, %xmm2 movsd 0 * SIZE(%esi, LDC), %xmm2 movhpd 1 * SIZE(%esi, LDC), %xmm2 addpd %xmm0, %xmm4 addpd %xmm2, %xmm5 #endif movsd %xmm4, 0 * SIZE(%esi) movhpd %xmm4, 1 * SIZE(%esi) movsd %xmm5, 0 * SIZE(%esi, LDC) movhpd %xmm5, 1 * SIZE(%esi, LDC) addl $2 * SIZE, %esi # coffset += 4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif ALIGN_2 .L50: movl M, %ebx testl $1, %ebx jle .L99 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, %ecx movapd 0 * SIZE + BUFFER, %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE + BUFFER, %xmm3 pxor %xmm6, %xmm6 movsd 4 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB /* because it's doubled */ movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movsd 4 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L52 .L51: mulsd %xmm0, %xmm2 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movsd 1 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm2 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movsd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movsd 2 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd 10 * SIZE(BB), %xmm0 addsd %xmm3, %xmm4 movsd 12 * SIZE(BB), %xmm3 addsd %xmm0, %xmm5 movsd 3 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd 14 * SIZE(BB), %xmm0 addsd %xmm3, %xmm4 movsd 24 * SIZE(BB), %xmm3 addsd %xmm0, %xmm5 movsd 8 * SIZE(AA), %xmm0 mulsd %xmm1, %xmm2 mulsd 18 * SIZE(BB), %xmm1 addsd %xmm2, %xmm4 movsd 20 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 movsd 5 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm2 mulsd 22 * SIZE(BB), %xmm1 addsd %xmm2, %xmm4 movsd 32 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 movsd 6 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 mulsd 26 * SIZE(BB), %xmm1 addsd %xmm3, %xmm4 movsd 28 * SIZE(BB), %xmm3 addsd %xmm1, %xmm5 movsd 7 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 mulsd 30 * SIZE(BB), %xmm1 addsd %xmm3, %xmm4 movsd 40 * SIZE(BB), %xmm3 addsd %xmm1, %xmm5 movsd 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB BRANCH decl %eax jne .L51 .L52: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movsd ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L54 .L53: mulsd %xmm0, %xmm2 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movsd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA # aoffset += 8 addl $4 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L53 ALIGN_4 .L54: addsd %xmm6, %xmm4 addsd %xmm7, %xmm5 mulsd %xmm3, %xmm4 mulsd 
%xmm3, %xmm5 #ifndef TRMMKERNEL addsd 0 * SIZE(%esi), %xmm4 addsd 0 * SIZE(%esi, LDC), %xmm5 #endif movsd %xmm4, 0 * SIZE(%esi) movsd %xmm5, 0 * SIZE(%esi, LDC) addl $1 * SIZE, %esi #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_2 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax addl %eax, C # c += 2 * ldc BRANCH decl J # j -- jg .L01 ALIGN_2 .L100: movl N, %eax testl $1, %eax jle .L999 ALIGN_2 .L101: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ leal BUFFER, %ecx movl K, %eax sarl $3, %eax jle .L103 ALIGN_4 .L102: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 unpcklpd %xmm2, %xmm2 unpcklpd %xmm3, %xmm3 unpcklpd %xmm4, %xmm4 unpcklpd %xmm5, %xmm5 unpcklpd %xmm6, %xmm6 unpcklpd %xmm7, %xmm7 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) movapd %xmm2, 4 * SIZE(%ecx) movapd %xmm3, 6 * SIZE(%ecx) movapd %xmm4, 8 * SIZE(%ecx) movapd %xmm5, 10 * SIZE(%ecx) movapd %xmm6, 12 * SIZE(%ecx) movapd %xmm7, 14 * SIZE(%ecx) prefetcht0 104 * SIZE(B) addl $ 8 * SIZE, B addl $16 * SIZE, %ecx decl %eax BRANCH jne .L102 ALIGN_2 .L103: movl K, %eax andl $7, %eax BRANCH jle .L105 ALIGN_2 .L104: movsd 0 * SIZE(B), %xmm0 unpcklpd %xmm0, %xmm0 movapd %xmm0, 0 * SIZE(%ecx) addl $1 * SIZE, B addl $2 * SIZE, %ecx decl %eax jne .L104 ALIGN_4 .L105: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L130 ALIGN_4 .L110: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movapd 0 * SIZE + BUFFER, %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE + BUFFER, %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L112 .L111: mulpd %xmm2, %xmm0 mulpd 2 * SIZE(AA), %xmm2 addpd %xmm0, %xmm4 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 movapd 2 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm0 mulpd 6 * SIZE(AA), %xmm2 addpd %xmm0, %xmm5 movapd 16 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movapd 4 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm1 mulpd 10 * SIZE(AA), %xmm2 addpd %xmm1, %xmm4 movapd 12 * SIZE(AA), %xmm1 addpd %xmm2, %xmm6 movapd 6 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm1 mulpd 14 * SIZE(AA), %xmm2 addpd %xmm1, %xmm5 movapd 24 * SIZE(AA), %xmm1 addpd %xmm2, %xmm7 movapd 16 * SIZE(BB), %xmm2 mulpd %xmm3, %xmm0 mulpd 18 * SIZE(AA), %xmm3 addpd %xmm0, %xmm4 movapd 20 * SIZE(AA), %xmm0 addpd %xmm3, %xmm6 movapd 10 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm0 mulpd 22 * SIZE(AA), %xmm3 addpd %xmm0, %xmm5 movapd 32 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movapd 12 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm1 mulpd 26 * SIZE(AA), %xmm3 addpd %xmm1, %xmm4 movapd 28 * SIZE(AA), %xmm1 
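/* 4x1 tile: even k-iterations accumulate rows 0-1 into xmm4 and rows 2-3 into xmm6, odd ones use xmm5/xmm7; the two partial sums are combined after the loop at .L114. */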
addpd %xmm3, %xmm6 movapd 14 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm1 mulpd 30 * SIZE(AA), %xmm3 addpd %xmm1, %xmm5 movapd 40 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movapd 24 * SIZE(BB), %xmm3 addl $32 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L111 .L112: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movapd ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L114 .L113: mulpd %xmm2, %xmm0 mulpd 2 * SIZE(AA), %xmm2 addpd %xmm0, %xmm4 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 movapd 2 * SIZE(BB), %xmm2 addl $4 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 subl $1, %eax jg .L113 ALIGN_4 .L114: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 mulpd %xmm3, %xmm4 mulpd %xmm3, %xmm6 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhpd 1 * SIZE(%esi), %xmm0 addpd %xmm0, %xmm4 movsd 2 * SIZE(%esi), %xmm1 movhpd 3 * SIZE(%esi), %xmm1 addpd %xmm1, %xmm6 #endif movsd %xmm4, 0 * SIZE(%esi) unpckhpd %xmm4, %xmm4 movsd %xmm4, 1 * SIZE(%esi) movsd %xmm6, 2 * SIZE(%esi) unpckhpd %xmm6, %xmm6 movsd %xmm6, 3 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $4 * SIZE, %esi # coffset += 4 BRANCH decl %ebx # i -- jg .L110 ALIGN_2 .L130: movl M, %ebx testl $2, %ebx jle .L150 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movapd 0 * SIZE + BUFFER, %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE + BUFFER, %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L132 .L131: mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 mulpd 2 * SIZE(BB), %xmm0 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 4 * SIZE(AA), %xmm0 mulpd 4 * SIZE(BB), %xmm0 addpd %xmm0, %xmm6 movapd 6 * SIZE(AA), %xmm0 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm3 movapd 10 * SIZE(AA), %xmm1 addpd %xmm3, %xmm4 mulpd 10 * SIZE(BB), %xmm1 movapd 24 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 12 * SIZE(AA), %xmm1 mulpd 12 * SIZE(BB), %xmm1 addpd %xmm1, %xmm6 movapd 14 * SIZE(AA), %xmm1 mulpd 14 * SIZE(BB), %xmm1 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $16 * SIZE, BB BRANCH decl %eax jne .L131 .L132: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movapd ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L134 .L133: movapd 0 * SIZE(AA), %xmm0 mulpd 0 * SIZE(BB), %xmm0 addpd %xmm0, %xmm4 addl $2 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L133 ALIGN_4 .L134: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 addpd %xmm6, %xmm4 mulpd %xmm3, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhpd 1 * SIZE(%esi), %xmm0 
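/* Plain GEMM loads the existing C values and adds them (C = alpha*A*B + C); the TRMM build skips the load and stores the product directly. */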
addpd %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(%esi) unpckhpd %xmm4, %xmm4 movsd %xmm4, 1 * SIZE(%esi) addl $2 * SIZE, %esi # coffset += 4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif ALIGN_2 .L150: movl M, %ebx testl $1, %ebx jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movapd 0 * SIZE + BUFFER, %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE + BUFFER, %xmm3 pxor %xmm6, %xmm6 movapd 4 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 4 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax addl $1, %eax movl %eax, KKK #endif sarl $3, %eax je .L152 .L151: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 mulsd 2 * SIZE(BB), %xmm0 movsd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 mulsd 4 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 movsd 3 * SIZE(AA), %xmm0 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 movsd 8 * SIZE(AA), %xmm0 mulsd %xmm1, %xmm3 movsd 5 * SIZE(AA), %xmm1 addsd %xmm3, %xmm4 mulsd 10 * SIZE(BB), %xmm1 movsd 24 * SIZE(BB), %xmm3 addsd %xmm1, %xmm4 movsd 6 * SIZE(AA), %xmm1 mulsd 12 * SIZE(BB), %xmm1 addsd %xmm1, %xmm4 movsd 7 * SIZE(AA), %xmm1 mulsd 14 * SIZE(BB), %xmm1 addsd %xmm1, %xmm4 movsd 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $16 * SIZE, BB BRANCH decl %eax jne .L151 .L152: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movsd ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L154 .L153: movsd 0 * SIZE(AA), %xmm0 mulsd 0 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 addl $1 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L153 ALIGN_4 .L154: addsd %xmm6, %xmm4 addsd %xmm7, %xmm5 mulsd %xmm3, %xmm4 #ifndef TRMMKERNEL addsd 0 * SIZE(%esi), %xmm4 #endif movsd %xmm4, 0 * SIZE(%esi) ALIGN_2 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_2 EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_kernel_4x4_barcelona.S000066400000000000000000001270571313527062700222030ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define OLD_M 4 + STACK(%esi) #define OLD_N 8 + STACK(%esi) #define OLD_K 12 + STACK(%esi) #define OLD_ALPHA 16 + STACK(%esi) #define OLD_A 20 + STACK(%esi) #define OLD_B 24 + STACK(%esi) #define OLD_C 28 + STACK(%esi) #define OLD_LDC 32 + STACK(%esi) #define STACK_OFFT 36 + STACK(%esi) #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define BUFFER 128(%esp) #define PREFETCH prefetch #define PREFETCHSIZE (16 * 17 + 0) #define RPREFETCHSIZE (16 * 9 + 0) #define WPREFETCHSIZE (16 * 9 + 0) #define AA %edx #define BB %ecx #define LDC %ebp #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ addps %xmm2, %xmm4; \ movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 56 * SIZE + 
(address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE, %esp movl OLD_M, %ebx andl $-1024, %esp # align stack STACK_TOUCHING movl OLD_N, %eax movl OLD_K, %ecx movl OLD_A, %edx movss OLD_ALPHA, %xmm3 #ifdef TRMMKERNEL movss STACK_OFFT, %xmm4 #endif movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK shufps $0, %xmm3, %xmm3 movl OLD_B, %edi movl OLD_C, %ebx movaps %xmm3, ALPHA movl %ebx, C movl OLD_LDC, LDC #ifdef TRMMKERNEL movss %xmm4, OFFSET movss %xmm4, KK #ifndef LEFT negl KK #endif #endif leal (, LDC, SIZE), LDC sarl $2, %eax movl %eax, J jle .L40 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ leal BUFFER, %ecx movl K, %eax sarl $1, %eax jle .L05 ALIGN_4 .L02: prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) movaps 0 * SIZE(%edi), %xmm3 movaps 4 * SIZE(%edi), %xmm7 prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 prefetchw (WPREFETCHSIZE + 16) * SIZE(%ecx) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps %xmm4, 16 * SIZE(%ecx) movaps %xmm5, 20 * SIZE(%ecx) movaps %xmm6, 24 * 
SIZE(%ecx) movaps %xmm7, 28 * SIZE(%ecx) addl $ 8 * SIZE, %edi subl $-32 * SIZE, %ecx decl %eax jne .L02 ALIGN_2 .L05: movl K, %eax andl $1, %eax BRANCH jle .L10 movaps 0 * SIZE(%edi), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) addl $4 * SIZE, %edi ALIGN_4 .L10: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB #endif movaps 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movaps 16 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movaps 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 leal (%esi, LDC, 2), %eax prefetchw 3 * SIZE(%esi) prefetchw 3 * SIZE(%esi, LDC) prefetchw 3 * SIZE(%eax) prefetchw 3 * SIZE(%eax, LDC) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $4, %eax #endif movl %eax, KKK #endif andl $-8, %eax sall $4, %eax je .L15 .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) cmpl $128 * 1, %eax jle .L12 KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) cmpl $128 * 2, %eax jle .L12 KERNEL1(32 * 2) KERNEL2(32 * 2) KERNEL3(32 * 2) KERNEL4(32 * 2) KERNEL5(32 * 2) KERNEL6(32 * 2) KERNEL7(32 * 2) KERNEL8(32 * 2) cmpl $128 * 3, %eax jle .L12 KERNEL1(32 * 3) KERNEL2(32 * 3) KERNEL3(32 * 3) KERNEL4(32 * 3) KERNEL5(32 * 3) KERNEL6(32 * 3) KERNEL7(32 * 3) KERNEL8(32 * 3) cmpl $128 * 4, %eax jle .L12 KERNEL1(32 * 4) KERNEL2(32 * 4) KERNEL3(32 * 4) KERNEL4(32 * 4) KERNEL5(32 * 4) KERNEL6(32 * 4) KERNEL7(32 * 4) KERNEL8(32 * 4) cmpl $128 * 5, %eax jle .L12 KERNEL1(32 * 5) KERNEL2(32 * 5) KERNEL3(32 * 5) KERNEL4(32 * 5) KERNEL5(32 * 5) KERNEL6(32 * 5) KERNEL7(32 * 5) KERNEL8(32 * 5) cmpl $128 * 6, %eax jle .L12 KERNEL1(32 * 6) KERNEL2(32 * 6) KERNEL3(32 * 6) KERNEL4(32 * 6) KERNEL5(32 * 6) KERNEL6(32 * 6) KERNEL7(32 * 6) KERNEL8(32 * 6) cmpl $128 * 7, %eax jle .L12 KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) addl $128 * 8 * SIZE, BB addl $128 * 2 * SIZE, AA subl $128 * 8, %eax jg .L1X jmp .L15 .L12: leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 4 * SIZE(AA), %xmm0 addl $ 4 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL shufps $0xe4, %xmm0, %xmm0 shufps $0xe4, %xmm1, %xmm1 shufps $0xe4, %xmm2, %xmm2 shufps $0xe4, %xmm3, %xmm3 mulps %xmm3, %xmm4 movsd 0 * 
SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 mulps %xmm3, %xmm5 movsd 0 * SIZE(%esi, LDC, 1), %xmm1 movhps 2 * SIZE(%esi, LDC, 1), %xmm1 mulps %xmm3, %xmm6 movsd 0 * SIZE(%esi, LDC, 2), %xmm2 movhps 2 * SIZE(%esi, LDC, 2), %xmm2 mulps %xmm3, %xmm7 movsd 0 * SIZE(%esi, %eax, 1), %xmm3 movhps 2 * SIZE(%esi, %eax, 1), %xmm3 addps %xmm0, %xmm4 addps %xmm1, %xmm5 addps %xmm2, %xmm6 addps %xmm3, %xmm7 #else mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 mulps %xmm3, %xmm6 mulps %xmm3, %xmm7 #endif movsd %xmm4, 0 * SIZE(%esi) movhps %xmm4, 2 * SIZE(%esi) movsd %xmm5, 0 * SIZE(%esi, LDC, 1) movhps %xmm5, 2 * SIZE(%esi, LDC, 1) movsd %xmm6, 0 * SIZE(%esi, LDC, 2) movhps %xmm6, 2 * SIZE(%esi, LDC, 2) movsd %xmm7, 0 * SIZE(%esi, %eax, 1) movhps %xmm7, 2 * SIZE(%esi, %eax, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $4 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L11 ALIGN_4 .L20: testl $2, M je .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movsd 8 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movsd 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movsd 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movsd 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movsd 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movsd 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movsd 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movsd 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movsd 48 * SIZE(BB), %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 36 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movsd 40 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movsd 44 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movsd 64 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movsd 52 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movsd 56 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movsd 60 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movsd 80 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movsd 68 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movsd 72 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movsd 76 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movsd 96 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movsd 84 * SIZE(BB), %xmm3 
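/* 2x4 tile: xmm4-xmm7 each accumulate one column of C for this two-row strip; only the low two lanes of xmm0/xmm1 carry A elements, and only those lanes are stored at .L28. */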
mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movsd 88 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movsd 92 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movsd 112 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movsd 100 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movsd 104 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movsd 108 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 14 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movsd 128 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movsd 116 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movsd 120 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movsd 124 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movsd 144 * SIZE(BB), %xmm3 addl $ 16 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movsd 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movsd 16 * SIZE(BB), %xmm2 addl $ 2 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL mulps %xmm3, %xmm4 movsd 0 * SIZE(%esi), %xmm0 mulps %xmm3, %xmm5 movsd 0 * SIZE(%esi, LDC, 1), %xmm1 mulps %xmm3, %xmm6 movsd 0 * SIZE(%esi, LDC, 2), %xmm2 mulps %xmm3, %xmm7 movsd 0 * SIZE(%esi, %eax, 1), %xmm3 addps %xmm0, %xmm4 addps %xmm1, %xmm5 addps %xmm2, %xmm6 addps %xmm3, %xmm7 #else mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 mulps %xmm3, %xmm6 mulps %xmm3, %xmm7 #endif movsd %xmm4, 0 * SIZE(%esi) movsd %xmm5, 0 * SIZE(%esi, LDC, 1) movsd %xmm6, 0 * SIZE(%esi, LDC, 2) movsd %xmm7, 0 * SIZE(%esi, %eax, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, %esi # coffset += 2 ALIGN_4 .L30: testl $1, M je .L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB leal (BB, %eax, 8), BB #endif movss 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movss 4 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movss 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movss 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L35 ALIGN_4 .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm5 movss 8 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 1 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 addss %xmm3, %xmm4 movss 20 * SIZE(BB), %xmm3 mulss 
%xmm0, %xmm3 addss %xmm3, %xmm5 movss 24 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 mulss 28 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 48 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm2 addss %xmm2, %xmm4 movss 36 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm5 movss 40 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 mulss 44 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 64 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 3 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 addss %xmm3, %xmm4 movss 52 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 addss %xmm3, %xmm5 movss 56 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 mulss 60 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 80 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 addss %xmm2, %xmm4 movss 68 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 addss %xmm2, %xmm5 movss 72 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 mulss 76 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 96 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 5 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 addss %xmm3, %xmm4 movss 84 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 addss %xmm3, %xmm5 movss 88 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 mulss 92 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 112 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm2 addss %xmm2, %xmm4 movss 100 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 addss %xmm2, %xmm5 movss 104 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 mulss 108 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 128 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 7 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 addss %xmm3, %xmm4 movss 116 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 addss %xmm3, %xmm5 movss 120 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 mulss 124 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 144 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 movss 4 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm5 movss 8 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 16 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 1 * SIZE(AA), %xmm0 addl $ 1 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL mulss %xmm3, %xmm4 movss 0 * SIZE(%esi), %xmm0 mulss %xmm3, %xmm5 movss 0 * SIZE(%esi, LDC, 1), %xmm1 mulss %xmm3, %xmm6 movss 0 * SIZE(%esi, LDC, 2), %xmm2 mulss %xmm3, %xmm7 movss 0 * SIZE(%esi, %eax, 1), %xmm3 addss %xmm0, %xmm4 addss %xmm1, %xmm5 addss %xmm2, %xmm6 addss %xmm3, %xmm7 #else mulss %xmm3, %xmm4 mulss %xmm3, %xmm5 mulss %xmm3, %xmm6 mulss %xmm3, %xmm7 #endif movss %xmm4, 0 * SIZE(%esi) movss %xmm5, 0 * SIZE(%esi, LDC, 1) movss %xmm6, 0 * SIZE(%esi, LDC, 2) movss %xmm7, 0 * SIZE(%esi, %eax, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB leal (BB, %eax, 8), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leal (, LDC, 4), %eax addl %eax, C # c += 4 * ldc decl J # j -- jg .L01 ALIGN_4 .L40: testl $2, N je .L80 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax leal BUFFER, %ecx sarl $2, %eax jle .L45 ALIGN_4 
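/*
 * The .L42 loop below packs the next two columns of B into BUFFER,
 * broadcasting every scalar across all four SSE lanes with pshufd so
 * the compute loops can use aligned movaps loads. Roughly equivalent
 * C (an illustrative sketch only, not part of the build; buffer, b
 * and K name the packed buffer, the source panel and the depth):
 *
 *   for (int k = 0; k < K; k++)
 *     for (int j = 0; j < 2; j++)        // two columns in this panel
 *       for (int l = 0; l < 4; l++)      // broadcast to 4 lanes
 *         buffer[8 * k + 4 * j + l] = b[2 * k + j];
 */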
.L42: prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) movaps 0 * SIZE(%edi), %xmm3 movaps 4 * SIZE(%edi), %xmm7 prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 prefetchw (WPREFETCHSIZE + 16) * SIZE(%ecx) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps %xmm4, 16 * SIZE(%ecx) movaps %xmm5, 20 * SIZE(%ecx) movaps %xmm6, 24 * SIZE(%ecx) movaps %xmm7, 28 * SIZE(%ecx) addl $ 8 * SIZE, %edi subl $-32 * SIZE, %ecx decl %eax jne .L42 ALIGN_4 .L45: movl K, %eax andl $3, %eax BRANCH jle .L50 ALIGN_4 .L46: movsd 0 * SIZE(%edi), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) addl $2 * SIZE, %edi addl $8 * SIZE, %ecx decl %eax jne .L46 ALIGN_4 .L50: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movaps 0 * SIZE(AA), %xmm0 movaps 16 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 prefetchw 3 * SIZE(%esi) prefetchw 3 * SIZE(%esi, LDC) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L55 ALIGN_4 .L52: mulps %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 8 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 20 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps 12 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 28 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps 48 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 mulps 36 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 40 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 20 * SIZE(AA), %xmm1 mulps %xmm1, %xmm2 mulps 44 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 64 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 24 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 52 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 80 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: mulps %xmm0, %xmm2 mulps 4 * 
SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 0 * SIZE(%esi, LDC, 1), %xmm1 movhps 2 * SIZE(%esi, LDC, 1), %xmm1 addps %xmm0, %xmm4 addps %xmm1, %xmm5 #endif movsd %xmm4, 0 * SIZE(%esi) movhps %xmm4, 2 * SIZE(%esi) movsd %xmm5, 0 * SIZE(%esi, LDC, 1) movhps %xmm5, 2 * SIZE(%esi, LDC, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $4 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L51 ALIGN_4 .L60: testl $2, M je .L70 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movsd 0 * SIZE(AA), %xmm0 movsd 8 * SIZE(AA), %xmm1 movsd 0 * SIZE(BB), %xmm2 movsd 16 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L65 ALIGN_4 .L62: #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movsd 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movsd 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movsd 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movsd 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movsd 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movsd 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movsd 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movsd 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movsd 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movsd 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movsd 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movsd 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movsd 80 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L66 ALIGN_4 .L68: addps %xmm6, 
%xmm4 addps %xmm7, %xmm5 mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movsd 0 * SIZE(%esi, LDC, 1), %xmm1 addps %xmm0, %xmm4 addps %xmm1, %xmm5 #endif movsd %xmm4, 0 * SIZE(%esi) movsd %xmm5, 0 * SIZE(%esi, LDC, 1) addl $2 * SIZE, %esi # coffset += 2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif ALIGN_4 .L70: testl $1, M je .L79 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movss 0 * SIZE(AA), %xmm0 movss 4 * SIZE(AA), %xmm1 movss 0 * SIZE(BB), %xmm2 movss 16 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L75 ALIGN_4 .L72: mulss %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 20 * SIZE(BB), %xmm0 addss %xmm3, %xmm4 movss 24 * SIZE(BB), %xmm3 addss %xmm0, %xmm5 movss 3 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 28 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 48 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 mulss 36 * SIZE(BB), %xmm1 addss %xmm2, %xmm4 movss 40 * SIZE(BB), %xmm2 addss %xmm1, %xmm5 movss 5 * SIZE(AA), %xmm1 mulss %xmm1, %xmm2 mulss 44 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 64 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 52 * SIZE(BB), %xmm1 addss %xmm3, %xmm4 movss 56 * SIZE(BB), %xmm3 addss %xmm1, %xmm5 movss 7 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 60 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 80 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulss %xmm0, %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 addl $ 1 * SIZE, AA addl $ 8 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: addss %xmm6, %xmm4 addss %xmm7, %xmm5 mulss %xmm3, %xmm4 mulss %xmm3, %xmm5 #ifndef TRMMKERNEL movss 0 * SIZE(%esi), %xmm0 movss 0 * SIZE(%esi, LDC, 1), %xmm1 addss %xmm0, %xmm4 addss %xmm1, %xmm5 #endif movss %xmm4, 0 * SIZE(%esi) movss %xmm5, 0 * SIZE(%esi, LDC, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif 
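/* With a LEFT-side triangular operand KK counts the rows already processed, so it advances by the height of the tile just finished (2 rows here); in the RIGHT-side case KK is instead bumped once per column block at .L79. */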
#if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax addl %eax, C ALIGN_4 .L80: testl $1, N je .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif leal BUFFER, %ecx movl K, %eax sarl $3, %eax jle .L85 ALIGN_4 .L82: prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) movups 0 * SIZE(%edi), %xmm3 movups 4 * SIZE(%edi), %xmm7 prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 prefetchw (WPREFETCHSIZE + 16) * SIZE(%ecx) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps %xmm4, 16 * SIZE(%ecx) movaps %xmm5, 20 * SIZE(%ecx) movaps %xmm6, 24 * SIZE(%ecx) movaps %xmm7, 28 * SIZE(%ecx) addl $ 8 * SIZE, %edi subl $-32 * SIZE, %ecx decl %eax jne .L82 ALIGN_4 .L85: movl K, %eax andl $7, %eax BRANCH jle .L90 ALIGN_4 .L86: movss 0 * SIZE(%edi), %xmm3 pshufd $0x00, %xmm3, %xmm0 movaps %xmm0, 0 * SIZE(%ecx) addl $1 * SIZE, %edi addl $4 * SIZE, %ecx decl %eax jne .L86 ALIGN_4 .L90: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L100 ALIGN_4 .L91: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movaps 0 * SIZE(AA), %xmm0 movaps 16 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 prefetchw 3 * SIZE(%esi) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L95 ALIGN_4 .L92: mulps %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm0, %xmm5 movaps 8 * SIZE(AA), %xmm0 mulps 8 * SIZE(BB), %xmm0 addps %xmm0, %xmm6 movaps 12 * SIZE(AA), %xmm0 mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 movaps 20 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movaps 48 * SIZE(BB), %xmm3 mulps 20 * SIZE(BB), %xmm1 addps %xmm1, %xmm5 movaps 24 * SIZE(AA), %xmm1 mulps 24 * SIZE(BB), %xmm1 addps %xmm1, %xmm6 movaps 28 * SIZE(AA), %xmm1 mulps 28 * SIZE(BB), %xmm1 addps %xmm1, %xmm7 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L92 ALIGN_4 .L95: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(BB), %xmm2 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L96 ALIGN_4 .L98: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 mulps %xmm3, %xmm4 #ifndef TRMMKERNEL movsd 
0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 addps %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(%esi) movhps %xmm4, 2 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $4 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L91 ALIGN_4 .L100: testl $2, M je .L110 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movsd 0 * SIZE(AA), %xmm0 movsd 8 * SIZE(AA), %xmm1 movsd 0 * SIZE(BB), %xmm2 movsd 16 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L105 ALIGN_4 .L102: mulps %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 movsd 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 16 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movsd 32 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 movsd 10 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movsd 20 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 24 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm6 movsd 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movsd 48 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L102 ALIGN_4 .L105: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L108 ALIGN_4 .L106: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 2 * SIZE(AA), %xmm0 movsd 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L106 ALIGN_4 .L108: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 mulps %xmm3, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 addps %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, %esi # coffset += 2 ALIGN_4 .L110: testl $1, M je .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movss 0 * SIZE(AA), %xmm0 movss 4 * SIZE(AA), %xmm1 movss 0 
* SIZE(BB), %xmm2 movss 16 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L115 ALIGN_4 .L112: mulss %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 movss 32 * SIZE(BB), %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm0, %xmm5 movss 2 * SIZE(AA), %xmm0 mulss 8 * SIZE(BB), %xmm0 addss %xmm0, %xmm6 movss 3 * SIZE(AA), %xmm0 mulss 12 * SIZE(BB), %xmm0 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm3 movss 5 * SIZE(AA), %xmm1 addss %xmm3, %xmm4 movss 48 * SIZE(BB), %xmm3 mulss 20 * SIZE(BB), %xmm1 addss %xmm1, %xmm5 movss 6 * SIZE(AA), %xmm1 mulss 24 * SIZE(BB), %xmm1 addss %xmm1, %xmm6 movss 7 * SIZE(AA), %xmm1 mulss 28 * SIZE(BB), %xmm1 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L112 ALIGN_4 .L115: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulss %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 movss 4 * SIZE(BB), %xmm2 addl $ 1 * SIZE, AA addl $ 4 * SIZE, BB decl %eax jg .L116 ALIGN_4 .L118: addss %xmm5, %xmm4 addss %xmm7, %xmm6 addss %xmm6, %xmm4 mulss %xmm3, %xmm4 #ifndef TRMMKERNEL movss 0 * SIZE(%esi), %xmm0 addss %xmm0, %xmm4 #endif movss %xmm4, 0 * SIZE(%esi) ALIGN_4 .L999: movl OLD_STACK, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_kernel_4x4_penryn.S000066400000000000000000001024471313527062700215640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 20 + STACK + ARGS(%esp) #define ARG_B 24 + STACK + ARGS(%esp) #define C 28 + STACK + ARGS(%esp) #define ARG_LDC 32 + STACK + ARGS(%esp) #define OFFSET 36 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #ifdef NANO #define PREFETCHSIZE (16 * 3 + 8) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 #endif #ifdef NEHALEM #define PREFETCHSIZE (16 * 1 - 8) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 #endif #ifdef SANDYBRIDGE #define PREFETCHSIZE (16 * 1 - 8) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 #endif #ifndef PREFETCH #define PREFETCH prefetcht0 #endif #ifndef PREFETCHW #define PREFETCHW prefetcht0 #endif #ifndef PREFETCHB #define PREFETCHB prefetcht0 #endif #ifndef PREFETCHSIZE #define PREFETCHSIZE (16 * 13 + 8) #endif #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define C1 %esi #define I %ebx PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC #ifdef TRMMKERNEL movl OFFSET, %eax #ifndef LEFT negl %eax #endif movl %eax, KK #endif subl $-32 * SIZE, A subl $-32 * SIZE, B leal (, LDC, SIZE), LDC movl N, %eax sarl $2, %eax movl %eax, J jle .L40 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sall $BASE_SHIFT + 2, %eax leal (B, %eax), %eax movl %eax, BX movl C, C1 movl A, AA movl M, I sarl $2, I jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #endif movl BX, %eax PREFETCHB -32 * SIZE(%eax) subl $-16 * SIZE, %eax movl %eax, BX leal (C1, LDC, 2), %eax movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 xorps %xmm4, %xmm4 PREFETCHW 3 * SIZE(C1) xorps %xmm5, %xmm5 PREFETCHW 7 * SIZE(C1, LDC) xorps %xmm6, %xmm6 PREFETCHW 3 * SIZE(%eax) xorps %xmm7, %xmm7 PREFETCHW 7 * SIZE(%eax, LDC) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps 
-24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 #if !(defined(NEHALEM) || defined(SANDYBRIDGE)) PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) #endif pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -8 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 subl $-32 * SIZE, BB pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 subl $-32 * SIZE, AA pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -32 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -32 * SIZE(AA), %xmm0 subl $1, %eax jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L18 ALIGN_4 .L16: addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: addps %xmm3, %xmm6 addps %xmm2, %xmm7 movss ALPHA, %xmm3 pshufd $0x39, %xmm5, %xmm2 pshufd $0x4e, %xmm6, %xmm0 pshufd $0x93, %xmm7, %xmm7 movaps %xmm4, %xmm6 unpcklps %xmm0, %xmm4 unpckhps %xmm0, %xmm6 movaps %xmm2, %xmm1 unpcklps %xmm7, %xmm2 unpckhps %xmm7, %xmm1 movaps %xmm4, %xmm5 unpcklps %xmm2, %xmm4 unpckhps %xmm2, %xmm5 movaps %xmm6, %xmm7 unpcklps %xmm1, %xmm6 unpckhps %xmm1, %xmm7 pshufd $0x93, %xmm5, %xmm5 pshufd $0x4e, %xmm6, %xmm6 pshufd $0x39, %xmm7, %xmm7 shufps $0, %xmm3, %xmm3 mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 mulps %xmm3, %xmm6 mulps %xmm3, %xmm7 leal (C1, LDC, 2), %eax #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhps 2 * SIZE(C1), %xmm0 movsd 0 * SIZE(C1, LDC), %xmm1 movhps 2 * SIZE(C1, LDC), %xmm1 movsd 0 * SIZE(%eax), %xmm2 movhps 2 * SIZE(%eax), %xmm2 movsd 0 * SIZE(%eax, LDC), %xmm3 movhps 2 * SIZE(%eax, LDC), %xmm3 addps %xmm0, %xmm4 addps %xmm1, %xmm5 addps %xmm2, %xmm6 addps %xmm3, %xmm7 #endif movsd %xmm4, 0 * SIZE(C1) movhps %xmm4, 2 * SIZE(C1) movsd %xmm5, 0 * SIZE(C1, LDC) movhps %xmm5, 2 * SIZE(C1, LDC) movsd %xmm6, 0 * SIZE(%eax) movhps %xmm6, 2 * 
SIZE(%eax) movsd %xmm7, 0 * SIZE(%eax, LDC) movhps %xmm7, 2 * SIZE(%eax, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $4 * SIZE, C1 decl I jg .L11 ALIGN_4 .L20: movl M, I testl $2, I jle .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif pxor %xmm4, %xmm4 movaps -32 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movaps -32 * SIZE(BB), %xmm1 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm2 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 pshufd $0xee, %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 pshufd $0xfa, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm7 pshufd $0x44, %xmm0, %xmm2 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 pshufd $0xee, %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 pshufd $0xfa, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm7 pshufd $0x44, %xmm0, %xmm2 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -12 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 pshufd $0xee, %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 pshufd $0xfa, %xmm1, %xmm3 movaps -8 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm7 pshufd $0x44, %xmm0, %xmm2 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -4 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 pshufd $0xee, %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 pshufd $0xfa, %xmm1, %xmm3 movaps 0 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm7 subl $-16 * SIZE, AA subl $-32 * SIZE, BB subl $1, %eax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L28 ALIGN_4 .L26: pshufd $0x44, %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: movss ALPHA, %xmm1 addps %xmm6, %xmm4 addps %xmm7, %xmm5 shufps $0, %xmm1, %xmm1 mulps %xmm1, %xmm4 mulps %xmm1, %xmm5 leal (C1, LDC, 2), %eax #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhps 0 * SIZE(C1, LDC), %xmm0 movsd 0 * SIZE(%eax), %xmm1 movhps 0 * SIZE(%eax, LDC), %xmm1 addps %xmm0, %xmm4 addps 
%xmm1, %xmm5 #endif movsd %xmm4, 0 * SIZE(C1) movhps %xmm4, 0 * SIZE(C1, LDC) movsd %xmm5, 0 * SIZE(%eax) movhps %xmm5, 0 * SIZE(%eax, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, C1 ALIGN_4 .L30: movl M, I testl $1, I jle .L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax addl %eax, AA leal (BB, %eax, 4), BB #endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movaps -32 * SIZE(BB), %xmm1 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -24 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -20 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -28 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -16 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -26 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -8 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -4 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -24 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps 0 * SIZE(BB), %xmm1 subl $ -8 * SIZE, AA subl $-32 * SIZE, BB subl $1, %eax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L38 ALIGN_4 .L36: pshufd $0x00, %xmm0, %xmm2 movss -31 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: movss ALPHA, %xmm1 shufps $0, %xmm1, %xmm1 mulps %xmm1, %xmm4 pshufd $0xff, %xmm4, %xmm7 pshufd $0xaa, %xmm4, %xmm6 pshufd $0x55, %xmm4, %xmm5 pshufd $0x00, %xmm4, %xmm4 leal (C1, LDC, 2), %eax #ifndef TRMMKERNEL movss 0 * SIZE(C1), %xmm0 movss 0 * SIZE(C1, LDC), %xmm1 movss 0 * SIZE(%eax), %xmm2 movss 0 * SIZE(%eax, LDC), %xmm3 addss %xmm0, %xmm4 addss %xmm1, %xmm5 addss %xmm2, %xmm6 addss %xmm3, %xmm7 #endif movss %xmm4, 0 * SIZE(C1) movss %xmm5, 0 * SIZE(C1, LDC) movss %xmm6, 0 * SIZE(%eax) movss %xmm7, 0 * SIZE(%eax, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax addl %eax, AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif movl BB, B leal (, LDC, 4), %eax addl %eax, C decl J jg .L01 ALIGN_4 .L40: movl N, %eax testl $2, %eax jle .L70 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, C1 movl A, AA 
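/* Annotation (added for readability; not part of the upstream OpenBLAS source):
   the .L70 block entered above handles the N & 1 tail of this 4x4 single-precision
   GEMM kernel -- the single remaining column of C.  M is first unrolled by 4 (.L71),
   then the M & 2 (.L80) and M & 1 (.L90) remainders are handled, mirroring the
   4-column (.L01) and 2-column (.L40) paths earlier in the file. */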
movl M, I sarl $2, I jle .L50 ALIGN_4 .L41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 prefetcht0 3 * SIZE(C1) pxor %xmm5, %xmm5 prefetcht0 3 * SIZE(C1, LDC) pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0xff, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0xff, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -16 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -12 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0xff, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -8 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -4 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0xff, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps 0 * SIZE(AA), %xmm0 subl $-32 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L48 ALIGN_4 .L46: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: movss ALPHA, %xmm1 addps %xmm6, %xmm4 addps %xmm7, %xmm5 addps %xmm2, %xmm4 addps %xmm3, %xmm5 shufps $0, %xmm1, %xmm1 mulps %xmm1, %xmm4 mulps %xmm1, %xmm5 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhps 2 * SIZE(C1), %xmm0 movsd 0 * SIZE(C1, LDC), %xmm1 movhps 2 * SIZE(C1, LDC), %xmm1 addps %xmm0, %xmm4 addps %xmm1, %xmm5 #endif movsd %xmm4, 0 * SIZE(C1) movhps %xmm4, 2 * SIZE(C1) movsd %xmm5, 0 * SIZE(C1, LDC) movhps %xmm5, 2 * SIZE(C1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $4 * SIZE, C1 decl I jg .L41 ALIGN_4 .L50: movl M, I testl $2, I jle .L60 #if 
!defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm3, %xmm3 movaps -32 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L55 ALIGN_4 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm2 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 pshufd $0xee, %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 pshufd $0x44, %xmm0, %xmm2 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 pshufd $0xee, %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 pshufd $0x44, %xmm0, %xmm2 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 pshufd $0xee, %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 pshufd $0x44, %xmm0, %xmm2 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 pshufd $0xee, %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 subl $-16 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L58 ALIGN_4 .L56: pshufd $0x44, %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: movss ALPHA, %xmm1 addps %xmm3, %xmm4 addps %xmm5, %xmm4 shufps $0, %xmm1, %xmm1 mulps %xmm1, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhps 0 * SIZE(C1, LDC), %xmm0 addps %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(C1) movhps %xmm4, 0 * SIZE(C1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, C1 ALIGN_4 .L60: movl M, I testl $1, I jle .L69 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax addl %eax, AA leal (BB, %eax, 2), BB #endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movsd -32 * SIZE(BB), %xmm1 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L65 ALIGN_4 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -30 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, 
%xmm5 movsd -28 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -26 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -28 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm5 movsd -24 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -22 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -26 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm5 movsd -20 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -18 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -24 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm5 movsd -16 * SIZE(BB), %xmm1 subl $ -8 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L68 ALIGN_4 .L66: pshufd $0x00, %xmm0, %xmm2 movss -31 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -30 * SIZE(BB), %xmm1 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L66 ALIGN_4 .L68: movss ALPHA, %xmm1 addps %xmm5, %xmm4 shufps $0, %xmm1, %xmm1 mulps %xmm1, %xmm4 pshufd $0x55, %xmm4, %xmm5 pshufd $0x00, %xmm4, %xmm4 #ifndef TRMMKERNEL movss 0 * SIZE(C1), %xmm0 movss 0 * SIZE(C1, LDC), %xmm1 addss %xmm0, %xmm4 addss %xmm1, %xmm5 #endif movss %xmm4, 0 * SIZE(C1) movss %xmm5, 0 * SIZE(C1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax addl %eax, AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L69: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif movl BB, B leal (, LDC, 2), %eax addl %eax, C ALIGN_4 .L70: movl N, %eax testl $1, %eax jle .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, C1 movl A, AA movl M, I sarl $2, I jle .L80 ALIGN_4 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA addl %eax, BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movsd -32 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 prefetcht0 3 * SIZE(C1) pxor %xmm5, %xmm5 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x55, %xmm1, %xmm2 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x55, %xmm1, %xmm2 movsd -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x55, %xmm1, %xmm2 movsd -26 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x55, %xmm1, %xmm2 movsd -24 * SIZE(BB), %xmm1 mulps 
%xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 subl $-32 * SIZE, AA subl $ -8 * SIZE, BB subl $1, %eax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L78 ALIGN_4 .L76: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 movss -31 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: movss ALPHA, %xmm1 addps %xmm2, %xmm4 addps %xmm5, %xmm4 shufps $0, %xmm1, %xmm1 mulps %xmm1, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhps 2 * SIZE(C1), %xmm0 addps %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(C1) movhps %xmm4, 2 * SIZE(C1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA addl %eax, BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $4 * SIZE, C1 decl I jg .L71 ALIGN_4 .L80: movl M, I testl $2, I jle .L90 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA addl %eax, BB #endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm3, %xmm3 movsd -32 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L85 ALIGN_4 .L82: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x55, %xmm1, %xmm2 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movsd -26 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x55, %xmm1, %xmm2 movsd -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movsd -22 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x55, %xmm1, %xmm2 movsd -26 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movsd -18 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x55, %xmm1, %xmm2 movsd -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -16 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 subl $-16 * SIZE, AA subl $ -8 * SIZE, BB subl $1, %eax jne .L82 ALIGN_4 .L85: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L88 ALIGN_4 .L86: pshufd $0x00, %xmm1, %xmm2 movss -31 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 addl $2 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L86 ALIGN_4 .L88: movss ALPHA, %xmm1 addps %xmm5, %xmm4 shufps $0, %xmm1, %xmm1 mulps %xmm1, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 addps %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(C1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA addl %eax, BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, C1 ALIGN_4 .L90: movl M, I testl $1, I jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax addl %eax, AA addl %eax, BB #endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movsd -32 * SIZE(BB), %xmm1 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L95 ALIGN_4 .L92: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulps %xmm0, %xmm1 movsd -30 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movsd -28 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movsd -26 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -26 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movsd -24 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -24 * SIZE(BB), %xmm1 subl $-8 * SIZE, AA subl $-8 * SIZE, BB subl $1, %eax jne .L92 ALIGN_4 .L95: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L98 ALIGN_4 .L96: mulss %xmm0, %xmm1 movss -31 * SIZE(AA), %xmm0 addss %xmm1, %xmm4 movss -31 * SIZE(BB), %xmm1 addl $1 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L96 ALIGN_4 .L98: movss ALPHA, %xmm1 haddps %xmm4, %xmm4 mulss %xmm1, %xmm4 #ifndef TRMMKERNEL movss 0 * SIZE(C1), %xmm0 addss %xmm0, %xmm4 #endif movss %xmm4, 0 * SIZE(C1) ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_kernel_4x4_sse.S000066400000000000000000001537021313527062700210430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define OLD_M 4 + STACK(%esi) #define OLD_N 8 + STACK(%esi) #define OLD_K 12 + STACK(%esi) #define OLD_ALPHA 16 + STACK(%esi) #define OLD_A 20 + STACK(%esi) #define OLD_B 24 + STACK(%esi) #define OLD_C 28 + STACK(%esi) #define OLD_LDC 32 + STACK(%esi) #define STACK_OFFT 36 + STACK(%esi) #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define BX 40(%esp) #define OLD_STACK 44(%esp) #define OFFSET 48(%esp) #define KK 52(%esp) #define KKK 56(%esp) #define BUFFER 128(%esp) #ifdef ATHLON #define PREFETCH prefetch #define PREFETCHSIZE 64 #endif #if defined(OPTERON) || defined(BARCELONA) #define PREFETCH prefetch #define PREFETCHSIZE (16 * 10 + 8) #endif #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHSIZE 96 #endif #define AA %edx #define BB %ecx #define LDC %ebp #if defined(OPTERON) || !defined(HAVE_SSE2) #define movsd movlps #endif #ifdef HAVE_SSE2 #define xorps pxor #endif #if defined(OPTERON) || defined(BARCELONA) #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulps 
%xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; #endif #ifdef PENTIUM4 #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ addps %xmm2, %xmm5; \ movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 96 * SIZE + (address) * 4 * SIZE(BB), 
%xmm2; \ addps %xmm1, %xmm7; \ movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #endif PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE, %esp movl OLD_M, %ebx andl $-1024, %esp # align stack STACK_TOUCHING movl OLD_N, %eax movl OLD_K, %ecx movl OLD_A, %edx movss OLD_ALPHA, %xmm3 #ifdef TRMMKERNEL movss STACK_OFFT, %xmm4 #endif movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK shufps $0, %xmm3, %xmm3 movl OLD_B, %edi movl OLD_C, %ebx movaps %xmm3, ALPHA movl %ebx, C movl OLD_LDC, LDC #ifdef TRMMKERNEL movss %xmm4, OFFSET movss %xmm4, KK #ifndef LEFT negl KK #endif #endif leal (, LDC, SIZE), LDC sarl $2, %eax movl %eax, J jle .L40 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ leal BUFFER, %ecx movl K, %eax sarl $1, %eax jle .L05 ALIGN_4 .L02: #ifdef HAVE_SSE2 movss 0 * SIZE(%edi), %xmm0 movss 1 * SIZE(%edi), %xmm1 movss 2 * SIZE(%edi), %xmm2 movss 3 * SIZE(%edi), %xmm3 movss 4 * SIZE(%edi), %xmm4 movss 5 * SIZE(%edi), %xmm5 movss 6 * SIZE(%edi), %xmm6 movss 7 * SIZE(%edi), %xmm7 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 shufps $0, %xmm4, %xmm4 shufps $0, %xmm5, %xmm5 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps %xmm4, 16 * SIZE(%ecx) movaps %xmm5, 20 * SIZE(%ecx) movaps %xmm6, 24 * SIZE(%ecx) movaps %xmm7, 28 * SIZE(%ecx) #else movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 movd 2 * SIZE(%edi), %mm2 movd 3 * SIZE(%edi), %mm3 movd 4 * SIZE(%edi), %mm4 movd 5 * SIZE(%edi), %mm5 movd 6 * SIZE(%edi), %mm6 movd 7 * SIZE(%edi), %mm7 movd %mm0, 0 * SIZE(%ecx) movd %mm0, 1 * SIZE(%ecx) movd %mm0, 2 * SIZE(%ecx) movd %mm0, 3 * SIZE(%ecx) movd %mm1, 4 * SIZE(%ecx) movd %mm1, 5 * SIZE(%ecx) movd %mm1, 6 * SIZE(%ecx) movd %mm1, 7 * SIZE(%ecx) movd %mm2, 8 * SIZE(%ecx) movd %mm2, 9 * SIZE(%ecx) movd %mm2, 10 * SIZE(%ecx) movd %mm2, 11 * SIZE(%ecx) movd %mm3, 12 * SIZE(%ecx) movd %mm3, 13 * SIZE(%ecx) movd %mm3, 14 * SIZE(%ecx) 
movd %mm3, 15 * SIZE(%ecx) movd %mm4, 16 * SIZE(%ecx) movd %mm4, 17 * SIZE(%ecx) movd %mm4, 18 * SIZE(%ecx) movd %mm4, 19 * SIZE(%ecx) movd %mm5, 20 * SIZE(%ecx) movd %mm5, 21 * SIZE(%ecx) movd %mm5, 22 * SIZE(%ecx) movd %mm5, 23 * SIZE(%ecx) movd %mm6, 24 * SIZE(%ecx) movd %mm6, 25 * SIZE(%ecx) movd %mm6, 26 * SIZE(%ecx) movd %mm6, 27 * SIZE(%ecx) movd %mm7, 28 * SIZE(%ecx) movd %mm7, 29 * SIZE(%ecx) movd %mm7, 30 * SIZE(%ecx) movd %mm7, 31 * SIZE(%ecx) #endif #ifdef PENTIUM4 prefetcht2 112 * SIZE(%ecx) #endif #if defined(OPTERON) || defined(BARCELONA) prefetchnta 80 * SIZE(%edi) prefetchw 112 * SIZE(%ecx) prefetchw 120 * SIZE(%ecx) #endif addl $ 8 * SIZE, %edi addl $32 * SIZE, %ecx decl %eax jne .L02 ALIGN_2 .L05: movl K, %eax andl $1, %eax BRANCH jle .L10 #ifdef HAVE_SSE2 movss 0 * SIZE(%edi), %xmm0 movss 1 * SIZE(%edi), %xmm1 movss 2 * SIZE(%edi), %xmm2 movss 3 * SIZE(%edi), %xmm3 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) #else movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 movd 2 * SIZE(%edi), %mm2 movd 3 * SIZE(%edi), %mm3 movd %mm0, 0 * SIZE(%ecx) movd %mm0, 1 * SIZE(%ecx) movd %mm0, 2 * SIZE(%ecx) movd %mm0, 3 * SIZE(%ecx) movd %mm1, 4 * SIZE(%ecx) movd %mm1, 5 * SIZE(%ecx) movd %mm1, 6 * SIZE(%ecx) movd %mm1, 7 * SIZE(%ecx) movd %mm2, 8 * SIZE(%ecx) movd %mm2, 9 * SIZE(%ecx) movd %mm2, 10 * SIZE(%ecx) movd %mm2, 11 * SIZE(%ecx) movd %mm3, 12 * SIZE(%ecx) movd %mm3, 13 * SIZE(%ecx) movd %mm3, 14 * SIZE(%ecx) movd %mm3, 15 * SIZE(%ecx) #endif addl $4 * SIZE, %edi ALIGN_4 .L10: movl %edi, BX movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB #endif movl BX, %eax #ifdef HAVE_SSE prefetcht2 0 * SIZE(%eax) prefetcht2 4 * SIZE(%eax) #if L2_SIZE > 262144 subl $-8 * SIZE, BX #elif L2_SIZE > 131072 prefetcht2 8 * SIZE(%eax) prefetcht2 12 * SIZE(%eax) subl $-16 * SIZE, BX #else prefetcht2 16 * SIZE(%eax) prefetcht2 20 * SIZE(%eax) prefetcht2 24 * SIZE(%eax) prefetcht2 28 * SIZE(%eax) subl $-32 * SIZE, BX #endif #endif movaps 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movaps 16 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movaps 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 leal (LDC, LDC, 2), %eax #if defined(OPTERON) || defined(BARCELONA) prefetchw 4 * SIZE(%esi) prefetchw 4 * SIZE(%esi, LDC) prefetchw 4 * SIZE(%esi, LDC, 2) prefetchw 4 * SIZE(%esi, %eax) #endif #ifdef PENTIUM4 prefetchnta 4 * SIZE(%esi) prefetchnta 4 * SIZE(%esi, LDC) prefetchnta 4 * SIZE(%esi, LDC, 2) prefetchnta 4 * SIZE(%esi, %eax) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $4, %eax #endif movl %eax, KKK #endif #if 1 andl $-8, %eax sall $4, %eax je .L15 .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) cmpl $128 * 1, %eax jle .L12 KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 
* 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) cmpl $128 * 2, %eax jle .L12 KERNEL1(32 * 2) KERNEL2(32 * 2) KERNEL3(32 * 2) KERNEL4(32 * 2) KERNEL5(32 * 2) KERNEL6(32 * 2) KERNEL7(32 * 2) KERNEL8(32 * 2) cmpl $128 * 3, %eax jle .L12 KERNEL1(32 * 3) KERNEL2(32 * 3) KERNEL3(32 * 3) KERNEL4(32 * 3) KERNEL5(32 * 3) KERNEL6(32 * 3) KERNEL7(32 * 3) KERNEL8(32 * 3) cmpl $128 * 4, %eax jle .L12 KERNEL1(32 * 4) KERNEL2(32 * 4) KERNEL3(32 * 4) KERNEL4(32 * 4) KERNEL5(32 * 4) KERNEL6(32 * 4) KERNEL7(32 * 4) KERNEL8(32 * 4) cmpl $128 * 5, %eax jle .L12 KERNEL1(32 * 5) KERNEL2(32 * 5) KERNEL3(32 * 5) KERNEL4(32 * 5) KERNEL5(32 * 5) KERNEL6(32 * 5) KERNEL7(32 * 5) KERNEL8(32 * 5) cmpl $128 * 6, %eax jle .L12 KERNEL1(32 * 6) KERNEL2(32 * 6) KERNEL3(32 * 6) KERNEL4(32 * 6) KERNEL5(32 * 6) KERNEL6(32 * 6) KERNEL7(32 * 6) KERNEL8(32 * 6) cmpl $128 * 7, %eax jle .L12 KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) addl $128 * 8 * SIZE, BB addl $128 * 2 * SIZE, AA subl $128 * 8, %eax jg .L1X jmp .L15 .L12: leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB ALIGN_4 #else sarl $3, %eax je .L15 ALIGN_4 .L12: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) addl $128 * SIZE, BB addl $32 * SIZE, AA decl %eax jne .L12 ALIGN_4 #endif .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 4 * SIZE(AA), %xmm0 addl $ 4 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL shufps $0xe4, %xmm0, %xmm0 shufps $0xe4, %xmm1, %xmm1 shufps $0xe4, %xmm2, %xmm2 shufps $0xe4, %xmm3, %xmm3 mulps %xmm3, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 mulps %xmm3, %xmm5 movsd 0 * SIZE(%esi, LDC, 1), %xmm1 movhps 2 * SIZE(%esi, LDC, 1), %xmm1 mulps %xmm3, %xmm6 movsd 0 * SIZE(%esi, LDC, 2), %xmm2 movhps 2 * SIZE(%esi, LDC, 2), %xmm2 mulps %xmm3, %xmm7 movsd 0 * SIZE(%esi, %eax, 1), %xmm3 movhps 2 * SIZE(%esi, %eax, 1), %xmm3 addps %xmm0, %xmm4 addps %xmm1, %xmm5 addps %xmm2, %xmm6 addps %xmm3, %xmm7 #else mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 mulps %xmm3, %xmm6 mulps %xmm3, %xmm7 #endif movlps %xmm4, 0 * SIZE(%esi) movhps %xmm4, 2 * SIZE(%esi) movlps %xmm5, 0 * SIZE(%esi, LDC, 1) movhps %xmm5, 2 * SIZE(%esi, LDC, 1) movlps %xmm6, 0 * SIZE(%esi, LDC, 2) movhps %xmm6, 2 * SIZE(%esi, LDC, 2) movlps %xmm7, 0 * SIZE(%esi, %eax, 1) movhps %xmm7, 2 * SIZE(%esi, %eax, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $4 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L11 ALIGN_4 .L20: testl $2, M je .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif movaps 
0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movaps 8 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movaps 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 68 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movaps 72 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 76 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 96 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 84 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 88 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 92 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 112 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 100 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movaps 104 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 108 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 14 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 128 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 116 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 120 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 124 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 144 * SIZE(BB), %xmm3 addl $ 16 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 16 * SIZE(BB), %xmm2 addl $ 2 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL mulps %xmm3, %xmm4 #ifdef movsd xorps %xmm0, %xmm0 
#endif movsd 0 * SIZE(%esi), %xmm0 mulps %xmm3, %xmm5 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(%esi, LDC, 1), %xmm1 mulps %xmm3, %xmm6 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 0 * SIZE(%esi, LDC, 2), %xmm2 mulps %xmm3, %xmm7 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 0 * SIZE(%esi, %eax, 1), %xmm3 addps %xmm0, %xmm4 addps %xmm1, %xmm5 addps %xmm2, %xmm6 addps %xmm3, %xmm7 #else mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 mulps %xmm3, %xmm6 mulps %xmm3, %xmm7 #endif movlps %xmm4, 0 * SIZE(%esi) movlps %xmm5, 0 * SIZE(%esi, LDC, 1) movlps %xmm6, 0 * SIZE(%esi, LDC, 2) movlps %xmm7, 0 * SIZE(%esi, %eax, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, %esi # coffset += 2 ALIGN_4 .L30: testl $1, M je .L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB leal (BB, %eax, 8), BB #endif movss 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movss 4 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movss 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movss 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L35 ALIGN_4 .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm5 movss 8 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 1 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 addss %xmm3, %xmm4 movss 20 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 addss %xmm3, %xmm5 movss 24 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 mulss 28 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 48 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm2 addss %xmm2, %xmm4 movss 36 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm5 movss 40 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 mulss 44 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 64 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 3 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 addss %xmm3, %xmm4 movss 52 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 addss %xmm3, %xmm5 movss 56 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 mulss 60 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 80 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 addss %xmm2, %xmm4 movss 68 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 addss %xmm2, %xmm5 movss 72 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 mulss 76 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 96 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 5 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 addss %xmm3, %xmm4 movss 84 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 addss %xmm3, %xmm5 movss 88 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 mulss 92 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 112 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm2 addss %xmm2, %xmm4 movss 100 * SIZE(BB), %xmm2 
mulss %xmm1, %xmm2 addss %xmm2, %xmm5 movss 104 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 mulss 108 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 128 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 7 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 addss %xmm3, %xmm4 movss 116 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 addss %xmm3, %xmm5 movss 120 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 mulss 124 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 144 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 movss 4 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm5 movss 8 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 16 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 1 * SIZE(AA), %xmm0 addl $ 1 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL mulss %xmm3, %xmm4 movss 0 * SIZE(%esi), %xmm0 mulss %xmm3, %xmm5 movss 0 * SIZE(%esi, LDC, 1), %xmm1 mulss %xmm3, %xmm6 movss 0 * SIZE(%esi, LDC, 2), %xmm2 mulss %xmm3, %xmm7 movss 0 * SIZE(%esi, %eax, 1), %xmm3 addss %xmm0, %xmm4 addss %xmm1, %xmm5 addss %xmm2, %xmm6 addss %xmm3, %xmm7 #else mulss %xmm3, %xmm4 mulss %xmm3, %xmm5 mulss %xmm3, %xmm6 mulss %xmm3, %xmm7 #endif movss %xmm4, 0 * SIZE(%esi) movss %xmm5, 0 * SIZE(%esi, LDC, 1) movss %xmm6, 0 * SIZE(%esi, LDC, 2) movss %xmm7, 0 * SIZE(%esi, %eax, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB leal (BB, %eax, 8), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leal (, LDC, 4), %eax addl %eax, C # c += 4 * ldc decl J # j -- jg .L01 ALIGN_4 .L40: testl $2, N je .L80 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax leal BUFFER, %ecx sarl $2, %eax jle .L45 ALIGN_4 .L42: prefetchnta 80 * SIZE(%edi) #if defined(OPTERON) || defined(BARCELONA) prefetchw 112 * SIZE(%ecx) prefetchw 120 * SIZE(%ecx) #endif #ifdef PENTIUM4 prefetcht1 112 * SIZE(%ecx) #endif #ifdef HAVE_SSE2 movss 0 * SIZE(%edi), %xmm0 movss 1 * SIZE(%edi), %xmm1 movss 2 * SIZE(%edi), %xmm2 movss 3 * SIZE(%edi), %xmm3 movss 4 * SIZE(%edi), %xmm4 movss 5 * SIZE(%edi), %xmm5 movss 6 * SIZE(%edi), %xmm6 movss 7 * SIZE(%edi), %xmm7 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 shufps $0, %xmm4, %xmm4 shufps $0, %xmm5, %xmm5 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps %xmm4, 16 * SIZE(%ecx) movaps %xmm5, 20 * SIZE(%ecx) movaps %xmm6, 24 * SIZE(%ecx) movaps %xmm7, 28 * SIZE(%ecx) #else movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 movd 2 * SIZE(%edi), %mm2 movd 3 * SIZE(%edi), %mm3 movd 4 * SIZE(%edi), %mm4 movd 5 * SIZE(%edi), %mm5 movd 6 * SIZE(%edi), %mm6 movd 7 * SIZE(%edi), %mm7 movd %mm0, 0 * SIZE(%ecx) movd %mm0, 1 * SIZE(%ecx) movd %mm0, 2 * SIZE(%ecx) movd %mm0, 3 * SIZE(%ecx) movd %mm1, 4 * SIZE(%ecx) movd %mm1, 5 * SIZE(%ecx) movd %mm1, 6 * SIZE(%ecx) movd %mm1, 7 * SIZE(%ecx) movd %mm2, 8 * SIZE(%ecx) movd %mm2, 9 * SIZE(%ecx) 
movd %mm2, 10 * SIZE(%ecx) movd %mm2, 11 * SIZE(%ecx) movd %mm3, 12 * SIZE(%ecx) movd %mm3, 13 * SIZE(%ecx) movd %mm3, 14 * SIZE(%ecx) movd %mm3, 15 * SIZE(%ecx) movd %mm4, 16 * SIZE(%ecx) movd %mm4, 17 * SIZE(%ecx) movd %mm4, 18 * SIZE(%ecx) movd %mm4, 19 * SIZE(%ecx) movd %mm5, 20 * SIZE(%ecx) movd %mm5, 21 * SIZE(%ecx) movd %mm5, 22 * SIZE(%ecx) movd %mm5, 23 * SIZE(%ecx) movd %mm6, 24 * SIZE(%ecx) movd %mm6, 25 * SIZE(%ecx) movd %mm6, 26 * SIZE(%ecx) movd %mm6, 27 * SIZE(%ecx) movd %mm7, 28 * SIZE(%ecx) movd %mm7, 29 * SIZE(%ecx) movd %mm7, 30 * SIZE(%ecx) movd %mm7, 31 * SIZE(%ecx) #endif addl $ 8 * SIZE, %edi addl $32 * SIZE, %ecx decl %eax jne .L42 ALIGN_4 .L45: movl K, %eax andl $3, %eax BRANCH jle .L50 ALIGN_4 .L46: #ifdef HAVE_SSE2 movss 0 * SIZE(%edi), %xmm0 movss 1 * SIZE(%edi), %xmm1 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) #else movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 movd %mm0, 0 * SIZE(%ecx) movd %mm0, 1 * SIZE(%ecx) movd %mm0, 2 * SIZE(%ecx) movd %mm0, 3 * SIZE(%ecx) movd %mm1, 4 * SIZE(%ecx) movd %mm1, 5 * SIZE(%ecx) movd %mm1, 6 * SIZE(%ecx) movd %mm1, 7 * SIZE(%ecx) #endif addl $2 * SIZE, %edi addl $8 * SIZE, %ecx decl %eax jne .L46 ALIGN_4 .L50: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movaps 0 * SIZE(AA), %xmm0 movaps 16 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 #ifdef HAVE_3DNOW prefetchw 4 * SIZE(%esi) prefetchw 4 * SIZE(%esi, LDC) #elif defined(HAVE_SSE) || defined(HAVE_SSE2) prefetcht2 4 * SIZE(%esi) prefetcht2 4 * SIZE(%esi, LDC) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L55 ALIGN_4 .L52: mulps %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 8 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 20 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps 12 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 28 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps 48 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 mulps 36 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 40 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 20 * SIZE(AA), %xmm1 mulps %xmm1, %xmm2 mulps 44 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 64 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 24 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 52 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * 
SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 80 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 0 * SIZE(%esi, LDC, 1), %xmm1 movhps 2 * SIZE(%esi, LDC, 1), %xmm1 addps %xmm0, %xmm4 addps %xmm1, %xmm5 #endif movlps %xmm4, 0 * SIZE(%esi) movhps %xmm4, 2 * SIZE(%esi) movlps %xmm5, 0 * SIZE(%esi, LDC, 1) movhps %xmm5, 2 * SIZE(%esi, LDC, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $4 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L51 ALIGN_4 .L60: testl $2, M je .L70 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movaps 0 * SIZE(AA), %xmm0 movaps 8 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L65 ALIGN_4 .L62: #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movl K, 
%eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L66 ALIGN_4 .L68: addps %xmm6, %xmm4 addps %xmm7, %xmm5 mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 #ifndef TRMMKERNEL #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(%esi), %xmm0 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(%esi, LDC, 1), %xmm1 addps %xmm0, %xmm4 addps %xmm1, %xmm5 #endif movlps %xmm4, 0 * SIZE(%esi) movlps %xmm5, 0 * SIZE(%esi, LDC, 1) addl $2 * SIZE, %esi # coffset += 2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif ALIGN_4 .L70: testl $1, M je .L79 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movss 0 * SIZE(AA), %xmm0 movss 4 * SIZE(AA), %xmm1 movss 0 * SIZE(BB), %xmm2 movss 16 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L75 ALIGN_4 .L72: mulss %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 20 * SIZE(BB), %xmm0 addss %xmm3, %xmm4 movss 24 * SIZE(BB), %xmm3 addss %xmm0, %xmm5 movss 3 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 28 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 48 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 mulss 36 * SIZE(BB), %xmm1 addss %xmm2, %xmm4 movss 40 * SIZE(BB), %xmm2 addss %xmm1, %xmm5 movss 5 * SIZE(AA), %xmm1 mulss %xmm1, %xmm2 mulss 44 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 64 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 52 * SIZE(BB), %xmm1 addss %xmm3, %xmm4 movss 56 * SIZE(BB), %xmm3 addss %xmm1, %xmm5 movss 7 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 60 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 80 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulss %xmm0, %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 addl $ 1 * SIZE, AA addl $ 8 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: addss %xmm6, %xmm4 addss %xmm7, %xmm5 mulss %xmm3, %xmm4 mulss %xmm3, %xmm5 #ifndef TRMMKERNEL 
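/* C := alpha*A*B + C update for the 1x2 tail: reload the two existing C
   entries and accumulate; the TRMM build skips this reload and stores the
   already alpha-scaled products directly. */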
movss 0 * SIZE(%esi), %xmm0 movss 0 * SIZE(%esi, LDC, 1), %xmm1 addss %xmm0, %xmm4 addss %xmm1, %xmm5 #endif movss %xmm4, 0 * SIZE(%esi) movss %xmm5, 0 * SIZE(%esi, LDC, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax addl %eax, C ALIGN_4 .L80: testl $1, N je .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax leal BUFFER, %ecx sarl $3, %eax jle .L85 ALIGN_4 .L82: prefetchnta 80 * SIZE(%edi) #if defined(OPTERON) || defined(BARCELONA) prefetchw 112 * SIZE(%ecx) prefetchw 120 * SIZE(%ecx) #endif #ifdef PENTIUM4 prefetcht1 112 * SIZE(%ecx) #endif #ifdef HAVE_SSE2 movss 0 * SIZE(%edi), %xmm0 movss 1 * SIZE(%edi), %xmm1 movss 2 * SIZE(%edi), %xmm2 movss 3 * SIZE(%edi), %xmm3 movss 4 * SIZE(%edi), %xmm4 movss 5 * SIZE(%edi), %xmm5 movss 6 * SIZE(%edi), %xmm6 movss 7 * SIZE(%edi), %xmm7 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 shufps $0, %xmm4, %xmm4 shufps $0, %xmm5, %xmm5 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps %xmm4, 16 * SIZE(%ecx) movaps %xmm5, 20 * SIZE(%ecx) movaps %xmm6, 24 * SIZE(%ecx) movaps %xmm7, 28 * SIZE(%ecx) #else movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 movd 2 * SIZE(%edi), %mm2 movd 3 * SIZE(%edi), %mm3 movd 4 * SIZE(%edi), %mm4 movd 5 * SIZE(%edi), %mm5 movd 6 * SIZE(%edi), %mm6 movd 7 * SIZE(%edi), %mm7 movd %mm0, 0 * SIZE(%ecx) movd %mm0, 1 * SIZE(%ecx) movd %mm0, 2 * SIZE(%ecx) movd %mm0, 3 * SIZE(%ecx) movd %mm1, 4 * SIZE(%ecx) movd %mm1, 5 * SIZE(%ecx) movd %mm1, 6 * SIZE(%ecx) movd %mm1, 7 * SIZE(%ecx) movd %mm2, 8 * SIZE(%ecx) movd %mm2, 9 * SIZE(%ecx) movd %mm2, 10 * SIZE(%ecx) movd %mm2, 11 * SIZE(%ecx) movd %mm3, 12 * SIZE(%ecx) movd %mm3, 13 * SIZE(%ecx) movd %mm3, 14 * SIZE(%ecx) movd %mm3, 15 * SIZE(%ecx) movd %mm4, 16 * SIZE(%ecx) movd %mm4, 17 * SIZE(%ecx) movd %mm4, 18 * SIZE(%ecx) movd %mm4, 19 * SIZE(%ecx) movd %mm5, 20 * SIZE(%ecx) movd %mm5, 21 * SIZE(%ecx) movd %mm5, 22 * SIZE(%ecx) movd %mm5, 23 * SIZE(%ecx) movd %mm6, 24 * SIZE(%ecx) movd %mm6, 25 * SIZE(%ecx) movd %mm6, 26 * SIZE(%ecx) movd %mm6, 27 * SIZE(%ecx) movd %mm7, 28 * SIZE(%ecx) movd %mm7, 29 * SIZE(%ecx) movd %mm7, 30 * SIZE(%ecx) movd %mm7, 31 * SIZE(%ecx) #endif addl $ 8 * SIZE, %edi addl $32 * SIZE, %ecx decl %eax jne .L82 ALIGN_4 .L85: movl K, %eax andl $7, %eax BRANCH jle .L90 ALIGN_4 .L86: #ifdef HAVE_SSE2 movss 0 * SIZE(%edi), %xmm0 shufps $0, %xmm0, %xmm0 movaps %xmm0, 0 * SIZE(%ecx) #else movd 0 * SIZE(%edi), %mm0 movd %mm0, 0 * SIZE(%ecx) movd %mm0, 1 * SIZE(%ecx) movd %mm0, 2 * SIZE(%ecx) movd %mm0, 3 * SIZE(%ecx) #endif addl $1 * SIZE, %edi addl $4 * SIZE, %ecx decl %eax jne .L86 ALIGN_4 .L90: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L100 ALIGN_4 .L91: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal 
(BB, %eax, 2), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movaps 0 * SIZE(AA), %xmm0 movaps 16 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 #ifdef HAVE_3DNOW prefetchw 4 * SIZE(%esi) #elif defined(HAVE_SSE) || defined(HAVE_SSE2) prefetcht2 4 * SIZE(%esi) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L95 ALIGN_4 .L92: mulps %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm0, %xmm5 movaps 8 * SIZE(AA), %xmm0 mulps 8 * SIZE(BB), %xmm0 addps %xmm0, %xmm6 movaps 12 * SIZE(AA), %xmm0 mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 movaps 20 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movaps 48 * SIZE(BB), %xmm3 mulps 20 * SIZE(BB), %xmm1 addps %xmm1, %xmm5 movaps 24 * SIZE(AA), %xmm1 mulps 24 * SIZE(BB), %xmm1 addps %xmm1, %xmm6 movaps 28 * SIZE(AA), %xmm1 mulps 28 * SIZE(BB), %xmm1 addps %xmm1, %xmm7 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L92 ALIGN_4 .L95: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(BB), %xmm2 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L96 ALIGN_4 .L98: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 mulps %xmm3, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 addps %xmm0, %xmm4 #endif movlps %xmm4, 0 * SIZE(%esi) movhps %xmm4, 2 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $4 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L91 ALIGN_4 .L100: testl $2, M je .L110 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movaps 0 * SIZE(AA), %xmm0 movaps 8 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L105 ALIGN_4 .L102: mulps %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * 
SIZE(AA), %xmm0 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 16 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 movsd 10 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L102 ALIGN_4 .L105: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L108 ALIGN_4 .L106: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 2 * SIZE(AA), %xmm0 movaps 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L106 ALIGN_4 .L108: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 mulps %xmm3, %xmm4 #ifndef TRMMKERNEL #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(%esi), %xmm0 addps %xmm0, %xmm4 #endif movlps %xmm4, 0 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, %esi # coffset += 2 ALIGN_4 .L110: testl $1, M je .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movss 0 * SIZE(AA), %xmm0 movss 4 * SIZE(AA), %xmm1 movss 0 * SIZE(BB), %xmm2 movss 16 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L115 ALIGN_4 .L112: mulss %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 movss 32 * SIZE(BB), %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm0, %xmm5 movss 2 * SIZE(AA), %xmm0 mulss 8 * SIZE(BB), %xmm0 addss %xmm0, %xmm6 movss 3 * SIZE(AA), %xmm0 mulss 12 * SIZE(BB), %xmm0 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm3 movss 5 * SIZE(AA), %xmm1 addss %xmm3, %xmm4 movss 48 * SIZE(BB), %xmm3 mulss 20 * SIZE(BB), %xmm1 addss %xmm1, %xmm5 movss 6 * SIZE(AA), %xmm1 mulss 24 * SIZE(BB), %xmm1 addss %xmm1, %xmm6 movss 7 * SIZE(AA), %xmm1 mulss 28 * SIZE(BB), %xmm1 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L112 ALIGN_4 .L115: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulss %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 movss 4 * SIZE(BB), %xmm2 addl $ 1 * SIZE, AA addl $ 4 * SIZE, BB decl %eax jg .L116 ALIGN_4 .L118: addss %xmm5, %xmm4 addss %xmm7, %xmm6 addss %xmm6, %xmm4 mulss %xmm3, %xmm4 #ifndef TRMMKERNEL movss 0 * SIZE(%esi), %xmm0 addss %xmm0, %xmm4 #endif movss %xmm4, 0 * 
SIZE(%esi) ALIGN_4 .L999: EMMS movl OLD_STACK, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_kernel_4x4_sse3.S000066400000000000000000001246641313527062700211330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define OLD_M 4 + STACK(%esi) #define OLD_N 8 + STACK(%esi) #define OLD_K 12 + STACK(%esi) #define OLD_ALPHA 16 + STACK(%esi) #define OLD_A 20 + STACK(%esi) #define OLD_B 24 + STACK(%esi) #define OLD_C 28 + STACK(%esi) #define OLD_LDC 32 + STACK(%esi) #define STACK_OFFT 36 + STACK(%esi) #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define BUFFER 128(%esp) #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht0 #define PREFETCHSIZE 96 #endif #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHSIZE 96 #endif #ifdef PENTIUMM #define PREFETCH prefetcht0 #define PREFETCHSIZE 96 #endif #define AA %edx #define BB %ecx #define LDC %ebp #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA); \ addps %xmm2, %xmm4; \ movshdup 0 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movsldup 4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm6; \ movshdup 4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ movaps 4 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ addps %xmm2, %xmm7; \ movsldup 8 * SIZE + 2 * (address) * SIZE(BB), %xmm2 #define KERNEL2(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movshdup 8 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movsldup 12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm6; \ movshdup 12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ movaps 8 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ addps %xmm2, %xmm7; \ movsldup 32 * SIZE + 2 * (address) * SIZE(BB), %xmm2 #define KERNEL3(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movshdup 16 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movsldup 20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm6; \ movshdup 20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ movaps 12 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ addps %xmm3, %xmm7; \ movsldup 24 * SIZE + 2 * (address) * SIZE(BB), %xmm3 #define KERNEL4(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movshdup 24 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movsldup 28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm6; \ movshdup 28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ movaps 32 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ addps %xmm3, %xmm7; \ movsldup 48 * SIZE + 2 * (address) * SIZE(BB), %xmm3 #define KERNEL5(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movshdup 32 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movsldup 36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm6; \ movshdup 36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ movaps 20 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ addps %xmm2, %xmm7 #define KERNEL6(address) \ movsldup 40 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movshdup 40 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps 
%xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movsldup 44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm6; \ movshdup 44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ movaps 24 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ addps %xmm2, %xmm7; \ movsldup 64 * SIZE + 2 * (address) * SIZE(BB), %xmm2 #define KERNEL7(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movshdup 48 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movsldup 52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm6; \ movshdup 52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ movaps 28 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ addps %xmm3, %xmm7; \ movsldup 56 * SIZE + 2 * (address) * SIZE(BB), %xmm3 #define KERNEL8(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movshdup 56 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movsldup 60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm6; \ movshdup 60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ movaps 48 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ addps %xmm3, %xmm7; \ movsldup 80 * SIZE + 2 * (address) * SIZE(BB), %xmm3 PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE, %esp movl OLD_M, %ebx andl $-1024, %esp # align stack STACK_TOUCHING movl OLD_N, %eax movl OLD_K, %ecx movl OLD_A, %edx movss OLD_ALPHA, %xmm3 #ifdef TRMMKERNEL movss STACK_OFFT, %xmm4 #endif movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK shufps $0, %xmm3, %xmm3 movl OLD_B, %edi movl OLD_C, %ebx movaps %xmm3, ALPHA movl %ebx, C movl OLD_LDC, LDC #ifdef TRMMKERNEL movss %xmm4, OFFSET movss %xmm4, KK #ifndef LEFT negl KK #endif #endif leal (, LDC, SIZE), LDC sarl $2, %eax movl %eax, J jle .L40 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ leal BUFFER, %ecx movl K, %eax sarl $2, %eax jle .L05 ALIGN_4 .L02: movddup 0 * SIZE(%edi), %xmm0 movddup 2 * SIZE(%edi), %xmm1 movddup 4 * SIZE(%edi), %xmm2 movddup 6 * SIZE(%edi), %xmm3 movddup 8 * SIZE(%edi), %xmm4 movddup 10 * SIZE(%edi), %xmm5 movddup 12 * SIZE(%edi), %xmm6 movddup 14 * SIZE(%edi), %xmm7 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps %xmm4, 16 * SIZE(%ecx) movaps %xmm5, 20 * SIZE(%ecx) movaps %xmm6, 24 * SIZE(%ecx) movaps %xmm7, 28 * SIZE(%ecx) # prefetcht1 128 * SIZE(%ecx) prefetcht0 112 * SIZE(%edi) addl $16 * SIZE, %edi addl $32 * SIZE, %ecx decl %eax jne .L02 ALIGN_2 .L05: movl K, %eax andl $3, %eax BRANCH jle .L10 ALIGN_2 .L06: movddup 0 * SIZE(%edi), %xmm0 movddup 2 * SIZE(%edi), %xmm1 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) addl $4 * SIZE, %edi addl $8 * SIZE, %ecx decl %eax jne .L06 ALIGN_4 .L10: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps 16 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movsldup 0 * SIZE(BB), 
%xmm2 pxor %xmm6, %xmm6 movsldup 16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 leal (LDC, LDC, 2), %eax prefetchnta 4 * SIZE(%esi) prefetchnta 4 * SIZE(%esi, LDC) prefetchnta 4 * SIZE(%esi, LDC, 2) prefetchnta 4 * SIZE(%esi, %eax) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $4, %eax #endif movl %eax, KKK #endif #if 1 andl $-8, %eax sall $4, %eax je .L15 .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) cmpl $128 * 1, %eax jle .L12 KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) cmpl $128 * 2, %eax jle .L12 KERNEL1(32 * 2) KERNEL2(32 * 2) KERNEL3(32 * 2) KERNEL4(32 * 2) KERNEL5(32 * 2) KERNEL6(32 * 2) KERNEL7(32 * 2) KERNEL8(32 * 2) cmpl $128 * 3, %eax jle .L12 KERNEL1(32 * 3) KERNEL2(32 * 3) KERNEL3(32 * 3) KERNEL4(32 * 3) KERNEL5(32 * 3) KERNEL6(32 * 3) KERNEL7(32 * 3) KERNEL8(32 * 3) cmpl $128 * 4, %eax jle .L12 KERNEL1(32 * 4) KERNEL2(32 * 4) KERNEL3(32 * 4) KERNEL4(32 * 4) KERNEL5(32 * 4) KERNEL6(32 * 4) KERNEL7(32 * 4) KERNEL8(32 * 4) cmpl $128 * 5, %eax jle .L12 KERNEL1(32 * 5) KERNEL2(32 * 5) KERNEL3(32 * 5) KERNEL4(32 * 5) KERNEL5(32 * 5) KERNEL6(32 * 5) KERNEL7(32 * 5) KERNEL8(32 * 5) cmpl $128 * 6, %eax jle .L12 KERNEL1(32 * 6) KERNEL2(32 * 6) KERNEL3(32 * 6) KERNEL4(32 * 6) KERNEL5(32 * 6) KERNEL6(32 * 6) KERNEL7(32 * 6) KERNEL8(32 * 6) cmpl $128 * 7, %eax jle .L12 KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) #if 1 cmpl $128 * 8, %eax jle .L12 KERNEL1(32 * 8) KERNEL2(32 * 8) KERNEL3(32 * 8) KERNEL4(32 * 8) KERNEL5(32 * 8) KERNEL6(32 * 8) KERNEL7(32 * 8) KERNEL8(32 * 8) cmpl $128 * 9, %eax jle .L12 KERNEL1(32 * 9) KERNEL2(32 * 9) KERNEL3(32 * 9) KERNEL4(32 * 9) KERNEL5(32 * 9) KERNEL6(32 * 9) KERNEL7(32 * 9) KERNEL8(32 * 9) cmpl $128 * 10, %eax jle .L12 KERNEL1(32 * 10) KERNEL2(32 * 10) KERNEL3(32 * 10) KERNEL4(32 * 10) KERNEL5(32 * 10) KERNEL6(32 * 10) KERNEL7(32 * 10) KERNEL8(32 * 10) cmpl $128 * 11, %eax jle .L12 KERNEL1(32 * 11) KERNEL2(32 * 11) KERNEL3(32 * 11) KERNEL4(32 * 11) KERNEL5(32 * 11) KERNEL6(32 * 11) KERNEL7(32 * 11) KERNEL8(32 * 11) cmpl $128 * 12, %eax jle .L12 KERNEL1(32 * 12) KERNEL2(32 * 12) KERNEL3(32 * 12) KERNEL4(32 * 12) KERNEL5(32 * 12) KERNEL6(32 * 12) KERNEL7(32 * 12) KERNEL8(32 * 12) cmpl $128 * 13, %eax jle .L12 KERNEL1(32 * 13) KERNEL2(32 * 13) KERNEL3(32 * 13) KERNEL4(32 * 13) KERNEL5(32 * 13) KERNEL6(32 * 13) KERNEL7(32 * 13) KERNEL8(32 * 13) cmpl $128 * 14, %eax jle .L12 KERNEL1(32 * 14) KERNEL2(32 * 14) KERNEL3(32 * 14) KERNEL4(32 * 14) KERNEL5(32 * 14) KERNEL6(32 * 14) KERNEL7(32 * 14) KERNEL8(32 * 14) cmpl $128 * 15, %eax jle .L12 KERNEL1(32 * 15) KERNEL2(32 * 15) KERNEL3(32 * 15) KERNEL4(32 * 15) KERNEL5(32 * 15) KERNEL6(32 * 15) KERNEL7(32 * 15) KERNEL8(32 * 15) #else addl $128 * 4 * SIZE, BB addl $128 * 2 * SIZE, AA subl $128 * 8, %eax jg .L1X jmp .L15 #endif .L12: leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB ALIGN_4 #else sarl $3, %eax je .L15 ALIGN_4 .L12: KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) addl $32 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L12 ALIGN_4 #endif .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, 
%xmm3 andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movshdup 0 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movsldup 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movshdup 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movsldup 8 * SIZE(BB), %xmm2 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL shufps $0xe4, %xmm0, %xmm0 shufps $0xe4, %xmm1, %xmm1 shufps $0xe4, %xmm2, %xmm2 shufps $0xe4, %xmm3, %xmm3 mulps %xmm3, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 mulps %xmm3, %xmm5 movsd 0 * SIZE(%esi, LDC, 1), %xmm1 movhps 2 * SIZE(%esi, LDC, 1), %xmm1 mulps %xmm3, %xmm6 movsd 0 * SIZE(%esi, LDC, 2), %xmm2 movhps 2 * SIZE(%esi, LDC, 2), %xmm2 mulps %xmm3, %xmm7 movsd 0 * SIZE(%esi, %eax, 1), %xmm3 movhps 2 * SIZE(%esi, %eax, 1), %xmm3 addps %xmm0, %xmm4 addps %xmm1, %xmm5 addps %xmm2, %xmm6 addps %xmm3, %xmm7 #else mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 mulps %xmm3, %xmm6 mulps %xmm3, %xmm7 #endif movsd %xmm4, 0 * SIZE(%esi) movhps %xmm4, 2 * SIZE(%esi) movsd %xmm5, 0 * SIZE(%esi, LDC, 1) movhps %xmm5, 2 * SIZE(%esi, LDC, 1) movsd %xmm6, 0 * SIZE(%esi, LDC, 2) movhps %xmm6, 2 * SIZE(%esi, LDC, 2) movsd %xmm7, 0 * SIZE(%esi, %eax, 1) movhps %xmm7, 2 * SIZE(%esi, %eax, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $4 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L11 ALIGN_4 .L20: testl $2, M je .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movddup 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movsd 0 * SIZE(BB), %xmm2 movsd 16 * SIZE(BB), %xmm3 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 movddup 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 12 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 movddup 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 32 * SIZE(BB), %xmm2 shufps $0x50, %xmm3, %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movsd 20 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm0, %xmm3 movddup 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movsd 24 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movsd 28 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm0, %xmm3 movddup 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movsd 48 * SIZE(BB), %xmm3 shufps $0x50, %xmm2, %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movsd 36 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm1, %xmm2 movddup 10 * 
SIZE(AA), %xmm1 addps %xmm2, %xmm5 movsd 40 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movsd 44 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm1, %xmm2 movddup 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movsd 64 * SIZE(BB), %xmm2 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movsd 52 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 movddup 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 56 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movsd 60 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 movddup 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 80 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 movddup 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: leal (LDC, LDC, 2), %eax mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhps 0 * SIZE(%esi, LDC, 1), %xmm0 movsd 0 * SIZE(%esi, LDC, 2), %xmm1 movhps 0 * SIZE(%esi, %eax, 1), %xmm1 addps %xmm0, %xmm4 addps %xmm1, %xmm5 #endif movsd %xmm4, 0 * SIZE(%esi) movhps %xmm4, 0 * SIZE(%esi, LDC, 1) movsd %xmm5, 0 * SIZE(%esi, LDC, 2) movhps %xmm5, 0 * SIZE(%esi, %eax, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, %esi # coffset += 2 ALIGN_4 .L30: testl $1, M je .L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif movss 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movss 4 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movsd 0 * SIZE(BB), %xmm2 movsd 16 * SIZE(BB), %xmm3 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L35 ALIGN_4 .L32: shufps $0, %xmm0, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) movhps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 8 * SIZE(BB), %xmm2 shufps $0, %xmm0, %xmm0 movhps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movss 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movhps 20 * SIZE(BB), %xmm3 shufps $0, %xmm0, %xmm0 movsd 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 movss 3 * SIZE(AA), %xmm0 addps %xmm3, %xmm4 movsd 24 * SIZE(BB), %xmm3 shufps $0, %xmm0, %xmm0 movhps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movss 8 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movsd 48 * SIZE(BB), %xmm3 shufps $0, %xmm1, %xmm1 movhps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movss 5 * SIZE(AA), %xmm1 addps %xmm2, %xmm4 movsd 40 * SIZE(BB), %xmm2 shufps $0, %xmm1, %xmm1 movhps 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movss 6 * SIZE(AA), %xmm1 
addps %xmm2, %xmm5 movsd 64 * SIZE(BB), %xmm2 shufps $0, %xmm1, %xmm1 movhps 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movss 7 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movsd 56 * SIZE(BB), %xmm3 shufps $0, %xmm1, %xmm1 movhps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movss 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 80 * SIZE(BB), %xmm3 addl $ 8 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm7 andl $7, %eax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: shufps $0, %xmm0, %xmm0 movhps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 8 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: leal (LDC, LDC, 2), %eax addps %xmm5, %xmm4 mulps %xmm7, %xmm4 movhlps %xmm4, %xmm5 #ifndef TRMMKERNEL movss 0 * SIZE(%esi), %xmm0 movss 0 * SIZE(%esi, LDC, 1), %xmm1 movss 0 * SIZE(%esi, LDC, 2), %xmm2 movss 0 * SIZE(%esi, %eax, 1), %xmm3 addss %xmm4, %xmm0 psrlq $32, %xmm4 addss %xmm4, %xmm1 addss %xmm5, %xmm2 psrlq $32, %xmm5 addss %xmm5, %xmm3 movss %xmm0, 0 * SIZE(%esi) movss %xmm1, 0 * SIZE(%esi, LDC, 1) movss %xmm2, 0 * SIZE(%esi, LDC, 2) movss %xmm3 , 0 * SIZE(%esi, %eax, 1) #else movss %xmm4, 0 * SIZE(%esi) psrlq $32, %xmm4 movss %xmm4, 0 * SIZE(%esi, LDC, 1) movss %xmm5, 0 * SIZE(%esi, LDC, 2) psrlq $32, %xmm5 movss %xmm5 , 0 * SIZE(%esi, %eax, 1) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leal (, LDC, 4), %eax addl %eax, C # c += 4 * ldc decl J # j -- jg .L01 ALIGN_4 .L40: testl $2, N je .L80 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax leal BUFFER, %ecx sarl $3, %eax jle .L45 ALIGN_4 .L42: movddup 0 * SIZE(%edi), %xmm0 movddup 2 * SIZE(%edi), %xmm1 movddup 4 * SIZE(%edi), %xmm2 movddup 6 * SIZE(%edi), %xmm3 movddup 8 * SIZE(%edi), %xmm4 movddup 10 * SIZE(%edi), %xmm5 movddup 12 * SIZE(%edi), %xmm6 movddup 14 * SIZE(%edi), %xmm7 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps %xmm4, 16 * SIZE(%ecx) movaps %xmm5, 20 * SIZE(%ecx) movaps %xmm6, 24 * SIZE(%ecx) movaps %xmm7, 28 * SIZE(%ecx) # prefetcht1 128 * SIZE(%ecx) prefetcht0 112 * SIZE(%edi) addl $16 * SIZE, %edi addl $32 * SIZE, %ecx decl %eax jne .L42 ALIGN_4 .L45: movl K, %eax andl $7, %eax BRANCH jle .L50 ALIGN_4 .L46: movddup 0 * SIZE(%edi), %xmm0 movaps %xmm0, 0 * SIZE(%ecx) addl $2 * SIZE, %edi addl $4 * SIZE, %ecx decl %eax jne .L46 ALIGN_4 .L50: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif movaps 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps 16 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movsldup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movsldup 16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 prefetcht2 4 * SIZE(%esi) prefetcht2 4 * SIZE(%esi, LDC) #ifndef TRMMKERNEL 
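/* Plain GEMM path: the k-loop below runs over the full K.  The TRMM
   branches that follow instead derive the trip count (KKK) from KK. */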
movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L55 ALIGN_4 .L52: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) movshdup 0 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsldup 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movshdup 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 8 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsldup 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movshdup 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 12 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsldup 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movshdup 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 32 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsldup 32 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movshdup 16 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 20 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsldup 20 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movshdup 20 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsldup 24 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movshdup 24 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 28 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsldup 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movshdup 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 48 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsldup 48 * SIZE(BB), %xmm3 addl $32 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movshdup 0 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsldup 4 * SIZE(BB), %xmm2 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 0 * SIZE(%esi, LDC, 1), %xmm1 movhps 2 * SIZE(%esi, LDC, 1), %xmm1 mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 addps %xmm0, %xmm4 addps %xmm1, %xmm5 #else mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 #endif movsd %xmm4, 0 * SIZE(%esi) movhps %xmm4, 2 * SIZE(%esi) movsd %xmm5, 0 * SIZE(%esi, LDC, 1) movhps %xmm5, 2 * SIZE(%esi, LDC, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $4 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L51 ALIGN_4 .L60: testl $2, M je .L70 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movddup 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movsd 0 * SIZE(BB), %xmm2 movsd 16 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, 
%eax #endif movl %eax, KKK #endif sarl $3, %eax je .L65 ALIGN_4 .L62: shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) movddup 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 movddup 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 movddup 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 12 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 movddup 16 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 32 * SIZE(BB), %xmm2 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 movddup 10 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movsd 20 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 movddup 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 24 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 movddup 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movsd 28 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 movddup 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 48 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 movddup 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L66 ALIGN_4 .L68: addps %xmm5, %xmm4 mulps %xmm3, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhps 0 * SIZE(%esi, LDC, 1), %xmm0 addps %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(%esi) movhps %xmm4, 0 * SIZE(%esi, LDC, 1) addl $2 * SIZE, %esi # coffset += 2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif ALIGN_4 .L70: testl $1, M je .L79 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif movss 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movss 4 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movsd 0 * SIZE(BB), %xmm2 movsd 16 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L75 ALIGN_4 .L72: shufps $0, %xmm0, %xmm0 mulps %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) movss 1 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 shufps $0, %xmm0, %xmm0 movsd 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movss 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 shufps $0, %xmm0, %xmm0 movsd 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movss 3 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 shufps $0, %xmm0, %xmm0 movsd 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movss 8 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 32 * SIZE(BB), %xmm2 shufps $0, %xmm1, %xmm1 mulps %xmm1, %xmm3 movss 5 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 shufps $0, %xmm1, %xmm1 movsd 20 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movss 6 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 shufps $0, %xmm1, %xmm1 movsd 24 * 
SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movss 7 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 shufps $0, %xmm1, %xmm1 movsd 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movss 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 48 * SIZE(BB), %xmm3 addl $ 8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: shufps $0, %xmm0, %xmm0 mulps %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addl $ 1 * SIZE, AA addl $ 4 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: addps %xmm5, %xmm4 mulps %xmm3, %xmm4 #ifndef TRMMKERNEL movss 0 * SIZE(%esi), %xmm0 movss 0 * SIZE(%esi, LDC, 1), %xmm1 addss %xmm4, %xmm0 psrlq $32, %xmm4 addss %xmm4, %xmm1 movss %xmm0, 0 * SIZE(%esi) movss %xmm1, 0 * SIZE(%esi, LDC, 1) #else movss %xmm4, 0 * SIZE(%esi) psrlq $32, %xmm4 movss %xmm4, 0 * SIZE(%esi, LDC, 1) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax addl %eax, C ALIGN_4 .L80: testl $1, N je .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax leal BUFFER, %ecx sarl $3, %eax jle .L85 ALIGN_4 .L82: movss 0 * SIZE(%edi), %xmm0 movss 1 * SIZE(%edi), %xmm1 movss 2 * SIZE(%edi), %xmm2 movss 3 * SIZE(%edi), %xmm3 movss 4 * SIZE(%edi), %xmm4 movss 5 * SIZE(%edi), %xmm5 movss 6 * SIZE(%edi), %xmm6 movss 7 * SIZE(%edi), %xmm7 movss %xmm0, 0 * SIZE(%ecx) movss %xmm0, 1 * SIZE(%ecx) movss %xmm1, 2 * SIZE(%ecx) movss %xmm1, 3 * SIZE(%ecx) movss %xmm2, 4 * SIZE(%ecx) movss %xmm2, 5 * SIZE(%ecx) movss %xmm3, 6 * SIZE(%ecx) movss %xmm3, 7 * SIZE(%ecx) movss %xmm4, 8 * SIZE(%ecx) movss %xmm4, 9 * SIZE(%ecx) movss %xmm5, 10 * SIZE(%ecx) movss %xmm5, 11 * SIZE(%ecx) movss %xmm6, 12 * SIZE(%ecx) movss %xmm6, 13 * SIZE(%ecx) movss %xmm7, 14 * SIZE(%ecx) movss %xmm7, 15 * SIZE(%ecx) # prefetcht1 128 * SIZE(%ecx) prefetcht0 112 * SIZE(%edi) addl $ 8 * SIZE, %edi addl $16 * SIZE, %ecx decl %eax jne .L82 ALIGN_4 .L85: movl K, %eax andl $7, %eax BRANCH jle .L90 ALIGN_4 .L86: movss 0 * SIZE(%edi), %xmm0 movss %xmm0, 0 * SIZE(%ecx) movss %xmm0, 1 * SIZE(%ecx) addl $1 * SIZE, %edi addl $2 * SIZE, %ecx decl %eax jne .L86 ALIGN_4 .L90: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L100 ALIGN_4 .L91: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 1), BB #endif movaps 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movddup 0 * SIZE(BB), %xmm2 pxor %xmm5, %xmm5 movaps 16 * SIZE(AA), %xmm1 movddup 8 * SIZE(BB), %xmm3 #ifdef HAVE_3DNOW prefetchw 4 * SIZE(%esi) #elif defined(HAVE_SSE) || defined(HAVE_SSE2) prefetcht2 4 * SIZE(%esi) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L95 
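/* .L92: k-loop for the 4x1 tail, unrolled by 8.  Each step broadcasts one
   duplicated b value from BUFFER via movddup and accumulates
   c[0:3] += a[0:3] * b into xmm4/xmm5, which are combined after the loop. */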
ALIGN_4 .L92: mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 movddup 2 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 8 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movddup 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 12 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movddup 6 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 32 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movddup 16 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 movaps 20 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movddup 10 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movddup 12 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 28 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movddup 14 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 48 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movddup 24 * SIZE(BB), %xmm3 addl $32 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L92 ALIGN_4 .L95: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movddup 2 * SIZE(BB), %xmm2 addl $4 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L96 ALIGN_4 .L98: addps %xmm5, %xmm4 mulps %xmm3, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 addps %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(%esi) movhps %xmm4, 2 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 1), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $4 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L91 ALIGN_4 .L100: testl $2, M je .L110 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 1), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movsd 0 * SIZE(AA), %xmm0 movsd 0 * SIZE(BB), %xmm2 movsd 8 * SIZE(AA), %xmm1 movsd 8 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L105 ALIGN_4 .L102: mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 movsd 2 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 6 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 16 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 16 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 movsd 10 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movsd 10 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 12 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movsd 14 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 24 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L102 ALIGN_4 .L105: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L108 ALIGN_4 
.L106: mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 2 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L106 ALIGN_4 .L108: addps %xmm5, %xmm4 movhlps %xmm4, %xmm5 addps %xmm5, %xmm4 mulps %xmm3, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 addps %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 1), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, %esi # coffset += 2 ALIGN_4 .L110: testl $1, M je .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif movss 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movss 0 * SIZE(BB), %xmm2 pxor %xmm5, %xmm5 movss 4 * SIZE(AA), %xmm1 movss 8 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L115 ALIGN_4 .L112: mulss %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 movss 2 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 movss 2 * SIZE(AA), %xmm0 addss %xmm2, %xmm5 movss 4 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 movss 3 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 movss 6 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 movss 8 * SIZE(AA), %xmm0 addss %xmm2, %xmm5 movss 16 * SIZE(BB), %xmm2 mulss %xmm1, %xmm3 movss 5 * SIZE(AA), %xmm1 addss %xmm3, %xmm4 movss 10 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 movss 6 * SIZE(AA), %xmm1 addss %xmm3, %xmm5 movss 12 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 movss 7 * SIZE(AA), %xmm1 addss %xmm3, %xmm4 movss 14 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 movss 12 * SIZE(AA), %xmm1 addss %xmm3, %xmm5 movss 24 * SIZE(BB), %xmm3 addl $ 8 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L112 ALIGN_4 .L115: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulss %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 movss 2 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L116 ALIGN_4 .L118: addss %xmm5, %xmm4 mulss %xmm3, %xmm4 #ifndef TRMMKERNEL movss 0 * SIZE(%esi), %xmm0 addss %xmm0, %xmm4 #else mulss %xmm3, %xmm4 #endif movss %xmm4, 0 * SIZE(%esi) ALIGN_4 .L999: movl OLD_STACK, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_kernel_8x1_sse2.S000066400000000000000000000466151313527062700211320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if !defined(HAVE_SSE2) || !defined(HAVE_MMX) #error You have to check your configuration. #endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA 16 + STACK + ARGS(%esi) #define STACK_A 24 + STACK + ARGS(%esi) #define STACK_B 28 + STACK + ARGS(%esi) #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 36(%esp) #define J 44(%esp) #define OLD_STACK 48(%esp) #define BUFFER 128(%esp) #define B %edi #define LDC %ebp #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #define AA %edx #define BB %ecx #define KERNELMACRO(address) \ mulpd %xmm0, %xmm2; \ mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 4 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 16 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm1, %xmm3; \ mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 12 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 14 * SIZE + 
(address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 24 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm0, %xmm2; \ mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 20 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 32 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm1, %xmm3; \ mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 28 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp addl $STACK_OFFSET, %esp STACK_TOUCHING movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 movd STACK_A, %mm2 movq STACK_ALPHA, %mm7 movl STACK_B, B movd STACK_C, %mm3 movl STACK_LDC, LDC movq %mm7, 0 * SIZE + ALPHA movq %mm7, 1 * SIZE + ALPHA movd %mm1, K movl %eax, N movd %mm0, M movd %mm2, A movd %mm3, C movl %esi, OLD_STACK leal (, LDC, SIZE), LDC test %eax, %eax movl %eax, J jle .L999 ALIGN_2 .L01: /* Copying to Sub Buffer */ movl K, %eax leal BUFFER, %ecx sarl $3, %eax jle .L03 ALIGN_4 .L02: prefetchnta 96 * SIZE(B) movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 unpcklpd %xmm2, %xmm2 unpcklpd %xmm3, %xmm3 unpcklpd %xmm4, %xmm4 unpcklpd %xmm5, %xmm5 unpcklpd %xmm6, %xmm6 unpcklpd %xmm7, %xmm7 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) movapd %xmm2, 4 * SIZE(%ecx) movapd %xmm3, 6 * SIZE(%ecx) movapd %xmm4, 8 * SIZE(%ecx) movapd %xmm5, 10 * SIZE(%ecx) movapd %xmm6, 12 * SIZE(%ecx) movapd %xmm7, 14 * SIZE(%ecx) addl $ 8 * SIZE, B addl $16 * SIZE, %ecx decl %eax BRANCH jne .L02 ALIGN_2 .L03: movl K, %eax andl $7, %eax BRANCH jle .L05 
ALIGN_2 .L04: movsd 0 * SIZE(B), %xmm0 unpcklpd %xmm0, %xmm0 movapd %xmm0, 0 * SIZE(%ecx) addl $1 * SIZE, B addl $2 * SIZE, %ecx decl %eax jne .L04 ALIGN_4 .L05: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $3, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L10: leal BUFFER, %ecx # boffset1 = boffset // different point movl K, %eax movapd 0 * SIZE + BUFFER, %xmm2 movapd 0 * SIZE(%edx), %xmm0 movapd 8 * SIZE + BUFFER, %xmm3 movapd 8 * SIZE(%edx), %xmm1 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if 0 andl $-8, %eax leal (, %eax, 8), %eax je .L12 KERNELMACRO(32 * 0) # 0 cmpl $64 * 1, %eax jle .L11 KERNELMACRO(32 * 1) # 1 cmpl $64 * 2, %eax jle .L11 KERNELMACRO(32 * 2) # 2 cmpl $64 * 3, %eax jle .L11 KERNELMACRO(32 * 3) # 3 cmpl $64 * 4, %eax jle .L11 KERNELMACRO(32 * 4) # 4 cmpl $64 * 5, %eax jle .L11 KERNELMACRO(32 * 5) # 5 cmpl $64 * 6, %eax jle .L11 KERNELMACRO(32 * 6) # 6 cmpl $64 * 7, %eax jle .L11 KERNELMACRO(32 * 7) # 7 cmpl $64 * 8, %eax jle .L11 KERNELMACRO(32 * 8) # 8 cmpl $64 * 9, %eax jle .L11 KERNELMACRO(32 * 9) # 9 cmpl $64 * 10, %eax jle .L11 KERNELMACRO(32 * 10) # 10 cmpl $64 * 11, %eax jle .L11 KERNELMACRO(32 * 11) # 11 cmpl $64 * 12, %eax jle .L11 KERNELMACRO(32 * 12) # 12 cmpl $64 * 13, %eax jle .L11 KERNELMACRO(32 * 13) # 13 cmpl $64 * 14, %eax jle .L11 KERNELMACRO(32 * 14) # 14 cmpl $64 * 15, %eax jle .L11 movq 1 * SIZE(%esi), %mm0 movq 1 * SIZE(%esi, LDC), %mm1 KERNELMACRO(32 * 15) # 15 .L11: leal (%edx, %eax, 4), %edx leal (%ecx, %eax, 4), %ecx #else movapd 0 * SIZE(BB), %xmm0 movapd 8 * SIZE(BB), %xmm2 movapd 0 * SIZE(AA), %xmm1 movapd 8 * SIZE(AA), %xmm3 prefetchnta 8 * SIZE(%esi) sarl $3, %eax je .L12 #define PRE 40 .L11: mulpd %xmm0, %xmm1 movd (PRE + 0) * SIZE(AA), %mm0 addpd %xmm1, %xmm4 movapd 2 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm5 movapd 4 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 mulpd 6 * SIZE(AA), %xmm0 addpd %xmm1, %xmm6 movapd 16 * SIZE(AA), %xmm1 movd (PRE + 8) * SIZE(AA), %mm0 addpd %xmm0, %xmm7 movapd 2 * SIZE(BB), %xmm0 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm4 movapd 10 * SIZE(AA), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm5 movapd 12 * SIZE(AA), %xmm3 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(AA), %xmm0 addpd %xmm3, %xmm6 movapd 24 * SIZE(AA), %xmm3 movd (PRE + 16) * SIZE(AA), %mm0 addpd %xmm0, %xmm7 movapd 4 * SIZE(BB), %xmm0 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd 18 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm5 movapd 20 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 mulpd 22 * SIZE(AA), %xmm0 addpd %xmm1, %xmm6 movapd 32 * SIZE(AA), %xmm1 movd (PRE + 24) * SIZE(AA), %mm0 addpd %xmm0, %xmm7 movapd 6 * SIZE(BB), %xmm0 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm4 movapd 26 * SIZE(AA), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm5 movapd 28 * SIZE(AA), %xmm3 mulpd %xmm0, %xmm3 mulpd 30 * SIZE(AA), %xmm0 addpd %xmm3, %xmm6 movapd 40 * SIZE(AA), %xmm3 movd (PRE + 32) * SIZE(AA), %mm0 addpd %xmm0, %xmm7 movapd 16 * SIZE(BB), %xmm0 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm4 movapd 34 * SIZE(AA), %xmm1 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm5 movapd 36 * SIZE(AA), %xmm1 mulpd %xmm2, %xmm1 mulpd 38 * SIZE(AA), %xmm2 addpd %xmm1, %xmm6 movapd 48 * SIZE(AA), %xmm1 movd (PRE + 40) * SIZE(AA), %mm0 addpd %xmm2, %xmm7 movapd 10 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm4 movapd 42 * SIZE(AA), %xmm3 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm5 movapd 44 * SIZE(AA), %xmm3 mulpd %xmm2, %xmm3 mulpd 46 * SIZE(AA), %xmm2 addpd %xmm3, %xmm6 movapd 56 * SIZE(AA), %xmm3 movd (PRE + 48) * SIZE(AA), %mm0 addpd %xmm2, %xmm7 movapd 
12 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm4 movapd 50 * SIZE(AA), %xmm1 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm5 movapd 52 * SIZE(AA), %xmm1 mulpd %xmm2, %xmm1 mulpd 54 * SIZE(AA), %xmm2 addpd %xmm1, %xmm6 movapd 64 * SIZE(AA), %xmm1 movd (PRE + 56) * SIZE(AA), %mm0 addpd %xmm2, %xmm7 movapd 14 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm4 movapd 58 * SIZE(AA), %xmm3 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm5 movapd 60 * SIZE(AA), %xmm3 mulpd %xmm2, %xmm3 mulpd 62 * SIZE(AA), %xmm2 addpd %xmm3, %xmm6 movapd 72 * SIZE(AA), %xmm3 addpd %xmm2, %xmm7 movapd 24 * SIZE(BB), %xmm2 addl $64 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L11 #endif .L12: movapd ALPHA, %xmm3 movl K, %eax andl $7, %eax # if (k & 1) BRANCH je .L14 .L13: movapd 0 * SIZE(BB), %xmm0 movapd 0 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd 2 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm5 movapd 4 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm6 mulpd 6 * SIZE(AA), %xmm0 addpd %xmm0, %xmm7 addl $8 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 subl $1, %eax jg .L13 ALIGN_4 .L14: mulpd %xmm3, %xmm4 mulpd %xmm3, %xmm5 mulpd %xmm3, %xmm6 mulpd %xmm3, %xmm7 movsd 0 * SIZE(%esi), %xmm0 movhpd 1 * SIZE(%esi), %xmm0 movsd 2 * SIZE(%esi), %xmm1 movhpd 3 * SIZE(%esi), %xmm1 movsd 4 * SIZE(%esi), %xmm2 movhpd 5 * SIZE(%esi), %xmm2 movsd 6 * SIZE(%esi), %xmm3 movhpd 7 * SIZE(%esi), %xmm3 addpd %xmm0, %xmm4 addpd %xmm1, %xmm5 addpd %xmm2, %xmm6 addpd %xmm3, %xmm7 movsd %xmm4, 0 * SIZE(%esi) movhpd %xmm4, 1 * SIZE(%esi) movsd %xmm5, 2 * SIZE(%esi) movhpd %xmm5, 3 * SIZE(%esi) movsd %xmm6, 4 * SIZE(%esi) movhpd %xmm6, 5 * SIZE(%esi) movsd %xmm7, 6 * SIZE(%esi) movhpd %xmm7, 7 * SIZE(%esi) addl $8 * SIZE, %esi # coffset += 4 BRANCH decl %ebx # i -- jg .L10 ALIGN_2 .L20: movl M, %ebx testl $4, %ebx jle .L30 leal BUFFER, %ecx movl K, %eax movapd 0 * SIZE + BUFFER, %xmm2 movapd 0 * SIZE(%edx), %xmm0 movapd 8 * SIZE + BUFFER, %xmm3 movapd 8 * SIZE(%edx), %xmm1 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 sarl $3, %eax je .L22 .L21: movapd 0 * SIZE(BB), %xmm0 movapd 0 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 mulpd 2 * SIZE(AA), %xmm0 addpd %xmm0, %xmm5 movapd 2 * SIZE(BB), %xmm0 movapd 4 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 mulpd 6 * SIZE(AA), %xmm0 addpd %xmm0, %xmm5 movapd 4 * SIZE(BB), %xmm0 movapd 8 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 mulpd 10 * SIZE(AA), %xmm0 addpd %xmm0, %xmm5 movapd 6 * SIZE(BB), %xmm0 movapd 12 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 mulpd 14 * SIZE(AA), %xmm0 addpd %xmm0, %xmm5 movapd 8 * SIZE(BB), %xmm0 movapd 16 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 mulpd 18 * SIZE(AA), %xmm0 addpd %xmm0, %xmm5 movapd 10 * SIZE(BB), %xmm0 movapd 20 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 mulpd 22 * SIZE(AA), %xmm0 addpd %xmm0, %xmm5 movapd 12 * SIZE(BB), %xmm0 movapd 24 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 mulpd 26 * SIZE(AA), %xmm0 addpd %xmm0, %xmm5 movapd 14 * SIZE(BB), %xmm0 movapd 28 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 mulpd 30 * SIZE(AA), %xmm0 addpd %xmm0, %xmm5 addl $32 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L21 .L22: movapd ALPHA, %xmm3 movl K, %eax andl $7, %eax BRANCH je .L24 .L23: movapd 0 * SIZE(BB), %xmm0 movapd 0 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 mulpd 2 * SIZE(AA), %xmm0 addpd %xmm0, %xmm5 addl $4 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 
subl $1, %eax jg .L23 ALIGN_4 .L24: mulpd %xmm3, %xmm4 mulpd %xmm3, %xmm5 movsd 0 * SIZE(%esi), %xmm0 movhpd 1 * SIZE(%esi), %xmm0 movsd 2 * SIZE(%esi), %xmm1 movhpd 3 * SIZE(%esi), %xmm1 addpd %xmm0, %xmm4 addpd %xmm1, %xmm5 movsd %xmm4, 0 * SIZE(%esi) movhpd %xmm4, 1 * SIZE(%esi) movsd %xmm5, 2 * SIZE(%esi) movhpd %xmm5, 3 * SIZE(%esi) addl $4 * SIZE, %esi # coffset += 4 ALIGN_4 .L30: movl M, %ebx testl $2, %ebx jle .L50 leal BUFFER, %ecx movl K, %eax movapd 0 * SIZE + BUFFER, %xmm2 movapd 0 * SIZE(AA), %xmm0 movapd 8 * SIZE + BUFFER, %xmm3 movapd 8 * SIZE(AA), %xmm1 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 sarl $3, %eax je .L32 .L31: movapd 0 * SIZE(BB), %xmm0 movapd 0 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd 2 * SIZE(BB), %xmm0 movapd 2 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd 4 * SIZE(BB), %xmm0 movapd 4 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd 6 * SIZE(BB), %xmm0 movapd 6 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd 8 * SIZE(BB), %xmm0 movapd 8 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd 10 * SIZE(BB), %xmm0 movapd 10 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd 12 * SIZE(BB), %xmm0 movapd 12 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd 14 * SIZE(BB), %xmm0 movapd 14 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 addl $16 * SIZE, AA addl $16 * SIZE, BB BRANCH decl %eax jne .L31 .L32: movapd ALPHA, %xmm3 movl K, %eax andl $7, %eax # if (k & 1) BRANCH je .L34 .L33: movapd 0 * SIZE(BB), %xmm0 movapd 0 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 addl $2 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L33 ALIGN_4 .L34: mulpd %xmm3, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhpd 1 * SIZE(%esi), %xmm0 addpd %xmm0, %xmm4 movsd %xmm4, 0 * SIZE(%esi) movhpd %xmm4, 1 * SIZE(%esi) addl $2 * SIZE, %esi ALIGN_2 .L50: movl M, %ebx testl $1, %ebx jle .L99 leal BUFFER, %ecx movl K, %eax movsd 0 * SIZE + BUFFER, %xmm2 movsd 0 * SIZE(AA), %xmm0 movsd 8 * SIZE + BUFFER, %xmm3 movsd 4 * SIZE(AA), %xmm1 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 sarl $3, %eax je .L52 .L51: movsd 0 * SIZE(AA), %xmm0 mulsd 0 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 movsd 1 * SIZE(AA), %xmm0 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 mulsd 4 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 movsd 3 * SIZE(AA), %xmm0 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 movsd 4 * SIZE(AA), %xmm0 mulsd 8 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 movsd 5 * SIZE(AA), %xmm0 mulsd 10 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 movsd 6 * SIZE(AA), %xmm0 mulsd 12 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 movsd 7 * SIZE(AA), %xmm0 mulsd 14 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 addl $ 8 * SIZE, AA addl $16 * SIZE, BB BRANCH decl %eax jne .L51 .L52: movsd ALPHA, %xmm3 movl K, %eax andl $7, %eax # if (k & 1) BRANCH je .L54 .L53: movsd 0 * SIZE(AA), %xmm0 mulsd 0 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 addl $1 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L53 ALIGN_4 .L54: movsd 0 * SIZE(%esi), %xmm0 mulsd %xmm3, %xmm4 addsd %xmm0, %xmm4 movsd %xmm4, 0 * SIZE(%esi) ALIGN_2 .L99: addl LDC, C decl J # j -- jg .L01 ALIGN_2 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_2 EPILOGUE 
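The kernel above (gemm_kernel_8x1_sse2.S) and the two that follow implement the same operation with different register blocking: the plain GEMM path computes C := alpha*A*B + C on column-major operands, after first expanding B into the on-stack BUFFER (the .L02/.L04 copy loops) and peeling the M loop into 8-, 4-, 2- and 1-wide tail cases. As a reading aid, here is a minimal scalar C sketch of that operation (shown in double precision, matching the SSE2 kernel above). It is an illustrative reference only, not part of OpenBLAS; the function name and signature are invented for this example.

#include <stddef.h>

/* Scalar reference for the blocked kernels above: C = alpha*A*B + C,
 * column-major, no transposition, no packing.  The assembly gains its
 * speed by packing B into BUFFER and unrolling the innermost k-loop
 * eight times across SSE registers, but the arithmetic is the same. */
static void gemm_reference(size_t m, size_t n, size_t k, double alpha,
                           const double *a, size_t lda,
                           const double *b, size_t ldb,
                           double *c, size_t ldc)
{
    for (size_t j = 0; j < n; j++) {          /* column of C (the J loop)   */
        for (size_t i = 0; i < m; i++) {      /* row of C (the I/M loop)    */
            double t = 0.0;
            for (size_t p = 0; p < k; p++)    /* the unrolled k-loop (.L11) */
                t += a[i + p * lda] * b[p + j * ldb];
            c[i + j * ldc] += alpha * t;
        }
    }
}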
OpenBLAS-0.2.20/kernel/x86/gemm_kernel_8x2_core2.S000066400000000000000000000771461313527062700212740ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA 16 + STACK + ARGS(%esi) #define STACK_A 20 + STACK + ARGS(%esi) #define STACK_B 24 + STACK + ARGS(%esi) #define STACK_C 28 + STACK + ARGS(%esi) #define STACK_LDC 32 + STACK + ARGS(%esi) #define STACK_OFFT 36 + STACK + ARGS(%esi) #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define BUFFER 512(%esp) #define PREFETCH_R (8 * 16 + 0) #define PREFETCH_W (PREFETCH_R * 2) #define PREFETCHSIZE (8 * 16 + 4) #define PREFETCH prefetcht0 #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define C1 %esi #define I %ebx PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi # save old stack subl $512 + LOCAL_BUFFER_SIZE, %esp andl $-4096, %esp # align stack STACK_TOUCHING movl STACK_M, %ebx movl STACK_N, %eax movl STACK_K, %ecx movl STACK_A, %edx movss STACK_ALPHA, %xmm3 #ifdef TRMMKERNEL movd STACK_OFFT, %mm4 #endif movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK #ifdef TRMMKERNEL movd %mm4, OFFSET movd %mm4, KK #ifndef LEFT negl KK #endif #endif shufps $0, %xmm3, %xmm3 movl STACK_B, B movl STACK_C, %ebx movaps %xmm3, ALPHA movl %ebx, C movl STACK_LDC, LDC subl $-32 * SIZE, A subl $-32 * SIZE, B leal (, LDC, SIZE), LDC sarl $1, %eax movl %eax, J jle .L50 ALIGN_4 .L01: leal 32 * SIZE + BUFFER, BB #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sarl $2, %eax jle .L05 ALIGN_4 .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) movss -32 * SIZE(B), %xmm0 movss -31 * SIZE(B), %xmm1 movss -30 * SIZE(B), %xmm2 movss -29 * SIZE(B), %xmm3 movss -28 * SIZE(B), %xmm4 movss -27 * SIZE(B), %xmm5 movss -26 * SIZE(B), %xmm6 movss -25 * SIZE(B), %xmm7 prefetcht0 (PREFETCH_W + 0) * SIZE(BB) shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 shufps $0, %xmm4, %xmm4 shufps $0, %xmm5, %xmm5 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 prefetcht0 (PREFETCH_W + 16) * SIZE(BB) movaps %xmm0, -32 * SIZE(BB) movaps %xmm1, -28 * SIZE(BB) movaps %xmm2, -24 * SIZE(BB) movaps %xmm3, -20 * SIZE(BB) movaps %xmm4, -16 * SIZE(BB) movaps %xmm5, -12 * SIZE(BB) movaps %xmm6, -8 * SIZE(BB) movaps %xmm7, -4 * SIZE(BB) addl $ 8 * SIZE, B subl $-32 * SIZE, BB decl %eax jne .L02 ALIGN_4 .L05: movl K, %eax andl $3, %eax BRANCH jle .L10 ALIGN_4 .L06: movss -32 * SIZE(B), %xmm0 movss -31 * SIZE(B), %xmm1 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 movaps %xmm0, -32 * SIZE(BB) movaps %xmm1, -28 * SIZE(BB) addl $2 * SIZE, B addl $8 * SIZE, BB decl %eax jne .L06 ALIGN_4 .L10: movl C, C1 movl A, AA movl M, I sarl $3, I jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 32 * SIZE + BUFFER, BB #else leal 32 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB /* because it's doubled */ #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -32 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -16 * SIZE(AA), %xmm3 pxor %xmm6, %xmm6 prefetcht0 7 * SIZE(C1) pxor %xmm7, %xmm7 
prefetcht0 7 * SIZE(C1, LDC) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $8, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L15 ALIGN_4 .L12: movaps %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm1, %xmm0 addps %xmm0, %xmm5 movaps -28 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps %xmm0, %xmm1 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 addps %xmm1, %xmm7 movaps -24 * SIZE(BB), %xmm1 movaps %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm1, %xmm4 movaps -20 * SIZE(BB), %xmm1 mulps %xmm1, %xmm0 addps %xmm0, %xmm5 movaps -20 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps %xmm0, %xmm1 movaps 0 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 addps %xmm1, %xmm7 movaps -16 * SIZE(BB), %xmm1 movaps %xmm1, %xmm2 mulps %xmm3, %xmm1 addps %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps -12 * SIZE(AA), %xmm3 mulps %xmm3, %xmm2 mulps %xmm3, %xmm1 movaps -8 * SIZE(AA), %xmm3 addps %xmm2, %xmm6 addps %xmm1, %xmm7 movaps -8 * SIZE(BB), %xmm1 movaps %xmm1, %xmm2 mulps %xmm3, %xmm1 addps %xmm1, %xmm4 movaps -4 * SIZE(BB), %xmm1 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps -4 * SIZE(AA), %xmm3 mulps %xmm3, %xmm2 mulps %xmm3, %xmm1 movaps 16 * SIZE(AA), %xmm3 addps %xmm2, %xmm6 addps %xmm1, %xmm7 movaps 0 * SIZE(BB), %xmm1 movaps %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm1, %xmm4 movaps 4 * SIZE(BB), %xmm1 mulps %xmm1, %xmm0 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps %xmm0, %xmm1 movaps 8 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 addps %xmm1, %xmm7 movaps 8 * SIZE(BB), %xmm1 movaps %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm1, %xmm4 movaps 12 * SIZE(BB), %xmm1 mulps %xmm1, %xmm0 addps %xmm0, %xmm5 movaps 12 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps %xmm0, %xmm1 movaps 32 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 addps %xmm1, %xmm7 movaps 16 * SIZE(BB), %xmm1 movaps %xmm1, %xmm2 mulps %xmm3, %xmm1 addps %xmm1, %xmm4 movaps 20 * SIZE(BB), %xmm1 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 20 * SIZE(AA), %xmm3 mulps %xmm3, %xmm2 mulps %xmm3, %xmm1 addps %xmm2, %xmm6 movaps 24 * SIZE(AA), %xmm3 addps %xmm1, %xmm7 movaps 24 * SIZE(BB), %xmm1 movaps %xmm1, %xmm2 mulps %xmm3, %xmm1 addps %xmm1, %xmm4 movaps 28 * SIZE(BB), %xmm1 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 28 * SIZE(AA), %xmm3 mulps %xmm3, %xmm2 mulps %xmm3, %xmm1 subl $-64 * SIZE, BB movaps 48 * SIZE(AA), %xmm3 subl $-64 * SIZE, AA addps %xmm2, %xmm6 addps %xmm1, %xmm7 movaps -32 * SIZE(BB), %xmm1 decl %eax jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L18 ALIGN_4 .L16: movaps %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm1, %xmm0 addps %xmm0, %xmm5 movaps -28 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps %xmm0, %xmm1 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 addps %xmm1, %xmm7 movaps -24 * SIZE(BB), %xmm1 addl $8 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: movaps ALPHA, %xmm3 mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 mulps %xmm3, %xmm6 mulps %xmm3, %xmm7 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhps 2 * SIZE(C1), %xmm0 movsd 4 * SIZE(C1), %xmm2 movhps 6 * SIZE(C1), %xmm2 movsd 0 * SIZE(C1, LDC), %xmm1 movhps 2 * SIZE(C1, LDC), %xmm1 movsd 4 * SIZE(C1, LDC), %xmm3 movhps 6 * SIZE(C1, LDC), %xmm3 addps %xmm0, %xmm4 addps %xmm1, %xmm5 addps %xmm2, %xmm6 addps %xmm3, %xmm7 
#endif movsd %xmm4, 0 * SIZE(C1) movhps %xmm4, 2 * SIZE(C1) movsd %xmm6, 4 * SIZE(C1) movhps %xmm6, 6 * SIZE(C1) movsd %xmm5, 0 * SIZE(C1, LDC) movhps %xmm5, 2 * SIZE(C1, LDC) movsd %xmm7, 4 * SIZE(C1, LDC) movhps %xmm7, 6 * SIZE(C1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $8, KK #endif addl $8 * SIZE, C1 decl I jg .L11 ALIGN_4 .L20: movl M, I testl $4, I jle .L30 .L21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 32 * SIZE + BUFFER, BB #else leal 32 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB /* because it's doubled */ #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -32 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movaps -16 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movaps -16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: mulps %xmm0, %xmm1 mulps -28 * SIZE(BB), %xmm0 addps %xmm1, %xmm4 movaps -24 * SIZE(BB), %xmm1 addps %xmm0, %xmm5 movaps -28 * SIZE(AA), %xmm0 mulps %xmm0, %xmm1 mulps -20 * SIZE(BB), %xmm0 addps %xmm1, %xmm6 movaps 0 * SIZE(BB), %xmm1 addps %xmm0, %xmm7 movaps -24 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps -12 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps -8 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps -20 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps -4 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 16 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movaps 0 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 mulps 4 * SIZE(BB), %xmm2 addps %xmm1, %xmm4 movaps 8 * SIZE(BB), %xmm1 addps %xmm2, %xmm5 movaps -12 * SIZE(AA), %xmm2 mulps %xmm2, %xmm1 mulps 12 * SIZE(BB), %xmm2 addps %xmm1, %xmm6 movaps 32 * SIZE(BB), %xmm1 addps %xmm2, %xmm7 movaps -8 * SIZE(AA), %xmm2 mulps %xmm2, %xmm3 mulps 20 * SIZE(BB), %xmm2 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm2, %xmm5 movaps -4 * SIZE(AA), %xmm2 mulps %xmm2, %xmm3 mulps 28 * SIZE(BB), %xmm2 addps %xmm3, %xmm6 movaps 48 * SIZE(BB), %xmm3 addps %xmm2, %xmm7 movaps 16 * SIZE(AA), %xmm2 subl $-32 * SIZE, AA addl $ 64 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: movaps ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L28 ALIGN_4 .L26: mulps %xmm0, %xmm1 mulps -28 * SIZE(BB), %xmm0 addps %xmm1, %xmm4 movaps -24 * SIZE(BB), %xmm1 addps %xmm0, %xmm5 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: addps %xmm6, %xmm4 addps %xmm7, %xmm5 mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhps 2 * SIZE(C1), %xmm0 movsd 0 * SIZE(C1, LDC), %xmm1 movhps 2 * SIZE(C1, LDC), %xmm1 addps %xmm0, %xmm4 addps %xmm1, %xmm5 #endif movsd %xmm4, 0 * SIZE(C1) movhps %xmm4, 2 * SIZE(C1) movsd %xmm5, 0 * SIZE(C1, LDC) movhps %xmm5, 2 * SIZE(C1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB 
#endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $4 * SIZE, C1 ALIGN_4 .L30: movl M, I testl $2, I jle .L40 .L31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 32 * SIZE + BUFFER, BB #else leal 32 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB /* because it's doubled */ #endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd -32 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movsd -24 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movsd -16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L35 ALIGN_4 .L32: mulps %xmm0, %xmm1 mulps -28 * SIZE(BB), %xmm0 addps %xmm1, %xmm4 movsd -24 * SIZE(BB), %xmm1 addps %xmm0, %xmm5 movsd -30 * SIZE(AA), %xmm0 mulps %xmm0, %xmm1 mulps -20 * SIZE(BB), %xmm0 addps %xmm1, %xmm6 movsd 0 * SIZE(BB), %xmm1 addps %xmm0, %xmm7 movsd -28 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps -12 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movsd -8 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movsd -26 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps -4 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movsd 16 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movsd -16 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 mulps 4 * SIZE(BB), %xmm2 addps %xmm1, %xmm4 movsd 8 * SIZE(BB), %xmm1 addps %xmm2, %xmm5 movsd -22 * SIZE(AA), %xmm2 mulps %xmm2, %xmm1 mulps 12 * SIZE(BB), %xmm2 addps %xmm1, %xmm6 movsd 32 * SIZE(BB), %xmm1 addps %xmm2, %xmm7 movsd -20 * SIZE(AA), %xmm2 mulps %xmm2, %xmm3 mulps 20 * SIZE(BB), %xmm2 addps %xmm3, %xmm4 movsd 24 * SIZE(BB), %xmm3 addps %xmm2, %xmm5 movsd -18 * SIZE(AA), %xmm2 mulps %xmm2, %xmm3 mulps 28 * SIZE(BB), %xmm2 addps %xmm3, %xmm6 movsd 48 * SIZE(BB), %xmm3 addps %xmm2, %xmm7 movsd -8 * SIZE(AA), %xmm2 subl $-16 * SIZE, AA addl $ 64 * SIZE, BB decl %eax jne .L32 ALIGN_4 .L35: movsd ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L38 ALIGN_4 .L36: mulps %xmm0, %xmm1 mulps -28 * SIZE(BB), %xmm0 addps %xmm1, %xmm4 movsd -24 * SIZE(BB), %xmm1 addps %xmm0, %xmm5 movsd -30 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: addps %xmm6, %xmm4 addps %xmm7, %xmm5 mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movsd 0 * SIZE(C1, LDC), %xmm1 addps %xmm0, %xmm4 addps %xmm1, %xmm5 #endif movsd %xmm4, 0 * SIZE(C1) movsd %xmm5, 0 * SIZE(C1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, C1 ALIGN_4 .L40: movl M, I testl $1, I jle .L49 .L41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 32 * SIZE + BUFFER, BB #else leal 32 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB /* because it's doubled */ #endif movss -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movss -32 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movss -28 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movss -16 * 
SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L45 ALIGN_4 .L42: mulss %xmm0, %xmm1 mulss -28 * SIZE(BB), %xmm0 addss %xmm1, %xmm4 movss -24 * SIZE(BB), %xmm1 addss %xmm0, %xmm5 movss -31 * SIZE(AA), %xmm0 mulss %xmm0, %xmm1 mulss -20 * SIZE(BB), %xmm0 addss %xmm1, %xmm6 movss 0 * SIZE(BB), %xmm1 addss %xmm0, %xmm7 movss -30 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss -12 * SIZE(BB), %xmm0 addss %xmm3, %xmm4 movss -8 * SIZE(BB), %xmm3 addss %xmm0, %xmm5 movss -29 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss -4 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 16 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss -24 * SIZE(AA), %xmm0 mulss %xmm2, %xmm1 mulss 4 * SIZE(BB), %xmm2 addss %xmm1, %xmm4 movss 8 * SIZE(BB), %xmm1 addss %xmm2, %xmm5 movss -27 * SIZE(AA), %xmm2 mulss %xmm2, %xmm1 mulss 12 * SIZE(BB), %xmm2 addss %xmm1, %xmm6 movss 32 * SIZE(BB), %xmm1 addss %xmm2, %xmm7 movss -26 * SIZE(AA), %xmm2 mulss %xmm2, %xmm3 mulss 20 * SIZE(BB), %xmm2 addss %xmm3, %xmm4 movss 24 * SIZE(BB), %xmm3 addss %xmm2, %xmm5 movss -25 * SIZE(AA), %xmm2 mulss %xmm2, %xmm3 mulss 28 * SIZE(BB), %xmm2 addss %xmm3, %xmm6 movss 48 * SIZE(BB), %xmm3 addss %xmm2, %xmm7 movss -20 * SIZE(AA), %xmm2 subl $-8 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L42 ALIGN_4 .L45: movss ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L48 ALIGN_4 .L46: mulss %xmm0, %xmm1 mulss -28 * SIZE(BB), %xmm0 addss %xmm1, %xmm4 movss -24 * SIZE(BB), %xmm1 addss %xmm0, %xmm5 movss -31 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: addss %xmm6, %xmm4 addss %xmm7, %xmm5 mulss %xmm3, %xmm4 mulss %xmm3, %xmm5 #ifndef TRMMKERNEL movss 0 * SIZE(C1), %xmm0 movss 0 * SIZE(C1, LDC), %xmm1 addss %xmm0, %xmm4 addss %xmm1, %xmm5 #endif movss %xmm4, 0 * SIZE(C1) movss %xmm5, 0 * SIZE(C1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax addl %eax, C decl J jg .L01 ALIGN_4 .L50: movl N, %eax testl $1, %eax jle .L999 ALIGN_4 .L51: leal 32 * SIZE + BUFFER, BB #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sarl $3, %eax jle .L55 ALIGN_4 .L52: movss -32 * SIZE(B), %xmm0 movss -31 * SIZE(B), %xmm1 movss -30 * SIZE(B), %xmm2 movss -29 * SIZE(B), %xmm3 movss -28 * SIZE(B), %xmm4 movss -27 * SIZE(B), %xmm5 movss -26 * SIZE(B), %xmm6 movss -25 * SIZE(B), %xmm7 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 shufps $0, %xmm4, %xmm4 shufps $0, %xmm5, %xmm5 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 movaps %xmm0, -32 * SIZE(BB) movaps %xmm1, -28 * SIZE(BB) movaps %xmm2, -24 * SIZE(BB) movaps %xmm3, -20 * SIZE(BB) movaps %xmm4, -16 * SIZE(BB) movaps %xmm5, -12 * SIZE(BB) movaps %xmm6, -8 * SIZE(BB) movaps %xmm7, -4 * SIZE(BB) addl $ 8 * SIZE, B subl $-32 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: movl K, %eax andl $7, %eax BRANCH jle .L60 ALIGN_4 .L56: movss -32 * SIZE(B), %xmm0 shufps $0, %xmm0, %xmm0 
movaps %xmm0, -32 * SIZE(BB) addl $1 * SIZE, B addl $4 * SIZE, BB decl %eax jne .L56 ALIGN_4 .L60: movl C, C1 movl A, AA movl M, I sarl $3, I jle .L70 ALIGN_4 .L61: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 32 * SIZE + BUFFER, BB #else leal 32 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB /* because it's doubled */ #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -32 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movaps -16 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movaps -16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 prefetcht0 3 * SIZE(C1) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $8, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L65 ALIGN_4 .L62: mulps %xmm1, %xmm0 mulps -28 * SIZE(AA), %xmm1 addps %xmm0, %xmm4 movaps -24 * SIZE(AA), %xmm0 addps %xmm1, %xmm6 movaps -28 * SIZE(BB), %xmm1 mulps %xmm1, %xmm0 mulps -20 * SIZE(AA), %xmm1 addps %xmm0, %xmm5 movaps 0 * SIZE(AA), %xmm0 addps %xmm1, %xmm7 movaps -24 * SIZE(BB), %xmm1 mulps %xmm1, %xmm2 mulps -12 * SIZE(AA), %xmm1 addps %xmm2, %xmm4 movaps -8 * SIZE(AA), %xmm2 addps %xmm1, %xmm6 movaps -20 * SIZE(BB), %xmm1 mulps %xmm1, %xmm2 mulps -4 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 16 * SIZE(AA), %xmm2 addps %xmm1, %xmm7 movaps 0 * SIZE(BB), %xmm1 mulps %xmm3, %xmm0 mulps 4 * SIZE(AA), %xmm3 addps %xmm0, %xmm4 movaps 8 * SIZE(AA), %xmm0 addps %xmm3, %xmm6 movaps -12 * SIZE(BB), %xmm3 mulps %xmm3, %xmm0 mulps 12 * SIZE(AA), %xmm3 addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps -8 * SIZE(BB), %xmm3 mulps %xmm3, %xmm2 mulps 20 * SIZE(AA), %xmm3 addps %xmm2, %xmm4 movaps 24 * SIZE(AA), %xmm2 addps %xmm3, %xmm6 movaps -4 * SIZE(BB), %xmm3 mulps %xmm3, %xmm2 mulps 28 * SIZE(AA), %xmm3 addps %xmm2, %xmm5 movaps 48 * SIZE(AA), %xmm2 addps %xmm3, %xmm7 movaps 16 * SIZE(BB), %xmm3 addl $ 64 * SIZE, AA subl $-32 * SIZE, BB decl %eax jne .L62 ALIGN_4 .L65: movaps ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L68 ALIGN_4 .L66: mulps %xmm1, %xmm0 mulps -28 * SIZE(AA), %xmm1 addps %xmm0, %xmm4 movaps -24 * SIZE(AA), %xmm0 addps %xmm1, %xmm6 movaps -28 * SIZE(BB), %xmm1 addl $8 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L66 ALIGN_4 .L68: addps %xmm5, %xmm4 addps %xmm7, %xmm6 mulps %xmm3, %xmm4 mulps %xmm3, %xmm6 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhps 2 * SIZE(C1), %xmm0 movsd 4 * SIZE(C1), %xmm2 movhps 6 * SIZE(C1), %xmm2 addps %xmm0, %xmm4 addps %xmm2, %xmm6 #endif movsd %xmm4, 0 * SIZE(C1) movhps %xmm4, 2 * SIZE(C1) movsd %xmm6, 4 * SIZE(C1) movhps %xmm6, 6 * SIZE(C1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $8, KK #endif addl $8 * SIZE, C1 decl I jg .L61 ALIGN_4 .L70: movl M, I testl $4, I jle .L80 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 32 * SIZE + BUFFER, BB #else leal 32 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB 
/* because it's doubled */ #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -32 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movaps -16 * SIZE(AA), %xmm2 movaps -16 * SIZE(BB), %xmm3 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L75 ALIGN_4 .L72: mulps %xmm0, %xmm1 movaps -28 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movaps -24 * SIZE(AA), %xmm0 addps %xmm1, %xmm5 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movaps -20 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movaps 0 * SIZE(AA), %xmm0 addps %xmm1, %xmm5 movaps 0 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 movaps -12 * SIZE(AA), %xmm2 addps %xmm3, %xmm4 movaps -12 * SIZE(BB), %xmm3 mulps %xmm2, %xmm3 movaps -8 * SIZE(AA), %xmm2 addps %xmm3, %xmm5 movaps -8 * SIZE(BB), %xmm3 mulps %xmm2, %xmm3 movaps -4 * SIZE(AA), %xmm2 addps %xmm3, %xmm4 movaps -4 * SIZE(BB), %xmm3 mulps %xmm2, %xmm3 movaps 16 * SIZE(AA), %xmm2 addps %xmm3, %xmm5 movaps 16 * SIZE(BB), %xmm3 subl $-32 * SIZE, AA subl $-32 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: movaps ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L78 ALIGN_4 .L76: mulps %xmm0, %xmm1 movaps -28 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: addps %xmm5, %xmm4 mulps %xmm3, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhps 2 * SIZE(C1), %xmm0 addps %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(C1) movhps %xmm4, 2 * SIZE(C1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $4 * SIZE, C1 ALIGN_4 .L80: movl M, I testl $2, I jle .L90 .L81: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 32 * SIZE + BUFFER, BB #else leal 32 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB /* because it's doubled */ #endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd -32 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movsd -16 * SIZE(BB), %xmm3 movsd -24 * SIZE(AA), %xmm2 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L85 ALIGN_4 .L82: mulps %xmm0, %xmm1 movsd -30 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movsd -28 * SIZE(AA), %xmm0 addps %xmm1, %xmm5 movsd -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movsd -26 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movsd -16 * SIZE(AA), %xmm0 addps %xmm1, %xmm5 movsd -0 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 movsd -22 * SIZE(AA), %xmm2 addps %xmm3, %xmm4 movsd -12 * SIZE(BB), %xmm3 mulps %xmm2, %xmm3 movsd -20 * SIZE(AA), %xmm2 addps %xmm3, %xmm5 movsd -8 * SIZE(BB), %xmm3 mulps %xmm2, %xmm3 movsd -18 * SIZE(AA), %xmm2 addps %xmm3, %xmm4 movsd -4 * 
SIZE(BB), %xmm3 mulps %xmm2, %xmm3 movsd -8 * SIZE(AA), %xmm2 addps %xmm3, %xmm5 movsd 16 * SIZE(BB), %xmm3 subl $-16 * SIZE, AA subl $-32 * SIZE, BB decl %eax jne .L82 ALIGN_4 .L85: movsd ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L88 ALIGN_4 .L86: mulps %xmm0, %xmm1 movsd -30 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -28 * SIZE(BB), %xmm1 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L86 ALIGN_4 .L88: addps %xmm5, %xmm4 mulps %xmm3, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 addps %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(C1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, C1 ALIGN_4 .L90: movl M, I testl $1, I jle .L99 .L91: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 32 * SIZE + BUFFER, BB #else leal 32 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB /* because it's doubled */ #endif movss -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movss -32 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movss -16 * SIZE(BB), %xmm3 movss -28 * SIZE(AA), %xmm2 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L95 ALIGN_4 .L92: mulss %xmm0, %xmm1 movss -31 * SIZE(AA), %xmm0 addss %xmm1, %xmm4 movss -28 * SIZE(BB), %xmm1 mulss %xmm0, %xmm1 movss -30 * SIZE(AA), %xmm0 addss %xmm1, %xmm5 movss -24 * SIZE(BB), %xmm1 mulss %xmm0, %xmm1 movss -29 * SIZE(AA), %xmm0 addss %xmm1, %xmm4 movss -20 * SIZE(BB), %xmm1 mulss %xmm0, %xmm1 movss -24 * SIZE(AA), %xmm0 addss %xmm1, %xmm5 movss -0 * SIZE(BB), %xmm1 mulss %xmm2, %xmm3 movss -27 * SIZE(AA), %xmm2 addss %xmm3, %xmm4 movss -12 * SIZE(BB), %xmm3 mulss %xmm2, %xmm3 movss -26 * SIZE(AA), %xmm2 addss %xmm3, %xmm5 movss -8 * SIZE(BB), %xmm3 mulss %xmm2, %xmm3 movss -25 * SIZE(AA), %xmm2 addss %xmm3, %xmm4 movss -4 * SIZE(BB), %xmm3 mulss %xmm2, %xmm3 movss -20 * SIZE(AA), %xmm2 addss %xmm3, %xmm5 movss 16 * SIZE(BB), %xmm3 subl $ -8 * SIZE, AA subl $-32 * SIZE, BB decl %eax jne .L92 ALIGN_4 .L95: movss ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L98 ALIGN_4 .L96: mulss %xmm0, %xmm1 movss -31 * SIZE(AA), %xmm0 addss %xmm1, %xmm4 movss -28 * SIZE(BB), %xmm1 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L96 ALIGN_4 .L98: addss %xmm5, %xmm4 mulss %xmm3, %xmm4 #ifndef TRMMKERNEL movss 0 * SIZE(C1), %xmm0 addss %xmm0, %xmm4 #endif movss %xmm4, 0 * SIZE(C1) ALIGN_4 .L99: addl LDC, C ALIGN_4 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_kernel_8x2_sse.S000066400000000000000000001574351313527062700210540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if !defined(HAVE_SSE) || !defined(HAVE_MMX) #error You have to check your configuration. 
#endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA 16 + STACK + ARGS(%esi) #define STACK_A 20 + STACK + ARGS(%esi) #define STACK_B 24 + STACK + ARGS(%esi) #define STACK_C 28 + STACK + ARGS(%esi) #define STACK_LDC 32 + STACK + ARGS(%esi) #define STACK_OFFT 36 + STACK + ARGS(%esi) #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define BUFFER 128(%esp) #define B %edi #define LDC %ebp #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #define PREFETCHSIZE 48 /* for PIII */ #define AA %edx #define BB %ecx #if !defined(HAVE_SSE2) || defined(OPTERON) #define movsd movlps #endif #ifdef HAVE_SSE2 #define xorps pxor #endif #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ mulps 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm4; \ movaps 0 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm5; \ movaps 4 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ mulps %xmm0, %xmm2; \ mulps 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 8 * SIZE + (address) * SIZE * 2(AA), %xmm0 #define KERNEL2(address) \ mulps %xmm0, %xmm2; \ mulps 12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm4; \ movaps 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm5; \ movaps 12 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ mulps %xmm0, %xmm2; \ mulps 12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 32 * SIZE + (address) * SIZE * 2(AA), %xmm0 #define KERNEL3(address) \ mulps %xmm1, %xmm3; \ mulps 20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm4; \ movaps 16 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm5; \ movaps 20 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ mulps %xmm1, %xmm3; \ mulps 20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 24 * SIZE + (address) * SIZE * 2(AA), %xmm1 #define KERNEL4(address) \ mulps %xmm1, %xmm3; \ mulps 28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm4; \ movaps 24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm5; \ movaps 28 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ mulps %xmm1, %xmm3; \ mulps 28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 48 * SIZE + (address) * SIZE * 2(AA), %xmm1 #define KERNEL5(address) \ mulps %xmm0, %xmm2; \ mulps 36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm4; \ movaps 32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm5; \ movaps 36 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ mulps %xmm0, %xmm2; \ mulps 36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 40 * SIZE + (address) * SIZE * 2(AA), %xmm0 #define KERNEL6(address) \ mulps %xmm0, %xmm2; \ mulps 44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm4; \ movaps 40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm5; \ movaps 44 * SIZE + (address) * SIZE * 
2(AA), %xmm0; \ mulps %xmm0, %xmm2; \ mulps 44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 64 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 64 * SIZE + (address) * SIZE * 2(AA), %xmm0 #define KERNEL7(address) \ mulps %xmm1, %xmm3; \ mulps 52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm4; \ movaps 48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm5; \ movaps 52 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ mulps %xmm1, %xmm3; \ mulps 52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 56 * SIZE + (address) * SIZE * 2(AA), %xmm1 #define KERNEL8(address) \ mulps %xmm1, %xmm3; \ mulps 60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm4; \ movaps 56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm5; \ movaps 60 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ mulps %xmm1, %xmm3; \ mulps 60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 80 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 80 * SIZE + (address) * SIZE * 2(AA), %xmm1 PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp addl $STACK_OFFSET, %esp STACK_TOUCHING movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 movd STACK_A, %mm2 movd STACK_ALPHA, %mm7 movl STACK_B, B movd STACK_C, %mm3 movl STACK_LDC, LDC #ifdef TRMMKERNEL movd STACK_OFFT, %mm4 #endif movd %mm7, 0 * SIZE + ALPHA movd %mm7, 1 * SIZE + ALPHA movd %mm7, 2 * SIZE + ALPHA movd %mm7, 3 * SIZE + ALPHA movd %mm1, K movl %eax, N movd %mm0, M movd %mm2, A movd %mm3, C movl %esi, OLD_STACK #ifdef TRMMKERNEL movd %mm4, OFFSET movd %mm4, KK #ifndef LEFT negl KK #endif #endif leal (, LDC, SIZE), LDC sarl $1, %eax # j = (n >> 1) movl %eax, J jle .L100 ALIGN_2 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ movl K, %eax leal BUFFER, %ecx sarl $2, %eax jle .L03 ALIGN_4 .L02: movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 movss 2 * SIZE(B), %xmm2 movss 3 * SIZE(B), %xmm3 movss 4 * SIZE(B), %xmm4 movss 5 * SIZE(B), %xmm5 movss 6 * SIZE(B), %xmm6 movss 7 * SIZE(B), %xmm7 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 shufps $0, %xmm4, %xmm4 shufps $0, %xmm5, %xmm5 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps %xmm4, 16 * SIZE(%ecx) movaps %xmm5, 20 * SIZE(%ecx) movaps %xmm6, 24 * SIZE(%ecx) movaps %xmm7, 28 * SIZE(%ecx) prefetcht0 104 * SIZE(B) addl $ 8 * SIZE, B addl $32 * SIZE, %ecx decl %eax BRANCH jne .L02 ALIGN_2 .L03: movl K, %eax andl $3, %eax BRANCH jle .L05 ALIGN_2 .L04: movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 addl $2 * SIZE, B shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) addl $8 * SIZE, %ecx decl %eax jne .L04 ALIGN_4 .L05: movl C, %esi # coffset = c movl A, AA # aoffset = a movl M, %ebx sarl $3, %ebx # i = (m >> 2) jle .L30 ALIGN_4 .L10: #ifdef PENTIUM4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 movaps 0 * 
SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif prefetchnta 7 * SIZE(%esi) prefetchnta 7 * SIZE(%esi, %ebp) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $8, %eax #else addl $2, %eax #endif movl %eax, KKK #endif andl $-8, %eax NOBRANCH je .L12 sall $3, %eax .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) cmpl $64 * 1, %eax NOBRANCH jle .L11 KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) cmpl $64 * 2, %eax NOBRANCH jle .L11 KERNEL1(32 * 2) KERNEL2(32 * 2) KERNEL3(32 * 2) KERNEL4(32 * 2) KERNEL5(32 * 2) KERNEL6(32 * 2) KERNEL7(32 * 2) KERNEL8(32 * 2) cmpl $64 * 3, %eax NOBRANCH jle .L11 KERNEL1(32 * 3) KERNEL2(32 * 3) KERNEL3(32 * 3) KERNEL4(32 * 3) KERNEL5(32 * 3) KERNEL6(32 * 3) KERNEL7(32 * 3) KERNEL8(32 * 3) cmpl $64 * 4, %eax NOBRANCH jle .L11 KERNEL1(32 * 4) KERNEL2(32 * 4) KERNEL3(32 * 4) KERNEL4(32 * 4) KERNEL5(32 * 4) KERNEL6(32 * 4) KERNEL7(32 * 4) KERNEL8(32 * 4) cmpl $64 * 5, %eax NOBRANCH jle .L11 KERNEL1(32 * 5) KERNEL2(32 * 5) KERNEL3(32 * 5) KERNEL4(32 * 5) KERNEL5(32 * 5) KERNEL6(32 * 5) KERNEL7(32 * 5) KERNEL8(32 * 5) cmpl $64 * 6, %eax NOBRANCH jle .L11 KERNEL1(32 * 6) KERNEL2(32 * 6) KERNEL3(32 * 6) KERNEL4(32 * 6) KERNEL5(32 * 6) KERNEL6(32 * 6) KERNEL7(32 * 6) KERNEL8(32 * 6) cmpl $64 * 7, %eax NOBRANCH jle .L11 KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) addl $64 * 8 * SIZE, AA addl $64 * 8 * SIZE, BB subl $64 * 8, %eax BRANCH jg .L1X .L11: leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #else #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif prefetchnta 8 * SIZE(%esi) prefetchnta 8 * SIZE(%esi, %ebp) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $8, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L12 ALIGN_2 .L11: #ifdef CORE_KATMAI prefetcht0 PREFETCHSIZE * SIZE(AA) #endif mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 0 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * 
SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 16 * SIZE(AA), %xmm0 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulps %xmm1, %xmm3 mulps 12 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 8 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 12 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 12 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 24 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 24 * SIZE(AA), %xmm1 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm0, %xmm2 mulps 20 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 20 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 20 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 24) * SIZE(AA) #endif mulps %xmm1, %xmm3 mulps 28 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 28 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 40 * SIZE(AA), %xmm1 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 32) * SIZE(AA) #endif mulps %xmm0, %xmm2 mulps 36 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 36 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 36 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 48 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 48 * SIZE(AA), %xmm0 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 40) * SIZE(AA) #endif mulps %xmm1, %xmm3 mulps 44 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 44 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 44 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 56 * SIZE(AA), %xmm1 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 48) * SIZE(AA) #endif mulps %xmm0, %xmm2 mulps 52 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 48 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 52 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 52 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 64 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 64 * SIZE(AA), %xmm0 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 56) * SIZE(AA) #endif mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 60 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 72 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 72 * SIZE(AA), %xmm1 addl $64 * SIZE, BB addl $64 * SIZE, AA decl %eax jne .L11 ALIGN_2 #endif .L12: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L14 .L13: movaps 4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 0 * SIZE(BB), %xmm2 mulps %xmm0, %xmm1 movaps 4 * SIZE(AA), %xmm0 addps %xmm1, %xmm5 movaps 4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm1 movaps 8 * SIZE(AA), %xmm0 addps %xmm1, %xmm7 addl $8 * SIZE, AA addl $8 * SIZE, BB subl $1, %eax jg .L13 ALIGN_4 .L14: mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 mulps %xmm3, %xmm6 mulps %xmm3, %xmm7 #ifndef TRMMKERNEL shufps $0xe4, %xmm4, %xmm4 shufps $0xe4, %xmm5, %xmm5 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 4 * SIZE(%esi), %xmm1 movhps 6 * SIZE(%esi), %xmm1 shufps $0xe4, %xmm6, %xmm6 shufps $0xe4, %xmm7, %xmm7 movsd 0 * SIZE(%esi, LDC), %xmm2 movhps 2 * SIZE(%esi, LDC), %xmm2 movsd 4 * SIZE(%esi, LDC), %xmm3 movhps 6 * SIZE(%esi, LDC), %xmm3 addps 
%xmm0, %xmm4 addps %xmm1, %xmm6 addps %xmm2, %xmm5 addps %xmm3, %xmm7 #endif movsd %xmm4, 0 * SIZE(%esi) movhps %xmm4, 2 * SIZE(%esi) movsd %xmm6, 4 * SIZE(%esi) movhps %xmm6, 6 * SIZE(%esi) movsd %xmm5, 0 * SIZE(%esi, LDC) movhps %xmm5, 2 * SIZE(%esi, LDC) movsd %xmm7, 4 * SIZE(%esi, LDC) movhps %xmm7, 6 * SIZE(%esi, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $8, KK #endif addl $8 * SIZE, %esi BRANCH decl %ebx # i -- jg .L10 ALIGN_2 .L30: movl M, %ebx andl $7, %ebx jle .L99 testl $4, %ebx jle .L50 #if (L1_DATA_LINESIZE == 64) #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L32 ALIGN_2 .L31: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 8 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 20 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps 12 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 28 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 48 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 mulps 36 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 40 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 20 * SIZE(AA), %xmm1 mulps %xmm1, %xmm2 mulps 44 * SIZE(BB), %xmm1 addps %xmm2, %xmm6 movaps 64 * SIZE(BB), %xmm2 addps %xmm1, %xmm7 movaps 24 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 52 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 80 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L31 ALIGN_2 #else #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 
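/* Added comment (sketch of the register roles, inferred from the surrounding loop): %xmm4-%xmm7 are the accumulators for this 4x2 tile of C; they are cleared here with xorps, summed with addps inside the K loop, folded together and scaled by ALPHA only after the loop exits. */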
xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L32 ALIGN_2 .L31: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 12 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 24 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movaps 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 mulps 20 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 12 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 28 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 24 * SIZE(AA), %xmm1 mulps %xmm0, %xmm2 mulps 36 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 48 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 20 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 44 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 56 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 mulps 52 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 64 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 72 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 40 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L31 ALIGN_2 #endif .L32: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L34 .L33: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L33 ALIGN_4 .L34: addps %xmm6, %xmm4 addps %xmm7, %xmm5 mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 addps %xmm0, %xmm4 movsd 0 * SIZE(%esi, LDC), %xmm0 movhps 2 * SIZE(%esi, LDC), %xmm0 addps %xmm0, %xmm5 #endif #ifdef HAVE_SSE2 movsd %xmm4, 0 * SIZE(%esi) unpckhpd %xmm4, %xmm4 movsd %xmm4, 2 * SIZE(%esi) movsd %xmm5, 0 * SIZE(%esi, LDC) unpckhpd %xmm5, %xmm5 movsd %xmm5, 2 * SIZE(%esi, LDC) #else movlps %xmm4, 0 * SIZE(%esi) movhps %xmm4, 2 * SIZE(%esi) movlps %xmm5, 0 * SIZE(%esi, LDC) movhps %xmm5, 2 * SIZE(%esi, LDC) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $4 * SIZE, %esi ALIGN_2 .L50: testl $2, %ebx jle .L70 #if (L1_DATA_LINESIZE == 64) #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif #ifndef 
TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L52 ALIGN_2 .L51: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L51 ALIGN_2 #else #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 movsd 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L52 ALIGN_2 .L51: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 16 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 12 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 20 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 40 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 48 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 44 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, 
%xmm2 addps %xmm2, %xmm4 movaps 52 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 14 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 72 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L51 ALIGN_2 #endif .L52: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L54 .L53: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L53 ALIGN_4 .L54: addps %xmm6, %xmm4 addps %xmm7, %xmm5 mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 #ifndef TRMMKERNEL #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(%esi), %xmm0 addps %xmm0, %xmm4 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(%esi, LDC), %xmm0 addps %xmm0, %xmm5 #endif movlps %xmm4, 0 * SIZE(%esi) movlps %xmm5, 0 * SIZE(%esi, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, %esi ALIGN_2 .L70: testl $1, %ebx jle .L99 #if (L1_DATA_LINESIZE == 64) #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movss 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movss 16 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB /* because it's doubled */ movss 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movss 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L72 ALIGN_2 .L71: mulss %xmm0, %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 20 * SIZE(BB), %xmm0 addss %xmm3, %xmm4 movss 24 * SIZE(BB), %xmm3 addss %xmm0, %xmm5 movss 3 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 28 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 48 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 mulss 36 * SIZE(BB), %xmm1 addss %xmm2, %xmm4 movss 40 * SIZE(BB), %xmm2 addss %xmm1, %xmm5 movss 5 * SIZE(AA), %xmm1 mulss %xmm1, %xmm2 mulss 44 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 64 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 52 * SIZE(BB), %xmm1 addss %xmm3, %xmm4 movss 56 * SIZE(BB), %xmm3 addss %xmm1, %xmm5 movss 7 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 60 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 80 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $64 * 
SIZE, BB decl %eax jne .L71 ALIGN_2 #else #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movss 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movss 8 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB /* because it's doubled */ movss 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movss 8 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L72 ALIGN_2 .L71: mulss %xmm0, %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 16 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 12 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 24 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm2 mulss 20 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 3 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 28 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 40 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 mulss 36 * SIZE(BB), %xmm1 addss %xmm2, %xmm4 movss 48 * SIZE(BB), %xmm2 addss %xmm1, %xmm5 movss 5 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 44 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 56 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm2 mulss 52 * SIZE(BB), %xmm1 addss %xmm2, %xmm4 movss 64 * SIZE(BB), %xmm2 addss %xmm1, %xmm5 movss 7 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 60 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 72 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L71 ALIGN_2 #endif .L72: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movss ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L74 .L73: mulss %xmm0, %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L73 ALIGN_4 .L74: addss %xmm6, %xmm4 addss %xmm7, %xmm5 mulss %xmm3, %xmm4 mulss %xmm3, %xmm5 #ifndef TRMMKERNEL addss 0 * SIZE(%esi), %xmm4 addss 0 * SIZE(%esi, LDC), %xmm5 #endif movss %xmm4, 0 * SIZE(%esi) movss %xmm5, 0 * SIZE(%esi, LDC) addl $1 * SIZE, %esi #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_2 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax addl %eax, C # c += 2 * ldc BRANCH decl J # j -- jg .L01 ALIGN_2 .L100: movl N, %eax testl $1, %eax jle .L999 ALIGN_2 .L101: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ movl K, %eax leal BUFFER, %ecx sarl $3, %eax jle .L103 ALIGN_4 .L102: prefetchnta 96 * SIZE(B) movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 movss 2 * SIZE(B), %xmm2 movss 
3 * SIZE(B), %xmm3 movss 4 * SIZE(B), %xmm4 movss 5 * SIZE(B), %xmm5 movss 6 * SIZE(B), %xmm6 movss 7 * SIZE(B), %xmm7 addl $ 8 * SIZE, B shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 shufps $0, %xmm4, %xmm4 shufps $0, %xmm5, %xmm5 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps %xmm4, 16 * SIZE(%ecx) movaps %xmm5, 20 * SIZE(%ecx) movaps %xmm6, 24 * SIZE(%ecx) movaps %xmm7, 28 * SIZE(%ecx) addl $32 * SIZE, %ecx decl %eax BRANCH jne .L102 ALIGN_2 .L103: movl K, %eax andl $7, %eax BRANCH jle .L105 ALIGN_2 .L104: movss 0 * SIZE(B), %xmm0 addl $1 * SIZE, B shufps $0, %xmm0, %xmm0 movaps %xmm0, 0 * SIZE(%ecx) addl $4 * SIZE, %ecx decl %eax jne .L104 ALIGN_4 .L105: movl C, %esi # coffset = c movl A, AA # aoffset = a movl M, %ebx sarl $3, %ebx # i = (m >> 2) jle .L130 ALIGN_4 .L110: #if (L1_DATA_LINESIZE == 64) #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $8, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L112 ALIGN_2 .L111: mulps %xmm2, %xmm0 mulps 4 * SIZE(AA), %xmm2 addps %xmm0, %xmm4 movaps 8 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 4 * SIZE(BB), %xmm2 mulps %xmm2, %xmm0 mulps 12 * SIZE(AA), %xmm2 addps %xmm0, %xmm6 movaps 32 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 8 * SIZE(BB), %xmm2 mulps %xmm2, %xmm1 mulps 20 * SIZE(AA), %xmm2 addps %xmm1, %xmm4 movaps 24 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 12 * SIZE(BB), %xmm2 mulps %xmm2, %xmm1 mulps 28 * SIZE(AA), %xmm2 addps %xmm1, %xmm6 movaps 48 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm3, %xmm0 mulps 36 * SIZE(AA), %xmm3 addps %xmm0, %xmm4 movaps 40 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 20 * SIZE(BB), %xmm3 mulps %xmm3, %xmm0 mulps 44 * SIZE(AA), %xmm3 addps %xmm0, %xmm6 movaps 64 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 24 * SIZE(BB), %xmm3 mulps %xmm3, %xmm1 mulps 52 * SIZE(AA), %xmm3 addps %xmm1, %xmm4 movaps 56 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 28 * SIZE(BB), %xmm3 mulps %xmm3, %xmm1 mulps 60 * SIZE(AA), %xmm3 addps %xmm1, %xmm6 movaps 80 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 addl $64 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L111 ALIGN_2 #else #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, 
%eax leal (, %eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $8, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L112 ALIGN_2 .L111: mulps %xmm2, %xmm0 mulps 4 * SIZE(AA), %xmm2 addps %xmm0, %xmm4 movaps 16 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 4 * SIZE(BB), %xmm2 mulps %xmm2, %xmm1 mulps 12 * SIZE(AA), %xmm2 addps %xmm1, %xmm6 movaps 24 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 16 * SIZE(BB), %xmm2 mulps %xmm3, %xmm0 mulps 20 * SIZE(AA), %xmm3 addps %xmm0, %xmm4 movaps 32 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 12 * SIZE(BB), %xmm3 mulps %xmm3, %xmm1 mulps 28 * SIZE(AA), %xmm3 addps %xmm1, %xmm6 movaps 40 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 24 * SIZE(BB), %xmm3 mulps %xmm2, %xmm0 mulps 36 * SIZE(AA), %xmm2 addps %xmm0, %xmm4 movaps 48 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 20 * SIZE(BB), %xmm2 mulps %xmm2, %xmm1 mulps 44 * SIZE(AA), %xmm2 addps %xmm1, %xmm6 movaps 56 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm3, %xmm0 mulps 52 * SIZE(AA), %xmm3 addps %xmm0, %xmm4 movaps 64 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 28 * SIZE(BB), %xmm3 mulps %xmm3, %xmm1 mulps 60 * SIZE(AA), %xmm3 addps %xmm1, %xmm6 movaps 72 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 40 * SIZE(BB), %xmm3 addl $64 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L111 ALIGN_2 #endif .L112: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L114 .L113: movaps 0 * SIZE(BB), %xmm2 movaps 0 * SIZE(AA), %xmm0 mulps %xmm2, %xmm0 addps %xmm0, %xmm4 mulps 4 * SIZE(AA), %xmm2 addps %xmm2, %xmm5 addl $8 * SIZE, AA addl $4 * SIZE, BB subl $1, %eax jg .L113 ALIGN_4 .L114: addps %xmm6, %xmm4 addps %xmm7, %xmm5 mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 addps %xmm0, %xmm4 movsd 4 * SIZE(%esi), %xmm0 movhps 6 * SIZE(%esi), %xmm0 addps %xmm0, %xmm5 #endif #ifdef HAVE_SSE2 movsd %xmm4, 0 * SIZE(%esi) unpckhpd %xmm4, %xmm4 movsd %xmm4, 2 * SIZE(%esi) movsd %xmm5, 4 * SIZE(%esi) unpckhpd %xmm5, %xmm5 movsd %xmm5, 6 * SIZE(%esi) #else movlps %xmm4, 0 * SIZE(%esi) movhps %xmm4, 2 * SIZE(%esi) movlps %xmm5, 4 * SIZE(%esi) movhps %xmm5, 6 * SIZE(%esi) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $8, KK #endif addl $8 * SIZE, %esi BRANCH decl %ebx # i -- jg .L110 ALIGN_2 .L130: movl M, %ebx andl $7, %ebx jle .L999 testl $4, %ebx jle .L150 #if (L1_DATA_LINESIZE == 64) #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax 
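/* Added comment (TRMMKERNEL offset handling, inferred from the leal sequence): KK is scaled and added to the packed A and B pointers so this tile skips the first KK steps of the inner product that the triangular offset has already accounted for. */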
leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L132 ALIGN_2 .L131: mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 mulps 4 * SIZE(BB), %xmm0 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 8 * SIZE(AA), %xmm0 mulps 8 * SIZE(BB), %xmm0 addps %xmm0, %xmm6 movaps 12 * SIZE(AA), %xmm0 mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 mulps %xmm1, %xmm3 movaps 20 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 mulps 20 * SIZE(BB), %xmm1 movaps 48 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 24 * SIZE(AA), %xmm1 mulps 24 * SIZE(BB), %xmm1 addps %xmm1, %xmm6 movaps 28 * SIZE(AA), %xmm1 mulps 28 * SIZE(BB), %xmm1 addps %xmm1, %xmm7 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L131 ALIGN_2 #else #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L132 ALIGN_2 .L131: mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 mulps 4 * SIZE(BB), %xmm0 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm3 movaps 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm6 mulps 12 * SIZE(BB), %xmm1 movaps 24 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 24 * SIZE(AA), %xmm1 mulps %xmm0, %xmm2 movaps 20 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 mulps 20 * SIZE(BB), %xmm0 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 mulps %xmm1, %xmm3 movaps 28 * SIZE(AA), %xmm1 addps %xmm3, %xmm6 mulps 28 * SIZE(BB), %xmm1 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 40 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L131 ALIGN_2 #endif .L132: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L134 .L133: movaps 0 * SIZE(BB), %xmm2 movaps 0 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L133 ALIGN_4 .L134: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 mulps %xmm3, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 addps %xmm0, %xmm4 #endif movlps %xmm4, 0 * SIZE(%esi) movhps %xmm4, 2 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) 
&& defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $4 * SIZE, %esi ALIGN_2 .L150: testl $2, %ebx jle .L170 #if (L1_DATA_LINESIZE == 64) #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L152 ALIGN_2 .L151: mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 16 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 movsd 10 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L151 ALIGN_2 #else #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L152 ALIGN_2 .L151: mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 16 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 movsd 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm6 movaps 12 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 24 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm4 movaps 20 * 
SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 32 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 40 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L151 ALIGN_2 #endif .L152: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L154 .L153: mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L153 ALIGN_4 .L154: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 mulps %xmm3, %xmm4 #ifndef TRMMKERNEL #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(%esi), %xmm0 addps %xmm0, %xmm4 #endif movlps %xmm4, 0 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, %esi ALIGN_2 .L170: testl $1, %ebx jle .L999 #if (L1_DATA_LINESIZE == 64) #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movss 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movss 16 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB /* because it's doubled */ movss 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movss 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L172 ALIGN_2 .L171: mulss %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 mulss 4 * SIZE(BB), %xmm0 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 2 * SIZE(AA), %xmm0 mulss 8 * SIZE(BB), %xmm0 addss %xmm0, %xmm6 movss 3 * SIZE(AA), %xmm0 mulss 12 * SIZE(BB), %xmm0 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm3 movss 5 * SIZE(AA), %xmm1 addss %xmm3, %xmm4 mulss 20 * SIZE(BB), %xmm1 movss 48 * SIZE(BB), %xmm3 addss %xmm1, %xmm5 movss 6 * SIZE(AA), %xmm1 mulss 24 * SIZE(BB), %xmm1 addss %xmm1, %xmm6 movss 7 * SIZE(AA), %xmm1 mulss 28 * SIZE(BB), %xmm1 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L171 ALIGN_2 #else #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movss 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movss 8 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB /* because it's doubled */ movss 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movss 8 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movss 
4 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L172 ALIGN_2 .L171: mulss %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 mulss 4 * SIZE(BB), %xmm0 movss 16 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 movss 3 * SIZE(AA), %xmm0 addss %xmm3, %xmm6 mulss 12 * SIZE(BB), %xmm0 movss 24 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 movss 5 * SIZE(AA), %xmm1 addss %xmm2, %xmm4 mulss 20 * SIZE(BB), %xmm1 movss 32 * SIZE(BB), %xmm2 addss %xmm1, %xmm5 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 movss 7 * SIZE(AA), %xmm1 addss %xmm3, %xmm6 mulss 28 * SIZE(BB), %xmm1 movss 40 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L171 ALIGN_2 #endif .L172: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movss ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L174 .L173: movss 0 * SIZE(AA), %xmm0 movss 0 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm4 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L173 ALIGN_4 .L174: addss %xmm5, %xmm4 addss %xmm7, %xmm6 addss %xmm6, %xmm4 mulss %xmm3, %xmm4 #ifndef TRMMKERNEL addss 0 * SIZE(%esi), %xmm4 #endif movss %xmm4, 0 * SIZE(%esi) ALIGN_2 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_2 EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_ncopy_2.S000066400000000000000000000147671313527062700175720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 8 #define J 0 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define A 12 + STACK + ARGS(%esp) #define LDA 16 + STACK + ARGS(%esp) #define B 20 + STACK + ARGS(%esp) PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl B, %esi # ESI : offsetB movl M, %edi movl A, %ebx # EBX : offsetA movl LDA, %edx leal (%ebx, %edx, SIZE), %ebp addl %edx, %edx subl %edi, %edx # edx = 2 * lda - m movl N, %eax sarl $1, %eax movl %eax, J je .L20 ALIGN_3 .L21: #if 0 movl %edi, %ecx # ECX : I(Counter of M) andl $-8, %ecx leal (%ebx, %ecx, SIZE), %ebx leal (%ebp, %ecx, SIZE), %ebp negl %ecx ALIGN_3 .Blocking1: MMXLOAD (%ebx, %ecx, SIZE), %mm0 MMXLOAD (%ebp, %ecx, SIZE), %mm1 addl $8, %ecx jl .Blocking1 movl %edi, %ecx # ECX : I(Counter of M) andl $-8, %ecx negl %ecx leal (%ebx, %ecx, SIZE), %ebx leal (%ebp, %ecx, SIZE), %ebp #endif movl %edi, %ecx # ECX : I(Counter of M) sarl $2, %ecx je .L24 ALIGN_3 .L25: #ifdef HAVE_MMX MMXLOAD 0 * SIZE(%ebx), %mm0 MMXLOAD 0 * SIZE(%ebp), %mm1 MMXLOAD 1 * SIZE(%ebx), %mm2 MMXLOAD 1 * SIZE(%ebp), %mm3 MMXLOAD 2 * SIZE(%ebx), %mm4 MMXLOAD 2 * SIZE(%ebp), %mm5 MMXLOAD 3 * SIZE(%ebx), %mm6 MMXLOAD 3 * SIZE(%ebp), %mm7 MMXSTORE %mm0, 0 * SIZE(%esi) MMXSTORE %mm1, 1 * SIZE(%esi) MMXSTORE %mm2, 2 * SIZE(%esi) MMXSTORE %mm3, 3 * SIZE(%esi) MMXSTORE %mm4, 4 * SIZE(%esi) MMXSTORE %mm5, 5 * SIZE(%esi) MMXSTORE %mm6, 6 * SIZE(%esi) MMXSTORE %mm7, 7 * SIZE(%esi) #else FLD 3 * SIZE(%ebp) FLD 3 * SIZE(%ebx) FLD 2 * SIZE(%ebp) FLD 2 * SIZE(%ebx) FLD 1 * SIZE(%ebp) FLD 1 * SIZE(%ebx) FLD 0 * SIZE(%ebp) FLD 0 * SIZE(%ebx) FST 0 * SIZE(%esi) FST 1 * SIZE(%esi) FST 2 * SIZE(%esi) FST 3 * SIZE(%esi) FST 4 * SIZE(%esi) FST 5 * SIZE(%esi) FST 6 * SIZE(%esi) FST 7 * SIZE(%esi) #endif addl $4 * SIZE, %ebx addl $4 * SIZE, %ebp addl $8 * SIZE, %esi decl %ecx jne .L25 ALIGN_3 .L24: movl %edi, %ecx andl $3, %ecx jle .L30 ALIGN_3 .L31: #ifdef HAVE_MMX MMXLOAD 0 * SIZE(%ebx), %mm0 MMXLOAD 0 * SIZE(%ebp), %mm1 MMXSTORE %mm0, 0 * SIZE(%esi) MMXSTORE %mm1, 1 * SIZE(%esi) #else FLD 0 * SIZE(%ebp) FLD 0 * SIZE(%ebx) FST 0 * SIZE(%esi) FST 1 * SIZE(%esi) #endif addl $1 * SIZE, %ebx addl $1 * SIZE, %ebp addl $2 * SIZE, %esi decl %ecx jne .L31 ALIGN_3 .L30: leal (%ebx, %edx, SIZE), %ebx leal (%ebp, %edx, SIZE), %ebp decl J jne .L21 ALIGN_3 .L20: movl N, %eax andl $1,%eax jle .L38 ALIGN_3 .L39: movl %edi, %ecx sarl $3, %ecx je .L42 ALIGN_3 .L43: #ifdef HAVE_MMX MMXLOAD 0 * SIZE(%ebx), %mm0 MMXLOAD 1 * SIZE(%ebx), %mm1 MMXLOAD 2 * SIZE(%ebx), %mm2 MMXLOAD 3 * SIZE(%ebx), %mm3 MMXLOAD 4 * SIZE(%ebx), %mm4 MMXLOAD 5 * SIZE(%ebx), %mm5 MMXLOAD 6 * SIZE(%ebx), %mm6 MMXLOAD 7 * SIZE(%ebx), %mm7 MMXSTORE %mm0, 0 * SIZE(%esi) MMXSTORE %mm1, 1 * SIZE(%esi) MMXSTORE %mm2, 2 * SIZE(%esi) MMXSTORE %mm3, 3 * SIZE(%esi) MMXSTORE %mm4, 4 * SIZE(%esi) MMXSTORE %mm5, 5 * SIZE(%esi) MMXSTORE %mm6, 6 * SIZE(%esi) MMXSTORE %mm7, 7 * SIZE(%esi) #else FLD 7 * SIZE(%ebx) FLD 6 * SIZE(%ebx) FLD 5 * SIZE(%ebx) FLD 4 * SIZE(%ebx) FLD 3 * SIZE(%ebx) FLD 2 * SIZE(%ebx) FLD 1 * SIZE(%ebx) FLD 0 * SIZE(%ebx) FST 0 * SIZE(%esi) FST 1 * SIZE(%esi) FST 2 * SIZE(%esi) FST 3 * SIZE(%esi) FST 
4 * SIZE(%esi) FST 5 * SIZE(%esi) FST 6 * SIZE(%esi) FST 7 * SIZE(%esi) #endif addl $8 * SIZE, %ebx addl $8 * SIZE, %esi decl %ecx jne .L43 ALIGN_3 .L42: movl %edi, %ecx andl $7, %ecx jle .L38 ALIGN_3 .L49: #ifdef HAVE_MMX MMXLOAD 0 * SIZE(%ebx), %mm0 MMXSTORE %mm0, 0 * SIZE(%esi) #else FLD 0 * SIZE(%ebx) FST 0 * SIZE(%esi) #endif addl $1 * SIZE, %ebx addl $1 * SIZE, %esi decl %ecx jne .L49 ALIGN_3 .L38: EMMS popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_ncopy_2_sse.S000066400000000000000000000122011313527062700204210ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define RPREFETCHSIZE 12 #define WPREFETCHSIZE (RPREFETCHSIZE * 2) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht2 #define STACK 16 #define ARGS 0 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define ARG_A 12 + STACK + ARGS(%esp) #define ARG_LDA 16 + STACK + ARGS(%esp) #define ARG_B 20 + STACK + ARGS(%esp) #define A %eax #define B %ebx #define LDA %ebp #define A1 %ecx #define A2 %edx #define I %esi #define J %edi PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_A, A movl ARG_B, B movl ARG_LDA, LDA sall $BASE_SHIFT, LDA movl N, J sarl $1, J je .L20 ALIGN_3 .L10: movl A, A1 leal (A, LDA, 1), A2 leal (A, LDA, 2), A movl M, I sarl $2, I je .L15 ALIGN_3 .L12: PREFETCH RPREFETCHSIZE * SIZE(A1) movsd 0 * SIZE(A1) , %xmm0 movhps 0 * SIZE(A2) , %xmm0 movsd 1 * SIZE(A1) , %xmm1 movhps 1 * SIZE(A2) , %xmm1 PREFETCH RPREFETCHSIZE * SIZE(A2) movsd 2 * SIZE(A1) , %xmm2 movhps 2 * SIZE(A2) , %xmm2 movsd 3 * SIZE(A1) , %xmm3 movhps 3 * SIZE(A2) , %xmm3 PREFETCHW (RPREFETCHSIZE + 0) * SIZE(B) movaps %xmm0, 0 * SIZE(B) movaps %xmm1, 2 * SIZE(B) movaps %xmm2, 4 * SIZE(B) movaps %xmm3, 6 * SIZE(B) addl $ 4 * SIZE, A1 addl $ 4 * SIZE, A2 subl $-8 * SIZE, B decl I jne .L12 ALIGN_3 .L15: testl $2, M jle .L16 movsd 0 * SIZE(A1) , %xmm0 movhps 0 * SIZE(A2) , %xmm0 movsd 1 * SIZE(A1) , %xmm1 movhps 1 * SIZE(A2) , %xmm1 movaps %xmm0, 0 * SIZE(B) movaps %xmm1, 2 * SIZE(B) addl $ 2 * SIZE, A1 addl $ 2 * SIZE, A2 subl $-4 * SIZE, B ALIGN_4 .L16: testl $1, M jle .L19 movsd 0 * SIZE(A1) , %xmm0 movhps 0 * SIZE(A2) , %xmm0 movaps %xmm0, 0 * SIZE(B) subl $-2 * SIZE, B ALIGN_4 .L19: decl J jne .L10 ALIGN_3 .L20: testl $1, N jle .L999 movl A, A1 movl M, I sarl $2, I je .L25 ALIGN_3 .L22: PREFETCH RPREFETCHSIZE * SIZE(A1) movsd 0 * SIZE(A1), %xmm0 movhps 1 * SIZE(A1), %xmm0 movsd 2 * SIZE(A1), %xmm1 movhps 3 * SIZE(A1), %xmm1 PREFETCHW (RPREFETCHSIZE + 0) * SIZE(B) movaps %xmm0, 0 * SIZE(B) movaps %xmm1, 2 * SIZE(B) addl $ 4 * SIZE, A1 subl $-4 * SIZE, B decl I jne .L22 ALIGN_3 .L25: testl $2, M jle .L26 movsd 0 * SIZE(A1), %xmm0 movhps 1 * SIZE(A1), %xmm0 movaps %xmm0, 0 * SIZE(B) addl $ 2 * SIZE, A1 subl $-2 * SIZE, B ALIGN_4 .L26: testl $1, M jle .L999 movsd 0 * SIZE(A1), %xmm0 movsd %xmm0, 0 * SIZE(B) ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_ncopy_4_sse.S000066400000000000000000000157461313527062700204440ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define RPREFETCHSIZE 12 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht2 #define STACK 16 #define ARGS 0 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define ARG_A 12 + STACK + ARGS(%esp) #define ARG_LDA 16 + STACK + ARGS(%esp) #define ARG_B 20 + STACK + ARGS(%esp) #define A %eax #define B %ebx #define LDA %ebp #define A1 %ecx #define A2 %edx #define I %esi #define J %edi PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_A, A movl ARG_B, B movl ARG_LDA, LDA sall $BASE_SHIFT, LDA movl N, J sarl $2, J je .L20 ALIGN_3 .L10: movl A, A1 leal (A, LDA, 2), A2 leal (A, LDA, 4), A movl M, I sarl $2, I je .L15 ALIGN_3 .L12: PREFETCH RPREFETCHSIZE * SIZE(A1) movsd 0 * SIZE(A1) , %xmm0 movhps 0 * SIZE(A1, LDA), %xmm0 movsd 0 * SIZE(A2) , %xmm1 movhps 0 * SIZE(A2, LDA), %xmm1 PREFETCH RPREFETCHSIZE * SIZE(A1, LDA) movsd 1 * SIZE(A1) , %xmm2 movhps 1 * SIZE(A1, LDA), %xmm2 movsd 1 * SIZE(A2) , %xmm3 movhps 1 * SIZE(A2, LDA), %xmm3 PREFETCH RPREFETCHSIZE * SIZE(A2) movsd 2 * SIZE(A1) , %xmm4 movhps 2 * SIZE(A1, LDA), %xmm4 movsd 2 * SIZE(A2) , %xmm5 movhps 2 * SIZE(A2, LDA), %xmm5 PREFETCH RPREFETCHSIZE * SIZE(A2, LDA) movsd 3 * SIZE(A1) , %xmm6 movhps 3 * SIZE(A1, LDA), %xmm6 movsd 3 * SIZE(A2) , %xmm7 movhps 3 * SIZE(A2, LDA), %xmm7 PREFETCHW (RPREFETCHSIZE + 0) * SIZE(B) movaps %xmm0, 0 * SIZE(B) movaps %xmm1, 2 * SIZE(B) movaps %xmm2, 4 * SIZE(B) movaps %xmm3, 6 * SIZE(B) PREFETCHW (RPREFETCHSIZE + 8) * SIZE(B) movaps %xmm4, 8 * SIZE(B) movaps %xmm5, 10 * SIZE(B) movaps %xmm6, 12 * SIZE(B) movaps %xmm7, 14 * SIZE(B) addl $ 4 * SIZE, A1 addl $ 4 * SIZE, A2 subl $-16 * SIZE, B decl I jne .L12 ALIGN_3 .L15: testl $2, M jle .L16 movsd 0 * SIZE(A1) , %xmm0 movhps 0 * SIZE(A1, LDA), %xmm0 movsd 0 * SIZE(A2) , %xmm1 movhps 0 * SIZE(A2, LDA), %xmm1 movsd 1 * SIZE(A1) , %xmm2 movhps 1 * SIZE(A1, LDA), %xmm2 movsd 1 * SIZE(A2) , %xmm3 movhps 1 * SIZE(A2, LDA), %xmm3 movaps %xmm0, 0 * SIZE(B) movaps %xmm1, 2 * SIZE(B) movaps %xmm2, 4 * SIZE(B) movaps %xmm3, 6 * SIZE(B) addl $ 2 * SIZE, A1 addl $ 2 * SIZE, A2 subl $-8 * SIZE, B ALIGN_4 .L16: testl $1, M jle .L19 movsd 0 * SIZE(A1) , %xmm0 movhps 0 * SIZE(A1, LDA), %xmm0 movsd 0 * SIZE(A2) , %xmm1 movhps 0 * SIZE(A2, LDA), %xmm1 movaps %xmm0, 0 * SIZE(B) movaps %xmm1, 2 * SIZE(B) subl $-4 * SIZE, B ALIGN_4 .L19: decl J jne .L10 ALIGN_3 .L20: testl $2, N jle .L30 movl A, A1 leal (A, LDA, 2), A movl M, I sarl $2, I je .L25 ALIGN_3 .L22: PREFETCH RPREFETCHSIZE * SIZE(A1) movsd 0 * SIZE(A1) , %xmm0 movhps 0 * SIZE(A1, LDA), %xmm0 movsd 1 * 
SIZE(A1) , %xmm1 movhps 1 * SIZE(A1, LDA), %xmm1 PREFETCH RPREFETCHSIZE * SIZE(A1, LDA) movsd 2 * SIZE(A1) , %xmm2 movhps 2 * SIZE(A1, LDA), %xmm2 movsd 3 * SIZE(A1) , %xmm3 movhps 3 * SIZE(A1, LDA), %xmm3 PREFETCHW (RPREFETCHSIZE + 0) * SIZE(B) movaps %xmm0, 0 * SIZE(B) movaps %xmm1, 2 * SIZE(B) movaps %xmm2, 4 * SIZE(B) movaps %xmm3, 6 * SIZE(B) addl $ 4 * SIZE, A1 subl $-8 * SIZE, B decl I jne .L22 ALIGN_3 .L25: testl $2, M jle .L26 movsd 0 * SIZE(A1) , %xmm0 movhps 0 * SIZE(A1, LDA), %xmm0 movsd 1 * SIZE(A1) , %xmm1 movhps 1 * SIZE(A1, LDA), %xmm1 movaps %xmm0, 0 * SIZE(B) movaps %xmm1, 2 * SIZE(B) addl $ 2 * SIZE, A1 addl $ 2 * SIZE, A2 subl $-4 * SIZE, B ALIGN_4 .L26: testl $1, M jle .L30 movsd 0 * SIZE(A1) , %xmm0 movhps 0 * SIZE(A1, LDA), %xmm0 movaps %xmm0, 0 * SIZE(B) subl $-2 * SIZE, B ALIGN_4 .L30: testl $1, N jle .L999 movl A, A1 movl M, I sarl $2, I je .L35 ALIGN_3 .L32: PREFETCH RPREFETCHSIZE * SIZE(A1) movsd 0 * SIZE(A1), %xmm0 movhps 1 * SIZE(A1), %xmm0 movsd 2 * SIZE(A1), %xmm1 movhps 3 * SIZE(A1), %xmm1 PREFETCHW (RPREFETCHSIZE + 0) * SIZE(B) movaps %xmm0, 0 * SIZE(B) movaps %xmm1, 2 * SIZE(B) addl $ 4 * SIZE, A1 subl $-4 * SIZE, B decl I jne .L32 ALIGN_3 .L35: testl $2, M jle .L36 movsd 0 * SIZE(A1), %xmm0 movhps 1 * SIZE(A1), %xmm0 movaps %xmm0, 0 * SIZE(B) addl $ 2 * SIZE, A1 subl $-2 * SIZE, B ALIGN_4 .L36: testl $1, M jle .L999 movsd 0 * SIZE(A1), %xmm0 movsd %xmm0, 0 * SIZE(B) ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_tcopy_2.S000066400000000000000000000153131313527062700175640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 8 #define J 0 + STACK(%esp) #define BOFFSET2 4 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define A 12 + STACK + ARGS(%esp) #define LDA 16 + STACK + ARGS(%esp) #define B 20 + STACK + ARGS(%esp) PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl A, %ebp movl B, %edi movl M, %ebx movl N, %eax andl $-2, %eax imull %ebx, %eax # m * ( n & ~1) leal (%edi,%eax,SIZE), %eax # boffset2 = b + m * (n & ~1) movl %eax, BOFFSET2 movl M, %esi #ifdef DOUBLE sall $4,%esi #else sall $3,%esi #endif sarl $1, %ebx # if !(m & 1) goto L28 movl %ebx, J jle .L28 ALIGN_4 .L39: movl %ebp, %edx # aoffset1 = a movl LDA, %eax movl N, %ebx leal (%ebp, %eax,SIZE), %ecx # aoffset2 = a + lda leal (%ecx, %eax,SIZE), %ebp # aoffset += 2 * lda movl %edi, %eax # boffset1 = b_offset addl $4 * SIZE, %edi # boffset += 4 sarl $2, %ebx jle .L32 ALIGN_4 .L36: #ifdef HAVE_MMX MMXLOAD 0 * SIZE(%edx), %mm0 MMXLOAD 1 * SIZE(%edx), %mm1 MMXLOAD 0 * SIZE(%ecx), %mm2 MMXLOAD 1 * SIZE(%ecx), %mm3 MMXLOAD 2 * SIZE(%edx), %mm4 MMXLOAD 3 * SIZE(%edx), %mm5 MMXLOAD 2 * SIZE(%ecx), %mm6 MMXLOAD 3 * SIZE(%ecx), %mm7 MMXSTORE %mm0, 0 * SIZE(%eax) MMXSTORE %mm1, 1 * SIZE(%eax) MMXSTORE %mm2, 2 * SIZE(%eax) MMXSTORE %mm3, 3 * SIZE(%eax) addl %esi, %eax MMXSTORE %mm4, 0 * SIZE(%eax) MMXSTORE %mm5, 1 * SIZE(%eax) MMXSTORE %mm6, 2 * SIZE(%eax) MMXSTORE %mm7, 3 * SIZE(%eax) #else FLD 1 * SIZE(%ecx) FLD 0 * SIZE(%ecx) FLD 1 * SIZE(%edx) FLD 0 * SIZE(%edx) FST 0 * SIZE(%eax) FST 1 * SIZE(%eax) FST 2 * SIZE(%eax) FST 3 * SIZE(%eax) addl %esi, %eax FLD 3 * SIZE(%ecx) FLD 2 * SIZE(%ecx) FLD 3 * SIZE(%edx) FLD 2 * SIZE(%edx) FST 0 * SIZE(%eax) FST 1 * SIZE(%eax) FST 2 * SIZE(%eax) FST 3 * SIZE(%eax) #endif addl $4 * SIZE, %ecx addl $4 * SIZE, %edx addl %esi, %eax decl %ebx jne .L36 ALIGN_4 .L32: movl N, %ebx test $2, %ebx je .L37 #ifdef HAVE_MMX MMXLOAD 0 * SIZE(%edx), %mm0 MMXLOAD 1 * SIZE(%edx), %mm1 MMXLOAD 0 * SIZE(%ecx), %mm2 MMXLOAD 1 * SIZE(%ecx), %mm3 MMXSTORE %mm0, 0 * SIZE(%eax) MMXSTORE %mm1, 1 * SIZE(%eax) MMXSTORE %mm2, 2 * SIZE(%eax) MMXSTORE %mm3, 3 * SIZE(%eax) #else FLD 1 * SIZE(%ecx) FLD 0 * SIZE(%ecx) FLD 1 * SIZE(%edx) FLD 0 * SIZE(%edx) FST 0 * SIZE(%eax) FST 1 * SIZE(%eax) FST 2 * SIZE(%eax) FST 3 * SIZE(%eax) #endif addl $2 * SIZE, %ecx addl $2 * SIZE, %edx ALIGN_4 .L37: movl N, %ebx test $1, %ebx je .L38 movl BOFFSET2, %eax #ifdef HAVE_MMX MMXLOAD 0 * SIZE(%edx), %mm0 MMXLOAD 0 * SIZE(%ecx), %mm1 MMXSTORE %mm0, 0 * SIZE(%eax) MMXSTORE %mm1, 1 * SIZE(%eax) #else FLD 0 * SIZE(%edx) FST 0 * SIZE(%eax) FLD 0 * SIZE(%ecx) FST 1 * SIZE(%eax) #endif addl $2 * SIZE, %eax movl %eax, BOFFSET2 ALIGN_4 .L38: decl J jg .L39 ALIGN_4 .L28: movl M, %eax movl N, %ebx testb $1, %al je .L40 sarl $2, %ebx jle .L41 ALIGN_4 .L45: #ifdef HAVE_MMX MMXLOAD 0 * SIZE(%ebp), %mm0 MMXLOAD 1 * SIZE(%ebp), %mm1 MMXLOAD 2 * SIZE(%ebp), %mm2 MMXLOAD 3 * SIZE(%ebp), %mm3 MMXSTORE %mm0, 0 * SIZE(%edi) MMXSTORE %mm1, 1 * SIZE(%edi) addl %esi, %edi MMXSTORE %mm2, 0 * SIZE(%edi) MMXSTORE %mm3, 1 * SIZE(%edi) #else FLD 0 * SIZE(%ebp) FST 0 * SIZE(%edi) FLD 1 * SIZE(%ebp) FST 1 * SIZE(%edi) addl %esi, %edi FLD 2 * SIZE(%ebp) FST 0 * SIZE(%edi) FLD 3 * SIZE(%ebp) FST 1 * SIZE(%edi) #endif addl %esi,%edi addl $4 * SIZE, %ebp decl %ebx jg .L45 ALIGN_4 .L41: movl N, %ebx test $2, %ebx je .L46 #ifdef HAVE_MMX MMXLOAD 0 * SIZE(%ebp), %mm0 MMXSTORE %mm0, 0 * SIZE(%edi) 
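	/* m is odd here, so this is the last, unpaired row: the N & 2 tail
	   copies its two remaining elements straight through, a[0..1] -> b[0..1]
	   (MMX path; the FLD/FST path below performs the same copy). */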
MMXLOAD 1 * SIZE(%ebp), %mm1 MMXSTORE %mm1, 1 * SIZE(%edi) #else FLD 1 * SIZE(%ebp) FLD 0 * SIZE(%ebp) FST 0 * SIZE(%edi) FST 1 * SIZE(%edi) #endif addl $2 * SIZE, %ebp ALIGN_4 .L46: movl N, %ebx test $1, %ebx je .L40 movl BOFFSET2, %eax #ifdef HAVE_MMX MMXLOAD 0 * SIZE(%ebp), %mm0 MMXSTORE %mm0, 0 * SIZE(%eax) #else FLD (%ebp) FST (%eax) #endif ALIGN_4 .L40: EMMS popl %ebx popl %esi popl %edi popl %ebp addl $ARGS,%esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_tcopy_2_sse.S000066400000000000000000000131421313527062700204340ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define RPREFETCHSIZE 12 #define WPREFETCHSIZE (RPREFETCHSIZE * 2) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht2 #define STACK 16 #define ARGS 8 #define J 0 + STACK(%esp) #define BOFFSET2 4 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define A 12 + STACK + ARGS(%esp) #define LDA 16 + STACK + ARGS(%esp) #define B 20 + STACK + ARGS(%esp) PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl A, %ebp movl B, %edi movl M, %ebx movl N, %eax andl $-2, %eax imull %ebx, %eax # m * ( n & ~1) leal (%edi,%eax,SIZE), %eax # boffset2 = b + m * (n & ~1) movl %eax, BOFFSET2 movl M, %esi #ifdef DOUBLE sall $4,%esi #else sall $3,%esi #endif sarl $1, %ebx # if !(m & 1) goto L28 movl %ebx, J jle .L28 ALIGN_4 .L39: movl %ebp, %edx # aoffset1 = a movl LDA, %eax movl N, %ebx leal (%ebp, %eax,SIZE), %ecx # aoffset2 = a + lda leal (%ecx, %eax,SIZE), %ebp # aoffset += 2 * lda movl %edi, %eax # boffset1 = b_offset addl $4 * SIZE, %edi # boffset += 4 sarl $2, %ebx jle .L32 ALIGN_4 .L36: PREFETCH RPREFETCHSIZE * SIZE(%edx) movsd 0 * SIZE(%edx), %xmm0 movhps 1 * SIZE(%edx), %xmm0 movsd 0 * SIZE(%ecx), %xmm2 movhps 1 * SIZE(%ecx), %xmm2 PREFETCH RPREFETCHSIZE * SIZE(%ecx) movsd 2 * SIZE(%edx), %xmm4 movhps 3 * SIZE(%edx), %xmm4 movsd 2 * SIZE(%ecx), %xmm6 movhps 3 * SIZE(%ecx), %xmm6 movaps %xmm0, 0 * SIZE(%eax) movaps %xmm2, 2 * SIZE(%eax) addl %esi, %eax movaps %xmm4, 0 * SIZE(%eax) movaps %xmm6, 2 * SIZE(%eax) addl $4 * SIZE, %ecx addl $4 * SIZE, %edx addl %esi, %eax decl %ebx jne .L36 ALIGN_4 .L32: movl N, %ebx test $2, %ebx je .L37 PREFETCH RPREFETCHSIZE * SIZE(%edx) movsd 0 * SIZE(%edx), %xmm0 movhps 1 * SIZE(%edx), %xmm0 PREFETCH RPREFETCHSIZE * SIZE(%ecx) movsd 0 * SIZE(%ecx), %xmm2 movhps 1 * SIZE(%ecx), %xmm2 movaps %xmm0, 0 * SIZE(%eax) movaps %xmm2, 2 * SIZE(%eax) addl $2 * SIZE, %ecx addl $2 * SIZE, %edx ALIGN_4 .L37: movl N, %ebx test $1, %ebx je .L38 movl BOFFSET2, %eax movsd 0 * SIZE(%edx), %xmm0 movhps 0 * SIZE(%ecx), %xmm0 movaps %xmm0, 0 * SIZE(%eax) addl $2 * SIZE, %eax movl %eax, BOFFSET2 ALIGN_4 .L38: decl J jg .L39 ALIGN_4 .L28: movl M, %eax movl N, %ebx testb $1, %al je .L40 sarl $2, %ebx jle .L41 ALIGN_4 .L45: movsd 0 * SIZE(%ebp), %xmm0 movhps 1 * SIZE(%ebp), %xmm0 movsd 2 * SIZE(%ebp), %xmm2 movhps 3 * SIZE(%ebp), %xmm2 movaps %xmm0, 0 * SIZE(%edi) addl %esi, %edi movaps %xmm2, 0 * SIZE(%edi) addl %esi,%edi addl $4 * SIZE, %ebp decl %ebx jg .L45 ALIGN_4 .L41: movl N, %ebx test $2, %ebx je .L46 movsd 0 * SIZE(%ebp), %xmm0 movhps 1 * SIZE(%ebp), %xmm0 movaps %xmm0, 0 * SIZE(%edi) addl $2 * SIZE, %ebp ALIGN_4 .L46: movl N, %ebx test $1, %ebx je .L40 movl BOFFSET2, %eax movsd 0 * SIZE(%ebp), %xmm0 movsd %xmm0, 0 * SIZE(%eax) ALIGN_4 .L40: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS,%esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemm_tcopy_4_sse.S000066400000000000000000000155711313527062700204460ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define RPREFETCHSIZE 8 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht2 #define STACK 16 #define ARGS 0 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define ARG_A 12 + STACK + ARGS(%esp) #define ARG_LDA 16 + STACK + ARGS(%esp) #define ARG_B 20 + STACK + ARGS(%esp) #define A %eax #define B %ebx #define LDA %ebp #define A1 %ecx #define A2 %edx #define I %esi #define J %edi PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_A, A movl ARG_B, B movl ARG_LDA, LDA sall $BASE_SHIFT, LDA movl N, J sarl $2, J je .L20 ALIGN_3 .L10: movl A, A1 leal (A, LDA, 2), A2 addl $4 * SIZE, A movl M, I sarl $2, I je .L15 ALIGN_3 .L12: PREFETCH RPREFETCHSIZE * SIZE(A1) movsd 0 * SIZE(A1) , %xmm0 movhps 1 * SIZE(A1) , %xmm0 movsd 2 * SIZE(A1) , %xmm1 movhps 3 * SIZE(A1) , %xmm1 PREFETCH RPREFETCHSIZE * SIZE(A1, LDA) movsd 0 * SIZE(A1, LDA), %xmm2 movhps 1 * SIZE(A1, LDA), %xmm2 movsd 2 * SIZE(A1, LDA), %xmm3 movhps 3 * SIZE(A1, LDA), %xmm3 PREFETCH RPREFETCHSIZE * SIZE(A2) movsd 0 * SIZE(A2) , %xmm4 movhps 1 * SIZE(A2) , %xmm4 movsd 2 * SIZE(A2) , %xmm5 movhps 3 * SIZE(A2) , %xmm5 PREFETCH RPREFETCHSIZE * SIZE(A2, LDA) movsd 0 * SIZE(A2, LDA), %xmm6 movhps 1 * SIZE(A2, LDA), %xmm6 movsd 2 * SIZE(A2, LDA), %xmm7 movhps 3 * SIZE(A2, LDA), %xmm7 PREFETCHW (RPREFETCHSIZE + 0) * SIZE(B) movaps %xmm0, 0 * SIZE(B) movaps %xmm1, 2 * SIZE(B) movaps %xmm2, 4 * SIZE(B) movaps %xmm3, 6 * SIZE(B) PREFETCHW (RPREFETCHSIZE + 8) * SIZE(B) movaps %xmm4, 8 * SIZE(B) movaps %xmm5, 10 * SIZE(B) movaps %xmm6, 12 * SIZE(B) movaps %xmm7, 14 * SIZE(B) leal (A1, LDA, 4), A1 leal (A2, LDA, 4), A2 subl $-16 * SIZE, B decl I jne .L12 ALIGN_3 .L15: testl $2, M jle .L16 movsd 0 * SIZE(A1) , %xmm0 movhps 1 * SIZE(A1) , %xmm0 movsd 2 * SIZE(A1) , %xmm1 movhps 3 * SIZE(A1) , %xmm1 movsd 0 * SIZE(A1, LDA), %xmm2 movhps 1 * SIZE(A1, LDA), %xmm2 movsd 2 * SIZE(A1, LDA), %xmm3 movhps 3 * SIZE(A1, LDA), %xmm3 movaps %xmm0, 0 * SIZE(B) movaps %xmm1, 2 * SIZE(B) movaps %xmm2, 4 * SIZE(B) movaps %xmm3, 6 * SIZE(B) leal (A1, LDA, 2), A1 subl $-8 
* SIZE, B ALIGN_4 .L16: testl $1, M jle .L19 movsd 0 * SIZE(A1) , %xmm0 movhps 1 * SIZE(A1) , %xmm0 movsd 2 * SIZE(A1) , %xmm1 movhps 3 * SIZE(A1) , %xmm1 movaps %xmm0, 0 * SIZE(B) movaps %xmm1, 2 * SIZE(B) subl $-4 * SIZE, B ALIGN_4 .L19: decl J jne .L10 ALIGN_3 .L20: testl $2, N jle .L30 movl A, A1 leal (A, LDA, 2), A2 addl $2 * SIZE, A movl M, I sarl $2, I je .L25 ALIGN_3 .L22: movsd 0 * SIZE(A1) , %xmm0 movhps 1 * SIZE(A1) , %xmm0 movsd 0 * SIZE(A1, LDA), %xmm1 movhps 1 * SIZE(A1, LDA), %xmm1 movsd 0 * SIZE(A2) , %xmm2 movhps 1 * SIZE(A2) , %xmm2 movsd 0 * SIZE(A2, LDA), %xmm3 movhps 1 * SIZE(A2, LDA), %xmm3 movaps %xmm0, 0 * SIZE(B) movaps %xmm1, 2 * SIZE(B) movaps %xmm2, 4 * SIZE(B) movaps %xmm3, 6 * SIZE(B) leal (A1, LDA, 4), A1 leal (A2, LDA, 4), A2 subl $-8 * SIZE, B decl I jne .L22 ALIGN_3 .L25: testl $2, M jle .L26 movsd 0 * SIZE(A1) , %xmm0 movhps 1 * SIZE(A1) , %xmm0 movsd 0 * SIZE(A1, LDA), %xmm1 movhps 1 * SIZE(A1, LDA), %xmm1 movaps %xmm0, 0 * SIZE(B) movaps %xmm1, 2 * SIZE(B) leal (A1, LDA, 2), A1 subl $-4 * SIZE, B ALIGN_4 .L26: testl $1, M jle .L30 movsd 0 * SIZE(A1) , %xmm0 movhps 1 * SIZE(A1) , %xmm0 movaps %xmm0, 0 * SIZE(B) subl $-2 * SIZE, B ALIGN_4 .L30: testl $1, N jle .L999 movl A, A1 leal (A, LDA, 2), A2 movl M, I sarl $2, I je .L35 ALIGN_3 .L32: movsd 0 * SIZE(A1) , %xmm0 movhps 0 * SIZE(A1, LDA), %xmm0 movsd 0 * SIZE(A2) , %xmm1 movhps 0 * SIZE(A2, LDA), %xmm1 movaps %xmm0, 0 * SIZE(B) movaps %xmm1, 2 * SIZE(B) leal (A1, LDA, 4), A1 leal (A2, LDA, 4), A2 subl $-4 * SIZE, B decl I jne .L32 ALIGN_3 .L35: testl $2, M jle .L36 movsd 0 * SIZE(A1) , %xmm0 movhps 0 * SIZE(A1, LDA), %xmm0 movaps %xmm0, 0 * SIZE(B) leal (A1, LDA, 2), A1 subl $-2 * SIZE, B ALIGN_4 .L36: testl $1, M jle .L999 movsd 0 * SIZE(A1) , %xmm0 movsd %xmm0, 0 * SIZE(B) ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemv_n.S000066400000000000000000000236161313527062700164600ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef PENTIUM #define P 32 #endif #if defined(ATHLON) || defined(OPTERON) || defined(OPTERON) #define P 32 #endif #ifndef P #define P DTB_DEFAULT_ENTRIES #endif #define STACK 16 #define ARGS 16 #define PLDA_M 0 + STACK(%esp) #define XP 4 + STACK(%esp) #define MIN_N 8 + STACK(%esp) #define IS 12 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #ifdef DOUBLE #define A 24 + STACK + ARGS(%esp) #define LDA 28 + STACK + ARGS(%esp) #define X 32 + STACK + ARGS(%esp) #define INCX 36 + STACK + ARGS(%esp) #define Y 40 + STACK + ARGS(%esp) #define INCY 44 + STACK + ARGS(%esp) #define BUFFER 48 + STACK + ARGS(%esp) #else #define A 20 + STACK + ARGS(%esp) #define LDA 24 + STACK + ARGS(%esp) #define X 28 + STACK + ARGS(%esp) #define INCX 32 + STACK + ARGS(%esp) #define Y 36 + STACK + ARGS(%esp) #define INCY 40 + STACK + ARGS(%esp) #define BUFFER 44 + STACK + ARGS(%esp) #endif PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE FLD ALPHA movl X, %edi movl LDA, %ebx leal 0(,%ebx,SIZE),%ebx # EBX : lda movl $0, IS movl M, %edx movl N, %esi test %esi, %esi jle .L79 # goto END test %edx, %edx jle .L79 # goto END movl INCY, %eax leal (,%eax,SIZE),%eax movl %eax, INCY movl LDA, %eax imull $P, %eax # P * lda subl M ,%eax # P * lda - m leal (, %eax, SIZE), %eax movl %eax, PLDA_M ALIGN_2 .L32: movl IS, %esi movl $P, %edx movl N, %eax subl %esi,%eax # n - is cmpl %edx, %eax #ifdef PENTIUM jle .L33 movl %edx, %eax .L33: #else cmovg %edx, %eax #endif movl %eax, MIN_N movl INCX, %edx leal (%edi, %esi, SIZE), %esi # xp = x + is movl %esi, XP cmpl $1, %edx je .L34 # if incx == 1 goto L34 movl BUFFER, %esi leal (, %edx, SIZE), %edx movl %esi, XP # xp = buffer sarl $2,%eax jle .L35 ALIGN_2 .L36: FLD (%edi) addl %edx,%edi # x += incx FLD (%edi) addl %edx,%edi # x += incx FLD (%edi) addl %edx,%edi # x += incx FLD (%edi) addl %edx,%edi # x += incx FST 3 * SIZE(%esi) FST 2 * SIZE(%esi) FST 1 * SIZE(%esi) FST 0 * SIZE(%esi) addl $4 * SIZE, %esi # xp += 4 decl %eax jg .L36 ALIGN_3 .L35: movl MIN_N, %eax andl $3, %eax jle .L34 ALIGN_2 .L42: FLD (%edi) addl %edx, %edi FST (%esi) addl $SIZE, %esi decl %eax jg .L42 ALIGN_3 /* Main Routine */ .L34: movl Y, %ecx # c_offset movl M, %ebp sarl $2, %ebp # j = (m >> 2) jle .L47 ALIGN_2 .L48: movl A, %edx # a_offset = a fldz addl $4 * SIZE, A # a += 4 fldz movl XP, %esi # b_offset = xp fldz movl MIN_N, %eax # i = min_n fldz FLD (%esi) # bt1 = b_offset sarl $1, %eax jle .L51 ALIGN_2 #ifdef PENTIUM3 #define PRESIZE 8 #else #define PRESIZE 24 #endif .L80: #ifdef PENTIUM3 prefetcht1 PRESIZE * SIZE(%edx, %ebx, 1) FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 prefetcht1 PRESIZE * SIZE(%esi) faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(3) # ct2 += at1 FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) fmulp %st, %st(1) faddp %st, %st(4) # ct4 += at1 FLD 1 * SIZE(%esi) # bt1 = b_offset prefetcht1 PRESIZE * 
SIZE(%edx, %ebx, 2) addl %ebx, %edx # a_offset += lda FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(3) # ct2 += at1 FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) fmulp %st, %st(1) addl %ebx, %edx faddp %st, %st(4) # ct4 += at1 FLD 2 * SIZE(%esi) # bt1 = b_offset addl $2 * SIZE, %esi # b_offset += 2 #else #ifdef PENTIUM4 prefetchnta 8 * SIZE(%esi) #endif FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(3) # ct2 += at1 FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FMUL 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) faddp %st, %st(4) # ct4 += at1 FLD 1 * SIZE(%esi) # bt1 = b_offset addl %ebx, %edx # a_offset += lda FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(3) # ct2 += at1 FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FMUL 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) faddp %st, %st(4) # ct4 += at1 FLD 2 * SIZE(%esi) # bt1 = b_offset addl %ebx, %edx addl $2 * SIZE, %esi # b_offset += 2 #endif decl %eax jg .L80 .L51: movl MIN_N,%eax andl $1, %eax je .L57 FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(3) # ct2 += at1 FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FMUL 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) faddp %st, %st(4) # ct4 += at1 fldz ALIGN_2 .L57: #ifndef C_SUN ffreep %st(0) #else .byte 0xdf .byte 0xc0 #endif fxch %st(4) fmul %st, %st(4) fmul %st, %st(1) fmul %st, %st(2) fmul %st, %st(3) fxch %st(4) movl INCY, %eax FADD (%ecx) FST (%ecx) addl %eax, %ecx FADD (%ecx) FST (%ecx) addl %eax, %ecx FADD (%ecx) FST (%ecx) addl %eax, %ecx FADD (%ecx) FST (%ecx) addl %eax, %ecx decl %ebp # j -- jg .L48 ALIGN_3 .L47: movl M, %ebp andl $3, %ebp # j = (m & 3) jle .L60 ALIGN_2 .L61: movl A, %edx # a_offset = a fldz addl $SIZE, A # a++ fldz movl XP,%esi fldz movl MIN_N,%eax fldz sarl $3,%eax jle .L64 ALIGN_2 .L65: FLD 0 * SIZE(%esi) FMUL (%edx) faddp %st, %st(1) addl %ebx, %edx FLD 1 * SIZE(%esi) FMUL (%edx) faddp %st, %st(2) addl %ebx ,%edx FLD 2 * SIZE(%esi) FMUL (%edx) faddp %st, %st(3) addl %ebx, %edx FLD 3 * SIZE(%esi) FMUL (%edx) faddp %st, %st(4) addl %ebx, %edx FLD 4 * SIZE(%esi) FMUL (%edx) faddp %st,%st(1) addl %ebx, %edx FLD 5 * SIZE(%esi) FMUL (%edx) faddp %st, %st(2) addl %ebx, %edx FLD 6 * SIZE(%esi) FMUL (%edx) faddp %st,%st(3) addl %ebx, %edx FLD 7 * SIZE(%esi) FMUL (%edx) faddp %st,%st(4) addl %ebx, %edx addl $8 * SIZE, %esi decl %eax jg .L65 .L64: movl MIN_N,%eax andl $7, %eax jle .L70 ALIGN_2 .L71: FLD (%esi) addl $SIZE, %esi # b_offset ++ FMUL (%edx) addl %ebx, %edx # a_offset += lda faddp %st, %st(1) decl %eax jg .L71 ALIGN_2 .L70: faddp %st, %st(1) faddp %st, %st(1) faddp %st, %st(1) fmul %st(1), %st movl INCY, %eax FADD (%ecx) FST (%ecx) addl %eax, %ecx decl %ebp jg .L61 .L60: movl PLDA_M, %esi addl %esi, A # a += P * lda - m addl $P, IS movl N, 
%esi cmpl %esi,IS jl .L32 .L79: #ifndef C_SUN ffreep %st(0) #else .byte 0xdf .byte 0xc0 #endif popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemv_n_atom.S000066400000000000000000000361351313527062700175000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef ATOM #define PREFETCH prefetchnta #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 6) #endif #define STACKSIZE 16 #define M 4 + STACKSIZE(%esp) #define N 8 + STACKSIZE(%esp) #define ALPHA 16 + STACKSIZE(%esp) #define A 24 + STACKSIZE(%esp) #define STACK_LDA 28 + STACKSIZE(%esp) #define STACK_X 32 + STACKSIZE(%esp) #define STACK_INCX 36 + STACKSIZE(%esp) #define Y 40 + STACKSIZE(%esp) #define STACK_INCY 44 + STACKSIZE(%esp) #define BUFFER 48 + STACKSIZE(%esp) #define I %eax #define J %ebx #define INCX %ecx #define INCY J #define A1 %esi #define X %edx #define Y1 %edi #define LDA %ebp PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_LDA, LDA movl STACK_X, X movl STACK_INCX, INCX leal (,INCX, SIZE), INCX leal (,LDA, SIZE), LDA subl $-16 * SIZE, A cmpl $0, N jle .L999 cmpl $0, M jle .L999 movl BUFFER, Y1 pxor %xmm7, %xmm7 movl M, %eax addl $16, %eax sarl $4, %eax ALIGN_3 .L01: movapd %xmm7, 0 * SIZE(Y1) movapd %xmm7, 2 * SIZE(Y1) movapd %xmm7, 4 * SIZE(Y1) movapd %xmm7, 6 * SIZE(Y1) movapd %xmm7, 8 * SIZE(Y1) movapd %xmm7, 10 * SIZE(Y1) movapd %xmm7, 12 * SIZE(Y1) movapd %xmm7, 14 * SIZE(Y1) subl $-16 * SIZE, Y1 decl %eax jg .L01 ALIGN_3 .L10: movl N, J sarl $1, J jle .L20 ALIGN_3 .L11: movl BUFFER, Y1 addl $16 * SIZE, Y1 movl A, A1 leal (A1, LDA, 2), %eax movl %eax, A movsd (X), %xmm6 addl INCX, X movsd (X), %xmm7 addl INCX, X movsd ALPHA, %xmm0 mulsd %xmm0, %xmm6 mulsd %xmm0, %xmm7 movsd -16 * SIZE(Y1), %xmm0 movsd -15 * SIZE(Y1), %xmm1 movl M, I sarl $3, I jle .L15 movsd -16 * SIZE(A1), %xmm2 movsd -15 * SIZE(A1), %xmm3 movsd -16 * SIZE(A1, LDA), %xmm4 movsd -15 * SIZE(A1, LDA), %xmm5 mulsd %xmm6, %xmm2 mulsd %xmm6, %xmm3 decl I jle .L14 ALIGN_3 .L13: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) #endif mulsd %xmm7, %xmm4 addsd %xmm2, %xmm0 movsd -14 * SIZE(A1), %xmm2 mulsd %xmm7, %xmm5 addsd %xmm3, %xmm1 movsd -13 * SIZE(A1), %xmm3 addsd %xmm4, %xmm0 movsd -14 * SIZE(A1, LDA), %xmm4 mulsd %xmm6, %xmm2 addsd %xmm5, %xmm1 movsd -13 * SIZE(A1, LDA), %xmm5 mulsd %xmm6, %xmm3 movlpd %xmm0, -16 * SIZE(Y1) movsd -14 * SIZE(Y1), %xmm0 movlpd %xmm1, -15 * SIZE(Y1) movsd -13 * SIZE(Y1), %xmm1 mulsd %xmm7, %xmm4 addsd %xmm2, %xmm0 movsd -12 * SIZE(A1), %xmm2 mulsd %xmm7, %xmm5 addsd %xmm3, %xmm1 movsd -11 * SIZE(A1), %xmm3 addsd %xmm4, %xmm0 movsd -12 * SIZE(A1, LDA), %xmm4 mulsd %xmm6, %xmm2 addsd %xmm5, %xmm1 movsd -11 * SIZE(A1, LDA), %xmm5 mulsd %xmm6, %xmm3 movlpd %xmm0, -14 * SIZE(Y1) movsd -12 * SIZE(Y1), %xmm0 movlpd %xmm1, -13 * SIZE(Y1) movsd -11 * SIZE(Y1), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) * SIZE(A1, LDA) #endif mulsd %xmm7, %xmm4 addsd %xmm2, %xmm0 movsd -10 * SIZE(A1), %xmm2 mulsd %xmm7, %xmm5 addsd %xmm3, %xmm1 movsd -9 * SIZE(A1), %xmm3 addsd %xmm4, %xmm0 movsd -10 * SIZE(A1, LDA), %xmm4 mulsd %xmm6, %xmm2 addsd %xmm5, %xmm1 movsd -9 * SIZE(A1, LDA), %xmm5 mulsd %xmm6, %xmm3 movlpd %xmm0, -12 * SIZE(Y1) movsd -10 * SIZE(Y1), %xmm0 movlpd %xmm1, -11 * SIZE(Y1) movsd -9 * SIZE(Y1), %xmm1 mulsd %xmm7, %xmm4 addsd %xmm2, %xmm0 movsd -8 * SIZE(A1), %xmm2 mulsd %xmm7, %xmm5 addsd %xmm3, %xmm1 movsd -7 * SIZE(A1), %xmm3 addsd %xmm4, %xmm0 movsd -8 * SIZE(A1, LDA), %xmm4 mulsd %xmm6, %xmm2 addsd %xmm5, %xmm1 movsd -7 * SIZE(A1, LDA), %xmm5 mulsd %xmm6, %xmm3 movlpd %xmm0, -10 * SIZE(Y1) movsd -8 * SIZE(Y1), %xmm0 movlpd %xmm1, -9 * SIZE(Y1) movsd -7 * SIZE(Y1), %xmm1 subl $-8 * SIZE, A1 subl $-8 * SIZE, Y1 subl $1, I 
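	/* Each trip through .L13 finishes eight rows of the two active columns;
	   in rough C terms (results go to the bounce buffer, not to y directly):
	       for (i = 0; i < 8; i++)
	           ybuf[i] += ax0 * a0[i] + ax1 * a1[i];
	   with ax0 = alpha*x[j] in xmm6, ax1 = alpha*x[j+1] in xmm7, and
	   a0/a1 the current column pair (A1 and A1 + LDA). */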
BRANCH jg .L13 ALIGN_3 .L14: mulsd %xmm7, %xmm4 addsd %xmm2, %xmm0 movsd -14 * SIZE(A1), %xmm2 mulsd %xmm7, %xmm5 addsd %xmm3, %xmm1 movsd -13 * SIZE(A1), %xmm3 addsd %xmm4, %xmm0 movsd -14 * SIZE(A1, LDA), %xmm4 mulsd %xmm6, %xmm2 addsd %xmm5, %xmm1 movsd -13 * SIZE(A1, LDA), %xmm5 mulsd %xmm6, %xmm3 movlpd %xmm0, -16 * SIZE(Y1) movsd -14 * SIZE(Y1), %xmm0 movlpd %xmm1, -15 * SIZE(Y1) movsd -13 * SIZE(Y1), %xmm1 mulsd %xmm7, %xmm4 addsd %xmm2, %xmm0 movsd -12 * SIZE(A1), %xmm2 mulsd %xmm7, %xmm5 addsd %xmm3, %xmm1 movsd -11 * SIZE(A1), %xmm3 addsd %xmm4, %xmm0 movsd -12 * SIZE(A1, LDA), %xmm4 mulsd %xmm6, %xmm2 addsd %xmm5, %xmm1 movsd -11 * SIZE(A1, LDA), %xmm5 mulsd %xmm6, %xmm3 movlpd %xmm0, -14 * SIZE(Y1) movsd -12 * SIZE(Y1), %xmm0 movlpd %xmm1, -13 * SIZE(Y1) movsd -11 * SIZE(Y1), %xmm1 mulsd %xmm7, %xmm4 addsd %xmm2, %xmm0 movsd -10 * SIZE(A1), %xmm2 mulsd %xmm7, %xmm5 addsd %xmm3, %xmm1 movsd -9 * SIZE(A1), %xmm3 addsd %xmm4, %xmm0 movsd -10 * SIZE(A1, LDA), %xmm4 mulsd %xmm6, %xmm2 addsd %xmm5, %xmm1 movsd -9 * SIZE(A1, LDA), %xmm5 mulsd %xmm6, %xmm3 movlpd %xmm0, -12 * SIZE(Y1) movsd -10 * SIZE(Y1), %xmm0 movlpd %xmm1, -11 * SIZE(Y1) movsd -9 * SIZE(Y1), %xmm1 mulsd %xmm7, %xmm4 addsd %xmm2, %xmm0 mulsd %xmm7, %xmm5 addsd %xmm3, %xmm1 addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 movlpd %xmm0, -10 * SIZE(Y1) movsd -8 * SIZE(Y1), %xmm0 movlpd %xmm1, -9 * SIZE(Y1) movsd -7 * SIZE(Y1), %xmm1 subl $-8 * SIZE, A1 subl $-8 * SIZE, Y1 ALIGN_3 .L15: testl $4, M je .L16 movsd -16 * SIZE(A1), %xmm2 movsd -15 * SIZE(A1), %xmm3 movsd -16 * SIZE(A1, LDA), %xmm4 movsd -15 * SIZE(A1, LDA), %xmm5 mulsd %xmm6, %xmm2 mulsd %xmm6, %xmm3 mulsd %xmm7, %xmm4 addsd %xmm2, %xmm0 movsd -14 * SIZE(A1), %xmm2 mulsd %xmm7, %xmm5 addsd %xmm3, %xmm1 movsd -13 * SIZE(A1), %xmm3 addsd %xmm4, %xmm0 movsd -14 * SIZE(A1, LDA), %xmm4 mulsd %xmm6, %xmm2 addsd %xmm5, %xmm1 movsd -13 * SIZE(A1, LDA), %xmm5 mulsd %xmm6, %xmm3 movlpd %xmm0, -16 * SIZE(Y1) movsd -14 * SIZE(Y1), %xmm0 movlpd %xmm1, -15 * SIZE(Y1) movsd -13 * SIZE(Y1), %xmm1 mulsd %xmm7, %xmm4 addsd %xmm2, %xmm0 mulsd %xmm7, %xmm5 addsd %xmm3, %xmm1 addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 movlpd %xmm0, -14 * SIZE(Y1) movsd -12 * SIZE(Y1), %xmm0 movlpd %xmm1, -13 * SIZE(Y1) movsd -11 * SIZE(Y1), %xmm1 addl $4 * SIZE, A1 addl $4 * SIZE, Y1 ALIGN_3 .L16: testl $2, M je .L17 movsd -16 * SIZE(A1), %xmm2 movsd -15 * SIZE(A1), %xmm3 movsd -16 * SIZE(A1, LDA), %xmm4 movsd -15 * SIZE(A1, LDA), %xmm5 mulsd %xmm6, %xmm2 mulsd %xmm6, %xmm3 mulsd %xmm7, %xmm4 addsd %xmm2, %xmm0 mulsd %xmm7, %xmm5 addsd %xmm3, %xmm1 addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 movlpd %xmm0, -16 * SIZE(Y1) movsd -14 * SIZE(Y1), %xmm0 movlpd %xmm1, -15 * SIZE(Y1) addl $2 * SIZE, A1 addl $2 * SIZE, Y1 ALIGN_3 .L17: testl $1, M je .L19 movsd -16 * SIZE(A1), %xmm2 movsd -16 * SIZE(A1, LDA), %xmm3 movsd -16 * SIZE(Y1), %xmm0 mulsd %xmm6, %xmm2 addsd %xmm2, %xmm0 mulsd %xmm7, %xmm3 addsd %xmm3, %xmm0 movsd %xmm0, -16 * SIZE(Y1) ALIGN_3 .L19: decl J jg .L11 ALIGN_4 .L20: testl $1, N jle .L990 movl BUFFER, Y1 addl $16 * SIZE, Y1 movl A, A1 leal (A1, LDA, 2), %eax movl %eax, A movsd (X), %xmm6 addl INCX, X movsd (X), %xmm7 addl INCX, X movsd ALPHA, %xmm0 mulsd %xmm0, %xmm6 mulsd %xmm0, %xmm7 movsd -16 * SIZE(Y1), %xmm0 movsd -15 * SIZE(Y1), %xmm1 movsd -14 * SIZE(Y1), %xmm4 movsd -13 * SIZE(Y1), %xmm5 movl M, I sarl $3, I jle .L25 movsd -16 * SIZE(A1), %xmm2 movsd -15 * SIZE(A1), %xmm3 mulsd %xmm6, %xmm2 mulsd %xmm6, %xmm3 decl I jle .L24 ALIGN_3 .L23: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) 
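	/* n is odd: this .L23 loop reuses the same eight-row unrolling for the
	   single leftover column, ybuf[i] += ax0 * a0[i] with ax0 = alpha*x[j]
	   in xmm6 (the alpha*x value loaded into xmm7 above goes unused in
	   this branch). */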
#endif addsd %xmm2, %xmm0 movsd -14 * SIZE(A1), %xmm2 addsd %xmm3, %xmm1 movsd -13 * SIZE(A1), %xmm3 mulsd %xmm6, %xmm2 movlpd %xmm0, -16 * SIZE(Y1) movsd -12 * SIZE(Y1), %xmm0 mulsd %xmm6, %xmm3 movlpd %xmm1, -15 * SIZE(Y1) movsd -11 * SIZE(Y1), %xmm1 addsd %xmm2, %xmm4 movsd -12 * SIZE(A1), %xmm2 addsd %xmm3, %xmm5 movsd -11 * SIZE(A1), %xmm3 mulsd %xmm6, %xmm2 movlpd %xmm4, -14 * SIZE(Y1) movsd -10 * SIZE(Y1), %xmm4 mulsd %xmm6, %xmm3 movlpd %xmm5, -13 * SIZE(Y1) movsd -9 * SIZE(Y1), %xmm5 addsd %xmm2, %xmm0 movsd -10 * SIZE(A1), %xmm2 addsd %xmm3, %xmm1 movsd -9 * SIZE(A1), %xmm3 mulsd %xmm6, %xmm2 movlpd %xmm0, -12 * SIZE(Y1) movsd -8 * SIZE(Y1), %xmm0 mulsd %xmm6, %xmm3 movlpd %xmm1, -11 * SIZE(Y1) movsd -7 * SIZE(Y1), %xmm1 addsd %xmm2, %xmm4 movsd -8 * SIZE(A1), %xmm2 addsd %xmm3, %xmm5 movsd -7 * SIZE(A1), %xmm3 mulsd %xmm6, %xmm2 movlpd %xmm4, -10 * SIZE(Y1) movsd -6 * SIZE(Y1), %xmm4 mulsd %xmm6, %xmm3 movlpd %xmm5, -9 * SIZE(Y1) movsd -5 * SIZE(Y1), %xmm5 subl $-8 * SIZE, A1 subl $-8 * SIZE, Y1 subl $1, I BRANCH jg .L23 ALIGN_3 .L24: addsd %xmm2, %xmm0 movsd -14 * SIZE(A1), %xmm2 addsd %xmm3, %xmm1 movsd -13 * SIZE(A1), %xmm3 mulsd %xmm6, %xmm2 movlpd %xmm0, -16 * SIZE(Y1) movsd -12 * SIZE(Y1), %xmm0 mulsd %xmm6, %xmm3 movlpd %xmm1, -15 * SIZE(Y1) movsd -11 * SIZE(Y1), %xmm1 addsd %xmm2, %xmm4 movsd -12 * SIZE(A1), %xmm2 addsd %xmm3, %xmm5 movsd -11 * SIZE(A1), %xmm3 mulsd %xmm6, %xmm2 movlpd %xmm4, -14 * SIZE(Y1) movsd -10 * SIZE(Y1), %xmm4 mulsd %xmm6, %xmm3 movlpd %xmm5, -13 * SIZE(Y1) movsd -9 * SIZE(Y1), %xmm5 addsd %xmm2, %xmm0 movsd -10 * SIZE(A1), %xmm2 addsd %xmm3, %xmm1 movsd -9 * SIZE(A1), %xmm3 mulsd %xmm6, %xmm2 movlpd %xmm0, -12 * SIZE(Y1) mulsd %xmm6, %xmm3 movlpd %xmm1, -11 * SIZE(Y1) addsd %xmm2, %xmm4 movsd -8 * SIZE(Y1), %xmm0 addsd %xmm3, %xmm5 movsd -7 * SIZE(Y1), %xmm1 movlpd %xmm4, -10 * SIZE(Y1) movsd -6 * SIZE(Y1), %xmm4 movlpd %xmm5, -9 * SIZE(Y1) movsd -5 * SIZE(Y1), %xmm5 subl $-8 * SIZE, A1 subl $-8 * SIZE, Y1 ALIGN_3 .L25: testl $4, M je .L26 movsd -16 * SIZE(A1), %xmm2 movsd -15 * SIZE(A1), %xmm3 mulsd %xmm6, %xmm2 mulsd %xmm6, %xmm3 addsd %xmm2, %xmm0 movsd -14 * SIZE(A1), %xmm2 addsd %xmm3, %xmm1 movsd -13 * SIZE(A1), %xmm3 mulsd %xmm6, %xmm2 movlpd %xmm0, -16 * SIZE(Y1) movsd -12 * SIZE(Y1), %xmm0 mulsd %xmm6, %xmm3 movlpd %xmm1, -15 * SIZE(Y1) movsd -11 * SIZE(Y1), %xmm1 addsd %xmm2, %xmm4 addsd %xmm3, %xmm5 movlpd %xmm4, -14 * SIZE(Y1) movlpd %xmm5, -13 * SIZE(Y1) addl $4 * SIZE, A1 addl $4 * SIZE, Y1 ALIGN_3 .L26: testl $2, M je .L27 movsd -16 * SIZE(A1), %xmm2 movsd -15 * SIZE(A1), %xmm3 mulsd %xmm6, %xmm2 mulsd %xmm6, %xmm3 addsd %xmm2, %xmm0 addsd %xmm3, %xmm1 movlpd %xmm0, -16 * SIZE(Y1) movsd -14 * SIZE(Y1), %xmm0 movlpd %xmm1, -15 * SIZE(Y1) addl $2 * SIZE, A1 addl $2 * SIZE, Y1 ALIGN_3 .L27: testl $1, M je .L990 movsd -16 * SIZE(A1), %xmm2 movsd -16 * SIZE(Y1), %xmm0 mulsd %xmm6, %xmm2 addsd %xmm2, %xmm0 movsd %xmm0, -16 * SIZE(Y1) ALIGN_3 .L990: movl Y, Y1 movl BUFFER, X movl Y1, A1 movl STACK_INCY, INCY sall $BASE_SHIFT, INCY movl M, %eax sarl $3, %eax jle .L994 ALIGN_3 .L992: movsd (Y1), %xmm0 addl INCY, Y1 movsd (Y1), %xmm1 addl INCY, Y1 movsd (Y1), %xmm2 addl INCY, Y1 movsd (Y1), %xmm3 addl INCY, Y1 movsd (Y1), %xmm4 addl INCY, Y1 movsd (Y1), %xmm5 addl INCY, Y1 movsd (Y1), %xmm6 addl INCY, Y1 movsd (Y1), %xmm7 addl INCY, Y1 addsd 0 * SIZE(X), %xmm0 addsd 1 * SIZE(X), %xmm1 addsd 2 * SIZE(X), %xmm2 addsd 3 * SIZE(X), %xmm3 addsd 4 * SIZE(X), %xmm4 addsd 5 * SIZE(X), %xmm5 addsd 6 * SIZE(X), %xmm6 addsd 7 * SIZE(X), %xmm7 movlpd 
%xmm0, (A1) addl INCY, A1 movlpd %xmm1, (A1) addl INCY, A1 movlpd %xmm2, (A1) addl INCY, A1 movlpd %xmm3, (A1) addl INCY, A1 movlpd %xmm4, (A1) addl INCY, A1 movlpd %xmm5, (A1) addl INCY, A1 movlpd %xmm6, (A1) addl INCY, A1 movlpd %xmm7, (A1) addl INCY, A1 addl $8 * SIZE, X decl %eax jg .L992 ALIGN_3 .L994: testl $7, M jle .L999 testl $4, M jle .L995 movsd (Y1), %xmm0 addl INCY, Y1 movsd (Y1), %xmm1 addl INCY, Y1 movsd (Y1), %xmm2 addl INCY, Y1 movsd (Y1), %xmm3 addl INCY, Y1 addsd 0 * SIZE(X), %xmm0 addsd 1 * SIZE(X), %xmm1 addsd 2 * SIZE(X), %xmm2 addsd 3 * SIZE(X), %xmm3 movlpd %xmm0, (A1) addl INCY, A1 movlpd %xmm1, (A1) addl INCY, A1 movlpd %xmm2, (A1) addl INCY, A1 movlpd %xmm3, (A1) addl INCY, A1 addl $4 * SIZE, X ALIGN_3 .L995: testl $2, M jle .L996 movsd (Y1), %xmm0 addl INCY, Y1 movsd (Y1), %xmm1 addl INCY, Y1 addsd 0 * SIZE(X), %xmm0 addsd 1 * SIZE(X), %xmm1 movlpd %xmm0, (A1) addl INCY, A1 movlpd %xmm1, (A1) addl INCY, A1 addl $2 * SIZE, X ALIGN_3 .L996: testl $1, M jle .L999 movsd (Y1), %xmm0 addsd 0 * SIZE(X), %xmm0 movlpd %xmm0, (A1) ALIGN_3 .L999: popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemv_n_sse.S000066400000000000000000000324351313527062700173310ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef movsd #undef movsd #endif #ifdef PENTIUM3 #ifdef HAVE_SSE #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 2) #endif #define movsd movlps #endif #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 4) #endif #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 7) #endif #ifdef OPTERON #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 3) #define movsd movlps #endif #ifdef BARCELONA #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 5) #endif #ifdef ATOM #define PREFETCH prefetchnta #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 6) #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHSIZE (16 * 4) #endif #define STACKSIZE 16 #define ARGS 16 #define M 4 + STACKSIZE+ARGS(%esp) #define N 8 + STACKSIZE+ARGS(%esp) #define ALPHA 16 + STACKSIZE+ARGS(%esp) #define A 20 + STACKSIZE+ARGS(%esp) #define STACK_LDA 24 + STACKSIZE+ARGS(%esp) #define STACK_X 28 + STACKSIZE+ARGS(%esp) #define STACK_INCX 32 + STACKSIZE+ARGS(%esp) #define Y 36 + STACKSIZE+ARGS(%esp) #define STACK_INCY 40 + STACKSIZE+ARGS(%esp) #define BUFFER 44 + STACKSIZE+ARGS(%esp) #define MMM 0+ARGS(%esp) #define YY 4+ARGS(%esp) #define AA 8+ARGS(%esp) #define I %eax #define J %ebx #define INCX %ecx #define INCY J #define A1 %esi #define X %edx #define Y1 %edi #define LDA %ebp PROLOGUE subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl Y,J movl J,YY # backup Y movl A,J movl J,AA # backup A movl M,J movl J,MMM # backup MM .L0t: xorl J,J addl $1,J sall $21,J subl J,MMM movl J,M jge .L00t ALIGN_4 movl MMM,%eax addl J,%eax jle .L999x movl %eax,M .L00t: movl AA,%eax movl %eax,A movl YY,J movl J,Y movl STACK_LDA, LDA movl STACK_X, X movl STACK_INCX, INCX leal (,INCX, SIZE), INCX leal (,LDA, SIZE), LDA subl $-32 * SIZE, A cmpl $0, N jle .L999 cmpl $0, M jle .L999 movl BUFFER, Y1 xorps %xmm7, %xmm7 movl M, %eax addl $16, %eax sarl $4, %eax ALIGN_3 .L01: movaps %xmm7, 0 * SIZE(Y1) movaps %xmm7, 4 * SIZE(Y1) movaps %xmm7, 8 * SIZE(Y1) movaps %xmm7, 12 * SIZE(Y1) subl $-16 * SIZE, Y1 decl %eax jg .L01 ALIGN_3 .L10: movl N, J sarl $1, J jle .L20 ALIGN_3 .L11: movl BUFFER, Y1 addl $32 * SIZE, Y1 movl A, A1 leal (A1, LDA, 2), %eax movl %eax, A movss (X), %xmm6 addl INCX, X movss (X), %xmm7 addl INCX, X movss ALPHA, %xmm0 mulss %xmm0, %xmm6 mulss %xmm0, %xmm7 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 ALIGN_3 movl M, I sarl $4, I jle .L15 movsd -32 * SIZE(A1), %xmm2 movhps -30 * SIZE(A1), %xmm2 movsd -28 * SIZE(A1), %xmm3 movhps -26 * SIZE(A1), %xmm3 movaps -32 * SIZE(Y1), %xmm0 movaps -28 * SIZE(Y1), %xmm1 movsd -32 * SIZE(A1, LDA), %xmm4 movhps -30 * SIZE(A1, LDA), %xmm4 movsd -28 * SIZE(A1, LDA), %xmm5 movhps -26 * SIZE(A1, LDA), %xmm5 decl I jle .L14 ALIGN_3 .L13: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) #endif mulps %xmm6, %xmm2 addps %xmm2, %xmm0 movsd -24 * SIZE(A1), %xmm2 movhps -22 * SIZE(A1), %xmm2 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 movsd -20 * SIZE(A1), %xmm3 movhps -18 * SIZE(A1), %xmm3 mulps %xmm7, %xmm4 addps %xmm4, %xmm0 movsd -24 * SIZE(A1, LDA), %xmm4 movhps -22 * SIZE(A1, LDA), %xmm4 movaps %xmm0, -32 * SIZE(Y1) movaps -24 * SIZE(Y1), %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 
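	/* xmm6 and xmm7 hold alpha*x[j] and alpha*x[j+1] broadcast across all
	   four lanes (shufps $0 above), so every 4-float chunk of the bounce
	   buffer collects the contribution of both active columns; one trip
	   through this loop covers sixteen rows. */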
movsd -20 * SIZE(A1, LDA), %xmm5 movhps -18 * SIZE(A1, LDA), %xmm5 movaps %xmm1, -28 * SIZE(Y1) movaps -20 * SIZE(Y1), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) * SIZE(A1, LDA) #endif mulps %xmm6, %xmm2 addps %xmm2, %xmm0 movsd -16 * SIZE(A1), %xmm2 movhps -14 * SIZE(A1), %xmm2 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 movsd -12 * SIZE(A1), %xmm3 movhps -10 * SIZE(A1), %xmm3 mulps %xmm7, %xmm4 addps %xmm4, %xmm0 movsd -16 * SIZE(A1, LDA), %xmm4 movhps -14 * SIZE(A1, LDA), %xmm4 movaps %xmm0, -24 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movsd -12 * SIZE(A1, LDA), %xmm5 movhps -10 * SIZE(A1, LDA), %xmm5 movaps %xmm1, -20 * SIZE(Y1) movaps -12 * SIZE(Y1), %xmm1 subl $-16 * SIZE, A1 subl $-16 * SIZE, Y1 subl $1, I BRANCH jg .L13 ALIGN_3 .L14: mulps %xmm6, %xmm2 addps %xmm2, %xmm0 movsd -24 * SIZE(A1), %xmm2 movhps -22 * SIZE(A1), %xmm2 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 movsd -20 * SIZE(A1), %xmm3 movhps -18 * SIZE(A1), %xmm3 mulps %xmm7, %xmm4 addps %xmm4, %xmm0 movsd -24 * SIZE(A1, LDA), %xmm4 movhps -22 * SIZE(A1, LDA), %xmm4 movaps %xmm0, -32 * SIZE(Y1) movaps -24 * SIZE(Y1), %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movsd -20 * SIZE(A1, LDA), %xmm5 movhps -18 * SIZE(A1, LDA), %xmm5 movaps %xmm1, -28 * SIZE(Y1) movaps -20 * SIZE(Y1), %xmm1 mulps %xmm6, %xmm2 addps %xmm2, %xmm0 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 mulps %xmm7, %xmm4 addps %xmm4, %xmm0 movaps %xmm0, -24 * SIZE(Y1) mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movaps %xmm1, -20 * SIZE(Y1) subl $-16 * SIZE, A1 subl $-16 * SIZE, Y1 ALIGN_3 .L15: testl $8, M je .L16 movsd -32 * SIZE(A1), %xmm2 movhps -30 * SIZE(A1), %xmm2 movsd -28 * SIZE(A1), %xmm3 movhps -26 * SIZE(A1), %xmm3 movaps -32 * SIZE(Y1), %xmm0 movaps -28 * SIZE(Y1), %xmm1 mulps %xmm6, %xmm2 addps %xmm2, %xmm0 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 movsd -32 * SIZE(A1, LDA), %xmm4 movhps -30 * SIZE(A1, LDA), %xmm4 movsd -28 * SIZE(A1, LDA), %xmm5 movhps -26 * SIZE(A1, LDA), %xmm5 mulps %xmm7, %xmm4 addps %xmm4, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) addl $8 * SIZE, A1 addl $8 * SIZE, Y1 ALIGN_3 .L16: testl $4, M je .L17 movsd -32 * SIZE(A1), %xmm2 movhps -30 * SIZE(A1), %xmm2 movsd -32 * SIZE(A1, LDA), %xmm3 movhps -30 * SIZE(A1, LDA), %xmm3 movaps -32 * SIZE(Y1), %xmm0 mulps %xmm6, %xmm2 addps %xmm2, %xmm0 mulps %xmm7, %xmm3 addps %xmm3, %xmm0 movaps %xmm0, -32 * SIZE(Y1) addl $4 * SIZE, A1 addl $4 * SIZE, Y1 ALIGN_3 .L17: testl $2, M je .L18 movsd -32 * SIZE(A1), %xmm2 movsd -32 * SIZE(A1, LDA), %xmm3 movsd -32 * SIZE(Y1), %xmm0 mulps %xmm6, %xmm2 addps %xmm2, %xmm0 mulps %xmm7, %xmm3 addps %xmm3, %xmm0 movlps %xmm0, -32 * SIZE(Y1) addl $2 * SIZE, A1 addl $2 * SIZE, Y1 ALIGN_3 .L18: testl $1, M je .L19 movss -32 * SIZE(A1), %xmm2 movss -32 * SIZE(A1, LDA), %xmm3 movss -32 * SIZE(Y1), %xmm0 mulss %xmm6, %xmm2 addss %xmm2, %xmm0 mulss %xmm7, %xmm3 addss %xmm3, %xmm0 movss %xmm0, -32 * SIZE(Y1) ALIGN_3 .L19: decl J jg .L11 ALIGN_4 .L20: testl $1, N jle .L990 movl BUFFER, Y1 addl $32 * SIZE, Y1 movl A, A1 movss (X), %xmm6 addl INCX, X movss ALPHA, %xmm0 mulss %xmm0, %xmm6 shufps $0, %xmm6, %xmm6 ALIGN_3 movl M, I sarl $4, I jle .L25 movsd -32 * SIZE(A1), %xmm2 movhps -30 * SIZE(A1), %xmm2 movsd -28 * SIZE(A1), %xmm3 movhps -26 * SIZE(A1), %xmm3 movaps -32 * SIZE(Y1), %xmm0 movaps -28 * SIZE(Y1), %xmm1 decl I jle .L24 ALIGN_3 .L23: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) #endif mulps %xmm6, %xmm2 addps %xmm2, %xmm0 movsd -24 * SIZE(A1), %xmm2 movhps -22 * 
SIZE(A1), %xmm2 movaps %xmm0, -32 * SIZE(Y1) movaps -24 * SIZE(Y1), %xmm0 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 movsd -20 * SIZE(A1), %xmm3 movhps -18 * SIZE(A1), %xmm3 movaps %xmm1, -28 * SIZE(Y1) movaps -20 * SIZE(Y1), %xmm1 mulps %xmm6, %xmm2 addps %xmm2, %xmm0 movsd -16 * SIZE(A1), %xmm2 movhps -14 * SIZE(A1), %xmm2 movaps %xmm0, -24 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 movsd -12 * SIZE(A1), %xmm3 movhps -10 * SIZE(A1), %xmm3 movaps %xmm1, -20 * SIZE(Y1) movaps -12 * SIZE(Y1), %xmm1 subl $-16 * SIZE, A1 subl $-16 * SIZE, Y1 subl $1, I BRANCH jg .L23 ALIGN_3 .L24: mulps %xmm6, %xmm2 addps %xmm2, %xmm0 movsd -24 * SIZE(A1), %xmm2 movhps -22 * SIZE(A1), %xmm2 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 movsd -20 * SIZE(A1), %xmm3 movhps -18 * SIZE(A1), %xmm3 movaps %xmm0, -32 * SIZE(Y1) movaps -24 * SIZE(Y1), %xmm0 movaps %xmm1, -28 * SIZE(Y1) movaps -20 * SIZE(Y1), %xmm1 mulps %xmm6, %xmm2 addps %xmm2, %xmm0 movaps %xmm0, -24 * SIZE(Y1) mulps %xmm6, %xmm3 addps %xmm3, %xmm1 movaps %xmm1, -20 * SIZE(Y1) subl $-16 * SIZE, A1 subl $-16 * SIZE, Y1 ALIGN_3 .L25: testl $8, M je .L26 movsd -32 * SIZE(A1), %xmm2 movhps -30 * SIZE(A1), %xmm2 movsd -28 * SIZE(A1), %xmm3 movhps -26 * SIZE(A1), %xmm3 movaps -32 * SIZE(Y1), %xmm0 movaps -28 * SIZE(Y1), %xmm1 mulps %xmm6, %xmm2 addps %xmm2, %xmm0 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) addl $8 * SIZE, A1 addl $8 * SIZE, Y1 ALIGN_3 .L26: testl $4, M je .L27 movsd -32 * SIZE(A1), %xmm2 movhps -30 * SIZE(A1), %xmm2 movaps -32 * SIZE(Y1), %xmm0 mulps %xmm6, %xmm2 addps %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y1) addl $4 * SIZE, A1 addl $4 * SIZE, Y1 ALIGN_3 .L27: testl $2, M je .L28 movsd -32 * SIZE(A1), %xmm2 movsd -32 * SIZE(Y1), %xmm0 mulps %xmm6, %xmm2 addps %xmm2, %xmm0 movlps %xmm0, -32 * SIZE(Y1) addl $2 * SIZE, A1 addl $2 * SIZE, Y1 ALIGN_3 .L28: testl $1, M je .L990 movss -32 * SIZE(A1), %xmm2 movss -32 * SIZE(Y1), %xmm0 mulss %xmm6, %xmm2 addss %xmm2, %xmm0 movss %xmm0, -32 * SIZE(Y1) ALIGN_3 .L990: movl Y, Y1 movl BUFFER, X movl STACK_INCY, INCY sall $BASE_SHIFT, INCY movl M, %eax sarl $2, %eax jle .L994 ALIGN_3 .L992: movss (Y1), %xmm0 addss 0 * SIZE(X), %xmm0 movss %xmm0, (Y1) addl INCY, Y1 movss (Y1), %xmm0 addss 1 * SIZE(X), %xmm0 movss %xmm0, (Y1) addl INCY, Y1 movss (Y1), %xmm0 addss 2 * SIZE(X), %xmm0 movss %xmm0, (Y1) addl INCY, Y1 movss (Y1), %xmm0 addss 3 * SIZE(X), %xmm0 movss %xmm0, (Y1) addl INCY, Y1 addl $4 * SIZE, X decl %eax jg .L992 ALIGN_3 .L994: testl $2, M jle .L996 movss (Y1), %xmm0 addss 0 * SIZE(X), %xmm0 movss %xmm0, (Y1) addl INCY, Y1 movss (Y1), %xmm0 addss 1 * SIZE(X), %xmm0 movss %xmm0, (Y1) addl INCY, Y1 addl $2 * SIZE, X ALIGN_3 .L996: testl $1, M jle .L999 movss (Y1), %xmm0 addss 0 * SIZE(X), %xmm0 movss %xmm0, (Y1) ALIGN_3 .L999: movl M,J leal (,J,SIZE),%eax addl %eax,AA movl STACK_INCY,INCY imull INCY,%eax addl %eax,YY jmp .L0t ALIGN_4 .L999x: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS,%esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemv_n_sse2.S000066400000000000000000000331131313527062700174050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 2) #endif #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 7) #endif #ifdef OPTERON #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (8 * 3) #define movsd movlps #endif #ifdef BARCELONA #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (8 * 5) #endif #ifdef ATOM #define PREFETCH prefetch #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 6) #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 4) #endif #define STACKSIZE 16 #define ARGS 16 #define M 4 + STACKSIZE+ARGS(%esp) #define N 8 + STACKSIZE+ARGS(%esp) #define ALPHA 16 + STACKSIZE+ARGS(%esp) #define A 24 + STACKSIZE+ARGS(%esp) #define STACK_LDA 28 + STACKSIZE+ARGS(%esp) #define STACK_X 32 + STACKSIZE+ARGS(%esp) #define STACK_INCX 36 + STACKSIZE+ARGS(%esp) #define Y 40 + STACKSIZE+ARGS(%esp) #define STACK_INCY 44 + STACKSIZE+ARGS(%esp) #define BUFFER 48 + STACKSIZE+ARGS(%esp) #define MMM 0+ARGS(%esp) #define YY 4+ARGS(%esp) #define AA 8+ARGS(%esp) #define I %eax #define J %ebx #define INCX %ecx #define INCY J #define A1 %esi #define X %edx #define Y1 %edi #define LDA %ebp PROLOGUE subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl Y,J movl J,YY # backup Y movl A,J movl J,AA # backup A movl M,J movl J,MMM # backup MM .L0t: xorl J,J addl $1,J sall $20,J subl J,MMM movl J,M jge .L00t ALIGN_4 movl MMM,%eax addl J,%eax jle .L999x movl %eax,M .L00t: movl AA,%eax movl %eax,A movl YY,J movl J,Y movl STACK_LDA, LDA movl STACK_X, X movl STACK_INCX, INCX leal (,INCX, SIZE), INCX leal (,LDA, SIZE), LDA subl $-16 * SIZE, A cmpl $0, N jle .L999 cmpl $0, M jle .L999 movl BUFFER, Y1 pxor %xmm7, %xmm7 movl M, %eax addl $16, %eax sarl $4, %eax 
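/*
   A note on the structure of this no-transpose kernel (added for orientation,
   not part of the original source): the loop below first zeroes the contiguous
   scratch BUFFER, the column loops then accumulate alpha * x[j] * A(:,j) into
   that buffer two columns at a time, and the .L990 block at the end adds the
   buffer into the possibly strided y.  A plain-C sketch of the operation, with
   the argument order taken from the stack-offset defines above (illustrative
   only, not the OpenBLAS C source):

     void dgemv_n_ref(long m, long n, double alpha,
                      const double *a, long lda,
                      const double *x, long incx,
                      double *y, long incy, double *buffer)
     {
         for (long i = 0; i < m; i++)            // zeroed scratch, as in .L01
             buffer[i] = 0.0;
         for (long j = 0; j < n; j++) {
             double t = alpha * x[j * incx];     // alpha folded into x, as above
             for (long i = 0; i < m; i++)
                 buffer[i] += t * a[i + j * lda];
         }
         for (long i = 0; i < m; i++)            // final strided add, as in .L990
             y[i * incy] += buffer[i];
     }
*/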
ALIGN_3 .L01: movapd %xmm7, 0 * SIZE(Y1) movapd %xmm7, 2 * SIZE(Y1) movapd %xmm7, 4 * SIZE(Y1) movapd %xmm7, 6 * SIZE(Y1) movapd %xmm7, 8 * SIZE(Y1) movapd %xmm7, 10 * SIZE(Y1) movapd %xmm7, 12 * SIZE(Y1) movapd %xmm7, 14 * SIZE(Y1) subl $-16 * SIZE, Y1 decl %eax jg .L01 ALIGN_3 .L10: movl N, J sarl $1, J jle .L20 ALIGN_3 .L11: movl BUFFER, Y1 addl $16 * SIZE, Y1 movl A, A1 leal (A1, LDA, 2), %eax movl %eax, A #ifdef HAVE_SSE3 movddup (X), %xmm6 addl INCX, X movddup (X), %xmm7 addl INCX, X movddup ALPHA, %xmm0 mulpd %xmm0, %xmm6 mulpd %xmm0, %xmm7 #else movsd (X), %xmm6 addl INCX, X movsd (X), %xmm7 addl INCX, X movsd ALPHA, %xmm0 mulsd %xmm0, %xmm6 mulsd %xmm0, %xmm7 unpcklpd %xmm6, %xmm6 unpcklpd %xmm7, %xmm7 #endif ALIGN_3 movl M, I sarl $3, I jle .L15 movsd -16 * SIZE(A1), %xmm2 movhpd -15 * SIZE(A1), %xmm2 movsd -14 * SIZE(A1), %xmm3 movhpd -13 * SIZE(A1), %xmm3 movapd -16 * SIZE(Y1), %xmm0 movapd -14 * SIZE(Y1), %xmm1 movsd -16 * SIZE(A1, LDA), %xmm4 movhpd -15 * SIZE(A1, LDA), %xmm4 movsd -14 * SIZE(A1, LDA), %xmm5 movhpd -13 * SIZE(A1, LDA), %xmm5 decl I jle .L14 ALIGN_3 .L13: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) #endif mulpd %xmm6, %xmm2 addpd %xmm2, %xmm0 movsd -12 * SIZE(A1), %xmm2 movhpd -11 * SIZE(A1), %xmm2 mulpd %xmm6, %xmm3 addpd %xmm3, %xmm1 movsd -10 * SIZE(A1), %xmm3 movhpd -9 * SIZE(A1), %xmm3 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movsd -12 * SIZE(A1, LDA), %xmm4 movhpd -11 * SIZE(A1, LDA), %xmm4 movapd %xmm0, -16 * SIZE(Y1) movapd -12 * SIZE(Y1), %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movsd -10 * SIZE(A1, LDA), %xmm5 movhpd -9 * SIZE(A1, LDA), %xmm5 movapd %xmm1, -14 * SIZE(Y1) movapd -10 * SIZE(Y1), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) * SIZE(A1, LDA) #endif mulpd %xmm6, %xmm2 addpd %xmm2, %xmm0 movsd -8 * SIZE(A1), %xmm2 movhpd -7 * SIZE(A1), %xmm2 mulpd %xmm6, %xmm3 addpd %xmm3, %xmm1 movsd -6 * SIZE(A1), %xmm3 movhpd -5 * SIZE(A1), %xmm3 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movsd -8 * SIZE(A1, LDA), %xmm4 movhpd -7 * SIZE(A1, LDA), %xmm4 movapd %xmm0, -12 * SIZE(Y1) movapd -8 * SIZE(Y1), %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movsd -6 * SIZE(A1, LDA), %xmm5 movhpd -5 * SIZE(A1, LDA), %xmm5 movapd %xmm1, -10 * SIZE(Y1) movapd -6 * SIZE(Y1), %xmm1 subl $-8 * SIZE, A1 subl $-8 * SIZE, Y1 subl $1, I BRANCH jg .L13 ALIGN_3 .L14: mulpd %xmm6, %xmm2 addpd %xmm2, %xmm0 movsd -12 * SIZE(A1), %xmm2 movhpd -11 * SIZE(A1), %xmm2 mulpd %xmm6, %xmm3 addpd %xmm3, %xmm1 movsd -10 * SIZE(A1), %xmm3 movhpd -9 * SIZE(A1), %xmm3 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movsd -12 * SIZE(A1, LDA), %xmm4 movhpd -11 * SIZE(A1, LDA), %xmm4 movapd %xmm0, -16 * SIZE(Y1) movapd -12 * SIZE(Y1), %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movsd -10 * SIZE(A1, LDA), %xmm5 movhpd -9 * SIZE(A1, LDA), %xmm5 movapd %xmm1, -14 * SIZE(Y1) movapd -10 * SIZE(Y1), %xmm1 mulpd %xmm6, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm6, %xmm3 addpd %xmm3, %xmm1 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movapd %xmm0, -12 * SIZE(Y1) mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movapd %xmm1, -10 * SIZE(Y1) subl $-8 * SIZE, A1 subl $-8 * SIZE, Y1 ALIGN_3 .L15: testl $4, M je .L16 movsd -16 * SIZE(A1), %xmm2 movhpd -15 * SIZE(A1), %xmm2 movsd -14 * SIZE(A1), %xmm3 movhpd -13 * SIZE(A1), %xmm3 movapd -16 * SIZE(Y1), %xmm0 movapd -14 * SIZE(Y1), %xmm1 mulpd %xmm6, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm6, %xmm3 addpd %xmm3, %xmm1 movsd -16 * SIZE(A1, LDA), %xmm4 movhpd -15 * SIZE(A1, LDA), %xmm4 movsd -14 * SIZE(A1, LDA), %xmm5 movhpd -13 * SIZE(A1, LDA), %xmm5 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 mulpd 
%xmm7, %xmm5 addpd %xmm5, %xmm1 movapd %xmm0, -16 * SIZE(Y1) movapd %xmm1, -14 * SIZE(Y1) addl $4 * SIZE, A1 addl $4 * SIZE, Y1 ALIGN_3 .L16: testl $2, M je .L17 movsd -16 * SIZE(A1), %xmm2 movhpd -15 * SIZE(A1), %xmm2 movsd -16 * SIZE(A1, LDA), %xmm3 movhpd -15 * SIZE(A1, LDA), %xmm3 movapd -16 * SIZE(Y1), %xmm0 mulpd %xmm6, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm7, %xmm3 addpd %xmm3, %xmm0 movapd %xmm0, -16 * SIZE(Y1) addl $2 * SIZE, A1 addl $2 * SIZE, Y1 ALIGN_3 .L17: testl $1, M je .L19 movsd -16 * SIZE(A1), %xmm2 movsd -16 * SIZE(A1, LDA), %xmm3 movsd -16 * SIZE(Y1), %xmm0 mulsd %xmm6, %xmm2 addsd %xmm2, %xmm0 mulsd %xmm7, %xmm3 addsd %xmm3, %xmm0 movsd %xmm0, -16 * SIZE(Y1) ALIGN_3 .L19: decl J jg .L11 ALIGN_4 .L20: testl $1, N jle .L990 movl BUFFER, Y1 addl $16 * SIZE, Y1 movl A, A1 #ifdef HAVE_SSE3 movddup (X), %xmm6 addl INCX, X movddup ALPHA, %xmm0 mulpd %xmm0, %xmm6 #else movsd (X), %xmm6 addl INCX, X movsd ALPHA, %xmm0 mulsd %xmm0, %xmm6 unpcklpd %xmm6, %xmm6 #endif ALIGN_3 movl M, I sarl $3, I jle .L25 movsd -16 * SIZE(A1), %xmm2 movhpd -15 * SIZE(A1), %xmm2 movsd -14 * SIZE(A1), %xmm3 movhpd -13 * SIZE(A1), %xmm3 movapd -16 * SIZE(Y1), %xmm0 movapd -14 * SIZE(Y1), %xmm1 decl I jle .L24 ALIGN_3 .L23: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) #endif mulpd %xmm6, %xmm2 addpd %xmm2, %xmm0 movsd -12 * SIZE(A1), %xmm2 movhpd -11 * SIZE(A1), %xmm2 movapd %xmm0, -16 * SIZE(Y1) movapd -12 * SIZE(Y1), %xmm0 mulpd %xmm6, %xmm3 addpd %xmm3, %xmm1 movsd -10 * SIZE(A1), %xmm3 movhpd -9 * SIZE(A1), %xmm3 movapd %xmm1, -14 * SIZE(Y1) movapd -10 * SIZE(Y1), %xmm1 mulpd %xmm6, %xmm2 addpd %xmm2, %xmm0 movsd -8 * SIZE(A1), %xmm2 movhpd -7 * SIZE(A1), %xmm2 movapd %xmm0, -12 * SIZE(Y1) movapd -8 * SIZE(Y1), %xmm0 mulpd %xmm6, %xmm3 addpd %xmm3, %xmm1 movsd -6 * SIZE(A1), %xmm3 movhpd -5 * SIZE(A1), %xmm3 movapd %xmm1, -10 * SIZE(Y1) movapd -6 * SIZE(Y1), %xmm1 subl $-8 * SIZE, A1 subl $-8 * SIZE, Y1 subl $1, I BRANCH jg .L23 ALIGN_3 .L24: mulpd %xmm6, %xmm2 addpd %xmm2, %xmm0 movsd -12 * SIZE(A1), %xmm2 movhpd -11 * SIZE(A1), %xmm2 mulpd %xmm6, %xmm3 addpd %xmm3, %xmm1 movsd -10 * SIZE(A1), %xmm3 movhpd -9 * SIZE(A1), %xmm3 movapd %xmm0, -16 * SIZE(Y1) movapd -12 * SIZE(Y1), %xmm0 movapd %xmm1, -14 * SIZE(Y1) movapd -10 * SIZE(Y1), %xmm1 mulpd %xmm6, %xmm2 addpd %xmm2, %xmm0 movapd %xmm0, -12 * SIZE(Y1) mulpd %xmm6, %xmm3 addpd %xmm3, %xmm1 movapd %xmm1, -10 * SIZE(Y1) subl $-8 * SIZE, A1 subl $-8 * SIZE, Y1 ALIGN_3 .L25: testl $4, M je .L26 movsd -16 * SIZE(A1), %xmm2 movhpd -15 * SIZE(A1), %xmm2 movsd -14 * SIZE(A1), %xmm3 movhpd -13 * SIZE(A1), %xmm3 movapd -16 * SIZE(Y1), %xmm0 movapd -14 * SIZE(Y1), %xmm1 mulpd %xmm6, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm6, %xmm3 addpd %xmm3, %xmm1 movapd %xmm0, -16 * SIZE(Y1) movapd %xmm1, -14 * SIZE(Y1) addl $4 * SIZE, A1 addl $4 * SIZE, Y1 ALIGN_3 .L26: testl $2, M je .L27 movsd -16 * SIZE(A1), %xmm2 movhpd -15 * SIZE(A1), %xmm2 movapd -16 * SIZE(Y1), %xmm0 mulpd %xmm6, %xmm2 addpd %xmm2, %xmm0 movapd %xmm0, -16 * SIZE(Y1) addl $2 * SIZE, A1 addl $2 * SIZE, Y1 ALIGN_3 .L27: testl $1, M je .L990 movsd -16 * SIZE(A1), %xmm2 movsd -16 * SIZE(Y1), %xmm0 mulsd %xmm6, %xmm2 addsd %xmm2, %xmm0 movsd %xmm0, -16 * SIZE(Y1) ALIGN_3 .L990: movl Y, Y1 movl BUFFER, X movl STACK_INCY, INCY sall $BASE_SHIFT, INCY movl M, %eax sarl $3, %eax jle .L994 ALIGN_3 .L992: movsd (Y1), %xmm0 movhpd (Y1, INCY), %xmm0 addpd 0 * SIZE(X), %xmm0 movlpd %xmm0, (Y1) movhpd %xmm0, (Y1, INCY) leal (Y1, INCY, 2), Y1 movsd (Y1), %xmm0 movhpd (Y1, INCY), %xmm0 addpd 2 * SIZE(X), 
%xmm0 movlpd %xmm0, (Y1) movhpd %xmm0, (Y1, INCY) leal (Y1, INCY, 2), Y1 movsd (Y1), %xmm0 movhpd (Y1, INCY), %xmm0 addpd 4 * SIZE(X), %xmm0 movlpd %xmm0, (Y1) movhpd %xmm0, (Y1, INCY) leal (Y1, INCY, 2), Y1 movsd (Y1), %xmm0 movhpd (Y1, INCY), %xmm0 addpd 6 * SIZE(X), %xmm0 movlpd %xmm0, (Y1) movhpd %xmm0, (Y1, INCY) leal (Y1, INCY, 2), Y1 addl $8 * SIZE, X decl %eax jg .L992 ALIGN_3 .L994: testl $7, M jle .L999 testl $4, M jle .L995 movsd (Y1), %xmm0 movhpd (Y1, INCY), %xmm0 addpd 0 * SIZE(X), %xmm0 movlpd %xmm0, (Y1) movhpd %xmm0, (Y1, INCY) leal (Y1, INCY, 2), Y1 movsd (Y1), %xmm0 movhpd (Y1, INCY), %xmm0 addpd 2 * SIZE(X), %xmm0 movlpd %xmm0, (Y1) movhpd %xmm0, (Y1, INCY) leal (Y1, INCY, 2), Y1 addl $4 * SIZE, X ALIGN_3 .L995: testl $2, M jle .L996 movsd (Y1), %xmm0 movhpd (Y1, INCY), %xmm0 addpd 0 * SIZE(X), %xmm0 movlpd %xmm0, (Y1) movhpd %xmm0, (Y1, INCY) leal (Y1, INCY, 2), Y1 addl $2 * SIZE, X ALIGN_3 .L996: testl $1, M jle .L999 movsd (Y1), %xmm0 movsd 0 * SIZE(X), %xmm4 addsd %xmm4, %xmm0 movlpd %xmm0, (Y1) ALIGN_3 .L999: movl M,J leal (,J,SIZE),%eax addl %eax,AA movl STACK_INCY,INCY imull INCY,%eax addl %eax,YY jmp .L0t ALIGN_4 .L999x: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS,%esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemv_t.S000066400000000000000000000272451313527062700164700ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef PENTIUM #define P 88 #endif #ifndef P #define P 1000 #endif #define STACK 16 #define ARGS 24 #define NLDA 0 + STACK(%esp) #define XP 4 + STACK(%esp) #define MIN_M 8 + STACK(%esp) #define J 12 + STACK(%esp) #define IS 16 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #ifdef DOUBLE #define A 24 + STACK + ARGS(%esp) #define LDA 28 + STACK + ARGS(%esp) #define X 32 + STACK + ARGS(%esp) #define INCX 36 + STACK + ARGS(%esp) #define Y 40 + STACK + ARGS(%esp) #define INCY 44 + STACK + ARGS(%esp) #define BUFFER 48 + STACK + ARGS(%esp) #else #define A 20 + STACK + ARGS(%esp) #define LDA 24 + STACK + ARGS(%esp) #define X 28 + STACK + ARGS(%esp) #define INCX 32 + STACK + ARGS(%esp) #define Y 36 + STACK + ARGS(%esp) #define INCY 40 + STACK + ARGS(%esp) #define BUFFER 44 + STACK + ARGS(%esp) #endif PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE FLD ALPHA movl X, %edi # X movl $0, IS movl M, %ebx movl N, %eax testl %ebx, %ebx jle .L79 testl %eax, %eax jle .L79 movl INCX, %esi leal (,%esi,SIZE), %esi movl %esi, INCX movl INCY, %esi leal (, %esi, SIZE), %esi movl %esi, INCY movl LDA, %ebx imull %ebx, %eax movl $P, %esi subl %eax, %esi leal (, %esi, SIZE), %esi movl %esi, NLDA leal (,%ebx,SIZE), %esi movl %esi, LDA ALIGN_2 .L32: movl IS, %esi movl $P, %edx movl M, %eax subl %esi, %eax cmpl %edx, %eax #ifdef PENTIUM jle .L33 movl %edx, %eax .L33: #else cmovg %edx, %eax #endif movl %eax, MIN_M movl IS, %ecx leal (%edi,%ecx,SIZE), %ecx # xp = x + is movl INCX, %ebx movl %ecx, XP cmpl $SIZE, %ebx je .L34 movl BUFFER, %esi movl MIN_M, %ecx movl %esi, XP sarl $2, %ecx jle .L35 ALIGN_3 .L36: FLD (%edi) addl %ebx, %edi FST 0 * SIZE(%esi) FLD (%edi) addl %ebx, %edi FST 1 * SIZE(%esi) FLD (%edi) addl %ebx, %edi FST 2 * SIZE(%esi) FLD (%edi) addl %ebx, %edi FST 3 * SIZE(%esi) addl $4 * SIZE, %esi decl %ecx jg .L36 ALIGN_3 .L35: movl MIN_M, %ecx andl $3,%ecx jle .L34 ALIGN_2 .L42: FLD (%edi) addl %ebx, %edi FST (%esi) addl $SIZE, %esi decl %ecx jg .L42 ALIGN_3 /* Main Routine */ .L34: movl Y, %ebp # coffset = y movl N, %esi sarl $2, %esi movl %esi, J jle .L47 ALIGN_3 .L48: movl A, %ebx # a_offset = a fldz movl LDA, %edx fldz leal (%ebx, %edx), %ecx # a_offset2 = a + lda fldz leal (%ebx, %edx, 4), %eax fldz movl %eax, A movl XP, %esi FLD (%esi) movl MIN_M, %eax sarl $2,%eax jle .L51 ALIGN_3 #define PRESIZE 8 .L80: #ifdef PENTIUM3 prefetcht0 PRESIZE * SIZE(%ebx, %edx, 2) FLD 0 * SIZE(%ebx) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 prefetcht0 PRESIZE * SIZE(%ecx) faddp %st,%st(2) # ct1 += at1 FLD 0 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) prefetcht0 PRESIZE * SIZE(%ecx, %edx, 2) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 prefetcht0 PRESIZE * SIZE(%ebx) FLD 0 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FLD 0 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) fmulp %st, %st(1) faddp %st,%st(4) FLD 1 * SIZE(%esi) FLD 1 * SIZE(%ebx) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(2) # ct1 += at1 FLD 1 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 FLD 1 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FLD 1 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) fmulp %st, %st(1) faddp 
%st,%st(4) FLD 2 * SIZE(%esi) FLD 2 * SIZE(%ebx) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(2) # ct1 += at1 FLD 2 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 FLD 2 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FLD 2 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) fmulp %st, %st(1) faddp %st,%st(4) FLD 3 * SIZE(%esi) FLD 3 * SIZE(%ebx) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(2) # ct1 += at1 FLD 3 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 FLD 3 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FLD 3 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) fmulp %st, %st(1) addl $4 * SIZE, %ebx faddp %st,%st(4) addl $4 * SIZE, %ecx FLD 4 * SIZE(%esi) addl $4 * SIZE, %esi #else #if defined(HAS_PREFETCH) prefetcht0 PRESIZE * SIZE(%ebx) prefetcht0 PRESIZE * SIZE(%ebx, %edx, 2) prefetcht0 PRESIZE * SIZE(%ecx) prefetcht0 PRESIZE * SIZE(%ecx, %edx, 2) #endif FLD 0 * SIZE(%ebx) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(2) # ct1 += at1 FLD 0 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 FLD 0 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FMUL 0 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) faddp %st,%st(4) FLD 1 * SIZE(%esi) FLD 1 * SIZE(%ebx) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(2) # ct1 += at1 FLD 1 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 FLD 1 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FMUL 1 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) faddp %st,%st(4) FLD 2 * SIZE(%esi) FLD 2 * SIZE(%ebx) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(2) # ct1 += at1 FLD 2 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 FLD 2 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FMUL 2 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) faddp %st,%st(4) FLD 3 * SIZE(%esi) FLD 3 * SIZE(%ebx) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(2) # ct1 += at1 FLD 3 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 FLD 3 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FMUL 3 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) faddp %st,%st(4) FLD 4 * SIZE(%esi) addl $4 * SIZE, %ebx addl $4 * SIZE, %ecx addl $4 * SIZE, %esi #endif decl %eax jg .L80 ALIGN_3 .L51: movl MIN_M, %eax andl $3, %eax je .L81 ALIGN_3 .L52: FLD (%ebx) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(2) # ct1 += at1 FLD (%ecx) # at1 = *(a_offset2 + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 FLD (%ebx, %edx, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FMUL (%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) faddp %st,%st(4) FLD 1 * SIZE(%esi) addl $SIZE, %ebx addl $SIZE, %ecx addl $SIZE, %esi decl %eax jg .L52 ALIGN_3 .L81: #ifndef C_SUN ffreep %st(0) #else .byte 0xdf .byte 0xc0 #endif fxch %st(4) fmul %st, %st(4) fmul %st, %st(1) fmul %st, %st(2) fmul %st, %st(3) fxch %st(4) movl INCY, %eax FADD (%ebp) FST (%ebp) addl %eax, %ebp FADD (%ebp) 
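/* Added note: the four per-column sums kept on the x87 stack were scaled by
   alpha just above (the fmul %st,%st(k) sequence); the FADD / FST pairs in
   this sequence each read one y element, add one scaled sum into it and store
   it back, with the y pointer advancing by INCY between elements. */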
FST (%ebp) addl %eax, %ebp FADD (%ebp) FST (%ebp) addl %eax, %ebp FADD (%ebp) FST (%ebp) addl %eax, %ebp decl J jg .L48 ALIGN_3 .L47: movl N, %esi andl $3,%esi movl %esi, J jle .L60 ALIGN_2 .L61: movl A, %ebx # a_offset = a fldz # ct1 = ZERO movl LDA, %edx fldz # ct1 = ZERO addl %ebx, %edx fldz # ct1 = ZERO movl %edx, A fldz # ct1 = ZERO movl XP, %esi movl MIN_M, %eax sarl $3,%eax jle .L64 ALIGN_3 .L65: #ifdef HAS_PREFETCH prefetcht0 PRESIZE * 2 * SIZE(%ebx) prefetcht0 PRESIZE * 2 * SIZE(%ebx) #endif FLD 0 * SIZE(%esi) FMUL 0 * SIZE(%ebx) faddp %st,%st(1) FLD 1 * SIZE(%esi) FMUL 1 * SIZE(%ebx) faddp %st,%st(2) FLD 2 * SIZE(%esi) FMUL 2 * SIZE(%ebx) faddp %st,%st(3) FLD 3 * SIZE(%esi) FMUL 3 * SIZE(%ebx) faddp %st,%st(4) FLD 4 * SIZE(%esi) FMUL 4 * SIZE(%ebx) faddp %st,%st(1) FLD 5 * SIZE(%esi) FMUL 5 * SIZE(%ebx) faddp %st,%st(2) FLD 6 * SIZE(%esi) FMUL 6 * SIZE(%ebx) faddp %st,%st(3) FLD 7 * SIZE(%esi) FMUL 7 * SIZE(%ebx) faddp %st,%st(4) addl $8 * SIZE, %esi addl $8 * SIZE, %ebx decl %eax jg .L65 ALIGN_3 .L64: movl MIN_M, %eax andl $7, %eax jle .L70 ALIGN_3 .L71: FLD (%esi) FMUL (%ebx) faddp %st,%st(1) addl $SIZE, %esi addl $SIZE, %ebx decl %eax jg .L71 ALIGN_3 .L70: faddp %st, %st(1) faddp %st, %st(1) faddp %st, %st(1) fmul %st(1),%st FADD (%ebp) FST (%ebp) addl INCY, %ebp decl J jg .L61 ALIGN_3 .L60: movl A, %ebx addl NLDA, %ebx movl %ebx, A addl $P, IS movl M, %esi cmpl %esi, IS jl .L32 ALIGN_3 .L79: #ifndef C_SUN ffreep %st(0) #else .byte 0xdf .byte 0xc0 #endif popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemv_t_atom.S000066400000000000000000000276111313527062700175050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef ATOM #define PREFETCH prefetchnta #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 6) #endif #define STACKSIZE 16 #define M 4 + STACKSIZE(%esp) #define N 8 + STACKSIZE(%esp) #define ALPHA 16 + STACKSIZE(%esp) #define A 24 + STACKSIZE(%esp) #define STACK_LDA 28 + STACKSIZE(%esp) #define STACK_X 32 + STACKSIZE(%esp) #define STACK_INCX 36 + STACKSIZE(%esp) #define Y 40 + STACKSIZE(%esp) #define STACK_INCY 44 + STACKSIZE(%esp) #define BUFFER 48 + STACKSIZE(%esp) #define I %eax #define J %ebx #define INCX J #define INCY %ecx #define A1 %esi #define X %edx #define Y1 %edi #define LDA %ebp PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_LDA, LDA movl STACK_X, X movl STACK_INCX, INCX movl STACK_INCY, INCY leal (,INCX, SIZE), INCX leal (,INCY, SIZE), INCY leal (,LDA, SIZE), LDA subl $-16 * SIZE, A cmpl $0, N jle .L999 cmpl $0, M jle .L999 movl BUFFER, Y1 movl M, I sarl $3, I jle .L05 ALIGN_4 .L02: movsd (X), %xmm0 addl INCX, X movhpd (X), %xmm0 addl INCX, X movsd (X), %xmm1 addl INCX, X movhpd (X), %xmm1 addl INCX, X movsd (X), %xmm2 addl INCX, X movhpd (X), %xmm2 addl INCX, X movsd (X), %xmm3 addl INCX, X movhpd (X), %xmm3 addl INCX, X movapd %xmm0, 0 * SIZE(Y1) movapd %xmm1, 2 * SIZE(Y1) movapd %xmm2, 4 * SIZE(Y1) movapd %xmm3, 6 * SIZE(Y1) addl $8 * SIZE, Y1 decl I jg .L02 ALIGN_4 .L05: movl M, I andl $7, I jle .L10 ALIGN_2 .L06: movsd (X), %xmm0 addl INCX, X movsd %xmm0, 0 * SIZE(Y1) addl $SIZE, Y1 decl I jg .L06 ALIGN_4 .L10: movl Y, Y1 movl N, J sarl $1, J jle .L20 ALIGN_3 .L11: movl BUFFER, X addl $16 * SIZE, X movl A, A1 leal (A1, LDA, 2), %eax movl %eax, A xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 movsd -16 * SIZE(X), %xmm2 movsd -15 * SIZE(X), %xmm3 movl M, I sarl $3, I jle .L15 movsd -16 * SIZE(A1), %xmm4 movsd -16 * SIZE(A1, LDA), %xmm5 movsd -15 * SIZE(A1), %xmm6 movsd -15 * SIZE(A1, LDA), %xmm7 mulsd %xmm2, %xmm4 mulsd %xmm2, %xmm5 movsd -14 * SIZE(X), %xmm2 decl I jle .L13 ALIGN_4 .L12: #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(A1) #endif mulsd %xmm3, %xmm6 addsd %xmm4, %xmm0 movsd -14 * SIZE(A1), %xmm4 mulsd %xmm3, %xmm7 movsd -13 * SIZE(X), %xmm3 addsd %xmm5, %xmm1 movsd -14 * SIZE(A1, LDA), %xmm5 mulsd %xmm2, %xmm4 addsd %xmm6, %xmm0 movsd -13 * SIZE(A1), %xmm6 mulsd %xmm2, %xmm5 movsd -12 * SIZE(X), %xmm2 addsd %xmm7, %xmm1 movsd -13 * SIZE(A1, LDA), %xmm7 mulsd %xmm3, %xmm6 addsd %xmm4, %xmm0 movsd -12 * SIZE(A1), %xmm4 mulsd %xmm3, %xmm7 movsd -11 * SIZE(X), %xmm3 addsd %xmm5, %xmm1 movsd -12 * SIZE(A1, LDA), %xmm5 mulsd %xmm2, %xmm4 addsd %xmm6, %xmm0 movsd -11 * SIZE(A1), %xmm6 mulsd %xmm2, %xmm5 movsd -10 * SIZE(X), %xmm2 addsd %xmm7, %xmm1 movsd -11 * SIZE(A1, LDA), %xmm7 #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(A1, LDA) #endif mulsd %xmm3, %xmm6 addsd %xmm4, %xmm0 movsd -10 * SIZE(A1), %xmm4 mulsd %xmm3, %xmm7 movsd -9 * SIZE(X), %xmm3 addsd %xmm5, %xmm1 movsd -10 * SIZE(A1, LDA), %xmm5 mulsd %xmm2, %xmm4 addsd %xmm6, %xmm0 movsd -9 * SIZE(A1), %xmm6 mulsd %xmm2, %xmm5 movsd -8 * SIZE(X), %xmm2 addsd %xmm7, %xmm1 movsd -9 * SIZE(A1, LDA), %xmm7 mulsd %xmm3, %xmm6 addsd %xmm4, %xmm0 movsd -8 * SIZE(A1), %xmm4 mulsd %xmm3, %xmm7 movsd -7 * SIZE(X), %xmm3 addsd %xmm5, %xmm1 movsd -8 * SIZE(A1, LDA), %xmm5 mulsd %xmm2, %xmm4 addsd %xmm6, %xmm0 movsd -7 * SIZE(A1), %xmm6 mulsd %xmm2, %xmm5 movsd -6 * SIZE(X), %xmm2 addsd %xmm7, %xmm1 movsd -7 * SIZE(A1, LDA), %xmm7 addl $8 * SIZE, A1 addl $8 * SIZE, X decl I jg .L12 ALIGN_4 
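/* Added note: the .L12 loop above keeps two scalar dot products in flight,
   xmm0 for the A1 column and xmm1 for the A1+LDA column, with the next
   operands preloaded each iteration; .L13 below drains the operands that the
   last iteration already loaded.  For orientation, a plain-C sketch of the
   arithmetic this transposed kernel performs on the packed copy of x
   (illustrative only, not the OpenBLAS C source):

     void dgemv_t_ref(long m, long n, double alpha,
                      const double *a, long lda,
                      const double *x,          // packed contiguously, as in BUFFER
                      double *y, long incy)
     {
         long j = 0;
         for (; j + 1 < n; j += 2) {            // two columns per pass
             double s0 = 0.0, s1 = 0.0;
             for (long i = 0; i < m; i++) {
                 s0 += a[i + (j + 0) * lda] * x[i];
                 s1 += a[i + (j + 1) * lda] * x[i];
             }
             y[(j + 0) * incy] += alpha * s0;
             y[(j + 1) * incy] += alpha * s1;
         }
         for (; j < n; j++) {                   // odd trailing column
             double s = 0.0;
             for (long i = 0; i < m; i++)
                 s += a[i + j * lda] * x[i];
             y[j * incy] += alpha * s;
         }
     }
*/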
.L13: mulsd %xmm3, %xmm6 addsd %xmm4, %xmm0 movsd -14 * SIZE(A1), %xmm4 mulsd %xmm3, %xmm7 movsd -13 * SIZE(X), %xmm3 addsd %xmm5, %xmm1 movsd -14 * SIZE(A1, LDA), %xmm5 mulsd %xmm2, %xmm4 addsd %xmm6, %xmm0 movsd -13 * SIZE(A1), %xmm6 mulsd %xmm2, %xmm5 movsd -12 * SIZE(X), %xmm2 addsd %xmm7, %xmm1 movsd -13 * SIZE(A1, LDA), %xmm7 mulsd %xmm3, %xmm6 addsd %xmm4, %xmm0 movsd -12 * SIZE(A1), %xmm4 mulsd %xmm3, %xmm7 movsd -11 * SIZE(X), %xmm3 addsd %xmm5, %xmm1 movsd -12 * SIZE(A1, LDA), %xmm5 mulsd %xmm2, %xmm4 addsd %xmm6, %xmm0 movsd -11 * SIZE(A1), %xmm6 mulsd %xmm2, %xmm5 movsd -10 * SIZE(X), %xmm2 addsd %xmm7, %xmm1 movsd -11 * SIZE(A1, LDA), %xmm7 mulsd %xmm3, %xmm6 addsd %xmm4, %xmm0 movsd -10 * SIZE(A1), %xmm4 mulsd %xmm3, %xmm7 movsd -9 * SIZE(X), %xmm3 addsd %xmm5, %xmm1 movsd -10 * SIZE(A1, LDA), %xmm5 mulsd %xmm2, %xmm4 addsd %xmm6, %xmm0 movsd -9 * SIZE(A1), %xmm6 mulsd %xmm2, %xmm5 movsd -8 * SIZE(X), %xmm2 addsd %xmm7, %xmm1 movsd -9 * SIZE(A1, LDA), %xmm7 mulsd %xmm3, %xmm6 addsd %xmm4, %xmm0 mulsd %xmm3, %xmm7 movsd -7 * SIZE(X), %xmm3 addsd %xmm5, %xmm1 addsd %xmm6, %xmm0 addl $8 * SIZE, A1 addsd %xmm7, %xmm1 addl $8 * SIZE, X ALIGN_4 .L15: testl $4, M jle .L16 movsd -16 * SIZE(A1), %xmm4 movsd -16 * SIZE(A1, LDA), %xmm5 movsd -15 * SIZE(A1), %xmm6 movsd -15 * SIZE(A1, LDA), %xmm7 mulsd %xmm2, %xmm4 mulsd %xmm2, %xmm5 movsd -14 * SIZE(X), %xmm2 mulsd %xmm3, %xmm6 addsd %xmm4, %xmm0 movsd -14 * SIZE(A1), %xmm4 mulsd %xmm3, %xmm7 movsd -13 * SIZE(X), %xmm3 addsd %xmm5, %xmm1 movsd -14 * SIZE(A1, LDA), %xmm5 mulsd %xmm2, %xmm4 addsd %xmm6, %xmm0 movsd -13 * SIZE(A1), %xmm6 mulsd %xmm2, %xmm5 movsd -12 * SIZE(X), %xmm2 addsd %xmm7, %xmm1 movsd -13 * SIZE(A1, LDA), %xmm7 mulsd %xmm3, %xmm6 addsd %xmm4, %xmm0 mulsd %xmm3, %xmm7 movsd -11 * SIZE(X), %xmm3 addsd %xmm5, %xmm1 addsd %xmm6, %xmm0 addsd %xmm7, %xmm1 addl $4 * SIZE, A1 addl $4 * SIZE, X ALIGN_4 .L16: testl $2, M jle .L17 movsd -16 * SIZE(A1), %xmm4 movsd -16 * SIZE(A1, LDA), %xmm5 movsd -15 * SIZE(A1), %xmm6 movsd -15 * SIZE(A1, LDA), %xmm7 mulsd %xmm2, %xmm4 mulsd %xmm2, %xmm5 movsd -14 * SIZE(X), %xmm2 mulsd %xmm3, %xmm6 addsd %xmm4, %xmm0 mulsd %xmm3, %xmm7 addsd %xmm5, %xmm1 addsd %xmm6, %xmm0 addsd %xmm7, %xmm1 addl $2 * SIZE, A1 ALIGN_4 .L17: testl $1, M jle .L18 movsd -16 * SIZE(A1), %xmm4 movsd -16 * SIZE(A1, LDA), %xmm5 mulsd %xmm2, %xmm4 mulsd %xmm2, %xmm5 addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 ALIGN_4 .L18: movsd ALPHA, %xmm7 mulpd %xmm7, %xmm0 mulpd %xmm7, %xmm1 addsd (Y1), %xmm0 addsd (Y1, INCY), %xmm1 movsd %xmm0, (Y1) movsd %xmm1, (Y1, INCY) leal (Y1, INCY, 2), Y1 decl J jg .L11 ALIGN_4 .L20: testl $1, N jle .L999 movl BUFFER, X addl $16 * SIZE, X movl A, A1 leal (A1, LDA, 2), %eax movl %eax, A xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 movsd -16 * SIZE(X), %xmm2 movsd -15 * SIZE(X), %xmm3 movl M, I sarl $3, I jle .L25 movsd -16 * SIZE(A1), %xmm4 movsd -15 * SIZE(A1), %xmm5 movsd -14 * SIZE(A1), %xmm6 movsd -13 * SIZE(A1), %xmm7 mulsd %xmm2, %xmm4 movsd -14 * SIZE(X), %xmm2 mulsd %xmm3, %xmm5 movsd -13 * SIZE(X), %xmm3 decl I jle .L23 ALIGN_4 .L22: #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(A1) #endif mulsd %xmm2, %xmm6 movsd -12 * SIZE(X), %xmm2 addsd %xmm4, %xmm0 movsd -12 * SIZE(A1), %xmm4 mulsd %xmm3, %xmm7 movsd -11 * SIZE(X), %xmm3 addsd %xmm5, %xmm1 movsd -11 * SIZE(A1), %xmm5 addsd %xmm6, %xmm0 movsd -10 * SIZE(A1), %xmm6 mulsd %xmm2, %xmm4 movsd -10 * SIZE(X), %xmm2 addsd %xmm7, %xmm1 movsd -9 * SIZE(A1), %xmm7 mulsd %xmm3, %xmm5 movsd -9 * SIZE(X), %xmm3 mulsd %xmm2, %xmm6 movsd -8 * SIZE(X), 
%xmm2 addsd %xmm4, %xmm0 movsd -8 * SIZE(A1), %xmm4 mulsd %xmm3, %xmm7 movsd -7 * SIZE(X), %xmm3 addsd %xmm5, %xmm1 movsd -7 * SIZE(A1), %xmm5 addsd %xmm6, %xmm0 movsd -6 * SIZE(A1), %xmm6 mulsd %xmm2, %xmm4 movsd -6 * SIZE(X), %xmm2 addsd %xmm7, %xmm1 movsd -5 * SIZE(A1), %xmm7 mulsd %xmm3, %xmm5 movsd -5 * SIZE(X), %xmm3 addl $8 * SIZE, A1 addl $8 * SIZE, X decl I jg .L22 ALIGN_4 .L23: mulsd %xmm2, %xmm6 movsd -12 * SIZE(X), %xmm2 addsd %xmm4, %xmm0 movsd -12 * SIZE(A1), %xmm4 mulsd %xmm3, %xmm7 movsd -11 * SIZE(X), %xmm3 addsd %xmm5, %xmm1 movsd -11 * SIZE(A1), %xmm5 addsd %xmm6, %xmm0 movsd -10 * SIZE(A1), %xmm6 mulsd %xmm2, %xmm4 movsd -10 * SIZE(X), %xmm2 addsd %xmm7, %xmm1 movsd -9 * SIZE(A1), %xmm7 mulsd %xmm3, %xmm5 movsd -9 * SIZE(X), %xmm3 mulsd %xmm2, %xmm6 movsd -8 * SIZE(X), %xmm2 addsd %xmm4, %xmm0 mulsd %xmm3, %xmm7 movsd -7 * SIZE(X), %xmm3 addsd %xmm5, %xmm1 addsd %xmm6, %xmm0 addsd %xmm7, %xmm1 addl $8 * SIZE, A1 addl $8 * SIZE, X ALIGN_4 .L25: testl $4, M jle .L26 movsd -16 * SIZE(A1), %xmm4 movsd -15 * SIZE(A1), %xmm5 movsd -14 * SIZE(A1), %xmm6 movsd -13 * SIZE(A1), %xmm7 mulsd %xmm2, %xmm4 movsd -14 * SIZE(X), %xmm2 mulsd %xmm3, %xmm5 movsd -13 * SIZE(X), %xmm3 mulsd %xmm2, %xmm6 movsd -12 * SIZE(X), %xmm2 addsd %xmm4, %xmm0 mulsd %xmm3, %xmm7 movsd -11 * SIZE(X), %xmm3 addsd %xmm5, %xmm1 addsd %xmm6, %xmm0 addsd %xmm7, %xmm1 addl $4 * SIZE, A1 addl $4 * SIZE, X ALIGN_4 .L26: testl $2, M jle .L27 movsd -16 * SIZE(A1), %xmm4 movsd -15 * SIZE(A1), %xmm5 mulsd %xmm2, %xmm4 movsd -14 * SIZE(X), %xmm2 mulsd %xmm3, %xmm5 addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 addl $2 * SIZE, A1 ALIGN_4 .L27: testl $1, M jle .L28 movsd -16 * SIZE(A1), %xmm4 mulsd %xmm2, %xmm4 addsd %xmm4, %xmm0 ALIGN_4 .L28: movsd ALPHA, %xmm7 addsd %xmm1, %xmm0 mulpd %xmm7, %xmm0 addsd (Y1), %xmm0 movsd %xmm0, (Y1) ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemv_t_sse.S000066400000000000000000000316071313527062700173370ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef movsd #undef movsd #endif #ifdef PENTIUM3 #ifdef HAVE_SSE #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 2) #endif #define movsd movlps #endif #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 4) #endif #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 7) #endif #ifdef OPTERON #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 3) #define movsd movlps #endif #ifdef BARCELONA #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 5) #endif #ifdef ATOM #define PREFETCH prefetchnta #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 6) #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHSIZE (16 * 4) #endif #define STACKSIZE 16 #define ARGS 20 #define M 4 + STACKSIZE+ARGS(%esp) #define N 8 + STACKSIZE+ARGS(%esp) #define ALPHA 16 + STACKSIZE+ARGS(%esp) #define A 20 + STACKSIZE+ARGS(%esp) #define STACK_LDA 24 + STACKSIZE+ARGS(%esp) #define STACK_X 28 + STACKSIZE+ARGS(%esp) #define STACK_INCX 32 + STACKSIZE+ARGS(%esp) #define Y 36 + STACKSIZE+ARGS(%esp) #define STACK_INCY 40 + STACKSIZE+ARGS(%esp) #define BUFFER 44 + STACKSIZE+ARGS(%esp) #define MMM 0+ARGS(%esp) #define AA 4+ARGS(%esp) #define XX 8+ARGS(%esp) #define I %eax #define J %ebx #define INCX J #define INCY %ecx #define A1 %esi #define X %edx #define Y1 %edi #define LDA %ebp PROLOGUE subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_X, X movl X,XX movl A,J movl J,AA # backup A movl M,J movl J,MMM # mov M to MMM .L0t: xorl J,J addl $1,J sall $22,J # J=2^24*sizeof(float)=buffer size(16MB) subl $8, J # Don't use last 8 float in the buffer. 
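/* Added note on the blocking above: each pass through .L0t handles at most
   2^22 - 8 rows (the sall $22 / subl $8 pair), so the packed copy of x made
   at .L02/.L06 below always fits the scratch BUFFER; 2^22 floats at 4 bytes
   each is the 16 MB mentioned in the original comment.  MMM counts the rows
   still to be processed, and the .L999 block advances the saved A and X
   pointers by the block size before jumping back to .L0t.  A rough C outline
   of this driver loop, with hypothetical helper names (pack_x, sgemv_t_panel)
   used purely for illustration:

     static void sgemv_t_blocked(long m, long n, float alpha,
                                 const float *a, long lda,
                                 const float *x, long incx,
                                 float *y, long incy, float *buffer)
     {
         const long block_m = (1L << 22) - 8;     // matches sall $22 / subl $8
         for (long done = 0; done < m; done += block_m) {
             long mb = (m - done < block_m) ? (m - done) : block_m;
             pack_x(buffer, x + done * incx, incx, mb);   // contiguous, zero-padded copy
             sgemv_t_panel(mb, n, alpha, a + done, lda, buffer, y, incy);
         }
     }
*/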
subl J,MMM # MMM=MMM-J movl J,M jge .L00t ALIGN_4 movl MMM,%eax addl J,%eax jle .L999x movl %eax,M .L00t: movl AA,%eax movl %eax,A # mov AA to A movl XX,%eax movl %eax,X movl STACK_LDA, LDA movl STACK_INCX, INCX movl STACK_INCY, INCY leal (,INCX, SIZE), INCX leal (,INCY, SIZE), INCY leal (,LDA, SIZE), LDA subl $-32 * SIZE, A cmpl $0, N jle .L999 cmpl $0, M jle .L999 movl BUFFER, Y1 movl M, I sarl $3, I jle .L05 ALIGN_4 .L02: movss (X), %xmm0 addl INCX, X movss (X), %xmm1 addl INCX, X unpcklps %xmm1, %xmm0 movss (X), %xmm2 addl INCX, X movss (X), %xmm3 addl INCX, X unpcklps %xmm3, %xmm2 movss (X), %xmm4 addl INCX, X movss (X), %xmm5 addl INCX, X unpcklps %xmm5, %xmm4 movss (X), %xmm6 addl INCX, X movss (X), %xmm7 addl INCX, X unpcklps %xmm7, %xmm6 movlps %xmm0, 0 * SIZE(Y1) movlps %xmm2, 2 * SIZE(Y1) movlps %xmm4, 4 * SIZE(Y1) movlps %xmm6, 6 * SIZE(Y1) addl $8 * SIZE, Y1 decl I jg .L02 ALIGN_4 .L05: movl M, I andl $7, I jle .L10 ALIGN_2 .L06: movss (X), %xmm0 addl INCX, X movss %xmm0, 0 * SIZE(Y1) addl $SIZE, Y1 decl I jg .L06 ALIGN_4 //Padding zero to prevent loading the dirty number from buffer. movl M, I movl $8, J andl $7, I xorps %xmm0, %xmm0 subl I, J ALIGN_2 .L07: movss %xmm0, 0 * SIZE(Y1) addl $SIZE, Y1 decl J jg .L07 ALIGN_4 .L10: movl Y, Y1 movl N, J sarl $1, J jle .L20 ALIGN_3 .L11: movl BUFFER, X addl $32 * SIZE, X movl A, A1 leal (A1, LDA, 2), %eax movl %eax, A xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 movaps -32 * SIZE(X), %xmm2 movaps -28 * SIZE(X), %xmm3 movl M, I sarl $4, I jle .L15 movsd -32 * SIZE(A1), %xmm4 movhps -30 * SIZE(A1), %xmm4 movsd -32 * SIZE(A1, LDA), %xmm5 movhps -30 * SIZE(A1, LDA), %xmm5 movsd -28 * SIZE(A1), %xmm6 movhps -26 * SIZE(A1), %xmm6 movsd -28 * SIZE(A1, LDA), %xmm7 movhps -26 * SIZE(A1, LDA), %xmm7 decl I jle .L13 ALIGN_4 .L12: #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(A1) #endif mulps %xmm2, %xmm4 addps %xmm4, %xmm0 movsd -24 * SIZE(A1), %xmm4 movhps -22 * SIZE(A1), %xmm4 mulps %xmm2, %xmm5 movaps -24 * SIZE(X), %xmm2 addps %xmm5, %xmm1 movsd -24 * SIZE(A1, LDA), %xmm5 movhps -22 * SIZE(A1, LDA), %xmm5 mulps %xmm3, %xmm6 addps %xmm6, %xmm0 movsd -20 * SIZE(A1), %xmm6 movhps -18 * SIZE(A1), %xmm6 mulps %xmm3, %xmm7 movaps -20 * SIZE(X), %xmm3 addps %xmm7, %xmm1 movsd -20 * SIZE(A1, LDA), %xmm7 movhps -18 * SIZE(A1, LDA), %xmm7 #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(A1, LDA) #endif mulps %xmm2, %xmm4 addps %xmm4, %xmm0 movsd -16 * SIZE(A1), %xmm4 movhps -14 * SIZE(A1), %xmm4 mulps %xmm2, %xmm5 movaps -16 * SIZE(X), %xmm2 addps %xmm5, %xmm1 movsd -16 * SIZE(A1, LDA), %xmm5 movhps -14 * SIZE(A1, LDA), %xmm5 mulps %xmm3, %xmm6 addps %xmm6, %xmm0 movsd -12 * SIZE(A1), %xmm6 movhps -10 * SIZE(A1), %xmm6 mulps %xmm3, %xmm7 movaps -12 * SIZE(X), %xmm3 addps %xmm7, %xmm1 movsd -12 * SIZE(A1, LDA), %xmm7 movhps -10 * SIZE(A1, LDA), %xmm7 addl $16 * SIZE, A1 addl $16 * SIZE, X decl I jg .L12 ALIGN_4 .L13: mulps %xmm2, %xmm4 addps %xmm4, %xmm0 movsd -24 * SIZE(A1), %xmm4 movhps -22 * SIZE(A1), %xmm4 mulps %xmm2, %xmm5 movaps -24 * SIZE(X), %xmm2 addps %xmm5, %xmm1 movsd -24 * SIZE(A1, LDA), %xmm5 movhps -22 * SIZE(A1, LDA), %xmm5 mulps %xmm3, %xmm6 addps %xmm6, %xmm0 movsd -20 * SIZE(A1), %xmm6 movhps -18 * SIZE(A1), %xmm6 mulps %xmm3, %xmm7 movaps -20 * SIZE(X), %xmm3 addps %xmm7, %xmm1 movsd -20 * SIZE(A1, LDA), %xmm7 movhps -18 * SIZE(A1, LDA), %xmm7 mulps %xmm2, %xmm4 addps %xmm4, %xmm0 mulps %xmm2, %xmm5 movaps -16 * SIZE(X), %xmm2 addps %xmm5, %xmm1 mulps %xmm3, %xmm6 addps %xmm6, %xmm0 mulps %xmm3, %xmm7 movaps -12 * SIZE(X), %xmm3 addps %xmm7, 
%xmm1 addl $16 * SIZE, A1 addl $16 * SIZE, X ALIGN_4 .L15: testl $8, M jle .L16 movsd -32 * SIZE(A1), %xmm4 movhps -30 * SIZE(A1), %xmm4 movsd -32 * SIZE(A1, LDA), %xmm5 movhps -30 * SIZE(A1, LDA), %xmm5 movsd -28 * SIZE(A1), %xmm6 movhps -26 * SIZE(A1), %xmm6 movsd -28 * SIZE(A1, LDA), %xmm7 movhps -26 * SIZE(A1, LDA), %xmm7 mulps %xmm2, %xmm4 addps %xmm4, %xmm0 mulps %xmm2, %xmm5 movaps -24 * SIZE(X), %xmm2 addps %xmm5, %xmm1 mulps %xmm3, %xmm6 addps %xmm6, %xmm0 mulps %xmm3, %xmm7 movaps -20 * SIZE(X), %xmm3 addps %xmm7, %xmm1 addl $8 * SIZE, A1 addl $8 * SIZE, X ALIGN_4 .L16: testl $4, M jle .L17 movsd -32 * SIZE(A1), %xmm4 movhps -30 * SIZE(A1), %xmm4 movsd -32 * SIZE(A1, LDA), %xmm5 movhps -30 * SIZE(A1, LDA), %xmm5 mulps %xmm2, %xmm4 addps %xmm4, %xmm0 mulps %xmm2, %xmm5 addps %xmm5, %xmm1 movaps %xmm3, %xmm2 addl $4 * SIZE, A1 ALIGN_4 .L17: testl $2, M jle .L18 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(A1), %xmm4 #ifdef movsd xorps %xmm5, %xmm5 #endif movsd -32 * SIZE(A1, LDA), %xmm5 mulps %xmm2, %xmm4 addps %xmm4, %xmm0 mulps %xmm2, %xmm5 addps %xmm5, %xmm1 movhlps %xmm2, %xmm2 addl $2 * SIZE, A1 ALIGN_4 .L18: testl $1, M jle .L19 movss -32 * SIZE(A1), %xmm4 mulss %xmm2, %xmm4 addss %xmm4, %xmm0 movss -32 * SIZE(A1, LDA), %xmm5 mulss %xmm2, %xmm5 addss %xmm5, %xmm1 ALIGN_4 .L19: #ifdef HAVE_SSE3 haddps %xmm0, %xmm0 haddps %xmm1, %xmm1 haddps %xmm0, %xmm0 haddps %xmm1, %xmm1 #else movhlps %xmm0, %xmm2 movhlps %xmm1, %xmm3 addps %xmm2, %xmm0 addps %xmm3, %xmm1 movaps %xmm0, %xmm2 shufps $1, %xmm0, %xmm0 movaps %xmm1, %xmm3 shufps $1, %xmm1, %xmm1 addss %xmm2, %xmm0 addss %xmm3, %xmm1 #endif movss ALPHA, %xmm7 mulss %xmm7, %xmm0 mulss %xmm7, %xmm1 addss (Y1), %xmm0 addss (Y1, INCY), %xmm1 movss %xmm0, (Y1) movss %xmm1, (Y1, INCY) leal (Y1, INCY, 2), Y1 decl J jg .L11 ALIGN_4 .L20: testl $1, N jle .L999 movl BUFFER, X addl $32 * SIZE, X movl A, A1 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 movaps -32 * SIZE(X), %xmm2 movaps -28 * SIZE(X), %xmm3 movl M, I sarl $4, I jle .L25 movsd -32 * SIZE(A1), %xmm4 movhps -30 * SIZE(A1), %xmm4 movsd -28 * SIZE(A1), %xmm6 movhps -26 * SIZE(A1), %xmm6 decl I jle .L23 ALIGN_4 .L22: #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(A1) #endif mulps %xmm2, %xmm4 movaps -24 * SIZE(X), %xmm2 addps %xmm4, %xmm0 movsd -24 * SIZE(A1), %xmm4 movhps -22 * SIZE(A1), %xmm4 mulps %xmm3, %xmm6 movaps -20 * SIZE(X), %xmm3 addps %xmm6, %xmm0 movsd -20 * SIZE(A1), %xmm6 movhps -18 * SIZE(A1), %xmm6 mulps %xmm2, %xmm4 movaps -16 * SIZE(X), %xmm2 addps %xmm4, %xmm0 movsd -16 * SIZE(A1), %xmm4 movhps -14 * SIZE(A1), %xmm4 mulps %xmm3, %xmm6 movaps -12 * SIZE(X), %xmm3 addps %xmm6, %xmm0 movsd -12 * SIZE(A1), %xmm6 movhps -10 * SIZE(A1), %xmm6 addl $16 * SIZE, A1 addl $16 * SIZE, X decl I jg .L22 ALIGN_4 .L23: mulps %xmm2, %xmm4 movaps -24 * SIZE(X), %xmm2 addps %xmm4, %xmm0 movsd -24 * SIZE(A1), %xmm4 movhps -22 * SIZE(A1), %xmm4 mulps %xmm3, %xmm6 movaps -20 * SIZE(X), %xmm3 addps %xmm6, %xmm0 movsd -20 * SIZE(A1), %xmm6 movhps -18 * SIZE(A1), %xmm6 mulps %xmm2, %xmm4 movaps -16 * SIZE(X), %xmm2 addps %xmm4, %xmm0 mulps %xmm3, %xmm6 movaps -12 * SIZE(X), %xmm3 addps %xmm6, %xmm0 addl $16 * SIZE, A1 addl $16 * SIZE, X ALIGN_4 .L25: testl $8, M jle .L26 movsd -32 * SIZE(A1), %xmm4 movhps -30 * SIZE(A1), %xmm4 movsd -28 * SIZE(A1), %xmm6 movhps -26 * SIZE(A1), %xmm6 mulps %xmm2, %xmm4 movaps -24 * SIZE(X), %xmm2 addps %xmm4, %xmm0 mulps %xmm3, %xmm6 movaps -20 * SIZE(X), %xmm3 addps %xmm6, %xmm0 addl $8 * SIZE, A1 addl $8 * SIZE, X ALIGN_4 .L26: testl $4, M jle .L27 
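/* Added note: the remaining-element blocks below (.L26 handles 4, .L27
   handles 2, .L28 handles 1) finish the single-column dot product, still
   accumulating into xmm0, before .L29 reduces the partial sums, scales by
   alpha and adds the result into y. */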
movsd -32 * SIZE(A1), %xmm4 movhps -30 * SIZE(A1), %xmm4 mulps %xmm2, %xmm4 addps %xmm4, %xmm0 movaps %xmm3, %xmm2 addl $4 * SIZE, A1 ALIGN_4 .L27: testl $2, M jle .L28 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(A1), %xmm4 mulps %xmm2, %xmm4 addps %xmm4, %xmm0 movhlps %xmm2, %xmm2 addl $2 * SIZE, A1 ALIGN_4 .L28: testl $1, M jle .L29 movss -32 * SIZE(A1), %xmm4 mulss %xmm2, %xmm4 addss %xmm4, %xmm0 ALIGN_4 .L29: #ifdef HAVE_SSE3 haddps %xmm0, %xmm0 haddps %xmm0, %xmm0 #else movhlps %xmm0, %xmm2 addps %xmm2, %xmm0 movaps %xmm0, %xmm2 shufps $1, %xmm0, %xmm0 addss %xmm2, %xmm0 #endif movss ALPHA, %xmm7 mulss %xmm7, %xmm0 addss (Y1), %xmm0 movss %xmm0, (Y1) ALIGN_4 .L999: movl M,J leal (,J,SIZE),%eax addl %eax,AA movl STACK_INCX,INCX imull INCX,%eax addl %eax,XX jmp .L0t ALIGN_4 .L999x: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS,%esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/gemv_t_sse2.S000066400000000000000000000274611313527062700174240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 2) #endif #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 7) #endif #ifdef OPTERON #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (8 * 3) #define movsd movlps #endif #ifdef BARCELONA #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (8 * 5) #endif #ifdef ATOM #define PREFETCH prefetch #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 6) #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 4) #endif #define STACKSIZE 16 #define ARGS 20 #define M 4 + STACKSIZE+ARGS(%esp) #define N 8 + STACKSIZE+ARGS(%esp) #define ALPHA 16 + STACKSIZE+ARGS(%esp) #define A 24 + STACKSIZE+ARGS(%esp) #define STACK_LDA 28 + STACKSIZE+ARGS(%esp) #define STACK_X 32 + STACKSIZE+ARGS(%esp) #define STACK_INCX 36 + STACKSIZE+ARGS(%esp) #define Y 40 + STACKSIZE+ARGS(%esp) #define STACK_INCY 44 + STACKSIZE+ARGS(%esp) #define BUFFER 48 + STACKSIZE+ARGS(%esp) #define MMM 0+ARGS(%esp) #define AA 4+ARGS(%esp) #define XX 8+ARGS(%esp) #define I %eax #define J %ebx #define INCX J #define INCY %ecx #define A1 %esi #define X %edx #define Y1 %edi #define LDA %ebp PROLOGUE subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_X, X movl X,XX movl A,J movl J,AA # backup A movl M,J movl J,MMM # mov M to MMM .L0t: xorl J,J addl $1,J sall $21,J # J=2^21*sizeof(double)=buffer size(16MB) subl $4, J # Don't use last 4 double in the buffer. subl J,MMM # MMM=MMM-J movl J,M jge .L00t ALIGN_4 movl MMM,%eax addl J,%eax jle .L999x movl %eax,M .L00t: movl XX,%eax movl %eax, X movl AA,%eax movl %eax,A # mov AA to A movl STACK_LDA, LDA movl STACK_INCX, INCX movl STACK_INCY, INCY leal (,INCX, SIZE), INCX leal (,INCY, SIZE), INCY leal (,LDA, SIZE), LDA subl $-16 * SIZE, A cmpl $0, N jle .L999 cmpl $0, M jle .L999 movl BUFFER, Y1 movl M, I sarl $3, I jle .L05 ALIGN_4 .L02: movsd (X), %xmm0 addl INCX, X movhpd (X), %xmm0 addl INCX, X movsd (X), %xmm1 addl INCX, X movhpd (X), %xmm1 addl INCX, X movsd (X), %xmm2 addl INCX, X movhpd (X), %xmm2 addl INCX, X movsd (X), %xmm3 addl INCX, X movhpd (X), %xmm3 addl INCX, X movapd %xmm0, 0 * SIZE(Y1) movapd %xmm1, 2 * SIZE(Y1) movapd %xmm2, 4 * SIZE(Y1) movapd %xmm3, 6 * SIZE(Y1) addl $8 * SIZE, Y1 decl I jg .L02 ALIGN_4 .L05: movl M, I andl $7, I jle .L10 ALIGN_2 .L06: movsd (X), %xmm0 addl INCX, X movsd %xmm0, 0 * SIZE(Y1) addl $SIZE, Y1 decl I jg .L06 ALIGN_4 .L10: movl Y, Y1 movl N, J sarl $1, J jle .L20 ALIGN_3 .L11: movl BUFFER, X addl $16 * SIZE, X movl A, A1 leal (A1, LDA, 2), %eax movl %eax, A xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 movapd -16 * SIZE(X), %xmm2 movapd -14 * SIZE(X), %xmm3 movl M, I sarl $3, I jle .L15 movsd -16 * SIZE(A1), %xmm4 movhpd -15 * SIZE(A1), %xmm4 movsd -16 * SIZE(A1, LDA), %xmm5 movhpd -15 * SIZE(A1, LDA), %xmm5 movsd -14 * SIZE(A1), %xmm6 movhpd -13 * SIZE(A1), %xmm6 movsd -14 * SIZE(A1, LDA), %xmm7 movhpd -13 * SIZE(A1, LDA), %xmm7 decl I jle .L13 ALIGN_4 .L12: #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(A1) #endif mulpd %xmm2, %xmm4 addpd %xmm4, %xmm0 movsd -12 * SIZE(A1), %xmm4 movhpd -11 * SIZE(A1), %xmm4 mulpd %xmm2, %xmm5 movapd -12 * SIZE(X), %xmm2 addpd %xmm5, %xmm1 movsd -12 * SIZE(A1, LDA), %xmm5 movhpd -11 * 
SIZE(A1, LDA), %xmm5 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm0 movsd -10 * SIZE(A1), %xmm6 movhpd -9 * SIZE(A1), %xmm6 mulpd %xmm3, %xmm7 movapd -10 * SIZE(X), %xmm3 addpd %xmm7, %xmm1 movsd -10 * SIZE(A1, LDA), %xmm7 movhpd -9 * SIZE(A1, LDA), %xmm7 #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(A1, LDA) #endif mulpd %xmm2, %xmm4 addpd %xmm4, %xmm0 movsd -8 * SIZE(A1), %xmm4 movhpd -7 * SIZE(A1), %xmm4 mulpd %xmm2, %xmm5 movapd -8 * SIZE(X), %xmm2 addpd %xmm5, %xmm1 movsd -8 * SIZE(A1, LDA), %xmm5 movhpd -7 * SIZE(A1, LDA), %xmm5 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm0 movsd -6 * SIZE(A1), %xmm6 movhpd -5 * SIZE(A1), %xmm6 mulpd %xmm3, %xmm7 movapd -6 * SIZE(X), %xmm3 addpd %xmm7, %xmm1 movsd -6 * SIZE(A1, LDA), %xmm7 movhpd -5 * SIZE(A1, LDA), %xmm7 addl $8 * SIZE, A1 addl $8 * SIZE, X decl I jg .L12 ALIGN_4 .L13: mulpd %xmm2, %xmm4 addpd %xmm4, %xmm0 movsd -12 * SIZE(A1), %xmm4 movhpd -11 * SIZE(A1), %xmm4 mulpd %xmm2, %xmm5 movapd -12 * SIZE(X), %xmm2 addpd %xmm5, %xmm1 movsd -12 * SIZE(A1, LDA), %xmm5 movhpd -11 * SIZE(A1, LDA), %xmm5 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm0 movsd -10 * SIZE(A1), %xmm6 movhpd -9 * SIZE(A1), %xmm6 mulpd %xmm3, %xmm7 movapd -10 * SIZE(X), %xmm3 addpd %xmm7, %xmm1 movsd -10 * SIZE(A1, LDA), %xmm7 movhpd -9 * SIZE(A1, LDA), %xmm7 mulpd %xmm2, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm2, %xmm5 movapd -8 * SIZE(X), %xmm2 addpd %xmm5, %xmm1 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm3, %xmm7 movapd -6 * SIZE(X), %xmm3 addpd %xmm7, %xmm1 addl $8 * SIZE, A1 addl $8 * SIZE, X ALIGN_4 .L15: testl $4, M jle .L16 movsd -16 * SIZE(A1), %xmm4 movhpd -15 * SIZE(A1), %xmm4 movsd -16 * SIZE(A1, LDA), %xmm5 movhpd -15 * SIZE(A1, LDA), %xmm5 movsd -14 * SIZE(A1), %xmm6 movhpd -13 * SIZE(A1), %xmm6 movsd -14 * SIZE(A1, LDA), %xmm7 movhpd -13 * SIZE(A1, LDA), %xmm7 mulpd %xmm2, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm2, %xmm5 movapd -12 * SIZE(X), %xmm2 addpd %xmm5, %xmm1 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm3, %xmm7 movapd -10 * SIZE(X), %xmm3 addpd %xmm7, %xmm1 addl $4 * SIZE, A1 addl $4 * SIZE, X ALIGN_4 .L16: testl $2, M jle .L17 movsd -16 * SIZE(A1), %xmm4 movhpd -15 * SIZE(A1), %xmm4 movsd -16 * SIZE(A1, LDA), %xmm5 movhpd -15 * SIZE(A1, LDA), %xmm5 mulpd %xmm2, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm2, %xmm5 addpd %xmm5, %xmm1 movapd %xmm3, %xmm2 addl $2 * SIZE, A1 ALIGN_4 .L17: testl $1, M jle .L18 movsd -16 * SIZE(A1), %xmm4 mulsd %xmm2, %xmm4 addsd %xmm4, %xmm0 movsd -16 * SIZE(A1, LDA), %xmm5 mulsd %xmm2, %xmm5 addsd %xmm5, %xmm1 ALIGN_4 .L18: #ifdef HAVE_SSE3 haddpd %xmm1, %xmm0 #else movapd %xmm0, %xmm2 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm2 addpd %xmm2, %xmm0 #endif #ifdef HAVE_SSE3 movddup ALPHA, %xmm7 #else movsd ALPHA, %xmm7 unpcklpd %xmm7, %xmm7 #endif mulpd %xmm7, %xmm0 movsd (Y1), %xmm4 movhpd (Y1, INCY), %xmm4 addpd %xmm4, %xmm0 movlpd %xmm0, (Y1) movhpd %xmm0, (Y1, INCY) leal (Y1, INCY, 2), Y1 decl J jg .L11 ALIGN_4 .L20: testl $1, N jle .L999 movl BUFFER, X addl $16 * SIZE, X movl A, A1 leal (A1, LDA, 2), %eax movl %eax, A xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 movapd -16 * SIZE(X), %xmm2 movapd -14 * SIZE(X), %xmm3 movl M, I sarl $3, I jle .L25 movsd -16 * SIZE(A1), %xmm4 movhpd -15 * SIZE(A1), %xmm4 movsd -14 * SIZE(A1), %xmm6 movhpd -13 * SIZE(A1), %xmm6 decl I jle .L23 ALIGN_4 .L22: #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(A1) #endif mulpd %xmm2, %xmm4 movapd -12 * SIZE(X), %xmm2 addpd %xmm4, %xmm0 movsd -12 * SIZE(A1), %xmm4 movhpd -11 * SIZE(A1), %xmm4 mulpd %xmm3, %xmm6 movapd -10 * SIZE(X), %xmm3 addpd %xmm6, %xmm0 movsd -10 * SIZE(A1), %xmm6 
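/* Added note: the movsd + movhpd pairs in this loop assemble one 16-byte
   load of A from two 8-byte halves, since the A panel is not assumed to be
   16-byte aligned; x, by contrast, was packed into the aligned BUFFER and
   can be read with movapd. */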
movhpd -9 * SIZE(A1), %xmm6 mulpd %xmm2, %xmm4 movapd -8 * SIZE(X), %xmm2 addpd %xmm4, %xmm0 movsd -8 * SIZE(A1), %xmm4 movhpd -7 * SIZE(A1), %xmm4 mulpd %xmm3, %xmm6 movapd -6 * SIZE(X), %xmm3 addpd %xmm6, %xmm0 movsd -6 * SIZE(A1), %xmm6 movhpd -5 * SIZE(A1), %xmm6 addl $8 * SIZE, A1 addl $8 * SIZE, X decl I jg .L22 ALIGN_4 .L23: mulpd %xmm2, %xmm4 movapd -12 * SIZE(X), %xmm2 addpd %xmm4, %xmm0 movsd -12 * SIZE(A1), %xmm4 movhpd -11 * SIZE(A1), %xmm4 mulpd %xmm3, %xmm6 movapd -10 * SIZE(X), %xmm3 addpd %xmm6, %xmm0 movsd -10 * SIZE(A1), %xmm6 movhpd -9 * SIZE(A1), %xmm6 mulpd %xmm2, %xmm4 movapd -8 * SIZE(X), %xmm2 addpd %xmm4, %xmm0 mulpd %xmm3, %xmm6 movapd -6 * SIZE(X), %xmm3 addpd %xmm6, %xmm0 addl $8 * SIZE, A1 addl $8 * SIZE, X ALIGN_4 .L25: testl $4, M jle .L26 movsd -16 * SIZE(A1), %xmm4 movhpd -15 * SIZE(A1), %xmm4 movsd -14 * SIZE(A1), %xmm6 movhpd -13 * SIZE(A1), %xmm6 mulpd %xmm2, %xmm4 movapd -12 * SIZE(X), %xmm2 addpd %xmm4, %xmm0 mulpd %xmm3, %xmm6 movapd -10 * SIZE(X), %xmm3 addpd %xmm6, %xmm0 addl $4 * SIZE, A1 addl $4 * SIZE, X ALIGN_4 .L26: testl $2, M jle .L27 movsd -16 * SIZE(A1), %xmm4 movhpd -15 * SIZE(A1), %xmm4 mulpd %xmm2, %xmm4 addpd %xmm4, %xmm0 movapd %xmm3, %xmm2 addl $2 * SIZE, A1 ALIGN_4 .L27: testl $1, M jle .L28 movsd -16 * SIZE(A1), %xmm4 mulsd %xmm2, %xmm4 addsd %xmm4, %xmm0 ALIGN_4 .L28: #ifdef HAVE_SSE3 haddpd %xmm1, %xmm0 #else movapd %xmm0, %xmm2 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm2 addsd %xmm2, %xmm0 #endif movsd ALPHA, %xmm7 mulpd %xmm7, %xmm0 addsd (Y1), %xmm0 movlpd %xmm0, (Y1) ALIGN_4 .L999: movl M,J leal (,J,SIZE),%eax addl %eax,AA movl STACK_INCX,INCX imull INCX,%eax addl %eax,XX jmp .L0t ALIGN_4 .L999x: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS,%esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/iamax.S000066400000000000000000000145411313527062700163010ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define M %ebx #define INCX %esi #define X %ecx #define I %edx #define NUM %edi #define RET %eax #ifndef USE_MIN #define FMOV fcmovbe #define IMOV cmovnbe #else #define FMOV fcmovnbe #define IMOV cmovb #endif #include "l1param.h" PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_M, M movl STACK_INCX, INCX movl STACK_X, X #ifdef F_INTERFACE movl (M), M movl (INCX), INCX #endif #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif sall $BASE_SHIFT, INCX fldz xorl RET, RET testl M, M jle .L999 testl INCX, INCX jle .L999 fstp %st(0) movl $2, NUM movl $1, RET FLD (X) #ifdef USE_ABS fabs #endif addl INCX, X decl M jle .L999 cmpl $SIZE, INCX jne .L40 movl M, I sarl $3, I jle .L20 ALIGN_4 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM FLD 1 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM FLD 2 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM FLD 3 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM FLD 4 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM FLD 5 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM FLD 6 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM FLD 7 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM addl $8 * SIZE, X decl I jg .L10 ALIGN_4 .L20: movl M, I andl $7, I jle .L999 ALIGN_4 .L21: FLD 0 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) addl $1 * SIZE, X incl NUM decl I jg .L21 jmp .L999 ALIGN_4 .L40: movl M, I sarl $3, I jle .L60 ALIGN_4 .L50: FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), 
%st(0) IMOV NUM, RET fstp %st(1) incl NUM decl I jg .L50 ALIGN_4 .L60: movl M, I andl $7, I jle .L999 ALIGN_4 .L61: FLD 0 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM addl INCX, X decl I jg .L61 ALIGN_4 .L999: fstp %st(0) popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/iamax_sse.S000066400000000000000000000364611313527062700171600ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define RET %eax #define M %ebx #define X %ecx #define INCX %edx #define I %esi #define MM %ebp #define XX %edi #define TEMP %ebx #ifdef USE_MIN #define maxps minps #define maxss minss #endif #ifndef HAVE_SSE2 #define pxor xorps #define movsd movlps #endif #include "l1param.h" PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX #ifdef F_INTERFACE movl (M), M movl (INCX), INCX #endif pxor %xmm0, %xmm0 /* Return Value(Float) */ #ifdef USE_ABS pxor %xmm7, %xmm7 /* Generate USE_ABS */ #endif xor RET, RET /* Return Value(Int) */ testl M, M jle .L999 leal (, INCX, SIZE), INCX testl INCX, INCX jle .L999 movl M, MM movl X, XX #ifdef USE_ABS #ifndef HAVE_SSE2 subl $8, %esp movl $0x7fffffff, (%esp) movss (%esp), %xmm7 shufps $0, %xmm7, %xmm7 addl $8, %esp #else cmpeqps %xmm7, %xmm7 psrld $1, %xmm7 /* Generate USE_ABS */ #endif #endif movss (XX), %xmm0 addl INCX, XX decl MM shufps $0, %xmm0, %xmm0 #ifdef USE_ABS andps %xmm7, %xmm0 #endif movaps %xmm0, %xmm1 movaps %xmm0, %xmm2 movaps %xmm0, %xmm3 /* Generating "seed value" */ cmpl $SIZE, INCX jne .L80 /* Incx != 1 goto L80 */ /* Analigned Check */ testl $3, XX /* 00000011 */ jne .L30 /* Purely Unaligned Mode */ cmpl $8, MM jle .L30 /* if M <= 8 goto Unaligned mode */ testl $4, XX /* bit test 000100 */ je .L05 movss 0 * SIZE(XX), %xmm4 #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxss %xmm4, %xmm0 decl MM addl $SIZE, XX ALIGN_3 .L05: testl $8, XX je .L06 movsd 0 * SIZE(XX), %xmm4 unpcklps %xmm4, %xmm4 #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxps %xmm4, %xmm1 subl $2, MM addl $2 * SIZE, XX ALIGN_3 .L06: movl MM, I sarl $4, I jle .L15 ALIGN_4 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movaps 0 * SIZE(XX), %xmm4 #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxps %xmm4, %xmm0 movaps 4 * SIZE(XX), %xmm4 #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxps %xmm4, %xmm1 movaps 8 * SIZE(XX), %xmm4 #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxps %xmm4, %xmm2 movaps 12 * SIZE(XX), %xmm4 #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxps %xmm4, %xmm3 addl $16 * SIZE, XX decl I jg .L11 ALIGN_4 .L15: andl $15, MM jle .L20 testl $8, MM je .L16 movaps 0 * SIZE(XX), %xmm4 #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxps %xmm4, %xmm0 movaps 4 * SIZE(XX), %xmm4 #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxps %xmm4, %xmm1 addl $8 * SIZE, XX ALIGN_3 .L16: testl $4, MM je .L17 movaps 0 * SIZE(XX), %xmm4 #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxps %xmm4, %xmm2 addl $4 * SIZE, XX ALIGN_3 .L17: testl $2, MM je .L18 movsd 0 * SIZE(XX), %xmm4 unpcklps %xmm4, %xmm4 #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxps %xmm4, %xmm3 addl $2 * SIZE, XX .L18: testl $1, MM je .L20 movss 0 * SIZE(XX), %xmm4 #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxss %xmm4, %xmm0 ALIGN_3 .L20: movl X, XX movl M, MM maxps %xmm1, %xmm0 maxps %xmm3, %xmm2 maxps %xmm2, %xmm0 movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 maxps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 maxss %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 testl $4, XX je .L21 movss 0 * SIZE(XX), %xmm1 decl MM addl $SIZE, XX #ifdef USE_ABS andps %xmm7, %xmm1 #endif incl RET comiss %xmm0, %xmm1 je .L999 ALIGN_3 .L21: testl $8, XX je .L22 movss 0 * SIZE(XX), %xmm1 movss 1 * SIZE(XX), %xmm2 subl $2, MM addl $2 
* SIZE, XX #ifdef USE_ABS andps %xmm7, %xmm1 andps %xmm7, %xmm2 #endif incl RET comiss %xmm0, %xmm1 je .L999 incl RET comiss %xmm0, %xmm2 je .L999 ALIGN_3 .L22: movl MM, I sarl $3, I jle .L25 ALIGN_4 .L23: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movaps 0 * SIZE(XX), %xmm1 #ifdef USE_ABS andps %xmm7, %xmm1 #endif cmpeqps %xmm0, %xmm1 movaps 4 * SIZE(XX), %xmm2 #ifdef USE_ABS andps %xmm7, %xmm2 #endif cmpeqps %xmm0, %xmm2 orps %xmm2, %xmm1 movmskps %xmm1, TEMP testl $15, TEMP jne .L24 addl $8 * SIZE, XX addl $8, RET decl I jg .L23 jmp .L25 ALIGN_3 .L24: movss 0 * SIZE(XX), %xmm1 movss 1 * SIZE(XX), %xmm2 movss 2 * SIZE(XX), %xmm3 movss 3 * SIZE(XX), %xmm4 #ifdef USE_ABS andps %xmm7, %xmm1 andps %xmm7, %xmm2 andps %xmm7, %xmm3 andps %xmm7, %xmm4 #endif incl RET comiss %xmm0, %xmm1 je .L999 incl RET comiss %xmm0, %xmm2 je .L999 incl RET comiss %xmm0, %xmm3 je .L999 incl RET comiss %xmm0, %xmm4 je .L999 movss 4 * SIZE(XX), %xmm1 movss 5 * SIZE(XX), %xmm2 movss 6 * SIZE(XX), %xmm3 #ifdef USE_ABS andps %xmm7, %xmm1 andps %xmm7, %xmm2 andps %xmm7, %xmm3 #endif incl RET comiss %xmm0, %xmm1 je .L999 incl RET comiss %xmm0, %xmm2 je .L999 incl RET comiss %xmm0, %xmm3 je .L999 incl RET jmp .L999 ALIGN_4 .L25: testl $4, MM je .L26 movss 0 * SIZE(XX), %xmm1 movss 1 * SIZE(XX), %xmm2 movss 2 * SIZE(XX), %xmm3 movss 3 * SIZE(XX), %xmm4 #ifdef USE_ABS andps %xmm7, %xmm1 andps %xmm7, %xmm2 andps %xmm7, %xmm3 andps %xmm7, %xmm4 #endif addl $4 * SIZE, XX incl RET comiss %xmm0, %xmm1 je .L999 incl RET comiss %xmm0, %xmm2 je .L999 incl RET comiss %xmm0, %xmm3 je .L999 incl RET comiss %xmm0, %xmm4 je .L999 ALIGN_3 .L26: testl $2, MM je .L27 movss 0 * SIZE(XX), %xmm1 movss 1 * SIZE(XX), %xmm2 #ifdef USE_ABS andps %xmm7, %xmm1 andps %xmm7, %xmm2 #endif addl $2 * SIZE, XX incl RET comiss %xmm0, %xmm1 je .L999 incl RET comiss %xmm0, %xmm2 je .L999 ALIGN_3 .L27: incl RET jmp .L999 ALIGN_3 /* Unaligned Mode */ .L30: movl MM, I sarl $4, I jle .L35 ALIGN_4 .L31: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movsd 0 * SIZE(XX), %xmm4 movhps 2 * SIZE(XX), %xmm4 #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxps %xmm4, %xmm0 movsd 4 * SIZE(XX), %xmm4 movhps 6 * SIZE(XX), %xmm4 #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxps %xmm4, %xmm1 movsd 8 * SIZE(XX), %xmm4 movhps 10 * SIZE(XX), %xmm4 #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxps %xmm4, %xmm2 movsd 12 * SIZE(XX), %xmm4 movhps 14 * SIZE(XX), %xmm4 #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxps %xmm4, %xmm3 addl $16 * SIZE, XX decl I jg .L31 ALIGN_4 .L35: andl $15, MM jle .L40 testl $8, MM je .L36 movsd 0 * SIZE(XX), %xmm4 movhps 2 * SIZE(XX), %xmm4 #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxps %xmm4, %xmm0 movsd 4 * SIZE(XX), %xmm4 movhps 6 * SIZE(XX), %xmm4 #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxps %xmm4, %xmm1 addl $8 * SIZE, XX ALIGN_3 .L36: testl $4, MM je .L37 movsd 0 * SIZE(XX), %xmm4 movhps 2 * SIZE(XX), %xmm4 #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxps %xmm4, %xmm2 addl $4 * SIZE, XX ALIGN_3 .L37: testl $2, MM je .L38 movsd 0 * SIZE(XX), %xmm4 unpcklps %xmm4, %xmm4 #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxps %xmm4, %xmm3 addl $2 * SIZE, XX .L38: testl $1, MM je .L40 movss 0 * SIZE(XX), %xmm4 #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxss %xmm4, %xmm0 jmp .L40 ALIGN_4 .L40: movl X, XX movl M, MM maxps %xmm1, %xmm0 maxps %xmm3, %xmm2 maxps %xmm2, %xmm0 movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 maxps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 maxss %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 movl MM, I 
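/* Second pass over the unaligned data: the extreme value found above       */
/* (maximum, or minimum when USE_MIN; of |x[i]| when USE_ABS) is broadcast  */
/* in %xmm0.  X is rescanned eight floats at a time with cmpeqps/movmskps,  */
/* and RET is advanced past non-matching elements so that the 1-based index */
/* of the first match is returned.                                          */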
sarl $3, I jle .L45 ALIGN_4 .L43: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movsd 0 * SIZE(XX), %xmm1 movhps 2 * SIZE(XX), %xmm1 #ifdef USE_ABS andps %xmm7, %xmm1 #endif cmpeqps %xmm0, %xmm1 movsd 4 * SIZE(XX), %xmm2 movhps 6 * SIZE(XX), %xmm2 #ifdef USE_ABS andps %xmm7, %xmm2 #endif cmpeqps %xmm0, %xmm2 orps %xmm2, %xmm1 movmskps %xmm1, TEMP testl $15, TEMP jne .L44 addl $8 * SIZE, XX addl $8, RET decl I jg .L43 jmp .L45 ALIGN_3 .L44: movss 0 * SIZE(XX), %xmm1 movss 1 * SIZE(XX), %xmm2 movss 2 * SIZE(XX), %xmm3 movss 3 * SIZE(XX), %xmm4 #ifdef USE_ABS andps %xmm7, %xmm1 andps %xmm7, %xmm2 andps %xmm7, %xmm3 andps %xmm7, %xmm4 #endif incl RET comiss %xmm0, %xmm1 je .L999 incl RET comiss %xmm0, %xmm2 je .L999 incl RET comiss %xmm0, %xmm3 je .L999 incl RET comiss %xmm0, %xmm4 je .L999 movss 4 * SIZE(XX), %xmm1 movss 5 * SIZE(XX), %xmm2 movss 6 * SIZE(XX), %xmm3 #ifdef USE_ABS andps %xmm7, %xmm1 andps %xmm7, %xmm2 andps %xmm7, %xmm3 #endif incl RET comiss %xmm0, %xmm1 je .L999 incl RET comiss %xmm0, %xmm2 je .L999 incl RET comiss %xmm0, %xmm3 je .L999 incl RET jmp .L999 ALIGN_4 .L45: testl $4, MM je .L46 movss 0 * SIZE(XX), %xmm1 movss 1 * SIZE(XX), %xmm2 movss 2 * SIZE(XX), %xmm3 movss 3 * SIZE(XX), %xmm4 #ifdef USE_ABS andps %xmm7, %xmm1 andps %xmm7, %xmm2 andps %xmm7, %xmm3 andps %xmm7, %xmm4 #endif addl $4 * SIZE, XX incl RET comiss %xmm0, %xmm1 je .L999 incl RET comiss %xmm0, %xmm2 je .L999 incl RET comiss %xmm0, %xmm3 je .L999 incl RET comiss %xmm0, %xmm4 je .L999 ALIGN_3 .L46: testl $2, MM je .L47 movss 0 * SIZE(XX), %xmm1 movss 1 * SIZE(XX), %xmm2 #ifdef USE_ABS andps %xmm7, %xmm1 andps %xmm7, %xmm2 #endif addl $2 * SIZE, XX incl RET comiss %xmm0, %xmm1 je .L999 incl RET comiss %xmm0, %xmm2 je .L999 ALIGN_3 .L47: incl RET jmp .L999 ALIGN_3 .L80: movl MM, I sarl $3, I jle .L85 ALIGN_4 .L81: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movss 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxss %xmm4, %xmm0 movss 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxss %xmm4, %xmm1 movss 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxss %xmm4, %xmm2 movss 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxss %xmm4, %xmm3 movss 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxss %xmm4, %xmm0 movss 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxss %xmm4, %xmm1 movss 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxss %xmm4, %xmm2 movss 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxss %xmm4, %xmm3 decl I jg .L81 ALIGN_4 .L85: andl $7, MM jle .L90 testl $4, MM je .L86 movss 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxss %xmm4, %xmm0 movss 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxss %xmm4, %xmm1 movss 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxss %xmm4, %xmm2 movss 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxss %xmm4, %xmm3 ALIGN_3 .L86: testl $2, MM je .L87 movss 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxss %xmm4, %xmm0 movss 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxss %xmm4, %xmm1 ALIGN_3 .L87: testl $1, MM je .L90 movss 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm4 #endif maxss %xmm4, 
%xmm2 ALIGN_4 .L90: movl X, XX movl M, MM maxss %xmm1, %xmm0 maxss %xmm3, %xmm2 maxss %xmm2, %xmm0 shufps $0, %xmm0, %xmm0 movl MM, I sarl $2, I jle .L96 ALIGN_4 .L92: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movss 0 * SIZE(XX), %xmm1 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm1 #endif cmpeqss %xmm0, %xmm1 movss 0 * SIZE(XX), %xmm2 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm2 #endif cmpeqss %xmm0, %xmm2 movss 0 * SIZE(XX), %xmm3 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm3 #endif cmpeqss %xmm0, %xmm3 movss 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm4 #endif cmpeqss %xmm0, %xmm4 orps %xmm2, %xmm1 orps %xmm4, %xmm3 orps %xmm3, %xmm1 movmskps %xmm1, TEMP testl $15, TEMP jne .L93 addl $4, RET decl I jg .L92 jmp .L96 ALIGN_3 .L93: leal (, INCX, 4), TEMP subl TEMP, XX movss 0 * SIZE(XX), %xmm1 addl INCX, XX movss 0 * SIZE(XX), %xmm2 addl INCX, XX movss 0 * SIZE(XX), %xmm3 addl INCX, XX movss 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm1 andps %xmm7, %xmm2 andps %xmm7, %xmm3 andps %xmm7, %xmm4 #endif incl RET comiss %xmm0, %xmm1 je .L999 incl RET comiss %xmm0, %xmm2 je .L999 incl RET comiss %xmm0, %xmm3 je .L999 incl RET comiss %xmm0, %xmm4 je .L999 ALIGN_3 .L96: testl $2, MM je .L97 movss 0 * SIZE(XX), %xmm1 addl INCX, XX movss 0 * SIZE(XX), %xmm2 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm1 andps %xmm7, %xmm2 #endif incl RET comiss %xmm0, %xmm1 je .L999 incl RET comiss %xmm0, %xmm2 je .L999 ALIGN_3 .L97: incl RET ALIGN_3 .L999: popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/iamax_sse2.S000066400000000000000000000446131313527062700172400ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define RET %eax #define M %ebx #define X %ecx #define INCX %edx #define I %esi #define MM %ebp #define XX %edi #define TEMP %ebx #ifdef USE_MIN #define maxpd minpd #define maxsd minsd #endif #include "l1param.h" PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX #ifdef F_INTERFACE movl (M), M movl (INCX), INCX #endif pxor %xmm0, %xmm0 #ifdef USE_ABS pxor %xmm7, %xmm7 #endif xor RET, RET testl M, M jle .L999 leal (, INCX, SIZE), INCX testl INCX, INCX jle .L999 movl M, MM movl X, XX #ifdef USE_ABS cmpeqpd %xmm7, %xmm7 psrlq $1, %xmm7 #endif movsd (XX), %xmm0 addl INCX, XX decl MM #ifdef USE_ABS andpd %xmm7, %xmm0 #endif unpcklpd %xmm0, %xmm0 movapd %xmm0, %xmm1 movapd %xmm0, %xmm2 movapd %xmm0, %xmm3 cmpl $SIZE, INCX jne .L80 /* Analigned Check */ cmpl $7, MM jle .L50 testl $7, XX jne .L50 # Purely Unaligned Mode testl $15, XX # Checking for 128bit align je .L05 movsd 0 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif unpcklpd %xmm4, %xmm4 maxpd %xmm4, %xmm3 decl MM addl $SIZE, XX ALIGN_3 .L05: movl MM, I sarl $4, I jle .L15 ALIGN_4 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movapd 0 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm0 movapd 2 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm1 movapd 4 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm2 movapd 6 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX) #endif movapd 8 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm0 movapd 10 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm1 movapd 12 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm2 movapd 14 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm3 addl $16 * SIZE, XX decl I jg .L11 ALIGN_4 .L15: andl $15, MM jle .L20 testl $8, MM je .L16 movapd 0 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm0 movapd 2 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm1 movapd 4 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm2 movapd 6 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm3 addl $8 * SIZE, XX ALIGN_3 .L16: testl $4, MM je .L17 movapd 0 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm0 movapd 2 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm1 addl $4 * SIZE, XX ALIGN_3 .L17: testl $2, MM je .L18 movapd 0 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm2 addl $2 * SIZE, XX .L18: testl $1, MM je .L20 movsd 0 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif unpcklpd %xmm4, %xmm4 maxpd %xmm4, %xmm3 ALIGN_3 /* Finding Index */ .L20: movl X, XX movl M, MM maxpd %xmm1, %xmm0 maxpd %xmm3, %xmm2 maxpd %xmm2, %xmm0 movapd %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 maxsd %xmm1, %xmm0 unpcklpd %xmm0, %xmm0 testl $15, XX # Checking for 128bit align je .L21 movsd 0 * SIZE(XX), %xmm1 #ifdef USE_ABS andpd %xmm7, %xmm1 #endif incl RET comisd %xmm0, %xmm1 je .L999 addl $SIZE, 
XX decl MM ALIGN_3 .L21: movl MM, I sarl $3, I jle .L25 ALIGN_4 .L22: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movapd 0 * SIZE(XX), %xmm1 #ifdef USE_ABS andpd %xmm7, %xmm1 #endif cmpeqpd %xmm0, %xmm1 movapd 2 * SIZE(XX), %xmm2 #ifdef USE_ABS andpd %xmm7, %xmm2 #endif cmpeqpd %xmm0, %xmm2 movapd 4 * SIZE(XX), %xmm3 #ifdef USE_ABS andpd %xmm7, %xmm3 #endif cmpeqpd %xmm0, %xmm3 movapd 6 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif cmpeqpd %xmm0, %xmm4 orpd %xmm2, %xmm1 orpd %xmm4, %xmm3 orpd %xmm3, %xmm1 movmskpd %xmm1, TEMP testl $3, TEMP jne .L23 addl $8 * SIZE, XX addl $8, RET decl I jg .L22 jmp .L25 ALIGN_4 .L23: movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 movsd 2 * SIZE(XX), %xmm3 movsd 3 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 #endif incl RET comisd %xmm0, %xmm1 je .L999 incl RET comisd %xmm0, %xmm2 je .L999 incl RET comisd %xmm0, %xmm3 je .L999 incl RET comisd %xmm0, %xmm4 je .L999 movsd 4 * SIZE(XX), %xmm1 movsd 5 * SIZE(XX), %xmm2 movsd 6 * SIZE(XX), %xmm3 #ifdef USE_ABS andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 andpd %xmm7, %xmm3 #endif incl RET comisd %xmm0, %xmm1 je .L999 incl RET comisd %xmm0, %xmm2 je .L999 incl RET comisd %xmm0, %xmm3 je .L999 incl RET jmp .L999 ALIGN_3 .L25: testl $4, MM je .L27 movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 movsd 2 * SIZE(XX), %xmm3 movsd 3 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 #endif addl $4 * SIZE, XX incl RET comisd %xmm0, %xmm1 je .L999 incl RET comisd %xmm0, %xmm2 je .L999 incl RET comisd %xmm0, %xmm3 je .L999 incl RET comisd %xmm0, %xmm4 je .L999 ALIGN_3 .L27: testl $2, MM je .L28 movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 #ifdef USE_ABS andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 #endif addl $2 * SIZE, XX incl RET comisd %xmm0, %xmm1 je .L999 incl RET comisd %xmm0, %xmm2 je .L999 ALIGN_3 .L28: incl RET jmp .L999 ALIGN_3 .L50: /* Unaligned Mode */ movl MM, I sarl $4, I jle .L55 ALIGN_4 .L51: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movsd 0 * SIZE(XX), %xmm4 movhpd 1 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm0 movsd 2 * SIZE(XX), %xmm4 movhpd 3 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm1 movsd 4 * SIZE(XX), %xmm4 movhpd 5 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm2 movsd 6 * SIZE(XX), %xmm4 movhpd 7 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX) #endif movsd 8 * SIZE(XX), %xmm4 movhpd 9 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm0 movsd 10 * SIZE(XX), %xmm4 movhpd 11 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm1 movsd 12 * SIZE(XX), %xmm4 movhpd 13 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm2 movsd 14 * SIZE(XX), %xmm4 movhpd 15 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm3 addl $16 * SIZE, XX decl I jg .L51 ALIGN_4 .L55: andl $15, MM jle .L60 testl $8, MM je .L56 movsd 0 * SIZE(XX), %xmm4 movhpd 1 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm0 movsd 2 * SIZE(XX), %xmm4 movhpd 3 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm1 movsd 4 * SIZE(XX), %xmm4 movhpd 5 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm2 movsd 6 * 
SIZE(XX), %xmm4 movhpd 7 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm3 addl $8 * SIZE, XX ALIGN_3 .L56: testl $4, MM je .L57 movsd 0 * SIZE(XX), %xmm4 movhpd 1 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm0 movsd 2 * SIZE(XX), %xmm4 movhpd 3 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm1 addl $4 * SIZE, XX ALIGN_3 .L57: testl $2, MM je .L58 movsd 0 * SIZE(XX), %xmm4 movhpd 1 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm2 addl $2 * SIZE, XX .L58: testl $1, MM je .L60 movsd 0 * SIZE(XX), %xmm4 unpcklpd %xmm4, %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm3 ALIGN_3 .L60: movl X, XX movl M, MM maxpd %xmm1, %xmm0 maxpd %xmm3, %xmm2 maxpd %xmm2, %xmm0 movapd %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 maxsd %xmm1, %xmm0 unpcklpd %xmm0, %xmm0 movl MM, I sarl $3, I jle .L65 ALIGN_4 .L62: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movsd 0 * SIZE(XX), %xmm1 movhpd 1 * SIZE(XX), %xmm1 #ifdef USE_ABS andpd %xmm7, %xmm1 #endif cmpeqpd %xmm0, %xmm1 movsd 2 * SIZE(XX), %xmm2 movhpd 3 * SIZE(XX), %xmm2 #ifdef USE_ABS andpd %xmm7, %xmm2 #endif cmpeqpd %xmm0, %xmm2 movsd 4 * SIZE(XX), %xmm3 movhpd 5 * SIZE(XX), %xmm3 #ifdef USE_ABS andpd %xmm7, %xmm3 #endif cmpeqpd %xmm0, %xmm3 movsd 6 * SIZE(XX), %xmm4 movhpd 7 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif cmpeqpd %xmm0, %xmm4 orpd %xmm2, %xmm1 orpd %xmm4, %xmm3 orpd %xmm3, %xmm1 movmskpd %xmm1, TEMP testl $3, TEMP jne .L63 addl $8 * SIZE, XX addl $8, RET decl I jg .L62 jmp .L65 ALIGN_4 .L63: movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 movsd 2 * SIZE(XX), %xmm3 movsd 3 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 #endif incl RET comisd %xmm0, %xmm1 je .L999 incl RET comisd %xmm0, %xmm2 je .L999 incl RET comisd %xmm0, %xmm3 je .L999 incl RET comisd %xmm0, %xmm4 je .L999 incl RET movsd 4 * SIZE(XX), %xmm1 movsd 5 * SIZE(XX), %xmm2 movsd 6 * SIZE(XX), %xmm3 #ifdef USE_ABS andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 andpd %xmm7, %xmm3 #endif comisd %xmm0, %xmm1 je .L999 incl RET comisd %xmm0, %xmm2 je .L999 incl RET comisd %xmm0, %xmm3 je .L999 incl RET jmp .L999 ALIGN_3 .L65: testl $4, MM je .L67 movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 movsd 2 * SIZE(XX), %xmm3 movsd 3 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 #endif addl $4 * SIZE, XX incl RET comisd %xmm0, %xmm1 je .L999 incl RET comisd %xmm0, %xmm2 je .L999 incl RET comisd %xmm0, %xmm3 je .L999 incl RET comisd %xmm0, %xmm4 je .L999 ALIGN_3 .L67: testl $2, MM je .L68 movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 #ifdef USE_ABS andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 #endif addl $2 * SIZE, XX incl RET comisd %xmm0, %xmm1 je .L999 incl RET comisd %xmm0, %xmm2 je .L999 ALIGN_3 .L68: incl RET jmp .L999 ALIGN_4 .L80: movl MM, I sarl $4, I jle .L85 ALIGN_4 .L81: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movsd 0 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm0 movsd 0 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm1 movsd 0 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm2 movsd 0 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * 
SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movsd 0 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm0 movsd 0 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm1 movsd 0 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm2 movsd 0 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm3 decl I jg .L81 ALIGN_4 .L85: andl $15, MM jle .L90 testl $8, MM je .L86 movsd 0 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm0 movsd 0 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm1 movsd 0 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm2 movsd 0 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm3 ALIGN_3 .L86: testl $4, MM je .L87 movsd 0 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm0 movsd 0 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm1 ALIGN_3 .L87: testl $2, MM je .L88 movsd 0 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm2 ALIGN_3 .L88: testl $1, MM je .L90 movsd 0 * SIZE(XX), %xmm4 #ifdef USE_ABS andpd %xmm7, %xmm4 #endif unpcklpd %xmm4, %xmm4 maxpd %xmm4, %xmm3 ALIGN_4 .L90: movl X, XX movl M, MM maxpd %xmm1, %xmm0 maxpd %xmm3, %xmm2 maxpd %xmm2, %xmm0 movapd %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 maxsd %xmm1, %xmm0 unpcklpd %xmm0, %xmm0 movl MM, I sarl $3, I jle .L95 ALIGN_4 .L92: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movsd 0 * SIZE(XX), %xmm1 addl INCX, XX movhpd 0 * SIZE(XX), %xmm1 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm1 #endif cmpeqpd %xmm0, %xmm1 movsd 0 * SIZE(XX), %xmm2 addl INCX, XX movhpd 0 * SIZE(XX), %xmm2 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm2 #endif cmpeqpd %xmm0, %xmm2 movsd 0 * SIZE(XX), %xmm3 addl INCX, XX movhpd 0 * SIZE(XX), %xmm3 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm3 #endif cmpeqpd %xmm0, %xmm3 movsd 0 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm4 #endif cmpeqpd %xmm0, %xmm4 orpd %xmm2, %xmm1 orpd %xmm4, %xmm3 orpd %xmm3, %xmm1 movmskpd %xmm1, TEMP testl $3, TEMP jne .L93 addl $8, RET decl I jg .L92 jmp .L95 ALIGN_4 .L93: leal (, INCX, 8), TEMP subl TEMP, XX movsd 0 * SIZE(XX), %xmm1 addl INCX, XX movsd 0 * SIZE(XX), %xmm2 addl INCX, XX movsd 0 * SIZE(XX), %xmm3 addl INCX, XX movsd 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 #endif incl RET comisd %xmm0, %xmm1 je .L999 incl RET comisd %xmm0, %xmm2 je .L999 incl RET comisd %xmm0, %xmm3 je .L999 incl RET comisd %xmm0, %xmm4 je .L999 movsd 0 * SIZE(XX), %xmm1 addl INCX, XX movsd 0 * SIZE(XX), %xmm2 addl INCX, XX movsd 0 * SIZE(XX), 
%xmm3 #ifdef USE_ABS andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 andpd %xmm7, %xmm3 #endif incl RET comisd %xmm0, %xmm1 je .L999 incl RET comisd %xmm0, %xmm2 je .L999 incl RET comisd %xmm0, %xmm3 je .L999 incl RET jmp .L999 ALIGN_3 .L95: testl $4, MM je .L97 movsd 0 * SIZE(XX), %xmm1 addl INCX, XX movsd 0 * SIZE(XX), %xmm2 addl INCX, XX movsd 0 * SIZE(XX), %xmm3 addl INCX, XX movsd 0 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 #endif incl RET comisd %xmm0, %xmm1 je .L999 incl RET comisd %xmm0, %xmm2 je .L999 incl RET comisd %xmm0, %xmm3 je .L999 incl RET comisd %xmm0, %xmm4 je .L999 ALIGN_3 .L97: testl $2, MM je .L98 movsd 0 * SIZE(XX), %xmm1 addl INCX, XX movsd 0 * SIZE(XX), %xmm2 addl INCX, XX #ifdef USE_ABS andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 #endif incl RET comisd %xmm0, %xmm1 je .L999 incl RET comisd %xmm0, %xmm2 je .L999 ALIGN_3 .L98: incl RET ALIGN_3 .L999: popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/izamax.S000066400000000000000000000130551313527062700164720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) PROLOGUE #define M %ebx #define INCX %esi #define X %ecx #define I %edx #define NUM %edi #define RET %eax #ifndef USE_MIN #define FMOV fcmovbe #define IMOV cmovnbe #else #define FMOV fcmovnb #define IMOV cmovb #endif #include "l1param.h" pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_M, M movl STACK_INCX, INCX movl STACK_X, X #ifdef F_INTERFACE movl (M), M movl (INCX), INCX #endif #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif sall $ZBASE_SHIFT, INCX fldz xorl RET, RET testl M, M jle .L999 testl INCX, INCX jle .L999 fstp %st(0) movl $2, NUM movl $1, RET FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs faddp %st, %st(1) addl INCX, X decl M jle .L999 cmpl $2 * SIZE, INCX jne .L40 movl M, I sarl $2, I jle .L20 ALIGN_4 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM FLD 2 * SIZE(X) fabs FLD 3 * SIZE(X) fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM FLD 4 * SIZE(X) fabs FLD 5 * SIZE(X) fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM FLD 6 * SIZE(X) fabs FLD 7 * SIZE(X) fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM addl $8 * SIZE, X decl I jg .L10 ALIGN_4 .L20: movl M, I andl $3, I jle .L999 ALIGN_4 .L21: FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM addl $2 * SIZE, X decl I jg .L21 jmp .L999 ALIGN_4 .L40: movl M, I sarl $2, I jle .L60 ALIGN_4 .L50: FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs addl INCX, X faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs addl INCX, X faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs addl INCX, X faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs addl INCX, X faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM decl I jg .L50 ALIGN_4 .L60: movl M, I andl $3, I jle .L999 ALIGN_4 .L61: FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) IMOV NUM, RET fstp %st(1) incl NUM addl INCX, X decl I jg .L61 ALIGN_4 .L999: fstp %st(0) popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/izamax_sse.S000066400000000000000000000253231313527062700173450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define RET %eax #define M %ebx #define X %ecx #define INCX %edx #define I %esi #define MM %ebp #define XX %edi #define TEMP %ebx #ifdef USE_MIN #define maxps minps #define maxss minss #endif #ifndef HAVE_SSE2 #define pxor xorps #define movsd movlps #endif #include "l1param.h" PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX #ifdef F_INTERFACE movl (M), M movl (INCX), INCX #endif pxor %xmm0, %xmm0 pxor %xmm7, %xmm7 xor RET, RET testl M, M jle .L999 testl INCX, INCX jle .L999 sall $ZBASE_SHIFT, INCX movl M, MM movl X, XX #ifdef USE_ABS #ifndef HAVE_SSE2 subl $8, %esp movl $0x7fffffff, (%esp) movss (%esp), %xmm7 shufps $0, %xmm7, %xmm7 addl $8, %esp #else cmpeqps %xmm7, %xmm7 psrld $1, %xmm7 #endif #endif movss 0 * SIZE(XX), %xmm0 movss 1 * SIZE(XX), %xmm1 addl INCX, XX decl MM andps %xmm7, %xmm0 andps %xmm7, %xmm1 addps %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 cmpl $2 * SIZE, INCX jne .L70 .L30: movl MM, I sarl $3, I jle .L35 ALIGN_4 .L31: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movsd 0 * SIZE(XX), %xmm1 movhps 2 * SIZE(XX), %xmm1 movsd 4 * SIZE(XX), %xmm2 movhps 6 * SIZE(XX), %xmm2 movaps %xmm1, %xmm3 shufps $0x88, %xmm2, %xmm1 shufps $0xdd, %xmm2, %xmm3 andps %xmm7, %xmm1 andps %xmm7, %xmm3 addps %xmm3, %xmm1 maxps %xmm1, %xmm0 movsd 8 * SIZE(XX), %xmm1 movhps 10 * SIZE(XX), %xmm1 movsd 12 * SIZE(XX), %xmm2 movhps 14 * SIZE(XX), %xmm2 movaps %xmm1, %xmm3 shufps $0x88, %xmm2, %xmm1 shufps $0xdd, %xmm2, %xmm3 andps %xmm7, %xmm1 andps %xmm7, %xmm3 addps %xmm3, %xmm1 maxps %xmm1, %xmm0 addl $16 * SIZE, XX decl I jg .L31 ALIGN_4 .L35: andl $7, MM jle .L40 testl $4, MM je .L36 movsd 0 * SIZE(XX), %xmm1 movhps 2 * SIZE(XX), %xmm1 movsd 4 * SIZE(XX), %xmm2 movhps 6 * SIZE(XX), %xmm2 movaps %xmm1, %xmm3 shufps $0x88, %xmm2, %xmm1 shufps $0xdd, %xmm2, %xmm3 andps %xmm7, %xmm1 andps %xmm7, %xmm3 addps %xmm3, %xmm1 maxps %xmm1, %xmm0 addl $8 * 
SIZE, XX ALIGN_3 .L36: testl $2, MM je .L37 movss 0 * SIZE(XX), %xmm1 movss 1 * SIZE(XX), %xmm2 movss 2 * SIZE(XX), %xmm3 movss 3 * SIZE(XX), %xmm4 andps %xmm7, %xmm1 andps %xmm7, %xmm2 andps %xmm7, %xmm3 andps %xmm7, %xmm4 addps %xmm2, %xmm1 addps %xmm4, %xmm3 maxss %xmm1, %xmm0 maxss %xmm3, %xmm0 addl $4 * SIZE, XX ALIGN_3 .L37: testl $1, MM je .L40 movss 0 * SIZE(XX), %xmm1 movss 1 * SIZE(XX), %xmm2 andps %xmm7, %xmm1 andps %xmm7, %xmm2 addps %xmm2, %xmm1 maxss %xmm1, %xmm0 ALIGN_4 .L40: movl X, XX movl M, MM movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 maxps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 maxss %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 movl MM, I sarl $2, I jle .L45 ALIGN_4 .L41: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movsd 0 * SIZE(XX), %xmm1 movhps 2 * SIZE(XX), %xmm1 movsd 4 * SIZE(XX), %xmm2 movhps 6 * SIZE(XX), %xmm2 movaps %xmm1, %xmm3 shufps $0x88, %xmm2, %xmm1 shufps $0xdd, %xmm2, %xmm3 andps %xmm7, %xmm1 andps %xmm7, %xmm3 addps %xmm3, %xmm1 cmpeqps %xmm0, %xmm1 movmskps %xmm1, TEMP testl $15, TEMP jne .L43 addl $8 * SIZE, XX addl $4, RET decl I jg .L41 jmp .L45 ALIGN_4 .L43: movss 0 * SIZE(XX), %xmm1 movss 1 * SIZE(XX), %xmm2 movss 2 * SIZE(XX), %xmm3 movss 3 * SIZE(XX), %xmm4 andps %xmm7, %xmm1 andps %xmm7, %xmm2 andps %xmm7, %xmm3 andps %xmm7, %xmm4 addps %xmm2, %xmm1 addps %xmm4, %xmm3 incl RET comiss %xmm0, %xmm1 je .L999 incl RET comiss %xmm0, %xmm3 je .L999 movss 4 * SIZE(XX), %xmm1 movss 5 * SIZE(XX), %xmm2 movss 6 * SIZE(XX), %xmm3 movss 7 * SIZE(XX), %xmm4 andps %xmm7, %xmm1 andps %xmm7, %xmm2 andps %xmm7, %xmm3 andps %xmm7, %xmm4 addps %xmm2, %xmm1 addps %xmm4, %xmm3 addl $8 * SIZE, XX incl RET comiss %xmm0, %xmm1 je .L999 incl RET comiss %xmm0, %xmm3 je .L999 ALIGN_3 .L45: testl $2, MM je .L47 movss 0 * SIZE(XX), %xmm1 movss 1 * SIZE(XX), %xmm2 movss 2 * SIZE(XX), %xmm3 movss 3 * SIZE(XX), %xmm4 addl $4 * SIZE, XX andps %xmm7, %xmm1 andps %xmm7, %xmm2 andps %xmm7, %xmm3 andps %xmm7, %xmm4 addps %xmm2, %xmm1 addps %xmm4, %xmm3 incl RET comiss %xmm0, %xmm1 je .L999 incl RET comiss %xmm0, %xmm3 je .L999 ALIGN_3 .L47: incl RET jmp .L999 ALIGN_3 .L70: movl MM, I sarl $3, I jle .L75 ALIGN_4 .L71: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movsd 0 * SIZE(XX), %xmm1 addl INCX, XX movhps 0 * SIZE(XX), %xmm1 addl INCX, XX movsd 0 * SIZE(XX), %xmm2 addl INCX, XX movhps 0 * SIZE(XX), %xmm2 addl INCX, XX movaps %xmm1, %xmm3 shufps $0x88, %xmm2, %xmm1 shufps $0xdd, %xmm2, %xmm3 andps %xmm7, %xmm1 andps %xmm7, %xmm3 addps %xmm3, %xmm1 maxps %xmm1, %xmm0 movsd 0 * SIZE(XX), %xmm1 addl INCX, XX movhps 0 * SIZE(XX), %xmm1 addl INCX, XX movsd 0 * SIZE(XX), %xmm2 addl INCX, XX movhps 0 * SIZE(XX), %xmm2 addl INCX, XX movaps %xmm1, %xmm3 shufps $0x88, %xmm2, %xmm1 shufps $0xdd, %xmm2, %xmm3 andps %xmm7, %xmm1 andps %xmm7, %xmm3 addps %xmm3, %xmm1 maxps %xmm1, %xmm0 decl I jg .L71 ALIGN_4 .L75: andl $7, MM jle .L80 testl $4, MM je .L76 movsd 0 * SIZE(XX), %xmm1 addl INCX, XX movhps 0 * SIZE(XX), %xmm1 addl INCX, XX movsd 0 * SIZE(XX), %xmm2 addl INCX, XX movhps 0 * SIZE(XX), %xmm2 addl INCX, XX movaps %xmm1, %xmm3 shufps $0x88, %xmm2, %xmm1 shufps $0xdd, %xmm2, %xmm3 andps %xmm7, %xmm1 andps %xmm7, %xmm3 addps %xmm3, %xmm1 maxps %xmm1, %xmm0 ALIGN_3 .L76: testl $2, MM je .L77 movss 0 * SIZE(XX), %xmm1 movss 1 * SIZE(XX), %xmm2 addl INCX, XX movss 0 * SIZE(XX), %xmm3 movss 1 * SIZE(XX), %xmm4 addl INCX, XX andps %xmm7, %xmm1 andps %xmm7, %xmm2 andps %xmm7, %xmm3 andps %xmm7, %xmm4 addps %xmm2, %xmm1 addps %xmm4, %xmm3 
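/* The complex kernels compare |Re(x[i])| + |Im(x[i])| (the BLAS "cabs1"    */
/* measure): the sign bits were cleared with the mask in %xmm7 and the two  */
/* halves added above, before the max/min update that follows.              */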
maxss %xmm1, %xmm0 maxss %xmm3, %xmm0 ALIGN_3 .L77: testl $1, MM je .L80 movss 0 * SIZE(XX), %xmm1 movss 1 * SIZE(XX), %xmm2 andps %xmm7, %xmm1 andps %xmm7, %xmm2 addps %xmm2, %xmm1 maxss %xmm1, %xmm0 ALIGN_4 .L80: movl X, XX movl M, MM movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 maxps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 maxss %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 movl MM, I sarl $2, I jle .L85 ALIGN_4 .L81: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movsd 0 * SIZE(XX), %xmm1 addl INCX, XX movhps 0 * SIZE(XX), %xmm1 addl INCX, XX movsd 0 * SIZE(XX), %xmm2 addl INCX, XX movhps 0 * SIZE(XX), %xmm2 addl INCX, XX movaps %xmm1, %xmm3 shufps $0x88, %xmm2, %xmm1 shufps $0xdd, %xmm2, %xmm3 andps %xmm7, %xmm1 andps %xmm7, %xmm3 addps %xmm3, %xmm1 cmpeqps %xmm0, %xmm1 movmskps %xmm1, TEMP testl $15, TEMP jne .L83 addl $4, RET decl I jg .L81 jmp .L85 ALIGN_4 .L83: leal (, INCX, 4), TEMP subl TEMP, XX movss 0 * SIZE(XX), %xmm1 movss 1 * SIZE(XX), %xmm2 addl INCX, XX movss 0 * SIZE(XX), %xmm3 movss 1 * SIZE(XX), %xmm4 addl INCX, XX andps %xmm7, %xmm1 andps %xmm7, %xmm2 andps %xmm7, %xmm3 andps %xmm7, %xmm4 addps %xmm2, %xmm1 addps %xmm4, %xmm3 incl RET comiss %xmm0, %xmm1 je .L999 incl RET comiss %xmm0, %xmm3 je .L999 movss 0 * SIZE(XX), %xmm1 movss 1 * SIZE(XX), %xmm2 addl INCX, XX movss 0 * SIZE(XX), %xmm3 movss 1 * SIZE(XX), %xmm4 addl INCX, XX andps %xmm7, %xmm1 andps %xmm7, %xmm2 andps %xmm7, %xmm3 andps %xmm7, %xmm4 addps %xmm2, %xmm1 addps %xmm4, %xmm3 incl RET comiss %xmm0, %xmm1 je .L999 incl RET comiss %xmm0, %xmm3 je .L999 ALIGN_3 .L85: testl $2, MM je .L87 movss 0 * SIZE(XX), %xmm1 movss 1 * SIZE(XX), %xmm2 addl INCX, XX movss 0 * SIZE(XX), %xmm3 movss 1 * SIZE(XX), %xmm4 addl INCX, XX andps %xmm7, %xmm1 andps %xmm7, %xmm2 andps %xmm7, %xmm3 andps %xmm7, %xmm4 addps %xmm2, %xmm1 addps %xmm4, %xmm3 incl RET comiss %xmm0, %xmm1 je .L999 incl RET comiss %xmm0, %xmm3 je .L999 ALIGN_3 .L87: incl RET ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/izamax_sse2.S000066400000000000000000000265011313527062700174260ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define RET %eax #define M %ebx #define X %ecx #define INCX %edx #define I %esi #define MM %ebp #define XX %edi #define TEMP %ebx #ifdef USE_MIN #define maxpd minpd #define maxsd minsd #endif #include "l1param.h" PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX #ifdef F_INTERFACE movl (M), M movl (INCX), INCX #endif pxor %xmm0, %xmm0 pxor %xmm7, %xmm7 xor RET, RET testl M, M jle .L999 testl INCX, INCX jle .L999 sall $ZBASE_SHIFT, INCX movl M, MM movl X, XX cmpeqpd %xmm7, %xmm7 psrlq $1, %xmm7 movsd 0 * SIZE(XX), %xmm0 movsd 1 * SIZE(XX), %xmm1 addl INCX, XX decl MM andpd %xmm7, %xmm0 andpd %xmm7, %xmm1 addpd %xmm1, %xmm0 unpcklpd %xmm0, %xmm0 cmpl $2 * SIZE, INCX jne .L60 movl MM, I sarl $3, I jle .L25 ALIGN_4 .L21: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 movhpd 2 * SIZE(XX), %xmm1 movhpd 3 * SIZE(XX), %xmm2 andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxpd %xmm1, %xmm0 movsd 4 * SIZE(XX), %xmm3 movsd 5 * SIZE(XX), %xmm4 movhpd 6 * SIZE(XX), %xmm3 movhpd 7 * SIZE(XX), %xmm4 andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 addpd %xmm4, %xmm3 maxpd %xmm3, %xmm0 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX) #endif movsd 8 * SIZE(XX), %xmm1 movsd 9 * SIZE(XX), %xmm2 movhpd 10 * SIZE(XX), %xmm1 movhpd 11 * SIZE(XX), %xmm2 andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxpd %xmm1, %xmm0 movsd 12 * SIZE(XX), %xmm3 movsd 13 * SIZE(XX), %xmm4 movhpd 14 * SIZE(XX), %xmm3 movhpd 15 * SIZE(XX), %xmm4 andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 addpd %xmm4, %xmm3 maxpd %xmm3, %xmm0 addl $16 * SIZE, XX decl I jg .L21 ALIGN_4 .L25: andl $7, MM jle .L30 testl $4, MM je .L26 movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 movhpd 2 * SIZE(XX), %xmm1 movhpd 3 * SIZE(XX), %xmm2 andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxpd %xmm1, %xmm0 movsd 4 * SIZE(XX), %xmm3 movsd 5 * SIZE(XX), %xmm4 movhpd 6 * SIZE(XX), %xmm3 movhpd 7 * SIZE(XX), %xmm4 andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 addpd %xmm4, %xmm3 maxpd %xmm3, %xmm0 addl $8 * SIZE, XX ALIGN_3 .L26: testl $2, MM je .L27 movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 movhpd 2 * SIZE(XX), %xmm1 movhpd 3 * SIZE(XX), %xmm2 andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxpd %xmm1, %xmm0 addl $4 * SIZE, XX ALIGN_3 .L27: testl $1, MM je .L30 movsd 0 * 
SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxsd %xmm1, %xmm0 ALIGN_4 .L30: movl X, XX movl M, MM movapd %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 maxsd %xmm1, %xmm0 unpcklpd %xmm0, %xmm0 movl MM, I sarl $2, I jle .L35 ALIGN_4 .L31: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 movhpd 2 * SIZE(XX), %xmm1 movhpd 3 * SIZE(XX), %xmm2 movsd 4 * SIZE(XX), %xmm3 movsd 5 * SIZE(XX), %xmm4 movhpd 6 * SIZE(XX), %xmm3 movhpd 7 * SIZE(XX), %xmm4 andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 addpd %xmm2, %xmm1 addpd %xmm4, %xmm3 cmpeqpd %xmm0, %xmm1 cmpeqpd %xmm0, %xmm3 orpd %xmm3, %xmm1 movmskpd %xmm1, TEMP testl $3, TEMP jne .L33 addl $8 * SIZE, XX addl $4, RET decl I jg .L31 jmp .L35 ALIGN_4 .L33: movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 movsd 2 * SIZE(XX), %xmm3 movsd 3 * SIZE(XX), %xmm4 andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 addpd %xmm2, %xmm1 addpd %xmm4, %xmm3 incl RET comisd %xmm0, %xmm1 je .L999 incl RET comisd %xmm0, %xmm3 je .L999 movsd 4 * SIZE(XX), %xmm1 movsd 5 * SIZE(XX), %xmm2 movsd 6 * SIZE(XX), %xmm3 movsd 7 * SIZE(XX), %xmm4 addl $8 * SIZE, XX andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 addpd %xmm2, %xmm1 addpd %xmm4, %xmm3 incl RET comisd %xmm0, %xmm1 je .L999 incl RET comisd %xmm0, %xmm3 je .L999 ALIGN_3 .L35: testl $2, MM je .L36 movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 movsd 2 * SIZE(XX), %xmm3 movsd 3 * SIZE(XX), %xmm4 addl $4 * SIZE, XX andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 addpd %xmm2, %xmm1 addpd %xmm4, %xmm3 incl RET comisd %xmm0, %xmm1 je .L999 incl RET comisd %xmm0, %xmm3 je .L999 ALIGN_3 .L36: incl RET jmp .L999 ALIGN_3 .L60: movl MM, I sarl $3, I jle .L65 ALIGN_4 .L61: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 addl INCX, XX movhpd 0 * SIZE(XX), %xmm1 movhpd 1 * SIZE(XX), %xmm2 addl INCX, XX andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxpd %xmm1, %xmm0 movsd 0 * SIZE(XX), %xmm3 movsd 1 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm3 movhpd 1 * SIZE(XX), %xmm4 addl INCX, XX andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 addpd %xmm4, %xmm3 maxpd %xmm3, %xmm0 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 addl INCX, XX movhpd 0 * SIZE(XX), %xmm1 movhpd 1 * SIZE(XX), %xmm2 addl INCX, XX andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxpd %xmm1, %xmm0 movsd 0 * SIZE(XX), %xmm3 movsd 1 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm3 movhpd 1 * SIZE(XX), %xmm4 addl INCX, XX andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 addpd %xmm4, %xmm3 maxpd %xmm3, %xmm0 decl I jg .L61 ALIGN_4 .L65: andl $7, MM jle .L70 testl $4, MM je .L66 movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 addl INCX, XX movhpd 0 * SIZE(XX), %xmm1 movhpd 1 * SIZE(XX), %xmm2 addl INCX, XX andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxpd %xmm1, %xmm0 movsd 0 * SIZE(XX), %xmm3 movsd 1 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm3 movhpd 1 * SIZE(XX), %xmm4 addl INCX, XX andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 addpd %xmm4, %xmm3 maxpd %xmm3, %xmm0 ALIGN_3 .L66: testl $2, MM je .L67 movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 addl INCX, XX movhpd 0 * SIZE(XX), %xmm1 movhpd 1 * SIZE(XX), %xmm2 addl INCX, 
XX andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxpd %xmm1, %xmm0 ALIGN_3 .L67: testl $1, MM je .L70 movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxsd %xmm1, %xmm0 ALIGN_3 .L70: movl X, XX movl M, MM movapd %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 maxsd %xmm1, %xmm0 unpcklpd %xmm0, %xmm0 movl MM, I sarl $2, I jle .L75 ALIGN_4 .L71: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 addl INCX, XX movhpd 0 * SIZE(XX), %xmm1 movhpd 1 * SIZE(XX), %xmm2 addl INCX, XX movsd 0 * SIZE(XX), %xmm3 movsd 1 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm3 movhpd 1 * SIZE(XX), %xmm4 addl INCX, XX andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 addpd %xmm2, %xmm1 addpd %xmm4, %xmm3 cmpeqpd %xmm0, %xmm1 cmpeqpd %xmm0, %xmm3 orpd %xmm3, %xmm1 movmskpd %xmm1, TEMP testl $3, TEMP jne .L73 addl $4, RET decl I jg .L71 jmp .L75 ALIGN_4 .L73: leal (, INCX, 4), TEMP subl TEMP, XX movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 addl INCX, XX movsd 0 * SIZE(XX), %xmm3 movsd 1 * SIZE(XX), %xmm4 addl INCX, XX andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 addpd %xmm2, %xmm1 addpd %xmm4, %xmm3 incl RET comisd %xmm0, %xmm1 je .L999 incl RET comisd %xmm0, %xmm3 je .L999 movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 addl INCX, XX movsd 0 * SIZE(XX), %xmm3 movsd 1 * SIZE(XX), %xmm4 addl INCX, XX andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 addpd %xmm2, %xmm1 addpd %xmm4, %xmm3 incl RET comisd %xmm0, %xmm1 je .L999 incl RET comisd %xmm0, %xmm3 je .L999 ALIGN_3 .L75: testl $2, MM je .L76 movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 addl INCX, XX movsd 0 * SIZE(XX), %xmm3 movsd 1 * SIZE(XX), %xmm4 addl INCX, XX andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 addpd %xmm2, %xmm1 addpd %xmm4, %xmm3 incl RET comisd %xmm0, %xmm1 je .L999 incl RET comisd %xmm0, %xmm3 je .L999 ALIGN_3 .L76: incl RET ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/lsame.S000066400000000000000000000064071313527062700163050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" PROLOGUE PROFCODE movl 4(%esp), %eax movl 8(%esp), %edx movb (%eax), %al # a = *A movb (%edx), %dl # b = *B andl $255, %eax andl $255, %edx subl $65, %eax subl $65, %edx #ifndef HAVE_CMOV movl %eax, %ecx subl $32, %ecx jle .L1 movl %ecx, %eax .L1: movl %edx, %ecx subl $32, %ecx jle .L2 movl %ecx, %edx .L2: subl %eax, %edx movl $0, %eax movl $1, %edx jne .L3 movl %edx, %eax .L3: #else movl %eax, %ecx subl $32, %ecx cmovge %ecx, %eax movl %edx, %ecx subl $32, %ecx cmovge %ecx, %edx subl %eax, %edx movl $0, %eax movl $1, %edx cmove %edx, %eax #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/nrm2.S000066400000000000000000000114731313527062700160610ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 8 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define M %edx #define X %ecx #define INCX %esi #define I %eax #include "l1param.h" PROLOGUE pushl %esi pushl %ebx PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX #ifdef F_INTERFACE movl (M), M movl (INCX), INCX #endif fldz testl M, M jle .L999 testl INCX, INCX jle .L999 sall $BASE_SHIFT, INCX fldz fldz fldz cmpl $SIZE, INCX jne .L40 movl M, I sarl $3, I jle .L20 ALIGN_4 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) fmul %st(0), %st FLD 1 * SIZE(X) fmul %st(0), %st FLD 2 * SIZE(X) fmul %st(0), %st FLD 3 * SIZE(X) fmul %st(0), %st faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) FLD 4 * SIZE(X) fmul %st(0), %st FLD 5 * SIZE(X) fmul %st(0), %st FLD 6 * SIZE(X) fmul %st(0), %st FLD 7 * SIZE(X) fmul %st(0), %st addl $8 * SIZE, X faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) decl I jg .L10 ALIGN_4 .L20: movl M, I andl $7, I jle .L998 ALIGN_4 .L21: FLD (X) fmul %st(0), %st faddp %st,%st(1) addl $1 * SIZE, X decl I jg .L21 jmp .L998 ALIGN_4 .L40: movl M, I sarl $3, I jle .L60 ALIGN_4 .L50: FLD (X) addl INCX, X fmul %st(0), %st FLD (X) addl INCX, X fmul %st(0), %st FLD (X) addl INCX, X fmul %st(0), %st FLD (X) addl INCX, X fmul %st(0), %st faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) FLD (X) addl INCX, X fmul %st(0), %st FLD (X) addl INCX, X fmul %st(0), %st FLD (X) addl INCX, X fmul %st(0), %st FLD (X) addl INCX, X fmul %st(0), %st faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) decl I jg .L50 ALIGN_4 .L60: movl M, I andl $7, I jle .L998 ALIGN_4 .L61: FLD (X) addl INCX, X fmul %st(0), %st faddp %st,%st(1) decl I jg .L61 ALIGN_4 .L998: faddp %st,%st(2) faddp %st,%st(1) faddp %st,%st(1) ALIGN_4 .L999: fsqrt popl %ebx popl %esi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/nrm2_sse.S000066400000000000000000000175541313527062700167410ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 8 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define M %edx #define X %ecx #define INCX %esi #define I %eax #include "l1param.h" PROLOGUE PROFCODE pushl %esi pushl %ebx movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX pxor %xmm0, %xmm0 testl M, M jle .L999 pxor %xmm1, %xmm1 testl INCX, INCX jle .L999 leal (, INCX, SIZE), INCX cmpl $SIZE, INCX jne .L40 subl $-32 * SIZE, X testl $SIZE, X je .L05 movss -32 * SIZE(X), %xmm0 cvtss2sd %xmm0, %xmm0 mulsd %xmm0, %xmm0 addl INCX, X decl M jle .L998 ALIGN_3 .L05: movl M, I sarl $4, I jle .L13 movsd -32 * SIZE(X), %xmm4 movsd -30 * SIZE(X), %xmm5 movsd -28 * SIZE(X), %xmm6 movsd -26 * SIZE(X), %xmm7 decl I jle .L12 ALIGN_3 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif cvtps2pd %xmm4, %xmm2 movsd -24 * SIZE(X), %xmm4 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm5, %xmm3 movsd -22 * SIZE(X), %xmm5 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 cvtps2pd %xmm6, %xmm2 movsd -20 * SIZE(X), %xmm6 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm7, %xmm3 movsd -18 * SIZE(X), %xmm7 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 cvtps2pd %xmm4, %xmm2 movsd -16 * SIZE(X), %xmm4 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm5, %xmm3 movsd -14 * SIZE(X), %xmm5 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 cvtps2pd %xmm6, %xmm2 movsd -12 * SIZE(X), %xmm6 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm7, %xmm3 movsd -10 * SIZE(X), %xmm7 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 subl $-16 * SIZE, X decl I jg .L10 ALIGN_3 .L12: cvtps2pd %xmm4, %xmm2 movsd -24 * SIZE(X), %xmm4 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm5, %xmm3 movsd -22 * SIZE(X), %xmm5 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 cvtps2pd %xmm6, %xmm2 movsd -20 * SIZE(X), %xmm6 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm7, %xmm3 movsd -18 * SIZE(X), %xmm7 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 cvtps2pd %xmm4, %xmm2 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm5, %xmm3 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 cvtps2pd %xmm6, %xmm2 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm7, %xmm3 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 subl $-16 * SIZE, X ALIGN_4 .L13: testl $8, M je .L14 movsd -32 * SIZE(X), %xmm4 cvtps2pd %xmm4, %xmm2 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 movsd -30 * SIZE(X), %xmm5 cvtps2pd %xmm5, %xmm3 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 movsd -28 * SIZE(X), %xmm6 cvtps2pd %xmm6, %xmm2 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 movsd -26 * SIZE(X), %xmm7 cvtps2pd %xmm7, %xmm3 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 addl $8 * 
SIZE, X ALIGN_3 .L14: testl $4, M je .L15 movsd -32 * SIZE(X), %xmm4 cvtps2pd %xmm4, %xmm2 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 movsd -30 * SIZE(X), %xmm5 cvtps2pd %xmm5, %xmm3 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 addl $4 * SIZE, X ALIGN_3 .L15: testl $2, M je .L16 movsd -32 * SIZE(X), %xmm4 cvtps2pd %xmm4, %xmm2 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 addl $2 * SIZE, X ALIGN_3 .L16: testl $1, M je .L998 movss -32 * SIZE(X), %xmm4 cvtss2sd %xmm4, %xmm2 mulsd %xmm2, %xmm2 addsd %xmm2, %xmm1 jmp .L998 ALIGN_4 .L40: movl M, I sarl $3, I jle .L44 ALIGN_4 .L41: movss (X), %xmm4 addl INCX, X cvtss2sd %xmm4, %xmm2 mulsd %xmm2, %xmm2 addsd %xmm2, %xmm0 movss (X), %xmm5 addl INCX, X cvtss2sd %xmm5, %xmm2 mulsd %xmm2, %xmm2 addsd %xmm2, %xmm1 movss (X), %xmm6 addl INCX, X cvtss2sd %xmm6, %xmm2 mulsd %xmm2, %xmm2 addsd %xmm2, %xmm0 movss (X), %xmm7 addl INCX, X cvtss2sd %xmm7, %xmm2 mulsd %xmm2, %xmm2 addsd %xmm2, %xmm1 movss (X), %xmm4 addl INCX, X cvtss2sd %xmm4, %xmm2 mulsd %xmm2, %xmm2 addsd %xmm2, %xmm0 movss (X), %xmm5 addl INCX, X cvtss2sd %xmm5, %xmm2 mulsd %xmm2, %xmm2 addsd %xmm2, %xmm1 movss (X), %xmm6 addl INCX, X cvtss2sd %xmm6, %xmm2 mulsd %xmm2, %xmm2 addsd %xmm2, %xmm0 movss (X), %xmm7 addl INCX, X cvtss2sd %xmm7, %xmm2 mulsd %xmm2, %xmm2 addsd %xmm2, %xmm1 decl I jg .L41 ALIGN_3 .L44: testl $4, M je .L45 movss (X), %xmm4 addl INCX, X cvtss2sd %xmm4, %xmm2 mulsd %xmm2, %xmm2 addsd %xmm2, %xmm0 movss (X), %xmm5 addl INCX, X cvtss2sd %xmm5, %xmm2 mulsd %xmm2, %xmm2 addsd %xmm2, %xmm1 movss (X), %xmm6 addl INCX, X cvtss2sd %xmm6, %xmm2 mulsd %xmm2, %xmm2 addsd %xmm2, %xmm0 movss (X), %xmm7 addl INCX, X cvtss2sd %xmm7, %xmm2 mulsd %xmm2, %xmm2 addsd %xmm2, %xmm1 ALIGN_3 .L45: testl $2, M je .L46 movss (X), %xmm4 addl INCX, X cvtss2sd %xmm4, %xmm2 mulsd %xmm2, %xmm2 addsd %xmm2, %xmm0 movss (X), %xmm5 addl INCX, X cvtss2sd %xmm5, %xmm2 mulsd %xmm2, %xmm2 addsd %xmm2, %xmm1 ALIGN_3 .L46: testl $1, M je .L998 movss (X), %xmm4 cvtss2sd %xmm4, %xmm2 mulsd %xmm2, %xmm2 addsd %xmm2, %xmm0 ALIGN_4 .L998: addpd %xmm1, %xmm0 #ifndef HAVE_SSE3 movapd %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 addsd %xmm1, %xmm0 #else haddpd %xmm0, %xmm0 #endif ALIGN_4 .L999: sqrtsd %xmm0, %xmm0 cvtsd2ss %xmm0, %xmm0 movss %xmm0, STACK_M flds STACK_M popl %ebx popl %esi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/qaxpy.S000066400000000000000000000124721313527062700163450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_ALPHA 16 + STACK + ARGS(%esp) #define STACK_X 32 + STACK + ARGS(%esp) #define STACK_INCX 36 + STACK + ARGS(%esp) #define STACK_Y 40 + STACK + ARGS(%esp) #define STACK_INCY 44 + STACK + ARGS(%esp) #define M %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx PROLOGUE pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif FLD STACK_ALPHA movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY sall $BASE_SHIFT, INCX sall $BASE_SHIFT, INCY testl M, M jle .L40 cmpl $SIZE, INCX jne .L14 cmpl $SIZE, INCY jne .L14 movl M, %eax sarl $3, %eax jle .L15 ALIGN_3 #define PRESIZE 33 .L16: #ifdef HAS_PREFETCH prefetcht0 PRESIZE * SIZE(X) #endif FLD 0 * SIZE(X) fmul %st(1),%st FLD 0 * SIZE(Y) faddp %st, %st(1) FST 0 * SIZE(Y) FLD 1 * SIZE(X) fmul %st(1),%st FLD 1 * SIZE(Y) faddp %st, %st(1) FST 1 * SIZE(Y) FLD 2 * SIZE(X) fmul %st(1),%st FLD 2 * SIZE(Y) faddp %st, %st(1) FST 2 * SIZE(Y) FLD 3 * SIZE(X) fmul %st(1),%st FLD 3 * SIZE(Y) faddp %st, %st(1) FST 3 * SIZE(Y) #ifdef HAS_PREFETCH prefetcht0 (4 + PRESIZE) * SIZE(X) #endif FLD 4 * SIZE(X) fmul %st(1),%st FLD 4 * SIZE(Y) faddp %st, %st(1) FST 4 * SIZE(Y) FLD 5 * SIZE(X) fmul %st(1),%st FLD 5 * SIZE(Y) faddp %st, %st(1) FST 5 * SIZE(Y) FLD 6 * SIZE(X) fmul %st(1),%st FLD 6 * SIZE(Y) faddp %st, %st(1) FST 6 * SIZE(Y) FLD 7 * SIZE(X) fmul %st(1),%st FLD 7 * SIZE(Y) faddp %st, %st(1) FST 7 * SIZE(Y) #ifdef HAVE_3DNOW prefetchw 24 * SIZE(Y) #endif addl $8 * SIZE, X addl $8 * SIZE, Y decl %eax jg .L16 ALIGN_3 .L15: movl M, %eax andl $7, %eax jle .L40 ALIGN_3 .L22: FLD 0 * SIZE(X) fmul %st(1),%st FLD 0 * SIZE(Y) faddp %st, %st(1) FST 0 * SIZE(Y) addl $SIZE, X addl $SIZE, Y decl %eax jg .L22 jmp .L40 ALIGN_3 .L14: movl M, %eax sarl $2, %eax jle .L28 ALIGN_3 .L29: FLD (X) fmul %st(1),%st FLD (Y) faddp %st, %st(1) FST (Y) addl INCX, X addl INCY, Y FLD (X) fmul %st(1),%st FLD (Y) faddp %st, %st(1) FST (Y) addl INCX, X addl INCY, Y FLD (X) fmul %st(1),%st FLD (Y) faddp %st, %st(1) FST (Y) addl INCX, X addl INCY, Y FLD (X) fmul %st(1),%st FLD (Y) faddp %st, %st(1) FST (Y) addl INCX, X addl INCY, Y decl %eax jg .L29 ALIGN_3 .L28: movl M, %eax andl $3, %eax jle .L40 ALIGN_3 .L35: FLD (X) fmul %st(1),%st FLD (Y) faddp %st, %st(1) FST (Y) addl INCX, X addl INCY, Y decl %eax jg .L35 ALIGN_3 .L40: ffreep %st(0) xorl %eax,%eax popl %ebx popl %esi popl %edi ret EPILOGUE 
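For reference, the qaxpy.S kernel above implements the extended-precision AXPY update y := alpha*x + y on the x87 register stack, with an unrolled unit-stride path and a strided fallback. A minimal C sketch of the unit-stride loop follows; this is an illustration only, not the shipped kernel, and it assumes a hypothetical `xdouble` typedef standing in for the build's extended-precision type and unit increments for both vectors.

    #include <stddef.h>

    /* assumption: extended-precision scalar type used by the q* kernels */
    typedef long double xdouble;

    /* Sketch of the unit-stride qaxpy path: y[i] += alpha * x[i] for i in [0, n). */
    static void qaxpy_sketch(size_t n, xdouble alpha,
                             const xdouble *x, xdouble *y)
    {
        for (size_t i = 0; i < n; i++)
            y[i] += alpha * x[i];
    }

The assembly version differs only in scheduling: it processes eight elements per iteration, keeps alpha resident in an x87 register, and issues optional prefetches, but the arithmetic per element is the same multiply-add shown here.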
OpenBLAS-0.2.20/kernel/x86/qconjg.S000066400000000000000000000056211313527062700164620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" PROLOGUE PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif movl 4(%esp), %eax movl 8(%esp), %ecx fldz FLD 1 * SIZE(%ecx) fsubrp %st, %st(1) FLD 0 * SIZE(%ecx) FST 0 * SIZE(%eax) FST 1 * SIZE(%eax) ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/qdot.S000066400000000000000000000116431313527062700161510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) #define N %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx PROLOGUE pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif movl STACK_N, N movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY #ifdef F_INTERFACE movl (N),N movl (INCX),INCX movl (INCY),INCY #endif sall $BASE_SHIFT, INCX sall $BASE_SHIFT, INCY fldz fldz fldz fldz cmpl $SIZE, INCX jne .L14 cmpl $SIZE, INCY jne .L14 movl N, %eax sarl $2, %eax jle .L15 ALIGN_3 .L16: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmulp %st, %st(1) faddp %st,%st(1) FLD 1 * SIZE(X) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st,%st(2) FLD 2 * SIZE(X) FLD 2 * SIZE(Y) fmulp %st, %st(1) faddp %st,%st(3) FLD 3 * SIZE(X) FLD 3 * SIZE(Y) fmulp %st, %st(1) faddp %st,%st(4) addl $4 * SIZE, X addl $4 * SIZE, Y decl %eax jg .L16 ALIGN_3 .L15: movl N, %eax andl $3, %eax jle .L27 ALIGN_3 .L22: FLD (X) addl $SIZE, X FLD (Y) fmulp %st, %st(1) addl $SIZE, Y faddp %st,%st(1) decl %eax jg .L22 jmp .L27 ALIGN_3 .L14: #ifdef F_INTERFACE testl INCX, INCX jge .L28 movl N, %eax decl %eax imull INCX, %eax subl %eax, X ALIGN_3 .L28: testl INCY, INCY jge .L29 movl N, %eax decl %eax imull INCY, %eax subl %eax, Y ALIGN_3 .L29: #endif movl N, %eax sarl $2, %eax jle .L30 ALIGN_3 .L31: FLD (X) addl INCX, X FLD (Y) fmulp %st, %st(1) addl INCY, Y faddp %st,%st(1) FLD (X) addl INCX, X FLD (Y) fmulp %st, %st(1) addl INCY, Y faddp %st,%st(2) FLD (X) addl INCX, X FLD (Y) fmulp %st, %st(1) addl INCY, Y faddp %st,%st(3) FLD (X) addl INCX, X FLD (Y) fmulp %st, %st(1) addl INCY, Y faddp %st,%st(4) decl %eax jg .L31 ALIGN_3 .L30: movl N, %eax andl $3, %eax jle .L27 ALIGN_3 .L37: FLD (X) addl INCX, X FLD (Y) fmulp %st, %st(1) addl INCY, Y faddp %st, %st(1) decl %eax jg .L37 ALIGN_3 .L27: faddp %st,%st(2) faddp %st,%st(2) faddp %st,%st(1) popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/qgemm_kernel_2x2.S000066400000000000000000000333471313527062700203500ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if defined(OPTERON) || defined(BARCELONA) #define PREFETCH prefetch #define PREFETCHW prefetchw #else #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #define PREFETCHSIZE (5 + 4 * 10) #define STACK 16 #define ARGS 16 #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define ARG_B 36 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define ARG_LDC 44 + STACK + ARGS(%esp) #define OFFSET 48 + STACK + ARGS(%esp) #define I %esi #define B %ebx #define CO %edi #define AO %edx #define BO %ecx #define LDC %ebp #define PREFETCH_OFFSET 48 PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(TRMMKERNEL) && !defined(LEFT) movl OFFSET, %eax negl %eax movl %eax, KK #endif movl ARG_LDC, LDC movl ARG_B, B addl $8 * SIZE, A addl $8 * SIZE, B sall $BASE_SHIFT, LDC movl N, %eax sarl $1, %eax movl %eax, J je .L30 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl A, AO movl C, CO lea (, LDC, 2), %eax addl %eax, C movl M, I sarl $1, I je .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 2), AO leal (B, %eax, 2), BO #endif fldz fldz fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) prefetchw 2 * SIZE(CO, LDC, 1) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) prefetchnta 2 * SIZE(CO, LDC, 1) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax 
#endif movl %eax, KKK #endif sarl $2, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(3) faddp %st, %st(3) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -5 * SIZE(BO) fmul %st, %st(2) FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(3) faddp %st, %st(3) PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) FLD -4 * SIZE(AO) FLD -4 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -3 * SIZE(BO) fmul %st, %st(2) FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(3) faddp %st, %st(3) FLD -2 * SIZE(AO) FLD -2 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -1 * SIZE(BO) fmul %st, %st(2) FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(3) faddp %st, %st(3) addl $8 * SIZE,AO addl $8 * SIZE,BO decl %eax jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif and $3, %eax je .L18 ALIGN_4 .L16: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(3) faddp %st, %st(3) addl $2 * SIZE,AO addl $2 * SIZE,BO decl %eax jne .L16 ALIGN_4 .L18: #ifndef TRMMKERNEL FLD ALPHA fmul %st, %st(1) fmul %st, %st(2) fmul %st, %st(3) fmulp %st, %st(4) FLD 0 * SIZE(CO) faddp %st, %st(1) FST 0 * SIZE(CO) FLD 1 * SIZE(CO) faddp %st, %st(1) FST 1 * SIZE(CO) FLD 0 * SIZE(CO, LDC) faddp %st, %st(1) FST 0 * SIZE(CO, LDC) FLD 1 * SIZE(CO, LDC) faddp %st, %st(1) FST 1 * SIZE(CO, LDC) #else FST 0 * SIZE(CO) FST 1 * SIZE(CO) FST 0 * SIZE(CO, LDC) FST 1 * SIZE(CO, LDC) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 2), AO leal (BO, %eax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, CO decl I jne .L11 ALIGN_4 .L20: movl M, %eax andl $1, %eax je .L29 ALIGN_4 .L21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 1), AO leal ( B, %eax, 2), BO #endif fldz fldz #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $2, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -7 * SIZE(AO) FLD -6 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -5 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -6 * SIZE(AO) FLD -4 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -3 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -5 * SIZE(AO) FLD -2 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -1 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) addl $4 * SIZE,AO addl $8 * SIZE,BO decl %eax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif and $3, %eax je .L28 ALIGN_4 
.L26: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) addl $1 * SIZE,AO addl $2 * SIZE,BO decl %eax jne .L26 ALIGN_4 .L28: #ifndef TRMMKERNEL FLD ALPHA fmul %st, %st(1) fmulp %st, %st(2) FLD 0 * SIZE(CO) faddp %st, %st(1) FST 0 * SIZE(CO) FLD 0 * SIZE(CO, LDC) faddp %st, %st(1) FST 0 * SIZE(CO, LDC) #else FST 0 * SIZE(CO) FST 0 * SIZE(CO, LDC) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 1), AO leal (BO, %eax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif addl $1 * SIZE, CO ALIGN_4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif movl BO, B decl J jne .L01 ALIGN_4 .L30: movl N, %eax testl $1, %eax je .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl A, AO movl C, CO addl LDC, C movl M, I sarl $1, I je .L40 ALIGN_4 .L31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 2), AO leal ( B, %eax, 1), BO #endif fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $2, %eax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(BO) FLD -8 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -7 * SIZE(BO) FLD -6 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -5 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -6 * SIZE(BO) FLD -4 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -3 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -5 * SIZE(BO) FLD -2 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -1 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) addl $8 * SIZE,AO addl $4 * SIZE,BO decl %eax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif and $3, %eax je .L38 ALIGN_4 .L36: FLD -8 * SIZE(BO) FLD -8 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) addl $2 * SIZE,AO addl $1 * SIZE,BO decl %eax jne .L36 ALIGN_4 .L38: #ifndef TRMMKERNEL FLD ALPHA fmul %st, %st(1) fmulp %st, %st(2) FLD 0 * SIZE(CO) faddp %st, %st(1) FST 0 * SIZE(CO) FLD 1 * SIZE(CO) faddp %st, %st(1) FST 1 * SIZE(CO) #else FST 0 * SIZE(CO) FST 1 * SIZE(CO) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 2), AO leal (BO, %eax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $2 * SIZE, CO decl I jne .L31 ALIGN_4 .L40: movl M, %eax andl $1, %eax je .L49 ALIGN_4 .L41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 1), AO leal ( B, %eax, 1), BO #endif fldz #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $2, %eax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -7 * SIZE(AO) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -5 * SIZE(AO) FLD -5 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) addl $4 * SIZE,AO addl $4 * SIZE,BO decl %eax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif and $3, %eax je .L48 ALIGN_4 .L46: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) addl $1 * SIZE,AO addl $1 * SIZE,BO decl %eax jne .L46 ALIGN_4 .L48: #ifndef TRMMKERNEL FLD ALPHA fmulp %st, %st(1) FLD 0 * SIZE(CO) faddp %st, %st(1) FST 0 * SIZE(CO) #else FST 0 * SIZE(CO) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 1), AO leal (BO, %eax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif addl $1 * SIZE, CO ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $1, KK #endif movl BO, B ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/qgemv_n.S000066400000000000000000000234461313527062700166420ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef PENTIUM #define P 32 #endif #if defined(ATHLON) || defined(OPTERON) #define P 32 #endif #ifndef P #define P DTB_DEFAULT_ENTRIES #endif #define STACK 16 #define ARGS 16 #define PLDA_M 0 + STACK(%esp) #define XP 4 + STACK(%esp) #define MIN_N 8 + STACK(%esp) #define IS 12 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define LDA 36 + STACK + ARGS(%esp) #define X 40 + STACK + ARGS(%esp) #define INCX 44 + STACK + ARGS(%esp) #define Y 48 + STACK + ARGS(%esp) #define INCY 52 + STACK + ARGS(%esp) #define BUFFER 56 + STACK + ARGS(%esp) PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE FLD ALPHA movl X, %edi movl LDA, %ebx sall $BASE_SHIFT, %ebx movl $0, IS movl M, %edx movl N, %esi test %esi, %esi jle .L79 # goto END test %edx, %edx jle .L79 # goto END movl INCY, %eax sall $BASE_SHIFT, %eax movl %eax, INCY movl LDA, %eax imull $P, %eax # P * lda subl M ,%eax # P * lda - m sall $BASE_SHIFT, %eax movl %eax, PLDA_M ALIGN_2 .L32: movl IS, %esi movl $P, %edx movl N, %eax subl %esi,%eax # n - is cmpl %edx, %eax #ifdef PENTIUM jle .L33 movl %edx, %eax .L33: #else cmovg %edx, %eax #endif movl %eax, MIN_N movl INCX, %edx sall $BASE_SHIFT, %esi leal (%edi, %esi, 1), %esi movl %esi, XP cmpl $1, %edx je .L34 # if incx == 1 goto L34 movl BUFFER, %esi sall $BASE_SHIFT, %edx movl %esi, XP # xp = buffer sarl $2,%eax jle .L35 ALIGN_2 .L36: FLD (%edi) addl %edx,%edi # x += incx FLD (%edi) addl %edx,%edi # x += incx FLD (%edi) addl %edx,%edi # x += incx FLD (%edi) addl %edx,%edi # x += incx FST 3 * SIZE(%esi) FST 2 * SIZE(%esi) FST 1 * SIZE(%esi) FST 0 * SIZE(%esi) addl $4 * SIZE, %esi # xp += 4 decl %eax jg .L36 ALIGN_3 .L35: movl MIN_N, %eax andl $3, %eax jle .L34 ALIGN_2 .L42: FLD (%edi) addl %edx, %edi FST (%esi) addl $SIZE, %esi decl %eax jg .L42 ALIGN_3 /* Main Routine */ .L34: movl Y, %ecx # c_offset movl M, %ebp sarl $2, %ebp # j = (m >> 2) jle .L47 ALIGN_2 .L48: movl A, %edx # a_offset = a fldz addl $4 * SIZE, A # a += 4 fldz movl XP, %esi # b_offset = xp fldz movl MIN_N, %eax # i = min_n fldz FLD (%esi) # bt1 = b_offset sarl $1, %eax jle .L51 ALIGN_2 #ifdef PENTIUM3 #define PRESIZE 8 #else #define PRESIZE 24 #endif .L80: #ifdef PENTIUM3 prefetcht1 PRESIZE * SIZE(%edx, %ebx, 1) FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 prefetcht1 PRESIZE * SIZE(%esi) faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(3) # ct2 += at1 FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) fmulp %st, %st(1) faddp %st, %st(4) # ct4 += at1 FLD 1 * SIZE(%esi) # bt1 = b_offset prefetcht1 PRESIZE * SIZE(%edx, %ebx, 2) addl %ebx, %edx # a_offset += lda FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(3) # ct2 += at1 FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) fmulp %st, %st(1) addl %ebx, %edx faddp %st, %st(4) # ct4 += at1 FLD 2 * SIZE(%esi) # bt1 = b_offset addl $2 * SIZE, %esi # b_offset += 2 #else #ifdef PENTIUM4 
prefetchnta 8 * SIZE(%esi) #endif FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(3) # ct2 += at1 FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) fmulp %st, %st(1) faddp %st, %st(4) # ct4 += at1 FLD 1 * SIZE(%esi) # bt1 = b_offset addl %ebx, %edx # a_offset += lda FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(3) # ct2 += at1 FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) fmulp %st, %st(1) faddp %st, %st(4) # ct4 += at1 FLD 2 * SIZE(%esi) # bt1 = b_offset addl %ebx, %edx addl $2 * SIZE, %esi # b_offset += 2 #endif decl %eax jg .L80 .L51: movl MIN_N,%eax andl $1, %eax je .L57 FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(3) # ct2 += at1 FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) fmulp %st, %st(1) faddp %st, %st(4) # ct4 += at1 fldz ALIGN_2 .L57: ffreep %st(0) fxch %st(4) fmul %st, %st(4) fmul %st, %st(1) fmul %st, %st(2) fmul %st, %st(3) fxch %st(4) movl INCY, %eax FLD (%ecx) faddp %st, %st(1) FST (%ecx) addl %eax, %ecx FLD (%ecx) faddp %st, %st(1) FST (%ecx) addl %eax, %ecx FLD (%ecx) faddp %st, %st(1) FST (%ecx) addl %eax, %ecx FLD (%ecx) faddp %st, %st(1) FST (%ecx) addl %eax, %ecx decl %ebp # j -- jg .L48 ALIGN_3 .L47: movl M, %ebp andl $3, %ebp # j = (m & 3) jle .L60 ALIGN_2 .L61: movl A, %edx # a_offset = a fldz addl $SIZE, A # a++ fldz movl XP,%esi fldz movl MIN_N,%eax fldz sarl $3,%eax jle .L64 ALIGN_2 .L65: FLD 0 * SIZE(%esi) FLD (%edx) fmulp %st, %st(1) faddp %st, %st(1) addl %ebx, %edx FLD 1 * SIZE(%esi) FLD (%edx) fmulp %st, %st(1) faddp %st, %st(2) addl %ebx ,%edx FLD 2 * SIZE(%esi) FLD (%edx) fmulp %st, %st(1) faddp %st, %st(3) addl %ebx, %edx FLD 3 * SIZE(%esi) FLD (%edx) fmulp %st, %st(1) faddp %st, %st(4) addl %ebx, %edx FLD 4 * SIZE(%esi) FLD (%edx) fmulp %st, %st(1) faddp %st,%st(1) addl %ebx, %edx FLD 5 * SIZE(%esi) FLD (%edx) fmulp %st, %st(1) faddp %st, %st(2) addl %ebx, %edx FLD 6 * SIZE(%esi) FLD (%edx) fmulp %st, %st(1) faddp %st,%st(3) addl %ebx, %edx FLD 7 * SIZE(%esi) FLD (%edx) fmulp %st, %st(1) faddp %st,%st(4) addl %ebx, %edx addl $8 * SIZE, %esi decl %eax jg .L65 .L64: movl MIN_N,%eax andl $7, %eax jle .L70 ALIGN_2 .L71: FLD (%esi) addl $SIZE, %esi # b_offset ++ FLD (%edx) fmulp %st, %st(1) addl %ebx, %edx # a_offset += lda faddp %st, %st(1) decl %eax jg .L71 ALIGN_2 .L70: faddp %st, %st(1) faddp %st, %st(1) faddp %st, %st(1) fmul %st(1), %st movl INCY, %eax FLD (%ecx) faddp %st, %st(1) FST (%ecx) addl %eax, %ecx decl %ebp jg .L61 .L60: movl PLDA_M, %esi addl %esi, A # a += P * lda - m addl $P, IS movl N, %esi cmpl %esi,IS jl .L32 .L79: ffreep %st(0) popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/qgemv_t.S000066400000000000000000000272201313527062700166420ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The 
University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef PENTIUM #define P 88 #endif #ifndef P #define P 1000 #endif #define STACK 16 #define ARGS 24 #define NLDA 0 + STACK(%esp) #define XP 4 + STACK(%esp) #define MIN_M 8 + STACK(%esp) #define J 12 + STACK(%esp) #define IS 16 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define LDA 36 + STACK + ARGS(%esp) #define X 40 + STACK + ARGS(%esp) #define INCX 44 + STACK + ARGS(%esp) #define Y 48 + STACK + ARGS(%esp) #define INCY 52 + STACK + ARGS(%esp) #define BUFFER 56 + STACK + ARGS(%esp) PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE FLD ALPHA movl X, %edi # X movl $0, IS movl M, %ebx movl N, %eax testl %ebx, %ebx jle .L79 testl %eax, %eax jle .L79 movl INCX, %esi sall $BASE_SHIFT, %esi movl %esi, INCX movl INCY, %esi sall $BASE_SHIFT, %esi movl %esi, INCY movl LDA, %ebx imull %ebx, %eax movl $P, %esi subl %eax, %esi sall $BASE_SHIFT, %esi movl %esi, NLDA movl %ebx, %esi sall $BASE_SHIFT, %esi movl %esi, LDA ALIGN_2 .L32: movl IS, %esi movl $P, %edx movl M, %eax subl %esi, %eax cmpl %edx, %eax #ifdef PENTIUM jle .L33 movl %edx, %eax .L33: #else cmovg %edx, %eax #endif movl %eax, MIN_M movl IS, %ecx sall $BASE_SHIFT, %ecx leal (%edi,%ecx, 1), %ecx movl INCX, %ebx movl %ecx, XP cmpl $SIZE, %ebx je .L34 movl BUFFER, %esi movl MIN_M, %ecx movl %esi, XP sarl $2, %ecx jle .L35 ALIGN_3 .L36: FLD (%edi) addl %ebx, %edi FST 0 * SIZE(%esi) FLD (%edi) addl %ebx, %edi FST 1 * SIZE(%esi) FLD (%edi) addl %ebx, %edi FST 2 * SIZE(%esi) FLD (%edi) addl %ebx, %edi FST 3 * SIZE(%esi) addl $4 * SIZE, %esi decl %ecx jg .L36 ALIGN_3 .L35: 
movl MIN_M, %ecx andl $3,%ecx jle .L34 ALIGN_2 .L42: FLD (%edi) addl %ebx, %edi FST (%esi) addl $SIZE, %esi decl %ecx jg .L42 ALIGN_3 /* Main Routine */ .L34: movl Y, %ebp # coffset = y movl N, %esi sarl $2, %esi movl %esi, J jle .L47 ALIGN_3 .L48: movl A, %ebx # a_offset = a fldz movl LDA, %edx fldz leal (%ebx, %edx), %ecx # a_offset2 = a + lda fldz leal (%ebx, %edx, 4), %eax fldz movl %eax, A movl XP, %esi FLD (%esi) movl MIN_M, %eax sarl $2,%eax jle .L51 ALIGN_3 #define PRESIZE 8 .L80: #ifdef PENTIUM3 prefetcht0 PRESIZE * SIZE(%ebx, %edx, 2) FLD 0 * SIZE(%ebx) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 prefetcht0 PRESIZE * SIZE(%ecx) faddp %st,%st(2) # ct1 += at1 FLD 0 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) prefetcht0 PRESIZE * SIZE(%ecx, %edx, 2) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 prefetcht0 PRESIZE * SIZE(%ebx) FLD 0 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FLD 0 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) fmulp %st, %st(1) faddp %st,%st(4) FLD 1 * SIZE(%esi) FLD 1 * SIZE(%ebx) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(2) # ct1 += at1 FLD 1 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 FLD 1 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FLD 1 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) fmulp %st, %st(1) faddp %st,%st(4) FLD 2 * SIZE(%esi) FLD 2 * SIZE(%ebx) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(2) # ct1 += at1 FLD 2 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 FLD 2 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FLD 2 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) fmulp %st, %st(1) faddp %st,%st(4) FLD 3 * SIZE(%esi) FLD 3 * SIZE(%ebx) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(2) # ct1 += at1 FLD 3 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 FLD 3 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FLD 3 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) fmulp %st, %st(1) addl $4 * SIZE, %ebx faddp %st,%st(4) addl $4 * SIZE, %ecx FLD 4 * SIZE(%esi) addl $4 * SIZE, %esi #else #if defined(HAS_PREFETCH) prefetcht0 PRESIZE * SIZE(%ebx) prefetcht0 PRESIZE * SIZE(%ebx, %edx, 2) prefetcht0 PRESIZE * SIZE(%ecx) prefetcht0 PRESIZE * SIZE(%ecx, %edx, 2) #endif FLD 0 * SIZE(%ebx) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(2) # ct1 += at1 FLD 0 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 FLD 0 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FLD 0 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) fmulp %st, %st(1) faddp %st,%st(4) FLD 1 * SIZE(%esi) FLD 1 * SIZE(%ebx) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(2) # ct1 += at1 FLD 1 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 FLD 1 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FLD 1 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) fmulp %st, %st(1) faddp %st,%st(4) FLD 2 * SIZE(%esi) FLD 2 * SIZE(%ebx) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(2) # ct1 += at1 FLD 2 * SIZE(%ecx) # at1 = 
*(a_offset2 + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 FLD 2 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FLD 2 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) fmulp %st, %st(1) faddp %st,%st(4) FLD 3 * SIZE(%esi) FLD 3 * SIZE(%ebx) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(2) # ct1 += at1 FLD 3 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 FLD 3 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FLD 3 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) fmulp %st, %st(1) faddp %st,%st(4) FLD 4 * SIZE(%esi) addl $4 * SIZE, %ebx addl $4 * SIZE, %ecx addl $4 * SIZE, %esi #endif decl %eax jg .L80 ALIGN_3 .L51: movl MIN_M, %eax andl $3, %eax je .L81 ALIGN_3 .L52: FLD (%ebx) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(2) # ct1 += at1 FLD (%ecx) # at1 = *(a_offset2 + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 FLD (%ebx, %edx, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FLD (%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) fmulp %st, %st(1) faddp %st,%st(4) FLD 1 * SIZE(%esi) addl $SIZE, %ebx addl $SIZE, %ecx addl $SIZE, %esi decl %eax jg .L52 ALIGN_3 .L81: ffreep %st(0) fxch %st(4) fmul %st, %st(4) fmul %st, %st(1) fmul %st, %st(2) fmul %st, %st(3) fxch %st(4) movl INCY, %eax FLD (%ebp) faddp %st, %st(1) FST (%ebp) addl %eax, %ebp FLD (%ebp) faddp %st, %st(1) FST (%ebp) addl %eax, %ebp FLD (%ebp) faddp %st, %st(1) FST (%ebp) addl %eax, %ebp FLD (%ebp) faddp %st, %st(1) FST (%ebp) addl %eax, %ebp decl J jg .L48 ALIGN_3 .L47: movl N, %esi andl $3,%esi movl %esi, J jle .L60 ALIGN_2 .L61: movl A, %ebx # a_offset = a fldz # ct1 = ZERO movl LDA, %edx fldz # ct1 = ZERO addl %ebx, %edx fldz # ct1 = ZERO movl %edx, A fldz # ct1 = ZERO movl XP, %esi movl MIN_M, %eax sarl $3,%eax jle .L64 ALIGN_3 .L65: #ifdef HAS_PREFETCH prefetcht0 PRESIZE * 2 * SIZE(%ebx) prefetcht0 PRESIZE * 2 * SIZE(%ebx) #endif FLD 0 * SIZE(%esi) FLD 0 * SIZE(%ebx) fmulp %st, %st(1) faddp %st,%st(1) FLD 1 * SIZE(%esi) FLD 1 * SIZE(%ebx) fmulp %st, %st(1) faddp %st,%st(2) FLD 2 * SIZE(%esi) FLD 2 * SIZE(%ebx) fmulp %st, %st(1) faddp %st,%st(3) FLD 3 * SIZE(%esi) FLD 3 * SIZE(%ebx) fmulp %st, %st(1) faddp %st,%st(4) FLD 4 * SIZE(%esi) FLD 4 * SIZE(%ebx) fmulp %st, %st(1) faddp %st,%st(1) FLD 5 * SIZE(%esi) FLD 5 * SIZE(%ebx) fmulp %st, %st(1) faddp %st,%st(2) FLD 6 * SIZE(%esi) FLD 6 * SIZE(%ebx) fmulp %st, %st(1) faddp %st,%st(3) FLD 7 * SIZE(%esi) FLD 7 * SIZE(%ebx) fmulp %st, %st(1) faddp %st,%st(4) addl $8 * SIZE, %esi addl $8 * SIZE, %ebx decl %eax jg .L65 ALIGN_3 .L64: movl MIN_M, %eax andl $7, %eax jle .L70 ALIGN_3 .L71: FLD (%esi) FLD (%ebx) fmulp %st, %st(1) faddp %st,%st(1) addl $SIZE, %esi addl $SIZE, %ebx decl %eax jg .L71 ALIGN_3 .L70: faddp %st, %st(1) faddp %st, %st(1) faddp %st, %st(1) fmul %st(1),%st FLD (%ebp) faddp %st, %st(1) FST (%ebp) addl INCY, %ebp decl J jg .L61 ALIGN_3 .L60: movl A, %ebx addl NLDA, %ebx movl %ebx, A addl $P, IS movl M, %esi cmpl %esi, IS jl .L32 ALIGN_3 .L79: ffreep %st(0) popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/qtrsm_kernel_LN_2x2.S000066400000000000000000000451411313527062700207740ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #else #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #define PREFETCHSIZE (5 + 4 * 10) #define STACK 16 #define ARGS 16 #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define ARG_B 36 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define ARG_LDC 44 + STACK + ARGS(%esp) #define OFFSET 48 + STACK + ARGS(%esp) #define I %esi #define B %ebx #define CO %edi #define AO %edx #define BO %ecx #define LDC %ebp #define PREFETCH_OFFSET 48 PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_LDC, LDC movl ARG_B, B sall $BASE_SHIFT, LDC addl $8 * SIZE, A addl $8 * SIZE, B #ifdef LN movl M, %eax sall $BASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $BASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull %ebp, %eax addl %eax, C #endif #ifdef RN movl OFFSET, %eax negl %eax movl %eax, KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax sarl $1, %eax movl %eax, J je .L30 ALIGN_4 .L01: #if defined(LT) || defined(RN) movl A, AO #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif lea (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %eax andl $1, %eax je .L20 ALIGN_4 .L21: #ifdef LN 
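# Note (descriptive comment, as read from the code that follows): .L21 handles
# the leftover row when M is odd (M & 1).  Under LN the kernel walks A from the
# bottom, so AORIG is first stepped back by K * SIZE (the sall/subl pair below)
# before the 1x2 dot-product loop runs.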
movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 1), AO leal (B, %eax, 2), BO #else movl B, BO #endif fldz fldz #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -7 * SIZE(AO) FLD -6 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -5 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -6 * SIZE(AO) FLD -4 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -3 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -5 * SIZE(AO) FLD -2 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -1 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) addl $4 * SIZE,AO addl $8 * SIZE,BO decl %eax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $3, %eax je .L28 ALIGN_4 .L26: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) addl $1 * SIZE,AO addl $2 * SIZE,BO decl %eax jne .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 1), AO leal (B, %eax, 2), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) FLD -7 * SIZE(BO) fsubp %st, %st(2) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) FLD -7 * SIZE(AO) fsubp %st, %st(3) #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(AO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef RN FLD -8 * SIZE(BO) fmulp %st, %st(1) FLD -7 * SIZE(BO) fmul %st(1), %st fsubrp %st, %st(2) FLD -5 * SIZE(BO) fmulp %st, %st(2) #endif #ifdef RT FLD -5 * SIZE(BO) fmulp %st, %st(2) FLD -6 * SIZE(BO) fmul %st(2), %st fsubrp %st, %st(1) FLD -8 * SIZE(BO) fmulp %st, %st(1) #endif #ifdef LN subl $1 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) fxch %st(1) fld %st FST -7 * SIZE(BO) #else fld %st FST -8 * SIZE(AO) fxch %st(1) fld %st FST -7 * SIZE(AO) #endif FST 0 * SIZE(CO, LDC) FST 0 * SIZE(CO) #ifndef LN addl $1 * SIZE, CO #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 1), AO leal (BO, %eax, 2), BO #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L20: movl M, I sarl $1, I je .L29 ALIGN_4 .L11: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 2), AO leal (B, %eax, 2), BO #else movl B, BO #endif fldz fldz fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) prefetchw 2 * SIZE(CO, LDC, 1) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) prefetchnta 2 * SIZE(CO, LDC, 1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -5 * SIZE(BO) fmul %st, %st(2) FLD -5 * 
SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) FLD -4 * SIZE(AO) FLD -4 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -3 * SIZE(BO) fmul %st, %st(2) FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) FLD -2 * SIZE(AO) FLD -2 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -1 * SIZE(BO) fmul %st, %st(2) FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) addl $8 * SIZE,AO addl $8 * SIZE,BO decl %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $3, %eax je .L18 ALIGN_4 .L16: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) addl $2 * SIZE,AO addl $2 * SIZE,BO decl %eax jne .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 2), AO leal (B, %eax, 2), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) FLD -7 * SIZE(BO) fsubp %st, %st(2) FLD -6 * SIZE(BO) fsubp %st, %st(3) FLD -5 * SIZE(BO) fsubp %st, %st(4) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) FLD -7 * SIZE(AO) fsubp %st, %st(3) FLD -6 * SIZE(AO) fsubp %st, %st(2) FLD -5 * SIZE(AO) fsubp %st, %st(4) #endif #ifdef LN FLD -5 * SIZE(AO) fmul %st, %st(3) fmulp %st, %st(4) FLD -6 * SIZE(AO) fmul %st(3), %st FLD -6 * SIZE(AO) fmul %st(5), %st fsubrp %st, %st(3) fsubrp %st, %st(1) FLD -8 * SIZE(AO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef LT FLD -8 * SIZE(AO) fmul %st, %st(1) fmulp %st, %st(2) FLD -7 * SIZE(AO) fmul %st(1), %st FLD -7 * SIZE(AO) fmul %st(3), %st fsubrp %st, %st(5) fsubrp %st, %st(3) FLD -5 * SIZE(AO) fmul %st, %st(3) fmulp %st, %st(4) #endif #ifdef RN FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(3) FLD -7 * SIZE(BO) fmul %st(1), %st FLD -7 * SIZE(BO) fmul %st(4), %st fsubrp %st, %st(5) fsubrp %st, %st(2) FLD -5 * SIZE(BO) fmul %st, %st(2) fmulp %st, %st(4) #endif #ifdef RT FLD -5 * SIZE(BO) fmul %st, %st(2) fmulp %st, %st(4) FLD -6 * SIZE(BO) fmul %st(2), %st FLD -6 * SIZE(BO) fmul %st(5), %st fsubrp %st, %st(4) fsubrp %st, %st(1) FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(3) #endif #ifdef LN subl $2 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) fxch %st(1) fld %st FST -7 * SIZE(BO) fxch %st(2) fld %st FST -6 * SIZE(BO) fxch %st(3) fld %st FST -5 * SIZE(BO) FST 1 * SIZE(CO, LDC) FST 0 * SIZE(CO) FST 0 * SIZE(CO, LDC) FST 1 * SIZE(CO) #else fld %st FST -8 * SIZE(AO) fxch %st(2) fld %st FST -7 * SIZE(AO) fxch %st(1) fld %st FST -6 * SIZE(AO) fxch %st(3) fld %st FST -5 * SIZE(AO) FST 1 * SIZE(CO, LDC) FST 1 * SIZE(CO) FST 0 * SIZE(CO) FST 0 * SIZE(CO, LDC) #endif #ifndef LN addl $2 * SIZE, CO #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 2), AO leal (BO, %eax, 2), BO #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl I jne .L11 ALIGN_4 .L29: #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl BO, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J jne .L01 
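# End of the two-column (N / 2) panel loop.  .L30 below deals with the single
# remaining column when N is odd, reusing the same M-blocking: the odd-row
# case first (.L41), then the 2-row loop at .L31.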
ALIGN_4 .L30: movl N, %eax testl $1, %eax je .L999 #if defined(LT) || defined(RN) movl A, AO #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %eax andl $1, %eax je .L40 ALIGN_4 .L41: #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 1), AO leal (B, %eax, 1), BO #else movl B, BO #endif fldz #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -7 * SIZE(AO) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -5 * SIZE(AO) FLD -5 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) addl $4 * SIZE,AO addl $4 * SIZE,BO decl %eax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $3, %eax je .L48 ALIGN_4 .L46: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) addl $1 * SIZE,AO addl $1 * SIZE,BO decl %eax jne .L46 ALIGN_4 .L48: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 1), AO leal (B, %eax, 1), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) #endif #ifdef LN FLD -8 * SIZE(AO) fmulp %st, %st(1) #endif #ifdef LT FLD -8 * SIZE(AO) fmulp %st, %st(1) #endif #ifdef RN FLD -8 * SIZE(BO) fmulp %st, %st(1) #endif #ifdef RT FLD -8 * SIZE(BO) fmulp %st, %st(1) #endif #ifdef LN subl $1 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) #else fld %st FST -8 * SIZE(AO) #endif FST 0 * SIZE(CO) #ifndef LN addl $1 * SIZE, CO #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 1), AO leal (BO, %eax, 1), BO #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L40: movl M, I sarl $1, I je .L49 ALIGN_4 .L31: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 2), AO leal (B, %eax, 1), BO #else movl B, BO #endif fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(BO) FLD -8 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -7 * SIZE(BO) FLD -6 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -5 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -6 * SIZE(BO) FLD -4 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -3 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -5 * SIZE(BO) FLD -2 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -1 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) addl $8 * SIZE,AO addl $4 * SIZE,BO decl %eax jne .L32 ALIGN_4 .L35: #if defined(LT) 
|| defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $3, %eax je .L38 ALIGN_4 .L36: FLD -8 * SIZE(BO) FLD -8 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) addl $2 * SIZE,AO addl $1 * SIZE,BO decl %eax jne .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 2), AO leal (B, %eax, 1), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) FLD -7 * SIZE(BO) fsubp %st, %st(2) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) FLD -7 * SIZE(AO) fsubp %st, %st(3) #endif #ifdef LN FLD -5 * SIZE(AO) fmulp %st, %st(2) FLD -6 * SIZE(AO) fmul %st(2), %st fsubrp %st, %st(1) FLD -8 * SIZE(AO) fmulp %st, %st(1) #endif #ifdef LT FLD -8 * SIZE(AO) fmulp %st, %st(1) FLD -7 * SIZE(AO) fmul %st(1), %st fsubrp %st, %st(2) FLD -5 * SIZE(AO) fmulp %st, %st(2) #endif #ifdef RN FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef RT FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef LN subl $2 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) fxch %st(1) fld %st FST -7 * SIZE(BO) #else fld %st FST -8 * SIZE(AO) fxch %st(1) fld %st FST -7 * SIZE(AO) #endif FST 1 * SIZE(CO) FST 0 * SIZE(CO) #ifndef LN addl $2 * SIZE, CO #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 2), AO leal (BO, %eax, 1), BO #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl I jne .L31 ALIGN_4 .L49: #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax leal (B, %eax, 1), B #endif #if defined(LT) || defined(RN) movl BO, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/qtrsm_kernel_LT_2x2.S000066400000000000000000000451021313527062700207770ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #else #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #define PREFETCHSIZE (5 + 4 * 10) #define STACK 16 #define ARGS 16 #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define AORIG 8 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define ARG_B 36 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define ARG_LDC 44 + STACK + ARGS(%esp) #define OFFSET 48 + STACK + ARGS(%esp) #define I %esi #define B %ebx #define CO %edi #define AO %edx #define BO %ecx #define LDC %ebp #define PREFETCH_OFFSET 48 PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_LDC, LDC movl ARG_B, B sall $BASE_SHIFT, LDC addl $8 * SIZE, A addl $8 * SIZE, B #ifdef LN movl M, %eax sall $BASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $BASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull %ebp, %eax addl %eax, C #endif #ifdef RN movl OFFSET, %eax negl %eax movl %eax, KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax sarl $1, %eax movl %eax, J je .L30 ALIGN_4 .L01: #if defined(LT) || defined(RN) movl A, AO #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif lea (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, I sarl $1, I je .L20 ALIGN_4 .L11: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 2), AO leal (B, %eax, 2), BO #else movl B, BO #endif fldz fldz fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) prefetchw 2 * SIZE(CO, LDC, 1) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) prefetchnta 2 * SIZE(CO, LDC, 1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, 
%st(3) FLD -5 * SIZE(BO) fmul %st, %st(2) FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) FLD -4 * SIZE(AO) FLD -4 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -3 * SIZE(BO) fmul %st, %st(2) FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) FLD -2 * SIZE(AO) FLD -2 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -1 * SIZE(BO) fmul %st, %st(2) FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) addl $8 * SIZE,AO addl $8 * SIZE,BO decl %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $3, %eax je .L18 ALIGN_4 .L16: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) addl $2 * SIZE,AO addl $2 * SIZE,BO decl %eax jne .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 2), AO leal (B, %eax, 2), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) FLD -7 * SIZE(BO) fsubp %st, %st(2) FLD -6 * SIZE(BO) fsubp %st, %st(3) FLD -5 * SIZE(BO) fsubp %st, %st(4) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) FLD -7 * SIZE(AO) fsubp %st, %st(3) FLD -6 * SIZE(AO) fsubp %st, %st(2) FLD -5 * SIZE(AO) fsubp %st, %st(4) #endif #ifdef LN FLD -5 * SIZE(AO) fmul %st, %st(3) fmulp %st, %st(4) FLD -6 * SIZE(AO) fmul %st(3), %st FLD -6 * SIZE(AO) fmul %st(5), %st fsubrp %st, %st(3) fsubrp %st, %st(1) FLD -8 * SIZE(AO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef LT FLD -8 * SIZE(AO) fmul %st, %st(1) fmulp %st, %st(2) FLD -7 * SIZE(AO) fmul %st(1), %st FLD -7 * SIZE(AO) fmul %st(3), %st fsubrp %st, %st(5) fsubrp %st, %st(3) FLD -5 * SIZE(AO) fmul %st, %st(3) fmulp %st, %st(4) #endif #ifdef RN FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(3) FLD -7 * SIZE(BO) fmul %st(1), %st FLD -7 * SIZE(BO) fmul %st(4), %st fsubrp %st, %st(5) fsubrp %st, %st(2) FLD -5 * SIZE(BO) fmul %st, %st(2) fmulp %st, %st(4) #endif #ifdef RT FLD -5 * SIZE(BO) fmul %st, %st(2) fmulp %st, %st(4) FLD -6 * SIZE(BO) fmul %st(2), %st FLD -6 * SIZE(BO) fmul %st(5), %st fsubrp %st, %st(4) fsubrp %st, %st(1) FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(3) #endif #ifdef LN subl $2 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) fxch %st(1) fld %st FST -7 * SIZE(BO) fxch %st(2) fld %st FST -6 * SIZE(BO) fxch %st(3) fld %st FST -5 * SIZE(BO) FST 1 * SIZE(CO, LDC) FST 0 * SIZE(CO) FST 0 * SIZE(CO, LDC) FST 1 * SIZE(CO) #else fld %st FST -8 * SIZE(AO) fxch %st(2) fld %st FST -7 * SIZE(AO) fxch %st(1) fld %st FST -6 * SIZE(AO) fxch %st(3) fld %st FST -5 * SIZE(AO) FST 1 * SIZE(CO, LDC) FST 1 * SIZE(CO) FST 0 * SIZE(CO) FST 0 * SIZE(CO, LDC) #endif #ifndef LN addl $2 * SIZE, CO #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 2), AO leal (BO, %eax, 2), BO #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl I jne .L11 ALIGN_4 .L20: movl M, %eax andl $1, %eax je .L29 ALIGN_4 .L21: #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || 
defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 1), AO leal (B, %eax, 2), BO #else movl B, BO #endif fldz fldz #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -7 * SIZE(AO) FLD -6 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -5 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -6 * SIZE(AO) FLD -4 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -3 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -5 * SIZE(AO) FLD -2 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -1 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) addl $4 * SIZE,AO addl $8 * SIZE,BO decl %eax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $3, %eax je .L28 ALIGN_4 .L26: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) addl $1 * SIZE,AO addl $2 * SIZE,BO decl %eax jne .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 1), AO leal (B, %eax, 2), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) FLD -7 * SIZE(BO) fsubp %st, %st(2) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) FLD -7 * SIZE(AO) fsubp %st, %st(2) #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(AO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef RN FLD -8 * SIZE(BO) fmulp %st, %st(1) FLD -7 * SIZE(BO) fmul %st(1), %st fsubrp %st, %st(2) FLD -5 * SIZE(BO) fmulp %st, %st(2) #endif #ifdef RT FLD -5 * SIZE(BO) fmulp %st, %st(2) FLD -6 * SIZE(BO) fmul %st(2), %st fsubrp %st, %st(1) FLD -8 * SIZE(BO) fmulp %st, %st(1) #endif #ifdef LN subl $1 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) fxch %st(1) fld %st FST -7 * SIZE(BO) #else fld %st FST -8 * SIZE(AO) fxch %st(1) fld %st FST -7 * SIZE(AO) #endif FST 0 * SIZE(CO, LDC) FST 0 * SIZE(CO) #ifndef LN addl $1 * SIZE, CO #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 1), AO leal (BO, %eax, 2), BO #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L29: #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl BO, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J jne .L01 ALIGN_4 .L30: movl N, %eax testl $1, %eax je .L999 #if defined(LT) || defined(RN) movl A, AO #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, I sarl $1, I je .L40 ALIGN_4 .L31: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 2), AO leal (B, %eax, 1), BO #else movl B, BO #endif fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif 
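# %eax now holds the k trip count for this tile (KK for LT/RN, K - KK
# otherwise).  The loop at .L32 is unrolled four-fold; the k mod 4 remainder
# is picked up by the scalar loop at .L36.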
sarl $2, %eax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(BO) FLD -8 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -7 * SIZE(BO) FLD -6 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -5 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -6 * SIZE(BO) FLD -4 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -3 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -5 * SIZE(BO) FLD -2 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -1 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) addl $8 * SIZE,AO addl $4 * SIZE,BO decl %eax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $3, %eax je .L38 ALIGN_4 .L36: FLD -8 * SIZE(BO) FLD -8 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) addl $2 * SIZE,AO addl $1 * SIZE,BO decl %eax jne .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 2), AO leal (B, %eax, 1), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) FLD -7 * SIZE(BO) fsubp %st, %st(2) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) FLD -7 * SIZE(AO) fsubp %st, %st(2) #endif #ifdef LN FLD -5 * SIZE(AO) fmulp %st, %st(2) FLD -6 * SIZE(AO) fmul %st(2), %st fsubrp %st, %st(1) FLD -8 * SIZE(AO) fmulp %st, %st(1) #endif #ifdef LT FLD -8 * SIZE(AO) fmulp %st, %st(1) FLD -7 * SIZE(AO) fmul %st(1), %st fsubrp %st, %st(2) FLD -5 * SIZE(AO) fmulp %st, %st(2) #endif #ifdef RN FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef RT FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef LN subl $2 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) fxch %st(1) fld %st FST -7 * SIZE(BO) #else fld %st FST -8 * SIZE(AO) fxch %st(1) fld %st FST -7 * SIZE(AO) #endif FST 1 * SIZE(CO) FST 0 * SIZE(CO) #ifndef LN addl $2 * SIZE, CO #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 2), AO leal (BO, %eax, 1), BO #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl I jne .L31 ALIGN_4 .L40: movl M, %eax andl $1, %eax je .L49 ALIGN_4 .L41: #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 1), AO leal (B, %eax, 1), BO #else movl B, BO #endif fldz #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -7 * SIZE(AO) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -5 * SIZE(AO) FLD -5 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) addl $4 * SIZE,AO addl $4 * SIZE,BO decl %eax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $3, %eax je .L48 ALIGN_4 .L46: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) addl $1 * SIZE,AO addl $1 * SIZE,BO decl %eax jne .L46 ALIGN_4 .L48: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 1), AO leal 
(B, %eax, 1), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) #endif #ifdef LN FLD -8 * SIZE(AO) fmulp %st, %st(1) #endif #ifdef LT FLD -8 * SIZE(AO) fmulp %st, %st(1) #endif #ifdef RN FLD -8 * SIZE(BO) fmulp %st, %st(1) #endif #ifdef RT FLD -8 * SIZE(BO) fmulp %st, %st(1) #endif #ifdef LN subl $1 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) #else fld %st FST -8 * SIZE(AO) #endif FST 0 * SIZE(CO) #ifndef LN addl $1 * SIZE, CO #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 1), AO leal (BO, %eax, 1), BO #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L49: #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax leal (B, %eax, 1), B #endif #if defined(LT) || defined(RN) movl BO, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/qtrsm_kernel_RT_2x2.S000066400000000000000000000451401313527062700210070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #else #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #define PREFETCHSIZE (5 + 4 * 10) #define STACK 16 #define ARGS 16 #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define ARG_B 36 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define ARG_LDC 44 + STACK + ARGS(%esp) #define OFFSET 48 + STACK + ARGS(%esp) #define I %esi #define B %ebx #define CO %edi #define AO %edx #define BO %ecx #define LDC %ebp #define PREFETCH_OFFSET 48 PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_LDC, LDC movl ARG_B, B sall $BASE_SHIFT, LDC addl $8 * SIZE, A addl $8 * SIZE, B #ifdef LN movl M, %eax sall $BASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $BASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull %ebp, %eax addl %eax, C #endif #ifdef RN movl OFFSET, %eax negl %eax movl %eax, KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax testl $1, %eax je .L30 #if defined(LT) || defined(RN) movl A, AO #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, I sarl $1, I je .L40 ALIGN_4 .L31: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 2), AO leal (B, %eax, 1), BO #else movl B, BO #endif fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(BO) FLD -8 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -7 * SIZE(BO) FLD -6 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -5 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -6 * SIZE(BO) FLD -4 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -3 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -5 * SIZE(BO) FLD -2 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -1 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) addl $8 * SIZE,AO addl $4 * SIZE,BO decl %eax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $3, %eax je .L38 ALIGN_4 .L36: FLD -8 * SIZE(BO) FLD -8 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) addl $2 * SIZE,AO addl $1 * SIZE,BO decl %eax jne .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 2), AO leal (B, %eax, 1), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) FLD -7 * SIZE(BO) fsubp %st, %st(2) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) FLD -7 * SIZE(AO) 
fsubp %st, %st(2) #endif #ifdef LN FLD -5 * SIZE(AO) fmulp %st, %st(2) FLD -6 * SIZE(AO) fmul %st(2), %st fsubrp %st, %st(1) FLD -8 * SIZE(AO) fmulp %st, %st(1) #endif #ifdef LT FLD -8 * SIZE(AO) fmulp %st, %st(1) FLD -7 * SIZE(AO) fmul %st(1), %st fsubrp %st, %st(2) FLD -5 * SIZE(AO) fmulp %st, %st(2) #endif #ifdef RN FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef RT FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef LN subl $2 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) fxch %st(1) fld %st FST -7 * SIZE(BO) #else fld %st FST -8 * SIZE(AO) fxch %st(1) fld %st FST -7 * SIZE(AO) #endif FST 1 * SIZE(CO) FST 0 * SIZE(CO) #ifndef LN addl $2 * SIZE, CO #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 2), AO leal (BO, %eax, 1), BO #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl I jne .L31 ALIGN_4 .L40: movl M, %eax andl $1, %eax je .L49 ALIGN_4 .L41: #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 1), AO leal (B, %eax, 1), BO #else movl B, BO #endif fldz #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -7 * SIZE(AO) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -5 * SIZE(AO) FLD -5 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) addl $4 * SIZE,AO addl $4 * SIZE,BO decl %eax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $3, %eax je .L48 ALIGN_4 .L46: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) addl $1 * SIZE,AO addl $1 * SIZE,BO decl %eax jne .L46 ALIGN_4 .L48: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 1), AO leal (B, %eax, 1), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) #endif #ifdef LN FLD -8 * SIZE(AO) fmulp %st, %st(1) #endif #ifdef LT FLD -8 * SIZE(AO) fmulp %st, %st(1) #endif #ifdef RN FLD -8 * SIZE(BO) fmulp %st, %st(1) #endif #ifdef RT FLD -8 * SIZE(BO) fmulp %st, %st(1) #endif #ifdef LN subl $1 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) #else fld %st FST -8 * SIZE(AO) #endif FST 0 * SIZE(CO) #ifndef LN addl $1 * SIZE, CO #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 1), AO leal (BO, %eax, 1), BO #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L49: #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax leal (B, %eax, 1), B #endif #if defined(LT) || defined(RN) movl BO, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L30: movl N, %eax sarl $1, %eax movl %eax, J je .L999 ALIGN_4 .L01: #if defined(LT) || defined(RN) movl A, AO #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif lea (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO #ifndef RT addl 
%eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, I sarl $1, I je .L20 ALIGN_4 .L11: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 2), AO leal (B, %eax, 2), BO #else movl B, BO #endif fldz fldz fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) prefetchw 2 * SIZE(CO, LDC, 1) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) prefetchnta 2 * SIZE(CO, LDC, 1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -5 * SIZE(BO) fmul %st, %st(2) FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) FLD -4 * SIZE(AO) FLD -4 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -3 * SIZE(BO) fmul %st, %st(2) FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) FLD -2 * SIZE(AO) FLD -2 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -1 * SIZE(BO) fmul %st, %st(2) FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) addl $8 * SIZE,AO addl $8 * SIZE,BO decl %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $3, %eax je .L18 ALIGN_4 .L16: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) addl $2 * SIZE,AO addl $2 * SIZE,BO decl %eax jne .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 2), AO leal (B, %eax, 2), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) FLD -7 * SIZE(BO) fsubp %st, %st(2) FLD -6 * SIZE(BO) fsubp %st, %st(3) FLD -5 * SIZE(BO) fsubp %st, %st(4) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) FLD -7 * SIZE(AO) fsubp %st, %st(3) FLD -6 * SIZE(AO) fsubp %st, %st(2) FLD -5 * SIZE(AO) fsubp %st, %st(4) #endif #ifdef LN FLD -5 * SIZE(AO) fmul %st, %st(3) fmulp %st, %st(4) FLD -6 * SIZE(AO) fmul %st(3), %st FLD -6 * SIZE(AO) fmul %st(5), %st fsubrp %st, %st(3) fsubrp %st, %st(1) FLD -8 * SIZE(AO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef LT FLD -8 * SIZE(AO) fmul %st, %st(1) fmulp %st, %st(2) FLD -7 * SIZE(AO) fmul %st(1), %st FLD -7 * SIZE(AO) fmul %st(3), %st fsubrp %st, %st(5) fsubrp %st, %st(3) FLD -5 * SIZE(AO) fmul %st, %st(3) fmulp %st, %st(4) #endif #ifdef RN FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(3) FLD -7 * SIZE(BO) fmul %st(1), %st FLD -7 * SIZE(BO) fmul %st(4), %st fsubrp %st, %st(5) fsubrp %st, %st(2) FLD -5 * SIZE(BO) fmul %st, %st(2) fmulp %st, %st(4) #endif #ifdef RT FLD -5 * SIZE(BO) fmul %st, %st(2) fmulp %st, %st(4) FLD -6 * SIZE(BO) fmul %st(2), %st FLD -6 * SIZE(BO) fmul %st(5), %st fsubrp %st, %st(4) fsubrp %st, %st(1) FLD -8 * 
SIZE(BO) fmul %st, %st(1) fmulp %st, %st(3) #endif #ifdef LN subl $2 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) fxch %st(1) fld %st FST -7 * SIZE(BO) fxch %st(2) fld %st FST -6 * SIZE(BO) fxch %st(3) fld %st FST -5 * SIZE(BO) FST 1 * SIZE(CO, LDC) FST 0 * SIZE(CO) FST 0 * SIZE(CO, LDC) FST 1 * SIZE(CO) #else fld %st FST -8 * SIZE(AO) fxch %st(2) fld %st FST -7 * SIZE(AO) fxch %st(1) fld %st FST -6 * SIZE(AO) fxch %st(3) fld %st FST -5 * SIZE(AO) FST 1 * SIZE(CO, LDC) FST 1 * SIZE(CO) FST 0 * SIZE(CO) FST 0 * SIZE(CO, LDC) #endif #ifndef LN addl $2 * SIZE, CO #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 2), AO leal (BO, %eax, 2), BO #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl I jne .L11 ALIGN_4 .L20: movl M, %eax andl $1, %eax je .L29 ALIGN_4 .L21: #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 1), AO leal (B, %eax, 2), BO #else movl B, BO #endif fldz fldz #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -7 * SIZE(AO) FLD -6 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -5 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -6 * SIZE(AO) FLD -4 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -3 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -5 * SIZE(AO) FLD -2 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -1 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) addl $4 * SIZE,AO addl $8 * SIZE,BO decl %eax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $3, %eax je .L28 ALIGN_4 .L26: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) addl $1 * SIZE,AO addl $2 * SIZE,BO decl %eax jne .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif sall $BASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 1), AO leal (B, %eax, 2), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) FLD -7 * SIZE(BO) fsubp %st, %st(2) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) FLD -7 * SIZE(AO) fsubp %st, %st(2) #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(AO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef RN FLD -8 * SIZE(BO) fmulp %st, %st(1) FLD -7 * SIZE(BO) fmul %st(1), %st fsubrp %st, %st(2) FLD -5 * SIZE(BO) fmulp %st, %st(2) #endif #ifdef RT FLD -5 * SIZE(BO) fmulp %st, %st(2) FLD -6 * SIZE(BO) fmul %st(2), %st fsubrp %st, %st(1) FLD -8 * SIZE(BO) fmulp %st, %st(1) #endif #ifdef LN subl $1 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) fxch %st(1) fld %st FST -7 * SIZE(BO) #else fld %st FST -8 * SIZE(AO) fxch %st(1) fld %st FST -7 * SIZE(AO) #endif FST 0 * SIZE(CO, LDC) FST 0 * SIZE(CO) #ifndef LN addl $1 * SIZE, CO #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 1), AO leal (BO, %eax, 2), BO #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L29: #ifdef LN 
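# (.L29) end of a 2-column panel: under LN, advance B past the 2 x K block
# just consumed; the LT/RN path instead takes B from the running BO pointer,
# and KK is adjusted by 2 for RN/RT before moving on to the next panel.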
movl K, %eax sall $BASE_SHIFT, %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl BO, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J jne .L01 ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/rot.S000066400000000000000000000154231313527062700160060ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) #define STACK_C 24 + STACK + ARGS(%esp) #ifdef XDOUBLE #define STACK_S 40 + STACK + ARGS(%esp) #elif defined DOUBLE #define STACK_S 32 + STACK + ARGS(%esp) #else #define STACK_S 28 + STACK + ARGS(%esp) #endif #define N %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx #define I %eax #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCH_SIZE 144 #endif #ifdef OPTERON #define PREFETCH prefetchw #define PREFETCH_SIZE 144 #endif PROLOGUE pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif movl STACK_N, N movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY FLD STACK_S FLD STACK_C sall $BASE_SHIFT, INCX sall $BASE_SHIFT, INCY testl N, N jle .L999 cmpl $SIZE, INCX jne .L50 cmpl $SIZE, INCY jne .L50 movl N, I sarl $2, I jle .L15 ALIGN_4 .L10: #ifdef PENTIUM4 PREFETCH (PREFETCH_SIZE + 0) * SIZE(X) #endif #ifdef OPTERON PREFETCH (PREFETCH_SIZE + 0) * SIZE(X) #endif FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) FLD 1 * SIZE(X) FLD 1 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 1 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 1 * SIZE(Y) #ifdef PENTIUM4 PREFETCH (PREFETCH_SIZE + 0) * SIZE(Y) #endif #ifdef OPTERON PREFETCH (PREFETCH_SIZE + 0) * SIZE(Y) #endif FLD 2 * SIZE(X) FLD 2 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 2 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 2 * SIZE(Y) FLD 3 * SIZE(X) FLD 3 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 3 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 3 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y decl I jg .L10 ALIGN_4 .L15: movl N, I andl $3, I jle .L999 ALIGN_4 .L16: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) addl $SIZE, X addl $SIZE, Y decl I jg .L16 jmp .L999 ALIGN_4 .L50: movl N, I sarl $2, I jle .L55 ALIGN_4 .L51: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) addl INCX, X addl INCY, Y FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) addl INCX, X addl INCY, Y FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) addl INCX, X addl INCY, Y FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp 
%st, %st(1) FST 0 * SIZE(Y) addl INCX, X addl INCY, Y decl I jg .L51 ALIGN_4 .L55: movl N, I andl $3, I jle .L999 ALIGN_4 .L56: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) addl INCX, X addl INCY, Y decl I jg .L56 ALIGN_4 .L999: ffreep %st(0) ffreep %st(0) popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/rot_sse.S000066400000000000000000000433001313527062700166530ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) #define STACK_C 24 + STACK + ARGS(%esp) #define STACK_S 28 + STACK + ARGS(%esp) #define N %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx #define I %eax #define C %xmm6 #define S %xmm7 #include "l1param.h" PROLOGUE PROFCODE pushl %edi pushl %esi pushl %ebx movl STACK_N, N movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY leal (, INCX, SIZE), INCX leal (, INCY, SIZE), INCY movss STACK_C, C movss STACK_S, S shufps $0x0, C, C shufps $0x0, S, S cmpl $0, N jle .L999 cmpl $SIZE, INCX jne .L50 cmpl $SIZE, INCY jne .L50 testl $SIZE, X je .L05 movss 0 * SIZE(Y), %xmm1 movss 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulss C, %xmm0 mulss S, %xmm1 mulss C, %xmm2 mulss S, %xmm3 addss %xmm1, %xmm0 subss %xmm3, %xmm2 movss %xmm0, 0 * SIZE(X) movss %xmm2, 0 * SIZE(Y) addl $1 * SIZE, X addl $1 * SIZE, Y decl N jle .L999 .L05: testl $2 * SIZE, X je .L10 cmpl $1, N je .L17 #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y subl $2, N jle .L999 ALIGN_2 .L10: testl $3 * SIZE, Y jne .L20 movl N, I sarl $5, I jle .L14 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 2 * SIZE(Y) movsd 4 * SIZE(Y), %xmm1 movhps 6 * SIZE(Y), %xmm1 movaps 4 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps %xmm0, 4 * SIZE(X) movlps %xmm2, 4 * SIZE(Y) movhps %xmm2, 6 * SIZE(Y) movsd 8 * SIZE(Y), %xmm1 movhps 10 * SIZE(Y), %xmm1 movaps 8 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 8 * SIZE(X) movlps %xmm2, 8 * SIZE(Y) movhps %xmm2, 10 * SIZE(Y) movsd 12 * SIZE(Y), %xmm1 movhps 14 * SIZE(Y), %xmm1 movaps 12 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 12 * SIZE(X) movlps %xmm2, 12 * SIZE(Y) movhps %xmm2, 14 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movsd 16 * SIZE(Y), %xmm1 movhps 18 * SIZE(Y), %xmm1 movaps 16 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 16 * SIZE(X) movlps %xmm2, 16 * SIZE(Y) movhps %xmm2, 18 * SIZE(Y) movsd 20 * SIZE(Y), %xmm1 movhps 22 * SIZE(Y), %xmm1 movaps 20 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, 
%xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 20 * SIZE(X) movlps %xmm2, 20 * SIZE(Y) movhps %xmm2, 22 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movsd 24 * SIZE(Y), %xmm1 movhps 26 * SIZE(Y), %xmm1 movaps 24 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 24 * SIZE(X) movlps %xmm2, 24 * SIZE(Y) movhps %xmm2, 26 * SIZE(Y) movsd 28 * SIZE(Y), %xmm1 movhps 30 * SIZE(Y), %xmm1 movaps 28 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 28 * SIZE(X) movlps %xmm2, 28 * SIZE(Y) movhps %xmm2, 30 * SIZE(Y) addl $32 * SIZE, X addl $32 * SIZE, Y decl I jg .L11 ALIGN_3 .L14: testl $31, N jle .L999 testl $16, N jle .L15 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 2 * SIZE(Y) movsd 4 * SIZE(Y), %xmm1 movhps 6 * SIZE(Y), %xmm1 movaps 4 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 4 * SIZE(X) movlps %xmm2, 4 * SIZE(Y) movhps %xmm2, 6 * SIZE(Y) movsd 8 * SIZE(Y), %xmm1 movhps 10 * SIZE(Y), %xmm1 movaps 8 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 8 * SIZE(X) movlps %xmm2, 8 * SIZE(Y) movhps %xmm2, 10 * SIZE(Y) movsd 12 * SIZE(Y), %xmm1 movhps 14 * SIZE(Y), %xmm1 movaps 12 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 12 * SIZE(X) movlps %xmm2, 12 * SIZE(Y) movhps %xmm2, 14 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L15: testl $8, N jle .L16 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 2 * SIZE(Y) movsd 4 * SIZE(Y), %xmm1 movhps 6 * SIZE(Y), %xmm1 movaps 4 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 4 * SIZE(X) movlps %xmm2, 4 * SIZE(Y) movhps %xmm2, 6 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L16: testl $4, N jle .L17 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 2 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L17: testl $2, N jle .L18 #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 
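/* The final odd element of the unit-stride path is handled below at .L18.   */
/* Every element pair in this kernel, vectorized or scalar, is updated as    */
/* the BLAS plane (Givens) rotation                                           */
/*     x[i] = c*x[i] + s*y[i];                                                */
/*     y[i] = c*y[i] - s*x[i];                                                */
/* so the scalar C equivalent of the unrolled SSE blocks above is simply:     */
/*     for (i = 0; i < n; i++) {                                              */
/*         float xi = x[i], yi = y[i];                                        */
/*         x[i] = c*xi + s*yi;                                                */
/*         y[i] = c*yi - s*xi;                                                */
/*     }                                                                      */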
.L18: testl $1, N jle .L999 movss 0 * SIZE(Y), %xmm1 movss 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulss C, %xmm0 mulss S, %xmm1 mulss C, %xmm2 mulss S, %xmm3 addss %xmm1, %xmm0 subss %xmm3, %xmm2 movss %xmm0, 0 * SIZE(X) movss %xmm2, 0 * SIZE(Y) jmp .L999 ALIGN_3 .L20: movl N, I sarl $5, I jle .L24 ALIGN_3 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 2 * SIZE(Y) movsd 4 * SIZE(Y), %xmm1 movhps 6 * SIZE(Y), %xmm1 movaps 4 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 4 * SIZE(X) movlps %xmm2, 4 * SIZE(Y) movhps %xmm2, 6 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movsd 8 * SIZE(Y), %xmm1 movhps 10 * SIZE(Y), %xmm1 movaps 8 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 8 * SIZE(X) movlps %xmm2, 8 * SIZE(Y) movhps %xmm2, 10 * SIZE(Y) movsd 12 * SIZE(Y), %xmm1 movhps 14 * SIZE(Y), %xmm1 movaps 12 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 12 * SIZE(X) movlps %xmm2, 12 * SIZE(Y) movhps %xmm2, 14 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movsd 16 * SIZE(Y), %xmm1 movhps 18 * SIZE(Y), %xmm1 movaps 16 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 16 * SIZE(X) movlps %xmm2, 16 * SIZE(Y) movhps %xmm2, 18 * SIZE(Y) movsd 20 * SIZE(Y), %xmm1 movhps 22 * SIZE(Y), %xmm1 movaps 20 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 20 * SIZE(X) movlps %xmm2, 20 * SIZE(Y) movhps %xmm2, 22 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movsd 24 * SIZE(Y), %xmm1 movhps 26 * SIZE(Y), %xmm1 movaps 24 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 24 * SIZE(X) movlps %xmm2, 24 * SIZE(Y) movhps %xmm2, 26 * SIZE(Y) movsd 28 * SIZE(Y), %xmm1 movhps 30 * SIZE(Y), %xmm1 movaps 28 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 28 * SIZE(X) movlps %xmm2, 28 * SIZE(Y) movhps %xmm2, 30 * SIZE(Y) addl $32 * SIZE, X addl $32 * SIZE, Y decl I jg .L21 ALIGN_3 .L24: testl $31, N jle .L999 testl $16, N jle .L25 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 2 * SIZE(Y) movsd 4 * SIZE(Y), %xmm1 movhps 6 * SIZE(Y), %xmm1 movaps 4 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 
mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 4 * SIZE(X) movlps %xmm2, 4 * SIZE(Y) movhps %xmm2, 6 * SIZE(Y) movsd 8 * SIZE(Y), %xmm1 movhps 10 * SIZE(Y), %xmm1 movaps 8 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 8 * SIZE(X) movlps %xmm2, 8 * SIZE(Y) movhps %xmm2, 10 * SIZE(Y) movsd 12 * SIZE(Y), %xmm1 movhps 14 * SIZE(Y), %xmm1 movaps 12 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 12 * SIZE(X) movlps %xmm2, 12 * SIZE(Y) movhps %xmm2, 14 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L25: testl $8, N jle .L26 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 2 * SIZE(Y) movsd 4 * SIZE(Y), %xmm1 movhps 6 * SIZE(Y), %xmm1 movaps 4 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 4 * SIZE(X) movlps %xmm2, 4 * SIZE(Y) movhps %xmm2, 6 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L26: testl $4, N jle .L27 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 2 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L27: testl $2, N jle .L28 #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L28: testl $1, N jle .L999 movss 0 * SIZE(Y), %xmm1 movss 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulss C, %xmm0 mulss S, %xmm1 mulss C, %xmm2 mulss S, %xmm3 addss %xmm1, %xmm0 subss %xmm3, %xmm2 movss %xmm0, 0 * SIZE(X) movss %xmm2, 0 * SIZE(Y) jmp .L999 ALIGN_3 .L50: movl N, I sarl $2, I jle .L55 ALIGN_3 .L53: movss (Y), %xmm1 movss (X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulss C, %xmm0 mulss S, %xmm1 mulss C, %xmm2 mulss S, %xmm3 addss %xmm1, %xmm0 subss %xmm3, %xmm2 movss %xmm0, (X) movss %xmm2, (Y) addl INCX, X addl INCY, Y movss (Y), %xmm1 movss (X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulss C, %xmm0 mulss S, %xmm1 mulss C, %xmm2 mulss S, %xmm3 addss %xmm1, %xmm0 subss %xmm3, %xmm2 movss %xmm0, (X) movss %xmm2, (Y) addl INCX, X addl INCY, Y movss (Y), %xmm1 movss (X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulss C, %xmm0 mulss S, %xmm1 mulss C, %xmm2 mulss S, %xmm3 addss %xmm1, %xmm0 subss %xmm3, %xmm2 movss %xmm0, (X) movss %xmm2, (Y) addl INCX, X addl INCY, Y movss (Y), %xmm1 movss (X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulss C, %xmm0 mulss S, %xmm1 mulss C, %xmm2 mulss S, %xmm3 addss %xmm1, %xmm0 subss %xmm3, %xmm2 movss %xmm0, (X) movss %xmm2, (Y) addl INCX, X addl INCY, Y decl I jg .L53 ALIGN_3 .L55: movl N, I andl $3, I jle .L999 ALIGN_3 .L56: movss (Y), %xmm1 movss (X), %xmm0 movaps %xmm1, %xmm2 
movaps %xmm0, %xmm3 mulss C, %xmm0 mulss S, %xmm1 mulss C, %xmm2 mulss S, %xmm3 addss %xmm1, %xmm0 subss %xmm3, %xmm2 movss %xmm0, (X) movss %xmm2, (Y) addl INCX, X addl INCY, Y decl I jg .L56 ALIGN_3 .L999: popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/rot_sse2.S000066400000000000000000000366541313527062700167530ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) #define STACK_C 24 + STACK + ARGS(%esp) #define STACK_S 32 + STACK + ARGS(%esp) #define N %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx #define I %eax #include "l1param.h" #define C %xmm6 #define S %xmm7 PROLOGUE PROFCODE pushl %edi pushl %esi pushl %ebx movl STACK_N, N movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY leal (, INCX, SIZE), INCX leal (, INCY, SIZE), INCY movsd STACK_C, C movsd STACK_S, S pshufd $0x44, C, C pshufd $0x44, S, S cmpl $0, N jle .L999 cmpl $SIZE, INCX jne .L50 cmpl $SIZE, INCY jne .L50 testl $SIZE, X je .L10 movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulsd C, %xmm0 mulsd S, %xmm1 mulsd C, %xmm2 mulsd S, %xmm3 addsd %xmm1, %xmm0 subsd %xmm3, %xmm2 movsd %xmm0, 0 * SIZE(X) movsd %xmm2, 0 * SIZE(Y) addl $1 * SIZE, X addl $1 * SIZE, Y decl N jle .L999 ALIGN_2 .L10: testl $SIZE, Y jne .L20 movl N, I sarl $4, I jle .L14 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movapd 0 * SIZE(Y), %xmm1 movapd 0 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movapd %xmm2, 0 * SIZE(Y) movapd 2 * SIZE(Y), %xmm1 movapd 2 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 2 * SIZE(X) movapd %xmm2, 2 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movapd 4 * SIZE(Y), %xmm1 movapd 4 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 4 * SIZE(X) movapd %xmm2, 4 * SIZE(Y) movapd 6 * SIZE(Y), %xmm1 movapd 6 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 6 * SIZE(X) movapd %xmm2, 6 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movapd 8 * SIZE(Y), %xmm1 movapd 8 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 8 * SIZE(X) movapd %xmm2, 8 * SIZE(Y) movapd 10 * SIZE(Y), %xmm1 movapd 10 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 10 * SIZE(X) movapd %xmm2, 10 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movapd 12 * SIZE(Y), %xmm1 movapd 12 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 12 * SIZE(X) movapd %xmm2, 12 * SIZE(Y) movapd 14 * SIZE(Y), %xmm1 movapd 14 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 14 * SIZE(X) movapd %xmm2, 14 * SIZE(Y) addl $16 * SIZE, X 
addl $16 * SIZE, Y decl I jg .L11 ALIGN_3 .L14: testl $15, N jle .L999 testl $8, N jle .L15 movapd 0 * SIZE(Y), %xmm1 movapd 0 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movapd %xmm2, 0 * SIZE(Y) movapd 2 * SIZE(Y), %xmm1 movapd 2 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 2 * SIZE(X) movapd %xmm2, 2 * SIZE(Y) movapd 4 * SIZE(Y), %xmm1 movapd 4 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 4 * SIZE(X) movapd %xmm2, 4 * SIZE(Y) movapd 6 * SIZE(Y), %xmm1 movapd 6 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 6 * SIZE(X) movapd %xmm2, 6 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L15: testl $4, N jle .L16 movapd 0 * SIZE(Y), %xmm1 movapd 0 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movapd %xmm2, 0 * SIZE(Y) movapd 2 * SIZE(Y), %xmm1 movapd 2 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 2 * SIZE(X) movapd %xmm2, 2 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L16: testl $2, N jle .L17 movapd 0 * SIZE(Y), %xmm1 movapd 0 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movapd %xmm2, 0 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L17: testl $1, N jle .L999 movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulsd C, %xmm0 mulsd S, %xmm1 mulsd C, %xmm2 mulsd S, %xmm3 addsd %xmm1, %xmm0 subsd %xmm3, %xmm2 movsd %xmm0, 0 * SIZE(X) movsd %xmm2, 0 * SIZE(Y) jmp .L999 ALIGN_3 .L20: movapd -1 * SIZE(Y), %xmm1 movl N, I sarl $4, I jle .L24 ALIGN_3 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movapd 1 * SIZE(Y), %xmm4 movapd 0 * SIZE(X), %xmm0 SHUFPD_1 %xmm4, %xmm1 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movlpd %xmm2, 0 * SIZE(Y) movhpd %xmm2, 1 * SIZE(Y) movapd 3 * SIZE(Y), %xmm1 movapd 2 * SIZE(X), %xmm0 SHUFPD_1 %xmm1, %xmm4 movapd %xmm4, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm4 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm4, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 2 * SIZE(X) movlpd %xmm2, 2 * SIZE(Y) movhpd %xmm2, 3 * SIZE(Y) movapd 5 * SIZE(Y), %xmm4 movapd 4 * SIZE(X), %xmm0 SHUFPD_1 %xmm4, %xmm1 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 4 * SIZE(X) movlpd %xmm2, 4 * SIZE(Y) movhpd %xmm2, 5 * SIZE(Y) movapd 7 * SIZE(Y), %xmm1 movapd 6 * SIZE(X), %xmm0 SHUFPD_1 %xmm1, %xmm4 movapd %xmm4, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm4 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm4, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 6 * SIZE(X) movlpd %xmm2, 6 * SIZE(Y) movhpd %xmm2, 7 * 
SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movapd 9 * SIZE(Y), %xmm4 movapd 8 * SIZE(X), %xmm0 SHUFPD_1 %xmm4, %xmm1 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 8 * SIZE(X) movlpd %xmm2, 8 * SIZE(Y) movhpd %xmm2, 9 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movapd 11 * SIZE(Y), %xmm1 movapd 10 * SIZE(X), %xmm0 SHUFPD_1 %xmm1, %xmm4 movapd %xmm4, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm4 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm4, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 10 * SIZE(X) movlpd %xmm2, 10 * SIZE(Y) movhpd %xmm2, 11 * SIZE(Y) movapd 13 * SIZE(Y), %xmm4 movapd 12 * SIZE(X), %xmm0 SHUFPD_1 %xmm4, %xmm1 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 12 * SIZE(X) movlpd %xmm2, 12 * SIZE(Y) movhpd %xmm2, 13 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movapd 15 * SIZE(Y), %xmm1 movapd 14 * SIZE(X), %xmm0 SHUFPD_1 %xmm1, %xmm4 movapd %xmm4, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm4 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm4, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 14 * SIZE(X) movlpd %xmm2, 14 * SIZE(Y) movhpd %xmm2, 15 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y decl I jg .L21 ALIGN_3 .L24: testl $15, N jle .L999 testl $8, N jle .L25 movapd 1 * SIZE(Y), %xmm4 movapd 0 * SIZE(X), %xmm0 SHUFPD_1 %xmm4, %xmm1 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movlpd %xmm2, 0 * SIZE(Y) movhpd %xmm2, 1 * SIZE(Y) movapd 3 * SIZE(Y), %xmm1 movapd 2 * SIZE(X), %xmm0 SHUFPD_1 %xmm1, %xmm4 movapd %xmm4, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm4 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm4, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 2 * SIZE(X) movlpd %xmm2, 2 * SIZE(Y) movhpd %xmm2, 3 * SIZE(Y) movapd 5 * SIZE(Y), %xmm4 movapd 4 * SIZE(X), %xmm0 SHUFPD_1 %xmm4, %xmm1 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 4 * SIZE(X) movlpd %xmm2, 4 * SIZE(Y) movhpd %xmm2, 5 * SIZE(Y) movapd 7 * SIZE(Y), %xmm1 movapd 6 * SIZE(X), %xmm0 SHUFPD_1 %xmm1, %xmm4 movapd %xmm4, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm4 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm4, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 6 * SIZE(X) movlpd %xmm2, 6 * SIZE(Y) movhpd %xmm2, 7 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L25: testl $4, N jle .L26 movapd 1 * SIZE(Y), %xmm4 movapd 0 * SIZE(X), %xmm0 SHUFPD_1 %xmm4, %xmm1 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movlpd %xmm2, 0 * SIZE(Y) movhpd %xmm2, 1 * SIZE(Y) movapd 3 * SIZE(Y), %xmm1 movapd 2 * SIZE(X), %xmm0 SHUFPD_1 %xmm1, %xmm4 movapd %xmm4, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm4 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm4, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 2 * SIZE(X) movlpd %xmm2, 2 * SIZE(Y) movhpd %xmm2, 3 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L26: testl $2, N jle .L27 movapd 1 * SIZE(Y), %xmm4 movapd 0 * SIZE(X), %xmm0 SHUFPD_1 %xmm4, %xmm1 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, 
%xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movlpd %xmm2, 0 * SIZE(Y) movhpd %xmm2, 1 * SIZE(Y) movapd %xmm4, %xmm1 addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L27: testl $1, N jle .L999 unpckhpd %xmm1, %xmm1 movsd 0 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulsd C, %xmm0 mulsd S, %xmm1 mulsd C, %xmm2 mulsd S, %xmm3 addsd %xmm1, %xmm0 subsd %xmm3, %xmm2 movsd %xmm0, 0 * SIZE(X) movsd %xmm2, 0 * SIZE(Y) jmp .L999 ALIGN_3 .L50: movl N, I cmpl $0, INCX je .L56 cmpl $0, INCY je .L56 sarl $2, I jle .L55 ALIGN_3 .L53: movsd (Y), %xmm1 movhpd (Y, INCY), %xmm1 movsd (X), %xmm0 movhpd (X, INCX), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlpd %xmm0, (X) movhpd %xmm0, (X, INCX) movlpd %xmm2, (Y) movhpd %xmm2, (Y, INCY) leal (X, INCX, 2), X leal (Y, INCY, 2), Y movsd (Y), %xmm1 movhpd (Y, INCY), %xmm1 movsd (X), %xmm0 movhpd (X, INCX), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlpd %xmm0, (X) movhpd %xmm0, (X, INCX) movlpd %xmm2, (Y) movhpd %xmm2, (Y, INCY) leal (X, INCX, 2), X leal (Y, INCY, 2), Y decl I jg .L53 ALIGN_3 .L55: movl N, I andl $3, I jle .L999 ALIGN_3 .L56: movsd (Y), %xmm1 movsd (X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulsd C, %xmm0 mulsd S, %xmm1 mulsd C, %xmm2 mulsd S, %xmm3 addsd %xmm1, %xmm0 subsd %xmm3, %xmm2 movsd %xmm0, (X) movsd %xmm2, (Y) addl INCX, X addl INCY, Y decl I jg .L56 ALIGN_3 .L999: popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/scal.S000066400000000000000000000153031313527062700161210ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" PROLOGUE pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif movl 16(%esp),%edx FLD 28(%esp) #ifdef XDOUBLE movl 44(%esp),%edi movl 48(%esp),%esi #elif defined(DOUBLE) movl 36(%esp),%edi movl 40(%esp),%esi #else movl 32(%esp),%edi movl 36(%esp),%esi #endif ftst fnstsw %ax andb $68, %ah je .L300 # Alpha != ZERO /* Alpha == ZERO */ cmpl $1,%esi jne .L104 movl %edx, %ecx # ecx = n sarl $3, %ecx # (n >> 3) jle .L102 ALIGN_4 .L101: #ifndef XDOUBLE FSTU 0 * SIZE(%edi) FSTU 1 * SIZE(%edi) FSTU 2 * SIZE(%edi) FSTU 3 * SIZE(%edi) FSTU 4 * SIZE(%edi) FSTU 5 * SIZE(%edi) FSTU 6 * SIZE(%edi) FSTU 7 * SIZE(%edi) #else fld %st FST 0 * SIZE(%edi) fld %st FST 1 * SIZE(%edi) fld %st FST 2 * SIZE(%edi) fld %st FST 3 * SIZE(%edi) fld %st FST 4 * SIZE(%edi) fld %st FST 5 * SIZE(%edi) fld %st FST 6 * SIZE(%edi) fld %st FST 7 * SIZE(%edi) #endif addl $8 * SIZE, %edi decl %ecx jg .L101 ALIGN_4 .L102: movl %edx, %ecx andl $7, %ecx jle .L999 ALIGN_4 .L103: #ifndef XDOUBLE FSTU 0 * SIZE(%edi) #else fld %st FST 0 * SIZE(%edi) #endif addl $SIZE, %edi decl %ecx jg .L103 jmp .L999 ALIGN_4 .L104: sall $BASE_SHIFT, %esi movl %edx, %ecx # ecx = n sarl $3, %ecx # (n >> 3) jle .L106 ALIGN_4 .L105: #ifndef XDOUBLE FSTU 0 * SIZE(%edi) addl %esi, %edi FSTU 0 * SIZE(%edi) addl %esi, %edi FSTU 0 * SIZE(%edi) addl %esi, %edi FSTU 0 * SIZE(%edi) addl %esi, %edi FSTU 0 * SIZE(%edi) addl %esi, %edi FSTU 0 * SIZE(%edi) addl %esi, %edi FSTU 0 * SIZE(%edi) addl %esi, %edi FSTU 0 * SIZE(%edi) addl %esi, %edi #else fld %st FST 0 * SIZE(%edi) addl %esi, %edi fld %st FST 0 * SIZE(%edi) addl %esi, %edi fld %st FST 0 * SIZE(%edi) addl %esi, %edi fld %st FST 0 * SIZE(%edi) addl %esi, %edi fld %st FST 0 * SIZE(%edi) addl %esi, %edi fld %st FST 0 * SIZE(%edi) addl %esi, %edi fld %st FST 0 * SIZE(%edi) addl %esi, %edi fld %st FST 0 * SIZE(%edi) addl %esi, %edi #endif decl %ecx jg .L105 ALIGN_4 .L106: movl %edx, %ecx andl $7, %ecx jle .L999 ALIGN_4 .L107: #ifndef XDOUBLE FSTU 0 * SIZE(%edi) #else fld %st FST 0 * SIZE(%edi) #endif addl %esi, %edi decl %ecx jg .L107 jmp .L999 ALIGN_4 /* Alpha != ZERO */ .L300: cmpl $1,%esi jne .L304 movl %edx, %ecx # ecx = n sarl $3, %ecx # (n >> 3) jle .L302 ALIGN_4 .L301: FLD 0 * SIZE(%edi) fmul %st(1), %st FST 0 * SIZE(%edi) FLD 1 * SIZE(%edi) fmul %st(1), %st FST 1 * SIZE(%edi) FLD 2 * SIZE(%edi) fmul %st(1), %st FST 2 * SIZE(%edi) FLD 3 * SIZE(%edi) fmul %st(1), %st FST 3 * SIZE(%edi) FLD 4 * SIZE(%edi) fmul %st(1), %st FST 4 * SIZE(%edi) FLD 5 * SIZE(%edi) fmul %st(1), %st FST 5 * SIZE(%edi) FLD 6 * SIZE(%edi) fmul %st(1), %st FST 6 * SIZE(%edi) FLD 7 * SIZE(%edi) fmul %st(1), %st FST 7 * SIZE(%edi) addl $8 * SIZE, %edi decl %ecx jg .L301 ALIGN_4 .L302: movl %edx, %ecx andl $7, %ecx jle .L999 ALIGN_4 .L303: FLD 0 * SIZE(%edi) fmul %st(1), %st FST 0 * SIZE(%edi) addl $SIZE, %edi decl %ecx jg .L303 jmp .L999 ALIGN_4 .L304: sall $BASE_SHIFT, %esi movl %edx, %ecx # ecx = n sarl $3, %ecx # (n >> 3) jle .L306 ALIGN_4 .L305: FLD 0 * SIZE(%edi) fmul %st(1), %st FST 0 * SIZE(%edi) addl %esi, %edi FLD 0 * SIZE(%edi) fmul %st(1), %st FST 0 * SIZE(%edi) addl %esi, %edi FLD 0 * SIZE(%edi) fmul 
%st(1), %st FST 0 * SIZE(%edi) addl %esi, %edi FLD 0 * SIZE(%edi) fmul %st(1), %st FST 0 * SIZE(%edi) addl %esi, %edi FLD 0 * SIZE(%edi) fmul %st(1), %st FST 0 * SIZE(%edi) addl %esi, %edi FLD 0 * SIZE(%edi) fmul %st(1), %st FST 0 * SIZE(%edi) addl %esi, %edi FLD 0 * SIZE(%edi) fmul %st(1), %st FST 0 * SIZE(%edi) addl %esi, %edi FLD 0 * SIZE(%edi) fmul %st(1), %st FST 0 * SIZE(%edi) addl %esi, %edi decl %ecx jg .L305 ALIGN_4 .L306: movl %edx, %ecx andl $7, %ecx jle .L999 ALIGN_4 .L307: FLD 0 * SIZE(%edi) fmul %st(1), %st FST 0 * SIZE(%edi) addl %esi, %edi decl %ecx jg .L307 ALIGN_4 .L999: ffreep %st(0) xorl %eax,%eax popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/scal_sse.S000066400000000000000000000261231313527062700167750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_ALPHA 16 + STACK + ARGS(%esp) #define STACK_X 20 + STACK + ARGS(%esp) #define STACK_INCX 24 + STACK + ARGS(%esp) #define M %ebx #define X %ecx #define INCX %edx #define I %esi #define XX %edi #include "l1param.h" PROLOGUE pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX lea (, INCX, SIZE), INCX movss STACK_ALPHA, %xmm0 testl M, M jle .L999 xorps %xmm1, %xmm1 comiss %xmm0, %xmm1 shufps $0, %xmm0, %xmm0 jne .L100 /* Alpha == ZERO */ cmpl $SIZE, INCX jne .L50 /* INCX == 1 */ cmpl $3, M jle .L14 testl $4, X # aligned for double word? je .L05 movss %xmm1, 0 * SIZE(X) addl $SIZE, X decl M jle .L999 ALIGN_3 .L05: testl $8, X # aligned for quad word? 
je .L06 movsd %xmm1, 0 * SIZE(X) addl $2 * SIZE, X subl $2, M jle .L999 ALIGN_3 .L06: movl M, I sarl $4, I jle .L12 ALIGN_4 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm1, 0 * SIZE(X) movaps %xmm1, 4 * SIZE(X) movaps %xmm1, 8 * SIZE(X) movaps %xmm1, 12 * SIZE(X) addl $16 * SIZE, X decl I jg .L11 ALIGN_4 .L12: testl $15, M je .L999 testl $8, M je .L13 movaps %xmm1, 0 * SIZE(X) movaps %xmm1, 4 * SIZE(X) addl $8 * SIZE, X ALIGN_3 .L13: testl $4, M je .L14 movaps %xmm1, 0 * SIZE(X) addl $4 * SIZE, X ALIGN_3 .L14: testl $2, M je .L15 movsd %xmm1, 0 * SIZE(X) addl $2 * SIZE, X ALIGN_3 .L15: testl $1, M je .L999 movss %xmm1, 0 * SIZE(X) jmp .L999 ALIGN_4 /* incx != 1 */ .L50: movl M, I # rcx = n sarl $3, I # (n >> 3) jle .L52 ALIGN_4 .L51: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm1, (X) addl INCX, X movss %xmm1, (X) addl INCX, X movss %xmm1, (X) addl INCX, X movss %xmm1, (X) addl INCX, X movss %xmm1, (X) addl INCX, X movss %xmm1, (X) addl INCX, X movss %xmm1, (X) addl INCX, X movss %xmm1, (X) addl INCX, X decl I jg .L51 ALIGN_4 .L52: testl $7, M je .L999 testl $4, M je .L53 movss %xmm1, (X) addl INCX, X movss %xmm1, (X) addl INCX, X movss %xmm1, (X) addl INCX, X movss %xmm1, (X) addl INCX, X ALIGN_3 .L53: testl $2, M je .L54 movss %xmm1, (X) addl INCX, X movss %xmm1, (X) addl INCX, X ALIGN_3 .L54: testl $1, M je .L999 movss %xmm1, (X) jmp .L999 ALIGN_4 /* Alpha != ZERO */ .L100: cmpl $SIZE, INCX jne .L150 subl $-32 * SIZE, X cmpl $3, M jle .L116 testl $SIZE, X je .L105 movss -32 * SIZE(X), %xmm1 mulss %xmm0, %xmm1 movss %xmm1, -32 * SIZE(X) addl $SIZE, X decl M jle .L999 ALIGN_3 .L105: testl $2 * SIZE, X je .L110 movsd -32 * SIZE(X), %xmm1 mulps %xmm0, %xmm1 movsd %xmm1, -32 * SIZE(X) addl $2 * SIZE, X subl $2, M jle .L999 ALIGN_3 .L110: movl M, I sarl $5, I jle .L113 #if defined(BARCELONA) || defined(BULLDOZER) movaps %xmm0, %xmm1 mulps -32 * SIZE(X), %xmm1 movaps %xmm0, %xmm2 mulps -28 * SIZE(X), %xmm2 movaps %xmm0, %xmm3 mulps -24 * SIZE(X), %xmm3 movaps %xmm0, %xmm4 mulps -20 * SIZE(X), %xmm4 decl I jle .L112 ALIGN_4 .L111: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm1, -32 * SIZE(X) movaps %xmm0, %xmm1 mulps -16 * SIZE(X), %xmm1 movaps %xmm2, -28 * SIZE(X) movaps %xmm0, %xmm2 mulps -12 * SIZE(X), %xmm2 movaps %xmm3, -24 * SIZE(X) movaps %xmm0, %xmm3 mulps -8 * SIZE(X), %xmm3 movaps %xmm4, -20 * SIZE(X) movaps %xmm0, %xmm4 mulps -4 * SIZE(X), %xmm4 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm1, -16 * SIZE(X) movaps %xmm0, %xmm1 mulps 0 * SIZE(X), %xmm1 movaps %xmm2, -12 * SIZE(X) movaps %xmm0, %xmm2 mulps 4 * SIZE(X), %xmm2 movaps %xmm3, -8 * SIZE(X) movaps %xmm0, %xmm3 mulps 8 * SIZE(X), %xmm3 movaps %xmm4, -4 * SIZE(X) movaps %xmm0, %xmm4 mulps 12 * SIZE(X), %xmm4 subl $-32 * SIZE, X decl I jg .L111 ALIGN_4 .L112: movaps %xmm1, -32 * SIZE(X) movaps %xmm0, %xmm1 mulps -16 * SIZE(X), %xmm1 movaps %xmm2, -28 * SIZE(X) movaps %xmm0, %xmm2 mulps -12 * SIZE(X), %xmm2 movaps %xmm3, -24 * SIZE(X) movaps %xmm0, %xmm3 mulps -8 * SIZE(X), %xmm3 movaps %xmm4, -20 * SIZE(X) movaps %xmm0, %xmm4 mulps -4 * SIZE(X), %xmm4 movaps %xmm1, -16 * SIZE(X) movaps %xmm2, -12 * SIZE(X) movaps %xmm3, -8 * SIZE(X) movaps %xmm4, -4 * SIZE(X) #else movaps -32 * SIZE(X), %xmm1 movaps -28 * SIZE(X), %xmm2 movaps -24 * SIZE(X), %xmm3 movaps -20 * SIZE(X), %xmm4 decl I jle .L112 ALIGN_4 .L111: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif mulps %xmm0, 
%xmm1 movaps %xmm1, -32 * SIZE(X) movaps -16 * SIZE(X), %xmm1 mulps %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(X) movaps -12 * SIZE(X), %xmm2 mulps %xmm0, %xmm3 movaps %xmm3, -24 * SIZE(X) movaps -8 * SIZE(X), %xmm3 mulps %xmm0, %xmm4 movaps %xmm4, -20 * SIZE(X) movaps -4 * SIZE(X), %xmm4 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif mulps %xmm0, %xmm1 movaps %xmm1, -16 * SIZE(X) movaps 0 * SIZE(X), %xmm1 mulps %xmm0, %xmm2 movaps %xmm2, -12 * SIZE(X) movaps 4 * SIZE(X), %xmm2 mulps %xmm0, %xmm3 movaps %xmm3, -8 * SIZE(X) movaps 8 * SIZE(X), %xmm3 mulps %xmm0, %xmm4 movaps %xmm4, -4 * SIZE(X) movaps 12 * SIZE(X), %xmm4 subl $-32 * SIZE, X decl I jg .L111 ALIGN_4 .L112: mulps %xmm0, %xmm1 movaps %xmm1, -32 * SIZE(X) movaps -16 * SIZE(X), %xmm1 mulps %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(X) movaps -12 * SIZE(X), %xmm2 mulps %xmm0, %xmm3 movaps %xmm3, -24 * SIZE(X) movaps -8 * SIZE(X), %xmm3 mulps %xmm0, %xmm4 movaps %xmm4, -20 * SIZE(X) movaps -4 * SIZE(X), %xmm4 mulps %xmm0, %xmm1 movaps %xmm1, -16 * SIZE(X) mulps %xmm0, %xmm2 movaps %xmm2, -12 * SIZE(X) mulps %xmm0, %xmm3 movaps %xmm3, -8 * SIZE(X) mulps %xmm0, %xmm4 movaps %xmm4, -4 * SIZE(X) #endif subl $-32 * SIZE, X ALIGN_3 .L113: testl $31, M je .L999 testl $16, M je .L114 movaps -32 * SIZE(X), %xmm1 movaps -28 * SIZE(X), %xmm3 movaps -24 * SIZE(X), %xmm5 movaps -20 * SIZE(X), %xmm7 mulps %xmm0, %xmm1 movaps %xmm1, -32 * SIZE(X) mulps %xmm0, %xmm3 movaps %xmm3, -28 * SIZE(X) mulps %xmm0, %xmm5 movaps %xmm5, -24 * SIZE(X) mulps %xmm0, %xmm7 movaps %xmm7, -20 * SIZE(X) addl $16 * SIZE, X ALIGN_3 .L114: testl $8, M je .L115 movaps -32 * SIZE(X), %xmm1 movaps -28 * SIZE(X), %xmm3 mulps %xmm0, %xmm1 movaps %xmm1, -32 * SIZE(X) mulps %xmm0, %xmm3 movaps %xmm3, -28 * SIZE(X) addl $8 * SIZE, X ALIGN_3 .L115: testl $4, M je .L116 movaps -32 * SIZE(X), %xmm1 mulps %xmm0, %xmm1 movaps %xmm1, -32 * SIZE(X) addl $4 * SIZE, X ALIGN_3 .L116: testl $2, M je .L117 movsd -32 * SIZE(X), %xmm1 mulps %xmm0, %xmm1 movsd %xmm1, -32 * SIZE(X) addl $2 * SIZE, X ALIGN_3 .L117: testl $1, M je .L999 movss -32 * SIZE(X), %xmm1 mulss %xmm0, %xmm1 movss %xmm1, -32 * SIZE(X) jmp .L999 ALIGN_3 /* incx != 1 */ .L150: movl X, XX movl M, I # rcx = n sarl $3, I # (n >> 3) jle .L152 ALIGN_4 .L151: movss (X), %xmm1 addl INCX, X movss (X), %xmm2 addl INCX, X movss (X), %xmm3 addl INCX, X movss (X), %xmm4 addl INCX, X mulss %xmm0, %xmm1 mulss %xmm0, %xmm2 mulss %xmm0, %xmm3 mulss %xmm0, %xmm4 movss %xmm1, (XX) addl INCX, XX movss %xmm2, (XX) addl INCX, XX movss %xmm3, (XX) addl INCX, XX movss %xmm4, (XX) addl INCX, XX movss (X), %xmm1 addl INCX, X movss (X), %xmm2 addl INCX, X movss (X), %xmm3 addl INCX, X movss (X), %xmm4 addl INCX, X mulss %xmm0, %xmm1 mulss %xmm0, %xmm2 mulss %xmm0, %xmm3 mulss %xmm0, %xmm4 movss %xmm1, (XX) addl INCX, XX movss %xmm2, (XX) addl INCX, XX movss %xmm3, (XX) addl INCX, XX movss %xmm4, (XX) addl INCX, XX decl I jg .L151 ALIGN_4 .L152: testl $7, M je .L999 testl $4, M je .L153 movss (X), %xmm1 addl INCX, X movss (X), %xmm2 addl INCX, X movss (X), %xmm3 addl INCX, X movss (X), %xmm4 addl INCX, X mulss %xmm0, %xmm1 mulss %xmm0, %xmm2 mulss %xmm0, %xmm3 mulss %xmm0, %xmm4 movss %xmm1, (XX) addl INCX, XX movss %xmm2, (XX) addl INCX, XX movss %xmm3, (XX) addl INCX, XX movss %xmm4, (XX) addl INCX, XX ALIGN_3 .L153: testl $2, M je .L154 movss (X), %xmm1 addl INCX, X movss (X), %xmm2 addl INCX, X mulss %xmm0, %xmm1 mulss %xmm0, %xmm2 movss %xmm1, (XX) addl INCX, XX movss %xmm2, (XX) addl INCX, XX ALIGN_3 .L154: testl $1, M je .L999 
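/* Last remaining element of the strided path: scale it in place,            */
/* *x *= alpha (alpha was broadcast into %xmm0 at entry).                     */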
movss (X), %xmm1 mulss %xmm0, %xmm1 movss %xmm1, (X) ALIGN_4 .L999: xorl %eax, %eax popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/scal_sse2.S000066400000000000000000000243651313527062700170650ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_ALPHA 16 + STACK + ARGS(%esp) #define STACK_X 24 + STACK + ARGS(%esp) #define STACK_INCX 28 + STACK + ARGS(%esp) #define M %ebx #define X %ecx #define INCX %edx #define I %esi #define XX %edi #include "l1param.h" PROLOGUE PROFCODE pushl %edi pushl %esi pushl %ebx movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX movsd STACK_ALPHA, %xmm0 testl M, M jle .L999 leal (, INCX, SIZE), INCX xorps %xmm1, %xmm1 comisd %xmm0, %xmm1 jne .L100 # Alpha != ZERO jp .L100 # For Alpha = NaN /* Alpha == ZERO */ cmpl $SIZE, INCX jne .L50 /* INCX == 1 */ testl $15, X # aligned for quad word? 
je .L05 movsd %xmm1, 0 * SIZE(X) addl $SIZE, X decl M jle .L999 ALIGN_3 .L05: /* Aligned Mode */ movl M, I # rcx = n sarl $4, I jle .L12 ALIGN_4 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm1, 0 * SIZE(X) movaps %xmm1, 2 * SIZE(X) movaps %xmm1, 4 * SIZE(X) movaps %xmm1, 6 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm1, 8 * SIZE(X) movaps %xmm1, 10 * SIZE(X) movaps %xmm1, 12 * SIZE(X) movaps %xmm1, 14 * SIZE(X) addl $16 * SIZE, X decl I jg .L11 ALIGN_4 .L12: testl $15, M je .L999 testl $8, M je .L13 movaps %xmm1, 0 * SIZE(X) movaps %xmm1, 2 * SIZE(X) movaps %xmm1, 4 * SIZE(X) movaps %xmm1, 6 * SIZE(X) addl $8 * SIZE, X ALIGN_3 .L13: testl $4, M je .L14 movaps %xmm1, 0 * SIZE(X) movaps %xmm1, 2 * SIZE(X) addl $4 * SIZE, X ALIGN_3 .L14: testl $2, M je .L15 movaps %xmm1, 0 * SIZE(X) addl $2 * SIZE, X ALIGN_3 .L15: testl $1, M je .L999 movsd %xmm1, 0 * SIZE(X) jmp .L999 ALIGN_4 .L50: movl M, I sarl $3, I jle .L52 ALIGN_4 .L51: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd %xmm1, (X) addl INCX, X movsd %xmm1, (X) addl INCX, X movsd %xmm1, (X) addl INCX, X movsd %xmm1, (X) addl INCX, X movsd %xmm1, (X) addl INCX, X movsd %xmm1, (X) addl INCX, X movsd %xmm1, (X) addl INCX, X movsd %xmm1, (X) addl INCX, X decl I jg .L51 ALIGN_4 .L52: testl $7, M je .L999 testl $4, M je .L53 movsd %xmm1, (X) addl INCX, X movsd %xmm1, (X) addl INCX, X movsd %xmm1, (X) addl INCX, X movsd %xmm1, (X) addl INCX, X ALIGN_3 .L53: testl $2, M je .L54 movsd %xmm1, (X) addl INCX, X movsd %xmm1, (X) addl INCX, X ALIGN_3 .L54: testl $1, M je .L999 movsd %xmm1, (X) jmp .L999 ALIGN_4 /* Alpha != ZERO */ .L100: unpcklpd %xmm0, %xmm0 cmpl $SIZE, INCX jne .L150 testl $SIZE, X je .L105 movsd 0 * SIZE(X), %xmm1 mulsd %xmm0, %xmm1 movsd %xmm1, 0 * SIZE(X) addl $SIZE, X decl M jle .L999 ALIGN_3 .L105: subl $-16 * SIZE, X movl M, I # rcx = n sarl $4, I jle .L113 #if defined(BARCELONA) || defined(BULLDOZER) movaps %xmm0, %xmm1 mulpd -16 * SIZE(X), %xmm1 movaps %xmm0, %xmm2 mulpd -14 * SIZE(X), %xmm2 movaps %xmm0, %xmm3 mulpd -12 * SIZE(X), %xmm3 movaps %xmm0, %xmm4 mulpd -10 * SIZE(X), %xmm4 decl I jle .L112 ALIGN_4 .L111: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm1, -16 * SIZE(X) movaps %xmm0, %xmm1 mulpd -8 * SIZE(X), %xmm1 movaps %xmm2, -14 * SIZE(X) movaps %xmm0, %xmm2 mulpd -6 * SIZE(X), %xmm2 movaps %xmm3, -12 * SIZE(X) movaps %xmm0, %xmm3 mulpd -4 * SIZE(X), %xmm3 movaps %xmm4, -10 * SIZE(X) movaps %xmm0, %xmm4 mulpd -2 * SIZE(X), %xmm4 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm1, -8 * SIZE(X) movaps %xmm0, %xmm1 mulpd 0 * SIZE(X), %xmm1 movaps %xmm2, -6 * SIZE(X) movaps %xmm0, %xmm2 mulpd 2 * SIZE(X), %xmm2 movaps %xmm3, -4 * SIZE(X) movaps %xmm0, %xmm3 mulpd 4 * SIZE(X), %xmm3 movaps %xmm4, -2 * SIZE(X) movaps %xmm0, %xmm4 mulpd 6 * SIZE(X), %xmm4 subl $-16 * SIZE, X decl I jg .L111 ALIGN_4 .L112: movaps %xmm1, -16 * SIZE(X) movaps %xmm0, %xmm1 mulpd -8 * SIZE(X), %xmm1 movaps %xmm2, -14 * SIZE(X) movaps %xmm0, %xmm2 mulpd -6 * SIZE(X), %xmm2 movaps %xmm3, -12 * SIZE(X) movaps %xmm0, %xmm3 mulpd -4 * SIZE(X), %xmm3 movaps %xmm4, -10 * SIZE(X) movaps %xmm0, %xmm4 mulpd -2 * SIZE(X), %xmm4 movaps %xmm1, -8 * SIZE(X) movaps %xmm2, -6 * SIZE(X) movaps %xmm3, -4 * SIZE(X) movaps %xmm4, -2 * SIZE(X) #else movaps -16 * SIZE(X), %xmm1 movaps -14 * SIZE(X), %xmm2 movaps -12 * SIZE(X), %xmm3 movaps -10 * SIZE(X), %xmm4 decl I jle .L112 ALIGN_4 .L111: 
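/* Main DSCAL loop (generic path): scales 16 double-precision elements per   */
/* iteration with packed multiplies.  %xmm1-%xmm4 are pre-loaded before the  */
/* loop; inside it each register is scaled by alpha (%xmm0), stored back,    */
/* and immediately refilled from the next block so the loads run ahead of    */
/* the multiplies.  The Barcelona/Bulldozer variant above does the same with */
/* the multiply folded into the load.                                         */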
#ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif mulpd %xmm0, %xmm1 movaps %xmm1, -16 * SIZE(X) movaps -8 * SIZE(X), %xmm1 mulpd %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(X) movaps -6 * SIZE(X), %xmm2 mulpd %xmm0, %xmm3 movaps %xmm3, -12 * SIZE(X) movaps -4 * SIZE(X), %xmm3 mulpd %xmm0, %xmm4 movaps %xmm4, -10 * SIZE(X) movaps -2 * SIZE(X), %xmm4 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif mulpd %xmm0, %xmm1 movaps %xmm1, -8 * SIZE(X) movaps 0 * SIZE(X), %xmm1 mulpd %xmm0, %xmm2 movaps %xmm2, -6 * SIZE(X) movaps 2 * SIZE(X), %xmm2 mulpd %xmm0, %xmm3 movaps %xmm3, -4 * SIZE(X) movaps 4 * SIZE(X), %xmm3 mulpd %xmm0, %xmm4 movaps %xmm4, -2 * SIZE(X) movaps 6 * SIZE(X), %xmm4 subl $-16 * SIZE, X decl I jg .L111 ALIGN_4 .L112: mulpd %xmm0, %xmm1 movaps %xmm1, -16 * SIZE(X) movaps -8 * SIZE(X), %xmm1 mulpd %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(X) movaps -6 * SIZE(X), %xmm2 mulpd %xmm0, %xmm3 movaps %xmm3, -12 * SIZE(X) movaps -4 * SIZE(X), %xmm3 mulpd %xmm0, %xmm4 movaps %xmm4, -10 * SIZE(X) movaps -2 * SIZE(X), %xmm4 mulpd %xmm0, %xmm1 movaps %xmm1, -8 * SIZE(X) mulpd %xmm0, %xmm2 movaps %xmm2, -6 * SIZE(X) mulpd %xmm0, %xmm3 movaps %xmm3, -4 * SIZE(X) mulpd %xmm0, %xmm4 movaps %xmm4, -2 * SIZE(X) #endif subl $-16 * SIZE, X ALIGN_3 .L113: testl $15, M je .L999 testl $8, M je .L114 movaps -16 * SIZE(X), %xmm1 movaps -14 * SIZE(X), %xmm2 movaps -12 * SIZE(X), %xmm3 movaps -10 * SIZE(X), %xmm4 mulpd %xmm0, %xmm1 movaps %xmm1, -16 * SIZE(X) mulpd %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(X) mulpd %xmm0, %xmm3 movaps %xmm3, -12 * SIZE(X) mulpd %xmm0, %xmm4 movaps %xmm4, -10 * SIZE(X) addl $8 * SIZE, X ALIGN_3 .L114: testl $4, M je .L115 movaps -16 * SIZE(X), %xmm1 movaps -14 * SIZE(X), %xmm2 mulpd %xmm0, %xmm1 movaps %xmm1, -16 * SIZE(X) mulpd %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(X) addl $4 * SIZE, X ALIGN_3 .L115: testl $2, M je .L116 movaps -16 * SIZE(X), %xmm1 mulpd %xmm0, %xmm1 movaps %xmm1, -16 * SIZE(X) addl $2 * SIZE, X ALIGN_3 .L116: testl $1, M je .L999 movsd -16 * SIZE(X), %xmm1 mulsd %xmm0, %xmm1 movsd %xmm1, -16 * SIZE(X) jmp .L999 ALIGN_3 /* incx != 1 */ .L150: movl X, XX movl M, I sarl $2, I jle .L152 ALIGN_4 .L151: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movsd (X), %xmm1 addl INCX, X movsd (X), %xmm2 addl INCX, X movsd (X), %xmm3 addl INCX, X movsd (X), %xmm4 addl INCX, X mulsd %xmm0, %xmm1 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm0, %xmm4 movsd %xmm1, (XX) addl INCX, XX movsd %xmm2, (XX) addl INCX, XX movsd %xmm3, (XX) addl INCX, XX movsd %xmm4, (XX) addl INCX, XX decl I jg .L151 ALIGN_4 .L152: testl $2, M je .L154 movsd (X), %xmm1 addl INCX, X movsd (X), %xmm2 addl INCX, X mulsd %xmm0, %xmm1 mulsd %xmm0, %xmm2 movsd %xmm1, (XX) addl INCX, XX movsd %xmm2, (XX) addl INCX, XX ALIGN_3 .L154: testl $1, M je .L999 movsd (X), %xmm1 mulsd %xmm0, %xmm1 movsd %xmm1, (X) ALIGN_4 .L999: xorl %eax, %eax popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/staticbuffer.S000066400000000000000000000054701313527062700176640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef ALLOC_STATIC ALIGN_6 #ifdef __CYGWIN__ .comm _alloc_area, (NUM_BUFFERS * BUFFER_SIZE) #else .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 4096 #endif #endif OpenBLAS-0.2.20/kernel/x86/swap.S000066400000000000000000000117241313527062700161540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define N 4 + STACK + ARGS(%esp) #ifdef XDOUBLE #define X 32 + STACK + ARGS(%esp) #define INCX 36 + STACK + ARGS(%esp) #define Y 40 + STACK + ARGS(%esp) #define INCY 44 + STACK + ARGS(%esp) #elif defined(DOUBLE) #define X 24 + STACK + ARGS(%esp) #define INCX 28 + STACK + ARGS(%esp) #define Y 32 + STACK + ARGS(%esp) #define INCY 36 + STACK + ARGS(%esp) #else #define X 20 + STACK + ARGS(%esp) #define INCX 24 + STACK + ARGS(%esp) #define Y 28 + STACK + ARGS(%esp) #define INCY 32 + STACK + ARGS(%esp) #endif PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif movl N, %edx movl X, %esi movl Y, %edi movl INCX, %ebx movl INCY, %ecx sall $BASE_SHIFT, %ebx sall $BASE_SHIFT, %ecx cmpl $SIZE, %ebx jne .L14 cmpl $SIZE, %ecx jne .L14 movl %edx, %eax sarl $2, %eax jle .L15 ALIGN_3 .L16: FLD 3 * SIZE(%esi) FLD 2 * SIZE(%esi) FLD 1 * SIZE(%esi) FLD 0 * SIZE(%esi) FLD 3 * SIZE(%edi) FLD 2 * SIZE(%edi) FLD 1 * SIZE(%edi) FLD 0 * SIZE(%edi) FST 0 * SIZE(%esi) FST 1 * SIZE(%esi) FST 2 * SIZE(%esi) FST 3 * SIZE(%esi) FST 0 * SIZE(%edi) FST 1 * SIZE(%edi) FST 2 * SIZE(%edi) FST 3 * SIZE(%edi) addl $4 * SIZE, %esi addl $4 * SIZE, %edi decl %eax jg .L16 ALIGN_3 .L15: movl %edx, %eax andl $3, %eax jle .L27 ALIGN_3 .L22: FLD (%esi) FLD (%edi) FST (%esi) FST (%edi) addl $SIZE, %esi addl $SIZE, %edi decl %eax jg .L22 jmp .L27 ALIGN_3 /* INCX != 1 or INCY != 1 */ .L14: movl %edx, %eax sarl $2, %eax jle .L28 ALIGN_2 .L29: FLD (%esi) addl %ebx, %esi FLD (%esi) addl %ebx, %esi FLD (%esi) addl %ebx, %esi FLD (%esi) FLD (%edi) addl %ecx, %edi FLD (%edi) addl %ecx, %edi FLD (%edi) addl %ecx, %edi FLD (%edi) FST (%esi) subl %ebx, %esi FST (%esi) subl %ebx, %esi FST (%esi) subl %ebx, %esi FST (%esi) leal (%esi, %ebx, 4), %esi FST (%edi) subl %ecx, %edi FST (%edi) subl %ecx, %edi FST (%edi) subl %ecx, %edi FST (%edi) leal (%edi, %ecx, 4), %edi decl %eax jg .L29 ALIGN_3 .L28: movl %edx, %eax andl $3, %eax jle .L27 ALIGN_3 .L35: FLD (%esi) FLD (%edi) FST (%esi) addl %ebx, %esi FST (%edi) addl %ecx, %edi decl %eax jg .L35 ALIGN_3 .L27: xorl %eax,%eax popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/swap_sse.S000066400000000000000000000507161313527062700170320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 20 + STACK + ARGS(%esp) #define STACK_INCX 24 + STACK + ARGS(%esp) #define STACK_Y 28 + STACK + ARGS(%esp) #define STACK_INCY 32 + STACK + ARGS(%esp) #define M %edx #define X %esi #define Y %edi #define INCX %ebx #define INCY %ecx #include "l1param.h" PROLOGUE PROFCODE pushl %ebp pushl %edi pushl %esi pushl %ebx movl STACK_M, M movl STACK_X, X movl STACK_Y, Y movl STACK_INCX, INCX movl STACK_INCY, INCY sall $BASE_SHIFT, %ebx sall $BASE_SHIFT, %ecx cmpl $SIZE, INCX jne .L50 cmpl $SIZE, INCY jne .L50 subl $-32 * SIZE, X subl $-32 * SIZE, Y cmpl $3, M jle .L16 testl $SIZE, Y je .L05 movss -32 * SIZE(X), %xmm0 movss -32 * SIZE(Y), %xmm1 movss %xmm1, -32 * SIZE(X) movss %xmm0, -32 * SIZE(Y) addl $1 * SIZE, X addl $1 * SIZE, Y decl M ALIGN_3 .L05: testl $2 * SIZE, Y je .L10 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm1 movlps %xmm1, -32 * SIZE(X) movlps %xmm0, -32 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y subl $2, M jle .L19 ALIGN_3 .L10: cmpl $3, M jle .L16 testl $2 * SIZE, X jne .L30 testl $1 * SIZE, X jne .L20 movl M, %eax sarl $5, %eax jle .L13 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -32 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -32 * SIZE(X) movaps -28 * SIZE(X), %xmm0 movaps -28 * SIZE(Y), %xmm1 movaps %xmm0, -28 * SIZE(Y) movaps %xmm1, -28 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -24 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movaps %xmm0, -24 * SIZE(Y) movaps %xmm1, -24 * SIZE(X) movaps -20 * SIZE(X), %xmm0 movaps -20 * SIZE(Y), %xmm1 movaps %xmm0, -20 * SIZE(Y) movaps %xmm1, -20 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -12 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 movaps %xmm0, -12 * SIZE(Y) movaps %xmm1, -12 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -8 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 movaps %xmm0, -8 * SIZE(Y) movaps %xmm1, -8 * SIZE(X) movaps -4 * SIZE(X), %xmm0 movaps -4 * SIZE(Y), %xmm1 movaps %xmm0, -4 * SIZE(Y) movaps %xmm1, -4 * 
SIZE(X) subl $-32 * SIZE, Y subl $-32 * SIZE, X decl %eax jg .L11 ALIGN_3 .L13: testl $16, M jle .L14 movaps -32 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -32 * SIZE(X) movaps -28 * SIZE(X), %xmm0 movaps -28 * SIZE(Y), %xmm1 movaps %xmm0, -28 * SIZE(Y) movaps %xmm1, -28 * SIZE(X) movaps -24 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movaps %xmm0, -24 * SIZE(Y) movaps %xmm1, -24 * SIZE(X) movaps -20 * SIZE(X), %xmm0 movaps -20 * SIZE(Y), %xmm1 movaps %xmm0, -20 * SIZE(Y) movaps %xmm1, -20 * SIZE(X) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L14: testl $8, M jle .L15 movaps -32 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -32 * SIZE(X) movaps -28 * SIZE(X), %xmm0 movaps -28 * SIZE(Y), %xmm1 movaps %xmm0, -28 * SIZE(Y) movaps %xmm1, -28 * SIZE(X) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L15: testl $4, M jle .L16 movaps -32 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -32 * SIZE(X) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L16: testl $2, M jle .L17 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm1 movlps %xmm1, -32 * SIZE(X) addl $2 * SIZE, X movlps %xmm0, -32 * SIZE(Y) addl $2 * SIZE, Y ALIGN_3 .L17: testl $1, M jle .L19 movss -32 * SIZE(X), %xmm0 movss -32 * SIZE(Y), %xmm1 movss %xmm1, -32 * SIZE(X) movss %xmm0, -32 * SIZE(Y) ALIGN_3 .L19: popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_3 .L20: movaps -33 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movss %xmm1, -32 * SIZE(X) PSHUFD2($0x39, %xmm1, %xmm3) movlps %xmm3, -31 * SIZE(X) subl $3, M movl M, %eax sarl $5, %eax jle .L23 ALIGN_4 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -29 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -29 * SIZE(X) movaps -25 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -25 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -21 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -24 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -21 * SIZE(X) movaps -17 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -20 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -17 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -13 * SIZE(X), %xmm2 movaps -12 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -16 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -13 * SIZE(X) movaps -9 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -12 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -9 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -5 * SIZE(X), %xmm2 movaps -4 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -8 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -5 * SIZE(X) movaps -1 * SIZE(X), %xmm0 movaps 0 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -4 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -1 * 
SIZE(X) subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L23: testl $16, M jle .L24 movaps -29 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -29 * SIZE(X) movaps -25 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -25 * SIZE(X) movaps -21 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -24 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -21 * SIZE(X) movaps -17 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -20 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -17 * SIZE(X) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L24: testl $8, M jle .L25 movaps -29 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -29 * SIZE(X) movaps -25 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -25 * SIZE(X) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L25: testl $4, M jle .L26 movaps -29 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -29 * SIZE(X) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L26: PSHUFD2($0x39, %xmm0, %xmm2) PSHUFD1($0xff, %xmm0) movlps %xmm2, -32 * SIZE(Y) movss %xmm0, -30 * SIZE(Y) testl $2, M jle .L27 movsd -29 * SIZE(X), %xmm0 movsd -29 * SIZE(Y), %xmm1 movlps %xmm0, -29 * SIZE(Y) movlps %xmm1, -29 * SIZE(X) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L27: testl $1, M jle .L29 movss -29 * SIZE(X), %xmm0 movss -29 * SIZE(Y), %xmm1 movss %xmm0, -29 * SIZE(Y) movss %xmm1, -29 * SIZE(X) ALIGN_3 .L29: popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_3 .L30: testl $1 * SIZE, X jne .L40 movhps -32 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movlps %xmm1, -32 * SIZE(X) subl $2, M movl M, %eax sarl $5, %eax jle .L33 ALIGN_4 .L31: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -30 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -30 * SIZE(X) movaps -26 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -26 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -22 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -24 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -22 * SIZE(X) movaps -18 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -20 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -18 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -14 * SIZE(X), %xmm2 movaps -12 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -14 * SIZE(X) movaps -10 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -12 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -10 
* SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -6 * SIZE(X), %xmm2 movaps -4 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -8 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -6 * SIZE(X) movaps -2 * SIZE(X), %xmm0 movaps 0 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -4 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -2 * SIZE(X) subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L31 ALIGN_3 .L33: testl $16, M jle .L34 movaps -30 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -30 * SIZE(X) movaps -26 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -26 * SIZE(X) movaps -22 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -24 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -22 * SIZE(X) movaps -18 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -20 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -18 * SIZE(X) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L34: testl $8, M jle .L35 movaps -30 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -30 * SIZE(X) movaps -26 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -26 * SIZE(X) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L35: testl $4, M jle .L36 movaps -30 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -30 * SIZE(X) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L36: movhps %xmm0, -32 * SIZE(Y) testl $2, M jle .L37 movsd -30 * SIZE(X), %xmm0 movsd -30 * SIZE(Y), %xmm1 movlps %xmm0, -30 * SIZE(Y) movlps %xmm1, -30 * SIZE(X) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L37: testl $1, M jle .L39 movss -30 * SIZE(X), %xmm0 movss -30 * SIZE(Y), %xmm1 movss %xmm0, -30 * SIZE(Y) movss %xmm1, -30 * SIZE(X) ALIGN_3 .L39: popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_3 .L40: movaps -35 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movss %xmm1, -32 * SIZE(X) subl $3, M movl M, %eax sarl $5, %eax jle .L43 ALIGN_4 .L41: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -31 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -31 * SIZE(X) movaps -27 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -27 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -23 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -24 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -23 * SIZE(X) movaps -19 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -20 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -19 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -15 * SIZE(X), %xmm2 movaps -12 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 
movaps %xmm1, -15 * SIZE(X) movaps -11 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -12 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -11 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -7 * SIZE(X), %xmm2 movaps -4 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -8 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -7 * SIZE(X) movaps -3 * SIZE(X), %xmm0 movaps 0 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -4 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -3 * SIZE(X) subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L41 ALIGN_3 .L43: testl $16, M jle .L44 movaps -31 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -31 * SIZE(X) movaps -27 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -27 * SIZE(X) movaps -23 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -24 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -23 * SIZE(X) movaps -19 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -20 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -19 * SIZE(X) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L44: testl $8, M jle .L45 movaps -31 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -31 * SIZE(X) movaps -27 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -27 * SIZE(X) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L45: testl $4, M jle .L46 movaps -31 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -31 * SIZE(X) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L46: movsd -31 * SIZE(X), %xmm2 PSHUFD2($0x39, %xmm1, %xmm1) movlps %xmm1, -31 * SIZE(X) PSHUFD1($0xff, %xmm0) movss %xmm0, -32 * SIZE(Y) movlps %xmm2, -31 * SIZE(Y) addl $3 * SIZE, X addl $3 * SIZE, Y testl $2, M jle .L47 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm1 movlps %xmm0, -32 * SIZE(Y) movlps %xmm1, -32 * SIZE(X) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L47: testl $1, M jle .L49 movss -32 * SIZE(X), %xmm0 movss -32 * SIZE(Y), %xmm1 movss %xmm0, -32 * SIZE(Y) movss %xmm1, -32 * SIZE(X) ALIGN_3 .L49: popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_3 .L50: movl M, %eax sarl $3, %eax jle .L55 ALIGN_3 .L51: movss (X), %xmm0 movss (Y), %xmm1 movss %xmm1, (X) addl INCX, X movss %xmm0, (Y) addl INCY, Y movss (X), %xmm0 movss (Y), %xmm1 movss %xmm1, (X) addl INCX, X movss %xmm0, (Y) addl INCY, Y movss (X), %xmm0 movss (Y), %xmm1 movss %xmm1, (X) addl INCX, X movss %xmm0, (Y) addl INCY, Y movss (X), %xmm0 movss (Y), %xmm1 movss %xmm1, (X) addl INCX, X movss %xmm0, (Y) addl INCY, Y movss (X), %xmm0 movss (Y), %xmm1 movss %xmm1, (X) addl INCX, X movss %xmm0, (Y) addl INCY, Y 
movss (X), %xmm0 movss (Y), %xmm1 movss %xmm1, (X) addl INCX, X movss %xmm0, (Y) addl INCY, Y movss (X), %xmm0 movss (Y), %xmm1 movss %xmm1, (X) addl INCX, X movss %xmm0, (Y) addl INCY, Y movss (X), %xmm0 movss (Y), %xmm1 movss %xmm1, (X) addl INCX, X movss %xmm0, (Y) addl INCY, Y decl %eax jg .L51 ALIGN_3 .L55: movl M, %eax andl $7, %eax jle .L57 ALIGN_3 .L56: movss (X), %xmm0 movss (Y), %xmm1 movss %xmm1, (X) movss %xmm0, (Y) addl INCX, X addl INCY, Y decl %eax jg .L56 ALIGN_3 .L57: popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/swap_sse2.S000066400000000000000000000253751313527062700171170ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 24 + STACK + ARGS(%esp) #define STACK_INCX 28 + STACK + ARGS(%esp) #define STACK_Y 32 + STACK + ARGS(%esp) #define STACK_INCY 36 + STACK + ARGS(%esp) #define M %edx #define X %esi #define Y %edi #define INCX %ebx #define INCY %ecx #include "l1param.h" PROLOGUE PROFCODE pushl %ebp pushl %edi pushl %esi pushl %ebx movl STACK_M, M movl STACK_X, X movl STACK_Y, Y movl STACK_INCX, INCX movl STACK_INCY, INCY leal (, INCX, SIZE), INCX leal (, INCY, SIZE), INCY cmpl $SIZE, INCX jne .L40 cmpl $SIZE, INCY jne .L40 testl $SIZE, Y je .L10 movsd 0 * SIZE(X), %xmm0 movsd 0 * SIZE(Y), %xmm1 movsd %xmm1, 0 * SIZE(X) movsd %xmm0, 0 * SIZE(Y) addl $1 * SIZE, X addl $1 * SIZE, Y decl M jle .L19 ALIGN_4 .L10: subl $-16 * SIZE, X subl $-16 * SIZE, Y testl $SIZE, X jne .L20 movl M, %eax sarl $4, %eax jle .L13 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -14 * SIZE(X), %xmm0 movaps -14 * SIZE(Y), %xmm1 movaps %xmm0, -14 * SIZE(Y) movaps %xmm1, -14 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -12 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 movaps %xmm0, -12 * SIZE(Y) movaps %xmm1, -12 * SIZE(X) movaps -10 * SIZE(X), %xmm0 movaps -10 * SIZE(Y), %xmm1 movaps %xmm0, -10 * SIZE(Y) movaps %xmm1, -10 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -8 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 movaps %xmm0, -8 * SIZE(Y) movaps %xmm1, -8 * SIZE(X) movaps -6 * SIZE(X), %xmm0 movaps -6 * SIZE(Y), %xmm1 movaps %xmm0, -6 * SIZE(Y) movaps %xmm1, -6 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -4 * SIZE(X), %xmm0 movaps -4 * SIZE(Y), %xmm1 movaps %xmm0, -4 * SIZE(Y) movaps %xmm1, -4 * SIZE(X) movaps -2 * SIZE(X), %xmm0 movaps -2 * SIZE(Y), %xmm1 movaps %xmm0, -2 * SIZE(Y) movaps %xmm1, -2 * SIZE(X) subl $-16 * SIZE, Y subl $-16 * SIZE, X decl %eax jg .L11 ALIGN_3 .L13: testl $8, M jle .L14 movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -14 * SIZE(X), %xmm0 movaps -14 * SIZE(Y), %xmm1 movaps %xmm0, -14 * SIZE(Y) movaps %xmm1, -14 * SIZE(X) movaps -12 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 movaps %xmm0, -12 * SIZE(Y) movaps %xmm1, -12 * SIZE(X) movaps -10 * SIZE(X), %xmm0 movaps -10 * SIZE(Y), %xmm1 movaps %xmm0, -10 * SIZE(Y) movaps %xmm1, -10 * SIZE(X) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L14: testl $4, M jle .L15 movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -14 * SIZE(X), %xmm0 movaps -14 * SIZE(Y), %xmm1 movaps %xmm0, -14 * SIZE(Y) movaps %xmm1, -14 * SIZE(X) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L15: testl $2, M jle .L16 movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L16: testl $1, M jle .L19 movsd -16 * SIZE(X), %xmm0 movsd -16 * SIZE(Y), %xmm1 movlps %xmm1, -16 * SIZE(X) movlps %xmm0, -16 * SIZE(Y) ALIGN_3 .L19: popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_3 .L20: movhps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movlps %xmm1, -16 * SIZE(X) decl M jle .L29 movl M, %eax 
sarl $4, %eax jle .L23 ALIGN_4 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -15 * SIZE(X), %xmm2 movaps -14 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(X) movaps -13 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -13 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -11 * SIZE(X), %xmm2 movaps -10 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -12 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -11 * SIZE(X) movaps -9 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -10 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -9 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -7 * SIZE(X), %xmm2 movaps -6 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -8 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -7 * SIZE(X) movaps -5 * SIZE(X), %xmm0 movaps -4 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -6 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -5 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -3 * SIZE(X), %xmm2 movaps -2 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -4 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -3 * SIZE(X) movaps -1 * SIZE(X), %xmm0 movaps 0 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -2 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -1 * SIZE(X) subl $-16 * SIZE, X subl $-16 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L23: testl $8, M jle .L24 movaps -15 * SIZE(X), %xmm2 movaps -14 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(X) movaps -13 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -13 * SIZE(X) movaps -11 * SIZE(X), %xmm2 movaps -10 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -12 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -11 * SIZE(X) movaps -9 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -10 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -9 * SIZE(X) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L24: testl $4, M jle .L25 movaps -15 * SIZE(X), %xmm2 movaps -14 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(X) movaps -13 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -13 * SIZE(X) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L25: testl $2, M jle .L26 movaps -15 * SIZE(X), %xmm2 movaps -14 * SIZE(Y), %xmm3 SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(X) SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L26: testl $1, M jle .L29 movhps %xmm0, -16 * SIZE(Y) movhps -15 * SIZE(X), %xmm0 movhps %xmm1, -15 * SIZE(X) addl $SIZE, X addl $SIZE, Y ALIGN_3 .L29: movhps %xmm0, -16 * SIZE(Y) popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_3 .L40: movl M, %eax sarl $3, %eax jle .L45 ALIGN_3 .L41: movsd (X), %xmm0 movsd (Y), %xmm1 movsd %xmm1, (X) addl INCX, X movsd %xmm0, (Y) addl INCY, Y movsd (X), %xmm0 movsd (Y), %xmm1 movsd %xmm1, (X) addl INCX, X movsd %xmm0, (Y) addl INCY, Y movsd (X), %xmm0 movsd (Y), %xmm1 movsd %xmm1, (X) addl INCX, X movsd %xmm0, (Y) addl INCY, Y movsd (X), %xmm0 movsd (Y), %xmm1 
movsd %xmm1, (X) addl INCX, X movsd %xmm0, (Y) addl INCY, Y movsd (X), %xmm0 movsd (Y), %xmm1 movsd %xmm1, (X) addl INCX, X movsd %xmm0, (Y) addl INCY, Y movsd (X), %xmm0 movsd (Y), %xmm1 movsd %xmm1, (X) addl INCX, X movsd %xmm0, (Y) addl INCY, Y movsd (X), %xmm0 movsd (Y), %xmm1 movsd %xmm1, (X) addl INCX, X movsd %xmm0, (Y) addl INCY, Y movsd (X), %xmm0 movsd (Y), %xmm1 movsd %xmm1, (X) addl INCX, X movsd %xmm0, (Y) addl INCY, Y decl %eax jg .L41 ALIGN_3 .L45: movl M, %eax andl $7, %eax jle .L47 ALIGN_3 .L46: movsd (X), %xmm0 movsd (Y), %xmm1 movsd %xmm1, (X) movsd %xmm0, (Y) addl INCX, X addl INCY, Y decl %eax jg .L46 ALIGN_3 .L47: popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_LN_2x2.S000066400000000000000000000452631313527062700206200ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #ifdef DOUBLE #define A 24 + STACK + ARGS(%esp) #define B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #else #define A 20 + STACK + ARGS(%esp) #define B 24 + STACK + ARGS(%esp) #define C 28 + STACK + ARGS(%esp) #define LDC 32 + STACK + ARGS(%esp) #define OFFSET 36 + STACK + ARGS(%esp) #endif #define PREFETCH_OFFSET 48 #if defined(PENTIUM3) || defined(PENTIUMM) #define REP rep #else #define REP rep #endif #define AA %edx #define BB %ecx PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl LDC, %ebp # ldc # MEMORY movl B, %ebx leal (, %ebp, SIZE), %ebp #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, %ebx movl N, %eax imull %ebp, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax # j = (n >> 1) # MEMORY sarl $1, %eax movl %eax, J # j = (n >> 1) # MEMORY je .L8 ALIGN_4 .L34: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, %ebx #endif lea (, %ebp, 2), %eax #ifdef RT subl %eax, C #endif movl C, %edi #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %eax # m # MEMORY andl $1, %eax je .L12 #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 1), AA leal (%ebx, %eax, 2), BB #else movl %ebx, BB #endif fldz fldz FLD 0 * SIZE(AA) # temp1 = *(aoffset + 0) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $1,%eax # k >> 1 # MEMORY je .L54 ALIGN_4 .L55: FLD 0 * SIZE(BB) # temp2 = *(boffset + 0) rep fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(BB) # temp2 = *(boffset + 0) faddp %st, %st(2) FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0) FLD 2 * SIZE(BB) # temp2 = *(boffset + 0) rep fmul %st(1), %st faddp %st, %st(2) FMUL 3 * SIZE(BB) # temp2 = *(boffset + 0) faddp %st, %st(2) FLD 2 * SIZE(AA) # temp1 = *(aoffset + 0) addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jne .L55 ALIGN_4 .L54: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $1,%eax # k & 1 je .L33 ALIGN_4 FLD 0 * SIZE(BB) # temp2 = *(boffset + 0) rep fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(BB) # temp2 = *(boffset + 0) faddp %st, %st(2) FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0) addl $1 * SIZE, AA addl $2 * SIZE, BB ALIGN_4 .L33: ffreep %st(0) #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 1), AA leal (%ebx, %eax, 2), BB #endif #if defined(LN) || defined(LT) FLD 0 * SIZE(BB) fsubp %st, %st(1) FLD 1 * SIZE(BB) fsubp %st, %st(2) #else FLD 0 * SIZE(AA) fsubp %st, %st(1) FLD 1 * SIZE(AA) fsubp %st, %st(2) #endif #if 
defined(LN) || defined(LT) FLD 0 * SIZE(AA) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef RN FLD 0 * SIZE(BB) fmulp %st, %st(1) FLD 1 * SIZE(BB) fmul %st(1), %st fsubrp %st, %st(2) FLD 3 * SIZE(BB) fmulp %st, %st(2) #endif #ifdef RT FLD 3 * SIZE(BB) fmulp %st, %st(2) FLD 2 * SIZE(BB) fmul %st(2), %st fsubrp %st, %st(1) FLD 0 * SIZE(BB) fmulp %st, %st(1) #endif #ifdef LN subl $1 * SIZE, %edi #endif #if defined(LN) || defined(LT) FSTU 0 * SIZE(BB) fxch %st(1) FSTU 1 * SIZE(BB) #else FSTU 0 * SIZE(AA) fxch %st(1) FSTU 1 * SIZE(AA) #endif FST 0 * SIZE(%edi,%ebp) FST 0 * SIZE(%edi) #ifndef LN addl $1 * SIZE, %edi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L12: movl M, %esi sarl $1, %esi je .L27 ALIGN_4 .MainHead: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 2), AA leal (%ebx, %eax, 2), BB #else movl %ebx, BB #endif fldz fldz fldz fldz FLD 4 * SIZE(BB) # b5 FLD 4 * SIZE(AA) # a5 FLD 0 * SIZE(BB) # b1 FLD 0 * SIZE(AA) # a1 #ifdef LN #if defined(HAVE_3DNOW) prefetchw -2 * SIZE(%edi) prefetchw -2 * SIZE(%edi, %ebp, 1) #elif defined(HAVE_SSE) prefetchnta -2 * SIZE(%edi) prefetchnta -2 * SIZE(%edi, %ebp, 1) #endif #else #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(%edi) prefetchw 2 * SIZE(%edi, %ebp, 1) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(%edi) prefetchnta 2 * SIZE(%edi, %ebp, 1) #endif #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L16 ALIGN_4 .MainLoop: #if defined(HAVE_3DNOW) prefetch (PREFETCH_OFFSET) * SIZE(BB) nop #elif defined(HAVE_SSE) prefetchnta (PREFETCH_OFFSET) * SIZE(BB) #if (L2_SIZE == 524288) prefetcht0 (PREFETCH_OFFSET) * SIZE(AA) #endif #endif fmul %st, %st(1) FMUL 1 * SIZE(BB) fxch %st(1) faddp %st, %st(4) FLD 0 * SIZE(BB) fxch %st(1) faddp %st, %st(5) FLD 1 * SIZE(AA) fmul %st, %st(1) FMUL 1 * SIZE(BB) fxch %st(1) faddp %st, %st(6) FLD 2 * SIZE(BB) fxch %st(1) faddp %st, %st(7) FLD 2 * SIZE(AA) fmul %st, %st(1) FMUL 3 * SIZE(BB) fxch %st(1) faddp %st, %st(4) FLD 2 * SIZE(BB) fxch %st(1) faddp %st, %st(5) FLD 3 * SIZE(AA) fmul %st, %st(1) FMUL 3 * SIZE(BB) fxch %st(1) faddp %st, %st(6) FLD 8 * SIZE(BB) fxch %st(1) faddp %st, %st(7) FLD 8 * SIZE(AA) fxch %st(2) #if !defined(HAVE_3DNOW) && defined(HAVE_SSE) && defined(DOUBLE) prefetchnta (PREFETCH_OFFSET + 4) * SIZE(BB) #if (L2_SIZE == 524288) prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(AA) #endif #endif fmul %st, %st(3) FMUL 5 * SIZE(BB) fxch %st(3) faddp %st, %st(4) FLD 4 * SIZE(BB) fxch %st(3) faddp %st, %st(5) FLD 5 * SIZE(AA) fmul %st, %st(3) FMUL 5 * SIZE(BB) fxch %st(3) faddp %st, %st(6) FLD 6 * SIZE(BB) fxch %st(3) faddp %st, %st(7) FLD 6 * SIZE(AA) fmul %st, %st(3) FMUL 7 * SIZE(BB) fxch %st(3) faddp %st, %st(4) FLD 6 * SIZE(BB) fxch %st(3) faddp %st, %st(5) FLD 7 * SIZE(AA) fmul %st, %st(3) FMUL 7 * SIZE(BB) fxch %st(3) faddp %st, %st(6) FLD 12 * SIZE(BB) fxch %st(3) faddp %st, %st(7) FLD 12 * SIZE(AA) fxch %st(2) subl $-8 * SIZE, BB subl $-8 * SIZE, AA decl %eax # l -- jne .MainLoop ALIGN_4 .L16: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $3, %eax je .L21 ALIGN_4 .SubLoop: fmul %st, %st(1) FMUL 1 * SIZE(BB) fxch %st(1) faddp %st, %st(4) FLD 0 * 
SIZE(BB) fxch %st(1) faddp %st, %st(5) FLD 1 * SIZE(AA) fmul %st, %st(1) FMUL 1 * SIZE(BB) fxch %st(1) faddp %st, %st(6) FLD 2 * SIZE(BB) fxch %st(1) faddp %st, %st(7) FLD 2 * SIZE(AA) addl $2 * SIZE,BB addl $2 * SIZE,AA decl %eax jne .SubLoop ALIGN_4 .L21: ffreep %st(0) ffreep %st(0) ffreep %st(0) ffreep %st(0) #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 2), AA leal (%ebx, %eax, 2), BB #endif #if defined(LN) || defined(LT) FLD 0 * SIZE(BB) fsubp %st, %st(1) FLD 1 * SIZE(BB) fsubp %st, %st(2) FLD 2 * SIZE(BB) fsubp %st, %st(3) FLD 3 * SIZE(BB) fsubp %st, %st(4) #else FLD 0 * SIZE(AA) fsubp %st, %st(1) FLD 1 * SIZE(AA) fsubp %st, %st(3) FLD 2 * SIZE(AA) fsubp %st, %st(2) FLD 3 * SIZE(AA) fsubp %st, %st(4) #endif #ifdef LN FLD 3 * SIZE(AA) fmul %st, %st(3) fmulp %st, %st(4) FLD 2 * SIZE(AA) fmul %st(3), %st FLD 2 * SIZE(AA) fmul %st(5), %st fsubrp %st, %st(3) fsubrp %st, %st(1) FLD 0 * SIZE(AA) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef LT FLD 0 * SIZE(AA) fmul %st, %st(1) fmulp %st, %st(2) FLD 1 * SIZE(AA) fmul %st(1), %st FLD 1 * SIZE(AA) fmul %st(3), %st fsubrp %st, %st(5) fsubrp %st, %st(3) FLD 3 * SIZE(AA) fmul %st, %st(3) fmulp %st, %st(4) #endif #ifdef RN FLD 0 * SIZE(BB) fmul %st, %st(1) fmulp %st, %st(3) FLD 1 * SIZE(BB) fmul %st(1), %st FLD 1 * SIZE(BB) fmul %st(4), %st fsubrp %st, %st(5) fsubrp %st, %st(2) FLD 3 * SIZE(BB) fmul %st, %st(2) fmulp %st, %st(4) #endif #ifdef RT FLD 3 * SIZE(BB) fmul %st, %st(2) fmulp %st, %st(4) FLD 2 * SIZE(BB) fmul %st(2), %st FLD 2 * SIZE(BB) fmul %st(5), %st fsubrp %st, %st(4) fsubrp %st, %st(1) FLD 0 * SIZE(BB) fmul %st, %st(1) fmulp %st, %st(3) #endif #ifdef LN subl $2 * SIZE, %edi #endif #if defined(LN) || defined(LT) FSTU 0 * SIZE(BB) fxch %st(1) FSTU 1 * SIZE(BB) fxch %st(2) FSTU 2 * SIZE(BB) fxch %st(3) FSTU 3 * SIZE(BB) FST 1 * SIZE(%edi,%ebp) FST 0 * SIZE(%edi) FST 0 * SIZE(%edi,%ebp) FST 1 * SIZE(%edi) #else FSTU 0 * SIZE(AA) fxch %st(2) FSTU 1 * SIZE(AA) fxch %st(1) FSTU 2 * SIZE(AA) fxch %st(3) FSTU 3 * SIZE(AA) FST 1 * SIZE(%edi,%ebp) FST 1 * SIZE(%edi) FST 0 * SIZE(%edi) FST 0 * SIZE(%edi,%ebp) #endif #ifndef LN addl $2 * SIZE, %edi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %esi # i -- jne .MainHead ALIGN_4 .L27: #ifdef LN movl K, %eax leal ( , %eax, SIZE), %eax leal (%ebx, %eax, 2), %ebx #endif #if defined(LT) || defined(RN) movl BB, %ebx #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j-- # MEMORY jne .L34 ALIGN_4 .L8: movl N, %eax # n # MEMORY andl $1, %eax je .End #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, %ebx #endif #ifdef RT subl %ebp, C #endif movl C, %edi # c # MEMORY #ifndef RT addl %ebp, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %eax # m # MEMORY andl $1, %eax # m & 1 je .L36 #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 1), AA leal (%ebx, %eax, 1), BB #else movl %ebx, BB #endif fldz #ifdef LN #if defined(HAVE_3DNOW) prefetchw -2 * 
SIZE(%edi) #elif defined(HAVE_SSE) prefetchnta -2 * SIZE(%edi) #endif #else #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(%edi) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(%edi) #endif #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif test %eax, %eax jle .L52 ALIGN_3 .L51: FLD (AA) FMUL (BB) addl $1 * SIZE,AA addl $1 * SIZE,BB faddp %st,%st(1) decl %eax jne .L51 ALIGN_4 .L52: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 1), AA leal (%ebx, %eax, 1), BB #endif #if defined(LN) || defined(LT) FLD 0 * SIZE(BB) fsubp %st, %st(1) #else FLD 0 * SIZE(AA) fsubp %st, %st(1) #endif #if defined(LN) || defined(LT) FMUL 0 * SIZE(AA) #else FMUL 0 * SIZE(BB) #endif #ifdef LN subl $1 * SIZE, %edi #endif #if defined(LN) || defined(LT) FSTU 0 * SIZE(BB) #else FSTU 0 * SIZE(AA) #endif FST 0 * SIZE(%edi) #ifndef LN addl $1 * SIZE, %edi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 1), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L36: movl M, %esi # m # MEMORY sarl $1, %esi # m >> 1 je .L99 ALIGN_4 .L46: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 2), AA leal (%ebx, %eax, 1), BB #else movl %ebx, BB #endif fldz fldz FLD 0 * SIZE(BB) # temp1 = *(boffset + 0) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $1, %eax je .L56 ALIGN_4 .L57: FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0) fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0) faddp %st, %st(2) FLD 1 * SIZE(BB) # temp1 = *(boffset + 0) FLD 2 * SIZE(AA) # temp2 = *(aoffset + 0) fmul %st(1), %st faddp %st, %st(2) FMUL 3 * SIZE(AA) # temp2 = *(aoffset + 0) faddp %st, %st(2) FLD 2 * SIZE(BB) # temp1 = *(boffset + 0) addl $4 * SIZE,AA addl $2 * SIZE,BB dec %eax jne .L57 ALIGN_4 .L56: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $1, %eax je .L45 ALIGN_4 FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0) fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0) faddp %st, %st(2) FLD 3 * SIZE(BB) # temp1 = *(boffset + 0) addl $2 * SIZE,AA addl $1 * SIZE,BB ALIGN_4 .L45: ffreep %st(0) #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 2), AA leal (%ebx, %eax, 1), BB #endif #if defined(LN) || defined(LT) FLD 0 * SIZE(BB) fsubp %st, %st(1) FLD 1 * SIZE(BB) fsubp %st, %st(2) #else FLD 0 * SIZE(AA) fsubp %st, %st(1) FLD 1 * SIZE(AA) fsubp %st, %st(2) #endif #ifdef LN FLD 3 * SIZE(AA) fmulp %st, %st(2) FLD 2 * SIZE(AA) fmul %st(2), %st fsubrp %st, %st(1) FLD 0 * SIZE(AA) fmulp %st, %st(1) #endif #ifdef LT FLD 0 * SIZE(AA) fmulp %st, %st(1) FLD 1 * SIZE(AA) fmul %st(1), %st fsubrp %st, %st(2) FLD 3 * SIZE(AA) fmulp %st, %st(2) #endif #ifdef RN FLD 0 * SIZE(BB) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef RT FLD 0 * SIZE(BB) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef LN subl $2 * SIZE, %edi #endif #if defined(LN) || defined(LT) FSTU 0 * SIZE(BB) fxch %st(1) FSTU 1 * SIZE(BB) #else FSTU 0 * SIZE(AA) fxch %st(1) FSTU 1 * SIZE(AA) #endif FST 1 * SIZE(%edi) FST 0 * SIZE(%edi) #ifndef LN addl 
$2 * SIZE, %edi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 1), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %esi # i -- jne .L46 ALIGN_4 .L99: #ifdef LN movl K, %eax leal (%ebx, %eax, SIZE), %ebx #endif #if defined(LT) || defined(RN) movl BB, %ebx #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .End: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_LN_2x2_atom.S000066400000000000000000000461641313527062700216410ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define ARG_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define ARG_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #define PREFETCH prefetcht0 #define PREFETCHSIZE 84 #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define CO1 %esi PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC movl OFFSET, %eax #ifdef RN negl %eax #endif movl %eax, KK leal (, LDC, SIZE), LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax sarl $1, %eax movl %eax, J jle .L30 ALIGN_2 .L10: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L20 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L25 ALIGN_4 .L22: addsd %xmm2, %xmm4 movsd 0 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 1 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulsd %xmm0, %xmm3 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 2 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 3 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 movsd 2 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 5 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 movsd 3 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 6 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 7 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: addsd %xmm2, %xmm4 movsd 0 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 1 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 movsd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: addsd %xmm2, %xmm4 addsd %xmm3, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BB), %xmm0 movsd 1 * SIZE(BB), 
%xmm1 subsd %xmm4, %xmm0 subsd %xmm5, %xmm1 #else movsd 0 * SIZE(AA), %xmm0 movsd 1 * SIZE(AA), %xmm1 subsd %xmm4, %xmm0 subsd %xmm5, %xmm1 #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(AA), %xmm7 mulsd %xmm7, %xmm0 mulsd %xmm7, %xmm1 #endif #ifdef RN movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(BB), %xmm5 movaps %xmm5, %xmm6 movsd 3 * SIZE(BB), %xmm7 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm1 mulsd %xmm7, %xmm1 #endif #ifdef RT movsd 3 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd 2 * SIZE(BB), %xmm5 movaps %xmm5, %xmm6 movsd 0 * SIZE(BB), %xmm7 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm0 mulsd %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BB) movsd %xmm1, 1 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) movsd %xmm1, 1 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC) #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L20: movl M, %ebx sarl $1, %ebx jle .L29 ALIGN_4 .L11: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 prefetcht0 3 * SIZE(CO1) xorps %xmm5, %xmm5 prefetcht0 3 * SIZE(CO1, LDC) xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addsd %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 0 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 1 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 0 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 1 * SIZE(BB), %xmm3 addsd %xmm2, %xmm6 movsd 3 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 3 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 4 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 2 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 3 * SIZE(BB), %xmm3 addsd %xmm2, %xmm6 movsd 5 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 4 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 5 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 6 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 4 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 5 * SIZE(BB), %xmm3 addsd %xmm2, %xmm6 movsd 7 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 7 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 8 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 6 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 7 * SIZE(BB), %xmm3 addl $8 * SIZE, BB addl $8 * SIZE, AA decl %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addsd %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 0 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 1 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 0 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 1 * SIZE(BB), %xmm3 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: addsd %xmm2, %xmm6 
addsd %xmm3, %xmm7 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BB), %xmm0 movsd 1 * SIZE(BB), %xmm1 movsd 2 * SIZE(BB), %xmm2 movsd 3 * SIZE(BB), %xmm3 subsd %xmm4, %xmm0 subsd %xmm5, %xmm1 subsd %xmm6, %xmm2 subsd %xmm7, %xmm3 #else movsd 0 * SIZE(AA), %xmm0 movsd 1 * SIZE(AA), %xmm2 movsd 2 * SIZE(AA), %xmm1 movsd 3 * SIZE(AA), %xmm3 subsd %xmm4, %xmm0 subsd %xmm6, %xmm2 subsd %xmm5, %xmm1 subsd %xmm7, %xmm3 #endif #ifdef LN movsd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd 2 * SIZE(AA), %xmm5 mulsd %xmm4, %xmm3 movsd 0 * SIZE(AA), %xmm7 movaps %xmm5, %xmm6 mulsd %xmm2, %xmm5 mulsd %xmm3, %xmm6 subsd %xmm5, %xmm0 subsd %xmm6, %xmm1 mulsd %xmm7, %xmm0 mulsd %xmm7, %xmm1 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(AA), %xmm5 mulsd %xmm4, %xmm1 movsd 3 * SIZE(AA), %xmm7 movaps %xmm5, %xmm6 mulsd %xmm0, %xmm5 mulsd %xmm1, %xmm6 subsd %xmm5, %xmm2 subsd %xmm6, %xmm3 mulsd %xmm7, %xmm2 mulsd %xmm7, %xmm3 #endif #ifdef RN movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(BB), %xmm5 mulsd %xmm4, %xmm2 movsd 3 * SIZE(BB), %xmm7 movaps %xmm5, %xmm6 mulsd %xmm0, %xmm5 mulsd %xmm2, %xmm6 subsd %xmm5, %xmm1 subsd %xmm6, %xmm3 mulsd %xmm7, %xmm1 mulsd %xmm7, %xmm3 #endif #ifdef RT movsd 3 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd 2 * SIZE(BB), %xmm5 mulsd %xmm4, %xmm3 movsd 0 * SIZE(BB), %xmm7 movaps %xmm5, %xmm6 mulsd %xmm1, %xmm5 mulsd %xmm3, %xmm6 subsd %xmm5, %xmm0 subsd %xmm6, %xmm2 mulsd %xmm7, %xmm0 mulsd %xmm7, %xmm2 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BB) movsd %xmm1, 1 * SIZE(BB) movsd %xmm2, 2 * SIZE(BB) movsd %xmm3, 3 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) movsd %xmm2, 1 * SIZE(AA) movsd %xmm1, 2 * SIZE(AA) movsd %xmm3, 3 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm2, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC) movsd %xmm3, 1 * SIZE(CO1, LDC) #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L11 ALIGN_4 .L29: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L10 ALIGN_4 .L30: testl $1, N je .L999 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L40 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 movsd 0 * SIZE(BB), %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je 
.L45 ALIGN_4 .L42: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 1 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addsd %xmm2, %xmm5 movsd 2 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 movsd 3 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 3 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addsd %xmm2, %xmm5 movsd 4 * SIZE(BB), %xmm2 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 1 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: addsd %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax addl %eax, AA leal (B, %eax, 1), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BB), %xmm0 subsd %xmm4, %xmm0 #else movsd 0 * SIZE(AA), %xmm0 subsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) mulsd 0 * SIZE(AA), %xmm0 #endif #if defined(RN) || defined(RT) mulsd 0 * SIZE(BB), %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax addl %eax, AA addl %eax, BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L40: movl M, %ebx sarl $1, %ebx jle .L49 ALIGN_4 .L31: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movsd 0 * SIZE(BB), %xmm1 xorps %xmm0, %xmm0 prefetcht0 3 * SIZE(CO1) xorps %xmm2, %xmm2 xorps %xmm4, %xmm4 xorps %xmm6, %xmm6 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addsd %xmm0, %xmm4 movsd 0 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 1 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 3 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 2 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 4 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 5 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 3 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 6 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 7 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 4 * SIZE(BB), %xmm1 addl $8 * SIZE, AA addl $4 * SIZE, BB decl %eax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: addsd %xmm0, %xmm4 movsd 0 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 1 * SIZE(BB), %xmm1 addl $2 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: addsd %xmm0, %xmm4 addsd %xmm2, %xmm6 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal 
(B, %eax, 1), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BB), %xmm0 movsd 1 * SIZE(BB), %xmm2 subsd %xmm4, %xmm0 subsd %xmm6, %xmm2 #else movsd 0 * SIZE(AA), %xmm0 movsd 1 * SIZE(AA), %xmm2 subsd %xmm4, %xmm0 subsd %xmm6, %xmm2 #endif #ifdef LN movsd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd 2 * SIZE(AA), %xmm5 mulsd %xmm2, %xmm5 movsd 0 * SIZE(AA), %xmm7 subsd %xmm5, %xmm0 mulsd %xmm7, %xmm0 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(AA), %xmm5 mulsd %xmm0, %xmm5 movsd 3 * SIZE(AA), %xmm7 subsd %xmm5, %xmm2 mulsd %xmm7, %xmm2 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 mulsd %xmm4, %xmm2 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BB) movsd %xmm2, 1 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) movsd %xmm2, 1 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm2, 1 * SIZE(CO1) #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA addl %eax, BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L31 ALIGN_4 .L49: #ifdef LN movl K, %eax leal (B, %eax, SIZE), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_LN_2x4_penryn.S000066400000000000000000001052201313527062700222030ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define ARG_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define ARG_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 2) #endif #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define CO1 %esi PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC movl OFFSET, %eax #ifdef RN negl %eax #endif movl %eax, KK leal (, LDC, SIZE), LDC subl $-16 * SIZE, A subl $-16 * SIZE, B #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax sarl $2, %eax movl %eax, J jle .L30 ALIGN_4 .L10: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 4), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx testl $1, %ebx jle .L20 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif movsd -16 * SIZE(AA), %xmm0 movhps -15 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm2 pxor %xmm5, %xmm5 movaps -14 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps -12 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps -10 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -14 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps -8 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps -6 * SIZE(BB), %xmm3 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps -4 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps -2 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -12 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps 0 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps 2 * SIZE(BB), %xmm3 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps 6 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -10 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps 8 * 
SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps 10 * SIZE(BB), %xmm3 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps 12 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps 14 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -8 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps 18 * SIZE(BB), %xmm3 subl $ -8 * SIZE, AA subl $-32 * SIZE, BB subl $1, %eax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH je .L28 .L26: pshufd $0x44, %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps -12 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps -10 * SIZE(BB), %xmm3 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $4, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BB), %xmm0 movapd -14 * SIZE(BB), %xmm1 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 #else movapd -16 * SIZE(AA), %xmm1 movapd -14 * SIZE(AA), %xmm3 subpd %xmm4, %xmm1 subpd %xmm5, %xmm3 movapd %xmm1, %xmm0 unpckhpd %xmm1, %xmm1 movapd %xmm3, %xmm2 unpckhpd %xmm3, %xmm3 #endif #ifdef LN movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #ifdef LT movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #ifdef RN movsd -16 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 movsd -15 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movsd -14 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm2 movsd -13 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm3 movsd -11 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd -10 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm2 movsd -9 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm3 movsd -6 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm2 movsd -5 * SIZE(BB), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm3 movsd -1 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm3 #endif #ifdef RT movsd -1 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm3 movsd -2 * SIZE(BB), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm2 movsd -3 * SIZE(BB), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm1 movsd -4 * SIZE(BB), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm0 movsd -6 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm2 movsd -7 * SIZE(BB), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm1 movsd -8 * SIZE(BB), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm0 movsd -11 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd -12 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movsd -16 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) #else movsd %xmm0, -16 * SIZE(AA) movsd %xmm1, -15 * SIZE(AA) movsd %xmm2, -14 * SIZE(AA) movsd %xmm3, -13 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) movsd %xmm1, 0 * SIZE(CO1, LDC, 2) movhpd %xmm1, 0 * SIZE(CO1, %eax, 1) #else movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif #ifdef LN subl 
$1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L20: movl M, %ebx sarl $1, %ebx jle .L29 ALIGN_4 .L11: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif leal (CO1, LDC, 2), %eax movaps -16 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -16 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 #ifdef LN pxor %xmm4, %xmm4 prefetcht0 -2 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 -2 * SIZE(CO1, LDC) pxor %xmm6, %xmm6 prefetcht0 -2 * SIZE(%eax) pxor %xmm7, %xmm7 prefetcht0 -2 * SIZE(%eax, LDC) #else pxor %xmm4, %xmm4 prefetcht0 1 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 1 * SIZE(CO1, LDC) pxor %xmm6, %xmm6 prefetcht0 1 * SIZE(%eax) pxor %xmm7, %xmm7 prefetcht0 1 * SIZE(%eax, LDC) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addpd %xmm3, %xmm7 movaps -14 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps -12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps -10 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps -8 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps -6 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps -4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps -2 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 0 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) addpd %xmm3, %xmm7 movaps 2 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps 6 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 8 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps 10 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps 14 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 16 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 subl $-32 * SIZE, BB mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 subl $-16 * SIZE, AA subl $1, 
%eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addpd %xmm3, %xmm7 movaps -14 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps -12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $4, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 4), BB #endif addpd %xmm2, %xmm6 addpd %xmm3, %xmm7 movaps %xmm4, %xmm0 movsd %xmm5, %xmm4 movsd %xmm0, %xmm5 movaps %xmm6, %xmm0 movsd %xmm7, %xmm6 movsd %xmm0, %xmm7 #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd %xmm6, %xmm1 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm1 movapd -16 * SIZE(BB), %xmm2 movapd -14 * SIZE(BB), %xmm5 movapd -12 * SIZE(BB), %xmm3 movapd -10 * SIZE(BB), %xmm7 subpd %xmm4, %xmm2 subpd %xmm6, %xmm5 subpd %xmm0, %xmm3 subpd %xmm1, %xmm7 #else movapd -16 * SIZE(AA), %xmm0 movapd -14 * SIZE(AA), %xmm1 movapd -12 * SIZE(AA), %xmm2 movapd -10 * SIZE(AA), %xmm3 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 subpd %xmm6, %xmm2 subpd %xmm7, %xmm3 #endif #ifdef LN movddup -13 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 mulpd %xmm4, %xmm7 movddup -14 * SIZE(AA), %xmm4 movapd %xmm4, %xmm6 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 mulpd %xmm7, %xmm6 subpd %xmm6, %xmm5 movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 #endif #ifdef LT movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 movddup -15 * SIZE(AA), %xmm4 movapd %xmm4, %xmm6 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 mulpd %xmm5, %xmm6 subpd %xmm6, %xmm7 movddup -13 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 mulpd %xmm4, %xmm7 #endif #ifdef RN movddup -16 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 movddup -15 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movddup -14 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm2 movddup -13 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm3 movddup -11 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 movddup -10 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm2 movddup -9 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm3 movddup -6 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm2 movddup -5 * SIZE(BB), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 movddup -1 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm3 #endif #ifdef RT movddup -1 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm3 movddup -2 * SIZE(BB), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 movddup -3 * SIZE(BB), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm1 movddup -4 * SIZE(BB), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm0 movddup -6 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm2 movddup -7 * SIZE(BB), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm1 movddup -8 * SIZE(BB), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm0 movddup -11 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 movddup -12 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movddup -16 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, -16 * SIZE(BB) movapd %xmm5, -14 * SIZE(BB) movapd %xmm3, -12 * SIZE(BB) movapd %xmm7, -10 * SIZE(BB) #else movapd %xmm0, -16 * SIZE(AA) movapd %xmm1, -14 * SIZE(AA) movapd %xmm2, -12 * SIZE(AA) movapd %xmm3, -10 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif leal (LDC, LDC, 2), 
%eax #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movsd %xmm3, 1 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) movsd %xmm5, 0 * SIZE(CO1, LDC, 2) movsd %xmm7, 1 * SIZE(CO1, LDC, 2) movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO1, %eax, 1) movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L11 ALIGN_4 .L29: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 4), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $4, KK #endif #ifdef RT subl $4, KK #endif decl J # j -- jg .L10 ALIGN_4 .L30: testl $2, N je .L60 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx testl $1, %ebx jle .L50 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movsd -16 * SIZE(AA), %xmm0 movhps -15 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd -16 * SIZE(BB), %xmm2 movhps -15 * SIZE(BB), %xmm2 pxor %xmm5, %xmm5 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L55 ALIGN_4 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -14 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -14 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps -12 * SIZE(BB), %xmm2 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -10 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -12 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps -8 * SIZE(BB), %xmm2 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -6 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -10 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps -4 * SIZE(BB), %xmm2 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -2 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -8 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps 0 * SIZE(BB), %xmm2 subl $ -8 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH je .L58 .L56: pshufd $0x44, %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -14 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: addpd %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), 
%eax addl %eax, AA leal (B, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BB), %xmm0 subpd %xmm4, %xmm0 #else movapd -16 * SIZE(AA), %xmm1 subpd %xmm4, %xmm1 movapd %xmm1, %xmm0 unpckhpd %xmm1, %xmm1 #endif #ifdef LN movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef LT movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef RN movsd -16 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 movsd -15 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movsd -13 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 #endif #ifdef RT movsd -13 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd -14 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movsd -16 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(BB) #else movsd %xmm0, -16 * SIZE(AA) movsd %xmm1, -15 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) #else movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L50: movl M, %ebx sarl $1, %ebx jle .L59 ALIGN_4 .L41: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 #ifdef LN prefetcht0 -2 * SIZE(CO1) pxor %xmm6, %xmm6 prefetcht0 -2 * SIZE(CO1, LDC) pxor %xmm7, %xmm7 #else prefetcht0 1 * SIZE(CO1) pxor %xmm6, %xmm6 prefetcht0 1 * SIZE(CO1, LDC) pxor %xmm7, %xmm7 #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -14 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps -12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -10 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps -8 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -6 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps -4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -2 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps 0 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 subl $-16 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne 
.L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -14 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 movaps %xmm4, %xmm0 movsd %xmm5, %xmm4 movsd %xmm0, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd -16 * SIZE(BB), %xmm2 movapd -14 * SIZE(BB), %xmm3 subpd %xmm4, %xmm2 subpd %xmm0, %xmm3 #else movapd -16 * SIZE(AA), %xmm0 movapd -14 * SIZE(AA), %xmm1 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 #endif #ifdef LN movddup -13 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 movddup -14 * SIZE(AA), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 #endif #ifdef LT movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 movddup -15 * SIZE(AA), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 movddup -13 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 #endif #ifdef RN movddup -16 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 movddup -15 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movddup -13 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 #endif #ifdef RT movddup -13 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 movddup -14 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movddup -16 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, -16 * SIZE(BB) movapd %xmm3, -14 * SIZE(BB) #else movapd %xmm0, -16 * SIZE(AA) movapd %xmm1, -14 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movsd %xmm3, 1 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L41 ALIGN_4 .L59: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif ALIGN_4 .L60: testl $1, N je .L999 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx testl $1, %ebx jle .L80 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movsd -16 * SIZE(AA), %xmm0 movhps -15 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd -16 * SIZE(BB), %xmm2 movhps -16 * SIZE(BB), %xmm2 pxor %xmm5, %xmm5 #if 
defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L85 ALIGN_4 .L82: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 movaps -14 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movaps -12 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 movaps -10 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movaps -8 * SIZE(BB), %xmm2 subl $-8 * SIZE, AA subl $-8 * SIZE, BB decl %eax jne .L82 ALIGN_4 .L85: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH je .L88 .L86: mulsd %xmm0, %xmm2 movsd -15 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd -15 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L86 ALIGN_4 .L88: addpd %xmm5, %xmm4 haddpd %xmm4, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax addl %eax, AA leal (B, %eax, 1), BB #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(BB), %xmm0 subsd %xmm4, %xmm0 #else movsd -16 * SIZE(AA), %xmm0 subsd %xmm4, %xmm0 #endif #ifdef LN movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef LT movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef RN movsd -16 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef RT movsd -16 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, -16 * SIZE(BB) #else movsd %xmm0, -16 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) #else movsd %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax addl %eax, AA addl %eax, BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L80: movl M, %ebx sarl $1, %ebx jle .L89 ALIGN_4 .L71: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 #ifdef LN prefetcht0 -2 * SIZE(CO1) #else prefetcht0 1 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -14 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -12 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -10 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -8 * SIZE(BB), %xmm1 mulpd 
%xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 subl $-16 * SIZE, AA subl $ -8 * SIZE, BB subl $1, %eax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: pshufd $0x44, %xmm1, %xmm2 movsd -15 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 addl $2 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: addpd %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BB), %xmm1 subpd %xmm4, %xmm1 movapd %xmm1, %xmm0 unpckhpd %xmm1, %xmm1 #else movapd -16 * SIZE(AA), %xmm0 subpd %xmm4, %xmm0 #endif #ifdef LN movsd -13 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 movsd -14 * SIZE(AA), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef LT movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd -15 * SIZE(AA), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movsd -13 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 #endif #ifdef RN movddup -16 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef RT movddup -16 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, -16 * SIZE(BB) movsd %xmm1, -15 * SIZE(BB) #else movapd %xmm0, -16 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 1 * SIZE(CO1) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA addl %eax, BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L71 ALIGN_4 .L89: #ifdef LN movl K, %eax leal (B, %eax, SIZE), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_LN_2x4_sse2.S000066400000000000000000001353511313527062700215540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define OLD_M 4 + STACK + ARGS(%esi) #define OLD_N 8 + STACK + ARGS(%esi) #define OLD_K 12 + STACK + ARGS(%esi) #define OLD_ALPHA 16 + STACK + ARGS(%esi) #define OLD_A 24 + STACK + ARGS(%esi) #define OLD_B 28 + STACK + ARGS(%esi) #define OLD_C 32 + STACK + ARGS(%esi) #define OLD_LDC 36 + STACK + ARGS(%esi) #define OLD_OFFT 40 + STACK + ARGS(%esi) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define AORIG 56(%esp) #define BORIG 60(%esp) #define BUFFER 128(%esp) #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif #define B %edi #define AA %edx #define BB %ecx #define LDC %ebp #define CO1 %esi #define KERNEL1(address) \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm4; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm3, %xmm6; \ movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm0, %xmm7; \ movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ 
mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm3, %xmm6; \ movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm0, %xmm7; \ movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm2, %xmm6; \ movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm1, %xmm7; \ movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm2, %xmm6; \ movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm1, %xmm7; \ movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp addl $STACK_OFFSET, %esp STACK_TOUCHING movl OLD_M, %ebx movl OLD_N, %eax movl OLD_K, %ecx movl OLD_A, %edx movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movd OLD_OFFT, %mm4 movl OLD_B, B movl OLD_C, %ebx movl %ebx, C movl OLD_LDC, LDC movd %mm4, OFFSET movd %mm4, KK leal (, LDC, SIZE), LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax sarl $2, %eax movl %eax, J jle .L30 ALIGN_2 .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG leal (, %eax, SIZE), %eax leal (B, %eax, 4), B leal (BB, %eax, 8), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $1, %eax jle .L05 ALIGN_4 .L02: #define COPYPREFETCH 40 prefetchnta (COPYPREFETCH) * SIZE(B) movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq 2 * SIZE(B), %mm2 movq 3 * SIZE(B), %mm3 movq 4 * SIZE(B), %mm4 
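/* .L02 copy loop: each element of the B panel is written twice to BUFFER, so the
   compute kernels below can fetch a duplicated {b, b} pair with one aligned movapd. */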
movq 5 * SIZE(B), %mm5 movq 6 * SIZE(B), %mm6 movq 7 * SIZE(B), %mm7 movq %mm0, 0 * SIZE(BB) movq %mm0, 1 * SIZE(BB) movq %mm1, 2 * SIZE(BB) movq %mm1, 3 * SIZE(BB) movq %mm2, 4 * SIZE(BB) movq %mm2, 5 * SIZE(BB) movq %mm3, 6 * SIZE(BB) movq %mm3, 7 * SIZE(BB) movq %mm4, 8 * SIZE(BB) movq %mm4, 9 * SIZE(BB) movq %mm5, 10 * SIZE(BB) movq %mm5, 11 * SIZE(BB) movq %mm6, 12 * SIZE(BB) movq %mm6, 13 * SIZE(BB) movq %mm7, 14 * SIZE(BB) movq %mm7, 15 * SIZE(BB) addl $ 8 * SIZE, B addl $16 * SIZE, BB decl %eax jne .L02 ALIGN_2 .L05: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $1, %eax BRANCH jle .L10 movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq 2 * SIZE(B), %mm2 movq 3 * SIZE(B), %mm3 movq %mm0, 0 * SIZE(BB) movq %mm0, 1 * SIZE(BB) movq %mm1, 2 * SIZE(BB) movq %mm1, 3 * SIZE(BB) movq %mm2, 4 * SIZE(BB) movq %mm2, 5 * SIZE(BB) movq %mm3, 6 * SIZE(BB) movq %mm3, 7 * SIZE(BB) addl $4 * SIZE, B ALIGN_4 .L10: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 4), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L20 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $3 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movlpd 0 * SIZE(AA), %xmm0 movlpd 4 * SIZE(AA), %xmm1 movlpd 0 * SIZE(BB), %xmm2 movlpd 8 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L25 ALIGN_4 .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm5 movlpd 4 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm2, %xmm6 movlpd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm7 movlpd 1 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm4 movlpd 10 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm5 movlpd 12 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm3 mulsd 14 * SIZE(BB), %xmm0 addsd %xmm3, %xmm6 movlpd 24 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 2 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 movlpd 18 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm5 movlpd 20 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 mulsd 22 * SIZE(BB), %xmm0 addsd %xmm2, %xmm6 movlpd 32 * SIZE(BB), %xmm2 addsd %xmm0, %xmm7 movlpd 3 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm4 movlpd 26 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm5 movlpd 28 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm3 mulsd 30 * SIZE(BB), %xmm0 addsd %xmm3, %xmm6 movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 addsd %xmm2, %xmm4 movlpd 34 * SIZE(BB), %xmm2 mulsd %xmm1, %xmm2 addsd %xmm2, %xmm5 movlpd 36 * SIZE(BB), %xmm2 mulsd %xmm1, %xmm2 mulsd 38 * SIZE(BB), %xmm1 addsd %xmm2, %xmm6 movlpd 48 * SIZE(BB), %xmm2 addsd %xmm1, %xmm7 movlpd 5 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 addsd %xmm3, %xmm4 movlpd 42 * SIZE(BB), %xmm3 mulsd %xmm1, %xmm3 addsd %xmm3, %xmm5 movlpd 44 * SIZE(BB), %xmm3 mulsd %xmm1, 
%xmm3 mulsd 46 * SIZE(BB), %xmm1 addsd %xmm3, %xmm6 movlpd 56 * SIZE(BB), %xmm3 addsd %xmm1, %xmm7 movlpd 6 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm2 addsd %xmm2, %xmm4 movlpd 50 * SIZE(BB), %xmm2 mulsd %xmm1, %xmm2 addsd %xmm2, %xmm5 movlpd 52 * SIZE(BB), %xmm2 mulsd %xmm1, %xmm2 mulsd 54 * SIZE(BB), %xmm1 addsd %xmm2, %xmm6 movlpd 64 * SIZE(BB), %xmm2 addsd %xmm1, %xmm7 movlpd 7 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 addsd %xmm3, %xmm4 movlpd 58 * SIZE(BB), %xmm3 mulsd %xmm1, %xmm3 addsd %xmm3, %xmm5 movlpd 60 * SIZE(BB), %xmm3 mulsd %xmm1, %xmm3 mulsd 62 * SIZE(BB), %xmm1 addsd %xmm3, %xmm6 movlpd 72 * SIZE(BB), %xmm3 addl $64 * SIZE, BB addsd %xmm1, %xmm7 movlpd 12 * SIZE(AA), %xmm1 addl $8 * SIZE, AA decl %eax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L28 .L26: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 movlpd 2 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm5 movlpd 4 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm2, %xmm6 movlpd 8 * SIZE(BB), %xmm2 addsd %xmm0, %xmm7 movlpd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $4, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax addl %eax, AA leal (B, %eax, 4), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) unpcklpd %xmm5, %xmm4 unpcklpd %xmm7, %xmm6 movapd 0 * SIZE(B), %xmm2 movapd 2 * SIZE(B), %xmm5 subpd %xmm4, %xmm2 subpd %xmm6, %xmm5 #else movlpd 0 * SIZE(AA), %xmm0 movlpd 1 * SIZE(AA), %xmm1 movlpd 2 * SIZE(AA), %xmm2 movlpd 3 * SIZE(AA), %xmm3 subsd %xmm4, %xmm0 subsd %xmm5, %xmm1 subsd %xmm6, %xmm2 subsd %xmm7, %xmm3 #endif #ifdef LN movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 #endif #ifdef LT movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm4 mulsd %xmm4, %xmm0 movlpd 1 * SIZE(B), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movlpd 2 * SIZE(B), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm2 movlpd 3 * SIZE(B), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm3 movlpd 5 * SIZE(B), %xmm4 mulsd %xmm4, %xmm1 movlpd 6 * SIZE(B), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm2 movlpd 7 * SIZE(B), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm3 movlpd 10 * SIZE(B), %xmm4 mulsd %xmm4, %xmm2 movlpd 11 * SIZE(B), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm3 movlpd 15 * SIZE(B), %xmm4 mulsd %xmm4, %xmm3 #endif #ifdef RT movlpd 15 * SIZE(B), %xmm4 mulsd %xmm4, %xmm3 movlpd 14 * SIZE(B), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm2 movlpd 13 * SIZE(B), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm1 movlpd 12 * SIZE(B), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm0 movlpd 10 * SIZE(B), %xmm4 mulsd %xmm4, %xmm2 movlpd 9 * SIZE(B), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm1 movlpd 8 * SIZE(B), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm0 movlpd 5 * SIZE(B), %xmm4 mulsd %xmm4, %xmm1 movlpd 4 * SIZE(B), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movlpd 0 * SIZE(B), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movapd %xmm5, 2 * SIZE(B) movlpd %xmm2, 0 * SIZE(BB) movlpd %xmm2, 1 * SIZE(BB) movhpd %xmm2, 2 * SIZE(BB) movhpd %xmm2, 3 * SIZE(BB) movlpd %xmm5, 4 * SIZE(BB) movlpd %xmm5, 5 * SIZE(BB) movhpd %xmm5, 6 * SIZE(BB) movhpd %xmm5, 7 * SIZE(BB) #else movlpd %xmm0, 0 * SIZE(AA) movlpd %xmm1, 1 * SIZE(AA) 
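/* the solved 1x4 tile is written back to the packed panels here (B plus the
   duplicated BUFFER copy for LN/LT, or the packed A for RN/RT) before the
   final stores to C. */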
movlpd %xmm2, 2 * SIZE(AA) movlpd %xmm3, 3 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) movlpd %xmm5, 0 * SIZE(CO1, LDC, 2) movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) #else movlpd %xmm0, 0 * SIZE(CO1) movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) movlpd %xmm2, 0 * SIZE(CO1, LDC, 2) movlpd %xmm3, 0 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA,%eax, SIZE), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L20: movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L29 ALIGN_4 .L11: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $3 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movapd 0 * SIZE(AA), %xmm0 movapd 8 * SIZE(AA), %xmm1 movapd 0 * SIZE(BB), %xmm2 movapd 8 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifdef LN prefetchw -2 * SIZE(CO1) prefetchw -2 * SIZE(CO1, LDC) prefetchw -2 * SIZE(CO1, LDC, 2) prefetchw -2 * SIZE(CO1, %eax) #else prefetchw 1 * SIZE(CO1) prefetchw 1 * SIZE(CO1, LDC) prefetchw 1 * SIZE(CO1, LDC, 2) prefetchw 1 * SIZE(CO1, %eax) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif #if 1 andl $-8, %eax sall $4, %eax je .L15 .L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) cmpl $128 * 1, %eax jle .L12 KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) cmpl $128 * 2, %eax jle .L12 KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) cmpl $128 * 3, %eax jle .L12 KERNEL1(16 * 3) KERNEL2(16 * 3) KERNEL3(16 * 3) KERNEL4(16 * 3) KERNEL5(16 * 3) KERNEL6(16 * 3) KERNEL7(16 * 3) KERNEL8(16 * 3) cmpl $128 * 4, %eax jle .L12 KERNEL1(16 * 4) KERNEL2(16 * 4) KERNEL3(16 * 4) KERNEL4(16 * 4) KERNEL5(16 * 4) KERNEL6(16 * 4) KERNEL7(16 * 4) KERNEL8(16 * 4) cmpl $128 * 5, %eax jle .L12 KERNEL1(16 * 5) KERNEL2(16 * 5) KERNEL3(16 * 5) KERNEL4(16 * 5) KERNEL5(16 * 5) KERNEL6(16 * 5) KERNEL7(16 * 5) KERNEL8(16 * 5) cmpl $128 * 6, %eax jle .L12 KERNEL1(16 * 6) KERNEL2(16 * 6) KERNEL3(16 * 6) KERNEL4(16 * 6) KERNEL5(16 * 6) KERNEL6(16 * 6) KERNEL7(16 * 6) KERNEL8(16 * 6) cmpl $128 * 7, %eax jle .L12 KERNEL1(16 * 7) KERNEL2(16 * 7) KERNEL3(16 * 7) KERNEL4(16 * 7) KERNEL5(16 * 7) KERNEL6(16 * 7) KERNEL7(16 * 7) KERNEL8(16 * 7) addl $128 * 4 * SIZE, BB addl $128 * 1 * SIZE, AA subl $128 * 8, %eax jg .L1X jmp .L15 .L12: leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB ALIGN_4 #else sarl $3, %eax je .L15 ALIGN_4 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $64 * SIZE, BB addl $16 * SIZE, AA decl %eax jne .L12 ALIGN_4 #endif .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd 2 * 
SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movapd 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 8 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $4, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 4), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd %xmm6, %xmm1 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm1 movapd 0 * SIZE(B), %xmm2 movapd 2 * SIZE(B), %xmm5 movapd 4 * SIZE(B), %xmm3 movapd 6 * SIZE(B), %xmm7 subpd %xmm4, %xmm2 subpd %xmm6, %xmm5 subpd %xmm0, %xmm3 subpd %xmm1, %xmm7 #else movapd 0 * SIZE(AA), %xmm0 movapd 2 * SIZE(AA), %xmm1 movapd 4 * SIZE(AA), %xmm2 movapd 6 * SIZE(AA), %xmm3 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 subpd %xmm6, %xmm2 subpd %xmm7, %xmm3 #endif #ifdef LN movlpd 3 * SIZE(AA), %xmm4 movhpd 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 mulpd %xmm4, %xmm7 movlpd 2 * SIZE(AA), %xmm4 movhpd 2 * SIZE(AA), %xmm4 movapd %xmm4, %xmm6 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 mulpd %xmm7, %xmm6 subpd %xmm6, %xmm5 movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 #endif #ifdef LT movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 movlpd 1 * SIZE(AA), %xmm4 movhpd 1 * SIZE(AA), %xmm4 movapd %xmm4, %xmm6 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 mulpd %xmm5, %xmm6 subpd %xmm6, %xmm7 movlpd 3 * SIZE(AA), %xmm4 movhpd 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 mulpd %xmm4, %xmm7 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 movlpd 1 * SIZE(B), %xmm4 movhpd 1 * SIZE(B), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movlpd 2 * SIZE(B), %xmm4 movhpd 2 * SIZE(B), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm2 movlpd 3 * SIZE(B), %xmm4 movhpd 3 * SIZE(B), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm3 movlpd 5 * SIZE(B), %xmm4 movhpd 5 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 movlpd 6 * SIZE(B), %xmm4 movhpd 6 * SIZE(B), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm2 movlpd 7 * SIZE(B), %xmm4 movhpd 7 * SIZE(B), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm3 movlpd 10 * SIZE(B), %xmm4 movhpd 10 * SIZE(B), %xmm4 mulpd %xmm4, %xmm2 movlpd 11 * SIZE(B), %xmm4 movhpd 11 * SIZE(B), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 movlpd 15 * SIZE(B), %xmm4 movhpd 15 * SIZE(B), %xmm4 mulpd %xmm4, %xmm3 #endif #ifdef RT movlpd 15 * SIZE(B), %xmm4 movhpd 15 * SIZE(B), %xmm4 mulpd %xmm4, %xmm3 movlpd 14 * SIZE(B), %xmm4 movhpd 14 * SIZE(B), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 movlpd 13 * SIZE(B), %xmm4 movhpd 13 * SIZE(B), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm1 movlpd 12 * SIZE(B), %xmm4 movhpd 12 * SIZE(B), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm0 movlpd 10 * SIZE(B), %xmm4 movhpd 10 * SIZE(B), %xmm4 mulpd %xmm4, %xmm2 movlpd 9 * SIZE(B), %xmm4 movhpd 9 * SIZE(B), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm1 movlpd 8 * SIZE(B), %xmm4 movhpd 8 * SIZE(B), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm0 movlpd 5 * SIZE(B), %xmm4 movhpd 5 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 movlpd 4 * SIZE(B), %xmm4 movhpd 4 * SIZE(B), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movlpd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movapd %xmm5, 2 * SIZE(B) movapd 
%xmm3, 4 * SIZE(B) movapd %xmm7, 6 * SIZE(B) movlpd %xmm2, 0 * SIZE(BB) movlpd %xmm2, 1 * SIZE(BB) movhpd %xmm2, 2 * SIZE(BB) movhpd %xmm2, 3 * SIZE(BB) movlpd %xmm5, 4 * SIZE(BB) movlpd %xmm5, 5 * SIZE(BB) movhpd %xmm5, 6 * SIZE(BB) movhpd %xmm5, 7 * SIZE(BB) movlpd %xmm3, 8 * SIZE(BB) movlpd %xmm3, 9 * SIZE(BB) movhpd %xmm3, 10 * SIZE(BB) movhpd %xmm3, 11 * SIZE(BB) movlpd %xmm7, 12 * SIZE(BB) movlpd %xmm7, 13 * SIZE(BB) movhpd %xmm7, 14 * SIZE(BB) movhpd %xmm7, 15 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) movapd %xmm1, 2 * SIZE(AA) movapd %xmm2, 4 * SIZE(AA) movapd %xmm3, 6 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(CO1) movlpd %xmm3, 1 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) movlpd %xmm5, 0 * SIZE(CO1, LDC, 2) movlpd %xmm7, 1 * SIZE(CO1, LDC, 2) movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) #else movlpd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) movlpd %xmm2, 0 * SIZE(CO1, LDC, 2) movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) movlpd %xmm3, 0 * SIZE(CO1, %eax, 1) movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L11 ALIGN_4 .L29: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 4), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 4), B #endif #ifdef RN addl $4, KK #endif #ifdef RT subl $4, KK #endif decl J # j -- jg .L01 ALIGN_4 .L30: testl $2, N je .L60 #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG leal (, %eax, SIZE), %eax leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L35 ALIGN_4 .L32: #define COPYPREFETCH 40 prefetchnta (COPYPREFETCH) * SIZE(B) movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq 2 * SIZE(B), %mm2 movq 3 * SIZE(B), %mm3 movq 4 * SIZE(B), %mm4 movq 5 * SIZE(B), %mm5 movq 6 * SIZE(B), %mm6 movq 7 * SIZE(B), %mm7 movq %mm0, 0 * SIZE(BB) movq %mm0, 1 * SIZE(BB) movq %mm1, 2 * SIZE(BB) movq %mm1, 3 * SIZE(BB) movq %mm2, 4 * SIZE(BB) movq %mm2, 5 * SIZE(BB) movq %mm3, 6 * SIZE(BB) movq %mm3, 7 * SIZE(BB) movq %mm4, 8 * SIZE(BB) movq %mm4, 9 * SIZE(BB) movq %mm5, 10 * SIZE(BB) movq %mm5, 11 * SIZE(BB) movq %mm6, 12 * SIZE(BB) movq %mm6, 13 * SIZE(BB) movq %mm7, 14 * SIZE(BB) movq %mm7, 15 * SIZE(BB) addl $ 8 * SIZE, B addl $16 * SIZE, BB decl %eax jne .L32 ALIGN_2 .L35: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L40 ALIGN_2 .L36: movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq %mm0, 0 * SIZE(BB) movq %mm0, 1 * SIZE(BB) movq %mm1, 2 * SIZE(BB) movq %mm1, 3 * SIZE(BB) addl $2 * SIZE, B addl $4 * SIZE, BB decl %eax jne .L36 ALIGN_4 .L40: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 2), %eax 
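/* N & 2 block, C pointer setup (descriptive comment; the register roles are read off the surrounding code): %eax holds two columns' worth of LDC bytes, so the RT path steps the C pointer back before CO1 is taken, while the other paths set CO1 first and then advance C past the two-column block. */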
#ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L50 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movlpd 0 * SIZE(AA), %xmm0 movlpd 4 * SIZE(AA), %xmm1 movlpd 0 * SIZE(BB), %xmm2 movlpd 8 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L55 ALIGN_4 .L52: mulsd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movlpd 4 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movlpd 1 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm2 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm2, %xmm6 movlpd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm7 movlpd 2 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd 10 * SIZE(BB), %xmm0 addsd %xmm3, %xmm4 movlpd 12 * SIZE(BB), %xmm3 addsd %xmm0, %xmm5 movlpd 3 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd 14 * SIZE(BB), %xmm0 addsd %xmm3, %xmm6 movlpd 24 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 mulsd %xmm1, %xmm2 mulsd 18 * SIZE(BB), %xmm1 addsd %xmm2, %xmm4 movlpd 20 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 movlpd 5 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm2 mulsd 22 * SIZE(BB), %xmm1 addsd %xmm2, %xmm6 movlpd 32 * SIZE(BB), %xmm2 addsd %xmm1, %xmm7 movlpd 6 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 mulsd 26 * SIZE(BB), %xmm1 addsd %xmm3, %xmm4 movlpd 28 * SIZE(BB), %xmm3 addsd %xmm1, %xmm5 movlpd 7 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 mulsd 30 * SIZE(BB), %xmm1 addsd %xmm3, %xmm6 movlpd 40 * SIZE(BB), %xmm3 addsd %xmm1, %xmm7 movlpd 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L58 .L56: mulsd %xmm0, %xmm2 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movlpd 4 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movlpd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: addsd %xmm6, %xmm4 addsd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax addl %eax, AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) unpcklpd %xmm5, %xmm4 movapd 0 * SIZE(B), %xmm2 subpd %xmm4, %xmm2 #else movlpd 0 * SIZE(AA), %xmm0 movlpd 1 * SIZE(AA), %xmm1 subsd %xmm4, %xmm0 subsd %xmm5, %xmm1 #endif #ifdef LN movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 #endif #ifdef LT movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm4 mulsd %xmm4, %xmm0 movlpd 1 * SIZE(B), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movlpd 3 * SIZE(B), %xmm4 mulsd %xmm4, %xmm1 #endif #ifdef RT movlpd 3 * SIZE(B), %xmm4 mulsd %xmm4, %xmm1 movlpd 2 * SIZE(B), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movlpd 0 * SIZE(B), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movlpd %xmm2, 0 * SIZE(BB) movlpd %xmm2, 1 * SIZE(BB) movhpd %xmm2, 2 * SIZE(BB) movhpd %xmm2, 3 * SIZE(BB) #else movlpd %xmm0, 0 * SIZE(AA) movlpd %xmm1, 1 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) 
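/* Store of the solved 1x2 result to C: in the LN/LT branch the low and high halves of %xmm2 go to the two output columns (CO1 and CO1+LDC); the RN/RT branch below writes the per-column scalars in %xmm0/%xmm1 instead. */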
movlpd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) #else movlpd %xmm0, 0 * SIZE(CO1) movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA,%eax, SIZE), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L50: movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L59 ALIGN_4 .L41: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movapd 0 * SIZE(AA), %xmm0 movapd 8 * SIZE(AA), %xmm1 movapd 0 * SIZE(BB), %xmm2 movapd 8 * SIZE(BB), %xmm3 #ifdef LN prefetchw -2 * SIZE(CO1) prefetchw -2 * SIZE(CO1, LDC) #else prefetchw 1 * SIZE(CO1) prefetchw 1 * SIZE(CO1, LDC) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L45 ALIGN_4 .L42: mulpd %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 10 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 mulpd 18 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 20 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movapd 10 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 22 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 32 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movapd 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 26 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 28 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 14 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 30 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 40 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd 0 * SIZE(B), %xmm2 movapd 2 * SIZE(B), %xmm3 subpd %xmm4, %xmm2 subpd %xmm0, 
%xmm3 #else movapd 0 * SIZE(AA), %xmm0 movapd 2 * SIZE(AA), %xmm1 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 #endif #ifdef LN movlpd 3 * SIZE(AA), %xmm4 movhpd 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 movlpd 2 * SIZE(AA), %xmm4 movhpd 2 * SIZE(AA), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 #endif #ifdef LT movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 movlpd 1 * SIZE(AA), %xmm4 movhpd 1 * SIZE(AA), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 movlpd 3 * SIZE(AA), %xmm4 movhpd 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 movlpd 1 * SIZE(B), %xmm4 movhpd 1 * SIZE(B), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movlpd 3 * SIZE(B), %xmm4 movhpd 3 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 #endif #ifdef RT movlpd 3 * SIZE(B), %xmm4 movhpd 3 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 movlpd 2 * SIZE(B), %xmm4 movhpd 2 * SIZE(B), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movlpd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movlpd %xmm2, 0 * SIZE(BB) movlpd %xmm2, 1 * SIZE(BB) movhpd %xmm2, 2 * SIZE(BB) movhpd %xmm2, 3 * SIZE(BB) movlpd %xmm3, 4 * SIZE(BB) movlpd %xmm3, 5 * SIZE(BB) movhpd %xmm3, 6 * SIZE(BB) movhpd %xmm3, 7 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) movapd %xmm1, 2 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(CO1) movlpd %xmm3, 1 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) #else movlpd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L41 ALIGN_4 .L59: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 2), B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif ALIGN_4 .L60: testl $1, N je .L999 #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG leal (, %eax, SIZE), %eax leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax jle .L65 ALIGN_4 .L62: #define COPYPREFETCH 40 prefetchnta (COPYPREFETCH) * SIZE(B) movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq 2 * SIZE(B), %mm2 movq 3 * SIZE(B), %mm3 movq 4 * SIZE(B), %mm4 movq 5 * SIZE(B), %mm5 movq 6 * SIZE(B), %mm6 movq 7 * SIZE(B), %mm7 movq %mm0, 0 * SIZE(BB) movq %mm0, 1 * SIZE(BB) movq %mm1, 2 * SIZE(BB) movq %mm1, 3 * SIZE(BB) movq %mm2, 4 * SIZE(BB) movq %mm2, 5 * SIZE(BB) movq %mm3, 6 * SIZE(BB) movq %mm3, 7 * SIZE(BB) movq %mm4, 8 * SIZE(BB) movq %mm4, 9 * SIZE(BB) movq %mm5, 10 * SIZE(BB) movq %mm5, 11 * SIZE(BB) movq %mm6, 12 * SIZE(BB) movq %mm6, 13 * SIZE(BB) movq %mm7, 14 * SIZE(BB) movq 
%mm7, 15 * SIZE(BB) addl $ 8 * SIZE, B addl $16 * SIZE, BB decl %eax jne .L62 ALIGN_2 .L65: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH jle .L70 ALIGN_2 .L66: movq 0 * SIZE(B), %mm0 movq %mm0, 0 * SIZE(BB) movq %mm0, 1 * SIZE(BB) addl $1 * SIZE, B addl $2 * SIZE, BB decl %eax jne .L66 ALIGN_4 .L70: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L80 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movlpd 0 * SIZE(AA), %xmm0 movlpd 4 * SIZE(AA), %xmm1 movlpd 0 * SIZE(BB), %xmm2 movlpd 8 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L85 ALIGN_4 .L82: mulsd %xmm0, %xmm2 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) movlpd 1 * SIZE(AA), %xmm0 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movlpd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movlpd 2 * SIZE(AA), %xmm0 mulsd 4 * SIZE(BB), %xmm0 addsd %xmm0, %xmm6 movlpd 3 * SIZE(AA), %xmm0 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 mulsd %xmm1, %xmm3 movlpd 5 * SIZE(AA), %xmm1 mulsd 10 * SIZE(BB), %xmm1 addsd %xmm3, %xmm4 movlpd 24 * SIZE(BB), %xmm3 addsd %xmm1, %xmm5 movlpd 6 * SIZE(AA), %xmm1 mulsd 12 * SIZE(BB), %xmm1 addsd %xmm1, %xmm6 movlpd 7 * SIZE(AA), %xmm1 mulsd 14 * SIZE(BB), %xmm1 addsd %xmm1, %xmm7 movlpd 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L82 ALIGN_4 .L85: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L88 .L86: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 movlpd 2 * SIZE(BB), %xmm2 movlpd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L86 ALIGN_4 .L88: addsd %xmm5, %xmm4 addsd %xmm7, %xmm6 addsd %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax addl %eax, AA addl %eax, B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movlpd 0 * SIZE(B), %xmm2 subsd %xmm4, %xmm2 #else movlpd 0 * SIZE(AA), %xmm0 subsd %xmm4, %xmm0 #endif #ifdef LN movlpd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 #endif #ifdef LT movlpd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef RT movlpd 0 * SIZE(B), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(B) movlpd %xmm2, 0 * SIZE(BB) movlpd %xmm2, 1 * SIZE(BB) #else movlpd %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(CO1) #else movlpd %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA,%eax, SIZE), AA #ifdef LT addl $1 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L80: movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L99 ALIGN_4 .L71: #ifdef LN movl K, %eax sall 
$1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movapd 0 * SIZE(AA), %xmm0 movapd 8 * SIZE(AA), %xmm1 movapd 0 * SIZE(BB), %xmm2 movapd 8 * SIZE(BB), %xmm3 #ifdef LN prefetchw -2 * SIZE(CO1) #else prefetchw 1 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L75 ALIGN_4 .L72: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) movapd 16 * SIZE(BB), %xmm2 movapd 2 * SIZE(AA), %xmm0 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm0, %xmm4 movapd 4 * SIZE(AA), %xmm0 mulpd 4 * SIZE(BB), %xmm0 addpd %xmm0, %xmm4 movapd 6 * SIZE(AA), %xmm0 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm0, %xmm4 movapd 16 * SIZE(AA), %xmm0 prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movapd 24 * SIZE(BB), %xmm3 movapd 10 * SIZE(AA), %xmm1 mulpd 10 * SIZE(BB), %xmm1 addpd %xmm1, %xmm4 movapd 12 * SIZE(AA), %xmm1 mulpd 12 * SIZE(BB), %xmm1 addpd %xmm1, %xmm4 movapd 14 * SIZE(AA), %xmm1 mulpd 14 * SIZE(BB), %xmm1 addpd %xmm1, %xmm4 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd 2 * SIZE(AA), %xmm0 movapd 2 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm2 subpd %xmm4, %xmm2 #else movapd 0 * SIZE(AA), %xmm0 subpd %xmm4, %xmm0 #endif #ifdef LN movapd %xmm2, %xmm3 unpckhpd %xmm3, %xmm3 movlpd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm3 movlpd 2 * SIZE(AA), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm2 movlpd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm3, %xmm2 #endif #ifdef LT movapd %xmm2, %xmm3 unpckhpd %xmm3, %xmm3 movlpd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movlpd 1 * SIZE(AA), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm3 movlpd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm3 unpcklpd %xmm3, %xmm2 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef RT movlpd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movlpd %xmm2, 0 * SIZE(BB) movlpd %xmm2, 1 * SIZE(BB) movhpd %xmm2, 2 * SIZE(BB) movhpd %xmm2, 3 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) #else movlpd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L71 ALIGN_4 .L99: #ifdef LN movl 
K, %eax leal (B, %eax, SIZE), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (B,%eax, SIZE), B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_LN_2x4_sse3.S000066400000000000000000001076141313527062700215560ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define ARG_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define ARG_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #ifdef PENTIUM4 #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #ifdef PENTIUMM #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define CO1 %esi PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC movl OFFSET, %eax #ifdef RN negl %eax #endif movl %eax, KK leal (, LDC, SIZE), LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax sarl $2, %eax movl %eax, J jle .L30 ALIGN_2 .L10: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 4), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L20 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movddup 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movapd 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movapd 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $4, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movddup 1 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movddup 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 10 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movddup 3 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movddup 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 18 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 20 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movddup 5 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 22 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 32 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movddup 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 26 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 28 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movddup 7 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 30 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 40 * SIZE(BB), %xmm3 addpd 
%xmm0, %xmm7 movddup 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd 34 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 36 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movddup 9 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 38 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 48 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movddup 10 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 42 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 44 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movddup 11 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 46 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 56 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movddup 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 50 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 52 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movddup 13 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 54 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 64 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movddup 14 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 58 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 60 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movddup 15 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 62 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 72 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movddup 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $15, %eax # if (k & 1) BRANCH je .L28 .L26: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movddup 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $4, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BB), %xmm0 movapd 2 * SIZE(BB), %xmm1 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 #else movapd 0 * SIZE(AA), %xmm1 movapd 2 * SIZE(AA), %xmm3 subpd %xmm4, %xmm1 subpd %xmm5, %xmm3 movapd %xmm1, %xmm0 unpckhpd %xmm1, %xmm1 movapd %xmm3, %xmm2 unpckhpd %xmm3, %xmm3 #endif #ifdef LN movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #ifdef LT movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #ifdef RN movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movsd 2 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm2 movsd 3 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm3 movsd 5 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd 6 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm2 movsd 7 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm3 movsd 10 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm2 movsd 11 * SIZE(BB), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm3 movsd 15 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm3 #endif #ifdef RT movsd 15 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm3 movsd 14 * SIZE(BB), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm2 movsd 13 * SIZE(BB), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm1 movsd 12 * SIZE(BB), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm0 movsd 10 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm2 movsd 9 * SIZE(BB), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm1 movsd 8 * SIZE(BB), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm0 movsd 5 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd 4 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm0, 0 * SIZE(BB) movapd %xmm1, 2 * SIZE(BB) #else 
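/* RN/RT path of the 1x4 block: the four solved scalars are written back into the packed A panel here, and then copied out to the four C columns (CO1, CO1+LDC, CO1+2*LDC, CO1+3*LDC) further below. */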
movsd %xmm0, 0 * SIZE(AA) movsd %xmm1, 1 * SIZE(AA) movsd %xmm2, 2 * SIZE(AA) movsd %xmm3, 3 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) movsd %xmm1, 0 * SIZE(CO1, LDC, 2) movhpd %xmm1, 0 * SIZE(CO1, %eax, 1) #else movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L20: movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L29 ALIGN_4 .L11: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movddup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movddup 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 leal (LDC, LDC, 2), %eax #ifdef LN prefetchnta -2 * SIZE(CO1) prefetchnta -2 * SIZE(CO1, LDC, 1) prefetchnta -2 * SIZE(CO1, LDC, 2) prefetchnta -2 * SIZE(CO1, %eax, 1) #else prefetchnta 2 * SIZE(CO1) prefetchnta 2 * SIZE(CO1, LDC, 1) prefetchnta 2 * SIZE(CO1, LDC, 2) prefetchnta 2 * SIZE(CO1, %eax, 1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L12: mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 5 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movddup 6 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 7 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 16 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm4 movddup 9 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm5 movddup 10 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm6 movddup 11 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 movapd 6 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movddup 12 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm4 movddup 13 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm5 movddup 14 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm6 movddup 15 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 movapd 16 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movddup 24 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movddup 17 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movddup 18 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm6 movddup 19 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 movapd 10 * SIZE(AA), %xmm1 addpd %xmm2, %xmm7 movddup 20 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movddup 21 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movddup 22 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm6 movddup 23 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 movapd 12 * SIZE(AA), %xmm1 addpd %xmm2, %xmm7 
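/* Last quarter of the 8-way unrolled k loop (.L12): the B values at offsets 24..31 are broadcast with movddup and accumulated against the A pair held in %xmm1, after which AA advances by 16 doubles and BB by 32 doubles per iteration. */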
movddup 32 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 25 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movddup 26 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 27 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 14 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movddup 28 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 29 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movddup 30 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 31 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 24 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movddup 40 * SIZE(BB), %xmm3 addl $32 * SIZE, BB addl $16 * SIZE, AA decl %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $4, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd %xmm6, %xmm1 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm1 movapd 0 * SIZE(BB), %xmm2 movapd 2 * SIZE(BB), %xmm5 movapd 4 * SIZE(BB), %xmm3 movapd 6 * SIZE(BB), %xmm7 subpd %xmm4, %xmm2 subpd %xmm6, %xmm5 subpd %xmm0, %xmm3 subpd %xmm1, %xmm7 #else movapd 0 * SIZE(AA), %xmm0 movapd 2 * SIZE(AA), %xmm1 movapd 4 * SIZE(AA), %xmm2 movapd 6 * SIZE(AA), %xmm3 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 subpd %xmm6, %xmm2 subpd %xmm7, %xmm3 #endif #ifdef LN movddup 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 mulpd %xmm4, %xmm7 movddup 2 * SIZE(AA), %xmm4 movapd %xmm4, %xmm6 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 mulpd %xmm7, %xmm6 subpd %xmm6, %xmm5 movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 #endif #ifdef LT movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 movddup 1 * SIZE(AA), %xmm4 movapd %xmm4, %xmm6 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 mulpd %xmm5, %xmm6 subpd %xmm6, %xmm7 movddup 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 mulpd %xmm4, %xmm7 #endif #ifdef RN movddup 0 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 movddup 1 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movddup 2 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm2 movddup 3 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm3 movddup 5 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 movddup 6 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm2 movddup 7 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm3 movddup 10 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm2 movddup 11 * SIZE(BB), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 movddup 15 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm3 #endif #ifdef RT movddup 15 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm3 movddup 14 * SIZE(BB), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 movddup 13 * SIZE(BB), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm1 movddup 12 * SIZE(BB), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm0 movddup 10 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm2 movddup 9 * SIZE(BB), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm1 movddup 8 * SIZE(BB), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm0 movddup 5 * SIZE(BB), %xmm4 mulpd %xmm4, 
%xmm1 movddup 4 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movddup 0 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(BB) movapd %xmm5, 2 * SIZE(BB) movapd %xmm3, 4 * SIZE(BB) movapd %xmm7, 6 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) movapd %xmm1, 2 * SIZE(AA) movapd %xmm2, 4 * SIZE(AA) movapd %xmm3, 6 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movsd %xmm3, 1 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) movsd %xmm5, 0 * SIZE(CO1, LDC, 2) movsd %xmm7, 1 * SIZE(CO1, LDC, 2) movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO1, %eax, 1) movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L11 ALIGN_4 .L29: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 4), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $4, KK #endif #ifdef RT subl $4, KK #endif decl J # j -- jg .L10 ALIGN_4 .L30: testl $2, N je .L60 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L50 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movddup 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movapd 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movapd 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $4, %eax je .L55 ALIGN_4 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 movddup 1 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 mulpd 2 * SIZE(BB), %xmm0 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movddup 2 * SIZE(AA), %xmm0 mulpd 4 * SIZE(BB), %xmm0 addpd %xmm0, %xmm6 movddup 3 * SIZE(AA), %xmm0 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm0, %xmm7 movddup 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movddup 5 * SIZE(AA), %xmm0 addpd %xmm3, %xmm4 mulpd 10 * SIZE(BB), %xmm0 movapd 24 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movddup 6 * SIZE(AA), %xmm0 mulpd 12 * SIZE(BB), %xmm0 addpd %xmm0, %xmm6 movddup 7 * SIZE(AA), %xmm0 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm0, %xmm7 movddup 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 movddup 9 * SIZE(AA), %xmm1 addpd %xmm2, %xmm4 mulpd 18 * SIZE(BB), %xmm1 movapd 32 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movddup 10 * SIZE(AA), %xmm1 mulpd 20 * SIZE(BB), %xmm1 addpd %xmm1, %xmm6 movddup 11 * SIZE(AA), %xmm1 mulpd 22 * SIZE(BB), %xmm1 addpd %xmm1, 
%xmm7 movddup 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 movddup 13 * SIZE(AA), %xmm1 addpd %xmm3, %xmm4 mulpd 26 * SIZE(BB), %xmm1 movapd 40 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movddup 14 * SIZE(AA), %xmm1 mulpd 28 * SIZE(BB), %xmm1 addpd %xmm1, %xmm6 movddup 15 * SIZE(AA), %xmm1 mulpd 30 * SIZE(BB), %xmm1 addpd %xmm1, %xmm7 movddup 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $15, %eax # if (k & 1) BRANCH je .L58 .L56: mulpd %xmm0, %xmm2 movddup 1 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 movapd 2 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 addpd %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax addl %eax, AA leal (B, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BB), %xmm0 subpd %xmm4, %xmm0 #else movapd 0 * SIZE(AA), %xmm1 subpd %xmm4, %xmm1 movapd %xmm1, %xmm0 unpckhpd %xmm1, %xmm1 #endif #ifdef LN movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef LT movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef RN movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movsd 3 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 #endif #ifdef RT movsd 3 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd 2 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm0, 0 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) movsd %xmm1, 1 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) #else movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L50: movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L59 ALIGN_4 .L41: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movddup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movddup 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifdef LN prefetchnta -2 * SIZE(CO1) prefetchnta -2 * SIZE(CO1, LDC, 1) #else prefetchnta 2 * SIZE(CO1) prefetchnta 2 * SIZE(CO1, LDC, 1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 5 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 6 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 
movddup 6 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 7 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 16 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 16 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 9 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 10 * SIZE(AA), %xmm1 addpd %xmm3, %xmm5 movddup 10 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 11 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 12 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movddup 12 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 13 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 14 * SIZE(AA), %xmm1 addpd %xmm3, %xmm5 movddup 14 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 15 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 24 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movddup 24 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd 0 * SIZE(BB), %xmm2 movapd 2 * SIZE(BB), %xmm3 subpd %xmm4, %xmm2 subpd %xmm0, %xmm3 #else movapd 0 * SIZE(AA), %xmm0 movapd 2 * SIZE(AA), %xmm1 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 #endif #ifdef LN movddup 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 movddup 2 * SIZE(AA), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 #endif #ifdef LT movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 movddup 1 * SIZE(AA), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 movddup 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 #endif #ifdef RN movddup 0 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 movddup 1 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movddup 3 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 #endif #ifdef RT movddup 3 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 movddup 2 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movddup 0 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(BB) movapd %xmm3, 2 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) movapd %xmm1, 2 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movsd %xmm3, 1 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L41 ALIGN_4 .L59: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif ALIGN_4 .L60: testl $1, N je .L999 
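/* N & 1 tail of trsm_kernel_LN_2x4_sse3.S: a single column of B/C remains. The same LN/LT/RN/RT solve pattern is repeated for one-column register blocks, first for the M & 1 remainder row and then for pairs of rows in the .L71 loop. */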
#if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L80 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movsd 0 * SIZE(AA), %xmm0 movhpd 1 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd 8 * SIZE(AA), %xmm1 movhpd 9 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movsd 0 * SIZE(BB), %xmm2 movhpd 1 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movsd 8 * SIZE(BB), %xmm3 movhpd 9 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $4, %eax je .L85 ALIGN_4 .L82: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 mulpd 2 * SIZE(BB), %xmm0 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 4 * SIZE(AA), %xmm0 mulpd 4 * SIZE(BB), %xmm0 addpd %xmm0, %xmm6 movapd 6 * SIZE(AA), %xmm0 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm3 movapd 10 * SIZE(AA), %xmm1 addpd %xmm3, %xmm4 mulpd 10 * SIZE(BB), %xmm1 movapd 24 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 12 * SIZE(AA), %xmm1 mulpd 12 * SIZE(BB), %xmm1 addpd %xmm1, %xmm6 movapd 14 * SIZE(AA), %xmm1 mulpd 14 * SIZE(BB), %xmm1 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L82 ALIGN_4 .L85: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $15, %eax # if (k & 1) BRANCH je .L88 .L86: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 1 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L86 ALIGN_4 .L88: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 addpd %xmm6, %xmm4 haddpd %xmm4, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax addl %eax, AA leal (B, %eax, 1), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BB), %xmm0 subsd %xmm4, %xmm0 #else movsd 0 * SIZE(AA), %xmm0 subsd %xmm4, %xmm0 #endif #ifdef LN movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef RN movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef RT movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) #else movsd %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax addl %eax, AA addl %eax, BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L80: movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L89 ALIGN_4 .L71: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl 
KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movddup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movddup 4 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifdef LN prefetchnta -2 * SIZE(CO1) #else prefetchnta 2 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm2, %xmm0 movddup 1 * SIZE(BB), %xmm2 addpd %xmm0, %xmm4 movapd 16 * SIZE(AA), %xmm0 mulpd 2 * SIZE(AA), %xmm2 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd 4 * SIZE(AA), %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd 6 * SIZE(AA), %xmm2 addpd %xmm2, %xmm7 movddup 8 * SIZE(BB), %xmm2 mulpd %xmm3, %xmm1 movddup 5 * SIZE(BB), %xmm3 addpd %xmm1, %xmm4 movapd 24 * SIZE(AA), %xmm1 mulpd 10 * SIZE(AA), %xmm3 addpd %xmm3, %xmm5 movddup 6 * SIZE(BB), %xmm3 mulpd 12 * SIZE(AA), %xmm3 addpd %xmm3, %xmm6 movddup 7 * SIZE(BB), %xmm3 mulpd 14 * SIZE(AA), %xmm3 addpd %xmm3, %xmm7 movddup 12 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $ 8 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: mulpd %xmm2, %xmm0 movddup 1 * SIZE(BB), %xmm2 addpd %xmm0, %xmm4 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 addpd %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BB), %xmm1 subpd %xmm4, %xmm1 movapd %xmm1, %xmm0 unpckhpd %xmm1, %xmm1 #else movapd 0 * SIZE(AA), %xmm0 subpd %xmm4, %xmm0 #endif #ifdef LN movsd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 movsd 2 * SIZE(AA), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(AA), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movsd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 #endif #ifdef RN movddup 0 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef RT movddup 0 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BB) movsd %xmm1, 1 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 1 * SIZE(CO1) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA addl %eax, BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L71 ALIGN_4 .L89: #ifdef LN movl K, %eax leal (B, %eax, SIZE), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_LN_4x2_core2.S000066400000000000000000001126651313527062700217150ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. 
                                                                     */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#if !defined(HAVE_SSE2) || !defined(HAVE_MMX)
#error You have to check your configuration.
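
/*
 * Note added by the editor (not part of the upstream OpenBLAS source):
 * trsm_kernel_LN_4x2_core2.S appears to be the 32-bit x86, double-precision
 * TRSM inner kernel with 4x2 register blocking, tuned for Core2-class CPUs
 * (SSE2/SSE3 movddup, explicit prefetching).  The LN/LT/RN/RT preprocessor
 * cases select among the four side/ordering variants of the triangular
 * solve requested by the level-3 driver, and KK/OFFSET track how much of
 * the k dimension has already been solved.  Roughly, each block first
 * accumulates A*B over the already-solved part, subtracts that from the
 * packed right-hand side, and then multiplies by the diagonal entries of
 * the triangular block; those diagonals appear to be stored pre-inverted,
 * since the kernel multiplies rather than divides.  This is a best-effort
 * summary inferred from the code, not an authoritative description.
 */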
#endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA 16 + STACK + ARGS(%esi) #define STACK_A 24 + STACK + ARGS(%esi) #define STACK_B 28 + STACK + ARGS(%esi) #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define AORIG 56(%esp) #define BORIG 60(%esp) #define BUFFER 128(%esp) #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #define B %edi #define AA %edx #define BB %ecx #define LDC %ebp #define CO1 %esi PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp addl $STACK_OFFSET, %esp STACK_TOUCHING movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 movd STACK_A, %mm2 movl STACK_B, B movd STACK_C, %mm3 movl STACK_LDC, LDC movd STACK_OFFT, %mm4 movd %mm1, K movl %eax, N movd %mm0, M movd %mm2, A movd %mm3, C movl %esi, OLD_STACK movd %mm4, OFFSET movd %mm4, KK subl $-16 * SIZE, A subl $-16 * SIZE, B sall $BASE_SHIFT, LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax sarl $1, %eax movl %eax, J jle .L100 ALIGN_2 .L01: /* Copying to Sub Buffer */ #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal 16 * SIZE + BUFFER, BB #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG leal (, %eax, SIZE), %eax leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L03 ALIGN_2 .L02: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -12 * SIZE(B), %xmm4 movddup -11 * SIZE(B), %xmm5 movddup -10 * SIZE(B), %xmm6 movddup -9 * SIZE(B), %xmm7 movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) movapd %xmm2, -12 * SIZE(BB) movapd %xmm3, -10 * SIZE(BB) movapd %xmm4, -8 * SIZE(BB) movapd %xmm5, -6 * SIZE(BB) movapd %xmm6, -4 * SIZE(BB) movapd %xmm7, -2 * SIZE(BB) addl $ 8 * SIZE, B addl $16 * SIZE, %ecx decl %eax jne .L02 ALIGN_2 .L03: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L05 ALIGN_4 .L04: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) addl $2 * SIZE, B addl $4 * SIZE, %ecx decl %eax jne .L04 ALIGN_4 .L05: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 # coffset = c #ifndef RT addl %eax, C #endif movl M, %ebx testl $1, %ebx jle .L30 #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA #endif leal 16 * SIZE + BUFFER, BB #if defined(LN) || 
defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movsd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movsd -12 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movsd -8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L52 .L51: mulsd %xmm0, %xmm1 mulsd -14 * SIZE(BB), %xmm0 addsd %xmm1, %xmm4 movsd -12 * SIZE(BB), %xmm1 addsd %xmm0, %xmm5 movsd -15 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm1 mulsd -10 * SIZE(BB), %xmm0 addsd %xmm1, %xmm6 movsd 0 * SIZE(BB), %xmm1 addsd %xmm0, %xmm7 movsd -14 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd -6 * SIZE(BB), %xmm0 addsd %xmm3, %xmm4 movsd -4 * SIZE(BB), %xmm3 addsd %xmm0, %xmm5 movsd -13 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd -2 * SIZE(BB), %xmm0 addsd %xmm3, %xmm6 movsd 8 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movsd -8 * SIZE(AA), %xmm0 mulsd %xmm2, %xmm1 mulsd 2 * SIZE(BB), %xmm2 addsd %xmm1, %xmm4 movsd 4 * SIZE(BB), %xmm1 addsd %xmm2, %xmm5 movsd -11 * SIZE(AA), %xmm2 mulsd %xmm2, %xmm1 mulsd 6 * SIZE(BB), %xmm2 addsd %xmm1, %xmm6 movsd 16 * SIZE(BB), %xmm1 addsd %xmm2, %xmm7 movsd -10 * SIZE(AA), %xmm2 mulsd %xmm2, %xmm3 mulsd 10 * SIZE(BB), %xmm2 addsd %xmm3, %xmm4 movsd 12 * SIZE(BB), %xmm3 addsd %xmm2, %xmm5 movsd -9 * SIZE(AA), %xmm2 mulsd %xmm2, %xmm3 mulsd 14 * SIZE(BB), %xmm2 addsd %xmm3, %xmm6 movsd 24 * SIZE(BB), %xmm3 addsd %xmm2, %xmm7 movsd -4 * SIZE(AA), %xmm2 subl $-8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L51 .L52: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L54 .L53: mulsd %xmm0, %xmm1 mulsd -14 * SIZE(BB), %xmm0 addsd %xmm1, %xmm4 movsd -12 * SIZE(BB), %xmm1 addsd %xmm0, %xmm5 movsd -15 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax BRANCH jg .L53 ALIGN_4 .L54: addsd %xmm6, %xmm4 addsd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(B), %xmm0 movsd -15 * SIZE(B), %xmm1 #else movsd -16 * SIZE(AA), %xmm0 movsd -15 * SIZE(AA), %xmm1 #endif subsd %xmm4, %xmm0 subsd %xmm5, %xmm1 #if defined(LN) || defined(LT) movsd -16 * SIZE(AA), %xmm2 mulsd %xmm2, %xmm0 mulsd %xmm2, %xmm1 #endif #ifdef RN mulsd -16 * SIZE(B), %xmm0 movsd -15 * SIZE(B), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 mulsd -13 * SIZE(B), %xmm1 #endif #ifdef RT mulsd -13 * SIZE(B), %xmm1 movsd -14 * SIZE(B), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 mulsd -16 * SIZE(B), %xmm0 #endif #ifdef LN subl $1 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC) #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, -16 * SIZE(B) movsd %xmm1, -15 * SIZE(B) movsd %xmm0, -16 * SIZE(BB) movsd %xmm0, -15 * SIZE(BB) movsd %xmm1, -14 * SIZE(BB) movsd %xmm1, -13 * SIZE(BB) #else movsd %xmm0, -16 * SIZE(AA) movsd %xmm1, -15 * SIZE(AA) #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L30: movl M, %ebx testl $2, %ebx jle .L50 #ifdef LN movl K, %eax sall $1 + 
BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal 16 * SIZE + BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movapd -8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L32 .L31: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BB), %xmm0 addpd %xmm1, %xmm4 movapd -12 * SIZE(BB), %xmm1 addpd %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm1 mulpd -10 * SIZE(BB), %xmm0 addpd %xmm1, %xmm6 movapd 0 * SIZE(BB), %xmm1 addpd %xmm0, %xmm7 movapd -12 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd -6 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd -4 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movapd -10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd -2 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 8 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movapd 0 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm1 mulpd 2 * SIZE(BB), %xmm2 addpd %xmm1, %xmm4 movapd 4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm5 movapd -6 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm1 mulpd 6 * SIZE(BB), %xmm2 addpd %xmm1, %xmm6 movapd 16 * SIZE(BB), %xmm1 addpd %xmm2, %xmm7 movapd -4 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm3 mulpd 10 * SIZE(BB), %xmm2 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm2, %xmm5 movapd -2 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm3 mulpd 14 * SIZE(BB), %xmm2 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm2, %xmm7 movapd 8 * SIZE(AA), %xmm2 subl $-16 * SIZE, AA addl $ 32 * SIZE, BB decl %eax jne .L31 .L32: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L34 .L33: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BB), %xmm0 addpd %xmm1, %xmm4 movapd -12 * SIZE(BB), %xmm1 addpd %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L33 ALIGN_4 .L34: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd -16 * SIZE(B), %xmm2 movapd -14 * SIZE(B), %xmm3 subpd %xmm4, %xmm2 subpd %xmm0, %xmm3 #else movapd -16 * SIZE(AA), %xmm0 movapd -14 * SIZE(AA), %xmm1 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 #endif #ifdef LN movddup -13 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movddup -14 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm2 movddup -16 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef LT movddup -16 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 movddup -15 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm3 movddup -13 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 #endif #ifdef RN movddup -16 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 movddup -15 * SIZE(B), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movddup -13 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 #endif #ifdef RT movddup -13 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 movddup -14 * SIZE(B), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movddup -16 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movsd %xmm3, 1 * SIZE(CO1) 
movhpd %xmm2, 0 * SIZE(CO1, LDC) movhpd %xmm3, 1 * SIZE(CO1, LDC) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC) movhpd %xmm1, 1 * SIZE(CO1, LDC) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movapd %xmm2, -16 * SIZE(B) movapd %xmm3, -14 * SIZE(B) movddup %xmm2, %xmm0 movddup %xmm3, %xmm1 unpckhpd %xmm2, %xmm2 unpckhpd %xmm3, %xmm3 movapd %xmm0, -16 * SIZE(BB) movapd %xmm2, -14 * SIZE(BB) movapd %xmm1, -12 * SIZE(BB) movapd %xmm3, -10 * SIZE(BB) #else movapd %xmm0, -16 * SIZE(AA) movapd %xmm1, -14 * SIZE(AA) #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L50: movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L99 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif leal 16 * SIZE + BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm3 pxor %xmm6, %xmm6 #ifdef LN prefetcht2 -3 * SIZE(CO1) pxor %xmm7, %xmm7 prefetcht2 -3 * SIZE(CO1, LDC) #else prefetcht2 3 * SIZE(CO1) pxor %xmm7, %xmm7 prefetcht2 3 * SIZE(CO1, LDC) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L12: movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 addpd %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd -12 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd -12 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd -10 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 addpd %xmm0, %xmm5 movapd -10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd 0 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd -8 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 addpd %xmm1, %xmm4 movapd -6 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movapd -6 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 mulpd %xmm3, %xmm1 movapd -4 * SIZE(AA), %xmm3 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd -4 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 addpd %xmm1, %xmm4 movapd -2 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movapd -2 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 mulpd %xmm3, %xmm1 movapd 8 * SIZE(AA), %xmm3 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd 0 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd 2 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd 4 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd 6 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 addpd %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd 16 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd 8 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 addpd %xmm1, %xmm4 movapd 10 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 addpd %xmm3, 
%xmm5 movapd 10 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 mulpd %xmm3, %xmm1 addpd %xmm2, %xmm6 movapd 12 * SIZE(AA), %xmm3 addpd %xmm1, %xmm7 movapd 12 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 addpd %xmm1, %xmm4 movapd 14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movapd 14 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 mulpd %xmm3, %xmm1 subl $-32 * SIZE, BB movapd 24 * SIZE(AA), %xmm3 subl $-32 * SIZE, AA addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd -16 * SIZE(BB), %xmm1 decl %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH je .L18 ALIGN_4 .L16: movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 addpd %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd -12 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd -12 * SIZE(BB), %xmm1 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd %xmm6, %xmm1 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm1 movapd -16 * SIZE(B), %xmm2 movapd -14 * SIZE(B), %xmm3 movapd -12 * SIZE(B), %xmm5 movapd -10 * SIZE(B), %xmm7 subpd %xmm4, %xmm2 subpd %xmm0, %xmm3 subpd %xmm6, %xmm5 subpd %xmm1, %xmm7 #else movapd -16 * SIZE(AA), %xmm0 movapd -14 * SIZE(AA), %xmm1 movapd -12 * SIZE(AA), %xmm2 movapd -10 * SIZE(AA), %xmm3 subpd %xmm4, %xmm0 subpd %xmm6, %xmm1 subpd %xmm5, %xmm2 subpd %xmm7, %xmm3 #endif #ifdef LN movddup -1 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm7 movddup -2 * SIZE(AA), %xmm0 mulpd %xmm7, %xmm0 subpd %xmm0, %xmm5 movddup -3 * SIZE(AA), %xmm0 mulpd %xmm7, %xmm0 subpd %xmm0, %xmm3 movddup -4 * SIZE(AA), %xmm0 mulpd %xmm7, %xmm0 subpd %xmm0, %xmm2 movddup -6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm5 movddup -7 * SIZE(AA), %xmm0 mulpd %xmm5, %xmm0 subpd %xmm0, %xmm3 movddup -8 * SIZE(AA), %xmm0 mulpd %xmm5, %xmm0 subpd %xmm0, %xmm2 movddup -11 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movddup -12 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm2 movddup -16 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef LT movddup -16 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 movddup -15 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm3 movddup -14 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm5 movddup -13 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm7 movddup -11 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movddup -10 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm5 movddup -9 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm7 movddup -6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm5 movddup -5 * SIZE(AA), %xmm0 mulpd %xmm5, %xmm0 subpd %xmm0, %xmm7 movddup -1 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm7 #endif #ifdef RN movddup -16 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 movddup -15 * SIZE(B), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm2 mulpd %xmm1, %xmm5 subpd %xmm5, %xmm3 movddup -13 * SIZE(B), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm3 #endif #ifdef RT movddup -13 * SIZE(B), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm3 movddup -14 * SIZE(B), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm0 mulpd %xmm3, %xmm5 subpd %xmm5, %xmm1 movddup -16 * SIZE(B), %xmm4 
mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movsd %xmm3, 1 * SIZE(CO1) movsd %xmm5, 2 * SIZE(CO1) movsd %xmm7, 3 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC) movhpd %xmm3, 1 * SIZE(CO1, LDC) movhpd %xmm5, 2 * SIZE(CO1, LDC) movhpd %xmm7, 3 * SIZE(CO1, LDC) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO1, LDC) movhpd %xmm2, 1 * SIZE(CO1, LDC) movsd %xmm3, 2 * SIZE(CO1, LDC) movhpd %xmm3, 3 * SIZE(CO1, LDC) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movapd %xmm2, -16 * SIZE(B) movapd %xmm3, -14 * SIZE(B) movapd %xmm5, -12 * SIZE(B) movapd %xmm7, -10 * SIZE(B) movddup %xmm2, %xmm0 movddup %xmm3, %xmm1 movddup %xmm5, %xmm4 movddup %xmm7, %xmm6 unpckhpd %xmm2, %xmm2 unpckhpd %xmm3, %xmm3 unpckhpd %xmm5, %xmm5 unpckhpd %xmm7, %xmm7 movapd %xmm0, -16 * SIZE(BB) movapd %xmm2, -14 * SIZE(BB) movapd %xmm1, -12 * SIZE(BB) movapd %xmm3, -10 * SIZE(BB) movapd %xmm4, -8 * SIZE(BB) movapd %xmm5, -6 * SIZE(BB) movapd %xmm6, -4 * SIZE(BB) movapd %xmm7, -2 * SIZE(BB) #else movapd %xmm0, -16 * SIZE(AA) movapd %xmm1, -14 * SIZE(AA) movapd %xmm2, -12 * SIZE(AA) movapd %xmm3, -10 * SIZE(AA) #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L10 ALIGN_2 .L99: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 2), B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L01 ALIGN_2 .L100: movl N, %eax testl $1, %eax jle .L999 ALIGN_2 .L101: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal 16 * SIZE + BUFFER, BB #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG leal (, %eax, SIZE), %eax leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax jle .L103 ALIGN_4 .L102: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -12 * SIZE(B), %xmm4 movddup -11 * SIZE(B), %xmm5 movddup -10 * SIZE(B), %xmm6 movddup -9 * SIZE(B), %xmm7 movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) movapd %xmm2, -12 * SIZE(BB) movapd %xmm3, -10 * SIZE(BB) movapd %xmm4, -8 * SIZE(BB) movapd %xmm5, -6 * SIZE(BB) movapd %xmm6, -4 * SIZE(BB) movapd %xmm7, -2 * SIZE(BB) addl $ 8 * SIZE, B addl $16 * SIZE, %ecx decl %eax BRANCH jne .L102 ALIGN_2 .L103: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH jle .L105 ALIGN_2 .L104: movddup -16 * SIZE(B), %xmm0 movapd %xmm0, -16 * SIZE(BB) addl $1 * SIZE, B addl $2 * SIZE, BB decl %eax jne .L104 ALIGN_4 .L105: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 # coffset = c #ifndef RT addl LDC, C #endif movl M, %ebx testl $1, %ebx jle .L130 #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif 
#if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA #endif leal 16 * SIZE + BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movsd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movsd -8 * SIZE(BB), %xmm3 movsd -12 * SIZE(AA), %xmm2 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L152 .L151: mulsd %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 addsd %xmm1, %xmm4 movsd -14 * SIZE(BB), %xmm1 mulsd %xmm0, %xmm1 movsd -14 * SIZE(AA), %xmm0 addsd %xmm1, %xmm5 movsd -12 * SIZE(BB), %xmm1 mulsd %xmm0, %xmm1 movsd -13 * SIZE(AA), %xmm0 addsd %xmm1, %xmm4 movsd -10 * SIZE(BB), %xmm1 mulsd %xmm0, %xmm1 movsd -8 * SIZE(AA), %xmm0 addsd %xmm1, %xmm5 movsd -0 * SIZE(BB), %xmm1 mulsd %xmm2, %xmm3 movsd -11 * SIZE(AA), %xmm2 addsd %xmm3, %xmm4 movsd -6 * SIZE(BB), %xmm3 mulsd %xmm2, %xmm3 movsd -10 * SIZE(AA), %xmm2 addsd %xmm3, %xmm5 movsd -4 * SIZE(BB), %xmm3 mulsd %xmm2, %xmm3 movsd -9 * SIZE(AA), %xmm2 addsd %xmm3, %xmm4 movsd -2 * SIZE(BB), %xmm3 mulsd %xmm2, %xmm3 movsd -4 * SIZE(AA), %xmm2 addsd %xmm3, %xmm5 movsd 8 * SIZE(BB), %xmm3 subl $ -8 * SIZE, AA subl $-16 * SIZE, BB BRANCH decl %eax jne .L151 .L152: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L154 .L153: mulsd %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 addsd %xmm1, %xmm4 movsd -14 * SIZE(BB), %xmm1 addl $1 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L153 ALIGN_4 .L154: addsd %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(B), %xmm0 #else movsd -16 * SIZE(AA), %xmm0 #endif subsd %xmm4, %xmm0 #if defined(LN) || defined(LT) mulsd -16 * SIZE(AA), %xmm0 #endif #if defined(RN) || defined(RT) mulsd -16 * SIZE(B), %xmm0 #endif #ifdef LN subl $1 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, -16 * SIZE(B) movsd %xmm0, -16 * SIZE(BB) movsd %xmm0, -15 * SIZE(BB) #else movsd %xmm0, -16 * SIZE(AA) #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA #ifdef LT addl $1 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L130: movl M, %ebx testl $2, %ebx jle .L150 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal 16 * SIZE + BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm2 movapd -8 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L132 .L131: mulpd %xmm0, %xmm1 movapd -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm1 movapd -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movapd -12 * SIZE(BB), %xmm1 mulpd 
%xmm0, %xmm1 movapd -10 * SIZE(AA), %xmm0 addpd %xmm1, %xmm4 movapd -10 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm1 movapd 0 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movapd 0 * SIZE(BB), %xmm1 mulpd %xmm2, %xmm3 movapd -6 * SIZE(AA), %xmm2 addpd %xmm3, %xmm4 movapd -6 * SIZE(BB), %xmm3 mulpd %xmm2, %xmm3 movapd -4 * SIZE(AA), %xmm2 addpd %xmm3, %xmm5 movapd -4 * SIZE(BB), %xmm3 mulpd %xmm2, %xmm3 movapd -2 * SIZE(AA), %xmm2 addpd %xmm3, %xmm4 movapd -2 * SIZE(BB), %xmm3 mulpd %xmm2, %xmm3 movapd 8 * SIZE(AA), %xmm2 addpd %xmm3, %xmm5 movapd 8 * SIZE(BB), %xmm3 subl $-16 * SIZE, AA subl $-16 * SIZE, BB BRANCH decl %eax jne .L131 .L132: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L134 .L133: mulpd %xmm0, %xmm1 movapd -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L133 ALIGN_4 .L134: addpd %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm0 #else movapd -16 * SIZE(AA), %xmm0 #endif subpd %xmm4, %xmm0 #ifdef LN movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movsd -13 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd -14 * SIZE(AA), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 unpcklpd %xmm2, %xmm0 #endif #ifdef LT movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd -15 * SIZE(AA), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd -13 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm2, %xmm0 #endif #if defined(RN) || defined(RT) movddup -16 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef LN subl $2 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(B) movddup %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 movapd %xmm1, -16 * SIZE(BB) movapd %xmm0, -14 * SIZE(BB) #else movapd %xmm0, -16 * SIZE(AA) #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L150: movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L159 ALIGN_4 .L110: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif leal 16 * SIZE + BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movapd -8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L112 .L111: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AA), %xmm1 addpd %xmm0, %xmm4 movapd -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm6 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 mulpd -10 * SIZE(AA), %xmm1 addpd %xmm0, %xmm5 movapd 0 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movapd -12 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm2 mulpd -6 * 
SIZE(AA), %xmm1 addpd %xmm2, %xmm4 movapd -4 * SIZE(AA), %xmm2 addpd %xmm1, %xmm6 movapd -10 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm2 mulpd -2 * SIZE(AA), %xmm1 addpd %xmm2, %xmm5 movapd 8 * SIZE(AA), %xmm2 addpd %xmm1, %xmm7 movapd 0 * SIZE(BB), %xmm1 mulpd %xmm3, %xmm0 mulpd 2 * SIZE(AA), %xmm3 addpd %xmm0, %xmm4 movapd 4 * SIZE(AA), %xmm0 addpd %xmm3, %xmm6 movapd -6 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm0 mulpd 6 * SIZE(AA), %xmm3 addpd %xmm0, %xmm5 movapd 16 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movapd -4 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm2 mulpd 10 * SIZE(AA), %xmm3 addpd %xmm2, %xmm4 movapd 12 * SIZE(AA), %xmm2 addpd %xmm3, %xmm6 movapd -2 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm2 mulpd 14 * SIZE(AA), %xmm3 addpd %xmm2, %xmm5 movapd 24 * SIZE(AA), %xmm2 addpd %xmm3, %xmm7 movapd 8 * SIZE(BB), %xmm3 addl $ 32 * SIZE, AA subl $-16 * SIZE, BB decl %eax jne .L111 .L112: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L114 .L113: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AA), %xmm1 addpd %xmm0, %xmm4 movapd -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm6 movapd -14 * SIZE(BB), %xmm1 addl $4 * SIZE, AA addl $2 * SIZE, BB subl $1, %eax jg .L113 ALIGN_4 .L114: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm0 movapd -14 * SIZE(B), %xmm1 #else movapd -16 * SIZE(AA), %xmm0 movapd -14 * SIZE(AA), %xmm1 #endif subpd %xmm4, %xmm0 subpd %xmm6, %xmm1 #ifdef LN movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movapd %xmm1, %xmm3 unpckhpd %xmm3, %xmm3 movsd -1 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm3 movsd -2 * SIZE(AA), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm1 movsd -3 * SIZE(AA), %xmm6 mulsd %xmm3, %xmm6 subsd %xmm6, %xmm2 movsd -4 * SIZE(AA), %xmm7 mulsd %xmm3, %xmm7 subsd %xmm7, %xmm0 movsd -6 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 movsd -7 * SIZE(AA), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm2 movsd -8 * SIZE(AA), %xmm6 mulsd %xmm1, %xmm6 subsd %xmm6, %xmm0 movsd -11 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd -12 * SIZE(AA), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 unpcklpd %xmm2, %xmm0 unpcklpd %xmm3, %xmm1 #endif #ifdef LT movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movapd %xmm1, %xmm3 unpckhpd %xmm3, %xmm3 movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd -15 * SIZE(AA), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd -14 * SIZE(AA), %xmm6 mulsd %xmm0, %xmm6 subsd %xmm6, %xmm1 movsd -13 * SIZE(AA), %xmm7 mulsd %xmm0, %xmm7 subsd %xmm7, %xmm3 movsd -11 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd -10 * SIZE(AA), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm1 movsd -9 * SIZE(AA), %xmm6 mulsd %xmm2, %xmm6 subsd %xmm6, %xmm3 movsd -6 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 movsd -5 * SIZE(AA), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm3 movsd -1 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm3 unpcklpd %xmm2, %xmm0 unpcklpd %xmm3, %xmm1 #endif #if defined(RN) || defined(RT) movddup -16 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #ifdef LN subl $4 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(B) movapd %xmm1, -14 * SIZE(B) movddup %xmm0, 
%xmm2 movddup %xmm1, %xmm3 unpckhpd %xmm0, %xmm0 unpckhpd %xmm1, %xmm1 movapd %xmm2, -16 * SIZE(BB) movapd %xmm0, -14 * SIZE(BB) movapd %xmm3, -12 * SIZE(BB) movapd %xmm1, -10 * SIZE(BB) #else movapd %xmm0, -16 * SIZE(AA) movapd %xmm1, -14 * SIZE(AA) #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif BRANCH decl %ebx # i -- jg .L110 ALIGN_2 .L159: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 1), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 1), B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_2 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_2 EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_LN_4x2_sse2.S000066400000000000000000001241751313527062700215560ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if !defined(HAVE_SSE2) || !defined(HAVE_MMX) #error You have to check your configuration. 
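
/*
 * Note added by the editor (not part of the upstream OpenBLAS source):
 * trsm_kernel_LN_4x2_sse2.S looks like the plain-SSE2 counterpart of the
 * Core2 kernel above: the same 4x2 double-precision TRSM inner kernel for
 * 32-bit x86, but here the B panel is first expanded into a duplicated
 * on-stack BUFFER, the main loop is unrolled through the KERNEL1..KERNEL8
 * macros, and an extra software-pipelined path is provided under
 * "#ifdef PENTIUM4".  Again a best-effort summary inferred from the code,
 * not an authoritative description.
 */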
#endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA 16 + STACK + ARGS(%esi) #define STACK_A 24 + STACK + ARGS(%esi) #define STACK_B 28 + STACK + ARGS(%esi) #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define AORIG 56(%esp) #define BORIG 60(%esp) #define BUFFER 128(%esp) #define B %edi #define LDC %ebp #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #define AA %edx #define BB %ecx #define PREFETCHSIZE (8 * 4) #define KERNEL1(address) \ movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm0, %xmm2; \ mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL3(address) \ movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm1, %xmm3; \ mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL4(address) \ mulpd %xmm1, %xmm3; \ mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL5(address) \ movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm0, %xmm2; \ mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL6(address) \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, 
%xmm2; \ mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL7(address) \ movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm1, %xmm3; \ mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulpd %xmm1, %xmm3; \ mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp addl $STACK_OFFSET, %esp STACK_TOUCHING movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 movd STACK_A, %mm2 movl STACK_B, B movd STACK_C, %mm3 movl STACK_LDC, LDC movd STACK_OFFT, %mm4 movd %mm1, K movl %eax, N movd %mm0, M movd %mm2, A movd %mm3, C movl %esi, OLD_STACK movd %mm4, OFFSET movd %mm4, KK sall $BASE_SHIFT, LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax sarl $1, %eax # j = (n >> 1) movl %eax, J jle .L100 ALIGN_2 .L01: /* Copying to Sub Buffer */ #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, %ecx #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG leal (, %eax, SIZE), %eax leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L03 ALIGN_2 .L02: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 unpcklpd %xmm2, %xmm2 unpcklpd %xmm3, %xmm3 unpcklpd %xmm4, %xmm4 unpcklpd %xmm5, %xmm5 unpcklpd %xmm6, %xmm6 unpcklpd %xmm7, %xmm7 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) movapd %xmm2, 4 * SIZE(%ecx) movapd %xmm3, 6 * SIZE(%ecx) movapd %xmm4, 8 * SIZE(%ecx) movapd %xmm5, 10 * SIZE(%ecx) movapd %xmm6, 12 * SIZE(%ecx) movapd %xmm7, 14 * SIZE(%ecx) prefetcht0 104 * SIZE(B) addl $ 8 * SIZE, B addl $16 * SIZE, %ecx decl %eax jne .L02 ALIGN_2 .L03: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L05 ALIGN_4 .L04: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) addl $2 * 
SIZE, B addl $4 * SIZE, %ecx decl %eax jne .L04 ALIGN_4 .L05: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, %esi # coffset = c #ifndef RT addl %eax, C #endif movl M, %ebx testl $1, %ebx jle .L30 #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA #endif leal BUFFER, %ecx #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movsd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movsd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movsd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movsd 4 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L52 .L51: mulsd %xmm0, %xmm2 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movsd 1 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm2 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movsd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movsd 2 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd 10 * SIZE(BB), %xmm0 addsd %xmm3, %xmm4 movsd 12 * SIZE(BB), %xmm3 addsd %xmm0, %xmm5 movsd 3 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd 14 * SIZE(BB), %xmm0 addsd %xmm3, %xmm4 movsd 24 * SIZE(BB), %xmm3 addsd %xmm0, %xmm5 movsd 8 * SIZE(AA), %xmm0 mulsd %xmm1, %xmm2 mulsd 18 * SIZE(BB), %xmm1 addsd %xmm2, %xmm4 movsd 20 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 movsd 5 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm2 mulsd 22 * SIZE(BB), %xmm1 addsd %xmm2, %xmm4 movsd 32 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 movsd 6 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 mulsd 26 * SIZE(BB), %xmm1 addsd %xmm3, %xmm4 movsd 28 * SIZE(BB), %xmm3 addsd %xmm1, %xmm5 movsd 7 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 mulsd 30 * SIZE(BB), %xmm1 addsd %xmm3, %xmm4 movsd 40 * SIZE(BB), %xmm3 addsd %xmm1, %xmm5 movsd 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB BRANCH decl %eax jne .L51 .L52: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L54 .L53: mulsd %xmm0, %xmm2 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movsd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA # aoffset += 8 addl $4 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L53 ALIGN_4 .L54: addsd %xmm6, %xmm4 addsd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 #else movsd 0 * SIZE(AA), %xmm0 movsd 1 * SIZE(AA), %xmm1 #endif subsd %xmm4, %xmm0 subsd %xmm5, %xmm1 #if defined(LN) || defined(LT) movsd 0 * SIZE(AA), %xmm2 mulsd %xmm2, %xmm0 mulsd %xmm2, %xmm1 #endif #ifdef RN mulsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 mulsd 3 * SIZE(B), %xmm1 #endif #ifdef RT mulsd 3 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 mulsd 0 * SIZE(B), %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(B) movsd %xmm1, 1 * SIZE(B) movsd %xmm0, 0 * SIZE(BB) movsd %xmm0, 1 * SIZE(BB) movsd %xmm1, 2 * SIZE(BB) movsd %xmm1, 3 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) movsd %xmm1, 1 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, %esi #endif movsd 
%xmm0, 0 * SIZE(%esi) movsd %xmm1, 0 * SIZE(%esi, LDC) #ifndef LN addl $1 * SIZE, %esi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L30: movl M, %ebx testl $2, %ebx jle .L50 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L32 .L31: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 10 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd 18 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 20 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movapd 10 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 22 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 32 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movapd 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 26 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 28 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 14 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 30 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 40 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $32 * SIZE, BB BRANCH decl %eax jne .L31 .L32: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L34 .L33: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA # aoffset += 8 addl $4 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L33 ALIGN_4 .L34: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd 0 * SIZE(B), %xmm2 movapd 2 * SIZE(B), %xmm3 subpd %xmm4, %xmm2 subpd %xmm0, %xmm3 #else movapd 0 * SIZE(AA), %xmm0 movapd 2 * SIZE(AA), %xmm1 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 #endif #ifdef LN movsd 3 * SIZE(AA), %xmm0 movhpd 3 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movsd 2 * SIZE(AA), %xmm0 movhpd 2 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm2 movsd 0 * SIZE(AA), %xmm0 movhpd 0 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm0 movhpd 0 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 movhpd 
1 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm3 movsd 3 * SIZE(AA), %xmm0 movhpd 3 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 #endif #ifdef RN movsd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 movsd 1 * SIZE(B), %xmm4 movhpd 1 * SIZE(B), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movsd 3 * SIZE(B), %xmm4 movhpd 3 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 #endif #ifdef RT movsd 3 * SIZE(B), %xmm4 movhpd 3 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 movsd 2 * SIZE(B), %xmm4 movhpd 2 * SIZE(B), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movsd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movsd %xmm2, 0 * SIZE(BB) movsd %xmm2, 1 * SIZE(BB) movhpd %xmm2, 2 * SIZE(BB) movhpd %xmm2, 3 * SIZE(BB) movsd %xmm3, 4 * SIZE(BB) movsd %xmm3, 5 * SIZE(BB) movhpd %xmm3, 6 * SIZE(BB) movhpd %xmm3, 7 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) movapd %xmm1, 2 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, %esi #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(%esi) movsd %xmm3, 1 * SIZE(%esi) movhpd %xmm2, 0 * SIZE(%esi, LDC) movhpd %xmm3, 1 * SIZE(%esi, LDC) #else movsd %xmm0, 0 * SIZE(%esi) movhpd %xmm0, 1 * SIZE(%esi) movsd %xmm1, 0 * SIZE(%esi, LDC) movhpd %xmm1, 1 * SIZE(%esi, LDC) #endif #ifndef LN addl $2 * SIZE, %esi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L50: movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L99 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #ifdef LN prefetcht2 -4 * SIZE(%esi) prefetcht2 -4 * SIZE(%esi, LDC) #else prefetcht2 4 * SIZE(%esi) prefetcht2 4 * SIZE(%esi, LDC) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif #ifdef PENTIUM4 andl $-8, %eax NOBRANCH je .L12 sall $3, %eax .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) cmpl $64 * 1, %eax NOBRANCH jle .L11 KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) cmpl $64 * 2, %eax NOBRANCH jle .L11 KERNEL1(32 * 2) KERNEL2(32 * 2) KERNEL3(32 * 2) KERNEL4(32 * 2) KERNEL5(32 * 2) KERNEL6(32 * 2) KERNEL7(32 * 2) KERNEL8(32 * 2) cmpl $64 * 3, %eax NOBRANCH jle .L11 KERNEL1(32 * 3) KERNEL2(32 * 3) KERNEL3(32 * 3) KERNEL4(32 * 3) KERNEL5(32 * 3) KERNEL6(32 * 3) KERNEL7(32 * 3) KERNEL8(32 * 3) cmpl $64 * 4, %eax NOBRANCH jle .L11 KERNEL1(32 * 4) KERNEL2(32 * 4) KERNEL3(32 * 4) KERNEL4(32 * 4) KERNEL5(32 * 4) KERNEL6(32 * 4) KERNEL7(32 * 4) KERNEL8(32 * 4) cmpl $64 * 5, %eax NOBRANCH jle .L11 KERNEL1(32 * 5) KERNEL2(32 * 5) KERNEL3(32 * 5) KERNEL4(32 * 5) KERNEL5(32 * 5) KERNEL6(32 * 5) KERNEL7(32 * 5) KERNEL8(32 * 5) cmpl $64 * 6, %eax NOBRANCH jle .L11 KERNEL1(32 * 6) KERNEL2(32 * 6) 
KERNEL3(32 * 6) KERNEL4(32 * 6) KERNEL5(32 * 6) KERNEL6(32 * 6) KERNEL7(32 * 6) KERNEL8(32 * 6) cmpl $64 * 7, %eax NOBRANCH jle .L11 KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) addl $64 * 4 * SIZE, AA addl $64 * 4 * SIZE, BB subl $64 * 8, %eax BRANCH jg .L1X .L11: leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #else sarl $3, %eax je .L12 .L11: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) addl $32 * SIZE, %ecx addl $32 * SIZE, %edx decl %eax jne .L11 #endif .L12: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 .L13: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 0 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA # aoffset += 8 addl $4 * SIZE, BB # boffset1 += 8 subl $1, %eax jg .L13 ALIGN_4 .L14: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd %xmm6, %xmm1 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm1 movapd 0 * SIZE(B), %xmm2 movapd 2 * SIZE(B), %xmm3 movapd 4 * SIZE(B), %xmm5 movapd 6 * SIZE(B), %xmm7 subpd %xmm4, %xmm2 subpd %xmm0, %xmm3 subpd %xmm6, %xmm5 subpd %xmm1, %xmm7 #else movapd 0 * SIZE(AA), %xmm0 movapd 2 * SIZE(AA), %xmm1 movapd 4 * SIZE(AA), %xmm2 movapd 6 * SIZE(AA), %xmm3 subpd %xmm4, %xmm0 subpd %xmm6, %xmm1 subpd %xmm5, %xmm2 subpd %xmm7, %xmm3 #endif #ifdef LN movsd 15 * SIZE(AA), %xmm0 movhpd 15 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm7 movsd 14 * SIZE(AA), %xmm0 movhpd 14 * SIZE(AA), %xmm0 mulpd %xmm7, %xmm0 subpd %xmm0, %xmm5 movsd 13 * SIZE(AA), %xmm0 movhpd 13 * SIZE(AA), %xmm0 mulpd %xmm7, %xmm0 subpd %xmm0, %xmm3 movsd 12 * SIZE(AA), %xmm0 movhpd 12 * SIZE(AA), %xmm0 mulpd %xmm7, %xmm0 subpd %xmm0, %xmm2 movsd 10 * SIZE(AA), %xmm0 movhpd 10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm5 movsd 9 * SIZE(AA), %xmm0 movhpd 9 * SIZE(AA), %xmm0 mulpd %xmm5, %xmm0 subpd %xmm0, %xmm3 movsd 8 * SIZE(AA), %xmm0 movhpd 8 * SIZE(AA), %xmm0 mulpd %xmm5, %xmm0 subpd %xmm0, %xmm2 movsd 5 * SIZE(AA), %xmm0 movhpd 5 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 movhpd 4 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm2 movsd 0 * SIZE(AA), %xmm0 movhpd 0 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm0 movhpd 0 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 movhpd 1 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm3 movsd 2 * SIZE(AA), %xmm0 movhpd 2 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm5 movsd 3 * SIZE(AA), %xmm0 movhpd 3 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm7 movsd 5 * SIZE(AA), %xmm0 movhpd 5 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movsd 6 * SIZE(AA), %xmm0 movhpd 6 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm5 movsd 7 * SIZE(AA), %xmm0 movhpd 7 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm7 movsd 10 * SIZE(AA), %xmm0 movhpd 10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm5 movsd 11 * SIZE(AA), %xmm0 movhpd 11 * SIZE(AA), %xmm0 mulpd %xmm5, %xmm0 subpd %xmm0, 
%xmm7 movsd 15 * SIZE(AA), %xmm0 movhpd 15 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm7 #endif #ifdef RN movsd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 movsd 1 * SIZE(B), %xmm4 movhpd 1 * SIZE(B), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm2 movsd 1 * SIZE(B), %xmm4 movhpd 1 * SIZE(B), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm3 movsd 3 * SIZE(B), %xmm4 movhpd 3 * SIZE(B), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm3 #endif #ifdef RT movsd 3 * SIZE(B), %xmm4 movhpd 3 * SIZE(B), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm3 movsd 2 * SIZE(B), %xmm4 movhpd 2 * SIZE(B), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm0 movsd 2 * SIZE(B), %xmm4 movhpd 2 * SIZE(B), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm1 movsd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movapd %xmm5, 4 * SIZE(B) movapd %xmm7, 6 * SIZE(B) movsd %xmm2, 0 * SIZE(BB) movsd %xmm2, 1 * SIZE(BB) movhpd %xmm2, 2 * SIZE(BB) movhpd %xmm2, 3 * SIZE(BB) movsd %xmm3, 4 * SIZE(BB) movsd %xmm3, 5 * SIZE(BB) movhpd %xmm3, 6 * SIZE(BB) movhpd %xmm3, 7 * SIZE(BB) movsd %xmm5, 8 * SIZE(BB) movsd %xmm5, 9 * SIZE(BB) movhpd %xmm5, 10 * SIZE(BB) movhpd %xmm5, 11 * SIZE(BB) movsd %xmm7, 12 * SIZE(BB) movsd %xmm7, 13 * SIZE(BB) movhpd %xmm7, 14 * SIZE(BB) movhpd %xmm7, 15 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) movapd %xmm1, 2 * SIZE(AA) movapd %xmm2, 4 * SIZE(AA) movapd %xmm3, 6 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, %esi #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(%esi) movsd %xmm3, 1 * SIZE(%esi) movsd %xmm5, 2 * SIZE(%esi) movsd %xmm7, 3 * SIZE(%esi) movhpd %xmm2, 0 * SIZE(%esi, LDC) movhpd %xmm3, 1 * SIZE(%esi, LDC) movhpd %xmm5, 2 * SIZE(%esi, LDC) movhpd %xmm7, 3 * SIZE(%esi, LDC) #else movsd %xmm0, 0 * SIZE(%esi) movhpd %xmm0, 1 * SIZE(%esi) movsd %xmm1, 2 * SIZE(%esi) movhpd %xmm1, 3 * SIZE(%esi) movsd %xmm2, 0 * SIZE(%esi, LDC) movhpd %xmm2, 1 * SIZE(%esi, LDC) movsd %xmm3, 2 * SIZE(%esi, LDC) movhpd %xmm3, 3 * SIZE(%esi, LDC) #endif #ifndef LN addl $4 * SIZE, %esi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L10 ALIGN_2 .L99: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 2), B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L01 ALIGN_2 .L100: movl N, %eax testl $1, %eax jle .L999 ALIGN_2 .L101: /* Copying to Sub Buffer */ #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, %ecx #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG leal (, %eax, SIZE), %eax leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax jle .L103 ALIGN_4 .L102: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 
unpcklpd %xmm2, %xmm2 unpcklpd %xmm3, %xmm3 unpcklpd %xmm4, %xmm4 unpcklpd %xmm5, %xmm5 unpcklpd %xmm6, %xmm6 unpcklpd %xmm7, %xmm7 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) movapd %xmm2, 4 * SIZE(%ecx) movapd %xmm3, 6 * SIZE(%ecx) movapd %xmm4, 8 * SIZE(%ecx) movapd %xmm5, 10 * SIZE(%ecx) movapd %xmm6, 12 * SIZE(%ecx) movapd %xmm7, 14 * SIZE(%ecx) prefetcht0 104 * SIZE(B) addl $ 8 * SIZE, B addl $16 * SIZE, %ecx decl %eax BRANCH jne .L102 ALIGN_2 .L103: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH jle .L105 ALIGN_2 .L104: movsd 0 * SIZE(B), %xmm0 unpcklpd %xmm0, %xmm0 movapd %xmm0, 0 * SIZE(%ecx) addl $1 * SIZE, B addl $2 * SIZE, %ecx decl %eax jne .L104 ALIGN_4 .L105: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, %esi # coffset = c #ifndef RT addl LDC, C #endif movl M, %ebx testl $1, %ebx jle .L130 #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA #endif leal BUFFER, BB movsd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movsd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movsd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movsd 4 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #ifdef LN prefetcht2 -4 * SIZE(%esi) #else prefetcht2 4 * SIZE(%esi) #endif #if defined(LN) || defined(RT) movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L152 .L151: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 mulsd 2 * SIZE(BB), %xmm0 movsd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 mulsd 4 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 movsd 3 * SIZE(AA), %xmm0 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 movsd 8 * SIZE(AA), %xmm0 mulsd %xmm1, %xmm3 movsd 5 * SIZE(AA), %xmm1 addsd %xmm3, %xmm4 mulsd 10 * SIZE(BB), %xmm1 movsd 24 * SIZE(BB), %xmm3 addsd %xmm1, %xmm4 movsd 6 * SIZE(AA), %xmm1 mulsd 12 * SIZE(BB), %xmm1 addsd %xmm1, %xmm4 movsd 7 * SIZE(AA), %xmm1 mulsd 14 * SIZE(BB), %xmm1 addsd %xmm1, %xmm4 movsd 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $16 * SIZE, BB BRANCH decl %eax jne .L151 .L152: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L154 .L153: movsd 0 * SIZE(AA), %xmm0 mulsd 0 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 addl $1 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L153 ALIGN_4 .L154: addsd %xmm6, %xmm4 addsd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm0 #else movsd 0 * SIZE(AA), %xmm0 #endif subsd %xmm4, %xmm0 #if defined(LN) || defined(LT) mulsd 0 * SIZE(AA), %xmm0 #endif #if defined(RN) || defined(RT) mulsd 0 * SIZE(B), %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(B) movsd %xmm0, 0 * SIZE(BB) movsd %xmm0, 1 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, %esi #endif movsd %xmm0, 0 * SIZE(%esi) #ifndef LN addl $1 * SIZE, %esi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA #ifdef LT addl $1 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl 
BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L130: movl M, %ebx testl $2, %ebx jle .L150 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #if defined(LN) || defined(RT) movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L132 .L131: mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 mulpd 2 * SIZE(BB), %xmm0 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 4 * SIZE(AA), %xmm0 mulpd 4 * SIZE(BB), %xmm0 addpd %xmm0, %xmm6 movapd 6 * SIZE(AA), %xmm0 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm3 movapd 10 * SIZE(AA), %xmm1 addpd %xmm3, %xmm4 mulpd 10 * SIZE(BB), %xmm1 movapd 24 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 12 * SIZE(AA), %xmm1 mulpd 12 * SIZE(BB), %xmm1 addpd %xmm1, %xmm6 movapd 14 * SIZE(AA), %xmm1 mulpd 14 * SIZE(BB), %xmm1 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $16 * SIZE, BB BRANCH decl %eax jne .L131 .L132: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L134 .L133: movapd 0 * SIZE(AA), %xmm0 mulpd 0 * SIZE(BB), %xmm0 addpd %xmm0, %xmm4 addl $2 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L133 ALIGN_4 .L134: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 addpd %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm0 #else movapd 0 * SIZE(AA), %xmm0 #endif subpd %xmm4, %xmm0 #ifdef LN movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movsd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd 2 * SIZE(AA), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 unpcklpd %xmm2, %xmm0 #endif #ifdef LT movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(AA), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm2, %xmm0 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm0, 0 * SIZE(B) movsd %xmm0, 0 * SIZE(BB) movsd %xmm0, 1 * SIZE(BB) movhpd %xmm0, 2 * SIZE(BB) movhpd %xmm0, 3 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, %esi #endif movsd %xmm0, 0 * SIZE(%esi) movhpd %xmm0, 1 * SIZE(%esi) #ifndef LN addl $2 * SIZE, %esi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L150: movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L159 ALIGN_4 .L110: #ifdef LN movl K, %eax sall 
$2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L112 .L111: mulpd %xmm2, %xmm0 mulpd 2 * SIZE(AA), %xmm2 addpd %xmm0, %xmm4 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 movapd 2 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm0 mulpd 6 * SIZE(AA), %xmm2 addpd %xmm0, %xmm5 movapd 16 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movapd 4 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm1 mulpd 10 * SIZE(AA), %xmm2 addpd %xmm1, %xmm4 movapd 12 * SIZE(AA), %xmm1 addpd %xmm2, %xmm6 movapd 6 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm1 mulpd 14 * SIZE(AA), %xmm2 addpd %xmm1, %xmm5 movapd 24 * SIZE(AA), %xmm1 addpd %xmm2, %xmm7 movapd 16 * SIZE(BB), %xmm2 mulpd %xmm3, %xmm0 mulpd 18 * SIZE(AA), %xmm3 addpd %xmm0, %xmm4 movapd 20 * SIZE(AA), %xmm0 addpd %xmm3, %xmm6 movapd 10 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm0 mulpd 22 * SIZE(AA), %xmm3 addpd %xmm0, %xmm5 movapd 32 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movapd 12 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm1 mulpd 26 * SIZE(AA), %xmm3 addpd %xmm1, %xmm4 movapd 28 * SIZE(AA), %xmm1 addpd %xmm3, %xmm6 movapd 14 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm1 mulpd 30 * SIZE(AA), %xmm3 addpd %xmm1, %xmm5 movapd 40 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movapd 24 * SIZE(BB), %xmm3 addl $32 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L111 .L112: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L114 .L113: mulpd %xmm2, %xmm0 mulpd 2 * SIZE(AA), %xmm2 addpd %xmm0, %xmm4 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 movapd 2 * SIZE(BB), %xmm2 addl $4 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 subl $1, %eax jg .L113 ALIGN_4 .L114: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm0 movapd 2 * SIZE(B), %xmm1 #else movapd 0 * SIZE(AA), %xmm0 movapd 2 * SIZE(AA), %xmm1 #endif subpd %xmm4, %xmm0 subpd %xmm6, %xmm1 #ifdef LN movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movapd %xmm1, %xmm3 unpckhpd %xmm3, %xmm3 movsd 15 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm3 movsd 14 * SIZE(AA), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm1 movsd 13 * SIZE(AA), %xmm6 mulsd %xmm3, %xmm6 subsd %xmm6, %xmm2 movsd 12 * SIZE(AA), %xmm7 mulsd %xmm3, %xmm7 subsd %xmm7, %xmm0 movsd 10 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 movsd 9 * SIZE(AA), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm2 movsd 8 * SIZE(AA), %xmm6 mulsd %xmm1, %xmm6 subsd %xmm6, %xmm0 movsd 5 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd 4 * SIZE(AA), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 unpcklpd %xmm2, %xmm0 unpcklpd %xmm3, %xmm1 #endif #ifdef LT movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movapd %xmm1, %xmm3 unpckhpd %xmm3, %xmm3 movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(AA), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 2 * SIZE(AA), %xmm6 mulsd 
%xmm0, %xmm6 subsd %xmm6, %xmm1 movsd 3 * SIZE(AA), %xmm7 mulsd %xmm0, %xmm7 subsd %xmm7, %xmm3 movsd 5 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd 6 * SIZE(AA), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm1 movsd 7 * SIZE(AA), %xmm6 mulsd %xmm2, %xmm6 subsd %xmm6, %xmm3 movsd 10 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 movsd 11 * SIZE(AA), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm3 movsd 15 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm3 unpcklpd %xmm2, %xmm0 unpcklpd %xmm3, %xmm1 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #if defined(LN) || defined(LT) movapd %xmm0, 0 * SIZE(B) movapd %xmm1, 2 * SIZE(B) movsd %xmm0, 0 * SIZE(BB) movsd %xmm0, 1 * SIZE(BB) movhpd %xmm0, 2 * SIZE(BB) movhpd %xmm0, 3 * SIZE(BB) movsd %xmm1, 4 * SIZE(BB) movsd %xmm1, 5 * SIZE(BB) movhpd %xmm1, 6 * SIZE(BB) movhpd %xmm1, 7 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) movapd %xmm1, 2 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, %esi #endif movsd %xmm0, 0 * SIZE(%esi) movhpd %xmm0, 1 * SIZE(%esi) movsd %xmm1, 2 * SIZE(%esi) movhpd %xmm1, 3 * SIZE(%esi) #ifndef LN addl $4 * SIZE, %esi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif BRANCH decl %ebx # i -- jg .L110 ALIGN_2 .L159: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 1), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 1), B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_2 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_2 EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_LN_4x4_penryn.S000066400000000000000000001540611313527062700222140ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 20 + STACK + ARGS(%esp) #define ARG_B 24 + STACK + ARGS(%esp) #define C 28 + STACK + ARGS(%esp) #define ARG_LDC 32 + STACK + ARGS(%esp) #define OFFSET 36 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif #ifdef ATOM #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 8 + 4) #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHSIZE (16 * 2) #endif #define B %edi #define AA %edx #define BB %ecx #define LDC %ebp #define CO1 %esi PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC movl OFFSET, %eax #ifdef RN negl %eax #endif movl %eax, KK leal (, LDC, SIZE), LDC subl $-32 * SIZE, A subl $-32 * SIZE, B #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax sarl $2, %eax movl %eax, J jle .L40 .L10: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 4), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif testl $1, M je .L20 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movaps -32 * SIZE(BB), %xmm1 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -30 * 
SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -24 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -20 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -28 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -16 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -26 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -8 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -4 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -24 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps 0 * SIZE(BB), %xmm1 subl $ -8 * SIZE, AA subl $-32 * SIZE, BB subl $1, %eax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: pshufd $0x00, %xmm0, %xmm2 movss -31 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $4, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 4), BB #endif #if defined(LN) || defined(LT) movaps -32 * SIZE(BB), %xmm1 subps %xmm4, %xmm1 #else movsd -32 * SIZE(AA), %xmm0 movhps -30 * SIZE(AA), %xmm0 subps %xmm4, %xmm0 pshufd $0xff, %xmm0, %xmm3 pshufd $0xaa, %xmm0, %xmm2 pshufd $0x55, %xmm0, %xmm1 pshufd $0x00, %xmm0, %xmm0 #endif #if defined(LN) || defined(LT) movss -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm3 movaps -28 * SIZE(BB), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulss %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm3 movaps -24 * SIZE(BB), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulss %xmm2, %xmm7 subss %xmm7, %xmm3 movaps -20 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm3 #endif #ifdef RT movaps -20 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm3 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm3, %xmm7 subss %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulss %xmm3, %xmm7 subss %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulss %xmm3, %xmm7 subss %xmm7, %xmm0 movaps -24 * SIZE(BB), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulss %xmm2, %xmm7 subss %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulss %xmm2, %xmm7 subss %xmm7, %xmm0 movaps -28 * SIZE(BB), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulss %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm0 movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movaps %xmm1, -32 * SIZE(BB) #else movss %xmm0, -32 * SIZE(AA) movss %xmm1, -31 * SIZE(AA) movss %xmm2, -30 * SIZE(AA) movss %xmm3, -29 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm0 movaps %xmm3, %xmm4 unpcklps %xmm7, %xmm3 unpckhps %xmm7, %xmm4 movaps %xmm1, 
%xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movaps %xmm0, %xmm6 unpcklps %xmm4, %xmm0 unpckhps %xmm4, %xmm6 movss %xmm1, 0 * SIZE(CO1) movss %xmm2, 0 * SIZE(CO1, LDC, 1) movss %xmm0, 0 * SIZE(CO1, LDC, 2) movss %xmm6, 0 * SIZE(CO1, %eax, 1) #else movss %xmm0, 0 * SIZE(CO1) movss %xmm1, 0 * SIZE(CO1, LDC, 1) movss %xmm2, 0 * SIZE(CO1, LDC, 2) movss %xmm3, 0 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L20: testl $2, M je .L30 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 movaps -32 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movaps -32 * SIZE(BB), %xmm1 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm2 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 pshufd $0xee, %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm7 pshufd $0x44, %xmm0, %xmm2 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 pshufd $0xee, %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm7 pshufd $0x44, %xmm0, %xmm2 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -12 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 pshufd $0xee, %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -8 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm7 pshufd $0x44, %xmm0, %xmm2 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -4 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 pshufd $0xee, %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps 0 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm7 subl $-16 * SIZE, AA subl $-32 * SIZE, BB subl $1, %eax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: pshufd $0x44, %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $4, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax 
leal (AA, %eax, 2), AA leal (B, %eax, 4), BB #endif addps %xmm5, %xmm4 addps %xmm7, %xmm6 movhlps %xmm4, %xmm5 movhlps %xmm6, %xmm7 #if defined(LN) || defined(LT) unpcklps %xmm6, %xmm4 unpcklps %xmm7, %xmm5 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps -32 * SIZE(BB), %xmm1 movaps -28 * SIZE(BB), %xmm3 subps %xmm4, %xmm1 subps %xmm6, %xmm3 #else movsd -32 * SIZE(AA), %xmm0 movsd -30 * SIZE(AA), %xmm1 movsd -28 * SIZE(AA), %xmm2 movsd -26 * SIZE(AA), %xmm3 subps %xmm4, %xmm0 subps %xmm5, %xmm1 subps %xmm6, %xmm2 subps %xmm7, %xmm3 #endif #ifdef LN movaps -32 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm3 movaps -28 * SIZE(BB), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm3 movaps -24 * SIZE(BB), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm3 movaps -20 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 #endif #ifdef RT movaps -20 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm0 movaps -24 * SIZE(BB), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm0 movaps -28 * SIZE(BB), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movaps %xmm1, -32 * SIZE(BB) movaps %xmm3, -28 * SIZE(BB) #else movlps %xmm0, -32 * SIZE(AA) movlps %xmm1, -30 * SIZE(AA) movlps %xmm2, -28 * SIZE(AA) movlps %xmm3, -26 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm0 movaps %xmm3, %xmm4 unpcklps %xmm7, %xmm3 unpckhps %xmm7, %xmm4 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movaps %xmm0, %xmm6 unpcklps %xmm4, %xmm0 unpckhps %xmm4, %xmm6 movlps %xmm1, 0 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC, 1) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movlps %xmm6, 0 * SIZE(CO1, %eax, 1) #else movlps %xmm0, 0 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC, 1) movlps %xmm2, 0 * SIZE(CO1, LDC, 2) movlps %xmm3, 0 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif 
ALIGN_4 .L30: movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L39 ALIGN_4 .L11: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif leal (CO1, LDC, 2), %eax movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 #ifdef LN pxor %xmm4, %xmm4 prefetcht0 -4 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 -4 * SIZE(CO1, LDC) pxor %xmm6, %xmm6 prefetcht0 -4 * SIZE(%eax) pxor %xmm7, %xmm7 prefetcht0 -4 * SIZE(%eax, LDC) #else pxor %xmm4, %xmm4 prefetcht0 3 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 3 * SIZE(CO1, LDC) pxor %xmm6, %xmm6 prefetcht0 3 * SIZE(%eax) pxor %xmm7, %xmm7 prefetcht0 3 * SIZE(%eax, LDC) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -8 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 subl $-32 * SIZE, BB pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 subl $-32 * SIZE, AA pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -32 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -32 * SIZE(AA), %xmm0 subl $1, %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax 
subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $4, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (B, %eax, 4), BB #endif addps %xmm3, %xmm6 addps %xmm2, %xmm7 #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm7, %xmm0 unpckhps %xmm7, %xmm4 movaps %xmm6, %xmm2 unpcklps %xmm5, %xmm2 unpckhps %xmm5, %xmm6 movaps %xmm0, %xmm1 movlhps %xmm2, %xmm0 movhlps %xmm2, %xmm1 movaps %xmm6, %xmm7 movlhps %xmm4, %xmm6 movhlps %xmm4, %xmm7 pshufd $0x39, %xmm1, %xmm2 pshufd $0x39, %xmm7, %xmm4 movaps -32 * SIZE(BB), %xmm1 movaps -28 * SIZE(BB), %xmm3 movaps -24 * SIZE(BB), %xmm5 movaps -20 * SIZE(BB), %xmm7 subps %xmm0, %xmm1 subps %xmm2, %xmm3 subps %xmm6, %xmm5 subps %xmm4, %xmm7 #else pshufd $0x39, %xmm5, %xmm2 pshufd $0x4e, %xmm6, %xmm0 pshufd $0x93, %xmm7, %xmm7 movaps %xmm4, %xmm6 unpcklps %xmm0, %xmm4 unpckhps %xmm0, %xmm6 movaps %xmm2, %xmm1 unpcklps %xmm7, %xmm2 unpckhps %xmm7, %xmm1 movaps %xmm4, %xmm5 unpcklps %xmm2, %xmm4 unpckhps %xmm2, %xmm5 movaps %xmm6, %xmm7 unpcklps %xmm1, %xmm6 unpckhps %xmm1, %xmm7 pshufd $0x93, %xmm5, %xmm5 pshufd $0x4e, %xmm6, %xmm6 pshufd $0x39, %xmm7, %xmm7 movaps -32 * SIZE(AA), %xmm0 movaps -28 * SIZE(AA), %xmm1 movaps -24 * SIZE(AA), %xmm2 movaps -20 * SIZE(AA), %xmm3 subps %xmm4, %xmm0 subps %xmm5, %xmm1 subps %xmm6, %xmm2 subps %xmm7, %xmm3 #endif #ifdef LN movaps -20 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm1 movaps -24 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm1 movaps -28 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm7 movaps -28 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm7 movaps -24 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm7 movaps -20 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm3 movaps -28 * SIZE(BB), %xmm6 pshufd $0x55, 
%xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm3 movaps -24 * SIZE(BB), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm3 movaps -20 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 #endif #ifdef RT movaps -20 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm0 movaps -24 * SIZE(BB), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm0 movaps -28 * SIZE(BB), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movaps %xmm1, -32 * SIZE(BB) movaps %xmm3, -28 * SIZE(BB) movaps %xmm5, -24 * SIZE(BB) movaps %xmm7, -20 * SIZE(BB) #else movaps %xmm0, -32 * SIZE(AA) movaps %xmm1, -28 * SIZE(AA) movaps %xmm2, -24 * SIZE(AA) movaps %xmm3, -20 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm0 movaps %xmm3, %xmm4 unpcklps %xmm7, %xmm3 unpckhps %xmm7, %xmm4 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movaps %xmm0, %xmm6 unpcklps %xmm4, %xmm0 unpckhps %xmm4, %xmm6 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC, 1) movhps %xmm2, 2 * SIZE(CO1, LDC, 1) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movhps %xmm0, 2 * SIZE(CO1, LDC, 2) movlps %xmm6, 0 * SIZE(CO1, %eax, 1) movhps %xmm6, 2 * SIZE(CO1, %eax, 1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 2 * SIZE(CO1, LDC, 1) movlps %xmm2, 0 * SIZE(CO1, LDC, 2) movhps %xmm2, 2 * SIZE(CO1, LDC, 2) movlps %xmm3, 0 * SIZE(CO1, %eax, 1) movhps %xmm3, 2 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #endif #ifdef LN subl $4, KK #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L11 ALIGN_4 .L39: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 4), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $4, KK #endif #ifdef RT subl $4, KK #endif decl J # j -- jg .L10 ALIGN_4 .L40: testl $2, N je .L80 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif testl $1, M je .L60 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), 
%xmm0 pxor %xmm5, %xmm5 movsd -32 * SIZE(BB), %xmm1 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -30 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm5 movsd -28 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -26 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -28 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm5 movsd -24 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -22 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -26 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm5 movsd -20 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -18 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -24 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm5 movsd -16 * SIZE(BB), %xmm1 subl $ -8 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: pshufd $0x00, %xmm0, %xmm2 movss -31 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -30 * SIZE(BB), %xmm1 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), BB #endif addps %xmm5, %xmm4 pshufd $0x55, %xmm4, %xmm5 pshufd $0x00, %xmm4, %xmm4 #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm4 movsd -32 * SIZE(BB), %xmm1 subps %xmm4, %xmm1 #else movss -32 * SIZE(AA), %xmm0 movss -31 * SIZE(AA), %xmm1 subss %xmm4, %xmm0 subss %xmm5, %xmm1 #endif #if defined(LN) || defined(LT) movss -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm1 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm1 #endif #ifdef RT movaps -32 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm0 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm1, -32 * SIZE(BB) #else movss %xmm0, -32 * SIZE(AA) movss %xmm1, -31 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) pshufd $1, %xmm1, %xmm3 movss %xmm1, 0 * SIZE(CO1) movss %xmm3, 0 * SIZE(CO1, LDC) #else movss %xmm0, 0 * SIZE(CO1) movss %xmm1, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L60: testl $2, M je .L70 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm3, %xmm3 movaps -32 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 pxor %xmm5, 
%xmm5 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L65 ALIGN_4 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm2 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 pshufd $0xee, %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 pshufd $0x44, %xmm0, %xmm2 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 pshufd $0xee, %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 pshufd $0x44, %xmm0, %xmm2 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 pshufd $0xee, %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 pshufd $0x44, %xmm0, %xmm2 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 pshufd $0xee, %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 subl $-16 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: pshufd $0x44, %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L66 ALIGN_4 .L68: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB #endif addps %xmm3, %xmm4 addps %xmm5, %xmm4 movhlps %xmm4, %xmm5 #if defined(LN) || defined(LT) unpcklps %xmm6, %xmm4 unpcklps %xmm7, %xmm5 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movsd -32 * SIZE(BB), %xmm1 movsd -30 * SIZE(BB), %xmm3 subps %xmm4, %xmm1 subps %xmm6, %xmm3 #else movsd -32 * SIZE(AA), %xmm0 movsd -30 * SIZE(AA), %xmm1 subps %xmm4, %xmm0 subps %xmm5, %xmm1 #endif #ifdef LN movaps -32 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 #endif #ifdef RT movaps -32 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm1, -32 * SIZE(BB) movlps %xmm3, -30 * SIZE(BB) #else movlps %xmm0, -32 * SIZE(AA) movlps %xmm1, -30 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm3, %xmm1 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 0 * SIZE(CO1, LDC) #else movlps %xmm0, 0 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $2, KK 
#endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L70: movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L79 ALIGN_4 .L51: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 #ifdef LN pxor %xmm4, %xmm4 prefetcht0 -4 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 -4 * SIZE(CO1, LDC) #else pxor %xmm4, %xmm4 prefetcht0 3 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 3 * SIZE(CO1, LDC) #endif pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L55 ALIGN_4 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0xff, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0xff, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -16 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -12 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0xff, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -8 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -4 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0xff, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps 0 * SIZE(AA), %xmm0 subl $-32 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (B, %eax, 2), BB #endif addps %xmm6, %xmm4 addps %xmm7, %xmm5 addps %xmm2, %xmm4 addps %xmm3, %xmm5 #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm0 movaps %xmm5, %xmm1 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm1 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movsd -32 * SIZE(BB), %xmm1 movsd -30 * SIZE(BB), %xmm3 movsd -28 * SIZE(BB), %xmm5 movsd -26 * SIZE(BB), %xmm7 subps 
%xmm4, %xmm1 subps %xmm6, %xmm3 subps %xmm0, %xmm5 subps %xmm2, %xmm7 #else movaps -32 * SIZE(AA), %xmm0 movaps -28 * SIZE(AA), %xmm1 subps %xmm4, %xmm0 subps %xmm5, %xmm1 #endif #ifdef LN movaps -20 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm1 movaps -24 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm1 movaps -28 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm7 movaps -28 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm7 movaps -24 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm7 movaps -20 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 #endif #ifdef RT movaps -32 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm1, -32 * SIZE(BB) movlps %xmm3, -30 * SIZE(BB) movlps %xmm5, -28 * SIZE(BB) movlps %xmm7, -26 * SIZE(BB) #else movaps %xmm0, -32 * SIZE(AA) movaps %xmm1, -28 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm1 unpcklps %xmm7, %xmm3 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC, 1) movhps %xmm2, 2 * SIZE(CO1, LDC, 1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 2 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $4, KK #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L51 ALIGN_4 .L79: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif ALIGN_4 .L80: testl $1, N je .L999 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK 
#endif testl $1, M je .L100 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movsd -32 * SIZE(BB), %xmm1 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L115 ALIGN_4 .L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulps %xmm0, %xmm1 movsd -30 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movsd -28 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movsd -26 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -26 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movsd -24 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -24 * SIZE(BB), %xmm1 subl $-8 * SIZE, AA subl $-8 * SIZE, BB subl $1, %eax jne .L112 ALIGN_4 .L115: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulss %xmm0, %xmm1 movss -31 * SIZE(AA), %xmm0 addss %xmm1, %xmm4 movss -31 * SIZE(BB), %xmm1 addl $1 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L116 ALIGN_4 .L118: #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA leal (B, %eax, SIZE), BB #endif haddps %xmm4, %xmm4 #if defined(LN) || defined(LT) movss -32 * SIZE(BB), %xmm1 subss %xmm4, %xmm1 #else movss -32 * SIZE(AA), %xmm0 subss %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) mulss -32 * SIZE(AA), %xmm1 #endif #if defined(RN) || defined(RT) mulss -32 * SIZE(BB), %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm1, -32 * SIZE(BB) #else movss %xmm0, -32 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(CO1) #else movss %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA, %eax, SIZE), AA leal (BB, %eax, SIZE), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L100: testl $2, M je .L110 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm3, %xmm3 movsd -32 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L105 ALIGN_4 .L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x55, %xmm1, %xmm2 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movsd -26 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x55, %xmm1, %xmm2 movsd -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movsd -22 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x55, %xmm1, %xmm2 movsd -26 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movsd -18 * 
SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x55, %xmm1, %xmm2 movsd -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -16 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 subl $-16 * SIZE, AA subl $ -8 * SIZE, BB subl $1, %eax jne .L102 ALIGN_4 .L105: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L108 ALIGN_4 .L106: pshufd $0x00, %xmm1, %xmm2 movss -31 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 addl $2 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L106 ALIGN_4 .L108: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB #endif addps %xmm5, %xmm4 #if defined(LN) || defined(LT) pshufd $1, %xmm4, %xmm6 movss -32 * SIZE(BB), %xmm1 movss -31 * SIZE(BB), %xmm3 subss %xmm4, %xmm1 subss %xmm6, %xmm3 #else movsd -32 * SIZE(AA), %xmm0 subps %xmm4, %xmm0 #endif #ifdef LN movsd -32 * SIZE(AA), %xmm4 movhps -30 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm1 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm3 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm3 #endif #if defined(RN) || defined(RT) movss -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm1, -32 * SIZE(BB) movss %xmm3, -31 * SIZE(BB) #else movlps %xmm0, -32 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(CO1) movss %xmm3, 1 * SIZE(CO1) #else movlps %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 1), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L110: movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L119 ALIGN_4 .L91: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movsd -32 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 #ifdef LN prefetcht0 -4 * SIZE(CO1) #else prefetcht0 3 * SIZE(CO1) #endif pxor %xmm5, %xmm5 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L95 ALIGN_4 .L92: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x55, %xmm1, %xmm2 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x55, %xmm1, %xmm2 movsd -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x55, %xmm1, %xmm2 movsd -26 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addps %xmm2, 
%xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x55, %xmm1, %xmm2 movsd -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 subl $-32 * SIZE, AA subl $ -8 * SIZE, BB subl $1, %eax jne .L92 ALIGN_4 .L95: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 movss -31 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L96 ALIGN_4 .L98: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (B, %eax, 1), BB #endif addps %xmm2, %xmm4 addps %xmm5, %xmm4 #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm0 movaps %xmm5, %xmm1 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm1 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movss -32 * SIZE(BB), %xmm1 movss -31 * SIZE(BB), %xmm3 movss -30 * SIZE(BB), %xmm5 movss -29 * SIZE(BB), %xmm7 subss %xmm4, %xmm1 subss %xmm6, %xmm3 subss %xmm0, %xmm5 subss %xmm2, %xmm7 #else movaps -32 * SIZE(AA), %xmm0 subps %xmm4, %xmm0 #endif #ifdef LN movaps -20 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm7 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm7, %xmm6 subss %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulss %xmm7, %xmm6 subss %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulss %xmm7, %xmm6 subss %xmm6, %xmm1 movaps -24 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulss %xmm5, %xmm6 subss %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulss %xmm5, %xmm6 subss %xmm6, %xmm1 movaps -28 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulss %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm1 movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm7 movaps -28 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulss %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm7 movaps -24 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulss %xmm5, %xmm6 subss %xmm6, %xmm7 movaps -20 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm7 #endif #if defined(RN) || defined(RT) movss -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm1, -32 * SIZE(BB) movss %xmm3, -31 * SIZE(BB) movss %xmm5, -30 * SIZE(BB) movss %xmm7, -29 * SIZE(BB) #else movaps %xmm0, -32 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm1 unpcklps %xmm7, %xmm3 unpcklps %xmm3, %xmm1 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 1), BB #endif #ifdef LN subl $4, KK #endif #ifdef 
LT addl $4, KK #endif #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L91 ALIGN_4 .L119: #ifdef LN movl K, %eax leal (B, %eax, SIZE), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_LN_4x4_sse.S000066400000000000000000002063351313527062700214750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define OLD_M 4 + STACK(%esi) #define OLD_N 8 + STACK(%esi) #define OLD_K 12 + STACK(%esi) #define OLD_A 20 + STACK(%esi) #define OLD_B 24 + STACK(%esi) #define OLD_C 28 + STACK(%esi) #define OLD_LDC 32 + STACK(%esi) #define STACK_OFFT 36 + STACK(%esi) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define AORIG 56(%esp) #define BORIG 60(%esp) #define BUFFER 128(%esp) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) #endif #if defined(PENTIUM4) || defined(PENTIUMM) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE 96 #endif #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE 96 #endif #define B %edi #define AA %edx #define BB %ecx #define LDC %ebp #define CO1 %esi #if defined(OPTERON) || !defined(HAVE_SSE2) #define movsd movlps #endif #ifdef HAVE_SSE2 #define xorps pxor #endif #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ addps %xmm2, %xmm5; \ movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; 
\ movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi subl $128 + LOCAL_BUFFER_SIZE, %esp andl $-1024, %esp STACK_TOUCHING movl OLD_M, %ebx movl OLD_N, %eax movl OLD_K, %ecx movl OLD_A, %edx movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movss STACK_OFFT, %xmm4 movl OLD_B, B movl OLD_C, %ebx movl %ebx, C movl OLD_LDC, LDC movss %xmm4, OFFSET movss %xmm4, KK leal (, LDC, SIZE), LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax sarl $2, %eax movl %eax, J jle .L40 .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, %ecx #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $2 + BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $1, %eax jle .L05 ALIGN_4 .L02: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) addl $ 8 * SIZE, B addl $32 * SIZE, %ecx decl %eax jne .L02 ALIGN_2 .L05: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $1, %eax BRANCH jle .L10 movaps 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) 
movaps %xmm3, 12 * SIZE(BB) addl $4 * SIZE, B ALIGN_4 .L10: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 4), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif testl $1, M je .L20 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movss 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movss 4 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movss 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movss 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L35 ALIGN_4 .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm5 movss 8 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 1 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 addss %xmm3, %xmm4 movss 20 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 addss %xmm3, %xmm5 movss 24 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 mulss 28 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 48 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm2 addss %xmm2, %xmm4 movss 36 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm5 movss 40 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 mulss 44 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 64 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 3 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 addss %xmm3, %xmm4 movss 52 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 addss %xmm3, %xmm5 movss 56 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 mulss 60 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 80 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 addss %xmm2, %xmm4 movss 68 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 addss %xmm2, %xmm5 movss 72 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 mulss 76 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 96 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 5 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 addss %xmm3, %xmm4 movss 84 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 addss %xmm3, %xmm5 movss 88 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 mulss 92 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 112 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm2 addss %xmm2, %xmm4 movss 100 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 addss %xmm2, %xmm5 movss 104 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 mulss 108 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 128 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 7 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 addss %xmm3, %xmm4 movss 116 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 addss %xmm3, %xmm5 movss 120 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 mulss 124 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 144 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 movss 4 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm5 movss 8 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 16 * 
SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 1 * SIZE(AA), %xmm0 addl $ 1 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $4, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (AA, %eax, SIZE), AA sall $2 + BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) unpcklps %xmm6, %xmm4 unpcklps %xmm7, %xmm5 unpcklps %xmm5, %xmm4 movaps 0 * SIZE(B), %xmm1 subps %xmm4, %xmm1 #else movss 0 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm1 movss 2 * SIZE(AA), %xmm2 movss 3 * SIZE(AA), %xmm3 subss %xmm4, %xmm0 subss %xmm5, %xmm1 subss %xmm6, %xmm2 subss %xmm7, %xmm3 #endif #if defined(LN) || defined(LT) movss 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef RN movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm3 movaps 4 * SIZE(B), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulss %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm3 movaps 8 * SIZE(B), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulss %xmm2, %xmm7 subss %xmm7, %xmm3 movaps 12 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm3 #endif #ifdef RT movaps 12 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm3 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm3, %xmm7 subss %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulss %xmm3, %xmm7 subss %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulss %xmm3, %xmm7 subss %xmm7, %xmm0 movaps 8 * SIZE(B), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulss %xmm2, %xmm7 subss %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulss %xmm2, %xmm7 subss %xmm7, %xmm0 movaps 4 * SIZE(B), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulss %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm0 movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movaps %xmm1, 0 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 pshufd $0x55, %xmm1, %xmm2 pshufd $0xaa, %xmm1, %xmm4 pshufd $0xff, %xmm1, %xmm6 movaps %xmm0, 0 * SIZE(BB) movaps %xmm2, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) #else movss %xmm0, 0 * SIZE(AA) movss %xmm1, 1 * SIZE(AA) movss %xmm2, 2 * SIZE(AA) movss %xmm3, 3 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm0 movaps %xmm3, %xmm4 unpcklps %xmm7, %xmm3 unpckhps %xmm7, %xmm4 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movaps %xmm0, %xmm6 unpcklps %xmm4, %xmm0 unpckhps %xmm4, %xmm6 movss %xmm1, 0 * SIZE(CO1) movss %xmm2, 0 * SIZE(CO1, LDC, 1) movss %xmm0, 0 * SIZE(CO1, LDC, 2) movss %xmm6, 0 * SIZE(CO1, %eax, 1) #else movss %xmm0, 0 * SIZE(CO1) movss %xmm1, 0 * SIZE(CO1, LDC, 1) movss %xmm2, 0 * SIZE(CO1, LDC, 2) movss %xmm3, 0 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA, %eax, SIZE), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif 
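/* .L20 below handles the M & 2 tail for the current group of four columns: two rows of A are processed per iteration using 64-bit movsd loads (movsd may be redefined to movlps above), while B is read as full movaps vectors from the BUFFER copy. */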
ALIGN_4 .L20: testl $2, M je .L30 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movaps 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L25 ALIGN_4 .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 68 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movaps 72 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 76 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 96 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 84 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 88 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 92 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 112 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 100 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movaps 104 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 108 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 14 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 128 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 116 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 120 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 124 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 144 * SIZE(BB), %xmm3 addl $ 16 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps 
%xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 16 * SIZE(BB), %xmm2 addl $ 2 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $4, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $1 + BASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) unpcklps %xmm6, %xmm4 unpcklps %xmm7, %xmm5 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps 0 * SIZE(B), %xmm1 movaps 4 * SIZE(B), %xmm3 subps %xmm4, %xmm1 subps %xmm6, %xmm3 #else #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 2 * SIZE(AA), %xmm1 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 4 * SIZE(AA), %xmm2 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 6 * SIZE(AA), %xmm3 subps %xmm4, %xmm0 subps %xmm5, %xmm1 subps %xmm6, %xmm2 subps %xmm7, %xmm3 #endif #ifdef LN movaps 0 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 #endif #ifdef RN movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm3 movaps 4 * SIZE(B), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm3 movaps 8 * SIZE(B), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm3 movaps 12 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 #endif #ifdef RT movaps 12 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm0 movaps 8 * SIZE(B), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm0 movaps 4 * SIZE(B), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movaps %xmm1, 0 * SIZE(B) movaps %xmm3, 4 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 pshufd $0x55, %xmm1, %xmm2 pshufd $0xaa, %xmm1, %xmm4 pshufd $0xff, %xmm1, %xmm6 movaps %xmm0, 0 * SIZE(BB) movaps %xmm2, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm2 pshufd $0xaa, %xmm3, %xmm4 pshufd $0xff, %xmm3, %xmm6 movaps %xmm0, 16 * SIZE(BB) movaps %xmm2, 20 * SIZE(BB) movaps %xmm4, 24 * SIZE(BB) movaps %xmm6, 28 * SIZE(BB) #else movlps %xmm0, 0 * SIZE(AA) movlps %xmm1, 2 * SIZE(AA) movlps %xmm2, 4 * SIZE(AA) movlps %xmm3, 6 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if 
defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm0 movaps %xmm3, %xmm4 unpcklps %xmm7, %xmm3 unpckhps %xmm7, %xmm4 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movaps %xmm0, %xmm6 unpcklps %xmm4, %xmm0 unpckhps %xmm4, %xmm6 movlps %xmm1, 0 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC, 1) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movlps %xmm6, 0 * SIZE(CO1, %eax, 1) #else movlps %xmm0, 0 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC, 1) movlps %xmm2, 0 * SIZE(CO1, LDC, 2) movlps %xmm3, 0 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L30: movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L39 ALIGN_4 .L11: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movaps 16 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movaps 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 leal (LDC, LDC, 2), %eax PREFETCHW -4 * SIZE(CO1) PREFETCHW -4 * SIZE(CO1, LDC) PREFETCHW -4 * SIZE(CO1, LDC, 2) PREFETCHW -4 * SIZE(CO1, %eax) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L12: KERNEL1(0 * 16) KERNEL2(0 * 16) KERNEL3(0 * 16) KERNEL4(0 * 16) KERNEL5(0 * 16) KERNEL6(0 * 16) KERNEL7(0 * 16) KERNEL8(0 * 16) addl $128 * SIZE, BB addl $32 * SIZE, AA decl %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 4 * SIZE(AA), %xmm0 addl $ 4 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $4, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $2 + BASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm0 movaps %xmm5, %xmm1 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm1 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movaps 0 * SIZE(B), %xmm1 movaps 4 * SIZE(B), %xmm3 movaps 8 * SIZE(B), %xmm5 movaps 12 * SIZE(B), %xmm7 subps %xmm4, %xmm1 subps %xmm6, %xmm3 subps %xmm0, %xmm5 subps %xmm2, %xmm7 #else movaps 0 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm1 movaps 8 * SIZE(AA), %xmm2 movaps 12 * SIZE(AA), %xmm3 subps %xmm4, %xmm0 subps %xmm5, %xmm1 subps %xmm6, %xmm2 subps %xmm7, %xmm3 #endif #ifdef LN movaps 12 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, 
%xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm1 movaps 8 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm1 movaps 4 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm7 movaps 4 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm7 movaps 8 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm7 movaps 12 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 #endif #ifdef RN movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm3 movaps 4 * SIZE(B), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm3 movaps 8 * SIZE(B), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm3 movaps 12 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 #endif #ifdef RT movaps 12 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm0 movaps 8 * SIZE(B), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm0 movaps 4 * SIZE(B), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movaps %xmm1, 0 * SIZE(B) movaps %xmm3, 4 * SIZE(B) movaps %xmm5, 8 * SIZE(B) movaps %xmm7, 12 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 pshufd $0x55, %xmm1, %xmm2 pshufd $0xaa, %xmm1, %xmm4 pshufd $0xff, %xmm1, %xmm6 movaps %xmm0, 0 * SIZE(BB) movaps %xmm2, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm2 pshufd $0xaa, %xmm3, %xmm4 pshufd $0xff, %xmm3, %xmm6 movaps %xmm0, 16 * SIZE(BB) movaps %xmm2, 20 * SIZE(BB) movaps %xmm4, 24 * SIZE(BB) movaps %xmm6, 28 * SIZE(BB) pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm2 pshufd $0xaa, %xmm5, %xmm4 pshufd $0xff, %xmm5, %xmm6 movaps %xmm0, 32 * SIZE(BB) movaps %xmm2, 36 * SIZE(BB) movaps %xmm4, 40 * SIZE(BB) movaps %xmm6, 44 * SIZE(BB) pshufd $0x00, %xmm7, %xmm0 pshufd $0x55, %xmm7, %xmm2 pshufd $0xaa, %xmm7, %xmm4 pshufd $0xff, %xmm7, %xmm6 movaps %xmm0, 48 * SIZE(BB) movaps %xmm2, 52 * SIZE(BB) movaps %xmm4, 
56 * SIZE(BB) movaps %xmm6, 60 * SIZE(BB) #else movaps %xmm0, 0 * SIZE(AA) movaps %xmm1, 4 * SIZE(AA) movaps %xmm2, 8 * SIZE(AA) movaps %xmm3, 12 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm0 movaps %xmm3, %xmm4 unpcklps %xmm7, %xmm3 unpckhps %xmm7, %xmm4 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movaps %xmm0, %xmm6 unpcklps %xmm4, %xmm0 unpckhps %xmm4, %xmm6 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC, 1) movhps %xmm2, 2 * SIZE(CO1, LDC, 1) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movhps %xmm0, 2 * SIZE(CO1, LDC, 2) movlps %xmm6, 0 * SIZE(CO1, %eax, 1) movhps %xmm6, 2 * SIZE(CO1, %eax, 1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 2 * SIZE(CO1, LDC, 1) movlps %xmm2, 0 * SIZE(CO1, LDC, 2) movhps %xmm2, 2 * SIZE(CO1, LDC, 2) movlps %xmm3, 0 * SIZE(CO1, %eax, 1) movhps %xmm3, 2 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $16 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L11 ALIGN_4 .L39: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 4), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 4), B #endif #ifdef RN addl $4, KK #endif #ifdef RT subl $4, KK #endif decl J # j -- jg .L01 ALIGN_4 .L40: testl $2, N je .L80 #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, %ecx #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $1 + BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L45 ALIGN_4 .L42: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) addl $ 8 * SIZE, B addl $32 * SIZE, %ecx decl %eax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L50 ALIGN_4 .L46: #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) addl $2 * SIZE, B addl $8 * SIZE, %ecx decl %eax jne .L46 ALIGN_4 .L50: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif testl $1, M je .L60 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif leal BUFFER, BB #if defined(LN) || 
defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movss 0 * SIZE(AA), %xmm0 movss 4 * SIZE(AA), %xmm1 movss 0 * SIZE(BB), %xmm2 movss 16 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L75 ALIGN_4 .L72: mulss %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 20 * SIZE(BB), %xmm0 addss %xmm3, %xmm4 movss 24 * SIZE(BB), %xmm3 addss %xmm0, %xmm5 movss 3 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 28 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 48 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 mulss 36 * SIZE(BB), %xmm1 addss %xmm2, %xmm4 movss 40 * SIZE(BB), %xmm2 addss %xmm1, %xmm5 movss 5 * SIZE(AA), %xmm1 mulss %xmm1, %xmm2 mulss 44 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 64 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 52 * SIZE(BB), %xmm1 addss %xmm3, %xmm4 movss 56 * SIZE(BB), %xmm3 addss %xmm1, %xmm5 movss 7 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 60 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 80 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulss %xmm0, %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 addl $ 1 * SIZE, AA addl $ 8 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: addss %xmm6, %xmm4 addss %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm4 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(B), %xmm1 subps %xmm4, %xmm1 #else movss 0 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm1 subss %xmm4, %xmm0 subss %xmm5, %xmm1 #endif #if defined(LN) || defined(LT) movss 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef RN movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm1 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm1 #endif #ifdef RT movaps 0 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm0 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm1, 0 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 pshufd $0x55, %xmm1, %xmm2 movaps %xmm0, 0 * SIZE(BB) movaps %xmm2, 4 * SIZE(BB) #else movss %xmm0, 0 * SIZE(AA) movss %xmm1, 1 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) pshufd $1, %xmm1, %xmm3 movss %xmm1, 0 * SIZE(CO1) movss %xmm3, 0 * SIZE(CO1, LDC) #else movss %xmm0, 0 * SIZE(CO1) movss %xmm1, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $1 * SIZE, CO1 
#endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA, %eax, SIZE), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L60: testl $2, M je .L70 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L65 ALIGN_4 .L62: #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L66 ALIGN_4 .L68: addps %xmm6, %xmm4 addps %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) unpcklps %xmm6, %xmm4 unpcklps %xmm7, %xmm5 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(B), %xmm1 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 2 * SIZE(B), %xmm3 subps %xmm4, %xmm1 subps %xmm6, %xmm3 #else #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 #ifdef movsd xorps %xmm1, %xmm1 
#endif movsd 2 * SIZE(AA), %xmm1 subps %xmm4, %xmm0 subps %xmm5, %xmm1 #endif #ifdef LN movaps 0 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 #endif #ifdef RN movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 #endif #ifdef RT movaps 0 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm1, 0 * SIZE(B) movlps %xmm3, 2 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 pshufd $0x55, %xmm1, %xmm2 movaps %xmm0, 0 * SIZE(BB) movaps %xmm2, 4 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm2 movaps %xmm0, 8 * SIZE(BB) movaps %xmm2, 12 * SIZE(BB) #else movlps %xmm0, 0 * SIZE(AA) movlps %xmm1, 2 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm3, %xmm1 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 0 * SIZE(CO1, LDC) #else movlps %xmm0, 0 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L70: movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L79 ALIGN_4 .L51: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movaps 0 * SIZE(AA), %xmm0 movaps 16 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 PREFETCHW -4 * SIZE(CO1) PREFETCHW -4 * SIZE(CO1, LDC) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L55 ALIGN_4 .L52: mulps %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 8 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 20 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps 12 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 28 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps 48 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 mulps 36 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 40 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 20 * SIZE(AA), %xmm1 
mulps %xmm1, %xmm2 mulps 44 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 64 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 24 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 52 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 80 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $1 + BASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm0 movaps %xmm5, %xmm1 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm1 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(B), %xmm1 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 2 * SIZE(B), %xmm3 #ifdef movsd xorps %xmm5, %xmm5 #endif movsd 4 * SIZE(B), %xmm5 #ifdef movsd xorps %xmm7, %xmm7 #endif movsd 6 * SIZE(B), %xmm7 subps %xmm4, %xmm1 subps %xmm6, %xmm3 subps %xmm0, %xmm5 subps %xmm2, %xmm7 #else movaps 0 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm1 subps %xmm4, %xmm0 subps %xmm5, %xmm1 #endif #ifdef LN movaps 12 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm1 movaps 8 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm1 movaps 4 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm7 movaps 4 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm7 movaps 8 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm7 movaps 12 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 #endif #ifdef RN movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 #endif #ifdef RT movaps 0 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 
pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm1, 0 * SIZE(B) movlps %xmm3, 2 * SIZE(B) movlps %xmm5, 4 * SIZE(B) movlps %xmm7, 6 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 pshufd $0x55, %xmm1, %xmm2 movaps %xmm0, 0 * SIZE(BB) movaps %xmm2, 4 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm2 movaps %xmm0, 8 * SIZE(BB) movaps %xmm2, 12 * SIZE(BB) pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm2 movaps %xmm0, 16 * SIZE(BB) movaps %xmm2, 20 * SIZE(BB) pshufd $0x00, %xmm7, %xmm0 pshufd $0x55, %xmm7, %xmm2 movaps %xmm0, 24 * SIZE(BB) movaps %xmm2, 28 * SIZE(BB) #else movaps %xmm0, 0 * SIZE(AA) movaps %xmm1, 4 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm1 unpcklps %xmm7, %xmm3 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC, 1) movhps %xmm2, 2 * SIZE(CO1, LDC, 1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 2 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L51 ALIGN_4 .L79: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 2), B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif ALIGN_4 .L80: testl $1, N je .L999 #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, %ecx #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax jle .L85 ALIGN_4 .L82: movsd 0 * SIZE(B), %xmm3 movhps 2 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm7 movhps 6 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) addl $ 8 * SIZE, B addl $32 * SIZE, BB decl %eax jne .L82 ALIGN_4 .L85: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH jle .L90 ALIGN_4 .L86: movss 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 movaps %xmm0, 0 * SIZE(BB) addl $1 * SIZE, B addl $4 * SIZE, BB decl %eax jne .L86 ALIGN_4 .L90: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif testl $1, M je .L100 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall 
$BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movss 0 * SIZE(AA), %xmm0 movss 4 * SIZE(AA), %xmm1 movss 0 * SIZE(BB), %xmm2 movss 16 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L115 ALIGN_4 .L112: mulss %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 movss 32 * SIZE(BB), %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm0, %xmm5 movss 2 * SIZE(AA), %xmm0 mulss 8 * SIZE(BB), %xmm0 addss %xmm0, %xmm6 movss 3 * SIZE(AA), %xmm0 mulss 12 * SIZE(BB), %xmm0 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm3 movss 5 * SIZE(AA), %xmm1 addss %xmm3, %xmm4 movss 48 * SIZE(BB), %xmm3 mulss 20 * SIZE(BB), %xmm1 addss %xmm1, %xmm5 movss 6 * SIZE(AA), %xmm1 mulss 24 * SIZE(BB), %xmm1 addss %xmm1, %xmm6 movss 7 * SIZE(AA), %xmm1 mulss 28 * SIZE(BB), %xmm1 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L112 ALIGN_4 .L115: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulss %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 movss 4 * SIZE(BB), %xmm2 addl $ 1 * SIZE, AA addl $ 4 * SIZE, BB decl %eax jg .L116 ALIGN_4 .L118: addss %xmm5, %xmm4 addss %xmm7, %xmm6 addss %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ BASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movss 0 * SIZE(B), %xmm1 subss %xmm4, %xmm1 #else movss 0 * SIZE(AA), %xmm0 subss %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) mulss 0 * SIZE(AA), %xmm1 #endif #if defined(RN) || defined(RT) mulss 0 * SIZE(B), %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 movaps %xmm0, 0 * SIZE(BB) #else movss %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(CO1) #else movss %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA, %eax, SIZE), AA #ifdef LT addl $1 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L100: testl $2, M je .L110 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L105 ALIGN_4 .L102: mulps %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movaps 
4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 16 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 movsd 10 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L102 ALIGN_4 .L105: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L108 ALIGN_4 .L106: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 2 * SIZE(AA), %xmm0 movaps 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L106 ALIGN_4 .L108: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ BASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) pshufd $1, %xmm4, %xmm6 movss 0 * SIZE(B), %xmm1 movss 1 * SIZE(B), %xmm3 subss %xmm4, %xmm1 subss %xmm6, %xmm3 #else #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 subps %xmm4, %xmm0 #endif #ifdef LN movaps 0 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm1 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm3 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm3 #endif #if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) movss %xmm3, 1 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 movaps %xmm0, 0 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 movaps %xmm0, 4 * SIZE(BB) #else movlps %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(CO1) movss %xmm3, 1 * SIZE(CO1) #else movlps %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L110: movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L119 ALIGN_4 .L91: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movaps 0 * SIZE(AA), %xmm0 movaps 16 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 PREFETCHW -4 * SIZE(CO1) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L95 ALIGN_4 .L92: 
mulps %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm0, %xmm5 movaps 8 * SIZE(AA), %xmm0 mulps 8 * SIZE(BB), %xmm0 addps %xmm0, %xmm6 movaps 12 * SIZE(AA), %xmm0 mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 movaps 20 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movaps 48 * SIZE(BB), %xmm3 mulps 20 * SIZE(BB), %xmm1 addps %xmm1, %xmm5 movaps 24 * SIZE(AA), %xmm1 mulps 24 * SIZE(BB), %xmm1 addps %xmm1, %xmm6 movaps 28 * SIZE(AA), %xmm1 mulps 28 * SIZE(BB), %xmm1 addps %xmm1, %xmm7 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L92 ALIGN_4 .L95: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(BB), %xmm2 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L96 ALIGN_4 .L98: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ BASE_SHIFT, %eax leal (AA, %eax, 4), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm0 movaps %xmm5, %xmm1 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm1 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movss 0 * SIZE(B), %xmm1 movss 1 * SIZE(B), %xmm3 movss 2 * SIZE(B), %xmm5 movss 3 * SIZE(B), %xmm7 subss %xmm4, %xmm1 subss %xmm6, %xmm3 subss %xmm0, %xmm5 subss %xmm2, %xmm7 #else movaps 0 * SIZE(AA), %xmm0 subps %xmm4, %xmm0 #endif #ifdef LN movaps 12 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm7 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm7, %xmm6 subss %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulss %xmm7, %xmm6 subss %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulss %xmm7, %xmm6 subss %xmm6, %xmm1 movaps 8 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulss %xmm5, %xmm6 subss %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulss %xmm5, %xmm6 subss %xmm6, %xmm1 movaps 4 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulss %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm1 movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm7 movaps 4 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulss %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm7 movaps 8 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulss %xmm5, %xmm6 subss %xmm6, %xmm7 movaps 12 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm7 #endif #if defined(RN) || defined(RT) movss 0 * 
SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) movss %xmm3, 1 * SIZE(B) movss %xmm5, 2 * SIZE(B) movss %xmm7, 3 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 movaps %xmm0, 0 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 movaps %xmm0, 4 * SIZE(BB) pshufd $0x00, %xmm5, %xmm0 movaps %xmm0, 8 * SIZE(BB) pshufd $0x00, %xmm7, %xmm0 movaps %xmm0, 12 * SIZE(BB) #else movss %xmm0, 0 * SIZE(AA) movss %xmm1, 1 * SIZE(AA) movss %xmm2, 2 * SIZE(AA) movss %xmm3, 3 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm1 unpcklps %xmm7, %xmm3 unpcklps %xmm3, %xmm1 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L91 ALIGN_4 .L119: #ifdef LN movl K, %eax leal (B, %eax, SIZE), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (B, %eax, SIZE), B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: movl OLD_STACK, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_LN_8x2_sse.S000066400000000000000000001764701313527062700215050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if !defined(HAVE_SSE) || !defined(HAVE_MMX) #error You have to check your configuration. #endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_A 20 + STACK + ARGS(%esi) #define STACK_B 24 + STACK + ARGS(%esi) #define STACK_C 28 + STACK + ARGS(%esi) #define STACK_LDC 32 + STACK + ARGS(%esi) #define STACK_OFFT 36 + STACK + ARGS(%esi) #define TRMASK 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define AORIG 56(%esp) #define BORIG 60(%esp) #define BUFFER 128(%esp) #ifdef HAVE_3DNOW #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) #else #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE 96 #endif #define B %edi #define AA %edx #define BB %ecx #define LDC %ebp #define CO1 %esi #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #if !defined(HAVE_SSE2) || defined(OPTERON) #define movsd movlps #endif #ifdef HAVE_SSE2 #define xorps pxor #endif PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE, %esp andl $-STACK_ALIGN, %esp STACK_TOUCHING movss STACK_M, %xmm0 movl STACK_N, %eax movss STACK_K, %xmm1 movss STACK_A, %xmm2 movl STACK_B, B movss STACK_C, %xmm3 movl STACK_LDC, LDC movss STACK_OFFT, %xmm4 movss %xmm1, K movl %eax, N movss %xmm0, M movss %xmm2, A movss %xmm3, C movl %esi, OLD_STACK movss %xmm4, OFFSET movss %xmm4, KK leal (, LDC, SIZE), LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif #if defined(LN) || defined(LT) movl $0x3f800000, 0 + TRMASK # 1.0 movl $0x00000000, 4 + TRMASK # 0.0 movl $0x3f800000, 8 + TRMASK # 1.0 movl $0x00000000, 12 + TRMASK # 0.0 #endif movl N, %eax sarl $1, %eax # j = (n >> 1) movl %eax, J jle .L100 ALIGN_2 .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $1 + BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L03 ALIGN_4 .L02: movsd 0 * SIZE(B), %xmm3 movhps 2 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm7 movhps 6 * SIZE(B), %xmm7 #ifdef HAVE_SSE2 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 #else movaps %xmm3, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm3, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm3, %xmm2 shufps $0xaa, %xmm2, %xmm2 shufps $0xff, %xmm3, %xmm3 movaps %xmm7, %xmm4 shufps $0x00, %xmm4, %xmm4 movaps %xmm7, %xmm5 shufps $0x55, %xmm5, %xmm5 movaps %xmm7, %xmm6 shufps $0xaa, %xmm6, %xmm6 shufps $0xff, %xmm7, %xmm7 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * 
SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) addl $ 8 * SIZE, B addl $32 * SIZE, BB decl %eax BRANCH jne .L02 ALIGN_2 .L03: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L05 ALIGN_2 .L04: movsd 0 * SIZE(B), %xmm3 #ifdef HAVE_SSE2 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 #else movaps %xmm3, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm3, %xmm1 shufps $0x55, %xmm1, %xmm1 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) addl $2 * SIZE, B addl $8 * SIZE, BB decl %eax jne .L04 ALIGN_4 .L05: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif testl $1, M jle .L30 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $BASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movss 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movss 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L72 ALIGN_2 .L71: mulss %xmm0, %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 20 * SIZE(BB), %xmm0 addss %xmm3, %xmm4 movss 24 * SIZE(BB), %xmm3 addss %xmm0, %xmm5 movss 3 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 28 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 48 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 mulss 36 * SIZE(BB), %xmm1 addss %xmm2, %xmm4 movss 40 * SIZE(BB), %xmm2 addss %xmm1, %xmm5 movss 5 * SIZE(AA), %xmm1 mulss %xmm1, %xmm2 mulss 44 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 64 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 52 * SIZE(BB), %xmm1 addss %xmm3, %xmm4 movss 56 * SIZE(BB), %xmm3 addss %xmm1, %xmm5 movss 7 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 60 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 80 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L71 ALIGN_2 .L72: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L74 .L73: mulss %xmm0, %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L73 ALIGN_4 .L74: addss %xmm6, %xmm4 addss %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm4 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 0 * SIZE(B), %xmm2 subps %xmm4, %xmm2 #else movss 0 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm2 subss %xmm4, %xmm0 subss %xmm5, %xmm2 #endif #if defined(LN) || 
defined(LT) movaps TRMASK, %xmm6 #endif #if defined(LN) || defined(LT) movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 #endif #ifdef RN movss 0 * SIZE(B), %xmm6 mulss %xmm6, %xmm0 movss 1 * SIZE(B), %xmm6 movaps %xmm6, %xmm5 mulss %xmm0, %xmm5 subss %xmm5, %xmm2 movss 3 * SIZE(B), %xmm6 mulss %xmm6, %xmm2 #endif #ifdef RT movss 3 * SIZE(B), %xmm6 mulss %xmm6, %xmm2 movss 2 * SIZE(B), %xmm6 movaps %xmm6, %xmm5 mulss %xmm2, %xmm5 subss %xmm5, %xmm0 movss 0 * SIZE(B), %xmm6 mulss %xmm6, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) movaps %xmm2, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm2, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) #else movss %xmm0, 0 * SIZE(AA) movss %xmm2, 1 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, %xmm0 shufps $0x88, %xmm3, %xmm2 shufps $0xdd, %xmm3, %xmm0 movss %xmm2, 0 * SIZE(CO1) movss %xmm0, 0 * SIZE(CO1, LDC) #else movss %xmm0, 0 * SIZE(CO1) movss %xmm2, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L30: testl $2, M jle .L50 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $1 + BASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L52 ALIGN_2 .L51: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L51 ALIGN_2 .L52: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L54 .L53: mulps %xmm0, %xmm2 addps 
%xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L53 ALIGN_4 .L54: addps %xmm6, %xmm4 addps %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm4 movsd 0 * SIZE(B), %xmm2 movhps 2 * SIZE(B), %xmm2 subps %xmm4, %xmm2 #else #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 2 * SIZE(AA), %xmm2 subps %xmm4, %xmm0 subps %xmm5, %xmm2 #endif #if defined(LN) || defined(LT) movaps TRMASK, %xmm6 #endif #ifdef LN movss 3 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 2 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 #endif #ifdef LT movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 movaps %xmm2, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 1 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 3 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 #endif #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 movss 1 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 movaps %xmm6, %xmm5 mulps %xmm0, %xmm5 subps %xmm5, %xmm2 movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm2 #endif #ifdef RT movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm2 movss 2 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 movaps %xmm6, %xmm5 mulps %xmm2, %xmm5 subps %xmm5, %xmm0 movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(B) movhps %xmm2, 2 * SIZE(B) #ifdef HAVE_SSE2 pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm6 #else movaps %xmm2, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm2, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm2, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm2, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) #else movlps %xmm0, 0 * SIZE(AA) movlps %xmm2, 2 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, %xmm0 shufps $0x88, %xmm3, %xmm2 shufps $0xdd, %xmm3, %xmm0 movlps %xmm2, 0 * SIZE(CO1) movlps %xmm0, 0 * SIZE(CO1, LDC) #else movlps %xmm0, 0 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L50: testl $4, M jle .L70 #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $2 + BASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB 
#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L32 ALIGN_2 .L31: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 8 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 20 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps 12 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 28 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 48 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 mulps 36 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 40 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 20 * SIZE(AA), %xmm1 mulps %xmm1, %xmm2 mulps 44 * SIZE(BB), %xmm1 addps %xmm2, %xmm6 movaps 64 * SIZE(BB), %xmm2 addps %xmm1, %xmm7 movaps 24 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 52 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 80 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L31 ALIGN_2 .L32: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L34 .L33: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L33 ALIGN_4 .L34: addps %xmm6, %xmm4 addps %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 4), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm0 movsd 0 * SIZE(B), %xmm2 movhps 2 * SIZE(B), %xmm2 movsd 4 * SIZE(B), %xmm3 movhps 6 * SIZE(B), %xmm3 subps %xmm4, %xmm2 subps %xmm0, %xmm3 #else movaps 0 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm2 subps %xmm4, %xmm0 subps %xmm5, %xmm2 #endif #if defined(LN) || defined(LT) movaps TRMASK, %xmm6 #endif #ifdef LN movss 15 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 movaps %xmm3, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 14 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 12 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 10 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 8 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 5 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 4 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 #endif #ifdef LT movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 movaps %xmm2, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 1 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 
mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movss 5 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 6 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movss 10 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 11 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movss 15 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 #endif #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 movss 1 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 movaps %xmm6, %xmm5 mulps %xmm0, %xmm5 subps %xmm5, %xmm2 movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm2 #endif #ifdef RT movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm2 movss 2 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 movaps %xmm6, %xmm5 mulps %xmm2, %xmm5 subps %xmm5, %xmm0 movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) movhps %xmm2, 2 * SIZE(B) movlps %xmm3, 4 * SIZE(B) movhps %xmm3, 6 * SIZE(B) #ifdef HAVE_SSE2 pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm6 #else movaps %xmm2, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm2, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm2, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm2, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) #ifdef HAVE_SSE2 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm4 pshufd $0xff, %xmm3, %xmm6 #else movaps %xmm3, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm3, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm3, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm3, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 16 * SIZE(BB) movaps %xmm1, 20 * SIZE(BB) movaps %xmm4, 24 * SIZE(BB) movaps %xmm6, 28 * SIZE(BB) #else movaps %xmm0, 0 * SIZE(AA) movaps %xmm2, 4 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, %xmm0 shufps $0x88, %xmm3, %xmm2 shufps $0xdd, %xmm3, %xmm0 movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) movlps %xmm0, 0 * SIZE(CO1, LDC) movhps %xmm0, 2 * SIZE(CO1, LDC) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC) movhps %xmm2, 2 * SIZE(CO1, LDC) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L70: movl M, %ebx sarl $3, %ebx jle .L99 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $3 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $3 + BASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), 
%xmm1 xorps %xmm7, %xmm7 PREFETCHW 7 * SIZE(CO1) PREFETCHW 7 * SIZE(CO1, LDC) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L12 ALIGN_2 .L11: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 0 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm3 mulps 12 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 8 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 12 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 12 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 24 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 24 * SIZE(AA), %xmm1 mulps %xmm0, %xmm2 mulps 20 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 20 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 20 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 mulps %xmm1, %xmm3 mulps 28 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 28 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 40 * SIZE(AA), %xmm1 mulps %xmm0, %xmm2 mulps 36 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 36 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 36 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 48 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 48 * SIZE(AA), %xmm0 mulps %xmm1, %xmm3 mulps 44 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 44 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 44 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 56 * SIZE(AA), %xmm1 mulps %xmm0, %xmm2 mulps 52 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 48 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 52 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 52 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 64 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 64 * SIZE(AA), %xmm0 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 60 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 72 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 72 * SIZE(AA), %xmm1 addl $64 * SIZE, BB addl $64 * SIZE, AA decl %eax jne .L11 ALIGN_2 .L12: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 .L13: movaps 4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 0 * SIZE(BB), %xmm2 mulps %xmm0, %xmm1 movaps 4 * SIZE(AA), %xmm0 addps %xmm1, %xmm5 movaps 4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm1 movaps 8 * SIZE(AA), %xmm0 addps %xmm1, %xmm7 addl $8 * SIZE, AA addl $8 * SIZE, BB subl $1, %eax jg .L13 ALIGN_4 .L14: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $8, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 8), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm0 movaps %xmm6, %xmm1 unpcklps %xmm7, %xmm6 unpckhps %xmm7, %xmm1 movsd 0 * SIZE(B), %xmm2 movhps 2 * SIZE(B), %xmm2 movsd 4 * SIZE(B), %xmm3 movhps 6 * SIZE(B), %xmm3 movsd 8 * SIZE(B), %xmm5 
movhps 10 * SIZE(B), %xmm5 movsd 12 * SIZE(B), %xmm7 movhps 14 * SIZE(B), %xmm7 subps %xmm4, %xmm2 subps %xmm0, %xmm3 subps %xmm6, %xmm5 subps %xmm1, %xmm7 #else movaps 0 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm1 movaps 8 * SIZE(AA), %xmm2 movaps 12 * SIZE(AA), %xmm3 subps %xmm4, %xmm0 subps %xmm6, %xmm1 subps %xmm5, %xmm2 subps %xmm7, %xmm3 #endif #if defined(LN) || defined(LT) movaps TRMASK, %xmm6 #endif #ifdef LN movss 63 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm7 movaps %xmm7, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 62 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movsd 60 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 58 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 56 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 54 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm7 movaps %xmm7, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 52 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 50 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 48 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 45 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm5 movaps %xmm5, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 44 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 42 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 40 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 36 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm5 movaps %xmm5, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 34 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 32 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 27 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 movaps %xmm3, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 26 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 24 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 18 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 16 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 9 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 8 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 #endif #ifdef LT movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 movaps %xmm2, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 1 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 6 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 9 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 10 * SIZE(AA), %xmm0 
shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 12 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 14 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 18 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 19 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 20 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 22 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 27 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 movaps %xmm3, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 28 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 30 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 36 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm5 movaps %xmm5, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 37 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 38 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 45 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm5 movaps %xmm5, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 46 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 54 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm7 movaps %xmm7, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 55 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 63 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm7 #endif #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 mulps %xmm6, %xmm1 movss 1 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 movaps %xmm6, %xmm5 mulps %xmm0, %xmm5 mulps %xmm1, %xmm6 subps %xmm5, %xmm2 subps %xmm6, %xmm3 movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm2 mulps %xmm6, %xmm3 #endif #ifdef RT movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm2 mulps %xmm6, %xmm3 movss 2 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 movaps %xmm6, %xmm5 mulps %xmm2, %xmm5 mulps %xmm3, %xmm6 subps %xmm5, %xmm0 subps %xmm6, %xmm1 movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 mulps %xmm6, %xmm1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) movhps %xmm2, 2 * SIZE(B) movlps %xmm3, 4 * SIZE(B) movhps %xmm3, 6 * SIZE(B) movlps %xmm5, 8 * SIZE(B) movhps %xmm5, 10 * SIZE(B) movlps %xmm7, 12 * SIZE(B) movhps %xmm7, 14 * SIZE(B) #ifdef HAVE_SSE2 pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm6 #else movaps %xmm2, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm2, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm2, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm2, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) #ifdef HAVE_SSE2 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm4 pshufd $0xff, %xmm3, %xmm6 #else movaps %xmm3, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm3, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm3, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm3, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 16 * SIZE(BB) movaps %xmm1, 20 
* SIZE(BB) movaps %xmm4, 24 * SIZE(BB) movaps %xmm6, 28 * SIZE(BB) #ifdef HAVE_SSE2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xaa, %xmm5, %xmm4 pshufd $0xff, %xmm5, %xmm6 #else movaps %xmm5, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm5, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm5, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm5, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 32 * SIZE(BB) movaps %xmm1, 36 * SIZE(BB) movaps %xmm4, 40 * SIZE(BB) movaps %xmm6, 44 * SIZE(BB) #ifdef HAVE_SSE2 pshufd $0x00, %xmm7, %xmm0 pshufd $0x55, %xmm7, %xmm1 pshufd $0xaa, %xmm7, %xmm4 pshufd $0xff, %xmm7, %xmm6 #else movaps %xmm7, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm7, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm7, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm7, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 48 * SIZE(BB) movaps %xmm1, 52 * SIZE(BB) movaps %xmm4, 56 * SIZE(BB) movaps %xmm6, 60 * SIZE(BB) #else movaps %xmm0, 0 * SIZE(AA) movaps %xmm1, 4 * SIZE(AA) movaps %xmm2, 8 * SIZE(AA) movaps %xmm3, 12 * SIZE(AA) #endif #ifdef LN subl $8 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, %xmm0 shufps $0x88, %xmm3, %xmm2 shufps $0xdd, %xmm3, %xmm0 movaps %xmm5, %xmm4 shufps $0x88, %xmm7, %xmm5 shufps $0xdd, %xmm7, %xmm4 movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) movlps %xmm5, 4 * SIZE(CO1) movhps %xmm5, 6 * SIZE(CO1) movlps %xmm0, 0 * SIZE(CO1, LDC) movhps %xmm0, 2 * SIZE(CO1, LDC) movlps %xmm4, 4 * SIZE(CO1, LDC) movhps %xmm4, 6 * SIZE(CO1, LDC) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 4 * SIZE(CO1) movhps %xmm1, 6 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC) movhps %xmm2, 2 * SIZE(CO1, LDC) movlps %xmm3, 4 * SIZE(CO1, LDC) movhps %xmm3, 6 * SIZE(CO1, LDC) #endif #ifndef LN addl $8 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 8), AA #ifdef LT addl $16 * SIZE, B #endif #endif #ifdef LN subl $8, KK movl BORIG, B #endif #ifdef LT addl $8, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $3 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L10 ALIGN_2 .L99: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 2), B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L01 ALIGN_2 .L100: testl $1, N jle .L999 #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax jle .L103 ALIGN_4 .L102: movsd 0 * SIZE(B), %xmm3 movhps 2 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm7 movhps 6 * SIZE(B), %xmm7 #ifdef HAVE_SSE2 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 #else movaps %xmm3, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm3, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm3, %xmm2 shufps $0xaa, %xmm2, %xmm2 shufps $0xff, %xmm3, %xmm3 movaps %xmm7, %xmm4 shufps $0x00, %xmm4, %xmm4 movaps %xmm7, %xmm5 shufps $0x55, %xmm5, %xmm5 movaps %xmm7, %xmm6 shufps 
$0xaa, %xmm6, %xmm6 shufps $0xff, %xmm7, %xmm7 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) addl $ 8 * SIZE, B addl $32 * SIZE, BB decl %eax BRANCH jne .L102 ALIGN_2 .L103: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH jle .L105 ALIGN_2 .L104: movss 0 * SIZE(B), %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm0, 0 * SIZE(BB) addl $1 * SIZE, B addl $4 * SIZE, BB decl %eax jne .L104 ALIGN_4 .L105: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif testl $1, M jle .L130 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movss 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movss 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L172 ALIGN_2 .L171: mulss %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 mulss 4 * SIZE(BB), %xmm0 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 2 * SIZE(AA), %xmm0 mulss 8 * SIZE(BB), %xmm0 addss %xmm0, %xmm6 movss 3 * SIZE(AA), %xmm0 mulss 12 * SIZE(BB), %xmm0 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm3 movss 5 * SIZE(AA), %xmm1 addss %xmm3, %xmm4 mulss 20 * SIZE(BB), %xmm1 movss 48 * SIZE(BB), %xmm3 addss %xmm1, %xmm5 movss 6 * SIZE(AA), %xmm1 mulss 24 * SIZE(BB), %xmm1 addss %xmm1, %xmm6 movss 7 * SIZE(AA), %xmm1 mulss 28 * SIZE(BB), %xmm1 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L171 ALIGN_2 .L172: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L174 .L173: movss 0 * SIZE(AA), %xmm0 movss 0 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm4 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L173 ALIGN_4 .L174: addss %xmm5, %xmm4 addss %xmm7, %xmm6 addss %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ BASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movss 0 * SIZE(B), %xmm1 subss %xmm4, %xmm1 #else movss 0 * SIZE(AA), %xmm0 subss %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) mulss 0 * SIZE(AA), %xmm1 #endif #if defined(RN) || defined(RT) mulss 0 * SIZE(B), %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) shufps $0x00, %xmm1, %xmm1 movaps %xmm1, 0 * SIZE(BB) #else movss %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(CO1) #else movss %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA, %eax, SIZE), AA #ifdef LT addl $1 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L130: testl $2, M jle .L150 #ifdef LN 
movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $1 + BASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L152 ALIGN_2 .L151: mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 16 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 movsd 10 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L151 ALIGN_2 .L152: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L154 .L153: mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L153 ALIGN_4 .L154: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm5 shufps $1, %xmm5, %xmm5 movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 subss %xmm4, %xmm0 subss %xmm5, %xmm1 #else #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 subps %xmm4, %xmm0 #endif #ifdef LN movaps 0 * SIZE(AA), %xmm4 movaps %xmm4, %xmm6 shufps $0xff, %xmm6, %xmm6 mulss %xmm6, %xmm1 movaps %xmm4, %xmm6 shufps $0xaa, %xmm6, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm0 mulss %xmm4, %xmm0 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm4 mulss %xmm4, %xmm0 movaps %xmm4, %xmm6 shufps $0x55, %xmm6, %xmm6 mulss %xmm0, %xmm6 subss %xmm6, %xmm1 movaps %xmm4, %xmm6 shufps $0xff, %xmm6, %xmm6 mulss %xmm6, %xmm1 #endif #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 #endif #ifdef RT movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm0, 0 * SIZE(B) movss %xmm1, 1 * SIZE(B) shufps $0x00, %xmm0, %xmm0 shufps $0x00, %xmm1, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) #else movlps %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm0, 0 * SIZE(CO1) movss %xmm1, 1 * SIZE(CO1) #else movlps %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B 
#endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L150: testl $4, M jle .L170 #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $2 + BASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movsd 0 * SIZE(AA), %xmm0 movhps 2 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movsd 16 * SIZE(AA), %xmm1 movhps 18 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L132 ALIGN_2 .L131: mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 mulps 4 * SIZE(BB), %xmm0 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 8 * SIZE(AA), %xmm0 mulps 8 * SIZE(BB), %xmm0 addps %xmm0, %xmm6 movaps 12 * SIZE(AA), %xmm0 mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 mulps %xmm1, %xmm3 movaps 20 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 mulps 20 * SIZE(BB), %xmm1 movaps 48 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 24 * SIZE(AA), %xmm1 mulps 24 * SIZE(BB), %xmm1 addps %xmm1, %xmm6 movaps 28 * SIZE(AA), %xmm1 mulps 28 * SIZE(BB), %xmm1 addps %xmm1, %xmm7 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L131 ALIGN_2 .L132: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L134 .L133: movaps 0 * SIZE(BB), %xmm2 movaps 0 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L133 ALIGN_4 .L134: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 4), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm2 movhps 2 * SIZE(B), %xmm2 subps %xmm4, %xmm2 xorps %xmm5, %xmm5 movaps %xmm2, %xmm3 unpcklps %xmm5, %xmm2 unpckhps %xmm5, %xmm3 #else movaps 0 * SIZE(AA), %xmm0 subps %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movaps TRMASK, %xmm6 #endif #ifdef LN movss 15 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 movaps %xmm3, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 14 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 12 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 10 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 8 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 5 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 4 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 #endif #ifdef LT movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 movaps %xmm2, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 1 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps 
%xmm1, %xmm0 subps %xmm0, %xmm3 movss 5 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 6 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movss 10 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 11 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movss 15 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 #endif #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 #endif #ifdef RT movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 #endif #if defined(LN) || defined(LT) shufps $0x88, %xmm3, %xmm2 movlps %xmm2, 0 * SIZE(B) movhps %xmm2, 2 * SIZE(B) #ifdef HAVE_SSE2 pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm6 #else movaps %xmm2, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm2, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm2, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm2, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) #else movaps %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L170: movl M, %ebx sarl $3, %ebx # i = (m >> 2) jle .L179 ALIGN_4 .L110: #ifdef LN movl K, %eax sall $3 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $3 + BASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 PREFETCHW 7 * SIZE(CO1) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L112 ALIGN_2 .L111: mulps %xmm2, %xmm0 mulps 4 * SIZE(AA), %xmm2 addps %xmm0, %xmm4 movaps 8 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 movaps 4 * SIZE(BB), %xmm2 mulps %xmm2, %xmm0 mulps 12 * SIZE(AA), %xmm2 addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 8 * SIZE(BB), %xmm2 mulps %xmm2, %xmm1 mulps 20 * SIZE(AA), %xmm2 addps %xmm1, %xmm4 movaps 24 * SIZE(AA), %xmm1 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm2, %xmm1 mulps 28 * SIZE(AA), %xmm2 addps %xmm1, %xmm5 movaps 48 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm3, %xmm0 mulps 36 * SIZE(AA), %xmm3 addps %xmm0, %xmm4 movaps 40 * SIZE(AA), %xmm0 addps %xmm3, %xmm6 movaps 20 * SIZE(BB), %xmm3 mulps %xmm3, %xmm0 mulps 44 * SIZE(AA), %xmm3 addps %xmm0, %xmm5 movaps 64 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 24 * SIZE(BB), %xmm3 mulps %xmm3, %xmm1 mulps 52 * SIZE(AA), %xmm3 addps %xmm1, %xmm4 movaps 56 * SIZE(AA), %xmm1 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), 
%xmm3 mulps %xmm3, %xmm1 mulps 60 * SIZE(AA), %xmm3 addps %xmm1, %xmm5 movaps 80 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 addl $64 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L111 ALIGN_2 .L112: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L114 .L113: movaps 0 * SIZE(BB), %xmm2 movaps 0 * SIZE(AA), %xmm0 mulps %xmm2, %xmm0 addps %xmm0, %xmm4 mulps 4 * SIZE(AA), %xmm2 addps %xmm2, %xmm6 addl $8 * SIZE, AA addl $4 * SIZE, BB subl $1, %eax jg .L113 ALIGN_4 .L114: addps %xmm5, %xmm4 addps %xmm7, %xmm6 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $8, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 8), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm2 movhps 2 * SIZE(B), %xmm2 movsd 4 * SIZE(B), %xmm5 movhps 6 * SIZE(B), %xmm5 subps %xmm4, %xmm2 subps %xmm6, %xmm5 xorps %xmm0, %xmm0 movaps %xmm2, %xmm3 unpcklps %xmm0, %xmm2 unpckhps %xmm0, %xmm3 movaps %xmm5, %xmm7 unpcklps %xmm0, %xmm5 unpckhps %xmm0, %xmm7 #else movaps 0 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm1 subps %xmm4, %xmm0 subps %xmm6, %xmm1 #endif #if defined(LN) || defined(LT) movaps TRMASK, %xmm6 #endif #ifdef LN movss 63 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm7 movaps %xmm7, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 62 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movsd 60 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 58 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 56 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 54 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm7 movaps %xmm7, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 52 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 50 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 48 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 45 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm5 movaps %xmm5, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 44 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 42 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 40 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 36 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm5 movaps %xmm5, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 34 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 32 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 27 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 movaps %xmm3, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 26 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 24 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 18 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 16 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 9 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps 
$0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 8 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 #endif #ifdef LT movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 movaps %xmm2, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 1 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 6 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 9 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 10 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 12 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 14 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 18 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 19 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 20 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 22 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 27 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 movaps %xmm3, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 28 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 30 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 36 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm5 movaps %xmm5, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 37 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 38 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 45 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm5 movaps %xmm5, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 46 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 54 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm7 movaps %xmm7, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 55 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 63 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm7 #endif #if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 mulps %xmm6, %xmm1 #endif #if defined(LN) || defined(LT) shufps $0x88, %xmm3, %xmm2 shufps $0x88, %xmm7, %xmm5 movlps %xmm2, 0 * SIZE(B) movhps %xmm2, 2 * SIZE(B) movlps %xmm5, 4 * SIZE(B) movhps %xmm5, 6 * SIZE(B) #ifdef HAVE_SSE2 pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm6 #else movaps %xmm2, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm2, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm2, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm2, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) #ifdef HAVE_SSE2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 
pshufd $0xaa, %xmm5, %xmm4 pshufd $0xff, %xmm5, %xmm6 #else movaps %xmm5, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm5, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm5, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm5, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 16 * SIZE(BB) movaps %xmm1, 20 * SIZE(BB) movaps %xmm4, 24 * SIZE(BB) movaps %xmm6, 28 * SIZE(BB) #else movaps %xmm0, 0 * SIZE(AA) movaps %xmm1, 4 * SIZE(AA) #endif #ifdef LN subl $8 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) movlps %xmm5, 4 * SIZE(CO1) movhps %xmm5, 6 * SIZE(CO1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 4 * SIZE(CO1) movhps %xmm1, 6 * SIZE(CO1) #endif #ifndef LN addl $8 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 8), AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $8, KK movl BORIG, B #endif #ifdef LT addl $8, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $3 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L110 ALIGN_2 .L179: #ifdef LN movl K, %eax leal (B, %eax, SIZE), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (B, %eax, SIZE), B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: movl OLD_STACK, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_LT_1x4.S000066400000000000000000000476151313527062700206320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 32 #define J 0 + STACK(%esp) #define I 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #define AORIG 16 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #ifdef DOUBLE #define STACK_A 24 + STACK + ARGS(%esp) #define STACK_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define STACK_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #else #define STACK_A 20 + STACK + ARGS(%esp) #define STACK_B 24 + STACK + ARGS(%esp) #define C 28 + STACK + ARGS(%esp) #define STACK_LDC 32 + STACK + ARGS(%esp) #define OFFSET 36 + STACK + ARGS(%esp) #endif #define A %edx #define B %ecx #define B_ORIG %ebx #define LDC %ebp #define PREFETCHSIZE (5 + 8 * 10) PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_B, B_ORIG movl STACK_LDC, LDC leal (, LDC, SIZE), LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, STACK_A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B_ORIG movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN movl OFFSET, %eax negl %eax movl %eax, KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif subl $-16 * SIZE, B_ORIG subl $-16 * SIZE, STACK_A movl M, %eax testl %eax, %eax jle .L999 movl N, %eax testl %eax, %eax jle .L999 movl K, %eax testl %eax, %eax jle .L999 movl N, %eax sarl $2, %eax movl %eax, J je .L20 ALIGN_3 .L11: #if defined(LT) || defined(RN) movl STACK_A, A #else movl STACK_A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, B_ORIG #endif leal (, LDC, 4), %eax #ifdef RT subl %eax, C #endif movl C, %edi #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl B_ORIG, B #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $4, %eax jle .L13 ALIGN_4 .L12: movl -16 * SIZE(B), %esi movl -8 * SIZE(B), %esi movl 0 * SIZE(B), %esi movl 8 * SIZE(B), %esi movl 16 * SIZE(B), %esi movl 24 * SIZE(B), %esi movl 32 * SIZE(B), %esi movl 40 * SIZE(B), %esi subl $-64 * SIZE, B decl %eax jne .L12 ALIGN_3 .L13: movl M, %esi movl %esi, I ALIGN_3 .L14: #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax leal (, %eax, SIZE), %eax movl AORIG, A leal (A , %eax, 1), A leal (B_ORIG, %eax, 4), B #else movl B_ORIG, B #endif leal (%edi, LDC, 2), %eax fldz fldz fldz fldz FLD -8 * SIZE(A) FLD -16 * SIZE(A) FLD -16 * SIZE(B) movl $32 * SIZE, %esi prefetchw 1 * SIZE(%edi) prefetchw 1 * SIZE(%edi, LDC) prefetchw 1 * SIZE(%eax) prefetchw 1 * SIZE(%eax, LDC) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L16 ALIGN_3 .L15: fmul %st(1), %st faddp %st, %st(3) PADDING FLD -15 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD -14 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL -13 * SIZE(B) faddp %st, %st(5) FLD -15 * SIZE(A) FLD -12 * SIZE(B) fmul %st(1), %st faddp %st, %st(3) PADDING FLD -11 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD -10 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL -9 * SIZE(B) faddp %st, %st(5) FLD -14 * SIZE(A) FLD -8 * SIZE(B) fmul 
%st(1), %st faddp %st, %st(3) PADDING FLD -7 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD -6 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL -5 * SIZE(B) faddp %st, %st(5) FLD -13 * SIZE(A) FLD -4 * SIZE(B) fmul %st(1), %st faddp %st, %st(3) PADDING FLD -3 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD -2 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL -1 * SIZE(B) faddp %st, %st(5) FLD -12 * SIZE(A) FLD 0 * SIZE(B) fmul %st(1), %st faddp %st, %st(3) PADDING FLD 1 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD 2 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL 3 * SIZE(B) faddp %st, %st(5) FLD -11 * SIZE(A) FLD 4 * SIZE(B) fmul %st(1), %st faddp %st, %st(3) PADDING FLD 5 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD 6 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL 7 * SIZE(B) faddp %st, %st(5) FLD -10 * SIZE(A) FLD 8 * SIZE(B) fmul %st(1), %st faddp %st, %st(3) PADDING FLD 9 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD 10 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL 11 * SIZE(B) faddp %st, %st(5) FLD -9 * SIZE(A) FLD 12 * SIZE(B) fmul %st(1), %st faddp %st, %st(3) PADDING FLD 13 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD 14 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL 15 * SIZE(B) faddp %st, %st(5) FLD 0 * SIZE(A) PADDING prefetch PREFETCHSIZE * SIZE(A) addl $8 * SIZE, A fxch %st(1) addl $32 * SIZE, B FLD -16 * SIZE(B) decl %eax jne .L15 ALIGN_4 .L16: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $7, %eax je .L19 ALIGN_4 .L17: fmul %st(1), %st faddp %st, %st(3) FLD -15 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) FLD -14 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) FMUL -13 * SIZE(B) faddp %st, %st(5) FLD -15 * SIZE(A) FLD -12 * SIZE(B) addl $1 * SIZE,A addl $4 * SIZE,B decl %eax jne .L17 ALIGN_4 .L19: ffreep %st(0) ffreep %st(0) ffreep %st(0) #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $4, %eax #endif leal (, %eax, SIZE), %eax movl AORIG, A leal (A, %eax, 1), A leal (B_ORIG, %eax, 4), B #endif #if defined(LN) || defined(LT) FLD 0 * SIZE - 16 * SIZE(B) fsubp %st, %st(1) FLD 1 * SIZE - 16 * SIZE(B) fsubp %st, %st(2) FLD 2 * SIZE - 16 * SIZE(B) fsubp %st, %st(3) FLD 3 * SIZE - 16 * SIZE(B) fsubp %st, %st(4) #else FLD 0 * SIZE - 16 * SIZE(A) fsubp %st, %st(1) FLD 1 * SIZE - 16 * SIZE(A) fsubp %st, %st(2) FLD 2 * SIZE - 16 * SIZE(A) fsubp %st, %st(3) FLD 3 * SIZE - 16 * SIZE(A) fsubp %st, %st(4) #endif #ifdef LN FLD 0 * SIZE - 16 * SIZE(A) fmul %st, %st(1) fmul %st, %st(2) fmul %st, %st(3) fmulp %st, %st(4) #endif #ifdef LT FLD 0 * SIZE - 16 * SIZE(A) fmul %st, %st(1) fmul %st, %st(2) fmul %st, %st(3) fmulp %st, %st(4) #endif #ifdef RN FMUL 0 * SIZE - 16 * SIZE(B) FLD 1 * SIZE - 16 * SIZE(B) fmul %st(1), %st fsubrp %st, %st(2) FLD 2 * SIZE - 16 * SIZE(B) fmul %st(1), %st fsubrp %st, %st(3) FLD 3 * SIZE - 16 * SIZE(B) fmul %st(1), %st fsubrp %st, %st(4) FLD 5 * SIZE - 16 * SIZE(B) fmulp %st, %st(2) FLD 6 * SIZE - 16 * SIZE(B) fmul %st(2), %st fsubrp %st, %st(3) FLD 7 * SIZE - 16 * SIZE(B) fmul %st(2), %st fsubrp %st, %st(4) FLD 10 * SIZE - 16 * SIZE(B) fmulp %st, %st(3) FLD 11 * SIZE - 16 * SIZE(B) fmul %st(3), %st fsubrp %st, %st(4) FLD 15 * SIZE - 16 * SIZE(B) fmulp %st, %st(4) #endif #ifdef RT FLD 15 * SIZE - 16 * SIZE(B) fmulp %st, %st(4) FLD 14 * SIZE - 16 * SIZE(B) fmul %st(4), %st fsubrp %st, %st(3) FLD 13 * SIZE - 16 * SIZE(B) fmul %st(4), %st fsubrp %st, %st(2) FLD 12 * SIZE 
- 16 * SIZE(B) fmul %st(4), %st fsubrp %st, %st(1) FLD 10 * SIZE - 16 * SIZE(B) fmulp %st, %st(3) FLD 9 * SIZE - 16 * SIZE(B) fmul %st(3), %st fsubrp %st, %st(2) FLD 8 * SIZE - 16 * SIZE(B) fmul %st(3), %st fsubrp %st, %st(1) FLD 5 * SIZE - 16 * SIZE(B) fmulp %st, %st(2) FLD 4 * SIZE - 16 * SIZE(B) fmul %st(2), %st fsubrp %st, %st(1) FLD 0 * SIZE - 16 * SIZE(B) fmulp %st, %st(1) #endif #ifdef LN subl $1 * SIZE, %edi #endif #if defined(LN) || defined(LT) FSTU 0 * SIZE - 16 * SIZE(B) fxch %st(1) FSTU 1 * SIZE - 16 * SIZE(B) fxch %st(2) FSTU 2 * SIZE - 16 * SIZE(B) fxch %st(3) FSTU 3 * SIZE - 16 * SIZE(B) #else FSTU 0 * SIZE - 16 * SIZE(A) fxch %st(1) FSTU 1 * SIZE - 16 * SIZE(A) fxch %st(2) FSTU 2 * SIZE - 16 * SIZE(A) fxch %st(3) FSTU 3 * SIZE - 16 * SIZE(A) #endif leal (%edi, LDC, 2), %eax FST 0 * SIZE(%eax, LDC) FST 0 * SIZE(%edi) FST 0 * SIZE(%edi, LDC) FST 0 * SIZE(%eax) #ifndef LN addl $1 * SIZE, %edi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (A, %eax, 1), A leal (B, %eax, 4), B #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif decl I jne .L14 #ifdef LN movl K, %eax leal ( , %eax, SIZE), %eax leal (B_ORIG, %eax, 4), B_ORIG #endif #if defined(LT) || defined(RN) movl B, B_ORIG #endif #ifdef RN addl $4, KK #endif #ifdef RT subl $4, KK #endif decl J jne .L11 ALIGN_4 .L20: movl N, %eax andl $2, %eax je .L30 #if defined(LT) || defined(RN) movl STACK_A, A #else movl STACK_A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B_ORIG #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, %edi #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl B_ORIG, B #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $4, %eax jle .L23 ALIGN_4 .L22: movl -16 * SIZE(B), %esi movl -8 * SIZE(B), %esi movl 0 * SIZE(B), %esi movl 8 * SIZE(B), %esi subl $-32 * SIZE, B decl %eax jne .L22 ALIGN_3 .L23: movl M, %esi movl %esi, I ALIGN_3 .L24: #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax leal (, %eax, SIZE), %eax movl AORIG, A leal (A , %eax, 1), A leal (B_ORIG, %eax, 2), B #else movl B_ORIG, B #endif fldz fldz fldz fldz FLD -16 * SIZE(A) FLD -16 * SIZE(B) prefetchw 1 * SIZE(%edi) prefetchw 1 * SIZE(%edi, LDC) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L26 ALIGN_3 .L25: fmul %st(1), %st faddp %st, %st(2) FMUL -15 * SIZE(B) faddp %st, %st(2) FLD -15 * SIZE(A) FLD -14 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) FMUL -13 * SIZE(B) faddp %st, %st(4) FLD -14 * SIZE(A) FLD -12 * SIZE(B) fmul %st(1), %st faddp %st, %st(2) FMUL -11 * SIZE(B) faddp %st, %st(2) FLD -13 * SIZE(A) FLD -10 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) FMUL -9 * SIZE(B) faddp %st, %st(4) FLD -12 * SIZE(A) FLD -8 * SIZE(B) fmul %st(1), %st faddp %st, %st(2) FMUL -7 * SIZE(B) faddp %st, %st(2) FLD -11 * SIZE(A) FLD -6 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) FMUL -5 * SIZE(B) faddp %st, %st(4) FLD -10 * SIZE(A) FLD -4 * SIZE(B) fmul %st(1), %st faddp %st, %st(2) FMUL -3 * SIZE(B) faddp %st, %st(2) FLD -9 * SIZE(A) FLD -2 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) FMUL -1 * SIZE(B) faddp %st, %st(4) FLD -8 * SIZE(A) FLD 0 * SIZE(B) addl $ 8 * SIZE, A subl $-16 * SIZE, B decl %eax jne .L25 ALIGN_4 .L26: 
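/* K & 7 tail of the 1x2 tile: .L27 does one multiply-accumulate per
   remaining k, then .L29 performs the triangular solve and the store. */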
#if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $7, %eax je .L29 ALIGN_4 .L27: fmul %st(1), %st faddp %st, %st(2) FMUL -15 * SIZE(B) faddp %st, %st(2) FLD -15 * SIZE(A) FLD -14 * SIZE(B) addl $1 * SIZE,A addl $2 * SIZE,B decl %eax jne .L27 ALIGN_4 .L29: ffreep %st(0) ffreep %st(0) faddp %st, %st(2) faddp %st, %st(2) #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif leal (, %eax, SIZE), %eax movl AORIG, A leal (A, %eax, 1), A leal (B_ORIG, %eax, 2), B #endif #if defined(LN) || defined(LT) FLD 0 * SIZE - 16 * SIZE(B) fsubp %st, %st(1) FLD 1 * SIZE - 16 * SIZE(B) fsubp %st, %st(2) #else FLD 0 * SIZE - 16 * SIZE(A) fsubp %st, %st(1) FLD 1 * SIZE - 16 * SIZE(A) fsubp %st, %st(2) #endif #ifdef LN FLD 0 * SIZE - 16 * SIZE(A) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef LT FLD 0 * SIZE - 16 * SIZE(A) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef RN FMUL 0 * SIZE - 16 * SIZE(B) FLD 1 * SIZE - 16 * SIZE(B) fmul %st(1), %st fsubrp %st, %st(2) FLD 3 * SIZE - 16 * SIZE(B) fmulp %st, %st(2) #endif #ifdef RT FLD 3 * SIZE - 16 * SIZE(B) fmulp %st, %st(2) FLD 2 * SIZE - 16 * SIZE(B) fmul %st(2), %st fsubrp %st, %st(1) FLD 0 * SIZE - 16 * SIZE(B) fmulp %st, %st(1) #endif #ifdef LN subl $1 * SIZE, %edi #endif #if defined(LN) || defined(LT) FSTU 0 * SIZE - 16 * SIZE(B) fxch %st(1) FSTU 1 * SIZE - 16 * SIZE(B) #else FSTU 0 * SIZE - 16 * SIZE(A) fxch %st(1) FSTU 1 * SIZE - 16 * SIZE(A) #endif FST 0 * SIZE(%edi, LDC) FST 0 * SIZE(%edi) #ifndef LN addl $1 * SIZE, %edi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (A, %eax, 1), A leal (B, %eax, 2), B #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif decl I jne .L24 #ifdef LN movl K, %eax leal ( , %eax, SIZE), %eax leal (B_ORIG, %eax, 2), B_ORIG #endif #if defined(LT) || defined(RN) movl B, B_ORIG #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif ALIGN_4 .L30: movl N, %eax andl $1, %eax je .L999 ALIGN_3 .L31: #if defined(LT) || defined(RN) movl STACK_A, A #else movl STACK_A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, B_ORIG #endif #ifdef RT subl LDC, C #endif movl C, %edi #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl B_ORIG, B #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $5, %eax jle .L33 ALIGN_4 .L32: movl -16 * SIZE(B), %esi movl -8 * SIZE(B), %esi movl 0 * SIZE(B), %esi movl 8 * SIZE(B), %esi subl $-32 * SIZE, B decl %eax jne .L32 ALIGN_3 .L33: movl M, %esi movl %esi, I ALIGN_3 .L34: #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax leal (, %eax, SIZE), %eax movl AORIG, A leal (A , %eax, 1), A leal (B_ORIG, %eax, 1), B #else movl B_ORIG, B #endif fldz fldz fldz fldz prefetchw 1 * SIZE(%edi) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L36 ALIGN_3 .L35: FLD -16 * SIZE(A) FMUL -16 * SIZE(B) faddp %st, %st(1) FLD -15 * SIZE(A) FMUL -15 * SIZE(B) faddp %st, %st(2) FLD -14 * SIZE(A) FMUL -14 * SIZE(B) faddp %st, %st(3) FLD -13 * SIZE(A) FMUL -13 * SIZE(B) faddp %st, %st(4) FLD -12 * SIZE(A) FMUL -12 * SIZE(B) faddp %st, %st(1) FLD -11 * SIZE(A) FMUL -11 * SIZE(B) faddp %st, %st(2) FLD -10 * SIZE(A) FMUL -10 * SIZE(B) 
faddp %st, %st(3) FLD -9 * SIZE(A) FMUL -9 * SIZE(B) faddp %st, %st(4) addl $8 * SIZE, A addl $8 * SIZE, B decl %eax jne .L35 ALIGN_4 .L36: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $7, %eax je .L39 ALIGN_4 .L37: FLD -16 * SIZE(A) FMUL -16 * SIZE(B) faddp %st, %st(1) addl $1 * SIZE,A addl $1 * SIZE,B decl %eax jne .L37 ALIGN_4 .L39: faddp %st, %st(2) faddp %st, %st(2) faddp %st, %st(1) #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, A leal (A, %eax, SIZE), A leal (B_ORIG, %eax, SIZE), B #endif #if defined(LN) || defined(LT) FLD 0 * SIZE - 16 * SIZE(B) fsubp %st, %st(1) #else FLD 0 * SIZE - 16 * SIZE(A) fsubp %st, %st(1) #endif #if defined(LN) || defined(LT) FLD 0 * SIZE - 16 * SIZE(A) fmulp %st, %st(1) #endif #if defined(RN) || defined(RT) FMUL 0 * SIZE - 16 * SIZE(B) #endif #ifdef LN subl $1 * SIZE, %edi #endif #if defined(LN) || defined(LT) FSTU 0 * SIZE - 16 * SIZE(B) #else FSTU 0 * SIZE - 16 * SIZE(A) #endif FST 0 * SIZE(%edi) #ifndef LN addl $1 * SIZE, %edi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (A, %eax, SIZE), A leal (B, %eax, SIZE), B #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif decl I jne .L34 #ifdef LN movl K, %eax leal ( , %eax, SIZE), %eax leal (B_ORIG, %eax, 1), B_ORIG #endif #if defined(LT) || defined(RN) movl B, B_ORIG #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_LT_2x2.S000066400000000000000000000444341313527062700206250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #ifdef DOUBLE #define A 24 + STACK + ARGS(%esp) #define B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #else #define A 20 + STACK + ARGS(%esp) #define B 24 + STACK + ARGS(%esp) #define C 28 + STACK + ARGS(%esp) #define LDC 32 + STACK + ARGS(%esp) #define OFFSET 36 + STACK + ARGS(%esp) #endif #define PREFETCH_OFFSET 48 #if defined(PENTIUM3) || defined(PENTIUMM) #define REP rep #else #define REP rep #endif #define AA %edx #define BB %ecx PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl LDC, %ebp # ldc # MEMORY movl B, %ebx leal (, %ebp, SIZE), %ebp #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, %ebx movl N, %eax imull %ebp, %eax addl %eax, C #endif #ifdef RN movl OFFSET, %eax negl %eax movl %eax, KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax # j = (n >> 1) # MEMORY sarl $1, %eax movl %eax, J # j = (n >> 1) # MEMORY je .L8 ALIGN_4 .L34: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, %ebx #endif lea (, %ebp, 2), %eax #ifdef RT subl %eax, C #endif movl C, %edi #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %esi sarl $1, %esi je .L12 ALIGN_4 .MainHead: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 2), AA leal (%ebx, %eax, 2), BB #else movl %ebx, BB #endif fldz fldz fldz fldz FLD 4 * SIZE(BB) # b5 FLD 4 * SIZE(AA) # a5 FLD 0 * SIZE(BB) # b1 FLD 0 * SIZE(AA) # a1 #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(%edi) prefetchw 2 * SIZE(%edi, %ebp, 1) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(%edi) prefetchnta 2 * SIZE(%edi, %ebp, 1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L16 ALIGN_4 .MainLoop: #if defined(HAVE_3DNOW) prefetch (PREFETCH_OFFSET) * SIZE(BB) nop #elif defined(HAVE_SSE) prefetchnta (PREFETCH_OFFSET) * SIZE(BB) #if (L2_SIZE == 524288) prefetcht0 (PREFETCH_OFFSET) * SIZE(AA) #endif #endif fmul %st, %st(1) FMUL 1 * SIZE(BB) fxch %st(1) faddp %st, %st(4) FLD 0 * SIZE(BB) fxch %st(1) faddp %st, %st(5) FLD 1 * SIZE(AA) fmul %st, %st(1) FMUL 1 * SIZE(BB) fxch %st(1) faddp %st, %st(6) FLD 2 * SIZE(BB) fxch %st(1) faddp %st, %st(7) FLD 2 * SIZE(AA) fmul %st, %st(1) FMUL 3 * SIZE(BB) fxch %st(1) faddp %st, %st(4) FLD 2 * SIZE(BB) fxch %st(1) faddp %st, %st(5) FLD 3 * SIZE(AA) fmul %st, %st(1) FMUL 3 * SIZE(BB) fxch %st(1) faddp %st, %st(6) FLD 8 * SIZE(BB) fxch %st(1) faddp %st, %st(7) FLD 8 * SIZE(AA) fxch %st(2) #if !defined(HAVE_3DNOW) && defined(HAVE_SSE) && defined(DOUBLE) prefetchnta (PREFETCH_OFFSET + 4) * SIZE(BB) #if (L2_SIZE == 524288) prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(AA) #endif #endif fmul %st, 
%st(3) FMUL 5 * SIZE(BB) fxch %st(3) faddp %st, %st(4) FLD 4 * SIZE(BB) fxch %st(3) faddp %st, %st(5) FLD 5 * SIZE(AA) fmul %st, %st(3) FMUL 5 * SIZE(BB) fxch %st(3) faddp %st, %st(6) FLD 6 * SIZE(BB) fxch %st(3) faddp %st, %st(7) FLD 6 * SIZE(AA) fmul %st, %st(3) FMUL 7 * SIZE(BB) fxch %st(3) faddp %st, %st(4) FLD 6 * SIZE(BB) fxch %st(3) faddp %st, %st(5) FLD 7 * SIZE(AA) fmul %st, %st(3) FMUL 7 * SIZE(BB) fxch %st(3) faddp %st, %st(6) FLD 12 * SIZE(BB) fxch %st(3) faddp %st, %st(7) FLD 12 * SIZE(AA) fxch %st(2) subl $-8 * SIZE, BB subl $-8 * SIZE, AA decl %eax # l -- jne .MainLoop ALIGN_4 .L16: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $3, %eax je .L21 ALIGN_4 .SubLoop: fmul %st, %st(1) FMUL 1 * SIZE(BB) fxch %st(1) faddp %st, %st(4) FLD 0 * SIZE(BB) fxch %st(1) faddp %st, %st(5) FLD 1 * SIZE(AA) fmul %st, %st(1) FMUL 1 * SIZE(BB) fxch %st(1) faddp %st, %st(6) FLD 2 * SIZE(BB) fxch %st(1) faddp %st, %st(7) FLD 2 * SIZE(AA) addl $2 * SIZE,BB addl $2 * SIZE,AA decl %eax jne .SubLoop ALIGN_4 .L21: ffreep %st(0) ffreep %st(0) ffreep %st(0) ffreep %st(0) #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 2), AA leal (%ebx, %eax, 2), BB #endif #if defined(LN) || defined(LT) FLD 0 * SIZE(BB) fsubp %st, %st(1) FLD 1 * SIZE(BB) fsubp %st, %st(2) FLD 2 * SIZE(BB) fsubp %st, %st(3) FLD 3 * SIZE(BB) fsubp %st, %st(4) #else FLD 0 * SIZE(AA) fsubp %st, %st(1) FLD 1 * SIZE(AA) fsubp %st, %st(3) FLD 2 * SIZE(AA) fsubp %st, %st(2) FLD 3 * SIZE(AA) fsubp %st, %st(4) #endif #ifdef LN FLD 3 * SIZE(AA) fmul %st, %st(3) fmulp %st, %st(4) FLD 2 * SIZE(AA) fmul %st(3), %st FLD 2 * SIZE(AA) fmul %st(5), %st fsubrp %st, %st(3) fsubrp %st, %st(1) FLD 0 * SIZE(AA) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef LT FLD 0 * SIZE(AA) fmul %st, %st(1) fmulp %st, %st(2) FLD 1 * SIZE(AA) fmul %st(1), %st FLD 1 * SIZE(AA) fmul %st(3), %st fsubrp %st, %st(5) fsubrp %st, %st(3) FLD 3 * SIZE(AA) fmul %st, %st(3) fmulp %st, %st(4) #endif #ifdef RN FLD 0 * SIZE(BB) fmul %st, %st(1) fmulp %st, %st(3) FLD 1 * SIZE(BB) fmul %st(1), %st FLD 1 * SIZE(BB) fmul %st(4), %st fsubrp %st, %st(5) fsubrp %st, %st(2) FLD 3 * SIZE(BB) fmul %st, %st(2) fmulp %st, %st(4) #endif #ifdef RT FLD 3 * SIZE(BB) fmul %st, %st(2) fmulp %st, %st(4) FLD 2 * SIZE(BB) fmul %st(2), %st FLD 2 * SIZE(BB) fmul %st(5), %st fsubrp %st, %st(4) fsubrp %st, %st(1) FLD 0 * SIZE(BB) fmul %st, %st(1) fmulp %st, %st(3) #endif #ifdef LN subl $2 * SIZE, %edi #endif #if defined(LN) || defined(LT) FSTU 0 * SIZE(BB) fxch %st(1) FSTU 1 * SIZE(BB) fxch %st(2) FSTU 2 * SIZE(BB) fxch %st(3) FSTU 3 * SIZE(BB) FST 1 * SIZE(%edi,%ebp) FST 0 * SIZE(%edi) FST 0 * SIZE(%edi,%ebp) FST 1 * SIZE(%edi) #else FSTU 0 * SIZE(AA) fxch %st(2) FSTU 1 * SIZE(AA) fxch %st(1) FSTU 2 * SIZE(AA) fxch %st(3) FSTU 3 * SIZE(AA) FST 1 * SIZE(%edi,%ebp) FST 1 * SIZE(%edi) FST 0 * SIZE(%edi) FST 0 * SIZE(%edi,%ebp) #endif #ifndef LN addl $2 * SIZE, %edi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %esi # i -- jne .MainHead ALIGN_4 .L12: movl M, %eax # m # MEMORY andl $1, %eax je .L27 #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax leal (, %eax, SIZE), %eax 
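/* 1x2 remainder row: for LN/RT rewind AA to the current row and BB to
   the current 2-column panel, offset by KK elements. */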
movl AORIG, AA leal (AA, %eax, 1), AA leal (%ebx, %eax, 2), BB #else movl %ebx, BB #endif fldz fldz FLD 0 * SIZE(AA) # temp1 = *(aoffset + 0) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $1,%eax # k >> 1 # MEMORY je .L54 ALIGN_4 .L55: FLD 0 * SIZE(BB) # temp2 = *(boffset + 0) rep fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(BB) # temp2 = *(boffset + 0) faddp %st, %st(2) FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0) FLD 2 * SIZE(BB) # temp2 = *(boffset + 0) rep fmul %st(1), %st faddp %st, %st(2) FMUL 3 * SIZE(BB) # temp2 = *(boffset + 0) faddp %st, %st(2) FLD 2 * SIZE(AA) # temp1 = *(aoffset + 0) addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jne .L55 ALIGN_4 .L54: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $1,%eax # k & 1 je .L33 ALIGN_4 FLD 0 * SIZE(BB) # temp2 = *(boffset + 0) rep fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(BB) # temp2 = *(boffset + 0) faddp %st, %st(2) FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0) addl $1 * SIZE, AA addl $2 * SIZE, BB ALIGN_4 .L33: ffreep %st(0) #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 1), AA leal (%ebx, %eax, 2), BB #endif #if defined(LN) || defined(LT) FLD 0 * SIZE(BB) fsubp %st, %st(1) FLD 1 * SIZE(BB) fsubp %st, %st(2) #else FLD 0 * SIZE(AA) fsubp %st, %st(1) FLD 1 * SIZE(AA) fsubp %st, %st(2) #endif #if defined(LN) || defined(LT) FLD 0 * SIZE(AA) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef RN FLD 0 * SIZE(BB) fmulp %st, %st(1) FLD 1 * SIZE(BB) fmul %st(1), %st fsubrp %st, %st(2) FLD 3 * SIZE(BB) fmulp %st, %st(2) #endif #ifdef RT FLD 3 * SIZE(BB) fmulp %st, %st(2) FLD 2 * SIZE(BB) fmul %st(2), %st fsubrp %st, %st(1) FLD 0 * SIZE(BB) fmulp %st, %st(1) #endif #ifdef LN subl $1 * SIZE, %edi #endif #if defined(LN) || defined(LT) FSTU 0 * SIZE(BB) fxch %st(1) FSTU 1 * SIZE(BB) #else FSTU 0 * SIZE(AA) fxch %st(1) FSTU 1 * SIZE(AA) #endif FST 0 * SIZE(%edi,%ebp) FST 0 * SIZE(%edi) #ifndef LN addl $1 * SIZE, %edi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L27: #ifdef LN movl K, %eax leal ( , %eax, SIZE), %eax leal (%ebx, %eax, 2), %ebx #endif #if defined(LT) || defined(RN) movl BB, %ebx #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j-- # MEMORY jne .L34 ALIGN_4 .L8: movl N, %eax # n # MEMORY andl $1, %eax je .End #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, %ebx #endif #ifdef RT subl %ebp, C #endif movl C, %edi # c # MEMORY #ifndef RT addl %ebp, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %esi # m # MEMORY sarl $1, %esi # m >> 1 je .L36 ALIGN_4 .L46: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 2), AA leal (%ebx, %eax, 1), BB #else movl %ebx, BB #endif fldz fldz FLD 0 * SIZE(BB) # temp1 = *(boffset + 0) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $1, %eax je .L56 ALIGN_4 .L57: FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0) fmul %st(1), 
%st faddp %st, %st(2) FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0) faddp %st, %st(2) FLD 1 * SIZE(BB) # temp1 = *(boffset + 0) FLD 2 * SIZE(AA) # temp2 = *(aoffset + 0) fmul %st(1), %st faddp %st, %st(2) FMUL 3 * SIZE(AA) # temp2 = *(aoffset + 0) faddp %st, %st(2) FLD 2 * SIZE(BB) # temp1 = *(boffset + 0) addl $4 * SIZE,AA addl $2 * SIZE,BB dec %eax jne .L57 ALIGN_4 .L56: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $1, %eax je .L45 ALIGN_4 FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0) fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0) faddp %st, %st(2) FLD 3 * SIZE(BB) # temp1 = *(boffset + 0) addl $2 * SIZE,AA addl $1 * SIZE,BB ALIGN_4 .L45: ffreep %st(0) #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 2), AA leal (%ebx, %eax, 1), BB #endif #if defined(LN) || defined(LT) FLD 0 * SIZE(BB) fsubp %st, %st(1) FLD 1 * SIZE(BB) fsubp %st, %st(2) #else FLD 0 * SIZE(AA) fsubp %st, %st(1) FLD 1 * SIZE(AA) fsubp %st, %st(2) #endif #ifdef LN FLD 3 * SIZE(AA) fmulp %st, %st(2) FLD 2 * SIZE(AA) fmul %st(2), %st fsubrp %st, %st(1) FLD 0 * SIZE(AA) fmulp %st, %st(1) #endif #ifdef LT FLD 0 * SIZE(AA) fmulp %st, %st(1) FLD 1 * SIZE(AA) fmul %st(1), %st fsubrp %st, %st(2) FLD 3 * SIZE(AA) fmulp %st, %st(2) #endif #ifdef RN FLD 0 * SIZE(BB) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef RT FLD 0 * SIZE(BB) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef LN subl $2 * SIZE, %edi #endif #if defined(LN) || defined(LT) FSTU 0 * SIZE(BB) fxch %st(1) FSTU 1 * SIZE(BB) #else FSTU 0 * SIZE(AA) fxch %st(1) FSTU 1 * SIZE(AA) #endif FST 1 * SIZE(%edi) FST 0 * SIZE(%edi) #ifndef LN addl $2 * SIZE, %edi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 1), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %esi # i -- jne .L46 ALIGN_4 .L36: movl M, %eax # m # MEMORY andl $1, %eax # m & 1 je .L99 #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 1), AA leal (%ebx, %eax, 1), BB #else movl %ebx, BB #endif fldz #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif test %eax, %eax jle .L52 ALIGN_3 .L51: FLD (AA) FMUL (BB) addl $1 * SIZE,AA addl $1 * SIZE,BB faddp %st,%st(1) decl %eax jne .L51 ALIGN_4 .L52: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 1), AA leal (%ebx, %eax, 1), BB #endif #if defined(LN) || defined(LT) FLD 0 * SIZE(BB) fsubp %st, %st(1) #else FLD 0 * SIZE(AA) fsubp %st, %st(1) #endif #if defined(LN) || defined(LT) FMUL 0 * SIZE(AA) #else FMUL 0 * SIZE(BB) #endif #ifdef LN subl $1 * SIZE, %edi #endif #if defined(LN) || defined(LT) FSTU 0 * SIZE(BB) #else FSTU 0 * SIZE(AA) #endif FST 0 * SIZE(%edi) #ifndef LN addl $1 * SIZE, %edi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 1), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L99: #ifdef LN movl K, %eax leal (%ebx, %eax, SIZE), %ebx #endif #if defined(LT) || defined(RN) movl 
BB, %ebx #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .End: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_LT_2x2_atom.S000066400000000000000000000461641313527062700216470ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define ARG_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define ARG_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #define PREFETCH prefetcht0 #define PREFETCHSIZE 84 #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define CO1 %esi PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC movl OFFSET, %eax #ifdef RN negl %eax #endif movl %eax, KK leal (, LDC, SIZE), LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax sarl $1, %eax movl %eax, J jle .L30 ALIGN_2 .L10: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx jle .L20 ALIGN_4 .L11: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 prefetcht0 3 * SIZE(CO1) xorps %xmm5, %xmm5 prefetcht0 3 * SIZE(CO1, LDC) xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addsd %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 0 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 1 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 0 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 1 * SIZE(BB), %xmm3 addsd %xmm2, %xmm6 movsd 3 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 3 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 4 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 2 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 3 * SIZE(BB), %xmm3 addsd %xmm2, %xmm6 movsd 5 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 4 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 5 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 6 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 4 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 5 * SIZE(BB), %xmm3 addsd %xmm2, %xmm6 movsd 7 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 7 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 8 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 6 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 7 * SIZE(BB), %xmm3 addl $8 * SIZE, BB addl $8 * SIZE, AA decl %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax 
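/* leftover K iterations of the 2x2 tile (count = KK for LT/RN, K - KK
   for LN/RT): .L16 handles them one per pass, then .L18 begins the
   triangular solve. */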
#endif andl $3, %eax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addsd %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 0 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 1 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 0 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 1 * SIZE(BB), %xmm3 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: addsd %xmm2, %xmm6 addsd %xmm3, %xmm7 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BB), %xmm0 movsd 1 * SIZE(BB), %xmm1 movsd 2 * SIZE(BB), %xmm2 movsd 3 * SIZE(BB), %xmm3 subsd %xmm4, %xmm0 subsd %xmm5, %xmm1 subsd %xmm6, %xmm2 subsd %xmm7, %xmm3 #else movsd 0 * SIZE(AA), %xmm0 movsd 1 * SIZE(AA), %xmm2 movsd 2 * SIZE(AA), %xmm1 movsd 3 * SIZE(AA), %xmm3 subsd %xmm4, %xmm0 subsd %xmm6, %xmm2 subsd %xmm5, %xmm1 subsd %xmm7, %xmm3 #endif #ifdef LN movsd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd 2 * SIZE(AA), %xmm5 mulsd %xmm4, %xmm3 movsd 0 * SIZE(AA), %xmm7 movaps %xmm5, %xmm6 mulsd %xmm2, %xmm5 mulsd %xmm3, %xmm6 subsd %xmm5, %xmm0 subsd %xmm6, %xmm1 mulsd %xmm7, %xmm0 mulsd %xmm7, %xmm1 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(AA), %xmm5 mulsd %xmm4, %xmm1 movsd 3 * SIZE(AA), %xmm7 movaps %xmm5, %xmm6 mulsd %xmm0, %xmm5 mulsd %xmm1, %xmm6 subsd %xmm5, %xmm2 subsd %xmm6, %xmm3 mulsd %xmm7, %xmm2 mulsd %xmm7, %xmm3 #endif #ifdef RN movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(BB), %xmm5 mulsd %xmm4, %xmm2 movsd 3 * SIZE(BB), %xmm7 movaps %xmm5, %xmm6 mulsd %xmm0, %xmm5 mulsd %xmm2, %xmm6 subsd %xmm5, %xmm1 subsd %xmm6, %xmm3 mulsd %xmm7, %xmm1 mulsd %xmm7, %xmm3 #endif #ifdef RT movsd 3 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd 2 * SIZE(BB), %xmm5 mulsd %xmm4, %xmm3 movsd 0 * SIZE(BB), %xmm7 movaps %xmm5, %xmm6 mulsd %xmm1, %xmm5 mulsd %xmm3, %xmm6 subsd %xmm5, %xmm0 subsd %xmm6, %xmm2 mulsd %xmm7, %xmm0 mulsd %xmm7, %xmm2 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BB) movsd %xmm1, 1 * SIZE(BB) movsd %xmm2, 2 * SIZE(BB) movsd %xmm3, 3 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) movsd %xmm2, 1 * SIZE(AA) movsd %xmm1, 2 * SIZE(AA) movsd %xmm3, 3 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm2, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC) movsd %xmm3, 1 * SIZE(CO1, LDC) #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L11 ALIGN_4 .L20: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L29 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L25 ALIGN_4 .L22: addsd %xmm2, %xmm4 movsd 0 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 1 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulsd 
%xmm0, %xmm3 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 2 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 3 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 movsd 2 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 5 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 movsd 3 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 6 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 7 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: addsd %xmm2, %xmm4 movsd 0 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 1 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 movsd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: addsd %xmm2, %xmm4 addsd %xmm3, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BB), %xmm0 movsd 1 * SIZE(BB), %xmm1 subsd %xmm4, %xmm0 subsd %xmm5, %xmm1 #else movsd 0 * SIZE(AA), %xmm0 movsd 1 * SIZE(AA), %xmm1 subsd %xmm4, %xmm0 subsd %xmm5, %xmm1 #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(AA), %xmm7 mulsd %xmm7, %xmm0 mulsd %xmm7, %xmm1 #endif #ifdef RN movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(BB), %xmm5 movaps %xmm5, %xmm6 movsd 3 * SIZE(BB), %xmm7 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm1 mulsd %xmm7, %xmm1 #endif #ifdef RT movsd 3 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd 2 * SIZE(BB), %xmm5 movaps %xmm5, %xmm6 movsd 0 * SIZE(BB), %xmm7 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm0 mulsd %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BB) movsd %xmm1, 1 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) movsd %xmm1, 1 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC) #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L29: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L10 ALIGN_4 .L30: testl $1, N je .L999 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx jle .L40 ALIGN_4 .L31: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movsd 0 * SIZE(BB), %xmm1 xorps %xmm0, %xmm0 prefetcht0 3 * SIZE(CO1) xorps %xmm2, %xmm2 xorps %xmm4, %xmm4 xorps %xmm6, %xmm6 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax 
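/* 2x1 tile of the single-column panel: the K loop is unrolled by 4 in
   .L32; .L36 picks up the K & 3 leftovers before the solve at .L38. */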
#endif sarl $2, %eax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addsd %xmm0, %xmm4 movsd 0 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 1 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 3 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 2 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 4 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 5 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 3 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 6 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 7 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 4 * SIZE(BB), %xmm1 addl $8 * SIZE, AA addl $4 * SIZE, BB decl %eax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: addsd %xmm0, %xmm4 movsd 0 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 1 * SIZE(BB), %xmm1 addl $2 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: addsd %xmm0, %xmm4 addsd %xmm2, %xmm6 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BB), %xmm0 movsd 1 * SIZE(BB), %xmm2 subsd %xmm4, %xmm0 subsd %xmm6, %xmm2 #else movsd 0 * SIZE(AA), %xmm0 movsd 1 * SIZE(AA), %xmm2 subsd %xmm4, %xmm0 subsd %xmm6, %xmm2 #endif #ifdef LN movsd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd 2 * SIZE(AA), %xmm5 mulsd %xmm2, %xmm5 movsd 0 * SIZE(AA), %xmm7 subsd %xmm5, %xmm0 mulsd %xmm7, %xmm0 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(AA), %xmm5 mulsd %xmm0, %xmm5 movsd 3 * SIZE(AA), %xmm7 subsd %xmm5, %xmm2 mulsd %xmm7, %xmm2 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 mulsd %xmm4, %xmm2 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BB) movsd %xmm2, 1 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) movsd %xmm2, 1 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm2, 1 * SIZE(CO1) #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA addl %eax, BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L31 ALIGN_4 .L40: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L49 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 movsd 0 * SIZE(BB), %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L45 ALIGN_4 .L42: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 1 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addsd %xmm2, %xmm5 movsd 2 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 movsd 3 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 3 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addsd %xmm2, %xmm5 movsd 4 * SIZE(BB), %xmm2 addl $4 * SIZE, AA addl $4 
* SIZE, BB decl %eax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 1 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: addsd %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax addl %eax, AA leal (B, %eax, 1), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BB), %xmm0 subsd %xmm4, %xmm0 #else movsd 0 * SIZE(AA), %xmm0 subsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) mulsd 0 * SIZE(AA), %xmm0 #endif #if defined(RN) || defined(RT) mulsd 0 * SIZE(BB), %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax addl %eax, AA addl %eax, BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L49: #ifdef LN movl K, %eax leal (B, %eax, SIZE), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_LT_2x4_penryn.S000066400000000000000000001051311313527062700222120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define ARG_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define ARG_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 2) #endif #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define CO1 %esi PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC movl OFFSET, %eax #ifdef RN negl %eax #endif movl %eax, KK leal (, LDC, SIZE), LDC subl $-16 * SIZE, A subl $-16 * SIZE, B #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax sarl $2, %eax movl %eax, J jle .L30 ALIGN_4 .L10: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 4), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif leal (CO1, LDC, 2), %eax movaps -16 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -16 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 #ifdef LN pxor %xmm4, %xmm4 prefetcht0 -2 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 -2 * SIZE(CO1, LDC) pxor %xmm6, %xmm6 prefetcht0 -2 * SIZE(%eax) pxor %xmm7, %xmm7 prefetcht0 -2 * SIZE(%eax, LDC) #else pxor %xmm4, %xmm4 prefetcht0 1 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 1 * SIZE(CO1, LDC) pxor %xmm6, %xmm6 prefetcht0 1 * SIZE(%eax) pxor %xmm7, %xmm7 prefetcht0 1 * SIZE(%eax, LDC) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addpd %xmm3, %xmm7 movaps -14 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps -12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps -10 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps -8 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd 
%xmm3, %xmm7 movaps -6 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps -4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps -2 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 0 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) addpd %xmm3, %xmm7 movaps 2 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps 6 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 8 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps 10 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps 14 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 16 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 subl $-32 * SIZE, BB mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 subl $-16 * SIZE, AA subl $1, %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addpd %xmm3, %xmm7 movaps -14 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps -12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $4, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 4), BB #endif addpd %xmm2, %xmm6 addpd %xmm3, %xmm7 movaps %xmm4, %xmm0 movsd %xmm5, %xmm4 movsd %xmm0, %xmm5 movaps %xmm6, %xmm0 movsd %xmm7, %xmm6 movsd %xmm0, %xmm7 #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd %xmm6, %xmm1 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm1 movapd -16 * SIZE(BB), %xmm2 movapd -14 * SIZE(BB), %xmm5 movapd -12 * SIZE(BB), %xmm3 movapd -10 * SIZE(BB), %xmm7 subpd %xmm4, %xmm2 subpd %xmm6, %xmm5 subpd %xmm0, %xmm3 subpd %xmm1, %xmm7 #else movapd -16 * SIZE(AA), %xmm0 movapd -14 * SIZE(AA), %xmm1 movapd -12 * SIZE(AA), %xmm2 movapd -10 * SIZE(AA), %xmm3 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 subpd %xmm6, %xmm2 subpd %xmm7, %xmm3 #endif #ifdef LN movddup -13 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 mulpd %xmm4, %xmm7 movddup -14 * SIZE(AA), %xmm4 movapd %xmm4, %xmm6 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 mulpd %xmm7, %xmm6 subpd %xmm6, %xmm5 movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 #endif #ifdef LT movddup -16 * 
SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 movddup -15 * SIZE(AA), %xmm4 movapd %xmm4, %xmm6 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 mulpd %xmm5, %xmm6 subpd %xmm6, %xmm7 movddup -13 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 mulpd %xmm4, %xmm7 #endif #ifdef RN movddup -16 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 movddup -15 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movddup -14 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm2 movddup -13 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm3 movddup -11 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 movddup -10 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm2 movddup -9 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm3 movddup -6 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm2 movddup -5 * SIZE(BB), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 movddup -1 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm3 #endif #ifdef RT movddup -1 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm3 movddup -2 * SIZE(BB), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 movddup -3 * SIZE(BB), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm1 movddup -4 * SIZE(BB), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm0 movddup -6 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm2 movddup -7 * SIZE(BB), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm1 movddup -8 * SIZE(BB), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm0 movddup -11 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 movddup -12 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movddup -16 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, -16 * SIZE(BB) movapd %xmm5, -14 * SIZE(BB) movapd %xmm3, -12 * SIZE(BB) movapd %xmm7, -10 * SIZE(BB) #else movapd %xmm0, -16 * SIZE(AA) movapd %xmm1, -14 * SIZE(AA) movapd %xmm2, -12 * SIZE(AA) movapd %xmm3, -10 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movsd %xmm3, 1 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) movsd %xmm5, 0 * SIZE(CO1, LDC, 2) movsd %xmm7, 1 * SIZE(CO1, LDC, 2) movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO1, %eax, 1) movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L11 ALIGN_4 .L20: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L29 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm2 pxor %xmm5, %xmm5 movaps -14 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps -12 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps -10 * SIZE(BB), %xmm3 
pshufd $0xee, %xmm0, %xmm1 movaps -14 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps -8 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps -6 * SIZE(BB), %xmm3 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps -4 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps -2 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -12 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps 0 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps 2 * SIZE(BB), %xmm3 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps 6 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -10 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps 8 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps 10 * SIZE(BB), %xmm3 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps 12 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps 14 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -8 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps 18 * SIZE(BB), %xmm3 subl $ -8 * SIZE, AA subl $-32 * SIZE, BB subl $1, %eax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH je .L28 .L26: pshufd $0x44, %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps -12 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps -10 * SIZE(BB), %xmm3 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $4, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BB), %xmm0 movapd -14 * SIZE(BB), %xmm1 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 #else movapd -16 * SIZE(AA), %xmm1 movapd -14 * SIZE(AA), %xmm3 subpd %xmm4, %xmm1 subpd %xmm5, %xmm3 movapd %xmm1, %xmm0 unpckhpd %xmm1, %xmm1 movapd %xmm3, %xmm2 unpckhpd %xmm3, %xmm3 #endif #ifdef LN movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #ifdef LT movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #ifdef RN movsd -16 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 movsd -15 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movsd -14 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm2 movsd -13 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm3 movsd -11 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd -10 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm2 movsd -9 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm3 movsd -6 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm2 movsd -5 * SIZE(BB), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm3 movsd -1 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm3 #endif #ifdef RT movsd -1 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm3 movsd -2 * SIZE(BB), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm2 movsd -3 * SIZE(BB), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm1 movsd -4 * SIZE(BB), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm0 movsd -6 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm2 movsd -7 * SIZE(BB), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm1 movsd -8 * SIZE(BB), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm0 movsd -11 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd -12 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movsd -16 * 
SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) #else movsd %xmm0, -16 * SIZE(AA) movsd %xmm1, -15 * SIZE(AA) movsd %xmm2, -14 * SIZE(AA) movsd %xmm3, -13 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) movsd %xmm1, 0 * SIZE(CO1, LDC, 2) movhpd %xmm1, 0 * SIZE(CO1, %eax, 1) #else movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L29: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 4), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $4, KK #endif #ifdef RT subl $4, KK #endif decl J # j -- jg .L10 ALIGN_4 .L30: testl $2, N je .L60 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L50 ALIGN_4 .L41: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 #ifdef LN prefetcht0 -2 * SIZE(CO1) pxor %xmm6, %xmm6 prefetcht0 -2 * SIZE(CO1, LDC) pxor %xmm7, %xmm7 #else prefetcht0 1 * SIZE(CO1) pxor %xmm6, %xmm6 prefetcht0 1 * SIZE(CO1, LDC) pxor %xmm7, %xmm7 #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -14 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps -12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -10 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps -8 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -6 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps -4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -2 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 
mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps 0 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 subl $-16 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -14 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 movaps %xmm4, %xmm0 movsd %xmm5, %xmm4 movsd %xmm0, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd -16 * SIZE(BB), %xmm2 movapd -14 * SIZE(BB), %xmm3 subpd %xmm4, %xmm2 subpd %xmm0, %xmm3 #else movapd -16 * SIZE(AA), %xmm0 movapd -14 * SIZE(AA), %xmm1 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 #endif #ifdef LN movddup -13 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 movddup -14 * SIZE(AA), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 #endif #ifdef LT movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 movddup -15 * SIZE(AA), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 movddup -13 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 #endif #ifdef RN movddup -16 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 movddup -15 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movddup -13 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 #endif #ifdef RT movddup -13 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 movddup -14 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movddup -16 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, -16 * SIZE(BB) movapd %xmm3, -14 * SIZE(BB) #else movapd %xmm0, -16 * SIZE(AA) movapd %xmm1, -14 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movsd %xmm3, 1 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L41 ALIGN_4 .L50: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L59 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm2 pxor %xmm5, %xmm5 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L55 ALIGN_4 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -14 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -14 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps -12 * SIZE(BB), %xmm2 pshufd $0x44, %xmm0, %xmm1 
mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -10 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -12 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps -8 * SIZE(BB), %xmm2 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -6 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -10 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps -4 * SIZE(BB), %xmm2 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -2 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -8 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps 0 * SIZE(BB), %xmm2 subl $ -8 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH je .L58 .L56: pshufd $0x44, %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -14 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: addpd %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax addl %eax, AA leal (B, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BB), %xmm0 subpd %xmm4, %xmm0 #else movapd -16 * SIZE(AA), %xmm1 subpd %xmm4, %xmm1 movapd %xmm1, %xmm0 unpckhpd %xmm1, %xmm1 #endif #ifdef LN movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef LT movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef RN movsd -16 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 movsd -15 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movsd -13 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 #endif #ifdef RT movsd -13 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd -14 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movsd -16 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(BB) #else movsd %xmm0, -16 * SIZE(AA) movsd %xmm1, -15 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) #else movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L59: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif ALIGN_4 .L60: testl $1, N je .L999 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L80 ALIGN_4 .L71: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 #ifdef LN prefetcht0 -2 
* SIZE(CO1) #else prefetcht0 1 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -14 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -12 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -10 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -8 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 subl $-16 * SIZE, AA subl $ -8 * SIZE, BB subl $1, %eax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: pshufd $0x44, %xmm1, %xmm2 movsd -15 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 addl $2 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: addpd %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BB), %xmm1 subpd %xmm4, %xmm1 movapd %xmm1, %xmm0 unpckhpd %xmm1, %xmm1 #else movapd -16 * SIZE(AA), %xmm0 subpd %xmm4, %xmm0 #endif #ifdef LN movsd -13 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 movsd -14 * SIZE(AA), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef LT movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd -15 * SIZE(AA), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movsd -13 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 #endif #ifdef RN movddup -16 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef RT movddup -16 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, -16 * SIZE(BB) movsd %xmm1, -15 * SIZE(BB) #else movapd %xmm0, -16 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 1 * SIZE(CO1) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA addl %eax, BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L71 ALIGN_4 .L80: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L89 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm2 pxor %xmm5, %xmm5 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl 
$3, %eax je .L85 ALIGN_4 .L82: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 movaps -14 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movaps -12 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 movaps -10 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movaps -8 * SIZE(BB), %xmm2 subl $-8 * SIZE, AA subl $-8 * SIZE, BB decl %eax jne .L82 ALIGN_4 .L85: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH je .L88 .L86: mulsd %xmm0, %xmm2 movsd -15 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd -15 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L86 ALIGN_4 .L88: addpd %xmm5, %xmm4 haddpd %xmm4, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax addl %eax, AA leal (B, %eax, 1), BB #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(BB), %xmm0 subsd %xmm4, %xmm0 #else movsd -16 * SIZE(AA), %xmm0 subsd %xmm4, %xmm0 #endif #ifdef LN movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef LT movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef RN movsd -16 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef RT movsd -16 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, -16 * SIZE(BB) #else movsd %xmm0, -16 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) #else movsd %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax addl %eax, AA addl %eax, BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L89: #ifdef LN movl K, %eax leal (B, %eax, SIZE), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_LT_2x4_sse2.S000066400000000000000000001353501313527062700215610ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define OLD_M 4 + STACK + ARGS(%esi) #define OLD_N 8 + STACK + ARGS(%esi) #define OLD_K 12 + STACK + ARGS(%esi) #define OLD_ALPHA 16 + STACK + ARGS(%esi) #define OLD_A 24 + STACK + ARGS(%esi) #define OLD_B 28 + STACK + ARGS(%esi) #define OLD_C 32 + STACK + ARGS(%esi) #define OLD_LDC 36 + STACK + ARGS(%esi) #define OLD_OFFT 40 + STACK + ARGS(%esi) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define AORIG 56(%esp) #define BORIG 60(%esp) #define BUFFER 128(%esp) #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif #define B %edi #define AA %edx #define BB %ecx #define LDC %ebp #define CO1 %esi #define KERNEL1(address) \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm4; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm3, %xmm6; \ movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm0, %xmm7; \ movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ 
mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm3, %xmm6; \ movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm0, %xmm7; \ movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm2, %xmm6; \ movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm1, %xmm7; \ movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm2, %xmm6; \ movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm1, %xmm7; \ movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp addl $STACK_OFFSET, %esp STACK_TOUCHING movl OLD_M, %ebx movl OLD_N, %eax movl OLD_K, %ecx movl OLD_A, %edx movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movd OLD_OFFT, %mm4 movl OLD_B, B movl OLD_C, %ebx movl %ebx, C movl OLD_LDC, LDC movd %mm4, OFFSET movd %mm4, KK leal (, LDC, SIZE), LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax sarl $2, %eax movl %eax, J jle .L30 ALIGN_2 .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG leal (, %eax, SIZE), %eax leal (B, %eax, 4), B leal (BB, %eax, 8), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $1, %eax jle .L05 ALIGN_4 .L02: #define COPYPREFETCH 40 prefetchnta (COPYPREFETCH) * SIZE(B) movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq 2 * SIZE(B), %mm2 movq 3 * SIZE(B), %mm3 movq 4 * SIZE(B), %mm4 
movq 5 * SIZE(B), %mm5 movq 6 * SIZE(B), %mm6 movq 7 * SIZE(B), %mm7 movq %mm0, 0 * SIZE(BB) movq %mm0, 1 * SIZE(BB) movq %mm1, 2 * SIZE(BB) movq %mm1, 3 * SIZE(BB) movq %mm2, 4 * SIZE(BB) movq %mm2, 5 * SIZE(BB) movq %mm3, 6 * SIZE(BB) movq %mm3, 7 * SIZE(BB) movq %mm4, 8 * SIZE(BB) movq %mm4, 9 * SIZE(BB) movq %mm5, 10 * SIZE(BB) movq %mm5, 11 * SIZE(BB) movq %mm6, 12 * SIZE(BB) movq %mm6, 13 * SIZE(BB) movq %mm7, 14 * SIZE(BB) movq %mm7, 15 * SIZE(BB) addl $ 8 * SIZE, B addl $16 * SIZE, BB decl %eax jne .L02 ALIGN_2 .L05: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $1, %eax BRANCH jle .L10 movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq 2 * SIZE(B), %mm2 movq 3 * SIZE(B), %mm3 movq %mm0, 0 * SIZE(BB) movq %mm0, 1 * SIZE(BB) movq %mm1, 2 * SIZE(BB) movq %mm1, 3 * SIZE(BB) movq %mm2, 4 * SIZE(BB) movq %mm2, 5 * SIZE(BB) movq %mm3, 6 * SIZE(BB) movq %mm3, 7 * SIZE(BB) addl $4 * SIZE, B ALIGN_4 .L10: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 4), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $3 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movapd 0 * SIZE(AA), %xmm0 movapd 8 * SIZE(AA), %xmm1 movapd 0 * SIZE(BB), %xmm2 movapd 8 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifdef LN prefetchw -2 * SIZE(CO1) prefetchw -2 * SIZE(CO1, LDC) prefetchw -2 * SIZE(CO1, LDC, 2) prefetchw -2 * SIZE(CO1, %eax) #else prefetchw 1 * SIZE(CO1) prefetchw 1 * SIZE(CO1, LDC) prefetchw 1 * SIZE(CO1, LDC, 2) prefetchw 1 * SIZE(CO1, %eax) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif #if 1 andl $-8, %eax sall $4, %eax je .L15 .L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) cmpl $128 * 1, %eax jle .L12 KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) cmpl $128 * 2, %eax jle .L12 KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) cmpl $128 * 3, %eax jle .L12 KERNEL1(16 * 3) KERNEL2(16 * 3) KERNEL3(16 * 3) KERNEL4(16 * 3) KERNEL5(16 * 3) KERNEL6(16 * 3) KERNEL7(16 * 3) KERNEL8(16 * 3) cmpl $128 * 4, %eax jle .L12 KERNEL1(16 * 4) KERNEL2(16 * 4) KERNEL3(16 * 4) KERNEL4(16 * 4) KERNEL5(16 * 4) KERNEL6(16 * 4) KERNEL7(16 * 4) KERNEL8(16 * 4) cmpl $128 * 5, %eax jle .L12 KERNEL1(16 * 5) KERNEL2(16 * 5) KERNEL3(16 * 5) KERNEL4(16 * 5) KERNEL5(16 * 5) KERNEL6(16 * 5) KERNEL7(16 * 5) KERNEL8(16 * 5) cmpl $128 * 6, %eax jle .L12 KERNEL1(16 * 6) KERNEL2(16 * 6) KERNEL3(16 * 6) KERNEL4(16 * 6) KERNEL5(16 * 6) KERNEL6(16 * 6) KERNEL7(16 * 6) KERNEL8(16 * 6) cmpl $128 * 7, %eax jle .L12 KERNEL1(16 * 7) KERNEL2(16 * 7) KERNEL3(16 * 7) KERNEL4(16 * 7) KERNEL5(16 * 7) KERNEL6(16 * 7) KERNEL7(16 * 7) KERNEL8(16 * 7) addl $128 * 4 * SIZE, BB addl $128 * 1 * SIZE, AA subl $128 * 8, %eax jg .L1X jmp .L15 .L12: leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB ALIGN_4 #else sarl $3, %eax je .L15 ALIGN_4 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) 
KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $64 * SIZE, BB addl $16 * SIZE, AA decl %eax jne .L12 ALIGN_4 #endif .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movapd 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 8 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $4, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 4), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd %xmm6, %xmm1 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm1 movapd 0 * SIZE(B), %xmm2 movapd 2 * SIZE(B), %xmm5 movapd 4 * SIZE(B), %xmm3 movapd 6 * SIZE(B), %xmm7 subpd %xmm4, %xmm2 subpd %xmm6, %xmm5 subpd %xmm0, %xmm3 subpd %xmm1, %xmm7 #else movapd 0 * SIZE(AA), %xmm0 movapd 2 * SIZE(AA), %xmm1 movapd 4 * SIZE(AA), %xmm2 movapd 6 * SIZE(AA), %xmm3 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 subpd %xmm6, %xmm2 subpd %xmm7, %xmm3 #endif #ifdef LN movlpd 3 * SIZE(AA), %xmm4 movhpd 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 mulpd %xmm4, %xmm7 movlpd 2 * SIZE(AA), %xmm4 movhpd 2 * SIZE(AA), %xmm4 movapd %xmm4, %xmm6 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 mulpd %xmm7, %xmm6 subpd %xmm6, %xmm5 movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 #endif #ifdef LT movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 movlpd 1 * SIZE(AA), %xmm4 movhpd 1 * SIZE(AA), %xmm4 movapd %xmm4, %xmm6 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 mulpd %xmm5, %xmm6 subpd %xmm6, %xmm7 movlpd 3 * SIZE(AA), %xmm4 movhpd 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 mulpd %xmm4, %xmm7 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 movlpd 1 * SIZE(B), %xmm4 movhpd 1 * SIZE(B), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movlpd 2 * SIZE(B), %xmm4 movhpd 2 * SIZE(B), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm2 movlpd 3 * SIZE(B), %xmm4 movhpd 3 * SIZE(B), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm3 movlpd 5 * SIZE(B), %xmm4 movhpd 5 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 movlpd 6 * SIZE(B), %xmm4 movhpd 6 * SIZE(B), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm2 movlpd 7 * SIZE(B), %xmm4 movhpd 7 * SIZE(B), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm3 movlpd 10 * SIZE(B), %xmm4 movhpd 10 * SIZE(B), %xmm4 mulpd %xmm4, %xmm2 movlpd 11 * SIZE(B), %xmm4 movhpd 11 * SIZE(B), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 movlpd 15 * SIZE(B), %xmm4 movhpd 15 * SIZE(B), %xmm4 mulpd %xmm4, %xmm3 #endif #ifdef RT movlpd 15 * SIZE(B), %xmm4 movhpd 15 * SIZE(B), %xmm4 mulpd %xmm4, %xmm3 movlpd 14 * SIZE(B), %xmm4 movhpd 14 * SIZE(B), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 movlpd 13 * SIZE(B), %xmm4 movhpd 13 * SIZE(B), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm1 movlpd 12 * SIZE(B), %xmm4 movhpd 12 * SIZE(B), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm0 movlpd 10 * SIZE(B), %xmm4 movhpd 10 * SIZE(B), %xmm4 mulpd %xmm4, %xmm2 movlpd 9 * SIZE(B), %xmm4 movhpd 9 * SIZE(B), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm1 movlpd 8 * SIZE(B), %xmm4 movhpd 8 * SIZE(B), %xmm4 
mulpd %xmm2, %xmm4 subpd %xmm4, %xmm0 movlpd 5 * SIZE(B), %xmm4 movhpd 5 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 movlpd 4 * SIZE(B), %xmm4 movhpd 4 * SIZE(B), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movlpd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movapd %xmm5, 2 * SIZE(B) movapd %xmm3, 4 * SIZE(B) movapd %xmm7, 6 * SIZE(B) movlpd %xmm2, 0 * SIZE(BB) movlpd %xmm2, 1 * SIZE(BB) movhpd %xmm2, 2 * SIZE(BB) movhpd %xmm2, 3 * SIZE(BB) movlpd %xmm5, 4 * SIZE(BB) movlpd %xmm5, 5 * SIZE(BB) movhpd %xmm5, 6 * SIZE(BB) movhpd %xmm5, 7 * SIZE(BB) movlpd %xmm3, 8 * SIZE(BB) movlpd %xmm3, 9 * SIZE(BB) movhpd %xmm3, 10 * SIZE(BB) movhpd %xmm3, 11 * SIZE(BB) movlpd %xmm7, 12 * SIZE(BB) movlpd %xmm7, 13 * SIZE(BB) movhpd %xmm7, 14 * SIZE(BB) movhpd %xmm7, 15 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) movapd %xmm1, 2 * SIZE(AA) movapd %xmm2, 4 * SIZE(AA) movapd %xmm3, 6 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(CO1) movlpd %xmm3, 1 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) movlpd %xmm5, 0 * SIZE(CO1, LDC, 2) movlpd %xmm7, 1 * SIZE(CO1, LDC, 2) movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) #else movlpd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) movlpd %xmm2, 0 * SIZE(CO1, LDC, 2) movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) movlpd %xmm3, 0 * SIZE(CO1, %eax, 1) movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L11 ALIGN_4 .L20: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L29 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $3 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movlpd 0 * SIZE(AA), %xmm0 movlpd 4 * SIZE(AA), %xmm1 movlpd 0 * SIZE(BB), %xmm2 movlpd 8 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L25 ALIGN_4 .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm5 movlpd 4 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm2, %xmm6 movlpd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm7 movlpd 1 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm4 movlpd 10 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm5 movlpd 12 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm3 mulsd 14 * SIZE(BB), %xmm0 addsd %xmm3, %xmm6 movlpd 24 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 2 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 movlpd 18 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm5 movlpd 20 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 mulsd 22 * SIZE(BB), %xmm0 addsd %xmm2, %xmm6 movlpd 32 * SIZE(BB), %xmm2 addsd %xmm0, 
%xmm7 movlpd 3 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm4 movlpd 26 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm5 movlpd 28 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm3 mulsd 30 * SIZE(BB), %xmm0 addsd %xmm3, %xmm6 movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 addsd %xmm2, %xmm4 movlpd 34 * SIZE(BB), %xmm2 mulsd %xmm1, %xmm2 addsd %xmm2, %xmm5 movlpd 36 * SIZE(BB), %xmm2 mulsd %xmm1, %xmm2 mulsd 38 * SIZE(BB), %xmm1 addsd %xmm2, %xmm6 movlpd 48 * SIZE(BB), %xmm2 addsd %xmm1, %xmm7 movlpd 5 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 addsd %xmm3, %xmm4 movlpd 42 * SIZE(BB), %xmm3 mulsd %xmm1, %xmm3 addsd %xmm3, %xmm5 movlpd 44 * SIZE(BB), %xmm3 mulsd %xmm1, %xmm3 mulsd 46 * SIZE(BB), %xmm1 addsd %xmm3, %xmm6 movlpd 56 * SIZE(BB), %xmm3 addsd %xmm1, %xmm7 movlpd 6 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm2 addsd %xmm2, %xmm4 movlpd 50 * SIZE(BB), %xmm2 mulsd %xmm1, %xmm2 addsd %xmm2, %xmm5 movlpd 52 * SIZE(BB), %xmm2 mulsd %xmm1, %xmm2 mulsd 54 * SIZE(BB), %xmm1 addsd %xmm2, %xmm6 movlpd 64 * SIZE(BB), %xmm2 addsd %xmm1, %xmm7 movlpd 7 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 addsd %xmm3, %xmm4 movlpd 58 * SIZE(BB), %xmm3 mulsd %xmm1, %xmm3 addsd %xmm3, %xmm5 movlpd 60 * SIZE(BB), %xmm3 mulsd %xmm1, %xmm3 mulsd 62 * SIZE(BB), %xmm1 addsd %xmm3, %xmm6 movlpd 72 * SIZE(BB), %xmm3 addl $64 * SIZE, BB addsd %xmm1, %xmm7 movlpd 12 * SIZE(AA), %xmm1 addl $8 * SIZE, AA decl %eax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L28 .L26: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 movlpd 2 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm5 movlpd 4 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm2, %xmm6 movlpd 8 * SIZE(BB), %xmm2 addsd %xmm0, %xmm7 movlpd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $4, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax addl %eax, AA leal (B, %eax, 4), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) unpcklpd %xmm5, %xmm4 unpcklpd %xmm7, %xmm6 movapd 0 * SIZE(B), %xmm2 movapd 2 * SIZE(B), %xmm5 subpd %xmm4, %xmm2 subpd %xmm6, %xmm5 #else movlpd 0 * SIZE(AA), %xmm0 movlpd 1 * SIZE(AA), %xmm1 movlpd 2 * SIZE(AA), %xmm2 movlpd 3 * SIZE(AA), %xmm3 subsd %xmm4, %xmm0 subsd %xmm5, %xmm1 subsd %xmm6, %xmm2 subsd %xmm7, %xmm3 #endif #ifdef LN movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 #endif #ifdef LT movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm4 mulsd %xmm4, %xmm0 movlpd 1 * SIZE(B), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movlpd 2 * SIZE(B), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm2 movlpd 3 * SIZE(B), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm3 movlpd 5 * SIZE(B), %xmm4 mulsd %xmm4, %xmm1 movlpd 6 * SIZE(B), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm2 movlpd 7 * SIZE(B), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm3 movlpd 10 * SIZE(B), %xmm4 mulsd %xmm4, %xmm2 movlpd 11 * SIZE(B), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm3 movlpd 15 * SIZE(B), %xmm4 mulsd %xmm4, %xmm3 #endif #ifdef RT movlpd 15 * SIZE(B), %xmm4 mulsd %xmm4, %xmm3 movlpd 14 * SIZE(B), %xmm4 
mulsd %xmm3, %xmm4 subsd %xmm4, %xmm2 movlpd 13 * SIZE(B), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm1 movlpd 12 * SIZE(B), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm0 movlpd 10 * SIZE(B), %xmm4 mulsd %xmm4, %xmm2 movlpd 9 * SIZE(B), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm1 movlpd 8 * SIZE(B), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm0 movlpd 5 * SIZE(B), %xmm4 mulsd %xmm4, %xmm1 movlpd 4 * SIZE(B), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movlpd 0 * SIZE(B), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movapd %xmm5, 2 * SIZE(B) movlpd %xmm2, 0 * SIZE(BB) movlpd %xmm2, 1 * SIZE(BB) movhpd %xmm2, 2 * SIZE(BB) movhpd %xmm2, 3 * SIZE(BB) movlpd %xmm5, 4 * SIZE(BB) movlpd %xmm5, 5 * SIZE(BB) movhpd %xmm5, 6 * SIZE(BB) movhpd %xmm5, 7 * SIZE(BB) #else movlpd %xmm0, 0 * SIZE(AA) movlpd %xmm1, 1 * SIZE(AA) movlpd %xmm2, 2 * SIZE(AA) movlpd %xmm3, 3 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) movlpd %xmm5, 0 * SIZE(CO1, LDC, 2) movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) #else movlpd %xmm0, 0 * SIZE(CO1) movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) movlpd %xmm2, 0 * SIZE(CO1, LDC, 2) movlpd %xmm3, 0 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA,%eax, SIZE), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L29: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 4), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 4), B #endif #ifdef RN addl $4, KK #endif #ifdef RT subl $4, KK #endif decl J # j -- jg .L01 ALIGN_4 .L30: testl $2, N je .L60 #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG leal (, %eax, SIZE), %eax leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L35 ALIGN_4 .L32: #define COPYPREFETCH 40 prefetchnta (COPYPREFETCH) * SIZE(B) movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq 2 * SIZE(B), %mm2 movq 3 * SIZE(B), %mm3 movq 4 * SIZE(B), %mm4 movq 5 * SIZE(B), %mm5 movq 6 * SIZE(B), %mm6 movq 7 * SIZE(B), %mm7 movq %mm0, 0 * SIZE(BB) movq %mm0, 1 * SIZE(BB) movq %mm1, 2 * SIZE(BB) movq %mm1, 3 * SIZE(BB) movq %mm2, 4 * SIZE(BB) movq %mm2, 5 * SIZE(BB) movq %mm3, 6 * SIZE(BB) movq %mm3, 7 * SIZE(BB) movq %mm4, 8 * SIZE(BB) movq %mm4, 9 * SIZE(BB) movq %mm5, 10 * SIZE(BB) movq %mm5, 11 * SIZE(BB) movq %mm6, 12 * SIZE(BB) movq %mm6, 13 * SIZE(BB) movq %mm7, 14 * SIZE(BB) movq %mm7, 15 * SIZE(BB) addl $ 8 * SIZE, B addl $16 * SIZE, BB decl %eax jne .L32 ALIGN_2 .L35: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L40 ALIGN_2 .L36: movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq %mm0, 0 * SIZE(BB) movq %mm0, 1 * SIZE(BB) movq %mm1, 2 * SIZE(BB) movq %mm1, 3 * SIZE(BB) addl $2 * SIZE, B addl $4 * SIZE, BB decl %eax jne .L36 ALIGN_4 .L40: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 2), %eax #ifdef 
RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L50 ALIGN_4 .L41: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movapd 0 * SIZE(AA), %xmm0 movapd 8 * SIZE(AA), %xmm1 movapd 0 * SIZE(BB), %xmm2 movapd 8 * SIZE(BB), %xmm3 #ifdef LN prefetchw -2 * SIZE(CO1) prefetchw -2 * SIZE(CO1, LDC) #else prefetchw 1 * SIZE(CO1) prefetchw 1 * SIZE(CO1, LDC) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L45 ALIGN_4 .L42: mulpd %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 10 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 mulpd 18 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 20 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movapd 10 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 22 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 32 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movapd 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 26 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 28 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 14 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 30 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 40 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd 0 * SIZE(B), %xmm2 movapd 2 * SIZE(B), %xmm3 subpd %xmm4, %xmm2 subpd %xmm0, %xmm3 #else movapd 0 * SIZE(AA), %xmm0 movapd 2 * SIZE(AA), %xmm1 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 #endif #ifdef LN movlpd 3 * SIZE(AA), %xmm4 movhpd 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 movlpd 2 * SIZE(AA), %xmm4 movhpd 2 * SIZE(AA), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 #endif #ifdef LT movlpd 0 * SIZE(AA), %xmm4 movhpd 
0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 movlpd 1 * SIZE(AA), %xmm4 movhpd 1 * SIZE(AA), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 movlpd 3 * SIZE(AA), %xmm4 movhpd 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 movlpd 1 * SIZE(B), %xmm4 movhpd 1 * SIZE(B), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movlpd 3 * SIZE(B), %xmm4 movhpd 3 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 #endif #ifdef RT movlpd 3 * SIZE(B), %xmm4 movhpd 3 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 movlpd 2 * SIZE(B), %xmm4 movhpd 2 * SIZE(B), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movlpd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movlpd %xmm2, 0 * SIZE(BB) movlpd %xmm2, 1 * SIZE(BB) movhpd %xmm2, 2 * SIZE(BB) movhpd %xmm2, 3 * SIZE(BB) movlpd %xmm3, 4 * SIZE(BB) movlpd %xmm3, 5 * SIZE(BB) movhpd %xmm3, 6 * SIZE(BB) movhpd %xmm3, 7 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) movapd %xmm1, 2 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(CO1) movlpd %xmm3, 1 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) #else movlpd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L41 ALIGN_4 .L50: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L59 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movlpd 0 * SIZE(AA), %xmm0 movlpd 4 * SIZE(AA), %xmm1 movlpd 0 * SIZE(BB), %xmm2 movlpd 8 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L55 ALIGN_4 .L52: mulsd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movlpd 4 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movlpd 1 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm2 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm2, %xmm6 movlpd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm7 movlpd 2 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd 10 * SIZE(BB), %xmm0 addsd %xmm3, %xmm4 movlpd 12 * SIZE(BB), %xmm3 addsd %xmm0, %xmm5 movlpd 3 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd 14 * SIZE(BB), %xmm0 addsd %xmm3, %xmm6 movlpd 24 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 mulsd %xmm1, %xmm2 mulsd 18 * SIZE(BB), %xmm1 addsd %xmm2, %xmm4 movlpd 20 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 movlpd 5 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm2 mulsd 22 * SIZE(BB), %xmm1 addsd %xmm2, %xmm6 movlpd 32 * SIZE(BB), %xmm2 addsd %xmm1, %xmm7 movlpd 6 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 mulsd 26 * SIZE(BB), %xmm1 addsd %xmm3, %xmm4 movlpd 28 * SIZE(BB), %xmm3 addsd %xmm1, %xmm5 movlpd 7 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 mulsd 30 * SIZE(BB), %xmm1 addsd %xmm3, %xmm6 movlpd 40 * SIZE(BB), %xmm3 addsd %xmm1, %xmm7 movlpd 12 * 
SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L58 .L56: mulsd %xmm0, %xmm2 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movlpd 4 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movlpd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: addsd %xmm6, %xmm4 addsd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax addl %eax, AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) unpcklpd %xmm5, %xmm4 movapd 0 * SIZE(B), %xmm2 subpd %xmm4, %xmm2 #else movlpd 0 * SIZE(AA), %xmm0 movlpd 1 * SIZE(AA), %xmm1 subsd %xmm4, %xmm0 subsd %xmm5, %xmm1 #endif #ifdef LN movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 #endif #ifdef LT movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm4 mulsd %xmm4, %xmm0 movlpd 1 * SIZE(B), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movlpd 3 * SIZE(B), %xmm4 mulsd %xmm4, %xmm1 #endif #ifdef RT movlpd 3 * SIZE(B), %xmm4 mulsd %xmm4, %xmm1 movlpd 2 * SIZE(B), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movlpd 0 * SIZE(B), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movlpd %xmm2, 0 * SIZE(BB) movlpd %xmm2, 1 * SIZE(BB) movhpd %xmm2, 2 * SIZE(BB) movhpd %xmm2, 3 * SIZE(BB) #else movlpd %xmm0, 0 * SIZE(AA) movlpd %xmm1, 1 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) #else movlpd %xmm0, 0 * SIZE(CO1) movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA,%eax, SIZE), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L59: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 2), B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif ALIGN_4 .L60: testl $1, N je .L999 #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG leal (, %eax, SIZE), %eax leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax jle .L65 ALIGN_4 .L62: #define COPYPREFETCH 40 prefetchnta (COPYPREFETCH) * SIZE(B) movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq 2 * SIZE(B), %mm2 movq 3 * SIZE(B), %mm3 movq 4 * SIZE(B), %mm4 movq 5 * SIZE(B), %mm5 movq 6 * SIZE(B), %mm6 movq 7 * SIZE(B), %mm7 movq %mm0, 0 * SIZE(BB) movq %mm0, 1 * SIZE(BB) movq %mm1, 2 * SIZE(BB) movq %mm1, 3 * SIZE(BB) movq %mm2, 4 * SIZE(BB) movq %mm2, 5 * SIZE(BB) movq %mm3, 6 * SIZE(BB) movq %mm3, 7 * SIZE(BB) movq %mm4, 8 * SIZE(BB) movq %mm4, 9 * SIZE(BB) movq %mm5, 10 * SIZE(BB) movq %mm5, 11 * SIZE(BB) movq %mm6, 12 * SIZE(BB) movq %mm6, 13 * SIZE(BB) movq %mm7, 14 * SIZE(BB) movq %mm7, 15 
* SIZE(BB) addl $ 8 * SIZE, B addl $16 * SIZE, BB decl %eax jne .L62 ALIGN_2 .L65: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH jle .L70 ALIGN_2 .L66: movq 0 * SIZE(B), %mm0 movq %mm0, 0 * SIZE(BB) movq %mm0, 1 * SIZE(BB) addl $1 * SIZE, B addl $2 * SIZE, BB decl %eax jne .L66 ALIGN_4 .L70: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L80 ALIGN_4 .L71: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movapd 0 * SIZE(AA), %xmm0 movapd 8 * SIZE(AA), %xmm1 movapd 0 * SIZE(BB), %xmm2 movapd 8 * SIZE(BB), %xmm3 #ifdef LN prefetchw -2 * SIZE(CO1) #else prefetchw 1 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L75 ALIGN_4 .L72: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) movapd 16 * SIZE(BB), %xmm2 movapd 2 * SIZE(AA), %xmm0 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm0, %xmm4 movapd 4 * SIZE(AA), %xmm0 mulpd 4 * SIZE(BB), %xmm0 addpd %xmm0, %xmm4 movapd 6 * SIZE(AA), %xmm0 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm0, %xmm4 movapd 16 * SIZE(AA), %xmm0 prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movapd 24 * SIZE(BB), %xmm3 movapd 10 * SIZE(AA), %xmm1 mulpd 10 * SIZE(BB), %xmm1 addpd %xmm1, %xmm4 movapd 12 * SIZE(AA), %xmm1 mulpd 12 * SIZE(BB), %xmm1 addpd %xmm1, %xmm4 movapd 14 * SIZE(AA), %xmm1 mulpd 14 * SIZE(BB), %xmm1 addpd %xmm1, %xmm4 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd 2 * SIZE(AA), %xmm0 movapd 2 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm2 subpd %xmm4, %xmm2 #else movapd 0 * SIZE(AA), %xmm0 subpd %xmm4, %xmm0 #endif #ifdef LN movapd %xmm2, %xmm3 unpckhpd %xmm3, %xmm3 movlpd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm3 movlpd 2 * SIZE(AA), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm2 movlpd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm3, %xmm2 #endif #ifdef LT movapd %xmm2, %xmm3 unpckhpd %xmm3, %xmm3 movlpd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movlpd 1 * SIZE(AA), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm3 movlpd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm3 unpcklpd %xmm3, %xmm2 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef RT movlpd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movlpd %xmm2, 0 * SIZE(BB) movlpd %xmm2, 1 * SIZE(BB) movhpd %xmm2, 2 * SIZE(BB) movhpd %xmm2, 3 * SIZE(BB) #else movapd %xmm0, 0 * 
SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) #else movlpd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L71 ALIGN_4 .L80: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L99 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movlpd 0 * SIZE(AA), %xmm0 movlpd 4 * SIZE(AA), %xmm1 movlpd 0 * SIZE(BB), %xmm2 movlpd 8 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L85 ALIGN_4 .L82: mulsd %xmm0, %xmm2 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) movlpd 1 * SIZE(AA), %xmm0 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movlpd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movlpd 2 * SIZE(AA), %xmm0 mulsd 4 * SIZE(BB), %xmm0 addsd %xmm0, %xmm6 movlpd 3 * SIZE(AA), %xmm0 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 mulsd %xmm1, %xmm3 movlpd 5 * SIZE(AA), %xmm1 mulsd 10 * SIZE(BB), %xmm1 addsd %xmm3, %xmm4 movlpd 24 * SIZE(BB), %xmm3 addsd %xmm1, %xmm5 movlpd 6 * SIZE(AA), %xmm1 mulsd 12 * SIZE(BB), %xmm1 addsd %xmm1, %xmm6 movlpd 7 * SIZE(AA), %xmm1 mulsd 14 * SIZE(BB), %xmm1 addsd %xmm1, %xmm7 movlpd 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L82 ALIGN_4 .L85: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L88 .L86: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 movlpd 2 * SIZE(BB), %xmm2 movlpd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L86 ALIGN_4 .L88: addsd %xmm5, %xmm4 addsd %xmm7, %xmm6 addsd %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax addl %eax, AA addl %eax, B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movlpd 0 * SIZE(B), %xmm2 subsd %xmm4, %xmm2 #else movlpd 0 * SIZE(AA), %xmm0 subsd %xmm4, %xmm0 #endif #ifdef LN movlpd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 #endif #ifdef LT movlpd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef RT movlpd 0 * SIZE(B), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(B) movlpd %xmm2, 0 * SIZE(BB) movlpd %xmm2, 1 * SIZE(BB) #else movlpd %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(CO1) #else movlpd %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA,%eax, SIZE), AA #ifdef LT addl $1 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L99: #ifdef LN movl K, 
%eax leal (B, %eax, SIZE), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (B,%eax, SIZE), B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_LT_2x4_sse3.S000066400000000000000000001076141313527062700215640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define ARG_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define ARG_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #ifdef PENTIUM4 #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #ifdef PENTIUMM #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define CO1 %esi PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC movl OFFSET, %eax #ifdef RN negl %eax #endif movl %eax, KK leal (, LDC, SIZE), LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax sarl $2, %eax movl %eax, J jle .L30 ALIGN_2 .L10: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 4), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movddup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movddup 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 leal (LDC, LDC, 2), %eax #ifdef LN prefetchnta -2 * SIZE(CO1) prefetchnta -2 * SIZE(CO1, LDC, 1) prefetchnta -2 * SIZE(CO1, LDC, 2) prefetchnta -2 * SIZE(CO1, %eax, 1) #else prefetchnta 2 * SIZE(CO1) prefetchnta 2 * SIZE(CO1, LDC, 1) prefetchnta 2 * SIZE(CO1, LDC, 2) prefetchnta 2 * SIZE(CO1, %eax, 1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L12: mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 5 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movddup 6 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 7 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 16 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm4 movddup 9 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm5 movddup 10 * SIZE(BB), %xmm3 mulpd %xmm0, 
%xmm3 addpd %xmm3, %xmm6 movddup 11 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 movapd 6 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movddup 12 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm4 movddup 13 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm5 movddup 14 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm6 movddup 15 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 movapd 16 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movddup 24 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movddup 17 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movddup 18 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm6 movddup 19 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 movapd 10 * SIZE(AA), %xmm1 addpd %xmm2, %xmm7 movddup 20 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movddup 21 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movddup 22 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm6 movddup 23 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 movapd 12 * SIZE(AA), %xmm1 addpd %xmm2, %xmm7 movddup 32 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 25 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movddup 26 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 27 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 14 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movddup 28 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 29 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movddup 30 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 31 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 24 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movddup 40 * SIZE(BB), %xmm3 addl $32 * SIZE, BB addl $16 * SIZE, AA decl %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $4, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd %xmm6, %xmm1 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm1 movapd 0 * SIZE(BB), %xmm2 movapd 2 * SIZE(BB), %xmm5 movapd 4 * SIZE(BB), %xmm3 movapd 6 * SIZE(BB), %xmm7 subpd %xmm4, %xmm2 subpd %xmm6, %xmm5 subpd %xmm0, %xmm3 subpd %xmm1, %xmm7 #else movapd 0 * SIZE(AA), %xmm0 movapd 2 * SIZE(AA), %xmm1 movapd 4 * SIZE(AA), %xmm2 movapd 6 * SIZE(AA), %xmm3 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 subpd %xmm6, %xmm2 subpd %xmm7, %xmm3 #endif #ifdef LN movddup 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 mulpd %xmm4, %xmm7 movddup 2 * SIZE(AA), %xmm4 movapd %xmm4, %xmm6 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 mulpd %xmm7, %xmm6 subpd %xmm6, %xmm5 movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 #endif #ifdef LT movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 movddup 1 * SIZE(AA), %xmm4 movapd %xmm4, %xmm6 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 mulpd %xmm5, %xmm6 subpd %xmm6, %xmm7 movddup 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 mulpd %xmm4, %xmm7 #endif #ifdef RN movddup 0 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 movddup 1 * 
SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movddup 2 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm2 movddup 3 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm3 movddup 5 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 movddup 6 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm2 movddup 7 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm3 movddup 10 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm2 movddup 11 * SIZE(BB), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 movddup 15 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm3 #endif #ifdef RT movddup 15 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm3 movddup 14 * SIZE(BB), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 movddup 13 * SIZE(BB), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm1 movddup 12 * SIZE(BB), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm0 movddup 10 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm2 movddup 9 * SIZE(BB), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm1 movddup 8 * SIZE(BB), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm0 movddup 5 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 movddup 4 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movddup 0 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(BB) movapd %xmm5, 2 * SIZE(BB) movapd %xmm3, 4 * SIZE(BB) movapd %xmm7, 6 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) movapd %xmm1, 2 * SIZE(AA) movapd %xmm2, 4 * SIZE(AA) movapd %xmm3, 6 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movsd %xmm3, 1 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) movsd %xmm5, 0 * SIZE(CO1, LDC, 2) movsd %xmm7, 1 * SIZE(CO1, LDC, 2) movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO1, %eax, 1) movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L11 ALIGN_4 .L20: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L29 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movddup 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movapd 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movapd 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $4, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movddup 1 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movddup 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 10 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movddup 3 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 24 * 
SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movddup 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 18 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 20 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movddup 5 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 22 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 32 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movddup 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 26 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 28 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movddup 7 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 30 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 40 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movddup 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd 34 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 36 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movddup 9 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 38 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 48 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movddup 10 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 42 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 44 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movddup 11 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 46 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 56 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movddup 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 50 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 52 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movddup 13 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 54 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 64 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movddup 14 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 58 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 60 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movddup 15 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 62 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 72 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movddup 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $15, %eax # if (k & 1) BRANCH je .L28 .L26: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movddup 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $4, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BB), %xmm0 movapd 2 * SIZE(BB), %xmm1 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 #else movapd 0 * SIZE(AA), %xmm1 movapd 2 * SIZE(AA), %xmm3 subpd %xmm4, %xmm1 subpd %xmm5, %xmm3 movapd %xmm1, %xmm0 unpckhpd %xmm1, %xmm1 movapd %xmm3, %xmm2 unpckhpd %xmm3, %xmm3 #endif #ifdef LN movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #ifdef LT movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #ifdef RN movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movsd 2 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm2 movsd 3 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm3 movsd 5 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd 6 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm2 movsd 7 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm3 movsd 10 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm2 movsd 11 * SIZE(BB), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm3 movsd 15 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm3 #endif #ifdef RT movsd 15 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm3 movsd 14 * SIZE(BB), %xmm4 mulsd %xmm3, 
%xmm4 subsd %xmm4, %xmm2 movsd 13 * SIZE(BB), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm1 movsd 12 * SIZE(BB), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm0 movsd 10 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm2 movsd 9 * SIZE(BB), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm1 movsd 8 * SIZE(BB), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm0 movsd 5 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd 4 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm0, 0 * SIZE(BB) movapd %xmm1, 2 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) movsd %xmm1, 1 * SIZE(AA) movsd %xmm2, 2 * SIZE(AA) movsd %xmm3, 3 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) movsd %xmm1, 0 * SIZE(CO1, LDC, 2) movhpd %xmm1, 0 * SIZE(CO1, %eax, 1) #else movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L29: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 4), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $4, KK #endif #ifdef RT subl $4, KK #endif decl J # j -- jg .L10 ALIGN_4 .L30: testl $2, N je .L60 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L50 ALIGN_4 .L41: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movddup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movddup 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifdef LN prefetchnta -2 * SIZE(CO1) prefetchnta -2 * SIZE(CO1, LDC, 1) #else prefetchnta 2 * SIZE(CO1) prefetchnta 2 * SIZE(CO1, LDC, 1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 5 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 6 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movddup 6 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 7 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 16 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 16 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 9 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 
movapd 10 * SIZE(AA), %xmm1 addpd %xmm3, %xmm5 movddup 10 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 11 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 12 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movddup 12 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 13 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 14 * SIZE(AA), %xmm1 addpd %xmm3, %xmm5 movddup 14 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 15 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 24 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movddup 24 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd 0 * SIZE(BB), %xmm2 movapd 2 * SIZE(BB), %xmm3 subpd %xmm4, %xmm2 subpd %xmm0, %xmm3 #else movapd 0 * SIZE(AA), %xmm0 movapd 2 * SIZE(AA), %xmm1 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 #endif #ifdef LN movddup 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 movddup 2 * SIZE(AA), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 #endif #ifdef LT movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 movddup 1 * SIZE(AA), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 movddup 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 #endif #ifdef RN movddup 0 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 movddup 1 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movddup 3 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 #endif #ifdef RT movddup 3 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 movddup 2 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movddup 0 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(BB) movapd %xmm3, 2 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) movapd %xmm1, 2 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movsd %xmm3, 1 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L41 ALIGN_4 .L50: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L59 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movddup 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movapd 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movapd 8 * SIZE(BB), %xmm3 pxor 
%xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $4, %eax je .L55 ALIGN_4 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 movddup 1 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 mulpd 2 * SIZE(BB), %xmm0 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movddup 2 * SIZE(AA), %xmm0 mulpd 4 * SIZE(BB), %xmm0 addpd %xmm0, %xmm6 movddup 3 * SIZE(AA), %xmm0 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm0, %xmm7 movddup 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movddup 5 * SIZE(AA), %xmm0 addpd %xmm3, %xmm4 mulpd 10 * SIZE(BB), %xmm0 movapd 24 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movddup 6 * SIZE(AA), %xmm0 mulpd 12 * SIZE(BB), %xmm0 addpd %xmm0, %xmm6 movddup 7 * SIZE(AA), %xmm0 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm0, %xmm7 movddup 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 movddup 9 * SIZE(AA), %xmm1 addpd %xmm2, %xmm4 mulpd 18 * SIZE(BB), %xmm1 movapd 32 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movddup 10 * SIZE(AA), %xmm1 mulpd 20 * SIZE(BB), %xmm1 addpd %xmm1, %xmm6 movddup 11 * SIZE(AA), %xmm1 mulpd 22 * SIZE(BB), %xmm1 addpd %xmm1, %xmm7 movddup 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 movddup 13 * SIZE(AA), %xmm1 addpd %xmm3, %xmm4 mulpd 26 * SIZE(BB), %xmm1 movapd 40 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movddup 14 * SIZE(AA), %xmm1 mulpd 28 * SIZE(BB), %xmm1 addpd %xmm1, %xmm6 movddup 15 * SIZE(AA), %xmm1 mulpd 30 * SIZE(BB), %xmm1 addpd %xmm1, %xmm7 movddup 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $15, %eax # if (k & 1) BRANCH je .L58 .L56: mulpd %xmm0, %xmm2 movddup 1 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 movapd 2 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 addpd %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax addl %eax, AA leal (B, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BB), %xmm0 subpd %xmm4, %xmm0 #else movapd 0 * SIZE(AA), %xmm1 subpd %xmm4, %xmm1 movapd %xmm1, %xmm0 unpckhpd %xmm1, %xmm1 #endif #ifdef LN movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef LT movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef RN movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movsd 3 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 #endif #ifdef RT movsd 3 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd 2 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm0, 0 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) movsd %xmm1, 1 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) #else movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L59: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif ALIGN_4 .L60: testl $1, 
N je .L999 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L80 ALIGN_4 .L71: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movddup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movddup 4 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifdef LN prefetchnta -2 * SIZE(CO1) #else prefetchnta 2 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm2, %xmm0 movddup 1 * SIZE(BB), %xmm2 addpd %xmm0, %xmm4 movapd 16 * SIZE(AA), %xmm0 mulpd 2 * SIZE(AA), %xmm2 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd 4 * SIZE(AA), %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd 6 * SIZE(AA), %xmm2 addpd %xmm2, %xmm7 movddup 8 * SIZE(BB), %xmm2 mulpd %xmm3, %xmm1 movddup 5 * SIZE(BB), %xmm3 addpd %xmm1, %xmm4 movapd 24 * SIZE(AA), %xmm1 mulpd 10 * SIZE(AA), %xmm3 addpd %xmm3, %xmm5 movddup 6 * SIZE(BB), %xmm3 mulpd 12 * SIZE(AA), %xmm3 addpd %xmm3, %xmm6 movddup 7 * SIZE(BB), %xmm3 mulpd 14 * SIZE(AA), %xmm3 addpd %xmm3, %xmm7 movddup 12 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $ 8 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: mulpd %xmm2, %xmm0 movddup 1 * SIZE(BB), %xmm2 addpd %xmm0, %xmm4 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 addpd %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BB), %xmm1 subpd %xmm4, %xmm1 movapd %xmm1, %xmm0 unpckhpd %xmm1, %xmm1 #else movapd 0 * SIZE(AA), %xmm0 subpd %xmm4, %xmm0 #endif #ifdef LN movsd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 movsd 2 * SIZE(AA), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(AA), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movsd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 #endif #ifdef RN movddup 0 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef RT movddup 0 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BB) movsd %xmm1, 1 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 1 * SIZE(CO1) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA addl %eax, BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif 
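/* Note added for clarity (not in the upstream source): KK appears to track how far the triangular solve has advanced. The LN/LT branches above step it by the two rows of A just finished, and the RT branch below moves AORIG past the corresponding 2*K panel of A; the RN/RT cases instead adjust KK by the column count once per column block. */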
#ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L71 ALIGN_4 .L80: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L89 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movsd 0 * SIZE(AA), %xmm0 movhpd 1 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd 8 * SIZE(AA), %xmm1 movhpd 9 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movsd 0 * SIZE(BB), %xmm2 movhpd 1 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movsd 8 * SIZE(BB), %xmm3 movhpd 9 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $4, %eax je .L85 ALIGN_4 .L82: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 mulpd 2 * SIZE(BB), %xmm0 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 4 * SIZE(AA), %xmm0 mulpd 4 * SIZE(BB), %xmm0 addpd %xmm0, %xmm6 movapd 6 * SIZE(AA), %xmm0 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm3 movapd 10 * SIZE(AA), %xmm1 addpd %xmm3, %xmm4 mulpd 10 * SIZE(BB), %xmm1 movapd 24 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 12 * SIZE(AA), %xmm1 mulpd 12 * SIZE(BB), %xmm1 addpd %xmm1, %xmm6 movapd 14 * SIZE(AA), %xmm1 mulpd 14 * SIZE(BB), %xmm1 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L82 ALIGN_4 .L85: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $15, %eax # if (k & 1) BRANCH je .L88 .L86: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 1 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L86 ALIGN_4 .L88: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 addpd %xmm6, %xmm4 haddpd %xmm4, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax addl %eax, AA leal (B, %eax, 1), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BB), %xmm0 subsd %xmm4, %xmm0 #else movsd 0 * SIZE(AA), %xmm0 subsd %xmm4, %xmm0 #endif #ifdef LN movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef RN movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef RT movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) #else movsd %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax addl %eax, AA addl %eax, BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L89: #ifdef LN movl K, %eax leal (B, %eax, SIZE), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_LT_4x2_core2.S000066400000000000000000001126651313527062700217230ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. 
*/ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if !defined(HAVE_SSE2) || !defined(HAVE_MMX) #error You have to check your configuration. 
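/* Editorial note (not in the upstream source): the core2 kernel below stages its stack arguments through MMX registers (movd / EMMS) and performs the packing and arithmetic in SSE2 xmm registers, which is why both HAVE_SSE2 and HAVE_MMX must be set by the build configuration. */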
#endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA 16 + STACK + ARGS(%esi) #define STACK_A 24 + STACK + ARGS(%esi) #define STACK_B 28 + STACK + ARGS(%esi) #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define AORIG 56(%esp) #define BORIG 60(%esp) #define BUFFER 128(%esp) #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #define B %edi #define AA %edx #define BB %ecx #define LDC %ebp #define CO1 %esi PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp addl $STACK_OFFSET, %esp STACK_TOUCHING movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 movd STACK_A, %mm2 movl STACK_B, B movd STACK_C, %mm3 movl STACK_LDC, LDC movd STACK_OFFT, %mm4 movd %mm1, K movl %eax, N movd %mm0, M movd %mm2, A movd %mm3, C movl %esi, OLD_STACK movd %mm4, OFFSET movd %mm4, KK subl $-16 * SIZE, A subl $-16 * SIZE, B sall $BASE_SHIFT, LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax sarl $1, %eax movl %eax, J jle .L100 ALIGN_2 .L01: /* Copying to Sub Buffer */ #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal 16 * SIZE + BUFFER, BB #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG leal (, %eax, SIZE), %eax leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L03 ALIGN_2 .L02: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -12 * SIZE(B), %xmm4 movddup -11 * SIZE(B), %xmm5 movddup -10 * SIZE(B), %xmm6 movddup -9 * SIZE(B), %xmm7 movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) movapd %xmm2, -12 * SIZE(BB) movapd %xmm3, -10 * SIZE(BB) movapd %xmm4, -8 * SIZE(BB) movapd %xmm5, -6 * SIZE(BB) movapd %xmm6, -4 * SIZE(BB) movapd %xmm7, -2 * SIZE(BB) addl $ 8 * SIZE, B addl $16 * SIZE, %ecx decl %eax jne .L02 ALIGN_2 .L03: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L05 ALIGN_4 .L04: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) addl $2 * SIZE, B addl $4 * SIZE, %ecx decl %eax jne .L04 ALIGN_4 .L05: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 # coffset = c #ifndef RT addl %eax, C #endif movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L30 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif leal 16 * SIZE + BUFFER, 
BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm3 pxor %xmm6, %xmm6 #ifdef LN prefetcht2 -3 * SIZE(CO1) pxor %xmm7, %xmm7 prefetcht2 -3 * SIZE(CO1, LDC) #else prefetcht2 3 * SIZE(CO1) pxor %xmm7, %xmm7 prefetcht2 3 * SIZE(CO1, LDC) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L12: movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 addpd %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd -12 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd -12 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd -10 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 addpd %xmm0, %xmm5 movapd -10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd 0 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd -8 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 addpd %xmm1, %xmm4 movapd -6 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movapd -6 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 mulpd %xmm3, %xmm1 movapd -4 * SIZE(AA), %xmm3 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd -4 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 addpd %xmm1, %xmm4 movapd -2 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movapd -2 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 mulpd %xmm3, %xmm1 movapd 8 * SIZE(AA), %xmm3 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd 0 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd 2 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd 4 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd 6 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 addpd %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd 16 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd 8 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 addpd %xmm1, %xmm4 movapd 10 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movapd 10 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 mulpd %xmm3, %xmm1 addpd %xmm2, %xmm6 movapd 12 * SIZE(AA), %xmm3 addpd %xmm1, %xmm7 movapd 12 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 addpd %xmm1, %xmm4 movapd 14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movapd 14 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 mulpd %xmm3, %xmm1 subl $-32 * SIZE, BB movapd 24 * SIZE(AA), %xmm3 subl $-32 * SIZE, AA addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd -16 * SIZE(BB), %xmm1 decl %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH je .L18 ALIGN_4 .L16: movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 addpd %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd -12 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd -12 * SIZE(BB), %xmm1 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB leal (, %eax, SIZE), %eax leal 
(AA, %eax, 4), AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd %xmm6, %xmm1 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm1 movapd -16 * SIZE(B), %xmm2 movapd -14 * SIZE(B), %xmm3 movapd -12 * SIZE(B), %xmm5 movapd -10 * SIZE(B), %xmm7 subpd %xmm4, %xmm2 subpd %xmm0, %xmm3 subpd %xmm6, %xmm5 subpd %xmm1, %xmm7 #else movapd -16 * SIZE(AA), %xmm0 movapd -14 * SIZE(AA), %xmm1 movapd -12 * SIZE(AA), %xmm2 movapd -10 * SIZE(AA), %xmm3 subpd %xmm4, %xmm0 subpd %xmm6, %xmm1 subpd %xmm5, %xmm2 subpd %xmm7, %xmm3 #endif #ifdef LN movddup -1 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm7 movddup -2 * SIZE(AA), %xmm0 mulpd %xmm7, %xmm0 subpd %xmm0, %xmm5 movddup -3 * SIZE(AA), %xmm0 mulpd %xmm7, %xmm0 subpd %xmm0, %xmm3 movddup -4 * SIZE(AA), %xmm0 mulpd %xmm7, %xmm0 subpd %xmm0, %xmm2 movddup -6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm5 movddup -7 * SIZE(AA), %xmm0 mulpd %xmm5, %xmm0 subpd %xmm0, %xmm3 movddup -8 * SIZE(AA), %xmm0 mulpd %xmm5, %xmm0 subpd %xmm0, %xmm2 movddup -11 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movddup -12 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm2 movddup -16 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef LT movddup -16 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 movddup -15 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm3 movddup -14 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm5 movddup -13 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm7 movddup -11 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movddup -10 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm5 movddup -9 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm7 movddup -6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm5 movddup -5 * SIZE(AA), %xmm0 mulpd %xmm5, %xmm0 subpd %xmm0, %xmm7 movddup -1 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm7 #endif #ifdef RN movddup -16 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 movddup -15 * SIZE(B), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm2 mulpd %xmm1, %xmm5 subpd %xmm5, %xmm3 movddup -13 * SIZE(B), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm3 #endif #ifdef RT movddup -13 * SIZE(B), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm3 movddup -14 * SIZE(B), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm0 mulpd %xmm3, %xmm5 subpd %xmm5, %xmm1 movddup -16 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movsd %xmm3, 1 * SIZE(CO1) movsd %xmm5, 2 * SIZE(CO1) movsd %xmm7, 3 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC) movhpd %xmm3, 1 * SIZE(CO1, LDC) movhpd %xmm5, 2 * SIZE(CO1, LDC) movhpd %xmm7, 3 * SIZE(CO1, LDC) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO1, LDC) movhpd %xmm2, 1 * SIZE(CO1, LDC) movsd %xmm3, 2 * SIZE(CO1, LDC) movhpd %xmm3, 3 * SIZE(CO1, LDC) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movapd %xmm2, -16 * SIZE(B) movapd %xmm3, -14 * SIZE(B) movapd %xmm5, -12 * SIZE(B) movapd %xmm7, -10 * SIZE(B) movddup %xmm2, %xmm0 movddup %xmm3, %xmm1 movddup %xmm5, %xmm4 movddup %xmm7, %xmm6 unpckhpd %xmm2, %xmm2 unpckhpd %xmm3, %xmm3 unpckhpd %xmm5, %xmm5 unpckhpd %xmm7, %xmm7 movapd %xmm0, -16 * SIZE(BB) movapd %xmm2, -14 * SIZE(BB) movapd %xmm1, -12 * SIZE(BB) movapd %xmm3, -10 * SIZE(BB) movapd %xmm4, -8 * SIZE(BB) movapd %xmm5, -6 * SIZE(BB) movapd %xmm6, -4 * SIZE(BB) movapd %xmm7, -2 * SIZE(BB) #else 
movapd %xmm0, -16 * SIZE(AA) movapd %xmm1, -14 * SIZE(AA) movapd %xmm2, -12 * SIZE(AA) movapd %xmm3, -10 * SIZE(AA) #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L10 ALIGN_2 .L30: movl M, %ebx testl $2, %ebx jle .L50 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal 16 * SIZE + BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movapd -8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L32 .L31: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BB), %xmm0 addpd %xmm1, %xmm4 movapd -12 * SIZE(BB), %xmm1 addpd %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm1 mulpd -10 * SIZE(BB), %xmm0 addpd %xmm1, %xmm6 movapd 0 * SIZE(BB), %xmm1 addpd %xmm0, %xmm7 movapd -12 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd -6 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd -4 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movapd -10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd -2 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 8 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movapd 0 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm1 mulpd 2 * SIZE(BB), %xmm2 addpd %xmm1, %xmm4 movapd 4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm5 movapd -6 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm1 mulpd 6 * SIZE(BB), %xmm2 addpd %xmm1, %xmm6 movapd 16 * SIZE(BB), %xmm1 addpd %xmm2, %xmm7 movapd -4 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm3 mulpd 10 * SIZE(BB), %xmm2 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm2, %xmm5 movapd -2 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm3 mulpd 14 * SIZE(BB), %xmm2 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm2, %xmm7 movapd 8 * SIZE(AA), %xmm2 subl $-16 * SIZE, AA addl $ 32 * SIZE, BB decl %eax jne .L31 .L32: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L34 .L33: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BB), %xmm0 addpd %xmm1, %xmm4 movapd -12 * SIZE(BB), %xmm1 addpd %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L33 ALIGN_4 .L34: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd -16 * SIZE(B), %xmm2 movapd -14 * SIZE(B), %xmm3 subpd %xmm4, %xmm2 subpd %xmm0, %xmm3 #else movapd -16 * SIZE(AA), %xmm0 movapd -14 * SIZE(AA), %xmm1 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 #endif #ifdef LN movddup -13 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movddup -14 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm2 movddup -16 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef LT movddup -16 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 movddup -15 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, 
%xmm3 movddup -13 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 #endif #ifdef RN movddup -16 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 movddup -15 * SIZE(B), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movddup -13 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 #endif #ifdef RT movddup -13 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 movddup -14 * SIZE(B), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movddup -16 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movsd %xmm3, 1 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC) movhpd %xmm3, 1 * SIZE(CO1, LDC) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC) movhpd %xmm1, 1 * SIZE(CO1, LDC) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movapd %xmm2, -16 * SIZE(B) movapd %xmm3, -14 * SIZE(B) movddup %xmm2, %xmm0 movddup %xmm3, %xmm1 unpckhpd %xmm2, %xmm2 unpckhpd %xmm3, %xmm3 movapd %xmm0, -16 * SIZE(BB) movapd %xmm2, -14 * SIZE(BB) movapd %xmm1, -12 * SIZE(BB) movapd %xmm3, -10 * SIZE(BB) #else movapd %xmm0, -16 * SIZE(AA) movapd %xmm1, -14 * SIZE(AA) #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L50: movl M, %ebx testl $1, %ebx jle .L99 #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA #endif leal 16 * SIZE + BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movsd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movsd -12 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movsd -8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L52 .L51: mulsd %xmm0, %xmm1 mulsd -14 * SIZE(BB), %xmm0 addsd %xmm1, %xmm4 movsd -12 * SIZE(BB), %xmm1 addsd %xmm0, %xmm5 movsd -15 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm1 mulsd -10 * SIZE(BB), %xmm0 addsd %xmm1, %xmm6 movsd 0 * SIZE(BB), %xmm1 addsd %xmm0, %xmm7 movsd -14 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd -6 * SIZE(BB), %xmm0 addsd %xmm3, %xmm4 movsd -4 * SIZE(BB), %xmm3 addsd %xmm0, %xmm5 movsd -13 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd -2 * SIZE(BB), %xmm0 addsd %xmm3, %xmm6 movsd 8 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movsd -8 * SIZE(AA), %xmm0 mulsd %xmm2, %xmm1 mulsd 2 * SIZE(BB), %xmm2 addsd %xmm1, %xmm4 movsd 4 * SIZE(BB), %xmm1 addsd %xmm2, %xmm5 movsd -11 * SIZE(AA), %xmm2 mulsd %xmm2, %xmm1 mulsd 6 * SIZE(BB), %xmm2 addsd %xmm1, %xmm6 movsd 16 * SIZE(BB), %xmm1 addsd %xmm2, %xmm7 movsd -10 * SIZE(AA), %xmm2 mulsd %xmm2, %xmm3 mulsd 10 * SIZE(BB), %xmm2 addsd %xmm3, %xmm4 movsd 12 * SIZE(BB), %xmm3 addsd %xmm2, %xmm5 movsd -9 * SIZE(AA), %xmm2 mulsd %xmm2, %xmm3 mulsd 14 * SIZE(BB), %xmm2 addsd %xmm3, %xmm6 movsd 24 * SIZE(BB), %xmm3 addsd %xmm2, %xmm7 movsd -4 * SIZE(AA), %xmm2 subl $-8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L51 .L52: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L54 .L53: mulsd %xmm0, %xmm1 mulsd -14 * SIZE(BB), %xmm0 addsd %xmm1, %xmm4 movsd -12 * SIZE(BB), %xmm1 addsd %xmm0, %xmm5 movsd -15 * 
SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax BRANCH jg .L53 ALIGN_4 .L54: addsd %xmm6, %xmm4 addsd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(B), %xmm0 movsd -15 * SIZE(B), %xmm1 #else movsd -16 * SIZE(AA), %xmm0 movsd -15 * SIZE(AA), %xmm1 #endif subsd %xmm4, %xmm0 subsd %xmm5, %xmm1 #if defined(LN) || defined(LT) movsd -16 * SIZE(AA), %xmm2 mulsd %xmm2, %xmm0 mulsd %xmm2, %xmm1 #endif #ifdef RN mulsd -16 * SIZE(B), %xmm0 movsd -15 * SIZE(B), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 mulsd -13 * SIZE(B), %xmm1 #endif #ifdef RT mulsd -13 * SIZE(B), %xmm1 movsd -14 * SIZE(B), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 mulsd -16 * SIZE(B), %xmm0 #endif #ifdef LN subl $1 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC) #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, -16 * SIZE(B) movsd %xmm1, -15 * SIZE(B) movsd %xmm0, -16 * SIZE(BB) movsd %xmm0, -15 * SIZE(BB) movsd %xmm1, -14 * SIZE(BB) movsd %xmm1, -13 * SIZE(BB) #else movsd %xmm0, -16 * SIZE(AA) movsd %xmm1, -15 * SIZE(AA) #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L99: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 2), B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L01 ALIGN_2 .L100: movl N, %eax testl $1, %eax jle .L999 ALIGN_2 .L101: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal 16 * SIZE + BUFFER, BB #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG leal (, %eax, SIZE), %eax leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax jle .L103 ALIGN_4 .L102: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -12 * SIZE(B), %xmm4 movddup -11 * SIZE(B), %xmm5 movddup -10 * SIZE(B), %xmm6 movddup -9 * SIZE(B), %xmm7 movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) movapd %xmm2, -12 * SIZE(BB) movapd %xmm3, -10 * SIZE(BB) movapd %xmm4, -8 * SIZE(BB) movapd %xmm5, -6 * SIZE(BB) movapd %xmm6, -4 * SIZE(BB) movapd %xmm7, -2 * SIZE(BB) addl $ 8 * SIZE, B addl $16 * SIZE, %ecx decl %eax BRANCH jne .L102 ALIGN_2 .L103: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH jle .L105 ALIGN_2 .L104: movddup -16 * SIZE(B), %xmm0 movapd %xmm0, -16 * SIZE(BB) addl $1 * SIZE, B addl $2 * SIZE, BB decl %eax jne .L104 ALIGN_4 .L105: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 # coffset = c #ifndef RT addl LDC, C #endif movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L130 ALIGN_4 .L110: #ifdef LN movl K, %eax sall $2 + 
BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif leal 16 * SIZE + BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movapd -8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L112 .L111: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AA), %xmm1 addpd %xmm0, %xmm4 movapd -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm6 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 mulpd -10 * SIZE(AA), %xmm1 addpd %xmm0, %xmm5 movapd 0 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movapd -12 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm2 mulpd -6 * SIZE(AA), %xmm1 addpd %xmm2, %xmm4 movapd -4 * SIZE(AA), %xmm2 addpd %xmm1, %xmm6 movapd -10 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm2 mulpd -2 * SIZE(AA), %xmm1 addpd %xmm2, %xmm5 movapd 8 * SIZE(AA), %xmm2 addpd %xmm1, %xmm7 movapd 0 * SIZE(BB), %xmm1 mulpd %xmm3, %xmm0 mulpd 2 * SIZE(AA), %xmm3 addpd %xmm0, %xmm4 movapd 4 * SIZE(AA), %xmm0 addpd %xmm3, %xmm6 movapd -6 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm0 mulpd 6 * SIZE(AA), %xmm3 addpd %xmm0, %xmm5 movapd 16 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movapd -4 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm2 mulpd 10 * SIZE(AA), %xmm3 addpd %xmm2, %xmm4 movapd 12 * SIZE(AA), %xmm2 addpd %xmm3, %xmm6 movapd -2 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm2 mulpd 14 * SIZE(AA), %xmm3 addpd %xmm2, %xmm5 movapd 24 * SIZE(AA), %xmm2 addpd %xmm3, %xmm7 movapd 8 * SIZE(BB), %xmm3 addl $ 32 * SIZE, AA subl $-16 * SIZE, BB decl %eax jne .L111 .L112: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L114 .L113: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AA), %xmm1 addpd %xmm0, %xmm4 movapd -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm6 movapd -14 * SIZE(BB), %xmm1 addl $4 * SIZE, AA addl $2 * SIZE, BB subl $1, %eax jg .L113 ALIGN_4 .L114: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm0 movapd -14 * SIZE(B), %xmm1 #else movapd -16 * SIZE(AA), %xmm0 movapd -14 * SIZE(AA), %xmm1 #endif subpd %xmm4, %xmm0 subpd %xmm6, %xmm1 #ifdef LN movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movapd %xmm1, %xmm3 unpckhpd %xmm3, %xmm3 movsd -1 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm3 movsd -2 * SIZE(AA), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm1 movsd -3 * SIZE(AA), %xmm6 mulsd %xmm3, %xmm6 subsd %xmm6, %xmm2 movsd -4 * SIZE(AA), %xmm7 mulsd %xmm3, %xmm7 subsd %xmm7, %xmm0 movsd -6 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 movsd -7 * SIZE(AA), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm2 movsd -8 * SIZE(AA), %xmm6 mulsd %xmm1, %xmm6 subsd %xmm6, %xmm0 movsd -11 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd -12 * SIZE(AA), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 unpcklpd %xmm2, %xmm0 unpcklpd %xmm3, %xmm1 #endif #ifdef LT movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movapd %xmm1, %xmm3 unpckhpd %xmm3, %xmm3 movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd -15 * SIZE(AA), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 
-14 * SIZE(AA), %xmm6 mulsd %xmm0, %xmm6 subsd %xmm6, %xmm1 movsd -13 * SIZE(AA), %xmm7 mulsd %xmm0, %xmm7 subsd %xmm7, %xmm3 movsd -11 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd -10 * SIZE(AA), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm1 movsd -9 * SIZE(AA), %xmm6 mulsd %xmm2, %xmm6 subsd %xmm6, %xmm3 movsd -6 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 movsd -5 * SIZE(AA), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm3 movsd -1 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm3 unpcklpd %xmm2, %xmm0 unpcklpd %xmm3, %xmm1 #endif #if defined(RN) || defined(RT) movddup -16 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #ifdef LN subl $4 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(B) movapd %xmm1, -14 * SIZE(B) movddup %xmm0, %xmm2 movddup %xmm1, %xmm3 unpckhpd %xmm0, %xmm0 unpckhpd %xmm1, %xmm1 movapd %xmm2, -16 * SIZE(BB) movapd %xmm0, -14 * SIZE(BB) movapd %xmm3, -12 * SIZE(BB) movapd %xmm1, -10 * SIZE(BB) #else movapd %xmm0, -16 * SIZE(AA) movapd %xmm1, -14 * SIZE(AA) #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif BRANCH decl %ebx # i -- jg .L110 ALIGN_2 .L130: movl M, %ebx testl $2, %ebx jle .L150 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal 16 * SIZE + BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm2 movapd -8 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L132 .L131: mulpd %xmm0, %xmm1 movapd -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm1 movapd -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movapd -12 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm1 movapd -10 * SIZE(AA), %xmm0 addpd %xmm1, %xmm4 movapd -10 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm1 movapd 0 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movapd 0 * SIZE(BB), %xmm1 mulpd %xmm2, %xmm3 movapd -6 * SIZE(AA), %xmm2 addpd %xmm3, %xmm4 movapd -6 * SIZE(BB), %xmm3 mulpd %xmm2, %xmm3 movapd -4 * SIZE(AA), %xmm2 addpd %xmm3, %xmm5 movapd -4 * SIZE(BB), %xmm3 mulpd %xmm2, %xmm3 movapd -2 * SIZE(AA), %xmm2 addpd %xmm3, %xmm4 movapd -2 * SIZE(BB), %xmm3 mulpd %xmm2, %xmm3 movapd 8 * SIZE(AA), %xmm2 addpd %xmm3, %xmm5 movapd 8 * SIZE(BB), %xmm3 subl $-16 * SIZE, AA subl $-16 * SIZE, BB BRANCH decl %eax jne .L131 .L132: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L134 .L133: mulpd %xmm0, %xmm1 movapd -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L133 ALIGN_4 .L134: addpd %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #if 
defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm0 #else movapd -16 * SIZE(AA), %xmm0 #endif subpd %xmm4, %xmm0 #ifdef LN movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movsd -13 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd -14 * SIZE(AA), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 unpcklpd %xmm2, %xmm0 #endif #ifdef LT movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd -15 * SIZE(AA), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd -13 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm2, %xmm0 #endif #if defined(RN) || defined(RT) movddup -16 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef LN subl $2 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(B) movddup %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 movapd %xmm1, -16 * SIZE(BB) movapd %xmm0, -14 * SIZE(BB) #else movapd %xmm0, -16 * SIZE(AA) #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L150: movl M, %ebx testl $1, %ebx jle .L159 #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA #endif leal 16 * SIZE + BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movsd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movsd -8 * SIZE(BB), %xmm3 movsd -12 * SIZE(AA), %xmm2 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L152 .L151: mulsd %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 addsd %xmm1, %xmm4 movsd -14 * SIZE(BB), %xmm1 mulsd %xmm0, %xmm1 movsd -14 * SIZE(AA), %xmm0 addsd %xmm1, %xmm5 movsd -12 * SIZE(BB), %xmm1 mulsd %xmm0, %xmm1 movsd -13 * SIZE(AA), %xmm0 addsd %xmm1, %xmm4 movsd -10 * SIZE(BB), %xmm1 mulsd %xmm0, %xmm1 movsd -8 * SIZE(AA), %xmm0 addsd %xmm1, %xmm5 movsd -0 * SIZE(BB), %xmm1 mulsd %xmm2, %xmm3 movsd -11 * SIZE(AA), %xmm2 addsd %xmm3, %xmm4 movsd -6 * SIZE(BB), %xmm3 mulsd %xmm2, %xmm3 movsd -10 * SIZE(AA), %xmm2 addsd %xmm3, %xmm5 movsd -4 * SIZE(BB), %xmm3 mulsd %xmm2, %xmm3 movsd -9 * SIZE(AA), %xmm2 addsd %xmm3, %xmm4 movsd -2 * SIZE(BB), %xmm3 mulsd %xmm2, %xmm3 movsd -4 * SIZE(AA), %xmm2 addsd %xmm3, %xmm5 movsd 8 * SIZE(BB), %xmm3 subl $ -8 * SIZE, AA subl $-16 * SIZE, BB BRANCH decl %eax jne .L151 .L152: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L154 .L153: mulsd %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 addsd %xmm1, %xmm4 movsd -14 * SIZE(BB), %xmm1 addl $1 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L153 ALIGN_4 .L154: addsd %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(B), %xmm0 #else movsd -16 * SIZE(AA), %xmm0 #endif subsd %xmm4, %xmm0 #if defined(LN) || defined(LT) mulsd -16 * SIZE(AA), %xmm0 #endif #if defined(RN) || 
defined(RT) mulsd -16 * SIZE(B), %xmm0 #endif #ifdef LN subl $1 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, -16 * SIZE(B) movsd %xmm0, -16 * SIZE(BB) movsd %xmm0, -15 * SIZE(BB) #else movsd %xmm0, -16 * SIZE(AA) #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA #ifdef LT addl $1 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L159: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 1), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 1), B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_2 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_2 EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_LT_4x2_sse2.S000066400000000000000000001237121313527062700215600ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if !defined(HAVE_SSE2) || !defined(HAVE_MMX) #error You have to check your configuration. 
#endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA 16 + STACK + ARGS(%esi) #define STACK_A 24 + STACK + ARGS(%esi) #define STACK_B 28 + STACK + ARGS(%esi) #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define AORIG 56(%esp) #define BORIG 60(%esp) #define BUFFER 128(%esp) #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #define B %edi #define AA %edx #define BB %ecx #define LDC %ebp #define PREFETCHSIZE (8 * 4) #define KERNEL1(address) \ movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm0, %xmm2; \ mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL3(address) \ movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm1, %xmm3; \ mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL4(address) \ mulpd %xmm1, %xmm3; \ mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL5(address) \ movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm0, %xmm2; \ mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL6(address) \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, 
%xmm2; \ mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL7(address) \ movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm1, %xmm3; \ mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulpd %xmm1, %xmm3; \ mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp addl $STACK_OFFSET, %esp STACK_TOUCHING movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 movd STACK_A, %mm2 movl STACK_B, B movd STACK_C, %mm3 movl STACK_LDC, LDC movd STACK_OFFT, %mm4 movd %mm1, K movl %eax, N movd %mm0, M movd %mm2, A movd %mm3, C movl %esi, OLD_STACK movd %mm4, OFFSET movd %mm4, KK sall $BASE_SHIFT, LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax sarl $1, %eax movl %eax, J jle .L100 ALIGN_2 .L01: /* Copying to Sub Buffer */ #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, %ecx #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG leal (, %eax, SIZE), %eax leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L03 ALIGN_2 .L02: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 unpcklpd %xmm2, %xmm2 unpcklpd %xmm3, %xmm3 unpcklpd %xmm4, %xmm4 unpcklpd %xmm5, %xmm5 unpcklpd %xmm6, %xmm6 unpcklpd %xmm7, %xmm7 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) movapd %xmm2, 4 * SIZE(%ecx) movapd %xmm3, 6 * SIZE(%ecx) movapd %xmm4, 8 * SIZE(%ecx) movapd %xmm5, 10 * SIZE(%ecx) movapd %xmm6, 12 * SIZE(%ecx) movapd %xmm7, 14 * SIZE(%ecx) prefetcht0 104 * SIZE(B) addl $ 8 * SIZE, B addl $16 * SIZE, %ecx decl %eax jne .L02 ALIGN_2 .L03: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L05 ALIGN_4 .L04: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) addl $2 * SIZE, B addl $4 
* SIZE, %ecx decl %eax jne .L04 ALIGN_4 .L05: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, %esi # coffset = c #ifndef RT addl %eax, C #endif movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L30 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 prefetcht2 4 * SIZE(%esi) prefetcht2 4 * SIZE(%esi, LDC) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif #ifdef PENTIUM4 andl $-8, %eax NOBRANCH je .L12 sall $3, %eax .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) cmpl $64 * 1, %eax NOBRANCH jle .L11 KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) cmpl $64 * 2, %eax NOBRANCH jle .L11 KERNEL1(32 * 2) KERNEL2(32 * 2) KERNEL3(32 * 2) KERNEL4(32 * 2) KERNEL5(32 * 2) KERNEL6(32 * 2) KERNEL7(32 * 2) KERNEL8(32 * 2) cmpl $64 * 3, %eax NOBRANCH jle .L11 KERNEL1(32 * 3) KERNEL2(32 * 3) KERNEL3(32 * 3) KERNEL4(32 * 3) KERNEL5(32 * 3) KERNEL6(32 * 3) KERNEL7(32 * 3) KERNEL8(32 * 3) cmpl $64 * 4, %eax NOBRANCH jle .L11 KERNEL1(32 * 4) KERNEL2(32 * 4) KERNEL3(32 * 4) KERNEL4(32 * 4) KERNEL5(32 * 4) KERNEL6(32 * 4) KERNEL7(32 * 4) KERNEL8(32 * 4) cmpl $64 * 5, %eax NOBRANCH jle .L11 KERNEL1(32 * 5) KERNEL2(32 * 5) KERNEL3(32 * 5) KERNEL4(32 * 5) KERNEL5(32 * 5) KERNEL6(32 * 5) KERNEL7(32 * 5) KERNEL8(32 * 5) cmpl $64 * 6, %eax NOBRANCH jle .L11 KERNEL1(32 * 6) KERNEL2(32 * 6) KERNEL3(32 * 6) KERNEL4(32 * 6) KERNEL5(32 * 6) KERNEL6(32 * 6) KERNEL7(32 * 6) KERNEL8(32 * 6) cmpl $64 * 7, %eax NOBRANCH jle .L11 KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) addl $64 * 4 * SIZE, AA addl $64 * 4 * SIZE, BB subl $64 * 8, %eax BRANCH jg .L1X .L11: leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #else sarl $3, %eax je .L12 .L11: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) addl $32 * SIZE, %ecx addl $32 * SIZE, %edx decl %eax jne .L11 #endif .L12: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 .L13: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 0 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA # aoffset += 8 addl $4 * SIZE, BB # boffset1 += 8 subl $1, %eax jg .L13 ALIGN_4 .L14: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd %xmm6, %xmm1 unpcklpd %xmm7, %xmm6 
unpckhpd %xmm7, %xmm1 movapd 0 * SIZE(B), %xmm2 movapd 2 * SIZE(B), %xmm3 movapd 4 * SIZE(B), %xmm5 movapd 6 * SIZE(B), %xmm7 subpd %xmm4, %xmm2 subpd %xmm0, %xmm3 subpd %xmm6, %xmm5 subpd %xmm1, %xmm7 #else movapd 0 * SIZE(AA), %xmm0 movapd 2 * SIZE(AA), %xmm1 movapd 4 * SIZE(AA), %xmm2 movapd 6 * SIZE(AA), %xmm3 subpd %xmm4, %xmm0 subpd %xmm6, %xmm1 subpd %xmm5, %xmm2 subpd %xmm7, %xmm3 #endif #ifdef LN movsd 15 * SIZE(AA), %xmm0 movhpd 15 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm7 movsd 14 * SIZE(AA), %xmm0 movhpd 14 * SIZE(AA), %xmm0 mulpd %xmm7, %xmm0 subpd %xmm0, %xmm5 movsd 13 * SIZE(AA), %xmm0 movhpd 13 * SIZE(AA), %xmm0 mulpd %xmm7, %xmm0 subpd %xmm0, %xmm3 movsd 12 * SIZE(AA), %xmm0 movhpd 12 * SIZE(AA), %xmm0 mulpd %xmm7, %xmm0 subpd %xmm0, %xmm2 movsd 10 * SIZE(AA), %xmm0 movhpd 10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm5 movsd 9 * SIZE(AA), %xmm0 movhpd 9 * SIZE(AA), %xmm0 mulpd %xmm5, %xmm0 subpd %xmm0, %xmm3 movsd 8 * SIZE(AA), %xmm0 movhpd 8 * SIZE(AA), %xmm0 mulpd %xmm5, %xmm0 subpd %xmm0, %xmm2 movsd 5 * SIZE(AA), %xmm0 movhpd 5 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 movhpd 4 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm2 movsd 0 * SIZE(AA), %xmm0 movhpd 0 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm0 movhpd 0 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 movhpd 1 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm3 movsd 2 * SIZE(AA), %xmm0 movhpd 2 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm5 movsd 3 * SIZE(AA), %xmm0 movhpd 3 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm7 movsd 5 * SIZE(AA), %xmm0 movhpd 5 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movsd 6 * SIZE(AA), %xmm0 movhpd 6 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm5 movsd 7 * SIZE(AA), %xmm0 movhpd 7 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm7 movsd 10 * SIZE(AA), %xmm0 movhpd 10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm5 movsd 11 * SIZE(AA), %xmm0 movhpd 11 * SIZE(AA), %xmm0 mulpd %xmm5, %xmm0 subpd %xmm0, %xmm7 movsd 15 * SIZE(AA), %xmm0 movhpd 15 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm7 #endif #ifdef RN movsd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 movsd 1 * SIZE(B), %xmm4 movhpd 1 * SIZE(B), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm2 movsd 1 * SIZE(B), %xmm4 movhpd 1 * SIZE(B), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm3 movsd 3 * SIZE(B), %xmm4 movhpd 3 * SIZE(B), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm3 #endif #ifdef RT movsd 3 * SIZE(B), %xmm4 movhpd 3 * SIZE(B), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm3 movsd 2 * SIZE(B), %xmm4 movhpd 2 * SIZE(B), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm0 movsd 2 * SIZE(B), %xmm4 movhpd 2 * SIZE(B), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm1 movsd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movapd %xmm5, 4 * SIZE(B) movapd %xmm7, 6 * SIZE(B) movsd %xmm2, 0 * SIZE(BB) movsd %xmm2, 1 * SIZE(BB) movhpd %xmm2, 2 * SIZE(BB) movhpd %xmm2, 3 * SIZE(BB) movsd %xmm3, 4 * SIZE(BB) movsd %xmm3, 5 * SIZE(BB) movhpd %xmm3, 6 * SIZE(BB) movhpd %xmm3, 7 * SIZE(BB) movsd %xmm5, 8 * SIZE(BB) movsd %xmm5, 9 * SIZE(BB) movhpd %xmm5, 10 * SIZE(BB) movhpd %xmm5, 11 * SIZE(BB) movsd %xmm7, 12 * SIZE(BB) movsd %xmm7, 13 * SIZE(BB) movhpd %xmm7, 14 * SIZE(BB) movhpd %xmm7, 15 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) movapd %xmm1, 2 * SIZE(AA) movapd %xmm2, 4 * SIZE(AA) movapd %xmm3, 6 * SIZE(AA) #endif 
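/* At this point the solved 4x2 block has been written back to the packed
   buffers: for LN/LT it goes to B together with its element-duplicated copy
   in BB, for RN/RT to the packed A panel in AA, presumably so that the
   remaining updates of this panel reuse the already-solved values.  The code
   below stores the same block to C through %esi, stepping the pointer
   backwards for LN and forwards otherwise. */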
#ifdef LN subl $4 * SIZE, %esi #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(%esi) movsd %xmm3, 1 * SIZE(%esi) movsd %xmm5, 2 * SIZE(%esi) movsd %xmm7, 3 * SIZE(%esi) movhpd %xmm2, 0 * SIZE(%esi, LDC) movhpd %xmm3, 1 * SIZE(%esi, LDC) movhpd %xmm5, 2 * SIZE(%esi, LDC) movhpd %xmm7, 3 * SIZE(%esi, LDC) #else movsd %xmm0, 0 * SIZE(%esi) movhpd %xmm0, 1 * SIZE(%esi) movsd %xmm1, 2 * SIZE(%esi) movhpd %xmm1, 3 * SIZE(%esi) movsd %xmm2, 0 * SIZE(%esi, LDC) movhpd %xmm2, 1 * SIZE(%esi, LDC) movsd %xmm3, 2 * SIZE(%esi, LDC) movhpd %xmm3, 3 * SIZE(%esi, LDC) #endif #ifndef LN addl $4 * SIZE, %esi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L10 ALIGN_2 .L30: movl M, %ebx testl $2, %ebx jle .L50 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L32 .L31: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 10 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd 18 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 20 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movapd 10 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 22 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 32 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movapd 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 26 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 28 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 14 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 30 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 40 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $32 * SIZE, BB BRANCH decl %eax jne .L31 .L32: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L34 .L33: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA # aoffset += 8 addl $4 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L33 ALIGN_4 .L34: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd 0 * 
SIZE(B), %xmm2 movapd 2 * SIZE(B), %xmm3 subpd %xmm4, %xmm2 subpd %xmm0, %xmm3 #else movapd 0 * SIZE(AA), %xmm0 movapd 2 * SIZE(AA), %xmm1 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 #endif #ifdef LN movsd 3 * SIZE(AA), %xmm0 movhpd 3 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movsd 2 * SIZE(AA), %xmm0 movhpd 2 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm2 movsd 0 * SIZE(AA), %xmm0 movhpd 0 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm0 movhpd 0 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 movhpd 1 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm3 movsd 3 * SIZE(AA), %xmm0 movhpd 3 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 #endif #ifdef RN movsd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 movsd 1 * SIZE(B), %xmm4 movhpd 1 * SIZE(B), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movsd 3 * SIZE(B), %xmm4 movhpd 3 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 #endif #ifdef RT movsd 3 * SIZE(B), %xmm4 movhpd 3 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 movsd 2 * SIZE(B), %xmm4 movhpd 2 * SIZE(B), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movsd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movsd %xmm2, 0 * SIZE(BB) movsd %xmm2, 1 * SIZE(BB) movhpd %xmm2, 2 * SIZE(BB) movhpd %xmm2, 3 * SIZE(BB) movsd %xmm3, 4 * SIZE(BB) movsd %xmm3, 5 * SIZE(BB) movhpd %xmm3, 6 * SIZE(BB) movhpd %xmm3, 7 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) movapd %xmm1, 2 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, %esi #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(%esi) movsd %xmm3, 1 * SIZE(%esi) movhpd %xmm2, 0 * SIZE(%esi, LDC) movhpd %xmm3, 1 * SIZE(%esi, LDC) #else movsd %xmm0, 0 * SIZE(%esi) movhpd %xmm0, 1 * SIZE(%esi) movsd %xmm1, 0 * SIZE(%esi, LDC) movhpd %xmm1, 1 * SIZE(%esi, LDC) #endif #ifndef LN addl $2 * SIZE, %esi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L50: movl M, %ebx testl $1, %ebx jle .L99 #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA #endif leal BUFFER, %ecx #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movsd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movsd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movsd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movsd 4 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L52 .L51: mulsd %xmm0, %xmm2 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movsd 1 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm2 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movsd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movsd 2 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd 10 * SIZE(BB), %xmm0 addsd %xmm3, %xmm4 movsd 12 * SIZE(BB), %xmm3 addsd %xmm0, %xmm5 movsd 3 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd 14 * SIZE(BB), %xmm0 addsd %xmm3, %xmm4 movsd 24 * SIZE(BB), %xmm3 addsd %xmm0, %xmm5 movsd 8 * SIZE(AA), %xmm0 mulsd %xmm1, %xmm2 mulsd 18 * SIZE(BB), %xmm1 addsd %xmm2, %xmm4 movsd 20 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 movsd 5 * SIZE(AA), %xmm1 
mulsd %xmm1, %xmm2 mulsd 22 * SIZE(BB), %xmm1 addsd %xmm2, %xmm4 movsd 32 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 movsd 6 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 mulsd 26 * SIZE(BB), %xmm1 addsd %xmm3, %xmm4 movsd 28 * SIZE(BB), %xmm3 addsd %xmm1, %xmm5 movsd 7 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 mulsd 30 * SIZE(BB), %xmm1 addsd %xmm3, %xmm4 movsd 40 * SIZE(BB), %xmm3 addsd %xmm1, %xmm5 movsd 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB BRANCH decl %eax jne .L51 .L52: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L54 .L53: mulsd %xmm0, %xmm2 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movsd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA # aoffset += 8 addl $4 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L53 ALIGN_4 .L54: addsd %xmm6, %xmm4 addsd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 #else movsd 0 * SIZE(AA), %xmm0 movsd 1 * SIZE(AA), %xmm1 #endif subsd %xmm4, %xmm0 subsd %xmm5, %xmm1 #if defined(LN) || defined(LT) movsd 0 * SIZE(AA), %xmm2 mulsd %xmm2, %xmm0 mulsd %xmm2, %xmm1 #endif #ifdef RN mulsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 mulsd 3 * SIZE(B), %xmm1 #endif #ifdef RT mulsd 3 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 mulsd 0 * SIZE(B), %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(B) movsd %xmm1, 1 * SIZE(B) movsd %xmm0, 0 * SIZE(BB) movsd %xmm0, 1 * SIZE(BB) movsd %xmm1, 2 * SIZE(BB) movsd %xmm1, 3 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) movsd %xmm1, 1 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, %esi #endif movsd %xmm0, 0 * SIZE(%esi) movsd %xmm1, 0 * SIZE(%esi, LDC) #ifndef LN addl $1 * SIZE, %esi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L99: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 2), B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L01 ALIGN_2 .L100: movl N, %eax testl $1, %eax jle .L999 ALIGN_2 .L101: /* Copying to Sub Buffer */ #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, %ecx #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG leal (, %eax, SIZE), %eax leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax jle .L103 ALIGN_4 .L102: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 unpcklpd %xmm2, %xmm2 unpcklpd %xmm3, %xmm3 unpcklpd %xmm4, %xmm4 unpcklpd %xmm5, %xmm5 
unpcklpd %xmm6, %xmm6 unpcklpd %xmm7, %xmm7 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) movapd %xmm2, 4 * SIZE(%ecx) movapd %xmm3, 6 * SIZE(%ecx) movapd %xmm4, 8 * SIZE(%ecx) movapd %xmm5, 10 * SIZE(%ecx) movapd %xmm6, 12 * SIZE(%ecx) movapd %xmm7, 14 * SIZE(%ecx) prefetcht0 104 * SIZE(B) addl $ 8 * SIZE, B addl $16 * SIZE, %ecx decl %eax BRANCH jne .L102 ALIGN_2 .L103: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH jle .L105 ALIGN_2 .L104: movsd 0 * SIZE(B), %xmm0 unpcklpd %xmm0, %xmm0 movapd %xmm0, 0 * SIZE(%ecx) addl $1 * SIZE, B addl $2 * SIZE, %ecx decl %eax jne .L104 ALIGN_4 .L105: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, %esi # coffset = c #ifndef RT addl LDC, C #endif movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L130 ALIGN_4 .L110: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L112 .L111: mulpd %xmm2, %xmm0 mulpd 2 * SIZE(AA), %xmm2 addpd %xmm0, %xmm4 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 movapd 2 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm0 mulpd 6 * SIZE(AA), %xmm2 addpd %xmm0, %xmm5 movapd 16 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movapd 4 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm1 mulpd 10 * SIZE(AA), %xmm2 addpd %xmm1, %xmm4 movapd 12 * SIZE(AA), %xmm1 addpd %xmm2, %xmm6 movapd 6 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm1 mulpd 14 * SIZE(AA), %xmm2 addpd %xmm1, %xmm5 movapd 24 * SIZE(AA), %xmm1 addpd %xmm2, %xmm7 movapd 16 * SIZE(BB), %xmm2 mulpd %xmm3, %xmm0 mulpd 18 * SIZE(AA), %xmm3 addpd %xmm0, %xmm4 movapd 20 * SIZE(AA), %xmm0 addpd %xmm3, %xmm6 movapd 10 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm0 mulpd 22 * SIZE(AA), %xmm3 addpd %xmm0, %xmm5 movapd 32 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movapd 12 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm1 mulpd 26 * SIZE(AA), %xmm3 addpd %xmm1, %xmm4 movapd 28 * SIZE(AA), %xmm1 addpd %xmm3, %xmm6 movapd 14 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm1 mulpd 30 * SIZE(AA), %xmm3 addpd %xmm1, %xmm5 movapd 40 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movapd 24 * SIZE(BB), %xmm3 addl $32 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L111 .L112: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L114 .L113: mulpd %xmm2, %xmm0 mulpd 2 * SIZE(AA), %xmm2 addpd %xmm0, %xmm4 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 movapd 2 * SIZE(BB), %xmm2 addl $4 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 subl $1, %eax jg .L113 ALIGN_4 .L114: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm0 movapd 2 * SIZE(B), %xmm1 #else movapd 0 * SIZE(AA), %xmm0 movapd 2 * SIZE(AA), %xmm1 #endif subpd %xmm4, %xmm0 subpd %xmm6, %xmm1 #ifdef LN movapd 
%xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movapd %xmm1, %xmm3 unpckhpd %xmm3, %xmm3 movsd 15 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm3 movsd 14 * SIZE(AA), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm1 movsd 13 * SIZE(AA), %xmm6 mulsd %xmm3, %xmm6 subsd %xmm6, %xmm2 movsd 12 * SIZE(AA), %xmm7 mulsd %xmm3, %xmm7 subsd %xmm7, %xmm0 movsd 10 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 movsd 9 * SIZE(AA), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm2 movsd 8 * SIZE(AA), %xmm6 mulsd %xmm1, %xmm6 subsd %xmm6, %xmm0 movsd 5 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd 4 * SIZE(AA), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 unpcklpd %xmm2, %xmm0 unpcklpd %xmm3, %xmm1 #endif #ifdef LT movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movapd %xmm1, %xmm3 unpckhpd %xmm3, %xmm3 movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(AA), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 2 * SIZE(AA), %xmm6 mulsd %xmm0, %xmm6 subsd %xmm6, %xmm1 movsd 3 * SIZE(AA), %xmm7 mulsd %xmm0, %xmm7 subsd %xmm7, %xmm3 movsd 5 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd 6 * SIZE(AA), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm1 movsd 7 * SIZE(AA), %xmm6 mulsd %xmm2, %xmm6 subsd %xmm6, %xmm3 movsd 10 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 movsd 11 * SIZE(AA), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm3 movsd 15 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm3 unpcklpd %xmm2, %xmm0 unpcklpd %xmm3, %xmm1 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #if defined(LN) || defined(LT) movapd %xmm0, 0 * SIZE(B) movapd %xmm1, 2 * SIZE(B) movsd %xmm0, 0 * SIZE(BB) movsd %xmm0, 1 * SIZE(BB) movhpd %xmm0, 2 * SIZE(BB) movhpd %xmm0, 3 * SIZE(BB) movsd %xmm1, 4 * SIZE(BB) movsd %xmm1, 5 * SIZE(BB) movhpd %xmm1, 6 * SIZE(BB) movhpd %xmm1, 7 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) movapd %xmm1, 2 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, %esi #endif movsd %xmm0, 0 * SIZE(%esi) movhpd %xmm0, 1 * SIZE(%esi) movsd %xmm1, 2 * SIZE(%esi) movhpd %xmm1, 3 * SIZE(%esi) #ifndef LN addl $4 * SIZE, %esi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif BRANCH decl %ebx # i -- jg .L110 ALIGN_2 .L130: movl M, %ebx testl $2, %ebx jle .L150 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #if defined(LN) || defined(RT) movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L132 .L131: mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 mulpd 2 * SIZE(BB), %xmm0 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 4 * SIZE(AA), %xmm0 mulpd 4 * SIZE(BB), %xmm0 addpd %xmm0, %xmm6 movapd 6 * SIZE(AA), %xmm0 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm3 movapd 10 * SIZE(AA), %xmm1 addpd %xmm3, %xmm4 mulpd 10 * SIZE(BB), %xmm1 movapd 24 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 12 * 
SIZE(AA), %xmm1 mulpd 12 * SIZE(BB), %xmm1 addpd %xmm1, %xmm6 movapd 14 * SIZE(AA), %xmm1 mulpd 14 * SIZE(BB), %xmm1 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $16 * SIZE, BB BRANCH decl %eax jne .L131 .L132: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L134 .L133: movapd 0 * SIZE(AA), %xmm0 mulpd 0 * SIZE(BB), %xmm0 addpd %xmm0, %xmm4 addl $2 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L133 ALIGN_4 .L134: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 addpd %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm0 #else movapd 0 * SIZE(AA), %xmm0 #endif subpd %xmm4, %xmm0 #ifdef LN movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movsd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd 2 * SIZE(AA), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 unpcklpd %xmm2, %xmm0 #endif #ifdef LT movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(AA), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm2, %xmm0 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm0, 0 * SIZE(B) movsd %xmm0, 0 * SIZE(BB) movsd %xmm0, 1 * SIZE(BB) movhpd %xmm0, 2 * SIZE(BB) movhpd %xmm0, 3 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, %esi #endif movsd %xmm0, 0 * SIZE(%esi) movhpd %xmm0, 1 * SIZE(%esi) #ifndef LN addl $2 * SIZE, %esi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L150: movl M, %ebx testl $1, %ebx jle .L159 #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA #endif leal BUFFER, BB movsd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movsd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movsd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movsd 4 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #if defined(LN) || defined(RT) movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L152 .L151: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 mulsd 2 * SIZE(BB), %xmm0 movsd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 mulsd 4 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 movsd 3 * SIZE(AA), %xmm0 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 movsd 8 * SIZE(AA), %xmm0 mulsd %xmm1, %xmm3 movsd 5 * SIZE(AA), %xmm1 addsd %xmm3, %xmm4 mulsd 10 * SIZE(BB), %xmm1 movsd 24 * SIZE(BB), %xmm3 addsd %xmm1, %xmm4 movsd 6 * SIZE(AA), %xmm1 mulsd 12 * SIZE(BB), %xmm1 addsd %xmm1, %xmm4 movsd 7 * SIZE(AA), %xmm1 mulsd 14 * SIZE(BB), %xmm1 addsd %xmm1, %xmm4 movsd 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $16 * SIZE, BB BRANCH decl %eax jne .L151 .L152: #if defined(LT) || 
defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L154 .L153: movsd 0 * SIZE(AA), %xmm0 mulsd 0 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 addl $1 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L153 ALIGN_4 .L154: addsd %xmm6, %xmm4 addsd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm0 #else movsd 0 * SIZE(AA), %xmm0 #endif subsd %xmm4, %xmm0 #if defined(LN) || defined(LT) mulsd 0 * SIZE(AA), %xmm0 #endif #if defined(RN) || defined(RT) mulsd 0 * SIZE(B), %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(B) movsd %xmm0, 0 * SIZE(BB) movsd %xmm0, 1 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, %esi #endif movsd %xmm0, 0 * SIZE(%esi) #ifndef LN addl $1 * SIZE, %esi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA #ifdef LT addl $1 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L159: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 1), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 1), B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_2 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_2 EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_LT_4x4_penryn.S000066400000000000000000001540611313527062700222220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 20 + STACK + ARGS(%esp) #define ARG_B 24 + STACK + ARGS(%esp) #define C 28 + STACK + ARGS(%esp) #define ARG_LDC 32 + STACK + ARGS(%esp) #define OFFSET 36 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif #ifdef ATOM #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 8 + 4) #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHSIZE (16 * 2) #endif #define B %edi #define AA %edx #define BB %ecx #define LDC %ebp #define CO1 %esi PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC movl OFFSET, %eax #ifdef RN negl %eax #endif movl %eax, KK leal (, LDC, SIZE), LDC subl $-32 * SIZE, A subl $-32 * SIZE, B #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax sarl $2, %eax movl %eax, J jle .L40 .L10: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 4), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif leal (CO1, LDC, 2), %eax movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 #ifdef LN pxor %xmm4, %xmm4 prefetcht0 -4 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 -4 * SIZE(CO1, LDC) pxor %xmm6, %xmm6 prefetcht0 -4 * SIZE(%eax) pxor %xmm7, %xmm7 prefetcht0 -4 * SIZE(%eax, LDC) #else pxor %xmm4, %xmm4 prefetcht0 3 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 3 * SIZE(CO1, LDC) pxor %xmm6, %xmm6 prefetcht0 3 * SIZE(%eax) pxor %xmm7, %xmm7 prefetcht0 3 * SIZE(%eax, LDC) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 
movaps -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -8 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 subl $-32 * SIZE, BB pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 subl $-32 * SIZE, AA pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -32 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -32 * SIZE(AA), %xmm0 subl $1, %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $4, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (B, %eax, 4), BB #endif addps %xmm3, %xmm6 addps %xmm2, %xmm7 #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm7, %xmm0 unpckhps %xmm7, %xmm4 movaps %xmm6, %xmm2 unpcklps %xmm5, %xmm2 unpckhps %xmm5, %xmm6 movaps %xmm0, %xmm1 movlhps %xmm2, %xmm0 movhlps %xmm2, %xmm1 movaps %xmm6, %xmm7 movlhps %xmm4, %xmm6 movhlps %xmm4, %xmm7 pshufd $0x39, %xmm1, %xmm2 pshufd $0x39, %xmm7, %xmm4 movaps -32 * SIZE(BB), %xmm1 movaps -28 * SIZE(BB), %xmm3 movaps -24 * SIZE(BB), %xmm5 movaps -20 * SIZE(BB), %xmm7 subps %xmm0, %xmm1 subps %xmm2, %xmm3 subps %xmm6, %xmm5 subps %xmm4, %xmm7 #else pshufd $0x39, %xmm5, %xmm2 pshufd $0x4e, %xmm6, %xmm0 pshufd $0x93, %xmm7, %xmm7 movaps %xmm4, %xmm6 unpcklps 
%xmm0, %xmm4 unpckhps %xmm0, %xmm6 movaps %xmm2, %xmm1 unpcklps %xmm7, %xmm2 unpckhps %xmm7, %xmm1 movaps %xmm4, %xmm5 unpcklps %xmm2, %xmm4 unpckhps %xmm2, %xmm5 movaps %xmm6, %xmm7 unpcklps %xmm1, %xmm6 unpckhps %xmm1, %xmm7 pshufd $0x93, %xmm5, %xmm5 pshufd $0x4e, %xmm6, %xmm6 pshufd $0x39, %xmm7, %xmm7 movaps -32 * SIZE(AA), %xmm0 movaps -28 * SIZE(AA), %xmm1 movaps -24 * SIZE(AA), %xmm2 movaps -20 * SIZE(AA), %xmm3 subps %xmm4, %xmm0 subps %xmm5, %xmm1 subps %xmm6, %xmm2 subps %xmm7, %xmm3 #endif #ifdef LN movaps -20 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm1 movaps -24 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm1 movaps -28 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm7 movaps -28 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm7 movaps -24 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm7 movaps -20 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm3 movaps -28 * SIZE(BB), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm3 movaps -24 * SIZE(BB), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm3 movaps -20 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 #endif #ifdef RT movaps -20 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm0 movaps -24 * SIZE(BB), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm0 movaps -28 * SIZE(BB), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movaps %xmm1, -32 * SIZE(BB) movaps %xmm3, -28 * SIZE(BB) movaps %xmm5, -24 * SIZE(BB) movaps %xmm7, -20 * SIZE(BB) #else movaps %xmm0, -32 * SIZE(AA) movaps %xmm1, -28 * SIZE(AA) movaps 
%xmm2, -24 * SIZE(AA) movaps %xmm3, -20 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm0 movaps %xmm3, %xmm4 unpcklps %xmm7, %xmm3 unpckhps %xmm7, %xmm4 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movaps %xmm0, %xmm6 unpcklps %xmm4, %xmm0 unpckhps %xmm4, %xmm6 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC, 1) movhps %xmm2, 2 * SIZE(CO1, LDC, 1) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movhps %xmm0, 2 * SIZE(CO1, LDC, 2) movlps %xmm6, 0 * SIZE(CO1, %eax, 1) movhps %xmm6, 2 * SIZE(CO1, %eax, 1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 2 * SIZE(CO1, LDC, 1) movlps %xmm2, 0 * SIZE(CO1, LDC, 2) movhps %xmm2, 2 * SIZE(CO1, LDC, 2) movlps %xmm3, 0 * SIZE(CO1, %eax, 1) movhps %xmm3, 2 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #endif #ifdef LN subl $4, KK #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L11 ALIGN_4 .L20: testl $2, M je .L30 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 movaps -32 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movaps -32 * SIZE(BB), %xmm1 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm2 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 pshufd $0xee, %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm7 pshufd $0x44, %xmm0, %xmm2 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 pshufd $0xee, %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm7 pshufd $0x44, %xmm0, %xmm2 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -12 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 pshufd $0xee, %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -8 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm7 pshufd $0x44, %xmm0, %xmm2 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -4 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 pshufd $0xee, %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps 0 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm7 subl $-16 * SIZE, AA subl $-32 * 
SIZE, BB subl $1, %eax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: pshufd $0x44, %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $4, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 4), BB #endif addps %xmm5, %xmm4 addps %xmm7, %xmm6 movhlps %xmm4, %xmm5 movhlps %xmm6, %xmm7 #if defined(LN) || defined(LT) unpcklps %xmm6, %xmm4 unpcklps %xmm7, %xmm5 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps -32 * SIZE(BB), %xmm1 movaps -28 * SIZE(BB), %xmm3 subps %xmm4, %xmm1 subps %xmm6, %xmm3 #else movsd -32 * SIZE(AA), %xmm0 movsd -30 * SIZE(AA), %xmm1 movsd -28 * SIZE(AA), %xmm2 movsd -26 * SIZE(AA), %xmm3 subps %xmm4, %xmm0 subps %xmm5, %xmm1 subps %xmm6, %xmm2 subps %xmm7, %xmm3 #endif #ifdef LN movaps -32 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm3 movaps -28 * SIZE(BB), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm3 movaps -24 * SIZE(BB), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm3 movaps -20 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 #endif #ifdef RT movaps -20 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm0 movaps -24 * SIZE(BB), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm0 movaps -28 * SIZE(BB), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movaps %xmm1, -32 * SIZE(BB) movaps %xmm3, -28 * SIZE(BB) #else movlps %xmm0, -32 * SIZE(AA) movlps %xmm1, -30 * SIZE(AA) movlps %xmm2, -28 * SIZE(AA) movlps %xmm3, -26 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm0 movaps %xmm3, %xmm4 unpcklps %xmm7, %xmm3 unpckhps %xmm7, %xmm4 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movaps %xmm0, %xmm6 unpcklps %xmm4, %xmm0 unpckhps 
%xmm4, %xmm6 movlps %xmm1, 0 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC, 1) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movlps %xmm6, 0 * SIZE(CO1, %eax, 1) #else movlps %xmm0, 0 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC, 1) movlps %xmm2, 0 * SIZE(CO1, LDC, 2) movlps %xmm3, 0 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L30: testl $1, M je .L39 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movaps -32 * SIZE(BB), %xmm1 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -24 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -20 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -28 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -16 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -26 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -8 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -4 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -24 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps 0 * SIZE(BB), %xmm1 subl $ -8 * SIZE, AA subl $-32 * SIZE, BB subl $1, %eax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: pshufd $0x00, %xmm0, %xmm2 movss -31 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $4, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 4), BB #endif #if defined(LN) || defined(LT) movaps -32 * SIZE(BB), %xmm1 subps %xmm4, %xmm1 #else movsd -32 * SIZE(AA), %xmm0 movhps -30 * SIZE(AA), %xmm0 subps %xmm4, %xmm0 pshufd $0xff, %xmm0, %xmm3 pshufd $0xaa, %xmm0, %xmm2 pshufd $0x55, %xmm0, %xmm1 pshufd $0x00, %xmm0, %xmm0 #endif #if defined(LN) || defined(LT) movss -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm3 movaps -28 * SIZE(BB), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulss %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm3 movaps -24 * SIZE(BB), 
%xmm6 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulss %xmm2, %xmm7 subss %xmm7, %xmm3 movaps -20 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm3 #endif #ifdef RT movaps -20 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm3 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm3, %xmm7 subss %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulss %xmm3, %xmm7 subss %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulss %xmm3, %xmm7 subss %xmm7, %xmm0 movaps -24 * SIZE(BB), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulss %xmm2, %xmm7 subss %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulss %xmm2, %xmm7 subss %xmm7, %xmm0 movaps -28 * SIZE(BB), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulss %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm0 movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movaps %xmm1, -32 * SIZE(BB) #else movss %xmm0, -32 * SIZE(AA) movss %xmm1, -31 * SIZE(AA) movss %xmm2, -30 * SIZE(AA) movss %xmm3, -29 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm0 movaps %xmm3, %xmm4 unpcklps %xmm7, %xmm3 unpckhps %xmm7, %xmm4 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movaps %xmm0, %xmm6 unpcklps %xmm4, %xmm0 unpckhps %xmm4, %xmm6 movss %xmm1, 0 * SIZE(CO1) movss %xmm2, 0 * SIZE(CO1, LDC, 1) movss %xmm0, 0 * SIZE(CO1, LDC, 2) movss %xmm6, 0 * SIZE(CO1, %eax, 1) #else movss %xmm0, 0 * SIZE(CO1) movss %xmm1, 0 * SIZE(CO1, LDC, 1) movss %xmm2, 0 * SIZE(CO1, LDC, 2) movss %xmm3, 0 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L39: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 4), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $4, KK #endif #ifdef RT subl $4, KK #endif decl J # j -- jg .L10 ALIGN_4 .L40: testl $2, N je .L80 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L60 ALIGN_4 .L51: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 #ifdef LN pxor %xmm4, %xmm4 prefetcht0 -4 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 -4 * SIZE(CO1, LDC) #else pxor %xmm4, %xmm4 prefetcht0 3 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 3 * SIZE(CO1, LDC) #endif pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L55 ALIGN_4 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, 
%xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0xff, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0xff, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -16 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -12 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0xff, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -8 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -4 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0xff, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps 0 * SIZE(AA), %xmm0 subl $-32 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (B, %eax, 2), BB #endif addps %xmm6, %xmm4 addps %xmm7, %xmm5 addps %xmm2, %xmm4 addps %xmm3, %xmm5 #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm0 movaps %xmm5, %xmm1 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm1 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movsd -32 * SIZE(BB), %xmm1 movsd -30 * SIZE(BB), %xmm3 movsd -28 * SIZE(BB), %xmm5 movsd -26 * SIZE(BB), %xmm7 subps %xmm4, %xmm1 subps %xmm6, %xmm3 subps %xmm0, %xmm5 subps %xmm2, %xmm7 #else movaps -32 * SIZE(AA), %xmm0 movaps -28 * SIZE(AA), %xmm1 subps %xmm4, %xmm0 subps %xmm5, %xmm1 #endif #ifdef LN movaps -20 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm1 movaps -24 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm1 movaps -28 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd 
$0xaa, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm7 movaps -28 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm7 movaps -24 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm7 movaps -20 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 #endif #ifdef RT movaps -32 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm1, -32 * SIZE(BB) movlps %xmm3, -30 * SIZE(BB) movlps %xmm5, -28 * SIZE(BB) movlps %xmm7, -26 * SIZE(BB) #else movaps %xmm0, -32 * SIZE(AA) movaps %xmm1, -28 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm1 unpcklps %xmm7, %xmm3 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC, 1) movhps %xmm2, 2 * SIZE(CO1, LDC, 1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 2 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $4, KK #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L51 ALIGN_4 .L60: testl $2, M je .L70 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm3, %xmm3 movaps -32 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L65 ALIGN_4 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm2 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 pshufd $0xee, %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 pshufd $0x44, %xmm0, %xmm2 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 pshufd $0xee, %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 pshufd $0x44, %xmm0, %xmm2 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 pshufd $0xee, %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 pshufd $0x44, %xmm0, %xmm2 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 pshufd $0xee, %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 subl $-16 * SIZE, AA subl $-16 * 
SIZE, BB subl $1, %eax jne .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: pshufd $0x44, %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L66 ALIGN_4 .L68: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB #endif addps %xmm3, %xmm4 addps %xmm5, %xmm4 movhlps %xmm4, %xmm5 #if defined(LN) || defined(LT) unpcklps %xmm6, %xmm4 unpcklps %xmm7, %xmm5 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movsd -32 * SIZE(BB), %xmm1 movsd -30 * SIZE(BB), %xmm3 subps %xmm4, %xmm1 subps %xmm6, %xmm3 #else movsd -32 * SIZE(AA), %xmm0 movsd -30 * SIZE(AA), %xmm1 subps %xmm4, %xmm0 subps %xmm5, %xmm1 #endif #ifdef LN movaps -32 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 #endif #ifdef RT movaps -32 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm1, -32 * SIZE(BB) movlps %xmm3, -30 * SIZE(BB) #else movlps %xmm0, -32 * SIZE(AA) movlps %xmm1, -30 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm3, %xmm1 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 0 * SIZE(CO1, LDC) #else movlps %xmm0, 0 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L70: testl $1, M je .L79 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movsd -32 * SIZE(BB), %xmm1 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -30 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm5 movsd -28 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -26 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -28 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm5 movsd -24 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -22 * 
SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -26 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm5 movsd -20 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -18 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -24 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm5 movsd -16 * SIZE(BB), %xmm1 subl $ -8 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: pshufd $0x00, %xmm0, %xmm2 movss -31 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -30 * SIZE(BB), %xmm1 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), BB #endif addps %xmm5, %xmm4 pshufd $0x55, %xmm4, %xmm5 pshufd $0x00, %xmm4, %xmm4 #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm4 movsd -32 * SIZE(BB), %xmm1 subps %xmm4, %xmm1 #else movss -32 * SIZE(AA), %xmm0 movss -31 * SIZE(AA), %xmm1 subss %xmm4, %xmm0 subss %xmm5, %xmm1 #endif #if defined(LN) || defined(LT) movss -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm1 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm1 #endif #ifdef RT movaps -32 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm0 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm1, -32 * SIZE(BB) #else movss %xmm0, -32 * SIZE(AA) movss %xmm1, -31 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) pshufd $1, %xmm1, %xmm3 movss %xmm1, 0 * SIZE(CO1) movss %xmm3, 0 * SIZE(CO1, LDC) #else movss %xmm0, 0 * SIZE(CO1) movss %xmm1, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L79: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif ALIGN_4 .L80: testl $1, N je .L999 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L100 ALIGN_4 .L91: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movsd -32 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 #ifdef LN prefetcht0 -4 * SIZE(CO1) #else prefetcht0 3 * SIZE(CO1) #endif pxor %xmm5, %xmm5 #if defined(LT) || defined(RN) movl KK, %eax 
#else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L95 ALIGN_4 .L92: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x55, %xmm1, %xmm2 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x55, %xmm1, %xmm2 movsd -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x55, %xmm1, %xmm2 movsd -26 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x55, %xmm1, %xmm2 movsd -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 subl $-32 * SIZE, AA subl $ -8 * SIZE, BB subl $1, %eax jne .L92 ALIGN_4 .L95: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 movss -31 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L96 ALIGN_4 .L98: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (B, %eax, 1), BB #endif addps %xmm2, %xmm4 addps %xmm5, %xmm4 #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm0 movaps %xmm5, %xmm1 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm1 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movss -32 * SIZE(BB), %xmm1 movss -31 * SIZE(BB), %xmm3 movss -30 * SIZE(BB), %xmm5 movss -29 * SIZE(BB), %xmm7 subss %xmm4, %xmm1 subss %xmm6, %xmm3 subss %xmm0, %xmm5 subss %xmm2, %xmm7 #else movaps -32 * SIZE(AA), %xmm0 subps %xmm4, %xmm0 #endif #ifdef LN movaps -20 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm7 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm7, %xmm6 subss %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulss %xmm7, %xmm6 subss %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulss %xmm7, %xmm6 subss %xmm6, %xmm1 movaps -24 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulss %xmm5, %xmm6 subss %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulss %xmm5, %xmm6 subss %xmm6, %xmm1 movaps -28 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulss %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm1 movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm7 movaps -28 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulss %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm7 movaps -24 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulss %xmm5, %xmm6 subss %xmm6, %xmm7 movaps -20 * SIZE(AA), %xmm4 
pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm7 #endif #if defined(RN) || defined(RT) movss -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm1, -32 * SIZE(BB) movss %xmm3, -31 * SIZE(BB) movss %xmm5, -30 * SIZE(BB) movss %xmm7, -29 * SIZE(BB) #else movaps %xmm0, -32 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm1 unpcklps %xmm7, %xmm3 unpcklps %xmm3, %xmm1 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 1), BB #endif #ifdef LN subl $4, KK #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L91 ALIGN_4 .L100: testl $2, M je .L110 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm3, %xmm3 movsd -32 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L105 ALIGN_4 .L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x55, %xmm1, %xmm2 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movsd -26 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x55, %xmm1, %xmm2 movsd -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movsd -22 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x55, %xmm1, %xmm2 movsd -26 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movsd -18 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x55, %xmm1, %xmm2 movsd -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -16 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 subl $-16 * SIZE, AA subl $ -8 * SIZE, BB subl $1, %eax jne .L102 ALIGN_4 .L105: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L108 ALIGN_4 .L106: pshufd $0x00, %xmm1, %xmm2 movss -31 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 addl $2 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L106 ALIGN_4 .L108: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB #endif addps %xmm5, %xmm4 #if defined(LN) || defined(LT) pshufd $1, %xmm4, %xmm6 movss -32 * SIZE(BB), %xmm1 movss -31 * SIZE(BB), %xmm3 subss %xmm4, %xmm1 subss %xmm6, %xmm3 #else movsd -32 * SIZE(AA), %xmm0 subps %xmm4, %xmm0 #endif #ifdef LN movsd -32 * SIZE(AA), %xmm4 movhps -30 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm1 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 pshufd 
$0x55, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm3 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm3 #endif #if defined(RN) || defined(RT) movss -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm1, -32 * SIZE(BB) movss %xmm3, -31 * SIZE(BB) #else movlps %xmm0, -32 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(CO1) movss %xmm3, 1 * SIZE(CO1) #else movlps %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 1), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L110: testl $1, M je .L119 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movsd -32 * SIZE(BB), %xmm1 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L115 ALIGN_4 .L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulps %xmm0, %xmm1 movsd -30 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movsd -28 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movsd -26 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -26 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movsd -24 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -24 * SIZE(BB), %xmm1 subl $-8 * SIZE, AA subl $-8 * SIZE, BB subl $1, %eax jne .L112 ALIGN_4 .L115: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulss %xmm0, %xmm1 movss -31 * SIZE(AA), %xmm0 addss %xmm1, %xmm4 movss -31 * SIZE(BB), %xmm1 addl $1 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L116 ALIGN_4 .L118: #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA leal (B, %eax, SIZE), BB #endif haddps %xmm4, %xmm4 #if defined(LN) || defined(LT) movss -32 * SIZE(BB), %xmm1 subss %xmm4, %xmm1 #else movss -32 * SIZE(AA), %xmm0 subss %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) mulss -32 * SIZE(AA), %xmm1 #endif #if defined(RN) || defined(RT) mulss -32 * SIZE(BB), %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm1, -32 * SIZE(BB) #else movss %xmm0, -32 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(CO1) #else movss %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA, %eax, SIZE), AA leal (BB, %eax, SIZE), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L119: #ifdef LN movl K, %eax leal (B, %eax, SIZE), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_LT_4x4_sse.S000066400000000000000000002063601313527062700215010ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 
2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define OLD_M 4 + STACK(%esi) #define OLD_N 8 + STACK(%esi) #define OLD_K 12 + STACK(%esi) #define OLD_A 20 + STACK(%esi) #define OLD_B 24 + STACK(%esi) #define OLD_C 28 + STACK(%esi) #define OLD_LDC 32 + STACK(%esi) #define STACK_OFFT 36 + STACK(%esi) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define AORIG 56(%esp) #define BORIG 60(%esp) #define BUFFER 128(%esp) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) #endif #if defined(PENTIUM4) || defined(PENTIUMM) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE 96 #endif #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE 96 #endif #define B %edi #define AA %edx #define BB %ecx #define LDC %ebp #define CO1 %esi #if defined(OPTERON) || !defined(HAVE_SSE2) #define movsd movlps #endif #ifdef HAVE_SSE2 #define xorps pxor #endif #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ addps %xmm2, %xmm5; \ movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 4 * SIZE + (address) * 1 * SIZE(AA), 
%xmm0 #define KERNEL2(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi subl $128 + LOCAL_BUFFER_SIZE, %esp andl $-1024, %esp STACK_TOUCHING movl OLD_M, %ebx movl OLD_N, %eax movl OLD_K, %ecx movl OLD_A, %edx movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movss STACK_OFFT, %xmm4 movl OLD_B, B movl OLD_C, %ebx movl %ebx, C movl OLD_LDC, LDC movss %xmm4, OFFSET movss %xmm4, KK 
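/* Register roles (see the defines above): B = %edi, AA = %edx, BB = %ecx,
   LDC = %ebp, CO1 = %esi.  xmm4..xmm7 hold the accumulated C tile (4x4 in
   the main loop), and KERNEL1..KERNEL8 form one 8x-unrolled pass over k,
   reading B values that were pre-broadcast into BUFFER.  The code below
   walks N in blocks of 4 (.L01), 2 (.L40) and 1 (.L80), and within each of
   those walks M in blocks of 4, 2 and 1.  The LN/LT/RN/RT conditionals
   select the triangular variant, with KK/OFFSET tracking how many k
   iterations feed the multiply before the solve against the diagonal
   block.  The solve multiplies by the diagonal entries, so the packed
   panels are expected to carry pre-inverted diagonals. */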
leal (, LDC, SIZE), LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax sarl $2, %eax movl %eax, J jle .L40 .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, %ecx #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $2 + BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $1, %eax jle .L05 ALIGN_4 .L02: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) addl $ 8 * SIZE, B addl $32 * SIZE, %ecx decl %eax jne .L02 ALIGN_2 .L05: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $1, %eax BRANCH jle .L10 movaps 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) addl $4 * SIZE, B ALIGN_4 .L10: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 4), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movaps 16 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movaps 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 leal (LDC, LDC, 2), %eax PREFETCHW 3 * SIZE(CO1) PREFETCHW 3 * SIZE(CO1, LDC) PREFETCHW 3 * SIZE(CO1, LDC, 2) PREFETCHW 3 * SIZE(CO1, %eax) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L12: KERNEL1(0 * 16) KERNEL2(0 * 16) KERNEL3(0 * 16) KERNEL4(0 * 16) KERNEL5(0 * 16) KERNEL6(0 * 16) KERNEL7(0 * 16) KERNEL8(0 * 16) addl $128 * SIZE, BB addl $32 * SIZE, AA decl %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 4 * SIZE(AA), %xmm0 addl $ 4 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax 
#else subl $4, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $2 + BASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm0 movaps %xmm5, %xmm1 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm1 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movaps 0 * SIZE(B), %xmm1 movaps 4 * SIZE(B), %xmm3 movaps 8 * SIZE(B), %xmm5 movaps 12 * SIZE(B), %xmm7 subps %xmm4, %xmm1 subps %xmm6, %xmm3 subps %xmm0, %xmm5 subps %xmm2, %xmm7 #else movaps 0 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm1 movaps 8 * SIZE(AA), %xmm2 movaps 12 * SIZE(AA), %xmm3 subps %xmm4, %xmm0 subps %xmm5, %xmm1 subps %xmm6, %xmm2 subps %xmm7, %xmm3 #endif #ifdef LN movaps 12 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm1 movaps 8 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm1 movaps 4 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm7 movaps 4 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm7 movaps 8 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm7 movaps 12 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 #endif #ifdef RN movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm3 movaps 4 * SIZE(B), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm3 movaps 8 * SIZE(B), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm3 movaps 12 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 #endif #ifdef RT movaps 12 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm0 movaps 8 * SIZE(B), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm0 movaps 4 * SIZE(B), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 
movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movaps %xmm1, 0 * SIZE(B) movaps %xmm3, 4 * SIZE(B) movaps %xmm5, 8 * SIZE(B) movaps %xmm7, 12 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 pshufd $0x55, %xmm1, %xmm2 pshufd $0xaa, %xmm1, %xmm4 pshufd $0xff, %xmm1, %xmm6 movaps %xmm0, 0 * SIZE(BB) movaps %xmm2, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm2 pshufd $0xaa, %xmm3, %xmm4 pshufd $0xff, %xmm3, %xmm6 movaps %xmm0, 16 * SIZE(BB) movaps %xmm2, 20 * SIZE(BB) movaps %xmm4, 24 * SIZE(BB) movaps %xmm6, 28 * SIZE(BB) pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm2 pshufd $0xaa, %xmm5, %xmm4 pshufd $0xff, %xmm5, %xmm6 movaps %xmm0, 32 * SIZE(BB) movaps %xmm2, 36 * SIZE(BB) movaps %xmm4, 40 * SIZE(BB) movaps %xmm6, 44 * SIZE(BB) pshufd $0x00, %xmm7, %xmm0 pshufd $0x55, %xmm7, %xmm2 pshufd $0xaa, %xmm7, %xmm4 pshufd $0xff, %xmm7, %xmm6 movaps %xmm0, 48 * SIZE(BB) movaps %xmm2, 52 * SIZE(BB) movaps %xmm4, 56 * SIZE(BB) movaps %xmm6, 60 * SIZE(BB) #else movaps %xmm0, 0 * SIZE(AA) movaps %xmm1, 4 * SIZE(AA) movaps %xmm2, 8 * SIZE(AA) movaps %xmm3, 12 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm0 movaps %xmm3, %xmm4 unpcklps %xmm7, %xmm3 unpckhps %xmm7, %xmm4 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movaps %xmm0, %xmm6 unpcklps %xmm4, %xmm0 unpckhps %xmm4, %xmm6 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC, 1) movhps %xmm2, 2 * SIZE(CO1, LDC, 1) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movhps %xmm0, 2 * SIZE(CO1, LDC, 2) movlps %xmm6, 0 * SIZE(CO1, %eax, 1) movhps %xmm6, 2 * SIZE(CO1, %eax, 1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 2 * SIZE(CO1, LDC, 1) movlps %xmm2, 0 * SIZE(CO1, LDC, 2) movhps %xmm2, 2 * SIZE(CO1, LDC, 2) movlps %xmm3, 0 * SIZE(CO1, %eax, 1) movhps %xmm3, 2 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $16 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L11 ALIGN_4 .L20: testl $2, M je .L30 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movaps 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L25 ALIGN_4 .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), 
%xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 68 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movaps 72 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 76 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 96 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 84 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 88 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 92 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 112 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 100 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movaps 104 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 108 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 14 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 128 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 116 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 120 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 124 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 144 * SIZE(BB), %xmm3 addl $ 16 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 16 * SIZE(BB), %xmm2 addl $ 2 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $4, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $1 + BASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) unpcklps %xmm6, %xmm4 unpcklps %xmm7, %xmm5 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps 0 * SIZE(B), %xmm1 movaps 4 * SIZE(B), %xmm3 subps %xmm4, %xmm1 subps %xmm6, %xmm3 #else #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 2 * SIZE(AA), %xmm1 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 4 * SIZE(AA), %xmm2 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 6 * SIZE(AA), %xmm3 subps %xmm4, %xmm0 subps %xmm5, %xmm1 subps %xmm6, %xmm2 subps %xmm7, %xmm3 #endif #ifdef LN movaps 
0 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 #endif #ifdef RN movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm3 movaps 4 * SIZE(B), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm3 movaps 8 * SIZE(B), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm3 movaps 12 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 #endif #ifdef RT movaps 12 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm0 movaps 8 * SIZE(B), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm0 movaps 4 * SIZE(B), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movaps %xmm1, 0 * SIZE(B) movaps %xmm3, 4 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 pshufd $0x55, %xmm1, %xmm2 pshufd $0xaa, %xmm1, %xmm4 pshufd $0xff, %xmm1, %xmm6 movaps %xmm0, 0 * SIZE(BB) movaps %xmm2, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm2 pshufd $0xaa, %xmm3, %xmm4 pshufd $0xff, %xmm3, %xmm6 movaps %xmm0, 16 * SIZE(BB) movaps %xmm2, 20 * SIZE(BB) movaps %xmm4, 24 * SIZE(BB) movaps %xmm6, 28 * SIZE(BB) #else movlps %xmm0, 0 * SIZE(AA) movlps %xmm1, 2 * SIZE(AA) movlps %xmm2, 4 * SIZE(AA) movlps %xmm3, 6 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm0 movaps %xmm3, %xmm4 unpcklps %xmm7, %xmm3 unpckhps %xmm7, %xmm4 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movaps %xmm0, %xmm6 unpcklps %xmm4, %xmm0 unpckhps %xmm4, %xmm6 movlps %xmm1, 0 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC, 1) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movlps %xmm6, 0 * SIZE(CO1, %eax, 1) #else movlps %xmm0, 0 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC, 1) movlps %xmm2, 0 * SIZE(CO1, LDC, 2) movlps %xmm3, 0 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L30: testl $1, M je .L39 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl 
AORIG, AA leal (AA, %eax, SIZE), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movss 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movss 4 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movss 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movss 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L35 ALIGN_4 .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm5 movss 8 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 1 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 addss %xmm3, %xmm4 movss 20 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 addss %xmm3, %xmm5 movss 24 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 mulss 28 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 48 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm2 addss %xmm2, %xmm4 movss 36 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm5 movss 40 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 mulss 44 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 64 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 3 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 addss %xmm3, %xmm4 movss 52 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 addss %xmm3, %xmm5 movss 56 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 mulss 60 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 80 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 addss %xmm2, %xmm4 movss 68 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 addss %xmm2, %xmm5 movss 72 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 mulss 76 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 96 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 5 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 addss %xmm3, %xmm4 movss 84 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 addss %xmm3, %xmm5 movss 88 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 mulss 92 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 112 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm2 addss %xmm2, %xmm4 movss 100 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 addss %xmm2, %xmm5 movss 104 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 mulss 108 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 128 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 7 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 addss %xmm3, %xmm4 movss 116 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 addss %xmm3, %xmm5 movss 120 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 mulss 124 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 144 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 movss 4 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm5 movss 8 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 16 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 1 * SIZE(AA), %xmm0 addl $ 1 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $4, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (AA, %eax, SIZE), AA sall $2 + BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if 
defined(LN) || defined(LT) unpcklps %xmm6, %xmm4 unpcklps %xmm7, %xmm5 unpcklps %xmm5, %xmm4 movaps 0 * SIZE(B), %xmm1 subps %xmm4, %xmm1 #else movss 0 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm1 movss 2 * SIZE(AA), %xmm2 movss 3 * SIZE(AA), %xmm3 subss %xmm4, %xmm0 subss %xmm5, %xmm1 subss %xmm6, %xmm2 subss %xmm7, %xmm3 #endif #if defined(LN) || defined(LT) movss 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef RN movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm3 movaps 4 * SIZE(B), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulss %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm3 movaps 8 * SIZE(B), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulss %xmm2, %xmm7 subss %xmm7, %xmm3 movaps 12 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm3 #endif #ifdef RT movaps 12 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm3 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm3, %xmm7 subss %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulss %xmm3, %xmm7 subss %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulss %xmm3, %xmm7 subss %xmm7, %xmm0 movaps 8 * SIZE(B), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulss %xmm2, %xmm7 subss %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulss %xmm2, %xmm7 subss %xmm7, %xmm0 movaps 4 * SIZE(B), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulss %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm0 movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movaps %xmm1, 0 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 pshufd $0x55, %xmm1, %xmm2 pshufd $0xaa, %xmm1, %xmm4 pshufd $0xff, %xmm1, %xmm6 movaps %xmm0, 0 * SIZE(BB) movaps %xmm2, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) #else movss %xmm0, 0 * SIZE(AA) movss %xmm1, 1 * SIZE(AA) movss %xmm2, 2 * SIZE(AA) movss %xmm3, 3 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm0 movaps %xmm3, %xmm4 unpcklps %xmm7, %xmm3 unpckhps %xmm7, %xmm4 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movaps %xmm0, %xmm6 unpcklps %xmm4, %xmm0 unpckhps %xmm4, %xmm6 movss %xmm1, 0 * SIZE(CO1) movss %xmm2, 0 * SIZE(CO1, LDC, 1) movss %xmm0, 0 * SIZE(CO1, LDC, 2) movss %xmm6, 0 * SIZE(CO1, %eax, 1) #else movss %xmm0, 0 * SIZE(CO1) movss %xmm1, 0 * SIZE(CO1, LDC, 1) movss %xmm2, 0 * SIZE(CO1, LDC, 2) movss %xmm3, 0 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA, %eax, SIZE), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L39: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 4), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 4), B #endif #ifdef RN addl $4, KK #endif #ifdef RT subl $4, KK #endif decl J # j -- jg .L01 ALIGN_4 .L40: testl $2, N je .L80 #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal 
BUFFER, %ecx #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $1 + BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L45 ALIGN_4 .L42: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) addl $ 8 * SIZE, B addl $32 * SIZE, %ecx decl %eax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L50 ALIGN_4 .L46: #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) addl $2 * SIZE, B addl $8 * SIZE, %ecx decl %eax jne .L46 ALIGN_4 .L50: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L60 ALIGN_4 .L51: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movaps 0 * SIZE(AA), %xmm0 movaps 16 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 PREFETCHW 3 * SIZE(CO1) PREFETCHW 3 * SIZE(CO1, LDC) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L55 ALIGN_4 .L52: mulps %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 8 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 20 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps 12 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 28 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps 48 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 mulps 36 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 40 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 20 * SIZE(AA), %xmm1 mulps %xmm1, %xmm2 mulps 44 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 64 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 24 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 52 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 
addps %xmm3, %xmm4 movaps 80 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $1 + BASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm0 movaps %xmm5, %xmm1 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm1 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(B), %xmm1 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 2 * SIZE(B), %xmm3 #ifdef movsd xorps %xmm5, %xmm5 #endif movsd 4 * SIZE(B), %xmm5 #ifdef movsd xorps %xmm7, %xmm7 #endif movsd 6 * SIZE(B), %xmm7 subps %xmm4, %xmm1 subps %xmm6, %xmm3 subps %xmm0, %xmm5 subps %xmm2, %xmm7 #else movaps 0 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm1 subps %xmm4, %xmm0 subps %xmm5, %xmm1 #endif #ifdef LN movaps 12 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm1 movaps 8 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm1 movaps 4 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm7 movaps 4 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm7 movaps 8 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm7 movaps 12 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 #endif #ifdef RN movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 #endif #ifdef RT movaps 0 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm1, 0 * SIZE(B) movlps %xmm3, 2 * SIZE(B) movlps %xmm5, 4 * SIZE(B) movlps %xmm7, 6 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 pshufd $0x55, %xmm1, %xmm2 movaps %xmm0, 0 * SIZE(BB) movaps %xmm2, 4 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 
pshufd $0x55, %xmm3, %xmm2 movaps %xmm0, 8 * SIZE(BB) movaps %xmm2, 12 * SIZE(BB) pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm2 movaps %xmm0, 16 * SIZE(BB) movaps %xmm2, 20 * SIZE(BB) pshufd $0x00, %xmm7, %xmm0 pshufd $0x55, %xmm7, %xmm2 movaps %xmm0, 24 * SIZE(BB) movaps %xmm2, 28 * SIZE(BB) #else movaps %xmm0, 0 * SIZE(AA) movaps %xmm1, 4 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm1 unpcklps %xmm7, %xmm3 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC, 1) movhps %xmm2, 2 * SIZE(CO1, LDC, 1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 2 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L51 ALIGN_4 .L60: testl $2, M je .L70 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L65 ALIGN_4 .L62: #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm0, %xmm2 
addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L66 ALIGN_4 .L68: addps %xmm6, %xmm4 addps %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) unpcklps %xmm6, %xmm4 unpcklps %xmm7, %xmm5 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(B), %xmm1 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 2 * SIZE(B), %xmm3 subps %xmm4, %xmm1 subps %xmm6, %xmm3 #else #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 2 * SIZE(AA), %xmm1 subps %xmm4, %xmm0 subps %xmm5, %xmm1 #endif #ifdef LN movaps 0 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 #endif #ifdef RN movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 #endif #ifdef RT movaps 0 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm1, 0 * SIZE(B) movlps %xmm3, 2 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 pshufd $0x55, %xmm1, %xmm2 movaps %xmm0, 0 * SIZE(BB) movaps %xmm2, 4 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm2 movaps %xmm0, 8 * SIZE(BB) movaps %xmm2, 12 * SIZE(BB) #else movlps %xmm0, 0 * SIZE(AA) movlps %xmm1, 2 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm3, %xmm1 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 0 * SIZE(CO1, LDC) #else movlps %xmm0, 0 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L70: testl $1, M je .L79 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movss 0 * SIZE(AA), %xmm0 movss 4 * SIZE(AA), %xmm1 movss 0 * SIZE(BB), %xmm2 movss 16 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L75 ALIGN_4 .L72: mulss %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), 
%xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 20 * SIZE(BB), %xmm0 addss %xmm3, %xmm4 movss 24 * SIZE(BB), %xmm3 addss %xmm0, %xmm5 movss 3 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 28 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 48 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 mulss 36 * SIZE(BB), %xmm1 addss %xmm2, %xmm4 movss 40 * SIZE(BB), %xmm2 addss %xmm1, %xmm5 movss 5 * SIZE(AA), %xmm1 mulss %xmm1, %xmm2 mulss 44 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 64 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 52 * SIZE(BB), %xmm1 addss %xmm3, %xmm4 movss 56 * SIZE(BB), %xmm3 addss %xmm1, %xmm5 movss 7 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 60 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 80 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulss %xmm0, %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 addl $ 1 * SIZE, AA addl $ 8 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: addss %xmm6, %xmm4 addss %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm4 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(B), %xmm1 subps %xmm4, %xmm1 #else movss 0 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm1 subss %xmm4, %xmm0 subss %xmm5, %xmm1 #endif #if defined(LN) || defined(LT) movss 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef RN movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm1 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm1 #endif #ifdef RT movaps 0 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm0 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm1, 0 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 pshufd $0x55, %xmm1, %xmm2 movaps %xmm0, 0 * SIZE(BB) movaps %xmm2, 4 * SIZE(BB) #else movss %xmm0, 0 * SIZE(AA) movss %xmm1, 1 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) pshufd $1, %xmm1, %xmm3 movss %xmm1, 0 * SIZE(CO1) movss %xmm3, 0 * SIZE(CO1, LDC) #else movss %xmm0, 0 * SIZE(CO1) movss %xmm1, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA, %eax, SIZE), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L79: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 2), B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif ALIGN_4 .L80: testl $1, N je .L999 #ifdef LN movl OFFSET, %eax 
addl M, %eax movl %eax, KK #endif leal BUFFER, %ecx #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax jle .L85 ALIGN_4 .L82: movsd 0 * SIZE(B), %xmm3 movhps 2 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm7 movhps 6 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) addl $ 8 * SIZE, B addl $32 * SIZE, BB decl %eax jne .L82 ALIGN_4 .L85: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH jle .L90 ALIGN_4 .L86: movss 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 movaps %xmm0, 0 * SIZE(BB) addl $1 * SIZE, B addl $4 * SIZE, BB decl %eax jne .L86 ALIGN_4 .L90: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L100 ALIGN_4 .L91: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movaps 0 * SIZE(AA), %xmm0 movaps 16 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 PREFETCHW 3 * SIZE(CO1) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L95 ALIGN_4 .L92: mulps %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm0, %xmm5 movaps 8 * SIZE(AA), %xmm0 mulps 8 * SIZE(BB), %xmm0 addps %xmm0, %xmm6 movaps 12 * SIZE(AA), %xmm0 mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 movaps 20 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movaps 48 * SIZE(BB), %xmm3 mulps 20 * SIZE(BB), %xmm1 addps %xmm1, %xmm5 movaps 24 * SIZE(AA), %xmm1 mulps 24 * SIZE(BB), %xmm1 addps %xmm1, %xmm6 movaps 28 * SIZE(AA), %xmm1 mulps 28 * SIZE(BB), %xmm1 addps %xmm1, %xmm7 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L92 ALIGN_4 .L95: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(BB), %xmm2 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L96 ALIGN_4 .L98: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 #if defined(LN) || defined(RT) 
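/* LN/RT: recompute AA, B and BB from AORIG/BORIG and KK so they address
   the block needed for the substitution below. */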
movl KK, %eax #ifdef LN subl $4, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ BASE_SHIFT, %eax leal (AA, %eax, 4), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm0 movaps %xmm5, %xmm1 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm1 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movss 0 * SIZE(B), %xmm1 movss 1 * SIZE(B), %xmm3 movss 2 * SIZE(B), %xmm5 movss 3 * SIZE(B), %xmm7 subss %xmm4, %xmm1 subss %xmm6, %xmm3 subss %xmm0, %xmm5 subss %xmm2, %xmm7 #else movaps 0 * SIZE(AA), %xmm0 subps %xmm4, %xmm0 #endif #ifdef LN movaps 12 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm7 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm7, %xmm6 subss %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulss %xmm7, %xmm6 subss %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulss %xmm7, %xmm6 subss %xmm6, %xmm1 movaps 8 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulss %xmm5, %xmm6 subss %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulss %xmm5, %xmm6 subss %xmm6, %xmm1 movaps 4 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulss %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm1 movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm7 movaps 4 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulss %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm7 movaps 8 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulss %xmm5, %xmm6 subss %xmm6, %xmm7 movaps 12 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm7 #endif #if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) movss %xmm3, 1 * SIZE(B) movss %xmm5, 2 * SIZE(B) movss %xmm7, 3 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 movaps %xmm0, 0 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 movaps %xmm0, 4 * SIZE(BB) pshufd $0x00, %xmm5, %xmm0 movaps %xmm0, 8 * SIZE(BB) pshufd $0x00, %xmm7, %xmm0 movaps %xmm0, 12 * SIZE(BB) #else movss %xmm0, 0 * SIZE(AA) movss %xmm1, 1 * SIZE(AA) movss %xmm2, 2 * SIZE(AA) movss %xmm3, 3 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm1 unpcklps %xmm7, %xmm3 unpcklps %xmm3, %xmm1 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L91 ALIGN_4 .L100: testl $2, M je .L110 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $1 + BASE_SHIFT, %eax leal (, %eax, SIZE), 
%eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L105 ALIGN_4 .L102: mulps %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 16 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 movsd 10 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L102 ALIGN_4 .L105: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L108 ALIGN_4 .L106: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 2 * SIZE(AA), %xmm0 movaps 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L106 ALIGN_4 .L108: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ BASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) pshufd $1, %xmm4, %xmm6 movss 0 * SIZE(B), %xmm1 movss 1 * SIZE(B), %xmm3 subss %xmm4, %xmm1 subss %xmm6, %xmm3 #else #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 subps %xmm4, %xmm0 #endif #ifdef LN movaps 0 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm1 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm3 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm3 #endif #if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) movss %xmm3, 1 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 movaps %xmm0, 0 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 movaps %xmm0, 4 * SIZE(BB) #else movlps %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(CO1) movss %xmm3, 1 * SIZE(CO1) #else movlps %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, 
%eax addl %eax, AORIG #endif ALIGN_4 .L110: testl $1, M je .L119 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movss 0 * SIZE(AA), %xmm0 movss 4 * SIZE(AA), %xmm1 movss 0 * SIZE(BB), %xmm2 movss 16 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L115 ALIGN_4 .L112: mulss %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 movss 32 * SIZE(BB), %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm0, %xmm5 movss 2 * SIZE(AA), %xmm0 mulss 8 * SIZE(BB), %xmm0 addss %xmm0, %xmm6 movss 3 * SIZE(AA), %xmm0 mulss 12 * SIZE(BB), %xmm0 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm3 movss 5 * SIZE(AA), %xmm1 addss %xmm3, %xmm4 movss 48 * SIZE(BB), %xmm3 mulss 20 * SIZE(BB), %xmm1 addss %xmm1, %xmm5 movss 6 * SIZE(AA), %xmm1 mulss 24 * SIZE(BB), %xmm1 addss %xmm1, %xmm6 movss 7 * SIZE(AA), %xmm1 mulss 28 * SIZE(BB), %xmm1 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L112 ALIGN_4 .L115: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulss %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 movss 4 * SIZE(BB), %xmm2 addl $ 1 * SIZE, AA addl $ 4 * SIZE, BB decl %eax jg .L116 ALIGN_4 .L118: addss %xmm5, %xmm4 addss %xmm7, %xmm6 addss %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ BASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movss 0 * SIZE(B), %xmm1 subss %xmm4, %xmm1 #else movss 0 * SIZE(AA), %xmm0 subss %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) mulss 0 * SIZE(AA), %xmm1 #endif #if defined(RN) || defined(RT) mulss 0 * SIZE(B), %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 movaps %xmm0, 0 * SIZE(BB) #else movss %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(CO1) #else movss %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA, %eax, SIZE), AA #ifdef LT addl $1 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L119: #ifdef LN movl K, %eax leal (B, %eax, SIZE), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (B, %eax, SIZE), B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: movl OLD_STACK, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_LT_8x2_sse.S000066400000000000000000001764701313527062700215130ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
                                                                     */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or       */
/* without modification, are permitted provided that the following  */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above        */
/*      copyright notice, this list of conditions and the following */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above     */
/*      copyright notice, this list of conditions and the following */
/*      disclaimer in the documentation and/or other materials      */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/*    The views and conclusions contained in the software and        */
/*    documentation are those of the authors and should not be       */
/*    interpreted as representing official policies, either expressed */
/*    or implied, of The University of Texas at Austin.              */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#if !defined(HAVE_SSE) || !defined(HAVE_MMX)
#error You have to check your configuration.
#endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_A 20 + STACK + ARGS(%esi) #define STACK_B 24 + STACK + ARGS(%esi) #define STACK_C 28 + STACK + ARGS(%esi) #define STACK_LDC 32 + STACK + ARGS(%esi) #define STACK_OFFT 36 + STACK + ARGS(%esi) #define TRMASK 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define AORIG 56(%esp) #define BORIG 60(%esp) #define BUFFER 128(%esp) #ifdef HAVE_3DNOW #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) #else #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE 96 #endif #define B %edi #define AA %edx #define BB %ecx #define LDC %ebp #define CO1 %esi #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #if !defined(HAVE_SSE2) || defined(OPTERON) #define movsd movlps #endif #ifdef HAVE_SSE2 #define xorps pxor #endif PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE, %esp andl $-STACK_ALIGN, %esp STACK_TOUCHING movss STACK_M, %xmm0 movl STACK_N, %eax movss STACK_K, %xmm1 movss STACK_A, %xmm2 movl STACK_B, B movss STACK_C, %xmm3 movl STACK_LDC, LDC movss STACK_OFFT, %xmm4 movss %xmm1, K movl %eax, N movss %xmm0, M movss %xmm2, A movss %xmm3, C movl %esi, OLD_STACK movss %xmm4, OFFSET movss %xmm4, KK leal (, LDC, SIZE), LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif #if defined(LN) || defined(LT) movl $0x3f800000, 0 + TRMASK # 1.0 movl $0x00000000, 4 + TRMASK # 0.0 movl $0x3f800000, 8 + TRMASK # 1.0 movl $0x00000000, 12 + TRMASK # 0.0 #endif movl N, %eax sarl $1, %eax # j = (n >> 1) movl %eax, J jle .L100 ALIGN_2 .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $1 + BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L03 ALIGN_4 .L02: movsd 0 * SIZE(B), %xmm3 movhps 2 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm7 movhps 6 * SIZE(B), %xmm7 #ifdef HAVE_SSE2 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 #else movaps %xmm3, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm3, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm3, %xmm2 shufps $0xaa, %xmm2, %xmm2 shufps $0xff, %xmm3, %xmm3 movaps %xmm7, %xmm4 shufps $0x00, %xmm4, %xmm4 movaps %xmm7, %xmm5 shufps $0x55, %xmm5, %xmm5 movaps %xmm7, %xmm6 shufps $0xaa, %xmm6, %xmm6 shufps $0xff, %xmm7, %xmm7 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) addl $ 8 * SIZE, B addl $32 * 
SIZE, BB decl %eax BRANCH jne .L02 ALIGN_2 .L03: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L05 ALIGN_2 .L04: movsd 0 * SIZE(B), %xmm3 #ifdef HAVE_SSE2 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 #else movaps %xmm3, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm3, %xmm1 shufps $0x55, %xmm1, %xmm1 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) addl $2 * SIZE, B addl $8 * SIZE, BB decl %eax jne .L04 ALIGN_4 .L05: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif movl M, %ebx sarl $3, %ebx jle .L30 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $3 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $3 + BASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 PREFETCHW 7 * SIZE(CO1) PREFETCHW 7 * SIZE(CO1, LDC) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L12 ALIGN_2 .L11: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 0 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm3 mulps 12 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 8 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 12 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 12 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 24 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 24 * SIZE(AA), %xmm1 mulps %xmm0, %xmm2 mulps 20 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 20 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 20 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 mulps %xmm1, %xmm3 mulps 28 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 28 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 40 * SIZE(AA), %xmm1 mulps %xmm0, %xmm2 mulps 36 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 36 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 36 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 48 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 48 * SIZE(AA), %xmm0 mulps %xmm1, %xmm3 mulps 44 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 44 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 44 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 56 * SIZE(AA), %xmm1 mulps %xmm0, %xmm2 mulps 52 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 48 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 52 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 52 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 64 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 64 * SIZE(AA), %xmm0 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 60 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 
movaps 72 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 72 * SIZE(AA), %xmm1 addl $64 * SIZE, BB addl $64 * SIZE, AA decl %eax jne .L11 ALIGN_2 .L12: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 .L13: movaps 4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 0 * SIZE(BB), %xmm2 mulps %xmm0, %xmm1 movaps 4 * SIZE(AA), %xmm0 addps %xmm1, %xmm5 movaps 4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm1 movaps 8 * SIZE(AA), %xmm0 addps %xmm1, %xmm7 addl $8 * SIZE, AA addl $8 * SIZE, BB subl $1, %eax jg .L13 ALIGN_4 .L14: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $8, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 8), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm0 movaps %xmm6, %xmm1 unpcklps %xmm7, %xmm6 unpckhps %xmm7, %xmm1 movsd 0 * SIZE(B), %xmm2 movhps 2 * SIZE(B), %xmm2 movsd 4 * SIZE(B), %xmm3 movhps 6 * SIZE(B), %xmm3 movsd 8 * SIZE(B), %xmm5 movhps 10 * SIZE(B), %xmm5 movsd 12 * SIZE(B), %xmm7 movhps 14 * SIZE(B), %xmm7 subps %xmm4, %xmm2 subps %xmm0, %xmm3 subps %xmm6, %xmm5 subps %xmm1, %xmm7 #else movaps 0 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm1 movaps 8 * SIZE(AA), %xmm2 movaps 12 * SIZE(AA), %xmm3 subps %xmm4, %xmm0 subps %xmm6, %xmm1 subps %xmm5, %xmm2 subps %xmm7, %xmm3 #endif #if defined(LN) || defined(LT) movaps TRMASK, %xmm6 #endif #ifdef LN movss 63 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm7 movaps %xmm7, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 62 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movsd 60 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 58 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 56 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 54 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm7 movaps %xmm7, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 52 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 50 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 48 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 45 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm5 movaps %xmm5, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 44 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 42 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 40 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 36 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm5 movaps %xmm5, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 34 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 32 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 27 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 movaps %xmm3, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 26 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 24 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps 
%xmm0, %xmm2 movss 18 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 16 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 9 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 8 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 #endif #ifdef LT movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 movaps %xmm2, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 1 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 6 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 9 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 10 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 12 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 14 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 18 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 19 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 20 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 22 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 27 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 movaps %xmm3, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 28 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 30 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 36 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm5 movaps %xmm5, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 37 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 38 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 45 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm5 movaps %xmm5, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 46 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 54 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm7 movaps %xmm7, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 55 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 63 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm7 #endif #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 mulps %xmm6, %xmm1 movss 1 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 movaps %xmm6, %xmm5 mulps %xmm0, %xmm5 mulps %xmm1, %xmm6 subps %xmm5, %xmm2 subps %xmm6, %xmm3 movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm2 mulps %xmm6, %xmm3 #endif #ifdef RT movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm2 mulps %xmm6, %xmm3 movss 2 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 movaps %xmm6, %xmm5 mulps %xmm2, %xmm5 mulps 
%xmm3, %xmm6 subps %xmm5, %xmm0 subps %xmm6, %xmm1 movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 mulps %xmm6, %xmm1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) movhps %xmm2, 2 * SIZE(B) movlps %xmm3, 4 * SIZE(B) movhps %xmm3, 6 * SIZE(B) movlps %xmm5, 8 * SIZE(B) movhps %xmm5, 10 * SIZE(B) movlps %xmm7, 12 * SIZE(B) movhps %xmm7, 14 * SIZE(B) #ifdef HAVE_SSE2 pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm6 #else movaps %xmm2, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm2, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm2, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm2, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) #ifdef HAVE_SSE2 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm4 pshufd $0xff, %xmm3, %xmm6 #else movaps %xmm3, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm3, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm3, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm3, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 16 * SIZE(BB) movaps %xmm1, 20 * SIZE(BB) movaps %xmm4, 24 * SIZE(BB) movaps %xmm6, 28 * SIZE(BB) #ifdef HAVE_SSE2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xaa, %xmm5, %xmm4 pshufd $0xff, %xmm5, %xmm6 #else movaps %xmm5, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm5, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm5, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm5, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 32 * SIZE(BB) movaps %xmm1, 36 * SIZE(BB) movaps %xmm4, 40 * SIZE(BB) movaps %xmm6, 44 * SIZE(BB) #ifdef HAVE_SSE2 pshufd $0x00, %xmm7, %xmm0 pshufd $0x55, %xmm7, %xmm1 pshufd $0xaa, %xmm7, %xmm4 pshufd $0xff, %xmm7, %xmm6 #else movaps %xmm7, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm7, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm7, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm7, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 48 * SIZE(BB) movaps %xmm1, 52 * SIZE(BB) movaps %xmm4, 56 * SIZE(BB) movaps %xmm6, 60 * SIZE(BB) #else movaps %xmm0, 0 * SIZE(AA) movaps %xmm1, 4 * SIZE(AA) movaps %xmm2, 8 * SIZE(AA) movaps %xmm3, 12 * SIZE(AA) #endif #ifdef LN subl $8 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, %xmm0 shufps $0x88, %xmm3, %xmm2 shufps $0xdd, %xmm3, %xmm0 movaps %xmm5, %xmm4 shufps $0x88, %xmm7, %xmm5 shufps $0xdd, %xmm7, %xmm4 movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) movlps %xmm5, 4 * SIZE(CO1) movhps %xmm5, 6 * SIZE(CO1) movlps %xmm0, 0 * SIZE(CO1, LDC) movhps %xmm0, 2 * SIZE(CO1, LDC) movlps %xmm4, 4 * SIZE(CO1, LDC) movhps %xmm4, 6 * SIZE(CO1, LDC) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 4 * SIZE(CO1) movhps %xmm1, 6 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC) movhps %xmm2, 2 * SIZE(CO1, LDC) movlps %xmm3, 4 * SIZE(CO1, LDC) movhps %xmm3, 6 * SIZE(CO1, LDC) #endif #ifndef LN addl $8 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 8), AA #ifdef LT addl $16 * SIZE, B #endif #endif #ifdef LN subl $8, KK movl BORIG, B #endif #ifdef LT addl $8, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $3 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L10 ALIGN_2 .L30: testl $4, M jle .L50 #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $2 + BASE_SHIFT, %eax addl 
%eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L32 ALIGN_2 .L31: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 8 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 20 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps 12 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 28 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 48 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 mulps 36 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 40 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 20 * SIZE(AA), %xmm1 mulps %xmm1, %xmm2 mulps 44 * SIZE(BB), %xmm1 addps %xmm2, %xmm6 movaps 64 * SIZE(BB), %xmm2 addps %xmm1, %xmm7 movaps 24 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 52 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 80 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L31 ALIGN_2 .L32: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L34 .L33: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L33 ALIGN_4 .L34: addps %xmm6, %xmm4 addps %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 4), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm0 movsd 0 * SIZE(B), %xmm2 movhps 2 * SIZE(B), %xmm2 movsd 4 * SIZE(B), %xmm3 movhps 6 * SIZE(B), %xmm3 subps %xmm4, %xmm2 subps %xmm0, %xmm3 #else movaps 0 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm2 subps %xmm4, %xmm0 subps %xmm5, %xmm2 #endif #if defined(LN) || defined(LT) movaps TRMASK, %xmm6 #endif #ifdef LN movss 15 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 movaps %xmm3, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 14 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 12 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 10 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 8 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 5 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 4 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 #endif #ifdef LT movss 0 * SIZE(AA), %xmm0 shufps $0x00, 
%xmm6, %xmm0 mulps %xmm0, %xmm2 movaps %xmm2, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 1 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movss 5 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 6 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movss 10 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 11 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movss 15 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 #endif #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 movss 1 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 movaps %xmm6, %xmm5 mulps %xmm0, %xmm5 subps %xmm5, %xmm2 movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm2 #endif #ifdef RT movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm2 movss 2 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 movaps %xmm6, %xmm5 mulps %xmm2, %xmm5 subps %xmm5, %xmm0 movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) movhps %xmm2, 2 * SIZE(B) movlps %xmm3, 4 * SIZE(B) movhps %xmm3, 6 * SIZE(B) #ifdef HAVE_SSE2 pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm6 #else movaps %xmm2, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm2, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm2, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm2, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) #ifdef HAVE_SSE2 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm4 pshufd $0xff, %xmm3, %xmm6 #else movaps %xmm3, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm3, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm3, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm3, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 16 * SIZE(BB) movaps %xmm1, 20 * SIZE(BB) movaps %xmm4, 24 * SIZE(BB) movaps %xmm6, 28 * SIZE(BB) #else movaps %xmm0, 0 * SIZE(AA) movaps %xmm2, 4 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, %xmm0 shufps $0x88, %xmm3, %xmm2 shufps $0xdd, %xmm3, %xmm0 movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) movlps %xmm0, 0 * SIZE(CO1, LDC) movhps %xmm0, 2 * SIZE(CO1, LDC) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC) movhps %xmm2, 2 * SIZE(CO1, LDC) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L50: testl $2, M jle .L70 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $1 + BASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * 
SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L52 ALIGN_2 .L51: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L51 ALIGN_2 .L52: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L54 .L53: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L53 ALIGN_4 .L54: addps %xmm6, %xmm4 addps %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm4 movsd 0 * SIZE(B), %xmm2 movhps 2 * SIZE(B), %xmm2 subps %xmm4, %xmm2 #else #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 2 * SIZE(AA), %xmm2 subps %xmm4, %xmm0 subps %xmm5, %xmm2 #endif #if defined(LN) || defined(LT) movaps TRMASK, %xmm6 #endif #ifdef LN movss 3 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 2 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 #endif #ifdef LT movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 movaps %xmm2, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 1 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 3 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 #endif #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 movss 1 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 movaps %xmm6, %xmm5 mulps %xmm0, %xmm5 subps %xmm5, %xmm2 movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm2 #endif #ifdef RT movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, 
%xmm6 mulps %xmm6, %xmm2 movss 2 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 movaps %xmm6, %xmm5 mulps %xmm2, %xmm5 subps %xmm5, %xmm0 movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) movhps %xmm2, 2 * SIZE(B) #ifdef HAVE_SSE2 pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm6 #else movaps %xmm2, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm2, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm2, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm2, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) #else movlps %xmm0, 0 * SIZE(AA) movlps %xmm2, 2 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, %xmm0 shufps $0x88, %xmm3, %xmm2 shufps $0xdd, %xmm3, %xmm0 movlps %xmm2, 0 * SIZE(CO1) movlps %xmm0, 0 * SIZE(CO1, LDC) #else movlps %xmm0, 0 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L70: testl $1, M jle .L99 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $BASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movss 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movss 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L72 ALIGN_2 .L71: mulss %xmm0, %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 20 * SIZE(BB), %xmm0 addss %xmm3, %xmm4 movss 24 * SIZE(BB), %xmm3 addss %xmm0, %xmm5 movss 3 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 28 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 48 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 mulss 36 * SIZE(BB), %xmm1 addss %xmm2, %xmm4 movss 40 * SIZE(BB), %xmm2 addss %xmm1, %xmm5 movss 5 * SIZE(AA), %xmm1 mulss %xmm1, %xmm2 mulss 44 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 64 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 52 * SIZE(BB), %xmm1 addss %xmm3, %xmm4 movss 56 * SIZE(BB), %xmm3 addss %xmm1, %xmm5 movss 7 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 60 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 80 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L71 ALIGN_2 .L72: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L74 .L73: mulss %xmm0, %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L73 ALIGN_4 
.L74: addss %xmm6, %xmm4 addss %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm4 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 0 * SIZE(B), %xmm2 subps %xmm4, %xmm2 #else movss 0 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm2 subss %xmm4, %xmm0 subss %xmm5, %xmm2 #endif #if defined(LN) || defined(LT) movaps TRMASK, %xmm6 #endif #if defined(LN) || defined(LT) movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 #endif #ifdef RN movss 0 * SIZE(B), %xmm6 mulss %xmm6, %xmm0 movss 1 * SIZE(B), %xmm6 movaps %xmm6, %xmm5 mulss %xmm0, %xmm5 subss %xmm5, %xmm2 movss 3 * SIZE(B), %xmm6 mulss %xmm6, %xmm2 #endif #ifdef RT movss 3 * SIZE(B), %xmm6 mulss %xmm6, %xmm2 movss 2 * SIZE(B), %xmm6 movaps %xmm6, %xmm5 mulss %xmm2, %xmm5 subss %xmm5, %xmm0 movss 0 * SIZE(B), %xmm6 mulss %xmm6, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) movaps %xmm2, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm2, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) #else movss %xmm0, 0 * SIZE(AA) movss %xmm2, 1 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, %xmm0 shufps $0x88, %xmm3, %xmm2 shufps $0xdd, %xmm3, %xmm0 movss %xmm2, 0 * SIZE(CO1) movss %xmm0, 0 * SIZE(CO1, LDC) #else movss %xmm0, 0 * SIZE(CO1) movss %xmm2, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L99: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 2), B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L01 ALIGN_2 .L100: testl $1, N jle .L999 #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax jle .L103 ALIGN_4 .L102: movsd 0 * SIZE(B), %xmm3 movhps 2 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm7 movhps 6 * SIZE(B), %xmm7 #ifdef HAVE_SSE2 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 #else movaps %xmm3, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm3, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm3, %xmm2 shufps $0xaa, %xmm2, %xmm2 shufps $0xff, %xmm3, %xmm3 movaps %xmm7, %xmm4 shufps $0x00, %xmm4, %xmm4 movaps %xmm7, %xmm5 shufps $0x55, %xmm5, %xmm5 movaps %xmm7, %xmm6 shufps $0xaa, %xmm6, %xmm6 shufps $0xff, %xmm7, %xmm7 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 
20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) addl $ 8 * SIZE, B addl $32 * SIZE, BB decl %eax BRANCH jne .L102 ALIGN_2 .L103: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH jle .L105 ALIGN_2 .L104: movss 0 * SIZE(B), %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm0, 0 * SIZE(BB) addl $1 * SIZE, B addl $4 * SIZE, BB decl %eax jne .L104 ALIGN_4 .L105: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif movl M, %ebx sarl $3, %ebx # i = (m >> 2) jle .L130 ALIGN_4 .L110: #ifdef LN movl K, %eax sall $3 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $3 + BASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 PREFETCHW 7 * SIZE(CO1) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L112 ALIGN_2 .L111: mulps %xmm2, %xmm0 mulps 4 * SIZE(AA), %xmm2 addps %xmm0, %xmm4 movaps 8 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 movaps 4 * SIZE(BB), %xmm2 mulps %xmm2, %xmm0 mulps 12 * SIZE(AA), %xmm2 addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 8 * SIZE(BB), %xmm2 mulps %xmm2, %xmm1 mulps 20 * SIZE(AA), %xmm2 addps %xmm1, %xmm4 movaps 24 * SIZE(AA), %xmm1 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm2, %xmm1 mulps 28 * SIZE(AA), %xmm2 addps %xmm1, %xmm5 movaps 48 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm3, %xmm0 mulps 36 * SIZE(AA), %xmm3 addps %xmm0, %xmm4 movaps 40 * SIZE(AA), %xmm0 addps %xmm3, %xmm6 movaps 20 * SIZE(BB), %xmm3 mulps %xmm3, %xmm0 mulps 44 * SIZE(AA), %xmm3 addps %xmm0, %xmm5 movaps 64 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 24 * SIZE(BB), %xmm3 mulps %xmm3, %xmm1 mulps 52 * SIZE(AA), %xmm3 addps %xmm1, %xmm4 movaps 56 * SIZE(AA), %xmm1 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm3, %xmm1 mulps 60 * SIZE(AA), %xmm3 addps %xmm1, %xmm5 movaps 80 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 addl $64 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L111 ALIGN_2 .L112: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L114 .L113: movaps 0 * SIZE(BB), %xmm2 movaps 0 * SIZE(AA), %xmm0 mulps %xmm2, %xmm0 addps %xmm0, %xmm4 mulps 4 * SIZE(AA), %xmm2 addps %xmm2, %xmm6 addl $8 * SIZE, AA addl $4 * SIZE, BB subl $1, %eax jg .L113 ALIGN_4 .L114: addps %xmm5, %xmm4 addps %xmm7, %xmm6 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $8, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 8), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm2 movhps 2 * SIZE(B), %xmm2 movsd 4 * SIZE(B), %xmm5 movhps 6 * SIZE(B), %xmm5 subps %xmm4, %xmm2 subps %xmm6, %xmm5 xorps %xmm0, %xmm0 movaps %xmm2, %xmm3 unpcklps %xmm0, %xmm2 unpckhps %xmm0, %xmm3 movaps %xmm5, %xmm7 unpcklps %xmm0, %xmm5 unpckhps %xmm0, %xmm7 #else movaps 0 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm1 subps %xmm4, %xmm0 subps %xmm6, %xmm1 #endif #if defined(LN) || defined(LT) 
movaps TRMASK, %xmm6 #endif #ifdef LN movss 63 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm7 movaps %xmm7, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 62 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movsd 60 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 58 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 56 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 54 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm7 movaps %xmm7, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 52 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 50 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 48 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 45 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm5 movaps %xmm5, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 44 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 42 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 40 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 36 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm5 movaps %xmm5, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 34 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 32 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 27 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 movaps %xmm3, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 26 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 24 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 18 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 16 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 9 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 8 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 #endif #ifdef LT movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 movaps %xmm2, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 1 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 6 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 9 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 10 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 12 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 14 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 18 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 19 * 
SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 20 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 22 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 27 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 movaps %xmm3, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 28 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 30 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 36 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm5 movaps %xmm5, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 37 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 38 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 45 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm5 movaps %xmm5, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 46 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 54 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm7 movaps %xmm7, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 55 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 63 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm7 #endif #if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 mulps %xmm6, %xmm1 #endif #if defined(LN) || defined(LT) shufps $0x88, %xmm3, %xmm2 shufps $0x88, %xmm7, %xmm5 movlps %xmm2, 0 * SIZE(B) movhps %xmm2, 2 * SIZE(B) movlps %xmm5, 4 * SIZE(B) movhps %xmm5, 6 * SIZE(B) #ifdef HAVE_SSE2 pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm6 #else movaps %xmm2, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm2, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm2, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm2, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) #ifdef HAVE_SSE2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xaa, %xmm5, %xmm4 pshufd $0xff, %xmm5, %xmm6 #else movaps %xmm5, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm5, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm5, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm5, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 16 * SIZE(BB) movaps %xmm1, 20 * SIZE(BB) movaps %xmm4, 24 * SIZE(BB) movaps %xmm6, 28 * SIZE(BB) #else movaps %xmm0, 0 * SIZE(AA) movaps %xmm1, 4 * SIZE(AA) #endif #ifdef LN subl $8 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) movlps %xmm5, 4 * SIZE(CO1) movhps %xmm5, 6 * SIZE(CO1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 4 * SIZE(CO1) movhps %xmm1, 6 * SIZE(CO1) #endif #ifndef LN addl $8 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 8), AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $8, KK movl BORIG, B #endif #ifdef LT addl $8, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $3 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L110 ALIGN_2 .L130: testl $4, M jle .L150 #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $2 + 
BASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movsd 0 * SIZE(AA), %xmm0 movhps 2 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movsd 16 * SIZE(AA), %xmm1 movhps 18 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L132 ALIGN_2 .L131: mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 mulps 4 * SIZE(BB), %xmm0 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 8 * SIZE(AA), %xmm0 mulps 8 * SIZE(BB), %xmm0 addps %xmm0, %xmm6 movaps 12 * SIZE(AA), %xmm0 mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 mulps %xmm1, %xmm3 movaps 20 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 mulps 20 * SIZE(BB), %xmm1 movaps 48 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 24 * SIZE(AA), %xmm1 mulps 24 * SIZE(BB), %xmm1 addps %xmm1, %xmm6 movaps 28 * SIZE(AA), %xmm1 mulps 28 * SIZE(BB), %xmm1 addps %xmm1, %xmm7 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L131 ALIGN_2 .L132: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L134 .L133: movaps 0 * SIZE(BB), %xmm2 movaps 0 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L133 ALIGN_4 .L134: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 4), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm2 movhps 2 * SIZE(B), %xmm2 subps %xmm4, %xmm2 xorps %xmm5, %xmm5 movaps %xmm2, %xmm3 unpcklps %xmm5, %xmm2 unpckhps %xmm5, %xmm3 #else movaps 0 * SIZE(AA), %xmm0 subps %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movaps TRMASK, %xmm6 #endif #ifdef LN movss 15 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 movaps %xmm3, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 14 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 12 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 10 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 8 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 5 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 4 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 #endif #ifdef LT movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 movaps %xmm2, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 1 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movss 5 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 6 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movss 10 * SIZE(AA), %xmm0 shufps $0x00, 
%xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 11 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movss 15 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 #endif #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 #endif #ifdef RT movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 #endif #if defined(LN) || defined(LT) shufps $0x88, %xmm3, %xmm2 movlps %xmm2, 0 * SIZE(B) movhps %xmm2, 2 * SIZE(B) #ifdef HAVE_SSE2 pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm6 #else movaps %xmm2, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm2, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm2, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm2, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) #else movaps %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L150: testl $2, M jle .L170 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $1 + BASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L152 ALIGN_2 .L151: mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 16 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 movsd 10 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L151 ALIGN_2 .L152: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L154 .L153: mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L153 ALIGN_4 .L154: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif 
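/* Two-row tail of the M loop: the partial sums were just folded into xmm4; the leal arithmetic below uses the KK offset to point AA (packed A), B (packed B) and the expanded BUFFER copy BB back at the block needed for the triangular solve. */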
movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm5 shufps $1, %xmm5, %xmm5 movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 subss %xmm4, %xmm0 subss %xmm5, %xmm1 #else #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 subps %xmm4, %xmm0 #endif #ifdef LN movaps 0 * SIZE(AA), %xmm4 movaps %xmm4, %xmm6 shufps $0xff, %xmm6, %xmm6 mulss %xmm6, %xmm1 movaps %xmm4, %xmm6 shufps $0xaa, %xmm6, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm0 mulss %xmm4, %xmm0 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm4 mulss %xmm4, %xmm0 movaps %xmm4, %xmm6 shufps $0x55, %xmm6, %xmm6 mulss %xmm0, %xmm6 subss %xmm6, %xmm1 movaps %xmm4, %xmm6 shufps $0xff, %xmm6, %xmm6 mulss %xmm6, %xmm1 #endif #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 #endif #ifdef RT movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm0, 0 * SIZE(B) movss %xmm1, 1 * SIZE(B) shufps $0x00, %xmm0, %xmm0 shufps $0x00, %xmm1, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) #else movlps %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm0, 0 * SIZE(CO1) movss %xmm1, 1 * SIZE(CO1) #else movlps %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L170: testl $1, M jle .L179 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movss 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movss 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L172 ALIGN_2 .L171: mulss %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 mulss 4 * SIZE(BB), %xmm0 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 2 * SIZE(AA), %xmm0 mulss 8 * SIZE(BB), %xmm0 addss %xmm0, %xmm6 movss 3 * SIZE(AA), %xmm0 mulss 12 * SIZE(BB), %xmm0 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm3 movss 5 * SIZE(AA), %xmm1 addss %xmm3, %xmm4 mulss 20 * SIZE(BB), %xmm1 movss 48 * SIZE(BB), %xmm3 addss %xmm1, %xmm5 movss 6 * SIZE(AA), %xmm1 mulss 24 * SIZE(BB), %xmm1 addss %xmm1, %xmm6 movss 7 * SIZE(AA), %xmm1 mulss 28 * SIZE(BB), %xmm1 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L171 ALIGN_2 .L172: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L174 .L173: movss 0 * SIZE(AA), %xmm0 movss 0 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm4 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L173 ALIGN_4 .L174: addss %xmm5, %xmm4 addss %xmm7, %xmm6 addss %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ BASE_SHIFT, %eax leal (AA, %eax, 1), AA 
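/* One-row tail: once the pointers are repositioned, the solve below is a single subtract (subss) of the accumulated product followed by one mulss with the packed diagonal entry, so no division is performed here. */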
leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movss 0 * SIZE(B), %xmm1 subss %xmm4, %xmm1 #else movss 0 * SIZE(AA), %xmm0 subss %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) mulss 0 * SIZE(AA), %xmm1 #endif #if defined(RN) || defined(RT) mulss 0 * SIZE(B), %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) shufps $0x00, %xmm1, %xmm1 movaps %xmm1, 0 * SIZE(BB) #else movss %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(CO1) #else movss %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA, %eax, SIZE), AA #ifdef LT addl $1 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L179: #ifdef LN movl K, %eax leal (B, %eax, SIZE), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (B, %eax, SIZE), B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: movl OLD_STACK, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_RT_1x4.S000066400000000000000000000476151313527062700206400ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 32 #define J 0 + STACK(%esp) #define I 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #define AORIG 16 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #ifdef DOUBLE #define STACK_A 24 + STACK + ARGS(%esp) #define STACK_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define STACK_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #else #define STACK_A 20 + STACK + ARGS(%esp) #define STACK_B 24 + STACK + ARGS(%esp) #define C 28 + STACK + ARGS(%esp) #define STACK_LDC 32 + STACK + ARGS(%esp) #define OFFSET 36 + STACK + ARGS(%esp) #endif #define A %edx #define B %ecx #define B_ORIG %ebx #define LDC %ebp #define PREFETCHSIZE (5 + 8 * 10) PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_B, B_ORIG movl STACK_LDC, LDC leal (, LDC, SIZE), LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, STACK_A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B_ORIG movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN movl OFFSET, %eax negl %eax movl %eax, KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif subl $-16 * SIZE, B_ORIG subl $-16 * SIZE, STACK_A movl M, %eax testl %eax, %eax jle .L999 movl N, %eax testl %eax, %eax jle .L999 movl K, %eax testl %eax, %eax jle .L999 movl N, %eax andl $1, %eax je .L20 ALIGN_3 .L31: #if defined(LT) || defined(RN) movl STACK_A, A #else movl STACK_A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, B_ORIG #endif #ifdef RT subl LDC, C #endif movl C, %edi #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl B_ORIG, B #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $5, %eax jle .L33 ALIGN_4 .L32: movl -16 * SIZE(B), %esi movl -8 * SIZE(B), %esi movl 0 * SIZE(B), %esi movl 8 * SIZE(B), %esi subl $-32 * SIZE, B decl %eax jne .L32 ALIGN_3 .L33: movl M, %esi movl %esi, I ALIGN_3 .L34: #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax leal (, %eax, SIZE), %eax movl AORIG, A leal (A , %eax, 1), A leal (B_ORIG, %eax, 1), B #else movl B_ORIG, B #endif fldz fldz fldz fldz prefetchw 1 * SIZE(%edi) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L36 ALIGN_3 .L35: FLD -16 * SIZE(A) FMUL -16 * SIZE(B) faddp %st, %st(1) FLD -15 * SIZE(A) FMUL -15 * SIZE(B) faddp %st, %st(2) FLD -14 * SIZE(A) FMUL -14 * SIZE(B) faddp %st, %st(3) FLD -13 * SIZE(A) FMUL -13 * SIZE(B) faddp %st, %st(4) FLD -12 * SIZE(A) FMUL -12 * SIZE(B) faddp %st, %st(1) FLD -11 * SIZE(A) FMUL -11 * SIZE(B) faddp %st, %st(2) FLD -10 * SIZE(A) FMUL -10 * SIZE(B) faddp %st, %st(3) FLD -9 * SIZE(A) FMUL -9 * SIZE(B) faddp %st, %st(4) addl $8 * SIZE, A addl $8 * SIZE, B decl %eax jne .L35 ALIGN_4 .L36: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $7, %eax je .L39 ALIGN_4 .L37: FLD -16 * SIZE(A) FMUL -16 * SIZE(B) faddp %st, %st(1) addl $1 * SIZE,A addl $1 * SIZE,B decl %eax jne .L37 ALIGN_4 .L39: faddp %st, %st(2) faddp %st, %st(2) faddp %st, 
%st(1) #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, A leal (A, %eax, SIZE), A leal (B_ORIG, %eax, SIZE), B #endif #if defined(LN) || defined(LT) FLD 0 * SIZE - 16 * SIZE(B) fsubp %st, %st(1) #else FLD 0 * SIZE - 16 * SIZE(A) fsubp %st, %st(1) #endif #if defined(LN) || defined(LT) FLD 0 * SIZE - 16 * SIZE(A) fmulp %st, %st(1) #endif #if defined(RN) || defined(RT) FMUL 0 * SIZE - 16 * SIZE(B) #endif #ifdef LN subl $1 * SIZE, %edi #endif #if defined(LN) || defined(LT) FSTU 0 * SIZE - 16 * SIZE(B) #else FSTU 0 * SIZE - 16 * SIZE(A) #endif FST 0 * SIZE(%edi) #ifndef LN addl $1 * SIZE, %edi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (A, %eax, SIZE), A leal (B, %eax, SIZE), B #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif decl I jne .L34 #ifdef LN movl K, %eax leal ( , %eax, SIZE), %eax leal (B_ORIG, %eax, 1), B_ORIG #endif #if defined(LT) || defined(RN) movl B, B_ORIG #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L20: movl N, %eax andl $2, %eax je .L30 #if defined(LT) || defined(RN) movl STACK_A, A #else movl STACK_A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B_ORIG #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, %edi #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl B_ORIG, B #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $4, %eax jle .L23 ALIGN_4 .L22: movl -16 * SIZE(B), %esi movl -8 * SIZE(B), %esi movl 0 * SIZE(B), %esi movl 8 * SIZE(B), %esi subl $-32 * SIZE, B decl %eax jne .L22 ALIGN_3 .L23: movl M, %esi movl %esi, I ALIGN_3 .L24: #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax leal (, %eax, SIZE), %eax movl AORIG, A leal (A , %eax, 1), A leal (B_ORIG, %eax, 2), B #else movl B_ORIG, B #endif fldz fldz fldz fldz FLD -16 * SIZE(A) FLD -16 * SIZE(B) prefetchw 1 * SIZE(%edi) prefetchw 1 * SIZE(%edi, LDC) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L26 ALIGN_3 .L25: fmul %st(1), %st faddp %st, %st(2) FMUL -15 * SIZE(B) faddp %st, %st(2) FLD -15 * SIZE(A) FLD -14 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) FMUL -13 * SIZE(B) faddp %st, %st(4) FLD -14 * SIZE(A) FLD -12 * SIZE(B) fmul %st(1), %st faddp %st, %st(2) FMUL -11 * SIZE(B) faddp %st, %st(2) FLD -13 * SIZE(A) FLD -10 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) FMUL -9 * SIZE(B) faddp %st, %st(4) FLD -12 * SIZE(A) FLD -8 * SIZE(B) fmul %st(1), %st faddp %st, %st(2) FMUL -7 * SIZE(B) faddp %st, %st(2) FLD -11 * SIZE(A) FLD -6 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) FMUL -5 * SIZE(B) faddp %st, %st(4) FLD -10 * SIZE(A) FLD -4 * SIZE(B) fmul %st(1), %st faddp %st, %st(2) FMUL -3 * SIZE(B) faddp %st, %st(2) FLD -9 * SIZE(A) FLD -2 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) FMUL -1 * SIZE(B) faddp %st, %st(4) FLD -8 * SIZE(A) FLD 0 * SIZE(B) addl $ 8 * SIZE, A subl $-16 * SIZE, B decl %eax jne .L25 ALIGN_4 .L26: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $7, %eax je .L29 ALIGN_4 .L27: fmul %st(1), %st faddp %st, %st(2) FMUL -15 * SIZE(B) faddp %st, %st(2) FLD -15 * SIZE(A) FLD -14 * SIZE(B) addl $1 * SIZE,A addl $2 * SIZE,B decl %eax jne .L27 ALIGN_4 .L29: ffreep %st(0) ffreep %st(0) faddp %st, 
%st(2) faddp %st, %st(2) #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif leal (, %eax, SIZE), %eax movl AORIG, A leal (A, %eax, 1), A leal (B_ORIG, %eax, 2), B #endif #if defined(LN) || defined(LT) FLD 0 * SIZE - 16 * SIZE(B) fsubp %st, %st(1) FLD 1 * SIZE - 16 * SIZE(B) fsubp %st, %st(2) #else FLD 0 * SIZE - 16 * SIZE(A) fsubp %st, %st(1) FLD 1 * SIZE - 16 * SIZE(A) fsubp %st, %st(2) #endif #ifdef LN FLD 0 * SIZE - 16 * SIZE(A) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef LT FLD 0 * SIZE - 16 * SIZE(A) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef RN FMUL 0 * SIZE - 16 * SIZE(B) FLD 1 * SIZE - 16 * SIZE(B) fmul %st(1), %st fsubrp %st, %st(2) FLD 3 * SIZE - 16 * SIZE(B) fmulp %st, %st(2) #endif #ifdef RT FLD 3 * SIZE - 16 * SIZE(B) fmulp %st, %st(2) FLD 2 * SIZE - 16 * SIZE(B) fmul %st(2), %st fsubrp %st, %st(1) FLD 0 * SIZE - 16 * SIZE(B) fmulp %st, %st(1) #endif #ifdef LN subl $1 * SIZE, %edi #endif #if defined(LN) || defined(LT) FSTU 0 * SIZE - 16 * SIZE(B) fxch %st(1) FSTU 1 * SIZE - 16 * SIZE(B) #else FSTU 0 * SIZE - 16 * SIZE(A) fxch %st(1) FSTU 1 * SIZE - 16 * SIZE(A) #endif FST 0 * SIZE(%edi, LDC) FST 0 * SIZE(%edi) #ifndef LN addl $1 * SIZE, %edi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (A, %eax, 1), A leal (B, %eax, 2), B #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif decl I jne .L24 #ifdef LN movl K, %eax leal ( , %eax, SIZE), %eax leal (B_ORIG, %eax, 2), B_ORIG #endif #if defined(LT) || defined(RN) movl B, B_ORIG #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif ALIGN_4 .L30: movl N, %eax sarl $2, %eax movl %eax, J je .L999 ALIGN_3 .L11: #if defined(LT) || defined(RN) movl STACK_A, A #else movl STACK_A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, B_ORIG #endif leal (, LDC, 4), %eax #ifdef RT subl %eax, C #endif movl C, %edi #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl B_ORIG, B #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $4, %eax jle .L13 ALIGN_4 .L12: movl -16 * SIZE(B), %esi movl -8 * SIZE(B), %esi movl 0 * SIZE(B), %esi movl 8 * SIZE(B), %esi movl 16 * SIZE(B), %esi movl 24 * SIZE(B), %esi movl 32 * SIZE(B), %esi movl 40 * SIZE(B), %esi subl $-64 * SIZE, B decl %eax jne .L12 ALIGN_3 .L13: movl M, %esi movl %esi, I ALIGN_3 .L14: #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax leal (, %eax, SIZE), %eax movl AORIG, A leal (A , %eax, 1), A leal (B_ORIG, %eax, 4), B #else movl B_ORIG, B #endif leal (%edi, LDC, 2), %eax fldz fldz fldz fldz FLD -8 * SIZE(A) FLD -16 * SIZE(A) FLD -16 * SIZE(B) movl $32 * SIZE, %esi prefetchw 1 * SIZE(%edi) prefetchw 1 * SIZE(%edi, LDC) prefetchw 1 * SIZE(%eax) prefetchw 1 * SIZE(%eax, LDC) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L16 ALIGN_3 .L15: fmul %st(1), %st faddp %st, %st(3) PADDING FLD -15 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD -14 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL -13 * SIZE(B) faddp %st, %st(5) FLD -15 * SIZE(A) FLD -12 * SIZE(B) fmul %st(1), %st faddp %st, %st(3) PADDING FLD -11 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD -10 * SIZE(B) fmul %st(1), %st faddp 
%st, %st(5) PADDING FMUL -9 * SIZE(B) faddp %st, %st(5) FLD -14 * SIZE(A) FLD -8 * SIZE(B) fmul %st(1), %st faddp %st, %st(3) PADDING FLD -7 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD -6 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL -5 * SIZE(B) faddp %st, %st(5) FLD -13 * SIZE(A) FLD -4 * SIZE(B) fmul %st(1), %st faddp %st, %st(3) PADDING FLD -3 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD -2 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL -1 * SIZE(B) faddp %st, %st(5) FLD -12 * SIZE(A) FLD 0 * SIZE(B) fmul %st(1), %st faddp %st, %st(3) PADDING FLD 1 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD 2 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL 3 * SIZE(B) faddp %st, %st(5) FLD -11 * SIZE(A) FLD 4 * SIZE(B) fmul %st(1), %st faddp %st, %st(3) PADDING FLD 5 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD 6 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL 7 * SIZE(B) faddp %st, %st(5) FLD -10 * SIZE(A) FLD 8 * SIZE(B) fmul %st(1), %st faddp %st, %st(3) PADDING FLD 9 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD 10 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL 11 * SIZE(B) faddp %st, %st(5) FLD -9 * SIZE(A) FLD 12 * SIZE(B) fmul %st(1), %st faddp %st, %st(3) PADDING FLD 13 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD 14 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL 15 * SIZE(B) faddp %st, %st(5) FLD 0 * SIZE(A) PADDING prefetch PREFETCHSIZE * SIZE(A) addl $8 * SIZE, A fxch %st(1) addl $32 * SIZE, B FLD -16 * SIZE(B) decl %eax jne .L15 ALIGN_4 .L16: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $7, %eax je .L19 ALIGN_4 .L17: fmul %st(1), %st faddp %st, %st(3) FLD -15 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) FLD -14 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) FMUL -13 * SIZE(B) faddp %st, %st(5) FLD -15 * SIZE(A) FLD -12 * SIZE(B) addl $1 * SIZE,A addl $4 * SIZE,B decl %eax jne .L17 ALIGN_4 .L19: ffreep %st(0) ffreep %st(0) ffreep %st(0) #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $4, %eax #endif leal (, %eax, SIZE), %eax movl AORIG, A leal (A, %eax, 1), A leal (B_ORIG, %eax, 4), B #endif #if defined(LN) || defined(LT) FLD 0 * SIZE - 16 * SIZE(B) fsubp %st, %st(1) FLD 1 * SIZE - 16 * SIZE(B) fsubp %st, %st(2) FLD 2 * SIZE - 16 * SIZE(B) fsubp %st, %st(3) FLD 3 * SIZE - 16 * SIZE(B) fsubp %st, %st(4) #else FLD 0 * SIZE - 16 * SIZE(A) fsubp %st, %st(1) FLD 1 * SIZE - 16 * SIZE(A) fsubp %st, %st(2) FLD 2 * SIZE - 16 * SIZE(A) fsubp %st, %st(3) FLD 3 * SIZE - 16 * SIZE(A) fsubp %st, %st(4) #endif #ifdef LN FLD 0 * SIZE - 16 * SIZE(A) fmul %st, %st(1) fmul %st, %st(2) fmul %st, %st(3) fmulp %st, %st(4) #endif #ifdef LT FLD 0 * SIZE - 16 * SIZE(A) fmul %st, %st(1) fmul %st, %st(2) fmul %st, %st(3) fmulp %st, %st(4) #endif #ifdef RN FMUL 0 * SIZE - 16 * SIZE(B) FLD 1 * SIZE - 16 * SIZE(B) fmul %st(1), %st fsubrp %st, %st(2) FLD 2 * SIZE - 16 * SIZE(B) fmul %st(1), %st fsubrp %st, %st(3) FLD 3 * SIZE - 16 * SIZE(B) fmul %st(1), %st fsubrp %st, %st(4) FLD 5 * SIZE - 16 * SIZE(B) fmulp %st, %st(2) FLD 6 * SIZE - 16 * SIZE(B) fmul %st(2), %st fsubrp %st, %st(3) FLD 7 * SIZE - 16 * SIZE(B) fmul %st(2), %st fsubrp %st, %st(4) FLD 10 * SIZE - 16 * SIZE(B) fmulp %st, %st(3) FLD 11 * SIZE - 16 * SIZE(B) fmul %st(3), %st fsubrp %st, %st(4) FLD 15 * SIZE - 16 * SIZE(B) fmulp %st, %st(4) #endif #ifdef RT FLD 15 * SIZE - 16 * SIZE(B) fmulp %st, %st(4) FLD 14 * SIZE - 16 * SIZE(B) fmul %st(4), %st 
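/* RT solve, x87 version: the four accumulated results are substituted against the 4x4 triangular block of packed B, starting from the diagonal entry at offset 15 and working back toward offset 0; only multiplies and subtracts appear, so the stored diagonal entries are evidently already reciprocals. */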
fsubrp %st, %st(3) FLD 13 * SIZE - 16 * SIZE(B) fmul %st(4), %st fsubrp %st, %st(2) FLD 12 * SIZE - 16 * SIZE(B) fmul %st(4), %st fsubrp %st, %st(1) FLD 10 * SIZE - 16 * SIZE(B) fmulp %st, %st(3) FLD 9 * SIZE - 16 * SIZE(B) fmul %st(3), %st fsubrp %st, %st(2) FLD 8 * SIZE - 16 * SIZE(B) fmul %st(3), %st fsubrp %st, %st(1) FLD 5 * SIZE - 16 * SIZE(B) fmulp %st, %st(2) FLD 4 * SIZE - 16 * SIZE(B) fmul %st(2), %st fsubrp %st, %st(1) FLD 0 * SIZE - 16 * SIZE(B) fmulp %st, %st(1) #endif #ifdef LN subl $1 * SIZE, %edi #endif #if defined(LN) || defined(LT) FSTU 0 * SIZE - 16 * SIZE(B) fxch %st(1) FSTU 1 * SIZE - 16 * SIZE(B) fxch %st(2) FSTU 2 * SIZE - 16 * SIZE(B) fxch %st(3) FSTU 3 * SIZE - 16 * SIZE(B) #else FSTU 0 * SIZE - 16 * SIZE(A) fxch %st(1) FSTU 1 * SIZE - 16 * SIZE(A) fxch %st(2) FSTU 2 * SIZE - 16 * SIZE(A) fxch %st(3) FSTU 3 * SIZE - 16 * SIZE(A) #endif leal (%edi, LDC, 2), %eax FST 0 * SIZE(%eax, LDC) FST 0 * SIZE(%edi) FST 0 * SIZE(%edi, LDC) FST 0 * SIZE(%eax) #ifndef LN addl $1 * SIZE, %edi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (A, %eax, 1), A leal (B, %eax, 4), B #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif decl I jne .L14 #ifdef LN movl K, %eax leal ( , %eax, SIZE), %eax leal (B_ORIG, %eax, 4), B_ORIG #endif #if defined(LT) || defined(RN) movl B, B_ORIG #endif #ifdef RN addl $4, KK #endif #ifdef RT subl $4, KK #endif decl J jne .L11 ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_RT_2x2.S000066400000000000000000000443461313527062700206350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #ifdef DOUBLE #define A 24 + STACK + ARGS(%esp) #define B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #else #define A 20 + STACK + ARGS(%esp) #define B 24 + STACK + ARGS(%esp) #define C 28 + STACK + ARGS(%esp) #define LDC 32 + STACK + ARGS(%esp) #define OFFSET 36 + STACK + ARGS(%esp) #endif #define PREFETCH_OFFSET 48 #if defined(PENTIUM3) || defined(PENTIUMM) #define REP rep #else #define REP rep #endif #define AA %edx #define BB %ecx PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl LDC, %ebp # ldc # MEMORY movl B, %ebx leal (, %ebp, SIZE), %ebp #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, %ebx movl N, %eax imull %ebp, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax # n # MEMORY andl $1, %eax je .L8 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, %ebx #endif #ifdef RT subl %ebp, C #endif movl C, %edi # c # MEMORY #ifndef RT addl %ebp, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %esi # m # MEMORY sarl $1, %esi # m >> 1 je .L36 ALIGN_4 .L46: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 2), AA leal (%ebx, %eax, 1), BB #else movl %ebx, BB #endif fldz fldz FLD 0 * SIZE(BB) # temp1 = *(boffset + 0) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $1, %eax je .L56 ALIGN_4 .L57: FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0) fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0) faddp %st, %st(2) FLD 1 * SIZE(BB) # temp1 = *(boffset + 0) FLD 2 * SIZE(AA) # temp2 = *(aoffset + 0) fmul %st(1), %st faddp %st, %st(2) FMUL 3 * SIZE(AA) # temp2 = *(aoffset + 0) faddp %st, %st(2) FLD 2 * SIZE(BB) # temp1 = *(boffset + 0) addl $4 * SIZE,AA addl $2 * SIZE,BB dec %eax jne .L57 ALIGN_4 .L56: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $1, %eax je .L45 ALIGN_4 FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0) fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0) faddp %st, %st(2) FLD 3 * SIZE(BB) # temp1 = *(boffset + 0) addl $2 * SIZE,AA addl $1 * SIZE,BB ALIGN_4 .L45: ffreep %st(0) #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 2), AA leal (%ebx, %eax, 1), BB #endif #if defined(LN) || defined(LT) FLD 0 * SIZE(BB) fsubp %st, %st(1) FLD 1 * SIZE(BB) fsubp %st, %st(2) #else FLD 0 * SIZE(AA) fsubp %st, %st(1) FLD 1 * SIZE(AA) fsubp %st, %st(2) #endif #ifdef LN FLD 3 * SIZE(AA) fmulp %st, %st(2) FLD 2 * SIZE(AA) fmul %st(2), %st fsubrp %st, %st(1) FLD 
0 * SIZE(AA) fmulp %st, %st(1) #endif #ifdef LT FLD 0 * SIZE(AA) fmulp %st, %st(1) FLD 1 * SIZE(AA) fmul %st(1), %st fsubrp %st, %st(2) FLD 3 * SIZE(AA) fmulp %st, %st(2) #endif #ifdef RN FLD 0 * SIZE(BB) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef RT FLD 0 * SIZE(BB) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef LN subl $2 * SIZE, %edi #endif #if defined(LN) || defined(LT) FSTU 0 * SIZE(BB) fxch %st(1) FSTU 1 * SIZE(BB) #else FSTU 0 * SIZE(AA) fxch %st(1) FSTU 1 * SIZE(AA) #endif FST 1 * SIZE(%edi) FST 0 * SIZE(%edi) #ifndef LN addl $2 * SIZE, %edi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 1), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %esi # i -- jne .L46 ALIGN_4 .L36: movl M, %eax # m # MEMORY andl $1, %eax # m & 1 je .L99 #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 1), AA leal (%ebx, %eax, 1), BB #else movl %ebx, BB #endif fldz #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif test %eax, %eax jle .L52 ALIGN_3 .L51: FLD (AA) FMUL (BB) addl $1 * SIZE,AA addl $1 * SIZE,BB faddp %st,%st(1) decl %eax jne .L51 ALIGN_4 .L52: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 1), AA leal (%ebx, %eax, 1), BB #endif #if defined(LN) || defined(LT) FLD 0 * SIZE(BB) fsubp %st, %st(1) #else FLD 0 * SIZE(AA) fsubp %st, %st(1) #endif #if defined(LN) || defined(LT) FMUL 0 * SIZE(AA) #else FMUL 0 * SIZE(BB) #endif #ifdef LN subl $1 * SIZE, %edi #endif #if defined(LN) || defined(LT) FSTU 0 * SIZE(BB) #else FSTU 0 * SIZE(AA) #endif FST 0 * SIZE(%edi) #ifndef LN addl $1 * SIZE, %edi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 1), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L99: #ifdef LN movl K, %eax leal (%ebx, %eax, SIZE), %ebx #endif #if defined(LT) || defined(RN) movl BB, %ebx #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L8: movl N, %eax # j = (n >> 1) # MEMORY sarl $1, %eax movl %eax, J # j = (n >> 1) # MEMORY je .End ALIGN_4 .L34: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, %ebx #endif lea (, %ebp, 2), %eax #ifdef RT subl %eax, C #endif movl C, %edi #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %esi sarl $1, %esi je .L12 ALIGN_4 .MainHead: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 2), AA leal (%ebx, %eax, 2), BB #else movl %ebx, BB #endif fldz fldz fldz fldz FLD 4 * SIZE(BB) # b5 FLD 4 * SIZE(AA) # a5 FLD 0 * SIZE(BB) # b1 FLD 0 * SIZE(AA) # a1 #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(%edi) prefetchw 2 * SIZE(%edi, %ebp, 1) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(%edi) prefetchnta 2 * SIZE(%edi, %ebp, 1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, 
%eax #endif sarl $2, %eax je .L16 ALIGN_4 .MainLoop: #if defined(HAVE_3DNOW) prefetch (PREFETCH_OFFSET) * SIZE(BB) nop #elif defined(HAVE_SSE) prefetchnta (PREFETCH_OFFSET) * SIZE(BB) #if (L2_SIZE == 524288) prefetcht0 (PREFETCH_OFFSET) * SIZE(AA) #endif #endif fmul %st, %st(1) FMUL 1 * SIZE(BB) fxch %st(1) faddp %st, %st(4) FLD 0 * SIZE(BB) fxch %st(1) faddp %st, %st(5) FLD 1 * SIZE(AA) fmul %st, %st(1) FMUL 1 * SIZE(BB) fxch %st(1) faddp %st, %st(6) FLD 2 * SIZE(BB) fxch %st(1) faddp %st, %st(7) FLD 2 * SIZE(AA) fmul %st, %st(1) FMUL 3 * SIZE(BB) fxch %st(1) faddp %st, %st(4) FLD 2 * SIZE(BB) fxch %st(1) faddp %st, %st(5) FLD 3 * SIZE(AA) fmul %st, %st(1) FMUL 3 * SIZE(BB) fxch %st(1) faddp %st, %st(6) FLD 8 * SIZE(BB) fxch %st(1) faddp %st, %st(7) FLD 8 * SIZE(AA) fxch %st(2) #if !defined(HAVE_3DNOW) && defined(HAVE_SSE) && defined(DOUBLE) prefetchnta (PREFETCH_OFFSET + 4) * SIZE(BB) #if (L2_SIZE == 524288) prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(AA) #endif #endif fmul %st, %st(3) FMUL 5 * SIZE(BB) fxch %st(3) faddp %st, %st(4) FLD 4 * SIZE(BB) fxch %st(3) faddp %st, %st(5) FLD 5 * SIZE(AA) fmul %st, %st(3) FMUL 5 * SIZE(BB) fxch %st(3) faddp %st, %st(6) FLD 6 * SIZE(BB) fxch %st(3) faddp %st, %st(7) FLD 6 * SIZE(AA) fmul %st, %st(3) FMUL 7 * SIZE(BB) fxch %st(3) faddp %st, %st(4) FLD 6 * SIZE(BB) fxch %st(3) faddp %st, %st(5) FLD 7 * SIZE(AA) fmul %st, %st(3) FMUL 7 * SIZE(BB) fxch %st(3) faddp %st, %st(6) FLD 12 * SIZE(BB) fxch %st(3) faddp %st, %st(7) FLD 12 * SIZE(AA) fxch %st(2) subl $-8 * SIZE, BB subl $-8 * SIZE, AA decl %eax # l -- jne .MainLoop ALIGN_4 .L16: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $3, %eax je .L21 ALIGN_4 .SubLoop: fmul %st, %st(1) FMUL 1 * SIZE(BB) fxch %st(1) faddp %st, %st(4) FLD 0 * SIZE(BB) fxch %st(1) faddp %st, %st(5) FLD 1 * SIZE(AA) fmul %st, %st(1) FMUL 1 * SIZE(BB) fxch %st(1) faddp %st, %st(6) FLD 2 * SIZE(BB) fxch %st(1) faddp %st, %st(7) FLD 2 * SIZE(AA) addl $2 * SIZE,BB addl $2 * SIZE,AA decl %eax jne .SubLoop ALIGN_4 .L21: ffreep %st(0) ffreep %st(0) ffreep %st(0) ffreep %st(0) #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 2), AA leal (%ebx, %eax, 2), BB #endif #if defined(LN) || defined(LT) FLD 0 * SIZE(BB) fsubp %st, %st(1) FLD 1 * SIZE(BB) fsubp %st, %st(2) FLD 2 * SIZE(BB) fsubp %st, %st(3) FLD 3 * SIZE(BB) fsubp %st, %st(4) #else FLD 0 * SIZE(AA) fsubp %st, %st(1) FLD 1 * SIZE(AA) fsubp %st, %st(3) FLD 2 * SIZE(AA) fsubp %st, %st(2) FLD 3 * SIZE(AA) fsubp %st, %st(4) #endif #ifdef LN FLD 3 * SIZE(AA) fmul %st, %st(3) fmulp %st, %st(4) FLD 2 * SIZE(AA) fmul %st(3), %st FLD 2 * SIZE(AA) fmul %st(5), %st fsubrp %st, %st(3) fsubrp %st, %st(1) FLD 0 * SIZE(AA) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef LT FLD 0 * SIZE(AA) fmul %st, %st(1) fmulp %st, %st(2) FLD 1 * SIZE(AA) fmul %st(1), %st FLD 1 * SIZE(AA) fmul %st(3), %st fsubrp %st, %st(5) fsubrp %st, %st(3) FLD 3 * SIZE(AA) fmul %st, %st(3) fmulp %st, %st(4) #endif #ifdef RN FLD 0 * SIZE(BB) fmul %st, %st(1) fmulp %st, %st(3) FLD 1 * SIZE(BB) fmul %st(1), %st FLD 1 * SIZE(BB) fmul %st(4), %st fsubrp %st, %st(5) fsubrp %st, %st(2) FLD 3 * SIZE(BB) fmul %st, %st(2) fmulp %st, %st(4) #endif #ifdef RT FLD 3 * SIZE(BB) fmul %st, %st(2) fmulp %st, %st(4) FLD 2 * SIZE(BB) fmul %st(2), %st FLD 2 * SIZE(BB) fmul %st(5), %st fsubrp %st, %st(4) fsubrp %st, %st(1) FLD 0 * SIZE(BB) fmul %st, %st(1) fmulp %st, %st(3) #endif #ifdef LN subl $2 * 
SIZE, %edi #endif #if defined(LN) || defined(LT) FSTU 0 * SIZE(BB) fxch %st(1) FSTU 1 * SIZE(BB) fxch %st(2) FSTU 2 * SIZE(BB) fxch %st(3) FSTU 3 * SIZE(BB) FST 1 * SIZE(%edi,%ebp) FST 0 * SIZE(%edi) FST 0 * SIZE(%edi,%ebp) FST 1 * SIZE(%edi) #else FSTU 0 * SIZE(AA) fxch %st(2) FSTU 1 * SIZE(AA) fxch %st(1) FSTU 2 * SIZE(AA) fxch %st(3) FSTU 3 * SIZE(AA) FST 1 * SIZE(%edi,%ebp) FST 1 * SIZE(%edi) FST 0 * SIZE(%edi) FST 0 * SIZE(%edi,%ebp) #endif #ifndef LN addl $2 * SIZE, %edi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %esi # i -- jne .MainHead ALIGN_4 .L12: movl M, %eax # m # MEMORY andl $1, %eax je .L27 #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 1), AA leal (%ebx, %eax, 2), BB #else movl %ebx, BB #endif fldz fldz FLD 0 * SIZE(AA) # temp1 = *(aoffset + 0) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $1,%eax # k >> 1 # MEMORY je .L54 ALIGN_4 .L55: FLD 0 * SIZE(BB) # temp2 = *(boffset + 0) rep fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(BB) # temp2 = *(boffset + 0) faddp %st, %st(2) FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0) FLD 2 * SIZE(BB) # temp2 = *(boffset + 0) rep fmul %st(1), %st faddp %st, %st(2) FMUL 3 * SIZE(BB) # temp2 = *(boffset + 0) faddp %st, %st(2) FLD 2 * SIZE(AA) # temp1 = *(aoffset + 0) addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jne .L55 ALIGN_4 .L54: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $1,%eax # k & 1 je .L33 ALIGN_4 FLD 0 * SIZE(BB) # temp2 = *(boffset + 0) rep fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(BB) # temp2 = *(boffset + 0) faddp %st, %st(2) FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0) addl $1 * SIZE, AA addl $2 * SIZE, BB ALIGN_4 .L33: ffreep %st(0) #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif leal (, %eax, SIZE), %eax movl AORIG, AA leal (AA, %eax, 1), AA leal (%ebx, %eax, 2), BB #endif #if defined(LN) || defined(LT) FLD 0 * SIZE(BB) fsubp %st, %st(1) FLD 1 * SIZE(BB) fsubp %st, %st(2) #else FLD 0 * SIZE(AA) fsubp %st, %st(1) FLD 1 * SIZE(AA) fsubp %st, %st(2) #endif #if defined(LN) || defined(LT) FLD 0 * SIZE(AA) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef RN FLD 0 * SIZE(BB) fmulp %st, %st(1) FLD 1 * SIZE(BB) fmul %st(1), %st fsubrp %st, %st(2) FLD 3 * SIZE(BB) fmulp %st, %st(2) #endif #ifdef RT FLD 3 * SIZE(BB) fmulp %st, %st(2) FLD 2 * SIZE(BB) fmul %st(2), %st fsubrp %st, %st(1) FLD 0 * SIZE(BB) fmulp %st, %st(1) #endif #ifdef LN subl $1 * SIZE, %edi #endif #if defined(LN) || defined(LT) FSTU 0 * SIZE(BB) fxch %st(1) FSTU 1 * SIZE(BB) #else FSTU 0 * SIZE(AA) fxch %st(1) FSTU 1 * SIZE(AA) #endif FST 0 * SIZE(%edi,%ebp) FST 0 * SIZE(%edi) #ifndef LN addl $1 * SIZE, %edi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L27: #ifdef LN movl K, %eax leal ( , %eax, SIZE), %eax leal (%ebx, %eax, 2), %ebx #endif #if defined(LT) || defined(RN) movl BB, %ebx #endif #ifdef RN addl $2, KK #endif #ifdef RT subl 
$2, KK #endif decl J # j-- # MEMORY jne .L34 ALIGN_4 .End: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_RT_2x2_atom.S000066400000000000000000000461641313527062700216550ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define ARG_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define ARG_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #define PREFETCH prefetcht0 #define PREFETCHSIZE 84 #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define CO1 %esi PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC movl OFFSET, %eax #ifdef RN negl %eax #endif movl %eax, KK leal (, LDC, SIZE), LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif testl $1, N je .L30 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx jle .L40 ALIGN_4 .L31: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movsd 0 * SIZE(BB), %xmm1 xorps %xmm0, %xmm0 prefetcht0 3 * SIZE(CO1) xorps %xmm2, %xmm2 xorps %xmm4, %xmm4 xorps %xmm6, %xmm6 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addsd %xmm0, %xmm4 movsd 0 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 1 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 3 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 2 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 4 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 5 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 3 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 6 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 7 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 4 * SIZE(BB), %xmm1 addl $8 * SIZE, AA addl $4 * SIZE, BB decl %eax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: addsd %xmm0, %xmm4 movsd 0 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 1 * SIZE(BB), %xmm1 addl $2 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: addsd %xmm0, %xmm4 addsd %xmm2, %xmm6 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BB), %xmm0 movsd 1 * SIZE(BB), %xmm2 subsd %xmm4, %xmm0 
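/* Right-hand-side set-up for the LN/LT branch: the two packed values loaded from BB have the accumulated products (xmm4, xmm6) subtracted before the scalar triangular solve that follows. */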
subsd %xmm6, %xmm2 #else movsd 0 * SIZE(AA), %xmm0 movsd 1 * SIZE(AA), %xmm2 subsd %xmm4, %xmm0 subsd %xmm6, %xmm2 #endif #ifdef LN movsd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd 2 * SIZE(AA), %xmm5 mulsd %xmm2, %xmm5 movsd 0 * SIZE(AA), %xmm7 subsd %xmm5, %xmm0 mulsd %xmm7, %xmm0 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(AA), %xmm5 mulsd %xmm0, %xmm5 movsd 3 * SIZE(AA), %xmm7 subsd %xmm5, %xmm2 mulsd %xmm7, %xmm2 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 mulsd %xmm4, %xmm2 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BB) movsd %xmm2, 1 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) movsd %xmm2, 1 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm2, 1 * SIZE(CO1) #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA addl %eax, BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L31 ALIGN_4 .L40: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L49 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 movsd 0 * SIZE(BB), %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L45 ALIGN_4 .L42: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 1 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addsd %xmm2, %xmm5 movsd 2 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 movsd 3 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 3 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addsd %xmm2, %xmm5 movsd 4 * SIZE(BB), %xmm2 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 1 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: addsd %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax addl %eax, AA leal (B, %eax, 1), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BB), %xmm0 subsd %xmm4, %xmm0 #else movsd 0 * SIZE(AA), %xmm0 subsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) mulsd 0 * SIZE(AA), %xmm0 #endif #if defined(RN) || defined(RT) mulsd 0 * SIZE(BB), %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax addl %eax, AA addl %eax, BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L49: #ifdef LN movl K, %eax leal (B, %eax, SIZE), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L30: movl N, %eax sarl $1, 
%eax movl %eax, J jle .L999 ALIGN_2 .L10: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx jle .L20 ALIGN_4 .L11: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 prefetcht0 3 * SIZE(CO1) xorps %xmm5, %xmm5 prefetcht0 3 * SIZE(CO1, LDC) xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addsd %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 0 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 1 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 0 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 1 * SIZE(BB), %xmm3 addsd %xmm2, %xmm6 movsd 3 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 3 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 4 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 2 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 3 * SIZE(BB), %xmm3 addsd %xmm2, %xmm6 movsd 5 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 4 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 5 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 6 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 4 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 5 * SIZE(BB), %xmm3 addsd %xmm2, %xmm6 movsd 7 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 7 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 8 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 6 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 7 * SIZE(BB), %xmm3 addl $8 * SIZE, BB addl $8 * SIZE, AA decl %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addsd %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 0 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 1 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 0 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 1 * SIZE(BB), %xmm3 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: addsd %xmm2, %xmm6 addsd %xmm3, %xmm7 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BB), %xmm0 movsd 1 * SIZE(BB), %xmm1 movsd 2 * SIZE(BB), %xmm2 movsd 3 * SIZE(BB), %xmm3 subsd %xmm4, %xmm0 subsd %xmm5, %xmm1 subsd %xmm6, %xmm2 subsd %xmm7, %xmm3 #else movsd 0 * SIZE(AA), %xmm0 movsd 1 * SIZE(AA), %xmm2 movsd 2 * SIZE(AA), %xmm1 movsd 3 * SIZE(AA), %xmm3 subsd %xmm4, %xmm0 subsd %xmm6, %xmm2 subsd %xmm5, %xmm1 subsd %xmm7, %xmm3 #endif #ifdef LN movsd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd 2 * SIZE(AA), %xmm5 mulsd %xmm4, %xmm3 movsd 0 * SIZE(AA), %xmm7 movaps %xmm5, %xmm6 mulsd %xmm2, %xmm5 mulsd %xmm3, 
%xmm6 subsd %xmm5, %xmm0 subsd %xmm6, %xmm1 mulsd %xmm7, %xmm0 mulsd %xmm7, %xmm1 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(AA), %xmm5 mulsd %xmm4, %xmm1 movsd 3 * SIZE(AA), %xmm7 movaps %xmm5, %xmm6 mulsd %xmm0, %xmm5 mulsd %xmm1, %xmm6 subsd %xmm5, %xmm2 subsd %xmm6, %xmm3 mulsd %xmm7, %xmm2 mulsd %xmm7, %xmm3 #endif #ifdef RN movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(BB), %xmm5 mulsd %xmm4, %xmm2 movsd 3 * SIZE(BB), %xmm7 movaps %xmm5, %xmm6 mulsd %xmm0, %xmm5 mulsd %xmm2, %xmm6 subsd %xmm5, %xmm1 subsd %xmm6, %xmm3 mulsd %xmm7, %xmm1 mulsd %xmm7, %xmm3 #endif #ifdef RT movsd 3 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd 2 * SIZE(BB), %xmm5 mulsd %xmm4, %xmm3 movsd 0 * SIZE(BB), %xmm7 movaps %xmm5, %xmm6 mulsd %xmm1, %xmm5 mulsd %xmm3, %xmm6 subsd %xmm5, %xmm0 subsd %xmm6, %xmm2 mulsd %xmm7, %xmm0 mulsd %xmm7, %xmm2 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BB) movsd %xmm1, 1 * SIZE(BB) movsd %xmm2, 2 * SIZE(BB) movsd %xmm3, 3 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) movsd %xmm2, 1 * SIZE(AA) movsd %xmm1, 2 * SIZE(AA) movsd %xmm3, 3 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm2, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC) movsd %xmm3, 1 * SIZE(CO1, LDC) #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L11 ALIGN_4 .L20: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L29 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L25 ALIGN_4 .L22: addsd %xmm2, %xmm4 movsd 0 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 1 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulsd %xmm0, %xmm3 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 2 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 3 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 movsd 2 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 5 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 movsd 3 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 6 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 7 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: addsd %xmm2, %xmm4 movsd 0 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 1 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 movsd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: addsd %xmm2, %xmm4 addsd %xmm3, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BB), 
%xmm0 movsd 1 * SIZE(BB), %xmm1 subsd %xmm4, %xmm0 subsd %xmm5, %xmm1 #else movsd 0 * SIZE(AA), %xmm0 movsd 1 * SIZE(AA), %xmm1 subsd %xmm4, %xmm0 subsd %xmm5, %xmm1 #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(AA), %xmm7 mulsd %xmm7, %xmm0 mulsd %xmm7, %xmm1 #endif #ifdef RN movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(BB), %xmm5 movaps %xmm5, %xmm6 movsd 3 * SIZE(BB), %xmm7 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm1 mulsd %xmm7, %xmm1 #endif #ifdef RT movsd 3 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd 2 * SIZE(BB), %xmm5 movaps %xmm5, %xmm6 movsd 0 * SIZE(BB), %xmm7 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm0 mulsd %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BB) movsd %xmm1, 1 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) movsd %xmm1, 1 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC) #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L29: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L10 ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_RT_2x4_penryn.S000066400000000000000000001053151313527062700222240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define ARG_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define ARG_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 2) #endif #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define CO1 %esi PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC movl OFFSET, %eax #ifdef RN negl %eax #endif movl %eax, KK leal (, LDC, SIZE), LDC subl $-16 * SIZE, A subl $-16 * SIZE, B #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif testl $1, N je .L30 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L80 ALIGN_4 .L71: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movsd -16 * SIZE(AA), %xmm0 movhps -15 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd -16 * SIZE(BB), %xmm1 movhps -15 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 #ifdef LN prefetcht0 -2 * SIZE(CO1) #else prefetcht0 1 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -14 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -12 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -10 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -8 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 subl $-16 * SIZE, AA subl $ -8 * SIZE, BB subl $1, %eax jne 
.L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: pshufd $0x44, %xmm1, %xmm2 movsd -15 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 addl $2 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: addpd %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BB), %xmm1 subpd %xmm4, %xmm1 movapd %xmm1, %xmm0 unpckhpd %xmm1, %xmm1 #else movapd -16 * SIZE(AA), %xmm0 subpd %xmm4, %xmm0 #endif #ifdef LN movsd -13 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 movsd -14 * SIZE(AA), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef LT movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd -15 * SIZE(AA), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movsd -13 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 #endif #ifdef RN movddup -16 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef RT movddup -16 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, -16 * SIZE(BB) movsd %xmm1, -15 * SIZE(BB) #else movapd %xmm0, -16 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 1 * SIZE(CO1) #else movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA addl %eax, BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L71 ALIGN_4 .L80: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L89 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movsd -16 * SIZE(AA), %xmm0 movhps -15 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd -16 * SIZE(BB), %xmm2 movhps -15 * SIZE(BB), %xmm2 pxor %xmm5, %xmm5 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L85 ALIGN_4 .L82: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 movaps -14 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movaps -12 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 movaps -10 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movaps -8 * SIZE(BB), %xmm2 subl $-8 * SIZE, AA subl $-8 * SIZE, BB decl %eax jne .L82 ALIGN_4 .L85: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH je .L88 .L86: mulsd %xmm0, %xmm2 movsd -15 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd -15 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L86 ALIGN_4 .L88: addpd %xmm5, %xmm4 haddpd %xmm4, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax addl %eax, AA leal (B, %eax, 1), BB #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(BB), %xmm0 subsd %xmm4, %xmm0 #else 
movsd -16 * SIZE(AA), %xmm0 subsd %xmm4, %xmm0 #endif #ifdef LN movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef LT movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef RN movsd -16 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef RT movsd -16 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, -16 * SIZE(BB) #else movsd %xmm0, -16 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) #else movsd %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax addl %eax, AA addl %eax, BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L89: #ifdef LN movl K, %eax leal (B, %eax, SIZE), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L30: testl $2, N je .L60 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L50 ALIGN_4 .L41: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 #ifdef LN prefetcht0 -2 * SIZE(CO1) pxor %xmm6, %xmm6 prefetcht0 -2 * SIZE(CO1, LDC) pxor %xmm7, %xmm7 #else prefetcht0 1 * SIZE(CO1) pxor %xmm6, %xmm6 prefetcht0 1 * SIZE(CO1, LDC) pxor %xmm7, %xmm7 #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -14 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps -12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -10 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps -8 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -6 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps -4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -2 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps 0 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 subl $-16 * SIZE, AA 
subl $-16 * SIZE, BB subl $1, %eax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -14 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 movaps %xmm4, %xmm0 movsd %xmm5, %xmm4 movsd %xmm0, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd -16 * SIZE(BB), %xmm2 movapd -14 * SIZE(BB), %xmm3 subpd %xmm4, %xmm2 subpd %xmm0, %xmm3 #else movapd -16 * SIZE(AA), %xmm0 movapd -14 * SIZE(AA), %xmm1 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 #endif #ifdef LN movddup -13 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 movddup -14 * SIZE(AA), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 #endif #ifdef LT movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 movddup -15 * SIZE(AA), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 movddup -13 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 #endif #ifdef RN movddup -16 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 movddup -15 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movddup -13 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 #endif #ifdef RT movddup -13 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 movddup -14 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movddup -16 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, -16 * SIZE(BB) movapd %xmm3, -14 * SIZE(BB) #else movapd %xmm0, -16 * SIZE(AA) movapd %xmm1, -14 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movsd %xmm3, 1 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO1, LDC, 1) movhps %xmm3, 1 * SIZE(CO1, LDC, 1) #else movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 1 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L41 ALIGN_4 .L50: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L59 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm2 pxor %xmm5, %xmm5 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L55 ALIGN_4 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -14 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -14 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps -12 * SIZE(BB), %xmm2 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -10 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -12 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd 
%xmm2, %xmm5 movaps -8 * SIZE(BB), %xmm2 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -6 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -10 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps -4 * SIZE(BB), %xmm2 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -2 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -8 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps 0 * SIZE(BB), %xmm2 subl $ -8 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH je .L58 .L56: pshufd $0x44, %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -14 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: addpd %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax addl %eax, AA leal (B, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BB), %xmm0 subpd %xmm4, %xmm0 #else movapd -16 * SIZE(AA), %xmm1 subpd %xmm4, %xmm1 movapd %xmm1, %xmm0 unpckhpd %xmm1, %xmm1 #endif #ifdef LN movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef LT movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef RN movsd -16 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 movsd -15 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movsd -13 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 #endif #ifdef RT movsd -13 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd -14 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movsd -16 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(BB) #else movsd %xmm0, -16 * SIZE(AA) movsd %xmm1, -15 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO1, LDC, 1) #else movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L59: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif ALIGN_4 .L60: movl N, %eax sarl $2, %eax movl %eax, J jle .L999 ALIGN_4 .L10: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 4), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif leal (CO1, LDC, 2), %eax movaps -16 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -16 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 #ifdef LN pxor %xmm4, %xmm4 prefetcht0 -2 * SIZE(CO1) pxor %xmm5, %xmm5 
prefetcht0 -2 * SIZE(CO1, LDC) pxor %xmm6, %xmm6 prefetcht0 -2 * SIZE(%eax) pxor %xmm7, %xmm7 prefetcht0 -2 * SIZE(%eax, LDC) #else pxor %xmm4, %xmm4 prefetcht0 1 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 1 * SIZE(CO1, LDC) pxor %xmm6, %xmm6 prefetcht0 1 * SIZE(%eax) pxor %xmm7, %xmm7 prefetcht0 1 * SIZE(%eax, LDC) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addpd %xmm3, %xmm7 movaps -14 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps -12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps -10 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps -8 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps -6 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps -4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps -2 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 0 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) addpd %xmm3, %xmm7 movaps 2 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps 6 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 8 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps 10 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps 14 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 16 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 subl $-32 * SIZE, BB mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 subl $-16 * SIZE, AA subl $1, %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addpd %xmm3, %xmm7 movaps -14 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps -12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $4, %eax #endif movl AORIG, AA leal (, %eax, 
SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 4), BB #endif addpd %xmm2, %xmm6 addpd %xmm3, %xmm7 movaps %xmm4, %xmm0 movsd %xmm5, %xmm4 movsd %xmm0, %xmm5 movaps %xmm6, %xmm0 movsd %xmm7, %xmm6 movsd %xmm0, %xmm7 #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd %xmm6, %xmm1 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm1 movapd -16 * SIZE(BB), %xmm2 movapd -14 * SIZE(BB), %xmm5 movapd -12 * SIZE(BB), %xmm3 movapd -10 * SIZE(BB), %xmm7 subpd %xmm4, %xmm2 subpd %xmm6, %xmm5 subpd %xmm0, %xmm3 subpd %xmm1, %xmm7 #else movapd -16 * SIZE(AA), %xmm0 movapd -14 * SIZE(AA), %xmm1 movapd -12 * SIZE(AA), %xmm2 movapd -10 * SIZE(AA), %xmm3 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 subpd %xmm6, %xmm2 subpd %xmm7, %xmm3 #endif #ifdef LN movddup -13 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 mulpd %xmm4, %xmm7 movddup -14 * SIZE(AA), %xmm4 movapd %xmm4, %xmm6 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 mulpd %xmm7, %xmm6 subpd %xmm6, %xmm5 movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 #endif #ifdef LT movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 movddup -15 * SIZE(AA), %xmm4 movapd %xmm4, %xmm6 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 mulpd %xmm5, %xmm6 subpd %xmm6, %xmm7 movddup -13 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 mulpd %xmm4, %xmm7 #endif #ifdef RN movddup -16 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 movddup -15 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movddup -14 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm2 movddup -13 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm3 movddup -11 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 movddup -10 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm2 movddup -9 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm3 movddup -6 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm2 movddup -5 * SIZE(BB), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 movddup -1 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm3 #endif #ifdef RT movddup -1 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm3 movddup -2 * SIZE(BB), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 movddup -3 * SIZE(BB), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm1 movddup -4 * SIZE(BB), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm0 movddup -6 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm2 movddup -7 * SIZE(BB), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm1 movddup -8 * SIZE(BB), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm0 movddup -11 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 movddup -12 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movddup -16 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, -16 * SIZE(BB) movapd %xmm5, -14 * SIZE(BB) movapd %xmm3, -12 * SIZE(BB) movapd %xmm7, -10 * SIZE(BB) #else movapd %xmm0, -16 * SIZE(AA) movapd %xmm1, -14 * SIZE(AA) movapd %xmm2, -12 * SIZE(AA) movapd %xmm3, -10 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movsd %xmm3, 1 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO1, LDC, 1) movhps %xmm3, 1 * SIZE(CO1, LDC, 1) movsd %xmm5, 0 * SIZE(CO1, LDC, 2) movsd %xmm7, 1 * SIZE(CO1, LDC, 2) movhps %xmm5, 0 * SIZE(CO1, %eax, 1) movhps %xmm7, 1 * SIZE(CO1, %eax, 1) #else movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 1 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movhps %xmm2, 1 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO1, %eax, 1) movhps %xmm3, 1 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) 
movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L11 ALIGN_4 .L20: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L29 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm2 pxor %xmm5, %xmm5 movaps -14 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps -12 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps -10 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -14 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps -8 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps -6 * SIZE(BB), %xmm3 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps -4 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps -2 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -12 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps 0 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps 2 * SIZE(BB), %xmm3 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps 6 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -10 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps 8 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps 10 * SIZE(BB), %xmm3 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps 12 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps 14 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -8 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps 18 * SIZE(BB), %xmm3 subl $ -8 * SIZE, AA subl $-32 * SIZE, BB subl $1, %eax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH je .L28 .L26: pshufd $0x44, %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps -12 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps -10 * SIZE(BB), %xmm3 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $4, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BB), %xmm0 movapd -14 * SIZE(BB), %xmm1 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 #else movapd -16 * SIZE(AA), %xmm1 movapd -14 * SIZE(AA), %xmm3 subpd %xmm4, %xmm1 subpd %xmm5, %xmm3 movapd %xmm1, %xmm0 unpckhpd %xmm1, %xmm1 movapd %xmm3, %xmm2 unpckhpd %xmm3, %xmm3 #endif #ifdef LN movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #ifdef LT movddup -16 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #ifdef RN movsd -16 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 movsd -15 * SIZE(BB), 
%xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movsd -14 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm2 movsd -13 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm3 movsd -11 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd -10 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm2 movsd -9 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm3 movsd -6 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm2 movsd -5 * SIZE(BB), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm3 movsd -1 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm3 #endif #ifdef RT movsd -1 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm3 movsd -2 * SIZE(BB), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm2 movsd -3 * SIZE(BB), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm1 movsd -4 * SIZE(BB), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm0 movsd -6 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm2 movsd -7 * SIZE(BB), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm1 movsd -8 * SIZE(BB), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm0 movsd -11 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd -12 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movsd -16 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) #else movsd %xmm0, -16 * SIZE(AA) movsd %xmm1, -15 * SIZE(AA) movsd %xmm2, -14 * SIZE(AA) movsd %xmm3, -13 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO1, LDC, 1) movsd %xmm1, 0 * SIZE(CO1, LDC, 2) movhps %xmm1, 0 * SIZE(CO1, %eax, 1) #else movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L29: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 4), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $4, KK #endif #ifdef RT subl $4, KK #endif decl J # j -- jg .L10 ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_RT_2x4_sse2.S000066400000000000000000001353531313527062700215720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define OLD_M 4 + STACK + ARGS(%esi) #define OLD_N 8 + STACK + ARGS(%esi) #define OLD_K 12 + STACK + ARGS(%esi) #define OLD_ALPHA 16 + STACK + ARGS(%esi) #define OLD_A 24 + STACK + ARGS(%esi) #define OLD_B 28 + STACK + ARGS(%esi) #define OLD_C 32 + STACK + ARGS(%esi) #define OLD_LDC 36 + STACK + ARGS(%esi) #define OLD_OFFT 40 + STACK + ARGS(%esi) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define AORIG 56(%esp) #define BORIG 60(%esp) #define BUFFER 128(%esp) #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif #define B %edi #define AA %edx #define BB %ecx #define LDC %ebp #define CO1 %esi #define KERNEL1(address) \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm4; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm3, %xmm6; \ movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm0, %xmm7; \ movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ 
mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm3, %xmm6; \ movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm0, %xmm7; \ movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm2, %xmm6; \ movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm1, %xmm7; \ movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm2, %xmm6; \ movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm1, %xmm7; \ movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp addl $STACK_OFFSET, %esp STACK_TOUCHING movl OLD_M, %ebx movl OLD_N, %eax movl OLD_K, %ecx movl OLD_A, %edx movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movd OLD_OFFT, %mm4 movl OLD_B, B movl OLD_C, %ebx movl %ebx, C movl OLD_LDC, LDC movd %mm4, OFFSET movd %mm4, KK leal (, LDC, SIZE), LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif testl $1, N je .L30 #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG leal (, %eax, SIZE), %eax leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax jle .L65 ALIGN_4 .L62: #define COPYPREFETCH 40 prefetchnta (COPYPREFETCH) * SIZE(B) movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq 2 * SIZE(B), %mm2 movq 3 * SIZE(B), %mm3 movq 4 * SIZE(B), %mm4 movq 5 * SIZE(B), %mm5 movq 6 * SIZE(B), %mm6 
movq 7 * SIZE(B), %mm7 movq %mm0, 0 * SIZE(BB) movq %mm0, 1 * SIZE(BB) movq %mm1, 2 * SIZE(BB) movq %mm1, 3 * SIZE(BB) movq %mm2, 4 * SIZE(BB) movq %mm2, 5 * SIZE(BB) movq %mm3, 6 * SIZE(BB) movq %mm3, 7 * SIZE(BB) movq %mm4, 8 * SIZE(BB) movq %mm4, 9 * SIZE(BB) movq %mm5, 10 * SIZE(BB) movq %mm5, 11 * SIZE(BB) movq %mm6, 12 * SIZE(BB) movq %mm6, 13 * SIZE(BB) movq %mm7, 14 * SIZE(BB) movq %mm7, 15 * SIZE(BB) addl $ 8 * SIZE, B addl $16 * SIZE, BB decl %eax jne .L62 ALIGN_2 .L65: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH jle .L70 ALIGN_2 .L66: movq 0 * SIZE(B), %mm0 movq %mm0, 0 * SIZE(BB) movq %mm0, 1 * SIZE(BB) addl $1 * SIZE, B addl $2 * SIZE, BB decl %eax jne .L66 ALIGN_4 .L70: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L80 ALIGN_4 .L71: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movapd 0 * SIZE(AA), %xmm0 movapd 8 * SIZE(AA), %xmm1 movapd 0 * SIZE(BB), %xmm2 movapd 8 * SIZE(BB), %xmm3 #ifdef LN prefetchw -2 * SIZE(CO1) #else prefetchw 1 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L75 ALIGN_4 .L72: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) movapd 16 * SIZE(BB), %xmm2 movapd 2 * SIZE(AA), %xmm0 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm0, %xmm4 movapd 4 * SIZE(AA), %xmm0 mulpd 4 * SIZE(BB), %xmm0 addpd %xmm0, %xmm4 movapd 6 * SIZE(AA), %xmm0 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm0, %xmm4 movapd 16 * SIZE(AA), %xmm0 prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movapd 24 * SIZE(BB), %xmm3 movapd 10 * SIZE(AA), %xmm1 mulpd 10 * SIZE(BB), %xmm1 addpd %xmm1, %xmm4 movapd 12 * SIZE(AA), %xmm1 mulpd 12 * SIZE(BB), %xmm1 addpd %xmm1, %xmm4 movapd 14 * SIZE(AA), %xmm1 mulpd 14 * SIZE(BB), %xmm1 addpd %xmm1, %xmm4 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd 2 * SIZE(AA), %xmm0 movapd 2 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm2 subpd %xmm4, %xmm2 #else movapd 0 * SIZE(AA), %xmm0 subpd %xmm4, %xmm0 #endif #ifdef LN movapd %xmm2, %xmm3 unpckhpd %xmm3, %xmm3 movlpd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm3 movlpd 2 * SIZE(AA), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm2 movlpd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm3, %xmm2 #endif #ifdef LT movapd %xmm2, %xmm3 unpckhpd %xmm3, %xmm3 movlpd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movlpd 1 * SIZE(AA), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm3 movlpd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm3 
unpcklpd %xmm3, %xmm2 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef RT movlpd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movlpd %xmm2, 0 * SIZE(BB) movlpd %xmm2, 1 * SIZE(BB) movhpd %xmm2, 2 * SIZE(BB) movhpd %xmm2, 3 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) #else movlpd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L71 ALIGN_4 .L80: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L99 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movlpd 0 * SIZE(AA), %xmm0 movlpd 4 * SIZE(AA), %xmm1 movlpd 0 * SIZE(BB), %xmm2 movlpd 8 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L85 ALIGN_4 .L82: mulsd %xmm0, %xmm2 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) movlpd 1 * SIZE(AA), %xmm0 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movlpd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movlpd 2 * SIZE(AA), %xmm0 mulsd 4 * SIZE(BB), %xmm0 addsd %xmm0, %xmm6 movlpd 3 * SIZE(AA), %xmm0 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 mulsd %xmm1, %xmm3 movlpd 5 * SIZE(AA), %xmm1 mulsd 10 * SIZE(BB), %xmm1 addsd %xmm3, %xmm4 movlpd 24 * SIZE(BB), %xmm3 addsd %xmm1, %xmm5 movlpd 6 * SIZE(AA), %xmm1 mulsd 12 * SIZE(BB), %xmm1 addsd %xmm1, %xmm6 movlpd 7 * SIZE(AA), %xmm1 mulsd 14 * SIZE(BB), %xmm1 addsd %xmm1, %xmm7 movlpd 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L82 ALIGN_4 .L85: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L88 .L86: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 movlpd 2 * SIZE(BB), %xmm2 movlpd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L86 ALIGN_4 .L88: addsd %xmm5, %xmm4 addsd %xmm7, %xmm6 addsd %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax addl %eax, AA addl %eax, B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movlpd 0 * SIZE(B), %xmm2 subsd %xmm4, %xmm2 #else movlpd 0 * SIZE(AA), %xmm0 subsd %xmm4, %xmm0 #endif #ifdef LN movlpd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 #endif #ifdef LT movlpd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef RT movlpd 0 * SIZE(B), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(B) movlpd %xmm2, 0 * SIZE(BB) movlpd %xmm2, 1 * SIZE(BB) #else movlpd %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlpd %xmm2, 0 * 
SIZE(CO1) #else movlpd %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA,%eax, SIZE), AA #ifdef LT addl $1 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L99: #ifdef LN movl K, %eax leal (B, %eax, SIZE), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (B,%eax, SIZE), B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L30: testl $2, N je .L60 #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG leal (, %eax, SIZE), %eax leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L35 ALIGN_4 .L32: #define COPYPREFETCH 40 prefetchnta (COPYPREFETCH) * SIZE(B) movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq 2 * SIZE(B), %mm2 movq 3 * SIZE(B), %mm3 movq 4 * SIZE(B), %mm4 movq 5 * SIZE(B), %mm5 movq 6 * SIZE(B), %mm6 movq 7 * SIZE(B), %mm7 movq %mm0, 0 * SIZE(BB) movq %mm0, 1 * SIZE(BB) movq %mm1, 2 * SIZE(BB) movq %mm1, 3 * SIZE(BB) movq %mm2, 4 * SIZE(BB) movq %mm2, 5 * SIZE(BB) movq %mm3, 6 * SIZE(BB) movq %mm3, 7 * SIZE(BB) movq %mm4, 8 * SIZE(BB) movq %mm4, 9 * SIZE(BB) movq %mm5, 10 * SIZE(BB) movq %mm5, 11 * SIZE(BB) movq %mm6, 12 * SIZE(BB) movq %mm6, 13 * SIZE(BB) movq %mm7, 14 * SIZE(BB) movq %mm7, 15 * SIZE(BB) addl $ 8 * SIZE, B addl $16 * SIZE, BB decl %eax jne .L32 ALIGN_2 .L35: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L40 ALIGN_2 .L36: movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq %mm0, 0 * SIZE(BB) movq %mm0, 1 * SIZE(BB) movq %mm1, 2 * SIZE(BB) movq %mm1, 3 * SIZE(BB) addl $2 * SIZE, B addl $4 * SIZE, BB decl %eax jne .L36 ALIGN_4 .L40: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L50 ALIGN_4 .L41: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movapd 0 * SIZE(AA), %xmm0 movapd 8 * SIZE(AA), %xmm1 movapd 0 * SIZE(BB), %xmm2 movapd 8 * SIZE(BB), %xmm3 #ifdef LN prefetchw -2 * SIZE(CO1) prefetchw -2 * SIZE(CO1, LDC) #else prefetchw 1 * SIZE(CO1) prefetchw 1 * SIZE(CO1, LDC) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L45 ALIGN_4 .L42: mulpd %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 10 * 
SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 mulpd 18 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 20 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movapd 10 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 22 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 32 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movapd 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 26 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 28 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 14 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 30 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 40 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd 0 * SIZE(B), %xmm2 movapd 2 * SIZE(B), %xmm3 subpd %xmm4, %xmm2 subpd %xmm0, %xmm3 #else movapd 0 * SIZE(AA), %xmm0 movapd 2 * SIZE(AA), %xmm1 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 #endif #ifdef LN movlpd 3 * SIZE(AA), %xmm4 movhpd 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 movlpd 2 * SIZE(AA), %xmm4 movhpd 2 * SIZE(AA), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 #endif #ifdef LT movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 movlpd 1 * SIZE(AA), %xmm4 movhpd 1 * SIZE(AA), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 movlpd 3 * SIZE(AA), %xmm4 movhpd 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 movlpd 1 * SIZE(B), %xmm4 movhpd 1 * SIZE(B), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movlpd 3 * SIZE(B), %xmm4 movhpd 3 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 #endif #ifdef RT movlpd 3 * SIZE(B), %xmm4 movhpd 3 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 movlpd 2 * SIZE(B), %xmm4 movhpd 2 * SIZE(B), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movlpd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movlpd %xmm2, 0 * SIZE(BB) movlpd %xmm2, 1 * SIZE(BB) movhpd %xmm2, 2 * SIZE(BB) movhpd %xmm2, 3 * SIZE(BB) movlpd %xmm3, 4 * SIZE(BB) movlpd %xmm3, 5 * SIZE(BB) movhpd %xmm3, 6 * SIZE(BB) movhpd %xmm3, 7 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) movapd %xmm1, 2 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(CO1) movlpd %xmm3, 1 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) #else movlpd %xmm0, 0 * SIZE(CO1) movhpd 
%xmm0, 1 * SIZE(CO1) movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L41 ALIGN_4 .L50: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L59 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movlpd 0 * SIZE(AA), %xmm0 movlpd 4 * SIZE(AA), %xmm1 movlpd 0 * SIZE(BB), %xmm2 movlpd 8 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L55 ALIGN_4 .L52: mulsd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movlpd 4 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movlpd 1 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm2 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm2, %xmm6 movlpd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm7 movlpd 2 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd 10 * SIZE(BB), %xmm0 addsd %xmm3, %xmm4 movlpd 12 * SIZE(BB), %xmm3 addsd %xmm0, %xmm5 movlpd 3 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd 14 * SIZE(BB), %xmm0 addsd %xmm3, %xmm6 movlpd 24 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 mulsd %xmm1, %xmm2 mulsd 18 * SIZE(BB), %xmm1 addsd %xmm2, %xmm4 movlpd 20 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 movlpd 5 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm2 mulsd 22 * SIZE(BB), %xmm1 addsd %xmm2, %xmm6 movlpd 32 * SIZE(BB), %xmm2 addsd %xmm1, %xmm7 movlpd 6 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 mulsd 26 * SIZE(BB), %xmm1 addsd %xmm3, %xmm4 movlpd 28 * SIZE(BB), %xmm3 addsd %xmm1, %xmm5 movlpd 7 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 mulsd 30 * SIZE(BB), %xmm1 addsd %xmm3, %xmm6 movlpd 40 * SIZE(BB), %xmm3 addsd %xmm1, %xmm7 movlpd 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L58 .L56: mulsd %xmm0, %xmm2 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movlpd 4 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movlpd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: addsd %xmm6, %xmm4 addsd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax addl %eax, AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) unpcklpd %xmm5, %xmm4 movapd 0 * SIZE(B), %xmm2 subpd %xmm4, %xmm2 #else movlpd 0 * SIZE(AA), %xmm0 movlpd 1 * SIZE(AA), %xmm1 subsd %xmm4, %xmm0 subsd %xmm5, %xmm1 #endif #ifdef LN movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 #endif #ifdef LT movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm4 mulsd %xmm4, %xmm0 movlpd 1 * SIZE(B), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movlpd 3 * SIZE(B), %xmm4 mulsd %xmm4, %xmm1 #endif #ifdef RT movlpd 3 * SIZE(B), %xmm4 mulsd %xmm4, %xmm1 
movlpd 2 * SIZE(B), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movlpd 0 * SIZE(B), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movlpd %xmm2, 0 * SIZE(BB) movlpd %xmm2, 1 * SIZE(BB) movhpd %xmm2, 2 * SIZE(BB) movhpd %xmm2, 3 * SIZE(BB) #else movlpd %xmm0, 0 * SIZE(AA) movlpd %xmm1, 1 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) #else movlpd %xmm0, 0 * SIZE(CO1) movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA,%eax, SIZE), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L59: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 2), B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif ALIGN_4 .L60: movl N, %eax sarl $2, %eax movl %eax, J jle .L999 ALIGN_2 .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG leal (, %eax, SIZE), %eax leal (B, %eax, 4), B leal (BB, %eax, 8), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $1, %eax jle .L05 ALIGN_4 .L02: #define COPYPREFETCH 40 prefetchnta (COPYPREFETCH) * SIZE(B) movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq 2 * SIZE(B), %mm2 movq 3 * SIZE(B), %mm3 movq 4 * SIZE(B), %mm4 movq 5 * SIZE(B), %mm5 movq 6 * SIZE(B), %mm6 movq 7 * SIZE(B), %mm7 movq %mm0, 0 * SIZE(BB) movq %mm0, 1 * SIZE(BB) movq %mm1, 2 * SIZE(BB) movq %mm1, 3 * SIZE(BB) movq %mm2, 4 * SIZE(BB) movq %mm2, 5 * SIZE(BB) movq %mm3, 6 * SIZE(BB) movq %mm3, 7 * SIZE(BB) movq %mm4, 8 * SIZE(BB) movq %mm4, 9 * SIZE(BB) movq %mm5, 10 * SIZE(BB) movq %mm5, 11 * SIZE(BB) movq %mm6, 12 * SIZE(BB) movq %mm6, 13 * SIZE(BB) movq %mm7, 14 * SIZE(BB) movq %mm7, 15 * SIZE(BB) addl $ 8 * SIZE, B addl $16 * SIZE, BB decl %eax jne .L02 ALIGN_2 .L05: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $1, %eax BRANCH jle .L10 movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq 2 * SIZE(B), %mm2 movq 3 * SIZE(B), %mm3 movq %mm0, 0 * SIZE(BB) movq %mm0, 1 * SIZE(BB) movq %mm1, 2 * SIZE(BB) movq %mm1, 3 * SIZE(BB) movq %mm2, 4 * SIZE(BB) movq %mm2, 5 * SIZE(BB) movq %mm3, 6 * SIZE(BB) movq %mm3, 7 * SIZE(BB) addl $4 * SIZE, B ALIGN_4 .L10: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 4), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $3 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movapd 0 * SIZE(AA), %xmm0 movapd 8 * SIZE(AA), %xmm1 movapd 0 * SIZE(BB), %xmm2 movapd 8 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), 
%eax #ifdef LN prefetchw -2 * SIZE(CO1) prefetchw -2 * SIZE(CO1, LDC) prefetchw -2 * SIZE(CO1, LDC, 2) prefetchw -2 * SIZE(CO1, %eax) #else prefetchw 1 * SIZE(CO1) prefetchw 1 * SIZE(CO1, LDC) prefetchw 1 * SIZE(CO1, LDC, 2) prefetchw 1 * SIZE(CO1, %eax) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif #if 1 andl $-8, %eax sall $4, %eax je .L15 .L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) cmpl $128 * 1, %eax jle .L12 KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) cmpl $128 * 2, %eax jle .L12 KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) cmpl $128 * 3, %eax jle .L12 KERNEL1(16 * 3) KERNEL2(16 * 3) KERNEL3(16 * 3) KERNEL4(16 * 3) KERNEL5(16 * 3) KERNEL6(16 * 3) KERNEL7(16 * 3) KERNEL8(16 * 3) cmpl $128 * 4, %eax jle .L12 KERNEL1(16 * 4) KERNEL2(16 * 4) KERNEL3(16 * 4) KERNEL4(16 * 4) KERNEL5(16 * 4) KERNEL6(16 * 4) KERNEL7(16 * 4) KERNEL8(16 * 4) cmpl $128 * 5, %eax jle .L12 KERNEL1(16 * 5) KERNEL2(16 * 5) KERNEL3(16 * 5) KERNEL4(16 * 5) KERNEL5(16 * 5) KERNEL6(16 * 5) KERNEL7(16 * 5) KERNEL8(16 * 5) cmpl $128 * 6, %eax jle .L12 KERNEL1(16 * 6) KERNEL2(16 * 6) KERNEL3(16 * 6) KERNEL4(16 * 6) KERNEL5(16 * 6) KERNEL6(16 * 6) KERNEL7(16 * 6) KERNEL8(16 * 6) cmpl $128 * 7, %eax jle .L12 KERNEL1(16 * 7) KERNEL2(16 * 7) KERNEL3(16 * 7) KERNEL4(16 * 7) KERNEL5(16 * 7) KERNEL6(16 * 7) KERNEL7(16 * 7) KERNEL8(16 * 7) addl $128 * 4 * SIZE, BB addl $128 * 1 * SIZE, AA subl $128 * 8, %eax jg .L1X jmp .L15 .L12: leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB ALIGN_4 #else sarl $3, %eax je .L15 ALIGN_4 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $64 * SIZE, BB addl $16 * SIZE, AA decl %eax jne .L12 ALIGN_4 #endif .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movapd 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 8 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $4, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 4), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd %xmm6, %xmm1 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm1 movapd 0 * SIZE(B), %xmm2 movapd 2 * SIZE(B), %xmm5 movapd 4 * SIZE(B), %xmm3 movapd 6 * SIZE(B), %xmm7 subpd %xmm4, %xmm2 subpd %xmm6, %xmm5 subpd %xmm0, %xmm3 subpd %xmm1, %xmm7 #else movapd 0 * SIZE(AA), %xmm0 movapd 2 * SIZE(AA), %xmm1 movapd 4 * SIZE(AA), %xmm2 movapd 6 * SIZE(AA), %xmm3 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 subpd %xmm6, %xmm2 subpd %xmm7, %xmm3 #endif #ifdef LN movlpd 3 * SIZE(AA), %xmm4 movhpd 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 mulpd %xmm4, %xmm7 movlpd 2 * SIZE(AA), %xmm4 movhpd 2 * SIZE(AA), %xmm4 movapd %xmm4, %xmm6 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 mulpd %xmm7, %xmm6 subpd %xmm6, %xmm5 movlpd 0 * SIZE(AA), %xmm4 
movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 #endif #ifdef LT movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 movlpd 1 * SIZE(AA), %xmm4 movhpd 1 * SIZE(AA), %xmm4 movapd %xmm4, %xmm6 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 mulpd %xmm5, %xmm6 subpd %xmm6, %xmm7 movlpd 3 * SIZE(AA), %xmm4 movhpd 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 mulpd %xmm4, %xmm7 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 movlpd 1 * SIZE(B), %xmm4 movhpd 1 * SIZE(B), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movlpd 2 * SIZE(B), %xmm4 movhpd 2 * SIZE(B), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm2 movlpd 3 * SIZE(B), %xmm4 movhpd 3 * SIZE(B), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm3 movlpd 5 * SIZE(B), %xmm4 movhpd 5 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 movlpd 6 * SIZE(B), %xmm4 movhpd 6 * SIZE(B), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm2 movlpd 7 * SIZE(B), %xmm4 movhpd 7 * SIZE(B), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm3 movlpd 10 * SIZE(B), %xmm4 movhpd 10 * SIZE(B), %xmm4 mulpd %xmm4, %xmm2 movlpd 11 * SIZE(B), %xmm4 movhpd 11 * SIZE(B), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 movlpd 15 * SIZE(B), %xmm4 movhpd 15 * SIZE(B), %xmm4 mulpd %xmm4, %xmm3 #endif #ifdef RT movlpd 15 * SIZE(B), %xmm4 movhpd 15 * SIZE(B), %xmm4 mulpd %xmm4, %xmm3 movlpd 14 * SIZE(B), %xmm4 movhpd 14 * SIZE(B), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 movlpd 13 * SIZE(B), %xmm4 movhpd 13 * SIZE(B), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm1 movlpd 12 * SIZE(B), %xmm4 movhpd 12 * SIZE(B), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm0 movlpd 10 * SIZE(B), %xmm4 movhpd 10 * SIZE(B), %xmm4 mulpd %xmm4, %xmm2 movlpd 9 * SIZE(B), %xmm4 movhpd 9 * SIZE(B), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm1 movlpd 8 * SIZE(B), %xmm4 movhpd 8 * SIZE(B), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm0 movlpd 5 * SIZE(B), %xmm4 movhpd 5 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 movlpd 4 * SIZE(B), %xmm4 movhpd 4 * SIZE(B), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movlpd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movapd %xmm5, 2 * SIZE(B) movapd %xmm3, 4 * SIZE(B) movapd %xmm7, 6 * SIZE(B) movlpd %xmm2, 0 * SIZE(BB) movlpd %xmm2, 1 * SIZE(BB) movhpd %xmm2, 2 * SIZE(BB) movhpd %xmm2, 3 * SIZE(BB) movlpd %xmm5, 4 * SIZE(BB) movlpd %xmm5, 5 * SIZE(BB) movhpd %xmm5, 6 * SIZE(BB) movhpd %xmm5, 7 * SIZE(BB) movlpd %xmm3, 8 * SIZE(BB) movlpd %xmm3, 9 * SIZE(BB) movhpd %xmm3, 10 * SIZE(BB) movhpd %xmm3, 11 * SIZE(BB) movlpd %xmm7, 12 * SIZE(BB) movlpd %xmm7, 13 * SIZE(BB) movhpd %xmm7, 14 * SIZE(BB) movhpd %xmm7, 15 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) movapd %xmm1, 2 * SIZE(AA) movapd %xmm2, 4 * SIZE(AA) movapd %xmm3, 6 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(CO1) movlpd %xmm3, 1 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) movlpd %xmm5, 0 * SIZE(CO1, LDC, 2) movlpd %xmm7, 1 * SIZE(CO1, LDC, 2) movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) #else movlpd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) movlpd %xmm2, 0 * SIZE(CO1, LDC, 2) movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) movlpd %xmm3, 0 * SIZE(CO1, %eax, 1) movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax 
subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L11 ALIGN_4 .L20: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L29 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $3 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movlpd 0 * SIZE(AA), %xmm0 movlpd 4 * SIZE(AA), %xmm1 movlpd 0 * SIZE(BB), %xmm2 movlpd 8 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L25 ALIGN_4 .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm5 movlpd 4 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm2, %xmm6 movlpd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm7 movlpd 1 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm4 movlpd 10 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm5 movlpd 12 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm3 mulsd 14 * SIZE(BB), %xmm0 addsd %xmm3, %xmm6 movlpd 24 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 2 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 movlpd 18 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm5 movlpd 20 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 mulsd 22 * SIZE(BB), %xmm0 addsd %xmm2, %xmm6 movlpd 32 * SIZE(BB), %xmm2 addsd %xmm0, %xmm7 movlpd 3 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm4 movlpd 26 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm5 movlpd 28 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm3 mulsd 30 * SIZE(BB), %xmm0 addsd %xmm3, %xmm6 movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 addsd %xmm2, %xmm4 movlpd 34 * SIZE(BB), %xmm2 mulsd %xmm1, %xmm2 addsd %xmm2, %xmm5 movlpd 36 * SIZE(BB), %xmm2 mulsd %xmm1, %xmm2 mulsd 38 * SIZE(BB), %xmm1 addsd %xmm2, %xmm6 movlpd 48 * SIZE(BB), %xmm2 addsd %xmm1, %xmm7 movlpd 5 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 addsd %xmm3, %xmm4 movlpd 42 * SIZE(BB), %xmm3 mulsd %xmm1, %xmm3 addsd %xmm3, %xmm5 movlpd 44 * SIZE(BB), %xmm3 mulsd %xmm1, %xmm3 mulsd 46 * SIZE(BB), %xmm1 addsd %xmm3, %xmm6 movlpd 56 * SIZE(BB), %xmm3 addsd %xmm1, %xmm7 movlpd 6 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm2 addsd %xmm2, %xmm4 movlpd 50 * SIZE(BB), %xmm2 mulsd %xmm1, %xmm2 addsd %xmm2, %xmm5 movlpd 52 * SIZE(BB), %xmm2 mulsd %xmm1, %xmm2 mulsd 54 * SIZE(BB), %xmm1 addsd %xmm2, %xmm6 movlpd 64 * SIZE(BB), %xmm2 addsd %xmm1, %xmm7 movlpd 7 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 addsd %xmm3, %xmm4 movlpd 58 * SIZE(BB), %xmm3 mulsd %xmm1, %xmm3 addsd %xmm3, %xmm5 movlpd 60 * SIZE(BB), %xmm3 mulsd %xmm1, %xmm3 mulsd 62 * SIZE(BB), %xmm1 addsd %xmm3, %xmm6 movlpd 72 * SIZE(BB), %xmm3 addl $64 * SIZE, BB addsd %xmm1, %xmm7 movlpd 12 * SIZE(AA), %xmm1 addl $8 * SIZE, AA decl %eax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # 
if (k & 1) BRANCH je .L28 .L26: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 movlpd 2 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm5 movlpd 4 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm2, %xmm6 movlpd 8 * SIZE(BB), %xmm2 addsd %xmm0, %xmm7 movlpd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $4, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax addl %eax, AA leal (B, %eax, 4), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) unpcklpd %xmm5, %xmm4 unpcklpd %xmm7, %xmm6 movapd 0 * SIZE(B), %xmm2 movapd 2 * SIZE(B), %xmm5 subpd %xmm4, %xmm2 subpd %xmm6, %xmm5 #else movlpd 0 * SIZE(AA), %xmm0 movlpd 1 * SIZE(AA), %xmm1 movlpd 2 * SIZE(AA), %xmm2 movlpd 3 * SIZE(AA), %xmm3 subsd %xmm4, %xmm0 subsd %xmm5, %xmm1 subsd %xmm6, %xmm2 subsd %xmm7, %xmm3 #endif #ifdef LN movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 #endif #ifdef LT movlpd 0 * SIZE(AA), %xmm4 movhpd 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm4 mulsd %xmm4, %xmm0 movlpd 1 * SIZE(B), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movlpd 2 * SIZE(B), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm2 movlpd 3 * SIZE(B), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm3 movlpd 5 * SIZE(B), %xmm4 mulsd %xmm4, %xmm1 movlpd 6 * SIZE(B), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm2 movlpd 7 * SIZE(B), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm3 movlpd 10 * SIZE(B), %xmm4 mulsd %xmm4, %xmm2 movlpd 11 * SIZE(B), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm3 movlpd 15 * SIZE(B), %xmm4 mulsd %xmm4, %xmm3 #endif #ifdef RT movlpd 15 * SIZE(B), %xmm4 mulsd %xmm4, %xmm3 movlpd 14 * SIZE(B), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm2 movlpd 13 * SIZE(B), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm1 movlpd 12 * SIZE(B), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm0 movlpd 10 * SIZE(B), %xmm4 mulsd %xmm4, %xmm2 movlpd 9 * SIZE(B), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm1 movlpd 8 * SIZE(B), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm0 movlpd 5 * SIZE(B), %xmm4 mulsd %xmm4, %xmm1 movlpd 4 * SIZE(B), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movlpd 0 * SIZE(B), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movapd %xmm5, 2 * SIZE(B) movlpd %xmm2, 0 * SIZE(BB) movlpd %xmm2, 1 * SIZE(BB) movhpd %xmm2, 2 * SIZE(BB) movhpd %xmm2, 3 * SIZE(BB) movlpd %xmm5, 4 * SIZE(BB) movlpd %xmm5, 5 * SIZE(BB) movhpd %xmm5, 6 * SIZE(BB) movhpd %xmm5, 7 * SIZE(BB) #else movlpd %xmm0, 0 * SIZE(AA) movlpd %xmm1, 1 * SIZE(AA) movlpd %xmm2, 2 * SIZE(AA) movlpd %xmm3, 3 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) movlpd %xmm5, 0 * SIZE(CO1, LDC, 2) movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) #else movlpd %xmm0, 0 * SIZE(CO1) movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) movlpd %xmm2, 0 * SIZE(CO1, LDC, 2) movlpd %xmm3, 0 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA,%eax, SIZE), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L29: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 4), B 
#endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 4), B #endif #ifdef RN addl $4, KK #endif #ifdef RT subl $4, KK #endif decl J # j -- jg .L01 ALIGN_4 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_RT_2x4_sse3.S000066400000000000000000001076131313527062700215710ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define ARG_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define ARG_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #ifdef PENTIUM4 #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #ifdef PENTIUMM #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define CO1 %esi PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC movl OFFSET, %eax #ifdef RN negl %eax #endif movl %eax, KK leal (, LDC, SIZE), LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif testl $1, N je .L30 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L80 ALIGN_4 .L71: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movddup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movddup 4 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifdef LN prefetchnta -2 * SIZE(CO1) #else prefetchnta 2 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm2, %xmm0 movddup 1 * SIZE(BB), %xmm2 addpd %xmm0, %xmm4 movapd 16 * SIZE(AA), %xmm0 mulpd 2 * SIZE(AA), %xmm2 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd 4 * SIZE(AA), %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd 6 * SIZE(AA), %xmm2 addpd %xmm2, %xmm7 movddup 8 * SIZE(BB), %xmm2 mulpd %xmm3, %xmm1 movddup 5 * SIZE(BB), %xmm3 addpd %xmm1, %xmm4 movapd 24 * SIZE(AA), %xmm1 mulpd 10 * SIZE(AA), %xmm3 addpd %xmm3, %xmm5 movddup 6 * SIZE(BB), %xmm3 mulpd 12 * SIZE(AA), %xmm3 addpd %xmm3, %xmm6 movddup 7 * SIZE(BB), %xmm3 mulpd 14 * SIZE(AA), %xmm3 addpd %xmm3, %xmm7 movddup 12 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $ 8 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: mulpd %xmm2, %xmm0 movddup 1 * SIZE(BB), %xmm2 addpd %xmm0, %xmm4 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: addpd %xmm5, %xmm4 addpd 
%xmm7, %xmm6 addpd %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BB), %xmm1 subpd %xmm4, %xmm1 movapd %xmm1, %xmm0 unpckhpd %xmm1, %xmm1 #else movapd 0 * SIZE(AA), %xmm0 subpd %xmm4, %xmm0 #endif #ifdef LN movsd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 movsd 2 * SIZE(AA), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(AA), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movsd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 #endif #ifdef RN movddup 0 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef RT movddup 0 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BB) movsd %xmm1, 1 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 1 * SIZE(CO1) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA addl %eax, BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L71 ALIGN_4 .L80: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L89 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movsd 0 * SIZE(AA), %xmm0 movhpd 1 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd 8 * SIZE(AA), %xmm1 movhpd 9 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movsd 0 * SIZE(BB), %xmm2 movhpd 1 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movsd 8 * SIZE(BB), %xmm3 movhpd 9 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $4, %eax je .L85 ALIGN_4 .L82: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 mulpd 2 * SIZE(BB), %xmm0 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 4 * SIZE(AA), %xmm0 mulpd 4 * SIZE(BB), %xmm0 addpd %xmm0, %xmm6 movapd 6 * SIZE(AA), %xmm0 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm3 movapd 10 * SIZE(AA), %xmm1 addpd %xmm3, %xmm4 mulpd 10 * SIZE(BB), %xmm1 movapd 24 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 12 * SIZE(AA), %xmm1 mulpd 12 * SIZE(BB), %xmm1 addpd %xmm1, %xmm6 movapd 14 * SIZE(AA), %xmm1 mulpd 14 * SIZE(BB), %xmm1 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L82 ALIGN_4 .L85: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $15, %eax # if (k & 1) BRANCH je .L88 .L86: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 1 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L86 ALIGN_4 .L88: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 addpd %xmm6, %xmm4 haddpd %xmm4, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax addl %eax, AA leal (B, %eax, 1), BB #endif #if defined(LN) || 
defined(LT) movsd 0 * SIZE(BB), %xmm0 subsd %xmm4, %xmm0 #else movsd 0 * SIZE(AA), %xmm0 subsd %xmm4, %xmm0 #endif #ifdef LN movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef RN movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #ifdef RT movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) #else movsd %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax addl %eax, AA addl %eax, BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L89: #ifdef LN movl K, %eax leal (B, %eax, SIZE), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L30: testl $2, N je .L60 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L50 ALIGN_4 .L41: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movddup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movddup 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifdef LN prefetchnta -2 * SIZE(CO1) prefetchnta -2 * SIZE(CO1, LDC, 1) #else prefetchnta 2 * SIZE(CO1) prefetchnta 2 * SIZE(CO1, LDC, 1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 5 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 6 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movddup 6 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 7 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 16 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 16 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 9 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 10 * SIZE(AA), %xmm1 addpd %xmm3, %xmm5 movddup 10 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 11 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 12 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movddup 12 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 13 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 14 * SIZE(AA), %xmm1 addpd %xmm3, %xmm5 movddup 14 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 15 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 24 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 
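/* Note: end of the 8x-unrolled k loop body for this 2x2 block.  The
   instructions that follow appear to preload the next duplicated B value,
   advance AA and BB by 16 doubles each (two values per k step on either
   side), and branch back to .L42 while iterations remain; the k & 7
   remainder is then handled by the short loop at .L46. */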
movddup 24 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd 0 * SIZE(BB), %xmm2 movapd 2 * SIZE(BB), %xmm3 subpd %xmm4, %xmm2 subpd %xmm0, %xmm3 #else movapd 0 * SIZE(AA), %xmm0 movapd 2 * SIZE(AA), %xmm1 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 #endif #ifdef LN movddup 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 movddup 2 * SIZE(AA), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 #endif #ifdef LT movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 movddup 1 * SIZE(AA), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 movddup 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 #endif #ifdef RN movddup 0 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 movddup 1 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movddup 3 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 #endif #ifdef RT movddup 3 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 movddup 2 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movddup 0 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(BB) movapd %xmm3, 2 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) movapd %xmm1, 2 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movsd %xmm3, 1 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L41 ALIGN_4 .L50: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L59 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movddup 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movapd 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movapd 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $4, %eax je .L55 ALIGN_4 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 movddup 1 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 mulpd 2 * SIZE(BB), %xmm0 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movddup 2 * SIZE(AA), %xmm0 mulpd 4 * SIZE(BB), %xmm0 addpd %xmm0, %xmm6 movddup 3 * SIZE(AA), %xmm0 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm0, %xmm7 movddup 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movddup 5 * SIZE(AA), %xmm0 
addpd %xmm3, %xmm4 mulpd 10 * SIZE(BB), %xmm0 movapd 24 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movddup 6 * SIZE(AA), %xmm0 mulpd 12 * SIZE(BB), %xmm0 addpd %xmm0, %xmm6 movddup 7 * SIZE(AA), %xmm0 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm0, %xmm7 movddup 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 movddup 9 * SIZE(AA), %xmm1 addpd %xmm2, %xmm4 mulpd 18 * SIZE(BB), %xmm1 movapd 32 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movddup 10 * SIZE(AA), %xmm1 mulpd 20 * SIZE(BB), %xmm1 addpd %xmm1, %xmm6 movddup 11 * SIZE(AA), %xmm1 mulpd 22 * SIZE(BB), %xmm1 addpd %xmm1, %xmm7 movddup 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 movddup 13 * SIZE(AA), %xmm1 addpd %xmm3, %xmm4 mulpd 26 * SIZE(BB), %xmm1 movapd 40 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movddup 14 * SIZE(AA), %xmm1 mulpd 28 * SIZE(BB), %xmm1 addpd %xmm1, %xmm6 movddup 15 * SIZE(AA), %xmm1 mulpd 30 * SIZE(BB), %xmm1 addpd %xmm1, %xmm7 movddup 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $15, %eax # if (k & 1) BRANCH je .L58 .L56: mulpd %xmm0, %xmm2 movddup 1 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 movapd 2 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 addpd %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax addl %eax, AA leal (B, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BB), %xmm0 subpd %xmm4, %xmm0 #else movapd 0 * SIZE(AA), %xmm1 subpd %xmm4, %xmm1 movapd %xmm1, %xmm0 unpckhpd %xmm1, %xmm1 #endif #ifdef LN movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef LT movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef RN movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movsd 3 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 #endif #ifdef RT movsd 3 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd 2 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm0, 0 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) movsd %xmm1, 1 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) #else movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L59: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif ALIGN_4 .L60: movl N, %eax sarl $2, %eax movl %eax, J jle .L999 ALIGN_2 .L10: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 4), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax 
subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movddup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movddup 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 leal (LDC, LDC, 2), %eax #ifdef LN prefetchnta -2 * SIZE(CO1) prefetchnta -2 * SIZE(CO1, LDC, 1) prefetchnta -2 * SIZE(CO1, LDC, 2) prefetchnta -2 * SIZE(CO1, %eax, 1) #else prefetchnta 2 * SIZE(CO1) prefetchnta 2 * SIZE(CO1, LDC, 1) prefetchnta 2 * SIZE(CO1, LDC, 2) prefetchnta 2 * SIZE(CO1, %eax, 1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L12: mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 5 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movddup 6 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 7 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 16 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm4 movddup 9 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm5 movddup 10 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm6 movddup 11 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 movapd 6 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movddup 12 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm4 movddup 13 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm5 movddup 14 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm6 movddup 15 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 movapd 16 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movddup 24 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movddup 17 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movddup 18 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm6 movddup 19 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 movapd 10 * SIZE(AA), %xmm1 addpd %xmm2, %xmm7 movddup 20 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movddup 21 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movddup 22 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm6 movddup 23 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 movapd 12 * SIZE(AA), %xmm1 addpd %xmm2, %xmm7 movddup 32 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 25 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movddup 26 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 27 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 14 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movddup 28 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 29 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movddup 30 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 31 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 24 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movddup 40 * SIZE(BB), %xmm3 addl $32 * SIZE, BB addl $16 * SIZE, AA decl %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 
addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $4, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd %xmm6, %xmm1 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm1 movapd 0 * SIZE(BB), %xmm2 movapd 2 * SIZE(BB), %xmm5 movapd 4 * SIZE(BB), %xmm3 movapd 6 * SIZE(BB), %xmm7 subpd %xmm4, %xmm2 subpd %xmm6, %xmm5 subpd %xmm0, %xmm3 subpd %xmm1, %xmm7 #else movapd 0 * SIZE(AA), %xmm0 movapd 2 * SIZE(AA), %xmm1 movapd 4 * SIZE(AA), %xmm2 movapd 6 * SIZE(AA), %xmm3 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 subpd %xmm6, %xmm2 subpd %xmm7, %xmm3 #endif #ifdef LN movddup 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 mulpd %xmm4, %xmm7 movddup 2 * SIZE(AA), %xmm4 movapd %xmm4, %xmm6 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 mulpd %xmm7, %xmm6 subpd %xmm6, %xmm5 movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 #endif #ifdef LT movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm5 movddup 1 * SIZE(AA), %xmm4 movapd %xmm4, %xmm6 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 mulpd %xmm5, %xmm6 subpd %xmm6, %xmm7 movddup 3 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm3 mulpd %xmm4, %xmm7 #endif #ifdef RN movddup 0 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 movddup 1 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movddup 2 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm2 movddup 3 * SIZE(BB), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm3 movddup 5 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 movddup 6 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm2 movddup 7 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm3 movddup 10 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm2 movddup 11 * SIZE(BB), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm3 movddup 15 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm3 #endif #ifdef RT movddup 15 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm3 movddup 14 * SIZE(BB), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm2 movddup 13 * SIZE(BB), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm1 movddup 12 * SIZE(BB), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm0 movddup 10 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm2 movddup 9 * SIZE(BB), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm1 movddup 8 * SIZE(BB), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm0 movddup 5 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm1 movddup 4 * SIZE(BB), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movddup 0 * SIZE(BB), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(BB) movapd %xmm5, 2 * SIZE(BB) movapd %xmm3, 4 * SIZE(BB) movapd %xmm7, 6 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) movapd %xmm1, 2 * SIZE(AA) movapd %xmm2, 4 * SIZE(AA) movapd %xmm3, 6 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movsd %xmm3, 1 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) movsd %xmm5, 0 * SIZE(CO1, LDC, 2) movsd %xmm7, 1 * SIZE(CO1, LDC, 2) movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) 
movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO1, %eax, 1) movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L11 ALIGN_4 .L20: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L29 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movddup 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movapd 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movapd 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $4, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movddup 1 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movddup 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 10 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movddup 3 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movddup 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 18 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 20 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movddup 5 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 22 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 32 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movddup 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 26 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 28 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movddup 7 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 30 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 40 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movddup 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd 34 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 36 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movddup 9 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 38 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 48 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movddup 10 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 42 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 44 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movddup 11 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 46 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 56 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movddup 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 50 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 52 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movddup 13 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 54 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 64 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movddup 14 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 58 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 60 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movddup 15 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 62 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 72 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movddup 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $15, %eax # if (k & 1) 
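/* Note: tail handling for this 1x4 block.  The main loop at .L22 above is
   unrolled 16x over k, so the mask just computed (k & 15) gives the
   leftover iterations; the .L26 loop below processes them one k step at a
   time before the partial sums are combined and the triangular solve is
   applied at .L28. */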
BRANCH je .L28 .L26: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movddup 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $4, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BB), %xmm0 movapd 2 * SIZE(BB), %xmm1 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 #else movapd 0 * SIZE(AA), %xmm1 movapd 2 * SIZE(AA), %xmm3 subpd %xmm4, %xmm1 subpd %xmm5, %xmm3 movapd %xmm1, %xmm0 unpckhpd %xmm1, %xmm1 movapd %xmm3, %xmm2 unpckhpd %xmm3, %xmm3 #endif #ifdef LN movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #ifdef LT movddup 0 * SIZE(AA), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #ifdef RN movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 movsd 2 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm2 movsd 3 * SIZE(BB), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm3 movsd 5 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd 6 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm2 movsd 7 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm3 movsd 10 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm2 movsd 11 * SIZE(BB), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm3 movsd 15 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm3 #endif #ifdef RT movsd 15 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm3 movsd 14 * SIZE(BB), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm2 movsd 13 * SIZE(BB), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm1 movsd 12 * SIZE(BB), %xmm4 mulsd %xmm3, %xmm4 subsd %xmm4, %xmm0 movsd 10 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm2 movsd 9 * SIZE(BB), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm1 movsd 8 * SIZE(BB), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm0 movsd 5 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm1 movsd 4 * SIZE(BB), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 movsd 0 * SIZE(BB), %xmm4 mulsd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm0, 0 * SIZE(BB) movapd %xmm1, 2 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) movsd %xmm1, 1 * SIZE(AA) movsd %xmm2, 2 * SIZE(AA) movsd %xmm3, 3 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) movsd %xmm1, 0 * SIZE(CO1, LDC, 2) movhpd %xmm1, 0 * SIZE(CO1, %eax, 1) #else movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L29: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 4), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $4, KK #endif #ifdef RT subl $4, KK #endif decl J # j -- jg .L10 ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_RT_4x2_core2.S000066400000000000000000001126651313527062700217310ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. 
*/ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if !defined(HAVE_SSE2) || !defined(HAVE_MMX) #error You have to check your configuration. 
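/* Note: this kernel stages its integer arguments through MMX registers in
   the prologue (movd/EMMS) and does its arithmetic in xmm registers, which
   appears to be why the build is aborted above when SSE2 or MMX support is
   missing from the configuration. */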
#endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA 16 + STACK + ARGS(%esi) #define STACK_A 24 + STACK + ARGS(%esi) #define STACK_B 28 + STACK + ARGS(%esi) #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define AORIG 56(%esp) #define BORIG 60(%esp) #define BUFFER 128(%esp) #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #define B %edi #define AA %edx #define BB %ecx #define LDC %ebp #define CO1 %esi PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp addl $STACK_OFFSET, %esp STACK_TOUCHING movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 movd STACK_A, %mm2 movl STACK_B, B movd STACK_C, %mm3 movl STACK_LDC, LDC movd STACK_OFFT, %mm4 movd %mm1, K movl %eax, N movd %mm0, M movd %mm2, A movd %mm3, C movl %esi, OLD_STACK movd %mm4, OFFSET movd %mm4, KK subl $-16 * SIZE, A subl $-16 * SIZE, B sall $BASE_SHIFT, LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax testl $1, %eax jle .L100 ALIGN_2 .L101: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal 16 * SIZE + BUFFER, BB #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG leal (, %eax, SIZE), %eax leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax jle .L103 ALIGN_4 .L102: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -12 * SIZE(B), %xmm4 movddup -11 * SIZE(B), %xmm5 movddup -10 * SIZE(B), %xmm6 movddup -9 * SIZE(B), %xmm7 movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) movapd %xmm2, -12 * SIZE(BB) movapd %xmm3, -10 * SIZE(BB) movapd %xmm4, -8 * SIZE(BB) movapd %xmm5, -6 * SIZE(BB) movapd %xmm6, -4 * SIZE(BB) movapd %xmm7, -2 * SIZE(BB) addl $ 8 * SIZE, B addl $16 * SIZE, %ecx decl %eax BRANCH jne .L102 ALIGN_2 .L103: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH jle .L105 ALIGN_2 .L104: movddup -16 * SIZE(B), %xmm0 movapd %xmm0, -16 * SIZE(BB) addl $1 * SIZE, B addl $2 * SIZE, BB decl %eax jne .L104 ALIGN_4 .L105: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 # coffset = c #ifndef RT addl LDC, C #endif movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L130 ALIGN_4 .L110: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif leal 16 * SIZE + BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif 
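/* Note: the register loads and the .L111 loop that follow appear to
   accumulate a 4x1 block for this TRSM kernel: four packed rows of A times
   one duplicated column value of B per k step, unrolled eight k iterations
   per pass into %xmm4-%xmm7 and combined at .L114, after which the
   LN/LT/RN/RT blocks apply the triangular substitution.  A minimal C-style
   sketch of the accumulation only (a, b, c and kc are illustrative names,
   not symbols from this file; the solve step is not shown):

       for (k = 0; k < kc; k++)
           for (i = 0; i < 4; i++)
               c[i] += a[4 * k + i] * b[k];
*/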
movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movapd -8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L112 .L111: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AA), %xmm1 addpd %xmm0, %xmm4 movapd -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm6 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 mulpd -10 * SIZE(AA), %xmm1 addpd %xmm0, %xmm5 movapd 0 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movapd -12 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm2 mulpd -6 * SIZE(AA), %xmm1 addpd %xmm2, %xmm4 movapd -4 * SIZE(AA), %xmm2 addpd %xmm1, %xmm6 movapd -10 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm2 mulpd -2 * SIZE(AA), %xmm1 addpd %xmm2, %xmm5 movapd 8 * SIZE(AA), %xmm2 addpd %xmm1, %xmm7 movapd 0 * SIZE(BB), %xmm1 mulpd %xmm3, %xmm0 mulpd 2 * SIZE(AA), %xmm3 addpd %xmm0, %xmm4 movapd 4 * SIZE(AA), %xmm0 addpd %xmm3, %xmm6 movapd -6 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm0 mulpd 6 * SIZE(AA), %xmm3 addpd %xmm0, %xmm5 movapd 16 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movapd -4 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm2 mulpd 10 * SIZE(AA), %xmm3 addpd %xmm2, %xmm4 movapd 12 * SIZE(AA), %xmm2 addpd %xmm3, %xmm6 movapd -2 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm2 mulpd 14 * SIZE(AA), %xmm3 addpd %xmm2, %xmm5 movapd 24 * SIZE(AA), %xmm2 addpd %xmm3, %xmm7 movapd 8 * SIZE(BB), %xmm3 addl $ 32 * SIZE, AA subl $-16 * SIZE, BB decl %eax jne .L111 .L112: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L114 .L113: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AA), %xmm1 addpd %xmm0, %xmm4 movapd -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm6 movapd -14 * SIZE(BB), %xmm1 addl $4 * SIZE, AA addl $2 * SIZE, BB subl $1, %eax jg .L113 ALIGN_4 .L114: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm0 movapd -14 * SIZE(B), %xmm1 #else movapd -16 * SIZE(AA), %xmm0 movapd -14 * SIZE(AA), %xmm1 #endif subpd %xmm4, %xmm0 subpd %xmm6, %xmm1 #ifdef LN movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movapd %xmm1, %xmm3 unpckhpd %xmm3, %xmm3 movsd -1 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm3 movsd -2 * SIZE(AA), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm1 movsd -3 * SIZE(AA), %xmm6 mulsd %xmm3, %xmm6 subsd %xmm6, %xmm2 movsd -4 * SIZE(AA), %xmm7 mulsd %xmm3, %xmm7 subsd %xmm7, %xmm0 movsd -6 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 movsd -7 * SIZE(AA), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm2 movsd -8 * SIZE(AA), %xmm6 mulsd %xmm1, %xmm6 subsd %xmm6, %xmm0 movsd -11 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd -12 * SIZE(AA), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 unpcklpd %xmm2, %xmm0 unpcklpd %xmm3, %xmm1 #endif #ifdef LT movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movapd %xmm1, %xmm3 unpckhpd %xmm3, %xmm3 movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd -15 * SIZE(AA), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd -14 * SIZE(AA), %xmm6 mulsd %xmm0, %xmm6 subsd %xmm6, %xmm1 movsd -13 * SIZE(AA), %xmm7 mulsd %xmm0, %xmm7 subsd %xmm7, %xmm3 movsd -11 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd -10 * SIZE(AA), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm1 movsd -9 * SIZE(AA), %xmm6 mulsd %xmm2, %xmm6 
subsd %xmm6, %xmm3 movsd -6 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 movsd -5 * SIZE(AA), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm3 movsd -1 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm3 unpcklpd %xmm2, %xmm0 unpcklpd %xmm3, %xmm1 #endif #if defined(RN) || defined(RT) movddup -16 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #ifdef LN subl $4 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(B) movapd %xmm1, -14 * SIZE(B) movddup %xmm0, %xmm2 movddup %xmm1, %xmm3 unpckhpd %xmm0, %xmm0 unpckhpd %xmm1, %xmm1 movapd %xmm2, -16 * SIZE(BB) movapd %xmm0, -14 * SIZE(BB) movapd %xmm3, -12 * SIZE(BB) movapd %xmm1, -10 * SIZE(BB) #else movapd %xmm0, -16 * SIZE(AA) movapd %xmm1, -14 * SIZE(AA) #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif BRANCH decl %ebx # i -- jg .L110 ALIGN_2 .L130: movl M, %ebx testl $2, %ebx jle .L150 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal 16 * SIZE + BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm2 movapd -8 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L132 .L131: mulpd %xmm0, %xmm1 movapd -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm1 movapd -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movapd -12 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm1 movapd -10 * SIZE(AA), %xmm0 addpd %xmm1, %xmm4 movapd -10 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm1 movapd 0 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movapd 0 * SIZE(BB), %xmm1 mulpd %xmm2, %xmm3 movapd -6 * SIZE(AA), %xmm2 addpd %xmm3, %xmm4 movapd -6 * SIZE(BB), %xmm3 mulpd %xmm2, %xmm3 movapd -4 * SIZE(AA), %xmm2 addpd %xmm3, %xmm5 movapd -4 * SIZE(BB), %xmm3 mulpd %xmm2, %xmm3 movapd -2 * SIZE(AA), %xmm2 addpd %xmm3, %xmm4 movapd -2 * SIZE(BB), %xmm3 mulpd %xmm2, %xmm3 movapd 8 * SIZE(AA), %xmm2 addpd %xmm3, %xmm5 movapd 8 * SIZE(BB), %xmm3 subl $-16 * SIZE, AA subl $-16 * SIZE, BB BRANCH decl %eax jne .L131 .L132: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L134 .L133: mulpd %xmm0, %xmm1 movapd -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L133 ALIGN_4 .L134: addpd %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm0 #else movapd -16 * SIZE(AA), %xmm0 #endif subpd %xmm4, %xmm0 #ifdef LN movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movsd -13 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd -14 * SIZE(AA), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 
-16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 unpcklpd %xmm2, %xmm0 #endif #ifdef LT movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movsd -16 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd -15 * SIZE(AA), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd -13 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm2, %xmm0 #endif #if defined(RN) || defined(RT) movddup -16 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef LN subl $2 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(B) movddup %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 movapd %xmm1, -16 * SIZE(BB) movapd %xmm0, -14 * SIZE(BB) #else movapd %xmm0, -16 * SIZE(AA) #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L150: movl M, %ebx testl $1, %ebx jle .L159 #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA #endif leal 16 * SIZE + BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movsd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movsd -8 * SIZE(BB), %xmm3 movsd -12 * SIZE(AA), %xmm2 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L152 .L151: mulsd %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 addsd %xmm1, %xmm4 movsd -14 * SIZE(BB), %xmm1 mulsd %xmm0, %xmm1 movsd -14 * SIZE(AA), %xmm0 addsd %xmm1, %xmm5 movsd -12 * SIZE(BB), %xmm1 mulsd %xmm0, %xmm1 movsd -13 * SIZE(AA), %xmm0 addsd %xmm1, %xmm4 movsd -10 * SIZE(BB), %xmm1 mulsd %xmm0, %xmm1 movsd -8 * SIZE(AA), %xmm0 addsd %xmm1, %xmm5 movsd -0 * SIZE(BB), %xmm1 mulsd %xmm2, %xmm3 movsd -11 * SIZE(AA), %xmm2 addsd %xmm3, %xmm4 movsd -6 * SIZE(BB), %xmm3 mulsd %xmm2, %xmm3 movsd -10 * SIZE(AA), %xmm2 addsd %xmm3, %xmm5 movsd -4 * SIZE(BB), %xmm3 mulsd %xmm2, %xmm3 movsd -9 * SIZE(AA), %xmm2 addsd %xmm3, %xmm4 movsd -2 * SIZE(BB), %xmm3 mulsd %xmm2, %xmm3 movsd -4 * SIZE(AA), %xmm2 addsd %xmm3, %xmm5 movsd 8 * SIZE(BB), %xmm3 subl $ -8 * SIZE, AA subl $-16 * SIZE, BB BRANCH decl %eax jne .L151 .L152: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L154 .L153: mulsd %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 addsd %xmm1, %xmm4 movsd -14 * SIZE(BB), %xmm1 addl $1 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L153 ALIGN_4 .L154: addsd %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(B), %xmm0 #else movsd -16 * SIZE(AA), %xmm0 #endif subsd %xmm4, %xmm0 #if defined(LN) || defined(LT) mulsd -16 * SIZE(AA), %xmm0 #endif #if defined(RN) || defined(RT) mulsd -16 * SIZE(B), %xmm0 #endif #ifdef LN subl $1 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm0, -16 * SIZE(B) movsd %xmm0, -16 * SIZE(BB) movsd %xmm0, -15 * SIZE(BB) #else movsd %xmm0, -16 * 
SIZE(AA) #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA #ifdef LT addl $1 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L159: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 1), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 1), B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_2 .L100: movl N, %eax sarl $1, %eax movl %eax, J jle .L999 ALIGN_2 .L01: /* Copying to Sub Buffer */ #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal 16 * SIZE + BUFFER, BB #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG leal (, %eax, SIZE), %eax leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L03 ALIGN_2 .L02: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -12 * SIZE(B), %xmm4 movddup -11 * SIZE(B), %xmm5 movddup -10 * SIZE(B), %xmm6 movddup -9 * SIZE(B), %xmm7 movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) movapd %xmm2, -12 * SIZE(BB) movapd %xmm3, -10 * SIZE(BB) movapd %xmm4, -8 * SIZE(BB) movapd %xmm5, -6 * SIZE(BB) movapd %xmm6, -4 * SIZE(BB) movapd %xmm7, -2 * SIZE(BB) addl $ 8 * SIZE, B addl $16 * SIZE, %ecx decl %eax jne .L02 ALIGN_2 .L03: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L05 ALIGN_4 .L04: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) addl $2 * SIZE, B addl $4 * SIZE, %ecx decl %eax jne .L04 ALIGN_4 .L05: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 # coffset = c #ifndef RT addl %eax, C #endif movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L30 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif leal 16 * SIZE + BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm3 pxor %xmm6, %xmm6 #ifdef LN prefetcht2 -3 * SIZE(CO1) pxor %xmm7, %xmm7 prefetcht2 -3 * SIZE(CO1, LDC) #else prefetcht2 3 * SIZE(CO1) pxor %xmm7, %xmm7 prefetcht2 3 * SIZE(CO1, LDC) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L12: movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 addpd %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd -12 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd -12 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd -10 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 addpd %xmm0, %xmm5 movapd -10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd 0 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 addpd 
%xmm1, %xmm7 movapd -8 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 addpd %xmm1, %xmm4 movapd -6 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movapd -6 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 mulpd %xmm3, %xmm1 movapd -4 * SIZE(AA), %xmm3 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd -4 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 addpd %xmm1, %xmm4 movapd -2 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movapd -2 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 mulpd %xmm3, %xmm1 movapd 8 * SIZE(AA), %xmm3 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd 0 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd 2 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd 4 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd 6 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 addpd %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd 16 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd 8 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 addpd %xmm1, %xmm4 movapd 10 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movapd 10 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 mulpd %xmm3, %xmm1 addpd %xmm2, %xmm6 movapd 12 * SIZE(AA), %xmm3 addpd %xmm1, %xmm7 movapd 12 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 addpd %xmm1, %xmm4 movapd 14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movapd 14 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 mulpd %xmm3, %xmm1 subl $-32 * SIZE, BB movapd 24 * SIZE(AA), %xmm3 subl $-32 * SIZE, AA addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd -16 * SIZE(BB), %xmm1 decl %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH je .L18 ALIGN_4 .L16: movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 addpd %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd -12 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd -12 * SIZE(BB), %xmm1 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd %xmm6, %xmm1 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm1 movapd -16 * SIZE(B), %xmm2 movapd -14 * SIZE(B), %xmm3 movapd -12 * SIZE(B), %xmm5 movapd -10 * SIZE(B), %xmm7 subpd %xmm4, %xmm2 subpd %xmm0, %xmm3 subpd %xmm6, %xmm5 subpd %xmm1, %xmm7 #else movapd -16 * SIZE(AA), %xmm0 movapd -14 * SIZE(AA), %xmm1 movapd -12 * SIZE(AA), %xmm2 movapd -10 * SIZE(AA), %xmm3 subpd %xmm4, %xmm0 subpd %xmm6, %xmm1 subpd %xmm5, %xmm2 subpd %xmm7, %xmm3 #endif #ifdef LN movddup -1 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm7 movddup -2 * SIZE(AA), %xmm0 mulpd %xmm7, %xmm0 subpd %xmm0, %xmm5 movddup -3 * SIZE(AA), %xmm0 mulpd %xmm7, %xmm0 subpd %xmm0, %xmm3 movddup -4 * SIZE(AA), %xmm0 mulpd %xmm7, %xmm0 subpd %xmm0, %xmm2 movddup -6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm5 movddup -7 * SIZE(AA), %xmm0 mulpd %xmm5, %xmm0 subpd %xmm0, %xmm3 movddup -8 * SIZE(AA), %xmm0 mulpd %xmm5, %xmm0 subpd %xmm0, %xmm2 movddup 
-11 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movddup -12 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm2 movddup -16 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef LT movddup -16 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 movddup -15 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm3 movddup -14 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm5 movddup -13 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm7 movddup -11 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movddup -10 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm5 movddup -9 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm7 movddup -6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm5 movddup -5 * SIZE(AA), %xmm0 mulpd %xmm5, %xmm0 subpd %xmm0, %xmm7 movddup -1 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm7 #endif #ifdef RN movddup -16 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 movddup -15 * SIZE(B), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm2 mulpd %xmm1, %xmm5 subpd %xmm5, %xmm3 movddup -13 * SIZE(B), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm3 #endif #ifdef RT movddup -13 * SIZE(B), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm3 movddup -14 * SIZE(B), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm0 mulpd %xmm3, %xmm5 subpd %xmm5, %xmm1 movddup -16 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movsd %xmm3, 1 * SIZE(CO1) movsd %xmm5, 2 * SIZE(CO1) movsd %xmm7, 3 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC) movhpd %xmm3, 1 * SIZE(CO1, LDC) movhpd %xmm5, 2 * SIZE(CO1, LDC) movhpd %xmm7, 3 * SIZE(CO1, LDC) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO1, LDC) movhpd %xmm2, 1 * SIZE(CO1, LDC) movsd %xmm3, 2 * SIZE(CO1, LDC) movhpd %xmm3, 3 * SIZE(CO1, LDC) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movapd %xmm2, -16 * SIZE(B) movapd %xmm3, -14 * SIZE(B) movapd %xmm5, -12 * SIZE(B) movapd %xmm7, -10 * SIZE(B) movddup %xmm2, %xmm0 movddup %xmm3, %xmm1 movddup %xmm5, %xmm4 movddup %xmm7, %xmm6 unpckhpd %xmm2, %xmm2 unpckhpd %xmm3, %xmm3 unpckhpd %xmm5, %xmm5 unpckhpd %xmm7, %xmm7 movapd %xmm0, -16 * SIZE(BB) movapd %xmm2, -14 * SIZE(BB) movapd %xmm1, -12 * SIZE(BB) movapd %xmm3, -10 * SIZE(BB) movapd %xmm4, -8 * SIZE(BB) movapd %xmm5, -6 * SIZE(BB) movapd %xmm6, -4 * SIZE(BB) movapd %xmm7, -2 * SIZE(BB) #else movapd %xmm0, -16 * SIZE(AA) movapd %xmm1, -14 * SIZE(AA) movapd %xmm2, -12 * SIZE(AA) movapd %xmm3, -10 * SIZE(AA) #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L10 ALIGN_2 .L30: movl M, %ebx testl $2, %ebx jle .L50 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal 16 * SIZE + BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movapd -8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, 
%eax subl KK, %eax #endif sarl $3, %eax je .L32 .L31: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BB), %xmm0 addpd %xmm1, %xmm4 movapd -12 * SIZE(BB), %xmm1 addpd %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm1 mulpd -10 * SIZE(BB), %xmm0 addpd %xmm1, %xmm6 movapd 0 * SIZE(BB), %xmm1 addpd %xmm0, %xmm7 movapd -12 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd -6 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd -4 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movapd -10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd -2 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 8 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movapd 0 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm1 mulpd 2 * SIZE(BB), %xmm2 addpd %xmm1, %xmm4 movapd 4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm5 movapd -6 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm1 mulpd 6 * SIZE(BB), %xmm2 addpd %xmm1, %xmm6 movapd 16 * SIZE(BB), %xmm1 addpd %xmm2, %xmm7 movapd -4 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm3 mulpd 10 * SIZE(BB), %xmm2 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm2, %xmm5 movapd -2 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm3 mulpd 14 * SIZE(BB), %xmm2 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm2, %xmm7 movapd 8 * SIZE(AA), %xmm2 subl $-16 * SIZE, AA addl $ 32 * SIZE, BB decl %eax jne .L31 .L32: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L34 .L33: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BB), %xmm0 addpd %xmm1, %xmm4 movapd -12 * SIZE(BB), %xmm1 addpd %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L33 ALIGN_4 .L34: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd -16 * SIZE(B), %xmm2 movapd -14 * SIZE(B), %xmm3 subpd %xmm4, %xmm2 subpd %xmm0, %xmm3 #else movapd -16 * SIZE(AA), %xmm0 movapd -14 * SIZE(AA), %xmm1 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 #endif #ifdef LN movddup -13 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movddup -14 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm2 movddup -16 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef LT movddup -16 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 movddup -15 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm3 movddup -13 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 #endif #ifdef RN movddup -16 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 movddup -15 * SIZE(B), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movddup -13 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 #endif #ifdef RT movddup -13 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 movddup -14 * SIZE(B), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movddup -16 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movsd %xmm3, 1 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO1, LDC) movhpd %xmm3, 1 * SIZE(CO1, LDC) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC) movhpd %xmm1, 1 * SIZE(CO1, LDC) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movapd %xmm2, -16 * SIZE(B) movapd %xmm3, -14 * SIZE(B) movddup %xmm2, %xmm0 movddup %xmm3, %xmm1 unpckhpd %xmm2, %xmm2 unpckhpd %xmm3, %xmm3 movapd %xmm0, -16 * SIZE(BB) movapd %xmm2, -14 * SIZE(BB) movapd %xmm1, -12 * SIZE(BB) movapd %xmm3, -10 * SIZE(BB) #else 
movapd %xmm0, -16 * SIZE(AA) movapd %xmm1, -14 * SIZE(AA) #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L50: movl M, %ebx testl $1, %ebx jle .L99 #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA #endif leal 16 * SIZE + BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movsd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movsd -12 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movsd -8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L52 .L51: mulsd %xmm0, %xmm1 mulsd -14 * SIZE(BB), %xmm0 addsd %xmm1, %xmm4 movsd -12 * SIZE(BB), %xmm1 addsd %xmm0, %xmm5 movsd -15 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm1 mulsd -10 * SIZE(BB), %xmm0 addsd %xmm1, %xmm6 movsd 0 * SIZE(BB), %xmm1 addsd %xmm0, %xmm7 movsd -14 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd -6 * SIZE(BB), %xmm0 addsd %xmm3, %xmm4 movsd -4 * SIZE(BB), %xmm3 addsd %xmm0, %xmm5 movsd -13 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd -2 * SIZE(BB), %xmm0 addsd %xmm3, %xmm6 movsd 8 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movsd -8 * SIZE(AA), %xmm0 mulsd %xmm2, %xmm1 mulsd 2 * SIZE(BB), %xmm2 addsd %xmm1, %xmm4 movsd 4 * SIZE(BB), %xmm1 addsd %xmm2, %xmm5 movsd -11 * SIZE(AA), %xmm2 mulsd %xmm2, %xmm1 mulsd 6 * SIZE(BB), %xmm2 addsd %xmm1, %xmm6 movsd 16 * SIZE(BB), %xmm1 addsd %xmm2, %xmm7 movsd -10 * SIZE(AA), %xmm2 mulsd %xmm2, %xmm3 mulsd 10 * SIZE(BB), %xmm2 addsd %xmm3, %xmm4 movsd 12 * SIZE(BB), %xmm3 addsd %xmm2, %xmm5 movsd -9 * SIZE(AA), %xmm2 mulsd %xmm2, %xmm3 mulsd 14 * SIZE(BB), %xmm2 addsd %xmm3, %xmm6 movsd 24 * SIZE(BB), %xmm3 addsd %xmm2, %xmm7 movsd -4 * SIZE(AA), %xmm2 subl $-8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L51 .L52: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L54 .L53: mulsd %xmm0, %xmm1 mulsd -14 * SIZE(BB), %xmm0 addsd %xmm1, %xmm4 movsd -12 * SIZE(BB), %xmm1 addsd %xmm0, %xmm5 movsd -15 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax BRANCH jg .L53 ALIGN_4 .L54: addsd %xmm6, %xmm4 addsd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(B), %xmm0 movsd -15 * SIZE(B), %xmm1 #else movsd -16 * SIZE(AA), %xmm0 movsd -15 * SIZE(AA), %xmm1 #endif subsd %xmm4, %xmm0 subsd %xmm5, %xmm1 #if defined(LN) || defined(LT) movsd -16 * SIZE(AA), %xmm2 mulsd %xmm2, %xmm0 mulsd %xmm2, %xmm1 #endif #ifdef RN mulsd -16 * SIZE(B), %xmm0 movsd -15 * SIZE(B), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 mulsd -13 * SIZE(B), %xmm1 #endif #ifdef RT mulsd -13 * SIZE(B), %xmm1 movsd -14 * SIZE(B), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 mulsd -16 * SIZE(B), %xmm0 #endif #ifdef LN subl $1 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC) #ifndef LN addl $1 * SIZE, CO1 
#endif #if defined(LN) || defined(LT) movsd %xmm0, -16 * SIZE(B) movsd %xmm1, -15 * SIZE(B) movsd %xmm0, -16 * SIZE(BB) movsd %xmm0, -15 * SIZE(BB) movsd %xmm1, -14 * SIZE(BB) movsd %xmm1, -13 * SIZE(BB) #else movsd %xmm0, -16 * SIZE(AA) movsd %xmm1, -15 * SIZE(AA) #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L99: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 2), B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L01 ALIGN_2 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_2 EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_RT_4x2_sse2.S000066400000000000000000001237321313527062700215700ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if !defined(HAVE_SSE2) || !defined(HAVE_MMX) #error You have to check your configuration. 
#endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA 16 + STACK + ARGS(%esi) #define STACK_A 24 + STACK + ARGS(%esi) #define STACK_B 28 + STACK + ARGS(%esi) #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define AORIG 56(%esp) #define BORIG 60(%esp) #define BUFFER 128(%esp) #define B %edi #define LDC %ebp #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #define AA %edx #define BB %ecx #define PREFETCHSIZE (8 * 4) #define KERNEL1(address) \ movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm0, %xmm2; \ mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL3(address) \ movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm1, %xmm3; \ mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL4(address) \ mulpd %xmm1, %xmm3; \ mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL5(address) \ movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm0, %xmm2; \ mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL6(address) \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, 
%xmm2; \ mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL7(address) \ movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm1, %xmm3; \ mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulpd %xmm1, %xmm3; \ mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp addl $STACK_OFFSET, %esp STACK_TOUCHING movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 movd STACK_A, %mm2 movl STACK_B, B movd STACK_C, %mm3 movl STACK_LDC, LDC movd STACK_OFFT, %mm4 movd %mm1, K movl %eax, N movd %mm0, M movd %mm2, A movd %mm3, C movl %esi, OLD_STACK movd %mm4, OFFSET movd %mm4, KK sall $BASE_SHIFT, LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax testl $1, %eax jle .L100 ALIGN_2 .L101: /* Copying to Sub Buffer */ #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, %ecx #ifdef RT movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG leal (, %eax, SIZE), %eax leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax jle .L103 ALIGN_4 .L102: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 unpcklpd %xmm2, %xmm2 unpcklpd %xmm3, %xmm3 unpcklpd %xmm4, %xmm4 unpcklpd %xmm5, %xmm5 unpcklpd %xmm6, %xmm6 unpcklpd %xmm7, %xmm7 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) movapd %xmm2, 4 * SIZE(%ecx) movapd %xmm3, 6 * SIZE(%ecx) movapd %xmm4, 8 * SIZE(%ecx) movapd %xmm5, 10 * SIZE(%ecx) movapd %xmm6, 12 * SIZE(%ecx) movapd %xmm7, 14 * SIZE(%ecx) prefetcht0 104 * SIZE(B) addl $ 8 * SIZE, B addl $16 * SIZE, %ecx decl %eax BRANCH jne .L102 ALIGN_2 .L103: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH jle .L105 ALIGN_2 .L104: movsd 0 * SIZE(B), %xmm0 unpcklpd %xmm0, %xmm0 movapd %xmm0, 0 * SIZE(%ecx) addl $1 * SIZE, B addl $2 * SIZE, %ecx decl %eax jne .L104 ALIGN_4 .L105: #if defined(LT) || 
defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, %esi # coffset = c #ifndef RT addl LDC, C #endif movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L130 ALIGN_4 .L110: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L112 .L111: mulpd %xmm2, %xmm0 mulpd 2 * SIZE(AA), %xmm2 addpd %xmm0, %xmm4 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 movapd 2 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm0 mulpd 6 * SIZE(AA), %xmm2 addpd %xmm0, %xmm5 movapd 16 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movapd 4 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm1 mulpd 10 * SIZE(AA), %xmm2 addpd %xmm1, %xmm4 movapd 12 * SIZE(AA), %xmm1 addpd %xmm2, %xmm6 movapd 6 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm1 mulpd 14 * SIZE(AA), %xmm2 addpd %xmm1, %xmm5 movapd 24 * SIZE(AA), %xmm1 addpd %xmm2, %xmm7 movapd 16 * SIZE(BB), %xmm2 mulpd %xmm3, %xmm0 mulpd 18 * SIZE(AA), %xmm3 addpd %xmm0, %xmm4 movapd 20 * SIZE(AA), %xmm0 addpd %xmm3, %xmm6 movapd 10 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm0 mulpd 22 * SIZE(AA), %xmm3 addpd %xmm0, %xmm5 movapd 32 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movapd 12 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm1 mulpd 26 * SIZE(AA), %xmm3 addpd %xmm1, %xmm4 movapd 28 * SIZE(AA), %xmm1 addpd %xmm3, %xmm6 movapd 14 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm1 mulpd 30 * SIZE(AA), %xmm3 addpd %xmm1, %xmm5 movapd 40 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movapd 24 * SIZE(BB), %xmm3 addl $32 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L111 .L112: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L114 .L113: mulpd %xmm2, %xmm0 mulpd 2 * SIZE(AA), %xmm2 addpd %xmm0, %xmm4 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 movapd 2 * SIZE(BB), %xmm2 addl $4 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 subl $1, %eax jg .L113 ALIGN_4 .L114: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm0 movapd 2 * SIZE(B), %xmm1 #else movapd 0 * SIZE(AA), %xmm0 movapd 2 * SIZE(AA), %xmm1 #endif subpd %xmm4, %xmm0 subpd %xmm6, %xmm1 #ifdef LN movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movapd %xmm1, %xmm3 unpckhpd %xmm3, %xmm3 movsd 15 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm3 movsd 14 * SIZE(AA), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm1 movsd 13 * SIZE(AA), %xmm6 mulsd %xmm3, %xmm6 subsd %xmm6, %xmm2 movsd 12 * SIZE(AA), %xmm7 mulsd %xmm3, %xmm7 subsd %xmm7, %xmm0 movsd 10 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 movsd 9 * SIZE(AA), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm2 movsd 8 * SIZE(AA), %xmm6 mulsd %xmm1, %xmm6 subsd %xmm6, %xmm0 movsd 5 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd 4 * SIZE(AA), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 unpcklpd %xmm2, %xmm0 unpcklpd %xmm3, 
%xmm1 #endif #ifdef LT movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movapd %xmm1, %xmm3 unpckhpd %xmm3, %xmm3 movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(AA), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 2 * SIZE(AA), %xmm6 mulsd %xmm0, %xmm6 subsd %xmm6, %xmm1 movsd 3 * SIZE(AA), %xmm7 mulsd %xmm0, %xmm7 subsd %xmm7, %xmm3 movsd 5 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd 6 * SIZE(AA), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm1 movsd 7 * SIZE(AA), %xmm6 mulsd %xmm2, %xmm6 subsd %xmm6, %xmm3 movsd 10 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm1 movsd 11 * SIZE(AA), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm3 movsd 15 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm3 unpcklpd %xmm2, %xmm0 unpcklpd %xmm3, %xmm1 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #if defined(LN) || defined(LT) movapd %xmm0, 0 * SIZE(B) movapd %xmm1, 2 * SIZE(B) movsd %xmm0, 0 * SIZE(BB) movsd %xmm0, 1 * SIZE(BB) movhpd %xmm0, 2 * SIZE(BB) movhpd %xmm0, 3 * SIZE(BB) movsd %xmm1, 4 * SIZE(BB) movsd %xmm1, 5 * SIZE(BB) movhpd %xmm1, 6 * SIZE(BB) movhpd %xmm1, 7 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) movapd %xmm1, 2 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, %esi #endif movsd %xmm0, 0 * SIZE(%esi) movhpd %xmm0, 1 * SIZE(%esi) movsd %xmm1, 2 * SIZE(%esi) movhpd %xmm1, 3 * SIZE(%esi) #ifndef LN addl $4 * SIZE, %esi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif BRANCH decl %ebx # i -- jg .L110 ALIGN_2 .L130: movl M, %ebx testl $2, %ebx jle .L150 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #if defined(LN) || defined(RT) movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L132 .L131: mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 mulpd 2 * SIZE(BB), %xmm0 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 4 * SIZE(AA), %xmm0 mulpd 4 * SIZE(BB), %xmm0 addpd %xmm0, %xmm6 movapd 6 * SIZE(AA), %xmm0 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm3 movapd 10 * SIZE(AA), %xmm1 addpd %xmm3, %xmm4 mulpd 10 * SIZE(BB), %xmm1 movapd 24 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 12 * SIZE(AA), %xmm1 mulpd 12 * SIZE(BB), %xmm1 addpd %xmm1, %xmm6 movapd 14 * SIZE(AA), %xmm1 mulpd 14 * SIZE(BB), %xmm1 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $16 * SIZE, BB BRANCH decl %eax jne .L131 .L132: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L134 .L133: movapd 0 * SIZE(AA), %xmm0 mulpd 0 * SIZE(BB), %xmm0 addpd %xmm0, %xmm4 addl $2 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L133 ALIGN_4 .L134: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 addpd %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else 
subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm0 #else movapd 0 * SIZE(AA), %xmm0 #endif subpd %xmm4, %xmm0 #ifdef LN movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movsd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 movsd 2 * SIZE(AA), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 unpcklpd %xmm2, %xmm0 #endif #ifdef LT movapd %xmm0, %xmm2 unpckhpd %xmm2, %xmm2 movsd 0 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm0 movsd 1 * SIZE(AA), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 3 * SIZE(AA), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm2, %xmm0 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm0, 0 * SIZE(B) movsd %xmm0, 0 * SIZE(BB) movsd %xmm0, 1 * SIZE(BB) movhpd %xmm0, 2 * SIZE(BB) movhpd %xmm0, 3 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, %esi #endif movsd %xmm0, 0 * SIZE(%esi) movhpd %xmm0, 1 * SIZE(%esi) #ifndef LN addl $2 * SIZE, %esi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L150: movl M, %ebx testl $1, %ebx jle .L159 #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA #endif leal BUFFER, BB movsd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movsd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movsd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movsd 4 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #if defined(LN) || defined(RT) movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L152 .L151: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 mulsd 2 * SIZE(BB), %xmm0 movsd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 mulsd 4 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 movsd 3 * SIZE(AA), %xmm0 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 movsd 8 * SIZE(AA), %xmm0 mulsd %xmm1, %xmm3 movsd 5 * SIZE(AA), %xmm1 addsd %xmm3, %xmm4 mulsd 10 * SIZE(BB), %xmm1 movsd 24 * SIZE(BB), %xmm3 addsd %xmm1, %xmm4 movsd 6 * SIZE(AA), %xmm1 mulsd 12 * SIZE(BB), %xmm1 addsd %xmm1, %xmm4 movsd 7 * SIZE(AA), %xmm1 mulsd 14 * SIZE(BB), %xmm1 addsd %xmm1, %xmm4 movsd 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $16 * SIZE, BB BRANCH decl %eax jne .L151 .L152: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L154 .L153: movsd 0 * SIZE(AA), %xmm0 mulsd 0 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 addl $1 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L153 ALIGN_4 .L154: addsd %xmm6, %xmm4 addsd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), B leal (BB, %eax, 2), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm0 #else movsd 0 * SIZE(AA), %xmm0 #endif subsd %xmm4, %xmm0 #if defined(LN) || defined(LT) mulsd 0 
* SIZE(AA), %xmm0 #endif #if defined(RN) || defined(RT) mulsd 0 * SIZE(B), %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(B) movsd %xmm0, 0 * SIZE(BB) movsd %xmm0, 1 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, %esi #endif movsd %xmm0, 0 * SIZE(%esi) #ifndef LN addl $1 * SIZE, %esi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA #ifdef LT addl $1 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L159: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 1), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 1), B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_2 .L100: movl N, %eax sarl $1, %eax # j = (n >> 1) movl %eax, J jle .L999 ALIGN_2 .L01: /* Copying to Sub Buffer */ #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, %ecx #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG leal (, %eax, SIZE), %eax leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L03 ALIGN_2 .L02: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 unpcklpd %xmm2, %xmm2 unpcklpd %xmm3, %xmm3 unpcklpd %xmm4, %xmm4 unpcklpd %xmm5, %xmm5 unpcklpd %xmm6, %xmm6 unpcklpd %xmm7, %xmm7 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) movapd %xmm2, 4 * SIZE(%ecx) movapd %xmm3, 6 * SIZE(%ecx) movapd %xmm4, 8 * SIZE(%ecx) movapd %xmm5, 10 * SIZE(%ecx) movapd %xmm6, 12 * SIZE(%ecx) movapd %xmm7, 14 * SIZE(%ecx) prefetcht0 104 * SIZE(B) addl $ 8 * SIZE, B addl $16 * SIZE, %ecx decl %eax jne .L02 ALIGN_2 .L03: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L05 ALIGN_4 .L04: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) addl $2 * SIZE, B addl $4 * SIZE, %ecx decl %eax jne .L04 ALIGN_4 .L05: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, %esi # coffset = c #ifndef RT addl %eax, C #endif movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L30 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 prefetcht2 4 * SIZE(%esi) prefetcht2 4 * SIZE(%esi, LDC) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif #ifdef PENTIUM4 andl $-8, %eax NOBRANCH je .L12 sall $3, %eax .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 
0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) cmpl $64 * 1, %eax NOBRANCH jle .L11 KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) cmpl $64 * 2, %eax NOBRANCH jle .L11 KERNEL1(32 * 2) KERNEL2(32 * 2) KERNEL3(32 * 2) KERNEL4(32 * 2) KERNEL5(32 * 2) KERNEL6(32 * 2) KERNEL7(32 * 2) KERNEL8(32 * 2) cmpl $64 * 3, %eax NOBRANCH jle .L11 KERNEL1(32 * 3) KERNEL2(32 * 3) KERNEL3(32 * 3) KERNEL4(32 * 3) KERNEL5(32 * 3) KERNEL6(32 * 3) KERNEL7(32 * 3) KERNEL8(32 * 3) cmpl $64 * 4, %eax NOBRANCH jle .L11 KERNEL1(32 * 4) KERNEL2(32 * 4) KERNEL3(32 * 4) KERNEL4(32 * 4) KERNEL5(32 * 4) KERNEL6(32 * 4) KERNEL7(32 * 4) KERNEL8(32 * 4) cmpl $64 * 5, %eax NOBRANCH jle .L11 KERNEL1(32 * 5) KERNEL2(32 * 5) KERNEL3(32 * 5) KERNEL4(32 * 5) KERNEL5(32 * 5) KERNEL6(32 * 5) KERNEL7(32 * 5) KERNEL8(32 * 5) cmpl $64 * 6, %eax NOBRANCH jle .L11 KERNEL1(32 * 6) KERNEL2(32 * 6) KERNEL3(32 * 6) KERNEL4(32 * 6) KERNEL5(32 * 6) KERNEL6(32 * 6) KERNEL7(32 * 6) KERNEL8(32 * 6) cmpl $64 * 7, %eax NOBRANCH jle .L11 KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) addl $64 * 4 * SIZE, AA addl $64 * 4 * SIZE, BB subl $64 * 8, %eax BRANCH jg .L1X .L11: leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #else sarl $3, %eax je .L12 .L11: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) addl $32 * SIZE, %ecx addl $32 * SIZE, %edx decl %eax jne .L11 #endif .L12: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 .L13: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 0 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA # aoffset += 8 addl $4 * SIZE, BB # boffset1 += 8 subl $1, %eax jg .L13 ALIGN_4 .L14: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd %xmm6, %xmm1 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm1 movapd 0 * SIZE(B), %xmm2 movapd 2 * SIZE(B), %xmm3 movapd 4 * SIZE(B), %xmm5 movapd 6 * SIZE(B), %xmm7 subpd %xmm4, %xmm2 subpd %xmm0, %xmm3 subpd %xmm6, %xmm5 subpd %xmm1, %xmm7 #else movapd 0 * SIZE(AA), %xmm0 movapd 2 * SIZE(AA), %xmm1 movapd 4 * SIZE(AA), %xmm2 movapd 6 * SIZE(AA), %xmm3 subpd %xmm4, %xmm0 subpd %xmm6, %xmm1 subpd %xmm5, %xmm2 subpd %xmm7, %xmm3 #endif #ifdef LN movsd 15 * SIZE(AA), %xmm0 movhpd 15 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm7 movsd 14 * SIZE(AA), %xmm0 movhpd 14 * SIZE(AA), %xmm0 mulpd %xmm7, %xmm0 subpd %xmm0, %xmm5 movsd 13 * SIZE(AA), %xmm0 movhpd 13 * SIZE(AA), %xmm0 mulpd %xmm7, %xmm0 subpd %xmm0, %xmm3 movsd 12 * SIZE(AA), %xmm0 movhpd 12 * SIZE(AA), %xmm0 mulpd %xmm7, %xmm0 subpd %xmm0, %xmm2 movsd 10 * SIZE(AA), %xmm0 movhpd 10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm5 movsd 9 * SIZE(AA), %xmm0 movhpd 9 * SIZE(AA), %xmm0 mulpd %xmm5, %xmm0 subpd %xmm0, %xmm3 movsd 8 * SIZE(AA), %xmm0 movhpd 8 * SIZE(AA), %xmm0 mulpd %xmm5, %xmm0 subpd %xmm0, %xmm2 movsd 5 * SIZE(AA), %xmm0 movhpd 5 * 
SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 movhpd 4 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm2 movsd 0 * SIZE(AA), %xmm0 movhpd 0 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm0 movhpd 0 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 movhpd 1 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm3 movsd 2 * SIZE(AA), %xmm0 movhpd 2 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm5 movsd 3 * SIZE(AA), %xmm0 movhpd 3 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm7 movsd 5 * SIZE(AA), %xmm0 movhpd 5 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movsd 6 * SIZE(AA), %xmm0 movhpd 6 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm5 movsd 7 * SIZE(AA), %xmm0 movhpd 7 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm7 movsd 10 * SIZE(AA), %xmm0 movhpd 10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm5 movsd 11 * SIZE(AA), %xmm0 movhpd 11 * SIZE(AA), %xmm0 mulpd %xmm5, %xmm0 subpd %xmm0, %xmm7 movsd 15 * SIZE(AA), %xmm0 movhpd 15 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm7 #endif #ifdef RN movsd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 movsd 1 * SIZE(B), %xmm4 movhpd 1 * SIZE(B), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm2 movsd 1 * SIZE(B), %xmm4 movhpd 1 * SIZE(B), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm3 movsd 3 * SIZE(B), %xmm4 movhpd 3 * SIZE(B), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm3 #endif #ifdef RT movsd 3 * SIZE(B), %xmm4 movhpd 3 * SIZE(B), %xmm4 mulpd %xmm4, %xmm2 mulpd %xmm4, %xmm3 movsd 2 * SIZE(B), %xmm4 movhpd 2 * SIZE(B), %xmm4 mulpd %xmm2, %xmm4 subpd %xmm4, %xmm0 movsd 2 * SIZE(B), %xmm4 movhpd 2 * SIZE(B), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm1 movsd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 mulpd %xmm4, %xmm1 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movapd %xmm5, 4 * SIZE(B) movapd %xmm7, 6 * SIZE(B) movsd %xmm2, 0 * SIZE(BB) movsd %xmm2, 1 * SIZE(BB) movhpd %xmm2, 2 * SIZE(BB) movhpd %xmm2, 3 * SIZE(BB) movsd %xmm3, 4 * SIZE(BB) movsd %xmm3, 5 * SIZE(BB) movhpd %xmm3, 6 * SIZE(BB) movhpd %xmm3, 7 * SIZE(BB) movsd %xmm5, 8 * SIZE(BB) movsd %xmm5, 9 * SIZE(BB) movhpd %xmm5, 10 * SIZE(BB) movhpd %xmm5, 11 * SIZE(BB) movsd %xmm7, 12 * SIZE(BB) movsd %xmm7, 13 * SIZE(BB) movhpd %xmm7, 14 * SIZE(BB) movhpd %xmm7, 15 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) movapd %xmm1, 2 * SIZE(AA) movapd %xmm2, 4 * SIZE(AA) movapd %xmm3, 6 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, %esi #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(%esi) movsd %xmm3, 1 * SIZE(%esi) movsd %xmm5, 2 * SIZE(%esi) movsd %xmm7, 3 * SIZE(%esi) movhpd %xmm2, 0 * SIZE(%esi, LDC) movhpd %xmm3, 1 * SIZE(%esi, LDC) movhpd %xmm5, 2 * SIZE(%esi, LDC) movhpd %xmm7, 3 * SIZE(%esi, LDC) #else movsd %xmm0, 0 * SIZE(%esi) movhpd %xmm0, 1 * SIZE(%esi) movsd %xmm1, 2 * SIZE(%esi) movhpd %xmm1, 3 * SIZE(%esi) movsd %xmm2, 0 * SIZE(%esi, LDC) movhpd %xmm2, 1 * SIZE(%esi, LDC) movsd %xmm3, 2 * SIZE(%esi, LDC) movhpd %xmm3, 3 * SIZE(%esi, LDC) #endif #ifndef LN addl $4 * SIZE, %esi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L10 ALIGN_2 .L30: movl M, %ebx testl $2, %ebx jle .L50 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl 
%eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L32 .L31: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 10 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd 18 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 20 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movapd 10 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 22 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 32 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movapd 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 26 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 28 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 14 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 30 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 40 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $32 * SIZE, BB BRANCH decl %eax jne .L31 .L32: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L34 .L33: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA # aoffset += 8 addl $4 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L33 ALIGN_4 .L34: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movapd %xmm4, %xmm0 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm0 movapd 0 * SIZE(B), %xmm2 movapd 2 * SIZE(B), %xmm3 subpd %xmm4, %xmm2 subpd %xmm0, %xmm3 #else movapd 0 * SIZE(AA), %xmm0 movapd 2 * SIZE(AA), %xmm1 subpd %xmm4, %xmm0 subpd %xmm5, %xmm1 #endif #ifdef LN movsd 3 * SIZE(AA), %xmm0 movhpd 3 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movsd 2 * SIZE(AA), %xmm0 movhpd 2 * SIZE(AA), %xmm0 mulpd %xmm3, %xmm0 subpd %xmm0, %xmm2 movsd 0 * SIZE(AA), %xmm0 movhpd 0 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm0 movhpd 0 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 movhpd 1 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm0 subpd %xmm0, %xmm3 movsd 3 * SIZE(AA), %xmm0 movhpd 3 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 #endif #ifdef RN movsd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 movsd 1 * SIZE(B), %xmm4 movhpd 1 * SIZE(B), %xmm4 mulpd %xmm0, %xmm4 subpd %xmm4, %xmm1 movsd 3 * SIZE(B), %xmm4 movhpd 3 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 #endif #ifdef RT movsd 3 * SIZE(B), %xmm4 movhpd 3 * SIZE(B), %xmm4 mulpd %xmm4, %xmm1 movsd 2 * SIZE(B), %xmm4 movhpd 2 * SIZE(B), 
%xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm0 movsd 0 * SIZE(B), %xmm4 movhpd 0 * SIZE(B), %xmm4 mulpd %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movsd %xmm2, 0 * SIZE(BB) movsd %xmm2, 1 * SIZE(BB) movhpd %xmm2, 2 * SIZE(BB) movhpd %xmm2, 3 * SIZE(BB) movsd %xmm3, 4 * SIZE(BB) movsd %xmm3, 5 * SIZE(BB) movhpd %xmm3, 6 * SIZE(BB) movhpd %xmm3, 7 * SIZE(BB) #else movapd %xmm0, 0 * SIZE(AA) movapd %xmm1, 2 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, %esi #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(%esi) movsd %xmm3, 1 * SIZE(%esi) movhpd %xmm2, 0 * SIZE(%esi, LDC) movhpd %xmm3, 1 * SIZE(%esi, LDC) #else movsd %xmm0, 0 * SIZE(%esi) movhpd %xmm0, 1 * SIZE(%esi) movsd %xmm1, 0 * SIZE(%esi, LDC) movhpd %xmm1, 1 * SIZE(%esi, LDC) #endif #ifndef LN addl $2 * SIZE, %esi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L50: movl M, %ebx testl $1, %ebx jle .L99 #ifdef LN movl K, %eax sall $0 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA #endif leal BUFFER, %ecx #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movsd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movsd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movsd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movsd 4 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L52 .L51: mulsd %xmm0, %xmm2 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movsd 1 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm2 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movsd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movsd 2 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd 10 * SIZE(BB), %xmm0 addsd %xmm3, %xmm4 movsd 12 * SIZE(BB), %xmm3 addsd %xmm0, %xmm5 movsd 3 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd 14 * SIZE(BB), %xmm0 addsd %xmm3, %xmm4 movsd 24 * SIZE(BB), %xmm3 addsd %xmm0, %xmm5 movsd 8 * SIZE(AA), %xmm0 mulsd %xmm1, %xmm2 mulsd 18 * SIZE(BB), %xmm1 addsd %xmm2, %xmm4 movsd 20 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 movsd 5 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm2 mulsd 22 * SIZE(BB), %xmm1 addsd %xmm2, %xmm4 movsd 32 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 movsd 6 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 mulsd 26 * SIZE(BB), %xmm1 addsd %xmm3, %xmm4 movsd 28 * SIZE(BB), %xmm3 addsd %xmm1, %xmm5 movsd 7 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 mulsd 30 * SIZE(BB), %xmm1 addsd %xmm3, %xmm4 movsd 40 * SIZE(BB), %xmm3 addsd %xmm1, %xmm5 movsd 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB BRANCH decl %eax jne .L51 .L52: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L54 .L53: mulsd %xmm0, %xmm2 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movsd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA # aoffset += 8 addl $4 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L53 ALIGN_4 .L54: addsd %xmm6, %xmm4 addsd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (, %eax, SIZE), 
%eax leal (AA, %eax, 1), AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 #else movsd 0 * SIZE(AA), %xmm0 movsd 1 * SIZE(AA), %xmm1 #endif subsd %xmm4, %xmm0 subsd %xmm5, %xmm1 #if defined(LN) || defined(LT) movsd 0 * SIZE(AA), %xmm2 mulsd %xmm2, %xmm0 mulsd %xmm2, %xmm1 #endif #ifdef RN mulsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm1 mulsd 3 * SIZE(B), %xmm1 #endif #ifdef RT mulsd 3 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm4 mulsd %xmm1, %xmm4 subsd %xmm4, %xmm0 mulsd 0 * SIZE(B), %xmm0 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(B) movsd %xmm1, 1 * SIZE(B) movsd %xmm0, 0 * SIZE(BB) movsd %xmm0, 1 * SIZE(BB) movsd %xmm1, 2 * SIZE(BB) movsd %xmm1, 3 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) movsd %xmm1, 1 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, %esi #endif movsd %xmm0, 0 * SIZE(%esi) movsd %xmm1, 0 * SIZE(%esi, LDC) #ifndef LN addl $1 * SIZE, %esi #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $0 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L99: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 2), B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L01 ALIGN_2 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_2 EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_RT_4x4_penryn.S000066400000000000000000001540601313527062700222270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 20 + STACK + ARGS(%esp) #define ARG_B 24 + STACK + ARGS(%esp) #define C 28 + STACK + ARGS(%esp) #define ARG_LDC 32 + STACK + ARGS(%esp) #define OFFSET 36 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif #ifdef ATOM #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 8 + 4) #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHSIZE (16 * 2) #endif #define B %edi #define AA %edx #define BB %ecx #define LDC %ebp #define CO1 %esi PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC movl OFFSET, %eax #ifdef RN negl %eax #endif movl %eax, KK leal (, LDC, SIZE), LDC subl $-32 * SIZE, A subl $-32 * SIZE, B #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif testl $1, N je .L40 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L100 ALIGN_4 .L91: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movsd -32 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 #ifdef LN prefetcht0 -4 * SIZE(CO1) #else prefetcht0 3 * SIZE(CO1) #endif pxor %xmm5, %xmm5 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L95 ALIGN_4 .L92: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x55, %xmm1, %xmm2 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x55, %xmm1, %xmm2 movsd -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x55, %xmm1, %xmm2 movsd -26 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 
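/*
 * Note on the surrounding .L92 loop (the single remaining column when N is
 * odd, processed in 4-row blocks of A): each unrolled step broadcasts one b
 * value from %xmm1 with pshufd ($0x00, then $0x55), multiplies it by four
 * packed a values loaded from AA, and folds the product into the %xmm4/%xmm5
 * accumulators, which are summed after the loop.  One trip appears to cover
 * 8 values of k, so AA advances by 32 * SIZE and BB by 8 * SIZE at the bottom
 * of the loop (the subl of a negative immediate is an addition); in effect
 * each step does c(0:3) += a(0:3, k) * b(k).
 */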
movaps -8 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x55, %xmm1, %xmm2 movsd -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 subl $-32 * SIZE, AA subl $ -8 * SIZE, BB subl $1, %eax jne .L92 ALIGN_4 .L95: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 movss -31 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L96 ALIGN_4 .L98: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (B, %eax, 1), BB #endif addps %xmm2, %xmm4 addps %xmm5, %xmm4 #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm0 movaps %xmm5, %xmm1 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm1 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movss -32 * SIZE(BB), %xmm1 movss -31 * SIZE(BB), %xmm3 movss -30 * SIZE(BB), %xmm5 movss -29 * SIZE(BB), %xmm7 subss %xmm4, %xmm1 subss %xmm6, %xmm3 subss %xmm0, %xmm5 subss %xmm2, %xmm7 #else movaps -32 * SIZE(AA), %xmm0 subps %xmm4, %xmm0 #endif #ifdef LN movaps -20 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm7 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm7, %xmm6 subss %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulss %xmm7, %xmm6 subss %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulss %xmm7, %xmm6 subss %xmm6, %xmm1 movaps -24 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulss %xmm5, %xmm6 subss %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulss %xmm5, %xmm6 subss %xmm6, %xmm1 movaps -28 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulss %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm1 movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm7 movaps -28 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulss %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm7 movaps -24 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulss %xmm5, %xmm6 subss %xmm6, %xmm7 movaps -20 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm7 #endif #if defined(RN) || defined(RT) movss -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm1, -32 * SIZE(BB) movss %xmm3, -31 * SIZE(BB) movss %xmm5, -30 * SIZE(BB) movss %xmm7, -29 * SIZE(BB) #else movaps %xmm0, -32 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm1 unpcklps %xmm7, %xmm3 unpcklps %xmm3, %xmm1 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 1), BB 
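/*
 * End-of-block bookkeeping for this 4-row tile: under LT/RN the AA and BB
 * pointers are stepped past the remaining K - KK part of the panel (4 a
 * elements and 1 b element per k), while the LN/LT cases below adjust KK by
 * 4 and the RT case moves AORIG to the next packed 4-row panel of A, before
 * %ebx is decremented and the .L91 row loop repeats.
 */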
#endif #ifdef LN subl $4, KK #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L91 ALIGN_4 .L100: testl $2, M je .L110 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm3, %xmm3 movsd -32 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L105 ALIGN_4 .L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x55, %xmm1, %xmm2 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movsd -26 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x55, %xmm1, %xmm2 movsd -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movsd -22 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x55, %xmm1, %xmm2 movsd -26 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movsd -18 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x55, %xmm1, %xmm2 movsd -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -16 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 subl $-16 * SIZE, AA subl $ -8 * SIZE, BB subl $1, %eax jne .L102 ALIGN_4 .L105: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L108 ALIGN_4 .L106: pshufd $0x00, %xmm1, %xmm2 movss -31 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 addl $2 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L106 ALIGN_4 .L108: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB #endif addps %xmm5, %xmm4 #if defined(LN) || defined(LT) pshufd $1, %xmm4, %xmm6 movss -32 * SIZE(BB), %xmm1 movss -31 * SIZE(BB), %xmm3 subss %xmm4, %xmm1 subss %xmm6, %xmm3 #else movsd -32 * SIZE(AA), %xmm0 subps %xmm4, %xmm0 #endif #ifdef LN movsd -32 * SIZE(AA), %xmm4 movhps -30 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm1 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm3 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm3 #endif #if defined(RN) || defined(RT) movss -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm1, -32 * SIZE(BB) movss %xmm3, -31 * SIZE(BB) #else movlps %xmm0, -32 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(CO1) movss %xmm3, 1 * SIZE(CO1) #else movlps %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 1), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + 
BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L110: testl $1, M je .L119 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movsd -32 * SIZE(BB), %xmm1 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L115 ALIGN_4 .L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulps %xmm0, %xmm1 movsd -30 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movsd -28 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movsd -26 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -26 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movsd -24 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -24 * SIZE(BB), %xmm1 subl $-8 * SIZE, AA subl $-8 * SIZE, BB subl $1, %eax jne .L112 ALIGN_4 .L115: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulss %xmm0, %xmm1 movss -31 * SIZE(AA), %xmm0 addss %xmm1, %xmm4 movss -31 * SIZE(BB), %xmm1 addl $1 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L116 ALIGN_4 .L118: #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA leal (B, %eax, SIZE), BB #endif haddps %xmm4, %xmm4 #if defined(LN) || defined(LT) movss -32 * SIZE(BB), %xmm1 subss %xmm4, %xmm1 #else movss -32 * SIZE(AA), %xmm0 subss %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) mulss -32 * SIZE(AA), %xmm1 #endif #if defined(RN) || defined(RT) mulss -32 * SIZE(BB), %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm1, -32 * SIZE(BB) #else movss %xmm0, -32 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(CO1) #else movss %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA, %eax, SIZE), AA leal (BB, %eax, SIZE), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L119: #ifdef LN movl K, %eax leal (B, %eax, SIZE), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L40: testl $2, N je .L80 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L60 ALIGN_4 .L51: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 #ifdef LN pxor %xmm4, %xmm4 prefetcht0 -4 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 -4 * SIZE(CO1, LDC) #else pxor %xmm4, %xmm4 prefetcht0 3 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 3 * SIZE(CO1, LDC) #endif pxor %xmm6, %xmm6 pxor 
%xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L55 ALIGN_4 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0xff, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0xff, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -16 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -12 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0xff, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -8 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -4 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0xff, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps 0 * SIZE(AA), %xmm0 subl $-32 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (B, %eax, 2), BB #endif addps %xmm6, %xmm4 addps %xmm7, %xmm5 addps %xmm2, %xmm4 addps %xmm3, %xmm5 #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm0 movaps %xmm5, %xmm1 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm1 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movsd -32 * SIZE(BB), %xmm1 movsd -30 * SIZE(BB), %xmm3 movsd -28 * SIZE(BB), %xmm5 movsd -26 * SIZE(BB), %xmm7 subps %xmm4, %xmm1 subps %xmm6, %xmm3 subps %xmm0, %xmm5 subps %xmm2, %xmm7 #else movaps -32 * SIZE(AA), %xmm0 movaps -28 * SIZE(AA), %xmm1 subps %xmm4, %xmm0 subps %xmm5, %xmm1 #endif #ifdef LN movaps -20 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm1 movaps -24 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm1 movaps -28 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 movaps 
-32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm7 movaps -28 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm7 movaps -24 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm7 movaps -20 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 #endif #ifdef RT movaps -32 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm1, -32 * SIZE(BB) movlps %xmm3, -30 * SIZE(BB) movlps %xmm5, -28 * SIZE(BB) movlps %xmm7, -26 * SIZE(BB) #else movaps %xmm0, -32 * SIZE(AA) movaps %xmm1, -28 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm1 unpcklps %xmm7, %xmm3 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC, 1) movhps %xmm2, 2 * SIZE(CO1, LDC, 1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 2 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $4, KK #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L51 ALIGN_4 .L60: testl $2, M je .L70 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm3, %xmm3 movaps -32 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L65 ALIGN_4 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm2 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 pshufd $0xee, %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 pshufd $0x44, %xmm0, %xmm2 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 pshufd $0xee, %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 pshufd $0x44, %xmm0, %xmm2 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 pshufd $0xee, %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 pshufd $0x44, %xmm0, %xmm2 addps %xmm3, %xmm4 
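/*
 * The enclosing .L62 loop accumulates a 2x2 block as a sum of outer
 * products: pshufd $0x44/$0xee duplicates the two a values of one k step,
 * pshufd $0x50/$0xfa pairs them with the matching two b values, and
 * mulps/addps folds c(i,j) += a(i,k) * b(k,j), i,j in {0,1}, into the
 * %xmm4/%xmm5 accumulators.  One trip appears to cover 8 values of k,
 * advancing both AA and BB by 16 * SIZE.
 */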
pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 pshufd $0xee, %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 subl $-16 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: pshufd $0x44, %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L66 ALIGN_4 .L68: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB #endif addps %xmm3, %xmm4 addps %xmm5, %xmm4 movhlps %xmm4, %xmm5 #if defined(LN) || defined(LT) unpcklps %xmm6, %xmm4 unpcklps %xmm7, %xmm5 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movsd -32 * SIZE(BB), %xmm1 movsd -30 * SIZE(BB), %xmm3 subps %xmm4, %xmm1 subps %xmm6, %xmm3 #else movsd -32 * SIZE(AA), %xmm0 movsd -30 * SIZE(AA), %xmm1 subps %xmm4, %xmm0 subps %xmm5, %xmm1 #endif #ifdef LN movaps -32 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 #endif #ifdef RT movaps -32 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm1, -32 * SIZE(BB) movlps %xmm3, -30 * SIZE(BB) #else movlps %xmm0, -32 * SIZE(AA) movlps %xmm1, -30 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm3, %xmm1 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 0 * SIZE(CO1, LDC) #else movlps %xmm0, 0 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L70: testl $1, M je .L79 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movsd -32 * SIZE(BB), %xmm1 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -30 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm5 movsd -28 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, 
%xmm4 movsd -26 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -28 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm5 movsd -24 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -22 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -26 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm5 movsd -20 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -18 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -24 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm5 movsd -16 * SIZE(BB), %xmm1 subl $ -8 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: pshufd $0x00, %xmm0, %xmm2 movss -31 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -30 * SIZE(BB), %xmm1 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), BB #endif addps %xmm5, %xmm4 pshufd $0x55, %xmm4, %xmm5 pshufd $0x00, %xmm4, %xmm4 #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm4 movsd -32 * SIZE(BB), %xmm1 subps %xmm4, %xmm1 #else movss -32 * SIZE(AA), %xmm0 movss -31 * SIZE(AA), %xmm1 subss %xmm4, %xmm0 subss %xmm5, %xmm1 #endif #if defined(LN) || defined(LT) movss -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm1 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm1 #endif #ifdef RT movaps -32 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm0 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm1, -32 * SIZE(BB) #else movss %xmm0, -32 * SIZE(AA) movss %xmm1, -31 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) pshufd $1, %xmm1, %xmm3 movss %xmm1, 0 * SIZE(CO1) movss %xmm3, 0 * SIZE(CO1, LDC) #else movss %xmm0, 0 * SIZE(CO1) movss %xmm1, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L79: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif ALIGN_4 .L80: movl N, %eax sarl $2, %eax movl %eax, J jle .L999 .L10: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 4), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif movl B, BB #if defined(LN) || 
defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif leal (CO1, LDC, 2), %eax movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 #ifdef LN pxor %xmm4, %xmm4 prefetcht0 -4 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 -4 * SIZE(CO1, LDC) pxor %xmm6, %xmm6 prefetcht0 -4 * SIZE(%eax) pxor %xmm7, %xmm7 prefetcht0 -4 * SIZE(%eax, LDC) #else pxor %xmm4, %xmm4 prefetcht0 3 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 3 * SIZE(CO1, LDC) pxor %xmm6, %xmm6 prefetcht0 3 * SIZE(%eax) pxor %xmm7, %xmm7 prefetcht0 3 * SIZE(%eax, LDC) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -8 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 subl $-32 * SIZE, BB pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 subl $-32 * SIZE, AA pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -32 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -32 * SIZE(AA), %xmm0 subl $1, %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps 
-28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $4, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (B, %eax, 4), BB #endif addps %xmm3, %xmm6 addps %xmm2, %xmm7 #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm7, %xmm0 unpckhps %xmm7, %xmm4 movaps %xmm6, %xmm2 unpcklps %xmm5, %xmm2 unpckhps %xmm5, %xmm6 movaps %xmm0, %xmm1 movlhps %xmm2, %xmm0 movhlps %xmm2, %xmm1 movaps %xmm6, %xmm7 movlhps %xmm4, %xmm6 movhlps %xmm4, %xmm7 pshufd $0x39, %xmm1, %xmm2 pshufd $0x39, %xmm7, %xmm4 movaps -32 * SIZE(BB), %xmm1 movaps -28 * SIZE(BB), %xmm3 movaps -24 * SIZE(BB), %xmm5 movaps -20 * SIZE(BB), %xmm7 subps %xmm0, %xmm1 subps %xmm2, %xmm3 subps %xmm6, %xmm5 subps %xmm4, %xmm7 #else pshufd $0x39, %xmm5, %xmm2 pshufd $0x4e, %xmm6, %xmm0 pshufd $0x93, %xmm7, %xmm7 movaps %xmm4, %xmm6 unpcklps %xmm0, %xmm4 unpckhps %xmm0, %xmm6 movaps %xmm2, %xmm1 unpcklps %xmm7, %xmm2 unpckhps %xmm7, %xmm1 movaps %xmm4, %xmm5 unpcklps %xmm2, %xmm4 unpckhps %xmm2, %xmm5 movaps %xmm6, %xmm7 unpcklps %xmm1, %xmm6 unpckhps %xmm1, %xmm7 pshufd $0x93, %xmm5, %xmm5 pshufd $0x4e, %xmm6, %xmm6 pshufd $0x39, %xmm7, %xmm7 movaps -32 * SIZE(AA), %xmm0 movaps -28 * SIZE(AA), %xmm1 movaps -24 * SIZE(AA), %xmm2 movaps -20 * SIZE(AA), %xmm3 subps %xmm4, %xmm0 subps %xmm5, %xmm1 subps %xmm6, %xmm2 subps %xmm7, %xmm3 #endif #ifdef LN movaps -20 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm1 movaps -24 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm1 movaps -28 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm7 movaps -28 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm7 movaps -24 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm7 movaps -20 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm3 movaps -28 * SIZE(BB), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm3 movaps -24 * SIZE(BB), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, 
%xmm3 movaps -20 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 #endif #ifdef RT movaps -20 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm0 movaps -24 * SIZE(BB), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm0 movaps -28 * SIZE(BB), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movaps %xmm1, -32 * SIZE(BB) movaps %xmm3, -28 * SIZE(BB) movaps %xmm5, -24 * SIZE(BB) movaps %xmm7, -20 * SIZE(BB) #else movaps %xmm0, -32 * SIZE(AA) movaps %xmm1, -28 * SIZE(AA) movaps %xmm2, -24 * SIZE(AA) movaps %xmm3, -20 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm0 movaps %xmm3, %xmm4 unpcklps %xmm7, %xmm3 unpckhps %xmm7, %xmm4 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movaps %xmm0, %xmm6 unpcklps %xmm4, %xmm0 unpckhps %xmm4, %xmm6 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC, 1) movhps %xmm2, 2 * SIZE(CO1, LDC, 1) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movhps %xmm0, 2 * SIZE(CO1, LDC, 2) movlps %xmm6, 0 * SIZE(CO1, %eax, 1) movhps %xmm6, 2 * SIZE(CO1, %eax, 1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 2 * SIZE(CO1, LDC, 1) movlps %xmm2, 0 * SIZE(CO1, LDC, 2) movhps %xmm2, 2 * SIZE(CO1, LDC, 2) movlps %xmm3, 0 * SIZE(CO1, %eax, 1) movhps %xmm3, 2 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #endif #ifdef LN subl $4, KK #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L11 ALIGN_4 .L20: testl $2, M je .L30 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 movaps -32 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movaps -32 * SIZE(BB), %xmm1 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm2 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 pshufd $0xee, %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm7 pshufd $0x44, %xmm0, %xmm2 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 pshufd $0xee, %xmm0, %xmm2 movaps -24 
* SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm7 pshufd $0x44, %xmm0, %xmm2 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -12 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 pshufd $0xee, %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -8 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm7 pshufd $0x44, %xmm0, %xmm2 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -4 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 pshufd $0xee, %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps 0 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm7 subl $-16 * SIZE, AA subl $-32 * SIZE, BB subl $1, %eax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: pshufd $0x44, %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $4, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 4), BB #endif addps %xmm5, %xmm4 addps %xmm7, %xmm6 movhlps %xmm4, %xmm5 movhlps %xmm6, %xmm7 #if defined(LN) || defined(LT) unpcklps %xmm6, %xmm4 unpcklps %xmm7, %xmm5 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps -32 * SIZE(BB), %xmm1 movaps -28 * SIZE(BB), %xmm3 subps %xmm4, %xmm1 subps %xmm6, %xmm3 #else movsd -32 * SIZE(AA), %xmm0 movsd -30 * SIZE(AA), %xmm1 movsd -28 * SIZE(AA), %xmm2 movsd -26 * SIZE(AA), %xmm3 subps %xmm4, %xmm0 subps %xmm5, %xmm1 subps %xmm6, %xmm2 subps %xmm7, %xmm3 #endif #ifdef LN movaps -32 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm3 movaps -28 * SIZE(BB), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm3 movaps -24 * SIZE(BB), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm3 movaps -20 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 #endif #ifdef RT movaps -20 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm3, 
%xmm7 subps %xmm7, %xmm0 movaps -24 * SIZE(BB), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm0 movaps -28 * SIZE(BB), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movaps %xmm1, -32 * SIZE(BB) movaps %xmm3, -28 * SIZE(BB) #else movlps %xmm0, -32 * SIZE(AA) movlps %xmm1, -30 * SIZE(AA) movlps %xmm2, -28 * SIZE(AA) movlps %xmm3, -26 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm0 movaps %xmm3, %xmm4 unpcklps %xmm7, %xmm3 unpckhps %xmm7, %xmm4 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movaps %xmm0, %xmm6 unpcklps %xmm4, %xmm0 unpckhps %xmm4, %xmm6 movlps %xmm1, 0 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC, 1) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movlps %xmm6, 0 * SIZE(CO1, %eax, 1) #else movlps %xmm0, 0 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC, 1) movlps %xmm2, 0 * SIZE(CO1, LDC, 2) movlps %xmm3, 0 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L30: testl $1, M je .L39 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB #endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movaps -32 * SIZE(BB), %xmm1 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -24 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -20 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -28 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -16 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -26 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -8 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -4 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -24 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps 0 * SIZE(BB), %xmm1 subl $ -8 * SIZE, AA subl $-32 * SIZE, BB subl $1, %eax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: pshufd $0x00, %xmm0, %xmm2 movss -31 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl 
$4, %eax #endif movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 4), BB #endif #if defined(LN) || defined(LT) movaps -32 * SIZE(BB), %xmm1 subps %xmm4, %xmm1 #else movsd -32 * SIZE(AA), %xmm0 movhps -30 * SIZE(AA), %xmm0 subps %xmm4, %xmm0 pshufd $0xff, %xmm0, %xmm3 pshufd $0xaa, %xmm0, %xmm2 pshufd $0x55, %xmm0, %xmm1 pshufd $0x00, %xmm0, %xmm0 #endif #if defined(LN) || defined(LT) movss -32 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm3 movaps -28 * SIZE(BB), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulss %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm3 movaps -24 * SIZE(BB), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulss %xmm2, %xmm7 subss %xmm7, %xmm3 movaps -20 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm3 #endif #ifdef RT movaps -20 * SIZE(BB), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm3 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm3, %xmm7 subss %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulss %xmm3, %xmm7 subss %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulss %xmm3, %xmm7 subss %xmm7, %xmm0 movaps -24 * SIZE(BB), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulss %xmm2, %xmm7 subss %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulss %xmm2, %xmm7 subss %xmm7, %xmm0 movaps -28 * SIZE(BB), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulss %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm0 movaps -32 * SIZE(BB), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movaps %xmm1, -32 * SIZE(BB) #else movss %xmm0, -32 * SIZE(AA) movss %xmm1, -31 * SIZE(AA) movss %xmm2, -30 * SIZE(AA) movss %xmm3, -29 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm0 movaps %xmm3, %xmm4 unpcklps %xmm7, %xmm3 unpckhps %xmm7, %xmm4 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movaps %xmm0, %xmm6 unpcklps %xmm4, %xmm0 unpckhps %xmm4, %xmm6 movss %xmm1, 0 * SIZE(CO1) movss %xmm2, 0 * SIZE(CO1, LDC, 1) movss %xmm0, 0 * SIZE(CO1, LDC, 2) movss %xmm6, 0 * SIZE(CO1, %eax, 1) #else movss %xmm0, 0 * SIZE(CO1) movss %xmm1, 0 * SIZE(CO1, LDC, 1) movss %xmm2, 0 * SIZE(CO1, LDC, 2) movss %xmm3, 0 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L39: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 4), B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $4, KK #endif #ifdef RT subl $4, KK #endif decl J # j -- jg .L10 ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_RT_4x4_sse.S000066400000000000000000002061261313527062700215070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 
2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define OLD_M 4 + STACK(%esi) #define OLD_N 8 + STACK(%esi) #define OLD_K 12 + STACK(%esi) #define OLD_A 20 + STACK(%esi) #define OLD_B 24 + STACK(%esi) #define OLD_C 28 + STACK(%esi) #define OLD_LDC 32 + STACK(%esi) #define STACK_OFFT 36 + STACK(%esi) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define AORIG 56(%esp) #define BORIG 60(%esp) #define BUFFER 128(%esp) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) #endif #if defined(PENTIUM4) || defined(PENTIUMM) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE 96 #endif #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE 96 #endif #define B %edi #define AA %edx #define BB %ecx #define LDC %ebp #define CO1 %esi #if defined(OPTERON) || !defined(HAVE_SSE2) #define movsd movlps #endif #ifdef HAVE_SSE2 #define xorps pxor #endif #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ addps %xmm2, %xmm5; \ movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 
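/* KERNEL1..KERNEL8 expand to eight consecutive k-iterations of the 4x4
   micro-kernel driven by the .L12 loop: %xmm0/%xmm1 carry a 4-element
   column slice of A, the pre-expanded copies of B in BB are multiplied in
   through %xmm2/%xmm3, and the four result columns accumulate in
   %xmm4..%xmm7.  KERNEL1 also issues a software prefetch of upcoming A data. */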
#define KERNEL2(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi subl $128 + LOCAL_BUFFER_SIZE, %esp andl $-1024, %esp STACK_TOUCHING movl OLD_M, %ebx movl OLD_N, %eax movl OLD_K, %ecx movl OLD_A, %edx movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movss STACK_OFFT, %xmm4 movl OLD_B, B movl OLD_C, %ebx movl %ebx, C movl OLD_LDC, LDC movss %xmm4, OFFSET movss %xmm4, KK leal 
(, LDC, SIZE), LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif testl $1, N je .L40 #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, %ecx #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax jle .L85 ALIGN_4 .L82: movsd 0 * SIZE(B), %xmm3 movhps 2 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm7 movhps 6 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) addl $ 8 * SIZE, B addl $32 * SIZE, BB decl %eax jne .L82 ALIGN_4 .L85: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH jle .L90 ALIGN_4 .L86: movss 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 movaps %xmm0, 0 * SIZE(BB) addl $1 * SIZE, B addl $4 * SIZE, BB decl %eax jne .L86 ALIGN_4 .L90: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L100 ALIGN_4 .L91: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movaps 0 * SIZE(AA), %xmm0 movaps 16 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 PREFETCHW 3 * SIZE(CO1) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L95 ALIGN_4 .L92: mulps %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm0, %xmm5 movaps 8 * SIZE(AA), %xmm0 mulps 8 * SIZE(BB), %xmm0 addps %xmm0, %xmm6 movaps 12 * SIZE(AA), %xmm0 mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 movaps 20 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movaps 48 * SIZE(BB), %xmm3 mulps 20 * SIZE(BB), %xmm1 addps %xmm1, %xmm5 movaps 24 * SIZE(AA), %xmm1 mulps 24 * SIZE(BB), %xmm1 addps %xmm1, %xmm6 movaps 28 * SIZE(AA), %xmm1 mulps 28 * SIZE(BB), %xmm1 addps %xmm1, %xmm7 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L92 ALIGN_4 .L95: #if defined(LT) || 
defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(BB), %xmm2 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L96 ALIGN_4 .L98: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ BASE_SHIFT, %eax leal (AA, %eax, 4), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm0 movaps %xmm5, %xmm1 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm1 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movss 0 * SIZE(B), %xmm1 movss 1 * SIZE(B), %xmm3 movss 2 * SIZE(B), %xmm5 movss 3 * SIZE(B), %xmm7 subss %xmm4, %xmm1 subss %xmm6, %xmm3 subss %xmm0, %xmm5 subss %xmm2, %xmm7 #else movaps 0 * SIZE(AA), %xmm0 subps %xmm4, %xmm0 #endif #ifdef LN movaps 12 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm7 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm7, %xmm6 subss %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulss %xmm7, %xmm6 subss %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulss %xmm7, %xmm6 subss %xmm6, %xmm1 movaps 8 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulss %xmm5, %xmm6 subss %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulss %xmm5, %xmm6 subss %xmm6, %xmm1 movaps 4 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulss %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm1 movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm7 movaps 4 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulss %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm7 movaps 8 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulss %xmm5, %xmm6 subss %xmm6, %xmm7 movaps 12 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm7 #endif #if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) movss %xmm3, 1 * SIZE(B) movss %xmm5, 2 * SIZE(B) movss %xmm7, 3 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 movaps %xmm0, 0 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 movaps %xmm0, 4 * SIZE(BB) pshufd $0x00, %xmm5, %xmm0 movaps %xmm0, 8 * SIZE(BB) pshufd $0x00, %xmm7, %xmm0 movaps %xmm0, 12 * SIZE(BB) #else movaps %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm1 unpcklps %xmm7, %xmm3 unpcklps %xmm3, %xmm1 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B 
sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L91 ALIGN_4 .L100: testl $2, M je .L110 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L105 ALIGN_4 .L102: mulps %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 16 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 movsd 10 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L102 ALIGN_4 .L105: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L108 ALIGN_4 .L106: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 2 * SIZE(AA), %xmm0 movaps 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L106 ALIGN_4 .L108: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ BASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) pshufd $1, %xmm4, %xmm6 movss 0 * SIZE(B), %xmm1 movss 1 * SIZE(B), %xmm3 subss %xmm4, %xmm1 subss %xmm6, %xmm3 #else #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 subps %xmm4, %xmm0 #endif #ifdef LN movaps 0 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulss %xmm3, %xmm6 subss %xmm6, %xmm1 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulss %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm3 pshufd $0xff, %xmm4, %xmm6 mulss %xmm6, %xmm3 #endif #if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) movss %xmm3, 1 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 movaps %xmm0, 0 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 movaps %xmm0, 4 * SIZE(BB) #else movlps %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(CO1) movss %xmm3, 1 * SIZE(CO1) #else movlps %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if 
defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L110: testl $1, M je .L119 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movss 0 * SIZE(AA), %xmm0 movss 4 * SIZE(AA), %xmm1 movss 0 * SIZE(BB), %xmm2 movss 16 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L115 ALIGN_4 .L112: mulss %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 movss 32 * SIZE(BB), %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm0, %xmm5 movss 2 * SIZE(AA), %xmm0 mulss 8 * SIZE(BB), %xmm0 addss %xmm0, %xmm6 movss 3 * SIZE(AA), %xmm0 mulss 12 * SIZE(BB), %xmm0 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm3 movss 5 * SIZE(AA), %xmm1 addss %xmm3, %xmm4 movss 48 * SIZE(BB), %xmm3 mulss 20 * SIZE(BB), %xmm1 addss %xmm1, %xmm5 movss 6 * SIZE(AA), %xmm1 mulss 24 * SIZE(BB), %xmm1 addss %xmm1, %xmm6 movss 7 * SIZE(AA), %xmm1 mulss 28 * SIZE(BB), %xmm1 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L112 ALIGN_4 .L115: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulss %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 movss 4 * SIZE(BB), %xmm2 addl $ 1 * SIZE, AA addl $ 4 * SIZE, BB decl %eax jg .L116 ALIGN_4 .L118: addss %xmm5, %xmm4 addss %xmm7, %xmm6 addss %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ BASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movss 0 * SIZE(B), %xmm1 subss %xmm4, %xmm1 #else movss 0 * SIZE(AA), %xmm0 subss %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) mulss 0 * SIZE(AA), %xmm1 #endif #if defined(RN) || defined(RT) mulss 0 * SIZE(B), %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 movaps %xmm0, 0 * SIZE(BB) #else movss %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(CO1) #else movss %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA, %eax, SIZE), AA #ifdef LT addl $1 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L119: #ifdef LN movl K, %eax leal (B, %eax, SIZE), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (B, %eax, SIZE), B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L40: testl $2, N je .L80 #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, %ecx #ifdef RT movl K, %eax sall $1 + 
BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $1 + BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L45 ALIGN_4 .L42: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) addl $ 8 * SIZE, B addl $32 * SIZE, %ecx decl %eax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L50 ALIGN_4 .L46: movsd 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) addl $2 * SIZE, B addl $8 * SIZE, %ecx decl %eax jne .L46 ALIGN_4 .L50: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L60 ALIGN_4 .L51: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movaps 0 * SIZE(AA), %xmm0 movaps 16 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 PREFETCHW 3 * SIZE(CO1) PREFETCHW 3 * SIZE(CO1, LDC) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L55 ALIGN_4 .L52: mulps %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 8 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 20 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps 12 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 28 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps 48 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 mulps 36 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 40 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 20 * SIZE(AA), %xmm1 mulps %xmm1, %xmm2 mulps 44 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 64 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 24 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 52 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 80 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 48 * 
SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $1 + BASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm0 movaps %xmm5, %xmm1 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm1 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(B), %xmm1 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 2 * SIZE(B), %xmm3 #ifdef movsd xorps %xmm5, %xmm5 #endif movsd 4 * SIZE(B), %xmm5 #ifdef movsd xorps %xmm7, %xmm7 #endif movsd 6 * SIZE(B), %xmm7 subps %xmm4, %xmm1 subps %xmm6, %xmm3 subps %xmm0, %xmm5 subps %xmm2, %xmm7 #else movaps 0 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm1 subps %xmm4, %xmm0 subps %xmm5, %xmm1 #endif #ifdef LN movaps 12 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm1 movaps 8 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm1 movaps 4 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm7 movaps 4 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm7 movaps 8 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm7 movaps 12 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 #endif #ifdef RN movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 #endif #ifdef RT movaps 0 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm1, 0 * SIZE(B) movlps %xmm3, 2 * SIZE(B) movlps %xmm5, 4 * SIZE(B) movlps %xmm7, 6 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 pshufd $0x55, %xmm1, %xmm2 movaps %xmm0, 0 * SIZE(BB) movaps %xmm2, 4 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm2 movaps %xmm0, 8 * SIZE(BB) movaps %xmm2, 12 * SIZE(BB) 
pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm2 movaps %xmm0, 16 * SIZE(BB) movaps %xmm2, 20 * SIZE(BB) pshufd $0x00, %xmm7, %xmm0 pshufd $0x55, %xmm7, %xmm2 movaps %xmm0, 24 * SIZE(BB) movaps %xmm2, 28 * SIZE(BB) #else movaps %xmm0, 0 * SIZE(AA) movaps %xmm1, 4 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm1 unpcklps %xmm7, %xmm3 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC, 1) movhps %xmm2, 2 * SIZE(CO1, LDC, 1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 2 * SIZE(CO1, LDC, 1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L51 ALIGN_4 .L60: testl $2, M je .L70 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L65 ALIGN_4 .L62: #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * 
SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L66 ALIGN_4 .L68: addps %xmm6, %xmm4 addps %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) unpcklps %xmm6, %xmm4 unpcklps %xmm7, %xmm5 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(B), %xmm1 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 2 * SIZE(B), %xmm3 subps %xmm4, %xmm1 subps %xmm6, %xmm3 #else #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 2 * SIZE(AA), %xmm1 subps %xmm4, %xmm0 subps %xmm5, %xmm1 #endif #ifdef LN movaps 0 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 #endif #ifdef RN movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 #endif #ifdef RT movaps 0 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm1, 0 * SIZE(B) movlps %xmm3, 2 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 pshufd $0x55, %xmm1, %xmm2 movaps %xmm0, 0 * SIZE(BB) movaps %xmm2, 4 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm2 movaps %xmm0, 8 * SIZE(BB) movaps %xmm2, 12 * SIZE(BB) #else movlps %xmm0, 0 * SIZE(AA) movlps %xmm1, 2 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm3, %xmm1 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 0 * SIZE(CO1, LDC) #else movlps %xmm0, 0 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L70: testl $1, M je .L79 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movss 0 * SIZE(AA), %xmm0 movss 4 * SIZE(AA), %xmm1 movss 0 * SIZE(BB), %xmm2 movss 16 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L75 ALIGN_4 .L72: mulss %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 mulss %xmm0, %xmm2 
mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 20 * SIZE(BB), %xmm0 addss %xmm3, %xmm4 movss 24 * SIZE(BB), %xmm3 addss %xmm0, %xmm5 movss 3 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 28 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 48 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 mulss 36 * SIZE(BB), %xmm1 addss %xmm2, %xmm4 movss 40 * SIZE(BB), %xmm2 addss %xmm1, %xmm5 movss 5 * SIZE(AA), %xmm1 mulss %xmm1, %xmm2 mulss 44 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 64 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 52 * SIZE(BB), %xmm1 addss %xmm3, %xmm4 movss 56 * SIZE(BB), %xmm3 addss %xmm1, %xmm5 movss 7 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 60 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 80 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulss %xmm0, %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 addl $ 1 * SIZE, AA addl $ 8 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: addss %xmm6, %xmm4 addss %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm4 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(B), %xmm1 subps %xmm4, %xmm1 #else movss 0 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm1 subss %xmm4, %xmm0 subss %xmm5, %xmm1 #endif #if defined(LN) || defined(LT) movss 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef RN movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm1 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm1 #endif #ifdef RT movaps 0 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm0 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm1, 0 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 pshufd $0x55, %xmm1, %xmm2 movaps %xmm0, 0 * SIZE(BB) movaps %xmm2, 4 * SIZE(BB) #else movss %xmm0, 0 * SIZE(AA) movss %xmm1, 1 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) pshufd $1, %xmm1, %xmm3 movss %xmm1, 0 * SIZE(CO1) movss %xmm3, 0 * SIZE(CO1, LDC) #else movss %xmm0, 0 * SIZE(CO1) movss %xmm1, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA, %eax, SIZE), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L79: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 2), B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif ALIGN_4 .L80: movl N, %eax sarl $2, %eax movl %eax, J jle .L999 .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif 
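/* .L01: main loop over blocks of four columns of B (J = N >> 2).  Each
   panel of B is first expanded into BUFFER (.L02/.L05), with every element
   replicated four times so the micro-kernels can consume it with aligned
   movaps loads, before the loop over rows of A (.L11 onward) begins. */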
leal BUFFER, %ecx #ifdef RT movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $2 + BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $1, %eax jle .L05 ALIGN_4 .L02: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) addl $ 8 * SIZE, B addl $32 * SIZE, %ecx decl %eax jne .L02 ALIGN_2 .L05: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $1, %eax BRANCH jle .L10 movaps 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) addl $4 * SIZE, B ALIGN_4 .L10: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 4), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movaps 16 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movaps 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 leal (LDC, LDC, 2), %eax PREFETCHW 3 * SIZE(CO1) PREFETCHW 3 * SIZE(CO1, LDC) PREFETCHW 3 * SIZE(CO1, LDC, 2) PREFETCHW 3 * SIZE(CO1, %eax) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L12: KERNEL1(0 * 16) KERNEL2(0 * 16) KERNEL3(0 * 16) KERNEL4(0 * 16) KERNEL5(0 * 16) KERNEL6(0 * 16) KERNEL7(0 * 16) KERNEL8(0 * 16) addl $128 * SIZE, BB addl $32 * SIZE, AA decl %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 4 * SIZE(AA), %xmm0 addl $ 4 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $4, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $2 + BASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm0 movaps %xmm5, %xmm1 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm1 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps 
%xmm1, %xmm2 movaps 0 * SIZE(B), %xmm1 movaps 4 * SIZE(B), %xmm3 movaps 8 * SIZE(B), %xmm5 movaps 12 * SIZE(B), %xmm7 subps %xmm4, %xmm1 subps %xmm6, %xmm3 subps %xmm0, %xmm5 subps %xmm2, %xmm7 #else movaps 0 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm1 movaps 8 * SIZE(AA), %xmm2 movaps 12 * SIZE(AA), %xmm3 subps %xmm4, %xmm0 subps %xmm5, %xmm1 subps %xmm6, %xmm2 subps %xmm7, %xmm3 #endif #ifdef LN movaps 12 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm7, %xmm6 subps %xmm6, %xmm1 movaps 8 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0x55, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm1 movaps 4 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0x00, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm7 movaps 4 * SIZE(AA), %xmm4 pshufd $0x55, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm7 movaps 8 * SIZE(AA), %xmm4 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm6, %xmm5 pshufd $0xff, %xmm4, %xmm6 mulps %xmm5, %xmm6 subps %xmm6, %xmm7 movaps 12 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm7 #endif #ifdef RN movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm3 movaps 4 * SIZE(B), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm3 movaps 8 * SIZE(B), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm3 movaps 12 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 #endif #ifdef RT movaps 12 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm0 movaps 8 * SIZE(B), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm0 movaps 4 * SIZE(B), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movaps %xmm1, 0 * SIZE(B) movaps %xmm3, 4 * SIZE(B) movaps %xmm5, 8 * SIZE(B) movaps %xmm7, 12 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 pshufd $0x55, %xmm1, %xmm2 pshufd $0xaa, %xmm1, %xmm4 pshufd $0xff, %xmm1, %xmm6 movaps %xmm0, 0 * SIZE(BB) movaps %xmm2, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) pshufd $0x00, 
%xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm2 pshufd $0xaa, %xmm3, %xmm4 pshufd $0xff, %xmm3, %xmm6 movaps %xmm0, 16 * SIZE(BB) movaps %xmm2, 20 * SIZE(BB) movaps %xmm4, 24 * SIZE(BB) movaps %xmm6, 28 * SIZE(BB) pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm2 pshufd $0xaa, %xmm5, %xmm4 pshufd $0xff, %xmm5, %xmm6 movaps %xmm0, 32 * SIZE(BB) movaps %xmm2, 36 * SIZE(BB) movaps %xmm4, 40 * SIZE(BB) movaps %xmm6, 44 * SIZE(BB) pshufd $0x00, %xmm7, %xmm0 pshufd $0x55, %xmm7, %xmm2 pshufd $0xaa, %xmm7, %xmm4 pshufd $0xff, %xmm7, %xmm6 movaps %xmm0, 48 * SIZE(BB) movaps %xmm2, 52 * SIZE(BB) movaps %xmm4, 56 * SIZE(BB) movaps %xmm6, 60 * SIZE(BB) #else movaps %xmm0, 0 * SIZE(AA) movaps %xmm1, 4 * SIZE(AA) movaps %xmm2, 8 * SIZE(AA) movaps %xmm3, 12 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm0 movaps %xmm3, %xmm4 unpcklps %xmm7, %xmm3 unpckhps %xmm7, %xmm4 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movaps %xmm0, %xmm6 unpcklps %xmm4, %xmm0 unpckhps %xmm4, %xmm6 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC, 1) movhps %xmm2, 2 * SIZE(CO1, LDC, 1) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movhps %xmm0, 2 * SIZE(CO1, LDC, 2) movlps %xmm6, 0 * SIZE(CO1, %eax, 1) movhps %xmm6, 2 * SIZE(CO1, %eax, 1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 2 * SIZE(CO1, LDC, 1) movlps %xmm2, 0 * SIZE(CO1, LDC, 2) movhps %xmm2, 2 * SIZE(CO1, LDC, 2) movlps %xmm3, 0 * SIZE(CO1, %eax, 1) movhps %xmm3, 2 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $16 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L11 ALIGN_4 .L20: testl $2, M je .L30 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movaps 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L25 ALIGN_4 .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 36 * 
SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 68 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movaps 72 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 76 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 96 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 84 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 88 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 92 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 112 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 100 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movaps 104 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 108 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 14 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 128 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 116 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 120 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 124 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 144 * SIZE(BB), %xmm3 addl $ 16 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 16 * SIZE(BB), %xmm2 addl $ 2 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $4, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $1 + BASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) unpcklps %xmm6, %xmm4 unpcklps %xmm7, %xmm5 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps 0 * SIZE(B), %xmm1 movaps 4 * SIZE(B), %xmm3 subps %xmm4, %xmm1 subps %xmm6, %xmm3 #else #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 2 * SIZE(AA), %xmm1 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 4 * SIZE(AA), %xmm2 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 6 * SIZE(AA), %xmm3 subps %xmm4, %xmm0 subps %xmm5, %xmm1 subps %xmm6, %xmm2 subps %xmm7, %xmm3 #endif #ifdef LN movaps 0 * SIZE(AA), %xmm4 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 pshufd $0xaa, %xmm4, %xmm6 mulps %xmm3, %xmm6 subps %xmm6, %xmm1 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 pshufd $0x55, %xmm4, %xmm6 mulps %xmm1, %xmm6 subps %xmm6, %xmm3 pshufd $0xff, %xmm4, %xmm6 mulps %xmm6, %xmm3 #endif #ifdef RN movaps 0 * SIZE(B), %xmm6 pshufd $0x00, 
%xmm6, %xmm7 mulps %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm0, %xmm7 subps %xmm7, %xmm3 movaps 4 * SIZE(B), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm3 movaps 8 * SIZE(B), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm3 movaps 12 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 #endif #ifdef RT movaps 12 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulps %xmm7, %xmm3 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm3, %xmm7 subps %xmm7, %xmm0 movaps 8 * SIZE(B), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulps %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm2, %xmm7 subps %xmm7, %xmm0 movaps 4 * SIZE(B), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulps %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulps %xmm1, %xmm7 subps %xmm7, %xmm0 movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulps %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movaps %xmm1, 0 * SIZE(B) movaps %xmm3, 4 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 pshufd $0x55, %xmm1, %xmm2 pshufd $0xaa, %xmm1, %xmm4 pshufd $0xff, %xmm1, %xmm6 movaps %xmm0, 0 * SIZE(BB) movaps %xmm2, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm2 pshufd $0xaa, %xmm3, %xmm4 pshufd $0xff, %xmm3, %xmm6 movaps %xmm0, 16 * SIZE(BB) movaps %xmm2, 20 * SIZE(BB) movaps %xmm4, 24 * SIZE(BB) movaps %xmm6, 28 * SIZE(BB) #else movlps %xmm0, 0 * SIZE(AA) movlps %xmm1, 2 * SIZE(AA) movlps %xmm2, 4 * SIZE(AA) movlps %xmm3, 6 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm0 movaps %xmm3, %xmm4 unpcklps %xmm7, %xmm3 unpckhps %xmm7, %xmm4 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movaps %xmm0, %xmm6 unpcklps %xmm4, %xmm0 unpckhps %xmm4, %xmm6 movlps %xmm1, 0 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC, 1) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movlps %xmm6, 0 * SIZE(CO1, %eax, 1) #else movlps %xmm0, 0 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC, 1) movlps %xmm2, 0 * SIZE(CO1, LDC, 2) movlps %xmm3, 0 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L30: testl $1, M je .L39 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movss 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movss 4 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movss 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movss 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, 
%eax je .L35 ALIGN_4 .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm5 movss 8 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 1 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 addss %xmm3, %xmm4 movss 20 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 addss %xmm3, %xmm5 movss 24 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 mulss 28 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 48 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm2 addss %xmm2, %xmm4 movss 36 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm5 movss 40 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 mulss 44 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 64 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 3 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 addss %xmm3, %xmm4 movss 52 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 addss %xmm3, %xmm5 movss 56 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 mulss 60 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 80 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 addss %xmm2, %xmm4 movss 68 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 addss %xmm2, %xmm5 movss 72 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 mulss 76 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 96 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 5 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 addss %xmm3, %xmm4 movss 84 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 addss %xmm3, %xmm5 movss 88 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 mulss 92 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 112 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm2 addss %xmm2, %xmm4 movss 100 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 addss %xmm2, %xmm5 movss 104 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 mulss 108 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 128 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 7 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 addss %xmm3, %xmm4 movss 116 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 addss %xmm3, %xmm5 movss 120 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 mulss 124 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 144 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 movss 4 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm5 movss 8 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 16 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 1 * SIZE(AA), %xmm0 addl $ 1 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $4, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB leal (AA, %eax, SIZE), AA sall $2 + BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) unpcklps %xmm6, %xmm4 unpcklps %xmm7, %xmm5 unpcklps %xmm5, %xmm4 movaps 0 * SIZE(B), %xmm1 subps %xmm4, %xmm1 #else movss 0 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm1 movss 2 * SIZE(AA), %xmm2 movss 3 * SIZE(AA), %xmm3 subss %xmm4, %xmm0 subss %xmm5, %xmm1 subss %xmm6, %xmm2 subss %xmm7, %xmm3 #endif #if defined(LN) || defined(LT) movss 0 * SIZE(AA), %xmm4 pshufd $0x00, %xmm4, %xmm6 mulps %xmm6, %xmm1 
#endif #ifdef RN movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 pshufd $0x55, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulss %xmm0, %xmm7 subss %xmm7, %xmm3 movaps 4 * SIZE(B), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulss %xmm7, %xmm1 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm3 movaps 8 * SIZE(B), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm7, %xmm2 pshufd $0xff, %xmm6, %xmm7 mulss %xmm2, %xmm7 subss %xmm7, %xmm3 movaps 12 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm3 #endif #ifdef RT movaps 12 * SIZE(B), %xmm6 pshufd $0xff, %xmm6, %xmm7 mulss %xmm7, %xmm3 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm3, %xmm7 subss %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulss %xmm3, %xmm7 subss %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulss %xmm3, %xmm7 subss %xmm7, %xmm0 movaps 8 * SIZE(B), %xmm6 pshufd $0xaa, %xmm6, %xmm7 mulss %xmm7, %xmm2 pshufd $0x55, %xmm6, %xmm7 mulss %xmm2, %xmm7 subss %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulss %xmm2, %xmm7 subss %xmm7, %xmm0 movaps 4 * SIZE(B), %xmm6 pshufd $0x55, %xmm6, %xmm7 mulss %xmm7, %xmm1 pshufd $0x00, %xmm6, %xmm7 mulss %xmm1, %xmm7 subss %xmm7, %xmm0 movaps 0 * SIZE(B), %xmm6 pshufd $0x00, %xmm6, %xmm7 mulss %xmm7, %xmm0 #endif #if defined(LN) || defined(LT) movaps %xmm1, 0 * SIZE(B) pshufd $0x00, %xmm1, %xmm0 pshufd $0x55, %xmm1, %xmm2 pshufd $0xaa, %xmm1, %xmm4 pshufd $0xff, %xmm1, %xmm6 movaps %xmm0, 0 * SIZE(BB) movaps %xmm2, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) #else movss %xmm0, 0 * SIZE(AA) movss %xmm1, 1 * SIZE(AA) movss %xmm2, 2 * SIZE(AA) movss %xmm3, 3 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif leal (LDC, LDC, 2), %eax #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm0 movaps %xmm3, %xmm4 unpcklps %xmm7, %xmm3 unpckhps %xmm7, %xmm4 movaps %xmm1, %xmm2 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm2 movaps %xmm0, %xmm6 unpcklps %xmm4, %xmm0 unpckhps %xmm4, %xmm6 movss %xmm1, 0 * SIZE(CO1) movss %xmm2, 0 * SIZE(CO1, LDC, 1) movss %xmm0, 0 * SIZE(CO1, LDC, 2) movss %xmm6, 0 * SIZE(CO1, %eax, 1) #else movss %xmm0, 0 * SIZE(CO1) movss %xmm1, 0 * SIZE(CO1, LDC, 1) movss %xmm2, 0 * SIZE(CO1, LDC, 2) movss %xmm3, 0 * SIZE(CO1, %eax, 1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA, %eax, SIZE), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L39: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 4), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 4), B #endif #ifdef RN addl $4, KK #endif #ifdef RT subl $4, KK #endif decl J # j -- jg .L01 ALIGN_4 .L999: movl OLD_STACK, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/trsm_kernel_RT_8x2_sse.S000066400000000000000000001765371313527062700215250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if !defined(HAVE_SSE) || !defined(HAVE_MMX) #error You have to check your configuration. 
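/* Overview: this file implements the single-precision TRSM micro-kernel    */
/* with 8x2 register blocking for 32-bit x86 using SSE.  The LN/LT/RN/RT    */
/* conditionals appear to select the four solve variants (triangular        */
/* factor applied from the left or the right, swept forward or backward),   */
/* and BUFFER holds a copy of the packed B panel with every scalar          */
/* broadcast four wide so the inner loops can use mulps against four        */
/* consecutive elements of A.                                               */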
#endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_A 20 + STACK + ARGS(%esi) #define STACK_B 24 + STACK + ARGS(%esi) #define STACK_C 28 + STACK + ARGS(%esi) #define STACK_LDC 32 + STACK + ARGS(%esi) #define STACK_OFFT 36 + STACK + ARGS(%esi) #define TRMASK 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define AORIG 56(%esp) #define BORIG 60(%esp) #define BUFFER 128(%esp) #ifdef HAVE_3DNOW #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) #else #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE 96 #endif #define B %edi #define AA %edx #define BB %ecx #define LDC %ebp #define CO1 %esi #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #if !defined(HAVE_SSE2) || defined(OPTERON) #define movsd movlps #endif #ifdef HAVE_SSE2 #define xorps pxor #endif PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE, %esp andl $-STACK_ALIGN, %esp STACK_TOUCHING movss STACK_M, %xmm0 movl STACK_N, %eax movss STACK_K, %xmm1 movss STACK_A, %xmm2 movl STACK_B, B movss STACK_C, %xmm3 movl STACK_LDC, LDC movss STACK_OFFT, %xmm4 movss %xmm1, K movl %eax, N movss %xmm0, M movss %xmm2, A movss %xmm3, C movl %esi, OLD_STACK movss %xmm4, OFFSET movss %xmm4, KK leal (, LDC, SIZE), LDC #ifdef LN movl M, %eax leal (, %eax, SIZE), %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax leal (, %eax, SIZE), %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif #if defined(LN) || defined(LT) movl $0x3f800000, 0 + TRMASK # 1.0 movl $0x00000000, 4 + TRMASK # 0.0 movl $0x3f800000, 8 + TRMASK # 1.0 movl $0x00000000, 12 + TRMASK # 0.0 #endif testl $1, N jle .L100 #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax jle .L103 ALIGN_4 .L102: movsd 0 * SIZE(B), %xmm3 movhps 2 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm7 movhps 6 * SIZE(B), %xmm7 #ifdef HAVE_SSE2 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 #else movaps %xmm3, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm3, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm3, %xmm2 shufps $0xaa, %xmm2, %xmm2 shufps $0xff, %xmm3, %xmm3 movaps %xmm7, %xmm4 shufps $0x00, %xmm4, %xmm4 movaps %xmm7, %xmm5 shufps $0x55, %xmm5, %xmm5 movaps %xmm7, %xmm6 shufps $0xaa, %xmm6, %xmm6 shufps $0xff, %xmm7, %xmm7 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) addl $ 8 * SIZE, B addl $32 * SIZE, BB decl %eax BRANCH jne .L102 ALIGN_2 .L103: #if 
defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax BRANCH jle .L105 ALIGN_2 .L104: movss 0 * SIZE(B), %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm0, 0 * SIZE(BB) addl $1 * SIZE, B addl $4 * SIZE, BB decl %eax jne .L104 ALIGN_4 .L105: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif movl M, %ebx sarl $3, %ebx # i = (m >> 2) jle .L130 ALIGN_4 .L110: #ifdef LN movl K, %eax sall $3 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $3 + BASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 PREFETCHW 7 * SIZE(CO1) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L112 ALIGN_2 .L111: mulps %xmm2, %xmm0 mulps 4 * SIZE(AA), %xmm2 addps %xmm0, %xmm4 movaps 8 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 movaps 4 * SIZE(BB), %xmm2 mulps %xmm2, %xmm0 mulps 12 * SIZE(AA), %xmm2 addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 8 * SIZE(BB), %xmm2 mulps %xmm2, %xmm1 mulps 20 * SIZE(AA), %xmm2 addps %xmm1, %xmm4 movaps 24 * SIZE(AA), %xmm1 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm2, %xmm1 mulps 28 * SIZE(AA), %xmm2 addps %xmm1, %xmm5 movaps 48 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm3, %xmm0 mulps 36 * SIZE(AA), %xmm3 addps %xmm0, %xmm4 movaps 40 * SIZE(AA), %xmm0 addps %xmm3, %xmm6 movaps 20 * SIZE(BB), %xmm3 mulps %xmm3, %xmm0 mulps 44 * SIZE(AA), %xmm3 addps %xmm0, %xmm5 movaps 64 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 24 * SIZE(BB), %xmm3 mulps %xmm3, %xmm1 mulps 52 * SIZE(AA), %xmm3 addps %xmm1, %xmm4 movaps 56 * SIZE(AA), %xmm1 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm3, %xmm1 mulps 60 * SIZE(AA), %xmm3 addps %xmm1, %xmm5 movaps 80 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 addl $64 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L111 ALIGN_2 .L112: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L114 .L113: movaps 0 * SIZE(BB), %xmm2 movaps 0 * SIZE(AA), %xmm0 mulps %xmm2, %xmm0 addps %xmm0, %xmm4 mulps 4 * SIZE(AA), %xmm2 addps %xmm2, %xmm6 addl $8 * SIZE, AA addl $4 * SIZE, BB subl $1, %eax jg .L113 ALIGN_4 .L114: addps %xmm5, %xmm4 addps %xmm7, %xmm6 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $8, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 8), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm2 movhps 2 * SIZE(B), %xmm2 movsd 4 * SIZE(B), %xmm5 movhps 6 * SIZE(B), %xmm5 subps %xmm4, %xmm2 subps %xmm6, %xmm5 xorps %xmm0, %xmm0 movaps %xmm2, %xmm3 unpcklps %xmm0, %xmm2 unpckhps %xmm0, %xmm3 movaps %xmm5, %xmm7 unpcklps %xmm0, %xmm5 unpckhps %xmm0, %xmm7 #else movaps 0 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm1 subps %xmm4, %xmm0 subps %xmm6, %xmm1 #endif #if defined(LN) || defined(LT) movaps TRMASK, %xmm6 #endif #ifdef LN movss 63 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm7 movaps %xmm7, %xmm1 
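/* LN path: the 8x8 triangular tile of A is swept from its last row back    */
/* to the first; each step scales the current solution component by the     */
/* (presumably pre-inverted) diagonal entry and then eliminates it from     */
/* the components that are still pending.                                   */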
shufps $0xee, %xmm1, %xmm1 movss 62 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movsd 60 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 58 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 56 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 54 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm7 movaps %xmm7, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 52 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 50 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 48 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 45 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm5 movaps %xmm5, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 44 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 42 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 40 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 36 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm5 movaps %xmm5, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 34 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 32 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 27 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 movaps %xmm3, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 26 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 24 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 18 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 16 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 9 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 8 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 #endif #ifdef LT movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 movaps %xmm2, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 1 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 6 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 9 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 10 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 12 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 14 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 18 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 19 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 20 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps 
%xmm0, %xmm5 movsd 22 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 27 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 movaps %xmm3, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 28 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 30 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 36 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm5 movaps %xmm5, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 37 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 38 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 45 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm5 movaps %xmm5, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 46 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 54 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm7 movaps %xmm7, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 55 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 63 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm7 #endif #if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 mulps %xmm6, %xmm1 #endif #if defined(LN) || defined(LT) shufps $0x88, %xmm3, %xmm2 shufps $0x88, %xmm7, %xmm5 movlps %xmm2, 0 * SIZE(B) movhps %xmm2, 2 * SIZE(B) movlps %xmm5, 4 * SIZE(B) movhps %xmm5, 6 * SIZE(B) #ifdef HAVE_SSE2 pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm6 #else movaps %xmm2, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm2, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm2, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm2, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) #ifdef HAVE_SSE2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xaa, %xmm5, %xmm4 pshufd $0xff, %xmm5, %xmm6 #else movaps %xmm5, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm5, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm5, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm5, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 16 * SIZE(BB) movaps %xmm1, 20 * SIZE(BB) movaps %xmm4, 24 * SIZE(BB) movaps %xmm6, 28 * SIZE(BB) #else movaps %xmm0, 0 * SIZE(AA) movaps %xmm1, 4 * SIZE(AA) #endif #ifdef LN subl $8 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) movlps %xmm5, 4 * SIZE(CO1) movhps %xmm5, 6 * SIZE(CO1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 4 * SIZE(CO1) movhps %xmm1, 6 * SIZE(CO1) #endif #ifndef LN addl $8 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 8), AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $8, KK movl BORIG, B #endif #ifdef LT addl $8, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $3 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L110 ALIGN_2 .L130: testl $4, M jle .L150 #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $2 + BASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movaps 0 * 
SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movsd 0 * SIZE(AA), %xmm0 movhps 2 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movsd 16 * SIZE(AA), %xmm1 movhps 18 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L132 ALIGN_2 .L131: mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 mulps 4 * SIZE(BB), %xmm0 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 8 * SIZE(AA), %xmm0 mulps 8 * SIZE(BB), %xmm0 addps %xmm0, %xmm6 movaps 12 * SIZE(AA), %xmm0 mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 mulps %xmm1, %xmm3 movaps 20 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 mulps 20 * SIZE(BB), %xmm1 movaps 48 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 24 * SIZE(AA), %xmm1 mulps 24 * SIZE(BB), %xmm1 addps %xmm1, %xmm6 movaps 28 * SIZE(AA), %xmm1 mulps 28 * SIZE(BB), %xmm1 addps %xmm1, %xmm7 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L131 ALIGN_2 .L132: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L134 .L133: movaps 0 * SIZE(BB), %xmm2 movaps 0 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L133 ALIGN_4 .L134: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 4), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm2 movhps 2 * SIZE(B), %xmm2 subps %xmm4, %xmm2 xorps %xmm5, %xmm5 movaps %xmm2, %xmm3 unpcklps %xmm5, %xmm2 unpckhps %xmm5, %xmm3 #else movaps 0 * SIZE(AA), %xmm0 subps %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) movaps TRMASK, %xmm6 #endif #ifdef LN movss 15 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 movaps %xmm3, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 14 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 12 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 10 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 8 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 5 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 4 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 #endif #ifdef LT movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 movaps %xmm2, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 1 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movss 5 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 6 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movss 10 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 11 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, 
%xmm3 movss 15 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 #endif #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 #endif #ifdef RT movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 #endif #if defined(LN) || defined(LT) shufps $0x88, %xmm3, %xmm2 movlps %xmm2, 0 * SIZE(B) movhps %xmm2, 2 * SIZE(B) #ifdef HAVE_SSE2 pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm6 #else movaps %xmm2, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm2, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm2, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm2, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) #else movaps %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L150: testl $2, M jle .L170 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $1 + BASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L152 ALIGN_2 .L151: mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 16 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 movsd 10 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L151 ALIGN_2 .L152: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L154 .L153: mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L153 ALIGN_4 .L154: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || 
defined(LT) movaps %xmm4, %xmm5 shufps $1, %xmm5, %xmm5 movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 subss %xmm4, %xmm0 subss %xmm5, %xmm1 #else #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 subps %xmm4, %xmm0 #endif #ifdef LN movaps 0 * SIZE(AA), %xmm4 movaps %xmm4, %xmm6 shufps $0xff, %xmm6, %xmm6 mulss %xmm6, %xmm1 movaps %xmm4, %xmm6 shufps $0xaa, %xmm6, %xmm6 mulss %xmm1, %xmm6 subss %xmm6, %xmm0 mulss %xmm4, %xmm0 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm4 mulss %xmm4, %xmm0 movaps %xmm4, %xmm6 shufps $0x55, %xmm6, %xmm6 mulss %xmm0, %xmm6 subss %xmm6, %xmm1 movaps %xmm4, %xmm6 shufps $0xff, %xmm6, %xmm6 mulss %xmm6, %xmm1 #endif #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 #endif #ifdef RT movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm0, 0 * SIZE(B) movss %xmm1, 1 * SIZE(B) shufps $0x00, %xmm0, %xmm0 shufps $0x00, %xmm1, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) #else movlps %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm0, 0 * SIZE(CO1) movss %xmm1, 1 * SIZE(CO1) #else movlps %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L170: testl $1, M jle .L179 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA leal (AA, %eax, SIZE), AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movss 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movss 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L172 ALIGN_2 .L171: mulss %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 mulss 4 * SIZE(BB), %xmm0 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 2 * SIZE(AA), %xmm0 mulss 8 * SIZE(BB), %xmm0 addss %xmm0, %xmm6 movss 3 * SIZE(AA), %xmm0 mulss 12 * SIZE(BB), %xmm0 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm3 movss 5 * SIZE(AA), %xmm1 addss %xmm3, %xmm4 mulss 20 * SIZE(BB), %xmm1 movss 48 * SIZE(BB), %xmm3 addss %xmm1, %xmm5 movss 6 * SIZE(AA), %xmm1 mulss 24 * SIZE(BB), %xmm1 addss %xmm1, %xmm6 movss 7 * SIZE(AA), %xmm1 mulss 28 * SIZE(BB), %xmm1 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L171 ALIGN_2 .L172: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L174 .L173: movss 0 * SIZE(AA), %xmm0 movss 0 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm4 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L173 ALIGN_4 .L174: addss %xmm5, %xmm4 addss %xmm7, %xmm6 addss %xmm6, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ BASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movss 0 * SIZE(B), %xmm1 subss %xmm4, %xmm1 #else movss 0 * SIZE(AA), %xmm0 
subss %xmm4, %xmm0 #endif #if defined(LN) || defined(LT) mulss 0 * SIZE(AA), %xmm1 #endif #if defined(RN) || defined(RT) mulss 0 * SIZE(B), %xmm0 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) shufps $0x00, %xmm1, %xmm1 movaps %xmm1, 0 * SIZE(BB) #else movss %xmm0, 0 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(CO1) #else movss %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (AA, %eax, SIZE), AA #ifdef LT addl $1 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L179: #ifdef LN movl K, %eax leal (B, %eax, SIZE), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (B, %eax, SIZE), B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L100: movl N, %eax sarl $1, %eax # j = (n >> 1) movl %eax, J jle .L999 ALIGN_2 .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $1 + BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L03 ALIGN_4 .L02: movsd 0 * SIZE(B), %xmm3 movhps 2 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm7 movhps 6 * SIZE(B), %xmm7 #ifdef HAVE_SSE2 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 #else movaps %xmm3, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm3, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm3, %xmm2 shufps $0xaa, %xmm2, %xmm2 shufps $0xff, %xmm3, %xmm3 movaps %xmm7, %xmm4 shufps $0x00, %xmm4, %xmm4 movaps %xmm7, %xmm5 shufps $0x55, %xmm5, %xmm5 movaps %xmm7, %xmm6 shufps $0xaa, %xmm6, %xmm6 shufps $0xff, %xmm7, %xmm7 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) addl $ 8 * SIZE, B addl $32 * SIZE, BB decl %eax BRANCH jne .L02 ALIGN_2 .L03: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L05 ALIGN_2 .L04: movsd 0 * SIZE(B), %xmm3 #ifdef HAVE_SSE2 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 #else movaps %xmm3, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm3, %xmm1 shufps $0x55, %xmm1, %xmm1 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) addl $2 * SIZE, B addl $8 * SIZE, BB decl %eax jne .L04 ALIGN_4 .L05: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif movl M, %ebx sarl $3, %ebx jle .L30 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $3 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $3 + BASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * 
SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 PREFETCHW 7 * SIZE(CO1) PREFETCHW 7 * SIZE(CO1, LDC) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L12 ALIGN_2 .L11: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 0 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm3 mulps 12 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 8 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 12 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 12 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 24 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 24 * SIZE(AA), %xmm1 mulps %xmm0, %xmm2 mulps 20 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 20 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 20 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 mulps %xmm1, %xmm3 mulps 28 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 28 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 40 * SIZE(AA), %xmm1 mulps %xmm0, %xmm2 mulps 36 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 36 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 36 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 48 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 48 * SIZE(AA), %xmm0 mulps %xmm1, %xmm3 mulps 44 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 44 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 44 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 56 * SIZE(AA), %xmm1 mulps %xmm0, %xmm2 mulps 52 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 48 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 52 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 52 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 64 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 64 * SIZE(AA), %xmm0 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 60 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 72 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 72 * SIZE(AA), %xmm1 addl $64 * SIZE, BB addl $64 * SIZE, AA decl %eax jne .L11 ALIGN_2 .L12: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 .L13: movaps 4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 0 * SIZE(BB), %xmm2 mulps %xmm0, %xmm1 movaps 4 * SIZE(AA), %xmm0 addps %xmm1, %xmm5 movaps 4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm1 movaps 8 * SIZE(AA), %xmm0 addps %xmm1, %xmm7 addl $8 * SIZE, AA addl $8 * SIZE, BB subl $1, %eax jg .L13 ALIGN_4 .L14: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $8, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 8), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm0 movaps %xmm6, %xmm1 unpcklps %xmm7, %xmm6 unpckhps %xmm7, %xmm1 movsd 0 * SIZE(B), %xmm2 
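/* LN/LT: the 8x2 accumulators are interleaved to match the packed-B        */
/* layout, the B tile is loaded in two halves, and the accumulated product  */
/* is subtracted from it (b := b - sum) before the triangular solve on      */
/* this tile begins.                                                        */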
movhps 2 * SIZE(B), %xmm2 movsd 4 * SIZE(B), %xmm3 movhps 6 * SIZE(B), %xmm3 movsd 8 * SIZE(B), %xmm5 movhps 10 * SIZE(B), %xmm5 movsd 12 * SIZE(B), %xmm7 movhps 14 * SIZE(B), %xmm7 subps %xmm4, %xmm2 subps %xmm0, %xmm3 subps %xmm6, %xmm5 subps %xmm1, %xmm7 #else movaps 0 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm1 movaps 8 * SIZE(AA), %xmm2 movaps 12 * SIZE(AA), %xmm3 subps %xmm4, %xmm0 subps %xmm6, %xmm1 subps %xmm5, %xmm2 subps %xmm7, %xmm3 #endif #if defined(LN) || defined(LT) movaps TRMASK, %xmm6 #endif #ifdef LN movss 63 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm7 movaps %xmm7, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 62 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movsd 60 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 58 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 56 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 54 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm7 movaps %xmm7, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 52 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 50 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 48 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 45 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm5 movaps %xmm5, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 44 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 42 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 40 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 36 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm5 movaps %xmm5, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 34 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 32 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 27 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 movaps %xmm3, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 26 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 24 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 18 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 16 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 9 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 8 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 #endif #ifdef LT movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 movaps %xmm2, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 1 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 6 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 9 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, 
%xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 10 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 12 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 14 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 18 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 19 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 20 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 22 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 27 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 movaps %xmm3, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 28 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 30 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 36 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm5 movaps %xmm5, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 37 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm5 movsd 38 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 45 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm5 movaps %xmm5, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 46 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 54 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm7 movaps %xmm7, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 55 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm7 movss 63 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm7 #endif #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 mulps %xmm6, %xmm1 movss 1 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 movaps %xmm6, %xmm5 mulps %xmm0, %xmm5 mulps %xmm1, %xmm6 subps %xmm5, %xmm2 subps %xmm6, %xmm3 movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm2 mulps %xmm6, %xmm3 #endif #ifdef RT movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm2 mulps %xmm6, %xmm3 movss 2 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 movaps %xmm6, %xmm5 mulps %xmm2, %xmm5 mulps %xmm3, %xmm6 subps %xmm5, %xmm0 subps %xmm6, %xmm1 movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 mulps %xmm6, %xmm1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) movhps %xmm2, 2 * SIZE(B) movlps %xmm3, 4 * SIZE(B) movhps %xmm3, 6 * SIZE(B) movlps %xmm5, 8 * SIZE(B) movhps %xmm5, 10 * SIZE(B) movlps %xmm7, 12 * SIZE(B) movhps %xmm7, 14 * SIZE(B) #ifdef HAVE_SSE2 pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm6 #else movaps %xmm2, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm2, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm2, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm2, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) #ifdef HAVE_SSE2 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm4 pshufd $0xff, %xmm3, %xmm6 #else movaps %xmm3, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm3, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm3, %xmm4 shufps $0xaa, %xmm4, %xmm4 
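/* After the solve, the finished 8x2 tile is written back to the packed B   */
/* panel and each element is rebroadcast four wide into BUFFER (pshufd      */
/* when SSE2 is available, movaps+shufps otherwise) so later GEMM-style     */
/* updates of the remaining row blocks keep using packed multiplies.        */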
movaps %xmm3, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 16 * SIZE(BB) movaps %xmm1, 20 * SIZE(BB) movaps %xmm4, 24 * SIZE(BB) movaps %xmm6, 28 * SIZE(BB) #ifdef HAVE_SSE2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xaa, %xmm5, %xmm4 pshufd $0xff, %xmm5, %xmm6 #else movaps %xmm5, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm5, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm5, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm5, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 32 * SIZE(BB) movaps %xmm1, 36 * SIZE(BB) movaps %xmm4, 40 * SIZE(BB) movaps %xmm6, 44 * SIZE(BB) #ifdef HAVE_SSE2 pshufd $0x00, %xmm7, %xmm0 pshufd $0x55, %xmm7, %xmm1 pshufd $0xaa, %xmm7, %xmm4 pshufd $0xff, %xmm7, %xmm6 #else movaps %xmm7, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm7, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm7, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm7, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 48 * SIZE(BB) movaps %xmm1, 52 * SIZE(BB) movaps %xmm4, 56 * SIZE(BB) movaps %xmm6, 60 * SIZE(BB) #else movaps %xmm0, 0 * SIZE(AA) movaps %xmm1, 4 * SIZE(AA) movaps %xmm2, 8 * SIZE(AA) movaps %xmm3, 12 * SIZE(AA) #endif #ifdef LN subl $8 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, %xmm0 shufps $0x88, %xmm3, %xmm2 shufps $0xdd, %xmm3, %xmm0 movaps %xmm5, %xmm4 shufps $0x88, %xmm7, %xmm5 shufps $0xdd, %xmm7, %xmm4 movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) movlps %xmm5, 4 * SIZE(CO1) movhps %xmm5, 6 * SIZE(CO1) movlps %xmm0, 0 * SIZE(CO1, LDC) movhps %xmm0, 2 * SIZE(CO1, LDC) movlps %xmm4, 4 * SIZE(CO1, LDC) movhps %xmm4, 6 * SIZE(CO1, LDC) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 4 * SIZE(CO1) movhps %xmm1, 6 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC) movhps %xmm2, 2 * SIZE(CO1, LDC) movlps %xmm3, 4 * SIZE(CO1, LDC) movhps %xmm3, 6 * SIZE(CO1, LDC) #endif #ifndef LN addl $8 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 8), AA #ifdef LT addl $16 * SIZE, B #endif #endif #ifdef LN subl $8, KK movl BORIG, B #endif #ifdef LT addl $8, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $3 + BASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L10 ALIGN_2 .L30: testl $4, M jle .L50 #ifdef LN movl K, %eax sall $2 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $2 + BASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L32 ALIGN_2 .L31: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 8 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 20 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps 12 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 28 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 48 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 mulps 36 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 40 * SIZE(BB), %xmm2 addps 
%xmm1, %xmm5 movaps 20 * SIZE(AA), %xmm1 mulps %xmm1, %xmm2 mulps 44 * SIZE(BB), %xmm1 addps %xmm2, %xmm6 movaps 64 * SIZE(BB), %xmm2 addps %xmm1, %xmm7 movaps 24 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 52 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 80 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L31 ALIGN_2 .L32: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L34 .L33: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L33 ALIGN_4 .L34: addps %xmm6, %xmm4 addps %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 4), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm0 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm0 movsd 0 * SIZE(B), %xmm2 movhps 2 * SIZE(B), %xmm2 movsd 4 * SIZE(B), %xmm3 movhps 6 * SIZE(B), %xmm3 subps %xmm4, %xmm2 subps %xmm0, %xmm3 #else movaps 0 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm2 subps %xmm4, %xmm0 subps %xmm5, %xmm2 #endif #if defined(LN) || defined(LT) movaps TRMASK, %xmm6 #endif #ifdef LN movss 15 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 movaps %xmm3, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 14 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movsd 12 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 10 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movsd 8 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 5 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 4 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 #endif #ifdef LT movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 movaps %xmm2, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 1 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movss 5 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movsd 6 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movss 10 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm3 movaps %xmm3, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 11 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm3 movss 15 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm3 #endif #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 movss 1 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 movaps %xmm6, %xmm5 mulps %xmm0, %xmm5 subps %xmm5, %xmm2 movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm2 #endif #ifdef RT movss 3 * SIZE(B), 
%xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm2 movss 2 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 movaps %xmm6, %xmm5 mulps %xmm2, %xmm5 subps %xmm5, %xmm0 movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) movhps %xmm2, 2 * SIZE(B) movlps %xmm3, 4 * SIZE(B) movhps %xmm3, 6 * SIZE(B) #ifdef HAVE_SSE2 pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm6 #else movaps %xmm2, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm2, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm2, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm2, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) #ifdef HAVE_SSE2 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm4 pshufd $0xff, %xmm3, %xmm6 #else movaps %xmm3, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm3, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm3, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm3, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 16 * SIZE(BB) movaps %xmm1, 20 * SIZE(BB) movaps %xmm4, 24 * SIZE(BB) movaps %xmm6, 28 * SIZE(BB) #else movaps %xmm0, 0 * SIZE(AA) movaps %xmm2, 4 * SIZE(AA) #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, %xmm0 shufps $0x88, %xmm3, %xmm2 shufps $0xdd, %xmm3, %xmm0 movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) movlps %xmm0, 0 * SIZE(CO1, LDC) movhps %xmm0, 2 * SIZE(CO1, LDC) #else movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC) movhps %xmm2, 2 * SIZE(CO1, LDC) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L50: testl $2, M jle .L70 #ifdef LN movl K, %eax sall $1 + BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $1 + BASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L52 ALIGN_2 .L51: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), 
%xmm2 mulps %xmm1, %xmm2 movsd 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L51 ALIGN_2 .L52: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L54 .L53: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L53 ALIGN_4 .L54: addps %xmm6, %xmm4 addps %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm4 movsd 0 * SIZE(B), %xmm2 movhps 2 * SIZE(B), %xmm2 subps %xmm4, %xmm2 #else #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 2 * SIZE(AA), %xmm2 subps %xmm4, %xmm0 subps %xmm5, %xmm2 #endif #if defined(LN) || defined(LT) movaps TRMASK, %xmm6 #endif #ifdef LN movss 3 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 movaps %xmm2, %xmm1 shufps $0xee, %xmm1, %xmm1 movss 2 * SIZE(AA), %xmm0 shufps $0x50, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 #endif #ifdef LT movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 movaps %xmm2, %xmm1 shufps $0x44, %xmm1, %xmm1 movss 1 * SIZE(AA), %xmm0 shufps $0x05, %xmm0, %xmm0 mulps %xmm1, %xmm0 subps %xmm0, %xmm2 movss 3 * SIZE(AA), %xmm0 movaps %xmm6, %xmm1 shufps $0x00, %xmm0, %xmm1 mulps %xmm1, %xmm2 #endif #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 movss 1 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 movaps %xmm6, %xmm5 mulps %xmm0, %xmm5 subps %xmm5, %xmm2 movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm2 #endif #ifdef RT movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm2 movss 2 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 movaps %xmm6, %xmm5 mulps %xmm2, %xmm5 subps %xmm5, %xmm0 movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 mulps %xmm6, %xmm0 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) movhps %xmm2, 2 * SIZE(B) #ifdef HAVE_SSE2 pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm6 #else movaps %xmm2, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm2, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm2, %xmm4 shufps $0xaa, %xmm4, %xmm4 movaps %xmm2, %xmm6 shufps $0xff, %xmm6, %xmm6 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm6, 12 * SIZE(BB) #else movlps %xmm0, 0 * SIZE(AA) movlps %xmm2, 2 * SIZE(AA) #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, %xmm0 shufps $0x88, %xmm3, %xmm2 shufps $0xdd, %xmm3, %xmm0 movlps %xmm2, 0 * SIZE(CO1) movlps %xmm0, 0 * SIZE(CO1, LDC) #else movlps %xmm0, 0 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $2 * SIZE, 
CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L70: testl $1, M jle .L99 #ifdef LN movl K, %eax sall $BASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $BASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB #endif movss 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movss 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L72 ALIGN_2 .L71: mulss %xmm0, %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 20 * SIZE(BB), %xmm0 addss %xmm3, %xmm4 movss 24 * SIZE(BB), %xmm3 addss %xmm0, %xmm5 movss 3 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 28 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 48 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 mulss 36 * SIZE(BB), %xmm1 addss %xmm2, %xmm4 movss 40 * SIZE(BB), %xmm2 addss %xmm1, %xmm5 movss 5 * SIZE(AA), %xmm1 mulss %xmm1, %xmm2 mulss 44 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 64 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 52 * SIZE(BB), %xmm1 addss %xmm3, %xmm4 movss 56 * SIZE(BB), %xmm3 addss %xmm1, %xmm5 movss 7 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 60 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 80 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L71 ALIGN_2 .L72: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L74 .L73: mulss %xmm0, %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L73 ALIGN_4 .L74: addss %xmm6, %xmm4 addss %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $BASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) unpcklps %xmm5, %xmm4 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 0 * SIZE(B), %xmm2 subps %xmm4, %xmm2 #else movss 0 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm2 subss %xmm4, %xmm0 subss %xmm5, %xmm2 #endif #if defined(LN) || defined(LT) movaps TRMASK, %xmm6 #endif #if defined(LN) || defined(LT) movss 0 * SIZE(AA), %xmm0 shufps $0x00, %xmm6, %xmm0 mulps %xmm0, %xmm2 #endif #ifdef RN movss 0 * SIZE(B), %xmm6 mulss %xmm6, %xmm0 movss 1 * SIZE(B), %xmm6 movaps %xmm6, %xmm5 mulss %xmm0, %xmm5 subss %xmm5, %xmm2 movss 3 * SIZE(B), %xmm6 mulss %xmm6, %xmm2 #endif #ifdef RT movss 3 * SIZE(B), %xmm6 mulss %xmm6, %xmm2 movss 2 * SIZE(B), %xmm6 movaps %xmm6, %xmm5 mulss %xmm2, %xmm5 subss %xmm5, %xmm0 movss 0 * SIZE(B), %xmm6 mulss %xmm6, %xmm0 #endif #if defined(LN) || defined(LT) #ifdef 
movsd xorps %xmm2, %xmm2 #endif movsd %xmm2, 0 * SIZE(B) movaps %xmm2, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm2, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) #else movss %xmm0, 0 * SIZE(AA) movss %xmm2, 1 * SIZE(AA) #endif #ifdef LN subl $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, %xmm0 shufps $0x88, %xmm3, %xmm2 shufps $0xdd, %xmm3, %xmm0 movss %xmm2, 0 * SIZE(CO1) movss %xmm0, 0 * SIZE(CO1, LDC) #else movss %xmm0, 0 * SIZE(CO1) movss %xmm2, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 1), AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $BASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L99: #ifdef LN movl K, %eax leal (, %eax, SIZE), %eax leal (B, %eax, 2), B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax leal (,%eax, SIZE), %eax leal (B, %eax, 2), B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L01 ALIGN_2 .L999: movl OLD_STACK, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/xaxpy.S000066400000000000000000000161241313527062700163520ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_ALPHA_R 16 + STACK + ARGS(%esp) #define STACK_ALPHA_I 32 + STACK + ARGS(%esp) #define STACK_X 48 + STACK + ARGS(%esp) #define STACK_INCX 52 + STACK + ARGS(%esp) #define STACK_Y 56 + STACK + ARGS(%esp) #define STACK_INCY 60 + STACK + ARGS(%esp) #define M %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx #ifndef CONJ #define ADD1 fsubrp #define ADD2 faddp #else #define ADD1 faddp #define ADD2 fsubrp #endif PROLOGUE pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif FLD STACK_ALPHA_I FLD STACK_ALPHA_R movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY sall $ZBASE_SHIFT, INCX sall $ZBASE_SHIFT, INCY testl M, M jle .L40 cmpl $2 * SIZE, INCX jne .L14 cmpl $2 * SIZE, INCY jne .L14 movl M, %eax sarl $2, %eax jle .L15 ALIGN_3 .L16: FLD 0 * SIZE(X) fmul %st(1), %st FLD 1 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FLD 0 * SIZE(Y) faddp %st, %st(1) FST 0 * SIZE(Y) FLD 0 * SIZE(X) fmul %st(2), %st FLD 1 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FLD 1 * SIZE(Y) faddp %st, %st(1) FST 1 * SIZE(Y) FLD 2 * SIZE(X) fmul %st(1), %st FLD 3 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FLD 2 * SIZE(Y) faddp %st, %st(1) FST 2 * SIZE(Y) FLD 2 * SIZE(X) fmul %st(2), %st FLD 3 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FLD 3 * SIZE(Y) faddp %st, %st(1) FST 3 * SIZE(Y) FLD 4 * SIZE(X) fmul %st(1), %st FLD 5 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FLD 4 * SIZE(Y) faddp %st, %st(1) FST 4 * SIZE(Y) FLD 4 * SIZE(X) fmul %st(2), %st FLD 5 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FLD 5 * SIZE(Y) faddp %st, %st(1) FST 5 * SIZE(Y) FLD 6 * SIZE(X) fmul %st(1), %st FLD 7 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FLD 6 * SIZE(Y) faddp %st, %st(1) FST 6 * SIZE(Y) FLD 6 * SIZE(X) fmul %st(2), %st FLD 7 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FLD 7 * SIZE(Y) faddp %st, %st(1) FST 7 * SIZE(Y) #ifdef HAVE_3DNOW prefetch 20 * SIZE(X) prefetchw 20 * SIZE(Y) #endif addl $8 * SIZE, X addl $8 * SIZE, Y decl %eax jg .L16 ALIGN_3 .L15: movl M, %eax andl $3, %eax jle .L40 ALIGN_3 .L22: FLD 0 * SIZE(X) fmul %st(1), %st FLD 1 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FLD 0 * SIZE(Y) faddp %st, %st(1) FST 0 * SIZE(Y) FLD 0 * SIZE(X) fmul %st(2), %st FLD 1 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FLD 1 * SIZE(Y) faddp %st, %st(1) FST 1 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y decl %eax jg .L22 jmp .L40 ALIGN_3 .L14: movl M, %eax sarl $2, %eax jle .L28 ALIGN_3 .L29: FLD 0 * SIZE(X) fmul %st(1), %st FLD 1 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FLD 0 * SIZE(Y) faddp %st, %st(1) FST 0 * SIZE(Y) FLD 0 * SIZE(X) fmul %st(2), %st FLD 1 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FLD 1 * SIZE(Y) faddp %st, %st(1) FST 1 * SIZE(Y) addl INCX, X addl INCY, Y FLD 0 * SIZE(X) fmul %st(1), %st FLD 1 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FLD 0 * SIZE(Y) faddp %st, %st(1) FST 0 * SIZE(Y) FLD 0 * SIZE(X) fmul %st(2), %st FLD 1 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FLD 1 * SIZE(Y) faddp %st, %st(1) FST 1 * SIZE(Y) addl INCX, X addl INCY, Y FLD 0 * SIZE(X) fmul %st(1), %st FLD 1 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FLD 0 * SIZE(Y) faddp %st, %st(1) FST 0 * SIZE(Y) FLD 0 * SIZE(X) fmul %st(2), %st FLD 1 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FLD 1 * SIZE(Y) faddp %st, %st(1) FST 1 * SIZE(Y) addl 
INCX, X addl INCY, Y FLD 0 * SIZE(X) fmul %st(1), %st FLD 1 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FLD 0 * SIZE(Y) faddp %st, %st(1) FST 0 * SIZE(Y) FLD 0 * SIZE(X) fmul %st(2), %st FLD 1 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FLD 1 * SIZE(Y) faddp %st, %st(1) FST 1 * SIZE(Y) addl INCX, X addl INCY, Y decl %eax jg .L29 ALIGN_3 .L28: movl M, %eax andl $3, %eax jle .L40 ALIGN_3 .L35: FLD 0 * SIZE(X) fmul %st(1), %st FLD 1 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FLD 0 * SIZE(Y) faddp %st, %st(1) FST 0 * SIZE(Y) FLD 0 * SIZE(X) fmul %st(2), %st FLD 1 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FLD 1 * SIZE(Y) faddp %st, %st(1) FST 1 * SIZE(Y) addl INCX, X addl INCY, Y decl %eax jg .L35 ALIGN_3 .L40: ffreep %st(0) ffreep %st(0) xorl %eax,%eax popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/xdot.S000066400000000000000000000152321313527062700161560ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #if defined(F_INTERFACE) && defined(RETURN_BY_STACK) #define RESULT 4 + STACK + ARGS(%esp) #define STACK_N 8 + STACK + ARGS(%esp) #define STACK_X 12 + STACK + ARGS(%esp) #define STACK_INCX 16 + STACK + ARGS(%esp) #define STACK_Y 20 + STACK + ARGS(%esp) #define STACK_INCY 24 + STACK + ARGS(%esp) #else #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) #endif PROLOGUE pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif #define N %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx movl STACK_N, N movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY #ifdef F_INTERFACE movl (N),N movl (INCX),INCX movl (INCY),INCY #endif testl N, N jle .L88 sall $ZBASE_SHIFT, INCX sall $ZBASE_SHIFT, INCY fldz fldz fldz fldz cmpl $2 * SIZE, INCX jne .L14 cmpl $2 * SIZE, INCY jne .L14 movl N, %eax sarl $1, %eax jle .L15 ALIGN_3 .L16: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(2) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(4) FLD 2 * SIZE(X) FLD 2 * SIZE(Y) fmul %st(1) faddp %st, %st(2) FLD 3 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(2) FLD 3 * SIZE(X) FLD 2 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FLD 3 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(4) addl $4 * SIZE, X addl $4 * SIZE, Y decl %eax jg .L16 ALIGN_3 .L15: movl N, %eax andl $1, %eax jle .L27 ALIGN_3 .L22: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(2) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(4) jmp .L27 ALIGN_3 .L14: #ifdef F_INTERFACE testl INCX, INCX # if (incx < 0) jge .L28 movl N, %eax decl %eax imull INCX, %eax subl %eax, X ALIGN_3 .L28: testl INCY, INCY # if (incy < 0) jge .L29 movl N, %eax decl %eax imull INCY, %eax subl %eax, Y ALIGN_3 .L29: #endif movl N, %eax sarl $1, %eax jle .L30 ALIGN_3 .L31: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(2) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(4) addl INCX, X FLD 0 * SIZE(X) addl INCY, Y FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(2) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(4) addl INCX, X addl INCY, Y decl %eax jg .L31 ALIGN_3 .L30: movl N, %eax andl $1, %eax jle .L27 ALIGN_3 .L37: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(2) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(4) ALIGN_3 .L27: #if defined(F_INTERFACE) && defined(RETURN_BY_STACK) movl RESULT, %eax #endif #ifndef CONJ fsubp %st, %st(3) faddp %st, %st(1) #else faddp %st, %st(3) fsubp %st, %st(1) #endif #if defined(F_INTERFACE) && defined(RETURN_BY_STACK) FST 1 * SIZE(%eax) FST 0 * SIZE(%eax) #else fxch %st(1) #endif popl %ebx popl %esi popl %edi #if defined(F_INTERFACE) && defined(RETURN_BY_STACK) ret 
$0x4 #else ret #endif ALIGN_3 .L88: #if defined(F_INTERFACE) && defined(RETURN_BY_STACK) movl RESULT, %eax #endif fldz fldz #if defined(F_INTERFACE) && defined(RETURN_BY_STACK) FST 1 * SIZE(%eax) FST 0 * SIZE(%eax) #endif popl %ebx popl %esi popl %edi #if defined(F_INTERFACE) && defined(RETURN_BY_STACK) ret $0x4 #else ret #endif EPILOGUE OpenBLAS-0.2.20/kernel/x86/xgemm3m_kernel_2x2.S000066400000000000000000000320051313527062700206050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if defined(OPTERON) || defined(BARCELONA) #define PREFETCH prefetch #define PREFETCHW prefetchw #else #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #define PREFETCHSIZE (5 + 4 * 10) #define STACK 16 #define ARGS 16 #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 32 + STACK + ARGS(%esp) #define A 48 + STACK + ARGS(%esp) #define ARG_B 52 + STACK + ARGS(%esp) #define C 56 + STACK + ARGS(%esp) #define ARG_LDC 60 + STACK + ARGS(%esp) #define I %esi #define B %ebx #define CO %edi #define AO %edx #define BO %ecx #define LDC %ebp #define PREFETCH_OFFSET 48 PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(TRMMKERNEL) && !defined(LEFT) movl OFFSET, %eax negl %eax movl %eax, KK #endif movl ARG_LDC, LDC movl ARG_B, B addl $8 * SIZE, A addl $8 * SIZE, B sall $ZBASE_SHIFT, LDC movl N, %eax sarl $1, %eax movl %eax, J je .L30 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl A, AO movl C, CO lea (, LDC, 2), %eax addl %eax, C movl M, I sarl $1, I je .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 2), AO leal (B, %eax, 2), BO #endif fldz fldz fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) prefetchw 2 * SIZE(CO, LDC, 1) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) prefetchnta 2 * SIZE(CO, LDC, 1) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $2, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(3) faddp %st, %st(3) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -5 * SIZE(BO) fmul %st, %st(2) FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(3) faddp %st, %st(3) PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) FLD -4 * SIZE(AO) FLD -4 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -3 * SIZE(BO) fmul %st, %st(2) FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(3) faddp %st, %st(3) FLD -2 * SIZE(AO) FLD -2 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -1 * SIZE(BO) fmul %st, %st(2) FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(3) faddp %st, %st(3) addl $8 * SIZE,AO addl $8 * SIZE,BO decl %eax jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif and $3, %eax je .L18 ALIGN_4 .L16: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(3) faddp %st, %st(3) addl $2 * SIZE,AO addl $2 * SIZE,BO decl %eax jne .L16 
ALIGN_4 .L18: FLD ALPHA_I FLD ALPHA_R fld %st(2) fmul %st(1), %st FLD 0 * SIZE(CO) faddp %st, %st(1) FST 0 * SIZE(CO) fld %st(3) fmul %st(1), %st FLD 2 * SIZE(CO) faddp %st, %st(1) FST 2 * SIZE(CO) fld %st(4) fmul %st(1), %st FLD 0 * SIZE(CO, LDC) faddp %st, %st(1) FST 0 * SIZE(CO, LDC) fmul %st(5), %st FLD 2 * SIZE(CO, LDC) faddp %st, %st(1) FST 2 * SIZE(CO, LDC) fmul %st, %st(1) fmul %st, %st(2) fmul %st, %st(3) fmulp %st, %st(4) FLD 1 * SIZE(CO) faddp %st, %st(1) FST 1 * SIZE(CO) FLD 3 * SIZE(CO) faddp %st, %st(1) FST 3 * SIZE(CO) FLD 1 * SIZE(CO, LDC) faddp %st, %st(1) FST 1 * SIZE(CO, LDC) FLD 3 * SIZE(CO, LDC) faddp %st, %st(1) FST 3 * SIZE(CO, LDC) addl $4 * SIZE, CO decl I jne .L11 ALIGN_4 .L20: movl M, %eax andl $1, %eax je .L29 ALIGN_4 .L21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 1), AO leal ( B, %eax, 2), BO #endif fldz fldz #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $2, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -7 * SIZE(AO) FLD -6 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -5 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -6 * SIZE(AO) FLD -4 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -3 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -5 * SIZE(AO) FLD -2 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -1 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) addl $4 * SIZE,AO addl $8 * SIZE,BO decl %eax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif and $3, %eax je .L28 ALIGN_4 .L26: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) addl $1 * SIZE,AO addl $2 * SIZE,BO decl %eax jne .L26 ALIGN_4 .L28: FLD ALPHA_I FLD ALPHA_R fld %st(2) fmul %st(1), %st FLD 0 * SIZE(CO) faddp %st, %st(1) FST 0 * SIZE(CO) fmul %st(3), %st FLD 0 * SIZE(CO, LDC) faddp %st, %st(1) FST 0 * SIZE(CO, LDC) fmul %st, %st(1) fmulp %st, %st(2) FLD 1 * SIZE(CO) faddp %st, %st(1) FST 1 * SIZE(CO) FLD 1 * SIZE(CO, LDC) faddp %st, %st(1) FST 1 * SIZE(CO, LDC) ALIGN_4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif movl BO, B decl J jne .L01 ALIGN_4 .L30: movl N, %eax testl $1, %eax je .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl A, AO movl C, CO addl LDC, C movl M, I sarl $1, I je .L40 ALIGN_4 .L31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 2), AO leal ( B, %eax, 1), BO #endif fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $2, %eax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 
FLD -8 * SIZE(BO) FLD -8 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -7 * SIZE(BO) FLD -6 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -5 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -6 * SIZE(BO) FLD -4 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -3 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -5 * SIZE(BO) FLD -2 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -1 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) addl $8 * SIZE,AO addl $4 * SIZE,BO decl %eax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif and $3, %eax je .L38 ALIGN_4 .L36: FLD -8 * SIZE(BO) FLD -8 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) addl $2 * SIZE,AO addl $1 * SIZE,BO decl %eax jne .L36 ALIGN_4 .L38: FLD ALPHA_I FLD ALPHA_R fld %st(2) fmul %st(1), %st FLD 0 * SIZE(CO) faddp %st, %st(1) FST 0 * SIZE(CO) fmul %st(3), %st FLD 2 * SIZE(CO) faddp %st, %st(1) FST 2 * SIZE(CO) fmul %st, %st(1) fmulp %st, %st(2) FLD 1 * SIZE(CO) faddp %st, %st(1) FST 1 * SIZE(CO) FLD 3 * SIZE(CO) faddp %st, %st(1) FST 3 * SIZE(CO) addl $4 * SIZE, CO decl I jne .L31 ALIGN_4 .L40: movl M, %eax andl $1, %eax je .L49 ALIGN_4 .L41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax sall $BASE_SHIFT, %eax leal (AO, %eax, 1), AO leal ( B, %eax, 1), BO #endif fldz #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $2, %eax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -7 * SIZE(AO) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -5 * SIZE(AO) FLD -5 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) addl $4 * SIZE,AO addl $4 * SIZE,BO decl %eax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif and $3, %eax je .L48 ALIGN_4 .L46: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) addl $1 * SIZE,AO addl $1 * SIZE,BO decl %eax jne .L46 ALIGN_4 .L48: FLD ALPHA_I FLD ALPHA_R fmul %st(2), %st FLD 0 * SIZE(CO) faddp %st, %st(1) FST 0 * SIZE(CO) fmulp %st(1), %st FLD 1 * SIZE(CO) faddp %st, %st(1) FST 1 * SIZE(CO) ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $1, KK #endif movl BO, B ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/xgemm_kernel_1x1.S000066400000000000000000000172271313527062700203540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if defined(OPTERON) || defined(BARCELONA) #define PREFETCH prefetch #define PREFETCHW prefetchw #else #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #define PREFETCHSIZE (5 + 4 * 10) #define STACK 16 #define ARGS 16 #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 32 + STACK + ARGS(%esp) #define A 48 + STACK + ARGS(%esp) #define ARG_B 52 + STACK + ARGS(%esp) #define C 56 + STACK + ARGS(%esp) #define ARG_LDC 60 + STACK + ARGS(%esp) #define OFFSET 64 + STACK + ARGS(%esp) #define I %esi #define B %ebx #define CO %edi #define AO %edx #define BO %ecx #define LDC %ebp #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADD1 faddp #define ADD2 fsubrp #define ADD3 faddp #define ADD4 faddp #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADD1 faddp #define ADD2 faddp #define ADD3 fsubrp #define ADD4 faddp #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define ADD1 faddp #define ADD2 faddp #define ADD3 faddp #define ADD4 fsubrp #else #define ADD1 faddp #define ADD2 fsubrp #define ADD3 fsubrp #define ADD4 fsubrp #endif #define PREFETCH_OFFSET 48 PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(TRMMKERNEL) && !defined(LEFT) movl OFFSET, %eax negl %eax movl %eax, KK #endif movl ARG_LDC, LDC movl ARG_B, B addl $8 * SIZE, A addl $8 * SIZE, B sall $ZBASE_SHIFT, LDC cmpl $0, M jle .L999 movl N, %eax movl %eax, J testl %eax, %eax jle .L999 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl A, AO movl C, CO addl LDC, C movl M, I ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax sall $ZBASE_SHIFT, %eax leal (AO, %eax, 1), AO leal (B, %eax, 1), BO #endif fldz fldz fldz fldz #if 
defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $2, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st ADD1 %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) ADD2 %st, %st(6) ADD3 %st, %st(3) ADD4 %st, %st(3) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fld %st(1) fmul %st(1), %st ADD1 %st, %st(3) FLD -5 * SIZE(BO) fmul %st, %st(2) FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) ADD2 %st, %st(6) ADD3 %st, %st(3) ADD4 %st, %st(3) PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) FLD -4 * SIZE(AO) FLD -4 * SIZE(BO) fld %st(1) fmul %st(1), %st ADD1 %st, %st(3) FLD -3 * SIZE(BO) fmul %st, %st(2) FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) ADD2 %st, %st(6) ADD3 %st, %st(3) ADD4 %st, %st(3) FLD -2 * SIZE(AO) FLD -2 * SIZE(BO) fld %st(1) fmul %st(1), %st ADD1 %st, %st(3) FLD -1 * SIZE(BO) fmul %st, %st(2) FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) ADD2 %st, %st(6) ADD3 %st, %st(3) ADD4 %st, %st(3) addl $8 * SIZE,AO addl $8 * SIZE,BO decl %eax jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif and $3, %eax je .L18 ALIGN_4 .L16: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st ADD1 %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) ADD2 %st, %st(6) ADD3 %st, %st(3) ADD4 %st, %st(3) addl $2 * SIZE,AO addl $2 * SIZE,BO decl %eax jne .L16 ALIGN_4 .L18: faddp %st, %st(3) faddp %st, %st(1) #ifndef TRMMKERNEL FLD ALPHA_R fld %st fmul %st(2), %st fxch %st(1) fmul %st(3), %st FLD ALPHA_I fmul %st, %st(3) fmulp %st, %st(4) fsubp %st, %st(2) faddp %st, %st(2) FLD 0 * SIZE(CO) faddp %st, %st(1) FST 0 * SIZE(CO) FLD 1 * SIZE(CO) faddp %st, %st(1) FST 1 * SIZE(CO) #else FST 1 * SIZE(CO) FST 0 * SIZE(CO) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax sall $ZBASE_SHIFT, %eax leal (AO, %eax, 1), AO leal (BO, %eax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif addl $2 * SIZE, CO decl I jne .L11 #if defined(TRMMKERNEL) && !defined(LEFT) addl $1, KK #endif movl BO, B decl J jne .L01 ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/xgemv_n.S000066400000000000000000000173741313527062700166540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef PENTIUM #define P 32 #endif #if defined(PENTIUM4) || defined(ATHLON) #define P (DTB_DEFAULT_ENTRIES / 2) #endif #ifndef P #define P DTB_DEFAULT_ENTRIES #endif #define STACK 16 #define ARGS 16 #define PLDA_M 0 + STACK(%esp) #define XP 4 + STACK(%esp) #define MIN_N 8 + STACK(%esp) #define IS 12 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 32 + STACK + ARGS(%esp) #define A 48 + STACK + ARGS(%esp) #define LDA 52 + STACK + ARGS(%esp) #define X 56 + STACK + ARGS(%esp) #define INCX 60 + STACK + ARGS(%esp) #define Y 64 + STACK + ARGS(%esp) #define INCY 68 + STACK + ARGS(%esp) #define BUFFER 72 + STACK + ARGS(%esp) PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE FLD ALPHA_I FLD ALPHA_R movl X, %edi movl LDA, %ebx sall $ZBASE_SHIFT, %ebx movl $0, IS movl M, %ecx movl N, %esi test %ecx, %ecx jle .L79 # goto END test %esi, %esi jle .L79 # goto END movl INCY, %eax sall $ZBASE_SHIFT, %eax movl %eax, INCY movl LDA, %eax imull $P, %eax # P * lda subl M ,%eax # P * lda - m sall $ZBASE_SHIFT, %eax movl %eax, PLDA_M ALIGN_2 .L32: movl IS, %esi movl $P, %edx movl N, %eax subl %esi,%eax # n - is cmpl %edx, %eax #ifdef PENTIUM jle .L33 movl %edx, %eax .L33: #else cmovg %edx, %eax #endif movl %eax, MIN_N sall $ZBASE_SHIFT, %esi leal (%edi, %esi, 1), %esi movl %esi, XP movl INCX, %edx cmpl $1, %edx je .L34 # if incx == 1 goto L34 movl BUFFER, %esi movl %esi, XP # xp = buffer sall $ZBASE_SHIFT, %edx sarl $1,%eax jle .L35 ALIGN_2 .L36: FLD 0 * SIZE(%edi) FLD 1 * SIZE(%edi) addl %edx,%edi # x += incx FLD 0 * SIZE(%edi) FLD 1 * SIZE(%edi) addl %edx,%edi # x += incx FST 3 * SIZE(%esi) FST 2 * SIZE(%esi) FST 1 * SIZE(%esi) FST 0 * SIZE(%esi) addl $4 * SIZE, %esi # xp += 4 decl %eax jg .L36 ALIGN_3 .L35: movl MIN_N, %eax andl $1, %eax jle .L34 FLD 0 * SIZE(%edi) FLD 1 * SIZE(%edi) addl %edx,%edi # x += incx FST 1 * SIZE(%esi) FST 0 * SIZE(%esi) ALIGN_3 /* Main Routine */ .L34: movl Y, %ecx # c_offset movl M, %ebp # j = m ALIGN_3 .L61: movl A, %edx # a_offset = a fldz addl $2 * SIZE, A # a++ fldz movl XP,%esi fldz movl MIN_N,%eax fldz FLD (%esi) # bt1 = *(b_offset + 0) sarl $1, %eax jle .L64 ALIGN_3 .L65: #ifdef PENTIUM4 prefetchnta 16 * SIZE(%esi) #endif FLD 
0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) #ifndef CONJ faddp %st, %st(2) # ct2 += bt1 #else fsubrp %st, %st(2) # ct2 -= bt1 #endif FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FLD 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) faddp %st, %st(4) # ct4 += bt1 FLD 2 * SIZE(%esi) # bt1 = *(b_offset + 2) addl $2 * SIZE, %esi # b_offset += 2 addl %ebx, %edx # a_offset += lda FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) #ifndef CONJ faddp %st, %st(2) # ct2 += bt1 #else fsubrp %st, %st(2) # ct2 -= bt1 #endif FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FLD 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) faddp %st, %st(4) # ct4 += bt1 FLD 2 * SIZE(%esi) # bt1 = *(b_offset + 2) addl $2 * SIZE, %esi # b_offset += 2 addl %ebx, %edx # a_offset += lda decl %eax jg .L65 .L64: movl MIN_N, %eax andl $1, %eax jle .L70 ALIGN_2 .L71: FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) #ifndef CONJ faddp %st, %st(2) # ct2 += bt1 #else fsubrp %st, %st(2) # ct2 -= bt1 #endif FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FLD 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) faddp %st, %st(4) # ct4 += bt1 fldz ALIGN_2 .L70: ffreep %st(0) #ifndef XCONJ #ifndef CONJ fsubp %st, %st(3) faddp %st, %st(1) #else faddp %st, %st(3) faddp %st, %st(1) #endif #else #ifndef CONJ faddp %st, %st(3) fsubp %st, %st(1) #else fsubp %st, %st(3) fsubp %st, %st(1) #endif #endif fld %st(0) # ct4 = ct2 fmul %st(4) fld %st(2) fmul %st(4) fsubp %st, %st(1) movl INCY, %eax FLD 0 * SIZE(%ecx) faddp %st, %st(1) FST 0 * SIZE(%ecx) fmul %st(2) fxch %st(1) fmul %st(3) faddp %st, %st(1) FLD 1 * SIZE(%ecx) faddp %st, %st(1) FST 1 * SIZE(%ecx) addl %eax, %ecx decl %ebp jg .L61 .L60: movl PLDA_M, %esi addl %esi, A # a += P * lda - m addl $P, IS movl N, %esi cmpl %esi,IS jl .L32 .L79: ffreep %st(0) ffreep %st(0) popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/xgemv_t.S000066400000000000000000000174551313527062700166620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef PENTIUM #define P 88 #endif #ifndef P #define P 400 #endif #define STACK 16 #define ARGS 24 #define NLDA 0 + STACK(%esp) #define XP 4 + STACK(%esp) #define MIN_M 8 + STACK(%esp) #define J 12 + STACK(%esp) #define IS 16 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 32 + STACK + ARGS(%esp) #define A 48 + STACK + ARGS(%esp) #define LDA 52 + STACK + ARGS(%esp) #define X 56 + STACK + ARGS(%esp) #define INCX 60 + STACK + ARGS(%esp) #define Y 64 + STACK + ARGS(%esp) #define INCY 68 + STACK + ARGS(%esp) #define BUFFER 72 + STACK + ARGS(%esp) PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE FLD ALPHA_I FLD ALPHA_R movl X, %edi # X movl $0, IS movl M, %ebx movl N, %ecx testl %ebx, %ebx jle .L79 testl %ecx, %ecx jle .L79 movl INCX, %esi sall $ZBASE_SHIFT, %esi movl %esi, INCX movl INCY, %esi sall $ZBASE_SHIFT, %esi movl %esi, INCY movl LDA, %ebx movl N, %eax imull %ebx, %eax movl $P, %esi subl %eax, %esi sall $ZBASE_SHIFT, %esi movl %esi, NLDA movl %ebx, %esi sall $ZBASE_SHIFT, %esi movl %esi, LDA ALIGN_2 .L32: movl IS, %esi movl $P, %edx movl M, %eax subl %esi, %eax cmpl %edx, %eax #ifdef PENTIUM jle .L33 movl %edx, %eax .L33: #else cmovg %edx, %eax #endif movl %eax, MIN_M movl IS, %ecx sall $ZBASE_SHIFT, %ecx leal (%edi, %ecx, 1), %ecx # xp = x + is movl INCX, %ebx movl %ecx, XP cmpl $2 * SIZE, %ebx je .L34 movl BUFFER, %esi movl MIN_M, %eax movl %esi, XP sarl $1, %eax jle .L35 ALIGN_3 .L36: FLD 0 * SIZE(%edi) FLD 1 * SIZE(%edi) addl %ebx,%edi # x += incx FLD 0 * SIZE(%edi) FLD 1 * SIZE(%edi) addl %ebx,%edi # x += incx FST 3 * SIZE(%esi) FST 2 * SIZE(%esi) FST 1 * SIZE(%esi) FST 0 * SIZE(%esi) addl $4 * SIZE, %esi # xp += 4 decl %eax jg .L36 ALIGN_3 .L35: movl MIN_M, %eax andl $1,%eax jle .L34 FLD 0 * SIZE(%edi) FLD 1 * SIZE(%edi) addl %ebx,%edi # x += incx FST 1 * SIZE(%esi) FST 0 * SIZE(%esi) ALIGN_3 /* Main Routine */ .L34: movl Y, %ebp # coffset = y movl N, %ecx testl %ecx, %ecx jle .L60 ALIGN_2 .L61: movl A, %ebx # a_offset = a fldz # ct1 = ZERO movl LDA, %edx fldz # ct1 = ZERO addl %ebx, %edx fldz # ct1 = ZERO movl %edx, A fldz # ct1 = ZERO movl XP, %esi FLD (%esi) # bt1 = *(b_offset + 0) movl MIN_M, %eax sarl $1, %eax jle .L64 
ALIGN_3 #define PRESIZE 8 .L65: #ifdef HAS_PREFETCH prefetcht0 PRESIZE * SIZE(%ebx) prefetcht0 PRESIZE * SIZE(%esi) #endif FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) #ifndef CONJ faddp %st, %st(2) # ct2 += bt1 #else fsubrp %st, %st(2) # ct2 -= bt1 #endif FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FLD 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) faddp %st, %st(4) # ct4 += bt1 FLD 2 * SIZE(%esi) # bt1 = *(b_offset + 1) FLD 2 * SIZE(%ebx) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 3 * SIZE(%ebx) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) #ifndef CONJ faddp %st, %st(2) # ct2 += bt1 #else fsubrp %st, %st(2) # ct2 -= bt1 #endif FLD 3 * SIZE(%esi) # bt1 = *(b_offset + 1) FLD 2 * SIZE(%ebx) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FLD 3 * SIZE(%ebx) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) faddp %st, %st(4) # ct4 += bt1 FLD 4 * SIZE(%esi) # bt1 = *(b_offset + 1) addl $4 * SIZE, %esi addl $4 * SIZE, %ebx decl %eax jg .L65 ALIGN_3 .L64: movl MIN_M, %eax andl $1, %eax jle .L70 ALIGN_3 .L71: FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) #ifndef CONJ faddp %st, %st(2) # ct2 += bt1 #else fsubrp %st, %st(2) # ct2 -= bt1 #endif FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FLD 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) faddp %st, %st(4) # ct4 += bt1 fldz ALIGN_3 .L70: ffreep %st(0) #ifndef XCONJ #ifndef CONJ fsubp %st, %st(3) faddp %st, %st(1) #else faddp %st, %st(3) faddp %st, %st(1) #endif #else #ifndef CONJ faddp %st, %st(3) fsubp %st, %st(1) #else fsubp %st, %st(3) fsubp %st, %st(1) #endif #endif fld %st(0) # ct4 = ct2 fmul %st(4) fld %st(2) fmul %st(4) fsubp %st, %st(1) FLD 0 * SIZE(%ebp) faddp %st, %st(1) FST 0 * SIZE(%ebp) fmul %st(2) fxch %st(1) fmul %st(3) faddp %st, %st(1) FLD 1 * SIZE(%ebp) faddp %st, %st(1) FST 1 * SIZE(%ebp) addl INCY, %ebp decl %ecx jg .L61 ALIGN_3 .L60: movl A, %ebx addl NLDA, %ebx movl %ebx, A addl $P, IS movl M, %esi cmpl %esi, IS jl .L32 ALIGN_3 .L79: ffreep %st(0) ffreep %st(0) popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/xtrsm_kernel_LT_1x1.S000066400000000000000000000214461313527062700210110ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #else #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #define PREFETCHSIZE (5 + 4 * 10) #define STACK 16 #define ARGS 16 #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 32 + STACK + ARGS(%esp) #define A 48 + STACK + ARGS(%esp) #define ARG_B 52 + STACK + ARGS(%esp) #define C 56 + STACK + ARGS(%esp) #define ARG_LDC 60 + STACK + ARGS(%esp) #define OFFSET 64 + STACK + ARGS(%esp) #define I %esi #define B %ebx #define CO %edi #define AO %edx #define BO %ecx #define LDC %ebp #ifndef CONJ #define ADD1 faddp #define ADD2 fsubrp #define ADD3 faddp #define ADD4 faddp #elif defined(LN) || defined(LT) #define ADD1 faddp #define ADD2 faddp #define ADD3 fsubrp #define ADD4 faddp #else #define ADD1 faddp #define ADD2 faddp #define ADD3 faddp #define ADD4 fsubrp #endif #define PREFETCH_OFFSET 48 PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_LDC, LDC movl ARG_B, B sall $ZBASE_SHIFT, LDC addl $8 * SIZE, A addl $8 * SIZE, B #ifdef LN movl M, %eax sall $ZBASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $ZBASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN movl OFFSET, %eax negl %eax movl %eax, KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif cmpl $0, M jle .L999 movl N, %eax movl %eax, J testl %eax, %eax jle .L999 ALIGN_4 .L01: #if defined(LT) || defined(RN) movl A, AO #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, I ALIGN_4 .L11: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax sall $ZBASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 1), AO leal (B, %eax, 1), BO #else movl B, BO #endif fldz fldz fldz fldz #if 
defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st ADD1 %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) ADD2 %st, %st(6) ADD3 %st, %st(3) ADD4 %st, %st(3) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fld %st(1) fmul %st(1), %st ADD1 %st, %st(3) FLD -5 * SIZE(BO) fmul %st, %st(2) FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) ADD2 %st, %st(6) ADD3 %st, %st(3) ADD4 %st, %st(3) PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) FLD -4 * SIZE(AO) FLD -4 * SIZE(BO) fld %st(1) fmul %st(1), %st ADD1 %st, %st(3) FLD -3 * SIZE(BO) fmul %st, %st(2) FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) ADD2 %st, %st(6) ADD3 %st, %st(3) ADD4 %st, %st(3) FLD -2 * SIZE(AO) FLD -2 * SIZE(BO) fld %st(1) fmul %st(1), %st ADD1 %st, %st(3) FLD -1 * SIZE(BO) fmul %st, %st(2) FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) ADD2 %st, %st(6) ADD3 %st, %st(3) ADD4 %st, %st(3) addl $8 * SIZE,AO addl $8 * SIZE,BO decl %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif and $3, %eax je .L18 ALIGN_4 .L16: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st ADD1 %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) ADD2 %st, %st(6) ADD3 %st, %st(3) ADD4 %st, %st(3) addl $2 * SIZE,AO addl $2 * SIZE,BO decl %eax jne .L16 ALIGN_4 .L18: faddp %st, %st(3) faddp %st, %st(1) fxch %st(1) #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif sall $ZBASE_SHIFT, %eax movl AORIG, AO leal (AO, %eax, 1), AO leal (B, %eax, 1), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) FLD -7 * SIZE(BO) fsubp %st, %st(2) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) FLD -7 * SIZE(AO) fsubp %st, %st(2) #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(AO) fmul %st(1), %st FLD -8 * SIZE(AO) fmul %st(3), %st FLD -7 * SIZE(AO) fmulp %st, %st(3) FLD -7 * SIZE(AO) fmulp %st, %st(4) #endif #if defined(RN) || defined(RT) FLD -8 * SIZE(BO) fmul %st(1), %st FLD -8 * SIZE(BO) fmul %st(3), %st FLD -7 * SIZE(BO) fmulp %st, %st(3) FLD -7 * SIZE(BO) fmulp %st, %st(4) #endif #ifndef CONJ faddp %st, %st(2) fsubp %st, %st(2) #else fsubp %st, %st(2) faddp %st, %st(2) #endif #if defined(LN) || defined(LT) fld %st FST -7 * SIZE(BO) fxch %st(1) fld %st FST -8 * SIZE(BO) #else fld %st FST -7 * SIZE(AO) fxch %st(1) fld %st FST -8 * SIZE(AO) #endif #ifdef LN subl $2 * SIZE, CO #endif FST 0 * SIZE(CO) FST 1 * SIZE(CO) #ifndef LN addl $2 * SIZE, CO #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax leal (AO, %eax, 1), AO leal (BO, %eax, 1), BO #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl I jne .L11 #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax leal (B, %eax, 1), B #endif #if defined(LT) || defined(RN) movl BO, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif decl J jne .L01 ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE 
OpenBLAS-0.2.20/kernel/x86/zamax.S000066400000000000000000000122101313527062700163110ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 8 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) PROLOGUE #define M %ebx #define INCX %esi #define X %ecx #define I %edx #ifndef USE_MIN #define FMOV fcmovbe #else #define FMOV fcmovnbe #endif #include "l1param.h" pushl %esi pushl %ebx PROFCODE movl STACK_M, M movl STACK_INCX, INCX movl STACK_X, X #ifdef F_INTERFACE movl (M), M movl (INCX), INCX #endif #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif sall $ZBASE_SHIFT, INCX fldz testl M, M jle .L999 testl INCX, INCX jle .L999 fstp %st(0) FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs faddp %st, %st(1) addl INCX, X decl M jle .L999 cmpl $2 * SIZE, INCX jne .L40 movl M, I sarl $2, I jle .L20 ALIGN_4 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 2 * SIZE(X) fabs FLD 3 * SIZE(X) fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 4 * SIZE(X) fabs FLD 5 * SIZE(X) fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 6 * SIZE(X) fabs FLD 7 * SIZE(X) fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) addl $8 * SIZE, X decl I jg .L10 ALIGN_4 .L20: movl M, I andl $3, I jle .L999 ALIGN_4 .L21: FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) addl $2 * SIZE, X decl I jg .L21 jmp .L999 ALIGN_4 .L40: movl M, I sarl $2, I jle .L60 ALIGN_4 .L50: FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs addl INCX, X faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs addl INCX, X faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs addl INCX, X faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs addl INCX, X faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) decl I jg .L50 ALIGN_4 .L60: movl M, I andl $3, I jle .L999 ALIGN_4 .L61: FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) addl INCX, X decl I jg .L61 ALIGN_4 .L999: popl %ebx popl %esi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zamax_sse.S000066400000000000000000000172601313527062700171750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define RET %eax #define M %ebx #define X %ecx #define INCX %edx #define I %esi #define MM %ebp #define XX %edi #define TEMP %ebx #ifdef USE_MIN #define maxps minps #define maxss minss #endif #ifndef HAVE_SSE2 #define pxor xorps #define movsd movlps #endif #include "l1param.h" PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX pxor %xmm0, %xmm0 pxor %xmm7, %xmm7 xor RET, RET testl M, M jle .L999 testl INCX, INCX jle .L999 sall $ZBASE_SHIFT, INCX movl M, MM movl X, XX #ifdef USE_ABS #ifndef HAVE_SSE2 subl $8, %esp movl $0x7fffffff, (%esp) movss (%esp), %xmm7 shufps $0, %xmm7, %xmm7 addl $8, %esp #else cmpeqps %xmm7, %xmm7 psrld $1, %xmm7 /* Generate USE_ABS */ #endif #endif movss 0 * SIZE(XX), %xmm0 movss 1 * SIZE(XX), %xmm1 addl INCX, XX decl MM #ifdef USE_ABS andps %xmm7, %xmm0 andps %xmm7, %xmm1 #endif addps %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 cmpl $2 * SIZE, INCX jne .L70 .L30: movl MM, I sarl $3, I jle .L35 ALIGN_4 .L31: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movsd 0 * SIZE(XX), %xmm1 movhps 2 * SIZE(XX), %xmm1 movsd 4 * SIZE(XX), %xmm2 movhps 6 * SIZE(XX), %xmm2 movaps %xmm1, %xmm3 shufps $0x88, %xmm2, %xmm1 shufps $0xdd, %xmm2, %xmm3 #ifdef USE_ABS andps %xmm7, %xmm1 andps %xmm7, %xmm3 addps %xmm3, %xmm1 maxps %xmm1, %xmm0 #endif movsd 8 * SIZE(XX), %xmm1 movhps 10 * SIZE(XX), %xmm1 movsd 12 * SIZE(XX), %xmm2 movhps 14 * SIZE(XX), %xmm2 movaps %xmm1, %xmm3 shufps $0x88, %xmm2, %xmm1 shufps $0xdd, %xmm2, %xmm3 #ifdef USE_ABS andps %xmm7, %xmm1 andps %xmm7, %xmm3 #endif addps %xmm3, %xmm1 maxps %xmm1, %xmm0 addl $16 * SIZE, XX decl I jg .L31 ALIGN_4 .L35: andl $7, MM jle .L40 testl $4, MM je .L36 movsd 0 * SIZE(XX), %xmm1 movhps 2 * SIZE(XX), %xmm1 movsd 4 * SIZE(XX), %xmm2 movhps 6 * SIZE(XX), %xmm2 movaps %xmm1, %xmm3 shufps $0x88, %xmm2, %xmm1 shufps $0xdd, %xmm2, %xmm3 #ifdef USE_ABS andps %xmm7, %xmm1 andps %xmm7, %xmm3 #endif addps %xmm3, %xmm1 maxps %xmm1, %xmm0 addl $8 * SIZE, XX ALIGN_3 .L36: testl $2, MM je .L37 movss 0 * SIZE(XX), %xmm1 movss 1 * SIZE(XX), %xmm2 movss 2 * SIZE(XX), %xmm3 movss 3 * SIZE(XX), %xmm4 #ifdef USE_ABS 
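/* USE_ABS: %xmm7 was loaded above with 0x7fffffff in every lane, so the
   andps instructions below clear the sign bits; |re| and |im| are then
   summed and folded into the running maximum in %xmm0 (maxss is redefined
   to minss when USE_MIN is set). */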
andps %xmm7, %xmm1 andps %xmm7, %xmm2 andps %xmm7, %xmm3 andps %xmm7, %xmm4 #endif addps %xmm2, %xmm1 addps %xmm4, %xmm3 maxss %xmm1, %xmm0 maxss %xmm3, %xmm0 addl $4 * SIZE, XX ALIGN_3 .L37: testl $1, MM je .L40 movss 0 * SIZE(XX), %xmm1 movss 1 * SIZE(XX), %xmm2 #ifdef USE_ABS andps %xmm7, %xmm1 andps %xmm7, %xmm2 #endif addps %xmm2, %xmm1 maxss %xmm1, %xmm0 ALIGN_4 .L40: movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 maxps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 maxss %xmm1, %xmm0 jmp .L999 ALIGN_4 .L70: movl MM, I sarl $3, I jle .L75 ALIGN_4 .L71: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movsd 0 * SIZE(XX), %xmm1 addl INCX, XX movhps 0 * SIZE(XX), %xmm1 addl INCX, XX movsd 0 * SIZE(XX), %xmm2 addl INCX, XX movhps 0 * SIZE(XX), %xmm2 addl INCX, XX movaps %xmm1, %xmm3 shufps $0x88, %xmm2, %xmm1 shufps $0xdd, %xmm2, %xmm3 #ifdef USE_ABS andps %xmm7, %xmm1 andps %xmm7, %xmm3 #endif addps %xmm3, %xmm1 maxps %xmm1, %xmm0 movsd 0 * SIZE(XX), %xmm1 addl INCX, XX movhps 0 * SIZE(XX), %xmm1 addl INCX, XX movsd 0 * SIZE(XX), %xmm2 addl INCX, XX movhps 0 * SIZE(XX), %xmm2 addl INCX, XX movaps %xmm1, %xmm3 shufps $0x88, %xmm2, %xmm1 shufps $0xdd, %xmm2, %xmm3 #ifdef USE_ABS andps %xmm7, %xmm1 andps %xmm7, %xmm3 #endif addps %xmm3, %xmm1 maxps %xmm1, %xmm0 decl I jg .L71 ALIGN_4 .L75: andl $7, MM jle .L80 testl $4, MM je .L76 movsd 0 * SIZE(XX), %xmm1 addl INCX, XX movhps 0 * SIZE(XX), %xmm1 addl INCX, XX movsd 0 * SIZE(XX), %xmm2 addl INCX, XX movhps 0 * SIZE(XX), %xmm2 addl INCX, XX movaps %xmm1, %xmm3 shufps $0x88, %xmm2, %xmm1 shufps $0xdd, %xmm2, %xmm3 #ifdef USE_ABS andps %xmm7, %xmm1 andps %xmm7, %xmm3 #endif addps %xmm3, %xmm1 maxps %xmm1, %xmm0 ALIGN_3 .L76: testl $2, MM je .L77 movss 0 * SIZE(XX), %xmm1 movss 1 * SIZE(XX), %xmm2 addl INCX, XX movss 0 * SIZE(XX), %xmm3 movss 1 * SIZE(XX), %xmm4 addl INCX, XX #ifdef USE_ABS andps %xmm7, %xmm1 andps %xmm7, %xmm2 andps %xmm7, %xmm3 andps %xmm7, %xmm4 #endif addps %xmm2, %xmm1 addps %xmm4, %xmm3 maxss %xmm1, %xmm0 maxss %xmm3, %xmm0 ALIGN_3 .L77: testl $1, MM je .L80 movss 0 * SIZE(XX), %xmm1 movss 1 * SIZE(XX), %xmm2 #ifdef USE_ABS andps %xmm7, %xmm1 andps %xmm7, %xmm2 #endif addps %xmm2, %xmm1 maxss %xmm1, %xmm0 ALIGN_4 .L80: movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 maxps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 maxss %xmm1, %xmm0 ALIGN_4 .L999: subl $8, %esp movss %xmm0, (%esp) flds (%esp) addl $8, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zamax_sse2.S000066400000000000000000000173571313527062700172660ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define RET %eax #define M %ebx #define X %ecx #define INCX %edx #define I %esi #define MM %ebp #define XX %edi #define TEMP %ebx #ifdef USE_MIN #define maxpd minpd #define maxsd minsd #endif #include "l1param.h" PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX pxor %xmm0, %xmm0 pxor %xmm7, %xmm7 xor RET, RET testl M, M jle .L999 testl INCX, INCX jle .L999 sall $ZBASE_SHIFT, INCX movl M, MM movl X, XX cmpeqpd %xmm7, %xmm7 psrlq $1, %xmm7 movsd 0 * SIZE(XX), %xmm0 movsd 1 * SIZE(XX), %xmm1 addl INCX, XX decl MM andpd %xmm7, %xmm0 andpd %xmm7, %xmm1 addpd %xmm1, %xmm0 unpcklpd %xmm0, %xmm0 cmpl $2 * SIZE, INCX jne .L60 movl MM, I sarl $3, I jle .L25 ALIGN_4 .L21: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 movhpd 2 * SIZE(XX), %xmm1 movhpd 3 * SIZE(XX), %xmm2 andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxpd %xmm1, %xmm0 movsd 4 * SIZE(XX), %xmm3 movsd 5 * SIZE(XX), %xmm4 movhpd 6 * SIZE(XX), %xmm3 movhpd 7 * SIZE(XX), %xmm4 andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 addpd %xmm4, %xmm3 maxpd %xmm3, %xmm0 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX) #endif movsd 8 * SIZE(XX), %xmm1 movsd 9 * SIZE(XX), %xmm2 movhpd 10 * SIZE(XX), %xmm1 movhpd 11 * SIZE(XX), %xmm2 andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxpd %xmm1, %xmm0 movsd 12 * SIZE(XX), %xmm3 movsd 13 * SIZE(XX), %xmm4 movhpd 14 * SIZE(XX), %xmm3 movhpd 15 * SIZE(XX), %xmm4 andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 addpd %xmm4, %xmm3 maxpd %xmm3, %xmm0 addl $16 * SIZE, XX decl I jg .L21 ALIGN_4 .L25: andl $7, MM jle .L30 testl $4, MM je .L26 movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 movhpd 2 * SIZE(XX), %xmm1 movhpd 3 * SIZE(XX), %xmm2 andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxpd %xmm1, %xmm0 movsd 4 * SIZE(XX), %xmm3 movsd 5 * SIZE(XX), %xmm4 movhpd 6 * SIZE(XX), %xmm3 movhpd 7 * SIZE(XX), %xmm4 andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 addpd %xmm4, %xmm3 maxpd %xmm3, %xmm0 addl $8 * SIZE, XX ALIGN_3 .L26: testl $2, MM je .L27 movsd 0 * SIZE(XX), %xmm1 movsd 
1 * SIZE(XX), %xmm2 movhpd 2 * SIZE(XX), %xmm1 movhpd 3 * SIZE(XX), %xmm2 andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxpd %xmm1, %xmm0 addl $4 * SIZE, XX ALIGN_3 .L27: testl $1, MM je .L30 movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxsd %xmm1, %xmm0 ALIGN_4 .L30: movapd %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 maxsd %xmm1, %xmm0 jmp .L999 ALIGN_3 .L60: movl MM, I sarl $3, I jle .L65 ALIGN_4 .L61: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 addl INCX, XX movhpd 0 * SIZE(XX), %xmm1 movhpd 1 * SIZE(XX), %xmm2 addl INCX, XX andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxpd %xmm1, %xmm0 movsd 0 * SIZE(XX), %xmm3 movsd 1 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm3 movhpd 1 * SIZE(XX), %xmm4 addl INCX, XX andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 addpd %xmm4, %xmm3 maxpd %xmm3, %xmm0 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) #endif movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 addl INCX, XX movhpd 0 * SIZE(XX), %xmm1 movhpd 1 * SIZE(XX), %xmm2 addl INCX, XX andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxpd %xmm1, %xmm0 movsd 0 * SIZE(XX), %xmm3 movsd 1 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm3 movhpd 1 * SIZE(XX), %xmm4 addl INCX, XX andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 addpd %xmm4, %xmm3 maxpd %xmm3, %xmm0 decl I jg .L61 ALIGN_4 .L65: andl $7, MM jle .L70 testl $4, MM je .L66 movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 addl INCX, XX movhpd 0 * SIZE(XX), %xmm1 movhpd 1 * SIZE(XX), %xmm2 addl INCX, XX andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxpd %xmm1, %xmm0 movsd 0 * SIZE(XX), %xmm3 movsd 1 * SIZE(XX), %xmm4 addl INCX, XX movhpd 0 * SIZE(XX), %xmm3 movhpd 1 * SIZE(XX), %xmm4 addl INCX, XX andpd %xmm7, %xmm3 andpd %xmm7, %xmm4 addpd %xmm4, %xmm3 maxpd %xmm3, %xmm0 ALIGN_3 .L66: testl $2, MM je .L67 movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 addl INCX, XX movhpd 0 * SIZE(XX), %xmm1 movhpd 1 * SIZE(XX), %xmm2 addl INCX, XX andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxpd %xmm1, %xmm0 ALIGN_3 .L67: testl $1, MM je .L70 movsd 0 * SIZE(XX), %xmm1 movsd 1 * SIZE(XX), %xmm2 andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxsd %xmm1, %xmm0 ALIGN_3 .L70: movapd %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 maxsd %xmm1, %xmm0 ALIGN_4 .L999: subl $8, %esp movsd %xmm0, (%esp) fldl (%esp) addl $8, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zasum.S000066400000000000000000000113151313527062700163350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 8 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define M %edx #define X %ecx #define INCX %esi #define I %eax #include "l1param.h" PROLOGUE pushl %esi pushl %ebx PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX #ifdef F_INTERFACE movl (M), M movl (INCX), INCX #endif fldz testl M, M jle .L999 testl INCX, INCX jle .L999 sall $ZBASE_SHIFT, INCX fldz fldz fldz cmpl $SIZE * 2, INCX jne .L40 movl M, I sarl $2, I jle .L20 ALIGN_4 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs FLD 2 * SIZE(X) fabs FLD 3 * SIZE(X) fabs faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) FLD 4 * SIZE(X) fabs FLD 5 * SIZE(X) fabs FLD 6 * SIZE(X) fabs FLD 7 * SIZE(X) fabs addl $8 * SIZE, X faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) decl I jg .L10 ALIGN_4 .L20: movl M, I andl $3, I jle .L998 ALIGN_4 .L21: FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs faddp %st,%st(3) faddp %st,%st(1) addl $2 * SIZE, X decl I jg .L21 jmp .L998 ALIGN_4 .L40: movl M, I sarl $2, I jle .L60 ALIGN_4 .L50: FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) addl INCX, X fabs FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) addl INCX, X fabs faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) addl INCX, X fabs FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) addl INCX, X fabs faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) decl I jg .L50 ALIGN_4 .L60: movl M, I andl $3, I jle .L998 ALIGN_4 .L61: FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) addl INCX, X fabs faddp %st,%st(3) faddp %st,%st(1) decl I jg .L61 ALIGN_4 .L998: faddp %st,%st(2) faddp %st,%st(1) faddp %st,%st(1) ALIGN_4 .L999: popl %ebx popl %esi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zasum_sse.S000066400000000000000000000152531313527062700172140ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 8 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define I %eax #define M %ecx #define X %esi #define INCX %ebx #include "l1param.h" PROLOGUE PROFCODE pushl %esi pushl %ebx movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 testl M, M jle .L999 testl INCX, INCX jle .L999 #ifdef HAVE_SSE2 pcmpeqb %xmm3, %xmm3 psrld $1, %xmm3 #else movl $0x7fffffff, STACK_M movss STACK_M, %xmm3 shufps $0, %xmm3, %xmm3 #endif sall $ZBASE_SHIFT, INCX cmpl $2 * SIZE, INCX jne .L100 subl $-32 * SIZE, X addl M, M cmpl $3, M jle .L18 testl $4, X je .L05 movss -32 * SIZE(X), %xmm0 andps %xmm3, %xmm0 addl $SIZE, X decl M jle .L999 ALIGN_3 .L05: testl $8, X je .L10 movsd -32 * SIZE(X), %xmm1 andps %xmm3, %xmm1 addl $2 * SIZE, X subl $2, M jle .L999 ALIGN_3 .L10: movl M, I sarl $5, I jle .L14 movaps -32 * SIZE(X), %xmm4 movaps -28 * SIZE(X), %xmm5 movaps -24 * SIZE(X), %xmm6 movaps -20 * SIZE(X), %xmm7 decl I jle .L12 ALIGN_3 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif andps %xmm3, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(X), %xmm4 andps %xmm3, %xmm5 addps %xmm5, %xmm1 movaps -12 * SIZE(X), %xmm5 andps %xmm3, %xmm6 addps %xmm6, %xmm0 movaps -8 * SIZE(X), %xmm6 andps %xmm3, %xmm7 addps %xmm7, %xmm1 movaps -4 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif andps %xmm3, %xmm4 addps %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 andps %xmm3, %xmm5 addps %xmm5, %xmm1 movaps 4 * SIZE(X), %xmm5 andps %xmm3, %xmm6 addps %xmm6, %xmm0 movaps 8 * SIZE(X), %xmm6 andps %xmm3, %xmm7 addps %xmm7, %xmm1 movaps 12 * SIZE(X), %xmm7 subl $-32 * SIZE, X decl I jg 
.L11 ALIGN_3 .L12: andps %xmm3, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(X), %xmm4 andps %xmm3, %xmm5 addps %xmm5, %xmm1 movaps -12 * SIZE(X), %xmm5 andps %xmm3, %xmm6 addps %xmm6, %xmm0 movaps -8 * SIZE(X), %xmm6 andps %xmm3, %xmm7 addps %xmm7, %xmm1 movaps -4 * SIZE(X), %xmm7 andps %xmm3, %xmm4 addps %xmm4, %xmm0 andps %xmm3, %xmm5 addps %xmm5, %xmm1 andps %xmm3, %xmm6 addps %xmm6, %xmm0 andps %xmm3, %xmm7 addps %xmm7, %xmm1 addl $32 * SIZE, X ALIGN_3 .L14: testl $16, M je .L16 movaps -32 * SIZE(X), %xmm4 andps %xmm3, %xmm4 addps %xmm4, %xmm0 movaps -28 * SIZE(X), %xmm5 andps %xmm3, %xmm5 addps %xmm5, %xmm1 movaps -24 * SIZE(X), %xmm6 andps %xmm3, %xmm6 addps %xmm6, %xmm0 movaps -20 * SIZE(X), %xmm7 andps %xmm3, %xmm7 addps %xmm7, %xmm1 addl $16 * SIZE, X ALIGN_3 .L16: testl $8, M je .L17 movaps -32 * SIZE(X), %xmm4 andps %xmm3, %xmm4 addps %xmm4, %xmm0 movaps -28 * SIZE(X), %xmm5 andps %xmm3, %xmm5 addps %xmm5, %xmm1 addl $8 * SIZE, X ALIGN_3 .L17: testl $4, M je .L18 movaps -32 * SIZE(X), %xmm4 andps %xmm3, %xmm4 addps %xmm4, %xmm0 addl $4 * SIZE, X ALIGN_3 .L18: testl $2, M je .L19 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 andps %xmm3, %xmm4 addps %xmm4, %xmm1 addl $2 * SIZE, X ALIGN_3 .L19: testl $1, M je .L999 movss -32 * SIZE(X), %xmm4 andps %xmm3, %xmm4 addps %xmm4, %xmm0 jmp .L999 ALIGN_4 .L100: movl M, I sarl $2, I jle .L105 ALIGN_4 .L101: movsd (X), %xmm4 addl INCX, X movhps (X), %xmm4 addl INCX, X andps %xmm3, %xmm4 addps %xmm4, %xmm0 movsd (X), %xmm5 addl INCX, X movhps (X), %xmm5 addl INCX, X andps %xmm3, %xmm5 addps %xmm5, %xmm1 decl I jg .L101 ALIGN_4 .L105: #ifdef movsd xorps %xmm4, %xmm4 #endif andl $3, M jle .L999 ALIGN_4 .L106: movsd (X), %xmm4 andps %xmm3, %xmm4 addps %xmm4, %xmm0 addl INCX, X decl M jg .L106 ALIGN_4 .L999: addps %xmm1, %xmm0 #ifndef HAVE_SSE3 movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 addss %xmm1, %xmm0 #else haddps %xmm0, %xmm0 haddps %xmm0, %xmm0 #endif movss %xmm0, STACK_M flds STACK_M popl %ebx popl %esi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zasum_sse2.S000066400000000000000000000150321313527062700172710ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 8 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define I %eax #define M %ecx #define X %esi #define INCX %ebx #define xmm8 xmm4 #define xmm9 xmm5 #define xmm10 xmm6 #define xmm11 xmm7 #include "l1param.h" PROLOGUE PROFCODE pushl %esi pushl %ebx movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 testl M, M jle .L999 testl INCX, INCX jle .L999 pcmpeqb %xmm3, %xmm3 psrlq $1, %xmm3 sall $ZBASE_SHIFT, INCX cmpl $2 * SIZE, INCX jne .L40 subl $-16 * SIZE, X addl M, M testl $SIZE, X je .L05 movsd -16 * SIZE(X), %xmm0 addl $SIZE, X andps %xmm3, %xmm0 subl $1, M jle .L999 ALIGN_3 .L05: movl M, I sarl $4, I jle .L20 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -12 * SIZE(X), %xmm6 movaps -10 * SIZE(X), %xmm7 decl I jle .L11 ALIGN_4 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif andps %xmm3, %xmm4 addpd %xmm4, %xmm0 movaps -8 * SIZE(X), %xmm4 andps %xmm3, %xmm5 addpd %xmm5, %xmm1 movaps -6 * SIZE(X), %xmm5 andps %xmm3, %xmm6 addpd %xmm6, %xmm0 movaps -4 * SIZE(X), %xmm6 andps %xmm3, %xmm7 addpd %xmm7, %xmm1 movaps -2 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif andps %xmm3, %xmm4 addpd %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 andps %xmm3, %xmm5 addpd %xmm5, %xmm1 movaps 2 * SIZE(X), %xmm5 andps %xmm3, %xmm6 addpd %xmm6, %xmm0 movaps 4 * SIZE(X), %xmm6 andps %xmm3, %xmm7 addpd %xmm7, %xmm1 movaps 6 * SIZE(X), %xmm7 subl $-16 * SIZE, X decl I jg .L10 ALIGN_4 .L11: andps %xmm3, %xmm4 addpd %xmm4, %xmm0 movaps -8 * SIZE(X), %xmm4 andps %xmm3, %xmm5 addpd %xmm5, %xmm1 movaps -6 * SIZE(X), %xmm5 andps %xmm3, %xmm6 addpd %xmm6, %xmm0 movaps -4 * SIZE(X), %xmm6 andps %xmm3, %xmm7 addpd %xmm7, %xmm1 movaps -2 * SIZE(X), %xmm7 andps %xmm3, %xmm4 addpd %xmm4, %xmm0 andps %xmm3, %xmm5 addpd %xmm5, %xmm1 andps %xmm3, %xmm6 addpd %xmm6, %xmm0 andps %xmm3, %xmm7 addpd %xmm7, %xmm1 subl $-16 * SIZE, X ALIGN_3 .L20: andl $15, M jle .L999 testl $8, M je .L21 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -12 * SIZE(X), %xmm6 movaps -10 * SIZE(X), %xmm7 andps %xmm3, %xmm4 addpd %xmm4, %xmm0 andps %xmm3, %xmm5 addpd %xmm5, %xmm1 andps %xmm3, %xmm6 addpd %xmm6, %xmm0 andps %xmm3, %xmm7 addpd %xmm7, %xmm1 addl $8 * SIZE, X ALIGN_3 .L21: testl $4, M je .L22 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 andps %xmm3, %xmm4 addpd %xmm4, %xmm0 andps %xmm3, %xmm5 addpd %xmm5, %xmm1 addl $4 * SIZE, X ALIGN_3 .L22: testl 
$2, M je .L23 movaps -16 * SIZE(X), %xmm4 andps %xmm3, %xmm4 addpd %xmm4, %xmm0 addl $2 * SIZE, X .L23: testl $1, M je .L999 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -16 * SIZE(X), %xmm4 andps %xmm3, %xmm4 addsd %xmm4, %xmm0 jmp .L999 ALIGN_3 .L40: movl M, I sarl $2, I jle .L60 ALIGN_4 .L50: movsd 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X andps %xmm3, %xmm4 addpd %xmm4, %xmm0 movsd 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X andps %xmm3, %xmm5 addpd %xmm5, %xmm1 movsd 0 * SIZE(X), %xmm6 movhps 1 * SIZE(X), %xmm6 addl INCX, X andps %xmm3, %xmm6 addpd %xmm6, %xmm0 movsd 0 * SIZE(X), %xmm7 movhps 1 * SIZE(X), %xmm7 addl INCX, X andps %xmm3, %xmm7 addpd %xmm7, %xmm1 decl I jg .L50 ALIGN_4 .L60: andl $3, M jle .L999 ALIGN_4 .L61: movsd 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 andps %xmm3, %xmm4 addpd %xmm4, %xmm0 addl INCX, X decl M jg .L61 ALIGN_4 .L999: addpd %xmm1, %xmm0 #ifndef HAVE_SSE3 movaps %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 addsd %xmm1, %xmm0 #else haddpd %xmm0, %xmm0 #endif movsd %xmm0, STACK_M fldl STACK_M popl %ebx popl %esi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zaxpy.S000066400000000000000000000160621313527062700163550ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #ifdef DOUBLE #define STACK_ALPHA_R 16 + STACK + ARGS(%esp) #define STACK_ALPHA_I 24 + STACK + ARGS(%esp) #define STACK_X 32 + STACK + ARGS(%esp) #define STACK_INCX 36 + STACK + ARGS(%esp) #define STACK_Y 40 + STACK + ARGS(%esp) #define STACK_INCY 44 + STACK + ARGS(%esp) #else #define STACK_ALPHA_R 16 + STACK + ARGS(%esp) #define STACK_ALPHA_I 20 + STACK + ARGS(%esp) #define STACK_X 24 + STACK + ARGS(%esp) #define STACK_INCX 28 + STACK + ARGS(%esp) #define STACK_Y 32 + STACK + ARGS(%esp) #define STACK_INCY 36 + STACK + ARGS(%esp) #endif #define M %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx #ifndef CONJ #define ADD1 fsubrp #define ADD2 faddp #else #define ADD1 faddp #define ADD2 fsubrp #endif PROLOGUE pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif FLD STACK_ALPHA_I FLD STACK_ALPHA_R movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY addl INCX, INCX addl INCY, INCY leal (, INCX, SIZE), INCX leal (, INCY, SIZE), INCY testl M, M jle .L40 cmpl $2 * SIZE, INCX jne .L14 cmpl $2 * SIZE, INCY jne .L14 movl M, %eax sarl $2, %eax jle .L15 ALIGN_3 .L16: FLD 0 * SIZE(X) fmul %st(1), %st FLD 1 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FADD 0 * SIZE(Y) FST 0 * SIZE(Y) FLD 0 * SIZE(X) fmul %st(2), %st FLD 1 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FADD 1 * SIZE(Y) FST 1 * SIZE(Y) FLD 2 * SIZE(X) fmul %st(1), %st FLD 3 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FADD 2 * SIZE(Y) FST 2 * SIZE(Y) FLD 2 * SIZE(X) fmul %st(2), %st FLD 3 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FADD 3 * SIZE(Y) FST 3 * SIZE(Y) FLD 4 * SIZE(X) fmul %st(1), %st FLD 5 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FADD 4 * SIZE(Y) FST 4 * SIZE(Y) FLD 4 * SIZE(X) fmul %st(2), %st FLD 5 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FADD 5 * SIZE(Y) FST 5 * SIZE(Y) FLD 6 * SIZE(X) fmul %st(1), %st FLD 7 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FADD 6 * SIZE(Y) FST 6 * SIZE(Y) FLD 6 * SIZE(X) fmul %st(2), %st FLD 7 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FADD 7 * SIZE(Y) FST 7 * SIZE(Y) #ifdef HAVE_3DNOW prefetch 20 * SIZE(X) prefetchw 20 * SIZE(Y) #endif addl $8 * SIZE, X addl $8 * SIZE, Y decl %eax jg .L16 ALIGN_3 .L15: movl M, %eax andl $3, %eax jle .L40 ALIGN_3 .L22: FLD 0 * SIZE(X) fmul %st(1), %st FLD 1 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FADD 0 * SIZE(Y) FST 0 * SIZE(Y) FLD 0 * SIZE(X) fmul %st(2), %st FLD 1 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FADD 1 * SIZE(Y) FST 1 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y decl %eax jg .L22 jmp .L40 ALIGN_3 .L14: movl M, %eax sarl $2, %eax jle .L28 ALIGN_3 .L29: FLD 0 * SIZE(X) fmul %st(1), %st FLD 1 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FADD 0 * SIZE(Y) FST 0 * SIZE(Y) FLD 0 * SIZE(X) fmul %st(2), %st FLD 1 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FADD 1 * SIZE(Y) FST 1 * SIZE(Y) addl INCX, X addl INCY, Y FLD 0 * SIZE(X) fmul %st(1), %st FLD 1 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FADD 0 * SIZE(Y) FST 0 * SIZE(Y) FLD 0 * SIZE(X) fmul %st(2), %st FLD 1 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FADD 1 * SIZE(Y) FST 1 * SIZE(Y) addl INCX, X addl INCY, Y FLD 0 * SIZE(X) fmul %st(1), %st FLD 1 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FADD 0 * SIZE(Y) FST 0 * SIZE(Y) FLD 0 * SIZE(X) fmul %st(2), %st FLD 1 * SIZE(X) fmul %st(2), %st ADD2 
%st, %st(1) FADD 1 * SIZE(Y) FST 1 * SIZE(Y) addl INCX, X addl INCY, Y FLD 0 * SIZE(X) fmul %st(1), %st FLD 1 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FADD 0 * SIZE(Y) FST 0 * SIZE(Y) FLD 0 * SIZE(X) fmul %st(2), %st FLD 1 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FADD 1 * SIZE(Y) FST 1 * SIZE(Y) addl INCX, X addl INCY, Y decl %eax jg .L29 ALIGN_3 .L28: movl M, %eax andl $3, %eax jle .L40 ALIGN_3 .L35: FLD 0 * SIZE(X) fmul %st(1), %st FLD 1 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FADD 0 * SIZE(Y) FST 0 * SIZE(Y) FLD 0 * SIZE(X) fmul %st(2), %st FLD 1 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FADD 1 * SIZE(Y) FST 1 * SIZE(Y) addl INCX, X addl INCY, Y decl %eax jg .L35 ALIGN_3 .L40: ffreep %st(0) ffreep %st(0) xorl %eax,%eax popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zaxpy_sse.S000066400000000000000000001747221313527062700172370ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_ALPHA_R 16 + STACK + ARGS(%esp) #define STACK_ALPHA_I 20 + STACK + ARGS(%esp) #define STACK_X 24 + STACK + ARGS(%esp) #define STACK_INCX 28 + STACK + ARGS(%esp) #define STACK_Y 32 + STACK + ARGS(%esp) #define STACK_INCY 36 + STACK + ARGS(%esp) #define M %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx #define YY %ebp #define ALPHA_R %xmm6 #define ALPHA_I %xmm7 #include "l1param.h" PROLOGUE PROFCODE pushl %edi pushl %esi pushl %ebx pushl %ebp movl STACK_M, M movss STACK_ALPHA_R, ALPHA_R movss STACK_ALPHA_I, ALPHA_I movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY sall $ZBASE_SHIFT, INCX sall $ZBASE_SHIFT, INCY testl M, M jle .L999 cmpl $2 * SIZE, INCX jne .L100 cmpl $2 * SIZE, INCY jne .L100 #ifdef HAVE_SSE2 pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 #else movl $0x80000000, STACK_M movss STACK_M, %xmm5 shufps $0x11, %xmm5, %xmm5 #endif shufps $0, ALPHA_R, ALPHA_R shufps $0, ALPHA_I, ALPHA_I #ifndef CONJ shufps $0xb1, %xmm5, %xmm5 xorps %xmm5, ALPHA_I #else xorps %xmm5, ALPHA_R #endif subl $-32 * SIZE, X subl $-32 * SIZE, Y testl $2 * SIZE, Y je .L10 #ifndef HAVE_SSE2 xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(X), %xmm0 #ifndef HAVE_SSE2 xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(Y), %xmm1 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 addps %xmm1, %xmm0 movlps %xmm0, -32 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y decl M jle .L999 ALIGN_2 .L10: testl $SIZE, Y jne .L50 testl $3 * SIZE, X jne .L20 movl M, %eax sarl $4, %eax jle .L15 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 decl %eax jle .L12 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -32 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -16 * SIZE(X), %xmm0 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -28 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -12 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps -24 * SIZE(Y), %xmm2 addps %xmm5, %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps -8 * SIZE(X), %xmm2 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps -20 * SIZE(Y), %xmm3 addps %xmm5, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps -4 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -16 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps 0 * SIZE(X), %xmm0 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -12 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -12 * SIZE(Y) movaps 4 * SIZE(X), %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps -8 * SIZE(Y), %xmm2 addps %xmm5, %xmm2 movaps %xmm2, -8 * SIZE(Y) movaps 8 * SIZE(X), %xmm2 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps -4 * SIZE(Y), %xmm3 addps %xmm5, %xmm3 movaps %xmm3, -4 * 
SIZE(Y) movaps 12 * SIZE(X), %xmm3 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L11 ALIGN_3 .L12: PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -32 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -16 * SIZE(X), %xmm0 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -28 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -12 * SIZE(X), %xmm1 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps -24 * SIZE(Y), %xmm2 addps %xmm5, %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps -8 * SIZE(X), %xmm2 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps -20 * SIZE(Y), %xmm3 addps %xmm5, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps -4 * SIZE(X), %xmm3 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -16 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(Y) PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -12 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -12 * SIZE(Y) PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps -8 * SIZE(Y), %xmm2 addps %xmm5, %xmm2 movaps %xmm2, -8 * SIZE(Y) PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps -4 * SIZE(Y), %xmm3 addps %xmm5, %xmm3 movaps %xmm3, -4 * SIZE(Y) subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L15: testl $8, M jle .L16 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -32 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y) PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -28 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps -24 * SIZE(Y), %xmm2 addps %xmm5, %xmm2 movaps %xmm2, -24 * SIZE(Y) PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps -20 * SIZE(Y), %xmm3 addps %xmm5, %xmm3 movaps %xmm3, -20 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_2 .L16: testl $4, M jle .L17 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -32 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y) PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -28 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_2 .L17: testl $2, M jle .L18 movaps -32 * SIZE(X), %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -32 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_2 .L18: testl $1, M jle .L999 #ifndef HAVE_SSE2 xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(X), %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 #ifndef HAVE_SSE2 xorps %xmm1, %xmm1 movlps -32 * SIZE(Y), %xmm1 #else movsd -32 * SIZE(Y), %xmm1 #endif addps %xmm1, %xmm0 addps %xmm5, %xmm0 movlps %xmm0, -32 * SIZE(Y) jmp .L999 ALIGN_3 .L20: #ifdef ALIGNED_ACCESS testl $2 * SIZE, X jne .L30 subl $1 * SIZE, X movaps -32 * SIZE(X), %xmm0 movl M, %eax sarl $4, %eax jle .L25 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 decl %eax jle .L22 ALIGN_3 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm1, %xmm0 
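/* Realignment step: on this path X sits 4 bytes past a 16-byte boundary and
   was rewound by 1 * SIZE so the movaps loads stay aligned, leaving one stale
   leading element per vector. The movss above splices the low word of the
   next vector into %xmm0 and the SHUFPS_39 below rotates it by one element,
   recovering four consecutive values of x for the complex multiply-add. */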
SHUFPS_39 %xmm0, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -32 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -16 * SIZE(X), %xmm0 movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -28 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -12 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps -24 * SIZE(Y), %xmm2 addps %xmm5, %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps -8 * SIZE(X), %xmm2 movss %xmm0, %xmm3 SHUFPS_39 %xmm3, %xmm3 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps -20 * SIZE(Y), %xmm3 addps %xmm5, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps -4 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -16 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps 0 * SIZE(X), %xmm0 movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -12 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -12 * SIZE(Y) movaps 4 * SIZE(X), %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps -8 * SIZE(Y), %xmm2 addps %xmm5, %xmm2 movaps %xmm2, -8 * SIZE(Y) movaps 8 * SIZE(X), %xmm2 movss %xmm0, %xmm3 SHUFPS_39 %xmm3, %xmm3 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps -4 * SIZE(Y), %xmm3 addps %xmm5, %xmm3 movaps %xmm3, -4 * SIZE(Y) movaps 12 * SIZE(X), %xmm3 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L22: movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -32 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -16 * SIZE(X), %xmm0 movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -28 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -12 * SIZE(X), %xmm1 movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps -24 * SIZE(Y), %xmm2 addps %xmm5, %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps -8 * SIZE(X), %xmm2 movss %xmm0, %xmm3 SHUFPS_39 %xmm3, %xmm3 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps -20 * SIZE(Y), %xmm3 addps %xmm5, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps -4 * SIZE(X), %xmm3 movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -16 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps 0 * SIZE(X), %xmm0 movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -12 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -12 * SIZE(Y) movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps -8 * SIZE(Y), %xmm2 addps %xmm5, %xmm2 movaps %xmm2, -8 * SIZE(Y) movss %xmm0, %xmm3 SHUFPS_39 %xmm3, %xmm3 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 
mulps ALPHA_I, %xmm5 addps -4 * SIZE(Y), %xmm3 addps %xmm5, %xmm3 movaps %xmm3, -4 * SIZE(Y) subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L25: testl $8, M jle .L26 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -32 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -28 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -20 * SIZE(X), %xmm3 movaps -16 * SIZE(X), %xmm0 movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps -24 * SIZE(Y), %xmm2 addps %xmm5, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm0, %xmm3 SHUFPS_39 %xmm3, %xmm3 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps -20 * SIZE(Y), %xmm3 addps %xmm5, %xmm3 movaps %xmm3, -20 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_2 .L26: testl $4, M jle .L27 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -32 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -28 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_2 .L27: testl $2, M jle .L28 movaps -28 * SIZE(X), %xmm1 movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -32 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, %xmm0 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_2 .L28: testl $1, M jle .L999 PSHUFD2($0x06, %xmm0, %xmm5) PSHUFD2($0x09, %xmm0, %xmm0) mulps ALPHA_I, %xmm5 mulps ALPHA_R, %xmm0 #ifndef HAVE_SSE2 xorps %xmm1, %xmm1 movlps -32 * SIZE(Y), %xmm1 #else movsd -32 * SIZE(Y), %xmm1 #endif addps %xmm1, %xmm0 addps %xmm5, %xmm0 movlps %xmm0, -32 * SIZE(Y) jmp .L999 ALIGN_3 .L30: testl $1 * SIZE, X jne .L40 #endif movl M, %eax sarl $4, %eax jle .L35 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 movsd -28 * SIZE(X), %xmm1 movhps -26 * SIZE(X), %xmm1 movsd -24 * SIZE(X), %xmm2 movhps -22 * SIZE(X), %xmm2 movsd -20 * SIZE(X), %xmm3 movhps -18 * SIZE(X), %xmm3 decl %eax jle .L32 ALIGN_3 .L31: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -32 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y) movsd -16 * SIZE(X), %xmm0 movhps -14 * SIZE(X), %xmm0 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -28 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(Y) movsd -12 * SIZE(X), %xmm1 movhps -10 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps -24 * SIZE(Y), %xmm2 addps %xmm5, %xmm2 movaps %xmm2, -24 * SIZE(Y) movsd -8 * SIZE(X), %xmm2 movhps -6 * SIZE(X), %xmm2 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps -20 * SIZE(Y), %xmm3 addps %xmm5, %xmm3 movaps %xmm3, -20 * SIZE(Y) movsd -4 * SIZE(X), %xmm3 movhps -2 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - 
PREOFFSET(Y) #endif PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -16 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(Y) movsd 0 * SIZE(X), %xmm0 movhps 2 * SIZE(X), %xmm0 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -12 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -12 * SIZE(Y) movsd 4 * SIZE(X), %xmm1 movhps 6 * SIZE(X), %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps -8 * SIZE(Y), %xmm2 addps %xmm5, %xmm2 movaps %xmm2, -8 * SIZE(Y) movsd 8 * SIZE(X), %xmm2 movhps 10 * SIZE(X), %xmm2 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps -4 * SIZE(Y), %xmm3 addps %xmm5, %xmm3 movaps %xmm3, -4 * SIZE(Y) movsd 12 * SIZE(X), %xmm3 movhps 14 * SIZE(X), %xmm3 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L31 ALIGN_3 .L32: PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -32 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y) movsd -16 * SIZE(X), %xmm0 movhps -14 * SIZE(X), %xmm0 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -28 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(Y) movsd -12 * SIZE(X), %xmm1 movhps -10 * SIZE(X), %xmm1 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps -24 * SIZE(Y), %xmm2 addps %xmm5, %xmm2 movaps %xmm2, -24 * SIZE(Y) movsd -8 * SIZE(X), %xmm2 movhps -6 * SIZE(X), %xmm2 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps -20 * SIZE(Y), %xmm3 addps %xmm5, %xmm3 movaps %xmm3, -20 * SIZE(Y) movsd -4 * SIZE(X), %xmm3 movhps -2 * SIZE(X), %xmm3 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -16 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(Y) PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -12 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -12 * SIZE(Y) PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps -8 * SIZE(Y), %xmm2 addps %xmm5, %xmm2 movaps %xmm2, -8 * SIZE(Y) PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps -4 * SIZE(Y), %xmm3 addps %xmm5, %xmm3 movaps %xmm3, -4 * SIZE(Y) subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L35: testl $8, M jle .L36 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 movsd -28 * SIZE(X), %xmm1 movhps -26 * SIZE(X), %xmm1 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -32 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y) PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -28 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(Y) movsd -24 * SIZE(X), %xmm2 movhps -22 * SIZE(X), %xmm2 movsd -20 * SIZE(X), %xmm3 movhps -18 * SIZE(X), %xmm3 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps -24 * SIZE(Y), %xmm2 addps %xmm5, %xmm2 movaps %xmm2, -24 * SIZE(Y) PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps -20 * SIZE(Y), %xmm3 addps %xmm5, %xmm3 movaps %xmm3, -20 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_2 .L36: testl $4, M jle .L37 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 movsd -28 * SIZE(X), %xmm1 movhps -26 * SIZE(X), %xmm1 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -32 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y) 
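/* Each update follows the same pattern: PSHUFD2($0xb1, ...) builds a copy of
   the source with real and imaginary parts swapped, the original is scaled by
   ALPHA_R and the swapped copy by ALPHA_I (one of the two was given
   alternating lane signs in the prologue, depending on CONJ), and adding both
   products to the old contents of Y yields y += alpha * x, or alpha * conj(x)
   in the conjugated variant. */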
PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -28 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_2 .L37: testl $2, M jle .L38 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -32 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_2 .L38: testl $1, M jle .L999 movsd -32 * SIZE(X), %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -32 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movlps %xmm0, -32 * SIZE(Y) jmp .L999 ALIGN_3 #ifdef ALIGNED_ACCESS .L40: subl $3 * SIZE, X movaps -32 * SIZE(X), %xmm0 movl M, %eax sarl $4, %eax jle .L45 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 decl %eax jle .L42 ALIGN_3 .L41: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -32 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -16 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -28 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -12 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps -24 * SIZE(Y), %xmm2 addps %xmm5, %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps -8 * SIZE(X), %xmm2 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps -20 * SIZE(Y), %xmm3 addps %xmm5, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps -4 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -16 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps 0 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -12 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -12 * SIZE(Y) movaps 4 * SIZE(X), %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps -8 * SIZE(Y), %xmm2 addps %xmm5, %xmm2 movaps %xmm2, -8 * SIZE(Y) movaps 8 * SIZE(X), %xmm2 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps -4 * SIZE(Y), %xmm3 addps %xmm5, %xmm3 movaps %xmm3, -4 * SIZE(Y) movaps 12 * SIZE(X), %xmm3 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L41 ALIGN_3 .L42: movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -32 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -16 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -28 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -12 * SIZE(X), %xmm1 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) 
mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps -24 * SIZE(Y), %xmm2 addps %xmm5, %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps -8 * SIZE(X), %xmm2 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps -20 * SIZE(Y), %xmm3 addps %xmm5, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps -4 * SIZE(X), %xmm3 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -16 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps 0 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -12 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -12 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps -8 * SIZE(Y), %xmm2 addps %xmm5, %xmm2 movaps %xmm2, -8 * SIZE(Y) movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps -4 * SIZE(Y), %xmm3 addps %xmm5, %xmm3 movaps %xmm3, -4 * SIZE(Y) subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L45: testl $8, M jle .L46 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -32 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -28 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -20 * SIZE(X), %xmm3 movaps -16 * SIZE(X), %xmm0 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps -24 * SIZE(Y), %xmm2 addps %xmm5, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps -20 * SIZE(Y), %xmm3 addps %xmm5, %xmm3 movaps %xmm3, -20 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_2 .L46: testl $4, M jle .L47 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -32 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps -28 * SIZE(Y), %xmm1 addps %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_2 .L47: testl $2, M jle .L48 movaps -28 * SIZE(X), %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps -32 * SIZE(Y), %xmm0 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, %xmm0 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_2 .L48: testl $1, M jle .L999 movaps -28 * SIZE(X), %xmm1 movsd -32 * SIZE(Y), %xmm2 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 addps %xmm2, %xmm0 movlps %xmm0, -32 * SIZE(Y) jmp .L999 ALIGN_3 #endif .L50: xorps %xmm0, %xmm0 subl $1 * SIZE, Y testl $3 * SIZE, X jne .L60 movl M, %eax sarl $4, %eax jle .L55 movaps -32 * SIZE(X), %xmm1 movaps -28 * SIZE(X), %xmm2 movaps -24 * SIZE(X), %xmm3 decl %eax jle .L52 ALIGN_3 .L51: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - 
PREOFFSET(Y) #endif PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -20 * SIZE(X), %xmm0 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -16 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps -12 * SIZE(X), %xmm2 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps -8 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -4 * SIZE(X), %xmm0 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -12 * SIZE(Y), %xmm1 movaps %xmm1, -12 * SIZE(Y) movaps 0 * SIZE(X), %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -8 * SIZE(Y), %xmm2 movaps %xmm2, -8 * SIZE(Y) movaps 4 * SIZE(X), %xmm2 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -4 * SIZE(Y), %xmm3 movaps %xmm3, -4 * SIZE(Y) movaps 8 * SIZE(X), %xmm3 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L51 ALIGN_3 .L52: PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -20 * SIZE(X), %xmm0 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -16 * SIZE(X), %xmm1 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps -12 * SIZE(X), %xmm2 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps -8 * SIZE(X), %xmm3 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -4 * SIZE(X), %xmm0 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -12 * SIZE(Y), %xmm1 movaps %xmm1, -12 * SIZE(Y) PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -8 * SIZE(Y), %xmm2 movaps %xmm2, -8 * 
SIZE(Y) PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -4 * SIZE(Y), %xmm3 movaps %xmm3, -4 * SIZE(Y) subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L55: testl $8, M jle .L56 movaps -32 * SIZE(X), %xmm1 movaps -28 * SIZE(X), %xmm2 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -24 * SIZE(X), %xmm3 movaps -20 * SIZE(X), %xmm0 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_2 .L56: testl $4, M jle .L57 movaps -32 * SIZE(X), %xmm1 movaps -28 * SIZE(X), %xmm2 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_2 .L57: testl $2, M jle .L58 movaps -32 * SIZE(X), %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, %xmm0 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_2 .L58: testl $1, M jle .L59 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(X), %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) jmp .L999 ALIGN_3 .L59: shufps $0x93, %xmm0, %xmm0 addss -32 * SIZE(Y), %xmm0 movss %xmm0, -32 * SIZE(Y) jmp .L999 ALIGN_3 .L60: #ifdef ALIGNED_ACCESS testl $2 * SIZE, X jne .L70 subl $1 * SIZE, X movaps -32 * SIZE(X), %xmm1 movl M, %eax sarl $4, %eax jle .L65 movaps -28 * SIZE(X), %xmm2 movaps -24 * SIZE(X), %xmm3 decl %eax jle .L62 ALIGN_3 .L61: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -20 * SIZE(X), %xmm0 movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -16 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm0, %xmm3 SHUFPS_39 %xmm3, %xmm3 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * 
SIZE(Y) movaps -12 * SIZE(X), %xmm2 movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps -8 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -4 * SIZE(X), %xmm0 movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -12 * SIZE(Y), %xmm1 movaps %xmm1, -12 * SIZE(Y) movaps 0 * SIZE(X), %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm0, %xmm3 SHUFPS_39 %xmm3, %xmm3 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -8 * SIZE(Y), %xmm2 movaps %xmm2, -8 * SIZE(Y) movaps 4 * SIZE(X), %xmm2 movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -4 * SIZE(Y), %xmm3 movaps %xmm3, -4 * SIZE(Y) movaps 8 * SIZE(X), %xmm3 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L61 ALIGN_3 .L62: movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -20 * SIZE(X), %xmm0 movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -16 * SIZE(X), %xmm1 movss %xmm0, %xmm3 SHUFPS_39 %xmm3, %xmm3 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps -12 * SIZE(X), %xmm2 movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps -8 * SIZE(X), %xmm3 movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -4 * SIZE(X), %xmm0 movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -12 * SIZE(Y), %xmm1 movaps %xmm1, -12 * SIZE(Y) movaps 0 * SIZE(X), %xmm1 movss %xmm0, %xmm3 SHUFPS_39 %xmm3, %xmm3 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -8 * SIZE(Y), %xmm2 movaps %xmm2, -8 * SIZE(Y) movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 movss %xmm0, %xmm3 shufps $0x93, 
%xmm0, %xmm3 addps -4 * SIZE(Y), %xmm3 movaps %xmm3, -4 * SIZE(Y) subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L65: testl $8, M jle .L66 movaps -28 * SIZE(X), %xmm2 movaps -24 * SIZE(X), %xmm3 movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -20 * SIZE(X), %xmm0 movaps -16 * SIZE(X), %xmm1 movss %xmm0, %xmm3 SHUFPS_39 %xmm3, %xmm3 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_2 .L66: testl $4, M jle .L67 movaps -28 * SIZE(X), %xmm2 movaps -24 * SIZE(X), %xmm3 movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_2 .L67: testl $2, M jle .L68 movaps -28 * SIZE(X), %xmm2 movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, %xmm0 movaps %xmm2, %xmm1 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_2 .L68: testl $1, M jle .L69 movaps -28 * SIZE(X), %xmm2 movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movlps %xmm0, -32 * SIZE(Y) movhlps %xmm0, %xmm0 movss %xmm0, -30 * SIZE(Y) jmp .L999 .L69: shufps $0x93, %xmm0, %xmm0 addss -32 * SIZE(Y), %xmm0 movss %xmm0, -32 * SIZE(Y) jmp .L999 ALIGN_3 .L70: testl $1 * SIZE, X jne .L80 #endif movl M, %eax sarl $4, %eax jle .L75 movsd -32 * SIZE(X), %xmm1 movhps -30 * SIZE(X), %xmm1 movsd -28 * SIZE(X), %xmm2 movhps -26 * SIZE(X), %xmm2 movsd -24 * SIZE(X), %xmm3 movhps -22 * SIZE(X), %xmm3 decl %eax jle .L72 ALIGN_3 .L71: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movsd -20 * SIZE(X), %xmm0 movhps -18 * SIZE(X), %xmm0 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movsd -16 * SIZE(X), %xmm1 movhps -14 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH 
(PREFETCHSIZE + 0) - PREOFFSET(X) #endif PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movsd -12 * SIZE(X), %xmm2 movhps -10 * SIZE(X), %xmm2 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movsd -8 * SIZE(X), %xmm3 movhps -6 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movsd -4 * SIZE(X), %xmm0 movhps -2 * SIZE(X), %xmm0 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -12 * SIZE(Y), %xmm1 movaps %xmm1, -12 * SIZE(Y) movsd 0 * SIZE(X), %xmm1 movhps 2 * SIZE(X), %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -8 * SIZE(Y), %xmm2 movaps %xmm2, -8 * SIZE(Y) movsd 4 * SIZE(X), %xmm2 movhps 6 * SIZE(X), %xmm2 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -4 * SIZE(Y), %xmm3 movaps %xmm3, -4 * SIZE(Y) movsd 8 * SIZE(X), %xmm3 movhps 10 * SIZE(X), %xmm3 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L71 ALIGN_3 .L72: PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movsd -20 * SIZE(X), %xmm0 movhps -18 * SIZE(X), %xmm0 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movsd -16 * SIZE(X), %xmm1 movhps -14 * SIZE(X), %xmm1 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movsd -12 * SIZE(X), %xmm2 movhps -10 * SIZE(X), %xmm2 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movsd -8 * SIZE(X), %xmm3 movhps -6 * SIZE(X), %xmm3 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movsd -4 * SIZE(X), %xmm0 movhps -2 * SIZE(X), %xmm0 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -12 * SIZE(Y), %xmm1 movaps %xmm1, -12 * SIZE(Y) PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -8 * SIZE(Y), %xmm2 movaps %xmm2, -8 * SIZE(Y) PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -4 * SIZE(Y), %xmm3 movaps %xmm3, -4 * 
SIZE(Y) subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L75: testl $8, M jle .L76 movsd -32 * SIZE(X), %xmm1 movhps -30 * SIZE(X), %xmm1 movsd -28 * SIZE(X), %xmm2 movhps -26 * SIZE(X), %xmm2 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movsd -24 * SIZE(X), %xmm3 movhps -22 * SIZE(X), %xmm3 movsd -20 * SIZE(X), %xmm0 movhps -18 * SIZE(X), %xmm0 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_2 .L76: testl $4, M jle .L77 movsd -32 * SIZE(X), %xmm1 movhps -30 * SIZE(X), %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movsd -28 * SIZE(X), %xmm2 movhps -26 * SIZE(X), %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_2 .L77: testl $2, M jle .L78 movsd -32 * SIZE(X), %xmm1 movhps -30 * SIZE(X), %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, %xmm0 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_2 .L78: testl $1, M jle .L79 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(X), %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) jmp .L999 ALIGN_3 .L79: shufps $0x93, %xmm0, %xmm0 addss -32 * SIZE(Y), %xmm0 movss %xmm0, -32 * SIZE(Y) jmp .L999 ALIGN_3 #ifdef ALIGNED_ACCESS .L80: subl $3 * SIZE, X movaps -32 * SIZE(X), %xmm1 movl M, %eax sarl $4, %eax jle .L85 movaps -28 * SIZE(X), %xmm2 movaps -24 * SIZE(X), %xmm3 decl %eax jle .L82 ALIGN_3 .L81: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -20 * SIZE(X), %xmm0 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -16 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps -12 * 
SIZE(X), %xmm2 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps -8 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -4 * SIZE(X), %xmm0 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -12 * SIZE(Y), %xmm1 movaps %xmm1, -12 * SIZE(Y) movaps 0 * SIZE(X), %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -8 * SIZE(Y), %xmm2 movaps %xmm2, -8 * SIZE(Y) movaps 4 * SIZE(X), %xmm2 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -4 * SIZE(Y), %xmm3 movaps %xmm3, -4 * SIZE(Y) movaps 8 * SIZE(X), %xmm3 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L81 ALIGN_3 .L82: movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -20 * SIZE(X), %xmm0 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -16 * SIZE(X), %xmm1 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps -12 * SIZE(X), %xmm2 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps -8 * SIZE(X), %xmm3 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -4 * SIZE(X), %xmm0 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -12 * SIZE(Y), %xmm1 movaps %xmm1, -12 * SIZE(Y) movaps 0 * SIZE(X), %xmm1 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -8 * SIZE(Y), %xmm2 movaps %xmm2, -8 * SIZE(Y) movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 movss 
%xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -4 * SIZE(Y), %xmm3 movaps %xmm3, -4 * SIZE(Y) subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L85: testl $8, M jle .L86 movaps -28 * SIZE(X), %xmm2 movaps -24 * SIZE(X), %xmm3 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -20 * SIZE(X), %xmm0 movaps -16 * SIZE(X), %xmm1 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 PSHUFD2($0xb1, %xmm3, %xmm5) mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 PSHUFD2($0xb1, %xmm0, %xmm5) mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_2 .L86: testl $4, M jle .L87 movaps -28 * SIZE(X), %xmm2 movaps -24 * SIZE(X), %xmm3 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 PSHUFD2($0xb1, %xmm2, %xmm5) mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_2 .L87: testl $2, M jle .L88 movaps -28 * SIZE(X), %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, %xmm0 movaps %xmm2, %xmm1 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_2 .L88: testl $1, M jle .L89 movaps -28 * SIZE(X), %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 PSHUFD2($0xb1, %xmm1, %xmm5) mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm5 addps %xmm5, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movlps %xmm0, -32 * SIZE(Y) movhlps %xmm0, %xmm0 movss %xmm0, -30 * SIZE(Y) jmp .L999 .L89: shufps $0x93, %xmm0, %xmm0 addss -32 * SIZE(Y), %xmm0 movss %xmm0, -32 * SIZE(Y) jmp .L999 ALIGN_3 #endif .L100: shufps $0, ALPHA_R, ALPHA_R shufps $0, ALPHA_I, ALPHA_I #ifndef CONJ xorps %xmm5, %xmm5 subps ALPHA_I, %xmm5 unpcklps ALPHA_R, %xmm5 unpcklps ALPHA_I, ALPHA_R movaps %xmm5, ALPHA_I #else xorps %xmm5, %xmm5 subps ALPHA_R, %xmm5 unpcklps ALPHA_I, ALPHA_R unpcklps %xmm5, ALPHA_I #endif //If incx==0 || incy==0, avoid unloop and jump to end. 
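// At .L100 (arbitrary-stride path) alpha has been broadcast and interleaved:
//   ALPHA_R = { ar, ai, ar, ai }   and   ALPHA_I = { -ai, ar, -ai, ar }
// (with CONJ the signs are arranged the other way so the x element is
// conjugated instead). Splatting the real and imaginary parts of each x
// element and multiplying by these two registers gives the complex update
//   y_r += ar * x_r - ai * x_i
//   y_i += ai * x_r + ar * x_i
// The INCX == 0 / INCY == 0 checks below jump straight to the scalar
// one-element loop at .L200: with a zero increment the unrolled code would
// read and write the same y location several times per iteration and drop
// the intermediate updates.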
cmpl $0, INCX je .L200 cmpl $0, INCY je .L200 movl Y, YY movl M, %eax sarl $3, %eax jle .L105 ALIGN_3 .L102: movsd (X), %xmm0 addl INCX, X movhps (X), %xmm0 addl INCX, X movsd (X), %xmm2 addl INCX, X movhps (X), %xmm2 addl INCX, X #ifdef HAVE_SSE3 movshdup %xmm0, %xmm1 movsldup %xmm0, %xmm0 movshdup %xmm2, %xmm3 movsldup %xmm2, %xmm2 #else movaps %xmm0, %xmm1 shufps $0xa0, %xmm0, %xmm0 shufps $0xf5, %xmm1, %xmm1 movaps %xmm2, %xmm3 shufps $0xa0, %xmm2, %xmm2 shufps $0xf5, %xmm3, %xmm3 #endif mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm1 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm3 movsd (Y), %xmm4 addl INCY, Y movhps (Y), %xmm4 addl INCY, Y movsd (Y), %xmm5 addl INCY, Y movhps (Y), %xmm5 addl INCY, Y addps %xmm0, %xmm4 addps %xmm1, %xmm4 addps %xmm2, %xmm5 addps %xmm3, %xmm5 movsd %xmm4, (YY) addl INCY, YY movhps %xmm4, (YY) addl INCY, YY movsd %xmm5, (YY) addl INCY, YY movhps %xmm5, (YY) addl INCY, YY movsd (X), %xmm0 addl INCX, X movhps (X), %xmm0 addl INCX, X movsd (X), %xmm2 addl INCX, X movhps (X), %xmm2 addl INCX, X #ifdef HAVE_SSE3 movshdup %xmm0, %xmm1 movsldup %xmm0, %xmm0 movshdup %xmm2, %xmm3 movsldup %xmm2, %xmm2 #else movaps %xmm0, %xmm1 shufps $0xa0, %xmm0, %xmm0 shufps $0xf5, %xmm1, %xmm1 movaps %xmm2, %xmm3 shufps $0xa0, %xmm2, %xmm2 shufps $0xf5, %xmm3, %xmm3 #endif mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm1 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm3 movsd (Y), %xmm4 addl INCY, Y movhps (Y), %xmm4 addl INCY, Y movsd (Y), %xmm5 addl INCY, Y movhps (Y), %xmm5 addl INCY, Y addps %xmm0, %xmm4 addps %xmm1, %xmm4 addps %xmm2, %xmm5 addps %xmm3, %xmm5 movsd %xmm4, (YY) addl INCY, YY movhps %xmm4, (YY) addl INCY, YY movsd %xmm5, (YY) addl INCY, YY movhps %xmm5, (YY) addl INCY, YY decl %eax jg .L102 ALIGN_3 .L105: testl $4, M jle .L106 movsd (X), %xmm0 addl INCX, X movhps (X), %xmm0 addl INCX, X movsd (X), %xmm2 addl INCX, X movhps (X), %xmm2 addl INCX, X #ifdef HAVE_SSE3 movshdup %xmm0, %xmm1 movsldup %xmm0, %xmm0 movshdup %xmm2, %xmm3 movsldup %xmm2, %xmm2 #else movaps %xmm0, %xmm1 shufps $0xa0, %xmm0, %xmm0 shufps $0xf5, %xmm1, %xmm1 movaps %xmm2, %xmm3 shufps $0xa0, %xmm2, %xmm2 shufps $0xf5, %xmm3, %xmm3 #endif mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm1 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm3 movsd (Y), %xmm4 addl INCY, Y movhps (Y), %xmm4 addl INCY, Y movsd (Y), %xmm5 addl INCY, Y movhps (Y), %xmm5 addl INCY, Y addps %xmm0, %xmm4 addps %xmm1, %xmm4 addps %xmm2, %xmm5 addps %xmm3, %xmm5 movsd %xmm4, (YY) addl INCY, YY movhps %xmm4, (YY) addl INCY, YY movsd %xmm5, (YY) addl INCY, YY movhps %xmm5, (YY) addl INCY, YY ALIGN_3 .L106: testl $2, M jle .L107 movsd (X), %xmm0 addl INCX, X movhps (X), %xmm0 addl INCX, X #ifdef HAVE_SSE3 movshdup %xmm0, %xmm1 movsldup %xmm0, %xmm0 #else movaps %xmm0, %xmm1 shufps $0xa0, %xmm0, %xmm0 shufps $0xf5, %xmm1, %xmm1 #endif mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm1 movsd (Y), %xmm4 addl INCY, Y movhps (Y), %xmm4 addl INCY, Y addps %xmm0, %xmm4 addps %xmm1, %xmm4 movsd %xmm4, (YY) addl INCY, YY movhps %xmm4, (YY) addl INCY, YY ALIGN_3 .L107: testl $1, M jle .L999 movsd (X), %xmm0 #ifdef HAVE_SSE3 movshdup %xmm0, %xmm1 movsldup %xmm0, %xmm0 #else movaps %xmm0, %xmm1 shufps $0xa0, %xmm0, %xmm0 shufps $0xf5, %xmm1, %xmm1 #endif mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm1 movsd (Y), %xmm4 addps %xmm0, %xmm4 addps %xmm1, %xmm4 movsd %xmm4, (Y) jmp .L999 ALIGN_3 .L200: movl M, %eax cmpl $0, %eax jle .L999 ALIGN_3 .L201: movsd (X), %xmm0 #ifdef HAVE_SSE3 movshdup %xmm0, %xmm1 movsldup %xmm0, %xmm0 #else movaps %xmm0, %xmm1 shufps $0xa0, %xmm0, %xmm0 shufps $0xf5, 
%xmm1, %xmm1 #endif mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm1 movsd (Y), %xmm4 addps %xmm0, %xmm4 addps %xmm1, %xmm4 movsd %xmm4, (Y) decl %eax jg .L201 ALIGN_3 .L999: popl %ebp popl %ebx popl %esi popl %edi ret EPILOGUE

OpenBLAS-0.2.20/kernel/x86/zaxpy_sse2.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin.
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_ALPHA_R 16 + STACK + ARGS(%esp) #define STACK_ALPHA_I 24 + STACK + ARGS(%esp) #define STACK_X 32 + STACK + ARGS(%esp) #define STACK_INCX 36 + STACK + ARGS(%esp) #define STACK_Y 40 + STACK + ARGS(%esp) #define STACK_INCY 44 + STACK + ARGS(%esp) #define M %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx #define YY %ebp #define ALPHA_R %xmm6 #define ALPHA_I %xmm7 #if defined(HAVE_SSE3) && !defined(CORE_OPTERON) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else #define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c #define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c #endif #include "l1param.h" PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_M, M movsd STACK_ALPHA_R, %xmm0 movsd STACK_ALPHA_I, %xmm1 movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY sall $ZBASE_SHIFT, INCX sall $ZBASE_SHIFT, INCY testl M, M jle .L999 cmpl $2 * SIZE, INCX jne .L50 cmpl $2 * SIZE, INCY jne .L50 subl $-16 * SIZE, X subl $-16 * SIZE, Y pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 #ifdef HAVE_SSE3 movddup %xmm0, ALPHA_R movddup %xmm1, ALPHA_I #else pshufd $0x44, %xmm0, ALPHA_R pshufd $0x44, %xmm1, ALPHA_I #endif #ifndef CONJ shufps $0x0c, %xmm5, %xmm5 xorpd %xmm5, ALPHA_I #else shufps $0xc0, %xmm5, %xmm5 xorpd %xmm5, ALPHA_R #endif testl $SIZE, Y jne .L30 testl $SIZE, X jne .L20 movl M, %eax sarl $3, %eax jle .L15 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 decl %eax jle .L12 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm5 addpd -16 * SIZE(Y), %xmm0 addpd %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -8 * SIZE(X), %xmm0 pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd -14 * SIZE(Y), %xmm1 addpd %xmm5, %xmm1 movaps %xmm1, -14 * SIZE(Y) movaps -6 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm5 addpd -12 * SIZE(Y), %xmm2 addpd %xmm5, %xmm2 movaps %xmm2, -12 * SIZE(Y) movaps -4 * SIZE(X), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm5 addpd -10 * SIZE(Y), %xmm3 addpd %xmm5, %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps -2 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm5 addpd -8 * SIZE(Y), %xmm0 addpd %xmm5, %xmm0 movaps %xmm0, -8 * SIZE(Y) movaps 0 * SIZE(X), %xmm0 pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd -6 * SIZE(Y), %xmm1 addpd %xmm5, %xmm1 movaps %xmm1, -6 * SIZE(Y) movaps 2 * SIZE(X), %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm5 addpd -4 * SIZE(Y), %xmm2 addpd %xmm5, %xmm2 movaps %xmm2, -4 * SIZE(Y) movaps 4 * SIZE(X), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm5 addpd -2 * SIZE(Y), %xmm3 addpd %xmm5, %xmm3 movaps %xmm3, -2 * SIZE(Y) movaps 6 * SIZE(X), %xmm3 subl $-16 * SIZE, X subl $-16 * SIZE, Y decl %eax jg .L11 ALIGN_3 .L12: pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 
mulpd ALPHA_I, %xmm5 addpd -16 * SIZE(Y), %xmm0 addpd %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -8 * SIZE(X), %xmm0 pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd -14 * SIZE(Y), %xmm1 addpd %xmm5, %xmm1 movaps %xmm1, -14 * SIZE(Y) movaps -6 * SIZE(X), %xmm1 pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm5 addpd -12 * SIZE(Y), %xmm2 addpd %xmm5, %xmm2 movaps %xmm2, -12 * SIZE(Y) movaps -4 * SIZE(X), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm5 addpd -10 * SIZE(Y), %xmm3 addpd %xmm5, %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps -2 * SIZE(X), %xmm3 pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm5 addpd -8 * SIZE(Y), %xmm0 addpd %xmm5, %xmm0 movaps %xmm0, -8 * SIZE(Y) pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd -6 * SIZE(Y), %xmm1 addpd %xmm5, %xmm1 movaps %xmm1, -6 * SIZE(Y) pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm5 addpd -4 * SIZE(Y), %xmm2 addpd %xmm5, %xmm2 movaps %xmm2, -4 * SIZE(Y) pshufd $0x4e, %xmm3, %xmm5 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm5 addpd -2 * SIZE(Y), %xmm3 addpd %xmm5, %xmm3 movaps %xmm3, -2 * SIZE(Y) subl $-16 * SIZE, X subl $-16 * SIZE, Y ALIGN_3 .L15: movl M, %eax andl $4, %eax jle .L16 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm5 addpd -16 * SIZE(Y), %xmm0 addpd %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(Y) pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd -14 * SIZE(Y), %xmm1 addpd %xmm5, %xmm1 movaps %xmm1, -14 * SIZE(Y) pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm5 addpd -12 * SIZE(Y), %xmm2 addpd %xmm5, %xmm2 movaps %xmm2, -12 * SIZE(Y) pshufd $0x4e, %xmm3, %xmm5 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm5 addpd -10 * SIZE(Y), %xmm3 addpd %xmm5, %xmm3 movaps %xmm3, -10 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L16: movl M, %eax andl $2, %eax jle .L17 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm5 addpd -16 * SIZE(Y), %xmm0 addpd %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(Y) pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd -14 * SIZE(Y), %xmm1 addpd %xmm5, %xmm1 movaps %xmm1, -14 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L17: movl M, %eax andl $1, %eax jle .L999 movaps -16 * SIZE(X), %xmm0 pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm5 addpd -16 * SIZE(Y), %xmm0 addpd %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(Y) jmp .L999 ALIGN_3 .L20: movl M, %eax sarl $3, %eax jle .L25 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 movsd -14 * SIZE(X), %xmm1 movhps -13 * SIZE(X), %xmm1 movsd -12 * SIZE(X), %xmm2 movhps -11 * SIZE(X), %xmm2 movsd -10 * SIZE(X), %xmm3 movhps -9 * SIZE(X), %xmm3 decl %eax jle .L22 ALIGN_3 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm5 addpd -16 * SIZE(Y), %xmm0 addpd %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(Y) movsd -8 * SIZE(X), %xmm0 movhps -7 * SIZE(X), %xmm0 pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd -14 * SIZE(Y), %xmm1 addpd %xmm5, %xmm1 movaps %xmm1, -14 * SIZE(Y) movsd -6 * SIZE(X), %xmm1 movhps -5 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd 
ALPHA_I, %xmm5 addpd -12 * SIZE(Y), %xmm2 addpd %xmm5, %xmm2 movaps %xmm2, -12 * SIZE(Y) movsd -4 * SIZE(X), %xmm2 movhps -3 * SIZE(X), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm5 addpd -10 * SIZE(Y), %xmm3 addpd %xmm5, %xmm3 movaps %xmm3, -10 * SIZE(Y) movsd -2 * SIZE(X), %xmm3 movhps -1 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm5 addpd -8 * SIZE(Y), %xmm0 addpd %xmm5, %xmm0 movaps %xmm0, -8 * SIZE(Y) movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd -6 * SIZE(Y), %xmm1 addpd %xmm5, %xmm1 movaps %xmm1, -6 * SIZE(Y) movsd 2 * SIZE(X), %xmm1 movhps 3 * SIZE(X), %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm5 addpd -4 * SIZE(Y), %xmm2 addpd %xmm5, %xmm2 movaps %xmm2, -4 * SIZE(Y) movsd 4 * SIZE(X), %xmm2 movhps 5 * SIZE(X), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm5 addpd -2 * SIZE(Y), %xmm3 addpd %xmm5, %xmm3 movaps %xmm3, -2 * SIZE(Y) movsd 6 * SIZE(X), %xmm3 movhps 7 * SIZE(X), %xmm3 subl $-16 * SIZE, X subl $-16 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L22: pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm5 addpd -16 * SIZE(Y), %xmm0 addpd %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(Y) movsd -8 * SIZE(X), %xmm0 movhps -7 * SIZE(X), %xmm0 pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd -14 * SIZE(Y), %xmm1 addpd %xmm5, %xmm1 movaps %xmm1, -14 * SIZE(Y) movsd -6 * SIZE(X), %xmm1 movhps -5 * SIZE(X), %xmm1 pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm5 addpd -12 * SIZE(Y), %xmm2 addpd %xmm5, %xmm2 movaps %xmm2, -12 * SIZE(Y) movsd -4 * SIZE(X), %xmm2 movhps -3 * SIZE(X), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm5 addpd -10 * SIZE(Y), %xmm3 addpd %xmm5, %xmm3 movaps %xmm3, -10 * SIZE(Y) movsd -2 * SIZE(X), %xmm3 movhps -1 * SIZE(X), %xmm3 pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm5 addpd -8 * SIZE(Y), %xmm0 addpd %xmm5, %xmm0 movaps %xmm0, -8 * SIZE(Y) pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd -6 * SIZE(Y), %xmm1 addpd %xmm5, %xmm1 movaps %xmm1, -6 * SIZE(Y) pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm5 addpd -4 * SIZE(Y), %xmm2 addpd %xmm5, %xmm2 movaps %xmm2, -4 * SIZE(Y) pshufd $0x4e, %xmm3, %xmm5 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm5 addpd -2 * SIZE(Y), %xmm3 addpd %xmm5, %xmm3 movaps %xmm3, -2 * SIZE(Y) subl $-16 * SIZE, X subl $-16 * SIZE, Y ALIGN_3 .L25: movl M, %eax andl $4, %eax jle .L26 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 movsd -14 * SIZE(X), %xmm1 movhps -13 * SIZE(X), %xmm1 pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm5 addpd -16 * SIZE(Y), %xmm0 addpd %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(Y) pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd -14 * SIZE(Y), %xmm1 addpd %xmm5, %xmm1 movaps %xmm1, -14 * SIZE(Y) movsd -12 * SIZE(X), %xmm2 movhps -11 * SIZE(X), %xmm2 movsd -10 * SIZE(X), %xmm3 movhps -9 * SIZE(X), %xmm3 pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm5 addpd -12 * SIZE(Y), %xmm2 addpd %xmm5, %xmm2 movaps %xmm2, -12 * SIZE(Y) pshufd $0x4e, %xmm3, %xmm5 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm5 addpd -10 * SIZE(Y), %xmm3 
addpd %xmm5, %xmm3 movaps %xmm3, -10 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L26: movl M, %eax andl $2, %eax jle .L27 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm5 addpd -16 * SIZE(Y), %xmm0 addpd %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(Y) movsd -14 * SIZE(X), %xmm1 movhps -13 * SIZE(X), %xmm1 pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd -14 * SIZE(Y), %xmm1 addpd %xmm5, %xmm1 movaps %xmm1, -14 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L27: movl M, %eax andl $1, %eax jle .L999 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm5 addpd -16 * SIZE(Y), %xmm0 addpd %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(Y) jmp .L999 ALIGN_3 .L30: testl $SIZE, X jne .L40 movaps -16 * SIZE(X), %xmm1 pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm1 xorps %xmm0, %xmm0 SHUFPD_1 %xmm1, %xmm0 xorps %xmm4, %xmm4 movhps -16 * SIZE(Y), %xmm4 addpd %xmm0, %xmm4 movhps %xmm4, -16 * SIZE(Y) movaps %xmm1, %xmm0 addl $2 * SIZE, X addl $1 * SIZE, Y decl M jle .L39 movl M, %eax sarl $3, %eax jle .L35 movaps -16 * SIZE(X), %xmm1 movaps -14 * SIZE(X), %xmm2 movaps -12 * SIZE(X), %xmm3 decl %eax jle .L32 ALIGN_3 .L31: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -10 * SIZE(X), %xmm0 pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movaps -8 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0x4e, %xmm3, %xmm5 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm3 SHUFPD_1 %xmm3, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) movaps -6 * SIZE(X), %xmm2 pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm0 SHUFPD_1 %xmm0, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps -4 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -8 * SIZE(Y), %xmm0 movaps %xmm0, -8 * SIZE(Y) movaps -2 * SIZE(X), %xmm0 pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -6 * SIZE(Y), %xmm1 movaps %xmm1, -6 * SIZE(Y) movaps 0 * SIZE(X), %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0x4e, %xmm3, %xmm5 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm3 SHUFPD_1 %xmm3, %xmm2 addpd -4 * SIZE(Y), %xmm2 movaps %xmm2, -4 * SIZE(Y) movaps 2 * SIZE(X), %xmm2 pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm0 SHUFPD_1 %xmm0, %xmm3 addpd -2 * SIZE(Y), %xmm3 movaps %xmm3, -2 * SIZE(Y) movaps 4 * SIZE(X), %xmm3 subl $-16 * SIZE, X subl $-16 * SIZE, Y decl %eax jg .L31 ALIGN_3 .L32: pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -10 * SIZE(X), %xmm0 pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm2 SHUFPD_1 
%xmm2, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movaps -8 * SIZE(X), %xmm1 pshufd $0x4e, %xmm3, %xmm5 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm3 SHUFPD_1 %xmm3, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) movaps -6 * SIZE(X), %xmm2 pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm0 SHUFPD_1 %xmm0, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps -4 * SIZE(X), %xmm3 pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -8 * SIZE(Y), %xmm0 movaps %xmm0, -8 * SIZE(Y) movaps -2 * SIZE(X), %xmm0 pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -6 * SIZE(Y), %xmm1 movaps %xmm1, -6 * SIZE(Y) pshufd $0x4e, %xmm3, %xmm5 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm3 SHUFPD_1 %xmm3, %xmm2 addpd -4 * SIZE(Y), %xmm2 movaps %xmm2, -4 * SIZE(Y) pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm0 SHUFPD_1 %xmm0, %xmm3 addpd -2 * SIZE(Y), %xmm3 movaps %xmm3, -2 * SIZE(Y) subl $-16 * SIZE, X subl $-16 * SIZE, Y ALIGN_3 .L35: movl M, %eax andl $4, %eax jle .L36 movaps -16 * SIZE(X), %xmm1 movaps -14 * SIZE(X), %xmm2 movaps -12 * SIZE(X), %xmm3 movaps -10 * SIZE(X), %xmm4 pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) pshufd $0x4e, %xmm3, %xmm5 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm3 SHUFPD_1 %xmm3, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) pshufd $0x4e, %xmm4, %xmm5 mulpd ALPHA_R, %xmm4 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm4 SHUFPD_1 %xmm4, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps %xmm4, %xmm0 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L36: movl M, %eax andl $2, %eax jle .L37 movaps -16 * SIZE(X), %xmm1 movaps -14 * SIZE(X), %xmm2 pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movaps %xmm2, %xmm0 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L37: movl M, %eax andl $1, %eax jle .L39 movaps -16 * SIZE(X), %xmm1 pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, %xmm0 addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L39: SHUFPD_1 %xmm0, %xmm0 addsd -16 * SIZE(Y), %xmm0 movlps %xmm0, -16 * SIZE(Y) jmp .L999 ALIGN_3 .L40: movsd -16 * SIZE(X), %xmm1 movhps -15 * SIZE(X), %xmm1 pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm1 xorps %xmm0, %xmm0 SHUFPD_1 %xmm1, %xmm0 xorps %xmm4, %xmm4 movhps -16 * SIZE(Y), %xmm4 addpd %xmm0, %xmm4 movhps %xmm4, -16 * SIZE(Y) movaps %xmm1, %xmm0 addl $2 * SIZE, X addl $1 * SIZE, Y decl M jle .L49 movl M, %eax sarl $3, %eax jle .L45 movsd -16 * SIZE(X), %xmm1 movhps -15 * SIZE(X), %xmm1 movsd -14 * SIZE(X), %xmm2 movhps -13 * SIZE(X), %xmm2 movsd -12 * SIZE(X), %xmm3 movhps -11 * SIZE(X), %xmm3 decl %eax jle .L42 
ALIGN_3 .L41: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movsd -10 * SIZE(X), %xmm0 movhps -9 * SIZE(X), %xmm0 pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movsd -8 * SIZE(X), %xmm1 movhps -7 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0x4e, %xmm3, %xmm5 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm3 SHUFPD_1 %xmm3, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) movsd -6 * SIZE(X), %xmm2 movhps -5 * SIZE(X), %xmm2 pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm0 SHUFPD_1 %xmm0, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) movsd -4 * SIZE(X), %xmm3 movhps -3 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -8 * SIZE(Y), %xmm0 movaps %xmm0, -8 * SIZE(Y) movsd -2 * SIZE(X), %xmm0 movhps -1 * SIZE(X), %xmm0 pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -6 * SIZE(Y), %xmm1 movaps %xmm1, -6 * SIZE(Y) movsd 0 * SIZE(X), %xmm1 movhps 1 * SIZE(X), %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0x4e, %xmm3, %xmm5 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm3 SHUFPD_1 %xmm3, %xmm2 addpd -4 * SIZE(Y), %xmm2 movaps %xmm2, -4 * SIZE(Y) movsd 2 * SIZE(X), %xmm2 movhps 3 * SIZE(X), %xmm2 pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm0 SHUFPD_1 %xmm0, %xmm3 addpd -2 * SIZE(Y), %xmm3 movaps %xmm3, -2 * SIZE(Y) movsd 4 * SIZE(X), %xmm3 movhps 5 * SIZE(X), %xmm3 subl $-16 * SIZE, X subl $-16 * SIZE, Y decl %eax jg .L41 ALIGN_3 .L42: pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movsd -10 * SIZE(X), %xmm0 movhps -9 * SIZE(X), %xmm0 pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movsd -8 * SIZE(X), %xmm1 movhps -7 * SIZE(X), %xmm1 pshufd $0x4e, %xmm3, %xmm5 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm3 SHUFPD_1 %xmm3, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) movsd -6 * SIZE(X), %xmm2 movhps -5 * SIZE(X), %xmm2 pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm0 SHUFPD_1 %xmm0, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) movsd -4 * SIZE(X), %xmm3 movhps -3 * SIZE(X), %xmm3 pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -8 * SIZE(Y), %xmm0 movaps %xmm0, -8 * SIZE(Y) movsd -2 * SIZE(X), %xmm0 movhps -1 * SIZE(X), %xmm0 pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -6 * SIZE(Y), %xmm1 movaps %xmm1, -6 * SIZE(Y) pshufd $0x4e, %xmm3, %xmm5 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm3 SHUFPD_1 %xmm3, %xmm2 addpd -4 * SIZE(Y), %xmm2 movaps %xmm2, -4 * 
SIZE(Y) pshufd $0x4e, %xmm0, %xmm5 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm0 SHUFPD_1 %xmm0, %xmm3 addpd -2 * SIZE(Y), %xmm3 movaps %xmm3, -2 * SIZE(Y) subl $-16 * SIZE, X subl $-16 * SIZE, Y ALIGN_3 .L45: movl M, %eax andl $4, %eax jle .L46 movsd -16 * SIZE(X), %xmm1 movhps -15 * SIZE(X), %xmm1 movsd -14 * SIZE(X), %xmm2 movhps -13 * SIZE(X), %xmm2 pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movsd -12 * SIZE(X), %xmm3 movhps -11 * SIZE(X), %xmm3 movsd -10 * SIZE(X), %xmm4 movhps -9 * SIZE(X), %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm3 SHUFPD_1 %xmm3, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) pshufd $0x4e, %xmm4, %xmm5 mulpd ALPHA_R, %xmm4 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm4 SHUFPD_1 %xmm4, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps %xmm4, %xmm0 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L46: movl M, %eax andl $2, %eax jle .L47 movsd -16 * SIZE(X), %xmm1 movhps -15 * SIZE(X), %xmm1 movsd -14 * SIZE(X), %xmm2 movhps -13 * SIZE(X), %xmm2 pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) pshufd $0x4e, %xmm2, %xmm5 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movaps %xmm2, %xmm0 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L47: movl M, %eax andl $1, %eax jle .L49 movsd -16 * SIZE(X), %xmm1 movhps -15 * SIZE(X), %xmm1 pshufd $0x4e, %xmm1, %xmm5 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm5 addpd %xmm5, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, %xmm0 addl $2 * SIZE, Y ALIGN_3 .L49: SHUFPD_1 %xmm0, %xmm0 addsd -16 * SIZE(Y), %xmm0 movlps %xmm0, -16 * SIZE(Y) jmp .L999 ALIGN_3 .L50: #ifndef CONJ movaps %xmm0, ALPHA_R pxor ALPHA_I, ALPHA_I subsd %xmm1, ALPHA_I unpcklpd ALPHA_R, ALPHA_I unpcklpd %xmm1, ALPHA_R #else movaps %xmm0, ALPHA_R movaps %xmm1, ALPHA_I pxor %xmm5, %xmm5 subsd %xmm0, %xmm5 unpcklpd %xmm5, ALPHA_I unpcklpd %xmm1, ALPHA_R #endif movl Y, YY movl M, %eax //If incx==0 || incy==0, avoid unloop and jump to end. 
cmpl $0, INCX je .L58 cmpl $0, INCY je .L58 sarl $2, %eax jle .L55 MOVDDUP( 0 * SIZE, X, %xmm0) MOVDDUP( 1 * SIZE, X, %xmm1) addl INCX, X MOVDDUP( 0 * SIZE, X, %xmm2) MOVDDUP( 1 * SIZE, X, %xmm3) addl INCX, X movsd 0 * SIZE(Y), %xmm4 movhpd 1 * SIZE(Y), %xmm4 addl INCY, Y movsd 0 * SIZE(Y), %xmm5 movhpd 1 * SIZE(Y), %xmm5 addl INCY, Y decl %eax jle .L52 ALIGN_3 .L51: mulpd ALPHA_R, %xmm0 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm1 mulpd ALPHA_I, %xmm3 addpd %xmm0, %xmm4 addpd %xmm2, %xmm5 addpd %xmm1, %xmm4 addpd %xmm3, %xmm5 movlpd %xmm4, 0 * SIZE(YY) movhpd %xmm4, 1 * SIZE(YY) addl INCY, YY movlpd %xmm5, 0 * SIZE(YY) movhpd %xmm5, 1 * SIZE(YY) addl INCY, YY MOVDDUP( 0 * SIZE, X, %xmm0) MOVDDUP( 1 * SIZE, X, %xmm1) addl INCX, X MOVDDUP( 0 * SIZE, X, %xmm2) MOVDDUP( 1 * SIZE, X, %xmm3) addl INCX, X movsd 0 * SIZE(Y), %xmm4 movhpd 1 * SIZE(Y), %xmm4 addl INCY, Y movsd 0 * SIZE(Y), %xmm5 movhpd 1 * SIZE(Y), %xmm5 addl INCY, Y mulpd ALPHA_R, %xmm0 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm1 mulpd ALPHA_I, %xmm3 addpd %xmm0, %xmm4 addpd %xmm2, %xmm5 addpd %xmm1, %xmm4 addpd %xmm3, %xmm5 movlpd %xmm4, 0 * SIZE(YY) movhpd %xmm4, 1 * SIZE(YY) addl INCY, YY movlpd %xmm5, 0 * SIZE(YY) movhpd %xmm5, 1 * SIZE(YY) addl INCY, YY MOVDDUP( 0 * SIZE, X, %xmm0) MOVDDUP( 1 * SIZE, X, %xmm1) addl INCX, X MOVDDUP( 0 * SIZE, X, %xmm2) MOVDDUP( 1 * SIZE, X, %xmm3) addl INCX, X movsd 0 * SIZE(Y), %xmm4 movhpd 1 * SIZE(Y), %xmm4 addl INCY, Y movsd 0 * SIZE(Y), %xmm5 movhpd 1 * SIZE(Y), %xmm5 addl INCY, Y decl %eax jg .L51 ALIGN_3 .L52: mulpd ALPHA_R, %xmm0 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm1 mulpd ALPHA_I, %xmm3 addpd %xmm0, %xmm4 addpd %xmm2, %xmm5 addpd %xmm1, %xmm4 addpd %xmm3, %xmm5 movlpd %xmm4, 0 * SIZE(YY) movhpd %xmm4, 1 * SIZE(YY) addl INCY, YY movlpd %xmm5, 0 * SIZE(YY) movhpd %xmm5, 1 * SIZE(YY) addl INCY, YY MOVDDUP( 0 * SIZE, X, %xmm0) MOVDDUP( 1 * SIZE, X, %xmm1) addl INCX, X MOVDDUP( 0 * SIZE, X, %xmm2) MOVDDUP( 1 * SIZE, X, %xmm3) addl INCX, X movsd 0 * SIZE(Y), %xmm4 movhpd 1 * SIZE(Y), %xmm4 addl INCY, Y movsd 0 * SIZE(Y), %xmm5 movhpd 1 * SIZE(Y), %xmm5 addl INCY, Y mulpd ALPHA_R, %xmm0 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm1 mulpd ALPHA_I, %xmm3 addpd %xmm0, %xmm4 addpd %xmm2, %xmm5 addpd %xmm1, %xmm4 addpd %xmm3, %xmm5 movlpd %xmm4, 0 * SIZE(YY) movhpd %xmm4, 1 * SIZE(YY) addl INCY, YY movlpd %xmm5, 0 * SIZE(YY) movhpd %xmm5, 1 * SIZE(YY) addl INCY, YY ALIGN_3 .L55: movl M, %eax andl $2, %eax jle .L57 MOVDDUP( 0 * SIZE, X, %xmm0) MOVDDUP( 1 * SIZE, X, %xmm1) addl INCX, X MOVDDUP( 0 * SIZE, X, %xmm2) MOVDDUP( 1 * SIZE, X, %xmm3) addl INCX, X movsd 0 * SIZE(Y), %xmm4 movhpd 1 * SIZE(Y), %xmm4 addl INCY, Y movsd 0 * SIZE(Y), %xmm5 movhpd 1 * SIZE(Y), %xmm5 addl INCY, Y mulpd ALPHA_R, %xmm0 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm1 mulpd ALPHA_I, %xmm3 addpd %xmm0, %xmm4 addpd %xmm2, %xmm5 addpd %xmm1, %xmm4 addpd %xmm3, %xmm5 movlpd %xmm4, 0 * SIZE(YY) movhpd %xmm4, 1 * SIZE(YY) addl INCY, YY movlpd %xmm5, 0 * SIZE(YY) movhpd %xmm5, 1 * SIZE(YY) addl INCY, YY ALIGN_3 .L57: movl M, %eax andl $1, %eax jle .L999 .L58: MOVDDUP( 0 * SIZE, X, %xmm0) MOVDDUP( 1 * SIZE, X, %xmm1) movsd 0 * SIZE(Y), %xmm4 movhpd 1 * SIZE(Y), %xmm4 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm1 addpd %xmm0, %xmm4 addpd %xmm1, %xmm4 movlpd %xmm4, 0 * SIZE(YY) movhpd %xmm4, 1 * SIZE(YY) decl %eax jg .L58 ALIGN_3 .L999: popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE 
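/* Added note (descriptive only): the kernel above appears to implement the
   double-precision complex AXPY update y(i) += alpha * x(i), or
   y(i) += alpha * conj(x(i)) when CONJ is defined. ALPHA_R and ALPHA_I are
   packed so that one mulpd against the (re, im) pair and one against the
   swapped pair produce the complex product; the .L50 path handles non-unit
   (or zero) INCX/INCY strides. An illustrative scalar sketch of the same
   update, assuming interleaved re/im storage (not part of the build):

       for (i = 0; i < m; i++) {
           double xr = x[2 * i], xi = x[2 * i + 1];
           y[2 * i]     += alpha_r * xr - alpha_i * xi;
           y[2 * i + 1] += alpha_r * xi + alpha_i * xr;
       }
*/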
OpenBLAS-0.2.20/kernel/x86/zcopy.S000066400000000000000000000131031313527062700163370ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define M 4 + STACK + ARGS(%esp) #define X 8 + STACK + ARGS(%esp) #define INCX 12 + STACK + ARGS(%esp) #define Y 16 + STACK + ARGS(%esp) #define INCY 20 + STACK + ARGS(%esp) PROLOGUE pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif movl M, %ebx movl X, %ecx movl INCX, %esi movl Y, %edx movl INCY, %edi testl %ebx, %ebx # if m == 0 goto End jle .L999 sall $ZBASE_SHIFT, %esi sall $ZBASE_SHIFT, %edi cmpl $2 * SIZE, %esi # if incx != 1 jne .L100 cmpl $2 * SIZE, %edi # if incy != 1 jne .L100 movl %ebx, %eax # i = m sarl $2, %eax jle .L20 ALIGN_2 .L11: #if defined(DOUBLE) || defined(XDOUBLE) FLD 7 * SIZE(%ecx) FLD 6 * SIZE(%ecx) FLD 5 * SIZE(%ecx) FLD 4 * SIZE(%ecx) FLD 3 * SIZE(%ecx) FLD 2 * SIZE(%ecx) FLD 1 * SIZE(%ecx) FLD 0 * SIZE(%ecx) FST 0 * SIZE(%edx) FST 1 * SIZE(%edx) FST 2 * SIZE(%edx) FST 3 * SIZE(%edx) FST 4 * SIZE(%edx) FST 5 * SIZE(%edx) FST 6 * SIZE(%edx) FST 7 * SIZE(%edx) #else fldl 6 * SIZE(%ecx) fldl 4 * SIZE(%ecx) fldl 2 * SIZE(%ecx) fldl 0 * SIZE(%ecx) fstpl 0 * SIZE(%edx) fstpl 2 * SIZE(%edx) fstpl 4 * SIZE(%edx) fstpl 6 * SIZE(%edx) #endif addl $8 * SIZE, %ecx addl $8 * SIZE, %edx decl %eax jg .L11 ALIGN_2 .L20: movl %ebx, %eax # i = m andl $3, %eax jle .L99 ALIGN_2 .L21: #if defined(DOUBLE) || defined(XDOUBLE) FLD 1 * SIZE(%ecx) FLD 0 * SIZE(%ecx) FST 0 * SIZE(%edx) FST 1 * SIZE(%edx) #else fldl 0 * SIZE(%ecx) fstpl 0 * SIZE(%edx) #endif addl $2 * SIZE, %ecx addl $2 * SIZE, %edx decl %eax jg .L21 .L99: xorl %eax,%eax popl %ebx popl %esi popl %edi ret ALIGN_3 .L100: movl %ebx, %eax sarl $2, %eax jle .L120 ALIGN_2 .L111: #if defined(DOUBLE) || defined(XDOUBLE) FLD 0 * SIZE(%ecx) FLD 1 * SIZE(%ecx) addl %esi, %ecx FLD 0 * SIZE(%ecx) FLD 1 * SIZE(%ecx) addl %esi, %ecx FLD 0 * SIZE(%ecx) FLD 1 * SIZE(%ecx) addl %esi, %ecx FLD 0 * SIZE(%ecx) FLD 1 * SIZE(%ecx) addl %esi, %ecx fxch %st(7) FST 0 * SIZE(%edx) fxch %st(5) FST 1 * SIZE(%edx) addl %edi, %edx fxch %st(3) FST 0 * SIZE(%edx) fxch %st(1) FST 1 * SIZE(%edx) addl %edi, %edx FST 0 * SIZE(%edx) FST 1 * SIZE(%edx) addl %edi, %edx FST 0 * SIZE(%edx) FST 1 * SIZE(%edx) addl %edi, %edx #else fldl 0 * SIZE(%ecx) addl %esi, %ecx fldl 0 * SIZE(%ecx) addl %esi, %ecx fldl 0 * SIZE(%ecx) addl %esi, %ecx fldl 0 * SIZE(%ecx) addl %esi, %ecx fxch %st(3) fstpl 0 * SIZE(%edx) addl %edi, %edx fxch %st(1) fstpl 0 * SIZE(%edx) addl %edi, %edx fstpl 0 * SIZE(%edx) addl %edi, %edx fstpl 0 * SIZE(%edx) addl %edi, %edx #endif decl %eax jg .L111 .L120: movl %ebx, %eax andl $3, %eax jle .L999 ALIGN_2 .L121: FLD 0 * SIZE(%ecx) FLD 1 * SIZE(%ecx) addl %esi, %ecx fxch %st(1) FST 0 * SIZE(%edx) FST 1 * SIZE(%edx) addl %edi, %edx decl %eax jg .L121 .L999: xorl %eax,%eax popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zcopy_sse.S000066400000000000000000000434461313527062700172260ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) #define M %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx #include "l1param.h" #ifdef OPTERON #define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG #else #define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG #endif PROLOGUE PROFCODE pushl %edi pushl %esi pushl %ebx movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY sall $ZBASE_SHIFT, INCX sall $ZBASE_SHIFT, INCY cmpl $2 * SIZE, INCX jne .L100 cmpl $2 * SIZE, INCY jne .L100 cmpl $3, M jle .L106 subl $-32 * SIZE, X subl $-32 * SIZE, Y addl M, M testl $SIZE, Y je .L05 movss -32 * SIZE(X), %xmm0 movss %xmm0, -32 * SIZE(Y) addl $1 * SIZE, X addl $1 * SIZE, Y decl M ALIGN_4 .L05: testl $2 * SIZE, Y je .L10 movsd -32 * SIZE(X), %xmm0 movlps %xmm0, -32 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y subl $2, M jle .L19 ALIGN_4 .L10: testl $3 * SIZE, X jne .L20 movl M, %eax sarl $5, %eax jle .L13 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 movaps -16 * SIZE(X), %xmm4 movaps -12 * SIZE(X), %xmm5 movaps -8 * SIZE(X), %xmm6 movaps -4 * SIZE(X), %xmm7 decl %eax jle .L12 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps %xmm0, -32 * SIZE(Y) LOAD( 0 * SIZE, X, %xmm0) movaps %xmm1, -28 * SIZE(Y) LOAD( 4 * SIZE, X, %xmm1) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm2, -24 * SIZE(Y) LOAD( 8 * SIZE, X, %xmm2) movaps %xmm3, -20 * SIZE(Y) LOAD(12 * SIZE, X, %xmm3) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps %xmm4,-16 * SIZE(Y) LOAD(16 * SIZE, X, %xmm4) movaps %xmm5,-12 * SIZE(Y) LOAD(20 * SIZE, X, %xmm5) #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm6, -8 * SIZE(Y) LOAD(24 
* SIZE, X, %xmm6) movaps %xmm7, -4 * SIZE(Y) LOAD(28 * SIZE, X, %xmm7) subl $-32 * SIZE, Y subl $-32 * SIZE, X decl %eax jg .L11 ALIGN_3 .L12: movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, -24 * SIZE(Y) movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, -16 * SIZE(Y) movaps %xmm5, -12 * SIZE(Y) movaps %xmm6, -8 * SIZE(Y) movaps %xmm7, -4 * SIZE(Y) subl $-32 * SIZE, Y subl $-32 * SIZE, X ALIGN_3 .L13: testl $16, M jle .L14 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, -24 * SIZE(Y) movaps %xmm3, -20 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L14: testl $8, M jle .L15 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L15: testl $4, M jle .L16 movaps -32 * SIZE(X), %xmm0 movaps %xmm0, -32 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L16: testl $2, M jle .L17 movsd -32 * SIZE(X), %xmm0 movlps %xmm0, -32 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L17: testl $1, M jle .L19 movss -32 * SIZE(X), %xmm0 movss %xmm0, -32 * SIZE(Y) ALIGN_3 .L19: popl %ebx popl %esi popl %edi ret ALIGN_3 .L20: testl $SIZE, X jne .L30 movhps -32 * SIZE(X), %xmm0 movl M, %eax sarl $5, %eax jle .L23 movaps -30 * SIZE(X), %xmm1 movaps -26 * SIZE(X), %xmm2 movaps -22 * SIZE(X), %xmm3 movaps -18 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -10 * SIZE(X), %xmm6 movaps -6 * SIZE(X), %xmm7 decl %eax jle .L22 ALIGN_4 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif shufps $0x4e, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -2 * SIZE(X), %xmm0 shufps $0x4e, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps 2 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif shufps $0x4e, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps 6 * SIZE(X), %xmm2 shufps $0x4e, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps 10 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif shufps $0x4e, %xmm5, %xmm4 movaps %xmm4, -16 * SIZE(Y) movaps 14 * SIZE(X), %xmm4 shufps $0x4e, %xmm6, %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 18 * SIZE(X), %xmm5 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif shufps $0x4e, %xmm7, %xmm6 movaps %xmm6, -8 * SIZE(Y) movaps 22 * SIZE(X), %xmm6 shufps $0x4e, %xmm0, %xmm7 movaps %xmm7, -4 * SIZE(Y) movaps 26 * SIZE(X), %xmm7 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L22: shufps $0x4e, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -2 * SIZE(X), %xmm0 shufps $0x4e, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) shufps $0x4e, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) shufps $0x4e, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) shufps $0x4e, %xmm5, %xmm4 movaps %xmm4, -16 * SIZE(Y) shufps $0x4e, %xmm6, %xmm5 movaps %xmm5, -12 * SIZE(Y) shufps $0x4e, %xmm7, %xmm6 movaps %xmm6, -8 * SIZE(Y) shufps $0x4e, %xmm0, %xmm7 movaps %xmm7, -4 * SIZE(Y) subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L23: testl $16, M jle .L24 ALIGN_3 movaps -30 * SIZE(X), %xmm1 movaps -26 * SIZE(X), %xmm2 movaps -22 * SIZE(X), %xmm3 movaps -18 * SIZE(X), %xmm4 shufps $0x4e, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) shufps $0x4e, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) shufps $0x4e, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) shufps $0x4e, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, 
%xmm0 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L24: testl $8, M jle .L25 ALIGN_3 movaps -30 * SIZE(X), %xmm1 movaps -26 * SIZE(X), %xmm2 shufps $0x4e, %xmm1, %xmm0 shufps $0x4e, %xmm2, %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L25: testl $4, M jle .L26 ALIGN_3 movaps -30 * SIZE(X), %xmm1 shufps $0x4e, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L26: testl $2, M jle .L27 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd %xmm0, -32 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L27: testl $1, M jle .L29 ALIGN_3 movss -32 * SIZE(X), %xmm0 movss %xmm0, -32 * SIZE(Y) addl $SIZE, Y ALIGN_3 .L29: popl %ebx popl %esi popl %edi ret ALIGN_3 .L30: testl $2 * SIZE, X jne .L40 movaps -33 * SIZE(X), %xmm0 movl M, %eax sarl $5, %eax jle .L33 movaps -29 * SIZE(X), %xmm1 movaps -25 * SIZE(X), %xmm2 movaps -21 * SIZE(X), %xmm3 movaps -17 * SIZE(X), %xmm4 movaps -13 * SIZE(X), %xmm5 movaps -9 * SIZE(X), %xmm6 movaps -5 * SIZE(X), %xmm7 decl %eax jle .L32 ALIGN_4 .L31: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm1, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -1 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps 3 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm3, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps 7 * SIZE(X), %xmm2 movss %xmm4, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps 11 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 movaps %xmm4, -16 * SIZE(Y) movaps 15 * SIZE(X), %xmm4 movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 19 * SIZE(X), %xmm5 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm7, %xmm6 shufps $0x39, %xmm6, %xmm6 movaps %xmm6, -8 * SIZE(Y) movaps 23 * SIZE(X), %xmm6 movss %xmm0, %xmm7 shufps $0x39, %xmm7, %xmm7 movaps %xmm7, -4 * SIZE(Y) movaps 27 * SIZE(X), %xmm7 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L31 ALIGN_3 .L32: movss %xmm1, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -1 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -28 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -20 * SIZE(Y) movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 movaps %xmm4, -16 * SIZE(Y) movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 movaps %xmm5, -12 * SIZE(Y) movss %xmm7, %xmm6 shufps $0x39, %xmm6, %xmm6 movaps %xmm6, -8 * SIZE(Y) movss %xmm0, %xmm7 shufps $0x39, %xmm7, %xmm7 movaps %xmm7, -4 * SIZE(Y) subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L33: testl $16, M jle .L34 ALIGN_3 movaps -29 * SIZE(X), %xmm1 movaps -25 * SIZE(X), %xmm2 movaps -21 * SIZE(X), %xmm3 movaps -17 * SIZE(X), %xmm4 movss %xmm1, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -28 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, %xmm0 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L34: testl $8, M jle .L35 ALIGN_3 movaps -29 * SIZE(X), %xmm1 movaps -25 * 
SIZE(X), %xmm2 movss %xmm1, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L35: testl $4, M jle .L36 ALIGN_3 movaps -29 * SIZE(X), %xmm1 movss %xmm1, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L36: testl $2, M jle .L37 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd %xmm0, -32 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L37: testl $1, M jle .L39 ALIGN_3 movss -32 * SIZE(X), %xmm0 movss %xmm0, -32 * SIZE(Y) addl $SIZE, Y ALIGN_3 .L39: popl %ebx popl %esi popl %edi ret ALIGN_3 .L40: movaps -35 * SIZE(X), %xmm0 movl M, %eax sarl $5, %eax jle .L43 movaps -31 * SIZE(X), %xmm1 movaps -27 * SIZE(X), %xmm2 movaps -23 * SIZE(X), %xmm3 movaps -19 * SIZE(X), %xmm4 movaps -15 * SIZE(X), %xmm5 movaps -11 * SIZE(X), %xmm6 movaps -7 * SIZE(X), %xmm7 decl %eax jle .L42 ALIGN_4 .L41: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -3 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps 1 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps 5 * SIZE(X), %xmm2 movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps 9 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 movaps %xmm4, -16 * SIZE(Y) movaps 13 * SIZE(X), %xmm4 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 17 * SIZE(X), %xmm5 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 movaps %xmm6, -8 * SIZE(Y) movaps 21 * SIZE(X), %xmm6 movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 movaps %xmm7, -4 * SIZE(Y) movaps 25 * SIZE(X), %xmm7 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L41 ALIGN_3 .L42: movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -3 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 movaps %xmm4, -16 * SIZE(Y) movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 movaps %xmm5, -12 * SIZE(Y) movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 movaps %xmm6, -8 * SIZE(Y) movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 movaps %xmm7, -4 * SIZE(Y) subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L43: testl $16, M jle .L44 ALIGN_3 movaps -31 * SIZE(X), %xmm1 movaps -27 * SIZE(X), %xmm2 movaps -23 * SIZE(X), %xmm3 movaps -19 * SIZE(X), %xmm4 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, %xmm0 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L44: testl $8, M jle .L45 ALIGN_3 movaps -31 * SIZE(X), %xmm1 movaps -27 * SIZE(X), %xmm2 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, 
%xmm1 shufps $0x93, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L45: testl $4, M jle .L46 ALIGN_3 movaps -31 * SIZE(X), %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L46: testl $2, M jle .L47 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd %xmm0, -32 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L47: testl $1, M jle .L49 ALIGN_3 movss -32 * SIZE(X), %xmm0 movss %xmm0, -32 * SIZE(Y) addl $SIZE, Y ALIGN_3 .L49: popl %ebx popl %esi popl %edi ret ALIGN_4 .L100: movl M, %eax sarl $3, %eax jle .L105 ALIGN_3 .L102: movsd (X), %xmm0 addl INCX, X movhps (X), %xmm0 addl INCX, X movsd (X), %xmm1 addl INCX, X movhps (X), %xmm1 addl INCX, X movsd (X), %xmm2 addl INCX, X movhps (X), %xmm2 addl INCX, X movsd (X), %xmm3 addl INCX, X movhps (X), %xmm3 addl INCX, X movsd %xmm0, (Y) addl INCY, Y movhps %xmm0, (Y) addl INCY, Y movsd %xmm1, (Y) addl INCY, Y movhps %xmm1, (Y) addl INCY, Y movsd %xmm2, (Y) addl INCY, Y movhps %xmm2, (Y) addl INCY, Y movsd %xmm3, (Y) addl INCY, Y movhps %xmm3, (Y) addl INCY, Y decl %eax jg .L102 ALIGN_3 .L105: testl $4, M jle .L106 movsd (X), %xmm0 addl INCX, X movhps (X), %xmm0 addl INCX, X movsd (X), %xmm1 addl INCX, X movhps (X), %xmm1 addl INCX, X movsd %xmm0, (Y) addl INCY, Y movhps %xmm0, (Y) addl INCY, Y movsd %xmm1, (Y) addl INCY, Y movhps %xmm1, (Y) addl INCY, Y ALIGN_3 .L106: testl $2, M jle .L107 movsd (X), %xmm0 addl INCX, X movhps (X), %xmm0 addl INCX, X movsd %xmm0, (Y) addl INCY, Y movhps %xmm0, (Y) addl INCY, Y ALIGN_3 .L107: testl $1, M jle .L999 movsd (X), %xmm0 movsd %xmm0, (Y) ALIGN_3 .L999: popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zcopy_sse2.S000066400000000000000000000314501313527062700173000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) #define M %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx #define xmm8 xmm0 #define xmm9 xmm1 #define xmm10 xmm2 #define xmm11 xmm3 #define xmm12 xmm4 #define xmm13 xmm5 #define xmm14 xmm6 #define xmm15 xmm7 #include "l1param.h" #ifdef OPTERON #define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG #else #define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG #endif PROLOGUE PROFCODE pushl %edi pushl %esi pushl %ebx movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY sall $ZBASE_SHIFT, INCX sall $ZBASE_SHIFT, INCY cmpl $2 * SIZE, INCX jne .L50 cmpl $2 * SIZE, INCY jne .L50 addl M, M #ifdef ALIGNED_ACCESS testl $SIZE, Y #else testl $SIZE, X #endif je .L10 movsd (X), %xmm0 movsd %xmm0, (Y) addl $1 * SIZE, X addl $1 * SIZE, Y decl M jle .L19 ALIGN_4 .L10: subl $-16 * SIZE, X subl $-16 * SIZE, Y #ifdef ALIGNED_ACCESS testl $SIZE, X #else testl $SIZE, Y #endif jne .L20 movl M, %eax sarl $4, %eax jle .L13 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 movaps -8 * SIZE(X), %xmm4 movaps -6 * SIZE(X), %xmm5 movaps -4 * SIZE(X), %xmm6 movaps -2 * SIZE(X), %xmm7 decl %eax jle .L12 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps %xmm0, -16 * SIZE(Y) LOAD( 0 * SIZE, X, %xmm0) movaps %xmm1, -14 * SIZE(Y) LOAD( 2 * SIZE, X, %xmm1) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm2, -12 * SIZE(Y) LOAD( 4 * SIZE, X, %xmm2) movaps %xmm3, -10 * SIZE(Y) LOAD( 6 * SIZE, X, %xmm3) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps %xmm4, -8 * SIZE(Y) LOAD( 8 * SIZE, X, %xmm4) movaps %xmm5, -6 * SIZE(Y) LOAD(10 * SIZE, X, %xmm5) #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm6, -4 * SIZE(Y) LOAD(12 * SIZE, X, %xmm6) movaps %xmm7, -2 * SIZE(Y) LOAD(14 * SIZE, X, %xmm7) subl $-16 * SIZE, Y subl $-16 * SIZE, X decl %eax jg .L11 ALIGN_3 .L12: movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) movaps %xmm2, -12 * SIZE(Y) movaps %xmm3, -10 * SIZE(Y) movaps %xmm4, -8 * SIZE(Y) movaps %xmm5, -6 * SIZE(Y) movaps %xmm6, -4 * SIZE(Y) movaps %xmm7, -2 * SIZE(Y) subl $-16 * SIZE, Y subl $-16 * SIZE, X ALIGN_3 .L13: testl $8, M jle .L14 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) movaps %xmm2, -12 * SIZE(Y) movaps %xmm3, -10 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L14: testl $4, M jle .L15 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L15: testl $2, M jle .L16 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movaps %xmm0, -16 * SIZE(Y) addl $2 * SIZE, X addl 
$2 * SIZE, Y ALIGN_3 .L16: testl $1, M jle .L19 ALIGN_3 movsd -16 * SIZE(X), %xmm0 movsd %xmm0, -16 * SIZE(Y) ALIGN_3 .L19: popl %ebx popl %esi popl %edi ret ALIGN_3 .L20: #ifdef ALIGNED_ACCESS movhps -16 * SIZE(X), %xmm0 movl M, %eax sarl $4, %eax jle .L23 movaps -15 * SIZE(X), %xmm1 movaps -13 * SIZE(X), %xmm2 movaps -11 * SIZE(X), %xmm3 movaps -9 * SIZE(X), %xmm4 movaps -7 * SIZE(X), %xmm5 movaps -5 * SIZE(X), %xmm6 movaps -3 * SIZE(X), %xmm7 decl %eax jle .L22 ALIGN_4 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif SHUFPD_1 %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(Y) LOAD(-1 * SIZE, X, %xmm0) SHUFPD_1 %xmm2, %xmm1 movaps %xmm1, -14 * SIZE(Y) LOAD( 1 * SIZE, X, %xmm1) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif SHUFPD_1 %xmm3, %xmm2 movaps %xmm2, -12 * SIZE(Y) LOAD( 3 * SIZE, X, %xmm2) SHUFPD_1 %xmm4, %xmm3 movaps %xmm3, -10 * SIZE(Y) LOAD( 5 * SIZE, X, %xmm3) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif SHUFPD_1 %xmm5, %xmm4 movaps %xmm4, -8 * SIZE(Y) LOAD( 7 * SIZE, X, %xmm4) SHUFPD_1 %xmm6, %xmm5 movaps %xmm5, -6 * SIZE(Y) LOAD( 9 * SIZE, X, %xmm5) #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif SHUFPD_1 %xmm7, %xmm6 movaps %xmm6, -4 * SIZE(Y) LOAD(11 * SIZE, X, %xmm6) SHUFPD_1 %xmm0, %xmm7 movaps %xmm7, -2 * SIZE(Y) LOAD(13 * SIZE, X, %xmm7) subl $-16 * SIZE, X subl $-16 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L22: SHUFPD_1 %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(Y) LOAD(-1 * SIZE, X, %xmm0) SHUFPD_1 %xmm2, %xmm1 movaps %xmm1, -14 * SIZE(Y) SHUFPD_1 %xmm3, %xmm2 movaps %xmm2, -12 * SIZE(Y) SHUFPD_1 %xmm4, %xmm3 movaps %xmm3, -10 * SIZE(Y) SHUFPD_1 %xmm5, %xmm4 movaps %xmm4, -8 * SIZE(Y) SHUFPD_1 %xmm6, %xmm5 movaps %xmm5, -6 * SIZE(Y) SHUFPD_1 %xmm7, %xmm6 movaps %xmm6, -4 * SIZE(Y) SHUFPD_1 %xmm0, %xmm7 movaps %xmm7, -2 * SIZE(Y) subl $-16 * SIZE, X subl $-16 * SIZE, Y ALIGN_3 .L23: testl $8, M jle .L24 ALIGN_3 movaps -15 * SIZE(X), %xmm1 movaps -13 * SIZE(X), %xmm2 movaps -11 * SIZE(X), %xmm3 movaps -9 * SIZE(X), %xmm4 SHUFPD_1 %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm2, %xmm1 movaps %xmm1, -14 * SIZE(Y) SHUFPD_1 %xmm3, %xmm2 movaps %xmm2, -12 * SIZE(Y) SHUFPD_1 %xmm4, %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps %xmm4, %xmm0 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L24: testl $4, M jle .L25 ALIGN_3 movaps -15 * SIZE(X), %xmm1 movaps -13 * SIZE(X), %xmm2 SHUFPD_1 %xmm1, %xmm0 SHUFPD_1 %xmm2, %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) movaps %xmm2, %xmm0 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L25: testl $2, M jle .L26 ALIGN_3 movaps -15 * SIZE(X), %xmm1 SHUFPD_1 %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L26: testl $1, M jle .L29 ALIGN_3 movsd -16 * SIZE(X), %xmm0 movsd %xmm0, -16 * SIZE(Y) ALIGN_3 .L29: popl %ebx popl %esi popl %edi ret ALIGN_3 #else movl M, %eax sarl $4, %eax jle .L23 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 movaps -8 * SIZE(X), %xmm4 movaps -6 * SIZE(X), %xmm5 movaps -4 * SIZE(X), %xmm6 movaps -2 * SIZE(X), %xmm7 decl %eax jle .L22 ALIGN_3 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) LOAD( 0 * SIZE, X, %xmm0) movlps %xmm1, -14 * SIZE(Y) movhps %xmm1, -13 * SIZE(Y) LOAD( 2 * SIZE, X, %xmm1) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movlps %xmm2, -12 * SIZE(Y) movhps 
%xmm2, -11 * SIZE(Y) LOAD( 4 * SIZE, X, %xmm2) movlps %xmm3, -10 * SIZE(Y) movhps %xmm3, -9 * SIZE(Y) LOAD( 6 * SIZE, X, %xmm3) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movlps %xmm4, -8 * SIZE(Y) movhps %xmm4, -7 * SIZE(Y) LOAD( 8 * SIZE, X, %xmm4) movlps %xmm5, -6 * SIZE(Y) movhps %xmm5, -5 * SIZE(Y) LOAD(10 * SIZE, X, %xmm5) #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movlps %xmm6, -4 * SIZE(Y) movhps %xmm6, -3 * SIZE(Y) LOAD(12 * SIZE, X, %xmm6) movlps %xmm7, -2 * SIZE(Y) movhps %xmm7, -1 * SIZE(Y) LOAD(14 * SIZE, X, %xmm7) subl $-16 * SIZE, Y subl $-16 * SIZE, X decl %eax jg .L21 ALIGN_3 .L22: movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) movlps %xmm1, -14 * SIZE(Y) movhps %xmm1, -13 * SIZE(Y) movlps %xmm2, -12 * SIZE(Y) movhps %xmm2, -11 * SIZE(Y) movlps %xmm3, -10 * SIZE(Y) movhps %xmm3, -9 * SIZE(Y) movlps %xmm4, -8 * SIZE(Y) movhps %xmm4, -7 * SIZE(Y) movlps %xmm5, -6 * SIZE(Y) movhps %xmm5, -5 * SIZE(Y) movlps %xmm6, -4 * SIZE(Y) movhps %xmm6, -3 * SIZE(Y) movlps %xmm7, -2 * SIZE(Y) movhps %xmm7, -1 * SIZE(Y) subl $-16 * SIZE, Y subl $-16 * SIZE, X ALIGN_3 .L23: testl $8, M jle .L24 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) movaps -14 * SIZE(X), %xmm1 movlps %xmm1, -14 * SIZE(Y) movhps %xmm1, -13 * SIZE(Y) movaps -12 * SIZE(X), %xmm2 movlps %xmm2, -12 * SIZE(Y) movhps %xmm2, -11 * SIZE(Y) movaps -10 * SIZE(X), %xmm3 movlps %xmm3, -10 * SIZE(Y) movhps %xmm3, -9 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L24: testl $4, M jle .L25 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) movaps -14 * SIZE(X), %xmm1 movlps %xmm1, -14 * SIZE(Y) movhps %xmm1, -13 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L25: testl $2, M jle .L26 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L26: testl $1, M jle .L29 ALIGN_3 movsd -16 * SIZE(X), %xmm0 movsd %xmm0, -16 * SIZE(Y) ALIGN_3 .L29: popl %ebx popl %esi popl %edi ret ALIGN_3 #endif .L50: movl M, %eax sarl $2, %eax jle .L55 ALIGN_3 .L51: movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 addl INCX, X movsd 0 * SIZE(X), %xmm1 movhps 1 * SIZE(X), %xmm1 addl INCX, X movsd 0 * SIZE(X), %xmm2 movhps 1 * SIZE(X), %xmm2 addl INCX, X movsd 0 * SIZE(X), %xmm3 movhps 1 * SIZE(X), %xmm3 addl INCX, X movlps %xmm0, 0 * SIZE(Y) movhps %xmm0, 1 * SIZE(Y) addl INCY, Y movlps %xmm1, 0 * SIZE(Y) movhps %xmm1, 1 * SIZE(Y) addl INCY, Y movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 1 * SIZE(Y) addl INCY, Y movlps %xmm3, 0 * SIZE(Y) movhps %xmm3, 1 * SIZE(Y) addl INCY, Y decl %eax jg .L51 ALIGN_3 .L55: movl M, %eax andl $3, %eax jle .L57 ALIGN_3 .L56: movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 addl INCX, X movlps %xmm0, 0 * SIZE(Y) movhps %xmm0, 1 * SIZE(Y) addl INCY, Y decl %eax jg .L56 ALIGN_3 .L57: popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zdot.S000066400000000000000000000146701313527062700161650ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #if defined(DOUBLE) || defined(XDOUBLE) #define RESULT 4 + STACK + ARGS(%esp) #define STACK_N 8 + STACK + ARGS(%esp) #define STACK_X 12 + STACK + ARGS(%esp) #define STACK_INCX 16 + STACK + ARGS(%esp) #define STACK_Y 20 + STACK + ARGS(%esp) #define STACK_INCY 24 + STACK + ARGS(%esp) #else #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) #endif #define N %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx #include "l1param.h" PROLOGUE PROFCODE pushl %edi pushl %esi pushl %ebx #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif movl STACK_N, N movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY #ifdef F_INTERFACE movl (N),N movl (INCX),INCX movl (INCY),INCY #endif #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif testl N, N jle .L88 addl INCX, INCX fldz addl INCY, INCY fldz leal (, INCX, SIZE), INCX fldz leal (, INCY, SIZE), INCY fldz cmpl $2 * SIZE, INCX jne .L14 cmpl $2 * SIZE, INCY jne .L14 movl N, %eax sarl $1, %eax jle .L15 ALIGN_3 .L16: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(Y) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1), %st faddp %st, %st(4) FMUL 1 * SIZE(Y) faddp %st, %st(4) FLD 2 * SIZE(X) FLD 2 * SIZE(Y) fmul %st(1), %st faddp %st, %st(2) FMUL 3 * SIZE(Y) faddp %st, %st(2) FLD 3 * SIZE(X) FLD 2 * SIZE(Y) fmul %st(1), %st faddp %st, %st(4) FMUL 3 * SIZE(Y) faddp %st, %st(4) addl $4 * SIZE, X addl $4 * SIZE, Y decl %eax jg .L16 ALIGN_3 .L15: movl N, %eax andl $1, %eax jle .L27 ALIGN_3 .L22: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(Y) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1), %st faddp %st, %st(4) FMUL 1 * SIZE(Y) faddp %st, %st(4) 
jmp .L27 ALIGN_3 .L14: movl N, %eax sarl $1, %eax jle .L30 ALIGN_3 .L31: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(Y) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1), %st faddp %st, %st(4) FMUL 1 * SIZE(Y) faddp %st, %st(4) addl INCX, X FLD 0 * SIZE(X) addl INCY, Y FLD 0 * SIZE(Y) fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(Y) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1), %st faddp %st, %st(4) FMUL 1 * SIZE(Y) faddp %st, %st(4) addl INCX, X addl INCY, Y decl %eax jg .L31 ALIGN_3 .L30: movl N, %eax andl $1, %eax jle .L27 ALIGN_3 .L37: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(Y) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1), %st faddp %st, %st(4) FMUL 1 * SIZE(Y) faddp %st, %st(4) ALIGN_3 .L27: #if defined(DOUBLE) || defined(XDOUBLE) movl RESULT, %eax #endif #ifndef CONJ fsubp %st, %st(3) faddp %st, %st(1) #else faddp %st, %st(3) fsubp %st, %st(1) #endif #if !defined(DOUBLE) && !defined(XDOUBLE) subl $2 * SIZE, %esp FST 1 * SIZE(%esp) FST 0 * SIZE(%esp) movl 0 * SIZE(%esp), %eax movl 1 * SIZE(%esp), %edx addl $2 * SIZE, %esp #else FST 1 * SIZE(%eax) FST 0 * SIZE(%eax) #endif popl %ebx popl %esi popl %edi #if defined(DOUBLE) || defined(XDOUBLE) ret $0x4 #else ret #endif ALIGN_3 .L88: #if defined(DOUBLE) || defined(XDOUBLE) movl RESULT, %eax #endif fldz fldz #if !defined(DOUBLE) && !defined(XDOUBLE) xor %eax, %eax xor %edx, %edx #else FST 1 * SIZE(%eax) FST 0 * SIZE(%eax) #endif popl %ebx popl %esi popl %edi #if defined(DOUBLE) || defined(XDOUBLE) ret $0x4 #else ret #endif EPILOGUE OpenBLAS-0.2.20/kernel/x86/zdot_amd.S000066400000000000000000000161111313527062700167760ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #if !defined(DOUBLE) && !defined(XDOUBLE) #define RESULT 4 + STACK + ARGS(%esp) #define STACK_N 8 + STACK + ARGS(%esp) #define STACK_X 12 + STACK + ARGS(%esp) #define STACK_INCX 16 + STACK + ARGS(%esp) #define STACK_Y 20 + STACK + ARGS(%esp) #define STACK_INCY 24 + STACK + ARGS(%esp) #else #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) #endif PROLOGUE pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif #define N %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx movl STACK_N, N movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY #if defined(F_INTERFACE) movl (N),N movl (INCX),INCX movl (INCY),INCY #endif testl N, N jle .L88 fldz fldz fldz fldz addl INCX, INCX addl INCY, INCY leal (, INCX, SIZE), INCX leal (, INCY, SIZE), INCY cmpl $2 * SIZE, INCX jne .L14 cmpl $2 * SIZE, INCY jne .L14 movl N, %eax sarl $2, %eax jle .L15 FLD 0 * SIZE(X) ALIGN_3 .L16: FLD 0 * SIZE(Y) PADDING fmul %st(1) faddp %st, %st(2) FMUL 1 * SIZE(Y) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) PADDING fmul %st(1) faddp %st, %st(4) FMUL 1 * SIZE(Y) faddp %st, %st(4) FLD 2 * SIZE(X) FLD 2 * SIZE(Y) PADDING fmul %st(1) faddp %st, %st(2) FMUL 3 * SIZE(Y) faddp %st, %st(2) FLD 3 * SIZE(X) FLD 2 * SIZE(Y) PADDING fmul %st(1) faddp %st, %st(4) FMUL 3 * SIZE(Y) faddp %st, %st(4) FLD 4 * SIZE(X) FLD 4 * SIZE(Y) PADDING fmul %st(1) faddp %st, %st(2) FMUL 5 * SIZE(Y) faddp %st, %st(2) FLD 5 * SIZE(X) FLD 4 * SIZE(Y) PADDING fmul %st(1) faddp %st, %st(4) FMUL 5 * SIZE(Y) faddp %st, %st(4) FLD 6 * SIZE(X) FLD 6 * SIZE(Y) PADDING fmul %st(1) faddp %st, %st(2) FMUL 7 * SIZE(Y) faddp %st, %st(2) FLD 7 * SIZE(X) FLD 6 * SIZE(Y) PADDING fmul %st(1) faddp %st, %st(4) FMUL 7 * SIZE(Y) faddp %st, %st(4) FLD 8 * SIZE(X) prefetch 32 * SIZE(X) addl $8 * SIZE, X addl $8 * SIZE, Y decl %eax jg .L16 ffreep %st(0) ALIGN_3 .L15: movl N, %eax andl $3, %eax jle .L27 ALIGN_3 .L22: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(2) FMUL 1 * SIZE(Y) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FMUL 1 * SIZE(Y) faddp %st, %st(4) addl $2 * SIZE, X addl $2 * SIZE, Y decl %eax jg .L22 jmp .L27 ALIGN_3 .L14: #ifdef F_INTERFACE testl INCX, INCX # if (incx < 0) jge .L28 movl N, %eax decl %eax imull INCX, %eax subl %eax, X ALIGN_3 .L28: testl INCY, INCY # if (incy < 0) jge .L29 movl N, %eax decl %eax imull INCY, %eax subl %eax, Y ALIGN_3 .L29: #endif movl N, %eax sarl $1, %eax jle .L30 ALIGN_3 .L31: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(2) FMUL 1 * SIZE(Y) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FMUL 1 * SIZE(Y) faddp %st, %st(4) addl INCX, X FLD 0 * SIZE(X) addl INCY, Y FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(2) FMUL 1 * SIZE(Y) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FMUL 1 * SIZE(Y) faddp %st, %st(4) addl INCX, X addl INCY, Y decl %eax jg .L31 ALIGN_3 
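/* Added note (descriptive only): the loops in this routine keep four partial
   sums on the x87 stack, accumulating x_re*y_re, x_re*y_im, x_im*y_re and
   x_im*y_im separately; they appear to be combined at .L27, where CONJ selects
   the conjugated variant of the dot product. An illustrative scalar sketch,
   assuming interleaved re/im storage (not part of the build):

       acc_rr += x[2*i]   * y[2*i];     acc_ri += x[2*i]   * y[2*i+1];
       acc_ir += x[2*i+1] * y[2*i];     acc_ii += x[2*i+1] * y[2*i+1];
       // without CONJ:  result_re = acc_rr - acc_ii,  result_im = acc_ri + acc_ir
       // with CONJ:     result_re = acc_rr + acc_ii,  result_im = acc_ri - acc_ir
*/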
.L30: movl N, %eax andl $1, %eax jle .L27 ALIGN_3 .L37: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(2) FMUL 1 * SIZE(Y) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FMUL 1 * SIZE(Y) faddp %st, %st(4) ALIGN_3 .L27: #ifndef CONJ fsubp %st, %st(3) faddp %st, %st(1) #else faddp %st, %st(3) fsubp %st, %st(1) #endif #if !defined(DOUBLE) && !defined(XDOUBLE) subl $2 * SIZE, %esp FST 1 * SIZE(%esp) FST 0 * SIZE(%esp) movl 0 * SIZE(%esp), %eax movl 1 * SIZE(%esp), %edx addl $2 * SIZE, %esp #else movl RESULT, %eax FST 1 * SIZE(%eax) FST 0 * SIZE(%eax) #endif popl %ebx popl %esi popl %edi #if defined(F_INTERFACE) && defined(F_PATHSCALE) ret $0x4 #else ret #endif ALIGN_3 .L88: #if !defined(DOUBLE) && !defined(XDOUBLE) xor %eax, %eax xor %edx, %edx #else movl RESULT, %eax fldz fldz FST 1 * SIZE(%eax) FST 0 * SIZE(%eax) #endif popl %ebx popl %esi popl %edi #if defined(F_INTERFACE) && defined(F_PATHSCALE) ret $0x4 #else ret #endif EPILOGUE OpenBLAS-0.2.20/kernel/x86/zdot_sse.S000066400000000000000000001773721313527062700170500ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) #define N %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx #include "l1param.h" PROLOGUE PROFCODE pushl %edi pushl %esi pushl %ebx movl STACK_N, N movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY sall $ZBASE_SHIFT, INCX sall $ZBASE_SHIFT, INCY xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 testl N, N jle .L999 cmpl $2 * SIZE, INCX jne .L200 cmpl $2 * SIZE, INCY jne .L200 subl $-32 * SIZE, X subl $-32 * SIZE, Y testl $SIZE, X jne .L50 .L0x: testl $2 * SIZE, X je .L10 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 #ifdef movsd xorps %xmm6, %xmm6 #endif movsd -32 * SIZE(Y), %xmm0 PSHUFD2($0xb1, %xmm0, %xmm1) mulps %xmm4, %xmm0 mulps %xmm4, %xmm1 addl $2 * SIZE, X addl $2 * SIZE, Y decl N ALIGN_3 .L10: testl $3 * SIZE, Y jne .L20 movl N, %eax sarl $4, %eax jle .L15 movaps -32 * SIZE(X), %xmm4 movaps -32 * SIZE(Y), %xmm6 movaps -28 * SIZE(X), %xmm5 movaps -28 * SIZE(Y), %xmm7 decl %eax jle .L12 ALIGN_3 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -24 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -24 * SIZE(X), %xmm4 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps -20 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps -20 * SIZE(X), %xmm5 addps %xmm3, %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -16 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -16 * SIZE(X), %xmm4 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps -12 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps -12 * SIZE(X), %xmm5 addps %xmm3, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -8 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -8 * SIZE(X), %xmm4 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps -4 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps -4 * SIZE(X), %xmm5 addps %xmm3, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps 0 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps 0 * SIZE(X), %xmm4 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps 4 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps 4 * SIZE(X), %xmm5 addps %xmm3, %xmm1 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L11 ALIGN_3 .L12: PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -24 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -24 * SIZE(X), %xmm4 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps -20 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps -20 * SIZE(X), %xmm5 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -16 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -16 * SIZE(X), %xmm4 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps 
-12 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps -12 * SIZE(X), %xmm5 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -8 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -8 * SIZE(X), %xmm4 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps -4 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps -4 * SIZE(X), %xmm5 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 mulps %xmm4, %xmm3 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 mulps %xmm5, %xmm3 addps %xmm3, %xmm1 subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L15: testl $8, N jle .L16 movaps -32 * SIZE(X), %xmm4 movaps -32 * SIZE(Y), %xmm6 movaps -28 * SIZE(X), %xmm5 movaps -28 * SIZE(Y), %xmm7 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -24 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -24 * SIZE(X), %xmm4 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps -20 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps -20 * SIZE(X), %xmm5 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 mulps %xmm4, %xmm3 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 mulps %xmm5, %xmm3 addps %xmm3, %xmm1 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L16: testl $4, N jle .L17 movaps -32 * SIZE(X), %xmm4 movaps -32 * SIZE(Y), %xmm6 movaps -28 * SIZE(X), %xmm5 movaps -28 * SIZE(Y), %xmm7 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 mulps %xmm4, %xmm3 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 mulps %xmm5, %xmm3 addps %xmm3, %xmm1 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L17: testl $2, N jle .L18 movaps -32 * SIZE(X), %xmm4 movaps -32 * SIZE(Y), %xmm6 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 mulps %xmm4, %xmm3 addps %xmm3, %xmm1 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L18: testl $1, N jle .L98 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 #ifdef movsd xorps %xmm6, %xmm6 #endif movsd -32 * SIZE(Y), %xmm6 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 mulps %xmm4, %xmm3 addps %xmm3, %xmm1 jmp .L98 ALIGN_3 .L20: #ifdef ALIGNED_ACCESS testl $2 * SIZE, Y jne .L30 movaps -33 * SIZE(Y), %xmm6 addl $3 * SIZE, Y shufps $0xb1, %xmm1, %xmm1 movl N, %eax sarl $4, %eax jle .L25 movaps -32 * SIZE(X), %xmm4 movaps -28 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm7 decl %eax jle .L22 ALIGN_3 .L21: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -24 * SIZE(X), %xmm4 mulps %xmm6, %xmm3 movaps -28 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -20 * SIZE(X), %xmm5 mulps %xmm7, %xmm3 movaps -24 * SIZE(Y), %xmm7 addps %xmm3, %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(X), %xmm4 mulps %xmm6, %xmm3 movaps -20 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(X), %xmm5 mulps %xmm7, %xmm3 movaps -16 * SIZE(Y), %xmm7 addps %xmm3, %xmm1 #if defined(PREFETCH) && 
!defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -8 * SIZE(X), %xmm4 mulps %xmm6, %xmm3 movaps -12 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -4 * SIZE(X), %xmm5 mulps %xmm7, %xmm3 movaps -8 * SIZE(Y), %xmm7 addps %xmm3, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 mulps %xmm6, %xmm3 movaps -4 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps 4 * SIZE(X), %xmm5 mulps %xmm7, %xmm3 movaps 0 * SIZE(Y), %xmm7 addps %xmm3, %xmm1 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L22: movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -24 * SIZE(X), %xmm4 mulps %xmm6, %xmm3 movaps -28 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -20 * SIZE(X), %xmm5 mulps %xmm7, %xmm3 movaps -24 * SIZE(Y), %xmm7 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(X), %xmm4 mulps %xmm6, %xmm3 movaps -20 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(X), %xmm5 mulps %xmm7, %xmm3 movaps -16 * SIZE(Y), %xmm7 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -8 * SIZE(X), %xmm4 mulps %xmm6, %xmm3 movaps -12 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -4 * SIZE(X), %xmm5 mulps %xmm7, %xmm3 movaps -8 * SIZE(Y), %xmm7 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 movaps -4 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 mulps %xmm7, %xmm3 addps %xmm3, %xmm1 subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L25: testl $8, N jle .L26 movaps -32 * SIZE(X), %xmm4 movaps -28 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm7 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -24 * SIZE(X), %xmm4 mulps %xmm6, %xmm3 movaps -28 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -20 * SIZE(X), %xmm5 mulps %xmm7, %xmm3 movaps -24 * SIZE(Y), %xmm7 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 movaps -20 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 mulps %xmm7, %xmm3 addps %xmm3, %xmm1 addl $16 * SIZE, X addl $16 
* SIZE, Y ALIGN_3 .L26: testl $4, N jle .L27 movaps -32 * SIZE(X), %xmm4 movaps -28 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm7 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 movaps -28 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 mulps %xmm7, %xmm3 addps %xmm3, %xmm1 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L27: testl $2, N jle .L28 movaps -32 * SIZE(X), %xmm4 movaps -32 * SIZE(Y), %xmm7 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 movaps %xmm7, %xmm6 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L28: testl $1, N jle .L29 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 ALIGN_3 .L29: shufps $0xb1, %xmm1, %xmm1 jmp .L98 ALIGN_3 .L30: testl $SIZE, Y jne .L40 #endif movl N, %eax sarl $4, %eax jle .L35 movaps -32 * SIZE(X), %xmm4 movsd -32 * SIZE(Y), %xmm6 movhps -30 * SIZE(Y), %xmm6 movaps -28 * SIZE(X), %xmm5 movsd -28 * SIZE(Y), %xmm7 movhps -26 * SIZE(Y), %xmm7 decl %eax jle .L32 ALIGN_3 .L31: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movsd -24 * SIZE(Y), %xmm6 movhps -22 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -24 * SIZE(X), %xmm4 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movsd -20 * SIZE(Y), %xmm7 movhps -18 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps -20 * SIZE(X), %xmm5 addps %xmm3, %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movsd -16 * SIZE(Y), %xmm6 movhps -14 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -16 * SIZE(X), %xmm4 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movsd -12 * SIZE(Y), %xmm7 movhps -10 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps -12 * SIZE(X), %xmm5 addps %xmm3, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movsd -8 * SIZE(Y), %xmm6 movhps -6 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -8 * SIZE(X), %xmm4 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movsd -4 * SIZE(Y), %xmm7 movhps -2 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps -4 * SIZE(X), %xmm5 addps %xmm3, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movsd 0 * SIZE(Y), %xmm6 movhps 2 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps 0 * SIZE(X), %xmm4 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movsd 4 * SIZE(Y), %xmm7 movhps 6 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps 4 * SIZE(X), %xmm5 addps %xmm3, %xmm1 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L31 ALIGN_3 .L32: PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movsd -24 * SIZE(Y), %xmm6 movhps -22 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -24 * SIZE(X), %xmm4 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movsd -20 * SIZE(Y), %xmm7 movhps -18 * SIZE(Y), %xmm7 mulps 
%xmm5, %xmm3 movaps -20 * SIZE(X), %xmm5 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movsd -16 * SIZE(Y), %xmm6 movhps -14 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -16 * SIZE(X), %xmm4 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movsd -12 * SIZE(Y), %xmm7 movhps -10 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps -12 * SIZE(X), %xmm5 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movsd -8 * SIZE(Y), %xmm6 movhps -6 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -8 * SIZE(X), %xmm4 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movsd -4 * SIZE(Y), %xmm7 movhps -2 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps -4 * SIZE(X), %xmm5 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 mulps %xmm4, %xmm3 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 mulps %xmm5, %xmm3 addps %xmm3, %xmm1 subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L35: testl $8, N jle .L36 movaps -32 * SIZE(X), %xmm4 movsd -32 * SIZE(Y), %xmm6 movhps -30 * SIZE(Y), %xmm6 movaps -28 * SIZE(X), %xmm5 movsd -28 * SIZE(Y), %xmm7 movhps -26 * SIZE(Y), %xmm7 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movsd -24 * SIZE(Y), %xmm6 movhps -22 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -24 * SIZE(X), %xmm4 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movsd -20 * SIZE(Y), %xmm7 movhps -18 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps -20 * SIZE(X), %xmm5 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 mulps %xmm4, %xmm3 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 mulps %xmm5, %xmm3 addps %xmm3, %xmm1 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L36: testl $4, N jle .L37 movaps -32 * SIZE(X), %xmm4 movsd -32 * SIZE(Y), %xmm6 movhps -30 * SIZE(Y), %xmm6 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 mulps %xmm4, %xmm3 addps %xmm3, %xmm1 movaps -28 * SIZE(X), %xmm5 movsd -28 * SIZE(Y), %xmm7 movhps -26 * SIZE(Y), %xmm7 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 mulps %xmm5, %xmm3 addps %xmm3, %xmm1 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L37: testl $2, N jle .L38 movaps -32 * SIZE(X), %xmm4 movsd -32 * SIZE(Y), %xmm6 movhps -30 * SIZE(Y), %xmm6 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 mulps %xmm4, %xmm3 addps %xmm3, %xmm1 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L38: testl $1, N jle .L98 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 #ifdef movsd xorps %xmm6, %xmm6 #endif movsd -32 * SIZE(Y), %xmm6 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 mulps %xmm4, %xmm3 addps %xmm3, %xmm1 jmp .L98 ALIGN_3 #ifdef ALIGNED_ACCESS .L40: movaps -35 * SIZE(Y), %xmm6 addl $1 * SIZE, Y shufps $0xb1, %xmm1, %xmm1 movl N, %eax sarl $4, %eax jle .L45 movaps -32 * SIZE(X), %xmm4 movaps -28 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm7 decl %eax jle .L42 ALIGN_3 .L41: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps 
%xmm4, %xmm0 movaps -24 * SIZE(X), %xmm4 mulps %xmm6, %xmm3 movaps -28 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -20 * SIZE(X), %xmm5 mulps %xmm7, %xmm3 movaps -24 * SIZE(Y), %xmm7 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(X), %xmm4 mulps %xmm6, %xmm3 movaps -20 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(X), %xmm5 mulps %xmm7, %xmm3 movaps -16 * SIZE(Y), %xmm7 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -8 * SIZE(X), %xmm4 mulps %xmm6, %xmm3 movaps -12 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -4 * SIZE(X), %xmm5 mulps %xmm7, %xmm3 movaps -8 * SIZE(Y), %xmm7 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 mulps %xmm6, %xmm3 movaps -4 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps 4 * SIZE(X), %xmm5 mulps %xmm7, %xmm3 movaps 0 * SIZE(Y), %xmm7 addps %xmm3, %xmm1 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L41 ALIGN_3 .L42: movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -24 * SIZE(X), %xmm4 mulps %xmm6, %xmm3 movaps -28 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -20 * SIZE(X), %xmm5 mulps %xmm7, %xmm3 movaps -24 * SIZE(Y), %xmm7 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(X), %xmm4 mulps %xmm6, %xmm3 movaps -20 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(X), %xmm5 mulps %xmm7, %xmm3 movaps -16 * SIZE(Y), %xmm7 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -8 * SIZE(X), %xmm4 mulps %xmm6, %xmm3 movaps -12 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -4 * SIZE(X), %xmm5 mulps %xmm7, %xmm3 movaps -8 * SIZE(Y), %xmm7 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 movaps -4 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 mulps %xmm7, %xmm3 addps %xmm3, %xmm1 subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L45: testl $8, N jle .L46 movaps -32 * SIZE(X), %xmm4 movaps -28 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm7 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -24 * SIZE(X), %xmm4 mulps %xmm6, %xmm3 movaps -28 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 
movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -20 * SIZE(X), %xmm5 mulps %xmm7, %xmm3 movaps -24 * SIZE(Y), %xmm7 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 movaps -20 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 mulps %xmm7, %xmm3 addps %xmm3, %xmm1 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L46: testl $4, N jle .L47 movaps -32 * SIZE(X), %xmm4 movaps -28 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm7 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 movaps -28 * SIZE(Y), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 mulps %xmm7, %xmm3 addps %xmm3, %xmm1 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L47: testl $2, N jle .L48 movaps -32 * SIZE(X), %xmm4 movaps -32 * SIZE(Y), %xmm7 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 movaps %xmm7, %xmm6 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L48: testl $1, N jle .L49 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 movss -32 * SIZE(Y), %xmm7 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 ALIGN_3 .L49: shufps $0xb1, %xmm1, %xmm1 jmp .L98 ALIGN_3 #endif .L50: testl $SIZE, Y jne .L70 #ifdef ALIGNED_ACCESS testl $2 * SIZE, Y je .L50x #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(X), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(Y), %xmm4 PSHUFD2($0xb1, %xmm0, %xmm1) mulps %xmm4, %xmm0 mulps %xmm4, %xmm1 addl $2 * SIZE, X addl $2 * SIZE, Y decl N ALIGN_3 .L50x: testl $2 * SIZE, X jne .L60 movaps -33 * SIZE(X), %xmm6 addl $3 * SIZE, X shufps $0xb1, %xmm1, %xmm1 movl N, %eax sarl $4, %eax jle .L55 movaps -32 * SIZE(Y), %xmm4 movaps -28 * SIZE(Y), %xmm5 movaps -32 * SIZE(X), %xmm7 decl %eax jle .L52 ALIGN_3 .L51: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -24 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movaps -28 * SIZE(X), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -20 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movaps -24 * SIZE(X), %xmm7 addps %xmm3, %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movaps -20 * SIZE(X), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movaps -16 * SIZE(X), %xmm7 addps %xmm3, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -8 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movaps -12 * SIZE(X), %xmm6 addps %xmm3, %xmm1 
movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -4 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movaps -8 * SIZE(X), %xmm7 addps %xmm3, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps 0 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movaps -4 * SIZE(X), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps 4 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movaps 0 * SIZE(X), %xmm7 addps %xmm3, %xmm1 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L51 ALIGN_3 .L52: movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -24 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movaps -28 * SIZE(X), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -20 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movaps -24 * SIZE(X), %xmm7 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movaps -20 * SIZE(X), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movaps -16 * SIZE(X), %xmm7 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -8 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movaps -12 * SIZE(X), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -4 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movaps -8 * SIZE(X), %xmm7 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 movaps -4 * SIZE(X), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 mulps %xmm7, %xmm3 addps %xmm3, %xmm1 subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L55: testl $8, N jle .L56 movaps -32 * SIZE(Y), %xmm4 movaps -28 * SIZE(Y), %xmm5 movaps -32 * SIZE(X), %xmm7 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -24 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movaps -28 * SIZE(X), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -20 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movaps -24 * SIZE(X), %xmm7 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 movaps -20 * SIZE(X), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 mulps %xmm7, %xmm3 addps %xmm3, %xmm1 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L56: testl $4, N jle .L57 movaps -32 * SIZE(Y), %xmm4 movaps -28 * SIZE(Y), %xmm5 movaps -32 * SIZE(X), %xmm7 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 movaps -28 * 
SIZE(X), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x39, %xmm7, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 mulps %xmm7, %xmm3 addps %xmm3, %xmm1 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L57: testl $2, N jle .L58 movaps -32 * SIZE(Y), %xmm4 movaps -32 * SIZE(X), %xmm7 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 movaps %xmm7, %xmm6 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L58: testl $1, N jle .L98 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(Y), %xmm4 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x39, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 jmp .L98 ALIGN_3 .L60: movaps -35 * SIZE(X), %xmm6 addl $1 * SIZE, X shufps $0xb1, %xmm1, %xmm1 movl N, %eax sarl $4, %eax jle .L65 movaps -32 * SIZE(Y), %xmm4 movaps -28 * SIZE(Y), %xmm5 movaps -32 * SIZE(X), %xmm7 decl %eax jle .L62 ALIGN_3 .L61: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -24 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movaps -28 * SIZE(X), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -20 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movaps -24 * SIZE(X), %xmm7 addps %xmm3, %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movaps -20 * SIZE(X), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movaps -16 * SIZE(X), %xmm7 addps %xmm3, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -8 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movaps -12 * SIZE(X), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -4 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movaps -8 * SIZE(X), %xmm7 addps %xmm3, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps 0 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movaps -4 * SIZE(X), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps 4 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movaps 0 * SIZE(X), %xmm7 addps %xmm3, %xmm1 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L61 ALIGN_3 .L62: movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -24 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movaps -28 * SIZE(X), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -20 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movaps -24 * SIZE(X), %xmm7 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps 
%xmm4, %xmm0 movaps -16 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movaps -20 * SIZE(X), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movaps -16 * SIZE(X), %xmm7 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -8 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movaps -12 * SIZE(X), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -4 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movaps -8 * SIZE(X), %xmm7 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 movaps -4 * SIZE(X), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 mulps %xmm7, %xmm3 addps %xmm3, %xmm1 subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L65: testl $8, N jle .L66 movaps -32 * SIZE(Y), %xmm4 movaps -28 * SIZE(Y), %xmm5 movaps -32 * SIZE(X), %xmm7 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -24 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movaps -28 * SIZE(X), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -20 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movaps -24 * SIZE(X), %xmm7 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 movaps -20 * SIZE(X), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 mulps %xmm7, %xmm3 addps %xmm3, %xmm1 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L66: testl $4, N jle .L67 movaps -32 * SIZE(Y), %xmm4 movaps -28 * SIZE(Y), %xmm5 movaps -32 * SIZE(X), %xmm7 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 movaps -28 * SIZE(X), %xmm6 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) shufps $0x93, %xmm6, %xmm7 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 mulps %xmm7, %xmm3 addps %xmm3, %xmm1 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L67: testl $2, N jle .L68 movaps -32 * SIZE(Y), %xmm4 movaps -32 * SIZE(X), %xmm7 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm7, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 movaps %xmm7, %xmm6 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L68: testl $1, N jle .L98 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(Y), %xmm4 movss -32 * SIZE(X), %xmm7 movss %xmm7, %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) shufps $0x93, %xmm6, %xmm6 mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 jmp .L98 ALIGN_3 #else testl $2 * SIZE, Y je .L50x #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(Y), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 PSHUFD2($0xb1, %xmm0, %xmm1) mulps %xmm4, %xmm0 mulps %xmm4, %xmm1 addl $2 * SIZE, X addl $2 * SIZE, Y decl N ALIGN_3 .L50x: movl N, %eax sarl $4, %eax jle .L55 movaps -32 * SIZE(Y), %xmm4 movlps -32 * SIZE(X), %xmm6 movhps -30 * SIZE(X), %xmm6 movaps -28 * SIZE(Y), %xmm5 movlps -28 * SIZE(X), %xmm7 movhps -26 * 
SIZE(X), %xmm7 decl %eax jle .L52 ALIGN_3 .L51: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif PSHUFD2($0xb1, %xmm4, %xmm3) mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -24 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movlps -24 * SIZE(X), %xmm6 movhps -22 * SIZE(X), %xmm6 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm5, %xmm3) mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -20 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movlps -20 * SIZE(X), %xmm7 movhps -18 * SIZE(X), %xmm7 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm4, %xmm3) mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movlps -16 * SIZE(X), %xmm6 movhps -14 * SIZE(X), %xmm6 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm5, %xmm3) mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movlps -12 * SIZE(X), %xmm7 movhps -10 * SIZE(X), %xmm7 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm4, %xmm3) mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -8 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movlps -8 * SIZE(X), %xmm6 movhps -6 * SIZE(X), %xmm6 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm5, %xmm3) mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -20 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movlps -20 * SIZE(X), %xmm7 movhps -18 * SIZE(X), %xmm7 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm4, %xmm3) mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps 0 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movlps 0 * SIZE(X), %xmm6 movhps 2 * SIZE(X), %xmm6 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm5, %xmm3) mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps 4 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movlps 4 * SIZE(X), %xmm7 movhps 6 * SIZE(X), %xmm7 addps %xmm3, %xmm1 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L51 ALIGN_3 .L52: PSHUFD2($0xb1, %xmm4, %xmm3) mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -24 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movlps -24 * SIZE(X), %xmm6 movhps -22 * SIZE(X), %xmm6 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm5, %xmm3) mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -20 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movlps -20 * SIZE(X), %xmm7 movhps -18 * SIZE(X), %xmm7 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm4, %xmm3) mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movlps -16 * SIZE(X), %xmm6 movhps -14 * SIZE(X), %xmm6 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm5, %xmm3) mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movlps -12 * SIZE(X), %xmm7 movhps -10 * SIZE(X), %xmm7 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm4, %xmm3) mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -8 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movlps -8 * SIZE(X), %xmm6 movhps -6 * SIZE(X), %xmm6 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm5, %xmm3) mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -20 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movlps -20 * SIZE(X), %xmm7 movhps -18 * SIZE(X), %xmm7 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm4, %xmm3) mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm5, %xmm3) mulps %xmm7, %xmm5 addps %xmm5, %xmm0 mulps %xmm7, %xmm3 addps %xmm3, %xmm1 subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L55: testl $8, N jle .L56 movaps -32 * SIZE(Y), %xmm4 movlps -32 * SIZE(X), %xmm6 movhps -30 * SIZE(X), %xmm6 movaps -28 * SIZE(Y), %xmm5 movlps -28 * SIZE(X), %xmm7 movhps -26 * SIZE(X), %xmm7 PSHUFD2($0xb1, %xmm4, %xmm3) 
mulps %xmm6, %xmm4 addps %xmm4, %xmm0 movaps -24 * SIZE(Y), %xmm4 mulps %xmm6, %xmm3 movlps -24 * SIZE(X), %xmm6 movhps -22 * SIZE(X), %xmm6 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm5, %xmm3) mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps -20 * SIZE(Y), %xmm5 mulps %xmm7, %xmm3 movlps -20 * SIZE(X), %xmm7 movhps -18 * SIZE(X), %xmm7 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm4, %xmm3) mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm5, %xmm3) mulps %xmm7, %xmm5 addps %xmm5, %xmm0 mulps %xmm7, %xmm3 addps %xmm3, %xmm1 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L56: testl $4, N jle .L57 movaps -32 * SIZE(Y), %xmm4 movlps -32 * SIZE(X), %xmm6 movhps -30 * SIZE(X), %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 movaps -28 * SIZE(Y), %xmm5 movlps -28 * SIZE(X), %xmm7 movhps -26 * SIZE(X), %xmm7 PSHUFD2($0xb1, %xmm5, %xmm3) mulps %xmm7, %xmm5 addps %xmm5, %xmm0 mulps %xmm7, %xmm3 addps %xmm3, %xmm1 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L57: testl $2, N jle .L58 movaps -32 * SIZE(Y), %xmm4 movlps -32 * SIZE(X), %xmm6 movhps -30 * SIZE(X), %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L58: testl $1, N jle .L98 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(Y), %xmm4 #ifdef movsd xorps %xmm6, %xmm6 #endif movsd -32 * SIZE(X), %xmm6 PSHUFD2($0xb1, %xmm4, %xmm3) mulps %xmm6, %xmm4 addps %xmm4, %xmm0 mulps %xmm6, %xmm3 addps %xmm3, %xmm1 jmp .L98 ALIGN_3 #endif .L70: testl $2 * SIZE, Y je .L70x #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 addl $2 * SIZE, X #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(Y), %xmm1 addl $2 * SIZE, Y PSHUFD2($0xb1, %xmm1, %xmm0) shufps $0xb1, %xmm4, %xmm4 mulps %xmm4, %xmm0 mulps %xmm4, %xmm1 decl N ALIGN_3 .L70x: testl $2 * SIZE, X jne .L80 movaps -33 * SIZE(X), %xmm4 addl $3 * SIZE, X movaps -33 * SIZE(Y), %xmm6 addl $3 * SIZE, Y movl N, %eax sarl $4, %eax jle .L75 movaps -32 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm7 decl %eax jle .L72 ALIGN_3 .L71: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -28 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -28 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps -24 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps -24 * SIZE(X), %xmm5 addps %xmm3, %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -20 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -20 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps -16 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps -16 * SIZE(X), %xmm5 addps %xmm3, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -12 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -12 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps -8 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps -8 * 
SIZE(X), %xmm5 addps %xmm3, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -4 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -4 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps 0 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps 0 * SIZE(X), %xmm5 addps %xmm3, %xmm1 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L71 ALIGN_3 .L72: movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -28 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -28 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps -24 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps -24 * SIZE(X), %xmm5 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -20 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -20 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps -16 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps -16 * SIZE(X), %xmm5 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -12 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -12 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps -8 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps -8 * SIZE(X), %xmm5 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -4 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -4 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 mulps %xmm5, %xmm3 addps %xmm3, %xmm1 subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L75: testl $8, N jle .L76 movaps -32 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm7 movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -28 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -28 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps -24 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movaps -24 * SIZE(X), %xmm5 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -20 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -20 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 mulps %xmm5, %xmm3 addps %xmm3, %xmm1 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L76: testl $4, N jle .L77 movaps -32 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm7 movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -28 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movaps -28 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 mulps %xmm5, %xmm3 addps %xmm3, %xmm1 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L77: testl $2, N jle .L78 movaps -32 * 
SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm7 movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 mulps %xmm4, %xmm3 addps %xmm3, %xmm1 movaps %xmm5, %xmm4 movaps %xmm7, %xmm6 ALIGN_3 .L78: testl $1, N jle .L79 xorps %xmm5, %xmm5 movss %xmm5, %xmm4 movss %xmm5, %xmm6 shufps $0x24, %xmm4, %xmm4 PSHUFD2($0x18, %xmm6, %xmm3) shufps $0x24, %xmm6, %xmm6 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 mulps %xmm4, %xmm3 addps %xmm3, %xmm1 ALIGN_3 .L79: shufps $0x39, %xmm0, %xmm0 shufps $0x39, %xmm1, %xmm1 jmp .L98 ALIGN_3 .L80: movsd -33 * SIZE(X), %xmm4 movhps -31 * SIZE(X), %xmm4 addl $3 * SIZE, X movaps -33 * SIZE(Y), %xmm6 addl $3 * SIZE, Y movl N, %eax sarl $4, %eax jle .L85 movsd -32 * SIZE(X), %xmm5 movhps -30 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm7 decl %eax jle .L82 ALIGN_3 .L81: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -28 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movsd -28 * SIZE(X), %xmm4 movhps -26 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps -24 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movsd -24 * SIZE(X), %xmm5 movhps -22 * SIZE(X), %xmm5 addps %xmm3, %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -20 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movsd -20 * SIZE(X), %xmm4 movhps -18 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps -16 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movsd -16 * SIZE(X), %xmm5 movhps -14 * SIZE(X), %xmm5 addps %xmm3, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -12 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movsd -12 * SIZE(X), %xmm4 movhps -10 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps -8 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movsd -8 * SIZE(X), %xmm5 movhps -6 * SIZE(X), %xmm5 addps %xmm3, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -4 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movsd -4 * SIZE(X), %xmm4 movhps -2 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps 0 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movsd 0 * SIZE(X), %xmm5 movhps 2 * SIZE(X), %xmm5 addps %xmm3, %xmm1 subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L81 ALIGN_3 .L82: movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -28 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movsd -28 * SIZE(X), %xmm4 movhps -26 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps -24 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movsd -24 * SIZE(X), %xmm5 movhps -22 * SIZE(X), %xmm5 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps 
%xmm4, %xmm6 addps %xmm6, %xmm0 movaps -20 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movsd -20 * SIZE(X), %xmm4 movhps -18 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps -16 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movsd -16 * SIZE(X), %xmm5 movhps -14 * SIZE(X), %xmm5 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -12 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movsd -12 * SIZE(X), %xmm4 movhps -10 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps -8 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movsd -8 * SIZE(X), %xmm5 movhps -6 * SIZE(X), %xmm5 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -4 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movsd -4 * SIZE(X), %xmm4 movhps -2 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 mulps %xmm5, %xmm3 addps %xmm3, %xmm1 subl $-32 * SIZE, X subl $-32 * SIZE, Y ALIGN_3 .L85: testl $8, N jle .L86 movsd -32 * SIZE(X), %xmm5 movhps -30 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm7 movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -28 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movsd -28 * SIZE(X), %xmm4 movhps -26 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movaps -24 * SIZE(Y), %xmm7 mulps %xmm5, %xmm3 movsd -24 * SIZE(X), %xmm5 movhps -22 * SIZE(X), %xmm5 addps %xmm3, %xmm1 movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -20 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movsd -20 * SIZE(X), %xmm4 movhps -18 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 mulps %xmm5, %xmm3 addps %xmm3, %xmm1 addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L86: testl $4, N jle .L87 movsd -32 * SIZE(X), %xmm5 movhps -30 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm7 movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movaps -28 * SIZE(Y), %xmm6 mulps %xmm4, %xmm3 movsd -28 * SIZE(X), %xmm4 movhps -26 * SIZE(X), %xmm4 addps %xmm3, %xmm1 movss %xmm6, %xmm7 PSHUFD2($0x1b, %xmm7, %xmm3) movss %xmm4, %xmm5 mulps %xmm5, %xmm7 addps %xmm7, %xmm0 mulps %xmm5, %xmm3 addps %xmm3, %xmm1 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L87: testl $2, N jle .L88 movsd -32 * SIZE(X), %xmm5 movhps -30 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm7 movss %xmm7, %xmm6 PSHUFD2($0x1b, %xmm6, %xmm3) movss %xmm5, %xmm4 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 mulps %xmm4, %xmm3 addps %xmm3, %xmm1 movaps %xmm5, %xmm4 movaps %xmm7, %xmm6 ALIGN_3 .L88: testl $1, N jle .L89 xorps %xmm5, %xmm5 movss %xmm5, %xmm4 movss %xmm5, %xmm6 shufps $0x24, %xmm4, %xmm4 PSHUFD2($0x18, %xmm6, %xmm3) shufps $0x24, %xmm6, %xmm6 mulps %xmm4, %xmm6 addps %xmm6, %xmm0 mulps %xmm4, %xmm3 addps %xmm3, %xmm1 ALIGN_3 .L89: shufps $0x39, %xmm0, %xmm0 shufps $0x39, %xmm1, %xmm1 jmp .L98 ALIGN_3 .L200: movl N, %eax sarl $4, %eax jle .L205 movsd (X), %xmm4 addl INCX, X movhps (X), %xmm4 addl INCX, X movsd (Y), %xmm6 addl INCY, Y movhps (Y), %xmm6 addl INCY, Y movsd (X), %xmm5 
addl INCX, X movhps (X), %xmm5 addl INCX, X movsd (Y), %xmm7 addl INCY, Y movhps (Y), %xmm7 addl INCY, Y decl %eax jle .L204 ALIGN_3 .L203: PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movsd (Y), %xmm6 addl INCY, Y movhps (Y), %xmm6 addl INCY, Y mulps %xmm4, %xmm3 movsd (X), %xmm4 addl INCX, X movhps (X), %xmm4 addl INCX, X addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movsd (Y), %xmm7 addl INCY, Y movhps (Y), %xmm7 addl INCY, Y mulps %xmm5, %xmm3 movsd (X), %xmm5 addl INCX, X movhps (X), %xmm5 addl INCX, X addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movsd (Y), %xmm6 addl INCY, Y movhps (Y), %xmm6 addl INCY, Y mulps %xmm4, %xmm3 movsd (X), %xmm4 addl INCX, X movhps (X), %xmm4 addl INCX, X addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movsd (Y), %xmm7 addl INCY, Y movhps (Y), %xmm7 addl INCY, Y mulps %xmm5, %xmm3 movsd (X), %xmm5 addl INCX, X movhps (X), %xmm5 addl INCX, X addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movsd (Y), %xmm6 addl INCY, Y movhps (Y), %xmm6 addl INCY, Y mulps %xmm4, %xmm3 movsd (X), %xmm4 addl INCX, X movhps (X), %xmm4 addl INCX, X addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movsd (Y), %xmm7 addl INCY, Y movhps (Y), %xmm7 addl INCY, Y mulps %xmm5, %xmm3 movsd (X), %xmm5 addl INCX, X movhps (X), %xmm5 addl INCX, X addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movsd (Y), %xmm6 addl INCY, Y movhps (Y), %xmm6 addl INCY, Y mulps %xmm4, %xmm3 movsd (X), %xmm4 addl INCX, X movhps (X), %xmm4 addl INCX, X addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movsd (Y), %xmm7 addl INCY, Y movhps (Y), %xmm7 addl INCY, Y mulps %xmm5, %xmm3 movsd (X), %xmm5 addl INCX, X movhps (X), %xmm5 addl INCX, X addps %xmm3, %xmm1 decl %eax jg .L203 ALIGN_3 .L204: PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movsd (Y), %xmm6 addl INCY, Y movhps (Y), %xmm6 addl INCY, Y mulps %xmm4, %xmm3 movsd (X), %xmm4 addl INCX, X movhps (X), %xmm4 addl INCX, X addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movsd (Y), %xmm7 addl INCY, Y movhps (Y), %xmm7 addl INCY, Y mulps %xmm5, %xmm3 movsd (X), %xmm5 addl INCX, X movhps (X), %xmm5 addl INCX, X addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movsd (Y), %xmm6 addl INCY, Y movhps (Y), %xmm6 addl INCY, Y mulps %xmm4, %xmm3 movsd (X), %xmm4 addl INCX, X movhps (X), %xmm4 addl INCX, X addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movsd (Y), %xmm7 addl INCY, Y movhps (Y), %xmm7 addl INCY, Y mulps %xmm5, %xmm3 movsd (X), %xmm5 addl INCX, X movhps (X), %xmm5 addl INCX, X addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movsd (Y), %xmm6 addl INCY, Y movhps (Y), %xmm6 addl INCY, Y mulps %xmm4, %xmm3 movsd (X), %xmm4 addl INCX, X movhps (X), %xmm4 addl INCX, X addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movsd (Y), %xmm7 addl INCY, Y movhps (Y), %xmm7 addl INCY, Y mulps %xmm5, %xmm3 movsd (X), %xmm5 addl INCX, X movhps (X), %xmm5 addl INCX, X addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 mulps %xmm4, %xmm3 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 mulps %xmm5, %xmm3 addps 
%xmm3, %xmm1 ALIGN_3 .L205: testl $8, N jle .L206 movsd (X), %xmm4 addl INCX, X movhps (X), %xmm4 addl INCX, X movsd (Y), %xmm6 addl INCY, Y movhps (Y), %xmm6 addl INCY, Y movsd (X), %xmm5 addl INCX, X movhps (X), %xmm5 addl INCX, X movsd (Y), %xmm7 addl INCY, Y movhps (Y), %xmm7 addl INCY, Y PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 movsd (Y), %xmm6 addl INCY, Y movhps (Y), %xmm6 addl INCY, Y mulps %xmm4, %xmm3 movsd (X), %xmm4 addl INCX, X movhps (X), %xmm4 addl INCX, X addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 movsd (Y), %xmm7 addl INCY, Y movhps (Y), %xmm7 addl INCY, Y mulps %xmm5, %xmm3 movsd (X), %xmm5 addl INCX, X movhps (X), %xmm5 addl INCX, X addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 mulps %xmm4, %xmm3 addps %xmm3, %xmm1 PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 mulps %xmm5, %xmm3 addps %xmm3, %xmm1 ALIGN_3 .L206: testl $4, N jle .L207 movsd (X), %xmm4 addl INCX, X movhps (X), %xmm4 addl INCX, X movsd (Y), %xmm6 addl INCY, Y movhps (Y), %xmm6 addl INCY, Y PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 mulps %xmm4, %xmm3 addps %xmm3, %xmm1 movsd (X), %xmm5 addl INCX, X movhps (X), %xmm5 addl INCX, X movsd (Y), %xmm7 addl INCY, Y movhps (Y), %xmm7 addl INCY, Y PSHUFD2($0xb1, %xmm7, %xmm3) mulps %xmm5, %xmm7 addps %xmm7, %xmm0 mulps %xmm5, %xmm3 addps %xmm3, %xmm1 ALIGN_3 .L207: testl $2, N jle .L208 movsd (X), %xmm4 addl INCX, X movhps (X), %xmm4 addl INCX, X movsd (Y), %xmm6 addl INCY, Y movhps (Y), %xmm6 addl INCY, Y PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 mulps %xmm4, %xmm3 addps %xmm3, %xmm1 ALIGN_3 .L208: testl $1, N jle .L98 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd (X), %xmm4 #ifdef movsd xorps %xmm6, %xmm6 #endif movsd (Y), %xmm6 PSHUFD2($0xb1, %xmm6, %xmm3) mulps %xmm4, %xmm6 addps %xmm6, %xmm0 mulps %xmm4, %xmm3 addps %xmm3, %xmm1 ALIGN_3 .L98: movhlps %xmm0, %xmm2 movhlps %xmm1, %xmm3 addps %xmm2, %xmm0 addps %xmm3, %xmm1 PSHUFD2($1, %xmm0, %xmm2) PSHUFD2($1, %xmm1, %xmm3) #ifndef CONJ subss %xmm2, %xmm0 addss %xmm3, %xmm1 #else addss %xmm2, %xmm0 subss %xmm3, %xmm1 #endif ALIGN_4 .L999: subl $2 * SIZE, %esp movss %xmm0, 0 * SIZE(%esp) movss %xmm1, 1 * SIZE(%esp) movl 0 * SIZE(%esp), %eax movl 1 * SIZE(%esp), %edx addl $2 * SIZE, %esp popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zdot_sse2.S000066400000000000000000000746751313527062700171340ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define RESULT 4 + STACK + ARGS(%esp) #define STACK_N 8 + STACK + ARGS(%esp) #define STACK_X 12 + STACK + ARGS(%esp) #define STACK_INCX 16 + STACK + ARGS(%esp) #define STACK_Y 20 + STACK + ARGS(%esp) #define STACK_INCY 24 + STACK + ARGS(%esp) #define N %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx #include "l1param.h" #undef movsd #ifndef OPTERON #define MOVLPS movsd #else #define MOVLPS movlps #endif PROLOGUE PROFCODE pushl %edi pushl %esi pushl %ebx movl STACK_N, N movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY sall $ZBASE_SHIFT, INCX sall $ZBASE_SHIFT, INCY xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 cmpl $0, N jle .L999 cmpl $2 * SIZE, INCX jne .L50 cmpl $2 * SIZE, INCY jne .L50 subl $-16 * SIZE, X subl $-16 * SIZE, Y testl $SIZE, Y jne .L30 testl $SIZE, X jne .L20 movl N, %eax sarl $3, %eax jle .L15 movaps -16 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm6 movaps -14 * SIZE(X), %xmm5 movaps -14 * SIZE(Y), %xmm7 decl %eax jle .L12 ALIGN_3 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -12 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 movaps -12 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -10 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 movaps -10 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -8 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 movaps -8 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -6 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 movaps -6 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -4 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 movaps -4 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -2 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 movaps -2 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps 0 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 movaps 0 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps 2 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 movaps 2 * SIZE(X), %xmm5 
addpd %xmm3, %xmm1 subl $-16 * SIZE, X subl $-16 * SIZE, Y decl %eax jg .L11 ALIGN_3 .L12: pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -12 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 movaps -12 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -10 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 movaps -10 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -8 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 movaps -8 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -6 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 movaps -6 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -4 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 movaps -4 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -2 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 movaps -2 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 addpd %xmm3, %xmm1 subl $-16 * SIZE, X subl $-16 * SIZE, Y ALIGN_3 .L15: testl $4, N jle .L16 movaps -16 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm6 movaps -14 * SIZE(X), %xmm5 movaps -14 * SIZE(Y), %xmm7 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -12 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 movaps -12 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -10 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 movaps -10 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 addpd %xmm3, %xmm1 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L16: testl $2, N jle .L17 movaps -16 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm6 movaps -14 * SIZE(X), %xmm5 movaps -14 * SIZE(Y), %xmm7 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 addpd %xmm3, %xmm1 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L17: testl $1, N jle .L98 movaps -16 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm6 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 jmp .L98 ALIGN_3 .L20: movl N, %eax sarl $3, %eax jle .L25 MOVLPS -16 * SIZE(X), %xmm4 movhps -15 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm6 MOVLPS -14 * SIZE(X), %xmm5 movhps -13 * SIZE(X), %xmm5 movaps -14 * SIZE(Y), %xmm7 decl %eax jle .L22 ALIGN_3 .L21: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -12 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 MOVLPS -12 * SIZE(X), %xmm4 movhps -11 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -10 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 MOVLPS -10 * SIZE(X), %xmm5 movhps -9 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -8 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 MOVLPS -8 * SIZE(X), %xmm4 movhps -7 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd 
%xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -6 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 MOVLPS -6 * SIZE(X), %xmm5 movhps -5 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -4 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 MOVLPS -4 * SIZE(X), %xmm4 movhps -3 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -2 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 MOVLPS -2 * SIZE(X), %xmm5 movhps -1 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps 0 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps 2 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 MOVLPS 2 * SIZE(X), %xmm5 movhps 3 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 subl $-16 * SIZE, X subl $-16 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L22: pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -12 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 MOVLPS -12 * SIZE(X), %xmm4 movhps -11 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -10 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 MOVLPS -10 * SIZE(X), %xmm5 movhps -9 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -8 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 MOVLPS -8 * SIZE(X), %xmm4 movhps -7 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -6 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 MOVLPS -6 * SIZE(X), %xmm5 movhps -5 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -4 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 MOVLPS -4 * SIZE(X), %xmm4 movhps -3 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -2 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 MOVLPS -2 * SIZE(X), %xmm5 movhps -1 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 addpd %xmm3, %xmm1 subl $-16 * SIZE, X subl $-16 * SIZE, Y ALIGN_3 .L25: testl $4, N jle .L26 MOVLPS -16 * SIZE(X), %xmm4 movhps -15 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm6 MOVLPS -14 * SIZE(X), %xmm5 movhps -13 * SIZE(X), %xmm5 movaps -14 * SIZE(Y), %xmm7 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -12 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 MOVLPS -12 * SIZE(X), %xmm4 movhps -11 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -10 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 MOVLPS -10 * SIZE(X), %xmm5 movhps -9 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 addpd %xmm3, %xmm1 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L26: testl $2, N jle .L27 MOVLPS -16 * SIZE(X), %xmm4 movhps -15 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm6 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 MOVLPS -14 * 
SIZE(X), %xmm5 movhps -13 * SIZE(X), %xmm5 movaps -14 * SIZE(Y), %xmm7 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 addpd %xmm3, %xmm1 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L27: testl $1, N jle .L98 MOVLPS -16 * SIZE(X), %xmm4 movhps -15 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm6 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 jmp .L98 ALIGN_3 .L30: testl $SIZE, X jne .L40 movl N, %eax sarl $3, %eax jle .L35 MOVLPS -16 * SIZE(Y), %xmm4 movhps -15 * SIZE(Y), %xmm4 movaps -16 * SIZE(X), %xmm6 MOVLPS -14 * SIZE(Y), %xmm5 movhps -13 * SIZE(Y), %xmm5 movaps -14 * SIZE(X), %xmm7 decl %eax jle .L32 ALIGN_3 .L31: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -12 * SIZE(X), %xmm6 mulpd %xmm4, %xmm3 MOVLPS -12 * SIZE(Y), %xmm4 movhps -11 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -10 * SIZE(X), %xmm7 mulpd %xmm5, %xmm3 MOVLPS -10 * SIZE(Y), %xmm5 movhps -9 * SIZE(Y), %xmm5 addpd %xmm3, %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -8 * SIZE(X), %xmm6 mulpd %xmm4, %xmm3 MOVLPS -8 * SIZE(Y), %xmm4 movhps -7 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -6 * SIZE(X), %xmm7 mulpd %xmm5, %xmm3 MOVLPS -6 * SIZE(Y), %xmm5 movhps -5 * SIZE(Y), %xmm5 addpd %xmm3, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -4 * SIZE(X), %xmm6 mulpd %xmm4, %xmm3 MOVLPS -4 * SIZE(Y), %xmm4 movhps -3 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -2 * SIZE(X), %xmm7 mulpd %xmm5, %xmm3 MOVLPS -2 * SIZE(Y), %xmm5 movhps -1 * SIZE(Y), %xmm5 addpd %xmm3, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps 0 * SIZE(X), %xmm6 mulpd %xmm4, %xmm3 MOVLPS 0 * SIZE(Y), %xmm4 movhps 1 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps 2 * SIZE(X), %xmm7 mulpd %xmm5, %xmm3 MOVLPS 2 * SIZE(Y), %xmm5 movhps 3 * SIZE(Y), %xmm5 addpd %xmm3, %xmm1 subl $-16 * SIZE, X subl $-16 * SIZE, Y decl %eax jg .L31 ALIGN_3 .L32: pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -12 * SIZE(X), %xmm6 mulpd %xmm4, %xmm3 MOVLPS -12 * SIZE(Y), %xmm4 movhps -11 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -10 * SIZE(X), %xmm7 mulpd %xmm5, %xmm3 MOVLPS -10 * SIZE(Y), %xmm5 movhps -9 * SIZE(Y), %xmm5 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -8 * SIZE(X), %xmm6 mulpd %xmm4, %xmm3 MOVLPS -8 * SIZE(Y), %xmm4 movhps -7 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -6 * SIZE(X), %xmm7 mulpd %xmm5, %xmm3 MOVLPS -6 * SIZE(Y), %xmm5 movhps -5 * SIZE(Y), %xmm5 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -4 * SIZE(X), %xmm6 mulpd %xmm4, %xmm3 MOVLPS -4 * SIZE(Y), %xmm4 movhps -3 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 
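/* This branch is entered from .L30 when Y does not start on a 16-byte
   boundary while X does, so Y appears to be gathered with MOVLPS/movhps
   element pairs and only X uses aligned movaps loads; the accumulation
   itself follows the same two-accumulator cross-product pattern as the
   fully aligned loop. */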
addpd %xmm7, %xmm0 movaps -2 * SIZE(X), %xmm7 mulpd %xmm5, %xmm3 MOVLPS -2 * SIZE(Y), %xmm5 movhps -1 * SIZE(Y), %xmm5 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 addpd %xmm3, %xmm1 subl $-16 * SIZE, X subl $-16 * SIZE, Y ALIGN_3 .L35: testl $4, N jle .L36 MOVLPS -16 * SIZE(Y), %xmm4 movhps -15 * SIZE(Y), %xmm4 movaps -16 * SIZE(X), %xmm6 MOVLPS -14 * SIZE(Y), %xmm5 movhps -13 * SIZE(Y), %xmm5 movaps -14 * SIZE(X), %xmm7 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -12 * SIZE(X), %xmm6 mulpd %xmm4, %xmm3 MOVLPS -12 * SIZE(Y), %xmm4 movhps -11 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -10 * SIZE(X), %xmm7 mulpd %xmm5, %xmm3 MOVLPS -10 * SIZE(Y), %xmm5 movhps -9 * SIZE(Y), %xmm5 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 addpd %xmm3, %xmm1 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L36: testl $2, N jle .L37 MOVLPS -16 * SIZE(Y), %xmm4 movhps -15 * SIZE(Y), %xmm4 movaps -16 * SIZE(X), %xmm6 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 MOVLPS -14 * SIZE(Y), %xmm5 movhps -13 * SIZE(Y), %xmm5 movaps -14 * SIZE(X), %xmm7 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 addpd %xmm3, %xmm1 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L37: SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm3, %xmm3 testl $1, N jle .L98 MOVLPS -16 * SIZE(Y), %xmm4 movhps -15 * SIZE(Y), %xmm4 movaps -16 * SIZE(X), %xmm6 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 SHUFPD_1 %xmm3, %xmm3 addpd %xmm3, %xmm1 jmp .L98 ALIGN_3 .L40: movhps -16 * SIZE(X), %xmm4 addl $SIZE, X movhps -16 * SIZE(Y), %xmm6 addl $SIZE, Y movl N, %eax sarl $3, %eax jle .L45 movaps -16 * SIZE(X), %xmm5 movaps -16 * SIZE(Y), %xmm7 decl %eax jle .L42 ALIGN_3 .L41: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movsd %xmm7, %xmm6 pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -14 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 movaps -14 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -12 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 movaps -12 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 movsd %xmm7, %xmm6 pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -10 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 movaps -10 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -8 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 movaps -8 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 movsd %xmm7, %xmm6 pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -6 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 movaps -6 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 
addpd %xmm7, %xmm0 movaps -4 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 movaps -4 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 movsd %xmm7, %xmm6 pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -2 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 movaps -2 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps 0 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 movaps 0 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 subl $-16 * SIZE, X subl $-16 * SIZE, Y decl %eax jg .L41 ALIGN_3 .L42: movsd %xmm7, %xmm6 pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -14 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 movaps -14 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -12 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 movaps -12 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 movsd %xmm7, %xmm6 pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -10 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 movaps -10 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -8 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 movaps -8 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 movsd %xmm7, %xmm6 pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -6 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 movaps -6 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -4 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 movaps -4 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 movsd %xmm7, %xmm6 pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -2 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 movaps -2 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 addpd %xmm3, %xmm1 subl $-16 * SIZE, X subl $-16 * SIZE, Y ALIGN_3 .L45: testl $4, N jle .L46 movaps -16 * SIZE(X), %xmm5 movaps -16 * SIZE(Y), %xmm7 movsd %xmm7, %xmm6 pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -14 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 movaps -14 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -12 * SIZE(Y), %xmm7 mulpd %xmm5, %xmm3 movaps -12 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 movsd %xmm7, %xmm6 pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -10 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 movaps -10 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 addpd %xmm3, %xmm1 addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L46: testl $2, N jle .L47 movaps -16 * SIZE(X), %xmm5 movaps -16 * SIZE(Y), %xmm7 movsd %xmm7, %xmm6 pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -14 * SIZE(Y), %xmm6 mulpd %xmm4, %xmm3 movaps -14 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 addpd %xmm3, %xmm1 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L47: testl $1, N jle .L48 movlpd -16 * SIZE(X), %xmm4 movlpd -16 * SIZE(Y), %xmm6 pshufd $0x4e, 
%xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 ALIGN_3 .L48: SHUFPD_1 %xmm0, %xmm0 SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm2, %xmm2 SHUFPD_1 %xmm3, %xmm3 jmp .L98 ALIGN_3 .L50: movl N, %eax sarl $3, %eax jle .L55 MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y decl %eax jle .L54 ALIGN_3 .L53: pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y mulpd %xmm4, %xmm3 MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y mulpd %xmm5, %xmm3 MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X addpd %xmm3, %xmm1 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y mulpd %xmm4, %xmm3 MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y mulpd %xmm5, %xmm3 MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X addpd %xmm3, %xmm1 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y mulpd %xmm4, %xmm3 MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y mulpd %xmm5, %xmm3 MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X addpd %xmm3, %xmm1 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y mulpd %xmm4, %xmm3 MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y mulpd %xmm5, %xmm3 MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X addpd %xmm3, %xmm1 decl %eax jg .L53 ALIGN_3 .L54: pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y mulpd %xmm4, %xmm3 MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y mulpd %xmm5, %xmm3 MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X addpd %xmm3, %xmm1 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y mulpd %xmm4, %xmm3 MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y mulpd %xmm5, %xmm3 MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X addpd %xmm3, %xmm1 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y mulpd %xmm4, %xmm3 MOVLPS 0 * SIZE(X), %xmm4 movhps 1 
* SIZE(X), %xmm4 addl INCX, X addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y mulpd %xmm5, %xmm3 MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X addpd %xmm3, %xmm1 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 addpd %xmm3, %xmm1 ALIGN_3 .L55: testl $4, N jle .L56 MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y mulpd %xmm4, %xmm3 MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y mulpd %xmm5, %xmm3 MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X addpd %xmm3, %xmm1 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 addpd %xmm3, %xmm1 ALIGN_3 .L56: testl $2, N jle .L57 MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addl INCX, X MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 addl INCY, Y pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addl INCX, X MOVLPS 0 * SIZE(Y), %xmm7 movhps 1 * SIZE(Y), %xmm7 addl INCY, Y pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 addpd %xmm3, %xmm1 ALIGN_3 .L57: testl $1, N jle .L98 MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 ALIGN_3 .L98: pshufd $0x4e, %xmm0, %xmm2 pshufd $0x4e, %xmm1, %xmm3 #ifndef CONJ subsd %xmm2, %xmm0 addsd %xmm3, %xmm1 #else addsd %xmm2, %xmm0 subsd %xmm3, %xmm1 #endif .L999: movl RESULT, %eax MOVLPS %xmm0, 0 * SIZE(%eax) MOVLPS %xmm1, 1 * SIZE(%eax) popl %ebx popl %esi popl %edi #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) #ifdef MS_ABI /* For MingW GCC >= 4.7. It is compatible with MSVC ABI. http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36834 */ ret #else /* remove the hidden return value address from the stack. For MingW GCC < 4.7 */ ret $0x4 #endif #else /*remove the hidden return value address from the stack on Linux.*/ ret $0x4 #endif EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm3m_kernel_1x4_athlon.S000066400000000000000000000403721313527062700221630ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define J 0 + STACK(%esp) #define I 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #ifdef DOUBLE #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 24 + STACK + ARGS(%esp) #define STACK_A 32 + STACK + ARGS(%esp) #define STACK_B 36 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define STACK_LDC 44 + STACK + ARGS(%esp) #else #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 20 + STACK + ARGS(%esp) #define STACK_A 24 + STACK + ARGS(%esp) #define STACK_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define STACK_LDC 36 + STACK + ARGS(%esp) #endif #define A %edx #define B %ecx #define B_ORIG %ebx #define LDC %ebp #define PREFETCHSIZE (5 + 8 * 10) /* A hint of scheduling is received from following URL http://www.netlib.org/atlas/atlas-comm/msg00260.html Julian's code is still faster than mine, since Athlon has big defect ... So this is a sample coding and please don't expect too much. 
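   For orientation, the per-tile update this 1x4 GEMM3M micro-kernel appears
   to perform is sketched below in plain C (the sketch lives inside this
   comment so the assembly still builds, and the function and argument names
   are illustrative only, not part of OpenBLAS).  The kernel multiplies
   packed real panels of A (1 x k) and B (k x 4) and adds each real
   accumulator into the interleaved complex C, scaled separately by ALPHA_R
   for the real parts and ALPHA_I for the imaginary parts:

       static void zgemm3m_ref_1x4(long k, double alpha_r, double alpha_i,
                                   const double *a,    // packed 1 x k real panel of A
                                   const double *b,    // packed k x 4 real panel of B
                                   double *c, long ldc) // complex C, interleaved re/im;
                                                        // ldc counted in complex elements
       {
           double t[4] = {0.0, 0.0, 0.0, 0.0};
           for (long l = 0; l < k; l++)
               for (int j = 0; j < 4; j++)
                   t[j] += a[l] * b[4 * l + j];
           for (int j = 0; j < 4; j++) {
               c[2 * j * ldc + 0] += alpha_r * t[j];   // real part of C(0, j)
               c[2 * j * ldc + 1] += alpha_i * t[j];   // imaginary part of C(0, j)
           }
       }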
*/ PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(TRMMKERNEL) && !defined(LEFT) movl OFFSET, %eax negl %eax movl %eax, KK #endif movl STACK_B, B_ORIG movl STACK_LDC, LDC sall $ZBASE_SHIFT, LDC subl $-16 * SIZE, B_ORIG subl $-16 * SIZE, STACK_A movl M, %eax testl %eax, %eax jle .L999 movl N, %eax testl %eax, %eax jle .L999 movl K, %eax testl %eax, %eax jle .L999 movl N, %eax sarl $2, %eax movl %eax, J je .L20 ALIGN_3 .L11: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl STACK_A, A movl C, %edi #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B_ORIG, B #else movl KK, %eax leal (, %eax, SIZE), %eax leal (B_ORIG, %eax, 4), B #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $4, %eax jle .L13 ALIGN_4 .L12: movl -16 * SIZE(B), %esi movl -8 * SIZE(B), %esi movl 0 * SIZE(B), %esi movl 8 * SIZE(B), %esi movl 16 * SIZE(B), %esi movl 24 * SIZE(B), %esi movl 32 * SIZE(B), %esi movl 40 * SIZE(B), %esi subl $-64 * SIZE, B decl %eax jne .L12 ALIGN_3 .L13: movl M, %esi movl %esi, I ALIGN_3 .L14: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B_ORIG, B #else movl KK, %eax leal (, %eax, SIZE), %eax leal (A, %eax, 1), A leal (B_ORIG, %eax, 4), B #endif leal (%edi, LDC, 2), %eax fldz fldz fldz fldz FLD -8 * SIZE(A) FLD -16 * SIZE(A) FLD -16 * SIZE(B) movl $32 * SIZE, %esi #ifdef HAVE_3DNOW prefetchw 1 * SIZE(%edi) prefetchw 2 * SIZE(%edi, LDC) prefetchw 1 * SIZE(%eax) prefetchw 2 * SIZE(%eax, LDC) #elif defined(HAVE_SSE) prefetcht0 1 * SIZE(%edi) prefetcht0 1 * SIZE(%edi, LDC) prefetcht0 1 * SIZE(%eax) prefetcht0 1 * SIZE(%eax, LDC) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L16 ALIGN_3 .L15: fmul %st(1), %st faddp %st, %st(3) PADDING FLD -15 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD -14 * SIZE(B) #if L1_DATA_LINESIZE == 32 #ifdef HAVE_3DNOW PADDING prefetch (PREFETCHSIZE - 4) * SIZE(A) #elif defined(HAVE_SSE) PADDING prefetcht0 (PREFETCHSIZE - 4) * SIZE(A) #endif #endif fmul %st(1), %st faddp %st, %st(5) PADDING FMUL -13 * SIZE(B) faddp %st, %st(5) FLD -15 * SIZE(A) FLD -12 * SIZE(B) fmul %st(1), %st faddp %st, %st(3) PADDING FLD -11 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD -10 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL -9 * SIZE(B) faddp %st, %st(5) FLD -14 * SIZE(A) FLD -8 * SIZE(B) fmul %st(1), %st faddp %st, %st(3) PADDING FLD -7 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD -6 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL -5 * SIZE(B) faddp %st, %st(5) FLD -13 * SIZE(A) FLD -4 * SIZE(B) fmul %st(1), %st faddp %st, %st(3) PADDING FLD -3 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD -2 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL -1 * SIZE(B) faddp %st, %st(5) FLD -12 * SIZE(A) FLD 0 * SIZE(B) fmul %st(1), %st faddp %st, %st(3) PADDING FLD 1 * SIZE(B) fmul 
%st(1), %st faddp %st, %st(4) PADDING FLD 2 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL 3 * SIZE(B) faddp %st, %st(5) FLD -11 * SIZE(A) FLD 4 * SIZE(B) fmul %st(1), %st faddp %st, %st(3) PADDING FLD 5 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD 6 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL 7 * SIZE(B) faddp %st, %st(5) FLD -10 * SIZE(A) FLD 8 * SIZE(B) fmul %st(1), %st faddp %st, %st(3) PADDING FLD 9 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD 10 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL 11 * SIZE(B) faddp %st, %st(5) FLD -9 * SIZE(A) FLD 12 * SIZE(B) fmul %st(1), %st faddp %st, %st(3) PADDING FLD 13 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) PADDING FLD 14 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) PADDING FMUL 15 * SIZE(B) faddp %st, %st(5) FLD 0 * SIZE(A) #ifdef HAVE_3DNOW PADDING prefetch PREFETCHSIZE * SIZE(A) #elif defined(HAVE_SSE) PADDING prefetcht0 PREFETCHSIZE * SIZE(A) #endif addl $8 * SIZE, A fxch %st(1) addl $32 * SIZE, B FLD -16 * SIZE(B) decl %eax jne .L15 ALIGN_4 .L16: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif and $7, %eax je .L19 ALIGN_4 .L17: fmul %st(1), %st faddp %st, %st(3) FLD -15 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) FLD -14 * SIZE(B) fmul %st(1), %st faddp %st, %st(5) FMUL -13 * SIZE(B) faddp %st, %st(5) FLD -15 * SIZE(A) FLD -12 * SIZE(B) addl $1 * SIZE,A addl $4 * SIZE,B decl %eax jne .L17 ALIGN_4 .L19: ffreep %st(0) ffreep %st(0) ffreep %st(0) leal (%edi, LDC, 2), %eax FLD ALPHA_I FLD ALPHA_R fld %st(2) fmul %st(1), %st FLD 0 * SIZE(%edi) faddp %st, %st(1) FST 0 * SIZE(%edi) fld %st(3) fmul %st(1), %st FLD 0 * SIZE(%edi, LDC) faddp %st, %st(1) FST 0 * SIZE(%edi, LDC) fld %st(4) fmul %st(1), %st FLD 0 * SIZE(%eax) faddp %st, %st(1) FST 0 * SIZE(%eax) fmul %st(5), %st FLD 0 * SIZE(%eax, LDC) faddp %st, %st(1) FST 0 * SIZE(%eax, LDC) fmul %st, %st(1) fmul %st, %st(2) fmul %st, %st(3) fmulp %st, %st(4) FLD 1 * SIZE(%edi) faddp %st, %st(1) FST 1 * SIZE(%edi) FLD 1 * SIZE(%edi, LDC) faddp %st, %st(1) FST 1 * SIZE(%edi, LDC) FLD 1 * SIZE(%eax) faddp %st, %st(1) FST 1 * SIZE(%eax) FLD 1 * SIZE(%eax, LDC) faddp %st, %st(1) FST 1 * SIZE(%eax, LDC) addl $2 * SIZE, %edi decl I jne .L14 #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leal (, LDC, 4), %eax addl %eax, C movl B, B_ORIG decl J jne .L11 ALIGN_4 .L20: movl N, %eax andl $2, %eax je .L30 ALIGN_3 .L21: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl STACK_A, A movl C, %edi #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B_ORIG, B #else movl KK, %eax leal (, %eax, SIZE), %eax leal (B_ORIG, %eax, 2), B #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $4, %eax jle .L23 ALIGN_4 .L22: movl -16 * SIZE(B), %esi movl -8 * SIZE(B), %esi movl 0 * SIZE(B), %esi movl 8 * SIZE(B), %esi subl $-32 * SIZE, B decl %eax jne .L22 ALIGN_3 .L23: movl M, %esi movl %esi, I ALIGN_3 .L24: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B_ORIG, B #else movl KK, %eax leal (, %eax, SIZE), %eax leal (A, %eax, 1), A leal (B_ORIG, %eax, 2), B #endif fldz fldz fldz fldz FLD -16 
* SIZE(A) FLD -16 * SIZE(B) prefetchw 1 * SIZE(%edi) prefetchw 1 * SIZE(%edi, LDC) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L26 ALIGN_3 .L25: fmul %st(1), %st faddp %st, %st(2) FMUL -15 * SIZE(B) faddp %st, %st(2) FLD -15 * SIZE(A) FLD -14 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) FMUL -13 * SIZE(B) faddp %st, %st(4) FLD -14 * SIZE(A) FLD -12 * SIZE(B) fmul %st(1), %st faddp %st, %st(2) FMUL -11 * SIZE(B) faddp %st, %st(2) FLD -13 * SIZE(A) FLD -10 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) FMUL -9 * SIZE(B) faddp %st, %st(4) FLD -12 * SIZE(A) FLD -8 * SIZE(B) fmul %st(1), %st faddp %st, %st(2) FMUL -7 * SIZE(B) faddp %st, %st(2) FLD -11 * SIZE(A) FLD -6 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) FMUL -5 * SIZE(B) faddp %st, %st(4) FLD -10 * SIZE(A) FLD -4 * SIZE(B) fmul %st(1), %st faddp %st, %st(2) FMUL -3 * SIZE(B) faddp %st, %st(2) FLD -9 * SIZE(A) FLD -2 * SIZE(B) fmul %st(1), %st faddp %st, %st(4) FMUL -1 * SIZE(B) faddp %st, %st(4) FLD -8 * SIZE(A) FLD 0 * SIZE(B) addl $ 8 * SIZE, A subl $-16 * SIZE, B decl %eax jne .L25 ALIGN_4 .L26: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif and $7, %eax je .L29 ALIGN_4 .L27: fmul %st(1), %st faddp %st, %st(2) FMUL -15 * SIZE(B) faddp %st, %st(2) FLD -15 * SIZE(A) FLD -14 * SIZE(B) addl $1 * SIZE,A addl $2 * SIZE,B decl %eax jne .L27 ALIGN_4 .L29: ffreep %st(0) ffreep %st(0) faddp %st, %st(2) faddp %st, %st(2) FLD ALPHA_I FLD ALPHA_R fld %st(2) fmul %st(1), %st FLD 0 * SIZE(%edi) faddp %st, %st(1) FST 0 * SIZE(%edi) fmul %st(3), %st FLD 0 * SIZE(%edi, LDC) faddp %st, %st(1) FST 0 * SIZE(%edi, LDC) fmul %st, %st(1) fmulp %st, %st(2) FLD 1 * SIZE(%edi) faddp %st, %st(1) FST 1 * SIZE(%edi) FLD 1 * SIZE(%edi, LDC) faddp %st, %st(1) FST 1 * SIZE(%edi, LDC) addl $2 * SIZE, %edi decl I jne .L24 #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax addl %eax, C movl B, B_ORIG ALIGN_4 .L30: movl N, %eax andl $1, %eax je .L999 ALIGN_3 .L31: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl STACK_A, A movl C, %edi #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B_ORIG, B #else movl KK, %eax leal (, %eax, SIZE), %eax leal (B_ORIG, %eax, 1), B #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $5, %eax jle .L33 ALIGN_4 .L32: movl -16 * SIZE(B), %esi movl -8 * SIZE(B), %esi movl 0 * SIZE(B), %esi movl 8 * SIZE(B), %esi subl $-32 * SIZE, B decl %eax jne .L32 ALIGN_3 .L33: movl M, %esi movl %esi, I ALIGN_3 .L34: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B_ORIG, B #else movl KK, %eax leal (, %eax, SIZE), %eax leal (A, %eax, 1), A leal (B_ORIG, %eax, 1), B #endif fldz fldz fldz fldz prefetchw 1 * SIZE(%edi) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax 
#endif movl %eax, KKK #endif sarl $3, %eax je .L36 ALIGN_3 .L35: FLD -16 * SIZE(A) FMUL -16 * SIZE(B) faddp %st, %st(1) FLD -15 * SIZE(A) FMUL -15 * SIZE(B) faddp %st, %st(2) FLD -14 * SIZE(A) FMUL -14 * SIZE(B) faddp %st, %st(3) FLD -13 * SIZE(A) FMUL -13 * SIZE(B) faddp %st, %st(4) FLD -12 * SIZE(A) FMUL -12 * SIZE(B) faddp %st, %st(1) FLD -11 * SIZE(A) FMUL -11 * SIZE(B) faddp %st, %st(2) FLD -10 * SIZE(A) FMUL -10 * SIZE(B) faddp %st, %st(3) FLD -9 * SIZE(A) FMUL -9 * SIZE(B) faddp %st, %st(4) addl $8 * SIZE, A addl $8 * SIZE, B decl %eax jne .L35 ALIGN_4 .L36: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif and $7, %eax je .L39 ALIGN_4 .L37: FLD -16 * SIZE(A) FMUL -16 * SIZE(B) faddp %st, %st(1) addl $1 * SIZE,A addl $1 * SIZE,B decl %eax jne .L37 ALIGN_4 .L39: faddp %st, %st(2) faddp %st, %st(2) faddp %st, %st(1) FLD ALPHA_I FLD ALPHA_R fmul %st(2), %st FLD 0 * SIZE(%edi) faddp %st, %st(1) FST 0 * SIZE(%edi) fmulp %st, %st(1) FLD 1 * SIZE(%edi) faddp %st, %st(1) FST 1 * SIZE(%edi) addl $2 * SIZE, %edi decl I jne .L34 #if defined(TRMMKERNEL) && !defined(LEFT) addl $1, KK #endif addl LDC, C movl B, B_ORIG ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm3m_kernel_2x2_atom.S000066400000000000000000000340121313527062700216270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 24 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define ARG_B 36 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define ARG_LDC 44 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #define PREFETCH prefetcht0 #define PREFETCHSIZE 84 #define AA %edx #define BB %ecx #define CO1 %esi #define LDC %ebp #define B %edi PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC #ifdef TRMMKERNEL movl OFFSET, %eax #ifndef LEFT negl %eax #endif movl %eax, KK #endif sall $ZBASE_SHIFT, LDC movl N, %eax sarl $1, %eax movl %eax, J jle .L30 ALIGN_2 .L10: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sall $BASE_SHIFT + 1, %eax leal (B, %eax), %eax movl %eax, BX movl C, CO1 # coffset = c leal (, LDC, 2), %eax addl %eax, C movl A, AA # aoffset = a movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB #endif movl BX, %eax prefetcht0 0 * SIZE(%eax) subl $-8 * SIZE, BX movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 prefetcht0 3 * SIZE(CO1) xorps %xmm5, %xmm5 prefetcht0 3 * SIZE(CO1, LDC) xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $2, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addsd %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 0 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 1 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 0 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 1 * SIZE(BB), %xmm3 addsd %xmm2, %xmm6 movsd 3 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 3 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 4 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 2 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 3 * SIZE(BB), %xmm3 addsd %xmm2, %xmm6 movsd 5 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 4 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 5 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 6 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 4 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 5 * SIZE(BB), %xmm3 addsd %xmm2, %xmm6 movsd 7 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 7 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 8 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 6 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 7 * SIZE(BB), %xmm3 addl $8 * SIZE, BB addl $8 * SIZE, AA decl %eax jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $3, %eax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addsd %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 0 * SIZE(BB), %xmm0 addsd %xmm3, %xmm7 mulsd 1 * SIZE(BB), 
%xmm1 addsd %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 0 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 mulsd 1 * SIZE(BB), %xmm3 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: movsd ALPHA_R, %xmm0 movsd ALPHA_I, %xmm1 addsd %xmm2, %xmm6 addsd %xmm3, %xmm7 movaps %xmm4, %xmm2 mulsd %xmm0, %xmm4 mulsd %xmm1, %xmm2 movaps %xmm6, %xmm3 mulsd %xmm0, %xmm6 mulsd %xmm1, %xmm3 addsd 0 * SIZE(CO1), %xmm4 addsd 1 * SIZE(CO1), %xmm2 addsd 2 * SIZE(CO1), %xmm6 addsd 3 * SIZE(CO1), %xmm3 movlps %xmm4, 0 * SIZE(CO1) movlps %xmm2, 1 * SIZE(CO1) movlps %xmm6, 2 * SIZE(CO1) movlps %xmm3, 3 * SIZE(CO1) movaps %xmm5, %xmm2 mulsd %xmm0, %xmm5 mulsd %xmm1, %xmm2 movaps %xmm7, %xmm3 mulsd %xmm0, %xmm7 mulsd %xmm1, %xmm3 addsd 0 * SIZE(CO1, LDC), %xmm5 addsd 1 * SIZE(CO1, LDC), %xmm2 addsd 2 * SIZE(CO1, LDC), %xmm7 addsd 3 * SIZE(CO1, LDC), %xmm3 movlps %xmm5, 0 * SIZE(CO1, LDC) movlps %xmm2, 1 * SIZE(CO1, LDC) movlps %xmm7, 2 * SIZE(CO1, LDC) movlps %xmm3, 3 * SIZE(CO1, LDC) addl $4 * SIZE, CO1 decl %ebx jg .L11 ALIGN_4 .L20: movl M, %ebx testl $1, %ebx jle .L29 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), BB #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $2, %eax je .L25 ALIGN_4 .L22: addsd %xmm2, %xmm4 movsd 0 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 1 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulsd %xmm0, %xmm3 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 2 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 3 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 movsd 2 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 5 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 movsd 3 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 6 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 7 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $3, %eax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: addsd %xmm2, %xmm4 movsd 0 * SIZE(BB), %xmm2 addsd %xmm3, %xmm5 movsd 1 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 movsd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: movsd ALPHA_R, %xmm0 movsd ALPHA_I, %xmm1 addsd %xmm2, %xmm4 addsd %xmm3, %xmm5 movaps %xmm4, %xmm2 mulsd %xmm0, %xmm4 mulsd %xmm1, %xmm2 movaps %xmm5, %xmm3 mulsd %xmm0, %xmm5 mulsd %xmm1, %xmm3 addsd 0 * SIZE(CO1), %xmm4 addsd 1 * SIZE(CO1), %xmm2 addsd 0 * SIZE(CO1, LDC), %xmm5 addsd 1 * SIZE(CO1, LDC), %xmm3 movlps %xmm4, 0 * SIZE(CO1) movlps %xmm2, 1 * SIZE(CO1) movlps %xmm5, 0 * SIZE(CO1, LDC) movlps %xmm3, 1 * SIZE(CO1, LDC) ALIGN_4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif movl BB, B decl J jg .L10 ALIGN_4 .L30: testl $1, N je .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, CO1 addl LDC, C movl A, AA movl M, %ebx sarl $1, %ebx jle .L40 ALIGN_4 .L31: #if 
!defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB #endif movsd 0 * SIZE(BB), %xmm1 xorps %xmm0, %xmm0 prefetcht0 3 * SIZE(CO1) xorps %xmm2, %xmm2 xorps %xmm4, %xmm4 xorps %xmm6, %xmm6 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $2, %eax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addsd %xmm0, %xmm4 movsd 0 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 1 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 3 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 2 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 4 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 5 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 3 * SIZE(BB), %xmm1 addsd %xmm0, %xmm4 movsd 6 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 7 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 4 * SIZE(BB), %xmm1 addl $8 * SIZE, AA addl $4 * SIZE, BB decl %eax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $3, %eax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: addsd %xmm0, %xmm4 movsd 0 * SIZE(AA), %xmm0 addsd %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 mulsd %xmm1, %xmm0 mulsd %xmm1, %xmm2 movsd 1 * SIZE(BB), %xmm1 addl $2 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: addsd %xmm0, %xmm4 addsd %xmm2, %xmm6 movsd ALPHA_R, %xmm0 movsd ALPHA_I, %xmm1 movaps %xmm4, %xmm2 mulsd %xmm0, %xmm4 mulsd %xmm1, %xmm2 movaps %xmm6, %xmm3 mulsd %xmm0, %xmm6 mulsd %xmm1, %xmm3 addsd 0 * SIZE(CO1), %xmm4 addsd 1 * SIZE(CO1), %xmm2 addsd 2 * SIZE(CO1), %xmm6 addsd 3 * SIZE(CO1), %xmm3 movlps %xmm4, 0 * SIZE(CO1) movlps %xmm2, 1 * SIZE(CO1) movlps %xmm6, 2 * SIZE(CO1) movlps %xmm3, 3 * SIZE(CO1) addl $4 * SIZE, CO1 decl %ebx jg .L31 ALIGN_4 .L40: movl M, %ebx testl $1, %ebx jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), BB #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movsd 0 * SIZE(BB), %xmm2 xorps %xmm5, %xmm5 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $2, %eax je .L45 ALIGN_4 .L42: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 1 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addsd %xmm2, %xmm5 movsd 2 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 movsd 3 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 3 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addsd %xmm2, %xmm5 movsd 4 * SIZE(BB), %xmm2 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $3, %eax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 1 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $1 * SIZE, 
BB decl %eax jg .L46 ALIGN_4 .L48: addsd %xmm5, %xmm4 movsd ALPHA_R, %xmm0 movsd ALPHA_I, %xmm1 movaps %xmm4, %xmm2 mulsd %xmm0, %xmm4 mulsd %xmm1, %xmm2 addsd 0 * SIZE(CO1), %xmm4 addsd 1 * SIZE(CO1), %xmm2 movlps %xmm4, 0 * SIZE(CO1) movlps %xmm2, 1 * SIZE(CO1) ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm3m_kernel_2x2_coppermine.S000066400000000000000000000333461313527062700230410ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define J 0 + STACK(%esp) #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #ifdef DOUBLE #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 24 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define B 36 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define LDC 44 + STACK + ARGS(%esp) #else #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 20 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define LDC 36 + STACK + ARGS(%esp) #endif #define PREFETCH_OFFSET 48 #if defined(PENTIUM3) || defined(PENTIUMM) #define REP rep #else #define REP rep #endif PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(TRMMKERNEL) && !defined(LEFT) movl OFFSET, %eax negl %eax movl %eax, KK #endif movl N, %eax # j = (n >> 1) # MEMORY movl LDC, %ebp # ldc # MEMORY movl B, %ebx sall $ZBASE_SHIFT, %ebp sarl $1, %eax leal 0(%ecx) , %ecx # NOP movl %eax, J # j = (n >> 1) # MEMORY test %eax, %eax je .L8 # if !(n >> 1) goto .L8 ALIGN_4 .L34: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl %ebx, BX movl M, %esi # m # MEMORY movl A, %edx # a # MEMORY movl C, %edi # C # MEMORY sarl $1, %esi # i = (m >> 1) je .L12 ALIGN_4 .MainHead: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl %ebx, %ecx #else movl KK, %eax leal (, %eax, SIZE), %eax leal (%edx, %eax, 2), %edx leal (%ebx, %eax, 2), %ecx #endif #ifdef HAVE_SSE movl BX, %eax prefetcht2 0 * SIZE(%eax) prefetcht2 4 * SIZE(%eax) #if L2_SIZE > 262144 subl $-8 * SIZE, BX #elif L2_SIZE > 131072 prefetcht2 8 * SIZE(%eax) prefetcht2 12 * SIZE(%eax) subl $-16 * SIZE, BX #else prefetcht2 16 * SIZE(%eax) prefetcht2 20 * SIZE(%eax) prefetcht2 24 * SIZE(%eax) prefetcht2 28 * SIZE(%eax) subl $-32 * SIZE, BX #endif #endif fldz fldz #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif fldz fldz FLD 4 * SIZE(%ecx) # b5 FLD 4 * SIZE(%edx) # a5 FLD 0 * SIZE(%ecx) # b1 FLD 0 * SIZE(%edx) # a1 #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(%edi) prefetchw 2 * SIZE(%edi, %ebp, 1) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(%edi) prefetchnta 2 * SIZE(%edi, %ebp, 1) #endif sarl $2, %eax je .L16 ALIGN_4 .MainLoop: #if defined(HAVE_3DNOW) prefetch (PREFETCH_OFFSET) * SIZE(%ecx) nop #elif defined(HAVE_SSE) prefetchnta (PREFETCH_OFFSET) * SIZE(%ecx) #ifdef CORE_KATMAI prefetcht0 (PREFETCH_OFFSET) * SIZE(%edx) #endif #endif fmul %st, %st(1) FMUL 1 * SIZE(%ecx) fxch %st(1) faddp %st, %st(4) FLD 0 * SIZE(%ecx) fxch %st(1) faddp %st, %st(5) FLD 1 * SIZE(%edx) fmul %st, %st(1) FMUL 1 * SIZE(%ecx) fxch %st(1) faddp %st, %st(6) FLD 2 * SIZE(%ecx) fxch %st(1) faddp %st, %st(7) FLD 2 * SIZE(%edx) fmul %st, %st(1) FMUL 3 * SIZE(%ecx) fxch %st(1) faddp %st, %st(4) FLD 2 * SIZE(%ecx) fxch %st(1) faddp %st, %st(5) FLD 3 * SIZE(%edx) fmul %st, %st(1) FMUL 3 * SIZE(%ecx) fxch %st(1) faddp %st, %st(6) FLD 8 * SIZE(%ecx) fxch %st(1) faddp %st, 
%st(7) FLD 8 * SIZE(%edx) fxch %st(2) #if !defined(HAVE_3DNOW) && defined(HAVE_SSE) && defined(DOUBLE) prefetchnta (PREFETCH_OFFSET + 4) * SIZE(%ecx) #ifdef CORE_KATMAI prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(%edx) #endif #endif fmul %st, %st(3) FMUL 5 * SIZE(%ecx) fxch %st(3) faddp %st, %st(4) FLD 4 * SIZE(%ecx) fxch %st(3) faddp %st, %st(5) FLD 5 * SIZE(%edx) fmul %st, %st(3) FMUL 5 * SIZE(%ecx) fxch %st(3) faddp %st, %st(6) FLD 6 * SIZE(%ecx) fxch %st(3) faddp %st, %st(7) FLD 6 * SIZE(%edx) fmul %st, %st(3) FMUL 7 * SIZE(%ecx) fxch %st(3) faddp %st, %st(4) FLD 6 * SIZE(%ecx) fxch %st(3) faddp %st, %st(5) FLD 7 * SIZE(%edx) fmul %st, %st(3) FMUL 7 * SIZE(%ecx) fxch %st(3) faddp %st, %st(6) FLD 12 * SIZE(%ecx) fxch %st(3) faddp %st, %st(7) FLD 12 * SIZE(%edx) fxch %st(2) subl $-8 * SIZE, %ecx subl $-8 * SIZE, %edx decl %eax # l -- jne .MainLoop ALIGN_4 .L16: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif and $3, %eax je .L21 ALIGN_4 .SubLoop: fmul %st, %st(1) FMUL 1 * SIZE(%ecx) fxch %st(1) faddp %st, %st(4) FLD 0 * SIZE(%ecx) fxch %st(1) faddp %st, %st(5) FLD 1 * SIZE(%edx) fmul %st, %st(1) FMUL 1 * SIZE(%ecx) fxch %st(1) faddp %st, %st(6) FLD 2 * SIZE(%ecx) fxch %st(1) faddp %st, %st(7) FLD 2 * SIZE(%edx) addl $2 * SIZE,%ecx addl $2 * SIZE,%edx decl %eax jne .SubLoop ALIGN_4 .L21: ffreep %st(0) ffreep %st(0) ffreep %st(0) ffreep %st(0) FLD ALPHA_I FLD ALPHA_R fld %st(2) fmul %st(1), %st FLD 0 * SIZE(%edi) faddp %st, %st(1) FST 0 * SIZE(%edi) fld %st(3) fmul %st(1), %st FLD 0 * SIZE(%edi, %ebp) faddp %st, %st(1) FST 0 * SIZE(%edi, %ebp) fld %st(4) fmul %st(1), %st FLD 2 * SIZE(%edi) faddp %st, %st(1) FST 2 * SIZE(%edi) fmul %st(5), %st FLD 2 * SIZE(%edi, %ebp) faddp %st, %st(1) FST 2 * SIZE(%edi, %ebp) fmul %st, %st(1) fmul %st, %st(2) fmul %st, %st(3) fmulp %st, %st(4) FLD 1 * SIZE(%edi) faddp %st, %st(1) FST 1 * SIZE(%edi) FLD 1 * SIZE(%edi, %ebp) faddp %st, %st(1) FST 1 * SIZE(%edi, %ebp) FLD 3 * SIZE(%edi) faddp %st, %st(1) FST 3 * SIZE(%edi) FLD 3 * SIZE(%edi, %ebp) faddp %st, %st(1) FST 3 * SIZE(%edi, %ebp) addl $4 * SIZE, %edi rep decl %esi # i -- rep jne .MainHead ALIGN_4 .L12: movl M, %eax # m # MEMORY andl $1, %eax je .L27 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl %ebx, %ecx #else movl KK, %eax leal (, %eax, SIZE), %eax leal (%edx, %eax, 1), %edx leal (%ebx, %eax, 2), %ecx #endif fldz fldz FLD 0 * SIZE(%edx) # temp1 = *(aoffset + 0) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $1,%eax # k >> 1 # MEMORY je .L54 ALIGN_4 .L55: FLD 0 * SIZE(%ecx) # temp2 = *(boffset + 0) rep fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(%ecx) # temp2 = *(boffset + 0) faddp %st, %st(2) FLD 1 * SIZE(%edx) # temp1 = *(aoffset + 0) FLD 2 * SIZE(%ecx) # temp2 = *(boffset + 0) rep fmul %st(1), %st faddp %st, %st(2) FMUL 3 * SIZE(%ecx) # temp2 = *(boffset + 0) faddp %st, %st(2) FLD 2 * SIZE(%edx) # temp1 = *(aoffset + 0) addl $2 * SIZE, %edx addl $4 * SIZE, %ecx decl %eax jne .L55 ALIGN_4 .L54: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $1,%eax # k & 1 je .L33 ALIGN_4 FLD 0 * SIZE(%ecx) # temp2 = *(boffset + 0) rep fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(%ecx) # temp2 = *(boffset + 0) faddp %st, %st(2) FLD 1 * SIZE(%edx) # temp1 = 
*(aoffset + 0) addl $1 * SIZE, %edx addl $2 * SIZE, %ecx ALIGN_4 .L33: ffreep %st(0) FLD ALPHA_I FLD ALPHA_R fld %st(2) fmul %st(1), %st FLD 0 * SIZE(%edi) faddp %st, %st(1) FST 0 * SIZE(%edi) fmul %st(3), %st FLD 0 * SIZE(%edi, %ebp) faddp %st, %st(1) FST 0 * SIZE(%edi, %ebp) fmul %st, %st(1) fmulp %st, %st(2) FLD 1 * SIZE(%edi) faddp %st, %st(1) FST 1 * SIZE(%edi) FLD 1 * SIZE(%edi, %ebp) faddp %st, %st(1) FST 1 * SIZE(%edi, %ebp) ALIGN_4 .L27: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif lea (, %ebp, 2), %eax addl %eax, C # C + 2 * ldc # MEMORY movl %ecx, %ebx # b # MEMORY decl J # j-- # MEMORY jne .L34 ALIGN_4 .L8: movl N, %eax # n # MEMORY andl $1, %eax je .End #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, %edi # c # MEMORY movl A, %edx # a # MEMORY movl M, %esi # m # MEMORY sarl $1, %esi # m >> 1 je .L36 ALIGN_4 .L46: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl %ebx, %ecx #else movl KK, %eax leal (, %eax, SIZE), %eax leal (%edx, %eax, 2), %edx leal (%ebx, %eax, 1), %ecx #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif fldz sarl $1, %eax fldz FLD 0 * SIZE(%ecx) # temp1 = *(boffset + 0) je .L56 ALIGN_4 .L57: FLD 0 * SIZE(%edx) # temp2 = *(aoffset + 0) fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(%edx) # temp2 = *(aoffset + 0) faddp %st, %st(2) FLD 1 * SIZE(%ecx) # temp1 = *(boffset + 0) FLD 2 * SIZE(%edx) # temp2 = *(aoffset + 0) fmul %st(1), %st faddp %st, %st(2) FMUL 3 * SIZE(%edx) # temp2 = *(aoffset + 0) faddp %st, %st(2) FLD 2 * SIZE(%ecx) # temp1 = *(boffset + 0) addl $4 * SIZE,%edx addl $2 * SIZE,%ecx dec %eax jne .L57 ALIGN_4 .L56: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $1, %eax je .L45 ALIGN_4 FLD 0 * SIZE(%edx) # temp2 = *(aoffset + 0) fmul %st(1), %st faddp %st, %st(2) FMUL 1 * SIZE(%edx) # temp2 = *(aoffset + 0) faddp %st, %st(2) FLD 3 * SIZE(%ecx) # temp1 = *(boffset + 0) addl $2 * SIZE,%edx addl $1 * SIZE,%ecx ALIGN_4 .L45: ffreep %st(0) FLD ALPHA_I FLD ALPHA_R fld %st(2) fmul %st(1), %st FLD 0 * SIZE(%edi) faddp %st, %st(1) FST 0 * SIZE(%edi) fmul %st(3), %st FLD 2 * SIZE(%edi) faddp %st, %st(1) FST 2 * SIZE(%edi) fmul %st, %st(1) fmulp %st, %st(2) FLD 1 * SIZE(%edi) faddp %st, %st(1) FST 1 * SIZE(%edi) FLD 3 * SIZE(%edi) faddp %st, %st(1) FST 3 * SIZE(%edi) addl $4 * SIZE, %edi decl %esi # i -- jne .L46 ALIGN_4 .L36: movl M, %eax # m # MEMORY andl $1, %eax # m & 1 je .End #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl %ebx, %ecx #else movl KK, %eax leal (, %eax, SIZE), %eax leal (%edx, %eax, 1), %edx leal (%ebx, %eax, 1), %ecx #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif fldz ALIGN_3 .L51: FLD (%edx) FMUL (%ecx) addl $1 * SIZE,%edx addl $1 * SIZE,%ecx faddp %st,%st(1) decl %eax jne .L51 FLD ALPHA_I FLD ALPHA_R fmul %st(2), %st FLD 0 * SIZE(%edi) faddp %st, %st(1) FST 0 * SIZE(%edi) fmulp %st, %st(1) FLD 1 * SIZE(%edi) faddp %st, %st(1) FST 1 * 
SIZE(%edi) ALIGN_4 .End: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm3m_kernel_2x4_barcelona.S000066400000000000000000000627101313527062700226250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
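The x87 kernel that ends above and the SSE2 kernels that follow share one GEMM3M micro-kernel contract: the inner loops accumulate a purely real block product of the packed A and B panels, and the complex scalar alpha = (ALPHA_R, ALPHA_I) is applied only at the write-back, where each real sum updates the interleaved (real, imaginary) pair of one element of C. A minimal scalar model of that contract is sketched below; the names, the dense column-major layout of a and b, and the lack of register blocking are simplifying assumptions, not the actual kernel interface, which consumes packed panels in fixed 2x2 / 2x4 blockings.

#include <stddef.h>

/* Scalar model of the GEMM3M micro-kernel contract (illustrative only).
 * The inner product is purely real; the complex alpha touches C only
 * at the write-back into interleaved (re, im) storage.                */
static void zgemm3m_kernel_ref(size_t m, size_t n, size_t k,
                               double alpha_r, double alpha_i,
                               const double *a,   /* m x k, column-major */
                               const double *b,   /* k x n, column-major */
                               double *c,         /* interleaved re/im   */
                               size_t ldc)        /* in complex elements */
{
    for (size_t j = 0; j < n; j++) {
        for (size_t i = 0; i < m; i++) {
            double s = 0.0;                        /* purely real accumulation */
            for (size_t l = 0; l < k; l++)
                s += a[i + l * m] * b[l + j * k];
            c[2 * (i + j * ldc) + 0] += alpha_r * s;   /* real part      */
            c[2 * (i + j * ldc) + 1] += alpha_i * s;   /* imaginary part */
        }
    }
}

The three real multiplications of the 3M scheme are orchestrated by the level-3 driver; each kernel in these files only ever sees real panels and a complex alpha.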
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define OLD_B 36 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define OLD_LDC 44 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #define B %edi #define LDC %ebp #define AO %edx #define BO %ecx #define CO %esi #define I %ebx #define movsd movlps #define movapd movups #define movlpd movlps #define movhpd movhps #define PREFETCH prefetch #define PREFETCHSIZE (8 * 7 + 0) #define KERNEL1(address) \ mulpd %xmm0, %xmm1; \ mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \ addpd %xmm1, %xmm4; \ movapd -12 * SIZE(BO, %eax, 4), %xmm1; \ addpd %xmm0, %xmm5; \ movddup -15 * SIZE(AO, %eax, 2), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \ addpd %xmm0, %xmm7; \ movddup -14 * SIZE(AO, %eax, 2), %xmm0 #define KERNEL2(address) \ addpd %xmm2, %xmm6; \ movapd %xmm1, %xmm2; \ mulpd %xmm0, %xmm1; \ mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \ addpd %xmm1, %xmm4; \ movapd -8 * SIZE(BO, %eax, 4), %xmm1; \ addpd %xmm0, %xmm5; \ movddup -13 * SIZE(AO, %eax, 2), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \ addpd %xmm0, %xmm7; \ movddup -12 * SIZE(AO, %eax, 2), %xmm0 #define KERNEL3(address) \ addpd %xmm2, %xmm6; \ movapd %xmm1, %xmm2; \ mulpd %xmm0, %xmm1; \ mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \ addpd %xmm1, %xmm4; \ movapd -4 * SIZE(BO, %eax, 4), %xmm1; \ addpd %xmm0, %xmm5; \ movddup -11 * SIZE(AO, %eax, 2), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \ addpd %xmm0, %xmm7; \ movddup -10 * SIZE(AO, %eax, 2), %xmm0 #define KERNEL4(address) \ addpd %xmm2, %xmm6; \ movapd %xmm1, %xmm2; \ mulpd %xmm0, %xmm1; \ mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \ addpd %xmm1, %xmm4; \ movapd (BO, %eax, 4), %xmm1; \ addpd %xmm0, %xmm5; \ movddup -9 * SIZE(AO, %eax, 2), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \ addpd %xmm0, %xmm7; \ movddup (AO, %eax, 2), %xmm0 #define KERNEL5(address) \ addpd %xmm2, %xmm6; \ movapd %xmm1, %xmm2; \ mulpd %xmm3, %xmm1; \ mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \ addpd %xmm1, %xmm4; \ movapd 4 * SIZE(BO, %eax, 4), %xmm1; \ addpd %xmm3, %xmm5; \ movddup -7 * SIZE(AO, %eax, 2), %xmm3; \ mulpd %xmm3, %xmm2; \ mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \ addpd %xmm3, %xmm7; \ movddup -6 * SIZE(AO, %eax, 2), %xmm3 #define KERNEL6(address) \ addpd %xmm2, %xmm6; \ movapd %xmm1, %xmm2; \ mulpd %xmm3, %xmm1; \ mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \ addpd %xmm1, %xmm4; \ movapd 8 * SIZE(BO, %eax, 4), %xmm1; \ addpd %xmm3, %xmm5; \ movddup -5 * SIZE(AO, %eax, 2), %xmm3; \ mulpd %xmm3, %xmm2; \ mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \ addpd %xmm3, %xmm7; \ movddup -4 * SIZE(AO, %eax, 2), %xmm3 #define KERNEL7(address) \ addpd %xmm2, %xmm6; \ movapd %xmm1, %xmm2; \ mulpd %xmm3, %xmm1; \ mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \ addpd %xmm1, %xmm4; \ movapd 12 * SIZE(BO, %eax, 4), %xmm1; \ addpd %xmm3, %xmm5; \ movddup -3 * SIZE(AO, %eax, 2), %xmm3; \ mulpd %xmm3, %xmm2; \ mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \ addpd %xmm3, %xmm7; \ movddup -2 * SIZE(AO, %eax, 2), %xmm3 #define KERNEL8(address) \ addpd %xmm2, %xmm6; \ movapd %xmm1, %xmm2; \ mulpd %xmm3, %xmm1; \ mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \ addpd %xmm1, %xmm4; \ movapd 16 * SIZE(BO, 
%eax, 4), %xmm1; \ addpd %xmm3, %xmm5; \ movddup -1 * SIZE(AO, %eax, 2), %xmm3; \ mulpd %xmm3, %xmm2; \ mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \ addpd %xmm3, %xmm7; \ movddup 8 * SIZE(AO, %eax, 2), %xmm3; \ addpd %xmm2, %xmm6; \ movapd %xmm1, %xmm2 PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl OLD_B, B movl OLD_LDC, LDC #ifdef TRMMKERNEL movl OFFSET, %eax #ifndef LEFT negl %eax #endif movl %eax, KK #endif subl $-16 * SIZE, A subl $-16 * SIZE, B sall $ZBASE_SHIFT, LDC movl N, %eax sarl $2, %eax movl %eax, J jle .L30 ALIGN_2 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif leal GEMM_DEFAULT_Q * GEMM_DEFAULT_UNROLL_N * SIZE(B), %eax movl %eax, BX movl C, CO # coffset = c movl A, AO # aoffset = a movl M, I sarl $1, I # i = (m >> 2) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (B, %eax, 4), BO #endif movddup -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm1 pxor %xmm4, %xmm4 movddup -8 * SIZE(AO), %xmm3 leal (LDC, LDC, 2), %eax prefetchw 1 * SIZE(CO) pxor %xmm5, %xmm5 prefetchw 3 * SIZE(CO, LDC) pxor %xmm6, %xmm6 prefetchw 1 * SIZE(CO, LDC, 2) pxor %xmm7, %xmm7 prefetchw 3 * SIZE(CO, %eax) movapd %xmm1, %xmm2 movl BX, %eax prefetch -16 * SIZE(%eax) addl $8 * SIZE, %eax movl %eax, BX #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $4, %eax #endif movl %eax, KKK #endif andl $-8, %eax leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (BO, %eax, 4), BO negl %eax NOBRANCH je .L15 ALIGN_3 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax BRANCH jl .L12 ALIGN_3 .L15: movups ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax # if (k & 1) je .L18 leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (BO, %eax, 4), BO negl %eax ALIGN_3 .L17: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BO, %eax, 4), %xmm0 addpd %xmm1, %xmm4 movapd -12 * SIZE(BO, %eax, 4), %xmm1 addpd %xmm0, %xmm5 
movddup -15 * SIZE(AO, %eax, 2), %xmm0 mulpd %xmm0, %xmm2 mulpd -14 * SIZE(BO, %eax, 4), %xmm0 addpd %xmm0, %xmm7 movddup -14 * SIZE(AO, %eax, 2), %xmm0 addpd %xmm2, %xmm6 movapd %xmm1, %xmm2 addl $SIZE, %eax jl .L17 ALIGN_4 .L18: leal (CO, LDC, 2), %eax movsd 0 * SIZE(CO), %xmm0 movhps 1 * SIZE(CO), %xmm0 movsd 0 * SIZE(CO, LDC), %xmm1 movhps 1 * SIZE(CO, LDC), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(CO) movhps %xmm0, 1 * SIZE(CO) movlps %xmm1, 0 * SIZE(CO, LDC) movhps %xmm1, 1 * SIZE(CO, LDC) movsd 2 * SIZE(CO), %xmm0 movhps 3 * SIZE(CO), %xmm0 movsd 2 * SIZE(CO, LDC), %xmm1 movhps 3 * SIZE(CO, LDC), %xmm1 pshufd $0x44, %xmm6, %xmm2 unpckhpd %xmm6, %xmm6 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm1 movlps %xmm0, 2 * SIZE(CO) movhps %xmm0, 3 * SIZE(CO) movlps %xmm1, 2 * SIZE(CO, LDC) movhps %xmm1, 3 * SIZE(CO, LDC) movsd 0 * SIZE(%eax), %xmm0 movhps 1 * SIZE(%eax), %xmm0 movsd 0 * SIZE(%eax, LDC), %xmm1 movhps 1 * SIZE(%eax, LDC), %xmm1 pshufd $0x44, %xmm5, %xmm2 unpckhpd %xmm5, %xmm5 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm5 addpd %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%eax) movhps %xmm0, 1 * SIZE(%eax) movlps %xmm1, 0 * SIZE(%eax, LDC) movhps %xmm1, 1 * SIZE(%eax, LDC) movsd 2 * SIZE(%eax), %xmm0 movhps 3 * SIZE(%eax), %xmm0 movsd 2 * SIZE(%eax, LDC), %xmm1 movhps 3 * SIZE(%eax, LDC), %xmm1 pshufd $0x44, %xmm7, %xmm2 unpckhpd %xmm7, %xmm7 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm7 addpd %xmm7, %xmm1 movlps %xmm0, 2 * SIZE(%eax) movhps %xmm0, 3 * SIZE(%eax) movlps %xmm1, 2 * SIZE(%eax, LDC) movhps %xmm1, 3 * SIZE(%eax, LDC) addl $4 * SIZE, %esi # coffset += 2 decl I # i -- jg .L11 ALIGN_4 .L20: movl M, I testl $1, I # i = (m >> 2) jle .L29 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AO, %eax, 1), AO leal (B, %eax, 4), BO #endif movddup -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm1 movddup -8 * SIZE(AO), %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: mulpd %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd -14 * SIZE(BO), %xmm0 addpd %xmm1, %xmm4 movapd -12 * SIZE(BO), %xmm1 addpd %xmm0, %xmm5 movddup -15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd -10 * SIZE(BO), %xmm0 addpd %xmm1, %xmm6 movapd -8 * SIZE(BO), %xmm1 addpd %xmm0, %xmm7 movddup -14 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd -6 * SIZE(BO), %xmm0 addpd %xmm1, %xmm4 movapd -4 * SIZE(BO), %xmm1 addpd %xmm0, %xmm5 movddup -13 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd -2 * SIZE(BO), %xmm0 addpd %xmm1, %xmm6 movapd (BO), %xmm1 addpd %xmm0, %xmm7 movddup -12 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd 2 * SIZE(BO), %xmm0 addpd %xmm1, %xmm4 movapd 4 * SIZE(BO), %xmm1 addpd %xmm0, %xmm5 movddup -11 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd 6 * SIZE(BO), %xmm0 addpd %xmm1, %xmm6 movapd 8 * SIZE(BO), %xmm1 addpd %xmm0, %xmm7 movddup -10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd 10 * SIZE(BO), %xmm0 addpd %xmm1, %xmm4 movapd 12 * SIZE(BO), %xmm1 addpd %xmm0, %xmm5 movddup -9 * 
SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd 14 * SIZE(BO), %xmm0 addpd %xmm1, %xmm6 movapd 16 * SIZE(BO), %xmm1 addpd %xmm0, %xmm7 movddup -8 * SIZE(AO), %xmm0 subl $ -8 * SIZE, AO subl $-32 * SIZE, BO decl %eax jne .L22 ALIGN_4 .L25: movups ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L28 .L26: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BO), %xmm0 addpd %xmm1, %xmm4 movapd -12 * SIZE(BO), %xmm1 addpd %xmm0, %xmm5 movddup -15 * SIZE(AO), %xmm0 addl $1 * SIZE, AO addl $4 * SIZE, BO decl %eax jg .L26 ALIGN_4 .L28: leal (CO, LDC, 2), %eax addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 movsd 0 * SIZE(CO), %xmm0 movhps 1 * SIZE(CO), %xmm0 movsd 0 * SIZE(CO, LDC), %xmm1 movhps 1 * SIZE(CO, LDC), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(CO) movhps %xmm0, 1 * SIZE(CO) movlps %xmm1, 0 * SIZE(CO, LDC) movhps %xmm1, 1 * SIZE(CO, LDC) movsd 0 * SIZE(%eax), %xmm0 movhps 1 * SIZE(%eax), %xmm0 movsd 0 * SIZE(%eax, LDC), %xmm1 movhps 1 * SIZE(%eax, LDC), %xmm1 pshufd $0x44, %xmm5, %xmm2 unpckhpd %xmm5, %xmm5 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm5 addpd %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%eax) movhps %xmm0, 1 * SIZE(%eax) movlps %xmm1, 0 * SIZE(%eax, LDC) movhps %xmm1, 1 * SIZE(%eax, LDC) ALIGN_4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif movl BO, B leal (, LDC, 4), %eax addl %eax, C # c += 4 * ldc decl J # j -- jg .L01 ALIGN_4 .L30: testl $2, N je .L60 ALIGN_2 .L31: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, CO # coffset = c movl A, AO # aoffset = a movl M, I sarl $1, I # i = (m >> 2) jle .L50 ALIGN_4 .L41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (B, %eax, 2), BO #endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm4, %xmm4 prefetchw 1 * SIZE(CO) pxor %xmm5, %xmm5 prefetchw 1 * SIZE(CO, LDC) pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L45 ALIGN_4 .L42: prefetcht0 (PREFETCHSIZE + 0) * SIZE(AO) mulpd -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(AO), %xmm1 addpd %xmm0, %xmm4 mulpd -16 * SIZE(BO), %xmm1 movddup -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm5 mulpd -14 * SIZE(BO), %xmm0 movddup -13 * SIZE(AO), %xmm1 addpd %xmm0, %xmm6 mulpd -14 * SIZE(BO), %xmm1 movddup -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm7 mulpd -12 * SIZE(BO), %xmm0 movddup -11 * SIZE(AO), %xmm1 addpd %xmm0, %xmm4 mulpd -12 * SIZE(BO), %xmm1 movddup -10 * SIZE(AO), %xmm0 addpd %xmm1, %xmm5 mulpd -10 * SIZE(BO), %xmm0 movddup -9 * SIZE(AO), %xmm1 addpd %xmm0, %xmm6 mulpd -10 * SIZE(BO), %xmm1 movddup -8 * SIZE(AO), %xmm0 addpd %xmm1, %xmm7 prefetcht0 (PREFETCHSIZE + 8) * SIZE(AO) mulpd -8 * SIZE(BO), %xmm0 movddup -7 * SIZE(AO), %xmm1 addpd %xmm0, %xmm4 mulpd -8 * SIZE(BO), %xmm1 movddup -6 * SIZE(AO), %xmm0 addpd %xmm1, %xmm5 mulpd -6 * SIZE(BO), %xmm0 movddup -5 * SIZE(AO), %xmm1 addpd %xmm0, %xmm6 mulpd -6 * SIZE(BO), %xmm1 movddup -4 * SIZE(AO), %xmm0 addpd %xmm1, %xmm7 mulpd -4 * SIZE(BO), %xmm0 movddup -3 * SIZE(AO), %xmm1 addpd %xmm0, %xmm4 mulpd -4 * 
SIZE(BO), %xmm1 movddup -2 * SIZE(AO), %xmm0 addpd %xmm1, %xmm5 mulpd -2 * SIZE(BO), %xmm0 movddup -1 * SIZE(AO), %xmm1 addpd %xmm0, %xmm6 mulpd -2 * SIZE(BO), %xmm1 movddup 0 * SIZE(AO), %xmm0 addpd %xmm1, %xmm7 subl $-16 * SIZE, AO subl $-16 * SIZE, BO decl %eax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movups ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulpd -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(AO), %xmm1 addpd %xmm0, %xmm4 mulpd -16 * SIZE(BO), %xmm1 movddup -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm5 addl $2 * SIZE, AO addl $2 * SIZE, BO decl %eax jg .L46 ALIGN_4 .L48: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 movsd 0 * SIZE(CO), %xmm0 movhps 1 * SIZE(CO), %xmm0 movsd 0 * SIZE(CO, LDC), %xmm1 movhps 1 * SIZE(CO, LDC), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(CO) movhps %xmm0, 1 * SIZE(CO) movlps %xmm1, 0 * SIZE(CO, LDC) movhps %xmm1, 1 * SIZE(CO, LDC) movsd 2 * SIZE(CO), %xmm0 movhps 3 * SIZE(CO), %xmm0 movsd 2 * SIZE(CO, LDC), %xmm1 movhps 3 * SIZE(CO, LDC), %xmm1 pshufd $0x44, %xmm5, %xmm2 unpckhpd %xmm5, %xmm5 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm5 addpd %xmm5, %xmm1 movlps %xmm0, 2 * SIZE(CO) movhps %xmm0, 3 * SIZE(CO) movlps %xmm1, 2 * SIZE(CO, LDC) movhps %xmm1, 3 * SIZE(CO, LDC) addl $4 * SIZE, %esi # coffset += 2 decl I # i -- jg .L41 ALIGN_4 .L50: movl M, I testl $1, I # i = (m >> 2) jle .L59 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AO, %eax, 1), AO leal (B, %eax, 2), BO #endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L55 ALIGN_4 .L52: mulpd -16 * SIZE(BO), %xmm0 addpd %xmm0, %xmm4 movddup -15 * SIZE(AO), %xmm0 mulpd -14 * SIZE(BO), %xmm0 addpd %xmm0, %xmm4 movddup -14 * SIZE(AO), %xmm0 mulpd -12 * SIZE(BO), %xmm0 addpd %xmm0, %xmm4 movddup -13 * SIZE(AO), %xmm0 mulpd -10 * SIZE(BO), %xmm0 addpd %xmm0, %xmm4 movddup -12 * SIZE(AO), %xmm0 mulpd -8 * SIZE(BO), %xmm0 addpd %xmm0, %xmm4 movddup -11 * SIZE(AO), %xmm0 mulpd -6 * SIZE(BO), %xmm0 addpd %xmm0, %xmm4 movddup -10 * SIZE(AO), %xmm0 mulpd -4 * SIZE(BO), %xmm0 addpd %xmm0, %xmm4 movddup -9 * SIZE(AO), %xmm0 mulpd -2 * SIZE(BO), %xmm0 addpd %xmm0, %xmm4 movddup -8 * SIZE(AO), %xmm0 subl $ -8 * SIZE, AO subl $-16 * SIZE, BO decl %eax jne .L52 ALIGN_4 .L55: movups ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L58 .L56: mulpd -16 * SIZE(BO), %xmm0 addpd %xmm0, %xmm4 movddup -15 * SIZE(AO), %xmm0 subl $-1 * SIZE, AO subl $-2 * SIZE, BO decl %eax jg .L56 ALIGN_4 .L58: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 addpd %xmm5, %xmm4 movsd 0 * SIZE(CO), %xmm0 movhps 1 * SIZE(CO), %xmm0 movsd 0 * SIZE(CO, LDC), %xmm1 movhps 1 * SIZE(CO, LDC), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(CO) movhps %xmm0, 1 * SIZE(CO) movlps %xmm1, 0 * SIZE(CO, LDC) movhps %xmm1, 1 * SIZE(CO, LDC) 
ALIGN_4 .L59: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif movl BO, B leal (, LDC, 2), %eax addl %eax, C # c += 4 * ldc ALIGN_4 .L60: testl $1, N je .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, CO # coffset = c movl A, AO # aoffset = a movl M, I sarl $1, I # i = (m >> 2) jle .L80 ALIGN_4 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (B, %eax, 1), BO #endif movddup -16 * SIZE(BO), %xmm0 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 prefetchw 1 * SIZE(CO) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd -16 * SIZE(AO), %xmm0 addpd %xmm0, %xmm4 movddup -15 * SIZE(BO), %xmm0 mulpd -14 * SIZE(AO), %xmm0 addpd %xmm0, %xmm4 movddup -14 * SIZE(BO), %xmm0 mulpd -12 * SIZE(AO), %xmm0 addpd %xmm0, %xmm4 movddup -13 * SIZE(BO), %xmm0 mulpd -10 * SIZE(AO), %xmm0 addpd %xmm0, %xmm4 movddup -12 * SIZE(BO), %xmm0 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd -8 * SIZE(AO), %xmm0 addpd %xmm0, %xmm4 movddup -11 * SIZE(BO), %xmm0 mulpd -6 * SIZE(AO), %xmm0 addpd %xmm0, %xmm4 movddup -10 * SIZE(BO), %xmm0 mulpd -4 * SIZE(AO), %xmm0 addpd %xmm0, %xmm4 movddup -9 * SIZE(BO), %xmm0 mulpd -2 * SIZE(AO), %xmm0 addpd %xmm0, %xmm4 movddup -8 * SIZE(BO), %xmm0 subl $-16 * SIZE, AO subl $ -8 * SIZE, BO decl %eax jne .L72 ALIGN_4 .L75: movups ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: mulpd -16 * SIZE(AO), %xmm0 addpd %xmm0, %xmm4 movddup -15 * SIZE(BO), %xmm0 addl $2 * SIZE, AO addl $1 * SIZE, BO decl %eax jg .L76 ALIGN_4 .L78: movsd 0 * SIZE(CO), %xmm0 movhps 1 * SIZE(CO), %xmm0 movsd 2 * SIZE(CO), %xmm1 movhps 3 * SIZE(CO), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(CO) movhps %xmm0, 1 * SIZE(CO) movlps %xmm1, 2 * SIZE(CO) movhps %xmm1, 3 * SIZE(CO) addl $4 * SIZE, %esi # coffset += 2 decl I # i -- jg .L71 ALIGN_4 .L80: movl M, I testl $1, I # i = (m >> 2) jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AO, %eax, 1), AO leal (B, %eax, 1), BO #endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L85 ALIGN_4 .L82: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd -16 * SIZE(BO), %xmm0 addpd %xmm0, %xmm4 movapd -14 * SIZE(AO), %xmm0 mulpd -14 * SIZE(BO), %xmm0 addpd %xmm0, %xmm5 movapd -12 * SIZE(AO), %xmm0 mulpd -12 * SIZE(BO), %xmm0 addpd %xmm0, %xmm6 movapd -10 * SIZE(AO), %xmm0 mulpd -10 * SIZE(BO), %xmm0 addpd %xmm0, %xmm7 movapd -8 * SIZE(AO), %xmm0 subl $-8 * SIZE, AO subl 
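In the SSE2 kernel that ends above (and again in the Opteron variant that follows), the write-back blocks at labels such as .L18 and .L48 apply alpha with a pshufd $0x44 / unpckhpd / mulpd / addpd sequence: each 64-bit lane of an accumulator register is broadcast into both lanes, multiplied by the packed pair (ALPHA_R, ALPHA_I) held in ALPHA, and added to the (real, imaginary) pair of one complex element of C. The intrinsics sketch below mirrors that step; the helper name and the two explicit destination pointers (standing in for the CO / CO + LDC addressing) are assumptions for illustration.

#include <emmintrin.h>   /* SSE2 */

/* One accumulator register holds two real sums (s_lo, s_hi); each sum
 * updates a different interleaved complex element of C.  Illustrative
 * helper only; the kernels above do this inline.                      */
static inline void gemm3m_store_pair(double *c0, double *c1,
                                     __m128d acc,    /* (s_lo, s_hi)       */
                                     __m128d alpha)  /* (ALPHA_R, ALPHA_I) */
{
    __m128d s_lo = _mm_unpacklo_pd(acc, acc);        /* pshufd $0x44: (s_lo, s_lo) */
    __m128d s_hi = _mm_unpackhi_pd(acc, acc);        /* unpckhpd:     (s_hi, s_hi) */

    __m128d e0 = _mm_loadu_pd(c0);                   /* (Re, Im) of first element  */
    __m128d e1 = _mm_loadu_pd(c1);                   /* (Re, Im) of second element */

    e0 = _mm_add_pd(e0, _mm_mul_pd(s_lo, alpha));    /* Re += ALPHA_R*s, Im += ALPHA_I*s */
    e1 = _mm_add_pd(e1, _mm_mul_pd(s_hi, alpha));

    _mm_storeu_pd(c0, e0);
    _mm_storeu_pd(c1, e1);
}

Because C is stored as interleaved doubles and no alignment of C is assumed, the movsd/movhps load pairs in the assembly correspond to the unaligned _mm_loadu_pd/_mm_storeu_pd used here.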
$-8 * SIZE, BO decl %eax jne .L82 ALIGN_4 .L85: movups ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L88 .L86: mulsd -16 * SIZE(BO), %xmm0 addsd %xmm0, %xmm4 movsd -15 * SIZE(AO), %xmm0 addl $1 * SIZE, AO addl $1 * SIZE, BO decl %eax jg .L86 ALIGN_4 .L88: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 addpd %xmm6, %xmm4 haddpd %xmm4, %xmm4 movsd 0 * SIZE(CO), %xmm0 movhps 1 * SIZE(CO), %xmm0 unpcklpd %xmm4, %xmm4 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm0 movlps %xmm0, 0 * SIZE(CO) movhps %xmm0, 1 * SIZE(CO) ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm3m_kernel_2x4_opteron.S000066400000000000000000001121241313527062700223600ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define OLD_M 4 + STACK + ARGS(%esi) #define OLD_N 8 + STACK + ARGS(%esi) #define OLD_K 12 + STACK + ARGS(%esi) #define OLD_ALPHA_R 16 + STACK + ARGS(%esi) #define OLD_ALPHA_I 24 + STACK + ARGS(%esi) #define OLD_A 32 + STACK + ARGS(%esi) #define OLD_B 36 + STACK + ARGS(%esi) #define OLD_C 40 + STACK + ARGS(%esi) #define OLD_LDC 44 + STACK + ARGS(%esi) #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define BX 40(%esp) #define OLD_STACK 44(%esp) #define OFFSET 48(%esp) #define KK 52(%esp) #define KKK 56(%esp) #define BUFFER 128(%esp) #if defined(OPTERON) || defined(BARCELONA) #define movsd movlpd #endif #if defined(OPTERON) || defined(BARCELONA) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif #define AA %edx #define BB %ecx #define LDC %ebp #define KERNEL1(address) \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm4; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm3, %xmm6; \ movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm0, %xmm7; \ movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm3, %xmm6; \ movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm0, %xmm7; \ movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm2, %xmm6; \ movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm1, %xmm7; \ movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 44 * SIZE + 
(address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm2, %xmm6; \ movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm1, %xmm7; \ movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE, %esp andl $-1024, %esp # align stack STACK_TOUCHING movl OLD_M, %ebx movl OLD_N, %eax movl OLD_K, %ecx movl OLD_A, %edx movsd OLD_ALPHA_R, %xmm0 movhps OLD_ALPHA_I, %xmm0 movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK #ifdef TRMMKERNEL movss OLD_OFFT, %xmm4 #endif movl OLD_B, %edi movl OLD_C, %ebx movapd %xmm0, ALPHA movl %ebx, C movl OLD_LDC, LDC #ifdef TRMMKERNEL movss %xmm4, OFFSET movss %xmm4, KK #ifndef LEFT negl KK #endif #endif sall $ZBASE_SHIFT, LDC sarl $2, %eax movl %eax, J jle .L30 ALIGN_2 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ movl K, %eax leal BUFFER, %ecx sarl $1, %eax jle .L05 ALIGN_4 .L02: #define COPYPREFETCH 40 prefetchnta (COPYPREFETCH) * SIZE(%edi) movq 0 * SIZE(%edi), %mm0 movq 1 * SIZE(%edi), %mm1 movq 2 * SIZE(%edi), %mm2 movq 3 * SIZE(%edi), %mm3 movq 4 * SIZE(%edi), %mm4 movq 5 * SIZE(%edi), %mm5 movq 6 * SIZE(%edi), %mm6 movq 7 * SIZE(%edi), %mm7 movq %mm0, 0 * SIZE(%ecx) movq %mm0, 1 * SIZE(%ecx) movq %mm1, 2 * SIZE(%ecx) movq %mm1, 3 * SIZE(%ecx) movq %mm2, 4 * SIZE(%ecx) movq %mm2, 5 * SIZE(%ecx) movq %mm3, 6 * SIZE(%ecx) movq %mm3, 7 * SIZE(%ecx) movq %mm4, 8 * SIZE(%ecx) movq %mm4, 9 * SIZE(%ecx) movq %mm5, 10 * SIZE(%ecx) movq %mm5, 11 * SIZE(%ecx) movq %mm6, 12 * SIZE(%ecx) movq %mm6, 13 * SIZE(%ecx) movq %mm7, 14 * SIZE(%ecx) movq %mm7, 15 * SIZE(%ecx) addl $ 8 * SIZE, %edi addl $16 * SIZE, %ecx decl %eax jne .L02 ALIGN_2 .L05: movl K, %eax andl $1, %eax BRANCH jle .L10 movq 0 * SIZE(%edi), %mm0 movq 1 * SIZE(%edi), %mm1 movq 2 * SIZE(%edi), %mm2 movq 3 * SIZE(%edi), %mm3 movq %mm0, 0 * SIZE(%ecx) movq %mm0, 1 * SIZE(%ecx) movq %mm1, 2 * SIZE(%ecx) movq %mm1, 3 * SIZE(%ecx) movq %mm2, 4 * SIZE(%ecx) movq %mm2, 5 * SIZE(%ecx) movq %mm3, 6 * SIZE(%ecx) movq %mm3, 7 * SIZE(%ecx) addl $4 * SIZE, %edi ALIGN_4 .L10: movl %edi, BX movl C, %esi # coffset = c movl A, AA # aoffset = a movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, 
%eax, 8), BB #endif movl BX, %eax prefetchnta 0 * SIZE(%eax) prefetchnta 8 * SIZE(%eax) subl $-8 * SIZE, BX pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movapd 0 * SIZE(AA), %xmm0 movapd 8 * SIZE(AA), %xmm1 movapd 0 * SIZE(BB), %xmm2 movapd 8 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax prefetchw 1 * SIZE(%esi) prefetchw 1 * SIZE(%esi, LDC) prefetchw 1 * SIZE(%esi, LDC, 2) prefetchw 1 * SIZE(%esi, %eax) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $4, %eax #endif movl %eax, KKK #endif #if 1 andl $-8, %eax sall $4, %eax je .L15 .L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) cmpl $128 * 1, %eax jle .L12 KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) cmpl $128 * 2, %eax jle .L12 KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) cmpl $128 * 3, %eax jle .L12 KERNEL1(16 * 3) KERNEL2(16 * 3) KERNEL3(16 * 3) KERNEL4(16 * 3) KERNEL5(16 * 3) KERNEL6(16 * 3) KERNEL7(16 * 3) KERNEL8(16 * 3) cmpl $128 * 4, %eax jle .L12 KERNEL1(16 * 4) KERNEL2(16 * 4) KERNEL3(16 * 4) KERNEL4(16 * 4) KERNEL5(16 * 4) KERNEL6(16 * 4) KERNEL7(16 * 4) KERNEL8(16 * 4) cmpl $128 * 5, %eax jle .L12 KERNEL1(16 * 5) KERNEL2(16 * 5) KERNEL3(16 * 5) KERNEL4(16 * 5) KERNEL5(16 * 5) KERNEL6(16 * 5) KERNEL7(16 * 5) KERNEL8(16 * 5) cmpl $128 * 6, %eax jle .L12 KERNEL1(16 * 6) KERNEL2(16 * 6) KERNEL3(16 * 6) KERNEL4(16 * 6) KERNEL5(16 * 6) KERNEL6(16 * 6) KERNEL7(16 * 6) KERNEL8(16 * 6) cmpl $128 * 7, %eax jle .L12 KERNEL1(16 * 7) KERNEL2(16 * 7) KERNEL3(16 * 7) KERNEL4(16 * 7) KERNEL5(16 * 7) KERNEL6(16 * 7) KERNEL7(16 * 7) KERNEL8(16 * 7) addl $128 * 4 * SIZE, BB addl $128 * 1 * SIZE, AA subl $128 * 8, %eax jg .L1X jmp .L15 .L12: leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB ALIGN_4 #else sarl $3, %eax je .L15 ALIGN_4 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $64 * SIZE, BB addl $16 * SIZE, AA decl %eax jne .L12 ALIGN_4 #endif .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movapd 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 8 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: leal (LDC, LDC, 2), %eax movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 movsd 2 * SIZE(%esi), %xmm1 movhps 3 * SIZE(%esi), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) movlps %xmm1, 2 * SIZE(%esi) movhps %xmm1, 3 * SIZE(%esi) movsd 0 * SIZE(%esi, LDC), %xmm0 movhps 1 * SIZE(%esi, LDC), %xmm0 movsd 2 * SIZE(%esi, LDC), %xmm1 movhps 3 * SIZE(%esi, LDC), %xmm1 pshufd $0x44, %xmm5, %xmm2 unpckhpd %xmm5, %xmm5 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm5 addpd %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC) movhps %xmm0, 1 * SIZE(%esi, LDC) movlps %xmm1, 2 * 
SIZE(%esi, LDC) movhps %xmm1, 3 * SIZE(%esi, LDC) movsd 0 * SIZE(%esi, LDC, 2), %xmm0 movhps 1 * SIZE(%esi, LDC, 2), %xmm0 movsd 2 * SIZE(%esi, LDC, 2), %xmm1 movhps 3 * SIZE(%esi, LDC, 2), %xmm1 pshufd $0x44, %xmm6, %xmm2 unpckhpd %xmm6, %xmm6 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC, 2) movhps %xmm0, 1 * SIZE(%esi, LDC, 2) movlps %xmm1, 2 * SIZE(%esi, LDC, 2) movhps %xmm1, 3 * SIZE(%esi, LDC, 2) movsd 0 * SIZE(%esi, %eax), %xmm0 movhps 1 * SIZE(%esi, %eax), %xmm0 movsd 2 * SIZE(%esi, %eax), %xmm1 movhps 3 * SIZE(%esi, %eax), %xmm1 pshufd $0x44, %xmm7, %xmm2 unpckhpd %xmm7, %xmm7 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm7 addpd %xmm7, %xmm1 movlps %xmm0, 0 * SIZE(%esi, %eax) movhps %xmm0, 1 * SIZE(%esi, %eax) movlps %xmm1, 2 * SIZE(%esi, %eax) movhps %xmm1, 3 * SIZE(%esi, %eax) addl $4 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L11 ALIGN_4 .L20: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L29 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 leal (LDC, LDC, 2), %eax movsd 0 * SIZE(AA), %xmm0 movsd 4 * SIZE(AA), %xmm1 movsd 0 * SIZE(BB), %xmm2 movsd 8 * SIZE(BB), %xmm3 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 #if defined(OPTERON) || defined(BARCELONA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm5 movsd 4 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm2, %xmm6 movsd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm7 movsd 1 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm4 movsd 10 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm5 movsd 12 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm3 mulsd 14 * SIZE(BB), %xmm0 addsd %xmm3, %xmm6 movsd 24 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movsd 2 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 movsd 18 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm5 movsd 20 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 mulsd 22 * SIZE(BB), %xmm0 addsd %xmm2, %xmm6 movsd 32 * SIZE(BB), %xmm2 addsd %xmm0, %xmm7 movsd 3 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm4 movsd 26 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm5 movsd 28 * SIZE(BB), %xmm3 mulsd %xmm0, %xmm3 mulsd 30 * SIZE(BB), %xmm0 addsd %xmm3, %xmm6 movsd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movsd 8 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 addsd %xmm2, %xmm4 movsd 34 * SIZE(BB), %xmm2 mulsd %xmm1, %xmm2 addsd %xmm2, %xmm5 movsd 36 * SIZE(BB), %xmm2 mulsd %xmm1, %xmm2 mulsd 38 * SIZE(BB), %xmm1 addsd %xmm2, %xmm6 movsd 48 * SIZE(BB), %xmm2 addsd %xmm1, %xmm7 movsd 5 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 addsd %xmm3, %xmm4 movsd 42 * SIZE(BB), %xmm3 mulsd %xmm1, %xmm3 addsd %xmm3, %xmm5 movsd 44 * SIZE(BB), %xmm3 mulsd %xmm1, %xmm3 mulsd 46 * SIZE(BB), %xmm1 addsd %xmm3, %xmm6 movsd 56 * SIZE(BB), %xmm3 addsd %xmm1, %xmm7 movsd 6 * SIZE(AA), %xmm1 mulsd 
%xmm1, %xmm2 addsd %xmm2, %xmm4 movsd 50 * SIZE(BB), %xmm2 mulsd %xmm1, %xmm2 addsd %xmm2, %xmm5 movsd 52 * SIZE(BB), %xmm2 mulsd %xmm1, %xmm2 mulsd 54 * SIZE(BB), %xmm1 addsd %xmm2, %xmm6 movsd 64 * SIZE(BB), %xmm2 addsd %xmm1, %xmm7 movsd 7 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 addsd %xmm3, %xmm4 movsd 58 * SIZE(BB), %xmm3 mulsd %xmm1, %xmm3 addsd %xmm3, %xmm5 movsd 60 * SIZE(BB), %xmm3 mulsd %xmm1, %xmm3 mulsd 62 * SIZE(BB), %xmm1 addsd %xmm3, %xmm6 movsd 72 * SIZE(BB), %xmm3 addl $64 * SIZE, BB addsd %xmm1, %xmm7 movsd 12 * SIZE(AA), %xmm1 addl $8 * SIZE, AA decl %eax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L28 .L26: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 movsd 2 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm5 movsd 4 * SIZE(BB), %xmm2 mulsd %xmm0, %xmm2 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm2, %xmm6 movsd 8 * SIZE(BB), %xmm2 addsd %xmm0, %xmm7 movsd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: leal (LDC, LDC, 2), %eax movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 movsd 0 * SIZE(%esi, LDC), %xmm1 movhps 1 * SIZE(%esi, LDC), %xmm1 unpcklpd %xmm4, %xmm4 unpcklpd %xmm5, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm3, %xmm5 addpd %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) movlps %xmm1, 0 * SIZE(%esi, LDC) movhps %xmm1, 1 * SIZE(%esi, LDC) movsd 0 * SIZE(%esi, LDC, 2), %xmm0 movhps 1 * SIZE(%esi, LDC, 2), %xmm0 movsd 0 * SIZE(%esi, %eax), %xmm1 movhps 1 * SIZE(%esi, %eax), %xmm1 unpcklpd %xmm6, %xmm6 unpcklpd %xmm7, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm3, %xmm7 addpd %xmm7, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC, 2) movhps %xmm0, 1 * SIZE(%esi, LDC, 2) movlps %xmm1, 0 * SIZE(%esi, %eax) movhps %xmm1, 1 * SIZE(%esi, %eax) ALIGN_4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leal (, LDC, 4), %eax addl %eax, C # c += 4 * ldc decl J # j -- jg .L01 ALIGN_4 .L30: testl $2, N je .L60 ALIGN_2 .L31: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ movl K, %eax leal BUFFER, %ecx sarl $2, %eax jle .L35 ALIGN_4 .L32: #ifdef PENTIUM4 #ifdef HAVE_SSE3 movddup 0 * SIZE(%edi), %xmm0 movddup 1 * SIZE(%edi), %xmm1 movddup 2 * SIZE(%edi), %xmm2 movddup 3 * SIZE(%edi), %xmm3 movddup 4 * SIZE(%edi), %xmm4 movddup 5 * SIZE(%edi), %xmm5 movddup 6 * SIZE(%edi), %xmm6 movddup 7 * SIZE(%edi), %xmm7 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) movapd %xmm2, 4 * SIZE(%ecx) movapd %xmm3, 6 * SIZE(%ecx) movapd %xmm4, 8 * SIZE(%ecx) movapd %xmm5, 10 * SIZE(%ecx) movapd %xmm6, 12 * SIZE(%ecx) movapd %xmm7, 14 * SIZE(%ecx) #else movsd 0 * SIZE(%edi), %xmm0 movsd 1 * SIZE(%edi), %xmm1 movsd 2 * SIZE(%edi), %xmm2 movsd 3 * SIZE(%edi), %xmm3 movsd 4 * SIZE(%edi), %xmm4 movsd 5 * SIZE(%edi), %xmm5 movsd 6 * SIZE(%edi), %xmm6 movsd 7 * SIZE(%edi), %xmm7 unpcklpd %xmm0, %xmm0 unpckhpd %xmm1, %xmm1 unpcklpd %xmm2, %xmm2 unpckhpd %xmm3, %xmm3 unpcklpd %xmm4, %xmm4 unpckhpd %xmm5, %xmm5 unpcklpd %xmm6, %xmm6 unpckhpd %xmm7, %xmm7 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) movapd %xmm2, 4 * SIZE(%ecx) movapd %xmm3, 6 * SIZE(%ecx) movapd %xmm4, 8 * SIZE(%ecx) movapd %xmm5, 10 * SIZE(%ecx) movapd %xmm6, 12 * SIZE(%ecx) movapd %xmm7, 14 * SIZE(%ecx) #endif prefetcht0 80 * SIZE(%edi) prefetcht1 112 * SIZE(%ecx) #endif #if defined(OPTERON) || defined(BARCELONA) #define COPYPREFETCH 40 prefetchnta (COPYPREFETCH) * 
SIZE(%edi) movq 0 * SIZE(%edi), %mm0 movq 1 * SIZE(%edi), %mm1 movq 2 * SIZE(%edi), %mm2 movq 3 * SIZE(%edi), %mm3 movq 4 * SIZE(%edi), %mm4 movq 5 * SIZE(%edi), %mm5 movq 6 * SIZE(%edi), %mm6 movq 7 * SIZE(%edi), %mm7 movq %mm0, 0 * SIZE(%ecx) movq %mm0, 1 * SIZE(%ecx) movq %mm1, 2 * SIZE(%ecx) movq %mm1, 3 * SIZE(%ecx) movq %mm2, 4 * SIZE(%ecx) movq %mm2, 5 * SIZE(%ecx) movq %mm3, 6 * SIZE(%ecx) movq %mm3, 7 * SIZE(%ecx) movq %mm4, 8 * SIZE(%ecx) movq %mm4, 9 * SIZE(%ecx) movq %mm5, 10 * SIZE(%ecx) movq %mm5, 11 * SIZE(%ecx) movq %mm6, 12 * SIZE(%ecx) movq %mm6, 13 * SIZE(%ecx) movq %mm7, 14 * SIZE(%ecx) movq %mm7, 15 * SIZE(%ecx) #endif addl $ 8 * SIZE, %edi addl $16 * SIZE, %ecx decl %eax jne .L32 ALIGN_2 .L35: movl K, %eax andl $3, %eax BRANCH jle .L40 ALIGN_2 .L36: #ifdef PENTIUM4 #ifdef HAVE_SSE3 movddup 0 * SIZE(%edi), %xmm0 movddup 1 * SIZE(%edi), %xmm1 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) #else movsd 0 * SIZE(%edi), %xmm0 movsd 1 * SIZE(%edi), %xmm1 unpcklpd %xmm0, %xmm0 unpckhpd %xmm1, %xmm1 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) #endif #endif #if defined(OPTERON) || defined(BARCELONA) movq 0 * SIZE(%edi), %mm0 movq 1 * SIZE(%edi), %mm1 movq %mm0, 0 * SIZE(%ecx) movq %mm0, 1 * SIZE(%ecx) movq %mm1, 2 * SIZE(%ecx) movq %mm1, 3 * SIZE(%ecx) #endif addl $2 * SIZE, %edi addl $4 * SIZE, %ecx decl %eax jne .L36 ALIGN_4 .L40: movl C, %esi # coffset = c movl A, AA # aoffset = a movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L50 ALIGN_4 .L41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movapd 0 * SIZE(AA), %xmm0 movapd 8 * SIZE(AA), %xmm1 movapd 0 * SIZE(BB), %xmm2 movapd 8 * SIZE(BB), %xmm3 #ifdef HAVE_3DNOW prefetchw 2 * SIZE(%esi) prefetchw 2 * SIZE(%esi, LDC) #endif #ifdef PENTIUM4 prefetchnta 4 * SIZE(%esi) prefetchnta 4 * SIZE(%esi, LDC) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L45 ALIGN_4 .L42: mulpd %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 10 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 mulpd 18 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 20 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movapd 10 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 22 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 32 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movapd 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 26 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 28 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 
movapd 14 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 30 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 40 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 movsd 2 * SIZE(%esi), %xmm1 movhps 3 * SIZE(%esi), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) movlps %xmm1, 2 * SIZE(%esi) movhps %xmm1, 3 * SIZE(%esi) movsd 0 * SIZE(%esi, LDC), %xmm0 movhps 1 * SIZE(%esi, LDC), %xmm0 movsd 2 * SIZE(%esi, LDC), %xmm1 movhps 3 * SIZE(%esi, LDC), %xmm1 pshufd $0x44, %xmm5, %xmm2 unpckhpd %xmm5, %xmm5 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm5 addpd %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC) movhps %xmm0, 1 * SIZE(%esi, LDC) movlps %xmm1, 2 * SIZE(%esi, LDC) movhps %xmm1, 3 * SIZE(%esi, LDC) addl $4 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L41 ALIGN_4 .L50: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L59 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 leal (LDC, LDC, 2), %eax movsd 0 * SIZE(AA), %xmm0 movsd 4 * SIZE(AA), %xmm1 movsd 0 * SIZE(BB), %xmm2 movsd 8 * SIZE(BB), %xmm3 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L55 ALIGN_4 .L52: mulsd %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movsd 1 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm2 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm2, %xmm6 movsd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm7 movsd 2 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd 10 * SIZE(BB), %xmm0 addsd %xmm3, %xmm4 movsd 12 * SIZE(BB), %xmm3 addsd %xmm0, %xmm5 movsd 3 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd 14 * SIZE(BB), %xmm0 addsd %xmm3, %xmm6 movsd 24 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movsd 8 * SIZE(AA), %xmm0 mulsd %xmm1, %xmm2 mulsd 18 * SIZE(BB), %xmm1 addsd %xmm2, %xmm4 movsd 20 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 movsd 5 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm2 mulsd 22 * SIZE(BB), %xmm1 addsd %xmm2, %xmm6 movsd 32 * SIZE(BB), %xmm2 addsd %xmm1, %xmm7 movsd 6 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 mulsd 26 * SIZE(BB), %xmm1 addsd %xmm3, %xmm4 movsd 28 * SIZE(BB), %xmm3 addsd %xmm1, %xmm5 movsd 7 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 mulsd 30 * SIZE(BB), %xmm1 addsd %xmm3, %xmm6 movsd 40 * SIZE(BB), %xmm3 addsd %xmm1, %xmm7 movsd 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movl K, 
%eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L58 .L56: mulsd %xmm0, %xmm2 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movsd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: addsd %xmm6, %xmm4 addsd %xmm7, %xmm5 movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 movsd 0 * SIZE(%esi, LDC), %xmm1 movhps 1 * SIZE(%esi, LDC), %xmm1 unpcklpd %xmm4, %xmm4 unpcklpd %xmm5, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm3, %xmm5 addpd %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) movlps %xmm1, 0 * SIZE(%esi, LDC) movhps %xmm1, 1 * SIZE(%esi, LDC) ALIGN_4 .L59: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax addl %eax, C # c += 4 * ldc ALIGN_4 .L60: testl $1, N je .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax leal BUFFER, %ecx sarl $3, %eax jle .L65 ALIGN_4 .L62: #ifdef PENTIUM4 #ifdef HAVE_SSE3 movddup 0 * SIZE(%edi), %xmm0 movddup 1 * SIZE(%edi), %xmm1 movddup 2 * SIZE(%edi), %xmm2 movddup 3 * SIZE(%edi), %xmm3 movddup 4 * SIZE(%edi), %xmm4 movddup 5 * SIZE(%edi), %xmm5 movddup 6 * SIZE(%edi), %xmm6 movddup 7 * SIZE(%edi), %xmm7 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) movapd %xmm2, 4 * SIZE(%ecx) movapd %xmm3, 6 * SIZE(%ecx) movapd %xmm4, 8 * SIZE(%ecx) movapd %xmm5, 10 * SIZE(%ecx) movapd %xmm6, 12 * SIZE(%ecx) movapd %xmm7, 14 * SIZE(%ecx) #else movsd 0 * SIZE(%edi), %xmm0 movsd 1 * SIZE(%edi), %xmm1 movsd 2 * SIZE(%edi), %xmm2 movsd 3 * SIZE(%edi), %xmm3 movsd 4 * SIZE(%edi), %xmm4 movsd 5 * SIZE(%edi), %xmm5 movsd 6 * SIZE(%edi), %xmm6 movsd 7 * SIZE(%edi), %xmm7 unpcklpd %xmm0, %xmm0 unpckhpd %xmm1, %xmm1 unpcklpd %xmm2, %xmm2 unpckhpd %xmm3, %xmm3 unpcklpd %xmm4, %xmm4 unpckhpd %xmm5, %xmm5 unpcklpd %xmm6, %xmm6 unpckhpd %xmm7, %xmm7 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) movapd %xmm2, 4 * SIZE(%ecx) movapd %xmm3, 6 * SIZE(%ecx) movapd %xmm4, 8 * SIZE(%ecx) movapd %xmm5, 10 * SIZE(%ecx) movapd %xmm6, 12 * SIZE(%ecx) movapd %xmm7, 14 * SIZE(%ecx) #endif prefetcht1 80 * SIZE(%edi) prefetcht0 112 * SIZE(%ecx) #endif #if defined(OPTERON) || defined(BARCELONA) #define COPYPREFETCH 40 prefetchnta (COPYPREFETCH) * SIZE(%edi) movq 0 * SIZE(%edi), %mm0 movq 1 * SIZE(%edi), %mm1 movq 2 * SIZE(%edi), %mm2 movq 3 * SIZE(%edi), %mm3 movq 4 * SIZE(%edi), %mm4 movq 5 * SIZE(%edi), %mm5 movq 6 * SIZE(%edi), %mm6 movq 7 * SIZE(%edi), %mm7 movq %mm0, 0 * SIZE(%ecx) movq %mm0, 1 * SIZE(%ecx) movq %mm1, 2 * SIZE(%ecx) movq %mm1, 3 * SIZE(%ecx) movq %mm2, 4 * SIZE(%ecx) movq %mm2, 5 * SIZE(%ecx) movq %mm3, 6 * SIZE(%ecx) movq %mm3, 7 * SIZE(%ecx) movq %mm4, 8 * SIZE(%ecx) movq %mm4, 9 * SIZE(%ecx) movq %mm5, 10 * SIZE(%ecx) movq %mm5, 11 * SIZE(%ecx) movq %mm6, 12 * SIZE(%ecx) movq %mm6, 13 * SIZE(%ecx) movq %mm7, 14 * SIZE(%ecx) movq %mm7, 15 * SIZE(%ecx) #endif addl $ 8 * SIZE, %edi addl $16 * SIZE, %ecx decl %eax jne .L62 ALIGN_2 .L65: movl K, %eax andl $7, %eax BRANCH jle .L70 ALIGN_2 .L66: #ifdef PENTIUM4 #ifdef HAVE_SSE3 movddup 0 * SIZE(%edi), %xmm0 movapd %xmm0, 0 * SIZE(%ecx) #else movsd 0 * SIZE(%edi), %xmm0 unpcklpd %xmm0, %xmm0 movapd %xmm0, 0 * SIZE(%ecx) #endif #endif #if defined(OPTERON) || defined(BARCELONA) movq 0 * SIZE(%edi), %mm0 movq %mm0, 0 * SIZE(%ecx) movq %mm0, 1 * SIZE(%ecx) #endif addl $1 * SIZE, %edi addl $2 * SIZE, %ecx decl %eax jne .L66 ALIGN_4 .L70: movl C, %esi # coffset = c movl A, AA # 
aoffset = a movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L80 ALIGN_4 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movapd 0 * SIZE(AA), %xmm0 movapd 8 * SIZE(AA), %xmm1 movapd 0 * SIZE(BB), %xmm2 movapd 8 * SIZE(BB), %xmm3 #ifdef HAVE_3DNOW prefetchw 2 * SIZE(%esi) #endif #ifdef PENTIUM4 prefetchnta 2 * SIZE(%esi) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L75 ALIGN_4 .L72: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movapd 16 * SIZE(BB), %xmm2 movapd 2 * SIZE(AA), %xmm0 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm0, %xmm4 movapd 4 * SIZE(AA), %xmm0 mulpd 4 * SIZE(BB), %xmm0 addpd %xmm0, %xmm4 movapd 6 * SIZE(AA), %xmm0 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm0, %xmm4 movapd 16 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movapd 24 * SIZE(BB), %xmm3 movapd 10 * SIZE(AA), %xmm1 mulpd 10 * SIZE(BB), %xmm1 addpd %xmm1, %xmm4 movapd 12 * SIZE(AA), %xmm1 mulpd 12 * SIZE(BB), %xmm1 addpd %xmm1, %xmm4 movapd 14 * SIZE(AA), %xmm1 mulpd 14 * SIZE(BB), %xmm1 addpd %xmm1, %xmm4 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd 2 * SIZE(AA), %xmm0 movapd 2 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 movsd 2 * SIZE(%esi), %xmm1 movhps 3 * SIZE(%esi), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) movlps %xmm1, 2 * SIZE(%esi) movhps %xmm1, 3 * SIZE(%esi) addl $4 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L71 ALIGN_4 .L80: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 leal (LDC, LDC, 2), %eax movsd 0 * SIZE(AA), %xmm0 movsd 4 * SIZE(AA), %xmm1 movsd 0 * SIZE(BB), %xmm2 movsd 8 * SIZE(BB), %xmm3 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L85 ALIGN_4 .L82: mulsd %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 1 * SIZE(AA), %xmm0 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movsd 16 * 
SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movsd 2 * SIZE(AA), %xmm0 mulsd 4 * SIZE(BB), %xmm0 addsd %xmm0, %xmm6 movsd 3 * SIZE(AA), %xmm0 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm0, %xmm7 movsd 8 * SIZE(AA), %xmm0 mulsd %xmm1, %xmm3 movsd 5 * SIZE(AA), %xmm1 mulsd 10 * SIZE(BB), %xmm1 addsd %xmm3, %xmm4 movsd 24 * SIZE(BB), %xmm3 addsd %xmm1, %xmm5 movsd 6 * SIZE(AA), %xmm1 mulsd 12 * SIZE(BB), %xmm1 addsd %xmm1, %xmm6 movsd 7 * SIZE(AA), %xmm1 mulsd 14 * SIZE(BB), %xmm1 addsd %xmm1, %xmm7 movsd 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L82 ALIGN_4 .L85: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L88 .L86: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 movsd 2 * SIZE(BB), %xmm2 movsd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L86 ALIGN_4 .L88: addsd %xmm5, %xmm4 addsd %xmm7, %xmm6 addsd %xmm6, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) ALIGN_4 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm3m_kernel_2x4_penryn.S000066400000000000000000000610631313527062700222120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define ARG_B 36 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define ARG_LDC 44 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define C1 %esi #define I %ebx #ifdef NANO #define PREFETCHSIZE (8 * 3 + 4) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 #endif #ifndef PREFETCH #define PREFETCH prefetcht0 #endif #ifndef PREFETCHW #define PREFETCHW prefetcht2 #endif #ifndef PREFETCHB #define PREFETCHB prefetcht2 #endif #ifndef PREFETCHSIZE #define PREFETCHSIZE (8 * 21 + 4) #endif PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC #ifdef TRMMKERNEL movl OFFSET, %eax #ifndef LEFT negl %eax #endif movl %eax, KK #endif subl $-16 * SIZE, A subl $-16 * SIZE, B sall $ZBASE_SHIFT, LDC movl N, %eax sarl $2, %eax movl %eax, J jle .L30 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sall $BASE_SHIFT + 2, %eax leal (B, %eax), %eax movl %eax, BX movl C, C1 movl A, AA movl M, I sarl $1, I jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif movl BX, %eax PREFETCHB -16 * SIZE(%eax) subl $-8 * SIZE, BX leal (C1, LDC, 2), %eax movaps -16 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -16 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 PREFETCHW 1 * SIZE(C1) pxor %xmm5, %xmm5 PREFETCHW 1 * SIZE(C1, LDC) pxor %xmm6, %xmm6 PREFETCHW 1 * SIZE(%eax) pxor %xmm7, %xmm7 PREFETCHW 1 * SIZE(%eax, LDC) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addpd %xmm3, %xmm7 movaps -14 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps -12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps -10 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps -8 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps -6 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps -4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps -2 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 0 * SIZE(BB), %xmm1 addpd 
%xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) addpd %xmm3, %xmm7 movaps 2 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps 6 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 8 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps 10 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movaps 14 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps 16 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 subl $-32 * SIZE, BB mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 subl $-16 * SIZE, AA subl $1, %eax BRANCH jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L18 ALIGN_4 .L16: addpd %xmm3, %xmm7 movaps -14 * SIZE(BB), %xmm3 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm1, %xmm5 movaps -12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: addpd %xmm2, %xmm6 addpd %xmm3, %xmm7 movups ALPHA, %xmm3 movaps %xmm4, %xmm0 movsd %xmm5, %xmm4 movsd %xmm0, %xmm5 movaps %xmm6, %xmm0 movsd %xmm7, %xmm6 movsd %xmm0, %xmm7 leal (C1, LDC, 2), %eax movsd 0 * SIZE(C1), %xmm0 movhps 1 * SIZE(C1), %xmm0 movsd 2 * SIZE(C1), %xmm1 movhps 3 * SIZE(C1), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(C1) movhps %xmm0, 1 * SIZE(C1) movlps %xmm1, 2 * SIZE(C1) movhps %xmm1, 3 * SIZE(C1) movsd 0 * SIZE(C1, LDC), %xmm0 movhps 1 * SIZE(C1, LDC), %xmm0 movsd 2 * SIZE(C1, LDC), %xmm1 movhps 3 * SIZE(C1, LDC), %xmm1 pshufd $0x44, %xmm5, %xmm2 unpckhpd %xmm5, %xmm5 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm5 addpd %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(C1, LDC) movhps %xmm0, 1 * SIZE(C1, LDC) movlps %xmm1, 2 * SIZE(C1, LDC) movhps %xmm1, 3 * SIZE(C1, LDC) movsd 0 * SIZE(%eax), %xmm0 movhps 1 * SIZE(%eax), %xmm0 movsd 2 * SIZE(%eax), %xmm1 movhps 3 * SIZE(%eax), %xmm1 pshufd $0x44, %xmm6, %xmm2 unpckhpd %xmm6, %xmm6 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm1 movlps %xmm0, 0 * SIZE(%eax) movhps %xmm0, 1 * SIZE(%eax) movlps %xmm1, 2 * SIZE(%eax) movhps %xmm1, 3 * SIZE(%eax) movsd 0 * SIZE(%eax, LDC), %xmm0 movhps 1 * SIZE(%eax, LDC), %xmm0 movsd 2 * SIZE(%eax, LDC), %xmm1 movhps 3 * SIZE(%eax, LDC), %xmm1 pshufd $0x44, %xmm7, %xmm2 unpckhpd %xmm7, %xmm7 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm7 addpd %xmm7, %xmm1 movlps %xmm0, 0 * SIZE(%eax, LDC) movhps %xmm0, 1 * SIZE(%eax, LDC) movlps %xmm1, 2 * SIZE(%eax, LDC) movhps %xmm1, 3 * SIZE(%eax, LDC) addl $4 * SIZE, C1 decl I jg .L11 
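# Note (annotation, not in the original source): .L20 below is the M-odd tail of the four-column loop — the single remaining row of A is multiplied against the current panel of four B columns to form a 1x4 strip of C, which .L28 scales by ALPHA and accumulates into C1, C1+LDC and the next two column pointers.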
ALIGN_4 .L20: movl M, I testl $1, I jle .L29 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax addl %eax, AA leal (BB, %eax, 4), BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm2 pxor %xmm5, %xmm5 movaps -14 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps -12 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps -10 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -14 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps -8 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps -6 * SIZE(BB), %xmm3 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps -4 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps -2 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -12 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps 0 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps 2 * SIZE(BB), %xmm3 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps 6 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -10 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps 8 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps 10 * SIZE(BB), %xmm3 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps 12 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps 14 * SIZE(BB), %xmm3 pshufd $0xee, %xmm0, %xmm1 movaps -8 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addpd %xmm3, %xmm7 movaps 18 * SIZE(BB), %xmm3 subl $ -8 * SIZE, AA subl $-32 * SIZE, BB subl $1, %eax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L28 ALIGN_4 .L26: pshufd $0x44, %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm4 movaps -12 * SIZE(BB), %xmm2 addpd %xmm3, %xmm5 movaps -10 * SIZE(BB), %xmm3 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: movups ALPHA, %xmm3 addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 leal (C1, LDC, 2), %eax movsd 0 * SIZE(C1), %xmm0 movhps 1 * SIZE(C1), %xmm0 movsd 0 * SIZE(C1, LDC), %xmm1 movhps 1 * SIZE(C1, LDC), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(C1) movhps %xmm0, 1 * SIZE(C1) movlps %xmm1, 0 * SIZE(C1, LDC) movhps %xmm1, 1 * SIZE(C1, LDC) movsd 0 * SIZE(%eax), %xmm0 movhps 1 * SIZE(%eax), %xmm0 movsd 0 * SIZE(%eax, LDC), %xmm1 movhps 1 * SIZE(%eax, LDC), %xmm1 pshufd $0x44, %xmm5, %xmm2 unpckhpd %xmm5, %xmm5 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm5 addpd %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%eax) movhps %xmm0, 1 * SIZE(%eax) movlps %xmm1, 0 * SIZE(%eax, LDC) movhps %xmm1, 1 * SIZE(%eax, LDC) ALIGN_4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif movl BB, B leal (, LDC, 4), %eax addl %eax, 
C decl J jg .L01 ALIGN_4 .L30: movl N, %eax testl $2, %eax jle .L50 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, C1 movl A, AA movl M, I sarl $1, I jle .L40 ALIGN_4 .L31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 PREFETCHW 1 * SIZE(C1) pxor %xmm6, %xmm6 PREFETCHW 1 * SIZE(C1, LDC) pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -14 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps -12 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -10 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps -8 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -6 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps -4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -2 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movaps 0 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 subl $-16 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L38 ALIGN_4 .L36: pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movaps -14 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: movups ALPHA, %xmm3 addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 movaps %xmm4, %xmm0 movsd %xmm5, %xmm4 movsd %xmm0, %xmm5 movsd 0 * SIZE(C1), %xmm0 movhps 1 * SIZE(C1), %xmm0 movsd 2 * SIZE(C1), %xmm1 movhps 3 * SIZE(C1), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(C1) movhps %xmm0, 1 * SIZE(C1) movlps %xmm1, 2 * SIZE(C1) movhps %xmm1, 3 * SIZE(C1) movsd 0 * SIZE(C1, LDC), %xmm0 movhps 1 * SIZE(C1, LDC), %xmm0 movsd 2 * SIZE(C1, LDC), %xmm1 movhps 3 * SIZE(C1, LDC), %xmm1 pshufd $0x44, %xmm5, %xmm2 unpckhpd %xmm5, %xmm5 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm5 addpd %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(C1, LDC) movhps %xmm0, 1 * SIZE(C1, LDC) movlps %xmm1, 2 * SIZE(C1, LDC) movhps %xmm1, 3 * SIZE(C1, 
LDC) addl $4 * SIZE, C1 decl I jg .L31 ALIGN_4 .L40: movl M, I testl $1, I jle .L49 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax addl %eax, AA leal (BB, %eax, 2), BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm2 pxor %xmm5, %xmm5 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -14 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -14 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps -12 * SIZE(BB), %xmm2 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -10 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -12 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps -8 * SIZE(BB), %xmm2 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -6 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -10 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps -4 * SIZE(BB), %xmm2 pshufd $0x44, %xmm0, %xmm1 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -2 * SIZE(BB), %xmm2 pshufd $0xee, %xmm0, %xmm1 movaps -8 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movaps 0 * SIZE(BB), %xmm2 subl $ -8 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L48 ALIGN_4 .L46: pshufd $0x44, %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movaps -14 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: movups ALPHA, %xmm3 addpd %xmm5, %xmm4 movsd 0 * SIZE(C1), %xmm0 movhps 1 * SIZE(C1), %xmm0 movsd 0 * SIZE(C1, LDC), %xmm1 movhps 1 * SIZE(C1, LDC), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(C1) movhps %xmm0, 1 * SIZE(C1) movlps %xmm1, 0 * SIZE(C1, LDC) movhps %xmm1, 1 * SIZE(C1, LDC) ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif movl BB, B leal (, LDC, 2), %eax addl %eax, C ALIGN_4 .L50: movl N, %eax testl $1, %eax jle .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, C1 movl A, AA movl M, I sarl $1, I jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA addl %eax, BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 PREFETCHW 1 * SIZE(C1) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L55 ALIGN_4 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps 
-14 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -12 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -10 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 pshufd $0x44, %xmm1, %xmm2 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 pshufd $0xee, %xmm1, %xmm2 movaps -8 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 subl $-16 * SIZE, AA subl $ -8 * SIZE, BB subl $1, %eax jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L58 ALIGN_4 .L56: pshufd $0x44, %xmm1, %xmm2 movsd -15 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 addl $2 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: movups ALPHA, %xmm3 addpd %xmm5, %xmm4 movsd 0 * SIZE(C1), %xmm0 movhps 1 * SIZE(C1), %xmm0 movsd 2 * SIZE(C1), %xmm1 movhps 3 * SIZE(C1), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(C1) movhps %xmm0, 1 * SIZE(C1) movlps %xmm1, 2 * SIZE(C1) movhps %xmm1, 3 * SIZE(C1) addl $4 * SIZE, C1 decl I jg .L51 ALIGN_4 .L60: movl M, I testl $1, I jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax addl %eax, AA addl %eax, BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -16 * SIZE(BB), %xmm2 pxor %xmm5, %xmm5 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L65 ALIGN_4 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 movaps -14 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movaps -12 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 movaps -10 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movaps -8 * SIZE(BB), %xmm2 subl $-8 * SIZE, AA subl $-8 * SIZE, BB subl $1, %eax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L68 ALIGN_4 .L66: mulsd %xmm0, %xmm2 movsd -15 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd -15 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L66 ALIGN_4 .L68: movups ALPHA, %xmm3 addpd %xmm5, %xmm4 haddpd %xmm4, %xmm4 movsd 0 * SIZE(C1), %xmm0 movhps 1 * SIZE(C1), %xmm0 pshufd $0x44, %xmm4, %xmm2 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 movlps %xmm0, 0 * SIZE(C1) movhps %xmm0, 1 * SIZE(C1) ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm3m_kernel_2x4_prescott.S000066400000000000000000001033351313527062700225410ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 
The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define ARG_B 36 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define ARG_LDC 44 + STACK + ARGS(%esp) #define OFFSET 48 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #if defined(PENTIUM4) || defined(PENTIUMM) #define PREFETCH_R (8 * 4) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define KERNEL1(address) \ mulpd %xmm0, %xmm2; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ addpd %xmm2, %xmm4; \ movddup 1 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm5; \ movddup 2 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm6; \ movddup 3 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ addpd %xmm2, %xmm7; \ movddup 4 * SIZE + (address) * 2 * SIZE(BB), %xmm2 #define KERNEL2(address) \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm4; \ movddup 5 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm5; \ movddup 6 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm6; \ movddup 7 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ addpd %xmm2, %xmm7; \ movddup 16 * SIZE + (address) * 2 * SIZE(BB), %xmm2 #define KERNEL3(address) \ mulpd %xmm0, %xmm3; \ 
addpd %xmm3, %xmm4; \ movddup 9 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm5; \ movddup 10 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm6; \ movddup 11 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ addpd %xmm3, %xmm7; \ movddup 12 * SIZE + (address) * 2 * SIZE(BB), %xmm3 #define KERNEL4(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movddup 13 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm5; \ movddup 14 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm6; \ movddup 15 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ addpd %xmm3, %xmm7; \ movddup 24 * SIZE + (address) * 2 * SIZE(BB), %xmm3 #define KERNEL5(address) \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movddup 17 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm5; \ movddup 18 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm6; \ movddup 19 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ addpd %xmm2, %xmm7 #define KERNEL6(address) \ movddup 20 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movddup 21 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm5; \ movddup 22 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm6; \ movddup 23 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ addpd %xmm2, %xmm7; \ movddup 32 * SIZE + (address) * 2 * SIZE(BB), %xmm2 #define KERNEL7(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movddup 25 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm5; \ movddup 26 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm6; \ movddup 27 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ addpd %xmm3, %xmm7; \ movddup 28 * SIZE + (address) * 2 * SIZE(BB), %xmm3 #define KERNEL8(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movddup 29 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm5; \ movddup 30 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm6; \ movddup 31 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ addpd %xmm3, %xmm7; \ movddup 40 * SIZE + (address) * 2 * SIZE(BB), %xmm3 PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC #ifdef TRMMKERNEL movl OFFSET, %eax #ifndef LEFT negl %eax #endif movl %eax, KK #endif sall $ZBASE_SHIFT, LDC movl N, %eax sarl $2, %eax movl %eax, J jle .L30 ALIGN_2 .L10: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sall $BASE_SHIFT + 2, %eax leal (B, %eax), %eax movl %eax, BX movl C, %esi # coffset = c movl A, AA # aoffset = a movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), 
%eax leal (AA, %eax, 2), AA leal (B, %eax, 4), BB #endif movl BX, %eax prefetcht2 0 * SIZE(%eax) subl $-4 * SIZE, BX movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movddup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movddup 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 leal (LDC, LDC, 2), %eax #ifdef PENTIUM4 prefetchnta 3 * SIZE(%esi) prefetchnta 3 * SIZE(%esi, LDC, 1) prefetchnta 3 * SIZE(%esi, LDC, 2) prefetchnta 3 * SIZE(%esi, %eax, 1) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $4, %eax #endif movl %eax, KKK #endif #ifdef CORE_PRESCOTT andl $-8, %eax sall $4, %eax je .L15 .L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) cmpl $128 * 1, %eax jle .L12 KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) cmpl $128 * 2, %eax jle .L12 KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) cmpl $128 * 3, %eax jle .L12 KERNEL1(16 * 3) KERNEL2(16 * 3) KERNEL3(16 * 3) KERNEL4(16 * 3) KERNEL5(16 * 3) KERNEL6(16 * 3) KERNEL7(16 * 3) KERNEL8(16 * 3) cmpl $128 * 4, %eax jle .L12 KERNEL1(16 * 4) KERNEL2(16 * 4) KERNEL3(16 * 4) KERNEL4(16 * 4) KERNEL5(16 * 4) KERNEL6(16 * 4) KERNEL7(16 * 4) KERNEL8(16 * 4) cmpl $128 * 5, %eax jle .L12 KERNEL1(16 * 5) KERNEL2(16 * 5) KERNEL3(16 * 5) KERNEL4(16 * 5) KERNEL5(16 * 5) KERNEL6(16 * 5) KERNEL7(16 * 5) KERNEL8(16 * 5) cmpl $128 * 6, %eax jle .L12 KERNEL1(16 * 6) KERNEL2(16 * 6) KERNEL3(16 * 6) KERNEL4(16 * 6) KERNEL5(16 * 6) KERNEL6(16 * 6) KERNEL7(16 * 6) KERNEL8(16 * 6) cmpl $128 * 7, %eax jle .L12 KERNEL1(16 * 7) KERNEL2(16 * 7) KERNEL3(16 * 7) KERNEL4(16 * 7) KERNEL5(16 * 7) KERNEL6(16 * 7) KERNEL7(16 * 7) KERNEL8(16 * 7) #if 1 cmpl $128 * 8, %eax jle .L12 KERNEL1(16 * 8) KERNEL2(16 * 8) KERNEL3(16 * 8) KERNEL4(16 * 8) KERNEL5(16 * 8) KERNEL6(16 * 8) KERNEL7(16 * 8) KERNEL8(16 * 8) cmpl $128 * 9, %eax jle .L12 KERNEL1(16 * 9) KERNEL2(16 * 9) KERNEL3(16 * 9) KERNEL4(16 * 9) KERNEL5(16 * 9) KERNEL6(16 * 9) KERNEL7(16 * 9) KERNEL8(16 * 9) cmpl $128 * 10, %eax jle .L12 KERNEL1(16 * 10) KERNEL2(16 * 10) KERNEL3(16 * 10) KERNEL4(16 * 10) KERNEL5(16 * 10) KERNEL6(16 * 10) KERNEL7(16 * 10) KERNEL8(16 * 10) cmpl $128 * 11, %eax jle .L12 KERNEL1(16 * 11) KERNEL2(16 * 11) KERNEL3(16 * 11) KERNEL4(16 * 11) KERNEL5(16 * 11) KERNEL6(16 * 11) KERNEL7(16 * 11) KERNEL8(16 * 11) cmpl $128 * 12, %eax jle .L12 KERNEL1(16 * 12) KERNEL2(16 * 12) KERNEL3(16 * 12) KERNEL4(16 * 12) KERNEL5(16 * 12) KERNEL6(16 * 12) KERNEL7(16 * 12) KERNEL8(16 * 12) cmpl $128 * 13, %eax jle .L12 KERNEL1(16 * 13) KERNEL2(16 * 13) KERNEL3(16 * 13) KERNEL4(16 * 13) KERNEL5(16 * 13) KERNEL6(16 * 13) KERNEL7(16 * 13) KERNEL8(16 * 13) cmpl $128 * 14, %eax jle .L12 KERNEL1(16 * 14) KERNEL2(16 * 14) KERNEL3(16 * 14) KERNEL4(16 * 14) KERNEL5(16 * 14) KERNEL6(16 * 14) KERNEL7(16 * 14) KERNEL8(16 * 14) cmpl $128 * 15, %eax jle .L12 KERNEL1(16 * 15) KERNEL2(16 * 15) KERNEL3(16 * 15) KERNEL4(16 * 15) KERNEL5(16 * 15) KERNEL6(16 * 15) KERNEL7(16 * 15) KERNEL8(16 * 15) #else addl $32 * 4 * SIZE, AA addl $32 * 8 * SIZE, BB subl $128 * 8, %eax jg .L1X #endif .L12: leal (AA, %eax, 1), AA # * 16 leal (BB, %eax, 2), BB # * 64 #else sarl $3, %eax je .L15 ALIGN_4 .L12: mulpd %xmm0, %xmm2 
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 5 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movddup 6 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 7 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 16 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm4 movddup 9 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm5 movddup 10 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm6 movddup 11 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 movapd 6 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movddup 12 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm4 movddup 13 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm5 movddup 14 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm6 movddup 15 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 movapd 16 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movddup 24 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movddup 17 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movddup 18 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm6 movddup 19 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 movapd 10 * SIZE(AA), %xmm1 addpd %xmm2, %xmm7 movddup 20 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm4 movddup 21 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm5 movddup 22 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 addpd %xmm2, %xmm6 movddup 23 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm2 movapd 12 * SIZE(AA), %xmm1 addpd %xmm2, %xmm7 movddup 32 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 25 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movddup 26 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 27 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 14 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movddup 28 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 29 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movddup 30 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 31 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 24 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movddup 40 * SIZE(BB), %xmm3 addl $32 * SIZE, BB addl $16 * SIZE, AA decl %eax jne .L12 ALIGN_4 #endif .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movsd 0 + ALPHA, %xmm3 movhps 8 + ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: leal (LDC, LDC, 2), %eax movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 movsd 2 * SIZE(%esi), %xmm1 movhps 3 * SIZE(%esi), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) movlps %xmm1, 2 * SIZE(%esi) movhps %xmm1, 3 * SIZE(%esi) movsd 0 * SIZE(%esi, LDC), %xmm0 movhps 1 * SIZE(%esi, LDC), %xmm0 movsd 2 * SIZE(%esi, LDC), %xmm1 movhps 3 * SIZE(%esi, LDC), %xmm1 pshufd $0x44, %xmm5, %xmm2 unpckhpd %xmm5, %xmm5 mulpd 
%xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm5 addpd %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC) movhps %xmm0, 1 * SIZE(%esi, LDC) movlps %xmm1, 2 * SIZE(%esi, LDC) movhps %xmm1, 3 * SIZE(%esi, LDC) movsd 0 * SIZE(%esi, LDC, 2), %xmm0 movhps 1 * SIZE(%esi, LDC, 2), %xmm0 movsd 2 * SIZE(%esi, LDC, 2), %xmm1 movhps 3 * SIZE(%esi, LDC, 2), %xmm1 pshufd $0x44, %xmm6, %xmm2 unpckhpd %xmm6, %xmm6 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC, 2) movhps %xmm0, 1 * SIZE(%esi, LDC, 2) movlps %xmm1, 2 * SIZE(%esi, LDC, 2) movhps %xmm1, 3 * SIZE(%esi, LDC, 2) movsd 0 * SIZE(%esi, %eax), %xmm0 movhps 1 * SIZE(%esi, %eax), %xmm0 movsd 2 * SIZE(%esi, %eax), %xmm1 movhps 3 * SIZE(%esi, %eax), %xmm1 pshufd $0x44, %xmm7, %xmm2 unpckhpd %xmm7, %xmm7 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm7 addpd %xmm7, %xmm1 movlps %xmm0, 0 * SIZE(%esi, %eax) movhps %xmm0, 1 * SIZE(%esi, %eax) movlps %xmm1, 2 * SIZE(%esi, %eax) movhps %xmm1, 3 * SIZE(%esi, %eax) addl $4 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L11 ALIGN_3 .L20: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L29 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 4), BB #endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movddup 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movapd 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movapd 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $4, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movddup 1 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movddup 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 10 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movddup 3 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movddup 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 18 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 20 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movddup 5 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 22 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 32 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movddup 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 26 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 28 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movddup 7 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 30 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 40 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movddup 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd 34 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 36 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movddup 9 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 38 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 48 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movddup 10 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 42 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 44 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movddup 11 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 46 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 56 * SIZE(BB), 
%xmm3 addpd %xmm1, %xmm7 movddup 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 50 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 52 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movddup 13 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 54 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 64 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movddup 14 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 58 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 60 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movddup 15 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 62 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 72 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movddup 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movsd 0 + ALPHA, %xmm3 movhps 8 + ALPHA, %xmm3 andl $15, %eax # if (k & 1) BRANCH je .L28 .L26: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movddup 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: leal (%esi, LDC, 1), %eax addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 leal (LDC, LDC, 2), %eax movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 movsd 0 * SIZE(%esi, LDC), %xmm1 movhps 1 * SIZE(%esi, LDC), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) movlps %xmm1, 0 * SIZE(%esi, LDC) movhps %xmm1, 1 * SIZE(%esi, LDC) movsd 0 * SIZE(%esi, LDC, 2), %xmm0 movhps 1 * SIZE(%esi, LDC, 2), %xmm0 movsd 0 * SIZE(%esi, %eax), %xmm1 movhps 1 * SIZE(%esi, %eax), %xmm1 pshufd $0x44, %xmm5, %xmm2 unpckhpd %xmm5, %xmm5 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm5 addpd %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC, 2) movhps %xmm0, 1 * SIZE(%esi, LDC, 2) movlps %xmm1, 0 * SIZE(%esi, %eax) movhps %xmm1, 1 * SIZE(%esi, %eax) ALIGN_4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leal (, LDC, 4), %eax movl BB, B addl %eax, C # c += 4 * ldc decl J # j -- jg .L10 ALIGN_4 .L30: testl $2, N je .L60 movl C, %esi # coffset = c movl A, AA # aoffset = a #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L50 ALIGN_4 .L41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movddup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movddup 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifdef HAVE_3DNOW prefetchw 2 * SIZE(%esi) prefetchw 2 * SIZE(%esi, LDC) #endif #ifdef PENTIUM4 prefetchnta 3 * SIZE(%esi) prefetchnta 3 * SIZE(%esi, LDC) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 4 * 
SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 5 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 6 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movddup 6 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 7 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 16 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movddup 16 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 9 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 10 * SIZE(AA), %xmm1 addpd %xmm3, %xmm5 movddup 10 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 11 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 12 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movddup 12 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 13 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 14 * SIZE(AA), %xmm1 addpd %xmm3, %xmm5 movddup 14 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 15 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 24 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movddup 24 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movsd 0 + ALPHA, %xmm3 movhps 8 + ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 movsd 2 * SIZE(%esi), %xmm1 movhps 3 * SIZE(%esi), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) movlps %xmm1, 2 * SIZE(%esi) movhps %xmm1, 3 * SIZE(%esi) movsd 0 * SIZE(%esi, LDC), %xmm0 movhps 1 * SIZE(%esi, LDC), %xmm0 movsd 2 * SIZE(%esi, LDC), %xmm1 movhps 3 * SIZE(%esi, LDC), %xmm1 pshufd $0x44, %xmm5, %xmm2 unpckhpd %xmm5, %xmm5 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm5 addpd %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC) movhps %xmm0, 1 * SIZE(%esi, LDC) movlps %xmm1, 2 * SIZE(%esi, LDC) movhps %xmm1, 3 * SIZE(%esi, LDC) addl $4 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L41 ALIGN_4 .L50: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L59 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), BB #endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movddup 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movapd 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movapd 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $4, %eax je .L55 ALIGN_4 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 movddup 1 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 mulpd 2 * SIZE(BB), %xmm0 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movddup 2 * SIZE(AA), %xmm0 mulpd 4 * SIZE(BB), %xmm0 addpd %xmm0, %xmm6 movddup 3 * SIZE(AA), %xmm0 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm0, %xmm7 movddup 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 movddup 5 * SIZE(AA), %xmm0 addpd %xmm3, %xmm4 mulpd 10 * SIZE(BB), %xmm0 movapd 24 * 
SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movddup 6 * SIZE(AA), %xmm0 mulpd 12 * SIZE(BB), %xmm0 addpd %xmm0, %xmm6 movddup 7 * SIZE(AA), %xmm0 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm0, %xmm7 movddup 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 movddup 9 * SIZE(AA), %xmm1 addpd %xmm2, %xmm4 mulpd 18 * SIZE(BB), %xmm1 movapd 32 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movddup 10 * SIZE(AA), %xmm1 mulpd 20 * SIZE(BB), %xmm1 addpd %xmm1, %xmm6 movddup 11 * SIZE(AA), %xmm1 mulpd 22 * SIZE(BB), %xmm1 addpd %xmm1, %xmm7 movddup 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 movddup 13 * SIZE(AA), %xmm1 addpd %xmm3, %xmm4 mulpd 26 * SIZE(BB), %xmm1 movapd 40 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movddup 14 * SIZE(AA), %xmm1 mulpd 28 * SIZE(BB), %xmm1 addpd %xmm1, %xmm6 movddup 15 * SIZE(AA), %xmm1 mulpd 30 * SIZE(BB), %xmm1 addpd %xmm1, %xmm7 movddup 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movsd 0 + ALPHA, %xmm3 movhps 8 + ALPHA, %xmm3 andl $15, %eax # if (k & 1) BRANCH je .L58 .L56: mulpd %xmm0, %xmm2 movddup 1 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 movapd 2 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 addpd %xmm6, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 movsd 0 * SIZE(%esi, LDC), %xmm1 movhps 1 * SIZE(%esi, LDC), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) movlps %xmm1, 0 * SIZE(%esi, LDC) movhps %xmm1, 1 * SIZE(%esi, LDC) ALIGN_4 .L59: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax movl BB, B addl %eax, C # c += 4 * ldc ALIGN_4 .L60: testl $1, N je .L999 movl C, %esi # coffset = c movl A, AA # aoffset = a #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L80 ALIGN_4 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movddup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movddup 4 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifdef PENTIUM4 prefetchnta 3 * SIZE(%esi) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm2, %xmm0 movddup 1 * SIZE(BB), %xmm2 addpd %xmm0, %xmm4 movapd 16 * SIZE(AA), %xmm0 mulpd 2 * SIZE(AA), %xmm2 addpd %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd 4 * SIZE(AA), %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd 6 * SIZE(AA), %xmm2 addpd %xmm2, %xmm7 movddup 8 * SIZE(BB), %xmm2 mulpd %xmm3, %xmm1 movddup 5 * SIZE(BB), %xmm3 addpd %xmm1, %xmm4 movapd 24 * SIZE(AA), %xmm1 mulpd 10 * SIZE(AA), %xmm3 addpd %xmm3, %xmm5 movddup 6 * SIZE(BB), %xmm3 mulpd 12 * SIZE(AA), %xmm3 addpd %xmm3, %xmm6 movddup 7 * SIZE(BB), %xmm3 mulpd 14 * SIZE(AA), %xmm3 addpd %xmm3, %xmm7 movddup 12 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $ 8 * SIZE, BB decl %eax jne .L72 
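# Note (annotation, not in the original source): .L75/.L76 below drain the K remainder (k & 7) of this 2x1 block one k-step at a time (AA advances by 2*SIZE, BB by 1*SIZE per iteration); .L78 then scales the accumulators by ALPHA and adds the result into the C column at %esi.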
ALIGN_4 .L75: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movsd 0 + ALPHA, %xmm3 movhps 8 + ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: mulpd %xmm2, %xmm0 movddup 1 * SIZE(BB), %xmm2 addpd %xmm0, %xmm4 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 addpd %xmm6, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 movsd 2 * SIZE(%esi), %xmm1 movhps 3 * SIZE(%esi), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) movlps %xmm1, 2 * SIZE(%esi) movhps %xmm1, 3 * SIZE(%esi) addl $4 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L71 ALIGN_4 .L80: movl M, %ebx testl $1, %ebx # i = (m >> 2) jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movapd 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movapd 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $4, %eax je .L85 ALIGN_4 .L82: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 mulpd 2 * SIZE(BB), %xmm0 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 4 * SIZE(AA), %xmm0 mulpd 4 * SIZE(BB), %xmm0 addpd %xmm0, %xmm6 movapd 6 * SIZE(AA), %xmm0 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm3 movapd 10 * SIZE(AA), %xmm1 addpd %xmm3, %xmm4 mulpd 10 * SIZE(BB), %xmm1 movapd 24 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 12 * SIZE(AA), %xmm1 mulpd 12 * SIZE(BB), %xmm1 addpd %xmm1, %xmm6 movapd 14 * SIZE(AA), %xmm1 mulpd 14 * SIZE(BB), %xmm1 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L82 ALIGN_4 .L85: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movsd 0 + ALPHA, %xmm3 movhps 8 + ALPHA, %xmm3 andl $15, %eax # if (k & 1) BRANCH je .L88 .L86: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 movsd 1 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L86 ALIGN_4 .L88: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 addpd %xmm6, %xmm4 haddpd %xmm4, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm3m_kernel_4x2_core2.S000066400000000000000000000643731313527062700217200ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define OLD_M 4 + STACK + ARGS(%esi) #define OLD_N 8 + STACK + ARGS(%esi) #define OLD_K 12 + STACK + ARGS(%esi) #define OLD_ALPHA_R 16 + STACK + ARGS(%esi) #define OLD_ALPHA_I 24 + STACK + ARGS(%esi) #define OLD_A 32 + STACK + ARGS(%esi) #define OLD_B 36 + STACK + ARGS(%esi) #define OLD_C 40 + STACK + ARGS(%esi) #define OLD_LDC 44 + STACK + ARGS(%esi) #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define BX 40(%esp) #define OLD_STACK 44(%esp) #define OFFSET 48(%esp) #define KK 52(%esp) #define KKK 56(%esp) #define BUFFER 256(%esp) #define PREFETCH_R (8 * 16 + 0) #define PREFETCH_W (PREFETCH_R * 2) #define PREFETCHSIZE (8 * 7 + 4) #define PREFETCH prefetcht0 #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define C1 %esi #define I %ebx PROLOGUE PROFCODE pushl %ebp pushl %edi pushl %esi pushl %ebx movl %esp, %esi # save old stack subl $512 + LOCAL_BUFFER_SIZE, %esp andl $-4096, %esp # align stack STACK_TOUCHING movl OLD_M, %ebx movl OLD_N, %eax movl OLD_K, %ecx movl OLD_A, %edx movsd OLD_ALPHA_R, %xmm0 movhps OLD_ALPHA_I, %xmm0 movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movl OLD_B, B movl OLD_C, %ebx movaps %xmm0, ALPHA movl %ebx, C movl OLD_LDC, LDC subl $-16 * SIZE, A subl $-16 * SIZE, B sall $ZBASE_SHIFT, LDC sarl $1, %eax movl %eax, J jle .L40 ALIGN_4 .L01: leal 16 * SIZE + BUFFER, BB #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sarl $2, %eax jle .L05 ALIGN_4 .L02: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -12 * SIZE(B), %xmm4 movddup -11 * SIZE(B), %xmm5 movddup -10 * SIZE(B), %xmm6 movddup -9 * SIZE(B), %xmm7 prefetcht0 (PREFETCH_R + 0) * SIZE(B) movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) movapd %xmm2, -12 * 
SIZE(BB) movapd %xmm3, -10 * SIZE(BB) movapd %xmm4, -8 * SIZE(BB) movapd %xmm5, -6 * SIZE(BB) movapd %xmm6, -4 * SIZE(BB) movapd %xmm7, -2 * SIZE(BB) addl $ 8 * SIZE, B addl $16 * SIZE, BB decl %eax jne .L02 ALIGN_4 .L05: movl K, %eax andl $3, %eax BRANCH jle .L10 ALIGN_4 .L06: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) addl $2 * SIZE, B addl $4 * SIZE, BB decl %eax jne .L06 ALIGN_4 .L10: movl B, BX movl C, C1 movl A, AA movl M, I sarl $2, I jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 16 * SIZE + BUFFER, BB #else leal 16 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB /* because it's doubled */ #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm3 pxor %xmm6, %xmm6 prefetcht0 3 * SIZE(C1) pxor %xmm7, %xmm7 prefetcht0 3 * SIZE(C1, LDC) movapd %xmm1, %xmm2 movl BX, %eax prefetcht0 (%eax) subl $-8 * SIZE, %eax movl %eax, BX #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L15 ALIGN_4 .L12: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 addpd %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movapd -12 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm1 movapd -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 PADDING; movapd %xmm2, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd -10 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm0 addpd %xmm0, %xmm5 movapd -10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm6 movapd -8 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 PADDING; movapd 0 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 PADDING; movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 addpd %xmm1, %xmm4 movapd -6 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movapd -6 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm6 movapd -4 * SIZE(BB), %xmm2 mulpd %xmm3, %xmm1 movapd -4 * SIZE(AA), %xmm3 addpd %xmm1, %xmm7 PADDING; movapd %xmm2, %xmm1 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm4 movapd -2 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm5 movapd -2 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm1 addpd %xmm1, %xmm6 PADDING; movapd 0 * SIZE(BB), %xmm1 mulpd %xmm3, %xmm2 movapd 8 * SIZE(AA), %xmm3 addpd %xmm2, %xmm7 PADDING; movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd 2 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movapd 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm1 movapd 4 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 PADDING; movapd %xmm2, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd 6 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm0 addpd %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm6 movapd 8 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movapd 16 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 PADDING; movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 addpd %xmm1, %xmm4 movapd 10 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm5 movapd 10 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm6 movapd 12 * SIZE(BB), %xmm2 mulpd %xmm3, %xmm1 movapd 12 * SIZE(AA), %xmm3 addpd %xmm1, %xmm7 PADDING; movapd %xmm2, %xmm1 mulpd %xmm3, %xmm2 addpd 
%xmm2, %xmm4 movapd 14 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm3 subl $-32 * SIZE, BB addpd %xmm3, %xmm5 movapd 14 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm1 addpd %xmm1, %xmm6 movapd -16 * SIZE(BB), %xmm1 mulpd %xmm3, %xmm2 movapd 24 * SIZE(AA), %xmm3 addpd %xmm2, %xmm7 PADDING; movapd %xmm1, %xmm2 subl $-32 * SIZE, AA decl %eax BRANCH jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax BRANCH je .L18 ALIGN_4 .L16: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 addpd %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd -12 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 addpd %xmm1, %xmm7 movapd -12 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 movsd 2 * SIZE(%esi), %xmm1 movhps 3 * SIZE(%esi), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) movlps %xmm1, 2 * SIZE(%esi) movhps %xmm1, 3 * SIZE(%esi) movsd 4 * SIZE(%esi), %xmm0 movhps 5 * SIZE(%esi), %xmm0 movsd 6 * SIZE(%esi), %xmm1 movhps 7 * SIZE(%esi), %xmm1 pshufd $0x44, %xmm6, %xmm2 unpckhpd %xmm6, %xmm6 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm1 movlps %xmm0, 4 * SIZE(%esi) movhps %xmm0, 5 * SIZE(%esi) movlps %xmm1, 6 * SIZE(%esi) movhps %xmm1, 7 * SIZE(%esi) movsd 0 * SIZE(%esi, LDC), %xmm0 movhps 1 * SIZE(%esi, LDC), %xmm0 movsd 2 * SIZE(%esi, LDC), %xmm1 movhps 3 * SIZE(%esi, LDC), %xmm1 pshufd $0x44, %xmm5, %xmm2 unpckhpd %xmm5, %xmm5 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm5 addpd %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC) movhps %xmm0, 1 * SIZE(%esi, LDC) movlps %xmm1, 2 * SIZE(%esi, LDC) movhps %xmm1, 3 * SIZE(%esi, LDC) movsd 4 * SIZE(%esi, LDC), %xmm0 movhps 5 * SIZE(%esi, LDC), %xmm0 movsd 6 * SIZE(%esi, LDC), %xmm1 movhps 7 * SIZE(%esi, LDC), %xmm1 pshufd $0x44, %xmm7, %xmm2 unpckhpd %xmm7, %xmm7 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm7 addpd %xmm7, %xmm1 movlps %xmm0, 4 * SIZE(%esi, LDC) movhps %xmm0, 5 * SIZE(%esi, LDC) movlps %xmm1, 6 * SIZE(%esi, LDC) movhps %xmm1, 7 * SIZE(%esi, LDC) addl $8 * SIZE, C1 decl I jg .L11 ALIGN_4 .L20: movl M, I testl $2, I jle .L30 .L21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 16 * SIZE + BUFFER, BB #else leal 16 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB /* because it's doubled */ #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movapd -8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax addl $2, %eax movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BB), %xmm0 addpd %xmm1, %xmm4 movapd -12 * SIZE(BB), %xmm1 addpd %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm1 mulpd -10 * SIZE(BB), %xmm0 addpd %xmm1, %xmm6 movapd 0 * SIZE(BB), %xmm1 addpd %xmm0, %xmm7 movapd -12 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd -6 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd -4 * 
SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movapd -10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd -2 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 8 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movapd 0 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm1 mulpd 2 * SIZE(BB), %xmm2 addpd %xmm1, %xmm4 movapd 4 * SIZE(BB), %xmm1 addpd %xmm2, %xmm5 movapd -6 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm1 mulpd 6 * SIZE(BB), %xmm2 addpd %xmm1, %xmm6 movapd 16 * SIZE(BB), %xmm1 addpd %xmm2, %xmm7 movapd -4 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm3 mulpd 10 * SIZE(BB), %xmm2 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm2, %xmm5 movapd -2 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm3 mulpd 14 * SIZE(BB), %xmm2 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm2, %xmm7 movapd 8 * SIZE(AA), %xmm2 subl $-16 * SIZE, AA addl $ 32 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: movaps ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L28 ALIGN_4 .L26: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BB), %xmm0 addpd %xmm1, %xmm4 movapd -12 * SIZE(BB), %xmm1 addpd %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 movsd 2 * SIZE(%esi), %xmm1 movhps 3 * SIZE(%esi), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) movlps %xmm1, 2 * SIZE(%esi) movhps %xmm1, 3 * SIZE(%esi) movsd 0 * SIZE(%esi, LDC), %xmm0 movhps 1 * SIZE(%esi, LDC), %xmm0 movsd 2 * SIZE(%esi, LDC), %xmm1 movhps 3 * SIZE(%esi, LDC), %xmm1 pshufd $0x44, %xmm5, %xmm2 unpckhpd %xmm5, %xmm5 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm5 addpd %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC) movhps %xmm0, 1 * SIZE(%esi, LDC) movlps %xmm1, 2 * SIZE(%esi, LDC) movhps %xmm1, 3 * SIZE(%esi, LDC) addl $4 * SIZE, C1 ALIGN_4 .L30: movl M, I testl $1, I jle .L39 .L31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 16 * SIZE + BUFFER, BB #else leal 16 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB /* because it's doubled */ #endif movsd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movsd -12 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movsd -8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L35 ALIGN_4 .L32: mulsd %xmm0, %xmm1 mulsd -14 * SIZE(BB), %xmm0 addsd %xmm1, %xmm4 movsd -12 * SIZE(BB), %xmm1 addsd %xmm0, %xmm5 movsd -15 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm1 mulsd -10 * SIZE(BB), %xmm0 addsd %xmm1, %xmm6 movsd 0 * SIZE(BB), %xmm1 addsd %xmm0, %xmm7 movsd -14 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd -6 * SIZE(BB), %xmm0 addsd %xmm3, %xmm4 movsd -4 * SIZE(BB), %xmm3 addsd %xmm0, %xmm5 movsd -13 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd -2 * SIZE(BB), %xmm0 addsd %xmm3, %xmm6 movsd 8 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movsd -8 * SIZE(AA), %xmm0 mulsd %xmm2, %xmm1 mulsd 2 * SIZE(BB), %xmm2 addsd %xmm1, %xmm4 movsd 4 * SIZE(BB), %xmm1 addsd %xmm2, %xmm5 movsd -11 * SIZE(AA), %xmm2 mulsd %xmm2, %xmm1 mulsd 6 * 
SIZE(BB), %xmm2 addsd %xmm1, %xmm6 movsd 16 * SIZE(BB), %xmm1 addsd %xmm2, %xmm7 movsd -10 * SIZE(AA), %xmm2 mulsd %xmm2, %xmm3 mulsd 10 * SIZE(BB), %xmm2 addsd %xmm3, %xmm4 movsd 12 * SIZE(BB), %xmm3 addsd %xmm2, %xmm5 movsd -9 * SIZE(AA), %xmm2 mulsd %xmm2, %xmm3 mulsd 14 * SIZE(BB), %xmm2 addsd %xmm3, %xmm6 movsd 24 * SIZE(BB), %xmm3 addsd %xmm2, %xmm7 movsd -4 * SIZE(AA), %xmm2 subl $-8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L32 ALIGN_4 .L35: movaps ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L38 ALIGN_4 .L36: mulsd %xmm0, %xmm1 mulsd -14 * SIZE(BB), %xmm0 addsd %xmm1, %xmm4 movsd -12 * SIZE(BB), %xmm1 addsd %xmm0, %xmm5 movsd -15 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: addsd %xmm6, %xmm4 addsd %xmm7, %xmm5 movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 movsd 0 * SIZE(%esi, LDC), %xmm1 movhps 1 * SIZE(%esi, LDC), %xmm1 unpcklpd %xmm4, %xmm4 unpcklpd %xmm5, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm3, %xmm5 addpd %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) movlps %xmm1, 0 * SIZE(%esi, LDC) movhps %xmm1, 1 * SIZE(%esi, LDC) ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax addl %eax, C decl J jg .L01 ALIGN_4 .L40: movl N, %eax testl $1, %eax jle .L999 ALIGN_4 .L41: leal 16 * SIZE + BUFFER, BB #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sarl $3, %eax jle .L45 ALIGN_4 .L42: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -12 * SIZE(B), %xmm4 movddup -11 * SIZE(B), %xmm5 movddup -10 * SIZE(B), %xmm6 movddup -9 * SIZE(B), %xmm7 movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) movapd %xmm2, -12 * SIZE(BB) movapd %xmm3, -10 * SIZE(BB) movapd %xmm4, -8 * SIZE(BB) movapd %xmm5, -6 * SIZE(BB) movapd %xmm6, -4 * SIZE(BB) movapd %xmm7, -2 * SIZE(BB) addl $ 8 * SIZE, B addl $16 * SIZE, BB decl %eax jne .L42 ALIGN_4 .L45: movl K, %eax andl $7, %eax BRANCH jle .L50 ALIGN_4 .L46: movddup -16 * SIZE(B), %xmm0 movapd %xmm0, -16 * SIZE(BB) addl $1 * SIZE, B addl $2 * SIZE, BB decl %eax jne .L46 ALIGN_4 .L50: movl C, C1 movl A, AA movl M, I sarl $2, I jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 16 * SIZE + BUFFER, BB #else leal 16 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movapd -8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 prefetcht0 3 * SIZE(C1) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L55 ALIGN_4 .L52: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AA), %xmm1 addpd %xmm0, %xmm4 movapd -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm6 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 mulpd -10 * SIZE(AA), %xmm1 addpd %xmm0, %xmm5 movapd 0 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 movapd -12 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm2 mulpd -6 * SIZE(AA), %xmm1 addpd %xmm2, %xmm4 movapd -4 * SIZE(AA), %xmm2 addpd %xmm1, %xmm6 movapd -10 * SIZE(BB), 
%xmm1 mulpd %xmm1, %xmm2 mulpd -2 * SIZE(AA), %xmm1 addpd %xmm2, %xmm5 movapd 8 * SIZE(AA), %xmm2 addpd %xmm1, %xmm7 movapd 0 * SIZE(BB), %xmm1 mulpd %xmm3, %xmm0 mulpd 2 * SIZE(AA), %xmm3 addpd %xmm0, %xmm4 movapd 4 * SIZE(AA), %xmm0 addpd %xmm3, %xmm6 movapd -6 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm0 mulpd 6 * SIZE(AA), %xmm3 addpd %xmm0, %xmm5 movapd 16 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movapd -4 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm2 mulpd 10 * SIZE(AA), %xmm3 addpd %xmm2, %xmm4 movapd 12 * SIZE(AA), %xmm2 addpd %xmm3, %xmm6 movapd -2 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm2 mulpd 14 * SIZE(AA), %xmm3 addpd %xmm2, %xmm5 movapd 24 * SIZE(AA), %xmm2 addpd %xmm3, %xmm7 movapd 8 * SIZE(BB), %xmm3 addl $ 32 * SIZE, AA subl $-16 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: movaps ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L58 ALIGN_4 .L56: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AA), %xmm1 addpd %xmm0, %xmm4 movapd -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm6 movapd -14 * SIZE(BB), %xmm1 addl $4 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 movsd 2 * SIZE(%esi), %xmm1 movhps 3 * SIZE(%esi), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) movlps %xmm1, 2 * SIZE(%esi) movhps %xmm1, 3 * SIZE(%esi) movsd 4 * SIZE(%esi), %xmm0 movhps 5 * SIZE(%esi), %xmm0 movsd 6 * SIZE(%esi), %xmm1 movhps 7 * SIZE(%esi), %xmm1 pshufd $0x44, %xmm6, %xmm2 unpckhpd %xmm6, %xmm6 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm1 movlps %xmm0, 4 * SIZE(%esi) movhps %xmm0, 5 * SIZE(%esi) movlps %xmm1, 6 * SIZE(%esi) movhps %xmm1, 7 * SIZE(%esi) addl $8 * SIZE, C1 decl I jg .L51 ALIGN_4 .L60: movl M, I testl $2, I jle .L70 .L61: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 16 * SIZE + BUFFER, BB #else leal 16 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm2 movapd -8 * SIZE(BB), %xmm3 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L65 ALIGN_4 .L62: mulpd %xmm0, %xmm1 movapd -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm1 movapd -12 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movapd -12 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm1 movapd -10 * SIZE(AA), %xmm0 addpd %xmm1, %xmm4 movapd -10 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm1 movapd 0 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movapd 0 * SIZE(BB), %xmm1 mulpd %xmm2, %xmm3 movapd -6 * SIZE(AA), %xmm2 addpd %xmm3, %xmm4 movapd -6 * SIZE(BB), %xmm3 mulpd %xmm2, %xmm3 movapd -4 * SIZE(AA), %xmm2 addpd %xmm3, %xmm5 movapd -4 * SIZE(BB), %xmm3 mulpd %xmm2, %xmm3 movapd -2 * SIZE(AA), %xmm2 addpd %xmm3, %xmm4 movapd -2 * SIZE(BB), %xmm3 mulpd %xmm2, %xmm3 movapd 8 * SIZE(AA), %xmm2 addpd %xmm3, %xmm5 movapd 8 * SIZE(BB), %xmm3 subl $-16 * SIZE, AA subl $-16 * SIZE, BB decl %eax jne .L62 ALIGN_4 .L65: movaps ALPHA, %xmm3 #ifndef 
TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L68 ALIGN_4 .L66: mulpd %xmm0, %xmm1 movapd -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L66 ALIGN_4 .L68: addpd %xmm5, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 movsd 2 * SIZE(%esi), %xmm1 movhps 3 * SIZE(%esi), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) movlps %xmm1, 2 * SIZE(%esi) movhps %xmm1, 3 * SIZE(%esi) addl $4 * SIZE, C1 ALIGN_4 .L70: movl M, I testl $1, I jle .L79 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 16 * SIZE + BUFFER, BB #else leal 16 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif movsd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movsd -8 * SIZE(BB), %xmm3 movsd -12 * SIZE(AA), %xmm2 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax addl $1, %eax movl %eax, KKK #endif sarl $3, %eax je .L75 ALIGN_4 .L72: mulsd %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 addsd %xmm1, %xmm4 movsd -14 * SIZE(BB), %xmm1 mulsd %xmm0, %xmm1 movsd -14 * SIZE(AA), %xmm0 addsd %xmm1, %xmm5 movsd -12 * SIZE(BB), %xmm1 mulsd %xmm0, %xmm1 movsd -13 * SIZE(AA), %xmm0 addsd %xmm1, %xmm4 movsd -10 * SIZE(BB), %xmm1 mulsd %xmm0, %xmm1 movsd -8 * SIZE(AA), %xmm0 addsd %xmm1, %xmm5 movsd -0 * SIZE(BB), %xmm1 mulsd %xmm2, %xmm3 movsd -11 * SIZE(AA), %xmm2 addsd %xmm3, %xmm4 movsd -6 * SIZE(BB), %xmm3 mulsd %xmm2, %xmm3 movsd -10 * SIZE(AA), %xmm2 addsd %xmm3, %xmm5 movsd -4 * SIZE(BB), %xmm3 mulsd %xmm2, %xmm3 movsd -9 * SIZE(AA), %xmm2 addsd %xmm3, %xmm4 movsd -2 * SIZE(BB), %xmm3 mulsd %xmm2, %xmm3 movsd -4 * SIZE(AA), %xmm2 addsd %xmm3, %xmm5 movsd 8 * SIZE(BB), %xmm3 subl $ -8 * SIZE, AA subl $-16 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: movaps ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L78 ALIGN_4 .L76: mulsd %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 addsd %xmm1, %xmm4 movsd -14 * SIZE(BB), %xmm1 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: addsd %xmm5, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 unpcklpd %xmm4, %xmm4 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm0 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) ALIGN_4 .L79: addl LDC, C ALIGN_4 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm3m_kernel_4x2_northwood.S000066400000000000000000000764451313527062700227340ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCHSIZE (8 * 4) #if !defined(HAVE_SSE2) || !defined(HAVE_MMX) #error You have to check your configuration. #endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA_R 16 + STACK + ARGS(%esi) #define STACK_ALPHA_I 24 + STACK + ARGS(%esi) #define STACK_A 32 + STACK + ARGS(%esi) #define STACK_B 36 + STACK + ARGS(%esi) #define STACK_C 40 + STACK + ARGS(%esi) #define STACK_LDC 44 + STACK + ARGS(%esi) #define STACK_OFFT 48 + STACK + ARGS(%esi) #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define BX 40(%esp) #define OLD_STACK 44(%esp) #define OFFSET 48(%esp) #define KK 52(%esp) #define KKK 56(%esp) #define BUFFER 128(%esp) #define B %edi #define LDC %ebp #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #define AA %edx #define BB %ecx #define KERNEL1(address) \ mulpd %xmm0, %xmm2; \ mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ addpd %xmm0, %xmm5; \ movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL3(address) \ movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm1, %xmm3; \ mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ 
movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL4(address) \ mulpd %xmm1, %xmm3; \ mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL5(address) \ mulpd %xmm0, %xmm2; \ mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ addpd %xmm0, %xmm5; \ movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL6(address) \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL7(address) \ movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm1, %xmm3; \ mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulpd %xmm1, %xmm3; \ mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp addl $STACK_OFFSET, %esp STACK_TOUCHING movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 movd STACK_A, %mm2 movsd STACK_ALPHA_R, %xmm0 movhps STACK_ALPHA_I, %xmm0 movl STACK_B, B movd STACK_C, %mm3 movl STACK_LDC, LDC #ifdef TRMMKERNEL movd STACK_OFFT, %mm4 #endif movaps %xmm0, ALPHA movd %mm1, K movl %eax, N movd %mm0, M movd %mm2, A movd %mm3, C movl %esi, OLD_STACK #ifdef TRMMKERNEL movd %mm4, OFFSET movd %mm4, KK #ifndef LEFT negl KK #endif #endif sall $ZBASE_SHIFT, LDC sarl $1, %eax # j = (n >> 1) movl %eax, J jle .L100 ALIGN_2 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to 
Sub Buffer */ leal BUFFER, %ecx movl K, %eax sarl $2, %eax jle .L03 ALIGN_2 .L02: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 unpcklpd %xmm2, %xmm2 unpcklpd %xmm3, %xmm3 unpcklpd %xmm4, %xmm4 unpcklpd %xmm5, %xmm5 unpcklpd %xmm6, %xmm6 unpcklpd %xmm7, %xmm7 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) movapd %xmm2, 4 * SIZE(%ecx) movapd %xmm3, 6 * SIZE(%ecx) movapd %xmm4, 8 * SIZE(%ecx) movapd %xmm5, 10 * SIZE(%ecx) movapd %xmm6, 12 * SIZE(%ecx) movapd %xmm7, 14 * SIZE(%ecx) prefetcht0 104 * SIZE(B) addl $ 8 * SIZE, B subl $-16 * SIZE, %ecx decl %eax BRANCH jne .L02 ALIGN_2 .L03: movl K, %eax andl $3, %eax BRANCH jle .L05 ALIGN_4 .L04: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) addl $2 * SIZE, B addl $4 * SIZE, %ecx decl %eax BRANCH jne .L04 ALIGN_4 .L05: movl B, BX movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) NOBRANCH jle .L30 ALIGN_4 .L10: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movapd 0 * SIZE + BUFFER, %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE + BUFFER, %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB /* because it's doubled */ movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #endif prefetchnta 3 * SIZE(%esi) prefetchnta 3 * SIZE(%esi, LDC) movl BX, %eax prefetcht2 0 * SIZE(%eax) subl $-8 * SIZE, %eax movl %eax, BX #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $2, %eax #endif movl %eax, KKK #endif #ifdef PENTIUM4 andl $-8, %eax NOBRANCH je .L12 sall $3, %eax .align 8 .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) cmpl $64 * 1, %eax NOBRANCH jle .L11 KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) cmpl $64 * 2, %eax NOBRANCH jle .L11 KERNEL1(32 * 2) KERNEL2(32 * 2) KERNEL3(32 * 2) KERNEL4(32 * 2) KERNEL5(32 * 2) KERNEL6(32 * 2) KERNEL7(32 * 2) KERNEL8(32 * 2) cmpl $64 * 3, %eax NOBRANCH jle .L11 KERNEL1(32 * 3) KERNEL2(32 * 3) KERNEL3(32 * 3) KERNEL4(32 * 3) KERNEL5(32 * 3) KERNEL6(32 * 3) KERNEL7(32 * 3) KERNEL8(32 * 3) cmpl $64 * 4, %eax NOBRANCH jle .L11 KERNEL1(32 * 4) KERNEL2(32 * 4) KERNEL3(32 * 4) KERNEL4(32 * 4) KERNEL5(32 * 4) KERNEL6(32 * 4) KERNEL7(32 * 4) KERNEL8(32 * 4) cmpl $64 * 5, %eax NOBRANCH jle .L11 KERNEL1(32 * 5) KERNEL2(32 * 5) KERNEL3(32 * 5) KERNEL4(32 * 5) KERNEL5(32 * 5) KERNEL6(32 * 5) KERNEL7(32 * 5) KERNEL8(32 * 5) cmpl $64 * 6, %eax NOBRANCH jle .L11 KERNEL1(32 * 6) KERNEL2(32 * 6) KERNEL3(32 * 6) KERNEL4(32 * 6) KERNEL5(32 * 6) KERNEL6(32 * 6) KERNEL7(32 * 6) KERNEL8(32 * 6) cmpl $64 * 7, %eax NOBRANCH jle .L11 KERNEL1(32 * 7) KERNEL2(32 * 
7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) addl $64 * 4 * SIZE, AA addl $64 * 4 * SIZE, BB subl $64 * 8, %eax BRANCH jg .L1X .L11: leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #else sarl $3, %eax je .L12 .L11: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) addl $32 * SIZE, %ecx addl $32 * SIZE, %edx decl %eax jne .L11 #endif .L12: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L14 .L13: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 0 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA # aoffset += 8 addl $4 * SIZE, BB # boffset1 += 8 subl $1, %eax jg .L13 ALIGN_4 .L14: movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 movsd 2 * SIZE(%esi), %xmm1 movhps 3 * SIZE(%esi), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) movlps %xmm1, 2 * SIZE(%esi) movhps %xmm1, 3 * SIZE(%esi) movsd 4 * SIZE(%esi), %xmm0 movhps 5 * SIZE(%esi), %xmm0 movsd 6 * SIZE(%esi), %xmm1 movhps 7 * SIZE(%esi), %xmm1 pshufd $0x44, %xmm6, %xmm2 unpckhpd %xmm6, %xmm6 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm1 movlps %xmm0, 4 * SIZE(%esi) movhps %xmm0, 5 * SIZE(%esi) movlps %xmm1, 6 * SIZE(%esi) movhps %xmm1, 7 * SIZE(%esi) movsd 0 * SIZE(%esi, LDC), %xmm0 movhps 1 * SIZE(%esi, LDC), %xmm0 movsd 2 * SIZE(%esi, LDC), %xmm1 movhps 3 * SIZE(%esi, LDC), %xmm1 pshufd $0x44, %xmm5, %xmm2 unpckhpd %xmm5, %xmm5 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm5 addpd %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC) movhps %xmm0, 1 * SIZE(%esi, LDC) movlps %xmm1, 2 * SIZE(%esi, LDC) movhps %xmm1, 3 * SIZE(%esi, LDC) movsd 4 * SIZE(%esi, LDC), %xmm0 movhps 5 * SIZE(%esi, LDC), %xmm0 movsd 6 * SIZE(%esi, LDC), %xmm1 movhps 7 * SIZE(%esi, LDC), %xmm1 pshufd $0x44, %xmm7, %xmm2 unpckhpd %xmm7, %xmm7 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm7 addpd %xmm7, %xmm1 movlps %xmm0, 4 * SIZE(%esi, LDC) movhps %xmm0, 5 * SIZE(%esi, LDC) movlps %xmm1, 6 * SIZE(%esi, LDC) movhps %xmm1, 7 * SIZE(%esi, LDC) addl $8 * SIZE, %esi decl %ebx # i -- BRANCH jg .L10 ALIGN_2 .L30: movl M, %ebx testl $2, %ebx jle .L50 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, %ecx movapd 0 * SIZE + BUFFER, %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE + BUFFER, %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB /* because it's doubled */ movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax addl $2, %eax movl %eax, KKK #endif sarl $3, %eax je .L32 .L31: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd 
%xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 10 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd 18 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 20 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movapd 10 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 22 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 32 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movapd 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 26 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 28 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 14 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 30 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 40 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $32 * SIZE, BB BRANCH decl %eax jne .L31 .L32: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L34 .L33: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA # aoffset += 8 addl $4 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L33 ALIGN_4 .L34: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 movsd 2 * SIZE(%esi), %xmm1 movhps 3 * SIZE(%esi), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) movlps %xmm1, 2 * SIZE(%esi) movhps %xmm1, 3 * SIZE(%esi) movsd 0 * SIZE(%esi, LDC), %xmm0 movhps 1 * SIZE(%esi, LDC), %xmm0 movsd 2 * SIZE(%esi, LDC), %xmm1 movhps 3 * SIZE(%esi, LDC), %xmm1 pshufd $0x44, %xmm5, %xmm2 unpckhpd %xmm5, %xmm5 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm5 addpd %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC) movhps %xmm0, 1 * SIZE(%esi, LDC) movlps %xmm1, 2 * SIZE(%esi, LDC) movhps %xmm1, 3 * SIZE(%esi, LDC) addl $4 * SIZE, %esi # coffset += 4 ALIGN_2 .L50: movl M, %ebx testl $1, %ebx jle .L99 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, %ecx movapd 0 * SIZE + BUFFER, %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE + BUFFER, %xmm3 pxor %xmm6, %xmm6 movsd 4 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB /* because it's doubled */ movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movsd 4 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L52 .L51: mulsd %xmm0, %xmm2 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movsd 1 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm2 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movsd 16 * 
SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movsd 2 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd 10 * SIZE(BB), %xmm0 addsd %xmm3, %xmm4 movsd 12 * SIZE(BB), %xmm3 addsd %xmm0, %xmm5 movsd 3 * SIZE(AA), %xmm0 mulsd %xmm0, %xmm3 mulsd 14 * SIZE(BB), %xmm0 addsd %xmm3, %xmm4 movsd 24 * SIZE(BB), %xmm3 addsd %xmm0, %xmm5 movsd 8 * SIZE(AA), %xmm0 mulsd %xmm1, %xmm2 mulsd 18 * SIZE(BB), %xmm1 addsd %xmm2, %xmm4 movsd 20 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 movsd 5 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm2 mulsd 22 * SIZE(BB), %xmm1 addsd %xmm2, %xmm4 movsd 32 * SIZE(BB), %xmm2 addsd %xmm1, %xmm5 movsd 6 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 mulsd 26 * SIZE(BB), %xmm1 addsd %xmm3, %xmm4 movsd 28 * SIZE(BB), %xmm3 addsd %xmm1, %xmm5 movsd 7 * SIZE(AA), %xmm1 mulsd %xmm1, %xmm3 mulsd 30 * SIZE(BB), %xmm1 addsd %xmm3, %xmm4 movsd 40 * SIZE(BB), %xmm3 addsd %xmm1, %xmm5 movsd 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB BRANCH decl %eax jne .L51 .L52: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L54 .L53: mulsd %xmm0, %xmm2 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addsd %xmm0, %xmm5 movsd 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA # aoffset += 8 addl $4 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L53 ALIGN_4 .L54: addsd %xmm6, %xmm4 addsd %xmm7, %xmm5 movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 movsd 0 * SIZE(%esi, LDC), %xmm1 movhps 1 * SIZE(%esi, LDC), %xmm1 unpcklpd %xmm4, %xmm4 unpcklpd %xmm5, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm3, %xmm5 addpd %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) movlps %xmm1, 0 * SIZE(%esi, LDC) movhps %xmm1, 1 * SIZE(%esi, LDC) ALIGN_2 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax addl %eax, C # c += 2 * ldc BRANCH decl J # j -- jg .L01 ALIGN_2 .L100: movl N, %eax testl $1, %eax jle .L999 ALIGN_2 .L101: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ leal BUFFER, %ecx movl K, %eax sarl $3, %eax jle .L103 ALIGN_4 .L102: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 unpcklpd %xmm2, %xmm2 unpcklpd %xmm3, %xmm3 unpcklpd %xmm4, %xmm4 unpcklpd %xmm5, %xmm5 unpcklpd %xmm6, %xmm6 unpcklpd %xmm7, %xmm7 movapd %xmm0, 0 * SIZE(%ecx) movapd %xmm1, 2 * SIZE(%ecx) movapd %xmm2, 4 * SIZE(%ecx) movapd %xmm3, 6 * SIZE(%ecx) movapd %xmm4, 8 * SIZE(%ecx) movapd %xmm5, 10 * SIZE(%ecx) movapd %xmm6, 12 * SIZE(%ecx) movapd %xmm7, 14 * SIZE(%ecx) prefetcht0 104 * SIZE(B) addl $ 8 * SIZE, B addl $16 * SIZE, %ecx decl %eax BRANCH jne .L102 ALIGN_2 .L103: movl K, %eax andl $7, %eax BRANCH jle .L105 ALIGN_2 .L104: movsd 0 * SIZE(B), %xmm0 unpcklpd %xmm0, %xmm0 movapd %xmm0, 0 * SIZE(%ecx) addl $1 * SIZE, B addl $2 * SIZE, %ecx decl %eax jne .L104 ALIGN_4 .L105: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L130 ALIGN_4 .L110: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movapd 0 * SIZE + BUFFER, %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE + BUFFER, %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #else leal 
BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L112 .L111: mulpd %xmm2, %xmm0 mulpd 2 * SIZE(AA), %xmm2 addpd %xmm0, %xmm4 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 movapd 2 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm0 mulpd 6 * SIZE(AA), %xmm2 addpd %xmm0, %xmm5 movapd 16 * SIZE(AA), %xmm0 addpd %xmm2, %xmm7 movapd 4 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm1 mulpd 10 * SIZE(AA), %xmm2 addpd %xmm1, %xmm4 movapd 12 * SIZE(AA), %xmm1 addpd %xmm2, %xmm6 movapd 6 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm1 mulpd 14 * SIZE(AA), %xmm2 addpd %xmm1, %xmm5 movapd 24 * SIZE(AA), %xmm1 addpd %xmm2, %xmm7 movapd 16 * SIZE(BB), %xmm2 mulpd %xmm3, %xmm0 mulpd 18 * SIZE(AA), %xmm3 addpd %xmm0, %xmm4 movapd 20 * SIZE(AA), %xmm0 addpd %xmm3, %xmm6 movapd 10 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm0 mulpd 22 * SIZE(AA), %xmm3 addpd %xmm0, %xmm5 movapd 32 * SIZE(AA), %xmm0 addpd %xmm3, %xmm7 movapd 12 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm1 mulpd 26 * SIZE(AA), %xmm3 addpd %xmm1, %xmm4 movapd 28 * SIZE(AA), %xmm1 addpd %xmm3, %xmm6 movapd 14 * SIZE(BB), %xmm3 mulpd %xmm3, %xmm1 mulpd 30 * SIZE(AA), %xmm3 addpd %xmm1, %xmm5 movapd 40 * SIZE(AA), %xmm1 addpd %xmm3, %xmm7 movapd 24 * SIZE(BB), %xmm3 addl $32 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L111 .L112: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L114 .L113: mulpd %xmm2, %xmm0 mulpd 2 * SIZE(AA), %xmm2 addpd %xmm0, %xmm4 movapd 4 * SIZE(AA), %xmm0 addpd %xmm2, %xmm6 movapd 2 * SIZE(BB), %xmm2 addl $4 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 subl $1, %eax jg .L113 ALIGN_4 .L114: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 movsd 2 * SIZE(%esi), %xmm1 movhps 3 * SIZE(%esi), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) movlps %xmm1, 2 * SIZE(%esi) movhps %xmm1, 3 * SIZE(%esi) movsd 4 * SIZE(%esi), %xmm0 movhps 5 * SIZE(%esi), %xmm0 movsd 6 * SIZE(%esi), %xmm1 movhps 7 * SIZE(%esi), %xmm1 pshufd $0x44, %xmm6, %xmm2 unpckhpd %xmm6, %xmm6 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm1 movlps %xmm0, 4 * SIZE(%esi) movhps %xmm0, 5 * SIZE(%esi) movlps %xmm1, 6 * SIZE(%esi) movhps %xmm1, 7 * SIZE(%esi) addl $8 * SIZE, %esi # coffset += 4 BRANCH decl %ebx # i -- jg .L110 ALIGN_2 .L130: movl M, %ebx testl $2, %ebx jle .L150 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movapd 0 * SIZE + BUFFER, %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE + BUFFER, %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 
* SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L132 .L131: mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 mulpd 2 * SIZE(BB), %xmm0 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 4 * SIZE(AA), %xmm0 mulpd 4 * SIZE(BB), %xmm0 addpd %xmm0, %xmm6 movapd 6 * SIZE(AA), %xmm0 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm3 movapd 10 * SIZE(AA), %xmm1 addpd %xmm3, %xmm4 mulpd 10 * SIZE(BB), %xmm1 movapd 24 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 12 * SIZE(AA), %xmm1 mulpd 12 * SIZE(BB), %xmm1 addpd %xmm1, %xmm6 movapd 14 * SIZE(AA), %xmm1 mulpd 14 * SIZE(BB), %xmm1 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $16 * SIZE, BB BRANCH decl %eax jne .L131 .L132: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L134 .L133: movapd 0 * SIZE(AA), %xmm0 mulpd 0 * SIZE(BB), %xmm0 addpd %xmm0, %xmm4 addl $2 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L133 ALIGN_4 .L134: addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 addpd %xmm6, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 movsd 2 * SIZE(%esi), %xmm1 movhps 3 * SIZE(%esi), %xmm1 pshufd $0x44, %xmm4, %xmm2 unpckhpd %xmm4, %xmm4 mulpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) movlps %xmm1, 2 * SIZE(%esi) movhps %xmm1, 3 * SIZE(%esi) addl $4 * SIZE, %esi ALIGN_2 .L150: movl M, %ebx testl $1, %ebx jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movapd 0 * SIZE + BUFFER, %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE + BUFFER, %xmm3 pxor %xmm6, %xmm6 movapd 4 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 4 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax addl $1, %eax movl %eax, KKK #endif sarl $3, %eax je .L152 .L151: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 mulsd 2 * SIZE(BB), %xmm0 movsd 16 * SIZE(BB), %xmm2 addsd %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 mulsd 4 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 movsd 3 * SIZE(AA), %xmm0 mulsd 6 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 movsd 8 * SIZE(AA), %xmm0 mulsd %xmm1, %xmm3 movsd 5 * SIZE(AA), %xmm1 addsd %xmm3, %xmm4 mulsd 10 * SIZE(BB), %xmm1 movsd 24 * SIZE(BB), %xmm3 addsd %xmm1, %xmm4 movsd 6 * SIZE(AA), %xmm1 mulsd 12 * SIZE(BB), %xmm1 addsd %xmm1, %xmm4 movsd 7 * SIZE(AA), %xmm1 mulsd 14 * SIZE(BB), %xmm1 addsd %xmm1, %xmm4 movsd 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $16 * SIZE, BB BRANCH decl %eax jne .L151 .L152: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) 
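/*
 * .L153 below consumes the k & 7 remainder of this single-row, single-column
 * tail one multiply-add at a time.  BB advances by 2 * SIZE per step because
 * the copy loops above (.L102 and .L104) store every B value duplicated into
 * the local buffer.  Roughly, as a C sketch (illustrative only, names
 * invented, not part of the build):
 *
 *     for ( ; krem > 0; krem--) {
 *         t += a[0] * b[0];
 *         a += 1;
 *         b += 2;           duplicated entries in the packed buffer
 *     }
 *
 * .L154 then forms { t, t } with unpcklpd, scales it by
 * ALPHA = { alpha_r, alpha_i }, and adds the result to the one remaining
 * complex element of C.
 */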
BRANCH je .L154 .L153: movsd 0 * SIZE(AA), %xmm0 mulsd 0 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 addl $1 * SIZE, AA # aoffset += 8 addl $2 * SIZE, BB # boffset1 += 8 decl %eax BRANCH jg .L153 ALIGN_4 .L154: movsd 0 * SIZE(%esi), %xmm0 movhps 1 * SIZE(%esi), %xmm0 unpcklpd %xmm4, %xmm4 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm0 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 1 * SIZE(%esi) ALIGN_2 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_2 EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm3m_kernel_4x4_barcelona.S000066400000000000000000001267671313527062700226440ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define OLD_M 4 + STACK(%esi) #define OLD_N 8 + STACK(%esi) #define OLD_K 12 + STACK(%esi) #define OLD_ALPHA_R 16 + STACK(%esi) #define OLD_ALPHA_I 20 + STACK(%esi) #define OLD_A 24 + STACK(%esi) #define OLD_B 28 + STACK(%esi) #define OLD_C 32 + STACK(%esi) #define OLD_LDC 36 + STACK(%esi) #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define BUFFER 128(%esp) #define PREFETCH prefetch #define PREFETCHSIZE (16 * 17 + 0) #define RPREFETCHSIZE (16 * 9 + 0) #define WPREFETCHSIZE (16 * 9 + 0) #define AA %edx #define BB %ecx #define LDC %ebp #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) #define movsd movlps #endif #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ addps %xmm2, %xmm4; \ movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ 
mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE, %esp movl OLD_M, %ebx andl $-1024, %esp # align stack STACK_TOUCHING movl OLD_N, %eax movl OLD_K, %ecx movl OLD_A, %edx movss OLD_ALPHA_R, %xmm0 movss OLD_ALPHA_I, %xmm1 movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movl OLD_B, %edi movl OLD_C, %ebx unpcklps %xmm1, %xmm0 movlhps %xmm0, %xmm0 movaps %xmm0, ALPHA movl %ebx, C movl OLD_LDC, LDC #ifdef TRMMKERNEL movss %xmm4, OFFSET movss %xmm4, KK #ifndef LEFT negl KK #endif #endif sall $ZBASE_SHIFT, LDC sarl $2, %eax movl %eax, J jle .L40 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ leal BUFFER, %ecx movl K, %eax sarl $1, %eax jle .L05 ALIGN_4 .L02: prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) movaps 0 * SIZE(%edi), %xmm3 movaps 4 * SIZE(%edi), %xmm7 prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 prefetchw (WPREFETCHSIZE + 16) * SIZE(%ecx) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps %xmm4, 16 * SIZE(%ecx) movaps %xmm5, 20 * SIZE(%ecx) movaps %xmm6, 24 * SIZE(%ecx) movaps %xmm7, 28 * SIZE(%ecx) addl $ 8 * SIZE, %edi subl $-32 * SIZE, %ecx decl %eax jne .L02 ALIGN_2 .L05: movl K, %eax andl $1, %eax BRANCH jle .L10 movaps 0 * SIZE(%edi), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) addl $4 * SIZE, %edi ALIGN_4 .L10: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB #endif movaps 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps 16 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 pxor %xmm6, 
%xmm6 movaps 16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 leal (%esi, LDC, 2), %eax prefetchw 3 * SIZE(%esi) prefetchw 3 * SIZE(%esi, LDC) prefetchw 3 * SIZE(%eax) prefetchw 3 * SIZE(%eax, LDC) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $4, %eax #endif movl %eax, KKK #endif andl $-8, %eax sall $4, %eax je .L15 .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) cmpl $128 * 1, %eax jle .L12 KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) cmpl $128 * 2, %eax jle .L12 KERNEL1(32 * 2) KERNEL2(32 * 2) KERNEL3(32 * 2) KERNEL4(32 * 2) KERNEL5(32 * 2) KERNEL6(32 * 2) KERNEL7(32 * 2) KERNEL8(32 * 2) cmpl $128 * 3, %eax jle .L12 KERNEL1(32 * 3) KERNEL2(32 * 3) KERNEL3(32 * 3) KERNEL4(32 * 3) KERNEL5(32 * 3) KERNEL6(32 * 3) KERNEL7(32 * 3) KERNEL8(32 * 3) cmpl $128 * 4, %eax jle .L12 KERNEL1(32 * 4) KERNEL2(32 * 4) KERNEL3(32 * 4) KERNEL4(32 * 4) KERNEL5(32 * 4) KERNEL6(32 * 4) KERNEL7(32 * 4) KERNEL8(32 * 4) cmpl $128 * 5, %eax jle .L12 KERNEL1(32 * 5) KERNEL2(32 * 5) KERNEL3(32 * 5) KERNEL4(32 * 5) KERNEL5(32 * 5) KERNEL6(32 * 5) KERNEL7(32 * 5) KERNEL8(32 * 5) cmpl $128 * 6, %eax jle .L12 KERNEL1(32 * 6) KERNEL2(32 * 6) KERNEL3(32 * 6) KERNEL4(32 * 6) KERNEL5(32 * 6) KERNEL6(32 * 6) KERNEL7(32 * 6) KERNEL8(32 * 6) cmpl $128 * 7, %eax jle .L12 KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) addl $128 * 8 * SIZE, BB addl $128 * 2 * SIZE, AA subl $128 * 8, %eax jg .L1X jmp .L15 .L12: leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 4 * SIZE(AA), %xmm0 addl $ 4 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: leal (LDC, LDC, 2), %eax movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 4 * SIZE(%esi), %xmm1 movhps 6 * SIZE(%esi), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 4 * SIZE(%esi) movhps %xmm1, 6 * SIZE(%esi) movsd 0 * SIZE(%esi, LDC), %xmm0 movhps 2 * SIZE(%esi, LDC), %xmm0 movsd 4 * SIZE(%esi, LDC), %xmm1 movhps 6 * SIZE(%esi, LDC), %xmm1 pshufd $0x50, %xmm5, %xmm2 pshufd $0xfa, %xmm5, %xmm5 mulps %xmm3, %xmm2 mulps %xmm3, %xmm5 addps %xmm2, %xmm0 addps %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC) movhps %xmm0, 2 * SIZE(%esi, LDC) movlps %xmm1, 4 * SIZE(%esi, LDC) movhps %xmm1, 6 * SIZE(%esi, LDC) movsd 0 * SIZE(%esi, LDC, 2), %xmm0 movhps 2 * SIZE(%esi, LDC, 2), %xmm0 movsd 4 * SIZE(%esi, LDC, 2), %xmm1 movhps 6 * SIZE(%esi, LDC, 2), %xmm1 pshufd $0x50, %xmm6, %xmm2 pshufd $0xfa, %xmm6, %xmm6 mulps %xmm3, %xmm2 mulps %xmm3, %xmm6 addps %xmm2, %xmm0 addps %xmm6, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC, 2) movhps %xmm0, 2 * SIZE(%esi, LDC, 2) movlps %xmm1, 4 * SIZE(%esi, LDC, 2) movhps %xmm1, 6 * SIZE(%esi, LDC, 2) movsd 0 
* SIZE(%esi, %eax), %xmm0 movhps 2 * SIZE(%esi, %eax), %xmm0 movsd 4 * SIZE(%esi, %eax), %xmm1 movhps 6 * SIZE(%esi, %eax), %xmm1 pshufd $0x50, %xmm7, %xmm2 pshufd $0xfa, %xmm7, %xmm7 mulps %xmm3, %xmm2 mulps %xmm3, %xmm7 addps %xmm2, %xmm0 addps %xmm7, %xmm1 movlps %xmm0, 0 * SIZE(%esi, %eax) movhps %xmm0, 2 * SIZE(%esi, %eax) movlps %xmm1, 4 * SIZE(%esi, %eax) movhps %xmm1, 6 * SIZE(%esi, %eax) addl $8 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L11 ALIGN_4 .L20: testl $2, M je .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif movsd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movsd 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movsd 16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movsd 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movsd 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movsd 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movsd 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movsd 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movsd 48 * SIZE(BB), %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 36 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movsd 40 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movsd 44 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movsd 64 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movsd 52 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movsd 56 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movsd 60 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movsd 80 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movsd 68 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movsd 72 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movsd 76 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movsd 96 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movsd 84 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movsd 88 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movsd 92 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movsd 112 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movsd 100 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movsd 104 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movsd 108 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 14 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movsd 128 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movsd 116 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movsd 120 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 
addps %xmm3, %xmm6 movsd 124 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movsd 144 * SIZE(BB), %xmm3 addl $ 16 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movsd 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movsd 16 * SIZE(BB), %xmm2 addl $ 2 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: leal (LDC, LDC, 2), %eax movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 0 * SIZE(%esi, LDC), %xmm1 movhps 2 * SIZE(%esi, LDC), %xmm1 pshufd $0x50, %xmm4, %xmm4 pshufd $0x50, %xmm5, %xmm5 mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 addps %xmm4, %xmm0 addps %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 0 * SIZE(%esi, LDC) movhps %xmm1, 2 * SIZE(%esi, LDC) movsd 0 * SIZE(%esi, LDC, 2), %xmm0 movhps 2 * SIZE(%esi, LDC, 2), %xmm0 movsd 0 * SIZE(%esi, %eax), %xmm1 movhps 2 * SIZE(%esi, %eax), %xmm1 pshufd $0x50, %xmm6, %xmm6 pshufd $0x50, %xmm7, %xmm7 mulps %xmm3, %xmm6 mulps %xmm3, %xmm7 addps %xmm6, %xmm0 addps %xmm7, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC, 2) movhps %xmm0, 2 * SIZE(%esi, LDC, 2) movlps %xmm1, 0 * SIZE(%esi, %eax) movhps %xmm1, 2 * SIZE(%esi, %eax) addl $4 * SIZE, %esi # coffset += 2 ALIGN_4 .L30: testl $1, M je .L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB leal (BB, %eax, 8), BB #endif movss 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movss 4 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movss 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movss 16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L35 ALIGN_4 .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm5 movss 8 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 1 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 addss %xmm3, %xmm4 movss 20 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 addss %xmm3, %xmm5 movss 24 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 mulss 28 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 48 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm2 addss %xmm2, %xmm4 movss 36 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm5 movss 40 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 mulss 44 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 64 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 3 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 addss %xmm3, %xmm4 movss 52 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 addss %xmm3, %xmm5 movss 56 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 mulss 60 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 80 * SIZE(BB), %xmm3 addss %xmm0, 
%xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 addss %xmm2, %xmm4 movss 68 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 addss %xmm2, %xmm5 movss 72 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 mulss 76 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 96 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 5 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 addss %xmm3, %xmm4 movss 84 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 addss %xmm3, %xmm5 movss 88 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 mulss 92 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 112 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm2 addss %xmm2, %xmm4 movss 100 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 addss %xmm2, %xmm5 movss 104 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 mulss 108 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 128 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 7 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 addss %xmm3, %xmm4 movss 116 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 addss %xmm3, %xmm5 movss 120 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 mulss 124 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 144 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 movss 4 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm5 movss 8 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 16 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 1 * SIZE(AA), %xmm0 addl $ 1 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: leal (%esi, LDC, 2), %eax movsd (%esi), %xmm0 movhps (%esi, LDC), %xmm0 movsd (%eax), %xmm1 movhps (%eax, LDC), %xmm1 shufps $0, %xmm5, %xmm4 mulps %xmm3, %xmm4 addps %xmm4, %xmm0 shufps $0, %xmm7, %xmm6 mulps %xmm3, %xmm6 addps %xmm6, %xmm1 movlps %xmm0, (%esi) movhps %xmm0, (%esi, LDC) movlps %xmm1, (%eax) movhps %xmm1, (%eax, LDC) ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leal (, LDC, 4), %eax addl %eax, C # c += 4 * ldc decl J # j -- jg .L01 ALIGN_4 .L40: testl $2, N je .L80 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax leal BUFFER, %ecx sarl $2, %eax jle .L45 ALIGN_4 .L42: prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) movaps 0 * SIZE(%edi), %xmm3 movaps 4 * SIZE(%edi), %xmm7 prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 prefetchw (WPREFETCHSIZE + 16) * SIZE(%ecx) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps %xmm4, 16 * SIZE(%ecx) movaps %xmm5, 20 * SIZE(%ecx) movaps %xmm6, 24 * SIZE(%ecx) movaps %xmm7, 28 * SIZE(%ecx) addl $ 8 * SIZE, %edi subl $-32 * SIZE, %ecx decl %eax jne .L42 ALIGN_4 .L45: movl K, %eax andl $3, %eax BRANCH jle .L50 ALIGN_4 .L46: movsd 0 * SIZE(%edi), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) addl $2 * SIZE, %edi addl $8 * SIZE, %ecx decl %eax jne .L46 ALIGN_4 .L50: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movaps 0 * SIZE(AA), %xmm0 movaps 16 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 prefetchw 3 * SIZE(%esi) prefetchw 3 * SIZE(%esi, LDC) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L55 ALIGN_4 .L52: mulps %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 8 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 20 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps 12 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 28 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps 48 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 mulps 36 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 40 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 20 * SIZE(AA), %xmm1 mulps %xmm1, %xmm2 mulps 44 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 64 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 24 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 52 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 80 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 4 * SIZE(%esi), %xmm1 movhps 6 * SIZE(%esi), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 4 * SIZE(%esi) movhps %xmm1, 6 * SIZE(%esi) movsd 0 * SIZE(%esi, LDC), %xmm0 movhps 2 * SIZE(%esi, LDC), %xmm0 movsd 4 * SIZE(%esi, LDC), %xmm1 movhps 6 * SIZE(%esi, LDC), %xmm1 pshufd $0x50, %xmm5, %xmm2 pshufd $0xfa, %xmm5, %xmm5 mulps %xmm3, %xmm2 mulps %xmm3, %xmm5 addps %xmm2, %xmm0 addps %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC) movhps %xmm0, 2 * SIZE(%esi, LDC) movlps %xmm1, 4 * SIZE(%esi, LDC) movhps %xmm1, 6 * SIZE(%esi, LDC) addl $8 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L51 ALIGN_4 .L60: testl $2, M je .L70 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB 
# boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movsd 0 * SIZE(AA), %xmm0 movsd 8 * SIZE(AA), %xmm1 movsd 0 * SIZE(BB), %xmm2 movsd 16 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L65 ALIGN_4 .L62: #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movsd 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movsd 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movsd 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movsd 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movsd 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movsd 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movsd 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movsd 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movsd 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movsd 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movsd 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movsd 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movsd 80 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L66 ALIGN_4 .L68: addps %xmm6, %xmm4 addps %xmm7, %xmm5 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 0 * SIZE(%esi, LDC), %xmm1 movhps 2 * SIZE(%esi, LDC), %xmm1 pshufd $0x50, %xmm4, %xmm4 pshufd $0x50, %xmm5, %xmm5 mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 addps %xmm4, %xmm0 addps %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 0 * SIZE(%esi, LDC) movhps %xmm1, 2 * SIZE(%esi, LDC) addl $4 * SIZE, %esi # coffset += 2 ALIGN_4 .L70: testl $1, M je .L79 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movss 0 * SIZE(AA), %xmm0 movss 4 * SIZE(AA), %xmm1 movss 0 * SIZE(BB), %xmm2 movss 16 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl 
%eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L75 ALIGN_4 .L72: mulss %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 20 * SIZE(BB), %xmm0 addss %xmm3, %xmm4 movss 24 * SIZE(BB), %xmm3 addss %xmm0, %xmm5 movss 3 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 28 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 48 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 mulss 36 * SIZE(BB), %xmm1 addss %xmm2, %xmm4 movss 40 * SIZE(BB), %xmm2 addss %xmm1, %xmm5 movss 5 * SIZE(AA), %xmm1 mulss %xmm1, %xmm2 mulss 44 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 64 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 52 * SIZE(BB), %xmm1 addss %xmm3, %xmm4 movss 56 * SIZE(BB), %xmm3 addss %xmm1, %xmm5 movss 7 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 60 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 80 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulss %xmm0, %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 addl $ 1 * SIZE, AA addl $ 8 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: addss %xmm6, %xmm4 addss %xmm7, %xmm5 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 4 * SIZE(%esi), %xmm1 movhps 6 * SIZE(%esi), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 4 * SIZE(%esi) movhps %xmm1, 6 * SIZE(%esi) movsd 0 * SIZE(%esi, LDC), %xmm0 movhps 2 * SIZE(%esi, LDC), %xmm0 movsd 4 * SIZE(%esi, LDC), %xmm1 movhps 6 * SIZE(%esi, LDC), %xmm1 pshufd $0x50, %xmm5, %xmm2 pshufd $0xfa, %xmm5, %xmm5 mulps %xmm3, %xmm2 mulps %xmm3, %xmm5 addps %xmm2, %xmm0 addps %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC) movhps %xmm0, 2 * SIZE(%esi, LDC) movlps %xmm1, 4 * SIZE(%esi, LDC) movhps %xmm1, 6 * SIZE(%esi, LDC) ALIGN_4 .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax addl %eax, C ALIGN_4 .L80: testl $1, N je .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif leal BUFFER, %ecx movl K, %eax sarl $3, %eax jle .L85 ALIGN_4 .L82: prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) movups 0 * SIZE(%edi), %xmm3 movups 4 * SIZE(%edi), %xmm7 prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 prefetchw (WPREFETCHSIZE + 16) * SIZE(%ecx) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps %xmm4, 16 * SIZE(%ecx) movaps %xmm5, 20 * SIZE(%ecx) movaps %xmm6, 24 * SIZE(%ecx) movaps %xmm7, 28 * SIZE(%ecx) addl $ 8 * SIZE, %edi subl $-32 * SIZE, %ecx decl %eax jne 
.L82 ALIGN_4 .L85: movl K, %eax andl $7, %eax BRANCH jle .L90 ALIGN_4 .L86: movss 0 * SIZE(%edi), %xmm3 pshufd $0x00, %xmm3, %xmm0 movaps %xmm0, 0 * SIZE(%ecx) addl $1 * SIZE, %edi addl $4 * SIZE, %ecx decl %eax jne .L86 ALIGN_4 .L90: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L100 ALIGN_4 .L91: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movaps 0 * SIZE(AA), %xmm0 movaps 16 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 prefetchw 3 * SIZE(%esi) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L95 ALIGN_4 .L92: mulps %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm0, %xmm5 movaps 8 * SIZE(AA), %xmm0 mulps 8 * SIZE(BB), %xmm0 addps %xmm0, %xmm6 movaps 12 * SIZE(AA), %xmm0 mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 movaps 20 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movaps 48 * SIZE(BB), %xmm3 mulps 20 * SIZE(BB), %xmm1 addps %xmm1, %xmm5 movaps 24 * SIZE(AA), %xmm1 mulps 24 * SIZE(BB), %xmm1 addps %xmm1, %xmm6 movaps 28 * SIZE(AA), %xmm1 mulps 28 * SIZE(BB), %xmm1 addps %xmm1, %xmm7 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L92 ALIGN_4 .L95: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(BB), %xmm2 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L96 ALIGN_4 .L98: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 4 * SIZE(%esi), %xmm1 movhps 6 * SIZE(%esi), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 4 * SIZE(%esi) movhps %xmm1, 6 * SIZE(%esi) addl $8 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L91 ALIGN_4 .L100: testl $2, M je .L110 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movsd 0 * SIZE(AA), %xmm0 movsd 8 * SIZE(AA), %xmm1 movsd 0 * SIZE(BB), %xmm2 movsd 16 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl 
K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L105 ALIGN_4 .L102: mulps %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 movsd 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 16 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movsd 32 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 movsd 10 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movsd 20 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 24 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm6 movsd 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movsd 48 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L102 ALIGN_4 .L105: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L108 ALIGN_4 .L106: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 2 * SIZE(AA), %xmm0 movsd 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L106 ALIGN_4 .L108: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 pshufd $0x50, %xmm4, %xmm2 mulps %xmm3, %xmm2 addps %xmm2, %xmm0 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) addl $4 * SIZE, %esi # coffset += 2 ALIGN_4 .L110: testl $1, M je .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movss 0 * SIZE(AA), %xmm0 movss 4 * SIZE(AA), %xmm1 movss 0 * SIZE(BB), %xmm2 movss 16 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L115 ALIGN_4 .L112: mulss %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 movss 32 * SIZE(BB), %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm0, %xmm5 movss 2 * SIZE(AA), %xmm0 mulss 8 * SIZE(BB), %xmm0 addss %xmm0, %xmm6 movss 3 * SIZE(AA), %xmm0 mulss 12 * SIZE(BB), %xmm0 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm3 movss 5 * SIZE(AA), %xmm1 addss %xmm3, %xmm4 movss 48 * SIZE(BB), %xmm3 mulss 20 * SIZE(BB), %xmm1 addss %xmm1, %xmm5 movss 6 * SIZE(AA), %xmm1 mulss 24 * SIZE(BB), %xmm1 addss %xmm1, %xmm6 movss 7 * SIZE(AA), %xmm1 mulss 28 * SIZE(BB), %xmm1 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L112 ALIGN_4 .L115: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulss %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 movss 4 * SIZE(BB), %xmm2 addl $ 1 * 
SIZE, AA addl $ 4 * SIZE, BB decl %eax jg .L116 ALIGN_4 .L118: addss %xmm5, %xmm4 addss %xmm7, %xmm6 addss %xmm6, %xmm4 movsd 0 * SIZE(%esi), %xmm0 pshufd $0x50, %xmm4, %xmm2 mulps %xmm3, %xmm2 addps %xmm2, %xmm0 movlps %xmm0, 0 * SIZE(%esi) ALIGN_4 .L999: movl OLD_STACK, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm3m_kernel_4x4_opteron.S000066400000000000000000001512071313527062700223670ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define OLD_M 4 + STACK(%esi) #define OLD_N 8 + STACK(%esi) #define OLD_K 12 + STACK(%esi) #define OLD_ALPHA_R 16 + STACK(%esi) #define OLD_ALPHA_I 20 + STACK(%esi) #define OLD_A 24 + STACK(%esi) #define OLD_B 28 + STACK(%esi) #define OLD_C 32 + STACK(%esi) #define OLD_LDC 36 + STACK(%esi) #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define BX 40(%esp) #define OLD_STACK 44(%esp) #define OFFSET 48(%esp) #define KK 52(%esp) #define KKK 56(%esp) #define BUFFER 128(%esp) #ifdef ATHLON #define PREFETCH prefetch #define PREFETCHSIZE 64 #endif #if defined(OPTERON) || defined(BARCELONA) #define PREFETCH prefetch #define PREFETCHSIZE (16 * 10 + 8) #endif #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHSIZE 96 #endif #define AA %edx #define BB %ecx #define LDC %ebp #if defined(OPTERON) || defined(BARCELONA) #define movsd movlps #endif #if defined(OPTERON) || defined(BARCELONA) #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 
84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; #endif #ifdef PENTIUM4 #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ addps %xmm2, %xmm5; \ movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 20 * 
SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #endif PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE, %esp movl OLD_M, %ebx andl $-1024, %esp # align stack STACK_TOUCHING movl OLD_N, %eax movl OLD_K, %ecx movl OLD_A, %edx movss OLD_ALPHA_R, %xmm0 movss OLD_ALPHA_I, %xmm1 movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movl OLD_B, %edi movl OLD_C, %ebx unpcklps %xmm1, %xmm0 movlhps %xmm0, %xmm0 movaps %xmm0, ALPHA movl %ebx, C movl OLD_LDC, LDC sall $ZBASE_SHIFT, LDC sarl $2, %eax movl %eax, J jle .L40 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ leal BUFFER, %ecx movl K, %eax sarl $1, %eax jle .L05 ALIGN_4 .L02: #ifdef HAVE_SSE2 movss 0 * SIZE(%edi), %xmm0 movss 1 * SIZE(%edi), %xmm1 movss 2 * SIZE(%edi), %xmm2 movss 3 * SIZE(%edi), %xmm3 movss 4 * SIZE(%edi), %xmm4 movss 5 * SIZE(%edi), %xmm5 movss 6 * SIZE(%edi), %xmm6 movss 7 * SIZE(%edi), %xmm7 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 shufps $0, %xmm4, %xmm4 shufps $0, %xmm5, %xmm5 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps %xmm4, 16 * SIZE(%ecx) movaps %xmm5, 20 * SIZE(%ecx) movaps %xmm6, 24 * SIZE(%ecx) movaps %xmm7, 28 * SIZE(%ecx) #else movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 movd 2 * SIZE(%edi), %mm2 movd 3 * SIZE(%edi), %mm3 movd 4 * SIZE(%edi), %mm4 movd 5 * SIZE(%edi), %mm5 movd 6 * SIZE(%edi), %mm6 movd 7 * SIZE(%edi), %mm7 movd %mm0, 0 * SIZE(%ecx) movd %mm0, 1 * SIZE(%ecx) movd %mm0, 2 * SIZE(%ecx) movd %mm0, 3 * SIZE(%ecx) movd %mm1, 4 * SIZE(%ecx) movd %mm1, 5 * SIZE(%ecx) movd %mm1, 6 * SIZE(%ecx) movd %mm1, 7 * SIZE(%ecx) movd %mm2, 8 * SIZE(%ecx) movd %mm2, 9 * SIZE(%ecx) movd %mm2, 10 * SIZE(%ecx) movd %mm2, 11 * SIZE(%ecx) movd %mm3, 12 * SIZE(%ecx) movd %mm3, 13 * SIZE(%ecx) movd %mm3, 14 * SIZE(%ecx) movd %mm3, 15 * SIZE(%ecx) movd %mm4, 16 * SIZE(%ecx) movd %mm4, 17 * SIZE(%ecx) movd %mm4, 18 * SIZE(%ecx) movd %mm4, 19 * SIZE(%ecx) 
movd %mm5, 20 * SIZE(%ecx) movd %mm5, 21 * SIZE(%ecx) movd %mm5, 22 * SIZE(%ecx) movd %mm5, 23 * SIZE(%ecx) movd %mm6, 24 * SIZE(%ecx) movd %mm6, 25 * SIZE(%ecx) movd %mm6, 26 * SIZE(%ecx) movd %mm6, 27 * SIZE(%ecx) movd %mm7, 28 * SIZE(%ecx) movd %mm7, 29 * SIZE(%ecx) movd %mm7, 30 * SIZE(%ecx) movd %mm7, 31 * SIZE(%ecx) #endif #ifdef PENTIUM4 prefetcht2 112 * SIZE(%ecx) #endif #if defined(OPTERON) || defined(BARCELONA) prefetchnta 80 * SIZE(%edi) prefetchw 112 * SIZE(%ecx) prefetchw 120 * SIZE(%ecx) #endif addl $ 8 * SIZE, %edi addl $32 * SIZE, %ecx decl %eax jne .L02 ALIGN_2 .L05: movl K, %eax andl $1, %eax BRANCH jle .L10 #ifdef HAVE_SSE2 movss 0 * SIZE(%edi), %xmm0 movss 1 * SIZE(%edi), %xmm1 movss 2 * SIZE(%edi), %xmm2 movss 3 * SIZE(%edi), %xmm3 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) #else movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 movd 2 * SIZE(%edi), %mm2 movd 3 * SIZE(%edi), %mm3 movd %mm0, 0 * SIZE(%ecx) movd %mm0, 1 * SIZE(%ecx) movd %mm0, 2 * SIZE(%ecx) movd %mm0, 3 * SIZE(%ecx) movd %mm1, 4 * SIZE(%ecx) movd %mm1, 5 * SIZE(%ecx) movd %mm1, 6 * SIZE(%ecx) movd %mm1, 7 * SIZE(%ecx) movd %mm2, 8 * SIZE(%ecx) movd %mm2, 9 * SIZE(%ecx) movd %mm2, 10 * SIZE(%ecx) movd %mm2, 11 * SIZE(%ecx) movd %mm3, 12 * SIZE(%ecx) movd %mm3, 13 * SIZE(%ecx) movd %mm3, 14 * SIZE(%ecx) movd %mm3, 15 * SIZE(%ecx) #endif addl $4 * SIZE, %edi ALIGN_4 .L10: movl %edi, BX movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB #endif movl BX, %eax #ifdef HAVE_SSE prefetcht2 0 * SIZE(%eax) prefetcht2 4 * SIZE(%eax) #if L2_SIZE > 262144 subl $-8 * SIZE, BX #elif L2_SIZE > 131072 prefetcht2 8 * SIZE(%eax) prefetcht2 12 * SIZE(%eax) subl $-16 * SIZE, BX #else prefetcht2 16 * SIZE(%eax) prefetcht2 20 * SIZE(%eax) prefetcht2 24 * SIZE(%eax) prefetcht2 28 * SIZE(%eax) subl $-32 * SIZE, BX #endif #endif movaps 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps 16 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movaps 16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 leal (LDC, LDC, 2), %eax #if defined(OPTERON) || defined(BARCELONA) prefetchw 4 * SIZE(%esi) prefetchw 4 * SIZE(%esi, LDC) prefetchw 4 * SIZE(%esi, LDC, 2) prefetchw 4 * SIZE(%esi, %eax) #endif #ifdef PENTIUM4 prefetchnta 4 * SIZE(%esi) prefetchnta 4 * SIZE(%esi, LDC) prefetchnta 4 * SIZE(%esi, LDC, 2) prefetchnta 4 * SIZE(%esi, %eax) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $4, %eax #endif movl %eax, KKK #endif #if 1 andl $-8, %eax sall $4, %eax je .L15 .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) cmpl $128 * 1, %eax jle .L12 KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) cmpl $128 * 2, %eax jle .L12 KERNEL1(32 * 2) KERNEL2(32 * 2) 
KERNEL3(32 * 2) KERNEL4(32 * 2) KERNEL5(32 * 2) KERNEL6(32 * 2) KERNEL7(32 * 2) KERNEL8(32 * 2) cmpl $128 * 3, %eax jle .L12 KERNEL1(32 * 3) KERNEL2(32 * 3) KERNEL3(32 * 3) KERNEL4(32 * 3) KERNEL5(32 * 3) KERNEL6(32 * 3) KERNEL7(32 * 3) KERNEL8(32 * 3) cmpl $128 * 4, %eax jle .L12 KERNEL1(32 * 4) KERNEL2(32 * 4) KERNEL3(32 * 4) KERNEL4(32 * 4) KERNEL5(32 * 4) KERNEL6(32 * 4) KERNEL7(32 * 4) KERNEL8(32 * 4) cmpl $128 * 5, %eax jle .L12 KERNEL1(32 * 5) KERNEL2(32 * 5) KERNEL3(32 * 5) KERNEL4(32 * 5) KERNEL5(32 * 5) KERNEL6(32 * 5) KERNEL7(32 * 5) KERNEL8(32 * 5) cmpl $128 * 6, %eax jle .L12 KERNEL1(32 * 6) KERNEL2(32 * 6) KERNEL3(32 * 6) KERNEL4(32 * 6) KERNEL5(32 * 6) KERNEL6(32 * 6) KERNEL7(32 * 6) KERNEL8(32 * 6) cmpl $128 * 7, %eax jle .L12 KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) addl $128 * 8 * SIZE, BB addl $128 * 2 * SIZE, AA subl $128 * 8, %eax jg .L1X jmp .L15 .L12: leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB ALIGN_4 #else sarl $3, %eax je .L15 ALIGN_4 .L12: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) addl $128 * SIZE, BB addl $32 * SIZE, AA decl %eax jne .L12 ALIGN_4 #endif .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 4 * SIZE(AA), %xmm0 addl $ 4 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: leal (LDC, LDC, 2), %eax movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 4 * SIZE(%esi), %xmm1 movhps 6 * SIZE(%esi), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 4 * SIZE(%esi) movhps %xmm1, 6 * SIZE(%esi) movsd 0 * SIZE(%esi, LDC), %xmm0 movhps 2 * SIZE(%esi, LDC), %xmm0 movsd 4 * SIZE(%esi, LDC), %xmm1 movhps 6 * SIZE(%esi, LDC), %xmm1 pshufd $0x50, %xmm5, %xmm2 pshufd $0xfa, %xmm5, %xmm5 mulps %xmm3, %xmm2 mulps %xmm3, %xmm5 addps %xmm2, %xmm0 addps %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC) movhps %xmm0, 2 * SIZE(%esi, LDC) movlps %xmm1, 4 * SIZE(%esi, LDC) movhps %xmm1, 6 * SIZE(%esi, LDC) movsd 0 * SIZE(%esi, LDC, 2), %xmm0 movhps 2 * SIZE(%esi, LDC, 2), %xmm0 movsd 4 * SIZE(%esi, LDC, 2), %xmm1 movhps 6 * SIZE(%esi, LDC, 2), %xmm1 pshufd $0x50, %xmm6, %xmm2 pshufd $0xfa, %xmm6, %xmm6 mulps %xmm3, %xmm2 mulps %xmm3, %xmm6 addps %xmm2, %xmm0 addps %xmm6, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC, 2) movhps %xmm0, 2 * SIZE(%esi, LDC, 2) movlps %xmm1, 4 * SIZE(%esi, LDC, 2) movhps %xmm1, 6 * SIZE(%esi, LDC, 2) movsd 0 * SIZE(%esi, %eax), %xmm0 movhps 2 * SIZE(%esi, %eax), %xmm0 movsd 4 * SIZE(%esi, %eax), %xmm1 movhps 6 * SIZE(%esi, %eax), %xmm1 pshufd $0x50, %xmm7, %xmm2 pshufd $0xfa, %xmm7, %xmm7 mulps %xmm3, %xmm2 mulps %xmm3, %xmm7 addps %xmm2, %xmm0 addps %xmm7, %xmm1 movlps %xmm0, 0 * SIZE(%esi, %eax) movhps %xmm0, 2 * SIZE(%esi, %eax) movlps %xmm1, 4 * SIZE(%esi, %eax) movhps %xmm1, 6 * SIZE(%esi, %eax) addl $8 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L11 ALIGN_4 .L20: testl $2, M je .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif movsd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movsd 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movsd 16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movsd 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movsd 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movsd 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movsd 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movsd 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movsd 48 * SIZE(BB), %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 36 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movsd 40 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movsd 44 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movsd 64 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movsd 52 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movsd 56 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movsd 60 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movsd 80 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movsd 68 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movsd 72 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movsd 76 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movsd 96 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movsd 84 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movsd 88 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movsd 92 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movsd 112 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movsd 100 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movsd 104 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movsd 108 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 14 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movsd 128 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movsd 116 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movsd 120 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movsd 124 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movsd 144 * SIZE(BB), %xmm3 addl $ 16 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movsd 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 
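/* .L26 is the k % 8 clean-up loop for this two-row block: each pass consumes
   one k-step, with xmm0 holding the two A values (loaded by movsd) and the
   movsd loads from BB picking up the four broadcast B values of that step,
   so xmm4..xmm7 accumulate the 2x4 partial products before the scaled
   write-back at .L28. */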
addps %xmm2, %xmm7 movsd 16 * SIZE(BB), %xmm2 addl $ 2 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: leal (LDC, LDC, 2), %eax movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 0 * SIZE(%esi, LDC), %xmm1 movhps 2 * SIZE(%esi, LDC), %xmm1 shufps $0x50, %xmm4, %xmm4 shufps $0x50, %xmm5, %xmm5 mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 addps %xmm4, %xmm0 addps %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 0 * SIZE(%esi, LDC) movhps %xmm1, 2 * SIZE(%esi, LDC) movsd 0 * SIZE(%esi, LDC, 2), %xmm0 movhps 2 * SIZE(%esi, LDC, 2), %xmm0 movsd 0 * SIZE(%esi, %eax), %xmm1 movhps 2 * SIZE(%esi, %eax), %xmm1 shufps $0x50, %xmm6, %xmm6 shufps $0x50, %xmm7, %xmm7 mulps %xmm3, %xmm6 mulps %xmm3, %xmm7 addps %xmm6, %xmm0 addps %xmm7, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC, 2) movhps %xmm0, 2 * SIZE(%esi, LDC, 2) movlps %xmm1, 0 * SIZE(%esi, %eax) movhps %xmm1, 2 * SIZE(%esi, %eax) addl $4 * SIZE, %esi # coffset += 2 ALIGN_4 .L30: testl $1, M je .L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB leal (BB, %eax, 8), BB #endif movss 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movss 4 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movss 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movss 16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L35 ALIGN_4 .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm5 movss 8 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 1 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 addss %xmm3, %xmm4 movss 20 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 addss %xmm3, %xmm5 movss 24 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 mulss 28 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 48 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm2 addss %xmm2, %xmm4 movss 36 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm5 movss 40 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 mulss 44 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 64 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 3 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 addss %xmm3, %xmm4 movss 52 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 addss %xmm3, %xmm5 movss 56 * SIZE(BB), %xmm3 mulss %xmm0, %xmm3 mulss 60 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 80 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 addss %xmm2, %xmm4 movss 68 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 addss %xmm2, %xmm5 movss 72 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 mulss 76 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 96 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 5 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 addss %xmm3, %xmm4 movss 84 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 addss %xmm3, %xmm5 movss 88 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 mulss 92 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 112 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm2 addss %xmm2, %xmm4 movss 
100 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 addss %xmm2, %xmm5 movss 104 * SIZE(BB), %xmm2 mulss %xmm1, %xmm2 mulss 108 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 128 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 7 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 addss %xmm3, %xmm4 movss 116 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 addss %xmm3, %xmm5 movss 120 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 mulss 124 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 144 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 movss 4 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm5 movss 8 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 16 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 1 * SIZE(AA), %xmm0 addl $ 1 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: leal (LDC, LDC, 2), %eax movsd (%esi), %xmm0 movhps (%esi, LDC), %xmm0 shufps $0, %xmm5, %xmm4 mulps %xmm3, %xmm4 addps %xmm4, %xmm0 movlps %xmm0, (%esi) movhps %xmm0, (%esi, LDC) movsd (%esi, LDC, 2), %xmm0 movhps (%esi, %eax), %xmm0 shufps $0, %xmm7, %xmm6 mulps %xmm3, %xmm6 addps %xmm6, %xmm0 movlps %xmm0, (%esi, LDC, 2) movhps %xmm0, (%esi, %eax) ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leal (, LDC, 4), %eax addl %eax, C # c += 4 * ldc decl J # j -- jg .L01 ALIGN_4 .L40: testl $2, N je .L80 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax leal BUFFER, %ecx sarl $2, %eax jle .L45 ALIGN_4 .L42: prefetchnta 80 * SIZE(%edi) #if defined(OPTERON) || defined(BARCELONA) prefetchw 112 * SIZE(%ecx) prefetchw 120 * SIZE(%ecx) #endif #ifdef PENTIUM4 prefetcht1 112 * SIZE(%ecx) #endif #ifdef HAVE_SSE2 movss 0 * SIZE(%edi), %xmm0 movss 1 * SIZE(%edi), %xmm1 movss 2 * SIZE(%edi), %xmm2 movss 3 * SIZE(%edi), %xmm3 movss 4 * SIZE(%edi), %xmm4 movss 5 * SIZE(%edi), %xmm5 movss 6 * SIZE(%edi), %xmm6 movss 7 * SIZE(%edi), %xmm7 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 shufps $0, %xmm4, %xmm4 shufps $0, %xmm5, %xmm5 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps %xmm4, 16 * SIZE(%ecx) movaps %xmm5, 20 * SIZE(%ecx) movaps %xmm6, 24 * SIZE(%ecx) movaps %xmm7, 28 * SIZE(%ecx) #else movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 movd 2 * SIZE(%edi), %mm2 movd 3 * SIZE(%edi), %mm3 movd 4 * SIZE(%edi), %mm4 movd 5 * SIZE(%edi), %mm5 movd 6 * SIZE(%edi), %mm6 movd 7 * SIZE(%edi), %mm7 movd %mm0, 0 * SIZE(%ecx) movd %mm0, 1 * SIZE(%ecx) movd %mm0, 2 * SIZE(%ecx) movd %mm0, 3 * SIZE(%ecx) movd %mm1, 4 * SIZE(%ecx) movd %mm1, 5 * SIZE(%ecx) movd %mm1, 6 * SIZE(%ecx) movd %mm1, 7 * SIZE(%ecx) movd %mm2, 8 * SIZE(%ecx) movd %mm2, 9 * SIZE(%ecx) movd %mm2, 10 * SIZE(%ecx) movd %mm2, 11 * SIZE(%ecx) movd %mm3, 12 * SIZE(%ecx) movd %mm3, 13 * SIZE(%ecx) movd %mm3, 14 * SIZE(%ecx) movd %mm3, 15 * SIZE(%ecx) movd %mm4, 16 * SIZE(%ecx) movd %mm4, 17 * SIZE(%ecx) movd %mm4, 18 * SIZE(%ecx) movd %mm4, 19 * SIZE(%ecx) movd %mm5, 20 * SIZE(%ecx) movd %mm5, 21 * SIZE(%ecx) movd %mm5, 22 * SIZE(%ecx) movd %mm5, 23 * SIZE(%ecx) movd %mm6, 24 * SIZE(%ecx) movd %mm6, 25 * SIZE(%ecx) movd %mm6, 26 * SIZE(%ecx) movd %mm6, 27 * SIZE(%ecx) movd %mm7, 28 
* SIZE(%ecx) movd %mm7, 29 * SIZE(%ecx) movd %mm7, 30 * SIZE(%ecx) movd %mm7, 31 * SIZE(%ecx) #endif addl $ 8 * SIZE, %edi addl $32 * SIZE, %ecx decl %eax jne .L42 ALIGN_4 .L45: movl K, %eax andl $3, %eax BRANCH jle .L50 ALIGN_4 .L46: #ifdef HAVE_SSE2 movss 0 * SIZE(%edi), %xmm0 movss 1 * SIZE(%edi), %xmm1 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) #else movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 movd %mm0, 0 * SIZE(%ecx) movd %mm0, 1 * SIZE(%ecx) movd %mm0, 2 * SIZE(%ecx) movd %mm0, 3 * SIZE(%ecx) movd %mm1, 4 * SIZE(%ecx) movd %mm1, 5 * SIZE(%ecx) movd %mm1, 6 * SIZE(%ecx) movd %mm1, 7 * SIZE(%ecx) #endif addl $2 * SIZE, %edi addl $8 * SIZE, %ecx decl %eax jne .L46 ALIGN_4 .L50: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movaps 0 * SIZE(AA), %xmm0 movaps 16 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 #ifdef HAVE_3DNOW prefetchw 4 * SIZE(%esi) prefetchw 4 * SIZE(%esi, LDC) #elif defined(HAVE_SSE) || defined(HAVE_SSE2) prefetcht2 4 * SIZE(%esi) prefetcht2 4 * SIZE(%esi, LDC) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L55 ALIGN_4 .L52: mulps %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 8 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 20 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps 12 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 28 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps 48 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 mulps 36 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 40 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 20 * SIZE(AA), %xmm1 mulps %xmm1, %xmm2 mulps 44 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 64 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 24 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 52 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 80 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jg 
.L56 ALIGN_4 .L58: movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 4 * SIZE(%esi), %xmm1 movhps 6 * SIZE(%esi), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 4 * SIZE(%esi) movhps %xmm1, 6 * SIZE(%esi) movsd 0 * SIZE(%esi, LDC), %xmm0 movhps 2 * SIZE(%esi, LDC), %xmm0 movsd 4 * SIZE(%esi, LDC), %xmm1 movhps 6 * SIZE(%esi, LDC), %xmm1 pshufd $0x50, %xmm5, %xmm2 pshufd $0xfa, %xmm5, %xmm5 mulps %xmm3, %xmm2 mulps %xmm3, %xmm5 addps %xmm2, %xmm0 addps %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC) movhps %xmm0, 2 * SIZE(%esi, LDC) movlps %xmm1, 4 * SIZE(%esi, LDC) movhps %xmm1, 6 * SIZE(%esi, LDC) addl $8 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L51 ALIGN_4 .L60: testl $2, M je .L70 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movsd 0 * SIZE(AA), %xmm0 movsd 8 * SIZE(AA), %xmm1 movsd 0 * SIZE(BB), %xmm2 movsd 16 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L65 ALIGN_4 .L62: #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movsd 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movsd 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movsd 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movsd 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movsd 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movsd 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movsd 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movsd 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movsd 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movsd 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movsd 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movsd 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movsd 80 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L66 ALIGN_4 .L68: addps %xmm6, %xmm4 addps %xmm7, %xmm5 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 
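/* Write-back pattern used throughout this file: C is complex while the
   accumulators hold real products.  ALPHA holds the alpha_r/alpha_i pair
   replicated across the register, and shufps/pshufd with $0x50 (and $0xfa
   for the upper half) expand each real accumulator entry x into (x, x), so
   the multiply produces (x*alpha_r, x*alpha_i) pairs that are added to the
   complex C elements being loaded here.  This is the real-product update
   step of the gemm3m scheme, which forms a complex matrix product from
   three real multiplications. */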
movsd 0 * SIZE(%esi, LDC), %xmm1 movhps 2 * SIZE(%esi, LDC), %xmm1 shufps $0x50, %xmm4, %xmm4 shufps $0x50, %xmm5, %xmm5 mulps %xmm3, %xmm4 mulps %xmm3, %xmm5 addps %xmm4, %xmm0 addps %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 0 * SIZE(%esi, LDC) movhps %xmm1, 2 * SIZE(%esi, LDC) addl $4 * SIZE, %esi ALIGN_4 .L70: testl $1, M je .L79 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movss 0 * SIZE(AA), %xmm0 movss 4 * SIZE(AA), %xmm1 movss 0 * SIZE(BB), %xmm2 movss 16 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L75 ALIGN_4 .L72: mulss %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 20 * SIZE(BB), %xmm0 addss %xmm3, %xmm4 movss 24 * SIZE(BB), %xmm3 addss %xmm0, %xmm5 movss 3 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 28 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 48 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 mulss 36 * SIZE(BB), %xmm1 addss %xmm2, %xmm4 movss 40 * SIZE(BB), %xmm2 addss %xmm1, %xmm5 movss 5 * SIZE(AA), %xmm1 mulss %xmm1, %xmm2 mulss 44 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 64 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 52 * SIZE(BB), %xmm1 addss %xmm3, %xmm4 movss 56 * SIZE(BB), %xmm3 addss %xmm1, %xmm5 movss 7 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 60 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 80 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulss %xmm0, %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 addl $ 1 * SIZE, AA addl $ 8 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: addss %xmm6, %xmm4 addss %xmm7, %xmm5 movsd (%esi), %xmm0 movhps (%esi, LDC), %xmm0 shufps $0, %xmm5, %xmm4 mulps %xmm3, %xmm4 addps %xmm4, %xmm0 movlps %xmm0, (%esi) movhps %xmm0, (%esi, LDC) ALIGN_4 .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax addl %eax, C ALIGN_4 .L80: testl $1, N je .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax leal BUFFER, %ecx sarl $3, %eax jle .L85 ALIGN_4 .L82: prefetchnta 80 * SIZE(%edi) #if defined(OPTERON) || defined(BARCELONA) prefetchw 112 * SIZE(%ecx) prefetchw 120 * SIZE(%ecx) #endif #ifdef PENTIUM4 prefetcht1 112 * SIZE(%ecx) #endif #ifdef HAVE_SSE2 movss 0 * SIZE(%edi), %xmm0 movss 1 * SIZE(%edi), %xmm1 movss 2 * SIZE(%edi), 
%xmm2 movss 3 * SIZE(%edi), %xmm3 movss 4 * SIZE(%edi), %xmm4 movss 5 * SIZE(%edi), %xmm5 movss 6 * SIZE(%edi), %xmm6 movss 7 * SIZE(%edi), %xmm7 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 shufps $0, %xmm4, %xmm4 shufps $0, %xmm5, %xmm5 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps %xmm4, 16 * SIZE(%ecx) movaps %xmm5, 20 * SIZE(%ecx) movaps %xmm6, 24 * SIZE(%ecx) movaps %xmm7, 28 * SIZE(%ecx) #else movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 movd 2 * SIZE(%edi), %mm2 movd 3 * SIZE(%edi), %mm3 movd 4 * SIZE(%edi), %mm4 movd 5 * SIZE(%edi), %mm5 movd 6 * SIZE(%edi), %mm6 movd 7 * SIZE(%edi), %mm7 movd %mm0, 0 * SIZE(%ecx) movd %mm0, 1 * SIZE(%ecx) movd %mm0, 2 * SIZE(%ecx) movd %mm0, 3 * SIZE(%ecx) movd %mm1, 4 * SIZE(%ecx) movd %mm1, 5 * SIZE(%ecx) movd %mm1, 6 * SIZE(%ecx) movd %mm1, 7 * SIZE(%ecx) movd %mm2, 8 * SIZE(%ecx) movd %mm2, 9 * SIZE(%ecx) movd %mm2, 10 * SIZE(%ecx) movd %mm2, 11 * SIZE(%ecx) movd %mm3, 12 * SIZE(%ecx) movd %mm3, 13 * SIZE(%ecx) movd %mm3, 14 * SIZE(%ecx) movd %mm3, 15 * SIZE(%ecx) movd %mm4, 16 * SIZE(%ecx) movd %mm4, 17 * SIZE(%ecx) movd %mm4, 18 * SIZE(%ecx) movd %mm4, 19 * SIZE(%ecx) movd %mm5, 20 * SIZE(%ecx) movd %mm5, 21 * SIZE(%ecx) movd %mm5, 22 * SIZE(%ecx) movd %mm5, 23 * SIZE(%ecx) movd %mm6, 24 * SIZE(%ecx) movd %mm6, 25 * SIZE(%ecx) movd %mm6, 26 * SIZE(%ecx) movd %mm6, 27 * SIZE(%ecx) movd %mm7, 28 * SIZE(%ecx) movd %mm7, 29 * SIZE(%ecx) movd %mm7, 30 * SIZE(%ecx) movd %mm7, 31 * SIZE(%ecx) #endif addl $ 8 * SIZE, %edi addl $32 * SIZE, %ecx decl %eax jne .L82 ALIGN_4 .L85: movl K, %eax andl $7, %eax BRANCH jle .L90 ALIGN_4 .L86: #ifdef HAVE_SSE2 movss 0 * SIZE(%edi), %xmm0 shufps $0, %xmm0, %xmm0 movaps %xmm0, 0 * SIZE(%ecx) #else movd 0 * SIZE(%edi), %mm0 movd %mm0, 0 * SIZE(%ecx) movd %mm0, 1 * SIZE(%ecx) movd %mm0, 2 * SIZE(%ecx) movd %mm0, 3 * SIZE(%ecx) #endif addl $1 * SIZE, %edi addl $4 * SIZE, %ecx decl %eax jne .L86 ALIGN_4 .L90: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L100 ALIGN_4 .L91: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movaps 0 * SIZE(AA), %xmm0 movaps 16 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 #ifdef HAVE_3DNOW prefetchw 4 * SIZE(%esi) #elif defined(HAVE_SSE) || defined(HAVE_SSE2) prefetcht2 4 * SIZE(%esi) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L95 ALIGN_4 .L92: mulps %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm0, %xmm5 movaps 8 * SIZE(AA), %xmm0 mulps 8 * SIZE(BB), %xmm0 addps %xmm0, %xmm6 movaps 12 * SIZE(AA), %xmm0 mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 #if 
defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 movaps 20 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movaps 48 * SIZE(BB), %xmm3 mulps 20 * SIZE(BB), %xmm1 addps %xmm1, %xmm5 movaps 24 * SIZE(AA), %xmm1 mulps 24 * SIZE(BB), %xmm1 addps %xmm1, %xmm6 movaps 28 * SIZE(AA), %xmm1 mulps 28 * SIZE(BB), %xmm1 addps %xmm1, %xmm7 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L92 ALIGN_4 .L95: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(BB), %xmm2 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L96 ALIGN_4 .L98: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 4 * SIZE(%esi), %xmm1 movhps 6 * SIZE(%esi), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 4 * SIZE(%esi) movhps %xmm1, 6 * SIZE(%esi) addl $8 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L91 ALIGN_4 .L100: testl $2, M je .L110 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movsd 0 * SIZE(AA), %xmm0 movsd 8 * SIZE(AA), %xmm1 movsd 0 * SIZE(BB), %xmm2 movsd 16 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L105 ALIGN_4 .L102: mulps %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 movsd 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 16 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movsd 32 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 movsd 10 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movsd 20 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 24 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm6 movsd 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movsd 48 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L102 ALIGN_4 .L105: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L108 ALIGN_4 .L106: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 2 * SIZE(AA), %xmm0 movsd 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L106 ALIGN_4 .L108: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 shufps $0x50, %xmm4, %xmm4 mulps %xmm3, %xmm4 addps %xmm4, %xmm0 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) addl $4 * SIZE, 
%esi ALIGN_4 .L110: testl $1, M je .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movss 0 * SIZE(AA), %xmm0 movss 4 * SIZE(AA), %xmm1 movss 0 * SIZE(BB), %xmm2 movss 16 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L115 ALIGN_4 .L112: mulss %xmm0, %xmm2 #if defined(OPTERON) || defined(BARCELONA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 movss 32 * SIZE(BB), %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm0, %xmm5 movss 2 * SIZE(AA), %xmm0 mulss 8 * SIZE(BB), %xmm0 addss %xmm0, %xmm6 movss 3 * SIZE(AA), %xmm0 mulss 12 * SIZE(BB), %xmm0 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm3 movss 5 * SIZE(AA), %xmm1 addss %xmm3, %xmm4 movss 48 * SIZE(BB), %xmm3 mulss 20 * SIZE(BB), %xmm1 addss %xmm1, %xmm5 movss 6 * SIZE(AA), %xmm1 mulss 24 * SIZE(BB), %xmm1 addss %xmm1, %xmm6 movss 7 * SIZE(AA), %xmm1 mulss 28 * SIZE(BB), %xmm1 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L112 ALIGN_4 .L115: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulss %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 movss 4 * SIZE(BB), %xmm2 addl $ 1 * SIZE, AA addl $ 4 * SIZE, BB decl %eax jg .L116 ALIGN_4 .L118: addss %xmm5, %xmm4 addss %xmm7, %xmm6 addss %xmm6, %xmm4 movsd (%esi), %xmm0 shufps $0, %xmm5, %xmm4 mulps %xmm3, %xmm4 addps %xmm4, %xmm0 movlps %xmm0, (%esi) ALIGN_4 .L999: EMMS movl OLD_STACK, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm3m_kernel_4x4_penryn.S000066400000000000000000001002301313527062700222020ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define ARG_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define ARG_LDC 36 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #define PREFETCH_R (8 * 4) #define PREFETCHSIZE (8 * 17 + 4) #define PREFETCH prefetcht0 #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define C1 %esi #define I %ebx PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC #ifdef TRMMKERNEL movl OFFSET, %eax #ifndef LEFT negl %eax #endif movl %eax, KK #endif subl $-32 * SIZE, A subl $-32 * SIZE, B sall $ZBASE_SHIFT, LDC movl N, %eax sarl $2, %eax movl %eax, J jle .L40 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sall $BASE_SHIFT + 2, %eax leal (B, %eax), %eax movl %eax, BX movl C, C1 movl A, AA movl M, I sarl $2, I jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #endif movl BX, %eax prefetcht2 -32 * SIZE(%eax) subl $-16 * SIZE, BX leal (C1, LDC, 2), %eax movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 prefetcht0 3 * SIZE(C1) pxor %xmm5, %xmm5 prefetcht0 3 * SIZE(C1, LDC) pxor %xmm6, %xmm6 prefetcht0 3 * SIZE(%eax) pxor %xmm7, %xmm7 prefetcht0 3 * SIZE(%eax, LDC) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, 
%xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -8 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 subl $-32 * SIZE, BB pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 subl $-32 * SIZE, AA pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -32 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -32 * SIZE(AA), %xmm0 subl $1, %eax jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L18 ALIGN_4 .L16: addps %xmm2, %xmm7 pshufd $0x93, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm6 pshufd $0x93, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 pshufd $0x93, %xmm3, %xmm2 mulps %xmm0, %xmm3 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: addps %xmm3, %xmm6 addps %xmm2, %xmm7 movddup ALPHA, %xmm3 pshufd $0x39, %xmm5, %xmm2 pshufd $0x4e, %xmm6, %xmm0 pshufd $0x93, %xmm7, %xmm7 movaps %xmm4, %xmm6 unpcklps %xmm0, %xmm4 unpckhps %xmm0, %xmm6 movaps %xmm2, %xmm1 unpcklps %xmm7, %xmm2 unpckhps %xmm7, %xmm1 movaps %xmm4, %xmm5 unpcklps %xmm2, %xmm4 unpckhps %xmm2, %xmm5 movaps %xmm6, %xmm7 unpcklps %xmm1, %xmm6 unpckhps %xmm1, %xmm7 pshufd $0x93, %xmm5, %xmm5 pshufd $0x4e, %xmm6, %xmm6 pshufd $0x39, %xmm7, %xmm7 leal (C1, LDC, 2), %eax movsd 0 * SIZE(C1), %xmm0 movhps 2 * SIZE(C1), %xmm0 movsd 4 * SIZE(C1), %xmm1 movhps 6 * SIZE(C1), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(C1) movhps %xmm0, 2 * SIZE(C1) movlps %xmm1, 4 * SIZE(C1) movhps %xmm1, 6 * SIZE(C1) movsd 0 * SIZE(C1, LDC), %xmm0 movhps 2 * SIZE(C1, LDC), %xmm0 movsd 4 * SIZE(C1, LDC), %xmm1 movhps 6 * SIZE(C1, LDC), %xmm1 pshufd $0x50, %xmm5, %xmm2 pshufd $0xfa, %xmm5, %xmm5 mulps %xmm3, %xmm2 mulps %xmm3, %xmm5 addps %xmm2, %xmm0 addps %xmm5, %xmm1 movlps 
%xmm0, 0 * SIZE(C1, LDC) movhps %xmm0, 2 * SIZE(C1, LDC) movlps %xmm1, 4 * SIZE(C1, LDC) movhps %xmm1, 6 * SIZE(C1, LDC) movsd 0 * SIZE(%eax), %xmm0 movhps 2 * SIZE(%eax), %xmm0 movsd 4 * SIZE(%eax), %xmm1 movhps 6 * SIZE(%eax), %xmm1 pshufd $0x50, %xmm6, %xmm2 pshufd $0xfa, %xmm6, %xmm6 mulps %xmm3, %xmm2 mulps %xmm3, %xmm6 addps %xmm2, %xmm0 addps %xmm6, %xmm1 movlps %xmm0, 0 * SIZE(%eax) movhps %xmm0, 2 * SIZE(%eax) movlps %xmm1, 4 * SIZE(%eax) movhps %xmm1, 6 * SIZE(%eax) movsd 0 * SIZE(%eax, LDC), %xmm0 movhps 2 * SIZE(%eax, LDC), %xmm0 movsd 4 * SIZE(%eax, LDC), %xmm1 movhps 6 * SIZE(%eax, LDC), %xmm1 pshufd $0x50, %xmm7, %xmm2 pshufd $0xfa, %xmm7, %xmm7 mulps %xmm3, %xmm2 mulps %xmm3, %xmm7 addps %xmm2, %xmm0 addps %xmm7, %xmm1 movlps %xmm0, 0 * SIZE(%eax, LDC) movhps %xmm0, 2 * SIZE(%eax, LDC) movlps %xmm1, 4 * SIZE(%eax, LDC) movhps %xmm1, 6 * SIZE(%eax, LDC) addl $8 * SIZE, C1 decl I jg .L11 ALIGN_4 .L20: movl M, I testl $2, I jle .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif pxor %xmm4, %xmm4 movaps -32 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movaps -32 * SIZE(BB), %xmm1 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm2 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 pshufd $0xee, %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 pshufd $0xfa, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm7 pshufd $0x44, %xmm0, %xmm2 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 pshufd $0xee, %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 pshufd $0xfa, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm7 pshufd $0x44, %xmm0, %xmm2 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -12 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 pshufd $0xee, %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 pshufd $0xfa, %xmm1, %xmm3 movaps -8 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm7 pshufd $0x44, %xmm0, %xmm2 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -4 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 pshufd $0xee, %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm6 pshufd $0xfa, %xmm1, %xmm3 movaps 0 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm7 subl $-16 * SIZE, AA subl $-32 * SIZE, BB subl $1, %eax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L28 ALIGN_4 .L26: pshufd $0x44, %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 pshufd $0x50, %xmm1, %xmm3 
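/* Unlike the buffered kernel earlier in this archive, this Penryn variant
   streams B directly and uses no packed BUFFER: pshufd $0x44 duplicates the
   pair of A values into both halves of the register (the unrolled .L22 loop
   above additionally uses $0xee for the second pair), while pshufd
   $0x50/$0xfa split one four-wide load of B into (b0,b0,b1,b1) and
   (b2,b2,b3,b3), giving the 2x4 block of products in xmm4/xmm5 without a
   separate copy pass. */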
mulps %xmm2, %xmm3 addps %xmm3, %xmm4 pshufd $0xfa, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm5 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: movddup ALPHA, %xmm3 addps %xmm6, %xmm4 addps %xmm7, %xmm5 leal (C1, LDC, 2), %eax movsd 0 * SIZE(C1), %xmm0 movhps 2 * SIZE(C1), %xmm0 movsd 0 * SIZE(C1, LDC), %xmm1 movhps 2 * SIZE(C1, LDC), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(C1) movhps %xmm0, 2 * SIZE(C1) movlps %xmm1, 0 * SIZE(C1, LDC) movhps %xmm1, 2 * SIZE(C1, LDC) movsd 0 * SIZE(%eax), %xmm0 movhps 2 * SIZE(%eax), %xmm0 movsd 0 * SIZE(%eax, LDC), %xmm1 movhps 2 * SIZE(%eax, LDC), %xmm1 pshufd $0x50, %xmm5, %xmm2 pshufd $0xfa, %xmm5, %xmm5 mulps %xmm3, %xmm2 mulps %xmm3, %xmm5 addps %xmm2, %xmm0 addps %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%eax) movhps %xmm0, 2 * SIZE(%eax) movlps %xmm1, 0 * SIZE(%eax, LDC) movhps %xmm1, 2 * SIZE(%eax, LDC) addl $4 * SIZE, C1 ALIGN_4 .L30: movl M, I testl $1, I jle .L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax addl %eax, AA leal (BB, %eax, 4), BB #endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movaps -32 * SIZE(BB), %xmm1 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -24 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -20 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -28 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -16 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -26 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -8 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -4 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -24 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps 0 * SIZE(BB), %xmm1 subl $ -8 * SIZE, AA subl $-32 * SIZE, BB subl $1, %eax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L38 ALIGN_4 .L36: pshufd $0x00, %xmm0, %xmm2 movss -31 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: movddup ALPHA, %xmm3 leal (C1, LDC, 2), %eax movsd (C1), %xmm0 movhps (C1, LDC), %xmm0 movsd (%eax), %xmm1 movhps (%eax, LDC), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, (C1) movhps %xmm0, (C1, LDC) movlps %xmm1, (%eax) movhps %xmm1, (%eax, LDC) ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif movl BB, B leal (, LDC, 4), %eax addl %eax, C decl J jg .L01 ALIGN_4 .L40: movl 
N, %eax testl $2, %eax jle .L70 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, C1 movl A, AA movl M, I sarl $2, I jle .L50 ALIGN_4 .L41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 prefetcht0 3 * SIZE(C1) pxor %xmm5, %xmm5 prefetcht0 3 * SIZE(C1, LDC) pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0xff, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0xff, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -16 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -12 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0xff, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -8 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -4 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0xff, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps 0 * SIZE(AA), %xmm0 subl $-32 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L48 ALIGN_4 .L46: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: addps %xmm2, %xmm4 addps %xmm3, %xmm5 movddup ALPHA, %xmm3 addps %xmm6, %xmm4 addps %xmm7, %xmm5 movsd 0 * SIZE(C1), %xmm0 movhps 2 * SIZE(C1), %xmm0 movsd 4 * SIZE(C1), %xmm1 movhps 6 * SIZE(C1), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(C1) movhps %xmm0, 2 * SIZE(C1) movlps %xmm1, 4 * SIZE(C1) movhps %xmm1, 6 * SIZE(C1) movsd 0 * SIZE(C1, LDC), %xmm0 movhps 2 * SIZE(C1, LDC), %xmm0 movsd 4 * SIZE(C1, LDC), %xmm1 movhps 6 * SIZE(C1, LDC), %xmm1 pshufd $0x50, %xmm5, %xmm2 pshufd $0xfa, %xmm5, %xmm5 mulps %xmm3, %xmm2 mulps %xmm3, %xmm5 addps %xmm2, %xmm0 addps %xmm5, 
%xmm1 movlps %xmm0, 0 * SIZE(C1, LDC) movhps %xmm0, 2 * SIZE(C1, LDC) movlps %xmm1, 4 * SIZE(C1, LDC) movhps %xmm1, 6 * SIZE(C1, LDC) addl $8 * SIZE, C1 decl I jg .L41 ALIGN_4 .L50: movl M, I testl $2, I jle .L60 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm3, %xmm3 movaps -32 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L55 ALIGN_4 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x44, %xmm0, %xmm2 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 pshufd $0xee, %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 pshufd $0x44, %xmm0, %xmm2 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 pshufd $0xee, %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 pshufd $0x44, %xmm0, %xmm2 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 pshufd $0xee, %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 pshufd $0x44, %xmm0, %xmm2 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 mulps %xmm2, %xmm3 pshufd $0xee, %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 pshufd $0xfa, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 subl $-16 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L58 ALIGN_4 .L56: pshufd $0x44, %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 addps %xmm3, %xmm4 pshufd $0x50, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: addps %xmm3, %xmm4 addps %xmm5, %xmm4 movddup ALPHA, %xmm3 movsd 0 * SIZE(C1), %xmm0 movhps 2 * SIZE(C1), %xmm0 movsd 0 * SIZE(C1, LDC), %xmm1 movhps 2 * SIZE(C1, LDC), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(C1) movhps %xmm0, 2 * SIZE(C1) movlps %xmm1, 0 * SIZE(C1, LDC) movhps %xmm1, 2 * SIZE(C1, LDC) addl $4 * SIZE, C1 ALIGN_4 .L60: movl M, I testl $1, I jle .L69 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax addl %eax, AA leal (BB, %eax, 2), BB #endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movsd -32 * SIZE(BB), %xmm1 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L65 ALIGN_4 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -30 * SIZE(BB), %xmm1 
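/* Single-row, two-column remainder: pshufd $0x00/$0x55 broadcast successive
   scalar A values against two-wide movsd loads of B, so xmm4 and xmm5 each
   collect (a*b0, a*b1) partial sums that are combined at .L68.  The negative
   displacements such as -32 * SIZE(AA) stem from the subl $-32 * SIZE, A and
   subl $-32 * SIZE, B bias applied in the prologue, presumably so that more
   of the streamed data stays within the short signed 8-bit displacement
   encoding. */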
pshufd $0x55, %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm5 movsd -28 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -26 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -28 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm5 movsd -24 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -22 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -26 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm5 movsd -20 * SIZE(BB), %xmm1 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -18 * SIZE(BB), %xmm1 pshufd $0x55, %xmm0, %xmm2 movsd -24 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm5 movsd -16 * SIZE(BB), %xmm1 subl $ -8 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L68 ALIGN_4 .L66: pshufd $0x00, %xmm0, %xmm2 movss -31 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm4 movsd -30 * SIZE(BB), %xmm1 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L66 ALIGN_4 .L68: movddup ALPHA, %xmm3 addps %xmm5, %xmm4 movsd (C1), %xmm0 movhps (C1, LDC), %xmm0 pshufd $0x50, %xmm4, %xmm2 mulps %xmm3, %xmm2 addps %xmm2, %xmm0 movlps %xmm0, (C1) movhps %xmm0, (C1, LDC) ALIGN_4 .L69: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif movl BB, B leal (, LDC, 2), %eax addl %eax, C ALIGN_4 .L70: movl N, %eax testl $1, %eax jle .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, C1 movl A, AA movl M, I sarl $2, I jle .L80 ALIGN_4 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA addl %eax, BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movsd -32 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 prefetcht0 3 * SIZE(C1) pxor %xmm5, %xmm5 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x55, %xmm1, %xmm2 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x55, %xmm1, %xmm2 movsd -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x55, %xmm1, %xmm2 movsd -26 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x55, %xmm1, %xmm2 movsd -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 subl $-32 * SIZE, AA subl $ -8 * SIZE, BB subl $1, %eax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L78 ALIGN_4 .L76: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 movss -31 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps 
-28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: movddup ALPHA, %xmm3 addps %xmm2, %xmm4 addps %xmm5, %xmm4 movsd 0 * SIZE(C1), %xmm0 movhps 2 * SIZE(C1), %xmm0 movsd 4 * SIZE(C1), %xmm1 movhps 6 * SIZE(C1), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(C1) movhps %xmm0, 2 * SIZE(C1) movlps %xmm1, 4 * SIZE(C1) movhps %xmm1, 6 * SIZE(C1) addl $8 * SIZE, C1 decl I jg .L71 ALIGN_4 .L80: movl M, I testl $2, I jle .L90 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA addl %eax, BB #endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm3, %xmm3 movsd -32 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L85 ALIGN_4 .L82: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x55, %xmm1, %xmm2 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movsd -26 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x55, %xmm1, %xmm2 movsd -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movsd -22 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x55, %xmm1, %xmm2 movsd -26 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 movsd -18 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x55, %xmm1, %xmm2 movsd -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -16 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 subl $-16 * SIZE, AA subl $ -8 * SIZE, BB subl $1, %eax jne .L82 ALIGN_4 .L85: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L88 ALIGN_4 .L86: pshufd $0x00, %xmm1, %xmm2 movss -31 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movsd -30 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 addl $2 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L86 ALIGN_4 .L88: movddup ALPHA, %xmm3 addps %xmm5, %xmm4 movsd 0 * SIZE(C1), %xmm0 movhps 2 * SIZE(C1), %xmm0 pshufd $0x50, %xmm4, %xmm2 mulps %xmm3, %xmm2 addps %xmm2, %xmm0 movlps %xmm0, 0 * SIZE(C1) movhps %xmm0, 2 * SIZE(C1) addl $4 * SIZE, C1 ALIGN_4 .L90: movl M, I testl $1, I jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax addl %eax, AA addl %eax, BB #endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movsd -32 * SIZE(BB), %xmm1 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L95 ALIGN_4 .L92: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulps %xmm0, %xmm1 movsd -30 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -30 * SIZE(BB), %xmm1 mulps 
%xmm0, %xmm1 movsd -28 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movsd -26 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -26 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movsd -24 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -24 * SIZE(BB), %xmm1 subl $-8 * SIZE, AA subl $-8 * SIZE, BB subl $1, %eax jne .L92 ALIGN_4 .L95: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L98 ALIGN_4 .L96: mulss %xmm0, %xmm1 movss -31 * SIZE(AA), %xmm0 addss %xmm1, %xmm4 movss -31 * SIZE(BB), %xmm1 addl $1 * SIZE, AA addl $1 * SIZE, BB decl %eax jg .L96 ALIGN_4 .L98: movddup ALPHA, %xmm3 haddps %xmm4, %xmm4 movsd 0 * SIZE(C1), %xmm0 pshufd $0x50, %xmm4, %xmm2 mulps %xmm3, %xmm2 addps %xmm2, %xmm0 movlps %xmm0, 0 * SIZE(C1) ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm3m_kernel_4x4_prescott.S000066400000000000000000001231741313527062700225460ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define OLD_M 4 + STACK(%esi) #define OLD_N 8 + STACK(%esi) #define OLD_K 12 + STACK(%esi) #define OLD_ALPHA_R 16 + STACK(%esi) #define OLD_ALPHA_I 20 + STACK(%esi) #define OLD_A 24 + STACK(%esi) #define OLD_B 28 + STACK(%esi) #define OLD_C 32 + STACK(%esi) #define OLD_LDC 36 + STACK(%esi) #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define BUFFER 128(%esp) #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht0 #define PREFETCHSIZE 96 #endif #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHSIZE 96 #endif #ifdef PENTIUMM #define PREFETCH prefetcht0 #define PREFETCHSIZE 96 #endif #define AA %edx #define BB %ecx #define LDC %ebp #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA); \ addps %xmm2, %xmm4; \ movshdup 0 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movsldup 4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm6; \ movshdup 4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ movaps 4 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ addps %xmm2, %xmm7; \ movsldup 8 * SIZE + 2 * (address) * SIZE(BB), %xmm2 #define KERNEL2(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movshdup 8 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movsldup 12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm6; \ movshdup 12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ movaps 8 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ addps %xmm2, %xmm7; \ movsldup 32 * SIZE + 2 * (address) * SIZE(BB), %xmm2 #define KERNEL3(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movshdup 16 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movsldup 20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm6; \ movshdup 20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ movaps 12 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ addps %xmm3, %xmm7; \ movsldup 24 * SIZE + 2 * (address) * SIZE(BB), %xmm3 #define KERNEL4(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movshdup 24 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movsldup 28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm6; \ movshdup 28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ movaps 32 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ addps %xmm3, %xmm7; \ movsldup 48 * SIZE + 2 * (address) * SIZE(BB), %xmm3 #define KERNEL5(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movshdup 32 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movsldup 36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm6; \ movshdup 36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ movaps 20 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ addps %xmm2, %xmm7 #define KERNEL6(address) \ movsldup 40 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movshdup 40 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps 
%xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movsldup 44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm6; \ movshdup 44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ movaps 24 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ addps %xmm2, %xmm7; \ movsldup 64 * SIZE + 2 * (address) * SIZE(BB), %xmm2 #define KERNEL7(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movshdup 48 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movsldup 52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm6; \ movshdup 52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ movaps 28 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ addps %xmm3, %xmm7; \ movsldup 56 * SIZE + 2 * (address) * SIZE(BB), %xmm3 #define KERNEL8(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movshdup 56 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movsldup 60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm6; \ movshdup 60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ movaps 48 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ addps %xmm3, %xmm7; \ movsldup 80 * SIZE + 2 * (address) * SIZE(BB), %xmm3 PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE, %esp movl OLD_M, %ebx andl $-1024, %esp # align stack STACK_TOUCHING movl OLD_N, %eax movl OLD_K, %ecx movl OLD_A, %edx movss OLD_ALPHA_R, %xmm0 movss OLD_ALPHA_I, %xmm1 movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movl OLD_B, %edi movl OLD_C, %ebx unpcklps %xmm1, %xmm0 movlhps %xmm0, %xmm0 movaps %xmm0, ALPHA movl %ebx, C movl OLD_LDC, LDC #ifdef TRMMKERNEL movss %xmm4, OFFSET movss %xmm4, KK #ifndef LEFT negl KK #endif #endif sall $ZBASE_SHIFT, LDC sarl $2, %eax movl %eax, J jle .L40 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ leal BUFFER, %ecx movl K, %eax sarl $2, %eax jle .L05 ALIGN_4 .L02: movddup 0 * SIZE(%edi), %xmm0 movddup 2 * SIZE(%edi), %xmm1 movddup 4 * SIZE(%edi), %xmm2 movddup 6 * SIZE(%edi), %xmm3 movddup 8 * SIZE(%edi), %xmm4 movddup 10 * SIZE(%edi), %xmm5 movddup 12 * SIZE(%edi), %xmm6 movddup 14 * SIZE(%edi), %xmm7 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps %xmm4, 16 * SIZE(%ecx) movaps %xmm5, 20 * SIZE(%ecx) movaps %xmm6, 24 * SIZE(%ecx) movaps %xmm7, 28 * SIZE(%ecx) # prefetcht1 128 * SIZE(%ecx) prefetcht0 112 * SIZE(%edi) addl $16 * SIZE, %edi addl $32 * SIZE, %ecx decl %eax jne .L02 ALIGN_2 .L05: movl K, %eax andl $3, %eax BRANCH jle .L10 ALIGN_2 .L06: movddup 0 * SIZE(%edi), %xmm0 movddup 2 * SIZE(%edi), %xmm1 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) addl $4 * SIZE, %edi addl $8 * SIZE, %ecx decl %eax jne .L06 ALIGN_4 .L10: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps 16 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movsldup 0 * SIZE(BB), %xmm2 
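/* Note (descriptive annotation, inferred from the surrounding code): this is
   the 4x4 micro-kernel of the 3M scheme -- a purely real 4x4 SGEMM tile whose
   result is combined with the complex ALPHA only at the store in .L18.
   xmm4..xmm7 each accumulate one 4-element column of the tile; A is read four
   floats at a time into xmm0/xmm1, and because the copy loop duplicated B
   pairwise into BUFFER, movsldup/movshdup inside KERNEL1..KERNEL8 broadcast
   the four B values of each k step.  The prefetchnta just below touch the
   four C columns (%esi, %esi+LDC, %esi+2*LDC, %esi+3*LDC). */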
pxor %xmm6, %xmm6 movsldup 16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 leal (LDC, LDC, 2), %eax prefetchnta 4 * SIZE(%esi) prefetchnta 4 * SIZE(%esi, LDC) prefetchnta 4 * SIZE(%esi, LDC, 2) prefetchnta 4 * SIZE(%esi, %eax) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $4, %eax #endif movl %eax, KKK #endif #if 1 andl $-8, %eax sall $4, %eax je .L15 .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) cmpl $128 * 1, %eax jle .L12 KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) cmpl $128 * 2, %eax jle .L12 KERNEL1(32 * 2) KERNEL2(32 * 2) KERNEL3(32 * 2) KERNEL4(32 * 2) KERNEL5(32 * 2) KERNEL6(32 * 2) KERNEL7(32 * 2) KERNEL8(32 * 2) cmpl $128 * 3, %eax jle .L12 KERNEL1(32 * 3) KERNEL2(32 * 3) KERNEL3(32 * 3) KERNEL4(32 * 3) KERNEL5(32 * 3) KERNEL6(32 * 3) KERNEL7(32 * 3) KERNEL8(32 * 3) cmpl $128 * 4, %eax jle .L12 KERNEL1(32 * 4) KERNEL2(32 * 4) KERNEL3(32 * 4) KERNEL4(32 * 4) KERNEL5(32 * 4) KERNEL6(32 * 4) KERNEL7(32 * 4) KERNEL8(32 * 4) cmpl $128 * 5, %eax jle .L12 KERNEL1(32 * 5) KERNEL2(32 * 5) KERNEL3(32 * 5) KERNEL4(32 * 5) KERNEL5(32 * 5) KERNEL6(32 * 5) KERNEL7(32 * 5) KERNEL8(32 * 5) cmpl $128 * 6, %eax jle .L12 KERNEL1(32 * 6) KERNEL2(32 * 6) KERNEL3(32 * 6) KERNEL4(32 * 6) KERNEL5(32 * 6) KERNEL6(32 * 6) KERNEL7(32 * 6) KERNEL8(32 * 6) cmpl $128 * 7, %eax jle .L12 KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) #if 1 cmpl $128 * 8, %eax jle .L12 KERNEL1(32 * 8) KERNEL2(32 * 8) KERNEL3(32 * 8) KERNEL4(32 * 8) KERNEL5(32 * 8) KERNEL6(32 * 8) KERNEL7(32 * 8) KERNEL8(32 * 8) cmpl $128 * 9, %eax jle .L12 KERNEL1(32 * 9) KERNEL2(32 * 9) KERNEL3(32 * 9) KERNEL4(32 * 9) KERNEL5(32 * 9) KERNEL6(32 * 9) KERNEL7(32 * 9) KERNEL8(32 * 9) cmpl $128 * 10, %eax jle .L12 KERNEL1(32 * 10) KERNEL2(32 * 10) KERNEL3(32 * 10) KERNEL4(32 * 10) KERNEL5(32 * 10) KERNEL6(32 * 10) KERNEL7(32 * 10) KERNEL8(32 * 10) cmpl $128 * 11, %eax jle .L12 KERNEL1(32 * 11) KERNEL2(32 * 11) KERNEL3(32 * 11) KERNEL4(32 * 11) KERNEL5(32 * 11) KERNEL6(32 * 11) KERNEL7(32 * 11) KERNEL8(32 * 11) cmpl $128 * 12, %eax jle .L12 KERNEL1(32 * 12) KERNEL2(32 * 12) KERNEL3(32 * 12) KERNEL4(32 * 12) KERNEL5(32 * 12) KERNEL6(32 * 12) KERNEL7(32 * 12) KERNEL8(32 * 12) cmpl $128 * 13, %eax jle .L12 KERNEL1(32 * 13) KERNEL2(32 * 13) KERNEL3(32 * 13) KERNEL4(32 * 13) KERNEL5(32 * 13) KERNEL6(32 * 13) KERNEL7(32 * 13) KERNEL8(32 * 13) cmpl $128 * 14, %eax jle .L12 KERNEL1(32 * 14) KERNEL2(32 * 14) KERNEL3(32 * 14) KERNEL4(32 * 14) KERNEL5(32 * 14) KERNEL6(32 * 14) KERNEL7(32 * 14) KERNEL8(32 * 14) cmpl $128 * 15, %eax jle .L12 KERNEL1(32 * 15) KERNEL2(32 * 15) KERNEL3(32 * 15) KERNEL4(32 * 15) KERNEL5(32 * 15) KERNEL6(32 * 15) KERNEL7(32 * 15) KERNEL8(32 * 15) #else addl $128 * 4 * SIZE, BB addl $128 * 2 * SIZE, AA subl $128 * 8, %eax jg .L1X jmp .L15 #endif .L12: leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB ALIGN_4 #else sarl $3, %eax je .L15 ALIGN_4 .L12: KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) addl $32 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L12 ALIGN_4 #endif .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl 
$7, %eax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movshdup 0 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movsldup 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movshdup 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movsldup 8 * SIZE(BB), %xmm2 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: leal (LDC, LDC, 2), %eax movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 4 * SIZE(%esi), %xmm1 movhps 6 * SIZE(%esi), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 4 * SIZE(%esi) movhps %xmm1, 6 * SIZE(%esi) movsd 0 * SIZE(%esi, LDC), %xmm0 movhps 2 * SIZE(%esi, LDC), %xmm0 movsd 4 * SIZE(%esi, LDC), %xmm1 movhps 6 * SIZE(%esi, LDC), %xmm1 pshufd $0x50, %xmm5, %xmm2 pshufd $0xfa, %xmm5, %xmm5 mulps %xmm3, %xmm2 mulps %xmm3, %xmm5 addps %xmm2, %xmm0 addps %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC) movhps %xmm0, 2 * SIZE(%esi, LDC) movlps %xmm1, 4 * SIZE(%esi, LDC) movhps %xmm1, 6 * SIZE(%esi, LDC) movsd 0 * SIZE(%esi, LDC, 2), %xmm0 movhps 2 * SIZE(%esi, LDC, 2), %xmm0 movsd 4 * SIZE(%esi, LDC, 2), %xmm1 movhps 6 * SIZE(%esi, LDC, 2), %xmm1 pshufd $0x50, %xmm6, %xmm2 pshufd $0xfa, %xmm6, %xmm6 mulps %xmm3, %xmm2 mulps %xmm3, %xmm6 addps %xmm2, %xmm0 addps %xmm6, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC, 2) movhps %xmm0, 2 * SIZE(%esi, LDC, 2) movlps %xmm1, 4 * SIZE(%esi, LDC, 2) movhps %xmm1, 6 * SIZE(%esi, LDC, 2) movsd 0 * SIZE(%esi, %eax), %xmm0 movhps 2 * SIZE(%esi, %eax), %xmm0 movsd 4 * SIZE(%esi, %eax), %xmm1 movhps 6 * SIZE(%esi, %eax), %xmm1 pshufd $0x50, %xmm7, %xmm2 pshufd $0xfa, %xmm7, %xmm7 mulps %xmm3, %xmm2 mulps %xmm3, %xmm7 addps %xmm2, %xmm0 addps %xmm7, %xmm1 movlps %xmm0, 0 * SIZE(%esi, %eax) movhps %xmm0, 2 * SIZE(%esi, %eax) movlps %xmm1, 4 * SIZE(%esi, %eax) movhps %xmm1, 6 * SIZE(%esi, %eax) addl $8 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L11 ALIGN_4 .L20: testl $2, M je .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movddup 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movsd 0 * SIZE(BB), %xmm2 movsd 16 * SIZE(BB), %xmm3 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 movddup 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 12 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 movddup 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 32 * SIZE(BB), %xmm2 shufps $0x50, %xmm3, %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movsd 20 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm0, %xmm3 movddup 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movsd 24 * SIZE(BB), %xmm3 
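/* (annotation; still inside the k-unrolled-by-8 loop of the 2x4 edge case)
   Each step expands a pair of B values with shufps $0x50 to [b,b,b',b'] and
   multiplies it by the movddup'ed A pair [a0,a1,a0,a1], so one mulps/addps
   produces a 2x2 sub-tile: xmm4 appears to cover C columns 0-1 and xmm5
   columns 2-3, which is how .L28 writes them back. */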
shufps $0x50, %xmm3, %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movsd 28 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm0, %xmm3 movddup 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movsd 48 * SIZE(BB), %xmm3 shufps $0x50, %xmm2, %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movsd 36 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm1, %xmm2 movddup 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movsd 40 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movsd 44 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm1, %xmm2 movddup 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movsd 64 * SIZE(BB), %xmm2 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movsd 52 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 movddup 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 56 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movsd 60 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 movddup 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 80 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 movddup 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: leal (LDC, LDC, 2), %eax movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 0 * SIZE(%esi, LDC), %xmm1 movhps 2 * SIZE(%esi, LDC), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 0 * SIZE(%esi, LDC) movhps %xmm1, 2 * SIZE(%esi, LDC) movsd 0 * SIZE(%esi, LDC, 2), %xmm0 movhps 2 * SIZE(%esi, LDC, 2), %xmm0 movsd 0 * SIZE(%esi, %eax), %xmm1 movhps 2 * SIZE(%esi, %eax), %xmm1 pshufd $0x50, %xmm5, %xmm2 pshufd $0xfa, %xmm5, %xmm5 mulps %xmm3, %xmm2 mulps %xmm3, %xmm5 addps %xmm2, %xmm0 addps %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC, 2) movhps %xmm0, 2 * SIZE(%esi, LDC, 2) movlps %xmm1, 0 * SIZE(%esi, %eax) movhps %xmm1, 2 * SIZE(%esi, %eax) addl $4 * SIZE, %esi # coffset += 2 ALIGN_4 .L30: testl $1, M je .L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif movss 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movss 4 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movsd 0 * SIZE(BB), %xmm2 movsd 16 * SIZE(BB), %xmm3 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $4, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L35 ALIGN_4 .L32: shufps $0, %xmm0, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) movhps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 8 * SIZE(BB), %xmm2 shufps $0, %xmm0, %xmm0 movhps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movss 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movhps 20 * SIZE(BB), %xmm3 shufps $0, %xmm0, %xmm0 movsd 32 * 
SIZE(BB), %xmm2 mulps %xmm0, %xmm3 movss 3 * SIZE(AA), %xmm0 addps %xmm3, %xmm4 movsd 24 * SIZE(BB), %xmm3 shufps $0, %xmm0, %xmm0 movhps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movss 8 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movsd 48 * SIZE(BB), %xmm3 shufps $0, %xmm1, %xmm1 movhps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movss 5 * SIZE(AA), %xmm1 addps %xmm2, %xmm4 movsd 40 * SIZE(BB), %xmm2 shufps $0, %xmm1, %xmm1 movhps 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movss 6 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movsd 64 * SIZE(BB), %xmm2 shufps $0, %xmm1, %xmm1 movhps 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movss 7 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movsd 56 * SIZE(BB), %xmm3 shufps $0, %xmm1, %xmm1 movhps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movss 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 80 * SIZE(BB), %xmm3 addl $ 8 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: shufps $0, %xmm0, %xmm0 movhps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 8 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: leal (LDC, LDC, 2), %eax addps %xmm5, %xmm4 movsd (%esi), %xmm0 movhps (%esi, LDC), %xmm0 movsd (%esi, LDC, 2), %xmm1 movhps (%esi, %eax), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, (%esi) movhps %xmm0, (%esi, LDC) movlps %xmm1, (%esi, LDC, 2) movhps %xmm1, (%esi, %eax) ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leal (, LDC, 4), %eax addl %eax, C # c += 4 * ldc decl J # j -- jg .L01 ALIGN_4 .L40: testl $2, N je .L80 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax leal BUFFER, %ecx sarl $3, %eax jle .L45 ALIGN_4 .L42: movddup 0 * SIZE(%edi), %xmm0 movddup 2 * SIZE(%edi), %xmm1 movddup 4 * SIZE(%edi), %xmm2 movddup 6 * SIZE(%edi), %xmm3 movddup 8 * SIZE(%edi), %xmm4 movddup 10 * SIZE(%edi), %xmm5 movddup 12 * SIZE(%edi), %xmm6 movddup 14 * SIZE(%edi), %xmm7 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps %xmm4, 16 * SIZE(%ecx) movaps %xmm5, 20 * SIZE(%ecx) movaps %xmm6, 24 * SIZE(%ecx) movaps %xmm7, 28 * SIZE(%ecx) # prefetcht1 128 * SIZE(%ecx) prefetcht0 112 * SIZE(%edi) addl $16 * SIZE, %edi addl $32 * SIZE, %ecx decl %eax jne .L42 ALIGN_4 .L45: movl K, %eax andl $7, %eax BRANCH jle .L50 ALIGN_4 .L46: movddup 0 * SIZE(%edi), %xmm0 movaps %xmm0, 0 * SIZE(%ecx) addl $2 * SIZE, %edi addl $4 * SIZE, %ecx decl %eax jne .L46 ALIGN_4 .L50: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif movaps 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps 16 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movsldup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movsldup 16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 prefetcht2 4 * SIZE(%esi) prefetcht2 4 * SIZE(%esi, LDC) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 
movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L55 ALIGN_4 .L52: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) movshdup 0 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsldup 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movshdup 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 8 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsldup 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movshdup 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 12 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsldup 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movshdup 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 32 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsldup 32 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movshdup 16 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 20 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsldup 20 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movshdup 20 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsldup 24 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movshdup 24 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 28 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsldup 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movshdup 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 48 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsldup 48 * SIZE(BB), %xmm3 addl $32 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movshdup 0 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsldup 4 * SIZE(BB), %xmm2 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L56 ALIGN_4 .L58: movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 4 * SIZE(%esi), %xmm1 movhps 6 * SIZE(%esi), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 4 * SIZE(%esi) movhps %xmm1, 6 * SIZE(%esi) movsd 0 * SIZE(%esi, LDC), %xmm0 movhps 2 * SIZE(%esi, LDC), %xmm0 movsd 4 * SIZE(%esi, LDC), %xmm1 movhps 6 * SIZE(%esi, LDC), %xmm1 pshufd $0x50, %xmm5, %xmm2 pshufd $0xfa, %xmm5, %xmm5 mulps %xmm3, %xmm2 mulps %xmm3, %xmm5 addps %xmm2, %xmm0 addps %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC) movhps %xmm0, 2 * SIZE(%esi, LDC) movlps %xmm1, 4 * SIZE(%esi, LDC) movhps %xmm1, 6 * SIZE(%esi, LDC) addl $8 * SIZE, %esi # coffset += 2 decl %ebx # i -- jg .L51 ALIGN_4 .L60: testl $2, M je .L70 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movddup 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movsd 0 * SIZE(BB), %xmm2 movsd 16 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif 
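/* Annotation on the TRMM bookkeeping above: KKK is the effective inner-product
   length for this tile.  Without TRMMKERNEL it is simply K; in the triangular
   case it is either K - KK or KK plus the tile size (2 here for both LEFT and
   the right-side case), so the loop only runs over the non-trivial part of the
   packed operand.  The sarl $3 / andl $7 below then split that count into
   eight-way unrolled iterations plus a scalar remainder. */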
sarl $3, %eax je .L65 ALIGN_4 .L62: shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) movddup 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 movddup 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 movddup 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 12 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 movddup 16 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 32 * SIZE(BB), %xmm2 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 movddup 10 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movsd 20 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 movddup 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 24 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 movddup 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movsd 28 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 movddup 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 48 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 movddup 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L66 ALIGN_4 .L68: addps %xmm5, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 0 * SIZE(%esi, LDC), %xmm1 movhps 2 * SIZE(%esi, LDC), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 0 * SIZE(%esi, LDC) movhps %xmm1, 2 * SIZE(%esi, LDC) addl $4 * SIZE, %esi ALIGN_4 .L70: testl $1, M je .L79 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif movss 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movss 4 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movsd 0 * SIZE(BB), %xmm2 movsd 16 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L75 ALIGN_4 .L72: shufps $0, %xmm0, %xmm0 mulps %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) movss 1 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 shufps $0, %xmm0, %xmm0 movsd 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movss 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 shufps $0, %xmm0, %xmm0 movsd 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movss 3 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 shufps $0, %xmm0, %xmm0 movsd 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movss 8 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 32 * SIZE(BB), %xmm2 shufps $0, %xmm1, %xmm1 mulps %xmm1, %xmm3 movss 5 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 shufps $0, %xmm1, %xmm1 movsd 20 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movss 6 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 shufps $0, %xmm1, %xmm1 movsd 24 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movss 7 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 shufps $0, %xmm1, %xmm1 movsd 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 
movss 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 48 * SIZE(BB), %xmm3 addl $ 8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: shufps $0, %xmm0, %xmm0 mulps %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addl $ 1 * SIZE, AA addl $ 4 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: addps %xmm5, %xmm4 movsd (%esi), %xmm0 movhps (%esi, LDC), %xmm0 pshufd $0x50, %xmm4, %xmm2 mulps %xmm3, %xmm2 addps %xmm2, %xmm0 movlps %xmm0, (%esi) movhps %xmm0, (%esi, LDC) ALIGN_4 .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax addl %eax, C ALIGN_4 .L80: testl $1, N je .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax leal BUFFER, %ecx sarl $3, %eax jle .L85 ALIGN_4 .L82: movss 0 * SIZE(%edi), %xmm0 movss 1 * SIZE(%edi), %xmm1 movss 2 * SIZE(%edi), %xmm2 movss 3 * SIZE(%edi), %xmm3 movss 4 * SIZE(%edi), %xmm4 movss 5 * SIZE(%edi), %xmm5 movss 6 * SIZE(%edi), %xmm6 movss 7 * SIZE(%edi), %xmm7 movss %xmm0, 0 * SIZE(%ecx) movss %xmm0, 1 * SIZE(%ecx) movss %xmm1, 2 * SIZE(%ecx) movss %xmm1, 3 * SIZE(%ecx) movss %xmm2, 4 * SIZE(%ecx) movss %xmm2, 5 * SIZE(%ecx) movss %xmm3, 6 * SIZE(%ecx) movss %xmm3, 7 * SIZE(%ecx) movss %xmm4, 8 * SIZE(%ecx) movss %xmm4, 9 * SIZE(%ecx) movss %xmm5, 10 * SIZE(%ecx) movss %xmm5, 11 * SIZE(%ecx) movss %xmm6, 12 * SIZE(%ecx) movss %xmm6, 13 * SIZE(%ecx) movss %xmm7, 14 * SIZE(%ecx) movss %xmm7, 15 * SIZE(%ecx) # prefetcht1 128 * SIZE(%ecx) prefetcht0 112 * SIZE(%edi) addl $ 8 * SIZE, %edi addl $16 * SIZE, %ecx decl %eax jne .L82 ALIGN_4 .L85: movl K, %eax andl $7, %eax BRANCH jle .L90 ALIGN_4 .L86: movss 0 * SIZE(%edi), %xmm0 movss %xmm0, 0 * SIZE(%ecx) movss %xmm0, 1 * SIZE(%ecx) addl $1 * SIZE, %edi addl $2 * SIZE, %ecx decl %eax jne .L86 ALIGN_4 .L90: movl C, %esi # coffset = c movl A, %edx # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L100 ALIGN_4 .L91: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 1), BB #endif movaps 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movddup 0 * SIZE(BB), %xmm2 pxor %xmm5, %xmm5 movaps 16 * SIZE(AA), %xmm1 movddup 8 * SIZE(BB), %xmm3 #ifdef HAVE_3DNOW prefetchw 4 * SIZE(%esi) #elif defined(HAVE_SSE) || defined(HAVE_SSE2) prefetcht2 4 * SIZE(%esi) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L95 ALIGN_4 .L92: mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 movddup 2 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 8 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movddup 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 12 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movddup 6 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 32 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movddup 16 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 movaps 20 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movddup 10 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movddup 12 * 
SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 28 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movddup 14 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 48 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movddup 24 * SIZE(BB), %xmm3 addl $32 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L92 ALIGN_4 .L95: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movddup 2 * SIZE(BB), %xmm2 addl $4 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L96 ALIGN_4 .L98: addps %xmm5, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 4 * SIZE(%esi), %xmm1 movhps 6 * SIZE(%esi), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 4 * SIZE(%esi) movhps %xmm1, 6 * SIZE(%esi) addl $8 * SIZE, %esi decl %ebx # i -- jg .L91 ALIGN_4 .L100: testl $2, M je .L110 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 1), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movsd 0 * SIZE(AA), %xmm0 movsd 0 * SIZE(BB), %xmm2 movsd 8 * SIZE(AA), %xmm1 movsd 8 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L105 ALIGN_4 .L102: mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 movsd 2 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 6 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 16 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 16 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 movsd 10 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movsd 10 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 12 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movsd 14 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 24 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L102 ALIGN_4 .L105: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L108 ALIGN_4 .L106: mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 2 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L106 ALIGN_4 .L108: addps %xmm5, %xmm4 movhlps %xmm4, %xmm5 addps %xmm5, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 pshufd $0x50, %xmm4, %xmm2 mulps %xmm3, %xmm2 addps %xmm2, %xmm0 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) addl $4 * SIZE, %esi # coffset += 2 ALIGN_4 .L110: testl $1, M je .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax 
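/* Annotation: TRMM path -- skip the first KK k-iterations in both packed
   operands.  %eax becomes KK * SIZE bytes; AA then advances by one float per
   k (single-row tail) and BB by two floats per k, matching the copy loop
   above, which stores each B value twice. */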
leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif movss 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movss 0 * SIZE(BB), %xmm2 pxor %xmm5, %xmm5 movss 4 * SIZE(AA), %xmm1 movss 8 * SIZE(BB), %xmm3 leal (LDC, LDC, 2), %eax #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L115 ALIGN_4 .L112: mulss %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 movss 2 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 movss 2 * SIZE(AA), %xmm0 addss %xmm2, %xmm5 movss 4 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 movss 3 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 movss 6 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 movss 8 * SIZE(AA), %xmm0 addss %xmm2, %xmm5 movss 16 * SIZE(BB), %xmm2 mulss %xmm1, %xmm3 movss 5 * SIZE(AA), %xmm1 addss %xmm3, %xmm4 movss 10 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 movss 6 * SIZE(AA), %xmm1 addss %xmm3, %xmm5 movss 12 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 movss 7 * SIZE(AA), %xmm1 addss %xmm3, %xmm4 movss 14 * SIZE(BB), %xmm3 mulss %xmm1, %xmm3 movss 12 * SIZE(AA), %xmm1 addss %xmm3, %xmm5 movss 24 * SIZE(BB), %xmm3 addl $ 8 * SIZE, AA addl $16 * SIZE, BB decl %eax jne .L112 ALIGN_4 .L115: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulss %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 movss 2 * SIZE(BB), %xmm2 addl $1 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L116 ALIGN_4 .L118: addss %xmm5, %xmm4 movsd (%esi), %xmm0 pshufd $0x50, %xmm4, %xmm2 mulps %xmm3, %xmm2 addps %xmm2, %xmm0 movlps %xmm0, (%esi) ALIGN_4 .L999: movl OLD_STACK, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm3m_kernel_8x2_core2.S000066400000000000000000000774541313527062700217300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA_R 16 + STACK + ARGS(%esi) #define STACK_ALPHA_I 20 + STACK + ARGS(%esi) #define STACK_A 24 + STACK + ARGS(%esi) #define STACK_B 28 + STACK + ARGS(%esi) #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define BUFFER 512(%esp) #define PREFETCH_R (8 * 16 + 0) #define PREFETCH_W (PREFETCH_R * 2) #define PREFETCHSIZE (8 * 16 + 4) #define PREFETCH prefetcht0 #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define C1 %esi #define I %ebx #ifdef OPTERON #define MOVSD movlps #else #define MOVSD movsd #endif PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi # save old stack subl $512 + LOCAL_BUFFER_SIZE, %esp andl $-4096, %esp # align stack STACK_TOUCHING movl STACK_M, %ebx movl STACK_N, %eax movl STACK_K, %ecx movl STACK_A, %edx movss STACK_ALPHA_R, %xmm0 movss STACK_ALPHA_I, %xmm1 movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movl STACK_B, B movl STACK_C, %ebx unpcklps %xmm1, %xmm0 movlps %xmm0, 0 + ALPHA movlps %xmm0, 8 + ALPHA movl %ebx, C movl STACK_LDC, LDC subl $-32 * SIZE, A subl $-32 * SIZE, B sall $ZBASE_SHIFT, LDC sarl $1, %eax movl %eax, J jle .L50 ALIGN_4 .L01: leal 32 * SIZE + BUFFER, BB #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sarl $2, %eax jle .L05 ALIGN_4 .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) movss -32 * SIZE(B), %xmm0 movss -31 * SIZE(B), %xmm1 movss -30 * SIZE(B), %xmm2 movss -29 * SIZE(B), %xmm3 movss -28 * SIZE(B), %xmm4 movss -27 * SIZE(B), %xmm5 movss -26 * SIZE(B), %xmm6 movss -25 * SIZE(B), %xmm7 prefetcht0 (PREFETCH_W + 0) * SIZE(BB) shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 shufps $0, %xmm4, %xmm4 shufps $0, %xmm5, %xmm5 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 prefetcht0 (PREFETCH_W + 16) * SIZE(BB) movaps %xmm0, -32 * SIZE(BB) movaps %xmm1, -28 * SIZE(BB) movaps %xmm2, -24 * SIZE(BB) movaps %xmm3, -20 * SIZE(BB) movaps %xmm4, -16 * SIZE(BB) movaps %xmm5, -12 * SIZE(BB) movaps %xmm6, -8 * SIZE(BB) movaps %xmm7, -4 * SIZE(BB) addl $ 8 * SIZE, B subl $-32 * SIZE, BB decl %eax jne .L02 ALIGN_4 .L05: movl K, %eax andl $3, %eax BRANCH 
jle .L10 ALIGN_4 .L06: movss -32 * SIZE(B), %xmm0 movss -31 * SIZE(B), %xmm1 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 movaps %xmm0, -32 * SIZE(BB) movaps %xmm1, -28 * SIZE(BB) addl $2 * SIZE, B addl $8 * SIZE, BB decl %eax jne .L06 ALIGN_4 .L10: movl C, C1 movl A, AA movl M, I sarl $3, I jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 32 * SIZE + BUFFER, BB #else leal 32 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB /* because it's doubled */ #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -32 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -16 * SIZE(AA), %xmm3 pxor %xmm6, %xmm6 prefetcht0 7 * SIZE(C1) pxor %xmm7, %xmm7 prefetcht0 7 * SIZE(C1, LDC) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $8, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L15 ALIGN_4 .L12: movaps %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm1, %xmm0 addps %xmm0, %xmm5 movaps -28 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps %xmm0, %xmm1 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 addps %xmm1, %xmm7 movaps -24 * SIZE(BB), %xmm1 movaps %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm1, %xmm4 movaps -20 * SIZE(BB), %xmm1 mulps %xmm1, %xmm0 addps %xmm0, %xmm5 movaps -20 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps %xmm0, %xmm1 movaps 0 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 addps %xmm1, %xmm7 movaps -16 * SIZE(BB), %xmm1 movaps %xmm1, %xmm2 mulps %xmm3, %xmm1 addps %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps -12 * SIZE(AA), %xmm3 mulps %xmm3, %xmm2 mulps %xmm3, %xmm1 movaps -8 * SIZE(AA), %xmm3 addps %xmm2, %xmm6 addps %xmm1, %xmm7 movaps -8 * SIZE(BB), %xmm1 movaps %xmm1, %xmm2 mulps %xmm3, %xmm1 addps %xmm1, %xmm4 movaps -4 * SIZE(BB), %xmm1 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps -4 * SIZE(AA), %xmm3 mulps %xmm3, %xmm2 mulps %xmm3, %xmm1 movaps 16 * SIZE(AA), %xmm3 addps %xmm2, %xmm6 addps %xmm1, %xmm7 movaps 0 * SIZE(BB), %xmm1 movaps %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm1, %xmm4 movaps 4 * SIZE(BB), %xmm1 mulps %xmm1, %xmm0 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps %xmm0, %xmm1 movaps 8 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 addps %xmm1, %xmm7 movaps 8 * SIZE(BB), %xmm1 movaps %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm1, %xmm4 movaps 12 * SIZE(BB), %xmm1 mulps %xmm1, %xmm0 addps %xmm0, %xmm5 movaps 12 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps %xmm0, %xmm1 movaps 32 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 addps %xmm1, %xmm7 movaps 16 * SIZE(BB), %xmm1 movaps %xmm1, %xmm2 mulps %xmm3, %xmm1 addps %xmm1, %xmm4 movaps 20 * SIZE(BB), %xmm1 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 20 * SIZE(AA), %xmm3 mulps %xmm3, %xmm2 mulps %xmm3, %xmm1 addps %xmm2, %xmm6 movaps 24 * SIZE(AA), %xmm3 addps %xmm1, %xmm7 movaps 24 * SIZE(BB), %xmm1 movaps %xmm1, %xmm2 mulps %xmm3, %xmm1 addps %xmm1, %xmm4 movaps 28 * SIZE(BB), %xmm1 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 28 * SIZE(AA), %xmm3 mulps %xmm3, %xmm2 mulps %xmm3, %xmm1 subl $-64 * SIZE, BB movaps 48 * SIZE(AA), %xmm3 subl $-64 * SIZE, AA addps %xmm2, %xmm6 addps %xmm1, %xmm7 movaps -32 * SIZE(BB), %xmm1 decl %eax jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax 
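/* Annotation: after the eight-way unrolled loop, the K & 7 leftover
   iterations run one at a time in .L16 below.  .L18 then takes the four
   accumulators (xmm4..xmm7, an 8x2 real tile) and folds in the interleaved
   ALPHA via pshufd $0x50 / $0xfa before adding into the two complex C
   columns at %esi and %esi + LDC. */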
#endif andl $7, %eax BRANCH je .L18 ALIGN_4 .L16: movaps %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm1, %xmm0 addps %xmm0, %xmm5 movaps -28 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps %xmm0, %xmm1 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 addps %xmm1, %xmm7 movaps -24 * SIZE(BB), %xmm1 addl $8 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: movaps ALPHA, %xmm3 MOVSD 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 MOVSD 4 * SIZE(%esi), %xmm1 movhps 6 * SIZE(%esi), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 4 * SIZE(%esi) movhps %xmm1, 6 * SIZE(%esi) MOVSD 8 * SIZE(%esi), %xmm0 movhps 10 * SIZE(%esi), %xmm0 MOVSD 12 * SIZE(%esi), %xmm1 movhps 14 * SIZE(%esi), %xmm1 pshufd $0x50, %xmm6, %xmm2 pshufd $0xfa, %xmm6, %xmm6 mulps %xmm3, %xmm2 mulps %xmm3, %xmm6 addps %xmm2, %xmm0 addps %xmm6, %xmm1 movlps %xmm0, 8 * SIZE(%esi) movhps %xmm0, 10 * SIZE(%esi) movlps %xmm1, 12 * SIZE(%esi) movhps %xmm1, 14 * SIZE(%esi) MOVSD 0 * SIZE(%esi, LDC), %xmm0 movhps 2 * SIZE(%esi, LDC), %xmm0 MOVSD 4 * SIZE(%esi, LDC), %xmm1 movhps 6 * SIZE(%esi, LDC), %xmm1 pshufd $0x50, %xmm5, %xmm2 pshufd $0xfa, %xmm5, %xmm5 mulps %xmm3, %xmm2 mulps %xmm3, %xmm5 addps %xmm2, %xmm0 addps %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC) movhps %xmm0, 2 * SIZE(%esi, LDC) movlps %xmm1, 4 * SIZE(%esi, LDC) movhps %xmm1, 6 * SIZE(%esi, LDC) MOVSD 8 * SIZE(%esi, LDC), %xmm0 movhps 10 * SIZE(%esi, LDC), %xmm0 MOVSD 12 * SIZE(%esi, LDC), %xmm1 movhps 14 * SIZE(%esi, LDC), %xmm1 pshufd $0x50, %xmm7, %xmm2 pshufd $0xfa, %xmm7, %xmm7 mulps %xmm3, %xmm2 mulps %xmm3, %xmm7 addps %xmm2, %xmm0 addps %xmm7, %xmm1 movlps %xmm0, 8 * SIZE(%esi, LDC) movhps %xmm0, 10 * SIZE(%esi, LDC) movlps %xmm1, 12 * SIZE(%esi, LDC) movhps %xmm1, 14 * SIZE(%esi, LDC) addl $16 * SIZE, C1 decl I jg .L11 ALIGN_4 .L20: movl M, I testl $4, I jle .L30 .L21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 32 * SIZE + BUFFER, BB #else leal 32 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB /* because it's doubled */ #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -32 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movaps -16 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movaps -16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: mulps %xmm0, %xmm1 mulps -28 * SIZE(BB), %xmm0 addps %xmm1, %xmm4 movaps -24 * SIZE(BB), %xmm1 addps %xmm0, %xmm5 movaps -28 * SIZE(AA), %xmm0 mulps %xmm0, %xmm1 mulps -20 * SIZE(BB), %xmm0 addps %xmm1, %xmm6 movaps 0 * SIZE(BB), %xmm1 addps %xmm0, %xmm7 movaps -24 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps -12 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps -8 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps -20 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps -4 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 16 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movaps 0 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 mulps 4 * SIZE(BB), %xmm2 addps %xmm1, %xmm4 movaps 8 * SIZE(BB), %xmm1 addps %xmm2, %xmm5 movaps -12 * SIZE(AA), 
%xmm2 mulps %xmm2, %xmm1 mulps 12 * SIZE(BB), %xmm2 addps %xmm1, %xmm6 movaps 32 * SIZE(BB), %xmm1 addps %xmm2, %xmm7 movaps -8 * SIZE(AA), %xmm2 mulps %xmm2, %xmm3 mulps 20 * SIZE(BB), %xmm2 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm2, %xmm5 movaps -4 * SIZE(AA), %xmm2 mulps %xmm2, %xmm3 mulps 28 * SIZE(BB), %xmm2 addps %xmm3, %xmm6 movaps 48 * SIZE(BB), %xmm3 addps %xmm2, %xmm7 movaps 16 * SIZE(AA), %xmm2 subl $-32 * SIZE, AA addl $ 64 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: movaps ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L28 ALIGN_4 .L26: mulps %xmm0, %xmm1 mulps -28 * SIZE(BB), %xmm0 addps %xmm1, %xmm4 movaps -24 * SIZE(BB), %xmm1 addps %xmm0, %xmm5 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: addps %xmm6, %xmm4 addps %xmm7, %xmm5 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 4 * SIZE(%esi), %xmm1 movhps 6 * SIZE(%esi), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 4 * SIZE(%esi) movhps %xmm1, 6 * SIZE(%esi) movsd 0 * SIZE(%esi, LDC), %xmm0 movhps 2 * SIZE(%esi, LDC), %xmm0 movsd 4 * SIZE(%esi, LDC), %xmm1 movhps 6 * SIZE(%esi, LDC), %xmm1 pshufd $0x50, %xmm5, %xmm2 pshufd $0xfa, %xmm5, %xmm5 mulps %xmm3, %xmm2 mulps %xmm3, %xmm5 addps %xmm2, %xmm0 addps %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC) movhps %xmm0, 2 * SIZE(%esi, LDC) movlps %xmm1, 4 * SIZE(%esi, LDC) movhps %xmm1, 6 * SIZE(%esi, LDC) addl $8 * SIZE, C1 ALIGN_4 .L30: movl M, I testl $2, I jle .L40 .L31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 32 * SIZE + BUFFER, BB #else leal 32 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB /* because it's doubled */ #endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd -32 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movsd -24 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movsd -16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L35 ALIGN_4 .L32: mulps %xmm0, %xmm1 mulps -28 * SIZE(BB), %xmm0 addps %xmm1, %xmm4 movsd -24 * SIZE(BB), %xmm1 addps %xmm0, %xmm5 movsd -30 * SIZE(AA), %xmm0 mulps %xmm0, %xmm1 mulps -20 * SIZE(BB), %xmm0 addps %xmm1, %xmm6 movsd 0 * SIZE(BB), %xmm1 addps %xmm0, %xmm7 movsd -28 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps -12 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movsd -8 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movsd -26 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps -4 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movsd 16 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movsd -16 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 mulps 4 * SIZE(BB), %xmm2 addps %xmm1, %xmm4 movsd 8 * SIZE(BB), %xmm1 addps %xmm2, %xmm5 movsd -22 * SIZE(AA), %xmm2 mulps %xmm2, %xmm1 mulps 12 * SIZE(BB), %xmm2 addps %xmm1, %xmm6 movsd 32 * SIZE(BB), %xmm1 addps %xmm2, %xmm7 movsd -20 * SIZE(AA), %xmm2 mulps %xmm2, %xmm3 mulps 20 * SIZE(BB), %xmm2 addps %xmm3, %xmm4 movsd 24 * SIZE(BB), %xmm3 addps %xmm2, %xmm5 movsd -18 * SIZE(AA), %xmm2 mulps %xmm2, %xmm3 mulps 28 * SIZE(BB), %xmm2 addps %xmm3, %xmm6 movsd 48 
* SIZE(BB), %xmm3 addps %xmm2, %xmm7 movsd -8 * SIZE(AA), %xmm2 subl $-16 * SIZE, AA addl $ 64 * SIZE, BB decl %eax jne .L32 ALIGN_4 .L35: movaps ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L38 ALIGN_4 .L36: mulps %xmm0, %xmm1 mulps -28 * SIZE(BB), %xmm0 addps %xmm1, %xmm4 movsd -24 * SIZE(BB), %xmm1 addps %xmm0, %xmm5 movsd -30 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: addps %xmm6, %xmm4 addps %xmm7, %xmm5 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 pshufd $0x50, %xmm4, %xmm2 mulps %xmm3, %xmm2 addps %xmm2, %xmm0 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movsd 0 * SIZE(%esi, LDC), %xmm0 movhps 2 * SIZE(%esi, LDC), %xmm0 pshufd $0x50, %xmm5, %xmm2 mulps %xmm3, %xmm2 addps %xmm2, %xmm0 movlps %xmm0, 0 * SIZE(%esi, LDC) movhps %xmm0, 2 * SIZE(%esi, LDC) addl $4 * SIZE, %esi ALIGN_4 .L40: movl M, I testl $1, I jle .L49 .L41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 32 * SIZE + BUFFER, BB #else leal 32 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB /* because it's doubled */ #endif movss -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movss -32 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movss -28 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movss -16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L45 ALIGN_4 .L42: mulss %xmm0, %xmm1 mulss -28 * SIZE(BB), %xmm0 addss %xmm1, %xmm4 movss -24 * SIZE(BB), %xmm1 addss %xmm0, %xmm5 movss -31 * SIZE(AA), %xmm0 mulss %xmm0, %xmm1 mulss -20 * SIZE(BB), %xmm0 addss %xmm1, %xmm6 movss 0 * SIZE(BB), %xmm1 addss %xmm0, %xmm7 movss -30 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss -12 * SIZE(BB), %xmm0 addss %xmm3, %xmm4 movss -8 * SIZE(BB), %xmm3 addss %xmm0, %xmm5 movss -29 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss -4 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 16 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss -24 * SIZE(AA), %xmm0 mulss %xmm2, %xmm1 mulss 4 * SIZE(BB), %xmm2 addss %xmm1, %xmm4 movss 8 * SIZE(BB), %xmm1 addss %xmm2, %xmm5 movss -27 * SIZE(AA), %xmm2 mulss %xmm2, %xmm1 mulss 12 * SIZE(BB), %xmm2 addss %xmm1, %xmm6 movss 32 * SIZE(BB), %xmm1 addss %xmm2, %xmm7 movss -26 * SIZE(AA), %xmm2 mulss %xmm2, %xmm3 mulss 20 * SIZE(BB), %xmm2 addss %xmm3, %xmm4 movss 24 * SIZE(BB), %xmm3 addss %xmm2, %xmm5 movss -25 * SIZE(AA), %xmm2 mulss %xmm2, %xmm3 mulss 28 * SIZE(BB), %xmm2 addss %xmm3, %xmm6 movss 48 * SIZE(BB), %xmm3 addss %xmm2, %xmm7 movss -20 * SIZE(AA), %xmm2 subl $-8 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L42 ALIGN_4 .L45: movaps ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L48 ALIGN_4 .L46: mulss %xmm0, %xmm1 mulss -28 * SIZE(BB), %xmm0 addss %xmm1, %xmm4 movss -24 * SIZE(BB), %xmm1 addss %xmm0, %xmm5 movss -31 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: addss %xmm6, %xmm4 addss %xmm7, %xmm5 movsd (%esi), %xmm0 movhps (%esi, LDC), %xmm0 shufps $0, %xmm5, %xmm4 mulps %xmm3, %xmm4 addps %xmm4, %xmm0 movlps %xmm0, (%esi) movhps %xmm0, (%esi, LDC) ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif 
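/* Annotation: end of one 2-column panel.  For TRMM with the triangular matrix
   on the right, KK was just advanced by the two finished columns; C is moved
   forward by 2*LDC (LDC was scaled to bytes per complex column with
   ZBASE_SHIFT in the prologue) and control loops back to .L01 while panels
   remain.  .L50 below handles an odd trailing column of N. */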
leal (, LDC, 2), %eax addl %eax, C decl J jg .L01 ALIGN_4 .L50: movl N, %eax testl $1, %eax jle .L999 ALIGN_4 .L51: leal 32 * SIZE + BUFFER, BB #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sarl $3, %eax jle .L55 ALIGN_4 .L52: movss -32 * SIZE(B), %xmm0 movss -31 * SIZE(B), %xmm1 movss -30 * SIZE(B), %xmm2 movss -29 * SIZE(B), %xmm3 movss -28 * SIZE(B), %xmm4 movss -27 * SIZE(B), %xmm5 movss -26 * SIZE(B), %xmm6 movss -25 * SIZE(B), %xmm7 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 shufps $0, %xmm4, %xmm4 shufps $0, %xmm5, %xmm5 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 movaps %xmm0, -32 * SIZE(BB) movaps %xmm1, -28 * SIZE(BB) movaps %xmm2, -24 * SIZE(BB) movaps %xmm3, -20 * SIZE(BB) movaps %xmm4, -16 * SIZE(BB) movaps %xmm5, -12 * SIZE(BB) movaps %xmm6, -8 * SIZE(BB) movaps %xmm7, -4 * SIZE(BB) addl $ 8 * SIZE, B subl $-32 * SIZE, BB decl %eax jne .L52 ALIGN_4 .L55: movl K, %eax andl $7, %eax BRANCH jle .L60 ALIGN_4 .L56: movss -32 * SIZE(B), %xmm0 shufps $0, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(BB) addl $1 * SIZE, B addl $4 * SIZE, BB decl %eax jne .L56 ALIGN_4 .L60: movl C, C1 movl A, AA movl M, I sarl $3, I jle .L70 ALIGN_4 .L61: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 32 * SIZE + BUFFER, BB #else leal 32 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB /* because it's doubled */ #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -32 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movaps -16 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movaps -16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 prefetcht0 3 * SIZE(C1) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $8, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L65 ALIGN_4 .L62: mulps %xmm1, %xmm0 mulps -28 * SIZE(AA), %xmm1 addps %xmm0, %xmm4 movaps -24 * SIZE(AA), %xmm0 addps %xmm1, %xmm6 movaps -28 * SIZE(BB), %xmm1 mulps %xmm1, %xmm0 mulps -20 * SIZE(AA), %xmm1 addps %xmm0, %xmm5 movaps 0 * SIZE(AA), %xmm0 addps %xmm1, %xmm7 movaps -24 * SIZE(BB), %xmm1 mulps %xmm1, %xmm2 mulps -12 * SIZE(AA), %xmm1 addps %xmm2, %xmm4 movaps -8 * SIZE(AA), %xmm2 addps %xmm1, %xmm6 movaps -20 * SIZE(BB), %xmm1 mulps %xmm1, %xmm2 mulps -4 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 16 * SIZE(AA), %xmm2 addps %xmm1, %xmm7 movaps 0 * SIZE(BB), %xmm1 mulps %xmm3, %xmm0 mulps 4 * SIZE(AA), %xmm3 addps %xmm0, %xmm4 movaps 8 * SIZE(AA), %xmm0 addps %xmm3, %xmm6 movaps -12 * SIZE(BB), %xmm3 mulps %xmm3, %xmm0 mulps 12 * SIZE(AA), %xmm3 addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps -8 * SIZE(BB), %xmm3 mulps %xmm3, %xmm2 mulps 20 * SIZE(AA), %xmm3 addps %xmm2, %xmm4 movaps 24 * SIZE(AA), %xmm2 addps %xmm3, %xmm6 movaps -4 * SIZE(BB), %xmm3 mulps %xmm3, %xmm2 mulps 28 * SIZE(AA), %xmm3 addps %xmm2, %xmm5 movaps 48 * SIZE(AA), %xmm2 addps %xmm3, %xmm7 movaps 16 * SIZE(BB), %xmm3 addl $ 64 * SIZE, AA subl $-32 * SIZE, BB decl %eax jne .L62 ALIGN_4 .L65: movaps ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L68 ALIGN_4 .L66: mulps %xmm1, %xmm0 mulps -28 * SIZE(AA), %xmm1 addps %xmm0, %xmm4 movaps -24 * SIZE(AA), %xmm0 addps %xmm1, %xmm6 movaps -28 * SIZE(BB), %xmm1 
addl $8 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L66 ALIGN_4 .L68: addps %xmm5, %xmm4 addps %xmm7, %xmm6 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 4 * SIZE(%esi), %xmm1 movhps 6 * SIZE(%esi), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 4 * SIZE(%esi) movhps %xmm1, 6 * SIZE(%esi) movsd 8 * SIZE(%esi), %xmm0 movhps 10 * SIZE(%esi), %xmm0 movsd 12 * SIZE(%esi), %xmm1 movhps 14 * SIZE(%esi), %xmm1 pshufd $0x50, %xmm6, %xmm2 pshufd $0xfa, %xmm6, %xmm6 mulps %xmm3, %xmm2 mulps %xmm3, %xmm6 addps %xmm2, %xmm0 addps %xmm6, %xmm1 movlps %xmm0, 8 * SIZE(%esi) movhps %xmm0, 10 * SIZE(%esi) movlps %xmm1, 12 * SIZE(%esi) movhps %xmm1, 14 * SIZE(%esi) addl $16 * SIZE, C1 decl I jg .L61 ALIGN_4 .L70: movl M, I testl $4, I jle .L80 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 32 * SIZE + BUFFER, BB #else leal 32 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB /* because it's doubled */ #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -32 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movaps -16 * SIZE(AA), %xmm2 movaps -16 * SIZE(BB), %xmm3 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L75 ALIGN_4 .L72: mulps %xmm0, %xmm1 movaps -28 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movaps -24 * SIZE(AA), %xmm0 addps %xmm1, %xmm5 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movaps -20 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movaps 0 * SIZE(AA), %xmm0 addps %xmm1, %xmm5 movaps 0 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 movaps -12 * SIZE(AA), %xmm2 addps %xmm3, %xmm4 movaps -12 * SIZE(BB), %xmm3 mulps %xmm2, %xmm3 movaps -8 * SIZE(AA), %xmm2 addps %xmm3, %xmm5 movaps -8 * SIZE(BB), %xmm3 mulps %xmm2, %xmm3 movaps -4 * SIZE(AA), %xmm2 addps %xmm3, %xmm4 movaps -4 * SIZE(BB), %xmm3 mulps %xmm2, %xmm3 movaps 16 * SIZE(AA), %xmm2 addps %xmm3, %xmm5 movaps 16 * SIZE(BB), %xmm3 subl $-32 * SIZE, AA subl $-32 * SIZE, BB decl %eax jne .L72 ALIGN_4 .L75: movaps ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L78 ALIGN_4 .L76: mulps %xmm0, %xmm1 movaps -28 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L76 ALIGN_4 .L78: addps %xmm5, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 4 * SIZE(%esi), %xmm1 movhps 6 * SIZE(%esi), %xmm1 pshufd $0x50, %xmm4, %xmm2 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 4 * SIZE(%esi) movhps %xmm1, 6 * SIZE(%esi) addl $8 * SIZE, %esi ALIGN_4 .L80: movl M, I testl $2, I jle .L90 .L81: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 32 * SIZE + BUFFER, BB #else leal 32 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 
2), BB /* because it's doubled */ #endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd -32 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movsd -16 * SIZE(BB), %xmm3 movsd -24 * SIZE(AA), %xmm2 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L85 ALIGN_4 .L82: mulps %xmm0, %xmm1 movsd -30 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movsd -28 * SIZE(AA), %xmm0 addps %xmm1, %xmm5 movsd -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movsd -26 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm1 movsd -16 * SIZE(AA), %xmm0 addps %xmm1, %xmm5 movsd -0 * SIZE(BB), %xmm1 mulps %xmm2, %xmm3 movsd -22 * SIZE(AA), %xmm2 addps %xmm3, %xmm4 movsd -12 * SIZE(BB), %xmm3 mulps %xmm2, %xmm3 movsd -20 * SIZE(AA), %xmm2 addps %xmm3, %xmm5 movsd -8 * SIZE(BB), %xmm3 mulps %xmm2, %xmm3 movsd -18 * SIZE(AA), %xmm2 addps %xmm3, %xmm4 movsd -4 * SIZE(BB), %xmm3 mulps %xmm2, %xmm3 movsd -8 * SIZE(AA), %xmm2 addps %xmm3, %xmm5 movsd 16 * SIZE(BB), %xmm3 subl $-16 * SIZE, AA subl $-32 * SIZE, BB decl %eax jne .L82 ALIGN_4 .L85: movaps ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L88 ALIGN_4 .L86: mulps %xmm0, %xmm1 movsd -30 * SIZE(AA), %xmm0 addps %xmm1, %xmm4 movsd -28 * SIZE(BB), %xmm1 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L86 ALIGN_4 .L88: addps %xmm5, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 pshufd $0x50, %xmm4, %xmm2 mulps %xmm3, %xmm2 addps %xmm2, %xmm0 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) addl $4 * SIZE, %esi ALIGN_4 .L90: movl M, I testl $1, I jle .L99 .L91: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 32 * SIZE + BUFFER, BB #else leal 32 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB /* because it's doubled */ #endif movss -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movss -32 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movss -16 * SIZE(BB), %xmm3 movss -28 * SIZE(AA), %xmm2 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L95 ALIGN_4 .L92: mulss %xmm0, %xmm1 movss -31 * SIZE(AA), %xmm0 addss %xmm1, %xmm4 movss -28 * SIZE(BB), %xmm1 mulss %xmm0, %xmm1 movss -30 * SIZE(AA), %xmm0 addss %xmm1, %xmm5 movss -24 * SIZE(BB), %xmm1 mulss %xmm0, %xmm1 movss -29 * SIZE(AA), %xmm0 addss %xmm1, %xmm4 movss -20 * SIZE(BB), %xmm1 mulss %xmm0, %xmm1 movss -24 * SIZE(AA), %xmm0 addss %xmm1, %xmm5 movss -0 * SIZE(BB), %xmm1 mulss %xmm2, %xmm3 movss -27 * SIZE(AA), %xmm2 addss %xmm3, %xmm4 movss -12 * SIZE(BB), %xmm3 mulss %xmm2, %xmm3 movss -26 * SIZE(AA), %xmm2 addss %xmm3, %xmm5 movss -8 * SIZE(BB), %xmm3 mulss %xmm2, %xmm3 movss -25 * SIZE(AA), %xmm2 addss %xmm3, %xmm4 movss -4 * SIZE(BB), %xmm3 mulss %xmm2, %xmm3 movss -20 * SIZE(AA), %xmm2 addss %xmm3, %xmm5 movss 16 * SIZE(BB), %xmm3 subl $ -8 * SIZE, AA subl $-32 * SIZE, BB decl %eax jne .L92 ALIGN_4 .L95: movaps ALPHA, %xmm3 #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L98 
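/* .L96 drains the K & 7 leftover iterations one scalar product at a time for
   the last row when M is odd; .L98 then folds the second accumulator into the
   first, spreads it across two lanes with pshufd $0x50, scales by ALPHA and
   adds the result back into C. */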
ALIGN_4 .L96: mulss %xmm0, %xmm1 movss -31 * SIZE(AA), %xmm0 addss %xmm1, %xmm4 movss -28 * SIZE(BB), %xmm1 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L96 ALIGN_4 .L98: addss %xmm5, %xmm4 movsd 0 * SIZE(%esi), %xmm0 pshufd $0x50, %xmm4, %xmm2 mulps %xmm3, %xmm2 addps %xmm2, %xmm0 movlps %xmm0, 0 * SIZE(%esi) ALIGN_4 .L99: addl LDC, C ALIGN_4 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm3m_kernel_8x2_sse.S000066400000000000000000001610241313527062700214730ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if !defined(HAVE_SSE) || !defined(HAVE_MMX) #error You have to check your configuration. 
#endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA_R 16 + STACK + ARGS(%esi) #define STACK_ALPHA_I 20 + STACK + ARGS(%esi) #define STACK_A 24 + STACK + ARGS(%esi) #define STACK_B 28 + STACK + ARGS(%esi) #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define BUFFER 128(%esp) #define B %edi #define LDC %ebp #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #define PREFETCHSIZE 48 /* for PIII */ #define AA %edx #define BB %ecx #ifdef HAVE_SSE2 #define MOVSD movsd #define XORPS pxor #else #define MOVSD movlps #define XORPS xorps #endif #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ mulps 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm4; \ movaps 0 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm5; \ movaps 4 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ mulps %xmm0, %xmm2; \ mulps 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 8 * SIZE + (address) * SIZE * 2(AA), %xmm0 #define KERNEL2(address) \ mulps %xmm0, %xmm2; \ mulps 12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm4; \ movaps 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm5; \ movaps 12 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ mulps %xmm0, %xmm2; \ mulps 12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 32 * SIZE + (address) * SIZE * 2(AA), %xmm0 #define KERNEL3(address) \ mulps %xmm1, %xmm3; \ mulps 20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm4; \ movaps 16 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm5; \ movaps 20 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ mulps %xmm1, %xmm3; \ mulps 20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 24 * SIZE + (address) * SIZE * 2(AA), %xmm1 #define KERNEL4(address) \ mulps %xmm1, %xmm3; \ mulps 28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm4; \ movaps 24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm5; \ movaps 28 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ mulps %xmm1, %xmm3; \ mulps 28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 48 * SIZE + (address) * SIZE * 2(AA), %xmm1 #define KERNEL5(address) \ mulps %xmm0, %xmm2; \ mulps 36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm4; \ movaps 32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm5; \ movaps 36 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ mulps %xmm0, %xmm2; \ mulps 36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 40 * SIZE + (address) * SIZE * 2(AA), %xmm0 #define KERNEL6(address) \ mulps %xmm0, %xmm2; \ mulps 44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm4; \ movaps 40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm5; \ 
movaps 44 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ mulps %xmm0, %xmm2; \ mulps 44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 64 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 64 * SIZE + (address) * SIZE * 2(AA), %xmm0 #define KERNEL7(address) \ mulps %xmm1, %xmm3; \ mulps 52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm4; \ movaps 48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm5; \ movaps 52 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ mulps %xmm1, %xmm3; \ mulps 52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 56 * SIZE + (address) * SIZE * 2(AA), %xmm1 #define KERNEL8(address) \ mulps %xmm1, %xmm3; \ mulps 60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm4; \ movaps 56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm5; \ movaps 60 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ mulps %xmm1, %xmm3; \ mulps 60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 80 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 80 * SIZE + (address) * SIZE * 2(AA), %xmm1 PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp addl $STACK_OFFSET, %esp STACK_TOUCHING movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 movd STACK_A, %mm2 movss STACK_ALPHA_R, %xmm0 movss STACK_ALPHA_I, %xmm1 movl STACK_B, B movd STACK_C, %mm3 movl STACK_LDC, LDC #ifdef TRMMKERNEL movd STACK_OFFT, %mm4 #endif unpcklps %xmm1, %xmm0 movlps %xmm0, 0 + ALPHA movlps %xmm0, 8 + ALPHA movd %mm1, K movl %eax, N movd %mm0, M movd %mm2, A movd %mm3, C movl %esi, OLD_STACK #ifdef TRMMKERNEL movd %mm4, OFFSET movd %mm4, KK #ifndef LEFT negl KK #endif #endif sall $ZBASE_SHIFT, LDC sarl $1, %eax # j = (n >> 1) movl %eax, J jle .L100 ALIGN_2 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ movl K, %eax leal BUFFER, %ecx sarl $2, %eax jle .L03 ALIGN_4 .L02: movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 movss 2 * SIZE(B), %xmm2 movss 3 * SIZE(B), %xmm3 movss 4 * SIZE(B), %xmm4 movss 5 * SIZE(B), %xmm5 movss 6 * SIZE(B), %xmm6 movss 7 * SIZE(B), %xmm7 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 shufps $0, %xmm4, %xmm4 shufps $0, %xmm5, %xmm5 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps %xmm4, 16 * SIZE(%ecx) movaps %xmm5, 20 * SIZE(%ecx) movaps %xmm6, 24 * SIZE(%ecx) movaps %xmm7, 28 * SIZE(%ecx) prefetcht0 104 * SIZE(B) addl $ 8 * SIZE, B addl $32 * SIZE, %ecx decl %eax BRANCH jne .L02 ALIGN_2 .L03: movl K, %eax andl $3, %eax BRANCH jle .L05 ALIGN_2 .L04: movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 addl $2 * SIZE, B shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) addl $8 * SIZE, %ecx decl %eax jne .L04 ALIGN_4 .L05: movl C, %esi # coffset = c movl A, AA # aoffset = a movl M, %ebx sarl $3, %ebx # i = (m >> 2) jle .L30 ALIGN_4 .L10: #ifdef PENTIUM4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 XORPS %xmm4, 
%xmm4 movaps 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movaps 16 * SIZE + BUFFER, %xmm3 XORPS %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 XORPS %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 XORPS %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #endif prefetchnta 7 * SIZE(%esi) prefetchnta 7 * SIZE(%esi, %ebp) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $8, %eax #else addl $2, %eax #endif movl %eax, KKK #endif andl $-8, %eax NOBRANCH je .L12 sall $3, %eax .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) cmpl $64 * 1, %eax NOBRANCH jle .L11 KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) cmpl $64 * 2, %eax NOBRANCH jle .L11 KERNEL1(32 * 2) KERNEL2(32 * 2) KERNEL3(32 * 2) KERNEL4(32 * 2) KERNEL5(32 * 2) KERNEL6(32 * 2) KERNEL7(32 * 2) KERNEL8(32 * 2) cmpl $64 * 3, %eax NOBRANCH jle .L11 KERNEL1(32 * 3) KERNEL2(32 * 3) KERNEL3(32 * 3) KERNEL4(32 * 3) KERNEL5(32 * 3) KERNEL6(32 * 3) KERNEL7(32 * 3) KERNEL8(32 * 3) cmpl $64 * 4, %eax NOBRANCH jle .L11 KERNEL1(32 * 4) KERNEL2(32 * 4) KERNEL3(32 * 4) KERNEL4(32 * 4) KERNEL5(32 * 4) KERNEL6(32 * 4) KERNEL7(32 * 4) KERNEL8(32 * 4) cmpl $64 * 5, %eax NOBRANCH jle .L11 KERNEL1(32 * 5) KERNEL2(32 * 5) KERNEL3(32 * 5) KERNEL4(32 * 5) KERNEL5(32 * 5) KERNEL6(32 * 5) KERNEL7(32 * 5) KERNEL8(32 * 5) cmpl $64 * 6, %eax NOBRANCH jle .L11 KERNEL1(32 * 6) KERNEL2(32 * 6) KERNEL3(32 * 6) KERNEL4(32 * 6) KERNEL5(32 * 6) KERNEL6(32 * 6) KERNEL7(32 * 6) KERNEL8(32 * 6) cmpl $64 * 7, %eax NOBRANCH jle .L11 KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) addl $64 * 8 * SIZE, AA addl $64 * 8 * SIZE, BB subl $64 * 8, %eax BRANCH jg .L1X .L11: leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #else #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 XORPS %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movaps 8 * SIZE + BUFFER, %xmm3 XORPS %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 XORPS %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 XORPS %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #endif prefetchnta 8 * SIZE(%esi) prefetchnta 8 * SIZE(%esi, %ebp) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $8, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L12 ALIGN_2 .L11: #ifdef CORE_KATMAI prefetcht0 PREFETCHSIZE * SIZE(AA) #endif mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 0 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 
movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 16 * SIZE(AA), %xmm0 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulps %xmm1, %xmm3 mulps 12 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 8 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 12 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 12 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 24 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 24 * SIZE(AA), %xmm1 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm0, %xmm2 mulps 20 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 20 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 20 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 24) * SIZE(AA) #endif mulps %xmm1, %xmm3 mulps 28 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 28 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 40 * SIZE(AA), %xmm1 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 32) * SIZE(AA) #endif mulps %xmm0, %xmm2 mulps 36 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 36 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 36 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 48 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 48 * SIZE(AA), %xmm0 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 40) * SIZE(AA) #endif mulps %xmm1, %xmm3 mulps 44 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 44 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 44 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 56 * SIZE(AA), %xmm1 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 48) * SIZE(AA) #endif mulps %xmm0, %xmm2 mulps 52 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 48 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 52 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 52 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 64 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 64 * SIZE(AA), %xmm0 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 56) * SIZE(AA) #endif mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 60 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 72 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 72 * SIZE(AA), %xmm1 addl $64 * SIZE, BB addl $64 * SIZE, AA decl %eax jne .L11 ALIGN_2 #endif .L12: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L14 .L13: movaps 4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 0 * SIZE(BB), %xmm2 mulps %xmm0, %xmm1 movaps 4 * SIZE(AA), %xmm0 addps %xmm1, %xmm5 movaps 4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm1 movaps 8 * SIZE(AA), %xmm0 addps %xmm1, %xmm7 addl $8 * SIZE, AA addl $8 * SIZE, BB subl $1, %eax jg .L13 ALIGN_4 .L14: MOVSD 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 MOVSD 4 * SIZE(%esi), %xmm1 movhps 6 * SIZE(%esi), %xmm1 #ifdef HAVE_SSE2 pshufd $0x50, %xmm4, %xmm2 #else movaps %xmm4, %xmm2 shufps $0x50, %xmm2, %xmm2 #endif shufps $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 4 * SIZE(%esi) movhps %xmm1, 6 * SIZE(%esi) MOVSD 8 * 
SIZE(%esi), %xmm0 movhps 10 * SIZE(%esi), %xmm0 MOVSD 12 * SIZE(%esi), %xmm1 movhps 14 * SIZE(%esi), %xmm1 #ifdef HAVE_SSE2 pshufd $0x50, %xmm6, %xmm2 #else movaps %xmm6, %xmm2 shufps $0x50, %xmm2, %xmm2 #endif shufps $0xfa, %xmm6, %xmm6 mulps %xmm3, %xmm2 mulps %xmm3, %xmm6 addps %xmm2, %xmm0 addps %xmm6, %xmm1 movlps %xmm0, 8 * SIZE(%esi) movhps %xmm0, 10 * SIZE(%esi) movlps %xmm1, 12 * SIZE(%esi) movhps %xmm1, 14 * SIZE(%esi) MOVSD 0 * SIZE(%esi, LDC), %xmm0 movhps 2 * SIZE(%esi, LDC), %xmm0 MOVSD 4 * SIZE(%esi, LDC), %xmm1 movhps 6 * SIZE(%esi, LDC), %xmm1 #ifdef HAVE_SSE2 pshufd $0x50, %xmm5, %xmm2 #else movaps %xmm5, %xmm2 shufps $0x50, %xmm2, %xmm2 #endif shufps $0xfa, %xmm5, %xmm5 mulps %xmm3, %xmm2 mulps %xmm3, %xmm5 addps %xmm2, %xmm0 addps %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC) movhps %xmm0, 2 * SIZE(%esi, LDC) movlps %xmm1, 4 * SIZE(%esi, LDC) movhps %xmm1, 6 * SIZE(%esi, LDC) MOVSD 8 * SIZE(%esi, LDC), %xmm0 movhps 10 * SIZE(%esi, LDC), %xmm0 MOVSD 12 * SIZE(%esi, LDC), %xmm1 movhps 14 * SIZE(%esi, LDC), %xmm1 #ifdef HAVE_SSE2 pshufd $0x50, %xmm7, %xmm2 #else movaps %xmm7, %xmm2 shufps $0x50, %xmm2, %xmm2 #endif shufps $0xfa, %xmm7, %xmm7 mulps %xmm3, %xmm2 mulps %xmm3, %xmm7 addps %xmm2, %xmm0 addps %xmm7, %xmm1 movlps %xmm0, 8 * SIZE(%esi, LDC) movhps %xmm0, 10 * SIZE(%esi, LDC) movlps %xmm1, 12 * SIZE(%esi, LDC) movhps %xmm1, 14 * SIZE(%esi, LDC) addl $16 * SIZE, %esi BRANCH decl %ebx # i -- jg .L10 ALIGN_2 .L30: movl M, %ebx andl $7, %ebx jle .L99 testl $4, %ebx jle .L50 #if (L1_DATA_LINESIZE == 64) #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 XORPS %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movaps 16 * SIZE + BUFFER, %xmm3 XORPS %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 XORPS %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 XORPS %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L32 ALIGN_2 .L31: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 8 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 20 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps 12 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 28 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 48 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 mulps 36 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 40 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 20 * SIZE(AA), %xmm1 mulps %xmm1, %xmm2 mulps 44 * SIZE(BB), %xmm1 addps %xmm2, %xmm6 movaps 64 * SIZE(BB), %xmm2 addps %xmm1, %xmm7 movaps 24 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 52 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps 
%xmm3, %xmm6 movaps 80 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L31 ALIGN_2 #else #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 XORPS %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movaps 8 * SIZE + BUFFER, %xmm3 XORPS %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 XORPS %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 XORPS %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L32 ALIGN_2 .L31: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 12 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 24 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movaps 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 mulps 20 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 12 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 28 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 24 * SIZE(AA), %xmm1 mulps %xmm0, %xmm2 mulps 36 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 48 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 20 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 44 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 56 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 mulps 52 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 64 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 72 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 40 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L31 ALIGN_2 #endif .L32: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L34 .L33: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L33 ALIGN_4 .L34: addps %xmm6, %xmm4 addps %xmm7, %xmm5 MOVSD 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 MOVSD 4 * SIZE(%esi), %xmm1 movhps 6 * SIZE(%esi), %xmm1 #ifdef HAVE_SSE2 pshufd $0x50, %xmm4, %xmm2 #else movaps %xmm4, %xmm2 shufps $0x50, %xmm2, %xmm2 #endif shufps $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 4 * SIZE(%esi) movhps %xmm1, 6 * SIZE(%esi) MOVSD 0 * SIZE(%esi, LDC), %xmm0 movhps 2 * SIZE(%esi, LDC), %xmm0 MOVSD 4 * SIZE(%esi, LDC), %xmm1 movhps 6 * SIZE(%esi, LDC), %xmm1 #ifdef HAVE_SSE2 pshufd $0x50, %xmm5, %xmm2 #else movaps %xmm5, %xmm2 shufps $0x50, %xmm2, %xmm2 #endif shufps $0xfa, %xmm5, %xmm5 mulps %xmm3, %xmm2 mulps %xmm3, %xmm5 addps %xmm2, %xmm0 addps %xmm5, %xmm1 movlps %xmm0, 0 * SIZE(%esi, LDC) movhps %xmm0, 2 * SIZE(%esi, LDC) 
movlps %xmm1, 4 * SIZE(%esi, LDC) movhps %xmm1, 6 * SIZE(%esi, LDC) addl $8 * SIZE, %esi ALIGN_2 .L50: testl $2, %ebx jle .L70 #if (L1_DATA_LINESIZE == 64) #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB MOVSD 0 * SIZE + BUFFER, %xmm2 XORPS %xmm4, %xmm4 MOVSD 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 MOVSD 16 * SIZE + BUFFER, %xmm3 XORPS %xmm6, %xmm6 MOVSD 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB /* because it's doubled */ MOVSD 0 * SIZE(BB), %xmm2 XORPS %xmm4, %xmm4 MOVSD 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 MOVSD 16 * SIZE(BB), %xmm3 XORPS %xmm6, %xmm6 MOVSD 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L52 ALIGN_2 .L51: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 MOVSD 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 MOVSD 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 MOVSD 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 MOVSD 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 MOVSD 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 MOVSD 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 MOVSD 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 MOVSD 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 MOVSD 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 MOVSD 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 MOVSD 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 MOVSD 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 MOVSD 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 MOVSD 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 MOVSD 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 MOVSD 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 MOVSD 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 MOVSD 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 MOVSD 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 MOVSD 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 MOVSD 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 MOVSD 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 MOVSD 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 MOVSD 80 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L51 ALIGN_2 #else #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB MOVSD 0 * SIZE + BUFFER, %xmm2 XORPS %xmm4, %xmm4 MOVSD 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 MOVSD 8 * SIZE + BUFFER, %xmm3 XORPS %xmm6, %xmm6 MOVSD 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB /* because it's doubled */ MOVSD 0 * SIZE(BB), %xmm2 XORPS %xmm4, %xmm4 MOVSD 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 MOVSD 8 * SIZE(BB), %xmm3 XORPS %xmm6, %xmm6 MOVSD 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L52 ALIGN_2 .L51: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 MOVSD 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 MOVSD 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 MOVSD 16 * 
SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 MOVSD 12 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 MOVSD 4 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 MOVSD 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 MOVSD 20 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 MOVSD 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 MOVSD 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 MOVSD 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 MOVSD 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 MOVSD 40 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 MOVSD 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 MOVSD 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 MOVSD 48 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 MOVSD 44 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 MOVSD 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 MOVSD 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 MOVSD 52 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 MOVSD 14 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 MOVSD 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 MOVSD 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 MOVSD 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 MOVSD 72 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L51 ALIGN_2 #endif .L52: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L54 .L53: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 MOVSD 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 MOVSD 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 MOVSD 8 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L53 ALIGN_4 .L54: addps %xmm6, %xmm4 addps %xmm7, %xmm5 MOVSD 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 #ifdef HAVE_SSE2 pshufd $0x50, %xmm4, %xmm2 #else movaps %xmm4, %xmm2 shufps $0x50, %xmm2, %xmm2 #endif mulps %xmm3, %xmm2 addps %xmm2, %xmm0 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) MOVSD 0 * SIZE(%esi, LDC), %xmm0 movhps 2 * SIZE(%esi, LDC), %xmm0 #ifdef HAVE_SSE2 pshufd $0x50, %xmm5, %xmm2 #else movaps %xmm5, %xmm2 shufps $0x50, %xmm2, %xmm2 #endif mulps %xmm3, %xmm2 addps %xmm2, %xmm0 movlps %xmm0, 0 * SIZE(%esi, LDC) movhps %xmm0, 2 * SIZE(%esi, LDC) addl $4 * SIZE, %esi ALIGN_2 .L70: testl $1, %ebx jle .L99 #if (L1_DATA_LINESIZE == 64) #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movss 0 * SIZE + BUFFER, %xmm2 XORPS %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movss 16 * SIZE + BUFFER, %xmm3 XORPS %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB /* because it's doubled */ movss 0 * SIZE(BB), %xmm2 XORPS %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movss 16 * SIZE(BB), %xmm3 XORPS %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L72 ALIGN_2 .L71: mulss %xmm0, %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 mulss %xmm0, %xmm2 mulss 12 * SIZE(BB), %xmm0 addss %xmm2, %xmm6 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm7 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 20 * SIZE(BB), %xmm0 addss %xmm3, %xmm4 movss 24 * SIZE(BB), %xmm3 addss %xmm0, %xmm5 
movss 3 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 28 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 48 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 mulss 36 * SIZE(BB), %xmm1 addss %xmm2, %xmm4 movss 40 * SIZE(BB), %xmm2 addss %xmm1, %xmm5 movss 5 * SIZE(AA), %xmm1 mulss %xmm1, %xmm2 mulss 44 * SIZE(BB), %xmm1 addss %xmm2, %xmm6 movss 64 * SIZE(BB), %xmm2 addss %xmm1, %xmm7 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 52 * SIZE(BB), %xmm1 addss %xmm3, %xmm4 movss 56 * SIZE(BB), %xmm3 addss %xmm1, %xmm5 movss 7 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 60 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 80 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L71 ALIGN_2 #else #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movss 0 * SIZE + BUFFER, %xmm2 XORPS %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movss 8 * SIZE + BUFFER, %xmm3 XORPS %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB /* because it's doubled */ movss 0 * SIZE(BB), %xmm2 XORPS %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movss 8 * SIZE(BB), %xmm3 XORPS %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L72 ALIGN_2 .L71: mulss %xmm0, %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 16 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 12 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 24 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm2 mulss 20 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 3 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 mulss 28 * SIZE(BB), %xmm0 addss %xmm3, %xmm6 movss 40 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 mulss 36 * SIZE(BB), %xmm1 addss %xmm2, %xmm4 movss 48 * SIZE(BB), %xmm2 addss %xmm1, %xmm5 movss 5 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 44 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 56 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm2 mulss 52 * SIZE(BB), %xmm1 addss %xmm2, %xmm4 movss 64 * SIZE(BB), %xmm2 addss %xmm1, %xmm5 movss 7 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 mulss 60 * SIZE(BB), %xmm1 addss %xmm3, %xmm6 movss 72 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L71 ALIGN_2 #endif .L72: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L74 .L73: mulss %xmm0, %xmm2 mulss 4 * SIZE(BB), %xmm0 addss %xmm2, %xmm4 movss 8 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 1 * SIZE(AA), %xmm0 addl $1 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L73 ALIGN_4 .L74: addss %xmm6, %xmm4 addss %xmm7, %xmm5 MOVSD 0 * SIZE(%esi), %xmm0 #ifdef HAVE_SSE2 pshufd $0x50, %xmm4, %xmm2 #else movaps %xmm4, %xmm2 shufps $0x50, %xmm2, %xmm2 #endif mulps %xmm3, %xmm2 addps %xmm2, %xmm0 movlps %xmm0, 0 * SIZE(%esi) MOVSD 0 * SIZE(%esi, LDC), %xmm0 #ifdef HAVE_SSE2 pshufd 
$0x50, %xmm5, %xmm2 #else movaps %xmm5, %xmm2 shufps $0x50, %xmm2, %xmm2 #endif mulps %xmm3, %xmm2 addps %xmm2, %xmm0 movlps %xmm0, 0 * SIZE(%esi, LDC) ALIGN_2 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax addl %eax, C # c += 2 * ldc BRANCH decl J # j -- jg .L01 ALIGN_2 .L100: movl N, %eax testl $1, %eax jle .L999 ALIGN_2 .L101: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ movl K, %eax leal BUFFER, %ecx sarl $3, %eax jle .L103 ALIGN_4 .L102: prefetchnta 96 * SIZE(B) movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 movss 2 * SIZE(B), %xmm2 movss 3 * SIZE(B), %xmm3 movss 4 * SIZE(B), %xmm4 movss 5 * SIZE(B), %xmm5 movss 6 * SIZE(B), %xmm6 movss 7 * SIZE(B), %xmm7 addl $ 8 * SIZE, B shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 shufps $0, %xmm4, %xmm4 shufps $0, %xmm5, %xmm5 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps %xmm4, 16 * SIZE(%ecx) movaps %xmm5, 20 * SIZE(%ecx) movaps %xmm6, 24 * SIZE(%ecx) movaps %xmm7, 28 * SIZE(%ecx) addl $32 * SIZE, %ecx decl %eax BRANCH jne .L102 ALIGN_2 .L103: movl K, %eax andl $7, %eax BRANCH jle .L105 ALIGN_2 .L104: movss 0 * SIZE(B), %xmm0 addl $1 * SIZE, B shufps $0, %xmm0, %xmm0 movaps %xmm0, 0 * SIZE(%ecx) addl $4 * SIZE, %ecx decl %eax jne .L104 ALIGN_4 .L105: movl C, %esi # coffset = c movl A, AA # aoffset = a movl M, %ebx sarl $3, %ebx # i = (m >> 2) jle .L130 ALIGN_4 .L110: #if (L1_DATA_LINESIZE == 64) #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 XORPS %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movaps 16 * SIZE + BUFFER, %xmm3 XORPS %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 XORPS %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 XORPS %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $8, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L112 ALIGN_2 .L111: mulps %xmm2, %xmm0 mulps 4 * SIZE(AA), %xmm2 addps %xmm0, %xmm4 movaps 8 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 4 * SIZE(BB), %xmm2 mulps %xmm2, %xmm0 mulps 12 * SIZE(AA), %xmm2 addps %xmm0, %xmm6 movaps 32 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 8 * SIZE(BB), %xmm2 mulps %xmm2, %xmm1 mulps 20 * SIZE(AA), %xmm2 addps %xmm1, %xmm4 movaps 24 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 12 * SIZE(BB), %xmm2 mulps %xmm2, %xmm1 mulps 28 * SIZE(AA), %xmm2 addps %xmm1, %xmm6 movaps 48 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm3, %xmm0 mulps 36 * SIZE(AA), %xmm3 addps %xmm0, %xmm4 movaps 40 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 20 * SIZE(BB), %xmm3 mulps %xmm3, %xmm0 mulps 44 * SIZE(AA), %xmm3 addps %xmm0, %xmm6 movaps 64 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 24 * SIZE(BB), %xmm3 mulps %xmm3, %xmm1 mulps 52 * SIZE(AA), %xmm3 addps %xmm1, %xmm4 movaps 56 * SIZE(AA), %xmm1 
addps %xmm3, %xmm5 movaps 28 * SIZE(BB), %xmm3 mulps %xmm3, %xmm1 mulps 60 * SIZE(AA), %xmm3 addps %xmm1, %xmm6 movaps 80 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 addl $64 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L111 ALIGN_2 #else #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 XORPS %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movaps 8 * SIZE + BUFFER, %xmm3 XORPS %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 XORPS %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 XORPS %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $8, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L112 ALIGN_2 .L111: mulps %xmm2, %xmm0 mulps 4 * SIZE(AA), %xmm2 addps %xmm0, %xmm4 movaps 16 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 4 * SIZE(BB), %xmm2 mulps %xmm2, %xmm1 mulps 12 * SIZE(AA), %xmm2 addps %xmm1, %xmm6 movaps 24 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 16 * SIZE(BB), %xmm2 mulps %xmm3, %xmm0 mulps 20 * SIZE(AA), %xmm3 addps %xmm0, %xmm4 movaps 32 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 12 * SIZE(BB), %xmm3 mulps %xmm3, %xmm1 mulps 28 * SIZE(AA), %xmm3 addps %xmm1, %xmm6 movaps 40 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 24 * SIZE(BB), %xmm3 mulps %xmm2, %xmm0 mulps 36 * SIZE(AA), %xmm2 addps %xmm0, %xmm4 movaps 48 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 20 * SIZE(BB), %xmm2 mulps %xmm2, %xmm1 mulps 44 * SIZE(AA), %xmm2 addps %xmm1, %xmm6 movaps 56 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm3, %xmm0 mulps 52 * SIZE(AA), %xmm3 addps %xmm0, %xmm4 movaps 64 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 28 * SIZE(BB), %xmm3 mulps %xmm3, %xmm1 mulps 60 * SIZE(AA), %xmm3 addps %xmm1, %xmm6 movaps 72 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 40 * SIZE(BB), %xmm3 addl $64 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L111 ALIGN_2 #endif .L112: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L114 .L113: movaps 0 * SIZE(BB), %xmm2 movaps 0 * SIZE(AA), %xmm0 mulps %xmm2, %xmm0 addps %xmm0, %xmm4 mulps 4 * SIZE(AA), %xmm2 addps %xmm2, %xmm5 addl $8 * SIZE, AA addl $4 * SIZE, BB subl $1, %eax jg .L113 ALIGN_4 .L114: addps %xmm6, %xmm4 addps %xmm7, %xmm5 MOVSD 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 MOVSD 4 * SIZE(%esi), %xmm1 movhps 6 * SIZE(%esi), %xmm1 #ifdef HAVE_SSE2 pshufd $0x50, %xmm4, %xmm2 #else movaps %xmm4, %xmm2 shufps $0x50, %xmm2, %xmm2 #endif shufps $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 4 * SIZE(%esi) movhps %xmm1, 6 * SIZE(%esi) MOVSD 8 * SIZE(%esi), %xmm0 movhps 10 * SIZE(%esi), %xmm0 MOVSD 12 * SIZE(%esi), %xmm1 movhps 14 * SIZE(%esi), %xmm1 #ifdef HAVE_SSE2 pshufd $0x50, %xmm5, %xmm2 #else movaps %xmm5, %xmm2 shufps $0x50, %xmm2, %xmm2 #endif shufps $0xfa, %xmm5, %xmm5 mulps %xmm3, %xmm2 mulps %xmm3, %xmm5 addps %xmm2, 
%xmm0 addps %xmm5, %xmm1 movlps %xmm0, 8 * SIZE(%esi) movhps %xmm0, 10 * SIZE(%esi) movlps %xmm1, 12 * SIZE(%esi) movhps %xmm1, 14 * SIZE(%esi) addl $16 * SIZE, %esi BRANCH decl %ebx # i -- jg .L110 ALIGN_2 .L130: movl M, %ebx andl $7, %ebx jle .L999 testl $4, %ebx jle .L150 #if (L1_DATA_LINESIZE == 64) #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 XORPS %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movaps 16 * SIZE + BUFFER, %xmm3 XORPS %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 XORPS %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 XORPS %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L132 ALIGN_2 .L131: mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 mulps 4 * SIZE(BB), %xmm0 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 8 * SIZE(AA), %xmm0 mulps 8 * SIZE(BB), %xmm0 addps %xmm0, %xmm6 movaps 12 * SIZE(AA), %xmm0 mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 mulps %xmm1, %xmm3 movaps 20 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 mulps 20 * SIZE(BB), %xmm1 movaps 48 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 24 * SIZE(AA), %xmm1 mulps 24 * SIZE(BB), %xmm1 addps %xmm1, %xmm6 movaps 28 * SIZE(AA), %xmm1 mulps 28 * SIZE(BB), %xmm1 addps %xmm1, %xmm7 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L131 ALIGN_2 #else #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 XORPS %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movaps 8 * SIZE + BUFFER, %xmm3 XORPS %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 XORPS %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 XORPS %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L132 ALIGN_2 .L131: mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 mulps 4 * SIZE(BB), %xmm0 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm3 movaps 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm6 mulps 12 * SIZE(BB), %xmm1 movaps 24 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 24 * SIZE(AA), %xmm1 mulps %xmm0, %xmm2 movaps 20 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 mulps 20 * SIZE(BB), %xmm0 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 mulps %xmm1, %xmm3 movaps 28 * SIZE(AA), %xmm1 addps %xmm3, %xmm6 mulps 28 * SIZE(BB), %xmm1 
movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 40 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L131 ALIGN_2 #endif .L132: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L134 .L133: movaps 0 * SIZE(BB), %xmm2 movaps 0 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L133 ALIGN_4 .L134: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 MOVSD 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 MOVSD 4 * SIZE(%esi), %xmm1 movhps 6 * SIZE(%esi), %xmm1 #ifdef HAVE_SSE2 pshufd $0x50, %xmm4, %xmm2 #else movaps %xmm4, %xmm2 shufps $0x50, %xmm2, %xmm2 #endif shufps $0xfa, %xmm4, %xmm4 mulps %xmm3, %xmm2 mulps %xmm3, %xmm4 addps %xmm2, %xmm0 addps %xmm4, %xmm1 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) movlps %xmm1, 4 * SIZE(%esi) movhps %xmm1, 6 * SIZE(%esi) addl $8 * SIZE, %esi ALIGN_2 .L150: testl $2, %ebx jle .L170 #if (L1_DATA_LINESIZE == 64) #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB MOVSD 0 * SIZE + BUFFER, %xmm2 XORPS %xmm4, %xmm4 MOVSD 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 MOVSD 16 * SIZE + BUFFER, %xmm3 XORPS %xmm6, %xmm6 MOVSD 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB /* because it's doubled */ MOVSD 0 * SIZE(BB), %xmm2 XORPS %xmm4, %xmm4 MOVSD 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 MOVSD 16 * SIZE(BB), %xmm3 XORPS %xmm6, %xmm6 MOVSD 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L152 ALIGN_2 .L151: mulps %xmm0, %xmm2 MOVSD 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 MOVSD 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 MOVSD 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 MOVSD 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 MOVSD 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 MOVSD 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 MOVSD 16 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 MOVSD 32 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 MOVSD 10 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 MOVSD 20 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 MOVSD 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 MOVSD 24 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 MOVSD 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm6 MOVSD 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 MOVSD 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 MOVSD 48 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L151 ALIGN_2 #else #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB MOVSD 0 * SIZE + BUFFER, %xmm2 XORPS %xmm4, %xmm4 MOVSD 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 MOVSD 8 * SIZE + BUFFER, %xmm3 XORPS %xmm6, %xmm6 MOVSD 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB /* because it's doubled */ MOVSD 0 * SIZE(BB), %xmm2 XORPS %xmm4, %xmm4 MOVSD 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 MOVSD 8 * SIZE(BB), %xmm3 XORPS %xmm6, %xmm6 MOVSD 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L152 ALIGN_2 .L151: mulps %xmm0, %xmm2 MOVSD 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 MOVSD 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 MOVSD 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 MOVSD 16 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 MOVSD 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm6 MOVSD 12 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 MOVSD 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 MOVSD 24 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 MOVSD 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm4 MOVSD 20 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 MOVSD 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 MOVSD 32 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 MOVSD 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm6 MOVSD 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 MOVSD 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 MOVSD 40 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L151 ALIGN_2 #endif .L152: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L154 .L153: mulps %xmm0, %xmm2 MOVSD 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 MOVSD 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L153 ALIGN_4 .L154: addps %xmm5, %xmm4 addps %xmm7, %xmm6 addps %xmm6, %xmm4 MOVSD 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 #ifdef HAVE_SSE2 pshufd $0x50, %xmm4, %xmm2 #else movaps %xmm4, %xmm2 shufps $0x50, %xmm2, %xmm2 #endif mulps %xmm3, %xmm2 addps %xmm2, %xmm0 movlps %xmm0, 0 * SIZE(%esi) movhps %xmm0, 2 * SIZE(%esi) addl $4 * SIZE, %esi ALIGN_2 .L170: testl $1, %ebx jle .L999 #if (L1_DATA_LINESIZE == 64) #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movss 0 * SIZE + BUFFER, %xmm2 XORPS %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movss 16 * SIZE + BUFFER, %xmm3 XORPS %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB /* because it's doubled */ movss 0 * SIZE(BB), %xmm2 XORPS %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movss 16 * SIZE(BB), %xmm3 XORPS %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L172 ALIGN_2 .L171: mulss %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 mulss 4 * SIZE(BB), %xmm0 movss 32 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 2 * SIZE(AA), %xmm0 mulss 8 * SIZE(BB), %xmm0 addss %xmm0, %xmm6 movss 3 * SIZE(AA), %xmm0 mulss 12 * SIZE(BB), %xmm0 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm3 movss 5 * SIZE(AA), %xmm1 addss %xmm3, %xmm4 mulss 20 * SIZE(BB), %xmm1 movss 48 * SIZE(BB), %xmm3 addss %xmm1, %xmm5 movss 6 * SIZE(AA), %xmm1 mulss 24 * SIZE(BB), %xmm1 addss %xmm1, %xmm6 movss 7 * SIZE(AA), %xmm1 mulss 28 * SIZE(BB), %xmm1 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L171 ALIGN_2 #else #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 
BUFFER, BB movss 0 * SIZE + BUFFER, %xmm2 XORPS %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movss 8 * SIZE + BUFFER, %xmm3 XORPS %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB /* because it's doubled */ movss 0 * SIZE(BB), %xmm2 XORPS %xmm4, %xmm4 movss 0 * SIZE(AA), %xmm0 XORPS %xmm5, %xmm5 movss 8 * SIZE(BB), %xmm3 XORPS %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L172 ALIGN_2 .L171: mulss %xmm0, %xmm2 movss 1 * SIZE(AA), %xmm0 addss %xmm2, %xmm4 mulss 4 * SIZE(BB), %xmm0 movss 16 * SIZE(BB), %xmm2 addss %xmm0, %xmm5 movss 2 * SIZE(AA), %xmm0 mulss %xmm0, %xmm3 movss 3 * SIZE(AA), %xmm0 addss %xmm3, %xmm6 mulss 12 * SIZE(BB), %xmm0 movss 24 * SIZE(BB), %xmm3 addss %xmm0, %xmm7 movss 8 * SIZE(AA), %xmm0 mulss %xmm1, %xmm2 movss 5 * SIZE(AA), %xmm1 addss %xmm2, %xmm4 mulss 20 * SIZE(BB), %xmm1 movss 32 * SIZE(BB), %xmm2 addss %xmm1, %xmm5 movss 6 * SIZE(AA), %xmm1 mulss %xmm1, %xmm3 movss 7 * SIZE(AA), %xmm1 addss %xmm3, %xmm6 mulss 28 * SIZE(BB), %xmm1 movss 40 * SIZE(BB), %xmm3 addss %xmm1, %xmm7 movss 12 * SIZE(AA), %xmm1 addl $ 8 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L171 ALIGN_2 #endif .L172: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L174 .L173: movss 0 * SIZE(AA), %xmm0 movss 0 * SIZE(BB), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm4 addl $1 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L173 ALIGN_4 .L174: addss %xmm5, %xmm4 addss %xmm7, %xmm6 addss %xmm6, %xmm4 MOVSD 0 * SIZE(%esi), %xmm0 #ifdef HAVE_SSE2 pshufd $0x50, %xmm4, %xmm2 #else movaps %xmm4, %xmm2 shufps $0x50, %xmm2, %xmm2 #endif mulps %xmm3, %xmm2 addps %xmm2, %xmm0 movlps %xmm0, 0 * SIZE(%esi) ALIGN_2 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_2 EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm_beta.S000066400000000000000000000126641313527062700173200ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #ifdef DOUBLE #define BETA_R 16 + STACK + ARGS(%esp) #define BETA_I 24 + STACK + ARGS(%esp) #define C 48 + STACK + ARGS(%esp) #define LDC 52 + STACK + ARGS(%esp) #else #define BETA_R 16 + STACK + ARGS(%esp) #define BETA_I 20 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define LDC 44 + STACK + ARGS(%esp) #endif PROLOGUE pushl %ebp pushl %edi pushl %esi PROFCODE movl M, %ebp movl N, %ecx movl LDC, %edx movl C, %edi FLD BETA_R FLD BETA_I testl %ebp, %ebp # if n <= 0 goto End jle .L83 testl %ecx, %ecx # if m <= 0 goto End jle .L83 fld %st(1) fabs fld %st(1) fabs faddp %st, %st(1) sall $ZBASE_SHIFT, %edx ftst fnstsw %ax andb $68, %ah #ifndef C_SUN ffreep %st(0) #else .byte 0xdf .byte 0xc0 #endif je .L71 ALIGN_2 .L53: movl %edi, %esi # c_offset1 = c_offset addl %edx, %edi # c_offset += ldc movl %ebp, %eax sarl $2, %eax jle .L56 ALIGN_2 .L57: #if defined(HAS_PREFETCH) && defined(PENTIUM3) prefetchnta 16 * SIZE(%esi) prefetchnta 24 * SIZE(%esi) #endif FSTU 0 * SIZE(%esi) # c_offset1 FSTU 1 * SIZE(%esi) FSTU 2 * SIZE(%esi) FSTU 3 * SIZE(%esi) FSTU 4 * SIZE(%esi) FSTU 5 * SIZE(%esi) FSTU 6 * SIZE(%esi) FSTU 7 * SIZE(%esi) addl $8 * SIZE, %esi # c_offset1 += 8 decl %eax # i-- jg .L57 ALIGN_2 .L56: movl %ebp, %eax andl $3, %eax jle .L62 ALIGN_2 .L63: FSTU 0 * SIZE(%esi) FSTU 1 * SIZE(%esi) addl $2 * SIZE,%esi decl %eax jg .L63 ALIGN_2 .L62: decl %ecx # j -- jg .L53 jmp .L83 ALIGN_3 .L71: movl %edi, %esi addl %edx, %edi # c_offset += ldc movl %ebp, %eax sarl $1, %eax jle .L84 ALIGN_3 .L85: #if defined(HAS_PREFETCH) && defined(PENTIUM3) prefetchnta 16 * SIZE(%esi) #endif fld %st(0) FMUL 0 * SIZE(%esi) fld %st(2) FMUL 1 * SIZE(%esi) faddp %st,%st(1) fld %st(2) FMUL 0 * SIZE(%esi) fld %st(2) FMUL 1 * SIZE(%esi) fsubrp %st,%st(1) FST 0 * SIZE(%esi) FST 1 * SIZE(%esi) fld %st(0) FMUL 2 * SIZE(%esi) fld %st(2) FMUL 3 * SIZE(%esi) faddp %st,%st(1) fld %st(2) FMUL 2 * SIZE(%esi) fld %st(2) FMUL 3 * SIZE(%esi) fsubrp %st,%st(1) FST 2 * SIZE(%esi) FST 3 * SIZE(%esi) addl $4 * SIZE, %esi decl %eax jg .L85 ALIGN_3 .L84: movl %ebp, %eax andl $1, %eax jle .L74 ALIGN_3 .L75: #if defined(HAS_PREFETCH) && defined(PENTIUM3) prefetchnta 16 * SIZE(%esi) #endif fld %st(0) FMUL 0 * SIZE(%esi) fld %st(2) FMUL 1 * SIZE(%esi) faddp %st,%st(1) fld %st(2) FMUL 0 * SIZE(%esi) fld %st(2) FMUL 1 * SIZE(%esi) fsubrp %st,%st(1) FST 0 * SIZE(%esi) FST 1 * SIZE(%esi) ALIGN_2 .L74: decl %ecx jg .L71 ALIGN_2 .L83: #ifndef C_SUN ffreep %st(0) ffreep %st(0) #else .byte 0xdf .byte 0xc0 .byte 0xdf 
.byte 0xc0 #endif popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm_kernel_1x1.S000066400000000000000000000222531313527062700203510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define BX 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_N 8 + STACK + ARGS(%esp) #define STACK_K 12 + STACK + ARGS(%esp) #ifdef DOUBLE #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 24 + STACK + ARGS(%esp) #define STACK_A 32 + STACK + ARGS(%esp) #define STACK_B 36 + STACK + ARGS(%esp) #define STACK_C 40 + STACK + ARGS(%esp) #define STACK_LDC 44 + STACK + ARGS(%esp) #define OFFSET 48 + STACK + ARGS(%esp) #else #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 20 + STACK + ARGS(%esp) #define STACK_A 24 + STACK + ARGS(%esp) #define STACK_B 28 + STACK + ARGS(%esp) #define STACK_C 32 + STACK + ARGS(%esp) #define STACK_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #endif PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE #define M %esi #define K %edi #define A %ebx #define B %ecx #define C %edx #define LDC %ebp #if defined(TRMMKERNEL) && !defined(LEFT) movl OFFSET, %eax negl %eax movl %eax, KK #endif movl STACK_K, K movl STACK_LDC, LDC sall $ZBASE_SHIFT, LDC cmpl $0, STACK_N jle .L29 cmpl $0, STACK_M jle .L29 ALIGN_4 .L30: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl %ebx, BX movl STACK_A, A movl STACK_C, C movl STACK_M, M ALIGN_4 .L34: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl STACK_B, B #else movl STACK_B, B movl KK, %eax leal (, %eax, SIZE), %eax leal (A, %eax, 2), A leal (B, %eax, 2), B #endif #ifdef HAVE_SSE movl BX, %eax prefetcht2 0 * SIZE(%eax) prefetcht2 4 * SIZE(%eax) #if L2_SIZE > 262144 subl $-8 * SIZE, BX #elif L2_SIZE > 131072 prefetcht2 8 * SIZE(%eax) prefetcht2 12 * SIZE(%eax) subl $-16 * SIZE, BX #else prefetcht2 16 * SIZE(%eax) prefetcht2 20 * SIZE(%eax) prefetcht2 24 * SIZE(%eax) prefetcht2 28 * SIZE(%eax) subl $-32 * SIZE, BX #endif #endif fldz fldz fldz fldz FLD 4 * SIZE(B) # B5 FLD 4 * SIZE(A) # A5 FLD 0 * SIZE(B) # B0 FLD 0 * SIZE(A) # A0 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif #ifdef HAVE_SSE prefetcht2 2 * SIZE(C) #endif sarl $2, %eax je .L37 ALIGN_4 #define PREFETCH_OFFSET 40 .L38: #ifdef HAVE_SSE prefetchnta (PREFETCH_OFFSET) * SIZE(B) #ifdef CORE_KATMAI prefetcht0 (PREFETCH_OFFSET) * SIZE(A) #endif #endif fmul %st, %st(1) FMUL 1 * SIZE(B) fxch %st(1) faddp %st, %st(5) FLD 0 * SIZE(B) fxch %st(1) #if defined(NN) || defined(CN) faddp %st, %st(4) #else fsubrp %st, %st(4) #endif FLD 1 * SIZE(A) fmul %st, %st(1) FMUL 1 * SIZE(B) fxch %st(1) #if defined(NN) || defined(NC) faddp %st, %st(7) #else fsubrp %st, %st(7) #endif FLD 2 * SIZE(B) fxch %st(1) #if defined(NN) || defined(CC) fsubrp %st, %st(6) #else faddp %st, %st(6) #endif FLD 2 * SIZE(A) fmul %st, %st(1) FMUL 3 * SIZE(B) fxch %st(1) faddp %st, %st(5) FLD 2 * SIZE(B) fxch %st(1) #if defined(NN) || defined(CN) faddp %st, %st(4) #else fsubrp %st, %st(4) #endif FLD 3 * SIZE(A) fmul %st, %st(1) FMUL 3 * SIZE(B) fxch %st(1) #if defined(NN) || defined(NC) faddp %st, %st(7) #else fsubrp %st, %st(7) #endif FLD 8 * SIZE(B) fxch %st(1) #if defined(NN) || defined(CC) fsubrp %st, %st(6) 
#else faddp %st, %st(6) #endif FLD 8 * SIZE(A) fxch %st(2) #ifdef HAVE_SSE #ifdef DOUBLE prefetchnta (PREFETCH_OFFSET + 4) * SIZE(B) #ifdef CORE_KATMAI prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(A) #endif #endif #endif fmul %st, %st(3) FMUL 5 * SIZE(B) fxch %st(3) faddp %st, %st(5) FLD 4 * SIZE(B) fxch %st(3) #if defined(NN) || defined(CN) faddp %st, %st(4) #else fsubrp %st, %st(4) #endif FLD 5 * SIZE(A) fmul %st, %st(3) FMUL 5 * SIZE(B) fxch %st(3) #if defined(NN) || defined(NC) faddp %st, %st(7) #else fsubrp %st, %st(7) #endif FLD 6 * SIZE(B) fxch %st(3) #if defined(NN) || defined(CC) fsubrp %st, %st(6) #else faddp %st, %st(6) #endif FLD 6 * SIZE(A) fmul %st, %st(3) FMUL 7 * SIZE(B) fxch %st(3) faddp %st, %st(5) FLD 6 * SIZE(B) fxch %st(3) #if defined(NN) || defined(CN) faddp %st, %st(4) #else fsubrp %st, %st(4) #endif FLD 7 * SIZE(A) fmul %st, %st(3) FMUL 7 * SIZE(B) fxch %st(3) #if defined(NN) || defined(NC) faddp %st, %st(7) #else fsubrp %st, %st(7) #endif FLD 12 * SIZE(B) fxch %st(3) #if defined(NN) || defined(CC) fsubrp %st, %st(6) #else faddp %st, %st(6) #endif FLD 12 * SIZE(A) fxch %st(2) subl $-8 * SIZE, B subl $-8 * SIZE, A decl %eax jg .L38 ALIGN_4 .L37: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $3, %eax jle .L43 ALIGN_2 .L54: fmul %st, %st(1) FMUL 1 * SIZE(B) fxch %st(1) faddp %st, %st(5) FLD 0 * SIZE(B) fxch %st(1) #if defined(NN) || defined(CN) faddp %st, %st(4) #else fsubrp %st, %st(4) #endif FLD 1 * SIZE(A) fmul %st, %st(1) FMUL 1 * SIZE(B) fxch %st(1) #if defined(NN) || defined(NC) faddp %st, %st(7) #else fsubrp %st, %st(7) #endif FLD 2 * SIZE(B) fxch %st(1) #if defined(NN) || defined(CC) fsubrp %st, %st(6) #else faddp %st, %st(6) #endif FLD 2 * SIZE(A) addl $2 * SIZE, A addl $2 * SIZE, B decl %eax jg .L54 ALIGN_3 .L43: ffreep %st(0) ffreep %st(0) ffreep %st(0) ffreep %st(0) FLD ALPHA_R fxch %st(3) FLD ALPHA_I fxch %st(5) faddp %st, %st(2) # ctemp3 += ctemp4 faddp %st, %st(2) # ctemp1 += ctemp2 fld %st(0) # copy ctemp2 fmul %st(4), %st # ctemp3 *= alpha_i fld %st(2) # copy ctemp1 fmul %st(4), %st # ctemp1 *= alpha_r fsubp %st, %st(1) # ctemp2 -= ctemp4 #ifndef TRMMKERNEL FADD 0 * SIZE(C) #endif FST 0 * SIZE(C) fmulp %st, %st(2) # ctemp3 *= alpha_i fmulp %st, %st(2) # ctemp1 *= alpha_r faddp %st, %st(1) # ctemp1 += ctemp3 #ifndef TRMMKERNEL FADD 1 * SIZE(C) #endif FST 1 * SIZE(C) addl $2 * SIZE, C #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (A, %eax, 2), A leal (B, %eax, 2), B #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif decl M jg .L34 ALIGN_2 .L33: #if defined(TRMMKERNEL) && !defined(LEFT) addl $1, KK #endif movl B, STACK_B addl LDC, STACK_C decl STACK_N jg .L30 ALIGN_2 .L29: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm_kernel_1x1_atom.S000066400000000000000000000175441313527062700214000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 24 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define ARG_B 36 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define ARG_LDC 44 + STACK + ARGS(%esp) #define OFFSET 48 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #define PREFETCH prefetcht0 #define PREFETCHSIZE 84 #define AA %edx #define BB %ecx #define CO1 %esi #define LDC %ebp #define B %edi #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADDSD1 addsd #define ADDSD2 addsd #define ADDSD3 addsd #define ADDSD4 subsd #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define ADDSD1 addsd #define ADDSD2 subsd #define ADDSD3 addsd #define ADDSD4 addsd #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADDSD1 addsd #define ADDSD2 addsd #define ADDSD3 subsd #define ADDSD4 addsd #else #define ADDSD1 addsd #define ADDSD2 subsd #define ADDSD3 subsd #define ADDSD4 subsd #endif PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC #ifdef TRMMKERNEL movl OFFSET, %eax #ifndef LEFT negl %eax #endif movl %eax, KK #endif sall $ZBASE_SHIFT, LDC movl M, %ebx testl %ebx, %ebx jle .L999 movl N, %eax testl %eax, %eax movl %eax, J jle .L999 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl B, BX movl C, CO1 addl LDC, C movl A, AA movl M, %ebx ALIGN_4 .L10: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB #endif movl BX, %eax prefetcht0 0 * SIZE(%eax) subl $-8 * SIZE, BX movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 prefetcht0 1 * SIZE(CO1) xorps %xmm5, %xmm5 
xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $2, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) ADDSD3 %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 0 * SIZE(BB), %xmm0 ADDSD4 %xmm3, %xmm7 mulsd 1 * SIZE(BB), %xmm1 ADDSD1 %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 0 * SIZE(BB), %xmm2 ADDSD2 %xmm1, %xmm5 mulsd 1 * SIZE(BB), %xmm3 ADDSD3 %xmm2, %xmm6 movsd 3 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 2 * SIZE(BB), %xmm0 ADDSD4 %xmm3, %xmm7 mulsd 3 * SIZE(BB), %xmm1 ADDSD1 %xmm0, %xmm4 movsd 4 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 2 * SIZE(BB), %xmm2 ADDSD2 %xmm1, %xmm5 mulsd 3 * SIZE(BB), %xmm3 ADDSD3 %xmm2, %xmm6 movsd 5 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 4 * SIZE(BB), %xmm0 ADDSD4 %xmm3, %xmm7 mulsd 5 * SIZE(BB), %xmm1 ADDSD1 %xmm0, %xmm4 movsd 6 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 4 * SIZE(BB), %xmm2 ADDSD2 %xmm1, %xmm5 mulsd 5 * SIZE(BB), %xmm3 ADDSD3 %xmm2, %xmm6 movsd 7 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 6 * SIZE(BB), %xmm0 ADDSD4 %xmm3, %xmm7 mulsd 7 * SIZE(BB), %xmm1 ADDSD1 %xmm0, %xmm4 movsd 8 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 6 * SIZE(BB), %xmm2 ADDSD2 %xmm1, %xmm5 mulsd 7 * SIZE(BB), %xmm3 addl $8 * SIZE, BB addl $8 * SIZE, AA decl %eax jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $3, %eax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: ADDSD3 %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 0 * SIZE(BB), %xmm0 ADDSD4 %xmm3, %xmm7 mulsd 1 * SIZE(BB), %xmm1 ADDSD1 %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 0 * SIZE(BB), %xmm2 ADDSD2 %xmm1, %xmm5 mulsd 1 * SIZE(BB), %xmm3 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: movsd ALPHA_R, %xmm0 movsd ALPHA_I, %xmm1 ADDSD3 %xmm2, %xmm6 ADDSD4 %xmm3, %xmm7 addsd %xmm7, %xmm4 addsd %xmm5, %xmm6 movaps %xmm4, %xmm5 movaps %xmm6, %xmm7 mulsd %xmm0, %xmm4 mulsd %xmm1, %xmm5 mulsd %xmm1, %xmm6 mulsd %xmm0, %xmm7 subsd %xmm6, %xmm4 addsd %xmm7, %xmm5 #ifndef TRMMKERNEL addsd 0 * SIZE(CO1), %xmm4 addsd 1 * SIZE(CO1), %xmm5 #endif movsd %xmm4, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif addl $2 * SIZE, CO1 decl %ebx jg .L10 ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $1, KK #endif movl BB, B decl J jg .L01 ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm_kernel_1x2.S000066400000000000000000000416571313527062700203630ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define J 0 + STACK(%esp) #define I 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #ifdef DOUBLE #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 24 + STACK + ARGS(%esp) #define STACK_A 32 + STACK + ARGS(%esp) #define STACK_B 36 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define STACK_LDC 44 + STACK + ARGS(%esp) #define OFFSET 48 + STACK + ARGS(%esp) #else #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 20 + STACK + ARGS(%esp) #define STACK_A 24 + STACK + ARGS(%esp) #define STACK_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define STACK_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #endif #define A %edx #define B %ecx #define BB %ebx #define LDC %ebp #define BX %esi #define ADD1 faddp #if defined(NN) || defined(CN) #define ADD2 faddp #else #define ADD2 fsubrp #endif #if defined(NN) || defined(CC) #define ADD3 fsubrp #else #define ADD3 faddp #endif #if defined(NN) || defined(NC) #define ADD4 faddp #else #define ADD4 fsubrp #endif #define PREFETCHSIZE (8 * 5 + 4) #define AOFFSET 1 #define BOFFSET 1 #ifdef HAVE_3DNOW #define PREFETCH prefetch #else #define PREFETCH prefetcht0 #endif #define KERNEL \ PREFETCH PREFETCHSIZE * SIZE + AOFFSET(A, %eax, 2);\ fmul %st(1), %st;\ ADD1 %st, %st(4);\ FLD -15 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD2 %st, %st(5);\ FLD -14 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD1 %st, %st(6);\ FMUL -13 * SIZE + BOFFSET(B, %eax, 4);\ ADD2 %st, %st(6);\ FLD -15 * SIZE + AOFFSET(A, %eax, 2);\ FLD -15 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD3 %st, %st(4);\ FLD -16 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD4 %st, %st(5);\ FLD -13 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD3 %st, %st(6);\ FMUL -14 * SIZE + BOFFSET(B, %eax, 4);\ ADD4 %st, %st(6);\ FLD -14 * SIZE + AOFFSET(A, %eax, 2);\ FLD -12 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD1 %st, %st(4);\ FLD -11 * SIZE + 
BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD2 %st, %st(5);\ FLD -10 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD1 %st, %st(6);\ FMUL -9 * SIZE + BOFFSET(B, %eax, 4);\ ADD2 %st, %st(6);\ FLD -13 * SIZE + AOFFSET(A, %eax, 2);\ FLD -11 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD3 %st, %st(4);\ FLD -12 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD4 %st, %st(5);\ FLD -9 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD3 %st, %st(6);\ FMUL -10 * SIZE + BOFFSET(B, %eax, 4);\ ADD4 %st, %st(6);\ FLD -12 * SIZE + AOFFSET(A, %eax, 2);\ FLD -8 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD1 %st, %st(4);\ FLD -7 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD2 %st, %st(5);\ FLD -6 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD1 %st, %st(6);\ FMUL -5 * SIZE + BOFFSET(B, %eax, 4);\ ADD2 %st, %st(6);\ FLD -11 * SIZE + AOFFSET(A, %eax, 2);\ FLD -7 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD3 %st, %st(4);\ FLD -8 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD4 %st, %st(5);\ FLD -5 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD3 %st, %st(6);\ FMUL -6 * SIZE + BOFFSET(B, %eax, 4);\ ADD4 %st, %st(6);\ FLD -10 * SIZE + AOFFSET(A, %eax, 2);\ FLD -4 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD1 %st, %st(4);\ FLD -3 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD2 %st, %st(5);\ FLD -2 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD1 %st, %st(6);\ FMUL -1 * SIZE + BOFFSET(B, %eax, 4);\ ADD2 %st, %st(6);\ FLD -9 * SIZE + AOFFSET(A, %eax, 2);\ FLD -3 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD3 %st, %st(4);\ FLD -4 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD4 %st, %st(5);\ FLD -1 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD3 %st, %st(6);\ FMUL -2 * SIZE + BOFFSET(B, %eax, 4);\ ADD4 %st, %st(6);\ FLD 8 * SIZE + AOFFSET(A, %eax, 2);\ fxch %st(1);\ FLD 0 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD1 %st, %st(4);\ FLD 1 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ PREFETCH (PREFETCHSIZE + 8) * SIZE + AOFFSET(A, %eax, 2);\ ADD2 %st, %st(5);\ FLD 2 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD1 %st, %st(6);\ FMUL 3 * SIZE + BOFFSET(B, %eax, 4);\ ADD2 %st, %st(6);\ FLD -7 * SIZE + AOFFSET(A, %eax, 2);\ FLD 1 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD3 %st, %st(4);\ FLD 0 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD4 %st, %st(5);\ FLD 3 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD3 %st, %st(6);\ FMUL 2 * SIZE + BOFFSET(B, %eax, 4);\ ADD4 %st, %st(6);\ FLD -6 * SIZE + AOFFSET(A, %eax, 2);\ FLD 4 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD1 %st, %st(4);\ FLD 5 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD2 %st, %st(5);\ FLD 6 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD1 %st, %st(6);\ FMUL 7 * SIZE + BOFFSET(B, %eax, 4);\ ADD2 %st, %st(6);\ FLD -5 * SIZE + AOFFSET(A, %eax, 2);\ FLD 5 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD3 %st, %st(4);\ FLD 4 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD4 %st, %st(5);\ FLD 7 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD3 %st, %st(6);\ FMUL 6 * SIZE + BOFFSET(B, %eax, 4);\ ADD4 %st, %st(6);\ FLD -4 * SIZE + AOFFSET(A, %eax, 2);\ FLD 8 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD1 %st, %st(4);\ FLD 9 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD2 %st, %st(5);\ FLD 10 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD1 %st, %st(6);\ FMUL 11 * SIZE + BOFFSET(B, %eax, 4);\ ADD2 %st, %st(6);\ FLD -3 * SIZE + AOFFSET(A, %eax, 2);\ FLD 9 * SIZE + 
BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD3 %st, %st(4);\ FLD 8 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD4 %st, %st(5);\ FLD 11 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD3 %st, %st(6);\ FMUL 10 * SIZE + BOFFSET(B, %eax, 4);\ ADD4 %st, %st(6);\ FLD -2 * SIZE + AOFFSET(A, %eax, 2);\ FLD 12 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD1 %st, %st(4);\ FLD 13 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD2 %st, %st(5);\ FLD 14 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD1 %st, %st(6);\ FMUL 15 * SIZE + BOFFSET(B, %eax, 4);\ ADD2 %st, %st(6);\ FLD -1 * SIZE + AOFFSET(A, %eax, 2);\ FLD 13 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD3 %st, %st(4);\ FLD 12 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD4 %st, %st(5);\ FLD 15 * SIZE + BOFFSET(B, %eax, 4);\ fmul %st(1), %st;\ ADD3 %st, %st(6);\ FMUL 14 * SIZE + BOFFSET(B, %eax, 4);\ ADD4 %st, %st(6);\ FLD 16 * SIZE + AOFFSET(A, %eax, 2);\ fxch %st(2);\ FLD 0 * SIZE + BOFFSET(BB, %eax, 4);\ subl $-8 * SIZE, %eax /* A hint of scheduling is received from following URL http://www.netlib.org/atlas/atlas-comm/msg00260.html */ PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(TRMMKERNEL) && !defined(LEFT) movl OFFSET, %eax negl %eax movl %eax, KK #endif movl STACK_LDC, LDC sall $ZBASE_SHIFT, LDC subl $(AOFFSET - 16 * SIZE), STACK_A subl $(BOFFSET - 16 * SIZE), STACK_B movl M, %eax testl %eax, %eax jle .L999 movl N, %eax testl %eax, %eax jle .L999 movl K, %eax testl %eax, %eax jle .L999 movl N, %eax sarl $1, %eax movl %eax, J je .L20 ALIGN_3 .L11: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl STACK_A, A movl STACK_B, B movl C, %edi movl K, BX sall $ZBASE_SHIFT + 1, BX addl B, BX movl M, %eax movl %eax, I ALIGN_3 .L14: prefetchnta -16 * SIZE + BOFFSET(BX) prefetchnta -8 * SIZE + BOFFSET(BX) subl $-16 * SIZE, BX movl STACK_B, B #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #else movl KK, %eax leal (, %eax, SIZE), %eax leal (A, %eax, 2), A leal (B, %eax, 4), B #endif fldz fldz fldz fldz FLD 0 * SIZE + AOFFSET(A) FLD -8 * SIZE + AOFFSET(A) FLD -16 * SIZE + AOFFSET(A) FLD -16 * SIZE + BOFFSET(B) #ifdef HAVE_3DNOW prefetchw 1 * SIZE(%edi) prefetchw 2 * SIZE(%edi, LDC) #elif defined(HAVE_SSE) prefetcht0 1 * SIZE(%edi) prefetcht0 2 * SIZE(%edi, LDC) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif andl $-8, %eax leal (, %eax, SIZE), %eax leal (A, %eax, 2), A leal 16 * SIZE(B, %eax, 4), BB leal (B, %eax, 4), B negl %eax NOBRANCH je .L16 ALIGN_4 .L15: KERNEL jge .L16 KERNEL jge .L16 KERNEL jge .L16 KERNEL jl .L15 ALIGN_4 .L16: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif and $7, %eax je .L19 ALIGN_4 .L17: fmul %st(1), %st ADD1 %st, %st(4) FLD -15 * SIZE + BOFFSET(B) fmul %st(1), %st ADD2 %st, %st(5) FLD -14 * SIZE + BOFFSET(B) fmul %st(1), %st ADD1 %st, %st(6) FMUL -13 * SIZE + BOFFSET(B) ADD2 %st, %st(6) FLD -15 * SIZE + AOFFSET(A) FLD -15 * SIZE + BOFFSET(B) fmul %st(1), %st ADD3 %st, %st(4) FLD -16 * SIZE + BOFFSET(B) fmul %st(1), %st ADD4 %st, %st(5) FLD -13 * SIZE + BOFFSET(B) fmul %st(1), %st ADD3 %st, %st(6) FMUL -14 * SIZE + BOFFSET(B) ADD4 %st, %st(6) FLD -14 * SIZE + 
AOFFSET(A) FLD -12 * SIZE + BOFFSET(B) addl $2 * SIZE,A addl $4 * SIZE,B decl %eax jne .L17 ALIGN_4 .L19: ffreep %st(0) ffreep %st(0) ffreep %st(0) ffreep %st(0) FLD ALPHA_R fmul %st(1), %st FLD ALPHA_I fmul %st(3), %st fsubrp %st, %st(1) fxch %st(2) FMUL ALPHA_R fxch %st(1) FMUL ALPHA_I faddp %st, %st(1) #ifndef TRMMKERNEL FADD 1 * SIZE(%edi) FST 1 * SIZE(%edi) FADD 0 * SIZE(%edi) FST 0 * SIZE(%edi) #else FST 1 * SIZE(%edi) FST 0 * SIZE(%edi) #endif FLD ALPHA_R fmul %st(1), %st FLD ALPHA_I fmul %st(3), %st fsubrp %st, %st(1) fxch %st(2) FMUL ALPHA_R fxch %st(1) FMUL ALPHA_I faddp %st, %st(1) #ifndef TRMMKERNEL FADD 1 * SIZE(%edi,LDC) FST 1 * SIZE(%edi,LDC) FADD 0 * SIZE(%edi,LDC) FST 0 * SIZE(%edi,LDC) #else FST 1 * SIZE(%edi,LDC) FST 0 * SIZE(%edi,LDC) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (A, %eax, 2), A leal (B, %eax, 4), B #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif addl $2 * SIZE, %edi decl I jne .L14 #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax addl %eax, C movl B, STACK_B decl J jne .L11 ALIGN_4 .L20: movl N, %eax andl $1, %eax je .L999 ALIGN_3 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl STACK_A, A movl STACK_B, B movl C, %edi movl M, %eax movl %eax, I ALIGN_3 .L24: movl STACK_B, B #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #else movl KK, %eax leal (, %eax, SIZE), %eax leal (A, %eax, 2), A leal (B, %eax, 2), B #endif fldz fldz fldz fldz FLD -16 * SIZE + AOFFSET(A) FLD -16 * SIZE + BOFFSET(B) prefetchw 1 * SIZE(%edi) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $2, %eax je .L26 ALIGN_3 .L25: fmul %st(1), %st PADDING ADD1 %st, %st(2) FMUL -15 * SIZE + BOFFSET(B) ADD2 %st, %st(2) FLD -15 * SIZE + AOFFSET(A) FLD -16 * SIZE + BOFFSET(B) fmul %st(1), %st PADDING ADD4 %st, %st(4) FMUL -15 * SIZE + BOFFSET(B) ADD3 %st, %st(4) FLD -14 * SIZE + AOFFSET(A) FLD -14 * SIZE + BOFFSET(B) fmul %st(1), %st PADDING ADD1 %st, %st(2) FMUL -13 * SIZE + BOFFSET(B) ADD2 %st, %st(2) FLD -13 * SIZE + AOFFSET(A) FLD -14 * SIZE + BOFFSET(B) fmul %st(1), %st PADDING ADD4 %st, %st(4) FMUL -13 * SIZE + BOFFSET(B) ADD3 %st, %st(4) FLD -12 * SIZE + AOFFSET(A) FLD -12 * SIZE + BOFFSET(B) fmul %st(1), %st PADDING ADD1 %st, %st(2) FMUL -11 * SIZE + BOFFSET(B) ADD2 %st, %st(2) FLD -11 * SIZE + AOFFSET(A) FLD -12 * SIZE + BOFFSET(B) fmul %st(1), %st PADDING ADD4 %st, %st(4) FMUL -11 * SIZE + BOFFSET(B) ADD3 %st, %st(4) FLD -10 * SIZE + AOFFSET(A) FLD -10 * SIZE + BOFFSET(B) fmul %st(1), %st PADDING ADD1 %st, %st(2) FMUL -9 * SIZE + BOFFSET(B) ADD2 %st, %st(2) FLD -9 * SIZE + AOFFSET(A) FLD -10 * SIZE + BOFFSET(B) fmul %st(1), %st PADDING ADD4 %st, %st(4) FMUL -9 * SIZE + BOFFSET(B) ADD3 %st, %st(4) FLD -8 * SIZE + AOFFSET(A) FLD -8 * SIZE + BOFFSET(B) addl $8 * SIZE,A addl $8 * SIZE,B decl %eax jne .L25 ALIGN_4 .L26: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif and $3, %eax je .L29 ALIGN_4 .L27: fmul %st(1), %st PADDING ADD1 %st, %st(2) FMUL -15 * SIZE + BOFFSET(B) ADD2 %st, %st(2) FLD -15 * SIZE + AOFFSET(A) FLD -16 * SIZE + BOFFSET(B) 
fmul %st(1), %st PADDING ADD4 %st, %st(4) FMUL -15 * SIZE + BOFFSET(B) ADD3 %st, %st(4) FLD -14 * SIZE + AOFFSET(A) FLD -14 * SIZE + BOFFSET(B) addl $2 * SIZE,A addl $2 * SIZE,B decl %eax jne .L27 ALIGN_4 .L29: ffreep %st(0) ffreep %st(0) faddp %st, %st(3) faddp %st, %st(1) fxch %st(1) FLD ALPHA_R fmul %st(1), %st FLD ALPHA_I fmul %st(3), %st fsubrp %st, %st(1) fxch %st(2) FMUL ALPHA_R fxch %st(1) FMUL ALPHA_I faddp %st, %st(1) #ifndef TRMMKERNEL FADD 1 * SIZE(%edi) FST 1 * SIZE(%edi) FADD 0 * SIZE(%edi) FST 0 * SIZE(%edi) #else FST 1 * SIZE(%edi) FST 0 * SIZE(%edi) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (A, %eax, 2), A leal (B, %eax, 2), B #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif addl $2 * SIZE, %edi decl I jne .L24 #if defined(TRMMKERNEL) && !defined(LEFT) addl $1, KK #endif addl LDC, C movl B, STACK_B ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm_kernel_1x2_3dnow.S000066400000000000000000000534331313527062700214700ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define OLD_M 4 + STACK + ARGS(%esi) #define OLD_N 8 + STACK + ARGS(%esi) #define OLD_K 12 + STACK + ARGS(%esi) #define OLD_ALPHA_R 16 + STACK + ARGS(%esi) #define OLD_ALPHA_I 20 + STACK + ARGS(%esi) #define OLD_A 24 + STACK + ARGS(%esi) #define OLD_B 28 + STACK + ARGS(%esi) #define OLD_C 32 + STACK + ARGS(%esi) #define OLD_LDC 36 + STACK + ARGS(%esi) #define OLD_OFFSET 40 + STACK + ARGS(%esi) #define GAMMA_R 0(%esp) #define GAMMA_I 8(%esp) #define ALPHA 16(%esp) #define K 24(%esp) #define N 28(%esp) #define M 32(%esp) #define A 36(%esp) #define C 40(%esp) #define J 44(%esp) #define OLD_STACK 48(%esp) #define OFFSET 52(%esp) #define KK 56(%esp) #define KKK 60(%esp) #define BUFFER 128(%esp) #define AA %edx #define BB %ecx #define PREFETCHSIZE (16 * 2 + 6) #define AOFFSET -32 #define BOFFSET 128 /* A hint of scheduling is received from following URL https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=flat&viewmonth=200309&viewday=11 */ PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE, %esp movl OLD_M, %ebx andl $-1024, %esp # align stack STACK_TOUCHING movl OLD_N, %eax movl OLD_K, %ecx movl OLD_A, %edx movl %ebx, M movl %eax, N movl %ecx, K subl $AOFFSET * SIZE, %edx movl %edx, A movl %esi, OLD_STACK testl %ebx, %ebx jle .L999 movl OLD_B, %edi movl OLD_C, %ebx EMMS movd OLD_ALPHA_R, %mm0 movd OLD_ALPHA_I, %mm1 movd %mm0, 0 + ALPHA movd %mm1, 4 + ALPHA #if defined(NN) || defined(NT) || defined(TN) || defined(TT) movl $0x3f800000, 0 + GAMMA_R movl $0x3f800000, 4 + GAMMA_R movl $0xbf800000, 0 + GAMMA_I movl $0x3f800000, 4 + GAMMA_I #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) movl $0x3f800000, 0 + GAMMA_R movl $0x3f800000, 4 + GAMMA_R movl $0x3f800000, 0 + GAMMA_I movl $0xbf800000, 4 + GAMMA_I #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) movl $0x3f800000, 0 + GAMMA_R movl $0xbF800000, 4 + GAMMA_R movl $0x3f800000, 0 + GAMMA_I movl $0x3F800000, 4 + GAMMA_I #else movl $0x3f800000, 0 + GAMMA_R movl $0xbf800000, 4 + GAMMA_R movl $0xbf800000, 0 + GAMMA_I movl $0xbf800000, 4 + GAMMA_I #endif movl %ebx, C movl OLD_LDC, %ebp leal (, %ebp, SIZE * 2), %ebp #ifdef TRMMKERNEL movl OLD_OFFSET, %eax movl %eax, OFFSET #ifndef LEFT negl %eax movl %eax, KK #endif #endif movl N, %eax sarl $1, %eax movl %eax, J # j = n jle .L20 ALIGN_4 .L01: /* Copying to Sub Buffer */ leal BUFFER, BB #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sarl $2, %eax jle .L03 ALIGN_4 .L02: movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 movd 2 * SIZE(%edi), %mm2 movd 3 * SIZE(%edi), %mm3 movd 4 * SIZE(%edi), %mm4 movd 5 * SIZE(%edi), %mm5 movd 6 * SIZE(%edi), %mm6 movd 7 * SIZE(%edi), %mm7 prefetchnta 72 * SIZE(%edi) punpckldq %mm0, %mm0 punpckldq %mm1, %mm1 punpckldq %mm2, %mm2 punpckldq %mm3, %mm3 punpckldq %mm4, %mm4 punpckldq %mm5, %mm5 punpckldq %mm6, %mm6 punpckldq %mm7, %mm7 movq %mm0, 0 * SIZE(BB) movq %mm1, 2 * SIZE(BB) movq %mm2, 4 * SIZE(BB) movq %mm3, 6 * SIZE(BB) movq %mm4, 8 * SIZE(BB) movq %mm5, 10 * SIZE(BB) movq %mm6, 12 * SIZE(BB) movq %mm7, 14 * SIZE(BB) movd 8 * SIZE(%edi), %mm0 movd 9 * SIZE(%edi), %mm1 movd 10 * SIZE(%edi), %mm2 movd 11 * SIZE(%edi), %mm3 movd 12 * SIZE(%edi), %mm4 movd 13 * SIZE(%edi), %mm5 movd 14 * SIZE(%edi), %mm6 movd 15 * SIZE(%edi), %mm7 punpckldq %mm0, %mm0 
punpckldq %mm1, %mm1 punpckldq %mm2, %mm2 punpckldq %mm3, %mm3 punpckldq %mm4, %mm4 punpckldq %mm5, %mm5 punpckldq %mm6, %mm6 punpckldq %mm7, %mm7 movq %mm0, 16 * SIZE(BB) movq %mm1, 18 * SIZE(BB) movq %mm2, 20 * SIZE(BB) movq %mm3, 22 * SIZE(BB) movq %mm4, 24 * SIZE(BB) movq %mm5, 26 * SIZE(BB) movq %mm6, 28 * SIZE(BB) movq %mm7, 30 * SIZE(BB) addl $16 * SIZE, %edi addl $32 * SIZE, BB decl %eax jne .L02 ALIGN_4 .L03: movl K, %eax andl $3, %eax BRANCH jle .L10 ALIGN_4 .L04: movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 movd 2 * SIZE(%edi), %mm2 movd 3 * SIZE(%edi), %mm3 punpckldq %mm0, %mm0 punpckldq %mm1, %mm1 punpckldq %mm2, %mm2 punpckldq %mm3, %mm3 movq %mm0, 0 * SIZE(BB) movq %mm1, 2 * SIZE(BB) movq %mm2, 4 * SIZE(BB) movq %mm3, 6 * SIZE(BB) addl $4 * SIZE, %edi addl $8 * SIZE, BB decl %eax jne .L04 ALIGN_4 .L10: movl C, %esi # coffset = c movl A, AA # aoffset = a movl M, %ebx ALIGN_4 .L11: leal - BOFFSET * SIZE + BUFFER, BB #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB #endif movq ( 0 + AOFFSET) * SIZE(AA), %mm0 pxor %mm4, %mm4 movq ( 16 + AOFFSET) * SIZE(AA), %mm1 pxor %mm5, %mm5 PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 pxor %mm6, %mm6 PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3 pxor %mm7, %mm7 prefetchw 2 * SIZE(%esi) prefetchw 2 * SIZE(%esi, %ebp) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $4, %eax je .L15 ALIGN_4 .L12: pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movq ( 2 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm5 PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm6 PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 2 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movq ( 10 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm5 PADDING movq ( 12 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm6 PADDING movq ( 32 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 4 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm4 PADDING movq ( 18 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm5 PADDING movq ( 20 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm6 PADDING movq ( 24 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 6 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm4 PADDING movq ( 26 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm5 PADDING movq ( 28 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm6 PADDING movq ( 48 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 8 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movq ( 34 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm5 PADDING movq ( 36 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm6 PADDING movq ( 40 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 38 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 10 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movq ( 42 + BOFFSET) * 
SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm5 PADDING movq ( 44 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm6 PADDING movq ( 64 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 46 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 12 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm4 PADDING movq ( 50 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm5 PADDING movq ( 52 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm6 PADDING movq ( 56 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 54 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 14 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm4 PADDING movq ( 58 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm5 PADDING movq ( 60 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm0, %mm3 pfadd %mm3, %mm6 PADDING movq ( 80 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 62 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 32 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm1, %mm2 pfadd %mm2, %mm4 PADDING movq ( 66 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm5 PADDING movq ( 68 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm6 PADDING movq ( 72 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 70 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 18 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm2 pfadd %mm2, %mm4 PADDING movq ( 74 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm5 PADDING movq ( 76 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm6 PADDING movq ( 96 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 78 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 20 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm4 PADDING movq ( 82 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm5 PADDING movq ( 84 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm6 PADDING movq ( 88 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 86 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 22 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm4 PADDING movq ( 90 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm5 PADDING movq ( 92 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm6 PADDING movq (112 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 94 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 24 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm2 pfadd %mm2, %mm4 PADDING movq ( 98 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm5 PADDING movq (100 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm6 PADDING movq (104 + BOFFSET) * SIZE(BB), %mm2 pfmul (102 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 26 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm2 pfadd %mm2, %mm4 PADDING movq (106 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm5 PADDING movq (108 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm1, %mm2 pfadd %mm2, %mm6 PADDING movq (128 + BOFFSET) * SIZE(BB), %mm2 pfmul (110 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 28 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm4 PADDING movq (114 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm5 PADDING movq (116 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm6 PADDING movq (120 + BOFFSET) * SIZE(BB), %mm3 pfmul (118 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 30 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm4 PADDING movq (122 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm5 PADDING movq (124 + BOFFSET) * SIZE(BB), %mm3 pfmul %mm1, %mm3 pfadd %mm3, %mm6 PADDING movq (144 + BOFFSET) * SIZE(BB), %mm3 pfmul (126 + BOFFSET) * SIZE(BB), %mm1 
pfadd %mm1, %mm7 movq ( 48 + AOFFSET) * SIZE(AA), %mm1 subl $-32 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L12 ALIGN_3 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $15, %eax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movq ( 2 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm5 PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 pfmul %mm0, %mm2 pfadd %mm2, %mm6 PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 2 + AOFFSET) * SIZE(AA), %mm0 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: movq GAMMA_R, %mm0 movq GAMMA_I, %mm1 movq ALPHA, %mm2 pswapd %mm5, %mm5 pswapd %mm7, %mm7 pfmul %mm0, %mm4 pfmul %mm1, %mm5 pfmul %mm0, %mm6 pfmul %mm1, %mm7 pfadd %mm5, %mm4 pfadd %mm7, %mm6 pswapd %mm4, %mm5 pswapd %mm6, %mm7 pfmul %mm2, %mm4 pfmul %mm2, %mm6 pfmul %mm2, %mm5 pfmul %mm2, %mm7 pfpnacc %mm5, %mm4 pfpnacc %mm7, %mm6 #ifndef TRMMKERNEL pfadd (%esi), %mm4 pfadd (%esi, %ebp), %mm6 #endif movq %mm4, (%esi) movq %mm6, (%esi, %ebp) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif addl $2 * SIZE, %esi decl %ebx jg .L11 ALIGN_4 .L19: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, %ebp, 2), %eax addl %eax, C # c += ldc decl J # j -- jg .L01 ALIGN_4 .L20: movl N, %eax andl $1, %eax jle .L999 ALIGN_4 .L21: /* Copying to Sub Buffer */ movl K, %eax leal BUFFER, BB sarl $2, %eax jle .L25 ALIGN_4 .L22: movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 movd 2 * SIZE(%edi), %mm2 movd 3 * SIZE(%edi), %mm3 movd 4 * SIZE(%edi), %mm4 movd 5 * SIZE(%edi), %mm5 movd 6 * SIZE(%edi), %mm6 movd 7 * SIZE(%edi), %mm7 prefetchnta 72 * SIZE(%edi) punpckldq %mm0, %mm0 punpckldq %mm1, %mm1 punpckldq %mm2, %mm2 punpckldq %mm3, %mm3 punpckldq %mm4, %mm4 punpckldq %mm5, %mm5 punpckldq %mm6, %mm6 punpckldq %mm7, %mm7 movq %mm0, 0 * SIZE(BB) movq %mm1, 2 * SIZE(BB) movq %mm2, 4 * SIZE(BB) movq %mm3, 6 * SIZE(BB) movq %mm4, 8 * SIZE(BB) movq %mm5, 10 * SIZE(BB) movq %mm6, 12 * SIZE(BB) movq %mm7, 14 * SIZE(BB) addl $ 8 * SIZE, %edi addl $16 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: movl K, %eax andl $3, %eax BRANCH jle .L30 ALIGN_4 .L26: movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 movd %mm0, 0 * SIZE(BB) movd %mm0, 1 * SIZE(BB) movd %mm1, 2 * SIZE(BB) movd %mm1, 3 * SIZE(BB) addl $2 * SIZE, %edi addl $4 * SIZE, BB decl %eax jne .L26 ALIGN_4 .L30: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, %esi # coffset = c movl A, AA # aoffset = a movl M, %ebx ALIGN_3 .L31: leal - BOFFSET * SIZE + BUFFER, BB #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif movq ( 0 + AOFFSET) * SIZE(AA), %mm0 pxor %mm4, %mm4 movq ( 16 + AOFFSET) * SIZE(AA), %mm1 pxor %mm5, %mm5 PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 pxor %mm6, %mm6 PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3 pxor %mm7, %mm7 prefetchw 2 * SIZE(%esi) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, 
%eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $4, %eax je .L35 ALIGN_4 .L32: pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm5 movq ( 2 + AOFFSET) * SIZE(AA), %mm0 PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) pfmul %mm0, %mm2 pfadd %mm2, %mm6 PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 4 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movq ( 12 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 10 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm5 movq ( 6 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm2 pfadd %mm2, %mm6 PADDING movq ( 32 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 8 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm4 PADDING movq ( 20 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 18 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm5 movq ( 10 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm6 PADDING movq ( 24 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 12 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm4 PADDING movq ( 28 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 26 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm5 movq ( 14 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm0, %mm3 pfadd %mm3, %mm6 PADDING movq ( 48 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm7 movq ( 32 + AOFFSET) * SIZE(AA), %mm0 pfmul %mm1, %mm2 pfadd %mm2, %mm4 PADDING movq ( 36 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 34 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm5 movq ( 18 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm2 pfadd %mm2, %mm6 PADDING movq ( 40 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 38 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 20 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm2 pfadd %mm2, %mm4 PADDING movq ( 44 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 42 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm5 movq ( 22 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm2 pfadd %mm2, %mm6 PADDING movq ( 64 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 46 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 24 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm4 PADDING movq ( 52 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 50 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm5 movq ( 26 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm6 PADDING movq ( 56 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 54 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 28 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm4 PADDING movq ( 60 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 58 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm5 movq ( 30 + AOFFSET) * SIZE(AA), %mm1 pfmul %mm1, %mm3 pfadd %mm3, %mm6 PADDING movq ( 80 + BOFFSET) * SIZE(BB), %mm3 pfmul ( 62 + BOFFSET) * SIZE(BB), %mm1 pfadd %mm1, %mm7 movq ( 48 + AOFFSET) * SIZE(AA), %mm1 subl $-32 * SIZE, AA addl $ 64 * SIZE, BB decl %eax jne .L32 ALIGN_3 .L35: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $15, %eax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: pfmul %mm0, %mm2 pfadd %mm2, %mm4 PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 pfadd %mm0, %mm5 movq ( 2 + AOFFSET) * SIZE(AA), %mm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: pfadd %mm6, %mm4 pfadd %mm7, %mm5 movq ALPHA, %mm2 pswapd %mm5, %mm5 pfmul GAMMA_R, %mm4 pfmul GAMMA_I, %mm5 pfadd %mm5, %mm4 pswapd %mm4, %mm5 pfmul %mm2, %mm4 pfmul %mm2, %mm5 
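/* pfpnacc packs the finished value: the low half of %mm4 becomes
   mm4.lo - mm4.hi and the high half becomes mm5.lo + mm5.hi (3DNow!
   mixed negate/accumulate), combining the two ALPHA-scaled partial
   sums into the real/imaginary pair stored to C just below. */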
pfpnacc %mm5, %mm4 #ifndef TRMMKERNEL pfadd 0 * SIZE(%esi), %mm4 #endif movq %mm4, 0 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif addl $2 * SIZE, %esi # coffset += 4 decl %ebx # i -- jg .L31 ALIGN_4 .L999: EMMS movl OLD_STACK, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm_kernel_1x2_barcelona.S000066400000000000000000000371141313527062700223620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 24 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define OLD_B 36 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define OLD_LDC 44 + STACK + ARGS(%esp) #define OFFSET 48 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #define B %edi #define LDC %ebp #define AO %edx #define BO %ecx #define CO %esi #define I %ebx #define movsd movlps #define movapd movups #define movlpd movlps #define movhpd movhps #define PREFETCH prefetch #define PREFETCHSIZE (8 * 7 + 0) #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADD1 addpd #define ADD2 addpd #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADD1 addpd #define ADD2 subpd #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define ADD1 subpd #define ADD2 addpd #else #define ADD1 subpd #define ADD2 subpd #endif #define KERNEL1(address) \ mulpd %xmm0, %xmm1; \ PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %eax, 2); \ mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \ ADD1 %xmm1, %xmm4; \ movapd -12 * SIZE(BO, %eax, 4), %xmm1; \ ADD1 %xmm0, %xmm6; \ movddup -15 * SIZE(AO, %eax, 2), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \ ADD2 %xmm0, %xmm7; \ movddup -14 * SIZE(AO, %eax, 2), %xmm0 #define KERNEL2(address) \ ADD2 %xmm2, %xmm5; \ movapd %xmm1, %xmm2; \ mulpd %xmm0, %xmm1; \ mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \ ADD1 %xmm1, %xmm4; \ movapd -8 * SIZE(BO, %eax, 4), %xmm1; \ ADD1 %xmm0, %xmm6; \ movddup -13 * SIZE(AO, %eax, 2), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \ ADD2 %xmm0, %xmm7; \ movddup -12 * SIZE(AO, %eax, 2), %xmm0 #define KERNEL3(address) \ ADD2 %xmm2, %xmm5; \ movapd %xmm1, %xmm2; \ mulpd %xmm0, %xmm1; \ mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \ ADD1 %xmm1, %xmm4; \ movapd -4 * SIZE(BO, %eax, 4), %xmm1; \ ADD1 %xmm0, %xmm6; \ movddup -11 * SIZE(AO, %eax, 2), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \ ADD2 %xmm0, %xmm7; \ movddup -10 * SIZE(AO, %eax, 2), %xmm0 #define KERNEL4(address) \ ADD2 %xmm2, %xmm5; \ movapd %xmm1, %xmm2; \ mulpd %xmm0, %xmm1; \ mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \ ADD1 %xmm1, %xmm4; \ movapd (BO, %eax, 4), %xmm1; \ ADD1 %xmm0, %xmm6; \ movddup -9 * SIZE(AO, %eax, 2), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \ ADD2 %xmm0, %xmm7; \ movddup (AO, %eax, 2), %xmm0 #define KERNEL5(address) \ ADD2 %xmm2, %xmm5; \ movapd %xmm1, %xmm2; \ mulpd %xmm3, %xmm1; \ mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \ ADD1 %xmm1, %xmm4; \ movapd 4 * SIZE(BO, %eax, 4), %xmm1; \ ADD1 %xmm3, %xmm6; \ movddup -7 * SIZE(AO, %eax, 2), %xmm3; \ mulpd %xmm3, %xmm2; \ mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \ ADD2 %xmm3, %xmm7; \ movddup -6 * SIZE(AO, %eax, 2), %xmm3 #define KERNEL6(address) \ ADD2 %xmm2, %xmm5; \ movapd %xmm1, %xmm2; \ mulpd %xmm3, %xmm1; \ mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \ ADD1 %xmm1, %xmm4; \ movapd 8 * SIZE(BO, %eax, 4), %xmm1; \ ADD1 %xmm3, %xmm6; \ movddup -5 * SIZE(AO, %eax, 2), %xmm3; \ mulpd %xmm3, %xmm2; \ mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \ ADD2 %xmm3, %xmm7; \ movddup -4 * SIZE(AO, %eax, 2), %xmm3 #define KERNEL7(address) \ ADD2 %xmm2, %xmm5; \ movapd %xmm1, %xmm2; \ mulpd %xmm3, %xmm1; \ 
mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \ ADD1 %xmm1, %xmm4; \ movapd 12 * SIZE(BO, %eax, 4), %xmm1; \ ADD1 %xmm3, %xmm6; \ movddup -3 * SIZE(AO, %eax, 2), %xmm3; \ mulpd %xmm3, %xmm2; \ mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \ ADD2 %xmm3, %xmm7; \ movddup -2 * SIZE(AO, %eax, 2), %xmm3 #define KERNEL8(address) \ ADD2 %xmm2, %xmm5; \ movapd %xmm1, %xmm2; \ mulpd %xmm3, %xmm1; \ mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \ ADD1 %xmm1, %xmm4; \ movapd 16 * SIZE(BO, %eax, 4), %xmm1; \ ADD1 %xmm3, %xmm6; \ movddup -1 * SIZE(AO, %eax, 2), %xmm3; \ mulpd %xmm3, %xmm2; \ mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \ ADD2 %xmm3, %xmm7; \ movddup 8 * SIZE(AO, %eax, 2), %xmm3; \ ADD2 %xmm2, %xmm5; \ movapd %xmm1, %xmm2 PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl OLD_B, B movl OLD_LDC, LDC #ifdef TRMMKERNEL movl OFFSET, %eax #ifndef LEFT negl %eax #endif movl %eax, KK #endif subl $-16 * SIZE, A subl $-16 * SIZE, B sall $ZBASE_SHIFT, LDC movl N, %eax sarl $1, %eax movl %eax, J # j = n jle .L100 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif leal GEMM_DEFAULT_Q * GEMM_DEFAULT_UNROLL_N * SIZE(B), %eax movl %eax, BX movl C, CO movl A, AO movl M, I testl I, I jle .L100 ALIGN_4 .L10: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (B, %eax, 4), BO #endif movl BX, %eax prefetcht2 0 * SIZE(%eax) subl $-8 * SIZE, BX movddup -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm1 pxor %xmm4, %xmm4 movddup -8 * SIZE(AO), %xmm3 pxor %xmm5, %xmm5 prefetchw 1 * SIZE(CO) pxor %xmm6, %xmm6 prefetchw 1 * SIZE(CO, LDC) pxor %xmm7, %xmm7 movapd %xmm1, %xmm2 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif andl $-8, %eax leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (BO, %eax, 4), BO negl %eax NOBRANCH je .L15 ALIGN_3 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $8 * SIZE, %eax BRANCH jl .L12 ALIGN_3 .L15: #ifndef TRMMKERNEL movl K, 
%eax #else movl KKK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (BO, %eax, 4), BO negl %eax ALIGN_4 .L16: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BO, %eax, 4), %xmm0 ADD1 %xmm1, %xmm4 movapd -12 * SIZE(BO, %eax, 4), %xmm1 ADD1 %xmm0, %xmm6 movddup -15 * SIZE(AO, %eax, 2), %xmm0 mulpd %xmm0, %xmm2 mulpd -14 * SIZE(BO, %eax, 4), %xmm0 ADD2 %xmm0, %xmm7 movddup -14 * SIZE(AO, %eax, 2), %xmm0 ADD2 %xmm2, %xmm5 movapd %xmm1, %xmm2 addl $SIZE, %eax jl .L16 ALIGN_4 .L14: #ifndef TRMMKERNEL movupd 0 * SIZE(CO), %xmm0 movupd 0 * SIZE(CO, LDC), %xmm1 #endif movddup ALPHA_R, %xmm2 movddup ALPHA_I, %xmm3 SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) addsubpd %xmm5, %xmm4 addsubpd %xmm7, %xmm6 pshufd $0x4e, %xmm4, %xmm5 pshufd $0x4e, %xmm6, %xmm7 #else addsubpd %xmm4, %xmm5 addsubpd %xmm6, %xmm7 movapd %xmm5, %xmm4 pshufd $0x4e, %xmm5, %xmm5 movapd %xmm7, %xmm6 pshufd $0x4e, %xmm7, %xmm7 #endif mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm5 mulpd %xmm2, %xmm6 mulpd %xmm3, %xmm7 addsubpd %xmm5, %xmm4 addsubpd %xmm7, %xmm6 #ifndef TRMMKERNEL addpd %xmm0, %xmm4 addpd %xmm1, %xmm6 #endif movlpd %xmm4, 0 * SIZE(CO) movhpd %xmm4, 1 * SIZE(CO) movlpd %xmm6, 0 * SIZE(CO, LDC) movhpd %xmm6, 1 * SIZE(CO, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AO, %eax, 2), AO leal (BO, %eax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif addl $2 * SIZE, CO # coffset += 4 decl I # i -- jg .L10 ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif movl BO, B leal (, LDC, 2), %eax addl %eax, C # c += ldc decl J # j -- jg .L01 ALIGN_4 .L100: movl N, %eax andl $1, %eax jle .L500 ALIGN_4 .L101: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, CO movl A, AO movl M, I testl %ebx, I jle .L500 ALIGN_4 .L110: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BO #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (B, %eax, 2), BO #endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm4, %xmm4 movddup -15 * SIZE(AO), %xmm1 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 prefetchw 1 * SIZE(CO) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L112 ALIGN_4 .L111: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd -16 * SIZE(BO), %xmm0 ADD1 %xmm0, %xmm4 movddup -14 * SIZE(AO), %xmm0 mulpd -16 * SIZE(BO), %xmm1 ADD2 %xmm1, %xmm5 movddup -13 * SIZE(AO), %xmm1 mulpd -14 * SIZE(BO), %xmm0 ADD1 %xmm0, %xmm6 movddup -12 * SIZE(AO), %xmm0 mulpd -14 * SIZE(BO), %xmm1 ADD2 %xmm1, %xmm7 movddup -11 * SIZE(AO), %xmm1 mulpd -12 * SIZE(BO), %xmm0 ADD1 %xmm0, %xmm4 movddup -10 * SIZE(AO), %xmm0 mulpd -12 * SIZE(BO), %xmm1 ADD2 %xmm1, %xmm5 movddup -9 * SIZE(AO), %xmm1 mulpd -10 * SIZE(BO), %xmm0 ADD1 %xmm0, %xmm6 movddup -8 * SIZE(AO), %xmm0 mulpd -10 * SIZE(BO), %xmm1 ADD2 %xmm1, %xmm7 movddup -7 * SIZE(AO), %xmm1 mulpd -8 * SIZE(BO), %xmm0 ADD1 %xmm0, %xmm4 movddup -6 * SIZE(AO), %xmm0 mulpd -8 * SIZE(BO), %xmm1 ADD2 
%xmm1, %xmm5 movddup -5 * SIZE(AO), %xmm1 mulpd -6 * SIZE(BO), %xmm0 ADD1 %xmm0, %xmm6 movddup -4 * SIZE(AO), %xmm0 mulpd -6 * SIZE(BO), %xmm1 ADD2 %xmm1, %xmm7 movddup -3 * SIZE(AO), %xmm1 mulpd -4 * SIZE(BO), %xmm0 ADD1 %xmm0, %xmm4 movddup -2 * SIZE(AO), %xmm0 mulpd -4 * SIZE(BO), %xmm1 ADD2 %xmm1, %xmm5 movddup -1 * SIZE(AO), %xmm1 mulpd -2 * SIZE(BO), %xmm0 ADD1 %xmm0, %xmm6 movddup 0 * SIZE(AO), %xmm0 mulpd -2 * SIZE(BO), %xmm1 ADD2 %xmm1, %xmm7 movddup 1 * SIZE(AO), %xmm1 subl $-16 * SIZE, AO subl $-16 * SIZE, BO decl %eax jne .L111 ALIGN_4 .L112: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L114 ALIGN_4 .L113: mulpd -16 * SIZE(BO), %xmm0 ADD1 %xmm0, %xmm4 movddup -14 * SIZE(AO), %xmm0 mulpd -16 * SIZE(BO), %xmm1 ADD2 %xmm1, %xmm5 movddup -13 * SIZE(AO), %xmm1 addl $2 * SIZE, AO addl $2 * SIZE, BO decl %eax jg .L113 ALIGN_4 .L114: #ifndef TRMMKERNEL movupd 0 * SIZE(CO), %xmm0 #endif movddup ALPHA_R, %xmm2 movddup ALPHA_I, %xmm3 addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 SHUFPD_1 %xmm5, %xmm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) addsubpd %xmm5, %xmm4 pshufd $0x4e, %xmm4, %xmm5 #else addsubpd %xmm4, %xmm5 movapd %xmm5, %xmm4 pshufd $0x4e, %xmm5, %xmm5 #endif mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm5 addsubpd %xmm5, %xmm4 #ifndef TRMMKERNEL addpd %xmm0, %xmm4 #endif movlpd %xmm4, 0 * SIZE(CO) movhpd %xmm4, 1 * SIZE(CO) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AO, %eax, 2), AO leal (BO, %eax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif addl $2 * SIZE, CO # coffset += 4 decl I # i -- jg .L110 ALIGN_4 .L500: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm_kernel_1x2_penryn.S000066400000000000000000000346051313527062700217510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 24 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define ARG_B 36 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define ARG_LDC 44 + STACK + ARGS(%esp) #define OFFSET 48 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #ifdef NANO #define PREFETCHSIZE (8 * 3 + 4) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 #endif #if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCHSIZE (8 * 1 - 4) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 #endif #ifndef PREFETCH #define PREFETCH prefetcht0 #endif #ifndef PREFETCHW #define PREFETCHW prefetcht0 #endif #ifndef PREFETCHB #define PREFETCHB prefetcht0 #endif #ifndef PREFETCHSIZE #define PREFETCHSIZE (8 * 13 + 4) #endif #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define C1 %esi #define I %ebx #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADD1 addpd #define ADD2 addpd #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define ADD1 addpd #define ADD2 addpd #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADD1 addpd #define ADD2 addpd #else #define ADD1 addpd #define ADD2 subpd #endif PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC #ifdef TRMMKERNEL movl OFFSET, %eax #ifndef LEFT negl %eax #endif movl %eax, KK #endif movl M, %ebx testl %ebx, %ebx jle .L999 subl $-16 * SIZE, A subl $-16 * SIZE, B sall $ZBASE_SHIFT, LDC movl N, %eax sarl $1, %eax movl %eax, J jle .L20 ALIGN_2 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl B, BX movl C, C1 # coffset = c movl A, AA # aoffset = a movl M, %ebx ALIGN_4 .L10: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif movl BX, %eax PREFETCHB -16 * SIZE(%eax) subl $-8 * SIZE, %eax movl %eax, BX movaps -16 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -16 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 xorps %xmm4, %xmm4 PREFETCHW 1 * SIZE(C1) xorps %xmm5, %xmm5 PREFETCHW 3 * SIZE(C1, LDC) xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) ADD1 %xmm3, %xmm6 movaps -14 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 ADD1 %xmm3, %xmm6 movaps -10 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps -8 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 ADD1 %xmm3, %xmm6 movaps -6 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps -4 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 ADD1 %xmm3, %xmm6 movaps -2 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps 0 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 ADD1 %xmm3, %xmm6 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) movaps 2 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps 4 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 ADD1 %xmm3, %xmm6 movaps 6 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps 8 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 ADD1 %xmm3, %xmm6 movaps 10 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps 12 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 ADD1 %xmm3, %xmm6 movaps 14 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps 16 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 subl $-32 * SIZE, BB pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 subl $-16 * SIZE, AA subl $1, %eax jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L18 ALIGN_4 .L16: ADD1 %xmm3, %xmm6 movaps -14 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: ADD1 %xmm3, %xmm6 pcmpeqb %xmm0, %xmm0 ADD2 %xmm2, %xmm7 psllq $63, %xmm0 movddup ALPHA_R, %xmm2 movddup ALPHA_I, %xmm3 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) shufps $0x40, %xmm0, %xmm0 pxor %xmm0, %xmm4 pxor %xmm0, %xmm6 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0x04, %xmm0, %xmm0 
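/* %xmm0 was turned into a sign mask above (pcmpeqb + psllq $63), and the
   shufps just issued leaves the sign bit in a single 64-bit lane; the
   pxor below flips that lane of %xmm5/%xmm7 so the accumulated products
   receive the conjugation this kernel variant requires. */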
pxor %xmm0, %xmm5 pxor %xmm0, %xmm7 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) shufps $0x40, %xmm0, %xmm0 pxor %xmm0, %xmm5 pxor %xmm0, %xmm7 #endif #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhpd 1 * SIZE(C1), %xmm0 movsd 0 * SIZE(C1, LDC), %xmm1 movhpd 1 * SIZE(C1, LDC), %xmm1 #endif haddpd %xmm5, %xmm4 haddpd %xmm7, %xmm6 pshufd $0x4e, %xmm4, %xmm5 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm2, %xmm4 mulpd %xmm2, %xmm6 mulpd %xmm3, %xmm5 mulpd %xmm3, %xmm7 addsubpd %xmm5, %xmm4 addsubpd %xmm7, %xmm6 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm0, %xmm4 addpd %xmm1, %xmm6 #endif movsd %xmm4, 0 * SIZE(C1) movhpd %xmm4, 1 * SIZE(C1) movsd %xmm6, 0 * SIZE(C1, LDC) movhpd %xmm6, 1 * SIZE(C1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif addl $2 * SIZE, C1 # coffset += 4 decl %ebx # i -- jg .L10 #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif movl BB, B leal (, LDC, 2), %eax addl %eax, C # c += ldc decl J # j -- jg .L01 ALIGN_4 .L20: movl N, %eax testl $1, %eax jle .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, C1 # coffset = c movl A, AA # aoffset = a movl M, %ebx ALIGN_4 .L21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -16 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 prefetcht0 1 * SIZE(C1) pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm4 movaps -14 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm6 movaps -12 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm4 movaps -10 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm6 movaps -8 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm7 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm4 movaps -6 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm6 movaps -4 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm4 movaps -2 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm6 movaps 0 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm7 subl $-16 * SIZE, AA subl $-16 * 
SIZE, BB subl $1, %eax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L28 ALIGN_4 .L26: pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm4 movaps -14 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: addpd %xmm6, %xmm4 pcmpeqb %xmm0, %xmm0 addpd %xmm7, %xmm5 psllq $63, %xmm0 movddup ALPHA_R, %xmm2 movddup ALPHA_I, %xmm3 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) shufps $0x40, %xmm0, %xmm0 pxor %xmm0, %xmm4 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0x04, %xmm0, %xmm0 pxor %xmm0, %xmm5 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) shufps $0x40, %xmm0, %xmm0 pxor %xmm0, %xmm5 #endif #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhpd 1 * SIZE(C1), %xmm0 #endif haddpd %xmm5, %xmm4 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm5 addsubpd %xmm5, %xmm4 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(C1) movhpd %xmm4, 1 * SIZE(C1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif addl $2 * SIZE, C1 decl %ebx # i -- jg .L21 ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm_kernel_1x2_sse2.S000066400000000000000000000473321313527062700213130ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA_R 16 + STACK + ARGS(%esi) #define STACK_ALPHA_I 24 + STACK + ARGS(%esi) #define STACK_A 32 + STACK + ARGS(%esi) #define STACK_B 36 + STACK + ARGS(%esi) #define STACK_C 40 + STACK + ARGS(%esi) #define STACK_LDC 44 + STACK + ARGS(%esi) #define STACK_OFFT 48 + STACK + ARGS(%esi) #define POSINV 0(%esp) #define ALPHA_R 16(%esp) #define ALPHA_I 32(%esp) #define K 48(%esp) #define N 52(%esp) #define M 56(%esp) #define A 60(%esp) #define C 64(%esp) #define J 68(%esp) #define OLD_STACK 72(%esp) #define OFFSET 76(%esp) #define KK 80(%esp) #define KKK 84(%esp) #define BUFFER 128(%esp) #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #if defined(OPTERON) || defined(BARCELONA) #define PREFETCH prefetch #endif #define PREFETCHSIZE (8 * 10 + 4) #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define KERNEL1(address) \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm4; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm3, %xmm6; \ movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm0, %xmm7; \ movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm3, %xmm6; \ movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm0, %xmm7; \ movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; 
\ addpd %xmm2, %xmm6; \ movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm1, %xmm7; \ movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm2, %xmm6; \ movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm1, %xmm7; \ movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp # align stack addl $STACK_OFFSET, %esp STACK_TOUCHING movl STACK_M, %ebx movl STACK_N, %eax movl STACK_K, %ecx movl STACK_A, %edx movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movl STACK_B, B movl STACK_C, %ebx #ifdef TRMMKERNEL movss STACK_OFFT, %xmm4 #endif movlpd STACK_ALPHA_R, %xmm0 movlpd STACK_ALPHA_I, %xmm1 pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 # Generate mask pxor %xmm2, %xmm2 movlpd %xmm0, 0 + ALPHA_R movlpd %xmm0, 8 + ALPHA_R movlpd %xmm1, 8 + ALPHA_I xorpd %xmm7, %xmm1 movlpd %xmm1, 0 + ALPHA_I movlpd %xmm2, 0 + POSINV movlpd %xmm7, 8 + POSINV movl %ebx, C movl STACK_LDC, LDC #ifdef TRMMKERNEL movss %xmm4, OFFSET movss %xmm4, KK #ifndef LEFT negl KK #endif #endif sall $ZBASE_SHIFT, LDC sarl $1, %eax movl %eax, J # j = n jle .L100 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif leal BUFFER, %ecx movapd POSINV, %xmm7 movl K, %eax sarl $1, %eax jle .L03 ALIGN_4 .L02: prefetchnta 56 * SIZE(B) movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd 2 * SIZE(B), %xmm2 movlpd 3 * SIZE(B), %xmm3 movlpd 4 * SIZE(B), %xmm4 movlpd 5 * SIZE(B), %xmm5 movlpd 6 * SIZE(B), %xmm6 movlpd 7 * SIZE(B), %xmm7 movlpd %xmm0, 0 * SIZE(BB) movlpd %xmm0, 1 * SIZE(BB) movlpd %xmm1, 2 * SIZE(BB) movlpd %xmm1, 3 * SIZE(BB) movlpd %xmm2, 4 * SIZE(BB) movlpd %xmm2, 5 * SIZE(BB) movlpd %xmm3, 6 * SIZE(BB) movlpd %xmm3, 7 * SIZE(BB) movlpd %xmm4, 8 * SIZE(BB) movlpd %xmm4, 9 * SIZE(BB) movlpd %xmm5, 10 * SIZE(BB) movlpd %xmm5, 11 * SIZE(BB) movlpd %xmm6, 12 * SIZE(BB) movlpd %xmm6, 13 * SIZE(BB) movlpd %xmm7, 14 * SIZE(BB) movlpd %xmm7, 15 * SIZE(BB) addl $ 8 * SIZE, B subl $-16 * SIZE, BB decl %eax jne .L02 ALIGN_4 .L03: movl K, %eax andl $1, %eax BRANCH jle .L05 movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd 2 * SIZE(B), %xmm2 movlpd 3 * SIZE(B), %xmm3 movlpd %xmm0, 0 * SIZE(BB) movlpd 
%xmm0, 1 * SIZE(BB) movlpd %xmm1, 2 * SIZE(BB) movlpd %xmm1, 3 * SIZE(BB) movlpd %xmm2, 4 * SIZE(BB) movlpd %xmm2, 5 * SIZE(BB) movlpd %xmm3, 6 * SIZE(BB) movlpd %xmm3, 7 * SIZE(BB) addl $4 * SIZE, B ALIGN_4 .L05: movl C, %esi # coffset = c movl A, AA # aoffset = a movl M, %ebx testl %ebx, %ebx jle .L100 ALIGN_4 .L10: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movapd 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movapd 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 prefetchw 2 * SIZE(%esi) prefetchw 2 * SIZE(%esi, LDC) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif #if 1 andl $-8, %eax sall $4, %eax je .L15 .L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) cmpl $128 * 1, %eax jle .L12 KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) cmpl $128 * 2, %eax jle .L12 KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) cmpl $128 * 3, %eax jle .L12 KERNEL1(16 * 3) KERNEL2(16 * 3) KERNEL3(16 * 3) KERNEL4(16 * 3) KERNEL5(16 * 3) KERNEL6(16 * 3) KERNEL7(16 * 3) KERNEL8(16 * 3) cmpl $128 * 4, %eax jle .L12 KERNEL1(16 * 4) KERNEL2(16 * 4) KERNEL3(16 * 4) KERNEL4(16 * 4) KERNEL5(16 * 4) KERNEL6(16 * 4) KERNEL7(16 * 4) KERNEL8(16 * 4) cmpl $128 * 5, %eax jle .L12 KERNEL1(16 * 5) KERNEL2(16 * 5) KERNEL3(16 * 5) KERNEL4(16 * 5) KERNEL5(16 * 5) KERNEL6(16 * 5) KERNEL7(16 * 5) KERNEL8(16 * 5) cmpl $128 * 6, %eax jle .L12 KERNEL1(16 * 6) KERNEL2(16 * 6) KERNEL3(16 * 6) KERNEL4(16 * 6) KERNEL5(16 * 6) KERNEL6(16 * 6) KERNEL7(16 * 6) KERNEL8(16 * 6) cmpl $128 * 7, %eax jle .L12 KERNEL1(16 * 7) KERNEL2(16 * 7) KERNEL3(16 * 7) KERNEL4(16 * 7) KERNEL5(16 * 7) KERNEL6(16 * 7) KERNEL7(16 * 7) KERNEL8(16 * 7) addl $128 * 4 * SIZE, BB addl $128 * 1 * SIZE, AA subl $128 * 8, %eax jg .L1X jmp .L15 .L12: leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB ALIGN_4 #else sarl $3, %eax je .L15 ALIGN_4 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $64 * SIZE, BB addl $16 * SIZE, AA decl %eax jne .L11 ALIGN_4 #endif .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 ALIGN_4 .L13: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movapd 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 8 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L13 ALIGN_4 .L14: movapd POSINV, %xmm1 movapd ALPHA_R, %xmm2 movapd ALPHA_I, %xmm3 SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm1, %xmm5 xorpd %xmm1, %xmm7 #else xorpd %xmm1, %xmm4 
xorpd %xmm1, %xmm6 #endif #ifndef TRMMKERNEL movlpd 0 * SIZE(%esi), %xmm0 movhpd 1 * SIZE(%esi), %xmm0 movlpd 0 * SIZE(%esi, LDC), %xmm1 movhpd 1 * SIZE(%esi, LDC), %xmm1 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 #else addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #endif pshufd $0x4e, %xmm4, %xmm5 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm5 mulpd %xmm2, %xmm6 mulpd %xmm3, %xmm7 addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #ifndef TRMMKERNEL addpd %xmm0, %xmm4 addpd %xmm1, %xmm6 #endif movlpd %xmm4, 0 * SIZE(%esi) movhpd %xmm4, 1 * SIZE(%esi) movlpd %xmm6, 0 * SIZE(%esi, LDC) movhpd %xmm6, 1 * SIZE(%esi, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif addl $2 * SIZE, %esi # coffset += 4 decl %ebx # i -- jg .L10 ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax addl %eax, C # c += ldc decl J # j -- jg .L01 ALIGN_4 .L100: movl N, %eax andl $1, %eax jle .L500 ALIGN_4 .L101: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif leal BUFFER, %ecx movapd POSINV, %xmm7 movl K, %eax sarl $2, %eax jle .L103 ALIGN_4 .L102: prefetchnta 56 * SIZE(B) movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd 2 * SIZE(B), %xmm2 movlpd 3 * SIZE(B), %xmm3 movlpd 4 * SIZE(B), %xmm4 movlpd 5 * SIZE(B), %xmm5 movlpd 6 * SIZE(B), %xmm6 movlpd 7 * SIZE(B), %xmm7 movlpd %xmm0, 0 * SIZE(BB) movlpd %xmm0, 1 * SIZE(BB) movlpd %xmm1, 2 * SIZE(BB) movlpd %xmm1, 3 * SIZE(BB) movlpd %xmm2, 4 * SIZE(BB) movlpd %xmm2, 5 * SIZE(BB) movlpd %xmm3, 6 * SIZE(BB) movlpd %xmm3, 7 * SIZE(BB) movlpd %xmm4, 8 * SIZE(BB) movlpd %xmm4, 9 * SIZE(BB) movlpd %xmm5, 10 * SIZE(BB) movlpd %xmm5, 11 * SIZE(BB) movlpd %xmm6, 12 * SIZE(BB) movlpd %xmm6, 13 * SIZE(BB) movlpd %xmm7, 14 * SIZE(BB) movlpd %xmm7, 15 * SIZE(BB) addl $ 8 * SIZE, B subl $-16 * SIZE, %ecx decl %eax jne .L102 ALIGN_4 .L103: movl K, %eax andl $3, %eax BRANCH jle .L105 ALIGN_4 .L104: movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd %xmm0, 0 * SIZE(BB) movlpd %xmm0, 1 * SIZE(BB) movlpd %xmm1, 2 * SIZE(BB) movlpd %xmm1, 3 * SIZE(BB) addl $2 * SIZE, B addl $4 * SIZE, %ecx decl %eax jne .L104 ALIGN_4 .L105: movl C, %esi # coffset = c movl A, AA # aoffset = a movl M, %ebx testl %ebx, %ebx jle .L500 ALIGN_4 .L110: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movapd 0 * SIZE(AA), %xmm0 movapd 8 * SIZE(AA), %xmm1 movapd 0 * SIZE(BB), %xmm2 movapd 8 * SIZE(BB), %xmm3 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L112 ALIGN_4 .L111: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), 
%xmm0 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 10 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd 18 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 20 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movapd 10 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 22 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 32 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movapd 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 26 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 28 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 14 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 30 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 40 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L111 ALIGN_4 .L112: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L114 ALIGN_4 .L113: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L113 ALIGN_4 .L114: movapd POSINV, %xmm1 movapd ALPHA_R, %xmm2 movapd ALPHA_I, %xmm3 addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 SHUFPD_1 %xmm5, %xmm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm1, %xmm5 #else xorpd %xmm1, %xmm4 #endif #ifndef TRMMKERNEL movlpd 0 * SIZE(%esi), %xmm0 movhpd 1 * SIZE(%esi), %xmm0 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm5, %xmm4 #else addpd %xmm5, %xmm4 #endif pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm5 addpd %xmm5, %xmm4 #ifndef TRMMKERNEL addpd %xmm0, %xmm4 #endif movlpd %xmm4, 0 * SIZE(%esi) movhpd %xmm4, 1 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif addl $2 * SIZE, %esi # coffset += 4 decl %ebx # i -- jg .L110 ALIGN_4 .L500: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm_kernel_1x2_sse3.S000066400000000000000000000455131313527062700213130ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 24 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define ARG_B 36 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define ARG_LDC 44 + STACK + ARGS(%esp) #define OFFSET 48 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #ifdef PENTIUM4 #define PREFETCH_R (8 * 4) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #ifdef PENTIUMM #define PREFETCH_R (8 * 4) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADDSUB addpd #else #define ADDSUB subpd #endif #define KERNEL1(address) \ mulpd %xmm0, %xmm2; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ addpd %xmm2, %xmm4; \ movddup 1 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ ADDSUB %xmm2, %xmm5; \ movddup 2 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm6; \ movddup 3 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ ADDSUB %xmm2, %xmm7; \ movddup 4 * SIZE + (address) * 2 * SIZE(BB), %xmm2 #define KERNEL2(address) \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm4; \ movddup 5 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ ADDSUB %xmm2, %xmm5; \ movddup 6 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm6; \ movddup 7 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ ADDSUB %xmm2, %xmm7; \ movddup 16 * SIZE + (address) * 2 * SIZE(BB), %xmm2 #define KERNEL3(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movddup 9 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ ADDSUB %xmm3, %xmm5; \ movddup 10 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm6; \ movddup 11 * SIZE + (address) * 2 * 
SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ ADDSUB %xmm3, %xmm7; \ movddup 12 * SIZE + (address) * 2 * SIZE(BB), %xmm3 #define KERNEL4(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movddup 13 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ ADDSUB %xmm3, %xmm5; \ movddup 14 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm6; \ movddup 15 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ ADDSUB %xmm3, %xmm7; \ movddup 24 * SIZE + (address) * 2 * SIZE(BB), %xmm3 #define KERNEL5(address) \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movddup 17 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ ADDSUB %xmm2, %xmm5; \ movddup 18 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm6; \ movddup 19 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ ADDSUB %xmm2, %xmm7; \ movddup 20 * SIZE + (address) * 2 * SIZE(BB), %xmm2 #define KERNEL6(address) \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movddup 21 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ ADDSUB %xmm2, %xmm5; \ movddup 22 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm6; \ movddup 23 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ ADDSUB %xmm2, %xmm7 #define KERNEL7(address) \ movddup 32 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movddup 25 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ ADDSUB %xmm3, %xmm5; \ movddup 26 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm6; \ movddup 27 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ ADDSUB %xmm3, %xmm7; \ movddup 28 * SIZE + (address) * 2 * SIZE(BB), %xmm3 #define KERNEL8(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movddup 29 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ ADDSUB %xmm3, %xmm5; \ movddup 30 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm6; \ movddup 31 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ ADDSUB %xmm3, %xmm7; \ movddup 40 * SIZE + (address) * 2 * SIZE(BB), %xmm3 PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC #ifdef TRMMKERNEL movl OFFSET, %eax #ifndef LEFT negl %eax #endif movl %eax, KK #endif sall $ZBASE_SHIFT, LDC movl N, %eax sarl $1, %eax movl %eax, J # j = n jle .L100 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl B, BX movl C, %esi # coffset = c movl A, AA # aoffset = a movl M, %ebx testl %ebx, %ebx jle .L100 ALIGN_4 .L10: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 4), BB #endif movl BX, %eax prefetcht2 (PREFETCH_R + 0) * SIZE(%eax) prefetcht2 (PREFETCH_R + 16) * SIZE(%eax) subl $-8 * SIZE, BX movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movddup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 
movddup 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifdef PENTIUM4 prefetchnta 3 * SIZE(%esi) prefetchnta 3 * SIZE(%esi, LDC) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif #ifdef CORE_PRESCOTT andl $-8, %eax sall $4, %eax je .L12 .L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) cmpl $128 * 1, %eax jle .L11 KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) cmpl $128 * 2, %eax jle .L11 KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) cmpl $128 * 3, %eax jle .L11 KERNEL1(16 * 3) KERNEL2(16 * 3) KERNEL3(16 * 3) KERNEL4(16 * 3) KERNEL5(16 * 3) KERNEL6(16 * 3) KERNEL7(16 * 3) KERNEL8(16 * 3) cmpl $128 * 4, %eax jle .L11 KERNEL1(16 * 4) KERNEL2(16 * 4) KERNEL3(16 * 4) KERNEL4(16 * 4) KERNEL5(16 * 4) KERNEL6(16 * 4) KERNEL7(16 * 4) KERNEL8(16 * 4) cmpl $128 * 5, %eax jle .L11 KERNEL1(16 * 5) KERNEL2(16 * 5) KERNEL3(16 * 5) KERNEL4(16 * 5) KERNEL5(16 * 5) KERNEL6(16 * 5) KERNEL7(16 * 5) KERNEL8(16 * 5) cmpl $128 * 6, %eax jle .L11 KERNEL1(16 * 6) KERNEL2(16 * 6) KERNEL3(16 * 6) KERNEL4(16 * 6) KERNEL5(16 * 6) KERNEL6(16 * 6) KERNEL7(16 * 6) KERNEL8(16 * 6) cmpl $128 * 7, %eax jle .L11 KERNEL1(16 * 7) KERNEL2(16 * 7) KERNEL3(16 * 7) KERNEL4(16 * 7) KERNEL5(16 * 7) KERNEL6(16 * 7) KERNEL7(16 * 7) KERNEL8(16 * 7) #if 1 cmpl $128 * 8, %eax jle .L11 KERNEL1(16 * 8) KERNEL2(16 * 8) KERNEL3(16 * 8) KERNEL4(16 * 8) KERNEL5(16 * 8) KERNEL6(16 * 8) KERNEL7(16 * 8) KERNEL8(16 * 8) cmpl $128 * 9, %eax jle .L11 KERNEL1(16 * 9) KERNEL2(16 * 9) KERNEL3(16 * 9) KERNEL4(16 * 9) KERNEL5(16 * 9) KERNEL6(16 * 9) KERNEL7(16 * 9) KERNEL8(16 * 9) cmpl $128 * 10, %eax jle .L11 KERNEL1(16 * 10) KERNEL2(16 * 10) KERNEL3(16 * 10) KERNEL4(16 * 10) KERNEL5(16 * 10) KERNEL6(16 * 10) KERNEL7(16 * 10) KERNEL8(16 * 10) cmpl $128 * 11, %eax jle .L11 KERNEL1(16 * 11) KERNEL2(16 * 11) KERNEL3(16 * 11) KERNEL4(16 * 11) KERNEL5(16 * 11) KERNEL6(16 * 11) KERNEL7(16 * 11) KERNEL8(16 * 11) cmpl $128 * 12, %eax jle .L11 KERNEL1(16 * 12) KERNEL2(16 * 12) KERNEL3(16 * 12) KERNEL4(16 * 12) KERNEL5(16 * 12) KERNEL6(16 * 12) KERNEL7(16 * 12) KERNEL8(16 * 12) cmpl $128 * 13, %eax jle .L11 KERNEL1(16 * 13) KERNEL2(16 * 13) KERNEL3(16 * 13) KERNEL4(16 * 13) KERNEL5(16 * 13) KERNEL6(16 * 13) KERNEL7(16 * 13) KERNEL8(16 * 13) cmpl $128 * 14, %eax jle .L11 KERNEL1(16 * 14) KERNEL2(16 * 14) KERNEL3(16 * 14) KERNEL4(16 * 14) KERNEL5(16 * 14) KERNEL6(16 * 14) KERNEL7(16 * 14) KERNEL8(16 * 14) cmpl $128 * 15, %eax jle .L11 KERNEL1(16 * 15) KERNEL2(16 * 15) KERNEL3(16 * 15) KERNEL4(16 * 15) KERNEL5(16 * 15) KERNEL6(16 * 15) KERNEL7(16 * 15) KERNEL8(16 * 15) #else addl $32 * 4 * SIZE, AA addl $32 * 8 * SIZE, BB subl $128 * 8, %eax jg .L1X #endif .L11: leal (AA, %eax, 1), AA # * 16 leal (BB, %eax, 2), BB # * 64 #else sarl $3, %eax je .L12 ALIGN_4 .L11: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $32 * SIZE, BB addl $16 * SIZE, AA decl %eax jne .L11 ALIGN_4 #endif .L12: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movddup ALPHA_R, %xmm1 movddup ALPHA_I, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L14 ALIGN_4 .L13: 
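/* k-remainder loop: handles the leftover (k & 7) iterations one complex rank-1 update at a time, advancing A by 2*SIZE (one complex element) and the B pointer (BB) by 4*SIZE (one entry of each of the two B columns) per pass. */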
mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 ADDSUB %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm7 movddup 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L13 ALIGN_4 .L14: pcmpeqb %xmm0, %xmm0 SHUFPD_1 %xmm5, %xmm5 psllq $63, %xmm0 SHUFPD_1 %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) shufps $0x04, %xmm0, %xmm0 pxor %xmm0, %xmm5 pxor %xmm0, %xmm7 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0x40, %xmm0, %xmm0 pxor %xmm0, %xmm5 pxor %xmm0, %xmm7 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) shufps $0x40, %xmm0, %xmm0 pxor %xmm0, %xmm4 pxor %xmm0, %xmm6 #else shufps $0x40, %xmm0, %xmm0 pxor %xmm0, %xmm4 pxor %xmm0, %xmm6 #endif addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 movaps %xmm4, %xmm5 movaps %xmm6, %xmm7 SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 mulpd %xmm1, %xmm4 mulpd %xmm1, %xmm6 mulpd %xmm3, %xmm5 mulpd %xmm3, %xmm7 addsubpd %xmm5, %xmm4 addsubpd %xmm7, %xmm6 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhpd 1 * SIZE(%esi), %xmm0 movsd 0 * SIZE(%esi, LDC), %xmm2 movhpd 1 * SIZE(%esi, LDC), %xmm2 addpd %xmm0, %xmm4 addpd %xmm2, %xmm6 #endif movsd %xmm4, 0 * SIZE(%esi) movhpd %xmm4, 1 * SIZE(%esi) movsd %xmm6, 0 * SIZE(%esi, LDC) movhpd %xmm6, 1 * SIZE(%esi, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif addl $2 * SIZE, %esi # coffset += 4 decl %ebx # i -- jg .L10 ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (, LDC, 2), %eax movl BB, B addl %eax, C # c += ldc decl J # j -- jg .L01 ALIGN_4 .L100: movl N, %eax testl $1, %eax jle .L500 movl C, %esi # coffset = c movl A, AA # aoffset = a #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx testl %ebx, %ebx jle .L500 ALIGN_4 L110: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movddup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movddup 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifdef PENTIUM4 prefetchnta 4 * SIZE(%esi) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je L112 ALIGN_4 L111: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 4 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm7 movddup 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 5 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 6 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm5 movddup 6 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 7 * 
SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 16 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm7 movddup 16 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 9 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 10 * SIZE(AA), %xmm1 ADDSUB %xmm3, %xmm5 movddup 10 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 11 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 12 * SIZE(AA), %xmm1 ADDSUB %xmm3, %xmm7 movddup 12 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 13 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 14 * SIZE(AA), %xmm1 ADDSUB %xmm3, %xmm5 movddup 14 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 15 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 24 * SIZE(AA), %xmm1 ADDSUB %xmm3, %xmm7 movddup 24 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $16 * SIZE, BB decl %eax jne L111 ALIGN_4 L112: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movddup ALPHA_R, %xmm1 movddup ALPHA_I, %xmm3 andl $7, %eax # if (k & 1) BRANCH je L114 ALIGN_4 L113: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg L113 ALIGN_4 L114: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 pcmpeqb %xmm0, %xmm0 SHUFPD_1 %xmm5, %xmm5 psllq $63, %xmm0 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) shufps $0x04, %xmm0, %xmm0 pxor %xmm0, %xmm5 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0x40, %xmm0, %xmm0 pxor %xmm0, %xmm5 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) shufps $0x40, %xmm0, %xmm0 pxor %xmm0, %xmm4 #else shufps $0x40, %xmm0, %xmm0 pxor %xmm0, %xmm4 #endif addpd %xmm5, %xmm4 movaps %xmm4, %xmm5 SHUFPD_1 %xmm5, %xmm5 mulpd %xmm1, %xmm4 mulpd %xmm3, %xmm5 addsubpd %xmm5, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhpd 1 * SIZE(%esi), %xmm0 addpd %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(%esi) movhpd %xmm4, 1 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif addl $2 * SIZE, %esi # coffset += 4 decl %ebx # i -- jg L110 ALIGN_4 .L500: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm_kernel_2x1_core2.S000066400000000000000000000352011313527062700214410ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA_R 16 + STACK + ARGS(%esi) #define STACK_ALPHA_I 24 + STACK + ARGS(%esi) #define STACK_A 32 + STACK + ARGS(%esi) #define STACK_B 36 + STACK + ARGS(%esi) #define STACK_C 40 + STACK + ARGS(%esi) #define STACK_LDC 44 + STACK + ARGS(%esi) #define STACK_OFFT 48 + STACK + ARGS(%esi) #define ALPHA_R 16(%esp) #define ALPHA_I 32(%esp) #define K 48(%esp) #define N 52(%esp) #define M 56(%esp) #define A 60(%esp) #define C 64(%esp) #define J 68(%esp) #define BX 72(%esp) #define OLD_STACK 76(%esp) #define OFFSET 80(%esp) #define KK 84(%esp) #define KKK 88(%esp) #define BUFFER 128(%esp) #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #define PREFETCH_R (8 * 16 + 0) #define PREFETCH_W (PREFETCH_R * 2) #define PREFETCHSIZE (8 * 16 + 4) #define PREFETCH prefetcht0 #define B %edi #define LDC %ebp #define AA %edx #define BB %ecx #define C1 %esi #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADD1 addpd #define ADD2 addpd #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define ADD1 addpd #define ADD2 subpd #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADD1 subpd #define ADD2 addpd #else #define ADD1 subpd #define ADD2 subpd #endif PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp # align stack addl $STACK_OFFSET, %esp STACK_TOUCHING movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 movd STACK_A, %mm2 movl STACK_B, B movd STACK_C, %mm3 movl STACK_LDC, LDC #ifdef TRMMKERNEL movd STACK_OFFT, %mm4 #endif movsd STACK_ALPHA_R, %xmm0 movsd STACK_ALPHA_I, %xmm1 movddup %xmm0, %xmm0 movddup %xmm1, %xmm1 movapd %xmm0, ALPHA_R movapd %xmm1, ALPHA_I movd %mm1, K movl %eax, N movd %mm0, M movd %mm2, A movd %mm3, C movl %esi, OLD_STACK #ifdef TRMMKERNEL movd %mm4, OFFSET movd %mm4, KK #ifndef LEFT negl KK #endif #endif subl $-16 * SIZE, A subl $-16 * SIZE, B sall $ZBASE_SHIFT, LDC movl %eax, J # j = n testl %eax, %eax jle .L999 ALIGN_2 .L01: leal 16 * SIZE + BUFFER, BB #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sarl $2, %eax jle .L03 ALIGN_2 .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -12 * SIZE(B), %xmm4 movddup -11 * SIZE(B), %xmm5 movddup -10 * SIZE(B), %xmm6 
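/* (inside the .L02 copy loop: eight consecutive doubles of B are movddup-broadcast into both lanes and stored to the aligned BUFFER, so the compute loops below can use plain movapd loads against BB) */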
movddup -9 * SIZE(B), %xmm7 movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) movapd %xmm2, -12 * SIZE(BB) movapd %xmm3, -10 * SIZE(BB) movapd %xmm4, -8 * SIZE(BB) movapd %xmm5, -6 * SIZE(BB) movapd %xmm6, -4 * SIZE(BB) movapd %xmm7, -2 * SIZE(BB) addl $ 8 * SIZE, B subl $-16 * SIZE, BB decl %eax jne .L02 ALIGN_2 .L03: movl K, %eax andl $3, %eax BRANCH jle .L05 ALIGN_2 .L04: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) addl $ 2 * SIZE, B addl $ 4 * SIZE, BB decl %eax jne .L04 ALIGN_4 .L05: movl B, BX movl C, C1 # coffset = c movl A, AA # aoffset = a movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L10: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 16 * SIZE + BUFFER, BB #else leal 16 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB /* because it's doubled */ #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm3 pxor %xmm6, %xmm6 prefetcht0 3 * SIZE(C1) pxor %xmm7, %xmm7 movapd %xmm1, %xmm2 movl BX, %eax prefetcht0 (%eax) subl $-8 * SIZE, %eax movl %eax, BX #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L15 ALIGN_4 .L12: mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 ADD2 %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 ADD1 %xmm2, %xmm6 movapd -12 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm1 movapd -12 * SIZE(AA), %xmm0 ADD2 %xmm1, %xmm7 PADDING; movapd %xmm2, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm2, %xmm4 movapd -10 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm0 ADD2 %xmm0, %xmm5 movapd -10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm6 movapd -8 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 PADDING; movapd 0 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 PADDING; movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 ADD1 %xmm1, %xmm4 movapd -6 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 ADD2 %xmm3, %xmm5 movapd -6 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 ADD1 %xmm2, %xmm6 movapd -4 * SIZE(BB), %xmm2 mulpd %xmm3, %xmm1 movapd -4 * SIZE(AA), %xmm3 ADD2 %xmm1, %xmm7 PADDING; movapd %xmm2, %xmm1 mulpd %xmm3, %xmm2 ADD1 %xmm2, %xmm4 movapd -2 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm3 ADD2 %xmm3, %xmm5 movapd -2 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm1 ADD1 %xmm1, %xmm6 PADDING; movapd 0 * SIZE(BB), %xmm1 mulpd %xmm3, %xmm2 movapd 8 * SIZE(AA), %xmm3 ADD2 %xmm2, %xmm7 PADDING; movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm4 movapd 2 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 ADD2 %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 ADD1 %xmm2, %xmm6 movapd 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm1 movapd 4 * SIZE(AA), %xmm0 ADD2 %xmm1, %xmm7 PADDING; movapd %xmm2, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm2, %xmm4 movapd 6 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm0 ADD2 %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm6 movapd 8 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 movapd 16 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 PADDING; movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 ADD1 %xmm1, %xmm4 movapd 10 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 ADD2 %xmm3, %xmm5 movapd 10 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 ADD1 %xmm2, %xmm6 movapd 12 * SIZE(BB), %xmm2 mulpd %xmm3, %xmm1 
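/* (main unrolled k-loop of this 2x1 core2 kernel: ADD1/ADD2 expand to addpd or subpd according to the NN/NR/RN/RR conjugation variant selected by the #define block near the top of the file) */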
movapd 12 * SIZE(AA), %xmm3 ADD2 %xmm1, %xmm7 PADDING; movapd %xmm2, %xmm1 mulpd %xmm3, %xmm2 ADD1 %xmm2, %xmm4 movapd 14 * SIZE(BB), %xmm2 mulpd %xmm2, %xmm3 subl $-32 * SIZE, BB ADD2 %xmm3, %xmm5 movapd 14 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm1 ADD1 %xmm1, %xmm6 movapd -16 * SIZE(BB), %xmm1 mulpd %xmm3, %xmm2 movapd 24 * SIZE(AA), %xmm3 ADD2 %xmm2, %xmm7 PADDING; movapd %xmm1, %xmm2 subl $-32 * SIZE, AA decl %eax BRANCH jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L18 ALIGN_4 .L16: mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 movapd %xmm1, %xmm3 mulpd %xmm0, %xmm1 movapd -14 * SIZE(AA), %xmm0 ADD2 %xmm1, %xmm5 movapd -12 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm2, %xmm6 mulpd %xmm0, %xmm3 movapd -12 * SIZE(AA), %xmm0 ADD2 %xmm3, %xmm7 movapd %xmm1, %xmm2 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: movapd ALPHA_R, %xmm2 movapd ALPHA_I, %xmm3 SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) addsubpd %xmm5, %xmm4 addsubpd %xmm7, %xmm6 movapd %xmm4, %xmm5 movapd %xmm6, %xmm7 #else addsubpd %xmm4, %xmm5 addsubpd %xmm6, %xmm7 movapd %xmm5, %xmm4 movapd %xmm7, %xmm6 #endif #ifndef TRMMKERNEL movsd 0 * SIZE(C1), %xmm0 movhpd 1 * SIZE(C1), %xmm0 movsd 2 * SIZE(C1), %xmm1 movhpd 3 * SIZE(C1), %xmm1 #endif SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 mulpd %xmm2, %xmm4 mulpd %xmm2, %xmm6 mulpd %xmm3, %xmm5 mulpd %xmm3, %xmm7 addsubpd %xmm5, %xmm4 addsubpd %xmm7, %xmm6 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm0, %xmm4 addpd %xmm1, %xmm6 #endif movsd %xmm4, 0 * SIZE(C1) movhpd %xmm4, 1 * SIZE(C1) movsd %xmm6, 2 * SIZE(C1) movhpd %xmm6, 3 * SIZE(C1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $4 * SIZE, C1 # coffset += 4 decl %ebx # i -- jg .L10 .L20: movl M, %ebx testl $1, %ebx je .L29 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 16 * SIZE + BUFFER, %ecx #else leal 16 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB /* because it's doubled */ #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movapd -8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax addl $1, %eax movl %eax, KKK #endif sarl $3, %eax jle .L22 .L21: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BB), %xmm0 ADD1 %xmm1, %xmm4 movapd -12 * SIZE(BB), %xmm1 ADD2 %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm1 mulpd -10 * SIZE(BB), %xmm0 ADD1 %xmm1, %xmm6 movapd 0 * SIZE(BB), %xmm1 ADD2 %xmm0, %xmm7 movapd -12 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd -6 * SIZE(BB), %xmm0 ADD1 %xmm3, %xmm4 movapd -4 * SIZE(BB), %xmm3 ADD2 %xmm0, %xmm5 movapd -10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd -2 * SIZE(BB), %xmm0 ADD1 %xmm3, %xmm6 movapd 8 * SIZE(BB), %xmm3 ADD2 %xmm0, %xmm7 movapd 0 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm1 mulpd 2 * 
SIZE(BB), %xmm2 ADD1 %xmm1, %xmm4 movapd 4 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 movapd -6 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm1 mulpd 6 * SIZE(BB), %xmm2 ADD1 %xmm1, %xmm6 movapd 16 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm7 movapd -4 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm3 mulpd 10 * SIZE(BB), %xmm2 ADD1 %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm5 movapd -2 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm3 mulpd 14 * SIZE(BB), %xmm2 ADD1 %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 movapd 8 * SIZE(AA), %xmm2 subl $-16 * SIZE, AA addl $ 32 * SIZE, BB decl %eax # l-- jg .L21 ALIGN_2 .L22: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax # l = (k & 3) jle .L24 ALIGN_2 .L23: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BB), %xmm0 ADD1 %xmm1, %xmm4 movapd -12 * SIZE(BB), %xmm1 ADD2 %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax # l-- jg .L23 .L24: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 movapd ALPHA_R, %xmm2 movapd ALPHA_I, %xmm3 SHUFPD_1 %xmm5, %xmm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) addsubpd %xmm5, %xmm4 movapd %xmm4, %xmm5 #else addsubpd %xmm4, %xmm5 movapd %xmm5, %xmm4 #endif #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(C1), %xmm0 movhpd 1 * SIZE(C1), %xmm0 #endif SHUFPD_1 %xmm5, %xmm5 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm5 addsubpd %xmm5, %xmm4 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(C1) movhpd %xmm4, 1 * SIZE(C1) ALIGN_2 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addl $1, KK #endif addl LDC, C # c += ldc decl J # j -- jg .L01 .L999: movl OLD_STACK, %esp emms popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_2 EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm_kernel_2x1_sse2.S000066400000000000000000000444441313527062700213140ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCHSIZE (8 * 4) #if !defined(HAVE_SSE2) || !defined(HAVE_MMX) #error You have to check your configuration. #endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA_R 16 + STACK + ARGS(%esi) #define STACK_ALPHA_I 24 + STACK + ARGS(%esi) #define STACK_A 32 + STACK + ARGS(%esi) #define STACK_B 36 + STACK + ARGS(%esi) #define STACK_C 40 + STACK + ARGS(%esi) #define STACK_LDC 44 + STACK + ARGS(%esi) #define STACK_OFFT 48 + STACK + ARGS(%esi) #define POSINV 0(%esp) #define ALPHA_R 16(%esp) #define ALPHA_I 32(%esp) #define K 48(%esp) #define N 52(%esp) #define M 56(%esp) #define A 60(%esp) #define C 64(%esp) #define J 68(%esp) #define BX 72(%esp) #define OLD_STACK 76(%esp) #define OFFSET 80(%esp) #define KK 84(%esp) #define KKK 88(%esp) #define BUFFER 128(%esp) #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #define B %edi #define LDC %ebp #define AA %edx #define BB %ecx #define KERNEL1(address) \ movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm0, %xmm2; \ mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL3(address) \ movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm1, %xmm3; \ mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL4(address) \ mulpd %xmm1, %xmm3; \ mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL5(address) \ movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm0, %xmm2; \ mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd 
%xmm0, %xmm5; \ movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL6(address) \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL7(address) \ movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm1, %xmm3; \ mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulpd %xmm1, %xmm3; \ mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp # align stack addl $STACK_OFFSET, %esp STACK_TOUCHING movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 movd STACK_A, %mm2 movl STACK_B, B movd STACK_C, %mm3 movl STACK_LDC, LDC #ifdef TRMMKERNEL movd STACK_OFFT, %mm4 #endif movsd STACK_ALPHA_R, %xmm0 movsd STACK_ALPHA_I, %xmm1 pxor %xmm7, %xmm7 cmpeqpd %xmm7, %xmm7 psllq $63, %xmm7 # Generate mask pxor %xmm2, %xmm2 movsd %xmm0, 0 + ALPHA_R movsd %xmm0, 8 + ALPHA_R movsd %xmm1, 8 + ALPHA_I xorpd %xmm7, %xmm1 movsd %xmm1, 0 + ALPHA_I #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) movsd %xmm7, 0 + POSINV movsd %xmm2, 8 + POSINV #else movsd %xmm2, 0 + POSINV movsd %xmm7, 8 + POSINV #endif movd %mm1, K movl %eax, N movd %mm0, M movd %mm2, A movd %mm3, C movl %esi, OLD_STACK #ifdef TRMMKERNEL movd %mm4, OFFSET movd %mm4, KK #ifndef LEFT negl KK #endif #endif sall $ZBASE_SHIFT, LDC movl %eax, J # j = n testl %eax, %eax jle .L999 ALIGN_2 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif leal BUFFER, BB movapd POSINV, %xmm7 movl K, %eax sarl $2, %eax jle .L03 ALIGN_2 .L02: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 unpcklpd %xmm2, %xmm2 unpcklpd %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 #else xorpd %xmm7, %xmm0 xorpd %xmm7, %xmm2 #endif movapd %xmm0, 0 * SIZE(BB) movapd %xmm1, 2 * SIZE(BB) movapd %xmm2, 4 * SIZE(BB) movapd %xmm3, 6 * SIZE(BB) movsd 4 * 
SIZE(B), %xmm0 movsd 5 * SIZE(B), %xmm1 movsd 6 * SIZE(B), %xmm2 movsd 7 * SIZE(B), %xmm3 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 unpcklpd %xmm2, %xmm2 unpcklpd %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 #else xorpd %xmm7, %xmm0 xorpd %xmm7, %xmm2 #endif movapd %xmm0, 8 * SIZE(BB) movapd %xmm1, 10 * SIZE(BB) movapd %xmm2, 12 * SIZE(BB) movapd %xmm3, 14 * SIZE(BB) prefetcht0 104 * SIZE(B) addl $ 8 * SIZE, B addl $16 * SIZE, BB decl %eax jne .L02 ALIGN_2 .L03: movl K, %eax andl $3, %eax BRANCH jle .L05 ALIGN_2 .L04: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorpd %xmm7, %xmm1 #else xorpd %xmm7, %xmm0 #endif movapd %xmm0, 0 * SIZE(BB) movapd %xmm1, 2 * SIZE(BB) addl $ 2 * SIZE, B addl $ 4 * SIZE, BB decl %eax jne .L04 ALIGN_4 .L05: movl B, BX movl C, %esi # coffset = c movl A, AA # aoffset = a movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L50 ALIGN_4 .L10: movl BX, %eax prefetcht2 0 * SIZE(%eax) subl $-8 * SIZE, BX #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movapd 0 * SIZE + BUFFER, %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE + BUFFER, %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB /* because it's doubled */ movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #endif prefetchnta 3 * SIZE(%esi) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif andl $-8, %eax NOBRANCH je .L12 sall $3, %eax .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) cmpl $64 * 1, %eax NOBRANCH jle .L11 KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) cmpl $64 * 2, %eax NOBRANCH jle .L11 KERNEL1(32 * 2) KERNEL2(32 * 2) KERNEL3(32 * 2) KERNEL4(32 * 2) KERNEL5(32 * 2) KERNEL6(32 * 2) KERNEL7(32 * 2) KERNEL8(32 * 2) cmpl $64 * 3, %eax NOBRANCH jle .L11 KERNEL1(32 * 3) KERNEL2(32 * 3) KERNEL3(32 * 3) KERNEL4(32 * 3) KERNEL5(32 * 3) KERNEL6(32 * 3) KERNEL7(32 * 3) KERNEL8(32 * 3) cmpl $64 * 4, %eax NOBRANCH jle .L11 KERNEL1(32 * 4) KERNEL2(32 * 4) KERNEL3(32 * 4) KERNEL4(32 * 4) KERNEL5(32 * 4) KERNEL6(32 * 4) KERNEL7(32 * 4) KERNEL8(32 * 4) cmpl $64 * 5, %eax NOBRANCH jle .L11 KERNEL1(32 * 5) KERNEL2(32 * 5) KERNEL3(32 * 5) KERNEL4(32 * 5) KERNEL5(32 * 5) KERNEL6(32 * 5) KERNEL7(32 * 5) KERNEL8(32 * 5) cmpl $64 * 6, %eax NOBRANCH jle .L11 KERNEL1(32 * 6) KERNEL2(32 * 6) KERNEL3(32 * 6) KERNEL4(32 * 6) KERNEL5(32 * 6) KERNEL6(32 * 6) KERNEL7(32 * 6) KERNEL8(32 * 6) cmpl $64 * 7, %eax NOBRANCH jle .L11 KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) addl $64 * 4 * SIZE, AA 
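/* pointer bookkeeping after the eight unrolled KERNEL1..KERNEL8 groups: AA has just been advanced; the matching BB advance and the trip-count update follow, then control branches back to .L1X while iterations remain */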
addl $64 * 4 * SIZE, BB subl $64 * 8, %eax BRANCH jg .L1X .L11: leal (BB, %eax, 4), BB leal (AA, %eax, 4), AA .L12: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 .L13: movapd 2 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd 0 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm1 movapd 2 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movapd 2 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movapd 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm1 movapd 4 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 addl $4 * SIZE, AA # aoffset += 8 addl $4 * SIZE, BB # boffset1 += 8 subl $1, %eax jg .L13 .L14: movapd ALPHA_R, %xmm2 movapd ALPHA_I, %xmm3 SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 #else addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #endif movapd %xmm4, %xmm5 movapd %xmm6, %xmm7 SHUFPD_1 %xmm4, %xmm4 SHUFPD_1 %xmm6, %xmm6 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhpd 1 * SIZE(%esi), %xmm0 movsd 2 * SIZE(%esi), %xmm1 movhpd 3 * SIZE(%esi), %xmm1 addpd %xmm0, %xmm4 addpd %xmm1, %xmm6 #endif movsd %xmm4, 0 * SIZE(%esi) movhpd %xmm4, 1 * SIZE(%esi) movsd %xmm6, 2 * SIZE(%esi) movhpd %xmm6, 3 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $4 * SIZE, %esi # coffset += 4 decl %ebx # i -- jg .L10 .L50: movl M, %ebx testl $1, %ebx je .L99 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, %ecx movapd 0 * SIZE + BUFFER, %xmm1 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE + BUFFER, %xmm2 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB /* because it's doubled */ movapd 0 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax addl $1, %eax movl %eax, KKK #endif sarl $2, %eax # l = (k >> 2) jle .L52 .L51: mulpd %xmm0, %xmm1 movapd 2 * SIZE(BB), %xmm3 addpd %xmm1, %xmm4 movapd 16 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm3 movapd 2 * SIZE(AA), %xmm0 addpd %xmm3, %xmm5 movapd 4 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 addpd %xmm0, %xmm5 movapd 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 10 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 addpd %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 movapd 12 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd 24 * SIZE(BB), %xmm2 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm0, %xmm5 movapd 8 * SIZE(AA), %xmm0 addl $ 8 * SIZE, AA # aoffset += 2 addl $16 * SIZE, BB # boffset1 += 4 decl %eax # l-- jg .L51 ALIGN_2 .L52: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $3, %eax # l = (k & 3) jle .L54 ALIGN_2 .L53: movapd 0 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm1 addpd 
%xmm1, %xmm4 movapd 2 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm5 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA # aoffset += 2 addl $4 * SIZE, BB # boffset1 += 4 decl %eax # l-- jg .L53 .L54: movapd ALPHA_R, %xmm2 movapd ALPHA_I, %xmm3 SHUFPD_1 %xmm5, %xmm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm5, %xmm4 #else addpd %xmm5, %xmm4 #endif movapd %xmm4, %xmm5 SHUFPD_1 %xmm4, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm5, %xmm4 #ifndef TRMMKERNEL SHUFPD_2 %xmm4, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhpd 1 * SIZE(%esi), %xmm0 addpd %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(%esi) movhpd %xmm4, 1 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_2 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $1, KK #endif addl LDC, C # c += ldc decl J # j -- jg .L01 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_2 EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm_kernel_2x2_barcelona.S000066400000000000000000000715131313527062700223640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA_R 16 + STACK + ARGS(%esi) #define STACK_ALPHA_I 20 + STACK + ARGS(%esi) #define STACK_A 24 + STACK + ARGS(%esi) #define STACK_B 28 + STACK + ARGS(%esi) #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) #define POSINV 0(%esp) #define ALPHA_R 16(%esp) #define ALPHA_I 32(%esp) #define K 48(%esp) #define N 52(%esp) #define M 56(%esp) #define A 60(%esp) #define C 64(%esp) #define J 68(%esp) #define OLD_STACK 72(%esp) #define OFFSET 76(%esp) #define KK 80(%esp) #define KKK 84(%esp) #define BUFFER 128(%esp) #define B %edi #define LDC %ebp #define AA %edx #define BB %ecx #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #define PREFETCH prefetch #define PREFETCHSIZE (16 * 17 + 0) #define RPREFETCHSIZE (16 * 9 + 0) #define WPREFETCHSIZE (16 * 9 + 0) #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ addps %xmm2, %xmm4; \ movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 84 * 
SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp # align stack addl $STACK_OFFSET, %esp STACK_TOUCHING movl STACK_M, %ebx movl STACK_N, %eax movl STACK_K, %ecx movl STACK_A, %edx movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movl STACK_B, %edi movl STACK_C, %ebx #ifdef TRMMKERNEL movss STACK_OFFT, %xmm4 #endif movss STACK_ALPHA_R, %xmm0 movss STACK_ALPHA_I, %xmm1 xorps %xmm7, %xmm7 cmpeqps %xmm7, %xmm7 pslld $31, %xmm7 # Generate mask xorps %xmm2, %xmm2 shufps $0, %xmm0, %xmm0 movaps %xmm0, 0 + ALPHA_R movss %xmm1, 4 + ALPHA_I movss %xmm1, 12 + ALPHA_I xorps %xmm7, %xmm1 movss %xmm1, 0 + ALPHA_I movss %xmm1, 8 + ALPHA_I #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) movss %xmm7, 0 + POSINV movss %xmm2, 4 + POSINV movss %xmm7, 8 + POSINV movss %xmm2, 12 + POSINV #else movss %xmm2, 0 + POSINV movss %xmm7, 4 + POSINV movss %xmm2, 8 + POSINV movss %xmm7, 12 + POSINV #endif EMMS movl %ebx, C movl STACK_LDC, LDC #ifdef TRMMKERNEL movss %xmm4, OFFSET movss %xmm4, KK #ifndef LEFT negl KK #endif #endif sall $ZBASE_SHIFT, LDC movl %eax, J # j = n sarl $1, J jle .L100 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ leal BUFFER, %ecx movaps POSINV, %xmm7 movl K, %eax sarl $1, %eax jle .L03 ALIGN_4 .L02: prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) movaps 0 * SIZE(%edi), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else xorps %xmm7, %xmm0 xorps %xmm7, %xmm2 #endif prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps 4 * SIZE(%edi), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || 
defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else xorps %xmm7, %xmm0 xorps %xmm7, %xmm2 #endif prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) movaps %xmm0, 16 * SIZE(%ecx) movaps %xmm1, 20 * SIZE(%ecx) movaps %xmm2, 24 * SIZE(%ecx) movaps %xmm3, 28 * SIZE(%ecx) addl $ 8 * SIZE, %edi subl $-32 * SIZE, %ecx decl %eax jne .L02 ALIGN_4 .L03: movl K, %eax andl $1, %eax BRANCH jle .L05 ALIGN_4 .L04: movaps 0 * SIZE(%edi), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else xorps %xmm7, %xmm0 xorps %xmm7, %xmm2 #endif movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) addl $ 4 * SIZE, %edi ALIGN_4 .L05: movl C, %esi movl A, %edx movl M, %ebx sarl $1, %ebx jle .L30 ALIGN_4 .L10: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB #endif movaps 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps 16 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movaps 16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 prefetchw 3 * SIZE(%esi) prefetchw 3 * SIZE(%esi, LDC) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif andl $-8, %eax sall $4, %eax je .L15 .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) cmpl $128 * 1, %eax jle .L12 KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) cmpl $128 * 2, %eax jle .L12 KERNEL1(32 * 2) KERNEL2(32 * 2) KERNEL3(32 * 2) KERNEL4(32 * 2) KERNEL5(32 * 2) KERNEL6(32 * 2) KERNEL7(32 * 2) KERNEL8(32 * 2) cmpl $128 * 3, %eax jle .L12 KERNEL1(32 * 3) KERNEL2(32 * 3) KERNEL3(32 * 3) KERNEL4(32 * 3) KERNEL5(32 * 3) KERNEL6(32 * 3) KERNEL7(32 * 3) KERNEL8(32 * 3) cmpl $128 * 4, %eax jle .L12 KERNEL1(32 * 4) KERNEL2(32 * 4) KERNEL3(32 * 4) KERNEL4(32 * 4) KERNEL5(32 * 4) KERNEL6(32 * 4) KERNEL7(32 * 4) KERNEL8(32 * 4) cmpl $128 * 5, %eax jle .L12 KERNEL1(32 * 5) KERNEL2(32 * 5) KERNEL3(32 * 5) KERNEL4(32 * 5) KERNEL5(32 * 5) KERNEL6(32 * 5) KERNEL7(32 * 5) KERNEL8(32 * 5) cmpl $128 * 6, %eax jle .L12 KERNEL1(32 * 6) KERNEL2(32 * 6) KERNEL3(32 * 6) KERNEL4(32 * 6) KERNEL5(32 * 6) KERNEL6(32 * 6) KERNEL7(32 * 6) KERNEL8(32 * 6) cmpl $128 * 7, %eax jle .L12 KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) addl $128 * 8 * SIZE, BB addl $128 * 2 * SIZE, AA subl $128 * 8, %eax jg .L1X jmp .L15 .L12: leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA_R, %xmm1 movaps ALPHA_I, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L14 ALIGN_4 .L13: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 
12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 4 * SIZE(AA), %xmm0 addl $ 4 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L13 ALIGN_4 .L14: shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm5, %xmm4 subps %xmm7, %xmm6 #else addps %xmm5, %xmm4 addps %xmm7, %xmm6 #endif movaps %xmm4, %xmm5 movaps %xmm6, %xmm7 shufps $0xb1, %xmm4, %xmm4 shufps $0xb1, %xmm6, %xmm6 mulps %xmm1, %xmm5 mulps %xmm3, %xmm4 mulps %xmm1, %xmm7 mulps %xmm3, %xmm6 addps %xmm5, %xmm4 addps %xmm7, %xmm6 #ifndef TRMMKERNEL shufps $0xe4, %xmm0, %xmm0 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 shufps $0xe4, %xmm2, %xmm2 movsd 0 * SIZE(%esi, LDC), %xmm2 movhps 2 * SIZE(%esi, LDC), %xmm2 addps %xmm0, %xmm4 addps %xmm2, %xmm6 #endif movlps %xmm4, 0 * SIZE(%esi) movhps %xmm4, 2 * SIZE(%esi) movlps %xmm6, 0 * SIZE(%esi, LDC) movhps %xmm6, 2 * SIZE(%esi, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $4 * SIZE, %esi # coffset += 4 decl %ebx # i -- jg .L10 ALIGN_4 .L30: movl M, %ebx andl $1, %ebx jle .L99 ALIGN_4 .L40: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movsd 0 * SIZE(AA), %xmm0 movsd 8 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L42 ALIGN_4 .L41: mulps %xmm0, %xmm2 prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movsd 2 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 mulps 28 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 48 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movsd 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 44 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 64 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movsd 6 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 mulps 60 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 80 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) addps %xmm2, %xmm4 movaps 68 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movaps 72 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 mulps 76 * 
SIZE(BB), %xmm1 addps %xmm2, %xmm6 movaps 96 * SIZE(BB), %xmm2 addps %xmm1, %xmm7 movsd 10 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 84 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 88 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 mulps 92 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 112 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movsd 12 * SIZE(AA), %xmm1 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 100 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movaps 104 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 mulps 108 * SIZE(BB), %xmm1 addps %xmm2, %xmm6 movaps 128 * SIZE(BB), %xmm2 addps %xmm1, %xmm7 movsd 14 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 116 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 120 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 mulps 124 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 144 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movsd 24 * SIZE(AA), %xmm1 addl $ 16 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L41 ALIGN_4 .L42: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA_R, %xmm1 movaps ALPHA_I, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L44 ALIGN_4 .L43: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movsd 2 * SIZE(AA), %xmm0 addl $ 2 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L43 ALIGN_4 .L44: shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm5, %xmm4 subps %xmm7, %xmm6 #else addps %xmm5, %xmm4 addps %xmm7, %xmm6 #endif movaps %xmm4, %xmm5 movaps %xmm6, %xmm7 shufps $0xb1, %xmm4, %xmm4 shufps $0xb1, %xmm6, %xmm6 mulps %xmm1, %xmm5 mulps %xmm3, %xmm4 mulps %xmm1, %xmm7 mulps %xmm3, %xmm6 addps %xmm5, %xmm4 addps %xmm7, %xmm6 #ifndef TRMMKERNEL shufps $0xe4, %xmm4, %xmm4 shufps $0xe4, %xmm6, %xmm6 movsd 0 * SIZE(%esi), %xmm0 movsd 0 * SIZE(%esi, LDC), %xmm2 addps %xmm0, %xmm4 addps %xmm2, %xmm6 #endif movlps %xmm4, 0 * SIZE(%esi) movlps %xmm6, 0 * SIZE(%esi, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (LDC, LDC), %eax addl %eax, C # c += 2 * ldc decl J # j -- jg .L01 ALIGN_4 .L100: movl N, %eax andl $1, %eax jle .L999 ALIGN_4 .L101: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ leal BUFFER, %ecx movaps POSINV, %xmm7 movl K, %eax sarl $2, %eax jle .L103 ALIGN_4 .L102: prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) movaps 0 * SIZE(%edi), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else xorps %xmm7, %xmm0 xorps %xmm7, %xmm2 #endif prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) movaps %xmm2, 8 * SIZE(%ecx) movaps %xmm3, 12 * SIZE(%ecx) movaps 4 * SIZE(%edi), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, 
%xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else xorps %xmm7, %xmm0 xorps %xmm7, %xmm2 #endif prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) movaps %xmm0, 16 * SIZE(%ecx) movaps %xmm1, 20 * SIZE(%ecx) movaps %xmm2, 24 * SIZE(%ecx) movaps %xmm3, 28 * SIZE(%ecx) addl $ 8 * SIZE, B subl $-32 * SIZE, BB decl %eax jne .L102 ALIGN_4 .L103: movl K, %eax andl $3, %eax BRANCH jle .L105 ALIGN_4 .L104: movsd 0 * SIZE(%edi), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 #else xorps %xmm7, %xmm0 #endif movaps %xmm0, 0 * SIZE(%ecx) movaps %xmm1, 4 * SIZE(%ecx) addl $ 2 * SIZE, %edi addl $ 8 * SIZE, %ecx decl %eax jne .L104 ALIGN_4 .L105: movl C, %esi movl A, AA movl M, %ebx sarl $1, %ebx jle .L130 ALIGN_4 .L110: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movaps 0 * SIZE(AA), %xmm0 movaps 16 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 prefetchw 3 * SIZE(%esi) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L112 ALIGN_4 .L111: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 8 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movaps 12 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movaps 32 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movaps 20 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movaps 24 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 28 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 48 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 addl $ 32 * SIZE, AA addl $ 64 * SIZE, BB decl %eax jne .L111 ALIGN_4 .L112: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA_R, %xmm1 movaps ALPHA_I, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L114 ALIGN_4 .L113: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 addl $ 4 * SIZE, AA addl $ 8 * SIZE, BB decl 
%eax jg .L113 ALIGN_4 .L114: addps %xmm6, %xmm4 addps %xmm7, %xmm5 shufps $0xb1, %xmm5, %xmm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm5, %xmm4 #else addps %xmm5, %xmm4 #endif movaps %xmm4, %xmm5 shufps $0xb1, %xmm4, %xmm4 mulps %xmm1, %xmm5 mulps %xmm3, %xmm4 addps %xmm5, %xmm4 #ifndef TRMMKERNEL shufps $0xe4, %xmm4, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 addps %xmm0, %xmm4 #endif movlps %xmm4, 0 * SIZE(%esi) movhps %xmm4, 2 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $4 * SIZE, %esi # coffset += 4 decl %ebx # i -- jg .L110 ALIGN_4 .L130: movl M, %ebx andl $1, %ebx jle .L999 ALIGN_4 .L140: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif movsd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movaps 16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L142 ALIGN_4 .L141: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 addl $ 16 * SIZE, AA addl $ 64 * SIZE, BB decl %eax jne .L141 ALIGN_4 .L142: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA_R, %xmm1 movaps ALPHA_I, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L144 ALIGN_4 .L143: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movsd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L143 ALIGN_4 .L144: addps %xmm6, %xmm4 addps %xmm7, %xmm5 shufps $0xb1, %xmm5, %xmm5 #if 
defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm5, %xmm4 #else addps %xmm5, %xmm4 #endif movaps %xmm4, %xmm5 shufps $0xb1, %xmm4, %xmm4 mulps %xmm1, %xmm5 mulps %xmm3, %xmm4 addps %xmm5, %xmm4 #ifndef TRMMKERNEL shufps $0xe4, %xmm4, %xmm4 movsd 0 * SIZE(%esi), %xmm0 addps %xmm0, %xmm4 #endif movlps %xmm4, 0 * SIZE(%esi) ALIGN_4 .L999: EMMS movl OLD_STACK, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm_kernel_2x2_penryn.S000066400000000000000000000571631313527062700217560ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 20 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define ARG_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define ARG_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) #ifdef NANO #define PREFETCHSIZE (16 * 3 + 8) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 #endif #if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCHSIZE (16 * 1 + 8) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 #endif #ifndef PREFETCH #define PREFETCH prefetcht0 #endif #ifndef PREFETCHW #define PREFETCHW prefetcht0 #endif #ifndef PREFETCHB #define PREFETCHB prefetcht0 #endif #ifndef PREFETCHSIZE #define PREFETCHSIZE (16 * 13 + 8) #endif #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define C1 %esi #define I %ebx #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADD1 addps #define ADD2 addps #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define ADD1 addps #define ADD2 addps #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADD1 addps #define ADD2 addps #else #define ADD1 addps #define ADD2 subps #endif PROLOGUE subl $ARGS, %esp # Generate Stack Frame pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC #ifdef TRMMKERNEL movl OFFSET, %eax #ifndef LEFT negl %eax #endif movl %eax, KK #endif subl $-32 * SIZE, A subl $-32 * SIZE, B sall $ZBASE_SHIFT, LDC movl N, %eax sarl $1, %eax movl %eax, J jle .L30 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl B, BX movl C, C1 movl A, AA movl M, %ebx sarl $1, %ebx jle .L20 ALIGN_4 .L10: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #endif movl BX, %eax PREFETCHB -32 * SIZE(%eax) subl $-16 * SIZE, %eax movl %eax, BX movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 xorps %xmm4, %xmm4 PREFETCHW 3 * SIZE(C1) xorps %xmm5, %xmm5 PREFETCHW 7 * SIZE(C1, LDC) xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 pshufd 
$0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -8 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 subl $-32 * SIZE, BB pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 subl $-32 * SIZE, AA pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -32 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -32 * SIZE(AA), %xmm0 decl %eax jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L18 ALIGN_4 .L16: ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: ADD2 %xmm2, %xmm7 pcmpeqb %xmm0, %xmm0 ADD1 %xmm3, %xmm6 psllq $63, %xmm0 movsd ALPHA_R, %xmm3 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) pxor %xmm0, %xmm4 pxor %xmm0, %xmm6 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) pshufd $0xb1, %xmm0, %xmm0 pxor %xmm0, %xmm5 pxor %xmm0, %xmm7 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) pxor %xmm0, %xmm5 pxor %xmm0, %xmm7 #endif haddps %xmm5, %xmm4 haddps %xmm7, %xmm6 shufps $0xd8, %xmm4, %xmm4 shufps $0xd8, %xmm6, %xmm6 movaps %xmm4, %xmm5 shufps $0xe4, %xmm6, %xmm4 shufps $0xe4, %xmm5, %xmm6 pshufd $0x00, %xmm3, %xmm2 pshufd $0x55, %xmm3, %xmm3 pshufd $0xb1, %xmm4, %xmm5 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm2, %xmm4 mulps %xmm3, %xmm5 mulps %xmm2, %xmm6 mulps %xmm3, %xmm7 addsubps %xmm5, %xmm4 addsubps %xmm7, %xmm6 #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(C1), %xmm2 movhps 2 * SIZE(C1), %xmm2 movsd 0 * SIZE(C1, LDC), %xmm3 movhps 2 * SIZE(C1, LDC), %xmm3 addps %xmm2, %xmm4 addps %xmm3, %xmm6 #endif movsd %xmm4, 0 * SIZE(C1) movhps %xmm4, 2 * SIZE(C1) movsd %xmm6, 0 * SIZE(C1, LDC) movhps %xmm6, 2 * SIZE(C1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $4 * SIZE, C1 decl %ebx jg .L10 ALIGN_4 .L20: movl M, %ebx testl $1, %ebx jle .L29 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -30 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -26 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -12 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -22 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -8 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, 
%xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -18 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps 0 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -16 * SIZE(AA), %xmm0 subl $-16 * SIZE, AA subl $-32 * SIZE, BB decl %eax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax BRANCH je .L28 ALIGN_4 .L26: addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -30 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L26 ALIGN_4 .L28: addps %xmm2, %xmm6 addps %xmm3, %xmm7 movsd ALPHA_R, %xmm3 pshufd $0xb1, %xmm5, %xmm5 pcmpeqb %xmm0, %xmm0 pshufd $0xb1, %xmm7, %xmm7 psllq $63, %xmm0 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) pxor %xmm0, %xmm5 pxor %xmm0, %xmm7 subps %xmm5, %xmm4 subps %xmm7, %xmm6 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) pxor %xmm0, %xmm5 pxor %xmm0, %xmm7 addps %xmm5, %xmm4 addps %xmm7, %xmm6 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) pxor %xmm0, %xmm4 pxor %xmm0, %xmm6 addps %xmm5, %xmm4 addps %xmm7, %xmm6 #else pxor %xmm0, %xmm4 pxor %xmm0, %xmm6 subps %xmm5, %xmm4 subps %xmm7, %xmm6 #endif pshufd $0x00, %xmm3, %xmm2 pshufd $0x55, %xmm3, %xmm3 pshufd $0xb1, %xmm4, %xmm5 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm2, %xmm4 mulps %xmm3, %xmm5 mulps %xmm2, %xmm6 mulps %xmm3, %xmm7 pxor %xmm0, %xmm5 pxor %xmm0, %xmm7 subps %xmm5, %xmm4 subps %xmm7, %xmm6 #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(C1), %xmm2 movsd 0 * SIZE(C1, LDC), %xmm3 addps %xmm2, %xmm4 addps %xmm3, %xmm6 #endif movsd %xmm4, 0 * SIZE(C1) movsd %xmm6, 0 * SIZE(C1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif addl $2 * SIZE, C1 ALIGN_2 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif movl BB, B leal (, LDC, 2), %eax addl %eax, C decl J jg .L01 ALIGN_4 .L30: movl N, %eax testl $1, %eax jle .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl C, C1 movl A, AA movl M, %ebx sarl $1, %ebx jle .L40 ALIGN_4 .L31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 prefetcht0 3 * SIZE(C1) pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L35 ALIGN_4 .L32: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -16 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -12 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -8 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -4 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps 0 * SIZE(AA), %xmm0 subl $-32 * SIZE, AA subl $-16 * SIZE, BB decl %eax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movsd -32 * SIZE(BB), %xmm1 andl $7, %eax BRANCH je .L38 ALIGN_4 .L36: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L36 ALIGN_4 .L38: addps %xmm2, %xmm4 addps 
%xmm3, %xmm5 movsd ALPHA_R, %xmm3 pshufd $0xb1, %xmm5, %xmm5 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) pxor %xmm0, %xmm5 subps %xmm5, %xmm4 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) pxor %xmm0, %xmm5 addps %xmm5, %xmm4 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) pxor %xmm0, %xmm4 addps %xmm5, %xmm4 #else pxor %xmm0, %xmm4 subps %xmm5, %xmm4 #endif pshufd $0x00, %xmm3, %xmm2 pshufd $0x55, %xmm3, %xmm3 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm2, %xmm4 mulps %xmm3, %xmm5 pxor %xmm0, %xmm5 subps %xmm5, %xmm4 #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(C1), %xmm2 movhps 2 * SIZE(C1), %xmm2 addps %xmm2, %xmm4 #endif movsd %xmm4, 0 * SIZE(C1) movhps %xmm4, 2 * SIZE(C1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $4 * SIZE, C1 decl %ebx jg .L31 ALIGN_4 .L40: movl M, %ebx testl $1, %ebx jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl B, BB #else movl B, BB movl KK, %eax leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movsd -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L45 ALIGN_4 .L42: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -30 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -26 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -26 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -22 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -22 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -18 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -18 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -16 * SIZE(AA), %xmm0 subl $-16 * SIZE, AA subl $-16 * SIZE, BB decl %eax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl 
$7, %eax BRANCH je .L48 ALIGN_4 .L46: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -30 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L46 ALIGN_4 .L48: addps %xmm2, %xmm4 addps %xmm3, %xmm5 movsd ALPHA_R, %xmm3 pshufd $0xb1, %xmm5, %xmm5 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) pxor %xmm0, %xmm5 subps %xmm5, %xmm4 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) pxor %xmm0, %xmm5 addps %xmm5, %xmm4 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) pxor %xmm0, %xmm4 addps %xmm5, %xmm4 #else pxor %xmm0, %xmm4 subps %xmm5, %xmm4 #endif pshufd $0x00, %xmm3, %xmm2 pshufd $0x55, %xmm3, %xmm3 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm2, %xmm4 mulps %xmm3, %xmm5 pxor %xmm0, %xmm5 subps %xmm5, %xmm4 #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(C1), %xmm2 addps %xmm2, %xmm4 #endif movsd %xmm4, 0 * SIZE(C1) ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm_kernel_2x2_sse.S000066400000000000000000001034241313527062700212250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA_R 16 + STACK + ARGS(%esi) #define STACK_ALPHA_I 20 + STACK + ARGS(%esi) #define STACK_A 24 + STACK + ARGS(%esi) #define STACK_B 28 + STACK + ARGS(%esi) #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) #define POSINV 0(%esp) #define ALPHA_R 16(%esp) #define ALPHA_I 32(%esp) #define K 48(%esp) #define N 52(%esp) #define M 56(%esp) #define A 60(%esp) #define C 64(%esp) #define J 68(%esp) #define OLD_STACK 72(%esp) #define OFFSET 76(%esp) #define KK 80(%esp) #define KKK 84(%esp) #define BUFFER 128(%esp) #define B %edi #define LDC %ebp #define AA %edx #define BB %ecx #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #ifdef ATHLON #define PREFETCHSIZE 64 #define WPREFETCHSIZE 80 #define PREFETCH prefetch #define PREFETCHW prefetchw #endif #if defined(OPTERON) || defined(BARCELONA) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch #define PREFETCHW prefetchw #endif #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE 168 #endif #if defined(OPTERON) || !defined(HAVE_SSE2) #define movsd movlps #endif #ifdef HAVE_SSE2 #define xorps pxor #endif #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 68 * SIZE + 
(address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; #endif #ifdef PENTIUM4 #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ addps %xmm2, %xmm5; \ movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 32 * SIZE + 
(address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #endif PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp # align stack addl $STACK_OFFSET, %esp STACK_TOUCHING movl STACK_M, %ebx movl STACK_N, %eax movl STACK_K, %ecx movl STACK_A, %edx movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movl STACK_B, %edi movl STACK_C, %ebx #ifdef TRMMKERNEL movss STACK_OFFT, %xmm4 #endif movss STACK_ALPHA_R, %xmm0 movss STACK_ALPHA_I, %xmm1 xorps %xmm7, %xmm7 cmpeqps %xmm7, %xmm7 pslld $31, %xmm7 # Generate mask xorps %xmm2, %xmm2 shufps $0, %xmm0, %xmm0 movaps %xmm0, 0 + ALPHA_R movss %xmm1, 4 + ALPHA_I movss %xmm1, 12 + ALPHA_I xorps %xmm7, %xmm1 movss %xmm1, 0 + ALPHA_I movss %xmm1, 8 + ALPHA_I #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) movss %xmm7, 0 + POSINV movss %xmm2, 4 + POSINV movss %xmm7, 8 + POSINV movss %xmm2, 12 + POSINV #else movss %xmm2, 0 + POSINV movss %xmm7, 4 + POSINV movss %xmm2, 8 + POSINV movss %xmm7, 12 + POSINV #endif EMMS movl %ebx, C movl STACK_LDC, LDC #ifdef TRMMKERNEL movss %xmm4, OFFSET movss %xmm4, KK #ifndef LEFT negl KK #endif #endif sall $ZBASE_SHIFT, LDC movl %eax, J # j = n sarl $1, J jle .L100 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ leal BUFFER, %ecx movaps POSINV, %xmm7 movl K, %eax sarl $1, %eax jle .L03 ALIGN_4 .L02: movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 movss 2 * SIZE(B), %xmm2 movss 3 * SIZE(B), %xmm3 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 #if 
defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else xorps %xmm7, %xmm0 xorps %xmm7, %xmm2 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movss 4 * SIZE(B), %xmm0 movss 5 * SIZE(B), %xmm1 movss 6 * SIZE(B), %xmm2 movss 7 * SIZE(B), %xmm3 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else xorps %xmm7, %xmm0 xorps %xmm7, %xmm2 #endif movaps %xmm0, 16 * SIZE(BB) movaps %xmm1, 20 * SIZE(BB) movaps %xmm2, 24 * SIZE(BB) movaps %xmm3, 28 * SIZE(BB) #ifdef PENTIUM4 prefetcht1 104 * SIZE(BB) #endif addl $ 8 * SIZE, %edi addl $32 * SIZE, %ecx decl %eax jne .L02 ALIGN_4 .L03: movl K, %eax andl $1, %eax BRANCH jle .L05 ALIGN_4 .L04: movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 movss 2 * SIZE(B), %xmm2 movss 3 * SIZE(B), %xmm3 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else xorps %xmm7, %xmm0 xorps %xmm7, %xmm2 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) addl $ 4 * SIZE, %edi ALIGN_4 .L05: movl C, %esi movl A, %edx movl M, %ebx sarl $1, %ebx jle .L30 ALIGN_4 .L10: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB #endif movaps 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movaps 16 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movaps 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 #if defined(OPTERON) || defined(BARCELONA) prefetchw 4 * SIZE(%esi) prefetchw 4 * SIZE(%esi, LDC) #endif #ifdef PENTIUM4 prefetchnta 4 * SIZE(%esi) prefetchnta 4 * SIZE(%esi, LDC) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif #if 1 andl $-8, %eax sall $4, %eax je .L15 .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) cmpl $128 * 1, %eax jle .L12 KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) cmpl $128 * 2, %eax jle .L12 KERNEL1(32 * 2) KERNEL2(32 * 2) KERNEL3(32 * 2) KERNEL4(32 * 2) KERNEL5(32 * 2) KERNEL6(32 * 2) KERNEL7(32 * 2) KERNEL8(32 * 2) cmpl $128 * 3, %eax jle .L12 KERNEL1(32 * 3) KERNEL2(32 * 3) KERNEL3(32 * 3) KERNEL4(32 * 3) KERNEL5(32 * 3) KERNEL6(32 * 3) KERNEL7(32 * 3) KERNEL8(32 * 3) cmpl $128 * 4, %eax jle .L12 KERNEL1(32 * 4) KERNEL2(32 * 4) KERNEL3(32 * 4) KERNEL4(32 * 4) KERNEL5(32 * 4) KERNEL6(32 * 4) KERNEL7(32 * 4) KERNEL8(32 * 4) cmpl $128 * 5, %eax jle .L12 KERNEL1(32 * 5) KERNEL2(32 * 5) KERNEL3(32 * 5) KERNEL4(32 * 5) KERNEL5(32 * 5) KERNEL6(32 * 5) KERNEL7(32 * 5) KERNEL8(32 * 
5) cmpl $128 * 6, %eax jle .L12 KERNEL1(32 * 6) KERNEL2(32 * 6) KERNEL3(32 * 6) KERNEL4(32 * 6) KERNEL5(32 * 6) KERNEL6(32 * 6) KERNEL7(32 * 6) KERNEL8(32 * 6) cmpl $128 * 7, %eax jle .L12 KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) addl $128 * 8 * SIZE, BB addl $128 * 2 * SIZE, AA subl $128 * 8, %eax jg .L1X jmp .L15 .L12: leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB ALIGN_4 #else sarl $3, %eax je .L15 ALIGN_4 .L11: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) addl $ 32 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L11 ALIGN_4 #endif .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA_R, %xmm1 movaps ALPHA_I, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L14 ALIGN_4 .L13: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 4 * SIZE(AA), %xmm0 addl $ 4 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L13 ALIGN_4 .L14: shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm5, %xmm4 subps %xmm7, %xmm6 #else addps %xmm5, %xmm4 addps %xmm7, %xmm6 #endif movaps %xmm4, %xmm5 movaps %xmm6, %xmm7 shufps $0xb1, %xmm4, %xmm4 shufps $0xb1, %xmm6, %xmm6 mulps %xmm1, %xmm5 mulps %xmm3, %xmm4 mulps %xmm1, %xmm7 mulps %xmm3, %xmm6 addps %xmm5, %xmm4 addps %xmm7, %xmm6 #ifndef TRMMKERNEL shufps $0xe4, %xmm0, %xmm0 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 shufps $0xe4, %xmm2, %xmm2 movsd 0 * SIZE(%esi, LDC), %xmm2 movhps 2 * SIZE(%esi, LDC), %xmm2 addps %xmm0, %xmm4 addps %xmm2, %xmm6 #endif movlps %xmm4, 0 * SIZE(%esi) movhps %xmm4, 2 * SIZE(%esi) movlps %xmm6, 0 * SIZE(%esi, LDC) movhps %xmm6, 2 * SIZE(%esi, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $4 * SIZE, %esi # coffset += 4 decl %ebx # i -- jg .L10 ALIGN_4 .L30: movl M, %ebx andl $1, %ebx jle .L99 ALIGN_4 .L40: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movaps 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L42 ALIGN_4 .L41: mulps %xmm0, %xmm2 prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), 
%xmm0 addps %xmm2, %xmm6 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movsd 2 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 mulps 28 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 48 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movsd 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 44 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 64 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movsd 6 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 mulps 60 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 80 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 #if defined(OPTERON) || defined(BARCELONA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 movaps 68 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movaps 72 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 mulps 76 * SIZE(BB), %xmm1 addps %xmm2, %xmm6 movaps 96 * SIZE(BB), %xmm2 addps %xmm1, %xmm7 movsd 10 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 84 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 88 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 mulps 92 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 112 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movsd 12 * SIZE(AA), %xmm1 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 100 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movaps 104 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 mulps 108 * SIZE(BB), %xmm1 addps %xmm2, %xmm6 movaps 128 * SIZE(BB), %xmm2 addps %xmm1, %xmm7 movsd 14 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 116 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 120 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 mulps 124 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 144 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movsd 24 * SIZE(AA), %xmm1 addl $ 16 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L41 ALIGN_4 .L42: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA_R, %xmm1 movaps ALPHA_I, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L44 ALIGN_4 .L43: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movsd 2 * SIZE(AA), %xmm0 addl $ 2 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L43 ALIGN_4 .L44: shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm5, %xmm4 subps %xmm7, %xmm6 #else addps %xmm5, %xmm4 addps %xmm7, %xmm6 #endif movaps %xmm4, %xmm5 movaps %xmm6, %xmm7 shufps $0xb1, %xmm4, %xmm4 shufps $0xb1, %xmm6, %xmm6 mulps %xmm1, %xmm5 mulps %xmm3, %xmm4 mulps %xmm1, %xmm7 mulps %xmm3, %xmm6 addps %xmm5, %xmm4 addps %xmm7, %xmm6 #ifndef TRMMKERNEL shufps $0xe4, %xmm4, %xmm4 shufps $0xe4, %xmm6, %xmm6 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(%esi), %xmm0 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 0 * SIZE(%esi, LDC), %xmm2 addps %xmm0, %xmm4 addps %xmm2, %xmm6 #endif movlps %xmm4, 0 * SIZE(%esi) movlps %xmm6, 0 * SIZE(%esi, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (LDC, LDC), %eax addl %eax, C # c += 2 * ldc decl J # j -- jg .L01 ALIGN_4 .L100: movl N, %eax andl $1, %eax jle .L999 ALIGN_4 .L101: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ leal BUFFER, %ecx movaps POSINV, %xmm7 movl K, %eax sarl $2, %eax jle .L103 ALIGN_4 .L102: movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 movss 2 * SIZE(B), %xmm2 movss 3 * SIZE(B), %xmm3 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else xorps %xmm7, %xmm0 xorps %xmm7, %xmm2 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movss 4 * SIZE(B), %xmm0 movss 5 * SIZE(B), %xmm1 movss 6 * SIZE(B), %xmm2 movss 7 * SIZE(B), %xmm3 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else xorps %xmm7, %xmm0 xorps %xmm7, %xmm2 #endif movaps %xmm0, 16 * SIZE(BB) movaps %xmm1, 20 * SIZE(BB) movaps %xmm2, 24 * SIZE(BB) movaps %xmm3, 28 * SIZE(BB) prefetcht0 104 * SIZE(B) addl $ 8 * SIZE, B addl $32 * SIZE, BB decl %eax jne .L102 ALIGN_4 .L103: movl K, %eax andl $3, %eax BRANCH jle .L105 ALIGN_4 .L104: movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 #else xorps %xmm7, %xmm0 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) addl $ 2 * SIZE, %edi addl $ 8 * SIZE, %ecx decl %eax jne .L104 ALIGN_4 .L105: movl C, %esi movl A, AA movl M, %ebx sarl $1, %ebx jle .L130 ALIGN_4 .L110: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movaps 0 * SIZE(AA), %xmm0 movaps 16 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 #if defined(OPTERON) || defined(BARCELONA) prefetchw 4 * SIZE(%esi) #endif #ifdef PENTIUM4 prefetchnta 4 * SIZE(%esi) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L112 ALIGN_4 .L111: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 8 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps 
%xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movaps 12 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movaps 32 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movaps 20 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movaps 24 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 28 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 48 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 addl $ 32 * SIZE, AA addl $ 64 * SIZE, BB decl %eax jne .L111 ALIGN_4 .L112: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA_R, %xmm1 movaps ALPHA_I, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L114 ALIGN_4 .L113: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 addl $ 4 * SIZE, AA addl $ 8 * SIZE, BB decl %eax jg .L113 ALIGN_4 .L114: addps %xmm6, %xmm4 addps %xmm7, %xmm5 shufps $0xb1, %xmm5, %xmm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm5, %xmm4 #else addps %xmm5, %xmm4 #endif movaps %xmm4, %xmm5 shufps $0xb1, %xmm4, %xmm4 mulps %xmm1, %xmm5 mulps %xmm3, %xmm4 addps %xmm5, %xmm4 #ifndef TRMMKERNEL shufps $0xe4, %xmm4, %xmm4 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 addps %xmm0, %xmm4 #endif movlps %xmm4, 0 * SIZE(%esi) movhps %xmm4, 2 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $4 * SIZE, %esi # coffset += 4 decl %ebx # i -- jg .L110 ALIGN_4 .L130: movl M, %ebx andl $1, %ebx jle .L999 ALIGN_4 .L140: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movaps 8 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movaps 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L142 ALIGN_4 .L141: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 6 * SIZE(AA), %xmm0 
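/* annotation (added note, not in the original source): this is still inside the 8x-unrolled k-loop at .L141 of zgemm_kernel_2x2_sse.S, the tail case taken when both M and N are odd (a single 1x1 complex block). Each unrolled step loads one single-precision complex element of A with movsd and four broadcast-expanded B values from the BUFFER with movaps, accumulating partial products into xmm4..xmm7, which are combined and scaled by ALPHA at .L144. */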
addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 addl $ 16 * SIZE, AA addl $ 64 * SIZE, BB decl %eax jne .L141 ALIGN_4 .L142: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA_R, %xmm1 movaps ALPHA_I, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L144 ALIGN_4 .L143: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movsd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L143 ALIGN_4 .L144: addps %xmm6, %xmm4 addps %xmm7, %xmm5 shufps $0xb1, %xmm5, %xmm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm5, %xmm4 #else addps %xmm5, %xmm4 #endif movaps %xmm4, %xmm5 shufps $0xb1, %xmm4, %xmm4 mulps %xmm1, %xmm5 mulps %xmm3, %xmm4 addps %xmm5, %xmm4 #ifndef TRMMKERNEL shufps $0xe4, %xmm4, %xmm4 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(%esi), %xmm0 addps %xmm0, %xmm4 #endif movlps %xmm4, 0 * SIZE(%esi) ALIGN_4 .L999: EMMS movl OLD_STACK, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm_kernel_2x2_sse3.S000066400000000000000000000704231313527062700213120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA_R 16 + STACK + ARGS(%esi) #define STACK_ALPHA_I 20 + STACK + ARGS(%esi) #define STACK_A 24 + STACK + ARGS(%esi) #define STACK_B 28 + STACK + ARGS(%esi) #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) #define POSINV 0(%esp) #define ALPHA_R 16(%esp) #define ALPHA_I 32(%esp) #define K 48(%esp) #define N 52(%esp) #define M 56(%esp) #define A 60(%esp) #define C 64(%esp) #define J 68(%esp) #define OLD_STACK 72(%esp) #define OFFSET 76(%esp) #define KK 80(%esp) #define KKK 84(%esp) #define BUFFER 128(%esp) #define B %edi #define LDC %ebp #define AA %edx #define BB %ecx #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHSIZE 168 #endif #ifdef PENTIUMM #define PREFETCH prefetcht0 #define PREFETCHSIZE 168 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADDSUB addps #else #define ADDSUB subps #endif #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (address) * SIZE(AA); \ addps %xmm2, %xmm4; \ movshdup 0 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ ADDSUB %xmm2, %xmm5; \ movsldup 4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm6; \ movshdup 4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ movaps 4 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ ADDSUB %xmm2, %xmm7; \ movsldup 8 * SIZE + 2 * (address) * SIZE(BB), %xmm2 #define KERNEL2(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movshdup 8 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ ADDSUB %xmm2, %xmm5; \ movsldup 12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm6; \ movshdup 12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ movaps 8 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ ADDSUB %xmm2, %xmm7; \ movsldup 32 * SIZE + 2 * (address) * SIZE(BB), %xmm2 #define KERNEL3(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movshdup 16 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ ADDSUB %xmm3, %xmm5; \ movsldup 20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm6; \ movshdup 20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ movaps 12 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ ADDSUB %xmm3, %xmm7; \ movsldup 24 
* SIZE + 2 * (address) * SIZE(BB), %xmm3 #define KERNEL4(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movshdup 24 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ ADDSUB %xmm3, %xmm5; \ movsldup 28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm6; \ movshdup 28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ movaps 32 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ ADDSUB %xmm3, %xmm7; \ movsldup 48 * SIZE + 2 * (address) * SIZE(BB), %xmm3 #define KERNEL5(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movshdup 32 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ ADDSUB %xmm2, %xmm5; \ movsldup 36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm6; \ movshdup 36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ movaps 20 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ ADDSUB %xmm2, %xmm7; \ movsldup 40 * SIZE + 2 * (address) * SIZE(BB), %xmm2 #define KERNEL6(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movshdup 40 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ ADDSUB %xmm2, %xmm5; \ movsldup 44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm6; \ movshdup 44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ movaps 24 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ ADDSUB %xmm2, %xmm7; \ movsldup 64 * SIZE + 2 * (address) * SIZE(BB), %xmm2 #define KERNEL7(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movshdup 48 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ ADDSUB %xmm3, %xmm5; \ movsldup 52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm6; \ movshdup 52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ movaps 28 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ ADDSUB %xmm3, %xmm7; \ movsldup 56 * SIZE + 2 * (address) * SIZE(BB), %xmm3 #define KERNEL8(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movshdup 56 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ ADDSUB %xmm3, %xmm5; \ movsldup 60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm6; \ movshdup 60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ movaps 48 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ ADDSUB %xmm3, %xmm7; \ movsldup 80 * SIZE + 2 * (address) * SIZE(BB), %xmm3 PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE, %esp andl $-1024, %esp # align stack STACK_TOUCHING movl STACK_M, %ebx movl STACK_N, %eax movl STACK_K, %ecx movl STACK_A, %edx movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movl STACK_B, %edi movl STACK_C, %ebx #ifdef TRMMKERNEL movss STACK_OFFT, %xmm4 #endif movss STACK_ALPHA_R, %xmm0 movss STACK_ALPHA_I, %xmm1 pxor %xmm7, %xmm7 cmpeqps %xmm7, %xmm7 pslld $31, %xmm7 # Generate mask shufps $0, %xmm0, %xmm0 movaps %xmm0, 0 + ALPHA_R movss %xmm1, 4 + ALPHA_I movss %xmm1, 12 + ALPHA_I xorps %xmm7, %xmm1 movss %xmm1, 0 + ALPHA_I movss %xmm1, 8 + ALPHA_I movl %ebx, C movl STACK_LDC, LDC #ifdef TRMMKERNEL movss %xmm4, OFFSET movss %xmm4, KK #ifndef LEFT negl KK #endif #endif sall $ZBASE_SHIFT, LDC movl %eax, J # j = n sarl $1, J jle .L100 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ leal BUFFER, %ecx movl K, %eax sarl $2, %eax jle .L03 ALIGN_4 .L02: movddup 0 * SIZE(B), %xmm0 
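/* Packing loop (.L02): each movddup here appears to broadcast one complex (real, imag) pair of B into a full SSE register, and movaps then stores it to the aligned BUFFER, presumably so the 2x2 micro-kernels below can feed mulps from B without per-iteration shuffles. */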
movddup 2 * SIZE(B), %xmm1 movddup 4 * SIZE(B), %xmm2 movddup 6 * SIZE(B), %xmm3 movddup 8 * SIZE(B), %xmm4 movddup 10 * SIZE(B), %xmm5 movddup 12 * SIZE(B), %xmm6 movddup 14 * SIZE(B), %xmm7 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) # prefetcht1 128 * SIZE(%ecx) prefetcht0 112 * SIZE(%edi) addl $16 * SIZE, B addl $32 * SIZE, BB decl %eax jne .L02 ALIGN_4 .L03: movl K, %eax andl $3, %eax BRANCH jle .L05 ALIGN_4 .L04: movddup 0 * SIZE(B), %xmm0 movddup 2 * SIZE(B), %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) addl $4 * SIZE, B addl $8 * SIZE, BB decl %eax jne .L04 ALIGN_4 .L05: movl C, %esi movl A, %edx movl M, %ebx sarl $1, %ebx jle .L30 ALIGN_4 .L10: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif movaps 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps 16 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movsldup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movsldup 16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 prefetchnta 4 * SIZE(%esi) prefetchnta 4 * SIZE(%esi, LDC) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $2, %eax #endif movl %eax, KKK #endif #if 1 andl $-8, %eax sall $4, %eax je .L15 .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) cmpl $128 * 1, %eax jle .L12 KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) cmpl $128 * 2, %eax jle .L12 KERNEL1(32 * 2) KERNEL2(32 * 2) KERNEL3(32 * 2) KERNEL4(32 * 2) KERNEL5(32 * 2) KERNEL6(32 * 2) KERNEL7(32 * 2) KERNEL8(32 * 2) cmpl $128 * 3, %eax jle .L12 KERNEL1(32 * 3) KERNEL2(32 * 3) KERNEL3(32 * 3) KERNEL4(32 * 3) KERNEL5(32 * 3) KERNEL6(32 * 3) KERNEL7(32 * 3) KERNEL8(32 * 3) cmpl $128 * 4, %eax jle .L12 KERNEL1(32 * 4) KERNEL2(32 * 4) KERNEL3(32 * 4) KERNEL4(32 * 4) KERNEL5(32 * 4) KERNEL6(32 * 4) KERNEL7(32 * 4) KERNEL8(32 * 4) cmpl $128 * 5, %eax jle .L12 KERNEL1(32 * 5) KERNEL2(32 * 5) KERNEL3(32 * 5) KERNEL4(32 * 5) KERNEL5(32 * 5) KERNEL6(32 * 5) KERNEL7(32 * 5) KERNEL8(32 * 5) cmpl $128 * 6, %eax jle .L12 KERNEL1(32 * 6) KERNEL2(32 * 6) KERNEL3(32 * 6) KERNEL4(32 * 6) KERNEL5(32 * 6) KERNEL6(32 * 6) KERNEL7(32 * 6) KERNEL8(32 * 6) cmpl $128 * 7, %eax jle .L12 KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) #if 1 cmpl $128 * 8, %eax jle .L12 KERNEL1(32 * 8) KERNEL2(32 * 8) KERNEL3(32 * 8) KERNEL4(32 * 8) KERNEL5(32 * 8) KERNEL6(32 * 8) KERNEL7(32 * 8) KERNEL8(32 * 8) cmpl $128 * 9, %eax jle .L12 KERNEL1(32 * 9) KERNEL2(32 * 9) KERNEL3(32 * 9) KERNEL4(32 * 9) KERNEL5(32 * 9) KERNEL6(32 * 9) KERNEL7(32 * 9) KERNEL8(32 * 9) cmpl $128 * 10, %eax jle .L12 KERNEL1(32 * 10) KERNEL2(32 * 10) KERNEL3(32 * 10) KERNEL4(32 * 10) KERNEL5(32 * 10) KERNEL6(32 * 10) KERNEL7(32 * 10) KERNEL8(32 * 10) cmpl $128 * 11, %eax jle .L12 KERNEL1(32 * 11) KERNEL2(32 * 11) KERNEL3(32 * 11) KERNEL4(32 * 11) KERNEL5(32 * 11) 
KERNEL6(32 * 11) KERNEL7(32 * 11) KERNEL8(32 * 11) cmpl $128 * 12, %eax jle .L12 KERNEL1(32 * 12) KERNEL2(32 * 12) KERNEL3(32 * 12) KERNEL4(32 * 12) KERNEL5(32 * 12) KERNEL6(32 * 12) KERNEL7(32 * 12) KERNEL8(32 * 12) cmpl $128 * 13, %eax jle .L12 KERNEL1(32 * 13) KERNEL2(32 * 13) KERNEL3(32 * 13) KERNEL4(32 * 13) KERNEL5(32 * 13) KERNEL6(32 * 13) KERNEL7(32 * 13) KERNEL8(32 * 13) cmpl $128 * 14, %eax jle .L12 KERNEL1(32 * 14) KERNEL2(32 * 14) KERNEL3(32 * 14) KERNEL4(32 * 14) KERNEL5(32 * 14) KERNEL6(32 * 14) KERNEL7(32 * 14) KERNEL8(32 * 14) cmpl $128 * 15, %eax jle .L12 KERNEL1(32 * 15) KERNEL2(32 * 15) KERNEL3(32 * 15) KERNEL4(32 * 15) KERNEL5(32 * 15) KERNEL6(32 * 15) KERNEL7(32 * 15) KERNEL8(32 * 15) #else addl $128 * 4 * SIZE, BB addl $128 * 2 * SIZE, AA subl $128 * 8, %eax jg .L1X jmp .L15 #endif .L12: leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB ALIGN_4 #else sarl $3, %eax je .L15 ALIGN_4 .L11: KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) addl $32 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L11 ALIGN_4 #endif .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA_R, %xmm1 movaps ALPHA_I, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L14 ALIGN_4 .L13: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movshdup 0 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 ADDSUB %xmm2, %xmm5 movsldup 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movshdup 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm7 movsldup 8 * SIZE(BB), %xmm2 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L13 ALIGN_4 .L14: #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 addsubps %xmm5, %xmm4 addsubps %xmm7, %xmm6 movaps %xmm4, %xmm5 movaps %xmm6, %xmm7 shufps $0xb1, %xmm4, %xmm4 shufps $0xb1, %xmm6, %xmm6 #else shufps $0xb1, %xmm4, %xmm4 shufps $0xb1, %xmm6, %xmm6 addsubps %xmm4, %xmm5 addsubps %xmm6, %xmm7 movaps %xmm5, %xmm4 movaps %xmm7, %xmm6 shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 #endif mulps %xmm1, %xmm5 mulps %xmm3, %xmm4 mulps %xmm1, %xmm7 mulps %xmm3, %xmm6 addps %xmm5, %xmm4 addps %xmm7, %xmm6 #ifndef TRMMKERNEL shufps $0xe4, %xmm0, %xmm0 movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 shufps $0xe4, %xmm2, %xmm2 movsd 0 * SIZE(%esi, LDC), %xmm2 movhps 2 * SIZE(%esi, LDC), %xmm2 addps %xmm0, %xmm4 addps %xmm2, %xmm6 #endif movsd %xmm4, 0 * SIZE(%esi) movhps %xmm4, 2 * SIZE(%esi) movsd %xmm6, 0 * SIZE(%esi, LDC) movhps %xmm6, 2 * SIZE(%esi, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $4 * SIZE, %esi # coffset += 4 decl %ebx # i -- jg .L10 ALIGN_4 .L30: movl M, %ebx andl $1, %ebx jle .L99 ALIGN_4 .L40: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movddup 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movsd 0 * SIZE(BB), %xmm2 movsd 16 * SIZE(BB), %xmm3 #ifndef TRMMKERNEL 
movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $2, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L42 ALIGN_4 .L41: shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 movddup 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 12 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 movddup 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 32 * SIZE(BB), %xmm2 shufps $0x50, %xmm3, %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movsd 20 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm0, %xmm3 movddup 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movsd 24 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movsd 28 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm0, %xmm3 movddup 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movsd 48 * SIZE(BB), %xmm3 shufps $0x50, %xmm2, %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movsd 36 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm1, %xmm2 movddup 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movsd 40 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movsd 44 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm1, %xmm2 movddup 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movsd 64 * SIZE(BB), %xmm2 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movsd 52 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 movddup 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 56 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movsd 60 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 movddup 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 80 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L41 ALIGN_4 .L42: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA_R, %xmm1 movaps ALPHA_I, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L44 ALIGN_4 .L43: shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 movddup 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L43 ALIGN_4 .L44: movaps %xmm4, %xmm6 movlhps %xmm5, %xmm4 movhlps %xmm6, %xmm5 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) cmpeqps %xmm7, %xmm7 pslld $31, %xmm7 xorps %xmm7, %xmm5 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm5, %xmm5 addsubps %xmm5, %xmm4 movaps %xmm4, %xmm5 shufps $0xb1, %xmm4, %xmm4 #else shufps $0xb1, %xmm4, %xmm4 addsubps %xmm4, %xmm5 movaps %xmm5, %xmm4 shufps $0xb1, %xmm5, %xmm5 #endif mulps %xmm1, %xmm5 mulps %xmm3, %xmm4 addps %xmm5, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhps 0 * SIZE(%esi, LDC), %xmm0 addps %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(%esi) movhps %xmm4, 0 * SIZE(%esi, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif #if 
defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leal (LDC, LDC), %eax addl %eax, C # c += 2 * ldc decl J # j -- jg .L01 ALIGN_4 .L100: movl N, %eax andl $1, %eax jle .L999 ALIGN_4 .L101: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif /* Copying to Sub Buffer */ leal BUFFER, %ecx movl K, %eax sarl $3, %eax jle .L103 ALIGN_4 .L102: movddup 0 * SIZE(B), %xmm0 movddup 2 * SIZE(B), %xmm1 movddup 4 * SIZE(B), %xmm2 movddup 6 * SIZE(B), %xmm3 movddup 8 * SIZE(B), %xmm4 movddup 10 * SIZE(B), %xmm5 movddup 12 * SIZE(B), %xmm6 movddup 14 * SIZE(B), %xmm7 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) prefetcht0 104 * SIZE(B) addl $16 * SIZE, B addl $32 * SIZE, BB decl %eax jne .L102 ALIGN_4 .L103: movl K, %eax andl $7, %eax BRANCH jle .L105 ALIGN_4 .L104: movddup 0 * SIZE(B), %xmm0 movaps %xmm0, 0 * SIZE(BB) addl $ 2 * SIZE, %edi addl $ 4 * SIZE, %ecx decl %eax jne .L104 ALIGN_4 .L105: movl C, %esi movl A, AA movl M, %ebx sarl $1, %ebx jle .L130 ALIGN_4 .L110: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif movaps 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps 16 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movsldup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movsldup 16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifdef PENTIUM4 prefetchnta 4 * SIZE(%esi) #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L112 ALIGN_4 .L111: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) movshdup 0 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm5 movsldup 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movshdup 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 8 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm5 movsldup 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movshdup 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 12 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm5 movsldup 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movshdup 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 32 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm5 movsldup 32 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movshdup 16 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 20 * SIZE(AA), %xmm1 ADDSUB %xmm3, %xmm5 movsldup 20 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movshdup 20 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 24 * SIZE(AA), %xmm1 ADDSUB %xmm3, %xmm5 movsldup 24 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movshdup 24 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 28 * SIZE(AA), %xmm1 ADDSUB %xmm3, %xmm5 movsldup 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movshdup 28 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 48 * SIZE(AA), %xmm1 ADDSUB %xmm3, %xmm5 movsldup 48 * SIZE(BB), %xmm3 addl $32 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L111 ALIGN_4 .L112: #ifndef TRMMKERNEL movl 
K, %eax #else movl KKK, %eax #endif movaps ALPHA_R, %xmm1 movaps ALPHA_I, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L114 ALIGN_4 .L113: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movshdup 0 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm5 movsldup 4 * SIZE(BB), %xmm2 addl $ 4 * SIZE, AA addl $ 4 * SIZE, BB decl %eax jg .L113 ALIGN_4 .L114: #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm5, %xmm5 addsubps %xmm5, %xmm4 movaps %xmm4, %xmm5 shufps $0xb1, %xmm4, %xmm4 #else shufps $0xb1, %xmm4, %xmm4 addsubps %xmm4, %xmm5 movaps %xmm5, %xmm4 shufps $0xb1, %xmm5, %xmm5 #endif mulps %xmm1, %xmm5 mulps %xmm3, %xmm4 addps %xmm5, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 addps %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(%esi) movhps %xmm4, 2 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $4 * SIZE, %esi # coffset += 4 decl %ebx # i -- jg .L110 ALIGN_4 .L130: movl M, %ebx andl $1, %ebx jle .L999 ALIGN_4 .L140: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB # boffset1 = boffset #else leal BUFFER, BB # boffset1 = boffset movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movddup 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movsd 0 * SIZE(BB), %xmm2 movsd 16 * SIZE(BB), %xmm3 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $1, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L142 ALIGN_4 .L141: shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) movddup 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 movddup 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 8 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 movddup 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 12 * SIZE(BB), %xmm2 shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 movddup 16 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movsd 32 * SIZE(BB), %xmm2 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 movddup 10 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movsd 20 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 movddup 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 24 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 movddup 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm4 movsd 28 * SIZE(BB), %xmm3 shufps $0x50, %xmm3, %xmm3 mulps %xmm1, %xmm3 movddup 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movsd 48 * SIZE(BB), %xmm3 addl $ 16 * SIZE, AA addl $ 32 * SIZE, BB decl %eax jne .L141 ALIGN_4 .L142: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA_R, %xmm1 movaps ALPHA_I, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L144 ALIGN_4 .L143: shufps $0x50, %xmm2, %xmm2 mulps %xmm0, %xmm2 movddup 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 movsd 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L143 ALIGN_4 .L144: addps %xmm5, %xmm4 movhlps 
%xmm4, %xmm5 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) cmpeqps %xmm7, %xmm7 pslld $31, %xmm7 xorps %xmm7, %xmm5 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm5, %xmm5 addsubps %xmm5, %xmm4 movaps %xmm4, %xmm5 shufps $0xb1, %xmm4, %xmm4 #else shufps $0xb1, %xmm4, %xmm4 addsubps %xmm4, %xmm5 movaps %xmm5, %xmm4 shufps $0xb1, %xmm5, %xmm5 #endif mulps %xmm1, %xmm5 mulps %xmm3, %xmm4 addps %xmm5, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 addps %xmm0, %xmm4 #endif movsd %xmm4, 0 * SIZE(%esi) ALIGN_4 .L999: movl OLD_STACK, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm_kernel_4x1_core2.S000066400000000000000000000450431313527062700214500ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if !defined(HAVE_SSE) || !defined(HAVE_MMX) #error You have to check your configuration. 
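/* This 4x1 Core2 kernel uses both SSE (xmm) and MMX (mm registers, EMMS) instructions, so the guard above aborts the build when HAVE_SSE or HAVE_MMX is not defined. */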
#endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA_R 16 + STACK + ARGS(%esi) #define STACK_ALPHA_I 20 + STACK + ARGS(%esi) #define STACK_A 24 + STACK + ARGS(%esi) #define STACK_B 28 + STACK + ARGS(%esi) #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) #define ALPHA_R 16(%esp) #define ALPHA_I 32(%esp) #define K 48(%esp) #define N 52(%esp) #define M 56(%esp) #define A 60(%esp) #define C 64(%esp) #define J 68(%esp) #define OLD_STACK 72(%esp) #define TEMP 76(%esp) #define OFFSET 80(%esp) #define KK 84(%esp) #define KKK 88(%esp) #define BUFFER 128(%esp) #define B %edi #define LDC %ebp #define C1 %esi #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #define PREFETCH_R (8 * 16 + 0) #define PREFETCH_W (PREFETCH_R * 2) #define PREFETCHSIZE (8 * 16 + 4) #define PREFETCH prefetcht0 #define AA %edx #define BB %ecx #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADDSUB addps #else #define ADDSUB subps #endif PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp # align stack addl $STACK_OFFSET, %esp STACK_TOUCHING movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 movd STACK_A, %mm2 movl STACK_B, B movd STACK_C, %mm3 movl STACK_LDC, LDC #ifdef TRMMKERNEL movd STACK_OFFT, %mm4 #endif movd %mm1, K movd %mm0, M movl %eax, N movd %mm2, A movd %mm3, C movl %esi, OLD_STACK #ifdef TRMMKERNEL movd %mm4, OFFSET movd %mm4, KK #ifndef LEFT negl KK #endif #endif subl $-32 * SIZE, A subl $-32 * SIZE, B leal (, LDC, SIZE * 2), LDC movss STACK_ALPHA_R, %xmm0 movss STACK_ALPHA_I, %xmm1 pcmpeqb %xmm7, %xmm7 pslld $31, %xmm7 # Generate mask shufps $0, %xmm0, %xmm0 movaps %xmm0, 0 + ALPHA_R movss %xmm1, 4 + ALPHA_I movss %xmm1, 12 + ALPHA_I xorps %xmm7, %xmm1 movss %xmm1, 0 + ALPHA_I movss %xmm1, 8 + ALPHA_I movl %eax, J # j = n testl %eax, %eax jle .L999 .L01: leal 32 * SIZE + BUFFER, BB #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif movl K, %eax sarl $2, %eax jle .L03 .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) movss -32 * SIZE(B), %xmm0 movss -31 * SIZE(B), %xmm1 movss -30 * SIZE(B), %xmm2 movss -29 * SIZE(B), %xmm3 movss -28 * SIZE(B), %xmm4 movss -27 * SIZE(B), %xmm5 movss -26 * SIZE(B), %xmm6 movss -25 * SIZE(B), %xmm7 prefetcht0 (PREFETCH_W + 0) * SIZE(BB) shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 shufps $0, %xmm4, %xmm4 shufps $0, %xmm5, %xmm5 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 prefetcht0 (PREFETCH_W + 16) * SIZE(BB) movaps %xmm0, -32 * SIZE(BB) movaps %xmm1, -28 * SIZE(BB) movaps %xmm2, -24 * SIZE(BB) movaps %xmm3, -20 * SIZE(BB) movaps %xmm4, -16 * SIZE(BB) movaps %xmm5, -12 * SIZE(BB) movaps %xmm6, -8 * SIZE(BB) movaps %xmm7, -4 * SIZE(BB) addl $ 8 * SIZE, B subl $-32 * SIZE, BB decl %eax jne .L02 .L03: movl K, %eax andl $3, %eax BRANCH jle .L05 .L04: movss -32 * SIZE(B), %xmm0 movss -31 * SIZE(B), %xmm1 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 movaps %xmm0, -32 * SIZE(BB) movaps %xmm1, -28 * SIZE(BB) addl $2 * SIZE, B addl $8 * SIZE, BB decl %eax jne .L04 ALIGN_4 .L05: movl C, C1 # coffset = c movl A, AA # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L20 ALIGN_4 .L10: #if 
!defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 32 * SIZE + BUFFER, BB #else leal 32 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB /* because it's doubled */ #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -32 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -16 * SIZE(AA), %xmm3 pxor %xmm6, %xmm6 prefetcht0 7 * SIZE(C1) pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L15 ALIGN_4 .L12: movaps %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm1, %xmm0 ADDSUB %xmm0, %xmm5 movaps -28 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps %xmm0, %xmm1 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 ADDSUB %xmm1, %xmm7 movaps -24 * SIZE(BB), %xmm1 movaps %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm1, %xmm4 movaps -20 * SIZE(BB), %xmm1 mulps %xmm1, %xmm0 ADDSUB %xmm0, %xmm5 movaps -20 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps %xmm0, %xmm1 movaps 0 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 ADDSUB %xmm1, %xmm7 movaps -16 * SIZE(BB), %xmm1 movaps %xmm1, %xmm2 mulps %xmm3, %xmm1 addps %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 mulps %xmm1, %xmm3 ADDSUB %xmm3, %xmm5 movaps -12 * SIZE(AA), %xmm3 mulps %xmm3, %xmm2 mulps %xmm3, %xmm1 movaps -8 * SIZE(AA), %xmm3 addps %xmm2, %xmm6 ADDSUB %xmm1, %xmm7 movaps -8 * SIZE(BB), %xmm1 movaps %xmm1, %xmm2 mulps %xmm3, %xmm1 addps %xmm1, %xmm4 movaps -4 * SIZE(BB), %xmm1 mulps %xmm1, %xmm3 ADDSUB %xmm3, %xmm5 movaps -4 * SIZE(AA), %xmm3 mulps %xmm3, %xmm2 mulps %xmm3, %xmm1 movaps 16 * SIZE(AA), %xmm3 addps %xmm2, %xmm6 ADDSUB %xmm1, %xmm7 movaps 0 * SIZE(BB), %xmm1 movaps %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm1, %xmm4 movaps 4 * SIZE(BB), %xmm1 mulps %xmm1, %xmm0 ADDSUB %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps %xmm0, %xmm1 movaps 8 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 ADDSUB %xmm1, %xmm7 movaps 8 * SIZE(BB), %xmm1 movaps %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm1, %xmm4 movaps 12 * SIZE(BB), %xmm1 mulps %xmm1, %xmm0 ADDSUB %xmm0, %xmm5 movaps 12 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps %xmm0, %xmm1 movaps 32 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 ADDSUB %xmm1, %xmm7 movaps 16 * SIZE(BB), %xmm1 movaps %xmm1, %xmm2 mulps %xmm3, %xmm1 addps %xmm1, %xmm4 movaps 20 * SIZE(BB), %xmm1 mulps %xmm1, %xmm3 ADDSUB %xmm3, %xmm5 movaps 20 * SIZE(AA), %xmm3 mulps %xmm3, %xmm2 mulps %xmm3, %xmm1 addps %xmm2, %xmm6 movaps 24 * SIZE(AA), %xmm3 ADDSUB %xmm1, %xmm7 movaps 24 * SIZE(BB), %xmm1 movaps %xmm1, %xmm2 mulps %xmm3, %xmm1 addps %xmm1, %xmm4 movaps 28 * SIZE(BB), %xmm1 mulps %xmm1, %xmm3 ADDSUB %xmm3, %xmm5 movaps 28 * SIZE(AA), %xmm3 mulps %xmm3, %xmm2 mulps %xmm3, %xmm1 subl $-64 * SIZE, BB movaps 48 * SIZE(AA), %xmm3 subl $-64 * SIZE, AA addps %xmm2, %xmm6 ADDSUB %xmm1, %xmm7 movaps -32 * SIZE(BB), %xmm1 decl %eax jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L18 .L16: movaps %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm1, %xmm0 ADDSUB %xmm0, %xmm5 movaps -28 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps %xmm0, %xmm1 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 ADDSUB %xmm1, %xmm7 movaps -24 * 
SIZE(BB), %xmm1 addl $8 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L16 .L18: movaps ALPHA_R, %xmm0 movaps ALPHA_I, %xmm1 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 addsubps %xmm5, %xmm4 addsubps %xmm7, %xmm6 movaps %xmm4, %xmm5 movaps %xmm6, %xmm7 shufps $0xb1, %xmm4, %xmm4 shufps $0xb1, %xmm6, %xmm6 #else shufps $0xb1, %xmm4, %xmm4 shufps $0xb1, %xmm6, %xmm6 addsubps %xmm4, %xmm5 addsubps %xmm6, %xmm7 movaps %xmm5, %xmm4 movaps %xmm7, %xmm6 shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 #endif mulps %xmm0, %xmm5 mulps %xmm1, %xmm4 mulps %xmm0, %xmm7 mulps %xmm1, %xmm6 addps %xmm5, %xmm4 addps %xmm7, %xmm6 #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(C1), %xmm2 movhps 2 * SIZE(C1), %xmm2 movsd 4 * SIZE(C1), %xmm3 movhps 6 * SIZE(C1), %xmm3 addps %xmm2, %xmm4 addps %xmm3, %xmm6 #endif movsd %xmm4, 0 * SIZE(C1) movhps %xmm4, 2 * SIZE(C1) movsd %xmm6, 4 * SIZE(C1) movhps %xmm6, 6 * SIZE(C1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $8 * SIZE, C1 decl %ebx jg .L10 ALIGN_2 .L20: movl M, %ebx testl $2, %ebx jle .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 32 * SIZE + BUFFER, BB #else leal 32 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB /* because it's doubled */ #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movaps -32 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movaps -16 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movaps -16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L25 ALIGN_4 .L22: mulps %xmm0, %xmm1 mulps -28 * SIZE(BB), %xmm0 addps %xmm1, %xmm4 movaps -24 * SIZE(BB), %xmm1 ADDSUB %xmm0, %xmm5 movaps -28 * SIZE(AA), %xmm0 mulps %xmm0, %xmm1 mulps -20 * SIZE(BB), %xmm0 addps %xmm1, %xmm6 movaps 0 * SIZE(BB), %xmm1 ADDSUB %xmm0, %xmm7 movaps -24 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps -12 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps -8 * SIZE(BB), %xmm3 ADDSUB %xmm0, %xmm5 movaps -20 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps -4 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 16 * SIZE(BB), %xmm3 ADDSUB %xmm0, %xmm7 movaps 0 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 mulps 4 * SIZE(BB), %xmm2 addps %xmm1, %xmm4 movaps 8 * SIZE(BB), %xmm1 ADDSUB %xmm2, %xmm5 movaps -12 * SIZE(AA), %xmm2 mulps %xmm2, %xmm1 mulps 12 * SIZE(BB), %xmm2 addps %xmm1, %xmm6 movaps 32 * SIZE(BB), %xmm1 ADDSUB %xmm2, %xmm7 movaps -8 * SIZE(AA), %xmm2 mulps %xmm2, %xmm3 mulps 20 * SIZE(BB), %xmm2 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 ADDSUB %xmm2, %xmm5 movaps -4 * SIZE(AA), %xmm2 mulps %xmm2, %xmm3 mulps 28 * SIZE(BB), %xmm2 addps %xmm3, %xmm6 movaps 48 * SIZE(BB), %xmm3 ADDSUB %xmm2, %xmm7 movaps 16 * SIZE(AA), %xmm2 subl $-32 * SIZE, AA addl $ 64 * SIZE, BB decl %eax jne .L22 ALIGN_2 .L25: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L28 
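/* .L26: tail loop covering the remaining (k & 7) iterations of this 2-wide block; .L28 then merges the accumulators and applies ALPHA_R / ALPHA_I before the result is written back to C1. */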
.L26: mulps %xmm0, %xmm1 mulps -28 * SIZE(BB), %xmm0 addps %xmm1, %xmm4 movaps -24 * SIZE(BB), %xmm1 ADDSUB %xmm0, %xmm5 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L26 .L28: addps %xmm6, %xmm4 addps %xmm7, %xmm5 movaps ALPHA_R, %xmm0 movaps ALPHA_I, %xmm1 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm5, %xmm5 addsubps %xmm5, %xmm4 movaps %xmm4, %xmm5 shufps $0xb1, %xmm4, %xmm4 #else shufps $0xb1, %xmm4, %xmm4 addsubps %xmm4, %xmm5 movaps %xmm5, %xmm4 shufps $0xb1, %xmm5, %xmm5 #endif mulps %xmm0, %xmm5 mulps %xmm1, %xmm4 addps %xmm5, %xmm4 #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(C1), %xmm2 movhps 2 * SIZE(C1), %xmm2 addps %xmm2, %xmm4 #endif movsd %xmm4, 0 * SIZE(C1) movhps %xmm4, 2 * SIZE(C1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $4 * SIZE, C1 ALIGN_2 .L30: testl $1, %ebx jle .L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal 32 * SIZE + BUFFER, BB #else leal 32 * SIZE + BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB /* because it's doubled */ #endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movsd -32 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movsd -24 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movsd -16 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax addl $1, %eax movl %eax, KKK #endif sarl $3, %eax je .L35 ALIGN_4 .L32: mulps %xmm0, %xmm1 mulps -28 * SIZE(BB), %xmm0 addps %xmm1, %xmm4 movsd -24 * SIZE(BB), %xmm1 ADDSUB %xmm0, %xmm5 movsd -30 * SIZE(AA), %xmm0 mulps %xmm0, %xmm1 mulps -20 * SIZE(BB), %xmm0 addps %xmm1, %xmm6 movsd 0 * SIZE(BB), %xmm1 ADDSUB %xmm0, %xmm7 movsd -28 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps -12 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movsd -8 * SIZE(BB), %xmm3 ADDSUB %xmm0, %xmm5 movsd -26 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps -4 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movsd 16 * SIZE(BB), %xmm3 ADDSUB %xmm0, %xmm7 movsd -16 * SIZE(AA), %xmm0 mulps %xmm2, %xmm1 mulps 4 * SIZE(BB), %xmm2 addps %xmm1, %xmm4 movsd 8 * SIZE(BB), %xmm1 ADDSUB %xmm2, %xmm5 movsd -22 * SIZE(AA), %xmm2 mulps %xmm2, %xmm1 mulps 12 * SIZE(BB), %xmm2 addps %xmm1, %xmm6 movsd 32 * SIZE(BB), %xmm1 ADDSUB %xmm2, %xmm7 movsd -20 * SIZE(AA), %xmm2 mulps %xmm2, %xmm3 mulps 20 * SIZE(BB), %xmm2 addps %xmm3, %xmm4 movsd 24 * SIZE(BB), %xmm3 ADDSUB %xmm2, %xmm5 movsd -18 * SIZE(AA), %xmm2 mulps %xmm2, %xmm3 mulps 28 * SIZE(BB), %xmm2 addps %xmm3, %xmm6 movsd 48 * SIZE(BB), %xmm3 ADDSUB %xmm2, %xmm7 movsd -8 * SIZE(AA), %xmm2 subl $-16 * SIZE, AA addl $ 64 * SIZE, BB decl %eax jne .L32 ALIGN_2 .L35: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L38 .L36: mulps %xmm0, %xmm1 mulps -28 * SIZE(BB), %xmm0 addps %xmm1, %xmm4 movsd -24 * SIZE(BB), %xmm1 ADDSUB %xmm0, %xmm5 movsd -30 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L36 .L38: addps %xmm6, %xmm4 addps %xmm7, %xmm5 movaps ALPHA_R, %xmm0 movaps ALPHA_I, 
%xmm1 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm5, %xmm5 addsubps %xmm5, %xmm4 movaps %xmm4, %xmm5 shufps $0xb1, %xmm4, %xmm4 #else shufps $0xb1, %xmm4, %xmm4 addsubps %xmm4, %xmm5 movaps %xmm5, %xmm4 shufps $0xb1, %xmm5, %xmm5 #endif mulps %xmm0, %xmm5 mulps %xmm1, %xmm4 addps %xmm5, %xmm4 #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(C1), %xmm2 addps %xmm2, %xmm4 #endif movsd %xmm4, 0 * SIZE(C1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_2 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $1, KK #endif addl LDC, C # c += ldc decl J # j -- jg .L01 ALIGN_2 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm_kernel_4x1_sse.S000066400000000000000000000771641313527062700212410ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if !defined(HAVE_SSE) || !defined(HAVE_MMX) #error You have to check your configuration. 
#endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA_R 16 + STACK + ARGS(%esi) #define STACK_ALPHA_I 20 + STACK + ARGS(%esi) #define STACK_A 24 + STACK + ARGS(%esi) #define STACK_B 28 + STACK + ARGS(%esi) #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) #define POSINV 0(%esp) #define ALPHA_R 16(%esp) #define ALPHA_I 32(%esp) #define K 48(%esp) #define N 52(%esp) #define M 56(%esp) #define A 60(%esp) #define C 64(%esp) #define J 68(%esp) #define OLD_STACK 72(%esp) #define TEMP 76(%esp) #define OFFSET 80(%esp) #define KK 84(%esp) #define KKK 88(%esp) #define BUFFER 128(%esp) #define B %edi #define LDC %ebp #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #define AA %edx #define BB %ecx #if !defined(HAVE_SSE2) || defined(OPTERON) #define movsd movlps #endif #ifdef HAVE_SSE2 #define xorps pxor #endif #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ mulps 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm4; \ movaps 0 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm5; \ movaps 4 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ mulps %xmm0, %xmm2; \ mulps 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 8 * SIZE + (address) * SIZE * 2(AA), %xmm0 #define KERNEL2(address) \ mulps %xmm0, %xmm2; \ mulps 12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm4; \ movaps 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm5; \ movaps 12 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ mulps %xmm0, %xmm2; \ mulps 12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 32 * SIZE + (address) * SIZE * 2(AA), %xmm0 #define KERNEL3(address) \ mulps %xmm1, %xmm3; \ mulps 20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm4; \ movaps 16 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm5; \ movaps 20 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ mulps %xmm1, %xmm3; \ mulps 20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 24 * SIZE + (address) * SIZE * 2(AA), %xmm1 #define KERNEL4(address) \ mulps %xmm1, %xmm3; \ mulps 28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm4; \ movaps 24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm5; \ movaps 28 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ mulps %xmm1, %xmm3; \ mulps 28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 48 * SIZE + (address) * SIZE * 2(AA), %xmm1 #define KERNEL5(address) \ mulps %xmm0, %xmm2; \ mulps 36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm4; \ movaps 32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm5; \ movaps 36 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ mulps %xmm0, %xmm2; \ mulps 36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 40 * SIZE + (address) * SIZE * 2(AA), %xmm0 #define KERNEL6(address) \ mulps %xmm0, %xmm2; \ mulps 44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm4; \ movaps 40 * SIZE + (address) * SIZE * 
2(BB), %xmm2; \ addps %xmm0, %xmm5; \ movaps 44 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ mulps %xmm0, %xmm2; \ mulps 44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 64 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 64 * SIZE + (address) * SIZE * 2(AA), %xmm0 #define KERNEL7(address) \ mulps %xmm1, %xmm3; \ mulps 52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm4; \ movaps 48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm5; \ movaps 52 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ mulps %xmm1, %xmm3; \ mulps 52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 56 * SIZE + (address) * SIZE * 2(AA), %xmm1 #define KERNEL8(address) \ mulps %xmm1, %xmm3; \ mulps 60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm4; \ movaps 56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm5; \ movaps 60 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ mulps %xmm1, %xmm3; \ mulps 60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 80 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 80 * SIZE + (address) * SIZE * 2(AA), %xmm1 PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp # align stack addl $STACK_OFFSET, %esp STACK_TOUCHING movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 movd STACK_A, %mm2 movl STACK_B, B movd STACK_C, %mm3 movl STACK_LDC, LDC #ifdef TRMMKERNEL movd STACK_OFFT, %mm4 #endif movd %mm1, K movd %mm0, M movl %eax, N movd %mm2, A movd %mm3, C movl %esi, OLD_STACK #ifdef TRMMKERNEL movd %mm4, OFFSET movd %mm4, KK #ifndef LEFT negl KK #endif #endif leal (, LDC, SIZE * 2), LDC movss STACK_ALPHA_R, %xmm0 movss STACK_ALPHA_I, %xmm1 #ifdef HAVE_SSE2 pxor %xmm7, %xmm7 cmpeqps %xmm7, %xmm7 pslld $31, %xmm7 # Generate mask #else movl $0x80000000, TEMP movss TEMP, %xmm7 shufps $0, %xmm7, %xmm7 #endif xorps %xmm2, %xmm2 shufps $0, %xmm0, %xmm0 movaps %xmm0, 0 + ALPHA_R movss %xmm1, 4 + ALPHA_I movss %xmm1, 12 + ALPHA_I xorps %xmm7, %xmm1 movss %xmm1, 0 + ALPHA_I movss %xmm1, 8 + ALPHA_I #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) movss %xmm7, 0 + POSINV movss %xmm2, 4 + POSINV movss %xmm7, 8 + POSINV movss %xmm2, 12 + POSINV #else movss %xmm2, 0 + POSINV movss %xmm7, 4 + POSINV movss %xmm2, 8 + POSINV movss %xmm7, 12 + POSINV #endif movl %eax, J # j = n testl %eax, %eax jle .L999 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK #endif leal BUFFER, BB movaps POSINV, %xmm7 movl K, %eax sarl $2, %eax jle .L03 .L02: movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 movss 2 * SIZE(B), %xmm2 movss 3 * SIZE(B), %xmm3 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else xorps %xmm7, %xmm0 xorps %xmm7, %xmm2 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movss 4 * SIZE(B), %xmm0 movss 5 * SIZE(B), %xmm1 movss 6 * SIZE(B), %xmm2 movss 7 * SIZE(B), %xmm3 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 #if 
defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else xorps %xmm7, %xmm0 xorps %xmm7, %xmm2 #endif movaps %xmm0, 16 * SIZE(BB) movaps %xmm1, 20 * SIZE(BB) movaps %xmm2, 24 * SIZE(BB) movaps %xmm3, 28 * SIZE(BB) prefetcht0 104 * SIZE(B) addl $ 8 * SIZE, B addl $32 * SIZE, BB decl %eax jne .L02 .L03: movl K, %eax andl $3, %eax BRANCH jle .L05 .L04: movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 #else xorps %xmm7, %xmm0 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) addl $2 * SIZE, B addl $8 * SIZE, BB decl %eax jne .L04 ALIGN_4 .L05: movl C, %esi # coffset = c movl A, AA # aoffset = a movl M, %ebx sarl $2, %ebx # i = (m >> 2) jle .L50 ALIGN_4 .L10: #ifdef PENTIUM4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif prefetchnta 8 * SIZE(%esi) #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $1, %eax #endif movl %eax, KKK #endif andl $-8, %eax je .L12 sall $3, %eax .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) cmpl $64 * 1, %eax NOBRANCH jle .L11 KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) cmpl $64 * 2, %eax NOBRANCH jle .L11 KERNEL1(32 * 2) KERNEL2(32 * 2) KERNEL3(32 * 2) KERNEL4(32 * 2) KERNEL5(32 * 2) KERNEL6(32 * 2) KERNEL7(32 * 2) KERNEL8(32 * 2) cmpl $64 * 3, %eax NOBRANCH jle .L11 KERNEL1(32 * 3) KERNEL2(32 * 3) KERNEL3(32 * 3) KERNEL4(32 * 3) KERNEL5(32 * 3) KERNEL6(32 * 3) KERNEL7(32 * 3) KERNEL8(32 * 3) cmpl $64 * 4, %eax NOBRANCH jle .L11 KERNEL1(32 * 4) KERNEL2(32 * 4) KERNEL3(32 * 4) KERNEL4(32 * 4) KERNEL5(32 * 4) KERNEL6(32 * 4) KERNEL7(32 * 4) KERNEL8(32 * 4) cmpl $64 * 5, %eax NOBRANCH jle .L11 KERNEL1(32 * 5) KERNEL2(32 * 5) KERNEL3(32 * 5) KERNEL4(32 * 5) KERNEL5(32 * 5) KERNEL6(32 * 5) KERNEL7(32 * 5) KERNEL8(32 * 5) cmpl $64 * 6, %eax NOBRANCH jle .L11 KERNEL1(32 * 6) KERNEL2(32 * 6) KERNEL3(32 * 6) KERNEL4(32 * 6) KERNEL5(32 * 6) KERNEL6(32 * 6) KERNEL7(32 * 6) KERNEL8(32 * 6) cmpl $64 * 7, %eax NOBRANCH jle .L11 KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) addl $128 * 4 * SIZE, AA addl $128 * 4 * SIZE, BB subl $ 64 * 8, %eax BRANCH jg .L1X .L11: leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #else #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) 
&& !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $4, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax prefetcht0 8 * SIZE(%esi) je .L12 ALIGN_4 #define PREFETCHSIZE 48 .L11: #ifdef CORE_KATMAI prefetcht0 PREFETCHSIZE * SIZE(AA) #endif mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 0 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 16 * SIZE(AA), %xmm0 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulps %xmm1, %xmm3 mulps 12 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 8 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 12 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 12 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 24 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 24 * SIZE(AA), %xmm1 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm0, %xmm2 mulps 20 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 20 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 20 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 24) * SIZE(AA) #endif mulps %xmm1, %xmm3 mulps 28 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 28 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 40 * SIZE(AA), %xmm1 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 32) * SIZE(AA) #endif mulps %xmm0, %xmm2 mulps 36 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 36 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 36 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 48 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 48 * SIZE(AA), %xmm0 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 40) * SIZE(AA) #endif mulps %xmm1, %xmm3 mulps 44 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 44 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 44 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 56 * SIZE(AA), %xmm1 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 48) * SIZE(AA) #endif mulps %xmm0, %xmm2 mulps 52 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 48 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 52 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 52 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 64 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 64 * SIZE(AA), %xmm0 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 56) * SIZE(AA) #endif mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 60 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * 
SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 72 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 72 * SIZE(AA), %xmm1 addl $64 * SIZE, BB addl $64 * SIZE, AA decl %eax jne .L11 #endif .L12: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA_R, %xmm1 movaps ALPHA_I, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L14 .L13: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 0 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 8 * SIZE(AA), %xmm0 addl $8 * SIZE, AA # aoffset += 8 addl $8 * SIZE, BB # boffset1 += 8 decl %eax jg .L13 .L14: shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm5, %xmm4 subps %xmm7, %xmm6 #else addps %xmm5, %xmm4 addps %xmm7, %xmm6 #endif movaps %xmm4, %xmm5 movaps %xmm6, %xmm7 shufps $0xb1, %xmm4, %xmm4 shufps $0xb1, %xmm6, %xmm6 mulps %xmm1, %xmm5 mulps %xmm3, %xmm4 mulps %xmm1, %xmm7 mulps %xmm3, %xmm6 addps %xmm5, %xmm4 addps %xmm7, %xmm6 shufps $0xe4, %xmm4, %xmm4 shufps $0xe4, %xmm6, %xmm6 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 movsd 4 * SIZE(%esi), %xmm2 movhps 6 * SIZE(%esi), %xmm2 addps %xmm0, %xmm4 addps %xmm2, %xmm6 #endif movsd %xmm4, 0 * SIZE(%esi) movhps %xmm4, 2 * SIZE(%esi) movsd %xmm6, 4 * SIZE(%esi) movhps %xmm6, 6 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $4, KK #endif addl $8 * SIZE, %esi # coffset += 4 decl %ebx # i -- jg .L10 ALIGN_2 .L50: movl M, %ebx testl $2, %ebx jle .L70 #if (L1_DATA_LINESIZE == 64) #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L52 ALIGN_4 .L51: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 8 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 20 * SIZE(BB), %xmm0 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm0, %xmm5 movaps 12 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 28 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 48 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 mulps 36 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 
movaps 40 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 20 * SIZE(AA), %xmm1 mulps %xmm1, %xmm2 mulps 44 * SIZE(BB), %xmm1 addps %xmm2, %xmm6 movaps 64 * SIZE(BB), %xmm2 addps %xmm1, %xmm7 movaps 24 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 52 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 80 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 48 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L51 ALIGN_2 #else #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax #ifdef LEFT addl $2, %eax #else addl $1, %eax #endif movl %eax, KKK #endif sarl $3, %eax je .L52 ALIGN_4 .L51: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 12 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 24 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movaps 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 mulps 20 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 12 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 28 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 24 * SIZE(AA), %xmm1 mulps %xmm0, %xmm2 mulps 36 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 48 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 20 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 44 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 56 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 mulps 52 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 64 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 72 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 40 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L51 #endif .L52: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA_R, %xmm1 movaps ALPHA_I, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L54 .L53: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA # aoffset += 8 addl $8 * SIZE, BB # boffset1 += 8 decl %eax jg .L53 .L54: addps %xmm6, %xmm4 addps %xmm7, %xmm5 shufps $0xb1, %xmm5, %xmm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm5, %xmm4 #else addps %xmm5, %xmm4 #endif movaps %xmm4, %xmm5 shufps $0xb1, %xmm4, %xmm4 mulps %xmm1, %xmm5 mulps %xmm3, %xmm4 addps %xmm5, %xmm4 #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhps 2 * SIZE(%esi), %xmm0 addps %xmm0, %xmm4 
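/* Plain GEMM (non-TRMM) path: the two complex C values just loaded from
   (%esi) have the alpha-scaled product accumulated into them, and the
   stores that follow write the updated results back to C. */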
#endif movlps %xmm4, 0 * SIZE(%esi) movhps %xmm4, 2 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $2, KK #endif addl $4 * SIZE, %esi # coffset += 4 ALIGN_2 .L70: testl $1, %ebx jle .L99 #if (L1_DATA_LINESIZE == 64) #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 16 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax addl $1, %eax movl %eax, KKK #endif sarl $3, %eax je .L72 ALIGN_4 .L71: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L71 ALIGN_2 #else #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leal BUFFER, BB movaps 0 * SIZE + BUFFER, %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE + BUFFER, %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #else leal BUFFER, BB movl KK, %eax leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB /* because it's doubled */ movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 xorps %xmm6, 
%xmm6 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax movl %eax, KKK #else movl KK, %eax addl $1, %eax movl %eax, KKK #endif sarl $3, %eax je .L72 ALIGN_4 .L71: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 16 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 12 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 20 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 40 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 48 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 44 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 52 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 14 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 72 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L71 ALIGN_2 #endif .L72: #ifndef TRMMKERNEL movl K, %eax #else movl KKK, %eax #endif movaps ALPHA_R, %xmm1 movaps ALPHA_I, %xmm3 andl $7, %eax # if (k & 1) BRANCH je .L74 .L73: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 addl $2 * SIZE, AA # aoffset += 8 addl $8 * SIZE, BB # boffset1 += 8 decl %eax jg .L73 .L74: addps %xmm6, %xmm4 addps %xmm7, %xmm5 shufps $0xb1, %xmm5, %xmm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm5, %xmm4 #else addps %xmm5, %xmm4 #endif movaps %xmm4, %xmm5 shufps $0xb1, %xmm4, %xmm4 mulps %xmm1, %xmm5 mulps %xmm3, %xmm4 addps %xmm5, %xmm4 #ifndef TRMMKERNEL #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(%esi), %xmm0 addps %xmm0, %xmm4 #endif movlps %xmm4, 0 * SIZE(%esi) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax subl KKK, %eax leal (,%eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB #endif #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif ALIGN_2 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $1, KK #endif addl LDC, C # c += ldc decl J # j -- jg .L01 ALIGN_2 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm_ncopy_2.S000066400000000000000000000141611313527062700177500ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 8 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_N 8 + STACK + ARGS(%esp) #define STACK_A 12 + STACK + ARGS(%esp) #define STACK_LDA 16 + STACK + ARGS(%esp) #define STACK_B 20 + STACK + ARGS(%esp) #define I %eax #define J %ecx #define LDA %edx #define A %edi #define A1 %ebx #define A2 %ebp #define B %esi PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl STACK_A, A movl STACK_LDA, LDA movl STACK_B, B sall $ZBASE_SHIFT, LDA movl STACK_N, J sarl $1, J je .L20 ALIGN_3 .L21: movl A, A1 leal (A1, LDA), A2 leal (A, LDA, 2), A movl STACK_M, I sarl $1, I je .L24 ALIGN_3 .L25: #ifdef HAVE_MMX MMXLOAD 0 * SIZE(A1), %mm0 MMXLOAD 1 * SIZE(A1), %mm1 MMXLOAD 0 * SIZE(A2), %mm2 MMXLOAD 1 * SIZE(A2), %mm3 MMXLOAD 2 * SIZE(A1), %mm4 MMXLOAD 3 * SIZE(A1), %mm5 MMXLOAD 2 * SIZE(A2), %mm6 MMXLOAD 3 * SIZE(A2), %mm7 MMXSTORE %mm0, 0 * SIZE(B) MMXSTORE %mm1, 1 * SIZE(B) MMXSTORE %mm2, 2 * SIZE(B) MMXSTORE %mm3, 3 * SIZE(B) MMXSTORE %mm4, 4 * SIZE(B) MMXSTORE %mm5, 5 * SIZE(B) MMXSTORE %mm6, 6 * SIZE(B) MMXSTORE %mm7, 7 * SIZE(B) #else FLD 3 * SIZE(A2) FLD 2 * SIZE(A2) FLD 3 * SIZE(A1) FLD 2 * SIZE(A1) FLD 1 * SIZE(A2) FLD 0 * SIZE(A2) FLD 1 * SIZE(A1) FLD 0 * SIZE(A1) FST 0 * SIZE(B) FST 1 * SIZE(B) FST 2 * SIZE(B) FST 3 * SIZE(B) FST 4 * SIZE(B) FST 5 * SIZE(B) FST 6 * SIZE(B) FST 7 * SIZE(B) #endif addl $4 * SIZE, A1 addl $4 * SIZE, A2 addl $8 * SIZE, B decl I jne .L25 ALIGN_3 .L24: movl STACK_M, I andl $1, I jle .L30 ALIGN_3 .L31: #ifdef HAVE_MMX MMXLOAD 0 * SIZE(A1), %mm0 MMXLOAD 1 * SIZE(A1), %mm1 MMXLOAD 0 * SIZE(A2), %mm2 MMXLOAD 1 * SIZE(A2), %mm3 MMXSTORE %mm0, 0 * SIZE(B) MMXSTORE %mm1, 1 * SIZE(B) MMXSTORE %mm2, 2 * SIZE(B) MMXSTORE %mm3, 3 * SIZE(B) #else FLD 1 * SIZE(A2) FLD 0 * SIZE(A2) FLD 1 * SIZE(A1) FLD 0 * SIZE(A1) FST 0 * SIZE(B) FST 1 * SIZE(B) FST 2 * SIZE(B) FST 3 * SIZE(B) #endif addl $2 * SIZE, A1 addl $2 * SIZE, A2 addl $4 * SIZE, B decl I jne 
.L31 ALIGN_3 .L30: decl J jne .L21 ALIGN_3 .L20: movl A, A1 movl STACK_N, J andl $1, J jle .L38 ALIGN_3 .L39: movl STACK_M, I sarl $2, I je .L42 ALIGN_3 .L43: #ifdef HAVE_MMX MMXLOAD 0 * SIZE(A1), %mm0 MMXLOAD 1 * SIZE(A1), %mm1 MMXLOAD 2 * SIZE(A1), %mm2 MMXLOAD 3 * SIZE(A1), %mm3 MMXLOAD 4 * SIZE(A1), %mm4 MMXLOAD 5 * SIZE(A1), %mm5 MMXLOAD 6 * SIZE(A1), %mm6 MMXLOAD 7 * SIZE(A1), %mm7 MMXSTORE %mm0, 0 * SIZE(B) MMXSTORE %mm1, 1 * SIZE(B) MMXSTORE %mm2, 2 * SIZE(B) MMXSTORE %mm3, 3 * SIZE(B) MMXSTORE %mm4, 4 * SIZE(B) MMXSTORE %mm5, 5 * SIZE(B) MMXSTORE %mm6, 6 * SIZE(B) MMXSTORE %mm7, 7 * SIZE(B) #else FLD 7 * SIZE(A1) FLD 6 * SIZE(A1) FLD 5 * SIZE(A1) FLD 4 * SIZE(A1) FLD 3 * SIZE(A1) FLD 2 * SIZE(A1) FLD 1 * SIZE(A1) FLD 0 * SIZE(A1) FST 0 * SIZE(B) FST 1 * SIZE(B) FST 2 * SIZE(B) FST 3 * SIZE(B) FST 4 * SIZE(B) FST 5 * SIZE(B) FST 6 * SIZE(B) FST 7 * SIZE(B) #endif addl $8 * SIZE, A1 addl $8 * SIZE, B decl I jne .L43 ALIGN_3 .L42: movl STACK_M, I andl $3, I jle .L38 ALIGN_3 .L49: #ifdef HAVE_MMX MMXLOAD 0 * SIZE(A1), %mm0 MMXLOAD 1 * SIZE(A1), %mm1 MMXSTORE %mm0, 0 * SIZE(B) MMXSTORE %mm1, 1 * SIZE(B) #else FLD 1 * SIZE(A1) FLD 0 * SIZE(A1) FST 0 * SIZE(B) FST 1 * SIZE(B) #endif addl $2 * SIZE, A1 addl $2 * SIZE, B decl I jne .L49 ALIGN_3 .L38: EMMS popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemm_tcopy_2.S000066400000000000000000000110541313527062700177540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 8 #define J 0 + STACK(%esp) #define BOFFSET2 4 + STACK(%esp) #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_N 8 + STACK + ARGS(%esp) #define STACK_A 12 + STACK + ARGS(%esp) #define STACK_LDA 16 + STACK + ARGS(%esp) #define STACK_B 20 + STACK + ARGS(%esp) PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE #define A %ebp #define A1 %edx #define LDA %ecx #define B %edi #define I %ebx #define B1 %eax #define M4 %esi EMMS movl STACK_A, A movl STACK_B, B movl STACK_M, %ebx movl STACK_N, %eax movl STACK_LDA, LDA sall $ZBASE_SHIFT, LDA andl $-2, %eax addl %eax, %eax imull %ebx, %eax # m * ( n & ~1) leal (B, %eax, SIZE), %eax # boffset2 = b + m * (n & ~1) movl %eax, BOFFSET2 movl STACK_M, M4 sall $ZBASE_SHIFT + 1, M4 testl %ebx, %ebx # if !(m & 1) goto L28 movl %ebx, J jle .L999 ALIGN_4 .L39: movl A, A1 addl LDA, A movl B, B1 addl $4 * SIZE, B movl STACK_N, I sarl $1, I jle .L32 ALIGN_4 .L36: #ifdef HAVE_MMX MMXLOAD 0 * SIZE(A1), %mm0 MMXLOAD 1 * SIZE(A1), %mm1 MMXLOAD 2 * SIZE(A1), %mm2 MMXLOAD 3 * SIZE(A1), %mm3 MMXSTORE %mm0, 0 * SIZE(B1) MMXSTORE %mm1, 1 * SIZE(B1) MMXSTORE %mm2, 2 * SIZE(B1) MMXSTORE %mm3, 3 * SIZE(B1) #else FLD 3 * SIZE(A1) FLD 2 * SIZE(A1) FLD 1 * SIZE(A1) FLD 0 * SIZE(A1) FST 0 * SIZE(B1) FST 1 * SIZE(B1) FST 2 * SIZE(B1) FST 3 * SIZE(B1) #endif addl $4 * SIZE, A1 addl M4, B1 decl I jne .L36 ALIGN_4 .L32: movl STACK_N, I andl $1, I jle .L99 ALIGN_4 movl BOFFSET2, B1 #ifdef HAVE_MMX MMXLOAD 0 * SIZE(A1), %mm0 MMXLOAD 1 * SIZE(A1), %mm1 MMXSTORE %mm0, 0 * SIZE(B1) MMXSTORE %mm1, 1 * SIZE(B1) #else FLD 1 * SIZE(A1) FLD 0 * SIZE(A1) FST 0 * SIZE(B1) FST 1 * SIZE(B1) #endif addl $2 * SIZE, BOFFSET2 ALIGN_4 .L99: decl J jne .L39 ALIGN_4 .L999: EMMS popl %ebx popl %esi popl %edi popl %ebp addl $ARGS,%esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemv_n.S000066400000000000000000000203471313527062700166500ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef PENTIUM #define P 32 #endif #if defined(PENTIUM4) || defined(ATHLON) #define P ((DTB_DEFAULT_ENTRIES) >> 1) #endif #ifndef P #define P DTB_DEFAULT_ENTRIES #endif #define STACK 16 #define ARGS 16 #define PLDA_M 0 + STACK(%esp) #define XP 4 + STACK(%esp) #define MIN_N 8 + STACK(%esp) #define IS 12 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #ifdef DOUBLE #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 24 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define LDA 36 + STACK + ARGS(%esp) #define X 40 + STACK + ARGS(%esp) #define INCX 44 + STACK + ARGS(%esp) #define Y 48 + STACK + ARGS(%esp) #define INCY 52 + STACK + ARGS(%esp) #define BUFFER 56 + STACK + ARGS(%esp) #else #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 20 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define LDA 28 + STACK + ARGS(%esp) #define X 32 + STACK + ARGS(%esp) #define INCX 36 + STACK + ARGS(%esp) #define Y 40 + STACK + ARGS(%esp) #define INCY 44 + STACK + ARGS(%esp) #define BUFFER 48 + STACK + ARGS(%esp) #endif PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE FLD ALPHA_I FLD ALPHA_R movl X, %edi movl LDA, %ebx addl %ebx, %ebx # lda *= 2 leal 0(,%ebx,SIZE),%ebx # EBX : lda movl $0, IS movl M, %ecx movl N, %esi test %ecx, %ecx jle .L79 # goto END test %esi, %esi jle .L79 # goto END movl INCY, %eax addl %eax, %eax # incy *= 2 leal (,%eax,SIZE),%eax movl %eax, INCY movl LDA, %eax imull $P, %eax # P * lda subl M ,%eax # P * lda - m leal (, %eax, SIZE), %eax addl %eax, %eax movl %eax, PLDA_M ALIGN_2 .L32: movl IS, %esi movl $P, %edx movl N, %eax subl %esi,%eax # n - is cmpl %edx, %eax #ifdef PENTIUM jle .L33 movl %edx, %eax .L33: #else cmovg %edx, %eax #endif movl %eax, MIN_N movl INCX, %edx addl %edx, %edx addl %esi, %esi leal (%edi, %esi, SIZE), %esi # xp = x + is movl %esi, XP cmpl $2, %edx je .L34 # if incx == 1 goto L34 movl BUFFER, %esi leal (, %edx, SIZE), %edx movl %esi, XP # xp = buffer sarl $1,%eax jle .L35 ALIGN_2 .L36: FLD 0 * SIZE(%edi) FLD 1 * SIZE(%edi) addl %edx,%edi # x += incx FLD 0 * SIZE(%edi) FLD 1 * SIZE(%edi) addl %edx,%edi # x += incx FST 3 * SIZE(%esi) FST 2 * SIZE(%esi) FST 1 * SIZE(%esi) FST 0 * SIZE(%esi) addl $4 * SIZE, %esi # xp += 4 decl %eax jg .L36 ALIGN_3 .L35: movl MIN_N, %eax andl $1, %eax jle .L34 FLD 0 * SIZE(%edi) FLD 1 * SIZE(%edi) addl %edx,%edi # x += incx FST 1 * SIZE(%esi) FST 0 * SIZE(%esi) ALIGN_3 /* Main Routine */ .L34: movl Y, %ecx # c_offset movl M, %ebp # j = m ALIGN_3 .L61: movl A, %edx # a_offset = a 
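/* Each pass over .L61 below produces one element of y: a complex dot
   product of one row of the current A panel (stride lda per column) with
   the packed x panel at XP.  The four fldz instructions clear the x87
   accumulators ct1..ct4 holding the real/imaginary partial sums, which
   are combined according to CONJ/XCONJ, scaled by alpha and added to y. */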
fldz addl $2 * SIZE, A # a++ fldz movl XP,%esi fldz movl MIN_N,%eax fldz FLD (%esi) # bt1 = *(b_offset + 0) sarl $1, %eax jle .L64 ALIGN_3 .L65: #ifdef PENTIUM4 prefetchnta 16 * SIZE(%esi) #endif FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FMUL 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) #ifndef CONJ faddp %st, %st(2) # ct2 += bt1 #else fsubrp %st, %st(2) # ct2 -= bt1 #endif FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FMUL 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) faddp %st, %st(4) # ct4 += bt1 FLD 2 * SIZE(%esi) # bt1 = *(b_offset + 2) addl $2 * SIZE, %esi # b_offset += 2 addl %ebx, %edx # a_offset += lda FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FMUL 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) #ifndef CONJ faddp %st, %st(2) # ct2 += bt1 #else fsubrp %st, %st(2) # ct2 -= bt1 #endif FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FMUL 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) faddp %st, %st(4) # ct4 += bt1 FLD 2 * SIZE(%esi) # bt1 = *(b_offset + 2) addl $2 * SIZE, %esi # b_offset += 2 addl %ebx, %edx # a_offset += lda decl %eax jg .L65 .L64: movl MIN_N, %eax andl $1, %eax jle .L70 ALIGN_2 .L71: FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FMUL 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) #ifndef CONJ faddp %st, %st(2) # ct2 += bt1 #else fsubrp %st, %st(2) # ct2 -= bt1 #endif FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FMUL 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) faddp %st, %st(4) # ct4 += bt1 fldz ALIGN_2 .L70: #ifndef C_SUN ffreep %st(0) #else .byte 0xdf .byte 0xc0 #endif #ifndef XCONJ #ifndef CONJ fsubp %st, %st(3) faddp %st, %st(1) #else faddp %st, %st(3) faddp %st, %st(1) #endif #else #ifndef CONJ faddp %st, %st(3) fsubp %st, %st(1) #else fsubp %st, %st(3) fsubp %st, %st(1) #endif #endif fld %st(0) # ct4 = ct2 fmul %st(4), %st fld %st(2) fmul %st(4), %st fsubp %st, %st(1) movl INCY, %eax FADD 0 * SIZE(%ecx) FST 0 * SIZE(%ecx) fmul %st(2), %st fxch %st(1) fmul %st(3), %st faddp %st, %st(1) FADD 1 * SIZE(%ecx) FST 1 * SIZE(%ecx) addl %eax, %ecx decl %ebp jg .L61 .L60: movl PLDA_M, %esi addl %esi, A # a += P * lda - m addl $P, IS movl N, %esi cmpl %esi,IS jl .L32 .L79: #ifndef C_SUN ffreep %st(0) ffreep %st(0) #else .byte 0xdf .byte 0xc0 .byte 0xdf .byte 0xc0 #endif popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemv_n_atom.S000066400000000000000000000260551313527062700176720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef ATOM #define PREFETCH prefetchnta #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 6) #endif #define STACKSIZE 16 #define M 4 + STACKSIZE(%esp) #define N 8 + STACKSIZE(%esp) #define ALPHA_R 16 + STACKSIZE(%esp) #define ALPHA_I 24 + STACKSIZE(%esp) #define A 32 + STACKSIZE(%esp) #define STACK_LDA 36 + STACKSIZE(%esp) #define STACK_X 40 + STACKSIZE(%esp) #define STACK_INCX 44 + STACKSIZE(%esp) #define Y 48 + STACKSIZE(%esp) #define STACK_INCY 52 + STACKSIZE(%esp) #define BUFFER 56 + STACKSIZE(%esp) #define I %eax #define J %ebx #define INCX %ecx #define INCY J #define A1 %esi #define X %edx #define Y1 %edi #define LDA %ebp #if !defined(CONJ) && !defined(XCONJ) #define ADD1 addsd #define ADD2 addsd #define ADD3 subsd #define ADD4 addsd #endif #if defined(CONJ) && !defined(XCONJ) #define ADD1 addsd #define ADD2 addsd #define ADD3 addsd #define ADD4 subsd #endif #if !defined(CONJ) && defined(XCONJ) #define ADD1 addsd #define ADD2 subsd #define ADD3 addsd #define ADD4 addsd #endif #if defined(CONJ) && defined(XCONJ) #define ADD1 addsd #define ADD2 subsd #define ADD3 subsd #define ADD4 subsd #endif PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_LDA, LDA movl STACK_X, X movl STACK_INCX, INCX sall $ZBASE_SHIFT, INCX sall $ZBASE_SHIFT, LDA subl $-16 * SIZE, A cmpl $0, N jle .L999 cmpl $0, M jle .L999 movl BUFFER, Y1 movl N, J pxor %xmm7, %xmm7 movl M, %eax addl $8, %eax sarl $3, %eax ALIGN_3 .L01: movapd %xmm7, 0 * SIZE(Y1) movapd %xmm7, 2 * SIZE(Y1) movapd %xmm7, 4 * SIZE(Y1) movapd %xmm7, 6 * SIZE(Y1) movapd %xmm7, 8 * SIZE(Y1) movapd %xmm7, 10 * SIZE(Y1) movapd %xmm7, 12 * SIZE(Y1) movapd %xmm7, 14 * SIZE(Y1) subl $-16 * SIZE, Y1 decl %eax jg .L01 ALIGN_3 .L10: movl BUFFER, Y1 addl $16 * SIZE, Y1 movl A, A1 addl LDA, A movsd 0 * SIZE(X), %xmm6 movsd 1 * SIZE(X), %xmm7 addl INCX, X movapd %xmm6, %xmm2 mulsd ALPHA_R, %xmm6 mulsd ALPHA_I, %xmm2 movapd %xmm7, %xmm3 mulsd ALPHA_I, %xmm3 mulsd ALPHA_R, %xmm7 #ifndef XCONJ subsd %xmm3, %xmm6 addsd %xmm2, %xmm7 #else addsd %xmm3, %xmm6 subsd %xmm2, %xmm7 #endif movsd -16 * SIZE(Y1), %xmm0 movsd 
-15 * SIZE(Y1), %xmm1 ALIGN_3 movl M, I sarl $2, I jle .L15 movsd -16 * SIZE(A1), %xmm2 movsd -15 * SIZE(A1), %xmm3 movapd %xmm2, %xmm4 mulsd %xmm6, %xmm2 mulsd %xmm7, %xmm4 decl I jle .L14 ALIGN_3 .L13: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) #endif movapd %xmm3, %xmm5 mulsd %xmm7, %xmm3 ADD1 %xmm2, %xmm0 movsd -14 * SIZE(A1), %xmm2 mulsd %xmm6, %xmm5 ADD2 %xmm4, %xmm1 movapd %xmm2, %xmm4 mulsd %xmm6, %xmm2 ADD3 %xmm3, %xmm0 movsd -13 * SIZE(A1), %xmm3 ADD4 %xmm5, %xmm1 mulsd %xmm7, %xmm4 movlpd %xmm0, -16 * SIZE(Y1) movsd -14 * SIZE(Y1), %xmm0 movlpd %xmm1, -15 * SIZE(Y1) movsd -13 * SIZE(Y1), %xmm1 movapd %xmm3, %xmm5 mulsd %xmm7, %xmm3 ADD1 %xmm2, %xmm0 movsd -12 * SIZE(A1), %xmm2 mulsd %xmm6, %xmm5 ADD2 %xmm4, %xmm1 movapd %xmm2, %xmm4 mulsd %xmm6, %xmm2 ADD3 %xmm3, %xmm0 movsd -11 * SIZE(A1), %xmm3 mulsd %xmm7, %xmm4 ADD4 %xmm5, %xmm1 movlpd %xmm0, -14 * SIZE(Y1) movsd -12 * SIZE(Y1), %xmm0 movlpd %xmm1, -13 * SIZE(Y1) movsd -11 * SIZE(Y1), %xmm1 movapd %xmm3, %xmm5 mulsd %xmm7, %xmm3 ADD1 %xmm2, %xmm0 movsd -10 * SIZE(A1), %xmm2 mulsd %xmm6, %xmm5 ADD2 %xmm4, %xmm1 movapd %xmm2, %xmm4 mulsd %xmm6, %xmm2 ADD3 %xmm3, %xmm0 movsd -9 * SIZE(A1), %xmm3 ADD4 %xmm5, %xmm1 mulsd %xmm7, %xmm4 movlpd %xmm0, -12 * SIZE(Y1) movsd -10 * SIZE(Y1), %xmm0 movlpd %xmm1, -11 * SIZE(Y1) movsd -9 * SIZE(Y1), %xmm1 movapd %xmm3, %xmm5 mulsd %xmm7, %xmm3 ADD1 %xmm2, %xmm0 movsd -8 * SIZE(A1), %xmm2 mulsd %xmm6, %xmm5 ADD2 %xmm4, %xmm1 movapd %xmm2, %xmm4 mulsd %xmm6, %xmm2 ADD3 %xmm3, %xmm0 movsd -7 * SIZE(A1), %xmm3 mulsd %xmm7, %xmm4 ADD4 %xmm5, %xmm1 movlpd %xmm0, -10 * SIZE(Y1) movsd -8 * SIZE(Y1), %xmm0 movlpd %xmm1, -9 * SIZE(Y1) movsd -7 * SIZE(Y1), %xmm1 subl $-8 * SIZE, A1 subl $-8 * SIZE, Y1 subl $1, I BRANCH jg .L13 ALIGN_3 .L14: movapd %xmm3, %xmm5 mulsd %xmm7, %xmm3 ADD1 %xmm2, %xmm0 movsd -14 * SIZE(A1), %xmm2 mulsd %xmm6, %xmm5 ADD2 %xmm4, %xmm1 movapd %xmm2, %xmm4 mulsd %xmm6, %xmm2 ADD3 %xmm3, %xmm0 movsd -13 * SIZE(A1), %xmm3 ADD4 %xmm5, %xmm1 mulsd %xmm7, %xmm4 movlpd %xmm0, -16 * SIZE(Y1) movsd -14 * SIZE(Y1), %xmm0 movlpd %xmm1, -15 * SIZE(Y1) movsd -13 * SIZE(Y1), %xmm1 movapd %xmm3, %xmm5 mulsd %xmm7, %xmm3 ADD1 %xmm2, %xmm0 movsd -12 * SIZE(A1), %xmm2 mulsd %xmm6, %xmm5 ADD2 %xmm4, %xmm1 movapd %xmm2, %xmm4 mulsd %xmm6, %xmm2 ADD3 %xmm3, %xmm0 movsd -11 * SIZE(A1), %xmm3 mulsd %xmm7, %xmm4 ADD4 %xmm5, %xmm1 movlpd %xmm0, -14 * SIZE(Y1) movsd -12 * SIZE(Y1), %xmm0 movlpd %xmm1, -13 * SIZE(Y1) movsd -11 * SIZE(Y1), %xmm1 movapd %xmm3, %xmm5 mulsd %xmm7, %xmm3 ADD1 %xmm2, %xmm0 movsd -10 * SIZE(A1), %xmm2 mulsd %xmm6, %xmm5 ADD2 %xmm4, %xmm1 movapd %xmm2, %xmm4 mulsd %xmm6, %xmm2 ADD3 %xmm3, %xmm0 movsd -9 * SIZE(A1), %xmm3 ADD4 %xmm5, %xmm1 mulsd %xmm7, %xmm4 movlpd %xmm0, -12 * SIZE(Y1) movsd -10 * SIZE(Y1), %xmm0 movlpd %xmm1, -11 * SIZE(Y1) movsd -9 * SIZE(Y1), %xmm1 movapd %xmm3, %xmm5 mulsd %xmm7, %xmm3 ADD1 %xmm2, %xmm0 mulsd %xmm6, %xmm5 ADD2 %xmm4, %xmm1 ADD3 %xmm3, %xmm0 ADD4 %xmm5, %xmm1 movlpd %xmm0, -10 * SIZE(Y1) movsd -8 * SIZE(Y1), %xmm0 movlpd %xmm1, -9 * SIZE(Y1) movsd -7 * SIZE(Y1), %xmm1 subl $-8 * SIZE, A1 subl $-8 * SIZE, Y1 ALIGN_3 .L15: testl $2, M je .L17 movsd -16 * SIZE(A1), %xmm2 movsd -15 * SIZE(A1), %xmm3 movapd %xmm2, %xmm4 mulsd %xmm6, %xmm2 mulsd %xmm7, %xmm4 movapd %xmm3, %xmm5 mulsd %xmm7, %xmm3 ADD1 %xmm2, %xmm0 movsd -14 * SIZE(A1), %xmm2 mulsd %xmm6, %xmm5 ADD2 %xmm4, %xmm1 movapd %xmm2, %xmm4 mulsd %xmm6, %xmm2 ADD3 %xmm3, %xmm0 movsd -13 * SIZE(A1), %xmm3 ADD4 %xmm5, %xmm1 mulsd %xmm7, %xmm4 movlpd %xmm0, -16 * SIZE(Y1) movsd -14 
* SIZE(Y1), %xmm0 movlpd %xmm1, -15 * SIZE(Y1) movsd -13 * SIZE(Y1), %xmm1 movapd %xmm3, %xmm5 mulsd %xmm7, %xmm3 ADD1 %xmm2, %xmm0 mulsd %xmm6, %xmm5 ADD2 %xmm4, %xmm1 ADD3 %xmm3, %xmm0 ADD4 %xmm5, %xmm1 movlpd %xmm0, -14 * SIZE(Y1) movsd -12 * SIZE(Y1), %xmm0 movlpd %xmm1, -13 * SIZE(Y1) movsd -11 * SIZE(Y1), %xmm1 addl $4 * SIZE, A1 addl $4 * SIZE, Y1 ALIGN_3 .L17: testl $1, M je .L19 movsd -16 * SIZE(A1), %xmm2 movsd -15 * SIZE(A1), %xmm3 movapd %xmm2, %xmm4 mulsd %xmm6, %xmm2 mulsd %xmm7, %xmm4 movapd %xmm3, %xmm5 mulsd %xmm7, %xmm3 ADD1 %xmm2, %xmm0 mulsd %xmm6, %xmm5 ADD2 %xmm4, %xmm1 ADD3 %xmm3, %xmm0 ADD4 %xmm5, %xmm1 movlpd %xmm0, -16 * SIZE(Y1) movlpd %xmm1, -15 * SIZE(Y1) ALIGN_3 .L19: decl J jg .L10 ALIGN_4 .L990: movl Y, Y1 movl BUFFER, X movl STACK_INCY, INCY movl Y1, A1 sall $ZBASE_SHIFT, INCY movl M, %eax sarl $2, %eax jle .L994 ALIGN_3 .L992: movsd 0 * SIZE(Y1), %xmm0 movsd 1 * SIZE(Y1), %xmm1 addl INCY, Y1 movsd 0 * SIZE(Y1), %xmm2 movsd 1 * SIZE(Y1), %xmm3 addl INCY, Y1 movsd 0 * SIZE(Y1), %xmm4 movsd 1 * SIZE(Y1), %xmm5 addl INCY, Y1 movsd 0 * SIZE(Y1), %xmm6 movsd 1 * SIZE(Y1), %xmm7 addl INCY, Y1 addsd 0 * SIZE(X), %xmm0 addsd 1 * SIZE(X), %xmm1 addsd 2 * SIZE(X), %xmm2 addsd 3 * SIZE(X), %xmm3 addsd 4 * SIZE(X), %xmm4 addsd 5 * SIZE(X), %xmm5 addsd 6 * SIZE(X), %xmm6 addsd 7 * SIZE(X), %xmm7 movlpd %xmm0, 0 * SIZE(A1) movlpd %xmm1, 1 * SIZE(A1) addl INCY, A1 movlpd %xmm2, 0 * SIZE(A1) movlpd %xmm3, 1 * SIZE(A1) addl INCY, A1 movlpd %xmm4, 0 * SIZE(A1) movlpd %xmm5, 1 * SIZE(A1) addl INCY, A1 movlpd %xmm6, 0 * SIZE(A1) movlpd %xmm7, 1 * SIZE(A1) addl INCY, A1 addl $8 * SIZE, X decl %eax jg .L992 ALIGN_3 .L994: testl $2, M jle .L996 movsd 0 * SIZE(Y1), %xmm0 movsd 1 * SIZE(Y1), %xmm1 addl INCY, Y1 movsd 0 * SIZE(Y1), %xmm2 movsd 1 * SIZE(Y1), %xmm3 addl INCY, Y1 addsd 0 * SIZE(X), %xmm0 addsd 1 * SIZE(X), %xmm1 addsd 2 * SIZE(X), %xmm2 addsd 3 * SIZE(X), %xmm3 movlpd %xmm0, 0 * SIZE(A1) movlpd %xmm1, 1 * SIZE(A1) addl INCY, A1 movlpd %xmm2, 0 * SIZE(A1) movlpd %xmm3, 1 * SIZE(A1) addl INCY, A1 addl $4 * SIZE, X ALIGN_3 .L996: testl $1, M jle .L999 movsd 0 * SIZE(Y1), %xmm0 movsd 1 * SIZE(Y1), %xmm1 addsd 0 * SIZE(X), %xmm0 addsd 1 * SIZE(X), %xmm1 movlpd %xmm0, 0 * SIZE(A1) movlpd %xmm1, 1 * SIZE(A1) ALIGN_3 .L999: popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemv_n_sse.S000066400000000000000000000303651313527062700175230ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef movsd #undef movsd #endif #ifdef PENTIUM3 #ifdef HAVE_SSE #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 2) #endif #define movsd movlps #endif #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 2) #endif #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 7) #endif #ifdef OPTERON #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 3) #define movsd movlps #endif #if defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 5) #endif #ifdef ATOM #define PREFETCH prefetchnta #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 6) #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHSIZE (16 * 4) #endif #define STACKSIZE 16 #define ARGS 20 #define M 4 + STACKSIZE+ARGS(%esp) #define N 8 + STACKSIZE+ARGS(%esp) #define ALPHA_R 16 + STACKSIZE+ARGS(%esp) #define ALPHA_I 20 + STACKSIZE+ARGS(%esp) #define A 24 + STACKSIZE+ARGS(%esp) #define STACK_LDA 28 + STACKSIZE+ARGS(%esp) #define STACK_X 32 + STACKSIZE+ARGS(%esp) #define STACK_INCX 36 + STACKSIZE+ARGS(%esp) #define Y 40 + STACKSIZE+ARGS(%esp) #define STACK_INCY 44 + STACKSIZE+ARGS(%esp) #define BUFFER 48 + STACKSIZE+ARGS(%esp) #define MMM 0+ARGS(%esp) #define YY 4+ARGS(%esp) #define AA 8+ARGS(%esp) #define I %eax #define J %ebx #define INCX %ecx #define INCY J #define A1 %esi #define X %edx #define Y1 %edi #define LDA %ebp #undef SUBPS #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) #define SUBPS subps #else #define SUBPS addps #endif PROLOGUE subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl Y,J movl J,YY movl A,J movl J,AA movl M,J movl J,MMM .L0t: xorl J,J addl $1,J sall $20,J subl J,MMM movl J,M jge .L00t ALIGN_3 movl MMM,%eax addl J,%eax jle .L999x movl %eax,M .L00t: movl AA,%eax movl %eax,A movl YY,J movl J,Y movl STACK_LDA, LDA movl STACK_X, X movl STACK_INCX, INCX sall $ZBASE_SHIFT, INCX sall $ZBASE_SHIFT, LDA subl $-32 * SIZE, A cmpl $0, N jle .L999 cmpl $0, M jle .L999 movl BUFFER, Y1 movl N, J xorps %xmm7, %xmm7 movl M, %eax addl $8, %eax sarl $3, %eax ALIGN_3 .L01: movaps %xmm7, 0 * SIZE(Y1) movaps %xmm7, 4 * SIZE(Y1) movaps %xmm7, 8 * SIZE(Y1) movaps %xmm7, 12 * SIZE(Y1) subl $-16 * SIZE, Y1 decl %eax jg .L01 ALIGN_3 .L10: movl BUFFER, Y1 addl $32 * SIZE, Y1 movl A, A1 addl LDA, A movsd (X), %xmm7 addl INCX, X 
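/* The block below forms alpha * x[j] once per column: xmm5 is built as a
   sign mask for the CONJ/XCONJ cases, and the shuffles leave the real
   part of alpha*x[j] broadcast in xmm6 and the (sign-adjusted) imaginary
   part in xmm7, so the inner loop over the column of A only needs mulps
   plus the SUBPS macro to perform the complex multiply-accumulate. */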
#ifdef HAVE_SSE2 pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 #else subl $8, %esp movl $0x00000000, 0(%esp) movl $0x80000000, 4(%esp) movlps (%esp), %xmm5 addl $8, %esp movlhps %xmm5, %xmm5 #endif #ifdef HAVE_SSE2 pshufd $0x00, %xmm7, %xmm6 pshufd $0x55, %xmm7, %xmm7 #else movaps %xmm7, %xmm6 shufps $0x00, %xmm6, %xmm6 shufps $0x55, %xmm7, %xmm7 #endif #ifdef HAVE_SSE3 movddup ALPHA_R, %xmm3 #else movsd ALPHA_R, %xmm3 movlhps %xmm3, %xmm3 #endif #ifdef HAVE_SSE2 pshufd $0xb1, %xmm3, %xmm4 #else movaps %xmm3, %xmm4 shufps $0xb1, %xmm4, %xmm4 #endif #ifndef XCONJ xorps %xmm5, %xmm7 #else xorps %xmm5, %xmm6 #endif mulps %xmm3, %xmm6 mulps %xmm4, %xmm7 #ifndef XCONJ subps %xmm7, %xmm6 #else addps %xmm7, %xmm6 #endif #ifdef HAVE_SSE2 pshufd $0x55, %xmm6, %xmm7 pshufd $0x00, %xmm6, %xmm6 #else movaps %xmm6, %xmm7 shufps $0x55, %xmm7, %xmm7 shufps $0x00, %xmm6, %xmm6 #endif #ifndef CONJ xorps %xmm5, %xmm7 #else xorps %xmm5, %xmm6 #endif movaps -32 * SIZE(Y1), %xmm0 movaps -28 * SIZE(Y1), %xmm1 ALIGN_3 movl M, I sarl $3, I jle .L15 movsd -32 * SIZE(A1), %xmm2 movhps -30 * SIZE(A1), %xmm2 movsd -28 * SIZE(A1), %xmm4 movhps -26 * SIZE(A1), %xmm4 decl I jle .L14 ALIGN_3 .L13: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) #endif #ifdef HAVE_SSE2 pshufd $0xb1, %xmm2, %xmm3 #else movaps %xmm2, %xmm3 shufps $0xb1, %xmm3, %xmm3 #endif mulps %xmm6, %xmm2 addps %xmm2, %xmm0 movsd -24 * SIZE(A1), %xmm2 movhps -22 * SIZE(A1), %xmm2 #ifdef HAVE_SSE2 pshufd $0xb1, %xmm4, %xmm5 #else movaps %xmm4, %xmm5 shufps $0xb1, %xmm5, %xmm5 #endif mulps %xmm6, %xmm4 addps %xmm4, %xmm1 movsd -20 * SIZE(A1), %xmm4 movhps -18 * SIZE(A1), %xmm4 mulps %xmm7, %xmm3 SUBPS %xmm3, %xmm0 movaps %xmm0, -32 * SIZE(Y1) movaps -24 * SIZE(Y1), %xmm0 mulps %xmm7, %xmm5 SUBPS %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(Y1) movaps -20 * SIZE(Y1), %xmm1 #ifdef HAVE_SSE2 pshufd $0xb1, %xmm2, %xmm3 #else movaps %xmm2, %xmm3 shufps $0xb1, %xmm3, %xmm3 #endif mulps %xmm6, %xmm2 addps %xmm2, %xmm0 movsd -16 * SIZE(A1), %xmm2 movhps -14 * SIZE(A1), %xmm2 #ifdef HAVE_SSE2 pshufd $0xb1, %xmm4, %xmm5 #else movaps %xmm4, %xmm5 shufps $0xb1, %xmm5, %xmm5 #endif mulps %xmm6, %xmm4 addps %xmm4, %xmm1 movsd -12 * SIZE(A1), %xmm4 movhps -10 * SIZE(A1), %xmm4 mulps %xmm7, %xmm3 SUBPS %xmm3, %xmm0 movaps %xmm0, -24 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 mulps %xmm7, %xmm5 SUBPS %xmm5, %xmm1 movaps %xmm1, -20 * SIZE(Y1) movaps -12 * SIZE(Y1), %xmm1 subl $-16 * SIZE, A1 subl $-16 * SIZE, Y1 subl $1, I BRANCH jg .L13 ALIGN_3 .L14: #ifdef HAVE_SSE2 pshufd $0xb1, %xmm2, %xmm3 #else movaps %xmm2, %xmm3 shufps $0xb1, %xmm3, %xmm3 #endif mulps %xmm6, %xmm2 addps %xmm2, %xmm0 movsd -24 * SIZE(A1), %xmm2 movhps -22 * SIZE(A1), %xmm2 #ifdef HAVE_SSE2 pshufd $0xb1, %xmm4, %xmm5 #else movaps %xmm4, %xmm5 shufps $0xb1, %xmm5, %xmm5 #endif mulps %xmm6, %xmm4 addps %xmm4, %xmm1 movsd -20 * SIZE(A1), %xmm4 movhps -18 * SIZE(A1), %xmm4 mulps %xmm7, %xmm3 SUBPS %xmm3, %xmm0 movaps %xmm0, -32 * SIZE(Y1) movaps -24 * SIZE(Y1), %xmm0 mulps %xmm7, %xmm5 SUBPS %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(Y1) movaps -20 * SIZE(Y1), %xmm1 #ifdef HAVE_SSE2 pshufd $0xb1, %xmm2, %xmm3 #else movaps %xmm2, %xmm3 shufps $0xb1, %xmm3, %xmm3 #endif mulps %xmm6, %xmm2 addps %xmm2, %xmm0 #ifdef HAVE_SSE2 pshufd $0xb1, %xmm4, %xmm5 #else movaps %xmm4, %xmm5 shufps $0xb1, %xmm5, %xmm5 #endif mulps %xmm6, %xmm4 addps %xmm4, %xmm1 mulps %xmm7, %xmm3 SUBPS %xmm3, %xmm0 movaps %xmm0, -24 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 mulps %xmm7, %xmm5 SUBPS %xmm5, %xmm1 movaps %xmm1, -20 * SIZE(Y1) movaps -12 * 
SIZE(Y1), %xmm1 subl $-16 * SIZE, A1 subl $-16 * SIZE, Y1 ALIGN_3 .L15: testl $4, M je .L17 movsd -32 * SIZE(A1), %xmm2 movhps -30 * SIZE(A1), %xmm2 movsd -28 * SIZE(A1), %xmm4 movhps -26 * SIZE(A1), %xmm4 #ifdef HAVE_SSE2 pshufd $0xb1, %xmm2, %xmm3 #else movaps %xmm2, %xmm3 shufps $0xb1, %xmm3, %xmm3 #endif mulps %xmm6, %xmm2 addps %xmm2, %xmm0 #ifdef HAVE_SSE2 pshufd $0xb1, %xmm4, %xmm5 #else movaps %xmm4, %xmm5 shufps $0xb1, %xmm5, %xmm5 #endif mulps %xmm6, %xmm4 addps %xmm4, %xmm1 mulps %xmm7, %xmm3 SUBPS %xmm3, %xmm0 movaps %xmm0, -32 * SIZE(Y1) movaps -24 * SIZE(Y1), %xmm0 mulps %xmm7, %xmm5 SUBPS %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(Y1) movaps -20 * SIZE(Y1), %xmm1 addl $8 * SIZE, A1 addl $8 * SIZE, Y1 ALIGN_3 .L17: testl $2, M je .L18 movsd -32 * SIZE(A1), %xmm2 movhps -30 * SIZE(A1), %xmm2 #ifdef HAVE_SSE2 pshufd $0xb1, %xmm2, %xmm3 #else movaps %xmm2, %xmm3 shufps $0xb1, %xmm3, %xmm3 #endif mulps %xmm6, %xmm2 addps %xmm2, %xmm0 mulps %xmm7, %xmm3 SUBPS %xmm3, %xmm0 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, %xmm0 addl $4 * SIZE, A1 addl $4 * SIZE, Y1 ALIGN_3 .L18: testl $1, M je .L19 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd -32 * SIZE(A1), %xmm2 #ifdef HAVE_SSE2 pshufd $0xb1, %xmm2, %xmm3 #else movaps %xmm2, %xmm3 shufps $0xb1, %xmm3, %xmm3 #endif mulps %xmm6, %xmm2 addps %xmm2, %xmm0 mulps %xmm7, %xmm3 SUBPS %xmm3, %xmm0 movlps %xmm0, -32 * SIZE(Y1) ALIGN_3 .L19: decl J jg .L10 ALIGN_4 .L990: movl Y, Y1 movl BUFFER, X movl STACK_INCY, INCY sall $ZBASE_SHIFT, INCY movl M, %eax sarl $3, %eax jle .L994 ALIGN_3 .L992: movsd (Y1), %xmm0 movhps (Y1, INCY), %xmm0 addps 0 * SIZE(X), %xmm0 movlps %xmm0, (Y1) movhps %xmm0, (Y1, INCY) leal (Y1, INCY, 2), Y1 movsd (Y1), %xmm0 movhps (Y1, INCY), %xmm0 addps 4 * SIZE(X), %xmm0 movlps %xmm0, (Y1) movhps %xmm0, (Y1, INCY) leal (Y1, INCY, 2), Y1 movsd (Y1), %xmm0 movhps (Y1, INCY), %xmm0 addps 8 * SIZE(X), %xmm0 movlps %xmm0, (Y1) movhps %xmm0, (Y1, INCY) leal (Y1, INCY, 2), Y1 movsd (Y1), %xmm0 movhps (Y1, INCY), %xmm0 addps 12 * SIZE(X), %xmm0 movlps %xmm0, (Y1) movhps %xmm0, (Y1, INCY) leal (Y1, INCY, 2), Y1 addl $16 * SIZE, X decl %eax jg .L992 ALIGN_3 .L994: testl $4, M jle .L995 movsd (Y1), %xmm0 movhps (Y1, INCY), %xmm0 addps 0 * SIZE(X), %xmm0 movlps %xmm0, (Y1) movhps %xmm0, (Y1, INCY) leal (Y1, INCY, 2), Y1 movsd (Y1), %xmm0 movhps (Y1, INCY), %xmm0 addps 4 * SIZE(X), %xmm0 movlps %xmm0, (Y1) movhps %xmm0, (Y1, INCY) leal (Y1, INCY, 2), Y1 addl $8 * SIZE, X ALIGN_3 .L995: testl $2, M jle .L996 movsd (Y1), %xmm0 movhps (Y1, INCY), %xmm0 addps 0 * SIZE(X), %xmm0 movlps %xmm0, (Y1) movhps %xmm0, (Y1, INCY) leal (Y1, INCY, 2), Y1 addl $4 * SIZE, X ALIGN_3 .L996: testl $1, M jle .L999 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd (Y1), %xmm0 addps 0 * SIZE(X), %xmm0 movlps %xmm0, (Y1) ALIGN_3 .L999: movl M,%eax sall $ZBASE_SHIFT,%eax addl %eax,AA movl STACK_INCY,INCY imull INCY,%eax addl %eax,YY jmp .L0t ALIGN_3 .L999x: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS,%esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemv_n_sse2.S000066400000000000000000000245141313527062700176040ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 2) #endif #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 7) #endif #ifdef OPTERON #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (8 * 3) #define movsd movlps #endif #if defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (8 * 5) #endif #ifdef ATOM #define PREFETCH prefetchnta #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 6) #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 4) #endif #define STACKSIZE 16 #define ARGS 16 #define M 4 + STACKSIZE+ARGS(%esp) #define N 8 + STACKSIZE+ARGS(%esp) #define ALPHA_R 16 + STACKSIZE+ARGS(%esp) #define ALPHA_I 24 + STACKSIZE+ARGS(%esp) #define A 32 + STACKSIZE+ARGS(%esp) #define STACK_LDA 36 + STACKSIZE+ARGS(%esp) #define STACK_X 40 + STACKSIZE+ARGS(%esp) #define STACK_INCX 44 + STACKSIZE+ARGS(%esp) #define Y 48 + STACKSIZE+ARGS(%esp) #define STACK_INCY 52 + STACKSIZE+ARGS(%esp) #define BUFFER 56 + STACKSIZE+ARGS(%esp) #define MMM 0 + ARGS(%esp) #define YY 4 + ARGS(%esp) #define AA 8 + ARGS(%esp) #define I %eax #define J %ebx #define INCX %ecx #define INCY J #define A1 %esi #define X %edx #define Y1 %edi #define LDA %ebp #undef SUBPD #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) #define SUBPD subpd #else #define SUBPD addpd #endif PROLOGUE subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl Y,J movl J,YY movl A,J movl J,AA movl M,J movl J,MMM .L0t: xorl J,J addl $1,J sall $18,J subl J,MMM movl J,M jge .L00t ALIGN_3 movl MMM,%eax addl J,%eax jle .L999x movl %eax,M .L00t: movl AA,%eax movl %eax,A movl YY,J movl J,Y movl STACK_LDA, LDA movl STACK_X, X movl STACK_INCX, INCX 
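/* incx and lda are passed as complex-element counts; the ZBASE_SHIFT
   shifts below convert them to byte strides (2 * SIZE bytes per complex
   element) so they can be added directly to the X and A pointers. */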
sall $ZBASE_SHIFT, INCX sall $ZBASE_SHIFT, LDA subl $-16 * SIZE, A cmpl $0, N jle .L999 cmpl $0, M jle .L999 movl BUFFER, Y1 movl N, J pxor %xmm7, %xmm7 movl M, %eax addl $8, %eax sarl $3, %eax ALIGN_3 .L01: movapd %xmm7, 0 * SIZE(Y1) movapd %xmm7, 2 * SIZE(Y1) movapd %xmm7, 4 * SIZE(Y1) movapd %xmm7, 6 * SIZE(Y1) movapd %xmm7, 8 * SIZE(Y1) movapd %xmm7, 10 * SIZE(Y1) movapd %xmm7, 12 * SIZE(Y1) movapd %xmm7, 14 * SIZE(Y1) subl $-16 * SIZE, Y1 decl %eax jg .L01 ALIGN_3 .L10: movl BUFFER, Y1 addl $16 * SIZE, Y1 movl A, A1 addl LDA, A movsd 0 * SIZE(X), %xmm6 movhpd 1 * SIZE(X), %xmm6 addl INCX, X pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 shufps $0xc0, %xmm5, %xmm5 pshufd $0x4e, %xmm6, %xmm7 #ifdef HAVE_SSE3 movddup ALPHA_R, %xmm3 movddup ALPHA_I, %xmm4 #else movsd ALPHA_R, %xmm3 movsd ALPHA_I, %xmm4 unpcklpd %xmm3, %xmm3 unpcklpd %xmm4, %xmm4 #endif xorpd %xmm5, %xmm7 mulpd %xmm3, %xmm6 mulpd %xmm4, %xmm7 #ifndef XCONJ subpd %xmm7, %xmm6 #else addpd %xmm7, %xmm6 #endif pshufd $0xee, %xmm6, %xmm7 pshufd $0x44, %xmm6, %xmm6 #ifndef CONJ xorpd %xmm5, %xmm7 #else xorpd %xmm5, %xmm6 #endif movapd -16 * SIZE(Y1), %xmm0 movapd -14 * SIZE(Y1), %xmm1 ALIGN_3 movl M, I sarl $2, I jle .L15 movsd -16 * SIZE(A1), %xmm2 movhpd -15 * SIZE(A1), %xmm2 movsd -14 * SIZE(A1), %xmm4 movhpd -13 * SIZE(A1), %xmm4 decl I jle .L14 ALIGN_3 .L13: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) #endif pshufd $0x4e, %xmm2, %xmm3 mulpd %xmm6, %xmm2 addpd %xmm2, %xmm0 movsd -12 * SIZE(A1), %xmm2 movhpd -11 * SIZE(A1), %xmm2 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm6, %xmm4 addpd %xmm4, %xmm1 movsd -10 * SIZE(A1), %xmm4 movhpd -9 * SIZE(A1), %xmm4 mulpd %xmm7, %xmm3 SUBPD %xmm3, %xmm0 movapd %xmm0, -16 * SIZE(Y1) movapd -12 * SIZE(Y1), %xmm0 mulpd %xmm7, %xmm5 SUBPD %xmm5, %xmm1 movapd %xmm1, -14 * SIZE(Y1) movapd -10 * SIZE(Y1), %xmm1 pshufd $0x4e, %xmm2, %xmm3 mulpd %xmm6, %xmm2 addpd %xmm2, %xmm0 movsd -8 * SIZE(A1), %xmm2 movhpd -7 * SIZE(A1), %xmm2 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm6, %xmm4 addpd %xmm4, %xmm1 movsd -6 * SIZE(A1), %xmm4 movhpd -5 * SIZE(A1), %xmm4 mulpd %xmm7, %xmm3 SUBPD %xmm3, %xmm0 movapd %xmm0, -12 * SIZE(Y1) movapd -8 * SIZE(Y1), %xmm0 mulpd %xmm7, %xmm5 SUBPD %xmm5, %xmm1 movapd %xmm1, -10 * SIZE(Y1) movapd -6 * SIZE(Y1), %xmm1 subl $-8 * SIZE, A1 subl $-8 * SIZE, Y1 subl $1, I BRANCH jg .L13 ALIGN_3 .L14: pshufd $0x4e, %xmm2, %xmm3 mulpd %xmm6, %xmm2 addpd %xmm2, %xmm0 movsd -12 * SIZE(A1), %xmm2 movhpd -11 * SIZE(A1), %xmm2 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm6, %xmm4 addpd %xmm4, %xmm1 movsd -10 * SIZE(A1), %xmm4 movhpd -9 * SIZE(A1), %xmm4 mulpd %xmm7, %xmm3 SUBPD %xmm3, %xmm0 movapd %xmm0, -16 * SIZE(Y1) movapd -12 * SIZE(Y1), %xmm0 mulpd %xmm7, %xmm5 SUBPD %xmm5, %xmm1 movapd %xmm1, -14 * SIZE(Y1) movapd -10 * SIZE(Y1), %xmm1 pshufd $0x4e, %xmm2, %xmm3 mulpd %xmm6, %xmm2 addpd %xmm2, %xmm0 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm6, %xmm4 addpd %xmm4, %xmm1 mulpd %xmm7, %xmm3 SUBPD %xmm3, %xmm0 movapd %xmm0, -12 * SIZE(Y1) movapd -8 * SIZE(Y1), %xmm0 mulpd %xmm7, %xmm5 SUBPD %xmm5, %xmm1 movapd %xmm1, -10 * SIZE(Y1) movapd -6 * SIZE(Y1), %xmm1 subl $-8 * SIZE, A1 subl $-8 * SIZE, Y1 ALIGN_3 .L15: testl $2, M je .L17 movsd -16 * SIZE(A1), %xmm2 movhpd -15 * SIZE(A1), %xmm2 movsd -14 * SIZE(A1), %xmm4 movhpd -13 * SIZE(A1), %xmm4 pshufd $0x4e, %xmm2, %xmm3 mulpd %xmm6, %xmm2 addpd %xmm2, %xmm0 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm6, %xmm4 addpd %xmm4, %xmm1 mulpd %xmm7, %xmm3 SUBPD %xmm3, %xmm0 movapd %xmm0, -16 * SIZE(Y1) mulpd %xmm7, %xmm5 SUBPD %xmm5, %xmm1 movapd %xmm1, -14 * SIZE(Y1) 
movapd -12 * SIZE(Y1), %xmm0 addl $4 * SIZE, A1 addl $4 * SIZE, Y1 ALIGN_3 .L17: testl $1, M je .L19 movsd -16 * SIZE(A1), %xmm2 movhpd -15 * SIZE(A1), %xmm2 pshufd $0x4e, %xmm2, %xmm3 mulpd %xmm6, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm7, %xmm3 SUBPD %xmm3, %xmm0 movapd %xmm0, -16 * SIZE(Y1) ALIGN_3 .L19: decl J jg .L10 ALIGN_4 .L990: movl Y, Y1 movl BUFFER, X movl STACK_INCY, INCY sall $ZBASE_SHIFT, INCY movl M, %eax sarl $2, %eax jle .L994 ALIGN_3 .L992: movsd 0 * SIZE(Y1), %xmm0 movhpd 1 * SIZE(Y1), %xmm0 addpd 0 * SIZE(X), %xmm0 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) addl INCY, Y1 movsd 0 * SIZE(Y1), %xmm0 movhpd 1 * SIZE(Y1), %xmm0 addpd 2 * SIZE(X), %xmm0 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) addl INCY, Y1 movsd 0 * SIZE(Y1), %xmm0 movhpd 1 * SIZE(Y1), %xmm0 addpd 4 * SIZE(X), %xmm0 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) addl INCY, Y1 movsd 0 * SIZE(Y1), %xmm0 movhpd 1 * SIZE(Y1), %xmm0 addpd 6 * SIZE(X), %xmm0 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) addl INCY, Y1 addl $8 * SIZE, X decl %eax jg .L992 ALIGN_3 .L994: testl $2, M jle .L996 movsd 0 * SIZE(Y1), %xmm0 movhpd 1 * SIZE(Y1), %xmm0 addpd 0 * SIZE(X), %xmm0 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) addl INCY, Y1 movsd 0 * SIZE(Y1), %xmm0 movhpd 1 * SIZE(Y1), %xmm0 addpd 2 * SIZE(X), %xmm0 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) addl INCY, Y1 addl $4 * SIZE, X ALIGN_3 .L996: testl $1, M jle .L999 movsd 0 * SIZE(Y1), %xmm0 movhpd 1 * SIZE(Y1), %xmm0 addpd 0 * SIZE(X), %xmm0 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) ALIGN_3 .L999: movl M,%eax sall $ZBASE_SHIFT,%eax addl %eax,AA movl STACK_INCY,INCY imull INCY,%eax addl %eax,YY jmp .L0t ALIGN_3 .L999x: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS,%esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemv_t.S000066400000000000000000000203241313527062700166510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef PENTIUM #define P 88 #endif #ifndef P #define P 400 #endif #define STACK 16 #define ARGS 24 #define NLDA 0 + STACK(%esp) #define XP 4 + STACK(%esp) #define MIN_M 8 + STACK(%esp) #define J 12 + STACK(%esp) #define IS 16 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #ifdef DOUBLE #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 24 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define LDA 36 + STACK + ARGS(%esp) #define X 40 + STACK + ARGS(%esp) #define INCX 44 + STACK + ARGS(%esp) #define Y 48 + STACK + ARGS(%esp) #define INCY 52 + STACK + ARGS(%esp) #define BUFFER 56 + STACK + ARGS(%esp) #else #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 20 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define LDA 28 + STACK + ARGS(%esp) #define X 32 + STACK + ARGS(%esp) #define INCX 36 + STACK + ARGS(%esp) #define Y 40 + STACK + ARGS(%esp) #define INCY 44 + STACK + ARGS(%esp) #define BUFFER 48 + STACK + ARGS(%esp) #endif PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE FLD ALPHA_I FLD ALPHA_R movl X, %edi # X movl $0, IS movl M, %ebx movl N, %ecx testl %ebx, %ebx jle .L79 testl %ecx, %ecx jle .L79 movl INCX, %esi addl %esi, %esi leal (,%esi,SIZE), %esi movl %esi, INCX movl INCY, %esi addl %esi, %esi leal (, %esi, SIZE), %esi movl %esi, INCY movl LDA, %ebx movl N, %eax imull %ebx, %eax movl $P, %esi subl %eax, %esi leal (, %esi, SIZE), %esi addl %esi, %esi movl %esi, NLDA leal (,%ebx,SIZE), %esi addl %esi, %esi movl %esi, LDA ALIGN_2 .L32: movl IS, %esi movl $P, %edx movl M, %eax subl %esi, %eax cmpl %edx, %eax #ifdef PENTIUM jle .L33 movl %edx, %eax .L33: #else cmovg %edx, %eax #endif movl %eax, MIN_M movl IS, %ecx addl %ecx, %ecx leal (%edi,%ecx,SIZE), %ecx # xp = x + is movl INCX, %ebx movl %ecx, XP cmpl $2 * SIZE, %ebx je .L34 movl BUFFER, %esi movl MIN_M, %eax movl %esi, XP sarl $1, %eax jle .L35 ALIGN_3 .L36: FLD 0 * SIZE(%edi) FLD 1 * SIZE(%edi) addl %ebx,%edi # x += incx FLD 0 * SIZE(%edi) FLD 1 * SIZE(%edi) addl %ebx,%edi # x += incx FST 3 * SIZE(%esi) FST 2 * SIZE(%esi) FST 1 * SIZE(%esi) FST 0 * SIZE(%esi) addl $4 * SIZE, %esi # xp += 4 decl %eax jg .L36 ALIGN_3 .L35: movl MIN_M, %eax andl $1,%eax jle .L34 FLD 0 * SIZE(%edi) FLD 1 * SIZE(%edi) addl %ebx,%edi # x += incx FST 1 * SIZE(%esi) FST 0 * SIZE(%esi) ALIGN_3 /* Main Routine */ .L34: movl Y, %ebp # coffset = y movl N, %ecx testl %ecx, %ecx jle .L60 ALIGN_2 .L61: movl A, %ebx # a_offset = a fldz # ct1 = ZERO movl LDA, %edx fldz # ct1 = ZERO addl %ebx, %edx fldz # ct1 = ZERO movl %edx, A fldz # ct1 = ZERO movl XP, %esi FLD (%esi) # bt1 = *(b_offset + 0) movl MIN_M, %eax sarl $1, %eax jle .L64 ALIGN_3 #define PRESIZE 8 .L65: #ifdef HAS_PREFETCH prefetcht0 PRESIZE * SIZE(%ebx) prefetcht0 PRESIZE * SIZE(%esi) #endif FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FMUL 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) #ifndef CONJ faddp %st, %st(2) # ct2 += bt1 #else fsubrp %st, %st(2) # ct2 -= bt1 #endif FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) fmul 
%st(1), %st # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FMUL 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) faddp %st, %st(4) # ct4 += bt1 FLD 2 * SIZE(%esi) # bt1 = *(b_offset + 1) FLD 2 * SIZE(%ebx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FMUL 3 * SIZE(%ebx) # bt1 *= *(a_offset + 1) #ifndef CONJ faddp %st, %st(2) # ct2 += bt1 #else fsubrp %st, %st(2) # ct2 -= bt1 #endif FLD 3 * SIZE(%esi) # bt1 = *(b_offset + 1) FLD 2 * SIZE(%ebx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FMUL 3 * SIZE(%ebx) # bt1 *= *(a_offset + 1) faddp %st, %st(4) # ct4 += bt1 FLD 4 * SIZE(%esi) # bt1 = *(b_offset + 1) addl $4 * SIZE, %esi addl $4 * SIZE, %ebx decl %eax jg .L65 ALIGN_3 .L64: movl MIN_M, %eax andl $1, %eax jle .L70 ALIGN_3 .L71: FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FMUL 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) #ifndef CONJ faddp %st, %st(2) # ct2 += bt1 #else fsubrp %st, %st(2) # ct2 -= bt1 #endif FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FMUL 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) faddp %st, %st(4) # ct4 += bt1 fldz ALIGN_3 .L70: #ifndef C_SUN ffreep %st(0) #else .byte 0xdf .byte 0xc0 #endif #ifndef XCONJ #ifndef CONJ fsubp %st, %st(3) faddp %st, %st(1) #else faddp %st, %st(3) faddp %st, %st(1) #endif #else #ifndef CONJ faddp %st, %st(3) fsubp %st, %st(1) #else fsubp %st, %st(3) fsubp %st, %st(1) #endif #endif fld %st(0) # ct4 = ct2 fmul %st(4), %st fld %st(2) fmul %st(4), %st fsubp %st, %st(1) FADD 0 * SIZE(%ebp) FST 0 * SIZE(%ebp) fmul %st(2), %st fxch %st(1) fmul %st(3), %st faddp %st, %st(1) FADD 1 * SIZE(%ebp) FST 1 * SIZE(%ebp) addl INCY, %ebp decl %ecx jg .L61 ALIGN_3 .L60: movl A, %ebx addl NLDA, %ebx movl %ebx, A addl $P, IS movl M, %esi cmpl %esi, IS jl .L32 ALIGN_3 .L79: #ifndef C_SUN ffreep %st(0) ffreep %st(0) #else .byte 0xdf .byte 0xc0 .byte 0xdf .byte 0xc0 #endif popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemv_t_atom.S000066400000000000000000000221331313527062700176710ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef ATOM #define PREFETCH prefetchnta #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 6) #endif #define STACKSIZE 16 #define M 4 + STACKSIZE(%esp) #define N 8 + STACKSIZE(%esp) #define ALPHA_R 16 + STACKSIZE(%esp) #define ALPHA_I 24 + STACKSIZE(%esp) #define A 32 + STACKSIZE(%esp) #define STACK_LDA 36 + STACKSIZE(%esp) #define STACK_X 40 + STACKSIZE(%esp) #define STACK_INCX 44 + STACKSIZE(%esp) #define Y 48 + STACKSIZE(%esp) #define STACK_INCY 52 + STACKSIZE(%esp) #define BUFFER 56 + STACKSIZE(%esp) #define I %eax #define J %ebx #define INCX J #define INCY %ecx #define A1 %esi #define X %edx #define Y1 %edi #define LDA %ebp #if !defined(CONJ) && !defined(XCONJ) #define ADD1 addsd #define ADD2 addsd #define ADD3 subsd #define ADD4 addsd #endif #if defined(CONJ) && !defined(XCONJ) #define ADD1 addsd #define ADD2 addsd #define ADD3 addsd #define ADD4 subsd #endif #if !defined(CONJ) && defined(XCONJ) #define ADD1 addsd #define ADD2 subsd #define ADD3 addsd #define ADD4 addsd #endif #if defined(CONJ) && defined(XCONJ) #define ADD1 addsd #define ADD2 subsd #define ADD3 subsd #define ADD4 subsd #endif PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_LDA, LDA movl STACK_X, X movl STACK_INCX, INCX movl STACK_INCY, INCY sall $ZBASE_SHIFT, INCX sall $ZBASE_SHIFT, INCY sall $ZBASE_SHIFT, LDA subl $-16 * SIZE, A cmpl $0, N jle .L999 cmpl $0, M jle .L999 movl BUFFER, Y1 movl M, I sarl $2, I jle .L05 ALIGN_4 .L02: movsd 0 * SIZE(X), %xmm0 movhpd 1 * SIZE(X), %xmm0 addl INCX, X movsd 0 * SIZE(X), %xmm1 movhpd 1 * SIZE(X), %xmm1 addl INCX, X movsd 0 * SIZE(X), %xmm2 movhpd 1 * SIZE(X), %xmm2 addl INCX, X movsd 0 * SIZE(X), %xmm3 movhpd 1 * SIZE(X), %xmm3 addl INCX, X movapd %xmm0, 0 * SIZE(Y1) movapd %xmm1, 2 * SIZE(Y1) movapd %xmm2, 4 * SIZE(Y1) movapd %xmm3, 6 * SIZE(Y1) addl $8 * SIZE, Y1 decl I jg .L02 ALIGN_4 .L05: movl M, I andl $3, I jle .L10 ALIGN_2 .L06: movsd 0 * SIZE(X), %xmm0 movhpd 1 * SIZE(X), %xmm0 addl INCX, X movapd %xmm0, 0 * SIZE(Y1) addl $2 * SIZE, Y1 decl I jg .L06 ALIGN_4 .L10: movl Y, Y1 movl N, J ALIGN_3 .L11: movl BUFFER, X addl $16 * SIZE, X movl A, A1 addl LDA, A xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 movsd -16 * SIZE(X), %xmm2 movsd -15 * SIZE(X), %xmm3 movl M, I sarl $2, I jle .L15 movsd -16 * SIZE(A1), %xmm4 movsd -15 * SIZE(A1), %xmm5 movapd %xmm4, %xmm6 mulsd %xmm2, %xmm4 mulsd %xmm3, %xmm6 decl I jle .L13 ALIGN_4 .L12: #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(A1) #endif movapd %xmm5, %xmm7 mulsd %xmm3, %xmm5 movsd -13 * SIZE(X), %xmm3 ADD1 %xmm4, %xmm0 movsd -14 * SIZE(A1), %xmm4 mulsd %xmm2, %xmm7 movsd 
-14 * SIZE(X), %xmm2 ADD2 %xmm6, %xmm1 movapd %xmm4, %xmm6 mulsd %xmm2, %xmm4 ADD3 %xmm5, %xmm0 movsd -13 * SIZE(A1), %xmm5 mulsd %xmm3, %xmm6 ADD4 %xmm7, %xmm1 movapd %xmm5, %xmm7 mulsd %xmm3, %xmm5 movsd -11 * SIZE(X), %xmm3 ADD1 %xmm4, %xmm0 movsd -12 * SIZE(A1), %xmm4 mulsd %xmm2, %xmm7 movsd -12 * SIZE(X), %xmm2 ADD2 %xmm6, %xmm1 movapd %xmm4, %xmm6 mulsd %xmm2, %xmm4 ADD3 %xmm5, %xmm0 movsd -11 * SIZE(A1), %xmm5 mulsd %xmm3, %xmm6 ADD4 %xmm7, %xmm1 movapd %xmm5, %xmm7 mulsd %xmm3, %xmm5 movsd -9 * SIZE(X), %xmm3 ADD1 %xmm4, %xmm0 movsd -10 * SIZE(A1), %xmm4 mulsd %xmm2, %xmm7 movsd -10 * SIZE(X), %xmm2 ADD2 %xmm6, %xmm1 movapd %xmm4, %xmm6 mulsd %xmm2, %xmm4 ADD3 %xmm5, %xmm0 movsd -9 * SIZE(A1), %xmm5 mulsd %xmm3, %xmm6 ADD4 %xmm7, %xmm1 movapd %xmm5, %xmm7 mulsd %xmm3, %xmm5 movsd -7 * SIZE(X), %xmm3 ADD1 %xmm4, %xmm0 movsd -8 * SIZE(A1), %xmm4 mulsd %xmm2, %xmm7 movsd -8 * SIZE(X), %xmm2 ADD2 %xmm6, %xmm1 movapd %xmm4, %xmm6 mulsd %xmm2, %xmm4 ADD3 %xmm5, %xmm0 movsd -7 * SIZE(A1), %xmm5 mulsd %xmm3, %xmm6 ADD4 %xmm7, %xmm1 addl $8 * SIZE, A1 addl $8 * SIZE, X decl I jg .L12 ALIGN_4 .L13: movapd %xmm5, %xmm7 mulsd %xmm3, %xmm5 movsd -13 * SIZE(X), %xmm3 ADD1 %xmm4, %xmm0 movsd -14 * SIZE(A1), %xmm4 mulsd %xmm2, %xmm7 movsd -14 * SIZE(X), %xmm2 ADD2 %xmm6, %xmm1 movapd %xmm4, %xmm6 mulsd %xmm2, %xmm4 ADD3 %xmm5, %xmm0 movsd -13 * SIZE(A1), %xmm5 mulsd %xmm3, %xmm6 ADD4 %xmm7, %xmm1 movapd %xmm5, %xmm7 mulsd %xmm3, %xmm5 movsd -11 * SIZE(X), %xmm3 ADD1 %xmm4, %xmm0 movsd -12 * SIZE(A1), %xmm4 mulsd %xmm2, %xmm7 movsd -12 * SIZE(X), %xmm2 ADD2 %xmm6, %xmm1 movapd %xmm4, %xmm6 mulsd %xmm2, %xmm4 ADD3 %xmm5, %xmm0 movsd -11 * SIZE(A1), %xmm5 mulsd %xmm3, %xmm6 ADD4 %xmm7, %xmm1 movapd %xmm5, %xmm7 mulsd %xmm3, %xmm5 movsd -9 * SIZE(X), %xmm3 ADD1 %xmm4, %xmm0 movsd -10 * SIZE(A1), %xmm4 mulsd %xmm2, %xmm7 movsd -10 * SIZE(X), %xmm2 ADD2 %xmm6, %xmm1 movapd %xmm4, %xmm6 mulsd %xmm2, %xmm4 ADD3 %xmm5, %xmm0 movsd -9 * SIZE(A1), %xmm5 mulsd %xmm3, %xmm6 ADD4 %xmm7, %xmm1 movapd %xmm5, %xmm7 mulsd %xmm3, %xmm5 movsd -7 * SIZE(X), %xmm3 ADD1 %xmm4, %xmm0 mulsd %xmm2, %xmm7 movsd -8 * SIZE(X), %xmm2 ADD2 %xmm6, %xmm1 ADD3 %xmm5, %xmm0 ADD4 %xmm7, %xmm1 addl $8 * SIZE, A1 addl $8 * SIZE, X ALIGN_4 .L15: testl $2, M jle .L17 movsd -16 * SIZE(A1), %xmm4 movsd -15 * SIZE(A1), %xmm5 movapd %xmm4, %xmm6 mulsd %xmm2, %xmm4 mulsd %xmm3, %xmm6 movapd %xmm5, %xmm7 mulsd %xmm3, %xmm5 movsd -13 * SIZE(X), %xmm3 ADD1 %xmm4, %xmm0 movsd -14 * SIZE(A1), %xmm4 mulsd %xmm2, %xmm7 movsd -14 * SIZE(X), %xmm2 ADD2 %xmm6, %xmm1 movapd %xmm4, %xmm6 mulsd %xmm2, %xmm4 ADD3 %xmm5, %xmm0 movsd -13 * SIZE(A1), %xmm5 mulsd %xmm3, %xmm6 ADD4 %xmm7, %xmm1 movapd %xmm5, %xmm7 mulsd %xmm3, %xmm5 movsd -11 * SIZE(X), %xmm3 ADD1 %xmm4, %xmm0 mulsd %xmm2, %xmm7 movsd -12 * SIZE(X), %xmm2 ADD2 %xmm6, %xmm1 ADD3 %xmm5, %xmm0 ADD4 %xmm7, %xmm1 addl $4 * SIZE, A1 ALIGN_4 .L17: testl $1, M jle .L18 movsd -16 * SIZE(A1), %xmm4 movsd -15 * SIZE(A1), %xmm5 movapd %xmm4, %xmm6 mulsd %xmm2, %xmm4 mulsd %xmm3, %xmm6 movapd %xmm5, %xmm7 mulsd %xmm3, %xmm5 ADD1 %xmm4, %xmm0 mulsd %xmm2, %xmm7 ADD2 %xmm6, %xmm1 ADD3 %xmm5, %xmm0 ADD4 %xmm7, %xmm1 ALIGN_4 .L18: movsd 0 * SIZE(Y1), %xmm4 movapd %xmm0, %xmm2 mulsd ALPHA_R, %xmm0 movsd 1 * SIZE(Y1), %xmm5 movapd %xmm1, %xmm3 mulsd ALPHA_R, %xmm1 mulsd ALPHA_I, %xmm2 mulsd ALPHA_I, %xmm3 addsd %xmm2, %xmm1 subsd %xmm3, %xmm0 addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 movlpd %xmm0, 0 * SIZE(Y1) movlpd %xmm1, 1 * SIZE(Y1) addl INCY, Y1 decl J jg .L11 ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl 
%ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemv_t_sse.S000066400000000000000000000253761313527062700175370ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef movsd #undef movsd #endif #ifdef PENTIUM3 #ifdef HAVE_SSE #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 2) #endif #define movsd movlps #endif #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 2) #endif #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 7) #endif #ifdef OPTERON #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 3) #define movsd movlps #endif #if defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 5) #endif #ifdef ATOM #define PREFETCH prefetchnta #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 6) #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHSIZE (16 * 4) #endif #define STACKSIZE 16 #define ARGS 20 #define M 4 + STACKSIZE+ARGS(%esp) #define N 8 + STACKSIZE+ARGS(%esp) #define ALPHA_R 16 + STACKSIZE+ARGS(%esp) #define ALPHA_I 20 + STACKSIZE+ARGS(%esp) #define A 24 + STACKSIZE+ARGS(%esp) #define STACK_LDA 28 + STACKSIZE+ARGS(%esp) #define STACK_X 32 + STACKSIZE+ARGS(%esp) #define STACK_INCX 36 + STACKSIZE+ARGS(%esp) #define Y 40 + STACKSIZE+ARGS(%esp) #define STACK_INCY 44 + STACKSIZE+ARGS(%esp) #define BUFFER 48 + STACKSIZE+ARGS(%esp) #define MMM 0+ARGS(%esp) #define XX 4+ARGS(%esp) #define AA 8+ARGS(%esp) #define I %eax #define J %ebx #define INCX J #define INCY %ecx #define A1 %esi #define X %edx #define Y1 %edi #define LDA %ebp #undef SUBPS #ifndef CONJ #define SUBPS addps #else #define SUBPS subps #endif PROLOGUE subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_X, X movl X,XX movl A,J movl J,AA #backup A movl M,J movl J,MMM .L0t: xorl J,J addl $1,J sall $20,J subl $8,J subl J,MMM #MMM-=J movl J,M jge .L00t ALIGN_4 movl MMM,%eax addl J,%eax jle .L999x movl %eax,M .L00t: movl AA,%eax movl %eax,A movl XX,%eax movl %eax,X movl STACK_LDA,LDA movl STACK_INCX, INCX movl STACK_INCY, INCY sall $ZBASE_SHIFT, INCX sall $ZBASE_SHIFT, LDA sall $ZBASE_SHIFT, INCY subl $-32 * SIZE, A cmpl $0, N jle .L999 cmpl $0, M jle .L999 movl BUFFER, Y1 movl M, I sarl $2, I jle .L05 ALIGN_4 .L02: movsd (X), %xmm0 addl INCX, X movhps (X), %xmm0 addl INCX, X movsd (X), %xmm1 addl INCX, X movhps (X), %xmm1 addl INCX, X movsd (X), %xmm2 addl INCX, X movhps (X), %xmm2 addl INCX, X movsd (X), %xmm3 addl INCX, X movhps (X), %xmm3 addl INCX, X movaps %xmm0, 0 * SIZE(Y1) movaps %xmm1, 4 * SIZE(Y1) movaps %xmm2, 8 * SIZE(Y1) movaps %xmm3, 12 * SIZE(Y1) addl $16 * SIZE, Y1 decl I jg .L02 ALIGN_4 .L05: movl M, I andl $3, I jle .L10 ALIGN_2 .L06: movsd (X), %xmm0 addl INCX, X movlps %xmm0, (Y1) addl $2 * SIZE, Y1 decl I jg .L06 ALIGN_4 .L10: movl Y, Y1 movl N, J ALIGN_3 .L11: movl BUFFER, X addl $32 * SIZE, X movl A, A1 addl LDA, A xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 movaps -32 * SIZE(X), %xmm2 movaps -28 * SIZE(X), %xmm3 movl M, I sarl $3, I jle .L15 movsd -32 * SIZE(A1), %xmm4 movhps -30 * SIZE(A1), %xmm4 movsd -28 * SIZE(A1), %xmm6 movhps -26 * SIZE(A1), %xmm6 decl I jle .L13 ALIGN_4 .L12: #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(A1) #endif #ifdef HAVE_SSE2 pshufd $0xb1, %xmm4, %xmm5 #else movaps %xmm4, %xmm5 shufps $0xb1, %xmm5, %xmm5 #endif mulps %xmm2, %xmm4 addps %xmm4, %xmm0 mulps %xmm2, %xmm5 movaps 
-24 * SIZE(X), %xmm2 SUBPS %xmm5, %xmm1 #ifdef HAVE_SSE2 pshufd $0xb1, %xmm6, %xmm7 #else movaps %xmm6, %xmm7 shufps $0xb1, %xmm7, %xmm7 #endif mulps %xmm3, %xmm6 addps %xmm6, %xmm0 mulps %xmm3, %xmm7 movaps -20 * SIZE(X), %xmm3 SUBPS %xmm7, %xmm1 movsd -24 * SIZE(A1), %xmm4 movhps -22 * SIZE(A1), %xmm4 movsd -20 * SIZE(A1), %xmm6 movhps -18 * SIZE(A1), %xmm6 #ifdef HAVE_SSE2 pshufd $0xb1, %xmm4, %xmm5 #else movaps %xmm4, %xmm5 shufps $0xb1, %xmm5, %xmm5 #endif mulps %xmm2, %xmm4 addps %xmm4, %xmm0 mulps %xmm2, %xmm5 movaps -16 * SIZE(X), %xmm2 SUBPS %xmm5, %xmm1 #ifdef HAVE_SSE2 pshufd $0xb1, %xmm6, %xmm7 #else movaps %xmm6, %xmm7 shufps $0xb1, %xmm7, %xmm7 #endif mulps %xmm3, %xmm6 addps %xmm6, %xmm0 mulps %xmm3, %xmm7 movaps -12 * SIZE(X), %xmm3 SUBPS %xmm7, %xmm1 movsd -16 * SIZE(A1), %xmm4 movhps -14 * SIZE(A1), %xmm4 movsd -12 * SIZE(A1), %xmm6 movhps -10 * SIZE(A1), %xmm6 addl $16 * SIZE, A1 addl $16 * SIZE, X decl I jg .L12 ALIGN_4 .L13: #ifdef HAVE_SSE2 pshufd $0xb1, %xmm4, %xmm5 #else movaps %xmm4, %xmm5 shufps $0xb1, %xmm5, %xmm5 #endif mulps %xmm2, %xmm4 addps %xmm4, %xmm0 mulps %xmm2, %xmm5 movaps -24 * SIZE(X), %xmm2 SUBPS %xmm5, %xmm1 #ifdef HAVE_SSE2 pshufd $0xb1, %xmm6, %xmm7 #else movaps %xmm6, %xmm7 shufps $0xb1, %xmm7, %xmm7 #endif mulps %xmm3, %xmm6 addps %xmm6, %xmm0 mulps %xmm3, %xmm7 movaps -20 * SIZE(X), %xmm3 SUBPS %xmm7, %xmm1 movsd -24 * SIZE(A1), %xmm4 movhps -22 * SIZE(A1), %xmm4 movsd -20 * SIZE(A1), %xmm6 movhps -18 * SIZE(A1), %xmm6 #ifdef HAVE_SSE2 pshufd $0xb1, %xmm4, %xmm5 #else movaps %xmm4, %xmm5 shufps $0xb1, %xmm5, %xmm5 #endif mulps %xmm2, %xmm4 addps %xmm4, %xmm0 mulps %xmm2, %xmm5 movaps -16 * SIZE(X), %xmm2 SUBPS %xmm5, %xmm1 #ifdef HAVE_SSE2 pshufd $0xb1, %xmm6, %xmm7 #else movaps %xmm6, %xmm7 shufps $0xb1, %xmm7, %xmm7 #endif mulps %xmm3, %xmm6 addps %xmm6, %xmm0 mulps %xmm3, %xmm7 movaps -12 * SIZE(X), %xmm3 SUBPS %xmm7, %xmm1 addl $16 * SIZE, A1 addl $16 * SIZE, X ALIGN_4 .L15: testl $4, M jle .L17 movsd -32 * SIZE(A1), %xmm4 movhps -30 * SIZE(A1), %xmm4 movsd -28 * SIZE(A1), %xmm6 movhps -26 * SIZE(A1), %xmm6 #ifdef HAVE_SSE2 pshufd $0xb1, %xmm4, %xmm5 #else movaps %xmm4, %xmm5 shufps $0xb1, %xmm5, %xmm5 #endif mulps %xmm2, %xmm4 addps %xmm4, %xmm0 mulps %xmm2, %xmm5 movaps -24 * SIZE(X), %xmm2 SUBPS %xmm5, %xmm1 #ifdef HAVE_SSE2 pshufd $0xb1, %xmm6, %xmm7 #else movaps %xmm6, %xmm7 shufps $0xb1, %xmm7, %xmm7 #endif mulps %xmm3, %xmm6 addps %xmm6, %xmm0 mulps %xmm3, %xmm7 movaps -20 * SIZE(X), %xmm3 SUBPS %xmm7, %xmm1 addl $8 * SIZE, A1 addl $8 * SIZE, X ALIGN_4 .L17: testl $2, M jle .L18 movsd -32 * SIZE(A1), %xmm4 movhps -30 * SIZE(A1), %xmm4 #ifdef HAVE_SSE2 pshufd $0xb1, %xmm4, %xmm5 #else movaps %xmm4, %xmm5 shufps $0xb1, %xmm5, %xmm5 #endif mulps %xmm2, %xmm4 addps %xmm4, %xmm0 mulps %xmm2, %xmm5 SUBPS %xmm5, %xmm1 movaps %xmm3, %xmm2 addl $4 * SIZE, A1 ALIGN_4 .L18: testl $1, M jle .L19 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(A1), %xmm4 shufps $0x44, %xmm2, %xmm2 #ifdef HAVE_SSE2 pshufd $0xb1, %xmm4, %xmm5 #else movaps %xmm4, %xmm5 shufps $0xb1, %xmm5, %xmm5 #endif mulps %xmm2, %xmm4 addps %xmm4, %xmm0 mulps %xmm2, %xmm5 SUBPS %xmm5, %xmm1 ALIGN_4 .L19: #ifdef HAVE_SSE2 pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 #else subl $8, %esp movl $0x00000000, 0(%esp) movl $0x80000000, 4(%esp) movlps (%esp), %xmm5 addl $8, %esp movlhps %xmm5, %xmm5 #endif #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorps %xmm5, %xmm0 #else xorps %xmm5, %xmm1 #endif #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 haddps %xmm0, 
%xmm0 #else movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 addps %xmm2, %xmm0 movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 #endif #ifdef HAVE_SSE2 pshufd $0xb1, %xmm0, %xmm1 #else movaps %xmm0, %xmm1 shufps $0xb1, %xmm1, %xmm1 #endif movsd ALPHA_R, %xmm7 movlhps %xmm7, %xmm7 mulps %xmm7, %xmm0 mulps %xmm7, %xmm1 xorps %xmm5, %xmm0 #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 #else movaps %xmm0, %xmm2 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm1, %xmm2 addps %xmm2, %xmm0 #endif movsd 0 * SIZE(Y1), %xmm4 shufps $0xd8, %xmm0, %xmm0 addps %xmm4, %xmm0 movlps %xmm0, 0 * SIZE(Y1) addl INCY, Y1 decl J jg .L11 ALIGN_4 .L999: movl M,%eax sall $ZBASE_SHIFT, %eax addl %eax,AA movl STACK_INCX,INCX imull INCX,%eax addl %eax,XX jmp .L0t ALIGN_4 .L999x: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS,%esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zgemv_t_sse2.S000066400000000000000000000221171313527062700176070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 2) #endif #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 7) #endif #ifdef OPTERON #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (8 * 3) #define movsd movlps #endif #if defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (8 * 5) #endif #ifdef ATOM #define PREFETCH prefetchnta #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 6) #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 4) #endif #define STACKSIZE 16 #define ARGS 20 #define M 4 + STACKSIZE+ARGS(%esp) #define N 8 + STACKSIZE+ARGS(%esp) #define ALPHA_R 16 + STACKSIZE+ARGS(%esp) #define ALPHA_I 24 + STACKSIZE+ARGS(%esp) #define A 32 + STACKSIZE+ARGS(%esp) #define STACK_LDA 36 + STACKSIZE+ARGS(%esp) #define STACK_X 40 + STACKSIZE+ARGS(%esp) #define STACK_INCX 44 + STACKSIZE+ARGS(%esp) #define Y 48 + STACKSIZE+ARGS(%esp) #define STACK_INCY 52 + STACKSIZE+ARGS(%esp) #define BUFFER 56 + STACKSIZE+ARGS(%esp) #define MMM 0 + ARGS(%esp) #define AA 4 + ARGS(%esp) #define XX 8 + ARGS(%esp) #define I %eax #define J %ebx #define INCX J #define INCY %ecx #define A1 %esi #define X %edx #define Y1 %edi #define LDA %ebp #undef SUBPD #ifndef CONJ #define SUBPD addpd #else #define SUBPD subpd #endif PROLOGUE subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_X, X movl X, XX movl A,J movl J,AA movl M,J movl J,MMM .L0t: xorl J,J addl $1,J sall $18,J subl $4,J subl J,MMM movl J,M jge .L00t ALIGN_4 movl MMM,%eax addl J,%eax jle .L999x movl %eax, M .L00t: movl XX, %eax movl %eax, X movl AA,%eax movl %eax,A movl STACK_LDA, LDA movl STACK_INCX, INCX movl STACK_INCY, INCY sall $ZBASE_SHIFT, INCX sall $ZBASE_SHIFT, INCY sall $ZBASE_SHIFT, LDA subl $-16 * SIZE, A cmpl $0, N jle .L999 cmpl $0, M jle .L999 movl BUFFER, Y1 movl M, I sarl $2, I jle .L05 ALIGN_4 .L02: movsd 0 * SIZE(X), %xmm0 movhpd 1 * SIZE(X), %xmm0 addl INCX, X movsd 0 * SIZE(X), %xmm1 movhpd 1 * SIZE(X), %xmm1 addl INCX, X movsd 0 * SIZE(X), %xmm2 movhpd 1 * SIZE(X), %xmm2 addl INCX, X movsd 0 * SIZE(X), %xmm3 movhpd 1 * SIZE(X), %xmm3 addl INCX, X movapd %xmm0, 0 * SIZE(Y1) movapd %xmm1, 2 * SIZE(Y1) movapd %xmm2, 4 * SIZE(Y1) movapd %xmm3, 6 * SIZE(Y1) addl $8 * SIZE, Y1 decl I jg .L02 ALIGN_4 .L05: movl M, I andl $3, I jle .L10 ALIGN_2 .L06: movsd 0 * SIZE(X), %xmm0 movhpd 1 * SIZE(X), %xmm0 addl INCX, X movapd %xmm0, 0 * SIZE(Y1) addl $2 * SIZE, Y1 decl I jg .L06 ALIGN_4 .L10: movl Y, Y1 movl N, J ALIGN_4 .L11: movl BUFFER, X addl $16 * SIZE, X movl A, A1 addl LDA, A xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 movapd -16 * SIZE(X), %xmm2 movapd -14 * SIZE(X), %xmm3 movl M, I sarl $2, I jle .L15 movsd -16 * SIZE(A1), %xmm4 movhpd -15 * SIZE(A1), %xmm4 movsd -14 * SIZE(A1), %xmm6 movhpd -13 * SIZE(A1), %xmm6 decl I jle .L13 ALIGN_4 .L12: #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(A1) #endif pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm2, %xmm4 addpd %xmm4, %xmm0 movsd -12 * SIZE(A1), %xmm4 movhpd -11 * SIZE(A1), %xmm4 mulpd %xmm2, %xmm5 SUBPD %xmm5, %xmm1 movapd -12 * SIZE(X), %xmm2 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm0 movsd -10 * SIZE(A1), %xmm6 movhpd -9 * SIZE(A1), %xmm6 mulpd 
%xmm3, %xmm7 SUBPD %xmm7, %xmm1 movapd -10 * SIZE(X), %xmm3 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm2, %xmm4 addpd %xmm4, %xmm0 movsd -8 * SIZE(A1), %xmm4 movhpd -7 * SIZE(A1), %xmm4 mulpd %xmm2, %xmm5 movapd -8 * SIZE(X), %xmm2 SUBPD %xmm5, %xmm1 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm0 movsd -6 * SIZE(A1), %xmm6 movhpd -5 * SIZE(A1), %xmm6 mulpd %xmm3, %xmm7 movapd -6 * SIZE(X), %xmm3 SUBPD %xmm7, %xmm1 addl $8 * SIZE, A1 addl $8 * SIZE, X decl I jg .L12 ALIGN_4 .L13: pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm2, %xmm4 addpd %xmm4, %xmm0 movsd -12 * SIZE(A1), %xmm4 movhpd -11 * SIZE(A1), %xmm4 mulpd %xmm2, %xmm5 SUBPD %xmm5, %xmm1 movapd -12 * SIZE(X), %xmm2 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm0 movsd -10 * SIZE(A1), %xmm6 movhpd -9 * SIZE(A1), %xmm6 mulpd %xmm3, %xmm7 SUBPD %xmm7, %xmm1 movapd -10 * SIZE(X), %xmm3 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm2, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm2, %xmm5 movapd -8 * SIZE(X), %xmm2 SUBPD %xmm5, %xmm1 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm3, %xmm7 movapd -6 * SIZE(X), %xmm3 SUBPD %xmm7, %xmm1 addl $8 * SIZE, A1 addl $8 * SIZE, X ALIGN_4 .L15: testl $2, M jle .L17 movsd -16 * SIZE(A1), %xmm4 movhpd -15 * SIZE(A1), %xmm4 movsd -14 * SIZE(A1), %xmm6 movhpd -13 * SIZE(A1), %xmm6 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm2, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm2, %xmm5 movapd -12 * SIZE(X), %xmm2 SUBPD %xmm5, %xmm1 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm3, %xmm7 SUBPD %xmm7, %xmm1 addl $4 * SIZE, A1 ALIGN_4 .L17: testl $1, M jle .L18 movsd -16 * SIZE(A1), %xmm4 movhpd -15 * SIZE(A1), %xmm4 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm2, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm2, %xmm5 SUBPD %xmm5, %xmm1 ALIGN_4 .L18: pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 shufps $0xc0, %xmm5, %xmm5 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorpd %xmm5, %xmm0 #else xorpd %xmm5, %xmm1 #endif #ifdef HAVE_SSE3 haddpd %xmm1, %xmm0 #else movapd %xmm0, %xmm2 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm2 addpd %xmm2, %xmm0 #endif pshufd $0x4e, %xmm0, %xmm1 #ifdef HAVE_SSE3 movddup ALPHA_R, %xmm6 movddup ALPHA_I, %xmm7 #else movsd ALPHA_R, %xmm6 movsd ALPHA_I, %xmm7 unpcklpd %xmm6, %xmm6 unpcklpd %xmm7, %xmm7 #endif mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm1 xorpd %xmm5, %xmm1 subpd %xmm1, %xmm0 movsd 0 * SIZE(Y1), %xmm4 movhpd 1 * SIZE(Y1), %xmm4 addpd %xmm4, %xmm0 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) addl INCY, Y1 decl J jg .L11 ALIGN_4 .L999: movl M,%eax sall $ZBASE_SHIFT,%eax addl %eax,AA movl STACK_INCX,INCX imull INCX,%eax addl %eax,XX jmp .L0t ALIGN_4 .L999x: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS,%esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/znrm2.S000066400000000000000000000117041313527062700162500ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 8 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define M %edx #define X %ecx #define INCX %esi #define I %eax #include "l1param.h" PROLOGUE pushl %esi pushl %ebx PROFCODE movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif #ifdef F_INTERFACE movl (M), %ebx movl (INCX), INCX #endif fldz testl M, M jle .L999 testl INCX, INCX jle .L999 sall $ZBASE_SHIFT, INCX fldz fldz fldz cmpl $SIZE * 2, INCX jne .L40 movl M, I sarl $2, I jle .L20 ALIGN_4 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) fmul %st(0), %st FLD 1 * SIZE(X) fmul %st(0), %st FLD 2 * SIZE(X) fmul %st(0), %st FLD 3 * SIZE(X) fmul %st(0), %st faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) FLD 4 * SIZE(X) fmul %st(0), %st FLD 5 * SIZE(X) fmul %st(0), %st FLD 6 * SIZE(X) fmul %st(0), %st FLD 7 * SIZE(X) fmul %st(0), %st addl $8 * SIZE, X faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) decl I jg .L10 ALIGN_4 .L20: movl M, I andl $3, I jle .L998 ALIGN_4 .L21: FLD 0 * SIZE(X) fmul %st(0), %st FLD 1 * SIZE(X) fmul %st(0), %st faddp %st,%st(3) faddp %st,%st(1) addl $2 * SIZE, X decl I jg .L21 jmp .L998 ALIGN_4 .L40: movl M, I sarl $2, I jle .L60 ALIGN_4 .L50: FLD 0 * SIZE(X) fmul %st(0), %st FLD 1 * SIZE(X) addl INCX, X fmul %st(0), %st FLD 0 * SIZE(X) fmul %st(0), %st FLD 1 * SIZE(X) addl INCX, X fmul %st(0), %st faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) FLD 0 * SIZE(X) fmul %st(0), %st FLD 1 * SIZE(X) addl INCX, X fmul %st(0), %st FLD 0 * SIZE(X) fmul %st(0), %st FLD 1 * SIZE(X) addl INCX, X fmul %st(0), %st faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) decl I jg .L50 ALIGN_4 .L60: movl M, I andl $3, I jle .L998 ALIGN_4 .L61: FLD 0 * SIZE(X) fmul %st(0), %st FLD 1 * SIZE(X) addl INCX, X fmul %st(0), %st faddp %st,%st(3) faddp %st,%st(1) decl I jg .L61 ALIGN_4 .L998: faddp %st,%st(2) faddp %st,%st(1) 
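/* The faddp sequence around this point folds the four x87 partial sums into
   a single value before .L999 applies fsqrt.  A minimal C sketch of the
   quantity being computed (an illustrative assumption, not part of the
   original file; like the assembly it sums squares directly, with no scaling
   against overflow or underflow -- needs <math.h>):

       double znrm2_ref(int n, const double *x, int incx_complex)
       {
           double s = 0.0;
           for (int k = 0; k < n; k++) {
               double re = x[2 * k * incx_complex];
               double im = x[2 * k * incx_complex + 1];
               s += re * re + im * im;
           }
           return sqrt(s);
       }
*/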
faddp %st,%st(1) ALIGN_4 .L999: fsqrt popl %ebx popl %esi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/znrm2_sse.S000066400000000000000000000212451313527062700171230ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 8 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define M %edx #define X %ecx #define INCX %esi #define I %eax #include "l1param.h" PROLOGUE PROFCODE pushl %esi pushl %ebx movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX pxor %xmm0, %xmm0 testl M, M jle .L999 pxor %xmm1, %xmm1 testl INCX, INCX jle .L999 sall $ZBASE_SHIFT, INCX cmpl $2 * SIZE, INCX jne .L40 addl M, M subl $-32 * SIZE, X testl $SIZE, X je .L05 movss -32 * SIZE(X), %xmm0 cvtss2sd %xmm0, %xmm0 mulsd %xmm0, %xmm0 addl $SIZE, X decl M jle .L998 ALIGN_3 .L05: movl M, I sarl $4, I jle .L13 movsd -32 * SIZE(X), %xmm4 movsd -30 * SIZE(X), %xmm5 movsd -28 * SIZE(X), %xmm6 movsd -26 * SIZE(X), %xmm7 decl I jle .L12 ALIGN_3 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif cvtps2pd %xmm4, %xmm2 movsd -24 * SIZE(X), %xmm4 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm5, %xmm3 movsd -22 * SIZE(X), %xmm5 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 cvtps2pd %xmm6, %xmm2 movsd -20 * SIZE(X), %xmm6 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm7, %xmm3 movsd -18 * SIZE(X), %xmm7 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 cvtps2pd %xmm4, %xmm2 movsd -16 * SIZE(X), %xmm4 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm5, %xmm3 movsd -14 * SIZE(X), %xmm5 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 cvtps2pd %xmm6, %xmm2 movsd -12 * SIZE(X), %xmm6 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm7, %xmm3 movsd -10 * SIZE(X), %xmm7 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 subl $-16 * SIZE, X decl I jg .L10 ALIGN_3 .L12: cvtps2pd %xmm4, %xmm2 movsd -24 * SIZE(X), %xmm4 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm5, %xmm3 movsd -22 * SIZE(X), %xmm5 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 cvtps2pd %xmm6, %xmm2 movsd -20 * SIZE(X), %xmm6 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm7, %xmm3 movsd -18 * SIZE(X), %xmm7 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 cvtps2pd %xmm4, %xmm2 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm5, %xmm3 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 cvtps2pd %xmm6, %xmm2 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm7, %xmm3 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 subl $-16 * SIZE, X ALIGN_4 .L13: testl $8, M je .L14 movsd -32 * SIZE(X), %xmm4 cvtps2pd %xmm4, %xmm2 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 movsd -30 * SIZE(X), %xmm5 cvtps2pd %xmm5, %xmm3 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 movsd -28 * SIZE(X), %xmm6 cvtps2pd %xmm6, %xmm2 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 movsd -26 * SIZE(X), %xmm7 cvtps2pd %xmm7, %xmm3 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 addl $8 * SIZE, X ALIGN_3 .L14: testl $4, M je .L15 movsd -32 * SIZE(X), %xmm4 cvtps2pd %xmm4, %xmm2 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 movsd -30 * SIZE(X), %xmm5 cvtps2pd %xmm5, %xmm3 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 addl $4 * SIZE, X ALIGN_3 .L15: testl $2, M je .L16 movsd -32 * SIZE(X), %xmm4 cvtps2pd %xmm4, %xmm2 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 addl $2 * SIZE, X ALIGN_3 .L16: testl $1, M je .L998 movss -32 * SIZE(X), %xmm4 cvtss2sd %xmm4, %xmm2 mulsd %xmm2, %xmm2 addsd %xmm2, %xmm1 jmp .L998 ALIGN_4 .L40: movl M, I sarl $3, I jle .L43 movsd (X), %xmm4 addl INCX, X movsd (X), %xmm5 addl INCX, X movsd (X), %xmm6 addl INCX, X movsd (X), %xmm7 addl INCX, X decl I jle .L42 ALIGN_3 .L41: cvtps2pd %xmm4, %xmm2 movsd (X), %xmm4 addl INCX, X mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm5, %xmm3 movsd (X), %xmm5 
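/* Note (descriptive comment, not in the original source): in both the
   unit-stride path above and this strided path, the single-precision complex
   inputs are widened with cvtps2pd before mulpd/addpd, so the squares are
   accumulated in double precision across the two accumulators %xmm0/%xmm1,
   which .L998 combines before sqrtsd and the final cvtsd2ss. */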
addl INCX, X mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 cvtps2pd %xmm6, %xmm2 movsd (X), %xmm6 addl INCX, X mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm7, %xmm3 movsd (X), %xmm7 addl INCX, X mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 cvtps2pd %xmm4, %xmm2 movsd (X), %xmm4 addl INCX, X mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm5, %xmm3 movsd (X), %xmm5 addl INCX, X mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 cvtps2pd %xmm6, %xmm2 movsd (X), %xmm6 addl INCX, X mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm7, %xmm3 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 movsd (X), %xmm7 addl INCX, X decl I jg .L41 ALIGN_3 .L42: cvtps2pd %xmm4, %xmm2 movsd (X), %xmm4 addl INCX, X mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm5, %xmm3 movsd (X), %xmm5 addl INCX, X mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 cvtps2pd %xmm6, %xmm2 movsd (X), %xmm6 addl INCX, X mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm7, %xmm3 movsd (X), %xmm7 addl INCX, X mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 cvtps2pd %xmm4, %xmm2 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm5, %xmm3 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 cvtps2pd %xmm6, %xmm2 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 cvtps2pd %xmm7, %xmm3 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 ALIGN_4 .L43: testl $4, M je .L44 movsd (X), %xmm4 addl INCX, X cvtps2pd %xmm4, %xmm2 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 movsd (X), %xmm5 addl INCX, X cvtps2pd %xmm5, %xmm3 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 movsd (X), %xmm6 addl INCX, X cvtps2pd %xmm6, %xmm2 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 movsd (X), %xmm7 addl INCX, X cvtps2pd %xmm7, %xmm3 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 ALIGN_3 .L44: testl $2, M je .L45 movsd (X), %xmm4 addl INCX, X cvtps2pd %xmm4, %xmm2 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 movsd (X), %xmm5 addl INCX, X cvtps2pd %xmm5, %xmm3 mulpd %xmm3, %xmm3 addpd %xmm3, %xmm1 ALIGN_3 .L45: testl $1, M je .L998 movsd (X), %xmm4 cvtps2pd %xmm4, %xmm2 mulpd %xmm2, %xmm2 addpd %xmm2, %xmm0 ALIGN_4 .L998: addpd %xmm1, %xmm0 #ifndef HAVE_SSE3 movapd %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 addsd %xmm1, %xmm0 #else haddpd %xmm0, %xmm0 #endif ALIGN_4 .L999: sqrtsd %xmm0, %xmm0 cvtsd2ss %xmm0, %xmm0 movss %xmm0, STACK_M flds STACK_M popl %ebx popl %esi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zrot.S000066400000000000000000000160761313527062700162050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) #define STACK_C 24 + STACK + ARGS(%esp) #ifdef XDOUBLE #define STACK_S 40 + STACK + ARGS(%esp) #elif defined DOUBLE #define STACK_S 32 + STACK + ARGS(%esp) #else #define STACK_S 28 + STACK + ARGS(%esp) #endif #define N %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx #define I %eax #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCH_SIZE 144 #endif #ifdef OPTERON #define PREFETCH prefetchw #define PREFETCH_SIZE 144 #endif PROLOGUE pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif movl STACK_N, N movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY FLD STACK_S FLD STACK_C sall $ZBASE_SHIFT, INCX sall $ZBASE_SHIFT, INCY testl N, N jle .L999 cmpl $2 * SIZE, INCX jne .L50 cmpl $2 * SIZE, INCY jne .L50 movl N, I sarl $1, I jle .L15 ALIGN_4 .L10: #ifdef PENTIUM4 PREFETCH (PREFETCH_SIZE + 0) * SIZE(X) #endif #ifdef OPTERON PREFETCH (PREFETCH_SIZE + 0) * SIZE(X) #endif FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) FLD 1 * SIZE(X) FLD 1 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 1 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 1 * SIZE(Y) #ifdef PENTIUM4 PREFETCH (PREFETCH_SIZE + 0) * SIZE(Y) #endif #ifdef OPTERON PREFETCH (PREFETCH_SIZE + 0) * SIZE(Y) #endif FLD 2 * SIZE(X) FLD 2 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 2 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 2 * SIZE(Y) FLD 3 * SIZE(X) FLD 3 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 3 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 3 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y decl I jg .L10 ALIGN_4 .L15: movl N, I andl $1, I jle .L999 ALIGN_4 .L16: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) FLD 1 * SIZE(X) FLD 1 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 1 * SIZE(X) fmul 
%st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 1 * SIZE(Y) jmp .L999 ALIGN_4 .L50: movl N, I sarl $1, I jle .L55 ALIGN_4 .L51: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) FLD 1 * SIZE(X) FLD 1 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 1 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 1 * SIZE(Y) addl INCX, X addl INCY, Y FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) FLD 1 * SIZE(X) FLD 1 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 1 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 1 * SIZE(Y) addl INCX, X addl INCY, Y decl I jg .L51 ALIGN_4 .L55: movl N, I andl $1, I jle .L999 ALIGN_4 .L56: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) FLD 1 * SIZE(X) FLD 1 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 1 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 1 * SIZE(Y) ALIGN_4 .L999: ffreep %st(0) ffreep %st(0) popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zrot_sse.S000066400000000000000000000543041313527062700170530ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) #define STACK_C 24 + STACK + ARGS(%esp) #define STACK_S 28 + STACK + ARGS(%esp) #define N %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx #define I %eax #include "l1param.h" #define C %xmm6 #define S %xmm7 PROLOGUE pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_N, N movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY sall $ZBASE_SHIFT, INCX sall $ZBASE_SHIFT, INCY movss STACK_C, C movss STACK_S, S shufps $0x0, C, C shufps $0x0, S, S cmpl $0, N jle .L999 cmpl $2 * SIZE, INCX jne .L50 cmpl $2 * SIZE, INCY jne .L50 testl $2 * SIZE, X je .L10 #ifndef HAVE_SSE2 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y decl N jle .L999 .L10: testl $1 * SIZE, X jne .L30 testl $3 * SIZE, Y jne .L20 movl N, I sarl $4, I jle .L14 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps 0 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movaps %xmm2, 0 * SIZE(Y) movaps 4 * SIZE(Y), %xmm1 movaps 4 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 4 * SIZE(X) movaps %xmm2, 4 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps 8 * SIZE(Y), %xmm1 movaps 8 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 8 * SIZE(X) movaps %xmm2, 8 * SIZE(Y) movaps 12 * SIZE(Y), %xmm1 movaps 12 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 12 * SIZE(X) movaps %xmm2, 12 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps 16 * SIZE(Y), %xmm1 movaps 16 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 16 * SIZE(X) movaps %xmm2, 16 * SIZE(Y) movaps 20 * SIZE(Y), %xmm1 movaps 20 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 20 * SIZE(X) movaps %xmm2, 20 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps 24 * SIZE(Y), %xmm1 movaps 24 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 24 * SIZE(X) movaps %xmm2, 24 * SIZE(Y) movaps 28 * SIZE(Y), %xmm1 movaps 28 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps 
%xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 28 * SIZE(X) movaps %xmm2, 28 * SIZE(Y) addl $32 * SIZE, X addl $32 * SIZE, Y decl I jg .L11 ALIGN_3 .L14: testl $15, N jle .L999 testl $8, N jle .L15 movaps 0 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movaps %xmm2, 0 * SIZE(Y) movaps 4 * SIZE(Y), %xmm1 movaps 4 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 4 * SIZE(X) movaps %xmm2, 4 * SIZE(Y) movaps 8 * SIZE(Y), %xmm1 movaps 8 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 8 * SIZE(X) movaps %xmm2, 8 * SIZE(Y) movaps 12 * SIZE(Y), %xmm1 movaps 12 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 12 * SIZE(X) movaps %xmm2, 12 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L15: testl $4, N jle .L16 movaps 0 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movaps %xmm2, 0 * SIZE(Y) movaps 4 * SIZE(Y), %xmm1 movaps 4 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 4 * SIZE(X) movaps %xmm2, 4 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L16: testl $2, N jle .L17 movaps 0 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movaps %xmm2, 0 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L17: testl $1, N jle .L999 #ifndef HAVE_SSE2 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) jmp .L999 ALIGN_3 .L20: movl N, I sarl $4, I jle .L24 ALIGN_3 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 2 * SIZE(Y) movsd 4 * SIZE(Y), %xmm1 movhps 6 * SIZE(Y), %xmm1 movaps 4 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 4 * SIZE(X) movlps %xmm2, 4 * SIZE(Y) movhps %xmm2, 6 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movsd 8 * SIZE(Y), %xmm1 movhps 10 * SIZE(Y), %xmm1 movaps 8 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 8 * SIZE(X) movlps %xmm2, 8 * SIZE(Y) movhps %xmm2, 10 * SIZE(Y) movsd 12 * SIZE(Y), %xmm1 movhps 14 * SIZE(Y), %xmm1 
movaps 12 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 12 * SIZE(X) movlps %xmm2, 12 * SIZE(Y) movhps %xmm2, 14 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movsd 16 * SIZE(Y), %xmm1 movhps 18 * SIZE(Y), %xmm1 movaps 16 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 16 * SIZE(X) movlps %xmm2, 16 * SIZE(Y) movhps %xmm2, 18 * SIZE(Y) movsd 20 * SIZE(Y), %xmm1 movhps 22 * SIZE(Y), %xmm1 movaps 20 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 20 * SIZE(X) movlps %xmm2, 20 * SIZE(Y) movhps %xmm2, 22 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movsd 24 * SIZE(Y), %xmm1 movhps 26 * SIZE(Y), %xmm1 movaps 24 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 24 * SIZE(X) movlps %xmm2, 24 * SIZE(Y) movhps %xmm2, 26 * SIZE(Y) movsd 28 * SIZE(Y), %xmm1 movhps 30 * SIZE(Y), %xmm1 movaps 28 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 28 * SIZE(X) movlps %xmm2, 28 * SIZE(Y) movhps %xmm2, 30 * SIZE(Y) addl $32 * SIZE, X addl $32 * SIZE, Y decl I jg .L21 ALIGN_3 .L24: testl $15, N jle .L999 testl $8, N jle .L25 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 2 * SIZE(Y) movsd 4 * SIZE(Y), %xmm1 movhps 6 * SIZE(Y), %xmm1 movaps 4 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 4 * SIZE(X) movlps %xmm2, 4 * SIZE(Y) movhps %xmm2, 6 * SIZE(Y) movsd 8 * SIZE(Y), %xmm1 movhps 10 * SIZE(Y), %xmm1 movaps 8 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 8 * SIZE(X) movlps %xmm2, 8 * SIZE(Y) movhps %xmm2, 10 * SIZE(Y) movsd 12 * SIZE(Y), %xmm1 movhps 14 * SIZE(Y), %xmm1 movaps 12 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 12 * SIZE(X) movlps %xmm2, 12 * SIZE(Y) movhps %xmm2, 14 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L25: testl $4, N jle .L26 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 2 * SIZE(Y) movsd 4 * SIZE(Y), %xmm1 movhps 6 * SIZE(Y), %xmm1 movaps 4 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 4 * SIZE(X) movlps %xmm2, 4 * SIZE(Y) movhps %xmm2, 6 * SIZE(Y) addl $8 * SIZE, X addl 
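/* Editor note (added comment, not in the upstream source): in this path Y appears
   not to be 16-byte aligned, so each pair of complex singles is gathered with
   movsd/movhps before the plane rotation x' = c*x + s*y, y' = c*y - s*x is
   applied and the two halves are stored back with movlps/movhps. */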
$8 * SIZE, Y ALIGN_3 .L26: testl $2, N jle .L27 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 2 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L27: testl $1, N jle .L999 movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) jmp .L999 ALIGN_3 .L30: movl N, I sarl $4, I jle .L34 ALIGN_3 .L31: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movhps 2 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movhps %xmm0, 2 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 2 * SIZE(Y) movsd 4 * SIZE(Y), %xmm1 movhps 6 * SIZE(Y), %xmm1 movsd 4 * SIZE(X), %xmm0 movhps 6 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 4 * SIZE(X) movhps %xmm0, 6 * SIZE(X) movlps %xmm2, 4 * SIZE(Y) movhps %xmm2, 6 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movsd 8 * SIZE(Y), %xmm1 movhps 10 * SIZE(Y), %xmm1 movsd 8 * SIZE(X), %xmm0 movhps 10 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 8 * SIZE(X) movhps %xmm0, 10 * SIZE(X) movlps %xmm2, 8 * SIZE(Y) movhps %xmm2, 10 * SIZE(Y) movsd 12 * SIZE(Y), %xmm1 movhps 14 * SIZE(Y), %xmm1 movsd 12 * SIZE(X), %xmm0 movhps 14 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 12 * SIZE(X) movhps %xmm0, 14 * SIZE(X) movlps %xmm2, 12 * SIZE(Y) movhps %xmm2, 14 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movsd 16 * SIZE(Y), %xmm1 movhps 18 * SIZE(Y), %xmm1 movsd 16 * SIZE(X), %xmm0 movhps 18 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 16 * SIZE(X) movhps %xmm0, 18 * SIZE(X) movlps %xmm2, 16 * SIZE(Y) movhps %xmm2, 18 * SIZE(Y) movsd 20 * SIZE(Y), %xmm1 movhps 22 * SIZE(Y), %xmm1 movsd 20 * SIZE(X), %xmm0 movhps 22 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 20 * SIZE(X) movhps %xmm0, 22 * SIZE(X) movlps %xmm2, 20 * SIZE(Y) movhps %xmm2, 22 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movsd 24 * SIZE(Y), %xmm1 movhps 26 * SIZE(Y), %xmm1 movsd 24 * SIZE(X), %xmm0 movhps 26 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 24 * SIZE(X) movhps %xmm0, 26 * SIZE(X) movlps %xmm2, 24 * SIZE(Y) movhps %xmm2, 26 * SIZE(Y) movsd 28 * SIZE(Y), %xmm1 movhps 30 * SIZE(Y), %xmm1 movsd 28 * SIZE(X), %xmm0 movhps 30 * SIZE(X), %xmm0 movaps 
%xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 28 * SIZE(X) movhps %xmm0, 30 * SIZE(X) movlps %xmm2, 28 * SIZE(Y) movhps %xmm2, 30 * SIZE(Y) addl $32 * SIZE, X addl $32 * SIZE, Y decl I jg .L31 ALIGN_3 .L34: testl $15, N jle .L999 testl $8, N jle .L35 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movhps 2 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movhps %xmm0, 2 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 2 * SIZE(Y) movsd 4 * SIZE(Y), %xmm1 movhps 6 * SIZE(Y), %xmm1 movsd 4 * SIZE(X), %xmm0 movhps 6 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 4 * SIZE(X) movhps %xmm0, 6 * SIZE(X) movlps %xmm2, 4 * SIZE(Y) movhps %xmm2, 6 * SIZE(Y) movsd 8 * SIZE(Y), %xmm1 movhps 10 * SIZE(Y), %xmm1 movsd 8 * SIZE(X), %xmm0 movhps 10 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 8 * SIZE(X) movhps %xmm0, 10 * SIZE(X) movlps %xmm2, 8 * SIZE(Y) movhps %xmm2, 10 * SIZE(Y) movsd 12 * SIZE(Y), %xmm1 movhps 14 * SIZE(Y), %xmm1 movsd 12 * SIZE(X), %xmm0 movhps 14 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 12 * SIZE(X) movhps %xmm0, 14 * SIZE(X) movlps %xmm2, 12 * SIZE(Y) movhps %xmm2, 14 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L35: testl $4, N jle .L36 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movhps 2 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movhps %xmm0, 2 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 2 * SIZE(Y) movsd 4 * SIZE(Y), %xmm1 movhps 6 * SIZE(Y), %xmm1 movsd 4 * SIZE(X), %xmm0 movhps 6 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 4 * SIZE(X) movhps %xmm0, 6 * SIZE(X) movlps %xmm2, 4 * SIZE(Y) movhps %xmm2, 6 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L36: testl $2, N jle .L37 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movhps 2 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movhps %xmm0, 2 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 2 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L37: testl $1, N jle .L999 #ifndef HAVE_SSE2 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) jmp .L999 ALIGN_3 ALIGN_3 .L50: movl N, I //if incx ==0 || incy==0 jump to the tail cmpl $0, INCX je .L56 cmpl $0, INCY je .L56 sarl $2, I jle .L55 ALIGN_3 .L53: movsd (Y), %xmm1 movhps (Y, INCY), %xmm1 movsd (X), %xmm0 movhps (X, INCX), %xmm0 movaps %xmm1, %xmm2 
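/* Editor note (added comment, not in the upstream source): this is the strided
   case (INCX/INCY != 2*SIZE). Two complex elements are gathered from (X) and
   (X,INCX), respectively (Y) and (Y,INCY), rotated, and scattered back through
   the same addressing before the pointers advance. */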
movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, (X) movhps %xmm0, (X, INCX) movlps %xmm2, (Y) movhps %xmm2, (Y, INCY) leal (X, INCX, 2), X leal (Y, INCY, 2), Y movsd (Y), %xmm1 movhps (Y, INCY), %xmm1 movsd (X), %xmm0 movhps (X, INCX), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, (X) movhps %xmm0, (X, INCX) movlps %xmm2, (Y) movhps %xmm2, (Y, INCY) leal (X, INCX, 2), X leal (Y, INCY, 2), Y decl I jg .L53 ALIGN_3 .L55: #ifndef HAVE_SSE2 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 #endif movl N, I andl $3, I jle .L999 ALIGN_3 .L56: movsd (Y), %xmm1 movsd (X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, (X) movlps %xmm2, (Y) addl INCX, X addl INCY, Y decl I jg .L56 ALIGN_3 .L999: popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zrot_sse2.S000066400000000000000000000632121313527062700171330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) #define STACK_C 24 + STACK + ARGS(%esp) #define STACK_S 32 + STACK + ARGS(%esp) #define N %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx #define I %eax #include "l1param.h" #define C %xmm6 #define S %xmm7 PROLOGUE pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_N, N movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY sall $ZBASE_SHIFT, INCX sall $ZBASE_SHIFT, INCY movsd STACK_C, C movsd STACK_S, S pshufd $0x44, C, C pshufd $0x44, S, S cmpl $0, N jle .L999 cmpl $2 * SIZE, INCX jne .L50 cmpl $2 * SIZE, INCY jne .L50 .L10: testl $SIZE, X jne .L30 testl $SIZE, Y jne .L20 movl N, I sarl $3, I jle .L14 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movapd 0 * SIZE(Y), %xmm1 movapd 0 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movapd %xmm2, 0 * SIZE(Y) movapd 2 * SIZE(Y), %xmm1 movapd 2 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 2 * SIZE(X) movapd %xmm2, 2 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movapd 4 * SIZE(Y), %xmm1 movapd 4 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 4 * SIZE(X) movapd %xmm2, 4 * SIZE(Y) movapd 6 * SIZE(Y), %xmm1 movapd 6 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 6 * SIZE(X) movapd %xmm2, 6 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movapd 8 * SIZE(Y), %xmm1 movapd 8 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 8 * SIZE(X) movapd %xmm2, 8 * SIZE(Y) movapd 10 * SIZE(Y), %xmm1 movapd 10 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 10 * SIZE(X) movapd %xmm2, 10 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movapd 12 * SIZE(Y), %xmm1 movapd 12 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 12 * SIZE(X) movapd %xmm2, 12 * SIZE(Y) movapd 14 * SIZE(Y), %xmm1 movapd 14 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 14 * SIZE(X) movapd %xmm2, 14 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y decl I jg .L11 ALIGN_3 .L14: testl $7, N jle .L999 testl $4, N jle .L15 movapd 0 * SIZE(Y), %xmm1 movapd 0 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd 
%xmm0, 0 * SIZE(X) movapd %xmm2, 0 * SIZE(Y) movapd 2 * SIZE(Y), %xmm1 movapd 2 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 2 * SIZE(X) movapd %xmm2, 2 * SIZE(Y) movapd 4 * SIZE(Y), %xmm1 movapd 4 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 4 * SIZE(X) movapd %xmm2, 4 * SIZE(Y) movapd 6 * SIZE(Y), %xmm1 movapd 6 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 6 * SIZE(X) movapd %xmm2, 6 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L15: testl $2, N jle .L16 movapd 0 * SIZE(Y), %xmm1 movapd 0 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movapd %xmm2, 0 * SIZE(Y) movapd 2 * SIZE(Y), %xmm1 movapd 2 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 2 * SIZE(X) movapd %xmm2, 2 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L16: testl $1, N jle .L999 movapd 0 * SIZE(Y), %xmm1 movapd 0 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movapd %xmm2, 0 * SIZE(Y) jmp .L999 ALIGN_3 .L20: movapd -1 * SIZE(Y), %xmm1 movl N, I sarl $3, I jle .L24 ALIGN_3 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movapd 1 * SIZE(Y), %xmm4 movapd 0 * SIZE(X), %xmm0 SHUFPD_1 %xmm4, %xmm1 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movlpd %xmm2, 0 * SIZE(Y) movhpd %xmm2, 1 * SIZE(Y) movapd 3 * SIZE(Y), %xmm1 movapd 2 * SIZE(X), %xmm0 SHUFPD_1 %xmm1, %xmm4 movapd %xmm4, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm4 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm4, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 2 * SIZE(X) movlpd %xmm2, 2 * SIZE(Y) movhpd %xmm2, 3 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movapd 5 * SIZE(Y), %xmm4 movapd 4 * SIZE(X), %xmm0 SHUFPD_1 %xmm4, %xmm1 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 4 * SIZE(X) movlpd %xmm2, 4 * SIZE(Y) movhpd %xmm2, 5 * SIZE(Y) movapd 7 * SIZE(Y), %xmm1 movapd 6 * SIZE(X), %xmm0 SHUFPD_1 %xmm1, %xmm4 movapd %xmm4, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm4 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm4, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 6 * SIZE(X) movlpd %xmm2, 6 * SIZE(Y) movhpd %xmm2, 7 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movapd 9 * SIZE(Y), %xmm4 movapd 8 * SIZE(X), %xmm0 SHUFPD_1 %xmm4, %xmm1 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 8 * SIZE(X) movlpd %xmm2, 8 * SIZE(Y) movhpd %xmm2, 9 * SIZE(Y) movapd 11 * SIZE(Y), %xmm1 movapd 10 * SIZE(X), %xmm0 SHUFPD_1 %xmm1, %xmm4 movapd %xmm4, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm4 mulpd C, %xmm2 
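/* Editor note (added comment, not in the upstream source): the mulpd/addpd/subpd
   group here appears to form one double-precision rotation step, writing
   c*x + s*y back to X and c*y - s*x back to Y, with SHUFPD_1 recombining the
   misaligned Y halves carried across loads. */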
mulpd S, %xmm3 addpd %xmm4, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 10 * SIZE(X) movlpd %xmm2, 10 * SIZE(Y) movhpd %xmm2, 11 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movapd 13 * SIZE(Y), %xmm4 movapd 12 * SIZE(X), %xmm0 SHUFPD_1 %xmm4, %xmm1 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 12 * SIZE(X) movlpd %xmm2, 12 * SIZE(Y) movhpd %xmm2, 13 * SIZE(Y) movapd 15 * SIZE(Y), %xmm1 movapd 14 * SIZE(X), %xmm0 SHUFPD_1 %xmm1, %xmm4 movapd %xmm4, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm4 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm4, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 14 * SIZE(X) movlpd %xmm2, 14 * SIZE(Y) movhpd %xmm2, 15 * SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y decl I jg .L21 ALIGN_3 .L24: testl $7, N jle .L999 testl $4, N jle .L25 movapd 1 * SIZE(Y), %xmm4 movapd 0 * SIZE(X), %xmm0 SHUFPD_1 %xmm4, %xmm1 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movlpd %xmm2, 0 * SIZE(Y) movhpd %xmm2, 1 * SIZE(Y) movapd 3 * SIZE(Y), %xmm1 movapd 2 * SIZE(X), %xmm0 SHUFPD_1 %xmm1, %xmm4 movapd %xmm4, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm4 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm4, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 2 * SIZE(X) movlpd %xmm2, 2 * SIZE(Y) movhpd %xmm2, 3 * SIZE(Y) movapd 5 * SIZE(Y), %xmm4 movapd 4 * SIZE(X), %xmm0 SHUFPD_1 %xmm4, %xmm1 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 4 * SIZE(X) movlpd %xmm2, 4 * SIZE(Y) movhpd %xmm2, 5 * SIZE(Y) movapd 7 * SIZE(Y), %xmm1 movapd 6 * SIZE(X), %xmm0 SHUFPD_1 %xmm1, %xmm4 movapd %xmm4, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm4 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm4, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 6 * SIZE(X) movlpd %xmm2, 6 * SIZE(Y) movhpd %xmm2, 7 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L25: testl $2, N jle .L26 movapd 1 * SIZE(Y), %xmm4 movapd 0 * SIZE(X), %xmm0 SHUFPD_1 %xmm4, %xmm1 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movlpd %xmm2, 0 * SIZE(Y) movhpd %xmm2, 1 * SIZE(Y) movapd 3 * SIZE(Y), %xmm1 movapd 2 * SIZE(X), %xmm0 SHUFPD_1 %xmm1, %xmm4 movapd %xmm4, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm4 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm4, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 2 * SIZE(X) movlpd %xmm2, 2 * SIZE(Y) movhpd %xmm2, 3 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L26: testl $1, N jle .L999 movapd 1 * SIZE(Y), %xmm4 movapd 0 * SIZE(X), %xmm0 SHUFPD_1 %xmm4, %xmm1 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movlpd %xmm2, 0 * SIZE(Y) movhpd %xmm2, 1 * SIZE(Y) jmp .L999 ALIGN_3 .L30: testl $SIZE, Y jne .L40 movapd -1 * SIZE(X), %xmm0 movl N, I sarl $3, I jle .L34 ALIGN_3 .L31: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movapd 1 * SIZE(X), %xmm4 movapd 0 * SIZE(Y), %xmm1 SHUFPD_1 %xmm4, %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlpd %xmm0, 0 * SIZE(X) movhpd %xmm0, 1 * SIZE(X) movapd %xmm2, 0 * SIZE(Y) 
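/* Editor note (added comment, not in the upstream source): in this branch X is
   the unaligned operand; consecutive movapd loads from X are recombined with
   SHUFPD_1 so the rotation can still use aligned 16-byte accesses on Y, and the
   X results are stored with movlpd/movhpd. */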
movapd 3 * SIZE(X), %xmm0 movapd 2 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm4 movapd %xmm1, %xmm2 movapd %xmm4, %xmm3 mulpd C, %xmm4 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm4 subpd %xmm3, %xmm2 movlpd %xmm4, 2 * SIZE(X) movhpd %xmm4, 3 * SIZE(X) movapd %xmm2, 2 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movapd 5 * SIZE(X), %xmm4 movapd 4 * SIZE(Y), %xmm1 SHUFPD_1 %xmm4, %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlpd %xmm0, 4 * SIZE(X) movhpd %xmm0, 5 * SIZE(X) movapd %xmm2, 4 * SIZE(Y) movapd 7 * SIZE(X), %xmm0 movapd 6 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm4 movapd %xmm1, %xmm2 movapd %xmm4, %xmm3 mulpd C, %xmm4 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm4 subpd %xmm3, %xmm2 movlpd %xmm4, 6 * SIZE(X) movhpd %xmm4, 7 * SIZE(X) movapd %xmm2, 6 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movapd 9 * SIZE(X), %xmm4 movapd 8 * SIZE(Y), %xmm1 SHUFPD_1 %xmm4, %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlpd %xmm0, 8 * SIZE(X) movhpd %xmm0, 9 * SIZE(X) movapd %xmm2, 8 * SIZE(Y) movapd 11 * SIZE(X), %xmm0 movapd 10 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm4 movapd %xmm1, %xmm2 movapd %xmm4, %xmm3 mulpd C, %xmm4 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm4 subpd %xmm3, %xmm2 movlpd %xmm4, 10 * SIZE(X) movhpd %xmm4, 11 * SIZE(X) movapd %xmm2, 10 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movapd 13 * SIZE(X), %xmm4 movapd 12 * SIZE(Y), %xmm1 SHUFPD_1 %xmm4, %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlpd %xmm0, 12 * SIZE(X) movhpd %xmm0, 13 * SIZE(X) movapd %xmm2, 12 * SIZE(Y) movapd 15 * SIZE(X), %xmm0 movapd 14 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm4 movapd %xmm1, %xmm2 movapd %xmm4, %xmm3 mulpd C, %xmm4 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm4 subpd %xmm3, %xmm2 movlpd %xmm4, 14 * SIZE(X) movhpd %xmm4, 15 * SIZE(X) movapd %xmm2, 14 * SIZE(Y) addl $16 * SIZE, Y addl $16 * SIZE, X decl I jg .L31 ALIGN_3 .L34: testl $7, N jle .L999 testl $4, N jle .L35 movapd 1 * SIZE(X), %xmm4 movapd 0 * SIZE(Y), %xmm1 SHUFPD_1 %xmm4, %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlpd %xmm0, 0 * SIZE(X) movhpd %xmm0, 1 * SIZE(X) movapd %xmm2, 0 * SIZE(Y) movapd 3 * SIZE(X), %xmm0 movapd 2 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm4 movapd %xmm1, %xmm2 movapd %xmm4, %xmm3 mulpd C, %xmm4 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm4 subpd %xmm3, %xmm2 movlpd %xmm4, 2 * SIZE(X) movhpd %xmm4, 3 * SIZE(X) movapd %xmm2, 2 * SIZE(Y) movapd 5 * SIZE(X), %xmm4 movapd 4 * SIZE(Y), %xmm1 SHUFPD_1 %xmm4, %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlpd %xmm0, 4 * SIZE(X) movhpd %xmm0, 5 * SIZE(X) movapd %xmm2, 4 * SIZE(Y) movapd 7 * SIZE(X), %xmm0 movapd 6 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm4 movapd %xmm1, %xmm2 movapd %xmm4, %xmm3 mulpd C, %xmm4 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm4 subpd %xmm3, %xmm2 movlpd %xmm4, 6 * SIZE(X) movhpd %xmm4, 7 * SIZE(X) movapd %xmm2, 6 * SIZE(Y) addl $8 * SIZE, Y addl $8 * SIZE, 
X ALIGN_3 .L35: testl $2, N jle .L36 movapd 1 * SIZE(X), %xmm4 movapd 0 * SIZE(Y), %xmm1 SHUFPD_1 %xmm4, %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlpd %xmm0, 0 * SIZE(X) movhpd %xmm0, 1 * SIZE(X) movapd %xmm2, 0 * SIZE(Y) movapd 3 * SIZE(X), %xmm0 movapd 2 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm4 movapd %xmm1, %xmm2 movapd %xmm4, %xmm3 mulpd C, %xmm4 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm4 subpd %xmm3, %xmm2 movlpd %xmm4, 2 * SIZE(X) movhpd %xmm4, 3 * SIZE(X) movapd %xmm2, 2 * SIZE(Y) addl $4 * SIZE, Y addl $4 * SIZE, X ALIGN_3 .L36: testl $1, N jle .L999 movapd 1 * SIZE(X), %xmm4 movapd 0 * SIZE(Y), %xmm1 SHUFPD_1 %xmm4, %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlpd %xmm0, 0 * SIZE(X) movhpd %xmm0, 1 * SIZE(X) movapd %xmm2, 0 * SIZE(Y) jmp .L999 ALIGN_3 .L40: movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulsd C, %xmm0 mulsd S, %xmm1 mulsd C, %xmm2 mulsd S, %xmm3 addsd %xmm1, %xmm0 subsd %xmm3, %xmm2 movsd %xmm0, 0 * SIZE(X) movsd %xmm2, 0 * SIZE(Y) addl $1 * SIZE, Y addl $1 * SIZE, X decl N jle .L47 movl N, I sarl $3, I jle .L44 ALIGN_3 .L41: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movapd 0 * SIZE(Y), %xmm1 movapd 0 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movapd %xmm2, 0 * SIZE(Y) movapd 2 * SIZE(Y), %xmm1 movapd 2 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 2 * SIZE(X) movapd %xmm2, 2 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movapd 4 * SIZE(Y), %xmm1 movapd 4 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 4 * SIZE(X) movapd %xmm2, 4 * SIZE(Y) movapd 6 * SIZE(Y), %xmm1 movapd 6 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 6 * SIZE(X) movapd %xmm2, 6 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movapd 8 * SIZE(Y), %xmm1 movapd 8 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 8 * SIZE(X) movapd %xmm2, 8 * SIZE(Y) movapd 10 * SIZE(Y), %xmm1 movapd 10 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 10 * SIZE(X) movapd %xmm2, 10 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movapd 12 * SIZE(Y), %xmm1 movapd 12 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 12 * SIZE(X) movapd %xmm2, 12 * SIZE(Y) movapd 14 * SIZE(Y), %xmm1 movapd 14 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 14 * SIZE(X) movapd %xmm2, 14 * 
SIZE(Y) addl $16 * SIZE, X addl $16 * SIZE, Y decl I jg .L41 ALIGN_3 .L44: testl $4, N jle .L45 movapd 0 * SIZE(Y), %xmm1 movapd 0 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movapd %xmm2, 0 * SIZE(Y) movapd 2 * SIZE(Y), %xmm1 movapd 2 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 2 * SIZE(X) movapd %xmm2, 2 * SIZE(Y) movapd 4 * SIZE(Y), %xmm1 movapd 4 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 4 * SIZE(X) movapd %xmm2, 4 * SIZE(Y) movapd 6 * SIZE(Y), %xmm1 movapd 6 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 6 * SIZE(X) movapd %xmm2, 6 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L45: testl $2, N jle .L46 movapd 0 * SIZE(Y), %xmm1 movapd 0 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movapd %xmm2, 0 * SIZE(Y) movapd 2 * SIZE(Y), %xmm1 movapd 2 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 2 * SIZE(X) movapd %xmm2, 2 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L46: testl $1, N jle .L47 movapd 0 * SIZE(Y), %xmm1 movapd 0 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movapd %xmm2, 0 * SIZE(Y) addl $2 * SIZE, Y addl $2 * SIZE, X ALIGN_3 .L47: movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulsd C, %xmm0 mulsd S, %xmm1 mulsd C, %xmm2 mulsd S, %xmm3 addsd %xmm1, %xmm0 subsd %xmm3, %xmm2 movsd %xmm0, 0 * SIZE(X) movsd %xmm2, 0 * SIZE(Y) jmp .L999 ALIGN_3 .L50: movl N, I sarl $2, I jle .L55 ALIGN_3 .L53: movsd 0 * SIZE(Y), %xmm1 movhpd 1 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movhpd 1 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlpd %xmm0, 0 * SIZE(X) movhpd %xmm0, 1 * SIZE(X) movlpd %xmm2, 0 * SIZE(Y) movhpd %xmm2, 1 * SIZE(Y) addl INCX, X addl INCY, Y movsd 0 * SIZE(Y), %xmm1 movhpd 1 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movhpd 1 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlpd %xmm0, 0 * SIZE(X) movhpd %xmm0, 1 * SIZE(X) movlpd %xmm2, 0 * SIZE(Y) movhpd %xmm2, 1 * SIZE(Y) addl INCX, X addl INCY, Y movsd 0 * SIZE(Y), %xmm1 movhpd 1 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movhpd 1 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlpd %xmm0, 0 * SIZE(X) movhpd %xmm0, 1 * SIZE(X) movlpd %xmm2, 0 * SIZE(Y) movhpd %xmm2, 1 * SIZE(Y) addl INCX, X addl INCY, Y movsd 0 * SIZE(Y), %xmm1 movhpd 1 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movhpd 1 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 
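/* Editor note (added comment, not in the upstream source): strided double-complex
   case; each element is loaded with movsd/movhpd from X and Y, rotated
   (c*x + s*y into X, c*y - s*x into Y), stored back, and the pointers are then
   advanced by INCX/INCY. */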
mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlpd %xmm0, 0 * SIZE(X) movhpd %xmm0, 1 * SIZE(X) movlpd %xmm2, 0 * SIZE(Y) movhpd %xmm2, 1 * SIZE(Y) addl INCX, X addl INCY, Y decl I jg .L53 ALIGN_3 .L55: movl N, I andl $3, I jle .L999 ALIGN_3 .L56: movsd 0 * SIZE(Y), %xmm1 movhpd 1 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movhpd 1 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlpd %xmm0, 0 * SIZE(X) movhpd %xmm0, 1 * SIZE(X) movlpd %xmm2, 0 * SIZE(Y) movhpd %xmm2, 1 * SIZE(Y) addl INCX, X addl INCY, Y decl I jg .L56 ALIGN_3 .L999: popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zscal.S000066400000000000000000000142661313527062700163220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 8 #define STACK_N 4 + STACK(%esp) #ifdef XDOUBLE #define ALPHA_R 16 + STACK(%esp) #define ALPHA_I 32 + STACK(%esp) #define STACK_X 48 + STACK(%esp) #define STACK_INCX 52 + STACK(%esp) #elif defined(DOUBLE) #define ALPHA_R 16 + STACK(%esp) #define ALPHA_I 24 + STACK(%esp) #define STACK_X 32 + STACK(%esp) #define STACK_INCX 36 + STACK(%esp) #else #define ALPHA_R 16 + STACK(%esp) #define ALPHA_I 20 + STACK(%esp) #define STACK_X 24 + STACK(%esp) #define STACK_INCX 28 + STACK(%esp) #endif #define N %esi #define X %edx #define INCX %ebx #define I %ecx PROLOGUE pushl %esi pushl %ebx PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif movl STACK_N, N movl STACK_X, X movl STACK_INCX, INCX sall $ZBASE_SHIFT, INCX FLD ALPHA_R FLD ALPHA_I testl N, N jle .L999 fld %st(1) fabs fld %st(1) fabs faddp %st, %st(1) fldz fcomip %st(1), %st ffreep %st(0) jne .L30 EMMS pxor %mm0, %mm0 cmpl $2 * SIZE, INCX jne .L20 movl N, I sarl $2, I jle .L15 ALIGN_4 .L12: #ifdef XDOUBLE movq %mm0, 0(X) movq %mm0, 8(X) movq %mm0, 16(X) movq %mm0, 24(X) movq %mm0, 32(X) movq %mm0, 40(X) movq %mm0, 48(X) movq %mm0, 56(X) movq %mm0, 64(X) movq %mm0, 72(X) movq %mm0, 80(X) movq %mm0, 88(X) movq %mm0, 96(X) movq %mm0, 104(X) movq %mm0, 112(X) movq %mm0, 120(X) #elif defined(DOUBLE) movq %mm0, 0(X) movq %mm0, 8(X) movq %mm0, 16(X) movq %mm0, 24(X) movq %mm0, 32(X) movq %mm0, 40(X) movq %mm0, 48(X) movq %mm0, 56(X) #else movq %mm0, 0(X) movq %mm0, 8(X) movq %mm0, 16(X) movq %mm0, 24(X) #endif addl $8 * SIZE, X decl I jg .L12 ALIGN_3 .L15: movl N, I andl $3, I jle .L18 ALIGN_2 .L16: #ifdef XDOUBLE movq %mm0, 0(X) movq %mm0, 8(X) movq %mm0, 16(X) movq %mm0, 24(X) #elif defined(DOUBLE) movq %mm0, 0(X) movq %mm0, 8(X) #else movq %mm0, 0(X) #endif addl $2 * SIZE, X decl I jg .L16 .L18: EMMS xorl %eax, %eax popl %ebx popl %esi ret ALIGN_2 .L20: movl N, I sarl $2, I jle .L25 ALIGN_3 .L22: #ifdef XDOUBLE movq %mm0, 0(X) movq %mm0, 8(X) movq %mm0, 16(X) movq %mm0, 24(X) addl INCX, X movq %mm0, 0(X) movq %mm0, 8(X) movq %mm0, 16(X) movq %mm0, 24(X) addl INCX, X movq %mm0, 0(X) movq %mm0, 8(X) movq %mm0, 16(X) movq %mm0, 24(X) addl INCX, X movq %mm0, 0(X) movq %mm0, 8(X) movq %mm0, 16(X) movq %mm0, 24(X) addl INCX, X #elif defined(DOUBLE) movq %mm0, 0(X) movq %mm0, 8(X) addl INCX, X movq %mm0, 0(X) movq %mm0, 8(X) addl INCX, X movq %mm0, 0(X) movq %mm0, 8(X) addl INCX, X movq %mm0, 0(X) movq %mm0, 8(X) addl INCX, X #else movq %mm0, 0(X) addl INCX, X movq %mm0, 0(X) addl INCX, X movq %mm0, 0(X) addl INCX, X movq %mm0, 0(X) addl INCX, X #endif decl I jg .L22 ALIGN_3 .L25: movl N, I andl $3, I jle .L28 ALIGN_3 .L26: #ifdef XDOUBLE movq %mm0, 0(X) movq %mm0, 8(X) movq %mm0, 16(X) movq %mm0, 24(X) addl INCX, X #elif defined(DOUBLE) movq %mm0, 0(X) movq %mm0, 8(X) addl INCX, X #else movq %mm0, 0(X) addl INCX, X #endif decl I jg .L26 .L28: EMMS xorl %eax, %eax popl %ebx popl %esi ret ALIGN_3 .L30: movl N, I ALIGN_2 .L32: FLD 0 * SIZE(X) fmul %st(1),%st FLD 1 * SIZE(X) fmul %st(3),%st faddp %st,%st(1) FLD 0 * SIZE(X) fmul %st(3),%st FLD 1 * SIZE(X) fmul %st(3),%st fsubrp %st,%st(1) FST 0 * SIZE(X) FST 1 * SIZE(X) addl INCX, X decl I jg .L32 ALIGN_2 .L999: ffreep %st(0) ffreep %st(0) xorl %eax,%eax popl %ebx popl %esi ret EPILOGUE 
OpenBLAS-0.2.20/kernel/x86/zscal_sse.S000066400000000000000000000613671313527062700172000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_ALPHA_R 16 + STACK + ARGS(%esp) #define STACK_ALPHA_I 20 + STACK + ARGS(%esp) #define STACK_X 24 + STACK + ARGS(%esp) #define STACK_INCX 28 + STACK + ARGS(%esp) #define M %ebx #define X %ecx #define INCX %edx #define I %esi #define XX %edi #define FLAG %ebp #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE) #define USE_PSHUFD #else #define USE_PSHUFD_HALF #endif #include "l1param.h" PROLOGUE PROFCODE pushl %edi pushl %esi pushl %ebx pushl %ebp movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX movss STACK_ALPHA_R, %xmm0 movss STACK_ALPHA_I, %xmm1 sall $ZBASE_SHIFT, INCX xor FLAG, FLAG testl M, M jle .L999 xorps %xmm7, %xmm7 comiss %xmm0, %xmm7 jne .L100 # Alpha_r != ZERO comiss %xmm1, %xmm7 jne .L100 # Alpha_i != ZERO /* Alpha == ZERO */ cmpl $2 * SIZE, INCX jne .L50 /* INCX == 1 */ cmpl $3, M jle .L13 testl $4, X je .L05 movss %xmm7, 0 * SIZE(X) addl $SIZE, X movl $1, FLAG decl M ALIGN_3 .L05: testl $8, X je .L06 movlps %xmm7, 0 * SIZE(X) addl $2 * SIZE, X subl $1, M ALIGN_3 .L06: movl M, I # rcx = n sarl $3, I jle .L12 ALIGN_4 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm7, 0 * SIZE(X) movaps %xmm7, 4 * SIZE(X) movaps %xmm7, 8 * SIZE(X) movaps %xmm7, 12 * SIZE(X) addl $16 * SIZE, X decl I jg .L11 ALIGN_4 .L12: testl $7, M je .L19 testl $4, M je .L13 movaps %xmm7, 0 * SIZE(X) movaps %xmm7, 4 * SIZE(X) addl $8 * SIZE, X ALIGN_3 .L13: testl $2, M je .L14 movlps %xmm7, 0 * SIZE(X) movhps %xmm7, 2 * SIZE(X) addl $4 * SIZE, X ALIGN_3 .L14: testl $1, M je .L19 movlps %xmm7, 0 * SIZE(X) addl $2 * SIZE, X ALIGN_3 .L19: testl $1, FLAG je .L999 movss %xmm7, 0 * SIZE(X) jmp .L999 ALIGN_4 /* incx != 1 */ .L50: movl M, I # rcx = n sarl $2, I jle .L52 ALIGN_4 .L51: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd %xmm7, 0 * SIZE(X) addl INCX, X movsd %xmm7, 0 * SIZE(X) addl INCX, X movsd %xmm7, 0 * SIZE(X) addl INCX, X movsd %xmm7, 0 * SIZE(X) addl INCX, X decl I jg .L51 ALIGN_4 .L52: testl $2, M je .L53 movsd %xmm7, 0 * SIZE(X) addl INCX, X movsd %xmm7, 0 * SIZE(X) addl INCX, X ALIGN_3 .L53: testl $1, M je .L999 movsd %xmm7, 0 * SIZE(X) jmp .L999 ALIGN_4 /* Alpha != ZERO */ .L100: testl $SIZE, X jne .L130 cmpl $2 * SIZE, INCX jne .L120 movaps %xmm0, %xmm6 shufps $0, %xmm6, %xmm6 shufps $0, %xmm1, %xmm1 subps %xmm1, %xmm7 unpcklps %xmm1, %xmm7 subl $-32 * SIZE, X testl $2 * SIZE, X je .L105 movsd -32 * SIZE(X), %xmm0 PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movlps %xmm0, -32 * SIZE(X) addl $2 * SIZE, X decl M jle .L999 ALIGN_3 .L105: movl M, I sarl $4, I jle .L115 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 decl I jle .L112 ALIGN_4 .L111: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(X) movaps -16 * SIZE(X), %xmm0 PSHUFD2( $0xb1, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(X) movaps -12 * SIZE(X), %xmm1 PSHUFD2( $0xb1, %xmm2, %xmm5) mulps %xmm6, %xmm2 mulps %xmm7, %xmm5 addps %xmm5, %xmm2 movaps %xmm2, -24 * SIZE(X) movaps -8 * SIZE(X), %xmm2 PSHUFD2( $0xb1, %xmm3, %xmm5) mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 
addps %xmm5, %xmm3 movaps %xmm3, -20 * SIZE(X) movaps -4 * SIZE(X), %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(X) movaps 0 * SIZE(X), %xmm0 PSHUFD2( $0xb1, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movaps %xmm1, -12 * SIZE(X) movaps 4 * SIZE(X), %xmm1 PSHUFD2( $0xb1, %xmm2, %xmm5) mulps %xmm6, %xmm2 mulps %xmm7, %xmm5 addps %xmm5, %xmm2 movaps %xmm2, -8 * SIZE(X) movaps 8 * SIZE(X), %xmm2 PSHUFD2( $0xb1, %xmm3, %xmm5) mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm5, %xmm3 movaps %xmm3, -4 * SIZE(X) movaps 12 * SIZE(X), %xmm3 subl $-32 * SIZE, X decl I jg .L111 ALIGN_4 .L112: PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(X) movaps -16 * SIZE(X), %xmm0 PSHUFD2( $0xb1, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(X) movaps -12 * SIZE(X), %xmm1 PSHUFD2( $0xb1, %xmm2, %xmm5) mulps %xmm6, %xmm2 mulps %xmm7, %xmm5 addps %xmm5, %xmm2 movaps %xmm2, -24 * SIZE(X) movaps -8 * SIZE(X), %xmm2 PSHUFD2( $0xb1, %xmm3, %xmm5) mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm5, %xmm3 movaps %xmm3, -20 * SIZE(X) movaps -4 * SIZE(X), %xmm3 PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(X) PSHUFD2( $0xb1, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movaps %xmm1, -12 * SIZE(X) PSHUFD2( $0xb1, %xmm2, %xmm5) mulps %xmm6, %xmm2 mulps %xmm7, %xmm5 addps %xmm5, %xmm2 movaps %xmm2, -8 * SIZE(X) PSHUFD2( $0xb1, %xmm3, %xmm5) mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm5, %xmm3 movaps %xmm3, -4 * SIZE(X) subl $-32 * SIZE, X ALIGN_4 .L115: testl $8, M je .L116 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(X) PSHUFD2( $0xb1, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(X) movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 PSHUFD2( $0xb1, %xmm2, %xmm5) mulps %xmm6, %xmm2 mulps %xmm7, %xmm5 addps %xmm5, %xmm2 movaps %xmm2, -24 * SIZE(X) PSHUFD2( $0xb1, %xmm3, %xmm5) mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm5, %xmm3 movaps %xmm3, -20 * SIZE(X) addl $16 * SIZE, X ALIGN_3 .L116: testl $4, M je .L117 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(X) PSHUFD2( $0xb1, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movaps %xmm1, -28 * SIZE(X) addl $8 * SIZE, X ALIGN_3 .L117: testl $2, M je .L118 movaps -32 * SIZE(X), %xmm0 PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(X) addl $4 * SIZE, X ALIGN_3 .L118: testl $1, M je .L999 movsd -32 * SIZE(X), %xmm0 PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movlps %xmm0, -32 * SIZE(X) jmp .L999 ALIGN_3 .L120: PSHUFD2($0, %xmm0, %xmm6) PSHUFD2($0, %xmm1, %xmm1) subps %xmm1, %xmm7 unpcklps %xmm1, %xmm7 movl X, XX movl M, I sarl $3, I jle .L125 movsd (X), %xmm0 addl INCX, X movhps (X), %xmm0 addl INCX, X movsd (X), %xmm1 addl INCX, X movhps (X), %xmm1 addl INCX, X movsd (X), %xmm2 addl INCX, X movhps (X), %xmm2 addl INCX, X movsd (X), %xmm3 addl INCX, X movhps (X), %xmm3 addl INCX, X 
decl I jle .L122 ALIGN_4 .L121: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movlps %xmm0, (XX) addl INCX, XX movhps %xmm0, (XX) addl INCX, XX movsd (X), %xmm0 addl INCX, X movhps (X), %xmm0 addl INCX, X PSHUFD2( $0xb1, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movlps %xmm1, (XX) addl INCX, XX movhps %xmm1, (XX) addl INCX, XX movsd (X), %xmm1 addl INCX, X movhps (X), %xmm1 addl INCX, X PSHUFD2( $0xb1, %xmm2, %xmm5) mulps %xmm6, %xmm2 mulps %xmm7, %xmm5 addps %xmm5, %xmm2 movlps %xmm2, (XX) addl INCX, XX movhps %xmm2, (XX) addl INCX, XX movsd (X), %xmm2 addl INCX, X movhps (X), %xmm2 addl INCX, X PSHUFD2( $0xb1, %xmm3, %xmm5) mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm5, %xmm3 movlps %xmm3, (XX) addl INCX, XX movhps %xmm3, (XX) addl INCX, XX movsd (X), %xmm3 addl INCX, X movhps (X), %xmm3 addl INCX, X decl I jg .L121 ALIGN_4 .L122: PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movlps %xmm0, (XX) addl INCX, XX movhps %xmm0, (XX) addl INCX, XX PSHUFD2( $0xb1, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movlps %xmm1, (XX) addl INCX, XX movhps %xmm1, (XX) addl INCX, XX PSHUFD2( $0xb1, %xmm2, %xmm5) mulps %xmm6, %xmm2 mulps %xmm7, %xmm5 addps %xmm5, %xmm2 movlps %xmm2, (XX) addl INCX, XX movhps %xmm2, (XX) addl INCX, XX PSHUFD2( $0xb1, %xmm3, %xmm5) mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm5, %xmm3 movlps %xmm3, (XX) addl INCX, XX movhps %xmm3, (XX) addl INCX, XX ALIGN_4 .L125: testl $4, M je .L127 movsd (X), %xmm0 addl INCX, X movhps (X), %xmm0 addl INCX, X PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movlps %xmm0, (XX) addl INCX, XX movhps %xmm0, (XX) addl INCX, XX movsd (X), %xmm1 addl INCX, X movhps (X), %xmm1 addl INCX, X PSHUFD2( $0xb1, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movlps %xmm1, (XX) addl INCX, XX movhps %xmm1, (XX) addl INCX, XX ALIGN_3 .L127: testl $2, M je .L128 movsd (X), %xmm0 addl INCX, X movhps (X), %xmm0 addl INCX, X PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movlps %xmm0, (XX) addl INCX, XX movhps %xmm0, (XX) addl INCX, XX ALIGN_3 .L128: testl $1, M je .L999 movsd (X), %xmm0 PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movlps %xmm0, (XX) jmp .L999 ALIGN_3 .L130: cmpl $2 * SIZE, INCX jne .L120 #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) PSHUFD2($0, %xmm0, %xmm6) PSHUFD2($0, %xmm1, %xmm1) subps %xmm1, %xmm7 unpcklps %xmm1, %xmm7 subl $-31 * SIZE, X testl $2 * SIZE, X je .L130x movsd -31 * SIZE(X), %xmm0 PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movlps %xmm0, -31 * SIZE(X) addl $2 * SIZE, X decl M jle .L999 ALIGN_3 .L130x: shufps $0xb1, %xmm7, %xmm7 movaps -32 * SIZE(X), %xmm0 movaps %xmm0, %xmm4 movl M, I sarl $4, I jle .L135 movaps -28 * SIZE(X), %xmm1 decl I jle .L132 ALIGN_4 .L131: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm1, %xmm0 PSHUFD2($0x1b, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps %xmm0, %xmm2 movss %xmm4, %xmm0 movaps %xmm0, -32 * SIZE(X) movaps -24 * SIZE(X), %xmm0 movss %xmm0, %xmm1 PSHUFD2($0x1b, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movaps %xmm1, %xmm4 movss %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(X) movaps -20 
* SIZE(X), %xmm1 movss %xmm1, %xmm0 PSHUFD2($0x1b, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps %xmm0, %xmm2 movss %xmm4, %xmm0 movaps %xmm0, -24 * SIZE(X) movaps -16 * SIZE(X), %xmm0 movss %xmm0, %xmm1 PSHUFD2($0x1b, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movaps %xmm1, %xmm4 movss %xmm2, %xmm1 movaps %xmm1, -20 * SIZE(X) movaps -12 * SIZE(X), %xmm1 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm1, %xmm0 PSHUFD2($0x1b, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps %xmm0, %xmm2 movss %xmm4, %xmm0 movaps %xmm0, -16 * SIZE(X) movaps -8 * SIZE(X), %xmm0 movss %xmm0, %xmm1 PSHUFD2($0x1b, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movaps %xmm1, %xmm4 movss %xmm2, %xmm1 movaps %xmm1, -12 * SIZE(X) movaps -4 * SIZE(X), %xmm1 movss %xmm1, %xmm0 PSHUFD2($0x1b, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps %xmm0, %xmm2 movss %xmm4, %xmm0 movaps %xmm0, -8 * SIZE(X) movaps 0 * SIZE(X), %xmm0 movss %xmm0, %xmm1 PSHUFD2($0x1b, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movaps %xmm1, %xmm4 movss %xmm2, %xmm1 movaps %xmm1, -4 * SIZE(X) movaps 4 * SIZE(X), %xmm1 subl $-32 * SIZE, X decl I jg .L131 ALIGN_4 .L132: movss %xmm1, %xmm0 PSHUFD2($0x1b, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps %xmm0, %xmm2 movss %xmm4, %xmm0 movaps %xmm0, -32 * SIZE(X) movaps -24 * SIZE(X), %xmm0 movss %xmm0, %xmm1 PSHUFD2($0x1b, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movaps %xmm1, %xmm4 movss %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(X) movaps -20 * SIZE(X), %xmm1 movss %xmm1, %xmm0 PSHUFD2($0x1b, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps %xmm0, %xmm2 movss %xmm4, %xmm0 movaps %xmm0, -24 * SIZE(X) movaps -16 * SIZE(X), %xmm0 movss %xmm0, %xmm1 PSHUFD2($0x1b, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movaps %xmm1, %xmm4 movss %xmm2, %xmm1 movaps %xmm1, -20 * SIZE(X) movaps -12 * SIZE(X), %xmm1 movss %xmm1, %xmm0 PSHUFD2($0x1b, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps %xmm0, %xmm2 movss %xmm4, %xmm0 movaps %xmm0, -16 * SIZE(X) movaps -8 * SIZE(X), %xmm0 movss %xmm0, %xmm1 PSHUFD2($0x1b, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movaps %xmm1, %xmm4 movss %xmm2, %xmm1 movaps %xmm1, -12 * SIZE(X) movaps -4 * SIZE(X), %xmm1 movss %xmm1, %xmm0 PSHUFD2($0x1b, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps %xmm0, %xmm2 movss %xmm4, %xmm0 movaps %xmm0, -8 * SIZE(X) movaps 0 * SIZE(X), %xmm0 movss %xmm0, %xmm1 PSHUFD2($0x1b, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movaps %xmm1, %xmm4 movss %xmm2, %xmm1 movaps %xmm1, -4 * SIZE(X) subl $-32 * SIZE, X ALIGN_4 .L135: testl $8, M je .L136 movaps -28 * SIZE(X), %xmm1 movss %xmm1, %xmm0 PSHUFD2($0x1b, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps %xmm0, %xmm2 movss %xmm4, %xmm0 movaps %xmm0, -32 * SIZE(X) movaps -24 * SIZE(X), %xmm0 movss %xmm0, %xmm1 PSHUFD2($0x1b, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movaps %xmm1, %xmm4 movss %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(X) movaps -20 * SIZE(X), %xmm1 movss %xmm1, %xmm0 PSHUFD2($0x1b, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps %xmm0, %xmm2 movss %xmm4, %xmm0 movaps %xmm0, -24 * SIZE(X) 
movaps -16 * SIZE(X), %xmm0 movss %xmm0, %xmm1 PSHUFD2($0x1b, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movaps %xmm1, %xmm4 movss %xmm2, %xmm1 movaps %xmm1, -20 * SIZE(X) addl $16 * SIZE, X ALIGN_3 .L136: testl $4, M je .L137 movaps -28 * SIZE(X), %xmm1 movss %xmm1, %xmm0 PSHUFD2($0x1b, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps %xmm0, %xmm2 movss %xmm4, %xmm0 movaps %xmm0, -32 * SIZE(X) movaps -24 * SIZE(X), %xmm0 movss %xmm0, %xmm1 PSHUFD2($0x1b, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movaps %xmm1, %xmm4 movss %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(X) addl $8 * SIZE, X ALIGN_3 .L137: testl $2, M je .L138 movaps -28 * SIZE(X), %xmm1 movss %xmm1, %xmm0 PSHUFD2($0x1b, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movaps %xmm0, %xmm2 movss %xmm4, %xmm0 movaps %xmm0, -32 * SIZE(X) movaps %xmm2, %xmm4 movaps %xmm1, %xmm0 addl $4 * SIZE, X ALIGN_3 .L138: movss %xmm4, -32 * SIZE(X) testl $1, M je .L999 PSHUFD2( $0x1b, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 PSHUFD1( $0x39, %xmm0) movlps %xmm0, -31 * SIZE(X) jmp .L999 ALIGN_3 #else PSHUFD2($0, %xmm0, %xmm6) PSHUFD2($0, %xmm1, %xmm1) subps %xmm1, %xmm7 unpcklps %xmm1, %xmm7 subl $-32 * SIZE, X testl $2 * SIZE, X je .L130x #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(X), %xmm0 PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movlps %xmm0, -32 * SIZE(X) addl $2 * SIZE, X decl M jle .L999 ALIGN_3 .L130x: movl M, I sarl $4, I jle .L135 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 movsd -28 * SIZE(X), %xmm1 movhps -26 * SIZE(X), %xmm1 movsd -24 * SIZE(X), %xmm2 movhps -22 * SIZE(X), %xmm2 movsd -20 * SIZE(X), %xmm3 movhps -18 * SIZE(X), %xmm3 decl I jle .L132 ALIGN_4 .L131: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movlps %xmm0, -32 * SIZE(X) movhps %xmm0, -30 * SIZE(X) movsd -16 * SIZE(X), %xmm0 movhps -14 * SIZE(X), %xmm0 PSHUFD2( $0xb1, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movlps %xmm1, -28 * SIZE(X) movhps %xmm1, -26 * SIZE(X) movsd -12 * SIZE(X), %xmm1 movhps -10 * SIZE(X), %xmm1 PSHUFD2( $0xb1, %xmm2, %xmm5) mulps %xmm6, %xmm2 mulps %xmm7, %xmm5 addps %xmm5, %xmm2 movlps %xmm2, -24 * SIZE(X) movhps %xmm2, -22 * SIZE(X) movsd -8 * SIZE(X), %xmm2 movhps -6 * SIZE(X), %xmm2 PSHUFD2( $0xb1, %xmm3, %xmm5) mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm5, %xmm3 movlps %xmm3, -20 * SIZE(X) movhps %xmm3, -18 * SIZE(X) movsd -4 * SIZE(X), %xmm3 movhps -2 * SIZE(X), %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movlps %xmm0, -16 * SIZE(X) movhps %xmm0, -14 * SIZE(X) movsd 0 * SIZE(X), %xmm0 movhps 2 * SIZE(X), %xmm0 PSHUFD2( $0xb1, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movlps %xmm1, -12 * SIZE(X) movhps %xmm1, -10 * SIZE(X) movsd 4 * SIZE(X), %xmm1 movhps 6 * SIZE(X), %xmm1 PSHUFD2( $0xb1, %xmm2, %xmm5) mulps %xmm6, %xmm2 mulps %xmm7, %xmm5 addps %xmm5, %xmm2 movlps %xmm2, -8 * SIZE(X) movhps %xmm2, -6 * SIZE(X) movsd 8 * SIZE(X), %xmm2 movhps 10 * SIZE(X), %xmm2 PSHUFD2( $0xb1, %xmm3, %xmm5) mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm5, %xmm3 movlps %xmm3, -4 * SIZE(X) movhps %xmm3, -2 * SIZE(X) movsd 12 * SIZE(X), %xmm3 movhps 14 * SIZE(X), %xmm3 subl 
$-32 * SIZE, X decl I jg .L131 ALIGN_4 .L132: PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movlps %xmm0, -32 * SIZE(X) movhps %xmm0, -30 * SIZE(X) movsd -16 * SIZE(X), %xmm0 movhps -14 * SIZE(X), %xmm0 PSHUFD2( $0xb1, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movlps %xmm1, -28 * SIZE(X) movhps %xmm1, -26 * SIZE(X) movsd -12 * SIZE(X), %xmm1 movhps -10 * SIZE(X), %xmm1 PSHUFD2( $0xb1, %xmm2, %xmm5) mulps %xmm6, %xmm2 mulps %xmm7, %xmm5 addps %xmm5, %xmm2 movlps %xmm2, -24 * SIZE(X) movhps %xmm2, -22 * SIZE(X) movsd -8 * SIZE(X), %xmm2 movhps -6 * SIZE(X), %xmm2 PSHUFD2( $0xb1, %xmm3, %xmm5) mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm5, %xmm3 movlps %xmm3, -20 * SIZE(X) movhps %xmm3, -18 * SIZE(X) movsd -4 * SIZE(X), %xmm3 movhps -2 * SIZE(X), %xmm3 PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movlps %xmm0, -16 * SIZE(X) movhps %xmm0, -14 * SIZE(X) PSHUFD2( $0xb1, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movlps %xmm1, -12 * SIZE(X) movhps %xmm1, -10 * SIZE(X) PSHUFD2( $0xb1, %xmm2, %xmm5) mulps %xmm6, %xmm2 mulps %xmm7, %xmm5 addps %xmm5, %xmm2 movlps %xmm2, -8 * SIZE(X) movhps %xmm2, -6 * SIZE(X) PSHUFD2( $0xb1, %xmm3, %xmm5) mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm5, %xmm3 movlps %xmm3, -4 * SIZE(X) movhps %xmm3, -2 * SIZE(X) subl $-32 * SIZE, X ALIGN_4 .L135: testl $8, M je .L136 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movlps %xmm0, -32 * SIZE(X) movhps %xmm0, -30 * SIZE(X) movsd -28 * SIZE(X), %xmm1 movhps -26 * SIZE(X), %xmm1 PSHUFD2( $0xb1, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movlps %xmm1, -28 * SIZE(X) movhps %xmm1, -26 * SIZE(X) movsd -24 * SIZE(X), %xmm2 movhps -22 * SIZE(X), %xmm2 PSHUFD2( $0xb1, %xmm2, %xmm5) mulps %xmm6, %xmm2 mulps %xmm7, %xmm5 addps %xmm5, %xmm2 movlps %xmm2, -24 * SIZE(X) movhps %xmm2, -22 * SIZE(X) movsd -20 * SIZE(X), %xmm3 movhps -18 * SIZE(X), %xmm3 PSHUFD2( $0xb1, %xmm3, %xmm5) mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm5, %xmm3 movlps %xmm3, -20 * SIZE(X) movhps %xmm3, -18 * SIZE(X) addl $16 * SIZE, X ALIGN_3 .L136: testl $4, M je .L137 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 movsd -28 * SIZE(X), %xmm1 movhps -26 * SIZE(X), %xmm1 PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movlps %xmm0, -32 * SIZE(X) movhps %xmm0, -30 * SIZE(X) PSHUFD2( $0xb1, %xmm1, %xmm5) mulps %xmm6, %xmm1 mulps %xmm7, %xmm5 addps %xmm5, %xmm1 movlps %xmm1, -28 * SIZE(X) movhps %xmm1, -26 * SIZE(X) addl $8 * SIZE, X ALIGN_3 .L137: testl $2, M je .L138 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movlps %xmm0, -32 * SIZE(X) movhps %xmm0, -30 * SIZE(X) addl $4 * SIZE, X ALIGN_3 .L138: testl $1, M je .L999 movsd -32 * SIZE(X), %xmm0 PSHUFD2( $0xb1, %xmm0, %xmm5) mulps %xmm6, %xmm0 mulps %xmm7, %xmm5 addps %xmm5, %xmm0 movlps %xmm0, -32 * SIZE(X) ALIGN_3 #endif .L999: xorl %eax, %eax popl %ebp popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zscal_sse2.S000066400000000000000000000775201313527062700172600ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
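For reference, the single-precision complex SCAL kernel that ends above (the single-precision counterpart of the zscal_sse2.S file whose header begins next) reduces to a few lines of portable C. The sketch below is illustrative only and is not part of the OpenBLAS sources: the function name cscal_ref and the simplified argument list (n, alpha_r, alpha_i, x, incx) are chosen for exposition and do not reproduce the kernel's real stack-based calling convention (the STACK_* macros). It shows the two paths visible in the assembly: when both components of alpha are zero the vector is simply cleared, otherwise every element is multiplied by alpha_r + i*alpha_i.

#include <stddef.h>

/* Illustrative C model of the complex SCAL kernel above.  Hypothetical
   signature; the real kernel reads its arguments from the stack as the
   STACK_* macros show.  x holds n complex numbers stored as (real, imag)
   pairs and incx is the stride counted in complex elements. */
static void cscal_ref(size_t n, float alpha_r, float alpha_i,
                      float *x, size_t incx)
{
    if (alpha_r == 0.0f && alpha_i == 0.0f) {
        /* Fast path taken by the kernel: just store zeros. */
        for (size_t i = 0; i < n; i++) {
            x[2 * i * incx + 0] = 0.0f;
            x[2 * i * incx + 1] = 0.0f;
        }
        return;
    }
    for (size_t i = 0; i < n; i++) {
        float re = x[2 * i * incx + 0];
        float im = x[2 * i * incx + 1];
        /* (re + i*im) * (alpha_r + i*alpha_i) */
        x[2 * i * incx + 0] = alpha_r * re - alpha_i * im;
        x[2 * i * incx + 1] = alpha_r * im + alpha_i * re;
    }
}

The double-precision file that follows computes the same thing with SSE2 pd instructions; everything else in both kernels is alignment handling, prefetching and loop unrolling around this one multiply-add pattern.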
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_ALPHA_R 16 + STACK + ARGS(%esp) #define STACK_ALPHA_I 24 + STACK + ARGS(%esp) #define STACK_X 32 + STACK + ARGS(%esp) #define STACK_INCX 36 + STACK + ARGS(%esp) #define M %ebx #define X %ecx #define INCX %edx #define I %esi #define XX %edi #define FLAG %ebp #include "l1param.h" #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE) #define USE_PSHUFD #else #define USE_PSHUFD_HALF #endif #define xmm8 xmm0 #define xmm9 xmm1 #define xmm10 xmm2 #define xmm11 xmm3 #define xmm12 xmm4 #define xmm13 xmm5 #define xmm14 xmm6 #define xmm15 xmm7 PROLOGUE PROFCODE pushl %edi pushl %esi pushl %ebx pushl %ebp movl STACK_M, M movl STACK_X, X movl STACK_INCX, INCX movsd STACK_ALPHA_R, %xmm0 movsd STACK_ALPHA_I, %xmm1 sall $ZBASE_SHIFT, INCX xor FLAG, FLAG testl M, M jle .L999 xorps %xmm7, %xmm7 comisd %xmm0, %xmm7 jne .L100 comisd %xmm1, %xmm7 jne .L100 /* Alpha == ZERO */ cmpl $2 * SIZE, INCX jne .L20 /* INCX == 1 */ testl $SIZE, X je .L05 movsd %xmm7, 0 * SIZE(X) addl $SIZE, X movl $1, FLAG decl M jle .L19 ALIGN_3 .L05: movl M, I # rcx = n sarl $3, I jle .L12 ALIGN_4 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm7, 0 * SIZE(X) movaps %xmm7, 2 * SIZE(X) movaps %xmm7, 4 * SIZE(X) movaps %xmm7, 6 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm7, 8 * SIZE(X) movaps %xmm7, 10 * SIZE(X) movaps %xmm7, 12 * SIZE(X) movaps %xmm7, 14 * SIZE(X) addl $16 * SIZE, X decl I jg .L11 ALIGN_4 .L12: testl $4, M je .L13 movaps %xmm7, 0 * SIZE(X) movaps %xmm7, 2 * SIZE(X) movaps %xmm7, 4 * SIZE(X) movaps %xmm7, 6 * SIZE(X) addl $8 * SIZE, X ALIGN_3 .L13: testl $2, M je .L14 movaps 
%xmm7, 0 * SIZE(X) movaps %xmm7, 2 * SIZE(X) addl $4 * SIZE, X ALIGN_3 .L14: testl $1, M je .L19 movaps %xmm7, 0 * SIZE(X) addl $2 * SIZE, X ALIGN_3 .L19: testl $1, FLAG je .L999 movsd %xmm7, 0 * SIZE(X) jmp .L999 ALIGN_4 /* incx != 1 */ .L20: testl $SIZE, X jne .L30 /* Aligned Mode */ movl M, I # rcx = n sarl $2, I jle .L22 ALIGN_4 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm7, (X) addl INCX, X movaps %xmm7, (X) addl INCX, X movaps %xmm7, (X) addl INCX, X movaps %xmm7, (X) addl INCX, X decl I jg .L21 ALIGN_4 .L22: testl $3, M je .L999 testl $2, M je .L23 movaps %xmm7, (X) addl INCX, X movaps %xmm7, (X) addl INCX, X ALIGN_3 .L23: testl $1, M je .L999 movaps %xmm7, (X) jmp .L999 ALIGN_4 /* Unaligned Mode */ .L30: movl M, I # rcx = n sarl $2, I jle .L32 ALIGN_4 .L31: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movlps %xmm7, 0 * SIZE(X) movlps %xmm7, 1 * SIZE(X) addl INCX, X movlps %xmm7, 0 * SIZE(X) movlps %xmm7, 1 * SIZE(X) addl INCX, X movlps %xmm7, 0 * SIZE(X) movlps %xmm7, 1 * SIZE(X) addl INCX, X movlps %xmm7, 0 * SIZE(X) movlps %xmm7, 1 * SIZE(X) addl INCX, X decl I jg .L31 ALIGN_4 .L32: testl $3, M je .L999 testl $2, M je .L33 movlps %xmm7, 0 * SIZE(X) movlps %xmm7, 1 * SIZE(X) addl INCX, X movlps %xmm7, 0 * SIZE(X) movlps %xmm7, 1 * SIZE(X) addl INCX, X ALIGN_3 .L33: testl $1, M je .L999 movlps %xmm7, 0 * SIZE(X) movlps %xmm7, 1 * SIZE(X) jmp .L999 ALIGN_4 /* Alpha != ZERO */ .L100: testl $SIZE, X jne .L200 #ifdef HAVE_SSE3 movddup %xmm0, %xmm6 #else pshufd $0x44, %xmm0, %xmm6 #endif xorps %xmm7, %xmm7 subsd %xmm1, %xmm7 movlhps %xmm1, %xmm7 cmpl $2 * SIZE, INCX jne .L120 subl $-16 * SIZE, X movl M, I sarl $3, I jle .L115 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 decl I jle .L112 ALIGN_4 .L111: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) pshufd $0x4e, %xmm0, %xmm5 #else movsd -15 * SIZE(X), %xmm5 movhps -16 * SIZE(X), %xmm5 #endif mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(X) movaps -8 * SIZE(X), %xmm0 #ifdef USE_PSHUFD pshufd $0x4e, %xmm1, %xmm5 #else movsd -13 * SIZE(X), %xmm5 movhps -14 * SIZE(X), %xmm5 #endif mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movaps %xmm1, -14 * SIZE(X) movaps -6 * SIZE(X), %xmm1 #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) pshufd $0x4e, %xmm2, %xmm5 #else movsd -11 * SIZE(X), %xmm5 movhps -12 * SIZE(X), %xmm5 #endif mulpd %xmm6, %xmm2 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 movaps %xmm2, -12 * SIZE(X) movaps -4 * SIZE(X), %xmm2 #ifdef USE_PSHUFD pshufd $0x4e, %xmm3, %xmm5 #else movsd -9 * SIZE(X), %xmm5 movhps -10 * SIZE(X), %xmm5 #endif mulpd %xmm6, %xmm3 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm3 movaps %xmm3, -10 * SIZE(X) movaps -2 * SIZE(X), %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) pshufd $0x4e, %xmm0, %xmm5 #else movsd -7 * SIZE(X), %xmm5 movhps -8 * SIZE(X), %xmm5 #endif mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movaps %xmm0, -8 * SIZE(X) movaps 0 * SIZE(X), %xmm0 #ifdef USE_PSHUFD pshufd $0x4e, %xmm1, %xmm5 #else movsd -5 * SIZE(X), %xmm5 movhps -6 * SIZE(X), %xmm5 #endif mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movaps %xmm1, -6 * SIZE(X) movaps 2 * SIZE(X), %xmm1 #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) pshufd $0x4e, %xmm2, %xmm5 #else movsd -3 * SIZE(X), 
%xmm5 movhps -4 * SIZE(X), %xmm5 #endif mulpd %xmm6, %xmm2 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 movaps %xmm2, -4 * SIZE(X) movaps 4 * SIZE(X), %xmm2 #ifdef USE_PSHUFD pshufd $0x4e, %xmm3, %xmm5 #else movsd -1 * SIZE(X), %xmm5 movhps -2 * SIZE(X), %xmm5 #endif mulpd %xmm6, %xmm3 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm3 movaps %xmm3, -2 * SIZE(X) movaps 6 * SIZE(X), %xmm3 subl $-16 * SIZE, X decl I jg .L111 ALIGN_4 .L112: #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) pshufd $0x4e, %xmm0, %xmm5 #else movsd -15 * SIZE(X), %xmm5 movhps -16 * SIZE(X), %xmm5 #endif mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(X) movaps -8 * SIZE(X), %xmm0 #ifdef USE_PSHUFD pshufd $0x4e, %xmm1, %xmm5 #else movsd -13 * SIZE(X), %xmm5 movhps -14 * SIZE(X), %xmm5 #endif mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movaps %xmm1, -14 * SIZE(X) movaps -6 * SIZE(X), %xmm1 #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) pshufd $0x4e, %xmm2, %xmm5 #else movsd -11 * SIZE(X), %xmm5 movhps -12 * SIZE(X), %xmm5 #endif mulpd %xmm6, %xmm2 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 movaps %xmm2, -12 * SIZE(X) movaps -4 * SIZE(X), %xmm2 #ifdef USE_PSHUFD pshufd $0x4e, %xmm3, %xmm5 #else movsd -9 * SIZE(X), %xmm5 movhps -10 * SIZE(X), %xmm5 #endif mulpd %xmm6, %xmm3 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm3 movaps %xmm3, -10 * SIZE(X) movaps -2 * SIZE(X), %xmm3 #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) pshufd $0x4e, %xmm0, %xmm5 #else movsd -7 * SIZE(X), %xmm5 movhps -8 * SIZE(X), %xmm5 #endif mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movaps %xmm0, -8 * SIZE(X) #ifdef USE_PSHUFD pshufd $0x4e, %xmm1, %xmm5 #else movsd -5 * SIZE(X), %xmm5 movhps -6 * SIZE(X), %xmm5 #endif mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movaps %xmm1, -6 * SIZE(X) #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) pshufd $0x4e, %xmm2, %xmm5 #else movsd -3 * SIZE(X), %xmm5 movhps -4 * SIZE(X), %xmm5 #endif mulpd %xmm6, %xmm2 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 movaps %xmm2, -4 * SIZE(X) #ifdef USE_PSHUFD pshufd $0x4e, %xmm3, %xmm5 #else movsd -1 * SIZE(X), %xmm5 movhps -2 * SIZE(X), %xmm5 #endif mulpd %xmm6, %xmm3 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm3 movaps %xmm3, -2 * SIZE(X) subl $-16 * SIZE, X ALIGN_3 .L115: testl $7, M je .L999 testl $4, M je .L116 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(X) pshufd $0x4e, %xmm1, %xmm5 mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movaps %xmm1, -14 * SIZE(X) movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 pshufd $0x4e, %xmm2, %xmm5 mulpd %xmm6, %xmm2 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 movaps %xmm2, -12 * SIZE(X) pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm6, %xmm3 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm3 movaps %xmm3, -10 * SIZE(X) addl $8 * SIZE, X ALIGN_3 .L116: testl $2, M je .L117 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(X) pshufd $0x4e, %xmm1, %xmm5 mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movaps %xmm1, -14 * SIZE(X) addl $4 * SIZE, X ALIGN_3 .L117: testl $1, M je .L999 movaps -16 * SIZE(X), %xmm0 pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movaps %xmm0, -16 * SIZE(X) jmp .L999 ALIGN_3 .L120: movl X, XX movl M, I sarl $3, I jle .L125 movaps (X), %xmm0 addl INCX, X movaps (X), %xmm1 addl INCX, X movaps (X), %xmm2 addl 
INCX, X movaps (X), %xmm3 addl INCX, X decl I jle .L122 ALIGN_4 .L121: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movaps %xmm0, (XX) addl INCX, XX movaps (X), %xmm0 addl INCX, X pshufd $0x4e, %xmm1, %xmm5 mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movaps %xmm1, (XX) addl INCX, XX movaps (X), %xmm1 addl INCX, X pshufd $0x4e, %xmm2, %xmm5 mulpd %xmm6, %xmm2 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 movaps %xmm2, (XX) addl INCX, XX movaps (X), %xmm2 addl INCX, X pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm6, %xmm3 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm3 movaps %xmm3, (XX) addl INCX, XX movaps (X), %xmm3 addl INCX, X #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movaps %xmm0, (XX) addl INCX, XX movaps (X), %xmm0 addl INCX, X pshufd $0x4e, %xmm1, %xmm5 mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movaps %xmm1, (XX) addl INCX, XX movaps (X), %xmm1 addl INCX, X pshufd $0x4e, %xmm2, %xmm5 mulpd %xmm6, %xmm2 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 movaps %xmm2, (XX) addl INCX, XX movaps (X), %xmm2 addl INCX, X pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm6, %xmm3 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm3 movaps %xmm3, (XX) addl INCX, XX movaps (X), %xmm3 addl INCX, X decl I jg .L121 ALIGN_4 .L122: pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movaps %xmm0, (XX) addl INCX, XX movaps (X), %xmm0 addl INCX, X pshufd $0x4e, %xmm1, %xmm5 mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movaps %xmm1, (XX) addl INCX, XX movaps (X), %xmm1 addl INCX, X pshufd $0x4e, %xmm2, %xmm5 mulpd %xmm6, %xmm2 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 movaps %xmm2, (XX) addl INCX, XX movaps (X), %xmm2 addl INCX, X pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm6, %xmm3 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm3 movaps %xmm3, (XX) addl INCX, XX movaps (X), %xmm3 addl INCX, X pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movaps %xmm0, (XX) addl INCX, XX pshufd $0x4e, %xmm1, %xmm5 mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movaps %xmm1, (XX) addl INCX, XX pshufd $0x4e, %xmm2, %xmm5 mulpd %xmm6, %xmm2 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 movaps %xmm2, (XX) addl INCX, XX pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm6, %xmm3 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm3 movaps %xmm3, (XX) addl INCX, XX ALIGN_3 .L125: testl $7, M je .L999 testl $4, M je .L126 movaps (X), %xmm0 addl INCX, X movaps (X), %xmm1 addl INCX, X movaps (X), %xmm2 addl INCX, X movaps (X), %xmm3 addl INCX, X pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movaps %xmm0, (XX) addl INCX, XX pshufd $0x4e, %xmm1, %xmm5 mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movaps %xmm1, (XX) addl INCX, XX pshufd $0x4e, %xmm2, %xmm5 mulpd %xmm6, %xmm2 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 movaps %xmm2, (XX) addl INCX, XX pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm6, %xmm3 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm3 movaps %xmm3, (XX) addl INCX, XX ALIGN_3 .L126: testl $2, M je .L127 movaps (X), %xmm0 addl INCX, X movaps (X), %xmm1 addl INCX, X pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movaps %xmm0, (XX) addl INCX, XX pshufd $0x4e, %xmm1, %xmm5 mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movaps %xmm1, (XX) addl INCX, XX ALIGN_3 .L127: testl $1, M je .L999 movaps (X), %xmm0 pshufd $0x4e, %xmm0, %xmm5 mulpd 
%xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movaps %xmm0, (XX) jmp .L999 ALIGN_3 .L200: cmpl $2 * SIZE, INCX jne .L220 #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) #ifdef HAVE_SSE3 movddup %xmm0, %xmm6 #else pshufd $0x44, %xmm0, %xmm6 #endif pxor %xmm7, %xmm7 subsd %xmm1, %xmm7 movlhps %xmm1, %xmm7 shufpd $1, %xmm7, %xmm7 movhps 0 * SIZE(X), %xmm0 movaps 1 * SIZE(X), %xmm1 subl $-16 * SIZE, X unpckhpd %xmm0, %xmm0 mulsd %xmm6, %xmm0 movaps %xmm1, %xmm5 mulsd %xmm7, %xmm5 subsd %xmm5, %xmm0 movlps %xmm0, -16 * SIZE(X) decl M movl M, I sarl $3, I jle .L205 movaps -13 * SIZE(X), %xmm2 movaps -11 * SIZE(X), %xmm3 decl I jle .L202 ALIGN_4 .L201: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm1, %xmm5 SHUFPD_1 %xmm2, %xmm0 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm0 addpd %xmm5, %xmm0 movaps %xmm0, -15 * SIZE(X) movaps -9 * SIZE(X), %xmm0 movaps %xmm2, %xmm5 SHUFPD_1 %xmm3, %xmm1 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm1 addpd %xmm5, %xmm1 movaps %xmm1, -13 * SIZE(X) movaps -7 * SIZE(X), %xmm1 movaps %xmm3, %xmm5 SHUFPD_1 %xmm0, %xmm2 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm2 addpd %xmm5, %xmm2 movaps %xmm2, -11 * SIZE(X) movaps -5 * SIZE(X), %xmm2 movaps %xmm0, %xmm5 SHUFPD_1 %xmm1, %xmm3 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm3 addpd %xmm5, %xmm3 movaps %xmm3, -9 * SIZE(X) movaps -3 * SIZE(X), %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm1, %xmm5 SHUFPD_1 %xmm2, %xmm0 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm0 addpd %xmm5, %xmm0 movaps %xmm0, -7 * SIZE(X) movaps -1 * SIZE(X), %xmm0 movaps %xmm2, %xmm5 SHUFPD_1 %xmm3, %xmm1 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm1 addpd %xmm5, %xmm1 movaps %xmm1, -5 * SIZE(X) movaps 1 * SIZE(X), %xmm1 movaps %xmm3, %xmm5 SHUFPD_1 %xmm0, %xmm2 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm2 addpd %xmm5, %xmm2 movaps %xmm2, -3 * SIZE(X) movaps 3 * SIZE(X), %xmm2 movaps %xmm0, %xmm5 SHUFPD_1 %xmm1, %xmm3 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm3 addpd %xmm5, %xmm3 movaps %xmm3, -1 * SIZE(X) movaps 5 * SIZE(X), %xmm3 subl $-16 * SIZE, X decl I jg .L201 ALIGN_4 .L202: movaps %xmm1, %xmm5 SHUFPD_1 %xmm2, %xmm0 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm0 addpd %xmm5, %xmm0 movaps %xmm0, -15 * SIZE(X) movaps -9 * SIZE(X), %xmm0 movaps %xmm2, %xmm5 SHUFPD_1 %xmm3, %xmm1 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm1 addpd %xmm5, %xmm1 movaps %xmm1, -13 * SIZE(X) movaps -7 * SIZE(X), %xmm1 movaps %xmm3, %xmm5 SHUFPD_1 %xmm0, %xmm2 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm2 addpd %xmm5, %xmm2 movaps %xmm2, -11 * SIZE(X) movaps -5 * SIZE(X), %xmm2 movaps %xmm0, %xmm5 SHUFPD_1 %xmm1, %xmm3 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm3 addpd %xmm5, %xmm3 movaps %xmm3, -9 * SIZE(X) movaps -3 * SIZE(X), %xmm3 movaps %xmm1, %xmm5 SHUFPD_1 %xmm2, %xmm0 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm0 addpd %xmm5, %xmm0 movaps %xmm0, -7 * SIZE(X) movaps -1 * SIZE(X), %xmm0 movaps %xmm2, %xmm5 SHUFPD_1 %xmm3, %xmm1 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm1 addpd %xmm5, %xmm1 movaps %xmm1, -5 * SIZE(X) movaps 1 * SIZE(X), %xmm1 movaps %xmm3, %xmm5 SHUFPD_1 %xmm0, %xmm2 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm2 addpd %xmm5, %xmm2 movaps %xmm2, -3 * SIZE(X) movaps %xmm0, %xmm5 SHUFPD_1 %xmm1, %xmm3 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm3 addpd %xmm5, %xmm3 movaps %xmm3, -1 * SIZE(X) subl $-16 * SIZE, X ALIGN_3 .L205: testl $4, M je .L206 movaps -13 * SIZE(X), %xmm2 movaps %xmm1, %xmm5 SHUFPD_1 %xmm2, %xmm0 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm0 addpd %xmm5, %xmm0 movaps %xmm0, -15 * SIZE(X) movaps -11 * SIZE(X), %xmm3 movaps %xmm2, %xmm5 SHUFPD_1 %xmm3, %xmm1 
mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm1 addpd %xmm5, %xmm1 movaps %xmm1, -13 * SIZE(X) movaps -9 * SIZE(X), %xmm0 movaps %xmm3, %xmm5 SHUFPD_1 %xmm0, %xmm2 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm2 addpd %xmm5, %xmm2 movaps %xmm2, -11 * SIZE(X) movaps -7 * SIZE(X), %xmm1 movaps %xmm0, %xmm5 SHUFPD_1 %xmm1, %xmm3 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm3 addpd %xmm5, %xmm3 movaps %xmm3, -9 * SIZE(X) addl $8 * SIZE, X ALIGN_3 .L206: testl $2, M je .L207 movaps -13 * SIZE(X), %xmm2 movaps %xmm1, %xmm5 SHUFPD_1 %xmm2, %xmm0 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm0 addpd %xmm5, %xmm0 movaps %xmm0, -15 * SIZE(X) movaps -11 * SIZE(X), %xmm3 movaps %xmm2, %xmm5 SHUFPD_1 %xmm3, %xmm1 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm1 addpd %xmm5, %xmm1 movaps %xmm1, -13 * SIZE(X) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addl $4 * SIZE, X ALIGN_3 .L207: testl $1, M je .L208 movaps -13 * SIZE(X), %xmm2 movaps %xmm1, %xmm5 SHUFPD_1 %xmm2, %xmm0 mulpd %xmm6, %xmm5 mulpd %xmm7, %xmm0 addpd %xmm5, %xmm0 movaps %xmm0, -15 * SIZE(X) movaps %xmm1, %xmm0 movaps %xmm2, %xmm1 addl $2 * SIZE, X ALIGN_3 .L208: unpckhpd %xmm0, %xmm0 mulsd %xmm6, %xmm1 mulsd %xmm7, %xmm0 addsd %xmm1, %xmm0 movlps %xmm0, -15 * SIZE(X) jmp .L999 ALIGN_3 #else #ifdef HAVE_SSE3 movddup %xmm0, %xmm6 #else pshufd $0x44, %xmm0, %xmm6 #endif pxor %xmm7, %xmm7 subsd %xmm1, %xmm7 movlhps %xmm1, %xmm7 subl $-16 * SIZE, X movl M, I sarl $3, I jle .L205 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 movsd -14 * SIZE(X), %xmm1 movhps -13 * SIZE(X), %xmm1 movsd -12 * SIZE(X), %xmm2 movhps -11 * SIZE(X), %xmm2 movsd -10 * SIZE(X), %xmm3 movhps -9 * SIZE(X), %xmm3 decl I jle .L202 ALIGN_4 .L201: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movlps %xmm0, -16 * SIZE(X) movhps %xmm0, -15 * SIZE(X) movsd -8 * SIZE(X), %xmm0 movhps -7 * SIZE(X), %xmm0 pshufd $0x4e, %xmm1, %xmm5 mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movlps %xmm1, -14 * SIZE(X) movhps %xmm1, -13 * SIZE(X) movsd -6 * SIZE(X), %xmm1 movhps -5 * SIZE(X), %xmm1 pshufd $0x4e, %xmm2, %xmm5 mulpd %xmm6, %xmm2 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 movlps %xmm2, -12 * SIZE(X) movhps %xmm2, -11 * SIZE(X) movsd -4 * SIZE(X), %xmm2 movhps -3 * SIZE(X), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm6, %xmm3 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm3 movlps %xmm3, -10 * SIZE(X) movhps %xmm3, -9 * SIZE(X) movsd -2 * SIZE(X), %xmm3 movhps -1 * SIZE(X), %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movlps %xmm0, -8 * SIZE(X) movhps %xmm0, -7 * SIZE(X) movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 pshufd $0x4e, %xmm1, %xmm5 mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movlps %xmm1, -6 * SIZE(X) movhps %xmm1, -5 * SIZE(X) movsd 2 * SIZE(X), %xmm1 movhps 3 * SIZE(X), %xmm1 pshufd $0x4e, %xmm2, %xmm5 mulpd %xmm6, %xmm2 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 movlps %xmm2, -4 * SIZE(X) movhps %xmm2, -3 * SIZE(X) movsd 4 * SIZE(X), %xmm2 movhps 5 * SIZE(X), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm6, %xmm3 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm3 movlps %xmm3, -2 * SIZE(X) movhps %xmm3, -1 * SIZE(X) movsd 6 * SIZE(X), %xmm3 movhps 7 * SIZE(X), %xmm3 subl $-16 * SIZE, X decl I jg .L201 ALIGN_4 .L202: pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movlps %xmm0, -16 * SIZE(X) movhps %xmm0, -15 * SIZE(X) movsd -8 * SIZE(X), %xmm0 movhps -7 * SIZE(X), %xmm0 
pshufd $0x4e, %xmm1, %xmm5 mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movlps %xmm1, -14 * SIZE(X) movhps %xmm1, -13 * SIZE(X) movsd -6 * SIZE(X), %xmm1 movhps -5 * SIZE(X), %xmm1 pshufd $0x4e, %xmm2, %xmm5 mulpd %xmm6, %xmm2 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 movlps %xmm2, -12 * SIZE(X) movhps %xmm2, -11 * SIZE(X) movsd -4 * SIZE(X), %xmm2 movhps -3 * SIZE(X), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm6, %xmm3 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm3 movlps %xmm3, -10 * SIZE(X) movhps %xmm3, -9 * SIZE(X) movsd -2 * SIZE(X), %xmm3 movhps -1 * SIZE(X), %xmm3 pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movlps %xmm0, -8 * SIZE(X) movhps %xmm0, -7 * SIZE(X) pshufd $0x4e, %xmm1, %xmm5 mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movlps %xmm1, -6 * SIZE(X) movhps %xmm1, -5 * SIZE(X) pshufd $0x4e, %xmm2, %xmm5 mulpd %xmm6, %xmm2 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 movlps %xmm2, -4 * SIZE(X) movhps %xmm2, -3 * SIZE(X) pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm6, %xmm3 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm3 movlps %xmm3, -2 * SIZE(X) movhps %xmm3, -1 * SIZE(X) subl $-16 * SIZE, X ALIGN_3 .L205: testl $7, M je .L999 testl $4, M je .L206 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 movsd -14 * SIZE(X), %xmm1 movhps -13 * SIZE(X), %xmm1 pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movlps %xmm0, -16 * SIZE(X) movhps %xmm0, -15 * SIZE(X) pshufd $0x4e, %xmm1, %xmm5 mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movlps %xmm1, -14 * SIZE(X) movhps %xmm1, -13 * SIZE(X) movsd -12 * SIZE(X), %xmm2 movhps -11 * SIZE(X), %xmm2 movsd -10 * SIZE(X), %xmm3 movhps -9 * SIZE(X), %xmm3 pshufd $0x4e, %xmm2, %xmm5 mulpd %xmm6, %xmm2 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 movlps %xmm2, -12 * SIZE(X) movhps %xmm2, -11 * SIZE(X) pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm6, %xmm3 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm3 movlps %xmm3, -10 * SIZE(X) movhps %xmm3, -9 * SIZE(X) addl $8 * SIZE, X ALIGN_3 .L206: testl $2, M je .L207 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movlps %xmm0, -16 * SIZE(X) movhps %xmm0, -15 * SIZE(X) movsd -14 * SIZE(X), %xmm1 movhps -13 * SIZE(X), %xmm1 pshufd $0x4e, %xmm1, %xmm5 mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movlps %xmm1, -14 * SIZE(X) movhps %xmm1, -13 * SIZE(X) addl $4 * SIZE, X ALIGN_3 .L207: testl $1, M je .L999 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movlps %xmm0, -16 * SIZE(X) movhps %xmm0, -15 * SIZE(X) jmp .L999 ALIGN_3 #endif .L220: #ifdef HAVE_SSE3 movddup %xmm0, %xmm6 #else pshufd $0x44, %xmm0, %xmm6 #endif pxor %xmm7, %xmm7 subsd %xmm1, %xmm7 movlhps %xmm1, %xmm7 movl X, XX movl M, I sarl $3, I jle .L225 movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 addl INCX, X movsd 0 * SIZE(X), %xmm1 movhps 1 * SIZE(X), %xmm1 addl INCX, X movsd 0 * SIZE(X), %xmm2 movhps 1 * SIZE(X), %xmm2 addl INCX, X movsd 0 * SIZE(X), %xmm3 movhps 1 * SIZE(X), %xmm3 addl INCX, X decl I jle .L222 ALIGN_4 .L221: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movlps %xmm0, 0 * SIZE(XX) movhps %xmm0, 1 * SIZE(XX) addl INCX, XX movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 addl INCX, X pshufd $0x4e, %xmm1, %xmm5 mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movlps 
%xmm1, 0 * SIZE(XX) movhps %xmm1, 1 * SIZE(XX) addl INCX, XX movsd 0 * SIZE(X), %xmm1 movhps 1 * SIZE(X), %xmm1 addl INCX, X pshufd $0x4e, %xmm2, %xmm5 mulpd %xmm6, %xmm2 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 movlps %xmm2, 0 * SIZE(XX) movhps %xmm2, 1 * SIZE(XX) addl INCX, XX movsd 0 * SIZE(X), %xmm2 movhps 1 * SIZE(X), %xmm2 addl INCX, X pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm6, %xmm3 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm3 movlps %xmm3, 0 * SIZE(XX) movhps %xmm3, 1 * SIZE(XX) addl INCX, XX movsd 0 * SIZE(X), %xmm3 movhps 1 * SIZE(X), %xmm3 addl INCX, X #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movlps %xmm0, 0 * SIZE(XX) movhps %xmm0, 1 * SIZE(XX) addl INCX, XX movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 addl INCX, X pshufd $0x4e, %xmm1, %xmm5 mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movlps %xmm1, 0 * SIZE(XX) movhps %xmm1, 1 * SIZE(XX) addl INCX, XX movsd 0 * SIZE(X), %xmm1 movhps 1 * SIZE(X), %xmm1 addl INCX, X pshufd $0x4e, %xmm2, %xmm5 mulpd %xmm6, %xmm2 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 movlps %xmm2, 0 * SIZE(XX) movhps %xmm2, 1 * SIZE(XX) addl INCX, XX movsd 0 * SIZE(X), %xmm2 movhps 1 * SIZE(X), %xmm2 addl INCX, X pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm6, %xmm3 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm3 movlps %xmm3, 0 * SIZE(XX) movhps %xmm3, 1 * SIZE(XX) addl INCX, XX movsd 0 * SIZE(X), %xmm3 movhps 1 * SIZE(X), %xmm3 addl INCX, X decl I jg .L221 ALIGN_4 .L222: pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movlps %xmm0, 0 * SIZE(XX) movhps %xmm0, 1 * SIZE(XX) addl INCX, XX movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 addl INCX, X pshufd $0x4e, %xmm1, %xmm5 mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movlps %xmm1, 0 * SIZE(XX) movhps %xmm1, 1 * SIZE(XX) addl INCX, XX movsd 0 * SIZE(X), %xmm1 movhps 1 * SIZE(X), %xmm1 addl INCX, X pshufd $0x4e, %xmm2, %xmm5 mulpd %xmm6, %xmm2 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 movlps %xmm2, 0 * SIZE(XX) movhps %xmm2, 1 * SIZE(XX) addl INCX, XX movsd 0 * SIZE(X), %xmm2 movhps 1 * SIZE(X), %xmm2 addl INCX, X pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm6, %xmm3 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm3 movlps %xmm3, 0 * SIZE(XX) movhps %xmm3, 1 * SIZE(XX) addl INCX, XX movsd 0 * SIZE(X), %xmm3 movhps 1 * SIZE(X), %xmm3 addl INCX, X pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movlps %xmm0, 0 * SIZE(XX) movhps %xmm0, 1 * SIZE(XX) addl INCX, XX pshufd $0x4e, %xmm1, %xmm5 mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movlps %xmm1, 0 * SIZE(XX) movhps %xmm1, 1 * SIZE(XX) addl INCX, XX pshufd $0x4e, %xmm2, %xmm5 mulpd %xmm6, %xmm2 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 movlps %xmm2, 0 * SIZE(XX) movhps %xmm2, 1 * SIZE(XX) addl INCX, XX pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm6, %xmm3 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm3 movlps %xmm3, 0 * SIZE(XX) movhps %xmm3, 1 * SIZE(XX) addl INCX, XX ALIGN_3 .L225: testl $7, M je .L999 testl $4, M je .L226 movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 addl INCX, X pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movlps %xmm0, 0 * SIZE(XX) movhps %xmm0, 1 * SIZE(XX) addl INCX, XX movsd 0 * SIZE(X), %xmm1 movhps 1 * SIZE(X), %xmm1 addl INCX, X pshufd $0x4e, %xmm1, %xmm5 mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movlps %xmm1, 0 * SIZE(XX) movhps %xmm1, 1 * SIZE(XX) addl INCX, XX movsd 0 * SIZE(X), %xmm2 movhps 1 * SIZE(X), %xmm2 addl INCX, X pshufd 
$0x4e, %xmm2, %xmm5 mulpd %xmm6, %xmm2 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 movlps %xmm2, 0 * SIZE(XX) movhps %xmm2, 1 * SIZE(XX) addl INCX, XX movsd 0 * SIZE(X), %xmm3 movhps 1 * SIZE(X), %xmm3 addl INCX, X pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm6, %xmm3 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm3 movlps %xmm3, 0 * SIZE(XX) movhps %xmm3, 1 * SIZE(XX) addl INCX, XX ALIGN_3 .L226: testl $2, M je .L227 movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 addl INCX, X pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movlps %xmm0, 0 * SIZE(XX) movhps %xmm0, 1 * SIZE(XX) addl INCX, XX movsd 0 * SIZE(X), %xmm1 movhps 1 * SIZE(X), %xmm1 addl INCX, X pshufd $0x4e, %xmm1, %xmm5 mulpd %xmm6, %xmm1 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm1 movlps %xmm1, 0 * SIZE(XX) movhps %xmm1, 1 * SIZE(XX) addl INCX, XX ALIGN_3 .L227: testl $1, M je .L999 movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 pshufd $0x4e, %xmm0, %xmm5 mulpd %xmm6, %xmm0 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm0 movlps %xmm0, 0 * SIZE(XX) movhps %xmm0, 1 * SIZE(XX) ALIGN_3 .L999: xorl %eax, %eax popl %ebp popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zswap.S000066400000000000000000000136401313527062700163450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
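The recurring pshufd $0x4e / mulpd / addpd triple in the zscal_sse2.S kernel above is the complex multiply written with a 64-bit lane swap. A minimal SSE2 intrinsics sketch of the per-element arithmetic, assuming x points at one (real, imag) double pair, is shown below; the function name zscal_one is illustrative and not an OpenBLAS symbol.

#include <emmintrin.h>

/* Scale one complex double in place: (re, im) *= (alpha_r + i*alpha_i).
   Mirrors the kernel's register usage: xmm6 = { alpha_r, alpha_r },
   xmm7 = { -alpha_i, alpha_i }, and pshufd $0x4e = swap of the halves. */
static void zscal_one(double *x, double alpha_r, double alpha_i)
{
    __m128d vr = _mm_set1_pd(alpha_r);           /* { alpha_r, alpha_r }            */
    __m128d vi = _mm_set_pd(alpha_i, -alpha_i);  /* { -alpha_i (lo), alpha_i (hi) } */
    __m128d v  = _mm_loadu_pd(x);                /* { re, im }                      */
    __m128d s  = _mm_shuffle_pd(v, v, 1);        /* { im, re } -- the 0x4e swap     */
    /* { alpha_r*re - alpha_i*im, alpha_r*im + alpha_i*re } */
    _mm_storeu_pd(x, _mm_add_pd(_mm_mul_pd(v, vr), _mm_mul_pd(s, vi)));
}

The kernel keeps the two constant registers live across a loop that handles eight complex elements per iteration in the aligned path, uses movaps loads whenever the pointer allows and prefetches ahead with PREFETCHW, but the arithmetic per element is exactly this.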
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define N 4 + STACK + ARGS(%esp) #ifdef XDOUBLE #define X 48 + STACK + ARGS(%esp) #define INCX 52 + STACK + ARGS(%esp) #define Y 56 + STACK + ARGS(%esp) #define INCY 60 + STACK + ARGS(%esp) #elif defined(DOUBLE) #define X 32 + STACK + ARGS(%esp) #define INCX 36 + STACK + ARGS(%esp) #define Y 40 + STACK + ARGS(%esp) #define INCY 44 + STACK + ARGS(%esp) #else #define X 24 + STACK + ARGS(%esp) #define INCX 28 + STACK + ARGS(%esp) #define Y 32 + STACK + ARGS(%esp) #define INCY 36 + STACK + ARGS(%esp) #endif PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) EMMS #endif movl N, %edx movl X, %esi movl Y, %edi movl INCX, %ebx movl INCY, %ecx sall $ZBASE_SHIFT, %ebx sall $ZBASE_SHIFT, %ecx cmpl $2 * SIZE, %ebx jne .L14 cmpl $2 * SIZE, %ecx jne .L14 movl %edx, %eax sarl $1, %eax jle .L15 ALIGN_3 .L16: #if defined(DOUBLE) || defined(XDOUBLE) FLD 3 * SIZE(%esi) FLD 2 * SIZE(%esi) FLD 1 * SIZE(%esi) FLD 0 * SIZE(%esi) FLD 3 * SIZE(%edi) FLD 2 * SIZE(%edi) FLD 1 * SIZE(%edi) FLD 0 * SIZE(%edi) FST 0 * SIZE(%esi) FST 1 * SIZE(%esi) FST 2 * SIZE(%esi) FST 3 * SIZE(%esi) FST 0 * SIZE(%edi) FST 1 * SIZE(%edi) FST 2 * SIZE(%edi) FST 3 * SIZE(%edi) #else fldl 2 * SIZE(%esi) fldl 0 * SIZE(%esi) fldl 2 * SIZE(%edi) fldl 0 * SIZE(%edi) fstpl 0 * SIZE(%esi) fstpl 2 * SIZE(%esi) fstpl 0 * SIZE(%edi) fstpl 2 * SIZE(%edi) #endif addl $4 * SIZE, %esi addl $4 * SIZE, %edi decl %eax jg .L16 ALIGN_3 .L15: movl %edx, %eax andl $1, %eax jle .L27 ALIGN_3 .L22: #if defined(DOUBLE) || defined(XDOUBLE) FLD 1 * SIZE(%esi) FLD 0 * SIZE(%esi) FLD 1 * SIZE(%edi) FLD 0 * SIZE(%edi) FST 0 * SIZE(%esi) FST 1 * SIZE(%esi) FST 0 * SIZE(%edi) FST 1 * SIZE(%edi) #else fldl 0 * SIZE(%esi) fldl 0 * SIZE(%edi) fstpl 0 * SIZE(%esi) fstpl 0 * SIZE(%edi) #endif jmp .L27 ALIGN_3 /* INCX != 1 or INCY != 1 */ .L14: movl %edx, %eax sarl $1, %eax jle .L28 ALIGN_2 .L29: #if defined(DOUBLE) || defined(XDOUBLE) FLD 1 * SIZE(%esi) FLD 0 * SIZE(%esi) addl %ebx, %esi FLD 1 * SIZE(%esi) FLD 0 * SIZE(%esi) FLD 1 * SIZE(%edi) FLD 0 * SIZE(%edi) addl %ecx, %edi FLD 1 * SIZE(%edi) FLD 0 * SIZE(%edi) FST 0 * SIZE(%esi) FST 1 * SIZE(%esi) subl %ebx, %esi FST 0 * SIZE(%esi) FST 1 * SIZE(%esi) leal (%esi, %ebx, 2), %esi FST 0 * SIZE(%edi) FST 1 * SIZE(%edi) subl %ecx, %edi FST 0 * SIZE(%edi) FST 1 * SIZE(%edi) leal (%edi, %ecx, 2), %edi #else fldl 0 * SIZE(%esi) addl %ebx, %esi fldl 0 * SIZE(%esi) fldl 0 * SIZE(%edi) addl %ecx, %edi fldl 0 * SIZE(%edi) fstpl 0 * SIZE(%esi) subl %ebx, %esi fstpl 0 * SIZE(%esi) leal (%esi, %ebx, 2), %esi fstpl 0 * SIZE(%edi) subl %ecx, %edi fstpl 0 * SIZE(%edi) leal (%edi, %ecx, 2), %edi #endif decl %eax jg .L29 ALIGN_3 .L28: movl %edx, %eax andl $1, %eax jle .L27 ALIGN_3 .L35: #if defined(DOUBLE) || defined(XDOUBLE) FLD 1 * SIZE(%esi) FLD 0 * SIZE(%esi) FLD 1 * SIZE(%edi) FLD 0 * SIZE(%edi) FST 0 * SIZE(%esi) FST 1 * SIZE(%esi) FST 0 * SIZE(%edi) FST 1 * SIZE(%edi) #else fldl 0 * SIZE(%esi) fldl 0 * SIZE(%edi) fstpl 0 * SIZE(%esi) fstpl 0 * SIZE(%edi) #endif ALIGN_3 .L27: xorl %eax,%eax popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zswap_sse.S000066400000000000000000000501541313527062700172200ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
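The zswap.S kernel that ends above and the zswap_sse.S kernel that follows implement the complex SWAP operation. A plain-C sketch of the intended effect, with an illustrative signature (zswap_ref) that keeps only the parameters these kernels actually use, is:

#include <stddef.h>

/* Exchange n complex elements of x and y.  Both arrays store (real, imag)
   pairs; incx and incy are strides counted in complex elements.  The
   signature is hypothetical -- the real kernels read their arguments from
   the stack as the X/INCX/Y/INCY macros show.  Shown for double; the
   single-precision SSE variant differs only in the element type. */
static void zswap_ref(size_t n, double *x, size_t incx,
                      double *y, size_t incy)
{
    for (size_t i = 0; i < n; i++) {
        double tr = x[2 * i * incx + 0];
        double ti = x[2 * i * incx + 1];
        x[2 * i * incx + 0] = y[2 * i * incy + 0];
        x[2 * i * incx + 1] = y[2 * i * incy + 1];
        y[2 * i * incy + 0] = tr;
        y[2 * i * incy + 1] = ti;
    }
}

The rest of the two files is data movement: the x87 version cycles values through the FPU stack two complex elements at a time, while zswap_sse.S spends most of its length on the different relative alignments of X and Y (the shufps $0x39 / $0x93 and SHUFPD_1 ladders) so that the bulk of the exchange can still use 16-byte movaps accesses.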
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 24 + STACK + ARGS(%esp) #define STACK_INCX 28 + STACK + ARGS(%esp) #define STACK_Y 32 + STACK + ARGS(%esp) #define STACK_INCY 36 + STACK + ARGS(%esp) #define M %edx #define X %esi #define Y %edi #define INCX %ebx #define INCY %ecx #include "l1param.h" PROLOGUE PROFCODE pushl %ebp pushl %edi pushl %esi pushl %ebx movl STACK_M, M movl STACK_X, X movl STACK_Y, Y movl STACK_INCX, INCX movl STACK_INCY, INCY sall $ZBASE_SHIFT, INCX sall $ZBASE_SHIFT, INCY testl M, M jle .L19 cmpl $2 * SIZE, INCX jne .L50 cmpl $2 * SIZE, INCY jne .L50 addl M, M subl $-32 * SIZE, X subl $-32 * SIZE, Y cmpl $3, M jle .L16 testl $SIZE, Y je .L05 movss -32 * SIZE(X), %xmm0 movss -32 * SIZE(Y), %xmm1 movss %xmm1, -32 * SIZE(X) movss %xmm0, -32 * SIZE(Y) addl $1 * SIZE, X addl $1 * SIZE, Y decl M ALIGN_3 .L05: testl $2 * SIZE, Y je .L10 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm1 movlps %xmm1, -32 * SIZE(X) movlps %xmm0, -32 * SIZE(Y) addl $2 * SIZE, X addl $2 * SIZE, Y subl $2, M jle .L19 ALIGN_3 .L10: cmpl $3, M jle .L16 testl $2 * SIZE, X jne .L30 testl $1 * SIZE, X jne .L20 movl M, %eax sarl $5, %eax jle .L13 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -32 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -32 * SIZE(X) movaps -28 * SIZE(X), %xmm0 movaps -28 * SIZE(Y), %xmm1 movaps %xmm0, -28 * SIZE(Y) movaps %xmm1, -28 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -24 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movaps %xmm0, -24 * SIZE(Y) movaps %xmm1, -24 * SIZE(X) movaps -20 * SIZE(X), %xmm0 movaps -20 * SIZE(Y), %xmm1 movaps %xmm0, -20 * SIZE(Y) 
movaps %xmm1, -20 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -12 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 movaps %xmm0, -12 * SIZE(Y) movaps %xmm1, -12 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -8 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 movaps %xmm0, -8 * SIZE(Y) movaps %xmm1, -8 * SIZE(X) movaps -4 * SIZE(X), %xmm0 movaps -4 * SIZE(Y), %xmm1 movaps %xmm0, -4 * SIZE(Y) movaps %xmm1, -4 * SIZE(X) subl $-32 * SIZE, Y subl $-32 * SIZE, X decl %eax jg .L11 ALIGN_3 .L13: testl $16, M jle .L14 movaps -32 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -32 * SIZE(X) movaps -28 * SIZE(X), %xmm0 movaps -28 * SIZE(Y), %xmm1 movaps %xmm0, -28 * SIZE(Y) movaps %xmm1, -28 * SIZE(X) movaps -24 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movaps %xmm0, -24 * SIZE(Y) movaps %xmm1, -24 * SIZE(X) movaps -20 * SIZE(X), %xmm0 movaps -20 * SIZE(Y), %xmm1 movaps %xmm0, -20 * SIZE(Y) movaps %xmm1, -20 * SIZE(X) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L14: testl $8, M jle .L15 movaps -32 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -32 * SIZE(X) movaps -28 * SIZE(X), %xmm0 movaps -28 * SIZE(Y), %xmm1 movaps %xmm0, -28 * SIZE(Y) movaps %xmm1, -28 * SIZE(X) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L15: testl $4, M jle .L16 movaps -32 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -32 * SIZE(X) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L16: testl $2, M jle .L17 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm1 movlps %xmm1, -32 * SIZE(X) addl $2 * SIZE, X movlps %xmm0, -32 * SIZE(Y) addl $2 * SIZE, Y ALIGN_3 .L17: testl $1, M jle .L19 movss -32 * SIZE(X), %xmm0 movss -32 * SIZE(Y), %xmm1 movss %xmm1, -32 * SIZE(X) movss %xmm0, -32 * SIZE(Y) ALIGN_3 .L19: popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_3 .L20: movaps -33 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movss %xmm1, -32 * SIZE(X) PSHUFD2($0x39, %xmm1, %xmm3) movlps %xmm3, -31 * SIZE(X) subl $3, M movl M, %eax sarl $5, %eax jle .L23 ALIGN_4 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -29 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -29 * SIZE(X) movaps -25 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -25 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -21 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -24 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -21 * SIZE(X) movaps -17 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -20 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -17 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -13 * SIZE(X), %xmm2 movaps -12 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -16 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -13 * SIZE(X) movaps -9 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 movss %xmm0, 
%xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -12 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -9 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -5 * SIZE(X), %xmm2 movaps -4 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -8 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -5 * SIZE(X) movaps -1 * SIZE(X), %xmm0 movaps 0 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -4 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -1 * SIZE(X) subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L23: testl $16, M jle .L24 movaps -29 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -29 * SIZE(X) movaps -25 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -25 * SIZE(X) movaps -21 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -24 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -21 * SIZE(X) movaps -17 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -20 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -17 * SIZE(X) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L24: testl $8, M jle .L25 movaps -29 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -29 * SIZE(X) movaps -25 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -25 * SIZE(X) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L25: testl $4, M jle .L26 movaps -29 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -29 * SIZE(X) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L26: PSHUFD2($0x39, %xmm0, %xmm2) PSHUFD1($0xff, %xmm0) movlps %xmm2, -32 * SIZE(Y) movss %xmm0, -30 * SIZE(Y) testl $2, M jle .L27 movsd -29 * SIZE(X), %xmm0 movsd -29 * SIZE(Y), %xmm1 movlps %xmm0, -29 * SIZE(Y) movlps %xmm1, -29 * SIZE(X) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L27: testl $1, M jle .L29 movss -29 * SIZE(X), %xmm0 movss -29 * SIZE(Y), %xmm1 movss %xmm0, -29 * SIZE(Y) movss %xmm1, -29 * SIZE(X) ALIGN_3 .L29: popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_3 .L30: testl $1 * SIZE, X jne .L40 movhps -32 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movlps %xmm1, -32 * SIZE(X) subl $2, M movl M, %eax sarl $5, %eax jle .L33 ALIGN_4 .L31: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -30 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -30 * SIZE(X) movaps -26 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -26 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -22 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps 
%xmm0, -24 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -22 * SIZE(X) movaps -18 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -20 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -18 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -14 * SIZE(X), %xmm2 movaps -12 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -14 * SIZE(X) movaps -10 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -12 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -10 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -6 * SIZE(X), %xmm2 movaps -4 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -8 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -6 * SIZE(X) movaps -2 * SIZE(X), %xmm0 movaps 0 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -4 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -2 * SIZE(X) subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L31 ALIGN_3 .L33: testl $16, M jle .L34 movaps -30 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -30 * SIZE(X) movaps -26 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -26 * SIZE(X) movaps -22 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -24 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -22 * SIZE(X) movaps -18 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -20 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -18 * SIZE(X) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L34: testl $8, M jle .L35 movaps -30 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -30 * SIZE(X) movaps -26 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -26 * SIZE(X) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L35: testl $4, M jle .L36 movaps -30 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -30 * SIZE(X) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L36: movhps %xmm0, -32 * SIZE(Y) testl $2, M jle .L37 movsd -30 * SIZE(X), %xmm0 movsd -30 * SIZE(Y), %xmm1 movlps %xmm0, -30 * SIZE(Y) movlps %xmm1, -30 * SIZE(X) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L37: testl $1, M jle .L39 movss -30 * SIZE(X), %xmm0 movss -30 * SIZE(Y), %xmm1 movss %xmm0, -30 * SIZE(Y) movss %xmm1, -30 * SIZE(X) ALIGN_3 .L39: popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_3 .L40: movaps -35 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movss %xmm1, -32 * SIZE(X) subl $3, M movl M, %eax sarl $5, %eax jle .L43 ALIGN_4 .L41: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -31 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -31 * SIZE(X) movaps -27 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -27 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -23 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 
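/* In this path X and Y do not share the same 16-byte phase, so each group splices  */
/* the vector carried over from the previous step with the one just loaded: movss   */
/* injects the new low element and shufps rotates the lanes before the store, which */
/* keeps every movaps load and store on an aligned address.                         */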
movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -24 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -23 * SIZE(X) movaps -19 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -20 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -19 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -15 * SIZE(X), %xmm2 movaps -12 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -15 * SIZE(X) movaps -11 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -12 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -11 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -7 * SIZE(X), %xmm2 movaps -4 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -8 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -7 * SIZE(X) movaps -3 * SIZE(X), %xmm0 movaps 0 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -4 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -3 * SIZE(X) subl $-32 * SIZE, X subl $-32 * SIZE, Y decl %eax jg .L41 ALIGN_3 .L43: testl $16, M jle .L44 movaps -31 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -31 * SIZE(X) movaps -27 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -27 * SIZE(X) movaps -23 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -24 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -23 * SIZE(X) movaps -19 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -20 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -19 * SIZE(X) addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 .L44: testl $8, M jle .L45 movaps -31 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -31 * SIZE(X) movaps -27 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -27 * SIZE(X) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L45: testl $4, M jle .L46 movaps -31 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -31 * SIZE(X) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L46: movsd -31 * SIZE(X), %xmm2 PSHUFD2($0x39, %xmm1, %xmm1) movlps %xmm1, -31 * SIZE(X) PSHUFD1($0xff, %xmm0) movss %xmm0, -32 * SIZE(Y) movlps %xmm2, -31 * SIZE(Y) addl $3 * SIZE, X addl $3 * SIZE, Y testl $2, M jle .L47 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm1 movlps %xmm0, -32 * SIZE(Y) movlps %xmm1, -32 * SIZE(X) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L47: testl $1, M jle .L49 movss -32 * SIZE(X), %xmm0 movss -32 * SIZE(Y), %xmm1 movss %xmm0, -32 * SIZE(Y) movss %xmm1, -32 * 
SIZE(X) ALIGN_3 .L49: popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_3 .L50: movl M, %eax sarl $2, %eax jle .L55 ALIGN_3 .L51: movsd (X), %xmm0 movsd (Y), %xmm1 movlps %xmm1, (X) addl INCX, X movlps %xmm0, (Y) addl INCY, Y movsd (X), %xmm0 movsd (Y), %xmm1 movlps %xmm1, (X) addl INCX, X movlps %xmm0, (Y) addl INCY, Y movsd (X), %xmm0 movsd (Y), %xmm1 movlps %xmm1, (X) addl INCX, X movlps %xmm0, (Y) addl INCY, Y movsd (X), %xmm0 movsd (Y), %xmm1 movlps %xmm1, (X) addl INCX, X movlps %xmm0, (Y) addl INCY, Y decl %eax jg .L51 ALIGN_3 .L55: movl M, %eax andl $3, %eax jle .L57 ALIGN_3 .L56: movsd (X), %xmm0 movsd (Y), %xmm1 movlps %xmm1, (X) addl INCX, X movlps %xmm0, (Y) addl INCY, Y decl %eax jg .L56 ALIGN_3 .L57: popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/zswap_sse2.S000066400000000000000000000432351313527062700173040ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 32 + STACK + ARGS(%esp) #define STACK_INCX 36 + STACK + ARGS(%esp) #define STACK_Y 40 + STACK + ARGS(%esp) #define STACK_INCY 44 + STACK + ARGS(%esp) #define M %edx #define X %esi #define Y %edi #define INCX %ebx #define INCY %ecx #include "l1param.h" PROLOGUE PROFCODE pushl %ebp pushl %edi pushl %esi pushl %ebx movl STACK_M, M movl STACK_X, X movl STACK_Y, Y movl STACK_INCX, INCX movl STACK_INCY, INCY sall $ZBASE_SHIFT, INCX sall $ZBASE_SHIFT, INCY testl M, M jle .L19 cmpl $2 * SIZE, INCX jne .L50 cmpl $2 * SIZE, INCY jne .L50 subl $-16 * SIZE, X subl $-16 * SIZE, Y testl $SIZE, Y jne .L30 testl $SIZE, X jne .L20 movl M, %eax sarl $3, %eax jle .L13 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -14 * SIZE(X), %xmm0 movaps -14 * SIZE(Y), %xmm1 movaps %xmm0, -14 * SIZE(Y) movaps %xmm1, -14 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -12 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 movaps %xmm0, -12 * SIZE(Y) movaps %xmm1, -12 * SIZE(X) movaps -10 * SIZE(X), %xmm0 movaps -10 * SIZE(Y), %xmm1 movaps %xmm0, -10 * SIZE(Y) movaps %xmm1, -10 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -8 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 movaps %xmm0, -8 * SIZE(Y) movaps %xmm1, -8 * SIZE(X) movaps -6 * SIZE(X), %xmm0 movaps -6 * SIZE(Y), %xmm1 movaps %xmm0, -6 * SIZE(Y) movaps %xmm1, -6 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -4 * SIZE(X), %xmm0 movaps -4 * SIZE(Y), %xmm1 movaps %xmm0, -4 * SIZE(Y) movaps %xmm1, -4 * SIZE(X) movaps -2 * SIZE(X), %xmm0 movaps -2 * SIZE(Y), %xmm1 movaps %xmm0, -2 * SIZE(Y) movaps %xmm1, -2 * SIZE(X) subl $-16 * SIZE, Y subl $-16 * SIZE, X decl %eax jg .L11 ALIGN_3 .L13: testl $4, M jle .L14 movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -14 * SIZE(X), %xmm0 movaps -14 * SIZE(Y), %xmm1 movaps %xmm0, -14 * SIZE(Y) movaps %xmm1, -14 * SIZE(X) movaps -12 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 movaps %xmm0, -12 * SIZE(Y) movaps %xmm1, -12 * SIZE(X) movaps -10 * SIZE(X), %xmm0 movaps -10 * SIZE(Y), %xmm1 movaps %xmm0, -10 * SIZE(Y) movaps %xmm1, -10 * SIZE(X) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L14: testl $2, M jle .L15 movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -14 * SIZE(X), %xmm0 movaps -14 * SIZE(Y), %xmm1 movaps %xmm0, -14 * SIZE(Y) movaps %xmm1, -14 * SIZE(X) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L15: testl $1, M jle .L19 movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L19: xorl %eax,%eax popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_3 .L20: movhps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movlps %xmm1, -16 * SIZE(X) decl M jle .L29 movl M, %eax sarl $3, %eax jle .L23 ALIGN_4 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -15 * SIZE(X), %xmm2 movaps -14 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(X) movaps -13 * 
SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -13 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -11 * SIZE(X), %xmm2 movaps -10 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -12 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -11 * SIZE(X) movaps -9 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -10 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -9 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -7 * SIZE(X), %xmm2 movaps -6 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -8 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -7 * SIZE(X) movaps -5 * SIZE(X), %xmm0 movaps -4 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -6 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -5 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -3 * SIZE(X), %xmm2 movaps -2 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -4 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -3 * SIZE(X) movaps -1 * SIZE(X), %xmm0 movaps 0 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -2 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -1 * SIZE(X) subl $-16 * SIZE, X subl $-16 * SIZE, Y decl %eax jg .L21 ALIGN_3 .L23: testl $4, M jle .L24 movaps -15 * SIZE(X), %xmm2 movaps -14 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(X) movaps -13 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -13 * SIZE(X) movaps -11 * SIZE(X), %xmm2 movaps -10 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -12 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -11 * SIZE(X) movaps -9 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -10 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -9 * SIZE(X) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L24: testl $2, M jle .L25 movaps -15 * SIZE(X), %xmm2 movaps -14 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(X) movaps -13 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -13 * SIZE(X) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L25: testl $1, M jle .L29 movaps -15 * SIZE(X), %xmm2 movaps -14 * SIZE(Y), %xmm3 SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(X) SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L29: movaps -15 * SIZE(X), %xmm2 movhps %xmm1, -15 * SIZE(X) SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_3 .L30: testl $SIZE, X jne .L40 movhps -16 * SIZE(Y), %xmm0 movaps -16 * SIZE(X), %xmm1 movlps %xmm1, -16 * SIZE(Y) decl M jle .L39 movl M, %eax sarl $3, %eax jle .L33 ALIGN_4 .L31: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -15 * SIZE(Y), %xmm2 movaps -14 * SIZE(X), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(X) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(Y) movaps -13 * SIZE(Y), %xmm0 movaps -12 * SIZE(X), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(X) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -13 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -11 * SIZE(Y), %xmm2 movaps -10 * SIZE(X), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps 
%xmm0, -12 * SIZE(X) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -11 * SIZE(Y) movaps -9 * SIZE(Y), %xmm0 movaps -8 * SIZE(X), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -10 * SIZE(X) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -9 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -7 * SIZE(Y), %xmm2 movaps -6 * SIZE(X), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -8 * SIZE(X) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -7 * SIZE(Y) movaps -5 * SIZE(Y), %xmm0 movaps -4 * SIZE(X), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -6 * SIZE(X) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -5 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -3 * SIZE(Y), %xmm2 movaps -2 * SIZE(X), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -4 * SIZE(X) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -3 * SIZE(Y) movaps -1 * SIZE(Y), %xmm0 movaps 0 * SIZE(X), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -2 * SIZE(X) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -1 * SIZE(Y) subl $-16 * SIZE, X subl $-16 * SIZE, Y decl %eax jg .L31 ALIGN_3 .L33: testl $4, M jle .L34 movaps -15 * SIZE(Y), %xmm2 movaps -14 * SIZE(X), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(X) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(Y) movaps -13 * SIZE(Y), %xmm0 movaps -12 * SIZE(X), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(X) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -13 * SIZE(Y) movaps -11 * SIZE(Y), %xmm2 movaps -10 * SIZE(X), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -12 * SIZE(X) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -11 * SIZE(Y) movaps -9 * SIZE(Y), %xmm0 movaps -8 * SIZE(X), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -10 * SIZE(X) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -9 * SIZE(Y) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L34: testl $2, M jle .L35 movaps -15 * SIZE(Y), %xmm2 movaps -14 * SIZE(X), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(X) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(Y) movaps -13 * SIZE(Y), %xmm0 movaps -12 * SIZE(X), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(X) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -13 * SIZE(Y) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L35: testl $1, M jle .L39 movaps -15 * SIZE(Y), %xmm2 movaps -14 * SIZE(X), %xmm3 SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(Y) SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(X) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L39: movaps -15 * SIZE(Y), %xmm2 movhps %xmm1, -15 * SIZE(Y) SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(X) popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_3 .L40: movsd -16 * SIZE(X), %xmm0 movsd -16 * SIZE(Y), %xmm1 movlps %xmm0, -16 * SIZE(Y) movlps %xmm1, -16 * SIZE(X) addl $SIZE, X addl $SIZE, Y decl M jle .L49 movl M, %eax sarl $3, %eax jle .L43 ALIGN_3 .L41: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -14 * SIZE(X), %xmm0 movaps -14 * SIZE(Y), %xmm1 movaps %xmm0, -14 * SIZE(Y) movaps %xmm1, -14 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -12 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 movaps %xmm0, -12 * SIZE(Y) movaps %xmm1, -12 * SIZE(X) movaps -10 * SIZE(X), %xmm0 movaps -10 * SIZE(Y), %xmm1 movaps %xmm0, -10 * SIZE(Y) movaps %xmm1, -10 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -8 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 movaps %xmm0, -8 * SIZE(Y) movaps %xmm1, -8 * SIZE(X) movaps -6 * SIZE(X), %xmm0 movaps -6 * 
SIZE(Y), %xmm1 movaps %xmm0, -6 * SIZE(Y) movaps %xmm1, -6 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -4 * SIZE(X), %xmm0 movaps -4 * SIZE(Y), %xmm1 movaps %xmm0, -4 * SIZE(Y) movaps %xmm1, -4 * SIZE(X) movaps -2 * SIZE(X), %xmm0 movaps -2 * SIZE(Y), %xmm1 movaps %xmm0, -2 * SIZE(Y) movaps %xmm1, -2 * SIZE(X) subl $-16 * SIZE, Y subl $-16 * SIZE, X decl %eax jg .L41 ALIGN_3 .L43: testl $4, M jle .L44 movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -14 * SIZE(X), %xmm0 movaps -14 * SIZE(Y), %xmm1 movaps %xmm0, -14 * SIZE(Y) movaps %xmm1, -14 * SIZE(X) movaps -12 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 movaps %xmm0, -12 * SIZE(Y) movaps %xmm1, -12 * SIZE(X) movaps -10 * SIZE(X), %xmm0 movaps -10 * SIZE(Y), %xmm1 movaps %xmm0, -10 * SIZE(Y) movaps %xmm1, -10 * SIZE(X) addl $8 * SIZE, X addl $8 * SIZE, Y ALIGN_3 .L44: testl $2, M jle .L45 movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -14 * SIZE(X), %xmm0 movaps -14 * SIZE(Y), %xmm1 movaps %xmm0, -14 * SIZE(Y) movaps %xmm1, -14 * SIZE(X) addl $4 * SIZE, X addl $4 * SIZE, Y ALIGN_3 .L45: testl $1, M jle .L49 movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) addl $2 * SIZE, X addl $2 * SIZE, Y ALIGN_3 .L49: movsd -16 * SIZE(X), %xmm0 movsd -16 * SIZE(Y), %xmm1 movlps %xmm0, -16 * SIZE(Y) movlps %xmm1, -16 * SIZE(X) popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_3 .L50: testl $SIZE, X jne .L60 testl $SIZE, Y jne .L60 movl M, %eax sarl $2, %eax jle .L55 ALIGN_3 .L51: movaps (X), %xmm0 movaps (Y), %xmm1 movaps %xmm1, (X) addl INCX, X movaps %xmm0, (Y) addl INCY, Y movaps (X), %xmm0 movaps (Y), %xmm1 movaps %xmm1, (X) addl INCX, X movaps %xmm0, (Y) addl INCY, Y movaps (X), %xmm0 movaps (Y), %xmm1 movaps %xmm1, (X) addl INCX, X movaps %xmm0, (Y) addl INCY, Y movaps (X), %xmm0 movaps (Y), %xmm1 movaps %xmm1, (X) addl INCX, X movaps %xmm0, (Y) addl INCY, Y decl %eax jg .L51 ALIGN_3 .L55: movl M, %eax andl $3, %eax jle .L57 ALIGN_3 .L56: movaps (X), %xmm0 movaps (Y), %xmm1 movaps %xmm1, (X) addl INCX, X movaps %xmm0, (Y) addl INCY, Y decl %eax jg .L56 ALIGN_3 .L57: popl %ebx popl %esi popl %edi popl %ebp ret ALIGN_3 .L60: movl M, %eax sarl $2, %eax jle .L65 ALIGN_3 .L61: movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 movsd 0 * SIZE(Y), %xmm1 movhps 1 * SIZE(Y), %xmm1 movlps %xmm1, 0 * SIZE(X) movhps %xmm1, 1 * SIZE(X) addl INCX, X movlps %xmm0, 0 * SIZE(Y) movhps %xmm0, 1 * SIZE(Y) addl INCY, Y movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 movsd 0 * SIZE(Y), %xmm1 movhps 1 * SIZE(Y), %xmm1 movlps %xmm1, 0 * SIZE(X) movhps %xmm1, 1 * SIZE(X) addl INCX, X movlps %xmm0, 0 * SIZE(Y) movhps %xmm0, 1 * SIZE(Y) addl INCY, Y movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 movsd 0 * SIZE(Y), %xmm1 movhps 1 * SIZE(Y), %xmm1 movlps %xmm1, 0 * SIZE(X) movhps %xmm1, 1 * SIZE(X) addl INCX, X movlps %xmm0, 0 * SIZE(Y) movhps %xmm0, 1 * SIZE(Y) addl INCY, Y movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 movsd 0 * SIZE(Y), %xmm1 movhps 1 * SIZE(Y), %xmm1 movlps %xmm1, 0 * SIZE(X) movhps %xmm1, 1 * SIZE(X) addl INCX, X movlps %xmm0, 0 * SIZE(Y) movhps %xmm0, 1 * SIZE(Y) addl INCY, Y decl %eax jg .L61 ALIGN_3 .L65: movl M, %eax andl $3, %eax jle .L67 ALIGN_3 .L66: movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 movsd 0 * SIZE(Y), %xmm1 movhps 1 * SIZE(Y), %xmm1 movlps %xmm1, 0 * SIZE(X) movhps %xmm1, 1 
* SIZE(X) addl INCX, X movlps %xmm0, 0 * SIZE(Y) movhps %xmm0, 1 * SIZE(Y) addl INCY, Y decl %eax jg .L66 ALIGN_3 .L67: popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/ztrsm_kernel_LN_2x1_core2.S000066400000000000000000000460561313527062700221040ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCHSIZE (8 * 4) #if !defined(HAVE_SSE2) || !defined(HAVE_MMX) #error You have to check your configuration. 
#endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA_R 16 + STACK + ARGS(%esi) #define STACK_ALPHA_I 24 + STACK + ARGS(%esi) #define STACK_A 32 + STACK + ARGS(%esi) #define STACK_B 36 + STACK + ARGS(%esi) #define STACK_C 40 + STACK + ARGS(%esi) #define STACK_LDC 44 + STACK + ARGS(%esi) #define STACK_OFFT 48 + STACK + ARGS(%esi) #define POSINV 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define AORIG 56(%esp) #define BORIG 60(%esp) #define BUFFER 128(%esp) #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #define B %edi #define LDC %ebp #define AA %edx #define BB %ecx #define CO1 %esi #define ADD1 addpd #define ADD2 addpd #ifndef CONJ #define NN #else #if defined(LN) || defined(LT) #define CN #else #define NC #endif #endif PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp # align stack addl $STACK_OFFSET, %esp STACK_TOUCHING movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 movd STACK_A, %mm2 movl STACK_B, B movd STACK_C, %mm3 movl STACK_LDC, LDC movd STACK_OFFT, %mm4 pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 # Generate mask pxor %xmm2, %xmm2 movsd %xmm2, 0 + POSINV movsd %xmm7, 8 + POSINV movd %mm1, K movl %eax, N movd %mm0, M movd %mm2, A movd %mm3, C movl %esi, OLD_STACK movd %mm4, OFFSET movd %mm4, KK sall $ZBASE_SHIFT, LDC subl $-16 * SIZE, A subl $-16 * SIZE, B #ifdef LN movl M, %eax sall $ZBASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $ZBASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax movl %eax, J # j = n testl %eax, %eax jle .L999 ALIGN_2 .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal 16 * SIZE + BUFFER, BB #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 2), BB #endif #if defined(LT) movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L03 ALIGN_2 .L02: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -12 * SIZE(B), %xmm4 movddup -11 * SIZE(B), %xmm5 movddup -10 * SIZE(B), %xmm6 movddup -9 * SIZE(B), %xmm7 movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) movapd %xmm2, -12 * SIZE(BB) movapd %xmm3, -10 * SIZE(BB) movapd %xmm4, -8 * SIZE(BB) movapd %xmm5, -6 * SIZE(BB) movapd %xmm6, -4 * SIZE(BB) movapd %xmm7, -2 * SIZE(BB) addl $ 8 * SIZE, B subl $-16 * SIZE, BB decl %eax jne .L02 ALIGN_2 .L03: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L05 ALIGN_2 .L04: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) addl $ 2 * SIZE, B addl $ 4 * SIZE, BB decl %eax jne .L04 ALIGN_4 .L05: #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT 
addl LDC, C #endif movl M, %ebx testl $1, %ebx je .L50 #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #endif leal 16 * SIZE + BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movapd -8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax jle .L52 .L51: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BB), %xmm0 ADD1 %xmm1, %xmm4 movapd -12 * SIZE(BB), %xmm1 ADD2 %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm1 mulpd -10 * SIZE(BB), %xmm0 ADD1 %xmm1, %xmm6 movapd 0 * SIZE(BB), %xmm1 ADD2 %xmm0, %xmm7 movapd -12 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd -6 * SIZE(BB), %xmm0 ADD1 %xmm3, %xmm4 movapd -4 * SIZE(BB), %xmm3 ADD2 %xmm0, %xmm5 movapd -10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd -2 * SIZE(BB), %xmm0 ADD1 %xmm3, %xmm6 movapd 8 * SIZE(BB), %xmm3 ADD2 %xmm0, %xmm7 movapd 0 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm1 mulpd 2 * SIZE(BB), %xmm2 ADD1 %xmm1, %xmm4 movapd 4 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 movapd -6 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm1 mulpd 6 * SIZE(BB), %xmm2 ADD1 %xmm1, %xmm6 movapd 16 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm7 movapd -4 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm3 mulpd 10 * SIZE(BB), %xmm2 ADD1 %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm5 movapd -2 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm3 mulpd 14 * SIZE(BB), %xmm2 ADD1 %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 movapd 8 * SIZE(AA), %xmm2 subl $-16 * SIZE, AA addl $ 32 * SIZE, BB decl %eax # l-- jg .L51 ALIGN_2 .L52: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # l = (k & 3) jle .L54 ALIGN_2 .L53: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BB), %xmm0 ADD1 %xmm1, %xmm4 movapd -12 * SIZE(BB), %xmm1 ADD2 %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax # l-- jg .L53 .L54: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB sall $ZBASE_SHIFT, %eax addl %eax, AA addl %eax, B leal (BB, %eax, 2), BB #endif movapd POSINV, %xmm1 SHUFPD_1 %xmm5, %xmm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm1, %xmm5 #else xorpd %xmm1, %xmm4 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm5, %xmm4 #else addpd %xmm5, %xmm4 #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm5 subpd %xmm4, %xmm5 #else movapd -16 * SIZE(AA), %xmm5 subpd %xmm4, %xmm5 #endif #ifndef CONJ SHUFPD_1 %xmm1, %xmm1 #endif #ifdef LN movddup -16 * SIZE(AA), %xmm2 movddup -15 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LT movddup -16 * SIZE(AA), %xmm2 movddup -15 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef RN movddup -16 * SIZE(B), %xmm2 movddup -15 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 
addpd %xmm4, %xmm5 #endif #ifdef RT movddup -16 * SIZE(B), %xmm2 movddup -15 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LN subl $2 * SIZE, CO1 #endif movsd %xmm5, 0 * SIZE(CO1) movhpd %xmm5, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm5, -16 * SIZE(B) movddup %xmm5, %xmm4 unpckhpd %xmm5, %xmm5 movapd %xmm4, -16 * SIZE(BB) movapd %xmm5, -14 * SIZE(BB) #else movapd %xmm5, -16 * SIZE(AA) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L50: movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L99 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #endif leal 16 * SIZE + BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm3 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifdef LN prefetchnta -4 * SIZE(CO1) #else prefetchnta 4 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L12: movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 ADD2 %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd -12 * SIZE(AA), %xmm0 ADD1 %xmm2, %xmm6 ADD2 %xmm1, %xmm7 movapd -12 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm4 movapd -10 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 ADD2 %xmm0, %xmm5 movapd -10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd 0 * SIZE(AA), %xmm0 ADD1 %xmm2, %xmm6 ADD2 %xmm1, %xmm7 movapd -8 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 ADD1 %xmm1, %xmm4 movapd -6 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 ADD2 %xmm3, %xmm5 movapd -6 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 mulpd %xmm3, %xmm1 movapd -4 * SIZE(AA), %xmm3 ADD1 %xmm2, %xmm6 ADD2 %xmm1, %xmm7 movapd -4 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 ADD1 %xmm1, %xmm4 movapd -2 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 ADD2 %xmm3, %xmm5 movapd -2 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 mulpd %xmm3, %xmm1 movapd 8 * SIZE(AA), %xmm3 ADD1 %xmm2, %xmm6 ADD2 %xmm1, %xmm7 movapd 0 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm4 movapd 2 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 ADD2 %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd 4 * SIZE(AA), %xmm0 ADD1 %xmm2, %xmm6 ADD2 %xmm1, %xmm7 movapd 4 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm4 movapd 6 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 ADD2 %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd 16 * SIZE(AA), %xmm0 ADD1 %xmm2, %xmm6 ADD2 %xmm1, %xmm7 movapd 8 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 ADD1 %xmm1, %xmm4 movapd 10 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 ADD2 %xmm3, %xmm5 movapd 10 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 mulpd %xmm3, %xmm1 ADD1 %xmm2, %xmm6 movapd 12 * SIZE(AA), %xmm3 ADD2 %xmm1, %xmm7 movapd 12 * SIZE(BB), %xmm1 movapd 
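/* At this point xmm5/xmm7 hold the solved 2x1 complex block. The pshufd $0x4e,      */
/* xorpd-with-POSINV and mulpd/addpd sequences above form complex products with the  */
/* diagonal entries (which the packing routines are expected to supply already       */
/* inverted, so no division is performed here); the sign mask handles conjugation.   */
/* The block is written back to C below and also refreshed in the packed B (LN/LT)   */
/* or A (RN/RT) buffer for the remaining updates.                                    */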
%xmm1, %xmm2 mulpd %xmm3, %xmm1 ADD1 %xmm1, %xmm4 movapd 14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 ADD2 %xmm3, %xmm5 movapd 14 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 mulpd %xmm3, %xmm1 subl $-32 * SIZE, BB movapd 24 * SIZE(AA), %xmm3 subl $-32 * SIZE, AA ADD1 %xmm2, %xmm6 ADD2 %xmm1, %xmm7 movapd -16 * SIZE(BB), %xmm1 decl %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 .L16: movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 movapd %xmm1, %xmm3 mulpd %xmm0, %xmm1 movapd -14 * SIZE(AA), %xmm0 ADD2 %xmm1, %xmm5 movapd -12 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm2, %xmm6 mulpd %xmm0, %xmm3 movapd -12 * SIZE(AA), %xmm0 ADD2 %xmm3, %xmm7 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L14: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA addl %eax, B leal (BB, %eax, 2), BB #endif movapd POSINV, %xmm1 SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm1, %xmm5 xorpd %xmm1, %xmm7 #else xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 #else addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm5 movapd -14 * SIZE(B), %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd -16 * SIZE(AA), %xmm5 movapd -14 * SIZE(AA), %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #endif #ifndef CONJ SHUFPD_1 %xmm1, %xmm1 #endif #ifdef LN movddup -10 * SIZE(AA), %xmm2 movddup -9 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm7 movddup -12 * SIZE(AA), %xmm2 movddup -11 * SIZE(AA), %xmm3 movapd %xmm7, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm6 subpd %xmm4, %xmm5 subpd %xmm6, %xmm5 movddup -16 * SIZE(AA), %xmm2 movddup -15 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LT movddup -16 * SIZE(AA), %xmm2 movddup -15 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 movddup -14 * SIZE(AA), %xmm2 movddup -13 * SIZE(AA), %xmm3 movapd %xmm5, %xmm4 pshufd $0x4e, %xmm5, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm6 subpd %xmm4, %xmm7 subpd %xmm6, %xmm7 movddup -10 * SIZE(AA), %xmm2 movddup -9 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm7 #endif #ifdef RN movddup -16 * SIZE(B), %xmm2 movddup -15 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm4, %xmm5 addpd %xmm6, %xmm7 #endif #ifdef RT movddup -16 * SIZE(B), %xmm2 movddup -15 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm4, %xmm5 addpd %xmm6, %xmm7 #endif #ifdef LN subl $4 * SIZE, CO1 #endif 
movsd %xmm5, 0 * SIZE(CO1) movhpd %xmm5, 1 * SIZE(CO1) movsd %xmm7, 2 * SIZE(CO1) movhpd %xmm7, 3 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm5, -16 * SIZE(B) movapd %xmm7, -14 * SIZE(B) movddup %xmm5, %xmm4 unpckhpd %xmm5, %xmm5 movddup %xmm7, %xmm6 unpckhpd %xmm7, %xmm7 movapd %xmm4, -16 * SIZE(BB) movapd %xmm5, -14 * SIZE(BB) movapd %xmm6, -12 * SIZE(BB) movapd %xmm7, -10 * SIZE(BB) #else movapd %xmm5, -16 * SIZE(AA) movapd %xmm7, -14 * SIZE(AA) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L10 .L99: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif decl J # j -- jg .L01 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/ztrsm_kernel_LN_2x1_sse2.S000066400000000000000000000544661313527062700217520ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCHSIZE (8 * 4) #if !defined(HAVE_SSE2) || !defined(HAVE_MMX) #error You have to check your configuration. 
#endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA_R 16 + STACK + ARGS(%esi) #define STACK_ALPHA_I 24 + STACK + ARGS(%esi) #define STACK_A 32 + STACK + ARGS(%esi) #define STACK_B 36 + STACK + ARGS(%esi) #define STACK_C 40 + STACK + ARGS(%esi) #define STACK_LDC 44 + STACK + ARGS(%esi) #define STACK_OFFT 48 + STACK + ARGS(%esi) #define POSINV 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define AORIG 56(%esp) #define BORIG 60(%esp) #define BUFFER 128(%esp) #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #define B %edi #define LDC %ebp #define AA %edx #define BB %ecx #define CO1 %esi #define KERNEL1(address) \ movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm0, %xmm2; \ mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL3(address) \ movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm1, %xmm3; \ mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL4(address) \ mulpd %xmm1, %xmm3; \ mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL5(address) \ movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm0, %xmm2; \ mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL6(address) \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 22 * SIZE + 
(address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL7(address) \ movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm1, %xmm3; \ mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulpd %xmm1, %xmm3; \ mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 #ifndef CONJ #define NN #else #if defined(LN) || defined(LT) #define CN #else #define NC #endif #endif PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp # align stack addl $STACK_OFFSET, %esp STACK_TOUCHING movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 movd STACK_A, %mm2 movl STACK_B, B movd STACK_C, %mm3 movl STACK_LDC, LDC movd STACK_OFFT, %mm4 pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 # Generate mask pxor %xmm2, %xmm2 movsd %xmm2, 0 + POSINV movsd %xmm7, 8 + POSINV movd %mm1, K movl %eax, N movd %mm0, M movd %mm2, A movd %mm3, C movl %esi, OLD_STACK movd %mm4, OFFSET movd %mm4, KK sall $ZBASE_SHIFT, LDC #ifdef LN movl M, %eax sall $ZBASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $ZBASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax movl %eax, J # j = n testl %eax, %eax jle .L999 ALIGN_2 .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 2), BB #endif #if defined(LT) movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L03 ALIGN_2 .L02: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 unpcklpd %xmm2, %xmm2 unpcklpd %xmm3, %xmm3 unpcklpd %xmm4, %xmm4 unpcklpd %xmm5, %xmm5 unpcklpd %xmm6, %xmm6 unpcklpd %xmm7, %xmm7 movapd %xmm0, 0 * SIZE(BB) movapd %xmm1, 2 * SIZE(BB) movapd %xmm2, 4 * SIZE(BB) movapd %xmm3, 6 * SIZE(BB) movapd %xmm4, 8 * SIZE(BB) movapd %xmm5, 10 * SIZE(BB) movapd %xmm6, 12 * SIZE(BB) movapd %xmm7, 14 * SIZE(BB) prefetcht0 104 * SIZE(B) addl $ 8 * SIZE, B addl $16 * SIZE, BB decl %eax jne .L02 ALIGN_2 .L03: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, 
%eax #endif andl $3, %eax BRANCH jle .L05 ALIGN_2 .L04: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 movapd %xmm0, 0 * SIZE(BB) movapd %xmm1, 2 * SIZE(BB) addl $ 2 * SIZE, B addl $ 4 * SIZE, BB decl %eax jne .L04 ALIGN_4 .L05: #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif movl M, %ebx testl $1, %ebx je .L50 #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, %ecx #if defined(LN) || defined(RT) movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB #endif movapd 0 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax # l = (k >> 2) jle .L52 .L51: mulpd %xmm0, %xmm1 movapd 2 * SIZE(BB), %xmm3 addpd %xmm1, %xmm4 movapd 16 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm3 movapd 2 * SIZE(AA), %xmm0 addpd %xmm3, %xmm5 movapd 4 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 addpd %xmm0, %xmm5 movapd 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 10 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 addpd %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 movapd 12 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd 24 * SIZE(BB), %xmm2 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm0, %xmm5 movapd 8 * SIZE(AA), %xmm0 addl $ 8 * SIZE, AA # aoffset += 2 addl $16 * SIZE, BB # boffset1 += 4 decl %eax # l-- jg .L51 ALIGN_2 .L52: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax # l = (k & 3) jle .L54 ALIGN_2 .L53: movapd 0 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd 2 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm5 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA # aoffset += 2 addl $4 * SIZE, BB # boffset1 += 4 decl %eax # l-- jg .L53 .L54: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax addl %eax, AA addl %eax, B leal (BB, %eax, 2), BB #endif movapd POSINV, %xmm1 SHUFPD_1 %xmm5, %xmm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm1, %xmm5 #else xorpd %xmm1, %xmm4 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm5, %xmm4 #else addpd %xmm5, %xmm4 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm5 subpd %xmm4, %xmm5 #else movapd 0 * SIZE(AA), %xmm5 subpd %xmm4, %xmm5 #endif #ifndef CONJ SHUFPD_1 %xmm1, %xmm1 #endif #ifdef LN movsd 0 * SIZE(AA), %xmm2 movhpd 0 * SIZE(AA), %xmm2 movsd 1 * SIZE(AA), %xmm3 movhpd 1 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm2 movhpd 0 * SIZE(AA), %xmm2 movsd 1 * SIZE(AA), %xmm3 movhpd 1 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef RN movsd 0 * SIZE(B), %xmm2 movhpd 0 * SIZE(B), %xmm2 movsd 1 * SIZE(B), %xmm3 movhpd 1 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd 
%xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef RT movsd 0 * SIZE(B), %xmm2 movhpd 0 * SIZE(B), %xmm2 movsd 1 * SIZE(B), %xmm3 movhpd 1 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LN subl $2 * SIZE, CO1 #endif movsd %xmm5, 0 * SIZE(CO1) movhpd %xmm5, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm5, 0 * SIZE(B) movsd %xmm5, 0 * SIZE(BB) movsd %xmm5, 1 * SIZE(BB) movhpd %xmm5, 2 * SIZE(BB) movhpd %xmm5, 3 * SIZE(BB) #else movapd %xmm5, 0 * SIZE(AA) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L50: movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L99 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB #endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #ifdef LN prefetchnta -4 * SIZE(CO1) #else prefetchnta 4 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $-8, %eax NOBRANCH je .L12 sall $3, %eax .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) cmpl $64 * 1, %eax NOBRANCH jle .L11 KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) cmpl $64 * 2, %eax NOBRANCH jle .L11 KERNEL1(32 * 2) KERNEL2(32 * 2) KERNEL3(32 * 2) KERNEL4(32 * 2) KERNEL5(32 * 2) KERNEL6(32 * 2) KERNEL7(32 * 2) KERNEL8(32 * 2) cmpl $64 * 3, %eax NOBRANCH jle .L11 KERNEL1(32 * 3) KERNEL2(32 * 3) KERNEL3(32 * 3) KERNEL4(32 * 3) KERNEL5(32 * 3) KERNEL6(32 * 3) KERNEL7(32 * 3) KERNEL8(32 * 3) cmpl $64 * 4, %eax NOBRANCH jle .L11 KERNEL1(32 * 4) KERNEL2(32 * 4) KERNEL3(32 * 4) KERNEL4(32 * 4) KERNEL5(32 * 4) KERNEL6(32 * 4) KERNEL7(32 * 4) KERNEL8(32 * 4) cmpl $64 * 5, %eax NOBRANCH jle .L11 KERNEL1(32 * 5) KERNEL2(32 * 5) KERNEL3(32 * 5) KERNEL4(32 * 5) KERNEL5(32 * 5) KERNEL6(32 * 5) KERNEL7(32 * 5) KERNEL8(32 * 5) cmpl $64 * 6, %eax NOBRANCH jle .L11 KERNEL1(32 * 6) KERNEL2(32 * 6) KERNEL3(32 * 6) KERNEL4(32 * 6) KERNEL5(32 * 6) KERNEL6(32 * 6) KERNEL7(32 * 6) KERNEL8(32 * 6) cmpl $64 * 7, %eax NOBRANCH jle .L11 KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) addl $64 * 4 * SIZE, AA addl $64 * 4 * SIZE, BB subl $64 * 8, %eax BRANCH jg .L1X .L11: leal (BB, %eax, 4), BB leal (AA, %eax, 4), AA .L12: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 .L13: movapd 2 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd 0 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm1 movapd 2 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movapd 2 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movapd 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm1 movapd 4 * SIZE(AA), %xmm0 addpd %xmm1, 
%xmm7 addl $4 * SIZE, AA # aoffset += 8 addl $4 * SIZE, BB # boffset1 += 8 subl $1, %eax jg .L13 .L14: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA addl %eax, B leal (BB, %eax, 2), BB #endif movapd POSINV, %xmm1 SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm1, %xmm5 xorpd %xmm1, %xmm7 #else xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 #else addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm5 movapd 2 * SIZE(B), %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd 0 * SIZE(AA), %xmm5 movapd 2 * SIZE(AA), %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #endif #ifndef CONJ SHUFPD_1 %xmm1, %xmm1 #endif #ifdef LN movsd 6 * SIZE(AA), %xmm2 movhpd 6 * SIZE(AA), %xmm2 movsd 7 * SIZE(AA), %xmm3 movhpd 7 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm7 movsd 4 * SIZE(AA), %xmm2 movhpd 4 * SIZE(AA), %xmm2 movsd 5 * SIZE(AA), %xmm3 movhpd 5 * SIZE(AA), %xmm3 movapd %xmm7, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm6 subpd %xmm4, %xmm5 subpd %xmm6, %xmm5 movsd 0 * SIZE(AA), %xmm2 movhpd 0 * SIZE(AA), %xmm2 movsd 1 * SIZE(AA), %xmm3 movhpd 1 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm2 movhpd 0 * SIZE(AA), %xmm2 movsd 1 * SIZE(AA), %xmm3 movhpd 1 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 movsd 2 * SIZE(AA), %xmm2 movhpd 2 * SIZE(AA), %xmm2 movsd 3 * SIZE(AA), %xmm3 movhpd 3 * SIZE(AA), %xmm3 movapd %xmm5, %xmm4 pshufd $0x4e, %xmm5, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm6 subpd %xmm4, %xmm7 subpd %xmm6, %xmm7 movsd 6 * SIZE(AA), %xmm2 movhpd 6 * SIZE(AA), %xmm2 movsd 7 * SIZE(AA), %xmm3 movhpd 7 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm7 #endif #ifdef RN movsd 0 * SIZE(B), %xmm2 movhpd 0 * SIZE(B), %xmm2 movsd 1 * SIZE(B), %xmm3 movhpd 1 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm4, %xmm5 addpd %xmm6, %xmm7 #endif #ifdef RT movsd 0 * SIZE(B), %xmm2 movhpd 0 * SIZE(B), %xmm2 movsd 1 * SIZE(B), %xmm3 movhpd 1 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm4, %xmm5 addpd %xmm6, %xmm7 #endif #ifdef LN subl $4 * SIZE, CO1 #endif movsd %xmm5, 0 * SIZE(CO1) movhpd %xmm5, 1 * SIZE(CO1) movsd %xmm7, 2 * SIZE(CO1) movhpd %xmm7, 3 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm5, 0 * SIZE(B) movapd %xmm7, 2 * SIZE(B) movsd %xmm5, 0 * SIZE(BB) movsd %xmm5, 1 * SIZE(BB) movhpd %xmm5, 2 * SIZE(BB) movhpd %xmm5, 3 * SIZE(BB) movsd %xmm7, 4 * SIZE(BB) movsd %xmm7, 5 * SIZE(BB) movhpd %xmm7, 6 * SIZE(BB) movhpd %xmm7, 7 * SIZE(BB) #else 
movapd %xmm5, 0 * SIZE(AA) movapd %xmm7, 2 * SIZE(AA) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L10 .L99: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif decl J # j -- jg .L01 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S000066400000000000000000001012471313527062700224000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define ARG_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define ARG_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #ifdef ATOM #define PREFETCH prefetcht0 #define PREFETCHSIZE 84 #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHSIZE (16 * 2) #endif #define B %edi #define LDC %ebp #define AA %edx #define BB %ecx #define CO1 %esi #define ADD1 addps #define ADD2 addps PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC movl OFFSET, %eax #ifdef RN negl %eax #endif movl %eax, KK movl M, %ebx testl %ebx, %ebx jle .L999 subl $-32 * SIZE, A subl $-32 * SIZE, B sall $ZBASE_SHIFT, LDC #ifdef LN movl M, %eax sall $ZBASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $ZBASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax movl %eax, J sarl $1, J jle .L100 ALIGN_4 .L01: #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx andl $1, %ebx jle .L30 #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $ZBASE_SHIFT, %eax addl %eax, AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB #endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L42 ALIGN_4 .L41: addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -30 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd 
$0xff, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -26 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -12 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -22 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -8 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -18 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps 0 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -16 * SIZE(AA), %xmm0 subl $-16 * SIZE, AA subl $-32 * SIZE, BB decl %eax jne .L41 ALIGN_4 .L42: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L44 ALIGN_4 .L43: addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -30 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L43 ALIGN_4 .L44: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), BB #endif addps %xmm2, %xmm6 addps %xmm3, %xmm7 pshufd $0xb1, %xmm5, %xmm5 pcmpeqb %xmm0, %xmm0 pshufd $0xb1, %xmm7, %xmm7 psllq $63, %xmm0 #ifndef CONJ shufps $0xb1, %xmm0, %xmm0 pxor %xmm0, %xmm5 pxor %xmm0, %xmm7 #else #if defined(LN) || defined(LT) pxor %xmm0, %xmm4 pxor %xmm0, %xmm6 #else pxor %xmm0, %xmm5 pxor %xmm0, %xmm7 #endif #endif addps %xmm5, %xmm4 addps %xmm7, %xmm6 #if defined(LN) || defined(LT) unpcklpd %xmm6, %xmm4 movaps -32 * SIZE(BB), %xmm2 subps %xmm4, %xmm2 #else movsd -32 * SIZE(AA), %xmm1 movsd -30 * SIZE(AA), %xmm5 subps %xmm4, %xmm1 subps %xmm6, %xmm5 #endif #if defined(LN) || defined(LT) movaps -32 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif 
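/*
   Note on the idiom used just below (and throughout the .L44/.L14 tile
   solves in this file): this appears to be the usual packed complex
   multiply of a computed solution element by a diagonal entry of the
   triangular panel.  The diagonal entries are expected to arrive already
   inverted from the trsm packing step (the GotoBLAS-style convention), so
   the "divide" of the back-substitution is carried out as a multiply.
   With illustrative names (xr,xi) for the solution element and (dr,di)
   for the stored diagonal value -- these names are not symbols from this
   file -- one pshufd/xorps/mulps/addps group computes roughly:

       xr_new = dr*xr - di*xi;      // real part of x*d
       xi_new = dr*xi + di*xr;      // imaginary part of x*d

   pshufd $0x44 / $0x11 broadcast (dr,di) and its swapped copy,
   pshufd $0xa0 / $0xf5 duplicate the real and imaginary halves of x, and
   the xorps with the sign mask built from pcmpeqb + psllq $63 flips the
   sign of one half so that the CONJ variants reuse the same instruction
   sequence.
*/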
mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm5 subps %xmm2, %xmm5 movaps -28 * SIZE(BB), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 #endif #ifdef RT movaps -28 * SIZE(BB), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm1 subps %xmm2, %xmm1 movaps -32 * SIZE(BB), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, -32 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO1, LDC) #else movlps %xmm1, -32 * SIZE(AA) movlps %xmm5, -30 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) movlps %xmm5, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L30: movl M, %ebx sarl $1, %ebx jle .L99 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 #ifdef LN pxor %xmm4, %xmm4 prefetcht0 -4 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 -4 * SIZE(CO1, LDC) pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #else pxor %xmm4, %xmm4 prefetcht0 3 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 3 * SIZE(CO1, LDC) pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L11: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 
pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -8 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 subl $-32 * SIZE, BB pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 subl $-32 * SIZE, AA pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -32 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -32 * SIZE(AA), %xmm0 decl %eax jne .L11 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 ALIGN_4 .L13: ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L13 ALIGN_4 .L14: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB #endif ADD2 %xmm2, %xmm7 pcmpeqb %xmm0, %xmm0 ADD1 %xmm3, %xmm6 psllq $63, %xmm0 #ifndef CONJ pxor %xmm0, %xmm4 pxor %xmm0, %xmm6 shufps $0xb1, %xmm0, %xmm0 #else #if defined(LN) || defined(LT) pxor %xmm0, %xmm5 pxor %xmm0, %xmm7 #else pshufd $0xb1, %xmm0, %xmm1 pxor %xmm1, %xmm5 pxor %xmm1, %xmm7 #endif #endif haddps %xmm5, %xmm4 haddps %xmm7, %xmm6 shufps $0xd8, %xmm4, %xmm4 shufps $0xd8, %xmm6, %xmm6 movaps %xmm4, %xmm5 shufps $0xe4, %xmm6, %xmm4 shufps $0xe4, %xmm5, %xmm6 #if defined(LN) || defined(LT) movaps %xmm4, %xmm5 unpcklpd %xmm6, %xmm4 unpckhpd %xmm6, %xmm5 movaps -32 * SIZE(BB), %xmm2 movaps -28 * SIZE(BB), %xmm3 subps %xmm4, %xmm2 subps %xmm5, %xmm3 #else movaps -32 * SIZE(AA), %xmm1 movaps -28 * SIZE(AA), %xmm5 subps %xmm4, %xmm1 subps %xmm6, %xmm5 #endif #ifdef LN movaps -28 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 
#else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm2 subps %xmm1, %xmm2 movaps -32 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm3 subps %xmm1, %xmm3 movaps -28 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm5 subps %xmm2, %xmm5 movaps -28 * SIZE(BB), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 #endif #ifdef RT movaps -28 * SIZE(BB), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm1 subps %xmm2, %xmm1 movaps -32 * SIZE(BB), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, -32 * SIZE(BB) movaps %xmm3, -28 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO1, LDC) movhps %xmm3, 2 * SIZE(CO1, LDC) #else movaps %xmm1, -32 * SIZE(AA) movaps %xmm5, -28 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm5, 0 * SIZE(CO1, LDC) movhps %xmm5, 2 * SIZE(CO1, LDC) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx jg .L10 ALIGN_4 .L99: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax addl 
%eax, B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L01 ALIGN_4 .L100: movl N, %eax andl $1, %eax jle .L999 #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx andl $1, %ebx jle .L130 #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $ZBASE_SHIFT, %eax addl %eax, AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB #endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movsd -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L142 ALIGN_4 .L141: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -30 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -26 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -26 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -22 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -22 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -18 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -18 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -16 * SIZE(AA), %xmm0 subl $-16 * SIZE, AA subl $-16 * SIZE, BB decl %eax jne .L141 ALIGN_4 .L142: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L144 ALIGN_4 .L143: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -30 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L143 ALIGN_4 .L144: #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), BB #endif addps %xmm2, %xmm4 addps %xmm3, %xmm5 pshufd $0xb1, %xmm5, %xmm5 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #ifndef CONJ shufps $0xb1, %xmm0, %xmm0 pxor %xmm0, %xmm5 #else #if defined(LN) || defined(LT) pxor %xmm0, %xmm4 #else pxor %xmm0, %xmm5 #endif 
#endif addps %xmm5, %xmm4 #if defined(LN) || defined(LT) movsd -32 * SIZE(BB), %xmm2 subps %xmm4, %xmm2 #else movsd -32 * SIZE(AA), %xmm1 subps %xmm4, %xmm1 #endif #if defined(LN) || defined(LT) movaps -32 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #if defined(RN) || defined(RT) movaps -32 * SIZE(BB), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, -32 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) #else movlps %xmm1, -32 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (BB, %eax, 1), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L130: movl M, %ebx sarl $1, %ebx jle .L149 ALIGN_4 .L110: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movsd -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 movhps -30 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 #ifdef LN prefetcht0 -4 * SIZE(CO1) #else prefetcht0 3 * SIZE(CO1) #endif pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L112 ALIGN_4 .L111: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -16 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -12 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -8 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -4 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps 0 * SIZE(AA), %xmm0 subl $-32 * SIZE, AA subl $-16 * SIZE, BB decl %eax jne .L111 ALIGN_4 .L112: #if defined(LT) || 
defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L114 ALIGN_4 .L113: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L113 ALIGN_4 .L114: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB #endif addps %xmm2, %xmm4 addps %xmm3, %xmm5 pshufd $0xb1, %xmm5, %xmm5 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #ifndef CONJ shufps $0xb1, %xmm0, %xmm0 pxor %xmm0, %xmm5 #else #if defined(LN) || defined(LT) pxor %xmm0, %xmm4 #else pxor %xmm0, %xmm5 #endif #endif addps %xmm5, %xmm4 #if defined(LN) || defined(LT) movaps %xmm4, %xmm5 unpcklpd %xmm6, %xmm4 unpckhpd %xmm6, %xmm5 movsd -32 * SIZE(BB), %xmm2 movsd -30 * SIZE(BB), %xmm3 subps %xmm4, %xmm2 subps %xmm5, %xmm3 #else movaps -32 * SIZE(AA), %xmm1 subps %xmm4, %xmm1 #endif #ifdef LN movaps -28 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm2 subps %xmm1, %xmm2 movaps -32 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm3 subps %xmm1, %xmm3 movaps -28 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 #endif #if defined(RN) || defined(RT) movaps -32 * SIZE(BB), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, -32 * SIZE(BB) movlps %xmm3, -30 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) #else movaps %xmm1, -32 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (BB, %eax, 1), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- 
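/*
   End of one 2-row tile for the single remaining column (the N-odd tail
   handled under .L100).  The block just above appears to do the per-tile
   bookkeeping that every tile in this file repeats: for the LT/RN
   variants AA and BB are advanced past the K-KK part of the panels that
   this tile did not consume, KK is moved by 2 (the M unrolling) in the
   direction implied by LN/LT, and AORIG is stepped for RT.  A rough
   C-level view of the loop being closed here (i and m are only
   descriptive names, not symbols from this file):

       for (i = m >> 1; i > 0; i--) {      // decl %ebx ; jg .L110
           // GEMM-style update over the off-diagonal K range,
           // 2x1 complex tile solve, store to C and to the packed panel,
           // then the pointer/KK adjustments shown just above.
       }
*/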
jg .L110 ALIGN_4 .L149: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/ztrsm_kernel_LN_2x2_sse.S000066400000000000000000001146121313527062700216570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_A 24 + STACK + ARGS(%esi) #define STACK_B 28 + STACK + ARGS(%esi) #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) #define POSINV 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 48(%esp) #define KK 52(%esp) #define KKK 56(%esp) #define AORIG 60(%esp) #define BORIG 64(%esp) #define BUFFER 128(%esp) #define B %edi #define LDC %ebp #define AA %edx #define BB %ecx #define CO1 %esi #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch #define PREFETCHW prefetchw #endif #if defined(PENTIUM4) || defined(PENTIUMM) #define PREFETCH prefetcht1 #define PREFETCHSIZE 168 #define PREFETCHW prefetcht0 #endif #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht1 #define PREFETCHSIZE 168 #define PREFETCHW prefetcht0 #endif #if defined(OPTERON) || !defined(HAVE_SSE2) #define movsd movlps #endif #ifdef HAVE_SSE2 #define xorps pxor #endif #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ addps %xmm2, %xmm4; \ movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 72 * SIZE + (address) * 4 * 
SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp # align stack addl $STACK_OFFSET, %esp STACK_TOUCHING movl STACK_M, %ebx movl STACK_N, %eax movl STACK_K, %ecx movl STACK_A, %edx movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movl STACK_B, %edi movl STACK_C, %ebx movss STACK_OFFT, %xmm4 xorps %xmm7, %xmm7 pcmpeqb %xmm7, %xmm7 pslld $31, %xmm7 xorps %xmm2, %xmm2 #ifndef CONJ movss %xmm7, 0 + POSINV movss %xmm2, 4 + POSINV movss %xmm7, 8 + POSINV movss %xmm2, 12 + POSINV #else movss %xmm2, 0 + POSINV movss %xmm7, 4 + POSINV movss %xmm2, 8 + POSINV movss %xmm7, 12 + POSINV #endif EMMS movl %ebx, C movl STACK_LDC, LDC movss %xmm4, OFFSET movss %xmm4, KK sall $ZBASE_SHIFT, LDC #ifdef LN movl M, %eax sall $ZBASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $ZBASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax movl %eax, J sarl $1, J jle .L100 ALIGN_4 .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, %ecx #ifdef RT movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $1 + ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 4), BB #endif #if defined(LT) movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $1, %eax jle .L03 ALIGN_4 .L02: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) pshufd $0x00, %xmm7, %xmm4 pshufd 
$0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) addl $ 8 * SIZE, B addl $32 * SIZE, BB decl %eax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $1, %eax BRANCH jle .L05 ALIGN_4 .L04: movaps 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) addl $ 4 * SIZE, B ALIGN_4 .L05: #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif movl M, %ebx andl $1, %ebx jle .L30 ALIGN_4 .L40: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB # boffset1 = boffset #if defined(LN) || defined(RT) movl KK, %eax sall $3 + ZBASE_SHIFT, %eax addl %eax, BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L42 ALIGN_4 .L41: mulps %xmm0, %xmm2 prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movsd 2 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 mulps 28 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 48 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movsd 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 44 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 64 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movsd 6 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 mulps 60 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 80 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 movaps 68 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movaps 72 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 mulps 76 * SIZE(BB), %xmm1 addps %xmm2, %xmm6 movaps 96 * SIZE(BB), %xmm2 addps %xmm1, %xmm7 movsd 10 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 84 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 88 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 mulps 92 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 112 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movsd 12 * SIZE(AA), %xmm1 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 100 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movaps 104 * SIZE(BB), 
%xmm2 mulps %xmm1, %xmm2 mulps 108 * SIZE(BB), %xmm1 addps %xmm2, %xmm6 movaps 128 * SIZE(BB), %xmm2 addps %xmm1, %xmm7 movsd 14 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 116 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 120 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 mulps 124 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 144 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movsd 24 * SIZE(AA), %xmm1 addl $ 16 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L41 ALIGN_4 .L42: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L44 ALIGN_4 .L43: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movsd 2 * SIZE(AA), %xmm0 addl $ 2 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L43 ALIGN_4 .L44: movaps POSINV, %xmm0 shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm5 xorps %xmm0, %xmm7 #else xorps %xmm0, %xmm4 xorps %xmm0, %xmm6 #endif #else xorps %xmm0, %xmm5 xorps %xmm0, %xmm7 #endif addps %xmm5, %xmm4 addps %xmm7, %xmm6 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) unpcklpd %xmm6, %xmm4 movaps 0 * SIZE(B), %xmm2 subps %xmm4, %xmm2 #else #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(AA), %xmm1 #ifdef movsd xorps %xmm5, %xmm5 #endif movsd 2 * SIZE(AA), %xmm5 subps %xmm4, %xmm1 subps %xmm6, %xmm5 #endif #if defined(LN) || defined(LT) movaps 0 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #ifdef RN movaps 0 * SIZE(B), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm5 subps %xmm2, %xmm5 movaps 4 * SIZE(B), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 #endif #ifdef RT movaps 4 * SIZE(B), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm1 subps %xmm2, %xmm1 movaps 0 * SIZE(B), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps 
%xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, 0 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm5 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm5, 12 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO1, LDC) #else movlps %xmm1, 0 * SIZE(AA) movlps %xmm5, 2 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) movlps %xmm5, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L30: movl M, %ebx sarl $1, %ebx jle .L99 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB # boffset1 = boffset #if defined(LN) || defined(RT) movl KK, %eax sall $3 + ZBASE_SHIFT, %eax addl %eax, BB #endif movaps 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movaps 16 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movaps 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 PREFETCHW -4 * SIZE(CO1) PREFETCHW -4 * SIZE(CO1, LDC) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L11: KERNEL1(0 * 16) KERNEL2(0 * 16) KERNEL3(0 * 16) KERNEL4(0 * 16) KERNEL5(0 * 16) KERNEL6(0 * 16) KERNEL7(0 * 16) KERNEL8(0 * 16) addl $ 32 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L11 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 ALIGN_4 .L13: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 4 * SIZE(AA), %xmm0 addl $ 4 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L13 ALIGN_4 .L14: movaps POSINV, %xmm0 shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm5 xorps %xmm0, %xmm7 #else xorps %xmm0, %xmm4 xorps %xmm0, %xmm6 #endif #else xorps %xmm0, %xmm5 xorps %xmm0, %xmm7 #endif addps %xmm5, %xmm4 addps %xmm7, %xmm6 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm5 unpcklpd %xmm6, %xmm4 unpckhpd %xmm6, %xmm5 movaps 0 * SIZE(B), %xmm2 movaps 4 * SIZE(B), %xmm3 subps %xmm4, %xmm2 subps %xmm5, %xmm3 #else movaps 0 * SIZE(AA), %xmm1 movaps 4 * SIZE(AA), %xmm5 subps %xmm4, %xmm1 subps %xmm6, %xmm5 #endif #ifdef LN movaps 4 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, 
%xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm2 subps %xmm1, %xmm2 movaps 0 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm3 subps %xmm1, %xmm3 movaps 4 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 #endif #ifdef RN movaps 0 * SIZE(B), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm5 subps %xmm2, %xmm5 movaps 4 * SIZE(B), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 #endif #ifdef RT movaps 4 * SIZE(B), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm1 subps %xmm2, %xmm1 movaps 0 * SIZE(B), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, 0 * SIZE(B) movaps %xmm3, 4 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm5 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm5, 12 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm4 pshufd $0xff, %xmm3, %xmm5 movaps %xmm0, 16 * SIZE(BB) movaps %xmm1, 20 * SIZE(BB) movaps %xmm4, 24 * SIZE(BB) movaps %xmm5, 28 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO1, LDC) movhps %xmm3, 2 * SIZE(CO1, LDC) #else movaps %xmm1, 0 * SIZE(AA) movaps %xmm5, 4 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm5, 0 * 
SIZE(CO1, LDC) movhps %xmm5, 2 * SIZE(CO1, LDC) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx jg .L10 ALIGN_4 .L99: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L01 ALIGN_4 .L100: movl N, %eax andl $1, %eax jle .L999 ALIGN_4 .L101: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, %ecx #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 4), BB #endif #if defined(LT) movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L103 ALIGN_4 .L102: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) addl $ 8 * SIZE, B addl $32 * SIZE, BB decl %eax jne .L102 ALIGN_4 .L103: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L105 ALIGN_4 .L104: #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) addl $ 2 * SIZE, %edi addl $ 8 * SIZE, %ecx decl %eax jne .L104 ALIGN_4 .L105: #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif movl M, %ebx andl $1, %ebx jle .L130 #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB # boffset1 = boffset #if defined(LN) || defined(RT) movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB #endif #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movaps 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L142 ALIGN_4 .L141: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, 
%xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 addl $ 16 * SIZE, AA addl $ 64 * SIZE, BB decl %eax jne .L141 ALIGN_4 .L142: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L144 ALIGN_4 .L143: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movsd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L143 ALIGN_4 .L144: addps %xmm6, %xmm4 addps %xmm7, %xmm5 movaps POSINV, %xmm0 shufps $0xb1, %xmm5, %xmm5 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm4 #endif #else xorps %xmm0, %xmm5 #endif addps %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax addl %eax, AA addl %eax, B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 0 * SIZE(B), %xmm2 subps %xmm4, %xmm2 #else #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(AA), %xmm1 subps %xmm4, %xmm1 #endif #if defined(LN) || defined(LT) movaps 0 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #if defined(RN) || defined(RT) movaps 0 * SIZE(B), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) #else movlps %xmm1, 0 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L130: movl M, %ebx sarl $1, %ebx jle .L149 ALIGN_4 .L110: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB # boffset1 = boffset #if defined(LN) || defined(RT) movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movaps 0 * SIZE(AA), 
%xmm0 movaps 16 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 PREFETCHW -4 * SIZE(CO1) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L112 ALIGN_4 .L111: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 8 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movaps 12 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movaps 32 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movaps 20 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movaps 24 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 28 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 48 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 addl $ 32 * SIZE, AA addl $ 64 * SIZE, BB decl %eax jne .L111 ALIGN_4 .L112: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L114 ALIGN_4 .L113: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 addl $ 4 * SIZE, AA addl $ 8 * SIZE, BB decl %eax jg .L113 ALIGN_4 .L114: addps %xmm6, %xmm4 addps %xmm7, %xmm5 movaps POSINV, %xmm0 shufps $0xb1, %xmm5, %xmm5 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm4 #endif #else xorps %xmm0, %xmm5 #endif addps %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm5 unpcklpd %xmm6, %xmm4 unpckhpd %xmm6, %xmm5 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 0 * SIZE(B), %xmm2 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 2 * SIZE(B), %xmm3 subps %xmm4, %xmm2 subps %xmm5, %xmm3 #else movaps 0 * SIZE(AA), %xmm1 subps %xmm4, %xmm1 #endif #ifdef LN movaps 4 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm2 subps %xmm1, %xmm2 movaps 0 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm5 
pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm3 subps %xmm1, %xmm3 movaps 4 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 #endif #if defined(RN) || defined(RT) movaps 0 * SIZE(B), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) movlps %xmm3, 2 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 8 * SIZE(BB) movaps %xmm1, 12 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) #else movaps %xmm1, 0 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L110 ALIGN_4 .L149: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: EMMS movl OLD_STACK, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/ztrsm_kernel_LN_4x1_sse.S000066400000000000000000001005001313527062700216470ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if !defined(HAVE_SSE) || !defined(HAVE_MMX) #error You have to check your configuration. #endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_A 24 + STACK + ARGS(%esi) #define STACK_B 28 + STACK + ARGS(%esi) #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) #define POSINV 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 48(%esp) #define KK 52(%esp) #define KKK 56(%esp) #define AORIG 60(%esp) #define BORIG 64(%esp) #define BUFFER 128(%esp) #define B %edi #define LDC %ebp #define AA %edx #define BB %ecx #define CO1 %esi #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #if !defined(HAVE_SSE2) || defined(OPTERON) #define movsd movlps #endif #ifdef HAVE_SSE2 #define xorps pxor #endif PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp # align stack addl $STACK_OFFSET, %esp STACK_TOUCHING movl STACK_M, %ebx movl STACK_N, %eax movl STACK_K, %ecx movl STACK_A, %edx movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movl STACK_B, %edi movl STACK_C, %ebx movss STACK_OFFT, %xmm4 #ifndef CONJ movl $0x80000000, 0 + POSINV movl $0x00000000, 4 + POSINV movl $0x80000000, 8 + POSINV movl $0x00000000, 12 + POSINV #else movl $0x00000000, 0 + POSINV movl $0x80000000, 4 + POSINV movl $0x00000000, 8 + POSINV movl $0x80000000, 12 + POSINV #endif movl %ebx, C movl STACK_LDC, LDC movss %xmm4, OFFSET movss %xmm4, KK sall $ZBASE_SHIFT, LDC #ifdef LN movl M, %eax sall $ZBASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $ZBASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax movl %eax, J # j = n testl %eax, %eax jle .L999 .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 4), BB #endif #if defined(LT) movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L03 .L02: 
movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 movss 2 * SIZE(B), %xmm2 movss 3 * SIZE(B), %xmm3 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movss 4 * SIZE(B), %xmm0 movss 5 * SIZE(B), %xmm1 movss 6 * SIZE(B), %xmm2 movss 7 * SIZE(B), %xmm3 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 movaps %xmm0, 16 * SIZE(BB) movaps %xmm1, 20 * SIZE(BB) movaps %xmm2, 24 * SIZE(BB) movaps %xmm3, 28 * SIZE(BB) prefetcht0 104 * SIZE(B) addl $ 8 * SIZE, B addl $32 * SIZE, BB decl %eax jne .L02 .L03: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L05 .L04: movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) addl $2 * SIZE, B addl $8 * SIZE, BB decl %eax jne .L04 ALIGN_4 .L05: #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif movl M, %ebx testl $1, %ebx jle .L50 #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L72 ALIGN_4 .L71: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 16 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 12 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 20 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 40 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 48 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 44 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 52 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 14 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 72 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L71 ALIGN_2 .L72: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax je .L74 .L73: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 addl $2 * SIZE, AA # aoffset += 8 addl $8 * 
SIZE, BB # boffset1 += 8 decl %eax jg .L73 .L74: movaps POSINV, %xmm0 shufps $0xb1, %xmm5, %xmm5 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm4 #endif #else xorps %xmm0, %xmm5 #endif addps %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #ifdef movsd xorps %xmm5, %xmm5 #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm5 #else movsd 0 * SIZE(AA), %xmm5 #endif subps %xmm4, %xmm5 #ifdef movsd xorps %xmm1, %xmm1 #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(AA), %xmm1 #else movsd 0 * SIZE(B), %xmm1 #endif movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm5, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps POSINV, %xmm5 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm5 addps %xmm3, %xmm5 #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm5, 0 * SIZE(B) movaps %xmm5, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm5, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) #else movlps %xmm5, 0 * SIZE(AA) #endif movlps %xmm5, 0 * SIZE(CO1) #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L50: movl M, %ebx testl $2, %ebx jle .L70 #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L52 ALIGN_4 .L51: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 12 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 24 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movaps 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 mulps 20 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 12 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 28 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 24 * SIZE(AA), %xmm1 mulps %xmm0, %xmm2 mulps 36 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 48 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 20 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 44 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 56 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 mulps 52 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 64 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 72 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 40 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L51 ALIGN_4 .L52: #if defined(LT) || defined(RN) movl KK, %eax #else 
movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L54 .L53: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA # aoffset += 8 addl $8 * SIZE, BB # boffset1 += 8 decl %eax jg .L53 .L54: addps %xmm6, %xmm4 addps %xmm7, %xmm5 movaps POSINV, %xmm0 shufps $0xb1, %xmm5, %xmm5 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm4 #endif #else xorps %xmm0, %xmm5 #endif addps %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm5 movhps 2 * SIZE(B), %xmm5 #else movaps 0 * SIZE(AA), %xmm5 #endif subps %xmm4, %xmm5 #if defined(LN) || defined(LT) movhlps %xmm5, %xmm4 #endif #ifdef LN #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 6 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm4, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm4, %xmm4 #ifndef CONJ xorps POSINV, %xmm4 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm4 addps %xmm3, %xmm4 movsd 4 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm4, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm4, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm5 subps %xmm3, %xmm5 movsd 0 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm5, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps POSINV, %xmm5 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm5 addps %xmm3, %xmm5 #endif #ifdef LT #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm5, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps POSINV, %xmm5 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm5 addps %xmm3, %xmm5 movsd 2 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm5, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm5, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm4 subps %xmm3, %xmm4 movsd 6 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm4, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm4, %xmm4 #ifndef CONJ xorps POSINV, %xmm4 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm4 addps %xmm3, %xmm4 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(B), %xmm1 movhps 2 * SIZE(B), %xmm1 movaps %xmm1, %xmm2 shufps $0x44, %xmm2, %xmm2 movaps %xmm1, %xmm3 shufps $0x11, %xmm2, %xmm3 movaps %xmm5, %xmm4 shufps $0xa0, %xmm4, %xmm4 shufps $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm4 #endif mulps %xmm2, %xmm4 mulps %xmm3, %xmm5 addps %xmm4, %xmm5 #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlhps %xmm4, %xmm5 movsd %xmm5, 0 * SIZE(B) movhps %xmm5, 2 * SIZE(B) #ifdef HAVE_SSE2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd 
$0xaa, %xmm5, %xmm2 pshufd $0xff, %xmm5, %xmm3 #else movaps %xmm5, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm5, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm5, %xmm2 shufps $0xaa, %xmm2, %xmm2 movaps %xmm5, %xmm3 shufps $0xff, %xmm3, %xmm3 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) #else movaps %xmm5, 0 * SIZE(AA) #endif movsd %xmm5, 0 * SIZE(CO1) movhps %xmm5, 2 * SIZE(CO1) #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + ZBASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L70: movl M, %ebx sarl $2, %ebx jle .L99 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $2 + ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax prefetcht0 8 * SIZE(CO1) je .L12 ALIGN_4 #define PREFETCHSIZE 48 .L11: #ifdef CORE_KATMAI prefetcht0 PREFETCHSIZE * SIZE(AA) #endif mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 0 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 16 * SIZE(AA), %xmm0 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulps %xmm1, %xmm3 mulps 12 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 8 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 12 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 12 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 24 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 24 * SIZE(AA), %xmm1 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm0, %xmm2 mulps 20 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 20 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 20 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 24) * SIZE(AA) #endif mulps %xmm1, %xmm3 mulps 28 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 28 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 40 * SIZE(AA), %xmm1 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 32) * SIZE(AA) #endif mulps %xmm0, %xmm2 mulps 36 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 36 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 36 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 48 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 48 * SIZE(AA), %xmm0 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 40) * SIZE(AA) #endif mulps %xmm1, %xmm3 mulps 44 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 44 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 44 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 56 * SIZE(BB), %xmm3 addps 
%xmm1, %xmm7 movaps 56 * SIZE(AA), %xmm1 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 48) * SIZE(AA) #endif mulps %xmm0, %xmm2 mulps 52 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 48 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 52 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 52 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 64 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 64 * SIZE(AA), %xmm0 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 56) * SIZE(AA) #endif mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 60 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 72 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 72 * SIZE(AA), %xmm1 addl $64 * SIZE, BB addl $64 * SIZE, AA decl %eax jne .L11 .L12: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 .L13: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 0 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 8 * SIZE(AA), %xmm0 addl $8 * SIZE, AA # aoffset += 8 addl $8 * SIZE, BB # boffset1 += 8 decl %eax jg .L13 .L14: movaps POSINV, %xmm0 shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm5 xorps %xmm0, %xmm7 #else xorps %xmm0, %xmm4 xorps %xmm0, %xmm6 #endif #else xorps %xmm0, %xmm5 xorps %xmm0, %xmm7 #endif addps %xmm5, %xmm4 addps %xmm7, %xmm6 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax leal (AA, %eax, 4), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm5 movhps 2 * SIZE(B), %xmm5 movsd 4 * SIZE(B), %xmm7 movhps 6 * SIZE(B), %xmm7 #else movaps 0 * SIZE(AA), %xmm5 movaps 4 * SIZE(AA), %xmm7 #endif subps %xmm4, %xmm5 subps %xmm6, %xmm7 #if defined(LN) || defined(LT) movhlps %xmm5, %xmm4 movhlps %xmm7, %xmm6 #endif #ifdef LN #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 30 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm6, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm6, %xmm6 #ifndef CONJ xorps POSINV, %xmm6 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm6 addps %xmm3, %xmm6 movsd 28 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm6, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm6, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm7 subps %xmm3, %xmm7 movsd 26 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm6, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm6, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm4 subps %xmm3, %xmm4 movsd 24 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm6, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm6, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm5 subps %xmm3, %xmm5 movsd 20 * SIZE(AA), %xmm1 movaps %xmm1, 
%xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm7, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps POSINV, %xmm7 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm7 addps %xmm3, %xmm7 movsd 18 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm7, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm7, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm4 subps %xmm3, %xmm4 movsd 16 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm7, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm7, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm5 subps %xmm3, %xmm5 movsd 10 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm4, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm4, %xmm4 #ifndef CONJ xorps POSINV, %xmm4 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm4 addps %xmm3, %xmm4 movsd 8 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm4, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm4, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm5 subps %xmm3, %xmm5 movsd 0 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm5, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps POSINV, %xmm5 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm5 addps %xmm3, %xmm5 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm5, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps POSINV, %xmm5 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm5 addps %xmm3, %xmm5 movsd 2 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm5, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm5, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm4 subps %xmm3, %xmm4 movsd 4 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm5, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm5, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm7 subps %xmm3, %xmm7 movsd 6 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm5, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm5, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm6 subps %xmm3, %xmm6 movsd 10 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm4, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm4, %xmm4 #ifndef CONJ xorps POSINV, %xmm4 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm4 addps %xmm3, %xmm4 movsd 12 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, 
%xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm4, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm4, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm7 subps %xmm3, %xmm7 movsd 14 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm4, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm4, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm6 subps %xmm3, %xmm6 movsd 20 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm7, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps POSINV, %xmm7 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm7 addps %xmm3, %xmm7 movsd 22 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm7, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm7, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm6 subps %xmm3, %xmm6 movsd 30 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm6, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm6, %xmm6 #ifndef CONJ xorps POSINV, %xmm6 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm6 addps %xmm3, %xmm6 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(B), %xmm1 movhps 2 * SIZE(B), %xmm1 #ifdef HAVE_SSE2 pshufd $0x44, %xmm1, %xmm2 pshufd $0x11, %xmm1, %xmm3 pshufd $0xa0, %xmm5, %xmm4 pshufd $0xf5, %xmm5, %xmm5 pshufd $0xa0, %xmm7, %xmm6 pshufd $0xf5, %xmm7, %xmm7 #else movaps %xmm1, %xmm2 shufps $0x44, %xmm2, %xmm2 movaps %xmm1, %xmm3 shufps $0x11, %xmm3, %xmm3 movaps %xmm5, %xmm4 shufps $0xa0, %xmm4, %xmm4 shufps $0xf5, %xmm5, %xmm5 movaps %xmm7, %xmm6 shufps $0xa0, %xmm6, %xmm6 shufps $0xf5, %xmm7, %xmm7 #endif #ifndef CONJ xorps %xmm0, %xmm5 xorps %xmm0, %xmm7 #else xorps %xmm0, %xmm4 xorps %xmm0, %xmm6 #endif mulps %xmm2, %xmm4 mulps %xmm3, %xmm5 mulps %xmm2, %xmm6 mulps %xmm3, %xmm7 addps %xmm4, %xmm5 addps %xmm6, %xmm7 #endif #ifdef LN subl $8 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlhps %xmm4, %xmm5 movlhps %xmm6, %xmm7 movsd %xmm5, 0 * SIZE(B) movhps %xmm5, 2 * SIZE(B) movsd %xmm7, 4 * SIZE(B) movhps %xmm7, 6 * SIZE(B) #ifdef HAVE_SSE2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xaa, %xmm5, %xmm2 pshufd $0xff, %xmm5, %xmm3 #else movaps %xmm5, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm5, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm5, %xmm2 shufps $0xaa, %xmm2, %xmm2 movaps %xmm5, %xmm3 shufps $0xff, %xmm3, %xmm3 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) #ifdef HAVE_SSE2 pshufd $0x00, %xmm7, %xmm0 pshufd $0x55, %xmm7, %xmm1 pshufd $0xaa, %xmm7, %xmm2 pshufd $0xff, %xmm7, %xmm3 #else movaps %xmm7, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm7, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm7, %xmm2 shufps $0xaa, %xmm2, %xmm2 movaps %xmm7, %xmm3 shufps $0xff, %xmm3, %xmm3 #endif movaps %xmm0, 16 * SIZE(BB) movaps %xmm1, 20 * SIZE(BB) movaps %xmm2, 24 * SIZE(BB) movaps %xmm3, 28 * SIZE(BB) #else movaps %xmm5, 0 * SIZE(AA) movaps %xmm7, 4 * SIZE(AA) #endif movlps %xmm5, 0 * SIZE(CO1) movhps %xmm5, 2 * SIZE(CO1) movlps %xmm7, 4 * SIZE(CO1) movhps %xmm7, 6 * SIZE(CO1) 
#ifndef LN addl $8 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L10 ALIGN_2 .L99: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif decl J # j -- jg .L01 ALIGN_2 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/ztrsm_kernel_LT_1x1.S000066400000000000000000000225121313527062700210060ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define KK 0 + STACK(%esp) #define KKK 4 + STACK(%esp) #define AORIG 8 + STACK(%esp) #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_N 8 + STACK + ARGS(%esp) #define STACK_K 12 + STACK + ARGS(%esp) #ifdef DOUBLE #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 24 + STACK + ARGS(%esp) #define STACK_A 32 + STACK + ARGS(%esp) #define STACK_B 36 + STACK + ARGS(%esp) #define STACK_C 40 + STACK + ARGS(%esp) #define STACK_LDC 44 + STACK + ARGS(%esp) #define OFFSET 48 + STACK + ARGS(%esp) #else #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 20 + STACK + ARGS(%esp) #define STACK_A 24 + STACK + ARGS(%esp) #define STACK_B 28 + STACK + ARGS(%esp) #define STACK_C 32 + STACK + ARGS(%esp) #define STACK_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #endif PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE #define M %esi #define K %edi #define A %ebx #define B %ecx #define C %edx #define LDC %ebp movl STACK_K, K movl STACK_LDC, LDC sall $ZBASE_SHIFT, LDC #ifdef LN movl STACK_M, %eax sall $ZBASE_SHIFT, %eax addl %eax, STACK_C imull K, %eax addl %eax, STACK_A #endif #ifdef RT movl STACK_N, %eax sall $ZBASE_SHIFT, %eax imull K, %eax addl %eax, STACK_B movl STACK_N, %eax imull LDC, %eax addl %eax, STACK_C #endif #ifdef RN movl OFFSET, %eax negl %eax movl %eax, KK #endif #ifdef RT movl STACK_N, %eax subl OFFSET, %eax movl %eax, KK #endif cmpl $0, STACK_N jle .L29 cmpl $0, STACK_M jle .L29 ALIGN_4 .L30: #if defined(LT) || defined(RN) movl STACK_A, A #else movl STACK_A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, STACK_B #endif #ifdef RT subl LDC, STACK_C #endif movl STACK_C, C #ifndef RT addl LDC, STACK_C #endif movl STACK_M, M #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif ALIGN_4 .L34: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax sall $ZBASE_SHIFT, %eax movl AORIG, A movl STACK_B, B addl %eax, A addl %eax, B #else movl STACK_B, B #endif fldz fldz fldz fldz FLD 4 * SIZE(B) # B5 FLD 4 * SIZE(A) # A5 FLD 0 * SIZE(B) # B0 FLD 0 * SIZE(A) # A0 #ifdef HAVE_SSE prefetcht2 2 * SIZE(C) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L37 ALIGN_4 #define PREFETCH_OFFSET 40 .L38: #ifdef HAVE_SSE prefetchnta (PREFETCH_OFFSET) * SIZE(B) #if (L2_SIZE == 524288) prefetcht0 (PREFETCH_OFFSET) * SIZE(A) #endif #endif fmul %st, %st(1) FMUL 1 * SIZE(B) fxch %st(1) faddp %st, %st(5) FLD 0 * SIZE(B) fxch %st(1) faddp %st, %st(4) FLD 1 * SIZE(A) fmul %st, %st(1) FMUL 1 * SIZE(B) fxch %st(1) faddp %st, %st(7) FLD 2 * SIZE(B) fxch %st(1) faddp %st, %st(6) FLD 2 * SIZE(A) fmul %st, %st(1) FMUL 3 * SIZE(B) fxch %st(1) faddp %st, %st(5) FLD 2 * SIZE(B) fxch %st(1) faddp %st, %st(4) FLD 3 * SIZE(A) fmul %st, %st(1) FMUL 3 * SIZE(B) fxch %st(1) faddp %st, %st(7) FLD 8 * SIZE(B) fxch %st(1) faddp %st, %st(6) FLD 8 * SIZE(A) fxch %st(2) #ifdef HAVE_SSE #ifdef DOUBLE prefetchnta (PREFETCH_OFFSET + 4) * SIZE(B) #if (L2_SIZE == 524288) prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(A) #endif #endif #endif fmul %st, %st(3) FMUL 5 * SIZE(B) fxch %st(3) faddp %st, %st(5) FLD 4 * SIZE(B) fxch %st(3) faddp %st, %st(4) FLD 5 * SIZE(A) fmul %st, %st(3) FMUL 5 * SIZE(B) fxch %st(3) faddp %st, %st(7) FLD 6 * 
SIZE(B) fxch %st(3) faddp %st, %st(6) FLD 6 * SIZE(A) fmul %st, %st(3) FMUL 7 * SIZE(B) fxch %st(3) faddp %st, %st(5) FLD 6 * SIZE(B) fxch %st(3) faddp %st, %st(4) FLD 7 * SIZE(A) fmul %st, %st(3) FMUL 7 * SIZE(B) fxch %st(3) faddp %st, %st(7) FLD 12 * SIZE(B) fxch %st(3) faddp %st, %st(6) FLD 12 * SIZE(A) fxch %st(2) subl $-8 * SIZE, B subl $-8 * SIZE, A decl %eax jg .L38 ALIGN_4 .L37: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax jle .L43 ALIGN_2 .L54: fmul %st, %st(1) FMUL 1 * SIZE(B) fxch %st(1) faddp %st, %st(5) FLD 0 * SIZE(B) fxch %st(1) faddp %st, %st(4) FLD 1 * SIZE(A) fmul %st, %st(1) FMUL 1 * SIZE(B) fxch %st(1) faddp %st, %st(7) FLD 2 * SIZE(B) fxch %st(1) faddp %st, %st(6) FLD 2 * SIZE(A) addl $2 * SIZE, A addl $2 * SIZE, B decl %eax jg .L54 ALIGN_3 .L43: ffreep %st(0) ffreep %st(0) ffreep %st(0) ffreep %st(0) #if defined(LN) || defined(LT) #ifndef CONJ faddp %st, %st(3) # ctemp3 += ctemp4 fsubp %st, %st(1) # ctemp1 += ctemp2 #else fsubp %st, %st(3) # ctemp1 += ctemp2 faddp %st, %st(1) # ctemp3 += ctemp4 #endif #endif #if defined(RN) || defined(RT) #ifndef CONJ faddp %st, %st(3) # ctemp3 += ctemp4 fsubp %st, %st(1) # ctemp1 += ctemp2 #else fsubrp %st, %st(3) # ctemp1 += ctemp2 faddp %st, %st(1) # ctemp3 += ctemp4 #endif #endif #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax sall $ZBASE_SHIFT, %eax movl AORIG, A movl STACK_B, B addl %eax, A addl %eax, B #endif #if defined(LN) || defined(LT) FLD 0 * SIZE(B) fsubp %st, %st(1) FLD 1 * SIZE(B) fsubp %st, %st(2) #else FLD 0 * SIZE(A) fsubp %st, %st(1) FLD 1 * SIZE(A) fsubp %st, %st(2) #endif #if defined(LN) || defined(LT) FLD 0 * SIZE(A) fmul %st(1), %st FLD 0 * SIZE(A) fmul %st(3), %st FLD 1 * SIZE(A) fmulp %st, %st(3) FLD 1 * SIZE(A) fmulp %st, %st(4) #endif #if defined(RN) || defined(RT) FLD 0 * SIZE(B) fmul %st(1), %st FLD 0 * SIZE(B) fmul %st(3), %st FLD 1 * SIZE(B) fmulp %st, %st(3) FLD 1 * SIZE(B) fmulp %st, %st(4) #endif #ifndef CONJ faddp %st, %st(2) fsubp %st, %st(2) #else fsubp %st, %st(2) faddp %st, %st(2) #endif #ifdef LN subl $2 * SIZE, C #endif #if defined(LN) || defined(LT) FSTU 1 * SIZE(B) fxch %st(1) FSTU 0 * SIZE(B) #else FSTU 1 * SIZE(A) fxch %st(1) FSTU 0 * SIZE(A) #endif FST 0 * SIZE(C) FST 1 * SIZE(C) #ifndef LN addl $2 * SIZE, C #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, A addl %eax, B #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl M jg .L34 ALIGN_2 .L33: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, STACK_B #endif #if defined(LT) || defined(RN) movl B, STACK_B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif decl STACK_N jg .L30 ALIGN_2 .L29: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/ztrsm_kernel_LT_1x1_atom.S000066400000000000000000000216441313527062700220330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 24 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define ARG_B 36 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define ARG_LDC 44 + STACK + ARGS(%esp) #define OFFSET 48 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 8 + 3) #ifndef CONJ #define ADDSD1 addsd #define ADDSD2 addsd #define ADDSD3 addsd #define ADDSD4 subsd #elif defined(LN) || defined(LT) #define ADDSD1 addsd #define ADDSD2 addsd #define ADDSD3 subsd #define ADDSD4 addsd #else #define ADDSD1 addsd #define ADDSD2 subsd #define ADDSD3 addsd #define ADDSD4 addsd #endif #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define CO1 %esi PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC movl OFFSET, %eax #ifdef RN negl %eax #endif movl %eax, KK sall $ZBASE_SHIFT, LDC #ifdef LN movl M, %eax sall $ZBASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $ZBASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax testl %eax, %eax movl %eax, J # j = n jle .L999 ALIGN_4 .L01: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx testl %ebx, %ebx jle .L99 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $ZBASE_SHIFT, %eax addl %eax, AA #endif movl B, BB #if defined(LN) || 
defined(RT) movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 prefetcht0 1 * SIZE(CO1) xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) ADDSD3 %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 0 * SIZE(BB), %xmm0 ADDSD4 %xmm3, %xmm7 mulsd 1 * SIZE(BB), %xmm1 ADDSD1 %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 0 * SIZE(BB), %xmm2 ADDSD2 %xmm1, %xmm5 mulsd 1 * SIZE(BB), %xmm3 ADDSD3 %xmm2, %xmm6 movsd 3 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 2 * SIZE(BB), %xmm0 ADDSD4 %xmm3, %xmm7 mulsd 3 * SIZE(BB), %xmm1 ADDSD1 %xmm0, %xmm4 movsd 4 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 2 * SIZE(BB), %xmm2 ADDSD2 %xmm1, %xmm5 mulsd 3 * SIZE(BB), %xmm3 ADDSD3 %xmm2, %xmm6 movsd 5 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 4 * SIZE(BB), %xmm0 ADDSD4 %xmm3, %xmm7 mulsd 5 * SIZE(BB), %xmm1 ADDSD1 %xmm0, %xmm4 movsd 6 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 4 * SIZE(BB), %xmm2 ADDSD2 %xmm1, %xmm5 mulsd 5 * SIZE(BB), %xmm3 ADDSD3 %xmm2, %xmm6 movsd 7 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 6 * SIZE(BB), %xmm0 ADDSD4 %xmm3, %xmm7 mulsd 7 * SIZE(BB), %xmm1 ADDSD1 %xmm0, %xmm4 movsd 8 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 6 * SIZE(BB), %xmm2 ADDSD2 %xmm1, %xmm5 mulsd 7 * SIZE(BB), %xmm3 addl $8 * SIZE, BB addl $8 * SIZE, AA decl %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: ADDSD3 %xmm2, %xmm6 movsd 1 * SIZE(AA), %xmm2 movaps %xmm0, %xmm1 mulsd 0 * SIZE(BB), %xmm0 ADDSD4 %xmm3, %xmm7 mulsd 1 * SIZE(BB), %xmm1 ADDSD1 %xmm0, %xmm4 movsd 2 * SIZE(AA), %xmm0 movaps %xmm2, %xmm3 mulsd 0 * SIZE(BB), %xmm2 ADDSD2 %xmm1, %xmm5 mulsd 1 * SIZE(BB), %xmm3 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: ADDSD3 %xmm2, %xmm6 ADDSD4 %xmm3, %xmm7 addsd %xmm7, %xmm4 addsd %xmm5, %xmm6 #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BB), %xmm0 movsd 1 * SIZE(BB), %xmm1 #else movsd 0 * SIZE(AA), %xmm0 movsd 1 * SIZE(AA), %xmm1 #endif subsd %xmm4, %xmm0 subsd %xmm6, %xmm1 #if defined(LN) || defined(LT) movsd 0 * SIZE(AA), %xmm6 movaps %xmm0, %xmm5 movsd 1 * SIZE(AA), %xmm7 movaps %xmm1, %xmm4 mulsd %xmm6, %xmm0 mulsd %xmm6, %xmm1 mulsd %xmm7, %xmm5 mulsd %xmm7, %xmm4 ADDSD4 %xmm4, %xmm0 ADDSD3 %xmm5, %xmm1 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(BB), %xmm6 movaps %xmm0, %xmm5 movsd 1 * SIZE(BB), %xmm7 movaps %xmm1, %xmm4 mulsd %xmm6, %xmm0 mulsd %xmm6, %xmm1 mulsd %xmm7, %xmm5 mulsd %xmm7, %xmm4 ADDSD4 %xmm4, %xmm0 ADDSD2 %xmm5, %xmm1 #endif #ifdef LN subl $2 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BB) movsd %xmm1, 1 * SIZE(BB) #else movsd %xmm0, 0 * SIZE(AA) movsd %xmm1, 1 * SIZE(AA) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA addl %eax, BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L10 ALIGN_4 .L99: #ifdef LN movl K, %eax sall 
$ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif decl J # j -- jg .L01 ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S000066400000000000000000000412731313527062700224070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 24 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define ARG_B 36 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define ARG_LDC 44 + STACK + ARGS(%esp) #define OFFSET 48 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 2) #endif #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define CO1 %esi #define ADD1 addpd #define ADD2 addpd PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC movl OFFSET, %eax #ifdef RN negl %eax #endif movl %eax, KK movl M, %ebx testl %ebx, %ebx jle .L999 subl $-16 * SIZE, A subl $-16 * SIZE, B sall $ZBASE_SHIFT, LDC #ifdef LN movl M, %eax sall $ZBASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $ZBASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax sarl $1, %eax movl %eax, J # j = n jle .L100 ALIGN_4 .L01: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx ALIGN_4 .L10: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $ZBASE_SHIFT, %eax addl %eax, AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -16 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 #ifdef LN pxor %xmm4, %xmm4 prefetcht0 -2 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 -2 * SIZE(CO1, LDC) #else pxor %xmm4, %xmm4 prefetcht0 1 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 1 * SIZE(CO1, LDC) #endif pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) ADD1 %xmm3, %xmm6 movaps -14 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 ADD1 %xmm3, %xmm6 movaps -10 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps -8 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 ADD1 %xmm3, %xmm6 movaps -6 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, 
%xmm4 movaps -4 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 ADD1 %xmm3, %xmm6 movaps -2 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps 0 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) ADD1 %xmm3, %xmm6 movaps 2 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps 4 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 ADD1 %xmm3, %xmm6 movaps 6 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps 8 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 ADD1 %xmm3, %xmm6 movaps 10 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps 12 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 ADD1 %xmm3, %xmm6 movaps 14 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps 16 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 subl $-32 * SIZE, BB mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 subl $-16 * SIZE, AA subl $1, %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: ADD1 %xmm3, %xmm6 movaps -14 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), BB #endif ADD1 %xmm3, %xmm6 pcmpeqb %xmm1, %xmm1 ADD2 %xmm2, %xmm7 psllq $63, %xmm1 #ifndef CONJ pshufd $0x40, %xmm1, %xmm0 shufps $0x04, %xmm1, %xmm1 pxor %xmm0, %xmm4 pxor %xmm0, %xmm6 #else #if defined(LN) || defined(LT) pshufd $0x40, %xmm1, %xmm0 #else pshufd $0x04, %xmm1, %xmm0 #endif shufps $0x40, %xmm1, %xmm1 pxor %xmm0, %xmm5 pxor %xmm0, %xmm7 #endif haddpd %xmm5, %xmm4 haddpd %xmm7, %xmm6 #if defined(LN) || defined(LT) movapd -16 * SIZE(BB), %xmm5 movapd -14 * SIZE(BB), %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd -16 * SIZE(AA), %xmm5 movapd -14 * SIZE(AA), %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AA), %xmm2 movddup -15 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm4, %xmm5 addpd %xmm6, %xmm7 #endif #ifdef RN movddup -16 * SIZE(BB), %xmm2 movddup -15 * SIZE(BB), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 movddup -14 * SIZE(BB), %xmm2 movddup -13 * SIZE(BB), %xmm3 movapd %xmm5, 
%xmm4 pshufd $0x4e, %xmm5, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm6 subpd %xmm4, %xmm7 subpd %xmm6, %xmm7 movddup -10 * SIZE(BB), %xmm2 movddup -9 * SIZE(BB), %xmm3 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm7 #endif #ifdef RT movddup -10 * SIZE(BB), %xmm2 movddup -9 * SIZE(BB), %xmm3 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm7 movddup -12 * SIZE(BB), %xmm2 movddup -11 * SIZE(BB), %xmm3 movapd %xmm7, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm6 subpd %xmm4, %xmm5 subpd %xmm6, %xmm5 movddup -16 * SIZE(BB), %xmm2 movddup -15 * SIZE(BB), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LN subl $2 * SIZE, CO1 #endif movlpd %xmm5, 0 * SIZE(CO1) movhpd %xmm5, 1 * SIZE(CO1) movlpd %xmm7, 0 * SIZE(CO1, LDC) movhpd %xmm7, 1 * SIZE(CO1, LDC) #if defined(LN) || defined(LT) movapd %xmm5, -16 * SIZE(BB) movapd %xmm7, -14 * SIZE(BB) #else movapd %xmm5, -16 * SIZE(AA) movapd %xmm7, -14 * SIZE(AA) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L10 ALIGN_4 .L99: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L01 ALIGN_4 .L100: movl N, %eax testl $1, %eax jle .L999 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx ALIGN_4 L110: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $ZBASE_SHIFT, %eax addl %eax, AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -16 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 #ifdef LN prefetcht0 -2 * SIZE(CO1) #else prefetcht0 1 * SIZE(CO1) #endif pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je L115 ALIGN_4 L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm4 movaps -14 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm6 movaps -12 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm4 movaps -10 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm6 movaps -8 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm7 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 ADD1 
%xmm1, %xmm4 movaps -6 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm6 movaps -4 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm4 movaps -2 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm6 movaps 0 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm7 subl $-16 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne L112 ALIGN_4 L115: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je L118 ALIGN_4 L116: pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm4 movaps -14 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg L116 ALIGN_4 L118: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), BB #endif addpd %xmm6, %xmm4 pcmpeqb %xmm1, %xmm1 addpd %xmm7, %xmm5 psllq $63, %xmm1 #ifndef CONJ pshufd $0x40, %xmm1, %xmm0 shufps $0x04, %xmm1, %xmm1 pxor %xmm0, %xmm4 #else #if defined(LN) || defined(LT) pshufd $0x40, %xmm1, %xmm0 #else pshufd $0x04, %xmm1, %xmm0 #endif shufps $0x40, %xmm1, %xmm1 pxor %xmm0, %xmm5 #endif haddpd %xmm5, %xmm4 #if defined(LN) || defined(LT) movapd -16 * SIZE(BB), %xmm5 subpd %xmm4, %xmm5 #else movapd -16 * SIZE(AA), %xmm5 subpd %xmm4, %xmm5 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AA), %xmm2 movddup -15 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #if defined(RN) || defined(RT) movddup -16 * SIZE(BB), %xmm2 movddup -15 * SIZE(BB), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LN subl $2 * SIZE, CO1 #endif movlpd %xmm5, 0 * SIZE(CO1) movhpd %xmm5, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm5, -16 * SIZE(BB) #else movapd %xmm5, -16 * SIZE(AA) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA addl %eax, BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg L110 #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/ztrsm_kernel_LT_1x2_sse2.S000066400000000000000000000617131313527062700217510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA_R 16 + STACK + ARGS(%esi) #define STACK_ALPHA_I 24 + STACK + ARGS(%esi) #define STACK_A 32 + STACK + ARGS(%esi) #define STACK_B 36 + STACK + ARGS(%esi) #define STACK_C 40 + STACK + ARGS(%esi) #define STACK_LDC 44 + STACK + ARGS(%esi) #define STACK_OFFT 48 + STACK + ARGS(%esi) #define POSINV 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define AORIG 56(%esp) #define BORIG 60(%esp) #define BUFFER 128(%esp) #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #if defined(OPTERON) || defined(BARCELONA) #define PREFETCH prefetch #else #define PREFETCH prefetcht0 #endif #define PREFETCHSIZE (8 * 10 + 4) #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define CO1 %esi #ifndef CONJ #define NN #else #if defined(LN) || defined(LT) #define CN #else #define NC #endif #endif #define KERNEL1(address) \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm4; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm3, %xmm6; \ movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm0, %xmm7; \ movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ 
mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm3, %xmm6; \ movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm0, %xmm7; \ movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm2, %xmm6; \ movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm1, %xmm7; \ movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm2, %xmm6; \ movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm1, %xmm7; \ movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp # align stack addl $STACK_OFFSET, %esp STACK_TOUCHING movl STACK_M, %ebx movl STACK_N, %eax movl STACK_K, %ecx movl STACK_A, %edx movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movl STACK_B, B movl STACK_C, %ebx movss STACK_OFFT, %xmm4 pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 # Generate mask pxor %xmm2, %xmm2 movlpd %xmm2, 0 + POSINV movlpd %xmm7, 8 + POSINV movl %ebx, C movl STACK_LDC, LDC movss %xmm4, OFFSET movss %xmm4, KK sall $ZBASE_SHIFT, LDC #ifdef LN movl M, %eax sall $ZBASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $ZBASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax 
imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax sarl $1, %eax movl %eax, J # j = n jle .L100 ALIGN_4 .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $1 + ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 2), BB #endif #if defined(LT) movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $1, %eax jle .L03 ALIGN_4 .L02: prefetchnta 56 * SIZE(B) movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd 2 * SIZE(B), %xmm2 movlpd 3 * SIZE(B), %xmm3 movlpd 4 * SIZE(B), %xmm4 movlpd 5 * SIZE(B), %xmm5 movlpd 6 * SIZE(B), %xmm6 movlpd 7 * SIZE(B), %xmm7 movlpd %xmm0, 0 * SIZE(BB) movlpd %xmm0, 1 * SIZE(BB) movlpd %xmm1, 2 * SIZE(BB) movlpd %xmm1, 3 * SIZE(BB) movlpd %xmm2, 4 * SIZE(BB) movlpd %xmm2, 5 * SIZE(BB) movlpd %xmm3, 6 * SIZE(BB) movlpd %xmm3, 7 * SIZE(BB) movlpd %xmm4, 8 * SIZE(BB) movlpd %xmm4, 9 * SIZE(BB) movlpd %xmm5, 10 * SIZE(BB) movlpd %xmm5, 11 * SIZE(BB) movlpd %xmm6, 12 * SIZE(BB) movlpd %xmm6, 13 * SIZE(BB) movlpd %xmm7, 14 * SIZE(BB) movlpd %xmm7, 15 * SIZE(BB) addl $ 8 * SIZE, B subl $-16 * SIZE, BB decl %eax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $1, %eax BRANCH jle .L05 movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd 2 * SIZE(B), %xmm2 movlpd 3 * SIZE(B), %xmm3 movlpd %xmm0, 0 * SIZE(BB) movlpd %xmm0, 1 * SIZE(BB) movlpd %xmm1, 2 * SIZE(BB) movlpd %xmm1, 3 * SIZE(BB) movlpd %xmm2, 4 * SIZE(BB) movlpd %xmm2, 5 * SIZE(BB) movlpd %xmm3, 6 * SIZE(BB) movlpd %xmm3, 7 * SIZE(BB) addl $4 * SIZE, B ALIGN_4 .L05: #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT leal (, LDC, 2), %eax subl %eax, C #endif movl C, CO1 #ifndef RT leal (, LDC, 2), %eax addl %eax, C #endif movl M, %ebx testl %ebx, %ebx jle .L100 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + ZBASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movapd 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movapd 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifdef LN prefetchw -2 * SIZE(CO1) prefetchw -2 * SIZE(CO1, LDC) #else prefetchw 2 * SIZE(CO1) prefetchw 2 * SIZE(CO1, LDC) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif #if 1 andl $-8, %eax sall $4, %eax je .L15 .L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) cmpl $128 * 1, %eax jle .L12 KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) cmpl $128 * 2, %eax jle .L12 KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) cmpl $128 * 3, %eax jle .L12 KERNEL1(16 * 3) KERNEL2(16 * 3) KERNEL3(16 * 3) KERNEL4(16 * 3) KERNEL5(16 * 3) KERNEL6(16 * 3) KERNEL7(16 * 3) KERNEL8(16 * 3) cmpl $128 * 4, %eax jle .L12 KERNEL1(16 * 4) KERNEL2(16 * 4) 
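/* Annotation (an editor's reading of the unrolled loop, not in the original  */
/* source): each KERNELn(address) macro expands to one k-iteration of the     */
/* complex update, four mulpd/addpd pairs feeding the %xmm4-%xmm7             */
/* accumulators. The .L1X path runs them in blocks of eight; the              */
/* cmpl $128 * n / jle .L12 tests leave the block early once the scaled trip  */
/* count in %eax is covered, and .L12 then advances AA and BB past the        */
/* iterations already consumed.                                               */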
KERNEL3(16 * 4) KERNEL4(16 * 4) KERNEL5(16 * 4) KERNEL6(16 * 4) KERNEL7(16 * 4) KERNEL8(16 * 4) cmpl $128 * 5, %eax jle .L12 KERNEL1(16 * 5) KERNEL2(16 * 5) KERNEL3(16 * 5) KERNEL4(16 * 5) KERNEL5(16 * 5) KERNEL6(16 * 5) KERNEL7(16 * 5) KERNEL8(16 * 5) cmpl $128 * 6, %eax jle .L12 KERNEL1(16 * 6) KERNEL2(16 * 6) KERNEL3(16 * 6) KERNEL4(16 * 6) KERNEL5(16 * 6) KERNEL6(16 * 6) KERNEL7(16 * 6) KERNEL8(16 * 6) cmpl $128 * 7, %eax jle .L12 KERNEL1(16 * 7) KERNEL2(16 * 7) KERNEL3(16 * 7) KERNEL4(16 * 7) KERNEL5(16 * 7) KERNEL6(16 * 7) KERNEL7(16 * 7) KERNEL8(16 * 7) addl $128 * 4 * SIZE, BB addl $128 * 1 * SIZE, AA subl $128 * 8, %eax jg .L1X jmp .L15 .L12: leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB ALIGN_4 #else sarl $3, %eax je .L15 ALIGN_4 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $64 * SIZE, BB addl $16 * SIZE, AA decl %eax jne .L11 ALIGN_4 #endif .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 ALIGN_4 .L13: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movapd 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 8 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L13 ALIGN_4 .L14: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax addl %eax, AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif movapd POSINV, %xmm1 SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm1, %xmm5 xorpd %xmm1, %xmm7 #else xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 #else addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm5 movapd 2 * SIZE(B), %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd 0 * SIZE(AA), %xmm5 movapd 2 * SIZE(AA), %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #endif #ifndef CONJ SHUFPD_1 %xmm1, %xmm1 #endif #if defined(LN) || defined(LT) movlpd 0 * SIZE(AA), %xmm2 movhpd 0 * SIZE(AA), %xmm2 movlpd 1 * SIZE(AA), %xmm3 movhpd 1 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm4, %xmm5 addpd %xmm6, %xmm7 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm2 movhpd 0 * SIZE(B), %xmm2 movlpd 1 * SIZE(B), %xmm3 movhpd 1 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 movlpd 2 * SIZE(B), %xmm2 movhpd 2 * SIZE(B), %xmm2 movlpd 3 * SIZE(B), %xmm3 movhpd 3 * SIZE(B), %xmm3 movapd %xmm5, %xmm4 pshufd $0x4e, %xmm5, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm6 subpd %xmm4, %xmm7 subpd %xmm6, %xmm7 movlpd 6 * SIZE(B), %xmm2 movhpd 6 * SIZE(B), %xmm2 movlpd 7 * SIZE(B), %xmm3 movhpd 7 * SIZE(B), %xmm3 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm7 #endif #ifdef RT movlpd 6 * SIZE(B), %xmm2 movhpd 6 * SIZE(B), %xmm2 movlpd 7 * SIZE(B), %xmm3 movhpd 7 
* SIZE(B), %xmm3 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm7 movlpd 4 * SIZE(B), %xmm2 movhpd 4 * SIZE(B), %xmm2 movlpd 5 * SIZE(B), %xmm3 movhpd 5 * SIZE(B), %xmm3 movapd %xmm7, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm6 subpd %xmm4, %xmm5 subpd %xmm6, %xmm5 movlpd 0 * SIZE(B), %xmm2 movhpd 0 * SIZE(B), %xmm2 movlpd 1 * SIZE(B), %xmm3 movhpd 1 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LN subl $2 * SIZE, CO1 #endif movlpd %xmm5, 0 * SIZE(CO1) movhpd %xmm5, 1 * SIZE(CO1) movlpd %xmm7, 0 * SIZE(CO1, LDC) movhpd %xmm7, 1 * SIZE(CO1, LDC) #if defined(LN) || defined(LT) movapd %xmm5, 0 * SIZE(B) movapd %xmm7, 2 * SIZE(B) movlpd %xmm5, 0 * SIZE(BB) movlpd %xmm5, 1 * SIZE(BB) movhpd %xmm5, 2 * SIZE(BB) movhpd %xmm5, 3 * SIZE(BB) movlpd %xmm7, 4 * SIZE(BB) movlpd %xmm7, 5 * SIZE(BB) movhpd %xmm7, 6 * SIZE(BB) movhpd %xmm7, 7 * SIZE(BB) #else movapd %xmm5, 0 * SIZE(AA) movapd %xmm7, 2 * SIZE(AA) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L10 ALIGN_4 .L99: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L01 ALIGN_4 .L100: movl N, %eax andl $1, %eax jle .L500 ALIGN_4 .L101: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 2), BB #endif #if defined(LT) movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L103 ALIGN_4 .L102: prefetchnta 56 * SIZE(B) movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd 2 * SIZE(B), %xmm2 movlpd 3 * SIZE(B), %xmm3 movlpd 4 * SIZE(B), %xmm4 movlpd 5 * SIZE(B), %xmm5 movlpd 6 * SIZE(B), %xmm6 movlpd 7 * SIZE(B), %xmm7 movlpd %xmm0, 0 * SIZE(BB) movlpd %xmm0, 1 * SIZE(BB) movlpd %xmm1, 2 * SIZE(BB) movlpd %xmm1, 3 * SIZE(BB) movlpd %xmm2, 4 * SIZE(BB) movlpd %xmm2, 5 * SIZE(BB) movlpd %xmm3, 6 * SIZE(BB) movlpd %xmm3, 7 * SIZE(BB) movlpd %xmm4, 8 * SIZE(BB) movlpd %xmm4, 9 * SIZE(BB) movlpd %xmm5, 10 * SIZE(BB) movlpd %xmm5, 11 * SIZE(BB) movlpd %xmm6, 12 * SIZE(BB) movlpd %xmm6, 13 * SIZE(BB) movlpd %xmm7, 14 * SIZE(BB) movlpd %xmm7, 15 * SIZE(BB) addl $ 8 * SIZE, B subl $-16 * SIZE, BB decl %eax jne .L102 ALIGN_4 .L103: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L105 ALIGN_4 .L104: movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd %xmm0, 0 * SIZE(BB) movlpd %xmm0, 1 * SIZE(BB) movlpd %xmm1, 2 * SIZE(BB) movlpd %xmm1, 3 * SIZE(BB) addl $2 * SIZE, B addl $4 * SIZE, BB decl %eax jne .L104 ALIGN_4 .L105: #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif movl M, %ebx testl 
%ebx, %ebx jle .L199 ALIGN_4 .L110: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $ZBASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movapd 0 * SIZE(AA), %xmm0 movapd 8 * SIZE(AA), %xmm1 movapd 0 * SIZE(BB), %xmm2 movapd 8 * SIZE(BB), %xmm3 #ifdef LN prefetchw -2 * SIZE(CO1) #else prefetchw 2 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L112 ALIGN_4 .L111: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 10 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd 18 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 20 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movapd 10 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 22 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 32 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movapd 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 26 * SIZE(BB), %xmm1 addpd %xmm3, %xmm4 movapd 28 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 14 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 30 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 40 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L111 ALIGN_4 .L112: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L114 ALIGN_4 .L113: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L113 ALIGN_4 .L114: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax addl %eax, AA addl %eax, B leal (BB, %eax, 2), BB #endif movapd POSINV, %xmm1 addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 SHUFPD_1 %xmm5, %xmm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm1, %xmm5 #else xorpd %xmm1, %xmm4 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm5, %xmm4 #else addpd %xmm5, %xmm4 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm5 subpd %xmm4, %xmm5 #else movapd 0 * SIZE(AA), %xmm5 subpd %xmm4, %xmm5 #endif #ifndef CONJ SHUFPD_1 %xmm1, %xmm1 #endif #if defined(LN) || defined(LT) movlpd 0 * SIZE(AA), %xmm2 movhpd 0 * SIZE(AA), %xmm2 movlpd 1 * SIZE(AA), %xmm3 movhpd 1 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm2 movhpd 0 * SIZE(B), %xmm2 movlpd 1 * SIZE(B), %xmm3 movhpd 1 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 
mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef RT movlpd 0 * SIZE(B), %xmm2 movhpd 0 * SIZE(B), %xmm2 movlpd 1 * SIZE(B), %xmm3 movhpd 1 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LN subl $2 * SIZE, CO1 #endif movlpd %xmm5, 0 * SIZE(CO1) movhpd %xmm5, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm5, 0 * SIZE(B) movlpd %xmm5, 0 * SIZE(BB) movlpd %xmm5, 1 * SIZE(BB) movhpd %xmm5, 2 * SIZE(BB) movhpd %xmm5, 3 * SIZE(BB) #else movapd %xmm5, 0 * SIZE(AA) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L110 ALIGN_4 .L199: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L500: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/ztrsm_kernel_LT_1x2_sse3.S000066400000000000000000000446411313527062700217530ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 24 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define ARG_B 36 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define ARG_LDC 44 + STACK + ARGS(%esp) #define OFFSET 48 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #ifdef PENTIUM4 #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #ifdef PENTIUMM #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define CO1 %esi #define ADDSUB addpd #define KERNEL1(address) \ mulpd %xmm0, %xmm2; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ addpd %xmm2, %xmm4; \ movddup 1 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ ADDSUB %xmm2, %xmm5; \ movddup 2 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm6; \ movddup 3 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ ADDSUB %xmm2, %xmm7; \ movddup 4 * SIZE + (address) * 2 * SIZE(BB), %xmm2 #define KERNEL2(address) \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm4; \ movddup 5 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ ADDSUB %xmm2, %xmm5; \ movddup 6 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm6; \ movddup 7 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ ADDSUB %xmm2, %xmm7; \ movddup 16 * SIZE + (address) * 2 * SIZE(BB), %xmm2 #define KERNEL3(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movddup 9 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ ADDSUB %xmm3, %xmm5; \ movddup 10 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm6; \ movddup 11 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ ADDSUB %xmm3, %xmm7; \ movddup 12 * SIZE + (address) * 2 * SIZE(BB), %xmm3 #define KERNEL4(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movddup 13 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ ADDSUB %xmm3, %xmm5; \ movddup 14 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm6; \ movddup 15 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ ADDSUB %xmm3, %xmm7; \ movddup 24 * SIZE + (address) * 2 * SIZE(BB), %xmm3 #define KERNEL5(address) \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movddup 17 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ ADDSUB %xmm2, %xmm5; \ movddup 18 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm6; \ movddup 19 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ ADDSUB %xmm2, %xmm7; \ movddup 20 * SIZE + (address) * 2 * SIZE(BB), %xmm2 #define KERNEL6(address) \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movddup 21 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, 
%xmm2; \ ADDSUB %xmm2, %xmm5; \ movddup 22 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm6; \ movddup 23 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ ADDSUB %xmm2, %xmm7 #define KERNEL7(address) \ movddup 32 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movddup 25 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ ADDSUB %xmm3, %xmm5; \ movddup 26 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm6; \ movddup 27 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ ADDSUB %xmm3, %xmm7; \ movddup 28 * SIZE + (address) * 2 * SIZE(BB), %xmm3 #define KERNEL8(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movddup 29 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ ADDSUB %xmm3, %xmm5; \ movddup 30 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm6; \ movddup 31 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ ADDSUB %xmm3, %xmm7; \ movddup 40 * SIZE + (address) * 2 * SIZE(BB), %xmm3 PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC movl OFFSET, %eax #ifdef RN negl %eax #endif movl %eax, KK sall $ZBASE_SHIFT, LDC #ifdef LN movl M, %eax sall $ZBASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $ZBASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax sarl $1, %eax movl %eax, J # j = n jle .L100 ALIGN_4 .L01: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx testl %ebx, %ebx jle .L100 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $ZBASE_SHIFT, %eax addl %eax, AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movddup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movddup 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifdef LN prefetchnta -2 * SIZE(CO1) prefetchnta -2 * SIZE(CO1, LDC, 1) #else prefetchnta 2 * SIZE(CO1) prefetchnta 2 * SIZE(CO1, LDC, 1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L12 ALIGN_4 .L11: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $32 * SIZE, BB addl $16 * SIZE, AA decl %eax jne .L11 ALIGN_4 .L12: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 ALIGN_4 .L13: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 ADDSUB %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * 
SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm7 movddup 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L13 ALIGN_4 .L14: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), BB #endif pcmpeqb %xmm1, %xmm1 psllq $63, %xmm1 shufps $0x40, %xmm1, %xmm1 SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 #ifndef CONJ xorpd %xmm1, %xmm5 xorpd %xmm1, %xmm7 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 #else #if defined(LN) || defined(LT) xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 #else xorpd %xmm1, %xmm5 xorpd %xmm1, %xmm7 #endif addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BB), %xmm5 movapd 2 * SIZE(BB), %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd 0 * SIZE(AA), %xmm5 movapd 2 * SIZE(AA), %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #endif #ifndef CONJ SHUFPD_1 %xmm1, %xmm1 #endif #if defined(LN) || defined(LT) movddup 0 * SIZE(AA), %xmm2 movddup 1 * SIZE(AA), %xmm3 movapd %xmm5, %xmm4 movapd %xmm7, %xmm6 SHUFPD_1 %xmm4, %xmm4 SHUFPD_1 %xmm6, %xmm6 xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm4, %xmm5 addpd %xmm6, %xmm7 #endif #ifdef RN movddup 0 * SIZE(BB), %xmm2 movddup 1 * SIZE(BB), %xmm3 movapd %xmm5, %xmm4 SHUFPD_1 %xmm4, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 movddup 2 * SIZE(BB), %xmm2 movddup 3 * SIZE(BB), %xmm3 movapd %xmm5, %xmm4 movapd %xmm5, %xmm6 SHUFPD_1 %xmm6, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm6 subpd %xmm4, %xmm7 subpd %xmm6, %xmm7 movddup 6 * SIZE(BB), %xmm2 movddup 7 * SIZE(BB), %xmm3 movapd %xmm7, %xmm6 SHUFPD_1 %xmm6, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm7 #endif #ifdef RT movddup 6 * SIZE(BB), %xmm2 movddup 7 * SIZE(BB), %xmm3 movapd %xmm7, %xmm6 SHUFPD_1 %xmm6, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm7 movddup 4 * SIZE(BB), %xmm2 movddup 5 * SIZE(BB), %xmm3 movapd %xmm7, %xmm4 movapd %xmm7, %xmm6 SHUFPD_1 %xmm6, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm6 subpd %xmm4, %xmm5 subpd %xmm6, %xmm5 movddup 0 * SIZE(BB), %xmm2 movddup 1 * SIZE(BB), %xmm3 movapd %xmm5, %xmm4 SHUFPD_1 %xmm4, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LN subl $2 * SIZE, CO1 #endif movlpd %xmm5, 0 * SIZE(CO1) movhpd %xmm5, 1 * SIZE(CO1) movlpd %xmm7, 0 * SIZE(CO1, LDC) movhpd %xmm7, 1 * SIZE(CO1, LDC) #if defined(LN) || defined(LT) movapd %xmm5, 0 * SIZE(BB) movapd %xmm7, 2 * SIZE(BB) #else movapd %xmm5, 0 * SIZE(AA) movapd %xmm7, 2 * SIZE(AA) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L10 ALIGN_4 .L99: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L01 ALIGN_4 .L100: movl N, %eax testl $1, %eax jle .L500 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl 
LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx testl %ebx, %ebx jle .L500 ALIGN_4 L110: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $ZBASE_SHIFT, %eax addl %eax, AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movddup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movddup 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifdef LN prefetchnta -2 * SIZE(CO1) #else prefetchnta 2 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je L112 ALIGN_4 L111: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 4 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm7 movddup 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 5 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 6 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm5 movddup 6 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 7 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 16 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm7 movddup 16 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 9 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 10 * SIZE(AA), %xmm1 ADDSUB %xmm3, %xmm5 movddup 10 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 11 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 12 * SIZE(AA), %xmm1 ADDSUB %xmm3, %xmm7 movddup 12 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 13 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 14 * SIZE(AA), %xmm1 ADDSUB %xmm3, %xmm5 movddup 14 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 15 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 24 * SIZE(AA), %xmm1 ADDSUB %xmm3, %xmm7 movddup 24 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $16 * SIZE, BB decl %eax jne L111 ALIGN_4 L112: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je L114 ALIGN_4 L113: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg L113 ALIGN_4 L114: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), BB #endif pcmpeqb %xmm1, %xmm1 psllq $63, %xmm1 shufps $0x40, %xmm1, %xmm1 SHUFPD_1 %xmm5, %xmm5 #ifndef CONJ xorpd %xmm1, %xmm5 subpd %xmm5, %xmm4 #else #if defined(LN) || defined(LT) xorpd %xmm1, %xmm4 #else xorpd %xmm1, %xmm5 #endif addpd %xmm5, %xmm4 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BB), %xmm5 subpd %xmm4, %xmm5 #else movapd 0 * SIZE(AA), %xmm5 subpd %xmm4, %xmm5 #endif #ifndef CONJ SHUFPD_1 %xmm1, %xmm1 #endif #if defined(LN) || defined(LT) movddup 0 * SIZE(AA), %xmm2 movddup 1 * SIZE(AA), %xmm3 movapd %xmm5, %xmm4 SHUFPD_1 %xmm4, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #if defined(RN) || defined(RT) movddup 0 * SIZE(BB), %xmm2 movddup 1 * 
SIZE(BB), %xmm3 movapd %xmm5, %xmm4 SHUFPD_1 %xmm4, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LN subl $2 * SIZE, CO1 #endif movlpd %xmm5, 0 * SIZE(CO1) movhpd %xmm5, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm5, 0 * SIZE(BB) #else movapd %xmm5, 0 * SIZE(AA) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA addl %eax, BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg L110 #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L500: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/ztrsm_kernel_LT_2x1_core2.S000066400000000000000000000460551313527062700221110ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCHSIZE (8 * 4) #if !defined(HAVE_SSE2) || !defined(HAVE_MMX) #error You have to check your configuration. 
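/* Annotation (assumption, not in the original source): HAVE_SSE2 and         */
/* HAVE_MMX are presumably supplied through "common.h" from the generated     */
/* build configuration, so the guard above aborts compilation of this Core2   */
/* kernel on targets lacking those instruction sets.                          */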
#endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA_R 16 + STACK + ARGS(%esi) #define STACK_ALPHA_I 24 + STACK + ARGS(%esi) #define STACK_A 32 + STACK + ARGS(%esi) #define STACK_B 36 + STACK + ARGS(%esi) #define STACK_C 40 + STACK + ARGS(%esi) #define STACK_LDC 44 + STACK + ARGS(%esi) #define STACK_OFFT 48 + STACK + ARGS(%esi) #define POSINV 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define AORIG 56(%esp) #define BORIG 60(%esp) #define BUFFER 128(%esp) #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #define B %edi #define LDC %ebp #define AA %edx #define BB %ecx #define CO1 %esi #define ADD1 addpd #define ADD2 addpd #ifndef CONJ #define NN #else #if defined(LN) || defined(LT) #define CN #else #define NC #endif #endif PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp # align stack addl $STACK_OFFSET, %esp STACK_TOUCHING movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 movd STACK_A, %mm2 movl STACK_B, B movd STACK_C, %mm3 movl STACK_LDC, LDC movd STACK_OFFT, %mm4 pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 # Generate mask pxor %xmm2, %xmm2 movsd %xmm2, 0 + POSINV movsd %xmm7, 8 + POSINV movd %mm1, K movl %eax, N movd %mm0, M movd %mm2, A movd %mm3, C movl %esi, OLD_STACK movd %mm4, OFFSET movd %mm4, KK sall $ZBASE_SHIFT, LDC subl $-16 * SIZE, A subl $-16 * SIZE, B #ifdef LN movl M, %eax sall $ZBASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $ZBASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax movl %eax, J # j = n testl %eax, %eax jle .L999 ALIGN_2 .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal 16 * SIZE + BUFFER, BB #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 2), BB #endif #if defined(LT) movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L03 ALIGN_2 .L02: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -12 * SIZE(B), %xmm4 movddup -11 * SIZE(B), %xmm5 movddup -10 * SIZE(B), %xmm6 movddup -9 * SIZE(B), %xmm7 movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) movapd %xmm2, -12 * SIZE(BB) movapd %xmm3, -10 * SIZE(BB) movapd %xmm4, -8 * SIZE(BB) movapd %xmm5, -6 * SIZE(BB) movapd %xmm6, -4 * SIZE(BB) movapd %xmm7, -2 * SIZE(BB) addl $ 8 * SIZE, B subl $-16 * SIZE, BB decl %eax jne .L02 ALIGN_2 .L03: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L05 ALIGN_2 .L04: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movapd %xmm0, -16 * SIZE(BB) movapd %xmm1, -14 * SIZE(BB) addl $ 2 * SIZE, B addl $ 4 * SIZE, BB decl %eax jne .L04 ALIGN_4 .L05: #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT 
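/* Editorial note (added, hedged): CO1 holds the column of C written by this
   j-iteration; C itself is stepped forward by one scaled column stride here
   (LDC was shifted by ZBASE_SHIFT in the prologue), or stepped backward just
   above when RT is defined. */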
addl LDC, C #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L50 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #endif leal 16 * SIZE + BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm3 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #ifdef LN prefetchnta -4 * SIZE(CO1) #else prefetchnta 4 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L12: movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 ADD2 %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd -12 * SIZE(AA), %xmm0 ADD1 %xmm2, %xmm6 ADD2 %xmm1, %xmm7 movapd -12 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm4 movapd -10 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 ADD2 %xmm0, %xmm5 movapd -10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd 0 * SIZE(AA), %xmm0 ADD1 %xmm2, %xmm6 ADD2 %xmm1, %xmm7 movapd -8 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 ADD1 %xmm1, %xmm4 movapd -6 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 ADD2 %xmm3, %xmm5 movapd -6 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 mulpd %xmm3, %xmm1 movapd -4 * SIZE(AA), %xmm3 ADD1 %xmm2, %xmm6 ADD2 %xmm1, %xmm7 movapd -4 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 ADD1 %xmm1, %xmm4 movapd -2 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 ADD2 %xmm3, %xmm5 movapd -2 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 mulpd %xmm3, %xmm1 movapd 8 * SIZE(AA), %xmm3 ADD1 %xmm2, %xmm6 ADD2 %xmm1, %xmm7 movapd 0 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm4 movapd 2 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 ADD2 %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd 4 * SIZE(AA), %xmm0 ADD1 %xmm2, %xmm6 ADD2 %xmm1, %xmm7 movapd 4 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm4 movapd 6 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm0 ADD2 %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm1 movapd 16 * SIZE(AA), %xmm0 ADD1 %xmm2, %xmm6 ADD2 %xmm1, %xmm7 movapd 8 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 ADD1 %xmm1, %xmm4 movapd 10 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 ADD2 %xmm3, %xmm5 movapd 10 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 mulpd %xmm3, %xmm1 ADD1 %xmm2, %xmm6 movapd 12 * SIZE(AA), %xmm3 ADD2 %xmm1, %xmm7 movapd 12 * SIZE(BB), %xmm1 movapd %xmm1, %xmm2 mulpd %xmm3, %xmm1 ADD1 %xmm1, %xmm4 movapd 14 * SIZE(BB), %xmm1 mulpd %xmm1, %xmm3 ADD2 %xmm3, %xmm5 movapd 14 * SIZE(AA), %xmm3 mulpd %xmm3, %xmm2 mulpd %xmm3, %xmm1 subl $-32 * SIZE, BB movapd 24 * SIZE(AA), %xmm3 subl $-32 * SIZE, AA ADD1 %xmm2, %xmm6 ADD2 %xmm1, %xmm7 movapd -16 * SIZE(BB), %xmm1 decl %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 .L16: movapd %xmm1, %xmm2 mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm4 movapd -14 * SIZE(BB), %xmm1 movapd %xmm1, %xmm3 mulpd %xmm0, %xmm1 movapd -14 * SIZE(AA), %xmm0 ADD2 %xmm1, %xmm5 movapd -12 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm2, %xmm6 mulpd %xmm0, %xmm3 movapd -12 * SIZE(AA), %xmm0 ADD2 %xmm3, %xmm7 addl $4 * SIZE, AA addl $4 * SIZE, BB decl 
%eax jg .L16 ALIGN_4 .L14: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA addl %eax, B leal (BB, %eax, 2), BB #endif movapd POSINV, %xmm1 SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm1, %xmm5 xorpd %xmm1, %xmm7 #else xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 #else addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm5 movapd -14 * SIZE(B), %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd -16 * SIZE(AA), %xmm5 movapd -14 * SIZE(AA), %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #endif #ifndef CONJ SHUFPD_1 %xmm1, %xmm1 #endif #ifdef LN movddup -10 * SIZE(AA), %xmm2 movddup -9 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm7 movddup -12 * SIZE(AA), %xmm2 movddup -11 * SIZE(AA), %xmm3 movapd %xmm7, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm6 subpd %xmm4, %xmm5 subpd %xmm6, %xmm5 movddup -16 * SIZE(AA), %xmm2 movddup -15 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LT movddup -16 * SIZE(AA), %xmm2 movddup -15 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 movddup -14 * SIZE(AA), %xmm2 movddup -13 * SIZE(AA), %xmm3 movapd %xmm5, %xmm4 pshufd $0x4e, %xmm5, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm6 subpd %xmm4, %xmm7 subpd %xmm6, %xmm7 movddup -10 * SIZE(AA), %xmm2 movddup -9 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm7 #endif #ifdef RN movddup -16 * SIZE(B), %xmm2 movddup -15 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm4, %xmm5 addpd %xmm6, %xmm7 #endif #ifdef RT movddup -16 * SIZE(B), %xmm2 movddup -15 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm4, %xmm5 addpd %xmm6, %xmm7 #endif #ifdef LN subl $4 * SIZE, CO1 #endif movsd %xmm5, 0 * SIZE(CO1) movhpd %xmm5, 1 * SIZE(CO1) movsd %xmm7, 2 * SIZE(CO1) movhpd %xmm7, 3 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm5, -16 * SIZE(B) movapd %xmm7, -14 * SIZE(B) movddup %xmm5, %xmm4 unpckhpd %xmm5, %xmm5 movddup %xmm7, %xmm6 unpckhpd %xmm7, %xmm7 movapd %xmm4, -16 * SIZE(BB) movapd %xmm5, -14 * SIZE(BB) movapd %xmm6, -12 * SIZE(BB) movapd %xmm7, -10 * SIZE(BB) #else movapd %xmm5, -16 * SIZE(AA) movapd %xmm7, -14 * SIZE(AA) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg 
.L10 .L50: movl M, %ebx testl $1, %ebx je .L99 #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #endif leal 16 * SIZE + BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB #endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd -16 * SIZE(BB), %xmm1 pxor %xmm5, %xmm5 movapd -8 * SIZE(AA), %xmm2 pxor %xmm6, %xmm6 movapd -8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax jle .L52 .L51: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BB), %xmm0 ADD1 %xmm1, %xmm4 movapd -12 * SIZE(BB), %xmm1 ADD2 %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm1 mulpd -10 * SIZE(BB), %xmm0 ADD1 %xmm1, %xmm6 movapd 0 * SIZE(BB), %xmm1 ADD2 %xmm0, %xmm7 movapd -12 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd -6 * SIZE(BB), %xmm0 ADD1 %xmm3, %xmm4 movapd -4 * SIZE(BB), %xmm3 ADD2 %xmm0, %xmm5 movapd -10 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd -2 * SIZE(BB), %xmm0 ADD1 %xmm3, %xmm6 movapd 8 * SIZE(BB), %xmm3 ADD2 %xmm0, %xmm7 movapd 0 * SIZE(AA), %xmm0 mulpd %xmm2, %xmm1 mulpd 2 * SIZE(BB), %xmm2 ADD1 %xmm1, %xmm4 movapd 4 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 movapd -6 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm1 mulpd 6 * SIZE(BB), %xmm2 ADD1 %xmm1, %xmm6 movapd 16 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm7 movapd -4 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm3 mulpd 10 * SIZE(BB), %xmm2 ADD1 %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm5 movapd -2 * SIZE(AA), %xmm2 mulpd %xmm2, %xmm3 mulpd 14 * SIZE(BB), %xmm2 ADD1 %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 movapd 8 * SIZE(AA), %xmm2 subl $-16 * SIZE, AA addl $ 32 * SIZE, BB decl %eax # l-- jg .L51 ALIGN_2 .L52: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # l = (k & 3) jle .L54 ALIGN_2 .L53: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BB), %xmm0 ADD1 %xmm1, %xmm4 movapd -12 * SIZE(BB), %xmm1 ADD2 %xmm0, %xmm5 movapd -14 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax # l-- jg .L53 .L54: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal 16 * SIZE + BUFFER, BB sall $ZBASE_SHIFT, %eax addl %eax, AA addl %eax, B leal (BB, %eax, 2), BB #endif movapd POSINV, %xmm1 SHUFPD_1 %xmm5, %xmm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm1, %xmm5 #else xorpd %xmm1, %xmm4 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm5, %xmm4 #else addpd %xmm5, %xmm4 #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm5 subpd %xmm4, %xmm5 #else movapd -16 * SIZE(AA), %xmm5 subpd %xmm4, %xmm5 #endif #ifndef CONJ SHUFPD_1 %xmm1, %xmm1 #endif #ifdef LN movddup -16 * SIZE(AA), %xmm2 movddup -15 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LT movddup -16 * SIZE(AA), %xmm2 movddup -15 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef RN movddup -16 * SIZE(B), %xmm2 movddup -15 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd 
%xmm4, %xmm5 #endif #ifdef RT movddup -16 * SIZE(B), %xmm2 movddup -15 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LN subl $2 * SIZE, CO1 #endif movsd %xmm5, 0 * SIZE(CO1) movhpd %xmm5, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm5, -16 * SIZE(B) movddup %xmm5, %xmm4 unpckhpd %xmm5, %xmm5 movapd %xmm4, -16 * SIZE(BB) movapd %xmm5, -14 * SIZE(BB) #else movapd %xmm5, -16 * SIZE(AA) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L99: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif decl J # j -- jg .L01 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/ztrsm_kernel_LT_2x1_sse2.S000066400000000000000000000544671313527062700217610ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define PREFETCHSIZE (8 * 4) #if !defined(HAVE_SSE2) || !defined(HAVE_MMX) #error You have to check your configuration. 
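/* Editorial note (added, hedged -- not in the original source): the
   KERNEL1..KERNEL8 macros defined below unroll the inner update by eight
   complex steps; the leading movq of (PREFETCHSIZE + n) * SIZE(AA) into %mm2
   in every other macro appears to serve only as a software prefetch of the
   upcoming A panel -- the MMX value itself is never consumed by the SSE2
   arithmetic. */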
#endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA_R 16 + STACK + ARGS(%esi) #define STACK_ALPHA_I 24 + STACK + ARGS(%esi) #define STACK_A 32 + STACK + ARGS(%esi) #define STACK_B 36 + STACK + ARGS(%esi) #define STACK_C 40 + STACK + ARGS(%esi) #define STACK_LDC 44 + STACK + ARGS(%esi) #define STACK_OFFT 48 + STACK + ARGS(%esi) #define POSINV 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define AORIG 56(%esp) #define BORIG 60(%esp) #define BUFFER 128(%esp) #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #define B %edi #define LDC %ebp #define AA %edx #define BB %ecx #define CO1 %esi #define KERNEL1(address) \ movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm0, %xmm2; \ mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL3(address) \ movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm1, %xmm3; \ mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL4(address) \ mulpd %xmm1, %xmm3; \ mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL5(address) \ movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm0, %xmm2; \ mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL6(address) \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm4; \ movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm5; \ movapd 22 * SIZE + 
(address) * SIZE(AA), %xmm0; \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 #define KERNEL7(address) \ movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ mulpd %xmm1, %xmm3; \ mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulpd %xmm1, %xmm3; \ mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm4; \ movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm5; \ movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ mulpd %xmm1, %xmm3; \ mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 #ifndef CONJ #define NN #else #if defined(LN) || defined(LT) #define CN #else #define NC #endif #endif PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE EMMS movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp # align stack addl $STACK_OFFSET, %esp STACK_TOUCHING movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 movd STACK_A, %mm2 movl STACK_B, B movd STACK_C, %mm3 movl STACK_LDC, LDC movd STACK_OFFT, %mm4 pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 # Generate mask pxor %xmm2, %xmm2 movsd %xmm2, 0 + POSINV movsd %xmm7, 8 + POSINV movd %mm1, K movl %eax, N movd %mm0, M movd %mm2, A movd %mm3, C movl %esi, OLD_STACK movd %mm4, OFFSET movd %mm4, KK sall $ZBASE_SHIFT, LDC #ifdef LN movl M, %eax sall $ZBASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $ZBASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax movl %eax, J # j = n testl %eax, %eax jle .L999 ALIGN_2 .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 2), BB #endif #if defined(LT) movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L03 ALIGN_2 .L02: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 unpcklpd %xmm2, %xmm2 unpcklpd %xmm3, %xmm3 unpcklpd %xmm4, %xmm4 unpcklpd %xmm5, %xmm5 unpcklpd %xmm6, %xmm6 unpcklpd %xmm7, %xmm7 movapd %xmm0, 0 * SIZE(BB) movapd %xmm1, 2 * SIZE(BB) movapd %xmm2, 4 * SIZE(BB) movapd %xmm3, 6 * SIZE(BB) movapd %xmm4, 8 * SIZE(BB) movapd %xmm5, 10 * SIZE(BB) movapd %xmm6, 12 * SIZE(BB) movapd %xmm7, 14 * SIZE(BB) prefetcht0 104 * SIZE(B) addl $ 8 * SIZE, B addl $16 * SIZE, BB decl %eax jne .L02 ALIGN_2 .L03: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, 
%eax #endif andl $3, %eax BRANCH jle .L05 ALIGN_2 .L04: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 unpcklpd %xmm0, %xmm0 unpcklpd %xmm1, %xmm1 movapd %xmm0, 0 * SIZE(BB) movapd %xmm1, 2 * SIZE(BB) addl $ 2 * SIZE, B addl $ 4 * SIZE, BB decl %eax jne .L04 ALIGN_4 .L05: #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) jle .L50 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB #endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm3 pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 #ifdef LN prefetchnta -4 * SIZE(CO1) #else prefetchnta 4 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $-8, %eax NOBRANCH je .L12 sall $3, %eax .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) cmpl $64 * 1, %eax NOBRANCH jle .L11 KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) cmpl $64 * 2, %eax NOBRANCH jle .L11 KERNEL1(32 * 2) KERNEL2(32 * 2) KERNEL3(32 * 2) KERNEL4(32 * 2) KERNEL5(32 * 2) KERNEL6(32 * 2) KERNEL7(32 * 2) KERNEL8(32 * 2) cmpl $64 * 3, %eax NOBRANCH jle .L11 KERNEL1(32 * 3) KERNEL2(32 * 3) KERNEL3(32 * 3) KERNEL4(32 * 3) KERNEL5(32 * 3) KERNEL6(32 * 3) KERNEL7(32 * 3) KERNEL8(32 * 3) cmpl $64 * 4, %eax NOBRANCH jle .L11 KERNEL1(32 * 4) KERNEL2(32 * 4) KERNEL3(32 * 4) KERNEL4(32 * 4) KERNEL5(32 * 4) KERNEL6(32 * 4) KERNEL7(32 * 4) KERNEL8(32 * 4) cmpl $64 * 5, %eax NOBRANCH jle .L11 KERNEL1(32 * 5) KERNEL2(32 * 5) KERNEL3(32 * 5) KERNEL4(32 * 5) KERNEL5(32 * 5) KERNEL6(32 * 5) KERNEL7(32 * 5) KERNEL8(32 * 5) cmpl $64 * 6, %eax NOBRANCH jle .L11 KERNEL1(32 * 6) KERNEL2(32 * 6) KERNEL3(32 * 6) KERNEL4(32 * 6) KERNEL5(32 * 6) KERNEL6(32 * 6) KERNEL7(32 * 6) KERNEL8(32 * 6) cmpl $64 * 7, %eax NOBRANCH jle .L11 KERNEL1(32 * 7) KERNEL2(32 * 7) KERNEL3(32 * 7) KERNEL4(32 * 7) KERNEL5(32 * 7) KERNEL6(32 * 7) KERNEL7(32 * 7) KERNEL8(32 * 7) addl $64 * 4 * SIZE, AA addl $64 * 4 * SIZE, BB subl $64 * 8, %eax BRANCH jg .L1X .L11: leal (BB, %eax, 4), BB leal (AA, %eax, 4), AA .L12: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 .L13: movapd 2 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd 0 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm1 movapd 2 * SIZE(AA), %xmm0 addpd %xmm1, %xmm5 movapd 2 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movapd 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm1 movapd 4 * SIZE(AA), %xmm0 addpd %xmm1, %xmm7 addl $4 * SIZE, AA # aoffset += 8 addl $4 * SIZE, BB # boffset1 += 8 subl $1, %eax jg .L13 .L14: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA addl %eax, B leal (BB, %eax, 2), BB #endif movapd POSINV, %xmm1 SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 
defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm1, %xmm5 xorpd %xmm1, %xmm7 #else xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 #else addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm5 movapd 2 * SIZE(B), %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd 0 * SIZE(AA), %xmm5 movapd 2 * SIZE(AA), %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #endif #ifndef CONJ SHUFPD_1 %xmm1, %xmm1 #endif #ifdef LN movsd 6 * SIZE(AA), %xmm2 movhpd 6 * SIZE(AA), %xmm2 movsd 7 * SIZE(AA), %xmm3 movhpd 7 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm7 movsd 4 * SIZE(AA), %xmm2 movhpd 4 * SIZE(AA), %xmm2 movsd 5 * SIZE(AA), %xmm3 movhpd 5 * SIZE(AA), %xmm3 movapd %xmm7, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm6 subpd %xmm4, %xmm5 subpd %xmm6, %xmm5 movsd 0 * SIZE(AA), %xmm2 movhpd 0 * SIZE(AA), %xmm2 movsd 1 * SIZE(AA), %xmm3 movhpd 1 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm2 movhpd 0 * SIZE(AA), %xmm2 movsd 1 * SIZE(AA), %xmm3 movhpd 1 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 movsd 2 * SIZE(AA), %xmm2 movhpd 2 * SIZE(AA), %xmm2 movsd 3 * SIZE(AA), %xmm3 movhpd 3 * SIZE(AA), %xmm3 movapd %xmm5, %xmm4 pshufd $0x4e, %xmm5, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm6 subpd %xmm4, %xmm7 subpd %xmm6, %xmm7 movsd 6 * SIZE(AA), %xmm2 movhpd 6 * SIZE(AA), %xmm2 movsd 7 * SIZE(AA), %xmm3 movhpd 7 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm7 #endif #ifdef RN movsd 0 * SIZE(B), %xmm2 movhpd 0 * SIZE(B), %xmm2 movsd 1 * SIZE(B), %xmm3 movhpd 1 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm4, %xmm5 addpd %xmm6, %xmm7 #endif #ifdef RT movsd 0 * SIZE(B), %xmm2 movhpd 0 * SIZE(B), %xmm2 movsd 1 * SIZE(B), %xmm3 movhpd 1 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm4, %xmm5 addpd %xmm6, %xmm7 #endif #ifdef LN subl $4 * SIZE, CO1 #endif movsd %xmm5, 0 * SIZE(CO1) movhpd %xmm5, 1 * SIZE(CO1) movsd %xmm7, 2 * SIZE(CO1) movhpd %xmm7, 3 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm5, 0 * SIZE(B) movapd %xmm7, 2 * SIZE(B) movsd %xmm5, 0 * SIZE(BB) movsd %xmm5, 1 * SIZE(BB) movhpd %xmm5, 2 * SIZE(BB) movhpd %xmm5, 3 * SIZE(BB) movsd %xmm7, 4 * SIZE(BB) movsd %xmm7, 5 * SIZE(BB) movhpd %xmm7, 6 * SIZE(BB) movhpd %xmm7, 7 * SIZE(BB) #else movapd %xmm5, 0 * SIZE(AA) movapd %xmm7, 2 * SIZE(AA) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L10 .L50: movl M, %ebx testl $1, %ebx je 
.L99 #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, %ecx #if defined(LN) || defined(RT) movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB #endif movapd 0 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 movapd 0 * SIZE(AA), %xmm0 pxor %xmm5, %xmm5 movapd 8 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax # l = (k >> 2) jle .L52 .L51: mulpd %xmm0, %xmm1 movapd 2 * SIZE(BB), %xmm3 addpd %xmm1, %xmm4 movapd 16 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm3 movapd 2 * SIZE(AA), %xmm0 addpd %xmm3, %xmm5 movapd 4 * SIZE(BB), %xmm3 mulpd %xmm0, %xmm3 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 addpd %xmm0, %xmm5 movapd 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 10 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 addpd %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 movapd 12 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd 24 * SIZE(BB), %xmm2 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm0, %xmm5 movapd 8 * SIZE(AA), %xmm0 addl $ 8 * SIZE, AA # aoffset += 2 addl $16 * SIZE, BB # boffset1 += 4 decl %eax # l-- jg .L51 ALIGN_2 .L52: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax # l = (k & 3) jle .L54 ALIGN_2 .L53: movapd 0 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm4 movapd 2 * SIZE(BB), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm5 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA # aoffset += 2 addl $4 * SIZE, BB # boffset1 += 4 decl %eax # l-- jg .L53 .L54: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax addl %eax, AA addl %eax, B leal (BB, %eax, 2), BB #endif movapd POSINV, %xmm1 SHUFPD_1 %xmm5, %xmm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm1, %xmm5 #else xorpd %xmm1, %xmm4 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm5, %xmm4 #else addpd %xmm5, %xmm4 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm5 subpd %xmm4, %xmm5 #else movapd 0 * SIZE(AA), %xmm5 subpd %xmm4, %xmm5 #endif #ifndef CONJ SHUFPD_1 %xmm1, %xmm1 #endif #ifdef LN movsd 0 * SIZE(AA), %xmm2 movhpd 0 * SIZE(AA), %xmm2 movsd 1 * SIZE(AA), %xmm3 movhpd 1 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LT movsd 0 * SIZE(AA), %xmm2 movhpd 0 * SIZE(AA), %xmm2 movsd 1 * SIZE(AA), %xmm3 movhpd 1 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef RN movsd 0 * SIZE(B), %xmm2 movhpd 0 * SIZE(B), %xmm2 movsd 1 * SIZE(B), %xmm3 movhpd 1 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef RT movsd 0 * SIZE(B), %xmm2 movhpd 0 * SIZE(B), %xmm2 movsd 1 * SIZE(B), %xmm3 movhpd 1 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LN subl $2 * SIZE, CO1 #endif movsd %xmm5, 0 * SIZE(CO1) movhpd %xmm5, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm5, 0 * SIZE(B) movsd %xmm5, 0 * SIZE(BB) movsd %xmm5, 1 * SIZE(BB) 
movhpd %xmm5, 2 * SIZE(BB) movhpd %xmm5, 3 * SIZE(BB) #else movapd %xmm5, 0 * SIZE(AA) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L99: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif decl J # j -- jg .L01 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S000066400000000000000000001012471313527062700224060ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define ARG_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define ARG_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #ifdef ATOM #define PREFETCH prefetcht0 #define PREFETCHSIZE 84 #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHSIZE (16 * 2) #endif #define B %edi #define LDC %ebp #define AA %edx #define BB %ecx #define CO1 %esi #define ADD1 addps #define ADD2 addps PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC movl OFFSET, %eax #ifdef RN negl %eax #endif movl %eax, KK movl M, %ebx testl %ebx, %ebx jle .L999 subl $-32 * SIZE, A subl $-32 * SIZE, B sall $ZBASE_SHIFT, LDC #ifdef LN movl M, %eax sall $ZBASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $ZBASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax movl %eax, J sarl $1, J jle .L100 ALIGN_4 .L01: #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx jle .L30 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 #ifdef LN pxor %xmm4, %xmm4 prefetcht0 -4 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 -4 * SIZE(CO1, LDC) pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #else pxor %xmm4, %xmm4 prefetcht0 3 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 3 * SIZE(CO1, LDC) pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L11: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps 
%xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -8 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 subl $-32 * SIZE, BB pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 subl $-32 * SIZE, AA pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -32 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -32 * SIZE(AA), %xmm0 decl %eax jne .L11 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 ALIGN_4 .L13: ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L13 ALIGN_4 .L14: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB #endif ADD2 %xmm2, %xmm7 pcmpeqb %xmm0, %xmm0 ADD1 %xmm3, %xmm6 psllq $63, %xmm0 #ifndef CONJ pxor %xmm0, %xmm4 pxor %xmm0, %xmm6 shufps $0xb1, %xmm0, %xmm0 #else #if defined(LN) || defined(LT) pxor %xmm0, %xmm5 pxor %xmm0, %xmm7 #else pshufd $0xb1, %xmm0, %xmm1 pxor %xmm1, %xmm5 pxor %xmm1, %xmm7 #endif #endif haddps %xmm5, %xmm4 haddps %xmm7, %xmm6 shufps $0xd8, %xmm4, %xmm4 shufps $0xd8, %xmm6, %xmm6 movaps %xmm4, %xmm5 shufps $0xe4, %xmm6, %xmm4 shufps $0xe4, %xmm5, %xmm6 #if defined(LN) || defined(LT) movaps %xmm4, %xmm5 unpcklpd %xmm6, %xmm4 unpckhpd %xmm6, %xmm5 movaps -32 * SIZE(BB), %xmm2 movaps -28 * SIZE(BB), %xmm3 subps %xmm4, %xmm2 subps %xmm5, %xmm3 #else movaps -32 * SIZE(AA), %xmm1 movaps -28 * SIZE(AA), %xmm5 subps %xmm4, %xmm1 subps %xmm6, %xmm5 #endif #ifdef LN movaps -28 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, 
%xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm2 subps %xmm1, %xmm2 movaps -32 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm3 subps %xmm1, %xmm3 movaps -28 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm5 subps %xmm2, %xmm5 movaps -28 * SIZE(BB), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 #endif #ifdef RT movaps -28 * SIZE(BB), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm1 subps %xmm2, %xmm1 movaps -32 * SIZE(BB), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, -32 * SIZE(BB) movaps %xmm3, -28 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO1, LDC) movhps %xmm3, 2 * SIZE(CO1, LDC) #else movaps %xmm1, -32 * SIZE(AA) movaps %xmm5, -28 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm5, 0 * SIZE(CO1, LDC) movhps %xmm5, 2 * SIZE(CO1, LDC) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AORIG 
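/* Editorial note (added, hedged): end-of-iteration bookkeeping for the main
   2x2 block loop -- under RT the saved A origin (AORIG) is advanced here by
   2*K complex elements to the next panel, whereas under LN the matching
   adjustment is the subtraction at the top of .L10; KK was updated just
   above for the LN/LT cases. */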
#endif decl %ebx jg .L10 ALIGN_4 .L30: movl M, %ebx andl $1, %ebx jle .L99 #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $ZBASE_SHIFT, %eax addl %eax, AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB #endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L42 ALIGN_4 .L41: addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -30 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -26 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -12 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -22 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -8 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -18 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps 0 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -16 * SIZE(AA), %xmm0 subl $-16 * SIZE, AA subl $-32 * SIZE, BB decl %eax jne .L41 ALIGN_4 .L42: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L44 ALIGN_4 .L43: addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, 
%xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -30 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L43 ALIGN_4 .L44: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), BB #endif addps %xmm2, %xmm6 addps %xmm3, %xmm7 pshufd $0xb1, %xmm5, %xmm5 pcmpeqb %xmm0, %xmm0 pshufd $0xb1, %xmm7, %xmm7 psllq $63, %xmm0 #ifndef CONJ shufps $0xb1, %xmm0, %xmm0 pxor %xmm0, %xmm5 pxor %xmm0, %xmm7 #else #if defined(LN) || defined(LT) pxor %xmm0, %xmm4 pxor %xmm0, %xmm6 #else pxor %xmm0, %xmm5 pxor %xmm0, %xmm7 #endif #endif addps %xmm5, %xmm4 addps %xmm7, %xmm6 #if defined(LN) || defined(LT) unpcklpd %xmm6, %xmm4 movaps -32 * SIZE(BB), %xmm2 subps %xmm4, %xmm2 #else movsd -32 * SIZE(AA), %xmm1 movsd -30 * SIZE(AA), %xmm5 subps %xmm4, %xmm1 subps %xmm6, %xmm5 #endif #if defined(LN) || defined(LT) movaps -32 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm5 subps %xmm2, %xmm5 movaps -28 * SIZE(BB), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 #endif #ifdef RT movaps -28 * SIZE(BB), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm1 subps %xmm2, %xmm1 movaps -32 * SIZE(BB), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, -32 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO1, LDC) #else movlps %xmm1, -32 * SIZE(AA) movlps %xmm5, -30 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) movlps %xmm5, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L99: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax addl 
%eax, B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L01 ALIGN_4 .L100: movl N, %eax andl $1, %eax jle .L999 #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx jle .L130 ALIGN_4 .L110: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movsd -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 movhps -30 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 #ifdef LN prefetcht0 -4 * SIZE(CO1) #else prefetcht0 3 * SIZE(CO1) #endif pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L112 ALIGN_4 .L111: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -16 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -12 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -8 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -4 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps 0 * SIZE(AA), %xmm0 subl $-32 * SIZE, AA subl $-16 * SIZE, BB decl %eax jne .L111 ALIGN_4 .L112: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L114 ALIGN_4 .L113: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L113 ALIGN_4 .L114: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB #endif addps %xmm2, %xmm4 addps %xmm3, %xmm5 pshufd $0xb1, %xmm5, %xmm5 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #ifndef CONJ shufps $0xb1, %xmm0, %xmm0 pxor 
%xmm0, %xmm5 #else #if defined(LN) || defined(LT) pxor %xmm0, %xmm4 #else pxor %xmm0, %xmm5 #endif #endif addps %xmm5, %xmm4 #if defined(LN) || defined(LT) movaps %xmm4, %xmm5 unpcklpd %xmm6, %xmm4 unpckhpd %xmm6, %xmm5 movsd -32 * SIZE(BB), %xmm2 movsd -30 * SIZE(BB), %xmm3 subps %xmm4, %xmm2 subps %xmm5, %xmm3 #else movaps -32 * SIZE(AA), %xmm1 subps %xmm4, %xmm1 #endif #ifdef LN movaps -28 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm2 subps %xmm1, %xmm2 movaps -32 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm3 subps %xmm1, %xmm3 movaps -28 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 #endif #if defined(RN) || defined(RT) movaps -32 * SIZE(BB), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, -32 * SIZE(BB) movlps %xmm3, -30 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) #else movaps %xmm1, -32 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (BB, %eax, 1), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L110 ALIGN_4 .L130: movl M, %ebx andl $1, %ebx jle .L149 #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $ZBASE_SHIFT, %eax addl %eax, AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB #endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movsd -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L142 ALIGN_4 .L141: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd 
$0x55, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -30 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -26 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -26 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -22 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -22 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -18 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -18 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -16 * SIZE(AA), %xmm0 subl $-16 * SIZE, AA subl $-16 * SIZE, BB decl %eax jne .L141 ALIGN_4 .L142: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L144 ALIGN_4 .L143: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -30 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L143 ALIGN_4 .L144: #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), BB #endif addps %xmm2, %xmm4 addps %xmm3, %xmm5 pshufd $0xb1, %xmm5, %xmm5 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #ifndef CONJ shufps $0xb1, %xmm0, %xmm0 pxor %xmm0, %xmm5 #else #if defined(LN) || defined(LT) pxor %xmm0, %xmm4 #else pxor %xmm0, %xmm5 #endif #endif addps %xmm5, %xmm4 #if defined(LN) || defined(LT) movsd -32 * SIZE(BB), %xmm2 subps %xmm4, %xmm2 #else movsd -32 * SIZE(AA), %xmm1 subps %xmm4, %xmm1 #endif #if defined(LN) || defined(LT) movaps -32 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #if defined(RN) || defined(RT) movaps -32 * SIZE(BB), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, -32 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) #else movlps %xmm1, -32 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (BB, %eax, 1), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, 
AORIG #endif ALIGN_4 .L149: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/ztrsm_kernel_LT_2x2_sse.S000066400000000000000000001146121313527062700216650ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_A 24 + STACK + ARGS(%esi) #define STACK_B 28 + STACK + ARGS(%esi) #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) #define POSINV 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 48(%esp) #define KK 52(%esp) #define KKK 56(%esp) #define AORIG 60(%esp) #define BORIG 64(%esp) #define BUFFER 128(%esp) #define B %edi #define LDC %ebp #define AA %edx #define BB %ecx #define CO1 %esi #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch #define PREFETCHW prefetchw #endif #if defined(PENTIUM4) || defined(PENTIUMM) #define PREFETCH prefetcht1 #define PREFETCHSIZE 168 #define PREFETCHW prefetcht0 #endif #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht1 #define PREFETCHSIZE 168 #define PREFETCHW prefetcht0 #endif #if defined(OPTERON) || !defined(HAVE_SSE2) #define movsd movlps #endif #ifdef HAVE_SSE2 #define xorps pxor #endif #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ addps %xmm2, %xmm4; \ movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 72 * SIZE + (address) * 4 * 
SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp # align stack addl $STACK_OFFSET, %esp STACK_TOUCHING movl STACK_M, %ebx movl STACK_N, %eax movl STACK_K, %ecx movl STACK_A, %edx movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movl STACK_B, %edi movl STACK_C, %ebx movss STACK_OFFT, %xmm4 xorps %xmm7, %xmm7 pcmpeqb %xmm7, %xmm7 pslld $31, %xmm7 xorps %xmm2, %xmm2 #ifndef CONJ movss %xmm7, 0 + POSINV movss %xmm2, 4 + POSINV movss %xmm7, 8 + POSINV movss %xmm2, 12 + POSINV #else movss %xmm2, 0 + POSINV movss %xmm7, 4 + POSINV movss %xmm2, 8 + POSINV movss %xmm7, 12 + POSINV #endif EMMS movl %ebx, C movl STACK_LDC, LDC movss %xmm4, OFFSET movss %xmm4, KK sall $ZBASE_SHIFT, LDC #ifdef LN movl M, %eax sall $ZBASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $ZBASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax movl %eax, J sarl $1, J jle .L100 ALIGN_4 .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, %ecx #ifdef RT movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $1 + ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 4), BB #endif #if defined(LT) movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $1, %eax jle .L03 ALIGN_4 .L02: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) pshufd $0x00, %xmm7, %xmm4 pshufd 
$0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) addl $ 8 * SIZE, B addl $32 * SIZE, BB decl %eax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $1, %eax BRANCH jle .L05 ALIGN_4 .L04: movaps 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) addl $ 4 * SIZE, B ALIGN_4 .L05: #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif movl M, %ebx sarl $1, %ebx jle .L30 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB # boffset1 = boffset #if defined(LN) || defined(RT) movl KK, %eax sall $3 + ZBASE_SHIFT, %eax addl %eax, BB #endif movaps 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movaps 16 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movaps 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 PREFETCHW 3 * SIZE(CO1) PREFETCHW 3 * SIZE(CO1, LDC) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L11: KERNEL1(0 * 16) KERNEL2(0 * 16) KERNEL3(0 * 16) KERNEL4(0 * 16) KERNEL5(0 * 16) KERNEL6(0 * 16) KERNEL7(0 * 16) KERNEL8(0 * 16) addl $ 32 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L11 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 ALIGN_4 .L13: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 4 * SIZE(AA), %xmm0 addl $ 4 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L13 ALIGN_4 .L14: movaps POSINV, %xmm0 shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm5 xorps %xmm0, %xmm7 #else xorps %xmm0, %xmm4 xorps %xmm0, %xmm6 #endif #else xorps %xmm0, %xmm5 xorps %xmm0, %xmm7 #endif addps %xmm5, %xmm4 addps %xmm7, %xmm6 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm5 unpcklpd %xmm6, %xmm4 unpckhpd %xmm6, %xmm5 movaps 0 * SIZE(B), %xmm2 movaps 4 * SIZE(B), %xmm3 subps %xmm4, %xmm2 subps %xmm5, %xmm3 #else movaps 0 * SIZE(AA), %xmm1 movaps 4 * SIZE(AA), %xmm5 subps %xmm4, %xmm1 subps %xmm6, %xmm5 #endif #ifdef LN movaps 4 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps 
%xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm2 subps %xmm1, %xmm2 movaps 0 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm3 subps %xmm1, %xmm3 movaps 4 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 #endif #ifdef RN movaps 0 * SIZE(B), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm5 subps %xmm2, %xmm5 movaps 4 * SIZE(B), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 #endif #ifdef RT movaps 4 * SIZE(B), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm1 subps %xmm2, %xmm1 movaps 0 * SIZE(B), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, 0 * SIZE(B) movaps %xmm3, 4 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm5 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm5, 12 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm4 pshufd $0xff, %xmm3, %xmm5 movaps %xmm0, 16 * SIZE(BB) movaps %xmm1, 20 * SIZE(BB) movaps %xmm4, 24 * SIZE(BB) movaps %xmm5, 28 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO1, LDC) movhps %xmm3, 2 * SIZE(CO1, LDC) #else movaps %xmm1, 0 * SIZE(AA) movaps %xmm5, 4 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm5, 0 * SIZE(CO1, LDC) movhps %xmm5, 2 * SIZE(CO1, LDC) #endif #ifndef LN addl $4 * SIZE, 
CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx jg .L10 ALIGN_4 .L30: movl M, %ebx andl $1, %ebx jle .L99 ALIGN_4 .L40: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB # boffset1 = boffset #if defined(LN) || defined(RT) movl KK, %eax sall $3 + ZBASE_SHIFT, %eax addl %eax, BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L42 ALIGN_4 .L41: mulps %xmm0, %xmm2 prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movsd 2 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 mulps 28 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 48 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movsd 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 44 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 64 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movsd 6 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 mulps 60 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 80 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 movaps 68 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movaps 72 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 mulps 76 * SIZE(BB), %xmm1 addps %xmm2, %xmm6 movaps 96 * SIZE(BB), %xmm2 addps %xmm1, %xmm7 movsd 10 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 84 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 88 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 mulps 92 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 112 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movsd 12 * SIZE(AA), %xmm1 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 100 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movaps 104 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 mulps 108 * SIZE(BB), %xmm1 addps %xmm2, %xmm6 movaps 128 * SIZE(BB), %xmm2 addps %xmm1, %xmm7 movsd 14 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 116 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 120 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 mulps 124 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 144 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movsd 24 * SIZE(AA), %xmm1 addl $ 16 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L41 ALIGN_4 .L42: #if defined(LT) || 
defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L44 ALIGN_4 .L43: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movsd 2 * SIZE(AA), %xmm0 addl $ 2 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L43 ALIGN_4 .L44: movaps POSINV, %xmm0 shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm5 xorps %xmm0, %xmm7 #else xorps %xmm0, %xmm4 xorps %xmm0, %xmm6 #endif #else xorps %xmm0, %xmm5 xorps %xmm0, %xmm7 #endif addps %xmm5, %xmm4 addps %xmm7, %xmm6 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) unpcklpd %xmm6, %xmm4 movaps 0 * SIZE(B), %xmm2 subps %xmm4, %xmm2 #else #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(AA), %xmm1 #ifdef movsd xorps %xmm5, %xmm5 #endif movsd 2 * SIZE(AA), %xmm5 subps %xmm4, %xmm1 subps %xmm6, %xmm5 #endif #if defined(LN) || defined(LT) movaps 0 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #ifdef RN movaps 0 * SIZE(B), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm5 subps %xmm2, %xmm5 movaps 4 * SIZE(B), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 #endif #ifdef RT movaps 4 * SIZE(B), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm1 subps %xmm2, %xmm1 movaps 0 * SIZE(B), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, 0 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm5 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm5, 12 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO1, LDC) #else movlps %xmm1, 0 * SIZE(AA) movlps %xmm5, 2 * SIZE(AA) 
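/* RN/RT tail of .L44: the solved 1x2 complex tile is written back to the packed A buffer (AA) above, then copied out to the two C columns at CO1 and CO1 + LDC below. */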
movlps %xmm1, 0 * SIZE(CO1) movlps %xmm5, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L99: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L01 ALIGN_4 .L100: movl N, %eax andl $1, %eax jle .L999 ALIGN_4 .L101: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, %ecx #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 4), BB #endif #if defined(LT) movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L103 ALIGN_4 .L102: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) addl $ 8 * SIZE, B addl $32 * SIZE, BB decl %eax jne .L102 ALIGN_4 .L103: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L105 ALIGN_4 .L104: #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) addl $ 2 * SIZE, %edi addl $ 8 * SIZE, %ecx decl %eax jne .L104 ALIGN_4 .L105: #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif movl M, %ebx sarl $1, %ebx jle .L130 ALIGN_4 .L110: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB # boffset1 = boffset #if defined(LN) || defined(RT) movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movaps 0 * SIZE(AA), %xmm0 movaps 16 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 PREFETCHW 3 * SIZE(CO1) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L112 ALIGN_4 .L111: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 8 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movaps 12 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * 
SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movaps 32 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movaps 20 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movaps 24 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 28 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 48 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 addl $ 32 * SIZE, AA addl $ 64 * SIZE, BB decl %eax jne .L111 ALIGN_4 .L112: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L114 ALIGN_4 .L113: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 addl $ 4 * SIZE, AA addl $ 8 * SIZE, BB decl %eax jg .L113 ALIGN_4 .L114: addps %xmm6, %xmm4 addps %xmm7, %xmm5 movaps POSINV, %xmm0 shufps $0xb1, %xmm5, %xmm5 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm4 #endif #else xorps %xmm0, %xmm5 #endif addps %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm5 unpcklpd %xmm6, %xmm4 unpckhpd %xmm6, %xmm5 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 0 * SIZE(B), %xmm2 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 2 * SIZE(B), %xmm3 subps %xmm4, %xmm2 subps %xmm5, %xmm3 #else movaps 0 * SIZE(AA), %xmm1 subps %xmm4, %xmm1 #endif #ifdef LN movaps 4 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm2 subps %xmm1, %xmm2 movaps 0 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm3 subps %xmm1, %xmm3 movaps 4 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 #endif #if defined(RN) || 
defined(RT) movaps 0 * SIZE(B), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) movlps %xmm3, 2 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 8 * SIZE(BB) movaps %xmm1, 12 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) #else movaps %xmm1, 0 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L110 ALIGN_4 .L130: movl M, %ebx andl $1, %ebx jle .L149 #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB # boffset1 = boffset #if defined(LN) || defined(RT) movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB #endif #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movaps 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L142 ALIGN_4 .L141: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 addl $ 16 * SIZE, AA addl $ 64 * SIZE, BB decl %eax jne .L141 ALIGN_4 .L142: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L144 ALIGN_4 .L143: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movsd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $8 * SIZE, BB decl 
%eax jg .L143 ALIGN_4 .L144: addps %xmm6, %xmm4 addps %xmm7, %xmm5 movaps POSINV, %xmm0 shufps $0xb1, %xmm5, %xmm5 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm4 #endif #else xorps %xmm0, %xmm5 #endif addps %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax addl %eax, AA addl %eax, B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 0 * SIZE(B), %xmm2 subps %xmm4, %xmm2 #else #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(AA), %xmm1 subps %xmm4, %xmm1 #endif #if defined(LN) || defined(LT) movaps 0 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #if defined(RN) || defined(RT) movaps 0 * SIZE(B), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) #else movlps %xmm1, 0 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L149: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L999: EMMS movl OLD_STACK, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/ztrsm_kernel_LT_4x1_sse.S000066400000000000000000001005601313527062700216630ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if !defined(HAVE_SSE) || !defined(HAVE_MMX) #error You have to check your configuration. #endif #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_A 24 + STACK + ARGS(%esi) #define STACK_B 28 + STACK + ARGS(%esi) #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) #define POSINV 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 48(%esp) #define KK 52(%esp) #define KKK 56(%esp) #define AORIG 60(%esp) #define BORIG 64(%esp) #define BUFFER 128(%esp) #define B %edi #define LDC %ebp #define AA %edx #define BB %ecx #define CO1 %esi #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #if !defined(HAVE_SSE2) || defined(OPTERON) #define movsd movlps #endif #ifdef HAVE_SSE2 #define xorps pxor #endif PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp # align stack addl $STACK_OFFSET, %esp STACK_TOUCHING movl STACK_M, %ebx movl STACK_N, %eax movl STACK_K, %ecx movl STACK_A, %edx movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movl STACK_B, %edi movl STACK_C, %ebx movss STACK_OFFT, %xmm4 #ifndef CONJ movl $0x80000000, 0 + POSINV movl $0x00000000, 4 + POSINV movl $0x80000000, 8 + POSINV movl $0x00000000, 12 + POSINV #else movl $0x00000000, 0 + POSINV movl $0x80000000, 4 + POSINV movl $0x00000000, 8 + POSINV movl $0x80000000, 12 + POSINV #endif movl %ebx, C movl STACK_LDC, LDC movss %xmm4, OFFSET movss %xmm4, KK sall $ZBASE_SHIFT, LDC #ifdef LN movl M, %eax sall $ZBASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $ZBASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax movl %eax, J # j = n testl %eax, %eax jle .L999 .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 4), BB #endif #if defined(LT) movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L03 .L02: 
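/* .L02: B-packing loop - each pass reads four complex (eight scalar) B values, broadcasts every scalar across an XMM register with shufps $0, and stores the widened copies into the aligned BB buffer before advancing B by 8*SIZE and BB by 32*SIZE. */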
movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 movss 2 * SIZE(B), %xmm2 movss 3 * SIZE(B), %xmm3 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) movss 4 * SIZE(B), %xmm0 movss 5 * SIZE(B), %xmm1 movss 6 * SIZE(B), %xmm2 movss 7 * SIZE(B), %xmm3 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 movaps %xmm0, 16 * SIZE(BB) movaps %xmm1, 20 * SIZE(BB) movaps %xmm2, 24 * SIZE(BB) movaps %xmm3, 28 * SIZE(BB) prefetcht0 104 * SIZE(B) addl $ 8 * SIZE, B addl $32 * SIZE, BB decl %eax jne .L02 .L03: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L05 .L04: movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) addl $2 * SIZE, B addl $8 * SIZE, BB decl %eax jne .L04 ALIGN_4 .L05: #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif movl M, %ebx sarl $2, %ebx jle .L50 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $2 + ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax prefetcht0 8 * SIZE(CO1) je .L12 ALIGN_4 #define PREFETCHSIZE 48 .L11: #ifdef CORE_KATMAI prefetcht0 PREFETCHSIZE * SIZE(AA) #endif mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 0 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 16 * SIZE(AA), %xmm0 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulps %xmm1, %xmm3 mulps 12 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 8 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 12 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 12 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 24 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 24 * SIZE(AA), %xmm1 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm0, %xmm2 mulps 20 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 20 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 20 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 24) * SIZE(AA) #endif mulps %xmm1, %xmm3 mulps 28 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 24 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 28 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 40 * SIZE(AA), %xmm1 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 32) * SIZE(AA) #endif mulps %xmm0, %xmm2 mulps 36 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 36 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 36 * 
SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 48 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 48 * SIZE(AA), %xmm0 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 40) * SIZE(AA) #endif mulps %xmm1, %xmm3 mulps 44 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 44 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 44 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 56 * SIZE(AA), %xmm1 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 48) * SIZE(AA) #endif mulps %xmm0, %xmm2 mulps 52 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 48 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 52 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 52 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 64 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 64 * SIZE(AA), %xmm0 #ifdef CORE_KATMAI prefetcht0 (PREFETCHSIZE + 56) * SIZE(AA) #endif mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm4 movaps 56 * SIZE(BB), %xmm3 addps %xmm1, %xmm5 movaps 60 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 72 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 72 * SIZE(AA), %xmm1 addl $64 * SIZE, BB addl $64 * SIZE, AA decl %eax jne .L11 .L12: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 .L13: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 0 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 8 * SIZE(AA), %xmm0 addl $8 * SIZE, AA # aoffset += 8 addl $8 * SIZE, BB # boffset1 += 8 decl %eax jg .L13 .L14: movaps POSINV, %xmm0 shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm5 xorps %xmm0, %xmm7 #else xorps %xmm0, %xmm4 xorps %xmm0, %xmm6 #endif #else xorps %xmm0, %xmm5 xorps %xmm0, %xmm7 #endif addps %xmm5, %xmm4 addps %xmm7, %xmm6 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $4, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax leal (AA, %eax, 4), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm5 movhps 2 * SIZE(B), %xmm5 movsd 4 * SIZE(B), %xmm7 movhps 6 * SIZE(B), %xmm7 #else movaps 0 * SIZE(AA), %xmm5 movaps 4 * SIZE(AA), %xmm7 #endif subps %xmm4, %xmm5 subps %xmm6, %xmm7 #if defined(LN) || defined(LT) movhlps %xmm5, %xmm4 movhlps %xmm7, %xmm6 #endif #ifdef LN #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 30 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm6, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm6, %xmm6 #ifndef CONJ xorps POSINV, %xmm6 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm6 addps %xmm3, %xmm6 movsd 28 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm6, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm6, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm7 subps %xmm3, %xmm7 movsd 26 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm6, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm6, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, 
%xmm3 subps %xmm2, %xmm4 subps %xmm3, %xmm4 movsd 24 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm6, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm6, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm5 subps %xmm3, %xmm5 movsd 20 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm7, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps POSINV, %xmm7 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm7 addps %xmm3, %xmm7 movsd 18 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm7, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm7, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm4 subps %xmm3, %xmm4 movsd 16 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm7, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm7, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm5 subps %xmm3, %xmm5 movsd 10 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm4, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm4, %xmm4 #ifndef CONJ xorps POSINV, %xmm4 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm4 addps %xmm3, %xmm4 movsd 8 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm4, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm4, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm5 subps %xmm3, %xmm5 movsd 0 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm5, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps POSINV, %xmm5 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm5 addps %xmm3, %xmm5 #endif #ifdef LT #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm5, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps POSINV, %xmm5 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm5 addps %xmm3, %xmm5 movsd 2 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm5, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm5, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm4 subps %xmm3, %xmm4 movsd 4 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm5, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm5, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm7 subps %xmm3, %xmm7 movsd 6 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm5, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm5, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps 
POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm6 subps %xmm3, %xmm6 movsd 10 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm4, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm4, %xmm4 #ifndef CONJ xorps POSINV, %xmm4 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm4 addps %xmm3, %xmm4 movsd 12 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm4, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm4, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm7 subps %xmm3, %xmm7 movsd 14 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm4, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm4, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm6 subps %xmm3, %xmm6 movsd 20 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm7, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps POSINV, %xmm7 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm7 addps %xmm3, %xmm7 movsd 22 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm7, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm7, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm6 subps %xmm3, %xmm6 movsd 30 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm6, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm6, %xmm6 #ifndef CONJ xorps POSINV, %xmm6 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm6 addps %xmm3, %xmm6 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(B), %xmm1 movhps 2 * SIZE(B), %xmm1 #ifdef HAVE_SSE2 pshufd $0x44, %xmm1, %xmm2 pshufd $0x11, %xmm1, %xmm3 pshufd $0xa0, %xmm5, %xmm4 pshufd $0xf5, %xmm5, %xmm5 pshufd $0xa0, %xmm7, %xmm6 pshufd $0xf5, %xmm7, %xmm7 #else movaps %xmm1, %xmm2 shufps $0x44, %xmm2, %xmm2 movaps %xmm1, %xmm3 shufps $0x11, %xmm3, %xmm3 movaps %xmm5, %xmm4 shufps $0xa0, %xmm4, %xmm4 shufps $0xf5, %xmm5, %xmm5 movaps %xmm7, %xmm6 shufps $0xa0, %xmm6, %xmm6 shufps $0xf5, %xmm7, %xmm7 #endif #ifndef CONJ xorps %xmm0, %xmm5 xorps %xmm0, %xmm7 #else xorps %xmm0, %xmm4 xorps %xmm0, %xmm6 #endif mulps %xmm2, %xmm4 mulps %xmm3, %xmm5 mulps %xmm2, %xmm6 mulps %xmm3, %xmm7 addps %xmm4, %xmm5 addps %xmm6, %xmm7 #endif #ifdef LN subl $8 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlhps %xmm4, %xmm5 movlhps %xmm6, %xmm7 movlps %xmm5, 0 * SIZE(B) movhps %xmm5, 2 * SIZE(B) movlps %xmm7, 4 * SIZE(B) movhps %xmm7, 6 * SIZE(B) #ifdef HAVE_SSE2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xaa, %xmm5, %xmm2 pshufd $0xff, %xmm5, %xmm3 #else movaps %xmm5, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm5, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm5, %xmm2 shufps $0xaa, %xmm2, %xmm2 movaps %xmm5, %xmm3 shufps $0xff, %xmm3, %xmm3 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) #ifdef HAVE_SSE2 pshufd $0x00, %xmm7, %xmm0 pshufd $0x55, %xmm7, %xmm1 pshufd $0xaa, %xmm7, %xmm2 pshufd $0xff, %xmm7, %xmm3 #else movaps %xmm7, %xmm0 shufps 
$0x00, %xmm0, %xmm0 movaps %xmm7, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm7, %xmm2 shufps $0xaa, %xmm2, %xmm2 movaps %xmm7, %xmm3 shufps $0xff, %xmm3, %xmm3 #endif movaps %xmm0, 16 * SIZE(BB) movaps %xmm1, 20 * SIZE(BB) movaps %xmm2, 24 * SIZE(BB) movaps %xmm3, 28 * SIZE(BB) #else movaps %xmm5, 0 * SIZE(AA) movaps %xmm7, 4 * SIZE(AA) #endif movlps %xmm5, 0 * SIZE(CO1) movhps %xmm5, 2 * SIZE(CO1) movlps %xmm7, 4 * SIZE(CO1) movhps %xmm7, 6 * SIZE(CO1) #ifndef LN addl $8 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $4, KK movl BORIG, B #endif #ifdef LT addl $4, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $2 + ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L10 ALIGN_2 .L50: movl M, %ebx testl $2, %ebx jle .L70 #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 movaps 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L52 ALIGN_4 .L51: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 12 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 24 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movaps 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 mulps 20 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 32 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 12 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 28 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 40 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 24 * SIZE(AA), %xmm1 mulps %xmm0, %xmm2 mulps 36 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 48 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 20 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 mulps 44 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 56 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 mulps 52 * SIZE(BB), %xmm1 addps %xmm2, %xmm4 movaps 64 * SIZE(BB), %xmm2 addps %xmm1, %xmm5 movaps 28 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 mulps 60 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 72 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movaps 40 * SIZE(AA), %xmm1 addl $32 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L51 ALIGN_4 .L52: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L54 .L53: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 addl $4 * SIZE, AA # aoffset += 8 addl $8 * SIZE, BB # boffset1 += 8 decl %eax jg .L53 .L54: addps %xmm6, %xmm4 addps %xmm7, %xmm5 movaps POSINV, %xmm0 shufps $0xb1, %xmm5, %xmm5 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm4 #endif #else xorps %xmm0, %xmm5 #endif addps %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movsd 0 
* SIZE(B), %xmm5 movhps 2 * SIZE(B), %xmm5 #else movaps 0 * SIZE(AA), %xmm5 #endif subps %xmm4, %xmm5 #if defined(LN) || defined(LT) movhlps %xmm5, %xmm4 #endif #ifdef LN #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 6 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm4, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm4, %xmm4 #ifndef CONJ xorps POSINV, %xmm4 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm4 addps %xmm3, %xmm4 movsd 4 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm4, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm4, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm5 subps %xmm3, %xmm5 movsd 0 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm5, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps POSINV, %xmm5 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm5 addps %xmm3, %xmm5 #endif #ifdef LT #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm5, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps POSINV, %xmm5 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm5 addps %xmm3, %xmm5 movsd 2 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm5, %xmm2 shufps $0xa0, %xmm2, %xmm2 movaps %xmm5, %xmm3 shufps $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps POSINV, %xmm3 #else xorps POSINV, %xmm2 #endif mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subps %xmm2, %xmm4 subps %xmm3, %xmm4 movsd 6 * SIZE(AA), %xmm1 movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm4, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm4, %xmm4 #ifndef CONJ xorps POSINV, %xmm4 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm4 addps %xmm3, %xmm4 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(B), %xmm1 movhps 2 * SIZE(B), %xmm1 movaps %xmm1, %xmm2 shufps $0x44, %xmm2, %xmm2 movaps %xmm1, %xmm3 shufps $0x11, %xmm2, %xmm3 movaps %xmm5, %xmm4 shufps $0xa0, %xmm4, %xmm4 shufps $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm4 #endif mulps %xmm2, %xmm4 mulps %xmm3, %xmm5 addps %xmm4, %xmm5 #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlhps %xmm4, %xmm5 movlps %xmm5, 0 * SIZE(B) movhps %xmm5, 2 * SIZE(B) #ifdef HAVE_SSE2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xaa, %xmm5, %xmm2 pshufd $0xff, %xmm5, %xmm3 #else movaps %xmm5, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm5, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm5, %xmm2 shufps $0xaa, %xmm2, %xmm2 movaps %xmm5, %xmm3 shufps $0xff, %xmm3, %xmm3 #endif movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) #else movaps %xmm5, 0 * SIZE(AA) #endif movlps %xmm5, 0 * SIZE(CO1) movhps %xmm5, 2 * SIZE(CO1) #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + ZBASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L70: movl M, %ebx 
testl $1, %ebx jle .L99 #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB #endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm5, %xmm5 movaps 8 * SIZE(BB), %xmm3 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L72 ALIGN_4 .L71: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 16 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 12 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 4 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 20 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 6 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 40 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 48 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 44 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 12 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 52 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 14 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 72 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $64 * SIZE, BB decl %eax jne .L71 ALIGN_2 .L72: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax je .L74 .L73: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 addl $2 * SIZE, AA # aoffset += 8 addl $8 * SIZE, BB # boffset1 += 8 decl %eax jg .L73 .L74: movaps POSINV, %xmm0 shufps $0xb1, %xmm5, %xmm5 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm4 #endif #else xorps %xmm0, %xmm5 #endif addps %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #ifdef movsd xorps %xmm5, %xmm5 #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm5 #else movsd 0 * SIZE(AA), %xmm5 #endif subps %xmm4, %xmm5 #ifdef movsd xorps %xmm1, %xmm1 #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(AA), %xmm1 #else movsd 0 * SIZE(B), %xmm1 #endif movaps %xmm1, %xmm0 shufps $0x44, %xmm0, %xmm0 shufps $0x11, %xmm1, %xmm1 movaps %xmm5, %xmm3 shufps $0xa0, %xmm3, %xmm3 shufps $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps POSINV, %xmm5 #else xorps POSINV, %xmm3 #endif mulps %xmm0, %xmm3 mulps %xmm1, %xmm5 addps %xmm3, %xmm5 #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm5, 0 * SIZE(B) movaps %xmm5, %xmm0 shufps $0x00, %xmm0, %xmm0 movaps %xmm5, %xmm1 shufps $0x55, %xmm1, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 
4 * SIZE(BB) #else movlps %xmm5, 0 * SIZE(AA) #endif movlps %xmm5, 0 * SIZE(CO1) #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_2 .L99: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif decl J # j -- jg .L01 ALIGN_2 .L999: movl OLD_STACK, %esp EMMS popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S000066400000000000000000000412711313527062700224130ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 24 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define ARG_B 36 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define ARG_LDC 44 + STACK + ARGS(%esp) #define OFFSET 48 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 2) #endif #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define CO1 %esi #define ADD1 addpd #define ADD2 addpd PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC movl OFFSET, %eax #ifdef RN negl %eax #endif movl %eax, KK movl M, %ebx testl %ebx, %ebx jle .L999 subl $-16 * SIZE, A subl $-16 * SIZE, B sall $ZBASE_SHIFT, LDC #ifdef LN movl M, %eax sall $ZBASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $ZBASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax testl $1, %eax jle .L100 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx ALIGN_4 L110: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $ZBASE_SHIFT, %eax addl %eax, AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -16 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 #ifdef LN prefetcht0 -2 * SIZE(CO1) #else prefetcht0 1 * SIZE(CO1) #endif pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je L115 ALIGN_4 L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm4 movaps -14 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm6 movaps -12 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm4 movaps -10 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm6 movaps -8 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm7 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm4 movaps -6 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd 
%xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm6 movaps -4 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm4 movaps -2 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm6 movaps 0 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm7 subl $-16 * SIZE, AA subl $-16 * SIZE, BB subl $1, %eax jne L112 ALIGN_4 L115: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je L118 ALIGN_4 L116: pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 ADD1 %xmm1, %xmm4 movaps -14 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg L116 ALIGN_4 L118: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), BB #endif addpd %xmm6, %xmm4 pcmpeqb %xmm1, %xmm1 addpd %xmm7, %xmm5 psllq $63, %xmm1 #ifndef CONJ pshufd $0x40, %xmm1, %xmm0 shufps $0x04, %xmm1, %xmm1 pxor %xmm0, %xmm4 #else #if defined(LN) || defined(LT) pshufd $0x40, %xmm1, %xmm0 #else pshufd $0x04, %xmm1, %xmm0 #endif shufps $0x40, %xmm1, %xmm1 pxor %xmm0, %xmm5 #endif haddpd %xmm5, %xmm4 #if defined(LN) || defined(LT) movapd -16 * SIZE(BB), %xmm5 subpd %xmm4, %xmm5 #else movapd -16 * SIZE(AA), %xmm5 subpd %xmm4, %xmm5 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AA), %xmm2 movddup -15 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #if defined(RN) || defined(RT) movddup -16 * SIZE(BB), %xmm2 movddup -15 * SIZE(BB), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LN subl $2 * SIZE, CO1 #endif movlpd %xmm5, 0 * SIZE(CO1) movhpd %xmm5, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm5, -16 * SIZE(BB) #else movapd %xmm5, -16 * SIZE(AA) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA addl %eax, BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg L110 #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L100: movl N, %eax sarl $1, %eax movl %eax, J # j = n jle .L999 ALIGN_4 .L01: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx ALIGN_4 .L10: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $ZBASE_SHIFT, %eax addl %eax, AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB #endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -16 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 #ifdef LN pxor %xmm4, %xmm4 prefetcht0 -2 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 -2 * 
SIZE(CO1, LDC) #else pxor %xmm4, %xmm4 prefetcht0 1 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 1 * SIZE(CO1, LDC) #endif pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) ADD1 %xmm3, %xmm6 movaps -14 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 ADD1 %xmm3, %xmm6 movaps -10 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps -8 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 ADD1 %xmm3, %xmm6 movaps -6 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps -4 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AA), %xmm0 ADD1 %xmm3, %xmm6 movaps -2 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps 0 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) ADD1 %xmm3, %xmm6 movaps 2 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps 4 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -6 * SIZE(AA), %xmm0 ADD1 %xmm3, %xmm6 movaps 6 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps 8 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 ADD1 %xmm3, %xmm6 movaps 10 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps 12 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -2 * SIZE(AA), %xmm0 ADD1 %xmm3, %xmm6 movaps 14 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps 16 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 subl $-32 * SIZE, BB mulpd %xmm0, %xmm2 movaps 0 * SIZE(AA), %xmm0 subl $-16 * SIZE, AA subl $1, %eax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: ADD1 %xmm3, %xmm6 movaps -14 * SIZE(BB), %xmm3 ADD2 %xmm2, %xmm7 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 ADD2 %xmm2, %xmm5 pshufd $0x4e, %xmm3, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), BB #endif ADD1 %xmm3, %xmm6 pcmpeqb %xmm1, %xmm1 ADD2 %xmm2, %xmm7 psllq $63, %xmm1 #ifndef CONJ pshufd $0x40, %xmm1, %xmm0 shufps $0x04, %xmm1, %xmm1 pxor 
%xmm0, %xmm4 pxor %xmm0, %xmm6 #else #if defined(LN) || defined(LT) pshufd $0x40, %xmm1, %xmm0 #else pshufd $0x04, %xmm1, %xmm0 #endif shufps $0x40, %xmm1, %xmm1 pxor %xmm0, %xmm5 pxor %xmm0, %xmm7 #endif haddpd %xmm5, %xmm4 haddpd %xmm7, %xmm6 #if defined(LN) || defined(LT) movapd -16 * SIZE(BB), %xmm5 movapd -14 * SIZE(BB), %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd -16 * SIZE(AA), %xmm5 movapd -14 * SIZE(AA), %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AA), %xmm2 movddup -15 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm4, %xmm5 addpd %xmm6, %xmm7 #endif #ifdef RN movddup -16 * SIZE(BB), %xmm2 movddup -15 * SIZE(BB), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 movddup -14 * SIZE(BB), %xmm2 movddup -13 * SIZE(BB), %xmm3 movapd %xmm5, %xmm4 pshufd $0x4e, %xmm5, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm6 subpd %xmm4, %xmm7 subpd %xmm6, %xmm7 movddup -10 * SIZE(BB), %xmm2 movddup -9 * SIZE(BB), %xmm3 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm7 #endif #ifdef RT movddup -10 * SIZE(BB), %xmm2 movddup -9 * SIZE(BB), %xmm3 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm7 movddup -12 * SIZE(BB), %xmm2 movddup -11 * SIZE(BB), %xmm3 movapd %xmm7, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm6 subpd %xmm4, %xmm5 subpd %xmm6, %xmm5 movddup -16 * SIZE(BB), %xmm2 movddup -15 * SIZE(BB), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LN subl $2 * SIZE, CO1 #endif movlpd %xmm5, 0 * SIZE(CO1) movhpd %xmm5, 1 * SIZE(CO1) movlpd %xmm7, 0 * SIZE(CO1, LDC) movhpd %xmm7, 1 * SIZE(CO1, LDC) #if defined(LN) || defined(LT) movapd %xmm5, -16 * SIZE(BB) movapd %xmm7, -14 * SIZE(BB) #else movapd %xmm5, -16 * SIZE(AA) movapd %xmm7, -14 * SIZE(AA) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L10 ALIGN_4 .L99: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L01 ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/ztrsm_kernel_RT_1x2_sse2.S000066400000000000000000000616731313527062700217640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_ALPHA_R 16 + STACK + ARGS(%esi) #define STACK_ALPHA_I 24 + STACK + ARGS(%esi) #define STACK_A 32 + STACK + ARGS(%esi) #define STACK_B 36 + STACK + ARGS(%esi) #define STACK_C 40 + STACK + ARGS(%esi) #define STACK_LDC 44 + STACK + ARGS(%esi) #define STACK_OFFT 48 + STACK + ARGS(%esi) #define POSINV 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 44(%esp) #define KK 48(%esp) #define KKK 52(%esp) #define AORIG 56(%esp) #define BORIG 60(%esp) #define BUFFER 128(%esp) #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #if defined(OPTERON) || defined(BARCELONA) #define PREFETCH prefetch #else #define PREFETCH prefetcht0 #endif #define PREFETCHSIZE (8 * 10 + 4) #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define CO1 %esi #ifndef CONJ #define NN #else #if defined(LN) || defined(LT) #define CN #else #define NC #endif #endif #define KERNEL1(address) \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm4; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm3, %xmm6; \ movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm0, %xmm7; \ movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ 
mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm2, %xmm6; \ movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm0, %xmm7; \ movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addpd %xmm3, %xmm6; \ movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm0, %xmm7; \ movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm2, %xmm6; \ movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm1, %xmm7; \ movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm5; \ movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm2, %xmm6; \ movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addpd %xmm1, %xmm7; \ movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm5; \ movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addpd %xmm3, %xmm6; \ movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addpd %xmm1, %xmm7; \ movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp # align stack addl $STACK_OFFSET, %esp STACK_TOUCHING movl STACK_M, %ebx movl STACK_N, %eax movl STACK_K, %ecx movl STACK_A, %edx movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movl STACK_B, B movl STACK_C, %ebx movss STACK_OFFT, %xmm4 pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 # Generate mask pxor %xmm2, %xmm2 movlpd %xmm2, 0 + POSINV movlpd %xmm7, 8 + POSINV movl %ebx, C movl STACK_LDC, LDC movss %xmm4, OFFSET movss %xmm4, KK sall $ZBASE_SHIFT, LDC #ifdef LN movl M, %eax sall $ZBASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $ZBASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull 
LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax andl $1, %eax jle .L100 ALIGN_4 .L101: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 2), BB #endif #if defined(LT) movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L103 ALIGN_4 .L102: prefetchnta 56 * SIZE(B) movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd 2 * SIZE(B), %xmm2 movlpd 3 * SIZE(B), %xmm3 movlpd 4 * SIZE(B), %xmm4 movlpd 5 * SIZE(B), %xmm5 movlpd 6 * SIZE(B), %xmm6 movlpd 7 * SIZE(B), %xmm7 movlpd %xmm0, 0 * SIZE(BB) movlpd %xmm0, 1 * SIZE(BB) movlpd %xmm1, 2 * SIZE(BB) movlpd %xmm1, 3 * SIZE(BB) movlpd %xmm2, 4 * SIZE(BB) movlpd %xmm2, 5 * SIZE(BB) movlpd %xmm3, 6 * SIZE(BB) movlpd %xmm3, 7 * SIZE(BB) movlpd %xmm4, 8 * SIZE(BB) movlpd %xmm4, 9 * SIZE(BB) movlpd %xmm5, 10 * SIZE(BB) movlpd %xmm5, 11 * SIZE(BB) movlpd %xmm6, 12 * SIZE(BB) movlpd %xmm6, 13 * SIZE(BB) movlpd %xmm7, 14 * SIZE(BB) movlpd %xmm7, 15 * SIZE(BB) addl $ 8 * SIZE, B subl $-16 * SIZE, BB decl %eax jne .L102 ALIGN_4 .L103: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L105 ALIGN_4 .L104: movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd %xmm0, 0 * SIZE(BB) movlpd %xmm0, 1 * SIZE(BB) movlpd %xmm1, 2 * SIZE(BB) movlpd %xmm1, 3 * SIZE(BB) addl $2 * SIZE, B addl $4 * SIZE, BB decl %eax jne .L104 ALIGN_4 .L105: #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif movl M, %ebx testl %ebx, %ebx jle .L199 ALIGN_4 .L110: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $ZBASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 movapd 0 * SIZE(AA), %xmm0 movapd 8 * SIZE(AA), %xmm1 movapd 0 * SIZE(BB), %xmm2 movapd 8 * SIZE(BB), %xmm3 #ifdef LN prefetchw -2 * SIZE(CO1) #else prefetchw 2 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L112 ALIGN_4 .L111: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 16 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 4 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 10 * SIZE(BB), %xmm0 addpd %xmm3, %xmm4 movapd 12 * SIZE(BB), %xmm3 addpd %xmm0, %xmm5 movapd 6 * SIZE(AA), %xmm0 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BB), %xmm0 addpd %xmm3, %xmm6 movapd 24 * SIZE(BB), %xmm3 addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 mulpd %xmm1, %xmm2 mulpd 18 * SIZE(BB), %xmm1 addpd %xmm2, %xmm4 movapd 20 * SIZE(BB), %xmm2 addpd %xmm1, %xmm5 movapd 10 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm2 mulpd 22 * SIZE(BB), %xmm1 addpd %xmm2, %xmm6 movapd 32 * SIZE(BB), %xmm2 addpd %xmm1, %xmm7 movapd 12 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 26 * SIZE(BB), 
%xmm1 addpd %xmm3, %xmm4 movapd 28 * SIZE(BB), %xmm3 addpd %xmm1, %xmm5 movapd 14 * SIZE(AA), %xmm1 mulpd %xmm1, %xmm3 mulpd 30 * SIZE(BB), %xmm1 addpd %xmm3, %xmm6 movapd 40 * SIZE(BB), %xmm3 addpd %xmm1, %xmm7 movapd 24 * SIZE(AA), %xmm1 addl $16 * SIZE, AA addl $32 * SIZE, BB decl %eax jne .L111 ALIGN_4 .L112: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L114 ALIGN_4 .L113: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 movapd 4 * SIZE(BB), %xmm2 addpd %xmm0, %xmm5 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L113 ALIGN_4 .L114: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax addl %eax, AA addl %eax, B leal (BB, %eax, 2), BB #endif movapd POSINV, %xmm1 addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 SHUFPD_1 %xmm5, %xmm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm1, %xmm5 #else xorpd %xmm1, %xmm4 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm5, %xmm4 #else addpd %xmm5, %xmm4 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm5 subpd %xmm4, %xmm5 #else movapd 0 * SIZE(AA), %xmm5 subpd %xmm4, %xmm5 #endif #ifndef CONJ SHUFPD_1 %xmm1, %xmm1 #endif #if defined(LN) || defined(LT) movlpd 0 * SIZE(AA), %xmm2 movhpd 0 * SIZE(AA), %xmm2 movlpd 1 * SIZE(AA), %xmm3 movhpd 1 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm2 movhpd 0 * SIZE(B), %xmm2 movlpd 1 * SIZE(B), %xmm3 movhpd 1 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef RT movlpd 0 * SIZE(B), %xmm2 movhpd 0 * SIZE(B), %xmm2 movlpd 1 * SIZE(B), %xmm3 movhpd 1 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LN subl $2 * SIZE, CO1 #endif movlpd %xmm5, 0 * SIZE(CO1) movhpd %xmm5, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm5, 0 * SIZE(B) movlpd %xmm5, 0 * SIZE(BB) movlpd %xmm5, 1 * SIZE(BB) movhpd %xmm5, 2 * SIZE(BB) movhpd %xmm5, 3 * SIZE(BB) #else movapd %xmm5, 0 * SIZE(AA) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L110 ALIGN_4 .L199: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L100: movl N, %eax sarl $1, %eax movl %eax, J # j = n jle .L500 ALIGN_4 .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, BB #ifdef RT movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $1 + ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 2), BB #endif #if defined(LT) movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, 
%eax #else movl K, %eax subl KK, %eax #endif sarl $1, %eax jle .L03 ALIGN_4 .L02: prefetchnta 56 * SIZE(B) movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd 2 * SIZE(B), %xmm2 movlpd 3 * SIZE(B), %xmm3 movlpd 4 * SIZE(B), %xmm4 movlpd 5 * SIZE(B), %xmm5 movlpd 6 * SIZE(B), %xmm6 movlpd 7 * SIZE(B), %xmm7 movlpd %xmm0, 0 * SIZE(BB) movlpd %xmm0, 1 * SIZE(BB) movlpd %xmm1, 2 * SIZE(BB) movlpd %xmm1, 3 * SIZE(BB) movlpd %xmm2, 4 * SIZE(BB) movlpd %xmm2, 5 * SIZE(BB) movlpd %xmm3, 6 * SIZE(BB) movlpd %xmm3, 7 * SIZE(BB) movlpd %xmm4, 8 * SIZE(BB) movlpd %xmm4, 9 * SIZE(BB) movlpd %xmm5, 10 * SIZE(BB) movlpd %xmm5, 11 * SIZE(BB) movlpd %xmm6, 12 * SIZE(BB) movlpd %xmm6, 13 * SIZE(BB) movlpd %xmm7, 14 * SIZE(BB) movlpd %xmm7, 15 * SIZE(BB) addl $ 8 * SIZE, B subl $-16 * SIZE, BB decl %eax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $1, %eax BRANCH jle .L05 movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd 2 * SIZE(B), %xmm2 movlpd 3 * SIZE(B), %xmm3 movlpd %xmm0, 0 * SIZE(BB) movlpd %xmm0, 1 * SIZE(BB) movlpd %xmm1, 2 * SIZE(BB) movlpd %xmm1, 3 * SIZE(BB) movlpd %xmm2, 4 * SIZE(BB) movlpd %xmm2, 5 * SIZE(BB) movlpd %xmm3, 6 * SIZE(BB) movlpd %xmm3, 7 * SIZE(BB) addl $4 * SIZE, B ALIGN_4 .L05: #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT leal (, LDC, 2), %eax subl %eax, C #endif movl C, CO1 #ifndef RT leal (, LDC, 2), %eax addl %eax, C #endif movl M, %ebx testl %ebx, %ebx jle .L100 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + ZBASE_SHIFT, %eax leal (BB, %eax, 2), BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movapd 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movapd 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifdef LN prefetchw -2 * SIZE(CO1) prefetchw -2 * SIZE(CO1, LDC) #else prefetchw 2 * SIZE(CO1) prefetchw 2 * SIZE(CO1, LDC) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif #if 1 andl $-8, %eax sall $4, %eax je .L15 .L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) cmpl $128 * 1, %eax jle .L12 KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) cmpl $128 * 2, %eax jle .L12 KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) cmpl $128 * 3, %eax jle .L12 KERNEL1(16 * 3) KERNEL2(16 * 3) KERNEL3(16 * 3) KERNEL4(16 * 3) KERNEL5(16 * 3) KERNEL6(16 * 3) KERNEL7(16 * 3) KERNEL8(16 * 3) cmpl $128 * 4, %eax jle .L12 KERNEL1(16 * 4) KERNEL2(16 * 4) KERNEL3(16 * 4) KERNEL4(16 * 4) KERNEL5(16 * 4) KERNEL6(16 * 4) KERNEL7(16 * 4) KERNEL8(16 * 4) cmpl $128 * 5, %eax jle .L12 KERNEL1(16 * 5) KERNEL2(16 * 5) KERNEL3(16 * 5) KERNEL4(16 * 5) KERNEL5(16 * 5) KERNEL6(16 * 5) KERNEL7(16 * 5) KERNEL8(16 * 5) cmpl $128 * 6, %eax jle .L12 KERNEL1(16 * 6) KERNEL2(16 * 6) KERNEL3(16 * 6) KERNEL4(16 * 6) KERNEL5(16 * 6) KERNEL6(16 * 6) KERNEL7(16 * 6) KERNEL8(16 * 6) cmpl $128 * 7, %eax jle .L12 KERNEL1(16 * 7) KERNEL2(16 * 7) KERNEL3(16 * 7) KERNEL4(16 * 7) KERNEL5(16 * 7) KERNEL6(16 * 7) KERNEL7(16 * 7) KERNEL8(16 * 7) addl 
$128 * 4 * SIZE, BB addl $128 * 1 * SIZE, AA subl $128 * 8, %eax jg .L1X jmp .L15 .L12: leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB ALIGN_4 #else sarl $3, %eax je .L15 ALIGN_4 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $64 * SIZE, BB addl $16 * SIZE, AA decl %eax jne .L11 ALIGN_4 #endif .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 ALIGN_4 .L13: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movapd 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm5 movapd 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 mulpd 6 * SIZE(BB), %xmm0 addpd %xmm2, %xmm6 movapd 8 * SIZE(BB), %xmm2 addpd %xmm0, %xmm7 movapd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L13 ALIGN_4 .L14: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax addl %eax, AA leal (B, %eax, 2), B leal (BB, %eax, 4), BB #endif movapd POSINV, %xmm1 SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm1, %xmm5 xorpd %xmm1, %xmm7 #else xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 #else addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm5 movapd 2 * SIZE(B), %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd 0 * SIZE(AA), %xmm5 movapd 2 * SIZE(AA), %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #endif #ifndef CONJ SHUFPD_1 %xmm1, %xmm1 #endif #if defined(LN) || defined(LT) movlpd 0 * SIZE(AA), %xmm2 movhpd 0 * SIZE(AA), %xmm2 movlpd 1 * SIZE(AA), %xmm3 movhpd 1 * SIZE(AA), %xmm3 pshufd $0x4e, %xmm5, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm4, %xmm5 addpd %xmm6, %xmm7 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm2 movhpd 0 * SIZE(B), %xmm2 movlpd 1 * SIZE(B), %xmm3 movhpd 1 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 movlpd 2 * SIZE(B), %xmm2 movhpd 2 * SIZE(B), %xmm2 movlpd 3 * SIZE(B), %xmm3 movhpd 3 * SIZE(B), %xmm3 movapd %xmm5, %xmm4 pshufd $0x4e, %xmm5, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm6 subpd %xmm4, %xmm7 subpd %xmm6, %xmm7 movlpd 6 * SIZE(B), %xmm2 movhpd 6 * SIZE(B), %xmm2 movlpd 7 * SIZE(B), %xmm3 movhpd 7 * SIZE(B), %xmm3 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm7 #endif #ifdef RT movlpd 6 * SIZE(B), %xmm2 movhpd 6 * SIZE(B), %xmm2 movlpd 7 * SIZE(B), %xmm3 movhpd 7 * SIZE(B), %xmm3 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm7 movlpd 4 * SIZE(B), %xmm2 movhpd 4 * SIZE(B), %xmm2 movlpd 5 * SIZE(B), %xmm3 movhpd 5 * SIZE(B), %xmm3 movapd %xmm7, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm6 subpd %xmm4, %xmm5 subpd %xmm6, %xmm5 movlpd 0 * SIZE(B), %xmm2 movhpd 0 * SIZE(B), %xmm2 movlpd 1 * SIZE(B), %xmm3 movhpd 1 * SIZE(B), %xmm3 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, 
%xmm5 #endif #ifdef LN subl $2 * SIZE, CO1 #endif movlpd %xmm5, 0 * SIZE(CO1) movhpd %xmm5, 1 * SIZE(CO1) movlpd %xmm7, 0 * SIZE(CO1, LDC) movhpd %xmm7, 1 * SIZE(CO1, LDC) #if defined(LN) || defined(LT) movapd %xmm5, 0 * SIZE(B) movapd %xmm7, 2 * SIZE(B) movlpd %xmm5, 0 * SIZE(BB) movlpd %xmm5, 1 * SIZE(BB) movhpd %xmm5, 2 * SIZE(BB) movhpd %xmm5, 3 * SIZE(BB) movlpd %xmm7, 4 * SIZE(BB) movlpd %xmm7, 5 * SIZE(BB) movhpd %xmm7, 6 * SIZE(BB) movhpd %xmm7, 7 * SIZE(BB) #else movapd %xmm5, 0 * SIZE(AA) movapd %xmm7, 2 * SIZE(AA) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L10 ALIGN_4 .L99: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L01 ALIGN_4 .L500: movl OLD_STACK, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/ztrsm_kernel_RT_1x2_sse3.S000066400000000000000000000446371313527062700217660ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define ALPHA_R 16 + STACK + ARGS(%esp) #define ALPHA_I 24 + STACK + ARGS(%esp) #define A 32 + STACK + ARGS(%esp) #define ARG_B 36 + STACK + ARGS(%esp) #define C 40 + STACK + ARGS(%esp) #define ARG_LDC 44 + STACK + ARGS(%esp) #define OFFSET 48 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #ifdef PENTIUM4 #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #ifdef PENTIUMM #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #define AA %edx #define BB %ecx #define LDC %ebp #define B %edi #define CO1 %esi #define ADDSUB addpd #define KERNEL1(address) \ mulpd %xmm0, %xmm2; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ addpd %xmm2, %xmm4; \ movddup 1 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ ADDSUB %xmm2, %xmm5; \ movddup 2 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm6; \ movddup 3 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ ADDSUB %xmm2, %xmm7; \ movddup 4 * SIZE + (address) * 2 * SIZE(BB), %xmm2 #define KERNEL2(address) \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm4; \ movddup 5 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ ADDSUB %xmm2, %xmm5; \ movddup 6 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ addpd %xmm2, %xmm6; \ movddup 7 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm0, %xmm2; \ movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ ADDSUB %xmm2, %xmm7; \ movddup 16 * SIZE + (address) * 2 * SIZE(BB), %xmm2 #define KERNEL3(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movddup 9 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ ADDSUB %xmm3, %xmm5; \ movddup 10 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm6; \ movddup 11 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ ADDSUB %xmm3, %xmm7; \ movddup 12 * SIZE + (address) * 2 * SIZE(BB), %xmm3 #define KERNEL4(address) \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm4; \ movddup 13 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ ADDSUB %xmm3, %xmm5; \ movddup 14 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ addpd %xmm3, %xmm6; \ movddup 15 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm0, %xmm3; \ movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ ADDSUB %xmm3, %xmm7; \ movddup 24 * SIZE + (address) * 2 * SIZE(BB), %xmm3 #define KERNEL5(address) \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movddup 17 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ ADDSUB %xmm2, %xmm5; \ movddup 18 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm6; \ movddup 19 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ ADDSUB %xmm2, %xmm7; \ movddup 20 * SIZE + (address) * 2 * SIZE(BB), %xmm2 #define KERNEL6(address) \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm4; \ movddup 21 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, 
%xmm2; \ ADDSUB %xmm2, %xmm5; \ movddup 22 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ addpd %xmm2, %xmm6; \ movddup 23 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm2; \ movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ ADDSUB %xmm2, %xmm7 #define KERNEL7(address) \ movddup 32 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movddup 25 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ ADDSUB %xmm3, %xmm5; \ movddup 26 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm6; \ movddup 27 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ ADDSUB %xmm3, %xmm7; \ movddup 28 * SIZE + (address) * 2 * SIZE(BB), %xmm3 #define KERNEL8(address) \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm4; \ movddup 29 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ ADDSUB %xmm3, %xmm5; \ movddup 30 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ addpd %xmm3, %xmm6; \ movddup 31 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ mulpd %xmm1, %xmm3; \ movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ ADDSUB %xmm3, %xmm7; \ movddup 40 * SIZE + (address) * 2 * SIZE(BB), %xmm3 PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC movl OFFSET, %eax #ifdef RN negl %eax #endif movl %eax, KK sall $ZBASE_SHIFT, LDC #ifdef LN movl M, %eax sall $ZBASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $ZBASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax testl $1, %eax jle .L100 #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx testl %ebx, %ebx jle .L500 ALIGN_4 L110: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $ZBASE_SHIFT, %eax addl %eax, AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movddup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movddup 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifdef LN prefetchnta -2 * SIZE(CO1) #else prefetchnta 2 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je L112 ALIGN_4 L111: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 4 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm7 movddup 4 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 5 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 6 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm5 movddup 6 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 7 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 16 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm7 movddup 16 * SIZE(BB), %xmm2 mulpd %xmm1, %xmm3 addpd 
%xmm3, %xmm4 movddup 9 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 10 * SIZE(AA), %xmm1 ADDSUB %xmm3, %xmm5 movddup 10 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 11 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 12 * SIZE(AA), %xmm1 ADDSUB %xmm3, %xmm7 movddup 12 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm4 movddup 13 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 14 * SIZE(AA), %xmm1 ADDSUB %xmm3, %xmm5 movddup 14 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 addpd %xmm3, %xmm6 movddup 15 * SIZE(BB), %xmm3 mulpd %xmm1, %xmm3 movapd 24 * SIZE(AA), %xmm1 ADDSUB %xmm3, %xmm7 movddup 24 * SIZE(BB), %xmm3 addl $16 * SIZE, AA addl $16 * SIZE, BB decl %eax jne L111 ALIGN_4 L112: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je L114 ALIGN_4 L113: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg L113 ALIGN_4 L114: addpd %xmm6, %xmm4 addpd %xmm7, %xmm5 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $1, %eax #endif movl AORIG, AA sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), BB #endif pcmpeqb %xmm1, %xmm1 psllq $63, %xmm1 shufps $0x40, %xmm1, %xmm1 SHUFPD_1 %xmm5, %xmm5 #ifndef CONJ xorpd %xmm1, %xmm5 subpd %xmm5, %xmm4 #else #if defined(LN) || defined(LT) xorpd %xmm1, %xmm4 #else xorpd %xmm1, %xmm5 #endif addpd %xmm5, %xmm4 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BB), %xmm5 subpd %xmm4, %xmm5 #else movapd 0 * SIZE(AA), %xmm5 subpd %xmm4, %xmm5 #endif #ifndef CONJ SHUFPD_1 %xmm1, %xmm1 #endif #if defined(LN) || defined(LT) movddup 0 * SIZE(AA), %xmm2 movddup 1 * SIZE(AA), %xmm3 movapd %xmm5, %xmm4 SHUFPD_1 %xmm4, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #if defined(RN) || defined(RT) movddup 0 * SIZE(BB), %xmm2 movddup 1 * SIZE(BB), %xmm3 movapd %xmm5, %xmm4 SHUFPD_1 %xmm4, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LN subl $2 * SIZE, CO1 #endif movlpd %xmm5, 0 * SIZE(CO1) movhpd %xmm5, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm5, 0 * SIZE(BB) #else movapd %xmm5, 0 * SIZE(AA) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA addl %eax, BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg L110 #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L100: movl N, %eax sarl $1, %eax movl %eax, J # j = n jle .L500 ALIGN_4 .L01: #if defined(LT) || defined(RN) movl A, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx testl %ebx, %ebx jle .L500 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $ZBASE_SHIFT, %eax addl %eax, AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, 
%eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB #endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 movapd 8 * SIZE(AA), %xmm1 pxor %xmm5, %xmm5 movddup 0 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 movddup 8 * SIZE(BB), %xmm3 pxor %xmm7, %xmm7 #ifdef LN prefetcht0 -2 * SIZE(CO1) prefetcht0 -2 * SIZE(CO1, LDC, 1) #else prefetchnta 2 * SIZE(CO1) prefetchnta 2 * SIZE(CO1, LDC, 1) #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L12 ALIGN_4 .L11: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addl $32 * SIZE, BB addl $16 * SIZE, AA decl %eax jne .L11 ALIGN_4 .L12: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 ALIGN_4 .L13: mulpd %xmm0, %xmm2 addpd %xmm2, %xmm4 movddup 1 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 ADDSUB %xmm2, %xmm5 movddup 2 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm6 movddup 3 * SIZE(BB), %xmm2 mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 ADDSUB %xmm2, %xmm7 movddup 4 * SIZE(BB), %xmm2 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L13 ALIGN_4 .L14: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), BB #endif pcmpeqb %xmm1, %xmm1 psllq $63, %xmm1 shufps $0x40, %xmm1, %xmm1 SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 #ifndef CONJ xorpd %xmm1, %xmm5 xorpd %xmm1, %xmm7 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 #else #if defined(LN) || defined(LT) xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 #else xorpd %xmm1, %xmm5 xorpd %xmm1, %xmm7 #endif addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BB), %xmm5 movapd 2 * SIZE(BB), %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd 0 * SIZE(AA), %xmm5 movapd 2 * SIZE(AA), %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #endif #ifndef CONJ SHUFPD_1 %xmm1, %xmm1 #endif #if defined(LN) || defined(LT) movddup 0 * SIZE(AA), %xmm2 movddup 1 * SIZE(AA), %xmm3 movapd %xmm5, %xmm4 movapd %xmm7, %xmm6 SHUFPD_1 %xmm4, %xmm4 SHUFPD_1 %xmm6, %xmm6 xorpd %xmm1, %xmm4 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm4, %xmm5 addpd %xmm6, %xmm7 #endif #ifdef RN movddup 0 * SIZE(BB), %xmm2 movddup 1 * SIZE(BB), %xmm3 movapd %xmm5, %xmm4 SHUFPD_1 %xmm4, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 movddup 2 * SIZE(BB), %xmm2 movddup 3 * SIZE(BB), %xmm3 movapd %xmm5, %xmm4 movapd %xmm5, %xmm6 SHUFPD_1 %xmm6, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm6 subpd %xmm4, %xmm7 subpd %xmm6, %xmm7 movddup 6 * SIZE(BB), %xmm2 movddup 7 * SIZE(BB), %xmm3 movapd %xmm7, %xmm6 SHUFPD_1 %xmm6, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm7 #endif #ifdef RT movddup 6 * SIZE(BB), %xmm2 movddup 7 * SIZE(BB), %xmm3 movapd %xmm7, %xmm6 SHUFPD_1 %xmm6, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm7 mulpd %xmm3, %xmm6 addpd %xmm6, %xmm7 movddup 4 * SIZE(BB), %xmm2 movddup 5 * SIZE(BB), %xmm3 movapd %xmm7, %xmm4 movapd %xmm7, %xmm6 SHUFPD_1 %xmm6, %xmm6 xorpd %xmm1, %xmm6 mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm6 subpd %xmm4, %xmm5 subpd %xmm6, %xmm5 movddup 0 * SIZE(BB), %xmm2 movddup 1 * SIZE(BB), %xmm3 movapd %xmm5, %xmm4 SHUFPD_1 %xmm4, %xmm4 xorpd %xmm1, %xmm4 mulpd %xmm2, %xmm5 mulpd %xmm3, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef LN subl $2 * SIZE, CO1 #endif 
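/* Store the solved 1x2 complex tile: %xmm5 goes to column CO1 of C and %xmm7
   to column CO1+LDC, and both are written back into the packed panel
   (BB for LN/LT, AA otherwise) for use by the remaining updates. */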
movlpd %xmm5, 0 * SIZE(CO1) movhpd %xmm5, 1 * SIZE(CO1) movlpd %xmm7, 0 * SIZE(CO1, LDC) movhpd %xmm7, 1 * SIZE(CO1, LDC) #if defined(LN) || defined(LT) movapd %xmm5, 0 * SIZE(BB) movapd %xmm7, 2 * SIZE(BB) #else movapd %xmm5, 0 * SIZE(AA) movapd %xmm7, 2 * SIZE(AA) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L10 ALIGN_4 .L99: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L01 ALIGN_4 .L500: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S000066400000000000000000001012471313527062700224140ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 16 #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) #define A 24 + STACK + ARGS(%esp) #define ARG_B 28 + STACK + ARGS(%esp) #define C 32 + STACK + ARGS(%esp) #define ARG_LDC 36 + STACK + ARGS(%esp) #define OFFSET 40 + STACK + ARGS(%esp) #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif #ifdef ATOM #define PREFETCH prefetcht0 #define PREFETCHSIZE 84 #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHSIZE (16 * 2) #endif #define B %edi #define LDC %ebp #define AA %edx #define BB %ecx #define CO1 %esi #define ADD1 addps #define ADD2 addps PROLOGUE subl $ARGS, %esp pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl ARG_B, B movl ARG_LDC, LDC movl OFFSET, %eax #ifdef RN negl %eax #endif movl %eax, KK movl M, %ebx testl %ebx, %ebx jle .L999 subl $-32 * SIZE, A subl $-32 * SIZE, B sall $ZBASE_SHIFT, LDC #ifdef LN movl M, %eax sall $ZBASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $ZBASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax andl $1, %eax jle .L100 #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, B #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx jle .L130 ALIGN_4 .L110: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movsd -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 movhps -30 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 #ifdef LN prefetcht0 -4 * SIZE(CO1) #else prefetcht0 3 * SIZE(CO1) #endif pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L112 ALIGN_4 .L111: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -16 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps 
%xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -12 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -8 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 movaps -4 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps 0 * SIZE(AA), %xmm0 subl $-32 * SIZE, AA subl $-16 * SIZE, BB decl %eax jne .L111 ALIGN_4 .L112: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L114 ALIGN_4 .L113: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L113 ALIGN_4 .L114: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl AORIG, AA sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB #endif addps %xmm2, %xmm4 addps %xmm3, %xmm5 pshufd $0xb1, %xmm5, %xmm5 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #ifndef CONJ shufps $0xb1, %xmm0, %xmm0 pxor %xmm0, %xmm5 #else #if defined(LN) || defined(LT) pxor %xmm0, %xmm4 #else pxor %xmm0, %xmm5 #endif #endif addps %xmm5, %xmm4 #if defined(LN) || defined(LT) movaps %xmm4, %xmm5 unpcklpd %xmm6, %xmm4 unpckhpd %xmm6, %xmm5 movsd -32 * SIZE(BB), %xmm2 movsd -30 * SIZE(BB), %xmm3 subps %xmm4, %xmm2 subps %xmm5, %xmm3 #else movaps -32 * SIZE(AA), %xmm1 subps %xmm4, %xmm1 #endif #ifdef LN movaps -28 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm2 subps %xmm1, %xmm2 movaps -32 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm3 subps %xmm1, %xmm3 movaps -28 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 #endif #if defined(RN) || defined(RT) movaps -32 * SIZE(BB), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, 
%xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, -32 * SIZE(BB) movlps %xmm3, -30 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) #else movaps %xmm1, -32 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (BB, %eax, 1), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L110 ALIGN_4 .L130: movl M, %ebx andl $1, %ebx jle .L149 #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $ZBASE_SHIFT, %eax addl %eax, AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB #endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movsd -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L142 ALIGN_4 .L141: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -30 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -26 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -26 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -22 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -22 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -18 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -18 * SIZE(AA), %xmm0 addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -16 * SIZE(AA), %xmm0 subl $-16 * SIZE, AA subl $-16 * SIZE, BB decl %eax jne .L141 ALIGN_4 .L142: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L144 ALIGN_4 .L143: addps %xmm2, %xmm4 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0x55, %xmm1, %xmm3 movsd -30 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -30 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $2 * SIZE, BB decl %eax jg .L143 ALIGN_4 .L144: #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), BB #endif addps %xmm2, %xmm4 addps %xmm3, %xmm5 
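/* Combine the partial sums for this single complex element: swap the
   real/imaginary halves of %xmm5, flip signs with the mask built by
   pcmpeqb/psllq according to CONJ, then fold the result into %xmm4 before the solve. */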
pshufd $0xb1, %xmm5, %xmm5 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #ifndef CONJ shufps $0xb1, %xmm0, %xmm0 pxor %xmm0, %xmm5 #else #if defined(LN) || defined(LT) pxor %xmm0, %xmm4 #else pxor %xmm0, %xmm5 #endif #endif addps %xmm5, %xmm4 #if defined(LN) || defined(LT) movsd -32 * SIZE(BB), %xmm2 subps %xmm4, %xmm2 #else movsd -32 * SIZE(AA), %xmm1 subps %xmm4, %xmm1 #endif #if defined(LN) || defined(LT) movaps -32 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #if defined(RN) || defined(RT) movaps -32 * SIZE(BB), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, -32 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) #else movlps %xmm1, -32 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (BB, %eax, 1), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L149: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L100: movl N, %eax movl %eax, J sarl $1, J jle .L999 ALIGN_4 .L01: #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, B #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif #ifdef LT movl OFFSET, %eax movl %eax, KK #endif movl M, %ebx sarl $1, %ebx jle .L30 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB #endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 #ifdef LN pxor %xmm4, %xmm4 prefetcht0 -4 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 -4 * SIZE(CO1, LDC) pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #else pxor %xmm4, %xmm4 prefetcht0 3 * SIZE(CO1) pxor %xmm5, %xmm5 prefetcht0 3 * SIZE(CO1, LDC) pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L11: PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -24 * SIZE(AA), %xmm0 
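/* Main unrolled loop body continues: each step shuffles the packed B operand
   (pshufd $0xb1 / $0x1b), multiplies it by the current A vector and accumulates
   the complex partial products in %xmm4-%xmm7. */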
ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -20 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -16 * SIZE(AA), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -12 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -12 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -8 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -8 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -4 * SIZE(AA), %xmm0 ADD2 %xmm2, %xmm7 subl $-32 * SIZE, BB pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 subl $-32 * SIZE, AA pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -32 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -32 * SIZE(AA), %xmm0 decl %eax jne .L11 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 ALIGN_4 .L13: ADD2 %xmm2, %xmm7 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm6 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm2, %xmm5 pshufd $0xb1, %xmm3, %xmm2 mulps %xmm0, %xmm3 ADD1 %xmm1, %xmm4 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AA), %xmm0 addl $4 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L13 ALIGN_4 .L14: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB #endif ADD2 %xmm2, %xmm7 pcmpeqb %xmm0, %xmm0 ADD1 %xmm3, %xmm6 psllq $63, %xmm0 #ifndef CONJ pxor %xmm0, %xmm4 pxor %xmm0, %xmm6 shufps $0xb1, %xmm0, %xmm0 #else #if defined(LN) || defined(LT) pxor %xmm0, %xmm5 pxor %xmm0, %xmm7 #else pshufd $0xb1, %xmm0, %xmm1 pxor %xmm1, %xmm5 pxor %xmm1, %xmm7 #endif #endif haddps %xmm5, %xmm4 haddps %xmm7, %xmm6 shufps $0xd8, %xmm4, %xmm4 shufps $0xd8, %xmm6, %xmm6 movaps %xmm4, %xmm5 shufps $0xe4, %xmm6, %xmm4 shufps $0xe4, %xmm5, %xmm6 #if defined(LN) || defined(LT) movaps %xmm4, %xmm5 unpcklpd %xmm6, %xmm4 unpckhpd %xmm6, %xmm5 movaps -32 * SIZE(BB), %xmm2 movaps -28 * SIZE(BB), %xmm3 subps %xmm4, %xmm2 subps %xmm5, %xmm3 #else movaps -32 * SIZE(AA), %xmm1 movaps -28 * SIZE(AA), %xmm5 subps %xmm4, %xmm1 subps %xmm6, %xmm5 #endif #ifdef LN movaps -28 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, 
%xmm3 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm2 subps %xmm1, %xmm2 movaps -32 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #ifdef LT movaps -32 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm3 subps %xmm1, %xmm3 movaps -28 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm5 subps %xmm2, %xmm5 movaps -28 * SIZE(BB), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 #endif #ifdef RT movaps -28 * SIZE(BB), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm1 subps %xmm2, %xmm1 movaps -32 * SIZE(BB), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, -32 * SIZE(BB) movaps %xmm3, -28 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO1, LDC) movhps %xmm3, 2 * SIZE(CO1, LDC) #else movaps %xmm1, -32 * SIZE(AA) movaps %xmm5, -28 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm5, 0 * SIZE(CO1, LDC) movhps %xmm5, 2 * SIZE(CO1, LDC) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $2, KK #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, 
%eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx jg .L10 ALIGN_4 .L30: movl M, %ebx andl $1, %ebx jle .L99 #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl KK, %eax movl AORIG, AA sall $ZBASE_SHIFT, %eax addl %eax, AA #endif movl B, BB #if defined(LN) || defined(RT) movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB #endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 movaps -32 * SIZE(BB), %xmm1 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L42 ALIGN_4 .L41: addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -30 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -24 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -28 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -20 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -26 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -16 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -24 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -12 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -22 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -8 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -20 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -4 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -18 * SIZE(AA), %xmm0 addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps 0 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -16 * SIZE(AA), %xmm0 subl $-16 * SIZE, AA subl $-32 * SIZE, BB decl %eax jne .L41 ALIGN_4 .L42: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L44 ALIGN_4 .L43: addps %xmm2, %xmm6 pshufd $0x00, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps 
%xmm3, %xmm7 pshufd $0x55, %xmm1, %xmm3 mulps %xmm0, %xmm3 addps %xmm2, %xmm4 pshufd $0xaa, %xmm1, %xmm2 mulps %xmm0, %xmm2 addps %xmm3, %xmm5 pshufd $0xff, %xmm1, %xmm3 movaps -28 * SIZE(BB), %xmm1 mulps %xmm0, %xmm3 movsd -30 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $4 * SIZE, BB decl %eax jg .L43 ALIGN_4 .L44: #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), BB #endif addps %xmm2, %xmm6 addps %xmm3, %xmm7 pshufd $0xb1, %xmm5, %xmm5 pcmpeqb %xmm0, %xmm0 pshufd $0xb1, %xmm7, %xmm7 psllq $63, %xmm0 #ifndef CONJ shufps $0xb1, %xmm0, %xmm0 pxor %xmm0, %xmm5 pxor %xmm0, %xmm7 #else #if defined(LN) || defined(LT) pxor %xmm0, %xmm4 pxor %xmm0, %xmm6 #else pxor %xmm0, %xmm5 pxor %xmm0, %xmm7 #endif #endif addps %xmm5, %xmm4 addps %xmm7, %xmm6 #if defined(LN) || defined(LT) unpcklpd %xmm6, %xmm4 movaps -32 * SIZE(BB), %xmm2 subps %xmm4, %xmm2 #else movsd -32 * SIZE(AA), %xmm1 movsd -30 * SIZE(AA), %xmm5 subps %xmm4, %xmm1 subps %xmm6, %xmm5 #endif #if defined(LN) || defined(LT) movaps -32 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #ifdef RN movaps -32 * SIZE(BB), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm5 subps %xmm2, %xmm5 movaps -28 * SIZE(BB), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 #endif #ifdef RT movaps -28 * SIZE(BB), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm1 subps %xmm2, %xmm1 movaps -32 * SIZE(BB), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, -32 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO1, LDC) #else movlps %xmm1, -32 * SIZE(AA) movlps %xmm5, -30 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) movlps %xmm5, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB #endif #ifdef LN subl $1, KK #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L99: #ifdef 
LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl BB, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L01 ALIGN_4 .L999: popl %ebx popl %esi popl %edi popl %ebp addl $ARGS, %esp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86/ztrsm_kernel_RT_2x2_sse.S000066400000000000000000001146131313527062700216740ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 16 #define ARGS 0 #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) #define STACK_A 24 + STACK + ARGS(%esi) #define STACK_B 28 + STACK + ARGS(%esi) #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) #define POSINV 0(%esp) #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) #define A 28(%esp) #define C 32(%esp) #define J 36(%esp) #define OLD_STACK 40(%esp) #define OFFSET 48(%esp) #define KK 52(%esp) #define KKK 56(%esp) #define AORIG 60(%esp) #define BORIG 64(%esp) #define BUFFER 128(%esp) #define B %edi #define LDC %ebp #define AA %edx #define BB %ecx #define CO1 %esi #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch #define PREFETCHW prefetchw #endif #if defined(PENTIUM4) || defined(PENTIUMM) #define PREFETCH prefetcht1 #define PREFETCHSIZE 168 #define PREFETCHW prefetcht0 #endif #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht1 #define PREFETCHSIZE 168 #define PREFETCHW prefetcht0 #endif #if defined(OPTERON) || !defined(HAVE_SSE2) #define movsd movlps #endif #ifdef HAVE_SSE2 #define xorps pxor #endif #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ addps %xmm2, %xmm4; \ movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL2(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL3(address) \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm4; \ movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ addps %xmm2, %xmm5; \ movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm0, %xmm2; \ mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm2, %xmm6; \ movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm0, %xmm7; \ movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL4(address) \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm4; \ movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ addps %xmm3, %xmm5; \ movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm0, %xmm3; \ mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ addps %xmm3, %xmm6; \ movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm0, %xmm7; \ movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 #define KERNEL5(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 72 * SIZE + (address) * 4 * 
SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL6(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL7(address) \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm4; \ movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ addps %xmm2, %xmm5; \ movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ mulps %xmm1, %xmm2; \ mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm2, %xmm6; \ movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ addps %xmm1, %xmm7; \ movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 #define KERNEL8(address) \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm4; \ movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ addps %xmm3, %xmm5; \ movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ mulps %xmm1, %xmm3; \ mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ addps %xmm3, %xmm6; \ movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ addps %xmm1, %xmm7; \ movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; PROLOGUE pushl %ebp pushl %edi pushl %esi pushl %ebx PROFCODE movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp andl $-STACK_ALIGN, %esp # align stack addl $STACK_OFFSET, %esp STACK_TOUCHING movl STACK_M, %ebx movl STACK_N, %eax movl STACK_K, %ecx movl STACK_A, %edx movl %ebx, M movl %eax, N movl %ecx, K movl %edx, A movl %esi, OLD_STACK movl STACK_B, %edi movl STACK_C, %ebx movss STACK_OFFT, %xmm4 xorps %xmm7, %xmm7 pcmpeqb %xmm7, %xmm7 pslld $31, %xmm7 xorps %xmm2, %xmm2 #ifndef CONJ movss %xmm7, 0 + POSINV movss %xmm2, 4 + POSINV movss %xmm7, 8 + POSINV movss %xmm2, 12 + POSINV #else movss %xmm2, 0 + POSINV movss %xmm7, 4 + POSINV movss %xmm2, 8 + POSINV movss %xmm7, 12 + POSINV #endif EMMS movl %ebx, C movl STACK_LDC, LDC movss %xmm4, OFFSET movss %xmm4, KK sall $ZBASE_SHIFT, LDC #ifdef LN movl M, %eax sall $ZBASE_SHIFT, %eax addl %eax, C imull K, %eax addl %eax, A #endif #ifdef RT movl N, %eax sall $ZBASE_SHIFT, %eax imull K, %eax addl %eax, B movl N, %eax imull LDC, %eax addl %eax, C #endif #ifdef RN negl KK #endif #ifdef RT movl N, %eax subl OFFSET, %eax movl %eax, KK #endif movl N, %eax andl $1, %eax jle .L100 ALIGN_4 .L101: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, %ecx #ifdef RT movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 4), BB #endif #if defined(LT) movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $2, %eax jle .L103 ALIGN_4 .L102: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 
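/* .L102 continues: every scalar of the B panel is broadcast across a full SSE
   register and stored into the BUFFER workspace, so the compute loop can load
   ready-made broadcast operands. */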
pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) addl $ 8 * SIZE, B addl $32 * SIZE, BB decl %eax jne .L102 ALIGN_4 .L103: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $3, %eax BRANCH jle .L105 ALIGN_4 .L104: #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) addl $ 2 * SIZE, %edi addl $ 8 * SIZE, %ecx decl %eax jne .L104 ALIGN_4 .L105: #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif #ifdef RT subl LDC, C #endif movl C, CO1 #ifndef RT addl LDC, C #endif movl M, %ebx sarl $1, %ebx jle .L130 ALIGN_4 .L110: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB # boffset1 = boffset #if defined(LN) || defined(RT) movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movaps 0 * SIZE(AA), %xmm0 movaps 16 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 PREFETCHW 3 * SIZE(CO1) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L112 ALIGN_4 .L111: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movaps 8 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movaps 12 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movaps 32 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movaps 20 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movaps 24 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 28 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movaps 48 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 addl $ 32 * SIZE, AA addl $ 64 * SIZE, BB decl %eax jne .L111 ALIGN_4 .L112: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L114 ALIGN_4 .L113: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movaps 4 * SIZE(AA), %xmm0 addl $ 4 * SIZE, AA addl $ 8 * SIZE, BB decl %eax jg .L113 ALIGN_4 .L114: addps %xmm6, %xmm4 addps %xmm7, %xmm5 movaps POSINV, %xmm0 shufps $0xb1, %xmm5, %xmm5 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm4 #endif #else xorps %xmm0, %xmm5 #endif addps %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $1, %eax #endif movl 
AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm5 unpcklpd %xmm6, %xmm4 unpckhpd %xmm6, %xmm5 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 0 * SIZE(B), %xmm2 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 2 * SIZE(B), %xmm3 subps %xmm4, %xmm2 subps %xmm5, %xmm3 #else movaps 0 * SIZE(AA), %xmm1 subps %xmm4, %xmm1 #endif #ifdef LN movaps 4 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm2 subps %xmm1, %xmm2 movaps 0 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm3 subps %xmm1, %xmm3 movaps 4 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 #endif #if defined(RN) || defined(RT) movaps 0 * SIZE(B), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) movlps %xmm3, 2 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 8 * SIZE(BB) movaps %xmm1, 12 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) #else movaps %xmm1, 0 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx # i -- jg .L110 ALIGN_4 .L130: movl M, %ebx andl $1, %ebx jle .L149 #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB # boffset1 = boffset #if defined(LN) || defined(RT) movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB #endif #ifdef movsd xorps 
%xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movaps 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L142 ALIGN_4 .L141: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 2 * SIZE(AA), %xmm0 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm6 movaps 12 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 movsd 4 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 movaps 32 * SIZE(BB), %xmm2 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 6 * SIZE(AA), %xmm0 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm6 movaps 28 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 movsd 16 * SIZE(AA), %xmm0 addps %xmm3, %xmm7 movaps 48 * SIZE(BB), %xmm3 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 10 * SIZE(AA), %xmm1 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm6 movaps 44 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 movsd 12 * SIZE(AA), %xmm1 addps %xmm2, %xmm7 movaps 64 * SIZE(BB), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 14 * SIZE(AA), %xmm1 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm6 movaps 60 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 movsd 24 * SIZE(AA), %xmm1 addps %xmm3, %xmm7 movaps 80 * SIZE(BB), %xmm3 addl $ 16 * SIZE, AA addl $ 64 * SIZE, BB decl %eax jne .L141 ALIGN_4 .L142: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L144 ALIGN_4 .L143: mulps %xmm0, %xmm2 mulps 4 * SIZE(BB), %xmm0 addps %xmm2, %xmm4 movaps 8 * SIZE(BB), %xmm2 addps %xmm0, %xmm5 movsd 2 * SIZE(AA), %xmm0 addl $2 * SIZE, AA addl $8 * SIZE, BB decl %eax jg .L143 ALIGN_4 .L144: addps %xmm6, %xmm4 addps %xmm7, %xmm5 movaps POSINV, %xmm0 shufps $0xb1, %xmm5, %xmm5 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm4 #endif #else xorps %xmm0, %xmm5 #endif addps %xmm5, %xmm4 #if defined(LN) || defined(RT) movl KK, %eax subl $1, %eax movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax addl %eax, AA addl %eax, B leal (BB, %eax, 4), BB #endif #if defined(LN) || defined(LT) #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 0 * SIZE(B), %xmm2 subps %xmm4, %xmm2 #else #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(AA), %xmm1 subps %xmm4, %xmm1 #endif #if defined(LN) || defined(LT) movaps 0 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #if defined(RN) || defined(RT) movaps 0 * SIZE(B), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) #else movlps %xmm1, 0 * SIZE(AA) movlps %xmm1, 0 
* SIZE(CO1) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $2 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L149: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, B #endif #ifdef RN addl $1, KK #endif #ifdef RT subl $1, KK #endif ALIGN_4 .L100: movl N, %eax movl %eax, J sarl $1, J jle .L999 ALIGN_4 .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK #endif leal BUFFER, %ecx #ifdef RT movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, B #endif #if defined(LN) || defined(RT) movl KK, %eax movl B, BORIG sall $1 + ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 4), BB #endif #if defined(LT) movl OFFSET, %eax movl %eax, KK #endif #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $1, %eax jle .L03 ALIGN_4 .L02: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm4, 16 * SIZE(BB) movaps %xmm5, 20 * SIZE(BB) movaps %xmm6, 24 * SIZE(BB) movaps %xmm7, 28 * SIZE(BB) addl $ 8 * SIZE, B addl $32 * SIZE, BB decl %eax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $1, %eax BRANCH jle .L05 ALIGN_4 .L04: movaps 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm2, 8 * SIZE(BB) movaps %xmm3, 12 * SIZE(BB) addl $ 4 * SIZE, B ALIGN_4 .L05: #if defined(LT) || defined(RN) movl A, %eax movl %eax, AA #else movl A, %eax movl %eax, AORIG #endif leal (, LDC, 2), %eax #ifdef RT subl %eax, C #endif movl C, CO1 #ifndef RT addl %eax, C #endif movl M, %ebx sarl $1, %ebx jle .L30 ALIGN_4 .L10: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB # boffset1 = boffset #if defined(LN) || defined(RT) movl KK, %eax sall $3 + ZBASE_SHIFT, %eax addl %eax, BB #endif movaps 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 movaps 16 * SIZE(AA), %xmm1 xorps %xmm5, %xmm5 movaps 0 * SIZE(BB), %xmm2 xorps %xmm6, %xmm6 movaps 16 * SIZE(BB), %xmm3 xorps %xmm7, %xmm7 PREFETCHW 3 * SIZE(CO1) PREFETCHW 3 * SIZE(CO1, LDC) #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L15 ALIGN_4 .L11: KERNEL1(0 * 16) KERNEL2(0 * 16) KERNEL3(0 * 16) KERNEL4(0 * 16) KERNEL5(0 * 16) KERNEL6(0 * 16) KERNEL7(0 * 16) KERNEL8(0 * 16) addl $ 32 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L11 ALIGN_4 .L15: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L14 ALIGN_4 .L13: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 
addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movaps 4 * SIZE(AA), %xmm0 addl $ 4 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L13 ALIGN_4 .L14: movaps POSINV, %xmm0 shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm5 xorps %xmm0, %xmm7 #else xorps %xmm0, %xmm4 xorps %xmm0, %xmm6 #endif #else xorps %xmm0, %xmm5 xorps %xmm0, %xmm7 #endif addps %xmm5, %xmm4 addps %xmm7, %xmm6 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $2, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) movaps %xmm4, %xmm5 unpcklpd %xmm6, %xmm4 unpckhpd %xmm6, %xmm5 movaps 0 * SIZE(B), %xmm2 movaps 4 * SIZE(B), %xmm3 subps %xmm4, %xmm2 subps %xmm5, %xmm3 #else movaps 0 * SIZE(AA), %xmm1 movaps 4 * SIZE(AA), %xmm5 subps %xmm4, %xmm1 subps %xmm6, %xmm5 #endif #ifdef LN movaps 4 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm2 subps %xmm1, %xmm2 movaps 0 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #ifdef LT movaps 0 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm1 subps %xmm4, %xmm3 subps %xmm1, %xmm3 movaps 4 * SIZE(AA), %xmm5 pshufd $0xee, %xmm5, %xmm6 pshufd $0xbb, %xmm5, %xmm7 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm0, %xmm3 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm3 addps %xmm4, %xmm3 #endif #ifdef RN movaps 0 * SIZE(B), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm5 subps %xmm2, %xmm5 movaps 4 * SIZE(B), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 #endif #ifdef RT movaps 4 * SIZE(B), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps 
%xmm7, %xmm5 addps %xmm3, %xmm5 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm1 subps %xmm2, %xmm1 movaps 0 * SIZE(B), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, 0 * SIZE(B) movaps %xmm3, 4 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm5 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm5, 12 * SIZE(BB) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm4 pshufd $0xff, %xmm3, %xmm5 movaps %xmm0, 16 * SIZE(BB) movaps %xmm1, 20 * SIZE(BB) movaps %xmm4, 24 * SIZE(BB) movaps %xmm5, 28 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO1, LDC) movhps %xmm3, 2 * SIZE(CO1, LDC) #else movaps %xmm1, 0 * SIZE(AA) movaps %xmm5, 4 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm5, 0 * SIZE(CO1, LDC) movhps %xmm5, 2 * SIZE(CO1, LDC) #endif #ifndef LN addl $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $8 * SIZE, B #endif #endif #ifdef LN subl $2, KK movl BORIG, B #endif #ifdef LT addl $2, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $1 + ZBASE_SHIFT, %eax addl %eax, AORIG #endif decl %ebx jg .L10 ALIGN_4 .L30: movl M, %ebx andl $1, %ebx jle .L99 ALIGN_4 .L40: #ifdef LN movl K, %eax sall $ZBASE_SHIFT, %eax subl %eax, AORIG #endif #if defined(LN) || defined(RT) movl AORIG, %eax movl %eax, AA movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #endif leal BUFFER, BB # boffset1 = boffset #if defined(LN) || defined(RT) movl KK, %eax sall $3 + ZBASE_SHIFT, %eax addl %eax, BB #endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(AA), %xmm0 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 8 * SIZE(AA), %xmm1 movaps 0 * SIZE(BB), %xmm2 movaps 16 * SIZE(BB), %xmm3 #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif sarl $3, %eax je .L42 ALIGN_4 .L41: mulps %xmm0, %xmm2 prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA) addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 32 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movsd 2 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 20 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movaps 24 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 mulps 28 * SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 48 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movsd 4 * SIZE(AA), %xmm0 mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 36 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 40 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 44 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 64 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movsd 6 * SIZE(AA), %xmm0 mulps %xmm0, %xmm3 addps %xmm3, %xmm4 movaps 52 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm5 movaps 56 * SIZE(BB), %xmm3 mulps %xmm0, %xmm3 mulps 60 * 
SIZE(BB), %xmm0 addps %xmm3, %xmm6 movaps 80 * SIZE(BB), %xmm3 addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 movaps 68 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movaps 72 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 mulps 76 * SIZE(BB), %xmm1 addps %xmm2, %xmm6 movaps 96 * SIZE(BB), %xmm2 addps %xmm1, %xmm7 movsd 10 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 84 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 88 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 mulps 92 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 112 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movsd 12 * SIZE(AA), %xmm1 mulps %xmm1, %xmm2 addps %xmm2, %xmm4 movaps 100 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 addps %xmm2, %xmm5 movaps 104 * SIZE(BB), %xmm2 mulps %xmm1, %xmm2 mulps 108 * SIZE(BB), %xmm1 addps %xmm2, %xmm6 movaps 128 * SIZE(BB), %xmm2 addps %xmm1, %xmm7 movsd 14 * SIZE(AA), %xmm1 mulps %xmm1, %xmm3 addps %xmm3, %xmm4 movaps 116 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm5 movaps 120 * SIZE(BB), %xmm3 mulps %xmm1, %xmm3 mulps 124 * SIZE(BB), %xmm1 addps %xmm3, %xmm6 movaps 144 * SIZE(BB), %xmm3 addps %xmm1, %xmm7 movsd 24 * SIZE(AA), %xmm1 addl $ 16 * SIZE, AA addl $128 * SIZE, BB decl %eax jne .L41 ALIGN_4 .L42: #if defined(LT) || defined(RN) movl KK, %eax #else movl K, %eax subl KK, %eax #endif andl $7, %eax # if (k & 1) BRANCH je .L44 ALIGN_4 .L43: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 movaps 4 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm5 movaps 8 * SIZE(BB), %xmm2 mulps %xmm0, %xmm2 mulps 12 * SIZE(BB), %xmm0 addps %xmm2, %xmm6 movaps 16 * SIZE(BB), %xmm2 addps %xmm0, %xmm7 movsd 2 * SIZE(AA), %xmm0 addl $ 2 * SIZE, AA addl $16 * SIZE, BB decl %eax jg .L43 ALIGN_4 .L44: movaps POSINV, %xmm0 shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm5 xorps %xmm0, %xmm7 #else xorps %xmm0, %xmm4 xorps %xmm0, %xmm6 #endif #else xorps %xmm0, %xmm5 xorps %xmm0, %xmm7 #endif addps %xmm5, %xmm4 addps %xmm7, %xmm6 #if defined(LN) || defined(RT) movl KK, %eax #ifdef LN subl $1, %eax #else subl $2, %eax #endif movl AORIG, AA movl BORIG, B leal BUFFER, BB sall $ZBASE_SHIFT, %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), B leal (BB, %eax, 8), BB #endif #if defined(LN) || defined(LT) unpcklpd %xmm6, %xmm4 movaps 0 * SIZE(B), %xmm2 subps %xmm4, %xmm2 #else #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(AA), %xmm1 #ifdef movsd xorps %xmm5, %xmm5 #endif movsd 2 * SIZE(AA), %xmm5 subps %xmm4, %xmm1 subps %xmm6, %xmm5 #endif #if defined(LN) || defined(LT) movaps 0 * SIZE(AA), %xmm5 pshufd $0x44, %xmm5, %xmm6 pshufd $0x11, %xmm5, %xmm7 pshufd $0xa0, %xmm2, %xmm4 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm4 #endif mulps %xmm6, %xmm4 mulps %xmm7, %xmm2 addps %xmm4, %xmm2 #endif #ifdef RN movaps 0 * SIZE(B), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm5 subps %xmm2, %xmm5 movaps 4 * SIZE(B), %xmm4 
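/* Editor's note: this comment is an annotation added for readability and is
   not part of the original OpenBLAS source.  Each solve step in this section
   uses the same SSE idiom for single-precision complex arithmetic: pshufd
   with $0x44/$0xee duplicates a packed panel entry a = (ar, ai), $0x11/$0xbb
   swaps it to (ai, ar), and $0xa0/$0xf5 splat the real and imaginary parts of
   the partial solution x; the xorps with POSINV applies the sign pattern
   required for the CONJ / non-CONJ variants, and the following mulps plus
   addps (or subps, when an off-diagonal contribution is being eliminated)
   assemble the real and imaginary cross products of the complex multiply. */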
pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 #endif #ifdef RT movaps 4 * SIZE(B), %xmm4 pshufd $0xee, %xmm4, %xmm6 pshufd $0xbb, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm0, %xmm5 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm5 addps %xmm3, %xmm5 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm5, %xmm3 pshufd $0xf5, %xmm5, %xmm2 #ifndef CONJ xorps %xmm0, %xmm2 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm2 subps %xmm3, %xmm1 subps %xmm2, %xmm1 movaps 0 * SIZE(B), %xmm4 pshufd $0x44, %xmm4, %xmm6 pshufd $0x11, %xmm4, %xmm7 pshufd $0xa0, %xmm1, %xmm3 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm0, %xmm1 #else xorps %xmm0, %xmm3 #endif mulps %xmm6, %xmm3 mulps %xmm7, %xmm1 addps %xmm3, %xmm1 #endif #ifdef LN subl $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movaps %xmm2, 0 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm5 movaps %xmm0, 0 * SIZE(BB) movaps %xmm1, 4 * SIZE(BB) movaps %xmm4, 8 * SIZE(BB) movaps %xmm5, 12 * SIZE(BB) movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO1, LDC) #else movlps %xmm1, 0 * SIZE(AA) movlps %xmm5, 2 * SIZE(AA) movlps %xmm1, 0 * SIZE(CO1) movlps %xmm5, 0 * SIZE(CO1, LDC) #endif #ifndef LN addl $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, AA #ifdef LT addl $4 * SIZE, B #endif #endif #ifdef LN subl $1, KK movl BORIG, B #endif #ifdef LT addl $1, KK #endif #ifdef RT movl K, %eax movl BORIG, B sall $ZBASE_SHIFT, %eax addl %eax, AORIG #endif ALIGN_4 .L99: #ifdef LN movl K, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, B #endif #if defined(LT) || defined(RN) movl K, %eax subl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, B #endif #ifdef RN addl $2, KK #endif #ifdef RT subl $2, KK #endif decl J # j -- jg .L01 ALIGN_4 .L999: EMMS movl OLD_STACK, %esp popl %ebx popl %esi popl %edi popl %ebp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/000077500000000000000000000000001313527062700153625ustar00rootroot00000000000000OpenBLAS-0.2.20/kernel/x86_64/KERNEL000066400000000000000000000150261313527062700162710ustar00rootroot00000000000000ifndef SAMAXKERNEL SAMAXKERNEL = amax_sse.S endif ifndef DAMAXKERNEL DAMAXKERNEL = amax_sse2.S endif ifndef QAMAXKERNEL QAMAXKERNEL = amax.S endif ifndef CAMAXKERNEL CAMAXKERNEL = zamax_sse.S endif ifndef ZAMAXKERNEL ZAMAXKERNEL = zamax_sse2.S endif ifndef XAMAXKERNEL XAMAXKERNEL = zamax.S endif ifndef SASUMKERNEL SASUMKERNEL = asum_sse.S endif ifndef DASUMKERNEL DASUMKERNEL = asum_sse2.S endif ifndef CASUMKERNEL CASUMKERNEL = zasum_sse.S endif ifndef ZASUMKERNEL ZASUMKERNEL = zasum_sse2.S endif ifndef QASUMKERNEL QASUMKERNEL = asum.S endif ifndef XASUMKERNEL XASUMKERNEL = zasum.S endif ifndef SAMINKERNEL SAMINKERNEL = amax_sse.S endif ifndef DAMINKERNEL DAMINKERNEL = amax_sse2.S endif ifndef QAMINKERNEL QAMINKERNEL = amax.S endif ifndef CAMINKERNEL CAMINKERNEL = zamax_sse.S endif ifndef ZAMINKERNEL ZAMINKERNEL = zamax_sse2.S endif ifndef XAMINKERNEL XAMINKERNEL = zamax.S endif ifndef SAXPYKERNEL SAXPYKERNEL = axpy_sse.S endif ifndef DAXPYKERNEL DAXPYKERNEL = axpy_sse2.S endif ifndef CAXPYKERNEL CAXPYKERNEL = zaxpy_sse.S endif ifndef ZAXPYKERNEL ZAXPYKERNEL = zaxpy_sse2.S endif ifndef QAXPYKERNEL 
QAXPYKERNEL = axpy.S endif ifndef XAXPYKERNEL XAXPYKERNEL = zaxpy.S endif ifndef SCOPYKERNEL SCOPYKERNEL = copy_sse.S endif ifndef DCOPYKERNEL DCOPYKERNEL = copy_sse2.S endif ifndef CCOPYKERNEL CCOPYKERNEL = zcopy_sse.S endif ifndef ZCOPYKERNEL ZCOPYKERNEL = zcopy_sse2.S endif ifndef QCOPYKERNEL QCOPYKERNEL = copy.S endif ifndef XCOPYKERNEL XCOPYKERNEL = zcopy.S endif ifndef SDOTKERNEL SDOTKERNEL = ../generic/dot.c endif ifndef DSDOTKERNEL DSDOTKERNEL = ../generic/dot.c endif ifndef DDOTKERNEL DDOTKERNEL = dot_sse2.S endif ifndef CDOTKERNEL CDOTKERNEL = zdot_sse.S endif ifndef ZDOTKERNEL ZDOTKERNEL = zdot_sse2.S endif ifndef QDOTKERNEL QDOTKERNEL = dot.S endif ifndef XDOTKERNEL XDOTKERNEL = zdot.S endif ifndef ISAMAXKERNEL ISAMAXKERNEL = iamax_sse.S endif ifndef IDAMAXKERNEL IDAMAXKERNEL = iamax_sse2.S endif ifndef IQAMAXKERNEL IQAMAXKERNEL = iamax.S endif ifndef ICAMAXKERNEL ICAMAXKERNEL = izamax_sse.S endif ifndef IZAMAXKERNEL IZAMAXKERNEL = izamax_sse2.S endif ifndef IXAMAXKERNEL IXAMAXKERNEL = izamax.S endif ifndef ISAMINKERNEL ISAMINKERNEL = iamax_sse.S endif ifndef IDAMINKERNEL IDAMINKERNEL = iamax_sse2.S endif ifndef IQAMINKERNEL IQAMINKERNEL = iamax.S endif ifndef ICAMINKERNEL ICAMINKERNEL = izamax_sse.S endif ifndef IZAMINKERNEL IZAMINKERNEL = izamax_sse2.S endif ifndef IXAMINKERNEL IXAMINKERNEL = izamax.S endif ifndef ISMAXKERNEL ISMAXKERNEL = iamax_sse.S endif ifndef IDMAXKERNEL IDMAXKERNEL = iamax_sse2.S endif ifndef IQMAXKERNEL IQMAXKERNEL = iamax.S endif ifndef ISMINKERNEL ISMINKERNEL = iamax_sse.S endif ifndef IDMINKERNEL IDMINKERNEL = iamax_sse2.S endif ifndef IQMINKERNEL IQMINKERNEL = iamax.S endif ifndef SMAXKERNEL SMAXKERNEL = amax_sse.S endif ifndef DMAXKERNEL DMAXKERNEL = amax_sse2.S endif ifndef QMAXKERNEL QMAXKERNEL = amax.S endif ifndef SMINKERNEL SMINKERNEL = amax_sse.S endif ifndef DMINKERNEL DMINKERNEL = amax_sse2.S endif ifndef QMINKERNEL QMINKERNEL = amax.S endif ifndef SNRM2KERNEL SNRM2KERNEL = nrm2_sse.S endif ifndef DNRM2KERNEL DNRM2KERNEL = nrm2.S endif ifndef QNRM2KERNEL QNRM2KERNEL = nrm2.S endif ifndef CNRM2KERNEL CNRM2KERNEL = znrm2_sse.S endif ifndef ZNRM2KERNEL ZNRM2KERNEL = znrm2.S endif ifndef XNRM2KERNEL XNRM2KERNEL = znrm2.S endif ifndef SROTKERNEL SROTKERNEL = rot_sse.S endif ifndef DROTKERNEL DROTKERNEL = rot_sse2.S endif ifndef QROTKERNEL QROTKERNEL = rot.S endif ifndef CROTKERNEL CROTKERNEL = zrot_sse.S endif ifndef ZROTKERNEL ZROTKERNEL = zrot_sse2.S endif ifndef XROTKERNEL XROTKERNEL = zrot.S endif ifndef SSCALKERNEL SSCALKERNEL = scal_sse.S endif ifndef DSCALKERNEL DSCALKERNEL = scal_sse2.S endif ifndef CSCALKERNEL CSCALKERNEL = zscal_sse.S endif ifndef ZSCALKERNEL ZSCALKERNEL = zscal_sse2.S endif ifndef ASCALKERNEL QSCALKERNEL = scal.S endif ifndef XSCALKERNEL XSCALKERNEL = zscal.S endif ifndef SSWAPKERNEL SSWAPKERNEL = swap_sse.S endif ifndef DSWAPKERNEL DSWAPKERNEL = swap_sse2.S endif ifndef CSWAPKERNEL CSWAPKERNEL = zswap_sse.S endif ifndef ZSWAPKERNEL ZSWAPKERNEL = zswap_sse2.S endif ifndef QSWAPKERNEL QSWAPKERNEL = swap.S endif ifndef XSWAPKERNEL XSWAPKERNEL = zswap.S endif ifndef SSYMV_U_KERNEL SSYMV_U_KERNEL = symv_U_sse.S endif ifndef SSYMV_L_KERNEL SSYMV_L_KERNEL = symv_L_sse.S endif ifndef DSYMV_U_KERNEL DSYMV_U_KERNEL = symv_U_sse2.S endif ifndef DSYMV_L_KERNEL DSYMV_L_KERNEL = symv_L_sse2.S endif ifndef ZSYMV_U_KERNEL ZSYMV_U_KERNEL = zsymv_U_sse2.S endif ifndef ZSYMV_L_KERNEL ZSYMV_L_KERNEL = zsymv_L_sse2.S endif ifndef ZHEMV_U_KERNEL ZHEMV_U_KERNEL = zsymv_U_sse2.S endif ifndef ZHEMV_L_KERNEL ZHEMV_L_KERNEL = zsymv_L_sse2.S 
endif GEMVDEP = ../l2param.h ifndef SGEMVNKERNEL SGEMVNKERNEL = sgemv_n.c endif ifndef SGEMVTKERNEL SGEMVTKERNEL = sgemv_t.c endif ifndef DGEMVNKERNEL DGEMVNKERNEL = dgemv_n.S endif ifndef DGEMVTKERNEL DGEMVTKERNEL = dgemv_t.S endif ifndef CGEMVNKERNEL CGEMVNKERNEL = cgemv_n_4.c endif ifndef CGEMVTKERNEL CGEMVTKERNEL = cgemv_t_4.c endif ifndef ZGEMVNKERNEL ZGEMVNKERNEL = zgemv_n_4.c endif ifndef ZGEMVTKERNEL ZGEMVTKERNEL = zgemv_t_4.c endif ifndef QGEMVNKERNEL QGEMVNKERNEL = qgemv_n.S endif ifndef QGEMVTKERNEL QGEMVTKERNEL = qgemv_t.S endif ifndef XGEMVNKERNEL XGEMVNKERNEL = xgemv_n.S endif ifndef XGEMVTKERNEL XGEMVTKERNEL = xgemv_t.S endif QGEMMKERNEL = qgemm_kernel_2x2.S QGEMMINCOPY = QGEMMITCOPY = QGEMMONCOPY = ../generic/gemm_ncopy_2.c QGEMMOTCOPY = ../generic/gemm_tcopy_2.c QGEMMINCOPYOBJ = QGEMMITCOPYOBJ = QGEMMONCOPYOBJ = qgemm_oncopy$(TSUFFIX).$(SUFFIX) QGEMMOTCOPYOBJ = qgemm_otcopy$(TSUFFIX).$(SUFFIX) XGEMMKERNEL = xgemm_kernel_1x1.S XGEMMINCOPY = XGEMMITCOPY = XGEMMONCOPY = ../generic/zgemm_ncopy_1.c XGEMMOTCOPY = ../generic/zgemm_tcopy_1.c XGEMMINCOPYOBJ = XGEMMITCOPYOBJ = XGEMMONCOPYOBJ = xgemm_oncopy$(TSUFFIX).$(SUFFIX) XGEMMOTCOPYOBJ = xgemm_otcopy$(TSUFFIX).$(SUFFIX) ifndef SGEMM_BETA SGEMM_BETA = gemm_beta.S endif ifndef DGEMM_BETA DGEMM_BETA = gemm_beta.S endif ifndef CGEMM_BETA CGEMM_BETA = zgemm_beta.S endif ifndef ZGEMM_BETA ZGEMM_BETA = zgemm_beta.S endif QGEMM_BETA = ../generic/gemm_beta.c XGEMM_BETA = ../generic/zgemm_beta.c QTRSMKERNEL_LN = qtrsm_kernel_LN_2x2.S QTRSMKERNEL_LT = qtrsm_kernel_LT_2x2.S QTRSMKERNEL_RN = qtrsm_kernel_LT_2x2.S QTRSMKERNEL_RT = qtrsm_kernel_RT_2x2.S XTRSMKERNEL_LN = xtrsm_kernel_LT_1x1.S XTRSMKERNEL_LT = xtrsm_kernel_LT_1x1.S XTRSMKERNEL_RN = xtrsm_kernel_LT_1x1.S XTRSMKERNEL_RT = xtrsm_kernel_LT_1x1.S XGEMM3MKERNEL = xgemm3m_kernel_2x2.S OpenBLAS-0.2.20/kernel/x86_64/KERNEL.ATOM000066400000000000000000000055341313527062700170330ustar00rootroot00000000000000DAMAXKERNEL = amax_atom.S ZAMAXKERNEL = zamax_atom.S DAMINKERNEL = amax_atom.S ZAMINKERNEL = zamax_atom.S DASUMKERNEL = asum_atom.S ZASUMKERNEL = zasum_atom.S DAXPYKERNEL = axpy_atom.S ZAXPYKERNEL = zaxpy_atom.S DDOTKERNEL = dot_atom.S ZDOTKERNEL = zdot_atom.S DMAXKERNEL = amax_atom.S DMINKERNEL = amax_atom.S DSCALKERNEL = scal_atom.S ZSCALKERNEL = zscal_atom.S DGEMVNKERNEL = dgemv_n_atom.S DGEMVTKERNEL = dgemv_t_atom.S ZGEMVNKERNEL = zgemv_n_atom.S ZGEMVTKERNEL = zgemv_t_atom.S SGEMMKERNEL = gemm_kernel_8x4_penryn.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = gemm_ncopy_4.S SGEMMOTCOPY = gemm_tcopy_4.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x2_atom.S DGEMMINCOPY = gemm_ncopy_4.S DGEMMITCOPY = gemm_tcopy_4.S DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_4x2_penryn.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = zgemm_ncopy_2.S CGEMMOTCOPY = zgemm_tcopy_2.S CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = 
cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_2x1_atom.S ZGEMMINCOPY = zgemm_ncopy_2.S ZGEMMITCOPY = zgemm_tcopy_2.S ZGEMMONCOPY = ../generic/zgemm_ncopy_1.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_1.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S DTRSMKERNEL_LN = trsm_kernel_LN_4x2_atom.S DTRSMKERNEL_LT = trsm_kernel_LT_4x2_atom.S DTRSMKERNEL_RN = trsm_kernel_LT_4x2_atom.S DTRSMKERNEL_RT = trsm_kernel_RT_4x2_atom.S CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x1_atom.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x1_atom.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x1_atom.S ZTRSMKERNEL_RT = ztrsm_kernel_LT_2x1_atom.S CGEMM3MKERNEL = zgemm3m_kernel_8x4_core2.S ZGEMM3MKERNEL = zgemm3m_kernel_4x2_atom.S OpenBLAS-0.2.20/kernel/x86_64/KERNEL.BARCELONA000066400000000000000000000043031313527062700175520ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_8x4_barcelona.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = gemm_ncopy_4_opteron.S SGEMMOTCOPY = gemm_tcopy_4_opteron.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x4_barcelona.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = gemm_ncopy_4_opteron.S DGEMMOTCOPY = gemm_tcopy_4_opteron.S DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = zgemm_ncopy_2.S CGEMMOTCOPY = zgemm_tcopy_2.S CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = zgemm_ncopy_2.S ZGEMMOTCOPY = zgemm_tcopy_2.S ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S 
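The per-CPU files in this directory (KERNEL.ATOM, KERNEL.BARCELONA, KERNEL.NEHALEM, and so on) assign only the kernels that have a tuned implementation for that core, while the generic kernel/x86_64/KERNEL file above wraps most of its assignments in `ifndef` guards so that they act purely as fallbacks for anything a target file leaves unset. The short, self-contained GNU make sketch below illustrates that layering; it is an editor's example rather than part of the OpenBLAS tree: the variable names and kernel file names are copied from the KERNEL and KERNEL.HASWELL fragments shown here, and the `show-kernels` target is hypothetical.

# --- what a KERNEL.<TARGET> file contributes: a tuned choice for this CPU
SGEMVNKERNEL = sgemv_n_4.c

# --- what the generic KERNEL file contributes: defaults for unset variables
ifndef SGEMVNKERNEL
SGEMVNKERNEL = sgemv_n.c        # skipped here: the target file already set it
endif
ifndef SCOPYKERNEL
SCOPYKERNEL = copy_sse.S        # used here: no target-specific override exists
endif

# hypothetical demo target (the recipe line must begin with a tab)
show-kernels:
	@echo "SGEMVN=$(SGEMVNKERNEL) SCOPY=$(SCOPYKERNEL)"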
OpenBLAS-0.2.20/kernel/x86_64/KERNEL.BOBCAT000066400000000000000000000043721313527062700172240ustar00rootroot00000000000000ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t.S SGEMMKERNEL = gemm_kernel_8x4_barcelona.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = gemm_ncopy_4_opteron.S SGEMMOTCOPY = gemm_tcopy_4_opteron.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x4_barcelona.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = gemm_ncopy_4_opteron.S DGEMMOTCOPY = gemm_tcopy_4_opteron.S DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = zgemm_ncopy_2.S CGEMMOTCOPY = zgemm_tcopy_2.S CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = zgemm_ncopy_2.S ZGEMMOTCOPY = zgemm_tcopy_2.S ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S OpenBLAS-0.2.20/kernel/x86_64/KERNEL.BULLDOZER000066400000000000000000000057041313527062700176340ustar00rootroot00000000000000DSCALKERNEL = dscal.c CSCALKERNEL = cscal.c ZSCALKERNEL = zscal.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c SDOTKERNEL = sdot.c CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c DSYMV_U_KERNEL = dsymv_U.c DSYMV_L_KERNEL = dsymv_L.c SSYMV_U_KERNEL = ssymv_U.c SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_4.c ZGEMVTKERNEL = zgemv_t_4.c DGEMVNKERNEL = dgemv_n_bulldozer.S DGEMVTKERNEL = dgemv_t_bulldozer.S DDOTKERNEL = ddot_bulldozer.S DCOPYKERNEL = dcopy_bulldozer.S SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = gemm_ncopy_2_bulldozer.S SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S DGEMMONCOPY = gemm_ncopy_2_bulldozer.S DGEMMOTCOPY = 
gemm_tcopy_2_bulldozer.S DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S STRSMKERNEL_LN = strsm_kernel_LN_bulldozer.c STRSMKERNEL_LT = strsm_kernel_LT_bulldozer.c STRSMKERNEL_RN = strsm_kernel_RN_bulldozer.c STRSMKERNEL_RT = strsm_kernel_RT_bulldozer.c DTRSMKERNEL_LN = dtrsm_kernel_LN_bulldozer.c DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S DTRSMKERNEL_RT = dtrsm_kernel_RT_bulldozer.c CTRSMKERNEL_LN = ctrsm_kernel_LN_bulldozer.c CTRSMKERNEL_LT = ctrsm_kernel_LT_bulldozer.c CTRSMKERNEL_RN = ctrsm_kernel_RN_bulldozer.c CTRSMKERNEL_RT = ctrsm_kernel_RT_bulldozer.c ZTRSMKERNEL_LN = ztrsm_kernel_LN_bulldozer.c ZTRSMKERNEL_LT = ztrsm_kernel_LT_bulldozer.c ZTRSMKERNEL_RN = ztrsm_kernel_RN_bulldozer.c ZTRSMKERNEL_RT = ztrsm_kernel_RT_bulldozer.c OpenBLAS-0.2.20/kernel/x86_64/KERNEL.CORE2000066400000000000000000000042001313527062700170720ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_8x4_core2.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = gemm_ncopy_4.S SGEMMOTCOPY = gemm_tcopy_4.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x4_core2.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = gemm_ncopy_4.S DGEMMOTCOPY = gemm_tcopy_4.S DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_4x2_core2.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = zgemm_ncopy_2.S CGEMMOTCOPY = zgemm_tcopy_2.S CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_2x2_core2.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = zgemm_ncopy_2.S ZGEMMOTCOPY = zgemm_tcopy_2.S ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S DTRSMKERNEL_LN = trsm_kernel_LN_4x4_core2.S DTRSMKERNEL_LT = trsm_kernel_LT_4x4_core2.S DTRSMKERNEL_RN = trsm_kernel_LT_4x4_core2.S DTRSMKERNEL_RT = trsm_kernel_RT_4x4_core2.S CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S CTRSMKERNEL_LT = 
ztrsm_kernel_LT_4x2_sse.S CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_core2.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_core2.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_core2.S ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_core2.S CGEMM3MKERNEL = zgemm3m_kernel_8x4_core2.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_core2.S OpenBLAS-0.2.20/kernel/x86_64/KERNEL.DUNNINGTON000066400000000000000000000042151313527062700177510ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_8x4_penryn.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = gemm_ncopy_4.S SGEMMOTCOPY = gemm_tcopy_4.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x4_penryn.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = gemm_ncopy_4.S DGEMMOTCOPY = gemm_tcopy_4.S DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_4x2_penryn.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = zgemm_ncopy_2.S CGEMMOTCOPY = zgemm_tcopy_2.S CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_2x2_penryn.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = zgemm_ncopy_2.S ZGEMMOTCOPY = zgemm_tcopy_2.S ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S DTRSMKERNEL_LN = trsm_kernel_LN_4x4_penryn.S DTRSMKERNEL_LT = trsm_kernel_LT_4x4_penryn.S DTRSMKERNEL_RN = trsm_kernel_LT_4x4_penryn.S DTRSMKERNEL_RT = trsm_kernel_RT_4x4_penryn.S CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_penryn.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_penryn.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_penryn.S ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_penryn.S CGEMM3MKERNEL = zgemm3m_kernel_8x4_penryn.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_penryn.S OpenBLAS-0.2.20/kernel/x86_64/KERNEL.EXCAVATOR000066400000000000000000000057071313527062700176310ustar00rootroot00000000000000DSCALKERNEL = dscal.c CSCALKERNEL = cscal.c ZSCALKERNEL = zscal.c SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c DSYMV_U_KERNEL = dsymv_U.c DSYMV_L_KERNEL = dsymv_L.c SSYMV_U_KERNEL = ssymv_U.c SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c DGEMVNKERNEL = dgemv_n_4.c DGEMVTKERNEL = dgemv_t_4.c ZGEMVNKERNEL = zgemv_n_4.c ZGEMVTKERNEL = zgemv_t_4.c DCOPYKERNEL = dcopy_bulldozer.S SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = gemm_ncopy_2_bulldozer.S SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) 
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_8x2_piledriver.S DGEMMINCOPY = ../generic/gemm_ncopy_8.c DGEMMITCOPY = ../generic/gemm_tcopy_8.c DGEMMONCOPY = gemm_ncopy_2_bulldozer.S DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_4x2_piledriver.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_2x2_piledriver.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S STRSMKERNEL_LN = strsm_kernel_LN_bulldozer.c STRSMKERNEL_LT = strsm_kernel_LT_bulldozer.c STRSMKERNEL_RN = strsm_kernel_RN_bulldozer.c STRSMKERNEL_RT = strsm_kernel_RT_bulldozer.c DTRSMKERNEL_LN = dtrsm_kernel_LN_bulldozer.c DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S DTRSMKERNEL_RT = dtrsm_kernel_RT_bulldozer.c CTRSMKERNEL_LN = ctrsm_kernel_LN_bulldozer.c CTRSMKERNEL_LT = ctrsm_kernel_LT_bulldozer.c CTRSMKERNEL_RN = ctrsm_kernel_RN_bulldozer.c CTRSMKERNEL_RT = ctrsm_kernel_RT_bulldozer.c ZTRSMKERNEL_LN = ztrsm_kernel_LN_bulldozer.c ZTRSMKERNEL_LT = ztrsm_kernel_LT_bulldozer.c ZTRSMKERNEL_RN = ztrsm_kernel_RN_bulldozer.c ZTRSMKERNEL_RT = ztrsm_kernel_RT_bulldozer.c OpenBLAS-0.2.20/kernel/x86_64/KERNEL.HASWELL000066400000000000000000000063271313527062700173730ustar00rootroot00000000000000DSCALKERNEL = dscal.c CSCALKERNEL = cscal.c ZSCALKERNEL = zscal.c SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c DGEMVNKERNEL = dgemv_n_4.c DGEMVTKERNEL = dgemv_t_4.c ZGEMVNKERNEL = zgemv_n_4.c ZGEMVTKERNEL = zgemv_t_4.c CGEMVNKERNEL = cgemv_n_4.c CGEMVTKERNEL = cgemv_t_4.c SSYMV_L_KERNEL = ssymv_L.c SSYMV_U_KERNEL = ssymv_U.c DSYMV_L_KERNEL = dsymv_L.c DSYMV_U_KERNEL = dsymv_U.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c STRMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DTRMMKERNEL = dtrmm_kernel_4x8_haswell.c DGEMMKERNEL = dgemm_kernel_4x8_haswell.S DGEMMINCOPY = ../generic/gemm_ncopy_4.c DGEMMITCOPY = ../generic/gemm_tcopy_4.c DGEMMONCOPY = ../generic/gemm_ncopy_8.c DGEMMOTCOPY = ../generic/gemm_tcopy_8.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ 
= dgemm_otcopy$(TSUFFIX).$(SUFFIX) CTRMMKERNEL = cgemm_kernel_8x2_haswell.S CGEMMKERNEL = cgemm_kernel_8x2_haswell.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZTRMMKERNEL = zgemm_kernel_4x2_haswell.S ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = dtrsm_kernel_RN_haswell.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S OpenBLAS-0.2.20/kernel/x86_64/KERNEL.NANO000066400000000000000000000042521313527062700170220ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_4x8_nano.S SGEMMINCOPY = gemm_ncopy_4.S SGEMMITCOPY = gemm_tcopy_4.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = ../generic/gemm_tcopy_8.c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x4_penryn.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = gemm_ncopy_4.S DGEMMOTCOPY = gemm_tcopy_4.S DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S CGEMMINCOPY = zgemm_ncopy_2.S CGEMMITCOPY = zgemm_tcopy_2.S CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_2x2_penryn.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = zgemm_ncopy_2.S ZGEMMOTCOPY = zgemm_tcopy_2.S ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S DTRSMKERNEL_LN = trsm_kernel_LN_4x4_penryn.S DTRSMKERNEL_LT = trsm_kernel_LT_4x4_penryn.S DTRSMKERNEL_RN = trsm_kernel_LT_4x4_penryn.S DTRSMKERNEL_RT = trsm_kernel_RT_4x4_penryn.S CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S 
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_penryn.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_penryn.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_penryn.S ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_penryn.S CGEMM3MKERNEL = zgemm3m_kernel_8x4_core2.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_core2.S OpenBLAS-0.2.20/kernel/x86_64/KERNEL.NEHALEM000066400000000000000000000053501313527062700173400ustar00rootroot00000000000000SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c DSYMV_U_KERNEL = dsymv_U.c DSYMV_L_KERNEL = dsymv_L.c SSYMV_U_KERNEL = ssymv_U.c SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c DGEMVNKERNEL = dgemv_n_4.c SGEMMKERNEL = gemm_kernel_4x8_nehalem.S SGEMMINCOPY = gemm_ncopy_4.S SGEMMITCOPY = gemm_tcopy_4.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = ../generic/gemm_tcopy_8.c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x8_nehalem.S DGEMMINCOPY = ../generic/gemm_ncopy_2.c DGEMMITCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPY = ../generic/gemm_ncopy_8.c DGEMMOTCOPY = ../generic/gemm_tcopy_8.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S CGEMMINCOPY = zgemm_ncopy_2.S CGEMMITCOPY = zgemm_tcopy_2.S CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S ZGEMMINCOPY = zgemm_ncopy_1.S ZGEMMITCOPY = zgemm_tcopy_1.S ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S OpenBLAS-0.2.20/kernel/x86_64/KERNEL.OPTERON000066400000000000000000000042161313527062700174150ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_8x4_sse.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = gemm_ncopy_4_opteron.S SGEMMOTCOPY = gemm_tcopy_4_opteron.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) 
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x4_sse2.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = gemm_ncopy_4_opteron.S DGEMMOTCOPY = gemm_tcopy_4_opteron.S DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_4x2_sse.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = zgemm_ncopy_2.S CGEMMOTCOPY = zgemm_tcopy_2.S CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_2x2_sse2.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = zgemm_ncopy_2.S ZGEMMOTCOPY = zgemm_tcopy_2.S ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S DTRSMKERNEL_LN = trsm_kernel_LN_4x4_sse2.S DTRSMKERNEL_LT = trsm_kernel_LT_4x4_sse2.S DTRSMKERNEL_RN = trsm_kernel_LT_4x4_sse2.S DTRSMKERNEL_RT = trsm_kernel_RT_4x4_sse2.S CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse2.S OpenBLAS-0.2.20/kernel/x86_64/KERNEL.OPTERON_SSE3000066400000000000000000000043051313527062700202110ustar00rootroot00000000000000ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t.S SGEMMKERNEL = gemm_kernel_8x4_sse.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = gemm_ncopy_4_opteron.S SGEMMOTCOPY = gemm_tcopy_4_opteron.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x4_sse2.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = gemm_ncopy_4_opteron.S DGEMMOTCOPY = gemm_tcopy_4_opteron.S DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_4x2_sse.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = zgemm_ncopy_2.S CGEMMOTCOPY = zgemm_tcopy_2.S CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_2x2_sse2.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = zgemm_ncopy_2.S ZGEMMOTCOPY = zgemm_tcopy_2.S ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S DTRSMKERNEL_LN = trsm_kernel_LN_4x4_sse2.S DTRSMKERNEL_LT = 
trsm_kernel_LT_4x4_sse2.S DTRSMKERNEL_RN = trsm_kernel_LT_4x4_sse2.S DTRSMKERNEL_RT = trsm_kernel_RT_4x4_sse2.S CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse2.S OpenBLAS-0.2.20/kernel/x86_64/KERNEL.PENRYN000066400000000000000000000042151313527062700173010ustar00rootroot00000000000000SGEMMKERNEL = gemm_kernel_8x4_penryn.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = gemm_ncopy_4.S SGEMMOTCOPY = gemm_tcopy_4.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x4_penryn.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = gemm_ncopy_4.S DGEMMOTCOPY = gemm_tcopy_4.S DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_4x2_penryn.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = zgemm_ncopy_2.S CGEMMOTCOPY = zgemm_tcopy_2.S CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_2x2_penryn.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = zgemm_ncopy_2.S ZGEMMOTCOPY = zgemm_tcopy_2.S ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S DTRSMKERNEL_LN = trsm_kernel_LN_4x4_penryn.S DTRSMKERNEL_LT = trsm_kernel_LT_4x4_penryn.S DTRSMKERNEL_RN = trsm_kernel_LT_4x4_penryn.S DTRSMKERNEL_RT = trsm_kernel_RT_4x4_penryn.S CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_penryn.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_penryn.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_penryn.S ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_penryn.S CGEMM3MKERNEL = zgemm3m_kernel_8x4_penryn.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_penryn.S OpenBLAS-0.2.20/kernel/x86_64/KERNEL.PILEDRIVER000066400000000000000000000055411313527062700177360ustar00rootroot00000000000000DSCALKERNEL = dscal.c CSCALKERNEL = cscal.c ZSCALKERNEL = zscal.c SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_4.c ZGEMVTKERNEL = zgemv_t_4.c DGEMVNKERNEL = dgemv_n_bulldozer.S DGEMVTKERNEL = dgemv_t_bulldozer.S SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c DCOPYKERNEL = dcopy_bulldozer.S SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = gemm_ncopy_2_bulldozer.S SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S SGEMMINCOPYOBJ = 
sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_8x2_piledriver.S DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S DGEMMONCOPY = gemm_ncopy_2_bulldozer.S DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_4x2_piledriver.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_2x2_piledriver.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S STRSMKERNEL_LN = strsm_kernel_LN_bulldozer.c STRSMKERNEL_LT = strsm_kernel_LT_bulldozer.c STRSMKERNEL_RN = strsm_kernel_RN_bulldozer.c STRSMKERNEL_RT = strsm_kernel_RT_bulldozer.c DTRSMKERNEL_LN = dtrsm_kernel_LN_bulldozer.c DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S DTRSMKERNEL_RT = dtrsm_kernel_RT_bulldozer.c CTRSMKERNEL_LN = ctrsm_kernel_LN_bulldozer.c CTRSMKERNEL_LT = ctrsm_kernel_LT_bulldozer.c CTRSMKERNEL_RN = ctrsm_kernel_RN_bulldozer.c CTRSMKERNEL_RT = ctrsm_kernel_RT_bulldozer.c ZTRSMKERNEL_LN = ztrsm_kernel_LN_bulldozer.c ZTRSMKERNEL_LT = ztrsm_kernel_LT_bulldozer.c ZTRSMKERNEL_RN = ztrsm_kernel_RN_bulldozer.c ZTRSMKERNEL_RT = ztrsm_kernel_RT_bulldozer.c OpenBLAS-0.2.20/kernel/x86_64/KERNEL.PRESCOTT000066400000000000000000000042501313527062700175300ustar00rootroot00000000000000ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVTKERNEL = zgemv_t.S SGEMMKERNEL = gemm_kernel_8x4_sse3.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = gemm_ncopy_4.S SGEMMOTCOPY = gemm_tcopy_4.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x4_sse3.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = gemm_ncopy_4.S DGEMMOTCOPY = gemm_tcopy_4.S DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_4x2_sse.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = zgemm_ncopy_2.S CGEMMOTCOPY = zgemm_tcopy_2.S CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_2x2_sse3.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = zgemm_ncopy_2.S ZGEMMOTCOPY = zgemm_tcopy_2.S ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) 
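# Note on the variable scheme used by these KERNEL.<target> fragments (an
# informal reading of the assignments above, not an authoritative spec):
# the S/D/C/Z prefix picks the precision, *GEMMKERNEL names the assembly or C
# micro-kernel, and the *COPY variables name the packing routines -- the
# IN/IT pair packs panels at the kernel's M unroll and the ON/OT pair at its
# N unroll, which is why an 8x4 SGEMM kernel pairs gemm_ncopy_8 with
# gemm_ncopy_4 while the square 4x4 DGEMM kernels leave INCOPY/ITCOPY empty
# and reuse the ONCOPY/OTCOPY routines. A hypothetical override for a new
# target would therefore only need lines such as:
#   DGEMMKERNEL  = gemm_kernel_4x4_mytarget.S   # illustrative file name only
#   DGEMMONCOPY  = ../generic/gemm_ncopy_4.c
#   DGEMMOTCOPY  = ../generic/gemm_tcopy_4.c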
STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S DTRSMKERNEL_LN = trsm_kernel_LN_4x4_sse3.S DTRSMKERNEL_LT = trsm_kernel_LT_4x4_sse3.S DTRSMKERNEL_RN = trsm_kernel_LT_4x4_sse3.S DTRSMKERNEL_RT = trsm_kernel_RT_4x4_sse3.S CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse3.S ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse3.S ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse3.S ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse3.S CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S OpenBLAS-0.2.20/kernel/x86_64/KERNEL.SANDYBRIDGE000066400000000000000000000072401313527062700200220ustar00rootroot00000000000000DSCALKERNEL = dscal.c CSCALKERNEL = cscal.c SGERKERNEL = sger.c DGERKERNEL = dger.c SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c ZGEMVNKERNEL = zgemv_n_4.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c SSYMV_L_KERNEL = ssymv_L.c SSYMV_U_KERNEL = ssymv_U.c DSYMV_L_KERNEL = dsymv_L.c DSYMV_U_KERNEL = dsymv_U.c SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c SGEMMKERNEL = sgemm_kernel_16x4_sandy.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_4x8_sandy.S DGEMMINCOPY = ../generic/gemm_ncopy_8.c DGEMMITCOPY = ../generic/gemm_tcopy_8.c DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_8x2_sandy.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S ZGEMMINCOPY = zgemm_ncopy_1.S ZGEMMITCOPY = zgemm_tcopy_1.S ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) #STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S #STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S #STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S #STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S #DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S #DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S #DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S #DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S #CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S #CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S #CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S #CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S #ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S 
#ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S #ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S #ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S OpenBLAS-0.2.20/kernel/x86_64/KERNEL.STEAMROLLER000066400000000000000000000057071313527062700200660ustar00rootroot00000000000000DSCALKERNEL = dscal.c CSCALKERNEL = cscal.c ZSCALKERNEL = zscal.c SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c DSYMV_U_KERNEL = dsymv_U.c DSYMV_L_KERNEL = dsymv_L.c SSYMV_U_KERNEL = ssymv_U.c SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c DGEMVNKERNEL = dgemv_n_4.c DGEMVTKERNEL = dgemv_t_4.c ZGEMVNKERNEL = zgemv_n_4.c ZGEMVTKERNEL = zgemv_t_4.c DCOPYKERNEL = dcopy_bulldozer.S SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = gemm_ncopy_2_bulldozer.S SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_8x2_piledriver.S DGEMMINCOPY = ../generic/gemm_ncopy_8.c DGEMMITCOPY = ../generic/gemm_tcopy_8.c DGEMMONCOPY = gemm_ncopy_2_bulldozer.S DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_4x2_piledriver.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_2x2_piledriver.S ZGEMMINCOPY = ZGEMMITCOPY = ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPYOBJ = ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S STRSMKERNEL_LN = strsm_kernel_LN_bulldozer.c STRSMKERNEL_LT = strsm_kernel_LT_bulldozer.c STRSMKERNEL_RN = strsm_kernel_RN_bulldozer.c STRSMKERNEL_RT = strsm_kernel_RT_bulldozer.c DTRSMKERNEL_LN = dtrsm_kernel_LN_bulldozer.c DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S DTRSMKERNEL_RT = 
dtrsm_kernel_RT_bulldozer.c CTRSMKERNEL_LN = ctrsm_kernel_LN_bulldozer.c CTRSMKERNEL_LT = ctrsm_kernel_LT_bulldozer.c CTRSMKERNEL_RN = ctrsm_kernel_RN_bulldozer.c CTRSMKERNEL_RT = ctrsm_kernel_RT_bulldozer.c ZTRSMKERNEL_LN = ztrsm_kernel_LN_bulldozer.c ZTRSMKERNEL_LT = ztrsm_kernel_LT_bulldozer.c ZTRSMKERNEL_RN = ztrsm_kernel_RN_bulldozer.c ZTRSMKERNEL_RT = ztrsm_kernel_RT_bulldozer.c OpenBLAS-0.2.20/kernel/x86_64/KERNEL.ZEN000066400000000000000000000063271313527062700167300ustar00rootroot00000000000000DSCALKERNEL = dscal.c CSCALKERNEL = cscal.c ZSCALKERNEL = zscal.c SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c DGEMVNKERNEL = dgemv_n_4.c DGEMVTKERNEL = dgemv_t_4.c ZGEMVNKERNEL = zgemv_n_4.c ZGEMVTKERNEL = zgemv_t_4.c CGEMVNKERNEL = cgemv_n_4.c CGEMVTKERNEL = cgemv_t_4.c SSYMV_L_KERNEL = ssymv_L.c SSYMV_U_KERNEL = ssymv_U.c DSYMV_L_KERNEL = dsymv_L.c DSYMV_U_KERNEL = dsymv_U.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c STRMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DTRMMKERNEL = dtrmm_kernel_4x8_haswell.c DGEMMKERNEL = dgemm_kernel_4x8_haswell.S DGEMMINCOPY = ../generic/gemm_ncopy_4.c DGEMMITCOPY = ../generic/gemm_tcopy_4.c DGEMMONCOPY = ../generic/gemm_ncopy_8.c DGEMMOTCOPY = ../generic/gemm_tcopy_8.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CTRMMKERNEL = cgemm_kernel_8x2_haswell.S CGEMMKERNEL = cgemm_kernel_8x2_haswell.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZTRMMKERNEL = zgemm_kernel_4x2_haswell.S ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = dtrsm_kernel_RN_haswell.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = 
../generic/trsm_kernel_RT.c CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S OpenBLAS-0.2.20/kernel/x86_64/KERNEL.generic000066400000000000000000000111551313527062700177030ustar00rootroot00000000000000SGEMM_BETA = ../generic/gemm_beta.c DGEMM_BETA = ../generic/gemm_beta.c CGEMM_BETA = ../generic/zgemm_beta.c ZGEMM_BETA = ../generic/zgemm_beta.c STRMMKERNEL = ../generic/trmmkernel_2x2.c DTRMMKERNEL = ../generic/trmmkernel_2x2.c CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c SGEMMKERNEL = ../generic/gemmkernel_2x2.c SGEMMONCOPY = ../generic/gemm_ncopy_2.c SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = ../generic/gemmkernel_2x2.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = ../generic/zgemmkernel_2x2.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #Todo: CGEMM3MKERNEL should be 4x4 blocksizes. 
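# (Background note, stated informally rather than as a spec of these kernels:
#  the *GEMM3M entries select kernels for the 3M method, which assembles one
#  complex product from three real multiplications,
#    (Ar + i*Ai)*(Br + i*Bi):  T1 = Ar*Br, T2 = Ai*Bi, T3 = (Ar+Ai)*(Br+Bi),
#    real part = T1 - T2,  imaginary part = T3 - T1 - T2,
#  which is why real-precision blockings such as the 8x4 SSE3 kernel below
#  can be pressed into service for complex data.)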
CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S #Pure C for other kernels SAMAXKERNEL = ../arm/amax.c DAMAXKERNEL = ../arm/amax.c CAMAXKERNEL = ../arm/zamax.c ZAMAXKERNEL = ../arm/zamax.c SAMINKERNEL = ../arm/amin.c DAMINKERNEL = ../arm/amin.c CAMINKERNEL = ../arm/zamin.c ZAMINKERNEL = ../arm/zamin.c SMAXKERNEL = ../arm/max.c DMAXKERNEL = ../arm/max.c SMINKERNEL = ../arm/min.c DMINKERNEL = ../arm/min.c ISAMAXKERNEL = ../arm/iamax.c IDAMAXKERNEL = ../arm/iamax.c ICAMAXKERNEL = ../arm/izamax.c IZAMAXKERNEL = ../arm/izamax.c ISAMINKERNEL = ../arm/iamin.c IDAMINKERNEL = ../arm/iamin.c ICAMINKERNEL = ../arm/izamin.c IZAMINKERNEL = ../arm/izamin.c ISMAXKERNEL = ../arm/imax.c IDMAXKERNEL = ../arm/imax.c ISMINKERNEL = ../arm/imin.c IDMINKERNEL = ../arm/imin.c SASUMKERNEL = ../arm/asum.c DASUMKERNEL = ../arm/asum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = ../arm/zasum.c SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c ZAXPYKERNEL = ../arm/zaxpy.c SCOPYKERNEL = ../arm/copy.c DCOPYKERNEL = ../arm/copy.c CCOPYKERNEL = ../arm/zcopy.c ZCOPYKERNEL = ../arm/zcopy.c SDOTKERNEL = ../arm/dot.c DDOTKERNEL = ../arm/dot.c CDOTKERNEL = ../arm/zdot.c ZDOTKERNEL = ../arm/zdot.c SNRM2KERNEL = ../arm/nrm2.c DNRM2KERNEL = ../arm/nrm2.c CNRM2KERNEL = ../arm/znrm2.c ZNRM2KERNEL = ../arm/znrm2.c SROTKERNEL = ../arm/rot.c DROTKERNEL = ../arm/rot.c CROTKERNEL = ../arm/zrot.c ZROTKERNEL = ../arm/zrot.c SSCALKERNEL = ../arm/scal.c DSCALKERNEL = ../arm/scal.c CSCALKERNEL = ../arm/zscal.c ZSCALKERNEL = ../arm/zscal.c SSWAPKERNEL = ../arm/swap.c DSWAPKERNEL = ../arm/swap.c CSWAPKERNEL = ../arm/zswap.c ZSWAPKERNEL = ../arm/zswap.c SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = ../arm/gemv_n.c CGEMVNKERNEL = ../arm/zgemv_n.c ZGEMVNKERNEL = ../arm/zgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c DGEMVTKERNEL = ../arm/gemv_t.c CGEMVTKERNEL = ../arm/zgemv_t.c ZGEMVTKERNEL = ../arm/zgemv_t.c SSYMV_U_KERNEL = ../generic/symv_k.c SSYMV_L_KERNEL = ../generic/symv_k.c DSYMV_U_KERNEL = ../generic/symv_k.c DSYMV_L_KERNEL = ../generic/symv_k.c QSYMV_U_KERNEL = ../generic/symv_k.c QSYMV_L_KERNEL = ../generic/symv_k.c CSYMV_U_KERNEL = ../generic/zsymv_k.c CSYMV_L_KERNEL = ../generic/zsymv_k.c ZSYMV_U_KERNEL = ../generic/zsymv_k.c ZSYMV_L_KERNEL = ../generic/zsymv_k.c XSYMV_U_KERNEL = ../generic/zsymv_k.c XSYMV_L_KERNEL = ../generic/zsymv_k.c ZHEMV_U_KERNEL = ../generic/zhemv_k.c ZHEMV_L_KERNEL = ../generic/zhemv_k.c LSAME_KERNEL = ../generic/lsame.c SCABS_KERNEL = ../generic/cabs.c DCABS_KERNEL = ../generic/cabs.c QCABS_KERNEL = ../generic/cabs.c #Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c OpenBLAS-0.2.20/kernel/x86_64/Makefile000066400000000000000000000000121313527062700170130ustar00rootroot00000000000000clean :: OpenBLAS-0.2.20/kernel/x86_64/amax.S000066400000000000000000000125301313527062700164350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 #define X ARG2 #define INCX ARG3 #define I %rax #ifndef USE_MIN #define FMOV fcmovbe #else #define FMOV fcmovnbe #endif #include "l1param.h" PROLOGUE PROFCODE salq $BASE_SHIFT, INCX fldz testq M, M jle .L999 testq INCX, INCX jle .L999 ffreep %st FLD (X) #ifdef USE_ABS fabs #endif addq INCX, X decq M jle .L999 cmpq $SIZE, INCX jne .L40 movq M, I sarq $3, I jle .L20 ALIGN_4 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st FLD 1 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st FLD 2 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st FLD 3 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st FLD 4 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st FLD 5 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st FLD 6 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st FLD 7 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st addq $8 * SIZE, X decq I jg .L10 ALIGN_4 .L20: movq M, I andq $7, I jle .L999 ALIGN_4 .L21: FLD 0 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st addq $1 * SIZE, X decq I jg .L21 jmp .L999 ALIGN_4 .L40: movq M, I sarq $3, I jle .L60 ALIGN_4 .L50: FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) fxch %st(1) 
ffreep %st FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st decq I jg .L50 ALIGN_4 .L60: movq M, I andq $7, I jle .L999 ALIGN_4 .L61: FLD 0 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st addq INCX, X decq I jg .L61 ALIGN_4 .L999: ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/amax_atom.S000066400000000000000000000202231313527062700174530ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax #ifdef USE_MIN #define maxsd minsd #endif #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS xorps %xmm0, %xmm0 leaq (, INCX, SIZE), INCX testq M, M jle .L999 testq INCX, INCX jle .L999 #ifdef USE_ABS pcmpeqb %xmm15, %xmm15 psrlq $1, %xmm15 #endif movsd (X), %xmm0 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm0 #endif decq M jle .L999 movaps %xmm0, %xmm1 movaps %xmm0, %xmm2 movaps %xmm0, %xmm3 cmpq $SIZE, INCX jne .L20 movq M, I sarq $3, I jle .L15 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 movsd 2 * SIZE(X), %xmm6 movsd 3 * SIZE(X), %xmm7 movsd 4 * SIZE(X), %xmm8 movsd 5 * SIZE(X), %xmm9 movsd 6 * SIZE(X), %xmm10 movsd 7 * SIZE(X), %xmm11 decq I jle .L13 ALIGN_4 .L12: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxsd %xmm4, %xmm1 movsd 8 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxsd %xmm5, %xmm2 movsd 9 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxsd %xmm6, %xmm1 movsd 10 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxsd %xmm7, %xmm2 movsd 11 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm15, %xmm8 #endif maxsd %xmm8, %xmm1 movsd 12 * SIZE(X), %xmm8 #ifdef USE_ABS andps %xmm15, %xmm9 #endif maxsd %xmm9, %xmm2 movsd 13 * SIZE(X), %xmm9 #ifdef USE_ABS andps %xmm15, %xmm10 #endif maxsd %xmm10, %xmm1 movsd 14 * SIZE(X), %xmm10 #ifdef USE_ABS andps %xmm15, %xmm11 #endif maxsd %xmm11, %xmm2 movsd 15 * SIZE(X), %xmm11 addq $8 * SIZE, X decq I jg .L12 ALIGN_4 .L13: #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxsd %xmm4, %xmm0 #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxsd %xmm5, %xmm1 #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxsd %xmm6, %xmm2 #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxsd %xmm7, %xmm3 #ifdef USE_ABS andps %xmm15, %xmm8 #endif maxsd %xmm8, %xmm0 #ifdef USE_ABS andps %xmm15, %xmm9 #endif maxsd %xmm9, %xmm1 #ifdef USE_ABS andps %xmm15, %xmm10 #endif maxsd %xmm10, %xmm2 #ifdef USE_ABS andps %xmm15, %xmm11 #endif maxsd %xmm11, %xmm3 addq $8 * SIZE, X ALIGN_4 .L15: testq $4, M jle .L17 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 movsd 2 * SIZE(X), %xmm6 movsd 3 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxsd %xmm4, %xmm0 #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxsd %xmm5, %xmm1 #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxsd %xmm6, %xmm2 #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxsd %xmm7, %xmm3 addq $4 * SIZE, X ALIGN_3 .L17: testq $2, M jle .L18 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxsd %xmm4, %xmm1 #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxsd %xmm5, %xmm2 addq $2 * SIZE, X ALIGN_3 .L18: testq $1, M jle .L998 movsd 0 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxsd %xmm4, %xmm3 jmp .L998 ALIGN_3 .L20: movq M, I sarq $3, I jle .L25 movsd (X), %xmm4 addq INCX, X movsd (X), %xmm5 addq INCX, X movsd (X), %xmm6 addq INCX, X movsd (X), %xmm7 addq INCX, X movsd (X), %xmm8 addq INCX, X movsd (X), %xmm9 addq INCX, X movsd (X), %xmm10 addq INCX, X movsd (X), %xmm11 decq I jle .L23 ALIGN_4 .L22: #ifdef USE_ABS andps %xmm15, %xmm4 #endif addq INCX, X maxsd %xmm4, %xmm1 movsd (X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm5 #endif addq INCX, X maxsd %xmm5, %xmm2 movsd (X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm6 #endif addq INCX, X maxsd %xmm6, %xmm1 movsd (X), %xmm6 #ifdef 
USE_ABS andps %xmm15, %xmm7 #endif addq INCX, X maxsd %xmm7, %xmm2 movsd (X), %xmm7 #ifdef USE_ABS andps %xmm15, %xmm8 #endif addq INCX, X maxsd %xmm8, %xmm1 movsd (X), %xmm8 #ifdef USE_ABS andps %xmm15, %xmm9 #endif addq INCX, X maxsd %xmm9, %xmm2 movsd (X), %xmm9 #ifdef USE_ABS andps %xmm15, %xmm10 #endif addq INCX, X maxsd %xmm10, %xmm1 movsd (X), %xmm10 #ifdef USE_ABS andps %xmm15, %xmm11 #endif addq INCX, X maxsd %xmm11, %xmm2 movsd (X), %xmm11 decq I jg .L22 ALIGN_4 .L23: #ifdef USE_ABS andps %xmm15, %xmm4 #endif addq INCX, X maxsd %xmm4, %xmm0 #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxsd %xmm5, %xmm1 #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxsd %xmm6, %xmm2 #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxsd %xmm7, %xmm3 #ifdef USE_ABS andps %xmm15, %xmm8 #endif maxsd %xmm8, %xmm0 #ifdef USE_ABS andps %xmm15, %xmm9 #endif maxsd %xmm9, %xmm1 #ifdef USE_ABS andps %xmm15, %xmm10 #endif maxsd %xmm10, %xmm2 #ifdef USE_ABS andps %xmm15, %xmm11 #endif maxsd %xmm11, %xmm3 ALIGN_4 .L25: testq $4, M jle .L27 movsd (X), %xmm4 addq INCX, X movsd (X), %xmm5 addq INCX, X movsd (X), %xmm6 addq INCX, X movsd (X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxsd %xmm4, %xmm0 #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxsd %xmm5, %xmm1 #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxsd %xmm6, %xmm2 #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxsd %xmm7, %xmm3 ALIGN_3 .L27: testq $2, M jle .L28 movsd (X), %xmm4 addq INCX, X movsd (X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxsd %xmm4, %xmm1 #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxsd %xmm5, %xmm2 ALIGN_3 .L28: testq $1, M jle .L998 movsd (X), %xmm4 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxsd %xmm4, %xmm3 ALIGN_3 .L998: maxsd %xmm1, %xmm0 maxsd %xmm3, %xmm2 maxsd %xmm2, %xmm0 ALIGN_4 .L999: RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/amax_sse.S000066400000000000000000000204471313527062700173150ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax #ifdef USE_MIN #define maxps minps #define maxss minss #endif #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS xorps %xmm0, %xmm0 leaq (, INCX, SIZE), INCX testq M, M jle .L999 #ifdef USE_ABS pcmpeqb %xmm15, %xmm15 psrld $1, %xmm15 #endif movss (X), %xmm0 shufps $0, %xmm0, %xmm0 #ifdef USE_ABS andps %xmm15, %xmm0 #endif movaps %xmm0, %xmm1 movaps %xmm0, %xmm2 movaps %xmm0, %xmm3 addq INCX, X decq M jle .L999 cmpq $SIZE, INCX jne .L40 subq $-32 * SIZE, X cmpq $3, M jle .L17 testq $SIZE, X je .L05 movss -32 * SIZE(X), %xmm1 shufps $0, %xmm1, %xmm1 #ifdef USE_ABS andps %xmm15, %xmm1 #endif decq M addq $SIZE, X ALIGN_3 .L05: testq $2 * SIZE, X je .L06 movsd -32 * SIZE(X), %xmm2 unpcklps %xmm2, %xmm2 #ifdef USE_ABS andps %xmm15, %xmm2 #endif subq $2, M addq $2 * SIZE, X ALIGN_3 .L06: movq M, I sarq $5, I jle .L15 movaps -32 * SIZE(X), %xmm4 movaps -28 * SIZE(X), %xmm5 movaps -24 * SIZE(X), %xmm6 movaps -20 * SIZE(X), %xmm7 decq I jle .L12 ALIGN_4 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxps %xmm4, %xmm0 movaps -16 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxps %xmm5, %xmm1 movaps -12 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxps %xmm6, %xmm2 movaps -8 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxps %xmm7, %xmm3 movaps -4 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxps %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxps %xmm5, %xmm1 movaps 4 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxps %xmm6, %xmm2 movaps 8 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxps %xmm7, %xmm3 movaps 12 * SIZE(X), %xmm7 subq $-32 * SIZE, X decq I jg .L11 ALIGN_4 .L12: #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxps %xmm4, %xmm0 movaps -16 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxps %xmm5, %xmm1 movaps -12 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxps %xmm6, %xmm2 movaps -8 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxps %xmm7, %xmm3 movaps -4 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxps %xmm4, %xmm0 #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxps %xmm5, %xmm1 #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxps %xmm6, %xmm2 #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxps %xmm7, %xmm3 subq $-32 * SIZE, X ALIGN_3 .L15: testq $16, M je .L16 movaps -32 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxps %xmm4, %xmm0 movaps -28 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxps %xmm5, %xmm1 movaps -24 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxps %xmm6, %xmm2 movaps -20 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxps %xmm7, %xmm3 addq $16 * SIZE, X ALIGN_3 .L16: testq $8, M je .L17 movaps -32 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxps %xmm4, %xmm0 movaps -28 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, 
%xmm5 #endif maxps %xmm5, %xmm1 addq $8 * SIZE, X ALIGN_3 .L17: testq $4, M je .L18 movaps -32 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxps %xmm4, %xmm2 addq $4 * SIZE, X ALIGN_3 .L18: testq $2, M je .L19 movsd -32 * SIZE(X), %xmm4 unpcklps %xmm4, %xmm4 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxps %xmm4, %xmm3 addq $2 * SIZE, X ALIGN_3 .L19: testq $1, M je .L998 movss -32 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxss %xmm4, %xmm0 jmp .L998 ALIGN_3 .L40: movq M, I sarq $3, I jle .L45 ALIGN_4 .L41: movss (X), %xmm4 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxss %xmm4, %xmm0 movss (X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxss %xmm5, %xmm1 movss (X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxss %xmm6, %xmm2 movss (X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxss %xmm7, %xmm3 movss (X), %xmm4 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxss %xmm4, %xmm0 movss (X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxss %xmm5, %xmm1 movss (X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxss %xmm6, %xmm2 movss (X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxss %xmm7, %xmm3 decq I jg .L41 ALIGN_4 .L45: testq $4, M je .L46 movss (X), %xmm4 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxss %xmm4, %xmm0 movss (X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxss %xmm5, %xmm1 movss (X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxss %xmm6, %xmm2 movss (X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxss %xmm7, %xmm3 ALIGN_3 .L46: testq $2, M je .L47 movss (X), %xmm4 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxss %xmm4, %xmm0 movss (X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxss %xmm5, %xmm1 ALIGN_3 .L47: testq $1, M je .L998 movss (X), %xmm4 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxss %xmm4, %xmm2 ALIGN_4 .L998: maxps %xmm1, %xmm0 maxps %xmm3, %xmm2 maxps %xmm2, %xmm0 movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 maxps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 maxss %xmm1, %xmm0 ALIGN_4 .L999: RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/amax_sse2.S000066400000000000000000000214301313527062700173700ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax #ifdef USE_MIN #define maxpd minpd #define maxsd minsd #endif #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS xorps %xmm0, %xmm0 leaq (, INCX, SIZE), INCX testq M, M jle .L999 #ifdef USE_ABS pcmpeqb %xmm15, %xmm15 psrlq $1, %xmm15 #endif movsd (X), %xmm0 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm0 #endif unpcklpd %xmm0, %xmm0 movaps %xmm0, %xmm1 movaps %xmm0, %xmm2 movaps %xmm0, %xmm3 decq M jle .L999 cmpq $SIZE, INCX jne .L40 subq $-16 * SIZE, X testq $SIZE, X je .L05 movsd -16 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm4 #endif unpcklpd %xmm4, %xmm4 maxpd %xmm4, %xmm3 addq $SIZE, X decq M jle .L998 ALIGN_3 .L05: movq M, I sarq $4, I jle .L15 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -12 * SIZE(X), %xmm6 movaps -10 * SIZE(X), %xmm7 decq I jle .L12 ALIGN_4 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 movaps -8 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 movaps -6 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxpd %xmm6, %xmm2 movaps -4 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxpd %xmm7, %xmm3 movaps -2 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 movaps 2 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxpd %xmm6, %xmm2 movaps 4 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxpd %xmm7, %xmm3 movaps 6 * SIZE(X), %xmm7 subq $-16 * SIZE, X decq I jg .L11 ALIGN_4 .L12: #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 movaps -8 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 movaps -6 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxpd %xmm6, %xmm2 movaps -4 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxpd %xmm7, %xmm3 movaps -2 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxpd %xmm6, %xmm2 #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxpd %xmm7, %xmm3 subq $-16 * SIZE, X ALIGN_4 .L15: testq $8, M jle .L16 movaps -16 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 movaps -14 * SIZE(X), %xmm5 #ifdef USE_ABS andps 
%xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 movaps -12 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxpd %xmm6, %xmm2 movaps -10 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxpd %xmm7, %xmm3 addq $8 * SIZE, X ALIGN_3 .L16: testq $4, M jle .L17 movaps -16 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 movaps -14 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 addq $4 * SIZE, X ALIGN_3 .L17: testq $2, M jle .L18 movaps -16 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxpd %xmm4, %xmm2 addq $2 * SIZE, X ALIGN_3 .L18: testq $1, M jle .L998 movsd -16 * SIZE(X), %xmm4 unpcklpd %xmm4, %xmm4 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxpd %xmm4, %xmm3 jmp .L998 ALIGN_3 .L40: movq M, I sarq $4, I jle .L45 ALIGN_4 .L41: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd (X), %xmm4 addq INCX, X movhps (X), %xmm4 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 movsd (X), %xmm5 addq INCX, X movhps (X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 movsd (X), %xmm6 addq INCX, X movhps (X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxpd %xmm6, %xmm2 movsd (X), %xmm7 addq INCX, X movhps (X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxpd %xmm7, %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movsd (X), %xmm4 addq INCX, X movhps (X), %xmm4 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 movsd (X), %xmm5 addq INCX, X movhps (X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 movsd (X), %xmm6 addq INCX, X movhps (X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxpd %xmm6, %xmm2 movsd (X), %xmm7 addq INCX, X movhps (X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxpd %xmm7, %xmm3 decq I jg .L41 ALIGN_4 .L45: andq $15, M jle .L998 testq $8, M je .L46 movsd (X), %xmm4 addq INCX, X movhps (X), %xmm4 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 movsd (X), %xmm5 addq INCX, X movhps (X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 movsd (X), %xmm6 addq INCX, X movhps (X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxpd %xmm6, %xmm2 movsd (X), %xmm7 addq INCX, X movhps (X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxpd %xmm7, %xmm3 ALIGN_3 .L46: testq $4, M je .L47 movsd (X), %xmm4 addq INCX, X movhps (X), %xmm4 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 movsd (X), %xmm5 addq INCX, X movhps (X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 ALIGN_3 .L47: testq $2, M je .L48 movsd (X), %xmm6 addq INCX, X movhps (X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxpd %xmm6, %xmm2 ALIGN_3 .L48: testq $1, M je .L998 movsd (X), %xmm7 unpcklpd %xmm7, %xmm7 #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxpd %xmm7, %xmm3 ALIGN_4 .L998: maxpd %xmm1, %xmm0 maxpd %xmm3, %xmm2 maxpd %xmm2, %xmm0 movaps %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 maxsd %xmm1, %xmm0 ALIGN_4 .L999: RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/asum.S000066400000000000000000000102611313527062700164530ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 #define X ARG2 #define INCX ARG3 #define I %rax #include "l1param.h" PROLOGUE PROFCODE fldz testq M, M jle .L999 testq INCX, INCX jle .L999 salq $BASE_SHIFT, INCX fldz fldz fldz cmpq $SIZE, INCX jne .L40 movq M, I sarq $3, I jle .L20 ALIGN_4 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs FLD 2 * SIZE(X) fabs FLD 3 * SIZE(X) fabs faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) FLD 4 * SIZE(X) fabs FLD 5 * SIZE(X) fabs FLD 6 * SIZE(X) fabs FLD 7 * SIZE(X) fabs addq $8 * SIZE, X faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) decq I jg .L10 ALIGN_4 .L20: andq $7, M jle .L998 ALIGN_4 .L21: FLD (X) fabs faddp %st,%st(1) addq $1 * SIZE, X decq M jg .L21 jmp .L998 ALIGN_4 .L40: movq M, I sarq $3, I jle .L60 ALIGN_4 .L50: FLD (X) addq INCX, X fabs FLD (X) addq INCX, X fabs FLD (X) addq INCX, X fabs FLD (X) addq INCX, X fabs faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) FLD (X) addq INCX, X fabs FLD (X) addq INCX, X fabs FLD (X) addq INCX, X fabs FLD (X) addq INCX, X fabs faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) decq I jg .L50 ALIGN_4 .L60: andq $7, M jle .L998 ALIGN_4 .L61: FLD (X) addq INCX, X fabs faddp %st,%st(1) decq M jg .L61 ALIGN_4 .L998: faddp %st,%st(2) faddp %st,%st(1) faddp %st,%st(1) ALIGN_4 .L999: ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/asum_atom.S000066400000000000000000000211341313527062700174740ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS xorps %xmm0, %xmm0 testq M, M jle .L999 testq INCX, INCX jle .L999 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 pcmpeqb %xmm15, %xmm15 psrlq $1, %xmm15 salq $BASE_SHIFT, INCX xorps %xmm13, %xmm13 cmpq $SIZE, INCX jne .L20 testq $SIZE, X je .L05 movsd (X), %xmm0 addq $SIZE, X andps %xmm15, %xmm0 decq M jle .L999 ALIGN_3 .L05: subq $-16 * SIZE, X movq M, I sarq $4, I jle .L12 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -12 * SIZE(X), %xmm6 movaps -10 * SIZE(X), %xmm7 movaps -8 * SIZE(X), %xmm8 movaps -6 * SIZE(X), %xmm9 movaps -4 * SIZE(X), %xmm10 movaps -2 * SIZE(X), %xmm11 decq I jle .L11 ALIGN_4 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif andps %xmm15, %xmm4 addsd %xmm13, %xmm3 pshufd $0x4e, %xmm4, %xmm12 addsd %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 andps %xmm15, %xmm5 addsd %xmm12, %xmm1 pshufd $0x4e, %xmm5, %xmm13 addsd %xmm5, %xmm2 movaps 2 * SIZE(X), %xmm5 andps %xmm15, %xmm6 addsd %xmm13, %xmm3 pshufd $0x4e, %xmm6, %xmm12 addsd %xmm6, %xmm0 movaps 4 * SIZE(X), %xmm6 andps %xmm15, %xmm7 addsd %xmm12, %xmm1 pshufd $0x4e, %xmm7, %xmm13 addsd %xmm7, %xmm2 movaps 6 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif andps %xmm15, %xmm8 addsd %xmm13, %xmm3 pshufd $0x4e, %xmm8, %xmm12 addsd %xmm8, %xmm0 movaps 8 * SIZE(X), %xmm8 andps %xmm15, %xmm9 addsd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm13 addsd %xmm9, %xmm2 movaps 10 * SIZE(X), %xmm9 andps %xmm15, %xmm10 addsd %xmm13, %xmm3 pshufd $0x4e, %xmm10, %xmm12 addsd %xmm10, %xmm0 movaps 12 * SIZE(X), %xmm10 andps %xmm15, %xmm11 addsd 
%xmm12, %xmm1 pshufd $0x4e, %xmm11, %xmm13 addsd %xmm11, %xmm2 movaps 14 * SIZE(X), %xmm11 subq $-16 * SIZE, X decq I jg .L10 ALIGN_4 .L11: andps %xmm15, %xmm4 addsd %xmm13, %xmm3 pshufd $0x4e, %xmm4, %xmm12 addsd %xmm4, %xmm0 andps %xmm15, %xmm5 addsd %xmm12, %xmm1 pshufd $0x4e, %xmm5, %xmm13 addsd %xmm5, %xmm2 andps %xmm15, %xmm6 addsd %xmm13, %xmm3 pshufd $0x4e, %xmm6, %xmm12 addsd %xmm6, %xmm0 andps %xmm15, %xmm7 addsd %xmm12, %xmm1 pshufd $0x4e, %xmm7, %xmm13 addsd %xmm7, %xmm2 andps %xmm15, %xmm8 addsd %xmm13, %xmm3 pshufd $0x4e, %xmm8, %xmm12 addsd %xmm8, %xmm0 andps %xmm15, %xmm9 addsd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm13 addsd %xmm9, %xmm2 andps %xmm15, %xmm10 addsd %xmm13, %xmm3 pshufd $0x4e, %xmm10, %xmm12 addsd %xmm10, %xmm0 andps %xmm15, %xmm11 addsd %xmm12, %xmm1 pshufd $0x4e, %xmm11, %xmm13 addsd %xmm11, %xmm2 addsd %xmm13, %xmm3 subq $-16 * SIZE, X ALIGN_3 .L12: andq $15, M jle .L998 testq $8, M je .L13 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -12 * SIZE(X), %xmm6 movaps -10 * SIZE(X), %xmm7 addq $8 * SIZE, X andps %xmm15, %xmm4 pshufd $0x4e, %xmm4, %xmm12 addsd %xmm4, %xmm0 andps %xmm15, %xmm5 addsd %xmm12, %xmm1 pshufd $0x4e, %xmm5, %xmm13 addsd %xmm5, %xmm2 addsd %xmm13, %xmm3 andps %xmm15, %xmm6 pshufd $0x4e, %xmm6, %xmm12 addsd %xmm6, %xmm0 andps %xmm15, %xmm7 addsd %xmm12, %xmm1 pshufd $0x4e, %xmm7, %xmm13 addsd %xmm7, %xmm2 addsd %xmm13, %xmm3 ALIGN_3 .L13: testq $4, M je .L14 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 addq $4 * SIZE, X andps %xmm15, %xmm4 pshufd $0x4e, %xmm4, %xmm12 addsd %xmm4, %xmm0 andps %xmm15, %xmm5 addsd %xmm12, %xmm1 pshufd $0x4e, %xmm5, %xmm13 addsd %xmm5, %xmm2 addsd %xmm13, %xmm3 ALIGN_3 .L14: testq $2, M je .L15 movaps -16 * SIZE(X), %xmm4 addq $2 * SIZE, X andps %xmm15, %xmm4 pshufd $0x4e, %xmm4, %xmm5 addsd %xmm4, %xmm2 addsd %xmm5, %xmm3 ALIGN_3 .L15: testq $1, M je .L998 movsd -16 * SIZE(X), %xmm4 andps %xmm15, %xmm4 addsd %xmm4, %xmm0 jmp .L998 ALIGN_3 .L20: movq M, I sarq $3, I jle .L25 movsd (X), %xmm4 addq INCX, X movsd (X), %xmm5 addq INCX, X movsd (X), %xmm6 addq INCX, X movsd (X), %xmm7 addq INCX, X movsd (X), %xmm8 addq INCX, X movsd (X), %xmm9 addq INCX, X movsd (X), %xmm10 addq INCX, X movsd (X), %xmm11 decq I jle .L23 ALIGN_4 .L22: andps %xmm15, %xmm4 addq INCX, X addsd %xmm4, %xmm0 movsd (X), %xmm4 andps %xmm15, %xmm5 addq INCX, X addsd %xmm5, %xmm1 movsd (X), %xmm5 andps %xmm15, %xmm6 addq INCX, X addsd %xmm6, %xmm2 movsd (X), %xmm6 andps %xmm15, %xmm7 addq INCX, X addsd %xmm7, %xmm3 movsd (X), %xmm7 andps %xmm15, %xmm8 addq INCX, X addsd %xmm8, %xmm0 movsd (X), %xmm8 andps %xmm15, %xmm9 addq INCX, X addsd %xmm9, %xmm1 movsd (X), %xmm9 andps %xmm15, %xmm10 addq INCX, X addsd %xmm10, %xmm2 movsd (X), %xmm10 andps %xmm15, %xmm11 addq INCX, X addsd %xmm11, %xmm3 movsd (X), %xmm11 decq I jg .L22 ALIGN_4 .L23: andps %xmm15, %xmm4 addq INCX, X addsd %xmm4, %xmm0 andps %xmm15, %xmm5 addsd %xmm5, %xmm1 andps %xmm15, %xmm6 addsd %xmm6, %xmm2 andps %xmm15, %xmm7 addsd %xmm7, %xmm3 andps %xmm15, %xmm8 addsd %xmm8, %xmm0 andps %xmm15, %xmm9 addsd %xmm9, %xmm1 andps %xmm15, %xmm10 addsd %xmm10, %xmm2 andps %xmm15, %xmm11 addsd %xmm11, %xmm3 ALIGN_3 .L25: andq $7, M jle .L998 testq $4, M je .L26 movsd (X), %xmm4 addq INCX, X movsd (X), %xmm5 addq INCX, X movsd (X), %xmm6 andps %xmm15, %xmm4 addsd %xmm4, %xmm0 addq INCX, X movsd (X), %xmm7 andps %xmm15, %xmm5 addsd %xmm5, %xmm1 addq INCX, X andps %xmm15, %xmm6 addsd %xmm6, %xmm2 andps %xmm15, %xmm7 addsd %xmm7, %xmm3 ALIGN_3 .L26: testq $2, M je 
.L27 movsd (X), %xmm4 addq INCX, X movsd (X), %xmm5 addq INCX, X andps %xmm15, %xmm4 andps %xmm15, %xmm5 addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 ALIGN_3 .L27: testq $1, M je .L998 movsd (X), %xmm4 andps %xmm15, %xmm4 addsd %xmm4, %xmm0 ALIGN_3 .L998: addsd %xmm1, %xmm0 addsd %xmm3, %xmm2 addsd %xmm2, %xmm0 ALIGN_4 .L999: RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/asum_sse.S000066400000000000000000000155661313527062700173420ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
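[Editor's note, not part of the original sources] The asum_sse.S kernel introduced by this header is the single-precision absolute-sum (SASUM) routine for x86-64. As a reading aid, a minimal C sketch of what it computes follows; the function name sasum_ref and its signature are illustrative only. Like the assembly, it returns 0 when the length or the increment is not positive.

#include <math.h>

// Reference sketch only: the value asum_sse.S computes.
static float sasum_ref(long n, const float *x, long incx)
{
    float sum = 0.0f;
    if (n <= 0 || incx <= 0)        // the kernel jumps straight to .L999 here
        return sum;
    for (long i = 0; i < n; i++)
        sum += fabsf(x[i * incx]);  // the kernel clears the sign bit with andps
    return sum;
}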
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS xorps %xmm0, %xmm0 testq M, M jle .L999 testq INCX, INCX jle .L999 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 pcmpeqb %xmm15, %xmm15 psrld $1, %xmm15 leaq (, INCX, SIZE), INCX cmpq $SIZE, INCX jne .L100 subq $-32 * SIZE, X cmpq $3, M jle .L18 testq $4, X je .L05 movss -32 * SIZE(X), %xmm0 andps %xmm15, %xmm0 addq $SIZE, X decq M jle .L998 ALIGN_3 .L05: testq $8, X je .L10 movsd -32 * SIZE(X), %xmm1 andps %xmm15, %xmm1 addq $2 * SIZE, X subq $2, M jle .L998 ALIGN_3 .L10: movq M, I sarq $5, I jle .L14 movaps -32 * SIZE(X), %xmm4 movaps -28 * SIZE(X), %xmm5 movaps -24 * SIZE(X), %xmm6 movaps -20 * SIZE(X), %xmm7 movaps -16 * SIZE(X), %xmm8 movaps -12 * SIZE(X), %xmm9 movaps -8 * SIZE(X), %xmm10 movaps -4 * SIZE(X), %xmm11 decq I jle .L12 ALIGN_3 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif andps %xmm15, %xmm4 addps %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 andps %xmm15, %xmm5 addps %xmm5, %xmm1 movaps 4 * SIZE(X), %xmm5 andps %xmm15, %xmm6 addps %xmm6, %xmm2 movaps 8 * SIZE(X), %xmm6 andps %xmm15, %xmm7 addps %xmm7, %xmm3 movaps 12 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif andps %xmm15, %xmm8 addps %xmm8, %xmm0 movaps 16 * SIZE(X), %xmm8 andps %xmm15, %xmm9 addps %xmm9, %xmm1 movaps 20 * SIZE(X), %xmm9 andps %xmm15, %xmm10 addps %xmm10, %xmm2 movaps 24 * SIZE(X), %xmm10 andps %xmm15, %xmm11 addps %xmm11, %xmm3 movaps 28 * SIZE(X), %xmm11 subq $-32 * SIZE, X decq I jg .L11 ALIGN_3 .L12: andps %xmm15, %xmm4 addps %xmm4, %xmm0 andps %xmm15, %xmm5 addps %xmm5, %xmm1 andps %xmm15, %xmm6 addps %xmm6, %xmm2 andps %xmm15, %xmm7 addps %xmm7, %xmm3 andps %xmm15, %xmm8 addps %xmm8, %xmm0 andps %xmm15, %xmm9 addps %xmm9, %xmm1 andps %xmm15, %xmm10 addps %xmm10, %xmm2 andps %xmm15, %xmm11 addps %xmm11, %xmm3 subq $-32 * SIZE, X ALIGN_3 .L14: testq $16, M je .L16 movaps -32 * SIZE(X), %xmm4 andps %xmm15, %xmm4 addps %xmm4, %xmm0 movaps -28 * SIZE(X), %xmm5 andps %xmm15, %xmm5 addps %xmm5, %xmm1 movaps -24 * SIZE(X), %xmm4 andps %xmm15, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(X), %xmm5 andps %xmm15, %xmm5 addps %xmm5, %xmm1 addq $16 * SIZE, X ALIGN_3 .L16: testq $8, M je .L17 movaps -32 * SIZE(X), %xmm4 andps %xmm15, %xmm4 addps %xmm4, %xmm0 movaps -28 * SIZE(X), %xmm5 andps %xmm15, %xmm5 addps %xmm5, %xmm1 addq $8 * SIZE, X ALIGN_3 .L17: testq $4, M je .L18 movaps -32 * SIZE(X), %xmm6 andps %xmm15, %xmm6 addps %xmm6, %xmm2 addq $4 * SIZE, X ALIGN_3 .L18: testq $2, M je .L19 #ifdef movsd xorps %xmm7, %xmm7 #endif movsd -32 * SIZE(X), %xmm7 andps %xmm15, %xmm7 addps %xmm7, %xmm3 addq $2 * SIZE, X ALIGN_3 .L19: testq $1, M je .L998 movss -32 * SIZE(X), %xmm6 andps %xmm15, %xmm6 addps %xmm6, %xmm2 jmp .L998 ALIGN_4 .L100: movq M, I sarq $3, I jle .L105 ALIGN_4 .L101: movss 0 * SIZE(X), %xmm4 addq INCX, X andps %xmm15, %xmm4 addss %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X andps %xmm15, %xmm5 addss %xmm5, %xmm1 movss 0 * SIZE(X), %xmm6 addq INCX, X andps %xmm15, %xmm6 addss %xmm6, %xmm2 movss 0 * SIZE(X), %xmm7 addq INCX, X andps %xmm15, %xmm7 addss %xmm7, %xmm3 movss 0 * SIZE(X), %xmm8 addq INCX, X andps %xmm15, %xmm8 addss %xmm8, %xmm0 movss 0 * SIZE(X), %xmm4 addq INCX, X andps %xmm15, %xmm4 addss %xmm4, %xmm1 movss 0 * SIZE(X), %xmm5 
addq INCX, X andps %xmm15, %xmm5 addss %xmm5, %xmm2 movss 0 * SIZE(X), %xmm6 addq INCX, X andps %xmm15, %xmm6 addss %xmm6, %xmm3 decq I jg .L101 ALIGN_4 .L105: andq $7, M jle .L998 ALIGN_4 .L106: movss 0 * SIZE(X), %xmm4 andps %xmm15, %xmm4 addps %xmm4, %xmm0 addq INCX, X decq M jg .L106 ALIGN_4 .L998: addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm2, %xmm0 #ifndef HAVE_SSE3 movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 addss %xmm1, %xmm0 #else haddps %xmm0, %xmm0 haddps %xmm0, %xmm0 #endif ALIGN_4 .L999: RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/asum_sse2.S000066400000000000000000000146471313527062700174230ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
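[Editor's note, not part of the original sources] Both asum kernels take absolute values without calling fabs: pcmpeqb %xmm15, %xmm15 sets every bit of xmm15, psrlq $1 (psrld $1 in the single-precision file) turns each lane into 0x7fff..., and andps with that mask clears the sign bit of every element in a register at once. The C sketch below shows the same trick on a single double; it is an illustration only.

#include <stdint.h>
#include <string.h>

// Clear the sign bit of an IEEE-754 double, as the kernel's andps does
// for a whole XMM register at a time.
static double fabs_by_mask(double x)
{
    uint64_t bits;
    memcpy(&bits, &x, sizeof bits);          // reinterpret the double's bits
    bits &= UINT64_C(0x7fffffffffffffff);    // the mask built by pcmpeqb + psrlq $1
    memcpy(&x, &bits, sizeof x);
    return x;
}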
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS xorps %xmm0, %xmm0 testq M, M jle .L999 testq INCX, INCX jle .L999 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 pcmpeqb %xmm15, %xmm15 psrlq $1, %xmm15 salq $BASE_SHIFT, INCX subq $-16 * SIZE, X cmpq $SIZE, INCX jne .L40 testq $SIZE, X je .L05 movsd -16 * SIZE(X), %xmm0 addq $SIZE, X andps %xmm15, %xmm0 subq $1, M jle .L999 ALIGN_3 .L05: movq M, I sarq $4, I jle .L20 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -12 * SIZE(X), %xmm6 movaps -10 * SIZE(X), %xmm7 movaps -8 * SIZE(X), %xmm8 movaps -6 * SIZE(X), %xmm9 movaps -4 * SIZE(X), %xmm10 movaps -2 * SIZE(X), %xmm11 decq I jle .L11 ALIGN_4 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif andps %xmm15, %xmm4 addpd %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 andps %xmm15, %xmm5 addpd %xmm5, %xmm1 movaps 2 * SIZE(X), %xmm5 andps %xmm15, %xmm6 addpd %xmm6, %xmm2 movaps 4 * SIZE(X), %xmm6 andps %xmm15, %xmm7 addpd %xmm7, %xmm3 movaps 6 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif andps %xmm15, %xmm8 addpd %xmm8, %xmm0 movaps 8 * SIZE(X), %xmm8 andps %xmm15, %xmm9 addpd %xmm9, %xmm1 movaps 10 * SIZE(X), %xmm9 andps %xmm15, %xmm10 addpd %xmm10, %xmm2 movaps 12 * SIZE(X), %xmm10 andps %xmm15, %xmm11 addpd %xmm11, %xmm3 movaps 14 * SIZE(X), %xmm11 subq $-16 * SIZE, X decq I jg .L10 ALIGN_4 .L11: andps %xmm15, %xmm4 andps %xmm15, %xmm5 andps %xmm15, %xmm6 andps %xmm15, %xmm7 addpd %xmm4, %xmm0 addpd %xmm5, %xmm1 addpd %xmm6, %xmm2 addpd %xmm7, %xmm3 andps %xmm15, %xmm8 andps %xmm15, %xmm9 andps %xmm15, %xmm10 andps %xmm15, %xmm11 addpd %xmm8, %xmm0 addpd %xmm9, %xmm1 addpd %xmm10, %xmm2 addpd %xmm11, %xmm3 subq $-16 * SIZE, X ALIGN_3 .L20: andq $15, M jle .L998 testq $8, M je .L21 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -12 * SIZE(X), %xmm6 movaps -10 * SIZE(X), %xmm7 andps %xmm15, %xmm4 andps %xmm15, %xmm5 andps %xmm15, %xmm6 andps %xmm15, %xmm7 addpd %xmm4, %xmm0 addpd %xmm5, %xmm1 addpd %xmm6, %xmm2 addpd %xmm7, %xmm3 addq $8 * SIZE, X ALIGN_3 .L21: testq $4, M je .L22 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 andps %xmm15, %xmm4 andps %xmm15, %xmm5 addpd %xmm4, %xmm0 addpd %xmm5, %xmm1 addq $4 * SIZE, X ALIGN_3 .L22: testq $2, M je .L23 movaps -16 * SIZE(X), %xmm6 andps %xmm15, %xmm6 addpd %xmm6, %xmm3 addq $2 * SIZE, X .L23: testq $1, M je .L998 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -16 * SIZE(X), %xmm4 andps %xmm15, %xmm4 addsd %xmm4, %xmm0 jmp .L998 ALIGN_3 .L40: movq M, I sarq $3, I jle .L60 ALIGN_4 .L50: movsd -16 * SIZE(X), %xmm4 addq INCX, X movhpd -16 * SIZE(X), %xmm4 addq INCX, X andps %xmm15, %xmm4 addpd %xmm4, %xmm0 movsd -16 * SIZE(X), %xmm5 addq INCX, X movhpd -16 * SIZE(X), %xmm5 addq INCX, X andps %xmm15, %xmm5 addpd %xmm5, %xmm1 movsd -16 * SIZE(X), %xmm6 addq INCX, X movhpd -16 * SIZE(X), %xmm6 addq INCX, X andps %xmm15, %xmm6 addpd %xmm6, %xmm2 movsd -16 * SIZE(X), %xmm7 addq INCX, X movhpd -16 * SIZE(X), %xmm7 addq INCX, X andps %xmm15, %xmm7 addpd %xmm7, %xmm3 decq I jg .L50 ALIGN_4 .L60: #ifdef movsd xorps %xmm4, %xmm4 #endif andq $7, M jle .L998 ALIGN_4 .L61: movsd -16 * SIZE(X), %xmm4 andps %xmm15, %xmm4 addpd %xmm4, %xmm0 addq INCX, X decq M jg .L61 ALIGN_4 .L998: addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd 
%xmm2, %xmm0 ALIGN_4 .L999: #ifndef HAVE_SSE3 movhlps %xmm0, %xmm1 addsd %xmm1, %xmm0 #else haddpd %xmm0, %xmm0 #endif RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/axpy.S000066400000000000000000000115241313527062700164720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
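[Editor's note, not part of the original sources] axpy.S, which follows, is the generic x87 AXPY kernel: it keeps alpha on the FPU stack (FLD ALPHA) and updates y := alpha*x + y with FLD/fmul/faddp/FST, eight elements per trip in the unit-stride loop. A minimal C sketch of the operation is given below; the name axpy_ref is illustrative, double is shown although the kernel is built per precision via SIZE, and negative increments are assumed to have been adjusted by the calling interface before the kernel runs.

// Reference sketch of the AXPY update performed by the kernel.
static void axpy_ref(long n, double alpha, const double *x, long incx,
                     double *y, long incy)
{
    if (n <= 0)
        return;
    for (long i = 0; i < n; i++)
        y[i * incy] += alpha * x[i * incx];   // y := alpha*x + y
}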
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG4 /* rsi */ #define INCX ARG5 /* rdx */ #define Y ARG6 /* rcx */ #define INCY ARG2 /* r8 */ #define ALPHA 8(%rsp) #include "l1param.h" PROLOGUE PROFCODE movq 24(%rsp), INCY FLD ALPHA salq $BASE_SHIFT, INCX salq $BASE_SHIFT, INCY testq M, M jle .L40 cmpq $SIZE, INCX jne .L14 cmpq $SIZE, INCY jne .L14 movq M, %rax sarq $3, %rax jle .L15 ALIGN_3 #define PRESIZE 33 .L16: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) fmul %st(1),%st FLD 0 * SIZE(Y) faddp %st, %st(1) FST 0 * SIZE(Y) FLD 1 * SIZE(X) fmul %st(1),%st FLD 1 * SIZE(Y) faddp %st, %st(1) FST 1 * SIZE(Y) FLD 2 * SIZE(X) fmul %st(1),%st FLD 2 * SIZE(Y) faddp %st, %st(1) FST 2 * SIZE(Y) FLD 3 * SIZE(X) fmul %st(1),%st FLD 3 * SIZE(Y) faddp %st, %st(1) FST 3 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif FLD 4 * SIZE(X) fmul %st(1),%st FLD 4 * SIZE(Y) faddp %st, %st(1) FST 4 * SIZE(Y) FLD 5 * SIZE(X) fmul %st(1),%st FLD 5 * SIZE(Y) faddp %st, %st(1) FST 5 * SIZE(Y) FLD 6 * SIZE(X) fmul %st(1),%st FLD 6 * SIZE(Y) faddp %st, %st(1) FST 6 * SIZE(Y) FLD 7 * SIZE(X) fmul %st(1),%st FLD 7 * SIZE(Y) faddp %st, %st(1) FST 7 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y decq %rax jg .L16 ALIGN_3 .L15: movq M, %rax andq $7, %rax jle .L40 ALIGN_3 .L22: FLD 0 * SIZE(X) fmul %st(1),%st FLD 0 * SIZE(Y) faddp %st, %st(1) FST 0 * SIZE(Y) addq $SIZE, X addq $SIZE, Y decq %rax jg .L22 jmp .L40 ALIGN_3 .L14: movq M, %rax sarq $2, %rax jle .L28 ALIGN_3 .L29: FLD (X) fmul %st(1),%st FLD (Y) faddp %st, %st(1) FST (Y) addq INCX, X addq INCY, Y FLD (X) fmul %st(1),%st FLD (Y) faddp %st, %st(1) FST (Y) addq INCX, X addq INCY, Y FLD (X) fmul %st(1),%st FLD (Y) faddp %st, %st(1) FST (Y) addq INCX, X addq INCY, Y FLD (X) fmul %st(1),%st FLD (Y) faddp %st, %st(1) FST (Y) addq INCX, X addq INCY, Y decq %rax jg .L29 ALIGN_3 .L28: movq M, %rax andq $3, %rax jle .L40 ALIGN_3 .L35: FLD (X) fmul %st(1),%st FLD (Y) faddp %st, %st(1) FST (Y) addq INCX, X addq INCY, Y decq %rax jg .L35 .L40: ffreep %st(0) ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/axpy_atom.S000066400000000000000000000242051313527062700175120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
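[Editor's note, not part of the original sources] The axpy_atom.S kernel below, tuned for Intel Atom, interleaves loads, multiplies and stores of eight elements per iteration to hide the in-order core's latencies. Structurally the unit-stride path is an unroll-by-eight with a remainder, which the hypothetical helper below mirrors: sarq $3 picks the block count, and the assembly then drains the leftover elements in 4/2/1-element chunks rather than the single tail loop shown here.

// Hypothetical illustration of the loop structure in axpy_atom.S.
static void daxpy_unrolled(long m, double alpha, const double *x, double *y)
{
    long blocks = m >> 3;                 // sarq $3, %rax
    long tail   = m & 7;                  // handled as 4/2/1 chunks in the assembly
    for (long b = 0; b < blocks; b++) {
        for (int j = 0; j < 8; j++)
            y[j] += alpha * x[j];
        x += 8;
        y += 8;
    }
    for (long j = 0; j < tail; j++)
        y[j] += alpha * x[j];
}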
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 #define X ARG4 #define INCX ARG5 #define Y ARG6 #define INCY ARG2 #else #define M ARG1 #define X ARG2 #define INCX ARG3 #define Y ARG4 #define INCY %r10 #endif #define YY %r11 #define ALPHA %xmm15 #include "l1param.h" PROLOGUE PROFCODE #ifndef WINDOWS_ABI #ifndef XDOUBLE movq 8(%rsp), INCY #else movq 24(%rsp), INCY #endif movaps %xmm0, ALPHA #else movaps %xmm3, ALPHA movq 40(%rsp), X movq 48(%rsp), INCX movq 56(%rsp), Y movq 64(%rsp), INCY #endif SAVEREGISTERS leaq (, INCX, SIZE), INCX leaq (, INCY, SIZE), INCY testq M, M jle .L29 cmpq $SIZE, INCX jne .L20 cmpq $SIZE, INCY jne .L20 movq M, %rax sarq $3, %rax jle .L13 movsd 0 * SIZE(X), %xmm0 movsd 1 * SIZE(X), %xmm1 movsd 2 * SIZE(X), %xmm2 movsd 3 * SIZE(X), %xmm3 movsd 0 * SIZE(Y), %xmm4 movsd 1 * SIZE(Y), %xmm5 movsd 2 * SIZE(Y), %xmm6 movsd 3 * SIZE(Y), %xmm7 movsd 4 * SIZE(X), %xmm8 mulsd ALPHA, %xmm0 movsd 5 * SIZE(X), %xmm9 mulsd ALPHA, %xmm1 movsd 6 * SIZE(X), %xmm10 mulsd ALPHA, %xmm2 movsd 7 * SIZE(X), %xmm11 mulsd ALPHA, %xmm3 decq %rax jle .L12 ALIGN_3 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif addsd %xmm4, %xmm0 movsd 4 * SIZE(Y), %xmm4 addsd %xmm5, %xmm1 movsd 5 * SIZE(Y), %xmm5 addsd %xmm6, %xmm2 movsd 6 * SIZE(Y), %xmm6 addsd %xmm7, %xmm3 movsd 7 * SIZE(Y), %xmm7 movsd %xmm0, 0 * SIZE(Y) mulsd ALPHA, %xmm8 movsd 8 * SIZE(X), %xmm0 movsd %xmm1, 1 * SIZE(Y) mulsd ALPHA, %xmm9 movsd 9 * SIZE(X), %xmm1 movsd %xmm2, 2 * SIZE(Y) mulsd ALPHA, %xmm10 movsd 10 * SIZE(X), %xmm2 movsd %xmm3, 3 * SIZE(Y) mulsd ALPHA, %xmm11 movsd 11 * SIZE(X), %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif addsd %xmm4, %xmm8 movsd 8 * SIZE(Y), %xmm4 addsd %xmm5, %xmm9 movsd 9 * SIZE(Y), %xmm5 addsd %xmm6, %xmm10 movsd 10 * SIZE(Y), %xmm6 addsd %xmm7, %xmm11 movsd 11 * SIZE(Y), %xmm7 movsd %xmm8, 4 * SIZE(Y) mulsd ALPHA, %xmm0 movsd 12 * SIZE(X), %xmm8 movsd %xmm9, 5 * SIZE(Y) mulsd ALPHA, %xmm1 movsd 13 * SIZE(X), %xmm9 movsd %xmm10, 6 * SIZE(Y) mulsd ALPHA, %xmm2 movsd 14 * SIZE(X), %xmm10 movsd %xmm11, 7 * SIZE(Y) mulsd ALPHA, %xmm3 movsd 15 * SIZE(X), %xmm11 addq $8 * SIZE, Y addq $8 * SIZE, X decq %rax jg .L11 ALIGN_3 .L12: addsd %xmm4, %xmm0 movsd 4 * SIZE(Y), %xmm4 addsd %xmm5, %xmm1 movsd 5 * SIZE(Y), %xmm5 addsd %xmm6, %xmm2 movsd 6 * SIZE(Y), %xmm6 addsd %xmm7, %xmm3 movsd 7 * SIZE(Y), %xmm7 movsd %xmm0, 0 * SIZE(Y) mulsd ALPHA, %xmm8 movsd %xmm1, 1 * SIZE(Y) mulsd ALPHA, %xmm9 movsd %xmm2, 2 * SIZE(Y) mulsd ALPHA, %xmm10 movsd %xmm3, 3 * SIZE(Y) mulsd ALPHA, %xmm11 addsd %xmm4, %xmm8 addsd %xmm5, %xmm9 addsd %xmm6, 
%xmm10 addsd %xmm7, %xmm11 movsd %xmm8, 4 * SIZE(Y) movsd %xmm9, 5 * SIZE(Y) movsd %xmm10, 6 * SIZE(Y) movsd %xmm11, 7 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L13: movq M, %rax andq $4, %rax jle .L15 ALIGN_3 movsd 0 * SIZE(X), %xmm0 movsd 1 * SIZE(X), %xmm1 movsd 2 * SIZE(X), %xmm2 movsd 3 * SIZE(X), %xmm3 movsd 0 * SIZE(Y), %xmm4 mulsd ALPHA, %xmm0 movsd 1 * SIZE(Y), %xmm5 mulsd ALPHA, %xmm1 movsd 2 * SIZE(Y), %xmm6 mulsd ALPHA, %xmm2 movsd 3 * SIZE(Y), %xmm7 mulsd ALPHA, %xmm3 addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 addsd %xmm6, %xmm2 addsd %xmm7, %xmm3 movsd %xmm0, 0 * SIZE(Y) movsd %xmm1, 1 * SIZE(Y) movsd %xmm2, 2 * SIZE(Y) movsd %xmm3, 3 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L15: movq M, %rax andq $2, %rax jle .L16 ALIGN_3 movsd 0 * SIZE(X), %xmm0 movsd 0 * SIZE(Y), %xmm4 movsd 1 * SIZE(X), %xmm1 movsd 1 * SIZE(Y), %xmm5 mulsd ALPHA, %xmm0 mulsd ALPHA, %xmm1 addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 movsd %xmm0, 0 * SIZE(Y) movsd %xmm1, 1 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L16: movq M, %rax andq $1, %rax jle .L19 ALIGN_3 movsd 0 * SIZE(X), %xmm0 mulsd ALPHA, %xmm0 addsd 0 * SIZE(Y), %xmm0 movsd %xmm0, 0 * SIZE(Y) addq $SIZE, Y ALIGN_3 .L19: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 .L20: movq Y, YY movq M, %rax sarq $3, %rax jle .L23 movsd (X), %xmm0 addq INCX, X movsd (X), %xmm1 addq INCX, X movsd (X), %xmm2 addq INCX, X movsd (X), %xmm3 addq INCX, X movsd (Y), %xmm4 addq INCY, Y movsd (Y), %xmm5 addq INCY, Y movsd (Y), %xmm6 addq INCY, Y movsd (Y), %xmm7 addq INCY, Y movsd (X), %xmm8 addq INCX, X mulsd ALPHA, %xmm0 movsd (X), %xmm9 addq INCX, X mulsd ALPHA, %xmm1 movsd (X), %xmm10 addq INCX, X mulsd ALPHA, %xmm2 movsd (X), %xmm11 addq INCX, X mulsd ALPHA, %xmm3 decq %rax jle .L22 ALIGN_3 .L21: addsd %xmm4, %xmm0 movsd (Y), %xmm4 addq INCY, Y addsd %xmm5, %xmm1 movsd (Y), %xmm5 addq INCY, Y addsd %xmm6, %xmm2 movsd (Y), %xmm6 addq INCY, Y addsd %xmm7, %xmm3 movsd (Y), %xmm7 addq INCY, Y movsd %xmm0, (YY) addq INCY, YY movsd (X), %xmm0 addq INCX, X mulsd ALPHA, %xmm8 movsd %xmm1, (YY) addq INCY, YY movsd (X), %xmm1 addq INCX, X mulsd ALPHA, %xmm9 movsd %xmm2, (YY) addq INCY, YY movsd (X), %xmm2 addq INCX, X mulsd ALPHA, %xmm10 movsd %xmm3, (YY) addq INCY, YY movsd (X), %xmm3 addq INCX, X mulsd ALPHA, %xmm11 addsd %xmm4, %xmm8 movsd (Y), %xmm4 addq INCY, Y addsd %xmm5, %xmm9 movsd (Y), %xmm5 addq INCY, Y addsd %xmm6, %xmm10 movsd (Y), %xmm6 addq INCY, Y addsd %xmm7, %xmm11 movsd (Y), %xmm7 addq INCY, Y movsd %xmm8, (YY) addq INCY, YY movsd (X), %xmm8 addq INCX, X mulsd ALPHA, %xmm0 movsd %xmm9, (YY) addq INCY, YY movsd (X), %xmm9 addq INCX, X mulsd ALPHA, %xmm1 movsd %xmm10, (YY) addq INCY, YY movsd (X), %xmm10 addq INCX, X mulsd ALPHA, %xmm2 movsd %xmm11, (YY) addq INCY, YY movsd (X), %xmm11 addq INCX, X mulsd ALPHA, %xmm3 decq %rax jg .L21 ALIGN_3 .L22: addsd %xmm4, %xmm0 movsd (Y), %xmm4 addq INCY, Y addsd %xmm5, %xmm1 movsd (Y), %xmm5 addq INCY, Y addsd %xmm6, %xmm2 movsd (Y), %xmm6 addq INCY, Y addsd %xmm7, %xmm3 movsd (Y), %xmm7 addq INCY, Y movsd %xmm0, (YY) addq INCY, YY mulsd ALPHA, %xmm8 movsd %xmm1, (YY) addq INCY, YY mulsd ALPHA, %xmm9 movsd %xmm2, (YY) addq INCY, YY mulsd ALPHA, %xmm10 movsd %xmm3, (YY) addq INCY, YY mulsd ALPHA, %xmm11 addsd %xmm4, %xmm8 addsd %xmm5, %xmm9 addsd %xmm6, %xmm10 addsd %xmm7, %xmm11 movsd %xmm8, (YY) addq INCY, YY movsd %xmm9, (YY) addq INCY, YY movsd %xmm10, (YY) addq INCY, YY movsd %xmm11, (YY) addq INCY, YY ALIGN_3 .L23: movq M, %rax andq $4, %rax jle .L25 ALIGN_3 movsd (X), %xmm0 addq INCX, 
X movsd (Y), %xmm4 addq INCY, Y movsd (X), %xmm1 addq INCX, X movsd (Y), %xmm5 addq INCY, Y movsd (X), %xmm2 addq INCX, X mulsd ALPHA, %xmm0 movsd (Y), %xmm6 addq INCY, Y mulsd ALPHA, %xmm1 movsd (X), %xmm3 addq INCX, X mulsd ALPHA, %xmm2 movsd (Y), %xmm7 addq INCY, Y mulsd ALPHA, %xmm3 addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 addsd %xmm6, %xmm2 addsd %xmm7, %xmm3 movsd %xmm0, (YY) addq INCY, YY movsd %xmm1, (YY) addq INCY, YY movsd %xmm2, (YY) addq INCY, YY movsd %xmm3, (YY) addq INCY, YY ALIGN_3 .L25: movq M, %rax andq $2, %rax jle .L26 ALIGN_3 movsd (X), %xmm0 addq INCX, X movsd (Y), %xmm4 addq INCY, Y movsd (X), %xmm1 addq INCX, X movsd (Y), %xmm5 addq INCY, Y mulsd ALPHA, %xmm0 mulsd ALPHA, %xmm1 addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 movsd %xmm0, (YY) addq INCY, YY movsd %xmm1, (YY) addq INCY, YY ALIGN_3 .L26: movq M, %rax andq $1, %rax jle .L29 ALIGN_3 movsd (X), %xmm0 mulsd ALPHA, %xmm0 addsd (Y), %xmm0 movsd %xmm0, (YY) addq $SIZE, Y ALIGN_3 .L29: xorq %rax, %rax RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/axpy_sse.S000066400000000000000000000670041313527062700173500ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
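[Editor's note, not part of the original sources] axpy_sse.S, which follows, is the single-precision SSE AXPY kernel. When both increments are one and more than three elements remain, it first peels one or two scalar elements so that Y becomes 16-byte aligned, then chooses among several shuffling main loops depending on how X is aligned relative to Y. The helper below is a hypothetical sketch of that peel step, not OpenBLAS code.

#include <stdint.h>

// Peel scalar iterations until y is 16-byte aligned, returning how many
// elements remain for the aligned vector loop. Illustrative only.
static long peel_to_16_bytes(long n, float alpha, const float **x, float **y)
{
    long done = 0;
    while (done < n && (((uintptr_t)(*y + done)) & 15) != 0) {
        (*y)[done] += alpha * (*x)[done];   // the movss path in the kernel
        done++;
    }
    *x += done;
    *y += done;
    return n - done;
}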
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 #define X ARG4 #define INCX ARG5 #define Y ARG6 #define INCY ARG2 #else #define M ARG1 #define X ARG2 #define INCX ARG3 #define Y ARG4 #define INCY %r10 #endif #define YY %r11 #define ALPHA %xmm15 #include "l1param.h" PROLOGUE PROFCODE #ifndef WINDOWS_ABI #ifndef XDOUBLE movq 8(%rsp), INCY #else movq 24(%rsp), INCY #endif movaps %xmm0, ALPHA #else movq 40(%rsp), X movq 48(%rsp), INCX movq 56(%rsp), Y movq 64(%rsp), INCY #endif SAVEREGISTERS #ifdef WINDOWS_ABI movaps %xmm3, ALPHA #endif shufps $0, ALPHA, ALPHA leaq (, INCX, SIZE), INCX leaq (, INCY, SIZE), INCY testq M, M jle .L19 cmpq $SIZE, INCX jne .L50 cmpq $SIZE, INCY jne .L50 subq $-32 * SIZE, X subq $-32 * SIZE, Y cmpq $3, M jle .L16 testq $SIZE, Y je .L00 movss -32 * SIZE(X), %xmm0 mulss ALPHA, %xmm0 addss -32 * SIZE(Y), %xmm0 movss %xmm0, -32 * SIZE(Y) addq $1 * SIZE, X addq $1 * SIZE, Y decq M jle .L19 ALIGN_3 .L00: testq $SIZE * 2, Y je .L10 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm4 mulps ALPHA, %xmm0 addps %xmm4, %xmm0 movsd %xmm0, -32 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y subq $2, M jle .L19 ALIGN_3 .L10: testq $SIZE * 3, X jne .L20 movq M, %rax sarq $5, %rax jle .L13 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 decq %rax jle .L12 ALIGN_4 .L11: movaps -16 * SIZE(X), %xmm4 movaps -12 * SIZE(X), %xmm5 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -8 * SIZE(X), %xmm6 movaps -4 * SIZE(X), %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps 0 * SIZE(X), %xmm0 movaps 4 * SIZE(X), %xmm1 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif mulps ALPHA, %xmm4 addps -16 * SIZE(Y), %xmm4 movaps %xmm4, -16 * SIZE(Y) mulps ALPHA, %xmm5 addps -12 * SIZE(Y), %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 8 * SIZE(X), %xmm2 movaps 12 * SIZE(X), %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif mulps ALPHA, %xmm6 addps -8 * SIZE(Y), %xmm6 movaps %xmm6, -8 * SIZE(Y) mulps ALPHA, %xmm7 addps -4 * SIZE(Y), %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L11 ALIGN_3 .L12: movaps -16 * SIZE(X), %xmm4 movaps -12 * SIZE(X), %xmm5 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -8 * SIZE(X), %xmm6 movaps -4 * SIZE(X), %xmm7 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) mulps ALPHA, %xmm4 addps -16 * SIZE(Y), %xmm4 movaps %xmm4, -16 * SIZE(Y) mulps ALPHA, %xmm5 addps -12 * SIZE(Y), %xmm5 movaps %xmm5, -12 * SIZE(Y) mulps ALPHA, %xmm6 addps -8 * SIZE(Y), %xmm6 movaps %xmm6, -8 * SIZE(Y) mulps ALPHA, %xmm7 addps -4 * SIZE(Y), %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L13: movq M, %rax andq $16, %rax jle .L14 ALIGN_3 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movaps -20 
* SIZE(X), %xmm3 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, -24 * SIZE(Y) movaps %xmm3, -20 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L14: movq M, %rax andq $8, %rax jle .L15 ALIGN_3 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L15: movq M, %rax andq $4, %rax jle .L16 ALIGN_3 movaps -32 * SIZE(X), %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L16: movq M, %rax andq $2, %rax jle .L17 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm4 mulps ALPHA, %xmm0 addps %xmm4, %xmm0 movsd %xmm0, -32 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L17: movq M, %rax andq $1, %rax jle .L19 ALIGN_3 movss -32 * SIZE(X), %xmm0 mulss ALPHA, %xmm0 addss -32 * SIZE(Y), %xmm0 movss %xmm0, -32 * SIZE(Y) ALIGN_3 .L19: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 .L20: #ifdef ALIGNED_ACCESS testq $SIZE, X jne .L30 movhps -32 * SIZE(X), %xmm0 movq M, %rax sarq $5, %rax jle .L23 movaps -30 * SIZE(X), %xmm1 movaps -26 * SIZE(X), %xmm2 movaps -22 * SIZE(X), %xmm3 movaps -18 * SIZE(X), %xmm4 decq %rax jle .L22 ALIGN_4 .L21: movaps -14 * SIZE(X), %xmm5 movaps -10 * SIZE(X), %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif SHUFPD_1 %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) SHUFPD_1 %xmm2, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -6 * SIZE(X), %xmm7 movaps -2 * SIZE(X), %xmm0 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif SHUFPD_1 %xmm3, %xmm2 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) SHUFPD_1 %xmm4, %xmm3 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps 2 * SIZE(X), %xmm1 movaps 6 * SIZE(X), %xmm2 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif SHUFPD_1 %xmm5, %xmm4 mulps ALPHA, %xmm4 addps -16 * SIZE(Y), %xmm4 movaps %xmm4, -16 * SIZE(Y) SHUFPD_1 %xmm6, %xmm5 mulps ALPHA, %xmm5 addps -12 * SIZE(Y), %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 10 * SIZE(X), %xmm3 movaps 14 * SIZE(X), %xmm4 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif SHUFPD_1 %xmm7, %xmm6 mulps ALPHA, %xmm6 addps -8 * SIZE(Y), %xmm6 movaps %xmm6, -8 * SIZE(Y) SHUFPD_1 %xmm0, %xmm7 mulps ALPHA, %xmm7 addps -4 * SIZE(Y), %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L22: movaps -14 * SIZE(X), %xmm5 movaps -10 * SIZE(X), %xmm6 SHUFPD_1 %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) SHUFPD_1 %xmm2, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -6 * SIZE(X), %xmm7 movaps -2 * SIZE(X), %xmm0 SHUFPD_1 %xmm3, %xmm2 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) SHUFPD_1 %xmm4, %xmm3 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) SHUFPD_1 %xmm5, %xmm4 mulps ALPHA, %xmm4 addps -16 * SIZE(Y), %xmm4 movaps %xmm4, -16 * SIZE(Y) SHUFPD_1 %xmm6, %xmm5 mulps ALPHA, %xmm5 addps -12 
* SIZE(Y), %xmm5 movaps %xmm5, -12 * SIZE(Y) SHUFPD_1 %xmm7, %xmm6 mulps ALPHA, %xmm6 addps -8 * SIZE(Y), %xmm6 movaps %xmm6, -8 * SIZE(Y) SHUFPD_1 %xmm0, %xmm7 mulps ALPHA, %xmm7 addps -4 * SIZE(Y), %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L23: movq M, %rax andq $16, %rax jle .L24 ALIGN_3 movaps -30 * SIZE(X), %xmm1 movaps -26 * SIZE(X), %xmm2 movaps -22 * SIZE(X), %xmm3 movaps -18 * SIZE(X), %xmm4 SHUFPD_1 %xmm1, %xmm0 SHUFPD_1 %xmm2, %xmm1 SHUFPD_1 %xmm3, %xmm2 SHUFPD_1 %xmm4, %xmm3 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, -24 * SIZE(Y) movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, %xmm0 addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L24: movq M, %rax andq $8, %rax jle .L25 ALIGN_3 movaps -30 * SIZE(X), %xmm1 movaps -26 * SIZE(X), %xmm2 SHUFPD_1 %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 SHUFPD_1 %xmm2, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L25: movq M, %rax andq $4, %rax jle .L26 ALIGN_3 movaps -30 * SIZE(X), %xmm1 SHUFPD_1 %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L26: movq M, %rax andq $2, %rax jle .L27 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm4 mulps ALPHA, %xmm0 addps %xmm4, %xmm0 movsd %xmm0, -32 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L27: movq M, %rax andq $1, %rax jle .L29 ALIGN_3 movss -32 * SIZE(X), %xmm0 mulss ALPHA, %xmm0 addss -32 * SIZE(Y), %xmm0 movss %xmm0, -32 * SIZE(Y) addq $SIZE, Y ALIGN_3 .L29: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 .L30: testq $2 * SIZE, X jne .L40 movaps -33 * SIZE(X), %xmm0 movq M, %rax sarq $5, %rax jle .L33 movaps -29 * SIZE(X), %xmm1 movaps -25 * SIZE(X), %xmm2 movaps -21 * SIZE(X), %xmm3 movaps -17 * SIZE(X), %xmm4 decq %rax jle .L32 ALIGN_4 .L31: movaps -13 * SIZE(X), %xmm5 movaps -9 * SIZE(X), %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -5 * SIZE(X), %xmm7 movaps -1 * SIZE(X), %xmm0 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 SHUFPS_39 %xmm3, %xmm3 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps 3 * SIZE(X), %xmm1 movaps 7 * SIZE(X), %xmm2 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm5, %xmm4 SHUFPS_39 %xmm4, %xmm4 mulps ALPHA, %xmm4 addps -16 * SIZE(Y), %xmm4 movaps %xmm4, -16 * SIZE(Y) movss %xmm6, %xmm5 SHUFPS_39 %xmm5, %xmm5 mulps ALPHA, %xmm5 addps -12 * SIZE(Y), %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 11 * SIZE(X), %xmm3 movaps 15 * SIZE(X), %xmm4 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm7, %xmm6 SHUFPS_39 %xmm6, %xmm6 mulps ALPHA, %xmm6 addps -8 * SIZE(Y), %xmm6 movaps %xmm6, -8 * SIZE(Y) movss %xmm0, %xmm7 SHUFPS_39 %xmm7, %xmm7 mulps 
ALPHA, %xmm7 addps -4 * SIZE(Y), %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L31 ALIGN_3 .L32: movaps -13 * SIZE(X), %xmm5 movaps -9 * SIZE(X), %xmm6 movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -5 * SIZE(X), %xmm7 movaps -1 * SIZE(X), %xmm0 movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 SHUFPS_39 %xmm3, %xmm3 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movss %xmm5, %xmm4 SHUFPS_39 %xmm4, %xmm4 mulps ALPHA, %xmm4 addps -16 * SIZE(Y), %xmm4 movaps %xmm4, -16 * SIZE(Y) movss %xmm6, %xmm5 SHUFPS_39 %xmm5, %xmm5 mulps ALPHA, %xmm5 addps -12 * SIZE(Y), %xmm5 movaps %xmm5, -12 * SIZE(Y) movss %xmm7, %xmm6 SHUFPS_39 %xmm6, %xmm6 mulps ALPHA, %xmm6 addps -8 * SIZE(Y), %xmm6 movaps %xmm6, -8 * SIZE(Y) movss %xmm0, %xmm7 SHUFPS_39 %xmm7, %xmm7 mulps ALPHA, %xmm7 addps -4 * SIZE(Y), %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L33: movq M, %rax andq $16, %rax jle .L34 ALIGN_3 movaps -29 * SIZE(X), %xmm1 movaps -25 * SIZE(X), %xmm2 movaps -21 * SIZE(X), %xmm3 movaps -17 * SIZE(X), %xmm4 movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movss %xmm4, %xmm3 SHUFPS_39 %xmm3, %xmm3 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, -24 * SIZE(Y) movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, %xmm0 addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L34: movq M, %rax andq $8, %rax jle .L35 ALIGN_3 movaps -29 * SIZE(X), %xmm1 movaps -25 * SIZE(X), %xmm2 movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L35: movq M, %rax andq $4, %rax jle .L36 ALIGN_3 movaps -29 * SIZE(X), %xmm1 movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L36: movq M, %rax andq $2, %rax jle .L37 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm4 mulps ALPHA, %xmm0 addps %xmm4, %xmm0 movsd %xmm0, -32 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L37: movq M, %rax andq $1, %rax jle .L39 ALIGN_3 movss -32 * SIZE(X), %xmm0 mulss ALPHA, %xmm0 addss -32 * SIZE(Y), %xmm0 movss %xmm0, -32 * SIZE(Y) addq $SIZE, Y ALIGN_3 .L39: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 .L40: movaps -35 * SIZE(X), %xmm0 movq M, %rax sarq $5, %rax jle .L43 movaps -31 * SIZE(X), %xmm1 movaps -27 * SIZE(X), %xmm2 movaps -23 * SIZE(X), %xmm3 movaps -19 * SIZE(X), %xmm4 decq %rax jle .L42 ALIGN_4 .L41: movaps -15 * SIZE(X), %xmm5 movaps -11 * SIZE(X), %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, 
-28 * SIZE(Y) movaps -7 * SIZE(X), %xmm7 movaps -3 * SIZE(X), %xmm0 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps 1 * SIZE(X), %xmm1 movaps 5 * SIZE(X), %xmm2 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps ALPHA, %xmm4 addps -16 * SIZE(Y), %xmm4 movaps %xmm4, -16 * SIZE(Y) movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 mulps ALPHA, %xmm5 addps -12 * SIZE(Y), %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 9 * SIZE(X), %xmm3 movaps 13 * SIZE(X), %xmm4 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 mulps ALPHA, %xmm6 addps -8 * SIZE(Y), %xmm6 movaps %xmm6, -8 * SIZE(Y) movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 mulps ALPHA, %xmm7 addps -4 * SIZE(Y), %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L41 ALIGN_3 .L42: movaps -15 * SIZE(X), %xmm5 movaps -11 * SIZE(X), %xmm6 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -7 * SIZE(X), %xmm7 movaps -3 * SIZE(X), %xmm0 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps ALPHA, %xmm4 addps -16 * SIZE(Y), %xmm4 movaps %xmm4, -16 * SIZE(Y) movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 mulps ALPHA, %xmm5 addps -12 * SIZE(Y), %xmm5 movaps %xmm5, -12 * SIZE(Y) movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 mulps ALPHA, %xmm6 addps -8 * SIZE(Y), %xmm6 movaps %xmm6, -8 * SIZE(Y) movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 mulps ALPHA, %xmm7 addps -4 * SIZE(Y), %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L43: movq M, %rax andq $16, %rax jle .L44 ALIGN_3 movaps -31 * SIZE(X), %xmm1 movaps -27 * SIZE(X), %xmm2 movaps -23 * SIZE(X), %xmm3 movaps -19 * SIZE(X), %xmm4 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, -24 * SIZE(Y) movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, %xmm0 addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L44: movq M, %rax andq $8, %rax jle .L45 ALIGN_3 movaps -31 * SIZE(X), %xmm1 movaps -27 * SIZE(X), %xmm2 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L45: movq M, %rax andq $4, %rax jle .L46 ALIGN_3 movaps -31 * SIZE(X), %xmm1 movss %xmm1, %xmm0 
shufps $0x93, %xmm1, %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L46: movq M, %rax andq $2, %rax jle .L47 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm4 mulps ALPHA, %xmm0 addps %xmm4, %xmm0 movsd %xmm0, -32 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L47: movq M, %rax andq $1, %rax jle .L49 ALIGN_3 movss -32 * SIZE(X), %xmm0 mulss ALPHA, %xmm0 addss -32 * SIZE(Y), %xmm0 movss %xmm0, -32 * SIZE(Y) addq $SIZE, Y ALIGN_3 .L49: xorq %rax,%rax RESTOREREGISTERS ret #else movq M, %rax sarq $5, %rax jle .L23 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 movsd -28 * SIZE(X), %xmm1 movhps -26 * SIZE(X), %xmm1 movsd -24 * SIZE(X), %xmm2 movhps -22 * SIZE(X), %xmm2 movsd -20 * SIZE(X), %xmm3 movhps -18 * SIZE(X), %xmm3 decq %rax jle .L22 ALIGN_4 .L21: movsd -16 * SIZE(X), %xmm4 movhps -14 * SIZE(X), %xmm4 movsd -12 * SIZE(X), %xmm5 movhps -10 * SIZE(X), %xmm5 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movsd -8 * SIZE(X), %xmm6 movhps -6 * SIZE(X), %xmm6 movsd -4 * SIZE(X), %xmm7 movhps -2 * SIZE(X), %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movsd 0 * SIZE(X), %xmm0 movhps 2 * SIZE(X), %xmm0 movsd 4 * SIZE(X), %xmm1 movhps 6 * SIZE(X), %xmm1 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif mulps ALPHA, %xmm4 addps -16 * SIZE(Y), %xmm4 movaps %xmm4, -16 * SIZE(Y) mulps ALPHA, %xmm5 addps -12 * SIZE(Y), %xmm5 movaps %xmm5, -12 * SIZE(Y) movsd 8 * SIZE(X), %xmm2 movhps 10 * SIZE(X), %xmm2 movsd 12 * SIZE(X), %xmm3 movhps 14 * SIZE(X), %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif mulps ALPHA, %xmm6 addps -8 * SIZE(Y), %xmm6 movaps %xmm6, -8 * SIZE(Y) mulps ALPHA, %xmm7 addps -4 * SIZE(Y), %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L22: movsd -16 * SIZE(X), %xmm4 movhps -14 * SIZE(X), %xmm4 movsd -12 * SIZE(X), %xmm5 movhps -10 * SIZE(X), %xmm5 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movsd -8 * SIZE(X), %xmm6 movhps -6 * SIZE(X), %xmm6 movsd -4 * SIZE(X), %xmm7 movhps -2 * SIZE(X), %xmm7 mulps ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) mulps ALPHA, %xmm4 addps -16 * SIZE(Y), %xmm4 movaps %xmm4, -16 * SIZE(Y) mulps ALPHA, %xmm5 addps -12 * SIZE(Y), %xmm5 movaps %xmm5, -12 * SIZE(Y) mulps ALPHA, %xmm6 addps -8 * SIZE(Y), %xmm6 movaps %xmm6, -8 * SIZE(Y) mulps ALPHA, %xmm7 addps -4 * SIZE(Y), %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L23: movq M, %rax andq $16, %rax jle .L24 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 movsd -28 * SIZE(X), %xmm1 movhps -26 * SIZE(X), %xmm1 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movsd -24 * SIZE(X), %xmm2 movhps -22 * SIZE(X), %xmm2 movsd -20 * SIZE(X), %xmm3 movhps -18 * SIZE(X), %xmm3 mulps 
ALPHA, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) mulps ALPHA, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L24: movq M, %rax andq $8, %rax jle .L25 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 movsd -28 * SIZE(X), %xmm1 movhps -26 * SIZE(X), %xmm1 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 mulps ALPHA, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L25: movq M, %rax andq $4, %rax jle .L26 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 mulps ALPHA, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L26: movq M, %rax andq $2, %rax jle .L27 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm4 mulps ALPHA, %xmm0 addps %xmm4, %xmm0 movsd %xmm0, -32 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L27: movq M, %rax andq $1, %rax jle .L29 ALIGN_3 movss -32 * SIZE(X), %xmm0 mulss ALPHA, %xmm0 addss -32 * SIZE(Y), %xmm0 movss %xmm0, -32 * SIZE(Y) addq $SIZE, Y ALIGN_3 .L29: xorq %rax,%rax RESTOREREGISTERS ret #endif ALIGN_3 .L50: movq M, %rax movq Y, YY //If incx==0 || incy==0, avoid unloop. cmpq $0, INCX je .L56 cmpq $0, INCY je .L56 sarq $3, %rax jle .L55 ALIGN_3 .L51: movss (X), %xmm0 addq INCX, X mulss ALPHA, %xmm0 movss (YY), %xmm6 addq INCY, YY addss %xmm6, %xmm0 movss (X), %xmm1 addq INCX, X mulss ALPHA, %xmm1 movss (YY), %xmm6 addq INCY, YY addss %xmm6, %xmm1 movss (X), %xmm2 addq INCX, X mulss ALPHA, %xmm2 movss (YY), %xmm6 addq INCY, YY addss %xmm6, %xmm2 movss (X), %xmm3 addq INCX, X mulss ALPHA, %xmm3 movss (YY), %xmm6 addq INCY, YY addss %xmm6, %xmm3 movss %xmm0, (Y) addq INCY, Y movss %xmm1, (Y) addq INCY, Y movss %xmm2, (Y) addq INCY, Y movss %xmm3, (Y) addq INCY, Y movss (X), %xmm0 addq INCX, X mulss ALPHA, %xmm0 movss (YY), %xmm6 addq INCY, YY addss %xmm6, %xmm0 movss (X), %xmm1 addq INCX, X mulss ALPHA, %xmm1 movss (YY), %xmm6 addq INCY, YY addss %xmm6, %xmm1 movss (X), %xmm2 addq INCX, X mulss ALPHA, %xmm2 movss (YY), %xmm6 addq INCY, YY addss %xmm6, %xmm2 movss (X), %xmm3 addq INCX, X mulss ALPHA, %xmm3 movss (YY), %xmm6 addq INCY, YY addss %xmm6, %xmm3 movss %xmm0, (Y) addq INCY, Y movss %xmm1, (Y) addq INCY, Y movss %xmm2, (Y) addq INCY, Y movss %xmm3, (Y) addq INCY, Y decq %rax jg .L51 ALIGN_3 .L55: movq M, %rax andq $7, %rax jle .L59 ALIGN_3 .L56: movss (X), %xmm0 addq INCX, X mulss ALPHA, %xmm0 movss (Y), %xmm6 addss %xmm6, %xmm0 movss %xmm0, (Y) addq INCY, Y decq %rax jg .L56 ALIGN_3 .L59: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/axpy_sse2.S000066400000000000000000000414461313527062700174340ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
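[Editor's note, not part of the original sources] axpy_sse2.S below is the double-precision counterpart. Before entering its loops it broadcasts the scalar alpha into both halves of an XMM register with unpcklpd ALPHA, ALPHA (the single-precision file uses shufps $0 to fill four lanes), so each mulpd scales two elements at once. The intrinsics snippet below shows the same broadcast; it is an illustration, not code from the tree.

#include <emmintrin.h>

// Duplicate alpha into both 64-bit lanes, as unpcklpd ALPHA, ALPHA does.
static __m128d broadcast_alpha(double alpha)
{
    __m128d a = _mm_load_sd(&alpha);   // alpha in the low lane, high lane zero
    return _mm_unpacklo_pd(a, a);      // alpha in both lanes
}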
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 #define X ARG4 #define INCX ARG5 #define Y ARG6 #define INCY ARG2 #else #define M ARG1 #define X ARG2 #define INCX ARG3 #define Y ARG4 #define INCY %r10 #endif #define YY %r11 #define ALPHA %xmm15 #include "l1param.h" PROLOGUE PROFCODE #ifndef WINDOWS_ABI #ifndef XDOUBLE movq 8(%rsp), INCY #else movq 24(%rsp), INCY #endif movaps %xmm0, ALPHA #else movq 40(%rsp), X movq 48(%rsp), INCX movq 56(%rsp), Y movq 64(%rsp), INCY #endif SAVEREGISTERS #ifdef WINDOWS_ABI movaps %xmm3, ALPHA #endif unpcklpd ALPHA, ALPHA leaq (, INCX, SIZE), INCX leaq (, INCY, SIZE), INCY testq M, M jle .L47 cmpq $SIZE, INCX jne .L40 cmpq $SIZE, INCY jne .L40 testq $SIZE, Y je .L10 movsd (X), %xmm0 mulsd ALPHA, %xmm0 addsd (Y), %xmm0 movsd %xmm0, (Y) addq $1 * SIZE, X addq $1 * SIZE, Y decq M jle .L19 ALIGN_4 .L10: subq $-16 * SIZE, X subq $-16 * SIZE, Y testq $SIZE, X jne .L20 movq M, %rax sarq $4, %rax jle .L13 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 decq %rax jle .L12 ALIGN_3 .L11: movaps -8 * SIZE(X), %xmm4 movaps -6 * SIZE(X), %xmm5 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) mulpd ALPHA, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movaps -4 * SIZE(X), %xmm6 movaps -2 * SIZE(X), %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif mulpd ALPHA, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) mulpd ALPHA, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps 0 * SIZE(X), %xmm0 movaps 2 * SIZE(X), %xmm1 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif mulpd ALPHA, %xmm4 addpd -8 * SIZE(Y), %xmm4 movaps %xmm4, -8 * SIZE(Y) mulpd ALPHA, %xmm5 addpd -6 * SIZE(Y), %xmm5 movaps %xmm5, -6 * SIZE(Y) movaps 4 * SIZE(X), %xmm2 movaps 6 * SIZE(X), %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif mulpd ALPHA, %xmm6 addpd -4 * SIZE(Y), %xmm6 movaps %xmm6, -4 * SIZE(Y) mulpd ALPHA, %xmm7 addpd -2 * SIZE(Y), %xmm7 movaps %xmm7, -2 * SIZE(Y) subq $-16 * SIZE, Y subq $-16 * SIZE, X decq %rax jg .L11 ALIGN_3 .L12: movaps -8 * SIZE(X), %xmm4 movaps -6 * SIZE(X), %xmm5 
mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) mulpd ALPHA, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movaps -4 * SIZE(X), %xmm6 movaps -2 * SIZE(X), %xmm7 mulpd ALPHA, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) mulpd ALPHA, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) mulpd ALPHA, %xmm4 addpd -8 * SIZE(Y), %xmm4 movaps %xmm4, -8 * SIZE(Y) mulpd ALPHA, %xmm5 addpd -6 * SIZE(Y), %xmm5 movaps %xmm5, -6 * SIZE(Y) mulpd ALPHA, %xmm6 addpd -4 * SIZE(Y), %xmm6 movaps %xmm6, -4 * SIZE(Y) mulpd ALPHA, %xmm7 addpd -2 * SIZE(Y), %xmm7 movaps %xmm7, -2 * SIZE(Y) subq $-16 * SIZE, Y subq $-16 * SIZE, X ALIGN_3 .L13: movq M, %rax andq $8, %rax jle .L14 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 mulpd ALPHA, %xmm1 addpd -14 * SIZE(Y), %xmm1 mulpd ALPHA, %xmm2 addpd -12 * SIZE(Y), %xmm2 mulpd ALPHA, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) movaps %xmm2, -12 * SIZE(Y) movaps %xmm3, -10 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L14: movq M, %rax andq $4, %rax jle .L15 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 mulpd ALPHA, %xmm0 mulpd ALPHA, %xmm1 addpd -16 * SIZE(Y), %xmm0 addpd -14 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L15: movq M, %rax andq $2, %rax jle .L16 ALIGN_3 movaps -16 * SIZE(X), %xmm0 mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L16: movq M, %rax andq $1, %rax jle .L19 ALIGN_3 movsd -16 * SIZE(X), %xmm0 mulsd ALPHA, %xmm0 addsd -16 * SIZE(Y), %xmm0 movsd %xmm0, -16 * SIZE(Y) ALIGN_3 .L19: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 .L20: #ifdef ALIGNED_ACCESS movhps -16 * SIZE(X), %xmm0 movq M, %rax sarq $4, %rax jle .L23 movaps -15 * SIZE(X), %xmm1 movaps -13 * SIZE(X), %xmm2 movaps -11 * SIZE(X), %xmm3 decq %rax jle .L22 ALIGN_4 .L21: movaps -9 * SIZE(X), %xmm4 movaps -7 * SIZE(X), %xmm5 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif SHUFPD_1 %xmm1, %xmm0 mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm2, %xmm1 mulpd ALPHA, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movaps -5 * SIZE(X), %xmm6 movaps -3 * SIZE(X), %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif SHUFPD_1 %xmm3, %xmm2 mulpd ALPHA, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) SHUFPD_1 %xmm4, %xmm3 mulpd ALPHA, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps -1 * SIZE(X), %xmm0 movaps 1 * SIZE(X), %xmm1 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif SHUFPD_1 %xmm5, %xmm4 mulpd ALPHA, %xmm4 addpd -8 * SIZE(Y), %xmm4 movaps %xmm4, -8 * SIZE(Y) SHUFPD_1 %xmm6, %xmm5 mulpd ALPHA, %xmm5 addpd -6 * SIZE(Y), %xmm5 movaps %xmm5, -6 * SIZE(Y) movaps 3 * SIZE(X), %xmm2 movaps 5 * SIZE(X), %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif SHUFPD_1 %xmm7, %xmm6 mulpd ALPHA, %xmm6 addpd -4 * SIZE(Y), %xmm6 movaps %xmm6, -4 * SIZE(Y) SHUFPD_1 %xmm0, %xmm7 mulpd ALPHA, %xmm7 addpd -2 * SIZE(Y), %xmm7 movaps %xmm7, -2 * SIZE(Y) subq $-16 * SIZE, X subq $-16 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L22: movaps -9 * SIZE(X), %xmm4 movaps -7 * SIZE(X), %xmm5 SHUFPD_1 %xmm1, %xmm0 mulpd 
ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -1 * SIZE(X), %xmm0 SHUFPD_1 %xmm2, %xmm1 mulpd ALPHA, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movaps -5 * SIZE(X), %xmm6 movaps -3 * SIZE(X), %xmm7 SHUFPD_1 %xmm3, %xmm2 mulpd ALPHA, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) SHUFPD_1 %xmm4, %xmm3 mulpd ALPHA, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) SHUFPD_1 %xmm5, %xmm4 mulpd ALPHA, %xmm4 addpd -8 * SIZE(Y), %xmm4 movaps %xmm4, -8 * SIZE(Y) SHUFPD_1 %xmm6, %xmm5 mulpd ALPHA, %xmm5 addpd -6 * SIZE(Y), %xmm5 movaps %xmm5, -6 * SIZE(Y) SHUFPD_1 %xmm7, %xmm6 mulpd ALPHA, %xmm6 addpd -4 * SIZE(Y), %xmm6 movaps %xmm6, -4 * SIZE(Y) SHUFPD_1 %xmm0, %xmm7 mulpd ALPHA, %xmm7 addpd -2 * SIZE(Y), %xmm7 movaps %xmm7, -2 * SIZE(Y) subq $-16 * SIZE, X subq $-16 * SIZE, Y ALIGN_3 .L23: movq M, %rax andq $8, %rax jle .L24 ALIGN_3 movaps -15 * SIZE(X), %xmm1 movaps -13 * SIZE(X), %xmm2 movaps -11 * SIZE(X), %xmm3 movaps -9 * SIZE(X), %xmm8 SHUFPD_1 %xmm1, %xmm0 mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm2, %xmm1 mulpd ALPHA, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) SHUFPD_1 %xmm3, %xmm2 mulpd ALPHA, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) SHUFPD_1 %xmm8, %xmm3 mulpd ALPHA, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps %xmm8, %xmm0 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L24: movq M, %rax andq $4, %rax jle .L25 ALIGN_3 movaps -15 * SIZE(X), %xmm1 movaps -13 * SIZE(X), %xmm2 SHUFPD_1 %xmm1, %xmm0 SHUFPD_1 %xmm2, %xmm1 mulpd ALPHA, %xmm0 mulpd ALPHA, %xmm1 addpd -16 * SIZE(Y), %xmm0 addpd -14 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) movaps %xmm2, %xmm0 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L25: movq M, %rax andq $2, %rax jle .L26 ALIGN_3 movaps -15 * SIZE(X), %xmm1 SHUFPD_1 %xmm1, %xmm0 mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L26: movq M, %rax andq $1, %rax jle .L29 ALIGN_3 movsd -16 * SIZE(X), %xmm0 mulsd ALPHA, %xmm0 addsd -16 * SIZE(Y), %xmm0 movsd %xmm0, -16 * SIZE(Y) ALIGN_3 .L29: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 #else movq M, %rax sarq $4, %rax jle .L23 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 movsd -14 * SIZE(X), %xmm1 movhps -13 * SIZE(X), %xmm1 movsd -12 * SIZE(X), %xmm2 movhps -11 * SIZE(X), %xmm2 movsd -10 * SIZE(X), %xmm3 movhps -9 * SIZE(X), %xmm3 decq %rax jle .L22 ALIGN_3 .L21: movsd -8 * SIZE(X), %xmm4 movhps -7 * SIZE(X), %xmm4 movsd -6 * SIZE(X), %xmm5 movhps -5 * SIZE(X), %xmm5 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) mulpd ALPHA, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movsd -4 * SIZE(X), %xmm6 movhps -3 * SIZE(X), %xmm6 movsd -2 * SIZE(X), %xmm7 movhps -1 * SIZE(X), %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif mulpd ALPHA, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) mulpd ALPHA, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 movsd 2 * SIZE(X), %xmm1 movhps 3 * SIZE(X), %xmm1 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif mulpd ALPHA, %xmm4 addpd -8 * SIZE(Y), %xmm4 movaps %xmm4, -8 * SIZE(Y) mulpd ALPHA, %xmm5 addpd -6 * SIZE(Y), %xmm5 movaps %xmm5, -6 * SIZE(Y) movsd 4 * 
SIZE(X), %xmm2 movhps 5 * SIZE(X), %xmm2 movsd 6 * SIZE(X), %xmm3 movhps 7 * SIZE(X), %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif mulpd ALPHA, %xmm6 addpd -4 * SIZE(Y), %xmm6 movaps %xmm6, -4 * SIZE(Y) mulpd ALPHA, %xmm7 addpd -2 * SIZE(Y), %xmm7 movaps %xmm7, -2 * SIZE(Y) subq $-16 * SIZE, Y subq $-16 * SIZE, X decq %rax jg .L21 ALIGN_3 .L22: movsd -8 * SIZE(X), %xmm4 movhps -7 * SIZE(X), %xmm4 movsd -6 * SIZE(X), %xmm5 movhps -5 * SIZE(X), %xmm5 mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) mulpd ALPHA, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movsd -4 * SIZE(X), %xmm6 movhps -3 * SIZE(X), %xmm6 movsd -2 * SIZE(X), %xmm7 movhps -1 * SIZE(X), %xmm7 mulpd ALPHA, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) mulpd ALPHA, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) mulpd ALPHA, %xmm4 addpd -8 * SIZE(Y), %xmm4 movaps %xmm4, -8 * SIZE(Y) mulpd ALPHA, %xmm5 addpd -6 * SIZE(Y), %xmm5 movaps %xmm5, -6 * SIZE(Y) mulpd ALPHA, %xmm6 addpd -4 * SIZE(Y), %xmm6 movaps %xmm6, -4 * SIZE(Y) mulpd ALPHA, %xmm7 addpd -2 * SIZE(Y), %xmm7 movaps %xmm7, -2 * SIZE(Y) subq $-16 * SIZE, Y subq $-16 * SIZE, X ALIGN_3 .L23: movq M, %rax andq $8, %rax jle .L24 ALIGN_3 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 movsd -14 * SIZE(X), %xmm1 movhps -13 * SIZE(X), %xmm1 movsd -12 * SIZE(X), %xmm2 movhps -11 * SIZE(X), %xmm2 movsd -10 * SIZE(X), %xmm3 movhps -9 * SIZE(X), %xmm3 mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 mulpd ALPHA, %xmm1 addpd -14 * SIZE(Y), %xmm1 mulpd ALPHA, %xmm2 addpd -12 * SIZE(Y), %xmm2 mulpd ALPHA, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) movaps %xmm2, -12 * SIZE(Y) movaps %xmm3, -10 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L24: movq M, %rax andq $4, %rax jle .L25 ALIGN_3 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 movsd -14 * SIZE(X), %xmm1 movhps -13 * SIZE(X), %xmm1 mulpd ALPHA, %xmm0 mulpd ALPHA, %xmm1 addpd -16 * SIZE(Y), %xmm0 addpd -14 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L25: movq M, %rax andq $2, %rax jle .L26 ALIGN_3 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 mulpd ALPHA, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L26: movq M, %rax andq $1, %rax jle .L29 ALIGN_3 movsd -16 * SIZE(X), %xmm0 mulsd ALPHA, %xmm0 addsd -16 * SIZE(Y), %xmm0 movsd %xmm0, -16 * SIZE(Y) ALIGN_3 .L29: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 #endif .L40: movq Y, YY movq M, %rax //If incx==0 || incy==0, avoid unloop. 
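// (i.e. skip the 8-way unrolled strided loop at .L41 below and fall through to the scalar loop at .L46, which also handles the remaining tail elements)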
cmpq $0, INCX je .L46 cmpq $0, INCY je .L46 sarq $3, %rax jle .L45 ALIGN_3 .L41: movsd 0 * SIZE(X), %xmm0 addq INCX, X movhpd 0 * SIZE(X), %xmm0 addq INCX, X mulpd ALPHA, %xmm0 movsd 0 * SIZE(YY), %xmm6 addq INCY, YY movhpd 0 * SIZE(YY), %xmm6 addq INCY, YY addpd %xmm6, %xmm0 movsd 0 * SIZE(X), %xmm1 addq INCX, X movhpd 0 * SIZE(X), %xmm1 addq INCX, X mulpd ALPHA, %xmm1 movsd 0 * SIZE(YY), %xmm6 addq INCY, YY movhpd 0 * SIZE(YY), %xmm6 addq INCY, YY addpd %xmm6, %xmm1 movsd 0 * SIZE(X), %xmm2 addq INCX, X movhpd 0 * SIZE(X), %xmm2 addq INCX, X mulpd ALPHA, %xmm2 movsd 0 * SIZE(YY), %xmm6 addq INCY, YY movhpd 0 * SIZE(YY), %xmm6 addq INCY, YY addpd %xmm6, %xmm2 movsd 0 * SIZE(X), %xmm3 addq INCX, X movhpd 0 * SIZE(X), %xmm3 addq INCX, X mulpd ALPHA, %xmm3 movsd 0 * SIZE(YY), %xmm6 addq INCY, YY movhpd 0 * SIZE(YY), %xmm6 addq INCY, YY addpd %xmm6, %xmm3 movsd %xmm0, 0 * SIZE(Y) addq INCY, Y movhpd %xmm0, 0 * SIZE(Y) addq INCY, Y movsd %xmm1, 0 * SIZE(Y) addq INCY, Y movhpd %xmm1, 0 * SIZE(Y) addq INCY, Y movsd %xmm2, 0 * SIZE(Y) addq INCY, Y movhpd %xmm2, 0 * SIZE(Y) addq INCY, Y movsd %xmm3, 0 * SIZE(Y) addq INCY, Y movhpd %xmm3, 0 * SIZE(Y) addq INCY, Y decq %rax jg .L41 ALIGN_3 .L45: movq M, %rax andq $7, %rax jle .L47 ALIGN_3 .L46: movsd (X), %xmm0 addq INCX, X mulsd %xmm15, %xmm0 addsd (Y), %xmm0 movsd %xmm0, (Y) addq INCY, Y decq %rax jg .L46 ALIGN_3 .L47: xorq %rax, %rax RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/builtin_stinit.S000066400000000000000000000055241313527062700205540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" PROLOGUE PROFCODE cmpq $4096, %rax jle .L999 ALIGN_3 .L01: subq $4096, %rax subq $4096, %rsp movq $0, (%rsp) cmpq $4096, %rax jg .L01 ALIGN_3 .L999: subq %rax, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/cabs.S000066400000000000000000000061311313527062700164170ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" PROLOGUE PROFCODE #ifdef DOUBLE movsd 0 * SIZE(ARG1), %xmm0 movsd 1 * SIZE(ARG1), %xmm1 pcmpeqb %xmm4, %xmm4 psrlq $1, %xmm4 andpd %xmm4, %xmm0 andpd %xmm4, %xmm1 addpd %xmm1, %xmm0 #else movss 0 * SIZE(ARG1), %xmm0 movss 1 * SIZE(ARG1), %xmm1 pcmpeqb %xmm4, %xmm4 psrld $1, %xmm4 andps %xmm4, %xmm0 andps %xmm4, %xmm1 addps %xmm1, %xmm0 #endif #if !defined(DOUBLE) && defined(NEED_F2CCONV) cvtss2sd %xmm0, %xmm0 #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/caxpy.c000066400000000000000000000076361313527062700166660ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #if defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "caxpy_microk_steamroller-2.c" #elif defined(BULLDOZER) #include "caxpy_microk_bulldozer-2.c" #elif defined(HASWELL) || defined(ZEN) #include "caxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "caxpy_microk_sandy-2.c" #endif #ifndef HAVE_KERNEL_8 static void caxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; BLASLONG register ix = 0; FLOAT da_r = alpha[0]; FLOAT da_i = alpha[1]; while(i < n) { #if !defined(CONJ) y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ; y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ; #else y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ; y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ; #endif ix+=4 ; i+=2 ; } } #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; BLASLONG ix=0,iy=0; FLOAT da[2]; if ( n <= 0 ) return(0); if ( (inc_x == 1) && (inc_y == 1) ) { BLASLONG n1 = n & -32; if ( n1 ) { da[0] = da_r; da[1] = da_i; caxpy_kernel_8(n1, x, y , da ); ix = 2 * n1; } i = n1; while(i < n) { #if !defined(CONJ) y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; #else y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; #endif i++ ; ix += 2; } return(0); } inc_x *=2; inc_y *=2; while(i < n) { #if !defined(CONJ) y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; #else y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; #endif ix += inc_x ; iy += inc_y ; i++ ; } return(0); } OpenBLAS-0.2.20/kernel/x86_64/caxpy_microk_bulldozer-2.c000066400000000000000000000171351313527062700224460ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_8 1 static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { #if !defined(CONJ) FLOAT mvec[4] = { -1.0, 1.0, -1.0, 1.0 }; #else FLOAT mvec[4] = { 1.0, -1.0, 1.0, -1.0 }; #endif BLASLONG register i = 0; if ( n < 640 ) { __asm__ __volatile__ ( "vzeroupper \n\t" "vbroadcastss (%4), %%xmm0 \n\t" // real part of alpha "vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha #if !defined(CONJ) "vmulps (%5), %%xmm1 , %%xmm1 \n\t" #else "vmulps (%5), %%xmm0 , %%xmm0 \n\t" #endif ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x ".align 2 \n\t" "vmovups 16(%2,%0,4), %%xmm7 \n\t" // 2 complex values from x "vmovups 32(%2,%0,4), %%xmm9 \n\t" // 2 complex values from x "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 complex values from x "vmovups 64(%2,%0,4), %%xmm12 \n\t" // 2 complex values from x "vmovups 80(%2,%0,4), %%xmm13 \n\t" // 2 complex values from x "vmovups 96(%2,%0,4), %%xmm14 \n\t" // 2 complex values from x "vmovups 112(%2,%0,4), %%xmm15 \n\t" // 2 complex values from x "vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part "vfmaddps (%3,%0,4), %%xmm0 , %%xmm5, %%xmm5 \n\t" ".align 2 \n\t" "vfmaddps 16(%3,%0,4), %%xmm0 , %%xmm7, %%xmm7 \n\t" "vfmaddps 32(%3,%0,4), %%xmm0 , %%xmm9, %%xmm9 \n\t" "vfmaddps 48(%3,%0,4), %%xmm0 , %%xmm11,%%xmm11 \n\t" "vfmaddps %%xmm5 , %%xmm1 , %%xmm4 , %%xmm5 \n\t" "vfmaddps %%xmm7 , %%xmm1 , %%xmm6 , %%xmm7 \n\t" "vfmaddps %%xmm9 , %%xmm1 , %%xmm8 , %%xmm9 \n\t" "vfmaddps %%xmm11, %%xmm1 , %%xmm10, %%xmm11 \n\t" "vpermilps $0xb1 , %%xmm12, %%xmm4 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm13, %%xmm6 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm14, %%xmm8 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm15, %%xmm10 \n\t" // exchange real and imag part "vfmaddps 64(%3,%0,4), %%xmm0 , %%xmm12, %%xmm12 \n\t" "vfmaddps 80(%3,%0,4), %%xmm0 , %%xmm13, %%xmm13 \n\t" "vfmaddps 96(%3,%0,4), %%xmm0 , %%xmm14, %%xmm14 
\n\t" "vfmaddps 112(%3,%0,4), %%xmm0 , %%xmm15, %%xmm15 \n\t" "vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" "vfmaddps %%xmm13, %%xmm1 , %%xmm6 , %%xmm13 \n\t" "vfmaddps %%xmm14, %%xmm1 , %%xmm8 , %%xmm14 \n\t" "vfmaddps %%xmm15, %%xmm1 , %%xmm10, %%xmm15 \n\t" "vmovups %%xmm5 , (%3,%0,4) \n\t" ".align 2 \n\t" "vmovups %%xmm7 , 16(%3,%0,4) \n\t" "vmovups %%xmm9 , 32(%3,%0,4) \n\t" "vmovups %%xmm11, 48(%3,%0,4) \n\t" "vmovups %%xmm12, 64(%3,%0,4) \n\t" "vmovups %%xmm13, 80(%3,%0,4) \n\t" "vmovups %%xmm14, 96(%3,%0,4) \n\t" "vmovups %%xmm15,112(%3,%0,4) \n\t" "addq $32, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 "r" (mvec) // 5 : "cc", "%xmm0", "%xmm1", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); return; } __asm__ __volatile__ ( "vzeroupper \n\t" "vbroadcastss (%4), %%xmm0 \n\t" // real part of alpha "vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha #if !defined(CONJ) "vmulps (%5), %%xmm1 , %%xmm1 \n\t" #else "vmulps (%5), %%xmm0 , %%xmm0 \n\t" #endif ".align 16 \n\t" "1: \n\t" "prefetcht0 512(%2,%0,4) \n\t" "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x ".align 2 \n\t" "vmovups 16(%2,%0,4), %%xmm7 \n\t" // 2 complex values from x "vmovups 32(%2,%0,4), %%xmm9 \n\t" // 2 complex values from x "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 complex values from x "vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part "prefetcht0 512(%3,%0,4) \n\t" "vfmaddps (%3,%0,4), %%xmm0 , %%xmm5, %%xmm5 \n\t" ".align 2 \n\t" "vfmaddps 16(%3,%0,4), %%xmm0 , %%xmm7, %%xmm7 \n\t" "vfmaddps 32(%3,%0,4), %%xmm0 , %%xmm9, %%xmm9 \n\t" "vfmaddps 48(%3,%0,4), %%xmm0 , %%xmm11,%%xmm11 \n\t" "vfmaddps %%xmm5 , %%xmm1 , %%xmm4 , %%xmm5 \n\t" "vfmaddps %%xmm7 , %%xmm1 , %%xmm6 , %%xmm7 \n\t" "vfmaddps %%xmm9 , %%xmm1 , %%xmm8 , %%xmm9 \n\t" "vfmaddps %%xmm11, %%xmm1 , %%xmm10, %%xmm11 \n\t" "vmovups %%xmm5 , (%3,%0,4) \n\t" ".align 2 \n\t" "vmovups %%xmm7 , 16(%3,%0,4) \n\t" "vmovups %%xmm9 , 32(%3,%0,4) \n\t" "vmovups %%xmm11, 48(%3,%0,4) \n\t" "addq $16, %0 \n\t" "subq $8, %1 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 "r" (mvec) // 5 : "cc", "%xmm0", "%xmm1", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/caxpy_microk_haswell-2.c000066400000000000000000000125631313527062700221030ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_8 1 static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { #if !defined(CONJ) FLOAT mvec[8] = { -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0 }; #else FLOAT mvec[8] = { 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0 }; #endif BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vbroadcastss (%4), %%ymm0 \n\t" // real part of alpha "vbroadcastss 4(%4), %%ymm1 \n\t" // imag part of alpha #if !defined(CONJ) "vmulps (%5), %%ymm1 , %%ymm1 \n\t" #else "vmulps (%5), %%ymm0 , %%ymm0 \n\t" #endif ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,4), %%ymm5 \n\t" // 4 complex values from x ".align 2 \n\t" "vmovups 32(%2,%0,4), %%ymm7 \n\t" // 4 complex values from x "vmovups 64(%2,%0,4), %%ymm9 \n\t" // 4 complex values from x "vmovups 96(%2,%0,4), %%ymm11 \n\t" // 4 complex values from x "vmovups 128(%2,%0,4), %%ymm12 \n\t" // 4 complex values from x "vmovups 160(%2,%0,4), %%ymm13 \n\t" // 4 complex values from x "vmovups 192(%2,%0,4), %%ymm14 \n\t" // 4 complex values from x "vmovups 224(%2,%0,4), %%ymm15 \n\t" // 4 complex values from x "vpermilps $0xb1 , %%ymm5 , %%ymm4 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%ymm7 , %%ymm6 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%ymm9 , %%ymm8 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part "vfmadd213ps (%3,%0,4), %%ymm0 , %%ymm5 \n\t" ".align 2 \n\t" "vfmadd213ps 32(%3,%0,4), %%ymm0 , %%ymm7 \n\t" "vfmadd213ps 64(%3,%0,4), %%ymm0 , %%ymm9 \n\t" "vfmadd213ps 96(%3,%0,4), %%ymm0 , %%ymm11 \n\t" "vfmadd231ps %%ymm1 , %%ymm4 , %%ymm5 \n\t" "vfmadd231ps %%ymm1 , %%ymm6 , %%ymm7 \n\t" "vfmadd231ps %%ymm1 , %%ymm8 , %%ymm9 \n\t" "vfmadd231ps %%ymm1 , %%ymm10, %%ymm11 \n\t" "vpermilps $0xb1 , %%ymm12, %%ymm4 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%ymm13, %%ymm6 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%ymm14, %%ymm8 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%ymm15, %%ymm10 \n\t" // exchange real and imag part "vfmadd213ps 128(%3,%0,4), %%ymm0 , %%ymm12 \n\t" "vfmadd213ps 160(%3,%0,4), %%ymm0 , %%ymm13 \n\t" "vfmadd213ps 192(%3,%0,4), %%ymm0 , %%ymm14 \n\t" "vfmadd213ps 224(%3,%0,4), %%ymm0 , %%ymm15 \n\t" "vfmadd231ps %%ymm1 , %%ymm4 , %%ymm12 \n\t" "vfmadd231ps %%ymm1 , %%ymm6 , %%ymm13 \n\t" "vfmadd231ps %%ymm1 , %%ymm8 , %%ymm14 \n\t" "vfmadd231ps %%ymm1 , %%ymm10, %%ymm15 \n\t" 
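/* all eight ymm accumulators now hold y + alpha*x; write the 32 updated single-precision complex results of this iteration back to y */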
"vmovups %%ymm5 , (%3,%0,4) \n\t" ".align 2 \n\t" "vmovups %%ymm7 , 32(%3,%0,4) \n\t" "vmovups %%ymm9 , 64(%3,%0,4) \n\t" "vmovups %%ymm11, 96(%3,%0,4) \n\t" "vmovups %%ymm12,128(%3,%0,4) \n\t" "vmovups %%ymm13,160(%3,%0,4) \n\t" "vmovups %%ymm14,192(%3,%0,4) \n\t" "vmovups %%ymm15,224(%3,%0,4) \n\t" "addq $64, %0 \n\t" "subq $32, %1 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 "r" (mvec) // 5 : "cc", "%xmm0", "%xmm1", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/caxpy_microk_sandy-2.c000066400000000000000000000111201313527062700215460ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_8 1 static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { #if !defined(CONJ) FLOAT mvec[8] = { -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0 }; #else FLOAT mvec[8] = { 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0 }; #endif BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vbroadcastss (%4), %%ymm0 \n\t" // real part of alpha "vbroadcastss 4(%4), %%ymm1 \n\t" // imag part of alpha #if !defined(CONJ) "vmulps (%5), %%ymm1 , %%ymm1 \n\t" #else "vmulps (%5), %%ymm0 , %%ymm0 \n\t" #endif ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,4), %%ymm5 \n\t" // 4 complex values from x ".align 2 \n\t" "vmovups 32(%2,%0,4), %%ymm7 \n\t" // 4 complex values from x "vmovups 64(%2,%0,4), %%ymm9 \n\t" // 4 complex values from x "vmovups 96(%2,%0,4), %%ymm11 \n\t" // 4 complex values from x "vpermilps $0xb1 , %%ymm5 , %%ymm4 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%ymm7 , %%ymm6 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%ymm9 , %%ymm8 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part "vmulps %%ymm5 , %%ymm0 , %%ymm5 \n\t" "vmulps %%ymm7 , %%ymm0 , %%ymm7 \n\t" "vmulps %%ymm9 , %%ymm0 , %%ymm9 \n\t" "vmulps %%ymm11, %%ymm0 , %%ymm11 \n\t" "vaddps (%3,%0,4), %%ymm5 , %%ymm5 \n\t" "vaddps 32(%3,%0,4), %%ymm7 , %%ymm7 \n\t" "vaddps 64(%3,%0,4), %%ymm9 , %%ymm9 \n\t" "vaddps 96(%3,%0,4), %%ymm11, %%ymm11 \n\t" "vmulps %%ymm4 , %%ymm1 , %%ymm4 \n\t" "vmulps %%ymm6 , %%ymm1 , %%ymm6 \n\t" "vmulps %%ymm8 , %%ymm1 , %%ymm8 \n\t" "vmulps %%ymm10, %%ymm1 , %%ymm10 \n\t" "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" "vaddps %%ymm6 , %%ymm7 , %%ymm7 \n\t" "vaddps %%ymm8 , %%ymm9 , %%ymm9 \n\t" "vaddps %%ymm10, %%ymm11, %%ymm11 \n\t" "vmovups %%ymm5 , (%3,%0,4) \n\t" ".align 2 \n\t" "vmovups %%ymm7 , 32(%3,%0,4) \n\t" "vmovups %%ymm9 , 64(%3,%0,4) \n\t" "vmovups %%ymm11, 96(%3,%0,4) \n\t" "addq $32, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 "r" (mvec) // 5 : "cc", "%xmm0", "%xmm1", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/caxpy_microk_steamroller-2.c000066400000000000000000000167771313527062700230100ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_8 1 static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { #if !defined(CONJ) FLOAT mvec[4] = { -1.0, 1.0, -1.0, 1.0 }; #else FLOAT mvec[4] = { 1.0, -1.0, 1.0, -1.0 }; #endif BLASLONG register i = 0; if ( n <= 2048 ) { __asm__ __volatile__ ( "vzeroupper \n\t" "vbroadcastss (%4), %%xmm0 \n\t" // real part of alpha "vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha #if !defined(CONJ) "vmulps (%5), %%xmm1 , %%xmm1 \n\t" #else "vmulps (%5), %%xmm0 , %%xmm0 \n\t" #endif ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x ".align 2 \n\t" "vmovups 16(%2,%0,4), %%xmm7 \n\t" // 2 complex values from x "vmovups 32(%2,%0,4), %%xmm9 \n\t" // 2 complex values from x "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 complex values from x "vmovups 64(%2,%0,4), %%xmm12 \n\t" // 2 complex values from x "vmovups 80(%2,%0,4), %%xmm13 \n\t" // 2 complex values from x "vmovups 96(%2,%0,4), %%xmm14 \n\t" // 2 complex values from x "vmovups 112(%2,%0,4), %%xmm15 \n\t" // 2 complex values from x "vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part "vfmadd213ps (%3,%0,4), %%xmm0 , %%xmm5 \n\t" ".align 2 \n\t" "vfmadd213ps 16(%3,%0,4), %%xmm0 , %%xmm7 \n\t" "vfmadd213ps 32(%3,%0,4), %%xmm0 , %%xmm9 \n\t" "vfmadd213ps 48(%3,%0,4), %%xmm0 , %%xmm11 \n\t" "vfmadd231ps %%xmm1 , %%xmm4 , %%xmm5 \n\t" "vfmadd231ps %%xmm1 , %%xmm6 , %%xmm7 \n\t" "vfmadd231ps %%xmm1 , %%xmm8 , %%xmm9 \n\t" "vfmadd231ps %%xmm1 , %%xmm10, %%xmm11 \n\t" "vpermilps $0xb1 , %%xmm12, %%xmm4 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm13, %%xmm6 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm14, %%xmm8 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm15, %%xmm10 \n\t" // exchange real and imag part "vfmadd213ps 64(%3,%0,4), %%xmm0 , %%xmm12 \n\t" "vfmadd213ps 80(%3,%0,4), %%xmm0 , %%xmm13 \n\t" "vfmadd213ps 96(%3,%0,4), %%xmm0 , %%xmm14 \n\t" "vfmadd213ps 112(%3,%0,4), %%xmm0 , %%xmm15 \n\t" "vfmadd231ps %%xmm1 , %%xmm4 , %%xmm12 \n\t" "vfmadd231ps %%xmm1 , %%xmm6 , %%xmm13 \n\t" "vfmadd231ps %%xmm1 , %%xmm8 , %%xmm14 \n\t" "vfmadd231ps %%xmm1 , %%xmm10, %%xmm15 \n\t" "vmovups %%xmm5 , (%3,%0,4) \n\t" ".align 2 \n\t" "vmovups %%xmm7 , 16(%3,%0,4) \n\t" "vmovups %%xmm9 , 32(%3,%0,4) \n\t" "vmovups %%xmm11, 48(%3,%0,4) \n\t" "vmovups %%xmm12, 64(%3,%0,4) \n\t" "vmovups %%xmm13, 
80(%3,%0,4) \n\t" "vmovups %%xmm14, 96(%3,%0,4) \n\t" "vmovups %%xmm15,112(%3,%0,4) \n\t" "addq $32, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 "r" (mvec) // 5 : "cc", "%xmm0", "%xmm1", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); return; } __asm__ __volatile__ ( "vzeroupper \n\t" "vbroadcastss (%4), %%xmm0 \n\t" // real part of alpha "vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha #if !defined(CONJ) "vmulps (%5), %%xmm1 , %%xmm1 \n\t" #else "vmulps (%5), %%xmm0 , %%xmm0 \n\t" #endif ".align 16 \n\t" "1: \n\t" "prefetcht0 512(%2,%0,4) \n\t" "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x ".align 2 \n\t" "vmovups 16(%2,%0,4), %%xmm7 \n\t" // 2 complex values from x "vmovups 32(%2,%0,4), %%xmm9 \n\t" // 2 complex values from x "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 complex values from x "vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part "prefetcht0 512(%3,%0,4) \n\t" "vfmadd213ps (%3,%0,4), %%xmm0 , %%xmm5 \n\t" ".align 2 \n\t" "vfmadd213ps 16(%3,%0,4), %%xmm0 , %%xmm7 \n\t" "vfmadd213ps 32(%3,%0,4), %%xmm0 , %%xmm9 \n\t" "vfmadd213ps 48(%3,%0,4), %%xmm0 , %%xmm11 \n\t" "vfmadd231ps %%xmm1 , %%xmm4 , %%xmm5 \n\t" "vfmadd231ps %%xmm1 , %%xmm6 , %%xmm7 \n\t" "vfmadd231ps %%xmm1 , %%xmm8 , %%xmm9 \n\t" "vfmadd231ps %%xmm1 , %%xmm10, %%xmm11 \n\t" "vmovups %%xmm5 , (%3,%0,4) \n\t" ".align 2 \n\t" "vmovups %%xmm7 , 16(%3,%0,4) \n\t" "vmovups %%xmm9 , 32(%3,%0,4) \n\t" "vmovups %%xmm11, 48(%3,%0,4) \n\t" "addq $16, %0 \n\t" "subq $8, %1 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 "r" (mvec) // 5 : "cc", "%xmm0", "%xmm1", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/cdot.c000066400000000000000000000106221313527062700164600ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include #if defined(BULLDOZER) #include "cdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "cdot_microk_steamroller-2.c" #elif defined(HASWELL) || defined(ZEN) #include "cdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "cdot_microk_sandy-2.c" #endif #ifndef HAVE_KERNEL_16 static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline)); static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { BLASLONG register i = 0; FLOAT dot[8] = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; BLASLONG j=0; while( i < n ) { dot[0] += x[j] * y[j] ; dot[1] += x[j+1] * y[j+1] ; dot[4] += x[j] * y[j+1] ; dot[5] += x[j+1] * y[j] ; dot[2] += x[j+2] * y[j+2] ; dot[3] += x[j+3] * y[j+3] ; dot[6] += x[j+2] * y[j+3] ; dot[7] += x[j+3] * y[j+2] ; dot[0] += x[j+4] * y[j+4] ; dot[1] += x[j+5] * y[j+5] ; dot[4] += x[j+4] * y[j+5] ; dot[5] += x[j+5] * y[j+4] ; dot[2] += x[j+6] * y[j+6] ; dot[3] += x[j+7] * y[j+7] ; dot[6] += x[j+6] * y[j+7] ; dot[7] += x[j+7] * y[j+6] ; j+=8; i+=4; } d[0] = dot[0]; d[1] = dot[1]; d[2] = dot[2]; d[3] = dot[3]; d[4] = dot[4]; d[5] = dot[5]; d[6] = dot[6]; d[7] = dot[7]; } #endif FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i; BLASLONG ix,iy; FLOAT _Complex result; FLOAT dot[8] = { 0.0, 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0, 0.0 } ; if ( n <= 0 ) { result = OPENBLAS_MAKE_COMPLEX_FLOAT (0.0, 0.0) ; return(result); } if ( (inc_x == 1) && (inc_y == 1) ) { BLASLONG n1 = n & -16; if ( n1 ) { cdot_kernel_16(n1, x, y , dot ); dot[0] += dot[2]; dot[1] += dot[3]; dot[4] += dot[6]; dot[5] += dot[7]; } i = n1; BLASLONG j = i * 2; while( i < n ) { dot[0] += x[j] * y[j] ; dot[1] += x[j+1] * y[j+1] ; dot[4] += x[j] * y[j+1] ; dot[5] += x[j+1] * y[j] ; j+=2; i++ ; } } else { i=0; ix=0; iy=0; inc_x <<= 1; inc_y <<= 1; while(i < n) { dot[0] += x[ix] * y[iy] ; dot[1] += x[ix+1] * y[iy+1] ; dot[4] += x[ix] * y[iy+1] ; dot[5] += x[ix+1] * y[iy] ; ix += inc_x ; iy += inc_y ; i++ ; } } #if !defined(CONJ) result = OPENBLAS_MAKE_COMPLEX_FLOAT (dot[0]-dot[1], dot[4]+dot[5]) ; // CREAL(result) = dot[0] - dot[1]; // CIMAG(result) = dot[4] + dot[5]; #else result = OPENBLAS_MAKE_COMPLEX_FLOAT (dot[0]+dot[1], dot[4]-dot[5]) ; // CREAL(result) = dot[0] + dot[1]; // CIMAG(result) = dot[4] - dot[5]; #endif return(result); } OpenBLAS-0.2.20/kernel/x86_64/cdot_microk_bulldozer-2.c000066400000000000000000000173551313527062700222570ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16 1 static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) { BLASLONG register i = 0; if ( n <=1024 ) { __asm__ __volatile__ ( "vzeroupper \n\t" "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x "vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y "vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x "vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y "vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y "vfmaddps %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i "vfmaddps %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" "vfmaddps %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i "vfmaddps %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" "vfmaddps %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t" "vfmaddps %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r "vfmaddps %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r "subq $8 , %1 \n\t" "vfmaddps %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r "jnz 1b \n\t" "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" "vaddps %%xmm2, %%xmm3, %%xmm2 \n\t" "vaddps %%xmm0, %%xmm2, %%xmm0 \n\t" "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" "vmovups %%xmm0, (%4) \n\t" "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", 
"%xmm15", "memory" ); return; } __asm__ __volatile__ ( "vzeroupper \n\t" "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" "1: \n\t" "prefetcht0 384(%2,%0,4) \n\t" "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x "prefetcht0 384(%3,%0,4) \n\t" "vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y "vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x "vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y "vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y "vfmaddps %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i "vfmaddps %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" "vfmaddps %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i "vfmaddps %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" "vfmaddps %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t" "vfmaddps %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r "vfmaddps %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r "subq $8 , %1 \n\t" "vfmaddps %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r "jnz 1b \n\t" "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" "vaddps %%xmm2, %%xmm3, %%xmm2 \n\t" "vaddps %%xmm0, %%xmm2, %%xmm0 \n\t" "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" "vmovups %%xmm0, (%4) \n\t" "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/cdot_microk_haswell-2.c000066400000000000000000000115041313527062700217020ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16 1 static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,4), %%ymm8 \n\t" // 2 * x "vmovups 32(%2,%0,4), %%ymm9 \n\t" // 2 * x "vmovups (%3,%0,4), %%ymm12 \n\t" // 2 * y "vmovups 32(%3,%0,4), %%ymm13 \n\t" // 2 * y "vmovups 64(%2,%0,4), %%ymm10 \n\t" // 2 * x "vmovups 96(%2,%0,4), %%ymm11 \n\t" // 2 * x "vmovups 64(%3,%0,4), %%ymm14 \n\t" // 2 * y "vmovups 96(%3,%0,4), %%ymm15 \n\t" // 2 * y "vfmadd231ps %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231ps %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231ps %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231ps %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" "vfmadd231ps %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r "addq $32 , %0 \n\t" "vfmadd231ps %%ymm9 , %%ymm13, %%ymm5 \n\t" // x_r * y_i, x_i * y_r "vfmadd231ps %%ymm10, %%ymm14, %%ymm6 \n\t" // x_r * y_i, x_i * y_r "subq $16 , %1 \n\t" "vfmadd231ps %%ymm11, %%ymm15, %%ymm7 \n\t" // x_r * y_i, x_i * y_r "jnz 1b \n\t" "vaddps %%ymm0, %%ymm1, %%ymm0 \n\t" "vaddps %%ymm2, %%ymm3, %%ymm2 \n\t" "vaddps %%ymm0, %%ymm2, %%ymm0 \n\t" "vaddps %%ymm4, %%ymm5, %%ymm4 \n\t" "vaddps %%ymm6, %%ymm7, %%ymm6 \n\t" "vaddps %%ymm4, %%ymm6, %%ymm4 \n\t" "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" "vmovups %%xmm0, (%4) \n\t" "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/cdot_microk_sandy-2.c000066400000000000000000000116121313527062700213610ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16 1 static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,4), %%ymm8 \n\t" // 2 * x "vmovups 32(%2,%0,4), %%ymm9 \n\t" // 2 * x "vmovups (%3,%0,4), %%ymm12 \n\t" // 2 * y "vmovups 32(%3,%0,4), %%ymm13 \n\t" // 2 * y "vmovups 64(%3,%0,4), %%ymm14 \n\t" // 2 * y "vmovups 96(%3,%0,4), %%ymm15 \n\t" // 2 * y "vmulps %%ymm8 , %%ymm12, %%ymm10 \n\t" "vmulps %%ymm9 , %%ymm13, %%ymm11 \n\t" "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" "vaddps %%ymm0 , %%ymm10, %%ymm0 \n\t" "vaddps %%ymm1 , %%ymm11, %%ymm1 \n\t" "vmulps %%ymm8 , %%ymm12, %%ymm10 \n\t" "vmulps %%ymm9 , %%ymm13, %%ymm11 \n\t" "vmovups 64(%2,%0,4), %%ymm8 \n\t" // 2 * x "vmovups 96(%2,%0,4), %%ymm9 \n\t" // 2 * x "vaddps %%ymm4 , %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5 , %%ymm11, %%ymm5 \n\t" "vmulps %%ymm8 , %%ymm14, %%ymm10 \n\t" "vmulps %%ymm9 , %%ymm15, %%ymm11 \n\t" "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" "vaddps %%ymm2 , %%ymm10, %%ymm2 \n\t" "vaddps %%ymm3 , %%ymm11, %%ymm3 \n\t" "vmulps %%ymm8 , %%ymm14, %%ymm10 \n\t" "vmulps %%ymm9 , %%ymm15, %%ymm11 \n\t" "addq $32 , %0 \n\t" "subq $16 , %1 \n\t" "vaddps %%ymm6 , %%ymm10, %%ymm6 \n\t" "vaddps %%ymm7 , %%ymm11, %%ymm7 \n\t" "jnz 1b \n\t" "vaddps %%ymm0, %%ymm1, %%ymm0 \n\t" "vaddps %%ymm2, %%ymm3, %%ymm2 \n\t" "vaddps %%ymm0, %%ymm2, %%ymm0 \n\t" "vaddps %%ymm4, %%ymm5, %%ymm4 \n\t" "vaddps %%ymm6, %%ymm7, %%ymm6 \n\t" "vaddps %%ymm4, %%ymm6, %%ymm4 \n\t" "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" "vmovups %%xmm0, (%4) \n\t" "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 
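/* clobber list: condition codes, the vector registers used as accumulators (ymm0-7) and scratch (ymm8-15), and memory */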
: "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/cdot_microk_steamroller-2.c000066400000000000000000000172351313527062700226030ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_16 1 static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) { BLASLONG register i = 0; if ( n < 1280 ) { __asm__ __volatile__ ( "vzeroupper \n\t" "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x "vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y "vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x "vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y "vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y "vfmadd231ps %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231ps %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" "vfmadd231ps %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231ps %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" "vfmadd231ps %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t" "vfmadd231ps %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r "vfmadd231ps %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r "subq $8 , %1 \n\t" "vfmadd231ps %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r "jnz 1b \n\t" "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" "vaddps %%xmm2, %%xmm3, %%xmm2 \n\t" "vaddps %%xmm0, %%xmm2, %%xmm0 \n\t" "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" "vmovups %%xmm0, (%4) \n\t" "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); return; } __asm__ __volatile__ ( "vzeroupper \n\t" "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" "1: \n\t" "prefetcht0 512(%2,%0,4) \n\t" "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x "prefetcht0 512(%3,%0,4) \n\t" "vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y "vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x "vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y "vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y "vfmadd231ps %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231ps %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" "vfmadd231ps %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231ps %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * 
y_i "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" "vfmadd231ps %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t" "vfmadd231ps %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r "vfmadd231ps %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r "subq $8 , %1 \n\t" "vfmadd231ps %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r "jnz 1b \n\t" "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" "vaddps %%xmm2, %%xmm3, %%xmm2 \n\t" "vaddps %%xmm0, %%xmm2, %%xmm0 \n\t" "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" "vmovups %%xmm0, (%4) \n\t" "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S000066400000000000000000001474751313527062700227370ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define J %r14 #define OLD_K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define K %r12 #define BI %rbp #define SP %rbx #define BO1 %rdi #define BO2 %r15 #ifndef WINDOWS_ABI #define STACKSIZE 96 #else #define STACKSIZE 320 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define L_BUFFER_SIZE 8192 #define Ndiv6 24(%rsp) #define Nmod6 32(%rsp) #define N 40(%rsp) #define ALPHA_R 48(%rsp) #define ALPHA_I 56(%rsp) #define OFFSET 64(%rsp) #define KK 72(%rsp) #define KKK 80(%rsp) #define BUFFER1 128(%rsp) #define BUFFER2 LB2_OFFSET+128(%rsp) #if defined(OS_WINDOWS) #if L_BUFFER_SIZE > 16384 #define STACK_TOUCH \ movl $0, 4096 * 4(%rsp);\ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 12288 #define STACK_TOUCH \ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 8192 #define STACK_TOUCH \ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 4096 #define STACK_TOUCH \ movl $0, 4096 * 1(%rsp); #else #define STACK_TOUCH #endif #else #define STACK_TOUCH #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define VFMADD_R vfmaddps #define VFMADD_I vfmaddps #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define VFMADD_R vfnmaddps #define VFMADD_I vfmaddps #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define VFMADD_R vfmaddps #define VFMADD_I vfnmaddps #else #define VFMADD_R vfnmaddps #define VFMADD_I vfnmaddps #endif #define A_PR1 384 #define B_PR1 192 #define KERNEL4x2_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ #define KERNEL4x2_2(xx) \ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ #define KERNEL4x2_3(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ 
VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ #define KERNEL4x2_4(xx) \ vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ addq $16, BI ;\ addq $32, %rax ;\ #define KERNEL4x2_SUB(xx) \ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ addq $4, BI ;\ addq $8, %rax ;\ /************************************************************************************************/ #define KERNEL2x2_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ #define KERNEL2x2_2(xx) \ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ #define KERNEL2x2_3(xx) \ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ #define KERNEL2x2_4(xx) \ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ addq $16, BI ;\ addq $16, %rax ;\ #define KERNEL2x2_SUB(xx) \ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss -7 * 
SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ addq $4, BI ;\ addq $4, %rax ;\ /************************************************************************************************/ #define KERNEL1x2_1(xx) \ vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ #define KERNEL1x2_2(xx) \ vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ #define KERNEL1x2_3(xx) \ vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ #define KERNEL1x2_4(xx) \ vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ addq $16, BI ;\ addq $8, %rax ;\ #define KERNEL1x2_SUB(xx) \ vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ addq $4, BI ;\ addq $2, %rax ;\ /************************************************************************************************/ #define KERNEL4x1_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ #define KERNEL4x1_2(xx) \ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ #define KERNEL4x1_3(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups 4 * SIZE(AO, %rax, 
SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ #define KERNEL4x1_4(xx) \ vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ addq $8, BI ;\ addq $32, %rax ;\ #define KERNEL4x1_SUB(xx) \ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ addq $2, BI ;\ addq $8, %rax ;\ /************************************************************************************************/ #define KERNEL2x1_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ #define KERNEL2x1_2(xx) \ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ #define KERNEL2x1_3(xx) \ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ #define KERNEL2x1_4(xx) \ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ addq $8, BI ;\ addq $16, %rax ;\ #define KERNEL2x1_SUB(xx) \ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ addq $2, BI ;\ addq $4, %rax ;\ /************************************************************************************************/ #define KERNEL1x1_1(xx) \ vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ #define KERNEL1x1_2(xx) \ vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ #define KERNEL1x1_3(xx) \ vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ #define KERNEL1x1_4(xx) \ vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ addq $8, BI ;\ addq $8, %rax ;\ #define KERNEL1x1_SUB(xx) \ vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 
;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ addq $2, BI ;\ addq $2, %rax ;\ /************************************************************************************************/ PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) vmovups %xmm6, 64(%rsp) vmovups %xmm7, 80(%rsp) vmovups %xmm8, 96(%rsp) vmovups %xmm9, 112(%rsp) vmovups %xmm10, 128(%rsp) vmovups %xmm11, 144(%rsp) vmovups %xmm12, 160(%rsp) vmovups %xmm13, 176(%rsp) vmovups %xmm14, 192(%rsp) vmovups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL vmovsd OLD_OFFSET, %xmm12 #endif vmovaps %xmm3, %xmm0 vmovsd OLD_ALPHA_I, %xmm1 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL vmovsd STACKSIZE + 16(%rsp), %xmm12 #endif #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $0, OLD_M je .L999 cmpq $0, OLD_N je .L999 cmpq $0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovss %xmm0, ALPHA_R vmovss %xmm1, ALPHA_I salq $ZBASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $2, %rdi divq %rdi // N / 2 movq %rax, Ndiv6 // N / 2 movq %rdx, Nmod6 // N % 2 #ifdef TRMMKERNEL vmovsd %xmm12, OFFSET vmovsd %xmm12, KK #ifndef LEFT negq KK #endif #endif .L2_0: movq Ndiv6, J cmpq $0, J je .L1_0 ALIGN_4 .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L2_02b: vmovups (BO1), %xmm0 vmovups %xmm0, (BO) addq $4*SIZE,BO1 addq $4*SIZE,BO decq %rax jnz .L2_02b .L2_02c: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $2, I // i = (m >> 2) je .L2_20 ALIGN_4 .L2_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L2_16 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_12: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_16 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_16 jmp .L2_12 ALIGN_4 .L2_16: #ifndef TRMMKERNEL 
movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_19 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_17: KERNEL4x2_SUB(xxx) jl .L2_17 ALIGN_4 .L2_19: vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 vshufps $0xb1, %xmm11, %xmm11, %xmm11 vshufps $0xb1, %xmm13, %xmm13, %xmm13 vshufps $0xb1, %xmm15, %xmm15, %xmm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 vaddsubps %xmm13,%xmm12, %xmm12 vaddsubps %xmm15,%xmm14, %xmm14 vshufps $0xb1, %xmm8 , %xmm8, %xmm9 vshufps $0xb1, %xmm10, %xmm10, %xmm11 vshufps $0xb1, %xmm12, %xmm12, %xmm13 vshufps $0xb1, %xmm14, %xmm14, %xmm15 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vaddsubps %xmm10, %xmm11,%xmm11 vaddsubps %xmm12, %xmm13,%xmm13 vaddsubps %xmm14, %xmm15,%xmm15 vmovaps %xmm9, %xmm8 vmovaps %xmm11, %xmm10 vmovaps %xmm13, %xmm12 vmovaps %xmm15, %xmm14 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 vshufps $0xb1, %xmm11, %xmm11, %xmm11 vshufps $0xb1, %xmm13, %xmm13, %xmm13 vshufps $0xb1, %xmm15, %xmm15, %xmm15 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 vmulps %xmm10, %xmm0, %xmm10 vmulps %xmm12, %xmm0, %xmm12 vmulps %xmm14, %xmm0, %xmm14 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vmulps %xmm11, %xmm1, %xmm11 vmulps %xmm13, %xmm1, %xmm13 vmulps %xmm15, %xmm1, %xmm15 vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 vaddsubps %xmm13,%xmm12, %xmm12 vaddsubps %xmm15,%xmm14, %xmm14 #ifndef TRMMKERNEL vaddps (CO1), %xmm8 , %xmm8 vaddps 4 * SIZE(CO1), %xmm12, %xmm12 vaddps (CO1, LDC), %xmm10, %xmm10 vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 #endif vmovups %xmm8 , (CO1) vmovups %xmm12 , 4 * SIZE(CO1) vmovups %xmm10 , (CO1, LDC) vmovups %xmm14 , 4 * SIZE(CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_20: testq $3, M jz .L2_60 // to next 2 lines of N testq $2, M jz .L2_40 ALIGN_4 .L2_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $2, %rax // number of 
values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L2_26 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_22: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_26 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_26 jmp .L2_22 ALIGN_4 .L2_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_29 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_27: KERNEL2x2_SUB(xxx) jl .L2_27 ALIGN_4 .L2_29: vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 vshufps $0xb1, %xmm11, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 vshufps $0xb1, %xmm8 , %xmm8, %xmm9 vshufps $0xb1, %xmm10, %xmm10, %xmm11 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vaddsubps %xmm10, %xmm11,%xmm11 vmovaps %xmm9, %xmm8 vmovaps %xmm11, %xmm10 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 vshufps $0xb1, %xmm11, %xmm11, %xmm11 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 vmulps %xmm10, %xmm0, %xmm10 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vmulps %xmm11, %xmm1, %xmm11 vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 #ifndef TRMMKERNEL vaddps (CO1), %xmm8 , %xmm8 vaddps (CO1, LDC), %xmm10, %xmm10 #endif vmovups %xmm8 , (CO1) vmovups %xmm10 , (CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 /**************************************************************************/ .L2_40: testq $1, M jz .L2_60 // to next 2 lines of N ALIGN_4 .L2_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L2_46 movq %rax, BI // 
Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_42: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 jmp .L2_42 ALIGN_4 .L2_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_49 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_47: KERNEL1x2_SUB(xxx) jl .L2_47 ALIGN_4 .L2_49: vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 vshufps $0xb1, %xmm11, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 vshufps $0xb1, %xmm8 , %xmm8, %xmm9 vshufps $0xb1, %xmm10, %xmm10, %xmm11 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vaddsubps %xmm10, %xmm11,%xmm11 vmovaps %xmm9, %xmm8 vmovaps %xmm11, %xmm10 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 vshufps $0xb1, %xmm11, %xmm11, %xmm11 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 vmulps %xmm10, %xmm0, %xmm10 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vmulps %xmm11, %xmm1, %xmm11 vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 #ifndef TRMMKERNEL vmovsd (CO1), %xmm14 vaddps %xmm14, %xmm8 , %xmm8 vmovsd (CO1, LDC), %xmm15 vaddps %xmm15, %xmm10, %xmm10 #endif vmovsd %xmm8 , (CO1) vmovsd %xmm10 , (CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L2_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif decq J // j -- jg .L2_01 // next 2 lines of N .L1_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovsd (BO1), %xmm0 vmovsd %xmm0, (BO) addq $2*SIZE,BO1 addq $2*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $2, I // i = (m >> 2) je .L1_20 ALIGN_4 .L1_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first 
buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L1_16 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_12: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_16 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_16 jmp .L1_12 ALIGN_4 .L1_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_19 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 4 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_17: KERNEL4x1_SUB(xxx) jl .L1_17 ALIGN_4 .L1_19: vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 vshufps $0xb1, %xmm13, %xmm13, %xmm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm13,%xmm12, %xmm12 vshufps $0xb1, %xmm8 , %xmm8, %xmm9 vshufps $0xb1, %xmm12, %xmm12, %xmm13 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vaddsubps %xmm12, %xmm13,%xmm13 vmovaps %xmm9, %xmm8 vmovaps %xmm13, %xmm12 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 vshufps $0xb1, %xmm13, %xmm13, %xmm13 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 vmulps %xmm12, %xmm0, %xmm12 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vmulps %xmm13, %xmm1, %xmm13 vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm13,%xmm12, %xmm12 #ifndef TRMMKERNEL vaddps (CO1), %xmm8 , %xmm8 vaddps 4 * SIZE(CO1), %xmm12, %xmm12 #endif vmovups %xmm8 , (CO1) vmovups %xmm12 , 4 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L1_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_20: testq $3, M jz .L999 testq $2, M jz .L1_40 ALIGN_4 .L1_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax 
leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L1_26 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_22: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_26 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_26 jmp .L1_22 ALIGN_4 .L1_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_29 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_27: KERNEL2x1_SUB(xxx) jl .L1_27 ALIGN_4 .L1_29: vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vshufps $0xb1, %xmm8 , %xmm8, %xmm9 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vmovaps %xmm9, %xmm8 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vaddsubps %xmm9, %xmm8 , %xmm8 #ifndef TRMMKERNEL vaddps (CO1), %xmm8 , %xmm8 #endif vmovups %xmm8 , (CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 /**************************************************************************/ .L1_40: testq $1, M jz .L999 // to next 2 lines of N ALIGN_4 .L1_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $1, %rax // number of values 
in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L1_46 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_42: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 jmp .L1_42 ALIGN_4 .L1_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_49 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_47: KERNEL1x1_SUB(xxx) jl .L1_47 ALIGN_4 .L1_49: vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vshufps $0xb1, %xmm8 , %xmm8, %xmm9 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vmovaps %xmm9, %xmm8 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vaddsubps %xmm9, %xmm8 , %xmm8 #ifndef TRMMKERNEL vmovsd (CO1), %xmm14 vaddps %xmm14, %xmm8 , %xmm8 #endif vmovsd %xmm8 , (CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L999: vzeroupper movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi vmovups 64(%rsp), %xmm6 vmovups 80(%rsp), %xmm7 vmovups 96(%rsp), %xmm8 vmovups 112(%rsp), %xmm9 vmovups 128(%rsp), %xmm10 vmovups 144(%rsp), %xmm11 vmovups 160(%rsp), %xmm12 vmovups 176(%rsp), %xmm13 vmovups 192(%rsp), %xmm14 vmovups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/cgemm_kernel_4x2_piledriver.S000066400000000000000000001505071313527062700230700ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /********************************************************************* * * 2014/06/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * * 2013/10/31 Saar * * Parameter: * UNROLL_M 4 * UNROLL_N 2 * CGEMM_P 768 * CGEMM_Q 168 * A_PR1 512 * B_PR1 256 * * Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): * * 4608x4608 154.0 GFLOPS with 8 threads on 4 modules (ACML: 111.7 ) (BULLDOZER: 153.9 ) * 4608x4608 148.3 GFLOPS with 4 threads on 4 modules (ACML: 96.0 ) (BULLDOZER: 143.2 ) * 3456x3456 74.3 GFLOPS with 2 threads on 2 modules (ACML: 47.3 ) (BULLDOZER: 72.3 ) * 3456x3456 37.3 GFLOPS with 1 threads on 1 modules (ACML: 24.2 ) (BULLDOZER: 36.5 ) * * Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): * * 6912x6912 421.5 GFLOPS with 32 threads on 16 modules (ACML: 266.6 ) (BULLDOZER: 422.5 ) * 6912x6912 407.0 GFLOPS with 16 threads on 16 modules (ACML: 271.5 ) (BULLDOZER: 404.7 ) * 6912x6912 234.2 GFLOPS with 8 threads on 8 modules (ACML: 164.0 ) (BULLDOZER: 230.5 ) * 4608x4608 123.1 GFLOPS with 4 threads on 4 modules (ACML: 87.9 ) (BULLDOZER: 120.9 ) * 3456x3456 62.6 GFLOPS with 2 threads on 2 modules (ACML: 44.5 ) (BULLDOZER: 62.1 ) * 3456x3456 31.8 GFLOPS with 1 threads on 1 modules (ACML: 22.6 ) (BULLDOZER: 31.4 ) * *********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define J %r14 #define OLD_K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define K %r12 #define BI %rbp #define SP %rbx #define BO1 %rdi #define BO2 %r15 #ifndef WINDOWS_ABI #define STACKSIZE 96 #else #define STACKSIZE 320 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define L_BUFFER_SIZE 256*8*4 #define Ndiv6 24(%rsp) #define Nmod6 32(%rsp) #define N 40(%rsp) #define ALPHA_R 48(%rsp) #define ALPHA_I 56(%rsp) #define OFFSET 64(%rsp) #define KK 72(%rsp) #define KKK 80(%rsp) #define BUFFER1 128(%rsp) #if defined(OS_WINDOWS) #if L_BUFFER_SIZE > 16384 #define STACK_TOUCH \ movl $0, 4096 * 4(%rsp);\ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 12288 #define STACK_TOUCH \ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 8192 #define STACK_TOUCH \ 
movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 4096 #define STACK_TOUCH \ movl $0, 4096 * 1(%rsp); #else #define STACK_TOUCH #endif #else #define STACK_TOUCH #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define VFMADD_R vfmaddps #define VFMADD_I vfmaddps #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define VFMADD_R vfnmaddps #define VFMADD_I vfmaddps #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define VFMADD_R vfmaddps #define VFMADD_I vfnmaddps #else #define VFMADD_R vfnmaddps #define VFMADD_I vfnmaddps #endif #define A_PR1 512 #define B_PR1 256 #define KERNEL4x2_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ #define KERNEL4x2_2(xx) \ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ #define KERNEL4x2_3(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ #define KERNEL4x2_4(xx) \ vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ addq $16, BI ;\ addq $32, %rax ;\ #define KERNEL4x2_SUB(xx) \ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ 
VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ addq $4, BI ;\ addq $8, %rax ;\ /************************************************************************************************/ #define KERNEL2x2_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ #define KERNEL2x2_2(xx) \ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ #define KERNEL2x2_3(xx) \ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ #define KERNEL2x2_4(xx) \ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ addq $16, BI ;\ addq $16, %rax ;\ #define KERNEL2x2_SUB(xx) \ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ addq $4, BI ;\ addq $4, %rax ;\ /************************************************************************************************/ #define KERNEL1x2_1(xx) \ vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ #define KERNEL1x2_2(xx) \ vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ #define KERNEL1x2_3(xx) \ vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), 
%xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ #define KERNEL1x2_4(xx) \ vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ addq $16, BI ;\ addq $8, %rax ;\ #define KERNEL1x2_SUB(xx) \ vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ addq $4, BI ;\ addq $2, %rax ;\ /************************************************************************************************/ #define KERNEL4x1_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ #define KERNEL4x1_2(xx) \ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ #define KERNEL4x1_3(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ #define KERNEL4x1_4(xx) \ vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ addq $8, BI ;\ addq $32, %rax ;\ #define KERNEL4x1_SUB(xx) \ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ addq $2, BI ;\ addq $8, %rax ;\ /************************************************************************************************/ #define KERNEL2x1_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ 
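/************************************************************************************************
* Note on the KERNELMxN_s macros defined above and below (descriptive summary of this code):
* each macro performs one k-step of the complex rank-1 update for an M x N tile of C.
* The A elements are loaded with vmovups (M = 4, 2) or vmovsd (M = 1), the real and imaginary
* parts of the B elements are broadcast with vbroadcastss, and VFMADD_R / VFMADD_I
* (vfmaddps or vfnmaddps, selected above according to the conjugation case) accumulate the
* real and imaginary partial products into %xmm8 .. %xmm15 (smaller tiles use a subset).
* The _1 .. _4 variants form the four-way unrolled loop body, with _4 advancing BI and %rax
* past all four steps; the _SUB variants advance by a single step and serve the k-remainder
* loops.
************************************************************************************************/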
#define KERNEL2x1_2(xx) \ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ #define KERNEL2x1_3(xx) \ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ #define KERNEL2x1_4(xx) \ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ addq $8, BI ;\ addq $16, %rax ;\ #define KERNEL2x1_SUB(xx) \ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ addq $2, BI ;\ addq $4, %rax ;\ /************************************************************************************************/ #define KERNEL1x1_1(xx) \ vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ #define KERNEL1x1_2(xx) \ vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ #define KERNEL1x1_3(xx) \ vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ #define KERNEL1x1_4(xx) \ vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ addq $8, BI ;\ addq $8, %rax ;\ #define KERNEL1x1_SUB(xx) \ vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ addq $2, BI ;\ addq $2, %rax ;\ /************************************************************************************************/ PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) vmovups %xmm6, 64(%rsp) vmovups %xmm7, 80(%rsp) vmovups %xmm8, 96(%rsp) vmovups %xmm9, 112(%rsp) vmovups %xmm10, 128(%rsp) vmovups %xmm11, 144(%rsp) vmovups %xmm12, 160(%rsp) vmovups %xmm13, 176(%rsp) vmovups %xmm14, 192(%rsp) vmovups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif vmovaps %xmm3, %xmm0 vmovsd OLD_ALPHA_I, %xmm1 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL movsd STACKSIZE + 16(%rsp), %xmm12 #endif #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $0, OLD_M je .L999 cmpq $0, OLD_N je .L999 cmpq $0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovss %xmm0, ALPHA_R vmovss %xmm1, ALPHA_I salq $ZBASE_SHIFT, LDC movq N, %rax xorq %rdx, 
%rdx movq $2, %rdi divq %rdi // N / 2 movq %rax, Ndiv6 // N / 2 movq %rdx, Nmod6 // N % 2 #ifdef TRMMKERNEL vmovsd %xmm12, OFFSET vmovsd %xmm12, KK #ifndef LEFT negq KK #endif #endif .L2_0: movq Ndiv6, J cmpq $0, J je .L1_0 ALIGN_4 .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L2_02b: vmovups (BO1), %xmm0 vmovups %xmm0, (BO) addq $4*SIZE,BO1 addq $4*SIZE,BO decq %rax jnz .L2_02b .L2_02c: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $2, I // i = (m >> 2) je .L2_20 ALIGN_4 .L2_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L2_16 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_12: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_16 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_16 jmp .L2_12 ALIGN_4 .L2_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_19 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_17: KERNEL4x2_SUB(xxx) jl .L2_17 ALIGN_4 .L2_19: vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 vshufps $0xb1, %xmm11, %xmm11, %xmm11 vshufps $0xb1, %xmm13, %xmm13, %xmm13 vshufps $0xb1, %xmm15, %xmm15, %xmm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 vaddsubps %xmm13,%xmm12, %xmm12 vaddsubps %xmm15,%xmm14, %xmm14 vshufps $0xb1, %xmm8 , %xmm8, %xmm9 vshufps $0xb1, %xmm10, %xmm10, %xmm11 vshufps $0xb1, %xmm12, %xmm12, %xmm13 vshufps $0xb1, %xmm14, %xmm14, %xmm15 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vaddsubps %xmm10, %xmm11,%xmm11 vaddsubps %xmm12, %xmm13,%xmm13 vaddsubps %xmm14, %xmm15,%xmm15 vmovaps %xmm9, %xmm8 vmovaps %xmm11, %xmm10 vmovaps %xmm13, %xmm12 vmovaps %xmm15, %xmm14 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 vshufps $0xb1, %xmm11, %xmm11, %xmm11 vshufps $0xb1, %xmm13, 
%xmm13, %xmm13 vshufps $0xb1, %xmm15, %xmm15, %xmm15 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 vmulps %xmm10, %xmm0, %xmm10 vmulps %xmm12, %xmm0, %xmm12 vmulps %xmm14, %xmm0, %xmm14 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vmulps %xmm11, %xmm1, %xmm11 vmulps %xmm13, %xmm1, %xmm13 vmulps %xmm15, %xmm1, %xmm15 vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 vaddsubps %xmm13,%xmm12, %xmm12 vaddsubps %xmm15,%xmm14, %xmm14 #ifndef TRMMKERNEL vaddps (CO1), %xmm8 , %xmm8 vaddps 4 * SIZE(CO1), %xmm12, %xmm12 vaddps (CO1, LDC), %xmm10, %xmm10 vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 #endif vmovups %xmm8 , (CO1) vmovups %xmm12 , 4 * SIZE(CO1) vmovups %xmm10 , (CO1, LDC) vmovups %xmm14 , 4 * SIZE(CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_20: testq $3, M jz .L2_60 // to next 2 lines of N testq $2, M jz .L2_40 ALIGN_4 .L2_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L2_26 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_22: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_26 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_26 jmp .L2_22 ALIGN_4 .L2_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_29 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_27: KERNEL2x2_SUB(xxx) jl .L2_27 ALIGN_4 .L2_29: vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 vshufps $0xb1, %xmm11, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 
defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 vshufps $0xb1, %xmm8 , %xmm8, %xmm9 vshufps $0xb1, %xmm10, %xmm10, %xmm11 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vaddsubps %xmm10, %xmm11,%xmm11 vmovaps %xmm9, %xmm8 vmovaps %xmm11, %xmm10 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 vshufps $0xb1, %xmm11, %xmm11, %xmm11 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 vmulps %xmm10, %xmm0, %xmm10 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vmulps %xmm11, %xmm1, %xmm11 vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 #ifndef TRMMKERNEL vaddps (CO1), %xmm8 , %xmm8 vaddps (CO1, LDC), %xmm10, %xmm10 #endif vmovups %xmm8 , (CO1) vmovups %xmm10 , (CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 /**************************************************************************/ .L2_40: testq $1, M jz .L2_60 // to next 2 lines of N ALIGN_4 .L2_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L2_46 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_42: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 jmp .L2_42 ALIGN_4 .L2_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_49 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_47: KERNEL1x2_SUB(xxx) jl .L2_47 ALIGN_4 .L2_49: vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 vshufps $0xb1, %xmm11, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps 
%xmm11,%xmm10, %xmm10 vshufps $0xb1, %xmm8 , %xmm8, %xmm9 vshufps $0xb1, %xmm10, %xmm10, %xmm11 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vaddsubps %xmm10, %xmm11,%xmm11 vmovaps %xmm9, %xmm8 vmovaps %xmm11, %xmm10 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 vshufps $0xb1, %xmm11, %xmm11, %xmm11 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 vmulps %xmm10, %xmm0, %xmm10 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vmulps %xmm11, %xmm1, %xmm11 vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 #ifndef TRMMKERNEL vmovsd (CO1), %xmm14 vaddps %xmm14, %xmm8 , %xmm8 vmovsd (CO1, LDC), %xmm15 vaddps %xmm15, %xmm10, %xmm10 #endif vmovsd %xmm8 , (CO1) vmovsd %xmm10 , (CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L2_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif decq J // j -- jg .L2_01 // next 2 lines of N .L1_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovsd (BO1), %xmm0 vmovsd %xmm0, (BO) addq $2*SIZE,BO1 addq $2*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $2, I // i = (m >> 2) je .L1_20 ALIGN_4 .L1_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L1_16 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_12: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_16 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_16 jmp .L1_12 ALIGN_4 .L1_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, 
%rax # if (k & 1) je .L1_19 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 4 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_17: KERNEL4x1_SUB(xxx) jl .L1_17 ALIGN_4 .L1_19: vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 vshufps $0xb1, %xmm13, %xmm13, %xmm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm13,%xmm12, %xmm12 vshufps $0xb1, %xmm8 , %xmm8, %xmm9 vshufps $0xb1, %xmm12, %xmm12, %xmm13 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vaddsubps %xmm12, %xmm13,%xmm13 vmovaps %xmm9, %xmm8 vmovaps %xmm13, %xmm12 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 vshufps $0xb1, %xmm13, %xmm13, %xmm13 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 vmulps %xmm12, %xmm0, %xmm12 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vmulps %xmm13, %xmm1, %xmm13 vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm13,%xmm12, %xmm12 #ifndef TRMMKERNEL vaddps (CO1), %xmm8 , %xmm8 vaddps 4 * SIZE(CO1), %xmm12, %xmm12 #endif vmovups %xmm8 , (CO1) vmovups %xmm12 , 4 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L1_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_20: testq $3, M jz .L999 testq $2, M jz .L1_40 ALIGN_4 .L1_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L1_26 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_22: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_26 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_26 jmp .L1_22 ALIGN_4 .L1_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_29 movq %rax, BI // Index 
for BO leaq ( ,BI,2), BI // BI = BI * 2; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_27: KERNEL2x1_SUB(xxx) jl .L1_27 ALIGN_4 .L1_29: vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vshufps $0xb1, %xmm8 , %xmm8, %xmm9 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vmovaps %xmm9, %xmm8 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vaddsubps %xmm9, %xmm8 , %xmm8 #ifndef TRMMKERNEL vaddps (CO1), %xmm8 , %xmm8 #endif vmovups %xmm8 , (CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 /**************************************************************************/ .L1_40: testq $1, M jz .L999 // to next 2 lines of N ALIGN_4 .L1_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L1_46 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_42: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 jmp .L1_42 ALIGN_4 .L1_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_49 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_47: KERNEL1x1_SUB(xxx) jl .L1_47 ALIGN_4 .L1_49: vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vshufps 
$0xb1, %xmm8 , %xmm8, %xmm9 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vmovaps %xmm9, %xmm8 // swap high and low 64 bytes vshufps $0xb1, %xmm9 , %xmm9, %xmm9 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vaddsubps %xmm9, %xmm8 , %xmm8 #ifndef TRMMKERNEL vmovsd (CO1), %xmm14 vaddps %xmm14, %xmm8 , %xmm8 #endif vmovsd %xmm8 , (CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L999: vzeroupper movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi vmovups 64(%rsp), %xmm6 vmovups 80(%rsp), %xmm7 vmovups 96(%rsp), %xmm8 vmovups 112(%rsp), %xmm9 vmovups 128(%rsp), %xmm10 vmovups 144(%rsp), %xmm11 vmovups 160(%rsp), %xmm12 vmovups 176(%rsp), %xmm13 vmovups 192(%rsp), %xmm14 vmovups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/cgemm_kernel_4x8_sandy.S000066400000000000000000003276601313527062700220550ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ #define ASSEMBLER #include "common.h" #define old_bm %rdi #define old_bn %rsi #define old_bk %rdx #define bm %r13 #define bn %r14 #define bk %r15 #define ALPHA %xmm0 #define ba %rcx #define bb %r8 #define C %r9 #define ldc %r10 #define i %r11 #define k %rax #define ptrba %rdi #define ptrbb %rsi #define C0 %rbx #define C1 %rbp #define prebb %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define old_ldc 8+STACKSIZE(%rsp) #define old_offset 16+STACKSIZE(%rsp) #define MEMALPHA_R 48(%rsp) #define MEMALPHA_I 56(%rsp) #define j 64(%rsp) #define OFFSET 72(%rsp) #define kk 80(%rsp) #define kkk 88(%rsp) #else #define STACKSIZE 512 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define old_ldc 72 + STACKSIZE(%rsp) #define old_offset 80 + STACKSIZE(%rsp) #define MEMALPHA_R 224(%rsp) #define MEMALPHA_I 232(%rsp) #define j 240(%rsp) #define OFFSET 248(%rsp) #define kk 256(%rsp) #define kkk 264(%rsp) #endif #define PREFETCH0 prefetcht0 #define PREFETCH1 prefetcht0 #define PREFETCH2 prefetcht0 #define PRESIZE 64 #define xvec0 %xmm0 #define xvec1 %xmm1 #define xvec2 %xmm2 #define xvec3 %xmm3 #define xvec4 %xmm4 #define xvec5 %xmm5 #define xvec6 %xmm6 #define xvec7 %xmm7 #define xvec8 %xmm8 #define xvec9 %xmm9 #define xvec10 %xmm10 #define xvec11 %xmm11 #define xvec12 %xmm12 #define xvec13 %xmm13 #define xvec14 %xmm14 #define xvec15 %xmm15 #define yvec0 %ymm0 #define yvec1 %ymm1 #define yvec2 %ymm2 #define yvec3 %ymm3 #define yvec4 %ymm4 #define yvec5 %ymm5 #define yvec6 %ymm6 #define yvec7 %ymm7 #define yvec8 %ymm8 #define yvec9 %ymm9 #define yvec10 %ymm10 #define yvec11 %ymm11 #define yvec12 %ymm12 #define yvec13 %ymm13 #define yvec14 %ymm14 #define yvec15 %ymm15 #define LEAQ leaq #define ADDQ addq #define MULQ imulq #define SARQ sarq #define SALQ salq #define ANDQ andq #define SUBQ subq #define DECQ decq #define JG jg #define JLE jle #define TEST testq #define OR orq #define JNE jne #define JMP jmp #define NOP #define XOR xorpd #undef MOVQ #define MOVQ movq #define XOR_SY vxorps #define XOR_SX vxorps #define LD_SY vmovaps #define LD_SX vmovaps #define LDL_SX vmovlps #define LDL_SY vmovlps #define LDH_SX vmovhps #define LDH_SY vmovhps #define ST_SY vmovaps #define ST_SX vmovaps #define STL_SX vmovlps #define STL_SY vmovlps #define STH_SX vmovhps #define STH_SY vmovhps #define EDUP_SY vmovsldup #define ODUP_SY vmovshdup #define EDUP_SX vmovsldup #define ODUP_SX vmovshdup #define ADD_SY vaddps #define ADD_SX vaddps #define SUB_SY vsubps #define SUB_SX vsubps #define ADDSUB_SY vaddsubps #define ADDSUB_SX vaddsubps #define MUL_SY vmulps #define MUL_SX vmulps #define SHUF_SY vperm2f128 #define SHUF_SX vpshufd #define VPERMILP_SY vpermilps #define VPERMILP_SX vpermilps #define BROAD_SY vbroadcastss #define BROAD_SX vbroadcastss #define MOV_SY vmovaps #define MOV_SX vmovaps #define REVS_SY vshufps #define REVS_SX vshufps #define EXTRA_SY vextractf128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADD1_SY ADD_SY #define ADD2_SY ADDSUB_SY #define ADD1_SX ADD_SX #define ADD2_SX ADDSUB_SX #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define ADD1_SY SUB_SY #define ADD2_SY ADDSUB_SY #define ADD1_SX SUB_SX #define ADD2_SX ADDSUB_SX #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADD1_SY SUB_SY #define ADD2_SY ADDSUB_SY #define ADD1_SX SUB_SX #define ADD2_SX ADDSUB_SX #else 
#define ADD1_SY ADD_SY #define ADD2_SY ADDSUB_SY #define ADD1_SX ADD_SX #define ADD2_SX ADDSUB_SX #endif PROLOGUE subq $STACKSIZE, %rsp; movq %rbx, 0(%rsp); movq %rbp, 8(%rsp); movq %r12, 16(%rsp); movq %r13, 24(%rsp); movq %r14, 32(%rsp); movq %r15, 40(%rsp); #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, old_bm movq ARG2, old_bn movq ARG3, old_bk movq OLD_A, ba movq OLD_B, bb movq OLD_C, C movq old_ldc, ldc #ifdef TRMMKERNEL movq old_offset, %r11 #endif movaps %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #else movq old_ldc, ldc #ifdef TRMMKERNEL movq old_offset, %r11; #endif #endif vzeroupper vmovlps %xmm0, MEMALPHA_R vmovlps %xmm1, MEMALPHA_I movq old_bm, bm movq old_bn, bn movq old_bk, bk salq $ZBASE_SHIFT, ldc #ifdef TRMMKERNEL movq %r11, OFFSET #ifndef LEFT negq %r11; #endif movq %r11, kk; #endif MOVQ bn,j; SARQ $2,j; # Rn = 4 JLE .L0_loopE; ALIGN_5; .L0_bodyB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; MOVQ %rax, kk; #endif MOVQ C,C0; LEAQ (C,ldc,2),C1; MOVQ bk, k; SALQ $5, k; LEAQ (bb, k, 1), prebb; # Rn=4, SIZE=4 COMPLEX=2 MOVQ ba,ptrba; MOVQ bm,i; SARQ $3,i; # Rm = 8 JLE .L1_loopE; ALIGN_5; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif # Initial results register PREFETCH0 0*SIZE(prebb); XOR_SY yvec15, yvec15, yvec15; PREFETCH0 16*SIZE(prebb); ADDQ $32*SIZE, prebb; XOR_SY yvec14, yvec14, yvec14; PREFETCH2 3*SIZE(C0); XOR_SY yvec13, yvec13, yvec13; PREFETCH2 3*SIZE(C0, ldc, 1); XOR_SY yvec12, yvec12, yvec12; PREFETCH2 3*SIZE(C1); EDUP_SY 0*SIZE(ptrbb), yvec2; # Br0, Br1, Br2, Br3 PREFETCH2 3*SIZE(C1, ldc, 1); XOR_SY yvec11, yvec11, yvec11; XOR_SY yvec10, yvec10, yvec10; LD_SY 0*SIZE(ptrba), yvec0; # Ar0, Ai0, Ar1, Ai1.. XOR_SY yvec9, yvec9, yvec9; XOR_SY yvec8, yvec8, yvec8; VPERMILP_SY $0x4e, yvec2, yvec3; # Br2, Br3, Br0, Br1 #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; #else ADDQ $4, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2,k; # Unroll 4 times JLE .L2_loopE; ALIGN_5; .L2_bodyB:; # Computing kernel ######### Unroll 1 ################## PREFETCH0 PRESIZE*SIZE(ptrba); LD_SY 8*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5.. MUL_SY yvec0, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD1_SY yvec6, yvec15, yvec15; ADD1_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; ODUP_SY 0*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 MUL_SY yvec1, yvec3, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 ADD1_SY yvec6, yvec14, yvec14; ADD1_SY yvec7, yvec12, yvec12; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. 
ADD1_SY yvec6, yvec11, yvec11; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 MUL_SY yvec1, yvec5, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD1_SY yvec6, yvec10, yvec10; ADD1_SY yvec7, yvec8, yvec8; VPERMILP_SY $0xb1, yvec1, yvec1; MUL_SY yvec0, yvec2, yvec6; MUL_SY yvec0, yvec3, yvec7; ADD2_SY yvec6, yvec15, yvec15; ADD2_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; EDUP_SY 8*SIZE(ptrbb), yvec2; MUL_SY yvec1, yvec3, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; ADD2_SY yvec6, yvec14, yvec14; ADD2_SY yvec7, yvec12, yvec12; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; LD_SY 16*SIZE(ptrba), yvec0; ADD2_SY yvec6, yvec11, yvec11; ADD2_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec6, yvec10, yvec10; ADD2_SY yvec7, yvec8, yvec8; ######### Unroll 2 ################## PREFETCH0 (PRESIZE+16)*SIZE(ptrba); LD_SY 24*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5.. MUL_SY yvec0, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD1_SY yvec6, yvec15, yvec15; ADD1_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; ODUP_SY 8*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 MUL_SY yvec1, yvec3, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 ADD1_SY yvec6, yvec14, yvec14; ADD1_SY yvec7, yvec12, yvec12; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. ADD1_SY yvec6, yvec11, yvec11; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 MUL_SY yvec1, yvec5, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD1_SY yvec6, yvec10, yvec10; ADD1_SY yvec7, yvec8, yvec8; VPERMILP_SY $0xb1, yvec1, yvec1; MUL_SY yvec0, yvec2, yvec6; MUL_SY yvec0, yvec3, yvec7; ADD2_SY yvec6, yvec15, yvec15; ADD2_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; EDUP_SY 16*SIZE(ptrbb), yvec2; MUL_SY yvec1, yvec3, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; ADD2_SY yvec6, yvec14, yvec14; ADD2_SY yvec7, yvec12, yvec12; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; LD_SY 32*SIZE(ptrba), yvec0; ADD2_SY yvec6, yvec11, yvec11; ADD2_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec6, yvec10, yvec10; ADD2_SY yvec7, yvec8, yvec8; ######### Unroll 3 ################## PREFETCH0 (PRESIZE+32)*SIZE(ptrba); LD_SY 40*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5.. MUL_SY yvec0, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD1_SY yvec6, yvec15, yvec15; ADD1_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; ODUP_SY 16*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 MUL_SY yvec1, yvec3, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 ADD1_SY yvec6, yvec14, yvec14; ADD1_SY yvec7, yvec12, yvec12; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. 
ADD1_SY yvec6, yvec11, yvec11; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 MUL_SY yvec1, yvec5, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD1_SY yvec6, yvec10, yvec10; ADD1_SY yvec7, yvec8, yvec8; VPERMILP_SY $0xb1, yvec1, yvec1; MUL_SY yvec0, yvec2, yvec6; MUL_SY yvec0, yvec3, yvec7; ADD2_SY yvec6, yvec15, yvec15; ADD2_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; EDUP_SY 24*SIZE(ptrbb), yvec2; MUL_SY yvec1, yvec3, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; ADD2_SY yvec6, yvec14, yvec14; ADD2_SY yvec7, yvec12, yvec12; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; LD_SY 48*SIZE(ptrba), yvec0; ADD2_SY yvec6, yvec11, yvec11; ADD2_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec6, yvec10, yvec10; ADD2_SY yvec7, yvec8, yvec8; ######### Unroll 4 ################## PREFETCH0 (PRESIZE+48)*SIZE(ptrba); LD_SY 56*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5.. MUL_SY yvec0, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADDQ $64*SIZE, ptrba; ADD1_SY yvec6, yvec15, yvec15; ADD1_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; ODUP_SY 24*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 MUL_SY yvec1, yvec3, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 ADDQ $32*SIZE, ptrbb; ADD1_SY yvec6, yvec14, yvec14; ADD1_SY yvec7, yvec12, yvec12; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. ADD1_SY yvec6, yvec11, yvec11; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 MUL_SY yvec1, yvec5, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD1_SY yvec6, yvec10, yvec10; ADD1_SY yvec7, yvec8, yvec8; VPERMILP_SY $0xb1, yvec1, yvec1; MUL_SY yvec0, yvec2, yvec6; MUL_SY yvec0, yvec3, yvec7; ADD2_SY yvec6, yvec15, yvec15; ADD2_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; EDUP_SY 0*SIZE(ptrbb), yvec2; MUL_SY yvec1, yvec3, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; ADD2_SY yvec6, yvec14, yvec14; ADD2_SY yvec7, yvec12, yvec12; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; LD_SY 0*SIZE(ptrba), yvec0; ADD2_SY yvec6, yvec11, yvec11; ADD2_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec6, yvec10, yvec10; ADD2_SY yvec7, yvec8, yvec8; .L2_bodyE:; DECQ k; JG .L2_bodyB; ALIGN_5 .L2_loopE:; #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L3_loopE; ALIGN_5 .L3_loopB: ######### Unroll 1 ################## PREFETCH0 PRESIZE*SIZE(ptrba) LD_SY 8*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5.. MUL_SY yvec0, yvec2, yvec6; MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 ADD1_SY yvec6, yvec15, yvec15; ADD1_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; MUL_SY yvec1, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD1_SY yvec6, yvec14, yvec14; ADD1_SY yvec7, yvec12, yvec12; ODUP_SY 0*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 ADD1_SY yvec6, yvec11, yvec11; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. 
ADD1_SY yvec6, yvec10, yvec10; ADD1_SY yvec7, yvec8, yvec8; VPERMILP_SY $0xb1, yvec1, yvec1; MUL_SY yvec0, yvec2, yvec6; MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 ADD2_SY yvec6, yvec15, yvec15; ADD2_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; MUL_SY yvec1, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD2_SY yvec6, yvec14, yvec14; ADD2_SY yvec7, yvec12, yvec12; EDUP_SY 8*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; ADD2_SY yvec6, yvec11, yvec11; ADD2_SY yvec7, yvec9, yvec9; LD_SY 16*SIZE(ptrba), yvec0; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec6, yvec10, yvec10; ADD2_SY yvec7, yvec8, yvec8; ######### Unroll 2 ################## PREFETCH0 (PRESIZE+16)*SIZE(ptrba) LD_SY 24*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5.. MUL_SY yvec0, yvec2, yvec6; MUL_SY yvec0, yvec3, yvec7; ADDQ $32*SIZE, ptrba SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 ADD1_SY yvec6, yvec15, yvec15; ADD1_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; MUL_SY yvec1, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD1_SY yvec6, yvec14, yvec14; ADD1_SY yvec7, yvec12, yvec12; ODUP_SY 8*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; ADDQ $16*SIZE, ptrbb; VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 ADD1_SY yvec6, yvec11, yvec11; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. ADD1_SY yvec6, yvec10, yvec10; ADD1_SY yvec7, yvec8, yvec8; VPERMILP_SY $0xb1, yvec1, yvec1; MUL_SY yvec0, yvec2, yvec6; MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 ADD2_SY yvec6, yvec15, yvec15; ADD2_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; MUL_SY yvec1, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD2_SY yvec6, yvec14, yvec14; ADD2_SY yvec7, yvec12, yvec12; EDUP_SY 0*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; ADD2_SY yvec6, yvec11, yvec11; ADD2_SY yvec7, yvec9, yvec9; LD_SY 0*SIZE(ptrba), yvec0; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec6, yvec10, yvec10; ADD2_SY yvec7, yvec8, yvec8; .L3_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L4_loopE; ALIGN_5 .L4_loopB:; ######### Unroll 1 ################## PREFETCH0 PRESIZE*SIZE(ptrba) LD_SY 8*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5.. MUL_SY yvec0, yvec2, yvec6; MUL_SY yvec0, yvec3, yvec7; ADDQ $16*SIZE, ptrba; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 ADD1_SY yvec6, yvec15, yvec15; ADD1_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; MUL_SY yvec1, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD1_SY yvec6, yvec14, yvec14; ADD1_SY yvec7, yvec12, yvec12; ODUP_SY 0*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; ADDQ $8*SIZE, ptrbb; VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 ADD1_SY yvec6, yvec11, yvec11; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. 
ADD1_SY yvec6, yvec10, yvec10; ADD1_SY yvec7, yvec8, yvec8; VPERMILP_SY $0xb1, yvec1, yvec1; MUL_SY yvec0, yvec2, yvec6; MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 ADD2_SY yvec6, yvec15, yvec15; ADD2_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; ADD2_SY yvec6, yvec14, yvec14; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 MUL_SY yvec1, yvec3, yvec7; ADD2_SY yvec7, yvec12, yvec12; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; ADD2_SY yvec6, yvec11, yvec11; ADD2_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec6, yvec10, yvec10; ADD2_SY yvec7, yvec8, yvec8; .L4_loopE:; #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ADDSUB_SY yvec15, yvec7, yvec15; ADDSUB_SY yvec14, yvec7, yvec14; ADDSUB_SY yvec13, yvec7, yvec13; ADDSUB_SY yvec12, yvec7, yvec12; ADDSUB_SY yvec11, yvec7, yvec11; ADDSUB_SY yvec10, yvec7, yvec10; ADDSUB_SY yvec9, yvec7, yvec9; ADDSUB_SY yvec8, yvec7, yvec8; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_SY yvec15, yvec7, yvec15; SUB_SY yvec14, yvec7, yvec14; SUB_SY yvec13, yvec7, yvec13; SUB_SY yvec12, yvec7, yvec12; SUB_SY yvec11, yvec7, yvec11; SUB_SY yvec10, yvec7, yvec10; SUB_SY yvec9, yvec7, yvec9; SUB_SY yvec8, yvec7, yvec8; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_SY $0xb1, yvec15, yvec15; VPERMILP_SY $0xb1, yvec14, yvec14; VPERMILP_SY $0xb1, yvec13, yvec13; VPERMILP_SY $0xb1, yvec12, yvec12; VPERMILP_SY $0xb1, yvec11, yvec11; VPERMILP_SY $0xb1, yvec10, yvec10; VPERMILP_SY $0xb1, yvec9, yvec9; VPERMILP_SY $0xb1, yvec8, yvec8; ADDSUB_SY yvec15, yvec7, yvec15; ADDSUB_SY yvec14, yvec7, yvec14; ADDSUB_SY yvec13, yvec7, yvec13; ADDSUB_SY yvec12, yvec7, yvec12; ADDSUB_SY yvec11, yvec7, yvec11; ADDSUB_SY yvec10, yvec7, yvec10; ADDSUB_SY yvec9, yvec7, yvec9; ADDSUB_SY yvec8, yvec7, yvec8; VPERMILP_SY $0xb1, yvec15, yvec15; VPERMILP_SY $0xb1, yvec14, yvec14; VPERMILP_SY $0xb1, yvec13, yvec13; VPERMILP_SY $0xb1, yvec12, yvec12; VPERMILP_SY $0xb1, yvec11, yvec11; VPERMILP_SY $0xb1, yvec10, yvec10; VPERMILP_SY $0xb1, yvec9, yvec9; VPERMILP_SY $0xb1, yvec8, yvec8; #endif ##### Load Alpha #### BROAD_SY MEMALPHA_R,yvec7; BROAD_SY MEMALPHA_I,yvec6; ##### Multiply Alpha #### VPERMILP_SY $0xb1,yvec15, yvec5; MUL_SY yvec15, yvec7, yvec15; MUL_SY yvec5, yvec6, yvec5; ADDSUB_SY yvec5, yvec15, yvec15; VPERMILP_SY $0xb1,yvec14, yvec4; MUL_SY yvec14, yvec7, yvec14; MUL_SY yvec4, yvec6, yvec4; ADDSUB_SY yvec4, yvec14, yvec14; VPERMILP_SY $0xb1,yvec13, yvec3; MUL_SY yvec13, yvec7, yvec13; MUL_SY yvec3, yvec6, yvec3; ADDSUB_SY yvec3, yvec13, yvec13; VPERMILP_SY $0xb1,yvec12, yvec2; MUL_SY yvec12, yvec7, yvec12; MUL_SY yvec2, yvec6, yvec2; ADDSUB_SY yvec2, yvec12, yvec12; VPERMILP_SY $0xb1,yvec11, yvec1; MUL_SY yvec11, yvec7, yvec11; MUL_SY yvec1, yvec6, yvec1; ADDSUB_SY yvec1, yvec11, yvec11; VPERMILP_SY $0xb1,yvec10, yvec0; MUL_SY yvec10, yvec7, yvec10; MUL_SY yvec0, yvec6, yvec0; ADDSUB_SY yvec0, yvec10, yvec10; VPERMILP_SY $0xb1,yvec9, yvec5; MUL_SY yvec9, yvec7, yvec9; MUL_SY yvec5, yvec6, yvec5; ADDSUB_SY yvec5, yvec9, yvec9; VPERMILP_SY $0xb1,yvec8, yvec4; MUL_SY yvec8, yvec7, yvec8; MUL_SY yvec4, yvec6, yvec4; ADDSUB_SY yvec4, yvec8, yvec8; #### Shuffle Results #### MOV_SY yvec15,yvec7; REVS_SY $0xe4,yvec13,yvec15,yvec15; REVS_SY $0xe4,yvec7,yvec13,yvec13; MOV_SY yvec14,yvec7; REVS_SY $0xe4,yvec12,yvec14,yvec14; REVS_SY 
$0xe4,yvec7,yvec12,yvec12; MOV_SY yvec11,yvec7; REVS_SY $0xe4,yvec9,yvec11,yvec11; REVS_SY $0xe4,yvec7,yvec9,yvec9; MOV_SY yvec10,yvec7; REVS_SY $0xe4,yvec8,yvec10,yvec10; REVS_SY $0xe4,yvec7,yvec8,yvec8; #### Store Back #### #### Testing alignment #### MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L4_loopEx; ALIGN_5 EXTRA_SY $1,yvec15,xvec7; EXTRA_SY $1,yvec14,xvec6; EXTRA_SY $1,yvec13,xvec5; EXTRA_SY $1,yvec12,xvec4; EXTRA_SY $1,yvec11,xvec3; EXTRA_SY $1,yvec10,xvec2; EXTRA_SY $1,yvec9,xvec1; EXTRA_SY $1,yvec8,xvec0; #ifndef TRMMKERNEL ADD_SY 0*SIZE(C0),xvec15, xvec15; ADD_SY 4*SIZE(C1),xvec7, xvec7; ADD_SY 8*SIZE(C0),xvec14, xvec14; ADD_SY 12*SIZE(C1),xvec6, xvec6; ADD_SY 0*SIZE(C0,ldc,1),xvec13, xvec13; ADD_SY 4*SIZE(C1,ldc,1),xvec5, xvec5; ADD_SY 8*SIZE(C0,ldc,1),xvec12, xvec12; ADD_SY 12*SIZE(C1,ldc,1),xvec4, xvec4; ADD_SY 0*SIZE(C1),xvec11, xvec11; ADD_SY 4*SIZE(C0),xvec3, xvec3; ADD_SY 8*SIZE(C1),xvec10, xvec10; ADD_SY 12*SIZE(C0),xvec2, xvec2; ADD_SY 0*SIZE(C1,ldc,1),xvec9, xvec9; ADD_SY 4*SIZE(C0,ldc,1),xvec1, xvec1; ADD_SY 8*SIZE(C1,ldc,1),xvec8, xvec8; ADD_SY 12*SIZE(C0,ldc,1),xvec0, xvec0; #endif ST_SY xvec15,0*SIZE(C0); ST_SY xvec7,4*SIZE(C1); ST_SY xvec14,8*SIZE(C0); ST_SY xvec6,12*SIZE(C1); ST_SY xvec13,0*SIZE(C0,ldc,1); ST_SY xvec5,4*SIZE(C1,ldc,1); ST_SY xvec12,8*SIZE(C0,ldc,1); ST_SY xvec4,12*SIZE(C1,ldc,1); ST_SY xvec11,0*SIZE(C1); ST_SY xvec3,4*SIZE(C0); ST_SY xvec10,8*SIZE(C1); ST_SY xvec2,12*SIZE(C0); ST_SY xvec9,0*SIZE(C1,ldc,1); ST_SY xvec1,4*SIZE(C0,ldc,1); ST_SY xvec8,8*SIZE(C1,ldc,1); ST_SY xvec0,12*SIZE(C0,ldc,1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $8, kk; #endif ADDQ $16*SIZE,C0; ADDQ $16*SIZE,C1; .L1_bodyE:; DECQ i; JG .L1_bodyB; JMP .L1_loopE; ALIGN_5 .L4_loopEx: EXTRA_SY $1, yvec15, xvec7; #ifndef TRMMKERNEL LDL_SY 0*SIZE(C0), xvec6, xvec6; LDH_SY 2*SIZE(C0), xvec6, xvec6; ADD_SY xvec6, xvec15, xvec15; #endif STL_SY xvec15, 0*SIZE(C0); STH_SY xvec15, 2*SIZE(C0); #ifndef TRMMKERNEL LDL_SY 4*SIZE(C1), xvec5, xvec5; LDH_SY 6*SIZE(C1), xvec5, xvec5; ADD_SY xvec5, xvec7, xvec7; #endif STL_SY xvec7, 4*SIZE(C1); STH_SY xvec7, 6*SIZE(C1); EXTRA_SY $1, yvec14, xvec6; #ifndef TRMMKERNEL LDL_SY 8*SIZE(C0), xvec5, xvec5; LDH_SY 10*SIZE(C0), xvec5, xvec5; ADD_SY xvec5, xvec14, xvec14; #endif STL_SY xvec14, 8*SIZE(C0); STH_SY xvec14, 10*SIZE(C0); #ifndef TRMMKERNEL LDL_SY 12*SIZE(C1), xvec4, xvec4; LDH_SY 14*SIZE(C1), xvec4, xvec4; ADD_SY xvec4, xvec6, xvec6; #endif STL_SY xvec6, 12*SIZE(C1); STH_SY xvec6, 14*SIZE(C1); EXTRA_SY $1, yvec13, xvec5; #ifndef TRMMKERNEL LDL_SY 0*SIZE(C0, ldc, 1), xvec4, xvec4; LDH_SY 2*SIZE(C0, ldc, 1), xvec4, xvec4; ADD_SY xvec4, xvec13, xvec13; #endif STL_SY xvec13, 0*SIZE(C0, ldc, 1); STH_SY xvec13, 2*SIZE(C0, ldc, 1); #ifndef TRMMKERNEL LDL_SY 4*SIZE(C1, ldc, 1), xvec3, xvec3; LDH_SY 6*SIZE(C1, ldc, 1), xvec3, xvec3; ADD_SY xvec3, xvec5, xvec5; #endif STL_SY xvec5, 4*SIZE(C1, ldc, 1); STH_SX xvec5, 6*SIZE(C1, ldc, 1); EXTRA_SY $1, yvec12, xvec4; #ifndef TRMMKERNEL LDL_SY 8*SIZE(C0, ldc, 1), xvec3, xvec3; LDH_SY 10*SIZE(C0, ldc, 1), xvec3, xvec3; ADD_SY xvec3, xvec12, xvec12; #endif STL_SY xvec12, 8*SIZE(C0, ldc, 1); STH_SY xvec12, 10*SIZE(C0, ldc, 1); #ifndef TRMMKERNEL LDL_SY 12*SIZE(C1, ldc, 1), xvec2, xvec2; LDH_SY 14*SIZE(C1, ldc, 1), xvec2, xvec2; ADD_SY xvec2, xvec4, xvec4; 
#endif STL_SY xvec4, 12*SIZE(C1, ldc, 1); STH_SY xvec4, 14*SIZE(C1, ldc, 1); EXTRA_SY $1, yvec11, xvec3; #ifndef TRMMKERNEL LDL_SY 0*SIZE(C1), xvec2, xvec2; LDH_SY 2*SIZE(C1), xvec2, xvec2; ADD_SY xvec2, xvec11, xvec11; #endif STL_SY xvec11, 0*SIZE(C1); STH_SY xvec11, 2*SIZE(C1); #ifndef TRMMKERNEL LDL_SY 4*SIZE(C0), xvec1, xvec1; LDH_SY 6*SIZE(C0), xvec1, xvec1; ADD_SY xvec1, xvec3, xvec3; #endif STL_SY xvec3, 4*SIZE(C0); STH_SY xvec3, 6*SIZE(C0); EXTRA_SY $1, yvec10, xvec2; #ifndef TRMMKERNEL LDL_SY 8*SIZE(C1), xvec1, xvec1; LDH_SY 10*SIZE(C1), xvec1, xvec1; ADD_SY xvec1, xvec10, xvec10; #endif STL_SY xvec10, 8*SIZE(C1); STH_SY xvec10, 10*SIZE(C1); #ifndef TRMMKERNEL LDL_SY 12*SIZE(C0), xvec0, xvec0; LDH_SY 14*SIZE(C0), xvec0, xvec0; ADD_SY xvec0, xvec2, xvec2; #endif STL_SY xvec2, 12*SIZE(C0); STH_SY xvec2, 14*SIZE(C0); EXTRA_SY $1, yvec9, xvec1; #ifndef TRMMKERNEL LDL_SY 0*SIZE(C1, ldc, 1), xvec7, xvec7; LDH_SY 2*SIZE(C1, ldc, 1), xvec7, xvec7; ADD_SY xvec7, xvec9, xvec9; #endif STL_SY xvec9, 0*SIZE(C1, ldc, 1); STH_SY xvec9, 2*SIZE(C1, ldc, 1); #ifndef TRMMKERNEL LDL_SY 4*SIZE(C0, ldc, 1), xvec6, xvec6; LDH_SY 6*SIZE(C0, ldc, 1), xvec6, xvec6; ADD_SY xvec6, xvec1, xvec1; #endif STL_SY xvec1, 4*SIZE(C0, ldc, 1); STH_SY xvec1, 6*SIZE(C0, ldc, 1); EXTRA_SY $1, yvec8, xvec0; #ifndef TRMMKERNEL LDL_SY 8*SIZE(C1, ldc, 1), xvec6, xvec6; LDH_SY 10*SIZE(C1, ldc, 1), xvec6, xvec6; ADD_SY xvec6, xvec8, xvec8; #endif STL_SY xvec8, 8*SIZE(C1, ldc, 1); STH_SY xvec8, 10*SIZE(C1, ldc, 1); #ifndef TRMMKERNEL LDL_SY 12*SIZE(C0, ldc, 1), xvec5, xvec5; LDH_SY 14*SIZE(C0, ldc, 1), xvec5, xvec5; ADD_SY xvec5, xvec0, xvec0; #endif STL_SY xvec0, 12*SIZE(C0, ldc, 1); STH_SY xvec0, 14*SIZE(C0, ldc, 1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $8, kk; #endif ADDQ $16*SIZE, C0; ADDQ $16*SIZE, C1; DECQ i; JG .L1_bodyB; ALIGN_5; .L1_loopE:; TEST $4, bm; JLE .L5_loopE; ALIGN_5 .L5_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec13, yvec13, yvec13; XOR_SY yvec11, yvec11, yvec11; XOR_SY yvec9, yvec9, yvec9; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; #else ADDQ $4, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L8_loopE; ALIGN_5 .L8_bodyB: #### Unroll times 1 #### LD_SY 0*SIZE(ptrba), yvec0; VPERMILP_SY $0xb1, yvec0, yvec1; EDUP_SY 0*SIZE(ptrbb), yvec2; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec0, yvec3, yvec7; ADD1_SY yvec7, yvec13, yvec13; ODUP_SY 0*SIZE(ptrbb), yvec2; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec0, yvec4, yvec6; ADD1_SY yvec6, yvec11, yvec11; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec5, yvec7; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec2, yvec6; ADD2_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec1, yvec3, yvec7; ADD2_SY yvec7, yvec13, yvec13; SHUF_SY $0x03, yvec3, yvec3, yvec5; 
MUL_SY yvec1, yvec4, yvec6; ADD2_SY yvec6, yvec11, yvec11; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec7, yvec9, yvec9; #### Unroll time 2 #### LD_SY 8*SIZE(ptrba), yvec0; VPERMILP_SY $0xb1, yvec0, yvec1; EDUP_SY 8*SIZE(ptrbb), yvec2; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec0, yvec3, yvec7; ADD1_SY yvec7, yvec13, yvec13; ODUP_SY 8*SIZE(ptrbb), yvec2; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec0, yvec4, yvec6; ADD1_SY yvec6, yvec11, yvec11; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec5, yvec7; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec2, yvec6; ADD2_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec1, yvec3, yvec7; ADD2_SY yvec7, yvec13, yvec13; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec1, yvec4, yvec6; ADD2_SY yvec6, yvec11, yvec11; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec7, yvec9, yvec9; #### Unroll time 3 #### LD_SY 16*SIZE(ptrba), yvec0; VPERMILP_SY $0xb1, yvec0, yvec1; EDUP_SY 16*SIZE(ptrbb), yvec2; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec0, yvec3, yvec7; ADD1_SY yvec7, yvec13, yvec13; ODUP_SY 16*SIZE(ptrbb), yvec2; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec0, yvec4, yvec6; ADD1_SY yvec6, yvec11, yvec11; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec5, yvec7; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec2, yvec6; ADD2_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec1, yvec3, yvec7; ADD2_SY yvec7, yvec13, yvec13; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec1, yvec4, yvec6; ADD2_SY yvec6, yvec11, yvec11; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec7, yvec9, yvec9; #### Unroll time 3 #### LD_SY 24*SIZE(ptrba), yvec0; VPERMILP_SY $0xb1, yvec0, yvec1; EDUP_SY 24*SIZE(ptrbb), yvec2; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec0, yvec3, yvec7; ADD1_SY yvec7, yvec13, yvec13; ODUP_SY 24*SIZE(ptrbb), yvec2; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec0, yvec4, yvec6; ADD1_SY yvec6, yvec11, yvec11; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec5, yvec7; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec2, yvec6; ADD2_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec1, yvec3, yvec7; ADD2_SY yvec7, yvec13, yvec13; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec1, yvec4, yvec6; ADD2_SY yvec6, yvec11, yvec11; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec7, yvec9, yvec9; ADDQ $32*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L8_bodyB; ALIGN_5 .L8_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L9_loopE; ALIGN_5 .L9_bodyB: #### Unroll times 1 #### LD_SY 0*SIZE(ptrba), yvec0; VPERMILP_SY $0xb1, yvec0, yvec1; EDUP_SY 0*SIZE(ptrbb), yvec2; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec0, yvec3, yvec7; ADD1_SY yvec7, yvec13, yvec13; ODUP_SY 0*SIZE(ptrbb), yvec2; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec0, yvec4, yvec6; ADD1_SY yvec6, yvec11, yvec11; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec5, yvec7; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec2, yvec6; ADD2_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec1, yvec3, yvec7; ADD2_SY yvec7, yvec13, yvec13; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec1, yvec4, yvec6; ADD2_SY yvec6, yvec11, yvec11; MUL_SY yvec1, 
yvec5, yvec7; ADD2_SY yvec7, yvec9, yvec9; #### Unroll time 2 #### LD_SY 8*SIZE(ptrba), yvec0; VPERMILP_SY $0xb1, yvec0, yvec1; EDUP_SY 8*SIZE(ptrbb), yvec2; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec0, yvec3, yvec7; ADD1_SY yvec7, yvec13, yvec13; ODUP_SY 8*SIZE(ptrbb), yvec2; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec0, yvec4, yvec6; ADD1_SY yvec6, yvec11, yvec11; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec5, yvec7; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec2, yvec6; ADD2_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec1, yvec3, yvec7; ADD2_SY yvec7, yvec13, yvec13; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec1, yvec4, yvec6; ADD2_SY yvec6, yvec11, yvec11; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec7, yvec9, yvec9; ADDQ $16*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; .L9_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L10_loopE; ALIGN_5 .L10_bodyB: #### Unroll times 1 #### LD_SY 0*SIZE(ptrba), yvec0; VPERMILP_SY $0xb1, yvec0, yvec1; EDUP_SY 0*SIZE(ptrbb), yvec2; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec0, yvec3, yvec7; ADD1_SY yvec7, yvec13, yvec13; ODUP_SY 0*SIZE(ptrbb), yvec2; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec0, yvec4, yvec6; ADD1_SY yvec6, yvec11, yvec11; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec5, yvec7; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec2, yvec6; ADD2_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec1, yvec3, yvec7; ADD2_SY yvec7, yvec13, yvec13; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec1, yvec4, yvec6; ADD2_SY yvec6, yvec11, yvec11; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec7, yvec9, yvec9; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L10_loopE: #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ADDSUB_SY yvec15, yvec7, yvec15; ADDSUB_SY yvec13, yvec7, yvec13; ADDSUB_SY yvec11, yvec7, yvec11; ADDSUB_SY yvec9, yvec7, yvec9; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_SY yvec15, yvec7, yvec15; SUB_SY yvec13, yvec7, yvec13; SUB_SY yvec11, yvec7, yvec11; SUB_SY yvec9, yvec7, yvec9; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_SY $0xb1, yvec15, yvec15; VPERMILP_SY $0xb1, yvec13, yvec13; VPERMILP_SY $0xb1, yvec11, yvec11; VPERMILP_SY $0xb1, yvec9, yvec9; ADDSUB_SY yvec15, yvec7, yvec15; ADDSUB_SY yvec13, yvec7, yvec13; ADDSUB_SY yvec11, yvec7, yvec11; ADDSUB_SY yvec9, yvec7, yvec9; VPERMILP_SY $0xb1, yvec15, yvec15; VPERMILP_SY $0xb1, yvec13, yvec13; VPERMILP_SY $0xb1, yvec11, yvec11; VPERMILP_SY $0xb1, yvec9, yvec9; #endif ##### Load Alpha #### BROAD_SY MEMALPHA_R,yvec7; BROAD_SY MEMALPHA_I,yvec6; ##### Multiply Alpha #### VPERMILP_SY $0xb1,yvec15, yvec5; MUL_SY yvec15, yvec7, yvec15; MUL_SY yvec5, yvec6, yvec5; ADDSUB_SY yvec5, yvec15, yvec15; VPERMILP_SY $0xb1,yvec13, yvec3; MUL_SY yvec13, yvec7, yvec13; MUL_SY yvec3, yvec6, yvec3; ADDSUB_SY yvec3, yvec13, yvec13; VPERMILP_SY $0xb1,yvec11, yvec1; MUL_SY yvec11, yvec7, yvec11; MUL_SY yvec1, yvec6, yvec1; ADDSUB_SY yvec1, yvec11, yvec11; VPERMILP_SY $0xb1,yvec9, yvec5; MUL_SY yvec9, yvec7, yvec9; MUL_SY yvec5, yvec6, yvec5; ADDSUB_SY yvec5, yvec9, yvec9; #### Writing back #### #### Shuffle Results #### MOV_SY yvec15,yvec7; REVS_SY $0xe4,yvec13,yvec15,yvec15; REVS_SY $0xe4,yvec7,yvec13,yvec13; MOV_SY yvec11,yvec7; REVS_SY 
$0xe4,yvec9,yvec11,yvec11; REVS_SY $0xe4,yvec7,yvec9,yvec9; #### Writing back #### EXTRA_SY $1, yvec15, xvec7; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec6, xvec6; LDH_SX 2*SIZE(C0), xvec6, xvec6; ADD_SX xvec6, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); #ifndef TRMMKERNEL LDL_SX 4*SIZE(C1), xvec4, xvec4; LDH_SX 6*SIZE(C1), xvec4, xvec4; ADD_SX xvec4, xvec7, xvec7; #endif STL_SX xvec7, 4*SIZE(C1); STH_SX xvec7, 6*SIZE(C1); EXTRA_SY $1, yvec13, xvec5; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0, ldc, 1), xvec4, xvec4; LDH_SX 2*SIZE(C0, ldc, 1), xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; #endif STL_SX xvec13, 0*SIZE(C0, ldc, 1); STH_SX xvec13, 2*SIZE(C0, ldc, 1); #ifndef TRMMKERNEL LDL_SX 4*SIZE(C1, ldc, 1), xvec2, xvec2; LDH_SX 6*SIZE(C1, ldc, 1), xvec2, xvec2; ADD_SX xvec2, xvec5, xvec5; #endif STL_SX xvec5, 4*SIZE(C1, ldc, 1); STH_SX xvec5, 6*SIZE(C1, ldc, 1); EXTRA_SY $1, yvec11, xvec3; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C1), xvec2, xvec2; LDH_SX 2*SIZE(C1), xvec2, xvec2; ADD_SX xvec2, xvec11, xvec11; #endif STL_SX xvec11, 0*SIZE(C1); STH_SX xvec11, 2*SIZE(C1); #ifndef TRMMKERNEL LDL_SX 4*SIZE(C0), xvec0, xvec0; LDH_SX 6*SIZE(C0), xvec0, xvec0; ADD_SX xvec0, xvec3, xvec3; #endif STL_SX xvec3, 4*SIZE(C0); STH_SX xvec3, 6*SIZE(C0); EXTRA_SY $1, yvec9, xvec1; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C1, ldc, 1), xvec0, xvec0; LDH_SX 2*SIZE(C1, ldc, 1), xvec0, xvec0; ADD_SX xvec0, xvec9, xvec9; #endif STL_SX xvec9, 0*SIZE(C1, ldc, 1); STH_SX xvec9, 2*SIZE(C1, ldc, 1); #ifndef TRMMKERNEL LDL_SX 4*SIZE(C0, ldc, 1), xvec6, xvec6; LDH_SX 6*SIZE(C0, ldc, 1), xvec6, xvec6; ADD_SX xvec6, xvec1, xvec1; #endif STL_SX xvec1, 4*SIZE(C0, ldc, 1); STH_SX xvec1, 6*SIZE(C0, ldc, 1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk; #endif ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; .L5_loopE: TEST $2, bm; JLE .L6_loopE; ALIGN_5 .L6_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #### Initial Results Register #### XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec14, yvec14, yvec14; XOR_SY yvec13, yvec13, yvec13; XOR_SY yvec12, yvec12, yvec12; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; #else ADDQ $4, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L11_loopE; ALIGN_5 .L11_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec13, xvec13; MUL_SX xvec0, xvec5, xvec5; ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; MUL_SX xvec1, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, 
xvec5; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, xvec13, xvec13; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 8*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec13, xvec13; MUL_SX xvec0, xvec5, xvec5; ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; MUL_SX xvec1, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, xvec13, xvec13; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec12, xvec12; LD_SX 8*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 16*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 20*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec13, xvec13; MUL_SX xvec0, xvec5, xvec5; ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 16*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; MUL_SX xvec1, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 20*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, xvec13, xvec13; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec12, xvec12; LD_SX 12*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 24*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 28*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec13, xvec13; MUL_SX xvec0, xvec5, xvec5; ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 24*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; MUL_SX xvec1, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 28*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, xvec13, xvec13; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec12, xvec12; ADDQ $16*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L11_bodyB; ALIGN_5 .L11_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L12_loopE; ALIGN_5 .L12_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec13, xvec13; MUL_SX xvec0, xvec5, xvec5; ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; MUL_SX xvec1, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, 
xvec13, xvec13; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 8*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec13, xvec13; MUL_SX xvec0, xvec5, xvec5; ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; MUL_SX xvec1, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, xvec13, xvec13; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec12, xvec12; ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; .L12_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L13_loopE; ALIGN_5 .L13_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec13, xvec13; MUL_SX xvec0, xvec5, xvec5; ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; MUL_SX xvec1, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, xvec13, xvec13; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec12, xvec12; ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L13_loopE: #### Handle #### #if defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; SHUF_SX $0xb1, xvec13, xvec13; SHUF_SX $0xb1, xvec12, xvec12; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; SHUF_SX $0xb1, xvec13, xvec13; SHUF_SX $0xb1, xvec12, xvec12; #endif ##### Load Alpha #### BROAD_SX MEMALPHA_R,xvec7; BROAD_SX MEMALPHA_I,xvec6; ##### Multiply Alpha #### VPERMILP_SX $0xb1,xvec15, xvec5; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec6, xvec5, 
xvec5; ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec14, xvec4; MUL_SX xvec7, xvec14, xvec14; MUL_SX xvec6, xvec4, xvec4; ADDSUB_SX xvec4, xvec14, xvec14; VPERMILP_SX $0xb1,xvec13, xvec3; MUL_SX xvec7, xvec13, xvec13; MUL_SX xvec6, xvec3, xvec3; ADDSUB_SX xvec3, xvec13, xvec13; VPERMILP_SX $0xb1,xvec12, xvec2; MUL_SX xvec7, xvec12, xvec12; MUL_SX xvec6, xvec2, xvec2; ADDSUB_SX xvec2, xvec12, xvec12; #### Writing back #### #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 2*SIZE(C0, ldc,1), xvec0, xvec0; LDL_SX 0*SIZE(C0, ldc,1), xvec1, xvec1; LDH_SX 2*SIZE(C0), xvec1, xvec1; LDL_SX 0*SIZE(C1), xvec2, xvec2; LDH_SX 2*SIZE(C1, ldc, 1), xvec2, xvec2; LDL_SX 0*SIZE(C1, ldc, 1), xvec3, xvec3; LDH_SX 2*SIZE(C1), xvec3, xvec3; ADD_SX xvec0, xvec15, xvec15; ADD_SX xvec1, xvec14, xvec14; ADD_SX xvec2, xvec13, xvec13; ADD_SX xvec3, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0, ldc, 1); STL_SX xvec14, 0*SIZE(C0, ldc, 1); STH_SX xvec14, 2*SIZE(C0); STL_SX xvec13, 0*SIZE(C1); STH_SX xvec13, 2*SIZE(C1, ldc, 1); STL_SX xvec12, 0*SIZE(C1, ldc, 1); STH_SX xvec12, 2*SIZE(C1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk; #endif ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; .L6_loopE: TEST $1, bm; JLE .L7_loopE; ALIGN_5 .L7_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec14, yvec14, yvec14; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; #else ADDQ $4, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L14_loopE; ALIGN_5 .L14_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec14, xvec14; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; LD_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 3*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec14, xvec14; BROAD_SX 4*SIZE(ptrba), xvec0; LD_SX 16*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; LD_SX 20*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 5*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec14, xvec14; BROAD_SX 6*SIZE(ptrba), xvec0; LD_SX 24*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; LD_SX 
28*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 7*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec14, xvec14; ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L14_bodyB; ALIGN_5 .L14_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L15_loopE; ALIGN_5 .L15_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec14, xvec14; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; LD_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 3*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec14, xvec14; ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; .L15_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L16_loopE; ALIGN_5 .L16_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec14, xvec14; ADDQ $2*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L16_loopE: #### Handle #### #if defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; #endif ##### Load Alpha #### BROAD_SX MEMALPHA_R,xvec7; BROAD_SX MEMALPHA_I,xvec6; ##### Multiply Alpha #### VPERMILP_SX $0xb1,xvec15, xvec5; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec6, xvec5, xvec5; ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec14, xvec4; MUL_SX xvec7, xvec14, xvec14; MUL_SX xvec6, xvec4, xvec4; ADDSUB_SX xvec4, xvec14, xvec14; #### Writing back #### #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 0*SIZE(C0, ldc, 1), xvec0, xvec0; LDL_SX 0*SIZE(C1), xvec1, xvec1; LDH_SX 0*SIZE(C1, ldc, 1), xvec1, xvec1; ADD_SX xvec0, xvec15, xvec15; ADD_SX xvec1, xvec14, xvec14; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 0*SIZE(C0, ldc, 1); STL_SX xvec14, 0*SIZE(C1); STH_SX xvec14, 0*SIZE(C1, ldc, 1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $1, kk; #endif ADDQ $2*SIZE, C0; ADDQ $2*SIZE, C1; .L7_loopE: #if defined(TRMMKERNEL) && !defined(LEFT) ADDQ $4, kk; #endif MOVQ bk,k; SALQ $5,k; ADDQ k,bb; LEAQ (C,ldc,4),C; .L0_bodyE:; DECQ j; JG .L0_bodyB; ALIGN_5; .L0_loopE:; TEST $2, bn; JLE .L20_loopE; ALIGN_5 .L20_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; MOVQ %rax, kk; #endif MOVQ C, C0; LEAQ (C, ldc, 1), C1; MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L21_loopE; ALIGN_5 .L21_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec14, yvec14, yvec14; XOR_SY yvec13, yvec13, yvec13; XOR_SY yvec12, yvec12, yvec12; XOR_SY yvec11, yvec11, yvec11; XOR_SY yvec10, yvec10, yvec10; XOR_SY yvec9, yvec9, yvec9; XOR_SY yvec8, yvec8, yvec8; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; #else ADDQ $2, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L211_loopE; ALIGN_5 .L211_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec13, xvec13; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec9, xvec9; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec12, xvec12; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec8, xvec8; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 16*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 20*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, 
xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; LD_SX 24*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec13, xvec13; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec9, xvec9; LD_SX 28*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec12, xvec12; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec8, xvec8; EDUP_SX 8*SIZE(ptrbb), xvec4; ODUP_SX 8*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 32*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 36*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; LD_SX 40*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec13, xvec13; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec9, xvec9; LD_SX 44*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec12, xvec12; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec8, xvec8; EDUP_SX 12*SIZE(ptrbb), xvec4; ODUP_SX 12*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 48*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 52*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; LD_SX 56*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec13, xvec13; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec9, xvec9; LD_SX 60*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, 
xvec1, xvec1; ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec12, xvec12; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec8, xvec8; ADDQ $64*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L211_bodyB; ALIGN_5 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L212_loopE; ALIGN_5 .L212_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec13, xvec13; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec9, xvec9; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec12, xvec12; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec8, xvec8; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 16*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 20*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; LD_SX 24*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec13, xvec13; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec9, xvec9; LD_SX 28*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec12, xvec12; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec8, xvec8; ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L212_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L213_loopE; ALIGN_5 .L213_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 
0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec13, xvec13; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec9, xvec9; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec12, xvec12; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec8, xvec8; ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb .L213_loopE: #### Handle #### #if defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec9, xvec7, xvec7; MOV_SX xvec7, xvec9; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec8, xvec7, xvec7; MOV_SX xvec7, xvec8; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec9, xvec7, xvec7; MOV_SX xvec7, xvec9; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec8, xvec7, xvec7; MOV_SX xvec7, xvec8; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; SHUF_SX $0xb1, xvec13, xvec13; SHUF_SX $0xb1, xvec12, xvec12; SHUF_SX $0xb1, xvec11, xvec11; SHUF_SX $0xb1, xvec10, xvec10; SHUF_SX $0xb1, xvec9, xvec9; SHUF_SX $0xb1, xvec8, xvec8; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; 
ADDSUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec9, xvec7, xvec7; MOV_SX xvec7, xvec9; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec8, xvec7, xvec7; MOV_SX xvec7, xvec8; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; SHUF_SX $0xb1, xvec13, xvec13; SHUF_SX $0xb1, xvec12, xvec12; SHUF_SX $0xb1, xvec11, xvec11; SHUF_SX $0xb1, xvec10, xvec10; SHUF_SX $0xb1, xvec9, xvec9; SHUF_SX $0xb1, xvec8, xvec8; #endif #### Mulitply Alpha #### BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; #### Writng back #### VPERMILP_SX $0xb1,xvec15, xvec5; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec6, xvec5, xvec5; ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec14, xvec4; MUL_SX xvec7, xvec14, xvec14; MUL_SX xvec6, xvec4, xvec4; ADDSUB_SX xvec4, xvec14, xvec14; VPERMILP_SX $0xb1,xvec13, xvec3; MUL_SX xvec7, xvec13, xvec13; MUL_SX xvec6, xvec3, xvec3; ADDSUB_SX xvec3, xvec13, xvec13; VPERMILP_SX $0xb1,xvec12, xvec2; MUL_SX xvec7, xvec12, xvec12; MUL_SX xvec6, xvec2, xvec2; ADDSUB_SX xvec2, xvec12, xvec12; VPERMILP_SX $0xb1,xvec11, xvec1; MUL_SX xvec7, xvec11, xvec11; MUL_SX xvec6, xvec1, xvec1; ADDSUB_SX xvec1, xvec11, xvec11; VPERMILP_SX $0xb1,xvec10, xvec0; MUL_SX xvec7, xvec10, xvec10; MUL_SX xvec6, xvec0, xvec0; ADDSUB_SX xvec0, xvec10, xvec10; VPERMILP_SX $0xb1,xvec9, xvec5; MUL_SX xvec7, xvec9, xvec9; MUL_SX xvec6, xvec5, xvec5; ADDSUB_SX xvec5, xvec9, xvec9; VPERMILP_SX $0xb1,xvec8, xvec4; MUL_SX xvec7, xvec8, xvec8; MUL_SX xvec6, xvec4, xvec4; ADDSUB_SX xvec4, xvec8, xvec8; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 2*SIZE(C1), xvec0, xvec0; LDL_SX 4*SIZE(C0), xvec1, xvec1; LDH_SX 6*SIZE(C1), xvec1, xvec1; LDL_SX 8*SIZE(C0), xvec2, xvec2; LDH_SX 10*SIZE(C1), xvec2, xvec2; LDL_SX 12*SIZE(C0), xvec3, xvec3; LDH_SX 14*SIZE(C1), xvec3, xvec3; ADD_SX xvec0, xvec15, xvec15; ADD_SX xvec1, xvec14, xvec14; ADD_SX xvec2, xvec13, xvec13; ADD_SX xvec3, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); STL_SX xvec14, 4*SIZE(C0); STH_SX xvec14, 6*SIZE(C1); STL_SX xvec13, 8*SIZE(C0); STH_SX xvec13, 10*SIZE(C1); STL_SX xvec12, 12*SIZE(C0); STH_SX xvec12, 14*SIZE(C1); #ifndef TRMMKERNEL LDL_SX 0*SIZE(C1), xvec4, xvec4; LDH_SX 2*SIZE(C0), xvec4, xvec4; LDL_SX 4*SIZE(C1), xvec5, xvec5; LDH_SX 6*SIZE(C0), xvec5, xvec5; LDL_SX 8*SIZE(C1), xvec6, xvec6; LDH_SX 10*SIZE(C0), xvec6, xvec6; LDL_SX 12*SIZE(C1), xvec7, xvec7; LDH_SX 14*SIZE(C0), xvec7, xvec7; ADD_SX xvec4, xvec11, xvec11; ADD_SX xvec5, xvec10, xvec10; ADD_SX xvec6, xvec9, xvec9; ADD_SX xvec7, xvec8, xvec8; #endif STL_SX xvec11, 0*SIZE(C1); STH_SX xvec11, 2*SIZE(C0); STL_SX xvec10, 4*SIZE(C1); STH_SX xvec10, 6*SIZE(C0); STL_SX xvec9, 8*SIZE(C1); STH_SX xvec9, 10*SIZE(C0); STL_SX xvec8, 12*SIZE(C1); STH_SX xvec8, 14*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $8, kk; #endif ADDQ $16*SIZE, C0; ADDQ $16*SIZE, C1; DECQ i; JG .L21_bodyB; ALIGN_5 .L21_loopE: TEST $4, bm; JLE .L22_loopE; ALIGN_5 .L22_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; 
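#### 4x2 block of C (.L22_bodyB): xvec15, xvec14, xvec11 and xvec10 are the accumulators; the same EDUP/ODUP duplication of B's real and imaginary parts drives the complex multiply-accumulate ####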
XOR_SY yvec14, yvec14, yvec14; XOR_SY yvec11, yvec11, yvec11; XOR_SY yvec10, yvec10, yvec10; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; #else ADDQ $2, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L221_loopE; ALIGN_5 .L221_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; #### Unroll 2 ##### EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; #### Unroll 3 #### EDUP_SX 8*SIZE(ptrbb), xvec4; ODUP_SX 8*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 16*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 20*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; #### Unroll 4 #### EDUP_SX 12*SIZE(ptrbb), xvec4; ODUP_SX 12*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 24*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 28*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; 
MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; ADDQ $32*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L221_bodyB; ALIGN_5 .L221_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L222_loopE; ALIGN_5 .L222_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; #### Unroll 2 ##### EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L222_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L223_loopE; ALIGN_5 .L223_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L223_loopE: #### Handle #### #if defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY 
yvec7, yvec7, yvec7; SUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; SHUF_SX $0xb1, xvec11, xvec11; SHUF_SX $0xb1, xvec10, xvec10; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; SHUF_SX $0xb1, xvec11, xvec11; SHUF_SX $0xb1, xvec10, xvec10; #endif #### Mulitply Alpha #### BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; #### Writng back #### VPERMILP_SX $0xb1,xvec15, xvec5; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec6, xvec5, xvec5; ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec14, xvec4; MUL_SX xvec7, xvec14, xvec14; MUL_SX xvec6, xvec4, xvec4; ADDSUB_SX xvec4, xvec14, xvec14; VPERMILP_SX $0xb1,xvec11, xvec1; MUL_SX xvec7, xvec11, xvec11; MUL_SX xvec6, xvec1, xvec1; ADDSUB_SX xvec1, xvec11, xvec11; VPERMILP_SX $0xb1,xvec10, xvec0; MUL_SX xvec7, xvec10, xvec10; MUL_SX xvec6, xvec0, xvec0; ADDSUB_SX xvec0, xvec10, xvec10; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 2*SIZE(C1), xvec0, xvec0; LDL_SX 4*SIZE(C0), xvec1, xvec1; LDH_SX 6*SIZE(C1), xvec1, xvec1; ADD_SX xvec0, xvec15, xvec15; ADD_SX xvec1, xvec14, xvec14; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); STL_SX xvec14, 4*SIZE(C0); STH_SX xvec14, 6*SIZE(C1); #ifndef TRMMKERNEL LDL_SX 0*SIZE(C1), xvec4, xvec4; LDH_SX 2*SIZE(C0), xvec4, xvec4; LDL_SX 4*SIZE(C1), xvec5, xvec5; LDH_SX 6*SIZE(C0), xvec5, xvec5; ADD_SX xvec4, xvec11, xvec11; ADD_SX xvec5, xvec10, xvec10; #endif STL_SX xvec11, 0*SIZE(C1); STH_SX xvec11, 2*SIZE(C0); STL_SX xvec10, 4*SIZE(C1); STH_SX xvec10, 6*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk; #endif ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; .L22_loopE: TEST $2, bm; JLE .L23_loopE; ALIGN_5 .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec11, yvec11, yvec11; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; #else ADDQ $2, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L231_loopE; ALIGN_5 .L231_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; #### Unroll 2 ##### EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; 
SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; #### Unroll 3 #### EDUP_SX 8*SIZE(ptrbb), xvec4; ODUP_SX 8*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; #### Unroll 4 #### EDUP_SX 12*SIZE(ptrbb), xvec4; ODUP_SX 12*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; ADDQ $16*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L231_bodyB; ALIGN_5 .L231_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L232_loopE; ALIGN_5 .L232_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; #### Unroll 2 ##### EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L232_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L233_loopE; ALIGN_5 .L233_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L233_loopE: #### Handle #### #if defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec11, xvec7, xvec7; 
MOV_SX xvec7, xvec11; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec11, xvec11; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec11, xvec11; #endif #### Mulitply Alpha #### BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; #### Writng back #### VPERMILP_SX $0xb1,xvec15, xvec5; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec6, xvec5, xvec5; ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec11, xvec1; MUL_SX xvec7, xvec11, xvec11; MUL_SX xvec6, xvec1, xvec1; ADDSUB_SX xvec1, xvec11, xvec11; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 2*SIZE(C1), xvec0, xvec0; ADD_SX xvec0, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); #ifndef TRMMKERNEL LDL_SX 0*SIZE(C1), xvec4, xvec4; LDH_SX 2*SIZE(C0), xvec4, xvec4; ADD_SX xvec4, xvec11, xvec11; #endif STL_SX xvec11, 0*SIZE(C1); STH_SX xvec11, 2*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk; #endif ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; .L23_loopE: TEST $1, bm; JLE .L24_loopE; ALIGN_5 .L24_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; #else ADDQ $2, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L241_loopE; ALIGN_5 .L241_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 3*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; BROAD_SX 4*SIZE(ptrba), xvec0; LD_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 5*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; BROAD_SX 6*SIZE(ptrba), xvec0; LD_SX 12*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 7*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L241_bodyB; ALIGN_5 .L241_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L242_loopE; ALIGN_5 .L242_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX 
xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 3*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L242_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L243_loopE; ALIGN_5 .L243_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L243_loopE: #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(NR) || defined(NC) || defined(TR) || defined(TC) ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; SHUF_SX $0xb1, xvec15, xvec15; #endif ##### Load Alpha #### BROAD_SX MEMALPHA_R,xvec7; BROAD_SX MEMALPHA_I,xvec6; ##### Multiply Alpha #### VPERMILP_SX $0xb1,xvec15, xvec5; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec6, xvec5, xvec5; ADDSUB_SX xvec5, xvec15, xvec15; #### Writing back #### #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 0*SIZE(C1), xvec0, xvec0; ADD_SX xvec0, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 0*SIZE(C1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $1, kk; #endif ADDQ $2*SIZE, C0; ADDQ $2*SIZE, C1; .L24_loopE: #if defined(TRMMKERNEL) && !defined(LEFT) ADDQ $2, kk; #endif MOVQ bk, k; SALQ $4, k; ADDQ k, bb; LEAQ (C, ldc, 2), C; .L20_loopE: TEST $1, bn; JLE .L30_loopE; ALIGN_5 .L30_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; MOVQ %rax, kk; #endif MOVQ C, C0; MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L31_loopE; ALIGN_5 .L31_bodyB: MOVQ bb, ptrbb; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 8), ptrba; ADDQ %rax, ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec14, yvec14, yvec14; MOVQ bk, k; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; #else ADDQ $1, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L311_loopE; ALIGN_5 .L311_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; LD_SY 8*SIZE(ptrba), yvec1; BROAD_SY 0*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; MUL_SY yvec1, yvec2, yvec7; ADD1_SY yvec7, yvec14, yvec14; BROAD_SY 1*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; VPERMILP_SY $0xb1, yvec1, yvec5; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; MUL_SY yvec5, yvec3, yvec7; ADD2_SY yvec7, yvec14, yvec14; #### Unroll 2 #### LD_SY 16*SIZE(ptrba), yvec0; LD_SY 24*SIZE(ptrba), yvec1; BROAD_SY 2*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; MUL_SY yvec1, yvec2, yvec7; ADD1_SY yvec7, yvec14, yvec14; BROAD_SY 3*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; 
VPERMILP_SY $0xb1, yvec1, yvec5; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; MUL_SY yvec5, yvec3, yvec7; ADD2_SY yvec7, yvec14, yvec14; #### Unroll 3 #### LD_SY 32*SIZE(ptrba), yvec0; LD_SY 40*SIZE(ptrba), yvec1; BROAD_SY 4*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; MUL_SY yvec1, yvec2, yvec7; ADD1_SY yvec7, yvec14, yvec14; BROAD_SY 5*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; VPERMILP_SY $0xb1, yvec1, yvec5; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; MUL_SY yvec5, yvec3, yvec7; ADD2_SY yvec7, yvec14, yvec14; #### Unroll 4 #### LD_SY 48*SIZE(ptrba), yvec0; LD_SY 56*SIZE(ptrba), yvec1; BROAD_SY 6*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; MUL_SY yvec1, yvec2, yvec7; ADD1_SY yvec7, yvec14, yvec14; BROAD_SY 7*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; VPERMILP_SY $0xb1, yvec1, yvec5; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; MUL_SY yvec5, yvec3, yvec7; ADD2_SY yvec7, yvec14, yvec14; ADDQ $64*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L311_bodyB; ALIGN_5 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L312_loopE; ALIGN_5 .L312_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; LD_SY 8*SIZE(ptrba), yvec1; BROAD_SY 0*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; MUL_SY yvec1, yvec2, yvec7; ADD1_SY yvec7, yvec14, yvec14; BROAD_SY 1*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; VPERMILP_SY $0xb1, yvec1, yvec5; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; MUL_SY yvec5, yvec3, yvec7; ADD2_SY yvec7, yvec14, yvec14; #### Unroll 2 #### LD_SY 16*SIZE(ptrba), yvec0; LD_SY 24*SIZE(ptrba), yvec1; BROAD_SY 2*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; MUL_SY yvec1, yvec2, yvec7; ADD1_SY yvec7, yvec14, yvec14; BROAD_SY 3*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; VPERMILP_SY $0xb1, yvec1, yvec5; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; MUL_SY yvec5, yvec3, yvec7; ADD2_SY yvec7, yvec14, yvec14; ADDQ $32*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L312_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L313_loopE; ALIGN_5 .L313_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; LD_SY 8*SIZE(ptrba), yvec1; BROAD_SY 0*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; MUL_SY yvec1, yvec2, yvec7; ADD1_SY yvec7, yvec14, yvec14; BROAD_SY 1*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; VPERMILP_SY $0xb1, yvec1, yvec5; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; MUL_SY yvec5, yvec3, yvec7; ADD2_SY yvec7, yvec14, yvec14; ADDQ $16*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L313_loopE: #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ADDSUB_SY yvec15, yvec7, yvec15; ADDSUB_SY yvec14, yvec7, yvec14; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_SY yvec15, yvec7, yvec15; SUB_SY yvec14, yvec7, yvec14; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_SY $0xb1, yvec15, yvec15; VPERMILP_SY $0xb1, yvec14, yvec14; ADDSUB_SY yvec15, yvec7, yvec15; ADDSUB_SY yvec14, yvec7, yvec14; VPERMILP_SY $0xb1, yvec15, yvec15; VPERMILP_SY $0xb1, yvec14, yvec14; #endif ##### Load Alpha #### BROAD_SY MEMALPHA_R,yvec7; BROAD_SY MEMALPHA_I,yvec6; ##### Multiply Alpha #### VPERMILP_SY $0xb1,yvec15, yvec5; MUL_SY yvec15, yvec7, yvec15; MUL_SY yvec5, yvec6, yvec5; ADDSUB_SY yvec5, yvec15, yvec15; VPERMILP_SY 
$0xb1,yvec14, yvec4; MUL_SY yvec14, yvec7, yvec14; MUL_SY yvec4, yvec6, yvec4; ADDSUB_SY yvec4, yvec14, yvec14; #### Writing back #### EXTRA_SY $1, yvec15, xvec7; EXTRA_SY $1, yvec14, xvec6; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 2*SIZE(C0), xvec0, xvec0; LDL_SX 4*SIZE(C0), xvec1, xvec1; LDH_SX 6*SIZE(C0), xvec1, xvec1; LDL_SX 8*SIZE(C0), xvec2, xvec2; LDH_SX 10*SIZE(C0), xvec2, xvec2; LDL_SX 12*SIZE(C0), xvec3, xvec3; LDH_SX 14*SIZE(C0), xvec3, xvec3; ADD_SX xvec0, xvec15, xvec15; ADD_SX xvec1, xvec7, xvec7; ADD_SX xvec2, xvec14, xvec14; ADD_SX xvec3, xvec6, xvec6; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); STL_SX xvec7, 4*SIZE(C0); STH_SX xvec7, 6*SIZE(C0); STL_SX xvec14, 8*SIZE(C0); STH_SX xvec14, 10*SIZE(C0); STL_SX xvec6, 12*SIZE(C0); STH_SX xvec6, 14*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 8), ptrba; ADDQ %rax, ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $8, kk; #endif ADDQ $16*SIZE, C0; DECQ i; JG .L31_bodyB; ALIGN_5 .L31_loopE: TEST $4, bm; JLE .L32_loopE; ALIGN_5 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; ADDQ %rax, ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; #else ADDQ $1, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L321_loopE; ALIGN_5 .L321_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; BROAD_SY 0*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; BROAD_SY 1*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; #### Unroll 2 #### LD_SY 8*SIZE(ptrba), yvec0; BROAD_SY 2*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; BROAD_SY 3*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; #### Unroll 3 #### LD_SY 16*SIZE(ptrba), yvec0; BROAD_SY 4*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; BROAD_SY 5*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; #### Unroll 4 #### LD_SY 24*SIZE(ptrba), yvec0; BROAD_SY 6*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; BROAD_SY 7*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L321_bodyB; ALIGN_5 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L322_loopE; ALIGN_5 .L322_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; BROAD_SY 0*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; BROAD_SY 1*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; #### Unroll 2 #### LD_SY 8*SIZE(ptrba), yvec0; BROAD_SY 2*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; BROAD_SY 3*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, 
yvec15, yvec15; ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L322_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L323_loopE; ALIGN_5 .L323_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; BROAD_SY 0*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; BROAD_SY 1*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; ADDQ $8*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L323_loopE: #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ADDSUB_SY yvec15, yvec7, yvec15; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_SY yvec15, yvec7, yvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_SY $0xb1, yvec15, yvec15; ADDSUB_SY yvec15, yvec7, yvec15; VPERMILP_SY $0xb1, yvec15, yvec15; #endif ##### Load Alpha #### BROAD_SY MEMALPHA_R,yvec7; BROAD_SY MEMALPHA_I,yvec6; ##### Multiply Alpha #### VPERMILP_SY $0xb1,yvec15, yvec5; MUL_SY yvec15, yvec7, yvec15; MUL_SY yvec5, yvec6, yvec5; ADDSUB_SY yvec5, yvec15, yvec15; #### Writing back #### EXTRA_SY $1, yvec15, xvec7; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 2*SIZE(C0), xvec0, xvec0; LDL_SX 4*SIZE(C0), xvec1, xvec1; LDH_SX 6*SIZE(C0), xvec1, xvec1; ADD_SX xvec0, xvec15, xvec15; ADD_SX xvec1, xvec7, xvec7; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); STL_SX xvec7, 4*SIZE(C0); STH_SX xvec7, 6*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; ADDQ %rax, ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk; #endif ADDQ $8*SIZE, C0; .L32_loopE: TEST $2, bm; JLE .L33_loopE; ALIGN_5 .L33_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; ADDQ %rax, ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; #else ADDQ $1, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L331_loopE; ALIGN_5 .L331_bodyB: #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 1*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; #### Unroll 2 #### LD_SX 4*SIZE(ptrba), xvec0; BROAD_SX 2*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 3*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; #### Unroll 3 #### LD_SX 8*SIZE(ptrba), xvec0; BROAD_SX 4*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 5*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; #### Unroll 4 #### LD_SX 12*SIZE(ptrba), xvec0; BROAD_SX 6*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 7*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L331_bodyB; 
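/*
 * The k loop above (.L331_bodyB) is unrolled four times. Each step performs
 * one complex multiply-accumulate of two elements of A against one element
 * of B: BROAD_SX splats the real part of B and the product is folded in with
 * ADD1_SX, then the imaginary part of B is splatted and multiplied against A
 * with its real/imaginary lanes swapped (SHUF $0xb1) and folded in with
 * ADD2_SX. (ADD1_SX/ADD2_SX are macros defined elsewhere in the tree; their
 * exact expansion depends on the conjugation variant.) The leftover k
 * iterations, k & 2 and k & 1, are handled below with the same pattern.
 */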
ALIGN_5 .L331_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L332_loopE; ALIGN_5 .L332_bodyB: #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 1*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; #### Unroll 2 #### LD_SX 4*SIZE(ptrba), xvec0; BROAD_SX 2*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 3*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L332_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L333_loopE; ALIGN_5 .L333_bodyB: #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 1*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L333_loopE: #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; SHUF_SX $0xb1, xvec15, xvec15; #endif #### Mulitply Alpha #### BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; #### Writng back #### VPERMILP_SX $0xb1,xvec15, xvec5; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec6, xvec5, xvec5; ADDSUB_SX xvec5, xvec15, xvec15; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 2*SIZE(C0), xvec0, xvec0; ADD_SX xvec0, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; ADDQ %rax, ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk; #endif ADDQ $4*SIZE, C0; .L33_loopE: TEST $1, bm; JLE .L34_loopE; ALIGN_5 .L34_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; ADDQ %rax, ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; #else ADDQ $1, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L341_loopE; ALIGN_5 .L341_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xa0, xvec2, xvec3; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; SHUF_SX $0xf5, xvec2, xvec4; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, xvec15, xvec15; LD_SX 4*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0xa0, xvec2, xvec3; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; SHUF_SX $0xf5, xvec2, xvec4; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L341_bodyB; ALIGN_5 .L341_loopE: #ifndef TRMMKERNEL TEST $2, 
bk; #else TEST $2, kkk; #endif JLE .L342_loopE; ALIGN_5 .L342_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xa0, xvec2, xvec3; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; SHUF_SX $0xf5, xvec2, xvec4; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L342_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L343_loopE; ALIGN_5 .L343_bodyB: XOR_SY yvec0, yvec0, yvec0; XOR_SY yvec2, yvec2, yvec2; LDL_SX 0*SIZE(ptrba), xvec0, xvec0; LDL_SX 0*SIZE(ptrbb), xvec2, xvec2; SHUF_SX $0xe0, xvec2, xvec3; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec15, xvec15; SHUF_SX $0xe1, xvec0, xvec1; SHUF_SX $0xe5, xvec2, xvec4; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L343_loopE: #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; SHUF_SX $0xb1, xvec15, xvec15; #endif BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; VPERMILP_SX $0xb1, xvec15, xvec5; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec6, xvec5, xvec5; ADDSUB_SX xvec5, xvec15, xvec15; SHUF_SX $0x44, xvec15, xvec14; SHUF_SX $0xee, xvec15, xvec13; ADD_SX xvec13, xvec14, xvec14; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; ADD_SX xvec0, xvec14, xvec14; #endif STL_SX xvec14, 0*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; ADDQ %rax, ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $1, kk; #endif ADDQ $2*SIZE, C0; .L34_loopE: #if defined(TRMMKERNEL) && !defined(LEFT) ADDQ $1, kk; #endif MOVQ bk, k; SALQ $3, k; ADDQ k, bb; ADDQ ldc, C; .L30_loopE: movq 0(%rsp), %rbx; movq 8(%rsp), %rbp; movq 16(%rsp), %r12; movq 24(%rsp), %r13; movq 32(%rsp), %r14; movq 40(%rsp), %r15; vzeroupper #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp; ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/cgemm_kernel_8x2_haswell.S000066400000000000000000003161131313527062700223630ustar00rootroot00000000000000/********************************************************************************* Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ /********************************************************************* * 2014/07/29 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * 2013/10/28 Saar * Parameter: * CGEMM_DEFAULT_UNROLL_N 2 * CGEMM_DEFAULT_UNROLL_M 8 * CGEMM_DEFAULT_P 384 * CGEMM_DEFAULT_Q 192 * A_PR1 512 * B_PR1 512 * * 2014/07/29 Saar * Performance at 6912x6912x6912: * 1 thread: 107 GFLOPS (SANDYBRIDGE: 60) (MKL: 86) * 2 threads: 208 GFLOPS (SANDYBRIDGE: 114) (MKL: 155) * 3 threads: 289 GFLOPS (SANDYBRIDGE: 162) (MKL: 222) * 4 threads: 377 GFLOPS (SANDYBRIDGE: 223) (MKL: 279) * * *********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define J %r14 #define OLD_K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define K %r12 #define BI %rbp #define SP %rbx #define BO1 %rdi #define BO2 %rbp #ifndef WINDOWS_ABI #define STACKSIZE 96 #else #define STACKSIZE 320 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define L_BUFFER_SIZE 8192 #define Ndiv6 24(%rsp) #define Nmod6 32(%rsp) #define N 40(%rsp) #define ALPHA_R 48(%rsp) #define ALPHA_I 56(%rsp) #define OFFSET 64(%rsp) #define KK 72(%rsp) #define KKK 80(%rsp) #define BUFFER1 128(%rsp) #if defined(OS_WINDOWS) #if L_BUFFER_SIZE > 16384 #define STACK_TOUCH \ movl $ 0, 4096 * 4(%rsp);\ movl $ 0, 4096 * 3(%rsp);\ movl $ 0, 4096 * 2(%rsp);\ movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 12288 #define STACK_TOUCH \ movl $ 0, 4096 * 3(%rsp);\ movl $ 0, 4096 * 2(%rsp);\ movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 8192 #define STACK_TOUCH \ movl $ 0, 4096 * 2(%rsp);\ movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 4096 #define STACK_TOUCH \ movl $ 0, 4096 * 1(%rsp); #else #define STACK_TOUCH #endif #else #define STACK_TOUCH #endif #if defined(BULLDOZER) #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 #define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 #define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 #define 
VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 #else #define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 #define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 #endif #else #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0 #define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 #define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0 #define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 #else #define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 #define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 #endif #endif #define A_PR1 512 #define B_PR1 512 /***************************************************************************************************************************/ .macro KERNEL8x3_SUB vmovups -16 * SIZE(AO), %ymm0 vmovups -8 * SIZE(AO), %ymm1 vbroadcastss -8 * SIZE(BO), %ymm2 vbroadcastss -7 * SIZE(BO), %ymm3 prefetcht0 A_PR1(AO) VFMADDPS_R( %ymm8 ,%ymm2,%ymm0 ) VFMADDPS_R( %ymm12,%ymm2,%ymm1 ) VFMADDPS_I( %ymm9 ,%ymm3,%ymm0 ) VFMADDPS_I( %ymm13,%ymm3,%ymm1 ) vbroadcastss -6 * SIZE(BO), %ymm2 vbroadcastss -5 * SIZE(BO), %ymm3 VFMADDPS_R( %ymm10,%ymm2,%ymm0 ) VFMADDPS_R( %ymm14,%ymm2,%ymm1 ) VFMADDPS_I( %ymm11,%ymm3,%ymm0 ) VFMADDPS_I( %ymm15,%ymm3,%ymm1 ) vbroadcastss -4 * SIZE(BO), %ymm2 vbroadcastss -3 * SIZE(BO), %ymm3 VFMADDPS_R( %ymm4 ,%ymm2,%ymm0 ) VFMADDPS_R( %ymm6 ,%ymm2,%ymm1 ) VFMADDPS_I( %ymm5 ,%ymm3,%ymm0 ) VFMADDPS_I( %ymm7 ,%ymm3,%ymm1 ) addq $ 6*SIZE, BO addq $ 16*SIZE, AO decq %rax .endm .macro SAVE8x3 vbroadcastss ALPHA_R, %ymm0 vbroadcastss ALPHA_I, %ymm1 // swap high and low 64 bytes vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9 vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5 vshufps $ 0xb1, %ymm7 , %ymm7 , %ymm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %ymm9, %ymm8 , %ymm8 vaddsubps %ymm11,%ymm10, %ymm10 vaddsubps %ymm13,%ymm12, %ymm12 vaddsubps %ymm15,%ymm14, %ymm14 vaddsubps %ymm5, %ymm4 , %ymm4 vaddsubps %ymm7, %ymm6 , %ymm6 vshufps $ 0xb1, %ymm8 , %ymm8 , %ymm9 vshufps $ 0xb1, %ymm10, %ymm10, %ymm11 vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 vshufps $ 0xb1, %ymm14, %ymm14, %ymm15 vshufps $ 0xb1, %ymm4 , %ymm4 , %ymm5 vshufps $ 0xb1, %ymm6 , %ymm6 , %ymm7 #else vaddsubps %ymm8, %ymm9 ,%ymm9 vaddsubps %ymm10, %ymm11,%ymm11 vaddsubps %ymm12, %ymm13,%ymm13 vaddsubps %ymm14, %ymm15,%ymm15 vaddsubps %ymm4, %ymm5 ,%ymm5 vaddsubps %ymm6, %ymm7 ,%ymm7 vmovaps %ymm9, %ymm8 vmovaps %ymm11, %ymm10 vmovaps %ymm13, %ymm12 vmovaps %ymm15, %ymm14 vmovaps %ymm5, %ymm4 vmovaps %ymm7, %ymm6 // swap high and low 64 bytes vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9 vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5 vshufps $ 0xb1, %ymm7 , %ymm7 , %ymm7 #endif // multiply with ALPHA_R vmulps %ymm8 , %ymm0, %ymm8 vmulps %ymm10, %ymm0, %ymm10 vmulps %ymm12, %ymm0, %ymm12 vmulps %ymm14, %ymm0, %ymm14 vmulps %ymm4 , %ymm0, %ymm4 vmulps %ymm6 , %ymm0, %ymm6 // multiply with ALPHA_I vmulps %ymm9 , %ymm1, %ymm9 vmulps %ymm11, %ymm1, %ymm11 vmulps %ymm13, %ymm1, %ymm13 vmulps %ymm15, %ymm1, %ymm15 vmulps %ymm5 , %ymm1, %ymm5 vmulps 
%ymm7 , %ymm1, %ymm7 vaddsubps %ymm9, %ymm8 , %ymm8 vaddsubps %ymm11,%ymm10, %ymm10 vaddsubps %ymm13,%ymm12, %ymm12 vaddsubps %ymm15,%ymm14, %ymm14 vaddsubps %ymm5, %ymm4 , %ymm4 vaddsubps %ymm7, %ymm6 , %ymm6 #if !defined(TRMMKERNEL) vaddps (CO1), %ymm8 , %ymm8 vaddps 8 * SIZE(CO1), %ymm12, %ymm12 vaddps (CO1, LDC), %ymm10, %ymm10 vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14 vaddps (CO1, LDC,2), %ymm4, %ymm4 vaddps 8 * SIZE(CO1, LDC,2), %ymm6, %ymm6 #endif vmovups %ymm8 , (CO1) vmovups %ymm12 , 8 * SIZE(CO1) vmovups %ymm10 , (CO1, LDC) vmovups %ymm14 , 8 * SIZE(CO1, LDC) vmovups %ymm4 , (CO1, LDC,2) vmovups %ymm6 , 8 * SIZE(CO1, LDC,2) .endm /***************************************************************************************************************************/ .macro KERNEL4x3_SUB vmovups -16 * SIZE(AO), %ymm0 vbroadcastss -8 * SIZE(BO), %ymm2 vbroadcastss -7 * SIZE(BO), %ymm3 VFMADDPS_R( %ymm8 ,%ymm2,%ymm0 ) VFMADDPS_I( %ymm9 ,%ymm3,%ymm0 ) vbroadcastss -6 * SIZE(BO), %ymm2 vbroadcastss -5 * SIZE(BO), %ymm3 VFMADDPS_R( %ymm12,%ymm2,%ymm0 ) VFMADDPS_I( %ymm13,%ymm3,%ymm0 ) vbroadcastss -4 * SIZE(BO), %ymm2 vbroadcastss -3 * SIZE(BO), %ymm3 VFMADDPS_R( %ymm4 ,%ymm2,%ymm0 ) VFMADDPS_I( %ymm5 ,%ymm3,%ymm0 ) addq $ 6*SIZE, BO addq $ 8*SIZE, AO decq %rax .endm .macro SAVE4x3 vbroadcastss ALPHA_R, %ymm0 vbroadcastss ALPHA_I, %ymm1 // swap high and low 64 bytes vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9 vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %ymm9, %ymm8 , %ymm8 vaddsubps %ymm13,%ymm12, %ymm12 vaddsubps %ymm5, %ymm4 , %ymm4 vshufps $ 0xb1, %ymm8 , %ymm8 , %ymm9 vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 vshufps $ 0xb1, %ymm4 , %ymm4 , %ymm5 #else vaddsubps %ymm8, %ymm9 ,%ymm9 vaddsubps %ymm12, %ymm13,%ymm13 vaddsubps %ymm4, %ymm5 ,%ymm5 vmovaps %ymm9, %ymm8 vmovaps %ymm13, %ymm12 vmovaps %ymm5, %ymm4 // swap high and low 64 bytes vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9 vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5 #endif // multiply with ALPHA_R vmulps %ymm8 , %ymm0, %ymm8 vmulps %ymm12, %ymm0, %ymm12 vmulps %ymm4 , %ymm0, %ymm4 // multiply with ALPHA_I vmulps %ymm9 , %ymm1, %ymm9 vmulps %ymm13, %ymm1, %ymm13 vmulps %ymm5 , %ymm1, %ymm5 vaddsubps %ymm9, %ymm8 , %ymm8 vaddsubps %ymm13,%ymm12, %ymm12 vaddsubps %ymm5, %ymm4 , %ymm4 #if !defined(TRMMKERNEL) vaddps (CO1), %ymm8 , %ymm8 vaddps (CO1, LDC), %ymm12, %ymm12 vaddps (CO1, LDC,2), %ymm4, %ymm4 #endif vmovups %ymm8 , (CO1) vmovups %ymm12 , (CO1, LDC) vmovups %ymm4 , (CO1, LDC,2) .endm /***************************************************************************************************************************/ .macro KERNEL2x3_SUB vmovups -16 * SIZE(AO), %xmm0 vbroadcastss -8 * SIZE(BO), %xmm2 vbroadcastss -7 * SIZE(BO), %xmm3 VFMADDPS_R( %xmm8 ,%xmm2,%xmm0 ) VFMADDPS_I( %xmm9 ,%xmm3,%xmm0 ) vbroadcastss -6 * SIZE(BO), %xmm2 vbroadcastss -5 * SIZE(BO), %xmm3 VFMADDPS_R( %xmm12,%xmm2,%xmm0 ) VFMADDPS_I( %xmm13,%xmm3,%xmm0 ) vbroadcastss -4 * SIZE(BO), %xmm2 vbroadcastss -3 * SIZE(BO), %xmm3 VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 ) VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 ) addq $ 6*SIZE, BO addq $ 4*SIZE, AO decq %rax .endm .macro SAVE2x3 vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9 vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5 #if defined(NN) || defined(NT) || 
defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm13,%xmm12, %xmm12 vaddsubps %xmm5, %xmm4 , %xmm4 vshufps $ 0xb1, %xmm8 , %xmm8 , %xmm9 vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 vshufps $ 0xb1, %xmm4 , %xmm4 , %xmm5 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vaddsubps %xmm12, %xmm13,%xmm13 vaddsubps %xmm4, %xmm5 ,%xmm5 vmovaps %xmm9, %xmm8 vmovaps %xmm13, %xmm12 vmovaps %xmm5, %xmm4 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9 vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 vmulps %xmm12, %xmm0, %xmm12 vmulps %xmm4 , %xmm0, %xmm4 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vmulps %xmm13, %xmm1, %xmm13 vmulps %xmm5 , %xmm1, %xmm5 vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm13,%xmm12, %xmm12 vaddsubps %xmm5, %xmm4 , %xmm4 #if !defined(TRMMKERNEL) vaddps (CO1), %xmm8 , %xmm8 vaddps (CO1, LDC), %xmm12, %xmm12 vaddps (CO1, LDC,2), %xmm4, %xmm4 #endif vmovups %xmm8 , (CO1) vmovups %xmm12 , (CO1, LDC) vmovups %xmm4 , (CO1, LDC,2) .endm /***************************************************************************************************************************/ .macro KERNEL1x3_SUB vmovsd -16 * SIZE(AO), %xmm0 vbroadcastss -8 * SIZE(BO), %xmm2 vbroadcastss -7 * SIZE(BO), %xmm3 VFMADDPS_R( %xmm8 ,%xmm2,%xmm0 ) VFMADDPS_I( %xmm9 ,%xmm3,%xmm0 ) vbroadcastss -6 * SIZE(BO), %xmm2 vbroadcastss -5 * SIZE(BO), %xmm3 VFMADDPS_R( %xmm12,%xmm2,%xmm0 ) VFMADDPS_I( %xmm13,%xmm3,%xmm0 ) vbroadcastss -4 * SIZE(BO), %xmm2 vbroadcastss -3 * SIZE(BO), %xmm3 VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 ) VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 ) addq $ 6*SIZE, BO addq $ 2*SIZE, AO decq %rax .endm .macro SAVE1x3 vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9 vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm13,%xmm12, %xmm12 vaddsubps %xmm5, %xmm4 , %xmm4 vshufps $ 0xb1, %xmm8 , %xmm8 , %xmm9 vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 vshufps $ 0xb1, %xmm4 , %xmm4 , %xmm5 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vaddsubps %xmm12, %xmm13,%xmm13 vaddsubps %xmm4, %xmm5 ,%xmm5 vmovaps %xmm9, %xmm8 vmovaps %xmm13, %xmm12 vmovaps %xmm5, %xmm4 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9 vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 vmulps %xmm12, %xmm0, %xmm12 vmulps %xmm4 , %xmm0, %xmm4 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vmulps %xmm13, %xmm1, %xmm13 vmulps %xmm5 , %xmm1, %xmm5 vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm13,%xmm12, %xmm12 vaddsubps %xmm5, %xmm4 , %xmm4 #if !defined(TRMMKERNEL) vmovsd (CO1) , %xmm9 vmovsd (CO1,LDC) , %xmm13 vmovsd (CO1,LDC,2), %xmm5 vaddps %xmm9 , %xmm8 , %xmm8 vaddps %xmm13, %xmm12, %xmm12 vaddps %xmm5 , %xmm4, %xmm4 #endif vmovsd %xmm8 , (CO1) vmovsd %xmm12 , (CO1, LDC) vmovsd %xmm4 , (CO1, LDC,2) .endm /***************************************************************************************************************************/ .macro KERNEL8x2_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 VFMADDPS_R( %ymm8,%ymm4,%ymm0 ) vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 VFMADDPS_R( 
%ymm12,%ymm4,%ymm1 ) vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 VFMADDPS_I( %ymm9,%ymm5,%ymm0 ) VFMADDPS_I( %ymm13,%ymm5,%ymm1 ) vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 VFMADDPS_R( %ymm10,%ymm6,%ymm0 ) VFMADDPS_R( %ymm14,%ymm6,%ymm1 ) vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 VFMADDPS_I( %ymm11,%ymm7,%ymm0 ) VFMADDPS_I( %ymm15,%ymm7,%ymm1 ) addq $ 4 , BI addq $ 16, %rax .endm .macro SAVE8x2 vbroadcastss ALPHA_R, %ymm0 vbroadcastss ALPHA_I, %ymm1 // swap high and low 64 bytes vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %ymm9, %ymm8 , %ymm8 vaddsubps %ymm11,%ymm10, %ymm10 vaddsubps %ymm13,%ymm12, %ymm12 vaddsubps %ymm15,%ymm14, %ymm14 vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 vshufps $ 0xb1, %ymm10, %ymm10, %ymm11 vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 vshufps $ 0xb1, %ymm14, %ymm14, %ymm15 #else vaddsubps %ymm8, %ymm9 ,%ymm9 vaddsubps %ymm10, %ymm11,%ymm11 vaddsubps %ymm12, %ymm13,%ymm13 vaddsubps %ymm14, %ymm15,%ymm15 vmovaps %ymm9, %ymm8 vmovaps %ymm11, %ymm10 vmovaps %ymm13, %ymm12 vmovaps %ymm15, %ymm14 // swap high and low 64 bytes vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 #endif // multiply with ALPHA_R vmulps %ymm8 , %ymm0, %ymm8 vmulps %ymm10, %ymm0, %ymm10 vmulps %ymm12, %ymm0, %ymm12 vmulps %ymm14, %ymm0, %ymm14 // multiply with ALPHA_I vmulps %ymm9 , %ymm1, %ymm9 vmulps %ymm11, %ymm1, %ymm11 vmulps %ymm13, %ymm1, %ymm13 vmulps %ymm15, %ymm1, %ymm15 vaddsubps %ymm9, %ymm8 , %ymm8 vaddsubps %ymm11,%ymm10, %ymm10 vaddsubps %ymm13,%ymm12, %ymm12 vaddsubps %ymm15,%ymm14, %ymm14 #if !defined(TRMMKERNEL) vaddps (CO1), %ymm8 , %ymm8 vaddps 8 * SIZE(CO1), %ymm12, %ymm12 vaddps (CO1, LDC), %ymm10, %ymm10 vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14 #endif vmovups %ymm8 , (CO1) vmovups %ymm12 , 8 * SIZE(CO1) vmovups %ymm10 , (CO1, LDC) vmovups %ymm14 , 8 * SIZE(CO1, LDC) prefetcht0 64(CO1) prefetcht0 64(CO1, LDC) .endm /***************************************************************************************************************************/ .macro KERNEL4x2_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) VFMADDPS_R( %xmm14,%xmm6,%xmm1 ) vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) VFMADDPS_I( %xmm15,%xmm7,%xmm1 ) addq $ 4, BI addq $ 8, %rax .endm .macro SAVE4x2 vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 vaddsubps %xmm13,%xmm12, %xmm12 vaddsubps %xmm15,%xmm14, %xmm14 vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 vshufps $ 0xb1, %xmm14, 
%xmm14, %xmm15 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vaddsubps %xmm10, %xmm11,%xmm11 vaddsubps %xmm12, %xmm13,%xmm13 vaddsubps %xmm14, %xmm15,%xmm15 vmovaps %xmm9, %xmm8 vmovaps %xmm11, %xmm10 vmovaps %xmm13, %xmm12 vmovaps %xmm15, %xmm14 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 vmulps %xmm10, %xmm0, %xmm10 vmulps %xmm12, %xmm0, %xmm12 vmulps %xmm14, %xmm0, %xmm14 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vmulps %xmm11, %xmm1, %xmm11 vmulps %xmm13, %xmm1, %xmm13 vmulps %xmm15, %xmm1, %xmm15 vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 vaddsubps %xmm13,%xmm12, %xmm12 vaddsubps %xmm15,%xmm14, %xmm14 #if !defined(TRMMKERNEL) vaddps (CO1), %xmm8 , %xmm8 vaddps 4 * SIZE(CO1), %xmm12, %xmm12 vaddps (CO1, LDC), %xmm10, %xmm10 vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 #endif vmovups %xmm8 , (CO1) vmovups %xmm12 , 4 * SIZE(CO1) vmovups %xmm10 , (CO1, LDC) vmovups %xmm14 , 4 * SIZE(CO1, LDC) .endm /************************************************************************************************/ .macro KERNEL2x2_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) addq $ 4, BI addq $ 4, %rax .endm .macro SAVE2x2 vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 4 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vaddsubps %xmm10, %xmm11,%xmm11 vmovaps %xmm9, %xmm8 vmovaps %xmm11, %xmm10 // swap high and low 4 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 vmulps %xmm10, %xmm0, %xmm10 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vmulps %xmm11, %xmm1, %xmm11 vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 #if !defined(TRMMKERNEL) vaddps (CO1), %xmm8 , %xmm8 vaddps (CO1, LDC), %xmm10, %xmm10 #endif vmovups %xmm8 , (CO1) vmovups %xmm10 , (CO1, LDC) .endm /************************************************************************************************/ .macro KERNEL1x2_SUB vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) addq $ 4, BI addq $ 2, %rax .endm .macro SAVE1x2 vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 vshufps $ 0xb1, 
%xmm8 , %xmm8, %xmm9 vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vaddsubps %xmm10, %xmm11,%xmm11 vmovaps %xmm9, %xmm8 vmovaps %xmm11, %xmm10 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 vmulps %xmm10, %xmm0, %xmm10 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vmulps %xmm11, %xmm1, %xmm11 vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 #if !defined(TRMMKERNEL) vmovsd (CO1), %xmm14 vaddps %xmm14, %xmm8 , %xmm8 vmovsd (CO1, LDC), %xmm15 vaddps %xmm15, %xmm10, %xmm10 #endif vmovsd %xmm8 , (CO1) vmovsd %xmm10 , (CO1, LDC) .endm /************************************************************************************************/ .macro KERNEL8x1_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 VFMADDPS_R( %ymm8,%ymm4,%ymm0 ) VFMADDPS_R( %ymm12,%ymm4,%ymm1 ) vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 VFMADDPS_I( %ymm9,%ymm5,%ymm0 ) VFMADDPS_I( %ymm13,%ymm5,%ymm1 ) addq $ 2 , BI addq $ 16, %rax .endm .macro SAVE8x1 vbroadcastss ALPHA_R, %ymm0 vbroadcastss ALPHA_I, %ymm1 // swap high and low 64 bytes vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %ymm9, %ymm8 , %ymm8 vaddsubps %ymm13,%ymm12, %ymm12 vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 #else vaddsubps %ymm8, %ymm9 ,%ymm9 vaddsubps %ymm12, %ymm13,%ymm13 vmovaps %ymm9, %ymm8 vmovaps %ymm13, %ymm12 // swap high and low 64 bytes vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 #endif // multiply with ALPHA_R vmulps %ymm8 , %ymm0, %ymm8 vmulps %ymm12, %ymm0, %ymm12 // multiply with ALPHA_I vmulps %ymm9 , %ymm1, %ymm9 vmulps %ymm13, %ymm1, %ymm13 vaddsubps %ymm9, %ymm8 , %ymm8 vaddsubps %ymm13,%ymm12, %ymm12 #if !defined(TRMMKERNEL) vaddps (CO1), %ymm8 , %ymm8 vaddps 8 * SIZE(CO1), %ymm12, %ymm12 #endif vmovups %ymm8 , (CO1) vmovups %ymm12 , 8 * SIZE(CO1) .endm /************************************************************************************************/ .macro KERNEL4x1_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) addq $ 2, BI addq $ 8, %rax .endm .macro SAVE4x1 vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 4 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm13,%xmm12, %xmm12 vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vaddsubps %xmm12, %xmm13,%xmm13 vmovaps %xmm9, %xmm8 vmovaps %xmm13, %xmm12 // swap high and low 4 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 vmulps %xmm12, %xmm0, %xmm12 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vmulps %xmm13, %xmm1, %xmm13 vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm13,%xmm12, %xmm12 
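/*
 * Write-back: in the plain GEMM build the current contents of C are loaded
 * and accumulated below (C += alpha * A*B), while the TRMM build stores the
 * alpha-scaled product directly without reading C first.
 */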
#ifndef TRMMKERNEL vaddps (CO1), %xmm8 , %xmm8 vaddps 4 * SIZE(CO1), %xmm12, %xmm12 #endif vmovups %xmm8 , (CO1) vmovups %xmm12 , 4 * SIZE(CO1) .endm /************************************************************************************************/ .macro KERNEL2x1_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) addq $ 2, BI addq $ 4, %rax .endm .macro SAVE2x1 vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vmovaps %xmm9, %xmm8 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vaddsubps %xmm9, %xmm8 , %xmm8 #if !defined(TRMMKERNEL) vaddps (CO1), %xmm8 , %xmm8 #endif vmovups %xmm8 , (CO1) .endm /************************************************************************************************/ .macro KERNEL1x1_SUB vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) addq $ 2, BI addq $ 2, %rax .endm .macro SAVE1x1 vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vmovaps %xmm9, %xmm8 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vaddsubps %xmm9, %xmm8 , %xmm8 #if !defined(TRMMKERNEL) vmovsd (CO1), %xmm14 vaddps %xmm14, %xmm8 , %xmm8 #endif vmovsd %xmm8 , (CO1) .endm #if !defined(TRMMKERNEL) PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) vmovups %xmm6, 64(%rsp) vmovups %xmm7, 80(%rsp) vmovups %xmm8, 96(%rsp) vmovups %xmm9, 112(%rsp) vmovups %xmm10, 128(%rsp) vmovups %xmm11, 144(%rsp) vmovups %xmm12, 160(%rsp) vmovups %xmm13, 176(%rsp) vmovups %xmm14, 192(%rsp) vmovups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC vmovaps %xmm3, %xmm0 vmovsd OLD_ALPHA_I, %xmm1 #else movq STACKSIZE + 8(%rsp), LDC #endif movq %rsp, SP # save old stack subq $ 128 + L_BUFFER_SIZE, %rsp andq $ -4096, %rsp # align stack STACK_TOUCH cmpq $ 0, OLD_M je .L999 cmpq $ 0, OLD_N je .L999 cmpq $ 0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovss %xmm0, ALPHA_R vmovss %xmm1, ALPHA_I salq $ ZBASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $ 6, %rdi divq %rdi // N / 6 movq %rax, Ndiv6 // N / 6 movq %rdx, Nmod6 // N % 6 /************************************************************************************************/ .L6_0: movq Ndiv6, J cmpq $ 0, J je .L2_00 ALIGN_4 .L6_01: // copy to sub buffer movq B, BO1 leaq 
BUFFER1, BO // first buffer to BO movq K, %rax salq $2, %rax // 2 * COMPSIZE leaq (B, %rax,4), BO2 movq BO2, B // next offset of B movq K, %rax ALIGN_4 .L6_02b: vmovups (BO1), %xmm0 vmovsd (BO2), %xmm1 vmovups %xmm0, (BO) vmovsd %xmm1, 4*SIZE(BO) addq $ 4*SIZE,BO1 addq $ 4*SIZE,BO2 addq $ 6*SIZE,BO decq %rax jnz .L6_02b .L6_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc leaq (C, LDC, 1), C // c += 1 * ldc movq A, AO // aoffset = a addq $ 16 * SIZE, AO movq M, I sarq $ 3, I // i = (m >> 3) je .L6_4_10 ALIGN_4 /**********************************************************************************************************/ .L6_8_11: leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO vzeroall movq K, %rax andq $ -8, %rax // K = K - ( K % 8 ) je .L6_8_16 ALIGN_4 .L6_8_12: KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB je .L6_8_16 KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB je .L6_8_16 jmp .L6_8_12 ALIGN_4 .L6_8_16: movq K, %rax andq $ 7, %rax # if (k & 1) je .L6_8_19 ALIGN_4 .L6_8_17: KERNEL8x3_SUB jnz .L6_8_17 ALIGN_4 .L6_8_19: SAVE8x3 addq $ 16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L6_8_11 ALIGN_4 /**********************************************************************************************************/ .L6_4_10: testq $ 7, M jz .L6_4_60 // to next 2 lines of N testq $ 4, M jz .L6_4_20 ALIGN_4 .L6_4_11: leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO vzeroall movq K, %rax andq $ -8, %rax // K = K - ( K % 8 ) je .L6_4_16 ALIGN_4 .L6_4_12: prefetcht0 A_PR1(AO) KERNEL4x3_SUB KERNEL4x3_SUB prefetcht0 A_PR1(AO) KERNEL4x3_SUB KERNEL4x3_SUB prefetcht0 A_PR1(AO) KERNEL4x3_SUB KERNEL4x3_SUB prefetcht0 A_PR1(AO) KERNEL4x3_SUB KERNEL4x3_SUB je .L6_4_16 prefetcht0 A_PR1(AO) KERNEL4x3_SUB KERNEL4x3_SUB prefetcht0 A_PR1(AO) KERNEL4x3_SUB KERNEL4x3_SUB prefetcht0 A_PR1(AO) KERNEL4x3_SUB KERNEL4x3_SUB prefetcht0 A_PR1(AO) KERNEL4x3_SUB KERNEL4x3_SUB je .L6_4_16 jmp .L6_4_12 ALIGN_4 .L6_4_16: movq K, %rax andq $ 7, %rax # if (k & 1) je .L6_4_19 ALIGN_4 .L6_4_17: KERNEL4x3_SUB jnz .L6_4_17 ALIGN_4 .L6_4_19: SAVE4x3 addq $ 8 * SIZE, CO1 # coffset += 8 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L6_4_20: testq $ 2, M jz .L6_4_40 ALIGN_4 .L6_4_21: leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO vzeroall movq K, %rax andq $ -8, %rax // K = K - ( K % 8 ) je .L6_4_26 ALIGN_4 .L6_4_22: prefetcht0 A_PR1(AO) KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB prefetcht0 A_PR1(AO) KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB je .L6_4_26 prefetcht0 A_PR1(AO) KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB prefetcht0 A_PR1(AO) KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB je .L6_4_26 jmp .L6_4_22 ALIGN_4 .L6_4_26: movq K, %rax andq $ 7, %rax # if (k & 1) je .L6_4_29 ALIGN_4 .L6_4_27: KERNEL2x3_SUB jnz .L6_4_27 ALIGN_4 .L6_4_29: SAVE2x3 addq $ 4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L6_4_21 ALIGN_4 /**************************************************************************/ .L6_4_40: testq $ 1, M jz .L6_4_60 // to next 2 lines of N ALIGN_4 .L6_4_41: leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO vzeroall movq K, %rax andq $ -8, %rax // K = K - ( K % 8 ) je .L6_4_46 ALIGN_4 .L6_4_42: prefetcht0 A_PR1(AO) KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB 
KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB je .L6_4_46 prefetcht0 A_PR1(AO) KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB je .L6_4_46 jmp .L6_4_42 ALIGN_4 .L6_4_46: movq K, %rax andq $ 7, %rax # if (k & 1) je .L6_4_49 ALIGN_4 .L6_4_47: KERNEL1x3_SUB jnz .L6_4_47 ALIGN_4 .L6_4_49: SAVE1x3 addq $ 2 * SIZE, CO1 # coffset += 2 decq I # i -- jg .L6_4_41 ALIGN_4 .L6_4_60: /*******************************************************************************************/ .L7_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax salq $2, %rax // 2 * COMPSIZE leaq (B, %rax,4), BO2 movq K, %rax ALIGN_4 .L7_02b: vmovsd 2*SIZE(BO1), %xmm0 vmovups (BO2), %xmm1 vmovsd %xmm0, (BO) vmovups %xmm1, 2*SIZE(BO) addq $ 4*SIZE,BO1 addq $ 4*SIZE,BO2 addq $ 6*SIZE,BO decq %rax jnz .L7_02b movq BO2, B // next offset of B .L7_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc leaq (C, LDC, 1), C // c += 1 * ldc movq A, AO // aoffset = a addq $ 16 * SIZE, AO movq M, I sarq $ 3, I // i = (m >> 3) je .L7_4_10 ALIGN_4 /**********************************************************************************************************/ .L7_8_11: leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO vzeroall movq K, %rax andq $ -8, %rax // K = K - ( K % 8 ) je .L7_8_16 ALIGN_4 .L7_8_12: KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB je .L7_8_16 KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB KERNEL8x3_SUB je .L7_8_16 jmp .L7_8_12 ALIGN_4 .L7_8_16: movq K, %rax andq $ 7, %rax # if (k & 1) je .L7_8_19 ALIGN_4 .L7_8_17: KERNEL8x3_SUB jnz .L7_8_17 ALIGN_4 .L7_8_19: SAVE8x3 addq $ 16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L7_8_11 ALIGN_4 /**********************************************************************************************************/ .L7_4_10: testq $ 7, M jz .L7_4_60 // to next 2 lines of N testq $ 4, M jz .L7_4_20 ALIGN_4 .L7_4_11: leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO vzeroall movq K, %rax andq $ -8, %rax // K = K - ( K % 8 ) je .L7_4_16 ALIGN_4 .L7_4_12: prefetcht0 A_PR1(AO) KERNEL4x3_SUB KERNEL4x3_SUB prefetcht0 A_PR1(AO) KERNEL4x3_SUB KERNEL4x3_SUB prefetcht0 A_PR1(AO) KERNEL4x3_SUB KERNEL4x3_SUB prefetcht0 A_PR1(AO) KERNEL4x3_SUB KERNEL4x3_SUB je .L7_4_16 prefetcht0 A_PR1(AO) KERNEL4x3_SUB KERNEL4x3_SUB prefetcht0 A_PR1(AO) KERNEL4x3_SUB KERNEL4x3_SUB prefetcht0 A_PR1(AO) KERNEL4x3_SUB KERNEL4x3_SUB prefetcht0 A_PR1(AO) KERNEL4x3_SUB KERNEL4x3_SUB je .L7_4_16 jmp .L7_4_12 ALIGN_4 .L7_4_16: movq K, %rax andq $ 7, %rax # if (k & 1) je .L7_4_19 ALIGN_4 .L7_4_17: KERNEL4x3_SUB jnz .L7_4_17 ALIGN_4 .L7_4_19: SAVE4x3 addq $ 8 * SIZE, CO1 # coffset += 8 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L7_4_20: testq $ 2, M jz .L7_4_40 ALIGN_4 .L7_4_21: leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO vzeroall movq K, %rax andq $ -8, %rax // K = K - ( K % 8 ) je .L7_4_26 ALIGN_4 .L7_4_22: prefetcht0 A_PR1(AO) KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB prefetcht0 A_PR1(AO) KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB je .L7_4_26 prefetcht0 A_PR1(AO) KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB prefetcht0 A_PR1(AO) KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB je .L7_4_26 jmp .L7_4_22 ALIGN_4 .L7_4_26: movq K, 
%rax andq $ 7, %rax # if (k & 1) je .L7_4_29 ALIGN_4 .L7_4_27: KERNEL2x3_SUB jnz .L7_4_27 ALIGN_4 .L7_4_29: SAVE2x3 addq $ 4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L7_4_21 ALIGN_4 /**************************************************************************/ .L7_4_40: testq $ 1, M jz .L7_4_60 // to next 2 lines of N ALIGN_4 .L7_4_41: leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO vzeroall movq K, %rax andq $ -8, %rax // K = K - ( K % 8 ) je .L7_4_46 ALIGN_4 .L7_4_42: prefetcht0 A_PR1(AO) KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB je .L7_4_46 prefetcht0 A_PR1(AO) KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB je .L7_4_46 jmp .L7_4_42 ALIGN_4 .L7_4_46: movq K, %rax andq $ 7, %rax # if (k & 1) je .L7_4_49 ALIGN_4 .L7_4_47: KERNEL1x3_SUB jnz .L7_4_47 ALIGN_4 .L7_4_49: SAVE1x3 addq $ 2 * SIZE, CO1 # coffset += 2 decq I # i -- jg .L7_4_41 ALIGN_4 .L7_4_60: decq J // j -- jg .L6_01 // next 6 lines of N /************************************************************************************************/ .L2_00: movq Nmod6, J sarq $1, J // j = j / 2 cmpq $ 0, J je .L1_0 ALIGN_4 .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L2_02b: vmovups (BO1), %xmm0 vmovups %xmm0, (BO) addq $ 4*SIZE,BO1 addq $ 4*SIZE,BO decq %rax jnz .L2_02b .L2_02c: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $ 16 * SIZE, AO movq M, I sarq $ 3, I // i = (m >> 3) je .L2_4_10 ALIGN_4 /**********************************************************************************************************/ .L2_8_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 8, %rax // number of values in AO #else addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L2_8_16 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_8_12: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB je .L2_8_16 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB prefetcht0 
A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB je .L2_8_16 jmp .L2_8_12 ALIGN_4 .L2_8_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L2_8_19 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_8_17: KERNEL8x2_SUB jl .L2_8_17 ALIGN_4 .L2_8_19: SAVE8x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 8, KK #endif addq $ 16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L2_8_11 ALIGN_4 /**********************************************************************************************************/ .L2_4_10: testq $ 7, M jz .L2_4_60 // to next 2 lines of N testq $ 4, M jz .L2_4_20 ALIGN_4 .L2_4_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 4, %rax // number of values in AO #else addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L2_4_16 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_12: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB je .L2_4_16 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB je .L2_4_16 jmp .L2_4_12 ALIGN_4 .L2_4_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L2_4_19 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_17: KERNEL4x2_SUB jl .L2_4_17 ALIGN_4 .L2_4_19: SAVE4x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index 
for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 4, KK #endif addq $ 8 * SIZE, CO1 # coffset += 8 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_4_20: testq $ 2, M jz .L2_4_40 ALIGN_4 .L2_4_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 2, %rax // number of values in AO #else addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L2_4_26 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_22: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB je .L2_4_26 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB je .L2_4_26 jmp .L2_4_22 ALIGN_4 .L2_4_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L2_4_29 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_27: KERNEL2x2_SUB jl .L2_4_27 ALIGN_4 .L2_4_29: vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vaddsubps %xmm10, %xmm11,%xmm11 vmovaps %xmm9, %xmm8 vmovaps %xmm11, %xmm10 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 vmulps %xmm10, %xmm0, %xmm10 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vmulps %xmm11, %xmm1, %xmm11 vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 #ifndef TRMMKERNEL vaddps (CO1), %xmm8 , %xmm8 vaddps (CO1, LDC), %xmm10, %xmm10 #endif vmovups %xmm8 , (CO1) vmovups %xmm10 , (CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) 
&& !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 2, KK #endif addq $ 4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L2_4_21 ALIGN_4 /**************************************************************************/ .L2_4_40: testq $ 1, M jz .L2_4_60 // to next 2 lines of N ALIGN_4 .L2_4_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 1, %rax // number of values in AO #else addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L2_4_46 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_42: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB je .L2_4_46 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB je .L2_4_46 jmp .L2_4_42 ALIGN_4 .L2_4_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L2_4_49 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_47: KERNEL1x2_SUB jl .L2_4_47 ALIGN_4 .L2_4_49: SAVE1x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 1, KK #endif addq $ 2 * SIZE, CO1 # coffset += 2 decq I # i -- jg .L2_4_41 ALIGN_4 .L2_4_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $ 2, KK #endif decq J // j -- jg .L2_01 // next 2 lines of N .L1_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $ 1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovsd (BO1), %xmm0 vmovsd %xmm0, (BO) addq $ 2*SIZE,BO1 addq $ 2*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, 
CO1 leaq (C, LDC, 1), C // c += 1 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $ 16 * SIZE, AO movq M, I sarq $ 3, I // i = (m >> 3) je .L1_4_10 ALIGN_4 /**************************************************************************************************/ .L1_8_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 8, %rax // number of values in AO #else addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L1_8_16 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_8_12: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB je .L1_8_16 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB je .L1_8_16 jmp .L1_8_12 ALIGN_4 .L1_8_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L1_8_19 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 4 ; number of values salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_8_17: KERNEL8x1_SUB jl .L1_8_17 ALIGN_4 .L1_8_19: SAVE8x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 8, KK #endif addq $ 16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L1_8_11 ALIGN_4 /**************************************************************************************************/ .L1_4_10: testq $ 7, M jz .L999 testq $ 4, M jz .L1_4_20 .L1_4_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO movq %rax, BI // 
Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 4, %rax // number of values in AO #else addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L1_4_16 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_12: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB je .L1_4_16 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB je .L1_4_16 jmp .L1_4_12 ALIGN_4 .L1_4_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L1_4_19 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 4 ; number of values salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_17: KERNEL4x1_SUB jl .L1_4_17 ALIGN_4 .L1_4_19: SAVE4x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 4, KK #endif addq $ 8 * SIZE, CO1 # coffset += 8 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_4_20: testq $ 2, M jz .L1_4_40 ALIGN_4 .L1_4_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 2, %rax // number of values in AO #else addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L1_4_26 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_22: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB 
prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB je .L1_4_26 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB je .L1_4_26 jmp .L1_4_22 ALIGN_4 .L1_4_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L1_4_29 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2; number of values salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_27: KERNEL2x1_SUB jl .L1_4_27 ALIGN_4 .L1_4_29: SAVE2x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 2, KK #endif addq $ 4 * SIZE, CO1 # coffset += 4 ALIGN_4 /**************************************************************************/ .L1_4_40: testq $ 1, M jz .L999 // to next 2 lines of N ALIGN_4 .L1_4_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 1, %rax // number of values in AO #else addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L1_4_46 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_42: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB je .L1_4_46 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB je .L1_4_46 jmp .L1_4_42 ALIGN_4 .L1_4_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L1_4_49 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_47: KERNEL1x1_SUB jl .L1_4_47 ALIGN_4 .L1_4_49: SAVE1x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 1, KK #endif 
addq $ 2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L999: vzeroupper movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi vmovups 64(%rsp), %xmm6 vmovups 80(%rsp), %xmm7 vmovups 96(%rsp), %xmm8 vmovups 112(%rsp), %xmm9 vmovups 128(%rsp), %xmm10 vmovups 144(%rsp), %xmm11 vmovups 160(%rsp), %xmm12 vmovups 176(%rsp), %xmm13 vmovups 192(%rsp), %xmm14 vmovups 208(%rsp), %xmm15 #endif addq $ STACKSIZE, %rsp ret EPILOGUE #else /************************************************************************************************/ PROLOGUE PROFCODE subq $ STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) vmovups %xmm6, 64(%rsp) vmovups %xmm7, 80(%rsp) vmovups %xmm8, 96(%rsp) vmovups %xmm9, 112(%rsp) vmovups %xmm10, 128(%rsp) vmovups %xmm11, 144(%rsp) vmovups %xmm12, 160(%rsp) vmovups %xmm13, 176(%rsp) vmovups %xmm14, 192(%rsp) vmovups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif vmovaps %xmm3, %xmm0 vmovsd OLD_ALPHA_I, %xmm1 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL movsd STACKSIZE + 16(%rsp), %xmm12 #endif #endif movq %rsp, SP # save old stack subq $ 128 + L_BUFFER_SIZE, %rsp andq $ -4096, %rsp # align stack STACK_TOUCH cmpq $ 0, OLD_M je .L999 cmpq $ 0, OLD_N je .L999 cmpq $ 0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovss %xmm0, ALPHA_R vmovss %xmm1, ALPHA_I salq $ ZBASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $ 2, %rdi divq %rdi // N / 2 movq %rax, Ndiv6 // N / 2 movq %rdx, Nmod6 // N % 2 #ifdef TRMMKERNEL vmovsd %xmm12, OFFSET vmovsd %xmm12, KK #ifndef LEFT negq KK #endif #endif .L2_0: movq Ndiv6, J cmpq $ 0, J je .L1_0 ALIGN_4 .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L2_02b: vmovups (BO1), %xmm0 vmovups %xmm0, (BO) addq $ 4*SIZE,BO1 addq $ 4*SIZE,BO decq %rax jnz .L2_02b .L2_02c: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $ 16 * SIZE, AO movq M, I sarq $ 3, I // i = (m >> 3) je .L2_4_10 ALIGN_4 /**********************************************************************************************************/ .L2_8_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 8, %rax // number of values in AO #else addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L2_8_16 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 4, %rax // rax = 
rax *16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_8_12: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB je .L2_8_16 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x2_SUB je .L2_8_16 jmp .L2_8_12 ALIGN_4 .L2_8_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L2_8_19 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_8_17: KERNEL8x2_SUB jl .L2_8_17 ALIGN_4 .L2_8_19: SAVE8x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 8, KK #endif addq $ 16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L2_8_11 ALIGN_4 /**********************************************************************************************************/ .L2_4_10: testq $ 7, M jz .L2_4_60 // to next 2 lines of N testq $ 4, M jz .L2_4_20 ALIGN_4 .L2_4_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 4, %rax // number of values in AO #else addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L2_4_16 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_12: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB je .L2_4_16 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) 
KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB je .L2_4_16 jmp .L2_4_12 ALIGN_4 .L2_4_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L2_4_19 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_17: KERNEL4x2_SUB jl .L2_4_17 ALIGN_4 .L2_4_19: SAVE4x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 4, KK #endif addq $ 8 * SIZE, CO1 # coffset += 8 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_4_20: testq $ 2, M jz .L2_4_40 ALIGN_4 .L2_4_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 2, %rax // number of values in AO #else addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L2_4_26 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_22: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB je .L2_4_26 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB je .L2_4_26 jmp .L2_4_22 ALIGN_4 .L2_4_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L2_4_29 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_27: KERNEL2x2_SUB jl .L2_4_27 ALIGN_4 .L2_4_29: vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 
vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vaddsubps %xmm10, %xmm11,%xmm11 vmovaps %xmm9, %xmm8 vmovaps %xmm11, %xmm10 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 vmulps %xmm10, %xmm0, %xmm10 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vmulps %xmm11, %xmm1, %xmm11 vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 #ifndef TRMMKERNEL vaddps (CO1), %xmm8 , %xmm8 vaddps (CO1, LDC), %xmm10, %xmm10 #endif vmovups %xmm8 , (CO1) vmovups %xmm10 , (CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 2, KK #endif addq $ 4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L2_4_21 ALIGN_4 /**************************************************************************/ .L2_4_40: testq $ 1, M jz .L2_4_60 // to next 2 lines of N ALIGN_4 .L2_4_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 1, %rax // number of values in AO #else addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L2_4_46 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_42: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB je .L2_4_46 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB je .L2_4_46 jmp .L2_4_42 ALIGN_4 .L2_4_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L2_4_49 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_47: KERNEL1x2_SUB jl .L2_4_47 ALIGN_4 .L2_4_49: SAVE1x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) 
&& defined(LEFT) addq $ 1, KK #endif addq $ 2 * SIZE, CO1 # coffset += 2 decq I # i -- jg .L2_4_41 ALIGN_4 .L2_4_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $ 2, KK #endif decq J // j -- jg .L2_01 // next 2 lines of N .L1_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $ 1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovsd (BO1), %xmm0 vmovsd %xmm0, (BO) addq $ 2*SIZE,BO1 addq $ 2*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $ 16 * SIZE, AO movq M, I sarq $ 3, I // i = (m >> 3) je .L1_4_10 ALIGN_4 /**************************************************************************************************/ .L1_8_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 8, %rax // number of values in AO #else addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L1_8_16 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_8_12: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB je .L1_8_16 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB je .L1_8_16 jmp .L1_8_12 ALIGN_4 .L1_8_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L1_8_19 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 4 ; number of values salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_8_17: KERNEL8x1_SUB jl .L1_8_17 ALIGN_4 .L1_8_19: SAVE8x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for 
BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 8, KK #endif addq $ 16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L1_8_11 ALIGN_4 /**************************************************************************************************/ .L1_4_10: testq $ 7, M jz .L999 testq $ 4, M jz .L1_4_20 .L1_4_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 4, %rax // number of values in AO #else addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L1_4_16 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_12: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB je .L1_4_16 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB je .L1_4_16 jmp .L1_4_12 ALIGN_4 .L1_4_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L1_4_19 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 4 ; number of values salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_17: KERNEL4x1_SUB jl .L1_4_17 ALIGN_4 .L1_4_19: SAVE4x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 4, KK #endif addq $ 8 * SIZE, CO1 # coffset += 8 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_4_20: testq $ 2, M jz .L1_4_40 ALIGN_4 .L1_4_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 2, 
%rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 2, %rax // number of values in AO #else addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L1_4_26 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_22: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB je .L1_4_26 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB je .L1_4_26 jmp .L1_4_22 ALIGN_4 .L1_4_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L1_4_29 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2; number of values salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_27: KERNEL2x1_SUB jl .L1_4_27 ALIGN_4 .L1_4_29: SAVE2x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 2, KK #endif addq $ 4 * SIZE, CO1 # coffset += 4 ALIGN_4 /**************************************************************************/ .L1_4_40: testq $ 1, M jz .L999 // to next 2 lines of N ALIGN_4 .L1_4_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 1, %rax // number of values in AO #else addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L1_4_46 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_42: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB je .L1_4_46 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB je .L1_4_46 jmp .L1_4_42 ALIGN_4 .L1_4_46: #ifndef 
TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L1_4_49 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_47: KERNEL1x1_SUB jl .L1_4_47 ALIGN_4 .L1_4_49: SAVE1x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 1, KK #endif addq $ 2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L999: vzeroupper movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi vmovups 64(%rsp), %xmm6 vmovups 80(%rsp), %xmm7 vmovups 96(%rsp), %xmm8 vmovups 112(%rsp), %xmm9 vmovups 128(%rsp), %xmm10 vmovups 144(%rsp), %xmm11 vmovups 160(%rsp), %xmm12 vmovups 176(%rsp), %xmm13 vmovups 192(%rsp), %xmm14 vmovups 208(%rsp), %xmm15 #endif addq $ STACKSIZE, %rsp ret EPILOGUE #endif OpenBLAS-0.2.20/kernel/x86_64/cgemm_kernel_8x2_sandy.S000066400000000000000000001626511313527062700220500ustar00rootroot00000000000000/********************************************************************************* Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ /********************************************************************* * 2014/07/29 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * 2013/10/28 Saar * Parameter: * CGEMM_DEFAULT_UNROLL_N 2 * CGEMM_DEFAULT_UNROLL_M 8 * CGEMM_DEFAULT_P 768 * CGEMM_DEFAULT_Q 512 * A_PR1 512 * B_PR1 512 * * 2014/07/29 Saar * Performance at 6192x6192x6192: * 1 thread: 49 GFLOPS (MKL: 52) * 2 threads: 99 GFLOPS (MKL: 102) * 3 threads: 148 GFLOPS (MKL: 150) * 4 threads: 195 GFLOPS (MKL: 194) * 8 threads: 354 GFLOPS (MKL: 317) * * *********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define J %r14 #define OLD_K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define K %r12 #define BI %rbp #define SP %rbx #define BO1 %rdi #define BO2 %r15 #ifndef WINDOWS_ABI #define STACKSIZE 96 #else #define STACKSIZE 320 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define L_BUFFER_SIZE 8192 #define Ndiv6 24(%rsp) #define Nmod6 32(%rsp) #define N 40(%rsp) #define ALPHA_R 48(%rsp) #define ALPHA_I 56(%rsp) #define OFFSET 64(%rsp) #define KK 72(%rsp) #define KKK 80(%rsp) #define BUFFER1 128(%rsp) #if defined(OS_WINDOWS) #if L_BUFFER_SIZE > 16384 #define STACK_TOUCH \ movl $ 0, 4096 * 4(%rsp);\ movl $ 0, 4096 * 3(%rsp);\ movl $ 0, 4096 * 2(%rsp);\ movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 12288 #define STACK_TOUCH \ movl $ 0, 4096 * 3(%rsp);\ movl $ 0, 4096 * 2(%rsp);\ movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 8192 #define STACK_TOUCH \ movl $ 0, 4096 * 2(%rsp);\ movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 4096 #define STACK_TOUCH \ movl $ 0, 4096 * 1(%rsp); #else #define STACK_TOUCH #endif #else #define STACK_TOUCH #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define VFMADDPS_YR( y0,y1,y2 ) \ vmulps y1,y2,%ymm2;\ vaddps y0,%ymm2,y0 #define VFMADDPS_YI( y0,y1,y2 ) \ vmulps y1,y2,%ymm3;\ vaddps y0,%ymm3,y0 #define VFMADDPS_R( y0,y1,y2 ) \ vmulps y1,y2,%xmm2;\ vaddps y0,%xmm2,y0 #define VFMADDPS_I( y0,y1,y2 ) \ vmulps y1,y2,%xmm3;\ vaddps y0,%xmm3,y0 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define VFMADDPS_YR( y0,y1,y2 ) \ vmulps y1,y2,%ymm2;\ vsubps %ymm2,y0,y0 #define VFMADDPS_YI( y0,y1,y2 ) \ vmulps y1,y2,%ymm3;\ vaddps y0,%ymm3,y0 #define VFMADDPS_R( y0,y1,y2 ) \ vmulps y1,y2,%xmm2;\ vsubps %xmm2,y0,y0 #define VFMADDPS_I( y0,y1,y2 ) \ vmulps y1,y2,%xmm3;\ vaddps y0,%xmm3,y0 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define VFMADDPS_YR( y0,y1,y2 ) \ vmulps y1,y2,%ymm2;\ vaddps y0,%ymm2,y0 #define VFMADDPS_YI( y0,y1,y2 ) \ vmulps y1,y2,%ymm3;\ vsubps %ymm3,y0,y0 #define VFMADDPS_R( y0,y1,y2 ) \ vmulps y1,y2,%xmm2;\ vaddps y0,%xmm2,y0 #define VFMADDPS_I( y0,y1,y2 ) \ vmulps y1,y2,%xmm3;\ vsubps %xmm3,y0,y0 #else #define VFMADDPS_YR( y0,y1,y2 ) \ vmulps y1,y2,%ymm2;\ vsubps %ymm2,y0,y0 #define VFMADDPS_YI( y0,y1,y2 ) \ vmulps y1,y2,%ymm3;\ vsubps %ymm3,y0,y0 #define VFMADDPS_R( y0,y1,y2 ) \ vmulps y1,y2,%xmm2;\ vsubps %xmm2,y0,y0 #define VFMADDPS_I( y0,y1,y2 ) \ vmulps y1,y2,%xmm3;\ vsubps %xmm3,y0,y0 #endif #define A_PR1 512 #define B_PR1 512 
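// ---------------------------------------------------------------------------
// Illustrative sketch only (kept entirely in comments so the file still
// assembles): the macros below form an 8x2 complex micro-kernel for a CPU
// without FMA, keeping separate "real-broadcast" and "imag-broadcast"
// accumulators and recombining them in the SAVE* macros with
// vshufps $0xb1 + vaddsubps. The C fragment below is a rough picture of what
// one k-iteration of KERNEL8x2_SUB accumulates in the non-conjugated
// (NN/NT/TN/TT) case; the names cfloat, a, b, acc_r and acc_i are hypothetical
// and only mirror the register roles (ymm0/ymm1 = packed A column,
// ymm4..ymm7 = broadcast B parts, ymm8..ymm15 = accumulators).
//
//   typedef struct { float re, im; } cfloat;
//
//   // One k-step: a[] is one packed column of A (8 complex values),
//   // b[] is one packed row of B (2 complex values).
//   static void kernel8x2_step(const cfloat a[8], const cfloat b[2],
//                              cfloat acc_r[2][8], cfloat acc_i[2][8])
//   {
//       for (int j = 0; j < 2; j++)
//           for (int i = 0; i < 8; i++) {
//               acc_r[j][i].re += a[i].re * b[j].re;  // VFMADDPS_YR, b.re broadcast
//               acc_r[j][i].im += a[i].im * b[j].re;
//               acc_i[j][i].re += a[i].re * b[j].im;  // VFMADDPS_YI, b.im broadcast
//               acc_i[j][i].im += a[i].im * b[j].im;
//           }
//   }
//
// For that same non-conjugated case, SAVE8x2 then swaps re/im within each
// acc_i element (vshufps $0xb1) and combines with vaddsubps, which yields
// re*re - im*im in the real lane and im*re + re*im in the imaginary lane,
// applies ALPHA_R/ALPHA_I with the same shuffle/addsub pattern, and adds the
// result to C. The conjugated variants change only the signs via the
// VFMADDPS_* macro definitions above.
// ---------------------------------------------------------------------------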
/***************************************************************************************************************************/ .macro KERNEL8x2_1 vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 prefetcht0 A_PR1(AO, %rax, SIZE) VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) vmovups 8 * SIZE(AO, %rax, SIZE), %ymm1 prefetcht0 A_PR1+64(AO, %rax, SIZE) VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm6 VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm7 VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) vbroadcastss 0 * SIZE(BO, BI, SIZE), %ymm4 VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) vbroadcastss 1 * SIZE(BO, BI, SIZE), %ymm5 VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) vmovups 24 * SIZE(AO, %rax, SIZE), %ymm1 prefetcht0 A_PR1+128(AO, %rax, SIZE) VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) vbroadcastss 2 * SIZE(BO, BI, SIZE), %ymm6 VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) vbroadcastss 3 * SIZE(BO, BI, SIZE), %ymm7 VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) vbroadcastss 4 * SIZE(BO, BI, SIZE), %ymm4 VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) vbroadcastss 5 * SIZE(BO, BI, SIZE), %ymm5 VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) vmovups 32 * SIZE(AO, %rax, SIZE), %ymm0 VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) vmovups 40 * SIZE(AO, %rax, SIZE), %ymm1 prefetcht0 A_PR1+192(AO, %rax, SIZE) VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) vbroadcastss 6 * SIZE(BO, BI, SIZE), %ymm6 VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) vbroadcastss 7 * SIZE(BO, BI, SIZE), %ymm7 VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) addq $ 16, BI VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) addq $ 64, %rax .endm .macro KERNEL8x2_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) addq $ 4 , BI addq $ 16, %rax .endm .macro SAVE8x2 vbroadcastss ALPHA_R, %ymm0 vbroadcastss ALPHA_I, %ymm1 // swap high and low 64 bytes vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %ymm9, %ymm8 , %ymm8 vaddsubps %ymm11,%ymm10, %ymm10 vaddsubps %ymm13,%ymm12, %ymm12 vaddsubps %ymm15,%ymm14, %ymm14 vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 
vshufps $ 0xb1, %ymm10, %ymm10, %ymm11 vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 vshufps $ 0xb1, %ymm14, %ymm14, %ymm15 #else vaddsubps %ymm8, %ymm9 ,%ymm9 vaddsubps %ymm10, %ymm11,%ymm11 vaddsubps %ymm12, %ymm13,%ymm13 vaddsubps %ymm14, %ymm15,%ymm15 vmovaps %ymm9, %ymm8 vmovaps %ymm11, %ymm10 vmovaps %ymm13, %ymm12 vmovaps %ymm15, %ymm14 // swap high and low 64 bytes vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 #endif // multiply with ALPHA_R vmulps %ymm8 , %ymm0, %ymm8 vmulps %ymm10, %ymm0, %ymm10 vmulps %ymm12, %ymm0, %ymm12 vmulps %ymm14, %ymm0, %ymm14 // multiply with ALPHA_I vmulps %ymm9 , %ymm1, %ymm9 vmulps %ymm11, %ymm1, %ymm11 vmulps %ymm13, %ymm1, %ymm13 vmulps %ymm15, %ymm1, %ymm15 vaddsubps %ymm9, %ymm8 , %ymm8 vaddsubps %ymm11,%ymm10, %ymm10 vaddsubps %ymm13,%ymm12, %ymm12 vaddsubps %ymm15,%ymm14, %ymm14 #ifndef TRMMKERNEL vaddps (CO1), %ymm8 , %ymm8 vaddps 8 * SIZE(CO1), %ymm12, %ymm12 vaddps (CO1, LDC), %ymm10, %ymm10 vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14 #endif vmovups %ymm8 , (CO1) vmovups %ymm12 , 8 * SIZE(CO1) vmovups %ymm10 , (CO1, LDC) vmovups %ymm14 , 8 * SIZE(CO1, LDC) prefetcht0 64(CO1) prefetcht0 64(CO1, LDC) .endm /***************************************************************************************************************************/ .macro KERNEL4x2_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) VFMADDPS_R( %xmm14,%xmm6,%xmm1 ) vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) VFMADDPS_I( %xmm15,%xmm7,%xmm1 ) addq $ 4, BI addq $ 8, %rax .endm .macro SAVE4x2 vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 vaddsubps %xmm13,%xmm12, %xmm12 vaddsubps %xmm15,%xmm14, %xmm14 vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 vshufps $ 0xb1, %xmm14, %xmm14, %xmm15 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vaddsubps %xmm10, %xmm11,%xmm11 vaddsubps %xmm12, %xmm13,%xmm13 vaddsubps %xmm14, %xmm15,%xmm15 vmovaps %xmm9, %xmm8 vmovaps %xmm11, %xmm10 vmovaps %xmm13, %xmm12 vmovaps %xmm15, %xmm14 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 vmulps %xmm10, %xmm0, %xmm10 vmulps %xmm12, %xmm0, %xmm12 vmulps %xmm14, %xmm0, %xmm14 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vmulps %xmm11, %xmm1, %xmm11 vmulps %xmm13, %xmm1, %xmm13 vmulps %xmm15, %xmm1, %xmm15 vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 vaddsubps %xmm13,%xmm12, %xmm12 vaddsubps %xmm15,%xmm14, %xmm14 #ifndef TRMMKERNEL vaddps (CO1), %xmm8 , %xmm8 vaddps 4 * SIZE(CO1), %xmm12, %xmm12 vaddps (CO1, LDC), 
%xmm10, %xmm10 vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 #endif vmovups %xmm8 , (CO1) vmovups %xmm12 , 4 * SIZE(CO1) vmovups %xmm10 , (CO1, LDC) vmovups %xmm14 , 4 * SIZE(CO1, LDC) .endm /************************************************************************************************/ .macro KERNEL2x2_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) addq $ 4, BI addq $ 4, %rax .endm .macro SAVE2x2 vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 4 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vaddsubps %xmm10, %xmm11,%xmm11 vmovaps %xmm9, %xmm8 vmovaps %xmm11, %xmm10 // swap high and low 4 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 vmulps %xmm10, %xmm0, %xmm10 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vmulps %xmm11, %xmm1, %xmm11 vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 #ifndef TRMMKERNEL vaddps (CO1), %xmm8 , %xmm8 vaddps (CO1, LDC), %xmm10, %xmm10 #endif vmovups %xmm8 , (CO1) vmovups %xmm10 , (CO1, LDC) .endm /************************************************************************************************/ .macro KERNEL1x2_SUB vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) addq $ 4, BI addq $ 2, %rax .endm .macro SAVE1x2 vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vaddsubps %xmm10, %xmm11,%xmm11 vmovaps %xmm9, %xmm8 vmovaps %xmm11, %xmm10 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 vmulps %xmm10, %xmm0, %xmm10 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vmulps %xmm11, %xmm1, %xmm11 vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 #ifndef TRMMKERNEL vmovsd (CO1), %xmm14 vaddps %xmm14, %xmm8 , %xmm8 vmovsd (CO1, LDC), %xmm15 vaddps %xmm15, %xmm10, %xmm10 #endif vmovsd %xmm8 , (CO1) vmovsd %xmm10 , (CO1, LDC) .endm /************************************************************************************************/ .macro KERNEL8x1_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 
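/* Descriptive note: in KERNEL8x1_SUB the real and imaginary components of each B
   element are broadcast into separate registers (%ymm4 holds the real broadcast,
   %ymm5 the imaginary one) and accumulated into separate sets of accumulators --
   real-broadcast products in %ymm8/%ymm12, imaginary-broadcast products in
   %ymm9/%ymm13.  The matching SAVE macro later folds the two halves into the
   final complex values. */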
VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) addq $ 2 , BI addq $ 16, %rax .endm .macro SAVE8x1 vbroadcastss ALPHA_R, %ymm0 vbroadcastss ALPHA_I, %ymm1 // swap high and low 64 bytes vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %ymm9, %ymm8 , %ymm8 vaddsubps %ymm13,%ymm12, %ymm12 vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 #else vaddsubps %ymm8, %ymm9 ,%ymm9 vaddsubps %ymm12, %ymm13,%ymm13 vmovaps %ymm9, %ymm8 vmovaps %ymm13, %ymm12 // swap high and low 64 bytes vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 #endif // multiply with ALPHA_R vmulps %ymm8 , %ymm0, %ymm8 vmulps %ymm12, %ymm0, %ymm12 // multiply with ALPHA_I vmulps %ymm9 , %ymm1, %ymm9 vmulps %ymm13, %ymm1, %ymm13 vaddsubps %ymm9, %ymm8 , %ymm8 vaddsubps %ymm13,%ymm12, %ymm12 #ifndef TRMMKERNEL vaddps (CO1), %ymm8 , %ymm8 vaddps 8 * SIZE(CO1), %ymm12, %ymm12 #endif vmovups %ymm8 , (CO1) vmovups %ymm12 , 8 * SIZE(CO1) .endm /************************************************************************************************/ .macro KERNEL4x1_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) addq $ 2, BI addq $ 8, %rax .endm .macro SAVE4x1 vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 4 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm13,%xmm12, %xmm12 vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vaddsubps %xmm12, %xmm13,%xmm13 vmovaps %xmm9, %xmm8 vmovaps %xmm13, %xmm12 // swap high and low 4 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 vmulps %xmm12, %xmm0, %xmm12 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vmulps %xmm13, %xmm1, %xmm13 vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm13,%xmm12, %xmm12 #ifndef TRMMKERNEL vaddps (CO1), %xmm8 , %xmm8 vaddps 4 * SIZE(CO1), %xmm12, %xmm12 #endif vmovups %xmm8 , (CO1) vmovups %xmm12 , 4 * SIZE(CO1) .endm /************************************************************************************************/ .macro KERNEL2x1_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) addq $ 2, BI addq $ 4, %rax .endm .macro SAVE2x1 vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vmovaps %xmm9, %xmm8 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 
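/* Descriptive note on the SAVE*x* macros: "vshufps $0xb1" exchanges the real and
   imaginary 32-bit lanes of every complex element, and "vaddsubps" subtracts in
   the even (real) lanes while adding in the odd (imaginary) lanes.  Together they
   first fold the two partial-product accumulators into the complex products A*B,
   and the same shuffle/addsub pair is then reused to apply the complex scalar
   alpha.  Illustrative per-element C-style sketch for the non-conjugated
   NN/NT/TN/TT branch (the conjugated branches only change signs and operand
   roles), where sum_r holds the products taken with Re(B) broadcasts and sum_i
   those taken with Im(B) broadcasts:

       re     = sum_r.re - sum_i.im;        im     = sum_r.im + sum_i.re;   (A*B)
       out_re = alpha_r*re - alpha_i*im;    out_im = alpha_r*im + alpha_i*re;

   In the non-TRMM build the existing C entry is added before the store. */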
#endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vaddsubps %xmm9, %xmm8 , %xmm8 #ifndef TRMMKERNEL vaddps (CO1), %xmm8 , %xmm8 #endif vmovups %xmm8 , (CO1) .endm /************************************************************************************************/ .macro KERNEL1x1_SUB vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) addq $ 2, BI addq $ 2, %rax .endm .macro SAVE1x1 vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vmovaps %xmm9, %xmm8 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vaddsubps %xmm9, %xmm8 , %xmm8 #ifndef TRMMKERNEL vmovsd (CO1), %xmm14 vaddps %xmm14, %xmm8 , %xmm8 #endif vmovsd %xmm8 , (CO1) .endm /************************************************************************************************/ PROLOGUE PROFCODE subq $ STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) vmovups %xmm6, 64(%rsp) vmovups %xmm7, 80(%rsp) vmovups %xmm8, 96(%rsp) vmovups %xmm9, 112(%rsp) vmovups %xmm10, 128(%rsp) vmovups %xmm11, 144(%rsp) vmovups %xmm12, 160(%rsp) vmovups %xmm13, 176(%rsp) vmovups %xmm14, 192(%rsp) vmovups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif vmovaps %xmm3, %xmm0 vmovsd OLD_ALPHA_I, %xmm1 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL movsd STACKSIZE + 16(%rsp), %xmm12 #endif #endif movq %rsp, SP # save old stack subq $ 128 + L_BUFFER_SIZE, %rsp andq $ -4096, %rsp # align stack STACK_TOUCH cmpq $ 0, OLD_M je .L999 cmpq $ 0, OLD_N je .L999 cmpq $ 0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovss %xmm0, ALPHA_R vmovss %xmm1, ALPHA_I salq $ ZBASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $ 2, %rdi divq %rdi // N / 2 movq %rax, Ndiv6 // N / 2 movq %rdx, Nmod6 // N % 2 #ifdef TRMMKERNEL vmovsd %xmm12, OFFSET vmovsd %xmm12, KK #ifndef LEFT negq KK #endif #endif .L2_0: movq Ndiv6, J cmpq $ 0, J je .L1_0 ALIGN_4 .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L2_02b: vmovups (BO1), %xmm0 vmovups %xmm0, (BO) addq $ 4*SIZE,BO1 addq $ 4*SIZE,BO decq %rax jnz .L2_02b .L2_02c: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $ 16 * SIZE, AO movq M, I sarq $ 3, I // i = (m >> 3) je .L2_4_10 ALIGN_4 /**********************************************************************************************************/ .L2_8_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO #else movq KK, 
%rax leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 8, %rax // number of values in AO #else addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L2_8_16 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_8_12: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL8x2_1 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL8x2_1 je .L2_8_16 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL8x2_1 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL8x2_1 je .L2_8_16 jmp .L2_8_12 ALIGN_4 .L2_8_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L2_8_19 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_8_17: KERNEL8x2_SUB jl .L2_8_17 ALIGN_4 .L2_8_19: SAVE8x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 8, KK #endif addq $ 16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L2_8_11 ALIGN_4 /**********************************************************************************************************/ .L2_4_10: testq $ 7, M jz .L2_4_60 // to next 2 lines of N testq $ 4, M jz .L2_4_20 ALIGN_4 .L2_4_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 4, %rax // number of values in AO #else addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L2_4_16 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_12: prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB je .L2_4_16 prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 
A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB KERNEL4x2_SUB je .L2_4_16 jmp .L2_4_12 ALIGN_4 .L2_4_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L2_4_19 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_17: KERNEL4x2_SUB jl .L2_4_17 ALIGN_4 .L2_4_19: SAVE4x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 4, KK #endif addq $ 8 * SIZE, CO1 # coffset += 8 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_4_20: testq $ 2, M jz .L2_4_40 ALIGN_4 .L2_4_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 2, %rax // number of values in AO #else addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L2_4_26 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_22: prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB je .L2_4_26 prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB je .L2_4_26 jmp .L2_4_22 ALIGN_4 .L2_4_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L2_4_29 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_27: KERNEL2x2_SUB jl .L2_4_27 ALIGN_4 .L2_4_29: vbroadcastss ALPHA_R, %xmm0 vbroadcastss ALPHA_I, %xmm1 // swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 #else vaddsubps %xmm8, %xmm9 ,%xmm9 vaddsubps %xmm10, %xmm11,%xmm11 vmovaps %xmm9, %xmm8 vmovaps %xmm11, %xmm10 // 
swap high and low 64 bytes vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 #endif // multiply with ALPHA_R vmulps %xmm8 , %xmm0, %xmm8 vmulps %xmm10, %xmm0, %xmm10 // multiply with ALPHA_I vmulps %xmm9 , %xmm1, %xmm9 vmulps %xmm11, %xmm1, %xmm11 vaddsubps %xmm9, %xmm8 , %xmm8 vaddsubps %xmm11,%xmm10, %xmm10 #ifndef TRMMKERNEL vaddps (CO1), %xmm8 , %xmm8 vaddps (CO1, LDC), %xmm10, %xmm10 #endif vmovups %xmm8 , (CO1) vmovups %xmm10 , (CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 2, KK #endif addq $ 4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L2_4_21 ALIGN_4 /**************************************************************************/ .L2_4_40: testq $ 1, M jz .L2_4_60 // to next 2 lines of N ALIGN_4 .L2_4_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 1, %rax // number of values in AO #else addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L2_4_46 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_42: prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB je .L2_4_46 prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB je .L2_4_46 jmp .L2_4_42 ALIGN_4 .L2_4_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L2_4_49 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_47: KERNEL1x2_SUB jl .L2_4_47 ALIGN_4 .L2_4_49: SAVE1x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 1, KK #endif addq $ 2 * SIZE, CO1 # coffset += 2 decq I # i -- jg .L2_4_41 ALIGN_4 .L2_4_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $ 2, KK #endif decq J // j -- jg .L2_01 // next 2 lines of N .L1_0: 
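/* Descriptive note: .L1_0 handles the single remaining column when N is odd
   (Nmod6 & 1).  It mirrors the two-column path above: B is first copied into
   BUFFER1, then M is processed in blocks of 8, 4, 2 and 1 complex elements,
   with the TRMM variants adjusting their KK/KKK offsets per block. */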
/************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $ 1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovsd (BO1), %xmm0 vmovsd %xmm0, (BO) addq $ 2*SIZE,BO1 addq $ 2*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $ 16 * SIZE, AO movq M, I sarq $ 3, I // i = (m >> 3) je .L1_4_10 ALIGN_4 /**************************************************************************************************/ .L1_8_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 8, %rax // number of values in AO #else addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L1_8_16 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_8_12: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB je .L1_8_16 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL8x1_SUB je .L1_8_16 jmp .L1_8_12 ALIGN_4 .L1_8_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L1_8_19 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 4 ; number of values salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_8_17: KERNEL8x1_SUB jl .L1_8_17 ALIGN_4 .L1_8_19: SAVE8x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 4, %rax // rax = rax *16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 8, KK #endif addq $ 16 
* SIZE, CO1 # coffset += 16 decq I # i -- jg .L1_8_11 ALIGN_4 /**************************************************************************************************/ .L1_4_10: testq $ 7, M jz .L999 testq $ 4, M jz .L1_4_20 .L1_4_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 4, %rax // number of values in AO #else addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L1_4_16 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_12: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB je .L1_4_16 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x1_SUB KERNEL4x1_SUB je .L1_4_16 jmp .L1_4_12 ALIGN_4 .L1_4_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L1_4_19 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 4 ; number of values salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_17: KERNEL4x1_SUB jl .L1_4_17 ALIGN_4 .L1_4_19: SAVE4x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 4, KK #endif addq $ 8 * SIZE, CO1 # coffset += 8 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_4_20: testq $ 2, M jz .L1_4_40 ALIGN_4 .L1_4_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, 
KKK #else movq KK, %rax #ifdef LEFT addq $ 2, %rax // number of values in AO #else addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L1_4_26 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_22: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB je .L1_4_26 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB je .L1_4_26 jmp .L1_4_22 ALIGN_4 .L1_4_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L1_4_29 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2; number of values salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_27: KERNEL2x1_SUB jl .L1_4_27 ALIGN_4 .L1_4_29: SAVE2x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 2, KK #endif addq $ 4 * SIZE, CO1 # coffset += 4 ALIGN_4 /**************************************************************************/ .L1_4_40: testq $ 1, M jz .L999 // to next 2 lines of N ALIGN_4 .L1_4_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 1, %rax // number of values in AO #else addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L1_4_46 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_42: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB je .L1_4_46 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB je .L1_4_46 jmp .L1_4_42 ALIGN_4 .L1_4_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L1_4_49 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 1, %rax // rax = rax * 2 ; number of values leaq 
(AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_47: KERNEL1x1_SUB jl .L1_4_47 ALIGN_4 .L1_4_49: SAVE1x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 1, KK #endif addq $ 2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L999: vzeroupper movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi vmovups 64(%rsp), %xmm6 vmovups 80(%rsp), %xmm7 vmovups 96(%rsp), %xmm8 vmovups 112(%rsp), %xmm9 vmovups 128(%rsp), %xmm10 vmovups 144(%rsp), %xmm11 vmovups 160(%rsp), %xmm12 vmovups 176(%rsp), %xmm13 vmovups 192(%rsp), %xmm14 vmovups 208(%rsp), %xmm15 #endif addq $ STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/cgemv_n.S000066400000000000000000002365541313527062700171430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "l2param.h" #if GEMV_UNROLL < 2 #undef GEMV_UNROLL #define GEMV_UNROLL 2 #endif #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_INCX 8 + STACKSIZE(%rsp) #define OLD_Y 16 + STACKSIZE(%rsp) #define OLD_INCY 24 + STACKSIZE(%rsp) #define OLD_BUFFER 32 + STACKSIZE(%rsp) #define ALPHA 48 (%rsp) #define MMM 64(%rsp) #define NN 72(%rsp) #define AA 80(%rsp) #define XX 88(%rsp) #define LDAX 96(%rsp) #define ALPHAR 104(%rsp) #define ALPHAI 112(%rsp) #define M %rdi #define N %rsi #define A %rcx #define LDA %r8 #define X %r9 #define INCX %rdx #define Y %rbp #define INCY %r10 #else #define STACKSIZE 288 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_LDA 56 + STACKSIZE(%rsp) #define OLD_X 64 + STACKSIZE(%rsp) #define OLD_INCX 72 + STACKSIZE(%rsp) #define OLD_Y 80 + STACKSIZE(%rsp) #define OLD_INCY 88 + STACKSIZE(%rsp) #define OLD_BUFFER 96 + STACKSIZE(%rsp) #define ALPHA 224 (%rsp) #define MMM 232(%rsp) #define NN 240(%rsp) #define AA 248(%rsp) #define XX 256(%rsp) #define LDAX 264(%rsp) #define ALPHAR 272(%rsp) #define ALPHAI 280(%rsp) #define M %rcx #define N %rdx #define A %r8 #define LDA %r9 #define X %rdi #define INCX %rsi #define Y %rbp #define INCY %r10 #endif #define I %rax #define A1 %r11 #define A2 %r12 #define Y1 %r13 #define BUFFER %r14 #ifdef ALIGNED_ACCESS #define MM %r15 #else #define MM M #endif #undef SUBPS #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) #define SUBPS subps #else #define SUBPS addps #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq OLD_A, A movq OLD_LDA, LDA movq OLD_X, X movaps %xmm3, %xmm0 movss OLD_ALPHA_I, %xmm1 #endif movq A, AA movq N, NN movq M, MMM movq LDA, LDAX movq X, XX movq OLD_Y, Y movss %xmm0,ALPHAR movss %xmm1,ALPHAI .L0t: xorq I,I addq $1,I salq $20,I subq I,MMM movq I,M movss ALPHAR,%xmm0 movss ALPHAI,%xmm1 jge .L00t movq MMM,M addq I,M jle .L999x .L00t: movq AA, A movq NN, N movq LDAX, LDA movq XX, X movq OLD_INCX, INCX # movq OLD_Y, Y movq OLD_INCY, INCY movq OLD_BUFFER, BUFFER salq $ZBASE_SHIFT, LDA salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY unpcklps %xmm1, %xmm0 movlps %xmm0, ALPHA testq M, M jle .L999 testq N, N jle .L999 ALIGN_3 subq $-32 * SIZE, A movq BUFFER, Y1 pxor %xmm4, %xmm4 movq M, %rax addq $8, %rax sarq $3, %rax ALIGN_3 .L01: movaps %xmm4, 0 * SIZE(Y1) movaps %xmm4, 4 * SIZE(Y1) movaps %xmm4, 8 * SIZE(Y1) movaps %xmm4, 12 * SIZE(Y1) subq $-16 * SIZE, Y1 decq %rax jg .L01 ALIGN_3 .L10: #ifdef ALIGNED_ACCESS movq M, MM movq A, %rax andq $4 * SIZE - 1, %rax leaq 2 * SIZE(BUFFER), A1 leaq -1(M), A2 cmpq $2 * SIZE, %rax cmovge A1, BUFFER cmovge A2, MM testq $SIZE, A jne .L200 testq $2 * SIZE, LDA jne .L100 #endif #if GEMV_UNROLL >= 4 cmpq $4, N jl .L20 ALIGN_3 .L11: subq $4, N leaq 32 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 2), A2 leaq (A, LDA, 4), A movsd (X), %xmm9 addq INCX, X movsd (X), %xmm11 addq INCX, X movsd (X), %xmm13 addq INCX, X movsd (X), %xmm15 addq INCX, X #ifdef HAVE_SSE3 movddup ALPHA, %xmm6 #else movsd 
ALPHA, %xmm6 unpcklpd %xmm6, %xmm6 #endif pshufd $0xb1, %xmm6, %xmm5 pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 pshufd $0x00, %xmm9, %xmm8 pshufd $0x55, %xmm9, %xmm9 pshufd $0x00, %xmm11, %xmm10 pshufd $0x55, %xmm11, %xmm11 pshufd $0x00, %xmm13, %xmm12 pshufd $0x55, %xmm13, %xmm13 pshufd $0x00, %xmm15, %xmm14 pshufd $0x55, %xmm15, %xmm15 #ifndef XCONJ xorps %xmm7, %xmm9 xorps %xmm7, %xmm11 xorps %xmm7, %xmm13 xorps %xmm7, %xmm15 #else xorps %xmm7, %xmm8 xorps %xmm7, %xmm10 xorps %xmm7, %xmm12 xorps %xmm7, %xmm14 #endif mulps %xmm6, %xmm8 mulps %xmm5, %xmm9 mulps %xmm6, %xmm10 mulps %xmm5, %xmm11 mulps %xmm6, %xmm12 mulps %xmm5, %xmm13 mulps %xmm6, %xmm14 mulps %xmm5, %xmm15 #ifndef XCONJ subps %xmm9, %xmm8 subps %xmm11, %xmm10 subps %xmm13, %xmm12 subps %xmm15, %xmm14 #else addps %xmm9, %xmm8 addps %xmm11, %xmm10 addps %xmm13, %xmm12 addps %xmm15, %xmm14 #endif pshufd $0x55, %xmm8, %xmm9 pshufd $0x00, %xmm8, %xmm8 pshufd $0x55, %xmm10, %xmm11 pshufd $0x00, %xmm10, %xmm10 pshufd $0x55, %xmm12, %xmm13 pshufd $0x00, %xmm12, %xmm12 pshufd $0x55, %xmm14, %xmm15 pshufd $0x00, %xmm14, %xmm14 #ifndef CONJ xorps %xmm7, %xmm9 xorps %xmm7, %xmm11 xorps %xmm7, %xmm13 xorps %xmm7, %xmm15 #else xorps %xmm7, %xmm8 xorps %xmm7, %xmm10 xorps %xmm7, %xmm12 xorps %xmm7, %xmm14 #endif #ifdef ALIGNED_ACCESS cmpq M, MM je .L1X movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(A1, LDA), %xmm6 movsd -32 * SIZE(Y1), %xmm0 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movsd -32 * SIZE(A2), %xmm4 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm9, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movsd -32 * SIZE(A2, LDA), %xmm6 mulps %xmm11, %xmm7 SUBPS %xmm7, %xmm0 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm0 movlps %xmm0, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L1X: #endif movaps -32 * SIZE(Y1), %xmm0 movaps -28 * SIZE(Y1), %xmm1 movaps -24 * SIZE(Y1), %xmm2 movaps -20 * SIZE(Y1), %xmm3 movq MM, I sarq $3, I jle .L15 MOVUPS_A1(-32 * SIZE, A1, %xmm4) MOVUPS_A1(-28 * SIZE, A1, %xmm6) decq I jle .L14 ALIGN_3 .L13: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) #endif pshufd $0xb1, %xmm4, %xmm5 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1(-24 * SIZE, A1, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm8, %xmm6 addps %xmm6, %xmm1 MOVUPS_A1(-20 * SIZE, A1, %xmm6) mulps %xmm9, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm9, %xmm7 SUBPS %xmm7, %xmm1 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm8, %xmm4 addps %xmm4, %xmm2 MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm8, %xmm6 addps %xmm6, %xmm3 MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm6) mulps %xmm9, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm9, %xmm7 SUBPS %xmm7, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) #endif pshufd $0xb1, %xmm4, %xmm5 mulps %xmm10, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm10, %xmm6 addps %xmm6, %xmm1 MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm6) mulps %xmm11, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm11, %xmm7 SUBPS %xmm7, %xmm1 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm10, %xmm4 addps %xmm4, %xmm2 MOVUPS_A1(-32 * SIZE, A2, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm10, %xmm6 addps %xmm6, %xmm3 MOVUPS_A1(-28 * SIZE, A2, %xmm6) mulps %xmm11, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm11, %xmm7 SUBPS %xmm7, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 
128 + PREOFFSET(A2) #endif pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1(-24 * SIZE, A2, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 MOVUPS_A1(-20 * SIZE, A2, %xmm6) mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm2 MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm3 MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm6) mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) #endif pshufd $0xb1, %xmm4, %xmm5 mulps %xmm14, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm14, %xmm6 addps %xmm6, %xmm1 MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm6) mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm1 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm14, %xmm4 addps %xmm4, %xmm2 MOVUPS_A1(-16 * SIZE, A1, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm14, %xmm6 addps %xmm6, %xmm3 MOVUPS_A1(-12 * SIZE, A1, %xmm6) mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1) #endif movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, -24 * SIZE(Y1) movaps %xmm3, -20 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 movaps -12 * SIZE(Y1), %xmm1 movaps -8 * SIZE(Y1), %xmm2 movaps -4 * SIZE(Y1), %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L13 ALIGN_3 .L14: pshufd $0xb1, %xmm4, %xmm5 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1(-24 * SIZE, A1, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm8, %xmm6 addps %xmm6, %xmm1 MOVUPS_A1(-20 * SIZE, A1, %xmm6) mulps %xmm9, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm9, %xmm7 SUBPS %xmm7, %xmm1 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm8, %xmm4 addps %xmm4, %xmm2 MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm8, %xmm6 addps %xmm6, %xmm3 MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm6) mulps %xmm9, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm9, %xmm7 SUBPS %xmm7, %xmm3 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm10, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm10, %xmm6 addps %xmm6, %xmm1 MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm6) mulps %xmm11, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm11, %xmm7 SUBPS %xmm7, %xmm1 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm10, %xmm4 addps %xmm4, %xmm2 MOVUPS_A1(-32 * SIZE, A2, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm10, %xmm6 addps %xmm6, %xmm3 MOVUPS_A1(-28 * SIZE, A2, %xmm6) mulps %xmm11, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm11, %xmm7 SUBPS %xmm7, %xmm3 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1(-24 * SIZE, A2, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 MOVUPS_A1(-20 * SIZE, A2, %xmm6) mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm2 MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm3 MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm6) mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm3 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm14, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm14, %xmm6 addps %xmm6, %xmm1 MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm6) 
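/* Descriptive note: throughout this gemv kernel each column j contributes the
   complex product w_j * A(:,j) to the zero-initialized BUFFER, where w_j is
   alpha*x[j] (with sign masks applied for the CONJ/XCONJ variants).  The real
   part of w_j multiplies the column vector directly, and its imaginary part
   multiplies a pshufd-swapped copy, combined via the SUBPS macro (subps or
   addps depending on the conjugation mode). */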
mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm1 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm14, %xmm4 addps %xmm4, %xmm2 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm14, %xmm6 addps %xmm6, %xmm3 mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm3 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, -24 * SIZE(Y1) movaps %xmm3, -20 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 movaps -12 * SIZE(Y1), %xmm1 movaps -8 * SIZE(Y1), %xmm2 movaps -4 * SIZE(Y1), %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 ALIGN_3 .L15: testq $4, MM je .L17 MOVUPS_A1(-32 * SIZE, A1, %xmm4) MOVUPS_A1(-28 * SIZE, A1, %xmm6) pshufd $0xb1, %xmm4, %xmm5 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm8, %xmm6 addps %xmm6, %xmm1 mulps %xmm9, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm9, %xmm7 SUBPS %xmm7, %xmm1 MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm6) pshufd $0xb1, %xmm4, %xmm5 mulps %xmm10, %xmm4 addps %xmm4, %xmm0 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm10, %xmm6 addps %xmm6, %xmm1 mulps %xmm11, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm11, %xmm7 SUBPS %xmm7, %xmm1 MOVUPS_A1(-32 * SIZE, A2, %xmm4) MOVUPS_A1(-28 * SIZE, A2, %xmm6) pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4) MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm6) pshufd $0xb1, %xmm4, %xmm5 mulps %xmm14, %xmm4 addps %xmm4, %xmm0 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm14, %xmm6 addps %xmm6, %xmm1 mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm1 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, Y1 ALIGN_3 .L17: testq $2, MM je .L18 MOVUPS_A1(-32 * SIZE, A1, %xmm4) MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm6) pshufd $0xb1, %xmm4, %xmm5 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1(-32 * SIZE, A2, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm9, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm6) mulps %xmm11, %xmm7 SUBPS %xmm7, %xmm0 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm0 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, %xmm0 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L18: testq $1, MM je .L19 movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(A1, LDA), %xmm6 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movsd -32 * SIZE(A2), %xmm4 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm9, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movsd -32 * SIZE(A2, LDA), %xmm6 mulps %xmm11, %xmm7 SUBPS %xmm7, %xmm0 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm0 movlps %xmm0, -32 * SIZE(Y1) ALIGN_3 .L19: cmpq $4, N jge .L11 ALIGN_3 .L20: #endif cmpq $2, N jl .L30 #if GEMV_UNROLL == 2 ALIGN_3 .L21: #endif subq $2, N leaq 32 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 1), A2 leaq (A, LDA, 2), A movsd (X), %xmm13 addq INCX, X movsd (X), %xmm15 addq INCX, X #ifdef HAVE_SSE3 movddup ALPHA, %xmm8 #else movsd ALPHA, %xmm8 unpcklpd 
%xmm8, %xmm8 #endif pshufd $0xb1, %xmm8, %xmm9 pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 pshufd $0x00, %xmm13, %xmm12 pshufd $0x55, %xmm13, %xmm13 pshufd $0x00, %xmm15, %xmm14 pshufd $0x55, %xmm15, %xmm15 #ifndef XCONJ xorps %xmm11, %xmm13 xorps %xmm11, %xmm15 #else xorps %xmm11, %xmm12 xorps %xmm11, %xmm14 #endif mulps %xmm8, %xmm12 mulps %xmm9, %xmm13 mulps %xmm8, %xmm14 mulps %xmm9, %xmm15 #ifndef XCONJ subps %xmm13, %xmm12 subps %xmm15, %xmm14 #else addps %xmm13, %xmm12 addps %xmm15, %xmm14 #endif pshufd $0x55, %xmm12, %xmm13 pshufd $0x00, %xmm12, %xmm12 pshufd $0x55, %xmm14, %xmm15 pshufd $0x00, %xmm14, %xmm14 #ifndef CONJ xorps %xmm11, %xmm13 xorps %xmm11, %xmm15 #else xorps %xmm11, %xmm12 xorps %xmm11, %xmm14 #endif #ifdef ALIGNED_ACCESS cmpq M, MM je .L2X movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(A2), %xmm6 movsd -32 * SIZE(Y1), %xmm0 pshufd $0xb1, %xmm4, %xmm5 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm0 movlps %xmm0, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L2X: #endif movaps -32 * SIZE(Y1), %xmm0 movaps -28 * SIZE(Y1), %xmm1 movaps -24 * SIZE(Y1), %xmm2 movaps -20 * SIZE(Y1), %xmm3 ALIGN_3 movq MM, I sarq $3, I jle .L25 MOVUPS_A1(-32 * SIZE, A1, %xmm4) MOVUPS_A1(-28 * SIZE, A1, %xmm6) MOVUPS_A1(-24 * SIZE, A1, %xmm8) MOVUPS_A1(-20 * SIZE, A1, %xmm10) decq I jle .L24 ALIGN_3 .L23: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1(-32 * SIZE, A2, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 MOVUPS_A1(-28 * SIZE, A2, %xmm6) pshufd $0xb1, %xmm8, %xmm9 mulps %xmm12, %xmm8 addps %xmm8, %xmm2 MOVUPS_A1(-24 * SIZE, A2, %xmm8) pshufd $0xb1, %xmm10, %xmm11 mulps %xmm12, %xmm10 addps %xmm10, %xmm3 MOVUPS_A1(-20 * SIZE, A2, %xmm10) mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 mulps %xmm13, %xmm9 SUBPS %xmm9, %xmm2 mulps %xmm13, %xmm11 SUBPS %xmm11, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif pshufd $0xb1, %xmm4, %xmm5 mulps %xmm14, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1(-16 * SIZE, A1, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm14, %xmm6 addps %xmm6, %xmm1 MOVUPS_A1(-12 * SIZE, A1, %xmm6) pshufd $0xb1, %xmm8, %xmm9 mulps %xmm14, %xmm8 addps %xmm8, %xmm2 MOVUPS_A1( -8 * SIZE, A1, %xmm8) pshufd $0xb1, %xmm10, %xmm11 mulps %xmm14, %xmm10 addps %xmm10, %xmm3 MOVUPS_A1( -4 * SIZE, A1, %xmm10) mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm1 mulps %xmm15, %xmm9 SUBPS %xmm9, %xmm2 mulps %xmm15, %xmm11 SUBPS %xmm11, %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) #endif movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, -24 * SIZE(Y1) movaps %xmm3, -20 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 movaps -12 * SIZE(Y1), %xmm1 movaps -8 * SIZE(Y1), %xmm2 movaps -4 * SIZE(Y1), %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L23 ALIGN_3 .L24: pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1(-32 * SIZE, A2, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 MOVUPS_A1(-28 * SIZE, A2, %xmm6) pshufd $0xb1, %xmm8, %xmm9 mulps %xmm12, %xmm8 addps %xmm8, %xmm2 MOVUPS_A1(-24 * SIZE, A2, %xmm8) pshufd $0xb1, %xmm10, %xmm11 mulps %xmm12, %xmm10 addps %xmm10, %xmm3 MOVUPS_A1(-20 * SIZE, 
A2, %xmm10) mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 mulps %xmm13, %xmm9 SUBPS %xmm9, %xmm2 mulps %xmm13, %xmm11 SUBPS %xmm11, %xmm3 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm14, %xmm4 addps %xmm4, %xmm0 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm14, %xmm6 addps %xmm6, %xmm1 pshufd $0xb1, %xmm8, %xmm9 mulps %xmm14, %xmm8 addps %xmm8, %xmm2 pshufd $0xb1, %xmm10, %xmm11 mulps %xmm14, %xmm10 addps %xmm10, %xmm3 mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm1 mulps %xmm15, %xmm9 SUBPS %xmm9, %xmm2 mulps %xmm15, %xmm11 SUBPS %xmm11, %xmm3 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, -24 * SIZE(Y1) movaps %xmm3, -20 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 movaps -12 * SIZE(Y1), %xmm1 movaps -8 * SIZE(Y1), %xmm2 movaps -4 * SIZE(Y1), %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 ALIGN_3 .L25: testq $4, MM je .L27 MOVUPS_A1(-32 * SIZE, A1, %xmm4) MOVUPS_A1(-28 * SIZE, A1, %xmm6) MOVUPS_A1(-32 * SIZE, A2, %xmm8) MOVUPS_A1(-28 * SIZE, A2, %xmm10) pshufd $0xb1, %xmm4, %xmm5 pshufd $0xb1, %xmm6, %xmm7 pshufd $0xb1, %xmm8, %xmm9 pshufd $0xb1, %xmm10, %xmm11 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 mulps %xmm14, %xmm8 addps %xmm8, %xmm0 mulps %xmm14, %xmm10 addps %xmm10, %xmm1 mulps %xmm15, %xmm9 SUBPS %xmm9, %xmm0 mulps %xmm15, %xmm11 SUBPS %xmm11, %xmm1 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, Y1 ALIGN_3 .L27: testq $2, MM je .L28 MOVUPS_A1(-32 * SIZE, A1, %xmm4) MOVUPS_A1(-32 * SIZE, A2, %xmm6) pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm0 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, %xmm0 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L28: testq $1, MM #if GEMV_UNROLL == 2 je .L29 #else je .L30 #endif movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(A2), %xmm6 pshufd $0xb1, %xmm4, %xmm5 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm0 movlps %xmm0, -32 * SIZE(Y1) #if GEMV_UNROLL == 2 ALIGN_3 .L29: cmpq $2, N jge .L21 #endif ALIGN_3 .L30: cmpq $1, N jl .L990 leaq 32 * SIZE(BUFFER), Y1 movq A, A1 movsd (X), %xmm13 addq INCX, X #ifdef HAVE_SSE3 movddup ALPHA, %xmm8 #else movsd ALPHA, %xmm8 unpcklpd %xmm8, %xmm8 #endif pshufd $0xb1, %xmm8, %xmm9 pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 pshufd $0x00, %xmm13, %xmm12 pshufd $0x55, %xmm13, %xmm13 #ifndef XCONJ xorps %xmm11, %xmm13 #else xorps %xmm11, %xmm12 #endif mulps %xmm8, %xmm12 mulps %xmm9, %xmm13 #ifndef XCONJ subps %xmm13, %xmm12 #else addps %xmm13, %xmm12 #endif pshufd $0x55, %xmm12, %xmm13 pshufd $0x00, %xmm12, %xmm12 #ifndef CONJ xorps %xmm11, %xmm13 #else xorps %xmm11, %xmm12 #endif #ifdef ALIGNED_ACCESS cmpq M, MM je .L3X movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(Y1), %xmm0 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 movlps %xmm0, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, Y1 ALIGN_3 .L3X: #endif movaps -32 * SIZE(Y1), %xmm0 movaps -28 * SIZE(Y1), %xmm1 movaps -24 * SIZE(Y1), %xmm2 movaps -20 * SIZE(Y1), %xmm3 ALIGN_3 movq MM, I sarq $3, I jle 
.L35 MOVUPS_A1(-32 * SIZE, A1, %xmm4) MOVUPS_A1(-28 * SIZE, A1, %xmm6) MOVUPS_A1(-24 * SIZE, A1, %xmm8) MOVUPS_A1(-20 * SIZE, A1, %xmm10) decq I jle .L34 ALIGN_3 .L33: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1(-16 * SIZE, A1, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 MOVUPS_A1(-12 * SIZE, A1, %xmm6) pshufd $0xb1, %xmm8, %xmm9 mulps %xmm12, %xmm8 addps %xmm8, %xmm2 MOVUPS_A1( -8 * SIZE, A1, %xmm8) pshufd $0xb1, %xmm10, %xmm11 mulps %xmm12, %xmm10 addps %xmm10, %xmm3 MOVUPS_A1( -4 * SIZE, A1, %xmm10) mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 mulps %xmm13, %xmm9 SUBPS %xmm9, %xmm2 mulps %xmm13, %xmm11 SUBPS %xmm11, %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) #endif movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, -24 * SIZE(Y1) movaps %xmm3, -20 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 movaps -12 * SIZE(Y1), %xmm1 movaps -8 * SIZE(Y1), %xmm2 movaps -4 * SIZE(Y1), %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L33 ALIGN_3 .L34: pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 pshufd $0xb1, %xmm8, %xmm9 mulps %xmm12, %xmm8 addps %xmm8, %xmm2 pshufd $0xb1, %xmm10, %xmm11 mulps %xmm12, %xmm10 addps %xmm10, %xmm3 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 mulps %xmm13, %xmm9 SUBPS %xmm9, %xmm2 mulps %xmm13, %xmm11 SUBPS %xmm11, %xmm3 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, -24 * SIZE(Y1) movaps %xmm3, -20 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 movaps -12 * SIZE(Y1), %xmm1 movaps -8 * SIZE(Y1), %xmm2 movaps -4 * SIZE(Y1), %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, Y1 ALIGN_3 .L35: testq $4, MM je .L37 MOVUPS_A1(-32 * SIZE, A1, %xmm4) MOVUPS_A1(-28 * SIZE, A1, %xmm6) pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addq $8 * SIZE, A1 addq $8 * SIZE, Y1 ALIGN_3 .L37: testq $2, MM je .L38 MOVUPS_A1(-32 * SIZE, A1, %xmm4) pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, %xmm0 addq $4 * SIZE, A1 addq $4 * SIZE, Y1 ALIGN_3 .L38: testq $1, MM je .L990 movsd -32 * SIZE(A1), %xmm4 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 movlps %xmm0, -32 * SIZE(Y1) #ifdef ALIGNED_ACCESS jmp .L990 ALIGN_3 .L100: #if GEMV_UNROLL >= 4 cmpq $4, N jl .L110 ALIGN_3 .L101: subq $4, N leaq 32 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 2), A2 leaq (A, LDA, 4), A movsd (X), %xmm9 addq INCX, X movsd (X), %xmm11 addq INCX, X movsd (X), %xmm13 addq INCX, X movsd (X), %xmm15 addq INCX, X #ifdef HAVE_SSE3 movddup ALPHA, %xmm6 #else movsd ALPHA, %xmm6 unpcklpd %xmm6, %xmm6 #endif pshufd $0xb1, %xmm6, %xmm5 pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 pshufd $0x00, %xmm9, %xmm8 pshufd $0x55, %xmm9, %xmm9 pshufd $0x00, %xmm11, %xmm10 pshufd $0x55, %xmm11, %xmm11 pshufd $0x00, %xmm13, %xmm12 pshufd $0x55, %xmm13, %xmm13 pshufd $0x00, %xmm15, %xmm14 pshufd $0x55, %xmm15, %xmm15 #ifndef XCONJ xorps %xmm7, %xmm9 xorps 
%xmm7, %xmm11 xorps %xmm7, %xmm13 xorps %xmm7, %xmm15 #else xorps %xmm7, %xmm8 xorps %xmm7, %xmm10 xorps %xmm7, %xmm12 xorps %xmm7, %xmm14 #endif mulps %xmm6, %xmm8 mulps %xmm5, %xmm9 mulps %xmm6, %xmm10 mulps %xmm5, %xmm11 mulps %xmm6, %xmm12 mulps %xmm5, %xmm13 mulps %xmm6, %xmm14 mulps %xmm5, %xmm15 #ifndef XCONJ subps %xmm9, %xmm8 subps %xmm11, %xmm10 subps %xmm13, %xmm12 subps %xmm15, %xmm14 #else addps %xmm9, %xmm8 addps %xmm11, %xmm10 addps %xmm13, %xmm12 addps %xmm15, %xmm14 #endif pshufd $0x55, %xmm8, %xmm9 pshufd $0x00, %xmm8, %xmm8 pshufd $0x55, %xmm10, %xmm11 pshufd $0x00, %xmm10, %xmm10 pshufd $0x55, %xmm12, %xmm13 pshufd $0x00, %xmm12, %xmm12 pshufd $0x55, %xmm14, %xmm15 pshufd $0x00, %xmm14, %xmm14 #ifndef CONJ xorps %xmm7, %xmm9 xorps %xmm7, %xmm11 xorps %xmm7, %xmm13 xorps %xmm7, %xmm15 #else xorps %xmm7, %xmm8 xorps %xmm7, %xmm10 xorps %xmm7, %xmm12 xorps %xmm7, %xmm14 #endif #ifdef ALIGNED_ACCESS cmpq M, MM je .L10X movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(A1, LDA), %xmm6 movsd -32 * SIZE(Y1), %xmm0 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movsd -32 * SIZE(A2), %xmm4 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm9, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movsd -32 * SIZE(A2, LDA), %xmm6 mulps %xmm11, %xmm7 SUBPS %xmm7, %xmm0 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm0 movlps %xmm0, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L10X: #endif movaps -32 * SIZE(Y1), %xmm0 movaps -28 * SIZE(Y1), %xmm1 movaps -24 * SIZE(Y1), %xmm2 movaps -20 * SIZE(Y1), %xmm3 movq MM, I sarq $3, I jle .L105 MOVUPS_A1(-32 * SIZE, A1, %xmm4) MOVUPS_A1(-28 * SIZE, A1, %xmm6) decq I jle .L104 ALIGN_3 .L103: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) #endif pshufd $0xb1, %xmm4, %xmm5 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1(-24 * SIZE, A1, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm8, %xmm6 addps %xmm6, %xmm1 MOVUPS_A1(-20 * SIZE, A1, %xmm6) mulps %xmm9, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm9, %xmm7 SUBPS %xmm7, %xmm1 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm8, %xmm4 addps %xmm4, %xmm2 movsd -32 * SIZE(A1, LDA), %xmm4 movhps -30 * SIZE(A1, LDA), %xmm4 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm8, %xmm6 addps %xmm6, %xmm3 movsd -28 * SIZE(A1, LDA), %xmm6 movhps -26 * SIZE(A1, LDA), %xmm6 mulps %xmm9, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm9, %xmm7 SUBPS %xmm7, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) #endif pshufd $0xb1, %xmm4, %xmm5 mulps %xmm10, %xmm4 addps %xmm4, %xmm0 movsd -24 * SIZE(A1, LDA), %xmm4 movhps -22 * SIZE(A1, LDA), %xmm4 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm10, %xmm6 addps %xmm6, %xmm1 movsd -20 * SIZE(A1, LDA), %xmm6 movhps -18 * SIZE(A1, LDA), %xmm6 mulps %xmm11, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm11, %xmm7 SUBPS %xmm7, %xmm1 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm10, %xmm4 addps %xmm4, %xmm2 MOVUPS_A1(-32 * SIZE, A2, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm10, %xmm6 addps %xmm6, %xmm3 MOVUPS_A1(-28 * SIZE, A2, %xmm6) mulps %xmm11, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm11, %xmm7 SUBPS %xmm7, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) #endif pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1(-24 * SIZE, A2, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 MOVUPS_A1(-20 * SIZE, A2, %xmm6) mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 
mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm2 movsd -32 * SIZE(A2, LDA), %xmm4 movhps -30 * SIZE(A2, LDA), %xmm4 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm3 movsd -28 * SIZE(A2, LDA), %xmm6 movhps -26 * SIZE(A2, LDA), %xmm6 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) #endif pshufd $0xb1, %xmm4, %xmm5 mulps %xmm14, %xmm4 addps %xmm4, %xmm0 movsd -24 * SIZE(A2, LDA), %xmm4 movhps -22 * SIZE(A2, LDA), %xmm4 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm14, %xmm6 addps %xmm6, %xmm1 movsd -20 * SIZE(A2, LDA), %xmm6 movhps -18 * SIZE(A2, LDA), %xmm6 mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm1 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm14, %xmm4 addps %xmm4, %xmm2 MOVUPS_A1(-16 * SIZE, A1, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm14, %xmm6 addps %xmm6, %xmm3 MOVUPS_A1(-12 * SIZE, A1, %xmm6) mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1) #endif movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, -24 * SIZE(Y1) movaps %xmm3, -20 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 movaps -12 * SIZE(Y1), %xmm1 movaps -8 * SIZE(Y1), %xmm2 movaps -4 * SIZE(Y1), %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L103 ALIGN_3 .L104: pshufd $0xb1, %xmm4, %xmm5 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1(-24 * SIZE, A1, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm8, %xmm6 addps %xmm6, %xmm1 MOVUPS_A1(-20 * SIZE, A1, %xmm6) mulps %xmm9, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm9, %xmm7 SUBPS %xmm7, %xmm1 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm8, %xmm4 addps %xmm4, %xmm2 movsd -32 * SIZE(A1, LDA), %xmm4 movhps -30 * SIZE(A1, LDA), %xmm4 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm8, %xmm6 addps %xmm6, %xmm3 movsd -28 * SIZE(A1, LDA), %xmm6 movhps -26 * SIZE(A1, LDA), %xmm6 mulps %xmm9, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm9, %xmm7 SUBPS %xmm7, %xmm3 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm10, %xmm4 addps %xmm4, %xmm0 movsd -24 * SIZE(A1, LDA), %xmm4 movhps -22 * SIZE(A1, LDA), %xmm4 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm10, %xmm6 addps %xmm6, %xmm1 movsd -20 * SIZE(A1, LDA), %xmm6 movhps -18 * SIZE(A1, LDA), %xmm6 mulps %xmm11, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm11, %xmm7 SUBPS %xmm7, %xmm1 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm10, %xmm4 addps %xmm4, %xmm2 MOVUPS_A1(-32 * SIZE, A2, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm10, %xmm6 addps %xmm6, %xmm3 MOVUPS_A1(-28 * SIZE, A2, %xmm6) mulps %xmm11, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm11, %xmm7 SUBPS %xmm7, %xmm3 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1(-24 * SIZE, A2, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 MOVUPS_A1(-20 * SIZE, A2, %xmm6) mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm2 movsd -32 * SIZE(A2, LDA), %xmm4 movhps -30 * SIZE(A2, LDA), %xmm4 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm3 movsd -28 * SIZE(A2, LDA), %xmm6 movhps -26 * SIZE(A2, LDA), %xmm6 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm3 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm14, %xmm4 addps %xmm4, %xmm0 movsd -24 * SIZE(A2, LDA), %xmm4 movhps -22 * SIZE(A2, LDA), %xmm4 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm14, %xmm6 addps %xmm6, %xmm1 movsd -20 * SIZE(A2, 
LDA), %xmm6 movhps -18 * SIZE(A2, LDA), %xmm6 mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm1 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm14, %xmm4 addps %xmm4, %xmm2 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm14, %xmm6 addps %xmm6, %xmm3 mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm3 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, -24 * SIZE(Y1) movaps %xmm3, -20 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 movaps -12 * SIZE(Y1), %xmm1 movaps -8 * SIZE(Y1), %xmm2 movaps -4 * SIZE(Y1), %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 ALIGN_3 .L105: testq $4, MM je .L107 MOVUPS_A1(-32 * SIZE, A1, %xmm4) MOVUPS_A1(-28 * SIZE, A1, %xmm6) pshufd $0xb1, %xmm4, %xmm5 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movsd -32 * SIZE(A1, LDA), %xmm4 movhps -30 * SIZE(A1, LDA), %xmm4 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm8, %xmm6 addps %xmm6, %xmm1 movsd -28 * SIZE(A1, LDA), %xmm6 movhps -26 * SIZE(A1, LDA), %xmm6 mulps %xmm9, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm9, %xmm7 SUBPS %xmm7, %xmm1 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm10, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1(-32 * SIZE, A2, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm10, %xmm6 addps %xmm6, %xmm1 MOVUPS_A1(-28 * SIZE, A2, %xmm6) mulps %xmm11, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm11, %xmm7 SUBPS %xmm7, %xmm1 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movsd -32 * SIZE(A2, LDA), %xmm4 movhps -30 * SIZE(A2, LDA), %xmm4 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 movsd -28 * SIZE(A2, LDA), %xmm6 movhps -26 * SIZE(A2, LDA), %xmm6 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm14, %xmm4 addps %xmm4, %xmm0 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm14, %xmm6 addps %xmm6, %xmm1 mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm1 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, Y1 ALIGN_3 .L107: testq $2, MM je .L108 MOVUPS_A1(-32 * SIZE, A1, %xmm4) movsd -32 * SIZE(A1, LDA), %xmm6 movhps -30 * SIZE(A1, LDA), %xmm6 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1(-32 * SIZE, A2, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm9, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movsd -32 * SIZE(A2, LDA), %xmm6 movhps -30 * SIZE(A2, LDA), %xmm6 mulps %xmm11, %xmm7 SUBPS %xmm7, %xmm0 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm0 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, %xmm0 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L108: testq $1, MM je .L109 movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(A1, LDA), %xmm6 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movsd -32 * SIZE(A2), %xmm4 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm9, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movsd -32 * SIZE(A2, LDA), %xmm6 mulps %xmm11, %xmm7 SUBPS %xmm7, %xmm0 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm0 movlps %xmm0, -32 * SIZE(Y1) ALIGN_3 .L109: cmpq $4, N jge .L101 ALIGN_3 .L110: #endif #if GEMV_UNROLL >= 2 cmpq $2, N jl .L120 #if GEMV_UNROLL == 2 ALIGN_3 
.L111: #endif subq $2, N leaq 32 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 1), A2 leaq (A, LDA, 2), A movsd (X), %xmm13 addq INCX, X movsd (X), %xmm15 addq INCX, X #ifdef HAVE_SSE3 movddup ALPHA, %xmm8 #else movsd ALPHA, %xmm8 unpcklpd %xmm8, %xmm8 #endif pshufd $0xb1, %xmm8, %xmm9 pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 pshufd $0x00, %xmm13, %xmm12 pshufd $0x55, %xmm13, %xmm13 pshufd $0x00, %xmm15, %xmm14 pshufd $0x55, %xmm15, %xmm15 #ifndef XCONJ xorps %xmm11, %xmm13 xorps %xmm11, %xmm15 #else xorps %xmm11, %xmm12 xorps %xmm11, %xmm14 #endif mulps %xmm8, %xmm12 mulps %xmm9, %xmm13 mulps %xmm8, %xmm14 mulps %xmm9, %xmm15 #ifndef XCONJ subps %xmm13, %xmm12 subps %xmm15, %xmm14 #else addps %xmm13, %xmm12 addps %xmm15, %xmm14 #endif pshufd $0x55, %xmm12, %xmm13 pshufd $0x00, %xmm12, %xmm12 pshufd $0x55, %xmm14, %xmm15 pshufd $0x00, %xmm14, %xmm14 #ifndef CONJ xorps %xmm11, %xmm13 xorps %xmm11, %xmm15 #else xorps %xmm11, %xmm12 xorps %xmm11, %xmm14 #endif #ifdef ALIGNED_ACCESS cmpq M, MM je .L11X movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(A2), %xmm6 movsd -32 * SIZE(Y1), %xmm0 pshufd $0xb1, %xmm4, %xmm5 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm0 movlps %xmm0, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L11X: #endif movaps -32 * SIZE(Y1), %xmm0 movaps -28 * SIZE(Y1), %xmm1 movaps -24 * SIZE(Y1), %xmm2 movaps -20 * SIZE(Y1), %xmm3 ALIGN_3 movq MM, I sarq $3, I jle .L115 MOVUPS_A1(-32 * SIZE, A1, %xmm4) MOVUPS_A1(-28 * SIZE, A1, %xmm6) MOVUPS_A1(-24 * SIZE, A1, %xmm8) MOVUPS_A1(-20 * SIZE, A1, %xmm10) decq I jle .L114 ALIGN_3 .L113: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movsd -32 * SIZE(A2), %xmm4 movhps -30 * SIZE(A2), %xmm4 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 movsd -28 * SIZE(A2), %xmm6 movhps -26 * SIZE(A2), %xmm6 pshufd $0xb1, %xmm8, %xmm9 mulps %xmm12, %xmm8 addps %xmm8, %xmm2 movsd -24 * SIZE(A2), %xmm8 movhps -22 * SIZE(A2), %xmm8 pshufd $0xb1, %xmm10, %xmm11 mulps %xmm12, %xmm10 addps %xmm10, %xmm3 movsd -20 * SIZE(A2), %xmm10 movhps -18 * SIZE(A2), %xmm10 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 mulps %xmm13, %xmm9 SUBPS %xmm9, %xmm2 mulps %xmm13, %xmm11 SUBPS %xmm11, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif pshufd $0xb1, %xmm4, %xmm5 mulps %xmm14, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1(-16 * SIZE, A1, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm14, %xmm6 addps %xmm6, %xmm1 MOVUPS_A1(-12 * SIZE, A1, %xmm6) pshufd $0xb1, %xmm8, %xmm9 mulps %xmm14, %xmm8 addps %xmm8, %xmm2 MOVUPS_A1( -8 * SIZE, A1, %xmm8) pshufd $0xb1, %xmm10, %xmm11 mulps %xmm14, %xmm10 addps %xmm10, %xmm3 MOVUPS_A1( -4 * SIZE, A1, %xmm10) mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm1 mulps %xmm15, %xmm9 SUBPS %xmm9, %xmm2 mulps %xmm15, %xmm11 SUBPS %xmm11, %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) #endif movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, -24 * SIZE(Y1) movaps %xmm3, -20 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 movaps -12 * SIZE(Y1), %xmm1 movaps -8 * SIZE(Y1), %xmm2 movaps -4 * SIZE(Y1), %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L113 ALIGN_3 .L114: pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 
addps %xmm4, %xmm0 movsd -32 * SIZE(A2), %xmm4 movhps -30 * SIZE(A2), %xmm4 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 movsd -28 * SIZE(A2), %xmm6 movhps -26 * SIZE(A2), %xmm6 pshufd $0xb1, %xmm8, %xmm9 mulps %xmm12, %xmm8 addps %xmm8, %xmm2 movsd -24 * SIZE(A2), %xmm8 movhps -22 * SIZE(A2), %xmm8 pshufd $0xb1, %xmm10, %xmm11 mulps %xmm12, %xmm10 addps %xmm10, %xmm3 movsd -20 * SIZE(A2), %xmm10 movhps -18 * SIZE(A2), %xmm10 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 mulps %xmm13, %xmm9 SUBPS %xmm9, %xmm2 mulps %xmm13, %xmm11 SUBPS %xmm11, %xmm3 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm14, %xmm4 addps %xmm4, %xmm0 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm14, %xmm6 addps %xmm6, %xmm1 pshufd $0xb1, %xmm8, %xmm9 mulps %xmm14, %xmm8 addps %xmm8, %xmm2 pshufd $0xb1, %xmm10, %xmm11 mulps %xmm14, %xmm10 addps %xmm10, %xmm3 mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm1 mulps %xmm15, %xmm9 SUBPS %xmm9, %xmm2 mulps %xmm15, %xmm11 SUBPS %xmm11, %xmm3 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, -24 * SIZE(Y1) movaps %xmm3, -20 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 movaps -12 * SIZE(Y1), %xmm1 movaps -8 * SIZE(Y1), %xmm2 movaps -4 * SIZE(Y1), %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 ALIGN_3 .L115: testq $4, MM je .L117 MOVUPS_A1(-32 * SIZE, A1, %xmm4) MOVUPS_A1(-28 * SIZE, A1, %xmm6) movsd -32 * SIZE(A2), %xmm8 movhps -30 * SIZE(A2), %xmm8 movsd -28 * SIZE(A2), %xmm10 movhps -26 * SIZE(A2), %xmm10 pshufd $0xb1, %xmm4, %xmm5 pshufd $0xb1, %xmm6, %xmm7 pshufd $0xb1, %xmm8, %xmm9 pshufd $0xb1, %xmm10, %xmm11 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 mulps %xmm14, %xmm8 addps %xmm8, %xmm0 mulps %xmm14, %xmm10 addps %xmm10, %xmm1 mulps %xmm15, %xmm9 SUBPS %xmm9, %xmm0 mulps %xmm15, %xmm11 SUBPS %xmm11, %xmm1 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, Y1 ALIGN_3 .L117: testq $2, MM je .L118 MOVUPS_A1(-32 * SIZE, A1, %xmm4) movsd -32 * SIZE(A2), %xmm6 movhps -30 * SIZE(A2), %xmm6 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm0 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, %xmm0 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L118: testq $1, MM #if GEMV_UNROLL == 2 je .L119 #else je .L120 #endif movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(A2), %xmm6 pshufd $0xb1, %xmm4, %xmm5 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm0 movlps %xmm0, -32 * SIZE(Y1) #if GEMV_UNROLL == 2 ALIGN_3 .L119: cmpq $2, N jge .L111 #endif ALIGN_3 .L120: #endif cmpq $1, N jl .L990 leaq 32 * SIZE(BUFFER), Y1 movq A, A1 movsd (X), %xmm13 addq INCX, X #ifdef HAVE_SSE3 movddup ALPHA, %xmm8 #else movsd ALPHA, %xmm8 unpcklpd %xmm8, %xmm8 #endif pshufd $0xb1, %xmm8, %xmm9 pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 pshufd $0x00, %xmm13, %xmm12 pshufd $0x55, %xmm13, %xmm13 #ifndef XCONJ xorps %xmm11, %xmm13 #else xorps %xmm11, %xmm12 #endif mulps %xmm8, %xmm12 mulps %xmm9, %xmm13 #ifndef XCONJ subps %xmm13, %xmm12 #else addps %xmm13, %xmm12 #endif pshufd $0x55, %xmm12, %xmm13 pshufd 
$0x00, %xmm12, %xmm12 #ifndef CONJ xorps %xmm11, %xmm13 #else xorps %xmm11, %xmm12 #endif #ifdef ALIGNED_ACCESS cmpq M, MM je .L12X movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(Y1), %xmm0 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 movlps %xmm0, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, Y1 ALIGN_3 .L12X: #endif movaps -32 * SIZE(Y1), %xmm0 movaps -28 * SIZE(Y1), %xmm1 movaps -24 * SIZE(Y1), %xmm2 movaps -20 * SIZE(Y1), %xmm3 ALIGN_3 movq MM, I sarq $3, I jle .L125 MOVUPS_A1(-32 * SIZE, A1, %xmm4) MOVUPS_A1(-28 * SIZE, A1, %xmm6) MOVUPS_A1(-24 * SIZE, A1, %xmm8) MOVUPS_A1(-20 * SIZE, A1, %xmm10) decq I jle .L124 ALIGN_3 .L123: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1(-16 * SIZE, A1, %xmm4) pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 MOVUPS_A1(-12 * SIZE, A1, %xmm6) pshufd $0xb1, %xmm8, %xmm9 mulps %xmm12, %xmm8 addps %xmm8, %xmm2 MOVUPS_A1( -8 * SIZE, A1, %xmm8) pshufd $0xb1, %xmm10, %xmm11 mulps %xmm12, %xmm10 addps %xmm10, %xmm3 MOVUPS_A1( -4 * SIZE, A1, %xmm10) mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 mulps %xmm13, %xmm9 SUBPS %xmm9, %xmm2 mulps %xmm13, %xmm11 SUBPS %xmm11, %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) #endif movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, -24 * SIZE(Y1) movaps %xmm3, -20 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 movaps -12 * SIZE(Y1), %xmm1 movaps -8 * SIZE(Y1), %xmm2 movaps -4 * SIZE(Y1), %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L123 ALIGN_3 .L124: pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 pshufd $0xb1, %xmm8, %xmm9 mulps %xmm12, %xmm8 addps %xmm8, %xmm2 pshufd $0xb1, %xmm10, %xmm11 mulps %xmm12, %xmm10 addps %xmm10, %xmm3 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 mulps %xmm13, %xmm9 SUBPS %xmm9, %xmm2 mulps %xmm13, %xmm11 SUBPS %xmm11, %xmm3 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, -24 * SIZE(Y1) movaps %xmm3, -20 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 movaps -12 * SIZE(Y1), %xmm1 movaps -8 * SIZE(Y1), %xmm2 movaps -4 * SIZE(Y1), %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, Y1 ALIGN_3 .L125: testq $4, MM je .L127 MOVUPS_A1(-32 * SIZE, A1, %xmm4) MOVUPS_A1(-28 * SIZE, A1, %xmm6) pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addq $8 * SIZE, A1 addq $8 * SIZE, Y1 ALIGN_3 .L127: testq $2, MM je .L128 MOVUPS_A1(-32 * SIZE, A1, %xmm4) pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, %xmm0 addq $4 * SIZE, A1 addq $4 * SIZE, Y1 ALIGN_3 .L128: testq $1, MM je .L990 movsd -32 * SIZE(A1), %xmm4 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 movlps %xmm0, -32 * SIZE(Y1) jmp .L990 ALIGN_3 .L200: testq $2 * SIZE, LDA jne .L300 cmpq $2, N jl .L210 ALIGN_3 .L201: subq $2, N leaq 32 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 1), A2 leaq (A, LDA, 2), A movsd (X), %xmm13 
addq INCX, X movsd (X), %xmm15 addq INCX, X #ifdef HAVE_SSE3 movddup ALPHA, %xmm8 #else movsd ALPHA, %xmm8 unpcklpd %xmm8, %xmm8 #endif pshufd $0xb1, %xmm8, %xmm9 pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 pshufd $0x00, %xmm13, %xmm12 pshufd $0x55, %xmm13, %xmm13 pshufd $0x00, %xmm15, %xmm14 pshufd $0x55, %xmm15, %xmm15 #ifndef XCONJ xorps %xmm11, %xmm13 xorps %xmm11, %xmm15 #else xorps %xmm11, %xmm12 xorps %xmm11, %xmm14 #endif mulps %xmm8, %xmm12 mulps %xmm9, %xmm13 mulps %xmm8, %xmm14 mulps %xmm9, %xmm15 #ifndef XCONJ subps %xmm13, %xmm12 subps %xmm15, %xmm14 #else addps %xmm13, %xmm12 addps %xmm15, %xmm14 #endif pshufd $0x55, %xmm12, %xmm13 pshufd $0x00, %xmm12, %xmm12 pshufd $0x55, %xmm14, %xmm15 pshufd $0x00, %xmm14, %xmm14 #ifndef CONJ xorps %xmm11, %xmm13 xorps %xmm11, %xmm15 #else xorps %xmm11, %xmm12 xorps %xmm11, %xmm14 #endif #ifdef ALIGNED_ACCESS cmpq M, MM je .L20X movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(A2), %xmm6 movsd -32 * SIZE(Y1), %xmm0 pshufd $0xb1, %xmm4, %xmm5 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm0 movlps %xmm0, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L20X: #endif movaps -33 * SIZE(A1), %xmm4 movaps -33 * SIZE(A2), %xmm6 movaps -32 * SIZE(Y1), %xmm0 movaps -28 * SIZE(Y1), %xmm1 movaps -24 * SIZE(Y1), %xmm2 movaps -20 * SIZE(Y1), %xmm3 movq MM, I sarq $3, I jle .L205 movaps -29 * SIZE(A1), %xmm8 movaps -25 * SIZE(A1), %xmm9 movaps -21 * SIZE(A1), %xmm10 decq I jle .L204 ALIGN_3 .L203: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif movss %xmm8, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -17 * SIZE(A1), %xmm4 movss %xmm9, %xmm8 shufps $0x39, %xmm8, %xmm8 pshufd $0xb1, %xmm8, %xmm7 mulps %xmm12, %xmm8 addps %xmm8, %xmm1 movaps -29 * SIZE(A2), %xmm8 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 movss %xmm10, %xmm9 shufps $0x39, %xmm9, %xmm9 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 movaps -25 * SIZE(A2), %xmm9 movss %xmm4, %xmm10 shufps $0x39, %xmm10, %xmm10 pshufd $0xb1, %xmm10, %xmm7 mulps %xmm12, %xmm10 addps %xmm10, %xmm3 movaps -21 * SIZE(A2), %xmm10 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif movss %xmm8, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm5 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 movaps -17 * SIZE(A2), %xmm6 movss %xmm9, %xmm8 shufps $0x39, %xmm8, %xmm8 pshufd $0xb1, %xmm8, %xmm7 mulps %xmm14, %xmm8 addps %xmm8, %xmm1 movaps -13 * SIZE(A1), %xmm8 mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm1 movss %xmm10, %xmm9 shufps $0x39, %xmm9, %xmm9 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm14, %xmm9 addps %xmm9, %xmm2 movaps -9 * SIZE(A1), %xmm9 movss %xmm6, %xmm10 shufps $0x39, %xmm10, %xmm10 pshufd $0xb1, %xmm10, %xmm7 mulps %xmm14, %xmm10 addps %xmm10, %xmm3 movaps -5 * SIZE(A1), %xmm10 mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) #endif movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, -24 * SIZE(Y1) movaps %xmm3, -20 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 movaps -12 * SIZE(Y1), %xmm1 movaps -8 * SIZE(Y1), %xmm2 movaps -4 * SIZE(Y1), %xmm3 subq $-16 * SIZE, A1 subq $-16 
* SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L203 ALIGN_3 .L204: movss %xmm8, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -17 * SIZE(A1), %xmm4 movss %xmm9, %xmm8 shufps $0x39, %xmm8, %xmm8 pshufd $0xb1, %xmm8, %xmm7 mulps %xmm12, %xmm8 addps %xmm8, %xmm1 movaps -29 * SIZE(A2), %xmm8 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 movss %xmm10, %xmm9 shufps $0x39, %xmm9, %xmm9 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 movaps -25 * SIZE(A2), %xmm9 movss %xmm4, %xmm10 shufps $0x39, %xmm10, %xmm10 pshufd $0xb1, %xmm10, %xmm7 mulps %xmm12, %xmm10 addps %xmm10, %xmm3 movaps -21 * SIZE(A2), %xmm10 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm3 movss %xmm8, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm5 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 movaps -17 * SIZE(A2), %xmm6 movss %xmm9, %xmm8 shufps $0x39, %xmm8, %xmm8 pshufd $0xb1, %xmm8, %xmm7 mulps %xmm14, %xmm8 addps %xmm8, %xmm1 mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm1 movss %xmm10, %xmm9 shufps $0x39, %xmm9, %xmm9 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm14, %xmm9 addps %xmm9, %xmm2 movss %xmm6, %xmm10 shufps $0x39, %xmm10, %xmm10 pshufd $0xb1, %xmm10, %xmm7 mulps %xmm14, %xmm10 addps %xmm10, %xmm3 mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm3 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, -24 * SIZE(Y1) movaps %xmm3, -20 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 movaps -12 * SIZE(Y1), %xmm1 movaps -8 * SIZE(Y1), %xmm2 movaps -4 * SIZE(Y1), %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 ALIGN_3 .L205: testq $4, MM je .L207 movaps -29 * SIZE(A1), %xmm8 movaps -25 * SIZE(A1), %xmm9 movaps -29 * SIZE(A2), %xmm10 movaps -25 * SIZE(A2), %xmm11 movss %xmm8, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movss %xmm9, %xmm8 shufps $0x39, %xmm8, %xmm8 pshufd $0xb1, %xmm8, %xmm7 mulps %xmm12, %xmm8 addps %xmm8, %xmm1 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 movss %xmm10, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm5 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 movss %xmm11, %xmm10 shufps $0x39, %xmm10, %xmm10 pshufd $0xb1, %xmm10, %xmm7 mulps %xmm14, %xmm10 addps %xmm10, %xmm1 mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm1 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm9, %xmm4 movaps %xmm11, %xmm6 movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, Y1 ALIGN_3 .L207: testq $2, MM je .L208 movaps -29 * SIZE(A1), %xmm8 movaps -29 * SIZE(A2), %xmm9 movss %xmm8, %xmm4 shufps $0x39, %xmm4, %xmm4 movss %xmm9, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm0 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, %xmm0 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L208: testq $1, MM je .L209 movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(A2), %xmm6 pshufd $0xb1, %xmm4, %xmm5 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm0 movlps %xmm0, -32 * SIZE(Y1) ALIGN_3 .L209: cmpq 
$2, N jge .L201 ALIGN_3 .L210: cmpq $1, N jl .L990 leaq 32 * SIZE(BUFFER), Y1 movq A, A1 movsd (X), %xmm13 addq INCX, X #ifdef HAVE_SSE3 movddup ALPHA, %xmm8 #else movsd ALPHA, %xmm8 unpcklpd %xmm8, %xmm8 #endif pshufd $0xb1, %xmm8, %xmm9 pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 pshufd $0x00, %xmm13, %xmm12 pshufd $0x55, %xmm13, %xmm13 #ifndef XCONJ xorps %xmm11, %xmm13 #else xorps %xmm11, %xmm12 #endif mulps %xmm8, %xmm12 mulps %xmm9, %xmm13 #ifndef XCONJ subps %xmm13, %xmm12 #else addps %xmm13, %xmm12 #endif pshufd $0x55, %xmm12, %xmm13 pshufd $0x00, %xmm12, %xmm12 #ifndef CONJ xorps %xmm11, %xmm13 #else xorps %xmm11, %xmm12 #endif #ifdef ALIGNED_ACCESS cmpq M, MM je .L21X movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(Y1), %xmm0 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 movlps %xmm0, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, Y1 ALIGN_3 .L21X: #endif movaps -33 * SIZE(A1), %xmm4 movaps -32 * SIZE(Y1), %xmm0 movaps -28 * SIZE(Y1), %xmm1 movaps -24 * SIZE(Y1), %xmm2 movaps -20 * SIZE(Y1), %xmm3 movq MM, I sarq $3, I jle .L215 movaps -29 * SIZE(A1), %xmm6 movaps -25 * SIZE(A1), %xmm8 movaps -21 * SIZE(A1), %xmm10 decq I jle .L214 ALIGN_3 .L213: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif movss %xmm6, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -17 * SIZE(A1), %xmm4 movss %xmm8, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 movaps -13 * SIZE(A1), %xmm6 movss %xmm10, %xmm8 shufps $0x39, %xmm8, %xmm8 pshufd $0xb1, %xmm8, %xmm9 mulps %xmm12, %xmm8 addps %xmm8, %xmm2 movaps -9 * SIZE(A1), %xmm8 movss %xmm4, %xmm10 shufps $0x39, %xmm10, %xmm10 pshufd $0xb1, %xmm10, %xmm11 mulps %xmm12, %xmm10 addps %xmm10, %xmm3 movaps -5 * SIZE(A1), %xmm10 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 mulps %xmm13, %xmm9 SUBPS %xmm9, %xmm2 mulps %xmm13, %xmm11 SUBPS %xmm11, %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) #endif movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, -24 * SIZE(Y1) movaps %xmm3, -20 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 movaps -12 * SIZE(Y1), %xmm1 movaps -8 * SIZE(Y1), %xmm2 movaps -4 * SIZE(Y1), %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L213 ALIGN_3 .L214: movss %xmm6, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -17 * SIZE(A1), %xmm4 movss %xmm8, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 movss %xmm10, %xmm8 shufps $0x39, %xmm8, %xmm8 pshufd $0xb1, %xmm8, %xmm9 mulps %xmm12, %xmm8 addps %xmm8, %xmm2 movss %xmm4, %xmm10 shufps $0x39, %xmm10, %xmm10 pshufd $0xb1, %xmm10, %xmm11 mulps %xmm12, %xmm10 addps %xmm10, %xmm3 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 mulps %xmm13, %xmm9 SUBPS %xmm9, %xmm2 mulps %xmm13, %xmm11 SUBPS %xmm11, %xmm3 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, -24 * SIZE(Y1) movaps %xmm3, -20 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 movaps -12 * SIZE(Y1), %xmm1 movaps -8 * SIZE(Y1), %xmm2 movaps -4 * SIZE(Y1), %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, Y1 ALIGN_3 .L215: testq $4, MM je .L217 movaps -29 * SIZE(A1), %xmm6 movaps -25 * SIZE(A1), %xmm8 movss %xmm6, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps 
%xmm4, %xmm0 movss %xmm8, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 movaps %xmm8, %xmm4 addq $8 * SIZE, A1 addq $8 * SIZE, Y1 ALIGN_3 .L217: testq $2, MM je .L218 movaps -29 * SIZE(A1), %xmm6 movss %xmm6, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, %xmm0 addq $4 * SIZE, A1 addq $4 * SIZE, Y1 ALIGN_3 .L218: testq $1, MM je .L990 movsd -32 * SIZE(A1), %xmm4 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 movlps %xmm0, -32 * SIZE(Y1) jmp .L990 ALIGN_3 .L300: cmpq $2, N jl .L310 ALIGN_3 .L301: subq $2, N leaq 32 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 1), A2 leaq (A, LDA, 2), A movsd (X), %xmm13 addq INCX, X movsd (X), %xmm15 addq INCX, X #ifdef HAVE_SSE3 movddup ALPHA, %xmm8 #else movsd ALPHA, %xmm8 unpcklpd %xmm8, %xmm8 #endif pshufd $0xb1, %xmm8, %xmm9 pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 pshufd $0x00, %xmm13, %xmm12 pshufd $0x55, %xmm13, %xmm13 pshufd $0x00, %xmm15, %xmm14 pshufd $0x55, %xmm15, %xmm15 #ifndef XCONJ xorps %xmm11, %xmm13 xorps %xmm11, %xmm15 #else xorps %xmm11, %xmm12 xorps %xmm11, %xmm14 #endif mulps %xmm8, %xmm12 mulps %xmm9, %xmm13 mulps %xmm8, %xmm14 mulps %xmm9, %xmm15 #ifndef XCONJ subps %xmm13, %xmm12 subps %xmm15, %xmm14 #else addps %xmm13, %xmm12 addps %xmm15, %xmm14 #endif pshufd $0x55, %xmm12, %xmm13 pshufd $0x00, %xmm12, %xmm12 pshufd $0x55, %xmm14, %xmm15 pshufd $0x00, %xmm14, %xmm14 #ifndef CONJ xorps %xmm11, %xmm13 xorps %xmm11, %xmm15 #else xorps %xmm11, %xmm12 xorps %xmm11, %xmm14 #endif #ifdef ALIGNED_ACCESS cmpq M, MM je .L30X movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(A2), %xmm6 movsd -32 * SIZE(Y1), %xmm0 pshufd $0xb1, %xmm4, %xmm5 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm0 movlps %xmm0, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L30X: #endif movaps -33 * SIZE(A1), %xmm4 movaps -35 * SIZE(A2), %xmm6 movaps -32 * SIZE(Y1), %xmm0 movaps -28 * SIZE(Y1), %xmm1 movaps -24 * SIZE(Y1), %xmm2 movaps -20 * SIZE(Y1), %xmm3 movq MM, I sarq $3, I jle .L305 movaps -29 * SIZE(A1), %xmm8 movaps -25 * SIZE(A1), %xmm9 movaps -21 * SIZE(A1), %xmm10 decq I jle .L304 ALIGN_3 .L303: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif movss %xmm8, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -17 * SIZE(A1), %xmm4 movss %xmm9, %xmm8 shufps $0x39, %xmm8, %xmm8 pshufd $0xb1, %xmm8, %xmm7 mulps %xmm12, %xmm8 addps %xmm8, %xmm1 movaps -31 * SIZE(A2), %xmm8 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 movss %xmm10, %xmm9 shufps $0x39, %xmm9, %xmm9 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 movaps -27 * SIZE(A2), %xmm9 movss %xmm4, %xmm10 shufps $0x39, %xmm10, %xmm10 pshufd $0xb1, %xmm10, %xmm7 mulps %xmm12, %xmm10 addps %xmm10, %xmm3 movaps -23 * SIZE(A2), %xmm10 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif movss %xmm8, %xmm6 shufps $0x93, 
%xmm8, %xmm6 pshufd $0xb1, %xmm6, %xmm5 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 movaps -19 * SIZE(A2), %xmm6 movss %xmm9, %xmm8 shufps $0x93, %xmm9, %xmm8 pshufd $0xb1, %xmm8, %xmm7 mulps %xmm14, %xmm8 addps %xmm8, %xmm1 movaps -13 * SIZE(A1), %xmm8 mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm1 movss %xmm10, %xmm9 shufps $0x93, %xmm10, %xmm9 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm14, %xmm9 addps %xmm9, %xmm2 movaps -9 * SIZE(A1), %xmm9 movss %xmm6, %xmm10 shufps $0x93, %xmm6, %xmm10 pshufd $0xb1, %xmm10, %xmm7 mulps %xmm14, %xmm10 addps %xmm10, %xmm3 movaps -5 * SIZE(A1), %xmm10 mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) #endif movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, -24 * SIZE(Y1) movaps %xmm3, -20 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 movaps -12 * SIZE(Y1), %xmm1 movaps -8 * SIZE(Y1), %xmm2 movaps -4 * SIZE(Y1), %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L303 ALIGN_3 .L304: movss %xmm8, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -17 * SIZE(A1), %xmm4 movss %xmm9, %xmm8 shufps $0x39, %xmm8, %xmm8 pshufd $0xb1, %xmm8, %xmm7 mulps %xmm12, %xmm8 addps %xmm8, %xmm1 movaps -31 * SIZE(A2), %xmm8 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 movss %xmm10, %xmm9 shufps $0x39, %xmm9, %xmm9 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 movaps -27 * SIZE(A2), %xmm9 movss %xmm4, %xmm10 shufps $0x39, %xmm10, %xmm10 pshufd $0xb1, %xmm10, %xmm7 mulps %xmm12, %xmm10 addps %xmm10, %xmm3 movaps -23 * SIZE(A2), %xmm10 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm3 movss %xmm8, %xmm6 shufps $0x93, %xmm8, %xmm6 pshufd $0xb1, %xmm6, %xmm5 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 movaps -19 * SIZE(A2), %xmm6 movss %xmm9, %xmm8 shufps $0x93, %xmm9, %xmm8 pshufd $0xb1, %xmm8, %xmm7 mulps %xmm14, %xmm8 addps %xmm8, %xmm1 mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm1 movss %xmm10, %xmm9 shufps $0x93, %xmm10, %xmm9 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm14, %xmm9 addps %xmm9, %xmm2 movss %xmm6, %xmm10 shufps $0x93, %xmm6, %xmm10 pshufd $0xb1, %xmm10, %xmm7 mulps %xmm14, %xmm10 addps %xmm10, %xmm3 mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm2 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm3 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, -24 * SIZE(Y1) movaps %xmm3, -20 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 movaps -12 * SIZE(Y1), %xmm1 movaps -8 * SIZE(Y1), %xmm2 movaps -4 * SIZE(Y1), %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 ALIGN_3 .L305: testq $4, MM je .L307 movaps -29 * SIZE(A1), %xmm8 movaps -25 * SIZE(A1), %xmm9 movaps -31 * SIZE(A2), %xmm10 movaps -27 * SIZE(A2), %xmm11 movss %xmm8, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movss %xmm9, %xmm8 shufps $0x39, %xmm8, %xmm8 pshufd $0xb1, %xmm8, %xmm7 mulps %xmm12, %xmm8 addps %xmm8, %xmm1 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 movss %xmm10, %xmm6 shufps $0x93, %xmm10, %xmm6 pshufd $0xb1, %xmm6, %xmm5 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 movss %xmm11, %xmm10 shufps $0x93, %xmm11, %xmm10 pshufd $0xb1, %xmm10, %xmm7 mulps %xmm14, %xmm10 addps %xmm10, %xmm1 mulps %xmm15, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm1 movaps %xmm0, -32 * 
SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm9, %xmm4 movaps %xmm11, %xmm6 movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, Y1 ALIGN_3 .L307: testq $2, MM je .L308 movaps -29 * SIZE(A1), %xmm8 movaps -31 * SIZE(A2), %xmm9 movss %xmm8, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movss %xmm9, %xmm6 shufps $0x93, %xmm9, %xmm6 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm0 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, %xmm0 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L308: testq $1, MM je .L309 movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(A2), %xmm6 pshufd $0xb1, %xmm4, %xmm5 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 SUBPS %xmm7, %xmm0 movlps %xmm0, -32 * SIZE(Y1) ALIGN_3 .L309: cmpq $2, N jge .L301 ALIGN_3 .L310: cmpq $1, N jl .L990 leaq 32 * SIZE(BUFFER), Y1 movq A, A1 movsd (X), %xmm13 addq INCX, X #ifdef HAVE_SSE3 movddup ALPHA, %xmm8 #else movsd ALPHA, %xmm8 unpcklpd %xmm8, %xmm8 #endif pshufd $0xb1, %xmm8, %xmm9 pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 pshufd $0x00, %xmm13, %xmm12 pshufd $0x55, %xmm13, %xmm13 #ifndef XCONJ xorps %xmm11, %xmm13 #else xorps %xmm11, %xmm12 #endif mulps %xmm8, %xmm12 mulps %xmm9, %xmm13 #ifndef XCONJ subps %xmm13, %xmm12 #else addps %xmm13, %xmm12 #endif pshufd $0x55, %xmm12, %xmm13 pshufd $0x00, %xmm12, %xmm12 #ifndef CONJ xorps %xmm11, %xmm13 #else xorps %xmm11, %xmm12 #endif #ifdef ALIGNED_ACCESS cmpq M, MM je .L31X movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(Y1), %xmm0 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 movlps %xmm0, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, Y1 ALIGN_3 .L31X: #endif movaps -33 * SIZE(A1), %xmm4 movaps -32 * SIZE(Y1), %xmm0 movaps -28 * SIZE(Y1), %xmm1 movaps -24 * SIZE(Y1), %xmm2 movaps -20 * SIZE(Y1), %xmm3 movq MM, I sarq $3, I jle .L315 movaps -29 * SIZE(A1), %xmm6 movaps -25 * SIZE(A1), %xmm8 movaps -21 * SIZE(A1), %xmm10 decq I jle .L314 ALIGN_3 .L313: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif movss %xmm6, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -17 * SIZE(A1), %xmm4 movss %xmm8, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 movaps -13 * SIZE(A1), %xmm6 movss %xmm10, %xmm8 shufps $0x39, %xmm8, %xmm8 pshufd $0xb1, %xmm8, %xmm9 mulps %xmm12, %xmm8 addps %xmm8, %xmm2 movaps -9 * SIZE(A1), %xmm8 movss %xmm4, %xmm10 shufps $0x39, %xmm10, %xmm10 pshufd $0xb1, %xmm10, %xmm11 mulps %xmm12, %xmm10 addps %xmm10, %xmm3 movaps -5 * SIZE(A1), %xmm10 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 mulps %xmm13, %xmm9 SUBPS %xmm9, %xmm2 mulps %xmm13, %xmm11 SUBPS %xmm11, %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) #endif movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, -24 * SIZE(Y1) movaps %xmm3, -20 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 movaps -12 * SIZE(Y1), %xmm1 movaps -8 * SIZE(Y1), %xmm2 movaps -4 * SIZE(Y1), %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L313 ALIGN_3 .L314: movss %xmm6, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm5 mulps 
%xmm12, %xmm4 addps %xmm4, %xmm0 movaps -17 * SIZE(A1), %xmm4 movss %xmm8, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 movss %xmm10, %xmm8 shufps $0x39, %xmm8, %xmm8 pshufd $0xb1, %xmm8, %xmm9 mulps %xmm12, %xmm8 addps %xmm8, %xmm2 movss %xmm4, %xmm10 shufps $0x39, %xmm10, %xmm10 pshufd $0xb1, %xmm10, %xmm11 mulps %xmm12, %xmm10 addps %xmm10, %xmm3 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 mulps %xmm13, %xmm9 SUBPS %xmm9, %xmm2 mulps %xmm13, %xmm11 SUBPS %xmm11, %xmm3 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, -24 * SIZE(Y1) movaps %xmm3, -20 * SIZE(Y1) movaps -16 * SIZE(Y1), %xmm0 movaps -12 * SIZE(Y1), %xmm1 movaps -8 * SIZE(Y1), %xmm2 movaps -4 * SIZE(Y1), %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, Y1 ALIGN_3 .L315: testq $4, MM je .L317 movaps -29 * SIZE(A1), %xmm6 movaps -25 * SIZE(A1), %xmm8 movss %xmm6, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movss %xmm8, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm7 mulps %xmm12, %xmm6 addps %xmm6, %xmm1 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm1 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, -28 * SIZE(Y1) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 movaps %xmm8, %xmm4 addq $8 * SIZE, A1 addq $8 * SIZE, Y1 ALIGN_3 .L317: testq $2, MM je .L318 movaps -29 * SIZE(A1), %xmm6 movss %xmm6, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 movaps %xmm0, -32 * SIZE(Y1) movaps %xmm1, %xmm0 addq $4 * SIZE, A1 addq $4 * SIZE, Y1 ALIGN_3 .L318: testq $1, MM je .L990 movsd -32 * SIZE(A1), %xmm4 pshufd $0xb1, %xmm4, %xmm5 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 SUBPS %xmm5, %xmm0 movlps %xmm0, -32 * SIZE(Y1) #endif ALIGN_3 .L990: movq Y, Y1 #ifdef ALIGNED_ACCESS cmpq M, MM je .L991 movsd (Y), %xmm0 addq INCY, Y movsd (BUFFER), %xmm1 addq $2 * SIZE, BUFFER addps %xmm1, %xmm0 movlps %xmm0, (Y1) addq INCY, Y1 ALIGN_3 .L991: #endif movq MM, %rax sarq $3, %rax jle .L994 ALIGN_3 .L992: movsd (Y), %xmm0 addq INCY, Y movhps (Y), %xmm0 addq INCY, Y movsd (Y), %xmm1 addq INCY, Y movhps (Y), %xmm1 addq INCY, Y movsd (Y), %xmm2 addq INCY, Y movhps (Y), %xmm2 addq INCY, Y movsd (Y), %xmm3 addq INCY, Y movhps (Y), %xmm3 addq INCY, Y addps 0 * SIZE(BUFFER), %xmm0 addps 4 * SIZE(BUFFER), %xmm1 addps 8 * SIZE(BUFFER), %xmm2 addps 12 * SIZE(BUFFER), %xmm3 movlps %xmm0, (Y1) addq INCY, Y1 movhps %xmm0, (Y1) addq INCY, Y1 movlps %xmm1, (Y1) addq INCY, Y1 movhps %xmm1, (Y1) addq INCY, Y1 movlps %xmm2, (Y1) addq INCY, Y1 movhps %xmm2, (Y1) addq INCY, Y1 movlps %xmm3, (Y1) addq INCY, Y1 movhps %xmm3, (Y1) addq INCY, Y1 addq $16 * SIZE, BUFFER decq %rax jg .L992 ALIGN_3 .L994: testq $7, MM jle .L999 testq $4, MM jle .L995 movsd (Y), %xmm0 addq INCY, Y movhps (Y), %xmm0 addq INCY, Y movsd (Y), %xmm1 addq INCY, Y movhps (Y), %xmm1 addq INCY, Y addps 0 * SIZE(BUFFER), %xmm0 addps 4 * SIZE(BUFFER), %xmm1 movlps %xmm0, (Y1) addq INCY, Y1 movhps %xmm0, (Y1) addq INCY, Y1 movlps %xmm1, (Y1) addq INCY, Y1 movhps %xmm1, (Y1) addq INCY, Y1 addq $8 * SIZE, BUFFER ALIGN_3 .L995: testq $2, MM jle .L996 movsd (Y), %xmm0 addq INCY, Y movhps (Y), %xmm0 addq INCY, Y addps 0 * SIZE(BUFFER), %xmm0 movlps %xmm0, (Y1) addq INCY, Y1 movhps %xmm0, (Y1) addq INCY, Y1 addq $4 * SIZE, BUFFER ALIGN_3 .L996: testq $1, MM jle .L999 movsd (Y), %xmm0 addps 0 * SIZE(BUFFER), %xmm0 movlps 
%xmm0, (Y1) ALIGN_3 .L999: movq M, I salq $ZBASE_SHIFT,I addq I,AA jmp .L0t .L999x: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/cgemv_n_4.c000066400000000000000000000367441313527062700174050ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include "common.h"

#if defined(HASWELL) || defined(ZEN)
#include "cgemv_n_microk_haswell-4.c"
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
#include "cgemv_n_microk_bulldozer-4.c"
#endif

#define NBMAX 2048

#ifndef HAVE_KERNEL_4x4

/* Reference C kernel used when no target-specific micro-kernel is defined:
   accumulates four complex columns of A, each scaled by one complex entry of x,
   into y; the preprocessor branches select the sign pattern for the CONJ/XCONJ
   variants. */
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    BLASLONG i;
    FLOAT *a0,*a1,*a2,*a3;
    a0 = ap[0];
    a1 = ap[1];
    a2 = ap[2];
    a3 = ap[3];

    for ( i=0; i< 2*n; i+=2 )
    {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        y[i] += a0[i]*x[0] - a0[i+1] * x[1];
        y[i+1] += a0[i]*x[1] + a0[i+1] * x[0];
        y[i] += a1[i]*x[2] - a1[i+1] * x[3];
        y[i+1] += a1[i]*x[3] + a1[i+1] * x[2];
        y[i] += a2[i]*x[4] - a2[i+1] * x[5];
        y[i+1] += a2[i]*x[5] + a2[i+1] * x[4];
        y[i] += a3[i]*x[6] - a3[i+1] * x[7];
        y[i+1] += a3[i]*x[7] + a3[i+1] * x[6];
#else
        y[i] += a0[i]*x[0] + a0[i+1] * x[1];
        y[i+1] += a0[i]*x[1] - a0[i+1] * x[0];
        y[i] += a1[i]*x[2] + a1[i+1] * x[3];
        y[i+1] += a1[i]*x[3] - a1[i+1] * x[2];
        y[i] += a2[i]*x[4] + a2[i+1] * x[5];
        y[i+1] += a2[i]*x[5] - a2[i+1] * x[4];
        y[i] += a3[i]*x[6] + a3[i+1] * x[7];
        y[i+1] += a3[i]*x[7] - a3[i+1] * x[6];
#endif
    }
}

#endif

#ifndef HAVE_KERNEL_4x2

/* Same as above, for two columns of A. */
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    BLASLONG i;
    FLOAT *a0,*a1;
    a0 = ap[0];
    a1 = ap[1];

    for ( i=0; i< 2*n; i+=2 )
    {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        y[i] += a0[i]*x[0] - a0[i+1] * x[1];
        y[i+1] += a0[i]*x[1] + a0[i+1] * x[0];
        y[i] += a1[i]*x[2] - a1[i+1] * x[3];
        y[i+1] += a1[i]*x[3] + a1[i+1] * x[2];
#else
        y[i] += a0[i]*x[0] + a0[i+1] * x[1];
        y[i+1] += a0[i]*x[1] - a0[i+1] * x[0];
        y[i] += a1[i]*x[2] + a1[i+1] * x[3];
        y[i+1] += a1[i]*x[3] - a1[i+1] * x[2];
#endif
    }
}

#endif

#ifndef HAVE_KERNEL_4x1

/* Same as above, for a single column of A. */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
    BLASLONG i;
    FLOAT *a0;
    a0 = ap;

    for ( i=0; i< 2*n; i+=2 )
    {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        y[i] += a0[i]*x[0] - a0[i+1] * x[1];
        y[i+1] += a0[i]*x[1] + a0[i+1] * x[0];
#else
        y[i] += a0[i]*x[0] + a0[i+1] * x[1];
        y[i+1] += a0[i]*x[1] - a0[i+1] * x[0];
#endif
    }
}

#endif

#ifndef HAVE_KERNEL_ADDY

/* add_y: accumulate the buffered intermediate result into y with stride
   inc_dest, applying the complex scalar alpha. */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline));

static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
{
    BLASLONG i;

    if ( inc_dest != 2 )
    {
        FLOAT temp_r;
        FLOAT temp_i;
        for ( i=0; i= 4 cmpq $4, N jl .L20 ALIGN_3 .L11: subq $4, N leaq 32 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA, 2), A2 leaq (A1, LDA, 4), A xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #ifdef ALIGNED_ACCESS cmpq M, MM je .L1X #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(A1), %xmm8 #ifdef movsd xorps %xmm9, %xmm9 #endif movsd -32 * SIZE(A1, LDA), %xmm9 #ifdef movsd xorps %xmm10, %xmm10 #endif movsd -32 * SIZE(A2), %xmm10 #ifdef movsd xorps %xmm11, %xmm11 #endif movsd -32 * SIZE(A2, LDA), %xmm11 #ifdef movsd xorps %xmm12, %xmm12 #endif movsd -32 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm8, %xmm14 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 pshufd $0xb1, %xmm9, %xmm15 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm12, %xmm10
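/* Four-column block (.L11): %xmm0-%xmm7 accumulate the real/imaginary partial sums of four complex column-by-x dot products, with the vector operand streamed from X1 (pointing into BUFFER); the unrolled main loop (.L13) below processes eight complex elements per iteration. */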
addps %xmm10, %xmm4 pshufd $0xb1, %xmm11, %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm6 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm7 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_3 .L1X: #endif movaps -32 * SIZE(X1), %xmm12 movaps -28 * SIZE(X1), %xmm13 #ifdef PREFETCHW PREFETCHW 7 * SIZE(Y1) #endif movq MM, I sarq $3, I jle .L15 MOVUPS_A1(-32 * SIZE, A1, %xmm8) MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm9) MOVUPS_A1(-32 * SIZE, A2, %xmm10) MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm11) decq I jle .L14 ALIGN_3 .L13: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) #endif pshufd $0xb1, %xmm8, %xmm14 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-28 * SIZE, A1, %xmm8) pshufd $0xb1, %xmm9, %xmm15 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm9) mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm12, %xmm10 addps %xmm10, %xmm4 MOVUPS_A1(-28 * SIZE, A2, %xmm10) pshufd $0xb1, %xmm11, %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm6 MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm11) mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm12, %xmm15 movaps -24 * SIZE(X1), %xmm12 SUBPS %xmm15, %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) #endif pshufd $0xb1, %xmm8, %xmm14 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-24 * SIZE, A1, %xmm8) pshufd $0xb1, %xmm9, %xmm15 mulps %xmm13, %xmm9 addps %xmm9, %xmm2 MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm9) mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm13, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm13, %xmm10 addps %xmm10, %xmm4 MOVUPS_A1(-24 * SIZE, A2, %xmm10) pshufd $0xb1, %xmm11, %xmm15 mulps %xmm13, %xmm11 addps %xmm11, %xmm6 MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm11) mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm13, %xmm15 movaps -20 * SIZE(X1), %xmm13 SUBPS %xmm15, %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) #endif pshufd $0xb1, %xmm8, %xmm14 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-20 * SIZE, A1, %xmm8) pshufd $0xb1, %xmm9, %xmm15 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm9) mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm12, %xmm10 addps %xmm10, %xmm4 MOVUPS_A1(-20 * SIZE, A2, %xmm10) pshufd $0xb1, %xmm11, %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm6 MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm11) mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm12, %xmm15 movaps -16 * SIZE(X1), %xmm12 SUBPS %xmm15, %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) #endif pshufd $0xb1, %xmm8, %xmm14 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-16 * SIZE, A1, %xmm8) pshufd $0xb1, %xmm9, %xmm15 mulps %xmm13, %xmm9 addps %xmm9, %xmm2 MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm13, %xmm15 SUBPS %xmm15, %xmm3 #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1) #endif pshufd $0xb1, %xmm10, %xmm14 mulps %xmm13, %xmm10 addps %xmm10, %xmm4 MOVUPS_A1(-16 * SIZE, A2, %xmm10) pshufd $0xb1, %xmm11, %xmm15 mulps %xmm13, %xmm11 addps %xmm11, %xmm6 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11) mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm13, %xmm15 movaps -12 * SIZE(X1), %xmm13 SUBPS %xmm15, %xmm7 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, X1 subq $1, I BRANCH jg .L13 ALIGN_3 .L14: pshufd $0xb1, %xmm8, %xmm14 mulps %xmm12, %xmm8 addps 
%xmm8, %xmm0 MOVUPS_A1(-28 * SIZE, A1, %xmm8) pshufd $0xb1, %xmm9, %xmm15 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm9) mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm12, %xmm10 addps %xmm10, %xmm4 MOVUPS_A1(-28 * SIZE, A2, %xmm10) pshufd $0xb1, %xmm11, %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm6 MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm11) mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm12, %xmm15 movaps -24 * SIZE(X1), %xmm12 SUBPS %xmm15, %xmm7 pshufd $0xb1, %xmm8, %xmm14 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-24 * SIZE, A1, %xmm8) pshufd $0xb1, %xmm9, %xmm15 mulps %xmm13, %xmm9 addps %xmm9, %xmm2 MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm9) mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm13, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm13, %xmm10 addps %xmm10, %xmm4 MOVUPS_A1(-24 * SIZE, A2, %xmm10) pshufd $0xb1, %xmm11, %xmm15 mulps %xmm13, %xmm11 addps %xmm11, %xmm6 MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm11) mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm13, %xmm15 movaps -20 * SIZE(X1), %xmm13 SUBPS %xmm15, %xmm7 pshufd $0xb1, %xmm8, %xmm14 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-20 * SIZE, A1, %xmm8) pshufd $0xb1, %xmm9, %xmm15 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm9) mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm12, %xmm10 addps %xmm10, %xmm4 MOVUPS_A1(-20 * SIZE, A2, %xmm10) pshufd $0xb1, %xmm11, %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm6 MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm11) mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm12, %xmm15 movaps -16 * SIZE(X1), %xmm12 SUBPS %xmm15, %xmm7 pshufd $0xb1, %xmm8, %xmm14 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 pshufd $0xb1, %xmm9, %xmm15 mulps %xmm13, %xmm9 addps %xmm9, %xmm2 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm13, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm13, %xmm10 addps %xmm10, %xmm4 pshufd $0xb1, %xmm11, %xmm15 mulps %xmm13, %xmm11 addps %xmm11, %xmm6 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm13, %xmm15 movaps -12 * SIZE(X1), %xmm13 SUBPS %xmm15, %xmm7 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, X1 ALIGN_3 .L15: testq $4, MM je .L17 MOVUPS_A1(-32 * SIZE, A1, %xmm8) MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm9) MOVUPS_A1(-32 * SIZE, A2, %xmm10) MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm11) pshufd $0xb1, %xmm8, %xmm14 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-28 * SIZE, A1, %xmm8) pshufd $0xb1, %xmm9, %xmm15 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm9) mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm12, %xmm10 addps %xmm10, %xmm4 MOVUPS_A1(-28 * SIZE, A2, %xmm10) pshufd $0xb1, %xmm11, %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm6 MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm11) mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm12, %xmm15 movaps -24 * SIZE(X1), %xmm12 SUBPS %xmm15, %xmm7 pshufd $0xb1, %xmm8, %xmm14 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 pshufd $0xb1, %xmm9, %xmm15 mulps %xmm13, %xmm9 addps %xmm9, %xmm2 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm13, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm13, %xmm10 addps %xmm10, %xmm4 pshufd $0xb1, %xmm11, %xmm15 mulps %xmm13, %xmm11 addps %xmm11, %xmm6 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm13, %xmm15 
movaps -20 * SIZE(X1), %xmm13 SUBPS %xmm15, %xmm7 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_3 .L17: testq $2, MM je .L18 MOVUPS_A1(-32 * SIZE, A1, %xmm8) MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm9) MOVUPS_A1(-32 * SIZE, A2, %xmm10) MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm11) pshufd $0xb1, %xmm8, %xmm14 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 pshufd $0xb1, %xmm9, %xmm15 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm12, %xmm10 addps %xmm10, %xmm4 pshufd $0xb1, %xmm11, %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm6 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm7 movaps %xmm13, %xmm12 addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L18: testq $1, MM je .L19 #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(A1), %xmm8 #ifdef movsd xorps %xmm9, %xmm9 #endif movsd -32 * SIZE(A1, LDA), %xmm9 #ifdef movsd xorps %xmm10, %xmm10 #endif movsd -32 * SIZE(A2), %xmm10 #ifdef movsd xorps %xmm11, %xmm11 #endif movsd -32 * SIZE(A2, LDA), %xmm11 pshufd $0xb1, %xmm8, %xmm14 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 pshufd $0xb1, %xmm9, %xmm15 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm12, %xmm10 addps %xmm10, %xmm4 pshufd $0xb1, %xmm11, %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm6 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm7 ALIGN_3 .L19: pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorps %xmm11, %xmm0 xorps %xmm11, %xmm2 xorps %xmm11, %xmm4 xorps %xmm11, %xmm6 #else xorps %xmm11, %xmm1 xorps %xmm11, %xmm3 xorps %xmm11, %xmm5 xorps %xmm11, %xmm7 #endif #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 haddps %xmm3, %xmm2 haddps %xmm2, %xmm0 haddps %xmm5, %xmm4 haddps %xmm7, %xmm6 haddps %xmm6, %xmm4 #else movaps %xmm0, %xmm8 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm8 movaps %xmm2, %xmm9 unpcklps %xmm3, %xmm2 unpckhps %xmm3, %xmm9 movaps %xmm4, %xmm10 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm10 movaps %xmm6, %xmm12 unpcklps %xmm7, %xmm6 unpckhps %xmm7, %xmm12 addps %xmm8, %xmm0 addps %xmm9, %xmm2 addps %xmm10, %xmm4 addps %xmm12, %xmm6 movhlps %xmm0, %xmm1 movhlps %xmm2, %xmm3 movhlps %xmm4, %xmm5 movhlps %xmm6, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm5, %xmm4 addps %xmm7, %xmm6 movlhps %xmm2, %xmm0 movlhps %xmm6, %xmm4 #endif pshufd $0xb1, %xmm0, %xmm1 pshufd $0xb1, %xmm4, %xmm5 #ifdef HAVE_SSE3 movddup ALPHA, %xmm15 #else movsd ALPHA, %xmm15 pshufd $0x44, %xmm15, %xmm15 #endif mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 mulps %xmm15, %xmm4 mulps %xmm15, %xmm5 xorps %xmm11, %xmm0 xorps %xmm11, %xmm4 #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 haddps %xmm5, %xmm4 #else movaps %xmm0, %xmm2 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm1, %xmm2 movaps %xmm4, %xmm6 shufps $0x88, %xmm5, %xmm4 shufps $0xdd, %xmm5, %xmm6 addps %xmm2, %xmm0 addps %xmm6, %xmm4 #endif movsd (Y), %xmm2 addq INCY, Y movhps (Y), %xmm2 addq INCY, Y movsd (Y), %xmm6 addq INCY, Y movhps (Y), %xmm6 addq INCY, Y shufps $0xd8, %xmm0, %xmm0 shufps $0xd8, %xmm4, %xmm4 addps %xmm2, %xmm0 addps %xmm6, %xmm4 movlps %xmm0, (Y1) addq INCY, Y1 movhps %xmm0, (Y1) addq INCY, Y1 movlps %xmm4, (Y1) addq INCY, Y1 movhps %xmm4, (Y1) addq INCY, Y1 cmpq $4, N jge .L11 ALIGN_3 .L20: #endif cmpq $2, N jl .L30 #if GEMV_UNROLL == 2 ALIGN_3 .L21: #endif subq $2, N leaq 32 * 
SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA), A2 leaq (A1, LDA, 2), A xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifdef ALIGNED_ACCESS cmpq M, MM je .L2X #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(A1), %xmm8 #ifdef movsd xorps %xmm9, %xmm9 #endif movsd -32 * SIZE(A2), %xmm9 #ifdef movsd xorps %xmm12, %xmm12 #endif movsd -32 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 mulps %xmm12, %xmm5 SUBPS %xmm5, %xmm3 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_3 .L2X: #endif movaps -32 * SIZE(X1), %xmm12 movaps -28 * SIZE(X1), %xmm13 #if (GEMV_UNROLL == 2) && defined(PREFETCHW) PREFETCHW 3 * SIZE(Y1) #endif movq MM, I sarq $3, I jle .L25 MOVUPS_A1(-32 * SIZE, A1, %xmm8) MOVUPS_A1(-32 * SIZE, A2, %xmm9) MOVUPS_A1(-28 * SIZE, A1, %xmm10) MOVUPS_A1(-28 * SIZE, A2, %xmm11) decq I jle .L24 ALIGN_3 .L23: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-24 * SIZE, A1, %xmm8) mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 MOVUPS_A1(-24 * SIZE, A2, %xmm9) mulps %xmm12, %xmm5 SUBPS %xmm5, %xmm3 movaps -24 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm10, %xmm6 mulps %xmm13, %xmm10 addps %xmm10, %xmm0 MOVUPS_A1(-20 * SIZE, A1, %xmm10) mulps %xmm13, %xmm6 SUBPS %xmm6, %xmm1 pshufd $0xb1, %xmm11, %xmm7 mulps %xmm13, %xmm11 addps %xmm11, %xmm2 MOVUPS_A1(-20 * SIZE, A2, %xmm11) mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm3 movaps -20 * SIZE(X1), %xmm13 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-16 * SIZE, A1, %xmm8) mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 MOVUPS_A1(-16 * SIZE, A2, %xmm9) mulps %xmm12, %xmm5 SUBPS %xmm5, %xmm3 movaps -16 * SIZE(X1), %xmm12 #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) #endif pshufd $0xb1, %xmm10, %xmm6 mulps %xmm13, %xmm10 addps %xmm10, %xmm0 MOVUPS_A1(-12 * SIZE, A1, %xmm10) mulps %xmm13, %xmm6 SUBPS %xmm6, %xmm1 pshufd $0xb1, %xmm11, %xmm7 mulps %xmm13, %xmm11 addps %xmm11, %xmm2 MOVUPS_A1(-12 * SIZE, A2, %xmm11) mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm3 movaps -12 * SIZE(X1), %xmm13 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, X1 subq $1, I BRANCH jg .L23 ALIGN_3 .L24: pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-24 * SIZE, A1, %xmm8) mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 MOVUPS_A1(-24 * SIZE, A2, %xmm9) mulps %xmm12, %xmm5 SUBPS %xmm5, %xmm3 movaps -24 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm10, %xmm6 mulps %xmm13, %xmm10 addps %xmm10, %xmm0 MOVUPS_A1(-20 * SIZE, A1, %xmm10) mulps %xmm13, %xmm6 SUBPS %xmm6, %xmm1 pshufd $0xb1, %xmm11, %xmm7 mulps %xmm13, %xmm11 addps %xmm11, %xmm2 MOVUPS_A1(-20 * SIZE, A2, %xmm11) mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm3 movaps -20 * SIZE(X1), %xmm13 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 mulps %xmm12, %xmm5 SUBPS %xmm5, %xmm3 movaps -16 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm10, %xmm6 mulps %xmm13, %xmm10 addps %xmm10, %xmm0 mulps %xmm13, %xmm6 SUBPS %xmm6, %xmm1 pshufd $0xb1, 
%xmm11, %xmm7 mulps %xmm13, %xmm11 addps %xmm11, %xmm2 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm3 movaps -12 * SIZE(X1), %xmm13 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, X1 ALIGN_3 .L25: testq $4, MM je .L27 MOVUPS_A1(-32 * SIZE, A1, %xmm8) MOVUPS_A1(-32 * SIZE, A2, %xmm9) MOVUPS_A1(-28 * SIZE, A1, %xmm10) MOVUPS_A1(-28 * SIZE, A2, %xmm11) pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 mulps %xmm12, %xmm5 SUBPS %xmm5, %xmm3 movaps -24 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm10, %xmm6 mulps %xmm13, %xmm10 addps %xmm10, %xmm0 mulps %xmm13, %xmm6 SUBPS %xmm6, %xmm1 pshufd $0xb1, %xmm11, %xmm7 mulps %xmm13, %xmm11 addps %xmm11, %xmm2 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm3 movaps -20 * SIZE(X1), %xmm13 addq $8 * SIZE, A1 addq $8 * SIZE, A2 ALIGN_3 .L27: testq $2, MM je .L28 MOVUPS_A1(-32 * SIZE, A1, %xmm8) MOVUPS_A1(-32 * SIZE, A2, %xmm9) pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 mulps %xmm12, %xmm5 SUBPS %xmm5, %xmm3 movaps %xmm13, %xmm12 addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L28: testq $1, MM je .L29 #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(A1), %xmm8 #ifdef movsd xorps %xmm9, %xmm9 #endif movsd -32 * SIZE(A2), %xmm9 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 mulps %xmm12, %xmm5 SUBPS %xmm5, %xmm3 ALIGN_3 .L29: pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorps %xmm5, %xmm0 xorps %xmm5, %xmm2 #else xorps %xmm5, %xmm1 xorps %xmm5, %xmm3 #endif #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 haddps %xmm3, %xmm2 haddps %xmm2, %xmm0 #else movaps %xmm0, %xmm8 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm8 movaps %xmm2, %xmm4 unpcklps %xmm3, %xmm2 unpckhps %xmm3, %xmm4 addps %xmm8, %xmm0 addps %xmm4, %xmm2 movhlps %xmm0, %xmm1 movhlps %xmm2, %xmm3 addps %xmm1, %xmm0 addps %xmm3, %xmm2 movlhps %xmm2, %xmm0 #endif pshufd $0xb1, %xmm0, %xmm1 #ifdef HAVE_SSE3 movddup ALPHA, %xmm15 #else movsd ALPHA, %xmm15 pshufd $0x44, %xmm15, %xmm15 #endif mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 xorps %xmm5, %xmm0 #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 #else movaps %xmm0, %xmm2 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm1, %xmm2 addps %xmm2, %xmm0 #endif movsd (Y), %xmm12 addq INCY, Y movhps (Y), %xmm12 addq INCY, Y shufps $0xd8, %xmm0, %xmm0 addps %xmm12, %xmm0 movlps %xmm0, (Y1) addq INCY, Y1 movhps %xmm0, (Y1) addq INCY, Y1 #if GEMV_UNROLL == 2 cmpq $2, N jge .L21 #endif ALIGN_3 .L30: cmpq $1, N jl .L999 leaq 32 * SIZE(BUFFER), X1 movq A, A1 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 #ifdef ALIGNED_ACCESS cmpq M, MM je .L3X #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(A1), %xmm8 #ifdef movsd xorps %xmm12, %xmm12 #endif movsd -32 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 addq $2 * SIZE, A1 addq $2 * SIZE, X1 ALIGN_3 .L3X: #endif movaps -32 * SIZE(X1), %xmm12 movaps -28 * SIZE(X1), %xmm13 movq MM, I sarq $3, I jle .L35 MOVUPS_A1(-32 * SIZE, A1, %xmm8) MOVUPS_A1(-28 * SIZE, A1, %xmm10) decq I jle .L34 ALIGN_3 .L33: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-24 * 
SIZE, A1, %xmm8) mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 movaps -24 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm10, %xmm6 mulps %xmm13, %xmm10 addps %xmm10, %xmm0 MOVUPS_A1(-20 * SIZE, A1, %xmm10) mulps %xmm13, %xmm6 SUBPS %xmm6, %xmm1 movaps -20 * SIZE(X1), %xmm13 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-16 * SIZE, A1, %xmm8) mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 movaps -16 * SIZE(X1), %xmm12 #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) #endif pshufd $0xb1, %xmm10, %xmm6 mulps %xmm13, %xmm10 addps %xmm10, %xmm0 MOVUPS_A1(-12 * SIZE, A1, %xmm10) mulps %xmm13, %xmm6 SUBPS %xmm6, %xmm1 movaps -12 * SIZE(X1), %xmm13 subq $-16 * SIZE, A1 subq $-16 * SIZE, X1 subq $1, I BRANCH jg .L33 ALIGN_3 .L34: pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-24 * SIZE, A1, %xmm8) mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 movaps -24 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm10, %xmm6 mulps %xmm13, %xmm10 addps %xmm10, %xmm0 MOVUPS_A1(-20 * SIZE, A1, %xmm10) mulps %xmm13, %xmm6 SUBPS %xmm6, %xmm1 movaps -20 * SIZE(X1), %xmm13 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 movaps -16 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm10, %xmm6 mulps %xmm13, %xmm10 addps %xmm10, %xmm0 mulps %xmm13, %xmm6 SUBPS %xmm6, %xmm1 movaps -12 * SIZE(X1), %xmm13 subq $-16 * SIZE, A1 subq $-16 * SIZE, X1 ALIGN_3 .L35: testq $4, MM je .L37 MOVUPS_A1(-32 * SIZE, A1, %xmm8) MOVUPS_A1(-28 * SIZE, A1, %xmm10) pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 movaps -24 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm10, %xmm6 mulps %xmm13, %xmm10 addps %xmm10, %xmm0 mulps %xmm13, %xmm6 SUBPS %xmm6, %xmm1 movaps -20 * SIZE(X1), %xmm13 addq $8 * SIZE, A1 ALIGN_3 .L37: testq $2, MM je .L38 MOVUPS_A1(-32 * SIZE, A1, %xmm8) pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 movaps %xmm13, %xmm12 addq $4 * SIZE, A1 ALIGN_3 .L38: testq $1, MM je .L39 #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(A1), %xmm8 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 ALIGN_3 .L39: pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorps %xmm5, %xmm0 #else xorps %xmm5, %xmm1 #endif #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 haddps %xmm0, %xmm0 #else movaps %xmm0, %xmm8 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm8 addps %xmm8, %xmm0 movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 #endif pshufd $0xb1, %xmm0, %xmm1 #ifdef HAVE_SSE3 movddup ALPHA, %xmm15 #else movsd ALPHA, %xmm15 pshufd $0x44, %xmm15, %xmm15 #endif mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 xorps %xmm5, %xmm0 #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 #else movaps %xmm0, %xmm2 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm1, %xmm2 addps %xmm2, %xmm0 #endif movsd (Y), %xmm12 addq INCY, Y shufps $0xd8, %xmm0, %xmm0 addps %xmm12, %xmm0 movlps %xmm0, (Y1) addq INCY, Y1 #ifdef ALIGNED_ACCESS jmp .L999 ALIGN_3 .L100: #if GEMV_UNROLL >= 4 cmpq $4, N jl .L110 ALIGN_3 .L101: subq $4, N leaq 32 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA, 2), A2 leaq (A1, LDA, 4), A xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #ifdef ALIGNED_ACCESS cmpq M, MM je .L10X #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(A1), %xmm8 #ifdef movsd xorps %xmm9, %xmm9 #endif movsd -32 * SIZE(A1, 
LDA), %xmm9 #ifdef movsd xorps %xmm10, %xmm10 #endif movsd -32 * SIZE(A2), %xmm10 #ifdef movsd xorps %xmm11, %xmm11 #endif movsd -32 * SIZE(A2, LDA), %xmm11 #ifdef movsd xorps %xmm12, %xmm12 #endif movsd -32 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm8, %xmm14 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 pshufd $0xb1, %xmm9, %xmm15 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm12, %xmm10 addps %xmm10, %xmm4 pshufd $0xb1, %xmm11, %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm6 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm7 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_3 .L10X: #endif movaps -32 * SIZE(X1), %xmm12 movaps -28 * SIZE(X1), %xmm13 #ifdef PREFETCHW PREFETCHW 7 * SIZE(Y1) #endif movq MM, I sarq $3, I jle .L105 movaps -32 * SIZE(A1), %xmm8 movsd -32 * SIZE(A1, LDA), %xmm9 movhps -30 * SIZE(A1, LDA), %xmm9 movaps -32 * SIZE(A2), %xmm10 movsd -32 * SIZE(A2, LDA), %xmm11 movhps -30 * SIZE(A2, LDA), %xmm11 decq I jle .L104 ALIGN_3 .L103: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) #endif pshufd $0xb1, %xmm8, %xmm14 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 movaps -28 * SIZE(A1), %xmm8 pshufd $0xb1, %xmm9, %xmm15 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 movsd -28 * SIZE(A1, LDA), %xmm9 movhps -26 * SIZE(A1, LDA), %xmm9 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm12, %xmm10 addps %xmm10, %xmm4 movaps -28 * SIZE(A2), %xmm10 pshufd $0xb1, %xmm11, %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm6 movsd -28 * SIZE(A2, LDA), %xmm11 movhps -26 * SIZE(A2, LDA), %xmm11 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm12, %xmm15 movaps -24 * SIZE(X1), %xmm12 SUBPS %xmm15, %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) #endif pshufd $0xb1, %xmm8, %xmm14 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -24 * SIZE(A1), %xmm8 pshufd $0xb1, %xmm9, %xmm15 mulps %xmm13, %xmm9 addps %xmm9, %xmm2 movsd -24 * SIZE(A1, LDA), %xmm9 movhps -22 * SIZE(A1, LDA), %xmm9 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm13, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm13, %xmm10 addps %xmm10, %xmm4 movaps -24 * SIZE(A2), %xmm10 pshufd $0xb1, %xmm11, %xmm15 mulps %xmm13, %xmm11 addps %xmm11, %xmm6 movsd -24 * SIZE(A2, LDA), %xmm11 movhps -22 * SIZE(A2, LDA), %xmm11 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm13, %xmm15 movaps -20 * SIZE(X1), %xmm13 SUBPS %xmm15, %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) #endif pshufd $0xb1, %xmm8, %xmm14 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 movaps -20 * SIZE(A1), %xmm8 pshufd $0xb1, %xmm9, %xmm15 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 movsd -20 * SIZE(A1, LDA), %xmm9 movhps -18 * SIZE(A1, LDA), %xmm9 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm12, %xmm10 addps %xmm10, %xmm4 movaps -20 * SIZE(A2), %xmm10 pshufd $0xb1, %xmm11, %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm6 movsd -20 * SIZE(A2, LDA), %xmm11 movhps -18 * SIZE(A2, LDA), %xmm11 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm12, %xmm15 movaps -16 * SIZE(X1), %xmm12 SUBPS %xmm15, %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) #endif pshufd $0xb1, %xmm8, %xmm14 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -16 * SIZE(A1), %xmm8 pshufd $0xb1, %xmm9, %xmm15 mulps %xmm13, %xmm9 addps %xmm9, %xmm2 movsd 
-16 * SIZE(A1, LDA), %xmm9 movhps -14 * SIZE(A1, LDA), %xmm9 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm13, %xmm15 SUBPS %xmm15, %xmm3 #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1) #endif pshufd $0xb1, %xmm10, %xmm14 mulps %xmm13, %xmm10 addps %xmm10, %xmm4 movaps -16 * SIZE(A2), %xmm10 pshufd $0xb1, %xmm11, %xmm15 mulps %xmm13, %xmm11 addps %xmm11, %xmm6 movsd -16 * SIZE(A2, LDA), %xmm11 movhps -14 * SIZE(A2, LDA), %xmm11 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm13, %xmm15 movaps -12 * SIZE(X1), %xmm13 SUBPS %xmm15, %xmm7 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, X1 subq $1, I BRANCH jg .L103 ALIGN_3 .L104: pshufd $0xb1, %xmm8, %xmm14 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 movaps -28 * SIZE(A1), %xmm8 pshufd $0xb1, %xmm9, %xmm15 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 movsd -28 * SIZE(A1, LDA), %xmm9 movhps -26 * SIZE(A1, LDA), %xmm9 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm12, %xmm10 addps %xmm10, %xmm4 movaps -28 * SIZE(A2), %xmm10 pshufd $0xb1, %xmm11, %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm6 movsd -28 * SIZE(A2, LDA), %xmm11 movhps -26 * SIZE(A2, LDA), %xmm11 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm12, %xmm15 movaps -24 * SIZE(X1), %xmm12 SUBPS %xmm15, %xmm7 pshufd $0xb1, %xmm8, %xmm14 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -24 * SIZE(A1), %xmm8 pshufd $0xb1, %xmm9, %xmm15 mulps %xmm13, %xmm9 addps %xmm9, %xmm2 movsd -24 * SIZE(A1, LDA), %xmm9 movhps -22 * SIZE(A1, LDA), %xmm9 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm13, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm13, %xmm10 addps %xmm10, %xmm4 movaps -24 * SIZE(A2), %xmm10 pshufd $0xb1, %xmm11, %xmm15 mulps %xmm13, %xmm11 addps %xmm11, %xmm6 movsd -24 * SIZE(A2, LDA), %xmm11 movhps -22 * SIZE(A2, LDA), %xmm11 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm13, %xmm15 movaps -20 * SIZE(X1), %xmm13 SUBPS %xmm15, %xmm7 pshufd $0xb1, %xmm8, %xmm14 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 movaps -20 * SIZE(A1), %xmm8 pshufd $0xb1, %xmm9, %xmm15 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 movsd -20 * SIZE(A1, LDA), %xmm9 movhps -18 * SIZE(A1, LDA), %xmm9 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm12, %xmm10 addps %xmm10, %xmm4 movaps -20 * SIZE(A2), %xmm10 pshufd $0xb1, %xmm11, %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm6 movsd -20 * SIZE(A2, LDA), %xmm11 movhps -18 * SIZE(A2, LDA), %xmm11 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm12, %xmm15 movaps -16 * SIZE(X1), %xmm12 SUBPS %xmm15, %xmm7 pshufd $0xb1, %xmm8, %xmm14 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 pshufd $0xb1, %xmm9, %xmm15 mulps %xmm13, %xmm9 addps %xmm9, %xmm2 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm13, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm13, %xmm10 addps %xmm10, %xmm4 pshufd $0xb1, %xmm11, %xmm15 mulps %xmm13, %xmm11 addps %xmm11, %xmm6 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm13, %xmm15 movaps -12 * SIZE(X1), %xmm13 SUBPS %xmm15, %xmm7 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, X1 ALIGN_3 .L105: testq $4, MM je .L107 movaps -32 * SIZE(A1), %xmm8 movsd -32 * SIZE(A1, LDA), %xmm9 movhps -30 * SIZE(A1, LDA), %xmm9 movaps -32 * SIZE(A2), %xmm10 movsd -32 * SIZE(A2, LDA), %xmm11 movhps -30 * SIZE(A2, LDA), %xmm11 pshufd $0xb1, %xmm8, %xmm14 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 movaps -28 * SIZE(A1), %xmm8 pshufd $0xb1, %xmm9, %xmm15 mulps 
%xmm12, %xmm9 addps %xmm9, %xmm2 movsd -28 * SIZE(A1, LDA), %xmm9 movhps -26 * SIZE(A1, LDA), %xmm9 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm12, %xmm10 addps %xmm10, %xmm4 movaps -28 * SIZE(A2), %xmm10 pshufd $0xb1, %xmm11, %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm6 movsd -28 * SIZE(A2, LDA), %xmm11 movhps -26 * SIZE(A2, LDA), %xmm11 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm12, %xmm15 movaps -24 * SIZE(X1), %xmm12 SUBPS %xmm15, %xmm7 pshufd $0xb1, %xmm8, %xmm14 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 pshufd $0xb1, %xmm9, %xmm15 mulps %xmm13, %xmm9 addps %xmm9, %xmm2 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm13, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm13, %xmm10 addps %xmm10, %xmm4 pshufd $0xb1, %xmm11, %xmm15 mulps %xmm13, %xmm11 addps %xmm11, %xmm6 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm13, %xmm15 movaps -20 * SIZE(X1), %xmm13 SUBPS %xmm15, %xmm7 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_3 .L107: testq $2, MM je .L108 movaps -32 * SIZE(A1), %xmm8 movsd -32 * SIZE(A1, LDA), %xmm9 movhps -30 * SIZE(A1, LDA), %xmm9 movaps -32 * SIZE(A2), %xmm10 movsd -32 * SIZE(A2, LDA), %xmm11 movhps -30 * SIZE(A2, LDA), %xmm11 pshufd $0xb1, %xmm8, %xmm14 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 pshufd $0xb1, %xmm9, %xmm15 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm12, %xmm10 addps %xmm10, %xmm4 pshufd $0xb1, %xmm11, %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm6 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm7 movaps %xmm13, %xmm12 addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L108: testq $1, MM je .L109 #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(A1), %xmm8 #ifdef movsd xorps %xmm9, %xmm9 #endif movsd -32 * SIZE(A1, LDA), %xmm9 #ifdef movsd xorps %xmm10, %xmm10 #endif movsd -32 * SIZE(A2), %xmm10 #ifdef movsd xorps %xmm11, %xmm11 #endif movsd -32 * SIZE(A2, LDA), %xmm11 pshufd $0xb1, %xmm8, %xmm14 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 pshufd $0xb1, %xmm9, %xmm15 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm3 pshufd $0xb1, %xmm10, %xmm14 mulps %xmm12, %xmm10 addps %xmm10, %xmm4 pshufd $0xb1, %xmm11, %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm6 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm5 mulps %xmm12, %xmm15 SUBPS %xmm15, %xmm7 ALIGN_3 .L109: pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorps %xmm11, %xmm0 xorps %xmm11, %xmm2 xorps %xmm11, %xmm4 xorps %xmm11, %xmm6 #else xorps %xmm11, %xmm1 xorps %xmm11, %xmm3 xorps %xmm11, %xmm5 xorps %xmm11, %xmm7 #endif #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 haddps %xmm3, %xmm2 haddps %xmm2, %xmm0 haddps %xmm5, %xmm4 haddps %xmm7, %xmm6 haddps %xmm6, %xmm4 #else movaps %xmm0, %xmm8 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm8 movaps %xmm2, %xmm9 unpcklps %xmm3, %xmm2 unpckhps %xmm3, %xmm9 movaps %xmm4, %xmm10 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm10 movaps %xmm6, %xmm11 unpcklps %xmm7, %xmm6 unpckhps %xmm7, %xmm11 addps %xmm8, %xmm0 addps %xmm9, %xmm2 addps %xmm10, %xmm4 addps %xmm11, %xmm6 movhlps %xmm0, %xmm1 movhlps %xmm2, %xmm3 movhlps %xmm4, %xmm5 movhlps %xmm6, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm5, %xmm4 addps %xmm7, %xmm6 movlhps %xmm2, %xmm0 movlhps %xmm6, %xmm4 #endif pshufd 
$0xb1, %xmm0, %xmm1 pshufd $0xb1, %xmm4, %xmm5 #ifdef HAVE_SSE3 movddup ALPHA, %xmm15 #else movsd ALPHA, %xmm15 pshufd $0x44, %xmm15, %xmm15 #endif mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 mulps %xmm15, %xmm4 mulps %xmm15, %xmm5 xorps %xmm11, %xmm0 xorps %xmm11, %xmm4 #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 haddps %xmm5, %xmm4 #else movaps %xmm0, %xmm2 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm1, %xmm2 movaps %xmm4, %xmm6 shufps $0x88, %xmm5, %xmm4 shufps $0xdd, %xmm5, %xmm6 addps %xmm2, %xmm0 addps %xmm6, %xmm4 #endif movsd (Y), %xmm2 addq INCY, Y movhps (Y), %xmm2 addq INCY, Y movsd (Y), %xmm6 addq INCY, Y movhps (Y), %xmm6 addq INCY, Y shufps $0xd8, %xmm0, %xmm0 shufps $0xd8, %xmm4, %xmm4 addps %xmm2, %xmm0 addps %xmm6, %xmm4 movlps %xmm0, (Y1) addq INCY, Y1 movhps %xmm0, (Y1) addq INCY, Y1 movlps %xmm4, (Y1) addq INCY, Y1 movhps %xmm4, (Y1) addq INCY, Y1 cmpq $4, N jge .L101 ALIGN_3 .L110: #endif cmpq $2, N jl .L120 #if GEMV_UNROLL == 2 ALIGN_3 .L111: #endif subq $2, N leaq 32 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA), A2 leaq (A1, LDA, 2), A xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifdef ALIGNED_ACCESS cmpq M, MM je .L11X #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(A1), %xmm8 #ifdef movsd xorps %xmm9, %xmm9 #endif movsd -32 * SIZE(A2), %xmm9 #ifdef movsd xorps %xmm12, %xmm12 #endif movsd -32 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 mulps %xmm12, %xmm5 SUBPS %xmm5, %xmm3 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_3 .L11X: #endif movaps -32 * SIZE(X1), %xmm12 movaps -28 * SIZE(X1), %xmm13 #if (GEMV_UNROLL == 2) && defined(PREFETCHW) PREFETCHW 3 * SIZE(Y1) #endif movq MM, I sarq $3, I jle .L115 movaps -32 * SIZE(A1), %xmm8 movsd -32 * SIZE(A2), %xmm9 movhps -30 * SIZE(A2), %xmm9 movaps -28 * SIZE(A1), %xmm10 movsd -28 * SIZE(A2), %xmm11 movhps -26 * SIZE(A2), %xmm11 decq I jle .L114 ALIGN_3 .L113: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 movaps -24 * SIZE(A1), %xmm8 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 movsd -24 * SIZE(A2), %xmm9 movhps -22 * SIZE(A2), %xmm9 mulps %xmm12, %xmm5 SUBPS %xmm5, %xmm3 movaps -24 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm10, %xmm6 mulps %xmm13, %xmm10 addps %xmm10, %xmm0 movaps -20 * SIZE(A1), %xmm10 mulps %xmm13, %xmm6 SUBPS %xmm6, %xmm1 pshufd $0xb1, %xmm11, %xmm7 mulps %xmm13, %xmm11 addps %xmm11, %xmm2 movsd -20 * SIZE(A2), %xmm11 movhps -18 * SIZE(A2), %xmm11 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm3 movaps -20 * SIZE(X1), %xmm13 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 movaps -16 * SIZE(A1), %xmm8 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 movsd -16 * SIZE(A2), %xmm9 movhps -14 * SIZE(A2), %xmm9 mulps %xmm12, %xmm5 SUBPS %xmm5, %xmm3 movaps -16 * SIZE(X1), %xmm12 #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) #endif pshufd $0xb1, %xmm10, %xmm6 mulps %xmm13, %xmm10 addps %xmm10, %xmm0 movaps -12 * SIZE(A1), %xmm10 mulps %xmm13, %xmm6 SUBPS %xmm6, %xmm1 pshufd $0xb1, %xmm11, %xmm7 mulps %xmm13, %xmm11 addps %xmm11, %xmm2 movsd -12 * SIZE(A2), %xmm11 movhps -10 * SIZE(A2), %xmm11 mulps %xmm13, %xmm7 SUBPS 
%xmm7, %xmm3 movaps -12 * SIZE(X1), %xmm13 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, X1 subq $1, I BRANCH jg .L113 ALIGN_3 .L114: pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 movaps -24 * SIZE(A1), %xmm8 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 movsd -24 * SIZE(A2), %xmm9 movhps -22 * SIZE(A2), %xmm9 mulps %xmm12, %xmm5 SUBPS %xmm5, %xmm3 movaps -24 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm10, %xmm6 mulps %xmm13, %xmm10 addps %xmm10, %xmm0 movaps -20 * SIZE(A1), %xmm10 mulps %xmm13, %xmm6 SUBPS %xmm6, %xmm1 pshufd $0xb1, %xmm11, %xmm7 mulps %xmm13, %xmm11 addps %xmm11, %xmm2 movsd -20 * SIZE(A2), %xmm11 movhps -18 * SIZE(A2), %xmm11 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm3 movaps -20 * SIZE(X1), %xmm13 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 mulps %xmm12, %xmm5 SUBPS %xmm5, %xmm3 movaps -16 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm10, %xmm6 mulps %xmm13, %xmm10 addps %xmm10, %xmm0 mulps %xmm13, %xmm6 SUBPS %xmm6, %xmm1 pshufd $0xb1, %xmm11, %xmm7 mulps %xmm13, %xmm11 addps %xmm11, %xmm2 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm3 movaps -12 * SIZE(X1), %xmm13 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, X1 ALIGN_3 .L115: testq $4, MM je .L117 movaps -32 * SIZE(A1), %xmm8 movsd -32 * SIZE(A2), %xmm9 movhps -30 * SIZE(A2), %xmm9 movaps -28 * SIZE(A1), %xmm10 movsd -28 * SIZE(A2), %xmm11 movhps -26 * SIZE(A2), %xmm11 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 mulps %xmm12, %xmm5 SUBPS %xmm5, %xmm3 movaps -24 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm10, %xmm6 mulps %xmm13, %xmm10 addps %xmm10, %xmm0 mulps %xmm13, %xmm6 SUBPS %xmm6, %xmm1 pshufd $0xb1, %xmm11, %xmm7 mulps %xmm13, %xmm11 addps %xmm11, %xmm2 mulps %xmm13, %xmm7 SUBPS %xmm7, %xmm3 movaps -20 * SIZE(X1), %xmm13 addq $8 * SIZE, A1 addq $8 * SIZE, A2 ALIGN_3 .L117: testq $2, MM je .L118 movaps -32 * SIZE(A1), %xmm8 movsd -32 * SIZE(A2), %xmm9 movhps -30 * SIZE(A2), %xmm9 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 mulps %xmm12, %xmm5 SUBPS %xmm5, %xmm3 movaps %xmm13, %xmm12 addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L118: testq $1, MM je .L119 #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(A1), %xmm8 #ifdef movsd xorps %xmm9, %xmm9 #endif movsd -32 * SIZE(A2), %xmm9 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 mulps %xmm12, %xmm5 SUBPS %xmm5, %xmm3 ALIGN_3 .L119: pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorps %xmm5, %xmm0 xorps %xmm5, %xmm2 #else xorps %xmm5, %xmm1 xorps %xmm5, %xmm3 #endif #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 haddps %xmm3, %xmm2 haddps %xmm2, %xmm0 #else movaps %xmm0, %xmm8 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm8 movaps %xmm2, %xmm4 unpcklps %xmm3, %xmm2 unpckhps %xmm3, %xmm4 addps %xmm8, %xmm0 addps %xmm4, %xmm2 movhlps %xmm0, %xmm1 movhlps %xmm2, %xmm3 addps %xmm1, %xmm0 addps %xmm3, %xmm2 movlhps %xmm2, %xmm0 #endif pshufd $0xb1, %xmm0, %xmm1 #ifdef HAVE_SSE3 movddup ALPHA, %xmm15 #else movsd ALPHA, %xmm15 pshufd $0x44, 
%xmm15, %xmm15 #endif mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 xorps %xmm5, %xmm0 #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 #else movaps %xmm0, %xmm2 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm1, %xmm2 addps %xmm2, %xmm0 #endif movsd (Y), %xmm12 addq INCY, Y movhps (Y), %xmm12 addq INCY, Y shufps $0xd8, %xmm0, %xmm0 addps %xmm12, %xmm0 movlps %xmm0, (Y1) addq INCY, Y1 movhps %xmm0, (Y1) addq INCY, Y1 #if GEMV_UNROLL == 2 cmpq $2, N jge .L111 #endif ALIGN_3 .L120: cmpq $1, N jl .L999 leaq 32 * SIZE(BUFFER), X1 movq A, A1 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 #ifdef ALIGNED_ACCESS cmpq M, MM je .L12X #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(A1), %xmm8 #ifdef movsd xorps %xmm12, %xmm12 #endif movsd -32 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 addq $2 * SIZE, A1 addq $2 * SIZE, X1 ALIGN_3 .L12X: #endif movaps -32 * SIZE(X1), %xmm12 movaps -28 * SIZE(X1), %xmm13 movq MM, I sarq $3, I jle .L125 MOVUPS_A1(-32 * SIZE, A1, %xmm8) MOVUPS_A1(-28 * SIZE, A1, %xmm10) decq I jle .L124 ALIGN_3 .L123: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-24 * SIZE, A1, %xmm8) mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 movaps -24 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm10, %xmm6 mulps %xmm13, %xmm10 addps %xmm10, %xmm0 MOVUPS_A1(-20 * SIZE, A1, %xmm10) mulps %xmm13, %xmm6 SUBPS %xmm6, %xmm1 movaps -20 * SIZE(X1), %xmm13 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-16 * SIZE, A1, %xmm8) mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 movaps -16 * SIZE(X1), %xmm12 #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) #endif pshufd $0xb1, %xmm10, %xmm6 mulps %xmm13, %xmm10 addps %xmm10, %xmm0 MOVUPS_A1(-12 * SIZE, A1, %xmm10) mulps %xmm13, %xmm6 SUBPS %xmm6, %xmm1 movaps -12 * SIZE(X1), %xmm13 subq $-16 * SIZE, A1 subq $-16 * SIZE, X1 subq $1, I BRANCH jg .L123 ALIGN_3 .L124: pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-24 * SIZE, A1, %xmm8) mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 movaps -24 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm10, %xmm6 mulps %xmm13, %xmm10 addps %xmm10, %xmm0 MOVUPS_A1(-20 * SIZE, A1, %xmm10) mulps %xmm13, %xmm6 SUBPS %xmm6, %xmm1 movaps -20 * SIZE(X1), %xmm13 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 movaps -16 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm10, %xmm6 mulps %xmm13, %xmm10 addps %xmm10, %xmm0 mulps %xmm13, %xmm6 SUBPS %xmm6, %xmm1 movaps -12 * SIZE(X1), %xmm13 subq $-16 * SIZE, A1 subq $-16 * SIZE, X1 ALIGN_3 .L125: testq $4, MM je .L127 MOVUPS_A1(-32 * SIZE, A1, %xmm8) MOVUPS_A1(-28 * SIZE, A1, %xmm10) pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 movaps -24 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm10, %xmm6 mulps %xmm13, %xmm10 addps %xmm10, %xmm0 mulps %xmm13, %xmm6 SUBPS %xmm6, %xmm1 movaps -20 * SIZE(X1), %xmm13 addq $8 * SIZE, A1 ALIGN_3 .L127: testq $2, MM je .L128 MOVUPS_A1(-32 * SIZE, A1, %xmm8) pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 movaps %xmm13, %xmm12 addq $4 * SIZE, A1 ALIGN_3 .L128: testq $1, MM je .L129 #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(A1), %xmm8 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 ALIGN_3 .L129: pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 #if (!defined(CONJ) 
&& !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorps %xmm5, %xmm0 #else xorps %xmm5, %xmm1 #endif #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 haddps %xmm0, %xmm0 #else movaps %xmm0, %xmm8 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm8 addps %xmm8, %xmm0 movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 #endif pshufd $0xb1, %xmm0, %xmm1 #ifdef HAVE_SSE3 movddup ALPHA, %xmm15 #else movsd ALPHA, %xmm15 pshufd $0x44, %xmm15, %xmm15 #endif mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 xorps %xmm5, %xmm0 #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 #else movaps %xmm0, %xmm2 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm1, %xmm2 addps %xmm2, %xmm0 #endif movsd (Y), %xmm12 addq INCY, Y shufps $0xd8, %xmm0, %xmm0 addps %xmm12, %xmm0 movlps %xmm0, (Y1) addq INCY, Y1 jmp .L999 ALIGN_3 .L200: testq $2 * SIZE, LDA jne .L300 cmpq $2, N jl .L210 ALIGN_3 .L201: subq $2, N leaq 32 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA), A2 leaq (A1, LDA, 2), A xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifdef ALIGNED_ACCESS cmpq M, MM je .L20X #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(A1), %xmm8 #ifdef movsd xorps %xmm9, %xmm9 #endif movsd -32 * SIZE(A2), %xmm9 #ifdef movsd xorps %xmm12, %xmm12 #endif movsd -32 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 mulps %xmm12, %xmm5 SUBPS %xmm5, %xmm3 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_3 .L20X: #endif movaps -33 * SIZE(A1), %xmm4 movaps -33 * SIZE(A2), %xmm5 movaps -32 * SIZE(X1), %xmm12 movaps -28 * SIZE(X1), %xmm13 #ifdef PREFETCHW PREFETCHW 3 * SIZE(Y1) #endif movq MM, I sarq $3, I jle .L205 movaps -29 * SIZE(A1), %xmm6 movaps -29 * SIZE(A2), %xmm7 decq I jle .L204 ALIGN_3 .L203: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif movss %xmm6, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm14 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -25 * SIZE(A1), %xmm4 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 movss %xmm7, %xmm5 shufps $0x39, %xmm5, %xmm5 pshufd $0xb1, %xmm5, %xmm14 mulps %xmm12, %xmm5 addps %xmm5, %xmm2 movaps -25 * SIZE(A2), %xmm5 mulps %xmm12, %xmm14 movaps -24 * SIZE(X1), %xmm12 SUBPS %xmm14, %xmm3 movss %xmm4, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm14 mulps %xmm13, %xmm6 addps %xmm6, %xmm0 movaps -21 * SIZE(A1), %xmm6 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm1 movss %xmm5, %xmm7 shufps $0x39, %xmm7, %xmm7 pshufd $0xb1, %xmm7, %xmm14 mulps %xmm13, %xmm7 addps %xmm7, %xmm2 movaps -21 * SIZE(A2), %xmm7 mulps %xmm13, %xmm14 movaps -20 * SIZE(X1), %xmm13 SUBPS %xmm14, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif movss %xmm6, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm14 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -17 * SIZE(A1), %xmm4 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 movss %xmm7, %xmm5 shufps $0x39, %xmm5, %xmm5 pshufd $0xb1, %xmm5, %xmm14 mulps %xmm12, %xmm5 addps %xmm5, %xmm2 movaps -17 * SIZE(A2), %xmm5 mulps %xmm12, %xmm14 movaps -16 * SIZE(X1), %xmm12 SUBPS %xmm14, %xmm3 #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) #endif movss %xmm4, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm14 mulps %xmm13, %xmm6 addps %xmm6, %xmm0 movaps -13 * SIZE(A1), %xmm6 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm1 movss %xmm5, %xmm7 shufps $0x39, %xmm7, %xmm7 pshufd $0xb1, %xmm7, %xmm14 mulps %xmm13, %xmm7 addps %xmm7, %xmm2 movaps -13 * SIZE(A2), %xmm7 
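/* In this branch the columns of A start one float (4 bytes) past a 16-byte boundary,
   so an unaligned load is emulated from two aligned movaps results: movss copies the
   first float of the next aligned block into the previous one and shufps $0x39 rotates
   the lanes, producing the four floats that begin at the unaligned address instead of
   issuing a movups. */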
mulps %xmm13, %xmm14 movaps -12 * SIZE(X1), %xmm13 SUBPS %xmm14, %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, X1 subq $1, I BRANCH jg .L203 ALIGN_3 .L204: movss %xmm6, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm14 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -25 * SIZE(A1), %xmm4 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 movss %xmm7, %xmm5 shufps $0x39, %xmm5, %xmm5 pshufd $0xb1, %xmm5, %xmm14 mulps %xmm12, %xmm5 addps %xmm5, %xmm2 movaps -25 * SIZE(A2), %xmm5 mulps %xmm12, %xmm14 movaps -24 * SIZE(X1), %xmm12 SUBPS %xmm14, %xmm3 movss %xmm4, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm14 mulps %xmm13, %xmm6 addps %xmm6, %xmm0 movaps -21 * SIZE(A1), %xmm6 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm1 movss %xmm5, %xmm7 shufps $0x39, %xmm7, %xmm7 pshufd $0xb1, %xmm7, %xmm14 mulps %xmm13, %xmm7 addps %xmm7, %xmm2 movaps -21 * SIZE(A2), %xmm7 mulps %xmm13, %xmm14 movaps -20 * SIZE(X1), %xmm13 SUBPS %xmm14, %xmm3 movss %xmm6, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm14 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -17 * SIZE(A1), %xmm4 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 movss %xmm7, %xmm5 shufps $0x39, %xmm5, %xmm5 pshufd $0xb1, %xmm5, %xmm14 mulps %xmm12, %xmm5 addps %xmm5, %xmm2 movaps -17 * SIZE(A2), %xmm5 mulps %xmm12, %xmm14 movaps -16 * SIZE(X1), %xmm12 SUBPS %xmm14, %xmm3 movss %xmm4, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm14 mulps %xmm13, %xmm6 addps %xmm6, %xmm0 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm1 movss %xmm5, %xmm7 shufps $0x39, %xmm7, %xmm7 pshufd $0xb1, %xmm7, %xmm14 mulps %xmm13, %xmm7 addps %xmm7, %xmm2 mulps %xmm13, %xmm14 movaps -12 * SIZE(X1), %xmm13 SUBPS %xmm14, %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, X1 ALIGN_3 .L205: testq $4, MM je .L207 movaps -29 * SIZE(A1), %xmm6 movss %xmm6, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm14 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 movaps -29 * SIZE(A2), %xmm7 movss %xmm7, %xmm5 shufps $0x39, %xmm5, %xmm5 pshufd $0xb1, %xmm5, %xmm14 mulps %xmm12, %xmm5 addps %xmm5, %xmm2 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm3 movaps -25 * SIZE(A1), %xmm8 movss %xmm8, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm14 mulps %xmm13, %xmm6 addps %xmm6, %xmm0 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm1 movaps -25 * SIZE(A2), %xmm9 movss %xmm9, %xmm7 shufps $0x39, %xmm7, %xmm7 pshufd $0xb1, %xmm7, %xmm14 mulps %xmm13, %xmm7 addps %xmm7, %xmm2 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm3 movaps %xmm8, %xmm4 movaps %xmm9, %xmm5 movaps -24 * SIZE(X1), %xmm12 movaps -20 * SIZE(X1), %xmm13 addq $8 * SIZE, A1 addq $8 * SIZE, A2 ALIGN_3 .L207: testq $2, MM je .L208 movaps -29 * SIZE(A1), %xmm6 movaps -29 * SIZE(A2), %xmm7 movss %xmm6, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm14 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 movss %xmm7, %xmm5 shufps $0x39, %xmm5, %xmm5 pshufd $0xb1, %xmm5, %xmm14 mulps %xmm12, %xmm5 addps %xmm5, %xmm2 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm3 movaps %xmm6, %xmm4 movaps %xmm7, %xmm5 movaps %xmm13, %xmm12 addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L208: testq $1, MM je .L209 #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(A1), %xmm8 #ifdef movsd xorps %xmm9, %xmm9 #endif movsd -32 * SIZE(A2), %xmm9 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 mulps %xmm12, %xmm5 SUBPS %xmm5, 
%xmm3 ALIGN_3 .L209: pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorps %xmm5, %xmm0 xorps %xmm5, %xmm2 #else xorps %xmm5, %xmm1 xorps %xmm5, %xmm3 #endif #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 haddps %xmm3, %xmm2 haddps %xmm2, %xmm0 #else movaps %xmm0, %xmm8 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm8 movaps %xmm2, %xmm4 unpcklps %xmm3, %xmm2 unpckhps %xmm3, %xmm4 addps %xmm8, %xmm0 addps %xmm4, %xmm2 movhlps %xmm0, %xmm1 movhlps %xmm2, %xmm3 addps %xmm1, %xmm0 addps %xmm3, %xmm2 movlhps %xmm2, %xmm0 #endif pshufd $0xb1, %xmm0, %xmm1 #ifdef HAVE_SSE3 movddup ALPHA, %xmm15 #else movsd ALPHA, %xmm15 pshufd $0x44, %xmm15, %xmm15 #endif mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 xorps %xmm5, %xmm0 #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 #else movaps %xmm0, %xmm2 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm1, %xmm2 addps %xmm2, %xmm0 #endif movsd (Y), %xmm12 addq INCY, Y movhps (Y), %xmm12 addq INCY, Y shufps $0xd8, %xmm0, %xmm0 addps %xmm12, %xmm0 movlps %xmm0, (Y1) addq INCY, Y1 movhps %xmm0, (Y1) addq INCY, Y1 cmpq $2, N jge .L201 ALIGN_3 .L210: cmpq $1, N jl .L999 leaq 32 * SIZE(BUFFER), X1 movq A, A1 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 #ifdef ALIGNED_ACCESS cmpq M, MM je .L21X #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(A1), %xmm8 #ifdef movsd xorps %xmm12, %xmm12 #endif movsd -32 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 addq $2 * SIZE, A1 addq $2 * SIZE, X1 ALIGN_3 .L21X: #endif movaps -33 * SIZE(A1), %xmm4 movaps -32 * SIZE(X1), %xmm12 movaps -28 * SIZE(X1), %xmm13 movq MM, I sarq $3, I jle .L215 movaps -29 * SIZE(A1), %xmm5 movaps -25 * SIZE(A1), %xmm6 movaps -21 * SIZE(A1), %xmm7 decq I jle .L214 ALIGN_3 .L213: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm14 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -17 * SIZE(A1), %xmm4 mulps %xmm12, %xmm14 movaps -24 * SIZE(X1), %xmm12 SUBPS %xmm14, %xmm1 movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 pshufd $0xb1, %xmm5, %xmm15 mulps %xmm13, %xmm5 addps %xmm5, %xmm0 movaps -13 * SIZE(A1), %xmm5 mulps %xmm13, %xmm15 movaps -20 * SIZE(X1), %xmm13 SUBPS %xmm15, %xmm1 #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) #endif movss %xmm7, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm14 mulps %xmm12, %xmm6 addps %xmm6, %xmm0 movaps -9 * SIZE(A1), %xmm6 mulps %xmm12, %xmm14 movaps -16 * SIZE(X1), %xmm12 SUBPS %xmm14, %xmm1 movss %xmm4, %xmm7 shufps $0x39, %xmm7, %xmm7 pshufd $0xb1, %xmm7, %xmm15 mulps %xmm13, %xmm7 addps %xmm7, %xmm0 movaps -5 * SIZE(A1), %xmm7 mulps %xmm13, %xmm15 movaps -12 * SIZE(X1), %xmm13 SUBPS %xmm15, %xmm1 subq $-16 * SIZE, A1 subq $-16 * SIZE, X1 subq $1, I BRANCH jg .L213 ALIGN_3 .L214: movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm14 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -17 * SIZE(A1), %xmm4 mulps %xmm12, %xmm14 movaps -24 * SIZE(X1), %xmm12 SUBPS %xmm14, %xmm1 movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 pshufd $0xb1, %xmm5, %xmm15 mulps %xmm13, %xmm5 addps %xmm5, %xmm0 mulps %xmm13, %xmm15 movaps -20 * SIZE(X1), %xmm13 SUBPS %xmm15, %xmm1 movss %xmm7, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm14 mulps %xmm12, %xmm6 addps %xmm6, %xmm0 mulps %xmm12, %xmm14 movaps -16 * SIZE(X1), %xmm12 SUBPS %xmm14, %xmm1 movss %xmm4, %xmm7 shufps $0x39, %xmm7, %xmm7 pshufd $0xb1, %xmm7, %xmm15 mulps %xmm13, 
%xmm7 addps %xmm7, %xmm0 mulps %xmm13, %xmm15 movaps -12 * SIZE(X1), %xmm13 SUBPS %xmm15, %xmm1 subq $-16 * SIZE, A1 subq $-16 * SIZE, X1 ALIGN_3 .L215: testq $4, MM je .L217 movaps -29 * SIZE(A1), %xmm5 movaps -25 * SIZE(A1), %xmm6 movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm14 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 pshufd $0xb1, %xmm5, %xmm15 mulps %xmm13, %xmm5 addps %xmm5, %xmm0 mulps %xmm13, %xmm15 SUBPS %xmm15, %xmm1 movaps -24 * SIZE(X1), %xmm12 movaps -20 * SIZE(X1), %xmm13 movaps %xmm6, %xmm4 addq $8 * SIZE, A1 ALIGN_3 .L217: testq $2, MM je .L218 movaps -29 * SIZE(A1), %xmm5 movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm14 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 movaps %xmm13, %xmm12 addq $4 * SIZE, A1 ALIGN_3 .L218: testq $1, MM je .L219 #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(A1), %xmm8 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 ALIGN_3 .L219: pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorps %xmm5, %xmm0 #else xorps %xmm5, %xmm1 #endif #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 haddps %xmm0, %xmm0 #else movaps %xmm0, %xmm8 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm8 addps %xmm8, %xmm0 movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 #endif pshufd $0xb1, %xmm0, %xmm1 #ifdef HAVE_SSE3 movddup ALPHA, %xmm15 #else movsd ALPHA, %xmm15 pshufd $0x44, %xmm15, %xmm15 #endif mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 xorps %xmm5, %xmm0 #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 #else movaps %xmm0, %xmm2 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm1, %xmm2 addps %xmm2, %xmm0 #endif movsd (Y), %xmm12 addq INCY, Y shufps $0xd8, %xmm0, %xmm0 addps %xmm12, %xmm0 movlps %xmm0, (Y1) addq INCY, Y1 jmp .L999 .L300: cmpq $2, N jl .L310 ALIGN_3 .L301: subq $2, N leaq 32 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA), A2 leaq (A1, LDA, 2), A xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifdef ALIGNED_ACCESS cmpq M, MM je .L30X #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(A1), %xmm8 #ifdef movsd xorps %xmm9, %xmm9 #endif movsd -32 * SIZE(A2), %xmm9 #ifdef movsd xorps %xmm12, %xmm12 #endif movsd -32 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 mulps %xmm12, %xmm5 SUBPS %xmm5, %xmm3 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_3 .L30X: #endif movaps -33 * SIZE(A1), %xmm4 movaps -35 * SIZE(A2), %xmm5 movaps -32 * SIZE(X1), %xmm12 movaps -28 * SIZE(X1), %xmm13 #ifdef PREFETCHW PREFETCHW 3 * SIZE(Y1) #endif movq MM, I sarq $3, I jle .L305 movaps -29 * SIZE(A1), %xmm6 movaps -31 * SIZE(A2), %xmm7 decq I jle .L304 ALIGN_3 .L303: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif movss %xmm6, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm14 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -25 * SIZE(A1), %xmm4 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 movss %xmm7, %xmm5 shufps $0x93, %xmm7, %xmm5 pshufd $0xb1, %xmm5, %xmm14 mulps %xmm12, %xmm5 addps %xmm5, %xmm2 movaps -27 * SIZE(A2), %xmm5 mulps %xmm12, %xmm14 movaps -24 * SIZE(X1), %xmm12 SUBPS %xmm14, %xmm3 movss %xmm4, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm14 mulps %xmm13, %xmm6 addps %xmm6, %xmm0 movaps 
-21 * SIZE(A1), %xmm6 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm1 movss %xmm5, %xmm7 shufps $0x93, %xmm5, %xmm7 pshufd $0xb1, %xmm7, %xmm14 mulps %xmm13, %xmm7 addps %xmm7, %xmm2 movaps -23 * SIZE(A2), %xmm7 mulps %xmm13, %xmm14 movaps -20 * SIZE(X1), %xmm13 SUBPS %xmm14, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif movss %xmm6, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm14 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -17 * SIZE(A1), %xmm4 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 movss %xmm7, %xmm5 shufps $0x93, %xmm7, %xmm5 pshufd $0xb1, %xmm5, %xmm14 mulps %xmm12, %xmm5 addps %xmm5, %xmm2 movaps -19 * SIZE(A2), %xmm5 mulps %xmm12, %xmm14 movaps -16 * SIZE(X1), %xmm12 SUBPS %xmm14, %xmm3 #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) #endif movss %xmm4, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm14 mulps %xmm13, %xmm6 addps %xmm6, %xmm0 movaps -13 * SIZE(A1), %xmm6 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm1 movss %xmm5, %xmm7 shufps $0x93, %xmm5, %xmm7 pshufd $0xb1, %xmm7, %xmm14 mulps %xmm13, %xmm7 addps %xmm7, %xmm2 movaps -15 * SIZE(A2), %xmm7 mulps %xmm13, %xmm14 movaps -12 * SIZE(X1), %xmm13 SUBPS %xmm14, %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, X1 subq $1, I BRANCH jg .L303 ALIGN_3 .L304: movss %xmm6, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm14 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -25 * SIZE(A1), %xmm4 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 movss %xmm7, %xmm5 shufps $0x93, %xmm7, %xmm5 pshufd $0xb1, %xmm5, %xmm14 mulps %xmm12, %xmm5 addps %xmm5, %xmm2 movaps -27 * SIZE(A2), %xmm5 mulps %xmm12, %xmm14 movaps -24 * SIZE(X1), %xmm12 SUBPS %xmm14, %xmm3 movss %xmm4, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm14 mulps %xmm13, %xmm6 addps %xmm6, %xmm0 movaps -21 * SIZE(A1), %xmm6 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm1 movss %xmm5, %xmm7 shufps $0x93, %xmm5, %xmm7 pshufd $0xb1, %xmm7, %xmm14 mulps %xmm13, %xmm7 addps %xmm7, %xmm2 movaps -23 * SIZE(A2), %xmm7 mulps %xmm13, %xmm14 movaps -20 * SIZE(X1), %xmm13 SUBPS %xmm14, %xmm3 movss %xmm6, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm14 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -17 * SIZE(A1), %xmm4 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 movss %xmm7, %xmm5 shufps $0x93, %xmm7, %xmm5 pshufd $0xb1, %xmm5, %xmm14 mulps %xmm12, %xmm5 addps %xmm5, %xmm2 movaps -19 * SIZE(A2), %xmm5 mulps %xmm12, %xmm14 movaps -16 * SIZE(X1), %xmm12 SUBPS %xmm14, %xmm3 movss %xmm4, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm14 mulps %xmm13, %xmm6 addps %xmm6, %xmm0 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm1 movss %xmm5, %xmm7 shufps $0x93, %xmm5, %xmm7 pshufd $0xb1, %xmm7, %xmm14 mulps %xmm13, %xmm7 addps %xmm7, %xmm2 mulps %xmm13, %xmm14 movaps -12 * SIZE(X1), %xmm13 SUBPS %xmm14, %xmm3 subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, X1 ALIGN_3 .L305: testq $4, MM je .L307 movaps -29 * SIZE(A1), %xmm6 movss %xmm6, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm14 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 movaps -31 * SIZE(A2), %xmm7 movss %xmm7, %xmm5 shufps $0x93, %xmm7, %xmm5 pshufd $0xb1, %xmm5, %xmm14 mulps %xmm12, %xmm5 addps %xmm5, %xmm2 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm3 movaps -25 * SIZE(A1), %xmm8 movss %xmm8, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm14 mulps %xmm13, %xmm6 addps %xmm6, %xmm0 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm1 movaps -27 * SIZE(A2), %xmm9 movss %xmm9, %xmm7 shufps 
$0x93, %xmm9, %xmm7 pshufd $0xb1, %xmm7, %xmm14 mulps %xmm13, %xmm7 addps %xmm7, %xmm2 mulps %xmm13, %xmm14 SUBPS %xmm14, %xmm3 movaps %xmm8, %xmm4 movaps %xmm9, %xmm5 movaps -24 * SIZE(X1), %xmm12 movaps -20 * SIZE(X1), %xmm13 addq $8 * SIZE, A1 addq $8 * SIZE, A2 ALIGN_3 .L307: testq $2, MM je .L308 movaps -29 * SIZE(A1), %xmm6 movaps -31 * SIZE(A2), %xmm7 movss %xmm6, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm14 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 movss %xmm7, %xmm5 shufps $0x93, %xmm7, %xmm5 pshufd $0xb1, %xmm5, %xmm14 mulps %xmm12, %xmm5 addps %xmm5, %xmm2 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm3 movaps %xmm6, %xmm4 movaps %xmm7, %xmm5 movaps %xmm13, %xmm12 addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L308: testq $1, MM je .L309 #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(A1), %xmm8 #ifdef movsd xorps %xmm9, %xmm9 #endif movsd -32 * SIZE(A2), %xmm9 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 pshufd $0xb1, %xmm9, %xmm5 mulps %xmm12, %xmm9 addps %xmm9, %xmm2 mulps %xmm12, %xmm5 SUBPS %xmm5, %xmm3 ALIGN_3 .L309: pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorps %xmm5, %xmm0 xorps %xmm5, %xmm2 #else xorps %xmm5, %xmm1 xorps %xmm5, %xmm3 #endif #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 haddps %xmm3, %xmm2 haddps %xmm2, %xmm0 #else movaps %xmm0, %xmm8 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm8 movaps %xmm2, %xmm4 unpcklps %xmm3, %xmm2 unpckhps %xmm3, %xmm4 addps %xmm8, %xmm0 addps %xmm4, %xmm2 movhlps %xmm0, %xmm1 movhlps %xmm2, %xmm3 addps %xmm1, %xmm0 addps %xmm3, %xmm2 movlhps %xmm2, %xmm0 #endif pshufd $0xb1, %xmm0, %xmm1 #ifdef HAVE_SSE3 movddup ALPHA, %xmm15 #else movsd ALPHA, %xmm15 pshufd $0x44, %xmm15, %xmm15 #endif mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 xorps %xmm5, %xmm0 #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 #else movaps %xmm0, %xmm2 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm1, %xmm2 addps %xmm2, %xmm0 #endif movsd (Y), %xmm12 addq INCY, Y movhps (Y), %xmm12 addq INCY, Y shufps $0xd8, %xmm0, %xmm0 addps %xmm12, %xmm0 movlps %xmm0, (Y1) addq INCY, Y1 movhps %xmm0, (Y1) addq INCY, Y1 cmpq $2, N jge .L301 ALIGN_3 .L310: cmpq $1, N jl .L999 leaq 32 * SIZE(BUFFER), X1 movq A, A1 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 #ifdef ALIGNED_ACCESS cmpq M, MM je .L31X #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(A1), %xmm8 #ifdef movsd xorps %xmm12, %xmm12 #endif movsd -32 * SIZE(X1), %xmm12 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 addq $2 * SIZE, A1 addq $2 * SIZE, X1 ALIGN_3 .L31X: #endif movaps -33 * SIZE(A1), %xmm4 movaps -32 * SIZE(X1), %xmm12 movaps -28 * SIZE(X1), %xmm13 movq MM, I sarq $3, I jle .L315 movaps -29 * SIZE(A1), %xmm5 movaps -25 * SIZE(A1), %xmm6 movaps -21 * SIZE(A1), %xmm7 decq I jle .L314 ALIGN_3 .L313: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm14 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -17 * SIZE(A1), %xmm4 mulps %xmm12, %xmm14 movaps -24 * SIZE(X1), %xmm12 SUBPS %xmm14, %xmm1 movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 pshufd $0xb1, %xmm5, %xmm15 mulps %xmm13, %xmm5 addps %xmm5, %xmm0 movaps -13 * SIZE(A1), %xmm5 mulps %xmm13, %xmm15 movaps -20 * SIZE(X1), %xmm13 SUBPS %xmm15, %xmm1 #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) #endif movss %xmm7, %xmm6 shufps $0x39, 
%xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm14 mulps %xmm12, %xmm6 addps %xmm6, %xmm0 movaps -9 * SIZE(A1), %xmm6 mulps %xmm12, %xmm14 movaps -16 * SIZE(X1), %xmm12 SUBPS %xmm14, %xmm1 movss %xmm4, %xmm7 shufps $0x39, %xmm7, %xmm7 pshufd $0xb1, %xmm7, %xmm15 mulps %xmm13, %xmm7 addps %xmm7, %xmm0 movaps -5 * SIZE(A1), %xmm7 mulps %xmm13, %xmm15 movaps -12 * SIZE(X1), %xmm13 SUBPS %xmm15, %xmm1 subq $-16 * SIZE, A1 subq $-16 * SIZE, X1 subq $1, I BRANCH jg .L313 ALIGN_3 .L314: movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm14 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -17 * SIZE(A1), %xmm4 mulps %xmm12, %xmm14 movaps -24 * SIZE(X1), %xmm12 SUBPS %xmm14, %xmm1 movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 pshufd $0xb1, %xmm5, %xmm15 mulps %xmm13, %xmm5 addps %xmm5, %xmm0 mulps %xmm13, %xmm15 movaps -20 * SIZE(X1), %xmm13 SUBPS %xmm15, %xmm1 movss %xmm7, %xmm6 shufps $0x39, %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm14 mulps %xmm12, %xmm6 addps %xmm6, %xmm0 mulps %xmm12, %xmm14 movaps -16 * SIZE(X1), %xmm12 SUBPS %xmm14, %xmm1 movss %xmm4, %xmm7 shufps $0x39, %xmm7, %xmm7 pshufd $0xb1, %xmm7, %xmm15 mulps %xmm13, %xmm7 addps %xmm7, %xmm0 mulps %xmm13, %xmm15 movaps -12 * SIZE(X1), %xmm13 SUBPS %xmm15, %xmm1 subq $-16 * SIZE, A1 subq $-16 * SIZE, X1 ALIGN_3 .L315: testq $4, MM je .L317 movaps -29 * SIZE(A1), %xmm5 movaps -25 * SIZE(A1), %xmm6 movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm14 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 pshufd $0xb1, %xmm5, %xmm15 mulps %xmm13, %xmm5 addps %xmm5, %xmm0 mulps %xmm13, %xmm15 SUBPS %xmm15, %xmm1 movaps -24 * SIZE(X1), %xmm12 movaps -20 * SIZE(X1), %xmm13 movaps %xmm6, %xmm4 addq $8 * SIZE, A1 ALIGN_3 .L317: testq $2, MM je .L318 movaps -29 * SIZE(A1), %xmm5 movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm14 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm12, %xmm14 SUBPS %xmm14, %xmm1 movaps %xmm13, %xmm12 addq $4 * SIZE, A1 ALIGN_3 .L318: testq $1, MM je .L319 #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(A1), %xmm8 pshufd $0xb1, %xmm8, %xmm4 mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm4 SUBPS %xmm4, %xmm1 ALIGN_3 .L319: pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorps %xmm5, %xmm0 #else xorps %xmm5, %xmm1 #endif #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 haddps %xmm0, %xmm0 #else movaps %xmm0, %xmm8 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm8 addps %xmm8, %xmm0 movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 #endif pshufd $0xb1, %xmm0, %xmm1 #ifdef HAVE_SSE3 movddup ALPHA, %xmm15 #else movsd ALPHA, %xmm15 pshufd $0x44, %xmm15, %xmm15 #endif mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 xorps %xmm5, %xmm0 #ifdef HAVE_SSE3 haddps %xmm1, %xmm0 #else movaps %xmm0, %xmm2 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm1, %xmm2 addps %xmm2, %xmm0 #endif movsd (Y), %xmm12 addq INCY, Y shufps $0xd8, %xmm0, %xmm0 addps %xmm12, %xmm0 movlps %xmm0, (Y1) addq INCY, Y1 #endif ALIGN_3 .L999: movq M, I salq $ZBASE_SHIFT,I addq I,AA jmp .L0t .L999x: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 
208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/cgemv_t_4.c000066400000000000000000000372301313527062700174020ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #if defined(HASWELL) || defined(ZEN) #include "cgemv_t_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_t_microk_bulldozer-4.c" #endif #define NBMAX 2048 #ifndef HAVE_KERNEL_4x4 static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG i; FLOAT *a0,*a1,*a2,*a3; a0 = ap[0]; a1 = ap[1]; a2 = ap[2]; a3 = ap[3]; FLOAT alpha_r = alpha[0]; FLOAT alpha_i = alpha[1]; FLOAT temp_r0 = 0.0; FLOAT temp_r1 = 0.0; FLOAT temp_r2 = 0.0; FLOAT temp_r3 = 0.0; FLOAT temp_i0 = 0.0; FLOAT temp_i1 = 0.0; FLOAT temp_i2 = 0.0; FLOAT temp_i3 = 0.0; for ( i=0; i< 2*n; i+=2 ) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1]; temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i]; temp_r2 += a2[i]*x[i] - a2[i+1]*x[i+1]; temp_i2 += a2[i]*x[i+1] + a2[i+1]*x[i]; temp_r3 += a3[i]*x[i] - a3[i+1]*x[i+1]; temp_i3 += a3[i]*x[i+1] + a3[i+1]*x[i]; #else temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1]; temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i]; temp_r2 += a2[i]*x[i] + a2[i+1]*x[i+1]; temp_i2 += a2[i]*x[i+1] - a2[i+1]*x[i]; temp_r3 += a3[i]*x[i] + a3[i+1]*x[i+1]; temp_i3 += a3[i]*x[i+1] - a3[i+1]*x[i]; #endif } #if !defined(XCONJ) y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; y[5] += alpha_r * temp_i2 + alpha_i * 
temp_r2; y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; #else y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; #endif } #endif #ifndef HAVE_KERNEL_4x2 static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG i; FLOAT *a0,*a1; a0 = ap[0]; a1 = ap[1]; FLOAT alpha_r = alpha[0]; FLOAT alpha_i = alpha[1]; FLOAT temp_r0 = 0.0; FLOAT temp_r1 = 0.0; FLOAT temp_i0 = 0.0; FLOAT temp_i1 = 0.0; for ( i=0; i< 2*n; i+=2 ) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1]; temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i]; #else temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1]; temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i]; #endif } #if !defined(XCONJ) y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; #else y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; #endif } #endif #ifndef HAVE_KERNEL_4x1 static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG i; FLOAT *a0; a0 = ap; FLOAT alpha_r = alpha[0]; FLOAT alpha_i = alpha[1]; FLOAT temp_r0 = 0.0; FLOAT temp_i0 = 0.0; for ( i=0; i< 2*n; i+=2 ) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; #else temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; #endif } #if !defined(XCONJ) y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; #else y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; #endif } #endif static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { BLASLONG i; for ( i=0; i> 2 ; n2 = n & 3 ; m3 = m & 3 ; m1 = m - m3; m2 = (m & (NBMAX-1)) - m3 ; alpha[0] = alpha_r; alpha[1] = alpha_i; BLASLONG NB = NBMAX; while ( NB == NBMAX ) { m1 -= NB; if ( m1 < 0) { if ( m2 == 0 ) break; NB = m2; } y_ptr = y; a_ptr = a; x_ptr = x; ap[0] = a_ptr; ap[1] = a_ptr + lda; ap[2] = ap[1] + lda; ap[3] = ap[2] + lda; if ( inc_x != 2 ) copy_x(NB,x_ptr,xbuffer,inc_x); else xbuffer = x_ptr; if ( inc_y == 2 ) { for( i = 0; i < n1 ; i++) { cgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); ap[0] += lda4; ap[1] += lda4; ap[2] += lda4; ap[3] += lda4; a_ptr += lda4; y_ptr += 8; } if ( n2 & 2 ) { cgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); a_ptr += lda * 2; y_ptr += 4; } if ( n2 & 1 ) { cgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); a_ptr += lda; y_ptr += 2; } } else { for( i = 0; i < n1 ; i++) { memset(ybuffer,0,32); cgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); ap[0] += lda4; ap[1] += lda4; ap[2] += lda4; ap[3] += lda4; a_ptr += lda4; y_ptr[0] += ybuffer[0]; y_ptr[1] += ybuffer[1]; y_ptr += inc_y; y_ptr[0] += ybuffer[2]; 
y_ptr[1] += ybuffer[3]; y_ptr += inc_y; y_ptr[0] += ybuffer[4]; y_ptr[1] += ybuffer[5]; y_ptr += inc_y; y_ptr[0] += ybuffer[6]; y_ptr[1] += ybuffer[7]; y_ptr += inc_y; } for( i = 0; i < n2 ; i++) { memset(ybuffer,0,32); cgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); a_ptr += lda; y_ptr[0] += ybuffer[0]; y_ptr[1] += ybuffer[1]; y_ptr += inc_y; } } a += 2 * NB; x += NB * inc_x; } if ( m3 == 0 ) return(0); x_ptr = x; j=0; a_ptr = a; y_ptr = y; if ( m3 == 3 ) { FLOAT temp_r ; FLOAT temp_i ; FLOAT x0 = x_ptr[0]; FLOAT x1 = x_ptr[1]; x_ptr += inc_x; FLOAT x2 = x_ptr[0]; FLOAT x3 = x_ptr[1]; x_ptr += inc_x; FLOAT x4 = x_ptr[0]; FLOAT x5 = x_ptr[1]; while ( j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; #else temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; #endif #if !defined(XCONJ) y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif a_ptr += lda; y_ptr += inc_y; j++; } return(0); } if ( m3 == 2 ) { FLOAT temp_r ; FLOAT temp_i ; FLOAT temp_r1 ; FLOAT temp_i1 ; FLOAT x0 = x_ptr[0]; FLOAT x1 = x_ptr[1]; x_ptr += inc_x; FLOAT x2 = x_ptr[0]; FLOAT x3 = x_ptr[1]; FLOAT ar = alpha[0]; FLOAT ai = alpha[1]; while ( j < ( n & -2 )) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; a_ptr += lda; temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; #else temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; a_ptr += lda; temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; #endif #if !defined(XCONJ) y_ptr[0] += ar * temp_r - ai * temp_i; y_ptr[1] += ar * temp_i + ai * temp_r; y_ptr += inc_y; y_ptr[0] += ar * temp_r1 - ai * temp_i1; y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else y_ptr[0] += ar * temp_r + ai * temp_i; y_ptr[1] -= ar * temp_i - ai * temp_r; y_ptr += inc_y; y_ptr[0] += ar * temp_r1 + ai * temp_i1; y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif a_ptr += lda; y_ptr += inc_y; j+=2; } while ( j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; #else temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; #endif #if !defined(XCONJ) y_ptr[0] += ar * temp_r - ai * temp_i; y_ptr[1] += ar * temp_i + ai * temp_r; #else y_ptr[0] += ar * temp_r + ai * 
temp_i; y_ptr[1] -= ar * temp_i - ai * temp_r; #endif a_ptr += lda; y_ptr += inc_y; j++; } return(0); } if ( m3 == 1 ) { FLOAT temp_r ; FLOAT temp_i ; FLOAT temp_r1 ; FLOAT temp_i1 ; FLOAT x0 = x_ptr[0]; FLOAT x1 = x_ptr[1]; FLOAT ar = alpha[0]; FLOAT ai = alpha[1]; while ( j < ( n & -2 )) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; a_ptr += lda; temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; #else temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; a_ptr += lda; temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; #endif #if !defined(XCONJ) y_ptr[0] += ar * temp_r - ai * temp_i; y_ptr[1] += ar * temp_i + ai * temp_r; y_ptr += inc_y; y_ptr[0] += ar * temp_r1 - ai * temp_i1; y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else y_ptr[0] += ar * temp_r + ai * temp_i; y_ptr[1] -= ar * temp_i - ai * temp_r; y_ptr += inc_y; y_ptr[0] += ar * temp_r1 + ai * temp_i1; y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif a_ptr += lda; y_ptr += inc_y; j+=2; } while ( j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; #else temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; #endif #if !defined(XCONJ) y_ptr[0] += ar * temp_r - ai * temp_i; y_ptr[1] += ar * temp_i + ai * temp_r; #else y_ptr[0] += ar * temp_r + ai * temp_i; y_ptr[1] -= ar * temp_i - ai * temp_r; #endif a_ptr += lda; y_ptr += inc_y; j++; } return(0); } return(0); } OpenBLAS-0.2.20/kernel/x86_64/cgemv_t_microk_bulldozer-4.c000066400000000000000000000603031313527062700227430ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary froms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary from must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
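The generic C kernels in cgemv_t_4.c above accumulate, for each matrix column, a complex dot product over interleaved (real, imaginary) float pairs, with the sign pattern selected by CONJ/XCONJ, and then fold the result into y scaled by alpha. The following is a minimal standalone sketch of the default (non-conjugated) branch, useful as a scalar cross-check for the vectorized kernels; the helper name cgemv_col_ref is hypothetical and not part of OpenBLAS.

#include <complex.h>

/* Scalar reference for one column: temp = sum_i a[i]*x[i] over n complex
   elements stored as interleaved floats, then y += alpha * temp.  This
   mirrors the !CONJ/!XCONJ branch of cgemv_kernel_4x1 above. */
static void cgemv_col_ref(long n, const float *a, const float *x,
                          float *y, float alpha_r, float alpha_i)
{
    float complex temp = 0.0f;
    long i;
    for (i = 0; i < 2 * n; i += 2)
        temp += (a[i] + a[i + 1] * I) * (x[i] + x[i + 1] * I);

    float complex r = (alpha_r + alpha_i * I) * temp;
    y[0] += crealf(r);   /* alpha_r*temp_r - alpha_i*temp_i */
    y[1] += cimagf(r);   /* alpha_r*temp_i + alpha_i*temp_r */
}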
*****************************************************************************/ #define HAVE_KERNEL_4x4 1 static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" // temp "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" // temp "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t" "vxorps %%ymm14, %%ymm14, %%ymm14 \n\t" "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" "testq $0x04, %1 \n\t" "jz 2f \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts "vmovups (%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2 "vmovups (%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3 "vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmaddps %%ymm10, %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmaddps %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmaddps %%ymm12, %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmaddps %%ymm13, %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmaddps %%ymm14, %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmaddps %%ymm15, %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" "2: \n\t" "cmpq $0, %1 \n\t" "je 3f \n\t" // ".align 16 \n\t" "1: \n\t" "prefetcht0 384(%4,%0,4) \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "prefetcht0 384(%5,%0,4) \n\t" "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 "prefetcht0 384(%2,%0,4) \n\t" "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts "prefetcht0 384(%6,%0,4) \n\t" "vmovups (%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2 "prefetcht0 384(%7,%0,4) \n\t" "vmovups (%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3 "vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmaddps %%ymm10, %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmaddps %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmaddps %%ymm12, %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmaddps %%ymm13, %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmaddps %%ymm14, %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmaddps %%ymm15, %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "vmovups 32(%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 
"vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts "vmovups 32(%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2 "vmovups 32(%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3 "vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmaddps %%ymm10, %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmaddps %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmaddps %%ymm12, %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmaddps %%ymm13, %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmaddps %%ymm14, %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmaddps %%ymm15, %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "addq $16 , %0 \n\t" "subq $8 , %1 \n\t" "jnz 1b \n\t" "3: \n\t" "vbroadcastss (%8) , %%xmm0 \n\t" // value from alpha "vbroadcastss 4(%8) , %%xmm1 \n\t" // value from alpha #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" "vpermilps $0xb1 , %%ymm11, %%ymm11 \n\t" "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t" "vaddsubps %%ymm11, %%ymm10, %%ymm10 \n\t" "vaddsubps %%ymm13, %%ymm12, %%ymm12 \n\t" "vaddsubps %%ymm15, %%ymm14, %%ymm14 \n\t" #else "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t" "vaddsubps %%ymm10, %%ymm11, %%ymm10 \n\t" "vaddsubps %%ymm12, %%ymm13, %%ymm12 \n\t" "vaddsubps %%ymm14, %%ymm15, %%ymm14 \n\t" "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" #endif "vmovsd (%3), %%xmm4 \n\t" // read y "vmovsd 8(%3), %%xmm5 \n\t" "vmovsd 16(%3), %%xmm6 \n\t" "vmovsd 24(%3), %%xmm7 \n\t" "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" "vextractf128 $1, %%ymm10, %%xmm11 \n\t" "vextractf128 $1, %%ymm12, %%xmm13 \n\t" "vextractf128 $1, %%ymm14, %%xmm15 \n\t" "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" "vaddps %%xmm12, %%xmm13, %%xmm12 \n\t" "vaddps %%xmm14, %%xmm15, %%xmm14 \n\t" "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t" "vshufpd $0x1, %%xmm10, %%xmm10, %%xmm11 \n\t" "vshufpd $0x1, %%xmm12, %%xmm12, %%xmm13 \n\t" "vshufpd $0x1, %%xmm14, %%xmm14, %%xmm15 \n\t" "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" "vaddps %%xmm12, %%xmm13, %%xmm12 \n\t" "vaddps %%xmm14, %%xmm15, %%xmm14 \n\t" "vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r "vmulps %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulps %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r "vmulps %%xmm12, %%xmm1 , %%xmm13 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulps %%xmm12, %%xmm0 , %%xmm12 \n\t" // t_r * alpha_r , t_i * alpha_r "vmulps %%xmm14, %%xmm1 , %%xmm15 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulps %%xmm14, %%xmm0 , %%xmm14 \n\t" // t_r * alpha_r , t_i * 
alpha_r #if !defined(XCONJ) "vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t" "vpermilps $0xb1 , %%xmm11, %%xmm11 \n\t" "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" "vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t" "vaddsubps %%xmm11, %%xmm10, %%xmm10 \n\t" "vaddsubps %%xmm13, %%xmm12, %%xmm12 \n\t" "vaddsubps %%xmm15, %%xmm14, %%xmm14 \n\t" #else "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" "vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t" "vaddsubps %%xmm12, %%xmm13, %%xmm12 \n\t" "vaddsubps %%xmm14, %%xmm15, %%xmm14 \n\t" "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" #endif "vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t" "vaddps %%xmm10, %%xmm5 , %%xmm10 \n\t" "vaddps %%xmm12, %%xmm6 , %%xmm12 \n\t" "vaddps %%xmm14, %%xmm7 , %%xmm14 \n\t" "vmovsd %%xmm8 , (%3) \n\t" "vmovsd %%xmm10, 8(%3) \n\t" "vmovsd %%xmm12, 16(%3) \n\t" "vmovsd %%xmm14, 24(%3) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #define HAVE_KERNEL_4x2 1 static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" // temp "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp "testq $0x04, %1 \n\t" "jz 2f \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts "vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmaddps %%ymm10, %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmaddps %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" "2: \n\t" "cmpq $0, %1 \n\t" "je 3f \n\t" // ".align 16 \n\t" "1: \n\t" "prefetcht0 384(%4,%0,4) \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "prefetcht0 384(%5,%0,4) \n\t" "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 "prefetcht0 384(%2,%0,4) \n\t" "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts "vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmaddps %%ymm10, %%ymm5 , %%ymm0, 
%%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmaddps %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "vmovups 32(%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 "vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts "vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmaddps %%ymm10, %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmaddps %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "addq $16 , %0 \n\t" "subq $8 , %1 \n\t" "jnz 1b \n\t" "3: \n\t" "vbroadcastss (%6) , %%xmm0 \n\t" // value from alpha "vbroadcastss 4(%6) , %%xmm1 \n\t" // value from alpha #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" "vpermilps $0xb1 , %%ymm11, %%ymm11 \n\t" "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t" "vaddsubps %%ymm11, %%ymm10, %%ymm10 \n\t" #else "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t" "vaddsubps %%ymm10, %%ymm11, %%ymm10 \n\t" "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" #endif "vmovsd (%3), %%xmm4 \n\t" // read y "vmovsd 8(%3), %%xmm5 \n\t" "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" "vextractf128 $1, %%ymm10, %%xmm11 \n\t" "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t" "vshufpd $0x1, %%xmm10, %%xmm10, %%xmm11 \n\t" "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" "vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r "vmulps %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulps %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r #if !defined(XCONJ) "vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t" "vpermilps $0xb1 , %%xmm11, %%xmm11 \n\t" "vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t" "vaddsubps %%xmm11, %%xmm10, %%xmm10 \n\t" #else "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" "vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t" "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" #endif "vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t" "vaddps %%xmm10, %%xmm5 , %%xmm10 \n\t" "vmovsd %%xmm8 , (%3) \n\t" "vmovsd %%xmm10, 8(%3) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (alpha) // 6 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #define HAVE_KERNEL_4x1 1 static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp "testq $0x04, %1 \n\t" "jz 2f 
\n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts "vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" "2: \n\t" "cmpq $0, %1 \n\t" "je 3f \n\t" // ".align 16 \n\t" "1: \n\t" "prefetcht0 384(%4,%0,4) \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "prefetcht0 384(%2,%0,4) \n\t" "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts "vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts "vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "addq $16 , %0 \n\t" "subq $8 , %1 \n\t" "jnz 1b \n\t" "3: \n\t" "vbroadcastss (%5) , %%xmm0 \n\t" // value from alpha "vbroadcastss 4(%5) , %%xmm1 \n\t" // value from alpha #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t" #else "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t" "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" #endif "vmovsd (%3), %%xmm4 \n\t" // read y "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t" "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r #if !defined(XCONJ) "vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t" "vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t" #else "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" "vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" #endif "vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t" "vmovsd %%xmm8 , (%3) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap), // 4 "r" (alpha) // 5 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/cgemv_t_microk_haswell-4.c000066400000000000000000000577001313527062700224070ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary froms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary from must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_4x4 1 static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" // temp "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" // temp "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t" "vxorps %%ymm14, %%ymm14, %%ymm14 \n\t" "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" "testq $0x04, %1 \n\t" "jz 2f \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts "vmovups (%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2 "vmovups (%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3 "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" "2: \n\t" "cmpq $0, %1 \n\t" "je 3f \n\t" // ".align 16 \n\t" "1: \n\t" "prefetcht0 192(%4,%0,4) \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "prefetcht0 192(%5,%0,4) \n\t" "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 "prefetcht0 192(%2,%0,4) \n\t" "vmovups (%2,%0,4) , %%ymm6 \n\t" 
// 4 complex values from x "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts "prefetcht0 192(%6,%0,4) \n\t" "vmovups (%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2 "prefetcht0 192(%7,%0,4) \n\t" "vmovups (%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3 "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "vmovups 32(%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 "vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts "vmovups 32(%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2 "vmovups 32(%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3 "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "addq $16 , %0 \n\t" "subq $8 , %1 \n\t" "jnz 1b \n\t" "3: \n\t" "vbroadcastss (%8) , %%xmm0 \n\t" // value from alpha "vbroadcastss 4(%8) , %%xmm1 \n\t" // value from alpha #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" "vpermilps $0xb1 , %%ymm11, %%ymm11 \n\t" "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t" "vaddsubps %%ymm11, %%ymm10, %%ymm10 \n\t" "vaddsubps %%ymm13, %%ymm12, %%ymm12 \n\t" "vaddsubps %%ymm15, %%ymm14, %%ymm14 \n\t" #else "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t" "vaddsubps %%ymm10, %%ymm11, %%ymm10 \n\t" "vaddsubps %%ymm12, %%ymm13, %%ymm12 \n\t" "vaddsubps %%ymm14, %%ymm15, %%ymm14 \n\t" "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" #endif "vmovsd (%3), %%xmm4 \n\t" // read y "vmovsd 8(%3), %%xmm5 \n\t" "vmovsd 16(%3), %%xmm6 \n\t" "vmovsd 24(%3), %%xmm7 \n\t" "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" "vextractf128 $1, %%ymm10, %%xmm11 \n\t" "vextractf128 $1, %%ymm12, %%xmm13 
\n\t" "vextractf128 $1, %%ymm14, %%xmm15 \n\t" "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" "vaddps %%xmm12, %%xmm13, %%xmm12 \n\t" "vaddps %%xmm14, %%xmm15, %%xmm14 \n\t" "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t" "vshufpd $0x1, %%xmm10, %%xmm10, %%xmm11 \n\t" "vshufpd $0x1, %%xmm12, %%xmm12, %%xmm13 \n\t" "vshufpd $0x1, %%xmm14, %%xmm14, %%xmm15 \n\t" "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" "vaddps %%xmm12, %%xmm13, %%xmm12 \n\t" "vaddps %%xmm14, %%xmm15, %%xmm14 \n\t" "vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r "vmulps %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulps %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r "vmulps %%xmm12, %%xmm1 , %%xmm13 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulps %%xmm12, %%xmm0 , %%xmm12 \n\t" // t_r * alpha_r , t_i * alpha_r "vmulps %%xmm14, %%xmm1 , %%xmm15 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulps %%xmm14, %%xmm0 , %%xmm14 \n\t" // t_r * alpha_r , t_i * alpha_r #if !defined(XCONJ) "vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t" "vpermilps $0xb1 , %%xmm11, %%xmm11 \n\t" "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" "vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t" "vaddsubps %%xmm11, %%xmm10, %%xmm10 \n\t" "vaddsubps %%xmm13, %%xmm12, %%xmm12 \n\t" "vaddsubps %%xmm15, %%xmm14, %%xmm14 \n\t" #else "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" "vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t" "vaddsubps %%xmm12, %%xmm13, %%xmm12 \n\t" "vaddsubps %%xmm14, %%xmm15, %%xmm14 \n\t" "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" #endif "vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t" "vaddps %%xmm10, %%xmm5 , %%xmm10 \n\t" "vaddps %%xmm12, %%xmm6 , %%xmm12 \n\t" "vaddps %%xmm14, %%xmm7 , %%xmm14 \n\t" "vmovsd %%xmm8 , (%3) \n\t" "vmovsd %%xmm10, 8(%3) \n\t" "vmovsd %%xmm12, 16(%3) \n\t" "vmovsd %%xmm14, 24(%3) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #define HAVE_KERNEL_4x2 1 static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" // temp "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp "testq $0x04, %1 \n\t" "jz 2f \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts "vfmadd231ps 
%%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" "2: \n\t" "cmpq $0, %1 \n\t" "je 3f \n\t" // ".align 16 \n\t" "1: \n\t" "prefetcht0 192(%4,%0,4) \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "prefetcht0 192(%5,%0,4) \n\t" "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 "prefetcht0 192(%2,%0,4) \n\t" "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "vmovups 32(%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 "vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "addq $16 , %0 \n\t" "subq $8 , %1 \n\t" "jnz 1b \n\t" "3: \n\t" "vbroadcastss (%6) , %%xmm0 \n\t" // value from alpha "vbroadcastss 4(%6) , %%xmm1 \n\t" // value from alpha #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" "vpermilps $0xb1 , %%ymm11, %%ymm11 \n\t" "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t" "vaddsubps %%ymm11, %%ymm10, %%ymm10 \n\t" #else "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t" "vaddsubps %%ymm10, %%ymm11, %%ymm10 \n\t" "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t" #endif "vmovsd (%3), %%xmm4 \n\t" // read y "vmovsd 8(%3), %%xmm5 \n\t" "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" "vextractf128 $1, %%ymm10, %%xmm11 \n\t" "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t" "vshufpd $0x1, %%xmm10, %%xmm10, %%xmm11 \n\t" "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t" "vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r "vmulps %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulps %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r #if !defined(XCONJ) "vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t" "vpermilps $0xb1 , %%xmm11, %%xmm11 \n\t" "vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t" "vaddsubps %%xmm11, %%xmm10, %%xmm10 \n\t" #else "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" "vpermilps $0xb1 , 
%%xmm10, %%xmm10 \n\t" "vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t" "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" #endif "vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t" "vaddps %%xmm10, %%xmm5 , %%xmm10 \n\t" "vmovsd %%xmm8 , (%3) \n\t" "vmovsd %%xmm10, 8(%3) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (alpha) // 6 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #define HAVE_KERNEL_4x1 1 static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp "testq $0x04, %1 \n\t" "jz 2f \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" "2: \n\t" "cmpq $0, %1 \n\t" "je 3f \n\t" // ".align 16 \n\t" "1: \n\t" "prefetcht0 192(%4,%0,4) \n\t" "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "prefetcht0 192(%2,%0,4) \n\t" "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 "vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "addq $16 , %0 \n\t" "subq $8 , %1 \n\t" "jnz 1b \n\t" "3: \n\t" "vbroadcastss (%5) , %%xmm0 \n\t" // value from alpha "vbroadcastss 4(%5) , %%xmm1 \n\t" // value from alpha #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t" #else "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t" "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" #endif "vmovsd (%3), %%xmm4 \n\t" // read y "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t" "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r #if !defined(XCONJ) "vpermilps 
$0xb1 , %%xmm9 , %%xmm9 \n\t" "vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t" #else "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" "vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" #endif "vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t" "vmovsd %%xmm8 , (%3) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap), // 4 "r" (alpha) // 5 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/copy.S000066400000000000000000000153711313527062700164670ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
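Both the Bulldozer (FMA4, vfmaddps) and Haswell (FMA3, vfmadd231ps) microkernels above keep two 256-bit accumulators per column, one built from the duplicated real parts of x and one from the duplicated imaginary parts (vpermilps $0xb1 plus vblendps $0x55), and their epilogues reduce each pair and fold it into a single complex value with vaddsubps before scaling by alpha with the same permute-and-addsub idiom. Below is a standalone sketch of that final alpha scaling using SSE3 intrinsics; it is not OpenBLAS code, just an illustration of the identity (build with SSE3 enabled).

#include <immintrin.h>
#include <stdio.h>

/* (t_r + i*t_i) * (a_r + i*a_i) via the addsub trick used in the kernel
   epilogues: addsub( t*a_r , swap(t*a_i) ) gives
   re = t_r*a_r - t_i*a_i  and  im = t_i*a_r + t_r*a_i. */
int main(void)
{
    __m128 t    = _mm_setr_ps(3.0f, 4.0f, 0.0f, 0.0f);  /* t_r, t_i        */
    __m128 t_ai = _mm_mul_ps(t, _mm_set1_ps(0.5f));      /* t * alpha_i     */
    __m128 t_ar = _mm_mul_ps(t, _mm_set1_ps(2.0f));      /* t * alpha_r     */
    __m128 sw   = _mm_shuffle_ps(t_ai, t_ai, 0xb1);      /* swap re/im lanes,
                                                            like vpermilps $0xb1 */
    __m128 res  = _mm_addsub_ps(t_ar, sw);               /* like vaddsubps  */

    float out[4];
    _mm_storeu_ps(out, res);
    printf("(%g, %g)\n", out[0], out[1]);                /* expect (4, 9.5) */
    return 0;
}

This is the same computation the #if !defined(XCONJ) path performs with vmulps, vpermilps and vaddsubps on the reduced xmm registers before the final vaddps into y.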
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define Y ARG4 /* rcx */ #ifndef WINDOWS_ABI #define INCY ARG5 /* r8 */ #define FLAG ARG6 #else #define INCY %r10 #define FLAG %r11 #endif #include "l1param.h" PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), INCY #endif EMMS testq N, N # if m == 0 goto End jle .L999 salq $BASE_SHIFT, INCX salq $BASE_SHIFT, INCY cmpq $SIZE, INCX # if incx != 1 jne .L100 cmpq $SIZE, INCY # if incy != 1 jne .L100 movq N, %rax # i = m sarq $3, %rax jle .L20 ALIGN_2 .L11: #ifdef XDOUBLE #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movq 0(X), %mm0 movq 8(X), %mm1 movq %mm0, 0(Y) movq %mm1, 8(Y) movq 16(X), %mm2 movq 24(X), %mm3 movq %mm2, 16(Y) movq %mm3, 24(Y) movq 32(X), %mm4 movq 40(X), %mm5 movq %mm4, 32(Y) movq %mm5, 40(Y) movq 48(X), %mm6 movq 56(X), %mm7 movq %mm6, 48(Y) movq %mm7, 56(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movq 64(X), %mm0 movq 72(X), %mm1 movq %mm0, 64(Y) movq %mm1, 72(Y) movq 80(X), %mm2 movq 88(X), %mm3 movq %mm2, 80(Y) movq %mm3, 88(Y) movq 96(X), %mm4 movq 104(X), %mm5 movq %mm4, 96(Y) movq %mm5, 104(Y) movq 112(X), %mm6 movq 120(X), %mm7 movq %mm6, 112(Y) movq %mm7, 120(Y) #elif defined(DOUBLE) movq 0(X), %mm0 movq 8(X), %mm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movq %mm0, 0(Y) movq %mm1, 8(Y) movq 16(X), %mm2 movq 24(X), %mm3 movq %mm2, 16(Y) movq %mm3, 24(Y) movq 32(X), %mm4 movq 40(X), %mm5 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movq %mm4, 32(Y) movq %mm5, 40(Y) movq 48(X), %mm6 movq 56(X), %mm7 movq %mm6, 48(Y) movq %mm7, 56(Y) #else movq 0 * SIZE(X), %mm0 movq 2 * SIZE(X), %mm2 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movq %mm0, 0 * SIZE(Y) movq %mm2, 2 * SIZE(Y) movq 4 * SIZE(X), %mm4 movq 6 * SIZE(X), %mm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movq %mm4, 4 * SIZE(Y) movq %mm6, 6 * SIZE(Y) #endif addq $8 * SIZE, X addq $8 * SIZE, Y decq %rax jg .L11 ALIGN_2 .L20: movq N, %rax andq $7, %rax jle .L99 ALIGN_2 .L21: #ifdef XDOUBLE movq 0(X), %mm0 movq 8(X), %mm1 movq %mm0, 0(Y) movq %mm1, 8(Y) #else MOVQ (X), %mm0 MOVQ %mm0, (Y) #endif addq $SIZE, X addq $SIZE, Y decq %rax jg .L21 .L99: xorq %rax,%rax EMMS ret ALIGN_3 .L100: movq N, %rax sarq $3, %rax jle .L120 ALIGN_2 .L111: #ifdef XDOUBLE movq 0(X), %mm0 movq 8(X), %mm1 addq INCX, X movq 0(X), %mm2 movq 8(X), %mm3 addq INCX, X movq 0(X), %mm4 movq 8(X), %mm5 addq INCX, X movq 0(X), %mm6 movq 8(X), %mm7 addq INCX, X movq %mm0, 0(Y) movq %mm1, 8(Y) addq INCY, Y movq %mm2, 0(Y) movq %mm3, 8(Y) addq INCY, Y movq %mm4, 0(Y) movq %mm5, 8(Y) addq INCY, Y movq %mm6, 0(Y) movq %mm7, 8(Y) addq INCY, Y movq 0(X), %mm0 movq 8(X), %mm1 addq INCX, X movq 0(X), %mm2 movq 8(X), %mm3 addq INCX, X movq 0(X), %mm4 movq 8(X), %mm5 addq INCX, X movq 0(X), %mm6 movq 8(X), %mm7 addq INCX, X movq %mm0, 0(Y) movq %mm1, 8(Y) addq INCY, Y movq %mm2, 0(Y) movq %mm3, 8(Y) addq INCY, Y movq %mm4, 0(Y) movq %mm5, 8(Y) addq INCY, Y movq %mm6, 0(Y) movq %mm7, 8(Y) addq INCY, Y #else MOVQ (X), %mm0 addq INCX, X MOVQ (X), %mm1 addq INCX, X MOVQ (X), %mm2 addq INCX, X MOVQ (X), %mm3 addq INCX, X MOVQ (X), %mm4 addq INCX, X MOVQ (X), %mm5 addq INCX, X MOVQ (X), %mm6 addq INCX, X MOVQ (X), %mm7 addq INCX, X MOVQ %mm0, (Y) addq INCY, Y MOVQ %mm1, (Y) addq INCY, Y MOVQ %mm2, (Y) addq INCY, Y MOVQ %mm3, (Y) addq 
INCY, Y MOVQ %mm4, (Y) addq INCY, Y MOVQ %mm5, (Y) addq INCY, Y MOVQ %mm6, (Y) addq INCY, Y MOVQ %mm7, (Y) addq INCY, Y #endif decq %rax jg .L111 .L120: movq N, %rax andq $7, %rax jle .L999 ALIGN_2 .L121: #ifdef XDOUBLE movq 0(X), %mm0 movq 8(X), %mm1 movq %mm0, 0(Y) movq %mm1, 8(Y) #else MOVQ (X), %mm0 MOVQ %mm0, (Y) #endif addq INCX, X addq INCY, Y decq %rax jg .L121 .L999: xorq %rax,%rax EMMS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/copy_sse.S000066400000000000000000000421531313527062700173370ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
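copy.S above and the copy_sse.S file that follows both implement the plain BLAS copy kernel, y[i*incy] = x[i*incx] for i = 0..n-1; the assembly only layers unrolling, alignment peeling and prefetching on top of that loop. copy_sse.S additionally handles the case where X and Y end up with different 16-byte alignment offsets by combining two aligned loads with shufps; for the two-float misalignment case it uses shufps $0x4e, which (in intrinsic form, _mm_shuffle_ps(lo, hi, 0x4e)) yields lo[2], lo[3], hi[0], hi[1]. A minimal C reference of the assumed kernel semantics, not an OpenBLAS entry point:

/* Reference loop for the copy kernels: strides are in elements, as in the
   BLAS call, and no aliasing between x and y is assumed. */
static void copy_ref(long n, const float *x, long incx,
                     float *y, long incy)
{
    long i;
    for (i = 0; i < n; i++)
        y[i * incy] = x[i * incx];
}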
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define Y ARG4 /* rcx */ #ifndef WINDOWS_ABI #define INCY ARG5 /* r8 */ #else #define INCY %r10 #endif #include "l1param.h" #ifdef OPTERON #define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addps OFFSET(ADDR), REG #else #define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG #endif PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), INCY #endif SAVEREGISTERS leaq (, INCX, SIZE), INCX leaq (, INCY, SIZE), INCY cmpq $SIZE, INCX jne .L50 cmpq $SIZE, INCY jne .L50 cmpq $3, M jle .L55 subq $-32 * SIZE, X subq $-32 * SIZE, Y testq $SIZE, Y je .L05 movss -32 * SIZE(X), %xmm0 movss %xmm0, -32 * SIZE(Y) addq $1 * SIZE, X addq $1 * SIZE, Y decq M ALIGN_4 .L05: testq $2 * SIZE, Y je .L10 movsd -32 * SIZE(X), %xmm0 movlps %xmm0, -32 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y subq $2, M jle .L19 ALIGN_4 .L10: testq $3 * SIZE, X jne .L20 movq M, %rax sarq $5, %rax jle .L13 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 movaps -16 * SIZE(X), %xmm4 movaps -12 * SIZE(X), %xmm5 movaps -8 * SIZE(X), %xmm6 movaps -4 * SIZE(X), %xmm7 decq %rax jle .L12 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps %xmm0, -32 * SIZE(Y) LOAD( 0 * SIZE, X, %xmm0) movaps %xmm1, -28 * SIZE(Y) LOAD( 4 * SIZE, X, %xmm1) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm2, -24 * SIZE(Y) LOAD( 8 * SIZE, X, %xmm2) movaps %xmm3, -20 * SIZE(Y) LOAD(12 * SIZE, X, %xmm3) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps %xmm4,-16 * SIZE(Y) LOAD(16 * SIZE, X, %xmm4) movaps %xmm5,-12 * SIZE(Y) LOAD(20 * SIZE, X, %xmm5) #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm6, -8 * SIZE(Y) LOAD(24 * SIZE, X, %xmm6) movaps %xmm7, -4 * SIZE(Y) LOAD(28 * SIZE, X, %xmm7) subq $-32 * SIZE, Y subq $-32 * SIZE, X decq %rax jg .L11 ALIGN_3 .L12: movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, -24 * SIZE(Y) movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, -16 * SIZE(Y) movaps %xmm5, -12 * SIZE(Y) movaps %xmm6, -8 * SIZE(Y) movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, Y subq $-32 * SIZE, X ALIGN_3 .L13: testq $16, M jle .L14 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, -24 * SIZE(Y) movaps %xmm3, -20 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L14: testq $8, M jle .L15 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L15: testq $4, M jle .L16 movaps -32 * SIZE(X), %xmm0 movaps %xmm0, -32 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L16: testq $2, M jle .L17 movsd -32 * SIZE(X), %xmm0 movlps %xmm0, -32 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L17: testq $1, M jle .L19 movss -32 * SIZE(X), %xmm0 movss %xmm0, -32 * SIZE(Y) ALIGN_3 .L19: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 .L20: testq $SIZE, X jne .L30 movhps -32 * SIZE(X), %xmm0 movq M, %rax sarq $5, %rax jle .L23 movaps -30 * SIZE(X), %xmm1 movaps -26 * SIZE(X), %xmm2 movaps -22 * SIZE(X), %xmm3 movaps -18 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -10 * SIZE(X), %xmm6 
movaps -6 * SIZE(X), %xmm7 decq %rax jle .L22 ALIGN_4 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif shufps $0x4e, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -2 * SIZE(X), %xmm0 shufps $0x4e, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps 2 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif shufps $0x4e, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps 6 * SIZE(X), %xmm2 shufps $0x4e, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps 10 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif shufps $0x4e, %xmm5, %xmm4 movaps %xmm4, -16 * SIZE(Y) movaps 14 * SIZE(X), %xmm4 shufps $0x4e, %xmm6, %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 18 * SIZE(X), %xmm5 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif shufps $0x4e, %xmm7, %xmm6 movaps %xmm6, -8 * SIZE(Y) movaps 22 * SIZE(X), %xmm6 shufps $0x4e, %xmm0, %xmm7 movaps %xmm7, -4 * SIZE(Y) movaps 26 * SIZE(X), %xmm7 subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L22: shufps $0x4e, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -2 * SIZE(X), %xmm0 shufps $0x4e, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) shufps $0x4e, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) shufps $0x4e, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) shufps $0x4e, %xmm5, %xmm4 movaps %xmm4, -16 * SIZE(Y) shufps $0x4e, %xmm6, %xmm5 movaps %xmm5, -12 * SIZE(Y) shufps $0x4e, %xmm7, %xmm6 movaps %xmm6, -8 * SIZE(Y) shufps $0x4e, %xmm0, %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L23: testq $16, M jle .L24 ALIGN_3 movaps -30 * SIZE(X), %xmm1 movaps -26 * SIZE(X), %xmm2 movaps -22 * SIZE(X), %xmm3 movaps -18 * SIZE(X), %xmm4 shufps $0x4e, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) shufps $0x4e, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) shufps $0x4e, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) shufps $0x4e, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, %xmm0 addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L24: testq $8, M jle .L25 ALIGN_3 movaps -30 * SIZE(X), %xmm1 movaps -26 * SIZE(X), %xmm2 shufps $0x4e, %xmm1, %xmm0 shufps $0x4e, %xmm2, %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L25: testq $4, M jle .L26 ALIGN_3 movaps -30 * SIZE(X), %xmm1 shufps $0x4e, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L26: testq $2, M jle .L27 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd %xmm0, -32 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L27: testq $1, M jle .L29 ALIGN_3 movss -32 * SIZE(X), %xmm0 movss %xmm0, -32 * SIZE(Y) addq $SIZE, Y ALIGN_3 .L29: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 .L30: testq $2 * SIZE, X jne .L40 movaps -33 * SIZE(X), %xmm0 movq M, %rax sarq $5, %rax jle .L33 movaps -29 * SIZE(X), %xmm1 movaps -25 * SIZE(X), %xmm2 movaps -21 * SIZE(X), %xmm3 movaps -17 * SIZE(X), %xmm4 movaps -13 * SIZE(X), %xmm5 movaps -9 * SIZE(X), %xmm6 movaps -5 * SIZE(X), %xmm7 decq %rax jle .L32 ALIGN_4 .L31: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm1, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -1 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps 3 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm3, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps 7 * SIZE(X), %xmm2 movss %xmm4, 
%xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps 11 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 movaps %xmm4, -16 * SIZE(Y) movaps 15 * SIZE(X), %xmm4 movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 19 * SIZE(X), %xmm5 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm7, %xmm6 shufps $0x39, %xmm6, %xmm6 movaps %xmm6, -8 * SIZE(Y) movaps 23 * SIZE(X), %xmm6 movss %xmm0, %xmm7 shufps $0x39, %xmm7, %xmm7 movaps %xmm7, -4 * SIZE(Y) movaps 27 * SIZE(X), %xmm7 subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L31 ALIGN_3 .L32: movss %xmm1, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -1 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -28 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -20 * SIZE(Y) movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 movaps %xmm4, -16 * SIZE(Y) movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 movaps %xmm5, -12 * SIZE(Y) movss %xmm7, %xmm6 shufps $0x39, %xmm6, %xmm6 movaps %xmm6, -8 * SIZE(Y) movss %xmm0, %xmm7 shufps $0x39, %xmm7, %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L33: testq $16, M jle .L34 ALIGN_3 movaps -29 * SIZE(X), %xmm1 movaps -25 * SIZE(X), %xmm2 movaps -21 * SIZE(X), %xmm3 movaps -17 * SIZE(X), %xmm4 movss %xmm1, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -28 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, %xmm0 addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L34: testq $8, M jle .L35 ALIGN_3 movaps -29 * SIZE(X), %xmm1 movaps -25 * SIZE(X), %xmm2 movss %xmm1, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L35: testq $4, M jle .L36 ALIGN_3 movaps -29 * SIZE(X), %xmm1 movss %xmm1, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L36: testq $2, M jle .L37 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd %xmm0, -32 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L37: testq $1, M jle .L39 ALIGN_3 movss -32 * SIZE(X), %xmm0 movss %xmm0, -32 * SIZE(Y) addq $SIZE, Y ALIGN_3 .L39: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 .L40: movaps -35 * SIZE(X), %xmm0 movq M, %rax sarq $5, %rax jle .L43 movaps -31 * SIZE(X), %xmm1 movaps -27 * SIZE(X), %xmm2 movaps -23 * SIZE(X), %xmm3 movaps -19 * SIZE(X), %xmm4 movaps -15 * SIZE(X), %xmm5 movaps -11 * SIZE(X), %xmm6 movaps -7 * SIZE(X), %xmm7 decq %rax jle .L42 ALIGN_4 .L41: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -3 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps 1 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps 5 * SIZE(X), %xmm2 movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps 9 * SIZE(X), %xmm3 #if 
defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 movaps %xmm4, -16 * SIZE(Y) movaps 13 * SIZE(X), %xmm4 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 17 * SIZE(X), %xmm5 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 movaps %xmm6, -8 * SIZE(Y) movaps 21 * SIZE(X), %xmm6 movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 movaps %xmm7, -4 * SIZE(Y) movaps 25 * SIZE(X), %xmm7 subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L41 ALIGN_3 .L42: movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -3 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 movaps %xmm4, -16 * SIZE(Y) movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 movaps %xmm5, -12 * SIZE(Y) movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 movaps %xmm6, -8 * SIZE(Y) movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L43: testq $16, M jle .L44 ALIGN_3 movaps -31 * SIZE(X), %xmm1 movaps -27 * SIZE(X), %xmm2 movaps -23 * SIZE(X), %xmm3 movaps -19 * SIZE(X), %xmm4 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, %xmm0 addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L44: testq $8, M jle .L45 ALIGN_3 movaps -31 * SIZE(X), %xmm1 movaps -27 * SIZE(X), %xmm2 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L45: testq $4, M jle .L46 ALIGN_3 movaps -31 * SIZE(X), %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L46: testq $2, M jle .L47 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd %xmm0, -32 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L47: testq $1, M jle .L49 ALIGN_3 movss -32 * SIZE(X), %xmm0 movss %xmm0, -32 * SIZE(Y) addq $SIZE, Y ALIGN_3 .L49: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_4 .L50: movq M, %rax sarq $3, %rax jle .L55 ALIGN_3 .L51: movss (X), %xmm0 addq INCX, X movss (X), %xmm1 addq INCX, X movss (X), %xmm2 addq INCX, X movss (X), %xmm3 addq INCX, X movss (X), %xmm4 addq INCX, X movss (X), %xmm5 addq INCX, X movss (X), %xmm6 addq INCX, X movss (X), %xmm7 addq INCX, X movss %xmm0, (Y) addq INCY, Y movss %xmm1, (Y) addq INCY, Y movss %xmm2, (Y) addq INCY, Y movss %xmm3, (Y) addq INCY, Y movss %xmm4, (Y) addq INCY, Y movss %xmm5, (Y) addq INCY, Y movss %xmm6, (Y) addq INCY, Y movss %xmm7, (Y) addq INCY, Y decq %rax jg .L51 ALIGN_3 .L55: movq M, %rax andq $7, %rax jle .L57 ALIGN_3 .L56: movss (X), %xmm0 addq INCX, X movss %xmm0, (Y) addq INCY, Y decq %rax jg .L56 ALIGN_3 .L57: xorq %rax, %rax RESTOREREGISTERS ret EPILOGUE 
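/* ------------------------------------------------------------------------
   Editor's note (illustrative sketch, not part of the OpenBLAS sources):
   copy_sse.S above is the unrolled SSE kernel behind the single-precision
   BLAS copy routine.  The scalar reference below shows the semantics the
   assembly implements -- y[i*inc_y] = x[i*inc_x] for i = 0..m-1 -- the
   assembly adding only alignment handling (movaps/shufps/movss), software
   prefetching, and loop unrolling on the unit-stride path.  The name
   `scopy_ref` and its exact signature are chosen for this sketch only and
   do not appear in the sources.
   ------------------------------------------------------------------------ */
#include <stddef.h>

/* Plain strided copy: the behaviour the kernel above reproduces. */
static void scopy_ref(ptrdiff_t m, const float *x, ptrdiff_t inc_x,
                      float *y, ptrdiff_t inc_y)
{
    for (ptrdiff_t i = 0; i < m; i++)
        y[i * inc_y] = x[i * inc_x];
}
/* For unit strides (inc_x == inc_y == 1) this loop corresponds to the
   aligned, unrolled path of the kernel; the strided .L50/.L55 tail above
   corresponds to the general case.  (Editorial reference only.)
   ------------------------------------------------------------------------ */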
OpenBLAS-0.2.20/kernel/x86_64/copy_sse2.S000066400000000000000000000305121313527062700174150ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define Y ARG4 /* rcx */ #ifndef WINDOWS_ABI #define INCY ARG5 /* r8 */ #else #define INCY %r10 #endif #include "l1param.h" #ifdef OPTERON #define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG #else #define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG #endif PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), INCY #endif SAVEREGISTERS leaq (, INCX, SIZE), INCX leaq (, INCY, SIZE), INCY cmpq $SIZE, INCX jne .L40 cmpq $SIZE, INCY jne .L40 #ifdef ALIGNED_ACCESS testq $SIZE, Y #else testq $SIZE, X #endif je .L10 movsd (X), %xmm0 movsd %xmm0, (Y) addq $1 * SIZE, X addq $1 * SIZE, Y decq M jle .L19 ALIGN_4 .L10: subq $-16 * SIZE, X subq $-16 * SIZE, Y #ifdef ALIGNED_ACCESS testq $SIZE, X #else testq $SIZE, Y #endif jne .L20 movq M, %rax sarq $4, %rax jle .L13 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 movaps -8 * SIZE(X), %xmm4 movaps -6 * SIZE(X), %xmm5 movaps -4 * SIZE(X), %xmm6 movaps -2 * SIZE(X), %xmm7 decq %rax jle .L12 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps %xmm0, -16 * SIZE(Y) LOAD( 0 * SIZE, X, %xmm0) movaps %xmm1, -14 * SIZE(Y) LOAD( 2 * SIZE, X, %xmm1) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm2, -12 * SIZE(Y) LOAD( 4 * SIZE, X, %xmm2) movaps %xmm3, -10 * SIZE(Y) LOAD( 6 * SIZE, X, %xmm3) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps %xmm4, -8 * SIZE(Y) LOAD( 8 * SIZE, X, %xmm4) movaps %xmm5, -6 * SIZE(Y) LOAD(10 * SIZE, X, %xmm5) #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm6, -4 * SIZE(Y) LOAD(12 * SIZE, X, %xmm6) movaps %xmm7, -2 * SIZE(Y) LOAD(14 * SIZE, X, %xmm7) subq $-16 * SIZE, Y subq $-16 * SIZE, X decq %rax jg .L11 ALIGN_3 .L12: movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) movaps %xmm2, -12 * SIZE(Y) movaps %xmm3, -10 * SIZE(Y) movaps %xmm4, -8 * SIZE(Y) movaps %xmm5, -6 * SIZE(Y) movaps %xmm6, -4 * SIZE(Y) movaps %xmm7, -2 * SIZE(Y) subq $-16 * SIZE, Y subq $-16 * SIZE, X ALIGN_3 .L13: testq $8, M jle .L14 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) movaps %xmm2, -12 * SIZE(Y) movaps %xmm3, -10 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L14: testq $4, M jle .L15 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L15: testq $2, M jle .L16 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movaps %xmm0, -16 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L16: testq $1, M jle .L19 ALIGN_3 movsd -16 * SIZE(X), %xmm0 movsd %xmm0, -16 * SIZE(Y) ALIGN_3 .L19: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 .L20: #ifdef ALIGNED_ACCESS movhps -16 * SIZE(X), %xmm0 movq M, %rax sarq $4, %rax jle .L23 movaps -15 * SIZE(X), %xmm1 movaps -13 * SIZE(X), %xmm2 movaps -11 * SIZE(X), %xmm3 movaps -9 * SIZE(X), %xmm4 movaps -7 * SIZE(X), %xmm5 movaps -5 * SIZE(X), %xmm6 movaps -3 * SIZE(X), %xmm7 decq %rax jle .L22 ALIGN_4 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif SHUFPD_1 %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(Y) LOAD(-1 * SIZE, X, 
%xmm0) SHUFPD_1 %xmm2, %xmm1 movaps %xmm1, -14 * SIZE(Y) LOAD( 1 * SIZE, X, %xmm1) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif SHUFPD_1 %xmm3, %xmm2 movaps %xmm2, -12 * SIZE(Y) LOAD( 3 * SIZE, X, %xmm2) SHUFPD_1 %xmm4, %xmm3 movaps %xmm3, -10 * SIZE(Y) LOAD( 5 * SIZE, X, %xmm3) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif SHUFPD_1 %xmm5, %xmm4 movaps %xmm4, -8 * SIZE(Y) LOAD( 7 * SIZE, X, %xmm4) SHUFPD_1 %xmm6, %xmm5 movaps %xmm5, -6 * SIZE(Y) LOAD( 9 * SIZE, X, %xmm5) #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif SHUFPD_1 %xmm7, %xmm6 movaps %xmm6, -4 * SIZE(Y) LOAD(11 * SIZE, X, %xmm6) SHUFPD_1 %xmm0, %xmm7 movaps %xmm7, -2 * SIZE(Y) LOAD(13 * SIZE, X, %xmm7) subq $-16 * SIZE, X subq $-16 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L22: SHUFPD_1 %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(Y) LOAD(-1 * SIZE, X, %xmm0) SHUFPD_1 %xmm2, %xmm1 movaps %xmm1, -14 * SIZE(Y) SHUFPD_1 %xmm3, %xmm2 movaps %xmm2, -12 * SIZE(Y) SHUFPD_1 %xmm4, %xmm3 movaps %xmm3, -10 * SIZE(Y) SHUFPD_1 %xmm5, %xmm4 movaps %xmm4, -8 * SIZE(Y) SHUFPD_1 %xmm6, %xmm5 movaps %xmm5, -6 * SIZE(Y) SHUFPD_1 %xmm7, %xmm6 movaps %xmm6, -4 * SIZE(Y) SHUFPD_1 %xmm0, %xmm7 movaps %xmm7, -2 * SIZE(Y) subq $-16 * SIZE, X subq $-16 * SIZE, Y ALIGN_3 .L23: testq $8, M jle .L24 ALIGN_3 movaps -15 * SIZE(X), %xmm1 movaps -13 * SIZE(X), %xmm2 movaps -11 * SIZE(X), %xmm3 movaps -9 * SIZE(X), %xmm8 SHUFPD_1 %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm2, %xmm1 movaps %xmm1, -14 * SIZE(Y) SHUFPD_1 %xmm3, %xmm2 movaps %xmm2, -12 * SIZE(Y) SHUFPD_1 %xmm8, %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps %xmm8, %xmm0 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L24: testq $4, M jle .L25 ALIGN_3 movaps -15 * SIZE(X), %xmm1 movaps -13 * SIZE(X), %xmm2 SHUFPD_1 %xmm1, %xmm0 SHUFPD_1 %xmm2, %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) movaps %xmm2, %xmm0 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L25: testq $2, M jle .L26 ALIGN_3 movaps -15 * SIZE(X), %xmm1 SHUFPD_1 %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L26: testq $1, M jle .L29 ALIGN_3 movsd -16 * SIZE(X), %xmm0 movsd %xmm0, -16 * SIZE(Y) ALIGN_3 .L29: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 #else movq M, %rax sarq $4, %rax jle .L23 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 movaps -8 * SIZE(X), %xmm4 movaps -6 * SIZE(X), %xmm5 movaps -4 * SIZE(X), %xmm6 movaps -2 * SIZE(X), %xmm7 decq %rax jle .L22 ALIGN_3 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) LOAD( 0 * SIZE, X, %xmm0) movlps %xmm1, -14 * SIZE(Y) movhps %xmm1, -13 * SIZE(Y) LOAD( 2 * SIZE, X, %xmm1) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movlps %xmm2, -12 * SIZE(Y) movhps %xmm2, -11 * SIZE(Y) LOAD( 4 * SIZE, X, %xmm2) movlps %xmm3, -10 * SIZE(Y) movhps %xmm3, -9 * SIZE(Y) LOAD( 6 * SIZE, X, %xmm3) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movlps %xmm4, -8 * SIZE(Y) movhps %xmm4, -7 * SIZE(Y) LOAD( 8 * SIZE, X, %xmm4) movlps %xmm5, -6 * SIZE(Y) movhps %xmm5, -5 * SIZE(Y) LOAD(10 * SIZE, X, %xmm5) #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movlps %xmm6, -4 * SIZE(Y) movhps %xmm6, -3 * SIZE(Y) LOAD(12 * SIZE, X, %xmm6) movlps %xmm7, -2 * SIZE(Y) movhps %xmm7, -1 * SIZE(Y) LOAD(14 
* SIZE, X, %xmm7) subq $-16 * SIZE, Y subq $-16 * SIZE, X decq %rax jg .L21 ALIGN_3 .L22: movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) movlps %xmm1, -14 * SIZE(Y) movhps %xmm1, -13 * SIZE(Y) movlps %xmm2, -12 * SIZE(Y) movhps %xmm2, -11 * SIZE(Y) movlps %xmm3, -10 * SIZE(Y) movhps %xmm3, -9 * SIZE(Y) movlps %xmm4, -8 * SIZE(Y) movhps %xmm4, -7 * SIZE(Y) movlps %xmm5, -6 * SIZE(Y) movhps %xmm5, -5 * SIZE(Y) movlps %xmm6, -4 * SIZE(Y) movhps %xmm6, -3 * SIZE(Y) movlps %xmm7, -2 * SIZE(Y) movhps %xmm7, -1 * SIZE(Y) subq $-16 * SIZE, Y subq $-16 * SIZE, X ALIGN_3 .L23: testq $8, M jle .L24 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) movaps -14 * SIZE(X), %xmm1 movlps %xmm1, -14 * SIZE(Y) movhps %xmm1, -13 * SIZE(Y) movaps -12 * SIZE(X), %xmm2 movlps %xmm2, -12 * SIZE(Y) movhps %xmm2, -11 * SIZE(Y) movaps -10 * SIZE(X), %xmm3 movlps %xmm3, -10 * SIZE(Y) movhps %xmm3, -9 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L24: testq $4, M jle .L25 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) movaps -14 * SIZE(X), %xmm1 movlps %xmm1, -14 * SIZE(Y) movhps %xmm1, -13 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L25: testq $2, M jle .L26 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L26: testq $1, M jle .L29 ALIGN_3 movsd -16 * SIZE(X), %xmm0 movsd %xmm0, -16 * SIZE(Y) ALIGN_3 .L29: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 #endif .L40: movq M, %rax sarq $3, %rax jle .L45 ALIGN_3 .L41: movsd (X), %xmm0 addq INCX, X movhps (X), %xmm0 addq INCX, X movsd (X), %xmm1 addq INCX, X movhps (X), %xmm1 addq INCX, X movsd (X), %xmm2 addq INCX, X movhps (X), %xmm2 addq INCX, X movsd (X), %xmm3 addq INCX, X movhps (X), %xmm3 addq INCX, X movlps %xmm0, (Y) addq INCY, Y movhps %xmm0, (Y) addq INCY, Y movlps %xmm1, (Y) addq INCY, Y movhps %xmm1, (Y) addq INCY, Y movlps %xmm2, (Y) addq INCY, Y movhps %xmm2, (Y) addq INCY, Y movlps %xmm3, (Y) addq INCY, Y movhps %xmm3, (Y) addq INCY, Y decq %rax jg .L41 ALIGN_3 .L45: movq M, %rax andq $7, %rax jle .L47 ALIGN_3 .L46: movsd (X), %xmm0 addq INCX, X movlps %xmm0, (Y) addq INCY, Y decq %rax jg .L46 ALIGN_3 .L47: xorq %rax, %rax RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/cscal.c000066400000000000000000000202611313527062700166140ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013 - 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #if defined(HASWELL) || defined(ZEN) #include "cscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "cscal_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(EXCAVATOR) #include "cscal_microk_steamroller-2.c" #elif defined(SANDYBRIDGE) #include "cscal_microk_bulldozer-2.c" #endif #if !defined(HAVE_KERNEL_16) static void cscal_kernel_16( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); static void cscal_kernel_16( BLASLONG n, FLOAT *alpha , FLOAT *x ) { BLASLONG i; FLOAT da_r = alpha[0]; FLOAT da_i = alpha[1]; FLOAT t0,t1,t2,t3; for( i=0; i 0 ) { alpha[0] = da_r; alpha[1] = da_i; cscal_kernel_inc_8(n1, alpha, x, inc_x); j = n1 ; i = n1 * inc_x; } while(j < n) { temp0 = da_r * x[i] - da_i * x[i+1]; x[i+1] = da_r * x[i+1] + da_i * x[i]; x[i] = temp0; i += inc_x ; j++; } } } return(0); } BLASLONG n1 = n & -16; if ( n1 > 0 ) { alpha[0] = da_r; alpha[1] = da_i; if ( da_r == 0.0 ) if ( da_i == 0 ) cscal_kernel_16_zero(n1 , alpha , x); else cscal_kernel_16_zero_r(n1 , alpha , x); else if ( da_i == 0 ) cscal_kernel_16_zero_i(n1 , alpha , x); else cscal_kernel_16(n1 , alpha , x); i = n1 << 1; j = n1; } if ( da_r == 0.0 ) { if ( da_i == 0.0 ) { while(j < n) { x[i]=0.0; x[i+1]=0.0; i += 2 ; j++; } } else { while(j < n) { temp0 = -da_i * x[i+1]; x[i+1] = da_i * x[i]; x[i] = temp0; i += 2 ; j++; } } } else { if ( da_i == 0.0 ) { while(j < n) { temp0 = da_r * x[i]; x[i+1] = da_r * x[i+1]; x[i] = temp0; i += 2 ; j++; } } else { BLASLONG n2 = n & -2; while(j < n2) { temp0 = da_r * x[i] - da_i * x[i+1]; temp1 = da_r * x[i+2] - da_i * x[i+3]; x[i+1] = da_r * x[i+1] + da_i * x[i]; x[i+3] = da_r * x[i+3] + da_i * x[i+2]; x[i] = temp0; x[i+2] = temp1; i += 4 ; j+=2; } while(j < n) { temp0 = da_r * x[i] - da_i * x[i+1]; x[i+1] = da_r * x[i+1] + da_i * x[i]; x[i] = temp0; i += 2 ; j++; } } } return(0); } OpenBLAS-0.2.20/kernel/x86_64/cscal_microk_bulldozer-2.c000066400000000000000000000250421313527062700224030ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014-2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16 1 static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vbroadcastss (%2), %%xmm0 \n\t" // da_r "vbroadcastss 4(%2), %%xmm1 \n\t" // da_i "addq $128, %1 \n\t" "vmovups -128(%1), %%xmm4 \n\t" "vmovups -112(%1), %%xmm5 \n\t" "vmovups -96(%1), %%xmm6 \n\t" "vmovups -80(%1), %%xmm7 \n\t" "vpermilps $0xb1 , %%xmm4, %%xmm12 \n\t" "vpermilps $0xb1 , %%xmm5, %%xmm13 \n\t" "vpermilps $0xb1 , %%xmm6, %%xmm14 \n\t" "vpermilps $0xb1 , %%xmm7, %%xmm15 \n\t" "subq $8 , %0 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" "prefetcht0 320(%1) \n\t" // ".align 2 \n\t" "vmulps %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 "vmovups -64(%1), %%xmm4 \n\t" "vmulps %%xmm0, %%xmm5 , %%xmm9 \n\t" "vmovups -48(%1), %%xmm5 \n\t" "vmulps %%xmm0, %%xmm6 , %%xmm10 \n\t" "vmovups -32(%1), %%xmm6 \n\t" "vmulps %%xmm0, %%xmm7 , %%xmm11 \n\t" "vmovups -16(%1), %%xmm7 \n\t" "vmulps %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 "vaddsubps %%xmm12 , %%xmm8 , %%xmm8 \n\t" "vmulps %%xmm1, %%xmm13, %%xmm13 \n\t" "vaddsubps %%xmm13 , %%xmm9 , %%xmm9 \n\t" "vmulps %%xmm1, %%xmm14, %%xmm14 \n\t" "vaddsubps %%xmm14 , %%xmm10, %%xmm10 \n\t" "vmulps %%xmm1, %%xmm15, %%xmm15 \n\t" "vaddsubps %%xmm15 , %%xmm11, %%xmm11 \n\t" "vmovups %%xmm8 , -128(%1) \n\t" "vmovups %%xmm9 , -112(%1) \n\t" "vpermilps $0xb1 , %%xmm4, %%xmm12 \n\t" "vpermilps $0xb1 , %%xmm5, %%xmm13 \n\t" "vmovups %%xmm10, -96(%1) \n\t" "vmovups %%xmm11, -80(%1) \n\t" "vpermilps $0xb1 , %%xmm6, %%xmm14 \n\t" "vpermilps $0xb1 , %%xmm7, %%xmm15 \n\t" "addq $64 ,%1 \n\t" "subq $8 , %0 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulps %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 "vmulps %%xmm0, %%xmm5 , %%xmm9 \n\t" "vmulps %%xmm0, %%xmm6 , %%xmm10 \n\t" "vmulps %%xmm0, %%xmm7 , %%xmm11 \n\t" "vmulps %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 "vaddsubps %%xmm12 , %%xmm8 , %%xmm8 \n\t" "vmulps %%xmm1, %%xmm13, %%xmm13 \n\t" "vaddsubps %%xmm13 , %%xmm9 , %%xmm9 \n\t" "vmulps %%xmm1, %%xmm14, %%xmm14 \n\t" "vaddsubps %%xmm14 , %%xmm10, %%xmm10 \n\t" "vmulps %%xmm1, %%xmm15, %%xmm15 \n\t" "vaddsubps %%xmm15 , %%xmm11, %%xmm11 \n\t" "vmovups %%xmm8 , -128(%1) \n\t" "vmovups %%xmm9 , -112(%1) \n\t" "vmovups %%xmm10, -96(%1) \n\t" "vmovups %%xmm11, -80(%1) \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" "vbroadcastss 4(%2), %%xmm1 \n\t" // da_i "addq $128, %1 \n\t" "vmovups -128(%1), %%xmm4 \n\t" "vmovups -112(%1), %%xmm5 \n\t" "vmovups -96(%1), %%xmm6 \n\t" "vmovups -80(%1), %%xmm7 \n\t" "vpermilps $0xb1 , %%xmm4, %%xmm12 \n\t" "vpermilps $0xb1 , %%xmm5, %%xmm13 \n\t" "vpermilps $0xb1 , %%xmm6, %%xmm14 \n\t" "vpermilps $0xb1 , %%xmm7, %%xmm15 \n\t" "subq $8 , %0 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" //"prefetcht0 128(%1) \n\t" // ".align 2 \n\t" "vmovups -64(%1), %%xmm4 \n\t" "vmovups -48(%1), %%xmm5 \n\t" "vmovups -32(%1), %%xmm6 \n\t" "vmovups -16(%1), %%xmm7 \n\t" "vmulps %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 "vaddsubps %%xmm12 , %%xmm0 , %%xmm8 \n\t" "vmulps %%xmm1, %%xmm13, %%xmm13 \n\t" "vaddsubps %%xmm13 , %%xmm0 , %%xmm9 \n\t" "vmulps %%xmm1, %%xmm14, %%xmm14 \n\t" "vaddsubps %%xmm14 , %%xmm0 , %%xmm10 \n\t" "vmulps %%xmm1, %%xmm15, %%xmm15 \n\t" "vaddsubps %%xmm15 , %%xmm0 , %%xmm11 \n\t" "vmovups %%xmm8 , -128(%1) \n\t" "vpermilps $0xb1 , %%xmm4, %%xmm12 \n\t" "vmovups %%xmm9 , -112(%1) \n\t" "vpermilps $0xb1 , %%xmm5, %%xmm13 \n\t" "vmovups %%xmm10, -96(%1) \n\t" "vpermilps $0xb1 , %%xmm6, %%xmm14 \n\t" "vmovups %%xmm11, -80(%1) \n\t" "vpermilps $0xb1 , %%xmm7, %%xmm15 \n\t" "addq $64 ,%1 \n\t" "subq $8 , %0 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulps %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 "vaddsubps %%xmm12 , %%xmm0 , %%xmm8 \n\t" "vmulps %%xmm1, %%xmm13, %%xmm13 \n\t" "vaddsubps %%xmm13 , %%xmm0 , %%xmm9 \n\t" "vmulps %%xmm1, %%xmm14, %%xmm14 \n\t" "vaddsubps %%xmm14 , %%xmm0 , %%xmm10 \n\t" "vmulps %%xmm1, %%xmm15, %%xmm15 \n\t" "vaddsubps %%xmm15 , %%xmm0 , %%xmm11 \n\t" "vmovups %%xmm8 , -128(%1) \n\t" "vmovups %%xmm9 , -112(%1) \n\t" "vmovups %%xmm10, -96(%1) \n\t" "vmovups %%xmm11, -80(%1) \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vbroadcastss (%2), %%xmm0 \n\t" // da_r "addq $128, %1 \n\t" "vmovups -128(%1), %%xmm4 \n\t" "vmovups -112(%1), %%xmm5 \n\t" "vmovups -96(%1), %%xmm6 \n\t" "vmovups -80(%1), %%xmm7 \n\t" "subq $8 , %0 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" //"prefetcht0 128(%1) \n\t" // ".align 2 \n\t" "vmulps %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 "vmovups -64(%1), %%xmm4 \n\t" "vmulps %%xmm0, %%xmm5 , %%xmm9 \n\t" "vmovups -48(%1), %%xmm5 \n\t" "vmulps %%xmm0, %%xmm6 , %%xmm10 \n\t" "vmovups -32(%1), %%xmm6 \n\t" "vmulps %%xmm0, %%xmm7 , %%xmm11 \n\t" "vmovups -16(%1), %%xmm7 \n\t" "vmovups %%xmm8 , -128(%1) \n\t" "vmovups %%xmm9 , -112(%1) \n\t" "vmovups %%xmm10, -96(%1) \n\t" "vmovups %%xmm11, -80(%1) \n\t" "addq $64 ,%1 \n\t" "subq $8 , %0 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulps %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 "vmulps %%xmm0, %%xmm5 , %%xmm9 \n\t" "vmulps %%xmm0, %%xmm6 , %%xmm10 \n\t" "vmulps %%xmm0, %%xmm7 , %%xmm11 \n\t" "vmovups %%xmm8 , -128(%1) 
\n\t" "vmovups %%xmm9 , -112(%1) \n\t" "vmovups %%xmm10, -96(%1) \n\t" "vmovups %%xmm11, -80(%1) \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" "addq $128, %1 \n\t" ".align 16 \n\t" "1: \n\t" //"prefetcht0 128(%1) \n\t" // ".align 2 \n\t" "vmovups %%xmm0 , -128(%1) \n\t" "vmovups %%xmm0 , -112(%1) \n\t" "vmovups %%xmm0 , -96(%1) \n\t" "vmovups %%xmm0 , -80(%1) \n\t" "addq $64 ,%1 \n\t" "subq $8 , %0 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/cscal_microk_haswell-2.c000066400000000000000000000250171313527062700220420ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014-2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_16 1 static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vbroadcastss (%2), %%ymm0 \n\t" // da_r "vbroadcastss 4(%2), %%ymm1 \n\t" // da_i "addq $128, %1 \n\t" "vmovups -128(%1), %%ymm4 \n\t" "vmovups -96(%1), %%ymm5 \n\t" "vmovups -64(%1), %%ymm6 \n\t" "vmovups -32(%1), %%ymm7 \n\t" "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" "subq $16, %0 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" //"prefetcht0 128(%1) \n\t" // ".align 2 \n\t" "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 "vmovups 0(%1), %%ymm4 \n\t" "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" "vmovups 32(%1), %%ymm5 \n\t" "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" "vmovups 64(%1), %%ymm6 \n\t" "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" "vmovups 96(%1), %%ymm7 \n\t" "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 "vaddsubps %%ymm12 , %%ymm8 , %%ymm8 \n\t" "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" "vaddsubps %%ymm13 , %%ymm9 , %%ymm9 \n\t" "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" "vaddsubps %%ymm14 , %%ymm10, %%ymm10 \n\t" "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" "vaddsubps %%ymm15 , %%ymm11, %%ymm11 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" "vmovups %%ymm11, -32(%1) \n\t" "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" "addq $128 ,%1 \n\t" "subq $16, %0 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 "vaddsubps %%ymm12 , %%ymm8 , %%ymm8 \n\t" "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" "vaddsubps %%ymm13 , %%ymm9 , %%ymm9 \n\t" "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" "vaddsubps %%ymm14 , %%ymm10, %%ymm10 \n\t" "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" "vaddsubps %%ymm15 , %%ymm11, %%ymm11 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vmovups %%ymm11, -32(%1) \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"0", "1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" "vbroadcastss 4(%2), %%ymm1 \n\t" // da_i "addq $128, %1 \n\t" "vmovups -128(%1), %%ymm4 \n\t" "vmovups -96(%1), %%ymm5 \n\t" "vmovups -64(%1), %%ymm6 \n\t" "vmovups -32(%1), %%ymm7 \n\t" "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" "subq $16, %0 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" //"prefetcht0 128(%1) \n\t" // ".align 2 \n\t" "vmovups 0(%1), %%ymm4 \n\t" "vmovups 32(%1), %%ymm5 \n\t" "vmovups 64(%1), %%ymm6 \n\t" "vmovups 96(%1), %%ymm7 \n\t" "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // 
da_i*x1 , da_i *x0 "vaddsubps %%ymm12 , %%ymm0 , %%ymm8 \n\t" "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" "vaddsubps %%ymm13 , %%ymm0 , %%ymm9 \n\t" "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" "vaddsubps %%ymm14 , %%ymm0 , %%ymm10 \n\t" "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" "vaddsubps %%ymm15 , %%ymm0 , %%ymm11 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" "vmovups %%ymm11, -32(%1) \n\t" "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" "addq $128 ,%1 \n\t" "subq $16, %0 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 "vaddsubps %%ymm12 , %%ymm0 , %%ymm8 \n\t" "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" "vaddsubps %%ymm13 , %%ymm0 , %%ymm9 \n\t" "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" "vaddsubps %%ymm14 , %%ymm0 , %%ymm10 \n\t" "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" "vaddsubps %%ymm15 , %%ymm0 , %%ymm11 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vmovups %%ymm11, -32(%1) \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", // "0", "1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vbroadcastss (%2), %%ymm0 \n\t" // da_r "addq $128, %1 \n\t" "vmovups -128(%1), %%ymm4 \n\t" "vmovups -96(%1), %%ymm5 \n\t" "vmovups -64(%1), %%ymm6 \n\t" "vmovups -32(%1), %%ymm7 \n\t" "subq $16, %0 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" //"prefetcht0 128(%1) \n\t" // ".align 2 \n\t" "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 "vmovups 0(%1), %%ymm4 \n\t" "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" "vmovups 32(%1), %%ymm5 \n\t" "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" "vmovups 64(%1), %%ymm6 \n\t" "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" "vmovups 96(%1), %%ymm7 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vmovups %%ymm11, -32(%1) \n\t" "addq $128 ,%1 \n\t" "subq $16, %0 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vmovups %%ymm11, -32(%1) \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" "addq $128, %1 \n\t" ".align 16 \n\t" "1: \n\t" //"prefetcht0 128(%1) \n\t" // ".align 2 \n\t" "vmovups %%ymm0 , -128(%1) \n\t" "vmovups %%ymm0 , -96(%1) \n\t" "vmovups %%ymm0 , -64(%1) \n\t" "vmovups %%ymm0 , -32(%1) \n\t" "addq $128 ,%1 \n\t" "subq $16, %0 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"0", "1", "%xmm0", "%xmm1", 
"%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/cscal_microk_steamroller-2.c000066400000000000000000000250621313527062700227340ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014-2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_16 1 static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vbroadcastss (%2), %%ymm0 \n\t" // da_r "vbroadcastss 4(%2), %%ymm1 \n\t" // da_i "addq $128, %1 \n\t" "vmovups -128(%1), %%ymm4 \n\t" "vmovups -96(%1), %%ymm5 \n\t" "vmovups -64(%1), %%ymm6 \n\t" "vmovups -32(%1), %%ymm7 \n\t" "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" "subq $16, %0 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" "prefetcht0 512(%1) \n\t" // ".align 2 \n\t" "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 "vmovups 0(%1), %%ymm4 \n\t" "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" "vmovups 32(%1), %%ymm5 \n\t" "prefetcht0 768(%1) \n\t" "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" "vmovups 64(%1), %%ymm6 \n\t" "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" "vmovups 96(%1), %%ymm7 \n\t" "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" "vaddsubps %%ymm12 , %%ymm8 , %%ymm8 \n\t" "vaddsubps %%ymm13 , %%ymm9 , %%ymm9 \n\t" "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" "vaddsubps %%ymm14 , %%ymm10, %%ymm10 \n\t" "vaddsubps %%ymm15 , %%ymm11, %%ymm11 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vmovups %%ymm11, -32(%1) \n\t" "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" "addq $128 ,%1 \n\t" "subq $16, %0 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 "vaddsubps %%ymm12 , %%ymm8 , %%ymm8 \n\t" "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" "vaddsubps %%ymm13 , %%ymm9 , %%ymm9 \n\t" "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" "vaddsubps %%ymm14 , %%ymm10, %%ymm10 \n\t" "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" "vaddsubps %%ymm15 , %%ymm11, %%ymm11 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vmovups %%ymm11, -32(%1) \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"0", "1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" "vbroadcastss 4(%2), %%ymm1 \n\t" // da_i "addq $128, %1 \n\t" "vmovups -128(%1), %%ymm4 \n\t" "vmovups -96(%1), %%ymm5 \n\t" "vmovups -64(%1), %%ymm6 \n\t" "vmovups -32(%1), %%ymm7 \n\t" "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" "subq $16, %0 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" //"prefetcht0 128(%1) \n\t" // ".align 2 \n\t" "vmovups 0(%1), %%ymm4 \n\t" "vmovups 32(%1), %%ymm5 \n\t" "vmovups 64(%1), %%ymm6 \n\t" "vmovups 96(%1), %%ymm7 \n\t" "vmulps %%ymm1, 
%%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 "vaddsubps %%ymm12 , %%ymm0 , %%ymm8 \n\t" "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" "vaddsubps %%ymm13 , %%ymm0 , %%ymm9 \n\t" "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" "vaddsubps %%ymm14 , %%ymm0 , %%ymm10 \n\t" "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" "vaddsubps %%ymm15 , %%ymm0 , %%ymm11 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vpermilps $0xb1 , %%ymm4, %%ymm12 \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vpermilps $0xb1 , %%ymm5, %%ymm13 \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vpermilps $0xb1 , %%ymm6, %%ymm14 \n\t" "vmovups %%ymm11, -32(%1) \n\t" "vpermilps $0xb1 , %%ymm7, %%ymm15 \n\t" "addq $128 ,%1 \n\t" "subq $16, %0 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 "vaddsubps %%ymm12 , %%ymm0 , %%ymm8 \n\t" "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" "vaddsubps %%ymm13 , %%ymm0 , %%ymm9 \n\t" "vmulps %%ymm1, %%ymm14, %%ymm14 \n\t" "vaddsubps %%ymm14 , %%ymm0 , %%ymm10 \n\t" "vmulps %%ymm1, %%ymm15, %%ymm15 \n\t" "vaddsubps %%ymm15 , %%ymm0 , %%ymm11 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vmovups %%ymm11, -32(%1) \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"0", "1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vbroadcastss (%2), %%ymm0 \n\t" // da_r "addq $128, %1 \n\t" "vmovups -128(%1), %%ymm4 \n\t" "vmovups -96(%1), %%ymm5 \n\t" "vmovups -64(%1), %%ymm6 \n\t" "vmovups -32(%1), %%ymm7 \n\t" "subq $16, %0 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" //"prefetcht0 128(%1) \n\t" // ".align 2 \n\t" "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 "vmovups 0(%1), %%ymm4 \n\t" "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" "vmovups 32(%1), %%ymm5 \n\t" "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" "vmovups 64(%1), %%ymm6 \n\t" "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" "vmovups 96(%1), %%ymm7 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vmovups %%ymm11, -32(%1) \n\t" "addq $128 ,%1 \n\t" "subq $16, %0 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulps %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 "vmulps %%ymm0, %%ymm5 , %%ymm9 \n\t" "vmulps %%ymm0, %%ymm6 , %%ymm10 \n\t" "vmulps %%ymm0, %%ymm7 , %%ymm11 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vmovups %%ymm11, -32(%1) \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" "addq $128, %1 \n\t" ".align 16 \n\t" "1: \n\t" //"prefetcht0 128(%1) \n\t" // ".align 2 \n\t" "vmovups %%ymm0 , -128(%1) \n\t" "vmovups %%ymm0 , -96(%1) \n\t" "vmovups %%ymm0 , -64(%1) \n\t" "vmovups %%ymm0 , -32(%1) \n\t" "addq $128 ,%1 \n\t" "subq $16, %0 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"0", 
"1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/ctrsm_kernel_LN_bulldozer.c000066400000000000000000000310731313527062700226750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include "common.h" static FLOAT dm1 = -1.; #ifdef CONJ #define GEMM_KERNEL GEMM_KERNEL_L #else #define GEMM_KERNEL GEMM_KERNEL_N #endif #if GEMM_DEFAULT_UNROLL_M == 1 #define GEMM_UNROLL_M_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_M == 2 #define GEMM_UNROLL_M_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_M == 4 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 6 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_M == 16 #define GEMM_UNROLL_M_SHIFT 4 #endif #if GEMM_DEFAULT_UNROLL_N == 1 #define GEMM_UNROLL_N_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_N == 2 #define GEMM_UNROLL_N_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_N == 4 #define GEMM_UNROLL_N_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_N == 8 #define GEMM_UNROLL_N_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_N == 16 #define GEMM_UNROLL_N_SHIFT 4 #endif #ifndef CONJ static void ctrsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); static void ctrsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) { FLOAT *c1 = c + ldc*2 ; BLASLONG n1 = n * 4; BLASLONG i=0; __asm__ __volatile__ ( " vzeroupper \n\t" " prefetcht0 (%4) \n\t" " prefetcht0 (%5) \n\t" " vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" " vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" " vxorps %%xmm10, %%xmm10, %%xmm10 \n\t" " vxorps %%xmm11, %%xmm11, %%xmm11 \n\t" " vxorps %%xmm12, %%xmm12, %%xmm12 \n\t" " vxorps %%xmm13, %%xmm13, %%xmm13 \n\t" " vxorps %%xmm14, %%xmm14, %%xmm14 \n\t" " vxorps %%xmm15, %%xmm15, %%xmm15 \n\t" " cmpq $0, %0 \n\t" " je 3f \n\t" " .align 16 \n\t" "1: \n\t" " vbroadcastss (%3,%1,4), %%xmm0 \n\t" // b0 real, b0 real " vbroadcastss 4(%3,%1,4), %%xmm1 \n\t" // b0 imag, b0 imag " vbroadcastss 8(%3,%1,4), %%xmm2 \n\t" // b1 real, b1 real " vbroadcastss 12(%3,%1,4), %%xmm3 \n\t" // b1 imag, b1 imag " vmovups (%2,%1,8), %%xmm4 \n\t" // a0 real , a0 imag " vmovups 16(%2,%1,8), %%xmm5 \n\t" // a1 real , a1 imag " vfnmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddps %%xmm9 , %%xmm1 , %%xmm4 , %%xmm9 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddps %%xmm10, %%xmm0 , %%xmm5 , %%xmm10 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddps %%xmm11, %%xmm1 , %%xmm5 , %%xmm11 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddps %%xmm12, %%xmm2 , %%xmm4 , %%xmm12 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddps %%xmm13, %%xmm3 , %%xmm4 , %%xmm13 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddps %%xmm14, %%xmm2 , %%xmm5 , %%xmm14 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddps %%xmm15, %%xmm3 , %%xmm5 , %%xmm15 \n\t" // a_real * b_imag , a_imag * b_imag " addq $4, %1 \n\t" " cmpq %1, %0 \n\t" " jnz 1b \n\t" "2: \n\t" " vshufps $0xb1 , %%xmm9 , %%xmm9, %%xmm9 \n\t" " vshufps $0xb1 , %%xmm11 , %%xmm11 , %%xmm11 \n\t" " vshufps $0xb1 , %%xmm13 , %%xmm13 , %%xmm13 \n\t" " vshufps $0xb1 , %%xmm15 , %%xmm15 , %%xmm15 \n\t" " vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" " vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t" " vaddsubps %%xmm12, %%xmm13, %%xmm12 \n\t" " vaddsubps %%xmm14, %%xmm15, %%xmm14 \n\t" " vxorps %%xmm7 , %%xmm7 , %%xmm7 \n\t" " vaddsubps %%xmm8 , %%xmm7 , %%xmm8 \n\t" " vaddsubps %%xmm10, %%xmm7 , %%xmm10 \n\t" " vaddsubps %%xmm12, %%xmm7 , %%xmm12 \n\t" " vaddsubps %%xmm14, %%xmm7 , %%xmm14 \n\t" " vmovups (%4) , %%xmm0 \n\t" " vmovups 16(%4) , %%xmm1 \n\t" " 
vmovups (%5) , %%xmm4 \n\t" " vmovups 16(%5) , %%xmm5 \n\t" " vaddps %%xmm0 , %%xmm8 , %%xmm8 \n\t" " vaddps %%xmm1 , %%xmm10, %%xmm10 \n\t" " vaddps %%xmm4 , %%xmm12, %%xmm12 \n\t" " vaddps %%xmm5 , %%xmm14, %%xmm14 \n\t" " vmovups %%xmm8 , (%4) \n\t" " vmovups %%xmm10 ,16(%4) \n\t" " vmovups %%xmm12 , (%5) \n\t" " vmovups %%xmm14 ,16(%5) \n\t" "3: \n\t" " vzeroupper \n\t" : : "r" (n1), // 0 "a" (i), // 1 "r" (a), // 2 "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 "r" (as), // 6 "r" (bs) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #endif #ifndef COMPLEX static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa, bb; int i, j, k; a += (m - 1) * m; b += (m - 1) * n; for (i = m - 1; i >= 0; i--) { aa = *(a + i); for (j = 0; j < n; j ++) { bb = *(c + i + j * ldc); bb *= aa; *b = bb; *(c + i + j * ldc) = bb; b ++; for (k = 0; k < i; k ++){ *(c + k + j * ldc) -= bb * *(a + k); } } a -= m; b -= 2 * n; } } #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa1, aa2; FLOAT bb1, bb2; FLOAT cc1, cc2; int i, j, k; ldc *= 2; a += (m - 1) * m * 2; b += (m - 1) * n * 2; for (i = m - 1; i >= 0; i--) { aa1 = *(a + i * 2 + 0); aa2 = *(a + i * 2 + 1); for (j = 0; j < n; j ++) { bb1 = *(c + i * 2 + 0 + j * ldc); bb2 = *(c + i * 2 + 1 + j * ldc); #ifndef CONJ cc1 = aa1 * bb1 - aa2 * bb2; cc2 = aa1 * bb2 + aa2 * bb1; #else cc1 = aa1 * bb1 + aa2 * bb2; cc2 = aa1 * bb2 - aa2 * bb1; #endif *(b + 0) = cc1; *(b + 1) = cc2; *(c + i * 2 + 0 + j * ldc) = cc1; *(c + i * 2 + 1 + j * ldc) = cc2; b += 2; for (k = 0; k < i; k ++){ #ifndef CONJ *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); #else *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); #endif } } a -= m * 2; b -= 4 * n; } } #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #ifdef COMPLEX FLOAT dummy2, #endif FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ BLASLONG i, j; FLOAT *aa, *cc; BLASLONG kk; #if 0 fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", m, n, k, offset); #endif j = (n >> GEMM_UNROLL_N_SHIFT); while (j > 0) { kk = m + offset; if (m & (GEMM_UNROLL_M - 1)) { for (i = 1; i < GEMM_UNROLL_M; i *= 2){ if (m & i) { aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; if (k - kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } solve(i, GEMM_UNROLL_N, aa + (kk - i) * i * COMPSIZE, b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); kk -= i; } } } i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; do { #ifdef CONJ if (k - kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); #else ctrsm_LN_solve_opt(k-kk, aa + GEMM_UNROLL_M * 
kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc, aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE); solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); #endif aa -= GEMM_UNROLL_M * k * COMPSIZE; cc -= GEMM_UNROLL_M * COMPSIZE; kk -= GEMM_UNROLL_M; i --; } while (i > 0); } b += GEMM_UNROLL_N * k * COMPSIZE; c += GEMM_UNROLL_N * ldc * COMPSIZE; j --; } if (n & (GEMM_UNROLL_N - 1)) { j = (GEMM_UNROLL_N >> 1); while (j > 0) { if (n & j) { kk = m + offset; if (m & (GEMM_UNROLL_M - 1)) { for (i = 1; i < GEMM_UNROLL_M; i *= 2){ if (m & i) { aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; if (k - kk > 0) { GEMM_KERNEL(i, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, b + j * kk * COMPSIZE, cc, ldc); } solve(i, j, aa + (kk - i) * i * COMPSIZE, b + (kk - i) * j * COMPSIZE, cc, ldc); kk -= i; } } } i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; do { if (k - kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + GEMM_UNROLL_M * kk * COMPSIZE, b + j * kk * COMPSIZE, cc, ldc); } solve(GEMM_UNROLL_M, j, aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_M) * j * COMPSIZE, cc, ldc); aa -= GEMM_UNROLL_M * k * COMPSIZE; cc -= GEMM_UNROLL_M * COMPSIZE; kk -= GEMM_UNROLL_M; i --; } while (i > 0); } b += j * k * COMPSIZE; c += j * ldc * COMPSIZE; } j >>= 1; } } return 0; } OpenBLAS-0.2.20/kernel/x86_64/ctrsm_kernel_LT_bulldozer.c000066400000000000000000000271161313527062700227060ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include "common.h" static FLOAT dm1 = -1.; #ifdef CONJ #define GEMM_KERNEL GEMM_KERNEL_L #else #define GEMM_KERNEL GEMM_KERNEL_N #endif #if GEMM_DEFAULT_UNROLL_M == 1 #define GEMM_UNROLL_M_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_M == 2 #define GEMM_UNROLL_M_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_M == 4 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 6 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_M == 16 #define GEMM_UNROLL_M_SHIFT 4 #endif #if GEMM_DEFAULT_UNROLL_N == 1 #define GEMM_UNROLL_N_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_N == 2 #define GEMM_UNROLL_N_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_N == 4 #define GEMM_UNROLL_N_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_N == 8 #define GEMM_UNROLL_N_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_N == 16 #define GEMM_UNROLL_N_SHIFT 4 #endif #ifndef CONJ static void ctrsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); static void ctrsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) { FLOAT *c1 = c + ldc*2 ; BLASLONG n1 = n * 4; BLASLONG i=0; __asm__ __volatile__ ( " vzeroupper \n\t" " prefetcht0 (%4) \n\t" " prefetcht0 (%5) \n\t" " vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" " vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" " vxorps %%xmm10, %%xmm10, %%xmm10 \n\t" " vxorps %%xmm11, %%xmm11, %%xmm11 \n\t" " vxorps %%xmm12, %%xmm12, %%xmm12 \n\t" " vxorps %%xmm13, %%xmm13, %%xmm13 \n\t" " vxorps %%xmm14, %%xmm14, %%xmm14 \n\t" " vxorps %%xmm15, %%xmm15, %%xmm15 \n\t" " cmpq $0, %0 \n\t" " je 3f \n\t" " .align 16 \n\t" "1: \n\t" " vbroadcastss (%3,%1,4), %%xmm0 \n\t" // b0 real, b0 real " vbroadcastss 4(%3,%1,4), %%xmm1 \n\t" // b0 imag, b0 imag " vbroadcastss 8(%3,%1,4), %%xmm2 \n\t" // b1 real, b1 real " vbroadcastss 12(%3,%1,4), %%xmm3 \n\t" // b1 imag, b1 imag " vmovups (%2,%1,8), %%xmm4 \n\t" // a0 real , a0 imag " vmovups 16(%2,%1,8), %%xmm5 \n\t" // a1 real , a1 imag " vfnmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddps %%xmm9 , %%xmm1 , %%xmm4 , %%xmm9 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddps %%xmm10, %%xmm0 , %%xmm5 , %%xmm10 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddps %%xmm11, %%xmm1 , %%xmm5 , %%xmm11 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddps %%xmm12, %%xmm2 , %%xmm4 , %%xmm12 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddps %%xmm13, %%xmm3 , %%xmm4 , %%xmm13 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddps %%xmm14, %%xmm2 , %%xmm5 , %%xmm14 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddps %%xmm15, %%xmm3 , %%xmm5 , %%xmm15 \n\t" // a_real * b_imag , a_imag * b_imag " addq $4, %1 \n\t" " cmpq %1, %0 \n\t" " jnz 1b \n\t" "2: \n\t" " vshufps $0xb1 , %%xmm9 , %%xmm9, %%xmm9 \n\t" " vshufps $0xb1 , %%xmm11 , %%xmm11 , %%xmm11 \n\t" " vshufps $0xb1 , %%xmm13 , %%xmm13 , %%xmm13 \n\t" " vshufps $0xb1 , %%xmm15 , %%xmm15 , %%xmm15 \n\t" " vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" " vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t" " vaddsubps %%xmm12, %%xmm13, %%xmm12 \n\t" " vaddsubps %%xmm14, %%xmm15, %%xmm14 \n\t" " vxorps %%xmm7 , 
%%xmm7 , %%xmm7 \n\t" " vaddsubps %%xmm8 , %%xmm7 , %%xmm8 \n\t" " vaddsubps %%xmm10, %%xmm7 , %%xmm10 \n\t" " vaddsubps %%xmm12, %%xmm7 , %%xmm12 \n\t" " vaddsubps %%xmm14, %%xmm7 , %%xmm14 \n\t" " vmovups (%4) , %%xmm0 \n\t" " vmovups 16(%4) , %%xmm1 \n\t" " vmovups (%5) , %%xmm4 \n\t" " vmovups 16(%5) , %%xmm5 \n\t" " vaddps %%xmm0 , %%xmm8 , %%xmm8 \n\t" " vaddps %%xmm1 , %%xmm10, %%xmm10 \n\t" " vaddps %%xmm4 , %%xmm12, %%xmm12 \n\t" " vaddps %%xmm5 , %%xmm14, %%xmm14 \n\t" " vmovups %%xmm8 , (%4) \n\t" " vmovups %%xmm10 ,16(%4) \n\t" " vmovups %%xmm12 , (%5) \n\t" " vmovups %%xmm14 ,16(%5) \n\t" "3: \n\t" " vzeroupper \n\t" : : "r" (n1), // 0 "a" (i), // 1 "r" (a), // 2 "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 "r" (as), // 6 "r" (bs) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #endif #ifndef COMPLEX static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa, bb; int i, j, k; for (i = 0; i < m; i++) { aa = *(a + i); for (j = 0; j < n; j ++) { bb = *(c + i + j * ldc); bb *= aa; *b = bb; *(c + i + j * ldc) = bb; b ++; for (k = i + 1; k < m; k ++){ *(c + k + j * ldc) -= bb * *(a + k); } } a += m; } } #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa1, aa2; FLOAT bb1, bb2; FLOAT cc1, cc2; int i, j, k; ldc *= 2; for (i = 0; i < m; i++) { aa1 = *(a + i * 2 + 0); aa2 = *(a + i * 2 + 1); for (j = 0; j < n; j ++) { bb1 = *(c + i * 2 + 0 + j * ldc); bb2 = *(c + i * 2 + 1 + j * ldc); #ifndef CONJ cc1 = aa1 * bb1 - aa2 * bb2; cc2 = aa1 * bb2 + aa2 * bb1; #else cc1 = aa1 * bb1 + aa2 * bb2; cc2 = aa1 * bb2 - aa2 * bb1; #endif *(b + 0) = cc1; *(b + 1) = cc2; *(c + i * 2 + 0 + j * ldc) = cc1; *(c + i * 2 + 1 + j * ldc) = cc2; b += 2; for (k = i + 1; k < m; k ++){ #ifndef CONJ *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); #else *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); #endif } } a += m * 2; } } #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #ifdef COMPLEX FLOAT dummy2, #endif FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ FLOAT *aa, *cc; BLASLONG kk; BLASLONG i, j, jj; #if 0 fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", m, n, k, offset); #endif jj = 0; j = (n >> GEMM_UNROLL_N_SHIFT); while (j > 0) { kk = offset; aa = a; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); while (i > 0) { #ifdef CONJ if (kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); #else ctrsm_LT_solve_opt(kk, aa, b, cc, ldc, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE); solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); #endif aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; kk += GEMM_UNROLL_M; i --; } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(i, GEMM_UNROLL_N, aa + kk * i * COMPSIZE, b 
+ kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; kk += i; } i >>= 1; } } b += GEMM_UNROLL_N * k * COMPSIZE; c += GEMM_UNROLL_N * ldc * COMPSIZE; j --; jj += GEMM_UNROLL_M; } if (n & (GEMM_UNROLL_N - 1)) { j = (GEMM_UNROLL_N >> 1); while (j > 0) { if (n & j) { kk = offset; aa = a; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); while (i > 0) { if (kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(GEMM_UNROLL_M, j, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; kk += GEMM_UNROLL_M; i --; } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { GEMM_KERNEL(i, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(i, j, aa + kk * i * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; kk += i; } i >>= 1; } } b += j * k * COMPSIZE; c += j * ldc * COMPSIZE; } j >>= 1; } } return 0; } OpenBLAS-0.2.20/kernel/x86_64/ctrsm_kernel_RN_bulldozer.c000066400000000000000000000271241313527062700227050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include "common.h" static FLOAT dm1 = -1.; #ifdef CONJ #define GEMM_KERNEL GEMM_KERNEL_R #else #define GEMM_KERNEL GEMM_KERNEL_N #endif #if GEMM_DEFAULT_UNROLL_M == 1 #define GEMM_UNROLL_M_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_M == 2 #define GEMM_UNROLL_M_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_M == 4 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 6 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_M == 16 #define GEMM_UNROLL_M_SHIFT 4 #endif #if GEMM_DEFAULT_UNROLL_N == 1 #define GEMM_UNROLL_N_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_N == 2 #define GEMM_UNROLL_N_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_N == 4 #define GEMM_UNROLL_N_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_N == 8 #define GEMM_UNROLL_N_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_N == 16 #define GEMM_UNROLL_N_SHIFT 4 #endif #ifndef CONJ static void ctrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); static void ctrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) { FLOAT *c1 = c + ldc*2 ; BLASLONG n1 = n * 4; BLASLONG i=0; __asm__ __volatile__ ( " vzeroupper \n\t" " prefetcht0 (%4) \n\t" " prefetcht0 (%5) \n\t" " vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" " vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" " vxorps %%xmm10, %%xmm10, %%xmm10 \n\t" " vxorps %%xmm11, %%xmm11, %%xmm11 \n\t" " vxorps %%xmm12, %%xmm12, %%xmm12 \n\t" " vxorps %%xmm13, %%xmm13, %%xmm13 \n\t" " vxorps %%xmm14, %%xmm14, %%xmm14 \n\t" " vxorps %%xmm15, %%xmm15, %%xmm15 \n\t" " cmpq $0, %0 \n\t" " je 3f \n\t" " .align 16 \n\t" "1: \n\t" " vbroadcastss (%3,%1,4), %%xmm0 \n\t" // b0 real, b0 real " vbroadcastss 4(%3,%1,4), %%xmm1 \n\t" // b0 imag, b0 imag " vbroadcastss 8(%3,%1,4), %%xmm2 \n\t" // b1 real, b1 real " vbroadcastss 12(%3,%1,4), %%xmm3 \n\t" // b1 imag, b1 imag " vmovups (%2,%1,8), %%xmm4 \n\t" // a0 real , a0 imag " vmovups 16(%2,%1,8), %%xmm5 \n\t" // a1 real , a1 imag " vfnmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddps %%xmm9 , %%xmm1 , %%xmm4 , %%xmm9 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddps %%xmm10, %%xmm0 , %%xmm5 , %%xmm10 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddps %%xmm11, %%xmm1 , %%xmm5 , %%xmm11 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddps %%xmm12, %%xmm2 , %%xmm4 , %%xmm12 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddps %%xmm13, %%xmm3 , %%xmm4 , %%xmm13 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddps %%xmm14, %%xmm2 , %%xmm5 , %%xmm14 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddps %%xmm15, %%xmm3 , %%xmm5 , %%xmm15 \n\t" // a_real * b_imag , a_imag * b_imag " addq $4, %1 \n\t" " cmpq %1, %0 \n\t" " jnz 1b \n\t" "2: \n\t" " vshufps $0xb1 , %%xmm9 , %%xmm9, %%xmm9 \n\t" " vshufps $0xb1 , %%xmm11 , %%xmm11 , %%xmm11 \n\t" " vshufps $0xb1 , %%xmm13 , %%xmm13 , %%xmm13 \n\t" " vshufps $0xb1 , %%xmm15 , %%xmm15 , %%xmm15 \n\t" " vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" " vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t" " vaddsubps %%xmm12, %%xmm13, %%xmm12 \n\t" " vaddsubps %%xmm14, %%xmm15, %%xmm14 \n\t" " vxorps %%xmm7 , %%xmm7 , %%xmm7 \n\t" " vaddsubps %%xmm8 , %%xmm7 , %%xmm8 \n\t" " vaddsubps %%xmm10, %%xmm7 , %%xmm10 \n\t" " vaddsubps %%xmm12, %%xmm7 , %%xmm12 \n\t" " vaddsubps %%xmm14, %%xmm7 , %%xmm14 \n\t" " vmovups (%4) , %%xmm0 \n\t" " vmovups 16(%4) , %%xmm1 \n\t" " 
vmovups (%5) , %%xmm4 \n\t" " vmovups 16(%5) , %%xmm5 \n\t" " vaddps %%xmm0 , %%xmm8 , %%xmm8 \n\t" " vaddps %%xmm1 , %%xmm10, %%xmm10 \n\t" " vaddps %%xmm4 , %%xmm12, %%xmm12 \n\t" " vaddps %%xmm5 , %%xmm14, %%xmm14 \n\t" " vmovups %%xmm8 , (%4) \n\t" " vmovups %%xmm10 ,16(%4) \n\t" " vmovups %%xmm12 , (%5) \n\t" " vmovups %%xmm14 ,16(%5) \n\t" "3: \n\t" " vzeroupper \n\t" : : "r" (n1), // 0 "a" (i), // 1 "r" (a), // 2 "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 "r" (as), // 6 "r" (bs) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #endif #ifndef COMPLEX static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa, bb; int i, j, k; for (i = 0; i < n; i++) { bb = *(b + i); for (j = 0; j < m; j ++) { aa = *(c + j + i * ldc); aa *= bb; *a = aa; *(c + j + i * ldc) = aa; a ++; for (k = i + 1; k < n; k ++){ *(c + j + k * ldc) -= aa * *(b + k); } } b += n; } } #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa1, aa2; FLOAT bb1, bb2; FLOAT cc1, cc2; int i, j, k; ldc *= 2; for (i = 0; i < n; i++) { bb1 = *(b + i * 2 + 0); bb2 = *(b + i * 2 + 1); for (j = 0; j < m; j ++) { aa1 = *(c + j * 2 + 0 + i * ldc); aa2 = *(c + j * 2 + 1 + i * ldc); #ifndef CONJ cc1 = aa1 * bb1 - aa2 * bb2; cc2 = aa1 * bb2 + aa2 * bb1; #else cc1 = aa1 * bb1 + aa2 * bb2; cc2 = -aa1 * bb2 + aa2 * bb1; #endif *(a + 0) = cc1; *(a + 1) = cc2; *(c + j * 2 + 0 + i * ldc) = cc1; *(c + j * 2 + 1 + i * ldc) = cc2; a += 2; for (k = i + 1; k < n; k ++){ #ifndef CONJ *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); #else *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); #endif } } b += n * 2; } } #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #ifdef COMPLEX FLOAT dummy2, #endif FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ FLOAT *aa, *cc; BLASLONG kk; BLASLONG i, j, jj; #if 0 fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", m, n, k, offset); #endif jj = 0; j = (n >> GEMM_UNROLL_N_SHIFT); kk = -offset; while (j > 0) { aa = a; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { do { #ifndef CONJ ctrsm_RN_solve_opt(kk, aa, b, cc, ldc, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE); solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); #else if (kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); #endif aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } while (i > 0); } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(i, GEMM_UNROLL_N, aa + kk * i * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; } i >>= 1; } } kk += GEMM_UNROLL_N; b += GEMM_UNROLL_N * k * COMPSIZE; c += GEMM_UNROLL_N * ldc * COMPSIZE; j --; jj += GEMM_UNROLL_M; } if (n & (GEMM_UNROLL_N - 1)) { j = 
(GEMM_UNROLL_N >> 1); while (j > 0) { if (n & j) { aa = a; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); while (i > 0) { if (kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(GEMM_UNROLL_M, j, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { GEMM_KERNEL(i, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(i, j, aa + kk * i * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; } i >>= 1; } } b += j * k * COMPSIZE; c += j * ldc * COMPSIZE; kk += j; } j >>= 1; } } return 0; } OpenBLAS-0.2.20/kernel/x86_64/ctrsm_kernel_RT_bulldozer.c000066400000000000000000000304611313527062700227110ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include "common.h" static FLOAT dm1 = -1.; #ifdef CONJ #define GEMM_KERNEL GEMM_KERNEL_R #else #define GEMM_KERNEL GEMM_KERNEL_N #endif #if GEMM_DEFAULT_UNROLL_M == 1 #define GEMM_UNROLL_M_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_M == 2 #define GEMM_UNROLL_M_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_M == 4 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 6 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_M == 16 #define GEMM_UNROLL_M_SHIFT 4 #endif #if GEMM_DEFAULT_UNROLL_N == 1 #define GEMM_UNROLL_N_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_N == 2 #define GEMM_UNROLL_N_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_N == 4 #define GEMM_UNROLL_N_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_N == 8 #define GEMM_UNROLL_N_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_N == 16 #define GEMM_UNROLL_N_SHIFT 4 #endif #ifndef CONJ static void ctrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); static void ctrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) { FLOAT *c1 = c + ldc*2 ; BLASLONG n1 = n * 4; BLASLONG i=0; __asm__ __volatile__ ( " vzeroupper \n\t" " prefetcht0 (%4) \n\t" " prefetcht0 (%5) \n\t" " vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" " vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" " vxorps %%xmm10, %%xmm10, %%xmm10 \n\t" " vxorps %%xmm11, %%xmm11, %%xmm11 \n\t" " vxorps %%xmm12, %%xmm12, %%xmm12 \n\t" " vxorps %%xmm13, %%xmm13, %%xmm13 \n\t" " vxorps %%xmm14, %%xmm14, %%xmm14 \n\t" " vxorps %%xmm15, %%xmm15, %%xmm15 \n\t" " cmpq $0, %0 \n\t" " je 3f \n\t" " .align 16 \n\t" "1: \n\t" " vbroadcastss (%3,%1,4), %%xmm0 \n\t" // b0 real, b0 real " vbroadcastss 4(%3,%1,4), %%xmm1 \n\t" // b0 imag, b0 imag " vbroadcastss 8(%3,%1,4), %%xmm2 \n\t" // b1 real, b1 real " vbroadcastss 12(%3,%1,4), %%xmm3 \n\t" // b1 imag, b1 imag " vmovups (%2,%1,8), %%xmm4 \n\t" // a0 real , a0 imag " vmovups 16(%2,%1,8), %%xmm5 \n\t" // a1 real , a1 imag " vfnmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddps %%xmm9 , %%xmm1 , %%xmm4 , %%xmm9 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddps %%xmm10, %%xmm0 , %%xmm5 , %%xmm10 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddps %%xmm11, %%xmm1 , %%xmm5 , %%xmm11 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddps %%xmm12, %%xmm2 , %%xmm4 , %%xmm12 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddps %%xmm13, %%xmm3 , %%xmm4 , %%xmm13 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddps %%xmm14, %%xmm2 , %%xmm5 , %%xmm14 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddps %%xmm15, %%xmm3 , %%xmm5 , %%xmm15 \n\t" // a_real * b_imag , a_imag * b_imag " addq $4, %1 \n\t" " cmpq %1, %0 \n\t" " jnz 1b \n\t" "2: \n\t" " vshufps $0xb1 , %%xmm9 , %%xmm9, %%xmm9 \n\t" " vshufps $0xb1 , %%xmm11 , %%xmm11 , %%xmm11 \n\t" " vshufps $0xb1 , %%xmm13 , %%xmm13 , %%xmm13 \n\t" " vshufps $0xb1 , %%xmm15 , %%xmm15 , %%xmm15 \n\t" " vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" " vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t" " vaddsubps %%xmm12, %%xmm13, %%xmm12 \n\t" " vaddsubps %%xmm14, %%xmm15, %%xmm14 \n\t" " vxorps %%xmm7 , %%xmm7 , %%xmm7 \n\t" " vaddsubps %%xmm8 , %%xmm7 , %%xmm8 \n\t" " vaddsubps %%xmm10, %%xmm7 , %%xmm10 \n\t" " vaddsubps %%xmm12, %%xmm7 , %%xmm12 \n\t" " vaddsubps %%xmm14, %%xmm7 , %%xmm14 \n\t" " vmovups (%4) , %%xmm0 \n\t" " vmovups 16(%4) , %%xmm1 \n\t" " 
vmovups (%5) , %%xmm4 \n\t" " vmovups 16(%5) , %%xmm5 \n\t" " vaddps %%xmm0 , %%xmm8 , %%xmm8 \n\t" " vaddps %%xmm1 , %%xmm10, %%xmm10 \n\t" " vaddps %%xmm4 , %%xmm12, %%xmm12 \n\t" " vaddps %%xmm5 , %%xmm14, %%xmm14 \n\t" " vmovups %%xmm8 , (%4) \n\t" " vmovups %%xmm10 ,16(%4) \n\t" " vmovups %%xmm12 , (%5) \n\t" " vmovups %%xmm14 ,16(%5) \n\t" "3: \n\t" " vzeroupper \n\t" : : "r" (n1), // 0 "a" (i), // 1 "r" (a), // 2 "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 "r" (as), // 6 "r" (bs) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #endif #ifndef COMPLEX static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa, bb; int i, j, k; a += (n - 1) * m; b += (n - 1) * n; for (i = n - 1; i >= 0; i--) { bb = *(b + i); for (j = 0; j < m; j ++) { aa = *(c + j + i * ldc); aa *= bb; *a = aa; *(c + j + i * ldc) = aa; a ++; for (k = 0; k < i; k ++){ *(c + j + k * ldc) -= aa * *(b + k); } } b -= n; a -= 2 * m; } } #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa1, aa2; FLOAT bb1, bb2; FLOAT cc1, cc2; int i, j, k; ldc *= 2; a += (n - 1) * m * 2; b += (n - 1) * n * 2; for (i = n - 1; i >= 0; i--) { bb1 = *(b + i * 2 + 0); bb2 = *(b + i * 2 + 1); for (j = 0; j < m; j ++) { aa1 = *(c + j * 2 + 0 + i * ldc); aa2 = *(c + j * 2 + 1 + i * ldc); #ifndef CONJ cc1 = aa1 * bb1 - aa2 * bb2; cc2 = aa1 * bb2 + aa2 * bb1; #else cc1 = aa1 * bb1 + aa2 * bb2; cc2 = - aa1 * bb2 + aa2 * bb1; #endif *(a + 0) = cc1; *(a + 1) = cc2; *(c + j * 2 + 0 + i * ldc) = cc1; *(c + j * 2 + 1 + i * ldc) = cc2; a += 2; for (k = 0; k < i; k ++){ #ifndef CONJ *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); #else *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); #endif } } b -= n * 2; a -= 4 * m; } } #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #ifdef COMPLEX FLOAT dummy2, #endif FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ BLASLONG i, j; FLOAT *aa, *cc; BLASLONG kk; #if 0 fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", m, n, k, offset); #endif kk = n - offset; c += n * ldc * COMPSIZE; b += n * k * COMPSIZE; if (n & (GEMM_UNROLL_N - 1)) { j = 1; while (j < GEMM_UNROLL_N) { if (n & j) { aa = a; b -= j * k * COMPSIZE; c -= j * ldc* COMPSIZE; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { do { if (k - kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + GEMM_UNROLL_M * kk * COMPSIZE, b + j * kk * COMPSIZE, cc, ldc); } solve(GEMM_UNROLL_M, j, aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, b + (kk - j) * j * COMPSIZE, cc, ldc); aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } while (i > 0); } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); do { if (m & i) { if (k - kk > 0) { GEMM_KERNEL(i, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, b + j * kk * COMPSIZE, cc, ldc); } solve(i, j, aa + (kk - j) * i * COMPSIZE, b + (kk - j) * j * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; } i >>= 1; } while (i > 0); } kk -= j; } j <<= 1; } } j = (n >> GEMM_UNROLL_N_SHIFT); if (j > 0) { do { aa = a; b -= GEMM_UNROLL_N * k * COMPSIZE; c -= 
GEMM_UNROLL_N * ldc * COMPSIZE; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { do { #ifndef CONJ ctrsm_RT_solve_opt(k-kk, aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc, aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE); solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); #else if (k - kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); #endif aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } while (i > 0); } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); do { if (m & i) { if (k - kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } solve(i, GEMM_UNROLL_N, aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; } i >>= 1; } while (i > 0); } kk -= GEMM_UNROLL_N; j --; } while (j > 0); } return 0; } OpenBLAS-0.2.20/kernel/x86_64/daxpy.c000066400000000000000000000065641313527062700166660ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" #if defined(NEHALEM) #include "daxpy_microk_nehalem-2.c" #elif defined(BULLDOZER) #include "daxpy_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(EXCAVATOR) #include "daxpy_microk_steamroller-2.c" #elif defined(PILEDRIVER) #include "daxpy_microk_piledriver-2.c" #elif defined(HASWELL) || defined(ZEN) #include "daxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "daxpy_microk_sandy-2.c" #endif #ifndef HAVE_KERNEL_8 static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; FLOAT a = *alpha; while(i < n) { y[i] += a * x[i]; y[i+1] += a * x[i+1]; y[i+2] += a * x[i+2]; y[i+3] += a * x[i+3]; y[i+4] += a * x[i+4]; y[i+5] += a * x[i+5]; y[i+6] += a * x[i+6]; y[i+7] += a * x[i+7]; i+=8 ; } } #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; BLASLONG ix=0,iy=0; if ( n <= 0 ) return(0); if ( (inc_x == 1) && (inc_y == 1) ) { BLASLONG n1 = n & -16; if ( n1 ) daxpy_kernel_8(n1, x, y , &da ); i = n1; while(i < n) { y[i] += da * x[i] ; i++ ; } return(0); } BLASLONG n1 = n & -4; while(i < n1) { FLOAT m1 = da * x[ix] ; FLOAT m2 = da * x[ix+inc_x] ; FLOAT m3 = da * x[ix+2*inc_x] ; FLOAT m4 = da * x[ix+3*inc_x] ; y[iy] += m1 ; y[iy+inc_y] += m2 ; y[iy+2*inc_y] += m3 ; y[iy+3*inc_y] += m4 ; ix += inc_x*4 ; iy += inc_y*4 ; i+=4 ; } while(i < n) { y[iy] += da * x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } return(0); } OpenBLAS-0.2.20/kernel/x86_64/daxpy_bulldozer.S000066400000000000000000000211561313527062700207220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 #define X ARG4 #define INCX ARG5 #define Y ARG6 #define INCY ARG2 #else #define M ARG1 #define X ARG2 #define INCX ARG3 #define Y ARG4 #define INCY %r10 #endif #define YY %r11 #define ALPHA %xmm15 #define A_PRE 640 #include "l1param.h" PROLOGUE PROFCODE #ifndef WINDOWS_ABI #ifndef XDOUBLE movq 8(%rsp), INCY #else movq 24(%rsp), INCY #endif vmovups %xmm0, ALPHA #else vmovups %xmm3, ALPHA movq 40(%rsp), X movq 48(%rsp), INCX movq 56(%rsp), Y movq 64(%rsp), INCY #endif SAVEREGISTERS unpcklpd ALPHA, ALPHA leaq (, INCX, SIZE), INCX leaq (, INCY, SIZE), INCY testq M, M jle .L47 cmpq $SIZE, INCX jne .L40 cmpq $SIZE, INCY jne .L40 testq $SIZE, Y je .L10 movsd (X), %xmm0 mulsd ALPHA, %xmm0 addsd (Y), %xmm0 movsd %xmm0, (Y) addq $1 * SIZE, X addq $1 * SIZE, Y decq M jle .L19 ALIGN_4 .L10: subq $-16 * SIZE, X subq $-16 * SIZE, Y movq M, %rax sarq $4, %rax jle .L13 vmovups -16 * SIZE(X), %xmm0 vmovups -14 * SIZE(X), %xmm1 vmovups -12 * SIZE(X), %xmm2 vmovups -10 * SIZE(X), %xmm3 decq %rax jle .L12 ALIGN_3 .L11: prefetchnta A_PRE(Y) vmovups -8 * SIZE(X), %xmm4 vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1 vmovups -6 * SIZE(X), %xmm5 vmovups -4 * SIZE(X), %xmm6 vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2 vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3 vmovups -2 * SIZE(X), %xmm7 vmovups %xmm0, -16 * SIZE(Y) vmovups %xmm1, -14 * SIZE(Y) prefetchnta A_PRE(X) nop vmovups %xmm2, -12 * SIZE(Y) vmovups %xmm3, -10 * SIZE(Y) prefetchnta A_PRE+64(Y) vmovups 0 * SIZE(X), %xmm0 vfmaddpd -8 * SIZE(Y), ALPHA, %xmm4 , %xmm4 vfmaddpd -6 * SIZE(Y), ALPHA, %xmm5 , %xmm5 vmovups 2 * SIZE(X), %xmm1 vmovups 4 * SIZE(X), %xmm2 vfmaddpd -4 * SIZE(Y), ALPHA, %xmm6 , %xmm6 vfmaddpd -2 * SIZE(Y), ALPHA, %xmm7 , %xmm7 vmovups 6 * SIZE(X), %xmm3 vmovups %xmm4, -8 * SIZE(Y) vmovups %xmm5, -6 * SIZE(Y) prefetchnta A_PRE+64(X) nop vmovups %xmm6, -4 * SIZE(Y) vmovups %xmm7, -2 * SIZE(Y) subq $-16 * SIZE, Y subq $-16 * SIZE, X decq %rax jg .L11 ALIGN_3 .L12: vmovups -8 * SIZE(X), %xmm4 vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1 vmovups -6 * SIZE(X), %xmm5 vmovups -4 * SIZE(X), %xmm6 vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2 vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3 vmovups -2 * SIZE(X), %xmm7 vmovups %xmm0, -16 * SIZE(Y) vmovups %xmm1, -14 * SIZE(Y) vmovups %xmm2, -12 * SIZE(Y) vmovups %xmm3, -10 * SIZE(Y) vfmaddpd -8 * SIZE(Y), ALPHA, %xmm4 , %xmm4 vfmaddpd -6 * SIZE(Y), ALPHA, %xmm5 , %xmm5 vfmaddpd -4 * SIZE(Y), ALPHA, %xmm6 , %xmm6 vfmaddpd -2 * SIZE(Y), ALPHA, %xmm7 , %xmm7 vmovups %xmm4, -8 * SIZE(Y) vmovups %xmm5, -6 * SIZE(Y) vmovups %xmm6, -4 * SIZE(Y) vmovups %xmm7, -2 * SIZE(Y) subq $-16 * SIZE, Y subq $-16 * SIZE, X ALIGN_3 .L13: movq M, %rax andq $8, %rax jle .L14 ALIGN_3 vmovups -16 * SIZE(X), %xmm0 vmovups -14 * SIZE(X), %xmm1 vmovups -12 * SIZE(X), %xmm2 vmovups -10 * SIZE(X), %xmm3 vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1 vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2 vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3 vmovups %xmm0, -16 * SIZE(Y) vmovups %xmm1, -14 * SIZE(Y) vmovups %xmm2, -12 * SIZE(Y) vmovups %xmm3, -10 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L14: movq M, %rax andq $4, %rax jle .L15 ALIGN_3 vmovups -16 * SIZE(X), %xmm0 vmovups -14 * SIZE(X), %xmm1 vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 vfmaddpd -14 * 
SIZE(Y), ALPHA, %xmm1 , %xmm1 vmovups %xmm0, -16 * SIZE(Y) vmovups %xmm1, -14 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L15: movq M, %rax andq $2, %rax jle .L16 ALIGN_3 vmovups -16 * SIZE(X), %xmm0 vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 vmovups %xmm0, -16 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L16: movq M, %rax andq $1, %rax jle .L19 ALIGN_3 vmovsd -16 * SIZE(X), %xmm0 vfmaddsd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 vmovsd %xmm0, -16 * SIZE(Y) ALIGN_3 .L19: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 .L40: movq Y, YY movq M, %rax //If incx==0 || incy==0, avoid unloop. cmpq $0, INCX je .L46 cmpq $0, INCY je .L46 sarq $3, %rax jle .L45 prefetchnta 512(X) prefetchnta 512+64(X) prefetchnta 512+128(X) prefetchnta 512+192(X) prefetchnta 512(Y) prefetchnta 512+64(Y) prefetchnta 512+128(Y) prefetchnta 512+192(Y) ALIGN_3 .L41: vmovsd 0 * SIZE(X), %xmm0 addq INCX, X vmovhpd 0 * SIZE(X), %xmm0 , %xmm0 addq INCX, X vmovsd 0 * SIZE(YY), %xmm6 addq INCY, YY vmovhpd 0 * SIZE(YY), %xmm6 , %xmm6 addq INCY, YY vmovsd 0 * SIZE(X), %xmm1 addq INCX, X vmovhpd 0 * SIZE(X), %xmm1 , %xmm1 addq INCX, X vmovsd 0 * SIZE(YY), %xmm7 addq INCY, YY vmovhpd 0 * SIZE(YY), %xmm7 , %xmm7 addq INCY, YY vfmaddpd %xmm6 , ALPHA , %xmm0 , %xmm0 vmovsd 0 * SIZE(X), %xmm2 addq INCX, X vmovhpd 0 * SIZE(X), %xmm2 , %xmm2 addq INCX, X vmovsd 0 * SIZE(YY), %xmm8 addq INCY, YY vmovhpd 0 * SIZE(YY), %xmm8 , %xmm8 addq INCY, YY vfmaddpd %xmm7 , ALPHA , %xmm1 , %xmm1 vmovsd 0 * SIZE(X), %xmm3 addq INCX, X vmovhpd 0 * SIZE(X), %xmm3 , %xmm3 addq INCX, X vfmaddpd %xmm8 , ALPHA , %xmm2 , %xmm2 vmovsd 0 * SIZE(YY), %xmm9 addq INCY, YY vmovhpd 0 * SIZE(YY), %xmm9 , %xmm9 addq INCY, YY vmovsd %xmm0, 0 * SIZE(Y) addq INCY, Y vmovhpd %xmm0, 0 * SIZE(Y) addq INCY, Y vmovsd %xmm1, 0 * SIZE(Y) addq INCY, Y vmovhpd %xmm1, 0 * SIZE(Y) addq INCY, Y vmovsd %xmm2, 0 * SIZE(Y) addq INCY, Y vmovhpd %xmm2, 0 * SIZE(Y) addq INCY, Y vfmaddpd %xmm9 , ALPHA , %xmm3 , %xmm3 vmovsd %xmm3, 0 * SIZE(Y) addq INCY, Y vmovhpd %xmm3, 0 * SIZE(Y) addq INCY, Y decq %rax jg .L41 ALIGN_3 .L45: movq M, %rax andq $7, %rax jle .L47 ALIGN_3 .L46: vmovsd (X), %xmm0 addq INCX, X vfmaddsd (Y) , ALPHA , %xmm0 , %xmm0 vmovsd %xmm0, (Y) addq INCY, Y decq %rax jg .L46 ALIGN_3 .L47: xorq %rax, %rax RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/daxpy_microk_bulldozer-2.c000066400000000000000000000066231313527062700224470ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_8 1 static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vmovddup (%4), %%xmm0 \n\t" // alpha ".align 16 \n\t" "1: \n\t" "prefetcht0 768(%3,%0,8) \n\t" "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x "vfmaddpd (%3,%0,8), %%xmm0 , %%xmm12, %%xmm8 \n\t" // y += alpha * x "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x ".align 2 \n\t" "vmovups %%xmm8 , (%3,%0,8) \n\t" "vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm13, %%xmm9 \n\t" // y += alpha * x ".align 2 \n\t" "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x "vmovups %%xmm9 , 16(%3,%0,8) \n\t" "prefetcht0 768(%2,%0,8) \n\t" ".align 2 \n\t" "vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm14, %%xmm10 \n\t" // y += alpha * x "vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x "vmovups %%xmm10, 32(%3,%0,8) \n\t" "vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm15, %%xmm11 \n\t" // y += alpha * x "vmovups %%xmm11, 48(%3,%0,8) \n\t" "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" "jnz 1b \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 : "cc", "%xmm0", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/daxpy_microk_haswell-2.c000066400000000000000000000062521313527062700221020ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_8 1 static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vbroadcastsd (%4), %%ymm0 \n\t" // alpha ".align 16 \n\t" "1: \n\t" "vmovups (%3,%0,8), %%ymm12 \n\t" // 4 * y "vmovups 32(%3,%0,8), %%ymm13 \n\t" // 4 * y "vmovups 64(%3,%0,8), %%ymm14 \n\t" // 4 * y "vmovups 96(%3,%0,8), %%ymm15 \n\t" // 4 * y "vfmadd231pd (%2,%0,8), %%ymm0 , %%ymm12 \n\t" // y += alpha * x "vfmadd231pd 32(%2,%0,8), %%ymm0 , %%ymm13 \n\t" // y += alpha * x "vfmadd231pd 64(%2,%0,8), %%ymm0 , %%ymm14 \n\t" // y += alpha * x "vfmadd231pd 96(%2,%0,8), %%ymm0 , %%ymm15 \n\t" // y += alpha * x "vmovups %%ymm12, (%3,%0,8) \n\t" "vmovups %%ymm13, 32(%3,%0,8) \n\t" "vmovups %%ymm14, 64(%3,%0,8) \n\t" "vmovups %%ymm15, 96(%3,%0,8) \n\t" "addq $16, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 : "cc", "%xmm0", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/daxpy_microk_nehalem-2.c000066400000000000000000000072761313527062700220630ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_8 1 static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "movsd (%4), %%xmm0 \n\t" // alpha "shufpd $0, %%xmm0, %%xmm0 \n\t" ".align 16 \n\t" "1: \n\t" // "prefetcht0 192(%2,%0,8) \n\t" // "prefetcht0 192(%3,%0,8) \n\t" "movups (%2,%0,8), %%xmm12 \n\t" // 2 * x "movups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x "movups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x "movups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x "movups (%3,%0,8), %%xmm8 \n\t" // 2 * y "movups 16(%3,%0,8), %%xmm9 \n\t" // 2 * y "movups 32(%3,%0,8), %%xmm10 \n\t" // 2 * y "movups 48(%3,%0,8), %%xmm11 \n\t" // 2 * y "mulpd %%xmm0 , %%xmm12 \n\t" // alpha * x "mulpd %%xmm0 , %%xmm13 \n\t" "mulpd %%xmm0 , %%xmm14 \n\t" "mulpd %%xmm0 , %%xmm15 \n\t" "addpd %%xmm12, %%xmm8 \n\t" // y += alpha *x "addpd %%xmm13, %%xmm9 \n\t" "addpd %%xmm14, %%xmm10 \n\t" "addpd %%xmm15, %%xmm11 \n\t" "movups %%xmm8 , (%3,%0,8) \n\t" "movups %%xmm9 , 16(%3,%0,8) \n\t" "movups %%xmm10, 32(%3,%0,8) \n\t" "movups %%xmm11, 48(%3,%0,8) \n\t" "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" "jnz 1b \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 : "cc", "%xmm0", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/daxpy_microk_piledriver-2.c000066400000000000000000000145541313527062700226140ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_8 1 static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; if ( n < 640 ) { __asm__ __volatile__ ( "vmovddup (%4), %%xmm0 \n\t" // alpha ".align 16 \n\t" "1: \n\t" "vmovups (%3,%0,8), %%xmm8 \n\t" // 2 y "vmovups 16(%3,%0,8), %%xmm9 \n\t" // 2 y "vmovups 32(%3,%0,8), %%xmm10 \n\t" // 2 y "vmovups 48(%3,%0,8), %%xmm11 \n\t" // 2 y "vmovups 64(%3,%0,8), %%xmm12 \n\t" // 2 y "vmovups 80(%3,%0,8), %%xmm13 \n\t" // 2 y "vmovups 96(%3,%0,8), %%xmm14 \n\t" // 2 y "vmovups 112(%3,%0,8), %%xmm15 \n\t" // 2 y "vfmadd231pd (%2,%0,8), %%xmm0 , %%xmm8 \n\t" // y += alpha * x "vfmadd231pd 16(%2,%0,8), %%xmm0 , %%xmm9 \n\t" // y += alpha * x "vfmadd231pd 32(%2,%0,8), %%xmm0 , %%xmm10 \n\t" // y += alpha * x "vfmadd231pd 48(%2,%0,8), %%xmm0 , %%xmm11 \n\t" // y += alpha * x "vfmadd231pd 64(%2,%0,8), %%xmm0 , %%xmm12 \n\t" // y += alpha * x "vfmadd231pd 80(%2,%0,8), %%xmm0 , %%xmm13 \n\t" // y += alpha * x "vfmadd231pd 96(%2,%0,8), %%xmm0 , %%xmm14 \n\t" // y += alpha * x "vfmadd231pd 112(%2,%0,8), %%xmm0 , %%xmm15 \n\t" // y += alpha * x "vmovups %%xmm8 , (%3,%0,8) \n\t" "vmovups %%xmm9 , 16(%3,%0,8) \n\t" "vmovups %%xmm10, 32(%3,%0,8) \n\t" "vmovups %%xmm11, 48(%3,%0,8) \n\t" "vmovups %%xmm12, 64(%3,%0,8) \n\t" "vmovups %%xmm13, 80(%3,%0,8) \n\t" "vmovups %%xmm14, 96(%3,%0,8) \n\t" "vmovups %%xmm15,112(%3,%0,8) \n\t" "addq $16, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 : "cc", "%xmm0", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); return; } __asm__ __volatile__ ( "vmovddup (%4), %%xmm0 \n\t" // alpha ".align 16 \n\t" "1: \n\t" "prefetcht0 512(%3,%0,8) \n\t" "vmovups (%3,%0,8), %%xmm8 \n\t" // 2 y "vmovups 16(%3,%0,8), %%xmm9 \n\t" // 2 y "vmovups 32(%3,%0,8), %%xmm10 \n\t" // 2 y "vmovups 48(%3,%0,8), %%xmm11 \n\t" // 2 y "prefetcht0 576(%3,%0,8) \n\t" "vmovups 64(%3,%0,8), %%xmm12 \n\t" // 2 y "vmovups 80(%3,%0,8), %%xmm13 \n\t" // 2 y "vmovups 96(%3,%0,8), %%xmm14 \n\t" // 2 y "vmovups 112(%3,%0,8), %%xmm15 \n\t" // 2 y "prefetcht0 512(%2,%0,8) \n\t" "vfmadd231pd (%2,%0,8), %%xmm0 , %%xmm8 \n\t" // y += alpha * x "vfmadd231pd 16(%2,%0,8), %%xmm0 , %%xmm9 \n\t" // y += alpha * x "vfmadd231pd 32(%2,%0,8), %%xmm0 , %%xmm10 \n\t" // y += alpha * x "vfmadd231pd 48(%2,%0,8), %%xmm0 , %%xmm11 \n\t" // y += alpha * x "prefetcht0 576(%2,%0,8) \n\t" "vfmadd231pd 64(%2,%0,8), %%xmm0 , %%xmm12 \n\t" // y += alpha * x "vfmadd231pd 80(%2,%0,8), %%xmm0 , %%xmm13 \n\t" // y += alpha * x "vfmadd231pd 96(%2,%0,8), %%xmm0 , %%xmm14 \n\t" // y += alpha * x "vfmadd231pd 112(%2,%0,8), %%xmm0 , %%xmm15 \n\t" // y += alpha * x "vmovups %%xmm8 , (%3,%0,8) \n\t" "vmovups %%xmm9 , 16(%3,%0,8) \n\t" "vmovups %%xmm10, 32(%3,%0,8) \n\t" "vmovups %%xmm11, 48(%3,%0,8) \n\t" "vmovups %%xmm12, 64(%3,%0,8) \n\t" "vmovups %%xmm13, 80(%3,%0,8) \n\t" "vmovups %%xmm14, 96(%3,%0,8) \n\t" "vmovups %%xmm15,112(%3,%0,8) \n\t" "addq $16, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 : "cc", "%xmm0", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } 
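All of the daxpy_microk_*-2.c variants collected above implement one contract: y += alpha * x over unit-stride double vectors, where the caller has already rounded n down to a multiple of the kernel's unroll width (8 or 16 doubles per loop iteration, depending on the variant). The sketch below is an illustrative portable reference only, not code shipped in this tree; it mirrors the generic-fallback style used by ddot.c further down, the name daxpy_kernel_8_ref is hypothetical, and BLASLONG/FLOAT are assumed to be the types provided by common.h.

#include "common.h"   /* BLASLONG, FLOAT (assumed; illustrative sketch only) */

/* Portable reference for what every daxpy_kernel_8 variant computes. */
static void daxpy_kernel_8_ref(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	BLASLONG i = 0;
	FLOAT a = *alpha;   /* alpha is passed by pointer, as in the asm kernels */

	while (i < n) {
		/* one 8-element block, the unit the vectorized kernels step by */
		y[i]   += a * x[i];
		y[i+1] += a * x[i+1];
		y[i+2] += a * x[i+2];
		y[i+3] += a * x[i+3];
		y[i+4] += a * x[i+4];
		y[i+5] += a * x[i+5];
		y[i+6] += a * x[i+6];
		y[i+7] += a * x[i+7];
		i += 8;
	}
}

The per-architecture files differ only in vector width (SSE2 mulpd/addpd, AVX, or FMA), in how many of these 8-element blocks they process per iteration, and in whether they add software prefetch for large n.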
OpenBLAS-0.2.20/kernel/x86_64/daxpy_microk_sandy-2.c000066400000000000000000000104601313527062700215550ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_8 1 static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vbroadcastsd (%4), %%ymm0 \n\t" // alpha "vmovups (%3,%0,8), %%ymm8 \n\t" "vmovups 32(%3,%0,8), %%ymm9 \n\t" "vmovups 64(%3,%0,8), %%ymm10 \n\t" "vmovups 96(%3,%0,8), %%ymm11 \n\t" "vmovups (%2,%0,8), %%ymm4 \n\t" "vmovups 32(%2,%0,8), %%ymm5 \n\t" "vmovups 64(%2,%0,8), %%ymm6 \n\t" "vmovups 96(%2,%0,8), %%ymm7 \n\t" "addq $16, %0 \n\t" "subq $16, %1 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" "vmulpd %%ymm4, %%ymm0, %%ymm4 \n\t" "vaddpd %%ymm8 , %%ymm4, %%ymm12 \n\t" "vmulpd %%ymm5, %%ymm0, %%ymm5 \n\t" "vaddpd %%ymm9 , %%ymm5, %%ymm13 \n\t" "vmulpd %%ymm6, %%ymm0, %%ymm6 \n\t" "vaddpd %%ymm10, %%ymm6, %%ymm14 \n\t" "vmulpd %%ymm7, %%ymm0, %%ymm7 \n\t" "vaddpd %%ymm11, %%ymm7, %%ymm15 \n\t" "vmovups (%3,%0,8), %%ymm8 \n\t" "vmovups 32(%3,%0,8), %%ymm9 \n\t" "vmovups 64(%3,%0,8), %%ymm10 \n\t" "vmovups 96(%3,%0,8), %%ymm11 \n\t" "vmovups (%2,%0,8), %%ymm4 \n\t" "vmovups 32(%2,%0,8), %%ymm5 \n\t" "vmovups 64(%2,%0,8), %%ymm6 \n\t" "vmovups 96(%2,%0,8), %%ymm7 \n\t" "vmovups %%ymm12, -128(%3,%0,8) \n\t" "vmovups %%ymm13, -96(%3,%0,8) \n\t" "vmovups %%ymm14, -64(%3,%0,8) \n\t" "vmovups %%ymm15, -32(%3,%0,8) \n\t" "addq $16, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulpd %%ymm4, %%ymm0, %%ymm4 \n\t" "vmulpd %%ymm5, %%ymm0, %%ymm5 \n\t" "vmulpd %%ymm6, %%ymm0, %%ymm6 \n\t" "vmulpd %%ymm7, %%ymm0, %%ymm7 \n\t" "vaddpd %%ymm8 , %%ymm4, %%ymm12 \n\t" "vaddpd %%ymm9 , %%ymm5, %%ymm13 \n\t" "vaddpd %%ymm10, %%ymm6, %%ymm14 \n\t" "vaddpd %%ymm11, %%ymm7, %%ymm15 \n\t" "vmovups %%ymm12, 
-128(%3,%0,8) \n\t" "vmovups %%ymm13, -96(%3,%0,8) \n\t" "vmovups %%ymm14, -64(%3,%0,8) \n\t" "vmovups %%ymm15, -32(%3,%0,8) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/daxpy_microk_steamroller-2.c000066400000000000000000000145551313527062700230010ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_8 1 static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; if ( n < 2048 ) { __asm__ __volatile__ ( "vmovddup (%4), %%xmm0 \n\t" // alpha ".align 16 \n\t" "1: \n\t" "vmovups (%3,%0,8), %%xmm8 \n\t" // 2 y "vmovups 16(%3,%0,8), %%xmm9 \n\t" // 2 y "vmovups 32(%3,%0,8), %%xmm10 \n\t" // 2 y "vmovups 48(%3,%0,8), %%xmm11 \n\t" // 2 y "vmovups 64(%3,%0,8), %%xmm12 \n\t" // 2 y "vmovups 80(%3,%0,8), %%xmm13 \n\t" // 2 y "vmovups 96(%3,%0,8), %%xmm14 \n\t" // 2 y "vmovups 112(%3,%0,8), %%xmm15 \n\t" // 2 y "vfmadd231pd (%2,%0,8), %%xmm0 , %%xmm8 \n\t" // y += alpha * x "vfmadd231pd 16(%2,%0,8), %%xmm0 , %%xmm9 \n\t" // y += alpha * x "vfmadd231pd 32(%2,%0,8), %%xmm0 , %%xmm10 \n\t" // y += alpha * x "vfmadd231pd 48(%2,%0,8), %%xmm0 , %%xmm11 \n\t" // y += alpha * x "vfmadd231pd 64(%2,%0,8), %%xmm0 , %%xmm12 \n\t" // y += alpha * x "vfmadd231pd 80(%2,%0,8), %%xmm0 , %%xmm13 \n\t" // y += alpha * x "vfmadd231pd 96(%2,%0,8), %%xmm0 , %%xmm14 \n\t" // y += alpha * x "vfmadd231pd 112(%2,%0,8), %%xmm0 , %%xmm15 \n\t" // y += alpha * x "vmovups %%xmm8 , (%3,%0,8) \n\t" "vmovups %%xmm9 , 16(%3,%0,8) \n\t" "vmovups %%xmm10, 32(%3,%0,8) \n\t" "vmovups %%xmm11, 48(%3,%0,8) \n\t" "vmovups %%xmm12, 64(%3,%0,8) \n\t" "vmovups %%xmm13, 80(%3,%0,8) \n\t" "vmovups %%xmm14, 96(%3,%0,8) \n\t" "vmovups %%xmm15,112(%3,%0,8) \n\t" "addq $16, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 : "cc", "%xmm0", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); return; } __asm__ __volatile__ ( "vmovddup (%4), %%xmm0 \n\t" // alpha ".align 16 \n\t" "1: \n\t" "prefetcht0 512(%3,%0,8) \n\t" "vmovups (%3,%0,8), %%xmm8 \n\t" // 2 y "vmovups 16(%3,%0,8), %%xmm9 \n\t" // 2 y "vmovups 32(%3,%0,8), %%xmm10 \n\t" // 2 y "vmovups 48(%3,%0,8), %%xmm11 \n\t" // 2 y "prefetcht0 576(%3,%0,8) \n\t" "vmovups 64(%3,%0,8), %%xmm12 \n\t" // 2 y "vmovups 80(%3,%0,8), %%xmm13 \n\t" // 2 y "vmovups 96(%3,%0,8), %%xmm14 \n\t" // 2 y "vmovups 112(%3,%0,8), %%xmm15 \n\t" // 2 y "prefetcht0 512(%2,%0,8) \n\t" "vfmadd231pd (%2,%0,8), %%xmm0 , %%xmm8 \n\t" // y += alpha * x "vfmadd231pd 16(%2,%0,8), %%xmm0 , %%xmm9 \n\t" // y += alpha * x "vfmadd231pd 32(%2,%0,8), %%xmm0 , %%xmm10 \n\t" // y += alpha * x "vfmadd231pd 48(%2,%0,8), %%xmm0 , %%xmm11 \n\t" // y += alpha * x "prefetcht0 576(%2,%0,8) \n\t" "vfmadd231pd 64(%2,%0,8), %%xmm0 , %%xmm12 \n\t" // y += alpha * x "vfmadd231pd 80(%2,%0,8), %%xmm0 , %%xmm13 \n\t" // y += alpha * x "vfmadd231pd 96(%2,%0,8), %%xmm0 , %%xmm14 \n\t" // y += alpha * x "vfmadd231pd 112(%2,%0,8), %%xmm0 , %%xmm15 \n\t" // y += alpha * x "vmovups %%xmm8 , (%3,%0,8) \n\t" "vmovups %%xmm9 , 16(%3,%0,8) \n\t" "vmovups %%xmm10, 32(%3,%0,8) \n\t" "vmovups %%xmm11, 48(%3,%0,8) \n\t" "vmovups %%xmm12, 64(%3,%0,8) \n\t" "vmovups %%xmm13, 80(%3,%0,8) \n\t" "vmovups %%xmm14, 96(%3,%0,8) \n\t" "vmovups %%xmm15,112(%3,%0,8) \n\t" "addq $16, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 : "cc", "%xmm0", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } 
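The file that follows, dcopy_bulldozer.S, is the Bulldozer double-precision copy kernel: a unit-stride path that copies 16 doubles per iteration with prefetchnta hints, plus a generic strided path unrolled by 8. As an illustrative sketch only (the name dcopy_ref is hypothetical, and BLASLONG/FLOAT are assumed to come from common.h), the portable equivalent of that control flow is:

#include "common.h"   /* BLASLONG, FLOAT (assumed; illustrative sketch only) */

/* Portable reference for the copy performed by dcopy_bulldozer.S. */
static void dcopy_ref(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG i;

	if (n <= 0) return;

	if (inc_x == 1 && inc_y == 1) {
		/* contiguous fast path: the assembly unrolls this by 16 doubles
		   per iteration and issues prefetchnta on both streams */
		for (i = 0; i < n; i++)
			y[i] = x[i];
	} else {
		/* strided path, corresponding to the .L40/.L41/.L46 loops in the asm */
		BLASLONG ix = 0, iy = 0;
		for (i = 0; i < n; i++) {
			y[iy] = x[ix];
			ix += inc_x;
			iy += inc_y;
		}
	}
}

The non-temporal prefetch hints in the assembly reflect that a straight copy has no data reuse worth keeping in cache.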
OpenBLAS-0.2.20/kernel/x86_64/dcopy_bulldozer.S000066400000000000000000000144111313527062700207070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define Y ARG4 /* rcx */ #ifndef WINDOWS_ABI #define INCY ARG5 /* r8 */ #else #define INCY %r10 #endif #include "l1param.h" #define VLOAD(OFFSET, ADDR, REG) vmovups OFFSET(ADDR), REG #define VSHUFPD_1(REG1 , REG2) vshufpd $0x01, REG1, REG2, REG2 #define A_PRE 640 #define B_PRE 640 PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), INCY #endif SAVEREGISTERS leaq (, INCX, SIZE), INCX leaq (, INCY, SIZE), INCY cmpq $SIZE, INCX jne .L40 cmpq $SIZE, INCY jne .L40 testq $SIZE, X je .L10 vmovsd (X), %xmm0 vmovsd %xmm0, (Y) addq $1 * SIZE, X addq $1 * SIZE, Y decq M jle .L19 ALIGN_4 .L10: subq $-16 * SIZE, X subq $-16 * SIZE, Y movq M, %rax sarq $4, %rax jle .L13 vmovups -16 * SIZE(X), %xmm0 vmovups -14 * SIZE(X), %xmm1 vmovups -12 * SIZE(X), %xmm2 vmovups -10 * SIZE(X), %xmm3 vmovups -8 * SIZE(X), %xmm4 vmovups -6 * SIZE(X), %xmm5 vmovups -4 * SIZE(X), %xmm6 vmovups -2 * SIZE(X), %xmm7 decq %rax jle .L12 ALIGN_4 .L11: prefetchnta A_PRE(X) nop vmovups %xmm0, -16 * SIZE(Y) vmovups %xmm1, -14 * SIZE(Y) prefetchnta B_PRE(Y) nop vmovups %xmm2, -12 * SIZE(Y) vmovups %xmm3, -10 * SIZE(Y) VLOAD( 0 * SIZE, X, %xmm0) VLOAD( 2 * SIZE, X, %xmm1) VLOAD( 4 * SIZE, X, %xmm2) VLOAD( 6 * SIZE, X, %xmm3) prefetchnta A_PRE+64(X) nop vmovups %xmm4, -8 * SIZE(Y) vmovups %xmm5, -6 * SIZE(Y) prefetchnta B_PRE+64(Y) nop vmovups %xmm6, -4 * SIZE(Y) vmovups %xmm7, -2 * SIZE(Y) VLOAD( 8 * SIZE, X, %xmm4) VLOAD(10 * SIZE, X, %xmm5) subq $-16 * SIZE, Y VLOAD(12 * SIZE, X, %xmm6) VLOAD(14 * SIZE, X, %xmm7) subq $-16 * SIZE, X decq %rax jg .L11 ALIGN_3 .L12: vmovups %xmm0, -16 * SIZE(Y) vmovups %xmm1, -14 * SIZE(Y) vmovups %xmm2, -12 * SIZE(Y) vmovups %xmm3, -10 * SIZE(Y) vmovups %xmm4, -8 * SIZE(Y) vmovups %xmm5, -6 * SIZE(Y) vmovups %xmm6, -4 * SIZE(Y) vmovups %xmm7, -2 * SIZE(Y) subq $-16 * SIZE, Y subq $-16 * SIZE, X ALIGN_3 .L13: testq $8, M jle .L14 ALIGN_3 vmovups -16 * SIZE(X), %xmm0 vmovups -14 * SIZE(X), %xmm1 vmovups -12 * SIZE(X), %xmm2 vmovups -10 * SIZE(X), %xmm3 vmovups %xmm0, -16 * SIZE(Y) vmovups %xmm1, -14 * SIZE(Y) vmovups %xmm2, -12 * SIZE(Y) vmovups %xmm3, -10 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L14: testq $4, M jle .L15 ALIGN_3 vmovups -16 * SIZE(X), %xmm0 vmovups -14 * SIZE(X), %xmm1 vmovups %xmm0, -16 * SIZE(Y) vmovups %xmm1, -14 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L15: testq $2, M jle .L16 ALIGN_3 vmovups -16 * SIZE(X), %xmm0 vmovups %xmm0, -16 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L16: testq $1, M jle .L19 ALIGN_3 vmovsd -16 * SIZE(X), %xmm0 vmovsd %xmm0, -16 * SIZE(Y) ALIGN_3 .L19: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 .L40: movq M, %rax sarq $3, %rax jle .L45 ALIGN_3 .L41: vmovsd (X), %xmm0 addq INCX, X vmovsd (X), %xmm4 addq INCX, X vmovsd (X), %xmm1 addq INCX, X vmovsd (X), %xmm5 addq INCX, X vmovsd (X), %xmm2 addq INCX, X vmovsd (X), %xmm6 addq INCX, X vmovsd (X), %xmm3 addq INCX, X vmovsd (X), %xmm7 addq INCX, X vmovsd %xmm0, (Y) addq INCY, Y vmovsd %xmm4, (Y) addq INCY, Y vmovsd %xmm1, (Y) addq INCY, Y vmovsd %xmm5, (Y) addq INCY, Y vmovsd %xmm2, (Y) addq INCY, Y vmovsd %xmm6, (Y) addq INCY, Y vmovsd %xmm3, (Y) addq INCY, Y vmovsd %xmm7, (Y) addq INCY, Y decq %rax jg .L41 ALIGN_3 .L45: movq M, %rax andq $7, %rax jle .L47 ALIGN_3 .L46: vmovsd (X), %xmm0 addq INCX, X vmovsd %xmm0, (Y) addq INCY, Y decq %rax jg .L46 ALIGN_3 .L47: xorq %rax, %rax 
RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ddot.c000066400000000000000000000065671313527062700164760ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #if defined(BULLDOZER) #include "ddot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(EXCAVATOR) #include "ddot_microk_steamroller-2.c" #elif defined(PILEDRIVER) #include "ddot_microk_piledriver-2.c" #elif defined(NEHALEM) #include "ddot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "ddot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ddot_microk_sandy-2.c" #endif #ifndef HAVE_KERNEL_8 static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { BLASLONG register i = 0; FLOAT dot = 0.0; while(i < n) { dot += y[i] * x[i] + y[i+1] * x[i+1] + y[i+2] * x[i+2] + y[i+3] * x[i+3] + y[i+4] * x[i+4] + y[i+5] * x[i+5] + y[i+6] * x[i+6] + y[i+7] * x[i+7] ; i+=8 ; } *d += dot; } #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix=0,iy=0; FLOAT dot = 0.0 ; if ( n <= 0 ) return(dot); if ( (inc_x == 1) && (inc_y == 1) ) { BLASLONG n1 = n & -16; if ( n1 ) ddot_kernel_8(n1, x, y , &dot ); i = n1; while(i < n) { dot += y[i] * x[i] ; i++ ; } return(dot); } FLOAT temp1 = 0.0; FLOAT temp2 = 0.0; BLASLONG n1 = n & -4; while(i < n1) { FLOAT m1 = y[iy] * x[ix] ; FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; ix += inc_x*4 ; iy += inc_y*4 ; temp1 += m1+m3; temp2 += m2+m4; i+=4 ; } while(i < n) { temp1 += y[iy] * x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } dot = temp1 + temp2; return(dot); } OpenBLAS-0.2.20/kernel/x86_64/ddot_bulldozer.S000066400000000000000000000164251313527062700205320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. 
*/ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define Y ARG4 /* rcx */ #ifndef WINDOWS_ABI #define INCY ARG5 /* r8 */ #else #define INCY %r10 #endif #define A_PRE 512 #include "l1param.h" PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), INCY #endif SAVEREGISTERS leaq (, INCX, SIZE), INCX leaq (, INCY, SIZE), INCY vxorps %xmm0, %xmm0 , %xmm0 vxorps %xmm1, %xmm1 , %xmm1 vxorps %xmm2, %xmm2 , %xmm2 vxorps %xmm3, %xmm3 , %xmm3 cmpq $0, N jle .L999 cmpq $SIZE, INCX jne .L50 cmpq $SIZE, INCY jne .L50 subq $-16 * SIZE, X subq $-16 * SIZE, Y testq $SIZE, Y je .L10 vmovsd -16 * SIZE(X), %xmm0 vmulsd -16 * SIZE(Y), %xmm0 , %xmm0 addq $1 * SIZE, X addq $1 * SIZE, Y decq N ALIGN_2 .L10: movq N, %rax sarq $4, %rax jle .L14 vmovups -16 * SIZE(X), %xmm4 vmovups -14 * SIZE(X), %xmm5 vmovups -12 * SIZE(X), %xmm6 vmovups -10 * SIZE(X), %xmm7 vmovups -8 * SIZE(X), %xmm8 vmovups -6 * SIZE(X), %xmm9 vmovups -4 * SIZE(X), %xmm10 vmovups -2 * SIZE(X), %xmm11 decq %rax jle .L12 ALIGN_3 .L11: prefetchnta A_PRE(Y) vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0 vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1 prefetchnta A_PRE(X) vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2 vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3 vmovups 0 * SIZE(X), %xmm4 vfmaddpd %xmm0 , -8 * SIZE(Y), %xmm8 , %xmm0 vfmaddpd %xmm1 , -6 * SIZE(Y), %xmm9 , %xmm1 vmovups 2 * SIZE(X), %xmm5 vmovups 4 * SIZE(X), %xmm6 vfmaddpd %xmm2 , -4 * SIZE(Y), %xmm10, %xmm2 vfmaddpd %xmm3 , -2 * SIZE(Y), %xmm11, %xmm3 vmovups 6 * SIZE(X), %xmm7 prefetchnta A_PRE+64(Y) vmovups 8 * SIZE(X), %xmm8 vmovups 10 * SIZE(X), %xmm9 prefetchnta A_PRE+64(X) vmovups 12 * SIZE(X), %xmm10 vmovups 14 * SIZE(X), %xmm11 subq $-16 * SIZE, X subq $-16 * SIZE, Y decq %rax jg 
.L11 ALIGN_3 .L12: vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0 vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1 vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2 vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3 vfmaddpd %xmm0 , -8 * SIZE(Y), %xmm8 , %xmm0 vfmaddpd %xmm1 , -6 * SIZE(Y), %xmm9 , %xmm1 vfmaddpd %xmm2 , -4 * SIZE(Y), %xmm10, %xmm2 vfmaddpd %xmm3 , -2 * SIZE(Y), %xmm11, %xmm3 subq $-16 * SIZE, X subq $-16 * SIZE, Y ALIGN_3 .L14: testq $15, N jle .L999 testq $8, N jle .L15 vmovups -16 * SIZE(X), %xmm4 vmovups -14 * SIZE(X), %xmm5 vmovups -12 * SIZE(X), %xmm6 vmovups -10 * SIZE(X), %xmm7 vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0 vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1 vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2 vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L15: testq $4, N jle .L16 vmovups -16 * SIZE(X), %xmm4 vmovups -14 * SIZE(X), %xmm5 vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0 vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L16: testq $2, N jle .L17 vmovups -16 * SIZE(X), %xmm4 vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0 addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L17: testq $1, N jle .L999 vmovsd -16 * SIZE(X), %xmm4 vmovsd -16 * SIZE(Y), %xmm5 vfmaddpd %xmm0, %xmm4 , %xmm5 , %xmm0 jmp .L999 ALIGN_3 .L50: movq N, %rax sarq $3, %rax jle .L55 ALIGN_3 .L53: vmovsd 0 * SIZE(X), %xmm4 addq INCX, X vmovsd 0 * SIZE(Y), %xmm8 addq INCY, Y vmovsd 0 * SIZE(X), %xmm5 addq INCX, X vmovsd 0 * SIZE(Y), %xmm9 addq INCY, Y vmovsd 0 * SIZE(X), %xmm6 addq INCX, X vmovsd 0 * SIZE(Y), %xmm10 addq INCY, Y vmovsd 0 * SIZE(X), %xmm7 addq INCX, X vmovsd 0 * SIZE(Y), %xmm11 addq INCY, Y vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0 vfmaddpd %xmm1 , %xmm5 , %xmm9 , %xmm1 vfmaddpd %xmm2 , %xmm6 , %xmm10, %xmm2 vfmaddpd %xmm3 , %xmm7 , %xmm11, %xmm3 vmovsd 0 * SIZE(X), %xmm4 addq INCX, X vmovsd 0 * SIZE(Y), %xmm8 addq INCY, Y vmovsd 0 * SIZE(X), %xmm5 addq INCX, X vmovsd 0 * SIZE(Y), %xmm9 addq INCY, Y vmovsd 0 * SIZE(X), %xmm6 addq INCX, X vmovsd 0 * SIZE(Y), %xmm10 addq INCY, Y vmovsd 0 * SIZE(X), %xmm7 addq INCX, X vmovsd 0 * SIZE(Y), %xmm11 addq INCY, Y vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0 vfmaddpd %xmm1 , %xmm5 , %xmm9 , %xmm1 vfmaddpd %xmm2 , %xmm6 , %xmm10, %xmm2 vfmaddpd %xmm3 , %xmm7 , %xmm11, %xmm3 decq %rax jg .L53 ALIGN_3 .L55: movq N, %rax andq $7, %rax jle .L999 ALIGN_3 .L56: vmovsd 0 * SIZE(X), %xmm4 addq INCX, X vmovsd 0 * SIZE(Y), %xmm8 addq INCY, Y vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0 decq %rax jg .L56 ALIGN_3 .L999: vaddpd %xmm1, %xmm0 , %xmm0 vaddpd %xmm3, %xmm2 , %xmm2 vaddpd %xmm2, %xmm0 , %xmm0 vhaddpd %xmm0, %xmm0 , %xmm0 RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ddot_microk_bulldozer-2.c000066400000000000000000000064321313527062700222520ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_8 1 static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) { BLASLONG register i = 0; __asm__ __volatile__ ( "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x "vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x "vfmaddpd %%xmm4, (%3,%0,8), %%xmm12, %%xmm4 \n\t" // 2 * y "vfmaddpd %%xmm5, 16(%3,%0,8), %%xmm13, %%xmm5 \n\t" // 2 * y "vfmaddpd %%xmm6, 32(%3,%0,8), %%xmm14, %%xmm6 \n\t" // 2 * y "vfmaddpd %%xmm7, 48(%3,%0,8), %%xmm15, %%xmm7 \n\t" // 2 * y "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" "jnz 1b \n\t" "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" "vmovsd %%xmm4, (%4) \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/ddot_microk_haswell-2.c000066400000000000000000000071711313527062700217100ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_8 1 static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) { BLASLONG register i = 0; __asm__ __volatile__ ( "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,8), %%ymm12 \n\t" // 2 * x "vmovups 32(%2,%0,8), %%ymm13 \n\t" // 2 * x "vmovups 64(%2,%0,8), %%ymm14 \n\t" // 2 * x "vmovups 96(%2,%0,8), %%ymm15 \n\t" // 2 * x "vfmadd231pd (%3,%0,8), %%ymm12, %%ymm4 \n\t" // 2 * y "vfmadd231pd 32(%3,%0,8), %%ymm13, %%ymm5 \n\t" // 2 * y "vfmadd231pd 64(%3,%0,8), %%ymm14, %%ymm6 \n\t" // 2 * y "vfmadd231pd 96(%3,%0,8), %%ymm15, %%ymm7 \n\t" // 2 * y "addq $16 , %0 \n\t" "subq $16 , %1 \n\t" "jnz 1b \n\t" "vextractf128 $1 , %%ymm4 , %%xmm12 \n\t" "vextractf128 $1 , %%ymm5 , %%xmm13 \n\t" "vextractf128 $1 , %%ymm6 , %%xmm14 \n\t" "vextractf128 $1 , %%ymm7 , %%xmm15 \n\t" "vaddpd %%xmm4, %%xmm12, %%xmm4 \n\t" "vaddpd %%xmm5, %%xmm13, %%xmm5 \n\t" "vaddpd %%xmm6, %%xmm14, %%xmm6 \n\t" "vaddpd %%xmm7, %%xmm15, %%xmm7 \n\t" "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/ddot_microk_nehalem-2.c000066400000000000000000000071241313527062700216600ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_8 1 static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) { BLASLONG register i = 0; __asm__ __volatile__ ( "xorpd %%xmm4, %%xmm4 \n\t" "xorpd %%xmm5, %%xmm5 \n\t" "xorpd %%xmm6, %%xmm6 \n\t" "xorpd %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" "1: \n\t" "movups (%2,%0,8), %%xmm12 \n\t" // 2 * x "movups (%3,%0,8), %%xmm8 \n\t" // 2 * y "movups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x "movups 16(%3,%0,8), %%xmm9 \n\t" // 2 * y "movups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x "movups 32(%3,%0,8), %%xmm10 \n\t" // 2 * y "movups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x "movups 48(%3,%0,8), %%xmm11 \n\t" // 2 * y "mulpd %%xmm8 , %%xmm12 \n\t" "mulpd %%xmm9 , %%xmm13 \n\t" "mulpd %%xmm10, %%xmm14 \n\t" "mulpd %%xmm11, %%xmm15 \n\t" "addpd %%xmm12, %%xmm4 \n\t" "addpd %%xmm13, %%xmm5 \n\t" "addpd %%xmm14, %%xmm6 \n\t" "addpd %%xmm15, %%xmm7 \n\t" "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" "jnz 1b \n\t" "addpd %%xmm5, %%xmm4 \n\t" "addpd %%xmm7, %%xmm6 \n\t" "addpd %%xmm6, %%xmm4 \n\t" "haddpd %%xmm4, %%xmm4 \n\t" "movsd %%xmm4, (%4) \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/ddot_microk_piledriver-2.c000066400000000000000000000142441313527062700224150ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_8 1 static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) { BLASLONG register i = 0; if ( n < 1408 ) { __asm__ __volatile__ ( "vzeroupper \n\t" "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x "vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x "vfmadd231pd (%3,%0,8), %%xmm12, %%xmm4 \n\t" // 2 * y "vmovups 64(%2,%0,8), %%xmm0 \n\t" // 2 * x "vmovups 80(%2,%0,8), %%xmm1 \n\t" // 2 * x "vfmadd231pd 16(%3,%0,8), %%xmm13, %%xmm5 \n\t" // 2 * y "vmovups 96(%2,%0,8), %%xmm2 \n\t" // 2 * x "vmovups 112(%2,%0,8), %%xmm3 \n\t" // 2 * x "vfmadd231pd 32(%3,%0,8), %%xmm14, %%xmm6 \n\t" // 2 * y "vfmadd231pd 48(%3,%0,8), %%xmm15, %%xmm7 \n\t" // 2 * y "vfmadd231pd 64(%3,%0,8), %%xmm0 , %%xmm4 \n\t" // 2 * y "vfmadd231pd 80(%3,%0,8), %%xmm1 , %%xmm5 \n\t" // 2 * y "vfmadd231pd 96(%3,%0,8), %%xmm2 , %%xmm6 \n\t" // 2 * y "vfmadd231pd 112(%3,%0,8), %%xmm3 , %%xmm7 \n\t" // 2 * y "addq $16 , %0 \n\t" "subq $16 , %1 \n\t" "jnz 1b \n\t" "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); return; } __asm__ __volatile__ ( "vzeroupper \n\t" "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" "1: \n\t" "prefetcht0 768(%2,%0,8) \n\t" "prefetcht0 832(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x "vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x "prefetcht0 768(%3,%0,8) \n\t" "prefetcht0 832(%3,%0,8) \n\t" "vfmadd231pd (%3,%0,8), %%xmm12, %%xmm4 \n\t" // 2 * y "vmovups 64(%2,%0,8), %%xmm0 \n\t" // 2 * x "vmovups 80(%2,%0,8), %%xmm1 \n\t" // 2 * x "vfmadd231pd 16(%3,%0,8), %%xmm13, %%xmm5 \n\t" // 2 * y "vmovups 96(%2,%0,8), %%xmm2 \n\t" // 2 * x "vmovups 112(%2,%0,8), %%xmm3 \n\t" // 2 * x "vfmadd231pd 32(%3,%0,8), %%xmm14, %%xmm6 \n\t" // 2 * y "vfmadd231pd 48(%3,%0,8), %%xmm15, %%xmm7 \n\t" // 2 * y "vfmadd231pd 64(%3,%0,8), %%xmm0 , %%xmm4 \n\t" // 2 * y "vfmadd231pd 80(%3,%0,8), %%xmm1 , %%xmm5 \n\t" // 2 * y "vfmadd231pd 96(%3,%0,8), %%xmm2 , %%xmm6 \n\t" // 2 * y "vfmadd231pd 112(%3,%0,8), %%xmm3 , %%xmm7 \n\t" // 2 * y "addq $16 , %0 \n\t" "subq $16 , %1 \n\t" "jnz 1b \n\t" "vaddpd %%xmm4, 
%%xmm5, %%xmm4 \n\t" "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/ddot_microk_sandy-2.c000066400000000000000000000075061313527062700213710ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_8 1 static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) { BLASLONG register i = 0; __asm__ __volatile__ ( "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,8), %%ymm12 \n\t" // 2 * x "vmovups 32(%2,%0,8), %%ymm13 \n\t" // 2 * x "vmovups 64(%2,%0,8), %%ymm14 \n\t" // 2 * x "vmovups 96(%2,%0,8), %%ymm15 \n\t" // 2 * x "vmulpd (%3,%0,8), %%ymm12, %%ymm12 \n\t" // 2 * y "vmulpd 32(%3,%0,8), %%ymm13, %%ymm13 \n\t" // 2 * y "vmulpd 64(%3,%0,8), %%ymm14, %%ymm14 \n\t" // 2 * y "vmulpd 96(%3,%0,8), %%ymm15, %%ymm15 \n\t" // 2 * y "vaddpd %%ymm4 , %%ymm12, %%ymm4 \n\t" // 2 * y "vaddpd %%ymm5 , %%ymm13, %%ymm5 \n\t" // 2 * y "vaddpd %%ymm6 , %%ymm14, %%ymm6 \n\t" // 2 * y "vaddpd %%ymm7 , %%ymm15, %%ymm7 \n\t" // 2 * y "addq $16 , %0 \n\t" "subq $16 , %1 \n\t" "jnz 1b \n\t" "vextractf128 $1 , %%ymm4 , %%xmm12 \n\t" "vextractf128 $1 , %%ymm5 , %%xmm13 \n\t" "vextractf128 $1 , %%ymm6 , %%xmm14 \n\t" "vextractf128 $1 , %%ymm7 , %%xmm15 \n\t" "vaddpd %%xmm4, %%xmm12, %%xmm4 \n\t" "vaddpd %%xmm5, %%xmm13, %%xmm5 \n\t" "vaddpd %%xmm6, %%xmm14, %%xmm6 \n\t" "vaddpd %%xmm7, %%xmm15, %%xmm7 \n\t" "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/ddot_microk_steamroller-2.c000066400000000000000000000076331313527062700226050ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_8 1 static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x "vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x "vfmadd231pd (%3,%0,8), %%xmm12, %%xmm4 \n\t" // 2 * y "vmovups 64(%2,%0,8), %%xmm0 \n\t" // 2 * x "vmovups 80(%2,%0,8), %%xmm1 \n\t" // 2 * x "vfmadd231pd 16(%3,%0,8), %%xmm13, %%xmm5 \n\t" // 2 * y "vmovups 96(%2,%0,8), %%xmm2 \n\t" // 2 * x "vmovups 112(%2,%0,8), %%xmm3 \n\t" // 2 * x "vfmadd231pd 32(%3,%0,8), %%xmm14, %%xmm6 \n\t" // 2 * y "vfmadd231pd 48(%3,%0,8), %%xmm15, %%xmm7 \n\t" // 2 * y "vfmadd231pd 64(%3,%0,8), %%xmm0 , %%xmm4 \n\t" // 2 * y "vfmadd231pd 80(%3,%0,8), %%xmm1 , %%xmm5 \n\t" // 2 * y "vfmadd231pd 96(%3,%0,8), %%xmm2 , %%xmm6 \n\t" // 2 * y "vfmadd231pd 112(%3,%0,8), %%xmm3 , %%xmm7 \n\t" // 2 * y "addq $16 , %0 \n\t" "subq $16 , %1 \n\t" "jnz 1b \n\t" "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/dgemm_kernel_16x2_haswell.S000066400000000000000000003262071313527062700224500ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ /********************************************************************* * 2013/10/20 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * * 2013/10/20 Saar * Parameter: * DGEMM_DEFAULT_UNROLL_N 2 * DGEMM_DEFAULT_UNROLL_M 16 * DGEMM_DEFAULT_P 192 * DGEMM_DEFAULT_Q 128 * A_PR1 512 * * * Performance without prefetch of B: * 1 thread: 45.8 GFLOPS (MKL: 45) * 2 threads: 80.0 GFLOPS (MKL: 91) * 4 threads: 135.0 GFLOPS (MKL: 135) *********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define J %r14 #define OLD_K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define K %r12 #define BI %rbp #define SP %rbx #define BO1 %rdi #define BO2 %r15 #ifndef WINDOWS_ABI #define STACKSIZE 96 #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define L_BUFFER_SIZE 512*8*4 #define LB2_OFFSET 512*8*2 #define Ndiv6 24(%rsp) #define Nmod6 32(%rsp) #define N 40(%rsp) #define ALPHA 48(%rsp) #define OFFSET 56(%rsp) #define KK 64(%rsp) #define KKK 72(%rsp) #define BUFFER1 128(%rsp) #define BUFFER2 LB2_OFFSET+128(%rsp) #if defined(OS_WINDOWS) #if L_BUFFER_SIZE > 16384 #define STACK_TOUCH \ movl $0, 4096 * 4(%rsp);\ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 12288 #define STACK_TOUCH \ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 8192 #define STACK_TOUCH \ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 4096 #define STACK_TOUCH \ movl $0, 4096 * 1(%rsp); #else #define STACK_TOUCH #endif #else #define STACK_TOUCH #endif #if defined(BULLDOZER) .macro VFMADD231PD_ y0,y1,y2 vfmaddpd \y0,\y1,\y2,\y0 .endm .macro VFMADD231SD_ x0,x1,x2 vfmaddsd \x0,\x1,\x2,\x0 .endm #else .macro VFMADD231PD_ y0,y1,y2 vfmadd231pd \y2,\y1,\y0 .endm .macro VFMADD231SD_ x0,x1,x2 vfmadd231sd \x2,\x1,\x0 .endm #endif #define A_PR1 512 #define B_PR1 256 /******************************************************************************************* * 3 lines of N *******************************************************************************************/ .macro KERNEL16x3_SUBN prefetcht0 A_PR1(AO) vbroadcastsd -12 * SIZE(BO), %ymm1 vmovaps -16 * SIZE(AO), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd -11 * SIZE(BO), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vbroadcastsd -10 * SIZE(BO), %ymm3 VFMADD231PD_ %ymm6,%ymm3,%ymm0 vmovaps -12 * SIZE(AO), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 prefetcht0 A_PR1+64(AO) VFMADD231PD_ %ymm8,%ymm2,%ymm0 VFMADD231PD_ %ymm9,%ymm3,%ymm0 vmovaps -8 * SIZE(AO), %ymm0 VFMADD231PD_ %ymm10,%ymm1,%ymm0 VFMADD231PD_ %ymm11,%ymm2,%ymm0 VFMADD231PD_ %ymm12,%ymm3,%ymm0 vmovaps -4 * SIZE(AO), %ymm0 VFMADD231PD_ %ymm13,%ymm1,%ymm0 VFMADD231PD_ %ymm14,%ymm2,%ymm0 VFMADD231PD_ %ymm15,%ymm3,%ymm0 addq $ 3*SIZE , BO addq $ 16*SIZE, AO .endm .macro KERNEL8x3_SUBN //prefetcht0 A_PR1(AO) vbroadcastsd -12 * SIZE(BO), %ymm1 vmovaps -16 * 
SIZE(AO), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd -11 * SIZE(BO), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vbroadcastsd -10 * SIZE(BO), %ymm3 VFMADD231PD_ %ymm6,%ymm3,%ymm0 vmovaps -12 * SIZE(AO), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 //prefetcht0 A_PR1+64(AO) VFMADD231PD_ %ymm8,%ymm2,%ymm0 VFMADD231PD_ %ymm9,%ymm3,%ymm0 prefetcht0 B_PR1(BO) addq $ 3*SIZE , BO addq $ 8*SIZE, AO .endm .macro KERNEL4x3_SUBN vbroadcastsd -12 * SIZE(BO), %ymm1 vmovaps -16 * SIZE(AO), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd -11 * SIZE(BO), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vbroadcastsd -10 * SIZE(BO), %ymm3 VFMADD231PD_ %ymm6,%ymm3,%ymm0 addq $ 3*SIZE , BO addq $ 4*SIZE, AO .endm .macro KERNEL2x3_SUBN vmovsd -12 * SIZE(BO), %xmm1 vmovsd -16 * SIZE(AO), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd -11 * SIZE(BO), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 vmovsd -10 * SIZE(BO), %xmm3 VFMADD231SD_ %xmm6,%xmm3,%xmm0 vmovsd -15 * SIZE(AO), %xmm0 VFMADD231SD_ %xmm8,%xmm1,%xmm0 VFMADD231SD_ %xmm10,%xmm2,%xmm0 VFMADD231SD_ %xmm12,%xmm3,%xmm0 addq $ 3*SIZE , BO addq $ 2*SIZE, AO .endm .macro KERNEL1x3_SUBN vmovsd -12 * SIZE(BO), %xmm1 vmovsd -16 * SIZE(AO), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd -11 * SIZE(BO), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 vmovsd -10 * SIZE(BO), %xmm3 VFMADD231SD_ %xmm6,%xmm3,%xmm0 addq $ 3*SIZE , BO addq $ 1*SIZE, AO .endm /******************************************************************************************/ .macro KERNEL16x3_1 prefetcht0 A_PR1(AO, %rax, SIZE) vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PD_ %ymm6,%ymm3,%ymm0 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 prefetcht0 64+A_PR1(AO, %rax, SIZE) VFMADD231PD_ %ymm8,%ymm2,%ymm0 VFMADD231PD_ %ymm9,%ymm3,%ymm0 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm10,%ymm1,%ymm0 VFMADD231PD_ %ymm11,%ymm2,%ymm0 VFMADD231PD_ %ymm12,%ymm3,%ymm0 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm13,%ymm1,%ymm0 VFMADD231PD_ %ymm14,%ymm2,%ymm0 vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm15,%ymm3,%ymm0 .endm .macro KERNEL16x3_2 prefetcht0 128+A_PR1(AO, %rax, SIZE) vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PD_ %ymm6,%ymm3,%ymm0 vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 VFMADD231PD_ %ymm8,%ymm2,%ymm0 prefetcht0 A_PR1+64(AO,%rax,SIZE) VFMADD231PD_ %ymm9,%ymm3,%ymm0 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm10,%ymm1,%ymm0 prefetcht0 192+A_PR1(AO, %rax, SIZE) VFMADD231PD_ %ymm11,%ymm2,%ymm0 VFMADD231PD_ %ymm12,%ymm3,%ymm0 vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm13,%ymm1,%ymm0 VFMADD231PD_ %ymm14,%ymm2,%ymm0 vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm15,%ymm3,%ymm0 .endm .macro KERNEL16x3_3 prefetcht0 256+A_PR1(AO, %rax, SIZE) vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PD_ %ymm6,%ymm3,%ymm0 vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 prefetcht0 320+A_PR1(AO, %rax, SIZE) VFMADD231PD_ %ymm8,%ymm2,%ymm0 VFMADD231PD_ %ymm9,%ymm3,%ymm0 vmovups 8 * SIZE(AO, %rax, 
SIZE), %ymm0 VFMADD231PD_ %ymm10,%ymm1,%ymm0 VFMADD231PD_ %ymm11,%ymm2,%ymm0 VFMADD231PD_ %ymm12,%ymm3,%ymm0 vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm13,%ymm1,%ymm0 VFMADD231PD_ %ymm14,%ymm2,%ymm0 vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm15,%ymm3,%ymm0 .endm .macro KERNEL16x3_4 prefetcht0 384+A_PR1(AO, %rax, SIZE) vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PD_ %ymm6,%ymm3,%ymm0 vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 prefetcht0 448+A_PR1(AO, %rax, SIZE) VFMADD231PD_ %ymm8,%ymm2,%ymm0 VFMADD231PD_ %ymm9,%ymm3,%ymm0 vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm10,%ymm1,%ymm0 VFMADD231PD_ %ymm11,%ymm2,%ymm0 addq $12, BI VFMADD231PD_ %ymm12,%ymm3,%ymm0 vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm13,%ymm1,%ymm0 VFMADD231PD_ %ymm14,%ymm2,%ymm0 addq $64, %rax VFMADD231PD_ %ymm15,%ymm3,%ymm0 .endm .macro KERNEL16x3_SUB vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PD_ %ymm6,%ymm3,%ymm0 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 VFMADD231PD_ %ymm8,%ymm2,%ymm0 VFMADD231PD_ %ymm9,%ymm3,%ymm0 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm10,%ymm1,%ymm0 VFMADD231PD_ %ymm11,%ymm2,%ymm0 VFMADD231PD_ %ymm12,%ymm3,%ymm0 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm13,%ymm1,%ymm0 VFMADD231PD_ %ymm14,%ymm2,%ymm0 VFMADD231PD_ %ymm15,%ymm3,%ymm0 addq $3 , BI addq $16, %rax .endm .macro SAVE16x3 vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm7 , %ymm7 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm8 , %ymm8 vmulpd %ymm0 , %ymm11, %ymm11 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm6 , %ymm6 vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm15, %ymm15 #if !defined(TRMMKERNEL) vaddpd (CO1), %ymm4,%ymm4 vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 vaddpd (CO1, LDC), %ymm5,%ymm5 vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 vaddpd (CO1, LDC, 2), %ymm6,%ymm6 vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 vaddpd 8 * SIZE(CO1, LDC, 2), %ymm12,%ymm12 vaddpd 12 * SIZE(CO1, LDC, 2), %ymm15,%ymm15 #endif vmovups %ymm4 , (CO1) vmovups %ymm7 , 4 * SIZE(CO1) vmovups %ymm10, 8 * SIZE(CO1) vmovups %ymm13,12 * SIZE(CO1) vmovups %ymm5 , (CO1, LDC) vmovups %ymm8 , 4 * SIZE(CO1, LDC) vmovups %ymm11, 8 * SIZE(CO1, LDC) vmovups %ymm14,12 * SIZE(CO1, LDC) vmovups %ymm6 , (CO1, LDC, 2) vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) vmovups %ymm12, 8 * SIZE(CO1, LDC, 2) vmovups %ymm15,12 * SIZE(CO1, LDC, 2) .endm /*******************************************************************************************/ .macro KERNEL8x3_1 prefetcht0 A_PR1(AO, %rax, SIZE) vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PD_ %ymm6,%ymm3,%ymm0 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 
VFMADD231PD_ %ymm8,%ymm2,%ymm0 VFMADD231PD_ %ymm9,%ymm3,%ymm0 .endm .macro KERNEL8x3_2 prefetcht0 64+A_PR1(AO, %rax, SIZE) vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PD_ %ymm6,%ymm3,%ymm0 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 VFMADD231PD_ %ymm8,%ymm2,%ymm0 VFMADD231PD_ %ymm9,%ymm3,%ymm0 .endm .macro KERNEL8x3_3 prefetcht0 128+A_PR1(AO, %rax, SIZE) vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PD_ %ymm6,%ymm3,%ymm0 vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 VFMADD231PD_ %ymm8,%ymm2,%ymm0 VFMADD231PD_ %ymm9,%ymm3,%ymm0 .endm .macro KERNEL8x3_4 prefetcht0 192+A_PR1(AO, %rax, SIZE) vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PD_ %ymm6,%ymm3,%ymm0 vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 VFMADD231PD_ %ymm8,%ymm2,%ymm0 VFMADD231PD_ %ymm9,%ymm3,%ymm0 addq $12, BI addq $32, %rax .endm .macro KERNEL8x3_SUB vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PD_ %ymm6,%ymm3,%ymm0 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 VFMADD231PD_ %ymm8,%ymm2,%ymm0 VFMADD231PD_ %ymm9,%ymm3,%ymm0 addq $3 , BI addq $8 , %rax .endm .macro SAVE8x3 vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm7 , %ymm7 vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm8 , %ymm8 vmulpd %ymm0 , %ymm6 , %ymm6 vmulpd %ymm0 , %ymm9 , %ymm9 #if !defined(TRMMKERNEL) vaddpd (CO1), %ymm4,%ymm4 vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 vaddpd (CO1, LDC), %ymm5,%ymm5 vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 vaddpd (CO1, LDC, 2), %ymm6,%ymm6 vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 #endif vmovups %ymm4 , (CO1) vmovups %ymm7 , 4 * SIZE(CO1) vmovups %ymm5 , (CO1, LDC) vmovups %ymm8 , 4 * SIZE(CO1, LDC) vmovups %ymm6 , (CO1, LDC, 2) vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) .endm /*******************************************************************************************/ .macro KERNEL4x3_1 prefetcht0 A_PR1(AO, %rax, SIZE) vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PD_ %ymm6,%ymm3,%ymm0 .endm .macro KERNEL4x3_2 vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PD_ %ymm6,%ymm3,%ymm0 .endm .macro KERNEL4x3_3 prefetcht0 A_PR1(AO, %rax, SIZE) vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vbroadcastsd 2 * 
SIZE(BO, BI, SIZE), %ymm3 VFMADD231PD_ %ymm6,%ymm3,%ymm0 .endm .macro KERNEL4x3_4 vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PD_ %ymm6,%ymm3,%ymm0 addq $12, BI addq $16, %rax .endm .macro KERNEL4x3_SUB vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PD_ %ymm6,%ymm3,%ymm0 addq $3 , BI addq $4 , %rax .endm .macro SAVE4x3 vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 #if !defined(TRMMKERNEL) vaddpd (CO1), %ymm4,%ymm4 vaddpd (CO1, LDC), %ymm5,%ymm5 vaddpd (CO1, LDC, 2), %ymm6,%ymm6 #endif vmovups %ymm4 , (CO1) vmovups %ymm5 , (CO1, LDC) vmovups %ymm6 , (CO1, LDC, 2) .endm /*******************************************************************************************/ .macro KERNEL2x3_1 prefetcht0 A_PR1(AO, %rax, SIZE) vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231SD_ %xmm6,%xmm3,%xmm0 vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm8,%xmm1,%xmm0 VFMADD231SD_ %xmm10,%xmm2,%xmm0 VFMADD231SD_ %xmm12,%xmm3,%xmm0 .endm .macro KERNEL2x3_2 vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231SD_ %xmm6,%xmm3,%xmm0 vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm8,%xmm1,%xmm0 VFMADD231SD_ %xmm10,%xmm2,%xmm0 VFMADD231SD_ %xmm12,%xmm3,%xmm0 .endm .macro KERNEL2x3_3 vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231SD_ %xmm6,%xmm3,%xmm0 vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm8,%xmm1,%xmm0 VFMADD231SD_ %xmm10,%xmm2,%xmm0 VFMADD231SD_ %xmm12,%xmm3,%xmm0 .endm .macro KERNEL2x3_4 vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231SD_ %xmm6,%xmm3,%xmm0 vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm8,%xmm1,%xmm0 VFMADD231SD_ %xmm10,%xmm2,%xmm0 VFMADD231SD_ %xmm12,%xmm3,%xmm0 addq $12, BI addq $8, %rax .endm .macro KERNEL2x3_SUB vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231SD_ %xmm6,%xmm3,%xmm0 vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm8,%xmm1,%xmm0 VFMADD231SD_ %xmm10,%xmm2,%xmm0 VFMADD231SD_ %xmm12,%xmm3,%xmm0 addq $3 , BI addq $2 , %rax .endm .macro SAVE2x3 vmovsd ALPHA, %xmm0 vmulsd %xmm0 , %xmm4 , %xmm4 vmulsd %xmm0 , %xmm8 , %xmm8 vmulsd %xmm0 , %xmm5 , %xmm5 vmulsd %xmm0 , %xmm10, %xmm10 vmulsd %xmm0 , %xmm6 , %xmm6 vmulsd %xmm0 , %xmm12, %xmm12 #if !defined(TRMMKERNEL) vaddsd (CO1), %xmm4,%xmm4 vaddsd 1 * SIZE(CO1), 
%xmm8,%xmm8 vaddsd (CO1, LDC), %xmm5,%xmm5 vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 vaddsd (CO1, LDC, 2), %xmm6,%xmm6 vaddsd 1 * SIZE(CO1, LDC, 2), %xmm12,%xmm12 #endif vmovsd %xmm4 , (CO1) vmovsd %xmm8 , 1 * SIZE(CO1) vmovsd %xmm5 , (CO1, LDC) vmovsd %xmm10, 1 * SIZE(CO1, LDC) vmovsd %xmm6 , (CO1, LDC, 2) vmovsd %xmm12, 1 * SIZE(CO1, LDC, 2) .endm /*******************************************************************************************/ .macro KERNEL1x3_1 vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231SD_ %xmm6,%xmm3,%xmm0 .endm .macro KERNEL1x3_2 vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231SD_ %xmm6,%xmm3,%xmm0 .endm .macro KERNEL1x3_3 vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231SD_ %xmm6,%xmm3,%xmm0 .endm .macro KERNEL1x3_4 vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231SD_ %xmm6,%xmm3,%xmm0 addq $12, BI addq $4, %rax .endm .macro KERNEL1x3_SUB vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231SD_ %xmm6,%xmm3,%xmm0 addq $3 , BI addq $1 , %rax .endm .macro SAVE1x3 vmovsd ALPHA, %xmm0 vmulsd %xmm0 , %xmm4 , %xmm4 vmulsd %xmm0 , %xmm5 , %xmm5 vmulsd %xmm0 , %xmm6 , %xmm6 #if !defined(TRMMKERNEL) vaddsd (CO1), %xmm4,%xmm4 vaddsd (CO1, LDC), %xmm5,%xmm5 vaddsd (CO1, LDC, 2), %xmm6,%xmm6 #endif vmovsd %xmm4 , (CO1) vmovsd %xmm5 , (CO1, LDC) vmovsd %xmm6 , (CO1, LDC, 2) .endm /*******************************************************************************************/ /******************************************************************************************* * 2 lines of N *******************************************************************************************/ .macro KERNEL16x2_1 prefetcht0 A_PR1(AO, %rax, SIZE) vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 prefetcht0 64+A_PR1(AO, %rax, SIZE) VFMADD231PD_ %ymm8,%ymm2,%ymm0 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm10,%ymm1,%ymm0 VFMADD231PD_ %ymm11,%ymm2,%ymm0 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm13,%ymm1,%ymm0 VFMADD231PD_ %ymm14,%ymm2,%ymm0 .endm .macro KERNEL16x2_2 prefetcht0 128+A_PR1(AO, %rax, SIZE) vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 prefetcht0 192+A_PR1(AO, %rax, SIZE) VFMADD231PD_ %ymm8,%ymm2,%ymm0 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm10,%ymm1,%ymm0 
VFMADD231PD_ %ymm11,%ymm2,%ymm0 vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm13,%ymm1,%ymm0 VFMADD231PD_ %ymm14,%ymm2,%ymm0 .endm .macro KERNEL16x2_3 prefetcht0 256+A_PR1(AO, %rax, SIZE) vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 prefetcht0 320+A_PR1(AO, %rax, SIZE) VFMADD231PD_ %ymm8,%ymm2,%ymm0 vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm10,%ymm1,%ymm0 VFMADD231PD_ %ymm11,%ymm2,%ymm0 vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm13,%ymm1,%ymm0 VFMADD231PD_ %ymm14,%ymm2,%ymm0 .endm .macro KERNEL16x2_4 prefetcht0 384+A_PR1(AO, %rax, SIZE) vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 prefetcht0 448+A_PR1(AO, %rax, SIZE) VFMADD231PD_ %ymm8,%ymm2,%ymm0 vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm10,%ymm1,%ymm0 VFMADD231PD_ %ymm11,%ymm2,%ymm0 vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm13,%ymm1,%ymm0 VFMADD231PD_ %ymm14,%ymm2,%ymm0 addq $8, BI addq $64, %rax .endm .macro KERNEL16x2_SUB vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 VFMADD231PD_ %ymm8,%ymm2,%ymm0 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm10,%ymm1,%ymm0 VFMADD231PD_ %ymm11,%ymm2,%ymm0 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm13,%ymm1,%ymm0 VFMADD231PD_ %ymm14,%ymm2,%ymm0 addq $2, BI addq $16, %rax .endm .macro SAVE16x2 vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm7 , %ymm7 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm8 , %ymm8 vmulpd %ymm0 , %ymm11, %ymm11 vmulpd %ymm0 , %ymm14, %ymm14 #if !defined(TRMMKERNEL) vaddpd (CO1), %ymm4,%ymm4 vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 vaddpd (CO1, LDC), %ymm5,%ymm5 vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 #endif vmovups %ymm4 , (CO1) vmovups %ymm7 , 4 * SIZE(CO1) vmovups %ymm10, 8 * SIZE(CO1) vmovups %ymm13,12 * SIZE(CO1) vmovups %ymm5 , (CO1, LDC) vmovups %ymm8 , 4 * SIZE(CO1, LDC) vmovups %ymm11, 8 * SIZE(CO1, LDC) vmovups %ymm14,12 * SIZE(CO1, LDC) .endm /*******************************************************************************************/ .macro KERNEL8x2_1 prefetcht0 A_PR1(AO, %rax, SIZE) vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 VFMADD231PD_ %ymm8,%ymm2,%ymm0 .endm .macro KERNEL8x2_2 prefetcht0 64+A_PR1(AO, %rax, SIZE) vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 
VFMADD231PD_ %ymm8,%ymm2,%ymm0 .endm .macro KERNEL8x2_3 prefetcht0 128+A_PR1(AO, %rax, SIZE) vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 VFMADD231PD_ %ymm8,%ymm2,%ymm0 .endm .macro KERNEL8x2_4 prefetcht0 192+A_PR1(AO, %rax, SIZE) vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 VFMADD231PD_ %ymm8,%ymm2,%ymm0 addq $8, BI addq $32, %rax .endm .macro KERNEL8x2_SUB vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 VFMADD231PD_ %ymm8,%ymm2,%ymm0 addq $2, BI addq $8 , %rax .endm .macro SAVE8x2 vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm7 , %ymm7 vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm8 , %ymm8 #if !defined(TRMMKERNEL) vaddpd (CO1), %ymm4,%ymm4 vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 vaddpd (CO1, LDC), %ymm5,%ymm5 vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 #endif vmovups %ymm4 , (CO1) vmovups %ymm7 , 4 * SIZE(CO1) vmovups %ymm5 , (CO1, LDC) vmovups %ymm8 , 4 * SIZE(CO1, LDC) .endm /*******************************************************************************************/ .macro KERNEL4x2_1 prefetcht0 A_PR1(AO, %rax, SIZE) vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 .endm .macro KERNEL4x2_2 vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 .endm .macro KERNEL4x2_3 prefetcht0 64+A_PR1(AO, %rax, SIZE) vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 .endm .macro KERNEL4x2_4 vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 addq $8, BI addq $16, %rax .endm .macro KERNEL4x2_SUB vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PD_ %ymm5,%ymm2,%ymm0 addq $2, BI addq $4 , %rax .endm .macro SAVE4x2 vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm5 , %ymm5 #if !defined(TRMMKERNEL) vaddpd (CO1), %ymm4,%ymm4 vaddpd (CO1, LDC), %ymm5,%ymm5 #endif vmovups %ymm4 , (CO1) vmovups %ymm5 , (CO1, LDC) .endm /*******************************************************************************************/ .macro KERNEL2x2_1 prefetcht0 A_PR1(AO, %rax, SIZE) vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm8,%xmm1,%xmm0 VFMADD231SD_ %xmm10,%xmm2,%xmm0 .endm .macro 
KERNEL2x2_2 vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm8,%xmm1,%xmm0 VFMADD231SD_ %xmm10,%xmm2,%xmm0 .endm .macro KERNEL2x2_3 vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm8,%xmm1,%xmm0 VFMADD231SD_ %xmm10,%xmm2,%xmm0 .endm .macro KERNEL2x2_4 vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm8,%xmm1,%xmm0 VFMADD231SD_ %xmm10,%xmm2,%xmm0 addq $8, BI addq $8, %rax .endm .macro KERNEL2x2_SUB vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm8,%xmm1,%xmm0 VFMADD231SD_ %xmm10,%xmm2,%xmm0 addq $2, BI addq $2, %rax .endm .macro SAVE2x2 vmovsd ALPHA, %xmm0 vmulsd %xmm0 , %xmm4 , %xmm4 vmulsd %xmm0 , %xmm8 , %xmm8 vmulsd %xmm0 , %xmm5 , %xmm5 vmulsd %xmm0 , %xmm10, %xmm10 #if !defined(TRMMKERNEL) vaddsd (CO1), %xmm4,%xmm4 vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 vaddsd (CO1, LDC), %xmm5,%xmm5 vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 #endif vmovsd %xmm4 , (CO1) vmovsd %xmm8 , 1 * SIZE(CO1) vmovsd %xmm5 , (CO1, LDC) vmovsd %xmm10, 1 * SIZE(CO1, LDC) .endm /*******************************************************************************************/ .macro KERNEL1x2_1 vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 .endm .macro KERNEL1x2_2 vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 .endm .macro KERNEL1x2_3 vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 .endm .macro KERNEL1x2_4 vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 addq $8, BI addq $4, %rax .endm .macro KERNEL1x2_SUB vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SD_ %xmm5,%xmm2,%xmm0 addq $2, BI addq $1, %rax .endm .macro SAVE1x2 vmovsd ALPHA, %xmm0 vmulsd %xmm0 , %xmm4 , %xmm4 vmulsd %xmm0 , %xmm5 , %xmm5 #if !defined(TRMMKERNEL) vaddsd (CO1), %xmm4,%xmm4 vaddsd (CO1, LDC), %xmm5,%xmm5 #endif vmovsd %xmm4 , (CO1) vmovsd %xmm5 , (CO1, LDC) .endm /*******************************************************************************************/ /******************************************************************************************* * 1 line of N *******************************************************************************************/ .macro KERNEL16x1_1 vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vmovups -28 * 
SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm10,%ymm1,%ymm0 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm13,%ymm1,%ymm0 .endm .macro KERNEL16x1_2 vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm10,%ymm1,%ymm0 vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm13,%ymm1,%ymm0 .endm .macro KERNEL16x1_3 vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm10,%ymm1,%ymm0 vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm13,%ymm1,%ymm0 .endm .macro KERNEL16x1_4 vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm10,%ymm1,%ymm0 vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm13,%ymm1,%ymm0 addq $4, BI addq $64, %rax .endm .macro KERNEL16x1_SUB vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm10,%ymm1,%ymm0 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm13,%ymm1,%ymm0 addq $1, BI addq $16, %rax .endm .macro SAVE16x1 vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm7 , %ymm7 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm13, %ymm13 #if !defined(TRMMKERNEL) vaddpd (CO1), %ymm4,%ymm4 vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 #endif vmovups %ymm4 , (CO1) vmovups %ymm7 , 4 * SIZE(CO1) vmovups %ymm10, 8 * SIZE(CO1) vmovups %ymm13,12 * SIZE(CO1) .endm /*******************************************************************************************/ .macro KERNEL8x1_1 vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 .endm .macro KERNEL8x1_2 vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 .endm .macro KERNEL8x1_3 vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 .endm .macro KERNEL8x1_4 vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 addq $4, BI addq $32, %rax .endm .macro KERNEL8x1_SUB vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm7,%ymm1,%ymm0 addq $1, BI addq $8 , %rax .endm .macro SAVE8x1 vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm7 , %ymm7 #if !defined(TRMMKERNEL) vaddpd (CO1), %ymm4,%ymm4 vaddpd 4 * SIZE(CO1), 
%ymm7,%ymm7 #endif vmovups %ymm4 , (CO1) vmovups %ymm7 , 4 * SIZE(CO1) .endm /*******************************************************************************************/ .macro KERNEL4x1_1 vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 .endm .macro KERNEL4x1_2 vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 .endm .macro KERNEL4x1_3 vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 .endm .macro KERNEL4x1_4 vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 addq $4, BI addq $16, %rax .endm .macro KERNEL4x1_SUB vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 VFMADD231PD_ %ymm4,%ymm1,%ymm0 addq $1, BI addq $4 , %rax .endm .macro SAVE4x1 vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 #if !defined(TRMMKERNEL) vaddpd (CO1), %ymm4,%ymm4 #endif vmovups %ymm4 , (CO1) .endm /*******************************************************************************************/ .macro KERNEL2x1_1 vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm8,%xmm1,%xmm0 .endm .macro KERNEL2x1_2 vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm8,%xmm1,%xmm0 .endm .macro KERNEL2x1_3 vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm8,%xmm1,%xmm0 .endm .macro KERNEL2x1_4 vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm8,%xmm1,%xmm0 addq $4, BI addq $8, %rax .endm .macro KERNEL2x1_SUB vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm8,%xmm1,%xmm0 addq $1, BI addq $2 , %rax .endm .macro SAVE2x1 vmovsd ALPHA, %xmm0 vmulsd %xmm0 , %xmm4 , %xmm4 vmulsd %xmm0 , %xmm8 , %xmm8 #if !defined(TRMMKERNEL) vaddsd (CO1), %xmm4,%xmm4 vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 #endif vmovsd %xmm4 , (CO1) vmovsd %xmm8 , 1 * SIZE(CO1) .endm /*******************************************************************************************/ .macro KERNEL1x1_1 vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 .endm .macro KERNEL1x1_2 vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 .endm .macro KERNEL1x1_3 vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 .endm .macro KERNEL1x1_4 vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 addq $ 4, BI addq $ 4, %rax .endm .macro KERNEL1x1_SUB vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 VFMADD231SD_ %xmm4,%xmm1,%xmm0 addq $ 1, BI addq $ 1 , %rax .endm .macro SAVE1x1 vmovsd ALPHA, %xmm0 vmulsd %xmm0 , %xmm4 , %xmm4 #if !defined(TRMMKERNEL) vaddsd (CO1), %xmm4,%xmm4 #endif vmovsd %xmm4 , (CO1) .endm 
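/*********************************************************************
* Overview of the kernel macros above (illustrative sketch only):
* the KERNEL16x3_* macros accumulate a 16x3 tile of C with FMA,
* roughly equivalent to the following loop nest per k step
* (b[k][0..2] are first broadcast into ymm1..ymm3):
*
*   for (i = 0; i < 16; i += 4) {                  // ymm0 <- a[k][i..i+3]
*       c[i..i+3][0] += a[k][i..i+3] * b[k][0];    // vfmadd231pd
*       c[i..i+3][1] += a[k][i..i+3] * b[k][1];
*       c[i..i+3][2] += a[k][i..i+3] * b[k][2];
*   }
*
* ymm4..ymm15 hold the twelve accumulators; ymm0..ymm3 are scratch.
* The smaller 8/4/2/1-row and 2/1-column variants follow the same
* pattern with fewer registers. The SAVE* macros scale the
* accumulators by ALPHA and, in the non-TRMM build, add the existing
* contents of C before storing. The driver below packs B into
* BUFFER1/BUFFER2 on the stack so that each k step reads its 3 (or 2,
* or 1) b values contiguously.
*********************************************************************/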
/*******************************************************************************************/ #if !defined(TRMMKERNEL) PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC vmovaps %xmm3, %xmm0 #else movq STACKSIZE + 8(%rsp), LDC #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $0, OLD_M je .L999 cmpq $0, OLD_N je .L999 cmpq $0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovsd %xmm0, ALPHA salq $BASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $6, %rdi divq %rdi // N / 6 movq %rax, Ndiv6 // N / 6 movq %rdx, Nmod6 // N % 6 movq Ndiv6, J cmpq $0, J je .L2_0 ALIGN_4 .L6_01: // copy to sub buffer movq K, %rax salq $1,%rax // K * 2 ; read 2 values movq B, BO1 leaq (B,%rax, SIZE), BO2 // next offset to BO2 leaq BUFFER1, BO // first buffer to BO movq K, %rax sarq $3 , %rax // K / 8 jz .L6_01a_2 ALIGN_4 .L6_01a_1: prefetcht0 512(BO1) prefetcht0 512(BO2) prefetchw 512(BO) vmovups 0 * SIZE(BO1), %xmm0 vmovups 2 * SIZE(BO1), %xmm2 vmovups 4 * SIZE(BO1), %xmm4 vmovups 6 * SIZE(BO1), %xmm6 vmovsd 0 * SIZE(BO2), %xmm1 vmovsd 2 * SIZE(BO2), %xmm3 vmovsd 4 * SIZE(BO2), %xmm5 vmovsd 6 * SIZE(BO2), %xmm7 vmovups %xmm0, 0*SIZE(BO) vmovsd %xmm1, 2*SIZE(BO) vmovups %xmm2, 3*SIZE(BO) vmovsd %xmm3, 5*SIZE(BO) vmovups %xmm4, 6*SIZE(BO) vmovsd %xmm5, 8*SIZE(BO) vmovups %xmm6, 9*SIZE(BO) vmovsd %xmm7,11*SIZE(BO) addq $ 8*SIZE,BO1 addq $ 8*SIZE,BO2 addq $ 12*SIZE,BO vmovups 0 * SIZE(BO1), %xmm0 vmovups 2 * SIZE(BO1), %xmm2 vmovups 4 * SIZE(BO1), %xmm4 vmovups 6 * SIZE(BO1), %xmm6 vmovsd 0 * SIZE(BO2), %xmm1 vmovsd 2 * SIZE(BO2), %xmm3 vmovsd 4 * SIZE(BO2), %xmm5 vmovsd 6 * SIZE(BO2), %xmm7 vmovups %xmm0, 0*SIZE(BO) vmovsd %xmm1, 2*SIZE(BO) vmovups %xmm2, 3*SIZE(BO) vmovsd %xmm3, 5*SIZE(BO) vmovups %xmm4, 6*SIZE(BO) vmovsd %xmm5, 8*SIZE(BO) vmovups %xmm6, 9*SIZE(BO) vmovsd %xmm7,11*SIZE(BO) addq $ 8*SIZE,BO1 addq $ 8*SIZE,BO2 addq $ 12*SIZE,BO decq %rax jnz .L6_01a_1 .L6_01a_2: movq K, %rax andq $7, %rax // K % 8 jz .L6_02c ALIGN_4 .L6_02b: vmovups 0 * SIZE(BO1), %xmm0 vmovsd 0 * SIZE(BO2), %xmm2 vmovups %xmm0, 0*SIZE(BO) vmovsd %xmm2, 2*SIZE(BO) addq $ 2*SIZE,BO1 addq $ 2*SIZE,BO2 addq $ 3*SIZE,BO decq %rax jnz .L6_02b .L6_02c: movq K, %rax salq $1,%rax // K * 2 leaq (B,%rax, SIZE), BO1 // next offset to BO1 leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 leaq BUFFER2, BO // second buffer to BO movq K, %rax sarq $3 , %rax // K / 8 jz .L6_02c_2 ALIGN_4 .L6_02c_1: prefetcht0 512(BO2) prefetchw 512(BO) vmovups 0 * SIZE(BO2), %xmm0 vmovups 2 * SIZE(BO2), %xmm2 vmovups 4 * SIZE(BO2), %xmm4 vmovups 6 * SIZE(BO2), %xmm6 vmovsd 1 * SIZE(BO1), %xmm1 vmovsd 3 * SIZE(BO1), %xmm3 vmovsd 5 * SIZE(BO1), %xmm5 vmovsd 7 * SIZE(BO1), %xmm7 vmovsd %xmm1, 0*SIZE(BO) vmovups %xmm0, 1*SIZE(BO) vmovsd %xmm3, 3*SIZE(BO) vmovups %xmm2, 4*SIZE(BO) vmovsd %xmm5, 6*SIZE(BO) vmovups %xmm4, 7*SIZE(BO) vmovsd %xmm7, 9*SIZE(BO) vmovups %xmm6,10*SIZE(BO) addq $8*SIZE,BO1 addq $8*SIZE,BO2 addq $12*SIZE,BO vmovups 0 * SIZE(BO2), %xmm0 vmovups 2 * 
SIZE(BO2), %xmm2 vmovups 4 * SIZE(BO2), %xmm4 vmovups 6 * SIZE(BO2), %xmm6 vmovsd 1 * SIZE(BO1), %xmm1 vmovsd 3 * SIZE(BO1), %xmm3 vmovsd 5 * SIZE(BO1), %xmm5 vmovsd 7 * SIZE(BO1), %xmm7 vmovsd %xmm1, 0*SIZE(BO) vmovups %xmm0, 1*SIZE(BO) vmovsd %xmm3, 3*SIZE(BO) vmovups %xmm2, 4*SIZE(BO) vmovsd %xmm5, 6*SIZE(BO) vmovups %xmm4, 7*SIZE(BO) vmovsd %xmm7, 9*SIZE(BO) vmovups %xmm6,10*SIZE(BO) addq $8*SIZE,BO1 addq $8*SIZE,BO2 addq $12*SIZE,BO decq %rax jnz .L6_02c_1 .L6_02c_2: movq K, %rax andq $7, %rax // K % 8 jz .L6_03c ALIGN_4 .L6_03b: vmovsd 1*SIZE(BO1), %xmm0 vmovups 0*SIZE(BO2), %xmm1 vmovsd %xmm0, 0*SIZE(BO) vmovups %xmm1, 1*SIZE(BO) addq $2*SIZE,BO1 addq $2*SIZE,BO2 addq $3*SIZE,BO decq %rax jnz .L6_03b .L6_03c: movq BO2, B // next offset of B .L6_10: movq C, CO1 leaq (C, LDC, 2), C leaq (C, LDC, 1), C // c += 3 * ldc movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L6_20 ALIGN_4 .L6_11: leaq BUFFER1, BO // first buffer to BO addq $12 * SIZE, BO prefetcht0 (CO1) prefetcht0 (CO1,LDC,1) prefetcht0 (CO1,LDC,2) prefetcht0 64(CO1) prefetcht0 64(CO1,LDC,1) prefetcht0 64(CO1,LDC,2) vzeroall movq K, %rax sarq $1, %rax // K / 8 je .L6_16 ALIGN_5 .L6_12: /* prefetcht0 B_PR1(BO) prefetcht0 B_PR1+64(BO) prefetcht0 B_PR1+128(BO) */ KERNEL16x3_SUBN KERNEL16x3_SUBN /* KERNEL16x3_SUBN KERNEL16x3_SUBN KERNEL16x3_SUBN KERNEL16x3_SUBN KERNEL16x3_SUBN KERNEL16x3_SUBN */ dec %rax jne .L6_12 .L6_16: movq K, %rax andq $1, %rax # if (k & 1) je .L6_19 ALIGN_4 .L6_17: KERNEL16x3_SUBN dec %rax jne .L6_17 ALIGN_4 .L6_19: SAVE16x3 addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L6_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L6_20: // Test rest of M testq $15, M jz .L7_10 // to next 3 lines of N testq $8, M jz .L6_21pre ALIGN_4 /**************************************************************************/ .L6_20_1: leaq BUFFER1, BO // first buffer to BO addq $12 * SIZE, BO vzeroall movq K, %rax sarq $3, %rax je .L6_20_6 ALIGN_4 .L6_20_2: KERNEL8x3_SUBN KERNEL8x3_SUBN KERNEL8x3_SUBN KERNEL8x3_SUBN KERNEL8x3_SUBN KERNEL8x3_SUBN KERNEL8x3_SUBN KERNEL8x3_SUBN dec %rax jne .L6_20_2 ALIGN_4 .L6_20_6: movq K, %rax andq $7, %rax # if (k & 1) je .L6_20_9 ALIGN_4 .L6_20_7: KERNEL8x3_SUBN dec %rax jne .L6_20_7 ALIGN_4 .L6_20_9: SAVE8x3 addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L6_21pre: testq $4, M jz .L6_30 ALIGN_4 .L6_21: leaq BUFFER1, BO // first buffer to BO addq $12 * SIZE, BO vzeroall movq K, %rax sarq $3, %rax je .L6_26 ALIGN_4 .L6_22: KERNEL4x3_SUBN KERNEL4x3_SUBN KERNEL4x3_SUBN KERNEL4x3_SUBN KERNEL4x3_SUBN KERNEL4x3_SUBN KERNEL4x3_SUBN KERNEL4x3_SUBN dec %rax jne .L6_22 ALIGN_4 .L6_26: movq K, %rax andq $7, %rax # if (k & 1) je .L6_29 ALIGN_4 .L6_27: KERNEL4x3_SUBN dec %rax jne .L6_27 ALIGN_4 .L6_29: SAVE4x3 addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L6_30: testq $2, M jz .L6_40 ALIGN_4 .L6_31: leaq BUFFER1, BO // first buffer to BO addq $12 * SIZE, BO vzeroall movq K, %rax sarq $3, %rax je .L6_36 ALIGN_4 .L6_32: KERNEL2x3_SUBN KERNEL2x3_SUBN KERNEL2x3_SUBN KERNEL2x3_SUBN KERNEL2x3_SUBN KERNEL2x3_SUBN KERNEL2x3_SUBN KERNEL2x3_SUBN dec %rax jne .L6_32 ALIGN_4 .L6_36: movq K, %rax andq $7, %rax # if (k & 1) je .L6_39 ALIGN_4 .L6_37: KERNEL2x3_SUBN dec %rax jne .L6_37 ALIGN_4 .L6_39: SAVE2x3 addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L6_40: testq $1, M jz .L7_10 // 
to next 3 lines of N ALIGN_4 .L6_41: leaq BUFFER1, BO // first buffer to BO addq $12 * SIZE, BO vzeroall movq K, %rax sarq $3,%rax je .L6_46 ALIGN_4 .L6_42: KERNEL1x3_SUBN KERNEL1x3_SUBN KERNEL1x3_SUBN KERNEL1x3_SUBN KERNEL1x3_SUBN KERNEL1x3_SUBN KERNEL1x3_SUBN KERNEL1x3_SUBN dec %rax jne .L6_42 ALIGN_4 .L6_46: movq K, %rax andq $7, %rax # if (k & 1) je .L6_49 ALIGN_4 .L6_47: KERNEL1x3_SUBN dec %rax jne .L6_47 ALIGN_4 .L6_49: SAVE1x3 addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 /***************************************************************************************************************/ .L7_10: movq C, CO1 leaq (C, LDC, 2), C leaq (C, LDC, 1), C // c += 3 * ldc movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L7_20 ALIGN_4 .L7_11: leaq BUFFER2, BO // second buffer to BO addq $12 * SIZE, BO prefetcht0 (CO1) prefetcht0 (CO1,LDC,1) prefetcht0 (CO1,LDC,2) prefetcht0 64(CO1) prefetcht0 64(CO1,LDC,1) prefetcht0 64(CO1,LDC,2) vzeroall movq K, %rax sarq $3, %rax // K / 8 je .L7_16 ALIGN_5 .L7_12: /* prefetcht0 B_PR1(BO) prefetcht0 B_PR1+64(BO) prefetcht0 B_PR1+128(BO) */ KERNEL16x3_SUBN KERNEL16x3_SUBN KERNEL16x3_SUBN KERNEL16x3_SUBN KERNEL16x3_SUBN KERNEL16x3_SUBN KERNEL16x3_SUBN KERNEL16x3_SUBN dec %rax jne .L7_12 ALIGN_4 .L7_16: movq K, %rax andq $7, %rax # if (k & 1) je .L7_19 ALIGN_5 .L7_17: KERNEL16x3_SUBN dec %rax jne .L7_17 .L7_19: SAVE16x3 addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L7_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L7_20: // Test rest of M testq $15, M jz .L7_60 // to next 3 lines of N testq $8, M jz .L7_21pre ALIGN_4 /**************************************************************************/ .L7_20_1: leaq BUFFER2, BO // first buffer to BO addq $12 * SIZE, BO vzeroall movq K, %rax sarq $3, %rax je .L7_20_6 ALIGN_4 .L7_20_2: KERNEL8x3_SUBN KERNEL8x3_SUBN KERNEL8x3_SUBN KERNEL8x3_SUBN KERNEL8x3_SUBN KERNEL8x3_SUBN KERNEL8x3_SUBN KERNEL8x3_SUBN dec %rax jne .L7_20_2 ALIGN_4 .L7_20_6: movq K, %rax andq $7, %rax # if (k & 1) je .L7_20_9 ALIGN_4 .L7_20_7: KERNEL8x3_SUBN dec %rax jne .L7_20_7 ALIGN_4 .L7_20_9: SAVE8x3 addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L7_21pre: testq $4, M jz .L7_30 ALIGN_4 .L7_21: leaq BUFFER2, BO // second buffer to BO addq $12 * SIZE, BO vzeroall movq K, %rax sarq $3, %rax je .L7_26 ALIGN_4 .L7_22: KERNEL4x3_SUBN KERNEL4x3_SUBN KERNEL4x3_SUBN KERNEL4x3_SUBN KERNEL4x3_SUBN KERNEL4x3_SUBN KERNEL4x3_SUBN KERNEL4x3_SUBN dec %rax jne .L7_22 ALIGN_4 .L7_26: movq K, %rax andq $7, %rax # if (k & 1) je .L7_29 ALIGN_4 .L7_27: KERNEL4x3_SUBN dec %rax jne .L7_27 ALIGN_4 .L7_29: SAVE4x3 addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L7_30: testq $2, M jz .L7_40 ALIGN_4 .L7_31: leaq BUFFER2, BO // second buffer to BO addq $12 * SIZE, BO vzeroall movq K, %rax sarq $3, %rax je .L7_36 ALIGN_4 .L7_32: KERNEL2x3_SUBN KERNEL2x3_SUBN KERNEL2x3_SUBN KERNEL2x3_SUBN KERNEL2x3_SUBN KERNEL2x3_SUBN KERNEL2x3_SUBN KERNEL2x3_SUBN dec %rax jne .L7_32 ALIGN_4 .L7_36: movq K, %rax andq $7, %rax # if (k & 1) je .L7_39 ALIGN_4 .L7_37: KERNEL2x3_SUBN dec %rax jne .L7_37 ALIGN_4 .L7_39: SAVE2x3 addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L7_40: testq $1, M jz .L7_60 // to next 3 lines of N ALIGN_4 .L7_41: leaq BUFFER2, BO // second buffer to BO addq $12 * SIZE, BO vzeroall movq K, %rax sarq $3, %rax je .L7_46 ALIGN_4 .L7_42: 
KERNEL1x3_SUBN KERNEL1x3_SUBN KERNEL1x3_SUBN KERNEL1x3_SUBN KERNEL1x3_SUBN KERNEL1x3_SUBN KERNEL1x3_SUBN KERNEL1x3_SUBN dec %rax jne .L7_42 ALIGN_4 .L7_46: movq K, %rax andq $7, %rax # if (k & 1) je .L7_49 ALIGN_4 .L7_47: KERNEL1x3_SUBN dec %rax jne .L7_47 ALIGN_4 .L7_49: SAVE1x3 addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L7_60: decq J // j -- jg .L6_01 .L2_0: cmpq $0, Nmod6 // N % 6 == 0 je .L999 /************************************************************************************************ * Loop for Nmod6 / 2 > 0 *************************************************************************************************/ movq Nmod6, J sarq $1, J // j = j / 2 je .L1_0 ALIGN_4 .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax sarq $2, %rax // K / 4 jz .L2_01b ALIGN_4 .L2_01a: prefetcht0 512(BO1) prefetchw 512(BO) vmovups (BO1), %xmm0 vmovups 2*SIZE(BO1), %xmm1 vmovups 4*SIZE(BO1), %xmm2 vmovups 6*SIZE(BO1), %xmm3 vmovups %xmm0, (BO) vmovups %xmm1, 2*SIZE(BO) vmovups %xmm2, 4*SIZE(BO) vmovups %xmm3, 6*SIZE(BO) addq $8*SIZE,BO1 addq $8*SIZE,BO decq %rax jnz .L2_01a .L2_01b: movq K, %rax andq $3, %rax // K % 4 jz .L2_02d ALIGN_4 .L2_02c: vmovups (BO1), %xmm0 vmovups %xmm0, (BO) addq $2*SIZE,BO1 addq $2*SIZE,BO decq %rax jnz .L2_02c .L2_02d: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc movq A, AO // aoffset = a addq $32 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L2_20 ALIGN_4 .L2_11: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax // K = K - ( K % 8 ) je .L2_16 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_12: prefetcht0 B_PR1(BO,BI,8) KERNEL16x2_1 KERNEL16x2_2 KERNEL16x2_3 KERNEL16x2_4 prefetcht0 B_PR1(BO,BI,8) KERNEL16x2_1 KERNEL16x2_2 KERNEL16x2_3 KERNEL16x2_4 je .L2_16 prefetcht0 B_PR1(BO,BI,8) KERNEL16x2_1 KERNEL16x2_2 KERNEL16x2_3 KERNEL16x2_4 prefetcht0 B_PR1(BO,BI,8) KERNEL16x2_1 KERNEL16x2_2 KERNEL16x2_3 KERNEL16x2_4 je .L2_16 jmp .L2_12 ALIGN_4 .L2_16: movq K, %rax andq $7, %rax # if (k & 1) je .L2_19 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_17: KERNEL16x2_SUB jl .L2_17 ALIGN_4 .L2_19: SAVE16x2 addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_20: // Test rest of M testq $15, M jz .L2_60 // to next 3 lines of N testq $8, M jz .L2_21pre ALIGN_4 /**************************************************************************/ .L2_20_1: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_20_6 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_20_2: prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_3 KERNEL8x2_4 prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_3 KERNEL8x2_4 je .L2_20_6 prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_3 KERNEL8x2_4 prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1 KERNEL8x2_2 
KERNEL8x2_3 KERNEL8x2_4 je .L2_20_6 jmp .L2_20_2 ALIGN_4 .L2_20_6: movq K, %rax andq $7, %rax # if (k & 1) je .L2_20_9 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_20_7: KERNEL8x2_SUB jl .L2_20_7 ALIGN_4 .L2_20_9: SAVE8x2 addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L2_21pre: testq $4, M jz .L2_30 ALIGN_4 .L2_21: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_26 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 1 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_22: prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_3 KERNEL4x2_4 prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_3 KERNEL4x2_4 je .L2_26 prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_3 KERNEL4x2_4 prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_3 KERNEL4x2_4 je .L2_26 jmp .L2_22 ALIGN_4 .L2_26: movq K, %rax andq $7, %rax # if (k & 1) je .L2_29 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_27: KERNEL4x2_SUB jl .L2_27 ALIGN_4 .L2_29: SAVE4x2 addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L2_30: testq $2, M jz .L2_40 ALIGN_4 .L2_31: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_36 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_32: KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_3 KERNEL2x2_4 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_3 KERNEL2x2_4 je .L2_36 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_3 KERNEL2x2_4 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_3 KERNEL2x2_4 je .L2_36 jmp .L2_32 ALIGN_4 .L2_36: movq K, %rax andq $7, %rax # if (k & 1) je .L2_39 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_37: KERNEL2x2_SUB jl .L2_37 ALIGN_4 .L2_39: SAVE2x2 addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L2_40: testq $1, M jz .L2_60 // to next 2 lines of N ALIGN_4 .L2_41: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_46 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_42: KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_3 KERNEL1x2_4 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_3 KERNEL1x2_4 je .L2_46 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_3 KERNEL1x2_4 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_3 KERNEL1x2_4 je .L2_46 jmp .L2_42 ALIGN_4 .L2_46: movq K, %rax andq $7, %rax # if (k & 1) je .L2_49 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_47: KERNEL1x2_SUB jl .L2_47 ALIGN_4 .L2_49: SAVE1x2 addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L2_60: decq J // j -- jg .L2_01 // next 2 lines of N .L1_0: 
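/*********************************************************************
* Remainder path for a single leftover column of B (Nmod6 odd). The
* copy loop at .L1_02b packs one b value per k step into BUFFER1, and
* the 16/8/4/2/1-row kernels below compute C(:,j) += alpha * A * b.
* As in the two-column path above, AO and BO are first advanced past
* the packed data and BI / %rax are negated, so the unrolled loops
* index with negative offsets and terminate on the je/jl tests that
* follow each increment.
*********************************************************************/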
/************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovsd (BO1), %xmm0 vmovsd %xmm0, (BO) addq $1*SIZE,BO1 addq $1*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc movq A, AO // aoffset = a addq $32 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L1_20 ALIGN_4 .L1_11: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax // K = K - ( K % 8 ) je .L1_16 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_12: prefetcht0 B_PR1(BO,BI,8) KERNEL16x1_1 KERNEL16x1_2 KERNEL16x1_3 KERNEL16x1_4 KERNEL16x1_1 KERNEL16x1_2 KERNEL16x1_3 KERNEL16x1_4 je .L1_16 prefetcht0 B_PR1(BO,BI,8) KERNEL16x1_1 KERNEL16x1_2 KERNEL16x1_3 KERNEL16x1_4 KERNEL16x1_1 KERNEL16x1_2 KERNEL16x1_3 KERNEL16x1_4 je .L1_16 jmp .L1_12 ALIGN_4 .L1_16: movq K, %rax andq $7, %rax # if (k & 1) je .L1_19 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_17: KERNEL16x1_SUB jl .L1_17 ALIGN_4 .L1_19: SAVE16x1 addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L1_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_20: // Test rest of M testq $15, M jz .L999 testq $8, M jz .L1_21pre ALIGN_4 /**************************************************************************/ .L1_20_1: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_20_6 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_2: prefetcht0 B_PR1(BO,BI,8) KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_3 KERNEL8x1_4 KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_3 KERNEL8x1_4 je .L1_20_6 prefetcht0 B_PR1(BO,BI,8) KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_3 KERNEL8x1_4 KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_3 KERNEL8x1_4 je .L1_20_6 jmp .L1_20_2 ALIGN_4 .L1_20_6: movq K, %rax andq $7, %rax # if (k & 1) je .L1_20_9 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_7: KERNEL8x1_SUB jl .L1_20_7 ALIGN_4 .L1_20_9: SAVE8x1 addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L1_21pre: testq $4, M jz .L1_30 ALIGN_4 .L1_21: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_26 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_22: prefetcht0 B_PR1(BO,BI,8) KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_3 KERNEL4x1_4 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_3 KERNEL4x1_4 je .L1_26 prefetcht0 B_PR1(BO,BI,8) KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_3 KERNEL4x1_4 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_3 KERNEL4x1_4 je .L1_26 jmp .L1_22 ALIGN_4 .L1_26: movq K, %rax andq $7, %rax # if (k 
& 1) je .L1_29 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_27: KERNEL4x1_SUB jl .L1_27 ALIGN_4 .L1_29: SAVE4x1 addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L1_30: testq $2, M jz .L1_40 ALIGN_4 .L1_31: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_36 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_32: KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_3 KERNEL2x1_4 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_3 KERNEL2x1_4 je .L1_36 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_3 KERNEL2x1_4 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_3 KERNEL2x1_4 je .L1_36 jmp .L1_32 ALIGN_4 .L1_36: movq K, %rax andq $7, %rax # if (k & 1) je .L1_39 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_37: KERNEL2x1_SUB jl .L1_37 ALIGN_4 .L1_39: SAVE2x1 addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L1_40: testq $1, M jz .L999 ALIGN_4 .L1_41: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_46 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_42: KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_3 KERNEL1x1_4 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_3 KERNEL1x1_4 je .L1_46 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_3 KERNEL1x1_4 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_3 KERNEL1x1_4 je .L1_46 jmp .L1_42 ALIGN_4 .L1_46: movq K, %rax andq $7, %rax # if (k & 1) je .L1_49 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_47: KERNEL1x1_SUB jl .L1_47 ALIGN_4 .L1_49: SAVE1x1 addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L999: movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE #else /************************************************************************************* * TRMM Kernel *************************************************************************************/ PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif vmovaps %xmm3, %xmm0 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL movsd STACKSIZE + 16(%rsp), %xmm12 #endif #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $0, OLD_M je .L999 cmpq $0, OLD_N je .L999 cmpq $0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovsd %xmm0, 
ALPHA salq $BASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $2, %rdi divq %rdi // N / 6 movq %rax, Ndiv6 // N / 6 movq %rdx, Nmod6 // N % 6 #ifdef TRMMKERNEL vmovsd %xmm12, OFFSET vmovsd %xmm12, KK #ifndef LEFT negq KK #endif #endif movq Ndiv6, J cmpq $0, J je .L1_0 ALIGN_4 .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax sarq $2, %rax // K / 4 jz .L2_01b ALIGN_4 .L2_01a: prefetcht0 512(BO1) prefetchw 512(BO) vmovups (BO1), %xmm0 vmovups 2*SIZE(BO1), %xmm1 vmovups 4*SIZE(BO1), %xmm2 vmovups 6*SIZE(BO1), %xmm3 vmovups %xmm0, (BO) vmovups %xmm1, 2*SIZE(BO) vmovups %xmm2, 4*SIZE(BO) vmovups %xmm3, 6*SIZE(BO) addq $8*SIZE,BO1 addq $8*SIZE,BO decq %rax jnz .L2_01a .L2_01b: movq K, %rax andq $3, %rax // K % 4 jz .L2_02d ALIGN_4 .L2_02c: vmovups (BO1), %xmm0 vmovups %xmm0, (BO) addq $2*SIZE,BO1 addq $2*SIZE,BO decq %rax jnz .L2_02c .L2_02d: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $32 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L2_20 ALIGN_4 .L2_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $16, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L2_16 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_12: prefetcht0 B_PR1(BO,BI,8) KERNEL16x2_1 KERNEL16x2_2 KERNEL16x2_3 KERNEL16x2_4 prefetcht0 B_PR1(BO,BI,8) KERNEL16x2_1 KERNEL16x2_2 KERNEL16x2_3 KERNEL16x2_4 je .L2_16 prefetcht0 B_PR1(BO,BI,8) KERNEL16x2_1 KERNEL16x2_2 KERNEL16x2_3 KERNEL16x2_4 prefetcht0 B_PR1(BO,BI,8) KERNEL16x2_1 KERNEL16x2_2 KERNEL16x2_3 KERNEL16x2_4 je .L2_16 jmp .L2_12 ALIGN_4 .L2_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_19 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_17: KERNEL16x2_SUB jl .L2_17 ALIGN_4 .L2_19: SAVE16x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $16, KK #endif addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M 
***************************************************************************/ .L2_20: // Test rest of M testq $15, M jz .L2_60 // to next 3 lines of N testq $8, M jz .L2_21pre ALIGN_4 /**************************************************************************/ .L2_20_1: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax // number of values in A #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_20_6 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_20_2: prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_3 KERNEL8x2_4 prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_3 KERNEL8x2_4 je .L2_20_6 prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_3 KERNEL8x2_4 prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1 KERNEL8x2_2 KERNEL8x2_3 KERNEL8x2_4 je .L2_20_6 jmp .L2_20_2 ALIGN_4 .L2_20_6: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_20_9 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_20_7: KERNEL8x2_SUB jl .L2_20_7 ALIGN_4 .L2_20_9: SAVE8x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L2_21pre: testq $4, M jz .L2_30 ALIGN_4 .L2_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in A #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_26 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 1 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq 
(AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_22: prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_3 KERNEL4x2_4 prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_3 KERNEL4x2_4 je .L2_26 prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_3 KERNEL4x2_4 prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1 KERNEL4x2_2 KERNEL4x2_3 KERNEL4x2_4 je .L2_26 jmp .L2_22 ALIGN_4 .L2_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_29 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_27: KERNEL4x2_SUB jl .L2_27 ALIGN_4 .L2_29: SAVE4x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L2_30: testq $2, M jz .L2_40 ALIGN_4 .L2_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_36 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_32: KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_3 KERNEL2x2_4 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_3 KERNEL2x2_4 je .L2_36 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_3 KERNEL2x2_4 KERNEL2x2_1 KERNEL2x2_2 KERNEL2x2_3 KERNEL2x2_4 je .L2_36 jmp .L2_32 ALIGN_4 .L2_36: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_39 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_37: KERNEL2x2_SUB jl .L2_37 ALIGN_4 .L2_39: SAVE2x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L2_40: testq $1, M jz .L2_60 // to next 2 lines of N ALIGN_4 .L2_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_46 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_42: KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_3 KERNEL1x2_4 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_3 KERNEL1x2_4 je .L2_46 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_3 KERNEL1x2_4 KERNEL1x2_1 KERNEL1x2_2 KERNEL1x2_3 KERNEL1x2_4 je .L2_46 jmp .L2_42 ALIGN_4 .L2_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_49 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_47: KERNEL1x2_SUB jl .L2_47 ALIGN_4 .L2_49: SAVE1x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L2_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif decq J // j -- jg .L2_01 // next 2 lines of N .L1_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovsd (BO1), %xmm0 vmovsd %xmm0, (BO) addq $1*SIZE,BO1 addq $1*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $32 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L1_20 ALIGN_4 .L1_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $16, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L1_16 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, 
BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_12: prefetcht0 B_PR1(BO,BI,8) KERNEL16x1_1 KERNEL16x1_2 KERNEL16x1_3 KERNEL16x1_4 KERNEL16x1_1 KERNEL16x1_2 KERNEL16x1_3 KERNEL16x1_4 je .L1_16 prefetcht0 B_PR1(BO,BI,8) KERNEL16x1_1 KERNEL16x1_2 KERNEL16x1_3 KERNEL16x1_4 KERNEL16x1_1 KERNEL16x1_2 KERNEL16x1_3 KERNEL16x1_4 je .L1_16 jmp .L1_12 ALIGN_4 .L1_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_19 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_17: KERNEL16x1_SUB jl .L1_17 ALIGN_4 .L1_19: SAVE16x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $16, KK #endif addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L1_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_20: // Test rest of M testq $15, M jz .L999 testq $8, M jz .L1_21pre ALIGN_4 /**************************************************************************/ .L1_20_1: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax // number of values in A #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_20_6 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_2: prefetcht0 B_PR1(BO,BI,8) KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_3 KERNEL8x1_4 KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_3 KERNEL8x1_4 je .L1_20_6 prefetcht0 B_PR1(BO,BI,8) KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_3 KERNEL8x1_4 KERNEL8x1_1 KERNEL8x1_2 KERNEL8x1_3 KERNEL8x1_4 je .L1_20_6 jmp .L1_20_2 ALIGN_4 .L1_20_6: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_20_9 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_7: KERNEL8x1_SUB jl .L1_20_7 ALIGN_4 .L1_20_9: SAVE8x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L1_21pre: testq $4, M jz .L1_30 ALIGN_4 .L1_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && 
defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in A #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_26 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_22: prefetcht0 B_PR1(BO,BI,8) KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_3 KERNEL4x1_4 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_3 KERNEL4x1_4 je .L1_26 prefetcht0 B_PR1(BO,BI,8) KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_3 KERNEL4x1_4 KERNEL4x1_1 KERNEL4x1_2 KERNEL4x1_3 KERNEL4x1_4 je .L1_26 jmp .L1_22 ALIGN_4 .L1_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_29 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_27: KERNEL4x1_SUB jl .L1_27 ALIGN_4 .L1_29: SAVE4x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L1_30: testq $2, M jz .L1_40 ALIGN_4 .L1_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_36 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_32: KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_3 KERNEL2x1_4 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_3 KERNEL2x1_4 je .L1_36 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_3 KERNEL2x1_4 KERNEL2x1_1 KERNEL2x1_2 KERNEL2x1_3 KERNEL2x1_4 je .L1_36 jmp .L1_32 ALIGN_4 .L1_36: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_39 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_37: KERNEL2x1_SUB jl .L1_37 ALIGN_4 .L1_39: SAVE2x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI 
// Index for BO leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L1_40: testq $1, M jz .L999 ALIGN_4 .L1_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_46 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_42: KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_3 KERNEL1x1_4 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_3 KERNEL1x1_4 je .L1_46 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_3 KERNEL1x1_4 KERNEL1x1_1 KERNEL1x1_2 KERNEL1x1_3 KERNEL1x1_4 je .L1_46 jmp .L1_42 ALIGN_4 .L1_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_49 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_47: KERNEL1x1_SUB jl .L1_47 ALIGN_4 .L1_49: SAVE1x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L999: movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE #endif OpenBLAS-0.2.20/kernel/x86_64/dgemm_kernel_4x4_haswell.S000066400000000000000000002021551313527062700223620ustar00rootroot00000000000000/********************************************************************************* Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ /********************************************************************* * 2013/10/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * * 2013/10/27 Saar * Parameter: * DGEMM_DEFAULT_UNROLL_N 4 * DGEMM_DEFAULT_UNROLL_M 4 * DGEMM_DEFAULT_P 512 * DGEMM_DEFAULT_Q 256 * A_PR1 512 * B_PR1 512 * * * Performance at 9216x9216x9216: * 1 thread: 53.3 GFLOPS (MKL: 54) * 2 threads: 100.0 GFLOPS (MKL: 97) * 3 threads: 147.0 GFLOPS (MKL: 133) * 4 threads: 184.0 GFLOPS (MKL: 170) *********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define J %r14 #define OLD_K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define K %r12 #define SP %rbx #define BO1 %rdi #define BO2 %r15 #define BO3 %rbp #ifndef WINDOWS_ABI #define STACKSIZE 96 #define L_BUFFER_SIZE 256*8*12+4096 #else #define STACKSIZE 256 #define L_BUFFER_SIZE 128*8*12+512 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define Ndiv12 24(%rsp) #define Nmod12 32(%rsp) #define N 40(%rsp) #define ALPHA 48(%rsp) #define OFFSET 56(%rsp) #define KK 64(%rsp) #define KKK 72(%rsp) #define BUFFER1 128(%rsp) #if defined(OS_WINDOWS) #if L_BUFFER_SIZE > 16384 #define STACK_TOUCH \ movl $ 0, 4096 * 4(%rsp);\ movl $ 0, 4096 * 3(%rsp);\ movl $ 0, 4096 * 2(%rsp);\ movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 12288 #define STACK_TOUCH \ movl $ 0, 4096 * 3(%rsp);\ movl $ 0, 4096 * 2(%rsp);\ movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 8192 #define STACK_TOUCH \ movl $ 0, 4096 * 2(%rsp);\ movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 4096 #define STACK_TOUCH \ movl $ 0, 4096 * 1(%rsp); #else #define STACK_TOUCH #endif #else #define STACK_TOUCH #endif #define A_PR1 512 #define B_PR1 512 /******************************************************************************************* * Macro definitions *******************************************************************************************/ .macro INIT4x12 vxorpd %ymm4 , %ymm4 , %ymm4 vxorpd %ymm5 , %ymm5 , %ymm5 vxorpd %ymm6 , %ymm6 , %ymm6 vxorpd %ymm7 , %ymm7 , %ymm7 vxorpd %ymm8 , %ymm8 , %ymm8 vxorpd %ymm9 , %ymm9 , %ymm9 vxorpd %ymm10, %ymm10, %ymm10 vxorpd %ymm11, %ymm11, %ymm11 vxorpd %ymm12, %ymm12, %ymm12 vxorpd %ymm13, %ymm13, %ymm13 vxorpd %ymm14, %ymm14, %ymm14 vxorpd %ymm15, %ymm15, %ymm15 .endm .macro KERNEL4x12_I prefetcht0 A_PR1(AO) vmovups -12 * SIZE(BO), %ymm1 prefetcht0 B_PR1(BO) vmovups -16 * SIZE(AO), %ymm0 prefetcht0 B_PR1+64(BO) vmovups -8 * 
SIZE(BO), %ymm2 prefetcht0 B_PR1+128(BO) vmovups -4 * SIZE(BO), %ymm3 vmulpd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+192(BO) vmulpd %ymm0 ,%ymm2 , %ymm8 vmulpd %ymm0 ,%ymm3 , %ymm12 prefetcht0 B_PR1+256(BO) vpermpd $ 0xb1, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 vmulpd %ymm0 ,%ymm3 , %ymm13 vpermpd $ 0x1b, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm6 vmulpd %ymm0 ,%ymm2 , %ymm10 addq $ 12*SIZE, BO vmulpd %ymm0 ,%ymm3 , %ymm14 vpermpd $ 0xb1, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 vmovups -8 * SIZE(BO), %ymm2 vmulpd %ymm0 ,%ymm3 , %ymm15 vmovups -4 * SIZE(BO), %ymm3 .endm .macro KERNEL4x12_M1 prefetcht0 A_PR1(AO) vmovups -16 * SIZE(AO), %ymm0 prefetcht0 B_PR1(BO) vfmadd231pd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+64(BO) vfmadd231pd %ymm0 ,%ymm2 , %ymm8 prefetcht0 B_PR1+128(BO) vfmadd231pd %ymm0 ,%ymm3 , %ymm12 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 vfmadd231pd %ymm0 ,%ymm3 , %ymm14 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 vmovups -4 * SIZE(BO), %ymm3 .endm .macro KERNEL4x12_M2 vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups 0 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vmovups 4 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 vmovups 8 * SIZE(BO), %ymm3 addq $ 24*SIZE, BO .endm .macro KERNEL4x12_E vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 addq $ 12*SIZE, BO .endm .macro KERNEL4x12_SUB vmovups -12 * SIZE(BO), %ymm1 vmovups -16 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vmovups -4 * SIZE(BO), %ymm3 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 12*SIZE, BO vfmadd231pd %ymm0 ,%ymm3 , %ymm13 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 .endm .macro SAVE4x12 vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 vmulpd %ymm0 , %ymm7 , %ymm7 vmulpd %ymm0 , %ymm8 , %ymm8 vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , 
%ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 vpermpd $ 0xb1 , %ymm5, %ymm5 vpermpd $ 0xb1 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 vpermpd $ 0x1b , %ymm2, %ymm2 vpermpd $ 0x1b , %ymm3, %ymm3 vpermpd $ 0xb1 , %ymm2, %ymm2 vpermpd $ 0xb1 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 leaq (CO1, LDC, 2), %rax #if !defined(TRMMKERNEL) vaddpd (CO1), %ymm4, %ymm4 vaddpd (CO1, LDC), %ymm5, %ymm5 vaddpd (%rax), %ymm6, %ymm6 vaddpd (%rax, LDC), %ymm7, %ymm7 #endif vmovups %ymm4 , (CO1) vmovups %ymm5 , (CO1, LDC) vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) prefetcht0 32(CO1) prefetcht0 32(CO1,LDC) prefetcht0 32(%rax) prefetcht0 32(%rax,LDC) vpermpd $ 0xb1 , %ymm9 , %ymm9 vpermpd $ 0xb1 , %ymm11, %ymm11 vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 vpermpd $ 0x1b , %ymm2, %ymm2 vpermpd $ 0x1b , %ymm3, %ymm3 vpermpd $ 0xb1 , %ymm2, %ymm2 vpermpd $ 0xb1 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 leaq (%rax, LDC, 2), %rax leaq (%rax, LDC, 2), %rbp #if !defined(TRMMKERNEL) vaddpd (%rax), %ymm4, %ymm4 vaddpd (%rax, LDC), %ymm5, %ymm5 vaddpd (%rbp), %ymm6, %ymm6 vaddpd (%rbp, LDC), %ymm7, %ymm7 #endif vmovups %ymm4 , (%rax) vmovups %ymm5 , (%rax, LDC) vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) prefetcht0 32(%rax) prefetcht0 32(%rax,LDC) prefetcht0 32(%rbp) prefetcht0 32(%rbp,LDC) vpermpd $ 0xb1 , %ymm13, %ymm13 vpermpd $ 0xb1 , %ymm15, %ymm15 vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 vpermpd $ 0x1b , %ymm2, %ymm2 vpermpd $ 0x1b , %ymm3, %ymm3 vpermpd $ 0xb1 , %ymm2, %ymm2 vpermpd $ 0xb1 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 leaq (%rax, LDC, 4), %rax leaq (%rbp, LDC, 4), %rbp #if !defined(TRMMKERNEL) vaddpd (%rax), %ymm4, %ymm4 vaddpd (%rax, LDC), %ymm5, %ymm5 vaddpd (%rbp), %ymm6, %ymm6 vaddpd (%rbp, LDC), %ymm7, %ymm7 #endif vmovups %ymm4 , (%rax) vmovups %ymm5 , (%rax, LDC) vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) prefetcht0 32(%rax) prefetcht0 32(%rax,LDC) prefetcht0 32(%rbp) prefetcht0 32(%rbp,LDC) addq $ 4*SIZE, CO1 .endm /******************************************************************************************/ .macro INIT2x12 vxorpd %xmm4 , %xmm4 , %xmm4 vxorpd %xmm5 , %xmm5 , %xmm5 vxorpd %xmm6 , %xmm6 , %xmm6 vxorpd %xmm7 , %xmm7 , %xmm7 vxorpd %xmm8 , %xmm8 , %xmm8 vxorpd %xmm9 , %xmm9 , %xmm9 vxorpd %xmm10, %xmm10, %xmm10 vxorpd %xmm11, %xmm11, %xmm11 vxorpd %xmm12, %xmm12, %xmm12 vxorpd %xmm13, %xmm13, %xmm13 vxorpd %xmm14, %xmm14, %xmm14 vxorpd %xmm15, %xmm15, %xmm15 .endm .macro KERNEL2x12_SUB vmovups -16 * SIZE(AO), %xmm0 vmovddup -12 * SIZE(BO), %xmm1 vmovddup -11 * SIZE(BO), %xmm2 vmovddup -10 * SIZE(BO), %xmm3 vfmadd231pd %xmm0 ,%xmm1 , %xmm4 vmovddup -9 * SIZE(BO), %xmm1 vfmadd231pd %xmm0 ,%xmm2 , %xmm5 vmovddup -8 * SIZE(BO), %xmm2 
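/* The vmovddup loads around this point keep refilling two B values per xmm
   register while the vfmadd231pd chain accumulates; the net effect of one
   KERNEL2x12_SUB invocation is a 2x12 rank-1 update. A minimal scalar C
   sketch of that update, where acc/a/b are purely illustrative names and not
   symbols from this file:

       static void micro_2x12(double acc[12][2],
                              const double a[2], const double b[12])
       {
           for (int j = 0; j < 12; j++)      // one vmovddup-broadcast B value per column
               for (int i = 0; i < 2; i++)   // the two A values held in %xmm0
                   acc[j][i] += a[i] * b[j]; // one vfmadd231pd per column
       }

   Each of %xmm4..%xmm15 plays the role of one acc[j] pair. */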
vfmadd231pd %xmm0 ,%xmm3 , %xmm6 vmovddup -7 * SIZE(BO), %xmm3 vfmadd231pd %xmm0 ,%xmm1 , %xmm7 vmovddup -6 * SIZE(BO), %xmm1 vfmadd231pd %xmm0 ,%xmm2 , %xmm8 vmovddup -5 * SIZE(BO), %xmm2 vfmadd231pd %xmm0 ,%xmm3 , %xmm9 vmovddup -4 * SIZE(BO), %xmm3 vfmadd231pd %xmm0 ,%xmm1 , %xmm10 vmovddup -3 * SIZE(BO), %xmm1 vfmadd231pd %xmm0 ,%xmm2 , %xmm11 vmovddup -2 * SIZE(BO), %xmm2 vfmadd231pd %xmm0 ,%xmm3 , %xmm12 vmovddup -1 * SIZE(BO), %xmm3 vfmadd231pd %xmm0 ,%xmm1 , %xmm13 addq $ 12*SIZE, BO vfmadd231pd %xmm0 ,%xmm2 , %xmm14 addq $ 2*SIZE, AO vfmadd231pd %xmm0 ,%xmm3 , %xmm15 .endm .macro SAVE2x12 vmovddup ALPHA, %xmm0 vmulpd %xmm0 , %xmm4 , %xmm4 vmulpd %xmm0 , %xmm5 , %xmm5 vmulpd %xmm0 , %xmm6 , %xmm6 vmulpd %xmm0 , %xmm7 , %xmm7 vmulpd %xmm0 , %xmm8 , %xmm8 vmulpd %xmm0 , %xmm9 , %xmm9 vmulpd %xmm0 , %xmm10, %xmm10 vmulpd %xmm0 , %xmm11, %xmm11 vmulpd %xmm0 , %xmm12, %xmm12 vmulpd %xmm0 , %xmm13, %xmm13 vmulpd %xmm0 , %xmm14, %xmm14 vmulpd %xmm0 , %xmm15, %xmm15 leaq (CO1, LDC, 2), %rax #if !defined(TRMMKERNEL) vaddpd (CO1), %xmm4, %xmm4 vaddpd (CO1, LDC), %xmm5, %xmm5 vaddpd (%rax), %xmm6, %xmm6 vaddpd (%rax, LDC), %xmm7, %xmm7 #endif vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm6 , (%rax) vmovups %xmm7 , (%rax, LDC) leaq (%rax, LDC, 2), %rax leaq (%rax, LDC, 2), %rbp #if !defined(TRMMKERNEL) vaddpd (%rax), %xmm8 , %xmm4 vaddpd (%rax, LDC), %xmm9 , %xmm5 vaddpd (%rbp), %xmm10, %xmm6 vaddpd (%rbp, LDC), %xmm11, %xmm7 #endif vmovups %xmm4 , (%rax) vmovups %xmm5 , (%rax, LDC) vmovups %xmm6 , (%rbp) vmovups %xmm7 , (%rbp, LDC) leaq (%rax, LDC, 4), %rax leaq (%rbp, LDC, 4), %rbp #if !defined(TRMMKERNEL) vaddpd (%rax), %xmm12, %xmm4 vaddpd (%rax, LDC), %xmm13, %xmm5 vaddpd (%rbp), %xmm14, %xmm6 vaddpd (%rbp, LDC), %xmm15, %xmm7 #endif vmovups %xmm4 , (%rax) vmovups %xmm5 , (%rax, LDC) vmovups %xmm6 , (%rbp) vmovups %xmm7 , (%rbp, LDC) addq $ 2*SIZE, CO1 .endm /******************************************************************************************/ .macro INIT1x12 vxorpd %xmm4 , %xmm4 , %xmm4 vxorpd %xmm5 , %xmm5 , %xmm5 vxorpd %xmm6 , %xmm6 , %xmm6 vxorpd %xmm7 , %xmm7 , %xmm7 vxorpd %xmm8 , %xmm8 , %xmm8 vxorpd %xmm9 , %xmm9 , %xmm9 vxorpd %xmm10, %xmm10, %xmm10 vxorpd %xmm11, %xmm11, %xmm11 vxorpd %xmm12, %xmm12, %xmm12 vxorpd %xmm13, %xmm13, %xmm13 vxorpd %xmm14, %xmm14, %xmm14 vxorpd %xmm15, %xmm15, %xmm15 .endm .macro KERNEL1x12_SUB vmovsd -16 * SIZE(AO), %xmm0 vmovsd -12 * SIZE(BO), %xmm1 vmovsd -11 * SIZE(BO), %xmm2 vmovsd -10 * SIZE(BO), %xmm3 vfmadd231sd %xmm0 ,%xmm1 , %xmm4 vmovsd -9 * SIZE(BO), %xmm1 vfmadd231sd %xmm0 ,%xmm2 , %xmm5 vmovsd -8 * SIZE(BO), %xmm2 vfmadd231sd %xmm0 ,%xmm3 , %xmm6 vmovsd -7 * SIZE(BO), %xmm3 vfmadd231sd %xmm0 ,%xmm1 , %xmm7 vmovsd -6 * SIZE(BO), %xmm1 vfmadd231sd %xmm0 ,%xmm2 , %xmm8 vmovsd -5 * SIZE(BO), %xmm2 vfmadd231sd %xmm0 ,%xmm3 , %xmm9 vmovsd -4 * SIZE(BO), %xmm3 vfmadd231sd %xmm0 ,%xmm1 , %xmm10 vmovsd -3 * SIZE(BO), %xmm1 vfmadd231sd %xmm0 ,%xmm2 , %xmm11 vmovsd -2 * SIZE(BO), %xmm2 vfmadd231sd %xmm0 ,%xmm3 , %xmm12 vmovsd -1 * SIZE(BO), %xmm3 vfmadd231sd %xmm0 ,%xmm1 , %xmm13 addq $ 12*SIZE, BO vfmadd231sd %xmm0 ,%xmm2 , %xmm14 addq $ 1*SIZE, AO vfmadd231sd %xmm0 ,%xmm3 , %xmm15 .endm .macro SAVE1x12 vmovsd ALPHA, %xmm0 vmulsd %xmm0 , %xmm4 , %xmm4 vmulsd %xmm0 , %xmm5 , %xmm5 vmulsd %xmm0 , %xmm6 , %xmm6 vmulsd %xmm0 , %xmm7 , %xmm7 vmulsd %xmm0 , %xmm8 , %xmm8 vmulsd %xmm0 , %xmm9 , %xmm9 vmulsd %xmm0 , %xmm10, %xmm10 vmulsd %xmm0 , %xmm11, %xmm11 vmulsd %xmm0 , %xmm12, %xmm12 vmulsd %xmm0 , %xmm13, %xmm13 vmulsd %xmm0 
, %xmm14, %xmm14 vmulsd %xmm0 , %xmm15, %xmm15 leaq (CO1, LDC, 2), %rax #if !defined(TRMMKERNEL) vaddsd (CO1), %xmm4, %xmm4 vaddsd (CO1, LDC), %xmm5, %xmm5 vaddsd (%rax), %xmm6, %xmm6 vaddsd (%rax, LDC), %xmm7, %xmm7 #endif vmovsd %xmm4 , (CO1) vmovsd %xmm5 , (CO1, LDC) vmovsd %xmm6 , (%rax) vmovsd %xmm7 , (%rax, LDC) leaq (%rax, LDC, 2), %rax leaq (%rax, LDC, 2), %rbp #if !defined(TRMMKERNEL) vaddsd (%rax), %xmm8 , %xmm4 vaddsd (%rax, LDC), %xmm9 , %xmm5 vaddsd (%rbp), %xmm10, %xmm6 vaddsd (%rbp, LDC), %xmm11, %xmm7 #endif vmovsd %xmm4 , (%rax) vmovsd %xmm5 , (%rax, LDC) vmovsd %xmm6 , (%rbp) vmovsd %xmm7 , (%rbp, LDC) leaq (%rax, LDC, 4), %rax leaq (%rbp, LDC, 4), %rbp #if !defined(TRMMKERNEL) vaddsd (%rax), %xmm12, %xmm4 vaddsd (%rax, LDC), %xmm13, %xmm5 vaddsd (%rbp), %xmm14, %xmm6 vaddsd (%rbp, LDC), %xmm15, %xmm7 #endif vmovsd %xmm4 , (%rax) vmovsd %xmm5 , (%rax, LDC) vmovsd %xmm6 , (%rbp) vmovsd %xmm7 , (%rbp, LDC) addq $ 1*SIZE, CO1 .endm /******************************************************************************************/ /******************************************************************************************/ .macro INIT4x4 vxorpd %ymm4 , %ymm4 , %ymm4 vxorpd %ymm5 , %ymm5 , %ymm5 vxorpd %ymm6 , %ymm6 , %ymm6 vxorpd %ymm7 , %ymm7 , %ymm7 .endm .macro KERNEL4x4_I prefetcht0 A_PR1(AO) vmovups -12 * SIZE(BO), %ymm1 vmovups -16 * SIZE(AO), %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm4 vpermpd $ 0xb1, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, BO vpermpd $ 0xb1, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 .endm .macro KERNEL4x4_M1 prefetcht0 A_PR1(AO) vmovups -16 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 .endm .macro KERNEL4x4_M2 vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -8 * SIZE(BO), %ymm1 addq $ 8*SIZE, BO .endm .macro KERNEL4x4_E vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 addq $ 4*SIZE, BO .endm .macro KERNEL4x4_SUB vmovups -12 * SIZE(BO), %ymm1 vmovups -16 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 addq $ 4*SIZE, BO vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, AO vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 .endm .macro SAVE4x4 vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm7 , %ymm7 vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 vpermpd $ 0xb1 , %ymm5, %ymm5 vpermpd $ 0xb1 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 vpermpd $ 0x1b , %ymm2, %ymm2 vpermpd $ 0x1b , %ymm3, %ymm3 vpermpd $ 0xb1 , %ymm2, %ymm2 vpermpd $ 0xb1 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, 
%ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 leaq (CO1, LDC, 2), %rax #if !defined(TRMMKERNEL) vaddpd (CO1), %ymm4, %ymm4 vaddpd (CO1, LDC), %ymm5, %ymm5 vaddpd (%rax), %ymm6, %ymm6 vaddpd (%rax, LDC), %ymm7, %ymm7 #endif vmovups %ymm4 , (CO1) vmovups %ymm5 , (CO1, LDC) vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) addq $ 4*SIZE, CO1 .endm /******************************************************************************************/ /******************************************************************************************/ .macro INIT2x4 vxorpd %xmm4 , %xmm4 , %xmm4 vxorpd %xmm5 , %xmm5 , %xmm5 vxorpd %xmm6 , %xmm6 , %xmm6 vxorpd %xmm7 , %xmm7 , %xmm7 .endm .macro KERNEL2x4_SUB vmovddup -12 * SIZE(BO), %xmm1 vmovups -16 * SIZE(AO), %xmm0 vmovddup -11 * SIZE(BO), %xmm2 vfmadd231pd %xmm0 ,%xmm1 , %xmm4 vmovddup -10 * SIZE(BO), %xmm3 vfmadd231pd %xmm0 ,%xmm2 , %xmm5 vmovddup -9 * SIZE(BO), %xmm8 vfmadd231pd %xmm0 ,%xmm3 , %xmm6 addq $ 4*SIZE, BO vfmadd231pd %xmm0 ,%xmm8 , %xmm7 addq $ 2*SIZE, AO .endm .macro SAVE2x4 vmovddup ALPHA, %xmm0 vmulpd %xmm0 , %xmm4 , %xmm4 vmulpd %xmm0 , %xmm5 , %xmm5 vmulpd %xmm0 , %xmm6 , %xmm6 vmulpd %xmm0 , %xmm7 , %xmm7 leaq (CO1, LDC, 2), %rax #if !defined(TRMMKERNEL) vaddpd (CO1), %xmm4, %xmm4 vaddpd (CO1, LDC), %xmm5, %xmm5 vaddpd (%rax), %xmm6, %xmm6 vaddpd (%rax, LDC), %xmm7, %xmm7 #endif vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm6 , (%rax) vmovups %xmm7 , (%rax, LDC) addq $ 2*SIZE, CO1 .endm /******************************************************************************************/ /******************************************************************************************/ .macro INIT1x4 vxorpd %xmm4 , %xmm4 , %xmm4 vxorpd %xmm5 , %xmm5 , %xmm5 vxorpd %xmm6 , %xmm6 , %xmm6 vxorpd %xmm7 , %xmm7 , %xmm7 .endm .macro KERNEL1x4_SUB vmovsd -12 * SIZE(BO), %xmm1 vmovsd -16 * SIZE(AO), %xmm0 vmovsd -11 * SIZE(BO), %xmm2 vfmadd231sd %xmm0 ,%xmm1 , %xmm4 vmovsd -10 * SIZE(BO), %xmm3 vfmadd231sd %xmm0 ,%xmm2 , %xmm5 vmovsd -9 * SIZE(BO), %xmm8 vfmadd231sd %xmm0 ,%xmm3 , %xmm6 addq $ 4*SIZE, BO vfmadd231sd %xmm0 ,%xmm8 , %xmm7 addq $ 1*SIZE, AO .endm .macro SAVE1x4 vmovsd ALPHA, %xmm0 vmulsd %xmm0 , %xmm4 , %xmm4 vmulsd %xmm0 , %xmm5 , %xmm5 vmulsd %xmm0 , %xmm6 , %xmm6 vmulsd %xmm0 , %xmm7 , %xmm7 leaq (CO1, LDC, 2), %rax #if !defined(TRMMKERNEL) vaddsd (CO1), %xmm4, %xmm4 vaddsd (CO1, LDC), %xmm5, %xmm5 vaddsd (%rax), %xmm6, %xmm6 vaddsd (%rax, LDC), %xmm7, %xmm7 #endif vmovsd %xmm4 , (CO1) vmovsd %xmm5 , (CO1, LDC) vmovsd %xmm6 , (%rax) vmovsd %xmm7 , (%rax, LDC) addq $ 1*SIZE, CO1 .endm /******************************************************************************************/ /******************************************************************************************/ .macro INIT4x2 vxorpd %xmm4 , %xmm4 , %xmm4 vxorpd %xmm5 , %xmm5 , %xmm5 vxorpd %xmm6 , %xmm6 , %xmm6 vxorpd %xmm7 , %xmm7 , %xmm7 .endm .macro KERNEL4x2_SUB vmovddup -12 * SIZE(BO), %xmm2 vmovups -16 * SIZE(AO), %xmm0 vmovups -14 * SIZE(AO), %xmm1 vmovddup -11 * SIZE(BO), %xmm3 vfmadd231pd %xmm0 ,%xmm2 , %xmm4 vfmadd231pd %xmm1 ,%xmm2 , %xmm5 vfmadd231pd %xmm0 ,%xmm3 , %xmm6 vfmadd231pd %xmm1 ,%xmm3 , %xmm7 addq $ 2*SIZE, BO addq $ 4*SIZE, AO .endm .macro SAVE4x2 vmovddup ALPHA, %xmm0 vmulpd %xmm0 , %xmm4 , %xmm4 vmulpd %xmm0 , %xmm5 , %xmm5 vmulpd %xmm0 , %xmm6 , %xmm6 vmulpd %xmm0 , %xmm7 , %xmm7 #if !defined(TRMMKERNEL) vaddpd (CO1) , %xmm4, %xmm4 vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 vaddpd (CO1, LDC), %xmm6, %xmm6 vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 #endif 
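/* For the plain GEMM build the vaddpd block above folds the existing C tile
   into the alpha-scaled result before the stores below; under TRMMKERNEL the
   adds are skipped and C is simply overwritten (beta never appears in this
   kernel). A minimal C sketch of this save step, with c/ldc/alpha/tile/trmm
   as illustrative names only:

       static void save_4x2(double *c, long ldc, double alpha,
                            const double tile[2][4], int trmm)
       {
           for (int j = 0; j < 2; j++)        // columns CO1 and CO1 + LDC
               for (int i = 0; i < 4; i++) {  // rows from %xmm4/%xmm5 and %xmm6/%xmm7
                   double v = alpha * tile[j][i];
                   c[j * ldc + i] = trmm ? v : c[j * ldc + i] + v;
               }
       }
*/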
vmovups %xmm4 , (CO1) vmovups %xmm5 , 2 * SIZE(CO1) vmovups %xmm6 , (CO1, LDC) vmovups %xmm7 , 2 * SIZE(CO1, LDC) addq $ 4*SIZE, CO1 .endm /******************************************************************************************/ /******************************************************************************************/ .macro INIT2x2 vxorpd %xmm4 , %xmm4 , %xmm4 vxorpd %xmm6 , %xmm6 , %xmm6 .endm .macro KERNEL2x2_SUB vmovddup -12 * SIZE(BO), %xmm2 vmovups -16 * SIZE(AO), %xmm0 vmovddup -11 * SIZE(BO), %xmm3 vfmadd231pd %xmm0 ,%xmm2 , %xmm4 vfmadd231pd %xmm0 ,%xmm3 , %xmm6 addq $ 2*SIZE, BO addq $ 2*SIZE, AO .endm .macro SAVE2x2 vmovddup ALPHA, %xmm0 vmulpd %xmm0 , %xmm4 , %xmm4 vmulpd %xmm0 , %xmm6 , %xmm6 #if !defined(TRMMKERNEL) vaddpd (CO1) , %xmm4, %xmm4 vaddpd (CO1, LDC), %xmm6, %xmm6 #endif vmovups %xmm4 , (CO1) vmovups %xmm6 , (CO1, LDC) addq $ 2*SIZE, CO1 .endm /******************************************************************************************/ /******************************************************************************************/ .macro INIT1x2 vxorpd %xmm4 , %xmm4 , %xmm4 vxorpd %xmm5 , %xmm5 , %xmm5 .endm .macro KERNEL1x2_SUB vmovsd -12 * SIZE(BO), %xmm1 vmovsd -16 * SIZE(AO), %xmm0 vmovsd -11 * SIZE(BO), %xmm2 vfmadd231sd %xmm0 ,%xmm1 , %xmm4 vfmadd231sd %xmm0 ,%xmm2 , %xmm5 addq $ 2*SIZE, BO addq $ 1*SIZE, AO .endm .macro SAVE1x2 vmovsd ALPHA, %xmm0 vmulsd %xmm0 , %xmm4 , %xmm4 vmulsd %xmm0 , %xmm5 , %xmm5 #if !defined(TRMMKERNEL) vaddsd (CO1), %xmm4, %xmm4 vaddsd (CO1, LDC), %xmm5, %xmm5 #endif vmovsd %xmm4 , (CO1) vmovsd %xmm5 , (CO1, LDC) addq $ 1*SIZE, CO1 .endm /******************************************************************************************/ /******************************************************************************************/ .macro INIT4x1 vxorpd %ymm4 , %ymm4 , %ymm4 vxorpd %ymm5 , %ymm5 , %ymm5 vxorpd %ymm6 , %ymm6 , %ymm6 vxorpd %ymm7 , %ymm7 , %ymm7 .endm .macro KERNEL4x1 vbroadcastsd -12 * SIZE(BO), %ymm0 vbroadcastsd -11 * SIZE(BO), %ymm1 vbroadcastsd -10 * SIZE(BO), %ymm2 vbroadcastsd -9 * SIZE(BO), %ymm3 vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 vbroadcastsd -8 * SIZE(BO), %ymm0 vbroadcastsd -7 * SIZE(BO), %ymm1 vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6 vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 vbroadcastsd -6 * SIZE(BO), %ymm2 vbroadcastsd -5 * SIZE(BO), %ymm3 vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 addq $ 8 *SIZE, BO addq $ 32*SIZE, AO .endm .macro KERNEL4x1_SUB vbroadcastsd -12 * SIZE(BO), %ymm2 vmovups -16 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm2 , %ymm4 addq $ 1*SIZE, BO addq $ 4*SIZE, AO .endm .macro SAVE4x1 vbroadcastsd ALPHA, %ymm0 vaddpd %ymm4,%ymm5, %ymm4 vaddpd %ymm6,%ymm7, %ymm6 vaddpd %ymm4,%ymm6, %ymm4 vmulpd %ymm0 , %ymm4 , %ymm4 #if !defined(TRMMKERNEL) vaddpd (CO1) , %ymm4, %ymm4 #endif vmovups %ymm4 , (CO1) addq $ 4*SIZE, CO1 .endm /******************************************************************************************/ /******************************************************************************************/ .macro INIT2x1 vxorpd %xmm4 , %xmm4 , %xmm4 .endm .macro KERNEL2x1_SUB vmovddup -12 * SIZE(BO), %xmm2 vmovups -16 * SIZE(AO), %xmm0 vfmadd231pd %xmm0 ,%xmm2 , %xmm4 addq $ 1*SIZE, BO addq $ 2*SIZE, AO .endm .macro SAVE2x1 vmovddup ALPHA, %xmm0 vmulpd %xmm0 , %xmm4 , %xmm4 #if !defined(TRMMKERNEL) vaddpd (CO1) , %xmm4, %xmm4 #endif vmovups 
%xmm4 , (CO1) addq $ 2*SIZE, CO1 .endm /******************************************************************************************/ /******************************************************************************************/ .macro INIT1x1 vxorpd %xmm4 , %xmm4 , %xmm4 .endm .macro KERNEL1x1_SUB vmovsd -12 * SIZE(BO), %xmm1 vmovsd -16 * SIZE(AO), %xmm0 vfmadd231sd %xmm0 ,%xmm1 , %xmm4 addq $ 1*SIZE, BO addq $ 1*SIZE, AO .endm .macro SAVE1x1 vmovsd ALPHA, %xmm0 vmulsd %xmm0 , %xmm4 , %xmm4 #if !defined(TRMMKERNEL) vaddsd (CO1), %xmm4, %xmm4 #endif vmovsd %xmm4 , (CO1) addq $ 1*SIZE, CO1 .endm /*******************************************************************************************/ #if !defined(TRMMKERNEL) PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) vmovups %xmm6, 64(%rsp) vmovups %xmm7, 80(%rsp) vmovups %xmm8, 96(%rsp) vmovups %xmm9, 112(%rsp) vmovups %xmm10, 128(%rsp) vmovups %xmm11, 144(%rsp) vmovups %xmm12, 160(%rsp) vmovups %xmm13, 176(%rsp) vmovups %xmm14, 192(%rsp) vmovups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC vmovups %xmm3, %xmm0 #else movq STACKSIZE + 8(%rsp), LDC #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $ 0, OLD_M je .L999 cmpq $ 0, OLD_N je .L999 cmpq $ 0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovsd %xmm0, ALPHA salq $BASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $12, %rdi divq %rdi // N / 12 movq %rax, Ndiv12 // N / 12 movq %rdx, Nmod12 // N % 12 movq Ndiv12, J cmpq $ 0, J je .L4_0 ALIGN_4 .L12_01: // copy to sub buffer movq K, %rax salq $2,%rax // K * 4 ; read 2 values movq B, BO1 leaq (B,%rax, SIZE), BO2 // next offset to BO2 leaq (BO2,%rax, SIZE), BO3 // next offset to BO2 leaq BUFFER1, BO // first buffer to BO movq K, %rax sarq $1 , %rax // K / 2 jz .L12_01a_2 ALIGN_4 .L12_01a_1: prefetcht0 512(BO1) prefetcht0 512(BO2) prefetcht0 512(BO3) prefetchw 512(BO) vmovups 0 * SIZE(BO1), %ymm1 vmovups 4 * SIZE(BO1), %ymm5 vmovups 0 * SIZE(BO2), %ymm2 vmovups 4 * SIZE(BO2), %ymm6 vmovups 0 * SIZE(BO3), %ymm3 vmovups 4 * SIZE(BO3), %ymm7 vmovups %ymm1, 0 * SIZE(BO) vmovups %ymm2, 4 * SIZE(BO) vmovups %ymm3, 8 * SIZE(BO) vmovups %ymm5, 12 * SIZE(BO) vmovups %ymm6, 16 * SIZE(BO) vmovups %ymm7, 20 * SIZE(BO) addq $ 8 * SIZE ,BO1 addq $ 8 * SIZE ,BO2 addq $ 8 * SIZE ,BO3 addq $ 24 *SIZE ,BO decq %rax jnz .L12_01a_1 .L12_01a_2: movq K, %rax andq $1, %rax // K % 2 jz .L12_03c ALIGN_4 .L12_02b: vmovups 0 * SIZE(BO1), %ymm1 vmovups 0 * SIZE(BO2), %ymm2 vmovups 0 * SIZE(BO3), %ymm3 vmovups %ymm1, 0 * SIZE(BO) vmovups %ymm2, 4 * SIZE(BO) vmovups %ymm3, 8 * SIZE(BO) addq $ 4*SIZE,BO1 addq $ 4*SIZE,BO2 addq $ 4*SIZE,BO3 addq $ 12*SIZE,BO decq %rax jnz .L12_02b .L12_03c: movq BO3, B // next offset of B .L12_10: movq C, CO1 leaq (C, LDC, 8), C leaq (C, LDC, 4), C // c += 12 * ldc movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $2, I // i = m / 4 je .L12_20 ALIGN_4 .L12_11: leaq BUFFER1, BO // first buffer to BO addq $12 * SIZE, BO movq K, %rax sarq $3, %rax // K / 8 cmpq $2, %rax jl .L12_13 KERNEL4x12_I KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 subq $2, %rax je .L12_12a ALIGN_5 .L12_12: KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 
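/* Each KERNEL4x12_M1/KERNEL4x12_M2 pair in this unrolled body covers two k
   iterations: M1 works on the block at the current AO/BO without moving the
   pointers, M2 uses the next block and then advances AO by 8 doubles and BO
   by 24, so the four pairs per trip match the "sarq $3, %rax" (K / 8) bound
   set above. Ignoring the register permutations that SAVE4x12 later undoes,
   the whole k loop accumulates a plain 4x12 product tile; a minimal C
   sketch, with acc/A/B/K as illustrative names only:

       static void kloop_4x12(long K, double acc[12][4],
                              const double *A, const double *B)
       {
           for (long k = 0; k < K; k++, A += 4, B += 12)  // one KERNEL4x12_* step
               for (int j = 0; j < 12; j++)
                   for (int i = 0; i < 4; i++)
                       acc[j][i] += A[i] * B[j];
       }

   The K & 7 leftovers are finished one step at a time with KERNEL4x12_SUB
   at .L12_17. */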
KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 dec %rax jne .L12_12 .L12_12a: KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_E jmp .L12_16 .L12_13: test $1, %rax jz .L12_14 KERNEL4x12_I KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_E jmp .L12_16 .L12_14: INIT4x12 .L12_16: movq K, %rax andq $7, %rax # if (k & 1) je .L12_19 ALIGN_4 .L12_17: KERNEL4x12_SUB dec %rax jne .L12_17 ALIGN_4 .L12_19: SAVE4x12 decq I # i -- jne .L12_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L12_20: // Test rest of M testq $3, M jz .L12_100 // to next 16 lines of N .L12_30: testq $2, M jz .L12_40 ALIGN_4 .L12_31: leaq BUFFER1, BO // first buffer to BO addq $12 * SIZE, BO INIT2x12 movq K, %rax sarq $3, %rax je .L12_36 ALIGN_4 .L12_32: KERNEL2x12_SUB KERNEL2x12_SUB KERNEL2x12_SUB KERNEL2x12_SUB KERNEL2x12_SUB KERNEL2x12_SUB KERNEL2x12_SUB KERNEL2x12_SUB dec %rax jne .L12_32 ALIGN_4 .L12_36: movq K, %rax andq $7, %rax # if (k & 1) je .L12_39 ALIGN_4 .L12_37: KERNEL2x12_SUB dec %rax jne .L12_37 ALIGN_4 .L12_39: SAVE2x12 ALIGN_4 .L12_40: testq $1, M jz .L12_100 // to next 3 lines of N ALIGN_4 .L12_41: leaq BUFFER1, BO // first buffer to BO addq $12 * SIZE, BO INIT1x12 movq K, %rax sarq $3,%rax je .L12_46 ALIGN_4 .L12_42: KERNEL1x12_SUB KERNEL1x12_SUB KERNEL1x12_SUB KERNEL1x12_SUB KERNEL1x12_SUB KERNEL1x12_SUB KERNEL1x12_SUB KERNEL1x12_SUB dec %rax jne .L12_42 ALIGN_4 .L12_46: movq K, %rax andq $7, %rax # if (k & 1) je .L12_49 ALIGN_4 .L12_47: KERNEL1x12_SUB dec %rax jne .L12_47 ALIGN_4 .L12_49: SAVE1x12 ALIGN_4 .L12_100: decq J // j -- jg .L12_01 .L4_0: cmpq $ 0, Nmod12 // N % 12 == 0 je .L999 movq Nmod12, J sarq $2, J // j = j / 4 je .L2_0 .L4_10: movq C, CO1 leaq (C, LDC, 4), C // c += 4 * ldc movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $2, I // i = m / 4 je .L4_20 ALIGN_4 .L4_11: movq B, BO addq $12 * SIZE, BO movq K, %rax sarq $3, %rax // K / 8 cmpq $2, %rax jl .L4_13 KERNEL4x4_I KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 subq $2, %rax je .L4_12a ALIGN_5 .L4_12: KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 dec %rax jne .L4_12 .L4_12a: KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E jmp .L4_16 .L4_13: test $1, %rax jz .L4_14 KERNEL4x4_I KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E jmp .L4_16 .L4_14: INIT4x4 .L4_16: movq K, %rax andq $7, %rax # if (k & 1) je .L4_19 ALIGN_4 .L4_17: KERNEL4x4_SUB dec %rax jne .L4_17 ALIGN_4 .L4_19: SAVE4x4 decq I # i -- jg .L4_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L4_20: // Test rest of M testq $3, M jz .L4_100 // to next 16 lines of N .L4_30: testq $2, M jz .L4_40 ALIGN_4 .L4_31: movq B, BO // first buffer to BO addq $12 * SIZE, BO INIT2x4 movq K, %rax sarq $3, %rax je .L4_36 ALIGN_4 .L4_32: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB dec %rax jne .L4_32 ALIGN_4 .L4_36: movq K, %rax andq $7, %rax # if (k & 1) je .L4_39 ALIGN_4 .L4_37: KERNEL2x4_SUB dec %rax jne .L4_37 .L4_39: SAVE2x4 .L4_40: testq $1, M jz .L4_100 // to next 3 
lines of N ALIGN_4 .L4_41: movq B, BO // first buffer to BO addq $12 * SIZE, BO INIT1x4 movq K, %rax sarq $3,%rax je .L4_46 ALIGN_4 .L4_42: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB dec %rax jne .L4_42 ALIGN_4 .L4_46: movq K, %rax andq $7, %rax # if (k & 1) je .L4_49 ALIGN_4 .L4_47: KERNEL1x4_SUB dec %rax jne .L4_47 ALIGN_4 .L4_49: SAVE1x4 ALIGN_4 .L4_100: movq K, %rax salq $2, %rax // * 4 leaq (B , %rax, SIZE), B decq J // j -- jg .L4_10 /***************************************************************************************************************/ .L2_0: movq Nmod12, J testq $2, J je .L1_0 .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $2, I // i = m / 4 je .L2_20 ALIGN_4 .L2_11: movq B, BO addq $12 * SIZE, BO INIT4x2 movq K, %rax sarq $3, %rax // K / 8 je .L2_16 ALIGN_5 .L2_12: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB dec %rax jne .L2_12 .L2_16: movq K, %rax andq $7, %rax # if (k & 1) je .L2_19 ALIGN_4 .L2_17: KERNEL4x2_SUB dec %rax jne .L2_17 ALIGN_4 .L2_19: SAVE4x2 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_20: // Test rest of M testq $3, M jz .L2_100 // to next 16 lines of N .L2_30: testq $2, M jz .L2_40 ALIGN_4 .L2_31: movq B, BO // first buffer to BO addq $12 * SIZE, BO INIT2x2 movq K, %rax sarq $3, %rax je .L2_36 ALIGN_4 .L2_32: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB dec %rax jne .L2_32 .L2_36: movq K, %rax andq $7, %rax # if (k & 1) je .L2_39 ALIGN_4 .L2_37: KERNEL2x2_SUB dec %rax jne .L2_37 .L2_39: SAVE2x2 .L2_40: testq $1, M jz .L2_100 // to next 3 lines of N .L2_41: movq B, BO // first buffer to BO addq $12 * SIZE, BO INIT1x2 movq K, %rax sarq $3,%rax je .L2_46 ALIGN_4 .L2_42: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB dec %rax jne .L2_42 .L2_46: movq K, %rax andq $7, %rax # if (k & 1) je .L2_49 ALIGN_4 .L2_47: KERNEL1x2_SUB dec %rax jne .L2_47 .L2_49: SAVE1x2 .L2_100: movq K, %rax salq $1, %rax // * 2 leaq (B , %rax, SIZE), B /***************************************************************************************************************/ .L1_0: movq Nmod12, J testq $1, J je .L999 .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $2, I // i = m / 4 je .L1_20 ALIGN_4 .L1_11: movq B, BO addq $12 * SIZE, BO INIT4x1 movq K, %rax sarq $3, %rax // K / 8 je .L1_16 ALIGN_5 .L1_12: KERNEL4x1 dec %rax jne .L1_12 .L1_16: movq K, %rax andq $7, %rax # if (k & 1) je .L1_19 ALIGN_4 .L1_17: KERNEL4x1_SUB dec %rax jne .L1_17 ALIGN_4 .L1_19: SAVE4x1 decq I # i -- jg .L1_11 /************************************************************************** * Rest of M ***************************************************************************/ .L1_20: // Test rest of M testq $3, M jz .L1_100 .L1_30: testq $2, M jz .L1_40 ALIGN_4 .L1_31: movq B, BO // first buffer to BO addq $12 * SIZE, BO INIT2x1 movq K, %rax sarq $3, %rax je .L1_36 ALIGN_4 .L1_32: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB dec %rax jne .L1_32 .L1_36: movq K, %rax andq $7, %rax # if (k & 1) je .L1_39 
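/* Same two-level k loop as the wider tiles: .L1_32 above ran K / 8 trips of
   eight KERNEL2x1_SUB calls, and the short loop below finishes the K & 7
   leftovers one call at a time. A minimal C sketch of the pattern, with
   acc/A/B/K as illustrative names only:

       static void kloop_2x1(long K, double acc[2],
                             const double *A, const double *B)
       {
           long k = 0;
           for (; k + 8 <= K; k += 8)             // .L1_32: unrolled by eight
               for (int u = 0; u < 8; u++, A += 2, B += 1) {
                   acc[0] += A[0] * B[0];         // one KERNEL2x1_SUB
                   acc[1] += A[1] * B[0];
               }
           for (; k < K; k++, A += 2, B += 1) {   // .L1_37: the remainder
               acc[0] += A[0] * B[0];
               acc[1] += A[1] * B[0];
           }
       }
*/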
ALIGN_4 .L1_37: KERNEL2x1_SUB dec %rax jne .L1_37 .L1_39: SAVE2x1 .L1_40: testq $1, M jz .L1_100 // to next 3 lines of N .L1_41: movq B, BO // first buffer to BO addq $12 * SIZE, BO INIT1x1 movq K, %rax sarq $3,%rax je .L1_46 ALIGN_4 .L1_42: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB dec %rax jne .L1_42 .L1_46: movq K, %rax andq $7, %rax # if (k & 1) je .L1_49 ALIGN_4 .L1_47: KERNEL1x1_SUB dec %rax jne .L1_47 .L1_49: SAVE1x1 .L1_100: .L999: vzeroupper movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi vmovups 64(%rsp), %xmm6 vmovups 80(%rsp), %xmm7 vmovups 96(%rsp), %xmm8 vmovups 112(%rsp), %xmm9 vmovups 128(%rsp), %xmm10 vmovups 144(%rsp), %xmm11 vmovups 160(%rsp), %xmm12 vmovups 176(%rsp), %xmm13 vmovups 192(%rsp), %xmm14 vmovups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE #else /************************************************************************************* * TRMM Kernel *************************************************************************************/ PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) vmovups %xmm6, 64(%rsp) vmovups %xmm7, 80(%rsp) vmovups %xmm8, 96(%rsp) vmovups %xmm9, 112(%rsp) vmovups %xmm10, 128(%rsp) vmovups %xmm11, 144(%rsp) vmovups %xmm12, 160(%rsp) vmovups %xmm13, 176(%rsp) vmovups %xmm14, 192(%rsp) vmovups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL vmovsd OLD_OFFSET, %xmm12 #endif vmovups %xmm3, %xmm0 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL vmovsd STACKSIZE + 16(%rsp), %xmm12 #endif #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $ 0, OLD_M je .L999 cmpq $ 0, OLD_N je .L999 cmpq $ 0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovsd %xmm0, ALPHA salq $BASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $4, %rdi divq %rdi // N / 4 movq %rax, Ndiv12 // N / 4 movq %rdx, Nmod12 // N % 4 #ifdef TRMMKERNEL vmovsd %xmm12, OFFSET vmovsd %xmm12, KK #ifndef LEFT negq KK #endif #endif movq Ndiv12, J cmpq $ 0, J je .L2_0 ALIGN_4 .L4_10: movq C, CO1 leaq (C, LDC, 4), C // c += 4 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $2, I // i = m / 4 je .L4_20 ALIGN_4 .L4_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO addq $12 * SIZE, BO #else movq B, BO addq $12 * SIZE, BO movq KK, %rax salq $3, %rax // rax * SIZE leaq (BO,%rax,4), BO // add number of values in B leaq (AO,%rax,4), AO // add number of values in A #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in AO #else addq $4, %rax // number of values in BO #endif movq %rax, KKK #endif sarq $3, %rax // K / 8 cmpq $2, %rax jl .L4_13 KERNEL4x4_I KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 
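/* Software-pipelined 4x4 main loop (TRMM path): KERNEL4x4_I primes the
   register pipeline, KERNEL4x4_M1/KERNEL4x4_M2 alternate through the steady
   state, and KERNEL4x4_E drains it. Each run of eight macro calls advances
   the k index by 8, matching the "sarq $3, %rax  // K / 8" trip count above. */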
subq $2, %rax je .L4_12a ALIGN_5 .L4_12: KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 dec %rax jne .L4_12 .L4_12a: KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E jmp .L4_16 .L4_13: test $1, %rax jz .L4_14 KERNEL4x4_I KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E jmp .L4_16 .L4_14: INIT4x4 .L4_16: movq KKK, %rax andq $7, %rax # if (k & 1) je .L4_19 ALIGN_4 .L4_17: KERNEL4x4_SUB dec %rax jne .L4_17 ALIGN_4 .L4_19: SAVE4x4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $3, %rax // rax + SIZE leaq (BO, %rax, 4), BO // number of values in B leaq (AO, %rax, 4), AO // number of values in A #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK // number of values in A #endif decq I # i -- jg .L4_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L4_20: // Test rest of M testq $3, M jz .L4_100 // to next 16 lines of N .L4_30: testq $2, M jz .L4_40 ALIGN_4 .L4_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO addq $12 * SIZE, BO #else movq B, BO addq $12 * SIZE, BO movq KK, %rax salq $3, %rax // rax * SIZE leaq (BO,%rax,4), BO // add number of values in B leaq (AO,%rax,2), AO // add number of values in A #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $4, %rax // number of values in BO #endif movq %rax, KKK #endif INIT2x4 sarq $3, %rax je .L4_36 ALIGN_4 .L4_32: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB dec %rax jne .L4_32 ALIGN_4 .L4_36: movq KKK, %rax andq $7, %rax # if (k & 1) je .L4_39 ALIGN_4 .L4_37: KERNEL2x4_SUB dec %rax jne .L4_37 .L4_39: SAVE2x4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $3, %rax // rax + SIZE leaq (BO, %rax, 4), BO // number of values in B leaq (AO, %rax, 2), AO // number of values in A #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK // number of values in A #endif .L4_40: testq $1, M jz .L4_100 // to next 3 lines of N ALIGN_4 .L4_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO addq $12 * SIZE, BO #else movq B, BO addq $12 * SIZE, BO movq KK, %rax salq $3, %rax // rax * SIZE leaq (BO,%rax,4), BO // add number of values in B leaq (AO,%rax,1), AO // add number of values in A #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $4, %rax // number of values in BO #endif movq %rax, KKK #endif INIT1x4 sarq $3,%rax je .L4_46 ALIGN_4 .L4_42: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB dec %rax jne .L4_42 
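/* TRMM bookkeeping: in this path the tail count below is taken from KKK
   (the effective inner dimension for the current tile) rather than from K,
   which is what the plain GEMM path uses. */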
ALIGN_4 .L4_46: movq KKK, %rax andq $7, %rax # if (k & 1) je .L4_49 ALIGN_4 .L4_47: KERNEL1x4_SUB dec %rax jne .L4_47 ALIGN_4 .L4_49: SAVE1x4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $3, %rax // rax + SIZE leaq (BO, %rax, 4), BO // number of values in B leaq (AO, %rax, 1), AO // number of values in A #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK // number of values in A #endif .L4_100: #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK // number of values in B #endif movq K, %rax salq $2, %rax // * 4 leaq (B , %rax, SIZE), B decq J // j -- jg .L4_10 /***************************************************************************************************************/ .L2_0: movq Nmod12, J testq $2, J je .L1_0 .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $2, I // i = m / 4 je .L2_20 ALIGN_4 .L2_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO addq $12 * SIZE, BO #else movq B, BO addq $12 * SIZE, BO movq KK, %rax salq $3, %rax // rax * SIZE leaq (BO,%rax,2), BO // add number of values in B leaq (AO,%rax,4), AO // add number of values in A #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif INIT4x2 sarq $3, %rax // K / 8 je .L2_16 ALIGN_5 .L2_12: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB dec %rax jne .L2_12 .L2_16: movq KKK, %rax andq $7, %rax # if (k & 1) je .L2_19 ALIGN_4 .L2_17: KERNEL4x2_SUB dec %rax jne .L2_17 ALIGN_4 .L2_19: SAVE4x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $3, %rax // rax + SIZE leaq (BO, %rax, 2), BO // number of values in B leaq (AO, %rax, 4), AO // number of values in A #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK // number of values in A #endif decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_20: // Test rest of M testq $3, M jz .L2_100 // to next 16 lines of N .L2_30: testq $2, M jz .L2_40 ALIGN_4 .L2_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO addq $12 * SIZE, BO #else movq B, BO addq $12 * SIZE, BO movq KK, %rax salq $3, %rax // rax * SIZE leaq (BO,%rax,2), BO // add number of values in B leaq (AO,%rax,2), AO // add number of values in A #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif INIT2x2 sarq $3, %rax je .L2_36 ALIGN_4 .L2_32: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB 
KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB dec %rax jne .L2_32 .L2_36: movq KKK, %rax andq $7, %rax # if (k & 1) je .L2_39 ALIGN_4 .L2_37: KERNEL2x2_SUB dec %rax jne .L2_37 .L2_39: SAVE2x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $3, %rax // rax + SIZE leaq (BO, %rax, 2), BO // number of values in B leaq (AO, %rax, 2), AO // number of values in A #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK // number of values in A #endif .L2_40: testq $1, M jz .L2_100 // to next 3 lines of N .L2_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO addq $12 * SIZE, BO #else movq B, BO addq $12 * SIZE, BO movq KK, %rax salq $3, %rax // rax * SIZE leaq (BO,%rax,2), BO // add number of values in B leaq (AO,%rax,1), AO // add number of values in A #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif INIT1x2 sarq $3,%rax je .L2_46 ALIGN_4 .L2_42: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB dec %rax jne .L2_42 .L2_46: movq KKK, %rax andq $7, %rax # if (k & 1) je .L2_49 ALIGN_4 .L2_47: KERNEL1x2_SUB dec %rax jne .L2_47 .L2_49: SAVE1x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $3, %rax // rax * SIZE leaq (BO, %rax, 2), BO // number of values in B leaq (AO, %rax, 1), AO // number of values in A #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK // number of values in A #endif .L2_100: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK // number of values in B #endif movq K, %rax salq $1, %rax // * 2 leaq (B , %rax, SIZE), B /***************************************************************************************************************/ .L1_0: movq Nmod12, J testq $1, J je .L999 .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $2, I // i = m / 4 je .L1_20 ALIGN_4 .L1_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO addq $12 * SIZE, BO #else movq B, BO addq $12 * SIZE, BO movq KK, %rax salq $3, %rax // rax * SIZE leaq (BO,%rax,1), BO // add number of values in B leaq (AO,%rax,4), AO // add number of values in A #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif INIT4x1 sarq $3, %rax // K / 8 je .L1_16 ALIGN_5 .L1_12: KERNEL4x1 dec %rax jne .L1_12 .L1_16: movq KKK, %rax andq $7, %rax # if (k & 1) je .L1_19 ALIGN_4 .L1_17: KERNEL4x1_SUB dec %rax jne .L1_17 ALIGN_4 .L1_19: SAVE4x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) movq K, %rax subq KKK, %rax salq $3, %rax // rax * SIZE leaq (BO, %rax, 1), BO // number of values in B leaq (AO, %rax, 4), AO // number of values in A #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK // number of values in A #endif decq I # i -- jg .L1_11 /************************************************************************** * Rest of M ***************************************************************************/ .L1_20: // Test rest of M testq $3, M jz .L1_100 .L1_30: testq $2, M jz .L1_40 ALIGN_4 .L1_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO addq $12 * SIZE, BO #else movq B, BO addq $12 * SIZE, BO movq KK, %rax salq $3, %rax // rax * SIZE leaq (BO,%rax,1), BO // add number of values in B leaq (AO,%rax,2), AO // add number of values in A #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif INIT2x1 sarq $3, %rax je .L1_36 ALIGN_4 .L1_32: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB dec %rax jne .L1_32 .L1_36: movq KKK, %rax andq $7, %rax # if (k & 1) je .L1_39 ALIGN_4 .L1_37: KERNEL2x1_SUB dec %rax jne .L1_37 .L1_39: SAVE2x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $3, %rax // rax * SIZE leaq (BO, %rax, 1), BO // number of values in B leaq (AO, %rax, 2), AO // number of values in A #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK // number of values in A #endif .L1_40: testq $1, M jz .L1_100 // to next 3 lines of N .L1_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO addq $12 * SIZE, BO #else movq B, BO addq $12 * SIZE, BO movq KK, %rax salq $3, %rax // rax * SIZE leaq (BO,%rax,1), BO // add number of values in B leaq (AO,%rax,1), AO // add number of values in A #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif INIT1x1 sarq $3,%rax je .L1_46 ALIGN_4 .L1_42: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB dec %rax jne .L1_42 .L1_46: movq KKK, %rax andq $7, %rax # if (k & 1) je .L1_49 ALIGN_4 .L1_47: KERNEL1x1_SUB dec %rax jne .L1_47 .L1_49: SAVE1x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $3, %rax // rax * SIZE leaq (BO, %rax, 1), BO // number of values in B leaq (AO, %rax, 1), AO // number of values in A #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK // number of values in A #endif .L1_100: #if defined(TRMMKERNEL) && !defined(LEFT) addq $1, KK // number of values in B #endif .L999: vzeroupper movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 
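/* Under WINDOWS_ABI the prologue also saved rdi, rsi and xmm6-xmm15, so the
   epilogue restores them here before the stack pointer is rewound by
   STACKSIZE and control returns to the caller. */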
#ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi vmovups 64(%rsp), %xmm6 vmovups 80(%rsp), %xmm7 vmovups 96(%rsp), %xmm8 vmovups 112(%rsp), %xmm9 vmovups 128(%rsp), %xmm10 vmovups 144(%rsp), %xmm11 vmovups 160(%rsp), %xmm12 vmovups 176(%rsp), %xmm13 vmovups 192(%rsp), %xmm14 vmovups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE #endif OpenBLAS-0.2.20/kernel/x86_64/dgemm_kernel_4x8_haswell.S000066400000000000000000002553431313527062700223750ustar00rootroot00000000000000/********************************************************************************* Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define J %r14 #define OLD_K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define K %r12 #define SP %rbx #define BO1 %rdi #define BO2 %r15 #define BO3 %rbp #ifndef WINDOWS_ABI #define STACKSIZE 96 #define L_BUFFER_SIZE 256*8*12+4096 #else #define STACKSIZE 256 #define L_BUFFER_SIZE 128*8*12+512 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define Ndiv12 24(%rsp) #define Nmod12 32(%rsp) #define N 40(%rsp) #define ALPHA 48(%rsp) #define OFFSET 56(%rsp) #define KK 64(%rsp) #define KKK 72(%rsp) #define BUFFER1 128(%rsp) #if defined(OS_WINDOWS) #if L_BUFFER_SIZE > 16384 #define STACK_TOUCH \ movl $ 0, 4096 * 4(%rsp);\ movl $ 0, 4096 * 3(%rsp);\ movl $ 0, 4096 * 2(%rsp);\ movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 12288 #define STACK_TOUCH \ movl $ 0, 4096 * 3(%rsp);\ movl $ 0, 4096 * 2(%rsp);\ movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 8192 #define STACK_TOUCH \ movl $ 0, 4096 * 2(%rsp);\ movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 4096 #define STACK_TOUCH \ movl $ 0, 4096 * 1(%rsp); #else #define STACK_TOUCH #endif #else #define STACK_TOUCH #endif #define A_PR1 512 #define B_PR1 512 /******************************************************************************************* * Macro definitions *******************************************************************************************/ .macro INIT4x12 vxorpd %ymm4 , %ymm4 , %ymm4 vxorpd %ymm5 , %ymm5 , %ymm5 vxorpd %ymm6 , %ymm6 , %ymm6 vxorpd %ymm7 , %ymm7 , %ymm7 vxorpd %ymm8 , %ymm8 , %ymm8 vxorpd %ymm9 , %ymm9 , %ymm9 vxorpd %ymm10, %ymm10, %ymm10 vxorpd %ymm11, %ymm11, %ymm11 vxorpd %ymm12, %ymm12, %ymm12 vxorpd %ymm13, %ymm13, %ymm13 vxorpd %ymm14, %ymm14, %ymm14 vxorpd %ymm15, %ymm15, %ymm15 .endm .macro KERNEL4x12_I prefetcht0 A_PR1(AO) vmovups -12 * SIZE(BO), %ymm1 prefetcht0 B_PR1(BO) vmovups -16 * SIZE(AO), %ymm0 prefetcht0 B_PR1+64(BO) vmovups -8 * SIZE(BO), %ymm2 prefetcht0 B_PR1+128(BO) vmovups -4 * SIZE(BO), %ymm3 vmulpd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+192(BO) vmulpd %ymm0 ,%ymm2 , %ymm8 vmulpd %ymm0 ,%ymm3 , %ymm12 prefetcht0 B_PR1+256(BO) vpermpd $ 0xb1, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 vmulpd %ymm0 ,%ymm3 , %ymm13 vpermpd $ 0x1b, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm6 vmulpd %ymm0 ,%ymm2 , %ymm10 addq $ 12*SIZE, BO vmulpd %ymm0 ,%ymm3 , %ymm14 vpermpd $ 0xb1, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 vmovups -8 * SIZE(BO), %ymm2 vmulpd %ymm0 ,%ymm3 , %ymm15 vmovups -4 * SIZE(BO), %ymm3 .endm .macro KERNEL4x12_M1 prefetcht0 A_PR1(AO) vmovups -16 * SIZE(AO), %ymm0 prefetcht0 B_PR1(BO) vfmadd231pd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+64(BO) vfmadd231pd %ymm0 ,%ymm2 , %ymm8 prefetcht0 B_PR1+128(BO) vfmadd231pd %ymm0 ,%ymm3 , %ymm12 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 vfmadd231pd %ymm0 ,%ymm3 , %ymm14 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 
vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 vmovups -4 * SIZE(BO), %ymm3 .endm .macro KERNEL4x12_M2 vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups 0 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vmovups 4 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 vmovups 8 * SIZE(BO), %ymm3 addq $ 24*SIZE, BO .endm .macro KERNEL4x12_E vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 addq $ 12*SIZE, BO .endm .macro KERNEL4x12_SUB vmovups -12 * SIZE(BO), %ymm1 vmovups -16 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vmovups -4 * SIZE(BO), %ymm3 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 12*SIZE, BO vfmadd231pd %ymm0 ,%ymm3 , %ymm13 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 .endm .macro SAVE4x12 vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 vmulpd %ymm0 , %ymm7 , %ymm7 vmulpd %ymm0 , %ymm8 , %ymm8 vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 vpermpd $ 0xb1 , %ymm5, %ymm5 vpermpd $ 0xb1 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 vpermpd $ 0x1b , %ymm2, %ymm2 vpermpd $ 0x1b , %ymm3, %ymm3 vpermpd $ 0xb1 , %ymm2, %ymm2 vpermpd $ 0xb1 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 leaq (CO1, LDC, 2), %rax #if !defined(TRMMKERNEL) vaddpd (CO1), %ymm4, %ymm4 vaddpd (CO1, LDC), %ymm5, %ymm5 vaddpd (%rax), %ymm6, %ymm6 vaddpd (%rax, LDC), %ymm7, %ymm7 #endif vmovups %ymm4 , (CO1) vmovups %ymm5 , (CO1, LDC) vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) prefetcht0 32(CO1) prefetcht0 32(CO1,LDC) prefetcht0 32(%rax) prefetcht0 32(%rax,LDC) vpermpd $ 0xb1 , %ymm9 , %ymm9 vpermpd $ 0xb1 , %ymm11, %ymm11 vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 vpermpd $ 0x1b , %ymm2, %ymm2 vpermpd $ 0x1b , %ymm3, %ymm3 vpermpd 
$ 0xb1 , %ymm2, %ymm2 vpermpd $ 0xb1 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 leaq (%rax, LDC, 2), %rax leaq (%rax, LDC, 2), %rbp #if !defined(TRMMKERNEL) vaddpd (%rax), %ymm4, %ymm4 vaddpd (%rax, LDC), %ymm5, %ymm5 vaddpd (%rbp), %ymm6, %ymm6 vaddpd (%rbp, LDC), %ymm7, %ymm7 #endif vmovups %ymm4 , (%rax) vmovups %ymm5 , (%rax, LDC) vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) prefetcht0 32(%rax) prefetcht0 32(%rax,LDC) prefetcht0 32(%rbp) prefetcht0 32(%rbp,LDC) vpermpd $ 0xb1 , %ymm13, %ymm13 vpermpd $ 0xb1 , %ymm15, %ymm15 vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 vpermpd $ 0x1b , %ymm2, %ymm2 vpermpd $ 0x1b , %ymm3, %ymm3 vpermpd $ 0xb1 , %ymm2, %ymm2 vpermpd $ 0xb1 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 leaq (%rax, LDC, 4), %rax leaq (%rbp, LDC, 4), %rbp #if !defined(TRMMKERNEL) vaddpd (%rax), %ymm4, %ymm4 vaddpd (%rax, LDC), %ymm5, %ymm5 vaddpd (%rbp), %ymm6, %ymm6 vaddpd (%rbp, LDC), %ymm7, %ymm7 #endif vmovups %ymm4 , (%rax) vmovups %ymm5 , (%rax, LDC) vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) prefetcht0 32(%rax) prefetcht0 32(%rax,LDC) prefetcht0 32(%rbp) prefetcht0 32(%rbp,LDC) addq $ 4*SIZE, CO1 .endm /******************************************************************************************/ .macro INIT2x12 vxorpd %xmm4 , %xmm4 , %xmm4 vxorpd %xmm5 , %xmm5 , %xmm5 vxorpd %xmm6 , %xmm6 , %xmm6 vxorpd %xmm7 , %xmm7 , %xmm7 vxorpd %xmm8 , %xmm8 , %xmm8 vxorpd %xmm9 , %xmm9 , %xmm9 vxorpd %xmm10, %xmm10, %xmm10 vxorpd %xmm11, %xmm11, %xmm11 vxorpd %xmm12, %xmm12, %xmm12 vxorpd %xmm13, %xmm13, %xmm13 vxorpd %xmm14, %xmm14, %xmm14 vxorpd %xmm15, %xmm15, %xmm15 .endm .macro KERNEL2x12_SUB vmovups -16 * SIZE(AO), %xmm0 vmovddup -12 * SIZE(BO), %xmm1 vmovddup -11 * SIZE(BO), %xmm2 vmovddup -10 * SIZE(BO), %xmm3 vfmadd231pd %xmm0 ,%xmm1 , %xmm4 vmovddup -9 * SIZE(BO), %xmm1 vfmadd231pd %xmm0 ,%xmm2 , %xmm5 vmovddup -8 * SIZE(BO), %xmm2 vfmadd231pd %xmm0 ,%xmm3 , %xmm6 vmovddup -7 * SIZE(BO), %xmm3 vfmadd231pd %xmm0 ,%xmm1 , %xmm7 vmovddup -6 * SIZE(BO), %xmm1 vfmadd231pd %xmm0 ,%xmm2 , %xmm8 vmovddup -5 * SIZE(BO), %xmm2 vfmadd231pd %xmm0 ,%xmm3 , %xmm9 vmovddup -4 * SIZE(BO), %xmm3 vfmadd231pd %xmm0 ,%xmm1 , %xmm10 vmovddup -3 * SIZE(BO), %xmm1 vfmadd231pd %xmm0 ,%xmm2 , %xmm11 vmovddup -2 * SIZE(BO), %xmm2 vfmadd231pd %xmm0 ,%xmm3 , %xmm12 vmovddup -1 * SIZE(BO), %xmm3 vfmadd231pd %xmm0 ,%xmm1 , %xmm13 addq $ 12*SIZE, BO vfmadd231pd %xmm0 ,%xmm2 , %xmm14 addq $ 2*SIZE, AO vfmadd231pd %xmm0 ,%xmm3 , %xmm15 .endm .macro SAVE2x12 vmovddup ALPHA, %xmm0 vmulpd %xmm0 , %xmm4 , %xmm4 vmulpd %xmm0 , %xmm5 , %xmm5 vmulpd %xmm0 , %xmm6 , %xmm6 vmulpd %xmm0 , %xmm7 , %xmm7 vmulpd %xmm0 , %xmm8 , %xmm8 vmulpd %xmm0 , %xmm9 , %xmm9 vmulpd %xmm0 , %xmm10, %xmm10 vmulpd %xmm0 , %xmm11, %xmm11 vmulpd %xmm0 , %xmm12, %xmm12 vmulpd %xmm0 , %xmm13, %xmm13 vmulpd %xmm0 , %xmm14, %xmm14 vmulpd %xmm0 , %xmm15, %xmm15 leaq (CO1, LDC, 2), %rax #if !defined(TRMMKERNEL) vaddpd (CO1), %xmm4, %xmm4 vaddpd (CO1, LDC), %xmm5, %xmm5 vaddpd (%rax), %xmm6, %xmm6 vaddpd (%rax, LDC), %xmm7, %xmm7 #endif vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm6 , (%rax) vmovups %xmm7 , (%rax, LDC) leaq (%rax, LDC, 2), %rax leaq (%rax, LDC, 
2), %rbp #if !defined(TRMMKERNEL) vaddpd (%rax), %xmm8 , %xmm4 vaddpd (%rax, LDC), %xmm9 , %xmm5 vaddpd (%rbp), %xmm10, %xmm6 vaddpd (%rbp, LDC), %xmm11, %xmm7 #endif vmovups %xmm4 , (%rax) vmovups %xmm5 , (%rax, LDC) vmovups %xmm6 , (%rbp) vmovups %xmm7 , (%rbp, LDC) leaq (%rax, LDC, 4), %rax leaq (%rbp, LDC, 4), %rbp #if !defined(TRMMKERNEL) vaddpd (%rax), %xmm12, %xmm4 vaddpd (%rax, LDC), %xmm13, %xmm5 vaddpd (%rbp), %xmm14, %xmm6 vaddpd (%rbp, LDC), %xmm15, %xmm7 #endif vmovups %xmm4 , (%rax) vmovups %xmm5 , (%rax, LDC) vmovups %xmm6 , (%rbp) vmovups %xmm7 , (%rbp, LDC) addq $ 2*SIZE, CO1 .endm /******************************************************************************************/ .macro INIT1x12 vxorpd %xmm4 , %xmm4 , %xmm4 vxorpd %xmm5 , %xmm5 , %xmm5 vxorpd %xmm6 , %xmm6 , %xmm6 vxorpd %xmm7 , %xmm7 , %xmm7 vxorpd %xmm8 , %xmm8 , %xmm8 vxorpd %xmm9 , %xmm9 , %xmm9 vxorpd %xmm10, %xmm10, %xmm10 vxorpd %xmm11, %xmm11, %xmm11 vxorpd %xmm12, %xmm12, %xmm12 vxorpd %xmm13, %xmm13, %xmm13 vxorpd %xmm14, %xmm14, %xmm14 vxorpd %xmm15, %xmm15, %xmm15 .endm .macro KERNEL1x12_SUB vmovsd -16 * SIZE(AO), %xmm0 vmovsd -12 * SIZE(BO), %xmm1 vmovsd -11 * SIZE(BO), %xmm2 vmovsd -10 * SIZE(BO), %xmm3 vfmadd231sd %xmm0 ,%xmm1 , %xmm4 vmovsd -9 * SIZE(BO), %xmm1 vfmadd231sd %xmm0 ,%xmm2 , %xmm5 vmovsd -8 * SIZE(BO), %xmm2 vfmadd231sd %xmm0 ,%xmm3 , %xmm6 vmovsd -7 * SIZE(BO), %xmm3 vfmadd231sd %xmm0 ,%xmm1 , %xmm7 vmovsd -6 * SIZE(BO), %xmm1 vfmadd231sd %xmm0 ,%xmm2 , %xmm8 vmovsd -5 * SIZE(BO), %xmm2 vfmadd231sd %xmm0 ,%xmm3 , %xmm9 vmovsd -4 * SIZE(BO), %xmm3 vfmadd231sd %xmm0 ,%xmm1 , %xmm10 vmovsd -3 * SIZE(BO), %xmm1 vfmadd231sd %xmm0 ,%xmm2 , %xmm11 vmovsd -2 * SIZE(BO), %xmm2 vfmadd231sd %xmm0 ,%xmm3 , %xmm12 vmovsd -1 * SIZE(BO), %xmm3 vfmadd231sd %xmm0 ,%xmm1 , %xmm13 addq $ 12*SIZE, BO vfmadd231sd %xmm0 ,%xmm2 , %xmm14 addq $ 1*SIZE, AO vfmadd231sd %xmm0 ,%xmm3 , %xmm15 .endm .macro SAVE1x12 vmovsd ALPHA, %xmm0 vmulsd %xmm0 , %xmm4 , %xmm4 vmulsd %xmm0 , %xmm5 , %xmm5 vmulsd %xmm0 , %xmm6 , %xmm6 vmulsd %xmm0 , %xmm7 , %xmm7 vmulsd %xmm0 , %xmm8 , %xmm8 vmulsd %xmm0 , %xmm9 , %xmm9 vmulsd %xmm0 , %xmm10, %xmm10 vmulsd %xmm0 , %xmm11, %xmm11 vmulsd %xmm0 , %xmm12, %xmm12 vmulsd %xmm0 , %xmm13, %xmm13 vmulsd %xmm0 , %xmm14, %xmm14 vmulsd %xmm0 , %xmm15, %xmm15 leaq (CO1, LDC, 2), %rax #if !defined(TRMMKERNEL) vaddsd (CO1), %xmm4, %xmm4 vaddsd (CO1, LDC), %xmm5, %xmm5 vaddsd (%rax), %xmm6, %xmm6 vaddsd (%rax, LDC), %xmm7, %xmm7 #endif vmovsd %xmm4 , (CO1) vmovsd %xmm5 , (CO1, LDC) vmovsd %xmm6 , (%rax) vmovsd %xmm7 , (%rax, LDC) leaq (%rax, LDC, 2), %rax leaq (%rax, LDC, 2), %rbp #if !defined(TRMMKERNEL) vaddsd (%rax), %xmm8 , %xmm4 vaddsd (%rax, LDC), %xmm9 , %xmm5 vaddsd (%rbp), %xmm10, %xmm6 vaddsd (%rbp, LDC), %xmm11, %xmm7 #endif vmovsd %xmm4 , (%rax) vmovsd %xmm5 , (%rax, LDC) vmovsd %xmm6 , (%rbp) vmovsd %xmm7 , (%rbp, LDC) leaq (%rax, LDC, 4), %rax leaq (%rbp, LDC, 4), %rbp #if !defined(TRMMKERNEL) vaddsd (%rax), %xmm12, %xmm4 vaddsd (%rax, LDC), %xmm13, %xmm5 vaddsd (%rbp), %xmm14, %xmm6 vaddsd (%rbp, LDC), %xmm15, %xmm7 #endif vmovsd %xmm4 , (%rax) vmovsd %xmm5 , (%rax, LDC) vmovsd %xmm6 , (%rbp) vmovsd %xmm7 , (%rbp, LDC) addq $ 1*SIZE, CO1 .endm /******************************************************************************************/ .macro INIT4x8 vxorpd %ymm4 , %ymm4 , %ymm4 vxorpd %ymm5 , %ymm5 , %ymm5 vxorpd %ymm6 , %ymm6 , %ymm6 vxorpd %ymm7 , %ymm7 , %ymm7 vxorpd %ymm8 , %ymm8 , %ymm8 vxorpd %ymm9 , %ymm9 , %ymm9 vxorpd %ymm10, %ymm10, %ymm10 vxorpd %ymm11, %ymm11, 
%ymm11 .endm .macro KERNEL4x8_I vmovups -12 * SIZE(BO), %ymm1 vmovups -16 * SIZE(AO), %ymm0 vmovups -8 * SIZE(BO), %ymm2 vmulpd %ymm0 ,%ymm1 , %ymm4 vmulpd %ymm0 ,%ymm2 , %ymm8 vpermpd $ 0xb1, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm6 vmulpd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, BO vpermpd $ 0xb1, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 vmovups -8 * SIZE(BO), %ymm2 .endm .macro KERNEL4x8_M1 prefetcht0 A_PR1(AO) vmovups -16 * SIZE(AO), %ymm0 prefetcht0 B_PR1(BO) vfmadd231pd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+64(BO) vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vmovups -8 * SIZE(BO), %ymm2 .endm .macro KERNEL4x8_M2 vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -4 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vmovups 0 * SIZE(BO), %ymm2 addq $ 16*SIZE, BO .endm .macro KERNEL4x8_E vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 addq $ 8*SIZE, BO .endm .macro KERNEL4x8_SUB vmovups -12 * SIZE(BO), %ymm1 vmovups -16 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 8*SIZE, BO vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 .endm .macro SAVE4x8 vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 vmulpd %ymm0 , %ymm7 , %ymm7 vmulpd %ymm0 , %ymm8 , %ymm8 vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 vpermpd $ 0xb1 , %ymm5, %ymm5 vpermpd $ 0xb1 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 vpermpd $ 0x1b , %ymm2, %ymm2 vpermpd $ 0x1b , %ymm3, %ymm3 vpermpd $ 0xb1 , %ymm2, %ymm2 vpermpd $ 0xb1 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 leaq (CO1, LDC, 2), %rax #if !defined(TRMMKERNEL) vaddpd (CO1), %ymm4, %ymm4 vaddpd (CO1, LDC), %ymm5, %ymm5 vaddpd (%rax), %ymm6, %ymm6 vaddpd (%rax, LDC), %ymm7, %ymm7 #endif vmovups %ymm4 , (CO1) vmovups %ymm5 , (CO1, LDC) vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) 
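/* Note: the prefetcht0 instructions in the SAVE macros touch the bytes just
   past the values stored, presumably to warm the C locations that the next
   tile along M will write. */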
prefetcht0 32(CO1) prefetcht0 32(CO1,LDC) prefetcht0 32(%rax) prefetcht0 32(%rax,LDC) vpermpd $ 0xb1 , %ymm9 , %ymm9 vpermpd $ 0xb1 , %ymm11, %ymm11 vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 vpermpd $ 0x1b , %ymm2, %ymm2 vpermpd $ 0x1b , %ymm3, %ymm3 vpermpd $ 0xb1 , %ymm2, %ymm2 vpermpd $ 0xb1 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 leaq (%rax, LDC, 2), %rax leaq (%rax, LDC, 2), %rbp #if !defined(TRMMKERNEL) vaddpd (%rax), %ymm4, %ymm4 vaddpd (%rax, LDC), %ymm5, %ymm5 vaddpd (%rbp), %ymm6, %ymm6 vaddpd (%rbp, LDC), %ymm7, %ymm7 #endif vmovups %ymm4 , (%rax) vmovups %ymm5 , (%rax, LDC) vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) prefetcht0 32(%rax) prefetcht0 32(%rax,LDC) prefetcht0 32(%rbp) prefetcht0 32(%rbp,LDC) addq $ 4*SIZE, CO1 .endm /******************************************************************************************/ .macro INIT2x8 vxorpd %xmm4 , %xmm4 , %xmm4 vxorpd %xmm5 , %xmm5 , %xmm5 vxorpd %xmm6 , %xmm6 , %xmm6 vxorpd %xmm7 , %xmm7 , %xmm7 vxorpd %xmm8 , %xmm8 , %xmm8 vxorpd %xmm9 , %xmm9 , %xmm9 vxorpd %xmm10, %xmm10, %xmm10 vxorpd %xmm11, %xmm11, %xmm11 .endm .macro KERNEL2x8_SUB vmovups -16 * SIZE(AO), %xmm0 vmovddup -12 * SIZE(BO), %xmm1 vmovddup -11 * SIZE(BO), %xmm2 vmovddup -10 * SIZE(BO), %xmm3 vfmadd231pd %xmm0 ,%xmm1 , %xmm4 vmovddup -9 * SIZE(BO), %xmm1 vfmadd231pd %xmm0 ,%xmm2 , %xmm5 vmovddup -8 * SIZE(BO), %xmm2 vfmadd231pd %xmm0 ,%xmm3 , %xmm6 vmovddup -7 * SIZE(BO), %xmm3 vfmadd231pd %xmm0 ,%xmm1 , %xmm7 vmovddup -6 * SIZE(BO), %xmm1 vfmadd231pd %xmm0 ,%xmm2 , %xmm8 vmovddup -5 * SIZE(BO), %xmm2 vfmadd231pd %xmm0 ,%xmm3 , %xmm9 vfmadd231pd %xmm0 ,%xmm1 , %xmm10 vfmadd231pd %xmm0 ,%xmm2 , %xmm11 addq $ 8*SIZE, BO addq $ 2*SIZE, AO .endm .macro SAVE2x8 vmovddup ALPHA, %xmm0 vmulpd %xmm0 , %xmm4 , %xmm4 vmulpd %xmm0 , %xmm5 , %xmm5 vmulpd %xmm0 , %xmm6 , %xmm6 vmulpd %xmm0 , %xmm7 , %xmm7 vmulpd %xmm0 , %xmm8 , %xmm8 vmulpd %xmm0 , %xmm9 , %xmm9 vmulpd %xmm0 , %xmm10, %xmm10 vmulpd %xmm0 , %xmm11, %xmm11 leaq (CO1, LDC, 2), %rax #if !defined(TRMMKERNEL) vaddpd (CO1), %xmm4, %xmm4 vaddpd (CO1, LDC), %xmm5, %xmm5 vaddpd (%rax), %xmm6, %xmm6 vaddpd (%rax, LDC), %xmm7, %xmm7 #endif vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm6 , (%rax) vmovups %xmm7 , (%rax, LDC) leaq (%rax, LDC, 2), %rax leaq (%rax, LDC, 2), %rbp #if !defined(TRMMKERNEL) vaddpd (%rax), %xmm8 , %xmm4 vaddpd (%rax, LDC), %xmm9 , %xmm5 vaddpd (%rbp), %xmm10, %xmm6 vaddpd (%rbp, LDC), %xmm11, %xmm7 #endif vmovups %xmm4 , (%rax) vmovups %xmm5 , (%rax, LDC) vmovups %xmm6 , (%rbp) vmovups %xmm7 , (%rbp, LDC) addq $ 2*SIZE, CO1 .endm /******************************************************************************************/ .macro INIT1x8 vxorpd %xmm4 , %xmm4 , %xmm4 vxorpd %xmm5 , %xmm5 , %xmm5 vxorpd %xmm6 , %xmm6 , %xmm6 vxorpd %xmm7 , %xmm7 , %xmm7 vxorpd %xmm8 , %xmm8 , %xmm8 vxorpd %xmm9 , %xmm9 , %xmm9 vxorpd %xmm10, %xmm10, %xmm10 vxorpd %xmm11, %xmm11, %xmm11 .endm .macro KERNEL1x8_SUB vmovsd -16 * SIZE(AO), %xmm0 vmovsd -12 * SIZE(BO), %xmm1 vmovsd -11 * SIZE(BO), %xmm2 vmovsd -10 * SIZE(BO), %xmm3 vfmadd231sd %xmm0 ,%xmm1 , %xmm4 vmovsd -9 * SIZE(BO), %xmm1 vfmadd231sd %xmm0 ,%xmm2 , %xmm5 vmovsd -8 * SIZE(BO), %xmm2 vfmadd231sd %xmm0 ,%xmm3 , %xmm6 vmovsd -7 * SIZE(BO), %xmm3 vfmadd231sd %xmm0 ,%xmm1 , %xmm7 vmovsd -6 * 
SIZE(BO), %xmm1 vfmadd231sd %xmm0 ,%xmm2 , %xmm8 vmovsd -5 * SIZE(BO), %xmm2 vfmadd231sd %xmm0 ,%xmm3 , %xmm9 vfmadd231sd %xmm0 ,%xmm1 , %xmm10 vfmadd231sd %xmm0 ,%xmm2 , %xmm11 addq $ 8*SIZE, BO addq $ 1*SIZE, AO .endm .macro SAVE1x8 vmovsd ALPHA, %xmm0 vmulsd %xmm0 , %xmm4 , %xmm4 vmulsd %xmm0 , %xmm5 , %xmm5 vmulsd %xmm0 , %xmm6 , %xmm6 vmulsd %xmm0 , %xmm7 , %xmm7 vmulsd %xmm0 , %xmm8 , %xmm8 vmulsd %xmm0 , %xmm9 , %xmm9 vmulsd %xmm0 , %xmm10, %xmm10 vmulsd %xmm0 , %xmm11, %xmm11 leaq (CO1, LDC, 2), %rax #if !defined(TRMMKERNEL) vaddsd (CO1), %xmm4, %xmm4 vaddsd (CO1, LDC), %xmm5, %xmm5 vaddsd (%rax), %xmm6, %xmm6 vaddsd (%rax, LDC), %xmm7, %xmm7 #endif vmovsd %xmm4 , (CO1) vmovsd %xmm5 , (CO1, LDC) vmovsd %xmm6 , (%rax) vmovsd %xmm7 , (%rax, LDC) leaq (%rax, LDC, 2), %rax leaq (%rax, LDC, 2), %rbp #if !defined(TRMMKERNEL) vaddsd (%rax), %xmm8 , %xmm4 vaddsd (%rax, LDC), %xmm9 , %xmm5 vaddsd (%rbp), %xmm10, %xmm6 vaddsd (%rbp, LDC), %xmm11, %xmm7 #endif vmovsd %xmm4 , (%rax) vmovsd %xmm5 , (%rax, LDC) vmovsd %xmm6 , (%rbp) vmovsd %xmm7 , (%rbp, LDC) addq $ 1*SIZE, CO1 .endm /******************************************************************************************/ .macro INIT4x4 vxorpd %ymm4 , %ymm4 , %ymm4 vxorpd %ymm5 , %ymm5 , %ymm5 vxorpd %ymm6 , %ymm6 , %ymm6 vxorpd %ymm7 , %ymm7 , %ymm7 .endm .macro KERNEL4x4_I prefetcht0 A_PR1(AO) vmovups -12 * SIZE(BO), %ymm1 vmovups -16 * SIZE(AO), %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm4 vpermpd $ 0xb1, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, BO vpermpd $ 0xb1, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 .endm .macro KERNEL4x4_M1 prefetcht0 A_PR1(AO) vmovups -16 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 .endm .macro KERNEL4x4_M2 vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -8 * SIZE(BO), %ymm1 addq $ 8*SIZE, BO .endm .macro KERNEL4x4_E vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 addq $ 4*SIZE, BO .endm .macro KERNEL4x4_SUB vmovups -12 * SIZE(BO), %ymm1 vmovups -16 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 addq $ 4*SIZE, BO vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, AO vpermpd $ 0xb1, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 .endm .macro SAVE4x4 vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm7 , %ymm7 vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 vpermpd $ 0xb1 , %ymm5, %ymm5 vpermpd $ 0xb1 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 vpermpd $ 0x1b , %ymm2, %ymm2 vpermpd $ 0x1b , %ymm3, %ymm3 vpermpd $ 0xb1 , %ymm2, %ymm2 vpermpd $ 0xb1 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, 
%ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 leaq (CO1, LDC, 2), %rax #if !defined(TRMMKERNEL) vaddpd (CO1), %ymm4, %ymm4 vaddpd (CO1, LDC), %ymm5, %ymm5 vaddpd (%rax), %ymm6, %ymm6 vaddpd (%rax, LDC), %ymm7, %ymm7 #endif vmovups %ymm4 , (CO1) vmovups %ymm5 , (CO1, LDC) vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) addq $ 4*SIZE, CO1 .endm /******************************************************************************************/ /******************************************************************************************/ .macro INIT2x4 vxorpd %xmm4 , %xmm4 , %xmm4 vxorpd %xmm5 , %xmm5 , %xmm5 vxorpd %xmm6 , %xmm6 , %xmm6 vxorpd %xmm7 , %xmm7 , %xmm7 .endm .macro KERNEL2x4_SUB vmovddup -12 * SIZE(BO), %xmm1 vmovups -16 * SIZE(AO), %xmm0 vmovddup -11 * SIZE(BO), %xmm2 vfmadd231pd %xmm0 ,%xmm1 , %xmm4 vmovddup -10 * SIZE(BO), %xmm3 vfmadd231pd %xmm0 ,%xmm2 , %xmm5 vmovddup -9 * SIZE(BO), %xmm8 vfmadd231pd %xmm0 ,%xmm3 , %xmm6 addq $ 4*SIZE, BO vfmadd231pd %xmm0 ,%xmm8 , %xmm7 addq $ 2*SIZE, AO .endm .macro SAVE2x4 vmovddup ALPHA, %xmm0 vmulpd %xmm0 , %xmm4 , %xmm4 vmulpd %xmm0 , %xmm5 , %xmm5 vmulpd %xmm0 , %xmm6 , %xmm6 vmulpd %xmm0 , %xmm7 , %xmm7 leaq (CO1, LDC, 2), %rax #if !defined(TRMMKERNEL) vaddpd (CO1), %xmm4, %xmm4 vaddpd (CO1, LDC), %xmm5, %xmm5 vaddpd (%rax), %xmm6, %xmm6 vaddpd (%rax, LDC), %xmm7, %xmm7 #endif vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm6 , (%rax) vmovups %xmm7 , (%rax, LDC) addq $ 2*SIZE, CO1 .endm /******************************************************************************************/ /******************************************************************************************/ .macro INIT1x4 vxorpd %xmm4 , %xmm4 , %xmm4 vxorpd %xmm5 , %xmm5 , %xmm5 vxorpd %xmm6 , %xmm6 , %xmm6 vxorpd %xmm7 , %xmm7 , %xmm7 .endm .macro KERNEL1x4_SUB vmovsd -12 * SIZE(BO), %xmm1 vmovsd -16 * SIZE(AO), %xmm0 vmovsd -11 * SIZE(BO), %xmm2 vfmadd231sd %xmm0 ,%xmm1 , %xmm4 vmovsd -10 * SIZE(BO), %xmm3 vfmadd231sd %xmm0 ,%xmm2 , %xmm5 vmovsd -9 * SIZE(BO), %xmm8 vfmadd231sd %xmm0 ,%xmm3 , %xmm6 addq $ 4*SIZE, BO vfmadd231sd %xmm0 ,%xmm8 , %xmm7 addq $ 1*SIZE, AO .endm .macro SAVE1x4 vmovsd ALPHA, %xmm0 vmulsd %xmm0 , %xmm4 , %xmm4 vmulsd %xmm0 , %xmm5 , %xmm5 vmulsd %xmm0 , %xmm6 , %xmm6 vmulsd %xmm0 , %xmm7 , %xmm7 leaq (CO1, LDC, 2), %rax #if !defined(TRMMKERNEL) vaddsd (CO1), %xmm4, %xmm4 vaddsd (CO1, LDC), %xmm5, %xmm5 vaddsd (%rax), %xmm6, %xmm6 vaddsd (%rax, LDC), %xmm7, %xmm7 #endif vmovsd %xmm4 , (CO1) vmovsd %xmm5 , (CO1, LDC) vmovsd %xmm6 , (%rax) vmovsd %xmm7 , (%rax, LDC) addq $ 1*SIZE, CO1 .endm /******************************************************************************************/ /******************************************************************************************/ .macro INIT4x2 vxorpd %xmm4 , %xmm4 , %xmm4 vxorpd %xmm5 , %xmm5 , %xmm5 vxorpd %xmm6 , %xmm6 , %xmm6 vxorpd %xmm7 , %xmm7 , %xmm7 .endm .macro KERNEL4x2_SUB vmovddup -12 * SIZE(BO), %xmm2 vmovups -16 * SIZE(AO), %xmm0 vmovups -14 * SIZE(AO), %xmm1 vmovddup -11 * SIZE(BO), %xmm3 vfmadd231pd %xmm0 ,%xmm2 , %xmm4 vfmadd231pd %xmm1 ,%xmm2 , %xmm5 vfmadd231pd %xmm0 ,%xmm3 , %xmm6 vfmadd231pd %xmm1 ,%xmm3 , %xmm7 addq $ 2*SIZE, BO addq $ 4*SIZE, AO .endm .macro SAVE4x2 vmovddup ALPHA, %xmm0 vmulpd %xmm0 , %xmm4 , %xmm4 vmulpd %xmm0 , %xmm5 , %xmm5 vmulpd %xmm0 , %xmm6 , %xmm6 vmulpd %xmm0 , %xmm7 , %xmm7 #if !defined(TRMMKERNEL) vaddpd (CO1) , %xmm4, %xmm4 vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 vaddpd (CO1, LDC), %xmm6, %xmm6 
vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 #endif vmovups %xmm4 , (CO1) vmovups %xmm5 , 2 * SIZE(CO1) vmovups %xmm6 , (CO1, LDC) vmovups %xmm7 , 2 * SIZE(CO1, LDC) addq $ 4*SIZE, CO1 .endm /******************************************************************************************/ /******************************************************************************************/ .macro INIT2x2 vxorpd %xmm4 , %xmm4 , %xmm4 vxorpd %xmm6 , %xmm6 , %xmm6 .endm .macro KERNEL2x2_SUB vmovddup -12 * SIZE(BO), %xmm2 vmovups -16 * SIZE(AO), %xmm0 vmovddup -11 * SIZE(BO), %xmm3 vfmadd231pd %xmm0 ,%xmm2 , %xmm4 vfmadd231pd %xmm0 ,%xmm3 , %xmm6 addq $ 2*SIZE, BO addq $ 2*SIZE, AO .endm .macro SAVE2x2 vmovddup ALPHA, %xmm0 vmulpd %xmm0 , %xmm4 , %xmm4 vmulpd %xmm0 , %xmm6 , %xmm6 #if !defined(TRMMKERNEL) vaddpd (CO1) , %xmm4, %xmm4 vaddpd (CO1, LDC), %xmm6, %xmm6 #endif vmovups %xmm4 , (CO1) vmovups %xmm6 , (CO1, LDC) addq $ 2*SIZE, CO1 .endm /******************************************************************************************/ /******************************************************************************************/ .macro INIT1x2 vxorpd %xmm4 , %xmm4 , %xmm4 vxorpd %xmm5 , %xmm5 , %xmm5 .endm .macro KERNEL1x2_SUB vmovsd -12 * SIZE(BO), %xmm1 vmovsd -16 * SIZE(AO), %xmm0 vmovsd -11 * SIZE(BO), %xmm2 vfmadd231sd %xmm0 ,%xmm1 , %xmm4 vfmadd231sd %xmm0 ,%xmm2 , %xmm5 addq $ 2*SIZE, BO addq $ 1*SIZE, AO .endm .macro SAVE1x2 vmovsd ALPHA, %xmm0 vmulsd %xmm0 , %xmm4 , %xmm4 vmulsd %xmm0 , %xmm5 , %xmm5 #if !defined(TRMMKERNEL) vaddsd (CO1), %xmm4, %xmm4 vaddsd (CO1, LDC), %xmm5, %xmm5 #endif vmovsd %xmm4 , (CO1) vmovsd %xmm5 , (CO1, LDC) addq $ 1*SIZE, CO1 .endm /******************************************************************************************/ /******************************************************************************************/ .macro INIT4x1 vxorpd %ymm4 , %ymm4 , %ymm4 vxorpd %ymm5 , %ymm5 , %ymm5 vxorpd %ymm6 , %ymm6 , %ymm6 vxorpd %ymm7 , %ymm7 , %ymm7 .endm .macro KERNEL4x1 vbroadcastsd -12 * SIZE(BO), %ymm0 vbroadcastsd -11 * SIZE(BO), %ymm1 vbroadcastsd -10 * SIZE(BO), %ymm2 vbroadcastsd -9 * SIZE(BO), %ymm3 vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 vbroadcastsd -8 * SIZE(BO), %ymm0 vbroadcastsd -7 * SIZE(BO), %ymm1 vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6 vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 vbroadcastsd -6 * SIZE(BO), %ymm2 vbroadcastsd -5 * SIZE(BO), %ymm3 vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 addq $ 8 *SIZE, BO addq $ 32*SIZE, AO .endm .macro KERNEL4x1_SUB vbroadcastsd -12 * SIZE(BO), %ymm2 vmovups -16 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm2 , %ymm4 addq $ 1*SIZE, BO addq $ 4*SIZE, AO .endm .macro SAVE4x1 vbroadcastsd ALPHA, %ymm0 vaddpd %ymm4,%ymm5, %ymm4 vaddpd %ymm6,%ymm7, %ymm6 vaddpd %ymm4,%ymm6, %ymm4 vmulpd %ymm0 , %ymm4 , %ymm4 #if !defined(TRMMKERNEL) vaddpd (CO1) , %ymm4, %ymm4 #endif vmovups %ymm4 , (CO1) addq $ 4*SIZE, CO1 .endm /******************************************************************************************/ /******************************************************************************************/ .macro INIT2x1 vxorpd %xmm4 , %xmm4 , %xmm4 .endm .macro KERNEL2x1_SUB vmovddup -12 * SIZE(BO), %xmm2 vmovups -16 * SIZE(AO), %xmm0 vfmadd231pd %xmm0 ,%xmm2 , %xmm4 addq $ 1*SIZE, BO addq $ 2*SIZE, AO .endm .macro SAVE2x1 vmovddup ALPHA, %xmm0 vmulpd %xmm0 , %xmm4 , %xmm4 #if 
!defined(TRMMKERNEL) vaddpd (CO1) , %xmm4, %xmm4 #endif vmovups %xmm4 , (CO1) addq $ 2*SIZE, CO1 .endm /******************************************************************************************/ /******************************************************************************************/ .macro INIT1x1 vxorpd %xmm4 , %xmm4 , %xmm4 .endm .macro KERNEL1x1_SUB vmovsd -12 * SIZE(BO), %xmm1 vmovsd -16 * SIZE(AO), %xmm0 vfmadd231sd %xmm0 ,%xmm1 , %xmm4 addq $ 1*SIZE, BO addq $ 1*SIZE, AO .endm .macro SAVE1x1 vmovsd ALPHA, %xmm0 vmulsd %xmm0 , %xmm4 , %xmm4 #if !defined(TRMMKERNEL) vaddsd (CO1), %xmm4, %xmm4 #endif vmovsd %xmm4 , (CO1) addq $ 1*SIZE, CO1 .endm /*******************************************************************************************/ #if !defined(TRMMKERNEL) PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) vmovups %xmm6, 64(%rsp) vmovups %xmm7, 80(%rsp) vmovups %xmm8, 96(%rsp) vmovups %xmm9, 112(%rsp) vmovups %xmm10, 128(%rsp) vmovups %xmm11, 144(%rsp) vmovups %xmm12, 160(%rsp) vmovups %xmm13, 176(%rsp) vmovups %xmm14, 192(%rsp) vmovups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC vmovups %xmm3, %xmm0 #else movq STACKSIZE + 8(%rsp), LDC #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $ 0, OLD_M je .L999 cmpq $ 0, OLD_N je .L999 cmpq $ 0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovsd %xmm0, ALPHA salq $BASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $24, %rdi divq %rdi // N / 24 movq %rax, Ndiv12 // N / 24 movq %rdx, Nmod12 // N % 24 movq Ndiv12, J cmpq $ 0, J je .L8_0 ALIGN_4 .L12_01: // copy to sub buffer movq K, %rax salq $3,%rax // K * 8 ; read 8 values from BO1 movq B, BO1 leaq (B,%rax, SIZE), BO2 // next offset to BO2 movq BO2 , B leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L12_02b: vmovups 0 * SIZE(BO1), %ymm1 vmovups 4 * SIZE(BO1), %ymm2 vmovups 0 * SIZE(BO2), %ymm3 vmovups %ymm1, 0 * SIZE(BO) vmovups %ymm2, 4 * SIZE(BO) vmovups %ymm3, 8 * SIZE(BO) addq $ 8*SIZE,BO1 addq $ 8*SIZE,BO2 addq $ 12*SIZE,BO decq %rax jnz .L12_02b .L12_03c: .L12_10: movq C, CO1 leaq (C, LDC, 8), C leaq (C, LDC, 4), C // c += 12 * ldc movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $2, I // i = m / 4 je .L12_20 ALIGN_4 .L12_11: leaq BUFFER1, BO // first buffer to BO addq $12 * SIZE, BO movq K, %rax sarq $3, %rax // K / 8 cmpq $2, %rax jl .L12_13 KERNEL4x12_I KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 subq $2, %rax je .L12_12a ALIGN_5 .L12_12: KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 dec %rax jne .L12_12 .L12_12a: KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_E jmp .L12_16 .L12_13: test $1, %rax jz .L12_14 KERNEL4x12_I KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_E jmp .L12_16 .L12_14: INIT4x12 .L12_16: movq K, %rax andq $7, %rax # if (k & 1) je .L12_19 ALIGN_4 .L12_17: KERNEL4x12_SUB dec %rax jne .L12_17 ALIGN_4 .L12_19: SAVE4x12 decq I # i -- jne .L12_11 ALIGN_4 /************************************************************************** * Rest of M 
***************************************************************************/ .L12_20: // Test rest of M testq $3, M jz .L12_100 // to next 16 lines of N .L12_30: testq $2, M jz .L12_40 ALIGN_4 .L12_31: leaq BUFFER1, BO // first buffer to BO addq $12 * SIZE, BO INIT2x12 movq K, %rax sarq $3, %rax je .L12_36 ALIGN_4 .L12_32: KERNEL2x12_SUB KERNEL2x12_SUB KERNEL2x12_SUB KERNEL2x12_SUB KERNEL2x12_SUB KERNEL2x12_SUB KERNEL2x12_SUB KERNEL2x12_SUB dec %rax jne .L12_32 ALIGN_4 .L12_36: movq K, %rax andq $7, %rax # if (k & 1) je .L12_39 ALIGN_4 .L12_37: KERNEL2x12_SUB dec %rax jne .L12_37 ALIGN_4 .L12_39: SAVE2x12 ALIGN_4 .L12_40: testq $1, M jz .L12_100 // to next 3 lines of N ALIGN_4 .L12_41: leaq BUFFER1, BO // first buffer to BO addq $12 * SIZE, BO INIT1x12 movq K, %rax sarq $3,%rax je .L12_46 ALIGN_4 .L12_42: KERNEL1x12_SUB KERNEL1x12_SUB KERNEL1x12_SUB KERNEL1x12_SUB KERNEL1x12_SUB KERNEL1x12_SUB KERNEL1x12_SUB KERNEL1x12_SUB dec %rax jne .L12_42 ALIGN_4 .L12_46: movq K, %rax andq $7, %rax # if (k & 1) je .L12_49 ALIGN_4 .L12_47: KERNEL1x12_SUB dec %rax jne .L12_47 ALIGN_4 .L12_49: SAVE1x12 ALIGN_4 .L12_100: /**************************************************************************************************/ .L13_01: // copy to sub buffer movq K, %rax salq $3,%rax // K * 8 ; read 8 values movq B, BO2 leaq (B,%rax, SIZE), BO3 // next offset to BO2 leaq (BO3,%rax, SIZE), B // next offset to B leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L13_02b: vmovups 4 * SIZE(BO2), %ymm1 vmovups 0 * SIZE(BO3), %ymm2 vmovups 4 * SIZE(BO3), %ymm3 vmovups %ymm1, 0 * SIZE(BO) vmovups %ymm2, 4 * SIZE(BO) vmovups %ymm3, 8 * SIZE(BO) addq $ 8*SIZE,BO2 addq $ 8*SIZE,BO3 addq $ 12*SIZE,BO decq %rax jnz .L13_02b .L13_10: movq C, CO1 leaq (C, LDC, 8), C leaq (C, LDC, 4), C // c += 12 * ldc movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $2, I // i = m / 4 je .L13_20 ALIGN_4 .L13_11: leaq BUFFER1, BO // first buffer to BO addq $12 * SIZE, BO movq K, %rax sarq $3, %rax // K / 8 cmpq $2, %rax jl .L13_13 KERNEL4x12_I KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 subq $2, %rax je .L13_12a ALIGN_5 .L13_12: KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 dec %rax jne .L13_12 .L13_12a: KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_E jmp .L13_16 .L13_13: test $1, %rax jz .L13_14 KERNEL4x12_I KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_M2 KERNEL4x12_M1 KERNEL4x12_E jmp .L13_16 .L13_14: INIT4x12 .L13_16: movq K, %rax andq $7, %rax # if (k & 1) je .L13_19 ALIGN_4 .L13_17: KERNEL4x12_SUB dec %rax jne .L13_17 ALIGN_4 .L13_19: SAVE4x12 decq I # i -- jne .L13_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L13_20: // Test rest of M testq $3, M jz .L13_100 // to next 16 lines of N .L13_30: testq $2, M jz .L13_40 ALIGN_4 .L13_31: leaq BUFFER1, BO // first buffer to BO addq $12 * SIZE, BO INIT2x12 movq K, %rax sarq $3, %rax je .L13_36 ALIGN_4 .L13_32: KERNEL2x12_SUB KERNEL2x12_SUB KERNEL2x12_SUB KERNEL2x12_SUB KERNEL2x12_SUB KERNEL2x12_SUB KERNEL2x12_SUB KERNEL2x12_SUB dec %rax jne .L13_32 ALIGN_4 .L13_36: movq K, %rax andq $7, %rax # if (k & 1) je .L13_39 ALIGN_4 .L13_37: KERNEL2x12_SUB dec %rax jne .L13_37 ALIGN_4 .L13_39: SAVE2x12 ALIGN_4 .L13_40: testq $1, M jz .L13_100 // to next 
3 lines of N ALIGN_4 .L13_41: leaq BUFFER1, BO // first buffer to BO addq $12 * SIZE, BO INIT1x12 movq K, %rax sarq $3,%rax je .L13_46 ALIGN_4 .L13_42: KERNEL1x12_SUB KERNEL1x12_SUB KERNEL1x12_SUB KERNEL1x12_SUB KERNEL1x12_SUB KERNEL1x12_SUB KERNEL1x12_SUB KERNEL1x12_SUB dec %rax jne .L13_42 ALIGN_4 .L13_46: movq K, %rax andq $7, %rax # if (k & 1) je .L13_49 ALIGN_4 .L13_47: KERNEL1x12_SUB dec %rax jne .L13_47 ALIGN_4 .L13_49: SAVE1x12 ALIGN_4 .L13_100: decq J // j -- jg .L12_01 /**************************************************************************************************/ .L8_0: cmpq $ 0, Nmod12 // N % 12 == 0 je .L999 movq Nmod12, J sarq $3, J // j = j / 8 je .L4_0 .L8_10: movq C, CO1 leaq (C, LDC, 8), C // c += 4 * ldc movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $2, I // i = m / 4 je .L8_20 ALIGN_4 .L8_11: movq B, BO addq $12 * SIZE, BO movq K, %rax sarq $3, %rax // K / 8 cmpq $2, %rax jl .L8_13 KERNEL4x8_I KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_M2 subq $2, %rax je .L8_12a ALIGN_5 .L8_12: KERNEL4x8_M1 KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_M2 dec %rax jne .L8_12 .L8_12a: KERNEL4x8_M1 KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_E jmp .L8_16 .L8_13: test $1, %rax jz .L8_14 KERNEL4x8_I KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_E jmp .L8_16 .L8_14: INIT4x8 .L8_16: movq K, %rax andq $7, %rax # if (k & 1) je .L8_19 ALIGN_4 .L8_17: KERNEL4x8_SUB dec %rax jne .L8_17 ALIGN_4 .L8_19: SAVE4x8 decq I # i -- jg .L8_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L8_20: // Test rest of M testq $3, M jz .L8_100 // to next 16 lines of N .L8_30: testq $2, M jz .L8_40 ALIGN_4 .L8_31: movq B, BO // first buffer to BO addq $12 * SIZE, BO INIT2x8 movq K, %rax sarq $3, %rax je .L8_36 ALIGN_4 .L8_32: KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB dec %rax jne .L8_32 ALIGN_4 .L8_36: movq K, %rax andq $7, %rax # if (k & 1) je .L8_39 ALIGN_4 .L8_37: KERNEL2x8_SUB dec %rax jne .L8_37 .L8_39: SAVE2x8 .L8_40: testq $1, M jz .L8_100 // to next 3 lines of N ALIGN_4 .L8_41: movq B, BO // first buffer to BO addq $12 * SIZE, BO INIT1x8 movq K, %rax sarq $3,%rax je .L8_46 ALIGN_4 .L8_42: KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB dec %rax jne .L8_42 ALIGN_4 .L8_46: movq K, %rax andq $7, %rax # if (k & 1) je .L8_49 ALIGN_4 .L8_47: KERNEL1x8_SUB dec %rax jne .L8_47 ALIGN_4 .L8_49: SAVE1x8 ALIGN_4 .L8_100: movq K, %rax salq $3, %rax // * 8 leaq (B , %rax, SIZE), B decq J // j -- jg .L8_10 /**************************************************************************************************/ .L4_0: cmpq $ 0, Nmod12 // N % 12 == 0 je .L999 movq Nmod12, J testq $4, J // j = j / 4 je .L2_0 .L4_10: movq C, CO1 leaq (C, LDC, 4), C // c += 4 * ldc movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $2, I // i = m / 4 je .L4_20 ALIGN_4 .L4_11: movq B, BO addq $12 * SIZE, BO movq K, %rax sarq $3, %rax // K / 8 cmpq $2, %rax jl .L4_13 KERNEL4x4_I KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 subq $2, %rax je .L4_12a ALIGN_5 .L4_12: KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 
KERNEL4x4_M1 KERNEL4x4_M2 dec %rax jne .L4_12 .L4_12a: KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E jmp .L4_16 .L4_13: test $1, %rax jz .L4_14 KERNEL4x4_I KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E jmp .L4_16 .L4_14: INIT4x4 .L4_16: movq K, %rax andq $7, %rax # if (k & 1) je .L4_19 ALIGN_4 .L4_17: KERNEL4x4_SUB dec %rax jne .L4_17 ALIGN_4 .L4_19: SAVE4x4 decq I # i -- jg .L4_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L4_20: // Test rest of M testq $3, M jz .L4_100 // to next 16 lines of N .L4_30: testq $2, M jz .L4_40 ALIGN_4 .L4_31: movq B, BO // first buffer to BO addq $12 * SIZE, BO INIT2x4 movq K, %rax sarq $3, %rax je .L4_36 ALIGN_4 .L4_32: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB dec %rax jne .L4_32 ALIGN_4 .L4_36: movq K, %rax andq $7, %rax # if (k & 1) je .L4_39 ALIGN_4 .L4_37: KERNEL2x4_SUB dec %rax jne .L4_37 .L4_39: SAVE2x4 .L4_40: testq $1, M jz .L4_100 // to next 3 lines of N ALIGN_4 .L4_41: movq B, BO // first buffer to BO addq $12 * SIZE, BO INIT1x4 movq K, %rax sarq $3,%rax je .L4_46 ALIGN_4 .L4_42: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB dec %rax jne .L4_42 ALIGN_4 .L4_46: movq K, %rax andq $7, %rax # if (k & 1) je .L4_49 ALIGN_4 .L4_47: KERNEL1x4_SUB dec %rax jne .L4_47 ALIGN_4 .L4_49: SAVE1x4 ALIGN_4 .L4_100: movq K, %rax salq $2, %rax // * 4 leaq (B , %rax, SIZE), B /***************************************************************************************************************/ .L2_0: movq Nmod12, J testq $2, J je .L1_0 .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $2, I // i = m / 4 je .L2_20 ALIGN_4 .L2_11: movq B, BO addq $12 * SIZE, BO INIT4x2 movq K, %rax sarq $3, %rax // K / 8 je .L2_16 ALIGN_5 .L2_12: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB dec %rax jne .L2_12 .L2_16: movq K, %rax andq $7, %rax # if (k & 1) je .L2_19 ALIGN_4 .L2_17: KERNEL4x2_SUB dec %rax jne .L2_17 ALIGN_4 .L2_19: SAVE4x2 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_20: // Test rest of M testq $3, M jz .L2_100 // to next 16 lines of N .L2_30: testq $2, M jz .L2_40 ALIGN_4 .L2_31: movq B, BO // first buffer to BO addq $12 * SIZE, BO INIT2x2 movq K, %rax sarq $3, %rax je .L2_36 ALIGN_4 .L2_32: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB dec %rax jne .L2_32 .L2_36: movq K, %rax andq $7, %rax # if (k & 1) je .L2_39 ALIGN_4 .L2_37: KERNEL2x2_SUB dec %rax jne .L2_37 .L2_39: SAVE2x2 .L2_40: testq $1, M jz .L2_100 // to next 3 lines of N .L2_41: movq B, BO // first buffer to BO addq $12 * SIZE, BO INIT1x2 movq K, %rax sarq $3,%rax je .L2_46 ALIGN_4 .L2_42: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB dec %rax jne .L2_42 .L2_46: movq K, %rax andq $7, %rax # if (k & 1) je .L2_49 ALIGN_4 .L2_47: KERNEL1x2_SUB dec %rax jne .L2_47 .L2_49: SAVE1x2 .L2_100: movq K, %rax salq $1, %rax // * 2 leaq (B , %rax, 
SIZE), B /***************************************************************************************************************/ .L1_0: movq Nmod12, J testq $1, J je .L999 .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $2, I // i = m / 4 je .L1_20 ALIGN_4 .L1_11: movq B, BO addq $12 * SIZE, BO INIT4x1 movq K, %rax sarq $3, %rax // K / 8 je .L1_16 ALIGN_5 .L1_12: KERNEL4x1 dec %rax jne .L1_12 .L1_16: movq K, %rax andq $7, %rax # if (k & 1) je .L1_19 ALIGN_4 .L1_17: KERNEL4x1_SUB dec %rax jne .L1_17 ALIGN_4 .L1_19: SAVE4x1 decq I # i -- jg .L1_11 /************************************************************************** * Rest of M ***************************************************************************/ .L1_20: // Test rest of M testq $3, M jz .L1_100 .L1_30: testq $2, M jz .L1_40 ALIGN_4 .L1_31: movq B, BO // first buffer to BO addq $12 * SIZE, BO INIT2x1 movq K, %rax sarq $3, %rax je .L1_36 ALIGN_4 .L1_32: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB dec %rax jne .L1_32 .L1_36: movq K, %rax andq $7, %rax # if (k & 1) je .L1_39 ALIGN_4 .L1_37: KERNEL2x1_SUB dec %rax jne .L1_37 .L1_39: SAVE2x1 .L1_40: testq $1, M jz .L1_100 // to next 3 lines of N .L1_41: movq B, BO // first buffer to BO addq $12 * SIZE, BO INIT1x1 movq K, %rax sarq $3,%rax je .L1_46 ALIGN_4 .L1_42: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB dec %rax jne .L1_42 .L1_46: movq K, %rax andq $7, %rax # if (k & 1) je .L1_49 ALIGN_4 .L1_47: KERNEL1x1_SUB dec %rax jne .L1_47 .L1_49: SAVE1x1 .L1_100: .L999: vzeroupper movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi vmovups 64(%rsp), %xmm6 vmovups 80(%rsp), %xmm7 vmovups 96(%rsp), %xmm8 vmovups 112(%rsp), %xmm9 vmovups 128(%rsp), %xmm10 vmovups 144(%rsp), %xmm11 vmovups 160(%rsp), %xmm12 vmovups 176(%rsp), %xmm13 vmovups 192(%rsp), %xmm14 vmovups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE #else /************************************************************************************* * TRMM Kernel *************************************************************************************/ PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) vmovups %xmm6, 64(%rsp) vmovups %xmm7, 80(%rsp) vmovups %xmm8, 96(%rsp) vmovups %xmm9, 112(%rsp) vmovups %xmm10, 128(%rsp) vmovups %xmm11, 144(%rsp) vmovups %xmm12, 160(%rsp) vmovups %xmm13, 176(%rsp) vmovups %xmm14, 192(%rsp) vmovups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL vmovsd OLD_OFFSET, %xmm12 #endif vmovups %xmm3, %xmm0 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL vmovsd STACKSIZE + 16(%rsp), %xmm12 #endif #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $ 0, OLD_M je .L999 cmpq $ 0, OLD_N je .L999 cmpq $ 0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovsd %xmm0, ALPHA salq $BASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $8, %rdi divq %rdi // N / 8 movq %rax, Ndiv12 // N / 8 movq %rdx, Nmod12 // N % 8 #ifdef TRMMKERNEL 
vmovsd %xmm12, OFFSET vmovsd %xmm12, KK #ifndef LEFT negq KK #endif #endif /*************************************************************************************************/ .L8_0: movq Ndiv12, J cmpq $ 0, J je .L4_0 ALIGN_4 .L8_10: movq C, CO1 leaq (C, LDC, 8), C // c += 8 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $2, I // i = m / 4 je .L8_20 ALIGN_4 .L8_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO addq $12 * SIZE, BO #else movq B, BO addq $12 * SIZE, BO movq KK, %rax salq $3, %rax // rax * SIZE leaq (BO,%rax,8), BO // add number of values in B leaq (AO,%rax,4), AO // add number of values in A #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in AO #else addq $8, %rax // number of values in BO #endif movq %rax, KKK #endif sarq $3, %rax // K / 8 cmpq $2, %rax jl .L8_13 KERNEL4x8_I KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_M2 subq $2, %rax je .L8_12a ALIGN_5 .L8_12: KERNEL4x8_M1 KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_M2 dec %rax jne .L8_12 .L8_12a: KERNEL4x8_M1 KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_E jmp .L8_16 .L8_13: test $1, %rax jz .L8_14 KERNEL4x8_I KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_M2 KERNEL4x8_M1 KERNEL4x8_E jmp .L8_16 .L8_14: INIT4x8 .L8_16: movq KKK, %rax andq $7, %rax # if (k & 1) je .L8_19 ALIGN_4 .L8_17: KERNEL4x8_SUB dec %rax jne .L8_17 ALIGN_4 .L8_19: SAVE4x8 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $3, %rax // rax + SIZE leaq (BO, %rax, 8), BO // number of values in B leaq (AO, %rax, 4), AO // number of values in A #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK // number of values in A #endif decq I # i -- jg .L8_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L8_20: // Test rest of M testq $3, M jz .L8_100 // to next 16 lines of N .L8_30: testq $2, M jz .L8_40 ALIGN_4 .L8_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO addq $12 * SIZE, BO #else movq B, BO addq $12 * SIZE, BO movq KK, %rax salq $3, %rax // rax * SIZE leaq (BO,%rax,8), BO // add number of values in B leaq (AO,%rax,2), AO // add number of values in A #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $8, %rax // number of values in BO #endif movq %rax, KKK #endif INIT2x8 sarq $3, %rax je .L8_36 ALIGN_4 .L8_32: KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB KERNEL2x8_SUB dec %rax jne .L8_32 ALIGN_4 .L8_36: movq KKK, %rax andq $7, %rax # if (k & 1) je .L8_39 ALIGN_4 .L8_37: KERNEL2x8_SUB dec %rax jne .L8_37 .L8_39: SAVE2x8 #if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $3, %rax // rax + SIZE leaq (BO, %rax, 8), BO // number of values in B leaq (AO, %rax, 2), AO // number of values in A #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK // number of values in A #endif .L8_40: testq $1, M jz .L8_100 // to next 3 lines of N ALIGN_4 .L8_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO addq $12 * SIZE, BO #else movq B, BO addq $12 * SIZE, BO movq KK, %rax salq $3, %rax // rax * SIZE leaq (BO,%rax,8), BO // add number of values in B leaq (AO,%rax,1), AO // add number of values in A #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $8, %rax // number of values in BO #endif movq %rax, KKK #endif INIT1x8 sarq $3,%rax je .L8_46 ALIGN_4 .L8_42: KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB dec %rax jne .L8_42 ALIGN_4 .L8_46: movq KKK, %rax andq $7, %rax # if (k & 1) je .L8_49 ALIGN_4 .L8_47: KERNEL1x8_SUB dec %rax jne .L8_47 ALIGN_4 .L8_49: SAVE1x8 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $3, %rax // rax + SIZE leaq (BO, %rax, 8), BO // number of values in B leaq (AO, %rax, 1), AO // number of values in A #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK // number of values in A #endif .L8_100: #if defined(TRMMKERNEL) && !defined(LEFT) addq $8, KK // number of values in B #endif decq J // j -- jg .L8_10 /*************************************************************************************************/ .L4_0: movq Nmod12, J testq $4, J je .L2_0 ALIGN_4 .L4_10: movq C, CO1 leaq (C, LDC, 4), C // c += 4 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $2, I // i = m / 4 je .L4_20 ALIGN_4 .L4_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO addq $12 * SIZE, BO #else movq B, BO addq $12 * SIZE, BO movq KK, %rax salq $3, %rax // rax * SIZE leaq (BO,%rax,4), BO // add number of values in B leaq (AO,%rax,4), AO // add number of values in A #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in AO #else addq $4, %rax // number of values in BO #endif movq %rax, KKK #endif sarq $3, %rax // K / 8 cmpq $2, %rax jl .L4_13 KERNEL4x4_I KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 subq $2, %rax je .L4_12a ALIGN_5 .L4_12: KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 dec %rax jne .L4_12 .L4_12a: KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E jmp .L4_16 .L4_13: test $1, %rax jz .L4_14 KERNEL4x4_I KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_M2 KERNEL4x4_M1 
KERNEL4x4_E jmp .L4_16 .L4_14: INIT4x4 .L4_16: movq KKK, %rax andq $7, %rax # if (k & 1) je .L4_19 ALIGN_4 .L4_17: KERNEL4x4_SUB dec %rax jne .L4_17 ALIGN_4 .L4_19: SAVE4x4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $3, %rax // rax + SIZE leaq (BO, %rax, 4), BO // number of values in B leaq (AO, %rax, 4), AO // number of values in A #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK // number of values in A #endif decq I # i -- jg .L4_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L4_20: // Test rest of M testq $3, M jz .L4_100 // to next 16 lines of N .L4_30: testq $2, M jz .L4_40 ALIGN_4 .L4_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO addq $12 * SIZE, BO #else movq B, BO addq $12 * SIZE, BO movq KK, %rax salq $3, %rax // rax * SIZE leaq (BO,%rax,4), BO // add number of values in B leaq (AO,%rax,2), AO // add number of values in A #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $4, %rax // number of values in BO #endif movq %rax, KKK #endif INIT2x4 sarq $3, %rax je .L4_36 ALIGN_4 .L4_32: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB dec %rax jne .L4_32 ALIGN_4 .L4_36: movq KKK, %rax andq $7, %rax # if (k & 1) je .L4_39 ALIGN_4 .L4_37: KERNEL2x4_SUB dec %rax jne .L4_37 .L4_39: SAVE2x4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $3, %rax // rax + SIZE leaq (BO, %rax, 4), BO // number of values in B leaq (AO, %rax, 2), AO // number of values in A #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK // number of values in A #endif .L4_40: testq $1, M jz .L4_100 // to next 3 lines of N ALIGN_4 .L4_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO addq $12 * SIZE, BO #else movq B, BO addq $12 * SIZE, BO movq KK, %rax salq $3, %rax // rax * SIZE leaq (BO,%rax,4), BO // add number of values in B leaq (AO,%rax,1), AO // add number of values in A #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $4, %rax // number of values in BO #endif movq %rax, KKK #endif INIT1x4 sarq $3,%rax je .L4_46 ALIGN_4 .L4_42: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB dec %rax jne .L4_42 ALIGN_4 .L4_46: movq KKK, %rax andq $7, %rax # if (k & 1) je .L4_49 ALIGN_4 .L4_47: KERNEL1x4_SUB dec %rax jne .L4_47 ALIGN_4 .L4_49: SAVE1x4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $3, %rax // rax + SIZE leaq (BO, %rax, 4), BO // number of values in B leaq (AO, %rax, 1), AO // number of 
values in A #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK // number of values in A #endif .L4_100: #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK // number of values in B #endif movq K, %rax salq $2, %rax // * 4 leaq (B , %rax, SIZE), B /***************************************************************************************************************/ .L2_0: movq Nmod12, J testq $2, J je .L1_0 .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $2, I // i = m / 4 je .L2_20 ALIGN_4 .L2_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO addq $12 * SIZE, BO #else movq B, BO addq $12 * SIZE, BO movq KK, %rax salq $3, %rax // rax * SIZE leaq (BO,%rax,2), BO // add number of values in B leaq (AO,%rax,4), AO // add number of values in A #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif INIT4x2 sarq $3, %rax // K / 8 je .L2_16 ALIGN_5 .L2_12: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB dec %rax jne .L2_12 .L2_16: movq KKK, %rax andq $7, %rax # if (k & 1) je .L2_19 ALIGN_4 .L2_17: KERNEL4x2_SUB dec %rax jne .L2_17 ALIGN_4 .L2_19: SAVE4x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $3, %rax // rax + SIZE leaq (BO, %rax, 2), BO // number of values in B leaq (AO, %rax, 4), AO // number of values in A #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK // number of values in A #endif decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_20: // Test rest of M testq $3, M jz .L2_100 // to next 16 lines of N .L2_30: testq $2, M jz .L2_40 ALIGN_4 .L2_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO addq $12 * SIZE, BO #else movq B, BO addq $12 * SIZE, BO movq KK, %rax salq $3, %rax // rax * SIZE leaq (BO,%rax,2), BO // add number of values in B leaq (AO,%rax,2), AO // add number of values in A #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif INIT2x2 sarq $3, %rax je .L2_36 ALIGN_4 .L2_32: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB dec %rax jne .L2_32 .L2_36: movq KKK, %rax andq $7, %rax # if (k & 1) je .L2_39 ALIGN_4 .L2_37: KERNEL2x2_SUB dec %rax jne .L2_37 .L2_39: SAVE2x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $3, %rax // rax + SIZE leaq (BO, %rax, 2), BO // number of values in B leaq 
(AO, %rax, 2), AO // number of values in A #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK // number of values in A #endif .L2_40: testq $1, M jz .L2_100 // to next 3 lines of N .L2_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO addq $12 * SIZE, BO #else movq B, BO addq $12 * SIZE, BO movq KK, %rax salq $3, %rax // rax * SIZE leaq (BO,%rax,2), BO // add number of values in B leaq (AO,%rax,1), AO // add number of values in A #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif INIT1x2 sarq $3,%rax je .L2_46 ALIGN_4 .L2_42: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB dec %rax jne .L2_42 .L2_46: movq KKK, %rax andq $7, %rax # if (k & 1) je .L2_49 ALIGN_4 .L2_47: KERNEL1x2_SUB dec %rax jne .L2_47 .L2_49: SAVE1x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $3, %rax // rax * SIZE leaq (BO, %rax, 2), BO // number of values in B leaq (AO, %rax, 1), AO // number of values in A #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK // number of values in A #endif .L2_100: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK // number of values in B #endif movq K, %rax salq $1, %rax // * 2 leaq (B , %rax, SIZE), B /***************************************************************************************************************/ .L1_0: movq Nmod12, J testq $1, J je .L999 .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $2, I // i = m / 4 je .L1_20 ALIGN_4 .L1_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO addq $12 * SIZE, BO #else movq B, BO addq $12 * SIZE, BO movq KK, %rax salq $3, %rax // rax * SIZE leaq (BO,%rax,1), BO // add number of values in B leaq (AO,%rax,4), AO // add number of values in A #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif INIT4x1 sarq $3, %rax // K / 8 je .L1_16 ALIGN_5 .L1_12: KERNEL4x1 dec %rax jne .L1_12 .L1_16: movq KKK, %rax andq $7, %rax # if (k & 1) je .L1_19 ALIGN_4 .L1_17: KERNEL4x1_SUB dec %rax jne .L1_17 ALIGN_4 .L1_19: SAVE4x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $3, %rax // rax * SIZE leaq (BO, %rax, 1), BO // number of values in B leaq (AO, %rax, 4), AO // number of values in A #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK // number of values in A #endif decq I # i -- jg .L1_11 /************************************************************************** * Rest of M ***************************************************************************/ 
.L1_20: // Test rest of M testq $3, M jz .L1_100 .L1_30: testq $2, M jz .L1_40 ALIGN_4 .L1_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO addq $12 * SIZE, BO #else movq B, BO addq $12 * SIZE, BO movq KK, %rax salq $3, %rax // rax * SIZE leaq (BO,%rax,1), BO // add number of values in B leaq (AO,%rax,2), AO // add number of values in A #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif INIT2x1 sarq $3, %rax je .L1_36 ALIGN_4 .L1_32: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB dec %rax jne .L1_32 .L1_36: movq KKK, %rax andq $7, %rax # if (k & 1) je .L1_39 ALIGN_4 .L1_37: KERNEL2x1_SUB dec %rax jne .L1_37 .L1_39: SAVE2x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $3, %rax // rax * SIZE leaq (BO, %rax, 1), BO // number of values in B leaq (AO, %rax, 2), AO // number of values in A #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK // number of values in A #endif .L1_40: testq $1, M jz .L1_100 // to next 3 lines of N .L1_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO addq $12 * SIZE, BO #else movq B, BO addq $12 * SIZE, BO movq KK, %rax salq $3, %rax // rax * SIZE leaq (BO,%rax,1), BO // add number of values in B leaq (AO,%rax,1), AO // add number of values in A #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif INIT1x1 sarq $3,%rax je .L1_46 ALIGN_4 .L1_42: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB dec %rax jne .L1_42 .L1_46: movq KKK, %rax andq $7, %rax # if (k & 1) je .L1_49 ALIGN_4 .L1_47: KERNEL1x1_SUB dec %rax jne .L1_47 .L1_49: SAVE1x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $3, %rax // rax * SIZE leaq (BO, %rax, 1), BO // number of values in B leaq (AO, %rax, 1), AO // number of values in A #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK // number of values in A #endif .L1_100: #if defined(TRMMKERNEL) && !defined(LEFT) addq $1, KK // number of values in B #endif .L999: vzeroupper movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi vmovups 64(%rsp), %xmm6 vmovups 80(%rsp), %xmm7 vmovups 96(%rsp), %xmm8 vmovups 112(%rsp), %xmm9 vmovups 128(%rsp), %xmm10 vmovups 144(%rsp), %xmm11 vmovups 160(%rsp), %xmm12 vmovups 176(%rsp), %xmm13 vmovups 192(%rsp), %xmm14 vmovups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE #endif 
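The GEMM and TRMM kernel bodies that end here share one loop decomposition: B/C columns are consumed in panels of 12 (the .L12_xx/.L13_xx paths), with the N % 12 remainder routed through dedicated 8-, 4-, 2- and 1-column paths; A/C rows are processed four at a time with 2- and 1-row tails; and the inner K loop is unrolled eight times (sarq $3) with an andq $7 remainder loop. The scalar C sketch below is only an illustration of that decomposition; it is not part of OpenBLAS, performs no packing, prefetching, vectorization or TRMM offset (KK/KKK) bookkeeping, and the name ref_dgemm_blocked is invented for the example.

/* Illustrative scalar model of the blocking used by the kernels above
 * (not part of OpenBLAS): column panels of 12 with an 8/4/2/1 remainder,
 * row blocks of 4 with 2/1 tails, and a K loop unrolled by 8 with a
 * (K & 7) tail.  Column-major storage, C := alpha*A*B + C. */
#include <stddef.h>

static void ref_dgemm_blocked(size_t m, size_t n, size_t k, double alpha,
                              const double *A, size_t lda,
                              const double *B, size_t ldb,
                              double *C, size_t ldc)
{
    static const size_t nsteps[] = { 12, 8, 4, 2, 1 };  /* .L12/.L8/.L4/.L2/.L1 paths */
    size_t j = 0;

    for (size_t s = 0; s < sizeof nsteps / sizeof nsteps[0]; s++) {
        size_t nb = nsteps[s];
        while (n - j >= nb) {                       /* one column panel of width nb */
            size_t i = 0;
            while (i < m) {                         /* 4-, 2- and 1-row paths       */
                size_t mb = (m - i >= 4) ? 4 : (m - i >= 2) ? 2 : 1;
                for (size_t jj = 0; jj < nb; jj++) {
                    for (size_t ii = 0; ii < mb; ii++) {
                        double acc = 0.0;
                        size_t l = 0;
                        for (; l + 8 <= k; l += 8)  /* main K loop (sarq $3, %rax)  */
                            for (size_t u = 0; u < 8; u++)
                                acc += A[(i + ii) + (l + u) * lda]
                                     * B[(l + u) + (j + jj) * ldb];
                        for (; l < k; l++)          /* remainder loop (andq $7)     */
                            acc += A[(i + ii) + l * lda] * B[l + (j + jj) * ldb];
                        C[(i + ii) + (j + jj) * ldc] += alpha * acc;
                    }
                }
                i += mb;
            }
            j += nb;
        }
    }
}

Walking the remainder widths from 8 down to 1 in this sketch reproduces the assembly's tests on the column remainder (the sarq $3 of Nmod12 followed by testq $4, $2, $1), since greedily subtracting powers of two from a remainder below 12 selects exactly the set bits of that remainder.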
OpenBLAS-0.2.20/kernel/x86_64/dgemm_kernel_4x8_sandy.S000066400000000000000000002155441313527062700220530ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #define ASSEMBLER #include "common.h" #define old_bm %rdi #define old_bn %rsi #define old_bk %rdx #define bm %r13 #define bn %r14 #define bk %r15 #define ALPHA %xmm0 #define ba %rcx #define bb %r8 #define C %r9 #define ldc %r10 #define i %r11 #define k %rax #define ptrba %rdi #define ptrbb %rsi #define C0 %rbx #define C1 %rbp #define prebb %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define old_ldc 8+STACKSIZE(%rsp) #define old_offset 16+STACKSIZE(%rsp) #define MEMALPHA 48(%rsp) #define j 56(%rsp) #define OFFSET 64(%rsp) #define kk 72(%rsp) #define kkk 80(%rsp) #else #define STACKSIZE 512 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define old_ldc 64 + STACKSIZE(%rsp) #define old_offset 72 + STACKSIZE(%rsp) #define MEMALPHA 224(%rsp) #define j 232(%rsp) #define OFFSET 240(%rsp) #define kk 248(%rsp) #define kkk 256(%rsp) #endif #define PREFETCH0 prefetcht0 #define PREFETCH1 prefetcht0 #define PREFETCH2 prefetcht2 #define xvec0 %xmm0 #define xvec1 %xmm1 #define xvec2 %xmm2 #define xvec3 %xmm3 #define xvec4 %xmm4 #define xvec5 %xmm5 #define xvec6 %xmm6 #define xvec7 %xmm7 #define xvec8 %xmm8 #define xvec9 %xmm9 #define xvec10 %xmm10 #define xvec11 %xmm11 #define xvec12 %xmm12 #define xvec13 %xmm13 #define xvec14 %xmm14 #define xvec15 %xmm15 #define yvec0 %ymm0 #define yvec1 %ymm1 #define yvec2 %ymm2 #define yvec3 %ymm3 #define yvec4 %ymm4 #define yvec5 %ymm5 #define yvec6 %ymm6 #define yvec7 %ymm7 #define yvec8 %ymm8 #define yvec9 %ymm9 #define yvec10 %ymm10 #define yvec11 %ymm11 #define yvec12 %ymm12 #define yvec13 %ymm13 #define yvec14 %ymm14 #define yvec15 %ymm15 #define LEAQ leaq #define ADDQ addq #define MULQ imulq #define SARQ sarq #define SALQ salq #define ANDQ andq 
#define SUBQ subq #define DECQ decq #define JG jg #define JLE jle #define TEST testq #define OR orq #define JNE jne #define NOP #define XOR xorpd #undef MOVQ #define MOVQ movq #define XOR_DY vxorpd #define XOR_DX vxorpd #define LD_DY vmovapd #define LD_DX vmovapd #define LDL_DX vmovlpd #define LDL_DY vmovlpd #define LDH_DX vmovhpd #define LDH_DY vmovhpd #define ST_DY vmovapd #define ST_DX vmovapd #define STL_DX vmovlpd #define STL_DY vmovlpd #define STH_DX vmovhpd #define STH_DY vmovhpd #define EDUP_DY vmovddup #define ADD_DY vaddpd #define ADD_DX vaddpd #define ADD1_DY vaddpd #define ADD2_DY vaddpd #define ADDSUB_DY vaddsubpd #define MUL_DY vmulpd #define MUL_DX vmulpd #define SHUF_DY vperm2f128 #define SHUF_DX vpshufd #define VPERMILP_DY vpermilpd #define BROAD_DY vbroadcastsd #define BROAD_DX vmovddup #define MOV_DY vmovapd #define MOV_DX vmovapd #define REVS_DY vshufpd #define REVS_DX vmovsd #define EXTRA_DY vextractf128 PROLOGUE subq $STACKSIZE, %rsp; movq %rbx, 0(%rsp); movq %rbp, 8(%rsp); movq %r12, 16(%rsp); movq %r13, 24(%rsp); movq %r14, 32(%rsp); movq %r15, 40(%rsp); #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, old_bm movq ARG2, old_bn movq ARG3, old_bk movq OLD_A, ba movq OLD_B, bb movq OLD_C, C movq old_ldc, ldc #ifdef TRMMKERNEL movq old_offset, %r11 #endif movaps %xmm3, %xmm0 #else movq old_ldc, ldc #ifdef TRMMKERNEL movq old_offset, %r11 #endif #endif vzeroupper vmovlps ALPHA, MEMALPHA movq old_bm, bm movq old_bn, bn movq old_bk, bk leaq (, ldc, SIZE), ldc #ifdef TRMMKERNEL movq %r11, OFFSET #ifndef LEFT negq %r11; #endif movq %r11, kk #endif MOVQ bn,j; SARQ $2,j; # Rn = 4 JLE .L0_loopE; ALIGN_5; .L0_bodyB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; MOVQ %rax, kk; #endif MOVQ C,C0; LEAQ (C,ldc,2),C1; MOVQ bk, k; SALQ $5, k; LEAQ (bb, k, 1), prebb; MOVQ ba,ptrba; MOVQ bm,i; SARQ $3,i; # Rm = 8 JLE .L1_loopE; ALIGN_5; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif //#### Initial Results Register #### PREFETCH2 0*SIZE(prebb); XOR_DY yvec15, yvec15, yvec15; PREFETCH2 8*SIZE(prebb); XOR_DY yvec14, yvec14, yvec14; XOR_DY yvec13, yvec13, yvec13; ADDQ $16*SIZE, prebb XOR_DY yvec12, yvec12, yvec12; PREFETCH0 3*SIZE(C0) LD_DY 0*SIZE(ptrbb), yvec2; PREFETCH0 3*SIZE(C0, ldc, 1) XOR_DY yvec11, yvec11, yvec11; PREFETCH0 3*SIZE(C1) XOR_DY yvec10, yvec10, yvec10; PREFETCH0 3*SIZE(C1, ldc, 1) LD_DY 0*SIZE(ptrba), yvec0; XOR_DY yvec9, yvec9, yvec9; XOR_DY yvec8, yvec8, yvec8; VPERMILP_DY $0x05, yvec2, yvec3; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; #else ADDQ $4, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2,k; JLE .L2_loopE; ALIGN_5; .L2_bodyB:; # Computing kernel //#### Unroll times 1 #### LD_DY 4*SIZE(ptrba), yvec1; MUL_DY yvec0, yvec2, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; SHUF_DY $0x03, yvec3, yvec3, yvec5; ADD_DY yvec15, yvec6, yvec15; ADD_DY yvec13, yvec7, yvec13; 
PREFETCH0 64*SIZE(ptrba) MUL_DY yvec1, yvec2, yvec6; LD_DY 4*SIZE(ptrbb), yvec2; MUL_DY yvec1, yvec3, yvec7; VPERMILP_DY $0x05, yvec2, yvec3; ADD_DY yvec14, yvec6, yvec14; ADD_DY yvec12, yvec7, yvec12; MUL_DY yvec0, yvec4, yvec6; MUL_DY yvec0, yvec5, yvec7; LD_DY 8*SIZE(ptrba), yvec0; ADD_DY yvec11, yvec6, yvec11; ADD_DY yvec9, yvec7, yvec9; MUL_DY yvec1, yvec4, yvec6; MUL_DY yvec1, yvec5, yvec7; ADD_DY yvec10, yvec6, yvec10; ADD_DY yvec8, yvec7, yvec8; //#### Unroll times 2 #### LD_DY 12*SIZE(ptrba), yvec1; MUL_DY yvec0, yvec2, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; SHUF_DY $0x03, yvec3, yvec3, yvec5; ADD_DY yvec15, yvec6, yvec15; ADD_DY yvec13, yvec7, yvec13; PREFETCH0 72*SIZE(ptrba) MUL_DY yvec1, yvec2, yvec6; LD_DY 8*SIZE(ptrbb), yvec2; MUL_DY yvec1, yvec3, yvec7; VPERMILP_DY $0x05, yvec2, yvec3; ADD_DY yvec14, yvec6, yvec14; ADD_DY yvec12, yvec7, yvec12; MUL_DY yvec0, yvec4, yvec6; MUL_DY yvec0, yvec5, yvec7; LD_DY 16*SIZE(ptrba), yvec0; ADD_DY yvec11, yvec6, yvec11; ADD_DY yvec9, yvec7, yvec9; MUL_DY yvec1, yvec4, yvec6; MUL_DY yvec1, yvec5, yvec7; ADD_DY yvec10, yvec6, yvec10; ADD_DY yvec8, yvec7, yvec8; //#### Unroll times 3 #### LD_DY 20*SIZE(ptrba), yvec1; MUL_DY yvec0, yvec2, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; SHUF_DY $0x03, yvec3, yvec3, yvec5; ADD_DY yvec15, yvec6, yvec15; ADD_DY yvec13, yvec7, yvec13; PREFETCH0 80*SIZE(ptrba) MUL_DY yvec1, yvec2, yvec6; LD_DY 12*SIZE(ptrbb), yvec2; ADDQ $16*SIZE, ptrbb; MUL_DY yvec1, yvec3, yvec7; VPERMILP_DY $0x05, yvec2, yvec3; ADD_DY yvec14, yvec6, yvec14; ADD_DY yvec12, yvec7, yvec12; MUL_DY yvec0, yvec4, yvec6; MUL_DY yvec0, yvec5, yvec7; LD_DY 24*SIZE(ptrba), yvec0; ADD_DY yvec11, yvec6, yvec11; ADD_DY yvec9, yvec7, yvec9; MUL_DY yvec1, yvec4, yvec6; MUL_DY yvec1, yvec5, yvec7; ADD_DY yvec10, yvec6, yvec10; ADD_DY yvec8, yvec7, yvec8; //#### Unroll times 4 #### LD_DY 28*SIZE(ptrba), yvec1; MUL_DY yvec0, yvec2, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; SHUF_DY $0x03, yvec3, yvec3, yvec5; ADDQ $32*SIZE, ptrba; ADD_DY yvec15, yvec6, yvec15; ADD_DY yvec13, yvec7, yvec13; PREFETCH0 88*SIZE(ptrba) MUL_DY yvec1, yvec2, yvec6; LD_DY 0*SIZE(ptrbb), yvec2; MUL_DY yvec1, yvec3, yvec7; VPERMILP_DY $0x05, yvec2, yvec3; ADD_DY yvec14, yvec6, yvec14; ADD_DY yvec12, yvec7, yvec12; MUL_DY yvec0, yvec4, yvec6; MUL_DY yvec0, yvec5, yvec7; LD_DY 0*SIZE(ptrba), yvec0; ADD_DY yvec11, yvec6, yvec11; ADD_DY yvec9, yvec7, yvec9; MUL_DY yvec1, yvec4, yvec6; MUL_DY yvec1, yvec5, yvec7; ADD_DY yvec10, yvec6, yvec10; ADD_DY yvec8, yvec7, yvec8; .L2_bodyE:; DECQ k; JG .L2_bodyB; ALIGN_5 .L2_loopE:; PREFETCH2 0*SIZE(prebb); ADDQ $8*SIZE, prebb; #ifndef TRMMKERNEL TEST $2, bk; #else MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L3_loopE; ALIGN_5 .L3_bodyB: //#### Unroll times 1 #### PREFETCH0 64*SIZE(ptrba) LD_DY 4*SIZE(ptrba), yvec1; MUL_DY yvec0, yvec2, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; SHUF_DY $0x03, yvec3, yvec3, yvec5; ADD_DY yvec15, yvec6, yvec15; ADD_DY yvec13, yvec7, yvec13; MUL_DY yvec1, yvec2, yvec6; LD_DY 4*SIZE(ptrbb), yvec2; ADDQ $8*SIZE, ptrbb; MUL_DY yvec1, yvec3, yvec7; VPERMILP_DY $0x05, yvec2, yvec3; ADD_DY yvec14, yvec6, yvec14; ADD_DY yvec12, yvec7, yvec12; MUL_DY yvec0, yvec4, yvec6; MUL_DY yvec0, yvec5, yvec7; LD_DY 8*SIZE(ptrba), yvec0; ADD_DY yvec11, yvec6, yvec11; ADD_DY yvec9, yvec7, yvec9; MUL_DY yvec1, yvec4, yvec6; MUL_DY yvec1, yvec5, yvec7; ADD_DY yvec10, yvec6, yvec10; ADD_DY yvec8, yvec7, yvec8; //#### 
Unroll times 2 #### PREFETCH0 72*SIZE(ptrba) LD_DY 12*SIZE(ptrba), yvec1; MUL_DY yvec0, yvec2, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; SHUF_DY $0x03, yvec3, yvec3, yvec5; ADDQ $16*SIZE, ptrba; ADD_DY yvec15, yvec6, yvec15; ADD_DY yvec13, yvec7, yvec13; MUL_DY yvec1, yvec2, yvec6; LD_DY 0*SIZE(ptrbb), yvec2; MUL_DY yvec1, yvec3, yvec7; VPERMILP_DY $0x05, yvec2, yvec3; ADD_DY yvec14, yvec6, yvec14; ADD_DY yvec12, yvec7, yvec12; MUL_DY yvec0, yvec4, yvec6; MUL_DY yvec0, yvec5, yvec7; LD_DY 0*SIZE(ptrba), yvec0; ADD_DY yvec11, yvec6, yvec11; ADD_DY yvec9, yvec7, yvec9; MUL_DY yvec1, yvec4, yvec6; MUL_DY yvec1, yvec5, yvec7; ADD_DY yvec10, yvec6, yvec10; ADD_DY yvec8, yvec7, yvec8; .L3_loopE: PREFETCH2 0*SIZE(prebb); ADDQ $8*SIZE, prebb #ifndef TRMMKERNEL TEST $1, bk; #else MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L4_loopE; ALIGN_5 .L4_bodyB:; //#### Unroll times 1 #### PREFETCH0 64*SIZE(ptrba) LD_DY 4*SIZE(ptrba), yvec1; MUL_DY yvec0, yvec2, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; SHUF_DY $0x03, yvec3, yvec3, yvec5; ADDQ $8*SIZE, ptrba; ADD_DY yvec15, yvec6, yvec15; ADD_DY yvec13, yvec7, yvec13; MUL_DY yvec1, yvec2, yvec6; MUL_DY yvec1, yvec3, yvec7; ADDQ $4*SIZE, ptrbb; ADD_DY yvec14, yvec6, yvec14; ADD_DY yvec12, yvec7, yvec12; MUL_DY yvec0, yvec4, yvec6; MUL_DY yvec0, yvec5, yvec7; ADD_DY yvec11, yvec6, yvec11; ADD_DY yvec9, yvec7, yvec9; MUL_DY yvec1, yvec4, yvec6; MUL_DY yvec1, yvec5, yvec7; ADD_DY yvec10, yvec6, yvec10; ADD_DY yvec8, yvec7, yvec8; .L4_loopE:; //#### Load Alpha #### BROAD_DY MEMALPHA,yvec7; //#### Multiply Alpha #### MUL_DY yvec7,yvec15,yvec15; MUL_DY yvec7,yvec14,yvec14; MUL_DY yvec7,yvec13,yvec13; MUL_DY yvec7,yvec12,yvec12; MUL_DY yvec7,yvec11,yvec11; MUL_DY yvec7,yvec10,yvec10; MUL_DY yvec7,yvec9,yvec9; MUL_DY yvec7,yvec8,yvec8; //#### Reverse the Results #### MOV_DY yvec15,yvec7; REVS_DY $0x0a,yvec13,yvec15,yvec15; REVS_DY $0x0a,yvec7,yvec13,yvec13; MOV_DY yvec14,yvec7; REVS_DY $0x0a,yvec12,yvec14,yvec14; REVS_DY $0x0a,yvec7,yvec12,yvec12; MOV_DY yvec11,yvec7; REVS_DY $0x0a,yvec9,yvec11,yvec11; REVS_DY $0x0a,yvec7,yvec9,yvec9; MOV_DY yvec10,yvec7; REVS_DY $0x0a,yvec8,yvec10,yvec10; REVS_DY $0x0a,yvec7,yvec8,yvec8; //#### Testing alignment #### MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L4_loopEx; # Unalign part write back ALIGN_5 //#### Writing Back #### EXTRA_DY $1,yvec15,xvec7; EXTRA_DY $1,yvec14,xvec6; EXTRA_DY $1,yvec13,xvec5; EXTRA_DY $1,yvec12,xvec4; EXTRA_DY $1,yvec11,xvec3; EXTRA_DY $1,yvec10,xvec2; EXTRA_DY $1,yvec9,xvec1; EXTRA_DY $1,yvec8,xvec0; #ifndef TRMMKERNEL ADD_DY 0*SIZE(C0),xvec15,xvec15; ADD_DY 2*SIZE(C1),xvec7,xvec7; ADD_DY 4*SIZE(C0),xvec14,xvec14; ADD_DY 6*SIZE(C1),xvec6,xvec6; ADD_DY 0*SIZE(C0,ldc,1),xvec13,xvec13; ADD_DY 2*SIZE(C1,ldc,1),xvec5,xvec5; ADD_DY 4*SIZE(C0,ldc,1),xvec12,xvec12; ADD_DY 6*SIZE(C1,ldc,1),xvec4,xvec4; ADD_DY 0*SIZE(C1),xvec11,xvec11; ADD_DY 2*SIZE(C0),xvec3,xvec3; ADD_DY 4*SIZE(C1),xvec10,xvec10; ADD_DY 6*SIZE(C0),xvec2,xvec2; ADD_DY 0*SIZE(C1,ldc,1),xvec9,xvec9; ADD_DY 2*SIZE(C0,ldc,1),xvec1,xvec1; ADD_DY 4*SIZE(C1,ldc,1),xvec8,xvec8; ADD_DY 6*SIZE(C0,ldc,1),xvec0,xvec0; #endif ST_DY xvec15, 0*SIZE(C0); ST_DY xvec7, 2*SIZE(C1); ST_DY xvec14, 4*SIZE(C0); ST_DY xvec6, 6*SIZE(C1); ST_DY xvec13, 0*SIZE(C0,ldc,1); ST_DY xvec5, 2*SIZE(C1,ldc,1); ST_DY xvec12, 4*SIZE(C0,ldc,1); ST_DY xvec4, 6*SIZE(C1,ldc,1); ST_DY xvec11, 0*SIZE(C1); ST_DY xvec3, 2*SIZE(C0); ST_DY xvec10, 4*SIZE(C1); ST_DY xvec2, 6*SIZE(C0); ST_DY xvec9, 0*SIZE(C1,ldc,1); ST_DY xvec1, 
2*SIZE(C0,ldc,1); ST_DY xvec8, 4*SIZE(C1,ldc,1); ST_DY xvec0, 6*SIZE(C0,ldc,1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $8, kk #endif ADDQ $8*SIZE,C0; ADDQ $8*SIZE,C1; .L1_bodyE:; DECQ i; JG .L1_bodyB; JMP .L1_loopE; ALIGN_5; .L4_loopEx:; EXTRA_DY $1, yvec15, xvec7; #ifndef TRMMKERNEL LDL_DY 0*SIZE(C0), xvec6, xvec6; LDH_DY 1*SIZE(C0), xvec6, xvec6; ADD_DY xvec6, xvec15, xvec15; LDL_DY 2*SIZE(C1), xvec5, xvec5; LDH_DY 3*SIZE(C1), xvec5, xvec5; ADD_DY xvec5, xvec7, xvec7; #endif STL_DY xvec15, 0*SIZE(C0); STH_DY xvec15, 1*SIZE(C0); STL_DY xvec7, 2*SIZE(C1); STH_DY xvec7, 3*SIZE(C1); EXTRA_DY $1, yvec14, xvec4; #ifndef TRMMKERNEL LDL_DY 4*SIZE(C0), xvec3, xvec3; LDH_DY 5*SIZE(C0), xvec3, xvec3; ADD_DY xvec3, xvec14, xvec14; LDL_DY 6*SIZE(C1), xvec2, xvec2; LDH_DY 7*SIZE(C1), xvec2, xvec2; ADD_DY xvec2, xvec4, xvec4; #endif STL_DY xvec14, 4*SIZE(C0); STH_DY xvec14, 5*SIZE(C0); STL_DY xvec4, 6*SIZE(C1); STH_DY xvec4, 7*SIZE(C1); EXTRA_DY $1, yvec13, xvec7; #ifndef TRMMKERNEL LDL_DY 0*SIZE(C0, ldc, 1), xvec6, xvec6; LDH_DY 1*SIZE(C0, ldc, 1), xvec6, xvec6; ADD_DY xvec6, xvec13, xvec13; LDL_DY 2*SIZE(C1, ldc, 1), xvec5, xvec5; LDH_DY 3*SIZE(C1, ldc, 1), xvec5, xvec5; ADD_DY xvec5, xvec7, xvec7; #endif STL_DY xvec13, 0*SIZE(C0, ldc, 1); STH_DY xvec13, 1*SIZE(C0, ldc, 1); STL_DY xvec7, 2*SIZE(C1, ldc, 1); STH_DY xvec7, 3*SIZE(C1, ldc, 1); EXTRA_DY $1, yvec12, xvec4; #ifndef TRMMKERNEL LDL_DY 4*SIZE(C0, ldc, 1), xvec3, xvec3; LDH_DY 5*SIZE(C0, ldc, 1), xvec3, xvec3; ADD_DY xvec3, xvec12, xvec12; LDL_DY 6*SIZE(C1, ldc, 1), xvec2, xvec2; LDH_DY 7*SIZE(C1, ldc, 1), xvec2, xvec2; ADD_DY xvec2, xvec4, xvec4; #endif STL_DY xvec12, 4*SIZE(C0, ldc, 1); STH_DY xvec12, 5*SIZE(C0, ldc ,1); STL_DY xvec4, 6*SIZE(C1, ldc, 1); STH_DY xvec4, 7*SIZE(C1, ldc, 1); EXTRA_DY $1, yvec11, xvec7; #ifndef TRMMKERNEL LDL_DY 0*SIZE(C1), xvec6, xvec6; LDH_DY 1*SIZE(C1), xvec6, xvec6; ADD_DY xvec6, xvec11, xvec11; LDL_DY 2*SIZE(C0), xvec5, xvec5; LDH_DY 3*SIZE(C0), xvec5, xvec5; ADD_DY xvec5, xvec7, xvec7; #endif STL_DY xvec11, 0*SIZE(C1); STH_DY xvec11, 1*SIZE(C1); STL_DY xvec7, 2*SIZE(C0); STH_DY xvec7, 3*SIZE(C0); EXTRA_DY $1, yvec10, xvec4; #ifndef TRMMKERNEL LDL_DY 4*SIZE(C1), xvec3, xvec3; LDH_DY 5*SIZE(C1), xvec3, xvec3; ADD_DY xvec3, xvec10, xvec10; LDL_DY 6*SIZE(C0), xvec2, xvec2; LDH_DY 7*SIZE(C0), xvec2, xvec2; ADD_DY xvec2, xvec4, xvec4; #endif STL_DY xvec10, 4*SIZE(C1); STH_DY xvec10, 5*SIZE(C1); STL_DY xvec4, 6*SIZE(C0); STH_DY xvec4, 7*SIZE(C0); EXTRA_DY $1, yvec9, xvec7; #ifndef TRMMKERNEL LDL_DY 0*SIZE(C1, ldc, 1), xvec6, xvec6; LDH_DY 1*SIZE(C1, ldc, 1), xvec6, xvec6; ADD_DY xvec6, xvec9, xvec9; LDL_DY 2*SIZE(C0, ldc, 1), xvec5, xvec5; LDH_DY 3*SIZE(C0, ldc ,1), xvec5, xvec5; ADD_DY xvec5, xvec7, xvec7; #endif STL_DY xvec9, 0*SIZE(C1, ldc, 1); STH_DY xvec9, 1*SIZE(C1, ldc, 1); STL_DY xvec7, 2*SIZE(C0, ldc, 1); STH_DY xvec7, 3*SIZE(C0, ldc, 1); EXTRA_DY $1, yvec8, xvec4; #ifndef TRMMKERNEL LDL_DY 4*SIZE(C1, ldc, 1), xvec3, xvec3; LDH_DY 5*SIZE(C1, ldc, 1), xvec3, xvec3; ADD_DY xvec3, xvec8, xvec8; LDL_DY 6*SIZE(C0, ldc, 1), xvec2, xvec2; LDH_DY 7*SIZE(C0, ldc, 1), xvec2, xvec2; ADD_DY xvec2, xvec4, xvec4; #endif STL_DY xvec8, 4*SIZE(C1, ldc, 1); STH_DY xvec8, 5*SIZE(C1, ldc, 1); STL_DY xvec4, 6*SIZE(C0, ldc, 1); STH_DY xvec4, 7*SIZE(C0, ldc, 1); #if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $8, kk #endif ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L1_bodyB; ALIGN_5 .L1_loopE:; TEST $4, bm; # Rm = 4 JLE .L5_loopE; ALIGN_5 .L5_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif //#### Initial Results Register #### XOR_DY yvec15, yvec15, yvec15; XOR_DY yvec13, yvec13, yvec13; LD_DY 0*SIZE(ptrbb), yvec2; XOR_DY yvec11, yvec11, yvec11; XOR_DY yvec9, yvec9, yvec9; LD_DY 0*SIZE(ptrba), yvec0; VPERMILP_DY $0x05, yvec2, yvec3; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; #else ADDQ $4, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L6_loopE; ALIGN_5; .L6_bodyB:; # Computing kernel //#### Untoll time 1 #### LD_DY 4*SIZE(ptrba), yvec1; MUL_DY yvec0, yvec2, yvec6; ADD_DY yvec15, yvec6, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; ADD_DY yvec13, yvec7, yvec13; SHUF_DY $0x03, yvec3, yvec3, yvec5; LD_DY 4*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec4, yvec6; ADD_DY yvec11, yvec6, yvec11; VPERMILP_DY $0x05, yvec2, yvec3; MUL_DY yvec0, yvec5, yvec7; ADD_DY yvec9, yvec7, yvec9; //#### Untoll time 2 #### LD_DY 8*SIZE(ptrba), yvec0; MUL_DY yvec1, yvec2, yvec6; ADD_DY yvec15, yvec6, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec1, yvec3, yvec7; ADD_DY yvec13, yvec7, yvec13; SHUF_DY $0x03, yvec3, yvec3, yvec5; LD_DY 8*SIZE(ptrbb), yvec2; MUL_DY yvec1, yvec4, yvec6; ADD_DY yvec11, yvec6, yvec11; VPERMILP_DY $0x05, yvec2, yvec3; MUL_DY yvec1, yvec5, yvec7; ADD_DY yvec9, yvec7, yvec9; //#### Untoll time 3 #### LD_DY 12*SIZE(ptrba), yvec1; MUL_DY yvec0, yvec2, yvec6; ADD_DY yvec15, yvec6, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; ADDQ $16*SIZE, ptrba; MUL_DY yvec0, yvec3, yvec7; ADD_DY yvec13, yvec7, yvec13; SHUF_DY $0x03, yvec3, yvec3, yvec5; LD_DY 12*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec4, yvec6; ADD_DY yvec11, yvec6, yvec11; VPERMILP_DY $0x05, yvec2, yvec3; ADDQ $16*SIZE, ptrbb; MUL_DY yvec0, yvec5, yvec7; ADD_DY yvec9, yvec7, yvec9; //#### Untoll time 4 #### LD_DY 0*SIZE(ptrba), yvec0; MUL_DY yvec1, yvec2, yvec6; ADD_DY yvec15, yvec6, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec1, yvec3, yvec7; ADD_DY yvec13, yvec7, yvec13; SHUF_DY $0x03, yvec3, yvec3, yvec5; LD_DY 0*SIZE(ptrbb), yvec2; MUL_DY yvec1, yvec4, yvec6; ADD_DY yvec11, yvec6, yvec11; VPERMILP_DY $0x05, yvec2, yvec3; MUL_DY yvec1, yvec5, yvec7; ADD_DY yvec9, yvec7, yvec9; DECQ k; JG .L6_bodyB; ALIGN_5 .L6_loopE:; #ifndef TRMMKERNEL TEST $2, bk; #else MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L7_loopE; ALIGN_5 .L7_bodyB:; //#### Untoll time 1 #### LD_DY 4*SIZE(ptrba), yvec1; MUL_DY yvec0, yvec2, yvec6; ADD_DY yvec15, yvec6, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; ADDQ $8*SIZE, ptrba; MUL_DY yvec0, yvec3, yvec7; ADD_DY yvec13, yvec7, yvec13; SHUF_DY $0x03, yvec3, yvec3, yvec5; LD_DY 4*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec4, yvec6; ADD_DY yvec11, yvec6, yvec11; VPERMILP_DY $0x05, yvec2, yvec3; ADDQ $8*SIZE, 
ptrbb; MUL_DY yvec0, yvec5, yvec7; ADD_DY yvec9, yvec7, yvec9; //#### Untoll time 2 #### LD_DY 0*SIZE(ptrba), yvec0; MUL_DY yvec1, yvec2, yvec6; ADD_DY yvec15, yvec6, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec1, yvec3, yvec7; ADD_DY yvec13, yvec7, yvec13; SHUF_DY $0x03, yvec3, yvec3, yvec5; LD_DY 0*SIZE(ptrbb), yvec2; MUL_DY yvec1, yvec4, yvec6; ADD_DY yvec11, yvec6, yvec11; VPERMILP_DY $0x05, yvec2, yvec3; MUL_DY yvec1, yvec5, yvec7; ADD_DY yvec9, yvec7, yvec9; .L7_loopE:; #ifndef TRMMKERNEL TEST $1, bk #else MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L8_loopE; ALIGN_5 .L8_bodyB:; //#### Untoll time 1 #### MUL_DY yvec0, yvec2, yvec6; ADD_DY yvec15, yvec6, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; ADDQ $4*SIZE, ptrba; MUL_DY yvec0, yvec3, yvec7; ADD_DY yvec13, yvec7, yvec13; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec0, yvec4, yvec6; ADD_DY yvec11, yvec6, yvec11; ADDQ $4*SIZE, ptrbb; MUL_DY yvec0, yvec5, yvec7; ADD_DY yvec9, yvec7, yvec9; .L8_loopE:; //#### Load Alpha #### BROAD_DY MEMALPHA, yvec7; //#### Multiply Alpha #### MUL_DY yvec7,yvec15,yvec15; MUL_DY yvec7,yvec13,yvec13; MUL_DY yvec7,yvec11,yvec11; MUL_DY yvec7,yvec9,yvec9; //#### Reverse the Results #### MOV_DY yvec15, yvec7; REVS_DY $0x0a,yvec13,yvec15,yvec15; REVS_DY $0x0a,yvec7,yvec13,yvec13; MOV_DY yvec11,yvec7; REVS_DY $0x0a,yvec9,yvec11,yvec11; REVS_DY $0x0a,yvec7,yvec9,yvec9; //#### Testing alignment #### MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L8_loopEx; # Unalign part write back ALIGN_5 //#### Writing Back #### EXTRA_DY $1,yvec15,xvec7; EXTRA_DY $1,yvec13,xvec5; EXTRA_DY $1,yvec11,xvec3; EXTRA_DY $1,yvec9,xvec1; #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec15, xvec15; ADD_DX 2*SIZE(C1), xvec7, xvec7; ADD_DX 0*SIZE(C0, ldc, 1), xvec13, xvec13; ADD_DX 2*SIZE(C1, ldc, 1), xvec5, xvec5; ADD_DX 0*SIZE(C1), xvec11, xvec11; ADD_DX 2*SIZE(C0), xvec3, xvec3; ADD_DX 0*SIZE(C1, ldc, 1), xvec9, xvec9; ADD_DX 2*SIZE(C0, ldc, 1), xvec1, xvec1; #endif ST_DX xvec15, 0*SIZE(C0); ST_DX xvec7, 2*SIZE(C1); ST_DX xvec13, 0*SIZE(C0,ldc,1); ST_DX xvec5, 2*SIZE(C1,ldc,1); ST_DX xvec11, 0*SIZE(C1); ST_DX xvec3, 2*SIZE(C0); ST_DX xvec9, 0*SIZE(C1,ldc,1); ST_DX xvec1, 2*SIZE(C0,ldc,1); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL)&&defined(LEFT) ADDQ $4, kk #endif ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; JMP .L5_loopE; ALIGN_5 .L8_loopEx:; EXTRA_DY $1,yvec15,xvec7; EXTRA_DY $1,yvec13,xvec5; EXTRA_DY $1,yvec11,xvec3; EXTRA_DY $1,yvec9,xvec1; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec14, xvec14; LDH_DX 1*SIZE(C0), xvec14, xvec14; LDL_DX 0*SIZE(C0, ldc, 1), xvec12, xvec12; LDH_DX 1*SIZE(C0, ldc, 1), xvec12, xvec12; LDL_DX 0*SIZE(C1), xvec10, xvec10; LDH_DX 1*SIZE(C1), xvec10, xvec10; LDL_DX 0*SIZE(C1, ldc, 1), xvec8, xvec8; LDH_DX 1*SIZE(C1, ldc, 1), xvec8, xvec8; ADD_DX xvec14, xvec15, xvec15; ADD_DX xvec12, xvec13, xvec13; ADD_DX xvec10, xvec11, xvec11; ADD_DX xvec8, xvec9, xvec9; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); STL_DX xvec13, 0*SIZE(C0, ldc, 1); STH_DX xvec13, 1*SIZE(C0, ldc, 1); STL_DX xvec11, 0*SIZE(C1); STH_DX xvec11, 1*SIZE(C1); STL_DX xvec9, 0*SIZE(C1, ldc, 1); STH_DX xvec9, 1*SIZE(C1, ldc, 1); #ifndef TRMMKERNEL LDL_DX 2*SIZE(C0), xvec0, xvec0; LDH_DX 3*SIZE(C0), xvec0, xvec0; LDL_DX 2*SIZE(C0, ldc, 1), xvec2, xvec2; LDH_DX 3*SIZE(C0, ldc, 1), xvec2, xvec2; LDL_DX 2*SIZE(C1), 
xvec4, xvec4; LDH_DX 3*SIZE(C1), xvec4, xvec4; LDL_DX 2*SIZE(C1, ldc, 1), xvec6, xvec6; LDH_DX 3*SIZE(C1, ldc, 1), xvec6, xvec6; ADD_DX xvec0, xvec3, xvec3; ADD_DX xvec2, xvec1, xvec1; ADD_DX xvec4, xvec7, xvec7; ADD_DX xvec6, xvec5, xvec5; #endif STL_DX xvec3, 2*SIZE(C0); STH_DX xvec3, 3*SIZE(C0); STL_DX xvec1, 2*SIZE(C0, ldc, 1); STH_DX xvec1, 3*SIZE(C0, ldc, 1); STL_DX xvec7, 2*SIZE(C1); STH_DX xvec7, 3*SIZE(C1); STL_DX xvec5, 2*SIZE(C1, ldc, 1); STH_DX xvec5, 3*SIZE(C1, ldc, 1); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL)&&defined(LEFT) ADDQ $4, kk #endif ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; .L5_loopE:; TEST $2, bm; JLE .L9_loopE; ALIGN_5 .L9_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb #endif //#### Initial Results Register #### LD_DX 0*SIZE(ptrbb), xvec2; XOR_DY yvec15, yvec15, yvec15; LD_DX 2*SIZE(ptrbb), xvec3; XOR_DY yvec13, yvec13, yvec13; LD_DX 0*SIZE(ptrba), xvec0; XOR_DY yvec11, yvec11, yvec11; SHUF_DX $0x4e, xvec2, xvec4; XOR_DY yvec9, yvec9, yvec9; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; #else ADDQ $4, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L10_loopE; ALIGN_5; .L10_bodyB:; # Computing kernel //#### Unroll time 1 #### LD_DX 4*SIZE(ptrbb), xvec6; SHUF_DX $0x4e, xvec3, xvec5; MUL_DX xvec0, xvec2, xvec2; ADD_DX xvec2, xvec15, xvec15; LD_DX 6*SIZE(ptrbb), xvec7; MUL_DX xvec0, xvec3, xvec3; ADD_DX xvec3, xvec11, xvec11; LD_DX 2*SIZE(ptrba), xvec1; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec6, xvec4; MUL_DX xvec0, xvec5, xvec5; ADD_DX xvec5, xvec9, xvec9; //#### Unroll time 2 #### LD_DX 8*SIZE(ptrbb), xvec2; SHUF_DX $0x4e, xvec7, xvec5; MUL_DX xvec1, xvec6, xvec6; ADD_DX xvec6, xvec15, xvec15; LD_DX 10*SIZE(ptrbb), xvec3; MUL_DX xvec1, xvec7, xvec7; ADD_DX xvec7, xvec11, xvec11; LD_DX 4*SIZE(ptrba), xvec0; MUL_DX xvec1, xvec4, xvec4; ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec2, xvec4; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec9, xvec9; //#### Unroll time 3 #### LD_DX 12*SIZE(ptrbb), xvec6; SHUF_DX $0x4e, xvec3, xvec5; MUL_DX xvec0, xvec2, xvec2; ADD_DX xvec2, xvec15, xvec15; LD_DX 14*SIZE(ptrbb), xvec7; MUL_DX xvec0, xvec3, xvec3; ADD_DX xvec3, xvec11, xvec11; ADDQ $16*SIZE, ptrbb; LD_DX 6*SIZE(ptrba), xvec1; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec6, xvec4; ADDQ $8*SIZE, ptrba; MUL_DX xvec0, xvec5, xvec5; ADD_DX xvec5, xvec9, xvec9; //#### Unroll time 4 #### LD_DX 0*SIZE(ptrbb), xvec2; SHUF_DX $0x4e, xvec7, xvec5; MUL_DX xvec1, xvec6, xvec6; ADD_DX xvec6, xvec15, xvec15; LD_DX 2*SIZE(ptrbb), xvec3; MUL_DX xvec1, xvec7, xvec7; ADD_DX xvec7, xvec11, xvec11; LD_DX 0*SIZE(ptrba), xvec0; MUL_DX xvec1, xvec4, xvec4; ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec2, xvec4; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec9, xvec9; DECQ k; JG .L10_bodyB; ALIGN_5 .L10_loopE:; #ifndef TRMMKERNEL TEST $2, bk #else MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L11_loopE; ALIGN_5 .L11_bodyB:; 
//#### Unroll time 1 #### LD_DX 4*SIZE(ptrbb), xvec6; SHUF_DX $0x4e, xvec3, xvec5; MUL_DX xvec0, xvec2, xvec2; ADD_DX xvec2, xvec15, xvec15; LD_DX 6*SIZE(ptrbb), xvec7; MUL_DX xvec0, xvec3, xvec3; ADD_DX xvec3, xvec11, xvec11; ADDQ $8*SIZE, ptrbb; LD_DX 2*SIZE(ptrba), xvec1; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec6, xvec4; ADDQ $4*SIZE, ptrba; MUL_DX xvec0, xvec5, xvec5; ADD_DX xvec5, xvec9, xvec9; //#### Unroll time 2 #### LD_DX 0*SIZE(ptrbb), xvec2; SHUF_DX $0x4e, xvec7, xvec5; MUL_DX xvec1, xvec6, xvec6; ADD_DX xvec6, xvec15, xvec15; LD_DX 2*SIZE(ptrbb), xvec3; MUL_DX xvec1, xvec7, xvec7; ADD_DX xvec7, xvec11, xvec11; LD_DX 0*SIZE(ptrba), xvec0; MUL_DX xvec1, xvec4, xvec4; ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec2, xvec4; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec9, xvec9; .L11_loopE:; #ifndef TRMMKERNEL TEST $1, bk #else MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L12_loopE; ALIGN_5 .L12_bodyB:; SHUF_DX $0x4e, xvec3, xvec5; MUL_DX xvec0, xvec2, xvec2; ADD_DX xvec2, xvec15, xvec15; ADDQ $4*SIZE, ptrbb; MUL_DX xvec0, xvec3, xvec3; ADD_DX xvec3, xvec11, xvec11; ADDQ $2*SIZE, ptrba; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec13, xvec13; MUL_DX xvec0, xvec5, xvec5; ADD_DX xvec5, xvec9, xvec9; .L12_loopE:; //#### Load Alpha #### BROAD_DX MEMALPHA, xvec7; //#### Multiply Alpha #### MUL_DX xvec7, xvec15, xvec15; MUL_DX xvec7, xvec13, xvec13; MUL_DX xvec7, xvec11, xvec11; MUL_DX xvec7, xvec9, xvec9; //#### Reverse the Results #### MOV_DX xvec15, xvec6; REVS_DX xvec13, xvec15, xvec15; REVS_DX xvec6, xvec13, xvec13; MOV_DX xvec11, xvec6; REVS_DX xvec9, xvec11, xvec11; REVS_DX xvec6, xvec9, xvec9; //#### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L12_loopEx; ALIGN_5 //#### Writing Back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec13, xvec13; ADD_DX 0*SIZE(C0, ldc, 1), xvec15, xvec15; ADD_DX 0*SIZE(C1), xvec9, xvec9; ADD_DX 0*SIZE(C1, ldc, 1), xvec11, xvec11; #endif ST_DX xvec13, 0*SIZE(C0); ST_DX xvec15, 0*SIZE(C0, ldc, 1); ST_DX xvec9, 0*SIZE(C1); ST_DX xvec11, 0*SIZE(C1, ldc, 1); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk #endif ADDQ $2*SIZE, C0 ADDQ $2*SIZE, C1 JMP .L9_loopE; ALIGN_5 .L12_loopEx: #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec14, xvec14; LDH_DX 1*SIZE(C0), xvec14, xvec14; LDL_DX 0*SIZE(C0, ldc, 1), xvec12, xvec12; LDH_DX 1*SIZE(C0, ldc, 1), xvec12, xvec12; LDL_DX 0*SIZE(C1), xvec10, xvec10; LDH_DX 1*SIZE(C1), xvec10, xvec10; LDL_DX 0*SIZE(C1, ldc, 1), xvec8, xvec8; LDH_DX 1*SIZE(C1, ldc, 1), xvec8, xvec8; ADD_DX xvec14, xvec13, xvec13; ADD_DX xvec12, xvec15, xvec15; ADD_DX xvec10, xvec9, xvec9; ADD_DX xvec8, xvec11, xvec11; #endif STL_DX xvec13, 0*SIZE(C0); STH_DX xvec13, 1*SIZE(C0); STL_DX xvec15, 0*SIZE(C0, ldc, 1); STH_DX xvec15, 1*SIZE(C0, ldc, 1); STL_DX xvec9, 0*SIZE(C1); STH_DX xvec9, 1*SIZE(C1); STL_DX xvec11, 0*SIZE(C1, ldc, 1); STH_DX xvec11, 1*SIZE(C1, ldc, 1); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk #endif ADDQ $2*SIZE, C0; ADDQ $2*SIZE, C1; .L9_loopE:; TEST $1, bm JLE .L13_loopE; ALIGN_5 
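// bm & 1: last single row of A against the current four columns of B; each A element is
// broadcast across a ymm register and multiplied by four packed B values per step.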
.L13_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (,%rax, SIZE), %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif //#### Initial Results Register #### XOR_DY yvec15, yvec15, yvec15; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; #else ADDQ $4, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L14_loopE; ALIGN_5 .L14_bodyB:; BROAD_DY 0*SIZE(ptrba), yvec0; LD_DY 0*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec2, yvec6; ADD_DY yvec15, yvec6, yvec15; BROAD_DY 1*SIZE(ptrba), yvec1; LD_DY 4*SIZE(ptrbb), yvec3; MUL_DY yvec1, yvec3, yvec7; ADD_DY yvec15, yvec7, yvec15; BROAD_DY 2*SIZE(ptrba), yvec0; LD_DY 8*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec2, yvec6; ADD_DY yvec15, yvec6, yvec15; BROAD_DY 3*SIZE(ptrba), yvec1; LD_DY 12*SIZE(ptrbb), yvec3; MUL_DY yvec1, yvec3, yvec7; ADD_DY yvec15, yvec7, yvec15; ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L14_bodyB; ALIGN_5 .L14_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L15_loopE; ALIGN_5 .L15_bodyB: BROAD_DY 0*SIZE(ptrba), yvec0; LD_DY 0*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec2, yvec6; ADD_DY yvec15, yvec6, yvec15; BROAD_DY 1*SIZE(ptrba), yvec1; LD_DY 4*SIZE(ptrbb), yvec3; MUL_DY yvec1, yvec3, yvec7; ADD_DY yvec15, yvec7, yvec15; ADDQ $2*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L15_loopE:; #ifndef TRMMKERNEL TEST $1, bk; #else MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L16_loopE; ALIGN_5 .L16_bodyB:; BROAD_DY 0*SIZE(ptrba), yvec0; LD_DY 0*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec2, yvec6; ADD_DY yvec15, yvec6, yvec15; ADDQ $1*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L16_loopE: //#### Load Alpha #### BROAD_DY MEMALPHA, yvec7; //#### Multiply Alpha #### MUL_DY yvec15, yvec7, yvec15; //#### Writing Back #### EXTRA_DY $1, yvec15, xvec7; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0, xvec0; LDH_DX 0*SIZE(C0, ldc, 1), xvec0, xvec0; LDL_DX 0*SIZE(C1), xvec1, xvec1; LDH_DX 0*SIZE(C1, ldc, 1), xvec1, xvec1; ADD_DX xvec0, xvec15, xvec15; ADD_DX xvec1, xvec7, xvec7; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 0*SIZE(C0, ldc, 1); STL_DX xvec7, 0*SIZE(C1); STH_DX xvec7, 0*SIZE(C1, ldc, 1); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL)&&defined(LEFT) ADDQ $1, kk #endif ADDQ $1*SIZE, C0 ADDQ $1*SIZE, C1 .L13_loopE:; #if defined(TRMMKERNEL)&&!defined(LEFT) ADDQ $4, kk #endif MOVQ bk,k; SALQ $5,k; ADDQ k,bb; LEAQ (C,ldc,4),C; .L0_bodyE:; DECQ j; JG .L0_bodyB; ALIGN_5; .L0_loopE:; TEST $2, bn; JLE .L20_loopE; ALIGN_5; .L20_loopB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; MOVQ %rax, kk #endif MOVQ C, C0; LEAQ (C, ldc, 1), C1; MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; # Rm = 8 JLE .L21_loopE; ALIGN_5; .L21_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif //#### Initial Results Register #### XOR_DY yvec15, yvec15, yvec15; XOR_DY yvec14, yvec14, yvec14; XOR_DY yvec13, 
yvec13, yvec13; XOR_DY yvec12, yvec12, yvec12; XOR_DY yvec11, yvec11, yvec11; XOR_DY yvec10, yvec10, yvec10; XOR_DY yvec9, yvec9, yvec9; XOR_DY yvec8, yvec8, yvec8; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; #else ADDQ $2, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L211_loopE; ALIGN_5; .L211_bodyB: # Computing kernel //#### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec14, xvec14; LD_DX 4*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; MUL_DX xvec2, xvec6, xvec6; ADD_DX xvec6, xvec13, xvec13; LD_DX 6*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; MUL_DX xvec3, xvec7, xvec7; ADD_DX xvec7, xvec12, xvec12; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; MUL_DX xvec2, xvec6, xvec6; ADD_DX xvec6, xvec9, xvec9; MUL_DX xvec3, xvec7, xvec7; ADD_DX xvec7, xvec8, xvec8; //#### Unroll time 2 #### LD_DX 8*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec15, xvec15; LD_DX 10*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec14, xvec14; LD_DX 12*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; MUL_DX xvec2, xvec6, xvec6; ADD_DX xvec6, xvec13, xvec13; LD_DX 14*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; MUL_DX xvec3, xvec7, xvec7; ADD_DX xvec7, xvec12, xvec12; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; MUL_DX xvec2, xvec6, xvec6; ADD_DX xvec6, xvec9, xvec9; MUL_DX xvec3, xvec7, xvec7; ADD_DX xvec7, xvec8, xvec8; //#### Unroll time 3 #### LD_DX 16*SIZE(ptrba), xvec0; LD_DX 4*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec15, xvec15; LD_DX 18*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec14, xvec14; LD_DX 20*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; MUL_DX xvec2, xvec6, xvec6; ADD_DX xvec6, xvec13, xvec13; LD_DX 22*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; MUL_DX xvec3, xvec7, xvec7; ADD_DX xvec7, xvec12, xvec12; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; MUL_DX xvec2, xvec6, xvec6; ADD_DX xvec6, xvec9, xvec9; MUL_DX xvec3, xvec7, xvec7; ADD_DX xvec7, xvec8, xvec8; //#### Unroll time 4 #### LD_DX 24*SIZE(ptrba), xvec0; LD_DX 6*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec15, xvec15; ADDQ $8*SIZE, ptrbb; LD_DX 26*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec14, xvec14; LD_DX 28*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; MUL_DX xvec2, xvec6, xvec6; ADD_DX xvec6, xvec13, xvec13; LD_DX 30*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; MUL_DX xvec3, xvec7, xvec7; ADD_DX xvec7, xvec12, xvec12; ADDQ $32*SIZE, ptrba; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; 
MUL_DX xvec2, xvec6, xvec6; ADD_DX xvec6, xvec9, xvec9; MUL_DX xvec3, xvec7, xvec7; ADD_DX xvec7, xvec8, xvec8; DECQ k; JG .L211_bodyB; ALIGN_5 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L212_loopE; ALIGN_5; .L212_bodyB: # Computing kernel //#### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec14, xvec14; LD_DX 4*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; MUL_DX xvec2, xvec6, xvec6; ADD_DX xvec6, xvec13, xvec13; LD_DX 6*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; MUL_DX xvec3, xvec7, xvec7; ADD_DX xvec7, xvec12, xvec12; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; MUL_DX xvec2, xvec6, xvec6; ADD_DX xvec6, xvec9, xvec9; MUL_DX xvec3, xvec7, xvec7; ADD_DX xvec7, xvec8, xvec8; //#### Unroll time 2 #### LD_DX 8*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec15, xvec15; ADDQ $4*SIZE, ptrbb; LD_DX 10*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec14, xvec14; LD_DX 12*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; MUL_DX xvec2, xvec6, xvec6; ADD_DX xvec6, xvec13, xvec13; LD_DX 14*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; MUL_DX xvec3, xvec7, xvec7; ADD_DX xvec7, xvec12, xvec12; ADDQ $16*SIZE, ptrba; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; MUL_DX xvec2, xvec6, xvec6; ADD_DX xvec6, xvec9, xvec9; MUL_DX xvec3, xvec7, xvec7; ADD_DX xvec7, xvec8, xvec8; .L212_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L213_loopE; ALIGN_5 .L213_bodyB: //#### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec15, xvec15; ADDQ $2*SIZE, ptrbb; LD_DX 2*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec14, xvec14; LD_DX 4*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; MUL_DX xvec2, xvec6, xvec6; ADD_DX xvec6, xvec13, xvec13; LD_DX 6*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; MUL_DX xvec3, xvec7, xvec7; ADD_DX xvec7, xvec12, xvec12; ADDQ $8*SIZE, ptrba; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; MUL_DX xvec2, xvec6, xvec6; ADD_DX xvec6, xvec9, xvec9; MUL_DX xvec3, xvec7, xvec7; ADD_DX xvec7, xvec8, xvec8; .L213_loopE: //#### Multiply Alpha #### BROAD_DX MEMALPHA, xvec7; MUL_DX xvec7, xvec15, xvec15; MUL_DX xvec7, xvec14, xvec14; MUL_DX xvec7, xvec13, xvec13; MUL_DX xvec7, xvec12, xvec12; MUL_DX xvec7, xvec11, xvec11; MUL_DX xvec7, xvec10, xvec10; MUL_DX xvec7, xvec9, xvec9; MUL_DX xvec7, xvec8, xvec8; //#### Reverse #### MOV_DX xvec15, xvec6; REVS_DX xvec11, xvec15, xvec15; REVS_DX xvec6, xvec11, xvec11; MOV_DX xvec14, xvec6; REVS_DX xvec10, xvec14, xvec14; REVS_DX xvec6, xvec10, xvec10; MOV_DX xvec13, xvec6; REVS_DX xvec9, xvec13, xvec13; REVS_DX xvec6, xvec9, xvec9; MOV_DX xvec12, xvec6; REVS_DX xvec8, xvec12, xvec12; REVS_DX xvec6, xvec8, xvec8; //#### Testing Alignment #### MOVQ C0, %rax; OR 
ldc, %rax; TEST $15, %rax; JNE .L213_loopEx; ALIGN_5 //#### Writing Back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec11, xvec11; ADD_DX 2*SIZE(C0), xvec10, xvec10; ADD_DX 4*SIZE(C0), xvec9, xvec9; ADD_DX 6*SIZE(C0), xvec8, xvec8; ADD_DX 0*SIZE(C1), xvec15, xvec15; ADD_DX 2*SIZE(C1), xvec14, xvec14; ADD_DX 4*SIZE(C1), xvec13, xvec13; ADD_DX 6*SIZE(C1), xvec12, xvec12; #endif ST_DX xvec11, 0*SIZE(C0); ST_DX xvec10, 2*SIZE(C0); ST_DX xvec9, 4*SIZE(C0); ST_DX xvec8, 6*SIZE(C0); ST_DX xvec15, 0*SIZE(C1); ST_DX xvec14, 2*SIZE(C1); ST_DX xvec13, 4*SIZE(C1); ST_DX xvec12, 6*SIZE(C1); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $8, kk #endif ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L21_bodyB; JMP .L21_loopE; ALIGN_5 .L213_loopEx:; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0, xvec0; LDH_DX 1*SIZE(C0), xvec0, xvec0; LDL_DX 2*SIZE(C0), xvec1, xvec1; LDH_DX 3*SIZE(C0), xvec1, xvec1; LDL_DX 4*SIZE(C0), xvec2, xvec2; LDH_DX 5*SIZE(C0), xvec2, xvec2; LDL_DX 6*SIZE(C0), xvec3, xvec3; LDH_DX 7*SIZE(C0), xvec3, xvec3; ADD_DX xvec0, xvec11, xvec11; ADD_DX xvec1, xvec10, xvec10; ADD_DX xvec2, xvec9, xvec9; ADD_DX xvec3, xvec8, xvec8; #endif STL_DX xvec11, 0*SIZE(C0); STH_DX xvec11, 1*SIZE(C0); STL_DX xvec10, 2*SIZE(C0); STH_DX xvec10, 3*SIZE(C0); STL_DX xvec9, 4*SIZE(C0); STH_DX xvec9, 5*SIZE(C0); STL_DX xvec8, 6*SIZE(C0); STH_DX xvec8, 7*SIZE(C0); #ifndef TRMMKERNEL LDL_DX 0*SIZE(C1), xvec4, xvec4; LDH_DX 1*SIZE(C1), xvec4, xvec4; LDL_DX 2*SIZE(C1), xvec5, xvec5; LDH_DX 3*SIZE(C1), xvec5, xvec5; LDL_DX 4*SIZE(C1), xvec6, xvec6; LDH_DX 5*SIZE(C1), xvec6, xvec6; LDL_DX 6*SIZE(C1), xvec7, xvec7; LDH_DX 7*SIZE(C1), xvec7, xvec7; ADD_DX xvec4, xvec15, xvec15; ADD_DX xvec5, xvec14, xvec14; ADD_DX xvec6, xvec13, xvec13; ADD_DX xvec7, xvec12, xvec12; #endif STL_DX xvec15, 0*SIZE(C1); STH_DX xvec15, 1*SIZE(C1); STL_DX xvec14, 2*SIZE(C1); STH_DX xvec14, 3*SIZE(C1); STL_DX xvec13, 4*SIZE(C1); STH_DX xvec13, 5*SIZE(C1); STL_DX xvec12, 6*SIZE(C1); STH_DX xvec12, 7*SIZE(C1); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $8, kk #endif ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L21_bodyB; .L21_loopE:; TEST $4, bm; # Rm = 4 JLE .L22_loopE; ALIGN_5; .L22_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif //#### Initial Results Register #### XOR_DY yvec15, yvec15, yvec15; XOR_DY yvec14, yvec14, yvec14; XOR_DY yvec11, yvec11, yvec11; XOR_DY yvec10, yvec10, yvec10; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; #else ADDQ $2, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L221_loopE; ALIGN_5 .L221_bodyB:; # Computing kernel //#### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, 
xvec4; ADD_DX xvec4, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec14, xvec14; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec11, xvec11; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec10, xvec10; //#### Unroll time 2 #### LD_DX 4*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec15, xvec15; LD_DX 6*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec14, xvec14; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec11, xvec11; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec10, xvec10; //#### Unroll time 3 #### LD_DX 8*SIZE(ptrba), xvec0; LD_DX 4*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec15, xvec15; LD_DX 10*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec14, xvec14; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec11, xvec11; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec10, xvec10; //#### Unroll time 4 #### LD_DX 12*SIZE(ptrba), xvec0; LD_DX 6*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec15, xvec15; ADDQ $8*SIZE, ptrbb; LD_DX 14*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec14, xvec14; ADDQ $16*SIZE, ptrba; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec11, xvec11; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec10, xvec10; DECQ k; JG .L221_bodyB; ALIGN_5 .L221_loopE:; #ifndef TRMMKERNEL TEST $2, bk; #else MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L222_loopE; ALIGN_5 .L222_bodyB: //#### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec14, xvec14; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec11, xvec11; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec10, xvec10; //#### Unroll time 2 #### LD_DX 4*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec15, xvec15; ADDQ $4*SIZE, ptrbb; LD_DX 6*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec14, xvec14; ADDQ $8*SIZE, ptrba; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec11, xvec11; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec10, xvec10; .L222_loopE: #ifndef TRMMKERNEL TEST $1, bk #else MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L223_loopE; ALIGN_5 .L223_bodyB: //#### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec15, xvec15; ADDQ $2*SIZE, ptrbb; LD_DX 2*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec14, xvec14; ADDQ $4*SIZE, ptrba; MOV_DX xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec11, xvec11; MUL_DX xvec1, xvec5, xvec5; ADD_DX xvec5, xvec10, xvec10; .L223_loopE: //#### Multiply Alpha #### BROAD_DX MEMALPHA, xvec7; MUL_DX xvec7, xvec15, xvec15; MUL_DX xvec7, xvec14, xvec14; MUL_DX xvec7, xvec11, xvec11; MUL_DX xvec7, xvec10, xvec10; //#### Reverse #### MOV_DX xvec15, xvec6; REVS_DX xvec11, xvec15, xvec15; REVS_DX xvec6, xvec11, xvec11; MOV_DX xvec14, xvec6; REVS_DX xvec10, xvec14, xvec14; REVS_DX xvec6, xvec10, 
xvec10; //#### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L223_loopEx; ALIGN_5 //#### Writing Back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec11, xvec11; ADD_DX 2*SIZE(C0), xvec10, xvec10; ADD_DX 0*SIZE(C1), xvec15, xvec15; ADD_DX 2*SIZE(C1), xvec14, xvec14; #endif ST_DX xvec11, 0*SIZE(C0); ST_DX xvec10, 2*SIZE(C0); ST_DX xvec15, 0*SIZE(C1); ST_DX xvec14, 2*SIZE(C1); #if (defined(TRMMKERNEL)&& defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&& !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk #endif ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; JMP .L22_loopE; ALIGN_5 .L223_loopEx:; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0, xvec0; LDH_DX 1*SIZE(C0), xvec0, xvec0; LDL_DX 2*SIZE(C0), xvec1, xvec1; LDH_DX 3*SIZE(C0), xvec1, xvec1; ADD_DX xvec0, xvec11, xvec11; ADD_DX xvec1, xvec10, xvec10; #endif STL_DX xvec11, 0*SIZE(C0); STH_DX xvec11, 1*SIZE(C0); STL_DX xvec10, 2*SIZE(C0); STH_DX xvec10, 3*SIZE(C0); #ifndef TRMMKERNEL LDL_DX 0*SIZE(C1), xvec4, xvec4; LDH_DX 1*SIZE(C1), xvec4, xvec4; LDL_DX 2*SIZE(C1), xvec5, xvec5; LDH_DX 3*SIZE(C1), xvec5, xvec5; ADD_DX xvec4, xvec15, xvec15; ADD_DX xvec5, xvec14, xvec14; #endif STL_DX xvec15, 0*SIZE(C1); STH_DX xvec15, 1*SIZE(C1); STL_DX xvec14, 2*SIZE(C1); STH_DX xvec14, 3*SIZE(C1); #if (defined(TRMMKERNEL)&& defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&& !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk #endif ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; .L22_loopE:; TEST $2, bm; // Rm = 2 JLE .L23_loopE; ALIGN_5; .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif XOR_DY yvec15, yvec15, yvec15; XOR_DY yvec11, yvec11, yvec11; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; #else ADDQ $2, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L231_loopE; ALIGN_5 .L231_bodyB: # Computing kernel //#### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec15, xvec15; MUL_DX xvec0, xvec5, xvec5; ADD_DX xvec5, xvec11, xvec11; //#### Unroll time 2 #### LD_DX 2*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec15, xvec15; MUL_DX xvec0, xvec5, xvec5; ADD_DX xvec5, xvec11, xvec11; //#### Unroll time 3 #### LD_DX 4*SIZE(ptrba), xvec0; LD_DX 4*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec15, xvec15; MUL_DX xvec0, xvec5, xvec5; ADD_DX xvec5, xvec11, xvec11; //#### Unroll time 4 #### LD_DX 6*SIZE(ptrba), xvec0; LD_DX 6*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec15, xvec15; ADDQ $8*SIZE, ptrba; MUL_DX xvec0, xvec5, xvec5; ADD_DX xvec5, xvec11, xvec11; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L231_bodyB; ALIGN_5 .L231_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else MOVQ kkk, 
%rax; TEST $2, %rax; #endif JLE .L232_loopE; ALIGN_5 .L232_bodyB: //#### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec15, xvec15; MUL_DX xvec0, xvec5, xvec5; ADD_DX xvec5, xvec11, xvec11; //#### Unroll time 2 #### LD_DX 2*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec15, xvec15; ADDQ $4*SIZE, ptrba; MUL_DX xvec0, xvec5, xvec5; ADD_DX xvec5, xvec11, xvec11; ADDQ $4*SIZE, ptrbb; .L232_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L233_loopE; ALIGN_5 .L233_bodyB: //#### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; MUL_DX xvec0, xvec4, xvec4; ADD_DX xvec4, xvec15, xvec15; ADDQ $2*SIZE, ptrba; MUL_DX xvec0, xvec5, xvec5; ADD_DX xvec5, xvec11, xvec11; ADDQ $2*SIZE, ptrbb; .L233_loopE: //#### Multiply Alpha #### BROAD_DX MEMALPHA, xvec7; MUL_DX xvec7, xvec15, xvec15; MUL_DX xvec7, xvec11, xvec11; //#### Reverse #### MOV_DX xvec15, xvec6; REVS_DX xvec11, xvec15, xvec15; REVS_DX xvec6, xvec11, xvec11; //#### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L233_loopEx; ALIGN_5 //#### Writing Back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec11, xvec11; ADD_DX 0*SIZE(C1), xvec15, xvec15; #endif ST_DX xvec11, 0*SIZE(C0); ST_DX xvec15, 0*SIZE(C1); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk; #endif ADDQ $2*SIZE, C0; ADDQ $2*SIZE, C1; JMP .L23_loopE; ALIGN_5 .L233_loopEx:; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0, xvec0; LDH_DX 1*SIZE(C0), xvec0, xvec0; ADD_DX xvec0, xvec11, xvec11; #endif STL_DX xvec11, 0*SIZE(C0); STH_DX xvec11, 1*SIZE(C0); #ifndef TRMMKERNEL LDL_DX 0*SIZE(C1), xvec4, xvec4; LDH_DX 1*SIZE(C1), xvec4, xvec4; ADD_DX xvec4, xvec15, xvec15; #endif STL_DX xvec15, 0*SIZE(C1); STH_DX xvec15, 1*SIZE(C1); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk; #endif ADDQ $2*SIZE, C0; ADDQ $2*SIZE, C1; .L23_loopE: TEST $1, bm; // Rm = 1 JLE .L24_loopE; ALIGN_5; .L24_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif XOR_DY yvec15, yvec15, yvec15; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; #else ADDQ $2, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L241_loopE; ALIGN_5 .L241_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; MUL_DX xvec0, xvec2, xvec2; ADD_DX xvec2, xvec15, xvec15; BROAD_DX 1*SIZE(ptrba), xvec1; LD_DX 2*SIZE(ptrbb), xvec3; MUL_DX xvec1, xvec3, xvec3; ADD_DX xvec3, xvec15, xvec15; BROAD_DX 2*SIZE(ptrba), xvec0; LD_DX 4*SIZE(ptrbb), xvec2; MUL_DX xvec0, xvec2, xvec2; ADD_DX xvec2, xvec15, xvec15; 
BROAD_DX 3*SIZE(ptrba), xvec1; LD_DX 6*SIZE(ptrbb), xvec3; MUL_DX xvec1, xvec3, xvec3; ADD_DX xvec3, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L241_bodyB; ALIGN_5 .L241_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L242_loopE; ALIGN_5 .L242_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; MUL_DX xvec0, xvec2, xvec2; ADD_DX xvec2, xvec15, xvec15; BROAD_DX 1*SIZE(ptrba), xvec1; LD_DX 2*SIZE(ptrbb), xvec3; MUL_DX xvec1, xvec3, xvec3; ADD_DX xvec3, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L242_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L243_loopE; ALIGN_5 .L243_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; MUL_DX xvec0, xvec2, xvec2; ADD_DX xvec2, xvec15, xvec15; ADDQ $1*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L243_loopE: BROAD_DX MEMALPHA, xvec7; MUL_DX xvec7, xvec15, xvec15; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0, xvec0; LDH_DX 0*SIZE(C1), xvec0, xvec0; ADD_DX xvec0, xvec15, xvec15; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 0*SIZE(C1); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $1, kk; #endif ADDQ $1*SIZE, C0; ADDQ $1*SIZE, C1; .L24_loopE: #if defined(TRMMKERNEL) && !defined(LEFT) ADDQ $2, kk; #endif MOVQ bk, k; SALQ $4, k; ADDQ k, bb; LEAQ (C, ldc, 2), C; .L20_loopE:; TEST $1, bn; // Rn = 1 JLE .L30_loopE; ALIGN_5 .L30_bodyB: #if defined(TRMMKERNEL)&&defined(LEFT) MOVQ OFFSET, %rax; MOVQ %rax, kk; #endif MOVQ C, C0; MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L31_loopE; ALIGN_5 .L31_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 8), ptrba; ADDQ %rax, ptrbb; #endif //#### Initial Results Register #### XOR_DY yvec15, yvec15, yvec15; XOR_DY yvec14, yvec14, yvec14; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; #else ADDQ $1, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L311_loopE; ALIGN_5 .L311_bodyB: //#### Unroll time 1 #### LD_DY 0*SIZE(ptrba), yvec0; LD_DY 4*SIZE(ptrba), yvec1; BROAD_DY 0*SIZE(ptrbb), yvec2; MUL_DY yvec2, yvec0, yvec0; ADD_DY yvec0, yvec15, yvec15; MUL_DY yvec2, yvec1, yvec1; ADD_DY yvec1, yvec14, yvec14; //#### Unroll time 2 #### LD_DY 8*SIZE(ptrba), yvec3; LD_DY 12*SIZE(ptrba), yvec4; BROAD_DY 1*SIZE(ptrbb), yvec5; MUL_DY yvec5, yvec3, yvec3; ADD_DY yvec3, yvec15, yvec15; MUL_DY yvec5, yvec4, yvec4 ADD_DY yvec4, yvec14, yvec14; //#### Unroll time 3 #### LD_DY 16*SIZE(ptrba), yvec0; LD_DY 20*SIZE(ptrba), yvec1; BROAD_DY 2*SIZE(ptrbb), yvec2; MUL_DY yvec2, yvec0, yvec0; ADD_DY yvec0, yvec15, yvec15; MUL_DY yvec2, yvec1, yvec1; ADD_DY yvec1, yvec14, yvec14; //#### Unroll time 2 #### LD_DY 24*SIZE(ptrba), yvec3; LD_DY 28*SIZE(ptrba), yvec4; BROAD_DY 3*SIZE(ptrbb), yvec5; MUL_DY yvec5, yvec3, yvec3; ADD_DY yvec3, yvec15, yvec15; ADDQ $32*SIZE, ptrba; MUL_DY yvec5, yvec4, yvec4; ADD_DY yvec4, yvec14, yvec14; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L311_bodyB; ALIGN_5 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else MOVQ kkk, %rax; 
TEST $2, %rax; #endif JLE .L312_loopE; ALIGN_5 .L312_bodyB: //#### Unroll time 1 #### LD_DY 0*SIZE(ptrba), yvec0; LD_DY 4*SIZE(ptrba), yvec1; BROAD_DY 0*SIZE(ptrbb), yvec2; MUL_DY yvec2, yvec0, yvec0; ADD_DY yvec0, yvec15, yvec15; MUL_DY yvec2, yvec1, yvec1; ADD_DY yvec1, yvec14, yvec14; //#### Unroll time 2 #### LD_DY 8*SIZE(ptrba), yvec3; LD_DY 12*SIZE(ptrba), yvec4; BROAD_DY 1*SIZE(ptrbb), yvec5; MUL_DY yvec5, yvec3, yvec3; ADD_DY yvec3, yvec15, yvec15; ADDQ $16*SIZE, ptrba; MUL_DY yvec5, yvec4, yvec4 ADD_DY yvec4, yvec14, yvec14; ADDQ $2*SIZE, ptrbb; .L312_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L313_loopE; ALIGN_5 .L313_bodyB: //#### Unroll time 1 #### LD_DY 0*SIZE(ptrba), yvec0; LD_DY 4*SIZE(ptrba), yvec1; BROAD_DY 0*SIZE(ptrbb), yvec2; MUL_DY yvec2, yvec0, yvec0; ADD_DY yvec0, yvec15, yvec15; ADDQ $8*SIZE, ptrba; MUL_DY yvec2, yvec1, yvec1; ADD_DY yvec1, yvec14, yvec14; ADDQ $1*SIZE, ptrbb; .L313_loopE: //#### Multiply Alpha #### BROAD_DY MEMALPHA, yvec7; MUL_DY yvec7, yvec15, yvec15; MUL_DY yvec7, yvec14, yvec14; //#### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L313_loopEx; ALIGN_5 //#### Writing Back #### EXTRA_DY $1, yvec15, xvec13; EXTRA_DY $1, yvec14, xvec12; #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec15, xvec15; ADD_DX 2*SIZE(C0), xvec13, xvec13; ADD_DX 4*SIZE(C0), xvec14, xvec14; ADD_DX 6*SIZE(C0), xvec12, xvec12; #endif ST_DX xvec15, 0*SIZE(C0); ST_DX xvec13, 2*SIZE(C0); ST_DX xvec14, 4*SIZE(C0); ST_DX xvec12, 6*SIZE(C0); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 8), ptrba; ADDQ %rax, ptrbb; #endif #if defined(TRMMKERNEL)&&defined(LEFT) ADDQ $8, kk; #endif ADDQ $8*SIZE, C0; DECQ i; JG .L31_bodyB; JMP .L31_loopE; ALIGN_5 .L313_loopEx: EXTRA_DY $1, yvec15, xvec13; EXTRA_DY $1, yvec14, xvec12; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec11, xvec11; LDH_DX 1*SIZE(C0), xvec11, xvec11; LDL_DX 2*SIZE(C0), xvec10, xvec10; LDH_DX 3*SIZE(C0), xvec10, xvec10; LDL_DX 4*SIZE(C0), xvec9, xvec9; LDH_DX 5*SIZE(C0), xvec9, xvec9; LDL_DX 6*SIZE(C0), xvec8, xvec8; LDH_DX 7*SIZE(C0), xvec8, xvec8; ADD_DX xvec11, xvec15, xvec15; ADD_DX xvec10, xvec13, xvec13; ADD_DX xvec9, xvec14, xvec14; ADD_DX xvec8, xvec12, xvec12; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); STL_DX xvec13, 2*SIZE(C0); STH_DX xvec13, 3*SIZE(C0); STL_DX xvec14, 4*SIZE(C0); STH_DX xvec14, 5*SIZE(C0); STL_DX xvec12, 6*SIZE(C0); STH_DX xvec12, 7*SIZE(C0); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 8), ptrba; ADDQ %rax, ptrbb; #endif #if defined(TRMMKERNEL)&&defined(LEFT) ADDQ $8, kk; #endif ADDQ $8*SIZE, C0; DECQ i; JG .L31_bodyB; .L31_loopE: TEST $4, bm JLE .L32_loopE; ALIGN_5 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 4), ptrba; ADDQ %rax, ptrbb; #endif //#### Initial Results Register #### XOR_DY yvec15, yvec15, yvec15; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; #else ADDQ $1, %rax; 
#endif MOVQ %rax, kkk #endif SARQ $2, k; JLE .L321_loopE; ALIGN_5 .L321_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec1; MUL_DY yvec0, yvec1, yvec1; ADD_DY yvec1, yvec15, yvec15; LD_DY 4*SIZE(ptrba), yvec2; BROAD_DY 1*SIZE(ptrbb), yvec3; MUL_DY yvec2, yvec3, yvec3; ADD_DY yvec3, yvec15, yvec15; LD_DY 8*SIZE(ptrba), yvec4; BROAD_DY 2*SIZE(ptrbb), yvec5; MUL_DY yvec4, yvec5, yvec5; ADD_DY yvec5, yvec15, yvec15; LD_DY 12*SIZE(ptrba), yvec6; BROAD_DY 3*SIZE(ptrbb), yvec7; MUL_DY yvec6, yvec7, yvec7; ADD_DY yvec7, yvec15, yvec15; ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L321_bodyB; ALIGN_5 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L322_loopE; ALIGN_5 .L322_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec1; MUL_DY yvec0, yvec1, yvec1; ADD_DY yvec1, yvec15, yvec15; LD_DY 4*SIZE(ptrba), yvec2; BROAD_DY 1*SIZE(ptrbb), yvec3; MUL_DY yvec2, yvec3, yvec3; ADD_DY yvec3, yvec15, yvec15; ADDQ $8*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L322_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L323_loopE; ALIGN_5 .L323_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec1; MUL_DY yvec0, yvec1, yvec1; ADD_DY yvec1, yvec15, yvec15; ADDQ $4*SIZE, ptrba; ADDQ $1*SIZE, ptrbb; .L323_loopE: //#### Multiply Alpha #### BROAD_DY MEMALPHA, yvec7; MUL_DY yvec7, yvec15, yvec15; //#### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L323_loopEx; ALIGN_5 //#### Writing Back #### EXTRA_DY $1, yvec15, xvec14; #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec15, xvec15; ADD_DX 2*SIZE(C0), xvec14, xvec14; #endif ST_DX xvec15, 0*SIZE(C0); ST_DX xvec14, 2*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 4), ptrba; ADDQ %rax, ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk #endif ADDQ $4*SIZE, C0; JMP .L32_loopE; ALIGN_5 .L323_loopEx: //#### Writing Back #### EXTRA_DY $1, yvec15, xvec14; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec13, xvec13; LDH_DX 1*SIZE(C0), xvec13, xvec13; LDL_DX 2*SIZE(C0), xvec12, xvec12; LDH_DX 3*SIZE(C0), xvec12, xvec12; ADD_DX xvec13, xvec15, xvec15; ADD_DX xvec12, xvec14, xvec14; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); STL_DX xvec14, 2*SIZE(C0); STH_DX xvec14, 3*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 4), ptrba; ADDQ %rax, ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk #endif ADDQ $4*SIZE, C0; .L32_loopE: TEST $2, bm JLE .L33_loopE; ALIGN_5 .L33_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax LEAQ (, %rax, SIZE), %rax LEAQ (ptrba, %rax, 2), ptrba ADDQ %rax, ptrbb; #endif //#### Initial Result #### XOR_DY yvec15, yvec15, yvec15; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; #else ADDQ $1, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L331_loopE; ALIGN_5 .L331_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; MUL_DX xvec0, xvec2, xvec2; ADD_DX 
xvec2, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; BROAD_DX 1*SIZE(ptrbb), xvec3; MUL_DX xvec1, xvec3, xvec3; ADD_DX xvec3, xvec15, xvec15; LD_DX 4*SIZE(ptrba), xvec4; BROAD_DX 2*SIZE(ptrbb), xvec5; MUL_DX xvec4, xvec5, xvec5; ADD_DX xvec5, xvec15, xvec15; LD_DX 6*SIZE(ptrba), xvec6; BROAD_DX 3*SIZE(ptrbb), xvec7; MUL_DX xvec6, xvec7, xvec7; ADD_DX xvec7, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L331_bodyB; ALIGN_5 .L331_loopE: #ifndef TRMMKERNEL TEST $2,bk; #else MOVQ kkk, %rax; TEST $2, %rax #endif JLE .L332_loopE; ALIGN_5 .L332_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; MUL_DX xvec0, xvec2, xvec2; ADD_DX xvec2, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; BROAD_DX 1*SIZE(ptrbb), xvec3; MUL_DX xvec1, xvec3, xvec3; ADD_DX xvec3, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L332_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L333_loopE; ALIGN_5 .L333_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; MUL_DX xvec0, xvec2, xvec2; ADD_DX xvec2, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $1*SIZE, ptrbb; .L333_loopE: //#### Multiply Alpha #### BROAD_DX MEMALPHA, xvec7; MUL_DX xvec7, xvec15, xvec15; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec14, xvec14; LDH_DX 1*SIZE(C0), xvec14, xvec14; ADD_DX xvec14, xvec15, xvec15; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 2), ptrba; ADDQ %rax, ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, kk #endif ADDQ $2*SIZE, C0; .L33_loopE: TEST $1, bm JLE .L34_loopE; ALIGN_5 .L34_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; ADDQ %rax, ptrba; ADDQ %rax, ptrbb; #endif XOR_DY yvec15, yvec15, yvec15; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT)&& !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; #else ADDQ $1, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L341_loopE; ALIGN_5 .L341_bodyB: vmovsd 0*SIZE(ptrba), xvec0; vmovsd 0*SIZE(ptrbb), xvec1; vmulsd xvec0, xvec1, xvec1; vaddsd xvec1, xvec15, xvec15; vmovsd 1*SIZE(ptrba), xvec0; vmovsd 1*SIZE(ptrbb), xvec1; vmulsd xvec0, xvec1, xvec1; vaddsd xvec1, xvec15, xvec15; vmovsd 2*SIZE(ptrba), xvec0; vmovsd 2*SIZE(ptrbb), xvec1; vmulsd xvec0, xvec1, xvec1; vaddsd xvec1, xvec15, xvec15; vmovsd 3*SIZE(ptrba), xvec0; vmovsd 3*SIZE(ptrbb), xvec1; vmulsd xvec0, xvec1, xvec1; vaddsd xvec1, xvec15, xvec15; addq $4*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; JG .L341_bodyB; ALIGN_5 .L341_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L342_loopE; ALIGN_5 .L342_bodyB: vmovsd 0*SIZE(ptrba), xvec0; vmovsd 0*SIZE(ptrbb), xvec1; vmulsd xvec0, xvec1, xvec1; vaddsd xvec1, xvec15, xvec15; vmovsd 1*SIZE(ptrba), xvec0; vmovsd 1*SIZE(ptrbb), xvec1; vmulsd xvec0, xvec1, xvec1; vaddsd xvec1, xvec15, xvec15; addq $2*SIZE, ptrba; addq $2*SIZE, ptrbb; .L342_loopE: #ifndef TRMMKERNEL TEST $1, bk #else MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L343_loopE; ALIGN_5 .L343_bodyB: vmovsd 0*SIZE(ptrba), xvec0; vmovsd 0*SIZE(ptrbb), xvec1; vmulsd xvec0, xvec1, xvec1; vaddsd 
xvec1, xvec15, xvec15; addq $1*SIZE, ptrba; addq $1*SIZE, ptrbb; .L343_loopE: //#### Writing Back #### vmovsd MEMALPHA, xvec7; vmulsd xvec7, xvec15, xvec15; #ifndef TRMMKERNEL vmovsd 0*SIZE(C0), xvec0; vaddsd xvec0, xvec15, xvec15; #endif movsd xvec15, 0*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; ADDQ %rax, ptrba; ADDQ %rax, ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, kk #endif addq $1*SIZE, C0; .L34_loopE: MOVQ bk, k SALQ $3, k; ADDQ k, bb; LEAQ (C, ldc, 1), C; .L30_loopE: movq 0(%rsp), %rbx; movq 8(%rsp), %rbp; movq 16(%rsp), %r12; movq 24(%rsp), %r13; movq 32(%rsp), %r14; movq 40(%rsp), %r15; vzeroupper #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp; ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/dgemm_kernel_6x4_piledriver.S000066400000000000000000001444031313527062700230730ustar00rootroot00000000000000/**************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ // register blocking= 6x4. unloop k = 4. // Use FMA3 on piledriver. // Todo: 1) deal with the edge. 2) Add windows abi. 
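// A rough sketch of what this micro-kernel computes, assuming the usual OpenBLAS packing
// (A repacked into 6-wide row panels, B into 4-wide column panels): for each 6x4 tile of C,
// the 24 result doubles are kept in XMM0-XMM11 (two doubles each: rows 0-1, 2-3, 4-5 of
// columns 0-3). Each k step loads six packed A values into XMM12-XMM14, broadcasts one
// packed B value at a time into XMM15 and accumulates with vfmadd231pd (the VMA1282 macro),
// roughly:
//
//   for (int i = 0; i < 6; i++)          /* rows of the tile    */
//     for (int j = 0; j < 4; j++)        /* columns of the tile */
//       acc[i][j] += a[6*l + i] * b[4*l + j];
//
// At the end each accumulator is merged into memory as C[i + j*ldc] += alpha * acc[i][j]
// (the VLDU/VMA21282/VSTU sequence after ._L_4_loopE).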
#define ASSEMBLER #include "common.h" #define STACKSIZE 128 #define oldbk_i %rdi #define oldbk_j %rsi #define oldbk_l %rdx #define _bk_i %r13 #define _bk_j %r14 #define _bk_l %r15 #define ALPHA %xmm0 #define _ptr_A %rcx #define _ptr_B %r8 #define _ptr_C %r9 #define LDC %r10 #define i %r11 #define k %rax #define _pre_B %r12 #define _ptr__A_0 %rdi #define _ptr__B_0 %rsi #define _ptr__C_0 %rbx #define _ptr__C_1 %rbp #define old_ldc 8+STACKSIZE(%rsp) #define alpha 48(%rsp) #define j 56(%rsp) #define MOVQ2560(s,d) movq s,d #define LEAQ2560(s,d) leaq s,d #define SARQ2560(imm,n) sarq imm,n #define ADDQ2560(off,addr) addq off,addr #define SUBQ2560(off,addr) subq off,addr #define DIVQ2560(off,addr) divq off,addr #define MULQ2560(s,d) mulq s,d #define DECQ2560(addr) decq addr #define NEGQ2560(s) negq s #define TESTQ2560(n,addr) testq n,addr #define SALQ2560(imm,n) salq imm,n #define MOVQ1280(s,d) movq s,d #define LEAQ1280(s,d) leaq s,d #define SARQ1280(imm,n) sarq imm,n #define ADDQ1280(off,addr) addq off,addr #define SUBQ1280(off,addr) subq off,addr #define DIVQ1280(off,addr) divq off,addr #define CMPQ1280(off,addr) cmpq off,addr #define MULQ1280(s,d) mulq s,d #define DECQ1280(addr) decq addr #define NEGQ1280(s) negq s #define TESTQ1280(n,addr) testq n,addr #define SALQ1280(imm,n) salq imm,n #define JG jg #define JLE jle #define VLD2560(addr,reg) vmovapd addr,reg #define VST2560(reg,addr) vmovapd reg,addr #define VMUL2560(a,b,c) vmulpd a,b,c #define MVMUL2560(a,b,c) vmulpd b,a,c #define VADD2560(a,b,c) vaddpd a,b,c #define MVADD2560(a,b,c) vaddpd b,a,c #define VSHUF2560(imm,s,d) vpermilpd imm,s,d #define VSHUF2F2560(imm,s1,s2,d) vperm2f128 imm,s1,s2,d #define BROAD2560(addr,reg) vbroadcastsd addr,reg #define MOVRR2560(a,b) vmovapd a,b #define REVS2560(imm,s1,s2,d) vshufpd imm,s1,s2,d #define EXTR2561(imm,a,b) vextractf128 imm,a,b #define LDL2561(addr,reg) vmovlpd addr,reg,reg #define LDH2561(addr,reg) vmovhpd addr,reg,reg #define STL2561(reg,addr) vmovlpd reg,addr #define STH2561(reg,addr) vmovhpd reg,addr #define VADD2561(a,b,c) vaddpd a,b,c #define VXOR2560(a,b,c) vxorpd a,b,c #define PREFETCH02560(addr,b) prefetcht0 addr #define PREFETCH12560(addr,b) prefetcht0 addr #define PREFETCH22560(addr,b) prefetcht2 addr #define PREFETCHW2560(addr,b) prefetchw addr #define PREFETCHN2560(addr,b) prefetchnta addr #define VMA2560(a,b,c,d) vfmaddpd d,a,b,c #define MVMA2560(a,b,c,d) vfmaddpd d,a,b,c #define VLD1280(addr,reg) vmovapd addr,reg #define VLD1282(addr,reg) vmovapd addr,reg #define VLD1281(addr,reg) movsd addr,reg #define VST1280(reg,addr) vmovapd reg,addr #define VST1282(reg,addr) vmovapd reg,addr #define VST1281(reg,addr) movsd reg,addr #define VLDU1282(addr,reg) vmovupd addr,reg #define VLDU1281(addr,reg) movsd addr,reg #define VSTU1282(reg,addr) vmovupd reg,addr #define VSTU1281(reg,addr) movsd reg,addr #define VMUL1280(a,b,c) vmulpd a,b,c #define VMUL1282(a,b,c) vmulpd a,b,c #define VMUL1281(a,b,c) vmulpd a,b,c #define MVMUL1280(a,b,c) vmulpd b,a,c #define VADD1280(a,b,c) vaddpd a,b,c #define MVADD1280(a,b,c) vaddpd b,a,c #define VSHUF1280(imm,s,d) vpermilpd imm,s,d #define VSHUF2F1280(imm,s1,s2,d) vperm2f128 imm,s1,s2,d #define BROAD1280(addr,reg) vmovddup addr,reg #define BROAD1282(addr,reg) vmovddup addr,reg #define BROAD1281(addr,reg) movddup addr,reg #define MOVRR1280(a,b) vmovapd a,b #define REVS1280(imm,s1,s2,d) vshufpd imm,s1,s2,d #define EXTR1281(imm,a,b) vextractf128 imm,a,b #define LDL1281(addr,reg) vmovlpd addr,reg,reg #define LDH1281(addr,reg) vmovhpd addr,reg,reg #define 
STL1281(reg,addr) vmovlpd reg,addr #define STH1281(reg,addr) vmovhpd reg,addr #define VADD1281(a,b,c) vaddpd a,b,c #define VXOR1280(a,b,c) vxorpd a,b,c #define VXOR1282(a,b,c) vxorpd a,b,c #define VXOR1281(a,b,c) vxorpd a,b,c #define PREFETCH01280(addr,b) prefetcht0 addr #define PREFETCH11280(addr,b) prefetcht0 addr #define PREFETCH21280(addr,b) prefetcht2 addr #define PREFETCHW1280(addr,b) prefetchw addr #define PREFETCHN1280(addr,b) prefetchnta addr #define VMA1280(a,b,c,d) vfmaddpd d,a,b,c #define VMA1282(a,b,c,d) vfmadd231pd a,b,c #define VMA1281(a,b,c,d) vfmadd231pd a,b,c #define VMA21282(a,b,c,d) vfmadd231pd a,b,c #define VMA21281(a,b,c,d) vfmadd231pd a,b,c //#define VMA1282(a,b,c,d) nop //#define VMA1281(a,b,c,d) nop //#define VMA21282(a,b,c,d) nop //#define VMA21281(a,b,c,d) nop #define MVMA1280(a,b,c,d) vfmaddpd d,a,b,c #define imm1 $0x05 #define imm3 $0x05 #define imm100 $0x05 #define imm200 $0x0a #define XMM0 %xmm0 #define XMM1 %xmm1 #define XMM2 %xmm2 #define XMM3 %xmm3 #define XMM4 %xmm4 #define XMM5 %xmm5 #define XMM6 %xmm6 #define XMM7 %xmm7 #define XMM8 %xmm8 #define XMM9 %xmm9 #define XMM10 %xmm10 #define XMM11 %xmm11 #define XMM12 %xmm12 #define XMM13 %xmm13 #define XMM14 %xmm14 #define XMM15 %xmm15 #define YMM0 %ymm0 #define YMM1 %ymm1 #define YMM2 %ymm2 #define YMM3 %ymm3 #define YMM4 %ymm4 #define YMM5 %ymm5 #define YMM6 %ymm6 #define YMM7 %ymm7 #define YMM8 %ymm8 #define YMM9 %ymm9 #define YMM10 %ymm10 #define YMM11 %ymm11 #define YMM12 %ymm12 #define YMM13 %ymm13 #define YMM14 %ymm14 #define YMM15 %ymm15 PROLOGUE subq $STACKSIZE, %rsp; movq %rbx, 0(%rsp); movq %rbp, 8(%rsp); movq %r12, 16(%rsp); movq %r13, 24(%rsp); movq %r14, 32(%rsp); movq %r15, 40(%rsp); vzeroupper movl old_ldc, %eax movq %rax, LDC movlps ALPHA, alpha movq oldbk_i, _bk_i movq oldbk_j, _bk_j movq oldbk_l, _bk_l leaq (, LDC, SIZE), LDC MOVQ1280(_bk_j,j); SARQ1280($2,j); JLE ._L_0_loopE; ALIGN_4; ._L_0_bodyB:; MOVQ1280(_ptr_A,_ptr__A_0); MOVQ1280(_ptr_C,_ptr__C_0); LEAQ1280((_ptr_C,LDC,2),_ptr__C_1); MOVQ1280(_bk_l,%rax); SALQ1280($5,%rax); ADDQ1280(%rax,_pre_B); MOVQ1280(_bk_i,i); CMPQ1280($6,i); JL ._L_1_loopE; ._L_1_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1282(XMM0,XMM0,XMM0); VXOR1282(XMM1,XMM1,XMM1); VXOR1282(XMM2,XMM2,XMM2); VXOR1282(XMM3,XMM3,XMM3); VXOR1282(XMM4,XMM4,XMM4); VXOR1282(XMM5,XMM5,XMM5); VXOR1282(XMM6,XMM6,XMM6); VXOR1282(XMM7,XMM7,XMM7); VXOR1282(XMM8,XMM8,XMM8); VXOR1282(XMM9,XMM9,XMM9); VXOR1282(XMM10,XMM10,XMM10); VXOR1282(XMM11,XMM11,XMM11); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_2_loopE; ALIGN_4; ._L_2_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM6,XMM6); VMA1282(XMM13,XMM15,XMM7,XMM7); VMA1282(XMM14,XMM15,XMM8,XMM8); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM9,XMM9); VMA1282(XMM13,XMM15,XMM10,XMM10); VMA1282(XMM14,XMM15,XMM11,XMM11); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(4*SIZE(_ptr__B_0),XMM15); VLD1282(6*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); 
VLD1282(8*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(10*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(5*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(6*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM6,XMM6); VMA1282(XMM13,XMM15,XMM7,XMM7); VMA1282(XMM14,XMM15,XMM8,XMM8); BROAD1282(7*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM9,XMM9); VMA1282(XMM13,XMM15,XMM10,XMM10); VMA1282(XMM14,XMM15,XMM11,XMM11); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1282(8*SIZE(_ptr__B_0),XMM15); VLD1282(12*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(14*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(16*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(9*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(10*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM6,XMM6); VMA1282(XMM13,XMM15,XMM7,XMM7); VMA1282(XMM14,XMM15,XMM8,XMM8); BROAD1282(11*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM9,XMM9); VMA1282(XMM13,XMM15,XMM10,XMM10); VMA1282(XMM14,XMM15,XMM11,XMM11); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1282(12*SIZE(_ptr__B_0),XMM15); VLD1282(18*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(20*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(22*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(13*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(14*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM6,XMM6); VMA1282(XMM13,XMM15,XMM7,XMM7); VMA1282(XMM14,XMM15,XMM8,XMM8); BROAD1282(15*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM9,XMM9); VMA1282(XMM13,XMM15,XMM10,XMM10); VMA1282(XMM14,XMM15,XMM11,XMM11); ADDQ1280($24*SIZE,_ptr__A_0); ADDQ1280($16*SIZE,_ptr__B_0); ._L_2_bodyE:; DECQ1280(k); JG ._L_2_bodyB; ALIGN_4; ._L_2_loopE:; TESTQ1280($2,_bk_l); JLE ._L_3_loopE; ALIGN_4; ._L_3_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM6,XMM6); VMA1282(XMM13,XMM15,XMM7,XMM7); VMA1282(XMM14,XMM15,XMM8,XMM8); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM9,XMM9); VMA1282(XMM13,XMM15,XMM10,XMM10); VMA1282(XMM14,XMM15,XMM11,XMM11); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(4*SIZE(_ptr__B_0),XMM15); VLD1282(6*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(8*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(10*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(5*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(6*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM6,XMM6); VMA1282(XMM13,XMM15,XMM7,XMM7); VMA1282(XMM14,XMM15,XMM8,XMM8); BROAD1282(7*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM9,XMM9); VMA1282(XMM13,XMM15,XMM10,XMM10); VMA1282(XMM14,XMM15,XMM11,XMM11); ADDQ1280($12*SIZE,_ptr__A_0); ADDQ1280($8*SIZE,_ptr__B_0); ._L_3_loopE:; TESTQ1280($1,_bk_l); JLE ._L_4_loopE; ALIGN_4; ._L_4_bodyB:; 
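// _bk_l & 1: final k iteration for the 6-row tile (six A values in XMM12-XMM14, the four
// B values broadcast one at a time through XMM15).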
PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM6,XMM6); VMA1282(XMM13,XMM15,XMM7,XMM7); VMA1282(XMM14,XMM15,XMM8,XMM8); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM9,XMM9); VMA1282(XMM13,XMM15,XMM10,XMM10); VMA1282(XMM14,XMM15,XMM11,XMM11); ADDQ1280($6*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_4_loopE:; BROAD1282(alpha,XMM12); VLDU1282(0*SIZE(_ptr__C_0),XMM13); VMA21282(XMM12,XMM0,XMM13,XMM0); VSTU1282(XMM13,0*SIZE(_ptr__C_0)); VLDU1282(2*SIZE(_ptr__C_0),XMM14); VMA21282(XMM12,XMM1,XMM14,XMM1); VSTU1282(XMM14,2*SIZE(_ptr__C_0)); VLDU1282(4*SIZE(_ptr__C_0),XMM15); VMA21282(XMM12,XMM2,XMM15,XMM2); VSTU1282(XMM15,4*SIZE(_ptr__C_0)); VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM13); VMA21282(XMM12,XMM3,XMM13,XMM3); VSTU1282(XMM13,0*SIZE(_ptr__C_0,LDC,1)); VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM14); VMA21282(XMM12,XMM4,XMM14,XMM4); VSTU1282(XMM14,2*SIZE(_ptr__C_0,LDC,1)); VLDU1282(4*SIZE(_ptr__C_0,LDC,1),XMM15); VMA21282(XMM12,XMM5,XMM15,XMM5); VSTU1282(XMM15,4*SIZE(_ptr__C_0,LDC,1)); VLDU1282(0*SIZE(_ptr__C_1),XMM13); VMA21282(XMM12,XMM6,XMM13,XMM6); VSTU1282(XMM13,0*SIZE(_ptr__C_1)); VLDU1282(2*SIZE(_ptr__C_1),XMM14); VMA21282(XMM12,XMM7,XMM14,XMM7); VSTU1282(XMM14,2*SIZE(_ptr__C_1)); VLDU1282(4*SIZE(_ptr__C_1),XMM15); VMA21282(XMM12,XMM8,XMM15,XMM8); VSTU1282(XMM15,4*SIZE(_ptr__C_1)); VLDU1282(0*SIZE(_ptr__C_1,LDC,1),XMM13); VMA21282(XMM12,XMM9,XMM13,XMM9); VSTU1282(XMM13,0*SIZE(_ptr__C_1,LDC,1)); VLDU1282(2*SIZE(_ptr__C_1,LDC,1),XMM14); VMA21282(XMM12,XMM10,XMM14,XMM10); VSTU1282(XMM14,2*SIZE(_ptr__C_1,LDC,1)); VLDU1282(4*SIZE(_ptr__C_1,LDC,1),XMM15); VMA21282(XMM12,XMM11,XMM15,XMM11); VSTU1282(XMM15,4*SIZE(_ptr__C_1,LDC,1)); ADDQ1280($6*SIZE,_ptr__C_0); ADDQ1280($6*SIZE,_ptr__C_1); ._L_1_bodyE:; SUBQ1280($6,i); JG ._L_1_bodyB; ALIGN_4; ._L_1_loopE:; TESTQ1280($4,i); JLE ._L_5_loopE; ALIGN_4; ._L_5_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1282(XMM0,XMM0,XMM0); VXOR1282(XMM1,XMM1,XMM1); VXOR1282(XMM2,XMM2,XMM2); VXOR1282(XMM3,XMM3,XMM3); VXOR1282(XMM4,XMM4,XMM4); VXOR1282(XMM5,XMM5,XMM5); VXOR1282(XMM6,XMM6,XMM6); VXOR1282(XMM7,XMM7,XMM7); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_6_loopE; ALIGN_4; ._L_6_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM6,XMM6); VMA1282(XMM14,XMM15,XMM7,XMM7); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(4*SIZE(_ptr__B_0),XMM15); VLD1282(4*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(6*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(5*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); 
BROAD1282(6*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(7*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM6,XMM6); VMA1282(XMM14,XMM15,XMM7,XMM7); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1282(8*SIZE(_ptr__B_0),XMM15); VLD1282(8*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(10*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(9*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); BROAD1282(10*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(11*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM6,XMM6); VMA1282(XMM14,XMM15,XMM7,XMM7); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1282(12*SIZE(_ptr__B_0),XMM15); VLD1282(12*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(14*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(13*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); BROAD1282(14*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(15*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM6,XMM6); VMA1282(XMM14,XMM15,XMM7,XMM7); ADDQ1280($16*SIZE,_ptr__A_0); ADDQ1280($16*SIZE,_ptr__B_0); ._L_6_bodyE:; DECQ1280(k); JG ._L_6_bodyB; ALIGN_4; ._L_6_loopE:; TESTQ1280($2,_bk_l); JLE ._L_7_loopE; ALIGN_4; ._L_7_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM6,XMM6); VMA1282(XMM14,XMM15,XMM7,XMM7); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(4*SIZE(_ptr__B_0),XMM15); VLD1282(4*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(6*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(5*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); BROAD1282(6*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(7*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM6,XMM6); VMA1282(XMM14,XMM15,XMM7,XMM7); ADDQ1280($8*SIZE,_ptr__A_0); ADDQ1280($8*SIZE,_ptr__B_0); ._L_7_loopE:; TESTQ1280($1,_bk_l); JLE ._L_8_loopE; ALIGN_4; ._L_8_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM6,XMM6); VMA1282(XMM14,XMM15,XMM7,XMM7); ADDQ1280($4*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_8_loopE:; BROAD1282(alpha,XMM8); VLDU1282(0*SIZE(_ptr__C_0),XMM9); VMA21282(XMM8,XMM0,XMM9,XMM0); VSTU1282(XMM9,0*SIZE(_ptr__C_0)); VLDU1282(2*SIZE(_ptr__C_0),XMM10); VMA21282(XMM8,XMM1,XMM10,XMM1); VSTU1282(XMM10,2*SIZE(_ptr__C_0)); VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM11); VMA21282(XMM8,XMM2,XMM11,XMM2); VSTU1282(XMM11,0*SIZE(_ptr__C_0,LDC,1)); VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM12); VMA21282(XMM8,XMM3,XMM12,XMM3); 
VSTU1282(XMM12,2*SIZE(_ptr__C_0,LDC,1)); VLDU1282(0*SIZE(_ptr__C_1),XMM13); VMA21282(XMM8,XMM4,XMM13,XMM4); VSTU1282(XMM13,0*SIZE(_ptr__C_1)); VLDU1282(2*SIZE(_ptr__C_1),XMM14); VMA21282(XMM8,XMM5,XMM14,XMM5); VSTU1282(XMM14,2*SIZE(_ptr__C_1)); VLDU1282(0*SIZE(_ptr__C_1,LDC,1),XMM15); VMA21282(XMM8,XMM6,XMM15,XMM6); VSTU1282(XMM15,0*SIZE(_ptr__C_1,LDC,1)); VLDU1282(2*SIZE(_ptr__C_1,LDC,1),XMM9); VMA21282(XMM8,XMM7,XMM9,XMM7); VSTU1282(XMM9,2*SIZE(_ptr__C_1,LDC,1)); ADDQ1280($4*SIZE,_ptr__C_0); ADDQ1280($4*SIZE,_ptr__C_1); ._L_5_loopE:; TESTQ1280($2,i); JLE ._L_9_loopE; ALIGN_4; ._L_9_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1282(XMM0,XMM0,XMM0); VXOR1282(XMM1,XMM1,XMM1); VXOR1282(XMM2,XMM2,XMM2); VXOR1282(XMM3,XMM3,XMM3); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_10_loopE; ALIGN_4; ._L_10_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM3,XMM3); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(4*SIZE(_ptr__B_0),XMM15); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(5*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(6*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(7*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM3,XMM3); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1282(8*SIZE(_ptr__B_0),XMM15); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(9*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(10*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(11*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM3,XMM3); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1282(12*SIZE(_ptr__B_0),XMM15); VLD1282(6*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(13*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(14*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(15*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM3,XMM3); ADDQ1280($8*SIZE,_ptr__A_0); ADDQ1280($16*SIZE,_ptr__B_0); ._L_10_bodyE:; DECQ1280(k); JG ._L_10_bodyB; ALIGN_4; ._L_10_loopE:; TESTQ1280($2,_bk_l); JLE ._L_11_loopE; ALIGN_4; ._L_11_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM3,XMM3); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(4*SIZE(_ptr__B_0),XMM15); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(5*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(6*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(7*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM3,XMM3); ADDQ1280($4*SIZE,_ptr__A_0); ADDQ1280($8*SIZE,_ptr__B_0); ._L_11_loopE:; TESTQ1280($1,_bk_l); JLE ._L_12_loopE; ALIGN_4; ._L_12_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(1*SIZE(_ptr__B_0),XMM15); 
VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM3,XMM3); ADDQ1280($2*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_12_loopE:; BROAD1282(alpha,XMM4); VLDU1282(0*SIZE(_ptr__C_0),XMM5); VMA21282(XMM4,XMM0,XMM5,XMM0); VSTU1282(XMM5,0*SIZE(_ptr__C_0)); VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM6); VMA21282(XMM4,XMM1,XMM6,XMM1); VSTU1282(XMM6,0*SIZE(_ptr__C_0,LDC,1)); VLDU1282(0*SIZE(_ptr__C_1),XMM7); VMA21282(XMM4,XMM2,XMM7,XMM2); VSTU1282(XMM7,0*SIZE(_ptr__C_1)); VLDU1282(0*SIZE(_ptr__C_1,LDC,1),XMM8); VMA21282(XMM4,XMM3,XMM8,XMM3); VSTU1282(XMM8,0*SIZE(_ptr__C_1,LDC,1)); ADDQ1280($2*SIZE,_ptr__C_0); ADDQ1280($2*SIZE,_ptr__C_1); ._L_9_loopE:; TESTQ1280($1,i); JLE ._L_13_loopE; ALIGN_4; ._L_13_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1281(XMM0,XMM0,XMM0); VXOR1281(XMM1,XMM1,XMM1); VXOR1281(XMM2,XMM2,XMM2); VXOR1281(XMM3,XMM3,XMM3); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_14_loopE; ALIGN_4; ._L_14_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1281(0*SIZE(_ptr__B_0),XMM15); VLD1281(0*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(1*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); BROAD1281(2*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM2,XMM2); BROAD1281(3*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM3,XMM3); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1281(4*SIZE(_ptr__B_0),XMM15); VLD1281(1*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(5*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); BROAD1281(6*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM2,XMM2); BROAD1281(7*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM3,XMM3); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1281(8*SIZE(_ptr__B_0),XMM15); VLD1281(2*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(9*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); BROAD1281(10*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM2,XMM2); BROAD1281(11*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM3,XMM3); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1281(12*SIZE(_ptr__B_0),XMM15); VLD1281(3*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(13*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); BROAD1281(14*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM2,XMM2); BROAD1281(15*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM3,XMM3); ADDQ1280($4*SIZE,_ptr__A_0); ADDQ1280($16*SIZE,_ptr__B_0); ._L_14_bodyE:; DECQ1280(k); JG ._L_14_bodyB; ALIGN_4; ._L_14_loopE:; TESTQ1280($2,_bk_l); JLE ._L_15_loopE; ALIGN_4; ._L_15_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1281(0*SIZE(_ptr__B_0),XMM15); VLD1281(0*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(1*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); BROAD1281(2*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM2,XMM2); BROAD1281(3*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM3,XMM3); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1281(4*SIZE(_ptr__B_0),XMM15); VLD1281(1*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(5*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); BROAD1281(6*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM2,XMM2); BROAD1281(7*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM3,XMM3); ADDQ1280($2*SIZE,_ptr__A_0); ADDQ1280($8*SIZE,_ptr__B_0); ._L_15_loopE:; TESTQ1280($1,_bk_l); JLE ._L_16_loopE; 
ALIGN_4; ._L_16_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1281(0*SIZE(_ptr__B_0),XMM15); VLD1281(0*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(1*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); BROAD1281(2*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM2,XMM2); BROAD1281(3*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM3,XMM3); ADDQ1280($1*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_16_loopE:; BROAD1281(alpha,XMM4); VLDU1281(0*SIZE(_ptr__C_0),XMM5); VMA21281(XMM4,XMM0,XMM5,XMM0); VSTU1281(XMM5,0*SIZE(_ptr__C_0)); VLDU1281(0*SIZE(_ptr__C_0,LDC,1),XMM6); VMA21281(XMM4,XMM1,XMM6,XMM1); VSTU1281(XMM6,0*SIZE(_ptr__C_0,LDC,1)); VLDU1281(0*SIZE(_ptr__C_1),XMM7); VMA21281(XMM4,XMM2,XMM7,XMM2); VSTU1281(XMM7,0*SIZE(_ptr__C_1)); VLDU1281(0*SIZE(_ptr__C_1,LDC,1),XMM8); VMA21281(XMM4,XMM3,XMM8,XMM3); VSTU1281(XMM8,0*SIZE(_ptr__C_1,LDC,1)); ADDQ1280($1*SIZE,_ptr__C_0); ADDQ1280($1*SIZE,_ptr__C_1); ._L_13_loopE:; MOVQ1280(LDC,%rax); SALQ1280($2,%rax); ADDQ1280(%rax,_ptr_C); MOVQ1280(_bk_l,%rax); SALQ1280($5,%rax); ADDQ1280(%rax,_ptr_B); ._L_0_bodyE:; DECQ1280(j); JG ._L_0_bodyB; ALIGN_4; ._L_0_loopE:; TESTQ1280($2,_bk_j); JLE ._L_17_loopE; ALIGN_4; ._L_17_bodyB:; MOVQ1280(_ptr_A,_ptr__A_0); MOVQ1280(_ptr_C,_ptr__C_0); LEAQ1280((_ptr_C,LDC,1),_ptr__C_1); MOVQ1280(_bk_l,%rax); SALQ1280($4,%rax); ADDQ1280(%rax,_pre_B); MOVQ1280(_bk_i,i); CMPQ1280($6,i); JL ._L_18_loopE; ._L_18_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1282(XMM0,XMM0,XMM0); VXOR1282(XMM1,XMM1,XMM1); VXOR1282(XMM2,XMM2,XMM2); VXOR1282(XMM3,XMM3,XMM3); VXOR1282(XMM4,XMM4,XMM4); VXOR1282(XMM5,XMM5,XMM5); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_19_loopE; ALIGN_4; ._L_19_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VLD1282(6*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(8*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(10*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1282(4*SIZE(_ptr__B_0),XMM15); VLD1282(12*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(14*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(16*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(5*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1282(6*SIZE(_ptr__B_0),XMM15); VLD1282(18*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(20*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(22*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(7*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); ADDQ1280($24*SIZE,_ptr__A_0); ADDQ1280($8*SIZE,_ptr__B_0); ._L_19_bodyE:; 
DECQ1280(k); JG ._L_19_bodyB; ALIGN_4; ._L_19_loopE:; TESTQ1280($2,_bk_l); JLE ._L_20_loopE; ALIGN_4; ._L_20_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VLD1282(6*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(8*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(10*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); ADDQ1280($12*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_20_loopE:; TESTQ1280($1,_bk_l); JLE ._L_21_loopE; ALIGN_4; ._L_21_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); ADDQ1280($6*SIZE,_ptr__A_0); ADDQ1280($2*SIZE,_ptr__B_0); ._L_21_loopE:; BROAD1282(alpha,XMM6); VLDU1282(0*SIZE(_ptr__C_0),XMM7); VMA21282(XMM6,XMM0,XMM7,XMM0); VSTU1282(XMM7,0*SIZE(_ptr__C_0)); VLDU1282(2*SIZE(_ptr__C_0),XMM8); VMA21282(XMM6,XMM1,XMM8,XMM1); VSTU1282(XMM8,2*SIZE(_ptr__C_0)); VLDU1282(4*SIZE(_ptr__C_0),XMM9); VMA21282(XMM6,XMM2,XMM9,XMM2); VSTU1282(XMM9,4*SIZE(_ptr__C_0)); VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM10); VMA21282(XMM6,XMM3,XMM10,XMM3); VSTU1282(XMM10,0*SIZE(_ptr__C_0,LDC,1)); VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM11); VMA21282(XMM6,XMM4,XMM11,XMM4); VSTU1282(XMM11,2*SIZE(_ptr__C_0,LDC,1)); VLDU1282(4*SIZE(_ptr__C_0,LDC,1),XMM12); VMA21282(XMM6,XMM5,XMM12,XMM5); VSTU1282(XMM12,4*SIZE(_ptr__C_0,LDC,1)); ADDQ1280($6*SIZE,_ptr__C_0); ADDQ1280($6*SIZE,_ptr__C_1); ._L_18_bodyE:; SUBQ1280($6,i); JG ._L_18_bodyB; ALIGN_4; ._L_18_loopE:; TESTQ1280($4,i); JLE ._L_22_loopE; ALIGN_4; ._L_22_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1282(XMM0,XMM0,XMM0); VXOR1282(XMM1,XMM1,XMM1); VXOR1282(XMM2,XMM2,XMM2); VXOR1282(XMM3,XMM3,XMM3); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_23_loopE; ALIGN_4; ._L_23_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VLD1282(4*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(6*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1282(4*SIZE(_ptr__B_0),XMM15); VLD1282(8*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(10*SIZE(_ptr__A_0),XMM14); 
VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(5*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1282(6*SIZE(_ptr__B_0),XMM15); VLD1282(12*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(14*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(7*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); ADDQ1280($16*SIZE,_ptr__A_0); ADDQ1280($8*SIZE,_ptr__B_0); ._L_23_bodyE:; DECQ1280(k); JG ._L_23_bodyB; ALIGN_4; ._L_23_loopE:; TESTQ1280($2,_bk_l); JLE ._L_24_loopE; ALIGN_4; ._L_24_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VLD1282(4*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(6*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); ADDQ1280($8*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_24_loopE:; TESTQ1280($1,_bk_l); JLE ._L_25_loopE; ALIGN_4; ._L_25_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); ADDQ1280($4*SIZE,_ptr__A_0); ADDQ1280($2*SIZE,_ptr__B_0); ._L_25_loopE:; BROAD1282(alpha,XMM4); VLDU1282(0*SIZE(_ptr__C_0),XMM5); VMA21282(XMM4,XMM0,XMM5,XMM0); VSTU1282(XMM5,0*SIZE(_ptr__C_0)); VLDU1282(2*SIZE(_ptr__C_0),XMM6); VMA21282(XMM4,XMM1,XMM6,XMM1); VSTU1282(XMM6,2*SIZE(_ptr__C_0)); VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM7); VMA21282(XMM4,XMM2,XMM7,XMM2); VSTU1282(XMM7,0*SIZE(_ptr__C_0,LDC,1)); VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM8); VMA21282(XMM4,XMM3,XMM8,XMM3); VSTU1282(XMM8,2*SIZE(_ptr__C_0,LDC,1)); ADDQ1280($4*SIZE,_ptr__C_0); ADDQ1280($4*SIZE,_ptr__C_1); ._L_22_loopE:; TESTQ1280($2,i); JLE ._L_26_loopE; ALIGN_4; ._L_26_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1282(XMM0,XMM0,XMM0); VXOR1282(XMM1,XMM1,XMM1); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_27_loopE; ALIGN_4; ._L_27_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1282(4*SIZE(_ptr__B_0),XMM15); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(5*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1282(6*SIZE(_ptr__B_0),XMM15); VLD1282(6*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(7*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); ADDQ1280($8*SIZE,_ptr__A_0); ADDQ1280($8*SIZE,_ptr__B_0); ._L_27_bodyE:; 
DECQ1280(k); JG ._L_27_bodyB; ALIGN_4; ._L_27_loopE:; TESTQ1280($2,_bk_l); JLE ._L_28_loopE; ALIGN_4; ._L_28_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); ADDQ1280($4*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_28_loopE:; TESTQ1280($1,_bk_l); JLE ._L_29_loopE; ALIGN_4; ._L_29_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); ADDQ1280($2*SIZE,_ptr__A_0); ADDQ1280($2*SIZE,_ptr__B_0); ._L_29_loopE:; BROAD1282(alpha,XMM2); VLDU1282(0*SIZE(_ptr__C_0),XMM3); VMA21282(XMM2,XMM0,XMM3,XMM0); VSTU1282(XMM3,0*SIZE(_ptr__C_0)); VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM4); VMA21282(XMM2,XMM1,XMM4,XMM1); VSTU1282(XMM4,0*SIZE(_ptr__C_0,LDC,1)); ADDQ1280($2*SIZE,_ptr__C_0); ADDQ1280($2*SIZE,_ptr__C_1); ._L_26_loopE:; TESTQ1280($1,i); JLE ._L_30_loopE; ALIGN_4; ._L_30_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1281(XMM0,XMM0,XMM0); VXOR1281(XMM1,XMM1,XMM1); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_31_loopE; ALIGN_4; ._L_31_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1281(0*SIZE(_ptr__B_0),XMM15); VLD1281(0*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(1*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1281(2*SIZE(_ptr__B_0),XMM15); VLD1281(1*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(3*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1281(4*SIZE(_ptr__B_0),XMM15); VLD1281(2*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(5*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1281(6*SIZE(_ptr__B_0),XMM15); VLD1281(3*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(7*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); ADDQ1280($4*SIZE,_ptr__A_0); ADDQ1280($8*SIZE,_ptr__B_0); ._L_31_bodyE:; DECQ1280(k); JG ._L_31_bodyB; ALIGN_4; ._L_31_loopE:; TESTQ1280($2,_bk_l); JLE ._L_32_loopE; ALIGN_4; ._L_32_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1281(0*SIZE(_ptr__B_0),XMM15); VLD1281(0*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(1*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1281(2*SIZE(_ptr__B_0),XMM15); VLD1281(1*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(3*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); ADDQ1280($2*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_32_loopE:; TESTQ1280($1,_bk_l); JLE ._L_33_loopE; ALIGN_4; ._L_33_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1281(0*SIZE(_ptr__B_0),XMM15); VLD1281(0*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(1*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); ADDQ1280($1*SIZE,_ptr__A_0); ADDQ1280($2*SIZE,_ptr__B_0); ._L_33_loopE:; BROAD1281(alpha,XMM2); VLDU1281(0*SIZE(_ptr__C_0),XMM3); 
VMA21281(XMM2,XMM0,XMM3,XMM0); VSTU1281(XMM3,0*SIZE(_ptr__C_0)); VLDU1281(0*SIZE(_ptr__C_0,LDC,1),XMM4); VMA21281(XMM2,XMM1,XMM4,XMM1); VSTU1281(XMM4,0*SIZE(_ptr__C_0,LDC,1)); ADDQ1280($1*SIZE,_ptr__C_0); ADDQ1280($1*SIZE,_ptr__C_1); ._L_30_loopE:; MOVQ1280(LDC,%rax); SALQ1280($1,%rax); ADDQ1280(%rax,_ptr_C); MOVQ1280(_bk_l,%rax); SALQ1280($4,%rax); ADDQ1280(%rax,_ptr_B); ._L_17_loopE:; TESTQ1280($1,_bk_j); JLE ._L_34_loopE; ALIGN_4; ._L_34_bodyB:; MOVQ1280(_ptr_A,_ptr__A_0); MOVQ1280(_ptr_C,_ptr__C_0); MOVQ1280(_bk_l,%rax); SALQ1280($3,%rax); ADDQ1280(%rax,_pre_B); MOVQ1280(_bk_i,i); CMPQ1280($6,i); JL ._L_35_loopE; ._L_35_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1282(XMM0,XMM0,XMM0); VXOR1282(XMM1,XMM1,XMM1); VXOR1282(XMM2,XMM2,XMM2); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_36_loopE; ALIGN_4; ._L_36_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VLD1282(6*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(8*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(10*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VLD1282(12*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(14*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(16*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VLD1282(18*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(20*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(22*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); ADDQ1280($24*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_36_bodyE:; DECQ1280(k); JG ._L_36_bodyB; ALIGN_4; ._L_36_loopE:; TESTQ1280($2,_bk_l); JLE ._L_37_loopE; ALIGN_4; ._L_37_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VLD1282(6*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(8*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(10*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); ADDQ1280($12*SIZE,_ptr__A_0); ADDQ1280($2*SIZE,_ptr__B_0); ._L_37_loopE:; TESTQ1280($1,_bk_l); JLE ._L_38_loopE; ALIGN_4; ._L_38_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); ADDQ1280($6*SIZE,_ptr__A_0); ADDQ1280($1*SIZE,_ptr__B_0); ._L_38_loopE:; BROAD1282(alpha,XMM3); VLDU1282(0*SIZE(_ptr__C_0),XMM4); VMA21282(XMM3,XMM0,XMM4,XMM0); VSTU1282(XMM4,0*SIZE(_ptr__C_0)); VLDU1282(2*SIZE(_ptr__C_0),XMM5); VMA21282(XMM3,XMM1,XMM5,XMM1); VSTU1282(XMM5,2*SIZE(_ptr__C_0)); VLDU1282(4*SIZE(_ptr__C_0),XMM6); 
VMA21282(XMM3,XMM2,XMM6,XMM2); VSTU1282(XMM6,4*SIZE(_ptr__C_0)); ADDQ1280($6*SIZE,_ptr__C_0); ADDQ1280($6*SIZE,_ptr__C_1); ._L_35_bodyE:; SUBQ1280($6,i); JG ._L_35_bodyB; ALIGN_4; ._L_35_loopE:; TESTQ1280($4,i); JLE ._L_39_loopE; ALIGN_4; ._L_39_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1282(XMM0,XMM0,XMM0); VXOR1282(XMM1,XMM1,XMM1); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_40_loopE; ALIGN_4; ._L_40_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VLD1282(4*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(6*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VLD1282(8*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(10*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VLD1282(12*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(14*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); ADDQ1280($16*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_40_bodyE:; DECQ1280(k); JG ._L_40_bodyB; ALIGN_4; ._L_40_loopE:; TESTQ1280($2,_bk_l); JLE ._L_41_loopE; ALIGN_4; ._L_41_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VLD1282(4*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(6*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); ADDQ1280($8*SIZE,_ptr__A_0); ADDQ1280($2*SIZE,_ptr__B_0); ._L_41_loopE:; TESTQ1280($1,_bk_l); JLE ._L_42_loopE; ALIGN_4; ._L_42_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); ADDQ1280($4*SIZE,_ptr__A_0); ADDQ1280($1*SIZE,_ptr__B_0); ._L_42_loopE:; BROAD1282(alpha,XMM2); VLDU1282(0*SIZE(_ptr__C_0),XMM3); VMA21282(XMM2,XMM0,XMM3,XMM0); VSTU1282(XMM3,0*SIZE(_ptr__C_0)); VLDU1282(2*SIZE(_ptr__C_0),XMM4); VMA21282(XMM2,XMM1,XMM4,XMM1); VSTU1282(XMM4,2*SIZE(_ptr__C_0)); ADDQ1280($4*SIZE,_ptr__C_0); ADDQ1280($4*SIZE,_ptr__C_1); ._L_39_loopE:; TESTQ1280($2,i); JLE ._L_43_loopE; ALIGN_4; ._L_43_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1282(XMM0,XMM0,XMM0); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_44_loopE; ALIGN_4; ._L_44_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1282(3*SIZE(_ptr__B_0),XMM15); 
VLD1282(6*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); ADDQ1280($8*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_44_bodyE:; DECQ1280(k); JG ._L_44_bodyB; ALIGN_4; ._L_44_loopE:; TESTQ1280($2,_bk_l); JLE ._L_45_loopE; ALIGN_4; ._L_45_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); ADDQ1280($4*SIZE,_ptr__A_0); ADDQ1280($2*SIZE,_ptr__B_0); ._L_45_loopE:; TESTQ1280($1,_bk_l); JLE ._L_46_loopE; ALIGN_4; ._L_46_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); ADDQ1280($2*SIZE,_ptr__A_0); ADDQ1280($1*SIZE,_ptr__B_0); ._L_46_loopE:; BROAD1282(alpha,XMM1); VLDU1282(0*SIZE(_ptr__C_0),XMM2); VMA21282(XMM1,XMM0,XMM2,XMM0); VSTU1282(XMM2,0*SIZE(_ptr__C_0)); ADDQ1280($2*SIZE,_ptr__C_0); ADDQ1280($2*SIZE,_ptr__C_1); ._L_43_loopE:; TESTQ1280($1,i); JLE ._L_47_loopE; ALIGN_4; ._L_47_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1281(XMM0,XMM0,XMM0); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_48_loopE; ALIGN_4; ._L_48_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1281(0*SIZE(_ptr__B_0),XMM15); VLD1281(0*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1281(1*SIZE(_ptr__B_0),XMM15); VLD1281(1*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1281(2*SIZE(_ptr__B_0),XMM15); VLD1281(2*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1281(3*SIZE(_ptr__B_0),XMM15); VLD1281(3*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); ADDQ1280($4*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_48_bodyE:; DECQ1280(k); JG ._L_48_bodyB; ALIGN_4; ._L_48_loopE:; TESTQ1280($2,_bk_l); JLE ._L_49_loopE; ALIGN_4; ._L_49_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1281(0*SIZE(_ptr__B_0),XMM15); VLD1281(0*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1281(1*SIZE(_ptr__B_0),XMM15); VLD1281(1*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); ADDQ1280($2*SIZE,_ptr__A_0); ADDQ1280($2*SIZE,_ptr__B_0); ._L_49_loopE:; TESTQ1280($1,_bk_l); JLE ._L_50_loopE; ALIGN_4; ._L_50_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1281(0*SIZE(_ptr__B_0),XMM15); VLD1281(0*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); ADDQ1280($1*SIZE,_ptr__A_0); ADDQ1280($1*SIZE,_ptr__B_0); ._L_50_loopE:; BROAD1281(alpha,XMM1); VLDU1281(0*SIZE(_ptr__C_0),XMM2); VMA21281(XMM1,XMM0,XMM2,XMM0); VSTU1281(XMM2,0*SIZE(_ptr__C_0)); ADDQ1280($1*SIZE,_ptr__C_0); ADDQ1280($1*SIZE,_ptr__C_1); ._L_47_loopE:; MOVQ1280(LDC,%rax); ADDQ1280(%rax,_ptr_C); MOVQ1280(_bk_l,%rax); SALQ1280($3,%rax); ADDQ1280(%rax,_ptr_B); ._L_34_loopE:; vzeroupper movq 0(%rsp), %rbx; movq 8(%rsp), %rbp; movq 16(%rsp), %r12; movq 24(%rsp), %r13; movq 32(%rsp), %r14; movq 40(%rsp), %r15; addq $STACKSIZE, %rsp; ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/dgemm_kernel_8x2_bulldozer.S000066400000000000000000003005761313527062700227350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas 
at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ /********************************************************************* * 2013/06/02 Saar * * Parameter: * UNROLL_M 8 * UNROLL_N 2 * DGEMM_P 360 * DGEMM_Q 160 * * Performance at m x n without prefetch of BO: * * 5760x5760 93.4 GFLOPS with 8 threads on 4 modules (ACML: 90.8 GFLOPS) * 5760x5760 84.2 GFLOPS with 4 threads on 4 modules (ACML: 82.4 GFLOPS) * 3840x3840 50.3 GFLOPS with 2 threads on 2 modules (ACML: 49.5 GFLOPS) * * 5760x5760 56.4 GFLOPS with 4 threads on 2 modules (ACML: 58.5 GFLOPS) * 3840x3840 29.0 GFLOPS with 2 threads on 1 modules (ACML: 30.2 GFLOPS) * 3840x3840 26.1 GFLOPS with 1 threads on 1 modules (ACML: 25.9 GFLOPS) * *********************************************************************/ /********************************************************************* * 2013/06/03 Saar * * Parameter: * UNROLL_M 8 * UNROLL_N 2 * DGEMM_P 336 * DGEMM_Q 168 * NO_WARMUP 1 * NO_AFFINITY 1 * GEMM_MULTITHREAD_THRESHOLD 4 * * Performance at m x n with prefetch of BO: * * 8064x3840 93.7 GFLOPS with 8 threads on 4 modules (ACML: 93.6 GFLOPS) * 6048x2880 85.1 GFLOPS with 4 threads on 4 modules (ACML: 84.2 GFLOPS) * 6048x2880 52.0 GFLOPS with 2 threads on 2 modules (ACML: 50.0 GFLOPS) * * 6048x2880 56.3 GFLOPS with 4 threads on 2 modules (ACML: 57.6 GFLOPS) * 4032x1920 29.5 GFLOPS with 2 threads on 1 modules (ACML: 30.5 GFLOPS) * 4032x1920 26.9 GFLOPS with 1 threads on 1 modules (ACML: 26.1 GFLOPS) * *********************************************************************/ /********************************************************************* * 2013/06/04 Saar * * Parameter: * UNROLL_M 8 * UNROLL_N 2 * DGEMM_P 384 * DGEMM_Q 168 * NO_WARMUP 1 * NO_AFFINITY 1 * GEMM_MULTITHREAD_THRESHOLD 4 * * Performance at m x n with prefetch of BO: * * 6144x5376 94.6 
GFLOPS with 8 threads on 4 modules (ACML: 90.5 GFLOPS) * 6144x5376 86.0 GFLOPS with 4 threads on 4 modules (ACML: 81.5 GFLOPS) * 4608x4032 52.0 GFLOPS with 2 threads on 2 modules (ACML: 47.5 GFLOPS) * * 6144x5376 57.3 GFLOPS with 4 threads on 2 modules (ACML: 56.5 GFLOPS) * 4608x4032 29.6 GFLOPS with 2 threads on 1 modules (ACML: 30.2 GFLOPS) * 4608x4032 26.9 GFLOPS with 1 threads on 1 modules (ACML: 25.6 GFLOPS) * *********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define J %r14 #define OLD_K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define K %r12 #define BI %rbp #define SP %rbx #define BO1 %rdi #define BO2 %r15 #ifndef WINDOWS_ABI #define STACKSIZE 96 #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define L_BUFFER_SIZE 8192 #define LB2_OFFSET 4096 #define Ndiv6 24(%rsp) #define Nmod6 32(%rsp) #define N 40(%rsp) #define ALPHA 48(%rsp) #define OFFSET 56(%rsp) #define KK 64(%rsp) #define KKK 72(%rsp) #define BUFFER1 128(%rsp) #define BUFFER2 LB2_OFFSET+128(%rsp) #if defined(OS_WINDOWS) #if L_BUFFER_SIZE > 16384 #define STACK_TOUCH \ movl $0, 4096 * 4(%rsp);\ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 12288 #define STACK_TOUCH \ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 8192 #define STACK_TOUCH \ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 4096 #define STACK_TOUCH \ movl $0, 4096 * 1(%rsp); #else #define STACK_TOUCH #endif #else #define STACK_TOUCH #endif #define A_PR1 384 #define B_PR1 192 #define KERNEL8x3_1(xx) \ prefetcht0 A_PR1(AO,%rax,8) ;\ vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ #define KERNEL8x3_2(xx) \ prefetcht0 A_PR1+64(AO,%rax,8) ;\ vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ #define KERNEL8x3_3(xx) \ prefetcht0 
A_PR1+128(AO,%rax,8) ;\ vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ #define KERNEL8x3_4(xx) \ prefetcht0 A_PR1+192(AO,%rax,8) ;\ vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ addq $12, BI ;\ addq $32, %rax ;\ #define KERNEL8x3_SUB(xx) \ vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ /*******************************************************************************************/ #define KERNEL4x3_1(xx) \ vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ #define KERNEL4x3_2(xx) \ vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ #define KERNEL4x3_3(xx) \ vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup 2 * SIZE(BO, BI, 
8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ #define KERNEL4x3_4(xx) \ vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ addq $12, BI ;\ addq $16, %rax ;\ #define KERNEL4x3_SUB(xx) \ vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ /*******************************************************************************************/ #define KERNEL2x3_1(xx) \ vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL2x3_2(xx) \ vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL2x3_3(xx) \ vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL2x3_4(xx) \ vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ addq $12, BI ;\ addq $8, %rax ;\ #define KERNEL2x3_SUB(xx) \ vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ /*******************************************************************************************/ #define KERNEL1x3_1(xx) \ vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL1x3_2(xx) \ vmovsd -3 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovsd -2 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovsd -1 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL1x3_3(xx) \ vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovsd 1 * 
SIZE(BO, BI, 8), %xmm2 ;\ vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovsd 2 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL1x3_4(xx) \ vmovsd 3 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovsd 4 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovsd 5 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ addq $12, BI ;\ addq $4, %rax ;\ #define KERNEL1x3_SUB(xx) \ vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ /******************************************************************************************* * 2 lines of N *******************************************************************************************/ #define KERNEL8x2_1(xx) \ prefetcht0 A_PR1(AO,%rax,8) ;\ vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ #define KERNEL8x2_2(xx) \ prefetcht0 A_PR1+64(AO,%rax,8) ;\ vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ #define KERNEL8x2_3(xx) \ prefetcht0 A_PR1+128(AO,%rax,8) ;\ vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ #define KERNEL8x2_4(xx) \ prefetcht0 A_PR1+192(AO,%rax,8) ;\ vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ addq $8, BI ;\ addq $32, %rax ;\ #define KERNEL8x2_SUB(xx) \ vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -3 * SIZE(BO, BI, 8), 
%xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ /*******************************************************************************************/ #define KERNEL4x2_1(xx) \ vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ #define KERNEL4x2_2(xx) \ vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ #define KERNEL4x2_3(xx) \ vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ #define KERNEL4x2_4(xx) \ vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ addq $8, BI ;\ addq $16, %rax ;\ #define KERNEL4x2_SUB(xx) \ vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ /*******************************************************************************************/ #define KERNEL2x2_1(xx) \ vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL2x2_2(xx) \ vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL2x2_3(xx) \ vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL2x2_4(xx) \ vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ addq $8, BI ;\ addq $8, %rax ;\ #define KERNEL2x2_SUB(xx) \ vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ /*******************************************************************************************/ #define KERNEL1x2_1(xx) \ vmovsd -4 * SIZE(BO, BI, 
8), %xmm1 ;\ vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL1x2_2(xx) \ vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovsd -1 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL1x2_3(xx) \ vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL1x2_4(xx) \ vmovsd 2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovsd 3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ addq $8, BI ;\ addq $4, %rax ;\ #define KERNEL1x2_SUB(xx) \ vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ /******************************************************************************************* * 1 line of N *******************************************************************************************/ #define KERNEL8x1_1(xx) \ prefetcht0 A_PR1(AO,%rax,8) ;\ vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ #define KERNEL8x1_2(xx) \ prefetcht0 A_PR1+64(AO,%rax,8) ;\ vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ #define KERNEL8x1_3(xx) \ prefetcht0 A_PR1+128(AO,%rax,8) ;\ vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ #define KERNEL8x1_4(xx) \ prefetcht0 A_PR1+192(AO,%rax,8) ;\ vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ addq $4, BI ;\ addq $32, %rax ;\ #define KERNEL8x1_SUB(xx) \ vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ /*******************************************************************************************/ #define KERNEL4x1_1(xx) \ vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ 
vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ #define KERNEL4x1_2(xx) \ vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ #define KERNEL4x1_3(xx) \ vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ #define KERNEL4x1_4(xx) \ vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ addq $4, BI ;\ addq $16, %rax ;\ #define KERNEL4x1_SUB(xx) \ vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ /*******************************************************************************************/ #define KERNEL2x1_1(xx) \ vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL2x1_2(xx) \ vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL2x1_3(xx) \ vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL2x1_4(xx) \ vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ addq $4, BI ;\ addq $8, %rax ;\ #define KERNEL2x1_SUB(xx) \ vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ /*******************************************************************************************/ #define KERNEL1x1_1(xx) \ vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL1x1_2(xx) \ vmovsd -1 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL1x1_3(xx) \ vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL1x1_4(xx) \ vmovsd 1 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ addq $4, BI ;\ addq $4, %rax ;\ #define KERNEL1x1_SUB(xx) \ vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ /*******************************************************************************************/ #if !defined(TRMMKERNEL) PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC vmovaps %xmm3, %xmm0 #else movq STACKSIZE + 8(%rsp), LDC #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $0, 
OLD_M je .L999 cmpq $0, OLD_N je .L999 cmpq $0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovsd %xmm0, ALPHA salq $BASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $6, %rdi divq %rdi // N / 6 movq %rax, Ndiv6 // N / 6 movq %rdx, Nmod6 // N % 6 movq Ndiv6, J cmpq $0, J je .L2_0 ALIGN_4 .L6_01: // copy to sub buffer movq K, %rax salq $1,%rax // K * 2 movq B, BO1 leaq (B,%rax,8), BO2 // next offset to BO2 leaq BUFFER1, BO // first buffer to BO movq K, %rax sarq $2, %rax // K / 4 jz .L6_02a ALIGN_4 .L6_02: prefetcht0 512(BO1) prefetcht0 512(BO2) prefetchw 512(BO) vmovups (BO1), %xmm0 vmovups 2*SIZE(BO1), %xmm2 vmovups 4*SIZE(BO1), %xmm4 vmovups 6*SIZE(BO1), %xmm6 vmovsd (BO2), %xmm1 vmovsd 2*SIZE(BO2), %xmm3 vmovsd 4*SIZE(BO2), %xmm5 vmovsd 6*SIZE(BO2), %xmm7 vmovups %xmm0, (BO) vmovsd %xmm1, 2*SIZE(BO) vmovups %xmm2, 3*SIZE(BO) vmovsd %xmm3, 5*SIZE(BO) vmovups %xmm4, 6*SIZE(BO) vmovsd %xmm5, 8*SIZE(BO) vmovups %xmm6, 9*SIZE(BO) vmovsd %xmm7,11*SIZE(BO) addq $8*SIZE,BO1 addq $8*SIZE,BO2 addq $12*SIZE,BO decq %rax jnz .L6_02 .L6_02a: movq K, %rax andq $3, %rax // K % 4 jz .L6_02c ALIGN_4 .L6_02b: vmovups (BO1), %xmm0 vmovsd (BO2), %xmm1 vmovups %xmm0, (BO) vmovsd %xmm1, 2*SIZE(BO) addq $2*SIZE,BO1 addq $2*SIZE,BO2 addq $3*SIZE,BO decq %rax jnz .L6_02b .L6_02c: movq K, %rax salq $1,%rax // K * 2 leaq (B,%rax,8), BO1 // next offset to BO1 leaq (BO1,%rax,8), BO2 // next offset to BO1 leaq BUFFER2, BO // second buffer to BO movq K, %rax sarq $2, %rax // k / 4 jz .L6_03a ALIGN_4 .L6_03: prefetcht0 512(BO2) prefetchw 512(BO) vmovups (BO2), %xmm0 vmovups 2*SIZE(BO2), %xmm2 vmovups 4*SIZE(BO2), %xmm4 vmovups 6*SIZE(BO2), %xmm6 vmovsd 1*SIZE(BO1), %xmm1 vmovsd 3*SIZE(BO1), %xmm3 vmovsd 5*SIZE(BO1), %xmm5 vmovsd 7*SIZE(BO1), %xmm7 vmovsd %xmm1, 0*SIZE(BO) vmovups %xmm0, 1*SIZE(BO) vmovsd %xmm3, 3*SIZE(BO) vmovups %xmm2, 4*SIZE(BO) vmovsd %xmm5, 6*SIZE(BO) vmovups %xmm4, 7*SIZE(BO) vmovsd %xmm7, 9*SIZE(BO) vmovups %xmm6,10*SIZE(BO) addq $8*SIZE,BO1 addq $8*SIZE,BO2 addq $12*SIZE,BO decq %rax jnz .L6_03 .L6_03a: movq K, %rax andq $3, %rax // K % 4 jz .L6_03c ALIGN_4 .L6_03b: vmovsd 1*SIZE(BO1), %xmm0 vmovups (BO2), %xmm1 vmovsd %xmm0, (BO) vmovups %xmm1, 1*SIZE(BO) addq $2*SIZE,BO1 addq $2*SIZE,BO2 addq $3*SIZE,BO decq %rax jnz .L6_03b .L6_03c: movq BO2, B // next offset of B .L6_10: movq C, CO1 leaq (C, LDC, 2), C leaq (C, LDC, 1), C // c += 3 * ldc movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $3, I // i = (m >> 3) je .L6_20 ALIGN_4 .L6_11: leaq BUFFER1, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax // K = K - ( K % 8 ) je .L6_16 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L6_12: prefetcht0 B_PR1(BO,BI,8) KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) prefetcht0 B_PR1+64(BO,BI,8) KERNEL8x3_4(xxx) KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) prefetcht0 B_PR1+32(BO,BI,8) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) je .L6_16 prefetcht0 B_PR1(BO,BI,8) KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) prefetcht0 B_PR1+64(BO,BI,8) KERNEL8x3_4(xxx) KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) prefetcht0 B_PR1+32(BO,BI,8) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) je .L6_16 jmp .L6_12 ALIGN_4 .L6_16: movq K, %rax andq $7, %rax # if (k & 1) je .L6_19 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO 
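/* Tail-loop setup (a descriptive note, inferred from the surrounding code): the two leaq
   instructions above have advanced AO and BO to the end of the data covered by the
   remaining K & 7 iterations, and the negq pair that follows turns %rax and BI into
   negative indices. KERNEL8x3_SUB addresses its operands as (AO,%rax,8) and (BO,BI,8),
   so the indices simply count up toward zero and the jl at the bottom of .L6_17 ends
   the loop without any extra pointer arithmetic. The other tail loops in this kernel
   (.L6_27, .L6_37, .L2_17, .L1_17, ...) use the same scheme. */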
negq BI negq %rax ALIGN_4 .L6_17: KERNEL8x3_SUB(xxx) addq $3, BI addq $8, %rax jl .L6_17 ALIGN_4 .L6_19: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) vmovups %xmm10, 4 * SIZE(CO1) vmovups %xmm13, 6 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 2 * SIZE(CO1, LDC) vmovups %xmm11, 4 * SIZE(CO1, LDC) vmovups %xmm14, 6 * SIZE(CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) addq $8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L6_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L6_20: // Test rest of M testq $7, M jz .L7_10 // to next 3 lines of N testq $4, M jz .L6_30 ALIGN_4 .L6_21: leaq BUFFER1, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L6_26 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L6_22: prefetcht0 B_PR1(BO,BI,8) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) prefetcht0 B_PR1+64(BO,BI,8) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) prefetcht0 B_PR1+32(BO,BI,8) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L6_26 prefetcht0 B_PR1(BO,BI,8) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) prefetcht0 B_PR1+64(BO,BI,8) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) prefetcht0 B_PR1+32(BO,BI,8) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L6_26 jmp .L6_22 ALIGN_4 .L6_26: movq K, %rax andq $7, %rax # if (k & 1) je .L6_29 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L6_27: KERNEL4x3_SUB(xxx) addq $3, BI addq $4, %rax jl .L6_27 ALIGN_4 .L6_29: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 2 * SIZE(CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L6_30: testq $2, M jz .L6_40 ALIGN_4 .L6_31: leaq BUFFER1, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L6_36 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L6_32: prefetcht0 B_PR1(BO,BI,8) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) prefetcht0 B_PR1+64(BO,BI,8) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) 
prefetcht0 B_PR1+32(BO,BI,8) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L6_36 prefetcht0 B_PR1(BO,BI,8) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) prefetcht0 B_PR1+64(BO,BI,8) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) prefetcht0 B_PR1+32(BO,BI,8) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L6_36 jmp .L6_32 ALIGN_4 .L6_36: movq K, %rax andq $7, %rax # if (k & 1) je .L6_39 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L6_37: KERNEL2x3_SUB(xxx) addq $3, BI addq $2, %rax jl .L6_37 ALIGN_4 .L6_39: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L6_40: testq $1, M jz .L7_10 // to next 3 lines of N ALIGN_4 .L6_41: leaq BUFFER1, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L6_46 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L6_42: prefetcht0 B_PR1(BO,BI,8) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) prefetcht0 B_PR1+64(BO,BI,8) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) prefetcht0 B_PR1+32(BO,BI,8) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L6_46 prefetcht0 B_PR1(BO,BI,8) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) prefetcht0 B_PR1+64(BO,BI,8) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) prefetcht0 B_PR1+32(BO,BI,8) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L6_46 jmp .L6_42 ALIGN_4 .L6_46: movq K, %rax andq $7, %rax # if (k & 1) je .L6_49 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L6_47: KERNEL1x3_SUB(xxx) addq $3, BI addq $1, %rax jl .L6_47 ALIGN_4 .L6_49: vmovddup ALPHA, %xmm0 vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vmovsd %xmm4 , (CO1) vmovsd %xmm5 , (CO1, LDC) vmovsd %xmm6 , (CO1, LDC, 2) addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 /***************************************************************************************************************/ .L7_10: movq C, CO1 leaq (C, LDC, 2), C leaq (C, LDC, 1), C // c += 3 * ldc movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $3, I // i = (m >> 3) je .L7_20 ALIGN_4 .L7_11: leaq BUFFER2, BO // second buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_16 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L7_12: prefetcht0 B_PR1(BO,BI,8) KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) prefetcht0 B_PR1+64(BO,BI,8) KERNEL8x3_4(xxx) KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) prefetcht0 B_PR1+32(BO,BI,8) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) je .L7_16 prefetcht0 B_PR1(BO,BI,8) KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) prefetcht0 B_PR1+64(BO,BI,8) KERNEL8x3_4(xxx) KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) prefetcht0 B_PR1+32(BO,BI,8) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) je .L7_16 jmp .L7_12 ALIGN_4 .L7_16: movq K, %rax andq $7, %rax # if (k & 1) je .L7_19 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $3, %rax // rax = rax * 8 ; 
number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L7_17: KERNEL8x3_SUB(xxx) addq $3, BI addq $8, %rax jl .L7_17 ALIGN_4 .L7_19: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) vmovups %xmm10, 4 * SIZE(CO1) vmovups %xmm13, 6 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 2 * SIZE(CO1, LDC) vmovups %xmm11, 4 * SIZE(CO1, LDC) vmovups %xmm14, 6 * SIZE(CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) addq $8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L7_11 ALIGN_4 .L7_20: // Test rest of M testq $7, M jz .L7_60 // to next 6 lines of N testq $4, M jz .L7_30 ALIGN_4 .L7_21: leaq BUFFER2, BO // second buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_26 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L7_22: prefetcht0 B_PR1(BO,BI,8) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) prefetcht0 B_PR1+64(BO,BI,8) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) prefetcht0 B_PR1+32(BO,BI,8) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L7_26 prefetcht0 B_PR1(BO,BI,8) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) prefetcht0 B_PR1+64(BO,BI,8) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) prefetcht0 B_PR1+32(BO,BI,8) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L7_26 jmp .L7_22 ALIGN_4 .L7_26: movq K, %rax andq $7, %rax # if (k & 1) je .L7_29 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L7_27: KERNEL4x3_SUB(xxx) addq $3, BI addq $4, %rax jl .L7_27 ALIGN_4 .L7_29: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 2 * SIZE(CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L7_30: testq $2, M jz .L7_40 ALIGN_4 .L7_31: leaq BUFFER2, BO // second buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_36 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L7_32: prefetcht0 B_PR1(BO,BI,8) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) prefetcht0 B_PR1+64(BO,BI,8) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) prefetcht0 B_PR1+32(BO,BI,8) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L7_36 prefetcht0 B_PR1(BO,BI,8) 
KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) prefetcht0 B_PR1+64(BO,BI,8) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) prefetcht0 B_PR1+32(BO,BI,8) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L7_36 jmp .L7_32 ALIGN_4 .L7_36: movq K, %rax andq $7, %rax # if (k & 1) je .L7_39 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L7_37: KERNEL2x3_SUB(xxx) addq $3, BI addq $2, %rax jl .L7_37 ALIGN_4 .L7_39: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L7_40: testq $1, M jz .L7_60 // to next 6 lines of N ALIGN_4 .L7_41: leaq BUFFER2, BO // second buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_46 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L7_42: prefetcht0 B_PR1(BO,BI,8) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) prefetcht0 B_PR1+64(BO,BI,8) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) prefetcht0 B_PR1+32(BO,BI,8) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L7_46 prefetcht0 B_PR1(BO,BI,8) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) prefetcht0 B_PR1+64(BO,BI,8) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) prefetcht0 B_PR1+32(BO,BI,8) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L7_46 jmp .L7_42 ALIGN_4 .L7_46: movq K, %rax andq $7, %rax # if (k & 1) je .L7_49 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L7_47: KERNEL1x3_SUB(xxx) addq $3, BI addq $1, %rax jl .L7_47 ALIGN_4 .L7_49: vmovddup ALPHA, %xmm0 vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vmovsd %xmm4 , (CO1) vmovsd %xmm5 , (CO1, LDC) vmovsd %xmm6 , (CO1, LDC, 2) addq $1 * SIZE, CO1 # coffset += 1 .L7_60: decq J // j -- jg .L6_01 .L2_0: cmpq $0, Nmod6 // N % 6 == 0 je .L999 /************************************************************************************************ * Loop for Nmod6 / 2 > 0 *************************************************************************************************/ movq Nmod6, J sarq $1, J // j = j / 2 je .L1_0 ALIGN_4 .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L2_02b: vmovups (BO1), %xmm0 vmovups %xmm0, (BO) addq $2*SIZE,BO1 addq $2*SIZE,BO decq %rax jnz .L2_02b .L2_02c: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $3, I // i = (m >> 3) je .L2_20 ALIGN_4 .L2_11: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax // K = K - ( K % 8 ) je .L2_16 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_12: prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) je .L2_16 prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) 
KERNEL8x2_4(xxx) prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) je .L2_16 jmp .L2_12 ALIGN_4 .L2_16: movq K, %rax andq $7, %rax # if (k & 1) je .L2_19 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_17: KERNEL8x2_SUB(xxx) addq $2, BI addq $8, %rax jl .L2_17 ALIGN_4 .L2_19: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) vmovups %xmm10, 4 * SIZE(CO1) vmovups %xmm13, 6 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 2 * SIZE(CO1, LDC) vmovups %xmm11, 4 * SIZE(CO1, LDC) vmovups %xmm14, 6 * SIZE(CO1, LDC) addq $8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_20: // Test rest of M testq $7, M jz .L2_60 // to next 2 lines of N testq $4, M jz .L2_30 ALIGN_4 .L2_21: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_26 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_22: prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_26 prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_26 jmp .L2_22 ALIGN_4 .L2_26: movq K, %rax andq $7, %rax # if (k & 1) je .L2_29 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_27: KERNEL4x2_SUB(xxx) addq $2, BI addq $4, %rax jl .L2_27 ALIGN_4 .L2_29: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 2 * SIZE(CO1, LDC) addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L2_30: testq $2, M jz .L2_40 ALIGN_4 .L2_31: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_36 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_32: prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_36 prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) 
KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_36 jmp .L2_32 ALIGN_4 .L2_36: movq K, %rax andq $7, %rax # if (k & 1) je .L2_39 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_37: KERNEL2x2_SUB(xxx) addq $2, BI addq $2, %rax jl .L2_37 ALIGN_4 .L2_39: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L2_40: testq $1, M jz .L2_60 // to next 2 lines of N ALIGN_4 .L2_41: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_46 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_42: prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 jmp .L2_42 ALIGN_4 .L2_46: movq K, %rax andq $7, %rax # if (k & 1) je .L2_49 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_47: KERNEL1x2_SUB(xxx) addq $2, BI addq $1, %rax jl .L2_47 ALIGN_4 .L2_49: vmovddup ALPHA, %xmm0 vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 vmovsd %xmm4 , (CO1) vmovsd %xmm5 , (CO1, LDC) addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L2_60: decq J // j -- jg .L2_01 // next 2 lines of N .L1_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovsd (BO1), %xmm0 vmovsd %xmm0, (BO) addq $1*SIZE,BO1 addq $1*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $3, I // i = (m >> 3) je .L1_20 ALIGN_4 .L1_11: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax // K = K - ( K % 8 ) je .L1_16 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_12: prefetcht0 B_PR1(BO,BI,8) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) je .L1_16 prefetcht0 B_PR1(BO,BI,8) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) je .L1_16 jmp .L1_12 ALIGN_4 .L1_16: movq K, %rax andq $7, %rax # if (k & 1) je .L1_19 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_17: KERNEL8x1_SUB(xxx) addq $1, BI addq $8, %rax jl .L1_17 ALIGN_4 .L1_19: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, 
%xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) vmovups %xmm10, 4 * SIZE(CO1) vmovups %xmm13, 6 * SIZE(CO1) addq $8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L1_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_20: // Test rest of M testq $7, M jz .L999 testq $4, M jz .L1_30 ALIGN_4 .L1_21: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_26 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_22: prefetcht0 B_PR1(BO,BI,8) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_26 prefetcht0 B_PR1(BO,BI,8) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_26 jmp .L1_22 ALIGN_4 .L1_26: movq K, %rax andq $7, %rax # if (k & 1) je .L1_29 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_27: KERNEL4x1_SUB(xxx) addq $1, BI addq $4, %rax jl .L1_27 ALIGN_4 .L1_29: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L1_30: testq $2, M jz .L1_40 ALIGN_4 .L1_31: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_36 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_32: prefetcht0 B_PR1(BO,BI,8) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_36 KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_36 jmp .L1_32 ALIGN_4 .L1_36: movq K, %rax andq $7, %rax # if (k & 1) je .L1_39 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_37: KERNEL2x1_SUB(xxx) addq $1, BI addq $2, %rax jl .L1_37 ALIGN_4 .L1_39: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vmovups %xmm4 , (CO1) addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L1_40: testq $1, M jz .L999 ALIGN_4 .L1_41: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_46 movq %rax, BI // Index for BO leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_42: prefetcht0 B_PR1(BO,BI,8) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 prefetcht0 B_PR1(BO,BI,8) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 jmp .L1_42 ALIGN_4 .L1_46: movq K, %rax andq $7, %rax # if (k & 1) je .L1_49 movq %rax, BI // Index for BO leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_47: KERNEL1x1_SUB(xxx) addq $1, BI addq 
$1, %rax jl .L1_47 ALIGN_4 .L1_49: vmovddup ALPHA, %xmm0 vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 vmovsd %xmm4 , (CO1) addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L999: movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE #else /************************************************************************************* * TRMM Kernel *************************************************************************************/ PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif vmovaps %xmm3, %xmm0 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL movsd STACKSIZE + 16(%rsp), %xmm12 #endif #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $0, OLD_M je .L999 cmpq $0, OLD_N je .L999 cmpq $0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovsd %xmm0, ALPHA salq $BASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $2, %rdi divq %rdi // N / 2 movq %rax, Ndiv6 // N / 2 movq %rdx, Nmod6 // N % 2 #ifdef TRMMKERNEL vmovsd %xmm12, OFFSET vmovsd %xmm12, KK #ifndef LEFT negq KK #endif #endif movq Ndiv6, J cmpq $0, J je .L1_0 ALIGN_4 .L2_0: .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L2_02b: vmovups (BO1), %xmm0 vmovups %xmm0, (BO) addq $2*SIZE,BO1 addq $2*SIZE,BO decq %rax jnz .L2_02b .L2_02c: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $3, I // i = (m >> 3) je .L2_20 ALIGN_4 .L2_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, 8), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L2_16 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of 
values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_12: prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) je .L2_16 prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) prefetcht0 B_PR1(BO,BI,8) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) je .L2_16 jmp .L2_12 ALIGN_4 .L2_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_19 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_17: KERNEL8x2_SUB(xxx) addq $2, BI addq $8, %rax jl .L2_17 ALIGN_4 .L2_19: vmovddup ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 #else vmulpd %xmm0, %xmm4,%xmm4 vmulpd %xmm0, %xmm7,%xmm7 vmulpd %xmm0, %xmm10,%xmm10 vmulpd %xmm0, %xmm13,%xmm13 vmulpd %xmm0, %xmm5,%xmm5 vmulpd %xmm0, %xmm8,%xmm8 vmulpd %xmm0, %xmm11,%xmm11 vmulpd %xmm0, %xmm14,%xmm14 #endif vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) vmovups %xmm10, 4 * SIZE(CO1) vmovups %xmm13, 6 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 2 * SIZE(CO1, LDC) vmovups %xmm11, 4 * SIZE(CO1, LDC) vmovups %xmm14, 6 * SIZE(CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, 8), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_20: // Test rest of M testq $7, M jz .L2_60 // to next 2 lines of N testq $4, M jz .L2_30 ALIGN_4 .L2_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, 8), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_26 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_22: prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) 
KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_26 prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) prefetcht0 B_PR1(BO,BI,8) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_26 jmp .L2_22 ALIGN_4 .L2_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_29 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_27: KERNEL4x2_SUB(xxx) addq $2, BI addq $4, %rax jl .L2_27 ALIGN_4 .L2_29: vmovddup ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 #else vmulpd %xmm0, %xmm4,%xmm4 vmulpd %xmm0, %xmm7,%xmm7 vmulpd %xmm0, %xmm5,%xmm5 vmulpd %xmm0, %xmm8,%xmm8 #endif vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 2 * SIZE(CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, 8), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L2_30: testq $2, M jz .L2_40 ALIGN_4 .L2_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, 8), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, 8), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_36 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_32: prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_36 prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) prefetcht0 B_PR1(BO,BI,8) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_36 jmp .L2_32 ALIGN_4 .L2_36: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_39 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_37: KERNEL2x2_SUB(xxx) addq $2, BI addq $2, %rax jl .L2_37 ALIGN_4 .L2_39: vmovddup ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddpd 
(CO1),%xmm0, %xmm4,%xmm4 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 #else vmulpd %xmm0, %xmm4,%xmm4 vmulpd %xmm0, %xmm5,%xmm5 #endif vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, 8), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, 8), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L2_40: testq $1, M jz .L2_60 // to next 2 lines of N ALIGN_4 .L2_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, 8), BO leaq (AO, %rax, 8), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_46 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_42: prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) prefetcht0 B_PR1(BO,BI,8) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 jmp .L2_42 ALIGN_4 .L2_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_49 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_47: KERNEL1x2_SUB(xxx) addq $2, BI addq $1, %rax jl .L2_47 ALIGN_4 .L2_49: vmovddup ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 #else vmulsd %xmm0, %xmm4,%xmm4 vmulsd %xmm0, %xmm5,%xmm5 #endif vmovsd %xmm4 , (CO1) vmovsd %xmm5 , (CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, 8), BO leaq (AO, %rax, 8), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L2_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif decq J // j -- jg .L2_01 // next 2 lines of N .L1_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovsd (BO1), %xmm0 vmovsd %xmm0, (BO) addq $1*SIZE,BO1 addq 
$1*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $3, I // i = (m >> 3) je .L1_20 ALIGN_4 .L1_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, 8), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L1_16 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_12: prefetcht0 B_PR1(BO,BI,8) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) je .L1_16 prefetcht0 B_PR1(BO,BI,8) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) je .L1_16 jmp .L1_12 ALIGN_4 .L1_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_19 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_17: KERNEL8x1_SUB(xxx) addq $1, BI addq $8, %rax jl .L1_17 ALIGN_4 .L1_19: vmovddup ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 #else vmulpd %xmm0, %xmm4,%xmm4 vmulpd %xmm0, %xmm7,%xmm7 vmulpd %xmm0, %xmm10,%xmm10 vmulpd %xmm0, %xmm13,%xmm13 #endif vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) vmovups %xmm10, 4 * SIZE(CO1) vmovups %xmm13, 6 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, 8), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L1_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_20: // Test rest of M testq $7, M jz .L999 testq $4, M jz .L1_30 ALIGN_4 .L1_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, 8), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) 
|| (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_26 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_22: prefetcht0 B_PR1(BO,BI,8) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_26 prefetcht0 B_PR1(BO,BI,8) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_26 jmp .L1_22 ALIGN_4 .L1_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_29 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_27: KERNEL4x1_SUB(xxx) addq $1, BI addq $4, %rax jl .L1_27 ALIGN_4 .L1_29: vmovddup ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 #else vmulpd %xmm0, %xmm4,%xmm4 vmulpd %xmm0, %xmm7,%xmm7 #endif vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, 8), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L1_30: testq $2, M jz .L1_40 ALIGN_4 .L1_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, 8), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, 8), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_36 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_32: prefetcht0 B_PR1(BO,BI,8) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_36 KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_36 jmp .L1_32 ALIGN_4 .L1_36: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_39 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_37: KERNEL2x1_SUB(xxx) addq $1, BI addq $2, %rax jl .L1_37 ALIGN_4 .L1_39: vmovddup ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 #else vmulpd %xmm0, %xmm4,%xmm4 #endif vmovups %xmm4 , (CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, 8), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, 8), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L1_40: testq $1, M jz .L999 ALIGN_4 .L1_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, 8), BO leaq (AO, %rax, 8), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_46 movq %rax, BI // Index for BO leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_42: prefetcht0 B_PR1(BO,BI,8) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 prefetcht0 B_PR1(BO,BI,8) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 jmp .L1_42 ALIGN_4 .L1_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_49 movq %rax, BI // Index for BO leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_47: KERNEL1x1_SUB(xxx) addq $1, BI addq $1, %rax jl .L1_47 ALIGN_4 .L1_49: vmovddup ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 #else vmulsd %xmm0, %xmm4,%xmm4 #endif vmovsd %xmm4 , (CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, 8), BO leaq (AO, %rax, 8), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L999: movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE #endif OpenBLAS-0.2.20/kernel/x86_64/dgemm_kernel_8x2_piledriver.S000066400000000000000000002775201313527062700231020ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /********************************************************************* * * 2013/11/13 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * * 2013/10/31 Saar * * Parameter: * UNROLL_M 8 * UNROLL_N 2 * DGEMM_P 768 * DGEMM_Q 168 * DGEMM_R 12288 * A_PR1 512 * B_PR1 256 * * Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): * * 4608x4608 83.9 GFLOPS with 8 threads on 4 modules (ACML: 78.4 GFLOPS) * 4608x4608 80.9 GFLOPS with 4 threads on 4 modules (ACML: 78.4 GFLOPS) * 4608x4608 41.3 GFLOPS with 2 threads on 2 modules (ACML: 40.9 GFLOPS) * 4608x4608 20.7 GFLOPS with 1 threads on 1 modules (ACML: 20.8 GFLOPS) * * Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): * * 13824x13824 234.5 GFLOPS with 32 threads on 16 modules (ACML: 88.5 GFLOPS) !strange thermal behavior * 13824x13824 241.9 GFLOPS with 16 threads on 16 modules (ACML: 191.5 GFLOPS) !strange thermal behavior * 9216x9216 137.6 GFLOPS with 8 threads on 8 modules (ACML: 106.5 GFLOPS) * 4608x4608 75.7 GFLOPS with 4 threads on 4 modules (ACML: 56.3 GFLOPS) * 4608x4608 38.6 GFLOPS with 2 threads on 2 modules (ACML: 34.1 GFLOPS) * 4608x4608 19.6 GFLOPS with 1 threads on 1 modules (ACML: 18.3 GFLOPS) * *********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define J %r14 #define OLD_K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define K %r12 #define BI %rbp #define SP %rbx #define BO1 %rdi #define BO2 %r15 #ifndef WINDOWS_ABI #define STACKSIZE 96 #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define L_BUFFER_SIZE 8192 #define LB2_OFFSET 4096 #define Ndiv6 24(%rsp) #define Nmod6 32(%rsp) #define N 40(%rsp) #define ALPHA 48(%rsp) #define OFFSET 56(%rsp) #define KK 64(%rsp) #define KKK 72(%rsp) #define BUFFER1 128(%rsp) #define BUFFER2 LB2_OFFSET+128(%rsp) #if defined(OS_WINDOWS) #if L_BUFFER_SIZE > 16384 #define STACK_TOUCH \ movl $0, 4096 * 4(%rsp);\ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 12288 #define STACK_TOUCH \ 
movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 8192 #define STACK_TOUCH \ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 4096 #define STACK_TOUCH \ movl $0, 4096 * 1(%rsp); #else #define STACK_TOUCH #endif #else #define STACK_TOUCH #endif #if defined(BULLDOZER) #define VFMADD231PD_( y1,y2,y0 ) vfmaddpd y0,y1,y2,y0 #define VFMADD231SD_( x1,x2,x0 ) vfmaddsd x0,x1,x2,x0 #else #define VFMADD231PD_( y1,y2,y0 ) vfmadd231pd y2,y1,y0 #define VFMADD231SD_( x1,x2,x0 ) vfmadd231sd x2,x1,x0 #endif #define A_PR1 512 #define B_PR1 256 #define C_PR1 64 .macro INIT8x3 vxorpd %xmm4 , %xmm4 , %xmm4 vxorpd %xmm5 , %xmm5 , %xmm5 vxorpd %xmm6 , %xmm6 , %xmm6 vxorpd %xmm7 , %xmm7 , %xmm7 vxorpd %xmm8 , %xmm8 , %xmm8 vxorpd %xmm9 , %xmm9 , %xmm9 vxorpd %xmm10, %xmm10, %xmm10 vxorpd %xmm11, %xmm11, %xmm11 vxorpd %xmm12, %xmm12, %xmm12 vxorpd %xmm13, %xmm13, %xmm13 vxorpd %xmm14, %xmm14, %xmm14 vxorpd %xmm15, %xmm15, %xmm15 .endm .macro KERNEL8x3_INIT vmovddup -12 * SIZE(BO), %xmm1 vmovups -16 * SIZE(AO), %xmm0 prefetcht0 A_PR1(AO) vmulpd %xmm1,%xmm0,%xmm4 vmovddup -11 * SIZE(BO), %xmm2 vmulpd %xmm2,%xmm0,%xmm5 vmovddup -10 * SIZE(BO), %xmm3 vmulpd %xmm3,%xmm0,%xmm6 vmovups -14 * SIZE(AO), %xmm0 vmulpd %xmm1,%xmm0,%xmm7 vmulpd %xmm2,%xmm0,%xmm8 vmulpd %xmm3,%xmm0,%xmm9 vmovups -12 * SIZE(AO), %xmm0 vmulpd %xmm1,%xmm0,%xmm10 vmulpd %xmm2,%xmm0,%xmm11 addq $ 3 * SIZE, BO vmulpd %xmm3,%xmm0,%xmm12 vmovups -10 * SIZE(AO), %xmm0 vmulpd %xmm1,%xmm0,%xmm13 vmovddup -12 * SIZE(BO), %xmm1 vmulpd %xmm2,%xmm0,%xmm14 vmovddup -11 * SIZE(BO), %xmm2 vmulpd %xmm3,%xmm0,%xmm15 .endm .macro KERNEL8x3_M1 vmovups -16 * SIZE(AO), %xmm0 prefetcht0 A_PR1(AO) VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups -14 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups -12 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups -10 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup -12 * SIZE(BO), %xmm1 VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup -11 * SIZE(BO), %xmm2 VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro KERNEL8x3_M2 vmovups -8 * SIZE(AO), %xmm0 prefetcht0 A_PR1+64(AO) vmovddup -10 * SIZE(BO), %xmm3 VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups -6 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups -4 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups -2 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup -9 * SIZE(BO), %xmm1 VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup -8 * SIZE(BO), %xmm2 VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro KERNEL8x3_M3 vmovups 0 * SIZE(AO), %xmm0 prefetcht0 A_PR1+128(AO) vmovddup -7 * SIZE(BO), %xmm3 VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups 2 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups 4 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups 6 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup -6 * SIZE(BO), %xmm1 VFMADD231PD_( %xmm2,%xmm0,%xmm14 
) vmovddup -5 * SIZE(BO), %xmm2 VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro KERNEL8x3_M4 vmovups 8 * SIZE(AO), %xmm0 prefetcht0 A_PR1+192(AO) vmovddup -4 * SIZE(BO), %xmm3 VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups 10 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups 12 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups 14 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup -3 * SIZE(BO), %xmm1 addq $ 32 * SIZE, AO VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup -2 * SIZE(BO), %xmm2 VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro KERNEL8x3_M5 vmovups -16 * SIZE(AO), %xmm0 prefetcht0 A_PR1(AO) vmovddup -1 * SIZE(BO), %xmm3 VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups -14 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups -12 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups -10 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup 0 * SIZE(BO), %xmm1 VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup 1 * SIZE(BO), %xmm2 VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro KERNEL8x3_M6 vmovups -8 * SIZE(AO), %xmm0 prefetcht0 A_PR1+64(AO) vmovddup 2 * SIZE(BO), %xmm3 VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups -6 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups -4 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups -2 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup 3 * SIZE(BO), %xmm1 VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup 4 * SIZE(BO), %xmm2 VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro KERNEL8x3_M7 vmovups 0 * SIZE(AO), %xmm0 prefetcht0 A_PR1+128(AO) vmovddup 5 * SIZE(BO), %xmm3 VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups 2 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups 4 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups 6 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup 6 * SIZE(BO), %xmm1 VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup 7 * SIZE(BO), %xmm2 VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro KERNEL8x3_M8 vmovups 8 * SIZE(AO), %xmm0 prefetcht0 A_PR1+192(AO) vmovddup 8 * SIZE(BO), %xmm3 VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups 10 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups 12 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups 14 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) vmovddup 9 * SIZE(BO), %xmm1 VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) vmovddup 10 * SIZE(BO), %xmm2 VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) vmovddup 11 * SIZE(BO), %xmm3 addq $ 32 * SIZE, AO addq $ 24 * SIZE, BO .endm .macro KERNEL8x3_E vmovups 8 * SIZE(AO), %xmm0 
prefetcht0 A_PR1+192(AO) vmovddup 8 * SIZE(BO), %xmm3 VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups 10 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups 12 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups 14 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) addq $ 32 * SIZE, AO VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) addq $ 21 * SIZE, BO VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro KERNEL8x3_SUBN vmovddup -12 * SIZE(BO), %xmm1 vmovups -16 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) vmovddup -11 * SIZE(BO), %xmm2 VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) vmovddup -10 * SIZE(BO), %xmm3 VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) vmovups -14 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) vmovups -12 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) vmovups -10 * SIZE(AO), %xmm0 VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) addq $ 3 * SIZE, BO VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) addq $ 8 * SIZE, AO VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) .endm .macro SAVE8x3 vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) vmovups %xmm10, 4 * SIZE(CO1) vmovups %xmm13, 6 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 2 * SIZE(CO1, LDC) vmovups %xmm11, 4 * SIZE(CO1, LDC) vmovups %xmm14, 6 * SIZE(CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) prefetcht0 C_PR1(CO1) prefetcht0 C_PR1(CO1,LDC) prefetcht0 C_PR1(CO1,LDC,2) addq $ 8 * SIZE, CO1 # coffset += 8 .endm /*******************************************************************************************/ #define KERNEL4x3_1(xx) \ vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ #define KERNEL4x3_2(xx) \ vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ #define KERNEL4x3_3(xx) \ vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ 
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ #define KERNEL4x3_4(xx) \ vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ addq $12, BI ;\ addq $16, %rax ;\ #define KERNEL4x3_SUB(xx) \ vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ /*******************************************************************************************/ #define KERNEL2x3_1(xx) \ vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL2x3_2(xx) \ vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL2x3_3(xx) \ vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL2x3_4(xx) \ vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ addq $12, BI ;\ addq $8, %rax ;\ #define KERNEL2x3_SUB(xx) \ vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ /*******************************************************************************************/ #define KERNEL1x3_1(xx) \ vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL1x3_2(xx) \ vmovsd -3 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovsd -2 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovsd -1 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL1x3_3(xx) \ vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -14 * SIZE(AO, %rax, 8), 
%xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovsd 2 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL1x3_4(xx) \ vmovsd 3 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovsd 4 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovsd 5 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ addq $12, BI ;\ addq $4, %rax ;\ #define KERNEL1x3_SUB(xx) \ vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ /******************************************************************************************* * 2 lines of N *******************************************************************************************/ #define KERNEL8x2_1(xx) \ prefetcht0 A_PR1(AO,%rax,8) ;\ vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ #define KERNEL8x2_2(xx) \ prefetcht0 A_PR1+64(AO,%rax,8) ;\ vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ #define KERNEL8x2_3(xx) \ prefetcht0 A_PR1+128(AO,%rax,8) ;\ vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ #define KERNEL8x2_4(xx) \ prefetcht0 A_PR1+192(AO,%rax,8) ;\ vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ addq $8, BI ;\ addq $32, %rax ;\ #define KERNEL8x2_SUB(xx) \ vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ /*******************************************************************************************/ #define KERNEL4x2_1(xx) \ vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ #define KERNEL4x2_2(xx) \ vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ #define KERNEL4x2_3(xx) \ vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ #define KERNEL4x2_4(xx) \ vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ addq $8, BI ;\ addq $16, %rax ;\ #define KERNEL4x2_SUB(xx) \ vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ /*******************************************************************************************/ #define KERNEL2x2_1(xx) \ vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL2x2_2(xx) \ vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL2x2_3(xx) \ vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL2x2_4(xx) \ vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ addq $8, BI ;\ addq $8, %rax ;\ #define KERNEL2x2_SUB(xx) \ vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ 
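/* -----------------------------------------------------------------------
 * Editor's note (illustrative, not part of the original kernel source):
 * the KERNEL*x*_1 .. _4 macros above and below are unrolled steps of the
 * inner K loop, and the *_SUB variants handle the K remainder.  The
 * callers first advance AO/BO past the packed panels and then negate
 * %rax/BI, so displacements such as "-16 * SIZE(AO, %rax, 8)" count
 * upward from a negative index toward zero; the addq / jl (or je) pairs
 * in the calling loops terminate when that index reaches zero.
 *
 * As a rough C sketch (names are illustrative only), one KERNEL2x2_SUB
 * step accumulates a 2x2 tile:
 *
 *   // acc0 holds column 0 of the tile, acc1 column 1 (two doubles each)
 *   static inline void kernel2x2_sub(const double *a, const double *b,
 *                                    double acc0[2], double acc1[2])
 *   {
 *       for (int i = 0; i < 2; i++) {
 *           acc0[i] += a[i] * b[0];   // xmm4 += a-pair * broadcast(b0)
 *           acc1[i] += a[i] * b[1];   // xmm5 += a-pair * broadcast(b1)
 *       }
 *   }
 * ----------------------------------------------------------------------- */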
/*******************************************************************************************/ #define KERNEL1x2_1(xx) \ vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL1x2_2(xx) \ vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovsd -1 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL1x2_3(xx) \ vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL1x2_4(xx) \ vmovsd 2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovsd 3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ addq $8, BI ;\ addq $4, %rax ;\ #define KERNEL1x2_SUB(xx) \ vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ /******************************************************************************************* * 1 line of N *******************************************************************************************/ #define KERNEL8x1_1(xx) \ prefetcht0 A_PR1(AO,%rax,8) ;\ vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ #define KERNEL8x1_2(xx) \ prefetcht0 A_PR1+64(AO,%rax,8) ;\ vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ #define KERNEL8x1_3(xx) \ prefetcht0 A_PR1+128(AO,%rax,8) ;\ vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ #define KERNEL8x1_4(xx) \ prefetcht0 A_PR1+192(AO,%rax,8) ;\ vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ addq $4, BI ;\ addq $32, %rax ;\ #define KERNEL8x1_SUB(xx) \ vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ /*******************************************************************************************/ 
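/* Editor's note (illustrative sketch, not part of the original source):
 * the KERNEL8x1_* macros above update one 8x1 tile of accumulators kept
 * in xmm4/xmm7/xmm10/xmm13 (two doubles each).  In rough C terms, one
 * KERNEL8x1_SUB step performs (names are illustrative only):
 *
 *   static inline void kernel8x1_sub(const double *a, double b0,
 *                                    double acc[8])
 *   {
 *       for (int i = 0; i < 8; i++)
 *           acc[i] += a[i] * b0;      // four vfmaddpd on double pairs
 *   }
 *
 * The alpha scaling and the update of C are not done here; they happen
 * later in the SAVE / .L*_19 blocks.
 */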
#define KERNEL4x1_1(xx) \ vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ #define KERNEL4x1_2(xx) \ vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ #define KERNEL4x1_3(xx) \ vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ #define KERNEL4x1_4(xx) \ vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ addq $4, BI ;\ addq $16, %rax ;\ #define KERNEL4x1_SUB(xx) \ vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ /*******************************************************************************************/ #define KERNEL2x1_1(xx) \ vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL2x1_2(xx) \ vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL2x1_3(xx) \ vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL2x1_4(xx) \ vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ addq $4, BI ;\ addq $8, %rax ;\ #define KERNEL2x1_SUB(xx) \ vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ /*******************************************************************************************/ #define KERNEL1x1_1(xx) \ vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL1x1_2(xx) \ vmovsd -1 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL1x1_3(xx) \ vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL1x1_4(xx) \ vmovsd 1 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ addq $4, BI ;\ addq $4, %rax ;\ #define KERNEL1x1_SUB(xx) \ vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ /*******************************************************************************************/ #if !defined(TRMMKERNEL) PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC vmovaps %xmm3, %xmm0 #else movq 
STACKSIZE + 8(%rsp), LDC #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $0, OLD_M je .L999 cmpq $0, OLD_N je .L999 cmpq $0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovsd %xmm0, ALPHA salq $BASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $6, %rdi divq %rdi // N / 6 movq %rax, Ndiv6 // N / 6 movq %rdx, Nmod6 // N % 6 movq Ndiv6, J cmpq $0, J je .L2_0 ALIGN_4 .L6_01: // copy to sub buffer movq K, %rax salq $1,%rax // K * 2 movq B, BO1 leaq (B,%rax,8), BO2 // next offset to BO2 leaq BUFFER1, BO // first buffer to BO movq K, %rax sarq $2, %rax // K / 4 jz .L6_02a ALIGN_4 .L6_02: prefetcht0 B_PR1(BO1) prefetcht0 B_PR1(BO2) prefetchw B_PR1(BO) vmovups (BO1), %xmm0 vmovups 2*SIZE(BO1), %xmm2 vmovups 4*SIZE(BO1), %xmm4 vmovups 6*SIZE(BO1), %xmm6 vmovsd (BO2), %xmm1 vmovsd 2*SIZE(BO2), %xmm3 vmovsd 4*SIZE(BO2), %xmm5 vmovsd 6*SIZE(BO2), %xmm7 vmovups %xmm0, (BO) vmovsd %xmm1, 2*SIZE(BO) vmovups %xmm2, 3*SIZE(BO) vmovsd %xmm3, 5*SIZE(BO) vmovups %xmm4, 6*SIZE(BO) vmovsd %xmm5, 8*SIZE(BO) vmovups %xmm6, 9*SIZE(BO) vmovsd %xmm7,11*SIZE(BO) addq $ 8*SIZE,BO1 addq $ 8*SIZE,BO2 addq $ 12*SIZE,BO decq %rax jnz .L6_02 .L6_02a: movq K, %rax andq $3, %rax // K % 4 jz .L6_02c ALIGN_4 .L6_02b: vmovups (BO1), %xmm0 vmovsd (BO2), %xmm1 vmovups %xmm0, (BO) vmovsd %xmm1, 2*SIZE(BO) addq $ 2*SIZE,BO1 addq $ 2*SIZE,BO2 addq $ 3*SIZE,BO decq %rax jnz .L6_02b .L6_02c: movq K, %rax salq $1,%rax // K * 2 leaq (B,%rax,8), BO1 // next offset to BO1 leaq (BO1,%rax,8), BO2 // next offset to BO1 leaq BUFFER2, BO // second buffer to BO movq K, %rax sarq $2, %rax // k / 4 jz .L6_03a ALIGN_4 .L6_03: prefetcht0 B_PR1(BO2) prefetchw B_PR1(BO) vmovups (BO2), %xmm0 vmovups 2*SIZE(BO2), %xmm2 vmovups 4*SIZE(BO2), %xmm4 vmovups 6*SIZE(BO2), %xmm6 vmovsd 1*SIZE(BO1), %xmm1 vmovsd 3*SIZE(BO1), %xmm3 vmovsd 5*SIZE(BO1), %xmm5 vmovsd 7*SIZE(BO1), %xmm7 vmovsd %xmm1, 0*SIZE(BO) vmovups %xmm0, 1*SIZE(BO) vmovsd %xmm3, 3*SIZE(BO) vmovups %xmm2, 4*SIZE(BO) vmovsd %xmm5, 6*SIZE(BO) vmovups %xmm4, 7*SIZE(BO) vmovsd %xmm7, 9*SIZE(BO) vmovups %xmm6,10*SIZE(BO) addq $ 8*SIZE,BO1 addq $ 8*SIZE,BO2 addq $ 12*SIZE,BO decq %rax jnz .L6_03 .L6_03a: movq K, %rax andq $3, %rax // K % 4 jz .L6_03c ALIGN_4 .L6_03b: vmovsd 1*SIZE(BO1), %xmm0 vmovups (BO2), %xmm1 vmovsd %xmm0, (BO) vmovups %xmm1, 1*SIZE(BO) addq $ 2*SIZE,BO1 addq $ 2*SIZE,BO2 addq $ 3*SIZE,BO decq %rax jnz .L6_03b .L6_03c: movq BO2, B // next offset of B .L6_10: movq C, CO1 leaq (C, LDC, 2), C leaq (C, LDC, 1), C // c += 3 * ldc movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $3, I // i = (m >> 3) je .L6_20 ALIGN_4 .L6_11: leaq BUFFER1, BO // first buffer to BO addq $12 * SIZE, BO movq K, %rax sarq $3, %rax // K / 8 cmpq $3, %rax jl .L6_13 prefetcht0 B_PR1(BO) prefetcht0 B_PR1+64(BO) prefetcht0 B_PR1+128(BO) KERNEL8x3_INIT KERNEL8x3_M2 KERNEL8x3_M3 KERNEL8x3_M4 KERNEL8x3_M5 KERNEL8x3_M6 KERNEL8x3_M7 KERNEL8x3_M8 subq $2, %rax ALIGN_5 .L6_12: prefetcht0 B_PR1-24(BO) prefetcht0 B_PR1+40(BO) KERNEL8x3_M1 KERNEL8x3_M2 KERNEL8x3_M3 KERNEL8x3_M4 KERNEL8x3_M5 prefetcht0 B_PR1+104(BO) KERNEL8x3_M6 KERNEL8x3_M7 KERNEL8x3_M8 dec %rax jne .L6_12 .L6_12_E: prefetcht0 B_PR1(BO) prefetcht0 B_PR1+64(BO) KERNEL8x3_M1 KERNEL8x3_M2 KERNEL8x3_M3 KERNEL8x3_M4 KERNEL8x3_M5 KERNEL8x3_M6 KERNEL8x3_M7 KERNEL8x3_E jmp .L6_16 .L6_13: test $2, %rax jz .L6_14 KERNEL8x3_INIT KERNEL8x3_M2 KERNEL8x3_M3 KERNEL8x3_M4 KERNEL8x3_M5 KERNEL8x3_M6 KERNEL8x3_M7 KERNEL8x3_M8 KERNEL8x3_M1 KERNEL8x3_M2 KERNEL8x3_M3 KERNEL8x3_M4 
KERNEL8x3_M5 KERNEL8x3_M6 KERNEL8x3_M7 KERNEL8x3_E jmp .L6_16 .L6_14: test $1, %rax jz .L6_15 KERNEL8x3_INIT KERNEL8x3_M2 KERNEL8x3_M3 KERNEL8x3_M4 KERNEL8x3_M5 KERNEL8x3_M6 KERNEL8x3_M7 KERNEL8x3_E jmp .L6_16 .L6_15: INIT8x3 .L6_16: movq K, %rax andq $7, %rax # if (k & 1) je .L6_19 ALIGN_4 .L6_17: KERNEL8x3_SUBN dec %rax jne .L6_17 ALIGN_4 .L6_19: SAVE8x3 decq I # i -- jg .L6_11 /************************************************************************** * Rest of M ***************************************************************************/ .L6_20: // Test rest of M testq $7, M jz .L7_10 // to next 3 lines of N testq $4, M jz .L6_30 ALIGN_4 .L6_21: leaq BUFFER1, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L6_26 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L6_22: KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L6_26 KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L6_26 jmp .L6_22 ALIGN_4 .L6_26: movq K, %rax andq $7, %rax # if (k & 1) je .L6_29 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L6_27: KERNEL4x3_SUB(xxx) addq $3, BI addq $4, %rax jl .L6_27 ALIGN_4 .L6_29: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 2 * SIZE(CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L6_30: testq $2, M jz .L6_40 ALIGN_4 .L6_31: leaq BUFFER1, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L6_36 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L6_32: KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L6_36 KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L6_36 jmp .L6_32 ALIGN_4 .L6_36: movq K, %rax andq $7, %rax # if (k & 1) je .L6_39 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L6_37: KERNEL2x3_SUB(xxx) addq $3, BI addq $2, %rax jl .L6_37 ALIGN_4 .L6_39: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L6_40: testq $1, M jz .L7_10 // to next 3 lines of N ALIGN_4 .L6_41: leaq BUFFER1, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq 
$-8, %rax je .L6_46 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L6_42: KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L6_46 KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L6_46 jmp .L6_42 ALIGN_4 .L6_46: movq K, %rax andq $7, %rax # if (k & 1) je .L6_49 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L6_47: KERNEL1x3_SUB(xxx) addq $3, BI addq $1, %rax jl .L6_47 ALIGN_4 .L6_49: vmovddup ALPHA, %xmm0 vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vmovsd %xmm4 , (CO1) vmovsd %xmm5 , (CO1, LDC) vmovsd %xmm6 , (CO1, LDC, 2) addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 /***************************************************************************************************************/ .L7_10: movq C, CO1 leaq (C, LDC, 2), C leaq (C, LDC, 1), C // c += 3 * ldc movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $3, I // i = (m >> 3) je .L7_20 ALIGN_4 .L7_11: leaq BUFFER2, BO // first buffer to BO addq $12 * SIZE, BO movq K, %rax sarq $3, %rax // K / 8 cmpq $3, %rax jl .L7_13 prefetcht0 B_PR1(BO) prefetcht0 B_PR1+64(BO) prefetcht0 B_PR1+128(BO) KERNEL8x3_INIT KERNEL8x3_M2 KERNEL8x3_M3 KERNEL8x3_M4 KERNEL8x3_M5 KERNEL8x3_M6 KERNEL8x3_M7 KERNEL8x3_M8 subq $2, %rax ALIGN_5 .L7_12: prefetcht0 B_PR1-24(BO) prefetcht0 B_PR1+40(BO) KERNEL8x3_M1 KERNEL8x3_M2 KERNEL8x3_M3 KERNEL8x3_M4 prefetcht0 B_PR1+104(BO) KERNEL8x3_M5 KERNEL8x3_M6 KERNEL8x3_M7 KERNEL8x3_M8 dec %rax jne .L7_12 .L7_12_E: prefetcht0 B_PR1(BO) prefetcht0 B_PR1+64(BO) KERNEL8x3_M1 KERNEL8x3_M2 KERNEL8x3_M3 KERNEL8x3_M4 KERNEL8x3_M5 KERNEL8x3_M6 KERNEL8x3_M7 KERNEL8x3_E jmp .L7_16 .L7_13: test $2, %rax jz .L7_14 KERNEL8x3_INIT KERNEL8x3_M2 KERNEL8x3_M3 KERNEL8x3_M4 KERNEL8x3_M5 KERNEL8x3_M6 KERNEL8x3_M7 KERNEL8x3_M8 KERNEL8x3_M1 KERNEL8x3_M2 KERNEL8x3_M3 KERNEL8x3_M4 KERNEL8x3_M5 KERNEL8x3_M6 KERNEL8x3_M7 KERNEL8x3_E jmp .L7_16 .L7_14: test $1, %rax jz .L7_15 KERNEL8x3_INIT KERNEL8x3_M2 KERNEL8x3_M3 KERNEL8x3_M4 KERNEL8x3_M5 KERNEL8x3_M6 KERNEL8x3_M7 KERNEL8x3_E jmp .L7_16 .L7_15: INIT8x3 .L7_16: movq K, %rax andq $7, %rax # if (k & 1) je .L7_19 ALIGN_4 .L7_17: KERNEL8x3_SUBN dec %rax jne .L7_17 ALIGN_4 .L7_19: SAVE8x3 decq I # i -- jg .L7_11 ALIGN_4 .L7_20: // Test rest of M testq $7, M jz .L7_60 // to next 6 lines of N testq $4, M jz .L7_30 ALIGN_4 .L7_21: leaq BUFFER2, BO // second buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_26 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L7_22: KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L7_26 KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L7_26 jmp .L7_22 ALIGN_4 .L7_26: movq K, %rax andq $7, %rax # if (k & 1) je .L7_29 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 
8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L7_27: KERNEL4x3_SUB(xxx) addq $3, BI addq $4, %rax jl .L7_27 ALIGN_4 .L7_29: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 2 * SIZE(CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L7_30: testq $2, M jz .L7_40 ALIGN_4 .L7_31: leaq BUFFER2, BO // second buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_36 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L7_32: KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L7_36 KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L7_36 jmp .L7_32 ALIGN_4 .L7_36: movq K, %rax andq $7, %rax # if (k & 1) je .L7_39 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L7_37: KERNEL2x3_SUB(xxx) addq $3, BI addq $2, %rax jl .L7_37 ALIGN_4 .L7_39: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L7_40: testq $1, M jz .L7_60 // to next 6 lines of N ALIGN_4 .L7_41: leaq BUFFER2, BO // second buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_46 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L7_42: KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L7_46 KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L7_46 jmp .L7_42 ALIGN_4 .L7_46: movq K, %rax andq $7, %rax # if (k & 1) je .L7_49 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L7_47: KERNEL1x3_SUB(xxx) addq $3, BI addq $1, %rax jl .L7_47 ALIGN_4 .L7_49: vmovddup ALPHA, %xmm0 vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vmovsd %xmm4 , (CO1) vmovsd %xmm5 , (CO1, LDC) vmovsd %xmm6 , (CO1, LDC, 2) addq $1 * SIZE, CO1 # coffset += 1 .L7_60: decq J // j -- jg .L6_01 .L2_0: cmpq $0, Nmod6 // N % 6 == 0 je .L999 /************************************************************************************************ * Loop for Nmod6 / 2 > 0 *************************************************************************************************/ movq Nmod6, J sarq $1, J // j = j / 2 je .L1_0 ALIGN_4 .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 
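/* Editor's note (illustrative, not part of the original source): the
 * .L2_02b loop below copies the current 2-column panel of the packed B
 * (K rows of 2 doubles) into the aligned stack buffer BUFFER1 before the
 * *x2 kernels run.  A rough C sketch, assuming SIZE == sizeof(double)
 * and illustrative names:
 *
 *   static void copy_b_panel_2(const double *b, double *buffer, long k)
 *   {
 *       for (long i = 0; i < k; i++) {       // one xmm load/store per k
 *           buffer[2 * i]     = b[2 * i];
 *           buffer[2 * i + 1] = b[2 * i + 1];
 *       }
 *   }
 */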
.L2_02b: vmovups (BO1), %xmm0 vmovups %xmm0, (BO) addq $2*SIZE,BO1 addq $2*SIZE,BO decq %rax jnz .L2_02b .L2_02c: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $3, I // i = (m >> 3) je .L2_20 ALIGN_4 .L2_11: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax // K = K - ( K % 8 ) je .L2_16 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_12: KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) je .L2_16 KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) je .L2_16 jmp .L2_12 ALIGN_4 .L2_16: movq K, %rax andq $7, %rax # if (k & 1) je .L2_19 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_17: KERNEL8x2_SUB(xxx) addq $2, BI addq $8, %rax jl .L2_17 ALIGN_4 .L2_19: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) vmovups %xmm10, 4 * SIZE(CO1) vmovups %xmm13, 6 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 2 * SIZE(CO1, LDC) vmovups %xmm11, 4 * SIZE(CO1, LDC) vmovups %xmm14, 6 * SIZE(CO1, LDC) addq $8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_20: // Test rest of M testq $7, M jz .L2_60 // to next 2 lines of N testq $4, M jz .L2_30 ALIGN_4 .L2_21: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_26 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_22: KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_26 KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_26 jmp .L2_22 ALIGN_4 .L2_26: movq K, %rax andq $7, %rax # if (k & 1) je .L2_29 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_27: KERNEL4x2_SUB(xxx) addq $2, BI addq $4, %rax jl .L2_27 ALIGN_4 .L2_29: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 2 * SIZE(CO1, LDC) addq $4 * SIZE, CO1 # coffset += 4 
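/* Editor's note (illustrative, not part of the original source): the
 * .L2_19 / .L2_29 style epilogues above all follow the same pattern:
 * broadcast ALPHA with vmovddup, use the FMA forms so each accumulator
 * becomes alpha * acc + C, then store the tile back with vmovups/vmovsd.
 * In rough C terms (illustrative names, ldc counted in elements):
 *
 *   static void save_tile(double *c, long ldc, double alpha,
 *                         const double *acc, int mr, int nr)
 *   {
 *       for (int j = 0; j < nr; j++)
 *           for (int i = 0; i < mr; i++)
 *               c[i + j * ldc] += alpha * acc[i + j * mr];
 *   }
 *
 * Note that only the alpha*acc + C update is visible here; any beta
 * scaling of C appears to be handled outside this kernel.
 */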
ALIGN_4 .L2_30: testq $2, M jz .L2_40 ALIGN_4 .L2_31: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_36 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_32: KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_36 KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_36 jmp .L2_32 ALIGN_4 .L2_36: movq K, %rax andq $7, %rax # if (k & 1) je .L2_39 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_37: KERNEL2x2_SUB(xxx) addq $2, BI addq $2, %rax jl .L2_37 ALIGN_4 .L2_39: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L2_40: testq $1, M jz .L2_60 // to next 2 lines of N ALIGN_4 .L2_41: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_46 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_42: KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 jmp .L2_42 ALIGN_4 .L2_46: movq K, %rax andq $7, %rax # if (k & 1) je .L2_49 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_47: KERNEL1x2_SUB(xxx) addq $2, BI addq $1, %rax jl .L2_47 ALIGN_4 .L2_49: vmovddup ALPHA, %xmm0 vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 vmovsd %xmm4 , (CO1) vmovsd %xmm5 , (CO1, LDC) addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L2_60: decq J // j -- jg .L2_01 // next 2 lines of N .L1_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovsd (BO1), %xmm0 vmovsd %xmm0, (BO) addq $1*SIZE,BO1 addq $1*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $3, I // i = (m >> 3) je .L1_20 ALIGN_4 .L1_11: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax // K = K - ( K % 8 ) je .L1_16 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_12: KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) je .L1_16 KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) 
KERNEL8x1_4(xxx) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) je .L1_16 jmp .L1_12 ALIGN_4 .L1_16: movq K, %rax andq $7, %rax # if (k & 1) je .L1_19 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_17: KERNEL8x1_SUB(xxx) addq $1, BI addq $8, %rax jl .L1_17 ALIGN_4 .L1_19: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) vmovups %xmm10, 4 * SIZE(CO1) vmovups %xmm13, 6 * SIZE(CO1) addq $8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L1_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_20: // Test rest of M testq $7, M jz .L999 testq $4, M jz .L1_30 ALIGN_4 .L1_21: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_26 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_22: KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_26 KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_26 jmp .L1_22 ALIGN_4 .L1_26: movq K, %rax andq $7, %rax # if (k & 1) je .L1_29 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_27: KERNEL4x1_SUB(xxx) addq $1, BI addq $4, %rax jl .L1_27 ALIGN_4 .L1_29: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L1_30: testq $2, M jz .L1_40 ALIGN_4 .L1_31: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_36 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_32: KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_36 KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_36 jmp .L1_32 ALIGN_4 .L1_36: movq K, %rax andq $7, %rax # if (k & 1) je .L1_39 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_37: KERNEL2x1_SUB(xxx) addq $1, BI addq $2, %rax jl .L1_37 ALIGN_4 .L1_39: vmovddup ALPHA, %xmm0 vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vmovups %xmm4 , (CO1) addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L1_40: testq $1, M jz .L999 ALIGN_4 .L1_41: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_46 movq %rax, BI // Index for BO leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_42: KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) 
KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 jmp .L1_42 ALIGN_4 .L1_46: movq K, %rax andq $7, %rax # if (k & 1) je .L1_49 movq %rax, BI // Index for BO leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_47: KERNEL1x1_SUB(xxx) addq $1, BI addq $1, %rax jl .L1_47 ALIGN_4 .L1_49: vmovddup ALPHA, %xmm0 vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 vmovsd %xmm4 , (CO1) addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L999: movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE #else /************************************************************************************* * TRMM Kernel *************************************************************************************/ PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif vmovaps %xmm3, %xmm0 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL movsd STACKSIZE + 16(%rsp), %xmm12 #endif #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $0, OLD_M je .L999 cmpq $0, OLD_N je .L999 cmpq $0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovsd %xmm0, ALPHA salq $BASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $2, %rdi divq %rdi // N / 2 movq %rax, Ndiv6 // N / 2 movq %rdx, Nmod6 // N % 2 #ifdef TRMMKERNEL vmovsd %xmm12, OFFSET vmovsd %xmm12, KK #ifndef LEFT negq KK #endif #endif movq Ndiv6, J cmpq $0, J je .L1_0 ALIGN_4 .L2_0: .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L2_02b: vmovups (BO1), %xmm0 vmovups %xmm0, (BO) addq $2*SIZE,BO1 addq $2*SIZE,BO decq %rax jnz .L2_02b .L2_02c: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $3, I // i = (m >> 3) je .L2_20 ALIGN_4 .L2_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, 8), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK 
#else movq KK, %rax #ifdef LEFT addq $8, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L2_16 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_12: KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) je .L2_16 KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) je .L2_16 jmp .L2_12 ALIGN_4 .L2_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_19 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_17: KERNEL8x2_SUB(xxx) addq $2, BI addq $8, %rax jl .L2_17 ALIGN_4 .L2_19: vmovddup ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 #else vmulpd %xmm0, %xmm4,%xmm4 vmulpd %xmm0, %xmm7,%xmm7 vmulpd %xmm0, %xmm10,%xmm10 vmulpd %xmm0, %xmm13,%xmm13 vmulpd %xmm0, %xmm5,%xmm5 vmulpd %xmm0, %xmm8,%xmm8 vmulpd %xmm0, %xmm11,%xmm11 vmulpd %xmm0, %xmm14,%xmm14 #endif vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) vmovups %xmm10, 4 * SIZE(CO1) vmovups %xmm13, 6 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 2 * SIZE(CO1, LDC) vmovups %xmm11, 4 * SIZE(CO1, LDC) vmovups %xmm14, 6 * SIZE(CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, 8), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_20: // Test rest of M testq $7, M jz .L2_60 // to next 2 lines of N testq $4, M jz .L2_30 ALIGN_4 .L2_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, 8), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_26 movq %rax, BI // Index for BO leaq 
(BI,BI,1), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_22: KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_26 KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_26 jmp .L2_22 ALIGN_4 .L2_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_29 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_27: KERNEL4x2_SUB(xxx) addq $2, BI addq $4, %rax jl .L2_27 ALIGN_4 .L2_29: vmovddup ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 #else vmulpd %xmm0, %xmm4,%xmm4 vmulpd %xmm0, %xmm7,%xmm7 vmulpd %xmm0, %xmm5,%xmm5 vmulpd %xmm0, %xmm8,%xmm8 #endif vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 2 * SIZE(CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, 8), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L2_30: testq $2, M jz .L2_40 ALIGN_4 .L2_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, 8), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, 8), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_36 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_32: KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_36 KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_36 jmp .L2_32 ALIGN_4 .L2_36: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_39 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_37: KERNEL2x2_SUB(xxx) addq $2, BI addq $2, %rax jl .L2_37 ALIGN_4 .L2_39: vmovddup ALPHA, %xmm0 #ifndef TRMMKERNEL 
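/* 2x2 write-back: the GEMM path folds the accumulators into C with a fused
   multiply-add (acc = alpha*acc + C), while the TRMM branch below only scales
   by alpha, since TRMM never reads C.  Roughly, per column j of the pair:
       C[i + j*ldc] = alpha*acc[i][j]  (+ C[i + j*ldc] in the GEMM case)     */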
vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 #else vmulpd %xmm0, %xmm4,%xmm4 vmulpd %xmm0, %xmm5,%xmm5 #endif vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, 8), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, 8), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L2_40: testq $1, M jz .L2_60 // to next 2 lines of N ALIGN_4 .L2_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, 8), BO leaq (AO, %rax, 8), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_46 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_42: KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 jmp .L2_42 ALIGN_4 .L2_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_49 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L2_47: KERNEL1x2_SUB(xxx) addq $2, BI addq $1, %rax jl .L2_47 ALIGN_4 .L2_49: vmovddup ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 #else vmulsd %xmm0, %xmm4,%xmm4 vmulsd %xmm0, %xmm5,%xmm5 #endif vmovsd %xmm4 , (CO1) vmovsd %xmm5 , (CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, 8), BO leaq (AO, %rax, 8), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L2_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif decq J // j -- jg .L2_01 // next 2 lines of N .L1_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovsd (BO1), %xmm0 vmovsd %xmm0, (BO) addq $1*SIZE,BO1 addq $1*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, 
LDC, 1), C // c += 1 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $3, I // i = (m >> 3) je .L1_20 ALIGN_4 .L1_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, 8), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L1_16 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_12: KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) je .L1_16 KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) je .L1_16 jmp .L1_12 ALIGN_4 .L1_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_19 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_17: KERNEL8x1_SUB(xxx) addq $1, BI addq $8, %rax jl .L1_17 ALIGN_4 .L1_19: vmovddup ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 #else vmulpd %xmm0, %xmm4,%xmm4 vmulpd %xmm0, %xmm7,%xmm7 vmulpd %xmm0, %xmm10,%xmm10 vmulpd %xmm0, %xmm13,%xmm13 #endif vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) vmovups %xmm10, 4 * SIZE(CO1) vmovups %xmm13, 6 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, 8), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, 8), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L1_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_20: // Test rest of M testq $7, M jz .L999 testq $4, M jz .L1_30 ALIGN_4 .L1_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, 8), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in AO 
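/* 4x1 tile of the single-column (N % 2) pass: each k step consumes 4 doubles
   from the packed A panel and 1 from B, which is why KK is biased by 4 when
   LEFT is defined and by 1 otherwise before the loop counts are formed.     */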
#else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_26 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_22: KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_26 KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_26 jmp .L1_22 ALIGN_4 .L1_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_29 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_27: KERNEL4x1_SUB(xxx) addq $1, BI addq $4, %rax jl .L1_27 ALIGN_4 .L1_29: vmovddup ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 #else vmulpd %xmm0, %xmm4,%xmm4 vmulpd %xmm0, %xmm7,%xmm7 #endif vmovups %xmm4 , (CO1) vmovups %xmm7 , 2 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, 8), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, 8), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L1_30: testq $2, M jz .L1_40 ALIGN_4 .L1_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, 8), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, 8), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_36 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_32: KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_36 KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_36 jmp .L1_32 ALIGN_4 .L1_36: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_39 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_37: KERNEL2x1_SUB(xxx) addq $1, BI addq $2, %rax jl .L1_37 ALIGN_4 .L1_39: vmovddup ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 #else vmulpd %xmm0, %xmm4,%xmm4 #endif vmovups %xmm4 , (CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, 8), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, 8), AO #endif #if 
defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L1_40: testq $1, M jz .L999 ALIGN_4 .L1_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, 8), BO leaq (AO, %rax, 8), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_46 movq %rax, BI // Index for BO leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_42: KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 jmp .L1_42 ALIGN_4 .L1_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_49 movq %rax, BI // Index for BO leaq (AO, %rax, 8), AO leaq (BO, BI, 8), BO negq BI negq %rax ALIGN_4 .L1_47: KERNEL1x1_SUB(xxx) addq $1, BI addq $1, %rax jl .L1_47 ALIGN_4 .L1_49: vmovddup ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 #else vmulsd %xmm0, %xmm4,%xmm4 #endif vmovsd %xmm4 , (CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, 8), BO leaq (AO, %rax, 8), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L999: movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE #endif OpenBLAS-0.2.20/kernel/x86_64/dgemm_ncopy_2.S000066400000000000000000000255611313527062700202410ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef NEHALEM #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifdef SANDYBRIDGE #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifndef MOVAPS #define MOVAPS movaps #endif #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ #define N ARG2 /* rsi */ #define A ARG3 /* rdx */ #define LDA ARG4 /* rcx */ #define B ARG5 /* r8 */ #define I %r9 #else #define STACKSIZE 256 #define M ARG1 /* rcx */ #define N ARG2 /* rdx */ #define A ARG3 /* r8 */ #define LDA ARG4 /* r9 */ #define OLD_B 40 + 32 + STACKSIZE(%rsp) #define B %r14 #define I %r15 #endif #define J %r10 #define AO1 %r11 #define AO2 %r12 #define MM %r13 PROLOGUE PROFCODE #ifdef WINDOWS_ABI pushq %r15 pushq %r14 #endif pushq %r13 pushq %r12 #ifdef WINDOWS_ABI subq $STACKSIZE, %rsp movups %xmm6, 0(%rsp) movq OLD_B, B #endif leaq (,LDA, SIZE), LDA subq $-16 * SIZE, B movq M, MM leaq -1(M), %rax testq $SIZE, A cmovne %rax, MM testq $SIZE, LDA jne .L50 movq N, J sarq $1, J jle .L30 ALIGN_4 .L21: movq A, AO1 leaq (A, LDA), AO2 leaq (A, LDA, 2), A testq $SIZE, A je .L22 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO2), %xmm1 unpcklpd %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(B) addq $1 * SIZE, AO1 addq $1 * SIZE, AO2 subq $-2 * SIZE, B ALIGN_3 .L22: movq MM, I sarq $3, I jle .L24 ALIGN_4 .L23: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) #endif MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 0 * SIZE(AO2), %xmm1 MOVAPS 2 * SIZE(AO1), %xmm2 MOVAPS 2 * SIZE(AO2), %xmm3 movaps %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm1, %xmm4 unpckhpd %xmm3, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) #endif movaps %xmm0, -16 * SIZE(B) movaps %xmm4, -14 * SIZE(B) movaps %xmm2, -12 * SIZE(B) movaps %xmm6, -10 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) #endif MOVAPS 4 * SIZE(AO1), %xmm0 MOVAPS 4 * SIZE(AO2), %xmm1 MOVAPS 6 * SIZE(AO1), %xmm2 MOVAPS 6 * SIZE(AO2), %xmm3 movaps %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm4 movaps %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) #endif movaps %xmm0, -8 * SIZE(B) movaps %xmm4, -6 * SIZE(B) movaps %xmm2, -4 * SIZE(B) movaps %xmm6, -2 * SIZE(B) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 subq 
$-16 * SIZE, B decq I jg .L23 ALIGN_4 .L24: testq $4, MM jle .L26 MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 0 * SIZE(AO2), %xmm1 MOVAPS 2 * SIZE(AO1), %xmm2 MOVAPS 2 * SIZE(AO2), %xmm3 movaps %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm4 movaps %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm6 movaps %xmm0, -16 * SIZE(B) movaps %xmm4, -14 * SIZE(B) movaps %xmm2, -12 * SIZE(B) movaps %xmm6, -10 * SIZE(B) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-8 * SIZE, B ALIGN_4 .L26: testq $2, MM jle .L28 MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 0 * SIZE(AO2), %xmm1 movaps %xmm0, %xmm2 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm2 movaps %xmm0, -16 * SIZE(B) movaps %xmm2, -14 * SIZE(B) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-4 * SIZE, B ALIGN_4 .L28: testq $1, MM jle .L29 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO2), %xmm1 unpcklpd %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(B) subq $-2 * SIZE, B ALIGN_4 .L29: decq J jg .L21 ALIGN_4 .L30: testq $1, N jle .L999 .L30x: movq A, AO1 testq $SIZE, A jne .L35 movq M, I sarq $3, I jle .L32 ALIGN_4 .L31: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) #endif MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 2 * SIZE(AO1), %xmm1 MOVAPS 4 * SIZE(AO1), %xmm2 MOVAPS 6 * SIZE(AO1), %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) #endif movaps %xmm0, -16 * SIZE(B) movaps %xmm1, -14 * SIZE(B) movaps %xmm2, -12 * SIZE(B) movaps %xmm3, -10 * SIZE(B) addq $8 * SIZE, AO1 addq $8 * SIZE, B decq I jg .L31 ALIGN_4 .L32: testq $4, M jle .L33 MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 2 * SIZE(AO1), %xmm1 movaps %xmm0, -16 * SIZE(B) movaps %xmm1, -14 * SIZE(B) addq $4 * SIZE, AO1 subq $-4 * SIZE, B ALIGN_4 .L33: testq $2, M jle .L34 MOVAPS 0 * SIZE(AO1), %xmm0 movaps %xmm0, -16 * SIZE(B) addq $2 * SIZE, AO1 subq $-2 * SIZE, B ALIGN_4 .L34: testq $1, M jle .L999 movsd 0 * SIZE(AO1), %xmm0 movlpd %xmm0, -16 * SIZE(B) jmp .L999 ALIGN_4 .L35: movaps -1 * SIZE(AO1), %xmm0 movq M, I sarq $3, I jle .L36 ALIGN_4 .L36: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) #endif MOVAPS 1 * SIZE(AO1), %xmm1 MOVAPS 3 * SIZE(AO1), %xmm2 MOVAPS 5 * SIZE(AO1), %xmm3 MOVAPS 7 * SIZE(AO1), %xmm4 shufpd $1, %xmm1, %xmm0 shufpd $1, %xmm2, %xmm1 shufpd $1, %xmm3, %xmm2 shufpd $1, %xmm4, %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) #endif movaps %xmm0, -16 * SIZE(B) movaps %xmm1, -14 * SIZE(B) movaps %xmm2, -12 * SIZE(B) movaps %xmm3, -10 * SIZE(B) movaps %xmm4, %xmm0 addq $8 * SIZE, AO1 subq $-8 * SIZE, B decq I jg .L36 ALIGN_4 .L37: testq $4, M jle .L38 MOVAPS 1 * SIZE(AO1), %xmm1 MOVAPS 3 * SIZE(AO1), %xmm2 shufpd $1, %xmm1, %xmm0 shufpd $1, %xmm2, %xmm1 movaps %xmm0, -16 * SIZE(B) movaps %xmm1, -14 * SIZE(B) movaps %xmm2, %xmm0 addq $4 * SIZE, AO1 addq $4 * SIZE, B ALIGN_4 .L38: testq $2, M jle .L39 MOVAPS 1 * SIZE(AO1), %xmm1 shufpd $1, %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(B) movaps %xmm1, %xmm0 addq $2 * SIZE, AO1 subq $-2 * SIZE, B ALIGN_4 .L39: testq $1, M jle .L999 movhpd %xmm0, -16 * SIZE(B) jmp .L999 ALIGN_4 .L50: movq N, J sarq $1, J jle .L30 ALIGN_4 .L61: movq A, AO1 leaq (A, LDA), AO2 leaq (A, LDA, 2), A testq $SIZE, A je .L62 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO2), %xmm1 unpcklpd %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(B) addq $1 * SIZE, AO1 addq $1 * SIZE, AO2 subq $-2 * SIZE, B ALIGN_3 .L62: MOVAPS -1 * SIZE(AO2), %xmm5 movq MM, I sarq $3, I jle .L64 ALIGN_4 .L63: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) #endif MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 1 * SIZE(AO2), %xmm1 MOVAPS 2 * SIZE(AO1), %xmm2 MOVAPS 3 * SIZE(AO2), %xmm3 movsd 
%xmm0, %xmm5 shufpd $1, %xmm1, %xmm0 movsd %xmm2, %xmm1 shufpd $1, %xmm3, %xmm2 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) #endif movaps %xmm5, -16 * SIZE(B) movaps %xmm0, -14 * SIZE(B) movaps %xmm1, -12 * SIZE(B) movaps %xmm2, -10 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) #endif MOVAPS 4 * SIZE(AO1), %xmm0 MOVAPS 5 * SIZE(AO2), %xmm1 MOVAPS 6 * SIZE(AO1), %xmm2 MOVAPS 7 * SIZE(AO2), %xmm5 movsd %xmm0, %xmm3 shufpd $1, %xmm1, %xmm0 movsd %xmm2, %xmm1 shufpd $1, %xmm5, %xmm2 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) #endif movaps %xmm3, -8 * SIZE(B) movaps %xmm0, -6 * SIZE(B) movaps %xmm1, -4 * SIZE(B) movaps %xmm2, -2 * SIZE(B) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 subq $-16 * SIZE, B decq I jg .L63 ALIGN_4 .L64: testq $4, MM jle .L66 MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 1 * SIZE(AO2), %xmm1 MOVAPS 2 * SIZE(AO1), %xmm2 MOVAPS 3 * SIZE(AO2), %xmm3 movsd %xmm0, %xmm5 shufpd $1, %xmm1, %xmm0 movsd %xmm2, %xmm1 shufpd $1, %xmm3, %xmm2 movaps %xmm5, -16 * SIZE(B) movaps %xmm0, -14 * SIZE(B) movaps %xmm1, -12 * SIZE(B) movaps %xmm2, -10 * SIZE(B) movaps %xmm3, %xmm5 addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-8 * SIZE, B ALIGN_4 .L66: testq $2, MM jle .L68 MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 1 * SIZE(AO2), %xmm1 movsd %xmm0, %xmm5 shufpd $1, %xmm1, %xmm0 movaps %xmm5, -16 * SIZE(B) movaps %xmm0, -14 * SIZE(B) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-4 * SIZE, B ALIGN_4 .L68: testq $1, MM jle .L69 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO2), %xmm1 unpcklpd %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(B) subq $-2 * SIZE, B ALIGN_4 .L69: decq J jg .L61 testq $1, N jne .L30 ALIGN_4 .L999: #ifdef WINDOWS_ABI movups 0(%rsp), %xmm6 addq $STACKSIZE, %rsp #endif popq %r12 popq %r13 #ifdef WINDOWS_ABI popq %r14 popq %r15 #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/dgemm_ncopy_4.S000066400000000000000000000537641313527062700202510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if defined(PENTIUM4) || defined(GENERIC) #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifdef ATOM #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifdef NANO #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifdef OPTERON #define PREFETCHSIZE 16 #define PREFETCH prefetch #define PREFETCHW prefetchw #endif #ifdef GENERIC #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ #define N ARG2 /* rsi */ #define A ARG3 /* rdx */ #define LDA ARG4 /* rcx */ #define B ARG5 /* r8 */ #define I %r9 #else #define STACKSIZE 256 #define M ARG1 /* rcx */ #define N ARG2 /* rdx */ #define A ARG3 /* r8 */ #define LDA ARG4 /* r9 */ #define OLD_B 40 + 32 + STACKSIZE(%rsp) #define B %r14 #define I %r15 #endif #define J %r10 #define AO1 %r11 #define AO2 %r12 #define MM %r13 PROLOGUE PROFCODE #ifdef WINDOWS_ABI pushq %r15 pushq %r14 #endif pushq %r13 pushq %r12 #ifdef WINDOWS_ABI subq $STACKSIZE, %rsp movups %xmm6, 0(%rsp) movups %xmm7, 16(%rsp) movq OLD_B, B #endif leaq (,LDA, SIZE), LDA subq $-16 * SIZE, B movq M, MM leaq -1(M), %rax testq $SIZE, A cmovne %rax, MM testq $SIZE, LDA jne .L50 movq N, J sarq $2, J jle .L20 ALIGN_4 .L11: movq A, AO1 leaq (A, LDA, 2), AO2 leaq (A, LDA, 4), A testq $SIZE, A je .L12 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO1, LDA), %xmm1 movsd 0 * SIZE(AO2), %xmm2 movsd 0 * SIZE(AO2, LDA), %xmm3 unpcklpd %xmm1, %xmm0 unpcklpd %xmm3, %xmm2 movapd %xmm0, -16 * SIZE(B) movapd %xmm2, -14 * SIZE(B) addq $1 * SIZE, AO1 addq $1 * SIZE, AO2 subq $-4 * SIZE, B ALIGN_3 .L12: movq MM, I sarq $3, I jle .L14 ALIGN_4 .L13: #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1) #endif movapd 0 * SIZE(AO1), %xmm0 movapd 0 * SIZE(AO1, LDA), %xmm1 movapd 0 * SIZE(AO2), %xmm2 movapd 0 * SIZE(AO2, LDA), %xmm3 movapd %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 movapd %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm1, %xmm4 unpckhpd %xmm3, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) #endif movapd %xmm0, -16 * SIZE(B) movapd %xmm2, -14 * SIZE(B) movapd %xmm4, -12 * SIZE(B) movapd %xmm6, -10 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) #endif movapd 2 * SIZE(AO1), %xmm0 movapd 2 * SIZE(AO1, LDA), %xmm1 movapd 2 * SIZE(AO2), %xmm2 movapd 2 * SIZE(AO2, LDA), %xmm3 movapd %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 movapd %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm1, %xmm4 unpckhpd %xmm3, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) #endif movapd %xmm0, -8 * SIZE(B) movapd %xmm2, -6 * SIZE(B) movapd %xmm4, -4 * SIZE(B) movapd %xmm6, -2 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO2) #endif movapd 4 * SIZE(AO1), %xmm0 movapd 4 * SIZE(AO1, LDA), %xmm1 movapd 4 * SIZE(AO2), %xmm2 movapd 4 * SIZE(AO2, LDA), %xmm3 movapd %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 movapd %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd 
%xmm1, %xmm4 unpckhpd %xmm3, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 16) * SIZE(B) #endif movapd %xmm0, 0 * SIZE(B) movapd %xmm2, 2 * SIZE(B) movapd %xmm4, 4 * SIZE(B) movapd %xmm6, 6 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO2, LDA) #endif movapd 6 * SIZE(AO1), %xmm0 movapd 6 * SIZE(AO1, LDA), %xmm1 movapd 6 * SIZE(AO2), %xmm2 movapd 6 * SIZE(AO2, LDA), %xmm3 movapd %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 movapd %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm1, %xmm4 unpckhpd %xmm3, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 24) * SIZE(B) #endif movapd %xmm0, 8 * SIZE(B) movapd %xmm2, 10 * SIZE(B) movapd %xmm4, 12 * SIZE(B) movapd %xmm6, 14 * SIZE(B) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 subq $-32 * SIZE, B decq I jg .L13 ALIGN_4 .L14: testq $4, MM jle .L16 movapd 0 * SIZE(AO1), %xmm0 movapd 0 * SIZE(AO1, LDA), %xmm1 movapd 0 * SIZE(AO2), %xmm2 movapd 0 * SIZE(AO2, LDA), %xmm3 movapd %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 movapd %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm1, %xmm4 unpckhpd %xmm3, %xmm6 movapd %xmm0, -16 * SIZE(B) movapd %xmm2, -14 * SIZE(B) movapd %xmm4, -12 * SIZE(B) movapd %xmm6, -10 * SIZE(B) movapd 2 * SIZE(AO1), %xmm0 movapd 2 * SIZE(AO1, LDA), %xmm1 movapd 2 * SIZE(AO2), %xmm2 movapd 2 * SIZE(AO2, LDA), %xmm3 movapd %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 movapd %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm1, %xmm4 unpckhpd %xmm3, %xmm6 movapd %xmm0, -8 * SIZE(B) movapd %xmm2, -6 * SIZE(B) movapd %xmm4, -4 * SIZE(B) movapd %xmm6, -2 * SIZE(B) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-16 * SIZE, B ALIGN_4 .L16: testq $2, MM jle .L18 movapd 0 * SIZE(AO1), %xmm0 movapd 0 * SIZE(AO1, LDA), %xmm1 movapd 0 * SIZE(AO2), %xmm2 movapd 0 * SIZE(AO2, LDA), %xmm3 movapd %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 movapd %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm1, %xmm4 unpckhpd %xmm3, %xmm6 movapd %xmm0, -16 * SIZE(B) movapd %xmm2, -14 * SIZE(B) movapd %xmm4, -12 * SIZE(B) movapd %xmm6, -10 * SIZE(B) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-8 * SIZE, B ALIGN_4 .L18: testq $1, MM jle .L19 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO1, LDA), %xmm1 movsd 0 * SIZE(AO2), %xmm2 movsd 0 * SIZE(AO2, LDA), %xmm3 unpcklpd %xmm1, %xmm0 unpcklpd %xmm3, %xmm2 movapd %xmm0, -16 * SIZE(B) movapd %xmm2, -14 * SIZE(B) subq $-4 * SIZE, B ALIGN_4 .L19: decq J jg .L11 ALIGN_4 .L20: testq $2, N jle .L30 movq A, AO1 leaq (A, LDA), AO2 leaq (A, LDA, 2), A testq $SIZE, A je .L22 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO2), %xmm1 unpcklpd %xmm1, %xmm0 movapd %xmm0, -16 * SIZE(B) addq $1 * SIZE, AO1 addq $1 * SIZE, AO2 subq $-2 * SIZE, B ALIGN_3 .L22: movq MM, I sarq $3, I jle .L24 ALIGN_4 .L23: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) #endif movapd 0 * SIZE(AO1), %xmm0 movapd 0 * SIZE(AO2), %xmm1 movapd 2 * SIZE(AO1), %xmm2 movapd 2 * SIZE(AO2), %xmm3 movapd %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 movapd %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm1, %xmm4 unpckhpd %xmm3, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) #endif movapd %xmm0, -16 * SIZE(B) movapd %xmm4, -14 * SIZE(B) movapd %xmm2, -12 * SIZE(B) movapd %xmm6, -10 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) #endif movapd 4 * SIZE(AO1), %xmm0 movapd 4 * SIZE(AO2), %xmm1 movapd 6 * SIZE(AO1), %xmm2 movapd 6 * SIZE(AO2), %xmm3 movapd %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 movapd %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm1, %xmm4 unpckhpd %xmm3, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) #endif movapd %xmm0, -8 * SIZE(B) 
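/* Two-column copy: the unpcklpd/unpckhpd pairs above interleave the AO1 and
   AO2 columns so that B receives consecutive (a(i,j), a(i,j+1)) pairs; the
   remaining stores finish this 8-row chunk before the pointers advance.     */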
movapd %xmm4, -6 * SIZE(B) movapd %xmm2, -4 * SIZE(B) movapd %xmm6, -2 * SIZE(B) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 subq $-16 * SIZE, B decq I jg .L23 ALIGN_4 .L24: testq $4, MM jle .L26 movapd 0 * SIZE(AO1), %xmm0 movapd 0 * SIZE(AO2), %xmm1 movapd 2 * SIZE(AO1), %xmm2 movapd 2 * SIZE(AO2), %xmm3 movapd %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm4 movapd %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm6 movapd %xmm0, -16 * SIZE(B) movapd %xmm4, -14 * SIZE(B) movapd %xmm2, -12 * SIZE(B) movapd %xmm6, -10 * SIZE(B) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-8 * SIZE, B ALIGN_4 .L26: testq $2, MM jle .L28 movapd 0 * SIZE(AO1), %xmm0 movapd 0 * SIZE(AO2), %xmm1 movapd %xmm0, %xmm2 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm2 movapd %xmm0, -16 * SIZE(B) movapd %xmm2, -14 * SIZE(B) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-4 * SIZE, B ALIGN_4 .L28: testq $1, MM jle .L30 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO2), %xmm1 unpcklpd %xmm1, %xmm0 movapd %xmm0, -16 * SIZE(B) subq $-2 * SIZE, B ALIGN_4 .L30: testq $1, N jle .L999 movq A, AO1 testq $SIZE, A jne .L35 movq MM, I sarq $3, I jle .L32 ALIGN_4 .L31: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) #endif movapd 0 * SIZE(AO1), %xmm0 movapd 2 * SIZE(AO1), %xmm1 movapd 4 * SIZE(AO1), %xmm2 movapd 6 * SIZE(AO1), %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) #endif movapd %xmm0, -16 * SIZE(B) movapd %xmm1, -14 * SIZE(B) movapd %xmm2, -12 * SIZE(B) movapd %xmm3, -10 * SIZE(B) addq $8 * SIZE, AO1 subq $-8 * SIZE, B decq I jg .L31 ALIGN_4 .L32: testq $4, MM jle .L33 movapd 0 * SIZE(AO1), %xmm0 movapd 2 * SIZE(AO1), %xmm1 movapd %xmm0, -16 * SIZE(B) movapd %xmm1, -14 * SIZE(B) addq $4 * SIZE, AO1 subq $-4 * SIZE, B ALIGN_4 .L33: testq $2, MM jle .L34 movapd 0 * SIZE(AO1), %xmm0 movapd %xmm0, -16 * SIZE(B) addq $2 * SIZE, AO1 subq $-2 * SIZE, B ALIGN_4 .L34: testq $1, MM jle .L999 movsd 0 * SIZE(AO1), %xmm0 movlpd %xmm0, -16 * SIZE(B) jmp .L999 ALIGN_4 .L35: movapd -1 * SIZE(AO1), %xmm0 movq MM, I sarq $3, I jle .L36 ALIGN_4 .L36: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) #endif movapd 1 * SIZE(AO1), %xmm1 movapd 3 * SIZE(AO1), %xmm2 movapd 5 * SIZE(AO1), %xmm3 movapd 7 * SIZE(AO1), %xmm4 shufpd $1, %xmm1, %xmm0 shufpd $1, %xmm2, %xmm1 shufpd $1, %xmm3, %xmm2 shufpd $1, %xmm4, %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) #endif movapd %xmm0, -16 * SIZE(B) movapd %xmm1, -14 * SIZE(B) movapd %xmm2, -12 * SIZE(B) movapd %xmm3, -10 * SIZE(B) movapd %xmm4, %xmm0 addq $8 * SIZE, AO1 subq $-8 * SIZE, B decq I jg .L36 ALIGN_4 .L37: testq $4, MM jle .L38 movapd 1 * SIZE(AO1), %xmm1 movapd 3 * SIZE(AO1), %xmm2 shufpd $1, %xmm1, %xmm0 shufpd $1, %xmm2, %xmm1 movapd %xmm0, -16 * SIZE(B) movapd %xmm1, -14 * SIZE(B) movapd %xmm2, %xmm0 addq $4 * SIZE, AO1 addq $4 * SIZE, B ALIGN_4 .L38: testq $2, MM jle .L39 movapd 1 * SIZE(AO1), %xmm1 shufpd $1, %xmm1, %xmm0 movapd %xmm0, -16 * SIZE(B) movapd %xmm1, %xmm0 addq $2 * SIZE, AO1 subq $-2 * SIZE, B ALIGN_4 .L39: testq $1, MM jle .L999 shufpd $1, %xmm0, %xmm0 movlpd %xmm0, -16 * SIZE(B) jmp .L999 ALIGN_4 .L50: movq N, J sarq $2, J jle .L60 ALIGN_4 .L51: movq A, AO1 leaq (A, LDA, 2), AO2 leaq (A, LDA, 4), A testq $SIZE, A je .L52 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO1, LDA), %xmm1 movsd 0 * SIZE(AO2), %xmm2 movsd 0 * SIZE(AO2, LDA), %xmm3 unpcklpd %xmm1, %xmm0 unpcklpd %xmm3, %xmm2 movapd %xmm0, -16 * SIZE(B) movapd %xmm2, -14 * SIZE(B) addq $1 * SIZE, AO1 addq $1 * SIZE, AO2 subq $-4 * SIZE, B ALIGN_3 .L52: movapd -1 * 
SIZE(AO1, LDA), %xmm5 movapd -1 * SIZE(AO2, LDA), %xmm7 movq MM, I sarq $3, I jle .L54 ALIGN_4 .L53: #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1) #endif movapd 0 * SIZE(AO1), %xmm0 movapd 1 * SIZE(AO1, LDA), %xmm1 movapd 0 * SIZE(AO2), %xmm2 movapd 1 * SIZE(AO2, LDA), %xmm3 movsd %xmm0, %xmm5 movsd %xmm2, %xmm7 shufpd $1, %xmm1, %xmm0 shufpd $1, %xmm3, %xmm2 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) #endif movapd %xmm5, -16 * SIZE(B) movapd %xmm7, -14 * SIZE(B) movapd %xmm0, -12 * SIZE(B) movapd %xmm2, -10 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) #endif movapd 2 * SIZE(AO1), %xmm0 movapd 3 * SIZE(AO1, LDA), %xmm5 movapd 2 * SIZE(AO2), %xmm2 movapd 3 * SIZE(AO2, LDA), %xmm7 movsd %xmm0, %xmm1 movsd %xmm2, %xmm3 shufpd $1, %xmm5, %xmm0 shufpd $1, %xmm7, %xmm2 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) #endif movapd %xmm1, -8 * SIZE(B) movapd %xmm3, -6 * SIZE(B) movapd %xmm0, -4 * SIZE(B) movapd %xmm2, -2 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO2) #endif movapd 4 * SIZE(AO1), %xmm0 movapd 5 * SIZE(AO1, LDA), %xmm1 movapd 4 * SIZE(AO2), %xmm2 movapd 5 * SIZE(AO2, LDA), %xmm3 movsd %xmm0, %xmm5 movsd %xmm2, %xmm7 shufpd $1, %xmm1, %xmm0 shufpd $1, %xmm3, %xmm2 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 16) * SIZE(B) #endif movapd %xmm5, 0 * SIZE(B) movapd %xmm7, 2 * SIZE(B) movapd %xmm0, 4 * SIZE(B) movapd %xmm2, 6 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO2, LDA) #endif movapd 6 * SIZE(AO1), %xmm0 movapd 7 * SIZE(AO1, LDA), %xmm5 movapd 6 * SIZE(AO2), %xmm2 movapd 7 * SIZE(AO2, LDA), %xmm7 movsd %xmm0, %xmm1 movsd %xmm2, %xmm3 shufpd $1, %xmm5, %xmm0 shufpd $1, %xmm7, %xmm2 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 24) * SIZE(B) #endif movapd %xmm1, 8 * SIZE(B) movapd %xmm3, 10 * SIZE(B) movapd %xmm0, 12 * SIZE(B) movapd %xmm2, 14 * SIZE(B) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 subq $-32 * SIZE, B decq I jg .L53 ALIGN_4 .L54: testq $4, MM jle .L56 movapd 0 * SIZE(AO1), %xmm0 movapd 1 * SIZE(AO1, LDA), %xmm1 movapd 0 * SIZE(AO2), %xmm2 movapd 1 * SIZE(AO2, LDA), %xmm3 movsd %xmm0, %xmm5 shufpd $1, %xmm1, %xmm0 movsd %xmm2, %xmm7 shufpd $1, %xmm3, %xmm2 movapd %xmm5, -16 * SIZE(B) movapd %xmm7, -14 * SIZE(B) movapd %xmm0, -12 * SIZE(B) movapd %xmm2, -10 * SIZE(B) movapd 2 * SIZE(AO1), %xmm0 movapd 3 * SIZE(AO1, LDA), %xmm5 movapd 2 * SIZE(AO2), %xmm2 movapd 3 * SIZE(AO2, LDA), %xmm7 movsd %xmm0, %xmm1 shufpd $1, %xmm5, %xmm0 movsd %xmm2, %xmm3 shufpd $1, %xmm7, %xmm2 movapd %xmm1, -8 * SIZE(B) movapd %xmm3, -6 * SIZE(B) movapd %xmm0, -4 * SIZE(B) movapd %xmm2, -2 * SIZE(B) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-16 * SIZE, B ALIGN_4 .L56: testq $2, MM jle .L58 movapd 0 * SIZE(AO1), %xmm0 movapd 1 * SIZE(AO1, LDA), %xmm1 movapd 0 * SIZE(AO2), %xmm2 movapd 1 * SIZE(AO2, LDA), %xmm3 movsd %xmm0, %xmm5 movsd %xmm2, %xmm7 shufpd $1, %xmm1, %xmm0 shufpd $1, %xmm3, %xmm2 movapd %xmm5, -16 * SIZE(B) movapd %xmm7, -14 * SIZE(B) movapd %xmm0, -12 * SIZE(B) movapd %xmm2, -10 * SIZE(B) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-8 * SIZE, B ALIGN_4 .L58: testq $1, MM jle .L59 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO1, LDA), %xmm1 movsd 0 * SIZE(AO2), %xmm2 movsd 0 * SIZE(AO2, LDA), %xmm3 unpcklpd %xmm1, %xmm0 unpcklpd %xmm3, %xmm2 movapd %xmm0, -16 * SIZE(B) movapd %xmm2, -14 * SIZE(B) subq $-4 * SIZE, B ALIGN_4 .L59: decq J jg .L51 ALIGN_4 .L60: testq $2, N jle .L70 movq A, AO1 leaq (A, LDA), AO2 leaq (A, LDA, 2), A testq $SIZE, A je .L62 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO2), 
%xmm1 unpcklpd %xmm1, %xmm0 movapd %xmm0, -16 * SIZE(B) addq $1 * SIZE, AO1 addq $1 * SIZE, AO2 subq $-2 * SIZE, B ALIGN_3 .L62: movapd -1 * SIZE(AO2), %xmm5 movq MM, I sarq $3, I jle .L64 ALIGN_4 .L63: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) #endif movapd 0 * SIZE(AO1), %xmm0 movapd 1 * SIZE(AO2), %xmm1 movapd 2 * SIZE(AO1), %xmm2 movapd 3 * SIZE(AO2), %xmm3 movsd %xmm0, %xmm5 shufpd $1, %xmm1, %xmm0 movsd %xmm2, %xmm1 shufpd $1, %xmm3, %xmm2 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) #endif movapd %xmm5, -16 * SIZE(B) movapd %xmm0, -14 * SIZE(B) movapd %xmm1, -12 * SIZE(B) movapd %xmm2, -10 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) #endif movapd 4 * SIZE(AO1), %xmm0 movapd 5 * SIZE(AO2), %xmm1 movapd 6 * SIZE(AO1), %xmm2 movapd 7 * SIZE(AO2), %xmm5 movsd %xmm0, %xmm3 shufpd $1, %xmm1, %xmm0 movsd %xmm2, %xmm1 shufpd $1, %xmm5, %xmm2 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) #endif movapd %xmm3, -8 * SIZE(B) movapd %xmm0, -6 * SIZE(B) movapd %xmm1, -4 * SIZE(B) movapd %xmm2, -2 * SIZE(B) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 subq $-16 * SIZE, B decq I jg .L63 ALIGN_4 .L64: testq $4, MM jle .L66 movapd 0 * SIZE(AO1), %xmm0 movapd 1 * SIZE(AO2), %xmm1 movapd 2 * SIZE(AO1), %xmm2 movapd 3 * SIZE(AO2), %xmm3 movsd %xmm0, %xmm5 shufpd $1, %xmm1, %xmm0 movsd %xmm2, %xmm1 shufpd $1, %xmm3, %xmm2 movapd %xmm5, -16 * SIZE(B) movapd %xmm0, -14 * SIZE(B) movapd %xmm1, -12 * SIZE(B) movapd %xmm2, -10 * SIZE(B) movaps %xmm3, %xmm5 addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-8 * SIZE, B ALIGN_4 .L66: testq $2, MM jle .L68 movapd 0 * SIZE(AO1), %xmm0 movapd 1 * SIZE(AO2), %xmm1 movsd %xmm0, %xmm5 shufpd $1, %xmm1, %xmm0 movapd %xmm5, -16 * SIZE(B) movapd %xmm0, -14 * SIZE(B) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-4 * SIZE, B ALIGN_4 .L68: testq $1, MM jle .L70 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO2), %xmm1 unpcklpd %xmm1, %xmm0 movapd %xmm0, -16 * SIZE(B) subq $-2 * SIZE, B ALIGN_4 .L70: testq $1, N jle .L999 movq A, AO1 testq $SIZE, A jne .L75 movq MM, I sarq $3, I jle .L72 ALIGN_4 .L71: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) #endif movapd 0 * SIZE(AO1), %xmm0 movapd 2 * SIZE(AO1), %xmm2 movapd 4 * SIZE(AO1), %xmm4 movapd 6 * SIZE(AO1), %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) #endif movapd %xmm0, -16 * SIZE(B) movapd %xmm2, -14 * SIZE(B) movapd %xmm4, -12 * SIZE(B) movapd %xmm6, -10 * SIZE(B) addq $8 * SIZE, AO1 subq $-8 * SIZE, B decq I jg .L71 ALIGN_4 .L72: testq $4, MM jle .L73 movapd 0 * SIZE(AO1), %xmm0 movapd 2 * SIZE(AO1), %xmm2 movapd %xmm0, -16 * SIZE(B) movapd %xmm2, -14 * SIZE(B) addq $4 * SIZE, AO1 subq $-4 * SIZE, B ALIGN_4 .L73: testq $2, MM jle .L74 movapd 0 * SIZE(AO1), %xmm0 movapd %xmm0, -16 * SIZE(B) addq $2 * SIZE, AO1 subq $-2 * SIZE, B ALIGN_4 .L74: testq $1, MM jle .L999 movsd 0 * SIZE(AO1), %xmm0 movlpd %xmm0, -16 * SIZE(B) jmp .L999 ALIGN_4 .L75: movapd -1 * SIZE(AO1), %xmm0 movq MM, I sarq $3, I jle .L76 ALIGN_4 .L76: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) #endif movapd 1 * SIZE(AO1), %xmm1 movapd 3 * SIZE(AO1), %xmm2 movapd 5 * SIZE(AO1), %xmm3 movapd 7 * SIZE(AO1), %xmm4 shufpd $1, %xmm1, %xmm0 shufpd $1, %xmm2, %xmm1 shufpd $1, %xmm3, %xmm2 shufpd $1, %xmm4, %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) #endif movapd %xmm0, -16 * SIZE(B) movapd %xmm1, -14 * SIZE(B) movapd %xmm2, -12 * SIZE(B) movapd %xmm3, -10 * SIZE(B) movapd %xmm4, %xmm0 addq $8 * SIZE, AO1 subq $-8 * SIZE, B decq I jg .L76 ALIGN_4 .L77: 
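/* Tail of the misaligned single-column copy: %xmm0 still holds the element
   carried over from the previous aligned pair, so each 4/2/1-row remainder
   below keeps shifting it through with shufpd before storing to B.          */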
testq $4, MM jle .L78 movapd 1 * SIZE(AO1), %xmm1 movapd 3 * SIZE(AO1), %xmm2 shufpd $1, %xmm1, %xmm0 shufpd $1, %xmm2, %xmm1 movapd %xmm0, -16 * SIZE(B) movapd %xmm1, -14 * SIZE(B) movapd %xmm2, %xmm0 addq $4 * SIZE, AO1 addq $4 * SIZE, B ALIGN_4 .L78: testq $2, MM jle .L79 movapd 1 * SIZE(AO1), %xmm1 shufpd $1, %xmm1, %xmm0 movapd %xmm0, -16 * SIZE(B) movapd %xmm1, %xmm0 addq $2 * SIZE, AO1 subq $-2 * SIZE, B ALIGN_4 .L79: testq $1, MM jle .L999 shufpd $1, %xmm0, %xmm0 movlpd %xmm0, -16 * SIZE(B) ALIGN_4 .L999: #ifdef WINDOWS_ABI movups 0(%rsp), %xmm6 movups 16(%rsp), %xmm7 addq $STACKSIZE, %rsp #endif popq %r12 popq %r13 #ifdef WINDOWS_ABI popq %r14 popq %r15 #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/dgemm_ncopy_8.S000066400000000000000000001135501313527062700202430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef NEHALEM #define PREFETCHSIZE 12 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifdef SANDYBRIDGE #define PREFETCHSIZE 12 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifndef MOVAPS #define MOVAPS movaps #endif #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ #define N ARG2 /* rsi */ #define A ARG3 /* rdx */ #define LDA ARG4 /* rcx */ #define B ARG5 /* r8 */ #define AO1 %r9 #define AO2 %r10 #define LDA3 %r11 #define J %r12 #define MM %r13 #else #define STACKSIZE 128 #define M ARG1 /* rcx */ #define N ARG2 /* rdx */ #define A ARG3 /* r8 */ #define LDA ARG4 /* r9 */ #define OLD_B 40 + 32 + STACKSIZE(%rsp) #define B %r15 #define AO1 %r10 #define AO2 %r11 #define LDA3 %r12 #define J %r13 #define MM %r14 #endif #define I %rax PROLOGUE PROFCODE #ifdef WINDOWS_ABI pushq %r15 pushq %r14 #endif pushq %r13 pushq %r12 #ifdef WINDOWS_ABI subq $STACKSIZE, %rsp movups %xmm6, 0(%rsp) movups %xmm7, 16(%rsp) movups %xmm8, 32(%rsp) movups %xmm9, 48(%rsp) movups %xmm10, 64(%rsp) movups %xmm11, 80(%rsp) movups %xmm12, 96(%rsp) movq OLD_B, B #endif leaq (,LDA, SIZE), LDA leaq (LDA, LDA, 2), LDA3 subq $-16 * SIZE, B movq M, MM leaq -1(M), %rax testq $SIZE, A cmovne %rax, MM testq $SIZE, LDA jne .L50 movq N, J sarq $3, J jle .L20 ALIGN_4 .L11: movq A, AO1 leaq (A, LDA, 4), AO2 leaq (A, LDA, 8), A testq $SIZE, A je .L12 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO1, LDA), %xmm1 movsd 0 * SIZE(AO1, LDA, 2), %xmm2 movsd 0 * SIZE(AO1, LDA3), %xmm3 movsd 0 * SIZE(AO2), %xmm4 movsd 0 * SIZE(AO2, LDA), %xmm5 movsd 0 * SIZE(AO2, LDA, 2), %xmm6 movsd 0 * SIZE(AO2, LDA3), %xmm7 unpcklpd %xmm1, %xmm0 unpcklpd %xmm3, %xmm2 unpcklpd %xmm5, %xmm4 unpcklpd %xmm7, %xmm6 movaps %xmm0, -16 * SIZE(B) movaps %xmm2, -14 * SIZE(B) movaps %xmm4, -12 * SIZE(B) movaps %xmm6, -10 * SIZE(B) addq $1 * SIZE, AO1 addq $1 * SIZE, AO2 subq $-8 * SIZE, B ALIGN_3 .L12: movq MM, I sarq $3, I jle .L14 ALIGN_4 .L13: #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1) #endif MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 0 * SIZE(AO1, LDA), %xmm1 MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 MOVAPS 0 * SIZE(AO1, LDA3), %xmm3 movaps %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm9 unpcklpd %xmm3, %xmm2 #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) #endif MOVAPS 0 * SIZE(AO2), %xmm4 MOVAPS 0 * SIZE(AO2, LDA), %xmm5 MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 MOVAPS 0 * SIZE(AO2, LDA3), %xmm7 movaps %xmm4, %xmm10 unpcklpd %xmm5, %xmm4 movaps %xmm6, %xmm11 unpcklpd %xmm7, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) #endif movaps %xmm0, -16 * SIZE(B) movaps %xmm2, -14 * SIZE(B) movaps %xmm4, -12 * SIZE(B) movaps %xmm6, -10 * SIZE(B) unpckhpd %xmm1, %xmm8 unpckhpd %xmm3, %xmm9 unpckhpd %xmm5, %xmm10 unpckhpd %xmm7, %xmm11 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) #endif movaps %xmm8, -8 * SIZE(B) movaps %xmm9, -6 * SIZE(B) movaps %xmm10, -4 * SIZE(B) movaps %xmm11, -2 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1, LDA, 2) #endif MOVAPS 2 * SIZE(AO1), %xmm0 MOVAPS 2 * SIZE(AO1, LDA), %xmm1 MOVAPS 2 * SIZE(AO1, LDA, 2), %xmm2 MOVAPS 2 * SIZE(AO1, LDA3), %xmm3 movaps %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm9 unpcklpd %xmm3, %xmm2 #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1, LDA3) #endif MOVAPS 2 * SIZE(AO2), %xmm4 MOVAPS 2 * SIZE(AO2, LDA), %xmm5 MOVAPS 2 * SIZE(AO2, LDA, 2), %xmm6 MOVAPS 2 * SIZE(AO2, LDA3), %xmm7 movaps %xmm4, %xmm10 unpcklpd %xmm5, 
%xmm4 movaps %xmm6, %xmm11 unpcklpd %xmm7, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 16) * SIZE(B) #endif movaps %xmm0, 0 * SIZE(B) movaps %xmm2, 2 * SIZE(B) movaps %xmm4, 4 * SIZE(B) movaps %xmm6, 6 * SIZE(B) unpckhpd %xmm1, %xmm8 unpckhpd %xmm3, %xmm9 unpckhpd %xmm5, %xmm10 unpckhpd %xmm7, %xmm11 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 24) * SIZE(B) #endif movaps %xmm8, 8 * SIZE(B) movaps %xmm9, 10 * SIZE(B) movaps %xmm10, 12 * SIZE(B) movaps %xmm11, 14 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO2) #endif MOVAPS 4 * SIZE(AO1), %xmm0 MOVAPS 4 * SIZE(AO1, LDA), %xmm1 MOVAPS 4 * SIZE(AO1, LDA, 2), %xmm2 MOVAPS 4 * SIZE(AO1, LDA3), %xmm3 movaps %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm9 unpcklpd %xmm3, %xmm2 #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO2, LDA) #endif MOVAPS 4 * SIZE(AO2), %xmm4 MOVAPS 4 * SIZE(AO2, LDA), %xmm5 MOVAPS 4 * SIZE(AO2, LDA, 2), %xmm6 MOVAPS 4 * SIZE(AO2, LDA3), %xmm7 movaps %xmm4, %xmm10 unpcklpd %xmm5, %xmm4 movaps %xmm6, %xmm11 unpcklpd %xmm7, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 32) * SIZE(B) #endif movaps %xmm0, 16 * SIZE(B) movaps %xmm2, 18 * SIZE(B) movaps %xmm4, 20 * SIZE(B) movaps %xmm6, 22 * SIZE(B) unpckhpd %xmm1, %xmm8 unpckhpd %xmm3, %xmm9 unpckhpd %xmm5, %xmm10 unpckhpd %xmm7, %xmm11 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 40) * SIZE(B) #endif movaps %xmm8, 24 * SIZE(B) movaps %xmm9, 26 * SIZE(B) movaps %xmm10, 28 * SIZE(B) movaps %xmm11, 30 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO2, LDA, 2) #endif MOVAPS 6 * SIZE(AO1), %xmm0 MOVAPS 6 * SIZE(AO1, LDA), %xmm1 MOVAPS 6 * SIZE(AO1, LDA, 2), %xmm2 MOVAPS 6 * SIZE(AO1, LDA3), %xmm3 movaps %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm9 unpcklpd %xmm3, %xmm2 #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO2, LDA3) #endif MOVAPS 6 * SIZE(AO2), %xmm4 MOVAPS 6 * SIZE(AO2, LDA), %xmm5 MOVAPS 6 * SIZE(AO2, LDA, 2), %xmm6 MOVAPS 6 * SIZE(AO2, LDA3), %xmm7 movaps %xmm4, %xmm10 unpcklpd %xmm5, %xmm4 movaps %xmm6, %xmm11 unpcklpd %xmm7, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 48) * SIZE(B) #endif movaps %xmm0, 32 * SIZE(B) movaps %xmm2, 34 * SIZE(B) movaps %xmm4, 36 * SIZE(B) movaps %xmm6, 38 * SIZE(B) unpckhpd %xmm1, %xmm8 unpckhpd %xmm3, %xmm9 unpckhpd %xmm5, %xmm10 unpckhpd %xmm7, %xmm11 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 56) * SIZE(B) #endif movaps %xmm8, 40 * SIZE(B) movaps %xmm9, 42 * SIZE(B) movaps %xmm10, 44 * SIZE(B) movaps %xmm11, 46 * SIZE(B) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 subq $-64 * SIZE, B decq I jg .L13 ALIGN_4 .L14: testq $4, MM jle .L16 MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 0 * SIZE(AO1, LDA), %xmm1 MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 MOVAPS 0 * SIZE(AO1, LDA3), %xmm3 MOVAPS 0 * SIZE(AO2), %xmm4 MOVAPS 0 * SIZE(AO2, LDA), %xmm5 MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 MOVAPS 0 * SIZE(AO2, LDA3), %xmm7 movaps %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm9 unpcklpd %xmm3, %xmm2 movaps %xmm4, %xmm10 unpcklpd %xmm5, %xmm4 movaps %xmm6, %xmm11 unpcklpd %xmm7, %xmm6 movaps %xmm0, -16 * SIZE(B) movaps %xmm2, -14 * SIZE(B) movaps %xmm4, -12 * SIZE(B) movaps %xmm6, -10 * SIZE(B) unpckhpd %xmm1, %xmm8 unpckhpd %xmm3, %xmm9 unpckhpd %xmm5, %xmm10 unpckhpd %xmm7, %xmm11 movaps %xmm8, -8 * SIZE(B) movaps %xmm9, -6 * SIZE(B) movaps %xmm10, -4 * SIZE(B) movaps %xmm11, -2 * SIZE(B) MOVAPS 2 * SIZE(AO1), %xmm0 MOVAPS 2 * SIZE(AO1, LDA), %xmm1 MOVAPS 2 * SIZE(AO1, LDA, 2), %xmm2 MOVAPS 2 * SIZE(AO1, LDA3), %xmm3 MOVAPS 2 * SIZE(AO2), %xmm4 MOVAPS 2 * SIZE(AO2, LDA), %xmm5 MOVAPS 2 * SIZE(AO2, 
LDA, 2), %xmm6 MOVAPS 2 * SIZE(AO2, LDA3), %xmm7 movaps %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm9 unpcklpd %xmm3, %xmm2 movaps %xmm4, %xmm10 unpcklpd %xmm5, %xmm4 movaps %xmm6, %xmm11 unpcklpd %xmm7, %xmm6 movaps %xmm0, 0 * SIZE(B) movaps %xmm2, 2 * SIZE(B) movaps %xmm4, 4 * SIZE(B) movaps %xmm6, 6 * SIZE(B) unpckhpd %xmm1, %xmm8 unpckhpd %xmm3, %xmm9 unpckhpd %xmm5, %xmm10 unpckhpd %xmm7, %xmm11 movaps %xmm8, 8 * SIZE(B) movaps %xmm9, 10 * SIZE(B) movaps %xmm10, 12 * SIZE(B) movaps %xmm11, 14 * SIZE(B) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-32 * SIZE, B ALIGN_4 .L16: testq $2, MM jle .L18 MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 0 * SIZE(AO1, LDA), %xmm1 MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 MOVAPS 0 * SIZE(AO1, LDA3), %xmm3 MOVAPS 0 * SIZE(AO2), %xmm4 MOVAPS 0 * SIZE(AO2, LDA), %xmm5 MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 MOVAPS 0 * SIZE(AO2, LDA3), %xmm7 movaps %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm9 unpcklpd %xmm3, %xmm2 movaps %xmm4, %xmm10 unpcklpd %xmm5, %xmm4 movaps %xmm6, %xmm11 unpcklpd %xmm7, %xmm6 movaps %xmm0, -16 * SIZE(B) movaps %xmm2, -14 * SIZE(B) movaps %xmm4, -12 * SIZE(B) movaps %xmm6, -10 * SIZE(B) unpckhpd %xmm1, %xmm8 unpckhpd %xmm3, %xmm9 unpckhpd %xmm5, %xmm10 unpckhpd %xmm7, %xmm11 movaps %xmm8, -8 * SIZE(B) movaps %xmm9, -6 * SIZE(B) movaps %xmm10, -4 * SIZE(B) movaps %xmm11, -2 * SIZE(B) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-16 * SIZE, B ALIGN_4 .L18: testq $1, MM jle .L19 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO1, LDA), %xmm1 movsd 0 * SIZE(AO1, LDA, 2), %xmm2 movsd 0 * SIZE(AO1, LDA3), %xmm3 movsd 0 * SIZE(AO2), %xmm4 movsd 0 * SIZE(AO2, LDA), %xmm5 movsd 0 * SIZE(AO2, LDA, 2), %xmm6 movsd 0 * SIZE(AO2, LDA3), %xmm7 unpcklpd %xmm1, %xmm0 unpcklpd %xmm3, %xmm2 unpcklpd %xmm5, %xmm4 unpcklpd %xmm7, %xmm6 movaps %xmm0, -16 * SIZE(B) movaps %xmm2, -14 * SIZE(B) movaps %xmm4, -12 * SIZE(B) movaps %xmm6, -10 * SIZE(B) subq $-8 * SIZE, B ALIGN_4 .L19: decq J jg .L11 ALIGN_4 .L20: testq $4, N jle .L30 movq A, AO1 leaq (A, LDA, 2), AO2 leaq (A, LDA, 4), A testq $SIZE, A je .L22 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO1, LDA), %xmm1 movsd 0 * SIZE(AO2), %xmm2 movsd 0 * SIZE(AO2, LDA), %xmm3 unpcklpd %xmm1, %xmm0 unpcklpd %xmm3, %xmm2 movaps %xmm0, -16 * SIZE(B) movaps %xmm2, -14 * SIZE(B) addq $1 * SIZE, AO1 addq $1 * SIZE, AO2 subq $-4 * SIZE, B ALIGN_3 .L22: movq MM, I sarq $3, I jle .L24 ALIGN_4 .L23: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) #endif MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 0 * SIZE(AO1, LDA), %xmm1 MOVAPS 0 * SIZE(AO2), %xmm2 MOVAPS 0 * SIZE(AO2, LDA), %xmm3 movaps %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm1, %xmm4 unpckhpd %xmm3, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) #endif movaps %xmm0, -16 * SIZE(B) movaps %xmm2, -14 * SIZE(B) movaps %xmm4, -12 * SIZE(B) movaps %xmm6, -10 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * 2 * SIZE(AO1, LDA) #endif MOVAPS 2 * SIZE(AO1), %xmm0 MOVAPS 2 * SIZE(AO1, LDA), %xmm1 MOVAPS 2 * SIZE(AO2), %xmm2 MOVAPS 2 * SIZE(AO2, LDA), %xmm3 movaps %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm1, %xmm4 unpckhpd %xmm3, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) #endif movaps %xmm0, -8 * SIZE(B) movaps %xmm2, -6 * SIZE(B) movaps %xmm4, -4 * SIZE(B) movaps %xmm6, -2 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) #endif MOVAPS 4 * SIZE(AO1), %xmm0 MOVAPS 4 * SIZE(AO1, LDA), %xmm1 MOVAPS 4 * SIZE(AO2), %xmm2 MOVAPS 4 * 
SIZE(AO2, LDA), %xmm3 movaps %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm1, %xmm4 unpckhpd %xmm3, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 16) * SIZE(B) #endif movaps %xmm0, 0 * SIZE(B) movaps %xmm2, 2 * SIZE(B) movaps %xmm4, 4 * SIZE(B) movaps %xmm6, 6 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * 2 * SIZE(AO2, LDA) #endif MOVAPS 6 * SIZE(AO1), %xmm0 MOVAPS 6 * SIZE(AO1, LDA), %xmm1 MOVAPS 6 * SIZE(AO2), %xmm2 MOVAPS 6 * SIZE(AO2, LDA), %xmm3 movaps %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm1, %xmm4 unpckhpd %xmm3, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 24) * SIZE(B) #endif movaps %xmm0, 8 * SIZE(B) movaps %xmm2, 10 * SIZE(B) movaps %xmm4, 12 * SIZE(B) movaps %xmm6, 14 * SIZE(B) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 subq $-32 * SIZE, B decq I jg .L23 ALIGN_4 .L24: testq $4, MM jle .L26 MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 0 * SIZE(AO1, LDA), %xmm1 MOVAPS 0 * SIZE(AO2), %xmm2 MOVAPS 0 * SIZE(AO2, LDA), %xmm3 movaps %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm1, %xmm4 unpckhpd %xmm3, %xmm6 movaps %xmm0, -16 * SIZE(B) movaps %xmm2, -14 * SIZE(B) movaps %xmm4, -12 * SIZE(B) movaps %xmm6, -10 * SIZE(B) MOVAPS 2 * SIZE(AO1), %xmm0 MOVAPS 2 * SIZE(AO1, LDA), %xmm1 MOVAPS 2 * SIZE(AO2), %xmm2 MOVAPS 2 * SIZE(AO2, LDA), %xmm3 movaps %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm1, %xmm4 unpckhpd %xmm3, %xmm6 movaps %xmm0, -8 * SIZE(B) movaps %xmm2, -6 * SIZE(B) movaps %xmm4, -4 * SIZE(B) movaps %xmm6, -2 * SIZE(B) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-16 * SIZE, B ALIGN_4 .L26: testq $2, MM jle .L28 MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 0 * SIZE(AO1, LDA), %xmm1 MOVAPS 0 * SIZE(AO2), %xmm2 MOVAPS 0 * SIZE(AO2, LDA), %xmm3 movaps %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm1, %xmm4 unpckhpd %xmm3, %xmm6 movaps %xmm0, -16 * SIZE(B) movaps %xmm2, -14 * SIZE(B) movaps %xmm4, -12 * SIZE(B) movaps %xmm6, -10 * SIZE(B) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-8 * SIZE, B ALIGN_4 .L28: testq $1, MM jle .L30 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO1, LDA), %xmm1 movsd 0 * SIZE(AO2), %xmm2 movsd 0 * SIZE(AO2, LDA), %xmm3 unpcklpd %xmm1, %xmm0 unpcklpd %xmm3, %xmm2 movaps %xmm0, -16 * SIZE(B) movaps %xmm2, -14 * SIZE(B) subq $-4 * SIZE, B ALIGN_4 .L30: testq $2, N jle .L40 movq A, AO1 leaq (A, LDA), AO2 leaq (A, LDA, 2), A testq $SIZE, A je .L32 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO2), %xmm1 unpcklpd %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(B) addq $1 * SIZE, AO1 addq $1 * SIZE, AO2 subq $-2 * SIZE, B ALIGN_3 .L32: movq MM, I sarq $3, I jle .L34 ALIGN_4 .L33: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) #endif MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 0 * SIZE(AO2), %xmm1 MOVAPS 2 * SIZE(AO1), %xmm2 MOVAPS 2 * SIZE(AO2), %xmm3 movaps %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm1, %xmm4 unpckhpd %xmm3, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) #endif movaps %xmm0, -16 * SIZE(B) movaps %xmm4, -14 * SIZE(B) movaps %xmm2, -12 * SIZE(B) movaps %xmm6, -10 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * 4 * SIZE(AO2) #endif MOVAPS 4 * SIZE(AO1), %xmm0 MOVAPS 4 * SIZE(AO2), %xmm1 MOVAPS 6 * SIZE(AO1), %xmm2 MOVAPS 6 * SIZE(AO2), %xmm3 movaps %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm1, %xmm4 unpckhpd %xmm3, %xmm6 #ifdef 
PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) #endif movaps %xmm0, -8 * SIZE(B) movaps %xmm4, -6 * SIZE(B) movaps %xmm2, -4 * SIZE(B) movaps %xmm6, -2 * SIZE(B) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 subq $-16 * SIZE, B decq I jg .L33 ALIGN_4 .L34: testq $4, MM jle .L36 MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 0 * SIZE(AO2), %xmm1 MOVAPS 2 * SIZE(AO1), %xmm2 MOVAPS 2 * SIZE(AO2), %xmm3 movaps %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm4 movaps %xmm2, %xmm6 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm6 movaps %xmm0, -16 * SIZE(B) movaps %xmm4, -14 * SIZE(B) movaps %xmm2, -12 * SIZE(B) movaps %xmm6, -10 * SIZE(B) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-8 * SIZE, B ALIGN_4 .L36: testq $2, MM jle .L38 MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 0 * SIZE(AO2), %xmm1 movaps %xmm0, %xmm2 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm2 movaps %xmm0, -16 * SIZE(B) movaps %xmm2, -14 * SIZE(B) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-4 * SIZE, B ALIGN_4 .L38: testq $1, MM jle .L40 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO2), %xmm1 unpcklpd %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(B) subq $-2 * SIZE, B ALIGN_4 .L40: testq $1, N jle .L999 movq A, AO1 testq $SIZE, A jne .L45 movq MM, I sarq $3, I jle .L42 ALIGN_4 .L41: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 8 * SIZE(AO1) #endif MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 2 * SIZE(AO1), %xmm1 MOVAPS 4 * SIZE(AO1), %xmm2 MOVAPS 6 * SIZE(AO1), %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) #endif movaps %xmm0, -16 * SIZE(B) movaps %xmm1, -14 * SIZE(B) movaps %xmm2, -12 * SIZE(B) movaps %xmm3, -10 * SIZE(B) addq $8 * SIZE, AO1 subq $-8 * SIZE, B decq I jg .L41 ALIGN_4 .L42: testq $4, MM jle .L43 MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 2 * SIZE(AO1), %xmm1 movaps %xmm0, -16 * SIZE(B) movaps %xmm1, -14 * SIZE(B) addq $4 * SIZE, AO1 subq $-4 * SIZE, B ALIGN_4 .L43: testq $2, MM jle .L44 MOVAPS 0 * SIZE(AO1), %xmm0 movaps %xmm0, -16 * SIZE(B) addq $2 * SIZE, AO1 subq $-2 * SIZE, B ALIGN_4 .L44: testq $1, MM jle .L999 movsd 0 * SIZE(AO1), %xmm0 movlpd %xmm0, -16 * SIZE(B) jmp .L999 ALIGN_4 .L45: MOVAPS -1 * SIZE(AO1), %xmm0 movq M, I sarq $3, I jle .L46 ALIGN_4 .L46: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 8 * SIZE(AO1) #endif MOVAPS 1 * SIZE(AO1), %xmm1 MOVAPS 3 * SIZE(AO1), %xmm2 MOVAPS 5 * SIZE(AO1), %xmm3 MOVAPS 7 * SIZE(AO1), %xmm4 shufpd $1, %xmm1, %xmm0 shufpd $1, %xmm2, %xmm1 shufpd $1, %xmm3, %xmm2 shufpd $1, %xmm4, %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) #endif movaps %xmm0, -16 * SIZE(B) movaps %xmm1, -14 * SIZE(B) movaps %xmm2, -12 * SIZE(B) movaps %xmm3, -10 * SIZE(B) movaps %xmm4, %xmm0 addq $8 * SIZE, AO1 subq $-8 * SIZE, B decq I jg .L46 ALIGN_4 .L47: testq $4, M jle .L48 MOVAPS 1 * SIZE(AO1), %xmm1 MOVAPS 3 * SIZE(AO1), %xmm2 shufpd $1, %xmm1, %xmm0 shufpd $1, %xmm2, %xmm1 movaps %xmm0, -16 * SIZE(B) movaps %xmm1, -14 * SIZE(B) movaps %xmm2, %xmm0 addq $4 * SIZE, AO1 addq $4 * SIZE, B ALIGN_4 .L48: testq $2, M jle .L49 MOVAPS 1 * SIZE(AO1), %xmm1 shufpd $1, %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(B) movaps %xmm1, %xmm0 addq $2 * SIZE, AO1 subq $-2 * SIZE, B ALIGN_4 .L49: testq $1, M jle .L999 shufpd $1, %xmm0, %xmm0 movlpd %xmm0, -16 * SIZE(B) jmp .L999 ALIGN_4 .L50: movq N, J sarq $3, J jle .L60 ALIGN_4 .L51: movq A, AO1 leaq (A, LDA, 4), AO2 leaq (A, LDA, 8), A testq $SIZE, A je .L52 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO1, LDA), %xmm1 movsd 0 * SIZE(AO1, LDA, 2), %xmm2 movsd 0 * SIZE(AO1, LDA3), %xmm3 movsd 0 * SIZE(AO2), %xmm4 movsd 0 * SIZE(AO2, LDA), %xmm5 movsd 0 * SIZE(AO2, LDA, 2), %xmm6 
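/* Editor's note (not part of the original OpenBLAS source): this file implements the
   GEMM_NCOPY kernel for panels of 8 columns, i.e. it packs the double-precision,
   column-major matrix A (leading dimension LDA) into the contiguous work buffer B so
   that the GEMM micro-kernel can stream it with unit stride.  A hedged scalar sketch
   of what the vectorized code computes (variable names here are illustrative only;
   the N%4 / N%2 / N%1 and M remainder loops mirror the label structure of this file):

       for (j = 0; j + 8 <= n; j += 8)
           for (i = 0; i < m; i++)
               for (jj = 0; jj < 8; jj++)
                   *b++ = a[i + (j + jj) * lda];
       /* followed by equivalent loops for the remaining 4, 2 and 1 columns */

   The .L50 section being executed here appears to be the variant selected when LDA is
   an odd multiple of SIZE, so adjacent columns do not share 16-byte alignment. */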
movsd 0 * SIZE(AO2, LDA3), %xmm7 unpcklpd %xmm1, %xmm0 unpcklpd %xmm3, %xmm2 unpcklpd %xmm5, %xmm4 unpcklpd %xmm7, %xmm6 movaps %xmm0, -16 * SIZE(B) movaps %xmm2, -14 * SIZE(B) movaps %xmm4, -12 * SIZE(B) movaps %xmm6, -10 * SIZE(B) addq $1 * SIZE, AO1 addq $1 * SIZE, AO2 subq $-8 * SIZE, B ALIGN_3 .L52: MOVAPS -1 * SIZE(AO1, LDA), %xmm9 MOVAPS -1 * SIZE(AO1, LDA3), %xmm10 MOVAPS -1 * SIZE(AO2, LDA), %xmm11 MOVAPS -1 * SIZE(AO2, LDA3), %xmm12 movq MM, I sarq $3, I jle .L54 ALIGN_4 .L53: #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1) #endif MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 1 * SIZE(AO1, LDA), %xmm1 MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 MOVAPS 1 * SIZE(AO1, LDA3), %xmm3 #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) #endif MOVAPS 0 * SIZE(AO2), %xmm4 MOVAPS 1 * SIZE(AO2, LDA), %xmm5 MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 MOVAPS 1 * SIZE(AO2, LDA3), %xmm7 movsd %xmm0, %xmm9 movsd %xmm2, %xmm10 movsd %xmm4, %xmm11 movsd %xmm6, %xmm12 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) #endif movaps %xmm9, -16 * SIZE(B) movaps %xmm10, -14 * SIZE(B) movaps %xmm11, -12 * SIZE(B) movaps %xmm12, -10 * SIZE(B) shufpd $1, %xmm1, %xmm0 shufpd $1, %xmm3, %xmm2 shufpd $1, %xmm5, %xmm4 shufpd $1, %xmm7, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) #endif movaps %xmm0, -8 * SIZE(B) movaps %xmm2, -6 * SIZE(B) movaps %xmm4, -4 * SIZE(B) movaps %xmm6, -2 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1, LDA, 2) #endif MOVAPS 2 * SIZE(AO1), %xmm0 MOVAPS 3 * SIZE(AO1, LDA), %xmm9 MOVAPS 2 * SIZE(AO1, LDA, 2), %xmm2 MOVAPS 3 * SIZE(AO1, LDA3), %xmm10 #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1, LDA3) #endif MOVAPS 2 * SIZE(AO2), %xmm4 MOVAPS 3 * SIZE(AO2, LDA), %xmm11 MOVAPS 2 * SIZE(AO2, LDA, 2), %xmm6 MOVAPS 3 * SIZE(AO2, LDA3), %xmm12 movsd %xmm0, %xmm1 movsd %xmm2, %xmm3 movsd %xmm4, %xmm5 movsd %xmm6, %xmm7 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 16) * SIZE(B) #endif movaps %xmm1, 0 * SIZE(B) movaps %xmm3, 2 * SIZE(B) movaps %xmm5, 4 * SIZE(B) movaps %xmm7, 6 * SIZE(B) shufpd $1, %xmm9, %xmm0 shufpd $1, %xmm10, %xmm2 shufpd $1, %xmm11, %xmm4 shufpd $1, %xmm12, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 24) * SIZE(B) #endif movaps %xmm0, 8 * SIZE(B) movaps %xmm2, 10 * SIZE(B) movaps %xmm4, 12 * SIZE(B) movaps %xmm6, 14 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO2) #endif MOVAPS 4 * SIZE(AO1), %xmm0 MOVAPS 5 * SIZE(AO1, LDA), %xmm1 MOVAPS 4 * SIZE(AO1, LDA, 2), %xmm2 MOVAPS 5 * SIZE(AO1, LDA3), %xmm3 #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO2, LDA) #endif MOVAPS 4 * SIZE(AO2), %xmm4 MOVAPS 5 * SIZE(AO2, LDA), %xmm5 MOVAPS 4 * SIZE(AO2, LDA, 2), %xmm6 MOVAPS 5 * SIZE(AO2, LDA3), %xmm7 movsd %xmm0, %xmm9 movsd %xmm2, %xmm10 movsd %xmm4, %xmm11 movsd %xmm6, %xmm12 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 32) * SIZE(B) #endif movaps %xmm9, 16 * SIZE(B) movaps %xmm10, 18 * SIZE(B) movaps %xmm11, 20 * SIZE(B) movaps %xmm12, 22 * SIZE(B) shufpd $1, %xmm1, %xmm0 shufpd $1, %xmm3, %xmm2 shufpd $1, %xmm5, %xmm4 shufpd $1, %xmm7, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) #endif movaps %xmm0, 24 * SIZE(B) movaps %xmm2, 26 * SIZE(B) movaps %xmm4, 28 * SIZE(B) movaps %xmm6, 30 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO2, LDA, 2) #endif MOVAPS 6 * SIZE(AO1), %xmm0 MOVAPS 7 * SIZE(AO1, LDA), %xmm9 MOVAPS 6 * SIZE(AO1, LDA, 2), %xmm2 MOVAPS 7 * SIZE(AO1, LDA3), %xmm10 #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO2, LDA3) #endif MOVAPS 6 * SIZE(AO2), %xmm4 MOVAPS 7 * SIZE(AO2, LDA), %xmm11 
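/* Editor's note (not part of the original source): in this misaligned-LDA path the
   odd-numbered columns (AO1+LDA, AO1+LDA3, AO2+LDA, AO2+LDA3) cannot be read with
   aligned 16-byte loads at the same offsets as the even ones.  The code therefore
   preloads one element early (the -1 * SIZE MOVAPS above .L53) and carries the trailing
   element of each odd column across iterations in xmm9-xmm12; movsd patches in the
   matching even-column element and shufpd $1 realigns the remaining pair, so every
   MOVAPS stays 16-byte aligned despite the logically unaligned access pattern. */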
MOVAPS 6 * SIZE(AO2, LDA, 2), %xmm6 MOVAPS 7 * SIZE(AO2, LDA3), %xmm12 movsd %xmm0, %xmm1 movsd %xmm2, %xmm3 movsd %xmm4, %xmm5 movsd %xmm6, %xmm7 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 40) * SIZE(B) #endif movaps %xmm1, 32 * SIZE(B) movaps %xmm3, 34 * SIZE(B) movaps %xmm5, 36 * SIZE(B) movaps %xmm7, 38 * SIZE(B) shufpd $1, %xmm9, %xmm0 shufpd $1, %xmm10, %xmm2 shufpd $1, %xmm11, %xmm4 shufpd $1, %xmm12, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 48) * SIZE(B) #endif movaps %xmm0, 40 * SIZE(B) movaps %xmm2, 42 * SIZE(B) movaps %xmm4, 44 * SIZE(B) movaps %xmm6, 46 * SIZE(B) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 subq $-64 * SIZE, B decq I jg .L53 ALIGN_4 .L54: testq $4, MM jle .L56 MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 1 * SIZE(AO1, LDA), %xmm1 MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 MOVAPS 1 * SIZE(AO1, LDA3), %xmm3 MOVAPS 0 * SIZE(AO2), %xmm4 MOVAPS 1 * SIZE(AO2, LDA), %xmm5 MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 MOVAPS 1 * SIZE(AO2, LDA3), %xmm7 movsd %xmm0, %xmm9 movsd %xmm2, %xmm10 movsd %xmm4, %xmm11 movsd %xmm6, %xmm12 movaps %xmm9, -16 * SIZE(B) movaps %xmm10, -14 * SIZE(B) movaps %xmm11, -12 * SIZE(B) movaps %xmm12, -10 * SIZE(B) shufpd $1, %xmm1, %xmm0 shufpd $1, %xmm3, %xmm2 shufpd $1, %xmm5, %xmm4 shufpd $1, %xmm7, %xmm6 movaps %xmm0, -8 * SIZE(B) movaps %xmm2, -6 * SIZE(B) movaps %xmm4, -4 * SIZE(B) movaps %xmm6, -2 * SIZE(B) MOVAPS 2 * SIZE(AO1), %xmm0 MOVAPS 3 * SIZE(AO1, LDA), %xmm9 MOVAPS 2 * SIZE(AO1, LDA, 2), %xmm2 MOVAPS 3 * SIZE(AO1, LDA3), %xmm10 MOVAPS 2 * SIZE(AO2), %xmm4 MOVAPS 3 * SIZE(AO2, LDA), %xmm11 MOVAPS 2 * SIZE(AO2, LDA, 2), %xmm6 MOVAPS 3 * SIZE(AO2, LDA3), %xmm12 movsd %xmm0, %xmm1 movsd %xmm2, %xmm3 movsd %xmm4, %xmm5 movsd %xmm6, %xmm7 movaps %xmm1, 0 * SIZE(B) movaps %xmm3, 2 * SIZE(B) movaps %xmm5, 4 * SIZE(B) movaps %xmm7, 6 * SIZE(B) shufpd $1, %xmm9, %xmm0 shufpd $1, %xmm10, %xmm2 shufpd $1, %xmm11, %xmm4 shufpd $1, %xmm12, %xmm6 movaps %xmm0, 8 * SIZE(B) movaps %xmm2, 10 * SIZE(B) movaps %xmm4, 12 * SIZE(B) movaps %xmm6, 14 * SIZE(B) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-32 * SIZE, B ALIGN_4 .L56: testq $2, MM jle .L58 MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 1 * SIZE(AO1, LDA), %xmm1 MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 MOVAPS 1 * SIZE(AO1, LDA3), %xmm3 MOVAPS 0 * SIZE(AO2), %xmm4 MOVAPS 1 * SIZE(AO2, LDA), %xmm5 MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 MOVAPS 1 * SIZE(AO2, LDA3), %xmm7 movsd %xmm0, %xmm9 movsd %xmm2, %xmm10 movsd %xmm4, %xmm11 movsd %xmm6, %xmm12 movaps %xmm9, -16 * SIZE(B) movaps %xmm10, -14 * SIZE(B) movaps %xmm11, -12 * SIZE(B) movaps %xmm12, -10 * SIZE(B) shufpd $1, %xmm1, %xmm0 shufpd $1, %xmm3, %xmm2 shufpd $1, %xmm5, %xmm4 shufpd $1, %xmm7, %xmm6 movaps %xmm0, -8 * SIZE(B) movaps %xmm2, -6 * SIZE(B) movaps %xmm4, -4 * SIZE(B) movaps %xmm6, -2 * SIZE(B) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-16 * SIZE, B ALIGN_4 .L58: testq $1, MM jle .L59 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO1, LDA), %xmm1 movsd 0 * SIZE(AO1, LDA, 2), %xmm2 movsd 0 * SIZE(AO1, LDA3), %xmm3 movsd 0 * SIZE(AO2), %xmm4 movsd 0 * SIZE(AO2, LDA), %xmm5 movsd 0 * SIZE(AO2, LDA, 2), %xmm6 movsd 0 * SIZE(AO2, LDA3), %xmm7 unpcklpd %xmm1, %xmm0 unpcklpd %xmm3, %xmm2 unpcklpd %xmm5, %xmm4 unpcklpd %xmm7, %xmm6 movaps %xmm0, -16 * SIZE(B) movaps %xmm2, -14 * SIZE(B) movaps %xmm4, -12 * SIZE(B) movaps %xmm6, -10 * SIZE(B) subq $-8 * SIZE, B ALIGN_4 .L59: decq J jg .L51 ALIGN_4 .L60: testq $4, N jle .L70 movq A, AO1 leaq (A, LDA, 2), AO2 leaq (A, LDA, 4), A testq $SIZE, A je .L62 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO1, LDA), %xmm1 movsd 0 * 
SIZE(AO2), %xmm2 movsd 0 * SIZE(AO2, LDA), %xmm3 unpcklpd %xmm1, %xmm0 unpcklpd %xmm3, %xmm2 movaps %xmm0, -16 * SIZE(B) movaps %xmm2, -14 * SIZE(B) addq $1 * SIZE, AO1 addq $1 * SIZE, AO2 subq $-4 * SIZE, B ALIGN_3 .L62: movaps -1 * SIZE(AO1, LDA), %xmm5 movaps -1 * SIZE(AO2, LDA), %xmm7 movq MM, I sarq $3, I jle .L64 ALIGN_4 .L63: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) #endif MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 1 * SIZE(AO1, LDA), %xmm1 MOVAPS 0 * SIZE(AO2), %xmm2 MOVAPS 1 * SIZE(AO2, LDA), %xmm3 movsd %xmm0, %xmm5 movsd %xmm2, %xmm7 shufpd $1, %xmm1, %xmm0 shufpd $1, %xmm3, %xmm2 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) #endif movaps %xmm5, -16 * SIZE(B) movaps %xmm7, -14 * SIZE(B) movaps %xmm0, -12 * SIZE(B) movaps %xmm2, -10 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * 2 * SIZE(AO1, LDA) #endif MOVAPS 2 * SIZE(AO1), %xmm0 MOVAPS 3 * SIZE(AO1, LDA), %xmm5 MOVAPS 2 * SIZE(AO2), %xmm2 MOVAPS 3 * SIZE(AO2, LDA), %xmm7 movsd %xmm0, %xmm1 movsd %xmm2, %xmm3 shufpd $1, %xmm5, %xmm0 shufpd $1, %xmm7, %xmm2 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) #endif movaps %xmm1, -8 * SIZE(B) movaps %xmm3, -6 * SIZE(B) movaps %xmm0, -4 * SIZE(B) movaps %xmm2, -2 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) #endif MOVAPS 4 * SIZE(AO1), %xmm0 MOVAPS 5 * SIZE(AO1, LDA), %xmm1 MOVAPS 4 * SIZE(AO2), %xmm2 MOVAPS 5 * SIZE(AO2, LDA), %xmm3 movsd %xmm0, %xmm5 movsd %xmm2, %xmm7 shufpd $1, %xmm1, %xmm0 shufpd $1, %xmm3, %xmm2 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 16) * SIZE(B) #endif movaps %xmm5, 0 * SIZE(B) movaps %xmm7, 2 * SIZE(B) movaps %xmm0, 4 * SIZE(B) movaps %xmm2, 6 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * 2 * SIZE(AO2, LDA) #endif MOVAPS 6 * SIZE(AO1), %xmm0 MOVAPS 7 * SIZE(AO1, LDA), %xmm5 MOVAPS 6 * SIZE(AO2), %xmm2 MOVAPS 7 * SIZE(AO2, LDA), %xmm7 movsd %xmm0, %xmm1 movsd %xmm2, %xmm3 shufpd $1, %xmm5, %xmm0 shufpd $1, %xmm7, %xmm2 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 24) * SIZE(B) #endif movaps %xmm1, 8 * SIZE(B) movaps %xmm3, 10 * SIZE(B) movaps %xmm0, 12 * SIZE(B) movaps %xmm2, 14 * SIZE(B) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 subq $-32 * SIZE, B decq I jg .L63 ALIGN_4 .L64: testq $4, MM jle .L66 MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 1 * SIZE(AO1, LDA), %xmm1 MOVAPS 0 * SIZE(AO2), %xmm2 MOVAPS 1 * SIZE(AO2, LDA), %xmm3 movsd %xmm0, %xmm5 shufpd $1, %xmm1, %xmm0 movsd %xmm2, %xmm7 shufpd $1, %xmm3, %xmm2 movaps %xmm5, -16 * SIZE(B) movaps %xmm7, -14 * SIZE(B) movaps %xmm0, -12 * SIZE(B) movaps %xmm2, -10 * SIZE(B) MOVAPS 2 * SIZE(AO1), %xmm0 MOVAPS 3 * SIZE(AO1, LDA), %xmm5 MOVAPS 2 * SIZE(AO2), %xmm2 MOVAPS 3 * SIZE(AO2, LDA), %xmm7 movsd %xmm0, %xmm1 shufpd $1, %xmm5, %xmm0 movsd %xmm2, %xmm3 shufpd $1, %xmm7, %xmm2 movaps %xmm1, -8 * SIZE(B) movaps %xmm3, -6 * SIZE(B) movaps %xmm0, -4 * SIZE(B) movaps %xmm2, -2 * SIZE(B) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-16 * SIZE, B ALIGN_4 .L66: testq $2, MM jle .L68 MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 1 * SIZE(AO1, LDA), %xmm1 MOVAPS 0 * SIZE(AO2), %xmm2 MOVAPS 1 * SIZE(AO2, LDA), %xmm3 movsd %xmm0, %xmm5 movsd %xmm2, %xmm7 shufpd $1, %xmm1, %xmm0 shufpd $1, %xmm3, %xmm2 movaps %xmm5, -16 * SIZE(B) movaps %xmm7, -14 * SIZE(B) movaps %xmm0, -12 * SIZE(B) movaps %xmm2, -10 * SIZE(B) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-8 * SIZE, B ALIGN_4 .L68: testq $1, MM jle .L70 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO1, LDA), %xmm1 movsd 0 * SIZE(AO2), %xmm2 movsd 0 * SIZE(AO2, LDA), %xmm3 unpcklpd %xmm1, %xmm0 unpcklpd %xmm3, %xmm2 movaps 
%xmm0, -16 * SIZE(B) movaps %xmm2, -14 * SIZE(B) subq $-4 * SIZE, B ALIGN_4 .L70: testq $2, N jle .L80 movq A, AO1 leaq (A, LDA), AO2 leaq (A, LDA, 2), A testq $SIZE, A je .L72 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO2), %xmm1 unpcklpd %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(B) addq $1 * SIZE, AO1 addq $1 * SIZE, AO2 subq $-2 * SIZE, B ALIGN_3 .L72: MOVAPS -1 * SIZE(AO2), %xmm5 movq MM, I sarq $3, I jle .L74 ALIGN_4 .L73: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) #endif MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 1 * SIZE(AO2), %xmm1 MOVAPS 2 * SIZE(AO1), %xmm2 MOVAPS 3 * SIZE(AO2), %xmm3 movsd %xmm0, %xmm5 shufpd $1, %xmm1, %xmm0 movsd %xmm2, %xmm1 shufpd $1, %xmm3, %xmm2 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) #endif movaps %xmm5, -16 * SIZE(B) movaps %xmm0, -14 * SIZE(B) movaps %xmm1, -12 * SIZE(B) movaps %xmm2, -10 * SIZE(B) #ifdef PREFETCH PREFETCH PREFETCHSIZE * 4 * SIZE(AO2) #endif MOVAPS 4 * SIZE(AO1), %xmm0 MOVAPS 5 * SIZE(AO2), %xmm1 MOVAPS 6 * SIZE(AO1), %xmm2 MOVAPS 7 * SIZE(AO2), %xmm5 movsd %xmm0, %xmm3 shufpd $1, %xmm1, %xmm0 movsd %xmm2, %xmm1 shufpd $1, %xmm5, %xmm2 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) #endif movaps %xmm3, -8 * SIZE(B) movaps %xmm0, -6 * SIZE(B) movaps %xmm1, -4 * SIZE(B) movaps %xmm2, -2 * SIZE(B) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 subq $-16 * SIZE, B decq I jg .L73 ALIGN_4 .L74: testq $4, MM jle .L76 MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 1 * SIZE(AO2), %xmm1 MOVAPS 2 * SIZE(AO1), %xmm2 MOVAPS 3 * SIZE(AO2), %xmm3 movsd %xmm0, %xmm5 shufpd $1, %xmm1, %xmm0 movsd %xmm2, %xmm1 shufpd $1, %xmm3, %xmm2 movaps %xmm5, -16 * SIZE(B) movaps %xmm0, -14 * SIZE(B) movaps %xmm1, -12 * SIZE(B) movaps %xmm2, -10 * SIZE(B) movaps %xmm3, %xmm5 addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-8 * SIZE, B ALIGN_4 .L76: testq $2, MM jle .L78 MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 1 * SIZE(AO2), %xmm1 movsd %xmm0, %xmm5 shufpd $1, %xmm1, %xmm0 movaps %xmm5, -16 * SIZE(B) movaps %xmm0, -14 * SIZE(B) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-4 * SIZE, B ALIGN_4 .L78: testq $1, MM jle .L80 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO2), %xmm1 unpcklpd %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(B) subq $-2 * SIZE, B ALIGN_4 .L80: testq $1, N jle .L999 movq A, AO1 testq $SIZE, A jne .L85 movq MM, I sarq $3, I jle .L82 ALIGN_4 .L81: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 8 * SIZE(AO1) #endif MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 2 * SIZE(AO1), %xmm2 MOVAPS 4 * SIZE(AO1), %xmm4 MOVAPS 6 * SIZE(AO1), %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) #endif movaps %xmm0, -16 * SIZE(B) movaps %xmm2, -14 * SIZE(B) movaps %xmm4, -12 * SIZE(B) movaps %xmm6, -10 * SIZE(B) addq $8 * SIZE, AO1 subq $-8 * SIZE, B decq I jg .L81 ALIGN_4 .L82: testq $4, MM jle .L83 MOVAPS 0 * SIZE(AO1), %xmm0 MOVAPS 2 * SIZE(AO1), %xmm2 movaps %xmm0, -16 * SIZE(B) movaps %xmm2, -14 * SIZE(B) addq $4 * SIZE, AO1 subq $-4 * SIZE, B ALIGN_4 .L83: testq $2, MM jle .L84 MOVAPS 0 * SIZE(AO1), %xmm0 movaps %xmm0, -16 * SIZE(B) addq $2 * SIZE, AO1 subq $-2 * SIZE, B ALIGN_4 .L84: testq $1, MM jle .L999 movsd 0 * SIZE(AO1), %xmm0 movlpd %xmm0, -16 * SIZE(B) jmp .L999 ALIGN_4 .L85: MOVAPS -1 * SIZE(AO1), %xmm0 movq M, I sarq $3, I jle .L86 ALIGN_4 .L86: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 8 * SIZE(AO1) #endif MOVAPS 1 * SIZE(AO1), %xmm1 MOVAPS 3 * SIZE(AO1), %xmm2 MOVAPS 5 * SIZE(AO1), %xmm3 MOVAPS 7 * SIZE(AO1), %xmm4 shufpd $1, %xmm1, %xmm0 shufpd $1, %xmm2, %xmm1 shufpd $1, %xmm3, %xmm2 shufpd $1, %xmm4, %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 8 + 
0) * SIZE(B) #endif movaps %xmm0, -16 * SIZE(B) movaps %xmm1, -14 * SIZE(B) movaps %xmm2, -12 * SIZE(B) movaps %xmm3, -10 * SIZE(B) movaps %xmm4, %xmm0 addq $8 * SIZE, AO1 subq $-8 * SIZE, B decq I jg .L86 ALIGN_4 .L87: testq $4, M jle .L88 MOVAPS 1 * SIZE(AO1), %xmm1 MOVAPS 3 * SIZE(AO1), %xmm2 shufpd $1, %xmm1, %xmm0 shufpd $1, %xmm2, %xmm1 movaps %xmm0, -16 * SIZE(B) movaps %xmm1, -14 * SIZE(B) movaps %xmm2, %xmm0 addq $4 * SIZE, AO1 addq $4 * SIZE, B ALIGN_4 .L88: testq $2, M jle .L89 MOVAPS 1 * SIZE(AO1), %xmm1 shufpd $1, %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(B) movaps %xmm1, %xmm0 addq $2 * SIZE, AO1 subq $-2 * SIZE, B ALIGN_4 .L89: testq $1, M jle .L999 shufpd $1, %xmm0, %xmm0 movlpd %xmm0, -16 * SIZE(B) ALIGN_4 .L999: #ifdef WINDOWS_ABI movups 0(%rsp), %xmm6 movups 16(%rsp), %xmm7 movups 32(%rsp), %xmm8 movups 48(%rsp), %xmm9 movups 64(%rsp), %xmm10 movups 80(%rsp), %xmm11 movups 96(%rsp), %xmm12 addq $STACKSIZE, %rsp #endif popq %r12 popq %r13 #ifdef WINDOWS_ABI popq %r14 popq %r15 #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/dgemm_ncopy_8_bulldozer.S000066400000000000000000001117141313527062700223250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define A_PRE 256 #define B_PRE 128 #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ #define N ARG2 /* rsi */ #define A ARG3 /* rdx */ #define LDA ARG4 /* rcx */ #define B ARG5 /* r8 */ #define AO1 %r9 #define AO2 %r10 #define LDA3 %r11 #define J %r12 #define MM %r13 #else #define STACKSIZE 128 #define M ARG1 /* rcx */ #define N ARG2 /* rdx */ #define A ARG3 /* r8 */ #define LDA ARG4 /* r9 */ #define OLD_B 40 + 32 + STACKSIZE(%rsp) #define B %r15 #define AO1 %r10 #define AO2 %r11 #define LDA3 %r12 #define J %r13 #define MM %r14 #endif #define I %rax PROLOGUE PROFCODE #ifdef WINDOWS_ABI pushq %r15 pushq %r14 #endif pushq %r13 pushq %r12 #ifdef WINDOWS_ABI subq $STACKSIZE, %rsp vmovups %xmm6, 0(%rsp) vmovups %xmm7, 16(%rsp) vmovups %xmm8, 32(%rsp) vmovups %xmm9, 48(%rsp) vmovups %xmm10, 64(%rsp) vmovups %xmm11, 80(%rsp) vmovups %xmm12, 96(%rsp) movq OLD_B, B #endif leaq (,LDA, SIZE), LDA leaq (LDA, LDA, 2), LDA3 subq $-16 * SIZE, B movq M, MM leaq -1(M), %rax testq $SIZE, A cmovne %rax, MM testq $SIZE, LDA jne .L50 movq N, J sarq $3, J jle .L20 ALIGN_4 .L11: movq A, AO1 leaq (A, LDA, 4), AO2 leaq (A, LDA, 8), A testq $SIZE, A je .L12 vmovsd 0 * SIZE(AO1), %xmm0 vmovsd 0 * SIZE(AO1, LDA), %xmm1 vmovsd 0 * SIZE(AO1, LDA, 2), %xmm2 vmovsd 0 * SIZE(AO1, LDA3), %xmm3 vmovsd 0 * SIZE(AO2), %xmm4 vmovsd 0 * SIZE(AO2, LDA), %xmm5 vmovsd 0 * SIZE(AO2, LDA, 2), %xmm6 vmovsd 0 * SIZE(AO2, LDA3), %xmm7 vunpcklpd %xmm1, %xmm0 , %xmm0 vunpcklpd %xmm3, %xmm2 , %xmm2 vunpcklpd %xmm5, %xmm4 , %xmm4 vunpcklpd %xmm7, %xmm6 , %xmm6 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) vmovups %xmm4, -12 * SIZE(B) vmovups %xmm6, -10 * SIZE(B) addq $1 * SIZE, AO1 addq $1 * SIZE, AO2 subq $-8 * SIZE, B ALIGN_3 .L12: movq MM, I sarq $3, I jle .L14 ALIGN_4 .L13: prefetchnta A_PRE(AO1) vmovups 0 * SIZE(AO1), %xmm0 vmovups 0 * SIZE(AO1, LDA), %xmm1 prefetchnta A_PRE(AO1, LDA) vmovups 0 * SIZE(AO1, LDA, 2), %xmm2 vmovups 0 * SIZE(AO1, LDA3), %xmm3 prefetchnta A_PRE(AO1, LDA, 2) vmovups 0 * SIZE(AO1, LDA, 2), %xmm2 vmovups %xmm0, %xmm8 prefetchnta A_PRE(AO1, LDA3) vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm9 vunpcklpd %xmm3, %xmm2 , %xmm2 prefetchnta A_PRE(AO2) vmovups 0 * SIZE(AO2), %xmm4 vmovups 0 * SIZE(AO2, LDA), %xmm5 prefetchnta A_PRE(AO2, LDA) vmovups 0 * SIZE(AO2, LDA, 2), %xmm6 vmovups 0 * SIZE(AO2, LDA3), %xmm7 prefetchnta A_PRE(AO2, LDA, 2) vmovups %xmm4, %xmm10 vunpcklpd %xmm5, %xmm4 , %xmm4 prefetchnta A_PRE(AO2, LDA3) vmovups %xmm6, %xmm11 vunpcklpd %xmm7, %xmm6 , %xmm6 prefetchw B_PRE(B) vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) vmovups %xmm4, -12 * SIZE(B) vmovups %xmm6, -10 * SIZE(B) vunpckhpd %xmm1, %xmm8 , %xmm8 vunpckhpd %xmm3, %xmm9 , %xmm9 vunpckhpd %xmm5, %xmm10, %xmm10 vunpckhpd %xmm7, %xmm11, %xmm11 prefetchw B_PRE+64(B) vmovups %xmm8, -8 * SIZE(B) vmovups %xmm9, -6 * SIZE(B) vmovups %xmm10, -4 * SIZE(B) vmovups %xmm11, -2 * SIZE(B) /***********************************************************************************************/ vmovups 2 * SIZE(AO1), %xmm0 vmovups 2 * SIZE(AO1, LDA), %xmm1 vmovups 2 * SIZE(AO1, LDA, 2), %xmm2 vmovups 2 * SIZE(AO1, LDA3), %xmm3 vmovups %xmm0, %xmm8 vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm9 vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups 2 * SIZE(AO2), %xmm4 vmovups 2 * SIZE(AO2, LDA), %xmm5 vmovups 2 * SIZE(AO2, LDA, 2), %xmm6 vmovups 2 * SIZE(AO2, LDA3), %xmm7 vmovups %xmm4, %xmm10 vunpcklpd %xmm5, %xmm4 , %xmm4 vmovups %xmm6, 
%xmm11 vunpcklpd %xmm7, %xmm6 , %xmm6 prefetchw B_PRE+128(B) vmovups %xmm0, 0 * SIZE(B) vmovups %xmm2, 2 * SIZE(B) vmovups %xmm4, 4 * SIZE(B) vmovups %xmm6, 6 * SIZE(B) vunpckhpd %xmm1, %xmm8 , %xmm8 vunpckhpd %xmm3, %xmm9 , %xmm9 vunpckhpd %xmm5, %xmm10, %xmm10 vunpckhpd %xmm7, %xmm11, %xmm11 prefetchw B_PRE+192(B) vmovups %xmm8, 8 * SIZE(B) vmovups %xmm9, 10 * SIZE(B) vmovups %xmm10, 12 * SIZE(B) vmovups %xmm11, 14 * SIZE(B) /***********************************************************************************************/ vmovups 4 * SIZE(AO1), %xmm0 vmovups 4 * SIZE(AO1, LDA), %xmm1 vmovups 4 * SIZE(AO1, LDA, 2), %xmm2 vmovups 4 * SIZE(AO1, LDA3), %xmm3 vmovups %xmm0, %xmm8 vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm9 vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups 4 * SIZE(AO2), %xmm4 vmovups 4 * SIZE(AO2, LDA), %xmm5 vmovups 4 * SIZE(AO2, LDA, 2), %xmm6 vmovups 4 * SIZE(AO2, LDA3), %xmm7 vmovups %xmm4, %xmm10 vunpcklpd %xmm5, %xmm4 , %xmm4 vmovups %xmm6, %xmm11 vunpcklpd %xmm7, %xmm6 , %xmm6 prefetchw B_PRE+256(B) vmovups %xmm0, 16 * SIZE(B) vmovups %xmm2, 18 * SIZE(B) vmovups %xmm4, 20 * SIZE(B) vmovups %xmm6, 22 * SIZE(B) vunpckhpd %xmm1, %xmm8 , %xmm8 vunpckhpd %xmm3, %xmm9 , %xmm9 vunpckhpd %xmm5, %xmm10, %xmm10 vunpckhpd %xmm7, %xmm11, %xmm11 prefetchw B_PRE+320(B) vmovups %xmm8, 24 * SIZE(B) vmovups %xmm9, 26 * SIZE(B) vmovups %xmm10, 28 * SIZE(B) vmovups %xmm11, 30 * SIZE(B) /***********************************************************************************************/ vmovups 6 * SIZE(AO1), %xmm0 vmovups 6 * SIZE(AO1, LDA), %xmm1 vmovups 6 * SIZE(AO1, LDA, 2), %xmm2 vmovups 6 * SIZE(AO1, LDA3), %xmm3 vmovups %xmm0, %xmm8 vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm9 vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups 6 * SIZE(AO2), %xmm4 vmovups 6 * SIZE(AO2, LDA), %xmm5 vmovups 6 * SIZE(AO2, LDA, 2), %xmm6 vmovups 6 * SIZE(AO2, LDA3), %xmm7 vmovups %xmm4, %xmm10 vunpcklpd %xmm5, %xmm4 , %xmm4 vmovups %xmm6, %xmm11 vunpcklpd %xmm7, %xmm6 , %xmm6 prefetchw B_PRE+384(B) vmovups %xmm0, 32 * SIZE(B) vmovups %xmm2, 34 * SIZE(B) vmovups %xmm4, 36 * SIZE(B) vmovups %xmm6, 38 * SIZE(B) vunpckhpd %xmm1, %xmm8 , %xmm8 vunpckhpd %xmm3, %xmm9 , %xmm9 vunpckhpd %xmm5, %xmm10, %xmm10 vunpckhpd %xmm7, %xmm11, %xmm11 prefetchw B_PRE+448(B) vmovups %xmm8, 40 * SIZE(B) vmovups %xmm9, 42 * SIZE(B) vmovups %xmm10, 44 * SIZE(B) vmovups %xmm11, 46 * SIZE(B) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 subq $-64 * SIZE, B decq I jg .L13 ALIGN_4 .L14: testq $4, MM jle .L16 vmovups 0 * SIZE(AO1), %xmm0 vmovups 0 * SIZE(AO1, LDA), %xmm1 vmovups 0 * SIZE(AO1, LDA, 2), %xmm2 vmovups 0 * SIZE(AO1, LDA3), %xmm3 vmovups 0 * SIZE(AO2), %xmm4 vmovups 0 * SIZE(AO2, LDA), %xmm5 vmovups 0 * SIZE(AO2, LDA, 2), %xmm6 vmovups 0 * SIZE(AO2, LDA3), %xmm7 vmovups %xmm0, %xmm8 vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm9 vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups %xmm4, %xmm10 vunpcklpd %xmm5, %xmm4 , %xmm4 vmovups %xmm6, %xmm11 vunpcklpd %xmm7, %xmm6 , %xmm6 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) vmovups %xmm4, -12 * SIZE(B) vmovups %xmm6, -10 * SIZE(B) vunpckhpd %xmm1, %xmm8 , %xmm8 vunpckhpd %xmm3, %xmm9 , %xmm9 vunpckhpd %xmm5, %xmm10, %xmm10 vunpckhpd %xmm7, %xmm11, %xmm11 vmovups %xmm8, -8 * SIZE(B) vmovups %xmm9, -6 * SIZE(B) vmovups %xmm10, -4 * SIZE(B) vmovups %xmm11, -2 * SIZE(B) vmovups 2 * SIZE(AO1), %xmm0 vmovups 2 * SIZE(AO1, LDA), %xmm1 vmovups 2 * SIZE(AO1, LDA, 2), %xmm2 vmovups 2 * SIZE(AO1, LDA3), %xmm3 vmovups 2 * SIZE(AO2), %xmm4 vmovups 2 * SIZE(AO2, LDA), %xmm5 vmovups 2 * SIZE(AO2, LDA, 
2), %xmm6 vmovups 2 * SIZE(AO2, LDA3), %xmm7 vmovups %xmm0, %xmm8 vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm9 vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups %xmm4, %xmm10 vunpcklpd %xmm5, %xmm4 , %xmm4 vmovups %xmm6, %xmm11 vunpcklpd %xmm7, %xmm6 , %xmm6 vmovups %xmm0, 0 * SIZE(B) vmovups %xmm2, 2 * SIZE(B) vmovups %xmm4, 4 * SIZE(B) vmovups %xmm6, 6 * SIZE(B) vunpckhpd %xmm1, %xmm8 , %xmm8 vunpckhpd %xmm3, %xmm9 , %xmm9 vunpckhpd %xmm5, %xmm10, %xmm10 vunpckhpd %xmm7, %xmm11, %xmm11 vmovups %xmm8, 8 * SIZE(B) vmovups %xmm9, 10 * SIZE(B) vmovups %xmm10, 12 * SIZE(B) vmovups %xmm11, 14 * SIZE(B) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-32 * SIZE, B ALIGN_4 .L16: testq $2, MM jle .L18 vmovups 0 * SIZE(AO1), %xmm0 vmovups 0 * SIZE(AO1, LDA), %xmm1 vmovups 0 * SIZE(AO1, LDA, 2), %xmm2 vmovups 0 * SIZE(AO1, LDA3), %xmm3 vmovups 0 * SIZE(AO2), %xmm4 vmovups 0 * SIZE(AO2, LDA), %xmm5 vmovups 0 * SIZE(AO2, LDA, 2), %xmm6 vmovups 0 * SIZE(AO2, LDA3), %xmm7 vmovups %xmm0, %xmm8 vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm9 vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups %xmm4, %xmm10 vunpcklpd %xmm5, %xmm4 , %xmm4 vmovups %xmm6, %xmm11 vunpcklpd %xmm7, %xmm6 , %xmm6 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) vmovups %xmm4, -12 * SIZE(B) vmovups %xmm6, -10 * SIZE(B) vunpckhpd %xmm1, %xmm8 , %xmm8 vunpckhpd %xmm3, %xmm9 , %xmm9 vunpckhpd %xmm5, %xmm10, %xmm10 vunpckhpd %xmm7, %xmm11, %xmm11 vmovups %xmm8, -8 * SIZE(B) vmovups %xmm9, -6 * SIZE(B) vmovups %xmm10, -4 * SIZE(B) vmovups %xmm11, -2 * SIZE(B) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-16 * SIZE, B ALIGN_4 .L18: testq $1, MM jle .L19 vmovsd 0 * SIZE(AO1), %xmm0 vmovsd 0 * SIZE(AO1, LDA), %xmm1 vmovsd 0 * SIZE(AO1, LDA, 2), %xmm2 vmovsd 0 * SIZE(AO1, LDA3), %xmm3 vmovsd 0 * SIZE(AO2), %xmm4 vmovsd 0 * SIZE(AO2, LDA), %xmm5 vmovsd 0 * SIZE(AO2, LDA, 2), %xmm6 vmovsd 0 * SIZE(AO2, LDA3), %xmm7 vunpcklpd %xmm1, %xmm0 , %xmm0 vunpcklpd %xmm3, %xmm2 , %xmm2 vunpcklpd %xmm5, %xmm4 , %xmm4 vunpcklpd %xmm7, %xmm6 , %xmm6 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) vmovups %xmm4, -12 * SIZE(B) vmovups %xmm6, -10 * SIZE(B) subq $-8 * SIZE, B ALIGN_4 .L19: decq J jg .L11 ALIGN_4 .L20: testq $4, N jle .L30 movq A, AO1 leaq (A, LDA, 2), AO2 leaq (A, LDA, 4), A testq $SIZE, A je .L22 vmovsd 0 * SIZE(AO1), %xmm0 vmovsd 0 * SIZE(AO1, LDA), %xmm1 vmovsd 0 * SIZE(AO2), %xmm2 vmovsd 0 * SIZE(AO2, LDA), %xmm3 vunpcklpd %xmm1, %xmm0 , %xmm0 vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) addq $1 * SIZE, AO1 addq $1 * SIZE, AO2 subq $-4 * SIZE, B ALIGN_3 .L22: movq MM, I sarq $3, I jle .L24 ALIGN_4 .L23: vmovups 0 * SIZE(AO1), %xmm0 vmovups 0 * SIZE(AO1, LDA), %xmm1 vmovups 0 * SIZE(AO2), %xmm2 vmovups 0 * SIZE(AO2, LDA), %xmm3 vmovups %xmm0, %xmm4 vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm6 vunpcklpd %xmm3, %xmm2 , %xmm2 vunpckhpd %xmm1, %xmm4 , %xmm4 vunpckhpd %xmm3, %xmm6 , %xmm6 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) vmovups %xmm4, -12 * SIZE(B) vmovups %xmm6, -10 * SIZE(B) vmovups 2 * SIZE(AO1), %xmm0 vmovups 2 * SIZE(AO1, LDA), %xmm1 vmovups 2 * SIZE(AO2), %xmm2 vmovups 2 * SIZE(AO2, LDA), %xmm3 vmovups %xmm0, %xmm4 vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm6 vunpcklpd %xmm3, %xmm2 , %xmm2 vunpckhpd %xmm1, %xmm4 , %xmm4 vunpckhpd %xmm3, %xmm6 , %xmm6 vmovups %xmm0, -8 * SIZE(B) vmovups %xmm2, -6 * SIZE(B) vmovups %xmm4, -4 * SIZE(B) vmovups %xmm6, -2 * SIZE(B) vmovups 4 * SIZE(AO1), %xmm0 vmovups 4 * SIZE(AO1, LDA), %xmm1 vmovups 4 * SIZE(AO2), 
%xmm2 vmovups 4 * SIZE(AO2, LDA), %xmm3 vmovups %xmm0, %xmm4 vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm6 vunpcklpd %xmm3, %xmm2 , %xmm2 vunpckhpd %xmm1, %xmm4 , %xmm4 vunpckhpd %xmm3, %xmm6 , %xmm6 vmovups %xmm0, 0 * SIZE(B) vmovups %xmm2, 2 * SIZE(B) vmovups %xmm4, 4 * SIZE(B) vmovups %xmm6, 6 * SIZE(B) vmovups 6 * SIZE(AO1), %xmm0 vmovups 6 * SIZE(AO1, LDA), %xmm1 vmovups 6 * SIZE(AO2), %xmm2 vmovups 6 * SIZE(AO2, LDA), %xmm3 vmovups %xmm0, %xmm4 vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm6 vunpcklpd %xmm3, %xmm2 , %xmm2 vunpckhpd %xmm1, %xmm4 , %xmm4 vunpckhpd %xmm3, %xmm6 , %xmm6 vmovups %xmm0, 8 * SIZE(B) vmovups %xmm2, 10 * SIZE(B) vmovups %xmm4, 12 * SIZE(B) vmovups %xmm6, 14 * SIZE(B) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 subq $-32 * SIZE, B decq I jg .L23 ALIGN_4 .L24: testq $4, MM jle .L26 vmovups 0 * SIZE(AO1), %xmm0 vmovups 0 * SIZE(AO1, LDA), %xmm1 vmovups 0 * SIZE(AO2), %xmm2 vmovups 0 * SIZE(AO2, LDA), %xmm3 vmovups %xmm0, %xmm4 vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm6 vunpcklpd %xmm3, %xmm2 , %xmm2 vunpckhpd %xmm1, %xmm4 , %xmm4 vunpckhpd %xmm3, %xmm6 , %xmm6 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) vmovups %xmm4, -12 * SIZE(B) vmovups %xmm6, -10 * SIZE(B) vmovups 2 * SIZE(AO1), %xmm0 vmovups 2 * SIZE(AO1, LDA), %xmm1 vmovups 2 * SIZE(AO2), %xmm2 vmovups 2 * SIZE(AO2, LDA), %xmm3 vmovups %xmm0, %xmm4 vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm6 vunpcklpd %xmm3, %xmm2 , %xmm2 vunpckhpd %xmm1, %xmm4 , %xmm4 vunpckhpd %xmm3, %xmm6 , %xmm6 vmovups %xmm0, -8 * SIZE(B) vmovups %xmm2, -6 * SIZE(B) vmovups %xmm4, -4 * SIZE(B) vmovups %xmm6, -2 * SIZE(B) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-16 * SIZE, B ALIGN_4 .L26: testq $2, MM jle .L28 vmovups 0 * SIZE(AO1), %xmm0 vmovups 0 * SIZE(AO1, LDA), %xmm1 vmovups 0 * SIZE(AO2), %xmm2 vmovups 0 * SIZE(AO2, LDA), %xmm3 vmovups %xmm0, %xmm4 vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm6 vunpcklpd %xmm3, %xmm2 , %xmm2 vunpckhpd %xmm1, %xmm4 , %xmm4 vunpckhpd %xmm3, %xmm6 , %xmm6 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) vmovups %xmm4, -12 * SIZE(B) vmovups %xmm6, -10 * SIZE(B) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-8 * SIZE, B ALIGN_4 .L28: testq $1, MM jle .L30 vmovsd 0 * SIZE(AO1), %xmm0 vmovsd 0 * SIZE(AO1, LDA), %xmm1 vmovsd 0 * SIZE(AO2), %xmm2 vmovsd 0 * SIZE(AO2, LDA), %xmm3 vunpcklpd %xmm1, %xmm0 , %xmm0 vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) subq $-4 * SIZE, B ALIGN_4 .L30: testq $2, N jle .L40 movq A, AO1 leaq (A, LDA), AO2 leaq (A, LDA, 2), A testq $SIZE, A je .L32 vmovsd 0 * SIZE(AO1), %xmm0 vmovsd 0 * SIZE(AO2), %xmm1 vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm0, -16 * SIZE(B) addq $1 * SIZE, AO1 addq $1 * SIZE, AO2 subq $-2 * SIZE, B ALIGN_3 .L32: movq MM, I sarq $3, I jle .L34 ALIGN_4 .L33: vmovups 0 * SIZE(AO1), %xmm0 vmovups 0 * SIZE(AO2), %xmm1 vmovups 2 * SIZE(AO1), %xmm2 vmovups 2 * SIZE(AO2), %xmm3 vmovups %xmm0, %xmm4 vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm6 vunpcklpd %xmm3, %xmm2 , %xmm2 vunpckhpd %xmm1, %xmm4 , %xmm4 vunpckhpd %xmm3, %xmm6 , %xmm6 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm4, -14 * SIZE(B) vmovups %xmm2, -12 * SIZE(B) vmovups %xmm6, -10 * SIZE(B) vmovups 4 * SIZE(AO1), %xmm0 vmovups 4 * SIZE(AO2), %xmm1 vmovups 6 * SIZE(AO1), %xmm2 vmovups 6 * SIZE(AO2), %xmm3 vmovups %xmm0, %xmm4 vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm2, %xmm6 vunpcklpd %xmm3, %xmm2 , %xmm2 vunpckhpd %xmm1, %xmm4 , %xmm4 vunpckhpd %xmm3, %xmm6 , %xmm6 vmovups %xmm0, -8 * SIZE(B) 
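/* Editor's note (not part of the original source): dgemm_ncopy_8_bulldozer.S packs A in
   the same column-interleaved order as the generic SSE2 dgemm_ncopy_8.S earlier in this
   archive, but is written with VEX-encoded three-operand instructions (vmovups,
   vunpcklpd/vunpckhpd, vshufpd) and issues explicit prefetchnta/prefetchw hints tuned by
   the A_PRE/B_PRE constants defined at the top of the file instead of the PREFETCH and
   PREFETCHW macros used by the generic kernel. */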
vmovups %xmm4, -6 * SIZE(B) vmovups %xmm2, -4 * SIZE(B) vmovups %xmm6, -2 * SIZE(B) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 subq $-16 * SIZE, B decq I jg .L33 ALIGN_4 .L34: testq $4, MM jle .L36 vmovups 0 * SIZE(AO1), %xmm0 vmovups 0 * SIZE(AO2), %xmm1 vmovups 2 * SIZE(AO1), %xmm2 vmovups 2 * SIZE(AO2), %xmm3 vmovups %xmm0, %xmm4 vunpcklpd %xmm1, %xmm0 , %xmm0 vunpckhpd %xmm1, %xmm4 , %xmm4 vmovups %xmm2, %xmm6 vunpcklpd %xmm3, %xmm2 , %xmm2 vunpckhpd %xmm3, %xmm6 , %xmm6 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm4, -14 * SIZE(B) vmovups %xmm2, -12 * SIZE(B) vmovups %xmm6, -10 * SIZE(B) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-8 * SIZE, B ALIGN_4 .L36: testq $2, MM jle .L38 vmovups 0 * SIZE(AO1), %xmm0 vmovups 0 * SIZE(AO2), %xmm1 vmovups %xmm0, %xmm2 vunpcklpd %xmm1, %xmm0 , %xmm0 vunpckhpd %xmm1, %xmm2 , %xmm2 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-4 * SIZE, B ALIGN_4 .L38: testq $1, MM jle .L40 vmovsd 0 * SIZE(AO1), %xmm0 vmovsd 0 * SIZE(AO2), %xmm1 vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm0, -16 * SIZE(B) subq $-2 * SIZE, B ALIGN_4 .L40: testq $1, N jle .L999 movq A, AO1 testq $SIZE, A jne .L45 movq MM, I sarq $3, I jle .L42 ALIGN_4 .L41: vmovups 0 * SIZE(AO1), %xmm0 vmovups 2 * SIZE(AO1), %xmm1 vmovups 4 * SIZE(AO1), %xmm2 vmovups 6 * SIZE(AO1), %xmm3 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm1, -14 * SIZE(B) vmovups %xmm2, -12 * SIZE(B) vmovups %xmm3, -10 * SIZE(B) addq $8 * SIZE, AO1 subq $-8 * SIZE, B decq I jg .L41 ALIGN_4 .L42: testq $4, MM jle .L43 vmovups 0 * SIZE(AO1), %xmm0 vmovups 2 * SIZE(AO1), %xmm1 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm1, -14 * SIZE(B) addq $4 * SIZE, AO1 subq $-4 * SIZE, B ALIGN_4 .L43: testq $2, MM jle .L44 vmovups 0 * SIZE(AO1), %xmm0 vmovups %xmm0, -16 * SIZE(B) addq $2 * SIZE, AO1 subq $-2 * SIZE, B ALIGN_4 .L44: testq $1, MM jle .L999 vmovsd 0 * SIZE(AO1), %xmm0 vmovlpd %xmm0, -16 * SIZE(B) jmp .L999 ALIGN_4 .L45: vmovups -1 * SIZE(AO1), %xmm0 movq M, I sarq $3, I jle .L46 ALIGN_4 .L46: vmovups 1 * SIZE(AO1), %xmm1 vmovups 3 * SIZE(AO1), %xmm2 vmovups 5 * SIZE(AO1), %xmm3 vmovups 7 * SIZE(AO1), %xmm4 vshufpd $1, %xmm1, %xmm0 , %xmm0 vshufpd $1, %xmm2, %xmm1 , %xmm1 vshufpd $1, %xmm3, %xmm2 , %xmm2 vshufpd $1, %xmm4, %xmm3 , %xmm3 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm1, -14 * SIZE(B) vmovups %xmm2, -12 * SIZE(B) vmovups %xmm3, -10 * SIZE(B) vmovups %xmm4, %xmm0 addq $8 * SIZE, AO1 subq $-8 * SIZE, B decq I jg .L46 ALIGN_4 .L47: testq $4, M jle .L48 vmovups 1 * SIZE(AO1), %xmm1 vmovups 3 * SIZE(AO1), %xmm2 vshufpd $1, %xmm1, %xmm0 , %xmm0 vshufpd $1, %xmm2, %xmm1 , %xmm1 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm1, -14 * SIZE(B) vmovups %xmm2, %xmm0 addq $4 * SIZE, AO1 addq $4 * SIZE, B ALIGN_4 .L48: testq $2, M jle .L49 vmovups 1 * SIZE(AO1), %xmm1 vshufpd $1, %xmm1, %xmm0 , %xmm0 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm1, %xmm0 addq $2 * SIZE, AO1 subq $-2 * SIZE, B ALIGN_4 .L49: testq $1, M jle .L999 vshufpd $1, %xmm0, %xmm0 , %xmm0 vmovlpd %xmm0, -16 * SIZE(B) jmp .L999 ALIGN_4 .L50: movq N, J sarq $3, J jle .L60 ALIGN_4 .L51: movq A, AO1 leaq (A, LDA, 4), AO2 leaq (A, LDA, 8), A testq $SIZE, A je .L52 vmovsd 0 * SIZE(AO1), %xmm0 vmovsd 0 * SIZE(AO1, LDA), %xmm1 vmovsd 0 * SIZE(AO1, LDA, 2), %xmm2 vmovsd 0 * SIZE(AO1, LDA3), %xmm3 vmovsd 0 * SIZE(AO2), %xmm4 vmovsd 0 * SIZE(AO2, LDA), %xmm5 vmovsd 0 * SIZE(AO2, LDA, 2), %xmm6 vmovsd 0 * SIZE(AO2, LDA3), %xmm7 vunpcklpd %xmm1, %xmm0 , %xmm0 vunpcklpd %xmm3, %xmm2 , %xmm2 vunpcklpd %xmm5, %xmm4 , %xmm4 vunpcklpd %xmm7, 
%xmm6 , %xmm6 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) vmovups %xmm4, -12 * SIZE(B) vmovups %xmm6, -10 * SIZE(B) addq $1 * SIZE, AO1 addq $1 * SIZE, AO2 subq $-8 * SIZE, B ALIGN_3 .L52: vmovups -1 * SIZE(AO1, LDA), %xmm9 vmovups -1 * SIZE(AO1, LDA3), %xmm10 vmovups -1 * SIZE(AO2, LDA), %xmm11 vmovups -1 * SIZE(AO2, LDA3), %xmm12 movq MM, I sarq $3, I jle .L54 ALIGN_4 .L53: vmovups 0 * SIZE(AO1), %xmm0 vmovups 1 * SIZE(AO1, LDA), %xmm1 vmovups 0 * SIZE(AO1, LDA, 2), %xmm2 vmovups 1 * SIZE(AO1, LDA3), %xmm3 vmovups 0 * SIZE(AO2), %xmm4 vmovups 1 * SIZE(AO2, LDA), %xmm5 vmovups 0 * SIZE(AO2, LDA, 2), %xmm6 vmovups 1 * SIZE(AO2, LDA3), %xmm7 vmovsd %xmm0, %xmm9 , %xmm9 vmovsd %xmm2, %xmm10, %xmm10 vmovsd %xmm4, %xmm11, %xmm11 vmovsd %xmm6, %xmm12, %xmm12 vmovups %xmm9, -16 * SIZE(B) vmovups %xmm10, -14 * SIZE(B) vmovups %xmm11, -12 * SIZE(B) vmovups %xmm12, -10 * SIZE(B) vshufpd $1, %xmm1, %xmm0 , %xmm0 vshufpd $1, %xmm3, %xmm2 , %xmm2 vshufpd $1, %xmm5, %xmm4 , %xmm4 vshufpd $1, %xmm7, %xmm6 , %xmm6 vmovups %xmm0, -8 * SIZE(B) vmovups %xmm2, -6 * SIZE(B) vmovups %xmm4, -4 * SIZE(B) vmovups %xmm6, -2 * SIZE(B) vmovups 2 * SIZE(AO1), %xmm0 vmovups 3 * SIZE(AO1, LDA), %xmm9 vmovups 2 * SIZE(AO1, LDA, 2), %xmm2 vmovups 3 * SIZE(AO1, LDA3), %xmm10 vmovups 2 * SIZE(AO2), %xmm4 vmovups 3 * SIZE(AO2, LDA), %xmm11 vmovups 2 * SIZE(AO2, LDA, 2), %xmm6 vmovups 3 * SIZE(AO2, LDA3), %xmm12 vmovsd %xmm0, %xmm1 , %xmm1 vmovsd %xmm2, %xmm3 , %xmm3 vmovsd %xmm4, %xmm5 , %xmm5 vmovsd %xmm6, %xmm7 , %xmm7 vmovups %xmm1, 0 * SIZE(B) vmovups %xmm3, 2 * SIZE(B) vmovups %xmm5, 4 * SIZE(B) vmovups %xmm7, 6 * SIZE(B) vshufpd $1, %xmm9, %xmm0 , %xmm0 vshufpd $1, %xmm10, %xmm2 , %xmm2 vshufpd $1, %xmm11, %xmm4 , %xmm4 vshufpd $1, %xmm12, %xmm6 , %xmm6 vmovups %xmm0, 8 * SIZE(B) vmovups %xmm2, 10 * SIZE(B) vmovups %xmm4, 12 * SIZE(B) vmovups %xmm6, 14 * SIZE(B) vmovups 4 * SIZE(AO1), %xmm0 vmovups 5 * SIZE(AO1, LDA), %xmm1 vmovups 4 * SIZE(AO1, LDA, 2), %xmm2 vmovups 5 * SIZE(AO1, LDA3), %xmm3 vmovups 4 * SIZE(AO2), %xmm4 vmovups 5 * SIZE(AO2, LDA), %xmm5 vmovups 4 * SIZE(AO2, LDA, 2), %xmm6 vmovups 5 * SIZE(AO2, LDA3), %xmm7 vmovsd %xmm0, %xmm9 , %xmm9 vmovsd %xmm2, %xmm10, %xmm10 vmovsd %xmm4, %xmm11, %xmm11 vmovsd %xmm6, %xmm12, %xmm12 vmovups %xmm9, 16 * SIZE(B) vmovups %xmm10, 18 * SIZE(B) vmovups %xmm11, 20 * SIZE(B) vmovups %xmm12, 22 * SIZE(B) vshufpd $1, %xmm1, %xmm0 , %xmm0 vshufpd $1, %xmm3, %xmm2 , %xmm2 vshufpd $1, %xmm5, %xmm4 , %xmm4 vshufpd $1, %xmm7, %xmm6 , %xmm6 vmovups %xmm0, 24 * SIZE(B) vmovups %xmm2, 26 * SIZE(B) vmovups %xmm4, 28 * SIZE(B) vmovups %xmm6, 30 * SIZE(B) vmovups 6 * SIZE(AO1), %xmm0 vmovups 7 * SIZE(AO1, LDA), %xmm9 vmovups 6 * SIZE(AO1, LDA, 2), %xmm2 vmovups 7 * SIZE(AO1, LDA3), %xmm10 vmovups 6 * SIZE(AO2), %xmm4 vmovups 7 * SIZE(AO2, LDA), %xmm11 vmovups 6 * SIZE(AO2, LDA, 2), %xmm6 vmovups 7 * SIZE(AO2, LDA3), %xmm12 vmovsd %xmm0, %xmm1 , %xmm1 vmovsd %xmm2, %xmm3 , %xmm3 vmovsd %xmm4, %xmm5 , %xmm5 vmovsd %xmm6, %xmm7 , %xmm7 vmovups %xmm1, 32 * SIZE(B) vmovups %xmm3, 34 * SIZE(B) vmovups %xmm5, 36 * SIZE(B) vmovups %xmm7, 38 * SIZE(B) vshufpd $1, %xmm9, %xmm0 , %xmm0 vshufpd $1, %xmm10, %xmm2 , %xmm2 vshufpd $1, %xmm11, %xmm4 , %xmm4 vshufpd $1, %xmm12, %xmm6 , %xmm6 vmovups %xmm0, 40 * SIZE(B) vmovups %xmm2, 42 * SIZE(B) vmovups %xmm4, 44 * SIZE(B) vmovups %xmm6, 46 * SIZE(B) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 subq $-64 * SIZE, B decq I jg .L53 ALIGN_4 .L54: testq $4, MM jle .L56 vmovups 0 * SIZE(AO1), %xmm0 vmovups 1 * SIZE(AO1, LDA), %xmm1 vmovups 0 * 
SIZE(AO1, LDA, 2), %xmm2 vmovups 1 * SIZE(AO1, LDA3), %xmm3 vmovups 0 * SIZE(AO2), %xmm4 vmovups 1 * SIZE(AO2, LDA), %xmm5 vmovups 0 * SIZE(AO2, LDA, 2), %xmm6 vmovups 1 * SIZE(AO2, LDA3), %xmm7 vmovsd %xmm0, %xmm9 , %xmm9 vmovsd %xmm2, %xmm10, %xmm10 vmovsd %xmm4, %xmm11, %xmm11 vmovsd %xmm6, %xmm12, %xmm12 vmovups %xmm9, -16 * SIZE(B) vmovups %xmm10, -14 * SIZE(B) vmovups %xmm11, -12 * SIZE(B) vmovups %xmm12, -10 * SIZE(B) vshufpd $1, %xmm1, %xmm0 , %xmm0 vshufpd $1, %xmm3, %xmm2 , %xmm2 vshufpd $1, %xmm5, %xmm4 , %xmm4 vshufpd $1, %xmm7, %xmm6 , %xmm6 vmovups %xmm0, -8 * SIZE(B) vmovups %xmm2, -6 * SIZE(B) vmovups %xmm4, -4 * SIZE(B) vmovups %xmm6, -2 * SIZE(B) vmovups 2 * SIZE(AO1), %xmm0 vmovups 3 * SIZE(AO1, LDA), %xmm9 vmovups 2 * SIZE(AO1, LDA, 2), %xmm2 vmovups 3 * SIZE(AO1, LDA3), %xmm10 vmovups 2 * SIZE(AO2), %xmm4 vmovups 3 * SIZE(AO2, LDA), %xmm11 vmovups 2 * SIZE(AO2, LDA, 2), %xmm6 vmovups 3 * SIZE(AO2, LDA3), %xmm12 vmovsd %xmm0, %xmm1 , %xmm1 vmovsd %xmm2, %xmm3 , %xmm3 vmovsd %xmm4, %xmm5 , %xmm5 vmovsd %xmm6, %xmm7 , %xmm7 vmovups %xmm1, 0 * SIZE(B) vmovups %xmm3, 2 * SIZE(B) vmovups %xmm5, 4 * SIZE(B) vmovups %xmm7, 6 * SIZE(B) vshufpd $1, %xmm9, %xmm0 , %xmm0 vshufpd $1, %xmm10, %xmm2 , %xmm2 vshufpd $1, %xmm11, %xmm4 , %xmm4 vshufpd $1, %xmm12, %xmm6 , %xmm6 vmovups %xmm0, 8 * SIZE(B) vmovups %xmm2, 10 * SIZE(B) vmovups %xmm4, 12 * SIZE(B) vmovups %xmm6, 14 * SIZE(B) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-32 * SIZE, B ALIGN_4 .L56: testq $2, MM jle .L58 vmovups 0 * SIZE(AO1), %xmm0 vmovups 1 * SIZE(AO1, LDA), %xmm1 vmovups 0 * SIZE(AO1, LDA, 2), %xmm2 vmovups 1 * SIZE(AO1, LDA3), %xmm3 vmovups 0 * SIZE(AO2), %xmm4 vmovups 1 * SIZE(AO2, LDA), %xmm5 vmovups 0 * SIZE(AO2, LDA, 2), %xmm6 vmovups 1 * SIZE(AO2, LDA3), %xmm7 vmovsd %xmm0, %xmm9 , %xmm9 vmovsd %xmm2, %xmm10, %xmm10 vmovsd %xmm4, %xmm11, %xmm11 vmovsd %xmm6, %xmm12, %xmm12 vmovups %xmm9, -16 * SIZE(B) vmovups %xmm10, -14 * SIZE(B) vmovups %xmm11, -12 * SIZE(B) vmovups %xmm12, -10 * SIZE(B) vshufpd $1, %xmm1, %xmm0 , %xmm0 vshufpd $1, %xmm3, %xmm2 , %xmm2 vshufpd $1, %xmm5, %xmm4 , %xmm4 vshufpd $1, %xmm7, %xmm6 , %xmm6 vmovups %xmm0, -8 * SIZE(B) vmovups %xmm2, -6 * SIZE(B) vmovups %xmm4, -4 * SIZE(B) vmovups %xmm6, -2 * SIZE(B) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-16 * SIZE, B ALIGN_4 .L58: testq $1, MM jle .L59 vmovsd 0 * SIZE(AO1), %xmm0 vmovsd 0 * SIZE(AO1, LDA), %xmm1 vmovsd 0 * SIZE(AO1, LDA, 2), %xmm2 vmovsd 0 * SIZE(AO1, LDA3), %xmm3 vmovsd 0 * SIZE(AO2), %xmm4 vmovsd 0 * SIZE(AO2, LDA), %xmm5 vmovsd 0 * SIZE(AO2, LDA, 2), %xmm6 vmovsd 0 * SIZE(AO2, LDA3), %xmm7 vunpcklpd %xmm1, %xmm0 , %xmm0 vunpcklpd %xmm3, %xmm2 , %xmm2 vunpcklpd %xmm5, %xmm4 , %xmm4 vunpcklpd %xmm7, %xmm6 , %xmm6 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) vmovups %xmm4, -12 * SIZE(B) vmovups %xmm6, -10 * SIZE(B) subq $-8 * SIZE, B ALIGN_4 .L59: decq J jg .L51 ALIGN_4 .L60: testq $4, N jle .L70 movq A, AO1 leaq (A, LDA, 2), AO2 leaq (A, LDA, 4), A testq $SIZE, A je .L62 vmovsd 0 * SIZE(AO1), %xmm0 vmovsd 0 * SIZE(AO1, LDA), %xmm1 vmovsd 0 * SIZE(AO2), %xmm2 vmovsd 0 * SIZE(AO2, LDA), %xmm3 vunpcklpd %xmm1, %xmm0 , %xmm0 vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) addq $1 * SIZE, AO1 addq $1 * SIZE, AO2 subq $-4 * SIZE, B ALIGN_3 .L62: vmovups -1 * SIZE(AO1, LDA), %xmm5 vmovups -1 * SIZE(AO2, LDA), %xmm7 movq MM, I sarq $3, I jle .L64 ALIGN_4 .L63: vmovups 0 * SIZE(AO1), %xmm0 vmovups 1 * SIZE(AO1, LDA), %xmm1 vmovups 0 * SIZE(AO2), %xmm2 vmovups 1 * 
SIZE(AO2, LDA), %xmm3 vmovsd %xmm0, %xmm5 , %xmm5 vmovsd %xmm2, %xmm7 , %xmm7 vshufpd $1, %xmm1, %xmm0 , %xmm0 vshufpd $1, %xmm3, %xmm2 , %xmm2 vmovups %xmm5, -16 * SIZE(B) vmovups %xmm7, -14 * SIZE(B) vmovups %xmm0, -12 * SIZE(B) vmovups %xmm2, -10 * SIZE(B) vmovups 2 * SIZE(AO1), %xmm0 vmovups 3 * SIZE(AO1, LDA), %xmm5 vmovups 2 * SIZE(AO2), %xmm2 vmovups 3 * SIZE(AO2, LDA), %xmm7 vmovsd %xmm0, %xmm1 , %xmm1 vmovsd %xmm2, %xmm3 , %xmm3 vshufpd $1, %xmm5, %xmm0 , %xmm0 vshufpd $1, %xmm7, %xmm2 , %xmm2 vmovups %xmm1, -8 * SIZE(B) vmovups %xmm3, -6 * SIZE(B) vmovups %xmm0, -4 * SIZE(B) vmovups %xmm2, -2 * SIZE(B) vmovups 4 * SIZE(AO1), %xmm0 vmovups 5 * SIZE(AO1, LDA), %xmm1 vmovups 4 * SIZE(AO2), %xmm2 vmovups 5 * SIZE(AO2, LDA), %xmm3 vmovsd %xmm0, %xmm5 , %xmm5 vmovsd %xmm2, %xmm7 , %xmm7 vshufpd $1, %xmm1, %xmm0 , %xmm0 vshufpd $1, %xmm3, %xmm2 , %xmm2 vmovups %xmm5, 0 * SIZE(B) vmovups %xmm7, 2 * SIZE(B) vmovups %xmm0, 4 * SIZE(B) vmovups %xmm2, 6 * SIZE(B) vmovups 6 * SIZE(AO1), %xmm0 vmovups 7 * SIZE(AO1, LDA), %xmm5 vmovups 6 * SIZE(AO2), %xmm2 vmovups 7 * SIZE(AO2, LDA), %xmm7 vmovsd %xmm0, %xmm1 , %xmm1 vmovsd %xmm2, %xmm3 , %xmm3 vshufpd $1, %xmm5, %xmm0 , %xmm0 vshufpd $1, %xmm7, %xmm2 , %xmm2 vmovups %xmm1, 8 * SIZE(B) vmovups %xmm3, 10 * SIZE(B) vmovups %xmm0, 12 * SIZE(B) vmovups %xmm2, 14 * SIZE(B) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 subq $-32 * SIZE, B decq I jg .L63 ALIGN_4 .L64: testq $4, MM jle .L66 vmovups 0 * SIZE(AO1), %xmm0 vmovups 1 * SIZE(AO1, LDA), %xmm1 vmovups 0 * SIZE(AO2), %xmm2 vmovups 1 * SIZE(AO2, LDA), %xmm3 vmovsd %xmm0, %xmm5 , %xmm5 vshufpd $1, %xmm1, %xmm0 , %xmm0 vmovsd %xmm2, %xmm7 , %xmm7 vshufpd $1, %xmm3, %xmm2 , %xmm2 vmovups %xmm5, -16 * SIZE(B) vmovups %xmm7, -14 * SIZE(B) vmovups %xmm0, -12 * SIZE(B) vmovups %xmm2, -10 * SIZE(B) vmovups 2 * SIZE(AO1), %xmm0 vmovups 3 * SIZE(AO1, LDA), %xmm5 vmovups 2 * SIZE(AO2), %xmm2 vmovups 3 * SIZE(AO2, LDA), %xmm7 vmovsd %xmm0, %xmm1 , %xmm1 vshufpd $1, %xmm5, %xmm0 , %xmm0 vmovsd %xmm2, %xmm3 , %xmm3 vshufpd $1, %xmm7, %xmm2 , %xmm2 vmovups %xmm1, -8 * SIZE(B) vmovups %xmm3, -6 * SIZE(B) vmovups %xmm0, -4 * SIZE(B) vmovups %xmm2, -2 * SIZE(B) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-16 * SIZE, B ALIGN_4 .L66: testq $2, MM jle .L68 vmovups 0 * SIZE(AO1), %xmm0 vmovups 1 * SIZE(AO1, LDA), %xmm1 vmovups 0 * SIZE(AO2), %xmm2 vmovups 1 * SIZE(AO2, LDA), %xmm3 vmovsd %xmm0, %xmm5 , %xmm5 vmovsd %xmm2, %xmm7 , %xmm7 vshufpd $1, %xmm1, %xmm0 , %xmm0 vshufpd $1, %xmm3, %xmm2 , %xmm2 vmovups %xmm5, -16 * SIZE(B) vmovups %xmm7, -14 * SIZE(B) vmovups %xmm0, -12 * SIZE(B) vmovups %xmm2, -10 * SIZE(B) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-8 * SIZE, B ALIGN_4 .L68: testq $1, MM jle .L70 vmovsd 0 * SIZE(AO1), %xmm0 vmovsd 0 * SIZE(AO1, LDA), %xmm1 vmovsd 0 * SIZE(AO2), %xmm2 vmovsd 0 * SIZE(AO2, LDA), %xmm3 vunpcklpd %xmm1, %xmm0 , %xmm0 vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) subq $-4 * SIZE, B ALIGN_4 .L70: testq $2, N jle .L80 movq A, AO1 leaq (A, LDA), AO2 leaq (A, LDA, 2), A testq $SIZE, A je .L72 vmovsd 0 * SIZE(AO1), %xmm0 vmovsd 0 * SIZE(AO2), %xmm1 vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm0, -16 * SIZE(B) addq $1 * SIZE, AO1 addq $1 * SIZE, AO2 subq $-2 * SIZE, B ALIGN_3 .L72: vmovups -1 * SIZE(AO2), %xmm5 movq MM, I sarq $3, I jle .L74 ALIGN_4 .L73: vmovups 0 * SIZE(AO1), %xmm0 vmovups 1 * SIZE(AO2), %xmm1 vmovups 2 * SIZE(AO1), %xmm2 vmovups 3 * SIZE(AO2), %xmm3 vmovsd %xmm0, %xmm5 , %xmm5 vshufpd $1, %xmm1, %xmm0 , %xmm0 vmovsd %xmm2, 
%xmm1 , %xmm1 vshufpd $1, %xmm3, %xmm2 , %xmm2 vmovups %xmm5, -16 * SIZE(B) vmovups %xmm0, -14 * SIZE(B) vmovups %xmm1, -12 * SIZE(B) vmovups %xmm2, -10 * SIZE(B) vmovups 4 * SIZE(AO1), %xmm0 vmovups 5 * SIZE(AO2), %xmm1 vmovups 6 * SIZE(AO1), %xmm2 vmovups 7 * SIZE(AO2), %xmm5 vmovsd %xmm0, %xmm3 , %xmm3 vshufpd $1, %xmm1, %xmm0 , %xmm0 vmovsd %xmm2, %xmm1 , %xmm1 vshufpd $1, %xmm5, %xmm2 , %xmm2 vmovups %xmm3, -8 * SIZE(B) vmovups %xmm0, -6 * SIZE(B) vmovups %xmm1, -4 * SIZE(B) vmovups %xmm2, -2 * SIZE(B) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 subq $-16 * SIZE, B decq I jg .L73 ALIGN_4 .L74: testq $4, MM jle .L76 vmovups 0 * SIZE(AO1), %xmm0 vmovups 1 * SIZE(AO2), %xmm1 vmovups 2 * SIZE(AO1), %xmm2 vmovups 3 * SIZE(AO2), %xmm3 vmovsd %xmm0, %xmm5 , %xmm5 vshufpd $1, %xmm1, %xmm0 , %xmm0 vmovsd %xmm2, %xmm1 , %xmm1 vshufpd $1, %xmm3, %xmm2 , %xmm2 vmovups %xmm5, -16 * SIZE(B) vmovups %xmm0, -14 * SIZE(B) vmovups %xmm1, -12 * SIZE(B) vmovups %xmm2, -10 * SIZE(B) vmovups %xmm3, %xmm5 addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-8 * SIZE, B ALIGN_4 .L76: testq $2, MM jle .L78 vmovups 0 * SIZE(AO1), %xmm0 vmovups 1 * SIZE(AO2), %xmm1 vmovsd %xmm0, %xmm5 , %xmm5 vshufpd $1, %xmm1, %xmm0 , %xmm0 vmovups %xmm5, -16 * SIZE(B) vmovups %xmm0, -14 * SIZE(B) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-4 * SIZE, B ALIGN_4 .L78: testq $1, MM jle .L80 vmovsd 0 * SIZE(AO1), %xmm0 vmovsd 0 * SIZE(AO2), %xmm1 vunpcklpd %xmm1, %xmm0 , %xmm0 vmovups %xmm0, -16 * SIZE(B) subq $-2 * SIZE, B ALIGN_4 .L80: testq $1, N jle .L999 movq A, AO1 testq $SIZE, A jne .L85 movq MM, I sarq $3, I jle .L82 ALIGN_4 .L81: vmovups 0 * SIZE(AO1), %xmm0 vmovups 2 * SIZE(AO1), %xmm2 vmovups 4 * SIZE(AO1), %xmm4 vmovups 6 * SIZE(AO1), %xmm6 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) vmovups %xmm4, -12 * SIZE(B) vmovups %xmm6, -10 * SIZE(B) addq $8 * SIZE, AO1 subq $-8 * SIZE, B decq I jg .L81 ALIGN_4 .L82: testq $4, MM jle .L83 vmovups 0 * SIZE(AO1), %xmm0 vmovups 2 * SIZE(AO1), %xmm2 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm2, -14 * SIZE(B) addq $4 * SIZE, AO1 subq $-4 * SIZE, B ALIGN_4 .L83: testq $2, MM jle .L84 vmovups 0 * SIZE(AO1), %xmm0 vmovups %xmm0, -16 * SIZE(B) addq $2 * SIZE, AO1 subq $-2 * SIZE, B ALIGN_4 .L84: testq $1, MM jle .L999 vmovsd 0 * SIZE(AO1), %xmm0 vmovlpd %xmm0, -16 * SIZE(B) jmp .L999 ALIGN_4 .L85: vmovups -1 * SIZE(AO1), %xmm0 movq M, I sarq $3, I jle .L86 ALIGN_4 .L86: vmovups 1 * SIZE(AO1), %xmm1 vmovups 3 * SIZE(AO1), %xmm2 vmovups 5 * SIZE(AO1), %xmm3 vmovups 7 * SIZE(AO1), %xmm4 vshufpd $1, %xmm1, %xmm0 , %xmm0 vshufpd $1, %xmm2, %xmm1 , %xmm1 vshufpd $1, %xmm3, %xmm2 , %xmm2 vshufpd $1, %xmm4, %xmm3 , %xmm3 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm1, -14 * SIZE(B) vmovups %xmm2, -12 * SIZE(B) vmovups %xmm3, -10 * SIZE(B) vmovups %xmm4, %xmm0 addq $8 * SIZE, AO1 subq $-8 * SIZE, B decq I jg .L86 ALIGN_4 .L87: testq $4, M jle .L88 vmovups 1 * SIZE(AO1), %xmm1 vmovups 3 * SIZE(AO1), %xmm2 vshufpd $1, %xmm1, %xmm0 , %xmm0 vshufpd $1, %xmm2, %xmm1 , %xmm1 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm1, -14 * SIZE(B) vmovups %xmm2, %xmm0 addq $4 * SIZE, AO1 addq $4 * SIZE, B ALIGN_4 .L88: testq $2, M jle .L89 vmovups 1 * SIZE(AO1), %xmm1 vshufpd $1, %xmm1, %xmm0 , %xmm0 vmovups %xmm0, -16 * SIZE(B) vmovups %xmm1, %xmm0 addq $2 * SIZE, AO1 subq $-2 * SIZE, B ALIGN_4 .L89: testq $1, M jle .L999 vshufpd $1, %xmm0, %xmm0 , %xmm0 vmovlpd %xmm0, -16 * SIZE(B) ALIGN_4 .L999: #ifdef WINDOWS_ABI vmovups 0(%rsp), %xmm6 vmovups 16(%rsp), %xmm7 vmovups 32(%rsp), %xmm8 vmovups 48(%rsp), %xmm9 
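/* Editor's note (not part of the original source): under WINDOWS_ABI the Windows x64
   calling convention treats xmm6-xmm15 as callee-saved, so the prologue spilled
   xmm6-xmm12 into the local stack frame and this .L999 epilogue restores them, releases
   the frame, then pops r12/r13 (plus r14/r15 on Windows) in the reverse of the order
   they were pushed before returning. */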
vmovups 64(%rsp), %xmm10 vmovups 80(%rsp), %xmm11 vmovups 96(%rsp), %xmm12 addq $STACKSIZE, %rsp #endif popq %r12 popq %r13 #ifdef WINDOWS_ABI popq %r14 popq %r15 #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/dgemm_tcopy_2.S000066400000000000000000000166471313527062700202540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if defined(PENTIUM4) || defined(GENERIC) #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifdef NEHALEM #define PREFETCHSIZE 12 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define MOVUPS_A movups #endif #ifdef SANDYBRIDGE #define PREFETCHSIZE 12 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define MOVUPS_A movups #endif #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifdef OPTERON #define PREFETCHSIZE 16 #define PREFETCH prefetch #define PREFETCHW prefetchw #endif #ifdef MOVUPS_A #define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS #else #define MOVUPS_A1(OFF, ADDR, REGS) movsd OFF(ADDR), REGS; movhps OFF + 8(ADDR), REGS #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) movsd OFF(ADDR, BASE, SCALE), REGS; movhps OFF + 8(ADDR, BASE, SCALE), REGS #endif #ifndef WINDOWS_ABI #define N ARG1 /* rsi */ #define M ARG2 /* rdi */ #define A ARG3 /* rdx */ #define LDA ARG4 /* rcx */ #define B ARG5 /* r8 */ #define AO1 %r9 #define AO2 %r10 #define LDA3 %r11 #define M8 %r12 #else #define N ARG1 /* rdx */ #define M ARG2 /* rcx */ #define A ARG3 /* r8 */ #define LDA ARG4 /* r9 */ #define OLD_B 40 + 40(%rsp) #define B %r12 #define AO1 %rsi #define AO2 %rdi #define LDA3 %r10 #define M8 %r11 #endif #define I %rax #define B0 %rbp #define B3 %r13 PROLOGUE PROFCODE #ifdef WINDOWS_ABI pushq %rdi pushq %rsi #endif pushq %r12 pushq %r13 pushq %rbp #ifdef WINDOWS_ABI movq OLD_B, B #endif subq $-16 * SIZE, B movq M, B3 andq $-2, B3 imulq N, B3 leaq (B, B3, SIZE), B3 leaq (,LDA, SIZE), LDA leaq (LDA, LDA, 2), LDA3 leaq (, N, SIZE), M8 cmpq $2, N jl .L40 ALIGN_4 .L31: subq $2, N movq A, AO1 leaq (A, LDA), AO2 leaq (A, LDA, 2), A movq B, B0 addq $4 * SIZE, B movq M, I sarq $3, I jle .L34 ALIGN_4 .L33: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) #endif MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A1(2 * SIZE, AO1, %xmm1) MOVUPS_A1(0 * SIZE, AO2, %xmm2) MOVUPS_A1(2 * SIZE, AO2, %xmm3) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) #endif movaps %xmm0, -16 * SIZE(B0) movaps %xmm2, -14 * SIZE(B0) movaps %xmm1, -16 * SIZE(B0, M8, 2) movaps %xmm3, -14 * SIZE(B0, M8, 2) leaq (B0, M8, 4), B0 #ifdef PREFETCH PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) #endif MOVUPS_A1(4 * SIZE, AO1, %xmm0) MOVUPS_A1(6 * SIZE, AO1, %xmm1) MOVUPS_A1(4 * SIZE, AO2, %xmm2) MOVUPS_A1(6 * SIZE, AO2, %xmm3) movaps %xmm0, -16 * SIZE(B0) movaps %xmm2, -14 * SIZE(B0) movaps %xmm1, -16 * SIZE(B0, M8, 2) movaps %xmm3, -14 * SIZE(B0, M8, 2) leaq (B0, M8, 4), B0 addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 decq I jg .L33 ALIGN_4 .L34: testq $4, M jle .L36 MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A1(2 * SIZE, AO1, %xmm1) MOVUPS_A1(0 * SIZE, AO2, %xmm2) MOVUPS_A1(2 * SIZE, AO2, %xmm3) movaps %xmm0, -16 * SIZE(B0) movaps %xmm2, -14 * SIZE(B0) movaps %xmm1, -16 * SIZE(B0, M8, 2) movaps %xmm3, -14 * SIZE(B0, M8, 2) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 leaq (B0, M8, 4), B0 ALIGN_4 .L36: testq $2, M jle .L38 MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A1(0 * SIZE, AO2, %xmm1) movaps %xmm0, -16 * SIZE(B0) movaps %xmm1, -14 * SIZE(B0) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 leaq (B0, M8, 2), B0 ALIGN_4 .L38: testq $1, M jle .L39 movsd 0 * SIZE(AO1), %xmm0 movhpd 0 * SIZE(AO2), %xmm0 movaps %xmm0, -16 
* SIZE(B3) subq $-2 * SIZE, B3 ALIGN_4 .L39: cmpq $2, N jge .L31 ALIGN_4 .L40: cmpq $1, N jl .L999 movq A, AO1 movq B, B0 movq M, I sarq $3, I jle .L44 ALIGN_4 .L43: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) #endif MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A1(2 * SIZE, AO1, %xmm1) MOVUPS_A1(4 * SIZE, AO1, %xmm2) MOVUPS_A1(6 * SIZE, AO1, %xmm3) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) #endif addq $8 * SIZE, AO1 movaps %xmm0, -16 * SIZE(B0) movaps %xmm1, -16 * SIZE(B0, M8, 2) leaq (B0, M8, 4), B0 movaps %xmm2, -16 * SIZE(B0) movaps %xmm3, -16 * SIZE(B0, M8, 2) leaq (B0, M8, 4), B0 decq I jg .L43 ALIGN_4 .L44: testq $4, M jle .L45 MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A1(2 * SIZE, AO1, %xmm1) addq $4 * SIZE, AO1 movaps %xmm0, -16 * SIZE(B0) movaps %xmm1, -16 * SIZE(B0, M8, 2) leaq (B0, M8, 4), B0 ALIGN_4 .L45: testq $2, M jle .L46 MOVUPS_A1(0 * SIZE, AO1, %xmm0) movaps %xmm0, -16 * SIZE(B0) addq $2 * SIZE, AO1 ALIGN_4 .L46: testq $1, M jle .L999 movsd 0 * SIZE(AO1), %xmm0 movlpd %xmm0, -16 * SIZE(B3) ALIGN_4 .L999: popq %rbp popq %r13 popq %r12 #ifdef WINDOWS_ABI popq %rsi popq %rdi #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/dgemm_tcopy_4.S000066400000000000000000000254541313527062700202520ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if defined(PENTIUM4) || defined(GENERIC) #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifdef NEHALEM #define PREFETCHSIZE 12 #define PREFETCH prefetcht0 #define MOVUPS_A movups #endif #ifdef SANDYBRIDGE #define PREFETCHSIZE 12 #define PREFETCH prefetcht0 #define MOVUPS_A movups #endif #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifdef OPTERON #define PREFETCHSIZE 16 #define PREFETCH prefetch #define PREFETCHW prefetchw #endif #ifdef MOVUPS_A #define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS #else #define MOVUPS_A1(OFF, ADDR, REGS) movsd OFF(ADDR), REGS; movhps OFF + 8(ADDR), REGS #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) movsd OFF(ADDR, BASE, SCALE), REGS; movhps OFF + 8(ADDR, BASE, SCALE), REGS #endif #ifndef WINDOWS_ABI #define N ARG1 /* rsi */ #define M ARG2 /* rdi */ #define A ARG3 /* rdx */ #define LDA ARG4 /* rcx */ #define B ARG5 /* r8 */ #define AO1 %r9 #define AO2 %r10 #define LDA3 %r11 #define M8 %r12 #else #define STACKSIZE 256 #define N ARG1 /* rdx */ #define M ARG2 /* rcx */ #define A ARG3 /* r8 */ #define LDA ARG4 /* r9 */ #define OLD_B 64 + 32 + STACKSIZE(%rsp) #define B %r12 #define AO1 %rsi #define AO2 %rdi #define LDA3 %r10 #define M8 %r11 #endif #define I %rax #define B0 %rbp #define B2 %r14 #define B3 %r15 PROLOGUE PROFCODE #ifdef WINDOWS_ABI pushq %rdi pushq %rsi #endif pushq %r15 pushq %r14 pushq %r13 pushq %r12 pushq %rbp #ifdef WINDOWS_ABI movq OLD_B, B #endif subq $-16 * SIZE, B movq M, B2 movq M, B3 andq $-4, B2 andq $-2, B3 imulq N, B2 imulq N, B3 leaq (B, B2, SIZE), B2 leaq (B, B3, SIZE), B3 leaq (,LDA, SIZE), LDA leaq (LDA, LDA, 2), LDA3 leaq (, N, SIZE), M8 cmpq $4, N jl .L30 ALIGN_4 .L21: subq $4, N movq A, AO1 leaq (A, LDA, 2), AO2 leaq (A, LDA, 4), A movq B, B0 addq $16 * SIZE, B movq M, I sarq $3, I jle .L24 ALIGN_4 .L23: #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1) #endif MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A1(2 * SIZE, AO1, %xmm1) MOVUPS_A1(4 * SIZE, AO1, %xmm2) MOVUPS_A1(6 * SIZE, AO1, %xmm3) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) #endif movaps %xmm0, -16 * SIZE(B0) movaps %xmm1, -14 * SIZE(B0) movaps %xmm2, -16 * SIZE(B0, M8, 4) movaps %xmm3, -14 * SIZE(B0, M8, 4) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) #endif MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0) MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1) MOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2) MOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) #endif movaps %xmm0, -12 * SIZE(B0) movaps %xmm1, -10 * SIZE(B0) movaps %xmm2, -12 * SIZE(B0, M8, 4) movaps %xmm3, -10 * SIZE(B0, M8, 4) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO2) #endif MOVUPS_A1(0 * SIZE, AO2, %xmm0) MOVUPS_A1(2 * SIZE, AO2, %xmm1) MOVUPS_A1(4 * SIZE, AO2, %xmm2) MOVUPS_A1(6 * SIZE, AO2, %xmm3) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 16) * SIZE(B) #endif movaps %xmm0, -8 * SIZE(B0) movaps %xmm1, -6 * SIZE(B0) movaps %xmm2, -8 * SIZE(B0, M8, 4) movaps %xmm3, -6 * SIZE(B0, M8, 4) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO2, LDA) #endif MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0) MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1) MOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2) MOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3) 
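/* Continuation of the unrolled .L23 body: the last source column of the current 4-wide panel (AO2 + LDA) has just been loaded; its eight values are stored next, completing an 8-row by 4-column tile of the copy buffer before B0 advances by 8 * M8 bytes. */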
#ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 24) * SIZE(B) #endif movaps %xmm0, -4 * SIZE(B0) movaps %xmm1, -2 * SIZE(B0) movaps %xmm2, -4 * SIZE(B0, M8, 4) movaps %xmm3, -2 * SIZE(B0, M8, 4) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 leaq (B0, M8, 8), B0 decq I jg .L23 ALIGN_4 .L24: testq $4, M jle .L26 MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A1(2 * SIZE, AO1, %xmm1) MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2) MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3) movaps %xmm0, -16 * SIZE(B0) movaps %xmm1, -14 * SIZE(B0) movaps %xmm2, -12 * SIZE(B0) movaps %xmm3, -10 * SIZE(B0) MOVUPS_A1(0 * SIZE, AO2, %xmm0) MOVUPS_A1(2 * SIZE, AO2, %xmm1) MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2) MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3) movaps %xmm0, -8 * SIZE(B0) movaps %xmm1, -6 * SIZE(B0) movaps %xmm2, -4 * SIZE(B0) movaps %xmm3, -2 * SIZE(B0) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 leaq (B0, M8, 4), B0 ALIGN_4 .L26: testq $2, M jle .L28 MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1) MOVUPS_A1(0 * SIZE, AO2, %xmm2) MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm3) movaps %xmm0, -16 * SIZE(B2) movaps %xmm1, -14 * SIZE(B2) movaps %xmm2, -12 * SIZE(B2) movaps %xmm3, -10 * SIZE(B2) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-8 * SIZE, B2 ALIGN_4 .L28: testq $1, M jle .L29 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO1, LDA), %xmm1 movsd 0 * SIZE(AO2), %xmm2 movsd 0 * SIZE(AO2, LDA), %xmm3 unpcklpd %xmm1, %xmm0 unpcklpd %xmm3, %xmm2 movaps %xmm0, -16 * SIZE(B3) movaps %xmm2, -14 * SIZE(B3) subq $-4 * SIZE, B3 ALIGN_4 .L29: cmpq $4, N jge .L21 ALIGN_4 .L30: cmpq $2, N jl .L40 subq $2, N movq A, AO1 leaq (A, LDA), AO2 leaq (A, LDA, 2), A movq B, B0 addq $8 * SIZE, B movq M, I sarq $3, I jle .L34 ALIGN_4 .L33: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) #endif MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A1(2 * SIZE, AO1, %xmm1) MOVUPS_A1(4 * SIZE, AO1, %xmm2) MOVUPS_A1(6 * SIZE, AO1, %xmm3) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) #endif movaps %xmm0, -16 * SIZE(B0) movaps %xmm1, -14 * SIZE(B0) movaps %xmm2, -16 * SIZE(B0, M8, 4) movaps %xmm3, -14 * SIZE(B0, M8, 4) #ifdef PREFETCH PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) #endif MOVUPS_A1(0 * SIZE, AO2, %xmm0) MOVUPS_A1(2 * SIZE, AO2, %xmm1) MOVUPS_A1(4 * SIZE, AO2, %xmm2) MOVUPS_A1(6 * SIZE, AO2, %xmm3) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) #endif movaps %xmm0, -12 * SIZE(B0) movaps %xmm1, -10 * SIZE(B0) movaps %xmm2, -12 * SIZE(B0, M8, 4) movaps %xmm3, -10 * SIZE(B0, M8, 4) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 leaq (B0, M8, 8), B0 decq I jg .L33 ALIGN_4 .L34: testq $4, M jle .L36 MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A1(2 * SIZE, AO1, %xmm1) MOVUPS_A1(0 * SIZE, AO2, %xmm2) MOVUPS_A1(2 * SIZE, AO2, %xmm3) movaps %xmm0, -16 * SIZE(B0) movaps %xmm1, -14 * SIZE(B0) movaps %xmm2, -12 * SIZE(B0) movaps %xmm3, -10 * SIZE(B0) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 leaq (B0, M8, 4), B0 ALIGN_4 .L36: testq $2, M jle .L38 MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A1(0 * SIZE, AO2, %xmm1) movaps %xmm0, -16 * SIZE(B2) movaps %xmm1, -14 * SIZE(B2) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-4 * SIZE, B2 ALIGN_4 .L38: testq $1, M jle .L40 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO2), %xmm1 unpcklpd %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(B3) subq $-2 * SIZE, B3 ALIGN_4 .L40: cmpq $1, N jl .L999 movq A, AO1 movq B, B0 movq M, I sarq $3, I jle .L44 ALIGN_4 .L43: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) #endif MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A1(2 * SIZE, AO1, %xmm1) MOVUPS_A1(4 * SIZE, AO1, %xmm2) MOVUPS_A1(6 * 
SIZE, AO1, %xmm3) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) #endif movaps %xmm0, -16 * SIZE(B0) movaps %xmm1, -14 * SIZE(B0) movaps %xmm2, -16 * SIZE(B0, M8, 4) movaps %xmm3, -14 * SIZE(B0, M8, 4) addq $8 * SIZE, AO1 leaq (B0, M8, 8), B0 decq I jg .L43 ALIGN_4 .L44: testq $4, M jle .L45 MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A1(2 * SIZE, AO1, %xmm1) movaps %xmm0, -16 * SIZE(B0) movaps %xmm1, -14 * SIZE(B0) addq $4 * SIZE, AO1 leaq (B0, M8, 4), B0 ALIGN_4 .L45: testq $2, M jle .L46 MOVUPS_A1(0 * SIZE, AO1, %xmm0) movaps %xmm0, -16 * SIZE(B2) addq $2 * SIZE, AO1 subq $-2 * SIZE, B2 ALIGN_4 .L46: testq $1, M jle .L999 movsd 0 * SIZE(AO1), %xmm0 movlpd %xmm0, -16 * SIZE(B3) jmp .L999 ALIGN_4 .L999: popq %rbp popq %r12 popq %r13 popq %r14 popq %r15 #ifdef WINDOWS_ABI popq %rsi popq %rdi #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/dgemm_tcopy_8.S000066400000000000000000000400651313527062700202510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef NEHALEM #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define MOVUPS_A movups #endif #ifdef SANDYBRIDGE #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define MOVUPS_A movups #endif #ifdef MOVUPS_A #define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS #else #define MOVUPS_A1(OFF, ADDR, REGS) movsd OFF(ADDR), REGS; movhps OFF + 8(ADDR), REGS #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) movsd OFF(ADDR, BASE, SCALE), REGS; movhps OFF + 8(ADDR, BASE, SCALE), REGS #endif #ifndef WINDOWS_ABI #define N ARG1 /* rsi */ #define M ARG2 /* rdi */ #define A ARG3 /* rdx */ #define LDA ARG4 /* rcx */ #define B ARG5 /* r8 */ #define AO1 %r9 #define AO2 %r10 #define LDA3 %r11 #define M8 %r12 #else #define N ARG1 /* rdx */ #define M ARG2 /* rcx */ #define A ARG3 /* r8 */ #define LDA ARG4 /* r9 */ #define OLD_B 40 + 56(%rsp) #define B %r12 #define AO1 %rsi #define AO2 %rdi #define LDA3 %r10 #define M8 %r11 #endif #define I %rax #define B0 %rbp #define B1 %r13 #define B2 %r14 #define B3 %r15 PROLOGUE PROFCODE #ifdef WINDOWS_ABI pushq %rdi pushq %rsi #endif pushq %r15 pushq %r14 pushq %r13 pushq %r12 pushq %rbp #ifdef WINDOWS_ABI movq OLD_B, B #endif subq $-16 * SIZE, B movq M, B1 movq M, B2 movq M, B3 andq $-8, B1 andq $-4, B2 andq $-2, B3 imulq N, B1 imulq N, B2 imulq N, B3 leaq (B, B1, SIZE), B1 leaq (B, B2, SIZE), B2 leaq (B, B3, SIZE), B3 leaq (,LDA, SIZE), LDA leaq (LDA, LDA, 2), LDA3 leaq (, N, SIZE), M8 cmpq $8, N jl .L20 ALIGN_4 .L11: subq $8, N movq A, AO1 leaq (A, LDA, 4), AO2 leaq (A, LDA, 8), A movq B, B0 addq $64 * SIZE, B movq M, I sarq $3, I jle .L14 ALIGN_4 .L13: #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1) #endif MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A1(2 * SIZE, AO1, %xmm1) MOVUPS_A1(4 * SIZE, AO1, %xmm2) MOVUPS_A1(6 * SIZE, AO1, %xmm3) #ifdef PREFETCHW PREFETCHW 48 * SIZE(B0) #endif movaps %xmm0, -16 * SIZE(B0) movaps %xmm1, -14 * SIZE(B0) movaps %xmm2, -12 * SIZE(B0) movaps %xmm3, -10 * SIZE(B0) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) #endif MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0) MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1) MOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2) MOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3) #ifdef PREFETCHW PREFETCHW 56 * SIZE(B0) #endif movaps %xmm0, -8 * SIZE(B0) movaps %xmm1, -6 * SIZE(B0) movaps %xmm2, -4 * SIZE(B0) movaps %xmm3, -2 * SIZE(B0) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1, LDA, 2) #endif MOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0) MOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1) MOVUPS_A2(4 * SIZE, AO1, LDA, 2, %xmm2) MOVUPS_A2(6 * SIZE, AO1, LDA, 2, %xmm3) #ifdef PREFETCHW PREFETCHW 64 * SIZE(B0) #endif movaps %xmm0, 0 * SIZE(B0) movaps %xmm1, 2 * SIZE(B0) movaps %xmm2, 4 * SIZE(B0) movaps %xmm3, 6 * SIZE(B0) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1, LDA3) #endif MOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm0) MOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm1) MOVUPS_A2(4 * SIZE, AO1, LDA3, 1, %xmm2) MOVUPS_A2(6 * SIZE, AO1, LDA3, 1, %xmm3) #ifdef PREFETCHW PREFETCHW 72 * SIZE(B0) #endif movaps %xmm0, 8 * SIZE(B0) movaps %xmm1, 10 * SIZE(B0) movaps %xmm2, 12 * SIZE(B0) movaps %xmm3, 14 * SIZE(B0) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO2) #endif MOVUPS_A1(0 * SIZE, AO2, %xmm0) MOVUPS_A1(2 * SIZE, AO2, %xmm1) MOVUPS_A1(4 * SIZE, AO2, %xmm2) MOVUPS_A1(6 * SIZE, AO2, %xmm3) 
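/* Second half of the 8-wide panel in .L13: columns AO2, AO2 + LDA, AO2 + 2*LDA and AO2 + LDA3 (columns 5-8 of the panel) are packed into the upper half of the current tile, at offsets 16 through 46 relative to B0. */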
#ifdef PREFETCHW PREFETCHW 80 * SIZE(B0) #endif movaps %xmm0, 16 * SIZE(B0) movaps %xmm1, 18 * SIZE(B0) movaps %xmm2, 20 * SIZE(B0) movaps %xmm3, 22 * SIZE(B0) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO2, LDA) #endif MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0) MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1) MOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2) MOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3) #ifdef PREFETCHW PREFETCHW 88 * SIZE(B0) #endif movaps %xmm0, 24 * SIZE(B0) movaps %xmm1, 26 * SIZE(B0) movaps %xmm2, 28 * SIZE(B0) movaps %xmm3, 30 * SIZE(B0) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO2, LDA, 2) #endif MOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0) MOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1) MOVUPS_A2(4 * SIZE, AO2, LDA, 2, %xmm2) MOVUPS_A2(6 * SIZE, AO2, LDA, 2, %xmm3) #ifdef PREFETCHW PREFETCHW 96 * SIZE(B0) #endif movaps %xmm0, 32 * SIZE(B0) movaps %xmm1, 34 * SIZE(B0) movaps %xmm2, 36 * SIZE(B0) movaps %xmm3, 38 * SIZE(B0) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO2, LDA3) #endif MOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm0) MOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm1) MOVUPS_A2(4 * SIZE, AO2, LDA3, 1, %xmm2) MOVUPS_A2(6 * SIZE, AO2, LDA3, 1, %xmm3) #ifdef PREFETCHW PREFETCHW 104 * SIZE(B0) #endif movaps %xmm0, 40 * SIZE(B0) movaps %xmm1, 42 * SIZE(B0) movaps %xmm2, 44 * SIZE(B0) movaps %xmm3, 46 * SIZE(B0) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 leaq (B0, M8, 8), B0 decq I jg .L13 ALIGN_4 .L14: testq $4, M jle .L16 MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A1(2 * SIZE, AO1, %xmm1) MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2) MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3) movaps %xmm0, -16 * SIZE(B1) movaps %xmm1, -14 * SIZE(B1) movaps %xmm2, -12 * SIZE(B1) movaps %xmm3, -10 * SIZE(B1) MOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0) MOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1) MOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm2) MOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm3) movaps %xmm0, -8 * SIZE(B1) movaps %xmm1, -6 * SIZE(B1) movaps %xmm2, -4 * SIZE(B1) movaps %xmm3, -2 * SIZE(B1) MOVUPS_A1(0 * SIZE, AO2, %xmm0) MOVUPS_A1(2 * SIZE, AO2, %xmm1) MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2) MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3) movaps %xmm0, 0 * SIZE(B1) movaps %xmm1, 2 * SIZE(B1) movaps %xmm2, 4 * SIZE(B1) movaps %xmm3, 6 * SIZE(B1) MOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0) MOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1) MOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm2) MOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm3) movaps %xmm0, 8 * SIZE(B1) movaps %xmm1, 10 * SIZE(B1) movaps %xmm2, 12 * SIZE(B1) movaps %xmm3, 14 * SIZE(B1) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-32 * SIZE, B1 ALIGN_4 .L16: testq $2, M jle .L18 MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1) MOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm2) MOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm3) movaps %xmm0, -16 * SIZE(B2) movaps %xmm1, -14 * SIZE(B2) movaps %xmm2, -12 * SIZE(B2) movaps %xmm3, -10 * SIZE(B2) MOVUPS_A1(0 * SIZE, AO2, %xmm0) MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm1) MOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm2) MOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm3) movaps %xmm0, -8 * SIZE(B2) movaps %xmm1, -6 * SIZE(B2) movaps %xmm2, -4 * SIZE(B2) movaps %xmm3, -2 * SIZE(B2) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-16 * SIZE, B2 ALIGN_4 .L18: testq $1, M jle .L19 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO1, LDA), %xmm1 movsd 0 * SIZE(AO1, LDA, 2), %xmm2 movsd 0 * SIZE(AO1, LDA3), %xmm3 unpcklpd %xmm1, %xmm0 unpcklpd %xmm3, %xmm2 movaps %xmm0, -16 * SIZE(B3) movaps %xmm2, -14 * SIZE(B3) movsd 0 * SIZE(AO2), %xmm0 movsd 0 * SIZE(AO2, LDA), %xmm1 movsd 0 * SIZE(AO2, LDA, 2), %xmm2 movsd 0 * SIZE(AO2, 
LDA3), %xmm3 unpcklpd %xmm1, %xmm0 unpcklpd %xmm3, %xmm2 movaps %xmm0, -12 * SIZE(B3) movaps %xmm2, -10 * SIZE(B3) subq $-8 * SIZE, B3 ALIGN_4 .L19: cmpq $8, N jge .L11 ALIGN_4 .L20: cmpq $4, N jl .L30 subq $4, N movq A, AO1 leaq (A, LDA, 2), AO2 leaq (A, LDA, 4), A movq B, B0 addq $32 * SIZE, B movq M, I sarq $3, I jle .L24 ALIGN_4 .L23: #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1) #endif MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A1(2 * SIZE, AO1, %xmm1) MOVUPS_A1(4 * SIZE, AO1, %xmm2) MOVUPS_A1(6 * SIZE, AO1, %xmm3) #ifdef PREFETCHW PREFETCHW 16 * SIZE(B0) #endif movaps %xmm0, -16 * SIZE(B0) movaps %xmm1, -14 * SIZE(B0) movaps %xmm2, -12 * SIZE(B0) movaps %xmm3, -10 * SIZE(B0) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) #endif MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0) MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1) MOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2) MOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3) #ifdef PREFETCHW PREFETCHW 24 * SIZE(B0) #endif movaps %xmm0, -8 * SIZE(B0) movaps %xmm1, -6 * SIZE(B0) movaps %xmm2, -4 * SIZE(B0) movaps %xmm3, -2 * SIZE(B0) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1, LDA, 2) #endif MOVUPS_A1(0 * SIZE, AO2, %xmm0) MOVUPS_A1(2 * SIZE, AO2, %xmm1) MOVUPS_A1(4 * SIZE, AO2, %xmm2) MOVUPS_A1(6 * SIZE, AO2, %xmm3) #ifdef PREFETCHW PREFETCHW 32 * SIZE(B0) #endif movaps %xmm0, 0 * SIZE(B0) movaps %xmm1, 2 * SIZE(B0) movaps %xmm2, 4 * SIZE(B0) movaps %xmm3, 6 * SIZE(B0) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1, LDA3) #endif MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0) MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1) MOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2) MOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3) #ifdef PREFETCHW PREFETCHW 40 * SIZE(B0) #endif movaps %xmm0, 8 * SIZE(B0) movaps %xmm1, 10 * SIZE(B0) movaps %xmm2, 12 * SIZE(B0) movaps %xmm3, 14 * SIZE(B0) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 leaq (B0, M8, 8), B0 decq I jg .L23 ALIGN_4 .L24: testq $4, M jle .L26 MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A1(2 * SIZE, AO1, %xmm1) MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2) MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3) movaps %xmm0, -16 * SIZE(B1) movaps %xmm1, -14 * SIZE(B1) movaps %xmm2, -12 * SIZE(B1) movaps %xmm3, -10 * SIZE(B1) MOVUPS_A1(0 * SIZE, AO2, %xmm0) MOVUPS_A1(2 * SIZE, AO2, %xmm1) MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2) MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3) movaps %xmm0, -8 * SIZE(B1) movaps %xmm1, -6 * SIZE(B1) movaps %xmm2, -4 * SIZE(B1) movaps %xmm3, -2 * SIZE(B1) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-16 * SIZE, B1 ALIGN_4 .L26: testq $2, M jle .L28 MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1) MOVUPS_A1(0 * SIZE, AO2, %xmm2) MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm3) movaps %xmm0, -16 * SIZE(B2) movaps %xmm1, -14 * SIZE(B2) movaps %xmm2, -12 * SIZE(B2) movaps %xmm3, -10 * SIZE(B2) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-8 * SIZE, B2 ALIGN_4 .L28: testq $1, M jle .L30 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO1, LDA), %xmm1 movsd 0 * SIZE(AO2), %xmm2 movsd 0 * SIZE(AO2, LDA), %xmm3 unpcklpd %xmm1, %xmm0 unpcklpd %xmm3, %xmm2 movaps %xmm0, -16 * SIZE(B3) movaps %xmm2, -14 * SIZE(B3) subq $-4 * SIZE, B3 ALIGN_4 .L30: cmpq $2, N jl .L40 subq $2, N movq A, AO1 leaq (A, LDA), AO2 leaq (A, LDA, 2), A movq B, B0 addq $16 * SIZE, B movq M, I sarq $3, I jle .L34 ALIGN_4 .L33: #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO1) #endif MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A1(2 * SIZE, AO1, %xmm1) MOVUPS_A1(4 * SIZE, AO1, %xmm2) MOVUPS_A1(6 * SIZE, AO1, %xmm3) #ifdef PREFETCHW PREFETCHW 0 * SIZE(B0) #endif movaps %xmm0, -16 * SIZE(B0) 
movaps %xmm1, -14 * SIZE(B0) movaps %xmm2, -12 * SIZE(B0) movaps %xmm3, -10 * SIZE(B0) #ifdef PREFETCH PREFETCH PREFETCHSIZE * SIZE(AO2) #endif MOVUPS_A1(0 * SIZE, AO2, %xmm0) MOVUPS_A1(2 * SIZE, AO2, %xmm1) MOVUPS_A1(4 * SIZE, AO2, %xmm2) MOVUPS_A1(6 * SIZE, AO2, %xmm3) #ifdef PREFETCHW PREFETCHW 8 * SIZE(B0) #endif movaps %xmm0, -8 * SIZE(B0) movaps %xmm1, -6 * SIZE(B0) movaps %xmm2, -4 * SIZE(B0) movaps %xmm3, -2 * SIZE(B0) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 leaq (B0, M8, 8), B0 decq I jg .L33 ALIGN_4 .L34: testq $4, M jle .L36 MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A1(2 * SIZE, AO1, %xmm1) MOVUPS_A1(0 * SIZE, AO2, %xmm2) MOVUPS_A1(2 * SIZE, AO2, %xmm3) movaps %xmm0, -16 * SIZE(B1) movaps %xmm1, -14 * SIZE(B1) movaps %xmm2, -12 * SIZE(B1) movaps %xmm3, -10 * SIZE(B1) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-8 * SIZE, B1 ALIGN_4 .L36: testq $2, M jle .L38 MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A1(0 * SIZE, AO2, %xmm1) movaps %xmm0, -16 * SIZE(B2) movaps %xmm1, -14 * SIZE(B2) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-4 * SIZE, B2 ALIGN_4 .L38: testq $1, M jle .L40 movsd 0 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO2), %xmm1 unpcklpd %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(B3) subq $-2 * SIZE, B3 ALIGN_4 .L40: cmpq $1, N jl .L999 movq A, AO1 movq B, B0 movq M, I sarq $3, I jle .L44 ALIGN_4 .L43: #ifdef PREFETCH PREFETCH PREFETCHSIZE * 8 * SIZE(AO1) #endif MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A1(2 * SIZE, AO1, %xmm1) MOVUPS_A1(4 * SIZE, AO1, %xmm2) MOVUPS_A1(6 * SIZE, AO1, %xmm3) #ifdef PREFETCHW PREFETCHW -8 * SIZE(B0) #endif movaps %xmm0, -16 * SIZE(B0) movaps %xmm1, -14 * SIZE(B0) movaps %xmm2, -12 * SIZE(B0) movaps %xmm3, -10 * SIZE(B0) addq $8 * SIZE, AO1 leaq (B0, M8, 8), B0 decq I jg .L43 ALIGN_4 .L44: testq $4, M jle .L45 MOVUPS_A1(0 * SIZE, AO1, %xmm0) MOVUPS_A1(2 * SIZE, AO1, %xmm1) movaps %xmm0, -16 * SIZE(B1) movaps %xmm1, -14 * SIZE(B1) addq $4 * SIZE, AO1 subq $-4 * SIZE, B1 ALIGN_4 .L45: testq $2, M jle .L46 MOVUPS_A1(0 * SIZE, AO1, %xmm0) movaps %xmm0, -16 * SIZE(B2) addq $2 * SIZE, AO1 subq $-2 * SIZE, B2 ALIGN_4 .L46: testq $1, M jle .L999 movsd 0 * SIZE(AO1), %xmm0 movlpd %xmm0, -16 * SIZE(B3) jmp .L999 ALIGN_4 .L999: popq %rbp popq %r12 popq %r13 popq %r14 popq %r15 #ifdef WINDOWS_ABI popq %rsi popq %rdi #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/dgemm_tcopy_8_bulldozer.S000066400000000000000000000347351313527062700223420ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define VMOVUPS_A1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS #define VMOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) vmovups OFF(ADDR, BASE, SCALE), REGS #define A_PRE 256 #ifndef WINDOWS_ABI #define N ARG1 /* rsi */ #define M ARG2 /* rdi */ #define A ARG3 /* rdx */ #define LDA ARG4 /* rcx */ #define B ARG5 /* r8 */ #define AO1 %r9 #define AO2 %r10 #define LDA3 %r11 #define M8 %r12 #else #define N ARG1 /* rdx */ #define M ARG2 /* rcx */ #define A ARG3 /* r8 */ #define LDA ARG4 /* r9 */ #define OLD_B 40 + 56(%rsp) #define B %r12 #define AO1 %rsi #define AO2 %rdi #define LDA3 %r10 #define M8 %r11 #endif #define I %rax #define B0 %rbp #define B1 %r13 #define B2 %r14 #define B3 %r15 PROLOGUE PROFCODE #ifdef WINDOWS_ABI pushq %rdi pushq %rsi #endif pushq %r15 pushq %r14 pushq %r13 pushq %r12 pushq %rbp #ifdef WINDOWS_ABI movq OLD_B, B #endif subq $-16 * SIZE, B movq M, B1 movq M, B2 movq M, B3 andq $-8, B1 andq $-4, B2 andq $-2, B3 imulq N, B1 imulq N, B2 imulq N, B3 leaq (B, B1, SIZE), B1 leaq (B, B2, SIZE), B2 leaq (B, B3, SIZE), B3 leaq (,LDA, SIZE), LDA leaq (LDA, LDA, 2), LDA3 leaq (, N, SIZE), M8 cmpq $8, N jl .L20 ALIGN_4 .L11: subq $8, N movq A, AO1 leaq (A, LDA, 4), AO2 leaq (A, LDA, 8), A movq B, B0 addq $64 * SIZE, B movq M, I sarq $3, I jle .L14 ALIGN_4 .L13: prefetchnta A_PRE(AO1) VMOVUPS_A1(0 * SIZE, AO1, %xmm0) VMOVUPS_A1(2 * SIZE, AO1, %xmm1) VMOVUPS_A1(4 * SIZE, AO1, %xmm2) VMOVUPS_A1(6 * SIZE, AO1, %xmm3) vmovups %xmm0, -16 * SIZE(B0) vmovups %xmm1, -14 * SIZE(B0) vmovups %xmm2, -12 * SIZE(B0) vmovups %xmm3, -10 * SIZE(B0) prefetchnta A_PRE(AO1, LDA, 1) VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0) VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1) VMOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2) VMOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3) vmovups %xmm0, -8 * SIZE(B0) vmovups %xmm1, -6 * SIZE(B0) vmovups %xmm2, -4 * SIZE(B0) vmovups %xmm3, -2 * SIZE(B0) prefetchnta A_PRE(AO1, LDA, 2) VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0) VMOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1) VMOVUPS_A2(4 * SIZE, AO1, LDA, 2, %xmm2) VMOVUPS_A2(6 * SIZE, AO1, LDA, 2, %xmm3) vmovups %xmm0, 0 * SIZE(B0) vmovups %xmm1, 2 * SIZE(B0) vmovups %xmm2, 4 * SIZE(B0) vmovups %xmm3, 6 * SIZE(B0) prefetchnta A_PRE(AO1, LDA3, 1) VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm0) VMOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm1) VMOVUPS_A2(4 * SIZE, AO1, LDA3, 1, %xmm2) VMOVUPS_A2(6 * SIZE, AO1, LDA3, 1, %xmm3) vmovups %xmm0, 8 * SIZE(B0) vmovups %xmm1, 10 * SIZE(B0) vmovups %xmm2, 12 * SIZE(B0) vmovups %xmm3, 14 * SIZE(B0) prefetchnta A_PRE(AO2) VMOVUPS_A1(0 * SIZE, AO2, %xmm0) VMOVUPS_A1(2 * SIZE, AO2, %xmm1) VMOVUPS_A1(4 * SIZE, AO2, 
%xmm2) VMOVUPS_A1(6 * SIZE, AO2, %xmm3) vmovups %xmm0, 16 * SIZE(B0) vmovups %xmm1, 18 * SIZE(B0) vmovups %xmm2, 20 * SIZE(B0) vmovups %xmm3, 22 * SIZE(B0) prefetchnta A_PRE(AO2, LDA, 1) VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0) VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1) VMOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2) VMOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3) vmovups %xmm0, 24 * SIZE(B0) vmovups %xmm1, 26 * SIZE(B0) vmovups %xmm2, 28 * SIZE(B0) vmovups %xmm3, 30 * SIZE(B0) prefetchnta A_PRE(AO2, LDA, 2) VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0) VMOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1) VMOVUPS_A2(4 * SIZE, AO2, LDA, 2, %xmm2) VMOVUPS_A2(6 * SIZE, AO2, LDA, 2, %xmm3) vmovups %xmm0, 32 * SIZE(B0) vmovups %xmm1, 34 * SIZE(B0) vmovups %xmm2, 36 * SIZE(B0) vmovups %xmm3, 38 * SIZE(B0) prefetchnta A_PRE(AO2, LDA3, 1) VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm0) VMOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm1) VMOVUPS_A2(4 * SIZE, AO2, LDA3, 1, %xmm2) VMOVUPS_A2(6 * SIZE, AO2, LDA3, 1, %xmm3) vmovups %xmm0, 40 * SIZE(B0) vmovups %xmm1, 42 * SIZE(B0) vmovups %xmm2, 44 * SIZE(B0) vmovups %xmm3, 46 * SIZE(B0) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 leaq (B0, M8, 8), B0 decq I jg .L13 ALIGN_4 .L14: testq $4, M jle .L16 VMOVUPS_A1(0 * SIZE, AO1, %xmm0) VMOVUPS_A1(2 * SIZE, AO1, %xmm1) VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2) VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3) vmovups %xmm0, -16 * SIZE(B1) vmovups %xmm1, -14 * SIZE(B1) vmovups %xmm2, -12 * SIZE(B1) vmovups %xmm3, -10 * SIZE(B1) VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0) VMOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1) VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm2) VMOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm3) vmovups %xmm0, -8 * SIZE(B1) vmovups %xmm1, -6 * SIZE(B1) vmovups %xmm2, -4 * SIZE(B1) vmovups %xmm3, -2 * SIZE(B1) VMOVUPS_A1(0 * SIZE, AO2, %xmm0) VMOVUPS_A1(2 * SIZE, AO2, %xmm1) VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2) VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3) vmovups %xmm0, 0 * SIZE(B1) vmovups %xmm1, 2 * SIZE(B1) vmovups %xmm2, 4 * SIZE(B1) vmovups %xmm3, 6 * SIZE(B1) VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0) VMOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1) VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm2) VMOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm3) vmovups %xmm0, 8 * SIZE(B1) vmovups %xmm1, 10 * SIZE(B1) vmovups %xmm2, 12 * SIZE(B1) vmovups %xmm3, 14 * SIZE(B1) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-32 * SIZE, B1 ALIGN_4 .L16: testq $2, M jle .L18 VMOVUPS_A1(0 * SIZE, AO1, %xmm0) VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1) VMOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm2) VMOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm3) vmovups %xmm0, -16 * SIZE(B2) vmovups %xmm1, -14 * SIZE(B2) vmovups %xmm2, -12 * SIZE(B2) vmovups %xmm3, -10 * SIZE(B2) VMOVUPS_A1(0 * SIZE, AO2, %xmm0) VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm1) VMOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm2) VMOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm3) vmovups %xmm0, -8 * SIZE(B2) vmovups %xmm1, -6 * SIZE(B2) vmovups %xmm2, -4 * SIZE(B2) vmovups %xmm3, -2 * SIZE(B2) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-16 * SIZE, B2 ALIGN_4 .L18: testq $1, M jle .L19 vmovsd 0 * SIZE(AO1), %xmm0 vmovsd 0 * SIZE(AO1, LDA), %xmm1 vmovsd 0 * SIZE(AO1, LDA, 2), %xmm2 vmovsd 0 * SIZE(AO1, LDA3), %xmm3 vunpcklpd %xmm1, %xmm0 , %xmm0 vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups %xmm0, -16 * SIZE(B3) vmovups %xmm2, -14 * SIZE(B3) vmovsd 0 * SIZE(AO2), %xmm0 vmovsd 0 * SIZE(AO2, LDA), %xmm1 vmovsd 0 * SIZE(AO2, LDA, 2), %xmm2 vmovsd 0 * SIZE(AO2, LDA3), %xmm3 vunpcklpd %xmm1, %xmm0 , %xmm0 vunpcklpd %xmm3, %xmm2 , %xmm2 vmovups %xmm0, -12 * SIZE(B3) vmovups %xmm2, -10 * SIZE(B3) subq $-8 * 
SIZE, B3 ALIGN_4 .L19: cmpq $8, N jge .L11 ALIGN_4 .L20: cmpq $4, N jl .L30 subq $4, N movq A, AO1 leaq (A, LDA, 2), AO2 leaq (A, LDA, 4), A movq B, B0 addq $32 * SIZE, B movq M, I sarq $3, I jle .L24 ALIGN_4 .L23: VMOVUPS_A1(0 * SIZE, AO1, %xmm0) VMOVUPS_A1(2 * SIZE, AO1, %xmm1) VMOVUPS_A1(4 * SIZE, AO1, %xmm2) VMOVUPS_A1(6 * SIZE, AO1, %xmm3) vmovups %xmm0, -16 * SIZE(B0) vmovups %xmm1, -14 * SIZE(B0) vmovups %xmm2, -12 * SIZE(B0) vmovups %xmm3, -10 * SIZE(B0) VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0) VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1) VMOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2) VMOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3) vmovups %xmm0, -8 * SIZE(B0) vmovups %xmm1, -6 * SIZE(B0) vmovups %xmm2, -4 * SIZE(B0) vmovups %xmm3, -2 * SIZE(B0) VMOVUPS_A1(0 * SIZE, AO2, %xmm0) VMOVUPS_A1(2 * SIZE, AO2, %xmm1) VMOVUPS_A1(4 * SIZE, AO2, %xmm2) VMOVUPS_A1(6 * SIZE, AO2, %xmm3) vmovups %xmm0, 0 * SIZE(B0) vmovups %xmm1, 2 * SIZE(B0) vmovups %xmm2, 4 * SIZE(B0) vmovups %xmm3, 6 * SIZE(B0) VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0) VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1) VMOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2) VMOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3) vmovups %xmm0, 8 * SIZE(B0) vmovups %xmm1, 10 * SIZE(B0) vmovups %xmm2, 12 * SIZE(B0) vmovups %xmm3, 14 * SIZE(B0) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 leaq (B0, M8, 8), B0 decq I jg .L23 ALIGN_4 .L24: testq $4, M jle .L26 VMOVUPS_A1(0 * SIZE, AO1, %xmm0) VMOVUPS_A1(2 * SIZE, AO1, %xmm1) VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2) VMOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3) vmovups %xmm0, -16 * SIZE(B1) vmovups %xmm1, -14 * SIZE(B1) vmovups %xmm2, -12 * SIZE(B1) vmovups %xmm3, -10 * SIZE(B1) VMOVUPS_A1(0 * SIZE, AO2, %xmm0) VMOVUPS_A1(2 * SIZE, AO2, %xmm1) VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2) VMOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3) vmovups %xmm0, -8 * SIZE(B1) vmovups %xmm1, -6 * SIZE(B1) vmovups %xmm2, -4 * SIZE(B1) vmovups %xmm3, -2 * SIZE(B1) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-16 * SIZE, B1 ALIGN_4 .L26: testq $2, M jle .L28 VMOVUPS_A1(0 * SIZE, AO1, %xmm0) VMOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1) VMOVUPS_A1(0 * SIZE, AO2, %xmm2) VMOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm3) vmovups %xmm0, -16 * SIZE(B2) vmovups %xmm1, -14 * SIZE(B2) vmovups %xmm2, -12 * SIZE(B2) vmovups %xmm3, -10 * SIZE(B2) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-8 * SIZE, B2 ALIGN_4 .L28: testq $1, M jle .L30 vmovsd 0 * SIZE(AO1), %xmm0 vmovsd 0 * SIZE(AO1, LDA), %xmm1 vmovsd 0 * SIZE(AO2), %xmm2 vmovsd 0 * SIZE(AO2, LDA), %xmm3 vunpcklpd %xmm1, %xmm0, %xmm0 vunpcklpd %xmm3, %xmm2, %xmm2 vmovups %xmm0, -16 * SIZE(B3) vmovups %xmm2, -14 * SIZE(B3) subq $-4 * SIZE, B3 ALIGN_4 .L30: cmpq $2, N jl .L40 subq $2, N movq A, AO1 leaq (A, LDA), AO2 leaq (A, LDA, 2), A movq B, B0 addq $16 * SIZE, B movq M, I sarq $3, I jle .L34 ALIGN_4 .L33: VMOVUPS_A1(0 * SIZE, AO1, %xmm0) VMOVUPS_A1(2 * SIZE, AO1, %xmm1) VMOVUPS_A1(4 * SIZE, AO1, %xmm2) VMOVUPS_A1(6 * SIZE, AO1, %xmm3) vmovups %xmm0, -16 * SIZE(B0) vmovups %xmm1, -14 * SIZE(B0) vmovups %xmm2, -12 * SIZE(B0) vmovups %xmm3, -10 * SIZE(B0) VMOVUPS_A1(0 * SIZE, AO2, %xmm0) VMOVUPS_A1(2 * SIZE, AO2, %xmm1) VMOVUPS_A1(4 * SIZE, AO2, %xmm2) VMOVUPS_A1(6 * SIZE, AO2, %xmm3) vmovups %xmm0, -8 * SIZE(B0) vmovups %xmm1, -6 * SIZE(B0) vmovups %xmm2, -4 * SIZE(B0) vmovups %xmm3, -2 * SIZE(B0) addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 leaq (B0, M8, 8), B0 decq I jg .L33 ALIGN_4 .L34: testq $4, M jle .L36 VMOVUPS_A1(0 * SIZE, AO1, %xmm0) VMOVUPS_A1(2 * SIZE, AO1, %xmm1) VMOVUPS_A1(0 * SIZE, AO2, %xmm2) VMOVUPS_A1(2 * SIZE, AO2, %xmm3) vmovups 
%xmm0, -16 * SIZE(B1) vmovups %xmm1, -14 * SIZE(B1) vmovups %xmm2, -12 * SIZE(B1) vmovups %xmm3, -10 * SIZE(B1) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-8 * SIZE, B1 ALIGN_4 .L36: testq $2, M jle .L38 VMOVUPS_A1(0 * SIZE, AO1, %xmm0) VMOVUPS_A1(0 * SIZE, AO2, %xmm1) vmovups %xmm0, -16 * SIZE(B2) vmovups %xmm1, -14 * SIZE(B2) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 subq $-4 * SIZE, B2 ALIGN_4 .L38: testq $1, M jle .L40 vmovsd 0 * SIZE(AO1), %xmm0 vmovsd 0 * SIZE(AO2), %xmm1 vunpcklpd %xmm1, %xmm0, %xmm0 vmovups %xmm0, -16 * SIZE(B3) subq $-2 * SIZE, B3 ALIGN_4 .L40: cmpq $1, N jl .L999 movq A, AO1 movq B, B0 movq M, I sarq $3, I jle .L44 ALIGN_4 .L43: VMOVUPS_A1(0 * SIZE, AO1, %xmm0) VMOVUPS_A1(2 * SIZE, AO1, %xmm1) VMOVUPS_A1(4 * SIZE, AO1, %xmm2) VMOVUPS_A1(6 * SIZE, AO1, %xmm3) vmovups %xmm0, -16 * SIZE(B0) vmovups %xmm1, -14 * SIZE(B0) vmovups %xmm2, -12 * SIZE(B0) vmovups %xmm3, -10 * SIZE(B0) addq $8 * SIZE, AO1 leaq (B0, M8, 8), B0 decq I jg .L43 ALIGN_4 .L44: testq $4, M jle .L45 VMOVUPS_A1(0 * SIZE, AO1, %xmm0) VMOVUPS_A1(2 * SIZE, AO1, %xmm1) vmovups %xmm0, -16 * SIZE(B1) vmovups %xmm1, -14 * SIZE(B1) addq $4 * SIZE, AO1 subq $-4 * SIZE, B1 ALIGN_4 .L45: testq $2, M jle .L46 VMOVUPS_A1(0 * SIZE, AO1, %xmm0) vmovups %xmm0, -16 * SIZE(B2) addq $2 * SIZE, AO1 subq $-2 * SIZE, B2 ALIGN_4 .L46: testq $1, M jle .L999 vmovsd 0 * SIZE(AO1), %xmm0 vmovsd %xmm0, -16 * SIZE(B3) jmp .L999 ALIGN_4 .L999: popq %rbp popq %r12 popq %r13 popq %r14 popq %r15 #ifdef WINDOWS_ABI popq %rsi popq %rdi #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/dgemv_n.S000066400000000000000000001517041313527062700171350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "l2param.h" #if GEMV_UNROLL < 2 #undef GEMV_UNROLL #define GEMV_UNROLL 2 #endif #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_M %rdi #define OLD_N %rsi #define OLD_A %rcx #define OLD_LDA %r8 #define STACK_INCX 8 + STACKSIZE(%rsp) #define STACK_Y 16 + STACKSIZE(%rsp) #define STACK_INCY 24 + STACKSIZE(%rsp) #define STACK_BUFFER 32 + STACKSIZE(%rsp) #define ALPHA 48 (%rsp) #define MMM 56(%rsp) #define NN 64(%rsp) #define AA 72(%rsp) #define LDAX 80(%rsp) #define XX 88(%rsp) #else #define STACKSIZE 288 #define OLD_M %rcx #define OLD_N %rdx #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_LDA 48 + STACKSIZE(%rsp) #define OLD_X 56 + STACKSIZE(%rsp) #define STACK_INCX 64 + STACKSIZE(%rsp) #define STACK_Y 72 + STACKSIZE(%rsp) #define STACK_INCY 80 + STACKSIZE(%rsp) #define STACK_BUFFER 88 + STACKSIZE(%rsp) #define ALPHA 224 (%rsp) #define MMM 232(%rsp) #define NN 240(%rsp) #define AA 248(%rsp) #define LDAX 256(%rsp) #define XX 264(%rsp) #endif #define LDA %r8 #define X %r9 #define INCX %rsi #define INCY %rdi #define M %r10 #define N %r11 #define A %r12 #define Y %r14 #define BUFFER %r13 #define I %rax #define A1 %rbx #define A2 %rcx #define LDA3 %rdx #define Y1 %rbp #ifdef ALIGNED_ACCESS #define MM %r15 #else #define MM M #endif #define TMP_M %r15 #define Y2 %rbx PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq OLD_M, M movq OLD_N, N movq OLD_A, A movq OLD_LDA, LDA movq OLD_X, X #else movq OLD_M, M movq OLD_N, N movq OLD_A, A movq OLD_LDA, LDA #endif #ifndef WINDOWS_ABI movsd %xmm0, ALPHA #else movsd %xmm3, ALPHA #endif movq STACK_Y, Y movq A,AA movq N,NN movq M,MMM movq LDA,LDAX movq X,XX .L0t: xorq I,I addq $1,I salq $21,I subq I,MMM movq I,M jge .L00t movq MMM,M addq M, I jle .L999x movq I, M .L00t: movq XX,X movq AA,A movq NN,N movq LDAX,LDA movq STACK_INCX, INCX movq STACK_INCY, INCY movq STACK_BUFFER, BUFFER leaq -1(INCY), %rax leaq (,INCX, SIZE), INCX leaq (,INCY, SIZE), INCY leaq (,LDA, SIZE), LDA leaq (LDA, LDA, 2), LDA3 subq $-16 * SIZE, A #ifdef ALIGNED_ACCESS leaq -1 (M), MM testq $SIZE, A cmoveq M, MM #endif testq N, N # if n <= 0 goto END jle .L999 testq M, M # if n <= 0 goto END jle .L999 #if !defined(COPY_FORCE) && !defined(ALIGNED_ACCESS) #ifndef NOCOPY_UNALIGNED movq Y, Y1 andq $0xf, Y1 orq Y1, %rax #endif testq %rax, %rax cmoveq Y, BUFFER je .L10 #endif movq BUFFER, Y1 pxor %xmm4, %xmm4 movq M, %rax addq $16, %rax sarq $4, %rax ALIGN_3 .L01: movapd %xmm4, 0 * SIZE(Y1) movapd %xmm4, 2 * SIZE(Y1) movapd %xmm4, 4 * SIZE(Y1) movapd %xmm4, 6 * SIZE(Y1) movapd %xmm4, 8 * SIZE(Y1) movapd %xmm4, 10 * SIZE(Y1) movapd %xmm4, 12 * SIZE(Y1) movapd %xmm4, 14 * SIZE(Y1) subq $-16 * SIZE, Y1 decq %rax jg .L01 ALIGN_3 .L10: #ifdef ALIGNED_ACCESS leaq SIZE(BUFFER), %rax testq $SIZE, A cmovne %rax, BUFFER testq $SIZE, LDA jne .L50 #endif #if GEMV_UNROLL >= 8 cmpq $8, N jl .L20 ALIGN_3 .L11: subq $8, N leaq 16 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 4), A2 leaq (A, LDA, 8), A #ifdef HAVE_SSE3 movddup (X), %xmm8 addq INCX, X movddup (X), %xmm9 addq INCX, X 
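/* Each of the eight x values gathered here is broadcast to both lanes (movddup) and then scaled by alpha, so the column-wise mulpd/addpd in the loop below computes y += alpha * x[j] * A(:,j) two elements at a time. */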
movddup (X), %xmm10 addq INCX, X movddup (X), %xmm11 addq INCX, X movddup (X), %xmm12 addq INCX, X movddup (X), %xmm13 addq INCX, X movddup (X), %xmm14 addq INCX, X movddup (X), %xmm15 addq INCX, X movddup ALPHA, %xmm0 #else movsd (X), %xmm8 unpcklpd %xmm8, %xmm8 addq INCX, X movsd (X), %xmm9 unpcklpd %xmm9, %xmm9 addq INCX, X movsd (X), %xmm10 unpcklpd %xmm10, %xmm10 addq INCX, X movsd (X), %xmm11 unpcklpd %xmm11, %xmm11 addq INCX, X movsd (X), %xmm12 unpcklpd %xmm12, %xmm12 addq INCX, X movsd (X), %xmm13 unpcklpd %xmm13, %xmm13 addq INCX, X movsd (X), %xmm14 unpcklpd %xmm14, %xmm14 addq INCX, X movsd (X), %xmm15 unpcklpd %xmm15, %xmm15 addq INCX, X movsd ALPHA, %xmm0 unpcklpd %xmm0, %xmm0 #endif mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 mulpd %xmm0, %xmm12 mulpd %xmm0, %xmm13 mulpd %xmm0, %xmm14 mulpd %xmm0, %xmm15 #ifdef ALIGNED_ACCESS testq $SIZE, A je .L1X movsd -16 * SIZE(A1), %xmm4 movsd -16 * SIZE(A1, LDA), %xmm5 movsd -16 * SIZE(A1, LDA, 2), %xmm6 movsd -16 * SIZE(A1, LDA3), %xmm7 movsd -16 * SIZE(Y1), %xmm0 mulsd %xmm8, %xmm4 addsd %xmm4, %xmm0 movsd -16 * SIZE(A2), %xmm4 mulsd %xmm9, %xmm5 addsd %xmm5, %xmm0 movsd -16 * SIZE(A2, LDA), %xmm5 mulsd %xmm10, %xmm6 addsd %xmm6, %xmm0 movsd -16 * SIZE(A2, LDA, 2), %xmm6 mulsd %xmm11, %xmm7 addsd %xmm7, %xmm0 movsd -16 * SIZE(A2, LDA3), %xmm7 mulsd %xmm12, %xmm4 addsd %xmm4, %xmm0 mulsd %xmm13, %xmm5 addsd %xmm5, %xmm0 mulsd %xmm14, %xmm6 addsd %xmm6, %xmm0 mulsd %xmm15, %xmm7 addsd %xmm7, %xmm0 movsd %xmm0, -16 * SIZE(Y1) addq $SIZE, A1 addq $SIZE, A2 addq $SIZE, Y1 ALIGN_3 .L1X: #endif movq MM, I sarq $3, I jle .L15 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A1(-14 * SIZE, A1, %xmm5) MOVUPS_A1(-12 * SIZE, A1, %xmm6) MOVUPS_A1(-10 * SIZE, A1, %xmm7) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) decq I jle .L14 ALIGN_3 .L13: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) #endif mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) mulpd %xmm8, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) mulpd %xmm8, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6) mulpd %xmm8, %xmm7 addpd %xmm7, %xmm3 MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 1) #endif mulpd %xmm9, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4) mulpd %xmm9, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5) mulpd %xmm9, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm6) mulpd %xmm9, %xmm7 addpd %xmm7, %xmm3 MOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm7) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 2) #endif mulpd %xmm10, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm4) mulpd %xmm10, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm5) mulpd %xmm10, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm6) mulpd %xmm10, %xmm7 addpd %xmm7, %xmm3 MOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm7) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA3) #endif mulpd %xmm11, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-16 * SIZE, A2, %xmm4) mulpd %xmm11, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A1(-14 * SIZE, A2, %xmm5) mulpd %xmm11, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A1(-12 * SIZE, A2, %xmm6) mulpd %xmm11, %xmm7 addpd %xmm7, %xmm3 MOVUPS_A1(-10 * SIZE, A2, %xmm7) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) #endif mulpd 
%xmm12, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) mulpd %xmm12, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) mulpd %xmm12, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) mulpd %xmm12, %xmm7 addpd %xmm7, %xmm3 MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 1) #endif mulpd %xmm13, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4) mulpd %xmm13, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5) mulpd %xmm13, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm6) mulpd %xmm13, %xmm7 addpd %xmm7, %xmm3 MOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm7) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 2) #endif mulpd %xmm14, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm4) mulpd %xmm14, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm5) mulpd %xmm14, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm6) mulpd %xmm14, %xmm7 addpd %xmm7, %xmm3 MOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm7) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA3) #endif mulpd %xmm15, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1( -8 * SIZE, A1, %xmm4) mulpd %xmm15, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A1( -6 * SIZE, A1, %xmm5) mulpd %xmm15, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A1( -4 * SIZE, A1, %xmm6) mulpd %xmm15, %xmm7 addpd %xmm7, %xmm3 MOVUPS_A1( -2 * SIZE, A1, %xmm7) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L13 ALIGN_3 .L14: mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) mulpd %xmm8, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) mulpd %xmm8, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6) mulpd %xmm8, %xmm7 addpd %xmm7, %xmm3 MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7) mulpd %xmm9, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4) mulpd %xmm9, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5) mulpd %xmm9, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm6) mulpd %xmm9, %xmm7 addpd %xmm7, %xmm3 MOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm7) mulpd %xmm10, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm4) mulpd %xmm10, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm5) mulpd %xmm10, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm6) mulpd %xmm10, %xmm7 addpd %xmm7, %xmm3 MOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm7) mulpd %xmm11, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-16 * SIZE, A2, %xmm4) mulpd %xmm11, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A1(-14 * SIZE, A2, %xmm5) mulpd %xmm11, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A1(-12 * SIZE, A2, %xmm6) mulpd %xmm11, %xmm7 addpd %xmm7, %xmm3 MOVUPS_A1(-10 * SIZE, A2, %xmm7) mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) mulpd %xmm12, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) mulpd %xmm12, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) mulpd %xmm12, %xmm7 addpd %xmm7, %xmm3 MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) mulpd %xmm13, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A2(-16 * SIZE, A2, LDA, 2, 
%xmm4) mulpd %xmm13, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5) mulpd %xmm13, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm6) mulpd %xmm13, %xmm7 addpd %xmm7, %xmm3 MOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm7) mulpd %xmm14, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm4) mulpd %xmm14, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm5) mulpd %xmm14, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm6) mulpd %xmm14, %xmm7 addpd %xmm7, %xmm3 MOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm7) mulpd %xmm15, %xmm4 addpd %xmm4, %xmm0 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) mulpd %xmm15, %xmm5 addpd %xmm5, %xmm1 MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) mulpd %xmm15, %xmm6 addpd %xmm6, %xmm2 MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) mulpd %xmm15, %xmm7 addpd %xmm7, %xmm3 MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 ALIGN_3 .L15: testq $4, MM je .L16 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A1(-14 * SIZE, A1, %xmm5) MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm6) MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm7) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4) mulpd %xmm8, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5) mulpd %xmm9, %xmm6 addpd %xmm6, %xmm0 MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm6) mulpd %xmm9, %xmm7 addpd %xmm7, %xmm1 MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm7) mulpd %xmm10, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-16 * SIZE, A2, %xmm4) mulpd %xmm10, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A1(-14 * SIZE, A2, %xmm5) mulpd %xmm11, %xmm6 addpd %xmm6, %xmm0 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm6) mulpd %xmm11, %xmm7 addpd %xmm7, %xmm1 MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm7) mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4) mulpd %xmm12, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5) mulpd %xmm13, %xmm6 addpd %xmm6, %xmm0 MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm6) mulpd %xmm13, %xmm7 addpd %xmm7, %xmm1 MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm7) mulpd %xmm14, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm14, %xmm5 addpd %xmm5, %xmm1 mulpd %xmm15, %xmm6 addpd %xmm6, %xmm0 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) mulpd %xmm15, %xmm7 addpd %xmm7, %xmm1 MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L16: testq $2, MM je .L17 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm5) MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm6) MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm7) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-16 * SIZE, A2, %xmm4) mulpd %xmm9, %xmm5 addpd %xmm5, %xmm0 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm5) mulpd %xmm10, %xmm6 addpd %xmm6, %xmm0 MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm6) mulpd %xmm11, %xmm7 addpd %xmm7, %xmm0 MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm7) mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm13, %xmm5 addpd %xmm5, %xmm0 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm15, %xmm7 addpd %xmm7, %xmm0 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L17: testq $1, MM je .L18 movsd -16 * SIZE(A1), %xmm4 movsd -16 * SIZE(A1, LDA), %xmm5 movsd -16 * SIZE(A1, LDA, 2), %xmm6 movsd -16 * SIZE(A1, LDA3), %xmm7 movsd -16 * SIZE(Y1), %xmm0 mulsd %xmm8, %xmm4 addsd %xmm4, %xmm0 movsd -16 * SIZE(A2), %xmm4 mulsd %xmm9, %xmm5 addsd %xmm5, %xmm0 movsd -16 * SIZE(A2, LDA), %xmm5 mulsd %xmm10, %xmm6 addsd %xmm6, %xmm0 
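/* Odd-length tail (MM & 1): one y element remains; the outstanding column contributions (A1 + LDA3 and A2 through A2 + LDA3) are accumulated into xmm0 below with scalar mulsd/addsd. */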
movsd -16 * SIZE(A2, LDA, 2), %xmm6 mulsd %xmm11, %xmm7 addsd %xmm7, %xmm0 movsd -16 * SIZE(A2, LDA3), %xmm7 mulsd %xmm12, %xmm4 addsd %xmm4, %xmm0 mulsd %xmm13, %xmm5 addsd %xmm5, %xmm0 mulsd %xmm14, %xmm6 addsd %xmm6, %xmm0 mulsd %xmm15, %xmm7 addsd %xmm7, %xmm0 movsd %xmm0, -16 * SIZE(Y1) ALIGN_3 .L18: cmpq $8, N jge .L11 ALIGN_3 .L20: #endif #if GEMV_UNROLL >= 4 cmpq $4, N jl .L30 #if GEMV_UNROLL == 4 ALIGN_3 .L21: #endif subq $4, N leaq 16 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 2), A2 leaq (A, LDA, 4), A #ifdef HAVE_SSE3 movddup (X), %xmm12 addq INCX, X movddup (X), %xmm13 addq INCX, X movddup (X), %xmm14 addq INCX, X movddup (X), %xmm15 addq INCX, X movddup ALPHA, %xmm0 #else movsd (X), %xmm12 unpcklpd %xmm12, %xmm12 addq INCX, X movsd (X), %xmm13 unpcklpd %xmm13, %xmm13 addq INCX, X movsd (X), %xmm14 unpcklpd %xmm14, %xmm14 addq INCX, X movsd (X), %xmm15 unpcklpd %xmm15, %xmm15 addq INCX, X movsd ALPHA, %xmm0 unpcklpd %xmm0, %xmm0 #endif mulpd %xmm0, %xmm12 mulpd %xmm0, %xmm13 mulpd %xmm0, %xmm14 mulpd %xmm0, %xmm15 #ifdef ALIGNED_ACCESS testq $SIZE, A je .L2X movsd -16 * SIZE(A1), %xmm4 movsd -16 * SIZE(A1, LDA), %xmm5 movsd -16 * SIZE(A2), %xmm6 movsd -16 * SIZE(A2, LDA), %xmm7 movsd -16 * SIZE(Y1), %xmm0 mulsd %xmm12, %xmm4 addsd %xmm4, %xmm0 mulsd %xmm13, %xmm5 addsd %xmm5, %xmm0 mulsd %xmm14, %xmm6 addsd %xmm6, %xmm0 mulsd %xmm15, %xmm7 addsd %xmm7, %xmm0 movsd %xmm0, -16 * SIZE(Y1) addq $SIZE, A1 addq $SIZE, A2 addq $SIZE, Y1 ALIGN_3 .L2X: #endif movq MM, I sarq $3, I jle .L25 MOVUPS_A1(-16 * SIZE, A1, %xmm0) MOVUPS_A1(-14 * SIZE, A1, %xmm1) MOVUPS_A1(-12 * SIZE, A1, %xmm2) MOVUPS_A1(-10 * SIZE, A1, %xmm3) MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) MOVUPS_YL1(-12 * SIZE, Y1, %xmm10) MOVUPS_YL1(-10 * SIZE, Y1, %xmm11) MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6) MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7) decq I jle .L24 ALIGN_3 .L23: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif mulpd %xmm12, %xmm0 addpd %xmm0, %xmm8 MOVUPS_A1(-16 * SIZE, A2, %xmm0) mulpd %xmm12, %xmm1 addpd %xmm1, %xmm9 MOVUPS_A1(-14 * SIZE, A2, %xmm1) mulpd %xmm12, %xmm2 addpd %xmm2, %xmm10 MOVUPS_A1(-12 * SIZE, A2, %xmm2) mulpd %xmm12, %xmm3 addpd %xmm3, %xmm11 MOVUPS_A1(-10 * SIZE, A2, %xmm3) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) #endif mulpd %xmm13, %xmm4 addpd %xmm4, %xmm8 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) mulpd %xmm13, %xmm5 addpd %xmm5, %xmm9 MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) mulpd %xmm13, %xmm6 addpd %xmm6, %xmm10 MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) mulpd %xmm13, %xmm7 addpd %xmm7, %xmm11 MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif mulpd %xmm14, %xmm0 addpd %xmm0, %xmm8 MOVUPS_A1( -8 * SIZE, A1, %xmm0) mulpd %xmm14, %xmm1 addpd %xmm1, %xmm9 MOVUPS_A1( -6 * SIZE, A1, %xmm1) mulpd %xmm14, %xmm2 addpd %xmm2, %xmm10 MOVUPS_A1( -4 * SIZE, A1, %xmm2) mulpd %xmm14, %xmm3 addpd %xmm3, %xmm11 MOVUPS_A1( -2 * SIZE, A1, %xmm3) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) #endif mulpd %xmm15, %xmm4 addpd %xmm4, %xmm8 MOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm4) mulpd %xmm15, %xmm5 addpd %xmm5, %xmm9 MOVUPS_A2( -6 * SIZE, A1, LDA, 1, %xmm5) mulpd %xmm15, %xmm6 addpd %xmm6, %xmm10 MOVUPS_A2( -4 * SIZE, A1, LDA, 1, %xmm6) mulpd %xmm15, %xmm7 addpd %xmm7, %xmm11 MOVUPS_A2( -2 * SIZE, A1, LDA, 1, %xmm7) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 
2 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) MOVUPS_YL1( -8 * SIZE, Y1, %xmm8) MOVUPS_YL1( -6 * SIZE, Y1, %xmm9) MOVUPS_YL1( -4 * SIZE, Y1, %xmm10) MOVUPS_YL1( -2 * SIZE, Y1, %xmm11) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L23 ALIGN_3 .L24: mulpd %xmm12, %xmm0 addpd %xmm0, %xmm8 MOVUPS_A1(-16 * SIZE, A2, %xmm0) mulpd %xmm12, %xmm1 addpd %xmm1, %xmm9 MOVUPS_A1(-14 * SIZE, A2, %xmm1) mulpd %xmm12, %xmm2 addpd %xmm2, %xmm10 MOVUPS_A1(-12 * SIZE, A2, %xmm2) mulpd %xmm12, %xmm3 addpd %xmm3, %xmm11 MOVUPS_A1(-10 * SIZE, A2, %xmm3) mulpd %xmm13, %xmm4 addpd %xmm4, %xmm8 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) mulpd %xmm13, %xmm5 addpd %xmm5, %xmm9 MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) mulpd %xmm13, %xmm6 addpd %xmm6, %xmm10 MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) mulpd %xmm13, %xmm7 addpd %xmm7, %xmm11 MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) mulpd %xmm14, %xmm0 addpd %xmm0, %xmm8 mulpd %xmm14, %xmm1 addpd %xmm1, %xmm9 mulpd %xmm14, %xmm2 addpd %xmm2, %xmm10 mulpd %xmm14, %xmm3 addpd %xmm3, %xmm11 mulpd %xmm15, %xmm4 addpd %xmm4, %xmm8 MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) mulpd %xmm15, %xmm5 addpd %xmm5, %xmm9 MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) mulpd %xmm15, %xmm6 addpd %xmm6, %xmm10 MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) mulpd %xmm15, %xmm7 addpd %xmm7, %xmm11 MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 ALIGN_3 .L25: testq $4, MM je .L26 MOVUPS_A1(-16 * SIZE, A1, %xmm0) MOVUPS_A1(-14 * SIZE, A1, %xmm1) MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) mulpd %xmm12, %xmm0 addpd %xmm0, %xmm8 mulpd %xmm12, %xmm1 addpd %xmm1, %xmm9 MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) mulpd %xmm13, %xmm4 addpd %xmm4, %xmm8 mulpd %xmm13, %xmm5 addpd %xmm5, %xmm9 MOVUPS_A1(-16 * SIZE, A2, %xmm0) MOVUPS_A1(-14 * SIZE, A2, %xmm1) mulpd %xmm14, %xmm0 addpd %xmm0, %xmm8 mulpd %xmm14, %xmm1 addpd %xmm1, %xmm9 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) mulpd %xmm15, %xmm4 addpd %xmm4, %xmm8 mulpd %xmm15, %xmm5 addpd %xmm5, %xmm9 MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L26: testq $2, MM je .L27 MOVUPS_A1(-16 * SIZE, A1, %xmm8) MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) MOVUPS_A1(-16 * SIZE, A2, %xmm10) MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm0 mulpd %xmm14, %xmm10 addpd %xmm10, %xmm0 mulpd %xmm15, %xmm11 addpd %xmm11, %xmm0 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L27: testq $1, MM #if GEMV_UNROLL == 4 je .L28 #else je .L30 #endif movsd -16 * SIZE(Y1), %xmm0 movsd -16 * SIZE(A1), %xmm8 movsd -16 * SIZE(A1, LDA), %xmm9 movsd -16 * SIZE(A2), %xmm10 movsd -16 * SIZE(A2, LDA), %xmm11 mulsd %xmm12, %xmm8 addsd %xmm8, %xmm0 mulsd %xmm13, %xmm9 addsd %xmm9, %xmm0 mulsd %xmm14, %xmm10 addsd %xmm10, %xmm0 mulsd %xmm15, %xmm11 addsd %xmm11, %xmm0 movsd %xmm0, -16 * SIZE(Y1) ALIGN_3 #if GEMV_UNROLL == 4 .L28: cmpq $4, N jge .L21 ALIGN_3 #endif .L30: #endif #if GEMV_UNROLL >= 2 cmpq $2, N jl .L40 #if GEMV_UNROLL == 2 ALIGN_3 .L31: #endif subq $2, N leaq 16 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA), A2 leaq (A, LDA, 2), A #ifdef HAVE_SSE3 movddup (X), %xmm12 addq 
INCX, X movddup (X), %xmm13 addq INCX, X movddup ALPHA, %xmm0 #else movsd (X), %xmm12 unpcklpd %xmm12, %xmm12 addq INCX, X movsd (X), %xmm13 unpcklpd %xmm13, %xmm13 addq INCX, X movsd ALPHA, %xmm0 unpcklpd %xmm0, %xmm0 #endif mulpd %xmm0, %xmm12 mulpd %xmm0, %xmm13 #ifdef ALIGNED_ACCESS testq $SIZE, A je .L3X movsd -16 * SIZE(A1), %xmm4 movsd -16 * SIZE(A2), %xmm5 movsd -16 * SIZE(Y1), %xmm0 mulsd %xmm12, %xmm4 addsd %xmm4, %xmm0 mulsd %xmm13, %xmm5 addsd %xmm5, %xmm0 movsd %xmm0, -16 * SIZE(Y1) addq $SIZE, A1 addq $SIZE, A2 addq $SIZE, Y1 ALIGN_3 .L3X: #endif movq MM, I sarq $3, I jle .L35 MOVUPS_A1(-16 * SIZE, A1, %xmm0) MOVUPS_A1(-14 * SIZE, A1, %xmm1) MOVUPS_A1(-12 * SIZE, A1, %xmm2) MOVUPS_A1(-10 * SIZE, A1, %xmm3) MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) MOVUPS_YL1(-12 * SIZE, Y1, %xmm10) MOVUPS_YL1(-10 * SIZE, Y1, %xmm11) MOVUPS_A1(-16 * SIZE, A2, %xmm4) MOVUPS_A1(-14 * SIZE, A2, %xmm5) MOVUPS_A1(-12 * SIZE, A2, %xmm6) MOVUPS_A1(-10 * SIZE, A2, %xmm7) decq I jle .L34 ALIGN_3 .L33: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif mulpd %xmm12, %xmm0 addpd %xmm0, %xmm8 MOVUPS_A1( -8 * SIZE, A1, %xmm0) mulpd %xmm12, %xmm1 addpd %xmm1, %xmm9 MOVUPS_A1( -6 * SIZE, A1, %xmm1) mulpd %xmm12, %xmm2 addpd %xmm2, %xmm10 MOVUPS_A1( -4 * SIZE, A1, %xmm2) mulpd %xmm12, %xmm3 addpd %xmm3, %xmm11 MOVUPS_A1( -2 * SIZE, A1, %xmm3) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) #endif mulpd %xmm13, %xmm4 addpd %xmm4, %xmm8 MOVUPS_A1( -8 * SIZE, A2, %xmm4) mulpd %xmm13, %xmm5 addpd %xmm5, %xmm9 MOVUPS_A1( -6 * SIZE, A2, %xmm5) mulpd %xmm13, %xmm6 addpd %xmm6, %xmm10 MOVUPS_A1( -4 * SIZE, A2, %xmm6) mulpd %xmm13, %xmm7 addpd %xmm7, %xmm11 MOVUPS_A1( -2 * SIZE, A2, %xmm7) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) MOVUPS_YL1( -8 * SIZE, Y1, %xmm8) MOVUPS_YL1( -6 * SIZE, Y1, %xmm9) MOVUPS_YL1( -4 * SIZE, Y1, %xmm10) MOVUPS_YL1( -2 * SIZE, Y1, %xmm11) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L33 ALIGN_3 .L34: mulpd %xmm12, %xmm0 addpd %xmm0, %xmm8 mulpd %xmm12, %xmm1 addpd %xmm1, %xmm9 mulpd %xmm12, %xmm2 addpd %xmm2, %xmm10 mulpd %xmm12, %xmm3 addpd %xmm3, %xmm11 mulpd %xmm13, %xmm4 addpd %xmm4, %xmm8 MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) mulpd %xmm13, %xmm5 addpd %xmm5, %xmm9 MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) mulpd %xmm13, %xmm6 addpd %xmm6, %xmm10 MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) mulpd %xmm13, %xmm7 addpd %xmm7, %xmm11 MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 ALIGN_3 .L35: testq $4, MM je .L36 MOVUPS_A1(-16 * SIZE, A1, %xmm0) MOVUPS_A1(-14 * SIZE, A1, %xmm1) MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) mulpd %xmm12, %xmm0 addpd %xmm0, %xmm8 mulpd %xmm12, %xmm1 addpd %xmm1, %xmm9 MOVUPS_A1(-16 * SIZE, A2, %xmm4) MOVUPS_A1(-14 * SIZE, A2, %xmm5) mulpd %xmm13, %xmm4 addpd %xmm4, %xmm8 MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) mulpd %xmm13, %xmm5 addpd %xmm5, %xmm9 MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L36: testq $2, MM je .L37 MOVUPS_A1(-16 * SIZE, A1, %xmm8) MOVUPS_A1(-16 * SIZE, A2, %xmm9) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm0 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L37: 
testq $1, MM #if GEMV_UNROLL == 2 je .L38 #else je .L40 #endif movsd -16 * SIZE(Y1), %xmm0 movsd -16 * SIZE(A1), %xmm8 movsd -16 * SIZE(A2), %xmm9 mulsd %xmm12, %xmm8 addsd %xmm8, %xmm0 mulsd %xmm13, %xmm9 addsd %xmm9, %xmm0 movsd %xmm0, -16 * SIZE(Y1) ALIGN_3 #if GEMV_UNROLL == 2 .L38: cmpq $2, N jge .L31 ALIGN_3 #endif .L40: cmpq $1, N jl .L900 #endif leaq 16 * SIZE(BUFFER), Y1 movq A, A1 #ifdef HAVE_SSE3 movddup (X), %xmm12 addq INCX, X movddup ALPHA, %xmm0 #else movsd (X), %xmm12 unpcklpd %xmm12, %xmm12 addq INCX, X movsd ALPHA, %xmm0 unpcklpd %xmm0, %xmm0 #endif mulpd %xmm0, %xmm12 #ifdef ALIGNED_ACCESS testq $SIZE, A je .L4X movsd -16 * SIZE(A1), %xmm4 movsd -16 * SIZE(Y1), %xmm0 mulsd %xmm12, %xmm4 addsd %xmm4, %xmm0 movsd %xmm0, -16 * SIZE(Y1) addq $SIZE, A1 addq $SIZE, Y1 ALIGN_3 .L4X: #endif movq MM, I sarq $3, I jle .L45 MOVUPS_A1(-16 * SIZE, A1, %xmm0) MOVUPS_A1(-14 * SIZE, A1, %xmm1) MOVUPS_A1(-12 * SIZE, A1, %xmm2) MOVUPS_A1(-10 * SIZE, A1, %xmm3) MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) MOVUPS_YL1(-12 * SIZE, Y1, %xmm10) MOVUPS_YL1(-10 * SIZE, Y1, %xmm11) decq I jle .L44 ALIGN_3 .L43: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) #endif mulpd %xmm12, %xmm0 addpd %xmm0, %xmm8 MOVUPS_A1( -8 * SIZE, A1, %xmm0) mulpd %xmm12, %xmm1 addpd %xmm1, %xmm9 MOVUPS_A1( -6 * SIZE, A1, %xmm1) mulpd %xmm12, %xmm2 addpd %xmm2, %xmm10 MOVUPS_A1( -4 * SIZE, A1, %xmm2) mulpd %xmm12, %xmm3 addpd %xmm3, %xmm11 MOVUPS_A1( -2 * SIZE, A1, %xmm3) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 8 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) MOVUPS_YL1( -8 * SIZE, Y1, %xmm8) MOVUPS_YL1( -6 * SIZE, Y1, %xmm9) MOVUPS_YL1( -4 * SIZE, Y1, %xmm10) MOVUPS_YL1( -2 * SIZE, Y1, %xmm11) subq $-8 * SIZE, A1 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L43 ALIGN_3 .L44: mulpd %xmm12, %xmm0 addpd %xmm0, %xmm8 MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) mulpd %xmm12, %xmm1 addpd %xmm1, %xmm9 MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) mulpd %xmm12, %xmm2 addpd %xmm2, %xmm10 MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) mulpd %xmm12, %xmm3 addpd %xmm3, %xmm11 MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) subq $-8 * SIZE, A1 subq $-8 * SIZE, Y1 ALIGN_3 .L45: testq $4, MM je .L46 MOVUPS_A1(-16 * SIZE, A1, %xmm0) MOVUPS_A1(-14 * SIZE, A1, %xmm1) MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) mulpd %xmm12, %xmm0 addpd %xmm0, %xmm8 MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) mulpd %xmm12, %xmm1 addpd %xmm1, %xmm9 MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) addq $4 * SIZE, A1 addq $4 * SIZE, Y1 ALIGN_3 .L46: testq $2, MM je .L47 MOVUPS_A1(-16 * SIZE, A1, %xmm8) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) addq $2 * SIZE, A1 addq $2 * SIZE, Y1 ALIGN_3 .L47: testq $1, MM je .L900 movsd -16 * SIZE(Y1), %xmm0 movsd -16 * SIZE(A1), %xmm8 mulsd %xmm12, %xmm8 addsd %xmm8, %xmm0 movsd %xmm0, -16 * SIZE(Y1) ALIGN_3 #ifdef ALIGNED_ACCESS jmp .L900 ALIGN_3 .L50: #if GEMV_UNROLL >= 4 cmpq $4, N jl .L60 ALIGN_3 .L51: subq $4, N leaq 16 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 2), A2 leaq (A, LDA, 4), A #ifdef HAVE_SSE3 movddup (X), %xmm12 addq INCX, X movddup (X), %xmm13 addq INCX, X movddup (X), %xmm14 addq INCX, X movddup (X), %xmm15 addq INCX, X movddup ALPHA, %xmm0 #else movsd (X), %xmm12 unpcklpd %xmm12, %xmm12 addq INCX, X movsd (X), %xmm13 unpcklpd %xmm13, %xmm13 addq INCX, X movsd (X), %xmm14 unpcklpd %xmm14, %xmm14 addq INCX, X movsd (X), 
%xmm15 unpcklpd %xmm15, %xmm15 addq INCX, X movsd ALPHA, %xmm0 unpcklpd %xmm0, %xmm0 #endif mulpd %xmm0, %xmm12 mulpd %xmm0, %xmm13 mulpd %xmm0, %xmm14 mulpd %xmm0, %xmm15 testq $SIZE, A je .L5X movsd -16 * SIZE(A1), %xmm4 movsd -16 * SIZE(A1, LDA), %xmm5 movsd -16 * SIZE(A2), %xmm6 movsd -16 * SIZE(A2, LDA), %xmm7 movsd -16 * SIZE(Y1), %xmm0 mulsd %xmm12, %xmm4 addsd %xmm4, %xmm0 mulsd %xmm13, %xmm5 addsd %xmm5, %xmm0 mulsd %xmm14, %xmm6 addsd %xmm6, %xmm0 mulsd %xmm15, %xmm7 addsd %xmm7, %xmm0 movsd %xmm0, -16 * SIZE(Y1) addq $SIZE, A1 addq $SIZE, A2 addq $SIZE, Y1 ALIGN_3 .L5X: movhpd -16 * SIZE(A1, LDA), %xmm8 movhpd -16 * SIZE(A2, LDA), %xmm9 movq MM, I sarq $3, I jle .L55 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A1(-14 * SIZE, A1, %xmm5) MOVUPS_A1(-12 * SIZE, A1, %xmm6) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) decq I jle .L54 ALIGN_3 .L53: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-10 * SIZE, A1, %xmm7) mulpd %xmm12, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm4) mulpd %xmm12, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm5) mulpd %xmm12, %xmm7 addpd %xmm7, %xmm3 MOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm6) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET + 8(A1, LDA) #endif shufpd $1, %xmm4, %xmm8 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) shufpd $1, %xmm5, %xmm4 mulpd %xmm13, %xmm4 addpd %xmm4, %xmm1 MOVUPS_A1(-16 * SIZE, A2, %xmm4) shufpd $1, %xmm6, %xmm5 mulpd %xmm13, %xmm5 addpd %xmm5, %xmm2 MOVUPS_A1(-14 * SIZE, A2, %xmm5) shufpd $1, %xmm8, %xmm6 mulpd %xmm13, %xmm6 addpd %xmm6, %xmm3 MOVUPS_A1(-12 * SIZE, A2, %xmm6) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif mulpd %xmm14, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-10 * SIZE, A2, %xmm7) mulpd %xmm14, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm4) mulpd %xmm14, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm5) mulpd %xmm14, %xmm7 addpd %xmm7, %xmm3 MOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm6) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET + 8(A2, LDA) #endif shufpd $1, %xmm4, %xmm9 mulpd %xmm15, %xmm9 addpd %xmm9, %xmm0 MOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) shufpd $1, %xmm5, %xmm4 mulpd %xmm15, %xmm4 addpd %xmm4, %xmm1 MOVUPS_A1( -8 * SIZE, A1, %xmm4) shufpd $1, %xmm6, %xmm5 mulpd %xmm15, %xmm5 addpd %xmm5, %xmm2 MOVUPS_A1( -6 * SIZE, A1, %xmm5) shufpd $1, %xmm9, %xmm6 mulpd %xmm15, %xmm6 addpd %xmm6, %xmm3 MOVUPS_A1( -4 * SIZE, A1, %xmm6) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L53 ALIGN_3 .L54: mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-10 * SIZE, A1, %xmm7) mulpd %xmm12, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm4) mulpd %xmm12, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm5) mulpd %xmm12, %xmm7 addpd %xmm7, %xmm3 MOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm6) shufpd $1, %xmm4, %xmm8 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) shufpd $1, %xmm5, 
%xmm4 mulpd %xmm13, %xmm4 addpd %xmm4, %xmm1 MOVUPS_A1(-16 * SIZE, A2, %xmm4) shufpd $1, %xmm6, %xmm5 mulpd %xmm13, %xmm5 addpd %xmm5, %xmm2 MOVUPS_A1(-14 * SIZE, A2, %xmm5) shufpd $1, %xmm8, %xmm6 mulpd %xmm13, %xmm6 addpd %xmm6, %xmm3 MOVUPS_A1(-12 * SIZE, A2, %xmm6) mulpd %xmm14, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-10 * SIZE, A2, %xmm7) mulpd %xmm14, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm4) mulpd %xmm14, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm5) mulpd %xmm14, %xmm7 addpd %xmm7, %xmm3 MOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm6) shufpd $1, %xmm4, %xmm9 mulpd %xmm15, %xmm9 addpd %xmm9, %xmm0 MOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) shufpd $1, %xmm5, %xmm4 mulpd %xmm15, %xmm4 addpd %xmm4, %xmm1 shufpd $1, %xmm6, %xmm5 mulpd %xmm15, %xmm5 addpd %xmm5, %xmm2 shufpd $1, %xmm9, %xmm6 mulpd %xmm15, %xmm6 addpd %xmm6, %xmm3 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 ALIGN_3 .L55: testq $4, MM je .L56 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A1(-14 * SIZE, A1, %xmm5) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm12, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm6) MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm7) shufpd $1, %xmm6, %xmm8 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 movaps %xmm7, %xmm8 shufpd $1, %xmm7, %xmm6 mulpd %xmm13, %xmm6 addpd %xmm6, %xmm1 MOVUPS_A1(-16 * SIZE, A2, %xmm4) MOVUPS_A1(-14 * SIZE, A2, %xmm5) mulpd %xmm14, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm14, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm6) MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm7) shufpd $1, %xmm6, %xmm9 mulpd %xmm15, %xmm9 addpd %xmm9, %xmm0 movaps %xmm7, %xmm9 shufpd $1, %xmm7, %xmm6 mulpd %xmm15, %xmm6 addpd %xmm6, %xmm1 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L56: testq $2, MM je .L57 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) MOVUPS_A1(-16 * SIZE, A2, %xmm6) MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 shufpd $1, %xmm5, %xmm8 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 movaps %xmm5, %xmm8 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm0 shufpd $1, %xmm7, %xmm9 mulpd %xmm15, %xmm9 addpd %xmm9, %xmm0 movaps %xmm7, %xmm9 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L57: testq $1, MM je .L58 movsd -16 * SIZE(Y1), %xmm0 movsd -16 * SIZE(A1), %xmm4 shufpd $1, %xmm8, %xmm8 movsd -16 * SIZE(A2), %xmm6 shufpd $1, %xmm9, %xmm9 mulsd %xmm12, %xmm4 addsd %xmm4, %xmm0 mulsd %xmm13, %xmm8 addsd %xmm8, %xmm0 mulsd %xmm14, %xmm6 addsd %xmm6, %xmm0 mulsd %xmm15, %xmm9 addsd %xmm9, %xmm0 movsd %xmm0, -16 * SIZE(Y1) ALIGN_3 .L58: cmpq $4, N jge .L51 ALIGN_3 .L60: #endif #if GEMV_UNROLL >= 2 cmpq $2, N jl .L70 #if GEMV_UNROLL == 2 ALIGN_3 .L61: #endif subq $2, N leaq 16 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA), A2 leaq (A, LDA, 2), A #ifdef HAVE_SSE3 movddup (X), %xmm12 addq INCX, X movddup (X), %xmm13 addq INCX, X movddup ALPHA, %xmm0 #else movsd (X), %xmm12 unpcklpd %xmm12, %xmm12 addq INCX, X movsd (X), %xmm13 unpcklpd %xmm13, %xmm13 addq INCX, X movsd ALPHA, %xmm0 unpcklpd %xmm0, %xmm0 #endif mulpd %xmm0, %xmm12 mulpd %xmm0, %xmm13 testq $SIZE, A je .L6X movsd -16 * SIZE(A1), %xmm4 movsd -16 * SIZE(A2), 
%xmm5 movsd -16 * SIZE(Y1), %xmm0 mulsd %xmm12, %xmm4 addsd %xmm4, %xmm0 mulsd %xmm13, %xmm5 addsd %xmm5, %xmm0 movsd %xmm0, -16 * SIZE(Y1) addq $SIZE, A1 addq $SIZE, A2 addq $SIZE, Y1 ALIGN_3 .L6X: movhpd -16 * SIZE(A2), %xmm8 movq MM, I sarq $3, I jle .L65 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A1(-14 * SIZE, A1, %xmm5) MOVUPS_A1(-12 * SIZE, A1, %xmm6) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) decq I jle .L64 ALIGN_3 .L63: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-10 * SIZE, A1, %xmm7) mulpd %xmm12, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A1(-15 * SIZE, A2, %xmm4) mulpd %xmm12, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A1(-13 * SIZE, A2, %xmm5) mulpd %xmm12, %xmm7 addpd %xmm7, %xmm3 MOVUPS_A1(-11 * SIZE, A2, %xmm6) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET + 8(A2) #endif shufpd $1, %xmm4, %xmm8 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1( -9 * SIZE, A2, %xmm8) shufpd $1, %xmm5, %xmm4 mulpd %xmm13, %xmm4 addpd %xmm4, %xmm1 MOVUPS_A1( -8 * SIZE, A1, %xmm4) shufpd $1, %xmm6, %xmm5 mulpd %xmm13, %xmm5 addpd %xmm5, %xmm2 MOVUPS_A1( -6 * SIZE, A1, %xmm5) shufpd $1, %xmm8, %xmm6 mulpd %xmm13, %xmm6 addpd %xmm6, %xmm3 MOVUPS_A1( -4 * SIZE, A1, %xmm6) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L63 ALIGN_3 .L64: mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-10 * SIZE, A1, %xmm7) mulpd %xmm12, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A1(-15 * SIZE, A2, %xmm4) mulpd %xmm12, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A1(-13 * SIZE, A2, %xmm5) mulpd %xmm12, %xmm7 addpd %xmm7, %xmm3 MOVUPS_A1(-11 * SIZE, A2, %xmm6) shufpd $1, %xmm4, %xmm8 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1( -9 * SIZE, A2, %xmm8) shufpd $1, %xmm5, %xmm4 mulpd %xmm13, %xmm4 addpd %xmm4, %xmm1 shufpd $1, %xmm6, %xmm5 mulpd %xmm13, %xmm5 addpd %xmm5, %xmm2 shufpd $1, %xmm8, %xmm6 mulpd %xmm13, %xmm6 addpd %xmm6, %xmm3 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 ALIGN_3 .L65: testq $4, MM je .L66 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A1(-14 * SIZE, A1, %xmm5) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm12, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A1(-15 * SIZE, A2, %xmm6) MOVUPS_A1(-13 * SIZE, A2, %xmm7) shufpd $1, %xmm6, %xmm8 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 movaps %xmm7, %xmm8 shufpd $1, %xmm7, %xmm6 mulpd %xmm13, %xmm6 addpd %xmm6, %xmm1 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L66: testq $2, MM je .L67 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A1(-15 * SIZE, A2, %xmm5) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 shufpd $1, %xmm5, %xmm8 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 movaps %xmm5, %xmm8 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L67: testq $1, MM #if GEMV_UNROLL == 2 je .L68 
#else je .L70 #endif movsd -16 * SIZE(Y1), %xmm0 movsd -16 * SIZE(A1), %xmm4 shufpd $1, %xmm8, %xmm8 mulsd %xmm12, %xmm4 addsd %xmm4, %xmm0 mulsd %xmm13, %xmm8 addsd %xmm8, %xmm0 movsd %xmm0, -16 * SIZE(Y1) ALIGN_3 #if GEMV_UNROLL == 2 .L68: cmpq $2, N jge .L61 ALIGN_3 #endif .L70: cmpq $1, N jl .L900 #endif leaq 16 * SIZE(BUFFER), Y1 movq A, A1 #ifdef HAVE_SSE3 movddup (X), %xmm12 addq INCX, X movddup ALPHA, %xmm0 #else movsd (X), %xmm12 unpcklpd %xmm12, %xmm12 addq INCX, X movsd ALPHA, %xmm0 unpcklpd %xmm0, %xmm0 #endif mulpd %xmm0, %xmm12 testq $SIZE, A je .L7X movsd -16 * SIZE(A1), %xmm4 movsd -16 * SIZE(Y1), %xmm0 mulsd %xmm12, %xmm4 addsd %xmm4, %xmm0 movsd %xmm0, -16 * SIZE(Y1) addq $SIZE, A1 addq $SIZE, Y1 ALIGN_3 .L7X: movq MM, I sarq $3, I jle .L75 MOVUPS_A1(-16 * SIZE, A1, %xmm0) MOVUPS_A1(-14 * SIZE, A1, %xmm1) MOVUPS_A1(-12 * SIZE, A1, %xmm2) MOVUPS_A1(-10 * SIZE, A1, %xmm3) MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) MOVUPS_YL1(-12 * SIZE, Y1, %xmm10) MOVUPS_YL1(-10 * SIZE, Y1, %xmm11) decq I jle .L74 ALIGN_3 .L73: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) #endif mulpd %xmm12, %xmm0 addpd %xmm0, %xmm8 MOVUPS_A1( -8 * SIZE, A1, %xmm0) mulpd %xmm12, %xmm1 addpd %xmm1, %xmm9 MOVUPS_A1( -6 * SIZE, A1, %xmm1) mulpd %xmm12, %xmm2 addpd %xmm2, %xmm10 MOVUPS_A1( -4 * SIZE, A1, %xmm2) mulpd %xmm12, %xmm3 addpd %xmm3, %xmm11 MOVUPS_A1( -2 * SIZE, A1, %xmm3) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 8 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) MOVUPS_YL1( -8 * SIZE, Y1, %xmm8) MOVUPS_YL1( -6 * SIZE, Y1, %xmm9) MOVUPS_YL1( -4 * SIZE, Y1, %xmm10) MOVUPS_YL1( -2 * SIZE, Y1, %xmm11) subq $-8 * SIZE, A1 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L73 ALIGN_3 .L74: mulpd %xmm12, %xmm0 addpd %xmm0, %xmm8 MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) mulpd %xmm12, %xmm1 addpd %xmm1, %xmm9 MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) mulpd %xmm12, %xmm2 addpd %xmm2, %xmm10 MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) mulpd %xmm12, %xmm3 addpd %xmm3, %xmm11 MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) subq $-8 * SIZE, A1 subq $-8 * SIZE, Y1 ALIGN_3 .L75: testq $4, MM je .L76 MOVUPS_A1(-16 * SIZE, A1, %xmm0) MOVUPS_A1(-14 * SIZE, A1, %xmm1) MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) mulpd %xmm12, %xmm0 addpd %xmm0, %xmm8 MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) mulpd %xmm12, %xmm1 addpd %xmm1, %xmm9 MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) addq $4 * SIZE, A1 addq $4 * SIZE, Y1 ALIGN_3 .L76: testq $2, MM je .L77 MOVUPS_A1(-16 * SIZE, A1, %xmm8) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) addq $2 * SIZE, A1 addq $2 * SIZE, Y1 ALIGN_3 .L77: testq $1, MM je .L900 movsd -16 * SIZE(Y1), %xmm0 movsd -16 * SIZE(A1), %xmm8 mulsd %xmm12, %xmm8 addsd %xmm8, %xmm0 movsd %xmm0, -16 * SIZE(Y1) #endif ALIGN_3 .L900: #ifndef COPY_FORCE cmpq Y, BUFFER je .L999 #endif movq M, TMP_M movq Y, Y1 cmpq $SIZE, INCY jne .L950 testq $SIZE, Y1 je .L910 movsd (Y1), %xmm0 addsd (BUFFER), %xmm0 movsd %xmm0, (Y1) addq $SIZE, Y1 addq $SIZE, BUFFER decq TMP_M jle .L999 ALIGN_4 .L910: testq $SIZE, BUFFER jne .L920 movq TMP_M, %rax sarq $3, %rax jle .L914 ALIGN_3 .L912: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y1) #endif movapd 0 * SIZE(Y1), %xmm0 movapd 2 * SIZE(Y1), %xmm1 movapd 4 * SIZE(Y1), %xmm2 movapd 6 * SIZE(Y1), %xmm3 movapd 0 * SIZE(BUFFER), %xmm4 movapd 2 * SIZE(BUFFER), %xmm5 movapd 4 * SIZE(BUFFER), 
%xmm6 movapd 6 * SIZE(BUFFER), %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 + PREOFFSET(BUFFER) #endif addpd %xmm4, %xmm0 addpd %xmm5, %xmm1 addpd %xmm6, %xmm2 addpd %xmm7, %xmm3 movapd %xmm0, 0 * SIZE(Y1) movapd %xmm1, 2 * SIZE(Y1) movapd %xmm2, 4 * SIZE(Y1) movapd %xmm3, 6 * SIZE(Y1) addq $8 * SIZE, Y1 addq $8 * SIZE, BUFFER decq %rax jg .L912 ALIGN_3 .L914: testq $7, TMP_M jle .L999 testq $4, TMP_M jle .L915 movapd 0 * SIZE(Y1), %xmm0 movapd 2 * SIZE(Y1), %xmm1 movapd 0 * SIZE(BUFFER), %xmm4 movapd 2 * SIZE(BUFFER), %xmm5 addpd %xmm4, %xmm0 addpd %xmm5, %xmm1 movapd %xmm0, 0 * SIZE(Y1) movapd %xmm1, 2 * SIZE(Y1) addq $4 * SIZE, Y1 addq $4 * SIZE, BUFFER ALIGN_3 .L915: testq $2, TMP_M jle .L916 movapd (Y1), %xmm0 movapd (BUFFER), %xmm4 addpd %xmm4, %xmm0 movapd %xmm0, (Y1) addq $2 * SIZE, Y1 addq $2 * SIZE, BUFFER ALIGN_3 .L916: testq $1, TMP_M jle .L999 movsd (Y1), %xmm0 movsd 0 * SIZE(BUFFER), %xmm4 addsd %xmm4, %xmm0 movlpd %xmm0, (Y1) ALIGN_3 jmp .L999 ALIGN_4 .L920: movapd -1 * SIZE(BUFFER), %xmm4 movq TMP_M, %rax sarq $3, %rax jle .L924 ALIGN_3 .L922: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y1) #endif movapd 0 * SIZE(Y1), %xmm0 movapd 2 * SIZE(Y1), %xmm1 movapd 4 * SIZE(Y1), %xmm2 movapd 6 * SIZE(Y1), %xmm3 movapd 1 * SIZE(BUFFER), %xmm5 movapd 3 * SIZE(BUFFER), %xmm6 movapd 5 * SIZE(BUFFER), %xmm7 movapd 7 * SIZE(BUFFER), %xmm8 shufpd $1, %xmm5, %xmm4 shufpd $1, %xmm6, %xmm5 shufpd $1, %xmm7, %xmm6 shufpd $1, %xmm8, %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 + PREOFFSET(BUFFER) #endif addpd %xmm4, %xmm0 addpd %xmm5, %xmm1 addpd %xmm6, %xmm2 addpd %xmm7, %xmm3 movapd %xmm0, 0 * SIZE(Y1) movapd %xmm1, 2 * SIZE(Y1) movapd %xmm2, 4 * SIZE(Y1) movapd %xmm3, 6 * SIZE(Y1) movapd %xmm8, %xmm4 addq $8 * SIZE, Y1 addq $8 * SIZE, BUFFER decq %rax jg .L922 ALIGN_3 .L924: testq $7, TMP_M jle .L999 testq $4, TMP_M jle .L925 movapd 0 * SIZE(Y1), %xmm0 movapd 2 * SIZE(Y1), %xmm1 movapd 1 * SIZE(BUFFER), %xmm5 movapd 3 * SIZE(BUFFER), %xmm6 shufpd $1, %xmm5, %xmm4 shufpd $1, %xmm6, %xmm5 addpd %xmm4, %xmm0 addpd %xmm5, %xmm1 movapd %xmm0, 0 * SIZE(Y1) movapd %xmm1, 2 * SIZE(Y1) movapd %xmm6, %xmm4 addq $4 * SIZE, Y1 addq $4 * SIZE, BUFFER ALIGN_3 .L925: testq $2, TMP_M jle .L926 movapd (Y1), %xmm0 movapd 1 * SIZE(BUFFER), %xmm5 shufpd $1, %xmm5, %xmm4 addpd %xmm4, %xmm0 movapd %xmm0, (Y1) movaps %xmm5, %xmm4 addq $2 * SIZE, Y1 addq $2 * SIZE, BUFFER ALIGN_3 .L926: testq $1, TMP_M jle .L999 movsd (Y1), %xmm0 shufpd $1, %xmm4, %xmm4 addsd %xmm4, %xmm0 movlpd %xmm0, (Y1) ALIGN_3 jmp .L999 ALIGN_4 .L950: testq $SIZE, BUFFER je .L960 movsd (Y1), %xmm0 addsd (BUFFER), %xmm0 movsd %xmm0, (Y1) addq INCY, Y1 addq $SIZE, BUFFER decq TMP_M jle .L999 ALIGN_4 .L960: movq Y1, Y2 movq TMP_M, %rax sarq $3, %rax jle .L964 ALIGN_3 .L962: movsd (Y2), %xmm0 addq INCY, Y2 movhpd (Y2), %xmm0 addq INCY, Y2 movapd 0 * SIZE(BUFFER), %xmm4 movsd (Y2), %xmm1 addq INCY, Y2 movhpd (Y2), %xmm1 addq INCY, Y2 movapd 2 * SIZE(BUFFER), %xmm5 movsd (Y2), %xmm2 addq INCY, Y2 movhpd (Y2), %xmm2 addq INCY, Y2 movapd 4 * SIZE(BUFFER), %xmm6 addpd %xmm4, %xmm0 movsd (Y2), %xmm3 addq INCY, Y2 movhpd (Y2), %xmm3 addq INCY, Y2 movapd 6 * SIZE(BUFFER), %xmm7 addpd %xmm5, %xmm1 movlpd %xmm0, (Y1) addq INCY, Y1 movhpd %xmm0, (Y1) addq INCY, Y1 addpd %xmm6, %xmm2 movlpd %xmm1, (Y1) addq INCY, Y1 movhpd %xmm1, (Y1) addq INCY, Y1 addpd %xmm7, %xmm3 movlpd %xmm2, (Y1) addq INCY, Y1 movhpd %xmm2, (Y1) addq INCY, Y1 movlpd %xmm3, (Y1) addq INCY, Y1 movhpd %xmm3, (Y1) addq INCY, Y1 addq $8 * SIZE, BUFFER decq %rax jg 
.L962 ALIGN_3 .L964: testq $7, TMP_M jle .L999 testq $4, TMP_M jle .L965 movsd (Y2), %xmm0 addq INCY, Y2 movhpd (Y2), %xmm0 addq INCY, Y2 movapd 0 * SIZE(BUFFER), %xmm4 movsd (Y2), %xmm1 addq INCY, Y2 movhpd (Y2), %xmm1 addq INCY, Y2 movapd 2 * SIZE(BUFFER), %xmm5 addpd %xmm4, %xmm0 addpd %xmm5, %xmm1 movlpd %xmm0, (Y1) addq INCY, Y1 movhpd %xmm0, (Y1) addq INCY, Y1 movlpd %xmm1, (Y1) addq INCY, Y1 movhpd %xmm1, (Y1) addq INCY, Y1 addq $4 * SIZE, BUFFER ALIGN_3 .L965: testq $2, TMP_M jle .L966 movsd (Y2), %xmm0 addq INCY, Y2 movhpd (Y2), %xmm0 addq INCY, Y2 movapd 0 * SIZE(BUFFER), %xmm4 addpd %xmm4, %xmm0 movlpd %xmm0, (Y1) addq INCY, Y1 movhpd %xmm0, (Y1) addq INCY, Y1 addq $2 * SIZE, BUFFER ALIGN_3 .L966: testq $1, TMP_M jle .L999 movsd (Y2), %xmm0 movsd 0 * SIZE(BUFFER), %xmm4 addsd %xmm4, %xmm0 movlpd %xmm0, (Y1) ALIGN_3 .L999: leaq (, M, SIZE), %rax addq %rax,AA movq STACK_INCY, INCY imulq INCY, %rax addq %rax, Y jmp .L0t ALIGN_4 .L999x: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/dgemv_n_4.c000066400000000000000000000252731313527062700174010ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" #if defined(NEHALEM) #include "dgemv_n_microk_nehalem-4.c" #elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dgemv_n_microk_haswell-4.c" #endif #define NBMAX 2048 #ifndef HAVE_KERNEL_4x4 static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) { BLASLONG i; FLOAT *a0,*a1,*a2,*a3; FLOAT x[4]; a0 = ap[0]; a1 = ap[1]; a2 = ap[2]; a3 = ap[3]; for ( i=0; i<4; i++) x[i] = xo[i] * *alpha; for ( i=0; i< n; i+=4 ) { y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; } } #endif #ifndef HAVE_KERNEL_4x2 static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "movsd (%2) , %%xmm12 \n\t" // x0 "movsd (%6) , %%xmm4 \n\t" // alpha "movsd 8(%2) , %%xmm13 \n\t" // x1 "mulsd %%xmm4 , %%xmm12 \n\t" // alpha "mulsd %%xmm4 , %%xmm13 \n\t" // alpha "shufpd $0, %%xmm12, %%xmm12 \n\t" "shufpd $0, %%xmm13, %%xmm13 \n\t" // ".align 16 \n\t" "1: \n\t" "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y "movups (%4,%0,8), %%xmm8 \n\t" "movups (%5,%0,8), %%xmm9 \n\t" "mulpd %%xmm12, %%xmm8 \n\t" "mulpd %%xmm13, %%xmm9 \n\t" "addpd %%xmm8 , %%xmm4 \n\t" "addpd %%xmm9 , %%xmm4 \n\t" "movups 16(%4,%0,8), %%xmm8 \n\t" "movups 16(%5,%0,8), %%xmm9 \n\t" "mulpd %%xmm12, %%xmm8 \n\t" "mulpd %%xmm13, %%xmm9 \n\t" "addpd %%xmm8 , %%xmm5 \n\t" "addpd %%xmm9 , %%xmm5 \n\t" "movups %%xmm4 , (%3,%0,8) \n\t" // 2 * y "movups %%xmm5 , 16(%3,%0,8) \n\t" // 2 * y "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" "jnz 1b \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (alpha) // 6 : "cc", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #endif #ifndef HAVE_KERNEL_4x1 static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "movsd (%2), %%xmm12 \n\t" // x0 "mulsd (%5), %%xmm12 \n\t" // alpha "shufpd $0, %%xmm12, %%xmm12 \n\t" // ".align 16 \n\t" "1: \n\t" "movups (%4,%0,8), %%xmm8 \n\t" // 2 * a "movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y "mulpd %%xmm12, %%xmm8 \n\t" "mulpd %%xmm12, %%xmm9 \n\t" "addpd %%xmm8 , %%xmm4 \n\t" "addpd %%xmm9 , %%xmm5 \n\t" "movups %%xmm4 , (%3,%0,8) \n\t" // 2 * y "movups %%xmm5 , 16(%3,%0,8) \n\t" // 2 * y "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" "jnz 1b \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap), // 4 "r" (alpha) // 5 : "cc", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #endif static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { BLASLONG i; if ( inc_dest != 1 ) { for ( 
i=0; i> 2 ; n2 = n & 3 ; m3 = m & 3 ; m1 = m & -4 ; m2 = (m & (NBMAX-1)) - m3 ; y_ptr = y; BLASLONG NB = NBMAX; while ( NB == NBMAX ) { m1 -= NB; if ( m1 < 0) { if ( m2 == 0 ) break; NB = m2; } a_ptr = a; x_ptr = x; ap[0] = a_ptr; ap[1] = a_ptr + lda; ap[2] = ap[1] + lda; ap[3] = ap[2] + lda; if ( inc_y != 1 ) memset(ybuffer,0,NB*8); else ybuffer = y_ptr; if ( inc_x == 1 ) { for( i = 0; i < n1 ; i++) { dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); ap[0] += lda4; ap[1] += lda4; ap[2] += lda4; ap[3] += lda4; a_ptr += lda4; x_ptr += 4; } if ( n2 & 2 ) { dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); a_ptr += lda*2; x_ptr += 2; } if ( n2 & 1 ) { dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); a_ptr += lda; x_ptr += 1; } } else { for( i = 0; i < n1 ; i++) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; xbuffer[1] = x_ptr[0]; x_ptr += inc_x; xbuffer[2] = x_ptr[0]; x_ptr += inc_x; xbuffer[3] = x_ptr[0]; x_ptr += inc_x; dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); ap[0] += lda4; ap[1] += lda4; ap[2] += lda4; ap[3] += lda4; a_ptr += lda4; } for( i = 0; i < n2 ; i++) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); a_ptr += lda; } } a += NB; if ( inc_y != 1 ) { add_y(NB,ybuffer,y_ptr,inc_y); y_ptr += NB * inc_y; } else y_ptr += NB ; } if ( m3 == 0 ) return(0); if ( m3 == 3 ) { a_ptr = a; x_ptr = x; FLOAT temp0 = 0.0; FLOAT temp1 = 0.0; FLOAT temp2 = 0.0; if ( lda == 3 && inc_x ==1 ) { for( i = 0; i < ( n & -4 ); i+=4 ) { temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; a_ptr += 12; x_ptr += 4; } for( ; i < n; i++ ) { temp0 += a_ptr[0] * x_ptr[0]; temp1 += a_ptr[1] * x_ptr[0]; temp2 += a_ptr[2] * x_ptr[0]; a_ptr += 3; x_ptr ++; } } else { for( i = 0; i < n; i++ ) { temp0 += a_ptr[0] * x_ptr[0]; temp1 += a_ptr[1] * x_ptr[0]; temp2 += a_ptr[2] * x_ptr[0]; a_ptr += lda; x_ptr += inc_x; } } y_ptr[0] += alpha * temp0; y_ptr += inc_y; y_ptr[0] += alpha * temp1; y_ptr += inc_y; y_ptr[0] += alpha * temp2; return(0); } if ( m3 == 2 ) { a_ptr = a; x_ptr = x; FLOAT temp0 = 0.0; FLOAT temp1 = 0.0; if ( lda == 2 && inc_x ==1 ) { for( i = 0; i < (n & -4) ; i+=4 ) { temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; a_ptr += 8; x_ptr += 4; } for( ; i < n; i++ ) { temp0 += a_ptr[0] * x_ptr[0]; temp1 += a_ptr[1] * x_ptr[0]; a_ptr += 2; x_ptr ++; } } else { for( i = 0; i < n; i++ ) { temp0 += a_ptr[0] * x_ptr[0]; temp1 += a_ptr[1] * x_ptr[0]; a_ptr += lda; x_ptr += inc_x; } } y_ptr[0] += alpha * temp0; y_ptr += inc_y; y_ptr[0] += alpha * temp1; return(0); } if ( m3 == 1 ) { a_ptr = a; x_ptr = x; FLOAT temp = 0.0; if ( lda == 1 && inc_x ==1 ) { for( i = 0; i < (n & -4); i+=4 ) { temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; } for( ; i < n; i++ ) { temp += a_ptr[i] * x_ptr[i]; } } else { for( i = 0; i < n; i++ ) { temp += a_ptr[0] * x_ptr[0]; a_ptr += lda; x_ptr += inc_x; } } y_ptr[0] += alpha * temp; return(0); } return(0); } 
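/*
 * Illustrative reference sketch only -- not part of the OpenBLAS sources in this
 * archive. The dgemv_n variants in this directory (the generic C file above and
 * the Atom and Bulldozer assembly files below) all implement the same column-major,
 * non-transposed update  y += alpha * A * x. The plain-C routine below shows that
 * operation in its simplest form, as a point of comparison for the unrolled and
 * vectorized kernels; the name dgemv_n_reference is hypothetical, and it assumes
 * (as the kernels above appear to) that any beta scaling of y has already been
 * applied by the calling interface layer.
 */
#include <stddef.h>

static void dgemv_n_reference(size_t m, size_t n, double alpha,
                              const double *a, size_t lda,
                              const double *x, size_t inc_x,
                              double *y, size_t inc_y)
{
    /* Column-major walk: scale each column of A by alpha*x[j] and accumulate it
       into y, mirroring the per-column structure of the kernels above. */
    for (size_t j = 0; j < n; j++) {
        double xj = alpha * x[j * inc_x];
        const double *col = a + j * lda;
        for (size_t i = 0; i < m; i++)
            y[i * inc_y] += xj * col[i];
    }
}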
OpenBLAS-0.2.20/kernel/x86_64/dgemv_n_atom.S000066400000000000000000000366241313527062700201600ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "l2param.h" #define PREFETCH prefetchnta #define PREFETCHW prefetcht0 #define PREFETCH_SIZE (8 * 6) #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_INCX 8 + STACKSIZE(%rsp) #define OLD_Y 16 + STACKSIZE(%rsp) #define OLD_INCY 24 + STACKSIZE(%rsp) #define OLD_BUFFER 32 + STACKSIZE(%rsp) #define STACK_ALPHA 48 (%rsp) #define M %rdi #define N %rsi #define A %rcx #define LDA %r8 #define X %r9 #define INCX %rdx #define Y %rbp #define INCY %r10 #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_LDA 48 + STACKSIZE(%rsp) #define OLD_X 56 + STACKSIZE(%rsp) #define OLD_INCX 64 + STACKSIZE(%rsp) #define OLD_Y 72 + STACKSIZE(%rsp) #define OLD_INCY 80 + STACKSIZE(%rsp) #define OLD_BUFFER 88 + STACKSIZE(%rsp) #define STACK_ALPHA 224 (%rsp) #define M %rcx #define N %rdx #define A %r8 #define LDA %r9 #define X %rdi #define INCX %rsi #define Y %rbp #define INCY %r10 #endif #define I %rax #define J %r11 #define A1 %r12 #define A2 %r13 #define Y1 %r14 #define BUFFER %r15 #define MM %rbx #define ALPHA %xmm15 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq OLD_A, A movq OLD_LDA, LDA movq OLD_X, X #endif movq OLD_INCX, INCX movq OLD_Y, Y movq OLD_INCY, INCY movq OLD_BUFFER, BUFFER #ifndef WINDOWS_ABI movsd %xmm0, STACK_ALPHA #else movsd %xmm3, STACK_ALPHA #endif leaq (,INCX, SIZE), INCX leaq (,INCY, SIZE), INCY leaq (,LDA, SIZE), LDA testq N, N jle .L999 testq M, M jle .L999 cmpq $SIZE, INCY cmoveq Y, BUFFER je .L10 movq BUFFER, Y1 xorps %xmm4, %xmm4 movq M, %rax addq $7, %rax sarq $3, %rax ALIGN_3 .L01: movsd %xmm4, 0 * SIZE(Y1) movsd %xmm4, 1 * SIZE(Y1) movsd %xmm4, 2 * SIZE(Y1) movsd %xmm4, 3 * SIZE(Y1) movsd %xmm4, 4 * SIZE(Y1) movsd %xmm4, 5 * SIZE(Y1) movsd %xmm4, 6 * SIZE(Y1) movsd %xmm4, 7 * SIZE(Y1) addq $8 * SIZE, Y1 decq %rax jg .L01 ALIGN_3 .L10: movq N, J sarq $1, J jle .L20 ALIGN_3 .L11: movq BUFFER, Y1 movq A, A1 leaq (A, LDA, 1), A2 leaq (A, LDA, 2), A movsd STACK_ALPHA, %xmm0 movsd (X), %xmm14 addq INCX, X movsd (X), %xmm15 addq INCX, X mulsd %xmm0, %xmm14 mulsd %xmm0, %xmm15 movq M, I sarq $3, I jle .L15 movsd 0 * SIZE(A1), %xmm0 movsd 1 * SIZE(A1), %xmm1 movsd 2 * SIZE(A1), %xmm2 movsd 3 * SIZE(A1), %xmm3 movsd 0 * SIZE(A2), %xmm4 movsd 1 * SIZE(A2), %xmm5 movsd 2 * SIZE(A2), %xmm6 movsd 3 * SIZE(A2), %xmm7 movsd 0 * SIZE(Y1), %xmm8 mulsd %xmm14, %xmm0 movsd 1 * SIZE(Y1), %xmm9 mulsd %xmm14, %xmm1 movsd 2 * SIZE(Y1), %xmm10 mulsd %xmm14, %xmm2 movsd 3 * SIZE(Y1), %xmm11 mulsd %xmm14, %xmm3 decq I jle .L14 ALIGN_3 .L13: PREFETCH PREFETCH_SIZE * SIZE(A1) mulsd %xmm15, %xmm4 PREFETCH PREFETCH_SIZE * SIZE(A2) addsd %xmm0, %xmm8 movsd 4 * SIZE(A1), %xmm0 mulsd %xmm15, %xmm5 addsd %xmm1, %xmm9 movsd 5 * SIZE(A1), %xmm1 mulsd %xmm15, %xmm6 addsd %xmm2, %xmm10 movsd 6 * SIZE(A1), %xmm2 mulsd %xmm15, %xmm7 addsd %xmm3, %xmm11 movsd 7 * SIZE(A1), %xmm3 addsd %xmm4, %xmm8 mulsd %xmm14, %xmm0 movsd 4 * SIZE(A2), %xmm4 addsd %xmm5, %xmm9 mulsd %xmm14, %xmm1 movsd 5 * SIZE(A2), %xmm5 addsd %xmm6, %xmm10 mulsd %xmm14, %xmm2 movsd 6 * SIZE(A2), %xmm6 
addsd %xmm7, %xmm11 mulsd %xmm14, %xmm3 movsd 7 * SIZE(A2), %xmm7 movsd %xmm8, 0 * SIZE(Y1) movsd 4 * SIZE(Y1), %xmm8 movsd %xmm9, 1 * SIZE(Y1) movsd 5 * SIZE(Y1), %xmm9 movsd %xmm10, 2 * SIZE(Y1) movsd 6 * SIZE(Y1), %xmm10 movsd %xmm11, 3 * SIZE(Y1) movsd 7 * SIZE(Y1), %xmm11 mulsd %xmm15, %xmm4 addsd %xmm0, %xmm8 movsd 8 * SIZE(A1), %xmm0 mulsd %xmm15, %xmm5 addsd %xmm1, %xmm9 movsd 9 * SIZE(A1), %xmm1 mulsd %xmm15, %xmm6 addsd %xmm2, %xmm10 movsd 10 * SIZE(A1), %xmm2 mulsd %xmm15, %xmm7 addq $8 * SIZE, A2 addsd %xmm3, %xmm11 movsd 11 * SIZE(A1), %xmm3 mulsd %xmm14, %xmm0 addsd %xmm4, %xmm8 movsd 0 * SIZE(A2), %xmm4 mulsd %xmm14, %xmm1 addq $8 * SIZE, Y1 addsd %xmm5, %xmm9 movsd 1 * SIZE(A2), %xmm5 mulsd %xmm14, %xmm2 addq $8 * SIZE, A1 addsd %xmm6, %xmm10 movsd 2 * SIZE(A2), %xmm6 mulsd %xmm14, %xmm3 decq I addsd %xmm7, %xmm11 movsd 3 * SIZE(A2), %xmm7 movsd %xmm8, -4 * SIZE(Y1) movsd 0 * SIZE(Y1), %xmm8 movsd %xmm9, -3 * SIZE(Y1) movsd 1 * SIZE(Y1), %xmm9 movsd %xmm10,-2 * SIZE(Y1) movsd 2 * SIZE(Y1), %xmm10 movsd %xmm11,-1 * SIZE(Y1) movsd 3 * SIZE(Y1), %xmm11 jg .L13 ALIGN_3 .L14: mulsd %xmm15, %xmm4 addsd %xmm0, %xmm8 movsd 4 * SIZE(A1), %xmm0 mulsd %xmm15, %xmm5 addsd %xmm1, %xmm9 movsd 5 * SIZE(A1), %xmm1 mulsd %xmm15, %xmm6 addsd %xmm2, %xmm10 movsd 6 * SIZE(A1), %xmm2 mulsd %xmm15, %xmm7 addsd %xmm3, %xmm11 movsd 7 * SIZE(A1), %xmm3 addsd %xmm4, %xmm8 mulsd %xmm14, %xmm0 movsd 4 * SIZE(A2), %xmm4 addsd %xmm5, %xmm9 mulsd %xmm14, %xmm1 movsd 5 * SIZE(A2), %xmm5 addsd %xmm6, %xmm10 mulsd %xmm14, %xmm2 movsd 6 * SIZE(A2), %xmm6 addsd %xmm7, %xmm11 mulsd %xmm14, %xmm3 movsd 7 * SIZE(A2), %xmm7 movsd %xmm8, 0 * SIZE(Y1) movsd 4 * SIZE(Y1), %xmm8 movsd %xmm9, 1 * SIZE(Y1) movsd 5 * SIZE(Y1), %xmm9 movsd %xmm10, 2 * SIZE(Y1) movsd 6 * SIZE(Y1), %xmm10 movsd %xmm11, 3 * SIZE(Y1) movsd 7 * SIZE(Y1), %xmm11 mulsd %xmm15, %xmm4 addsd %xmm0, %xmm8 mulsd %xmm15, %xmm5 addsd %xmm1, %xmm9 mulsd %xmm15, %xmm6 addsd %xmm2, %xmm10 mulsd %xmm15, %xmm7 addq $8 * SIZE, A2 addsd %xmm3, %xmm11 mulsd %xmm14, %xmm0 addsd %xmm4, %xmm8 mulsd %xmm14, %xmm1 addq $8 * SIZE, Y1 addsd %xmm5, %xmm9 mulsd %xmm14, %xmm2 addq $8 * SIZE, A1 addsd %xmm6, %xmm10 mulsd %xmm14, %xmm3 addsd %xmm7, %xmm11 movsd %xmm8, -4 * SIZE(Y1) movsd %xmm9, -3 * SIZE(Y1) movsd %xmm10,-2 * SIZE(Y1) movsd %xmm11,-1 * SIZE(Y1) ALIGN_3 .L15: testq $4, M je .L17 movsd 0 * SIZE(A1), %xmm0 movsd 1 * SIZE(A1), %xmm1 movsd 2 * SIZE(A1), %xmm2 movsd 3 * SIZE(A1), %xmm3 movsd 0 * SIZE(A2), %xmm4 movsd 1 * SIZE(A2), %xmm5 movsd 2 * SIZE(A2), %xmm6 movsd 3 * SIZE(A2), %xmm7 movsd 0 * SIZE(Y1), %xmm8 mulsd %xmm14, %xmm0 movsd 1 * SIZE(Y1), %xmm9 mulsd %xmm14, %xmm1 movsd 2 * SIZE(Y1), %xmm10 mulsd %xmm14, %xmm2 movsd 3 * SIZE(Y1), %xmm11 mulsd %xmm14, %xmm3 mulsd %xmm15, %xmm4 addsd %xmm0, %xmm8 mulsd %xmm15, %xmm5 addsd %xmm1, %xmm9 mulsd %xmm15, %xmm6 addsd %xmm2, %xmm10 mulsd %xmm15, %xmm7 addsd %xmm3, %xmm11 addsd %xmm4, %xmm8 addsd %xmm5, %xmm9 addsd %xmm6, %xmm10 addsd %xmm7, %xmm11 movsd %xmm8, 0 * SIZE(Y1) movsd %xmm9, 1 * SIZE(Y1) movsd %xmm10, 2 * SIZE(Y1) movsd %xmm11, 3 * SIZE(Y1) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L17: testq $2, M je .L18 movsd 0 * SIZE(A1), %xmm0 movsd 1 * SIZE(A1), %xmm1 movsd 0 * SIZE(A2), %xmm4 movsd 1 * SIZE(A2), %xmm5 mulsd %xmm14, %xmm0 movsd 0 * SIZE(Y1), %xmm8 mulsd %xmm14, %xmm1 movsd 1 * SIZE(Y1), %xmm9 mulsd %xmm15, %xmm4 mulsd %xmm15, %xmm5 addsd %xmm0, %xmm8 addsd %xmm1, %xmm9 addsd %xmm4, %xmm8 addsd %xmm5, %xmm9 movsd %xmm8, 0 * SIZE(Y1) movsd %xmm9, 1 * SIZE(Y1) addq $2 
* SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L18: testq $1, M je .L19 movsd 0 * SIZE(Y1), %xmm8 movsd 0 * SIZE(A1), %xmm0 movsd 0 * SIZE(A2), %xmm4 mulsd %xmm14, %xmm0 mulsd %xmm15, %xmm4 addsd %xmm0, %xmm8 addsd %xmm4, %xmm8 movsd %xmm8, 0 * SIZE(Y1) ALIGN_3 .L19: decq J jg .L11 ALIGN_3 .L20: testq $1, N je .L990 movq BUFFER, Y1 movq A, A1 movsd (X), %xmm14 mulsd STACK_ALPHA, %xmm14 movq M, I sarq $3, I jle .L25 movsd 0 * SIZE(A1), %xmm0 movsd 1 * SIZE(A1), %xmm1 movsd 2 * SIZE(A1), %xmm2 movsd 3 * SIZE(A1), %xmm3 movsd 4 * SIZE(A1), %xmm4 movsd 5 * SIZE(A1), %xmm5 movsd 6 * SIZE(A1), %xmm6 movsd 7 * SIZE(A1), %xmm7 movsd 0 * SIZE(Y1), %xmm8 mulsd %xmm14, %xmm0 movsd 1 * SIZE(Y1), %xmm9 mulsd %xmm14, %xmm1 movsd 2 * SIZE(Y1), %xmm10 mulsd %xmm14, %xmm2 movsd 3 * SIZE(Y1), %xmm11 mulsd %xmm14, %xmm3 decq I jle .L24 ALIGN_3 .L23: PREFETCH PREFETCH_SIZE * SIZE(A1) addsd %xmm0, %xmm8 movsd 8 * SIZE(A1), %xmm0 addsd %xmm1, %xmm9 movsd 9 * SIZE(A1), %xmm1 addsd %xmm2, %xmm10 movsd 10 * SIZE(A1), %xmm2 addsd %xmm3, %xmm11 movsd 11 * SIZE(A1), %xmm3 movsd %xmm8, 0 * SIZE(Y1) movsd 4 * SIZE(Y1), %xmm8 mulsd %xmm14, %xmm4 movsd %xmm9, 1 * SIZE(Y1) movsd 5 * SIZE(Y1), %xmm9 mulsd %xmm14, %xmm5 movsd %xmm10, 2 * SIZE(Y1) movsd 6 * SIZE(Y1), %xmm10 mulsd %xmm14, %xmm6 movsd %xmm11, 3 * SIZE(Y1) movsd 7 * SIZE(Y1), %xmm11 mulsd %xmm14, %xmm7 addsd %xmm4, %xmm8 movsd 12 * SIZE(A1), %xmm4 addsd %xmm5, %xmm9 movsd 13 * SIZE(A1), %xmm5 addsd %xmm6, %xmm10 movsd 14 * SIZE(A1), %xmm6 addsd %xmm7, %xmm11 movsd 15 * SIZE(A1), %xmm7 movsd %xmm8, 4 * SIZE(Y1) movsd 8 * SIZE(Y1), %xmm8 mulsd %xmm14, %xmm0 movsd %xmm9, 5 * SIZE(Y1) movsd 9 * SIZE(Y1), %xmm9 mulsd %xmm14, %xmm1 movsd %xmm10, 6 * SIZE(Y1) movsd 10 * SIZE(Y1), %xmm10 mulsd %xmm14, %xmm2 movsd %xmm11, 7 * SIZE(Y1) movsd 11 * SIZE(Y1), %xmm11 mulsd %xmm14, %xmm3 addq $8 * SIZE, Y1 addq $8 * SIZE, A1 decq I jg .L23 ALIGN_3 .L24: addsd %xmm0, %xmm8 addsd %xmm1, %xmm9 addsd %xmm2, %xmm10 addsd %xmm3, %xmm11 mulsd %xmm14, %xmm4 movsd %xmm8, 0 * SIZE(Y1) movsd 4 * SIZE(Y1), %xmm8 mulsd %xmm14, %xmm5 movsd %xmm9, 1 * SIZE(Y1) movsd 5 * SIZE(Y1), %xmm9 mulsd %xmm14, %xmm6 movsd %xmm10, 2 * SIZE(Y1) movsd 6 * SIZE(Y1), %xmm10 mulsd %xmm14, %xmm7 movsd %xmm11, 3 * SIZE(Y1) movsd 7 * SIZE(Y1), %xmm11 addsd %xmm4, %xmm8 addsd %xmm5, %xmm9 addsd %xmm6, %xmm10 addsd %xmm7, %xmm11 movsd %xmm8, 4 * SIZE(Y1) movsd %xmm9, 5 * SIZE(Y1) movsd %xmm10, 6 * SIZE(Y1) movsd %xmm11, 7 * SIZE(Y1) addq $8 * SIZE, Y1 addq $8 * SIZE, A1 ALIGN_3 .L25: testq $4, M je .L27 movsd 0 * SIZE(A1), %xmm0 movsd 1 * SIZE(A1), %xmm1 movsd 2 * SIZE(A1), %xmm2 movsd 3 * SIZE(A1), %xmm3 movsd 0 * SIZE(Y1), %xmm8 mulsd %xmm14, %xmm0 movsd 1 * SIZE(Y1), %xmm9 mulsd %xmm14, %xmm1 movsd 2 * SIZE(Y1), %xmm10 mulsd %xmm14, %xmm2 movsd 3 * SIZE(Y1), %xmm11 mulsd %xmm14, %xmm3 addsd %xmm0, %xmm8 addsd %xmm1, %xmm9 addsd %xmm2, %xmm10 addsd %xmm3, %xmm11 movsd %xmm8, 0 * SIZE(Y1) movsd %xmm9, 1 * SIZE(Y1) movsd %xmm10, 2 * SIZE(Y1) movsd %xmm11, 3 * SIZE(Y1) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L27: testq $2, M je .L28 movsd 0 * SIZE(A1), %xmm0 movsd 1 * SIZE(A1), %xmm1 mulsd %xmm14, %xmm0 movsd 0 * SIZE(Y1), %xmm8 mulsd %xmm14, %xmm1 movsd 1 * SIZE(Y1), %xmm9 addsd %xmm0, %xmm8 addsd %xmm1, %xmm9 movsd %xmm8, 0 * SIZE(Y1) movsd %xmm9, 1 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, Y1 ALIGN_3 .L28: testq $1, M je .L990 movsd 0 * SIZE(Y1), %xmm8 movsd 0 * SIZE(A1), %xmm0 mulsd %xmm14, %xmm0 addsd %xmm0, %xmm8 movsd %xmm8, 0 * SIZE(Y1) ALIGN_3 .L990: cmpq $SIZE, 
INCY je .L999 movq Y, Y1 movq M, %rax sarq $2, %rax jle .L994 ALIGN_3 .L992: movsd (Y), %xmm0 addq INCY, Y movsd (Y), %xmm1 addq INCY, Y movsd (Y), %xmm2 addq INCY, Y movsd (Y), %xmm3 addq INCY, Y addsd 0 * SIZE(BUFFER), %xmm0 addsd 1 * SIZE(BUFFER), %xmm1 addsd 2 * SIZE(BUFFER), %xmm2 addsd 3 * SIZE(BUFFER), %xmm3 addq $4 * SIZE, BUFFER movsd %xmm0, (Y1) addq INCY, Y1 movsd %xmm1, (Y1) addq INCY, Y1 movsd %xmm2, (Y1) addq INCY, Y1 movsd %xmm3, (Y1) addq INCY, Y1 decq %rax jg .L992 ALIGN_3 .L994: testq $2, M jle .L996 movsd (Y), %xmm0 addq INCY, Y movsd (Y), %xmm1 addq INCY, Y addsd 0 * SIZE(BUFFER), %xmm0 addsd 1 * SIZE(BUFFER), %xmm1 addq $2 * SIZE, BUFFER movsd %xmm0, (Y1) addq INCY, Y1 movsd %xmm1, (Y1) addq INCY, Y1 ALIGN_3 .L996: testq $1, M jle .L999 movsd (Y), %xmm0 addsd (BUFFER), %xmm0 movsd %xmm0, (Y1) ALIGN_3 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/dgemv_n_bulldozer.S000066400000000000000000001402511313527062700212120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "l2param.h" #undef ALIGNED_ACCESS #define A_PRE 256 #define VMOVUPS_A1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS #define VMOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) vmovups OFF(ADDR, BASE, SCALE), REGS #define VMOVUPS_YL1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS #define VMOVUPS_YS1(OFF, ADDR, REGS) vmovups REGS, OFF(ADDR) #if GEMV_UNROLL < 2 #undef GEMV_UNROLL #define GEMV_UNROLL 2 #endif #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_M %rdi #define OLD_N %rsi #define OLD_A %rcx #define OLD_LDA %r8 #define STACK_INCX 8 + STACKSIZE(%rsp) #define STACK_Y 16 + STACKSIZE(%rsp) #define STACK_INCY 24 + STACKSIZE(%rsp) #define STACK_BUFFER 32 + STACKSIZE(%rsp) #define ALPHA 48 (%rsp) #else #define STACKSIZE 256 #define OLD_M %rcx #define OLD_N %rdx #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_LDA 48 + STACKSIZE(%rsp) #define OLD_X 56 + STACKSIZE(%rsp) #define STACK_INCX 64 + STACKSIZE(%rsp) #define STACK_Y 72 + STACKSIZE(%rsp) #define STACK_INCY 80 + STACKSIZE(%rsp) #define STACK_BUFFER 88 + STACKSIZE(%rsp) #define ALPHA 224 (%rsp) #endif #define LDA %r8 #define X %r9 #define INCX %rsi #define INCY %rdi #define M %r10 #define N %r11 #define A %r12 #define Y %r14 #define BUFFER %r13 #define I %rax #define A1 %rbx #define A2 %rcx #define LDA3 %rdx #define Y1 %rbp #ifdef ALIGNED_ACCESS #define MM %r15 #else #define MM M #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) vmovups %xmm6, 64(%rsp) vmovups %xmm7, 80(%rsp) vmovups %xmm8, 96(%rsp) vmovups %xmm9, 112(%rsp) vmovups %xmm10, 128(%rsp) vmovups %xmm11, 144(%rsp) vmovups %xmm12, 160(%rsp) vmovups %xmm13, 176(%rsp) vmovups %xmm14, 192(%rsp) vmovups %xmm15, 208(%rsp) movq OLD_M, M movq OLD_N, N movq OLD_A, A movq OLD_LDA, LDA movq OLD_X, X #else movq OLD_M, M movq OLD_N, N movq OLD_A, A movq OLD_LDA, LDA #endif movq STACK_INCX, INCX movq STACK_Y, Y movq STACK_INCY, INCY movq STACK_BUFFER, BUFFER #ifndef WINDOWS_ABI vmovsd %xmm0, ALPHA #else vmovsd %xmm3, ALPHA #endif leaq -1(INCY), %rax leaq (,INCX, SIZE), INCX leaq (,INCY, SIZE), INCY leaq (,LDA, SIZE), LDA leaq (LDA, LDA, 2), LDA3 subq $-16 * SIZE, A #ifdef ALIGNED_ACCESS leaq -1 (M), MM testq $SIZE, A cmoveq M, MM #endif testq N, N # if n <= 0 goto END jle .L999 testq M, M # if n <= 0 goto END jle .L999 #if !defined(COPY_FORCE) && !defined(ALIGNED_ACCESS) #ifndef NOCOPY_UNALIGNED movq Y, Y1 andq $0xf, Y1 orq Y1, %rax #endif testq %rax, %rax cmoveq Y, BUFFER je .L10 #endif movq BUFFER, Y1 vxorpd %xmm4, %xmm4, %xmm4 movq M, %rax addq $16, %rax sarq $4, %rax ALIGN_3 .L01: vmovups %xmm4, 0 * SIZE(Y1) vmovups %xmm4, 2 * SIZE(Y1) vmovups %xmm4, 4 * SIZE(Y1) vmovups %xmm4, 6 * SIZE(Y1) vmovups %xmm4, 8 * SIZE(Y1) vmovups %xmm4, 10 * SIZE(Y1) vmovups %xmm4, 12 * SIZE(Y1) vmovups %xmm4, 14 * SIZE(Y1) subq $-16 * SIZE, Y1 decq %rax jg .L01 ALIGN_3 .L10: #ifdef ALIGNED_ACCESS leaq SIZE(BUFFER), %rax testq $SIZE, A cmovne %rax, BUFFER testq $SIZE, LDA jne .L50 #endif #if GEMV_UNROLL >= 8 cmpq $8, N jl .L20 ALIGN_3 .L11: subq $8, N leaq 16 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 4), A2 leaq (A, LDA, 8), A vmovddup (X), %xmm8 addq INCX, X vmovddup (X), %xmm9 addq INCX, X vmovddup (X), %xmm10 addq INCX, X vmovddup (X), %xmm11 addq INCX, X vmovddup (X), %xmm12 addq INCX, X vmovddup (X), %xmm13 addq INCX, X 
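/* the last two of the eight broadcast column scalars (xmm14, xmm15) are
   loaded below, then all eight (xmm8..xmm15) are pre-scaled by alpha with
   vmulpd so the unrolled loops need only fused multiply-adds */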
vmovddup (X), %xmm14 addq INCX, X vmovddup (X), %xmm15 addq INCX, X vmovddup ALPHA, %xmm0 vmulpd %xmm0, %xmm8 , %xmm8 vmulpd %xmm0, %xmm9 , %xmm9 vmulpd %xmm0, %xmm10 , %xmm10 vmulpd %xmm0, %xmm11 , %xmm11 vmulpd %xmm0, %xmm12 , %xmm12 vmulpd %xmm0, %xmm13 , %xmm13 vmulpd %xmm0, %xmm14 , %xmm14 vmulpd %xmm0, %xmm15 , %xmm15 #ifdef ALIGNED_ACCESS testq $SIZE, A je .L1X vmovsd -16 * SIZE(Y1), %xmm0 vmovsd -16 * SIZE(A1), %xmm4 vmovsd -16 * SIZE(A1, LDA), %xmm5 vmovsd -16 * SIZE(A1, LDA, 2), %xmm6 vmovsd -16 * SIZE(A1, LDA3), %xmm7 vfmaddsd %xmm0 , %xmm8 , %xmm4 , %xmm0 vfmaddsd %xmm0 , %xmm9 , %xmm5 , %xmm0 vfmaddsd %xmm0 , %xmm10, %xmm6 , %xmm0 vfmaddsd %xmm0 , %xmm11, %xmm7 , %xmm0 vmovsd -16 * SIZE(A2), %xmm4 vmovsd -16 * SIZE(A2, LDA), %xmm5 vmovsd -16 * SIZE(A2, LDA, 2), %xmm6 vmovsd -16 * SIZE(A2, LDA3), %xmm7 vfmaddsd %xmm0 , %xmm12, %xmm4 , %xmm0 vfmaddsd %xmm0 , %xmm13, %xmm5 , %xmm0 vfmaddsd %xmm0 , %xmm14, %xmm6 , %xmm0 vfmaddsd %xmm0 , %xmm15, %xmm7 , %xmm0 vmovsd %xmm0, -16 * SIZE(Y1) addq $SIZE, A1 addq $SIZE, A2 addq $SIZE, Y1 ALIGN_3 .L1X: #endif movq MM, I sarq $3, I jle .L15 VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1) VMOVUPS_YL1(-12 * SIZE, Y1, %xmm2) VMOVUPS_YL1(-10 * SIZE, Y1, %xmm3) decq I jle .L14 ALIGN_5 .L13: prefetchnta A_PRE(A1) vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm8, %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A1) , %xmm8, %xmm1 vfmaddpd %xmm2 , -12 * SIZE(A1) , %xmm8, %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A1) , %xmm8, %xmm3 nop prefetchnta A_PRE(A1,LDA,1) vfmaddpd %xmm0 , -16 * SIZE(A1, LDA, 1) , %xmm9 , %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A1, LDA, 1) , %xmm9 , %xmm1 vfmaddpd %xmm2 , -12 * SIZE(A1, LDA, 1) , %xmm9 , %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A1, LDA, 1) , %xmm9 , %xmm3 prefetchnta A_PRE(A1,LDA,2) vfmaddpd %xmm0 , -16 * SIZE(A1, LDA, 2) , %xmm10, %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A1, LDA, 2) , %xmm10, %xmm1 vfmaddpd %xmm2 , -12 * SIZE(A1, LDA, 2) , %xmm10, %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A1, LDA, 2) , %xmm10, %xmm3 prefetchnta A_PRE(A1,LDA3,1) vfmaddpd %xmm0 , -16 * SIZE(A1, LDA3, 1) , %xmm11, %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A1, LDA3, 1) , %xmm11, %xmm1 vfmaddpd %xmm2 , -12 * SIZE(A1, LDA3, 1) , %xmm11, %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A1, LDA3, 1) , %xmm11, %xmm3 prefetchnta A_PRE(A2) vfmaddpd %xmm0 , -16 * SIZE(A2) , %xmm12, %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A2) , %xmm12, %xmm1 vfmaddpd %xmm2 , -12 * SIZE(A2) , %xmm12, %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A2) , %xmm12, %xmm3 nop prefetchnta A_PRE(A2,LDA,1) vfmaddpd %xmm0 , -16 * SIZE(A2, LDA, 1) , %xmm13, %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A2, LDA, 1) , %xmm13, %xmm1 vfmaddpd %xmm2 , -12 * SIZE(A2, LDA, 1) , %xmm13, %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A2, LDA, 1) , %xmm13, %xmm3 prefetchnta A_PRE(A2,LDA,2) vfmaddpd %xmm0 , -16 * SIZE(A2, LDA, 2) , %xmm14, %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A2, LDA, 2) , %xmm14, %xmm1 vfmaddpd %xmm2 , -12 * SIZE(A2, LDA, 2) , %xmm14, %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A2, LDA, 2) , %xmm14, %xmm3 prefetchnta A_PRE(A2,LDA3,1) vfmaddpd %xmm0 , -16 * SIZE(A2, LDA3, 1) , %xmm15, %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A2, LDA3, 1) , %xmm15, %xmm1 vfmaddpd %xmm2 , -12 * SIZE(A2, LDA3, 1) , %xmm15, %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A2, LDA3, 1) , %xmm15, %xmm3 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2) VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3) VMOVUPS_YL1( -8 * SIZE, Y1, %xmm0) VMOVUPS_YL1( -6 * SIZE, Y1, %xmm1) prefetchnta A_PRE(Y1) VMOVUPS_YL1( -4 * SIZE, Y1, %xmm2) VMOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 
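/* A2, Y1 and the iteration count are advanced below; each pass of .L13
   updates eight y values against the eight active columns with FMA4
   vfmaddpd, prefetching the A columns and Y1 ahead of use */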
subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L13 ALIGN_3 .L14: vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm8, %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A1) , %xmm8, %xmm1 vfmaddpd %xmm2 , -12 * SIZE(A1) , %xmm8, %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A1) , %xmm8, %xmm3 vfmaddpd %xmm0 , -16 * SIZE(A1, LDA, 1) , %xmm9 , %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A1, LDA, 1) , %xmm9 , %xmm1 vfmaddpd %xmm2 , -12 * SIZE(A1, LDA, 1) , %xmm9 , %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A1, LDA, 1) , %xmm9 , %xmm3 vfmaddpd %xmm0 , -16 * SIZE(A1, LDA, 2) , %xmm10, %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A1, LDA, 2) , %xmm10, %xmm1 vfmaddpd %xmm2 , -12 * SIZE(A1, LDA, 2) , %xmm10, %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A1, LDA, 2) , %xmm10, %xmm3 vfmaddpd %xmm0 , -16 * SIZE(A1, LDA3, 1) , %xmm11, %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A1, LDA3, 1) , %xmm11, %xmm1 vfmaddpd %xmm2 , -12 * SIZE(A1, LDA3, 1) , %xmm11, %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A1, LDA3, 1) , %xmm11, %xmm3 vfmaddpd %xmm0 , -16 * SIZE(A2) , %xmm12, %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A2) , %xmm12, %xmm1 vfmaddpd %xmm2 , -12 * SIZE(A2) , %xmm12, %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A2) , %xmm12, %xmm3 vfmaddpd %xmm0 , -16 * SIZE(A2, LDA, 1) , %xmm13, %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A2, LDA, 1) , %xmm13, %xmm1 vfmaddpd %xmm2 , -12 * SIZE(A2, LDA, 1) , %xmm13, %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A2, LDA, 1) , %xmm13, %xmm3 vfmaddpd %xmm0 , -16 * SIZE(A2, LDA, 2) , %xmm14, %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A2, LDA, 2) , %xmm14, %xmm1 vfmaddpd %xmm2 , -12 * SIZE(A2, LDA, 2) , %xmm14, %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A2, LDA, 2) , %xmm14, %xmm3 vfmaddpd %xmm0 , -16 * SIZE(A2, LDA3, 1) , %xmm15, %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A2, LDA3, 1) , %xmm15, %xmm1 vfmaddpd %xmm2 , -12 * SIZE(A2, LDA3, 1) , %xmm15, %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A2, LDA3, 1) , %xmm15, %xmm3 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2) VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 ALIGN_3 .L15: testq $4, MM je .L16 VMOVUPS_A1(-16 * SIZE, A1, %xmm4) VMOVUPS_A1(-14 * SIZE, A1, %xmm5) VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm6) VMOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm7) VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1) vfmaddpd %xmm0 , %xmm8 , %xmm4 , %xmm0 vfmaddpd %xmm1 , %xmm8 , %xmm5 , %xmm1 vfmaddpd %xmm0 , %xmm9 , %xmm6 , %xmm0 vfmaddpd %xmm1 , %xmm9 , %xmm7 , %xmm1 VMOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4) VMOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5) VMOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm6) VMOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm7) vfmaddpd %xmm0 , %xmm10, %xmm4 , %xmm0 vfmaddpd %xmm1 , %xmm10, %xmm5 , %xmm1 vfmaddpd %xmm0 , %xmm11, %xmm6 , %xmm0 vfmaddpd %xmm1 , %xmm11, %xmm7 , %xmm1 VMOVUPS_A1(-16 * SIZE, A2, %xmm4) VMOVUPS_A1(-14 * SIZE, A2, %xmm5) VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm6) VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm7) vfmaddpd %xmm0 , %xmm12, %xmm4 , %xmm0 vfmaddpd %xmm1 , %xmm12, %xmm5 , %xmm1 vfmaddpd %xmm0 , %xmm13, %xmm6 , %xmm0 vfmaddpd %xmm1 , %xmm13, %xmm7 , %xmm1 VMOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4) VMOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5) VMOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm6) VMOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm7) vfmaddpd %xmm0 , %xmm14, %xmm4 , %xmm0 vfmaddpd %xmm1 , %xmm14, %xmm5 , %xmm1 vfmaddpd %xmm0 , %xmm15, %xmm6 , %xmm0 vfmaddpd %xmm1 , %xmm15, %xmm7 , %xmm1 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L16: testq $2, MM je .L17 
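/* two remaining rows of the eight-column block: one 128-bit load per
   column feeds a single vfmaddpd chain accumulating into xmm0 before the
   store back to Y1 */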
VMOVUPS_A1(-16 * SIZE, A1, %xmm4) VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm5) VMOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm6) VMOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm7) VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) vfmaddpd %xmm0 , %xmm8 , %xmm4 , %xmm0 vfmaddpd %xmm0 , %xmm9 , %xmm5 , %xmm0 vfmaddpd %xmm0 , %xmm10, %xmm6 , %xmm0 vfmaddpd %xmm0 , %xmm11, %xmm7 , %xmm0 VMOVUPS_A1(-16 * SIZE, A2, %xmm4) VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm5) VMOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm6) VMOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm7) vfmaddpd %xmm0 , %xmm12, %xmm4 , %xmm0 vfmaddpd %xmm0 , %xmm13, %xmm5 , %xmm0 vfmaddpd %xmm0 , %xmm14, %xmm6 , %xmm0 vfmaddpd %xmm0 , %xmm15, %xmm7 , %xmm0 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L17: testq $1, MM je .L18 vmovsd -16 * SIZE(A1), %xmm4 vmovsd -16 * SIZE(A1, LDA), %xmm5 vmovsd -16 * SIZE(A1, LDA, 2), %xmm6 vmovsd -16 * SIZE(A1, LDA3), %xmm7 vmovsd -16 * SIZE(Y1), %xmm0 vfmaddsd %xmm0 , %xmm8 , %xmm4 , %xmm0 vfmaddsd %xmm0 , %xmm9 , %xmm5 , %xmm0 vfmaddsd %xmm0 , %xmm10, %xmm6 , %xmm0 vfmaddsd %xmm0 , %xmm11, %xmm7 , %xmm0 vmovsd -16 * SIZE(A2), %xmm4 vmovsd -16 * SIZE(A2, LDA), %xmm5 vmovsd -16 * SIZE(A2, LDA, 2), %xmm6 vmovsd -16 * SIZE(A2, LDA3), %xmm7 vfmaddsd %xmm0 , %xmm12, %xmm4 , %xmm0 vfmaddsd %xmm0 , %xmm13, %xmm5 , %xmm0 vfmaddsd %xmm0 , %xmm14, %xmm6 , %xmm0 vfmaddsd %xmm0 , %xmm15, %xmm7 , %xmm0 vmovsd %xmm0, -16 * SIZE(Y1) ALIGN_3 .L18: cmpq $8, N jge .L11 ALIGN_3 .L20: #endif #if GEMV_UNROLL >= 4 cmpq $4, N jl .L30 #if GEMV_UNROLL == 4 ALIGN_3 .L21: #endif subq $4, N leaq 16 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 2), A2 leaq (A, LDA, 4), A vmovddup (X), %xmm12 addq INCX, X vmovddup (X), %xmm13 addq INCX, X vmovddup (X), %xmm14 addq INCX, X vmovddup (X), %xmm15 addq INCX, X vmovddup ALPHA, %xmm0 vmulpd %xmm0, %xmm12 , %xmm12 vmulpd %xmm0, %xmm13 , %xmm13 vmulpd %xmm0, %xmm14 , %xmm14 vmulpd %xmm0, %xmm15 , %xmm15 #ifdef ALIGNED_ACCESS testq $SIZE, A je .L2X vmovsd -16 * SIZE(A1), %xmm4 vmovsd -16 * SIZE(A1, LDA), %xmm5 vmovsd -16 * SIZE(A2), %xmm6 vmovsd -16 * SIZE(A2, LDA), %xmm7 vmovsd -16 * SIZE(Y1), %xmm0 vfmaddsd %xmm0 , %xmm12, %xmm4 , %xmm0 vfmaddsd %xmm0 , %xmm13, %xmm5 , %xmm0 vfmaddsd %xmm0 , %xmm14, %xmm6 , %xmm0 vfmaddsd %xmm0 , %xmm15, %xmm7 , %xmm0 vmovsd %xmm0, -16 * SIZE(Y1) addq $SIZE, A1 addq $SIZE, A2 addq $SIZE, Y1 ALIGN_3 .L2X: #endif movq MM, I sarq $3, I jle .L25 VMOVUPS_A1(-16 * SIZE, A1, %xmm0) VMOVUPS_A1(-14 * SIZE, A1, %xmm1) VMOVUPS_A1(-12 * SIZE, A1, %xmm2) VMOVUPS_A1(-10 * SIZE, A1, %xmm3) VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) VMOVUPS_YL1(-12 * SIZE, Y1, %xmm10) VMOVUPS_YL1(-10 * SIZE, Y1, %xmm11) VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) VMOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) VMOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6) VMOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7) decq I jle .L24 ALIGN_3 .L23: vfmaddpd %xmm8 , %xmm12, %xmm0 , %xmm8 vfmaddpd %xmm9 , %xmm12, %xmm1 , %xmm9 vfmaddpd %xmm10, %xmm12, %xmm2 , %xmm10 vfmaddpd %xmm11, %xmm12, %xmm3 , %xmm11 VMOVUPS_A1(-16 * SIZE, A2, %xmm0) VMOVUPS_A1(-14 * SIZE, A2, %xmm1) prefetchnta A_PRE(A2) VMOVUPS_A1(-12 * SIZE, A2, %xmm2) VMOVUPS_A1(-10 * SIZE, A2, %xmm3) vfmaddpd %xmm8 , %xmm13, %xmm4 , %xmm8 vfmaddpd %xmm9 , %xmm13, %xmm5 , %xmm9 vfmaddpd %xmm10, %xmm13, %xmm6 , %xmm10 vfmaddpd %xmm11, %xmm13, %xmm7 , %xmm11 VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) prefetchnta A_PRE(A2, LDA, 1) VMOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) VMOVUPS_A2(-10 * SIZE, 
A2, LDA, 1, %xmm7) vfmaddpd %xmm8 , %xmm14, %xmm0 , %xmm8 vfmaddpd %xmm9 , %xmm14, %xmm1 , %xmm9 vfmaddpd %xmm10, %xmm14, %xmm2 , %xmm10 vfmaddpd %xmm11, %xmm14, %xmm3 , %xmm11 VMOVUPS_A1( -8 * SIZE, A1, %xmm0) VMOVUPS_A1( -6 * SIZE, A1, %xmm1) prefetchnta A_PRE(A1) VMOVUPS_A1( -4 * SIZE, A1, %xmm2) VMOVUPS_A1( -2 * SIZE, A1, %xmm3) vfmaddpd %xmm8 , %xmm15, %xmm4 , %xmm8 vfmaddpd %xmm9 , %xmm15, %xmm5 , %xmm9 vfmaddpd %xmm10, %xmm15, %xmm6 , %xmm10 vfmaddpd %xmm11, %xmm15, %xmm7 , %xmm11 VMOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm4) VMOVUPS_A2( -6 * SIZE, A1, LDA, 1, %xmm5) prefetchnta A_PRE(A1, LDA, 1) VMOVUPS_A2( -4 * SIZE, A1, LDA, 1, %xmm6) VMOVUPS_A2( -2 * SIZE, A1, LDA, 1, %xmm7) VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) VMOVUPS_YL1( -8 * SIZE, Y1, %xmm8) VMOVUPS_YL1( -6 * SIZE, Y1, %xmm9) prefetchnta A_PRE(Y1) VMOVUPS_YL1( -4 * SIZE, Y1, %xmm10) VMOVUPS_YL1( -2 * SIZE, Y1, %xmm11) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L23 ALIGN_3 .L24: vfmaddpd %xmm8 , %xmm12, %xmm0 , %xmm8 vfmaddpd %xmm9 , %xmm12, %xmm1 , %xmm9 vfmaddpd %xmm10, %xmm12, %xmm2 , %xmm10 vfmaddpd %xmm11, %xmm12, %xmm3 , %xmm11 VMOVUPS_A1(-16 * SIZE, A2, %xmm0) VMOVUPS_A1(-14 * SIZE, A2, %xmm1) VMOVUPS_A1(-12 * SIZE, A2, %xmm2) VMOVUPS_A1(-10 * SIZE, A2, %xmm3) vfmaddpd %xmm8 , %xmm13, %xmm4 , %xmm8 vfmaddpd %xmm9 , %xmm13, %xmm5 , %xmm9 vfmaddpd %xmm10, %xmm13, %xmm6 , %xmm10 vfmaddpd %xmm11, %xmm13, %xmm7 , %xmm11 VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) VMOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) VMOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) vfmaddpd %xmm8 , %xmm14, %xmm0 , %xmm8 vfmaddpd %xmm9 , %xmm14, %xmm1 , %xmm9 vfmaddpd %xmm10, %xmm14, %xmm2 , %xmm10 vfmaddpd %xmm11, %xmm14, %xmm3 , %xmm11 vfmaddpd %xmm8 , %xmm15, %xmm4 , %xmm8 vfmaddpd %xmm9 , %xmm15, %xmm5 , %xmm9 vfmaddpd %xmm10, %xmm15, %xmm6 , %xmm10 vfmaddpd %xmm11, %xmm15, %xmm7 , %xmm11 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 ALIGN_3 .L25: testq $4, MM je .L26 VMOVUPS_A1(-16 * SIZE, A1, %xmm0) VMOVUPS_A1(-14 * SIZE, A1, %xmm1) VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) vfmaddpd %xmm8 , %xmm12, %xmm0 , %xmm8 vfmaddpd %xmm9 , %xmm12, %xmm1 , %xmm9 VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) VMOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) vfmaddpd %xmm8 , %xmm13, %xmm4 , %xmm8 vfmaddpd %xmm9 , %xmm13, %xmm5 , %xmm9 VMOVUPS_A1(-16 * SIZE, A2, %xmm0) VMOVUPS_A1(-14 * SIZE, A2, %xmm1) vfmaddpd %xmm8 , %xmm14, %xmm0 , %xmm8 vfmaddpd %xmm9 , %xmm14, %xmm1 , %xmm9 VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) vfmaddpd %xmm8 , %xmm15, %xmm4 , %xmm8 vfmaddpd %xmm9 , %xmm15, %xmm5 , %xmm9 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L26: testq $2, MM je .L27 VMOVUPS_A1(-16 * SIZE, A1, %xmm8) VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) VMOVUPS_A1(-16 * SIZE, A2, %xmm10) VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11) VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) vfmaddpd %xmm0 , %xmm12, %xmm8 , %xmm0 vfmaddpd %xmm0 , %xmm13, %xmm9 , %xmm0 vfmaddpd %xmm0 , %xmm14, %xmm10, %xmm0 vfmaddpd %xmm0 , %xmm15, %xmm11, %xmm0 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 
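/* Illustrative reference only (not part of the build): a minimal C sketch
   of the update each column block of this kernel performs on the
   contiguous buffer, assuming column-major A with leading dimension lda;
   the function and parameter names are hypothetical.

   static void dgemv_n_ref(long m, long ncols, double alpha,
                           const double *a, long lda,
                           const double *x, long incx, double *y)
   {
       for (long j = 0; j < ncols; j++) {
           double xj = alpha * x[j * incx];   // pre-scaled column scalar
           for (long i = 0; i < m; i++)
               y[i] += xj * a[i + j * lda];   // y accumulates alpha*A*x
       }
   }
*/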
ALIGN_3 .L27: testq $1, MM #if GEMV_UNROLL == 4 je .L28 #else je .L30 #endif vmovsd -16 * SIZE(Y1), %xmm0 vmovsd -16 * SIZE(A1), %xmm8 vmovsd -16 * SIZE(A1, LDA), %xmm9 vmovsd -16 * SIZE(A2), %xmm10 vmovsd -16 * SIZE(A2, LDA), %xmm11 vfmaddsd %xmm0 , %xmm12, %xmm8 , %xmm0 vfmaddsd %xmm0 , %xmm13, %xmm9 , %xmm0 vfmaddsd %xmm0 , %xmm14, %xmm10, %xmm0 vfmaddsd %xmm0 , %xmm15, %xmm11, %xmm0 vmovsd %xmm0, -16 * SIZE(Y1) ALIGN_3 #if GEMV_UNROLL == 4 .L28: cmpq $4, N jge .L21 ALIGN_3 #endif .L30: #endif #if GEMV_UNROLL >= 2 cmpq $2, N jl .L40 #if GEMV_UNROLL == 2 ALIGN_3 .L31: #endif subq $2, N leaq 16 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA), A2 leaq (A, LDA, 2), A vmovddup (X), %xmm12 addq INCX, X vmovddup (X), %xmm13 addq INCX, X vmovddup ALPHA, %xmm0 vmulpd %xmm0, %xmm12 , %xmm12 vmulpd %xmm0, %xmm13 , %xmm13 #ifdef ALIGNED_ACCESS testq $SIZE, A je .L3X vmovsd -16 * SIZE(A1), %xmm4 vmovsd -16 * SIZE(A2), %xmm5 vmovsd -16 * SIZE(Y1), %xmm0 vfmaddsd %xmm0 , %xmm12, %xmm4 , %xmm0 vfmaddsd %xmm0 , %xmm13, %xmm5 , %xmm0 vmovsd %xmm0, -16 * SIZE(Y1) addq $SIZE, A1 addq $SIZE, A2 addq $SIZE, Y1 ALIGN_3 .L3X: #endif movq MM, I sarq $3, I jle .L35 VMOVUPS_A1(-16 * SIZE, A1, %xmm0) VMOVUPS_A1(-14 * SIZE, A1, %xmm1) VMOVUPS_A1(-12 * SIZE, A1, %xmm2) VMOVUPS_A1(-10 * SIZE, A1, %xmm3) VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) VMOVUPS_YL1(-12 * SIZE, Y1, %xmm10) VMOVUPS_YL1(-10 * SIZE, Y1, %xmm11) VMOVUPS_A1(-16 * SIZE, A2, %xmm4) VMOVUPS_A1(-14 * SIZE, A2, %xmm5) VMOVUPS_A1(-12 * SIZE, A2, %xmm6) VMOVUPS_A1(-10 * SIZE, A2, %xmm7) decq I jle .L34 ALIGN_3 .L33: vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10 vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11 vmovups -8 * SIZE(A1), %xmm0 vmovups -6 * SIZE(A1), %xmm1 prefetchnta A_PRE(A1) vmovups -4 * SIZE(A1), %xmm2 vmovups -2 * SIZE(A1), %xmm3 vfmaddpd %xmm8 , %xmm13 , %xmm4 , %xmm8 vfmaddpd %xmm9 , %xmm13 , %xmm5 , %xmm9 prefetchnta A_PRE(A2) vfmaddpd %xmm10, %xmm13 , %xmm6 , %xmm10 vfmaddpd %xmm11, %xmm13 , %xmm7 , %xmm11 vmovups -8 * SIZE(A2), %xmm4 vmovups -6 * SIZE(A2), %xmm5 vmovups -4 * SIZE(A2), %xmm6 vmovups -2 * SIZE(A2) , %xmm7 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) VMOVUPS_YL1( -8 * SIZE, Y1, %xmm8) VMOVUPS_YL1( -6 * SIZE, Y1, %xmm9) prefetchnta A_PRE(Y1) VMOVUPS_YL1( -4 * SIZE, Y1, %xmm10) VMOVUPS_YL1( -2 * SIZE, Y1, %xmm11) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L33 ALIGN_3 .L34: vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10 vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11 vfmaddpd %xmm8 , %xmm13 , %xmm4 , %xmm8 vfmaddpd %xmm9 , %xmm13 , %xmm5 , %xmm9 vfmaddpd %xmm10, %xmm13 , %xmm6 , %xmm10 vfmaddpd %xmm11, %xmm13 , %xmm7 , %xmm11 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 ALIGN_3 .L35: testq $4, MM je .L36 VMOVUPS_A1(-16 * SIZE, A1, %xmm0) VMOVUPS_A1(-14 * SIZE, A1, %xmm1) VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 VMOVUPS_A1(-16 * SIZE, A2, %xmm4) VMOVUPS_A1(-14 * SIZE, A2, %xmm5) vfmaddpd %xmm8 , %xmm13 , %xmm4 , %xmm8 vfmaddpd %xmm9 , %xmm13 , %xmm5 , %xmm9 VMOVUPS_YS1(-16 * 
SIZE, Y1, %xmm8) VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L36: testq $2, MM je .L37 VMOVUPS_A1(-16 * SIZE, A1, %xmm8) VMOVUPS_A1(-16 * SIZE, A2, %xmm9) VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) vfmaddpd %xmm0 , %xmm12 , %xmm8 , %xmm0 vfmaddpd %xmm0 , %xmm13 , %xmm9 , %xmm0 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L37: testq $1, MM #if GEMV_UNROLL == 2 je .L38 #else je .L40 #endif vmovsd -16 * SIZE(Y1), %xmm0 vmovsd -16 * SIZE(A1), %xmm8 vmovsd -16 * SIZE(A2), %xmm9 vfmaddsd %xmm0 , %xmm12 , %xmm8 , %xmm0 vfmaddsd %xmm0 , %xmm13 , %xmm9 , %xmm0 vmovsd %xmm0, -16 * SIZE(Y1) ALIGN_3 #if GEMV_UNROLL == 2 .L38: cmpq $2, N jge .L31 ALIGN_3 #endif .L40: cmpq $1, N jl .L900 #endif leaq 16 * SIZE(BUFFER), Y1 movq A, A1 vmovddup (X), %xmm12 addq INCX, X vmovddup ALPHA, %xmm0 vmulpd %xmm0, %xmm12 , %xmm12 #ifdef ALIGNED_ACCESS testq $SIZE, A je .L4X vmovsd -16 * SIZE(A1), %xmm4 vmovsd -16 * SIZE(Y1), %xmm0 vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0 vmovsd %xmm0, -16 * SIZE(Y1) addq $SIZE, A1 addq $SIZE, Y1 ALIGN_3 .L4X: #endif movq MM, I sarq $3, I jle .L45 VMOVUPS_A1(-16 * SIZE, A1, %xmm0) VMOVUPS_A1(-14 * SIZE, A1, %xmm1) VMOVUPS_A1(-12 * SIZE, A1, %xmm2) VMOVUPS_A1(-10 * SIZE, A1, %xmm3) VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) VMOVUPS_YL1(-12 * SIZE, Y1, %xmm10) VMOVUPS_YL1(-10 * SIZE, Y1, %xmm11) decq I jle .L44 ALIGN_3 .L43: vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10 vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11 VMOVUPS_A1( -8 * SIZE, A1, %xmm0) VMOVUPS_A1( -6 * SIZE, A1, %xmm1) VMOVUPS_A1( -4 * SIZE, A1, %xmm2) VMOVUPS_A1( -2 * SIZE, A1, %xmm3) VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) VMOVUPS_YL1( -8 * SIZE, Y1, %xmm8) VMOVUPS_YL1( -6 * SIZE, Y1, %xmm9) VMOVUPS_YL1( -4 * SIZE, Y1, %xmm10) VMOVUPS_YL1( -2 * SIZE, Y1, %xmm11) subq $-8 * SIZE, A1 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L43 ALIGN_3 .L44: vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10 vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) subq $-8 * SIZE, A1 subq $-8 * SIZE, Y1 ALIGN_3 .L45: testq $4, MM je .L46 VMOVUPS_A1(-16 * SIZE, A1, %xmm0) VMOVUPS_A1(-14 * SIZE, A1, %xmm1) VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) addq $4 * SIZE, A1 addq $4 * SIZE, Y1 ALIGN_3 .L46: testq $2, MM je .L47 VMOVUPS_A1(-16 * SIZE, A1, %xmm8) VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) vfmaddpd %xmm0 , %xmm12 , %xmm8 , %xmm0 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) addq $2 * SIZE, A1 addq $2 * SIZE, Y1 ALIGN_3 .L47: testq $1, MM je .L900 vmovsd -16 * SIZE(Y1), %xmm0 vmovsd -16 * SIZE(A1), %xmm8 vfmaddsd %xmm0 , %xmm12 , %xmm8 , %xmm0 vmovsd %xmm0, -16 * SIZE(Y1) ALIGN_3 #ifdef ALIGNED_ACCESS jmp .L900 ALIGN_3 .L50: #if GEMV_UNROLL >= 4 cmpq $4, N jl .L60 ALIGN_3 .L51: subq $4, N leaq 16 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 2), A2 leaq (A, LDA, 4), A vmovddup (X), %xmm12 addq INCX, X vmovddup (X), %xmm13 addq INCX, X vmovddup (X), %xmm14 addq INCX, X vmovddup (X), %xmm15 addq 
INCX, X vmovddup ALPHA, %xmm0 vmulpd %xmm0, %xmm12 , %xmm12 vmulpd %xmm0, %xmm13 , %xmm13 vmulpd %xmm0, %xmm14 , %xmm14 vmulpd %xmm0, %xmm15 , %xmm15 testq $SIZE, A je .L5X vmovsd -16 * SIZE(A1), %xmm4 vmovsd -16 * SIZE(A1, LDA), %xmm5 vmovsd -16 * SIZE(A2), %xmm6 vmovsd -16 * SIZE(A2, LDA), %xmm7 vmovsd -16 * SIZE(Y1), %xmm0 vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0 vfmaddsd %xmm0 , %xmm13 , %xmm5 , %xmm0 vfmaddsd %xmm0 , %xmm14 , %xmm6 , %xmm0 vfmaddsd %xmm0 , %xmm15 , %xmm7 , %xmm0 vmovsd %xmm0, -16 * SIZE(Y1) addq $SIZE, A1 addq $SIZE, A2 addq $SIZE, Y1 ALIGN_3 .L5X: vmovhpd -16 * SIZE(A1, LDA), %xmm8, %xmm8 vmovhpd -16 * SIZE(A2, LDA), %xmm9, %xmm9 movq MM, I sarq $3, I jle .L55 VMOVUPS_A1(-16 * SIZE, A1, %xmm4) VMOVUPS_A1(-14 * SIZE, A1, %xmm5) VMOVUPS_A1(-12 * SIZE, A1, %xmm6) VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1) VMOVUPS_YL1(-12 * SIZE, Y1, %xmm2) VMOVUPS_YL1(-10 * SIZE, Y1, %xmm3) decq I jle .L54 ALIGN_3 .L53: vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 VMOVUPS_A1(-10 * SIZE, A1, %xmm7) vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1 VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm4) prefetchnta A_PRE(A1, LDA, 1) vfmaddpd %xmm2 , %xmm12 , %xmm6 , %xmm2 VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm5) vfmaddpd %xmm3 , %xmm12 , %xmm7 , %xmm3 VMOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm6) vshufpd $0x01, %xmm4, %xmm8, %xmm8 vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 VMOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) vshufpd $0x01, %xmm5, %xmm4, %xmm4 vfmaddpd %xmm1 , %xmm13 , %xmm4 , %xmm1 VMOVUPS_A1(-16 * SIZE, A2, %xmm4) prefetchnta A_PRE(A2) vshufpd $0x01, %xmm6, %xmm5, %xmm5 vfmaddpd %xmm2 , %xmm13 , %xmm5 , %xmm2 VMOVUPS_A1(-14 * SIZE, A2, %xmm5) vshufpd $0x01, %xmm8, %xmm6, %xmm6 vfmaddpd %xmm3 , %xmm13 , %xmm6 , %xmm3 VMOVUPS_A1(-12 * SIZE, A2, %xmm6) vfmaddpd %xmm0 , %xmm14 , %xmm4 , %xmm0 VMOVUPS_A1(-10 * SIZE, A2, %xmm7) vfmaddpd %xmm1 , %xmm14 , %xmm5 , %xmm1 VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm4) prefetchnta A_PRE(A2, LDA, 1) vfmaddpd %xmm2 , %xmm14 , %xmm6 , %xmm2 VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm5) vfmaddpd %xmm3 , %xmm14 , %xmm7 , %xmm3 VMOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm6) vshufpd $0x01, %xmm4, %xmm9, %xmm9 vfmaddpd %xmm0 , %xmm15 , %xmm9 , %xmm0 VMOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) vshufpd $0x01, %xmm5, %xmm4, %xmm4 vfmaddpd %xmm1 , %xmm15 , %xmm4 , %xmm1 VMOVUPS_A1( -8 * SIZE, A1, %xmm4) prefetchnta A_PRE(A1) vshufpd $0x01, %xmm6, %xmm5, %xmm5 vfmaddpd %xmm2 , %xmm15 , %xmm5 , %xmm2 VMOVUPS_A1( -6 * SIZE, A1, %xmm5) vshufpd $0x01, %xmm9, %xmm6, %xmm6 vfmaddpd %xmm3 , %xmm15 , %xmm6 , %xmm3 VMOVUPS_A1( -4 * SIZE, A1, %xmm6) VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2) VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3) VMOVUPS_YL1( -8 * SIZE, Y1, %xmm0) VMOVUPS_YL1( -6 * SIZE, Y1, %xmm1) prefetchnta A_PRE(Y1) VMOVUPS_YL1( -4 * SIZE, Y1, %xmm2) VMOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L53 ALIGN_3 .L54: vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 VMOVUPS_A1(-10 * SIZE, A1, %xmm7) vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1 VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm4) vfmaddpd %xmm2 , %xmm12 , %xmm6 , %xmm2 VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm5) vfmaddpd %xmm3 , %xmm12 , %xmm7 , %xmm3 VMOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm6) vshufpd $0x01, %xmm4, %xmm8, %xmm8 vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 VMOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) vshufpd $0x01, %xmm5, %xmm4, %xmm4 vfmaddpd %xmm1 , %xmm13 , %xmm4 , %xmm1 VMOVUPS_A1(-16 * SIZE, A2, %xmm4) vshufpd 
$0x01, %xmm6, %xmm5, %xmm5 vfmaddpd %xmm2 , %xmm13 , %xmm5 , %xmm2 VMOVUPS_A1(-14 * SIZE, A2, %xmm5) vshufpd $0x01, %xmm8, %xmm6, %xmm6 vfmaddpd %xmm3 , %xmm13 , %xmm6 , %xmm3 VMOVUPS_A1(-12 * SIZE, A2, %xmm6) vfmaddpd %xmm0 , %xmm14 , %xmm4 , %xmm0 VMOVUPS_A1(-10 * SIZE, A2, %xmm7) vfmaddpd %xmm1 , %xmm14 , %xmm5 , %xmm1 VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm4) vfmaddpd %xmm2 , %xmm14 , %xmm6 , %xmm2 VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm5) vfmaddpd %xmm3 , %xmm14 , %xmm7 , %xmm3 VMOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm6) vshufpd $0x01, %xmm4, %xmm9, %xmm9 vfmaddpd %xmm0 , %xmm15 , %xmm9 , %xmm0 VMOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) vshufpd $0x01, %xmm5, %xmm4, %xmm4 vfmaddpd %xmm1 , %xmm15 , %xmm4 , %xmm1 vshufpd $0x01, %xmm6, %xmm5, %xmm5 vfmaddpd %xmm2 , %xmm15 , %xmm5 , %xmm2 vshufpd $0x01, %xmm9, %xmm6, %xmm6 vfmaddpd %xmm3 , %xmm15 , %xmm6 , %xmm3 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2) VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 ALIGN_3 .L55: testq $4, MM je .L56 VMOVUPS_A1(-16 * SIZE, A1, %xmm4) VMOVUPS_A1(-14 * SIZE, A1, %xmm5) VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1) vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1 VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm6) VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm7) vshufpd $0x01, %xmm6, %xmm8, %xmm8 vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 movaps %xmm7, %xmm8 vshufpd $0x01, %xmm7, %xmm6, %xmm6 vfmaddpd %xmm1 , %xmm13 , %xmm6 , %xmm1 VMOVUPS_A1(-16 * SIZE, A2, %xmm4) VMOVUPS_A1(-14 * SIZE, A2, %xmm5) vfmaddpd %xmm0 , %xmm14 , %xmm4 , %xmm0 vfmaddpd %xmm1 , %xmm14 , %xmm5 , %xmm1 VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm6) VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm7) vshufpd $0x01, %xmm6, %xmm9, %xmm9 vfmaddpd %xmm0 , %xmm15 , %xmm9 , %xmm0 movaps %xmm7, %xmm9 vshufpd $0x01, %xmm7, %xmm6, %xmm6 vfmaddpd %xmm1 , %xmm15 , %xmm6 , %xmm1 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L56: testq $2, MM je .L57 VMOVUPS_A1(-16 * SIZE, A1, %xmm4) VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) VMOVUPS_A1(-16 * SIZE, A2, %xmm6) VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 vshufpd $0x01, %xmm5, %xmm8, %xmm8 vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 movaps %xmm5, %xmm8 vfmaddpd %xmm0 , %xmm14 , %xmm6 , %xmm0 vshufpd $0x01, %xmm7, %xmm9, %xmm9 vfmaddpd %xmm0 , %xmm15 , %xmm9 , %xmm0 movaps %xmm7, %xmm9 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L57: testq $1, MM je .L58 vmovsd -16 * SIZE(Y1), %xmm0 vmovsd -16 * SIZE(A1), %xmm4 vshufpd $0x01, %xmm8, %xmm8, %xmm8 vmovsd -16 * SIZE(A2), %xmm6 vshufpd $0x01, %xmm9, %xmm9, %xmm9 vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0 vfmaddsd %xmm0 , %xmm13 , %xmm8 , %xmm0 vfmaddsd %xmm0 , %xmm14 , %xmm6 , %xmm0 vfmaddsd %xmm0 , %xmm15 , %xmm9 , %xmm0 vmovsd %xmm0, -16 * SIZE(Y1) ALIGN_3 .L58: cmpq $4, N jge .L51 ALIGN_3 .L60: #endif #if GEMV_UNROLL >= 2 cmpq $2, N jl .L70 #if GEMV_UNROLL == 2 ALIGN_3 .L61: #endif subq $2, N leaq 16 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA), A2 leaq (A, LDA, 2), A vmovddup (X), %xmm12 addq INCX, X vmovddup (X), %xmm13 addq INCX, X vmovddup ALPHA, %xmm0 vmulpd %xmm0, %xmm12 , %xmm12 vmulpd %xmm0, %xmm13 , %xmm13 testq $SIZE, A je .L6X vmovsd -16 * SIZE(A1), %xmm4 vmovsd -16 * SIZE(A2), %xmm5 vmovsd -16 * SIZE(Y1), 
%xmm0 vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0 vfmaddsd %xmm0 , %xmm13 , %xmm5 , %xmm0 vmovsd %xmm0, -16 * SIZE(Y1) addq $SIZE, A1 addq $SIZE, A2 addq $SIZE, Y1 ALIGN_3 .L6X: vmovhpd -16 * SIZE(A2), %xmm8, %xmm8 movq MM, I sarq $3, I jle .L65 VMOVUPS_A1(-16 * SIZE, A1, %xmm4) VMOVUPS_A1(-14 * SIZE, A1, %xmm5) VMOVUPS_A1(-12 * SIZE, A1, %xmm6) VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1) VMOVUPS_YL1(-12 * SIZE, Y1, %xmm2) VMOVUPS_YL1(-10 * SIZE, Y1, %xmm3) decq I jle .L64 ALIGN_3 .L63: vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 VMOVUPS_A1(-10 * SIZE, A1, %xmm7) vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1 VMOVUPS_A1(-15 * SIZE, A2, %xmm4) prefetchnta A_PRE(A2) vfmaddpd %xmm2 , %xmm12 , %xmm6 , %xmm2 VMOVUPS_A1(-13 * SIZE, A2, %xmm5) vfmaddpd %xmm3 , %xmm12 , %xmm7 , %xmm3 VMOVUPS_A1(-11 * SIZE, A2, %xmm6) vshufpd $0x01, %xmm4, %xmm8, %xmm8 vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 VMOVUPS_A1( -9 * SIZE, A2, %xmm8) vshufpd $0x01, %xmm5, %xmm4, %xmm4 vfmaddpd %xmm1 , %xmm13 , %xmm4 , %xmm1 VMOVUPS_A1( -8 * SIZE, A1, %xmm4) prefetchnta A_PRE(A1) vshufpd $0x01, %xmm6, %xmm5, %xmm5 vfmaddpd %xmm2 , %xmm13 , %xmm5 , %xmm2 VMOVUPS_A1( -6 * SIZE, A1, %xmm5) vshufpd $0x01, %xmm8, %xmm6, %xmm6 vfmaddpd %xmm3 , %xmm13 , %xmm6 , %xmm3 VMOVUPS_A1( -4 * SIZE, A1, %xmm6) VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2) VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3) VMOVUPS_YL1( -8 * SIZE, Y1, %xmm0) VMOVUPS_YL1( -6 * SIZE, Y1, %xmm1) prefetchnta A_PRE(Y1) VMOVUPS_YL1( -4 * SIZE, Y1, %xmm2) VMOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L63 ALIGN_3 .L64: vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 VMOVUPS_A1(-10 * SIZE, A1, %xmm7) vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1 VMOVUPS_A1(-15 * SIZE, A2, %xmm4) vfmaddpd %xmm2 , %xmm12 , %xmm6 , %xmm2 VMOVUPS_A1(-13 * SIZE, A2, %xmm5) vfmaddpd %xmm3 , %xmm12 , %xmm7 , %xmm3 VMOVUPS_A1(-11 * SIZE, A2, %xmm6) vshufpd $0x01, %xmm4, %xmm8, %xmm8 vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 VMOVUPS_A1( -9 * SIZE, A2, %xmm8) vshufpd $0x01, %xmm5, %xmm4, %xmm4 vfmaddpd %xmm1 , %xmm13 , %xmm4 , %xmm1 vshufpd $0x01, %xmm6, %xmm5, %xmm5 vfmaddpd %xmm2 , %xmm13 , %xmm5 , %xmm2 vshufpd $0x01, %xmm8, %xmm6, %xmm6 vfmaddpd %xmm3 , %xmm13 , %xmm6 , %xmm3 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2) VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 ALIGN_3 .L65: testq $4, MM je .L66 VMOVUPS_A1(-16 * SIZE, A1, %xmm4) VMOVUPS_A1(-14 * SIZE, A1, %xmm5) VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1) vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1 VMOVUPS_A1(-15 * SIZE, A2, %xmm6) VMOVUPS_A1(-13 * SIZE, A2, %xmm7) vshufpd $0x01, %xmm6, %xmm8, %xmm8 vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 movaps %xmm7, %xmm8 vshufpd $0x01, %xmm7, %xmm6, %xmm6 vfmaddpd %xmm1 , %xmm13 , %xmm6 , %xmm1 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L66: testq $2, MM je .L67 VMOVUPS_A1(-16 * SIZE, A1, %xmm4) VMOVUPS_A1(-15 * SIZE, A2, %xmm5) VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 vshufpd $0x01, %xmm5, %xmm8, %xmm8 vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0 movaps %xmm5, %xmm8 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L67: testq $1, MM #if GEMV_UNROLL == 2 
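/* when the low bit of MM is clear the single-row tail below is skipped;
   with GEMV_UNROLL == 2 the two-column loop then repeats via .L68 while
   N >= 2, otherwise control continues at .L70 */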
je .L68 #else je .L70 #endif vmovsd -16 * SIZE(Y1), %xmm0 vmovsd -16 * SIZE(A1), %xmm4 vshufpd $0x01, %xmm8, %xmm8 , %xmm8 vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0 vfmaddsd %xmm0 , %xmm13 , %xmm8 , %xmm0 vmovsd %xmm0, -16 * SIZE(Y1) ALIGN_3 #if GEMV_UNROLL == 2 .L68: cmpq $2, N jge .L61 ALIGN_3 #endif .L70: cmpq $1, N jl .L900 #endif leaq 16 * SIZE(BUFFER), Y1 movq A, A1 vmovddup (X), %xmm12 addq INCX, X vmovddup ALPHA, %xmm0 vmulpd %xmm0, %xmm12 , %xmm12 testq $SIZE, A je .L7X vmovsd -16 * SIZE(A1), %xmm4 vmovsd -16 * SIZE(Y1), %xmm0 vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0 vmovsd %xmm0, -16 * SIZE(Y1) addq $SIZE, A1 addq $SIZE, Y1 ALIGN_3 .L7X: movq MM, I sarq $3, I jle .L75 VMOVUPS_A1(-16 * SIZE, A1, %xmm0) VMOVUPS_A1(-14 * SIZE, A1, %xmm1) VMOVUPS_A1(-12 * SIZE, A1, %xmm2) VMOVUPS_A1(-10 * SIZE, A1, %xmm3) VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) VMOVUPS_YL1(-12 * SIZE, Y1, %xmm10) VMOVUPS_YL1(-10 * SIZE, Y1, %xmm11) decq I jle .L74 ALIGN_3 .L73: vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 VMOVUPS_A1( -8 * SIZE, A1, %xmm0) vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 VMOVUPS_A1( -6 * SIZE, A1, %xmm1) prefetchnta A_PRE(A1) vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10 VMOVUPS_A1( -4 * SIZE, A1, %xmm2) vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11 VMOVUPS_A1( -2 * SIZE, A1, %xmm3) VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) VMOVUPS_YL1( -8 * SIZE, Y1, %xmm8) VMOVUPS_YL1( -6 * SIZE, Y1, %xmm9) prefetchnta A_PRE(Y1) VMOVUPS_YL1( -4 * SIZE, Y1, %xmm10) VMOVUPS_YL1( -2 * SIZE, Y1, %xmm11) subq $-8 * SIZE, A1 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L73 ALIGN_3 .L74: vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10 VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10) vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11 VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11) subq $-8 * SIZE, A1 subq $-8 * SIZE, Y1 ALIGN_3 .L75: testq $4, MM je .L76 VMOVUPS_A1(-16 * SIZE, A1, %xmm0) VMOVUPS_A1(-14 * SIZE, A1, %xmm1) VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8) VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9) vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8) vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9 VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9) addq $4 * SIZE, A1 addq $4 * SIZE, Y1 ALIGN_3 .L76: testq $2, MM je .L77 VMOVUPS_A1(-16 * SIZE, A1, %xmm8) VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0) vfmaddpd %xmm0 , %xmm12 , %xmm8 , %xmm0 VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0) addq $2 * SIZE, A1 addq $2 * SIZE, Y1 ALIGN_3 .L77: testq $1, MM je .L900 vmovsd -16 * SIZE(Y1), %xmm0 vmovsd -16 * SIZE(A1), %xmm8 vfmaddsd %xmm0 , %xmm12 , %xmm8 , %xmm0 vmovsd %xmm0, -16 * SIZE(Y1) #endif ALIGN_3 .L900: #ifndef COPY_FORCE cmpq Y, BUFFER je .L999 #endif cmpq $SIZE, INCY jne .L950 testq $SIZE, Y je .L910 vmovsd (Y), %xmm0 vaddsd (BUFFER), %xmm0, %xmm0 vmovsd %xmm0, (Y) addq $SIZE, Y addq $SIZE, BUFFER decq M jle .L999 ALIGN_4 .L910: testq $SIZE, BUFFER jne .L920 movq M, %rax sarq $3, %rax jle .L914 ALIGN_3 .L912: vmovups 0 * SIZE(Y), %xmm0 vmovups 2 * SIZE(Y), %xmm1 vmovups 4 * SIZE(Y), %xmm2 vmovups 6 * SIZE(Y), %xmm3 vmovups 0 * SIZE(BUFFER), %xmm4 vmovups 2 * SIZE(BUFFER), %xmm5 vmovups 4 * SIZE(BUFFER), %xmm6 vmovups 6 * SIZE(BUFFER), %xmm7 vaddpd %xmm4, %xmm0, %xmm0 vaddpd %xmm5, %xmm1, %xmm1 vaddpd %xmm6, %xmm2, %xmm2 vaddpd %xmm7, %xmm3, %xmm3 vmovups %xmm0, 0 * SIZE(Y) vmovups %xmm1, 2 * SIZE(Y) vmovups %xmm2, 4 * SIZE(Y) vmovups 
%xmm3, 6 * SIZE(Y) addq $8 * SIZE, Y addq $8 * SIZE, BUFFER decq %rax jg .L912 ALIGN_3 .L914: testq $7, M jle .L999 testq $4, M jle .L915 vmovups 0 * SIZE(Y), %xmm0 vmovups 2 * SIZE(Y), %xmm1 vmovups 0 * SIZE(BUFFER), %xmm4 vmovups 2 * SIZE(BUFFER), %xmm5 vaddpd %xmm4, %xmm0, %xmm0 vaddpd %xmm5, %xmm1, %xmm1 vmovups %xmm0, 0 * SIZE(Y) vmovups %xmm1, 2 * SIZE(Y) addq $4 * SIZE, Y addq $4 * SIZE, BUFFER ALIGN_3 .L915: testq $2, M jle .L916 vmovups (Y), %xmm0 vmovups (BUFFER), %xmm4 vaddpd %xmm4, %xmm0, %xmm0 vmovups %xmm0, (Y) addq $2 * SIZE, Y addq $2 * SIZE, BUFFER ALIGN_3 .L916: testq $1, M jle .L999 vmovsd (Y), %xmm0 vmovsd 0 * SIZE(BUFFER), %xmm4 vaddsd %xmm4, %xmm0, %xmm0 vmovsd %xmm0, (Y) ALIGN_3 jmp .L999 ALIGN_4 .L920: vmovups -1 * SIZE(BUFFER), %xmm4 movq M, %rax sarq $3, %rax jle .L924 ALIGN_3 .L922: vmovups 0 * SIZE(Y), %xmm0 vmovups 2 * SIZE(Y), %xmm1 vmovups 4 * SIZE(Y), %xmm2 vmovups 6 * SIZE(Y), %xmm3 vmovups 1 * SIZE(BUFFER), %xmm5 vmovups 3 * SIZE(BUFFER), %xmm6 vmovups 5 * SIZE(BUFFER), %xmm7 vmovups 7 * SIZE(BUFFER), %xmm8 vshufpd $0x01, %xmm5, %xmm4, %xmm4 vshufpd $0x01, %xmm6, %xmm5, %xmm5 vshufpd $0x01, %xmm7, %xmm6, %xmm6 vshufpd $0x01, %xmm8, %xmm7, %xmm7 vaddpd %xmm4, %xmm0, %xmm0 vaddpd %xmm5, %xmm1, %xmm1 vaddpd %xmm6, %xmm2, %xmm2 vaddpd %xmm7, %xmm3, %xmm3 vmovups %xmm0, 0 * SIZE(Y) vmovups %xmm1, 2 * SIZE(Y) vmovups %xmm2, 4 * SIZE(Y) vmovups %xmm3, 6 * SIZE(Y) vmovups %xmm8, %xmm4 addq $8 * SIZE, Y addq $8 * SIZE, BUFFER decq %rax jg .L922 ALIGN_3 .L924: testq $7, M jle .L999 testq $4, M jle .L925 vmovups 0 * SIZE(Y), %xmm0 vmovups 2 * SIZE(Y), %xmm1 vmovups 1 * SIZE(BUFFER), %xmm5 vmovups 3 * SIZE(BUFFER), %xmm6 vshufpd $0x01, %xmm5, %xmm4, %xmm4 vshufpd $0x01, %xmm6, %xmm5, %xmm5 vaddpd %xmm4, %xmm0, %xmm0 vaddpd %xmm5, %xmm1, %xmm1 vmovups %xmm0, 0 * SIZE(Y) vmovups %xmm1, 2 * SIZE(Y) vmovups %xmm6, %xmm4 addq $4 * SIZE, Y addq $4 * SIZE, BUFFER ALIGN_3 .L925: testq $2, M jle .L926 vmovups (Y), %xmm0 vmovups 1 * SIZE(BUFFER), %xmm5 vshufpd $0x01, %xmm5, %xmm4, %xmm4 vaddpd %xmm4, %xmm0, %xmm0 vmovups %xmm0, (Y) movaps %xmm5, %xmm4 addq $2 * SIZE, Y addq $2 * SIZE, BUFFER ALIGN_3 .L926: testq $1, M jle .L999 vmovsd (Y), %xmm0 vshufpd $0x01, %xmm4 ,%xmm4, %xmm4 vaddsd %xmm4, %xmm0, %xmm0 vmovsd %xmm0, (Y) ALIGN_3 jmp .L999 ALIGN_4 .L950: testq $SIZE, BUFFER je .L960 vmovsd (Y), %xmm0 vaddsd (BUFFER), %xmm0, %xmm0 vmovsd %xmm0, (Y) addq INCY, Y addq $SIZE, BUFFER decq M jle .L999 ALIGN_4 .L960: movq Y, Y1 movq M, %rax sarq $3, %rax jle .L964 ALIGN_3 .L962: vmovsd (Y), %xmm0 addq INCY, Y vmovhpd (Y), %xmm0, %xmm0 addq INCY, Y vmovups 0 * SIZE(BUFFER), %xmm4 vmovsd (Y), %xmm1 addq INCY, Y vmovhpd (Y), %xmm1, %xmm1 addq INCY, Y vmovups 2 * SIZE(BUFFER), %xmm5 vmovsd (Y), %xmm2 addq INCY, Y vmovhpd (Y), %xmm2, %xmm2 addq INCY, Y vmovups 4 * SIZE(BUFFER), %xmm6 vaddpd %xmm4, %xmm0, %xmm0 vmovsd (Y), %xmm3 addq INCY, Y vmovhpd (Y), %xmm3, %xmm3 addq INCY, Y vmovups 6 * SIZE(BUFFER), %xmm7 vaddpd %xmm5, %xmm1, %xmm1 vmovsd %xmm0, (Y1) addq INCY, Y1 vmovhpd %xmm0, (Y1) addq INCY, Y1 vaddpd %xmm6, %xmm2, %xmm2 vmovsd %xmm1, (Y1) addq INCY, Y1 vmovhpd %xmm1, (Y1) addq INCY, Y1 vaddpd %xmm7, %xmm3, %xmm3 vmovsd %xmm2, (Y1) addq INCY, Y1 vmovhpd %xmm2, (Y1) addq INCY, Y1 vmovsd %xmm3, (Y1) addq INCY, Y1 vmovhpd %xmm3, (Y1) addq INCY, Y1 addq $8 * SIZE, BUFFER decq %rax jg .L962 ALIGN_3 .L964: testq $7, M jle .L999 testq $4, M jle .L965 vmovsd (Y), %xmm0 addq INCY, Y vmovhpd (Y), %xmm0, %xmm0 addq INCY, Y vmovups 0 * SIZE(BUFFER), %xmm4 vmovsd (Y), %xmm1 addq INCY, Y 
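/* strided Y elements are gathered pairwise with vmovsd/vmovhpd, added to
   the contiguous BUFFER results, and scattered back through Y1 with the
   same INCY stride */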
vmovhpd (Y), %xmm1, %xmm1 addq INCY, Y vmovups 2 * SIZE(BUFFER), %xmm5 vaddpd %xmm4, %xmm0, %xmm0 vaddpd %xmm5, %xmm1, %xmm1 vmovsd %xmm0, (Y1) addq INCY, Y1 vmovhpd %xmm0, (Y1) addq INCY, Y1 vmovsd %xmm1, (Y1) addq INCY, Y1 vmovhpd %xmm1, (Y1) addq INCY, Y1 addq $4 * SIZE, BUFFER ALIGN_3 .L965: testq $2, M jle .L966 vmovsd (Y), %xmm0 addq INCY, Y vmovhpd (Y),%xmm0, %xmm0 addq INCY, Y vmovups 0 * SIZE(BUFFER), %xmm4 vaddpd %xmm4, %xmm0, %xmm0 vmovsd %xmm0, (Y1) addq INCY, Y1 vmovhpd %xmm0, (Y1) addq INCY, Y1 addq $2 * SIZE, BUFFER ALIGN_3 .L966: testq $1, M jle .L999 vmovsd (Y), %xmm0 vmovsd 0 * SIZE(BUFFER), %xmm4 vaddsd %xmm4, %xmm0, %xmm0 vmovsd %xmm0, (Y1) ALIGN_3 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi vmovups 64(%rsp), %xmm6 vmovups 80(%rsp), %xmm7 vmovups 96(%rsp), %xmm8 vmovups 112(%rsp), %xmm9 vmovups 128(%rsp), %xmm10 vmovups 144(%rsp), %xmm11 vmovups 160(%rsp), %xmm12 vmovups 176(%rsp), %xmm13 vmovups 192(%rsp), %xmm14 vmovups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/dgemv_n_microk_haswell-4.c000066400000000000000000000132721313527062700223760ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_4x4 1 static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vbroadcastsd (%2), %%ymm12 \n\t" // x0 "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 "vmovups (%4,%0,8), %%ymm0 \n\t" "vmovups (%5,%0,8), %%ymm1 \n\t" "vmovups (%6,%0,8), %%ymm2 \n\t" "vmovups (%7,%0,8), %%ymm3 \n\t" "vbroadcastsd (%8), %%ymm6 \n\t" // alpha "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" "jz 2f \n\t" // ".align 16 \n\t" "1: \n\t" "vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t" "vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t" "vmovups (%4,%0,8), %%ymm0 \n\t" "vmovups (%5,%0,8), %%ymm1 \n\t" "vfmadd231pd %%ymm2 , %%ymm14, %%ymm4 \n\t" "vfmadd231pd %%ymm3 , %%ymm15, %%ymm5 \n\t" "vmovups (%6,%0,8), %%ymm2 \n\t" "vmovups (%7,%0,8), %%ymm3 \n\t" "vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y "vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t" "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" "vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t" "vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t" "vfmadd231pd %%ymm2 , %%ymm14, %%ymm4 \n\t" "vfmadd231pd %%ymm3 , %%ymm15, %%ymm5 \n\t" "vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y "vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t" "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" "vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #define HAVE_KERNEL_4x2 static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vbroadcastsd (%2), %%ymm12 \n\t" // x0 "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 "vmovups (%4,%0,8), %%ymm0 \n\t" "vmovups (%5,%0,8), %%ymm1 \n\t" "vbroadcastsd (%6), %%ymm6 \n\t" // alpha "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" "jz 2f \n\t" "1: \n\t" "vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t" "vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t" "vmovups (%4,%0,8), %%ymm0 \n\t" "vmovups (%5,%0,8), %%ymm1 \n\t" "vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y "vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t" "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" "vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t" "vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t" "vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y "vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t" "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" "vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (alpha) // 6 : "cc", "%xmm0", "%xmm1", "%xmm4", "%xmm5", "%xmm6", "%xmm8", "%xmm12", "%xmm13", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/dgemv_n_microk_nehalem-4.c000066400000000000000000000137461313527062700223560ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 
2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_4x4 1 static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "movsd (%2), %%xmm12 \n\t" // x0 "movsd 8(%2), %%xmm13 \n\t" // x1 "movsd 16(%2), %%xmm14 \n\t" // x2 "movsd 24(%2), %%xmm15 \n\t" // x3 "shufpd $0, %%xmm12, %%xmm12\n\t" "shufpd $0, %%xmm13, %%xmm13\n\t" "shufpd $0, %%xmm14, %%xmm14\n\t" "shufpd $0, %%xmm15, %%xmm15\n\t" "movsd (%8), %%xmm6 \n\t" // alpha "shufpd $0, %%xmm6 , %%xmm6 \n\t" "movups (%4,%0,8), %%xmm8 \n\t" "movups 16(%4,%0,8), %%xmm0 \n\t" "movups (%5,%0,8), %%xmm9 \n\t" "movups 16(%5,%0,8), %%xmm1 \n\t" "movups (%6,%0,8), %%xmm10 \n\t" "movups 16(%6,%0,8), %%xmm2 \n\t" "movups (%7,%0,8), %%xmm11 \n\t" "movups 16(%7,%0,8), %%xmm3 \n\t" "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" "xorpd %%xmm4 , %%xmm4 \n\t" "xorpd %%xmm5 , %%xmm5 \n\t" "movups -32(%3,%0,8), %%xmm7 \n\t" // 2 * y "mulpd %%xmm12, %%xmm8 \n\t" "mulpd %%xmm12, %%xmm0 \n\t" "addpd %%xmm8 , %%xmm4 \n\t" "addpd %%xmm0 , %%xmm5 \n\t" "movups (%4,%0,8), %%xmm8 \n\t" "movups 16(%4,%0,8), %%xmm0 \n\t" "mulpd %%xmm13, %%xmm9 \n\t" "mulpd %%xmm13, %%xmm1 \n\t" "addpd %%xmm9 , %%xmm4 \n\t" "addpd %%xmm1 , %%xmm5 \n\t" "movups (%5,%0,8), %%xmm9 \n\t" "movups 16(%5,%0,8), %%xmm1 \n\t" "mulpd %%xmm14, %%xmm10 \n\t" "mulpd %%xmm14, %%xmm2 \n\t" "addpd %%xmm10 , %%xmm4 \n\t" "addpd %%xmm2 , %%xmm5 \n\t" "movups (%6,%0,8), %%xmm10 \n\t" "movups 16(%6,%0,8), %%xmm2 \n\t" "mulpd %%xmm15, %%xmm11 \n\t" "mulpd %%xmm15, %%xmm3 \n\t" "addpd %%xmm11 , %%xmm4 \n\t" "addpd %%xmm3 , %%xmm5 \n\t" "movups (%7,%0,8), %%xmm11 \n\t" "movups 16(%7,%0,8), %%xmm3 \n\t" "mulpd %%xmm6 , %%xmm4 \n\t" "addpd %%xmm7 , %%xmm4 \n\t" "movups -16(%3,%0,8), %%xmm7 \n\t" // 2 * y "movups %%xmm4 , -32(%3,%0,8) \n\t" // 2 * y "mulpd %%xmm6 , %%xmm5 \n\t" "addpd %%xmm7 , %%xmm5 
\n\t" "movups %%xmm5 , -16(%3,%0,8) \n\t" // 2 * y "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" "jnz 1b \n\t" "2: \n\t" "xorpd %%xmm4 , %%xmm4 \n\t" "xorpd %%xmm5 , %%xmm5 \n\t" "mulpd %%xmm12, %%xmm8 \n\t" "addpd %%xmm8 , %%xmm4 \n\t" "mulpd %%xmm13, %%xmm9 \n\t" "addpd %%xmm9 , %%xmm4 \n\t" "mulpd %%xmm14, %%xmm10 \n\t" "addpd %%xmm10 , %%xmm4 \n\t" "mulpd %%xmm15, %%xmm11 \n\t" "addpd %%xmm11 , %%xmm4 \n\t" "mulpd %%xmm12, %%xmm0 \n\t" "addpd %%xmm0 , %%xmm5 \n\t" "mulpd %%xmm13, %%xmm1 \n\t" "addpd %%xmm1 , %%xmm5 \n\t" "mulpd %%xmm14, %%xmm2 \n\t" "addpd %%xmm2 , %%xmm5 \n\t" "mulpd %%xmm15, %%xmm3 \n\t" "addpd %%xmm3 , %%xmm5 \n\t" "movups -32(%3,%0,8), %%xmm7 \n\t" // 2 * y "mulpd %%xmm6 , %%xmm4 \n\t" "addpd %%xmm7 , %%xmm4 \n\t" "movups %%xmm4 , -32(%3,%0,8) \n\t" // 2 * y "movups -16(%3,%0,8), %%xmm7 \n\t" // 2 * y "mulpd %%xmm6 , %%xmm5 \n\t" "addpd %%xmm7 , %%xmm5 \n\t" "movups %%xmm5 , -16(%3,%0,8) \n\t" // 2 * y : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/dgemv_n_microk_piledriver-4.c000066400000000000000000000206231313527062700231020ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_4x8 1 static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vbroadcastsd (%2), %%ymm12 \n\t" // x0 "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 "vbroadcastsd (%9), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" "jz 2f \n\t" "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y "addq $4 , %8 \n\t" "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" "2: \n\t" "cmpq $0, %1 \n\t" "je 3f \n\t" ".align 16 \n\t" "1: \n\t" "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" "addq $8 , %0 \n\t" "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t" "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" "addq $8 , %8 \n\t" "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y "subq $8 , %1 \n\t" "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y "jnz 1b \n\t" "3: \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 "r" (lda4), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #define HAVE_KERNEL_4x4 1 static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vbroadcastsd (%2), %%ymm12 \n\t" // x0 "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 "vbroadcastsd 24(%2), %%ymm15 \n\t" 
// x3 "vbroadcastsd (%8), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" "jz 2f \n\t" "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" "2: \n\t" "cmpq $0, %1 \n\t" "je 3f \n\t" ".align 16 \n\t" "1: \n\t" "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" "jnz 1b \n\t" "3: \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/dgemv_t.S000066400000000000000000001347611313527062700171470ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "l2param.h" #if GEMV_UNROLL < 2 #undef GEMV_UNROLL #define GEMV_UNROLL 2 #endif #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_M %rdi #define OLD_N %rsi #define OLD_A %rcx #define OLD_LDA %r8 #define STACK_INCX 8 + STACKSIZE(%rsp) #define STACK_Y 16 + STACKSIZE(%rsp) #define STACK_INCY 24 + STACKSIZE(%rsp) #define STACK_BUFFER 32 + STACKSIZE(%rsp) #define MMM 56(%rsp) #define NN 64(%rsp) #define AA 72(%rsp) #define LDAX 80(%rsp) #else #define STACKSIZE 256 #define OLD_M %rcx #define OLD_N %rdx #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_LDA 48 + STACKSIZE(%rsp) #define OLD_X 56 + STACKSIZE(%rsp) #define STACK_INCX 64 + STACKSIZE(%rsp) #define STACK_Y 72 + STACKSIZE(%rsp) #define STACK_INCY 80 + STACKSIZE(%rsp) #define STACK_BUFFER 88 + STACKSIZE(%rsp) //Temp variables for M,N,A,LDA #define MMM 224(%rsp) #define NN 232(%rsp) #define AA 240(%rsp) #define LDAX 248(%rsp) #endif #define LDA %r8 #define X %r9 #define INCX %rsi #define INCY %rdi #define M %r10 #define N %r11 #define A %r12 #define Y %r14 #define BUFFER %r13 #define I %rax #define A1 %rbx #define A2 %rcx #define LDA3 %rdx #define Y1 %rbp #define X1 %r15 #ifdef ALIGNED_ACCESS #define MM INCX #else #define MM M #endif #define ALPHA %xmm15 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq OLD_M, M movq OLD_N, N movq OLD_A, A movq OLD_LDA, LDA movq OLD_X, X movq M, MMM movq N, NN movq A, AA movq LDA, LDAX #else movq OLD_M, MMM movq OLD_N, NN movq OLD_A, AA movq OLD_LDA, LDAX #endif #ifdef HAVE_SSE3 #ifndef WINDOWS_ABI movddup %xmm0, ALPHA #else movddup %xmm3, ALPHA #endif #else #ifndef WINDOWS_ABI movapd %xmm0, ALPHA #else movapd %xmm3, ALPHA #endif unpcklpd ALPHA, ALPHA #endif .L0x: xorq M,M addq $1,M salq $21,M subq M,MMM jge .L00 movq MMM,%rax addq M,%rax jle .L999x movq %rax,M .L00: movq LDAX,LDA movq NN,N movq AA,A movq STACK_INCX, INCX movq STACK_Y, Y movq STACK_INCY, INCY movq STACK_BUFFER, BUFFER leaq -1(INCX), %rax leaq (,LDA, SIZE), LDA leaq (,INCX, SIZE), INCX leaq (,INCY, SIZE), INCY leaq (LDA, LDA, 2), LDA3 subq $-16 * SIZE, A testq M, M jle .L999 testq N, N jle .L999 movq BUFFER, X1 #ifdef ALIGNED_ACCESS testq $SIZE, A je .L01 movsd (X), %xmm0 addq INCX, X movsd %xmm0, 1 * SIZE(BUFFER) addq $1 * SIZE, BUFFER addq $2 * SIZE, X1 decq M jle .L10 ALIGN_4 .L01: #endif movq M, I sarq $3, I jle .L05 ALIGN_4 .L02: movsd (X), %xmm0 addq INCX, X movhpd (X), %xmm0 addq INCX, X movsd (X), %xmm1 addq INCX, X movhpd (X), %xmm1 addq INCX, X movsd (X), %xmm2 addq INCX, X movhpd (X), %xmm2 addq INCX, X movsd (X), %xmm3 addq INCX, X movhpd (X), %xmm3 addq INCX, X movapd %xmm0, 0 * SIZE(X1) movapd %xmm1, 2 * SIZE(X1) movapd %xmm2, 4 * SIZE(X1) movapd %xmm3, 6 * SIZE(X1) addq $8 * SIZE, X1 decq I jg .L02 ALIGN_4 .L05: movq M, I andq $7, I jle .L10 
ALIGN_2 .L06: movsd (X), %xmm0 addq INCX, X movsd %xmm0, 0 * SIZE(X1) addq $SIZE, X1 decq I jg .L06 ALIGN_4 .L10: movq Y, Y1 #ifdef ALIGNED_ACCESS testq $SIZE, LDA jne .L50 #endif #if GEMV_UNROLL >= 8 cmpq $8, N jl .L20 ALIGN_3 .L11: subq $8, N leaq 16 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA, 4), A2 leaq (A1, LDA, 8), A xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #ifdef PREFETCHW PREFETCHW 7 * SIZE(Y1) #endif #ifdef ALIGNED_ACCESS testq $SIZE, A je .L1X movsd -16 * SIZE(X1), %xmm12 movsd -16 * SIZE(A1), %xmm8 mulsd %xmm12, %xmm8 addsd %xmm8, %xmm0 movsd -16 * SIZE(A1, LDA), %xmm9 mulsd %xmm12, %xmm9 addsd %xmm9, %xmm1 movsd -16 * SIZE(A1, LDA, 2), %xmm10 mulsd %xmm12, %xmm10 addsd %xmm10, %xmm2 movsd -16 * SIZE(A1, LDA3), %xmm11 mulsd %xmm12, %xmm11 addsd %xmm11, %xmm3 movsd -16 * SIZE(A2), %xmm8 mulsd %xmm12, %xmm8 addsd %xmm8, %xmm4 movsd -16 * SIZE(A2, LDA), %xmm9 mulsd %xmm12, %xmm9 addsd %xmm9, %xmm5 movsd -16 * SIZE(A2, LDA, 2), %xmm10 mulsd %xmm12, %xmm10 addsd %xmm10, %xmm6 movsd -16 * SIZE(A2, LDA3), %xmm11 mulsd %xmm12, %xmm11 addsd %xmm11, %xmm7 addq $SIZE, A1 addq $SIZE, A2 addq $SIZE, X1 ALIGN_3 .L1X: #endif movq M, I sarq $3, I jle .L15 MOVUPS_A1(-16 * SIZE, A1, %xmm8) MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm10) MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm11) MOVUPS_XL1(-16 * SIZE, X1, %xmm12) MOVUPS_XL1(-14 * SIZE, X1, %xmm13) decq I jle .L13 ALIGN_4 .L12: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) #endif mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-16 * SIZE, A2, %xmm8) mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm9) mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm10) mulpd %xmm12, %xmm11 addpd %xmm11, %xmm3 MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm11) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) #endif mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 MOVUPS_A1(-14 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm9 addpd %xmm9, %xmm5 MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9) mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm10) mulpd %xmm12, %xmm11 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) addpd %xmm11, %xmm7 MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm11) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 2) #endif mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-14 * SIZE, A2, %xmm8) mulpd %xmm13, %xmm9 addpd %xmm9, %xmm1 MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm9) mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm10) mulpd %xmm13, %xmm11 addpd %xmm11, %xmm3 MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm11) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA3) #endif mulpd %xmm13, %xmm8 addpd %xmm8, %xmm4 MOVUPS_A1(-12 * SIZE, A1, %xmm8) mulpd %xmm13, %xmm9 addpd %xmm9, %xmm5 MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm9) mulpd %xmm13, %xmm10 addpd %xmm10, %xmm6 MOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm10) mulpd %xmm13, %xmm11 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) addpd %xmm11, %xmm7 MOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm11) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) #endif mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-12 * SIZE, A2, %xmm8) mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm9) mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm10) mulpd %xmm12, %xmm11 addpd %xmm11, %xmm3 MOVUPS_A2(-12 * SIZE, A2, 
LDA3, 1, %xmm11) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) #endif mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 MOVUPS_A1(-10 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm9 addpd %xmm9, %xmm5 MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm9) mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 MOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm10) mulpd %xmm12, %xmm11 MOVUPS_XL1(-8 * SIZE, X1, %xmm12) addpd %xmm11, %xmm7 MOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm11) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 2) #endif mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-10 * SIZE, A2, %xmm8) mulpd %xmm13, %xmm9 addpd %xmm9, %xmm1 MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm9) mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm10) mulpd %xmm13, %xmm11 addpd %xmm11, %xmm3 MOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm11) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA3) #endif mulpd %xmm13, %xmm8 addpd %xmm8, %xmm4 MOVUPS_A1(-8 * SIZE, A1, %xmm8) mulpd %xmm13, %xmm9 addpd %xmm9, %xmm5 MOVUPS_A2(-8 * SIZE, A1, LDA, 1, %xmm9) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1) #endif mulpd %xmm13, %xmm10 addpd %xmm10, %xmm6 MOVUPS_A2(-8 * SIZE, A1, LDA, 2, %xmm10) mulpd %xmm13, %xmm11 MOVUPS_XL1(-6 * SIZE, X1, %xmm13) addpd %xmm11, %xmm7 MOVUPS_A2(-8 * SIZE, A1, LDA3, 1, %xmm11) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 decq I jg .L12 ALIGN_4 .L13: mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-16 * SIZE, A2, %xmm8) mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm9) mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm10) mulpd %xmm12, %xmm11 addpd %xmm11, %xmm3 MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm11) mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 MOVUPS_A1(-14 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm9 addpd %xmm9, %xmm5 MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9) mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm10) mulpd %xmm12, %xmm11 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) addpd %xmm11, %xmm7 MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm11) mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-14 * SIZE, A2, %xmm8) mulpd %xmm13, %xmm9 addpd %xmm9, %xmm1 MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm9) mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm10) mulpd %xmm13, %xmm11 addpd %xmm11, %xmm3 MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm11) mulpd %xmm13, %xmm8 addpd %xmm8, %xmm4 MOVUPS_A1(-12 * SIZE, A1, %xmm8) mulpd %xmm13, %xmm9 addpd %xmm9, %xmm5 MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm9) mulpd %xmm13, %xmm10 addpd %xmm10, %xmm6 MOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm10) mulpd %xmm13, %xmm11 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) addpd %xmm11, %xmm7 MOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm11) mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-12 * SIZE, A2, %xmm8) mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm9) mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm10) mulpd %xmm12, %xmm11 addpd %xmm11, %xmm3 MOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm11) mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 MOVUPS_A1(-10 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm9 addpd %xmm9, %xmm5 MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm9) mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 MOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm10) mulpd %xmm12, %xmm11 addpd %xmm11, %xmm7 MOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm11) mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-10 * SIZE, A2, %xmm8) mulpd %xmm13, %xmm9 addpd %xmm9, %xmm1 MOVUPS_A2(-10 * SIZE, A2, LDA, 1, 
%xmm9) mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm10) mulpd %xmm13, %xmm11 addpd %xmm11, %xmm3 MOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm11) mulpd %xmm13, %xmm8 addpd %xmm8, %xmm4 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm5 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm6 mulpd %xmm13, %xmm11 addpd %xmm11, %xmm7 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_4 .L15: testq $4, M jle .L16 MOVUPS_A1(-16 * SIZE, A1, %xmm8) MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm10) MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm11) MOVUPS_XL1(-16 * SIZE, X1, %xmm12) MOVUPS_XL1(-14 * SIZE, X1, %xmm13) mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-16 * SIZE, A2, %xmm8) mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm9) mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm10) mulpd %xmm12, %xmm11 addpd %xmm11, %xmm3 MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm11) mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 MOVUPS_A1(-14 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm9 addpd %xmm9, %xmm5 MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9) mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm10) mulpd %xmm12, %xmm11 addpd %xmm11, %xmm7 MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm11) mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-14 * SIZE, A2, %xmm8) mulpd %xmm13, %xmm9 addpd %xmm9, %xmm1 MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm9) mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm10) mulpd %xmm13, %xmm11 addpd %xmm11, %xmm3 MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm11) mulpd %xmm13, %xmm8 addpd %xmm8, %xmm4 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm5 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm6 mulpd %xmm13, %xmm11 addpd %xmm11, %xmm7 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, X1 ALIGN_4 .L16: testq $2, M jle .L17 MOVUPS_A1(-16 * SIZE, A1, %xmm8) MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm10) MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm11) MOVUPS_XL1(-16 * SIZE, X1, %xmm12) mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-16 * SIZE, A2, %xmm8) mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm9) mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm10) mulpd %xmm12, %xmm11 addpd %xmm11, %xmm3 MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm11) mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm5 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 mulpd %xmm12, %xmm11 addpd %xmm11, %xmm7 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_4 .L17: testq $1, M je .L18 movsd -16 * SIZE(X1), %xmm12 movsd -16 * SIZE(A1), %xmm8 mulsd %xmm12, %xmm8 addsd %xmm8, %xmm0 movsd -16 * SIZE(A1, LDA), %xmm9 mulsd %xmm12, %xmm9 addsd %xmm9, %xmm1 movsd -16 * SIZE(A1, LDA, 2), %xmm10 mulsd %xmm12, %xmm10 addsd %xmm10, %xmm2 movsd -16 * SIZE(A1, LDA3), %xmm11 mulsd %xmm12, %xmm11 addsd %xmm11, %xmm3 movsd -16 * SIZE(A2), %xmm8 mulsd %xmm12, %xmm8 addsd %xmm8, %xmm4 movsd -16 * SIZE(A2, LDA), %xmm9 mulsd %xmm12, %xmm9 addsd %xmm9, %xmm5 movsd -16 * SIZE(A2, LDA, 2), %xmm10 mulsd %xmm12, %xmm10 addsd %xmm10, %xmm6 movsd -16 * SIZE(A2, LDA3), %xmm11 mulsd %xmm12, %xmm11 addsd %xmm11, %xmm7 ALIGN_4 .L18: #ifdef HAVE_SSE3 haddpd %xmm1, %xmm0 haddpd %xmm3, %xmm2 haddpd %xmm5, %xmm4 haddpd %xmm7, %xmm6 #else movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm2, %xmm9 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm9 movapd %xmm4, %xmm10 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm10 
movapd %xmm6, %xmm11 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm11 addpd %xmm8, %xmm0 addpd %xmm9, %xmm2 addpd %xmm10, %xmm4 addpd %xmm11, %xmm6 #endif mulpd ALPHA, %xmm0 mulpd ALPHA, %xmm2 mulpd ALPHA, %xmm4 mulpd ALPHA, %xmm6 cmpq $SIZE, INCY jne .L19 movsd 0 * SIZE(Y), %xmm8 movhpd 1 * SIZE(Y), %xmm8 movsd 2 * SIZE(Y), %xmm9 movhpd 3 * SIZE(Y), %xmm9 movsd 4 * SIZE(Y), %xmm10 movhpd 5 * SIZE(Y), %xmm10 movsd 6 * SIZE(Y), %xmm11 movhpd 7 * SIZE(Y), %xmm11 addq $8 * SIZE, Y addpd %xmm8, %xmm0 addpd %xmm9, %xmm2 addpd %xmm10, %xmm4 addpd %xmm11, %xmm6 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) movlpd %xmm2, 2 * SIZE(Y1) movhpd %xmm2, 3 * SIZE(Y1) movlpd %xmm4, 4 * SIZE(Y1) movhpd %xmm4, 5 * SIZE(Y1) movlpd %xmm6, 6 * SIZE(Y1) movhpd %xmm6, 7 * SIZE(Y1) addq $8 * SIZE, Y1 cmpq $8, N jge .L11 jmp .L20 ALIGN_4 .L19: movsd (Y), %xmm8 addq INCY, Y movhpd (Y), %xmm8 addq INCY, Y movsd (Y), %xmm9 addq INCY, Y movhpd (Y), %xmm9 addq INCY, Y movsd (Y), %xmm10 addq INCY, Y movhpd (Y), %xmm10 addq INCY, Y movsd (Y), %xmm11 addq INCY, Y movhpd (Y), %xmm11 addq INCY, Y addpd %xmm8, %xmm0 addpd %xmm9, %xmm2 addpd %xmm10, %xmm4 addpd %xmm11, %xmm6 movlpd %xmm0, (Y1) addq INCY, Y1 movhpd %xmm0, (Y1) addq INCY, Y1 movlpd %xmm2, (Y1) addq INCY, Y1 movhpd %xmm2, (Y1) addq INCY, Y1 movlpd %xmm4, (Y1) addq INCY, Y1 movhpd %xmm4, (Y1) addq INCY, Y1 movlpd %xmm6, (Y1) addq INCY, Y1 movhpd %xmm6, (Y1) addq INCY, Y1 cmpq $8, N jge .L11 ALIGN_4 .L20: #endif #if GEMV_UNROLL >= 4 cmpq $4, N jl .L30 #if GEMV_UNROLL == 4 ALIGN_3 .L21: #endif subq $4, N leaq 16 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA, 2), A2 leaq (A1, LDA, 4), A xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #if (GEMV_UNROLL == 4 ) && defined(PREFETCHW) PREFETCHW 3 * SIZE(Y1) #endif #ifdef ALIGNED_ACCESS testq $SIZE, A je .L2X movsd -16 * SIZE(X1), %xmm12 movsd -16 * SIZE(A1), %xmm8 mulsd %xmm12, %xmm8 addsd %xmm8, %xmm0 movsd -16 * SIZE(A1, LDA), %xmm9 mulsd %xmm12, %xmm9 addsd %xmm9, %xmm1 movsd -16 * SIZE(A2), %xmm10 mulsd %xmm12, %xmm10 addsd %xmm10, %xmm2 movsd -16 * SIZE(A2, LDA), %xmm11 mulsd %xmm12, %xmm11 addsd %xmm11, %xmm3 addq $SIZE, A1 addq $SIZE, A2 addq $SIZE, X1 ALIGN_3 .L2X: #endif movq M, I sarq $3, I jle .L25 MOVUPS_A1(-16 * SIZE, A1, %xmm8) MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) MOVUPS_A1(-16 * SIZE, A2, %xmm10) MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11) MOVUPS_XL1(-16 * SIZE, X1, %xmm12) MOVUPS_XL1(-14 * SIZE, X1, %xmm13) decq I jle .L23 ALIGN_4 .L22: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-14 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9) mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A1(-14 * SIZE, A2, %xmm10) mulpd %xmm12, %xmm11 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) addpd %xmm11, %xmm3 MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm11) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) #endif mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-12 * SIZE, A1, %xmm8) mulpd %xmm13, %xmm9 addpd %xmm9, %xmm1 MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm9) mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A1(-12 * SIZE, A2, %xmm10) mulpd %xmm13, %xmm11 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) addpd %xmm11, %xmm3 MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm11) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-10 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 MOVUPS_A2(-10 * SIZE, A1, LDA, 1, 
%xmm9) mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A1(-10 * SIZE, A2, %xmm10) mulpd %xmm12, %xmm11 MOVUPS_XL1( -8 * SIZE, X1, %xmm12) addpd %xmm11, %xmm3 MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm11) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) #endif mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1( -8 * SIZE, A1, %xmm8) mulpd %xmm13, %xmm9 addpd %xmm9, %xmm1 MOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm9) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) #endif mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A1( -8 * SIZE, A2, %xmm10) mulpd %xmm13, %xmm11 MOVUPS_XL1( -6 * SIZE, X1, %xmm13) addpd %xmm11, %xmm3 MOVUPS_A2( -8 * SIZE, A2, LDA, 1, %xmm11) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 decq I jg .L22 ALIGN_4 .L23: mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-14 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9) mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A1(-14 * SIZE, A2, %xmm10) mulpd %xmm12, %xmm11 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) addpd %xmm11, %xmm3 MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm11) mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-12 * SIZE, A1, %xmm8) mulpd %xmm13, %xmm9 addpd %xmm9, %xmm1 MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm9) mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A1(-12 * SIZE, A2, %xmm10) mulpd %xmm13, %xmm11 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) addpd %xmm11, %xmm3 MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm11) mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-10 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm9) mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A1(-10 * SIZE, A2, %xmm10) mulpd %xmm12, %xmm11 addpd %xmm11, %xmm3 MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm11) mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm1 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 mulpd %xmm13, %xmm11 addpd %xmm11, %xmm3 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_4 .L25: testq $4, M jle .L26 MOVUPS_XL1(-16 * SIZE, X1, %xmm12) MOVUPS_A1(-16 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 MOVUPS_A1(-16 * SIZE, A2, %xmm10) mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11) mulpd %xmm12, %xmm11 addpd %xmm11, %xmm3 MOVUPS_XL1(-14 * SIZE, X1, %xmm13) MOVUPS_A1(-14 * SIZE, A1, %xmm8) mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9) mulpd %xmm13, %xmm9 addpd %xmm9, %xmm1 MOVUPS_A1(-14 * SIZE, A2, %xmm10) mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm11) mulpd %xmm13, %xmm11 addpd %xmm11, %xmm3 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, X1 ALIGN_4 .L26: testq $2, M jle .L27 MOVUPS_XL1(-16 * SIZE, X1, %xmm12) MOVUPS_A1(-16 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 MOVUPS_A1(-16 * SIZE, A2, %xmm10) mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11) mulpd %xmm12, %xmm11 addpd %xmm11, %xmm3 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_4 .L27: testq $1, M je .L28 movsd -16 * SIZE(X1), %xmm12 movsd -16 * SIZE(A1), %xmm8 mulsd %xmm12, %xmm8 addsd %xmm8, %xmm0 movsd -16 * SIZE(A1, LDA), %xmm9 mulsd %xmm12, %xmm9 addsd %xmm9, %xmm1 movsd -16 * SIZE(A2), %xmm10 mulsd %xmm12, %xmm10 addsd %xmm10, %xmm2 movsd -16 * SIZE(A2, LDA), %xmm11 mulsd %xmm12, %xmm11 addsd 
%xmm11, %xmm3 ALIGN_4 .L28: #ifdef HAVE_SSE3 haddpd %xmm1, %xmm0 haddpd %xmm3, %xmm2 #else movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm2, %xmm9 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm9 addpd %xmm8, %xmm0 addpd %xmm9, %xmm2 #endif mulpd ALPHA, %xmm0 mulpd ALPHA, %xmm2 cmpq $SIZE, INCY jne .L29 movsd 0 * SIZE(Y), %xmm4 movhpd 1 * SIZE(Y), %xmm4 movsd 2 * SIZE(Y), %xmm5 movhpd 3 * SIZE(Y), %xmm5 addq $4 * SIZE, Y addpd %xmm4, %xmm0 addpd %xmm5, %xmm2 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) movlpd %xmm2, 2 * SIZE(Y1) movhpd %xmm2, 3 * SIZE(Y1) addq $4 * SIZE, Y1 #if GEMV_UNROLL == 4 cmpq $4, N jge .L21 #endif jmp .L30 ALIGN_4 .L29: movsd (Y), %xmm4 addq INCY, Y movhpd (Y), %xmm4 addq INCY, Y movsd (Y), %xmm5 addq INCY, Y movhpd (Y), %xmm5 addq INCY, Y addpd %xmm4, %xmm0 addpd %xmm5, %xmm2 movlpd %xmm0, (Y1) addq INCY, Y1 movhpd %xmm0, (Y1) addq INCY, Y1 movlpd %xmm2, (Y1) addq INCY, Y1 movhpd %xmm2, (Y1) addq INCY, Y1 #if GEMV_UNROLL == 4 cmpq $4, N jge .L21 #endif ALIGN_4 .L30: #endif #if GEMV_UNROLL >= 2 cmpq $2, N jl .L40 #if GEMV_UNROLL == 2 ALIGN_3 .L31: #endif subq $2, N leaq 16 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA), A2 leaq (A1, LDA, 2), A xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #if (GEMV_UNROLL == 2 ) && defined(PREFETCHW) PREFETCHW 2 * SIZE(Y1) #endif #ifdef ALIGNED_ACCESS testq $SIZE, A je .L3X movsd -16 * SIZE(X1), %xmm12 movsd -16 * SIZE(A1), %xmm8 mulsd %xmm12, %xmm8 addsd %xmm8, %xmm0 movsd -16 * SIZE(A2), %xmm9 mulsd %xmm12, %xmm9 addsd %xmm9, %xmm1 addq $SIZE, A1 addq $SIZE, A2 addq $SIZE, X1 ALIGN_3 .L3X: #endif movq M, I sarq $3, I jle .L35 MOVUPS_A1(-16 * SIZE, A1, %xmm8) MOVUPS_A1(-16 * SIZE, A2, %xmm9) MOVUPS_A1(-14 * SIZE, A1, %xmm10) MOVUPS_A1(-14 * SIZE, A2, %xmm11) MOVUPS_XL1(-16 * SIZE, X1, %xmm12) MOVUPS_XL1(-14 * SIZE, X1, %xmm13) decq I jle .L33 ALIGN_4 .L32: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-12 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm9 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) addpd %xmm9, %xmm1 MOVUPS_A1(-12 * SIZE, A2, %xmm9) mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A1(-10 * SIZE, A1, %xmm10) mulpd %xmm13, %xmm11 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) addpd %xmm11, %xmm3 MOVUPS_A1(-10 * SIZE, A2, %xmm11) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) #endif mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1( -8 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm9 MOVUPS_XL1( -8 * SIZE, X1, %xmm12) addpd %xmm9, %xmm1 MOVUPS_A1( -8 * SIZE, A2, %xmm9) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) #endif mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A1( -6 * SIZE, A1, %xmm10) mulpd %xmm13, %xmm11 MOVUPS_XL1( -6 * SIZE, X1, %xmm13) addpd %xmm11, %xmm3 MOVUPS_A1( -6 * SIZE, A2, %xmm11) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 decq I jg .L32 ALIGN_4 .L33: mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-12 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm9 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) addpd %xmm9, %xmm1 MOVUPS_A1(-12 * SIZE, A2, %xmm9) mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A1(-10 * SIZE, A1, %xmm10) mulpd %xmm13, %xmm11 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) addpd %xmm11, %xmm3 MOVUPS_A1(-10 * SIZE, A2, %xmm11) mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 mulpd %xmm13, %xmm11 addpd %xmm11, %xmm3 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_4 .L35: testq $4, M jle 
.L36 MOVUPS_XL1(-16 * SIZE, X1, %xmm12) MOVUPS_A1(-16 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-16 * SIZE, A2, %xmm9) mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 MOVUPS_XL1(-14 * SIZE, X1, %xmm13) MOVUPS_A1(-14 * SIZE, A1, %xmm10) mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A1(-14 * SIZE, A2, %xmm11) mulpd %xmm13, %xmm11 addpd %xmm11, %xmm3 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, X1 ALIGN_4 .L36: testq $2, M jle .L37 MOVUPS_XL1(-16 * SIZE, X1, %xmm12) MOVUPS_A1(-16 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-16 * SIZE, A2, %xmm9) mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_4 .L37: testq $1, M je .L38 movsd -16 * SIZE(X1), %xmm12 movsd -16 * SIZE(A1), %xmm8 mulsd %xmm12, %xmm8 addsd %xmm8, %xmm0 movsd -16 * SIZE(A2), %xmm9 mulsd %xmm12, %xmm9 addsd %xmm9, %xmm1 ALIGN_4 .L38: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 #ifdef HAVE_SSE3 haddpd %xmm1, %xmm0 #else movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 addpd %xmm8, %xmm0 #endif mulpd ALPHA, %xmm0 movsd (Y), %xmm4 addq INCY, Y movhpd (Y), %xmm4 addq INCY, Y addpd %xmm4, %xmm0 movlpd %xmm0, (Y1) addq INCY, Y1 movhpd %xmm0, (Y1) addq INCY, Y1 #if GEMV_UNROLL == 2 cmpq $2, N jge .L31 #endif ALIGN_4 .L40: cmpq $1, N jl .L999 #endif leaq 16 * SIZE(BUFFER), X1 movq A, A1 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifdef ALIGNED_ACCESS testq $SIZE, A je .L4X movsd -16 * SIZE(X1), %xmm12 movsd -16 * SIZE(A1), %xmm8 mulsd %xmm12, %xmm8 addsd %xmm8, %xmm0 addq $SIZE, A1 addq $SIZE, X1 ALIGN_3 .L4X: #endif movq M, I sarq $3, I jle .L45 MOVUPS_A1(-16 * SIZE, A1, %xmm8) MOVUPS_A1(-14 * SIZE, A1, %xmm9) MOVUPS_A1(-12 * SIZE, A1, %xmm10) MOVUPS_A1(-10 * SIZE, A1, %xmm11) MOVUPS_XL1(-16 * SIZE, X1, %xmm12) MOVUPS_XL1(-14 * SIZE, X1, %xmm13) decq I jle .L43 ALIGN_4 .L42: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) #endif mulpd %xmm12, %xmm8 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) addpd %xmm8, %xmm0 MOVUPS_A1( -8 * SIZE, A1, %xmm8) mulpd %xmm13, %xmm9 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) addpd %xmm9, %xmm2 MOVUPS_A1( -6 * SIZE, A1, %xmm9) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1) #endif mulpd %xmm12, %xmm10 MOVUPS_XL1( -8 * SIZE, X1, %xmm12) addpd %xmm10, %xmm0 MOVUPS_A1( -4 * SIZE, A1, %xmm10) mulpd %xmm13, %xmm11 MOVUPS_XL1( -6 * SIZE, X1, %xmm13) addpd %xmm11, %xmm2 MOVUPS_A1( -2 * SIZE, A1, %xmm11) addq $8 * SIZE, A1 addq $8 * SIZE, X1 decq I jg .L42 ALIGN_4 .L43: mulpd %xmm12, %xmm8 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) addpd %xmm8, %xmm0 mulpd %xmm13, %xmm9 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) addpd %xmm9, %xmm2 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm0 mulpd %xmm13, %xmm11 addpd %xmm11, %xmm2 addq $8 * SIZE, A1 addq $8 * SIZE, X1 ALIGN_4 .L45: testq $4, M jle .L46 MOVUPS_A1(-16 * SIZE, A1, %xmm8) MOVUPS_A1(-14 * SIZE, A1, %xmm9) MOVUPS_XL1(-16 * SIZE, X1, %xmm12) MOVUPS_XL1(-14 * SIZE, X1, %xmm13) mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm2 addq $4 * SIZE, A1 addq $4 * SIZE, X1 ALIGN_4 .L46: testq $2, M jle .L47 MOVUPS_XL1(-16 * SIZE, X1, %xmm12) MOVUPS_A1(-16 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 addq $2 * SIZE, A1 addq $2 * SIZE, X1 ALIGN_4 .L47: testq $1, M je .L48 movsd -16 * SIZE(X1), %xmm12 movsd -16 * SIZE(A1), %xmm8 mulsd %xmm12, %xmm8 addsd %xmm8, %xmm0 ALIGN_4 .L48: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 addpd %xmm1, %xmm0 #ifdef HAVE_SSE3 haddpd %xmm1, %xmm0 #else movapd %xmm0, %xmm8 
unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 addsd %xmm8, %xmm0 #endif mulsd ALPHA, %xmm0 movsd (Y), %xmm4 addq INCY, Y addsd %xmm4, %xmm0 movlpd %xmm0, (Y1) addq INCY, Y1 #ifdef ALIGNED_ACCESS jmp .L999 ALIGN_4 .L50: #if GEMV_UNROLL >= 4 cmpq $4, N jl .L60 ALIGN_3 .L51: subq $4, N leaq 16 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA, 2), A2 leaq (A1, LDA, 4), A xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifdef PREFETCHW PREFETCHW 3 * SIZE(Y1) #endif #ifdef ALIGNED_ACCESS testq $SIZE, A je .L5X movsd -16 * SIZE(X1), %xmm12 movsd -16 * SIZE(A1), %xmm4 mulsd %xmm12, %xmm4 addsd %xmm4, %xmm0 movsd -16 * SIZE(A1, LDA), %xmm5 mulsd %xmm12, %xmm5 addsd %xmm5, %xmm1 movsd -16 * SIZE(A2), %xmm6 mulsd %xmm12, %xmm6 addsd %xmm6, %xmm2 movsd -16 * SIZE(A2, LDA), %xmm7 mulsd %xmm12, %xmm7 addsd %xmm7, %xmm3 addq $SIZE, A1 addq $SIZE, A2 addq $SIZE, X1 ALIGN_3 .L5X: #endif movhpd -16 * SIZE(A1, LDA), %xmm8 movhpd -16 * SIZE(A2, LDA), %xmm9 movq M, I sarq $3, I jle .L55 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) MOVUPS_A1(-16 * SIZE, A2, %xmm6) MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) MOVUPS_XL1(-16 * SIZE, X1, %xmm12) MOVUPS_XL1(-14 * SIZE, X1, %xmm13) decq I jle .L53 ALIGN_4 .L52: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-14 * SIZE, A1, %xmm4) shufpd $1, %xmm5, %xmm8 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm1 MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8) mulpd %xmm12, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A1(-14 * SIZE, A2, %xmm6) shufpd $1, %xmm7, %xmm9 mulpd %xmm12, %xmm9 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) addpd %xmm9, %xmm3 MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET + 8(A1, LDA) #endif mulpd %xmm13, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-12 * SIZE, A1, %xmm4) shufpd $1, %xmm8, %xmm5 mulpd %xmm13, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm5) mulpd %xmm13, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A1(-12 * SIZE, A2, %xmm6) shufpd $1, %xmm9, %xmm7 mulpd %xmm13, %xmm7 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) addpd %xmm7, %xmm3 MOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm7) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-10 * SIZE, A1, %xmm4) shufpd $1, %xmm5, %xmm8 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm1 MOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) mulpd %xmm12, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A1(-10 * SIZE, A2, %xmm6) shufpd $1, %xmm7, %xmm9 mulpd %xmm12, %xmm9 MOVUPS_XL1(-8 * SIZE, X1, %xmm12) addpd %xmm9, %xmm3 MOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET + 8(A2, LDA) #endif mulpd %xmm13, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-8 * SIZE, A1, %xmm4) shufpd $1, %xmm8, %xmm5 mulpd %xmm13, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-7 * SIZE, A1, LDA, 1, %xmm5) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET + 8(X1) #endif mulpd %xmm13, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A1(-8 * SIZE, A2, %xmm6) shufpd $1, %xmm9, %xmm7 mulpd %xmm13, %xmm7 MOVUPS_XL1(-6 * SIZE, X1, %xmm13) addpd %xmm7, %xmm3 MOVUPS_A2(-7 * SIZE, A2, LDA, 1, %xmm7) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 decq I jg .L52 ALIGN_4 .L53: mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-14 * SIZE, A1, %xmm4) shufpd $1, %xmm5, %xmm8 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm1 MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8) mulpd %xmm12, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A1(-14 * SIZE, A2, %xmm6) shufpd $1, %xmm7, %xmm9 mulpd %xmm12, 
%xmm9 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) addpd %xmm9, %xmm3 MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9) mulpd %xmm13, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-12 * SIZE, A1, %xmm4) shufpd $1, %xmm8, %xmm5 mulpd %xmm13, %xmm5 addpd %xmm5, %xmm1 MOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm5) mulpd %xmm13, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A1(-12 * SIZE, A2, %xmm6) shufpd $1, %xmm9, %xmm7 mulpd %xmm13, %xmm7 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) addpd %xmm7, %xmm3 MOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm7) mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-10 * SIZE, A1, %xmm4) shufpd $1, %xmm5, %xmm8 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm1 MOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) mulpd %xmm12, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A1(-10 * SIZE, A2, %xmm6) shufpd $1, %xmm7, %xmm9 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm3 MOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) mulpd %xmm13, %xmm4 addpd %xmm4, %xmm0 shufpd $1, %xmm8, %xmm5 mulpd %xmm13, %xmm5 addpd %xmm5, %xmm1 mulpd %xmm13, %xmm6 addpd %xmm6, %xmm2 shufpd $1, %xmm9, %xmm7 mulpd %xmm13, %xmm7 addpd %xmm7, %xmm3 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_4 .L55: testq $4, M jle .L56 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) MOVUPS_A1(-16 * SIZE, A2, %xmm6) MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) MOVUPS_XL1(-16 * SIZE, X1, %xmm12) MOVUPS_XL1(-14 * SIZE, X1, %xmm13) mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-14 * SIZE, A1, %xmm4) shufpd $1, %xmm5, %xmm8 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm1 MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8) mulpd %xmm12, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A1(-14 * SIZE, A2, %xmm6) shufpd $1, %xmm7, %xmm9 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm3 MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9) mulpd %xmm13, %xmm4 addpd %xmm4, %xmm0 shufpd $1, %xmm8, %xmm5 mulpd %xmm13, %xmm5 addpd %xmm5, %xmm1 mulpd %xmm13, %xmm6 addpd %xmm6, %xmm2 shufpd $1, %xmm9, %xmm7 mulpd %xmm13, %xmm7 addpd %xmm7, %xmm3 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, X1 ALIGN_4 .L56: testq $2, M jle .L57 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) MOVUPS_A1(-16 * SIZE, A2, %xmm6) MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) MOVUPS_XL1(-16 * SIZE, X1, %xmm12) mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 shufpd $1, %xmm5, %xmm8 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm1 movaps %xmm5, %xmm8 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm2 shufpd $1, %xmm7, %xmm9 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm3 movaps %xmm7, %xmm9 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_4 .L57: testq $1, M je .L58 movsd -16 * SIZE(X1), %xmm12 movsd -16 * SIZE(A1), %xmm4 mulsd %xmm12, %xmm4 addsd %xmm4, %xmm0 shufpd $1, %xmm8, %xmm8 mulsd %xmm12, %xmm8 addsd %xmm8, %xmm1 movsd -16 * SIZE(A2), %xmm6 mulsd %xmm12, %xmm6 addsd %xmm6, %xmm2 shufpd $1, %xmm9, %xmm9 mulsd %xmm12, %xmm9 addsd %xmm9, %xmm3 ALIGN_4 .L58: #ifdef HAVE_SSE3 haddpd %xmm1, %xmm0 haddpd %xmm3, %xmm2 #else movapd %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm4 movapd %xmm2, %xmm5 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm5 addpd %xmm4, %xmm0 addpd %xmm5, %xmm2 #endif mulpd ALPHA, %xmm0 mulpd ALPHA, %xmm2 cmpq $SIZE, INCY jne .L59 movsd 0 * SIZE(Y), %xmm4 movhpd 1 * SIZE(Y), %xmm4 movsd 2 * SIZE(Y), %xmm5 movhpd 3 * SIZE(Y), %xmm5 addq $4 * SIZE, Y addpd %xmm4, %xmm0 addpd %xmm5, %xmm2 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) movlpd %xmm2, 2 * SIZE(Y1) movhpd %xmm2, 3 * SIZE(Y1) addq $4 * SIZE, Y1 cmpq $4, N jge .L51 jmp .L60 ALIGN_4 .L59: movsd (Y), %xmm4 addq INCY, Y movhpd (Y), %xmm4 addq INCY, Y movsd (Y), %xmm5 addq INCY, Y movhpd (Y), 
%xmm5 addq INCY, Y addpd %xmm4, %xmm0 addpd %xmm5, %xmm2 movlpd %xmm0, (Y1) addq INCY, Y1 movhpd %xmm0, (Y1) addq INCY, Y1 movlpd %xmm2, (Y1) addq INCY, Y1 movhpd %xmm2, (Y1) addq INCY, Y1 cmpq $4, N jge .L51 ALIGN_4 .L60: #endif #if GEMV_UNROLL >= 2 cmpq $2, N jl .L70 #if GEMV_UNROLL == 2 ALIGN_3 .L61: #endif subq $2, N leaq 16 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA), A2 leaq (A1, LDA, 2), A xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #if (GEMV_UNROLL == 2 ) && defined(PREFETCHW) PREFETCHW 2 * SIZE(Y1) #endif #ifdef ALIGNED_ACCESS testq $SIZE, A je .L6X movsd -16 * SIZE(X1), %xmm12 movsd -16 * SIZE(A1), %xmm4 mulsd %xmm12, %xmm4 addsd %xmm4, %xmm0 movsd -16 * SIZE(A2), %xmm5 mulsd %xmm12, %xmm5 addsd %xmm5, %xmm1 addq $SIZE, A1 addq $SIZE, A2 addq $SIZE, X1 ALIGN_3 .L6X: #endif movhpd -16 * SIZE(A2), %xmm8 movq M, I sarq $3, I jle .L65 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A1(-15 * SIZE, A2, %xmm5) MOVUPS_A1(-14 * SIZE, A1, %xmm6) MOVUPS_A1(-13 * SIZE, A2, %xmm7) MOVUPS_XL1(-16 * SIZE, X1, %xmm12) MOVUPS_XL1(-14 * SIZE, X1, %xmm13) decq I jle .L63 ALIGN_4 .L62: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-12 * SIZE, A1, %xmm4) shufpd $1, %xmm5, %xmm8 mulpd %xmm12, %xmm8 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) addpd %xmm8, %xmm1 MOVUPS_A1(-11 * SIZE, A2, %xmm9) mulpd %xmm13, %xmm6 addpd %xmm6, %xmm0 MOVUPS_A1(-10 * SIZE, A1, %xmm6) shufpd $1, %xmm7, %xmm5 mulpd %xmm13, %xmm5 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) addpd %xmm5, %xmm1 MOVUPS_A1( -9 * SIZE, A2, %xmm8) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET + 8(A2) #endif mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-8 * SIZE, A1, %xmm4) shufpd $1, %xmm9, %xmm7 mulpd %xmm12, %xmm7 MOVUPS_XL1(-8 * SIZE, X1, %xmm12) addpd %xmm7, %xmm1 MOVUPS_A1(-7 * SIZE, A2, %xmm5) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET + 8(X1) #endif mulpd %xmm13, %xmm6 addpd %xmm6, %xmm0 MOVUPS_A1(-6 * SIZE, A1, %xmm6) shufpd $1, %xmm8, %xmm9 mulpd %xmm13, %xmm9 MOVUPS_XL1(-6 * SIZE, X1, %xmm13) addpd %xmm9, %xmm1 MOVUPS_A1(-5 * SIZE, A2, %xmm7) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 decq I jg .L62 ALIGN_4 .L63: mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-12 * SIZE, A1, %xmm4) shufpd $1, %xmm5, %xmm8 mulpd %xmm12, %xmm8 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) addpd %xmm8, %xmm1 MOVUPS_A1(-11 * SIZE, A2, %xmm9) mulpd %xmm13, %xmm6 addpd %xmm6, %xmm0 MOVUPS_A1(-10 * SIZE, A1, %xmm6) shufpd $1, %xmm7, %xmm5 mulpd %xmm13, %xmm5 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) addpd %xmm5, %xmm1 MOVUPS_A1( -9 * SIZE, A2, %xmm8) mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 shufpd $1, %xmm9, %xmm7 mulpd %xmm12, %xmm7 addpd %xmm7, %xmm1 mulpd %xmm13, %xmm6 addpd %xmm6, %xmm0 shufpd $1, %xmm8, %xmm9 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm1 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_4 .L65: testq $4, M jle .L66 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A1(-15 * SIZE, A2, %xmm5) MOVUPS_A1(-14 * SIZE, A1, %xmm6) MOVUPS_A1(-13 * SIZE, A2, %xmm7) MOVUPS_XL1(-16 * SIZE, X1, %xmm12) MOVUPS_XL1(-14 * SIZE, X1, %xmm13) mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 shufpd $1, %xmm5, %xmm8 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm1 mulpd %xmm13, %xmm6 addpd %xmm6, %xmm0 shufpd $1, %xmm7, %xmm5 movaps %xmm7, %xmm8 mulpd %xmm13, %xmm5 addpd %xmm5, %xmm1 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, X1 ALIGN_4 .L66: testq $2, M jle .L67 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A1(-15 * SIZE, A2, %xmm5) MOVUPS_XL1(-16 * SIZE, X1, 
%xmm12) mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 shufpd $1, %xmm5, %xmm8 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm1 movaps %xmm5, %xmm8 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_4 .L67: testq $1, M je .L68 movsd -16 * SIZE(X1), %xmm12 movsd -16 * SIZE(A1), %xmm4 mulsd %xmm12, %xmm4 addsd %xmm4, %xmm0 shufpd $1, %xmm8, %xmm8 mulsd %xmm12, %xmm8 addsd %xmm8, %xmm1 ALIGN_4 .L68: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 #ifdef HAVE_SSE3 haddpd %xmm1, %xmm0 #else movapd %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm4 addpd %xmm4, %xmm0 #endif mulpd ALPHA, %xmm0 movsd (Y), %xmm4 addq INCY, Y movhpd (Y), %xmm4 addq INCY, Y addpd %xmm4, %xmm0 movlpd %xmm0, (Y1) addq INCY, Y1 movhpd %xmm0, (Y1) addq INCY, Y1 #if GEMV_UNROLL == 2 cmpq $2, N jge .L61 #endif ALIGN_4 .L70: cmpq $1, N jl .L999 #endif leaq 16 * SIZE(BUFFER), X1 movq A, A1 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifdef ALIGNED_ACCESS testq $SIZE, A je .L7X movsd -16 * SIZE(X1), %xmm12 movsd -16 * SIZE(A1), %xmm4 mulsd %xmm12, %xmm4 addsd %xmm4, %xmm0 addq $SIZE, A1 addq $SIZE, X1 ALIGN_3 .L7X: #endif movq M, I sarq $3, I jle .L75 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A1(-14 * SIZE, A1, %xmm5) MOVUPS_A1(-12 * SIZE, A1, %xmm6) MOVUPS_A1(-10 * SIZE, A1, %xmm7) MOVUPS_XL1(-16 * SIZE, X1, %xmm12) MOVUPS_XL1(-14 * SIZE, X1, %xmm13) decq I jle .L73 ALIGN_4 .L72: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) #endif mulpd %xmm12, %xmm4 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) addpd %xmm4, %xmm0 MOVUPS_A1( -8 * SIZE, A1, %xmm4) mulpd %xmm13, %xmm5 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) addpd %xmm5, %xmm2 MOVUPS_A1( -6 * SIZE, A1, %xmm5) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1) #endif mulpd %xmm12, %xmm6 MOVUPS_XL1( -8 * SIZE, X1, %xmm12) addpd %xmm6, %xmm0 MOVUPS_A1( -4 * SIZE, A1, %xmm6) mulpd %xmm13, %xmm7 MOVUPS_XL1( -6 * SIZE, X1, %xmm13) addpd %xmm7, %xmm2 MOVUPS_A1( -2 * SIZE, A1, %xmm7) addq $8 * SIZE, A1 addq $8 * SIZE, X1 decq I jg .L72 ALIGN_4 .L73: mulpd %xmm12, %xmm4 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) addpd %xmm4, %xmm0 mulpd %xmm13, %xmm5 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) addpd %xmm5, %xmm2 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm13, %xmm7 addpd %xmm7, %xmm2 addq $8 * SIZE, A1 addq $8 * SIZE, X1 ALIGN_4 .L75: testq $4, M jle .L76 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A1(-14 * SIZE, A1, %xmm5) MOVUPS_XL1(-16 * SIZE, X1, %xmm12) MOVUPS_XL1(-14 * SIZE, X1, %xmm13) mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm13, %xmm5 addpd %xmm5, %xmm2 addq $4 * SIZE, A1 addq $4 * SIZE, X1 ALIGN_4 .L76: testq $2, M jle .L77 MOVUPS_XL1(-16 * SIZE, X1, %xmm12) MOVUPS_A1(-16 * SIZE, A1, %xmm4) mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 addq $2 * SIZE, A1 addq $2 * SIZE, X1 ALIGN_4 .L77: testq $1, M je .L78 movsd -16 * SIZE(X1), %xmm12 movsd -16 * SIZE(A1), %xmm4 mulsd %xmm12, %xmm4 addsd %xmm4, %xmm0 ALIGN_4 .L78: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 addpd %xmm1, %xmm0 #ifdef HAVE_SSE3 haddpd %xmm1, %xmm0 #else movapd %xmm0, %xmm4 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm4 addsd %xmm4, %xmm0 #endif mulsd ALPHA, %xmm0 movsd (Y), %xmm4 addq INCY, Y addsd %xmm4, %xmm0 movlpd %xmm0, (Y1) addq INCY, Y1 #endif ALIGN_4 .L999: leaq (, M, SIZE), %rax addq %rax,AA jmp .L0x; ALIGN_4 .L999x: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 
movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret ALIGN_4 EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/dgemv_t_4.c000066400000000000000000000332201313527062700173760ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
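/*
 * Editor's note -- a minimal, self-contained usage sketch (not part of the library)
 * showing the operation that the transposed dgemv kernels in the assembly above and
 * in the C file that follows implement: y = alpha * A^T * x + beta * y.  It assumes
 * OpenBLAS is installed and the program is linked with -lopenblas; cblas_dgemv is
 * the standard CBLAS entry point that eventually dispatches to these kernels.
 */
#include <cblas.h>
#include <stdio.h>

int main(void)
{
    /* 3x2 column-major A: column 0 = (1,2,3), column 1 = (4,5,6) */
    double A[6] = { 1, 2, 3, 4, 5, 6 };
    double x[3] = { 1, 1, 1 };
    double y[2] = { 0, 0 };

    /* y = 1.0 * A^T * x + 0.0 * y, expected result y = {6, 15} */
    cblas_dgemv(CblasColMajor, CblasTrans, 3, 2, 1.0, A, 3, x, 1, 0.0, y, 1);

    printf("%g %g\n", y[0], y[1]);
    return 0;
}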
*****************************************************************************/ #include "common.h" #if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dgemv_t_microk_haswell-4.c" #endif #define NBMAX 2048 #ifndef HAVE_KERNEL_4x4 static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG i; FLOAT *a0,*a1,*a2,*a3; a0 = ap[0]; a1 = ap[1]; a2 = ap[2]; a3 = ap[3]; FLOAT temp0 = 0.0; FLOAT temp1 = 0.0; FLOAT temp2 = 0.0; FLOAT temp3 = 0.0; for ( i=0; i< n; i+=4 ) { temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; } y[0] = temp0; y[1] = temp1; y[2] = temp2; y[3] = temp3; } #endif static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) { BLASLONG i; i=0; __asm__ __volatile__ ( "xorpd %%xmm10 , %%xmm10 \n\t" "xorpd %%xmm11 , %%xmm11 \n\t" "testq $2 , %1 \n\t" "jz 2f \n\t" "movups (%5,%0,8) , %%xmm14 \n\t" // x "movups (%3,%0,8) , %%xmm12 \n\t" // ap0 "movups (%4,%0,8) , %%xmm13 \n\t" // ap1 "mulpd %%xmm14 , %%xmm12 \n\t" "mulpd %%xmm14 , %%xmm13 \n\t" "addq $2 , %0 \n\t" "addpd %%xmm12 , %%xmm10 \n\t" "subq $2 , %1 \n\t" "addpd %%xmm13 , %%xmm11 \n\t" "2: \n\t" "cmpq $0, %1 \n\t" "je 3f \n\t" // ".align 16 \n\t" "1: \n\t" "movups (%5,%0,8) , %%xmm14 \n\t" // x "movups (%3,%0,8) , %%xmm12 \n\t" // ap0 "movups (%4,%0,8) , %%xmm13 \n\t" // ap1 "mulpd %%xmm14 , %%xmm12 \n\t" "mulpd %%xmm14 , %%xmm13 \n\t" "addpd %%xmm12 , %%xmm10 \n\t" "addpd %%xmm13 , %%xmm11 \n\t" "movups 16(%5,%0,8) , %%xmm14 \n\t" // x "movups 16(%3,%0,8) , %%xmm12 \n\t" // ap0 "movups 16(%4,%0,8) , %%xmm13 \n\t" // ap1 "mulpd %%xmm14 , %%xmm12 \n\t" "mulpd %%xmm14 , %%xmm13 \n\t" "addpd %%xmm12 , %%xmm10 \n\t" "addpd %%xmm13 , %%xmm11 \n\t" "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" "jnz 1b \n\t" "3: \n\t" "haddpd %%xmm10, %%xmm10 \n\t" "haddpd %%xmm11, %%xmm11 \n\t" "movsd %%xmm10, (%2) \n\t" "movsd %%xmm11,8(%2) \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (y), // 2 "r" (ap0), // 3 "r" (ap1), // 4 "r" (x) // 5 : "cc", "%xmm4", "%xmm5", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { BLASLONG i; i=0; __asm__ __volatile__ ( "xorpd %%xmm9 , %%xmm9 \n\t" "xorpd %%xmm10 , %%xmm10 \n\t" "testq $2 , %1 \n\t" "jz 2f \n\t" "movups (%3,%0,8) , %%xmm12 \n\t" "movups (%4,%0,8) , %%xmm11 \n\t" "mulpd %%xmm11 , %%xmm12 \n\t" "addq $2 , %0 \n\t" "addpd %%xmm12 , %%xmm10 \n\t" "subq $2 , %1 \n\t" "2: \n\t" "cmpq $0, %1 \n\t" "je 3f \n\t" // ".align 16 \n\t" "1: \n\t" "movups (%3,%0,8) , %%xmm12 \n\t" "movups 16(%3,%0,8) , %%xmm14 \n\t" "movups (%4,%0,8) , %%xmm11 \n\t" "movups 16(%4,%0,8) , %%xmm13 \n\t" "mulpd %%xmm11 , %%xmm12 \n\t" "mulpd %%xmm13 , %%xmm14 \n\t" "addq $4 , %0 \n\t" "addpd %%xmm12 , %%xmm10 \n\t" "subq $4 , %1 \n\t" "addpd %%xmm14 , %%xmm9 \n\t" "jnz 1b \n\t" "3: \n\t" "addpd %%xmm9 , %%xmm10 \n\t" "haddpd %%xmm10, %%xmm10 \n\t" "movsd %%xmm10, (%2) \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (y), // 2 "r" (ap), // 3 "r" (x) // 4 : "cc", "%xmm9", "%xmm10" , "%xmm11", "%xmm12", "%xmm13", "%xmm14", 
"memory" ); } static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { BLASLONG i; for ( i=0; i> 2 ; n2 = n & 3 ; m3 = m & 3 ; m1 = m & -4 ; m2 = (m & (NBMAX-1)) - m3 ; BLASLONG NB = NBMAX; while ( NB == NBMAX ) { m1 -= NB; if ( m1 < 0) { if ( m2 == 0 ) break; NB = m2; } y_ptr = y; a_ptr = a; x_ptr = x; if ( inc_x == 1 ) xbuffer = x_ptr; else copy_x(NB,x_ptr,xbuffer,inc_x); FLOAT *ap[4]; FLOAT *yp; BLASLONG register lda4 = 4 * lda; ap[0] = a_ptr; ap[1] = a_ptr + lda; ap[2] = ap[1] + lda; ap[3] = ap[2] + lda; if ( n0 > 0 ) { BLASLONG nb1 = NBMAX / 4; for( j=0; j 0 ) { add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); y_ptr += n1 * inc_y * 4; a_ptr += n1 * lda4 ; } if ( n2 & 2 ) { dgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer); a_ptr += lda * 2; *y_ptr += ybuffer[0] * alpha; y_ptr += inc_y; *y_ptr += ybuffer[1] * alpha; y_ptr += inc_y; } if ( n2 & 1 ) { dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); a_ptr += lda; *y_ptr += ybuffer[0] * alpha; y_ptr += inc_y; } a += NB; x += NB * inc_x; } if ( m3 == 0 ) return(0); x_ptr = x; a_ptr = a; if ( m3 == 3 ) { FLOAT xtemp0 = *x_ptr * alpha; x_ptr += inc_x; FLOAT xtemp1 = *x_ptr * alpha; x_ptr += inc_x; FLOAT xtemp2 = *x_ptr * alpha; FLOAT *aj = a_ptr; y_ptr = y; if ( lda == 3 && inc_y == 1 ) { for ( j=0; j< ( n & -4) ; j+=4 ) { y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; aj += 12; } for ( ; j= 8 cmpq $8, N jl .L20 ALIGN_3 .L11: subq $8, N leaq 16 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA, 4), A2 leaq (A1, LDA, 8), A vxorps %xmm0 , %xmm0, %xmm0 vxorps %xmm1 , %xmm1, %xmm1 vxorps %xmm2 , %xmm2, %xmm2 vxorps %xmm3 , %xmm3, %xmm3 vxorps %xmm4 , %xmm4, %xmm4 vxorps %xmm5 , %xmm5, %xmm5 vxorps %xmm6 , %xmm6, %xmm6 vxorps %xmm7 , %xmm7, %xmm7 movq M, I sarq $3, I jle .L15 VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) decq I jle .L13 ALIGN_4 .L12: prefetchnta A_PRE(A1) prefetchnta A_PRE(A2) vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 vfmaddpd %xmm2 , -16 * SIZE(A1 , LDA , 2) , %xmm12 , %xmm2 vfmaddpd %xmm3 , -16 * SIZE(A1 , LDA3, 1) , %xmm12 , %xmm3 prefetchnta A_PRE(A1,LDA,1) prefetchnta A_PRE(A2,LDA,1) vfmaddpd %xmm4 , -16 * SIZE(A2) , %xmm12 , %xmm4 vfmaddpd %xmm5 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm5 vfmaddpd %xmm6 , -16 * SIZE(A2 , LDA , 2) , %xmm12 , %xmm6 vfmaddpd %xmm7 , -16 * SIZE(A2 , LDA3, 1) , %xmm12 , %xmm7 VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) prefetchnta A_PRE(A1,LDA,2) prefetchnta A_PRE(A2,LDA,2) vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 vfmaddpd %xmm2 , -14 * SIZE(A1 , LDA , 2) , %xmm13 , %xmm2 vfmaddpd %xmm3 , -14 * SIZE(A1 , LDA3, 1) , %xmm13 , %xmm3 prefetchnta A_PRE(A1,LDA3,1) prefetchnta A_PRE(A2,LDA3,1) vfmaddpd %xmm4 , -14 * SIZE(A2) , %xmm13 , %xmm4 vfmaddpd %xmm5 , -14 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm5 vfmaddpd %xmm6 , -14 * SIZE(A2 , LDA , 2) , %xmm13 , %xmm6 vfmaddpd %xmm7 , -14 * SIZE(A2 , LDA3, 1) , %xmm13 , %xmm7 prefetchnta A_PRE(X1) VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm1 , -12 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 vfmaddpd %xmm2 , -12 * SIZE(A1 , LDA , 2) , %xmm12 , %xmm2 vfmaddpd %xmm3 , -12 * SIZE(A1 , LDA3, 1) , %xmm12 , %xmm3 vfmaddpd %xmm4 , -12 * SIZE(A2) , %xmm12 , %xmm4 
vfmaddpd %xmm5 , -12 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm5 vfmaddpd %xmm6 , -12 * SIZE(A2 , LDA , 2) , %xmm12 , %xmm6 vfmaddpd %xmm7 , -12 * SIZE(A2 , LDA3, 1) , %xmm12 , %xmm7 VMOVUPS_XL1(-8 * SIZE, X1, %xmm12) vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0 vfmaddpd %xmm1 , -10 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 vfmaddpd %xmm2 , -10 * SIZE(A1 , LDA , 2) , %xmm13 , %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A1 , LDA3, 1) , %xmm13 , %xmm3 vfmaddpd %xmm4 , -10 * SIZE(A2) , %xmm13 , %xmm4 vfmaddpd %xmm5 , -10 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm5 vfmaddpd %xmm6 , -10 * SIZE(A2 , LDA , 2) , %xmm13 , %xmm6 vfmaddpd %xmm7 , -10 * SIZE(A2 , LDA3, 1) , %xmm13 , %xmm7 VMOVUPS_XL1(-6 * SIZE, X1, %xmm13) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 decq I jg .L12 ALIGN_4 .L13: vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 vfmaddpd %xmm2 , -16 * SIZE(A1 , LDA , 2) , %xmm12 , %xmm2 vfmaddpd %xmm3 , -16 * SIZE(A1 , LDA3, 1) , %xmm12 , %xmm3 vfmaddpd %xmm4 , -16 * SIZE(A2) , %xmm12 , %xmm4 vfmaddpd %xmm5 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm5 vfmaddpd %xmm6 , -16 * SIZE(A2 , LDA , 2) , %xmm12 , %xmm6 vfmaddpd %xmm7 , -16 * SIZE(A2 , LDA3, 1) , %xmm12 , %xmm7 VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 vfmaddpd %xmm2 , -14 * SIZE(A1 , LDA , 2) , %xmm13 , %xmm2 vfmaddpd %xmm3 , -14 * SIZE(A1 , LDA3, 1) , %xmm13 , %xmm3 vfmaddpd %xmm4 , -14 * SIZE(A2) , %xmm13 , %xmm4 vfmaddpd %xmm5 , -14 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm5 vfmaddpd %xmm6 , -14 * SIZE(A2 , LDA , 2) , %xmm13 , %xmm6 vfmaddpd %xmm7 , -14 * SIZE(A2 , LDA3, 1) , %xmm13 , %xmm7 VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm1 , -12 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 vfmaddpd %xmm2 , -12 * SIZE(A1 , LDA , 2) , %xmm12 , %xmm2 vfmaddpd %xmm3 , -12 * SIZE(A1 , LDA3, 1) , %xmm12 , %xmm3 vfmaddpd %xmm4 , -12 * SIZE(A2) , %xmm12 , %xmm4 vfmaddpd %xmm5 , -12 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm5 vfmaddpd %xmm6 , -12 * SIZE(A2 , LDA , 2) , %xmm12 , %xmm6 vfmaddpd %xmm7 , -12 * SIZE(A2 , LDA3, 1) , %xmm12 , %xmm7 vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0 vfmaddpd %xmm1 , -10 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 vfmaddpd %xmm2 , -10 * SIZE(A1 , LDA , 2) , %xmm13 , %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A1 , LDA3, 1) , %xmm13 , %xmm3 vfmaddpd %xmm4 , -10 * SIZE(A2) , %xmm13 , %xmm4 vfmaddpd %xmm5 , -10 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm5 vfmaddpd %xmm6 , -10 * SIZE(A2 , LDA , 2) , %xmm13 , %xmm6 vfmaddpd %xmm7 , -10 * SIZE(A2 , LDA3, 1) , %xmm13 , %xmm7 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_4 .L15: testq $4, M jle .L16 VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 vfmaddpd %xmm2 , -16 * SIZE(A1 , LDA , 2) , %xmm12 , %xmm2 vfmaddpd %xmm3 , -16 * SIZE(A1 , LDA3, 1) , %xmm12 , %xmm3 vfmaddpd %xmm4 , -16 * SIZE(A2) , %xmm12 , %xmm4 vfmaddpd %xmm5 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm5 vfmaddpd %xmm6 , -16 * SIZE(A2 , LDA , 2) , %xmm12 , %xmm6 vfmaddpd %xmm7 , -16 * SIZE(A2 , LDA3, 1) , %xmm12 , %xmm7 vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 vfmaddpd %xmm2 , -14 * SIZE(A1 , LDA , 2) , %xmm13 , %xmm2 vfmaddpd %xmm3 , -14 * SIZE(A1 , LDA3, 1) , %xmm13 , %xmm3 vfmaddpd %xmm4 , -14 * SIZE(A2) , %xmm13 , 
%xmm4 vfmaddpd %xmm5 , -14 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm5 vfmaddpd %xmm6 , -14 * SIZE(A2 , LDA , 2) , %xmm13 , %xmm6 vfmaddpd %xmm7 , -14 * SIZE(A2 , LDA3, 1) , %xmm13 , %xmm7 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, X1 ALIGN_4 .L16: testq $2, M jle .L17 VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 vfmaddpd %xmm2 , -16 * SIZE(A1 , LDA , 2) , %xmm12 , %xmm2 vfmaddpd %xmm3 , -16 * SIZE(A1 , LDA3, 1) , %xmm12 , %xmm3 vfmaddpd %xmm4 , -16 * SIZE(A2) , %xmm12 , %xmm4 vfmaddpd %xmm5 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm5 vfmaddpd %xmm6 , -16 * SIZE(A2 , LDA , 2) , %xmm12 , %xmm6 vfmaddpd %xmm7 , -16 * SIZE(A2 , LDA3, 1) , %xmm12 , %xmm7 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_4 .L17: testq $1, M je .L18 vmovsd -16 * SIZE(X1), %xmm12 vmovsd -16 * SIZE(A1), %xmm8 vmovsd -16 * SIZE(A1, LDA), %xmm9 vmovsd -16 * SIZE(A1, LDA, 2), %xmm10 vmovsd -16 * SIZE(A1, LDA3), %xmm11 vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0 vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1 vfmaddpd %xmm2, %xmm10, %xmm12, %xmm2 vfmaddpd %xmm3, %xmm11, %xmm12, %xmm3 vmovsd -16 * SIZE(A2), %xmm8 vmovsd -16 * SIZE(A2, LDA), %xmm9 vmovsd -16 * SIZE(A2, LDA, 2), %xmm10 vmovsd -16 * SIZE(A2, LDA3), %xmm11 vfmaddpd %xmm4, %xmm8 , %xmm12, %xmm4 vfmaddpd %xmm5, %xmm9 , %xmm12, %xmm5 vfmaddpd %xmm6, %xmm10, %xmm12, %xmm6 vfmaddpd %xmm7, %xmm11, %xmm12, %xmm7 ALIGN_4 .L18: vhaddpd %xmm1, %xmm0 , %xmm0 vhaddpd %xmm3, %xmm2 , %xmm2 vhaddpd %xmm5, %xmm4 , %xmm4 vhaddpd %xmm7, %xmm6 , %xmm6 vmulpd ALPHA, %xmm0 , %xmm0 vmulpd ALPHA, %xmm2 , %xmm2 vmulpd ALPHA, %xmm4 , %xmm4 vmulpd ALPHA, %xmm6 , %xmm6 cmpq $SIZE, INCY jne .L19 vaddpd 0 * SIZE(Y), %xmm0 , %xmm0 vaddpd 2 * SIZE(Y), %xmm2 , %xmm2 vaddpd 4 * SIZE(Y), %xmm4 , %xmm4 vaddpd 6 * SIZE(Y), %xmm6 , %xmm6 addq $8 * SIZE, Y vmovups %xmm0, 0 * SIZE(Y1) vmovups %xmm2, 2 * SIZE(Y1) vmovups %xmm4, 4 * SIZE(Y1) vmovups %xmm6, 6 * SIZE(Y1) addq $8 * SIZE, Y1 cmpq $8, N jge .L11 jmp .L20 ALIGN_4 .L19: vmovsd (Y), %xmm8 addq INCY, Y vmovhpd (Y), %xmm8 , %xmm8 addq INCY, Y vmovsd (Y), %xmm9 addq INCY, Y vmovhpd (Y), %xmm9 , %xmm9 addq INCY, Y vmovsd (Y), %xmm10 addq INCY, Y vmovhpd (Y), %xmm10 , %xmm10 addq INCY, Y vmovsd (Y), %xmm11 addq INCY, Y vmovhpd (Y), %xmm11 , %xmm11 addq INCY, Y vaddpd %xmm8, %xmm0 , %xmm0 vaddpd %xmm9, %xmm2 , %xmm2 vaddpd %xmm10, %xmm4 , %xmm4 vaddpd %xmm11, %xmm6 , %xmm6 vmovlpd %xmm0, (Y1) addq INCY, Y1 vmovhpd %xmm0, (Y1) addq INCY, Y1 vmovlpd %xmm2, (Y1) addq INCY, Y1 vmovhpd %xmm2, (Y1) addq INCY, Y1 vmovlpd %xmm4, (Y1) addq INCY, Y1 vmovhpd %xmm4, (Y1) addq INCY, Y1 vmovlpd %xmm6, (Y1) addq INCY, Y1 vmovhpd %xmm6, (Y1) addq INCY, Y1 cmpq $8, N jge .L11 ALIGN_4 .L20: #endif #if GEMV_UNROLL >= 4 cmpq $4, N jl .L30 #if GEMV_UNROLL == 4 ALIGN_3 .L21: #endif subq $4, N leaq 16 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA, 2), A2 leaq (A1, LDA, 4), A vxorps %xmm0 , %xmm0, %xmm0 vxorps %xmm1 , %xmm1, %xmm1 vxorps %xmm2 , %xmm2, %xmm2 vxorps %xmm3 , %xmm3, %xmm3 movq M, I sarq $3, I jle .L25 VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) decq I jle .L23 ALIGN_4 .L22: prefetchnta A_PRE(A1) prefetchnta A_PRE(A2) vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2 vfmaddpd %xmm3 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm3 VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) prefetchnta A_PRE(A1,LDA,1) prefetchnta A_PRE(A2,LDA,1) vfmaddpd 
%xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 vfmaddpd %xmm2 , -14 * SIZE(A2) , %xmm13 , %xmm2 vfmaddpd %xmm3 , -14 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm3 prefetchnta A_PRE(X1) VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm1 , -12 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 vfmaddpd %xmm2 , -12 * SIZE(A2) , %xmm12 , %xmm2 vfmaddpd %xmm3 , -12 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm3 VMOVUPS_XL1( -8 * SIZE, X1, %xmm12) vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0 vfmaddpd %xmm1 , -10 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 vfmaddpd %xmm2 , -10 * SIZE(A2) , %xmm13 , %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm3 VMOVUPS_XL1( -6 * SIZE, X1, %xmm13) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 decq I jg .L22 ALIGN_4 .L23: vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2 vfmaddpd %xmm3 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm3 VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 vfmaddpd %xmm2 , -14 * SIZE(A2) , %xmm13 , %xmm2 vfmaddpd %xmm3 , -14 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm3 VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm1 , -12 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 vfmaddpd %xmm2 , -12 * SIZE(A2) , %xmm12 , %xmm2 vfmaddpd %xmm3 , -12 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm3 vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0 vfmaddpd %xmm1 , -10 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 vfmaddpd %xmm2 , -10 * SIZE(A2) , %xmm13 , %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm3 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_4 .L25: testq $4, M jle .L26 VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2 vfmaddpd %xmm3 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm3 vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0 vfmaddpd %xmm1 , -14 * SIZE(A1 , LDA , 1) , %xmm13 , %xmm1 vfmaddpd %xmm2 , -14 * SIZE(A2) , %xmm13 , %xmm2 vfmaddpd %xmm3 , -14 * SIZE(A2 , LDA , 1) , %xmm13 , %xmm3 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, X1 ALIGN_4 .L26: testq $2, M jle .L27 VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm1 , -16 * SIZE(A1 , LDA , 1) , %xmm12 , %xmm1 vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2 vfmaddpd %xmm3 , -16 * SIZE(A2 , LDA , 1) , %xmm12 , %xmm3 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_4 .L27: testq $1, M je .L28 vmovsd -16 * SIZE(X1), %xmm12 vmovsd -16 * SIZE(A1), %xmm8 vmovsd -16 * SIZE(A1, LDA), %xmm9 vmovsd -16 * SIZE(A2), %xmm10 vmovsd -16 * SIZE(A2, LDA), %xmm11 vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0 vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1 vfmaddpd %xmm2, %xmm10, %xmm12, %xmm2 vfmaddpd %xmm3, %xmm11, %xmm12, %xmm3 ALIGN_4 .L28: vhaddpd %xmm1, %xmm0 , %xmm0 vhaddpd %xmm3, %xmm2 , %xmm2 vmulpd ALPHA, %xmm0 , %xmm0 vmulpd ALPHA, %xmm2 , %xmm2 cmpq $SIZE, INCY jne .L29 vmovups 0 * SIZE(Y), %xmm4 vmovups 2 * SIZE(Y), %xmm5 addq $4 * SIZE, Y vaddpd %xmm4, %xmm0 , %xmm0 vaddpd %xmm5, %xmm2 , %xmm2 vmovups %xmm0, 0 * SIZE(Y1) vmovups %xmm2, 2 * SIZE(Y1) addq $4 * SIZE, Y1 #if GEMV_UNROLL == 4 cmpq $4, N jge .L21 #endif jmp .L30 ALIGN_4 
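/* Annotation (added for readability; inferred from the surrounding code, not part of
   the original source): .L29 below is the INCY != 1 tail of the 4-column block -- it
   gathers y pairwise with strided vmovsd/vmovhpd loads, adds the alpha-scaled partial
   results held in xmm0/xmm2, and scatters them back through Y1 with the same stride. */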
.L29: vmovsd (Y), %xmm4 addq INCY, Y vmovhpd (Y), %xmm4 , %xmm4 addq INCY, Y vmovsd (Y), %xmm5 addq INCY, Y vmovhpd (Y), %xmm5 , %xmm5 addq INCY, Y vaddpd %xmm4, %xmm0 , %xmm0 vaddpd %xmm5, %xmm2 , %xmm2 vmovlpd %xmm0, (Y1) addq INCY, Y1 vmovhpd %xmm0, (Y1) addq INCY, Y1 vmovlpd %xmm2, (Y1) addq INCY, Y1 vmovhpd %xmm2, (Y1) addq INCY, Y1 #if GEMV_UNROLL == 4 cmpq $4, N jge .L21 #endif ALIGN_4 .L30: #endif #if GEMV_UNROLL >= 2 cmpq $2, N jl .L40 #if GEMV_UNROLL == 2 ALIGN_3 .L31: #endif subq $2, N leaq 16 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA), A2 leaq (A1, LDA, 2), A vxorps %xmm0 , %xmm0, %xmm0 vxorps %xmm1 , %xmm1, %xmm1 vxorps %xmm2 , %xmm2, %xmm2 vxorps %xmm3 , %xmm3, %xmm3 movq M, I sarq $3, I jle .L35 VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) decq I jle .L33 ALIGN_4 .L32: prefetchnta A_PRE(A1) prefetchnta A_PRE(A2) vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm1 , -16 * SIZE(A2) , %xmm12 , %xmm1 prefetchnta A_PRE(X1) VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 vfmaddpd %xmm3 , -14 * SIZE(A2) , %xmm13 , %xmm3 VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm1 , -12 * SIZE(A2) , %xmm12 , %xmm1 VMOVUPS_XL1( -8 * SIZE, X1, %xmm12) vfmaddpd %xmm2 , -10 * SIZE(A1) , %xmm13 , %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A2) , %xmm13 , %xmm3 VMOVUPS_XL1( -6 * SIZE, X1, %xmm13) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 decq I jg .L32 ALIGN_4 .L33: vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm1 , -16 * SIZE(A2) , %xmm12 , %xmm1 VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 vfmaddpd %xmm3 , -14 * SIZE(A2) , %xmm13 , %xmm3 VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm1 , -12 * SIZE(A2) , %xmm12 , %xmm1 vfmaddpd %xmm2 , -10 * SIZE(A1) , %xmm13 , %xmm2 vfmaddpd %xmm3 , -10 * SIZE(A2) , %xmm13 , %xmm3 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_4 .L35: testq $4, M jle .L36 VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm1 , -16 * SIZE(A2) , %xmm12 , %xmm1 vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 vfmaddpd %xmm3 , -14 * SIZE(A2) , %xmm13 , %xmm3 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, X1 ALIGN_4 .L36: testq $2, M jle .L37 VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm1 , -16 * SIZE(A2) , %xmm12 , %xmm1 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_4 .L37: testq $1, M je .L38 vmovsd -16 * SIZE(X1), %xmm12 vmovsd -16 * SIZE(A1), %xmm8 vmovsd -16 * SIZE(A2), %xmm9 vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0 vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1 ALIGN_4 .L38: vaddpd %xmm2, %xmm0 , %xmm0 vaddpd %xmm3, %xmm1 , %xmm1 vhaddpd %xmm1, %xmm0 , %xmm0 mulpd ALPHA, %xmm0 vmovsd (Y), %xmm4 addq INCY, Y vmovhpd (Y), %xmm4 , %xmm4 addq INCY, Y vaddpd %xmm4, %xmm0 , %xmm0 vmovlpd %xmm0, (Y1) addq INCY, Y1 vmovhpd %xmm0, (Y1) addq INCY, Y1 #if GEMV_UNROLL == 2 cmpq $2, N jge .L31 #endif ALIGN_4 .L40: cmpq $1, N jl .L999 #endif leaq 16 * SIZE(BUFFER), X1 movq A, A1 vxorps %xmm0 , %xmm0, %xmm0 vxorps %xmm1 , %xmm1, %xmm1 vxorps %xmm2 , %xmm2, %xmm2 vxorps %xmm3 , %xmm3, %xmm3 movq M, I sarq $3, I jle .L45 VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) decq I jle .L43 ALIGN_4 .L42: prefetchnta A_PRE(A1) vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd 
%xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 prefetchnta A_PRE(X1) VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm2 , -10 * SIZE(A1) , %xmm13 , %xmm2 VMOVUPS_XL1( -8 * SIZE, X1, %xmm12) VMOVUPS_XL1( -6 * SIZE, X1, %xmm13) addq $8 * SIZE, A1 addq $8 * SIZE, X1 decq I jg .L42 ALIGN_4 .L43: vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm2 , -10 * SIZE(A1) , %xmm13 , %xmm2 addq $8 * SIZE, A1 addq $8 * SIZE, X1 ALIGN_4 .L45: testq $4, M jle .L46 VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2 addq $4 * SIZE, A1 addq $4 * SIZE, X1 ALIGN_4 .L46: testq $2, M jle .L47 VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 addq $2 * SIZE, A1 addq $2 * SIZE, X1 ALIGN_4 .L47: testq $1, M je .L48 vmovsd -16 * SIZE(X1), %xmm12 vmovsd -16 * SIZE(A1), %xmm8 vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0 ALIGN_4 .L48: vaddpd %xmm2, %xmm0 , %xmm0 vaddpd %xmm3, %xmm1 , %xmm1 vaddpd %xmm1, %xmm0 , %xmm0 vhaddpd %xmm1, %xmm0 , %xmm0 vmulsd ALPHA, %xmm0 , %xmm0 vmovsd (Y), %xmm4 addq INCY, Y vaddsd %xmm4, %xmm0 , %xmm0 vmovlpd %xmm0, (Y1) addq INCY, Y1 ALIGN_4 .L999: leaq (, M, SIZE), %rax addq %rax,AA jmp .L0x; ALIGN_4 .L999x: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi vmovups 64(%rsp), %xmm6 vmovups 80(%rsp), %xmm7 vmovups 96(%rsp), %xmm8 vmovups 112(%rsp), %xmm9 vmovups 128(%rsp), %xmm10 vmovups 144(%rsp), %xmm11 vmovups 160(%rsp), %xmm12 vmovups 176(%rsp), %xmm13 vmovups 192(%rsp), %xmm14 vmovups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret ALIGN_4 EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/dgemv_t_microk_haswell-4.c000066400000000000000000000114211313527062700223760ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_4x4 1 static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" "vxorpd %%ymm6 , %%ymm6, %%ymm6 \n\t" "vxorpd %%ymm7 , %%ymm7, %%ymm7 \n\t" "testq $0x04, %1 \n\t" "jz 2f \n\t" "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5 \n\t" "vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6 \n\t" "vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7 \n\t" "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" "2: \n\t" "cmpq $0, %1 \n\t" "je 3f \n\t" // ".align 16 \n\t" "1: \n\t" // "prefetcht0 384(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x "vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x // "prefetcht0 384(%4,%0,8) \n\t" "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5 \n\t" // "prefetcht0 384(%5,%0,8) \n\t" "vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6 \n\t" "vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7 \n\t" // "prefetcht0 384(%6,%0,8) \n\t" "vfmadd231pd 32(%4,%0,8), %%ymm13, %%ymm4 \n\t" "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" "addq $8 , %0 \n\t" // "prefetcht0 384(%7,%0,8) \n\t" "vfmadd231pd -32(%6,%0,8), %%ymm13, %%ymm6 \n\t" "subq $8 , %1 \n\t" "vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t" "jnz 1b \n\t" "3: \n\t" "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" "vextractf128 $1 , %%ymm6, %%xmm14 \n\t" "vextractf128 $1 , %%ymm7, %%xmm15 \n\t" "vaddpd %%xmm4, %%xmm12, %%xmm4 \n\t" "vaddpd %%xmm5, %%xmm13, %%xmm5 \n\t" "vaddpd %%xmm6, %%xmm14, %%xmm6 \n\t" "vaddpd %%xmm7, %%xmm15, %%xmm7 \n\t" "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" "vhaddpd %%xmm5, %%xmm5, %%xmm5 \n\t" "vhaddpd %%xmm6, %%xmm6, %%xmm6 \n\t" "vhaddpd %%xmm7, %%xmm7, %%xmm7 \n\t" "vmovsd %%xmm4, (%3) \n\t" "vmovsd %%xmm5, 8(%3) \n\t" "vmovsd %%xmm6, 16(%3) \n\t" "vmovsd %%xmm7, 24(%3) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]) // 7 : "cc", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/dger.c000066400000000000000000000065111313527062700164520ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #if defined(SANDYBRIDGE) #include "dger_microk_sandy-2.c" #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){ FLOAT *X = x; if (incx != 1) { X = buffer; COPY_K(m, x, incx, X, 1); } BLASLONG m1 = m & -16; while (n > 0) { FLOAT y0 = alpha * *y; if ( m1 > 0 ) { #ifdef HAVE_KERNEL_16 dger_kernel_16(m1, X, a, &y0); #else AXPYU_K(m1, 0, 0, y0, X, 1, a, 1, NULL, 0); #endif } if ( m > m1 ) { AXPYU_K(m-m1, 0, 0, y0, X+m1 , 1, a+m1, 1, NULL, 0); } a += lda; y += incy; n --; } return 0; } OpenBLAS-0.2.20/kernel/x86_64/dger_microk_sandy-2.c000066400000000000000000000107011313527062700213470ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16 1 static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vmovddup (%4), %%xmm0 \n\t" // alpha "prefetcht0 256(%3,%0,8) \n\t" "vmovups (%3,%0,8), %%xmm8 \n\t" "vmovups 16(%3,%0,8), %%xmm9 \n\t" "vmovups 32(%3,%0,8), %%xmm10 \n\t" "vmovups 48(%3,%0,8), %%xmm11 \n\t" "prefetcht0 256(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%xmm4 \n\t" "vmovups 16(%2,%0,8), %%xmm5 \n\t" "vmovups 32(%2,%0,8), %%xmm6 \n\t" "vmovups 48(%2,%0,8), %%xmm7 \n\t" "addq $8, %0 \n\t" "subq $8, %1 \n\t" "jz 2f \n\t" ".align 8 \n\t" "1: \n\t" "vmulpd %%xmm4, %%xmm0, %%xmm4 \n\t" "vaddpd %%xmm8 , %%xmm4, %%xmm12 \n\t" "vmulpd %%xmm5, %%xmm0, %%xmm5 \n\t" "vaddpd %%xmm9 , %%xmm5, %%xmm13 \n\t" "vmulpd %%xmm6, %%xmm0, %%xmm6 \n\t" "vaddpd %%xmm10, %%xmm6, %%xmm14 \n\t" "vmulpd %%xmm7, %%xmm0, %%xmm7 \n\t" "vaddpd %%xmm11, %%xmm7, %%xmm15 \n\t" "prefetcht0 256(%3,%0,8) \n\t" "vmovups (%3,%0,8), %%xmm8 \n\t" "vmovups 16(%3,%0,8), %%xmm9 \n\t" "vmovups 32(%3,%0,8), %%xmm10 \n\t" "vmovups 48(%3,%0,8), %%xmm11 \n\t" "prefetcht0 256(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%xmm4 \n\t" "vmovups 16(%2,%0,8), %%xmm5 \n\t" "vmovups 32(%2,%0,8), %%xmm6 \n\t" "vmovups 48(%2,%0,8), %%xmm7 \n\t" "vmovups %%xmm12, -64(%3,%0,8) \n\t" "vmovups %%xmm13, -48(%3,%0,8) \n\t" "vmovups %%xmm14, -32(%3,%0,8) \n\t" "vmovups %%xmm15, -16(%3,%0,8) \n\t" "addq $8, %0 \n\t" "subq $8, %1 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulpd %%xmm4, %%xmm0, %%xmm4 \n\t" "vmulpd %%xmm5, %%xmm0, %%xmm5 \n\t" "vmulpd %%xmm6, %%xmm0, %%xmm6 \n\t" "vmulpd %%xmm7, %%xmm0, %%xmm7 \n\t" "vaddpd %%xmm8 , %%xmm4, %%xmm12 \n\t" "vaddpd %%xmm9 , %%xmm5, %%xmm13 \n\t" "vaddpd %%xmm10, %%xmm6, %%xmm14 \n\t" "vaddpd %%xmm11, %%xmm7, %%xmm15 \n\t" "vmovups %%xmm12, -64(%3,%0,8) \n\t" "vmovups %%xmm13, -48(%3,%0,8) \n\t" "vmovups %%xmm14, -32(%3,%0,8) \n\t" "vmovups %%xmm15, -16(%3,%0,8) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/dot.S000066400000000000000000000105151313527062700162760ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define Y ARG4 /* rcx */ #define INCY ARG5 /* r8 */ #include "l1param.h" PROLOGUE PROFCODE salq $BASE_SHIFT, INCX salq $BASE_SHIFT, INCY fldz fldz fldz fldz cmpq $SIZE, INCX jne .L14 cmpq $SIZE, INCY jne .L14 movq N, %rax sarq $2, %rax jle .L15 ALIGN_3 .L16: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmulp %st, %st(1) faddp %st,%st(1) FLD 1 * SIZE(X) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st,%st(2) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif FLD 2 * SIZE(X) FLD 2 * SIZE(Y) fmulp %st, %st(1) faddp %st,%st(3) FLD 3 * SIZE(X) FLD 3 * SIZE(Y) fmulp %st, %st(1) faddp %st,%st(4) addq $4 * SIZE, X addq $4 * SIZE, Y decq %rax jg .L16 ALIGN_3 .L15: movq N, %rax andq $3, %rax jle .L27 ALIGN_3 .L22: FLD (X) addq $SIZE, X FLD (Y) fmulp %st, %st(1) addq $SIZE, Y faddp %st,%st(1) decq %rax jg .L22 jmp .L27 ALIGN_3 .L14: movq N, %rax sarq $2, %rax jle .L30 ALIGN_3 .L31: FLD (X) addq INCX, X FLD (Y) fmulp %st, %st(1) addq INCY, Y faddp %st,%st(1) FLD (X) addq INCX, X FLD (Y) fmulp %st, %st(1) addq INCY, Y faddp %st,%st(2) FLD (X) addq INCX, X FLD (Y) fmulp %st, %st(1) addq INCY, Y faddp %st,%st(3) FLD (X) addq INCX, X FLD (Y) fmulp %st, %st(1) addq INCY, Y faddp %st,%st(4) decq %rax jg .L31 ALIGN_3 .L30: movq N, %rax andq $3, %rax jle .L27 ALIGN_3 .L37: FLD (X) addq INCX, X FLD (Y) fmulp %st, %st(1) addq INCY, Y faddp %st, %st(1) decq %rax jg .L37 ALIGN_3 .L27: faddp %st,%st(2) faddp %st,%st(2) faddp %st,%st(1) ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/dot_atom.S000066400000000000000000000152251313527062700173210ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define Y ARG4 /* rcx */ #ifndef WINDOWS_ABI #define INCY ARG5 /* r8 */ #else #define INCY %r10 #endif #include "l1param.h" PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), INCY #endif SAVEREGISTERS leaq (, INCX, SIZE), INCX pxor %xmm0, %xmm0 leaq (, INCY, SIZE), INCY pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 cmpq $0, N pxor %xmm3, %xmm3 jle .L999 cmpq $SIZE, INCX jne .L50 cmpq $SIZE, INCY jne .L50 movq N, %rax sarq $3, %rax jle .L14 movsd 0 * SIZE(X), %xmm4 movsd 0 * SIZE(Y), %xmm8 movsd 1 * SIZE(X), %xmm5 movsd 1 * SIZE(Y), %xmm9 movsd 2 * SIZE(X), %xmm6 mulsd %xmm8, %xmm4 movsd 2 * SIZE(Y), %xmm10 mulsd %xmm9, %xmm5 movsd 3 * SIZE(X), %xmm7 mulsd %xmm10, %xmm6 movsd 3 * SIZE(Y), %xmm11 mulsd %xmm11, %xmm7 decq %rax jle .L12 ALIGN_3 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif addsd %xmm4, %xmm0 movsd 4 * SIZE(X), %xmm4 addsd %xmm5, %xmm1 movsd 4 * SIZE(Y), %xmm8 addsd %xmm6, %xmm2 movsd 5 * SIZE(X), %xmm5 addsd %xmm7, %xmm3 movsd 5 * SIZE(Y), %xmm9 movsd 6 * SIZE(X), %xmm6 mulsd %xmm8, %xmm4 movsd 6 * SIZE(Y), %xmm10 mulsd %xmm9, %xmm5 movsd 7 * SIZE(X), %xmm7 mulsd %xmm10, %xmm6 movsd 7 * SIZE(Y), %xmm11 mulsd %xmm11, %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif addsd %xmm4, %xmm0 movsd 8 * SIZE(X), %xmm4 addsd %xmm5, %xmm1 movsd 8 * SIZE(Y), %xmm8 addsd %xmm6, %xmm2 movsd 9 * SIZE(X), %xmm5 addsd %xmm7, %xmm3 movsd 9 * SIZE(Y), %xmm9 movsd 10 * SIZE(X), %xmm6 mulsd %xmm8, %xmm4 movsd 10 * SIZE(Y), %xmm10 mulsd %xmm9, %xmm5 movsd 11 * SIZE(X), %xmm7 mulsd %xmm10, %xmm6 movsd 11 * SIZE(Y), %xmm11 mulsd %xmm11, %xmm7 addq $8 * SIZE, X addq $8 * SIZE, Y decq %rax jg .L11 ALIGN_3 .L12: addsd %xmm4, %xmm0 movsd 4 * SIZE(X), %xmm4 addsd %xmm5, %xmm1 movsd 4 * SIZE(Y), %xmm8 addsd %xmm6, %xmm2 movsd 5 * SIZE(X), %xmm5 addsd %xmm7, %xmm3 movsd 5 * SIZE(Y), %xmm9 movsd 6 * SIZE(X), %xmm6 mulsd %xmm8, 
%xmm4 movsd 6 * SIZE(Y), %xmm10 mulsd %xmm9, %xmm5 movsd 7 * SIZE(X), %xmm7 mulsd %xmm10, %xmm6 movsd 7 * SIZE(Y), %xmm11 mulsd %xmm11, %xmm7 addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 addsd %xmm6, %xmm2 addsd %xmm7, %xmm3 addq $ 8 * SIZE, X addq $ 8 * SIZE, Y ALIGN_3 .L14: testq $7, N jle .L999 testq $4, N jle .L16 movsd 0 * SIZE(X), %xmm4 movsd 0 * SIZE(Y), %xmm8 movsd 1 * SIZE(X), %xmm5 movsd 1 * SIZE(Y), %xmm9 movsd 2 * SIZE(X), %xmm6 mulsd %xmm8, %xmm4 movsd 2 * SIZE(Y), %xmm10 mulsd %xmm9, %xmm5 movsd 3 * SIZE(X), %xmm7 mulsd %xmm10, %xmm6 movsd 3 * SIZE(Y), %xmm11 mulsd %xmm11, %xmm7 addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 addsd %xmm6, %xmm2 addsd %xmm7, %xmm3 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L16: testq $2, N jle .L17 movsd 0 * SIZE(X), %xmm4 movsd 0 * SIZE(Y), %xmm8 movsd 1 * SIZE(X), %xmm5 movsd 1 * SIZE(Y), %xmm9 mulsd %xmm8, %xmm4 mulsd %xmm9, %xmm5 addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L17: testq $1, N jle .L999 movsd 0 * SIZE(X), %xmm4 movsd 0 * SIZE(Y), %xmm8 mulsd %xmm8, %xmm4 addsd %xmm4, %xmm0 jmp .L999 ALIGN_3 .L50: movq N, %rax sarq $2, %rax jle .L55 ALIGN_3 .L53: movsd 0 * SIZE(X), %xmm4 addq INCX, X mulsd 0 * SIZE(Y), %xmm4 addq INCY, Y movsd 0 * SIZE(X), %xmm5 addq INCX, X mulsd 0 * SIZE(Y), %xmm5 addq INCY, Y movsd 0 * SIZE(X), %xmm6 addq INCX, X mulsd 0 * SIZE(Y), %xmm6 addq INCY, Y movsd 0 * SIZE(X), %xmm7 addq INCX, X mulsd 0 * SIZE(Y), %xmm7 addq INCY, Y addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 addsd %xmm6, %xmm2 addsd %xmm7, %xmm3 decq %rax jg .L53 ALIGN_3 .L55: movq N, %rax andq $3, %rax jle .L999 ALIGN_3 .L56: movsd 0 * SIZE(X), %xmm4 addq INCX, X mulsd 0 * SIZE(Y), %xmm4 addq INCY, Y addsd %xmm4, %xmm0 decq %rax jg .L56 ALIGN_3 .L999: addsd %xmm1, %xmm0 addsd %xmm3, %xmm2 addsd %xmm2, %xmm0 RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/dot_sse.S000066400000000000000000000563071313527062700171610ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define Y ARG4 /* rcx */ #ifndef WINDOWS_ABI #define INCY ARG5 /* r8 */ #else #define INCY %r10 #endif #include "l1param.h" PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), INCY #endif SAVEREGISTERS leaq (, INCX, SIZE), INCX leaq (, INCY, SIZE), INCY xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 cmpq $0, N jle .L999 cmpq $SIZE, INCX jne .L50 cmpq $SIZE, INCY jne .L50 subq $-32 * SIZE, X subq $-32 * SIZE, Y cmpq $3, N jle .L17 testq $SIZE, Y je .L05 movss -32 * SIZE(X), %xmm0 mulss -32 * SIZE(Y), %xmm0 addq $1 * SIZE, X addq $1 * SIZE, Y decq N ALIGN_2 .L05: testq $2 * SIZE, Y je .L10 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(Y), %xmm1 mulps %xmm4, %xmm1 addq $2 * SIZE, X addq $2 * SIZE, Y subq $2, N jle .L999 ALIGN_2 .L10: #ifdef ALIGNED_ACCESS testq $2 * SIZE, X jne .L30 testq $SIZE, X jne .L20 #else testq $3 * SIZE, X jne .L20 #endif movq N, %rax sarq $5, %rax jle .L14 movaps -32 * SIZE(X), %xmm4 movaps -28 * SIZE(X), %xmm5 movaps -24 * SIZE(X), %xmm6 movaps -20 * SIZE(X), %xmm7 movaps -16 * SIZE(X), %xmm8 movaps -12 * SIZE(X), %xmm9 movaps -8 * SIZE(X), %xmm10 movaps -4 * SIZE(X), %xmm11 decq %rax jle .L12 ALIGN_3 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps 4 * SIZE(X), %xmm5 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movaps 8 * SIZE(X), %xmm6 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movaps 12 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif mulps -16 * SIZE(Y), %xmm8 addps %xmm8, %xmm0 movaps 16 * SIZE(X), %xmm8 mulps -12 * SIZE(Y), %xmm9 addps %xmm9, %xmm1 movaps 20 * SIZE(X), %xmm9 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif mulps -8 * SIZE(Y), %xmm10 addps %xmm10, %xmm2 movaps 24 * SIZE(X), %xmm10 mulps -4 * SIZE(Y), %xmm11 addps %xmm11, %xmm3 movaps 28 * SIZE(X), %xmm11 subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L11 ALIGN_3 .L12: mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 mulps -16 * SIZE(Y), %xmm8 addps %xmm8, %xmm0 mulps -12 * SIZE(Y), %xmm9 addps %xmm9, %xmm1 mulps -8 * SIZE(Y), %xmm10 addps %xmm10, %xmm2 mulps -4 * SIZE(Y), %xmm11 addps %xmm11, %xmm3 subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L14: testq $31, N jle .L999 testq $16, N jle .L15 movaps -32 * SIZE(X), %xmm4 movaps -28 * SIZE(X), %xmm5 movaps -24 * SIZE(X), %xmm6 movaps -20 * SIZE(X), %xmm7 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L15: testq $8, N jle .L16 movaps -32 * SIZE(X), %xmm4 movaps 
-28 * SIZE(X), %xmm5 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L16: testq $4, N jle .L17 movaps -32 * SIZE(X), %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm2 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L17: testq $2, N jle .L18 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(Y), %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm3 addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L18: testq $1, N jle .L999 movss -32 * SIZE(X), %xmm4 mulss -32 * SIZE(Y), %xmm4 addss %xmm4, %xmm0 jmp .L999 ALIGN_3 .L20: #ifdef ALIGNED_ACCESS movaps -33 * SIZE(X), %xmm4 addq $3 * SIZE, X movq N, %rax sarq $5, %rax jle .L24 movaps -32 * SIZE(X), %xmm5 movaps -28 * SIZE(X), %xmm6 movaps -24 * SIZE(X), %xmm7 movaps -20 * SIZE(X), %xmm8 movaps -16 * SIZE(X), %xmm9 movaps -12 * SIZE(X), %xmm10 movaps -8 * SIZE(X), %xmm11 decq %rax jle .L22 ALIGN_3 .L21: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm5, %xmm4 pshufd $0x39, %xmm4, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -4 * SIZE(X), %xmm4 movss %xmm6, %xmm5 pshufd $0x39, %xmm5, %xmm5 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps 0 * SIZE(X), %xmm5 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm7, %xmm6 pshufd $0x39, %xmm6, %xmm6 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movaps 4 * SIZE(X), %xmm6 movss %xmm8, %xmm7 pshufd $0x39, %xmm7, %xmm7 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movaps 8 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm9, %xmm8 pshufd $0x39, %xmm8, %xmm8 mulps -16 * SIZE(Y), %xmm8 addps %xmm8, %xmm0 movaps 12 * SIZE(X), %xmm8 movss %xmm10, %xmm9 pshufd $0x39, %xmm9, %xmm9 mulps -12 * SIZE(Y), %xmm9 addps %xmm9, %xmm1 movaps 16 * SIZE(X), %xmm9 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm11, %xmm10 pshufd $0x39, %xmm10, %xmm10 mulps -8 * SIZE(Y), %xmm10 addps %xmm10, %xmm2 movaps 20 * SIZE(X), %xmm10 movss %xmm4, %xmm11 pshufd $0x39, %xmm11, %xmm11 mulps -4 * SIZE(Y), %xmm11 addps %xmm11, %xmm3 movaps 24 * SIZE(X), %xmm11 subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L22: movss %xmm5, %xmm4 pshufd $0x39, %xmm4, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -4 * SIZE(X), %xmm4 movss %xmm6, %xmm5 pshufd $0x39, %xmm5, %xmm5 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movss %xmm7, %xmm6 pshufd $0x39, %xmm6, %xmm6 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movss %xmm8, %xmm7 pshufd $0x39, %xmm7, %xmm7 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movss %xmm9, %xmm8 pshufd $0x39, %xmm8, %xmm8 mulps -16 * SIZE(Y), %xmm8 addps %xmm8, %xmm0 movss %xmm10, %xmm9 pshufd $0x39, %xmm9, %xmm9 mulps -12 * SIZE(Y), %xmm9 addps %xmm9, %xmm1 movss %xmm11, %xmm10 pshufd $0x39, %xmm10, %xmm10 mulps -8 * SIZE(Y), %xmm10 addps %xmm10, %xmm2 movss %xmm4, %xmm11 pshufd $0x39, %xmm11, %xmm11 mulps -4 * SIZE(Y), %xmm11 addps %xmm11, %xmm3 subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L24: testq $31, N jle .L999 testq $16, N jle .L25 movaps -32 * SIZE(X), %xmm5 movaps -28 * SIZE(X), %xmm6 movaps -24 * SIZE(X), %xmm7 movss %xmm5, %xmm4 pshufd $0x39, %xmm4, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(X), %xmm4 movss %xmm6, %xmm5 pshufd $0x39, %xmm5, %xmm5 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movss %xmm7, %xmm6 pshufd 
$0x39, %xmm6, %xmm6 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movss %xmm4, %xmm7 pshufd $0x39, %xmm7, %xmm7 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L25: testq $8, N jle .L26 movaps -32 * SIZE(X), %xmm5 movaps -28 * SIZE(X), %xmm6 movss %xmm5, %xmm4 pshufd $0x39, %xmm4, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movss %xmm6, %xmm5 pshufd $0x39, %xmm5, %xmm5 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps %xmm6, %xmm4 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L26: testq $4, N jle .L27 movaps -32 * SIZE(X), %xmm5 movss %xmm5, %xmm4 pshufd $0x39, %xmm4, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm2 movaps %xmm5, %xmm4 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L27: testq $2, N jle .L28 #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(Y), %xmm8 pshufd $0x29, %xmm4, %xmm5 mulps %xmm8, %xmm5 addps %xmm5, %xmm3 movhlps %xmm4, %xmm4 addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L28: testq $1, N jle .L999 pshufd $0x39, %xmm4, %xmm4 mulss -32 * SIZE(Y), %xmm4 addss %xmm4, %xmm0 jmp .L999 ALIGN_3 .L30: testq $SIZE, X jne .L40 movhps -32 * SIZE(X), %xmm4 addq $2 * SIZE, X movq N, %rax sarq $5, %rax jle .L34 movaps -32 * SIZE(X), %xmm5 movaps -28 * SIZE(X), %xmm6 movaps -24 * SIZE(X), %xmm7 movaps -20 * SIZE(X), %xmm8 movaps -16 * SIZE(X), %xmm9 movaps -12 * SIZE(X), %xmm10 movaps -8 * SIZE(X), %xmm11 decq %rax jle .L32 ALIGN_3 .L31: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif SHUFPD_1 %xmm5, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -4 * SIZE(X), %xmm4 SHUFPD_1 %xmm6, %xmm5 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps 0 * SIZE(X), %xmm5 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif SHUFPD_1 %xmm7, %xmm6 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movaps 4 * SIZE(X), %xmm6 SHUFPD_1 %xmm8, %xmm7 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movaps 8 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif SHUFPD_1 %xmm9, %xmm8 mulps -16 * SIZE(Y), %xmm8 addps %xmm8, %xmm0 movaps 12 * SIZE(X), %xmm8 SHUFPD_1 %xmm10, %xmm9 mulps -12 * SIZE(Y), %xmm9 addps %xmm9, %xmm1 movaps 16 * SIZE(X), %xmm9 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif SHUFPD_1 %xmm11, %xmm10 mulps -8 * SIZE(Y), %xmm10 addps %xmm10, %xmm2 movaps 20 * SIZE(X), %xmm10 SHUFPD_1 %xmm4, %xmm11 mulps -4 * SIZE(Y), %xmm11 addps %xmm11, %xmm3 movaps 24 * SIZE(X), %xmm11 subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L31 ALIGN_3 .L32: SHUFPD_1 %xmm5, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -4 * SIZE(X), %xmm4 SHUFPD_1 %xmm6, %xmm5 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 SHUFPD_1 %xmm7, %xmm6 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 SHUFPD_1 %xmm8, %xmm7 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 SHUFPD_1 %xmm9, %xmm8 mulps -16 * SIZE(Y), %xmm8 addps %xmm8, %xmm0 SHUFPD_1 %xmm10, %xmm9 mulps -12 * SIZE(Y), %xmm9 addps %xmm9, %xmm1 SHUFPD_1 %xmm11, %xmm10 mulps -8 * SIZE(Y), %xmm10 addps %xmm10, %xmm2 SHUFPD_1 %xmm4, %xmm11 mulps -4 * SIZE(Y), %xmm11 addps %xmm11, %xmm3 subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L34: testq $31, N jle .L999 testq $16, N jle .L35 movaps -32 * SIZE(X), %xmm5 movaps -28 * SIZE(X), %xmm6 movaps -24 * SIZE(X), %xmm7 SHUFPD_1 %xmm5, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(X), %xmm4 SHUFPD_1 %xmm6, %xmm5 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 SHUFPD_1 %xmm7, %xmm6 mulps -24 * 
SIZE(Y), %xmm6 addps %xmm6, %xmm2 SHUFPD_1 %xmm4, %xmm7 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L35: testq $8, N jle .L36 movaps -32 * SIZE(X), %xmm5 movaps -28 * SIZE(X), %xmm6 SHUFPD_1 %xmm5, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 SHUFPD_1 %xmm6, %xmm5 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movapd %xmm6, %xmm4 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L36: testq $4, N jle .L37 movaps -32 * SIZE(X), %xmm5 SHUFPD_1 %xmm5, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps %xmm5, %xmm4 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L37: testq $2, N jle .L38 xorps %xmm5, %xmm5 movhlps %xmm4, %xmm5 movlps -32 * SIZE(Y), %xmm4 mulps %xmm4, %xmm5 addps %xmm5, %xmm0 addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L38: testq $1, N jle .L999 movss -34 * SIZE(X), %xmm4 mulss -32 * SIZE(Y), %xmm4 addss %xmm4, %xmm0 jmp .L999 ALIGN_3 .L40: movaps -35 * SIZE(X), %xmm4 addq $SIZE, X movq N, %rax sarq $5, %rax jle .L44 movaps -32 * SIZE(X), %xmm5 movaps -28 * SIZE(X), %xmm6 movaps -24 * SIZE(X), %xmm7 movaps -20 * SIZE(X), %xmm8 movaps -16 * SIZE(X), %xmm9 movaps -12 * SIZE(X), %xmm10 movaps -8 * SIZE(X), %xmm11 decq %rax jle .L42 ALIGN_3 .L41: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -4 * SIZE(X), %xmm4 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps 0 * SIZE(X), %xmm5 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movaps 4 * SIZE(X), %xmm6 movss %xmm8, %xmm7 shufps $0x93, %xmm8, %xmm7 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movaps 8 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm9, %xmm8 shufps $0x93, %xmm9, %xmm8 mulps -16 * SIZE(Y), %xmm8 addps %xmm8, %xmm0 movaps 12 * SIZE(X), %xmm8 movss %xmm10, %xmm9 shufps $0x93, %xmm10, %xmm9 mulps -12 * SIZE(Y), %xmm9 addps %xmm9, %xmm1 movaps 16 * SIZE(X), %xmm9 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm11, %xmm10 shufps $0x93, %xmm11, %xmm10 mulps -8 * SIZE(Y), %xmm10 addps %xmm10, %xmm2 movaps 20 * SIZE(X), %xmm10 movss %xmm4, %xmm11 shufps $0x93, %xmm4, %xmm11 mulps -4 * SIZE(Y), %xmm11 addps %xmm11, %xmm3 movaps 24 * SIZE(X), %xmm11 subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L41 ALIGN_3 .L42: movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -4 * SIZE(X), %xmm4 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movss %xmm8, %xmm7 shufps $0x93, %xmm8, %xmm7 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movss %xmm9, %xmm8 shufps $0x93, %xmm9, %xmm8 mulps -16 * SIZE(Y), %xmm8 addps %xmm8, %xmm0 movss %xmm10, %xmm9 shufps $0x93, %xmm10, %xmm9 mulps -12 * SIZE(Y), %xmm9 addps %xmm9, %xmm1 movss %xmm11, %xmm10 shufps $0x93, %xmm11, %xmm10 mulps -8 * SIZE(Y), %xmm10 addps %xmm10, %xmm2 movss %xmm4, %xmm11 shufps $0x93, %xmm4, %xmm11 mulps -4 * SIZE(Y), %xmm11 addps %xmm11, %xmm3 subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L44: testq $31, N jle .L999 testq $16, N jle .L45 movaps -32 * SIZE(X), %xmm5 movaps -28 * SIZE(X), %xmm6 movaps -24 * SIZE(X), %xmm7 movss %xmm5, 
%xmm4 shufps $0x93, %xmm5, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(X), %xmm4 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movss %xmm4, %xmm7 shufps $0x93, %xmm4, %xmm7 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L45: testq $8, N jle .L46 movaps -32 * SIZE(X), %xmm5 movaps -28 * SIZE(X), %xmm6 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movaps %xmm6, %xmm4 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L46: testq $4, N jle .L47 movaps -32 * SIZE(X), %xmm5 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm2 movaps %xmm5, %xmm4 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L47: testq $2, N jle .L48 movaps -32 * SIZE(X), %xmm5 #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(Y), %xmm8 movss %xmm5, %xmm4 shufps $0x93, %xmm4, %xmm4 mulps %xmm8, %xmm4 addps %xmm4, %xmm3 movlhps %xmm5, %xmm4 addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L48: testq $1, N jle .L999 pshufd $0x93, %xmm4, %xmm4 mulss -32 * SIZE(Y), %xmm4 addss %xmm4, %xmm0 jmp .L999 ALIGN_4 #else movq N, %rax sarq $5, %rax jle .L24 movlps -32 * SIZE(X), %xmm4 movhps -30 * SIZE(X), %xmm4 movlps -28 * SIZE(X), %xmm5 movhps -26 * SIZE(X), %xmm5 movlps -24 * SIZE(X), %xmm6 movhps -22 * SIZE(X), %xmm6 movlps -20 * SIZE(X), %xmm7 movhps -18 * SIZE(X), %xmm7 movlps -16 * SIZE(X), %xmm8 movhps -14 * SIZE(X), %xmm8 movlps -12 * SIZE(X), %xmm9 movhps -10 * SIZE(X), %xmm9 movlps -8 * SIZE(X), %xmm10 movhps -6 * SIZE(X), %xmm10 movlps -4 * SIZE(X), %xmm11 movhps -2 * SIZE(X), %xmm11 decq %rax jle .L22 ALIGN_3 .L21: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 movlps 0 * SIZE(X), %xmm4 movhps 2 * SIZE(X), %xmm4 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 movlps 4 * SIZE(X), %xmm5 movhps 6 * SIZE(X), %xmm5 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 movlps 8 * SIZE(X), %xmm6 movhps 10 * SIZE(X), %xmm6 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 movlps 12 * SIZE(X), %xmm7 movhps 14 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif mulps -16 * SIZE(Y), %xmm8 addps %xmm8, %xmm0 movlps 16 * SIZE(X), %xmm8 movhps 18 * SIZE(X), %xmm8 mulps -12 * SIZE(Y), %xmm9 addps %xmm9, %xmm1 movlps 20 * SIZE(X), %xmm9 movhps 22 * SIZE(X), %xmm9 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif mulps -8 * SIZE(Y), %xmm10 addps %xmm10, %xmm2 movlps 24 * SIZE(X), %xmm10 movhps 26 * SIZE(X), %xmm10 mulps -4 * SIZE(Y), %xmm11 addps %xmm11, %xmm3 movlps 28 * SIZE(X), %xmm11 movhps 30 * SIZE(X), %xmm11 subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L22: mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 mulps -16 * SIZE(Y), %xmm8 addps %xmm8, %xmm0 mulps -12 * SIZE(Y), %xmm9 addps %xmm9, %xmm1 mulps -8 * SIZE(Y), %xmm10 addps %xmm10, %xmm2 mulps -4 * SIZE(Y), %xmm11 addps %xmm11, %xmm3 subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L24: testq $31, N jle .L999 testq $16, N jle .L25 movlps 
-32 * SIZE(X), %xmm4 movhps -30 * SIZE(X), %xmm4 movlps -28 * SIZE(X), %xmm5 movhps -26 * SIZE(X), %xmm5 movlps -24 * SIZE(X), %xmm6 movhps -22 * SIZE(X), %xmm6 movlps -20 * SIZE(X), %xmm7 movhps -18 * SIZE(X), %xmm7 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 mulps -24 * SIZE(Y), %xmm6 addps %xmm6, %xmm2 mulps -20 * SIZE(Y), %xmm7 addps %xmm7, %xmm3 addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L25: testq $8, N jle .L26 movlps -32 * SIZE(X), %xmm4 movhps -30 * SIZE(X), %xmm4 movlps -28 * SIZE(X), %xmm5 movhps -26 * SIZE(X), %xmm5 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm0 mulps -28 * SIZE(Y), %xmm5 addps %xmm5, %xmm1 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L26: testq $4, N jle .L27 movlps -32 * SIZE(X), %xmm4 movhps -30 * SIZE(X), %xmm4 mulps -32 * SIZE(Y), %xmm4 addps %xmm4, %xmm2 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L27: testq $2, N jle .L28 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(Y), %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm3 addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L28: testq $1, N jle .L999 movss -32 * SIZE(X), %xmm4 mulss -32 * SIZE(Y), %xmm4 addss %xmm4, %xmm0 jmp .L999 ALIGN_3 #endif .L50: movq N, %rax sarq $2, %rax jle .L55 ALIGN_3 .L53: movss 0 * SIZE(X), %xmm4 addq INCX, X mulss 0 * SIZE(Y), %xmm4 addq INCY, Y movss 0 * SIZE(X), %xmm5 addq INCX, X mulss 0 * SIZE(Y), %xmm5 addq INCY, Y movss 0 * SIZE(X), %xmm6 addq INCX, X mulss 0 * SIZE(Y), %xmm6 addq INCY, Y movss 0 * SIZE(X), %xmm7 addq INCX, X mulss 0 * SIZE(Y), %xmm7 addq INCY, Y addss %xmm4, %xmm0 addss %xmm5, %xmm1 addss %xmm6, %xmm2 addss %xmm7, %xmm3 decq %rax jg .L53 ALIGN_3 .L55: movq N, %rax andq $3, %rax jle .L999 ALIGN_3 .L56: movss 0 * SIZE(X), %xmm4 addq INCX, X mulss 0 * SIZE(Y), %xmm4 addq INCY, Y addss %xmm4, %xmm0 decq %rax jg .L56 ALIGN_3 .L999: addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm2, %xmm0 #ifndef HAVE_SSE3 movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 addss %xmm1, %xmm0 #else haddps %xmm0, %xmm0 haddps %xmm0, %xmm0 #endif #ifdef DSDOT cvtss2sd %xmm0, %xmm0 #endif RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/dot_sse2.S000066400000000000000000000332431313527062700172350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define Y ARG4 /* rcx */ #ifndef WINDOWS_ABI #define INCY ARG5 /* r8 */ #else #define INCY %r10 #endif #include "l1param.h" PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), INCY #endif SAVEREGISTERS leaq (, INCX, SIZE), INCX leaq (, INCY, SIZE), INCY xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 cmpq $0, N jle .L999 cmpq $SIZE, INCX jne .L50 cmpq $SIZE, INCY jne .L50 subq $-16 * SIZE, X subq $-16 * SIZE, Y testq $SIZE, Y je .L10 movsd -16 * SIZE(X), %xmm0 mulsd -16 * SIZE(Y), %xmm0 addq $1 * SIZE, X addq $1 * SIZE, Y decq N ALIGN_2 .L10: testq $SIZE, X jne .L20 movq N, %rax sarq $4, %rax jle .L14 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -12 * SIZE(X), %xmm6 movaps -10 * SIZE(X), %xmm7 movaps -8 * SIZE(X), %xmm8 movaps -6 * SIZE(X), %xmm9 movaps -4 * SIZE(X), %xmm10 movaps -2 * SIZE(X), %xmm11 decq %rax jle .L12 ALIGN_3 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 movaps 2 * SIZE(X), %xmm5 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif mulpd -12 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 movaps 4 * SIZE(X), %xmm6 mulpd -10 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 movaps 6 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif mulpd -8 * SIZE(Y), %xmm8 addpd %xmm8, %xmm0 movaps 8 * SIZE(X), %xmm8 mulpd -6 * SIZE(Y), %xmm9 addpd %xmm9, %xmm1 movaps 10 * SIZE(X), %xmm9 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif mulpd -4 * SIZE(Y), %xmm10 addpd %xmm10, %xmm2 movaps 12 * SIZE(X), %xmm10 mulpd -2 * SIZE(Y), %xmm11 addpd %xmm11, %xmm3 movaps 14 * SIZE(X), %xmm11 subq $-16 * SIZE, X subq $-16 * SIZE, Y decq %rax jg .L11 ALIGN_3 .L12: mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 mulpd -12 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 mulpd -10 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 mulpd -8 * SIZE(Y), %xmm8 addpd %xmm8, %xmm0 mulpd -6 * SIZE(Y), %xmm9 addpd %xmm9, %xmm1 mulpd -4 * SIZE(Y), %xmm10 addpd %xmm10, %xmm2 mulpd -2 * SIZE(Y), %xmm11 addpd %xmm11, %xmm3 subq $-16 * SIZE, X subq $-16 * SIZE, Y ALIGN_3 .L14: testq $15, N jle .L999 testq $8, N jle .L15 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -12 * SIZE(X), %xmm6 movaps -10 * SIZE(X), %xmm7 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 mulpd -14 * SIZE(Y), %xmm5 addpd 
%xmm5, %xmm1 mulpd -12 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 mulpd -10 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L15: testq $4, N jle .L16 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L16: testq $2, N jle .L17 movaps -16 * SIZE(X), %xmm4 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L17: testq $1, N jle .L999 movsd -16 * SIZE(X), %xmm4 mulsd -16 * SIZE(Y), %xmm4 addsd %xmm4, %xmm0 jmp .L999 ALIGN_3 .L20: #ifdef ALIGNED_ACCESS movhps -16 * SIZE(X), %xmm4 addq $SIZE, X movq N, %rax sarq $4, %rax jle .L24 movaps -16 * SIZE(X), %xmm5 movaps -14 * SIZE(X), %xmm6 movaps -12 * SIZE(X), %xmm7 movaps -10 * SIZE(X), %xmm8 movaps -8 * SIZE(X), %xmm9 movaps -6 * SIZE(X), %xmm10 movaps -4 * SIZE(X), %xmm11 decq %rax jle .L22 ALIGN_3 .L21: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif SHUFPD_1 %xmm5, %xmm4 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 movaps -2 * SIZE(X), %xmm4 SHUFPD_1 %xmm6, %xmm5 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 movaps 0 * SIZE(X), %xmm5 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif SHUFPD_1 %xmm7, %xmm6 mulpd -12 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 movaps 2 * SIZE(X), %xmm6 SHUFPD_1 %xmm8, %xmm7 mulpd -10 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 movaps 4 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif SHUFPD_1 %xmm9, %xmm8 mulpd -8 * SIZE(Y), %xmm8 addpd %xmm8, %xmm0 movaps 6 * SIZE(X), %xmm8 SHUFPD_1 %xmm10, %xmm9 mulpd -6 * SIZE(Y), %xmm9 addpd %xmm9, %xmm1 movaps 8 * SIZE(X), %xmm9 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif SHUFPD_1 %xmm11, %xmm10 mulpd -4 * SIZE(Y), %xmm10 addpd %xmm10, %xmm2 movaps 10 * SIZE(X), %xmm10 SHUFPD_1 %xmm4, %xmm11 mulpd -2 * SIZE(Y), %xmm11 addpd %xmm11, %xmm3 movaps 12 * SIZE(X), %xmm11 subq $-16 * SIZE, X subq $-16 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L22: SHUFPD_1 %xmm5, %xmm4 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 movaps -2 * SIZE(X), %xmm4 SHUFPD_1 %xmm6, %xmm5 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 SHUFPD_1 %xmm7, %xmm6 mulpd -12 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 SHUFPD_1 %xmm8, %xmm7 mulpd -10 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 SHUFPD_1 %xmm9, %xmm8 mulpd -8 * SIZE(Y), %xmm8 addpd %xmm8, %xmm0 SHUFPD_1 %xmm10, %xmm9 mulpd -6 * SIZE(Y), %xmm9 addpd %xmm9, %xmm1 SHUFPD_1 %xmm11, %xmm10 mulpd -4 * SIZE(Y), %xmm10 addpd %xmm10, %xmm2 SHUFPD_1 %xmm4, %xmm11 mulpd -2 * SIZE(Y), %xmm11 addpd %xmm11, %xmm3 subq $-16 * SIZE, X subq $-16 * SIZE, Y ALIGN_3 .L24: testq $15, N jle .L999 testq $8, N jle .L25 movaps -16 * SIZE(X), %xmm5 movaps -14 * SIZE(X), %xmm6 movaps -12 * SIZE(X), %xmm7 SHUFPD_1 %xmm5, %xmm4 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 movaps -10 * SIZE(X), %xmm4 SHUFPD_1 %xmm6, %xmm5 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 SHUFPD_1 %xmm7, %xmm6 mulpd -12 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 SHUFPD_1 %xmm4, %xmm7 mulpd -10 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L25: testq $4, N jle .L26 movaps -16 * SIZE(X), %xmm5 movaps -14 * SIZE(X), %xmm6 SHUFPD_1 %xmm5, %xmm4 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 SHUFPD_1 %xmm6, %xmm5 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 movapd %xmm6, %xmm4 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L26: testq $2, N jle .L27 movaps -16 * SIZE(X), 
%xmm5 SHUFPD_1 %xmm5, %xmm4 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 movapd %xmm5, %xmm4 addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L27: testq $1, N jle .L999 SHUFPD_1 %xmm4, %xmm4 mulsd -16 * SIZE(Y), %xmm4 addsd %xmm4, %xmm0 jmp .L999 ALIGN_3 #else movq N, %rax sarq $4, %rax jle .L24 movlps -16 * SIZE(X), %xmm4 movhps -15 * SIZE(X), %xmm4 movlps -14 * SIZE(X), %xmm5 movhps -13 * SIZE(X), %xmm5 movlps -12 * SIZE(X), %xmm6 movhps -11 * SIZE(X), %xmm6 movlps -10 * SIZE(X), %xmm7 movhps -9 * SIZE(X), %xmm7 movlps -8 * SIZE(X), %xmm8 movhps -7 * SIZE(X), %xmm8 movlps -6 * SIZE(X), %xmm9 movhps -5 * SIZE(X), %xmm9 movlps -4 * SIZE(X), %xmm10 movhps -3 * SIZE(X), %xmm10 movlps -2 * SIZE(X), %xmm11 movhps -1 * SIZE(X), %xmm11 decq %rax jle .L22 ALIGN_3 .L21: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 movlps 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 movlps 2 * SIZE(X), %xmm5 movhps 3 * SIZE(X), %xmm5 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif mulpd -12 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 movlps 4 * SIZE(X), %xmm6 movhps 5 * SIZE(X), %xmm6 mulpd -10 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 movlps 6 * SIZE(X), %xmm7 movhps 7 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif mulpd -8 * SIZE(Y), %xmm8 addpd %xmm8, %xmm0 movlps 8 * SIZE(X), %xmm8 movhps 9 * SIZE(X), %xmm8 mulpd -6 * SIZE(Y), %xmm9 addpd %xmm9, %xmm1 movlps 10 * SIZE(X), %xmm9 movhps 11 * SIZE(X), %xmm9 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif mulpd -4 * SIZE(Y), %xmm10 addpd %xmm10, %xmm2 movlps 12 * SIZE(X), %xmm10 movhps 13 * SIZE(X), %xmm10 mulpd -2 * SIZE(Y), %xmm11 addpd %xmm11, %xmm3 movlps 14 * SIZE(X), %xmm11 movhps 15 * SIZE(X), %xmm11 subq $-16 * SIZE, X subq $-16 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L22: mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 mulpd -12 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 mulpd -10 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 mulpd -8 * SIZE(Y), %xmm8 addpd %xmm8, %xmm0 mulpd -6 * SIZE(Y), %xmm9 addpd %xmm9, %xmm1 mulpd -4 * SIZE(Y), %xmm10 addpd %xmm10, %xmm2 mulpd -2 * SIZE(Y), %xmm11 addpd %xmm11, %xmm3 subq $-16 * SIZE, X subq $-16 * SIZE, Y ALIGN_3 .L24: testq $15, N jle .L999 testq $8, N jle .L25 movlps -16 * SIZE(X), %xmm4 movhps -15 * SIZE(X), %xmm4 movlps -14 * SIZE(X), %xmm5 movhps -13 * SIZE(X), %xmm5 movlps -12 * SIZE(X), %xmm6 movhps -11 * SIZE(X), %xmm6 movlps -10 * SIZE(X), %xmm7 movhps -9 * SIZE(X), %xmm7 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 mulpd -12 * SIZE(Y), %xmm6 addpd %xmm6, %xmm2 mulpd -10 * SIZE(Y), %xmm7 addpd %xmm7, %xmm3 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L25: testq $4, N jle .L26 movlps -16 * SIZE(X), %xmm4 movhps -15 * SIZE(X), %xmm4 movlps -14 * SIZE(X), %xmm5 movhps -13 * SIZE(X), %xmm5 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 mulpd -14 * SIZE(Y), %xmm5 addpd %xmm5, %xmm1 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L26: testq $2, N jle .L27 movlps -16 * SIZE(X), %xmm4 movhps -15 * SIZE(X), %xmm4 mulpd -16 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L27: testq $1, N jle .L999 movsd -16 * SIZE(X), %xmm4 mulsd -16 * SIZE(Y), %xmm4 addsd %xmm4, %xmm0 jmp .L999 ALIGN_3 #endif .L50: movq N, %rax sarq $2, %rax jle .L55 ALIGN_3 .L53: movsd 0 * SIZE(X), %xmm4 addq INCX, X mulsd 0 * 
SIZE(Y), %xmm4 addq INCY, Y movsd 0 * SIZE(X), %xmm5 addq INCX, X mulsd 0 * SIZE(Y), %xmm5 addq INCY, Y movsd 0 * SIZE(X), %xmm6 addq INCX, X mulsd 0 * SIZE(Y), %xmm6 addq INCY, Y movsd 0 * SIZE(X), %xmm7 addq INCX, X mulsd 0 * SIZE(Y), %xmm7 addq INCY, Y addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 addsd %xmm6, %xmm2 addsd %xmm7, %xmm3 decq %rax jg .L53 ALIGN_3 .L55: movq N, %rax andq $3, %rax jle .L999 ALIGN_3 .L56: movsd 0 * SIZE(X), %xmm4 addq INCX, X mulsd 0 * SIZE(Y), %xmm4 addq INCY, Y addsd %xmm4, %xmm0 decq %rax jg .L56 ALIGN_3 .L999: addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm2, %xmm0 #ifndef HAVE_SSE3 pshufd $0xe, %xmm0, %xmm1 addsd %xmm1, %xmm0 #else haddpd %xmm0, %xmm0 #endif RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/dscal.c000066400000000000000000000124371313527062700166230ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013 - 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dscal_microk_bulldozer-2.c" #elif defined(SANDYBRIDGE) #include "dscal_microk_sandy-2.c" #elif defined(HASWELL) || defined(ZEN) #include "dscal_microk_haswell-2.c" #endif #if !defined(HAVE_KERNEL_8) static void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x ) { BLASLONG i; FLOAT alpha = *da; for( i=0; i 0 ) { dscal_kernel_inc_8(n1, &da, x, inc_x); i = n1 * inc_x; j = n1; } while(j < n) { x[i] *= da; i += inc_x ; j++; } } return(0); } BLASLONG n1 = n & -8; if ( n1 > 0 ) { if ( da == 0.0 ) dscal_kernel_8_zero(n1 , &da , x); else dscal_kernel_8(n1 , &da , x); } if ( da == 0.0 ) { for ( i=n1 ; i> 4 ; BLASLONG n2 = n & 8 ; __asm__ __volatile__ ( "vmovddup (%2), %%xmm0 \n\t" // alpha "addq $128, %1 \n\t" "cmpq $0, %0 \n\t" "je 4f \n\t" "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" "vmulpd -64(%1), %%xmm0, %%xmm8 \n\t" "vmulpd -48(%1), %%xmm0, %%xmm9 \n\t" "vmulpd -32(%1), %%xmm0, %%xmm10 \n\t" "vmulpd -16(%1), %%xmm0, %%xmm11 \n\t" "subq $1 , %0 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" "prefetcht0 256(%1) \n\t" "vmovups %%xmm4 ,-128(%1) \n\t" "vmovups %%xmm5 ,-112(%1) \n\t" "vmulpd 0(%1), %%xmm0, %%xmm4 \n\t" "vmovups %%xmm6 , -96(%1) \n\t" "vmulpd 16(%1), %%xmm0, %%xmm5 \n\t" "vmovups %%xmm7 , -80(%1) \n\t" "vmulpd 32(%1), %%xmm0, %%xmm6 \n\t" "prefetcht0 320(%1) \n\t" "vmovups %%xmm8 , -64(%1) \n\t" "vmulpd 48(%1), %%xmm0, %%xmm7 \n\t" "vmovups %%xmm9 , -48(%1) \n\t" "vmulpd 64(%1), %%xmm0, %%xmm8 \n\t" "vmovups %%xmm10 , -32(%1) \n\t" "vmulpd 80(%1), %%xmm0, %%xmm9 \n\t" "vmovups %%xmm11 , -16(%1) \n\t" "vmulpd 96(%1), %%xmm0, %%xmm10 \n\t" "vmulpd 112(%1), %%xmm0, %%xmm11 \n\t" "addq $128, %1 \n\t" "subq $1 , %0 \n\t" "jnz 1b \n\t" "2: \n\t" "vmovups %%xmm4 ,-128(%1) \n\t" "vmovups %%xmm5 ,-112(%1) \n\t" "vmovups %%xmm6 , -96(%1) \n\t" "vmovups %%xmm7 , -80(%1) \n\t" "vmovups %%xmm8 , -64(%1) \n\t" "vmovups %%xmm9 , -48(%1) \n\t" "vmovups %%xmm10 , -32(%1) \n\t" "vmovups %%xmm11 , -16(%1) \n\t" "addq $128, %1 \n\t" "4: \n\t" "cmpq $8 ,%3 \n\t" "jne 5f \n\t" "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" "vmovups %%xmm4 ,-128(%1) \n\t" "vmovups %%xmm5 ,-112(%1) \n\t" "vmovups %%xmm6 , -96(%1) \n\t" "vmovups %%xmm7 , -80(%1) \n\t" "5: \n\t" "vzeroupper \n\t" : : "r" (n1), // 0 "r" (x), // 1 "r" (alpha), // 2 "r" (n2) // 3 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) { BLASLONG n1 = n >> 4 ; BLASLONG n2 = n & 8 ; __asm__ __volatile__ ( "vxorpd %%xmm0, %%xmm0 , %%xmm0 \n\t" "addq $128, %1 \n\t" "cmpq $0, %0 \n\t" "je 2f \n\t" ".align 16 \n\t" "1: \n\t" "vmovups %%xmm0 ,-128(%1) \n\t" "vmovups %%xmm0 ,-112(%1) \n\t" "vmovups %%xmm0 , -96(%1) \n\t" "vmovups %%xmm0 , -80(%1) \n\t" "vmovups %%xmm0 , -64(%1) \n\t" "vmovups %%xmm0 , -48(%1) \n\t" "vmovups %%xmm0 , -32(%1) \n\t" "vmovups %%xmm0 , -16(%1) \n\t" "addq $128, %1 \n\t" "subq $1 , %0 \n\t" "jnz 1b \n\t" "2: \n\t" "cmpq $8 ,%3 \n\t" "jne 4f 
\n\t" "vmovups %%xmm0 ,-128(%1) \n\t" "vmovups %%xmm0 ,-112(%1) \n\t" "vmovups %%xmm0 , -96(%1) \n\t" "vmovups %%xmm0 , -80(%1) \n\t" "4: \n\t" "vzeroupper \n\t" : : "r" (n1), // 0 "r" (x), // 1 "r" (alpha), // 2 "r" (n2) // 3 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/dscal_microk_haswell-2.c000066400000000000000000000137211313527062700220420ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014-2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_8 1 static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) { BLASLONG n1 = n >> 4 ; BLASLONG n2 = n & 8 ; __asm__ __volatile__ ( "vmovddup (%2), %%xmm0 \n\t" // alpha "addq $128, %1 \n\t" "cmpq $0, %0 \n\t" "je 4f \n\t" "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" "vmulpd -64(%1), %%xmm0, %%xmm8 \n\t" "vmulpd -48(%1), %%xmm0, %%xmm9 \n\t" "vmulpd -32(%1), %%xmm0, %%xmm10 \n\t" "vmulpd -16(%1), %%xmm0, %%xmm11 \n\t" "subq $1 , %0 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" // "prefetcht0 640(%1) \n\t" "vmovups %%xmm4 ,-128(%1) \n\t" "vmovups %%xmm5 ,-112(%1) \n\t" "vmulpd 0(%1), %%xmm0, %%xmm4 \n\t" "vmovups %%xmm6 , -96(%1) \n\t" "vmulpd 16(%1), %%xmm0, %%xmm5 \n\t" "vmovups %%xmm7 , -80(%1) \n\t" "vmulpd 32(%1), %%xmm0, %%xmm6 \n\t" // "prefetcht0 704(%1) \n\t" "vmovups %%xmm8 , -64(%1) \n\t" "vmulpd 48(%1), %%xmm0, %%xmm7 \n\t" "vmovups %%xmm9 , -48(%1) \n\t" "vmulpd 64(%1), %%xmm0, %%xmm8 \n\t" "vmovups %%xmm10 , -32(%1) \n\t" "vmulpd 80(%1), %%xmm0, %%xmm9 \n\t" "vmovups %%xmm11 , -16(%1) \n\t" "vmulpd 96(%1), %%xmm0, %%xmm10 \n\t" "vmulpd 112(%1), %%xmm0, %%xmm11 \n\t" "addq $128, %1 \n\t" "subq $1 , %0 \n\t" "jnz 1b \n\t" "2: \n\t" "vmovups %%xmm4 ,-128(%1) \n\t" "vmovups %%xmm5 ,-112(%1) \n\t" "vmovups %%xmm6 , -96(%1) \n\t" "vmovups %%xmm7 , -80(%1) \n\t" "vmovups %%xmm8 , -64(%1) \n\t" "vmovups %%xmm9 , -48(%1) \n\t" "vmovups %%xmm10 , -32(%1) \n\t" "vmovups %%xmm11 , -16(%1) \n\t" "addq $128, %1 \n\t" "4: \n\t" "cmpq $8 ,%3 \n\t" "jne 5f \n\t" "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" "vmovups %%xmm4 ,-128(%1) \n\t" "vmovups %%xmm5 ,-112(%1) \n\t" "vmovups %%xmm6 , -96(%1) \n\t" "vmovups %%xmm7 , -80(%1) \n\t" "5: \n\t" "vzeroupper \n\t" : : "r" (n1), // 0 "r" (x), // 1 "r" (alpha), // 2 "r" (n2) // 3 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) { BLASLONG n1 = n >> 4 ; BLASLONG n2 = n & 8 ; __asm__ __volatile__ ( "vxorpd %%xmm0, %%xmm0 , %%xmm0 \n\t" "addq $128, %1 \n\t" "cmpq $0, %0 \n\t" "je 2f \n\t" ".align 16 \n\t" "1: \n\t" "vmovups %%xmm0 ,-128(%1) \n\t" "vmovups %%xmm0 ,-112(%1) \n\t" "vmovups %%xmm0 , -96(%1) \n\t" "vmovups %%xmm0 , -80(%1) \n\t" "vmovups %%xmm0 , -64(%1) \n\t" "vmovups %%xmm0 , -48(%1) \n\t" "vmovups %%xmm0 , -32(%1) \n\t" "vmovups %%xmm0 , -16(%1) \n\t" "addq $128, %1 \n\t" "subq $1 , %0 \n\t" "jnz 1b \n\t" "2: \n\t" "cmpq $8 ,%3 \n\t" "jne 4f \n\t" "vmovups %%xmm0 ,-128(%1) \n\t" "vmovups %%xmm0 ,-112(%1) \n\t" "vmovups %%xmm0 , -96(%1) \n\t" "vmovups %%xmm0 , -80(%1) \n\t" "4: \n\t" "vzeroupper \n\t" : : "r" (n1), // 0 "r" (x), // 1 "r" (alpha), // 2 "r" (n2) // 3 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } 
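(Editor's reference sketch, not part of the OpenBLAS sources.) The dscal microkernel variants packed around this point (bulldozer, haswell, sandy) all follow the same contract visible in their prologues: n1 = n >> 4 blocks of sixteen doubles are scaled with packed vmulpd, an optional trailing block of eight is handled when n2 = n & 8 is set, and a separate store-only kernel covers the da == 0.0 branch taken by the dispatcher in dscal.c. The portable C sketch below illustrates that contract under stated assumptions (BLASLONG replaced by long, unit stride, n a multiple of 8 as in the callers); it is not the code shipped in these files, and the *_ref names are hypothetical.

/* Hypothetical scalar equivalent of dscal_kernel_8: x[0..n) *= *alpha,
 * mirroring the 16-element / trailing-8 blocking used by the asm kernels. */
void dscal_kernel_8_ref(long n, const double *alpha, double *x)
{
	long n1 = n >> 4;          /* number of 16-element blocks            */
	long n2 = n & 8;           /* one trailing 8-element block, or zero  */
	double a = *alpha;
	long i, k;

	for (i = 0; i < n1 * 16; i += 16)
		for (k = 0; k < 16; k++)
			x[i + k] *= a;     /* asm: eight packed vmulpd per pass      */

	if (n2 == 8)
		for (k = 0; k < 8; k++)
			x[n1 * 16 + k] *= a;
}

/* Hypothetical scalar equivalent of dscal_kernel_8_zero: the alpha == 0.0
 * path stores zeros outright instead of multiplying through. */
void dscal_kernel_8_zero_ref(long n, const double *alpha, double *x)
{
	long total = (n >> 4) * 16 + (n & 8);
	long i;

	(void)alpha;               /* unused; kept for signature parity      */
	for (i = 0; i < total; i++)
		x[i] = 0.0;
}

Splitting the zero case into its own kernel matches the if (da == 0.0) dispatch visible in dscal.c above and lets that path skip reading x entirely.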
OpenBLAS-0.2.20/kernel/x86_64/dscal_microk_sandy-2.c000066400000000000000000000137131313527062700215220ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014-2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_8 1 static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) { BLASLONG n1 = n >> 4 ; BLASLONG n2 = n & 8 ; __asm__ __volatile__ ( "vmovddup (%2), %%xmm0 \n\t" // alpha "addq $128, %1 \n\t" "cmpq $0, %0 \n\t" "je 4f \n\t" "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" "vmulpd -64(%1), %%xmm0, %%xmm8 \n\t" "vmulpd -48(%1), %%xmm0, %%xmm9 \n\t" "vmulpd -32(%1), %%xmm0, %%xmm10 \n\t" "vmulpd -16(%1), %%xmm0, %%xmm11 \n\t" "subq $1 , %0 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" "prefetcht0 640(%1) \n\t" "vmovups %%xmm4 ,-128(%1) \n\t" "vmovups %%xmm5 ,-112(%1) \n\t" "vmulpd 0(%1), %%xmm0, %%xmm4 \n\t" "vmovups %%xmm6 , -96(%1) \n\t" "vmulpd 16(%1), %%xmm0, %%xmm5 \n\t" "vmovups %%xmm7 , -80(%1) \n\t" "vmulpd 32(%1), %%xmm0, %%xmm6 \n\t" "prefetcht0 704(%1) \n\t" "vmovups %%xmm8 , -64(%1) \n\t" "vmulpd 48(%1), %%xmm0, %%xmm7 \n\t" "vmovups %%xmm9 , -48(%1) \n\t" "vmulpd 64(%1), %%xmm0, %%xmm8 \n\t" "vmovups %%xmm10 , -32(%1) \n\t" "vmulpd 80(%1), %%xmm0, %%xmm9 \n\t" "vmovups %%xmm11 , -16(%1) \n\t" "vmulpd 96(%1), %%xmm0, %%xmm10 \n\t" "vmulpd 112(%1), %%xmm0, %%xmm11 \n\t" "addq $128, %1 \n\t" "subq $1 , %0 \n\t" "jnz 1b \n\t" "2: \n\t" "vmovups %%xmm4 ,-128(%1) \n\t" "vmovups %%xmm5 ,-112(%1) \n\t" "vmovups %%xmm6 , -96(%1) \n\t" "vmovups %%xmm7 , -80(%1) \n\t" "vmovups %%xmm8 , -64(%1) \n\t" "vmovups %%xmm9 , -48(%1) \n\t" "vmovups %%xmm10 , -32(%1) \n\t" "vmovups %%xmm11 , -16(%1) \n\t" "addq $128, %1 \n\t" "4: \n\t" "cmpq $8 ,%3 \n\t" "jne 5f \n\t" "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" "vmulpd -112(%1), %%xmm0, %%xmm5 
\n\t" "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" "vmovups %%xmm4 ,-128(%1) \n\t" "vmovups %%xmm5 ,-112(%1) \n\t" "vmovups %%xmm6 , -96(%1) \n\t" "vmovups %%xmm7 , -80(%1) \n\t" "5: \n\t" "vzeroupper \n\t" : : "r" (n1), // 0 "r" (x), // 1 "r" (alpha), // 2 "r" (n2) // 3 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) { BLASLONG n1 = n >> 4 ; BLASLONG n2 = n & 8 ; __asm__ __volatile__ ( "vxorpd %%xmm0, %%xmm0 , %%xmm0 \n\t" "addq $128, %1 \n\t" "cmpq $0, %0 \n\t" "je 2f \n\t" ".align 16 \n\t" "1: \n\t" "vmovups %%xmm0 ,-128(%1) \n\t" "vmovups %%xmm0 ,-112(%1) \n\t" "vmovups %%xmm0 , -96(%1) \n\t" "vmovups %%xmm0 , -80(%1) \n\t" "vmovups %%xmm0 , -64(%1) \n\t" "vmovups %%xmm0 , -48(%1) \n\t" "vmovups %%xmm0 , -32(%1) \n\t" "vmovups %%xmm0 , -16(%1) \n\t" "addq $128, %1 \n\t" "subq $1 , %0 \n\t" "jnz 1b \n\t" "2: \n\t" "cmpq $8 ,%3 \n\t" "jne 4f \n\t" "vmovups %%xmm0 ,-128(%1) \n\t" "vmovups %%xmm0 ,-112(%1) \n\t" "vmovups %%xmm0 , -96(%1) \n\t" "vmovups %%xmm0 , -80(%1) \n\t" "4: \n\t" "vzeroupper \n\t" : : "r" (n1), // 0 "r" (x), // 1 "r" (alpha), // 2 "r" (n2) // 3 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/dsymv_L.c000066400000000000000000000154171313527062700171530ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_L_microk_bulldozer-2.c" #elif defined(HASWELL) || defined(ZEN) #include "dsymv_L_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_L_microk_sandy-2.c" #elif defined(NEHALEM) #include "dsymv_L_microk_nehalem-2.c" #endif #ifndef HAVE_KERNEL_4x4 static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *tmp1, FLOAT *temp2) { FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 }; BLASLONG i; for (i=from; i=12 ) { BLASLONG m2 = (m/4)*4; for (i=j+1; i j+4 ) dsymv_kernel_4x4(j+4,m2,ap,x,y,tmp1,tmp2); for (i=m2; i=8 ) { BLASLONG j1 = ((from + 4)/4)*4; BLASLONG j2 = (m/4)*4; for (i=from; i static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOAT *C0, FLOAT *C1, FLOAT *C2,FLOAT *C3, FLOAT *C4, FLOAT *C5,FLOAT *C6, FLOAT *C7) __attribute__ ((noinline)); static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOAT *C0, FLOAT *C1, FLOAT *C2,FLOAT *C3, FLOAT *C4, FLOAT *C5,FLOAT *C6, FLOAT *C7) { BLASLONG i = 0; BLASLONG temp1 = n * 8; __asm__ __volatile__ ( " vxorpd %%ymm4 , %%ymm4 , %%ymm4 \n\t" " vxorpd %%ymm5 , %%ymm5 , %%ymm5 \n\t" " vxorpd %%ymm6 , %%ymm6 , %%ymm6 \n\t" " vxorpd %%ymm7 , %%ymm7 , %%ymm7 \n\t" " vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" " vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" " vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" " vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" " cmp $0, %1 \n\t" " jz 2f \n\t" " .align 16 \n\t" "1: \n\t" " vmovups (%2,%0,4) , %%ymm0 \n\t" " vmovups (%3,%0,8) , %%ymm1 \n\t" " vmovups 32(%3,%0,8) , %%ymm2 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm4 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm8 \n\t" " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm5 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm9 \n\t" " vpermpd $0x1b , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm6 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm10 \n\t" " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm7 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm11 \n\t" " addq $8 , %0 \n\t" " cmp %0 , %1 \n\t" " jne 1b \n\t" "2: \n\t" " vbroadcastsd (%4), %%ymm0 \n\t" " vmulpd %%ymm0 , %%ymm4 , %%ymm4 \n\t" " vmulpd %%ymm0 , %%ymm5 , %%ymm5 \n\t" " vmulpd %%ymm0 , %%ymm6 , %%ymm6 \n\t" " vmulpd %%ymm0 , %%ymm7 , %%ymm7 \n\t" " vmulpd %%ymm0 , %%ymm8 , %%ymm8 \n\t" " vmulpd %%ymm0 , %%ymm9 , %%ymm9 \n\t" " vmulpd %%ymm0 , %%ymm10, %%ymm10 \n\t" " vmulpd %%ymm0 , %%ymm11, %%ymm11 \n\t" " vpermpd $0xb1 , %%ymm5 , %%ymm5 \n\t" " vpermpd $0xb1 , %%ymm7 , %%ymm7 \n\t" " vblendpd $0x0a , %%ymm5 , %%ymm4 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm5 , %%ymm4 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm7 , %%ymm6 , %%ymm2 \n\t" " vblendpd $0x05 , %%ymm7 , %%ymm6 , %%ymm3 \n\t" " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t" " vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm6 \n\t" " vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm7 \n\t" " vmovups %%ymm4 , (%5) \n\t" " vmovups %%ymm5 , (%6) \n\t" " vmovups %%ymm6 , (%7) \n\t" " vmovups %%ymm7 , (%8) \n\t" " vpermpd $0xb1 , %%ymm9 , %%ymm9 \n\t" " vpermpd $0xb1 , %%ymm11, %%ymm11 \n\t" " vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t" " vblendpd $0x05 , 
%%ymm9 , %%ymm8 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t" " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t" " vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm6 \n\t" " vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm7 \n\t" " vmovups %%ymm4 , (%9) \n\t" " vmovups %%ymm5 , (%10) \n\t" " vmovups %%ymm6 , (%11) \n\t" " vmovups %%ymm7 , (%12) \n\t" : : "a" (i), // 0 "r" (temp1), // 1 "S" (a), // 2 "D" (b), // 3 "r" (alpha), // 4 "r" (C0), // 5 "r" (C1), // 6 "r" (C2), // 7 "r" (C3), // 8 "r" (C4), // 9 "r" (C5), // 10 "r" (C6), // 11 "r" (C7) // 12 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) { BLASLONG i,j,k; FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb; FLOAT res0_0; FLOAT res0_1; FLOAT res0_2; FLOAT res0_3; FLOAT res1_0; FLOAT res1_1; FLOAT res1_2; FLOAT res1_3; FLOAT res2_0; FLOAT res2_1; FLOAT res2_2; FLOAT res2_3; FLOAT res3_0; FLOAT res3_1; FLOAT res3_2; FLOAT res3_3; FLOAT res4_0; FLOAT res4_1; FLOAT res4_2; FLOAT res4_3; FLOAT res5_0; FLOAT res5_1; FLOAT res5_2; FLOAT res5_3; FLOAT res6_0; FLOAT res6_1; FLOAT res6_2; FLOAT res6_3; FLOAT res7_0; FLOAT res7_1; FLOAT res7_2; FLOAT res7_3; FLOAT a0; FLOAT a1; FLOAT b0; FLOAT b1; FLOAT b2; FLOAT b3; FLOAT b4; FLOAT b5; FLOAT b6; FLOAT b7; BLASLONG off, temp ; bool left; bool transposed; bool backwards; #ifdef LEFT left = true; #else left = false; #endif #ifdef TRANSA transposed = true; #else transposed = false; #endif backwards = left != transposed; if (!left) { off = -offset; } for (j=0; j= 0; i--) { aa = *(a + i); for (j = 0; j < n; j ++) { cj = c + j * ldc; bb = *(cj + i); bb *= aa; *b = bb; *(cj + i) = bb; b ++; BLASLONG i1 = i & -4 ; FLOAT t0,t1,t2,t3; k=0; if ( i & 4 ) { t0 = cj[k]; t1 = cj[k+1]; t2 = cj[k+2]; t3 = cj[k+3]; t0 -= bb * a[k+0]; t1 -= bb * a[k+1]; t2 -= bb * a[k+2]; t3 -= bb * a[k+3]; cj[k+0] = t0; cj[k+1] = t1; cj[k+2] = t2; cj[k+3] = t3; k+=4; } if ( i & 2 ) { t0 = a[k]; t1 = a[k+1]; t0 *= bb; t1 *= bb; cj[k+0] -= t0; cj[k+1] -= t1; k+=2; } if ( i & 1 ) { t0 = bb * a[k]; cj[k+0] -= t0; } } a -= m; b -= 2 * n; } } #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa1, aa2; FLOAT bb1, bb2; FLOAT cc1, cc2; int i, j, k; ldc *= 2; a += (m - 1) * m * 2; b += (m - 1) * n * 2; for (i = m - 1; i >= 0; i--) { aa1 = *(a + i * 2 + 0); aa2 = *(a + i * 2 + 1); for (j = 0; j < n; j ++) { bb1 = *(c + i * 2 + 0 + j * ldc); bb2 = *(c + i * 2 + 1 + j * ldc); #ifndef CONJ cc1 = aa1 * bb1 - aa2 * bb2; cc2 = aa1 * bb2 + aa2 * bb1; #else cc1 = aa1 * bb1 + aa2 * bb2; cc2 = aa1 * bb2 - aa2 * bb1; #endif *(b + 0) = cc1; *(b + 1) = cc2; *(c + i * 2 + 0 + j * ldc) = cc1; *(c + i * 2 + 1 + j * ldc) = cc2; b += 2; for (k = 0; k < i; k ++){ #ifndef CONJ *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); #else *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); #endif } } a 
-= m * 2; b -= 4 * n; } } #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #ifdef COMPLEX FLOAT dummy2, #endif FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ BLASLONG i, j; FLOAT *aa, *cc; BLASLONG kk; #if 0 fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", m, n, k, offset); #endif j = (n >> GEMM_UNROLL_N_SHIFT); while (j > 0) { kk = m + offset; if (m & (GEMM_UNROLL_M - 1)) { for (i = 1; i < GEMM_UNROLL_M; i *= 2){ if (m & i) { aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; if (k - kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } solve(i, GEMM_UNROLL_N, aa + (kk - i) * i * COMPSIZE, b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); kk -= i; } } } i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; do { dtrsm_LN_solve_opt(k-kk, aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc , aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE,b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE); aa -= GEMM_UNROLL_M * k * COMPSIZE; cc -= GEMM_UNROLL_M * COMPSIZE; kk -= GEMM_UNROLL_M; i --; } while (i > 0); } b += GEMM_UNROLL_N * k * COMPSIZE; c += GEMM_UNROLL_N * ldc * COMPSIZE; j --; } if (n & (GEMM_UNROLL_N - 1)) { j = (GEMM_UNROLL_N >> 1); while (j > 0) { if (n & j) { kk = m + offset; if (m & (GEMM_UNROLL_M - 1)) { for (i = 1; i < GEMM_UNROLL_M; i *= 2){ if (m & i) { aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; if (k - kk > 0) { GEMM_KERNEL(i, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, b + j * kk * COMPSIZE, cc, ldc); } solve(i, j, aa + (kk - i) * i * COMPSIZE, b + (kk - i) * j * COMPSIZE, cc, ldc); kk -= i; } } } i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; do { if (k - kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + GEMM_UNROLL_M * kk * COMPSIZE, b + j * kk * COMPSIZE, cc, ldc); } solve(GEMM_UNROLL_M, j, aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_M) * j * COMPSIZE, cc, ldc); aa -= GEMM_UNROLL_M * k * COMPSIZE; cc -= GEMM_UNROLL_M * COMPSIZE; kk -= GEMM_UNROLL_M; i --; } while (i > 0); } b += j * k * COMPSIZE; c += j * ldc * COMPSIZE; } j >>= 1; } } return 0; } OpenBLAS-0.2.20/kernel/x86_64/dtrsm_kernel_LT_8x2_bulldozer.S000066400000000000000000000652071313527062700233730ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %r12 #define BB %rbp #define J %rbx #ifndef WINDOWS_ABI #define STACKSIZE 96 #define OFFSET 48(%rsp) #define AORIG 56(%rsp) #define KK 64(%rsp) #define KKK 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define AORIG 232(%rsp) #define KK 240(%rsp) #define KKK 248(%rsp) #endif #define A_PR1 384 #define B_PR1 192 .macro KERNEL8x2_SUB vmovddup -16*SIZE(BO,%rax,2), %xmm1 vmovddup -15*SIZE(BO,%rax,2), %xmm2 vmovups -16*SIZE(AO,%rax,8), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 vfmaddpd %xmm9 , %xmm0 , %xmm2 , %xmm9 vmovups -14*SIZE(AO,%rax,8), %xmm4 vfmaddpd %xmm10, %xmm4 , %xmm1 , %xmm10 vfmaddpd %xmm11, %xmm4 , %xmm2 , %xmm11 vmovups -12*SIZE(AO,%rax,8), %xmm5 vfmaddpd %xmm12, %xmm5 , %xmm1 , %xmm12 vfmaddpd %xmm13, %xmm5 , %xmm2 , %xmm13 vmovups -10*SIZE(AO,%rax,8), %xmm6 vfmaddpd %xmm14, %xmm6 , %xmm1 , %xmm14 vfmaddpd %xmm15, %xmm6 , %xmm2 , %xmm15 addq $ SIZE, %rax .endm .macro SOLVE_8x2 vmovups %xmm8 , %xmm1 vunpcklpd %xmm9 , %xmm8 , %xmm8 vunpckhpd %xmm9 , %xmm1 , %xmm1 vmovups -16 * SIZE(BO), %xmm0 vsubpd %xmm8 , %xmm0 , %xmm0 vmovups -14 * SIZE(BO), %xmm8 vsubpd %xmm1 , %xmm8 , %xmm1 vmovups %xmm10, %xmm3 vunpcklpd %xmm11, %xmm10 , %xmm10 vunpckhpd %xmm11, %xmm3 , %xmm3 vmovups -12 * SIZE(BO), %xmm8 vmovups -10 * SIZE(BO), %xmm9 vsubpd %xmm10, %xmm8 , %xmm2 vsubpd %xmm3 , %xmm9 , %xmm3 vmovups %xmm12, %xmm5 vunpcklpd %xmm13, %xmm12 , %xmm12 vunpckhpd %xmm13, %xmm5 , %xmm5 vmovups -8 * SIZE(BO), %xmm8 vmovups -6 * SIZE(BO), %xmm9 vsubpd %xmm12, %xmm8 , %xmm4 vsubpd %xmm5 , %xmm9 , %xmm5 vmovups %xmm14, %xmm7 vunpcklpd %xmm15, %xmm14 , %xmm14 vunpckhpd %xmm15, %xmm7 , %xmm7 vmovups -4 * SIZE(BO), %xmm8 vmovups -2 * SIZE(BO), %xmm9 vsubpd %xmm14, %xmm8 , %xmm6 vsubpd %xmm7 , %xmm9 , %xmm7 vmovddup -16 * SIZE(AO), %xmm8 vmulpd %xmm0 , %xmm8 , %xmm0 vmovddup -15 * SIZE(AO), %xmm9 vfnmaddpd %xmm1 , %xmm0 , %xmm9 , %xmm1 vmovddup -14 * SIZE(AO), %xmm10 vfnmaddpd %xmm2 , %xmm0 , 
%xmm10, %xmm2 vmovddup -13 * SIZE(AO), %xmm11 vfnmaddpd %xmm3 , %xmm0 , %xmm11, %xmm3 vmovddup -12 * SIZE(AO), %xmm8 vfnmaddpd %xmm4 , %xmm0 , %xmm8 , %xmm4 vmovddup -11 * SIZE(AO), %xmm9 vfnmaddpd %xmm5 , %xmm0 , %xmm9 , %xmm5 vmovddup -10 * SIZE(AO), %xmm10 vfnmaddpd %xmm6 , %xmm0 , %xmm10, %xmm6 vmovddup -9 * SIZE(AO), %xmm11 vfnmaddpd %xmm7 , %xmm0 , %xmm11, %xmm7 vmovddup -7 * SIZE(AO), %xmm8 vmulpd %xmm1 , %xmm8 , %xmm1 vmovddup -6 * SIZE(AO), %xmm10 vfnmaddpd %xmm2 , %xmm1 , %xmm10, %xmm2 vmovddup -5 * SIZE(AO), %xmm11 vfnmaddpd %xmm3 , %xmm1 , %xmm11, %xmm3 vmovddup -4 * SIZE(AO), %xmm8 vfnmaddpd %xmm4 , %xmm1 , %xmm8 , %xmm4 vmovddup -3 * SIZE(AO), %xmm9 vfnmaddpd %xmm5 , %xmm1 , %xmm9 , %xmm5 vmovddup -2 * SIZE(AO), %xmm10 vfnmaddpd %xmm6 , %xmm1 , %xmm10, %xmm6 vmovddup -1 * SIZE(AO), %xmm11 vfnmaddpd %xmm7 , %xmm1 , %xmm11, %xmm7 vmovddup 2 * SIZE(AO), %xmm8 vmulpd %xmm2 , %xmm8 , %xmm2 vmovddup 3 * SIZE(AO), %xmm11 vfnmaddpd %xmm3 , %xmm2 , %xmm11, %xmm3 vmovddup 4 * SIZE(AO), %xmm8 vfnmaddpd %xmm4 , %xmm2 , %xmm8 , %xmm4 vmovddup 5 * SIZE(AO), %xmm9 vfnmaddpd %xmm5 , %xmm2 , %xmm9 , %xmm5 vmovddup 6 * SIZE(AO), %xmm10 vfnmaddpd %xmm6 , %xmm2 , %xmm10, %xmm6 vmovddup 7 * SIZE(AO), %xmm11 vfnmaddpd %xmm7 , %xmm2 , %xmm11, %xmm7 vmovddup 11 * SIZE(AO), %xmm8 vmulpd %xmm3 , %xmm8 , %xmm3 vmovddup 12 * SIZE(AO), %xmm11 vfnmaddpd %xmm4 , %xmm3 , %xmm11, %xmm4 vmovddup 13 * SIZE(AO), %xmm9 vfnmaddpd %xmm5 , %xmm3 , %xmm9 , %xmm5 vmovddup 14 * SIZE(AO), %xmm10 vfnmaddpd %xmm6 , %xmm3 , %xmm10, %xmm6 vmovddup 15 * SIZE(AO), %xmm11 vfnmaddpd %xmm7 , %xmm3 , %xmm11, %xmm7 vmovddup 20 * SIZE(AO), %xmm8 vmulpd %xmm4 , %xmm8 , %xmm4 vmovddup 21 * SIZE(AO), %xmm9 vfnmaddpd %xmm5 , %xmm4 , %xmm9 , %xmm5 vmovddup 22 * SIZE(AO), %xmm10 vfnmaddpd %xmm6 , %xmm4 , %xmm10, %xmm6 vmovddup 23 * SIZE(AO), %xmm11 vfnmaddpd %xmm7 , %xmm4 , %xmm11, %xmm7 vmovddup 29 * SIZE(AO), %xmm8 vmulpd %xmm5 , %xmm8 , %xmm5 vmovddup 30 * SIZE(AO), %xmm10 vfnmaddpd %xmm6 , %xmm5 , %xmm10, %xmm6 vmovddup 31 * SIZE(AO), %xmm11 vfnmaddpd %xmm7 , %xmm5 , %xmm11, %xmm7 vmovddup 38 * SIZE(AO), %xmm8 vmulpd %xmm6 , %xmm8 , %xmm6 vmovddup 39 * SIZE(AO), %xmm11 vfnmaddpd %xmm7 , %xmm6 , %xmm11, %xmm7 vmovddup 47 * SIZE(AO), %xmm8 vmulpd %xmm7 , %xmm8 , %xmm7 vmovsd %xmm0 , 0 * SIZE(CO1) vmovsd %xmm1 , 1 * SIZE(CO1) vmovsd %xmm2 , 2 * SIZE(CO1) vmovsd %xmm3 , 3 * SIZE(CO1) vmovsd %xmm4 , 4 * SIZE(CO1) vmovsd %xmm5 , 5 * SIZE(CO1) vmovsd %xmm6 , 6 * SIZE(CO1) vmovsd %xmm7 , 7 * SIZE(CO1) vmovhpd %xmm0 , 0 * SIZE(CO2) vmovhpd %xmm1 , 1 * SIZE(CO2) vmovhpd %xmm2 , 2 * SIZE(CO2) vmovhpd %xmm3 , 3 * SIZE(CO2) vmovhpd %xmm4 , 4 * SIZE(CO2) vmovhpd %xmm5 , 5 * SIZE(CO2) vmovhpd %xmm6 , 6 * SIZE(CO2) vmovhpd %xmm7 , 7 * SIZE(CO2) vmovups %xmm0 , -16 * SIZE(BO) vmovups %xmm1 , -14 * SIZE(BO) vmovups %xmm2 , -12 * SIZE(BO) vmovups %xmm3 , -10 * SIZE(BO) vmovups %xmm4 , -8 * SIZE(BO) vmovups %xmm5 , -6 * SIZE(BO) vmovups %xmm6 , -4 * SIZE(BO) vmovups %xmm7 , -2 * SIZE(BO) .endm .macro KERNEL4x2_SUB vmovddup -16*SIZE(BO,%rax,2), %xmm1 vmovddup -15*SIZE(BO,%rax,2), %xmm2 vmovups -16*SIZE(AO,%rax,4), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 vfmaddpd %xmm9 , %xmm0 , %xmm2 , %xmm9 vmovups -14*SIZE(AO,%rax,4), %xmm0 vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10 vfmaddpd %xmm11, %xmm0 , %xmm2 , %xmm11 addq $ SIZE, %rax .endm .macro SOLVE_4x2 vmovups %xmm8 , %xmm1 vunpcklpd %xmm9 , %xmm8 , %xmm8 vunpckhpd %xmm9 , %xmm1 , %xmm1 vmovups -16 * SIZE(BO), %xmm0 vsubpd %xmm8 , %xmm0 , %xmm0 vmovups -14 * SIZE(BO), %xmm8 vsubpd %xmm1 , %xmm8 , %xmm1 
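	/* Editor's note, not in the upstream file: SOLVE_4x2 first interleaves
	   the accumulated 4x2 tile (xmm8..xmm11) back into row order with
	   vunpcklpd/vunpckhpd and subtracts it from the packed B panel; the
	   vmulpd/vfnmaddpd chain that follows performs a forward substitution
	   against the packed triangular 4x4 A block, whose diagonal entries
	   (offsets -16, -11, -6, -1 from AO) appear to be stored pre-inverted,
	   since a multiply is used where a divide would otherwise be needed.
	   The solved rows are stored to CO1/CO2 and written back to BO. */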
vmovups %xmm10, %xmm3 vunpcklpd %xmm11, %xmm10 , %xmm10 vunpckhpd %xmm11, %xmm3 , %xmm3 vmovups -12 * SIZE(BO), %xmm8 vmovups -10 * SIZE(BO), %xmm9 vsubpd %xmm10, %xmm8 , %xmm2 vsubpd %xmm3 , %xmm9 , %xmm3 vmovddup -16 * SIZE(AO), %xmm8 vmulpd %xmm0 , %xmm8 , %xmm0 vmovddup -15 * SIZE(AO), %xmm9 vfnmaddpd %xmm1 , %xmm0 , %xmm9 , %xmm1 vmovddup -14 * SIZE(AO), %xmm10 vfnmaddpd %xmm2 , %xmm0 , %xmm10, %xmm2 vmovddup -13 * SIZE(AO), %xmm11 vfnmaddpd %xmm3 , %xmm0 , %xmm11, %xmm3 vmovddup -11 * SIZE(AO), %xmm8 vmulpd %xmm1 , %xmm8 , %xmm1 vmovddup -10 * SIZE(AO), %xmm10 vfnmaddpd %xmm2 , %xmm1 , %xmm10, %xmm2 vmovddup -9 * SIZE(AO), %xmm11 vfnmaddpd %xmm3 , %xmm1 , %xmm11, %xmm3 vmovddup -6 * SIZE(AO), %xmm8 vmulpd %xmm2 , %xmm8 , %xmm2 vmovddup -5 * SIZE(AO), %xmm11 vfnmaddpd %xmm3 , %xmm2 , %xmm11, %xmm3 vmovddup -1 * SIZE(AO), %xmm8 vmulpd %xmm3 , %xmm8 , %xmm3 vmovsd %xmm0 , 0 * SIZE(CO1) vmovsd %xmm1 , 1 * SIZE(CO1) vmovsd %xmm2 , 2 * SIZE(CO1) vmovsd %xmm3 , 3 * SIZE(CO1) vmovhpd %xmm0 , 0 * SIZE(CO2) vmovhpd %xmm1 , 1 * SIZE(CO2) vmovhpd %xmm2 , 2 * SIZE(CO2) vmovhpd %xmm3 , 3 * SIZE(CO2) vmovups %xmm0 , -16 * SIZE(BO) vmovups %xmm1 , -14 * SIZE(BO) vmovups %xmm2 , -12 * SIZE(BO) vmovups %xmm3 , -10 * SIZE(BO) .endm .macro KERNEL2x2_SUB vmovddup -16*SIZE(BO,%rax,2), %xmm1 vmovddup -15*SIZE(BO,%rax,2), %xmm2 vmovups -16*SIZE(AO,%rax,2), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 vfmaddpd %xmm9 , %xmm0 , %xmm2 , %xmm9 addq $ SIZE, %rax .endm .macro SOLVE_2x2 vmovups %xmm8 , %xmm1 vunpcklpd %xmm9 , %xmm8 , %xmm8 vunpckhpd %xmm9 , %xmm1 , %xmm1 vmovups -16 * SIZE(BO), %xmm0 vsubpd %xmm8 , %xmm0 , %xmm0 vmovups -14 * SIZE(BO), %xmm8 vsubpd %xmm1 , %xmm8 , %xmm1 vmovddup -16 * SIZE(AO), %xmm8 vmulpd %xmm0 , %xmm8 , %xmm0 vmovddup -15 * SIZE(AO), %xmm9 vfnmaddpd %xmm1 , %xmm0 , %xmm9 , %xmm1 vmovddup -13 * SIZE(AO), %xmm8 vmulpd %xmm1 , %xmm8 , %xmm1 vmovsd %xmm0 , 0 * SIZE(CO1) vmovsd %xmm1 , 1 * SIZE(CO1) vmovhpd %xmm0 , 0 * SIZE(CO2) vmovhpd %xmm1 , 1 * SIZE(CO2) vmovups %xmm0 , -16 * SIZE(BO) vmovups %xmm1 , -14 * SIZE(BO) .endm .macro KERNEL1x2_SUB vmovups -16*SIZE(BO,%rax,2), %xmm1 vmovddup -16*SIZE(AO,%rax,1), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 addq $ SIZE, %rax .endm .macro SOLVE_1x2 vmovups -16 * SIZE(BO), %xmm0 vsubpd %xmm8 , %xmm0 , %xmm0 vmovddup -16 * SIZE(AO), %xmm8 vmulpd %xmm0 , %xmm8 , %xmm0 vmovsd %xmm0 , 0 * SIZE(CO1) vmovhpd %xmm0 , 0 * SIZE(CO2) vmovups %xmm0 , -16 * SIZE(BO) .endm /******************************************************************************************/ .macro KERNEL8x1_SUB vmovddup -16*SIZE(BO,%rax,1), %xmm1 vmovups -16*SIZE(AO,%rax,8), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 vmovups -14*SIZE(AO,%rax,8), %xmm0 vfmaddpd %xmm9 , %xmm0 , %xmm1 , %xmm9 vmovups -12*SIZE(AO,%rax,8), %xmm0 vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10 vmovups -10*SIZE(AO,%rax,8), %xmm0 vfmaddpd %xmm11, %xmm0 , %xmm1 , %xmm11 addq $ SIZE, %rax .endm .macro SOLVE_8x1 vmovups -16 * SIZE(BO), %xmm1 vmovups -14 * SIZE(BO), %xmm3 vmovups -12 * SIZE(BO), %xmm5 vmovups -10 * SIZE(BO), %xmm7 vsubpd %xmm8 , %xmm1 , %xmm1 vsubpd %xmm9 , %xmm3 , %xmm3 vsubpd %xmm10, %xmm5 , %xmm5 vsubpd %xmm11, %xmm7 , %xmm7 vmovups %xmm1 , %xmm0 vunpckhpd %xmm1 , %xmm1 , %xmm1 vmovups %xmm3 , %xmm2 vunpckhpd %xmm3 , %xmm3 , %xmm3 vmovups %xmm5 , %xmm4 vunpckhpd %xmm5 , %xmm5 , %xmm5 vmovups %xmm7 , %xmm6 vunpckhpd %xmm7 , %xmm7 , %xmm7 vmulsd -16 * SIZE(AO), %xmm0 , %xmm0 vfnmaddsd %xmm1 ,-15 * SIZE(AO), %xmm0 , %xmm1 vfnmaddsd %xmm2 ,-14 * SIZE(AO), %xmm0 , %xmm2 vfnmaddsd %xmm3 ,-13 * 
SIZE(AO), %xmm0 , %xmm3 vfnmaddsd %xmm4 ,-12 * SIZE(AO), %xmm0 , %xmm4 vfnmaddsd %xmm5 ,-11 * SIZE(AO), %xmm0 , %xmm5 vfnmaddsd %xmm6 ,-10 * SIZE(AO), %xmm0 , %xmm6 vfnmaddsd %xmm7 , -9 * SIZE(AO), %xmm0 , %xmm7 vmulsd -7 * SIZE(AO), %xmm1 , %xmm1 vfnmaddsd %xmm2 , -6 * SIZE(AO), %xmm1 , %xmm2 vfnmaddsd %xmm3 , -5 * SIZE(AO), %xmm1 , %xmm3 vfnmaddsd %xmm4 , -4 * SIZE(AO), %xmm1 , %xmm4 vfnmaddsd %xmm5 , -3 * SIZE(AO), %xmm1 , %xmm5 vfnmaddsd %xmm6 , -2 * SIZE(AO), %xmm1 , %xmm6 vfnmaddsd %xmm7 , -1 * SIZE(AO), %xmm1 , %xmm7 vmulsd 2 * SIZE(AO), %xmm2 , %xmm2 vfnmaddsd %xmm3 , 3 * SIZE(AO), %xmm2 , %xmm3 vfnmaddsd %xmm4 , 4 * SIZE(AO), %xmm2 , %xmm4 vfnmaddsd %xmm5 , 5 * SIZE(AO), %xmm2 , %xmm5 vfnmaddsd %xmm6 , 6 * SIZE(AO), %xmm2 , %xmm6 vfnmaddsd %xmm7 , 7 * SIZE(AO), %xmm2 , %xmm7 vmulsd 11 * SIZE(AO), %xmm3 , %xmm3 vfnmaddsd %xmm4 , 12 * SIZE(AO), %xmm3 , %xmm4 vfnmaddsd %xmm5 , 13 * SIZE(AO), %xmm3 , %xmm5 vfnmaddsd %xmm6 , 14 * SIZE(AO), %xmm3 , %xmm6 vfnmaddsd %xmm7 , 15 * SIZE(AO), %xmm3 , %xmm7 vmulsd 20 * SIZE(AO), %xmm4 , %xmm4 vfnmaddsd %xmm5 , 21 * SIZE(AO), %xmm4 , %xmm5 vfnmaddsd %xmm6 , 22 * SIZE(AO), %xmm4 , %xmm6 vfnmaddsd %xmm7 , 23 * SIZE(AO), %xmm4 , %xmm7 vmulsd 29 * SIZE(AO), %xmm5 , %xmm5 vfnmaddsd %xmm6 , 30 * SIZE(AO), %xmm5 , %xmm6 vfnmaddsd %xmm7 , 31 * SIZE(AO), %xmm5 , %xmm7 vmulsd 38 * SIZE(AO), %xmm6 , %xmm6 vfnmaddsd %xmm7 , 39 * SIZE(AO), %xmm6 , %xmm7 vmulsd 47 * SIZE(AO), %xmm7 , %xmm7 vmovsd %xmm0 , 0 * SIZE(CO1) vmovsd %xmm1 , 1 * SIZE(CO1) vmovsd %xmm2 , 2 * SIZE(CO1) vmovsd %xmm3 , 3 * SIZE(CO1) vmovsd %xmm4 , 4 * SIZE(CO1) vmovsd %xmm5 , 5 * SIZE(CO1) vmovsd %xmm6 , 6 * SIZE(CO1) vmovsd %xmm7 , 7 * SIZE(CO1) vmovsd %xmm0 , -16 * SIZE(BO) vmovsd %xmm1 , -15 * SIZE(BO) vmovsd %xmm2 , -14 * SIZE(BO) vmovsd %xmm3 , -13 * SIZE(BO) vmovsd %xmm4 , -12 * SIZE(BO) vmovsd %xmm5 , -11 * SIZE(BO) vmovsd %xmm6 , -10 * SIZE(BO) vmovsd %xmm7 , -9 * SIZE(BO) .endm .macro KERNEL4x1_SUB vmovddup -16*SIZE(BO,%rax,1), %xmm1 vmovups -16*SIZE(AO,%rax,4), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 vmovups -14*SIZE(AO,%rax,4), %xmm0 vfmaddpd %xmm9 , %xmm0 , %xmm1 , %xmm9 addq $ SIZE, %rax .endm .macro SOLVE_4x1 vmovups -16 * SIZE(BO), %xmm1 vmovups -14 * SIZE(BO), %xmm3 vsubpd %xmm8 , %xmm1 , %xmm1 vsubpd %xmm9 , %xmm3 , %xmm3 vmovups %xmm1 , %xmm0 vunpckhpd %xmm1 , %xmm1 , %xmm1 vmovups %xmm3 , %xmm2 vunpckhpd %xmm3 , %xmm3 , %xmm3 vmulsd -16 * SIZE(AO), %xmm0 , %xmm0 vfnmaddsd %xmm1 ,-15 * SIZE(AO), %xmm0 , %xmm1 vfnmaddsd %xmm2 ,-14 * SIZE(AO), %xmm0 , %xmm2 vfnmaddsd %xmm3 ,-13 * SIZE(AO), %xmm0 , %xmm3 vmulsd -11 * SIZE(AO), %xmm1 , %xmm1 vfnmaddsd %xmm2 ,-10 * SIZE(AO), %xmm1 , %xmm2 vfnmaddsd %xmm3 , -9 * SIZE(AO), %xmm1 , %xmm3 vmulsd -6 * SIZE(AO), %xmm2 , %xmm2 vfnmaddsd %xmm3 , -5 * SIZE(AO), %xmm2 , %xmm3 vmulsd -1 * SIZE(AO), %xmm3 , %xmm3 vmovsd %xmm0 , 0 * SIZE(CO1) vmovsd %xmm1 , 1 * SIZE(CO1) vmovsd %xmm2 , 2 * SIZE(CO1) vmovsd %xmm3 , 3 * SIZE(CO1) vmovsd %xmm0 , -16 * SIZE(BO) vmovsd %xmm1 , -15 * SIZE(BO) vmovsd %xmm2 , -14 * SIZE(BO) vmovsd %xmm3 , -13 * SIZE(BO) .endm .macro KERNEL2x1_SUB vmovddup -16*SIZE(BO,%rax,1), %xmm1 vmovups -16*SIZE(AO,%rax,2), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 addq $ SIZE, %rax .endm .macro SOLVE_2x1 vmovups -16 * SIZE(BO), %xmm1 vsubpd %xmm8 , %xmm1 , %xmm1 vmovups %xmm1 , %xmm0 vunpckhpd %xmm1 , %xmm1 , %xmm1 vmulsd -16 * SIZE(AO), %xmm0 , %xmm0 vfnmaddsd %xmm1 ,-15 * SIZE(AO), %xmm0 , %xmm1 vmulsd -13 * SIZE(AO), %xmm1 , %xmm1 vmovsd %xmm0 , 0 * SIZE(CO1) vmovsd %xmm1 , 1 * SIZE(CO1) vmovsd %xmm0 , -16 * 
SIZE(BO) vmovsd %xmm1 , -15 * SIZE(BO) .endm .macro KERNEL1x1_SUB vmovsd -16*SIZE(BO,%rax,1), %xmm1 vmovsd -16*SIZE(AO,%rax,1), %xmm0 vfmaddsd %xmm8 , %xmm0 , %xmm1 , %xmm8 addq $ SIZE, %rax .endm .macro SOLVE_1x1 vmovsd -16 * SIZE(BO), %xmm1 vsubsd %xmm8 , %xmm1 , %xmm1 vmulsd -16 * SIZE(AO), %xmm1 , %xmm1 vmovsd %xmm1 , 0 * SIZE(CO1) vmovsd %xmm1 , -16 * SIZE(BO) .endm /***************************************************************************************************************/ PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm12 #else movq STACKSIZE + 8(%rsp), LDC movsd STACKSIZE + 16(%rsp), %xmm12 #endif movq OLD_M, M movq OLD_N, N subq $-16 * SIZE, A subq $-16 * SIZE, B movsd %xmm12, OFFSET movsd %xmm12, KK leaq (, LDC, SIZE), LDC movq N, J sarq $1, J # j = (n >> 1) jle .L80 ALIGN_4 .L01: movq A, AO movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc leaq (C, LDC, 2), C movq OFFSET, %rax movq %rax, KK movq M, I sarq $3, I # i = (m >> 3) jle .L50_A ALIGN_4 /*********************************************************************************/ .L51: movq B, BO vxorpd %xmm8 , %xmm8 , %xmm8 vxorpd %xmm9 , %xmm9 , %xmm9 vxorpd %xmm10, %xmm10, %xmm10 vxorpd %xmm11, %xmm11, %xmm11 vxorpd %xmm12, %xmm12, %xmm12 vxorpd %xmm13, %xmm13, %xmm13 vxorpd %xmm14, %xmm14, %xmm14 vxorpd %xmm15, %xmm15, %xmm15 movq KK, %rax andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 2), BO negq %rax je .L56 ALIGN_4 .L52: prefetcht0 A_PR1(AO,%rax,8) prefetcht0 B_PR1(BO,%rax,2) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,8) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,8) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,8) KERNEL8x2_SUB jl .L52 ALIGN_4 .L56: movq KK, %rax andq $3, %rax # if (k & 1) je .L59 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L57: KERNEL8x2_SUB jl .L57 ALIGN_4 .L59: SOLVE_8x2 addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 2), BO addq $8, KK decq I # i -- jg .L51 ALIGN_4 /*********************************************************************************/ .L50_A: testq $4, M je .L60 .L51_A: movq B, BO vxorpd %xmm8 , %xmm8 , %xmm8 vxorpd %xmm9 , %xmm9 , %xmm9 vxorpd %xmm10, %xmm10, %xmm10 vxorpd %xmm11, %xmm11, %xmm11 movq KK, %rax andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO negq %rax je .L56_A ALIGN_4 .L52_A: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB jl .L52_A ALIGN_4 .L56_A: movq KK, %rax andq $3, %rax # if (k & 1) je .L59_A leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L57_A: KERNEL4x2_SUB jl .L57_A ALIGN_4 .L59_A: SOLVE_4x2 addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO addq $4, KK ALIGN_4 /*********************************************************************************/ .L60: testq $2, M je .L70 .L61: movq B, BO vxorpd 
%xmm8, %xmm8 , %xmm8 vxorpd %xmm9, %xmm9 , %xmm9 movq KK, %rax andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO negq %rax je .L66 ALIGN_4 .L62: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB jl .L62 ALIGN_4 .L66: movq KK, %rax andq $3, %rax # if (k & 1) je .L69 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L67: KERNEL2x2_SUB jl .L67 ALIGN_4 .L69: SOLVE_2x2 addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO addq $2, KK ALIGN_4 /********************************************************************************/ .L70: testq $1, M je .L79 ALIGN_4 .L71: movq B, BO vxorpd %xmm8, %xmm8 , %xmm8 movq KK, %rax andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO negq %rax je .L76 ALIGN_4 .L72: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB jl .L72 ALIGN_4 .L76: movq KK, %rax andq $3, %rax # if (k & 1) je .L78 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L77: KERNEL1x2_SUB jl .L77 ALIGN_4 .L78: SOLVE_1x2 addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO addq $1, KK ALIGN_4 .L79: movq BO, B decq J # j -- jg .L01 ALIGN_4 /***************************************************************************************/ .L80: testq $1, N je .L999 movq A, AO movq C, CO1 # coffset1 = c movq OFFSET, %rax movq %rax, KK movq M, I sarq $3, I # i = (m >> 3) jle .L90_A ALIGN_4 /*************************************************************************************/ .L91: movq B, BO vxorpd %xmm8, %xmm8 , %xmm8 vxorpd %xmm9, %xmm9 , %xmm9 vxorpd %xmm10, %xmm10, %xmm10 vxorpd %xmm11, %xmm11, %xmm11 movq KK, %rax andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 1), BO negq %rax je .L96 ALIGN_4 .L92: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB jl .L92 ALIGN_4 .L96: movq KK, %rax andq $3, %rax # if (k & 1) je .L99 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L97: KERNEL8x1_SUB jl .L97 ALIGN_4 .L99: SOLVE_8x1 addq $8 * SIZE, CO1 movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 8), AO addq %rax, BO addq $8, KK decq I # i -- jg .L91 ALIGN_4 /*****************************************************************************/ .L90_A: testq $4, M je .L100 .L91_A: movq B, BO vxorpd %xmm8, %xmm8 , %xmm8 vxorpd %xmm9, %xmm9 , %xmm9 movq KK, %rax andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO negq %rax je .L96_A ALIGN_4 .L92_A: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB jl .L92_A ALIGN_4 .L96_A: movq KK, %rax andq $3, %rax # if (k & 1) je .L99_A leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L97_A: KERNEL4x1_SUB jl .L97_A ALIGN_4 .L99_A: SOLVE_4x1 addq $4 * SIZE, CO1 movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO addq %rax, BO addq $4, KK ALIGN_4 /*************************************************************************************/ .L100: testq $2, M je .L110 movq B, BO vxorpd %xmm8, %xmm8 , %xmm8 movq KK, %rax andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO negq %rax je .L106 ALIGN_4 .L102: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB jl .L102 ALIGN_4 .L106: movq KK, %rax andq $3, %rax # if (k & 1) je .L109 leaq (, 
%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L107: KERNEL2x1_SUB jl .L107 ALIGN_4 .L109: SOLVE_2x1 addq $2 * SIZE, CO1 movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO addq %rax, BO addq $2, KK ALIGN_4 .L110: testq $1, M je .L119 ALIGN_4 .L111: movq B, BO vxorpd %xmm8, %xmm8 , %xmm8 movq KK, %rax andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO negq %rax je .L116 ALIGN_4 .L112: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB jl .L112 ALIGN_4 .L116: movq KK, %rax andq $3, %rax # if (k & 1) je .L118 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L117: KERNEL1x1_SUB jl .L117 ALIGN_4 .L118: SOLVE_1x1 addq $1 * SIZE, CO1 movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax addq %rax, AO addq %rax, BO addq $1, KK ALIGN_4 .L119: movq BO, B ALIGN_4 .L999: movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/dtrsm_kernel_RN_8x2_bulldozer.S000066400000000000000000000475741313527062700234020ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %r12 #define BB %rbp #define J %rbx #ifndef WINDOWS_ABI #define STACKSIZE 96 #define OFFSET 48(%rsp) #define AORIG 56(%rsp) #define KK 64(%rsp) #define KKK 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define AORIG 232(%rsp) #define KK 240(%rsp) #define KKK 248(%rsp) #endif #define A_PR1 384 #define B_PR1 192 .macro KERNEL8x2_SUB vmovddup -16*SIZE(BO,%rax,2), %xmm1 vmovddup -15*SIZE(BO,%rax,2), %xmm2 vmovups -16*SIZE(AO,%rax,8), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 vfmaddpd %xmm9 , %xmm0 , %xmm2 , %xmm9 vmovups -14*SIZE(AO,%rax,8), %xmm4 vfmaddpd %xmm10, %xmm4 , %xmm1 , %xmm10 vfmaddpd %xmm11, %xmm4 , %xmm2 , %xmm11 vmovups -12*SIZE(AO,%rax,8), %xmm5 vfmaddpd %xmm12, %xmm5 , %xmm1 , %xmm12 vfmaddpd %xmm13, %xmm5 , %xmm2 , %xmm13 vmovups -10*SIZE(AO,%rax,8), %xmm6 vfmaddpd %xmm14, %xmm6 , %xmm1 , %xmm14 vfmaddpd %xmm15, %xmm6 , %xmm2 , %xmm15 addq $ SIZE, %rax .endm .macro SOLVE_8x2 vmovups -16 * SIZE(AO), %xmm0 vmovups -14 * SIZE(AO), %xmm1 vmovups -12 * SIZE(AO), %xmm2 vmovups -10 * SIZE(AO), %xmm3 vmovups -8 * SIZE(AO), %xmm4 vmovups -6 * SIZE(AO), %xmm5 vmovups -4 * SIZE(AO), %xmm6 vmovups -2 * SIZE(AO), %xmm7 vsubpd %xmm8 , %xmm0 , %xmm0 vsubpd %xmm10, %xmm1 , %xmm1 vsubpd %xmm12, %xmm2 , %xmm2 vsubpd %xmm14, %xmm3 , %xmm3 vsubpd %xmm9 , %xmm4 , %xmm4 vsubpd %xmm11, %xmm5 , %xmm5 vsubpd %xmm13, %xmm6 , %xmm6 vsubpd %xmm15, %xmm7 , %xmm7 vmovddup -16 * SIZE(BO), %xmm8 vmulpd %xmm0 , %xmm8 , %xmm0 vmulpd %xmm1 , %xmm8 , %xmm1 vmulpd %xmm2 , %xmm8 , %xmm2 vmulpd %xmm3 , %xmm8 , %xmm3 vmovddup -15 * SIZE(BO), %xmm9 vfnmaddpd %xmm4 , %xmm0 , %xmm9 , %xmm4 vfnmaddpd %xmm5 , %xmm1 , %xmm9 , %xmm5 vfnmaddpd %xmm6 , %xmm2 , %xmm9 , %xmm6 vfnmaddpd %xmm7 , %xmm3 , %xmm9 , %xmm7 vmovddup -13 * SIZE(BO), %xmm10 vmulpd %xmm4 , %xmm10, %xmm4 vmulpd %xmm5 , %xmm10, %xmm5 vmulpd %xmm6 , %xmm10, %xmm6 vmulpd %xmm7 , %xmm10, %xmm7 vmovups %xmm0 , 0 * SIZE(CO1) vmovups %xmm1 , 2 * SIZE(CO1) vmovups %xmm2 , 4 * SIZE(CO1) vmovups %xmm3 , 6 * SIZE(CO1) vmovups %xmm4 , 0 * SIZE(CO2) vmovups %xmm5 , 2 * SIZE(CO2) vmovups %xmm6 , 4 * SIZE(CO2) vmovups %xmm7 , 6 * SIZE(CO2) vmovups %xmm0 , -16 * SIZE(AO) vmovups %xmm1 , -14 * SIZE(AO) vmovups %xmm2 , -12 * SIZE(AO) vmovups %xmm3 , -10 * SIZE(AO) vmovups %xmm4 , -8 * SIZE(AO) vmovups %xmm5 , -6 * SIZE(AO) vmovups %xmm6 , -4 * SIZE(AO) vmovups %xmm7 , -2 * SIZE(AO) .endm .macro KERNEL4x2_SUB vmovddup -16*SIZE(BO,%rax,2), %xmm1 vmovddup -15*SIZE(BO,%rax,2), %xmm2 vmovups -16*SIZE(AO,%rax,4), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 vfmaddpd %xmm9 , %xmm0 , %xmm2 , %xmm9 vmovups -14*SIZE(AO,%rax,4), %xmm0 vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10 vfmaddpd %xmm11, %xmm0 , %xmm2 , %xmm11 addq $ SIZE, %rax .endm .macro SOLVE_4x2 vmovups -16 * SIZE(AO), %xmm0 vmovups -14 * SIZE(AO), %xmm1 vmovups -12 * SIZE(AO), %xmm2 vmovups -10 * SIZE(AO), %xmm3 vsubpd %xmm8 , %xmm0 , %xmm0 vsubpd %xmm10, %xmm1 , %xmm1 vsubpd %xmm9 , %xmm2 , %xmm2 vsubpd %xmm11, %xmm3 , %xmm3 vmovddup -16 * SIZE(BO), %xmm8 vmulpd %xmm0 , %xmm8 , %xmm0 vmulpd %xmm1 , 
%xmm8 , %xmm1 vmovddup -15 * SIZE(BO), %xmm9 vfnmaddpd %xmm2 , %xmm0 , %xmm9 , %xmm2 vfnmaddpd %xmm3 , %xmm1 , %xmm9 , %xmm3 vmovddup -13 * SIZE(BO), %xmm10 vmulpd %xmm2 , %xmm10, %xmm2 vmulpd %xmm3 , %xmm10, %xmm3 vmovups %xmm0 , 0 * SIZE(CO1) vmovups %xmm1 , 2 * SIZE(CO1) vmovups %xmm2 , 0 * SIZE(CO2) vmovups %xmm3 , 2 * SIZE(CO2) vmovups %xmm0 , -16 * SIZE(AO) vmovups %xmm1 , -14 * SIZE(AO) vmovups %xmm2 , -12 * SIZE(AO) vmovups %xmm3 , -10 * SIZE(AO) .endm .macro KERNEL2x2_SUB vmovddup -16*SIZE(BO,%rax,2), %xmm1 vmovddup -15*SIZE(BO,%rax,2), %xmm2 vmovups -16*SIZE(AO,%rax,2), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 vfmaddpd %xmm9 , %xmm0 , %xmm2 , %xmm9 addq $ SIZE, %rax .endm .macro SOLVE_2x2 vmovups -16 * SIZE(AO), %xmm0 vmovups -14 * SIZE(AO), %xmm2 vsubpd %xmm8 , %xmm0 , %xmm0 vsubpd %xmm9 , %xmm2 , %xmm2 vmovddup -16 * SIZE(BO), %xmm8 vmulpd %xmm0 , %xmm8 , %xmm0 vmovddup -15 * SIZE(BO), %xmm9 vfnmaddpd %xmm2 , %xmm0 , %xmm9 , %xmm2 vmovddup -13 * SIZE(BO), %xmm10 vmulpd %xmm2 , %xmm10, %xmm2 vmovups %xmm0 , 0 * SIZE(CO1) vmovups %xmm2 , 0 * SIZE(CO2) vmovups %xmm0 , -16 * SIZE(AO) vmovups %xmm2 , -14 * SIZE(AO) .endm .macro KERNEL1x2_SUB vmovups -16*SIZE(BO,%rax,2), %xmm1 vmovddup -16*SIZE(AO,%rax,1), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 addq $ SIZE, %rax .endm .macro SOLVE_1x2 vmovups -16 * SIZE(AO), %xmm2 vsubpd %xmm8 , %xmm2 , %xmm2 vmovups %xmm2 , %xmm0 vunpckhpd %xmm0 , %xmm0 , %xmm0 vmovsd -16 * SIZE(BO), %xmm8 vmulsd %xmm2 , %xmm8 , %xmm2 vmovsd -15 * SIZE(BO), %xmm9 vfnmaddsd %xmm0 , %xmm2 , %xmm9 , %xmm0 vmovsd -13 * SIZE(BO), %xmm10 vmulsd %xmm0 , %xmm10, %xmm0 vmovsd %xmm2 , 0 * SIZE(CO1) vmovsd %xmm0 , 0 * SIZE(CO2) vmovsd %xmm2 , -16 * SIZE(AO) vmovsd %xmm0 , -15 * SIZE(AO) .endm /******************************************************************************************/ .macro KERNEL8x1_SUB vmovddup -16*SIZE(BO,%rax,1), %xmm1 vmovups -16*SIZE(AO,%rax,8), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 vmovups -14*SIZE(AO,%rax,8), %xmm0 vfmaddpd %xmm9 , %xmm0 , %xmm1 , %xmm9 vmovups -12*SIZE(AO,%rax,8), %xmm0 vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10 vmovups -10*SIZE(AO,%rax,8), %xmm0 vfmaddpd %xmm11, %xmm0 , %xmm1 , %xmm11 addq $ SIZE, %rax .endm .macro SOLVE_8x1 vmovups -16 * SIZE(AO), %xmm0 vmovups -14 * SIZE(AO), %xmm1 vmovups -12 * SIZE(AO), %xmm2 vmovups -10 * SIZE(AO), %xmm3 vsubpd %xmm8 , %xmm0 , %xmm0 vsubpd %xmm9 , %xmm1 , %xmm1 vsubpd %xmm10, %xmm2 , %xmm2 vsubpd %xmm11, %xmm3 , %xmm3 vmovddup -16 * SIZE(BO), %xmm8 vmulpd %xmm0 , %xmm8 , %xmm0 vmulpd %xmm1 , %xmm8 , %xmm1 vmulpd %xmm2 , %xmm8 , %xmm2 vmulpd %xmm3 , %xmm8 , %xmm3 vmovups %xmm0 , 0 * SIZE(CO1) vmovups %xmm1 , 2 * SIZE(CO1) vmovups %xmm2 , 4 * SIZE(CO1) vmovups %xmm3 , 6 * SIZE(CO1) vmovups %xmm0 , -16 * SIZE(AO) vmovups %xmm1 , -14 * SIZE(AO) vmovups %xmm2 , -12 * SIZE(AO) vmovups %xmm3 , -10 * SIZE(AO) .endm .macro KERNEL4x1_SUB vmovddup -16*SIZE(BO,%rax,1), %xmm1 vmovups -16*SIZE(AO,%rax,4), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 vmovups -14*SIZE(AO,%rax,4), %xmm0 vfmaddpd %xmm9 , %xmm0 , %xmm1 , %xmm9 addq $ SIZE, %rax .endm .macro SOLVE_4x1 vmovups -16 * SIZE(AO), %xmm0 vmovups -14 * SIZE(AO), %xmm1 vsubpd %xmm8 , %xmm0 , %xmm0 vsubpd %xmm9 , %xmm1 , %xmm1 vmovddup -16 * SIZE(BO), %xmm8 vmulpd %xmm0 , %xmm8 , %xmm0 vmulpd %xmm1 , %xmm8 , %xmm1 vmovups %xmm0 , 0 * SIZE(CO1) vmovups %xmm1 , 2 * SIZE(CO1) vmovups %xmm0 , -16 * SIZE(AO) vmovups %xmm1 , -14 * SIZE(AO) .endm .macro KERNEL2x1_SUB vmovddup -16*SIZE(BO,%rax,1), %xmm1 vmovups -16*SIZE(AO,%rax,2), %xmm0 vfmaddpd %xmm8 
, %xmm0 , %xmm1 , %xmm8 addq $ SIZE, %rax .endm .macro SOLVE_2x1 vmovups -16 * SIZE(AO), %xmm1 vsubpd %xmm8 , %xmm1 , %xmm1 vmovddup -16 * SIZE(BO), %xmm8 vmulpd %xmm1 , %xmm8 , %xmm1 vmovups %xmm1 , 0 * SIZE(CO1) vmovups %xmm1 , -16 * SIZE(AO) .endm .macro KERNEL1x1_SUB vmovsd -16*SIZE(BO,%rax,1), %xmm1 vmovsd -16*SIZE(AO,%rax,1), %xmm0 vfmaddsd %xmm8 , %xmm0 , %xmm1 , %xmm8 addq $ SIZE, %rax .endm .macro SOLVE_1x1 vmovsd -16 * SIZE(AO), %xmm1 vsubsd %xmm8 , %xmm1 , %xmm1 vmulsd -16 * SIZE(BO), %xmm1 , %xmm1 vmovsd %xmm1 , 0 * SIZE(CO1) vmovsd %xmm1 , -16 * SIZE(AO) .endm /***************************************************************************************************************/ PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm12 #else movq STACKSIZE + 8(%rsp), LDC movsd STACKSIZE + 16(%rsp), %xmm12 #endif movq OLD_M, M movq OLD_N, N subq $-16 * SIZE, A subq $-16 * SIZE, B movsd %xmm12, OFFSET movsd %xmm12, KK negq KK // for RN Kernel leaq (, LDC, SIZE), LDC movq N, J sarq $1, J # j = (n >> 1) jle .L80 ALIGN_4 .L01: movq A, AO movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc leaq (C, LDC, 2), C movq M, I sarq $3, I # i = (m >> 3) jle .L50_A ALIGN_4 /*********************************************************************************/ .L51: movq B, BO vxorpd %xmm8 , %xmm8 , %xmm8 vxorpd %xmm9 , %xmm9 , %xmm9 vxorpd %xmm10, %xmm10, %xmm10 vxorpd %xmm11, %xmm11, %xmm11 vxorpd %xmm12, %xmm12, %xmm12 vxorpd %xmm13, %xmm13, %xmm13 vxorpd %xmm14, %xmm14, %xmm14 vxorpd %xmm15, %xmm15, %xmm15 movq KK, %rax andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 2), BO negq %rax je .L56 ALIGN_4 .L52: prefetcht0 A_PR1(AO,%rax,8) prefetcht0 B_PR1(BO,%rax,2) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,8) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,8) KERNEL8x2_SUB prefetcht0 A_PR1(AO,%rax,8) KERNEL8x2_SUB jl .L52 ALIGN_4 .L56: movq KK, %rax andq $3, %rax # if (k & 1) je .L59 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L57: KERNEL8x2_SUB jl .L57 ALIGN_4 .L59: SOLVE_8x2 addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 2), BO decq I # i -- jg .L51 ALIGN_4 /*********************************************************************************/ .L50_A: testq $4, M je .L60 .L51_A: movq B, BO vxorpd %xmm8 , %xmm8 , %xmm8 vxorpd %xmm9 , %xmm9 , %xmm9 vxorpd %xmm10, %xmm10, %xmm10 vxorpd %xmm11, %xmm11, %xmm11 movq KK, %rax andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO negq %rax je .L56_A ALIGN_4 .L52_A: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB jl .L52_A ALIGN_4 .L56_A: movq KK, %rax andq $3, %rax # if (k & 1) je .L59_A leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L57_A: KERNEL4x2_SUB jl .L57_A ALIGN_4 .L59_A: SOLVE_4x2 addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, 
%rax, 4), AO leaq (BO, %rax, 2), BO ALIGN_4 /*********************************************************************************/ .L60: testq $2, M je .L70 .L61: movq B, BO vxorpd %xmm8, %xmm8 , %xmm8 vxorpd %xmm9, %xmm9 , %xmm9 movq KK, %rax andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO negq %rax je .L66 ALIGN_4 .L62: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB jl .L62 ALIGN_4 .L66: movq KK, %rax andq $3, %rax # if (k & 1) je .L69 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L67: KERNEL2x2_SUB jl .L67 ALIGN_4 .L69: SOLVE_2x2 addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO ALIGN_4 /********************************************************************************/ .L70: testq $1, M je .L79 ALIGN_4 .L71: movq B, BO vxorpd %xmm8, %xmm8 , %xmm8 movq KK, %rax andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO negq %rax je .L76 ALIGN_4 .L72: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB jl .L72 ALIGN_4 .L76: movq KK, %rax andq $3, %rax # if (k & 1) je .L78 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L77: KERNEL1x2_SUB jl .L77 ALIGN_4 .L78: SOLVE_1x2 addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO ALIGN_4 .L79: addq $2, KK // number of values in B # only for RN Kernel movq BO, B decq J # j -- jg .L01 ALIGN_4 /***************************************************************************************/ .L80: testq $1, N je .L999 movq A, AO movq C, CO1 # coffset1 = c movq M, I sarq $3, I # i = (m >> 3) jle .L90_A ALIGN_4 /*************************************************************************************/ .L91: movq B, BO vxorpd %xmm8, %xmm8 , %xmm8 vxorpd %xmm9, %xmm9 , %xmm9 vxorpd %xmm10, %xmm10, %xmm10 vxorpd %xmm11, %xmm11, %xmm11 movq KK, %rax andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 1), BO negq %rax je .L96 ALIGN_4 .L92: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB jl .L92 ALIGN_4 .L96: movq KK, %rax andq $3, %rax # if (k & 1) je .L99 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L97: KERNEL8x1_SUB jl .L97 ALIGN_4 .L99: SOLVE_8x1 addq $8 * SIZE, CO1 movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 8), AO addq %rax, BO decq I # i -- jg .L91 ALIGN_4 /*****************************************************************************/ .L90_A: testq $4, M je .L100 .L91_A: movq B, BO vxorpd %xmm8, %xmm8 , %xmm8 vxorpd %xmm9, %xmm9 , %xmm9 movq KK, %rax andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO negq %rax je .L96_A ALIGN_4 .L92_A: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB jl .L92_A ALIGN_4 .L96_A: movq KK, %rax andq $3, %rax # if (k & 1) je .L99_A leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L97_A: KERNEL4x1_SUB jl .L97_A ALIGN_4 .L99_A: SOLVE_4x1 addq $4 * SIZE, CO1 movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO addq %rax, BO ALIGN_4 /*************************************************************************************/ .L100: testq $2, M je .L110 movq B, BO vxorpd %xmm8, %xmm8 , %xmm8 movq KK, %rax andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO negq %rax je 
.L106 ALIGN_4 .L102: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB jl .L102 ALIGN_4 .L106: movq KK, %rax andq $3, %rax # if (k & 1) je .L109 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L107: KERNEL2x1_SUB jl .L107 ALIGN_4 .L109: SOLVE_2x1 addq $2 * SIZE, CO1 movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO addq %rax, BO ALIGN_4 .L110: testq $1, M je .L119 ALIGN_4 .L111: movq B, BO vxorpd %xmm8, %xmm8 , %xmm8 movq KK, %rax andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO negq %rax je .L116 ALIGN_4 .L112: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB jl .L112 ALIGN_4 .L116: movq KK, %rax andq $3, %rax # if (k & 1) je .L118 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L117: KERNEL1x1_SUB jl .L117 ALIGN_4 .L118: SOLVE_1x1 addq $1 * SIZE, CO1 movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax addq %rax, AO addq %rax, BO ALIGN_4 .L119: addq $1 , KK // number of values in B # only for RN Kernel movq BO, B ALIGN_4 .L999: movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/dtrsm_kernel_RN_haswell.c000066400000000000000000000503631313527062700223440ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include "common.h" static FLOAT dm1 = -1.; #ifdef CONJ #define GEMM_KERNEL GEMM_KERNEL_R #else #define GEMM_KERNEL GEMM_KERNEL_N #endif #if GEMM_DEFAULT_UNROLL_M == 1 #define GEMM_UNROLL_M_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_M == 2 #define GEMM_UNROLL_M_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_M == 4 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 6 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_M == 16 #define GEMM_UNROLL_M_SHIFT 4 #endif #if GEMM_DEFAULT_UNROLL_N == 1 #define GEMM_UNROLL_N_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_N == 2 #define GEMM_UNROLL_N_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_N == 4 #define GEMM_UNROLL_N_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_N == 8 #define GEMM_UNROLL_N_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_N == 16 #define GEMM_UNROLL_N_SHIFT 4 #endif static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) { FLOAT *c3 = c + ldc + ldc*2 ; FLOAT *c6 = c + ldc*4 + ldc*2 ; ldc = ldc *8; BLASLONG n1 = n * 8; BLASLONG i=0; __asm__ __volatile__ ( " vzeroupper \n\t" " vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" " vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" " vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" " vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" " vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" " vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" " vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" " vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" " cmpq $0, %0 \n\t" " je 4f \n\t" " vmovups (%2,%1,4), %%ymm0 \n\t" // read a " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 " addq $8, %1 \n\t" " cmpq %1, %0 \n\t" " je 21f \n\t" " .align 16 \n\t" "1: \n\t" " vmovups (%2,%1,4), %%ymm4 \n\t" // read a " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" " vmovups (%3,%1,8), %%ymm5 \n\t" // read b0 " vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t" " vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t" " vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t" " vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1 " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" " addq $8, %1 \n\t" " vfmadd231pd %%ymm3 , %%ymm1 , %%ymm11 \n\t" " vfmadd231pd %%ymm3 , %%ymm2 , %%ymm15 \n\t" " cmpq %1, %0 \n\t" " jz 22f \n\t" " vmovups (%2,%1,4), %%ymm0 \n\t" // read a " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" " vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t" " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" " addq $8, %1 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm11 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm15 \n\t" " cmpq %1, %0 \n\t" " jnz 1b \n\t" "21: \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm9 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm13 \n\t" " vpermpd $0x1b , %%ymm0 
, %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm11 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm15 \n\t" " jmp 3f \n\t" "22: \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" " vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm11 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm15 \n\t" "3: \n\t" " vpermpd $0xb1 , %%ymm9 , %%ymm9 \n\t" " vpermpd $0xb1 , %%ymm11, %%ymm11 \n\t" " vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t" " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm8 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm9 \n\t" " vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm10 \n\t" " vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm11 \n\t" " vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" " vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" " vblendpd $0x0a , %%ymm13, %%ymm12, %%ymm0 \n\t" " vblendpd $0x05 , %%ymm13, %%ymm12, %%ymm1 \n\t" " vblendpd $0x0a , %%ymm15, %%ymm14, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm15, %%ymm14, %%ymm3 \n\t" " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm12 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm13 \n\t" " vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm14 \n\t" " vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm15 \n\t" "4: \n\t" " vmovups (%4) , %%ymm0 \n\t" // read c0 " vmovups (%4,%7,1) , %%ymm1 \n\t" // read c1 " vmovups (%4,%7,2) , %%ymm2 \n\t" // read c2 " vmovups (%5) , %%ymm3 \n\t" // read c3 " vmovups (%5,%7,1) , %%ymm4 \n\t" // read c4 " vmovups (%5,%7,2) , %%ymm5 \n\t" // read c5 " vmovups (%6) , %%ymm6 \n\t" // read c6 " vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7 " vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t" " vmovups (%9), %%ymm0 \n\t" " vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t" " vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t" " vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t" " vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t" " vsubpd %%ymm11, %%ymm3 , %%ymm11 \n\t" " vpermpd $0xff , %%ymm0 , %%ymm3 \n\t" " vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t" " vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t" " vmovups 32(%9), %%ymm4 \n\t" " vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t" " vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t" " vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t" " vpermpd $0xaa , %%ymm4 , %%ymm6 \n\t" " vsubpd %%ymm15, %%ymm7 , %%ymm15 \n\t" " vpermpd $0xff , %%ymm4 , %%ymm7 \n\t" " vpermpd $0x00 , %%ymm4 , %%ymm4 \n\t" "5: \n\t" // i = 0 " addq $64, %9 \n\t" // b=b+8 " vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb " vmovups (%9), %%ymm0 \n\t" " vmovups %%ymm8 , (%8) \n\t" // write a " vmovups %%ymm8 , (%4) \n\t" // write c " vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t" " vmovups 32(%9), %%ymm1 \n\t" " vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t" " vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t" " vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t" " vpermpd $0xff 
, %%ymm0 , %%ymm3 \n\t" " vfnmadd231pd %%ymm8 , %%ymm4 , %%ymm12 \n\t" " vpermpd $0x55 , %%ymm0 , %%ymm0 \n\t" " vfnmadd231pd %%ymm8 , %%ymm5 , %%ymm13 \n\t" " vpermpd $0x55 , %%ymm1 , %%ymm5 \n\t" " vfnmadd231pd %%ymm8 , %%ymm6 , %%ymm14 \n\t" " vpermpd $0xaa , %%ymm1 , %%ymm6 \n\t" " vfnmadd231pd %%ymm8 , %%ymm7 , %%ymm15 \n\t" " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" " addq $64, %9 \n\t" // b=b+8 " addq $32, %8 \n\t" // a=a+8 " vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb " vmovups (%9), %%ymm0 \n\t" " vmovups 32(%9), %%ymm1 \n\t" " vmovups %%ymm9 , (%8) \n\t" // write a " vmovups %%ymm9 , (%4,%7,1) \n\t" // write c " vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t" " vfnmadd231pd %%ymm9 , %%ymm3 , %%ymm11 \n\t" " vpermpd $0xff , %%ymm0 , %%ymm3 \n\t" " vfnmadd231pd %%ymm9 , %%ymm4 , %%ymm12 \n\t" " vpermpd $0xaa , %%ymm0 , %%ymm0 \n\t" " vfnmadd231pd %%ymm9 , %%ymm5 , %%ymm13 \n\t" " vpermpd $0x55 , %%ymm1 , %%ymm5 \n\t" " vfnmadd231pd %%ymm9 , %%ymm6 , %%ymm14 \n\t" " vpermpd $0xaa , %%ymm1 , %%ymm6 \n\t" " vfnmadd231pd %%ymm9 , %%ymm7 , %%ymm15 \n\t" " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" " addq $64, %9 \n\t" // b=b+8 " addq $32, %8 \n\t" // a=a+8 " vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb " vmovups (%9), %%ymm0 \n\t" " vmovups 32(%9), %%ymm1 \n\t" " vmovups %%ymm10, (%8) \n\t" // write a " vmovups %%ymm10, (%4,%7,2) \n\t" // write c " vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t" " vpermpd $0xff , %%ymm0 , %%ymm0 \n\t" " vfnmadd231pd %%ymm10, %%ymm4 , %%ymm12 \n\t" " vfnmadd231pd %%ymm10, %%ymm5 , %%ymm13 \n\t" " vpermpd $0x55 , %%ymm1 , %%ymm5 \n\t" " vfnmadd231pd %%ymm10, %%ymm6 , %%ymm14 \n\t" " vpermpd $0xaa , %%ymm1 , %%ymm6 \n\t" " vfnmadd231pd %%ymm10, %%ymm7 , %%ymm15 \n\t" " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" " addq $64, %9 \n\t" // b=b+8 " addq $32, %8 \n\t" // a=a+8 " vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb " vmovups 32(%9), %%ymm1 \n\t" " vmovups %%ymm11, (%8) \n\t" // write a " vmovups %%ymm11, (%5) \n\t" // write c " vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t" " vfnmadd231pd %%ymm11, %%ymm5 , %%ymm13 \n\t" " vpermpd $0x55 , %%ymm1 , %%ymm5 \n\t" " vfnmadd231pd %%ymm11, %%ymm6 , %%ymm14 \n\t" " vpermpd $0xaa , %%ymm1 , %%ymm6 \n\t" " vfnmadd231pd %%ymm11, %%ymm7 , %%ymm15 \n\t" " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t" " addq $64, %9 \n\t" // b=b+8 " addq $32, %8 \n\t" // a=a+8 " vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb " vmovups 32(%9), %%ymm1 \n\t" " vmovups %%ymm12, (%8) \n\t" // write a " vmovups %%ymm12, (%5,%7,1) \n\t" // write c " vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t" " vfnmadd231pd %%ymm12, %%ymm6 , %%ymm14 \n\t" " vpermpd $0xaa , %%ymm1 , %%ymm6 \n\t" " vfnmadd231pd %%ymm12, %%ymm7 , %%ymm15 \n\t" " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t" " addq $64, %9 \n\t" // b=b+8 " addq $32, %8 \n\t" // a=a+8 " vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb " vmovups 32(%9), %%ymm1 \n\t" " vmovups %%ymm13, (%8) \n\t" // write a " vmovups %%ymm13, (%5,%7,2) \n\t" // write c " vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t" " vfnmadd231pd %%ymm13, %%ymm7 , %%ymm15 \n\t" " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t" " addq $64, %9 \n\t" // b=b+8 " addq $32, %8 \n\t" // a=a+8 " vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb " vmovups 32(%9), %%ymm1 \n\t" " vmovups %%ymm14, (%8) \n\t" // write a " vmovups %%ymm14, (%6) \n\t" // 
write c " vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t" " vpermpd $0xff , %%ymm1 , %%ymm0 \n\t" " addq $32, %8 \n\t" // a=a+8 " vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb " vmovups %%ymm15, (%8) \n\t" // write a " vmovups %%ymm15, (%6,%7,1) \n\t" // write c " vzeroupper \n\t" : : "r" (n1), // 0 "a" (i), // 1 "r" (a), // 2 "r" (b), // 3 "r" (c), // 4 "r" (c3), // 5 "r" (c6), // 6 "r" (ldc), // 7 "r" (as), // 8 "r" (bs) // 9 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #ifndef COMPLEX static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa, bb; int i, j, k; for (i = 0; i < n; i++) { bb = *(b + i); for (j = 0; j < m; j ++) { aa = *(c + j + i * ldc); aa *= bb; *a = aa; *(c + j + i * ldc) = aa; a ++; for (k = i + 1; k < n; k ++){ *(c + j + k * ldc) -= aa * *(b + k); } } b += n; } } #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa1, aa2; FLOAT bb1, bb2; FLOAT cc1, cc2; int i, j, k; ldc *= 2; for (i = 0; i < n; i++) { bb1 = *(b + i * 2 + 0); bb2 = *(b + i * 2 + 1); for (j = 0; j < m; j ++) { aa1 = *(c + j * 2 + 0 + i * ldc); aa2 = *(c + j * 2 + 1 + i * ldc); #ifndef CONJ cc1 = aa1 * bb1 - aa2 * bb2; cc2 = aa1 * bb2 + aa2 * bb1; #else cc1 = aa1 * bb1 + aa2 * bb2; cc2 = -aa1 * bb2 + aa2 * bb1; #endif *(a + 0) = cc1; *(a + 1) = cc2; *(c + j * 2 + 0 + i * ldc) = cc1; *(c + j * 2 + 1 + i * ldc) = cc2; a += 2; for (k = i + 1; k < n; k ++){ #ifndef CONJ *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); #else *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); #endif } } b += n * 2; } } #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #ifdef COMPLEX FLOAT dummy2, #endif FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ FLOAT *aa, *cc; BLASLONG kk; BLASLONG i, j, jj; #if 0 fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", m, n, k, offset); #endif jj = 0; j = (n >> GEMM_UNROLL_N_SHIFT); kk = -offset; while (j > 0) { aa = a; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { do { dtrsm_RN_solve_opt(kk, aa, b, cc, ldc, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE); /* solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); */ aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } while (i > 0); } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(i, GEMM_UNROLL_N, aa + kk * i * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; } i >>= 1; } } kk += GEMM_UNROLL_N; b += GEMM_UNROLL_N * k * COMPSIZE; c += GEMM_UNROLL_N * ldc * COMPSIZE; j --; jj += GEMM_UNROLL_M; } if (n & (GEMM_UNROLL_N - 1)) { j = (GEMM_UNROLL_N >> 1); while (j > 0) { if (n & j) { aa = a; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); while (i > 0) { if (kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(GEMM_UNROLL_M, j, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += 
GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { GEMM_KERNEL(i, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(i, j, aa + kk * i * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; } i >>= 1; } } b += j * k * COMPSIZE; c += j * ldc * COMPSIZE; kk += j; } j >>= 1; } } return 0; } OpenBLAS-0.2.20/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c000066400000000000000000000360041313527062700227110ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include "common.h" static FLOAT dm1 = -1.; #ifdef CONJ #define GEMM_KERNEL GEMM_KERNEL_R #else #define GEMM_KERNEL GEMM_KERNEL_N #endif #if GEMM_DEFAULT_UNROLL_M == 1 #define GEMM_UNROLL_M_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_M == 2 #define GEMM_UNROLL_M_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_M == 4 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 6 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_M == 16 #define GEMM_UNROLL_M_SHIFT 4 #endif #if GEMM_DEFAULT_UNROLL_N == 1 #define GEMM_UNROLL_N_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_N == 2 #define GEMM_UNROLL_N_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_N == 4 #define GEMM_UNROLL_N_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_N == 8 #define GEMM_UNROLL_N_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_N == 16 #define GEMM_UNROLL_N_SHIFT 4 #endif static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) { FLOAT *c1 = c + ldc ; BLASLONG n1 = n * 8; BLASLONG i=0; as += (2 - 1) * 8; bs += (2 - 1) * 2; __asm__ __volatile__ ( " vzeroupper \n\t" " prefetcht0 (%4) \n\t" " prefetcht0 (%5) \n\t" " vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" " vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" " vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" " vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" " vxorpd %%xmm12, %%xmm12, %%xmm12 \n\t" " vxorpd %%xmm13, %%xmm13, %%xmm13 \n\t" " vxorpd %%xmm14, %%xmm14, %%xmm14 \n\t" " vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t" " cmpq $0, %0 \n\t" " je 2f \n\t" " .align 16 \n\t" "1: \n\t" " prefetcht0 384(%2,%1,8) \n\t" " prefetcht0 384(%3,%1,8) \n\t" " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b " vmovups (%2,%1,8), %%xmm4 \n\t" " vmovddup 8(%3,%1,2), %%xmm1 \n\t" " vmovups 16(%2,%1,8), %%xmm5 \n\t" " vmovups 32(%2,%1,8), %%xmm6 \n\t" " vmovups 48(%2,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" " vfmaddpd %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t" " vfmaddpd %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t" " vfmaddpd %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t" " vfmaddpd %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t" " addq $8, %1 \n\t" " vfmaddpd %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t" " vfmaddpd %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t" " cmpq %1, %0 \n\t" " jz 2f \n\t" " prefetcht0 384(%2,%1,8) \n\t" " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b " vmovups (%2,%1,8), %%xmm4 \n\t" " vmovddup 8(%3,%1,2), %%xmm1 \n\t" " vmovups 16(%2,%1,8), %%xmm5 \n\t" " vmovups 32(%2,%1,8), %%xmm6 \n\t" " vmovups 48(%2,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" " vfmaddpd %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t" " vfmaddpd %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t" " vfmaddpd %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t" " vfmaddpd %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t" " addq $8, %1 \n\t" " vfmaddpd %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t" " vfmaddpd %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t" " cmpq %1, %0 \n\t" " jz 2f \n\t" " prefetcht0 384(%2,%1,8) \n\t" " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b " vmovups (%2,%1,8), %%xmm4 \n\t" " vmovddup 8(%3,%1,2), %%xmm1 \n\t" " vmovups 16(%2,%1,8), %%xmm5 \n\t" " vmovups 32(%2,%1,8), %%xmm6 \n\t" " vmovups 48(%2,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , 
%%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" " vfmaddpd %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t" " vfmaddpd %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t" " vfmaddpd %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t" " vfmaddpd %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t" " addq $8, %1 \n\t" " vfmaddpd %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t" " vfmaddpd %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t" " cmpq %1, %0 \n\t" " jz 2f \n\t" " prefetcht0 384(%2,%1,8) \n\t" " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b " vmovddup 8(%3,%1,2), %%xmm1 \n\t" " vmovups (%2,%1,8), %%xmm4 \n\t" " vmovups 16(%2,%1,8), %%xmm5 \n\t" " vmovups 32(%2,%1,8), %%xmm6 \n\t" " vmovups 48(%2,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" " vfmaddpd %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t" " vfmaddpd %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t" " vfmaddpd %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t" " vfmaddpd %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t" " addq $8, %1 \n\t" " vfmaddpd %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t" " vfmaddpd %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t" " cmpq %1, %0 \n\t" " jnz 1b \n\t" "2: \n\t" " vmovups (%4) , %%xmm0 \n\t" " vmovups 16(%4) , %%xmm1 \n\t" " vmovups 32(%4) , %%xmm2 \n\t" " vmovups 48(%4) , %%xmm3 \n\t" " vmovups (%5) , %%xmm4 \n\t" " vmovups 16(%5) , %%xmm5 \n\t" " vmovups 32(%5) , %%xmm6 \n\t" " vmovups 48(%5) , %%xmm7 \n\t" " vsubpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" " vsubpd %%xmm9 , %%xmm1 , %%xmm9 \n\t" " vsubpd %%xmm10, %%xmm2 , %%xmm10 \n\t" " vsubpd %%xmm11, %%xmm3 , %%xmm11 \n\t" " vsubpd %%xmm12, %%xmm4 , %%xmm12 \n\t" " vsubpd %%xmm13, %%xmm5 , %%xmm13 \n\t" " vsubpd %%xmm14, %%xmm6 , %%xmm14 \n\t" " vsubpd %%xmm15, %%xmm7 , %%xmm15 \n\t" "3: \n\t" // i = 1 " vmovddup (%7), %%xmm1 \n\t" // read b " vmovddup 8(%7), %%xmm0 \n\t" // read bb " vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb " vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb " vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb " vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb " vmovups %%xmm12 , (%6) \n\t" // write a " vmovups %%xmm13 , 16(%6) \n\t" // write a " vmovups %%xmm14 , 32(%6) \n\t" // write a " vmovups %%xmm15 , 48(%6) \n\t" // write a " vmovups %%xmm12 , (%5) \n\t" // write c1 " vmovups %%xmm13 , 16(%5) \n\t" " vmovups %%xmm14 , 32(%5) \n\t" " vmovups %%xmm15 , 48(%5) \n\t" " vfnmaddpd %%xmm8 , %%xmm12 , %%xmm1 , %%xmm8 \n\t" // c = c - aa * b " vfnmaddpd %%xmm9 , %%xmm13 , %%xmm1 , %%xmm9 \n\t" " vfnmaddpd %%xmm10 , %%xmm14 , %%xmm1 , %%xmm10 \n\t" " vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" " \n\t" // i = 0 " subq $16 , %7 \n\t" // b = b - 2 " subq $64 , %6 \n\t" // a = a - 8 " vmovddup (%7), %%xmm0 \n\t" // read bb " vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb " vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t" " vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t" " vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t" " vmovups %%xmm8 , (%6) \n\t" // write a " vmovups %%xmm9 , 16(%6) \n\t" " vmovups %%xmm10 , 32(%6) \n\t" " vmovups %%xmm11 , 48(%6) \n\t" " vmovups %%xmm8 , (%4) \n\t" // write c0 " vmovups %%xmm9 , 16(%4) \n\t" " vmovups %%xmm10 , 32(%4) \n\t" " vmovups %%xmm11 , 48(%4) \n\t" " vzeroupper \n\t" : : "r" (n1), // 0 "a" (i), // 1 "r" (a), // 2 "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 "r" (as), // 6 "r" (bs) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #ifndef COMPLEX static inline void solve(BLASLONG m, 
BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa, bb; int i, j, k; a += (n - 1) * m; b += (n - 1) * n; for (i = n - 1; i >= 0; i--) { bb = *(b + i); for (j = 0; j < m; j ++) { aa = *(c + j + i * ldc); aa *= bb; *a = aa; *(c + j + i * ldc) = aa; a ++; for (k = 0; k < i; k ++){ *(c + j + k * ldc) -= aa * *(b + k); } } b -= n; a -= 2 * m; } } #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa1, aa2; FLOAT bb1, bb2; FLOAT cc1, cc2; int i, j, k; ldc *= 2; a += (n - 1) * m * 2; b += (n - 1) * n * 2; for (i = n - 1; i >= 0; i--) { bb1 = *(b + i * 2 + 0); bb2 = *(b + i * 2 + 1); for (j = 0; j < m; j ++) { aa1 = *(c + j * 2 + 0 + i * ldc); aa2 = *(c + j * 2 + 1 + i * ldc); #ifndef CONJ cc1 = aa1 * bb1 - aa2 * bb2; cc2 = aa1 * bb2 + aa2 * bb1; #else cc1 = aa1 * bb1 + aa2 * bb2; cc2 = - aa1 * bb2 + aa2 * bb1; #endif *(a + 0) = cc1; *(a + 1) = cc2; *(c + j * 2 + 0 + i * ldc) = cc1; *(c + j * 2 + 1 + i * ldc) = cc2; a += 2; for (k = 0; k < i; k ++){ #ifndef CONJ *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); #else *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); #endif } } b -= n * 2; a -= 4 * m; } } #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #ifdef COMPLEX FLOAT dummy2, #endif FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ BLASLONG i, j; FLOAT *aa, *cc; BLASLONG kk; #if 0 fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", m, n, k, offset); #endif kk = n - offset; c += n * ldc * COMPSIZE; b += n * k * COMPSIZE; if (n & (GEMM_UNROLL_N - 1)) { j = 1; while (j < GEMM_UNROLL_N) { if (n & j) { aa = a; b -= j * k * COMPSIZE; c -= j * ldc* COMPSIZE; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { do { if (k - kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + GEMM_UNROLL_M * kk * COMPSIZE, b + j * kk * COMPSIZE, cc, ldc); } solve(GEMM_UNROLL_M, j, aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, b + (kk - j) * j * COMPSIZE, cc, ldc); aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } while (i > 0); } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); do { if (m & i) { if (k - kk > 0) { GEMM_KERNEL(i, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, b + j * kk * COMPSIZE, cc, ldc); } solve(i, j, aa + (kk - j) * i * COMPSIZE, b + (kk - j) * j * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; } i >>= 1; } while (i > 0); } kk -= j; } j <<= 1; } } j = (n >> GEMM_UNROLL_N_SHIFT); if (j > 0) { do { aa = a; b -= GEMM_UNROLL_N * k * COMPSIZE; c -= GEMM_UNROLL_N * ldc * COMPSIZE; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { do { dtrsm_RT_solve_opt(k - kk, aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc, aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE , b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE ); aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } while (i > 0); } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); do { if (m & i) { if (k - kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } solve(i, GEMM_UNROLL_N, aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, b + (kk - GEMM_UNROLL_N) * 
GEMM_UNROLL_N * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; } i >>= 1; } while (i > 0); } kk -= GEMM_UNROLL_N; j --; } while (j > 0); } return 0; } OpenBLAS-0.2.20/kernel/x86_64/gemm_beta.S000066400000000000000000000127451313527062700174370ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 #define N ARG2 #define C ARG3 #define LDC ARG4 #define C1 ARG5 #define STACK_C 16(%rsp) #define STACK_LDC 24(%rsp) #else #define STACKSIZE 256 #define M ARG1 #define N ARG2 #define C ARG3 #define LDC ARG4 #define C1 %r10 #define STACK_C 72 + STACKSIZE(%rsp) #define STACK_LDC 80 + STACKSIZE(%rsp) #endif #define I %rax PROLOGUE PROFCODE #ifdef WINDOWS_ABI subq $STACKSIZE, %rsp movups %xmm6, 0(%rsp) movups %xmm7, 16(%rsp) movups %xmm8, 32(%rsp) movups %xmm9, 48(%rsp) movups %xmm10, 64(%rsp) movups %xmm11, 80(%rsp) movups %xmm12, 96(%rsp) movups %xmm13, 112(%rsp) movups %xmm14, 128(%rsp) movups %xmm15, 144(%rsp) movaps %xmm3, %xmm0 #endif movq STACK_C, C movq STACK_LDC, LDC pxor %xmm1, %xmm1 test M, M jle .L999 test N, N jle .L999 #ifdef DOUBLE ucomisd %xmm1, %xmm0 #else ucomiss %xmm1, %xmm0 #endif jne .L201 ALIGN_4 .L101: movq C, C1 leaq (C, LDC, SIZE), C movq M, I sarq $3, I jle .L103 ALIGN_4 .L102: #ifdef OPTERON prefetchw 32 * SIZE(C1) #endif MOVSD %xmm0, 0 * SIZE(C1) MOVSD %xmm0, 1 * SIZE(C1) MOVSD %xmm0, 2 * SIZE(C1) MOVSD %xmm0, 3 * SIZE(C1) MOVSD %xmm0, 4 * SIZE(C1) MOVSD %xmm0, 5 * SIZE(C1) MOVSD %xmm0, 6 * SIZE(C1) MOVSD %xmm0, 7 * SIZE(C1) addq $8 * SIZE, C1 decq I jg .L102 ALIGN_4 .L103: movq M, I andq $7, I jle .L105 ALIGN_4 .L104: MOVSD %xmm0, 0 * SIZE(C1) addq $SIZE, C1 decq I jg .L104 ALIGN_4 .L105: decq N jg .L101 jmp .L999 ALIGN_3 .L201: movq C, C1 # c_offset = c leaq (C, LDC, SIZE), C # c += ldc movq M, I sarq $3, I jle .L203 ALIGN_4 .L202: #ifdef OPTERON prefetchw 32 * SIZE(C1) #endif MOVSD 0 * SIZE(C1), %xmm8 MOVSD 1 * SIZE(C1), %xmm9 MOVSD 2 * SIZE(C1), %xmm10 MOVSD 3 * SIZE(C1), %xmm11 MOVSD 4 * SIZE(C1), %xmm12 MOVSD 5 * SIZE(C1), %xmm13 MOVSD 6 * SIZE(C1), %xmm14 MOVSD 7 * SIZE(C1), %xmm15 MULSD %xmm0, %xmm8 MULSD %xmm0, %xmm9 MULSD %xmm0, %xmm10 MULSD %xmm0, %xmm11 MULSD %xmm0, %xmm12 MULSD %xmm0, %xmm13 MULSD %xmm0, %xmm14 MULSD %xmm0, %xmm15 MOVSD %xmm8, 0 * SIZE(C1) MOVSD %xmm9, 1 * SIZE(C1) MOVSD %xmm10, 2 * SIZE(C1) MOVSD %xmm11, 3 * SIZE(C1) MOVSD %xmm12, 4 * SIZE(C1) MOVSD %xmm13, 5 * SIZE(C1) MOVSD %xmm14, 6 * SIZE(C1) MOVSD %xmm15, 7 * SIZE(C1) addq $8 * SIZE, C1 decq I jg .L202 ALIGN_4 .L203: movq M, I andq $7, I jle .L205 ALIGN_4 .L204: MOVSD 0 * SIZE(C1), %xmm8 MULSD %xmm0, %xmm8 MOVSD %xmm8, 0 * SIZE(C1) addq $SIZE, C1 decq I jg .L204 ALIGN_4 .L205: decq N jg .L201 ALIGN_3 .L999: xorq %rax, %rax #ifdef WINDOWS_ABI movups 0(%rsp), %xmm6 movups 16(%rsp), %xmm7 movups 32(%rsp), %xmm8 movups 48(%rsp), %xmm9 movups 64(%rsp), %xmm10 movups 80(%rsp), %xmm11 movups 96(%rsp), %xmm12 movups 112(%rsp), %xmm13 movups 128(%rsp), %xmm14 movups 144(%rsp), %xmm15 addq $STACKSIZE, %rsp #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_kernel_2x8_nehalem.S000066400000000000000000001020751313527062700221720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define BB %r12 #define INC32 %rdx #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define ALPHA 48(%rsp) #define J 56(%rsp) #define OFFSET 64(%rsp) #define KK 72(%rsp) #define KKK 80(%rsp) #else #define STACKSIZE 512 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define ALPHA 224(%rsp) #define J 232(%rsp) #define OFFSET 240(%rsp) #define KK 248(%rsp) #define KKK 256(%rsp) #endif #define PREFETCHSIZE 4 #define PREFETCH prefetcht0 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif movaps %xmm3, %xmm0 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif #endif movlps %xmm0, ALPHA subq $-16 * SIZE, A subq $-16 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K leaq (, LDC, SIZE), LDC #ifdef TRMMKERNEL movq %r11, OFFSET #ifndef LEFT negq %r11 #endif movq %r11, KK #endif movq N, J sarq $3, J NOBRANCH jle .L30 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC, 4), CO2 movq A, AO movq K, %rax salq $BASE_SHIFT + 3, %rax leaq (B, %rax), BB movq M, I sarq $1, I NOBRANCH jle .L20 ALIGN_4 .L11: 
prefetcht2 -16 * SIZE(BB) subq $-8 * SIZE, BB #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 PADDING xorps %xmm4, %xmm4 leaq (LDC, LDC, 2), %rax PADDING xorps %xmm8, %xmm8 prefetcht0 1 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 3 * SIZE(CO1, LDC, 1) PADDING xorps %xmm10, %xmm10 prefetcht0 1 * SIZE(CO1, LDC, 2) PADDING xorps %xmm11, %xmm11 prefetcht0 3 * SIZE(CO1, %rax, 1) movaps -16 * SIZE(AO), %xmm0 PADDING xorps %xmm12, %xmm12 prefetcht0 1 * SIZE(CO2) xorps %xmm13, %xmm13 prefetcht0 3 * SIZE(CO2, LDC, 1) xorps %xmm14, %xmm14 prefetcht0 1 * SIZE(CO2, LDC, 2) xorps %xmm15, %xmm15 prefetcht0 3 * SIZE(CO2, %rax, 1) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $8, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm13 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm14 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 movaps -14 * SIZE(AO), %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm1, %xmm12 movaps -8 * SIZE(BO), %xmm1 addpd %xmm2, %xmm13 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 mulpd %xmm5, %xmm2 addpd %xmm3, %xmm14 movaps -6 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm5, %xmm3 mulpd %xmm5, %xmm4 addpd %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 mulpd %xmm5, %xmm2 addpd %xmm3, %xmm10 movaps -2 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm5, %xmm3 mulpd %xmm5, %xmm4 addpd %xmm1, %xmm12 movaps 0 * SIZE(BO), %xmm1 addpd %xmm2, %xmm13 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm14 movaps 2 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm1, %xmm8 movaps 4 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps 6 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 movaps -10 * SIZE(AO), %xmm5 mulpd %xmm0, %xmm4 addpd %xmm1, %xmm12 movaps 8 * SIZE(BO), %xmm1 addpd %xmm2, %xmm13 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 mulpd %xmm5, %xmm2 addpd %xmm3, %xmm14 movaps 10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm5, %xmm3 PADDING; mulpd %xmm5, %xmm4 addpd %xmm1, %xmm8 movaps 12 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 PADDING; mulpd %xmm5, %xmm2 addpd %xmm3, %xmm10 movaps 14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm5, %xmm3 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm5, %xmm4 subq $-32 * SIZE, BO subq $-8 * SIZE, AO subq $1, %rax BRANCH 
jg .L12 ALIGN_3 .L15: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addpd %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm13 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm14 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_4 .L18: addpd %xmm1, %xmm12 movaps %xmm8, %xmm0 shufpd $2, %xmm9, %xmm8 mulpd %xmm7, %xmm8 shufpd $2, %xmm0, %xmm9 mulpd %xmm7, %xmm9 addpd %xmm2, %xmm13 movaps %xmm10, %xmm0 shufpd $2, %xmm11, %xmm10 mulpd %xmm7, %xmm10 shufpd $2, %xmm0, %xmm11 mulpd %xmm7, %xmm11 addpd %xmm3, %xmm14 movaps %xmm12, %xmm0 shufpd $2, %xmm13, %xmm12 mulpd %xmm7, %xmm12 shufpd $2, %xmm0, %xmm13 mulpd %xmm7, %xmm13 addpd %xmm4, %xmm15 movaps %xmm14, %xmm0 shufpd $2, %xmm15, %xmm14 mulpd %xmm7, %xmm14 shufpd $2, %xmm0, %xmm15 mulpd %xmm7, %xmm15 movq CO1, %rax orq LDC, %rax testq $15, %rax NOBRANCH jne .L18x leaq (LDC, LDC, 2), %rax #ifndef TRMMKERNEL movups (CO1), %xmm0 movups (CO1, LDC, 1), %xmm1 movups (CO1, LDC, 2), %xmm2 movups (CO1, %rax, 1), %xmm3 movups (CO2), %xmm4 movups (CO2, LDC, 1), %xmm5 movups (CO2, LDC, 2), %xmm6 movups (CO2, %rax, 1), %xmm7 addpd %xmm0, %xmm8 addpd %xmm1, %xmm9 addpd %xmm2, %xmm10 addpd %xmm3, %xmm11 addpd %xmm4, %xmm12 addpd %xmm5, %xmm13 addpd %xmm6, %xmm14 addpd %xmm7, %xmm15 #endif movaps %xmm8, (CO1) movaps %xmm9, (CO1, LDC, 1) movaps %xmm10, (CO1, LDC, 2) movaps %xmm11, (CO1, %rax, 1) movaps %xmm12, (CO2) movaps %xmm13, (CO2, LDC, 1) movaps %xmm14, (CO2, LDC, 2) movaps %xmm15, (CO2, %rax, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 decq I BRANCH jg .L11 jmp .L20 ALIGN_4 .L18x: leaq (LDC, LDC, 2), %rax #ifndef TRMMKERNEL movups (CO1), %xmm0 movups (CO1, LDC, 1), %xmm1 movups (CO1, LDC, 2), %xmm2 movups (CO1, %rax, 1), %xmm3 movups (CO2), %xmm4 movups (CO2, LDC, 1), %xmm5 movups (CO2, LDC, 2), %xmm6 movups (CO2, %rax, 1), %xmm7 addpd %xmm0, %xmm8 addpd %xmm1, %xmm9 addpd %xmm2, %xmm10 addpd %xmm3, %xmm11 addpd %xmm4, %xmm12 addpd %xmm5, %xmm13 addpd %xmm6, %xmm14 addpd %xmm7, %xmm15 #endif movups %xmm8, (CO1) movups %xmm9, (CO1, LDC, 1) movups %xmm10, (CO1, LDC, 2) movups %xmm11, (CO1, %rax, 1) movups %xmm12, (CO2) movups %xmm13, (CO2, LDC, 1) movups %xmm14, (CO2, LDC, 2) movups %xmm15, (CO2, %rax, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 decq I BRANCH jg .L11 ALIGN_4 .L20: testq $1, M BRANCH jle .L29 ALIGN_4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -16 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $8, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_3 .L22: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps -8 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -6 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps -4 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -2 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps 0 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps 2 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps 4 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps 6 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -13 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps 8 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps 10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps 12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps 14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps 16 * SIZE(BO), %xmm1 subq $ -4 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax BRANCH jg .L22 ALIGN_3 .L25: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps -8 * SIZE(BO), %xmm1 addq $1 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_4 .L28: mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm7, %xmm10 mulpd %xmm7, %xmm11 leaq (LDC, LDC, 2), %rax #ifndef TRMMKERNEL movsd (CO1), %xmm0 movhps (CO1, LDC, 1), %xmm0 movsd (CO1, LDC, 2), %xmm1 movhps (CO1, %rax, 1), %xmm1 movsd (CO2), %xmm2 movhps (CO2, LDC, 1), %xmm2 movsd (CO2, LDC, 2), %xmm3 movhps (CO2, %rax, 1), %xmm3 addpd %xmm0, %xmm8 addpd %xmm1, %xmm9 addpd %xmm2, %xmm10 addpd %xmm3, %xmm11 #endif movsd %xmm8, (CO1) movhps %xmm8, (CO1, LDC, 1) movsd %xmm9, (CO1, LDC, 2) movhps %xmm9, (CO1, %rax, 1) movsd %xmm10, (CO2) movhps %xmm10, (CO2, LDC, 1) movsd %xmm11, (CO2, LDC, 2) movhps %xmm11, (CO2, %rax, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addq $8, KK #endif movq BO, B leaq (C, LDC, 8), C subq $1, J BRANCH jg .L01 ALIGN_4 .L30: testq $4, N jle .L50 ALIGN_4 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq 
%rax, KK #endif movq C, CO1 leaq (C, LDC, 2), CO2 movq A, AO movq M, I sarq $1, I NOBRANCH jle .L40 ALIGN_4 .L31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht0 2 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 2 * SIZE(CO1, LDC, 1) xorps %xmm10, %xmm10 prefetcht0 2 * SIZE(CO2) xorps %xmm11, %xmm11 prefetcht0 2 * SIZE(CO2, LDC, 1) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_3 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -8 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -6 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -10 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -2 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L32 ALIGN_3 .L35: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: addpd %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_4 .L38: addpd %xmm1, %xmm8 addpd %xmm2, %xmm9 addpd %xmm3, %xmm10 addpd %xmm4, %xmm11 movaps %xmm8, %xmm0 shufpd $2, %xmm9, %xmm8 mulpd %xmm7, %xmm8 shufpd $2, %xmm0, %xmm9 mulpd %xmm7, %xmm9 movaps %xmm10, %xmm0 shufpd $2, %xmm11, %xmm10 mulpd %xmm7, %xmm10 shufpd $2, %xmm0, %xmm11 mulpd %xmm7, %xmm11 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 1 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO1, LDC, 1), %xmm1 movhps 1 * SIZE(CO1, LDC, 1), %xmm1 movsd 0 * SIZE(CO2), %xmm2 movhps 1 * SIZE(CO2), %xmm2 movsd 0 * SIZE(CO2, LDC, 1), %xmm3 movhps 1 * SIZE(CO2, LDC, 1), %xmm3 addpd %xmm0, %xmm8 addpd %xmm1, %xmm9 addpd %xmm2, %xmm10 addpd %xmm3, %xmm11 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 1 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO1, LDC, 1) movhps %xmm9, 1 
* SIZE(CO1, LDC, 1) movsd %xmm10, 0 * SIZE(CO2) movhps %xmm10, 1 * SIZE(CO2) movsd %xmm11, 0 * SIZE(CO2, LDC, 1) movhps %xmm11, 1 * SIZE(CO2, LDC, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 decq I BRANCH jg .L31 ALIGN_4 .L40: testq $1, M BRANCH jle .L49 ALIGN_4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -16 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L45 ALIGN_3 .L42: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps -8 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -6 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -13 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -4 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -2 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps 0 * SIZE(BO), %xmm1 subq $ -4 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L42 ALIGN_3 .L45: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L46 ALIGN_4 .L48: addpd %xmm10, %xmm8 mulpd %xmm7, %xmm8 addpd %xmm11, %xmm9 mulpd %xmm7, %xmm9 #ifndef TRMMKERNEL movsd (CO1), %xmm0 movhps (CO1, LDC, 1), %xmm0 movsd (CO2), %xmm1 movhps (CO2, LDC, 1), %xmm1 addpd %xmm0, %xmm8 addpd %xmm1, %xmm9 #endif movsd %xmm8, (CO1) movhps %xmm8, (CO1, LDC, 1) movsd %xmm9, (CO2) movhps %xmm9, (CO2, LDC, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK #endif movq BO, B leaq (C, LDC, 4), C ALIGN_4 .L50: testq $2, N jle .L70 ALIGN_4 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC), CO2 movq A, AO movq M, I sarq $1, I NOBRANCH jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax 
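/* Editor's note, not in the upstream file: in this TRMMKERNEL branch %rax now
   holds KK * sizeof(FLOAT); the two leaq instructions that follow advance AO by
   2*KK elements and BO by 2*KK elements, i.e. they start both packed panels KK
   k-iterations in. As far as the surrounding #if logic shows, this is the usual
   TRMM offset handling that skips the leading k-iterations the triangular
   factor does not contribute to; plain GEMM builds take the simple
   "movq B, BO" branch above instead. */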
leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm8, %xmm8 prefetcht0 2 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 2 * SIZE(CO2) xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_3 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm10 movaps -14 * SIZE(BO), %xmm1 addpd %xmm2, %xmm11 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AO), %xmm0 addpd %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 addpd %xmm2, %xmm11 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L52 addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 ALIGN_3 .L55: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_3 .L56: addpd %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_4 .L58: addpd %xmm1, %xmm8 addpd %xmm2, %xmm9 movaps %xmm8, %xmm0 shufpd $2, %xmm9, %xmm8 mulpd %xmm7, %xmm8 shufpd $2, %xmm0, %xmm9 mulpd %xmm7, %xmm9 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 1 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm1 movhps 1 * SIZE(CO2), %xmm1 addpd %xmm0, %xmm8 addpd %xmm1, %xmm9 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 1 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhps %xmm9, 1 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 decq I BRANCH jg .L51 ALIGN_4 .L60: testq $1, M BRANCH jle .L69 ALIGN_4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -16 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_3 .L62: mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -13 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup 
-12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -8 * SIZE(BO), %xmm1 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L62 ALIGN_3 .L65: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_3 .L66: mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_4 .L68: addpd %xmm9, %xmm8 mulpd %xmm7, %xmm8 #ifndef TRMMKERNEL movsd (CO1), %xmm0 movhps (CO2), %xmm0 addpd %xmm0, %xmm8 #endif movsd %xmm8, (CO1) movhps %xmm8, (CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L69: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif movq BO, B leaq (C, LDC, 2), C ALIGN_4 .L70: testq $1, N jle .L999 ALIGN_4 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 movq A, AO movq M, I sarq $1, I NOBRANCH jle .L80 ALIGN_4 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm8, %xmm8 prefetcht0 2 * SIZE(CO1) xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L75 ALIGN_3 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 movddup -16 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movddup -15 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -10 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movddup -13 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L72 addpd %xmm9, %xmm8 ALIGN_3 .L75: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: addpd %xmm1, %xmm8 movddup -16 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L76 ALIGN_4 .L78: addpd %xmm1, %xmm8 mulpd %xmm7, %xmm8 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 1 * SIZE(CO1), %xmm0 addpd %xmm0, %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 1 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 decq I BRANCH jg .L71 ALIGN_4 .L80: testq $1, M BRANCH jle .L999 ALIGN_4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO 
#else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifndef TRMMKERNEL movaps -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -16 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 #else movsd -16 * SIZE(AO), %xmm0 movhps -15 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movsd -16 * SIZE(BO), %xmm1 movhps -15 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L85 ALIGN_3 .L82: mulpd %xmm0, %xmm1 #ifndef TRMMKERNEL movaps -14 * SIZE(AO), %xmm0 #else movsd -14 * SIZE(AO), %xmm0 movhps -13 * SIZE(AO), %xmm0 #endif addpd %xmm1, %xmm8 #ifndef TRMMKERNEL movaps -14 * SIZE(BO), %xmm1 #else movsd -14 * SIZE(BO), %xmm1 movhps -13 * SIZE(BO), %xmm1 #endif mulpd %xmm0, %xmm1 #ifndef TRMMKERNEL movaps -12 * SIZE(AO), %xmm0 #else movsd -12 * SIZE(AO), %xmm0 movhps -11 * SIZE(AO), %xmm0 #endif addpd %xmm1, %xmm9 #ifndef TRMMKERNEL movaps -12 * SIZE(BO), %xmm1 #else movsd -12 * SIZE(BO), %xmm1 movhps -11 * SIZE(BO), %xmm1 #endif subq $-4 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L82 addpd %xmm9, %xmm8 ALIGN_3 .L85: movsd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L88 ALIGN_3 .L86: mulsd %xmm0, %xmm1 movsd -15 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd -15 * SIZE(BO), %xmm1 addq $1 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L86 ALIGN_4 .L88: haddpd %xmm8, %xmm8 mulsd %xmm7, %xmm8 #ifndef TRMMKERNEL movsd (CO1), %xmm0 addsd %xmm0, %xmm8 #endif movsd %xmm8, (CO1) ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_kernel_4x2_atom.S000066400000000000000000000622171313527062700215200ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %r13 #define BO %r14 #define CO1 %r15 #define CO2 %rbx #define BB %rbp #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define ALPHA 48(%rsp) #define OFFSET 56(%rsp) #define KKK 64(%rsp) #define KK 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define ALPHA 224(%rsp) #define OFFSET 232(%rsp) #define KK 240(%rsp) #define KKK 248(%rsp) #endif #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 8 + 3) PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm4 #endif movaps %xmm3, %xmm0 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm4 #endif #endif movsd %xmm0, ALPHA #ifdef TRMMKERNEL movsd %xmm4, OFFSET movsd %xmm4, KK #ifndef LEFT negq KK #endif #endif leaq (, LDC, SIZE), LDC movq N, J sarq $1, J jle .L40 ALIGN_4 .L10: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC, 1), CO2 leaq (C, LDC, 2), C movq A, AO movq K, %rax salq $BASE_SHIFT + 1, %rax leaq (B, %rax), BB movq M, I sarq $2, I jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd 1 * SIZE(AO), %xmm4 xorps %xmm5, %xmm5 movsd 2 * SIZE(AO), %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movsd 1 * SIZE(BO), %xmm3 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 prefetcht0 3 * SIZE(CO1) xorps %xmm12, %xmm12 xorps %xmm13, %xmm13 prefetcht0 3 * 
SIZE(CO2) xorps %xmm14, %xmm14 xorps %xmm15, %xmm15 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 2 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addsd %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 7 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 8 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm4, %xmm10 movsd 9 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 4 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 10 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 5 * SIZE(BO), %xmm3 addsd %xmm2, %xmm13 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 11 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 12 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm4, %xmm10 movsd 13 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 6 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 14 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 7 * SIZE(BO), %xmm3 addsd %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 15 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 subq $-16 * SIZE, AO addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 0 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addq $ 8 * SIZE, BO addsd %xmm4, %xmm10 movsd 1 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 decq %rax addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 0 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 2 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 1 * SIZE(BO), %xmm3 jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH BRANCH je .L19 ALIGN_4 .L16: addsd %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 2 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addq $4 * SIZE, AO addq $2 * SIZE, BO decq %rax BRANCH jg .L16 ALIGN_4 .L19: movsd ALPHA, %xmm5 addsd %xmm2, %xmm13 mulsd %xmm5, %xmm8 addsd %xmm7, %xmm14 mulsd %xmm5, %xmm10 addsd %xmm6, %xmm15 mulsd %xmm5, %xmm12 
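/* Editor's note, not in the upstream file: the rest of .L19 finishes scaling
   the eight scalar accumulators of this 4x2 tile by alpha (xmm5): xmm8/10/12/14
   hold the CO1 column and xmm9/11/13/15 the CO2 column. Unless TRMMKERNEL or
   BETAZERO is defined, the existing C entries are then added back in before the
   eight movsd stores write the updated tile to CO1 and CO2. */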
mulsd %xmm5, %xmm14 mulsd %xmm5, %xmm9 mulsd %xmm5, %xmm11 mulsd %xmm5, %xmm13 mulsd %xmm5, %xmm15 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addsd 0 * SIZE(CO1), %xmm8 addsd 1 * SIZE(CO1), %xmm10 addsd 2 * SIZE(CO1), %xmm12 addsd 3 * SIZE(CO1), %xmm14 addsd 0 * SIZE(CO2), %xmm9 addsd 1 * SIZE(CO2), %xmm11 addsd 2 * SIZE(CO2), %xmm13 addsd 3 * SIZE(CO2), %xmm15 #endif movsd %xmm8, 0 * SIZE(CO1) movsd %xmm10, 1 * SIZE(CO1) movsd %xmm12, 2 * SIZE(CO1) movsd %xmm14, 3 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movsd %xmm11, 1 * SIZE(CO2) movsd %xmm13, 2 * SIZE(CO2) movsd %xmm15, 3 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 decq I # i -- jg .L11 ALIGN_4 .L20: testq $2, M jle .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd 1 * SIZE(AO), %xmm4 xorps %xmm5, %xmm5 movsd 2 * SIZE(AO), %xmm5 xorps %xmm6, %xmm6 movsd 3 * SIZE(AO), %xmm7 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movsd 1 * SIZE(BO), %xmm3 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 2 * SIZE(BO), %xmm1 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm2 addsd %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 4 * SIZE(BO), %xmm1 addsd %xmm5, %xmm8 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm2 addsd %xmm7, %xmm10 movsd 7 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm6 movsd 5 * SIZE(BO), %xmm3 addsd %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 6 * SIZE(BO), %xmm1 addsd %xmm0, %xmm8 movsd 8 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm2 addsd %xmm4, %xmm10 movsd 9 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm6 movsd 7 * SIZE(BO), %xmm3 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 8 * SIZE(BO), %xmm1 addsd %xmm5, %xmm8 movsd 10 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm2 addsd %xmm7, %xmm10 movsd 11 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm6 movsd 9 * SIZE(BO), %xmm3 addq $8 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movsd ALPHA, %xmm7 andq $3, %rax BRANCH BRANCH je .L29 ALIGN_4 .L26: addsd %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 2 * SIZE(BO), %xmm1 mulsd %xmm3, %xmm2 addsd %xmm0, %xmm8 movsd 2 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addsd %xmm4, %xmm10 movsd 3 * SIZE(AO), 
%xmm4 addq $2 * SIZE, AO addq $2 * SIZE, BO decq %rax BRANCH jg .L26 ALIGN_4 .L29: addsd %xmm2, %xmm9 mulsd %xmm7, %xmm8 addsd %xmm6, %xmm11 mulsd %xmm7, %xmm10 mulsd %xmm7, %xmm9 mulsd %xmm7, %xmm11 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addsd 0 * SIZE(CO1), %xmm8 addsd 1 * SIZE(CO1), %xmm10 addsd 0 * SIZE(CO2), %xmm9 addsd 1 * SIZE(CO2), %xmm11 #endif movsd %xmm8, 0 * SIZE(CO1) movsd %xmm10, 1 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movsd %xmm11, 1 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 ALIGN_4 .L30: testq $1, M je .L39 ALIGN_4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm7, %xmm7 movsd 1 * SIZE(AO), %xmm2 xorps %xmm5, %xmm5 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movsd 1 * SIZE(BO), %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L35 ALIGN_4 .L32: addsd %xmm5, %xmm8 movsd 2 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm7, %xmm9 movsd 3 * SIZE(BO), %xmm7 mulsd %xmm0, %xmm3 movsd 2 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd 4 * SIZE(BO), %xmm1 mulsd %xmm2, %xmm5 addsd %xmm3, %xmm9 movsd 5 * SIZE(BO), %xmm3 mulsd %xmm2, %xmm7 movsd 3 * SIZE(AO), %xmm2 addsd %xmm5, %xmm8 movsd 6 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm1 addsd %xmm7, %xmm9 movsd 7 * SIZE(BO), %xmm7 mulsd %xmm0, %xmm3 movsd 4 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd 8 * SIZE(BO), %xmm1 mulsd %xmm2, %xmm5 addsd %xmm3, %xmm9 movsd 9 * SIZE(BO), %xmm3 mulsd %xmm2, %xmm7 movsd 5 * SIZE(AO), %xmm2 addq $4 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif addsd %xmm5, %xmm8 addsd %xmm7, %xmm9 movsd ALPHA, %xmm7 andq $3, %rax BRANCH BRANCH je .L38 ALIGN_4 .L36: mulsd %xmm0, %xmm1 addq $2 * SIZE, BO mulsd %xmm0, %xmm3 movsd 1 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd 0 * SIZE(BO), %xmm1 addsd %xmm3, %xmm9 movsd 1 * SIZE(BO), %xmm3 addq $1 * SIZE, AO decq %rax BRANCH jg .L36 ALIGN_4 .L38: mulsd %xmm7, %xmm8 mulsd %xmm7, %xmm9 #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) addsd 0 * SIZE(CO1), %xmm8 addsd 0 * SIZE(CO2), %xmm9 #endif movsd %xmm8, 0 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif movq BO, B decq J # j -- jg .L10 ALIGN_4 .L40: testq $1, N je .L999 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 addq LDC, C movq A, AO movq M, I sarq $2, I jle .L50 ALIGN_4 .L41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm9, %xmm9 movsd 1 * SIZE(AO), %xmm1 xorps %xmm11, %xmm11 movsd 2 * SIZE(AO), %xmm2 xorps %xmm13, %xmm13 movsd 3 * SIZE(AO), %xmm3 xorps %xmm15, %xmm15 movsd 0 * SIZE(BO), %xmm4 xorps %xmm8, %xmm8 movsd 1 * SIZE(BO), %xmm5 xorps %xmm10, %xmm10 prefetcht0 3 * SIZE(CO1) xorps %xmm12, %xmm12 xorps %xmm14, %xmm14 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L45 ALIGN_4 .L42: addsd %xmm9, %xmm8 movsd 4 * SIZE(AO), %xmm9 mulsd %xmm4, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm11, %xmm10 movsd 5 * SIZE(AO), %xmm11 mulsd %xmm4, %xmm1 addsd %xmm13, %xmm12 movsd 6 * SIZE(AO), %xmm13 mulsd %xmm4, %xmm2 addsd %xmm15, %xmm14 movsd 7 * SIZE(AO), %xmm15 mulsd %xmm4, %xmm3 movsd 2 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 8 * SIZE(AO), %xmm0 mulsd %xmm5, %xmm9 addsd %xmm1, %xmm10 movsd 9 * SIZE(AO), %xmm1 mulsd %xmm5, %xmm11 addsd %xmm2, %xmm12 movsd 10 * SIZE(AO), %xmm2 mulsd %xmm5, %xmm13 addsd %xmm3, %xmm14 movsd 11 * SIZE(AO), %xmm3 mulsd %xmm5, %xmm15 movsd 3 * SIZE(BO), %xmm5 addsd %xmm9, %xmm8 movsd 12 * SIZE(AO), %xmm9 mulsd %xmm4, %xmm0 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) addsd %xmm11, %xmm10 movsd 13 * SIZE(AO), %xmm11 mulsd %xmm4, %xmm1 addsd %xmm13, %xmm12 movsd 14 * SIZE(AO), %xmm13 mulsd %xmm4, %xmm2 addsd %xmm15, %xmm14 movsd 15 * SIZE(AO), %xmm15 mulsd %xmm4, %xmm3 movsd 4 * SIZE(BO), %xmm4 subq $-16 * SIZE, AO addsd %xmm0, %xmm8 movsd 0 * SIZE(AO), %xmm0 mulsd %xmm5, %xmm9 addsd %xmm1, %xmm10 movsd 1 * SIZE(AO), %xmm1 mulsd %xmm5, %xmm11 addq $ 4 * SIZE, BO addsd %xmm2, %xmm12 movsd 2 * SIZE(AO), %xmm2 mulsd %xmm5, %xmm13 decq %rax addsd %xmm3, %xmm14 movsd 3 * SIZE(AO), %xmm3 mulsd %xmm5, %xmm15 movsd 1 * SIZE(BO), %xmm5 jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movsd ALPHA, %xmm7 addsd %xmm9, %xmm8 addsd %xmm11, %xmm10 addsd %xmm13, %xmm12 addsd %xmm15, %xmm14 andq $3, %rax BRANCH BRANCH je .L49 ALIGN_4 .L46: mulsd %xmm4, %xmm0 mulsd %xmm4, %xmm1 mulsd %xmm4, %xmm2 mulsd %xmm4, %xmm3 movsd 1 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 addsd %xmm1, %xmm10 movsd 5 * SIZE(AO), %xmm1 addsd %xmm2, %xmm12 movsd 6 * SIZE(AO), %xmm2 addsd %xmm3, %xmm14 movsd 7 * SIZE(AO), %xmm3 addq $4 * SIZE, AO addq $1 * SIZE, BO decq %rax BRANCH jg .L46 ALIGN_4 .L49: mulsd %xmm7, %xmm8 
mulsd %xmm7, %xmm10 mulsd %xmm7, %xmm12 mulsd %xmm7, %xmm14 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addsd 0 * SIZE(CO1), %xmm8 addsd 1 * SIZE(CO1), %xmm10 addsd 2 * SIZE(CO1), %xmm12 addsd 3 * SIZE(CO1), %xmm14 #endif movsd %xmm8, 0 * SIZE(CO1) movsd %xmm10, 1 * SIZE(CO1) movsd %xmm12, 2 * SIZE(CO1) movsd %xmm14, 3 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 decq I # i -- jg .L41 ALIGN_4 .L50: testq $2, M jle .L60 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd 1 * SIZE(AO), %xmm1 xorps %xmm3, %xmm3 movsd 0 * SIZE(BO), %xmm4 xorps %xmm8, %xmm8 movsd 1 * SIZE(BO), %xmm5 xorps %xmm10, %xmm10 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L55 ALIGN_4 .L52: addsd %xmm2, %xmm8 movsd 2 * SIZE(AO), %xmm2 mulsd %xmm4, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm3, %xmm10 movsd 3 * SIZE(AO), %xmm3 mulsd %xmm4, %xmm1 movsd 2 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm5, %xmm2 addq $8 * SIZE, AO addsd %xmm1, %xmm10 movsd -3 * SIZE(AO), %xmm1 mulsd %xmm5, %xmm3 movsd 3 * SIZE(BO), %xmm5 addsd %xmm2, %xmm8 movsd -2 * SIZE(AO), %xmm2 mulsd %xmm4, %xmm0 addq $4 * SIZE, BO addsd %xmm3, %xmm10 movsd -1 * SIZE(AO), %xmm3 mulsd %xmm4, %xmm1 movsd 0 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 0 * SIZE(AO), %xmm0 mulsd %xmm5, %xmm2 decq %rax addsd %xmm1, %xmm10 movsd 1 * SIZE(AO), %xmm1 mulsd %xmm5, %xmm3 movsd 1 * SIZE(BO), %xmm5 jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movsd ALPHA, %xmm7 addsd %xmm2, %xmm8 addsd %xmm3, %xmm10 andq $3, %rax BRANCH BRANCH je .L59 ALIGN_4 .L56: mulsd %xmm4, %xmm0 mulsd %xmm4, %xmm1 movsd 1 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 2 * SIZE(AO), %xmm0 addsd %xmm1, %xmm10 movsd 3 * SIZE(AO), %xmm1 addq $2 * SIZE, AO addq $1 * SIZE, BO decq %rax BRANCH jg .L56 ALIGN_4 .L59: mulsd %xmm7, %xmm8 mulsd %xmm7, %xmm10 #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) addsd 0 * SIZE(CO1), %xmm8 addsd 1 * SIZE(CO1), %xmm10 #endif movsd %xmm8, 0 * SIZE(CO1) movsd %xmm10, 1 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 ALIGN_4 .L60: testq $1, M je .L999 ALIGN_4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm5, %xmm5 movsd 1 * SIZE(AO), %xmm2 xorps %xmm7, %xmm7 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 movsd 1 * SIZE(BO), %xmm3 xorps %xmm9, %xmm9 movsd 2 * SIZE(AO), %xmm4 movsd 3 * SIZE(AO), %xmm6 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L65 ALIGN_4 .L62: addsd %xmm5, %xmm8 movsd 2 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm1 movsd 4 * SIZE(AO), %xmm0 addsd %xmm7, %xmm9 movsd 3 * SIZE(BO), %xmm7 mulsd %xmm2, %xmm3 movsd 5 * SIZE(AO), %xmm2 addsd %xmm1, %xmm8 movsd 4 * SIZE(BO), %xmm1 mulsd %xmm4, %xmm5 movsd 6 * SIZE(AO), %xmm4 addsd %xmm3, %xmm9 movsd 5 * SIZE(BO), %xmm3 mulsd %xmm6, %xmm7 movsd 7 * SIZE(AO), %xmm6 addq $4 * SIZE, AO addq $4 * SIZE, BO decq %rax jne .L62 addsd %xmm5, %xmm8 addsd %xmm7, %xmm9 ALIGN_4 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movsd ALPHA, %xmm7 andq $3, %rax BRANCH BRANCH je .L68 ALIGN_4 .L66: movsd 0 * SIZE(AO), %xmm0 movsd 0 * SIZE(BO), %xmm1 mulsd %xmm0, %xmm1 addsd %xmm1, %xmm8 addq $1 * SIZE, AO addq $1 * SIZE, BO decq %rax BRANCH jg .L66 ALIGN_4 .L68: addsd %xmm9, %xmm8 mulsd %xmm7, %xmm8 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addsd 0 * SIZE(CO1), %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_kernel_4x4_barcelona.S000066400000000000000000001244411313527062700225060ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %r12 #define BB %rbp #define J %rbx #ifndef WINDOWS_ABI #define STACKSIZE 96 #define ALPHA 48(%rsp) #define OFFSET 56(%rsp) #define KK 64(%rsp) #define KKK 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define ALPHA 224(%rsp) #define OFFSET 232(%rsp) #define KK 240(%rsp) #define KKK 248(%rsp) #endif #define movapd movaps #define movupd movups #define KERNEL1(xx) \ mulpd %xmm1, %xmm0 ;\ addpd %xmm0, %xmm8 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ movapd %xmm2, %xmm0 ;\ addpd %xmm1, %xmm12 ;\ movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm10 ;\ movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ addpd %xmm1, %xmm14 ;\ movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 #define KERNEL2(xx) \ mulpd %xmm1, %xmm0 ;\ addpd %xmm0, %xmm8 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ movapd %xmm2, %xmm0 ;\ addpd %xmm1, %xmm12 ;\ /*A*/ movapd (AO, %rax, 4), %xmm6 ;\ movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm10 ;\ addpd %xmm1, %xmm14 ;\ /**/ movddup (BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL3(xx) \ mulpd %xmm5, %xmm4 ;\ addpd %xmm4, %xmm8 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ movapd %xmm2, %xmm4 ;\ addpd %xmm5, %xmm12 
;\ movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm10 ;\ movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ addpd %xmm5, %xmm14 ;\ movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL4(xx) \ mulpd %xmm5, %xmm4 ;\ addpd %xmm4, %xmm8 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ movapd %xmm2, %xmm4 ;\ addpd %xmm5, %xmm12 ;\ /*A*/ movapd 8 * SIZE(AO, %rax, 4), %xmm7 ;\ movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm10 ;\ addpd %xmm5, %xmm14 ;\ /**/ movddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm6, %xmm2 #define KERNEL5(xx) \ mulpd %xmm1, %xmm6 ;\ addpd %xmm6, %xmm8 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ movapd %xmm2, %xmm6 ;\ addpd %xmm1, %xmm12 ;\ movddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm6, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm6 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm6, %xmm10 ;\ movapd 4 * SIZE(AO, %rax, 4), %xmm6 ;\ addpd %xmm1, %xmm14 ;\ movddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm6, %xmm2 #define KERNEL6(xx) \ mulpd %xmm1, %xmm6 ;\ addpd %xmm6, %xmm8 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ movapd %xmm2, %xmm6 ;\ addpd %xmm1, %xmm12 ;\ /*A*/ movapd 16 * SIZE(AO, %rax, 4), %xmm0 ;\ movddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm6, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm6 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm6, %xmm10 ;\ addpd %xmm1, %xmm14 ;\ /**/ movddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm7, %xmm2 #define KERNEL7(xx) \ mulpd %xmm5, %xmm7 ;\ addpd %xmm7, %xmm8 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ movapd %xmm2, %xmm7 ;\ addpd %xmm5, %xmm12 ;\ movddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm7, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm7 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm7, %xmm10 ;\ movapd 12 * SIZE(AO, %rax, 4), %xmm7 ;\ addpd %xmm5, %xmm14 ;\ movddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm7, %xmm2 #define KERNEL8(xx) \ mulpd %xmm5, %xmm7 ;\ addpd %xmm7, %xmm8 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ 
movapd %xmm2, %xmm7 ;\ addpd %xmm5, %xmm12 ;\ /*A*/ movapd 24 * SIZE(AO, %rax, 4), %xmm4 ;\ movddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm7, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm7 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm7, %xmm10 ;\ addpd %xmm5, %xmm14 ;\ /**/ movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 ;\ addq $8 * SIZE, %rax ;\ #define KERNEL_SUB1(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO), %xmm1 ;\ addpd %xmm0, %xmm8 ;\ movapd %xmm2, %xmm0 ;\ addpd %xmm1, %xmm12 ;\ movddup -14 * SIZE(BO), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -13 * SIZE(BO), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO), %xmm1 ;\ addpd %xmm0, %xmm10 ;\ movapd -12 * SIZE(AO), %xmm0 ;\ addpd %xmm1, %xmm14 ;\ movddup -12 * SIZE(BO), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -11 * SIZE(BO), %xmm3 ;\ movapd %xmm0, %xmm2 #define KERNEL_SUB2(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO), %xmm1 ;\ addpd %xmm0, %xmm8 ;\ movapd %xmm2, %xmm0 ;\ addpd %xmm1, %xmm12 ;\ movddup -10 * SIZE(BO), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -9 * SIZE(BO), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO), %xmm1 ;\ addpd %xmm0, %xmm10 ;\ movapd (AO), %xmm0 ;\ addpd %xmm1, %xmm14 ;\ movddup (BO), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -7 * SIZE(BO), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL_SUB3(xx) \ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO), %xmm5 ;\ addpd %xmm4, %xmm8 ;\ movapd %xmm2, %xmm4 ;\ addpd %xmm5, %xmm12 ;\ movddup -6 * SIZE(BO), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -5 * SIZE(BO), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO), %xmm5 ;\ addpd %xmm4, %xmm10 ;\ movapd -4 * SIZE(AO), %xmm4 ;\ addpd %xmm5, %xmm14 ;\ movddup -4 * SIZE(BO), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -3 * SIZE(BO), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL_SUB4(xx) \ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO), %xmm5 ;\ addpd %xmm4, %xmm8 ;\ movapd %xmm2, %xmm4 ;\ addpd %xmm5, %xmm12 ;\ movddup -2 * SIZE(BO), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -1 * SIZE(BO), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO), %xmm5 ;\ addpd %xmm4, %xmm10 ;\ addpd %xmm5, %xmm14 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 1 * SIZE(BO), %xmm3 ;\ movapd %xmm0, %xmm2 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 
176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif movaps %xmm3, %xmm0 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL movsd STACKSIZE + 16(%rsp), %xmm12 #endif #endif movq OLD_M, M movq OLD_N, N subq $-16 * SIZE, A subq $-16 * SIZE, B movsd %xmm0, ALPHA salq $BASE_SHIFT, LDC #ifdef TRMMKERNEL movsd %xmm12, OFFSET movsd %xmm12, KK #ifndef LEFT negq KK #endif #endif movq N, J sarq $2, J # j = (n >> 2) jle .L40 ALIGN_4 .L01: movq C, CO1 # coffset1 = c leaq (C, LDC, 2), CO2 # coffset2 = c + ldc leaq (C, LDC, 4), C # c += 4 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO # aoffset = a movq K, %rax salq $BASE_SHIFT + 2, %rax leaq (B, %rax), BB movq M, I sarq $2, I # i = (m >> 2) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif movapd -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movddup -16 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 movddup -15 * SIZE(BO), %xmm3 xorps %xmm10, %xmm10 movapd -8 * SIZE(AO), %xmm4 xorps %xmm11, %xmm11 movddup -8 * SIZE(BO), %xmm5 xorps %xmm12, %xmm12 prefetchw 3 * SIZE(CO1) xorps %xmm13, %xmm13 prefetchw 7 * SIZE(CO1, LDC) xorps %xmm14, %xmm14 prefetchw 3 * SIZE(CO2) xorps %xmm15, %xmm15 prefetchw 7 * SIZE(CO2, LDC) movapd %xmm0, %xmm2 prefetch -16 * SIZE(BB) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif andq $-8, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO negq %rax NOBRANCH je .L15 ALIGN_4 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) jl .L12 ALIGN_4 .L15: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif testq $4, %rax je .L16 ALIGN_4 KERNEL_SUB1(16 * 0) KERNEL_SUB2(16 * 0) KERNEL_SUB3(16 * 0) KERNEL_SUB4(16 * 0) subq $-16 * SIZE, BO subq $-16 * SIZE, AO ALIGN_4 .L16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L19 leaq (, %rax, SIZE), 
%rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO negq %rax ALIGN_4 .L17: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd %xmm2, %xmm0 addpd %xmm1, %xmm12 movddup -14 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm3, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm2, %xmm9 movapd %xmm0, %xmm2 addpd %xmm3, %xmm13 movddup -13 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm10 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm14 movddup -12 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm3, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm2, %xmm11 addpd %xmm3, %xmm15 movddup -11 * SIZE(BO, %rax, 4), %xmm3 movapd %xmm0, %xmm2 addq $SIZE, %rax jl .L17 ALIGN_4 .L19: prefetch -8 * SIZE(BB) subq $-16 * SIZE, BB #ifndef TRMMKERNEL movupd (CO1), %xmm0 movupd 2 * SIZE(CO1), %xmm1 #endif mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm12 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm1, %xmm12 #endif movsd %xmm8, (CO1) movhps %xmm8, 1 * SIZE(CO1) movsd %xmm12, 2 * SIZE(CO1) movhps %xmm12, 3 * SIZE(CO1) #ifndef TRMMKERNEL movupd (CO1, LDC), %xmm2 movupd 2 * SIZE(CO1, LDC), %xmm3 #endif mulpd %xmm7, %xmm9 mulpd %xmm7, %xmm13 #ifndef TRMMKERNEL addpd %xmm2, %xmm9 addpd %xmm3, %xmm13 #endif movsd %xmm9, (CO1, LDC) movhps %xmm9, 1 * SIZE(CO1, LDC) movsd %xmm13, 2 * SIZE(CO1, LDC) movhps %xmm13, 3 * SIZE(CO1, LDC) #ifndef TRMMKERNEL movupd (CO2), %xmm0 movupd 2 * SIZE(CO2), %xmm1 #endif mulpd %xmm7, %xmm10 mulpd %xmm7, %xmm14 #ifndef TRMMKERNEL addpd %xmm0, %xmm10 addpd %xmm1, %xmm14 #endif movsd %xmm10, (CO2) movhps %xmm10, 1 * SIZE(CO2) movsd %xmm14, 2 * SIZE(CO2) movhps %xmm14, 3 * SIZE(CO2) #ifndef TRMMKERNEL movupd (CO2, LDC), %xmm2 movupd 2 * SIZE(CO2, LDC), %xmm3 #endif mulpd %xmm7, %xmm11 mulpd %xmm7, %xmm15 #ifndef TRMMKERNEL addpd %xmm2, %xmm11 addpd %xmm3, %xmm15 #endif movsd %xmm11, (CO2, LDC) movhps %xmm11, 1 * SIZE(CO2, LDC) movsd %xmm15, 2 * SIZE(CO2, LDC) movhps %xmm15, 3 * SIZE(CO2, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- BRANCH jg .L11 ALIGN_4 .L20: testq $3, M je .L39 testq $2, M je .L30 ALIGN_4 .L21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif movapd -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movapd -12 * SIZE(AO), %xmm2 xorps %xmm9, %xmm9 movddup -16 * SIZE(BO), %xmm1 xorps %xmm10, %xmm10 movddup -15 * SIZE(BO), %xmm5 xorps %xmm11, %xmm11 movddup -8 * SIZE(BO), %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO negq %rax NOBRANCH je .L26 ALIGN_4 .L22: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 addpd %xmm5, %xmm9 movddup -13 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movddup -12 * SIZE(BO, %rax, 4), %xmm1 mulpd 
%xmm0, %xmm5 movapd -14 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm5, %xmm11 movddup -11 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -10 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 addpd %xmm5, %xmm9 movddup -9 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movddup (BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 movapd -8 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm5, %xmm11 movddup -7 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm8 movddup -6 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 addpd %xmm5, %xmm9 movddup -5 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm10 movddup -4 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 movapd -10 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm5, %xmm11 movddup -3 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm8 movddup -2 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 addpd %xmm5, %xmm9 movddup -1 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm10 movddup 8 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 movapd -4 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm5, %xmm11 movddup 1 * SIZE(BO, %rax, 4), %xmm5 addq $4 * SIZE, %rax BRANCH jl .L22 ALIGN_4 .L26: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L29 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO negq %rax ALIGN_4 .L27: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 addpd %xmm5, %xmm9 movddup -13 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movddup -12 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 movapd -14 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm5, %xmm11 movddup -11 * SIZE(BO, %rax, 4), %xmm5 addq $SIZE, %rax jl .L27 ALIGN_4 .L29: #ifndef TRMMKERNEL movupd (CO1), %xmm0 movupd (CO1, LDC), %xmm2 movupd (CO2), %xmm4 movupd (CO2, LDC), %xmm6 #endif mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm7, %xmm10 mulpd %xmm7, %xmm11 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm2, %xmm9 addpd %xmm4, %xmm10 addpd %xmm6, %xmm11 #endif movsd %xmm8, (CO1) movhps %xmm8, 1 * SIZE(CO1) movsd %xmm9, (CO1, LDC) movhps %xmm9, 1 * SIZE(CO1, LDC) movsd %xmm10, (CO2) movhps %xmm10, 1 * SIZE(CO2) movsd %xmm11, (CO2, LDC) movhps %xmm11, 1 * SIZE(CO2, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 ALIGN_4 .L30: testq $1, M je .L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movddup -14 * SIZE(AO), %xmm2 xorps %xmm9, %xmm9 movddup -15 * SIZE(AO), %xmm4 xorps %xmm10, %xmm10 movapd -16 * SIZE(BO), %xmm1 xorps %xmm11, %xmm11 movapd -8 * SIZE(BO), %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO negq %rax NOBRANCH je .L36 ALIGN_4 .L32: mulpd %xmm0, %xmm1 mulpd -14 * 
SIZE(BO, %rax, 4), %xmm0 addpd %xmm1, %xmm8 movapd -12 * SIZE(BO, %rax, 4), %xmm1 addpd %xmm0, %xmm9 movddup -12 * SIZE(AO, %rax, 1), %xmm0 mulpd %xmm4, %xmm1 mulpd -10 * SIZE(BO, %rax, 4), %xmm4 addpd %xmm1, %xmm10 movapd (BO, %rax, 4), %xmm1 addpd %xmm4, %xmm11 movddup -11 * SIZE(AO, %rax, 1), %xmm4 mulpd %xmm2, %xmm3 mulpd -6 * SIZE(BO, %rax, 4), %xmm2 addpd %xmm3, %xmm8 movapd -4 * SIZE(BO, %rax, 4), %xmm3 addpd %xmm2, %xmm9 movddup -13 * SIZE(AO, %rax, 1), %xmm2 mulpd %xmm2, %xmm3 mulpd -2 * SIZE(BO, %rax, 4), %xmm2 addpd %xmm3, %xmm10 movapd 8 * SIZE(BO, %rax, 4), %xmm3 addpd %xmm2, %xmm11 movddup -10 * SIZE(AO, %rax, 1), %xmm2 addq $4 * SIZE, %rax BRANCH jl .L32 ALIGN_4 .L36: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L38 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO negq %rax ALIGN_4 .L37: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BO, %rax, 4), %xmm0 addpd %xmm1, %xmm8 movapd -12 * SIZE(BO, %rax, 4), %xmm1 addpd %xmm0, %xmm9 movddup -15 * SIZE(AO, %rax, 1), %xmm0 addq $SIZE, %rax jl .L37 ALIGN_4 .L38: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #ifndef TRMMKERNEL movsd (CO1), %xmm0 movhps (CO1, LDC), %xmm0 movsd (CO2), %xmm1 movhps (CO2, LDC), %xmm1 #endif mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm9 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm1, %xmm9 #endif movsd %xmm8, (CO1) movhps %xmm8, (CO1, LDC) movsd %xmm9, (CO2) movhps %xmm9, (CO2, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK #endif movq BO, B decq J # j -- jg .L01 ALIGN_4 .L40: testq $3, N je .L999 testq $2, N je .L80 ALIGN_4 .L41: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a movq K, %rax salq $BASE_SHIFT + 1, %rax leaq (B, %rax), BB movq M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif movddup -16 * SIZE(BO), %xmm1 movddup -15 * SIZE(BO), %xmm5 xorps %xmm8, %xmm8 movddup -12 * SIZE(BO), %xmm3 xorps %xmm9, %xmm9 movapd -16 * SIZE(AO), %xmm0 xorps %xmm12, %xmm12 movapd -8 * SIZE(AO), %xmm4 xorps %xmm13, %xmm13 prefetchw 3 * SIZE(CO1) movapd %xmm0, %xmm2 prefetchw 3 * SIZE(CO2) prefetch -16 * SIZE(BB) subq $-8 * SIZE, BB #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO negq %rax NOBRANCH je .L56 ALIGN_4 .L52: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm12 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm5, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -13 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm0, %xmm2 mulpd %xmm1, %xmm0 mulpd -10 
* SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd (AO, %rax, 4), %xmm0 addpd %xmm1, %xmm12 movddup -8 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm5, %xmm2 mulpd -10 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -11 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm4, %xmm2 mulpd %xmm3, %xmm4 mulpd -6 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm4, %xmm8 movapd -4 * SIZE(AO, %rax, 4), %xmm4 addpd %xmm3, %xmm12 movddup -10 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm5, %xmm2 mulpd -6 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -9 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm4, %xmm2 mulpd %xmm3, %xmm4 mulpd -2 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm4, %xmm8 movapd 8 * SIZE(AO, %rax, 4), %xmm4 addpd %xmm3, %xmm12 movddup -4 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm5, %xmm2 mulpd -2 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -7 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm0, %xmm2 addq $4 * SIZE, %rax BRANCH jl .L52 ALIGN_4 .L56: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L59 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L57: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm12 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm5, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -13 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm0, %xmm2 addq $SIZE, %rax jl .L57 ALIGN_4 .L59: #ifndef TRMMKERNEL movupd (CO1), %xmm0 movupd 2 * SIZE(CO1), %xmm1 movupd (CO2), %xmm2 movupd 2 * SIZE(CO2), %xmm3 #endif mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm7, %xmm12 mulpd %xmm7, %xmm13 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm1, %xmm12 addpd %xmm2, %xmm9 addpd %xmm3, %xmm13 #endif movsd %xmm8, (CO1) movhps %xmm8, 1 * SIZE(CO1) movsd %xmm12, 2 * SIZE(CO1) movhps %xmm12, 3 * SIZE(CO1) movsd %xmm9, (CO2) movhps %xmm9, 1 * SIZE(CO2) movsd %xmm13, 2 * SIZE(CO2) movhps %xmm13, 3 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L51 ALIGN_4 .L60: testq $2, M je .L70 ALIGN_4 .L61: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif movapd -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movapd -12 * SIZE(AO), %xmm2 xorps %xmm9, %xmm9 movddup -16 * SIZE(BO), %xmm1 xorps %xmm10, %xmm10 movddup -15 * SIZE(BO), %xmm3 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO negq %rax NOBRANCH je .L66 ALIGN_4 .L62: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm0, %xmm3 movapd -14 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm3, %xmm9 movddup -13 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm0, %xmm1 addpd 
%xmm1, %xmm10 movddup -12 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm0, %xmm3 movapd -8 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm3, %xmm11 movddup -11 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm8 movddup -10 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm2, %xmm3 movapd -10 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm3, %xmm9 movddup -9 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm10 movddup -8 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm2, %xmm3 movapd -4 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm3, %xmm11 movddup -7 * SIZE(BO, %rax, 2), %xmm3 addq $4 * SIZE, %rax BRANCH jl .L62 ALIGN_4 .L66: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L69 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L67: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm0, %xmm3 movapd -14 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm3, %xmm9 movddup -13 * SIZE(BO, %rax, 2), %xmm3 addq $SIZE, %rax jl .L67 ALIGN_4 .L69: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #ifndef TRMMKERNEL movupd (CO1), %xmm0 movupd (CO2), %xmm2 #endif mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm9 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm2, %xmm9 #endif movsd %xmm8, (CO1) movhps %xmm8, 1 * SIZE(CO1) movsd %xmm9, (CO2) movhps %xmm9, 1 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 ALIGN_4 .L70: testq $1, M je .L79 ALIGN_4 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movddup -15 * SIZE(AO), %xmm1 xorps %xmm9, %xmm9 movddup -14 * SIZE(AO), %xmm2 xorps %xmm10, %xmm10 movddup -13 * SIZE(AO), %xmm3 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO negq %rax NOBRANCH je .L76 ALIGN_4 .L72: mulpd -16 * SIZE(BO, %rax, 2), %xmm0 addpd %xmm0, %xmm8 movddup -12 * SIZE(AO, %rax, 1), %xmm0 mulpd -14 * SIZE(BO, %rax, 2), %xmm1 addpd %xmm1, %xmm9 movddup -11 * SIZE(AO, %rax, 1), %xmm1 mulpd -12 * SIZE(BO, %rax, 2), %xmm2 addpd %xmm2, %xmm10 movddup -10 * SIZE(AO, %rax, 1), %xmm2 mulpd -10 * SIZE(BO, %rax, 2), %xmm3 addpd %xmm3, %xmm11 movddup -9 * SIZE(AO, %rax, 1), %xmm3 addq $4 * SIZE, %rax BRANCH jl .L72 ALIGN_4 .L76: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L78 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L77: mulpd -16 * SIZE(BO, %rax, 2), %xmm0 addpd %xmm0, %xmm8 movddup -15 * SIZE(AO, %rax, 1), %xmm0 addq $SIZE, %rax jl .L77 ALIGN_4 .L78: addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm10, %xmm8 #ifndef TRMMKERNEL movsd (CO1), %xmm0 movhps (CO2), %xmm0 #endif mulpd %xmm7, %xmm8 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 
#endif movsd %xmm8, (CO1) movhps %xmm8, (CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif movq BO, B leaq (C, LDC, 2), C ALIGN_4 .L80: testq $1, N je .L999 ALIGN_4 .L81: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 # coffset1 = c movq A, AO # aoffset = a movq M, I sarq $2, I # i = (m >> 2) jle .L100 ALIGN_4 .L91: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #endif movapd -8 * SIZE(AO), %xmm2 xorps %xmm8, %xmm8 movapd -16 * SIZE(AO), %xmm0 xorps %xmm9, %xmm9 movddup -16 * SIZE(BO), %xmm1 xorps %xmm12, %xmm12 movddup -14 * SIZE(BO), %xmm3 xorps %xmm13, %xmm13 movddup -15 * SIZE(BO), %xmm5 prefetchw 3 * SIZE(CO1) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO negq %rax NOBRANCH je .L96 ALIGN_4 .L92: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm12 movddup -12 * SIZE(BO, %rax, 1), %xmm1 mulpd %xmm5, %xmm0 mulpd -10 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm0, %xmm9 movapd (AO, %rax, 4), %xmm0 addpd %xmm5, %xmm13 movddup -13 * SIZE(BO, %rax, 1), %xmm5 mulpd %xmm3, %xmm2 mulpd -6 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm2, %xmm8 movapd -4 * SIZE(AO, %rax, 4), %xmm2 addpd %xmm3, %xmm12 movddup -10 * SIZE(BO, %rax, 1), %xmm3 mulpd %xmm5, %xmm2 mulpd -2 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 movapd 8 * SIZE(AO, %rax, 4), %xmm2 addpd %xmm5, %xmm13 movddup -11 * SIZE(BO, %rax, 1), %xmm5 addq $4 * SIZE, %rax BRANCH jl .L92 ALIGN_4 .L96: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L99 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L97: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm12 movddup -15 * SIZE(BO, %rax, 1), %xmm1 addq $SIZE, %rax jl .L97 ALIGN_4 .L99: addpd %xmm9, %xmm8 addpd %xmm13, %xmm12 #ifndef TRMMKERNEL movupd (CO1), %xmm0 movupd 2 * SIZE(CO1), %xmm1 #endif mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm12 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm1, %xmm12 #endif movsd %xmm8, (CO1) movhps %xmm8, 1 * SIZE(CO1) movsd %xmm12, 2 * SIZE(CO1) movhps %xmm12, 3 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L91 ALIGN_4 .L100: testq $2, M je .L110 ALIGN_4 .L101: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif movddup -16 * SIZE(BO), %xmm0 xorps %xmm8, %xmm8 movddup -15 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 movddup -14 * SIZE(BO), %xmm2 xorps %xmm10, %xmm10 movddup -13 * SIZE(BO), %xmm3 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO negq %rax NOBRANCH je .L106 ALIGN_4 .L102: mulpd -16 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm0, %xmm8 movddup -12 * SIZE(BO, %rax, 1), %xmm0 mulpd -14 * SIZE(AO, %rax, 2), %xmm1 addpd %xmm1, %xmm9 movddup -11 * SIZE(BO, %rax, 1), %xmm1 mulpd -12 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm2, %xmm10 movddup -10 * SIZE(BO, %rax, 1), %xmm2 mulpd -10 * SIZE(AO, %rax, 2), %xmm3 addpd %xmm3, %xmm11 movddup -9 * SIZE(BO, %rax, 1), %xmm3 addq $4 * SIZE, %rax BRANCH jl .L102 ALIGN_4 .L106: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L109 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L107: movddup -16 * SIZE(BO, %rax, 1), %xmm0 mulpd -16 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm0, %xmm8 addq $SIZE, %rax jl .L107 ALIGN_4 .L109: addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm10, %xmm8 #ifndef TRMMKERNEL movupd (CO1), %xmm0 #endif mulpd %xmm7, %xmm8 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 #endif movsd %xmm8, (CO1) movhps %xmm8, 1 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 4 ALIGN_4 .L110: testq $1, M je .L999 ALIGN_4 .L111: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif movapd -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movapd -14 * SIZE(AO), %xmm1 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO negq %rax NOBRANCH je .L116 ALIGN_4 .L112: mulpd -16 * SIZE(BO, %rax, 1), %xmm0 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 1), %xmm0 mulpd -14 * SIZE(BO, %rax, 1), %xmm1 addpd %xmm1, %xmm9 movapd -10 * SIZE(AO, %rax, 1), %xmm1 addq $4 * SIZE, %rax BRANCH jl .L112 ALIGN_4 .L116: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L118 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L117: mulsd -16 * SIZE(BO, %rax, 1), %xmm0 addsd %xmm0, %xmm8 movsd -15 * SIZE(AO, %rax, 1), %xmm0 addq $SIZE, %rax jl .L117 ALIGN_4 .L118: addpd %xmm9, %xmm8 haddpd %xmm8, %xmm8 #ifndef TRMMKERNEL movsd (CO1), %xmm0 #endif mulsd %xmm7, 
%xmm8 #ifndef TRMMKERNEL addsd %xmm0, %xmm8 #endif movsd %xmm8, (CO1) ALIGN_4 .L999: movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_kernel_4x4_core2.S000066400000000000000000001237441313527062700215770ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define ALPHA 0(%rsp) #define J 16(%rsp) #define OFFSET 24(%rsp) #define KK 32(%rsp) #define KKK 40(%rsp) #define BUFFER 128(%rsp) #define PREFETCH_R (8 * 4 + 0) #define PREFETCH_W (PREFETCH_R * 4) #define PREFETCHSIZE (8 * 13 + 5) #define PREFETCH prefetcht0 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif movaps %xmm3, %xmm0 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif #endif movq %rsp, %r15 # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING unpcklpd %xmm0, %xmm0 movapd %xmm0, ALPHA subq $-16 * SIZE, A subq $-16 * SIZE, B movq OLD_M, M movq OLD_N, N leaq (, LDC, SIZE), LDC #ifdef TRMMKERNEL movsd %xmm12, OFFSET movsd %xmm12, KK #ifndef LEFT negq KK #endif #endif movq N, J sarq $2, J NOBRANCH jle .L40 ALIGN_4 .L01: /* Copying to Sub Buffer */ leaq 16 * SIZE + BUFFER, BO #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movapd -16 * SIZE(B), %xmm0 movapd -8 * SIZE(B), %xmm4 movq K, %rax sarq $2, %rax NOBRANCH jle .L05 ALIGN_3 .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) prefetcht0 (PREFETCH_R + 8) * SIZE(B) movapd -14 * SIZE(B), %xmm1 movapd -12 * SIZE(B), %xmm2 movapd -10 * SIZE(B), %xmm3 movapd -6 * SIZE(B), %xmm5 movapd -4 * SIZE(B), %xmm6 movapd -2 * SIZE(B), %xmm7 movddup %xmm0, %xmm8 movapd %xmm8, -16 * SIZE(BO) unpckhpd %xmm0, %xmm0 movapd %xmm0, -14 * SIZE(BO) movapd 0 * SIZE(B), %xmm0 prefetcht0 (PREFETCH_W + 0) * SIZE(BO) movddup %xmm1, %xmm9 movapd %xmm9, -12 * SIZE(BO) unpckhpd %xmm1, %xmm1 movapd %xmm1, -10 * SIZE(BO) movddup %xmm2, %xmm10 movapd %xmm10, -8 * SIZE(BO) prefetcht0 (PREFETCH_W + 8) * SIZE(BO) unpckhpd %xmm2, %xmm2 movapd %xmm2, -6 * SIZE(BO) movddup %xmm3, %xmm11 movapd %xmm11, -4 * SIZE(BO) unpckhpd %xmm3, %xmm3 movapd %xmm3, -2 * SIZE(BO) prefetcht0 (PREFETCH_W + 16) * SIZE(BO) movddup %xmm4, %xmm12 movapd %xmm12, 0 * SIZE(BO) unpckhpd %xmm4, %xmm4 movapd %xmm4, 2 * SIZE(BO) movapd 8 * SIZE(B), %xmm4 movddup %xmm5, %xmm13 movapd %xmm13, 4 * SIZE(BO) unpckhpd %xmm5, %xmm5 movapd %xmm5, 6 * SIZE(BO) prefetcht0 (PREFETCH_W + 24) * SIZE(BO) movddup %xmm6, %xmm14 movapd %xmm14, 8 * SIZE(BO) unpckhpd %xmm6, %xmm6 movapd %xmm6, 10 * SIZE(BO) movddup %xmm7, %xmm15 movapd %xmm15, 12 * SIZE(BO) unpckhpd %xmm7, %xmm7 movapd %xmm7, 14 * 
SIZE(BO) subq $-32 * SIZE, BO subq $-16 * SIZE, B decq %rax BRANCH jne .L02 ALIGN_3 .L05: movq K, %rax andq $3, %rax BRANCH BRANCH jle .L10 ALIGN_3 .L06: movapd -14 * SIZE(B), %xmm1 movddup %xmm0, %xmm8 unpckhpd %xmm0, %xmm0 movddup %xmm1, %xmm9 unpckhpd %xmm1, %xmm1 movapd %xmm8, -16 * SIZE(BO) movapd %xmm0, -14 * SIZE(BO) movapd -12 * SIZE(B), %xmm0 movapd %xmm9, -12 * SIZE(BO) movapd %xmm1, -10 * SIZE(BO) addq $4 * SIZE, B addq $8 * SIZE, BO decq %rax BRANCH jne .L06 ALIGN_4 .L10: leaq (PREFETCH_R + 0) * SIZE(B), BB movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a movq M, I sarq $2, I # i = (m >> 2) NOBRANCH jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 19 * SIZE + BUFFER, BO #else leaq 19 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif prefetcht2 (BB) movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 movaps -19 * SIZE(BO), %xmm6 movaps -17 * SIZE(BO), %xmm7 pxor %xmm2, %xmm2 prefetcht0 3 * SIZE(CO1) pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 prefetcht0 7 * SIZE(CO2) pxor %xmm5, %xmm5 movapd %xmm2, %xmm8 movapd %xmm2, %xmm9 movapd %xmm2, %xmm10 prefetcht0 3 * SIZE(CO1, LDC, 2) movapd %xmm2, %xmm11 movapd %xmm2, %xmm12 movapd %xmm2, %xmm13 prefetcht0 7 * SIZE(CO2, LDC, 2) movapd %xmm2, %xmm14 movapd %xmm2, %xmm15 subq $-16 * SIZE, BB #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_4 .L12: PADDING; addpd %xmm2, %xmm10 movaps -15 * SIZE(BO), %xmm2 PADDING; addpd %xmm3, %xmm14 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps %xmm6, %xmm3 mulpd %xmm0, %xmm6 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 movaps -13 * SIZE(BO), %xmm4 addpd %xmm5, %xmm15 movaps %xmm7, %xmm5 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm5 addpd %xmm6, %xmm8 movaps -11 * SIZE(BO), %xmm6 addpd %xmm3, %xmm12 movaps %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm7, %xmm9 movaps -9 * SIZE(BO), %xmm7 addpd %xmm5, %xmm13 movaps %xmm4, %xmm5 mulpd %xmm0, %xmm4 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 movaps -10 * SIZE(AO), %xmm1 addpd %xmm2, %xmm10 movaps -7 * SIZE(BO), %xmm2 addpd %xmm3, %xmm14 movaps %xmm6, %xmm3 mulpd %xmm0, %xmm6 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 movaps -5 * SIZE(BO), %xmm4 addpd %xmm5, %xmm15 movaps %xmm7, %xmm5 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm5 addpd %xmm6, %xmm8 movaps -3 * SIZE(BO), %xmm6 addpd %xmm3, %xmm12 movaps %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm7, %xmm9 movaps -1 * SIZE(BO), %xmm7 addpd %xmm5, %xmm13 movaps %xmm4, %xmm5 mulpd %xmm0, %xmm4 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 movaps -6 * SIZE(AO), %xmm1 addpd %xmm2, %xmm10 movaps 1 * SIZE(BO), %xmm2 addpd %xmm3, %xmm14 movaps %xmm6, %xmm3 mulpd %xmm0, %xmm6 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 movaps 3 * SIZE(BO), %xmm4 addpd %xmm5, %xmm15 PADDING movaps %xmm7, %xmm5 mulpd %xmm1, %xmm5 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm0, %xmm7 addpd %xmm6, %xmm8 movaps 5 * SIZE(BO), %xmm6 addpd %xmm3, %xmm12 movaps %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm7, %xmm9 movaps 7 * SIZE(BO), %xmm7 addpd %xmm5, %xmm13 movaps %xmm4, %xmm5 mulpd %xmm0, %xmm4 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 
movaps -2 * SIZE(AO), %xmm1 addpd %xmm2, %xmm10 movaps 9 * SIZE(BO), %xmm2 addpd %xmm3, %xmm14 movaps %xmm6, %xmm3 mulpd %xmm0, %xmm6 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 subq $-16 * SIZE, AO movaps 11 * SIZE(BO), %xmm4 addpd %xmm5, %xmm15 movaps %xmm7, %xmm5 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm5 addpd %xmm6, %xmm8 movaps 13 * SIZE(BO), %xmm6 addpd %xmm3, %xmm12 movaps %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm7, %xmm9 movaps 15 * SIZE(BO), %xmm7 addpd %xmm5, %xmm13 subq $-32 * SIZE, BO movaps %xmm4, %xmm5 mulpd %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 movaps -14 * SIZE(AO), %xmm1 subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: prefetcht2 -8 * SIZE(BB) #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addpd %xmm2, %xmm10 movaps -15 * SIZE(BO), %xmm2 addpd %xmm3, %xmm14 movaps %xmm6, %xmm3 mulpd %xmm0, %xmm6 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 movaps -13 * SIZE(BO), %xmm4 addpd %xmm5, %xmm15 movaps %xmm7, %xmm5 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm5 addpd %xmm6, %xmm8 movaps -11 * SIZE(BO), %xmm6 addpd %xmm3, %xmm12 addq $4 * SIZE, AO movaps %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm7, %xmm9 movaps -9 * SIZE(BO), %xmm7 addpd %xmm5, %xmm13 addq $8 * SIZE, BO movaps %xmm4, %xmm5 mulpd %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 movaps -14 * SIZE(AO), %xmm1 subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: movddup ALPHA, %xmm7 addpd %xmm2, %xmm10 mulpd %xmm7, %xmm8 addpd %xmm3, %xmm14 mulpd %xmm7, %xmm12 addpd %xmm4, %xmm11 mulpd %xmm7, %xmm9 addpd %xmm5, %xmm15 mulpd %xmm7, %xmm13 mulpd %xmm7, %xmm10 mulpd %xmm7, %xmm14 mulpd %xmm7, %xmm11 mulpd %xmm7, %xmm15 movq CO1, %rax orq LDC, %rax testq $15, %rax NOBRANCH jne .L18x #ifndef TRMMKERNEL addpd 0 * SIZE(CO1), %xmm8 addpd 2 * SIZE(CO1), %xmm12 addpd 0 * SIZE(CO2), %xmm9 addpd 2 * SIZE(CO2), %xmm13 addpd 0 * SIZE(CO1, LDC, 2), %xmm10 addpd 2 * SIZE(CO1, LDC, 2), %xmm14 addpd 0 * SIZE(CO2, LDC, 2), %xmm11 addpd 2 * SIZE(CO2, LDC, 2), %xmm15 #endif movapd %xmm8, 0 * SIZE(CO1) movapd %xmm12, 2 * SIZE(CO1) movapd %xmm9, 0 * SIZE(CO2) movapd %xmm13, 2 * SIZE(CO2) movapd %xmm10, 0 * SIZE(CO1, LDC, 2) movapd %xmm14, 2 * SIZE(CO1, LDC, 2) movapd %xmm11, 0 * SIZE(CO2, LDC, 2) movapd %xmm15, 2 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- BRANCH jg .L11 jmp .L20 ALIGN_4 .L18x: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 movsd 0 * SIZE(CO2), %xmm2 movhpd 1 * SIZE(CO2), %xmm2 movsd 2 * SIZE(CO2), %xmm3 movhpd 3 * SIZE(CO2), %xmm3 movsd 0 * SIZE(CO1, LDC, 2), %xmm4 movhpd 1 * SIZE(CO1, LDC, 2), %xmm4 movsd 2 * SIZE(CO1, LDC, 2), %xmm5 movhpd 3 * SIZE(CO1, LDC, 2), %xmm5 movsd 0 * SIZE(CO2, LDC, 2), %xmm6 movhpd 1 * SIZE(CO2, LDC, 2), %xmm6 movsd 2 * SIZE(CO2, LDC, 2), %xmm7 movhpd 3 * SIZE(CO2, LDC, 2), %xmm7 addpd %xmm0, %xmm8 addpd %xmm1, %xmm12 addpd %xmm2, %xmm9 addpd %xmm3, %xmm13 addpd %xmm4, %xmm10 addpd %xmm5, %xmm14 addpd %xmm6, %xmm11 addpd %xmm7, %xmm15 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm12, 2 * SIZE(CO1) movhpd %xmm12, 3 * SIZE(CO1) movsd %xmm9, 0 * 
SIZE(CO2) movhpd %xmm9, 1 * SIZE(CO2) movsd %xmm13, 2 * SIZE(CO2) movhpd %xmm13, 3 * SIZE(CO2) movsd %xmm10, 0 * SIZE(CO1, LDC, 2) movhpd %xmm10, 1 * SIZE(CO1, LDC, 2) movsd %xmm14, 2 * SIZE(CO1, LDC, 2) movhpd %xmm14, 3 * SIZE(CO1, LDC, 2) movsd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm11, 1 * SIZE(CO2, LDC, 2) movsd %xmm15, 2 * SIZE(CO2, LDC, 2) movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- BRANCH jg .L11 ALIGN_4 .L20: testq $2, M BRANCH jle .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif pxor %xmm8, %xmm8 movapd -16 * SIZE(AO), %xmm0 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 movapd %xmm8, %xmm2 movapd %xmm9, %xmm3 movapd %xmm10, %xmm4 movapd %xmm11, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_4 .L21: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm2, %xmm8 movapd -16 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm9 movapd -14 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm4, %xmm10 movapd -12 * SIZE(BO), %xmm4 mulpd %xmm0, %xmm4 addpd %xmm5, %xmm11 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm5 movapd -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movapd -8 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm9 movapd -6 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm4, %xmm10 movapd -4 * SIZE(BO), %xmm4 mulpd %xmm0, %xmm4 addpd %xmm5, %xmm11 movapd -2 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm5 movapd -12 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movapd 0 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm9 movapd 2 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm4, %xmm10 movapd 4 * SIZE(BO), %xmm4 mulpd %xmm0, %xmm4 addpd %xmm5, %xmm11 movapd 6 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm5 movapd -10 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movapd 8 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm9 movapd 10 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm4, %xmm10 movapd 12 * SIZE(BO), %xmm4 mulpd %xmm0, %xmm4 addpd %xmm5, %xmm11 movapd 14 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm5 movapd -8 * SIZE(AO), %xmm0 subq $ -8 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax BRANCH jg .L21 ALIGN_4 .L25: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: addpd %xmm2, %xmm8 movapd -16 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm9 movapd -14 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm4, %xmm10 movapd -12 * SIZE(BO), %xmm4 mulpd %xmm0, %xmm4 addpd %xmm5, %xmm11 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm5 movapd -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_4 .L28: addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 
movhpd 1 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm2 movhpd 1 * SIZE(CO2), %xmm2 movsd 0 * SIZE(CO1, LDC, 2), %xmm4 movhpd 1 * SIZE(CO1, LDC, 2), %xmm4 movsd 0 * SIZE(CO2, LDC, 2), %xmm6 movhpd 1 * SIZE(CO2, LDC, 2), %xmm6 #endif mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm7, %xmm10 mulpd %xmm7, %xmm11 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm2, %xmm9 addpd %xmm4, %xmm10 addpd %xmm6, %xmm11 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhpd %xmm9, 1 * SIZE(CO2) movsd %xmm10, 0 * SIZE(CO1, LDC, 2) movhpd %xmm10, 1 * SIZE(CO1, LDC, 2) movsd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm11, 1 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 ALIGN_4 .L30: testq $1, M BRANCH jle .L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif pxor %xmm8, %xmm8 movsd -16 * SIZE(AO), %xmm0 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 movapd %xmm8, %xmm2 movapd %xmm9, %xmm3 movapd %xmm10, %xmm4 movapd %xmm11, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_4 .L31: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm2, %xmm8 movsd -16 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm3, %xmm9 movsd -14 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm4, %xmm10 movsd -12 * SIZE(BO), %xmm4 mulsd %xmm0, %xmm4 addsd %xmm5, %xmm11 movsd -10 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 movsd -15 * SIZE(AO), %xmm0 addsd %xmm2, %xmm8 movsd -8 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm3, %xmm9 movsd -6 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm4, %xmm10 movsd -4 * SIZE(BO), %xmm4 mulsd %xmm0, %xmm4 addsd %xmm5, %xmm11 movsd -2 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 movsd -14 * SIZE(AO), %xmm0 addsd %xmm2, %xmm8 movsd 0 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm3, %xmm9 movsd 2 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm4, %xmm10 movsd 4 * SIZE(BO), %xmm4 mulsd %xmm0, %xmm4 addsd %xmm5, %xmm11 movsd 6 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 movsd -13 * SIZE(AO), %xmm0 addsd %xmm2, %xmm8 movsd 8 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm3, %xmm9 movsd 10 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm4, %xmm10 movsd 12 * SIZE(BO), %xmm4 mulsd %xmm0, %xmm4 addsd %xmm5, %xmm11 movsd 14 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 movsd -12 * SIZE(AO), %xmm0 subq $ -4 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax BRANCH jg .L31 ALIGN_4 .L35: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: addsd %xmm2, %xmm8 movsd -16 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm3, %xmm9 movsd -14 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm4, %xmm10 movsd -12 * SIZE(BO), %xmm4 mulsd %xmm0, %xmm4 addsd %xmm5, %xmm11 movsd -10 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 
movsd -15 * SIZE(AO), %xmm0 addq $1 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_4 .L38: addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm2 movsd 0 * SIZE(CO1, LDC, 2), %xmm4 movsd 0 * SIZE(CO2, LDC, 2), %xmm6 #endif mulsd %xmm7, %xmm8 mulsd %xmm7, %xmm9 mulsd %xmm7, %xmm10 mulsd %xmm7, %xmm11 #ifndef TRMMKERNEL addsd %xmm0, %xmm8 addsd %xmm2, %xmm9 addsd %xmm4, %xmm10 addsd %xmm6, %xmm11 #endif movsd %xmm8, 0 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movsd %xmm10, 0 * SIZE(CO1, LDC, 2) movsd %xmm11, 0 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK #endif leaq (C, LDC, 4), C subq $1, J BRANCH jg .L01 ALIGN_4 .L40: testq $2, N BRANCH jle .L80 ALIGN_4 .L41: /* Copying to Sub Buffer */ leaq BUFFER, BO #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq K, %rax sarq $3, %rax jle .L43 addq %rax, %rax ALIGN_4 .L42: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 movddup -14 * SIZE(B), %xmm10 movddup -13 * SIZE(B), %xmm11 movddup -12 * SIZE(B), %xmm12 movddup -11 * SIZE(B), %xmm13 movddup -10 * SIZE(B), %xmm14 movddup -9 * SIZE(B), %xmm15 movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) movapd %xmm10, 4 * SIZE(BO) movapd %xmm11, 6 * SIZE(BO) movapd %xmm12, 8 * SIZE(BO) movapd %xmm13, 10 * SIZE(BO) movapd %xmm14, 12 * SIZE(BO) movapd %xmm15, 14 * SIZE(BO) addq $8 * SIZE, B addq $16 * SIZE, BO subq $1, %rax jne .L42 ALIGN_4 .L43: movq K, %rax andq $7, %rax BRANCH jle .L45 ALIGN_4 .L44: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) addq $2 * SIZE, B addq $4 * SIZE, BO subq $1, %rax jne .L44 ALIGN_4 .L45: movq C, CO1 leaq (C, LDC, 1), CO2 movq A, AO # aoffset = a movq M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_4 .L50: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 prefetcht0 3 * SIZE(CO1) pxor %xmm12, %xmm12 prefetcht0 3 * SIZE(CO2) pxor %xmm13, %xmm13 movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd %xmm8, %xmm2 movapd %xmm8, %xmm3 movapd %xmm8, %xmm4 movapd %xmm8, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L55 ALIGN_4 .L51: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm2, %xmm8 movapd -16 * SIZE(BO), %xmm2 addpd %xmm3, %xmm12 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 movapd -14 * SIZE(BO), %xmm4 addpd %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 movapd -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 movapd -10 * SIZE(AO), %xmm1 addpd %xmm2, %xmm8 movapd -12 * SIZE(BO), %xmm2 addpd %xmm3, %xmm12 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, 
%xmm9 movapd -10 * SIZE(BO), %xmm4 addpd %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 movapd -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 movapd -6 * SIZE(AO), %xmm1 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) addpd %xmm2, %xmm8 movapd -8 * SIZE(BO), %xmm2 addpd %xmm3, %xmm12 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 movapd -6 * SIZE(BO), %xmm4 addpd %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 movapd -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 movapd -2 * SIZE(AO), %xmm1 addpd %xmm2, %xmm8 movapd -4 * SIZE(BO), %xmm2 addpd %xmm3, %xmm12 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 movapd -2 * SIZE(BO), %xmm4 addpd %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 movapd 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 movapd 2 * SIZE(AO), %xmm1 subq $-16 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jg .L51 ALIGN_4 .L55: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L58 ALIGN_4 .L56: addpd %xmm2, %xmm8 movapd -16 * SIZE(BO), %xmm2 addpd %xmm3, %xmm12 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 movapd -14 * SIZE(BO), %xmm4 addpd %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 movapd -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 movapd -10 * SIZE(AO), %xmm1 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L56 ALIGN_4 .L58: addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 movsd 0 * SIZE(CO2), %xmm2 movhpd 1 * SIZE(CO2), %xmm2 movsd 2 * SIZE(CO2), %xmm3 movhpd 3 * SIZE(CO2), %xmm3 #endif mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm7, %xmm12 mulpd %xmm7, %xmm13 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm1, %xmm12 addpd %xmm2, %xmm9 addpd %xmm3, %xmm13 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm12, 2 * SIZE(CO1) movhpd %xmm12, 3 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhpd %xmm9, 1 * SIZE(CO2) movsd %xmm13, 2 * SIZE(CO2) movhpd %xmm13, 3 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 subq $1, I jg .L50 ALIGN_4 .L60: testq $2, M jle .L70 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif pxor %xmm8, %xmm8 movapd -16 * SIZE(AO), %xmm0 pxor %xmm9, %xmm9 movapd -14 * SIZE(AO), %xmm1 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 movapd %xmm8, %xmm2 movapd %xmm8, %xmm3 movapd %xmm8, %xmm4 movapd %xmm8, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L65 ALIGN_4 .L61: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm2, %xmm8 movapd -16 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm9 movapd -14 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 movapd -12 * SIZE(AO), %xmm0 addpd %xmm4, 
%xmm10 movapd -12 * SIZE(BO), %xmm4 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm11 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm1, %xmm5 movapd -10 * SIZE(AO), %xmm1 addpd %xmm2, %xmm8 movapd -8 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm9 movapd -6 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 movapd -8 * SIZE(AO), %xmm0 addpd %xmm4, %xmm10 movapd -4 * SIZE(BO), %xmm4 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm11 movapd -2 * SIZE(BO), %xmm5 mulpd %xmm1, %xmm5 movapd -6 * SIZE(AO), %xmm1 subq $ -8 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jg .L61 ALIGN_4 .L65: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L68 ALIGN_4 .L66: addpd %xmm2, %xmm8 movapd -16 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm9 movapd -14 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 movapd -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L66 ALIGN_4 .L68: addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm2 movhpd 1 * SIZE(CO2), %xmm2 #endif addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm9 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm2, %xmm9 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhpd %xmm9, 1 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 ALIGN_4 .L70: testq $1, M jle .L79 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movsd -16 * SIZE(AO), %xmm0 movsd -15 * SIZE(AO), %xmm1 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 movapd %xmm8, %xmm2 movapd %xmm8, %xmm3 movapd %xmm8, %xmm4 movapd %xmm8, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L75 ALIGN_4 .L71: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm2, %xmm8 movsd -16 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm3, %xmm9 movsd -14 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 movsd -14 * SIZE(AO), %xmm0 addsd %xmm4, %xmm10 movsd -12 * SIZE(BO), %xmm4 mulsd %xmm1, %xmm4 addsd %xmm5, %xmm11 movsd -10 * SIZE(BO), %xmm5 mulsd %xmm1, %xmm5 movsd -13 * SIZE(AO), %xmm1 addsd %xmm2, %xmm8 movsd -8 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm3, %xmm9 movsd -6 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 movsd -12 * SIZE(AO), %xmm0 addsd %xmm4, %xmm10 movsd -4 * SIZE(BO), %xmm4 mulsd %xmm1, %xmm4 addsd %xmm5, %xmm11 movsd -2 * SIZE(BO), %xmm5 mulsd %xmm1, %xmm5 movsd -11 * SIZE(AO), %xmm1 subq $ -4 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jg .L71 ALIGN_4 .L75: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L78 ALIGN_4 .L76: addsd %xmm2, %xmm8 movsd -16 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm3, %xmm9 movsd -14 * SIZE(BO), 
%xmm3 mulsd %xmm0, %xmm3 movsd -15 * SIZE(AO), %xmm0 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L76 ALIGN_4 .L78: addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm2 #endif addsd %xmm10, %xmm8 addsd %xmm11, %xmm9 mulsd %xmm7, %xmm8 mulsd %xmm7, %xmm9 #ifndef TRMMKERNEL addsd %xmm0, %xmm8 addsd %xmm2, %xmm9 #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif movsd %xmm8, 0 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) ALIGN_4 .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif leaq (C, LDC, 2), C ALIGN_4 .L80: testq $1, N BRANCH jle .L999 ALIGN_4 .L81: /* Copying to Sub Buffer */ leaq BUFFER, BO #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq K, %rax sarq $4, %rax jle .L83 addq %rax, %rax ALIGN_4 .L82: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 movddup -14 * SIZE(B), %xmm10 movddup -13 * SIZE(B), %xmm11 movddup -12 * SIZE(B), %xmm12 movddup -11 * SIZE(B), %xmm13 movddup -10 * SIZE(B), %xmm14 movddup -9 * SIZE(B), %xmm15 movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) movapd %xmm10, 4 * SIZE(BO) movapd %xmm11, 6 * SIZE(BO) movapd %xmm12, 8 * SIZE(BO) movapd %xmm13, 10 * SIZE(BO) movapd %xmm14, 12 * SIZE(BO) movapd %xmm15, 14 * SIZE(BO) addq $ 8 * SIZE, B subq $-16 * SIZE, BO subq $1, %rax jne .L82 ALIGN_4 .L83: movq K, %rax andq $15, %rax BRANCH jle .L85 ALIGN_4 .L84: movddup -16 * SIZE(B), %xmm8 movapd %xmm8, 0 * SIZE(BO) addq $1 * SIZE, B addq $2 * SIZE, BO subq $1, %rax jne .L84 ALIGN_4 .L85: movq C, CO1 movq A, AO movq M, I sarq $2, I jle .L100 ALIGN_4 .L90: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 movapd -16 * SIZE(BO), %xmm4 pxor %xmm9, %xmm9 movapd -14 * SIZE(BO), %xmm5 pxor %xmm12, %xmm12 movapd -12 * SIZE(BO), %xmm6 pxor %xmm13, %xmm13 movapd -10 * SIZE(BO), %xmm7 movapd %xmm8, %xmm0 prefetcht0 3 * SIZE(CO1) movapd %xmm8, %xmm1 movapd %xmm8, %xmm2 movapd %xmm8, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L95 ALIGN_4 .L91: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm0, %xmm8 movapd -16 * SIZE(AO), %xmm0 mulpd %xmm4, %xmm0 addpd %xmm1, %xmm12 movapd -14 * SIZE(AO), %xmm1 mulpd %xmm4, %xmm1 movapd -8 * SIZE(BO), %xmm4 addpd %xmm2, %xmm9 movapd -12 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 addpd %xmm3, %xmm13 movapd -10 * SIZE(AO), %xmm3 mulpd %xmm5, %xmm3 movapd -6 * SIZE(BO), %xmm5 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) addpd %xmm0, %xmm8 movapd -8 * SIZE(AO), %xmm0 mulpd %xmm6, %xmm0 addpd %xmm1, %xmm12 movapd -6 * SIZE(AO), %xmm1 mulpd %xmm6, %xmm1 movapd -4 * SIZE(BO), %xmm6 addpd %xmm2, %xmm9 movapd -4 * SIZE(AO), %xmm2 mulpd %xmm7, %xmm2 addpd %xmm3, %xmm13 movapd -2 * SIZE(AO), %xmm3 mulpd %xmm7, %xmm3 movapd -2 * SIZE(BO), %xmm7 subq $-16 * SIZE, AO subq 
$ -8 * SIZE, BO subq $1, %rax jg .L91 ALIGN_4 .L95: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L98 ALIGN_4 .L96: addpd %xmm0, %xmm8 movapd -16 * SIZE(AO), %xmm0 mulpd %xmm4, %xmm0 addpd %xmm1, %xmm12 movapd -14 * SIZE(AO), %xmm1 mulpd %xmm4, %xmm1 movapd -14 * SIZE(BO), %xmm4 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax jg .L96 ALIGN_4 .L98: addpd %xmm0, %xmm8 addpd %xmm1, %xmm12 addpd %xmm2, %xmm9 addpd %xmm3, %xmm13 addpd %xmm9, %xmm8 addpd %xmm13, %xmm12 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 #endif mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm12 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm1, %xmm12 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm12, 2 * SIZE(CO1) movhpd %xmm12, 3 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 subq $1, I jg .L90 ALIGN_4 .L100: testq $2, M jle .L110 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 movapd -16 * SIZE(BO), %xmm4 pxor %xmm9, %xmm9 movapd -14 * SIZE(BO), %xmm5 pxor %xmm10, %xmm10 movapd -12 * SIZE(BO), %xmm6 pxor %xmm11, %xmm11 movapd -10 * SIZE(BO), %xmm7 movapd %xmm8, %xmm0 movapd %xmm8, %xmm1 movapd %xmm8, %xmm2 movapd %xmm8, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L105 ALIGN_4 .L101: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm0, %xmm8 movapd -16 * SIZE(AO), %xmm0 mulpd %xmm4, %xmm0 movapd -8 * SIZE(BO), %xmm4 addpd %xmm1, %xmm9 movapd -14 * SIZE(AO), %xmm1 mulpd %xmm5, %xmm1 movapd -6 * SIZE(BO), %xmm5 addpd %xmm2, %xmm10 movapd -12 * SIZE(AO), %xmm2 mulpd %xmm6, %xmm2 movapd -4 * SIZE(BO), %xmm6 addpd %xmm3, %xmm11 movapd -10 * SIZE(AO), %xmm3 mulpd %xmm7, %xmm3 movapd -2 * SIZE(BO), %xmm7 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax jg .L101 ALIGN_4 .L105: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L108 ALIGN_4 .L106: addpd %xmm0, %xmm8 movapd -16 * SIZE(AO), %xmm0 mulpd %xmm4, %xmm0 movapd -14 * SIZE(BO), %xmm4 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax jg .L106 ALIGN_4 .L108: addpd %xmm0, %xmm8 addpd %xmm1, %xmm9 addpd %xmm2, %xmm10 addpd %xmm3, %xmm11 addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 addpd %xmm9, %xmm8 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 #endif mulpd %xmm7, %xmm8 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK 
#endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 ALIGN_4 .L110: testq $1, M jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 movsd -16 * SIZE(BO), %xmm4 pxor %xmm9, %xmm9 movsd -14 * SIZE(BO), %xmm5 pxor %xmm10, %xmm10 movsd -12 * SIZE(BO), %xmm6 pxor %xmm11, %xmm11 movsd -10 * SIZE(BO), %xmm7 movapd %xmm8, %xmm0 movapd %xmm8, %xmm1 movapd %xmm8, %xmm2 movapd %xmm8, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L115 ALIGN_4 .L111: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm0, %xmm8 movsd -16 * SIZE(AO), %xmm0 mulpd %xmm4, %xmm0 movsd -8 * SIZE(BO), %xmm4 addpd %xmm1, %xmm9 movsd -15 * SIZE(AO), %xmm1 mulpd %xmm5, %xmm1 movsd -6 * SIZE(BO), %xmm5 addpd %xmm2, %xmm10 movsd -14 * SIZE(AO), %xmm2 mulpd %xmm6, %xmm2 movsd -4 * SIZE(BO), %xmm6 addpd %xmm3, %xmm11 movsd -13 * SIZE(AO), %xmm3 mulpd %xmm7, %xmm3 movsd -2 * SIZE(BO), %xmm7 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax jg .L111 ALIGN_4 .L115: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L118 ALIGN_4 .L116: addsd %xmm0, %xmm8 movsd -16 * SIZE(AO), %xmm0 mulsd %xmm4, %xmm0 movsd -14 * SIZE(BO), %xmm4 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax jg .L116 ALIGN_4 .L118: addsd %xmm0, %xmm8 addsd %xmm1, %xmm9 addsd %xmm2, %xmm10 addsd %xmm3, %xmm11 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 #endif addsd %xmm10, %xmm8 addsd %xmm11, %xmm9 addsd %xmm9, %xmm8 mulsd %xmm7, %xmm8 #ifndef TRMMKERNEL addsd %xmm0, %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) ALIGN_4 .L999: movq %r15, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_kernel_4x4_penryn.S000066400000000000000000001123521313527062700220710ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define BB %r12 #define PREA %rdx #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define ALPHA 48(%rsp) #define J 56(%rsp) #define OFFSET 64(%rsp) #define KK 72(%rsp) #define KKK 80(%rsp) #else #define STACKSIZE 512 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define ALPHA 224(%rsp) #define J 232(%rsp) #define OFFSET 240(%rsp) #define KK 248(%rsp) #define KKK 256(%rsp) #endif #ifdef NANO #define PREFETCHSIZE (8 * 2 + 4) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 #endif #ifdef DUNNINGTON #define PREFETCHSIZE (8 * 97 + 4) #define PREFETCHB prefetcht2 #endif #ifndef PREFETCH #define PREFETCH prefetcht0 #endif #ifndef PREFETCHW #define PREFETCHW prefetcht2 #endif #ifndef PREFETCHB #define PREFETCHB prefetcht0 #endif #ifndef PREFETCHSIZE #define PREFETCHSIZE (8 * 17 + 4) #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif movaps %xmm3, %xmm0 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif #endif movlps %xmm0, ALPHA subq $-16 * SIZE, A subq $-17 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K leaq (, LDC, SIZE), LDC #ifdef TRMMKERNEL movq %r11, OFFSET #ifndef LEFT negq %r11 #endif movq %r11, KK #endif movq N, J sarq $2, J NOBRANCH jle .L40 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 
leaq (C, LDC, 1), CO2 movq A, AO movq K, %rax salq $BASE_SHIFT + 2, %rax leaq (B, %rax), BB movq M, I sarq $2, I # i = (m >> 2) NOBRANCH jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif movaps -16 * SIZE(AO), %xmm0 xorpd %xmm3, %xmm3 movaps -14 * SIZE(AO), %xmm1 xorpd %xmm4, %xmm4 movaps -17 * SIZE(BO), %xmm2 PREFETCHB -16 * SIZE(BB) xorpd %xmm5, %xmm5 xorpd %xmm6, %xmm6 PREFETCHW 3 * SIZE(CO1) movaps %xmm4, %xmm8 movaps %xmm4, %xmm9 PREFETCHW 7 * SIZE(CO2) movaps %xmm4, %xmm10 movaps %xmm4, %xmm11 PREFETCHW 3 * SIZE(CO1, LDC, 2) movaps %xmm4, %xmm12 movaps %xmm4, %xmm13 PREFETCHW 7 * SIZE(CO2, LDC, 2) movapd %xmm4, %xmm14 movapd %xmm4, %xmm15 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm3, %xmm11 movaps -15 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps -13 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps -11 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps -9 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -6 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps -7 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movapd %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movapd %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 PADDING PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) addpd %xmm2, %xmm9 movaps -5 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -2 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 subq $-16 * SIZE, AO movaps -3 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps -1 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 subq $-16 * SIZE, BO mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -16 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -14 * SIZE(AO), %xmm1 
subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: PREFETCHB -8 * SIZE(BB) #ifdef DUNNINGTON PREFETCHB 0 * SIZE(BB) PREFETCHB 8 * SIZE(BB) #endif #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addpd %xmm3, %xmm11 movaps -15 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps -13 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_4 .L18: movddup ALPHA, %xmm1 #ifndef DUNNINGTON subq $-16 * SIZE, BB #else subq $-32 * SIZE, BB #endif addpd %xmm3, %xmm11 addpd %xmm4, %xmm15 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm8, %xmm0 movsd %xmm9, %xmm8 mulpd %xmm1, %xmm8 movsd %xmm0, %xmm9 mulpd %xmm1, %xmm9 movaps %xmm10, %xmm0 movsd %xmm11, %xmm10 mulpd %xmm1, %xmm10 movsd %xmm0, %xmm11 mulpd %xmm1, %xmm11 movaps %xmm12, %xmm0 movsd %xmm13, %xmm12 mulpd %xmm1, %xmm12 movsd %xmm0, %xmm13 mulpd %xmm1, %xmm13 movaps %xmm14, %xmm0 movsd %xmm15, %xmm14 mulpd %xmm1, %xmm14 movsd %xmm0, %xmm15 mulpd %xmm1, %xmm15 movq CO1, %rax orq LDC, %rax testq $15, %rax NOBRANCH jne .L18x #ifndef TRMMKERNEL addpd 0 * SIZE(CO1), %xmm8 addpd 2 * SIZE(CO1), %xmm12 addpd 0 * SIZE(CO2), %xmm9 addpd 2 * SIZE(CO2), %xmm13 addpd 0 * SIZE(CO1, LDC, 2), %xmm10 addpd 2 * SIZE(CO1, LDC, 2), %xmm14 addpd 0 * SIZE(CO2, LDC, 2), %xmm11 addpd 2 * SIZE(CO2, LDC, 2), %xmm15 #endif movaps %xmm8, 0 * SIZE(CO1) movaps %xmm12, 2 * SIZE(CO1) movaps %xmm9, 0 * SIZE(CO2) movaps %xmm13, 2 * SIZE(CO2) movaps %xmm10, 0 * SIZE(CO1, LDC, 2) movaps %xmm14, 2 * SIZE(CO1, LDC, 2) movaps %xmm11, 0 * SIZE(CO2, LDC, 2) movaps %xmm15, 2 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- BRANCH jg .L11 jmp .L20 ALIGN_4 .L18x: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 movsd 0 * SIZE(CO2), %xmm2 movhpd 1 * SIZE(CO2), %xmm2 movsd 2 * SIZE(CO2), %xmm3 movhpd 3 * SIZE(CO2), %xmm3 movsd 0 * SIZE(CO1, LDC, 2), %xmm4 movhpd 1 * SIZE(CO1, LDC, 2), %xmm4 movsd 2 * SIZE(CO1, LDC, 2), %xmm5 movhpd 3 * SIZE(CO1, LDC, 2), %xmm5 movsd 0 * SIZE(CO2, LDC, 2), %xmm6 movhpd 1 * SIZE(CO2, LDC, 2), %xmm6 movsd 2 * SIZE(CO2, LDC, 2), %xmm7 movhpd 3 * SIZE(CO2, LDC, 2), %xmm7 addpd %xmm0, %xmm8 addpd %xmm1, %xmm12 addpd %xmm2, %xmm9 addpd %xmm3, %xmm13 addpd %xmm4, %xmm10 addpd %xmm5, %xmm14 addpd %xmm6, %xmm11 addpd %xmm7, %xmm15 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm12, 2 * SIZE(CO1) movhpd %xmm12, 3 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhpd %xmm9, 1 * SIZE(CO2) movsd %xmm13, 2 * SIZE(CO2) movhpd %xmm13, 3 * SIZE(CO2) movsd %xmm10, 0 * SIZE(CO1, LDC, 2) movhpd %xmm10, 1 * SIZE(CO1, LDC, 2) movsd %xmm14, 2 * SIZE(CO1, LDC, 2) movhpd %xmm14, 3 * SIZE(CO1, LDC, 2) 
movsd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm11, 1 * SIZE(CO2, LDC, 2) movsd %xmm15, 2 * SIZE(CO2, LDC, 2) movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- BRANCH jg .L11 ALIGN_4 .L20: testq $2, M BRANCH jle .L30 ALIGN_4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif movaps -16 * SIZE(AO), %xmm0 movaps -17 * SIZE(BO), %xmm2 movaps -15 * SIZE(BO), %xmm3 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 movaps %xmm3, %xmm8 movaps %xmm3, %xmm9 movaps %xmm3, %xmm10 movaps %xmm3, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_4 .L22: addpd %xmm3, %xmm11 movaps -15 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 addpd %xmm2, %xmm9 movaps -13 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -14 * SIZE(AO), %xmm0 addpd %xmm3, %xmm11 movaps -11 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 addpd %xmm2, %xmm9 movaps -9 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 addpd %xmm3, %xmm11 movaps -7 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 addpd %xmm2, %xmm9 movaps -5 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -10 * SIZE(AO), %xmm0 addpd %xmm3, %xmm11 movaps -3 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 subq $ -8 * SIZE, AO addpd %xmm2, %xmm9 movaps -1 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: addpd %xmm3, %xmm11 movaps -15 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 addpd %xmm2, %xmm9 movaps -13 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_4 .L28: addpd %xmm3, %xmm11 addpd %xmm5, %xmm10 movddup ALPHA, %xmm3 movaps %xmm8, %xmm0 movsd %xmm9, %xmm8 mulpd %xmm3, %xmm8 movsd %xmm0, %xmm9 mulpd %xmm3, %xmm9 movaps %xmm10, %xmm0 movsd %xmm11, %xmm10 mulpd %xmm3, %xmm10 movsd %xmm0, %xmm11 mulpd %xmm3, %xmm11 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm2 
movhpd 1 * SIZE(CO2), %xmm2 movsd 0 * SIZE(CO1, LDC, 2), %xmm4 movhpd 1 * SIZE(CO1, LDC, 2), %xmm4 movsd 0 * SIZE(CO2, LDC, 2), %xmm6 movhpd 1 * SIZE(CO2, LDC, 2), %xmm6 addpd %xmm0, %xmm8 addpd %xmm2, %xmm9 addpd %xmm4, %xmm10 addpd %xmm6, %xmm11 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhpd %xmm9, 1 * SIZE(CO2) movsd %xmm10, 0 * SIZE(CO1, LDC, 2) movhpd %xmm10, 1 * SIZE(CO1, LDC, 2) movsd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm11, 1 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 ALIGN_4 .L30: testq $1, M BRANCH jle .L39 ALIGN_4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax addq %rax, AO leaq (BO, %rax, 4), BO #endif movsd -16 * SIZE(AO), %xmm0 movaps -17 * SIZE(BO), %xmm2 movaps -15 * SIZE(BO), %xmm3 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -15 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -13 * SIZE(BO), %xmm2 addpd %xmm3, %xmm9 movaps -11 * SIZE(BO), %xmm3 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm10 movaps -9 * SIZE(BO), %xmm2 addpd %xmm3, %xmm11 movaps -7 * SIZE(BO), %xmm3 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -13 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -5 * SIZE(BO), %xmm2 addpd %xmm3, %xmm9 movaps -3 * SIZE(BO), %xmm3 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -12 * SIZE(AO), %xmm0 addpd %xmm2, %xmm10 movaps -1 * SIZE(BO), %xmm2 addpd %xmm3, %xmm11 movaps 1 * SIZE(BO), %xmm3 subq $ -4 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -15 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -13 * SIZE(BO), %xmm2 addpd %xmm3, %xmm9 movaps -11 * SIZE(BO), %xmm3 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_4 .L38: movddup ALPHA, %xmm3 addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 0 * SIZE(CO2), %xmm0 movsd 0 * SIZE(CO1, LDC, 2), %xmm1 movhpd 0 * SIZE(CO2, LDC, 2), %xmm1 #endif mulpd %xmm3, %xmm8 mulpd %xmm3, %xmm9 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm1, %xmm9 #endif movlpd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 0 * SIZE(CO2) movlpd %xmm9, 0 * SIZE(CO1, LDC, 2) movhpd %xmm9, 0 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax addq %rax, AO 
leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK #endif movq BO, B leaq (C, LDC, 4), C subq $1, J BRANCH jg .L01 ALIGN_4 .L40: testq $2, N BRANCH jle .L80 movq C, CO1 leaq (C, LDC, 1), CO2 movq A, AO #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 1, %rax leaq (B, %rax), BB movq M, I sarq $2, I # i = (m >> 2) NOBRANCH jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif PREFETCHB -16 * SIZE(BB) subq $-4 * SIZE, BB movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 movaps -17 * SIZE(BO), %xmm2 PREFETCHW 3 * SIZE(CO1) xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 PREFETCHW 3 * SIZE(CO2) xorps %xmm12, %xmm12 xorps %xmm13, %xmm13 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_4 .L52: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -15 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -6 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -13 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -2 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -11 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 2 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -9 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax BRANCH jg .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -15 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_4 .L58: movddup ALPHA, %xmm3 movaps %xmm8, %xmm0 movsd %xmm9, %xmm8 mulpd %xmm3, %xmm8 movsd %xmm0, %xmm9 mulpd %xmm3, %xmm9 movaps %xmm12, %xmm0 movsd %xmm13, %xmm12 mulpd %xmm3, %xmm12 movsd %xmm0, %xmm13 mulpd %xmm3, %xmm13 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), 
%xmm1 movhpd 3 * SIZE(CO1), %xmm1 movsd 0 * SIZE(CO2), %xmm2 movhpd 1 * SIZE(CO2), %xmm2 movsd 2 * SIZE(CO2), %xmm3 movhpd 3 * SIZE(CO2), %xmm3 addpd %xmm0, %xmm8 addpd %xmm1, %xmm12 addpd %xmm2, %xmm9 addpd %xmm3, %xmm13 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm12, 2 * SIZE(CO1) movhpd %xmm12, 3 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhpd %xmm9, 1 * SIZE(CO2) movsd %xmm13, 2 * SIZE(CO2) movhpd %xmm13, 3 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 decq I BRANCH jg .L51 ALIGN_4 .L60: testq $2, M BRANCH jle .L70 ALIGN_4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif movaps -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movaps -17 * SIZE(BO), %xmm2 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_4 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 addpd %xmm7, %xmm8 movaps -15 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 addpd %xmm2, %xmm11 addpd %xmm7, %xmm10 movaps -13 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -10 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 addpd %xmm7, %xmm8 movaps -11 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -8 * SIZE(AO), %xmm0 addpd %xmm2, %xmm11 addpd %xmm7, %xmm10 movaps -9 * SIZE(BO), %xmm2 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 addpd %xmm7, %xmm8 movaps -15 * SIZE(BO), %xmm2 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_4 .L68: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 movddup ALPHA, %xmm3 movaps %xmm8, %xmm0 movsd %xmm9, %xmm8 mulpd %xmm3, %xmm8 movsd %xmm0, %xmm9 mulpd %xmm3, %xmm9 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm2 movhpd 1 * SIZE(CO2), %xmm2 addpd %xmm0, %xmm8 addpd %xmm2, %xmm9 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhpd %xmm9, 1 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 ALIGN_4 .L70: testq $1, M BRANCH jle .L79 ALIGN_4 #if !defined(TRMMKERNEL) || \ 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax addq %rax, AO leaq (BO, %rax, 2), BO #endif movsd -16 * SIZE(AO), %xmm0 movaps -17 * SIZE(BO), %xmm2 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -15 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -15 * SIZE(BO), %xmm2 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 movaps -13 * SIZE(BO), %xmm2 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -13 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -11 * SIZE(BO), %xmm2 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -12 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 movaps -9 * SIZE(BO), %xmm2 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -15 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -15 * SIZE(BO), %xmm2 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L76 ALIGN_4 .L78: movddup ALPHA, %xmm3 addpd %xmm9, %xmm8 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 0 * SIZE(CO2), %xmm0 #endif mulpd %xmm3, %xmm8 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 #endif movlpd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 0 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax addq %rax, AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif leaq (C, LDC, 2), C movq BO, B ALIGN_4 .L80: testq $1, N BRANCH jle .L999 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 movq A, AO movq M, I sarq $2, I # i = (m >> 2) NOBRANCH jle .L100 ALIGN_4 .L91: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO addq %rax, BO #endif movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 movsd -17 * SIZE(BO), %xmm2 PREFETCHW 3 * SIZE(CO1) xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm12, %xmm12 xorps %xmm13, %xmm13 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L95 ALIGN_4 .L92: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -16 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps -10 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -15 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 
movaps -6 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -14 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps -2 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -13 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps 2 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 subq $-16 * SIZE, AO subq $ -4 * SIZE, BO subq $1, %rax BRANCH jg .L92 ALIGN_4 .L95: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -16 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps -10 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 addq $4 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L96 ALIGN_4 .L98: movddup ALPHA, %xmm3 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 #endif mulpd %xmm3, %xmm8 mulpd %xmm3, %xmm12 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm1, %xmm12 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm12, 2 * SIZE(CO1) movhpd %xmm12, 3 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO addq %rax, BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 decq I BRANCH jg .L91 ALIGN_4 .L100: testq $2, M BRANCH jle .L110 ALIGN_4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO addq %rax, BO #endif movaps -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -17 * SIZE(BO), %xmm2 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L105 ALIGN_4 .L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 movsd -16 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -14 * SIZE(AO), %xmm0 addpd %xmm3, %xmm8 pshufd $0x44, %xmm2, %xmm3 movsd -15 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -12 * SIZE(AO), %xmm0 addpd %xmm3, %xmm9 pshufd $0x44, %xmm2, %xmm3 movsd -14 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -10 * SIZE(AO), %xmm0 addpd %xmm3, %xmm8 pshufd $0x44, %xmm2, %xmm3 movsd -13 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -8 * SIZE(AO), %xmm0 addpd %xmm3, %xmm9 subq $-8 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L102 ALIGN_4 .L105: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L108 ALIGN_4 .L106: pshufd $0x44, %xmm2, %xmm3 movsd -16 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -14 * SIZE(AO), %xmm0 addpd %xmm3, %xmm8 addq $2 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L106 ALIGN_4 .L108: addpd %xmm9, %xmm8 movddup ALPHA, %xmm3 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 #endif mulpd %xmm3, %xmm8 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 
#endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO addq %rax, BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 ALIGN_4 .L110: testq $1, M BRANCH jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax addq %rax, AO addq %rax, BO #endif movsd -16 * SIZE(AO), %xmm0 movsd -17 * SIZE(BO), %xmm2 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L115 ALIGN_4 .L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -15 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -14 * SIZE(AO), %xmm0 movsd -15 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -13 * SIZE(AO), %xmm0 movsd -14 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -12 * SIZE(AO), %xmm0 movsd -13 * SIZE(BO), %xmm2 subq $-4 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L112 ALIGN_4 .L115: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -15 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 addq $1 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L116 ALIGN_4 .L118: movddup ALPHA, %xmm3 addpd %xmm9, %xmm8 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 #endif mulsd %xmm3, %xmm8 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 #endif movlpd %xmm8, 0 * SIZE(CO1) ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_kernel_4x4_sse2.S000066400000000000000000001554771313527062700214510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define ALPHA 0(%rsp) #define J 16(%rsp) #define OFFSET 24(%rsp) #define KK 32(%rsp) #define KKK 40(%rsp) #define BUFFER 256(%rsp) #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (8 * 9 + 4) #define movsd movlps #define movapd movaps #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 13 + 4) #define movapd movaps #endif #ifndef GENERIC #define KERNEL1(xx) \ mulpd %xmm0, %xmm1 ;\ addpd %xmm1, %xmm8 ;\ movaps -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulpd %xmm0, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd -14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm0, %xmm5 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ addpd %xmm5, %xmm10 ;\ movapd -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addpd %xmm0, %xmm11 ;\ movapd -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 #define KERNEL2(xx) \ mulpd %xmm2, %xmm1 ;\ addpd %xmm1, %xmm12 ;\ movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulpd %xmm2, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm2, %xmm5 ;\ mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ addpd %xmm5, %xmm14 ;\ movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addpd %xmm2, %xmm15 ;\ movapd -6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 #define KERNEL3(xx) \ mulpd %xmm4, %xmm7 ;\ addpd %xmm7, %xmm8 ;\ movapd -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulpd %xmm4, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm4, %xmm5 ;\ mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ addpd %xmm5, 
%xmm10 ;\ movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addpd %xmm4, %xmm11 ;\ movapd -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 #define KERNEL4(xx) \ mulpd %xmm6, %xmm7 ;\ addpd %xmm7, %xmm12 ;\ movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulpd %xmm6, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm6, %xmm5 ;\ mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ addpd %xmm5, %xmm14 ;\ movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ addpd %xmm6, %xmm15 ;\ movapd -2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 #define KERNEL5(xx) \ mulpd %xmm0, %xmm1 ;\ addpd %xmm1, %xmm8 ;\ movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulpd %xmm0, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm0, %xmm5 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ addpd %xmm5, %xmm10 ;\ movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addpd %xmm0, %xmm11 ;\ movapd 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 #define KERNEL6(xx) \ mulpd %xmm2, %xmm1 ;\ addpd %xmm1, %xmm12 ;\ movapd 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulpd %xmm2, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm2, %xmm5 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ addpd %xmm5, %xmm14 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addpd %xmm2, %xmm15 ;\ movapd 2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 #define KERNEL7(xx) \ mulpd %xmm4, %xmm7 ;\ addpd %xmm7, %xmm8 ;\ movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulpd %xmm4, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm4, %xmm5 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ addpd %xmm5, %xmm10 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addpd %xmm4, %xmm11 ;\ movapd 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 #define KERNEL8(xx) \ mulpd %xmm6, %xmm7 ;\ addpd %xmm7, %xmm12 ;\ movapd 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulpd %xmm6, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd 18 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm6, %xmm5 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ addpd %xmm5, %xmm14 ;\ movapd 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addpd %xmm6, %xmm15 ;\ movapd 6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 #else #define KERNEL1(xx) \ mulpd %xmm0, %xmm1 ;\ addpd %xmm1, %xmm8 ;\ movapd -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulpd %xmm0, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd -14 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm0, %xmm5 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ addpd %xmm5, %xmm10 ;\ movapd -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm0, %xmm11 ;\ movapd -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 #define KERNEL2(xx) \ mulpd %xmm2, %xmm1 ;\ addpd %xmm1, %xmm12 ;\ movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulpd %xmm2, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm2, %xmm5 ;\ mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ addpd %xmm5, %xmm14 ;\ movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm2, %xmm15 ;\ movapd -6 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 #define KERNEL3(xx) \ mulpd %xmm4, %xmm7 ;\ addpd %xmm7, %xmm8 ;\ movapd -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ 
mulpd %xmm4, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm4, %xmm5 ;\ mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ addpd %xmm5, %xmm10 ;\ movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm4, %xmm11 ;\ movapd -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 #define KERNEL4(xx) \ mulpd %xmm6, %xmm7 ;\ addpd %xmm7, %xmm12 ;\ movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulpd %xmm6, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm6, %xmm5 ;\ mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ addpd %xmm5, %xmm14 ;\ movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ addpd %xmm6, %xmm15 ;\ movapd -2 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 #define KERNEL5(xx) \ mulpd %xmm0, %xmm1 ;\ addpd %xmm1, %xmm8 ;\ movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulpd %xmm0, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm0, %xmm5 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ addpd %xmm5, %xmm10 ;\ movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm0, %xmm11 ;\ movapd 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 #define KERNEL6(xx) \ mulpd %xmm2, %xmm1 ;\ addpd %xmm1, %xmm12 ;\ movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulpd %xmm2, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm2, %xmm5 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ addpd %xmm5, %xmm14 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm2, %xmm15 ;\ movapd 2 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 #define KERNEL7(xx) \ mulpd %xmm4, %xmm7 ;\ addpd %xmm7, %xmm8 ;\ movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulpd %xmm4, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm4, %xmm5 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ addpd %xmm5, %xmm10 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm4, %xmm11 ;\ movapd 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 #define KERNEL8(xx) \ mulpd %xmm6, %xmm7 ;\ addpd %xmm7, %xmm12 ;\ movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulpd %xmm6, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm6, %xmm5 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ addpd %xmm5, %xmm14 ;\ movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm6, %xmm15 ;\ movapd 6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif movaps %xmm3, %xmm0 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif #endif EMMS movq %rsp, %rbx # save old stack subq $256 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq OLD_M, M movq OLD_N, N subq $-16 * SIZE, A unpcklpd %xmm0, %xmm0 movapd %xmm0, ALPHA leaq (, LDC, SIZE), LDC #ifdef TRMMKERNEL movsd %xmm12, OFFSET movsd %xmm12, KK #ifndef LEFT negq KK #endif #endif movq N, J sarq $2, J # j = (n >> 2) jle .L40 
ALIGN_3 .L01: /* Copying to Sub Buffer */ leaq 16 * SIZE + BUFFER, BO movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq K, %rax sarq $2, %rax jle .L03 ALIGN_3 #define RPREFETCHSIZE (8 * 7 + 4) #define WPREFETCHSIZE (8 * 8 + 4) .L02: PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) movq 0 * SIZE(B), %mm0 movq %mm0, -16 * SIZE(BO) movq %mm0, -15 * SIZE(BO) movq 1 * SIZE(B), %mm1 movq %mm1, -14 * SIZE(BO) movq %mm1, -13 * SIZE(BO) movq 2 * SIZE(B), %mm2 movq %mm2, -12 * SIZE(BO) movq %mm2, -11 * SIZE(BO) movq 3 * SIZE(B), %mm3 movq %mm3, -10 * SIZE(BO) movq %mm3, -9 * SIZE(BO) PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO) movq 4 * SIZE(B), %mm4 movq %mm4, -8 * SIZE(BO) movq %mm4, -7 * SIZE(BO) movq 5 * SIZE(B), %mm5 movq %mm5, -6 * SIZE(BO) movq %mm5, -5 * SIZE(BO) PREFETCHW (WPREFETCHSIZE + 8) * SIZE(BO) movq 6 * SIZE(B), %mm6 movq %mm6, -4 * SIZE(BO) movq %mm6, -3 * SIZE(BO) movq 7 * SIZE(B), %mm7 movq %mm7, -2 * SIZE(BO) movq %mm7, -1 * SIZE(BO) PREFETCH (RPREFETCHSIZE + 8) * SIZE(B) movq 8 * SIZE(B), %mm0 movq %mm0, 0 * SIZE(BO) movq %mm0, 1 * SIZE(BO) movq 9 * SIZE(B), %mm1 movq %mm1, 2 * SIZE(BO) movq %mm1, 3 * SIZE(BO) movq 10 * SIZE(B), %mm2 movq %mm2, 4 * SIZE(BO) movq %mm2, 5 * SIZE(BO) movq 11 * SIZE(B), %mm3 movq %mm3, 6 * SIZE(BO) movq %mm3, 7 * SIZE(BO) PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) movq 12 * SIZE(B), %mm4 movq %mm4, 8 * SIZE(BO) movq %mm4, 9 * SIZE(BO) movq 13 * SIZE(B), %mm5 movq %mm5, 10 * SIZE(BO) movq %mm5, 11 * SIZE(BO) PREFETCHW (WPREFETCHSIZE + 24) * SIZE(BO) movq 14 * SIZE(B), %mm6 movq %mm6, 12 * SIZE(BO) movq %mm6, 13 * SIZE(BO) movq 15 * SIZE(B), %mm7 movq %mm7, 14 * SIZE(BO) movq %mm7, 15 * SIZE(BO) addq $ 32 * SIZE, BO subq $-16 * SIZE, B subq $1, %rax jne .L02 ALIGN_3 .L03: movq K, %rax andq $3, %rax BRANCH jle .L10 ALIGN_3 .L04: movq 0 * SIZE(B), %mm0 movq %mm0, -16 * SIZE(BO) movq %mm0, -15 * SIZE(BO) movq 1 * SIZE(B), %mm1 movq %mm1, -14 * SIZE(BO) movq %mm1, -13 * SIZE(BO) movq 2 * SIZE(B), %mm2 movq %mm2, -12 * SIZE(BO) movq %mm2, -11 * SIZE(BO) movq 3 * SIZE(B), %mm3 movq %mm3, -10 * SIZE(BO) movq %mm3, -9 * SIZE(BO) addq $4 * SIZE, B addq $8 * SIZE, BO subq $1, %rax jne .L04 ALIGN_3 .L10: movq A, AO # aoffset = a leaq (RPREFETCHSIZE + 0) * SIZE(B), BB movq M, I sarq $2, I # i = (m >> 2) jle .L20 ALIGN_3 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm1 pxor %xmm8, %xmm8 movapd -14 * SIZE(AO), %xmm2 movapd -14 * SIZE(BO), %xmm3 pxor %xmm9, %xmm9 movapd -12 * SIZE(AO), %xmm4 movapd -12 * SIZE(BO), %xmm5 pxor %xmm10, %xmm10 movapd -10 * SIZE(AO), %xmm6 movapd -8 * SIZE(BO), %xmm7 pxor %xmm11, %xmm11 PREFETCHW 3 * SIZE(CO1) pxor %xmm12, %xmm12 PREFETCHW 7 * SIZE(CO2) pxor %xmm13, %xmm13 PREFETCHW 3 * SIZE(CO1, LDC, 2) pxor %xmm14, %xmm14 PREFETCHW 7 * SIZE(CO2, LDC, 2) pxor %xmm15, %xmm15 PREFETCH 0 * SIZE(BB) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif #ifndef GENERIC andq $-8, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, 
%rax, 8), BO negq %rax NOBRANCH je .L15 ALIGN_3 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax BRANCH jl .L12 ALIGN_3 .L15: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif testq $4, %rax je .L16 xorq %rax, %rax ALIGN_3 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addq $32 * SIZE, BO addq $16 * SIZE, AO ALIGN_3 #else sarq $2, %rax NOBRANCH jle .L16 ALIGN_3 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addq $ 32 * SIZE, BO subq $-16 * SIZE, AO decq %rax BRANCH jg .L12 #endif .L16: movapd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L19 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO negq %rax ALIGN_3 .L17: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movapd -14 * SIZE(BO, %rax, 8), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movapd -12 * SIZE(BO, %rax, 8), %xmm1 mulpd %xmm0, %xmm1 mulpd -10 * SIZE(BO, %rax, 8), %xmm0 addpd %xmm1, %xmm10 movapd -16 * SIZE(BO, %rax, 8), %xmm1 addpd %xmm0, %xmm11 movapd -12 * SIZE(AO, %rax, 4), %xmm0 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm12 movapd -14 * 
SIZE(BO, %rax, 8), %xmm1 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm13 movapd -12 * SIZE(BO, %rax, 8), %xmm1 mulpd %xmm2, %xmm1 mulpd -10 * SIZE(BO, %rax, 8), %xmm2 addpd %xmm1, %xmm14 movapd -8 * SIZE(BO, %rax, 8), %xmm1 addpd %xmm2, %xmm15 movapd -10 * SIZE(AO, %rax, 4), %xmm2 addq $SIZE, %rax jl .L17 ALIGN_3 .L19: PREFETCH 8 * SIZE(BB) subq $-12 * SIZE, BB #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 movsd 0 * SIZE(CO2), %xmm2 movhpd 1 * SIZE(CO2), %xmm2 movsd 2 * SIZE(CO2), %xmm3 movhpd 3 * SIZE(CO2), %xmm3 #endif mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm7, %xmm10 mulpd %xmm7, %xmm11 mulpd %xmm7, %xmm12 mulpd %xmm7, %xmm13 mulpd %xmm7, %xmm14 mulpd %xmm7, %xmm15 #ifndef TRMMKERNEL movlpd 0 * SIZE(CO1, LDC, 2), %xmm4 movhpd 1 * SIZE(CO1, LDC, 2), %xmm4 movlpd 2 * SIZE(CO1, LDC, 2), %xmm5 movhpd 3 * SIZE(CO1, LDC, 2), %xmm5 movlpd 0 * SIZE(CO2, LDC, 2), %xmm6 movhpd 1 * SIZE(CO2, LDC, 2), %xmm6 movlpd 2 * SIZE(CO2, LDC, 2), %xmm7 movhpd 3 * SIZE(CO2, LDC, 2), %xmm7 addpd %xmm0, %xmm8 addpd %xmm1, %xmm12 addpd %xmm2, %xmm9 addpd %xmm3, %xmm13 #endif movlpd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movlpd %xmm12, 2 * SIZE(CO1) movhpd %xmm12, 3 * SIZE(CO1) movlpd %xmm9, 0 * SIZE(CO2) movhpd %xmm9, 1 * SIZE(CO2) movlpd %xmm13, 2 * SIZE(CO2) movhpd %xmm13, 3 * SIZE(CO2) #ifndef TRMMKERNEL addpd %xmm4, %xmm10 addpd %xmm5, %xmm14 addpd %xmm6, %xmm11 addpd %xmm7, %xmm15 #endif movlpd %xmm10, 0 * SIZE(CO1, LDC, 2) movhpd %xmm10, 1 * SIZE(CO1, LDC, 2) movlpd %xmm14, 2 * SIZE(CO1, LDC, 2) movhpd %xmm14, 3 * SIZE(CO1, LDC, 2) movlpd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm11, 1 * SIZE(CO2, LDC, 2) movlpd %xmm15, 2 * SIZE(CO2, LDC, 2) movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- BRANCH jg .L11 ALIGN_3 .L20: testq $3, M je .L39 testq $2, M je .L30 ALIGN_3 .L21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd 0 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 movapd -8 * SIZE(AO), %xmm2 pxor %xmm10, %xmm10 movapd 8 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 movapd 16 * SIZE(BO), %xmm5 movapd 24 * SIZE(BO), %xmm7 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L25 ALIGN_3 .L22: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd 2 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movapd 4 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 mulpd 6 * SIZE(BO), %xmm0 addpd %xmm1, %xmm10 movapd 32 * SIZE(BO), %xmm1 addpd %xmm0, %xmm11 movapd -14 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm8 movapd 10 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm9 movapd 12 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BO), %xmm0 addpd 
%xmm3, %xmm10 movapd 40 * SIZE(BO), %xmm3 addpd %xmm0, %xmm11 movapd -12 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 addpd %xmm5, %xmm8 movapd 18 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm5 addpd %xmm5, %xmm9 movapd 20 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm5 mulpd 22 * SIZE(BO), %xmm0 addpd %xmm5, %xmm10 movapd 48 * SIZE(BO), %xmm5 addpd %xmm0, %xmm11 movapd -10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm7 addpd %xmm7, %xmm8 movapd 26 * SIZE(BO), %xmm7 mulpd %xmm0, %xmm7 addpd %xmm7, %xmm9 movapd 28 * SIZE(BO), %xmm7 mulpd %xmm0, %xmm7 mulpd 30 * SIZE(BO), %xmm0 addpd %xmm7, %xmm10 movapd 56 * SIZE(BO), %xmm7 addpd %xmm0, %xmm11 movapd 0 * SIZE(AO), %xmm0 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm2, %xmm1 addpd %xmm1, %xmm8 movapd 34 * SIZE(BO), %xmm1 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm9 movapd 36 * SIZE(BO), %xmm1 mulpd %xmm2, %xmm1 mulpd 38 * SIZE(BO), %xmm2 addpd %xmm1, %xmm10 movapd 64 * SIZE(BO), %xmm1 addpd %xmm2, %xmm11 movapd -6 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm8 movapd 42 * SIZE(BO), %xmm3 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm9 movapd 44 * SIZE(BO), %xmm3 mulpd %xmm2, %xmm3 mulpd 46 * SIZE(BO), %xmm2 addpd %xmm3, %xmm10 movapd 72 * SIZE(BO), %xmm3 addpd %xmm2, %xmm11 movapd -4 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm5 addpd %xmm5, %xmm8 movapd 50 * SIZE(BO), %xmm5 mulpd %xmm2, %xmm5 addpd %xmm5, %xmm9 movapd 52 * SIZE(BO), %xmm5 mulpd %xmm2, %xmm5 mulpd 54 * SIZE(BO), %xmm2 addpd %xmm5, %xmm10 movapd 80 * SIZE(BO), %xmm5 addpd %xmm2, %xmm11 movapd -2 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm7 addpd %xmm7, %xmm8 movapd 58 * SIZE(BO), %xmm7 mulpd %xmm2, %xmm7 addpd %xmm7, %xmm9 movapd 60 * SIZE(BO), %xmm7 mulpd %xmm2, %xmm7 mulpd 62 * SIZE(BO), %xmm2 addpd %xmm7, %xmm10 movapd 88 * SIZE(BO), %xmm7 addpd %xmm2, %xmm11 movapd 8 * SIZE(AO), %xmm2 addq $16 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L22 ALIGN_3 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movapd ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L29 ALIGN_3 .L26: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movapd 2 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movapd 4 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 mulpd 6 * SIZE(BO), %xmm0 addpd %xmm1, %xmm10 movapd 8 * SIZE(BO), %xmm1 addpd %xmm0, %xmm11 movapd -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L26 ALIGN_3 .L29: #ifndef TRMMKERNEL movlpd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movlpd 0 * SIZE(CO2), %xmm2 movhpd 1 * SIZE(CO2), %xmm2 movlpd 0 * SIZE(CO1, LDC, 2), %xmm4 movhpd 1 * SIZE(CO1, LDC, 2), %xmm4 movlpd 0 * SIZE(CO2, LDC, 2), %xmm6 movhpd 1 * SIZE(CO2, LDC, 2), %xmm6 #endif mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm7, %xmm10 mulpd %xmm7, %xmm11 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm2, %xmm9 addpd %xmm4, %xmm10 addpd %xmm6, %xmm11 #endif movlpd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movlpd %xmm9, 0 * SIZE(CO2) movhpd %xmm9, 1 * SIZE(CO2) movlpd %xmm10, 0 * SIZE(CO1, LDC, 2) movhpd %xmm10, 1 * SIZE(CO1, LDC, 2) movlpd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm11, 1 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 ALIGN_3 .L30: testq $1, M je .L39 ALIGN_3 .L31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && 
defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif movsd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movsd 0 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 movsd -8 * SIZE(AO), %xmm2 pxor %xmm10, %xmm10 movsd 8 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 movsd 16 * SIZE(BO), %xmm5 movsd 24 * SIZE(BO), %xmm7 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L35 ALIGN_3 .L32: mulsd %xmm0, %xmm1 addsd %xmm1, %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd 2 * SIZE(BO), %xmm1 mulsd %xmm0, %xmm1 addsd %xmm1, %xmm9 movsd 4 * SIZE(BO), %xmm1 mulsd %xmm0, %xmm1 mulsd 6 * SIZE(BO), %xmm0 addsd %xmm1, %xmm10 movsd 32 * SIZE(BO), %xmm1 addsd %xmm0, %xmm11 movsd -15 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm8 movsd 10 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm9 movsd 12 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 mulsd 14 * SIZE(BO), %xmm0 addsd %xmm3, %xmm10 movsd 40 * SIZE(BO), %xmm3 addsd %xmm0, %xmm11 movsd -14 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm5 addsd %xmm5, %xmm8 movsd 18 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 addsd %xmm5, %xmm9 movsd 20 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 mulsd 22 * SIZE(BO), %xmm0 addsd %xmm5, %xmm10 movsd 48 * SIZE(BO), %xmm5 addsd %xmm0, %xmm11 movsd -13 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm7 addsd %xmm7, %xmm8 movsd 26 * SIZE(BO), %xmm7 mulsd %xmm0, %xmm7 addsd %xmm7, %xmm9 movsd 28 * SIZE(BO), %xmm7 mulsd %xmm0, %xmm7 mulsd 30 * SIZE(BO), %xmm0 addsd %xmm7, %xmm10 movsd 56 * SIZE(BO), %xmm7 addsd %xmm0, %xmm11 movsd -12 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm1 addsd %xmm1, %xmm8 movsd 34 * SIZE(BO), %xmm1 mulsd %xmm0, %xmm1 addsd %xmm1, %xmm9 movsd 36 * SIZE(BO), %xmm1 mulsd %xmm0, %xmm1 mulsd 38 * SIZE(BO), %xmm0 addsd %xmm1, %xmm10 movsd 64 * SIZE(BO), %xmm1 addsd %xmm0, %xmm11 movsd -11 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm8 movsd 42 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm9 movsd 44 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 mulsd 46 * SIZE(BO), %xmm0 addsd %xmm3, %xmm10 movsd 72 * SIZE(BO), %xmm3 addsd %xmm0, %xmm11 movsd -10 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm5 addsd %xmm5, %xmm8 movsd 50 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 addsd %xmm5, %xmm9 movsd 52 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 mulsd 54 * SIZE(BO), %xmm0 addsd %xmm5, %xmm10 movsd 80 * SIZE(BO), %xmm5 addsd %xmm0, %xmm11 movsd -9 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm7 addsd %xmm7, %xmm8 movsd 58 * SIZE(BO), %xmm7 mulsd %xmm0, %xmm7 addsd %xmm7, %xmm9 movsd 60 * SIZE(BO), %xmm7 mulsd %xmm0, %xmm7 mulsd 62 * SIZE(BO), %xmm0 addsd %xmm7, %xmm10 movsd 88 * SIZE(BO), %xmm7 addsd %xmm0, %xmm11 movsd -8 * SIZE(AO), %xmm0 addq $ 8 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L32 ALIGN_3 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movsd ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: mulsd %xmm0, %xmm1 addsd %xmm1, %xmm8 movsd 2 * SIZE(BO), %xmm1 mulsd %xmm0, %xmm1 addsd %xmm1, %xmm9 movsd 4 * SIZE(BO), %xmm1 mulsd %xmm0, %xmm1 mulsd 6 * SIZE(BO), %xmm0 addsd %xmm1, %xmm10 movsd 8 * SIZE(BO), %xmm1 addsd %xmm0, %xmm11 movsd -15 * SIZE(AO), %xmm0 addq $1 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L36 ALIGN_3 .L38: #ifndef TRMMKERNEL 
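/* Write-back for this M = 1 tail: the four accumulators are scaled by ALPHA
   and, outside the TRMM variant, the current C entries loaded just below are
   added in before the store, so this sub-block effectively computes
   C = alpha*A*B + C (beta is assumed to have been applied before the kernel
   runs). A minimal C sketch of the same update, names hypothetical:

       static void store_tail(double alpha, const double *acc,
                              double *c, long ldc) {
           for (int j = 0; j < 4; j++)          // one element per column here
               c[j * ldc] = alpha * acc[j] + c[j * ldc];
       }
*/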
movsd 0 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm2 movsd 0 * SIZE(CO1, LDC, 2), %xmm4 movsd 0 * SIZE(CO2, LDC, 2), %xmm6 #endif mulsd %xmm7, %xmm8 mulsd %xmm7, %xmm9 mulsd %xmm7, %xmm10 mulsd %xmm7, %xmm11 #ifndef TRMMKERNEL addsd %xmm0, %xmm8 addsd %xmm2, %xmm9 addsd %xmm4, %xmm10 addsd %xmm6, %xmm11 #endif movsd %xmm8, 0 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movsd %xmm10, 0 * SIZE(CO1, LDC, 2) movsd %xmm11, 0 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_3 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leaq (C, LDC, 4), C # c += 4 * ldc decq J # j -- jg .L01 ALIGN_3 .L40: testq $3, N je .L999 testq $2, N je .L80 ALIGN_4 .L41: /* Copying to Sub Buffer */ leaq BUFFER, BO #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq K, %rax sarq $2, %rax jle .L43 ALIGN_3 .L42: PREFETCH 56 * SIZE(B) movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq 2 * SIZE(B), %mm2 movq 3 * SIZE(B), %mm3 movq 4 * SIZE(B), %mm4 movq 5 * SIZE(B), %mm5 movq 6 * SIZE(B), %mm6 movq 7 * SIZE(B), %mm7 addq $ 8 * SIZE, B addq $16 * SIZE, BO movq %mm0, -16 * SIZE(BO) movq %mm0, -15 * SIZE(BO) movq %mm1, -14 * SIZE(BO) movq %mm1, -13 * SIZE(BO) movq %mm2, -12 * SIZE(BO) movq %mm2, -11 * SIZE(BO) movq %mm3, -10 * SIZE(BO) movq %mm3, -9 * SIZE(BO) movq %mm4, -8 * SIZE(BO) movq %mm4, -7 * SIZE(BO) movq %mm5, -6 * SIZE(BO) movq %mm5, -5 * SIZE(BO) movq %mm6, -4 * SIZE(BO) movq %mm6, -3 * SIZE(BO) movq %mm7, -2 * SIZE(BO) movq %mm7, -1 * SIZE(BO) decq %rax jne .L42 ALIGN_3 .L43: movq K, %rax andq $3, %rax BRANCH jle .L50 ALIGN_3 .L44: movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq %mm0, 0 * SIZE(BO) movq %mm0, 1 * SIZE(BO) movq %mm1, 2 * SIZE(BO) movq %mm1, 3 * SIZE(BO) addq $2 * SIZE, B addq $4 * SIZE, BO decq %rax jne .L44 ALIGN_3 .L50: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a movq M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_3 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd 0 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 movapd -8 * SIZE(AO), %xmm2 pxor %xmm12, %xmm12 movapd 8 * SIZE(BO), %xmm3 pxor %xmm13, %xmm13 movapd 0 * SIZE(AO), %xmm4 movapd 16 * SIZE(BO), %xmm5 movapd 8 * SIZE(AO), %xmm6 movapd 24 * SIZE(BO), %xmm7 PREFETCHW 4 * SIZE(CO1) PREFETCHW 4 * SIZE(CO2) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L55 ALIGN_3 .L52: mulpd %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd 2 * SIZE(BO), %xmm0 addpd %xmm1, %xmm8 movapd 0 * SIZE(BO), %xmm1 addpd %xmm0, %xmm9 movapd -14 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd 2 * SIZE(BO), %xmm0 addpd %xmm1, %xmm12 movapd 4 * SIZE(BO), %xmm1 addpd %xmm0, %xmm13 movapd -12 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd 6 * SIZE(BO), %xmm0 addpd %xmm1, %xmm8 movapd 4 * SIZE(BO), %xmm1 addpd %xmm0, 
%xmm9 movapd -10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd 6 * SIZE(BO), %xmm0 addpd %xmm1, %xmm12 movapd 32 * SIZE(BO), %xmm1 addpd %xmm0, %xmm13 movapd 16 * SIZE(AO), %xmm0 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm2, %xmm3 mulpd 10 * SIZE(BO), %xmm2 addpd %xmm3, %xmm8 movapd 8 * SIZE(BO), %xmm3 addpd %xmm2, %xmm9 movapd -6 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm3 mulpd 10 * SIZE(BO), %xmm2 addpd %xmm3, %xmm12 movapd 12 * SIZE(BO), %xmm3 addpd %xmm2, %xmm13 movapd -4 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm3 mulpd 14 * SIZE(BO), %xmm2 addpd %xmm3, %xmm8 movapd 12 * SIZE(BO), %xmm3 addpd %xmm2, %xmm9 movapd -2 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm3 mulpd 14 * SIZE(BO), %xmm2 addpd %xmm3, %xmm12 movapd 40 * SIZE(BO), %xmm3 addpd %xmm2, %xmm13 movapd 24 * SIZE(AO), %xmm2 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) mulpd %xmm4, %xmm5 mulpd 18 * SIZE(BO), %xmm4 addpd %xmm5, %xmm8 movapd 16 * SIZE(BO), %xmm5 addpd %xmm4, %xmm9 movapd 2 * SIZE(AO), %xmm4 mulpd %xmm4, %xmm5 mulpd 18 * SIZE(BO), %xmm4 addpd %xmm5, %xmm12 movapd 20 * SIZE(BO), %xmm5 addpd %xmm4, %xmm13 movapd 4 * SIZE(AO), %xmm4 mulpd %xmm4, %xmm5 mulpd 22 * SIZE(BO), %xmm4 addpd %xmm5, %xmm8 movapd 20 * SIZE(BO), %xmm5 addpd %xmm4, %xmm9 movapd 6 * SIZE(AO), %xmm4 mulpd %xmm4, %xmm5 mulpd 22 * SIZE(BO), %xmm4 addpd %xmm5, %xmm12 movapd 48 * SIZE(BO), %xmm5 addpd %xmm4, %xmm13 movapd 32 * SIZE(AO), %xmm4 PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) mulpd %xmm6, %xmm7 mulpd 26 * SIZE(BO), %xmm6 addpd %xmm7, %xmm8 movapd 24 * SIZE(BO), %xmm7 addpd %xmm6, %xmm9 movapd 10 * SIZE(AO), %xmm6 mulpd %xmm6, %xmm7 mulpd 26 * SIZE(BO), %xmm6 addpd %xmm7, %xmm12 movapd 28 * SIZE(BO), %xmm7 addpd %xmm6, %xmm13 movapd 12 * SIZE(AO), %xmm6 mulpd %xmm6, %xmm7 mulpd 30 * SIZE(BO), %xmm6 addpd %xmm7, %xmm8 movapd 28 * SIZE(BO), %xmm7 addpd %xmm6, %xmm9 movapd 14 * SIZE(AO), %xmm6 mulpd %xmm6, %xmm7 mulpd 30 * SIZE(BO), %xmm6 addpd %xmm7, %xmm12 movapd 56 * SIZE(BO), %xmm7 addpd %xmm6, %xmm13 movapd 40 * SIZE(AO), %xmm6 addq $32 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L52 ALIGN_3 .L55: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movapd ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L59 ALIGN_3 .L56: movapd 0 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 mulpd 2 * SIZE(BO), %xmm0 addpd %xmm0, %xmm9 movapd -14 * SIZE(AO), %xmm0 movapd 0 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm12 mulpd 2 * SIZE(BO), %xmm0 addpd %xmm0, %xmm13 movapd -12 * SIZE(AO), %xmm0 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L56 ALIGN_3 .L59: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 movsd 0 * SIZE(CO2), %xmm2 movhpd 1 * SIZE(CO2), %xmm2 movsd 2 * SIZE(CO2), %xmm3 movhpd 3 * SIZE(CO2), %xmm3 #endif mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm7, %xmm12 mulpd %xmm7, %xmm13 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm1, %xmm12 addpd %xmm2, %xmm9 addpd %xmm3, %xmm13 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm12, 2 * SIZE(CO1) movhpd %xmm12, 3 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhpd %xmm9, 1 * SIZE(CO2) movsd %xmm13, 2 * SIZE(CO2) movhpd %xmm13, 3 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 
addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L51 ALIGN_3 .L60: testq $2, M je .L70 ALIGN_3 .L61: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd 0 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 movapd -8 * SIZE(AO), %xmm2 pxor %xmm10, %xmm10 movapd 8 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 movapd 16 * SIZE(BO), %xmm5 movapd 24 * SIZE(BO), %xmm7 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L65 ALIGN_3 .L62: mulpd %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd 2 * SIZE(BO), %xmm0 addpd %xmm1, %xmm8 movapd 4 * SIZE(BO), %xmm1 addpd %xmm0, %xmm9 movapd -14 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd 6 * SIZE(BO), %xmm0 addpd %xmm1, %xmm10 movapd 32 * SIZE(BO), %xmm1 addpd %xmm0, %xmm11 movapd -12 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm3 mulpd 10 * SIZE(BO), %xmm0 addpd %xmm3, %xmm8 movapd 12 * SIZE(BO), %xmm3 addpd %xmm0, %xmm9 movapd -10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BO), %xmm0 addpd %xmm3, %xmm10 movapd 40 * SIZE(BO), %xmm3 addpd %xmm0, %xmm11 movapd 0 * SIZE(AO), %xmm0 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm2, %xmm5 mulpd 18 * SIZE(BO), %xmm2 addpd %xmm5, %xmm8 movapd 20 * SIZE(BO), %xmm5 addpd %xmm2, %xmm9 movapd -6 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm5 mulpd 22 * SIZE(BO), %xmm2 addpd %xmm5, %xmm10 movapd 48 * SIZE(BO), %xmm5 addpd %xmm2, %xmm11 movapd -4 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm7 mulpd 26 * SIZE(BO), %xmm2 addpd %xmm7, %xmm8 movapd 28 * SIZE(BO), %xmm7 addpd %xmm2, %xmm9 movapd -2 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm7 mulpd 30 * SIZE(BO), %xmm2 addpd %xmm7, %xmm10 movapd 56 * SIZE(BO), %xmm7 addpd %xmm2, %xmm11 movapd 8 * SIZE(AO), %xmm2 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L62 ALIGN_3 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movapd ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L69 ALIGN_3 .L66: mulpd %xmm0, %xmm1 mulpd 2 * SIZE(BO), %xmm0 addpd %xmm1, %xmm8 movapd 4 * SIZE(BO), %xmm1 addpd %xmm0, %xmm9 movapd -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_3 .L69: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm2 movhpd 1 * SIZE(CO2), %xmm2 #endif addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm9 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm2, %xmm9 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhpd %xmm9, 1 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 ALIGN_3 .L70: testq $1, M je .L79 ALIGN_3 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq 
BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movsd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movsd 0 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 movsd -12 * SIZE(AO), %xmm2 pxor %xmm10, %xmm10 movsd 8 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 movsd 16 * SIZE(BO), %xmm5 movsd 24 * SIZE(BO), %xmm7 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L75 ALIGN_3 .L72: mulsd %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulsd 2 * SIZE(BO), %xmm0 addsd %xmm1, %xmm8 movsd 4 * SIZE(BO), %xmm1 addsd %xmm0, %xmm9 movsd -15 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm1 mulsd 6 * SIZE(BO), %xmm0 addsd %xmm1, %xmm10 movsd 32 * SIZE(BO), %xmm1 addsd %xmm0, %xmm11 movsd -14 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm3 mulsd 10 * SIZE(BO), %xmm0 addsd %xmm3, %xmm8 movsd 12 * SIZE(BO), %xmm3 addsd %xmm0, %xmm9 movsd -13 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm3 mulsd 14 * SIZE(BO), %xmm0 addsd %xmm3, %xmm10 movsd 40 * SIZE(BO), %xmm3 addsd %xmm0, %xmm11 movsd -8 * SIZE(AO), %xmm0 mulsd %xmm2, %xmm5 mulsd 18 * SIZE(BO), %xmm2 addsd %xmm5, %xmm8 movsd 20 * SIZE(BO), %xmm5 addsd %xmm2, %xmm9 movsd -11 * SIZE(AO), %xmm2 mulsd %xmm2, %xmm5 mulsd 22 * SIZE(BO), %xmm2 addsd %xmm5, %xmm10 movsd 48 * SIZE(BO), %xmm5 addsd %xmm2, %xmm11 movsd -10 * SIZE(AO), %xmm2 mulsd %xmm2, %xmm7 mulsd 26 * SIZE(BO), %xmm2 addsd %xmm7, %xmm8 movsd 28 * SIZE(BO), %xmm7 addsd %xmm2, %xmm9 movsd -9 * SIZE(AO), %xmm2 mulsd %xmm2, %xmm7 mulsd 30 * SIZE(BO), %xmm2 addsd %xmm7, %xmm10 movsd 56 * SIZE(BO), %xmm7 addsd %xmm2, %xmm11 movsd -4 * SIZE(AO), %xmm2 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L72 ALIGN_3 .L75: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movsd ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: mulsd %xmm0, %xmm1 mulsd 2 * SIZE(BO), %xmm0 addsd %xmm1, %xmm8 addsd %xmm0, %xmm9 movsd -15 * SIZE(AO), %xmm0 movsd 4 * SIZE(BO), %xmm1 addq $1 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L76 ALIGN_3 .L78: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm2 #endif addsd %xmm10, %xmm8 addsd %xmm11, %xmm9 mulsd %xmm7, %xmm8 mulsd %xmm7, %xmm9 #ifndef TRMMKERNEL addsd %xmm0, %xmm8 addsd %xmm2, %xmm9 #endif movsd %xmm8, 0 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_3 .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leaq (C, LDC, 2), C ALIGN_3 .L80: testq $1, N je .L999 ALIGN_4 .L81: /* Copying to Sub Buffer */ leaq BUFFER, BO #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq K, %rax sarq $3, %rax jle .L83 ALIGN_3 .L82: PREFETCH 56 * SIZE(B) movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq 2 * SIZE(B), %mm2 movq 3 * SIZE(B), %mm3 movq 4 * SIZE(B), %mm4 movq 5 * SIZE(B), %mm5 movq 6 * SIZE(B), %mm6 movq 7 * SIZE(B), %mm7 addq $ 8 * SIZE, B addq $16 * SIZE, BO movq %mm0, -16 * SIZE(BO) movq %mm0, -15 * SIZE(BO) movq %mm1, -14 * SIZE(BO) movq %mm1, -13 * SIZE(BO) movq %mm2, -12 * SIZE(BO) movq %mm2, -11 * SIZE(BO) movq %mm3, -10 * SIZE(BO) movq %mm3, -9 
* SIZE(BO) movq %mm4, -8 * SIZE(BO) movq %mm4, -7 * SIZE(BO) movq %mm5, -6 * SIZE(BO) movq %mm5, -5 * SIZE(BO) movq %mm6, -4 * SIZE(BO) movq %mm6, -3 * SIZE(BO) movq %mm7, -2 * SIZE(BO) movq %mm7, -1 * SIZE(BO) decq %rax jne .L82 ALIGN_3 .L83: movq K, %rax andq $7, %rax BRANCH jle .L90 ALIGN_3 .L84: movq 0 * SIZE(B), %mm0 movq %mm0, 0 * SIZE(BO) movq %mm0, 1 * SIZE(BO) addq $1 * SIZE, B addq $2 * SIZE, BO decq %rax jne .L84 ALIGN_3 .L90: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a movq M, I sarq $2, I # i = (m >> 2) jle .L100 ALIGN_3 .L91: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd 0 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 movapd -8 * SIZE(AO), %xmm2 pxor %xmm10, %xmm10 movapd 8 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 movapd 0 * SIZE(AO), %xmm4 movapd 8 * SIZE(AO), %xmm6 PREFETCHW 4 * SIZE(CO1) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L95 ALIGN_3 .L92: mulpd %xmm1, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd -14 * SIZE(AO), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movapd 2 * SIZE(BO), %xmm1 mulpd %xmm1, %xmm0 mulpd -10 * SIZE(AO), %xmm1 addpd %xmm0, %xmm10 movapd 16 * SIZE(AO), %xmm0 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) addpd %xmm1, %xmm11 movapd 4 * SIZE(BO), %xmm1 mulpd %xmm1, %xmm2 mulpd -6 * SIZE(AO), %xmm1 addpd %xmm2, %xmm8 movapd -4 * SIZE(AO), %xmm2 addpd %xmm1, %xmm9 movapd 6 * SIZE(BO), %xmm1 mulpd %xmm1, %xmm2 mulpd -2 * SIZE(AO), %xmm1 addpd %xmm2, %xmm10 movapd 24 * SIZE(AO), %xmm2 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addpd %xmm1, %xmm11 movapd 16 * SIZE(BO), %xmm1 mulpd %xmm3, %xmm4 mulpd 2 * SIZE(AO), %xmm3 addpd %xmm4, %xmm8 movapd 4 * SIZE(AO), %xmm4 addpd %xmm3, %xmm9 movapd 10 * SIZE(BO), %xmm3 mulpd %xmm3, %xmm4 mulpd 6 * SIZE(AO), %xmm3 addpd %xmm4, %xmm10 movapd 32 * SIZE(AO), %xmm4 PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) addpd %xmm3, %xmm11 movapd 12 * SIZE(BO), %xmm3 mulpd %xmm3, %xmm6 mulpd 10 * SIZE(AO), %xmm3 addpd %xmm6, %xmm8 movapd 12 * SIZE(AO), %xmm6 addpd %xmm3, %xmm9 movapd 14 * SIZE(BO), %xmm3 mulpd %xmm3, %xmm6 mulpd 14 * SIZE(AO), %xmm3 addpd %xmm6, %xmm10 movapd 40 * SIZE(AO), %xmm6 addpd %xmm3, %xmm11 movapd 24 * SIZE(BO), %xmm3 addq $32 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L92 ALIGN_3 .L95: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movapd ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L99 ALIGN_3 .L96: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movapd 2 * SIZE(BO), %xmm1 addq $4 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L96 ALIGN_3 .L99: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 #endif addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm9 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm1, %xmm9 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) 
&& defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L91 ALIGN_3 .L100: testq $2, M je .L110 ALIGN_3 .L101: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd 0 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 movapd -8 * SIZE(AO), %xmm2 pxor %xmm10, %xmm10 movapd 8 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L105 ALIGN_3 .L102: mulpd %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -14 * SIZE(AO), %xmm0 mulpd 2 * SIZE(BO), %xmm0 addpd %xmm1, %xmm8 movapd 16 * SIZE(BO), %xmm1 addpd %xmm0, %xmm9 movapd -12 * SIZE(AO), %xmm0 mulpd 4 * SIZE(BO), %xmm0 addpd %xmm0, %xmm10 movapd -10 * SIZE(AO), %xmm0 mulpd 6 * SIZE(BO), %xmm0 addpd %xmm0, %xmm11 movapd 0 * SIZE(AO), %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm2, %xmm3 movapd -6 * SIZE(AO), %xmm2 mulpd 10 * SIZE(BO), %xmm2 addpd %xmm3, %xmm8 movapd 24 * SIZE(BO), %xmm3 addpd %xmm2, %xmm9 movapd -4 * SIZE(AO), %xmm2 mulpd 12 * SIZE(BO), %xmm2 addpd %xmm2, %xmm10 movapd -2 * SIZE(AO), %xmm2 mulpd 14 * SIZE(BO), %xmm2 addpd %xmm2, %xmm11 movapd 8 * SIZE(AO), %xmm2 addq $16 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L102 ALIGN_3 .L105: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movapd ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L109 ALIGN_3 .L106: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movapd -14 * SIZE(AO), %xmm0 movapd 2 * SIZE(BO), %xmm1 addq $2 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L106 ALIGN_3 .L109: addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm10, %xmm8 mulpd %xmm7, %xmm8 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 addpd %xmm0, %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) addq $2 * SIZE, CO1 # coffset += 4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif ALIGN_3 .L110: testq $1, M je .L999 ALIGN_3 .L111: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif movsd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movsd 0 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 movsd -12 * SIZE(AO), %xmm2 pxor %xmm10, %xmm10 movsd 8 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq 
%rax, KKK #endif sarq $3, %rax je .L115 ALIGN_3 .L112: mulsd %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -15 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd 16 * SIZE(BO), %xmm1 mulsd 2 * SIZE(BO), %xmm0 addsd %xmm0, %xmm9 movsd -14 * SIZE(AO), %xmm0 mulsd 4 * SIZE(BO), %xmm0 addsd %xmm0, %xmm10 movsd -13 * SIZE(AO), %xmm0 mulsd 6 * SIZE(BO), %xmm0 addsd %xmm0, %xmm11 movsd -8 * SIZE(AO), %xmm0 mulsd %xmm2, %xmm3 movsd -11 * SIZE(AO), %xmm2 addsd %xmm3, %xmm8 movsd 24 * SIZE(BO), %xmm3 mulsd 10 * SIZE(BO), %xmm2 addsd %xmm2, %xmm9 movsd -10 * SIZE(AO), %xmm2 mulsd 12 * SIZE(BO), %xmm2 addsd %xmm2, %xmm10 movsd -9 * SIZE(AO), %xmm2 mulsd 14 * SIZE(BO), %xmm2 addsd %xmm2, %xmm11 movsd -4 * SIZE(AO), %xmm2 addq $ 8 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L112 ALIGN_3 .L115: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movsd ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L118 ALIGN_3 .L116: mulsd %xmm0, %xmm1 movsd -15 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd 2 * SIZE(BO), %xmm1 addq $1 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L116 ALIGN_3 .L118: addsd %xmm10, %xmm8 addsd %xmm11, %xmm9 addsd %xmm9, %xmm8 mulsd %xmm7, %xmm8 #ifndef TRMMKERNEL addsd 0 * SIZE(CO1), %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) ALIGN_3 .L999: movq %rbx, %rsp EMMS movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_kernel_4x4_sse3.S000066400000000000000000001564431313527062700214440ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %r13 #define BO %r14 #define CO1 %r15 #define CO2 %rbx #define BB %rbp #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define ALPHA 48(%rsp) #define OFFSET 56(%rsp) #define KKK 64(%rsp) #define KK 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define ALPHA 224(%rsp) #define OFFSET 232(%rsp) #define KK 240(%rsp) #define KKK 248(%rsp) #endif #define PREFETCH prefetcht1 #define PREFETCHSIZE (16 * 12 + 3) #define PREFETCH_R (4 * 4 + 0) #define KERNEL1(address) \ mulpd %xmm8, %xmm9 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ addpd %xmm9, %xmm0;\ movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm1;\ movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm2;\ movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ addpd %xmm9, %xmm3;\ movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL2(address) \ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm4;\ movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm5;\ movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm6;\ movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ addpd %xmm9, %xmm7;\ movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL3(address) \ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm0;\ movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm1;\ movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm2;\ movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ addpd %xmm9, %xmm3;\ movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL4(address) \ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm4;\ movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm5;\ movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm6;\ movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ addpd %xmm9, %xmm7;\ movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL5(address) \ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm0;\ movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm1;\ movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm2;\ movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ addpd %xmm11, %xmm3;\ movddup 8 * SIZE + (address) * 2 * 
SIZE(BO), %xmm11 #define KERNEL6(address) \ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm4;\ movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm5;\ movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm6;\ movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ addpd %xmm11, %xmm7;\ movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL7(address) \ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm0;\ movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm1;\ movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm2;\ movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ addpd %xmm11, %xmm3;\ movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL8(address) \ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm4;\ movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm5;\ movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm6;\ movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ addpd %xmm11, %xmm7;\ movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL9(address) \ mulpd %xmm12, %xmm13;\ PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ addpd %xmm13, %xmm0;\ movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm1;\ movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm2;\ movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ addpd %xmm13, %xmm3;\ movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL10(address) \ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm4;\ movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm5;\ movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm6;\ movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ addpd %xmm13, %xmm7;\ movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL11(address) \ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm0;\ movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm1;\ movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm2;\ movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ addpd %xmm13, %xmm3;\ movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL12(address) \ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm4;\ movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm5;\ movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm6;\ movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ addpd %xmm13, %xmm7;\ movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL13(address) \ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm0;\ movddup 25 * SIZE + (address) * 2 * 
SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm1;\ movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm2;\ movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ addpd %xmm15, %xmm3;\ movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL14(address) \ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm4;\ movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm5;\ movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm6;\ movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ addpd %xmm15, %xmm7;\ movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL15(address) \ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm0;\ movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm1;\ movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm2;\ movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ addpd %xmm15, %xmm3;\ movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL16(address) \ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm4;\ movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm5;\ movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm6;\ movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ addpd %xmm15, %xmm7;\ movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm4 #endif movaps %xmm3, %xmm0 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm4 #endif #endif movsd %xmm0, ALPHA #ifdef TRMMKERNEL movsd %xmm4, OFFSET movsd %xmm4, KK #ifndef LEFT negq KK #endif #endif leaq (, LDC, SIZE), LDC movq N, J sarq $2, J # j = (n >> 2) jle .L40 ALIGN_4 .L10: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a movq K, %rax salq $BASE_SHIFT + 2, %rax leaq (B, %rax), BB movq M, I sarq $2, I # i = (m >> 2) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movapd 16 * SIZE(AO), %xmm12 pxor %xmm4, %xmm4 movddup 16 * SIZE(BO), %xmm13 pxor %xmm5, %xmm5 movapd 24 
* SIZE(AO), %xmm14 pxor %xmm6, %xmm6 movddup 24 * SIZE(BO), %xmm15 pxor %xmm7, %xmm7 prefetchnta 3 * SIZE(CO1) prefetchnta 3 * SIZE(CO2) prefetchnta 3 * SIZE(CO1, LDC, 2) prefetchnta 3 * SIZE(CO2, LDC, 2) prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif #if 1 andq $-8, %rax salq $4, %rax NOBRANCH je .L15 .L1X: KERNEL1 (16 * 0) KERNEL2 (16 * 0) KERNEL3 (16 * 0) KERNEL4 (16 * 0) KERNEL5 (16 * 0) KERNEL6 (16 * 0) KERNEL7 (16 * 0) KERNEL8 (16 * 0) KERNEL9 (16 * 0) KERNEL10(16 * 0) KERNEL11(16 * 0) KERNEL12(16 * 0) KERNEL13(16 * 0) KERNEL14(16 * 0) KERNEL15(16 * 0) KERNEL16(16 * 0) cmpq $128 * 1, %rax NOBRANCH jle .L12 KERNEL1 (16 * 1) KERNEL2 (16 * 1) KERNEL3 (16 * 1) KERNEL4 (16 * 1) KERNEL5 (16 * 1) KERNEL6 (16 * 1) KERNEL7 (16 * 1) KERNEL8 (16 * 1) KERNEL9 (16 * 1) KERNEL10(16 * 1) KERNEL11(16 * 1) KERNEL12(16 * 1) KERNEL13(16 * 1) KERNEL14(16 * 1) KERNEL15(16 * 1) KERNEL16(16 * 1) cmpq $128 * 2, %rax NOBRANCH jle .L12 KERNEL1 (16 * 2) KERNEL2 (16 * 2) KERNEL3 (16 * 2) KERNEL4 (16 * 2) KERNEL5 (16 * 2) KERNEL6 (16 * 2) KERNEL7 (16 * 2) KERNEL8 (16 * 2) KERNEL9 (16 * 2) KERNEL10(16 * 2) KERNEL11(16 * 2) KERNEL12(16 * 2) KERNEL13(16 * 2) KERNEL14(16 * 2) KERNEL15(16 * 2) KERNEL16(16 * 2) cmpq $128 * 3, %rax NOBRANCH jle .L12 KERNEL1 (16 * 3) KERNEL2 (16 * 3) KERNEL3 (16 * 3) KERNEL4 (16 * 3) KERNEL5 (16 * 3) KERNEL6 (16 * 3) KERNEL7 (16 * 3) KERNEL8 (16 * 3) KERNEL9 (16 * 3) KERNEL10(16 * 3) KERNEL11(16 * 3) KERNEL12(16 * 3) KERNEL13(16 * 3) KERNEL14(16 * 3) KERNEL15(16 * 3) KERNEL16(16 * 3) cmpq $128 * 4, %rax NOBRANCH jle .L12 KERNEL1 (16 * 4) KERNEL2 (16 * 4) KERNEL3 (16 * 4) KERNEL4 (16 * 4) KERNEL5 (16 * 4) KERNEL6 (16 * 4) KERNEL7 (16 * 4) KERNEL8 (16 * 4) KERNEL9 (16 * 4) KERNEL10(16 * 4) KERNEL11(16 * 4) KERNEL12(16 * 4) KERNEL13(16 * 4) KERNEL14(16 * 4) KERNEL15(16 * 4) KERNEL16(16 * 4) cmpq $128 * 5, %rax NOBRANCH jle .L12 KERNEL1 (16 * 5) KERNEL2 (16 * 5) KERNEL3 (16 * 5) KERNEL4 (16 * 5) KERNEL5 (16 * 5) KERNEL6 (16 * 5) KERNEL7 (16 * 5) KERNEL8 (16 * 5) KERNEL9 (16 * 5) KERNEL10(16 * 5) KERNEL11(16 * 5) KERNEL12(16 * 5) KERNEL13(16 * 5) KERNEL14(16 * 5) KERNEL15(16 * 5) KERNEL16(16 * 5) cmpq $128 * 6, %rax NOBRANCH jle .L12 KERNEL1 (16 * 6) KERNEL2 (16 * 6) KERNEL3 (16 * 6) KERNEL4 (16 * 6) KERNEL5 (16 * 6) KERNEL6 (16 * 6) KERNEL7 (16 * 6) KERNEL8 (16 * 6) KERNEL9 (16 * 6) KERNEL10(16 * 6) KERNEL11(16 * 6) KERNEL12(16 * 6) KERNEL13(16 * 6) KERNEL14(16 * 6) KERNEL15(16 * 6) KERNEL16(16 * 6) cmpq $128 * 7, %rax NOBRANCH jle .L12 KERNEL1 (16 * 7) KERNEL2 (16 * 7) KERNEL3 (16 * 7) KERNEL4 (16 * 7) KERNEL5 (16 * 7) KERNEL6 (16 * 7) KERNEL7 (16 * 7) KERNEL8 (16 * 7) KERNEL9 (16 * 7) KERNEL10(16 * 7) KERNEL11(16 * 7) KERNEL12(16 * 7) KERNEL13(16 * 7) KERNEL14(16 * 7) KERNEL15(16 * 7) KERNEL16(16 * 7) addq $32 * 8 * SIZE, AO addq $32 * 8 * SIZE, BO subq $128 * 8, %rax BRANCH jg .L1X .L12: leaq (AO, %rax, 2), AO # * 16 leaq (BO, %rax, 2), BO # * 64 #else sarq $3, %rax je .L15 ALIGN_4 .L12: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, 
%xmm4 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm5 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm6 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm7 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm5 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm6 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 32 * SIZE(AO), %xmm8 addpd %xmm9, %xmm7 movddup 32 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 10 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 8 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm4 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm5 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm6 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 12 * SIZE(AO), %xmm10 addpd %xmm11, %xmm7 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm4 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm5 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm6 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 40 * SIZE(AO), %xmm10 addpd %xmm11, %xmm7 movddup 40 * SIZE(BO), %xmm11 mulpd %xmm12, %xmm13 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addpd %xmm13, %xmm0 movddup 17 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm1 movddup 18 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm2 movddup 19 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 movapd 18 * SIZE(AO), %xmm12 addpd %xmm13, %xmm3 movddup 16 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm4 movddup 17 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm5 movddup 18 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm6 movddup 19 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 movapd 20 * SIZE(AO), %xmm12 addpd %xmm13, %xmm7 movddup 20 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm0 movddup 21 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm1 movddup 22 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm2 movddup 23 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 movapd 22 * SIZE(AO), %xmm12 addpd %xmm13, %xmm3 movddup 20 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm4 movddup 21 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm5 movddup 22 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm6 movddup 23 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 movapd 48 * SIZE(AO), %xmm12 addpd %xmm13, %xmm7 movddup 48 * SIZE(BO), %xmm13 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm0 movddup 25 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm1 movddup 26 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd 
%xmm15, %xmm2 movddup 27 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 movapd 26 * SIZE(AO), %xmm14 addpd %xmm15, %xmm3 movddup 24 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm4 movddup 25 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm5 movddup 26 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm6 movddup 27 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 movapd 28 * SIZE(AO), %xmm14 addpd %xmm15, %xmm7 movddup 28 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm0 movddup 29 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm1 movddup 30 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm2 movddup 31 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 movapd 30 * SIZE(AO), %xmm14 addpd %xmm15, %xmm3 movddup 28 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm4 movddup 29 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm5 movddup 30 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm6 movddup 31 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 movapd 56 * SIZE(AO), %xmm14 addpd %xmm15, %xmm7 movddup 56 * SIZE(BO), %xmm15 addq $32 * SIZE, BO addq $32 * SIZE, AO decq %rax BRANCH jne .L12 #endif ALIGN_4 .L15: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movddup ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH BRANCH je .L19 ALIGN_4 .L16: mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm10 addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 0 * SIZE(BO), %xmm11 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm4 movddup 1 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm5 movddup 2 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm6 movddup 3 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm7 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax BRANCH jg .L16 ALIGN_4 .L19: mulpd %xmm15, %xmm0 mulpd %xmm15, %xmm4 mulpd %xmm15, %xmm1 mulpd %xmm15, %xmm5 testq $15, CO1 NOBRANCH jne .L19x testq $15, LDC NOBRANCH jne .L19x mulpd %xmm15, %xmm2 mulpd %xmm15, %xmm3 mulpd %xmm15, %xmm6 mulpd %xmm15, %xmm7 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addpd 0 * SIZE(CO1), %xmm0 addpd 2 * SIZE(CO1), %xmm4 addpd 0 * SIZE(CO2), %xmm1 addpd 2 * SIZE(CO2), %xmm5 addpd 0 * SIZE(CO1, LDC, 2), %xmm2 addpd 2 * SIZE(CO1, LDC, 2), %xmm6 addpd 0 * SIZE(CO2, LDC, 2), %xmm3 addpd 2 * SIZE(CO2, LDC, 2), %xmm7 #endif movapd %xmm0, 0 * SIZE(CO1) movapd %xmm4, 2 * SIZE(CO1) movapd %xmm1, 0 * SIZE(CO2) movapd %xmm5, 2 * SIZE(CO2) movapd %xmm2, 0 * SIZE(CO1, LDC, 2) movapd %xmm6, 2 * SIZE(CO1, LDC, 2) movapd %xmm3, 0 * SIZE(CO2, LDC, 2) movapd %xmm7, 2 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 jmp .L20 ALIGN_4 .L19x: #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhpd 1 * SIZE(CO1), %xmm8 movsd 2 * SIZE(CO1), %xmm9 movhpd 3 * SIZE(CO1), %xmm9 movsd 0 * SIZE(CO2), %xmm10 movhpd 1 * SIZE(CO2), %xmm10 movsd 2 * SIZE(CO2), %xmm11 movhpd 3 * SIZE(CO2), %xmm11 addpd %xmm8, %xmm0 addpd %xmm9, %xmm4 addpd %xmm10, %xmm1 addpd %xmm11, %xmm5 #endif mulpd %xmm15, %xmm2 mulpd %xmm15, %xmm3 mulpd %xmm15, %xmm6 mulpd %xmm15, %xmm7 #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1, LDC, 2), %xmm12 movhpd 1 * SIZE(CO1, LDC, 2), %xmm12 movsd 2 * SIZE(CO1, LDC, 2), %xmm13 movhpd 3 * SIZE(CO1, LDC, 2), %xmm13 movsd 0 * SIZE(CO2, LDC, 2), %xmm14 movhpd 1 * SIZE(CO2, LDC, 2), %xmm14 movsd 2 * SIZE(CO2, LDC, 2), %xmm15 movhpd 3 * SIZE(CO2, LDC, 2), %xmm15 addpd %xmm12, %xmm2 addpd %xmm13, %xmm6 addpd %xmm14, %xmm3 addpd %xmm15, %xmm7 #endif movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm4, 2 * SIZE(CO1) movhpd %xmm4, 3 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movhpd %xmm1, 1 * SIZE(CO2) movsd %xmm5, 2 * SIZE(CO2) movhpd %xmm5, 3 * SIZE(CO2) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) movsd %xmm6, 2 * SIZE(CO1, LDC, 2) movhpd %xmm6, 3 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO2, LDC, 2) movhpd %xmm3, 1 * SIZE(CO2, LDC, 2) movsd %xmm7, 2 * SIZE(CO2, LDC, 2) movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 ALIGN_4 .L20: testq $2, M BRANCH je .L30 ALIGN_4 .L21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 6 * SIZE(AO), %xmm8 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 
addpd %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 16 * SIZE(AO), %xmm8 addpd %xmm11, %xmm3 movddup 24 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movddup 17 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm1 movddup 18 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm2 movddup 19 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 10 * SIZE(AO), %xmm10 addpd %xmm9, %xmm3 movddup 20 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movddup 21 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm1 movddup 22 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm2 movddup 23 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 12 * SIZE(AO), %xmm10 addpd %xmm9, %xmm3 movddup 32 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 25 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movddup 26 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 27 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 28 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 29 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movddup 30 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 31 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 24 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 40 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movddup ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L29 ALIGN_4 .L26: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L26 ALIGN_4 .L29: #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhpd 1 * SIZE(CO1), %xmm8 movsd 0 * SIZE(CO2), %xmm10 movhpd 1 * SIZE(CO2), %xmm10 movsd 0 * SIZE(CO1, LDC, 2), %xmm12 movhpd 1 * SIZE(CO1, LDC, 2), %xmm12 movsd 0 * SIZE(CO2, LDC, 2), %xmm14 movhpd 1 * SIZE(CO2, LDC, 2), %xmm14 #endif mulpd %xmm15, %xmm0 mulpd %xmm15, %xmm1 mulpd %xmm15, %xmm2 mulpd %xmm15, %xmm3 #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm8, %xmm0 addpd %xmm10, %xmm1 addpd %xmm12, %xmm2 addpd %xmm14, %xmm3 #endif movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movhpd %xmm1, 1 * SIZE(CO2) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO2, LDC, 2) movhpd %xmm3, 1 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 ALIGN_4 .L30: testq $1, M je .L39 ALIGN_4 .L31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movddup 4 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L35 ALIGN_4 .L32: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 1 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movapd 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movapd 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movddup 3 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movapd 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movapd 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movddup 8 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movapd 24 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movapd 18 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movddup 5 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movapd 20 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movapd 22 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movddup 6 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movapd 32 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movapd 26 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movddup 7 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movapd 28 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movapd 30 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movddup 12 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movapd 40 * SIZE(BO), %xmm11 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movddup ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 1 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L36 ALIGN_4 .L38: #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhpd 0 * SIZE(CO2), %xmm8 movsd 0 * SIZE(CO1, LDC, 2), %xmm9 movhpd 0 * SIZE(CO2, LDC, 2), %xmm9 #endif mulpd %xmm15, %xmm0 mulpd %xmm15, %xmm1 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm8, %xmm0 addpd %xmm9, %xmm1 #endif movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 0 * SIZE(CO2) movsd %xmm1, 0 * SIZE(CO1, LDC, 2) movhpd %xmm1, 0 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leaq (C, LDC, 4), C # c += 4 * ldc movq BO, B decq J # j -- jg .L10 ALIGN_4 .L40: testq $2, N je .L80 ALIGN_4 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a movq K, %rax salq $BASE_SHIFT + 1, %rax leaq (B, %rax), BB movq M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif prefetcht0 0 * SIZE(BB) subq $-4 * SIZE, BB movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm4, %xmm4 movddup 8 * SIZE(BO), %xmm11 pxor %xmm5, %xmm5 #ifdef HAVE_3DNOW prefetchw 4 * SIZE(CO1) prefetchw 4 * SIZE(CO2) #else prefetchnta 4 * SIZE(CO1) prefetchnta 4 * SIZE(CO2) #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L55 ALIGN_4 .L52: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm5 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 16 * SIZE(AO), %xmm8 addpd %xmm9, %xmm5 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 10 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm4 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 12 * SIZE(AO), %xmm10 addpd %xmm9, %xmm5 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 14 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm4 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 40 * SIZE(AO), %xmm10 addpd %xmm9, %xmm5 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addpd %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd 
%xmm8, %xmm11 movapd 18 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 8 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm4 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 20 * SIZE(AO), %xmm8 addpd %xmm11, %xmm5 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 22 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm4 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 24 * SIZE(AO), %xmm8 addpd %xmm11, %xmm5 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 26 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm4 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 28 * SIZE(AO), %xmm8 addpd %xmm11, %xmm5 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 30 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm4 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 32 * SIZE(AO), %xmm8 addpd %xmm11, %xmm5 movddup 24 * SIZE(BO), %xmm11 addq $32 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movddup ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L59 ALIGN_4 .L56: mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm10 addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 0 * SIZE(BO), %xmm11 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 movapd 4 * SIZE(AO), %xmm8 addpd %xmm11, %xmm4 movddup 1 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm5 addq $4 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L56 ALIGN_4 .L59: #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhpd 1 * SIZE(CO1), %xmm8 movsd 2 * SIZE(CO1), %xmm9 movhpd 3 * SIZE(CO1), %xmm9 movsd 0 * SIZE(CO2), %xmm10 movhpd 1 * SIZE(CO2), %xmm10 movsd 2 * SIZE(CO2), %xmm11 movhpd 3 * SIZE(CO2), %xmm11 #endif mulpd %xmm15, %xmm0 mulpd %xmm15, %xmm1 mulpd %xmm15, %xmm4 mulpd %xmm15, %xmm5 #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm8, %xmm0 addpd %xmm9, %xmm4 addpd %xmm10, %xmm1 addpd %xmm11, %xmm5 #endif movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm4, 2 * SIZE(CO1) movhpd %xmm4, 3 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movhpd %xmm1, 1 * SIZE(CO2) movsd %xmm5, 2 * SIZE(CO2) movhpd %xmm5, 3 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L51 ALIGN_4 .L60: testq $2, M je .L70 ALIGN_4 .L61: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 16 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 10 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 12 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 24 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 24 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movddup ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L69 ALIGN_4 .L66: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L69: #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhpd 1 * SIZE(CO1), %xmm8 movsd 0 * SIZE(CO2), %xmm10 movhpd 1 * SIZE(CO2), %xmm10 #endif addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 mulpd %xmm15, %xmm0 mulpd %xmm15, %xmm1 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm8, %xmm0 addpd %xmm10, %xmm1 #endif movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movhpd %xmm1, 1 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 ALIGN_4 .L70: testq $1, M je .L79 ALIGN_4 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movddup 4 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movddup 1 * SIZE(AO), %xmm8 addpd %xmm9, %xmm0 mulpd 2 * SIZE(BO), %xmm8 movapd 16 * SIZE(BO), %xmm9 addpd %xmm8, %xmm1 movddup 2 * SIZE(AO), %xmm8 mulpd 4 * SIZE(BO), %xmm8 addpd %xmm8, %xmm2 movddup 3 * SIZE(AO), %xmm8 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm8, %xmm3 movddup 8 * SIZE(AO), %xmm8 mulpd %xmm10, %xmm11 movddup 5 * SIZE(AO), %xmm10 addpd %xmm11, %xmm0 mulpd 10 * SIZE(BO), %xmm10 movapd 24 * SIZE(BO), %xmm11 addpd %xmm10, %xmm1 movddup 6 * SIZE(AO), %xmm10 mulpd 12 * SIZE(BO), %xmm10 addpd %xmm10, %xmm2 movddup 7 * SIZE(AO), %xmm10 mulpd 14 * SIZE(BO), %xmm10 addpd %xmm10, %xmm3 movddup 12 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movddup ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulpd %xmm8, %xmm9 movddup 1 * SIZE(AO), %xmm8 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L76 ALIGN_4 .L78: #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhpd 0 * SIZE(CO2), %xmm8 #endif addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm15, %xmm0 #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm8, %xmm0 #endif movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 0 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leaq (C, LDC, 2), C movq BO, B ALIGN_4 .L80: testq $1, N je .L999 ALIGN_4 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 movq A, AO movq M, I sarq $2, I # i = (m >> 2) jle .L100 ALIGN_4 .L91: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 4 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #ifdef HAVE_3DNOW prefetchw 4 * SIZE(CO1) #else prefetchnta 4 * SIZE(CO1) #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L95 ALIGN_4 .L92: mulpd %xmm9, %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd 2 * SIZE(AO), %xmm9 addpd %xmm8, %xmm0 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm8 mulpd 6 * SIZE(AO), %xmm9 addpd %xmm8, %xmm2 movapd 16 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm10 mulpd 10 * SIZE(AO), %xmm9 addpd %xmm10, %xmm0 movapd 12 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm10 mulpd 14 * SIZE(AO), %xmm9 addpd %xmm10, %xmm2 movapd 24 * SIZE(AO), %xmm10 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addpd %xmm9, %xmm3 movddup 8 * SIZE(BO), %xmm9 mulpd %xmm11, %xmm8 mulpd 18 * SIZE(AO), %xmm11 addpd %xmm8, %xmm0 movapd 20 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 5 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm8 mulpd 22 * SIZE(AO), %xmm11 addpd %xmm8, %xmm2 movapd 32 * SIZE(AO), %xmm8 addpd %xmm11, %xmm3 movddup 6 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm10 mulpd 26 * SIZE(AO), %xmm11 addpd %xmm10, %xmm0 movapd 28 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movddup 7 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm10 mulpd 30 * SIZE(AO), %xmm11 addpd %xmm10, %xmm2 movapd 40 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 addq $32 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L92 ALIGN_4 .L95: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movddup ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L99 ALIGN_4 .L96: mulpd %xmm9, %xmm8 mulpd 2 * SIZE(AO), %xmm9 addpd %xmm8, %xmm0 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 1 * SIZE(BO), %xmm9 addq $4 * SIZE, AO # aoffset += 4 addq $1 * SIZE, BO # boffset1 += 8 decq %rax jg .L96 ALIGN_4 .L99: #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhpd 1 * SIZE(CO1), %xmm8 movsd 2 * SIZE(CO1), %xmm9 movhpd 3 * SIZE(CO1), %xmm9 #endif addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 mulpd %xmm15, %xmm0 mulpd %xmm15, %xmm1 #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm8, %xmm0 addpd %xmm9, %xmm1 #endif movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L91 ALIGN_4 .L100: testq $2, M je .L110 ALIGN_4 .L101: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 4 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L105 ALIGN_4 .L102: mulpd %xmm9, %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movddup 1 * SIZE(BO), %xmm9 addpd %xmm8, %xmm0 mulpd 2 * SIZE(AO), %xmm9 movapd 16 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd 4 * SIZE(AO), %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd 6 * SIZE(AO), %xmm9 addpd %xmm9, %xmm3 movddup 8 * SIZE(BO), %xmm9 mulpd %xmm11, %xmm10 movddup 5 * SIZE(BO), %xmm11 addpd %xmm10, %xmm0 mulpd 10 * SIZE(AO), %xmm11 movapd 24 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movddup 6 * SIZE(BO), %xmm11 mulpd 12 * SIZE(AO), %xmm11 addpd %xmm11, %xmm2 movddup 7 * SIZE(BO), %xmm11 mulpd 14 * SIZE(AO), %xmm11 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $ 8 * SIZE, BO decq %rax jne .L102 ALIGN_4 .L105: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movddup ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L109 ALIGN_4 .L106: mulpd %xmm9, %xmm8 movddup 1 * SIZE(BO), %xmm9 addpd %xmm8, %xmm0 movapd 2 * SIZE(AO), %xmm8 addq $2 * SIZE, AO # aoffset += 4 addq $1 * SIZE, BO # boffset1 += 8 decq %rax jg .L106 ALIGN_4 .L109: #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhpd 1 * SIZE(CO1), %xmm8 #endif addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm2, %xmm0 mulpd %xmm15, %xmm0 #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm8, %xmm0 #endif movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 4 ALIGN_4 .L110: testq $1, M je .L999 ALIGN_4 .L111: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movsd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movsd 4 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movsd 4 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movapd 0 * SIZE(AO), %xmm9 movapd 0 * SIZE(BO), %xmm8 movapd 4 * SIZE(AO), %xmm11 movapd 4 * SIZE(BO), %xmm10 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L115 ALIGN_4 .L112: mulpd %xmm9, %xmm8 movapd 2 * SIZE(AO), %xmm9 addpd %xmm8, %xmm0 mulpd 2 * SIZE(BO), %xmm9 movapd 8 * SIZE(BO), %xmm8 addpd %xmm9, %xmm1 movapd 8 * SIZE(AO), %xmm9 mulpd %xmm11, %xmm10 movapd 6 * SIZE(AO), %xmm11 addpd %xmm10, %xmm0 mulpd 6 * SIZE(BO), %xmm11 movapd 12 * SIZE(BO), %xmm10 addpd %xmm11, %xmm1 movapd 12 * SIZE(AO), %xmm11 addq $8 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L112 ALIGN_4 .L115: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movddup ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulsd 0 * SIZE(BO), %xmm9 addsd %xmm9, %xmm0 movsd 1 * SIZE(AO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $1 * SIZE, BO # boffset1 += 8 decq %rax jg .L116 ALIGN_4 .L118: #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 #endif addpd %xmm1, %xmm0 haddpd %xmm0, %xmm0 mulsd %xmm15, %xmm0 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addsd %xmm8, %xmm0 #endif movsd %xmm0, 0 * SIZE(CO1) ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_kernel_4x8_nano.S000066400000000000000000001315261313527062700215210ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define ALPHA 0(%rsp) #define J 16(%rsp) #define OFFSET 24(%rsp) #define KK 32(%rsp) #define KKK 40(%rsp) #define BUFFER 256(%rsp) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define RPREFETCHSIZE (16 * 4) #define PREFETCHSIZE (16 * 8 + 8) PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm4 #endif movaps %xmm3, %xmm0 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm4 #endif #endif movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq OLD_M, M movq OLD_N, N shufps $0, %xmm0, %xmm0 movaps %xmm0, ALPHA #ifdef TRMMKERNEL movsd %xmm4, OFFSET movsd %xmm4, KK #ifndef LEFT negq KK #endif #endif subq $-32 * SIZE, A salq $BASE_SHIFT, LDC movq N, J sarq $3, J jle .L40 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif leaq 32 * SIZE + BUFFER, BO movaps 0 * SIZE(B), %xmm1 movaps 4 * SIZE(B), %xmm3 movaps 8 * SIZE(B), %xmm5 movaps 12 * SIZE(B), 
%xmm7 movq K, %rax sarq $1, %rax jle .L03 ALIGN_4 .L02: PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) pshufd $0x50, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(BO) pshufd $0xfa, %xmm1, %xmm1 movaps %xmm1, -28 * SIZE(BO) movaps 16 * SIZE(B), %xmm1 pshufd $0x50, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(BO) pshufd $0xfa, %xmm3, %xmm3 movaps %xmm3, -20 * SIZE(BO) movaps 20 * SIZE(B), %xmm3 pshufd $0x50, %xmm5, %xmm4 movaps %xmm4, -16 * SIZE(BO) pshufd $0xfa, %xmm5, %xmm5 movaps %xmm5, -12 * SIZE(BO) movaps 24 * SIZE(B), %xmm5 pshufd $0x50, %xmm7, %xmm6 movaps %xmm6, -8 * SIZE(BO) pshufd $0xfa, %xmm7, %xmm7 movaps %xmm7, -4 * SIZE(BO) movaps 28 * SIZE(B), %xmm7 addq $16 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L02 ALIGN_4 .L03: movq K, %rax andq $1, %rax BRANCH jle .L10 pshufd $0x50, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(BO) pshufd $0xfa, %xmm1, %xmm1 movaps %xmm1, -28 * SIZE(BO) pshufd $0x50, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(BO) pshufd $0xfa, %xmm3, %xmm3 movaps %xmm3, -20 * SIZE(BO) addq $ 8 * SIZE, B subq $-16 * SIZE, BO ALIGN_4 .L10: movq C, CO1 leaq (C, LDC, 4), CO2 movq A, AO leaq (RPREFETCHSIZE + 0) * SIZE(B), BB movq M, I sarq $2, I jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax salq $BASE_SHIFT + 1, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif leaq (LDC, LDC, 2), %rax movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm1 pxor %xmm8, %xmm8 PREFETCHW 3 * SIZE(CO1) pxor %xmm9, %xmm9 PREFETCHW 5 * SIZE(CO1, LDC, 1) pxor %xmm10, %xmm10 PREFETCHW 3 * SIZE(CO1, LDC, 2) pxor %xmm11, %xmm11 PREFETCHW 5 * SIZE(CO1, %rax) pxor %xmm12, %xmm12 PREFETCHW 3 * SIZE(CO2) pxor %xmm13, %xmm13 PREFETCHW 5 * SIZE(CO2, LDC, 1) pxor %xmm14, %xmm14 PREFETCHW 3 * SIZE(CO2, LDC, 2) pxor %xmm15, %xmm15 PREFETCHW 5 * SIZE(CO2, %rax) PREFETCH -32 * SIZE(BB) addq $16 * SIZE, BB #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $8, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L16 ALIGN_3 .L12: PREFETCH (PREFETCHSIZE + 0)(AO) pshufd $0x4e, %xmm1, %xmm3 mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 movaps %xmm0, %xmm2 movaps -16 * SIZE(AO), %xmm0 addps %xmm3, %xmm9 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm2, %xmm1 addps %xmm1, %xmm10 movaps -24 * SIZE(BO), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm11 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm2, %xmm1 addps %xmm1, %xmm12 movaps -20 * SIZE(BO), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm13 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm2, %xmm1 addps %xmm1, %xmm14 movaps -16 * SIZE(BO), %xmm1 mulps %xmm2, %xmm3 movaps -28 * SIZE(AO), %xmm2 addps %xmm3, %xmm15 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm2, %xmm1 addps %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm9 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm2, %xmm1 addps %xmm1, %xmm10 movaps -8 * SIZE(BO), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm11 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm2, %xmm1 addps %xmm1, %xmm12 movaps -4 * SIZE(BO), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm13 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm2, %xmm1 addps %xmm1, %xmm14 movaps 0 * SIZE(BO), %xmm1 mulps %xmm2, %xmm3 movaps -24 * SIZE(AO), %xmm2 addps %xmm3, %xmm15 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm2, %xmm1 addps 
%xmm1, %xmm8 movaps 4 * SIZE(BO), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm9 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm2, %xmm1 addps %xmm1, %xmm10 movaps 8 * SIZE(BO), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm11 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm2, %xmm1 addps %xmm1, %xmm12 movaps 12 * SIZE(BO), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm13 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm2, %xmm1 addps %xmm1, %xmm14 movaps 16 * SIZE(BO), %xmm1 mulps %xmm2, %xmm3 movaps -20 * SIZE(AO), %xmm2 addps %xmm3, %xmm15 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm2, %xmm1 addps %xmm1, %xmm8 movaps 20 * SIZE(BO), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm9 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm2, %xmm1 addps %xmm1, %xmm10 movaps 24 * SIZE(BO), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm11 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm2, %xmm1 addps %xmm1, %xmm12 movaps 28 * SIZE(BO), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm13 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm2, %xmm1 addps %xmm1, %xmm14 movaps 32 * SIZE(BO), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm15 subq $-16 * SIZE, AO addq $ 64 * SIZE, BO decq %rax BRANCH jg .L12 .L16: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax je .L18 ALIGN_4 .L17: pshufd $0x4e, %xmm1, %xmm3 mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 addps %xmm3, %xmm9 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm0, %xmm1 addps %xmm1, %xmm10 movaps -24 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 addps %xmm3, %xmm11 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm0, %xmm1 addps %xmm1, %xmm12 movaps -20 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 addps %xmm3, %xmm13 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm0, %xmm1 addps %xmm1, %xmm14 movaps -16 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 movaps -28 * SIZE(AO), %xmm0 addps %xmm3, %xmm15 addq $ 4 * SIZE, AO subq $-16 * SIZE, BO decq %rax jg .L17 ALIGN_4 .L18: leaq (LDC, LDC, 2), %rax mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 mulps %xmm7, %xmm10 mulps %xmm7, %xmm11 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1, LDC, 1), %xmm0 movsd 0 * SIZE(CO1, LDC, 1), %xmm1 movhps 2 * SIZE(CO1), %xmm1 movsd 0 * SIZE(CO1, LDC, 2), %xmm2 movhps 2 * SIZE(CO1, %rax), %xmm2 movsd 0 * SIZE(CO1, %rax), %xmm3 movhps 2 * SIZE(CO1, LDC, 2), %xmm3 addps %xmm0, %xmm8 addps %xmm1, %xmm9 addps %xmm2, %xmm10 addps %xmm3, %xmm11 #endif mulps %xmm7, %xmm12 mulps %xmm7, %xmm13 mulps %xmm7, %xmm14 mulps %xmm7, %xmm15 #ifndef TRMMKERNEL movsd 0 * SIZE(CO2), %xmm4 movhps 2 * SIZE(CO2, LDC, 1), %xmm4 movsd 0 * SIZE(CO2, LDC, 1), %xmm5 movhps 2 * SIZE(CO2), %xmm5 movsd 0 * SIZE(CO2, LDC, 2), %xmm6 movhps 2 * SIZE(CO2, %rax), %xmm6 movsd 0 * SIZE(CO2, %rax), %xmm7 movhps 2 * SIZE(CO2, LDC, 2), %xmm7 addps %xmm4, %xmm12 addps %xmm5, %xmm13 addps %xmm6, %xmm14 addps %xmm7, %xmm15 #endif movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1, LDC, 1) movlps %xmm9, 0 * SIZE(CO1, LDC, 1) movhps %xmm9, 2 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO1, LDC, 2) movhps %xmm10, 2 * SIZE(CO1, %rax) movlps %xmm11, 0 * SIZE(CO1, %rax) movhps %xmm11, 2 * SIZE(CO1, LDC, 2) movlps %xmm12, 0 * SIZE(CO2) movhps %xmm12, 2 * SIZE(CO2, LDC, 1) movlps %xmm13, 0 * SIZE(CO2, LDC, 1) movhps %xmm13, 2 * SIZE(CO2) movlps %xmm14, 0 * SIZE(CO2, LDC, 2) movhps %xmm14, 2 * SIZE(CO2, %rax) movlps %xmm15, 0 * SIZE(CO2, %rax) movhps %xmm15, 2 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $BASE_SHIFT + 1, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), 
BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 decq I jg .L11 ALIGN_4 .L20: testq $2, M je .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax salq $BASE_SHIFT + 1, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif movddup -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movaps -32 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $8, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L26 ALIGN_3 .L22: PREFETCH (PREFETCHSIZE + 0)(AO) mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm9 movaps -24 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 mulps -20 * SIZE(BO), %xmm0 addps %xmm1, %xmm10 movaps -16 * SIZE(BO), %xmm1 addps %xmm0, %xmm11 movddup -30 * SIZE(AO), %xmm0 mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm9 movaps -8 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 mulps -4 * SIZE(BO), %xmm0 addps %xmm1, %xmm10 movaps 0 * SIZE(BO), %xmm1 addps %xmm0, %xmm11 movddup -28 * SIZE(AO), %xmm0 mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps 4 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm9 movaps 8 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 mulps 12 * SIZE(BO), %xmm0 addps %xmm1, %xmm10 movaps 16 * SIZE(BO), %xmm1 addps %xmm0, %xmm11 movddup -26 * SIZE(AO), %xmm0 mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps 20 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm9 movaps 24 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 mulps 28 * SIZE(BO), %xmm0 addps %xmm1, %xmm10 movaps 32 * SIZE(BO), %xmm1 addps %xmm0, %xmm11 movddup -24 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO addq $64 * SIZE, BO decq %rax BRANCH jg .L22 .L26: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax je .L28 ALIGN_4 .L27: mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm9 movaps -24 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 mulps -20 * SIZE(BO), %xmm0 addps %xmm1, %xmm10 movaps -16 * SIZE(BO), %xmm1 addps %xmm0, %xmm11 movddup -30 * SIZE(AO), %xmm0 addq $ 2 * SIZE, AO subq $-16 * SIZE, BO decq %rax jg .L27 ALIGN_4 .L28: leaq (LDC, LDC, 2), %rax mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 mulps %xmm7, %xmm10 mulps %xmm7, %xmm11 #ifndef TRMMKERNEL movsd (CO1), %xmm0 movhps (CO1, LDC, 1), %xmm0 movsd (CO1, LDC, 2), %xmm1 movhps (CO1, %rax), %xmm1 movsd (CO2), %xmm2 movhps (CO2, LDC, 1), %xmm2 movsd (CO2, LDC, 2), %xmm3 movhps (CO2, %rax), %xmm3 addps %xmm0, %xmm8 addps %xmm1, %xmm9 addps %xmm2, %xmm10 addps %xmm3, %xmm11 #endif movlps %xmm8, (CO1) movhps %xmm8, (CO1, LDC, 1) movlps %xmm9, (CO1, LDC, 2) movhps %xmm9, (CO1, %rax) movlps %xmm10, (CO2) movhps %xmm10, (CO2, LDC, 1) movlps %xmm11, (CO2, LDC, 2) movhps %xmm11, (CO2, %rax) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $BASE_SHIFT + 1, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 ALIGN_4 .L30: testq $1, M 
je .L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO addq %rax, %rax leaq (BO, %rax, 8), BO #endif movss -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movaps -32 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $8, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L36 ALIGN_3 .L32: PREFETCH (PREFETCHSIZE + 0)(AO) shufps $0, %xmm0, %xmm0 mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm9 movaps -24 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 mulps -20 * SIZE(BO), %xmm0 addps %xmm1, %xmm10 movaps -16 * SIZE(BO), %xmm1 addps %xmm0, %xmm11 movss -31 * SIZE(AO), %xmm0 shufps $0, %xmm0, %xmm0 mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm9 movaps -8 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 mulps -4 * SIZE(BO), %xmm0 addps %xmm1, %xmm10 movaps 0 * SIZE(BO), %xmm1 addps %xmm0, %xmm11 movss -30 * SIZE(AO), %xmm0 shufps $0, %xmm0, %xmm0 mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps 4 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm9 movaps 8 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 mulps 12 * SIZE(BO), %xmm0 addps %xmm1, %xmm10 movaps 16 * SIZE(BO), %xmm1 addps %xmm0, %xmm11 movss -29 * SIZE(AO), %xmm0 shufps $0, %xmm0, %xmm0 mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps 20 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm9 movaps 24 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 mulps 28 * SIZE(BO), %xmm0 addps %xmm1, %xmm10 movaps 32 * SIZE(BO), %xmm1 addps %xmm0, %xmm11 movss -28 * SIZE(AO), %xmm0 subq $-4 * SIZE, AO addq $64 * SIZE, BO decq %rax BRANCH jg .L32 .L36: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax je .L38 ALIGN_4 .L37: shufps $0, %xmm0, %xmm0 mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm9 movaps -24 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 mulps -20 * SIZE(BO), %xmm0 addps %xmm1, %xmm10 movaps -16 * SIZE(BO), %xmm1 addps %xmm0, %xmm11 movss -31 * SIZE(AO), %xmm0 addq $ 1 * SIZE, AO subq $-16 * SIZE, BO decq %rax jg .L37 ALIGN_4 .L38: leaq (LDC, LDC, 2), %rax mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 mulps %xmm7, %xmm10 mulps %xmm7, %xmm11 movhlps %xmm8, %xmm12 movhlps %xmm9, %xmm13 movhlps %xmm10, %xmm14 movhlps %xmm11, %xmm15 #ifndef TRMMKERNEL addss (CO1), %xmm8 addss (CO1, LDC, 1), %xmm12 addss (CO1, LDC, 2), %xmm9 addss (CO1, %rax), %xmm13 addss (CO2), %xmm10 addss (CO2, LDC, 1), %xmm14 addss (CO2, LDC, 2), %xmm11 addss (CO2, %rax), %xmm15 #endif movss %xmm8, (CO1) movss %xmm12, (CO1, LDC, 1) movss %xmm9, (CO1, LDC, 2) movss %xmm13, (CO1, %rax) movss %xmm10, (CO2) movss %xmm14, (CO2, LDC, 1) movss %xmm11, (CO2, LDC, 2) movss %xmm15, (CO2, %rax) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO addq %rax, %rax leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $8, KK #endif 
leaq (C, LDC, 8), C decq J jg .L01 ALIGN_4 .L40: testq $4, N jle .L80 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif leaq 32 * SIZE + BUFFER, BO movaps 0 * SIZE(B), %xmm1 movaps 4 * SIZE(B), %xmm3 movaps 8 * SIZE(B), %xmm5 movaps 12 * SIZE(B), %xmm7 movq K, %rax sarq $2, %rax jle .L43 ALIGN_4 .L42: PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) pshufd $0x50, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(BO) pshufd $0xfa, %xmm1, %xmm1 movaps %xmm1, -28 * SIZE(BO) movaps 16 * SIZE(B), %xmm1 pshufd $0x50, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(BO) pshufd $0xfa, %xmm3, %xmm3 movaps %xmm3, -20 * SIZE(BO) movaps 20 * SIZE(B), %xmm3 pshufd $0x50, %xmm5, %xmm4 movaps %xmm4, -16 * SIZE(BO) pshufd $0xfa, %xmm5, %xmm5 movaps %xmm5, -12 * SIZE(BO) movaps 24 * SIZE(B), %xmm5 pshufd $0x50, %xmm7, %xmm6 movaps %xmm6, -8 * SIZE(BO) pshufd $0xfa, %xmm7, %xmm7 movaps %xmm7, -4 * SIZE(BO) movaps 28 * SIZE(B), %xmm7 addq $16 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L42 ALIGN_4 .L43: movq K, %rax andq $3, %rax BRANCH jle .L50 ALIGN_4 .L45: pshufd $0x50, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(BO) pshufd $0xfa, %xmm1, %xmm1 movaps %xmm1, -28 * SIZE(BO) movaps 4 * SIZE(B), %xmm1 addq $ 4 * SIZE, B subq $-8 * SIZE, BO decq %rax jne .L45 ALIGN_4 .L50: movq C, CO1 leaq (C, LDC, 2), CO2 movq A, AO movq M, I sarq $2, I jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax salq $BASE_SHIFT + 1, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm1 pxor %xmm8, %xmm8 PREFETCHW 3 * SIZE(CO1) pxor %xmm9, %xmm9 PREFETCHW 5 * SIZE(CO1, LDC) pxor %xmm10, %xmm10 PREFETCHW 3 * SIZE(CO2) pxor %xmm11, %xmm11 PREFETCHW 5 * SIZE(CO2, LDC) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L56 ALIGN_3 .L52: PREFETCH (PREFETCHSIZE + 0)(AO) pshufd $0x4e, %xmm1, %xmm3 mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 addps %xmm3, %xmm9 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm0, %xmm1 addps %xmm1, %xmm10 movaps -24 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 movaps -28 * SIZE(AO), %xmm0 addps %xmm3, %xmm11 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 addps %xmm3, %xmm9 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm0, %xmm1 addps %xmm1, %xmm10 movaps -16 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 movaps -24 * SIZE(AO), %xmm0 addps %xmm3, %xmm11 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 addps %xmm3, %xmm9 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm0, %xmm1 addps %xmm1, %xmm10 movaps -8 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 movaps -20 * SIZE(AO), %xmm0 addps %xmm3, %xmm11 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 addps %xmm3, %xmm9 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm0, %xmm1 addps %xmm1, %xmm10 movaps 0 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 movaps -16 * SIZE(AO), %xmm0 addps %xmm3, %xmm11 subq $-16 * SIZE, AO subq $-32 * SIZE, BO decq %rax BRANCH jg .L52 .L56: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax 
#endif andq $3, %rax je .L58 ALIGN_4 .L57: pshufd $0x4e, %xmm1, %xmm3 mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 addps %xmm3, %xmm9 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm0, %xmm1 addps %xmm1, %xmm10 movaps -24 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 movaps -28 * SIZE(AO), %xmm0 addps %xmm3, %xmm11 addq $ 4 * SIZE, AO subq $-8 * SIZE, BO decq %rax jg .L57 ALIGN_4 .L58: mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 mulps %xmm7, %xmm10 mulps %xmm7, %xmm11 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1, LDC), %xmm0 movsd 0 * SIZE(CO1, LDC), %xmm1 movhps 2 * SIZE(CO1), %xmm1 movsd 0 * SIZE(CO2), %xmm2 movhps 2 * SIZE(CO2, LDC), %xmm2 movsd 0 * SIZE(CO2, LDC), %xmm3 movhps 2 * SIZE(CO2), %xmm3 addps %xmm0, %xmm8 addps %xmm1, %xmm9 addps %xmm2, %xmm10 addps %xmm3, %xmm11 #endif movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1, LDC) movlps %xmm9, 0 * SIZE(CO1, LDC) movhps %xmm9, 2 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2, LDC) movlps %xmm11, 0 * SIZE(CO2, LDC) movhps %xmm11, 2 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $BASE_SHIFT + 1, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 decq I jg .L51 ALIGN_4 .L60: testq $2, M je .L70 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax salq $BASE_SHIFT + 1, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movddup -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movaps -32 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L66 ALIGN_3 .L62: PREFETCH (PREFETCHSIZE + 0)(AO) mulps %xmm0, %xmm1 mulps -28 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 addps %xmm0, %xmm9 movddup -30 * SIZE(AO), %xmm0 mulps %xmm0, %xmm1 mulps -20 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addps %xmm0, %xmm9 movddup -28 * SIZE(AO), %xmm0 mulps %xmm0, %xmm1 mulps -12 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps -8 * SIZE(BO), %xmm1 addps %xmm0, %xmm9 movddup -26 * SIZE(AO), %xmm0 mulps %xmm0, %xmm1 mulps -4 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps 0 * SIZE(BO), %xmm1 addps %xmm0, %xmm9 movddup -24 * SIZE(AO), %xmm0 subq $ -8 * SIZE, AO subq $-32 * SIZE, BO decq %rax BRANCH jg .L62 .L66: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax je .L68 ALIGN_4 .L67: mulps %xmm0, %xmm1 mulps -28 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 addps %xmm0, %xmm9 movddup -30 * SIZE(AO), %xmm0 addq $ 2 * SIZE, AO subq $-8 * SIZE, BO decq %rax jg .L67 ALIGN_4 .L68: mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 #ifndef TRMMKERNEL movsd (CO1), %xmm0 movhps (CO1, LDC), %xmm0 movsd (CO2), %xmm1 movhps (CO2, LDC), %xmm1 addps %xmm0, %xmm8 addps %xmm1, %xmm9 #endif movlps %xmm8, (CO1) movhps %xmm8, (CO1, LDC) movlps %xmm9, (CO2) movhps %xmm9, (CO2, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) 
|| \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $BASE_SHIFT + 1, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 ALIGN_4 .L70: testq $1, M je .L79 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif movss -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movaps -32 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L76 ALIGN_3 .L72: PREFETCH (PREFETCHSIZE + 0)(AO) shufps $0, %xmm0, %xmm0 mulps %xmm0, %xmm1 mulps -28 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 addps %xmm0, %xmm9 movss -31 * SIZE(AO), %xmm0 shufps $0, %xmm0, %xmm0 mulps %xmm0, %xmm1 mulps -20 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addps %xmm0, %xmm9 movss -30 * SIZE(AO), %xmm0 shufps $0, %xmm0, %xmm0 mulps %xmm0, %xmm1 mulps -12 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps -8 * SIZE(BO), %xmm1 addps %xmm0, %xmm9 movss -29 * SIZE(AO), %xmm0 shufps $0, %xmm0, %xmm0 mulps %xmm0, %xmm1 mulps -4 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps 0 * SIZE(BO), %xmm1 addps %xmm0, %xmm9 movss -28 * SIZE(AO), %xmm0 subq $ -4 * SIZE, AO subq $-32 * SIZE, BO decq %rax BRANCH jg .L72 .L76: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax je .L78 ALIGN_4 .L77: shufps $0, %xmm0, %xmm0 mulps %xmm0, %xmm1 mulps -28 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 addps %xmm0, %xmm9 movss -31 * SIZE(AO), %xmm0 addq $ 1 * SIZE, AO subq $-8 * SIZE, BO decq %rax jg .L77 ALIGN_4 .L78: mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 movhlps %xmm8, %xmm10 movhlps %xmm9, %xmm11 #ifndef TRMMKERNEL addss (CO1), %xmm8 addss (CO1, LDC), %xmm10 addss (CO2), %xmm9 addss (CO2, LDC), %xmm11 #endif movss %xmm8, (CO1) movss %xmm10, (CO1, LDC) movss %xmm9, (CO2) movss %xmm11, (CO2, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leaq (C, LDC, 4), C ALIGN_4 .L80: testq $2, N jle .L120 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif leaq 32 * SIZE + BUFFER, BO movaps 0 * SIZE(B), %xmm1 movaps 4 * SIZE(B), %xmm3 movq K, %rax sarq $2, %rax jle .L83 ALIGN_4 .L82: pshufd $0x50, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(BO) pshufd $0xfa, %xmm1, %xmm1 movaps %xmm1, -28 * SIZE(BO) movaps 8 * SIZE(B), %xmm1 pshufd $0x50, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(BO) pshufd $0xfa, %xmm3, %xmm3 movaps %xmm3, -20 * SIZE(BO) movaps 12 * SIZE(B), %xmm3 addq $ 8 * SIZE, B subq $-16 * SIZE, BO decq %rax jne .L82 ALIGN_4 .L83: movq K, %rax andq $3, %rax BRANCH jle .L90 ALIGN_4 .L85: pshufd $0x50, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(BO) movsd 2 * SIZE(B), %xmm1 addq $ 2 * SIZE, B subq $-4 * 
SIZE, BO decq %rax jne .L85 ALIGN_4 .L90: movq C, CO1 leaq (C, LDC), CO2 movq A, AO movq M, I sarq $2, I jle .L100 ALIGN_4 .L91: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm1 pxor %xmm8, %xmm8 PREFETCHW 3 * SIZE(CO1) pxor %xmm9, %xmm9 PREFETCHW 3 * SIZE(CO2) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L96 ALIGN_3 .L92: PREFETCH (PREFETCHSIZE + 0)(AO) pshufd $0x4e, %xmm1, %xmm3 mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 addps %xmm3, %xmm9 movaps -28 * SIZE(AO), %xmm0 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 addps %xmm3, %xmm9 movaps -24 * SIZE(AO), %xmm0 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 addps %xmm3, %xmm9 movaps -20 * SIZE(AO), %xmm0 pshufd $0x4e, %xmm1, %xmm3 mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 addps %xmm3, %xmm9 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $-16 * SIZE, BO decq %rax BRANCH jg .L92 .L96: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax je .L98 ALIGN_4 .L97: pshufd $0x4e, %xmm1, %xmm3 mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 addps %xmm3, %xmm9 movaps -28 * SIZE(AO), %xmm0 addq $ 4 * SIZE, AO subq $-4 * SIZE, BO decq %rax jg .L97 ALIGN_4 .L98: mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO2), %xmm0 movsd 0 * SIZE(CO2), %xmm1 movhps 2 * SIZE(CO1), %xmm1 addps %xmm0, %xmm8 addps %xmm1, %xmm9 #endif movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO2) movlps %xmm9, 0 * SIZE(CO2) movhps %xmm9, 2 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 decq I jg .L91 ALIGN_4 .L100: testq $2, M je .L110 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif movddup -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movaps -32 * SIZE(BO), %xmm1 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L106 ALIGN_3 .L102: PREFETCH (PREFETCHSIZE + 0)(AO) mulps %xmm0, %xmm1 movddup -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 movddup -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 
movaps -24 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 movddup -26 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 movddup -24 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 subq $ -8 * SIZE, AO subq $-16 * SIZE, BO decq %rax BRANCH jg .L102 .L106: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax je .L108 ALIGN_4 .L107: mulps %xmm0, %xmm1 movddup -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 addq $ 2 * SIZE, AO subq $-4 * SIZE, BO decq %rax jg .L107 ALIGN_4 .L108: mulps %xmm7, %xmm8 #ifndef TRMMKERNEL movsd (CO1), %xmm0 movhps (CO2), %xmm0 addps %xmm0, %xmm8 #endif movlps %xmm8, (CO1) movhps %xmm8, (CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 ALIGN_4 .L110: testq $1, M je .L119 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movss -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movaps -32 * SIZE(BO), %xmm1 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L116 ALIGN_3 .L112: PREFETCH (PREFETCHSIZE + 0)(AO) shufps $0, %xmm0, %xmm0 mulps %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 shufps $0, %xmm0, %xmm0 mulps %xmm0, %xmm1 movss -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 shufps $0, %xmm0, %xmm0 mulps %xmm0, %xmm1 movss -29 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 shufps $0, %xmm0, %xmm0 mulps %xmm0, %xmm1 movss -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 subq $ -4 * SIZE, AO subq $-16 * SIZE, BO decq %rax BRANCH jg .L112 .L116: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax je .L118 ALIGN_4 .L117: shufps $0, %xmm0, %xmm0 mulps %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 addq $ 1 * SIZE, AO subq $-4 * SIZE, BO decq %rax jg .L117 ALIGN_4 .L118: mulps %xmm7, %xmm8 movhlps %xmm8, %xmm9 #ifndef TRMMKERNEL addss (CO1), %xmm8 addss (CO2), %xmm9 #endif movss %xmm8, (CO1) movss %xmm9, (CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L119: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leaq (C, LDC, 2), C ALIGN_4 .L120: testq $1, N jle .L999 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif leaq 32 * SIZE + BUFFER, BO movsd 0 * SIZE(B), %xmm1 movhps 2 * SIZE(B), %xmm1 movq K, %rax sarq $2, %rax jle .L123 ALIGN_4 .L122: pshufd $0x50, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(BO) pshufd $0xfa, %xmm1, %xmm1 movaps %xmm1, -28 * 
SIZE(BO) movsd 4 * SIZE(B), %xmm1 movhps 6 * SIZE(B), %xmm1 addq $ 4 * SIZE, B subq $-8 * SIZE, BO decq %rax jne .L122 ALIGN_4 .L123: movq K, %rax andq $3, %rax BRANCH jle .L130 ALIGN_4 .L125: pshufd $0x50, %xmm1, %xmm0 movlps %xmm0, -32 * SIZE(BO) movss 1 * SIZE(B), %xmm1 addq $ 1 * SIZE, B subq $-2 * SIZE, BO decq %rax jne .L125 ALIGN_4 .L130: movq C, CO1 movq A, AO movq M, I sarq $2, I jle .L140 ALIGN_4 .L131: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm0 movddup -32 * SIZE(BO), %xmm1 pxor %xmm8, %xmm8 PREFETCHW 3 * SIZE(CO1) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L136 ALIGN_3 .L132: PREFETCH (PREFETCHSIZE + 0)(AO) mulps %xmm0, %xmm1 movaps -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movddup -30 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 movaps -24 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movddup -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 movaps -20 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movddup -26 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 movaps -16 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movddup -24 * SIZE(BO), %xmm1 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO decq %rax BRANCH jg .L132 .L136: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax je .L138 ALIGN_4 .L137: mulps %xmm0, %xmm1 movaps -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movddup -30 * SIZE(BO), %xmm1 addq $ 4 * SIZE, AO subq $-2 * SIZE, BO decq %rax jg .L137 ALIGN_4 .L138: mulps %xmm7, %xmm8 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 addps %xmm0, %xmm8 #endif movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 decq I jg .L131 ALIGN_4 .L140: testq $2, M je .L150 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif movddup -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movaps -32 * SIZE(BO), %xmm1 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L146 ALIGN_3 .L142: PREFETCH (PREFETCHSIZE + 0)(AO) mulps %xmm0, %xmm1 movddup -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movsd -30 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 movddup -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movsd -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 movddup -26 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movsd -26 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 movddup -24 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movsd -24 * SIZE(BO), %xmm1 subq $-8 * 
SIZE, AO subq $-8 * SIZE, BO decq %rax BRANCH jg .L142 .L146: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax je .L148 ALIGN_4 .L147: mulps %xmm0, %xmm1 movddup -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movsd -30 * SIZE(BO), %xmm1 addq $ 2 * SIZE, AO subq $-2 * SIZE, BO decq %rax jg .L147 ALIGN_4 .L148: mulps %xmm7, %xmm8 #ifndef TRMMKERNEL movsd (CO1), %xmm0 addps %xmm0, %xmm8 #endif movlps %xmm8, (CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 ALIGN_4 .L150: testq $1, M je .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif movss -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movss -32 * SIZE(BO), %xmm1 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L156 ALIGN_3 .L152: PREFETCH (PREFETCHSIZE + 0)(AO) mulss %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 addss %xmm1, %xmm8 movss -30 * SIZE(BO), %xmm1 mulss %xmm0, %xmm1 movss -30 * SIZE(AO), %xmm0 addss %xmm1, %xmm8 movss -28 * SIZE(BO), %xmm1 mulss %xmm0, %xmm1 movss -29 * SIZE(AO), %xmm0 addss %xmm1, %xmm8 movss -26 * SIZE(BO), %xmm1 mulss %xmm0, %xmm1 movss -28 * SIZE(AO), %xmm0 addss %xmm1, %xmm8 movss -24 * SIZE(BO), %xmm1 subq $-4 * SIZE, AO subq $-8 * SIZE, BO decq %rax BRANCH jg .L152 .L156: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax je .L158 ALIGN_4 .L157: mulss %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 addss %xmm1, %xmm8 movss -30 * SIZE(BO), %xmm1 addq $1 * SIZE, AO addq $2 * SIZE, BO decq %rax jg .L157 ALIGN_4 .L158: mulss %xmm7, %xmm8 #ifndef TRMMKERNEL addss (CO1), %xmm8 #endif movss %xmm8, (CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L999: movq %rbx, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_kernel_4x8_nehalem.S000066400000000000000000001245551313527062700222030ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %rbp #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rdx #define BB %r12 #define PREA %r10 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define ALPHA 48(%rsp) #define J 56(%rsp) #define OFFSET 64(%rsp) #define KK 72(%rsp) #define KKK 80(%rsp) #else #define STACKSIZE 512 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define ALPHA 224(%rsp) #define J 232(%rsp) #define OFFSET 240(%rsp) #define KK 248(%rsp) #define KKK 256(%rsp) #endif #define PREFETCHSIZE 8 #define PREFETCH prefetcht0 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif movaps %xmm3, %xmm0 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif #endif unpcklps %xmm0, %xmm0 movlps %xmm0, ALPHA subq $-32 * SIZE, A subq $-32 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K salq $BASE_SHIFT, LDC #ifdef TRMMKERNEL movq %r11, 
OFFSET #ifndef LEFT negq %r11 #endif movq %r11, KK #endif movq N, J sarq $3, J NOBRANCH jle .L40 ALIGN_4 .L10: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC, 4), CO2 movq A, AO movq K, %rax salq $BASE_SHIFT + 3, %rax leaq (B, %rax), BB movq M, I sarq $2, I NOBRANCH jle .L20 ALIGN_4 .L11: prefetcht2 -32 * SIZE(BB) subq $-16 * SIZE, BB #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif leaq (LDC, LDC, 2), %rax xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 PADDING xorps %xmm4, %xmm4 PADDING xorps %xmm8, %xmm8 prefetcht0 3 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 7 * SIZE(CO1, LDC, 1) PADDING xorps %xmm10, %xmm10 prefetcht0 3 * SIZE(CO1, LDC, 2) PADDING xorps %xmm11, %xmm11 prefetcht0 7 * SIZE(CO1, %rax, 1) movaps -32 * SIZE(AO), %xmm0 PADDING xorps %xmm12, %xmm12 prefetcht0 3 * SIZE(CO2) xorps %xmm13, %xmm13 prefetcht0 7 * SIZE(CO2, LDC, 1) xorps %xmm14, %xmm14 prefetcht0 3 * SIZE(CO2, LDC, 2) xorps %xmm15, %xmm15 prefetcht0 7 * SIZE(CO2, %rax, 1) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $8, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm12 movaps -32 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm0, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 movaps -28 * SIZE(AO), %xmm7 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 addps %xmm1, %xmm12 movaps -24 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm7, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm7, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm7, %xmm5 mulps %xmm7, %xmm6 addps %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm7, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm7, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 movaps -24 * SIZE(AO), %xmm0 mulps %xmm7, %xmm3 mulps %xmm7, %xmm4 addps %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm0, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 addps %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 movaps -20 * SIZE(AO), %xmm7 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 addps %xmm1, %xmm12 movaps -8 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm7, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm7, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm7, %xmm5 mulps %xmm7, %xmm6 addps %xmm1, %xmm8 
movaps -4 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 subq $-32 * SIZE, BO pshufd $0x39, %xmm1, %xmm2 mulps %xmm7, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm7, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm7, %xmm3 movaps -16 * SIZE(AO), %xmm0 mulps %xmm7, %xmm4 subq $-16 * SIZE, AO decq %rax BRANCH jg .L12 ALIGN_3 .L15: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addps %xmm1, %xmm12 movaps -32 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm0, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: addps %xmm1, %xmm12 addps %xmm2, %xmm13 addps %xmm3, %xmm14 addps %xmm4, %xmm15 movaps %xmm9, %xmm4 shufps $0xd8, %xmm8, %xmm9 shufps $0xd8, %xmm11, %xmm8 shufps $0xd8, %xmm10, %xmm11 shufps $0xd8, %xmm4, %xmm10 movaps %xmm8, %xmm4 shufps $0xd8, %xmm10, %xmm8 shufps $0xd8, %xmm4, %xmm10 movaps %xmm9, %xmm5 shufps $0xd8, %xmm11, %xmm9 shufps $0xd8, %xmm5, %xmm11 movaps %xmm13, %xmm4 shufps $0xd8, %xmm12, %xmm13 shufps $0xd8, %xmm15, %xmm12 shufps $0xd8, %xmm14, %xmm15 shufps $0xd8, %xmm4, %xmm14 movaps %xmm12, %xmm4 shufps $0xd8, %xmm14, %xmm12 shufps $0xd8, %xmm4, %xmm14 movaps %xmm13, %xmm5 shufps $0xd8, %xmm15, %xmm13 shufps $0xd8, %xmm5, %xmm15 mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 mulps %xmm7, %xmm10 mulps %xmm7, %xmm11 mulps %xmm7, %xmm12 mulps %xmm7, %xmm13 mulps %xmm7, %xmm14 mulps %xmm7, %xmm15 leaq (LDC, LDC, 2), %rax #ifndef TRMMKERNEL movups (CO1), %xmm0 movups (CO1, LDC, 1), %xmm1 movups (CO1, LDC, 2), %xmm2 movups (CO1, %rax, 1), %xmm3 movups (CO2), %xmm4 movups (CO2, LDC, 1), %xmm5 movups (CO2, LDC, 2), %xmm6 movups (CO2, %rax, 1), %xmm7 addps %xmm0, %xmm8 addps %xmm1, %xmm9 addps %xmm2, %xmm10 addps %xmm3, %xmm11 addps %xmm4, %xmm12 addps %xmm5, %xmm13 addps %xmm6, %xmm14 addps %xmm7, %xmm15 #endif movups %xmm8, (CO1) movups %xmm9, (CO1, LDC, 1) movups %xmm10, (CO1, LDC, 2) movups %xmm11, (CO1, %rax, 1) movups %xmm12, (CO2) movups %xmm13, (CO2, LDC, 1) movups %xmm14, (CO2, LDC, 2) movups %xmm15, (CO2, %rax, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 decq I BRANCH jg .L11 ALIGN_4 .L20: testq $2, M BRANCH jle .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movaps -32 * SIZE(BO), %xmm5 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $8, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_3 .L22: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -28 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps -24 * SIZE(BO), %xmm5 movddup -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -20 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps -16 * SIZE(BO), %xmm5 movddup -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -12 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps -8 * SIZE(BO), %xmm5 movddup -26 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -4 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps 0 * SIZE(BO), %xmm5 movddup -24 * SIZE(AO), %xmm0 subq $-32 * SIZE, BO subq $ -8 * SIZE, AO subq $1, %rax BRANCH jg .L22 ALIGN_3 .L25: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -28 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps -24 * SIZE(BO), %xmm5 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_3 .L28: addps %xmm1, %xmm8 addps %xmm2, %xmm9 addps %xmm3, %xmm10 addps %xmm4, %xmm11 mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 mulps %xmm7, %xmm10 mulps %xmm7, %xmm11 leaq (LDC, LDC, 2), %rax #ifndef TRMMKERNEL movsd (CO1), %xmm0 movhps (CO1, LDC, 1), %xmm0 movsd (CO1, LDC, 2), %xmm1 movhps (CO1, %rax, 1), %xmm1 movsd (CO2), %xmm2 movhps (CO2, LDC, 1), %xmm2 movsd (CO2, LDC, 2), %xmm3 movhps (CO2, %rax, 1), %xmm3 addps %xmm0, %xmm8 addps %xmm1, %xmm9 addps %xmm2, %xmm10 addps %xmm3, %xmm11 #endif movsd %xmm8, (CO1) movhps %xmm8, (CO1, LDC, 1) movsd %xmm9, (CO1, LDC, 2) movhps %xmm9, (CO1, %rax, 1) movsd %xmm10, (CO2) movhps %xmm10, (CO2, LDC, 1) movsd %xmm11, (CO2, LDC, 2) movhps %xmm11, (CO2, %rax, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 ALIGN_4 .L30: testq $1, M BRANCH jle .L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, 
SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 xorps %xmm8, %xmm8 xorps %xmm12, %xmm12 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $8, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_3 .L32: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -28 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 pshufd $0x55, %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movaps -24 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -20 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -16 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -12 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 pshufd $0x55, %xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movaps -8 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -4 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 subq $-32 * SIZE, BO subq $ -4 * SIZE, AO subq $1, %rax BRANCH jg .L32 ALIGN_3 .L35: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -28 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 addq $1 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_3 .L38: addps %xmm2, %xmm8 addps %xmm3, %xmm12 mulps %xmm7, %xmm8 mulps %xmm7, %xmm12 pshufd $0xff, %xmm8, %xmm11 pshufd $0xaa, %xmm8, %xmm10 pshufd $0x55, %xmm8, %xmm9 pshufd $0x00, %xmm8, %xmm8 pshufd $0xff, %xmm12, %xmm15 pshufd $0xaa, %xmm12, %xmm14 pshufd $0x55, %xmm12, %xmm13 pshufd $0x00, %xmm12, %xmm12 leaq (LDC, LDC, 2), %rax #ifndef TRMMKERNEL addss (CO1), %xmm8 addss (CO1, LDC, 1), %xmm9 addss (CO1, LDC, 2), %xmm10 addss (CO1, %rax, 1), %xmm11 addss (CO2), %xmm12 addss (CO2, LDC, 1), %xmm13 addss (CO2, LDC, 2), %xmm14 addss (CO2, %rax, 1), %xmm15 #endif movss %xmm8, (CO1) movss %xmm9, (CO1, LDC, 1) movss %xmm10, (CO1, LDC, 2) movss %xmm11, (CO1, %rax, 1) movss %xmm12, (CO2) movss %xmm13, (CO2, LDC, 1) movss %xmm14, (CO2, LDC, 2) movss %xmm15, (CO2, %rax, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addq $8, KK #endif movq BO, B leaq (C, LDC, 8), C subq $1, J BRANCH jg .L10 ALIGN_4 .L40: testq $4, N jle .L70 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC, 2), CO2 movq A, AO movq M, I sarq $2, I NOBRANCH jle .L50 ALIGN_4 .L41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht2 4 * SIZE(CO1, LDC, 1) xorps 
%xmm10, %xmm10 prefetcht2 4 * SIZE(CO2) xorps %xmm11, %xmm11 prefetcht2 4 * SIZE(CO2, LDC, 1) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L45 ALIGN_3 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L42 ALIGN_3 .L45: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: addps %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L46 ALIGN_3 .L48: addps %xmm1, %xmm8 addps %xmm2, %xmm9 addps %xmm3, %xmm10 addps %xmm4, %xmm11 movaps %xmm9, %xmm4 shufps $0xd8, %xmm8, %xmm9 shufps $0xd8, %xmm11, %xmm8 shufps $0xd8, %xmm10, %xmm11 shufps $0xd8, %xmm4, %xmm10 movaps %xmm8, %xmm4 shufps $0xd8, %xmm10, %xmm8 shufps $0xd8, %xmm4, %xmm10 movaps %xmm9, %xmm5 shufps $0xd8, %xmm11, %xmm9 shufps $0xd8, %xmm5, %xmm11 mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 mulps %xmm7, %xmm10 mulps %xmm7, %xmm11 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO1, LDC, 1), %xmm1 movhps 2 * SIZE(CO1, LDC, 1), %xmm1 movsd 0 * SIZE(CO2), %xmm2 movhps 2 * SIZE(CO2), %xmm2 movsd 0 * SIZE(CO2, LDC, 1), %xmm3 movhps 2 * SIZE(CO2, LDC, 1), %xmm3 addps %xmm0, %xmm8 addps %xmm1, %xmm9 addps %xmm2, %xmm10 addps %xmm3, %xmm11 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO1, LDC, 1) movhps %xmm9, 2 * SIZE(CO1, LDC, 1) movsd %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) movsd %xmm11, 0 * SIZE(CO2, LDC, 1) movhps %xmm11, 2 * SIZE(CO2, LDC, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * 
SIZE, CO1 addq $4 * SIZE, CO2 decq I BRANCH jg .L41 ALIGN_4 .L50: testq $2, M BRANCH jle .L60 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movaps -32 * SIZE(BO), %xmm5 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_3 .L52: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -24 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -26 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -16 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -24 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $ -8 * SIZE, AO subq $1, %rax BRANCH jg .L52 ALIGN_3 .L55: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_3 .L56: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_3 .L58: addps %xmm1, %xmm8 addps %xmm2, %xmm9 mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 #ifndef TRMMKERNEL movsd (CO1), %xmm0 movhps (CO1, LDC, 1), %xmm0 movsd (CO2), %xmm1 movhps (CO2, LDC, 1), %xmm1 addps %xmm0, %xmm8 addps %xmm1, %xmm9 #endif movsd %xmm8, (CO1) movhps %xmm8, (CO1, LDC, 1) movsd %xmm9, (CO2) movhps %xmm9, (CO2, LDC, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 ALIGN_4 .L60: testq $1, M BRANCH jle .L69 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_3 .L62: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x55, %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 
addps %xmm2, %xmm9 movaps -28 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -24 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x55, %xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm2, %xmm9 movaps -20 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 subq $-16 * SIZE, BO subq $ -4 * SIZE, AO subq $1, %rax BRANCH jg .L62 addps %xmm9, %xmm8 ALIGN_3 .L65: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_3 .L66: pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_3 .L68: addps %xmm2, %xmm8 mulps %xmm7, %xmm8 pshufd $0xff, %xmm8, %xmm11 pshufd $0xaa, %xmm8, %xmm10 pshufd $0x55, %xmm8, %xmm9 pshufd $0x00, %xmm8, %xmm8 #ifndef TRMMKERNEL addss (CO1), %xmm8 addss (CO1, LDC, 1), %xmm9 addss (CO2), %xmm10 addss (CO2, LDC, 1), %xmm11 #endif movss %xmm8, (CO1) movss %xmm9, (CO1, LDC, 1) movss %xmm10, (CO2) movss %xmm11, (CO2, LDC, 1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L69: #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK #endif movq BO, B leaq (C, LDC, 4), C ALIGN_4 .L70: testq $2, N jle .L100 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC), CO2 movq A, AO movq M, I sarq $2, I NOBRANCH jle .L80 ALIGN_4 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd -32 * SIZE(BO), %xmm3 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht2 4 * SIZE(CO2) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L75 ALIGN_3 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -30 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -28 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -24 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -26 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -20 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -24 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax BRANCH jg .L72 ALIGN_3 .L75: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd 
$0x55, %xmm3, %xmm2 movsd -30 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L76 ALIGN_3 .L78: addps %xmm1, %xmm8 addps %xmm2, %xmm9 mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm1 movhps 2 * SIZE(CO2), %xmm1 addps %xmm0, %xmm8 addps %xmm1, %xmm9 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhps %xmm9, 2 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 decq I BRANCH jg .L71 ALIGN_4 .L80: testq $2, M BRANCH jle .L90 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd -32 * SIZE(BO), %xmm5 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L85 ALIGN_3 .L82: addps %xmm1, %xmm8 movsd -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movsd -30 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movsd -28 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -26 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movsd -26 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -24 * SIZE(AO), %xmm0 subq $-8 * SIZE, BO subq $-8 * SIZE, AO subq $1, %rax BRANCH jg .L82 ALIGN_3 .L85: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L88 ALIGN_3 .L86: addps %xmm1, %xmm8 movsd -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L86 ALIGN_3 .L88: addps %xmm1, %xmm8 mulps %xmm7, %xmm8 #ifndef TRMMKERNEL movsd (CO1), %xmm0 movhps (CO2), %xmm0 addps %xmm0, %xmm8 #endif movsd %xmm8, (CO1) movhps %xmm8, (CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 ALIGN_4 .L90: testq $1, M BRANCH jle .L99 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq 
KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L95 ALIGN_3 .L92: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movsd -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x55, %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm2, %xmm9 movsd -30 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movsd -28 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x55, %xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm2, %xmm9 movsd -26 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L92 addps %xmm9, %xmm8 ALIGN_3 .L95: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L98 ALIGN_3 .L96: pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movsd -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L96 ALIGN_3 .L98: addps %xmm2, %xmm8 mulps %xmm7, %xmm8 pshufd $0x55, %xmm8, %xmm9 pshufd $0x00, %xmm8, %xmm8 #ifndef TRMMKERNEL addss (CO1), %xmm8 addss (CO2), %xmm9 #endif movss %xmm8, (CO1) movss %xmm9, (CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif movq BO, B leaq (C, LDC, 2), C ALIGN_4 .L100: testq $1, N jle .L999 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 movq A, AO movq M, I sarq $2, I NOBRANCH jle .L110 ALIGN_4 .L101: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 movsd -32 * SIZE(BO), %xmm3 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L105 ALIGN_3 .L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -31 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -30 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -24 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -29 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -20 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -28 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $ -4 * SIZE, BO subq $1, %rax BRANCH jg .L102 ALIGN_3 .L105: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L108 ALIGN_3 .L106: addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -31 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L106 ALIGN_3 .L108: addps %xmm1, %xmm8 mulps %xmm7, %xmm8 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), 
%xmm0 movhps 2 * SIZE(CO1), %xmm0 addps %xmm0, %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 decq I BRANCH jg .L101 ALIGN_4 .L110: testq $2, M BRANCH jle .L120 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L115 ALIGN_3 .L112: addps %xmm1, %xmm8 movss -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movss -31 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movss -30 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -26 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movss -29 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -24 * SIZE(AO), %xmm0 subq $-4 * SIZE, BO subq $-8 * SIZE, AO subq $1, %rax BRANCH jg .L112 ALIGN_3 .L115: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L118 ALIGN_3 .L116: addps %xmm1, %xmm8 movss -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L116 ALIGN_3 .L118: addps %xmm1, %xmm8 mulps %xmm7, %xmm8 #ifndef TRMMKERNEL movsd (CO1), %xmm0 addps %xmm0, %xmm8 #endif movsd %xmm8, (CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 ALIGN_4 .L120: testq $1, M BRANCH jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif xorps %xmm2, %xmm2 movss -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L125 ALIGN_3 .L122: addss %xmm2, %xmm8 movss -32 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -31 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 movss -31 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -30 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 movss -30 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -29 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 movss -29 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -28 * SIZE(AO), %xmm0 subq $-4 * SIZE, AO subq $-4 * SIZE, BO 
subq $1, %rax BRANCH jg .L122 ALIGN_3 .L125: movddup ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L128 ALIGN_3 .L126: addss %xmm2, %xmm8 movss -32 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -31 * SIZE(AO), %xmm0 addq $1 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L126 ALIGN_3 .L128: addps %xmm2, %xmm8 mulps %xmm7, %xmm8 #ifndef TRMMKERNEL addss (CO1), %xmm8 #endif movss %xmm8, (CO1) ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_kernel_8x4_barcelona.S000066400000000000000000002053471313527062700225170ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %r12 #define BB %rbp #ifndef WINDOWS_ABI #define STACKSIZE 64 #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define ALPHA 0(%rsp) #define J 16(%rsp) #define OFFSET 24(%rsp) #define KK 32(%rsp) #define KKK 40(%rsp) #define BUFFER 128(%rsp) #define PREFETCH prefetch #define PREFETCHSIZE (16 * 17 + 0) #define RPREFETCHSIZE (16 * 4 + 0) #define WPREFETCHSIZE (16 * 9 + 0) #define KERNEL1(xx) \ mulps %xmm1, %xmm0 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm8 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %rax, 4) ;\ movaps %xmm2, %xmm0 ;\ addps %xmm1, %xmm12 ;\ movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm0, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm1, %xmm0 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm10 ;\ movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\ addps %xmm1, %xmm14 ;\ movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm0, %xmm2 #define KERNEL2(xx) \ mulps %xmm1, %xmm0 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm8 ;\ movaps %xmm2, %xmm0 ;\ addps %xmm1, %xmm12 ;\ movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm0, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm1, %xmm0 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm10 ;\ addps %xmm1, %xmm14 ;\ mulps %xmm3, %xmm2 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm4, %xmm2 #define KERNEL3(xx) \ mulps %xmm5, %xmm4 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm8 ;\ movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\ movaps %xmm2, %xmm4 ;\ addps %xmm5, %xmm12 ;\ movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm4, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm5, %xmm4 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm10 ;\ movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\ addps %xmm5, %xmm14 ;\ movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 20 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm4, %xmm2 #define KERNEL4(xx) \ mulps %xmm5, %xmm4 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ movaps (AO, %rax, 4), %xmm6 ;\ addps %xmm4, %xmm8 ;\ movaps %xmm2, %xmm4 ;\ addps %xmm5, %xmm12 ;\ movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm4, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm5, %xmm4 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm10 ;\ addps %xmm5, %xmm14 ;\ movaps 64 * 
SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm6, %xmm2 #define KERNEL5(xx) \ mulps %xmm1, %xmm6 ;\ mulps 4 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm6, %xmm8 ;\ movaps %xmm2, %xmm6 ;\ addps %xmm1, %xmm12 ;\ movaps 40 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps 4 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps 16 * SIZE(AO, %rax, 4), %xmm7 ;\ movaps %xmm6, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 44 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm1, %xmm6 ;\ mulps 4 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm6, %xmm10 ;\ movaps 8 * SIZE(AO, %rax, 4), %xmm6 ;\ addps %xmm1, %xmm14 ;\ movaps 48 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps 4 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 52 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm6, %xmm2 #define KERNEL6(xx) \ mulps %xmm1, %xmm6 ;\ mulps 12 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm6, %xmm8 ;\ movaps %xmm2, %xmm6 ;\ addps %xmm1, %xmm12 ;\ movaps 56 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps 12 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm6, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 60 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm1, %xmm6 ;\ mulps 12 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm6, %xmm10 ;\ movaps 32 * SIZE(AO, %rax, 4), %xmm0 ;\ addps %xmm1, %xmm14 ;\ mulps %xmm3, %xmm2 ;\ mulps 12 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 68 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm7, %xmm2 #define KERNEL7(xx) \ mulps %xmm5, %xmm7 ;\ mulps 20 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm7, %xmm8 ;\ movaps 96 * SIZE(BO, %rax, 8), %xmm1 ;\ movaps %xmm2, %xmm7 ;\ addps %xmm5, %xmm12 ;\ movaps 72 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps 20 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm7, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 76 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm5, %xmm7 ;\ mulps 20 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm7, %xmm10 ;\ movaps 24 * SIZE(AO, %rax, 4), %xmm7 ;\ addps %xmm5, %xmm14 ;\ movaps 80 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps 20 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 84 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm7, %xmm2 #define KERNEL8(xx) \ mulps %xmm5, %xmm7 ;\ mulps 28 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm7, %xmm8 ;\ movaps %xmm2, %xmm7 ;\ addps %xmm5, %xmm12 ;\ movaps 88 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps 28 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm7, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 92 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm5, %xmm7 ;\ mulps 28 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm7, %xmm10 ;\ movaps 48 * SIZE(AO, %rax, 4), %xmm4 ;\ addps %xmm5, %xmm14 ;\ movaps 128 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps 28 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 100 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm0, %xmm2 ;\ addq $16 * SIZE, %rax #define KERNEL_SUB1(xx) \ mulps %xmm1, %xmm0 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm8 ;\ movaps %xmm2, %xmm0 ;\ addps %xmm1, %xmm12 ;\ movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm0, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm1, %xmm0 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ addps 
%xmm0, %xmm10 ;\ movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\ addps %xmm1, %xmm14 ;\ movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm0, %xmm2 #define KERNEL_SUB2(xx) \ mulps %xmm1, %xmm0 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm8 ;\ movaps %xmm2, %xmm0 ;\ addps %xmm1, %xmm12 ;\ movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm0, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm1, %xmm0 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm10 ;\ movaps (AO, %rax, 4), %xmm0 ;\ addps %xmm1, %xmm14 ;\ movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm4, %xmm2 #define KERNEL_SUB3(xx) \ mulps %xmm5, %xmm4 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm8 ;\ movaps %xmm2, %xmm4 ;\ addps %xmm5, %xmm12 ;\ movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm4, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm5, %xmm4 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm10 ;\ movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\ addps %xmm5, %xmm14 ;\ movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 20 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm4, %xmm2 #define KERNEL_SUB4(xx) \ mulps %xmm5, %xmm4 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm8 ;\ movaps %xmm2, %xmm4 ;\ addps %xmm5, %xmm12 ;\ movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm4, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm5, %xmm4 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm10 ;\ addps %xmm5, %xmm14 ;\ mulps %xmm3, %xmm2 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm0, %xmm2 #if defined(OS_LINUX) && defined(CORE_BARCELONA) && !defined(TRMMKERNEL) .align 32768 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif movaps %xmm3, %xmm0 #else movq 72(%rsp), LDC #ifdef TRMMKERNEL movsd 80(%rsp), %xmm12 #endif #endif movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq OLD_M, M movq OLD_N, N shufps $0, %xmm0, %xmm0 movaps %xmm0, ALPHA #ifdef TRMMKERNEL movsd %xmm12, OFFSET movsd %xmm12, KK #ifndef LEFT negq KK #endif #endif subq $-32 * SIZE, A leaq (, LDC, SIZE), LDC movq N, J sarq $2, J # j = (n >> 2) jle .L50 .L01: #if 
defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $2, %rax jle .L03 ALIGN_4 .L02: prefetch (RPREFETCHSIZE + 0) * SIZE(B) movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 movaps 8 * SIZE(B), %xmm11 movaps 12 * SIZE(B), %xmm15 prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) prefetchw (WPREFETCHSIZE + 32) * SIZE(BO) pshufd $0x00, %xmm11, %xmm0 pshufd $0x55, %xmm11, %xmm1 pshufd $0xaa, %xmm11, %xmm2 pshufd $0xff, %xmm11, %xmm3 prefetchw (WPREFETCHSIZE + 48) * SIZE(BO) pshufd $0x00, %xmm15, %xmm4 pshufd $0x55, %xmm15, %xmm5 pshufd $0xaa, %xmm15, %xmm6 pshufd $0xff, %xmm15, %xmm7 movaps %xmm0, 32 * SIZE(BO) movaps %xmm1, 36 * SIZE(BO) movaps %xmm2, 40 * SIZE(BO) movaps %xmm3, 44 * SIZE(BO) movaps %xmm4, 48 * SIZE(BO) movaps %xmm5, 52 * SIZE(BO) movaps %xmm6, 56 * SIZE(BO) movaps %xmm7, 60 * SIZE(BO) addq $16 * SIZE, B addq $64 * SIZE, BO decq %rax jne .L02 ALIGN_4 .L03: movq K, %rax andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movaps 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) addq $ 4 * SIZE, B addq $16 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L10: movq C, CO1 leaq (C, LDC, 1), CO2 movq A, AO leaq (RPREFETCHSIZE + 0) * SIZE(B), BB movq M, I sarq $3, I # i = (m >> 3) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 movaps -28 * SIZE(BO), %xmm3 xorps %xmm9, %xmm9 movaps -16 * SIZE(AO), %xmm4 xorps %xmm10, %xmm10 movaps 0 * SIZE(BO), %xmm5 xorps %xmm11, %xmm11 prefetch -20 * SIZE(BB) prefetchw 3 * SIZE(CO1) xorps %xmm12, %xmm12 prefetchw 7 * SIZE(CO2) xorps %xmm13, %xmm13 prefetchw 3 * SIZE(CO1, LDC, 2) xorps %xmm14, %xmm14 prefetchw 7 * SIZE(CO2, LDC, 2) xorps %xmm15, %xmm15 movaps %xmm0, %xmm2 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $4, %rax #endif movq %rax, KKK #endif andq $-8, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO negq %rax NOBRANCH je .L15 ALIGN_3 .L12: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) 
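/* descriptive note (added, not upstream): each KERNELn(xx) macro expands to one k-step of the 8x4 block -- two 4-wide A vectors times four broadcast B values, accumulated into xmm8-xmm15; the je .L15 after every group of eight falls out of the loop once the negated counter, advanced at the end of KERNEL8, reaches zero. */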
KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) BRANCH jl .L12 ALIGN_4 .L15: prefetch 16 * SIZE(BB) subq $-32 * SIZE, BB movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif testq $4, %rax je .L16 xorq %rax, %rax ALIGN_3 KERNEL_SUB1(32 * 0) KERNEL_SUB2(32 * 0) KERNEL_SUB3(32 * 0) KERNEL_SUB4(32 * 0) addq $32 * SIZE, AO addq $64 * SIZE, BO ALIGN_3 .L16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L18 leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO negq %rax ALIGN_4 .L17: mulps %xmm1, %xmm0 mulps -28 * SIZE(AO, %rax, 4), %xmm1 addps %xmm0, %xmm8 movaps %xmm2, %xmm0 addps %xmm1, %xmm12 movaps -24 * SIZE(BO, %rax, 8), %xmm1 mulps %xmm3, %xmm2 mulps -28 * SIZE(AO, %rax, 4), %xmm3 addps %xmm2, %xmm9 movaps %xmm0, %xmm2 addps %xmm3, %xmm13 movaps -20 * SIZE(BO, %rax, 8), %xmm3 mulps %xmm1, %xmm0 mulps -28 * SIZE(AO, %rax, 4), %xmm1 addps %xmm0, %xmm10 movaps -24 * SIZE(AO, %rax, 4), %xmm0 addps %xmm1, %xmm14 movaps -16 * SIZE(BO, %rax, 8), %xmm1 mulps %xmm3, %xmm2 mulps -28 * SIZE(AO, %rax, 4), %xmm3 addps %xmm2, %xmm11 addps %xmm3, %xmm15 movaps -12 * SIZE(BO, %rax, 8), %xmm3 movaps %xmm0, %xmm2 addq $SIZE * 2, %rax jl .L17 ALIGN_4 .L18: #ifndef TRMMKERNEL movups 0 * SIZE(CO1), %xmm0 movups 4 * SIZE(CO1), %xmm1 movups 0 * SIZE(CO2), %xmm2 movups 4 * SIZE(CO2), %xmm3 #endif mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 mulps %xmm7, %xmm10 mulps %xmm7, %xmm11 mulps %xmm7, %xmm12 mulps %xmm7, %xmm13 mulps %xmm7, %xmm14 mulps %xmm7, %xmm15 #ifndef TRMMKERNEL movups 0 * SIZE(CO1, LDC, 2), %xmm4 movups 4 * SIZE(CO1, LDC, 2), %xmm5 movups 0 * SIZE(CO2, LDC, 2), %xmm6 movups 4 * SIZE(CO2, LDC, 2), %xmm7 addps %xmm0, %xmm8 addps %xmm1, %xmm12 addps %xmm2, %xmm9 addps %xmm3, %xmm13 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movsd %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhps %xmm9, 2 * SIZE(CO2) movsd %xmm13, 4 * SIZE(CO2) movhps %xmm13, 6 * SIZE(CO2) #ifndef TRMMKERNEL addps %xmm4, %xmm10 addps %xmm5, %xmm14 addps %xmm6, %xmm11 addps %xmm7, %xmm15 #endif movsd %xmm10, 0 * SIZE(CO1, LDC, 2) movhps %xmm10, 2 * SIZE(CO1, LDC, 2) movsd %xmm14, 4 * SIZE(CO1, LDC, 2) movhps %xmm14, 6 * SIZE(CO1, LDC, 2) movsd %xmm11, 0 * SIZE(CO2, LDC, 2) movhps %xmm11, 2 * SIZE(CO2, LDC, 2) movsd %xmm15, 4 * SIZE(CO2, LDC, 2) movhps %xmm15, 6 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 ALIGN_4 .L20: testq $4, M je .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps -28 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps -24 * SIZE(AO), %xmm8 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 mulps 44 * SIZE(BO), %xmm8 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm8, %xmm3 movaps -20 * SIZE(AO), %xmm8 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 mulps 60 * SIZE(BO), %xmm8 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movaps 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movaps 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 mulps 76 * SIZE(BO), %xmm10 addps %xmm9, %xmm2 movaps 128 * SIZE(BO), %xmm9 addps %xmm10, %xmm3 movaps -12 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movaps 84 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movaps 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 mulps 92 * SIZE(BO), %xmm10 addps %xmm11, %xmm2 movaps 144 * SIZE(BO), %xmm11 addps %xmm10, %xmm3 movaps -8 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movaps 104 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 mulps 108 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 160 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps -4 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movaps 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 mulps 124 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 176 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 16 * SIZE(AO), %xmm10 addq $ 32 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 
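/* descriptive note (added, not upstream): .L26 is the k-remainder (k mod 8) loop of the 4x4 tail -- one k-step per pass: a 4-float slice of A against the four broadcast B vectors in the buffer, accumulated into xmm0-xmm3, with A advanced by 4 floats and the buffer by 16 per step. */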
mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 16 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps -28 * SIZE(AO), %xmm8 addq $ 4 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L26 ALIGN_4 .L28: mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 mulps %xmm15, %xmm2 mulps %xmm15, %xmm3 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 0 * SIZE(CO2), %xmm10 movhps 2 * SIZE(CO2), %xmm10 movsd 0 * SIZE(CO1, LDC, 2), %xmm12 movhps 2 * SIZE(CO1, LDC, 2), %xmm12 movsd 0 * SIZE(CO2, LDC, 2), %xmm14 movhps 2 * SIZE(CO2, LDC, 2), %xmm14 addps %xmm8, %xmm0 addps %xmm10, %xmm1 addps %xmm12, %xmm2 addps %xmm14, %xmm3 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movhps %xmm1, 2 * SIZE(CO2) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movhps %xmm2, 2 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO2, LDC, 2) movhps %xmm3, 2 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L30: testq $2, M je .L40 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L35 ALIGN_4 .L32: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movsd 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movsd 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsd 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movsd 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movsd 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movsd 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd -28 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movsd 80 * SIZE(BO), %xmm11 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movsd 36 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movsd 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm2 movsd 44 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 movsd -26 * SIZE(AO), %xmm8 addps %xmm13, %xmm3 movsd 96 * SIZE(BO), %xmm13 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movsd 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movsd 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm2 movsd 60 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 movsd -16 * SIZE(AO), %xmm8 addps %xmm15, %xmm3 movsd 112 * SIZE(BO), %xmm15 mulps %xmm10, %xmm9 
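/* descriptive note (added, not upstream): from here the unrolled .L32 body (M=2 tail) repeats the same 2x4 multiply-accumulate pattern, now fed from the A pairs held in xmm10. */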
addps %xmm9, %xmm0 movsd 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movsd 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm2 movsd 76 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movsd -22 * SIZE(AO), %xmm10 addps %xmm9, %xmm3 movsd 128 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movsd 84 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movsd 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm2 movsd 92 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd -20 * SIZE(AO), %xmm10 addps %xmm11, %xmm3 movsd 144 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movsd 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movsd 104 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movsd 108 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd -18 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movsd 160 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movsd 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movsd 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movsd 124 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd -8 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movsd 176 * SIZE(BO), %xmm15 addq $ 16 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movsd 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsd 16 * SIZE(BO), %xmm9 addq $ 2 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L36 ALIGN_4 .L38: mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 mulps %xmm15, %xmm2 mulps %xmm15, %xmm3 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm8 movsd 0 * SIZE(CO2), %xmm10 movsd 0 * SIZE(CO1, LDC, 2), %xmm12 movsd 0 * SIZE(CO2, LDC, 2), %xmm14 addps %xmm8, %xmm0 addps %xmm10, %xmm1 addps %xmm12, %xmm2 addps %xmm14, %xmm3 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 ALIGN_4 .L40: testq $1, M je .L49 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO leaq (BO, %rax, 8), BO #endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 movss 32 * SIZE(BO), %xmm13 movss 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L45 ALIGN_4 .L42: mulss %xmm8, %xmm9 addss %xmm9, %xmm0 #if defined(OPTERON) || defined(BARCELONA) || 
defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 4 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 addss %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 addss %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addss %xmm9, %xmm3 movss 64 * SIZE(BO), %xmm9 mulss %xmm8, %xmm11 addss %xmm11, %xmm0 movss 20 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 addss %xmm11, %xmm1 movss 24 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 addss %xmm11, %xmm2 movss 28 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 movss -30 * SIZE(AO), %xmm8 addss %xmm11, %xmm3 movss 80 * SIZE(BO), %xmm11 mulss %xmm8, %xmm13 addss %xmm13, %xmm0 movss 36 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 addss %xmm13, %xmm1 movss 40 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 addss %xmm13, %xmm2 movss 44 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 movss -29 * SIZE(AO), %xmm8 addss %xmm13, %xmm3 movss 96 * SIZE(BO), %xmm13 mulss %xmm8, %xmm15 addss %xmm15, %xmm0 movss 52 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 addss %xmm15, %xmm1 movss 56 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 addss %xmm15, %xmm2 movss 60 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 movss -24 * SIZE(AO), %xmm8 addss %xmm15, %xmm3 movss 112 * SIZE(BO), %xmm15 mulss %xmm10, %xmm9 addss %xmm9, %xmm0 movss 68 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 addss %xmm9, %xmm1 movss 72 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 addss %xmm9, %xmm2 movss 76 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 movss -27 * SIZE(AO), %xmm10 addss %xmm9, %xmm3 movss 128 * SIZE(BO), %xmm9 mulss %xmm10, %xmm11 addss %xmm11, %xmm0 movss 84 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 addss %xmm11, %xmm1 movss 88 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 addss %xmm11, %xmm2 movss 92 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 movss -26 * SIZE(AO), %xmm10 addss %xmm11, %xmm3 movss 144 * SIZE(BO), %xmm11 mulss %xmm10, %xmm13 addss %xmm13, %xmm0 movss 100 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 addss %xmm13, %xmm1 movss 104 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 addss %xmm13, %xmm2 movss 108 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 movss -25 * SIZE(AO), %xmm10 addss %xmm13, %xmm3 movss 160 * SIZE(BO), %xmm13 mulss %xmm10, %xmm15 addss %xmm15, %xmm0 movss 116 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 addss %xmm15, %xmm1 movss 120 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 addss %xmm15, %xmm2 movss 124 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 movss -20 * SIZE(AO), %xmm10 addss %xmm15, %xmm3 movss 176 * SIZE(BO), %xmm15 addq $ 8 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L48 ALIGN_4 .L46: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movss 16 * SIZE(BO), %xmm9 addq $ 1 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L46 ALIGN_4 .L48: mulss %xmm15, %xmm0 mulss %xmm15, %xmm1 mulss %xmm15, %xmm2 mulss %xmm15, %xmm3 #ifndef TRMMKERNEL movss 0 * SIZE(CO1), %xmm8 movss 0 * SIZE(CO2), %xmm10 movss 0 * SIZE(CO1, LDC, 2), %xmm12 movss 0 * SIZE(CO2, LDC, 2), %xmm14 addss %xmm8, %xmm0 addss %xmm10, %xmm1 addss %xmm12, %xmm2 addss %xmm14, %xmm3 #endif movss %xmm0, 0 * SIZE(CO1) movss %xmm1, 0 * SIZE(CO2) movss %xmm2, 0 * SIZE(CO1, LDC, 2) movss %xmm3, 0 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || 
\ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leaq (C, LDC, 4), C # c += 4 * ldc decq J # j -- jg .L01 .L50: testq $2, N je .L100 .L51: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $2, %rax jle .L53 ALIGN_4 .L52: prefetch (RPREFETCHSIZE + 0) * SIZE(B) movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L53: movq K, %rax andq $3, %rax BRANCH jle .L60 ALIGN_4 .L54: movsd 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) addq $ 2 * SIZE, B addq $ 8 * SIZE, BO decq %rax jne .L54 ALIGN_4 .L60: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a movq M, I sarq $3, I # i = (m >> 3) jle .L70 ALIGN_4 .L61: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(AO), %xmm12 movaps 16 * SIZE(AO), %xmm14 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 prefetchw 4 * SIZE(CO1) xorps %xmm4, %xmm4 prefetchw 4 * SIZE(CO2) xorps %xmm5, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulps %xmm8, %xmm9 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -28 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps -24 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -20 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 32 * SIZE(AO), %xmm8 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, 
%xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 16 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps -12 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps -8 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps -4 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 80 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps 48 * SIZE(AO), %xmm10 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 32 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 4 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm5 movaps 8 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 12 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 movaps 96 * SIZE(BO), %xmm13 addps %xmm12, %xmm5 movaps 64 * SIZE(AO), %xmm12 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 48 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 20 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 24 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 28 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 movaps 112 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 80 * SIZE(AO), %xmm14 addq $64 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -28 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps -24 * SIZE(AO), %xmm8 addq $8 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L68: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 4 * SIZE(CO1), %xmm9 movhps 6 * SIZE(CO1), %xmm9 movsd 0 * SIZE(CO2), %xmm10 movhps 2 * SIZE(CO2), %xmm10 movsd 4 * SIZE(CO2), %xmm11 movhps 6 * SIZE(CO2), %xmm11 #endif mulps %xmm15, %xmm0 mulps %xmm15, %xmm4 mulps %xmm15, %xmm1 mulps %xmm15, %xmm5 #ifndef TRMMKERNEL addps %xmm8, %xmm0 addps %xmm9, %xmm4 addps %xmm10, %xmm1 addps %xmm11, %xmm5 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movsd %xmm4, 4 * SIZE(CO1) movhps %xmm4, 6 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movhps %xmm1, 2 * SIZE(CO2) movsd %xmm5, 4 * SIZE(CO2) movhps %xmm5, 6 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq 
$8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L61 ALIGN_4 .L70: testq $4, M je .L80 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulps %xmm8, %xmm9 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -28 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps -24 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 20 * SIZE(BO), %xmm8 addps %xmm11, %xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm8, %xmm1 movaps -20 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 mulps %xmm10, %xmm13 mulps 36 * SIZE(BO), %xmm10 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm10, %xmm1 movaps -12 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 mulps 44 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps -8 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 52 * SIZE(BO), %xmm10 addps %xmm15, %xmm0 movaps 56 * SIZE(BO), %xmm15 addps %xmm10, %xmm1 movaps -4 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 60 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 16 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -28 * SIZE(AO), %xmm8 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L76 ALIGN_4 .L78: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 0 * SIZE(CO2), %xmm10 movhps 2 * SIZE(CO2), %xmm10 #endif addps %xmm2, %xmm0 addps %xmm3, %xmm1 mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 #ifndef TRMMKERNEL addps %xmm8, %xmm0 addps %xmm10, %xmm1 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movhps %xmm1, 2 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L80: testq $2, M je .L90 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L85 ALIGN_4 .L82: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movsd 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movsd 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -28 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsd 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movsd 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd -26 * SIZE(AO), %xmm8 addps %xmm11, %xmm1 movsd 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movsd 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd -16 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movsd 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movsd 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd -22 * SIZE(AO), %xmm10 addps %xmm13, %xmm1 movsd 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movsd 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd -20 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movsd 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movsd 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd -18 * SIZE(AO), %xmm10 addps %xmm15, %xmm1 movsd 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movsd 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd -8 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movsd 112 * SIZE(BO), %xmm15 addq $16 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L82 ALIGN_4 .L85: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L88 ALIGN_4 .L86: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L86 ALIGN_4 .L88: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm8 movsd 0 * SIZE(CO2), %xmm10 #endif addps %xmm2, %xmm0 addps %xmm3, %xmm1 mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 #ifndef TRMMKERNEL addps %xmm8, %xmm0 addps %xmm10, %xmm1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 ALIGN_4 .L90: testq $1, M je .L99 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO 
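/* descriptive note (added, not upstream): TRMM offset for the M=1 tail of the N=2 block -- skip the first KK k-steps by advancing A by KK scalars and the expanded B buffer by KK*8 floats (two columns, each broadcast 4-wide). */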
leaq (BO, %rax, 8), BO #endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 movss 32 * SIZE(BO), %xmm13 movss 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L95 ALIGN_4 .L92: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movss 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movss 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movss -29 * SIZE(AO), %xmm8 addps %xmm11, %xmm1 movss 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movss 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movss -24 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movss 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movss 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movss -27 * SIZE(AO), %xmm10 addps %xmm13, %xmm1 movss 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movss 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movss -26 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movss 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movss 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movss -25 * SIZE(AO), %xmm10 addps %xmm15, %xmm1 movss 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movss 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movss -20 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movss 112 * SIZE(BO), %xmm15 addq $ 8 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L92 ALIGN_4 .L95: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L96 ALIGN_4 .L98: #ifndef TRMMKERNEL movss 0 * SIZE(CO1), %xmm8 movss 0 * SIZE(CO2), %xmm10 #endif addss %xmm2, %xmm0 addss %xmm3, %xmm1 mulss %xmm15, %xmm0 mulss %xmm15, %xmm1 #ifndef TRMMKERNEL addss %xmm8, %xmm0 addss %xmm10, %xmm1 #endif movss %xmm0, 0 * SIZE(CO1) movss %xmm1, 0 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leaq (C, LDC, 2), C # c += 4 * ldc ALIGN_4 .L100: testq $1, N je .L999 .L101: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $3, %rax jle .L103 ALIGN_4 .L102: prefetch (RPREFETCHSIZE + 0) * SIZE(B) movups 0 * SIZE(B), %xmm3 movups 4 * SIZE(B), %xmm7 prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 
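/* descriptive note (added, not upstream): .L102 packs B for the single-column case -- eight consecutive b[k] scalars are loaded per pass, and pshufd with selectors 0x00/0x55/0xaa/0xff broadcasts each lane into a full 4-wide vector in the buffer. */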
pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L102 ALIGN_4 .L103: movq K, %rax andq $7, %rax BRANCH jle .L110 ALIGN_4 .L104: movss 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 movaps %xmm0, 0 * SIZE(BO) addq $ 1 * SIZE, B addq $ 4 * SIZE, BO decq %rax jne .L104 ALIGN_4 .L110: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a movq M, I sarq $3, I # i = (m >> 3) jle .L120 ALIGN_4 .L111: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(AO), %xmm12 movaps 16 * SIZE(AO), %xmm14 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 prefetchw 4 * SIZE(CO1) xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L115 ALIGN_4 .L112: mulps %xmm9, %xmm8 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps -28 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps -24 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 mulps %xmm9, %xmm8 mulps -20 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps 32 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm9, %xmm10 mulps -12 * SIZE(AO), %xmm9 addps %xmm10, %xmm0 movaps -8 * SIZE(AO), %xmm10 addps %xmm9, %xmm4 movaps 12 * SIZE(BO), %xmm9 mulps %xmm9, %xmm10 mulps -4 * SIZE(AO), %xmm9 addps %xmm10, %xmm0 movaps 48 * SIZE(AO), %xmm10 addps %xmm9, %xmm4 movaps 32 * SIZE(BO), %xmm9 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm11, %xmm12 mulps 4 * SIZE(AO), %xmm11 addps %xmm12, %xmm0 movaps 8 * SIZE(AO), %xmm12 addps %xmm11, %xmm4 movaps 20 * SIZE(BO), %xmm11 mulps %xmm11, %xmm12 mulps 12 * SIZE(AO), %xmm11 addps %xmm12, %xmm0 movaps 64 * SIZE(AO), %xmm12 addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm11, %xmm14 mulps 20 * SIZE(AO), %xmm11 addps %xmm14, %xmm0 movaps 24 * SIZE(AO), %xmm14 addps %xmm11, %xmm4 movaps 28 * SIZE(BO), %xmm11 mulps %xmm11, %xmm14 mulps 28 * SIZE(AO), %xmm11 addps %xmm14, %xmm0 movaps 80 * SIZE(AO), %xmm14 addps %xmm11, %xmm4 movaps 48 * SIZE(BO), %xmm11 addq $64 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L112 ALIGN_4 .L115: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps 
ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulps %xmm9, %xmm8 mulps -28 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps -24 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 addq $8 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L116 ALIGN_4 .L118: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 4 * SIZE(CO1), %xmm9 movhps 6 * SIZE(CO1), %xmm9 #endif mulps %xmm15, %xmm0 mulps %xmm15, %xmm4 #ifndef TRMMKERNEL addps %xmm8, %xmm0 addps %xmm9, %xmm4 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movsd %xmm4, 4 * SIZE(CO1) movhps %xmm4, 6 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L111 ALIGN_4 .L120: testq $4, M je .L130 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L125 ALIGN_4 .L122: mulps %xmm8, %xmm9 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps -28 * SIZE(AO), %xmm8 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 32 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -24 * SIZE(AO), %xmm8 mulps 8 * SIZE(BO), %xmm8 addps %xmm8, %xmm2 movaps -20 * SIZE(AO), %xmm8 mulps 12 * SIZE(BO), %xmm8 addps %xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm11 movaps -12 * SIZE(AO), %xmm10 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 48 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps -8 * SIZE(AO), %xmm10 mulps 24 * SIZE(BO), %xmm10 addps %xmm10, %xmm2 movaps -4 * SIZE(AO), %xmm10 mulps 28 * SIZE(BO), %xmm10 addps %xmm10, %xmm3 movaps 16 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L122 ALIGN_4 .L125: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L128 ALIGN_4 .L126: mulps %xmm8, %xmm9 movaps -28 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L126 ALIGN_4 .L128: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 #endif addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm2, %xmm0 mulps %xmm15, %xmm0 #ifndef TRMMKERNEL addps %xmm8, %xmm0 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq 
(,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L130: testq $2, M je .L140 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L135 ALIGN_4 .L132: mulps %xmm8, %xmm9 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -28 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -26 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -16 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 32 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 movsd -22 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movsd 20 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd -20 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 24 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd -18 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movsd 28 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd -8 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 48 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L132 ALIGN_4 .L135: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L138 ALIGN_4 .L136: mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L136 ALIGN_4 .L138: addps %xmm1, %xmm0 mulps %xmm15, %xmm0 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm8 addps %xmm8, %xmm0 #endif movsd %xmm0, 0 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 4 ALIGN_4 .L140: testq $1, M je .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L145 ALIGN_4 .L142: mulss %xmm8, %xmm9 #if 
defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss -31 * SIZE(AO), %xmm8 mulss 4 * SIZE(BO), %xmm8 addss %xmm9, %xmm0 movss 32 * SIZE(BO), %xmm9 addss %xmm8, %xmm1 movss -30 * SIZE(AO), %xmm8 mulss 8 * SIZE(BO), %xmm8 addss %xmm8, %xmm2 movss -29 * SIZE(AO), %xmm8 mulss 12 * SIZE(BO), %xmm8 addss %xmm8, %xmm3 movss -24 * SIZE(AO), %xmm8 mulss %xmm10, %xmm11 movss -27 * SIZE(AO), %xmm10 mulss 20 * SIZE(BO), %xmm10 addss %xmm11, %xmm0 movss 48 * SIZE(BO), %xmm11 addss %xmm10, %xmm1 movss -26 * SIZE(AO), %xmm10 mulss 24 * SIZE(BO), %xmm10 addss %xmm10, %xmm2 movss -25 * SIZE(AO), %xmm10 mulss 28 * SIZE(BO), %xmm10 addss %xmm10, %xmm3 movss -20 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L142 ALIGN_4 .L145: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movss ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L148 ALIGN_4 .L146: mulss %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addss %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 addq $1 * SIZE, AO addq $4 * SIZE, BO decq %rax jg .L146 ALIGN_4 .L148: addss %xmm1, %xmm0 addss %xmm3, %xmm2 addss %xmm2, %xmm0 mulss %xmm15, %xmm0 #ifndef TRMMKERNEL movss 0 * SIZE(CO1), %xmm8 addss %xmm8, %xmm0 #endif movss %xmm0, 0 * SIZE(CO1) ALIGN_4 .L999: movq %rbx, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_kernel_8x4_core2.S000066400000000000000000001366621313527062700216060ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define ALPHA 0(%rsp) #define J 16(%rsp) #define OFFSET 24(%rsp) #define KK 32(%rsp) #define KKK 40(%rsp) #define BUFFER 128(%rsp) #define PREFETCH_R (16 * 4 + 0) #define PREFETCH_W (PREFETCH_R * 2) #define PREFETCHSIZE (16 * 13 + 10) #define PREFETCH prefetcht0 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif movaps %xmm3, %xmm0 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif #endif movq %rsp, %r15 # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING shufps $0, %xmm0, %xmm0 movaps %xmm0, ALPHA subq $-32 * SIZE, A subq $-32 * SIZE, B #ifdef TRMMKERNEL movsd %xmm12, OFFSET movsd %xmm12, KK #ifndef LEFT negq KK #endif #endif movq OLD_M, M movq OLD_N, N leaq (, LDC, SIZE), LDC movq N, J sarq $2, J jle .L50 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq 32 * SIZE + BUFFER, BO movaps -32 * SIZE(B), %xmm3 movq K, %rax sarq $2, %rax jle .L05 ALIGN_4 .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) movaps -28 * SIZE(B), %xmm7 movaps -24 * SIZE(B), %xmm11 movaps -20 * SIZE(B), %xmm15 prefetcht0 (PREFETCH_W + 0) * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 movaps %xmm0, -32 * SIZE(BO) pshufd $0x55, %xmm3, %xmm1 movaps %xmm1, -28 * SIZE(BO) pshufd $0xaa, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(BO) pshufd $0xff, %xmm3, %xmm3 movaps %xmm3, -20 * SIZE(BO) movaps -16 * SIZE(B), %xmm3 prefetcht0 (PREFETCH_W + 16) * SIZE(BO) pshufd $0x00, %xmm7, %xmm4 movaps %xmm4, -16 * SIZE(BO) pshufd $0x55, %xmm7, %xmm5 movaps %xmm5, -12 * SIZE(BO) pshufd $0xaa, %xmm7, %xmm6 movaps %xmm6, -8 * SIZE(BO) pshufd $0xff, %xmm7, %xmm7 movaps %xmm7, -4 * SIZE(BO) prefetcht0 (PREFETCH_W + 32) * SIZE(BO) pshufd $0x00, %xmm11, %xmm8 movaps %xmm8, 0 * SIZE(BO) pshufd $0x55, %xmm11, %xmm9 movaps %xmm9, 4 * SIZE(BO) pshufd $0xaa, %xmm11, %xmm10 movaps %xmm10, 8 * SIZE(BO) pshufd $0xff, %xmm11, %xmm11 movaps %xmm11, 12 * SIZE(BO) prefetcht0 (PREFETCH_W + 48) * SIZE(BO) pshufd $0x00, %xmm15, %xmm12 movaps %xmm12, 16 
* SIZE(BO) pshufd $0x55, %xmm15, %xmm13 movaps %xmm13, 20 * SIZE(BO) pshufd $0xaa, %xmm15, %xmm14 movaps %xmm14, 24 * SIZE(BO) pshufd $0xff, %xmm15, %xmm15 movaps %xmm15, 28 * SIZE(BO) subq $-16 * SIZE, B subq $-64 * SIZE, BO subq $1, %rax jne .L02 ALIGN_4 .L05: movq K, %rax andq $3, %rax BRANCH jle .L10 ALIGN_4 .L06: movaps -32 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, -32 * SIZE(BO) movaps %xmm1, -28 * SIZE(BO) movaps %xmm2, -24 * SIZE(BO) movaps %xmm3, -20 * SIZE(BO) addq $ 4 * SIZE, B addq $16 * SIZE, BO subq $1, %rax jne .L06 ALIGN_4 .L10: leaq (PREFETCH_R + 0) * SIZE(B), BB movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a movq M, I sarq $3, I jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 40 * SIZE + BUFFER, BO #else leaq 40 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif pxor %xmm8, %xmm8 movaps -32 * SIZE(AO), %xmm0 pxor %xmm9, %xmm9 movaps -28 * SIZE(AO), %xmm1 pxor %xmm10, %xmm10 movaps -40 * SIZE(BO), %xmm6 pxor %xmm11, %xmm11 movaps -36 * SIZE(BO), %xmm7 prefetcht2 -32 * SIZE(BB) prefetcht0 7 * SIZE(CO1) pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 prefetcht0 7 * SIZE(CO2) pxor %xmm14, %xmm14 pxor %xmm15, %xmm15 prefetcht0 7 * SIZE(CO1, LDC, 2) pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 prefetcht0 7 * SIZE(CO2, LDC, 2) pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 subq $-16 * SIZE, BB #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L15 ALIGN_4 .L12: addps %xmm2, %xmm10 movaps -32 * SIZE(BO), %xmm2 addps %xmm3, %xmm14 PADDING; movaps %xmm6, %xmm3 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 movaps -28 * SIZE(BO), %xmm4 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm6, %xmm8 movaps -24 * SIZE(BO), %xmm6 addps %xmm3, %xmm12 movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 movaps -20 * SIZE(BO), %xmm7 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -20 * SIZE(AO), %xmm1 addps %xmm2, %xmm10 movaps -16 * SIZE(BO), %xmm2 addps %xmm3, %xmm14 movaps %xmm6, %xmm3 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 movaps -12 * SIZE(BO), %xmm4 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm6, %xmm8 movaps -8 * SIZE(BO), %xmm6 addps %xmm3, %xmm12 movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 movaps -4 * SIZE(BO), %xmm7 addps %xmm5, %xmm13 PADDING; movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -12 * SIZE(AO), %xmm1 addps %xmm2, %xmm10 movaps 0 * SIZE(BO), %xmm2 addps %xmm3, %xmm14 PADDING; movaps %xmm6, %xmm3 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 movaps 4 * SIZE(BO), %xmm4 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm6, %xmm8 movaps 8 * SIZE(BO), %xmm6 addps %xmm3, %xmm12 movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 movaps 
12 * SIZE(BO), %xmm7 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -8 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -4 * SIZE(AO), %xmm1 addps %xmm2, %xmm10 movaps 16 * SIZE(BO), %xmm2 addps %xmm3, %xmm14 movaps %xmm6, %xmm3 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 movaps 20 * SIZE(BO), %xmm4 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm6, %xmm8 movaps 24 * SIZE(BO), %xmm6 addps %xmm3, %xmm12 subq $-32 * SIZE, AO movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 movaps 28 * SIZE(BO), %xmm7 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -32 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -28 * SIZE(AO), %xmm1 subq $-64 * SIZE, BO subq $1, %rax BRANCH jg .L12 ALIGN_4 .L15: prefetcht2 -16 * SIZE(BB) #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: addps %xmm2, %xmm10 movaps -32 * SIZE(BO), %xmm2 addps %xmm3, %xmm14 movaps %xmm6, %xmm3 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 movaps -28 * SIZE(BO), %xmm4 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm6, %xmm8 movaps -24 * SIZE(BO), %xmm6 addps %xmm3, %xmm12 addq $8 * SIZE, AO movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 movaps -20 * SIZE(BO), %xmm7 addps %xmm5, %xmm13 addq $16 * SIZE, BO movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -32 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -28 * SIZE(AO), %xmm1 subq $1, %rax BRANCH jg .L16 ALIGN_4 .L18: movaps ALPHA, %xmm7 addps %xmm2, %xmm10 addps %xmm3, %xmm14 addps %xmm4, %xmm11 addps %xmm5, %xmm15 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 movsd 0 * SIZE(CO2), %xmm2 movhps 2 * SIZE(CO2), %xmm2 movsd 4 * SIZE(CO2), %xmm3 movhps 6 * SIZE(CO2), %xmm3 #endif mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 mulps %xmm7, %xmm10 mulps %xmm7, %xmm11 mulps %xmm7, %xmm12 mulps %xmm7, %xmm13 mulps %xmm7, %xmm14 mulps %xmm7, %xmm15 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1, LDC, 2), %xmm4 movhps 2 * SIZE(CO1, LDC, 2), %xmm4 movsd 4 * SIZE(CO1, LDC, 2), %xmm5 movhps 6 * SIZE(CO1, LDC, 2), %xmm5 movsd 0 * SIZE(CO2, LDC, 2), %xmm6 movhps 2 * SIZE(CO2, LDC, 2), %xmm6 movsd 4 * SIZE(CO2, LDC, 2), %xmm7 movhps 6 * SIZE(CO2, LDC, 2), %xmm7 addps %xmm0, %xmm8 addps %xmm1, %xmm12 addps %xmm2, %xmm9 addps %xmm3, %xmm13 #endif movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) movlps %xmm9, 0 * SIZE(CO2) movhps %xmm9, 2 * SIZE(CO2) movlps %xmm13, 4 * SIZE(CO2) movhps %xmm13, 6 * SIZE(CO2) #ifndef TRMMKERNEL addps %xmm4, %xmm10 addps %xmm5, %xmm14 addps %xmm6, %xmm11 addps %xmm7, %xmm15 #endif movlps %xmm10, 0 * SIZE(CO1, LDC, 2) movhps %xmm10, 2 * SIZE(CO1, LDC, 2) movlps %xmm14, 4 * SIZE(CO1, LDC, 2) movhps %xmm14, 6 * SIZE(CO1, LDC, 2) movlps %xmm11, 0 * SIZE(CO2, LDC, 2) movhps %xmm11, 2 * SIZE(CO2, LDC, 2) movlps %xmm15, 4 * SIZE(CO2, LDC, 2) movhps %xmm15, 6 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 subq $1, I jg .L11 ALIGN_4 .L20: testq $4, M jle .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && 
defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L25 ALIGN_4 .L21: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm2 movaps -28 * SIZE(BO), %xmm3 movaps -24 * SIZE(BO), %xmm4 movaps -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 movaps -28 * SIZE(AO), %xmm0 movaps -16 * SIZE(BO), %xmm2 movaps -12 * SIZE(BO), %xmm3 movaps -8 * SIZE(BO), %xmm4 movaps -4 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 movaps -24 * SIZE(AO), %xmm0 movaps 0 * SIZE(BO), %xmm2 movaps 4 * SIZE(BO), %xmm3 movaps 8 * SIZE(BO), %xmm4 movaps 12 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 movaps -20 * SIZE(AO), %xmm0 movaps 16 * SIZE(BO), %xmm2 movaps 20 * SIZE(BO), %xmm3 movaps 24 * SIZE(BO), %xmm4 movaps 28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 subq $-16 * SIZE, AO subq $-64 * SIZE, BO subq $1, %rax jg .L21 ALIGN_4 .L25: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L28 ALIGN_4 .L26: movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm2 movaps -28 * SIZE(BO), %xmm3 movaps -24 * SIZE(BO), %xmm4 movaps -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 addq $ 4 * SIZE, AO addq $16 * SIZE, BO subq $1, %rax jg .L26 ALIGN_4 .L28: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm2 movhps 2 * SIZE(CO2), %xmm2 movsd 0 * SIZE(CO1, LDC, 2), %xmm4 movhps 2 * SIZE(CO1, LDC, 2), %xmm4 movsd 0 * SIZE(CO2, LDC, 2), %xmm6 movhps 2 * SIZE(CO2, LDC, 2), %xmm6 #endif mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 mulps %xmm7, %xmm10 mulps %xmm7, %xmm11 #ifndef TRMMKERNEL addps %xmm0, %xmm8 addps %xmm2, %xmm9 addps %xmm4, %xmm10 addps %xmm6, %xmm11 #endif movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm9, 0 * SIZE(CO2) movhps %xmm9, 2 * SIZE(CO2) movlps %xmm10, 0 * SIZE(CO1, LDC, 2) movhps %xmm10, 2 * SIZE(CO1, LDC, 2) movlps %xmm11, 0 * SIZE(CO2, LDC, 2) movhps %xmm11, 2 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 subq $1, I ALIGN_4 .L30: testq $2, M jle .L40 #if !defined(TRMMKERNEL) || \ 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 pxor %xmm14, %xmm14 pxor %xmm15, %xmm15 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L35 ALIGN_4 .L31: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -32 * SIZE(AO), %xmm0 movsd -32 * SIZE(BO), %xmm2 movsd -28 * SIZE(BO), %xmm3 movsd -24 * SIZE(BO), %xmm4 movsd -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 movsd -30 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 movsd -12 * SIZE(BO), %xmm3 movsd -8 * SIZE(BO), %xmm4 movsd -4 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 movsd -28 * SIZE(AO), %xmm0 movsd 0 * SIZE(BO), %xmm2 movsd 4 * SIZE(BO), %xmm3 movsd 8 * SIZE(BO), %xmm4 movsd 12 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 movsd -26 * SIZE(AO), %xmm0 movsd 16 * SIZE(BO), %xmm2 movsd 20 * SIZE(BO), %xmm3 movsd 24 * SIZE(BO), %xmm4 movsd 28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 subq $ -8 * SIZE, AO subq $-64 * SIZE, BO subq $1, %rax jg .L31 ALIGN_4 .L35: movsd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L38 ALIGN_4 .L36: movsd -32 * SIZE(AO), %xmm0 movsd -32 * SIZE(BO), %xmm2 movsd -28 * SIZE(BO), %xmm3 movsd -24 * SIZE(BO), %xmm4 movsd -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 addq $ 2 * SIZE, AO addq $16 * SIZE, BO subq $1, %rax jg .L36 ALIGN_4 .L38: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm2 movsd 0 * SIZE(CO1, LDC, 2), %xmm4 movsd 0 * SIZE(CO2, LDC, 2), %xmm6 #endif mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 mulps %xmm7, %xmm10 mulps %xmm7, %xmm11 #ifndef TRMMKERNEL addps %xmm0, %xmm8 addps %xmm2, %xmm9 addps %xmm4, %xmm10 addps %xmm6, %xmm11 #endif movlps %xmm8, 0 * SIZE(CO1) movlps %xmm9, 0 * SIZE(CO2) movlps %xmm10, 0 * SIZE(CO1, LDC, 2) movlps %xmm11, 0 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 ALIGN_4 .L40: testq $1, M jle .L49 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax 
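/* Editor's note (descriptive comment; assumes single precision, SIZE = 4):
   TRMM start-offset setup for the one-row tail of the four-column block.
   %rax holds KK and is scaled to a byte offset just below; AO advances by
   one element per K, while BO is advanced twice by %rax*8 because the packed
   buffer holds 16 broadcast floats (64 bytes) per K and a single lea can
   scale by at most 8. */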
leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO leaq (BO, %rax, 8), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 pxor %xmm14, %xmm14 pxor %xmm15, %xmm15 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L45 ALIGN_4 .L41: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movss -32 * SIZE(AO), %xmm0 movss -32 * SIZE(BO), %xmm2 movss -28 * SIZE(BO), %xmm3 movss -24 * SIZE(BO), %xmm4 movss -20 * SIZE(BO), %xmm5 mulss %xmm0, %xmm2 mulss %xmm0, %xmm3 mulss %xmm0, %xmm4 mulss %xmm0, %xmm5 addss %xmm2, %xmm8 addss %xmm3, %xmm9 addss %xmm4, %xmm10 addss %xmm5, %xmm11 movss -31 * SIZE(AO), %xmm0 movss -16 * SIZE(BO), %xmm2 movss -12 * SIZE(BO), %xmm3 movss -8 * SIZE(BO), %xmm4 movss -4 * SIZE(BO), %xmm5 mulss %xmm0, %xmm2 mulss %xmm0, %xmm3 mulss %xmm0, %xmm4 mulss %xmm0, %xmm5 addss %xmm2, %xmm8 addss %xmm3, %xmm9 addss %xmm4, %xmm10 addss %xmm5, %xmm11 movss -30 * SIZE(AO), %xmm0 movss 0 * SIZE(BO), %xmm2 movss 4 * SIZE(BO), %xmm3 movss 8 * SIZE(BO), %xmm4 movss 12 * SIZE(BO), %xmm5 mulss %xmm0, %xmm2 mulss %xmm0, %xmm3 mulss %xmm0, %xmm4 mulss %xmm0, %xmm5 addss %xmm2, %xmm8 addss %xmm3, %xmm9 addss %xmm4, %xmm10 addss %xmm5, %xmm11 movss -29 * SIZE(AO), %xmm0 movss 16 * SIZE(BO), %xmm2 movss 20 * SIZE(BO), %xmm3 movss 24 * SIZE(BO), %xmm4 movss 28 * SIZE(BO), %xmm5 mulss %xmm0, %xmm2 mulss %xmm0, %xmm3 mulss %xmm0, %xmm4 mulss %xmm0, %xmm5 addss %xmm2, %xmm8 addss %xmm3, %xmm9 addss %xmm4, %xmm10 addss %xmm5, %xmm11 subq $ -4 * SIZE, AO subq $-64 * SIZE, BO subq $1, %rax jg .L41 ALIGN_4 .L45: movss ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L48 ALIGN_4 .L46: movss -32 * SIZE(AO), %xmm0 movss -32 * SIZE(BO), %xmm2 movss -28 * SIZE(BO), %xmm3 movss -24 * SIZE(BO), %xmm4 movss -20 * SIZE(BO), %xmm5 mulss %xmm0, %xmm2 mulss %xmm0, %xmm3 mulss %xmm0, %xmm4 mulss %xmm0, %xmm5 addss %xmm2, %xmm8 addss %xmm3, %xmm9 addss %xmm4, %xmm10 addss %xmm5, %xmm11 addq $ 1 * SIZE, AO addq $16 * SIZE, BO subq $1, %rax jg .L46 ALIGN_4 .L48: #ifndef TRMMKERNEL movss 0 * SIZE(CO1), %xmm0 movss 0 * SIZE(CO2), %xmm2 movss 0 * SIZE(CO1, LDC, 2), %xmm4 movss 0 * SIZE(CO2, LDC, 2), %xmm6 #endif mulss %xmm7, %xmm8 mulss %xmm7, %xmm9 mulss %xmm7, %xmm10 mulss %xmm7, %xmm11 #ifndef TRMMKERNEL addss %xmm0, %xmm8 addss %xmm2, %xmm9 addss %xmm4, %xmm10 addss %xmm6, %xmm11 #endif movss %xmm8, 0 * SIZE(CO1) movss %xmm9, 0 * SIZE(CO2) movss %xmm10, 0 * SIZE(CO1, LDC, 2) movss %xmm11, 0 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leaq (C, LDC, 4), C subq $1, J jg .L01 ALIGN_4 .L50: testq $2, N jle .L100 ALIGN_4 .L51: /* Copying to Sub Buffer */ leaq BUFFER, BO #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq K, %rax sarq $3, %rax jle .L53 addq %rax, %rax ALIGN_4 .L52: movaps -32 * SIZE(B), %xmm3 movaps -28 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 
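/* Editor's note: in this copy loop each pshufd ($0x00/$0x55/$0xaa/$0xff)
   broadcasts one element of B across all four SSE lanes before it is stored
   into the on-stack BUFFER, so the compute loops below can use full-width
   mulps against packed rows of A instead of scalar multiplies. */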
pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 prefetcht0 (PREFETCH_W + 0) * SIZE(BO) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 prefetcht0 (PREFETCH_W + 16) * SIZE(BO) movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO subq $1, %rax jne .L52 ALIGN_4 .L53: movq K, %rax andq $7, %rax BRANCH jle .L55 ALIGN_4 .L54: movss -32 * SIZE(B), %xmm8 movss -31 * SIZE(B), %xmm9 shufps $0, %xmm8, %xmm8 shufps $0, %xmm9, %xmm9 movaps %xmm8, 0 * SIZE(BO) movaps %xmm9, 4 * SIZE(BO) addq $2 * SIZE, B addq $8 * SIZE, BO subq $1, %rax jne .L54 ALIGN_4 .L55: movq C, CO1 leaq (C, LDC, 1), CO2 movq A, AO # aoffset = a movq M, I sarq $3, I jle .L70 ALIGN_4 .L60: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 prefetcht0 7 * SIZE(CO1) pxor %xmm12, %xmm12 prefetcht0 7 * SIZE(CO2) pxor %xmm13, %xmm13 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L65 ALIGN_4 .L61: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps -32 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 movaps -28 * SIZE(BO), %xmm4 movaps %xmm4, %xmm5 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 mulps %xmm0, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm12 addps %xmm4, %xmm9 addps %xmm5, %xmm13 movaps -24 * SIZE(AO), %xmm0 movaps -20 * SIZE(AO), %xmm1 movaps -24 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 movaps -20 * SIZE(BO), %xmm4 movaps %xmm4, %xmm5 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 mulps %xmm0, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm12 addps %xmm4, %xmm9 addps %xmm5, %xmm13 movaps -16 * SIZE(AO), %xmm0 movaps -12 * SIZE(AO), %xmm1 movaps -16 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 movaps -12 * SIZE(BO), %xmm4 movaps %xmm4, %xmm5 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 mulps %xmm0, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm12 addps %xmm4, %xmm9 addps %xmm5, %xmm13 movaps -8 * SIZE(AO), %xmm0 movaps -4 * SIZE(AO), %xmm1 movaps -8 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 movaps -4 * SIZE(BO), %xmm4 movaps %xmm4, %xmm5 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 mulps %xmm0, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm12 addps %xmm4, %xmm9 addps %xmm5, %xmm13 subq $-32 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax jg .L61 ALIGN_4 .L65: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L68 ALIGN_4 .L66: movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps -32 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 movaps -28 * SIZE(BO), %xmm4 movaps %xmm4, %xmm5 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 mulps %xmm0, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm12 addps %xmm4, %xmm9 addps %xmm5, %xmm13 addq $8 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax jg .L66 ALIGN_4 .L68: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), 
%xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 movsd 0 * SIZE(CO2), %xmm2 movhps 2 * SIZE(CO2), %xmm2 movsd 4 * SIZE(CO2), %xmm3 movhps 6 * SIZE(CO2), %xmm3 #endif mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 mulps %xmm7, %xmm12 mulps %xmm7, %xmm13 #ifndef TRMMKERNEL addps %xmm0, %xmm8 addps %xmm1, %xmm12 addps %xmm2, %xmm9 addps %xmm3, %xmm13 #endif movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) movlps %xmm9, 0 * SIZE(CO2) movhps %xmm9, 2 * SIZE(CO2) movlps %xmm13, 4 * SIZE(CO2) movhps %xmm13, 6 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 subq $1, I jg .L60 ALIGN_4 .L70: testq $4, M jle .L80 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L75 ALIGN_4 .L71: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps -32 * SIZE(BO), %xmm2 movaps -28 * SIZE(BO), %xmm3 movaps -24 * SIZE(BO), %xmm4 movaps -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 movaps -24 * SIZE(AO), %xmm0 movaps -20 * SIZE(AO), %xmm1 movaps -16 * SIZE(BO), %xmm2 movaps -12 * SIZE(BO), %xmm3 movaps -8 * SIZE(BO), %xmm4 movaps -4 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 subq $-16 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax jg .L71 ALIGN_4 .L75: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L78 ALIGN_4 .L76: movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm2 movaps -28 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addq $4 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax jg .L76 ALIGN_4 .L78: addps %xmm10, %xmm8 addps %xmm11, %xmm9 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm2 movhps 2 * SIZE(CO2), %xmm2 #endif mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 #ifndef TRMMKERNEL addps %xmm0, %xmm8 addps %xmm2, %xmm9 #endif movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm9, 0 * SIZE(CO2) movhps %xmm9, 2 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 ALIGN_4 .L80: testq $2, M jle 
.L90 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L85 ALIGN_4 .L81: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -32 * SIZE(AO), %xmm0 movsd -30 * SIZE(AO), %xmm1 movsd -32 * SIZE(BO), %xmm2 movsd -28 * SIZE(BO), %xmm3 movsd -24 * SIZE(BO), %xmm4 movsd -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 movsd -28 * SIZE(AO), %xmm0 movsd -26 * SIZE(AO), %xmm1 movsd -16 * SIZE(BO), %xmm2 movsd -12 * SIZE(BO), %xmm3 movsd -8 * SIZE(BO), %xmm4 movsd -4 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 subq $ -8 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax jg .L81 ALIGN_4 .L85: movsd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L88 ALIGN_4 .L86: movsd -32 * SIZE(AO), %xmm0 movsd -32 * SIZE(BO), %xmm2 movsd -28 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax jg .L86 ALIGN_4 .L88: addps %xmm10, %xmm8 addps %xmm11, %xmm9 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm2 #endif mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 #ifndef TRMMKERNEL addps %xmm0, %xmm8 addps %xmm2, %xmm9 #endif movlps %xmm8, 0 * SIZE(CO1) movlps %xmm9, 0 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 ALIGN_4 .L90: testq $1, M jle .L99 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L95 ALIGN_4 .L91: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movss -32 * SIZE(AO), %xmm0 movss -31 * SIZE(AO), %xmm1 movss -32 * SIZE(BO), %xmm2 movss -28 * SIZE(BO), %xmm3 movss -24 * SIZE(BO), %xmm4 movss -20 * SIZE(BO), %xmm5 mulss %xmm0, %xmm2 mulss %xmm0, %xmm3 mulss %xmm1, %xmm4 mulss %xmm1, %xmm5 addss %xmm2, %xmm8 addss %xmm3, %xmm9 addss %xmm4, %xmm10 addss %xmm5, %xmm11 movss -30 * SIZE(AO), %xmm0 movss -29 * SIZE(AO), %xmm1 movss -16 * SIZE(BO), %xmm2 
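/* Editor's note: scalar (movss/mulss) cleanup path for a single row of C and
   two columns, with K unrolled by four; the BO offsets step in units of four
   floats because every B value was broadcast into a 4-wide slot when it was
   packed into BUFFER. */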
movss -12 * SIZE(BO), %xmm3 movss -8 * SIZE(BO), %xmm4 movss -4 * SIZE(BO), %xmm5 mulss %xmm0, %xmm2 mulss %xmm0, %xmm3 mulss %xmm1, %xmm4 mulss %xmm1, %xmm5 addss %xmm2, %xmm8 addss %xmm3, %xmm9 addss %xmm4, %xmm10 addss %xmm5, %xmm11 subq $ -4 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax jg .L91 ALIGN_4 .L95: movss ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L98 ALIGN_4 .L96: movss -32 * SIZE(AO), %xmm0 movss -32 * SIZE(BO), %xmm2 movss -28 * SIZE(BO), %xmm3 mulss %xmm0, %xmm2 mulss %xmm0, %xmm3 addss %xmm2, %xmm8 addss %xmm3, %xmm9 addq $1 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax jg .L96 ALIGN_4 .L98: addss %xmm10, %xmm8 addss %xmm11, %xmm9 #ifndef TRMMKERNEL movss 0 * SIZE(CO1), %xmm0 movss 0 * SIZE(CO2), %xmm2 #endif mulss %xmm7, %xmm8 mulss %xmm7, %xmm9 #ifndef TRMMKERNEL addss %xmm0, %xmm8 addss %xmm2, %xmm9 #endif movss %xmm8, 0 * SIZE(CO1) movss %xmm9, 0 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leaq (C, LDC, 2), C ALIGN_4 .L100: testq $1, N jle .L999 ALIGN_4 .L101: /* Copying to Sub Buffer */ leaq BUFFER, BO #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq K, %rax sarq $4, %rax jle .L103 addq %rax, %rax ALIGN_4 .L102: movss -32 * SIZE(B), %xmm0 movss -31 * SIZE(B), %xmm1 movss -30 * SIZE(B), %xmm2 movss -29 * SIZE(B), %xmm3 movss -28 * SIZE(B), %xmm4 movss -27 * SIZE(B), %xmm5 movss -26 * SIZE(B), %xmm6 movss -25 * SIZE(B), %xmm7 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 shufps $0, %xmm4, %xmm4 shufps $0, %xmm5, %xmm5 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B subq $-32 * SIZE, BO subq $1, %rax jne .L102 ALIGN_4 .L103: movq K, %rax andq $15, %rax BRANCH jle .L105 ALIGN_4 .L104: movss -32 * SIZE(B), %xmm8 shufps $0, %xmm8, %xmm8 movaps %xmm8, 0 * SIZE(BO) addq $1 * SIZE, B addq $4 * SIZE, BO subq $1, %rax jne .L104 ALIGN_4 .L105: movq C, CO1 movq A, AO movq M, I sarq $3, I jle .L120 ALIGN_4 .L110: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 prefetcht0 7 * SIZE(CO1) pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L115 ALIGN_4 .L111: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps -32 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm2, %xmm8 addps %xmm3, %xmm12 movaps -24 * SIZE(AO), %xmm0 movaps -20 * SIZE(AO), %xmm1 movaps 
-28 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm2, %xmm9 addps %xmm3, %xmm13 movaps -16 * SIZE(AO), %xmm0 movaps -12 * SIZE(AO), %xmm1 movaps -24 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm2, %xmm8 addps %xmm3, %xmm12 movaps -8 * SIZE(AO), %xmm0 movaps -4 * SIZE(AO), %xmm1 movaps -20 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm2, %xmm9 addps %xmm3, %xmm13 subq $-32 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jg .L111 ALIGN_4 .L115: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L118 ALIGN_4 .L116: movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps -32 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm2, %xmm8 addps %xmm3, %xmm12 addq $8 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L116 ALIGN_4 .L118: addps %xmm9, %xmm8 addps %xmm13, %xmm12 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 #endif mulps %xmm7, %xmm8 mulps %xmm7, %xmm12 #ifndef TRMMKERNEL addps %xmm0, %xmm8 addps %xmm1, %xmm12 #endif movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 subq $1, I jg .L110 ALIGN_4 .L120: testq $4, M jle .L130 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L125 ALIGN_4 .L121: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps -32 * SIZE(BO), %xmm2 movaps -28 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm2, %xmm8 addps %xmm3, %xmm9 movaps -24 * SIZE(AO), %xmm0 movaps -20 * SIZE(AO), %xmm1 movaps -24 * SIZE(BO), %xmm2 movaps -20 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm2, %xmm10 addps %xmm3, %xmm11 subq $-16 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jg .L121 ALIGN_4 .L125: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L128 ALIGN_4 .L126: movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm8 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L126 ALIGN_4 .L128: addps %xmm10, %xmm8 addps %xmm11, %xmm9 addps %xmm9, %xmm8 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 #endif mulps %xmm7, %xmm8 #ifndef TRMMKERNEL addps %xmm0, %xmm8 #endif movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 ALIGN_4 .L130: testq $2, M jle .L140 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L135 ALIGN_4 .L131: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -32 * SIZE(AO), %xmm0 movsd -30 * SIZE(AO), %xmm1 movsd -32 * SIZE(BO), %xmm2 movsd -28 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm2, %xmm8 addps %xmm3, %xmm9 movsd -28 * SIZE(AO), %xmm0 movsd -26 * SIZE(AO), %xmm1 movsd -24 * SIZE(BO), %xmm2 movsd -20 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm2, %xmm10 addps %xmm3, %xmm11 subq $ -8 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jg .L131 ALIGN_4 .L135: movsd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L138 ALIGN_4 .L136: movsd -32 * SIZE(AO), %xmm0 movsd -32 * SIZE(BO), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm8 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L136 ALIGN_4 .L138: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 #endif addps %xmm10, %xmm8 addps %xmm11, %xmm9 addps %xmm9, %xmm8 mulps %xmm7, %xmm8 #ifndef TRMMKERNEL addps %xmm0, %xmm8 #endif movlps %xmm8, 0 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 ALIGN_4 .L140: testq $1, M jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L145 ALIGN_4 .L141: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movss -32 * SIZE(AO), %xmm0 movss -31 * SIZE(AO), %xmm1 movss -32 * SIZE(BO), %xmm2 movss -28 * SIZE(BO), %xmm3 mulss %xmm0, %xmm2 mulss %xmm1, %xmm3 addss %xmm2, %xmm8 addss %xmm3, %xmm9 movss -30 * SIZE(AO), %xmm0 movss -29 * SIZE(AO), %xmm1 movss -24 * SIZE(BO), %xmm2 movss -20 * SIZE(BO), %xmm3 mulss %xmm0, %xmm2 mulss %xmm1, %xmm3 addss %xmm2, %xmm10 addss %xmm3, %xmm11 subq $ -4 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jg .L141 ALIGN_4 .L145: movss ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax 
# if (k & 1) je .L148 ALIGN_4 .L146: movss -32 * SIZE(AO), %xmm0 movss -32 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm8 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L146 ALIGN_4 .L148: #ifndef TRMMKERNEL movss 0 * SIZE(CO1), %xmm0 #endif addss %xmm10, %xmm8 addss %xmm11, %xmm9 addss %xmm9, %xmm8 mulss %xmm7, %xmm8 #ifndef TRMMKERNEL addss %xmm0, %xmm8 #endif movss %xmm8, 0 * SIZE(CO1) ALIGN_4 .L999: movq %r15, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_kernel_8x4_penryn.S000066400000000000000000001323331313527062700220760ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define BB %r12 #define PREA %rdx #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define ALPHA 48(%rsp) #define J 56(%rsp) #define OFFSET 64(%rsp) #define KK 72(%rsp) #define KKK 80(%rsp) #else #define STACKSIZE 512 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define ALPHA 224(%rsp) #define J 232(%rsp) #define OFFSET 240(%rsp) #define KK 248(%rsp) #define KKK 256(%rsp) #endif #define PREFETCHSIZE (8 * 17 + 4) #define PREFETCH prefetcht0 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif movaps %xmm3, %xmm0 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif #endif unpcklps %xmm0, %xmm0 movlps %xmm0, ALPHA subq $-32 * SIZE, A subq $-32 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K salq $BASE_SHIFT, LDC #ifdef TRMMKERNEL movq %r11, OFFSET #ifndef LEFT negq %r11 #endif movq %r11, KK #endif movq N, J sarq $2, J NOBRANCH jle .L50 ALIGN_4 .L10: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC, 1), CO2 movq A, AO movq K, %rax salq $BASE_SHIFT + 2, %rax leaq (B, %rax), BB movq M, I sarq $3, I NOBRANCH jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm0 xorpd %xmm3, %xmm3 movaps -28 * SIZE(AO), %xmm1 xorpd %xmm4, %xmm4 movaps -32 * SIZE(BO), %xmm2 xorpd %xmm5, %xmm5 prefetcht0 -32 * SIZE(BB) xorpd %xmm6, %xmm6 prefetcht2 7 * SIZE(CO1) movapd %xmm4, %xmm8 movapd %xmm4, %xmm9 prefetcht2 7 * SIZE(CO2) movapd %xmm4, %xmm10 movapd %xmm4, %xmm11 prefetcht2 7 * SIZE(CO1, LDC, 2) movapd %xmm4, %xmm12 movaps %xmm4, %xmm13 prefetcht2 7 * SIZE(CO2, LDC, 2) movaps %xmm4, %xmm14 movaps %xmm4, %xmm15 subq $-24 * SIZE, BB leaq (PREFETCHSIZE + 0) * SIZE(AO), PREA #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH -32 * SIZE(PREA) addps %xmm6, %xmm10 addps %xmm3, %xmm14 movaps %xmm2, %xmm3 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 addps %xmm5, %xmm15 movaps %xmm7, 
%xmm5 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 movaps -28 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -20 * SIZE(AO), %xmm1 addps %xmm6, %xmm10 addps %xmm3, %xmm14 movaps %xmm2, %xmm3 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 movaps -24 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -12 * SIZE(AO), %xmm1 addps %xmm6, %xmm10 addps %xmm3, %xmm14 movaps %xmm2, %xmm3 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 addps %xmm5, %xmm15 PREFETCH -16 * SIZE(PREA) movaps %xmm7, %xmm5 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 movaps -20 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -8 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -4 * SIZE(AO), %xmm1 addps %xmm6, %xmm10 addps %xmm3, %xmm14 movaps %xmm2, %xmm3 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 movaps -16 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps 0 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps 4 * SIZE(AO), %xmm1 addps %xmm6, %xmm10 addps %xmm3, %xmm14 PREFETCH 0 * SIZE(PREA) movaps %xmm2, %xmm3 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps 8 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps 12 * SIZE(AO), %xmm1 addps %xmm6, %xmm10 addps %xmm3, %xmm14 movaps %xmm2, %xmm3 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 movaps -8 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps 16 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps 20 * SIZE(AO), %xmm1 addps %xmm6, %xmm10 addps %xmm3, %xmm14 PREFETCH 16 * SIZE(PREA) movaps %xmm2, %xmm3 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 movaps -4 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 
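/* Editor's note: unlike the buffered kernels above, this loop keeps B as
   packed 4-float vectors and repeatedly rotates them with pshufd $0x39
   (lane order 1,2,3,0) so that each of the four columns multiplies the same
   A registers in turn; the lane-rotated accumulators are restored to column
   order by the shufps $0xd8 sequence before the ALPHA scaling at .L18. */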
movaps %xmm6, %xmm3 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps 24 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps 28 * SIZE(AO), %xmm1 addps %xmm6, %xmm10 addps %xmm3, %xmm14 movaps %xmm2, %xmm3 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 subq $-64 * SIZE, AO pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 movaps 0 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 movaps %xmm6, %xmm3 subq $-32 * SIZE, BO pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -32 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -28 * SIZE(AO), %xmm1 subq $-64 * SIZE, PREA subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: prefetcht0 -16 * SIZE(BB) #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addps %xmm6, %xmm10 addps %xmm3, %xmm14 movaps %xmm2, %xmm3 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 movaps -28 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -20 * SIZE(AO), %xmm1 addq $8 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: addps %xmm6, %xmm10 addps %xmm3, %xmm14 addps %xmm4, %xmm11 addps %xmm5, %xmm15 movddup ALPHA, %xmm3 movaps %xmm9, %xmm4 shufps $0xd8, %xmm8, %xmm9 shufps $0xd8, %xmm11, %xmm8 shufps $0xd8, %xmm10, %xmm11 shufps $0xd8, %xmm4, %xmm10 movaps %xmm8, %xmm4 shufps $0xd8, %xmm10, %xmm8 shufps $0xd8, %xmm4, %xmm10 movaps %xmm9, %xmm5 shufps $0xd8, %xmm11, %xmm9 shufps $0xd8, %xmm5, %xmm11 movaps %xmm13, %xmm4 shufps $0xd8, %xmm12, %xmm13 shufps $0xd8, %xmm15, %xmm12 shufps $0xd8, %xmm14, %xmm15 shufps $0xd8, %xmm4, %xmm14 movaps %xmm12, %xmm4 shufps $0xd8, %xmm14, %xmm12 shufps $0xd8, %xmm4, %xmm14 movaps %xmm13, %xmm5 shufps $0xd8, %xmm15, %xmm13 shufps $0xd8, %xmm5, %xmm15 mulps %xmm3, %xmm8 mulps %xmm3, %xmm9 mulps %xmm3, %xmm10 mulps %xmm3, %xmm11 mulps %xmm3, %xmm12 mulps %xmm3, %xmm13 mulps %xmm3, %xmm14 mulps %xmm3, %xmm15 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 movsd 0 * SIZE(CO2), %xmm2 movhps 2 * SIZE(CO2), %xmm2 movsd 4 * SIZE(CO2), %xmm3 movhps 6 * SIZE(CO2), %xmm3 movsd 0 * SIZE(CO1, LDC, 2), %xmm4 movhps 2 * SIZE(CO1, LDC, 2), %xmm4 movsd 4 * SIZE(CO1, LDC, 2), %xmm5 movhps 6 * SIZE(CO1, LDC, 2), %xmm5 movsd 0 * SIZE(CO2, LDC, 2), %xmm6 movhps 2 * SIZE(CO2, LDC, 2), %xmm6 movsd 4 * SIZE(CO2, LDC, 2), %xmm7 movhps 6 * SIZE(CO2, LDC, 2), %xmm7 addps %xmm0, %xmm8 addps %xmm1, %xmm12 addps %xmm2, %xmm9 addps %xmm3, %xmm13 addps %xmm4, %xmm10 addps %xmm5, %xmm14 addps %xmm6, %xmm11 addps %xmm7, %xmm15 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movsd %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhps %xmm9, 2 * SIZE(CO2) movsd %xmm13, 4 * SIZE(CO2) movhps %xmm13, 6 * SIZE(CO2) movsd %xmm10, 0 * SIZE(CO1, LDC, 2) movhps %xmm10, 2 * SIZE(CO1, LDC, 2) movsd %xmm14, 4 * SIZE(CO1, LDC, 2) movhps %xmm14, 
6 * SIZE(CO1, LDC, 2) movsd %xmm11, 0 * SIZE(CO2, LDC, 2) movhps %xmm11, 2 * SIZE(CO2, LDC, 2) movsd %xmm15, 4 * SIZE(CO2, LDC, 2) movhps %xmm15, 6 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 decq I BRANCH jg .L11 ALIGN_4 .L20: testq $4, M BRANCH jle .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 movaps -32 * SIZE(BO), %xmm2 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 movaps %xmm4, %xmm8 movaps %xmm4, %xmm9 movaps %xmm4, %xmm10 movaps %xmm4, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_3 .L22: addps %xmm6, %xmm10 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 addps %xmm2, %xmm8 movaps -28 * SIZE(BO), %xmm2 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 addps %xmm7, %xmm9 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addps %xmm6, %xmm10 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 addps %xmm2, %xmm8 movaps -24 * SIZE(BO), %xmm2 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 addps %xmm7, %xmm9 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 addps %xmm6, %xmm10 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 addps %xmm2, %xmm8 movaps -20 * SIZE(BO), %xmm2 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 addps %xmm7, %xmm9 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm0 addps %xmm6, %xmm10 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 addps %xmm2, %xmm8 movaps -16 * SIZE(BO), %xmm2 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 subq $-16 * SIZE, AO addps %xmm7, %xmm9 mulps %xmm0, %xmm4 movaps -32 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L22 ALIGN_3 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH je .L28 ALIGN_3 .L26: addps %xmm6, %xmm10 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 addps %xmm2, %xmm8 movaps -28 * SIZE(BO), %xmm2 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 addps %xmm7, %xmm9 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_3 .L28: addps %xmm6, %xmm10 addps %xmm4, %xmm11 movddup ALPHA, %xmm3 movaps %xmm9, %xmm4 shufps $0xd8, %xmm8, %xmm9 shufps $0xd8, %xmm11, %xmm8 shufps $0xd8, %xmm10, %xmm11 shufps $0xd8, %xmm4, %xmm10 movaps %xmm8, %xmm4 shufps $0xd8, %xmm10, %xmm8 shufps $0xd8, %xmm4, %xmm10 movaps %xmm9, %xmm5 shufps $0xd8, %xmm11, %xmm9 shufps $0xd8, %xmm5, %xmm11 mulps %xmm3, %xmm8 mulps %xmm3, %xmm9 mulps %xmm3, %xmm10 mulps %xmm3, %xmm11 #ifndef 
TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm2 movhps 2 * SIZE(CO2), %xmm2 movsd 0 * SIZE(CO1, LDC, 2), %xmm4 movhps 2 * SIZE(CO1, LDC, 2), %xmm4 movsd 0 * SIZE(CO2, LDC, 2), %xmm6 movhps 2 * SIZE(CO2, LDC, 2), %xmm6 addps %xmm0, %xmm8 addps %xmm2, %xmm9 addps %xmm4, %xmm10 addps %xmm6, %xmm11 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhps %xmm9, 2 * SIZE(CO2) movsd %xmm10, 0 * SIZE(CO1, LDC, 2) movhps %xmm10, 2 * SIZE(CO1, LDC, 2) movsd %xmm11, 0 * SIZE(CO2, LDC, 2) movhps %xmm11, 2 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 ALIGN_4 .L30: testq $2, M BRANCH jle .L40 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 movaps -32 * SIZE(BO), %xmm2 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 movaps %xmm4, %xmm8 movaps %xmm4, %xmm9 movaps %xmm4, %xmm10 movaps %xmm4, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_3 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm0, %xmm1 addps %xmm3, %xmm8 pshufd $0x50, %xmm2, %xmm3 mulps %xmm1, %xmm3 addps %xmm4, %xmm9 pshufd $0xfa, %xmm2, %xmm4 movaps -28 * SIZE(BO), %xmm2 mulps %xmm1, %xmm4 pshufd $0xee, %xmm0, %xmm1 movaps -28 * SIZE(AO), %xmm0 addps %xmm3, %xmm10 pshufd $0x50, %xmm2, %xmm3 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm2, %xmm4 movaps -24 * SIZE(BO), %xmm2 mulps %xmm1, %xmm4 pshufd $0x44, %xmm0, %xmm1 addps %xmm3, %xmm8 pshufd $0x50, %xmm2, %xmm3 mulps %xmm1, %xmm3 addps %xmm4, %xmm9 pshufd $0xfa, %xmm2, %xmm4 movaps -20 * SIZE(BO), %xmm2 mulps %xmm1, %xmm4 pshufd $0xee, %xmm0, %xmm1 movaps -24 * SIZE(AO), %xmm0 addps %xmm3, %xmm10 pshufd $0x50, %xmm2, %xmm3 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm2, %xmm4 movaps -16 * SIZE(BO), %xmm2 mulps %xmm1, %xmm4 subq $-8 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L32 ALIGN_3 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH je .L38 ALIGN_3 .L36: pshufd $0x44, %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 pshufd $0x50, %xmm2, %xmm3 mulps %xmm1, %xmm3 addps %xmm4, %xmm9 pshufd $0xfa, %xmm2, %xmm4 movaps -28 * SIZE(BO), %xmm2 mulps %xmm1, %xmm4 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_3 .L38: movddup ALPHA, %xmm2 addps %xmm10, %xmm8 addps %xmm11, %xmm9 addps %xmm3, %xmm8 addps %xmm4, %xmm9 mulps %xmm2, %xmm8 mulps %xmm2, %xmm9 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 0 * SIZE(CO2), %xmm0 movsd 0 * SIZE(CO1, LDC, 2), %xmm1 movhps 0 * SIZE(CO2, LDC, 2), %xmm1 addps %xmm0, %xmm8 addps %xmm1, %xmm9 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 0 * SIZE(CO2) movsd %xmm9, 0 * SIZE(CO1, LDC, 2) movhps %xmm9, 0 * SIZE(CO2, LDC, 2) #if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 ALIGN_4 .L40: testq $1, M BRANCH jle .L49 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movsd -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -32 * SIZE(BO), %xmm2 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L45 ALIGN_3 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 mulps %xmm1, %xmm2 addps %xmm2, %xmm8 movaps -28 * SIZE(BO), %xmm2 pshufd $0x00, %xmm0, %xmm1 movss -30 * SIZE(AO), %xmm0 mulps %xmm1, %xmm2 addps %xmm2, %xmm9 movaps -24 * SIZE(BO), %xmm2 pshufd $0x00, %xmm0, %xmm1 movss -29 * SIZE(AO), %xmm0 mulps %xmm1, %xmm2 addps %xmm2, %xmm8 movaps -20 * SIZE(BO), %xmm2 pshufd $0x00, %xmm0, %xmm1 movss -28 * SIZE(AO), %xmm0 mulps %xmm1, %xmm2 addps %xmm2, %xmm9 movaps -16 * SIZE(BO), %xmm2 subq $ -4 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L42 ALIGN_3 .L45: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH je .L48 ALIGN_3 .L46: pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 mulps %xmm1, %xmm2 addps %xmm2, %xmm8 movaps -28 * SIZE(BO), %xmm2 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L46 ALIGN_3 .L48: movddup ALPHA, %xmm2 addps %xmm9, %xmm8 mulps %xmm2, %xmm8 pshufd $0xff, %xmm8, %xmm11 pshufd $0xaa, %xmm8, %xmm10 pshufd $0x55, %xmm8, %xmm9 pshufd $0x00, %xmm8, %xmm8 #ifndef TRMMKERNEL addss 0 * SIZE(CO1), %xmm8 addss 0 * SIZE(CO2), %xmm9 addss 0 * SIZE(CO1, LDC, 2), %xmm10 addss 0 * SIZE(CO2, LDC, 2), %xmm11 #endif movss %xmm8, 0 * SIZE(CO1) movss %xmm9, 0 * SIZE(CO2) movss %xmm10, 0 * SIZE(CO1, LDC, 2) movss %xmm11, 0 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK #endif movq BO, B leaq (C, LDC, 4), C subq $1, J BRANCH jg .L10 ALIGN_4 .L50: testq $2, N jle .L90 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC, 1), CO2 movq A, AO movq K, %rax salq $BASE_SHIFT + 1, %rax leaq (B, %rax), BB movq M, I sarq $3, I NOBRANCH jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 2), BO #endif prefetcht2 -32 * SIZE(BB) subq $-8 * SIZE, BB movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 movaps -28 * SIZE(AO), %xmm1 xorps %xmm4, %xmm4 movaps -32 * SIZE(BO), %xmm2 xorps 
%xmm5, %xmm5 xorps %xmm6, %xmm6 prefetcht0 7 * SIZE(CO1) movaps %xmm4, %xmm8 movaps %xmm4, %xmm9 prefetcht0 7 * SIZE(CO2) movaps %xmm4, %xmm10 movaps %xmm4, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_3 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0x55, %xmm2, %xmm4 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 addps %xmm5, %xmm10 pshufd $0x00, %xmm2, %xmm5 mulps %xmm1, %xmm5 addps %xmm6, %xmm11 pshufd $0x55, %xmm2, %xmm6 mulps %xmm1, %xmm6 movaps -20 * SIZE(AO), %xmm1 addps %xmm3, %xmm8 pshufd $0xaa, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0xff, %xmm2, %xmm4 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addps %xmm5, %xmm10 pshufd $0xaa, %xmm2, %xmm5 mulps %xmm1, %xmm5 addps %xmm6, %xmm11 pshufd $0xff, %xmm2, %xmm6 movaps -28 * SIZE(BO), %xmm2 mulps %xmm1, %xmm6 movaps -12 * SIZE(AO), %xmm1 addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0x55, %xmm2, %xmm4 mulps %xmm0, %xmm4 movaps -8 * SIZE(AO), %xmm0 addps %xmm5, %xmm10 pshufd $0x00, %xmm2, %xmm5 mulps %xmm1, %xmm5 addps %xmm6, %xmm11 pshufd $0x55, %xmm2, %xmm6 mulps %xmm1, %xmm6 movaps -4 * SIZE(AO), %xmm1 addps %xmm3, %xmm8 pshufd $0xaa, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0xff, %xmm2, %xmm4 mulps %xmm0, %xmm4 movaps 0 * SIZE(AO), %xmm0 addps %xmm5, %xmm10 pshufd $0xaa, %xmm2, %xmm5 mulps %xmm1, %xmm5 addps %xmm6, %xmm11 pshufd $0xff, %xmm2, %xmm6 movaps -24 * SIZE(BO), %xmm2 mulps %xmm1, %xmm6 movaps 4 * SIZE(AO), %xmm1 subq $-32 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax BRANCH jg .L52 ALIGN_3 .L55: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_3 .L56: addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0x55, %xmm2, %xmm4 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 addps %xmm5, %xmm10 pshufd $0x00, %xmm2, %xmm5 mulps %xmm1, %xmm5 addps %xmm6, %xmm11 pshufd $0x55, %xmm2, %xmm6 movsd -30 * SIZE(BO), %xmm2 mulps %xmm1, %xmm6 movaps -20 * SIZE(AO), %xmm1 addq $8 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_3 .L58: movddup ALPHA, %xmm7 addps %xmm3, %xmm8 addps %xmm4, %xmm9 addps %xmm5, %xmm10 addps %xmm6, %xmm11 mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 mulps %xmm7, %xmm10 mulps %xmm7, %xmm11 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 movsd 0 * SIZE(CO2), %xmm2 movhps 2 * SIZE(CO2), %xmm2 movsd 4 * SIZE(CO2), %xmm3 movhps 6 * SIZE(CO2), %xmm3 addps %xmm0, %xmm8 addps %xmm1, %xmm10 addps %xmm2, %xmm9 addps %xmm3, %xmm11 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movsd %xmm10, 4 * SIZE(CO1) movhps %xmm10, 6 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhps %xmm9, 2 * SIZE(CO2) movsd %xmm11, 4 * SIZE(CO2) movhps %xmm11, 6 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 addq $8 * SIZE, 
CO2 decq I BRANCH jg .L51 ALIGN_4 .L60: testq $4, M BRANCH jle .L70 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 movaps -32 * SIZE(BO), %xmm2 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_3 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0x55, %xmm2, %xmm4 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addps %xmm3, %xmm10 pshufd $0xaa, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xff, %xmm2, %xmm4 movaps -28 * SIZE(BO), %xmm2 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0x55, %xmm2, %xmm4 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm0 addps %xmm3, %xmm10 pshufd $0xaa, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xff, %xmm2, %xmm4 movaps -24 * SIZE(BO), %xmm2 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax BRANCH jg .L62 ALIGN_3 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH je .L68 ALIGN_3 .L66: addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0x55, %xmm2, %xmm4 movsd -30 * SIZE(BO), %xmm2 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_3 .L68: movddup ALPHA, %xmm7 addps %xmm10, %xmm8 addps %xmm11, %xmm9 addps %xmm3, %xmm8 addps %xmm4, %xmm9 mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm2 movhps 2 * SIZE(CO2), %xmm2 addps %xmm0, %xmm8 addps %xmm2, %xmm9 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhps %xmm9, 2 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 ALIGN_4 .L70: testq $2, M BRANCH jle .L80 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 movaps -32 * SIZE(BO), %xmm2 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L75 ALIGN_3 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm3, %xmm8 pshufd 
$0x44, %xmm0, %xmm1 pshufd $0x50, %xmm2, %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm9 pshufd $0xee, %xmm0, %xmm1 movaps -28 * SIZE(AO), %xmm0 pshufd $0xfa, %xmm2, %xmm3 movaps -28 * SIZE(BO), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm8 pshufd $0x44, %xmm0, %xmm1 pshufd $0x50, %xmm2, %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm9 pshufd $0xee, %xmm0, %xmm1 movaps -24 * SIZE(AO), %xmm0 pshufd $0xfa, %xmm2, %xmm3 movaps -24 * SIZE(BO), %xmm2 mulps %xmm1, %xmm3 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L72 ALIGN_3 .L75: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH je .L78 ALIGN_3 .L76: addps %xmm3, %xmm8 pshufd $0x44, %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 pshufd $0x50, %xmm2, %xmm3 movsd -30 * SIZE(BO), %xmm2 mulps %xmm1, %xmm3 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L76 ALIGN_3 .L78: movddup ALPHA, %xmm2 addps %xmm9, %xmm8 addps %xmm3, %xmm8 mulps %xmm2, %xmm8 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 0 * SIZE(CO2), %xmm0 addps %xmm0, %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 0 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 ALIGN_4 .L80: testq $1, M BRANCH jle .L89 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif movsd -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movsd -32 * SIZE(BO), %xmm2 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L85 ALIGN_3 .L82: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 mulps %xmm1, %xmm2 addps %xmm2, %xmm8 movsd -30 * SIZE(BO), %xmm2 pshufd $0x00, %xmm0, %xmm1 movss -30 * SIZE(AO), %xmm0 mulps %xmm1, %xmm2 addps %xmm2, %xmm9 movsd -28 * SIZE(BO), %xmm2 pshufd $0x00, %xmm0, %xmm1 movss -29 * SIZE(AO), %xmm0 mulps %xmm1, %xmm2 addps %xmm2, %xmm8 movsd -26 * SIZE(BO), %xmm2 pshufd $0x00, %xmm0, %xmm1 movss -28 * SIZE(AO), %xmm0 mulps %xmm1, %xmm2 addps %xmm2, %xmm9 movsd -24 * SIZE(BO), %xmm2 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L82 ALIGN_3 .L85: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH je .L88 ALIGN_3 .L86: pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 mulps %xmm1, %xmm2 addps %xmm2, %xmm8 movsd -30 * SIZE(BO), %xmm2 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L86 ALIGN_3 .L88: movddup ALPHA, %xmm2 addps %xmm9, %xmm8 mulps %xmm2, %xmm8 pshufd $0x55, %xmm8, %xmm9 pshufd $0x00, %xmm8, %xmm8 #ifndef TRMMKERNEL addss 0 * SIZE(CO1), %xmm8 addss 0 * SIZE(CO2), %xmm9 #endif movss %xmm8, 0 * SIZE(CO1) movss %xmm9, 0 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #if 
defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L89: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif movq BO, B leaq (C, LDC, 2), C ALIGN_4 .L90: testq $1, N jle .L999 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 movq A, AO movq M, I sarq $3, I NOBRANCH jle .L100 ALIGN_4 .L91: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 1), BO #endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -28 * SIZE(AO), %xmm1 xorps %xmm9, %xmm9 movsd -32 * SIZE(BO), %xmm2 xorps %xmm10, %xmm10 prefetcht0 7 * SIZE(CO1) xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L95 ALIGN_3 .L92: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x00, %xmm2, %xmm3 mulps %xmm3, %xmm0 addps %xmm0, %xmm8 movaps -24 * SIZE(AO), %xmm0 mulps %xmm3, %xmm1 addps %xmm1, %xmm9 movaps -20 * SIZE(AO), %xmm1 pshufd $0x55, %xmm2, %xmm3 movsd -30 * SIZE(BO), %xmm2 mulps %xmm3, %xmm0 addps %xmm0, %xmm10 movaps -16 * SIZE(AO), %xmm0 mulps %xmm3, %xmm1 addps %xmm1, %xmm11 movaps -12 * SIZE(AO), %xmm1 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) pshufd $0x00, %xmm2, %xmm3 mulps %xmm3, %xmm0 addps %xmm0, %xmm8 movaps -8 * SIZE(AO), %xmm0 mulps %xmm3, %xmm1 addps %xmm1, %xmm9 movaps -4 * SIZE(AO), %xmm1 pshufd $0x55, %xmm2, %xmm3 movsd -28 * SIZE(BO), %xmm2 mulps %xmm3, %xmm0 addps %xmm0, %xmm10 movaps 0 * SIZE(AO), %xmm0 mulps %xmm3, %xmm1 addps %xmm1, %xmm11 movaps 4 * SIZE(AO), %xmm1 subq $-32 * SIZE, AO subq $ -4 * SIZE, BO subq $1, %rax BRANCH jg .L92 ALIGN_3 .L95: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L98 ALIGN_3 .L96: pshufd $0x00, %xmm2, %xmm3 movss -31 * SIZE(BO), %xmm2 mulps %xmm3, %xmm0 addps %xmm0, %xmm8 movaps -24 * SIZE(AO), %xmm0 mulps %xmm3, %xmm1 addps %xmm1, %xmm9 movaps -20 * SIZE(AO), %xmm1 addq $8 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L96 ALIGN_3 .L98: movddup ALPHA, %xmm7 addps %xmm10, %xmm8 addps %xmm11, %xmm9 mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 addps %xmm0, %xmm8 addps %xmm1, %xmm9 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movsd %xmm9, 4 * SIZE(CO1) movhps %xmm9, 6 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 decq I BRANCH jg .L91 ALIGN_4 .L100: testq $4, M BRANCH jle .L110 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movsd -32 * SIZE(BO), %xmm2 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L105 ALIGN_3 .L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 movaps -28 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 pshufd $0x55, %xmm2, %xmm3 movsd -30 * SIZE(BO), %xmm2 mulps %xmm0, %xmm3 movaps -24 * SIZE(AO), %xmm0 addps %xmm3, %xmm9 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 movaps -20 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 pshufd $0x55, %xmm2, %xmm3 movsd -28 * SIZE(BO), %xmm2 mulps %xmm0, %xmm3 movaps -16 * SIZE(AO), %xmm0 addps %xmm3, %xmm9 subq $-16 * SIZE, AO subq $ -4 * SIZE, BO subq $1, %rax BRANCH jg .L102 ALIGN_3 .L105: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH je .L108 ALIGN_3 .L106: pshufd $0x00, %xmm2, %xmm3 movss -31 * SIZE(BO), %xmm2 mulps %xmm0, %xmm3 movaps -28 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 addq $4 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L106 ALIGN_3 .L108: movddup ALPHA, %xmm7 addps %xmm9, %xmm8 mulps %xmm7, %xmm8 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 addps %xmm0, %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 ALIGN_4 .L110: testq $2, M BRANCH jle .L120 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 movsd -32 * SIZE(BO), %xmm2 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L115 ALIGN_3 .L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 movsd -30 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 pshufd $0x55, %xmm2, %xmm3 movsd -30 * SIZE(BO), %xmm2 mulps %xmm0, %xmm3 movsd -28 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 movsd -26 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 pshufd $0x55, %xmm2, %xmm3 movsd -28 * SIZE(BO), %xmm2 mulps %xmm0, %xmm3 movsd -24 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 subq $-8 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L112 ALIGN_3 .L115: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH je .L118 ALIGN_3 .L116: pshufd $0x00, %xmm2, %xmm3 movss -31 * SIZE(BO), %xmm2 mulps %xmm0, %xmm3 movsd -30 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 addq $2 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L116 ALIGN_3 .L118: movddup ALPHA, %xmm2 mulps %xmm2, %xmm8 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 addps %xmm0, %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq 
(AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 ALIGN_4 .L120: testq $1, M BRANCH jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif movss -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movss -32 * SIZE(BO), %xmm2 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L125 ALIGN_3 .L122: mulss %xmm0, %xmm2 movss -31 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 movss -31 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -30 * SIZE(AO), %xmm0 addss %xmm2, %xmm9 movss -30 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -29 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 movss -29 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -28 * SIZE(AO), %xmm0 addss %xmm2, %xmm9 movss -28 * SIZE(BO), %xmm2 subq $-4 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L122 ALIGN_3 .L125: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH je .L128 ALIGN_3 .L126: mulss %xmm0, %xmm2 movss -31 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 movss -31 * SIZE(BO), %xmm2 addq $1 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L126 ALIGN_3 .L128: movss ALPHA, %xmm2 addss %xmm9, %xmm8 mulss %xmm2, %xmm8 #ifndef TRMMKERNEL addss 0 * SIZE(CO1), %xmm8 #endif movss %xmm8, 0 * SIZE(CO1) ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_kernel_8x4_sse.S000066400000000000000000002137651313527062700213660ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define ALPHA 0(%rsp) #define J 16(%rsp) #define OFFSET 24(%rsp) #define KK 32(%rsp) #define KKK 40(%rsp) #define BUFFER 256(%rsp) #ifdef OPTERON #define movsd movlps #endif #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 9 + 8) #endif #if defined(GENERIC) || defined(NANO) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 5 + 8) #endif #define RPREFETCHSIZE (8 * 7 + 4) #define WPREFETCHSIZE (8 * 8 + 4) #ifndef GENERIC #define KERNEL1(xx) \ mulps %xmm0, %xmm1 ;\ addps %xmm1, %xmm8 ;\ movaps -32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm0, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps -28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm0, %xmm5 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ addps %xmm5, %xmm10 ;\ movaps -24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addps %xmm0, %xmm11 ;\ movaps -16 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 #define KERNEL2(xx) \ mulps %xmm2, %xmm1 ;\ addps %xmm1, %xmm12 ;\ movaps 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm2, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm2, %xmm5 ;\ mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ addps %xmm5, %xmm14 ;\ movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addps %xmm2, %xmm15 ;\ movaps -12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 #define KERNEL3(xx) \ mulps %xmm4, %xmm7 ;\ addps %xmm7, %xmm8 ;\ movaps -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulps %xmm4, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm4, %xmm5 ;\ mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ addps %xmm5, %xmm10 ;\ movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addps %xmm4, %xmm11 ;\ movaps -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 
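/* Annotation (added note, inferred from the macro bodies around it, not from any
   external documentation): KERNEL1..KERNEL8 each perform one rank-1 update step
   of the 8x4 block of C that this kernel keeps entirely in registers.  The
   accumulators are %xmm8-%xmm15; four packed A values at a time stream through
   %xmm0/%xmm2/%xmm4/%xmm6, and the B values - already replicated four times each
   into BUFFER by the copy loops at .L02/.L04 further down - are read through
   %xmm1/%xmm3/%xmm5/%xmm7 or directly as mulps memory operands.  The (xx)
   argument only shifts the displacements, so the main loop can chain many
   invocations without adjusting AO/BO in between; in this non-GENERIC variant
   the running index %rax is folded into the (AO, %rax, 4) and (BO, %rax, 8)
   addressing modes instead of being added to the pointers. */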
#define KERNEL4(xx) \ mulps %xmm6, %xmm7 ;\ addps %xmm7, %xmm12 ;\ movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulps %xmm6, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm6, %xmm5 ;\ mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ addps %xmm5, %xmm14 ;\ movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ addps %xmm6, %xmm15 ;\ movaps -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 #define KERNEL5(xx) \ mulps %xmm0, %xmm1 ;\ addps %xmm1, %xmm8 ;\ movaps 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm0, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm0, %xmm5 ;\ mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ addps %xmm5, %xmm10 ;\ movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addps %xmm0, %xmm11 ;\ movaps 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 #define KERNEL6(xx) \ mulps %xmm2, %xmm1 ;\ addps %xmm1, %xmm12 ;\ movaps 32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm2, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm2, %xmm5 ;\ mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ addps %xmm5, %xmm14 ;\ movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addps %xmm2, %xmm15 ;\ movaps 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 #define KERNEL7(xx) \ mulps %xmm4, %xmm7 ;\ addps %xmm7, %xmm8 ;\ movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulps %xmm4, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm4, %xmm5 ;\ mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ addps %xmm5, %xmm10 ;\ movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addps %xmm4, %xmm11 ;\ movaps 8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 #define KERNEL8(xx) \ mulps %xmm6, %xmm7 ;\ addps %xmm7, %xmm12 ;\ movaps 48 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulps %xmm6, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps 36 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm6, %xmm5 ;\ mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ addps %xmm5, %xmm14 ;\ movaps 40 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addps %xmm6, %xmm15 ;\ movaps 12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 #else #define KERNEL1(xx) \ mulps %xmm0, %xmm1 ;\ addps %xmm1, %xmm8 ;\ movaps -32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulps %xmm0, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps -28 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm0, %xmm5 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ addps %xmm5, %xmm10 ;\ movaps -24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm0, %xmm11 ;\ movaps -16 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 #define KERNEL2(xx) \ mulps %xmm2, %xmm1 ;\ addps %xmm1, %xmm12 ;\ movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulps %xmm2, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm2, %xmm5 ;\ mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ addps %xmm5, %xmm14 ;\ movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm2, %xmm15 ;\ movaps -12 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 ;\ #define KERNEL3(xx) \ mulps %xmm4, %xmm7 ;\ addps %xmm7, %xmm8 ;\ movaps -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulps %xmm4, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm4, %xmm5 ;\ mulps -4 * SIZE + 
2 * (xx) * SIZE(BO), %xmm4 ;\ addps %xmm5, %xmm10 ;\ movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm4, %xmm11 ;\ movaps -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 #define KERNEL4(xx) \ mulps %xmm6, %xmm7 ;\ addps %xmm7, %xmm12 ;\ movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulps %xmm6, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm6, %xmm5 ;\ mulps -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ addps %xmm5, %xmm14 ;\ movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm6, %xmm15 ;\ movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 #define KERNEL5(xx) \ mulps %xmm0, %xmm1 ;\ PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO) ;\ addps %xmm1, %xmm8 ;\ movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulps %xmm0, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm0, %xmm5 ;\ mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ addps %xmm5, %xmm10 ;\ movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm0, %xmm11 ;\ movaps 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 #define KERNEL6(xx) \ mulps %xmm2, %xmm1 ;\ addps %xmm1, %xmm12 ;\ movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulps %xmm2, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm2, %xmm5 ;\ mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ addps %xmm5, %xmm14 ;\ movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm2, %xmm15 ;\ movaps 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 #define KERNEL7(xx) \ mulps %xmm4, %xmm7 ;\ addps %xmm7, %xmm8 ;\ movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulps %xmm4, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm4, %xmm5 ;\ mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ addps %xmm5, %xmm10 ;\ movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm4, %xmm11 ;\ movaps 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 #define KERNEL8(xx) \ mulps %xmm6, %xmm7 ;\ addps %xmm7, %xmm12 ;\ movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulps %xmm6, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm6, %xmm5 ;\ mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ addps %xmm5, %xmm14 ;\ movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm6, %xmm15 ;\ movaps 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm4 #endif movaps %xmm3, %xmm0 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm4 #endif #endif EMMS movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq OLD_M, M movq OLD_N, N shufps $0, %xmm0, %xmm0 movaps %xmm0, ALPHA #ifdef TRMMKERNEL movsd %xmm4, OFFSET movsd %xmm4, KK #ifndef LEFT negq KK #endif #endif subq $-32 * SIZE, A leaq (, LDC, SIZE), LDC movq N, J sarq $2, J # j = (n >> 2) jle .L50 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movd 
0 * SIZE(B), %mm0 movq K, %rax sarq $2, %rax jle .L03 addq %rax, %rax ALIGN_4 .L02: PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) movd 1 * SIZE(B), %mm1 movd 2 * SIZE(B), %mm2 movd 3 * SIZE(B), %mm3 movd 4 * SIZE(B), %mm4 movd 5 * SIZE(B), %mm5 movd 6 * SIZE(B), %mm6 movd 7 * SIZE(B), %mm7 PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO) punpckldq %mm0, %mm0 movq %mm0, 0 * SIZE(BO) movq %mm0, 2 * SIZE(BO) punpckldq %mm1, %mm1 movd 8 * SIZE(B), %mm0 movq %mm1, 4 * SIZE(BO) movq %mm1, 6 * SIZE(BO) punpckldq %mm2, %mm2 movq %mm2, 8 * SIZE(BO) movq %mm2, 10 * SIZE(BO) punpckldq %mm3, %mm3 movq %mm3, 12 * SIZE(BO) movq %mm3, 14 * SIZE(BO) PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) punpckldq %mm4, %mm4 movq %mm4, 16 * SIZE(BO) movq %mm4, 18 * SIZE(BO) punpckldq %mm5, %mm5 movq %mm5, 20 * SIZE(BO) movq %mm5, 22 * SIZE(BO) punpckldq %mm6, %mm6 movq %mm6, 24 * SIZE(BO) movq %mm6, 26 * SIZE(BO) punpckldq %mm7, %mm7 movq %mm7, 28 * SIZE(BO) movq %mm7, 30 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L02 ALIGN_4 .L03: movq K, %rax andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movd 0 * SIZE(B), %mm0 movd 1 * SIZE(B), %mm1 movd 2 * SIZE(B), %mm2 movd 3 * SIZE(B), %mm3 punpckldq %mm0, %mm0 punpckldq %mm1, %mm1 punpckldq %mm2, %mm2 punpckldq %mm3, %mm3 movq %mm0, 0 * SIZE(BO) movq %mm0, 2 * SIZE(BO) movq %mm1, 4 * SIZE(BO) movq %mm1, 6 * SIZE(BO) movq %mm2, 8 * SIZE(BO) movq %mm2, 10 * SIZE(BO) movq %mm3, 12 * SIZE(BO) movq %mm3, 14 * SIZE(BO) addq $ 4 * SIZE, B addq $16 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L10: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a leaq (RPREFETCHSIZE + 0) * SIZE(B), BB movq M, I sarq $3, I # i = (m >> 3) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 movaps -28 * SIZE(AO), %xmm2 movaps -28 * SIZE(BO), %xmm3 xorps %xmm9, %xmm9 movaps -24 * SIZE(AO), %xmm4 movaps -24 * SIZE(BO), %xmm5 xorps %xmm10, %xmm10 movaps -20 * SIZE(AO), %xmm6 movaps -16 * SIZE(BO), %xmm7 xorps %xmm11, %xmm11 PREFETCHW 7 * SIZE(CO1) xorps %xmm12, %xmm12 PREFETCHW 15 * SIZE(CO2) xorps %xmm13, %xmm13 PREFETCHW 7 * SIZE(CO1, LDC, 2) xorps %xmm14, %xmm14 PREFETCHW 15 * SIZE(CO2, LDC, 2) xorps %xmm15, %xmm15 PREFETCH -32 * SIZE(BB) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $4, %rax #endif movq %rax, KKK #endif #ifndef GENERIC andq $-8, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO negq %rax NOBRANCH je .L15 ALIGN_3 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) 
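	/* Annotation (added note, describing the surrounding unrolled loop as read
	   from the code): each KERNEL1..KERNEL8 chain covers four k iterations -
	   KERNEL1/KERNEL2, KERNEL3/KERNEL4, etc. pair the low and high halves of the
	   8-element A column for one k step - and two chains (xx = 16*0 and 16*2)
	   run before "addq $16 * SIZE, %rax", so each advance of %rax accounts for
	   eight k steps, with the "je .L15" tests allowing an early exit between
	   chains. */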
KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax BRANCH jl .L12 ALIGN_3 .L15: PREFETCH -16 * SIZE(BB) subq $-16 * SIZE, BB #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif testq $4, %rax je .L16 xorq %rax, %rax ALIGN_3 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addq $64 * SIZE, BO addq $32 * SIZE, AO ALIGN_3 #else sarq $2, %rax NOBRANCH jle .L16 ALIGN_3 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addq $ 64 * SIZE, BO subq $-32 * SIZE, AO decq %rax BRANCH jg .L12 #endif .L16: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L18 leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO negq %rax ALIGN_4 .L17: mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -28 * SIZE(BO, %rax, 8), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm9 movaps -24 * SIZE(BO, %rax, 8), %xmm1 mulps %xmm0, %xmm1 mulps -20 * SIZE(BO, %rax, 8), %xmm0 addps %xmm1, %xmm10 movaps -32 * SIZE(BO, %rax, 8), %xmm1 addps %xmm0, %xmm11 movaps -24 * SIZE(AO, %rax, 4), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm12 movaps -28 * SIZE(BO, %rax, 8), %xmm1 mulps %xmm2, %xmm1 addps %xmm1, %xmm13 movaps -24 * SIZE(BO, %rax, 8), %xmm1 mulps %xmm2, %xmm1 mulps -20 * SIZE(BO, %rax, 8), %xmm2 addps %xmm1, %xmm14 movaps -16 * SIZE(BO, %rax, 8), %xmm1 addps %xmm2, %xmm15 movaps -20 * SIZE(AO, %rax, 4), %xmm2 addq $SIZE * 2, %rax jl .L17 ALIGN_4 .L18: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 movsd 0 * SIZE(CO2), %xmm2 movhps 2 * SIZE(CO2), %xmm2 movsd 4 * SIZE(CO2), %xmm3 movhps 6 * SIZE(CO2), %xmm3 #endif mulps %xmm7, %xmm8 mulps %xmm7, %xmm9 mulps %xmm7, 
%xmm10 mulps %xmm7, %xmm11 mulps %xmm7, %xmm12 mulps %xmm7, %xmm13 mulps %xmm7, %xmm14 mulps %xmm7, %xmm15 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1, LDC, 2), %xmm4 movhps 2 * SIZE(CO1, LDC, 2), %xmm4 movsd 4 * SIZE(CO1, LDC, 2), %xmm5 movhps 6 * SIZE(CO1, LDC, 2), %xmm5 movsd 0 * SIZE(CO2, LDC, 2), %xmm6 movhps 2 * SIZE(CO2, LDC, 2), %xmm6 movsd 4 * SIZE(CO2, LDC, 2), %xmm7 movhps 6 * SIZE(CO2, LDC, 2), %xmm7 addps %xmm0, %xmm8 addps %xmm1, %xmm12 addps %xmm2, %xmm9 addps %xmm3, %xmm13 #endif movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) movlps %xmm9, 0 * SIZE(CO2) movhps %xmm9, 2 * SIZE(CO2) movlps %xmm13, 4 * SIZE(CO2) movhps %xmm13, 6 * SIZE(CO2) #ifndef TRMMKERNEL addps %xmm4, %xmm10 addps %xmm5, %xmm14 addps %xmm6, %xmm11 addps %xmm7, %xmm15 #endif movlps %xmm10, 0 * SIZE(CO1, LDC, 2) movhps %xmm10, 2 * SIZE(CO1, LDC, 2) movlps %xmm14, 4 * SIZE(CO1, LDC, 2) movhps %xmm14, 6 * SIZE(CO1, LDC, 2) movlps %xmm11, 0 * SIZE(CO2, LDC, 2) movhps %xmm11, 2 * SIZE(CO2, LDC, 2) movlps %xmm15, 4 * SIZE(CO2, LDC, 2) movhps %xmm15, 6 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 ALIGN_4 .L20: testq $4, M je .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps -28 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps -24 * SIZE(AO), %xmm8 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 mulps 44 * SIZE(BO), %xmm8 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm8, %xmm3 movaps -20 * SIZE(AO), %xmm8 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 mulps 60 * SIZE(BO), %xmm8 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm8, 
%xmm3 movaps 0 * SIZE(AO), %xmm8 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movaps 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movaps 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 mulps 76 * SIZE(BO), %xmm10 addps %xmm9, %xmm2 movaps 128 * SIZE(BO), %xmm9 addps %xmm10, %xmm3 movaps -12 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movaps 84 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movaps 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 mulps 92 * SIZE(BO), %xmm10 addps %xmm11, %xmm2 movaps 144 * SIZE(BO), %xmm11 addps %xmm10, %xmm3 movaps -8 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movaps 104 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 mulps 108 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 160 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps -4 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movaps 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 mulps 124 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 176 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 16 * SIZE(AO), %xmm10 addq $ 32 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 16 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps -28 * SIZE(AO), %xmm8 addq $ 4 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L26 ALIGN_4 .L28: mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 mulps %xmm15, %xmm2 mulps %xmm15, %xmm3 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 0 * SIZE(CO2), %xmm10 movhps 2 * SIZE(CO2), %xmm10 movsd 0 * SIZE(CO1, LDC, 2), %xmm12 movhps 2 * SIZE(CO1, LDC, 2), %xmm12 movsd 0 * SIZE(CO2, LDC, 2), %xmm14 movhps 2 * SIZE(CO2, LDC, 2), %xmm14 addps %xmm8, %xmm0 addps %xmm10, %xmm1 addps %xmm12, %xmm2 addps %xmm14, %xmm3 #endif movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO2) movhps %xmm1, 2 * SIZE(CO2) movlps %xmm2, 0 * SIZE(CO1, LDC, 2) movhps %xmm2, 2 * SIZE(CO1, LDC, 2) movlps %xmm3, 0 * SIZE(CO2, LDC, 2) movhps %xmm3, 2 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L30: testq $2, M je .L40 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L35 ALIGN_4 .L32: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movaps 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd -28 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movaps 80 * SIZE(BO), %xmm11 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm2 movaps 44 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 movsd -26 * SIZE(AO), %xmm8 addps %xmm13, %xmm3 movaps 96 * SIZE(BO), %xmm13 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm2 movaps 60 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 movsd -16 * SIZE(AO), %xmm8 addps %xmm15, %xmm3 movaps 112 * SIZE(BO), %xmm15 mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movaps 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movaps 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm2 movaps 76 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movsd -22 * SIZE(AO), %xmm10 addps %xmm9, %xmm3 movaps 128 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movaps 84 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movaps 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm2 movaps 92 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd -20 * SIZE(AO), %xmm10 addps %xmm11, %xmm3 movaps 144 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movaps 104 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movaps 108 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd -18 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movaps 160 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movaps 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movaps 124 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd -8 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movaps 176 * SIZE(BO), %xmm15 addq $ 16 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 16 * SIZE(BO), %xmm9 addq $ 2 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L36 ALIGN_4 .L38: mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 mulps %xmm15, %xmm2 mulps %xmm15, %xmm3 #ifndef TRMMKERNEL #ifdef movsd xorps %xmm8, %xmm8 #endif movsd 0 * 
SIZE(CO1), %xmm8 #ifdef movsd xorps %xmm10, %xmm10 #endif movsd 0 * SIZE(CO2), %xmm10 #ifdef movsd xorps %xmm12, %xmm12 #endif movsd 0 * SIZE(CO1, LDC, 2), %xmm12 #ifdef movsd xorps %xmm14, %xmm14 #endif movsd 0 * SIZE(CO2, LDC, 2), %xmm14 addps %xmm8, %xmm0 addps %xmm10, %xmm1 addps %xmm12, %xmm2 addps %xmm14, %xmm3 #endif movlps %xmm0, 0 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO2) movlps %xmm2, 0 * SIZE(CO1, LDC, 2) movlps %xmm3, 0 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 ALIGN_4 .L40: testq $1, M je .L49 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO leaq (BO, %rax, 8), BO #endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 movss 32 * SIZE(BO), %xmm13 movss 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L45 ALIGN_4 .L42: mulss %xmm8, %xmm9 addss %xmm9, %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 4 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 addss %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 addss %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addss %xmm9, %xmm3 movss 64 * SIZE(BO), %xmm9 mulss %xmm8, %xmm11 addss %xmm11, %xmm0 movss 20 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 addss %xmm11, %xmm1 movss 24 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 addss %xmm11, %xmm2 movss 28 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 movss -30 * SIZE(AO), %xmm8 addss %xmm11, %xmm3 movss 80 * SIZE(BO), %xmm11 mulss %xmm8, %xmm13 addss %xmm13, %xmm0 movss 36 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 addss %xmm13, %xmm1 movss 40 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 addss %xmm13, %xmm2 movss 44 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 movss -29 * SIZE(AO), %xmm8 addss %xmm13, %xmm3 movss 96 * SIZE(BO), %xmm13 mulss %xmm8, %xmm15 addss %xmm15, %xmm0 movss 52 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 addss %xmm15, %xmm1 movss 56 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 addss %xmm15, %xmm2 movss 60 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 movss -24 * SIZE(AO), %xmm8 addss %xmm15, %xmm3 movss 112 * SIZE(BO), %xmm15 mulss %xmm10, %xmm9 addss %xmm9, %xmm0 movss 68 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 addss %xmm9, %xmm1 movss 72 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 addss %xmm9, %xmm2 movss 76 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 movss -27 * SIZE(AO), %xmm10 addss %xmm9, %xmm3 movss 128 * SIZE(BO), %xmm9 mulss %xmm10, %xmm11 addss %xmm11, %xmm0 movss 84 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 addss %xmm11, %xmm1 movss 88 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 addss %xmm11, %xmm2 movss 92 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 movss -26 * SIZE(AO), %xmm10 addss %xmm11, %xmm3 movss 
144 * SIZE(BO), %xmm11 mulss %xmm10, %xmm13 addss %xmm13, %xmm0 movss 100 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 addss %xmm13, %xmm1 movss 104 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 addss %xmm13, %xmm2 movss 108 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 movss -25 * SIZE(AO), %xmm10 addss %xmm13, %xmm3 movss 160 * SIZE(BO), %xmm13 mulss %xmm10, %xmm15 addss %xmm15, %xmm0 movss 116 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 addss %xmm15, %xmm1 movss 120 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 addss %xmm15, %xmm2 movss 124 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 movss -20 * SIZE(AO), %xmm10 addss %xmm15, %xmm3 movss 176 * SIZE(BO), %xmm15 addq $ 8 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L48 ALIGN_4 .L46: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movss 16 * SIZE(BO), %xmm9 addq $ 1 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L46 ALIGN_4 .L48: mulss %xmm15, %xmm0 mulss %xmm15, %xmm1 mulss %xmm15, %xmm2 mulss %xmm15, %xmm3 #ifndef TRMMKERNEL movss 0 * SIZE(CO1), %xmm8 movss 0 * SIZE(CO2), %xmm10 movss 0 * SIZE(CO1, LDC, 2), %xmm12 movss 0 * SIZE(CO2, LDC, 2), %xmm14 addss %xmm8, %xmm0 addss %xmm10, %xmm1 addss %xmm12, %xmm2 addss %xmm14, %xmm3 #endif movss %xmm0, 0 * SIZE(CO1) movss %xmm1, 0 * SIZE(CO2) movss %xmm2, 0 * SIZE(CO1, LDC, 2) movss %xmm3, 0 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leaq (C, LDC, 4), C # c += 4 * ldc decq J # j -- jg .L01 .L50: testq $2, N je .L100 .L51: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $2, %rax jle .L53 ALIGN_4 .L52: #if defined(PENTIUM4) || defined(GENERIC) movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 movss 2 * SIZE(B), %xmm2 movss 3 * SIZE(B), %xmm3 movss 4 * SIZE(B), %xmm4 movss 5 * SIZE(B), %xmm5 movss 6 * SIZE(B), %xmm6 movss 7 * SIZE(B), %xmm7 PREFETCH 32 * SIZE(B) shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 shufps $0, %xmm4, %xmm4 shufps $0, %xmm5, %xmm5 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO #endif #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH 32 * SIZE(B) movd 0 * SIZE(B), %mm0 movd 1 * SIZE(B), %mm1 movd 2 * SIZE(B), %mm2 movd 3 * SIZE(B), %mm3 movd 4 * SIZE(B), %mm4 movd 5 * SIZE(B), %mm5 movd 6 * SIZE(B), %mm6 movd 7 * SIZE(B), %mm7 punpckldq %mm0, %mm0 punpckldq %mm1, %mm1 punpckldq %mm2, %mm2 punpckldq %mm3, %mm3 punpckldq %mm4, %mm4 punpckldq %mm5, %mm5 punpckldq %mm6, %mm6 punpckldq %mm7, %mm7 movq %mm0, 0 * SIZE(BO) movq %mm0, 2 * SIZE(BO) movq %mm1, 4 * 
SIZE(BO) movq %mm1, 6 * SIZE(BO) movq %mm2, 8 * SIZE(BO) movq %mm2, 10 * SIZE(BO) movq %mm3, 12 * SIZE(BO) movq %mm3, 14 * SIZE(BO) movq %mm4, 16 * SIZE(BO) movq %mm4, 18 * SIZE(BO) movq %mm5, 20 * SIZE(BO) movq %mm5, 22 * SIZE(BO) movq %mm6, 24 * SIZE(BO) movq %mm6, 26 * SIZE(BO) movq %mm7, 28 * SIZE(BO) movq %mm7, 30 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO #endif decq %rax jne .L52 ALIGN_4 .L53: movq K, %rax andq $3, %rax BRANCH jle .L60 ALIGN_4 .L54: #if defined(PENTIUM4) || defined(GENERIC) movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) #endif #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) movd 0 * SIZE(B), %mm0 movd 1 * SIZE(B), %mm1 punpckldq %mm0, %mm0 punpckldq %mm1, %mm1 movq %mm0, 0 * SIZE(BO) movq %mm0, 2 * SIZE(BO) movq %mm1, 4 * SIZE(BO) movq %mm1, 6 * SIZE(BO) #endif addq $ 2 * SIZE, B addq $ 8 * SIZE, BO decq %rax jne .L54 ALIGN_4 .L60: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a movq M, I sarq $3, I # i = (m >> 3) jle .L70 ALIGN_4 .L61: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(AO), %xmm12 movaps 16 * SIZE(AO), %xmm14 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 PREFETCHW 7 * SIZE(CO1) xorps %xmm4, %xmm4 PREFETCHW 7 * SIZE(CO2) xorps %xmm5, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulps %xmm8, %xmm9 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -28 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps -24 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -20 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 32 * SIZE(AO), %xmm8 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 16 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps -12 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps -8 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps -4 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 80 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps 48 * SIZE(AO), %xmm10 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) #endif 
mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 32 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 4 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm5 movaps 8 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 12 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 movaps 96 * SIZE(BO), %xmm13 addps %xmm12, %xmm5 movaps 64 * SIZE(AO), %xmm12 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 48 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 20 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 24 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 28 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 movaps 112 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 80 * SIZE(AO), %xmm14 addq $64 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -28 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps -24 * SIZE(AO), %xmm8 addq $8 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L68: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 4 * SIZE(CO1), %xmm9 movhps 6 * SIZE(CO1), %xmm9 movsd 0 * SIZE(CO2), %xmm10 movhps 2 * SIZE(CO2), %xmm10 movsd 4 * SIZE(CO2), %xmm11 movhps 6 * SIZE(CO2), %xmm11 #endif mulps %xmm15, %xmm0 mulps %xmm15, %xmm4 mulps %xmm15, %xmm1 mulps %xmm15, %xmm5 #ifndef TRMMKERNEL addps %xmm8, %xmm0 addps %xmm9, %xmm4 addps %xmm10, %xmm1 addps %xmm11, %xmm5 #endif movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm4, 4 * SIZE(CO1) movhps %xmm4, 6 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO2) movhps %xmm1, 2 * SIZE(CO2) movlps %xmm5, 4 * SIZE(CO2) movhps %xmm5, 6 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L61 ALIGN_4 .L70: testq $4, M je .L80 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) 
|| (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulps %xmm8, %xmm9 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -28 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps -24 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 20 * SIZE(BO), %xmm8 addps %xmm11, %xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm8, %xmm1 movaps -20 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 mulps %xmm10, %xmm13 mulps 36 * SIZE(BO), %xmm10 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm10, %xmm1 movaps -12 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 mulps 44 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps -8 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 52 * SIZE(BO), %xmm10 addps %xmm15, %xmm0 movaps 56 * SIZE(BO), %xmm15 addps %xmm10, %xmm1 movaps -4 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 60 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 16 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -28 * SIZE(AO), %xmm8 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L76 ALIGN_4 .L78: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 0 * SIZE(CO2), %xmm10 movhps 2 * SIZE(CO2), %xmm10 #endif addps %xmm2, %xmm0 addps %xmm3, %xmm1 mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 #ifndef TRMMKERNEL addps %xmm8, %xmm0 addps %xmm10, %xmm1 #endif movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO2) movhps %xmm1, 2 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L80: testq $2, M je .L90 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L85 ALIGN_4 .L82: mulps %xmm8, %xmm9 
addps %xmm9, %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -28 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd -26 * SIZE(AO), %xmm8 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movaps 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd -16 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movaps 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd -22 * SIZE(AO), %xmm10 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movaps 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd -20 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movaps 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd -18 * SIZE(AO), %xmm10 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movaps 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd -8 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movaps 112 * SIZE(BO), %xmm15 addq $16 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L82 ALIGN_4 .L85: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L88 ALIGN_4 .L86: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L86 ALIGN_4 .L88: #ifndef TRMMKERNEL #ifdef movsd xorps %xmm8, %xmm8 #endif movsd 0 * SIZE(CO1), %xmm8 #ifdef movsd xorps %xmm10, %xmm10 #endif movsd 0 * SIZE(CO2), %xmm10 #endif addps %xmm2, %xmm0 addps %xmm3, %xmm1 mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 #ifndef TRMMKERNEL addps %xmm8, %xmm0 addps %xmm10, %xmm1 #endif movlps %xmm0, 0 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 ALIGN_4 .L90: testq $1, M je .L99 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 movss 32 * SIZE(BO), %xmm13 movss 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L95 ALIGN_4 .L92: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movss 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movss 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movss -29 * SIZE(AO), %xmm8 addps %xmm11, %xmm1 movss 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movss 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movss -24 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movss 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movss 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movss -27 * SIZE(AO), %xmm10 addps %xmm13, %xmm1 movss 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movss 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movss -26 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movss 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movss 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movss -25 * SIZE(AO), %xmm10 addps %xmm15, %xmm1 movss 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movss 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movss -20 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movss 112 * SIZE(BO), %xmm15 addq $ 8 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L92 ALIGN_4 .L95: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L96 ALIGN_4 .L98: #ifndef TRMMKERNEL movss 0 * SIZE(CO1), %xmm8 movss 0 * SIZE(CO2), %xmm10 #endif addss %xmm2, %xmm0 addss %xmm3, %xmm1 mulss %xmm15, %xmm0 mulss %xmm15, %xmm1 #ifndef TRMMKERNEL addss %xmm8, %xmm0 addss %xmm10, %xmm1 #endif movss %xmm0, 0 * SIZE(CO1) movss %xmm1, 0 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leaq (C, LDC, 2), C # c += 4 * ldc ALIGN_4 .L100: testq $1, N je .L999 .L101: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $3, %rax jle .L103 ALIGN_4 .L102: #if defined(PENTIUM4) || defined(GENERIC) movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 movss 2 * SIZE(B), %xmm2 movss 3 * SIZE(B), %xmm3 movss 4 * SIZE(B), %xmm4 movss 5 * SIZE(B), %xmm5 movss 6 * SIZE(B), %xmm6 movss 7 * SIZE(B), %xmm7 PREFETCH 32 * SIZE(B) shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 shufps $0, %xmm4, %xmm4 shufps $0, %xmm5, %xmm5 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO #endif #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH 32 * SIZE(B) movd 0 * SIZE(B), %mm0 movd 1 * SIZE(B), %mm1 movd 2 * 
SIZE(B), %mm2 movd 3 * SIZE(B), %mm3 movd 4 * SIZE(B), %mm4 movd 5 * SIZE(B), %mm5 movd 6 * SIZE(B), %mm6 movd 7 * SIZE(B), %mm7 punpckldq %mm0, %mm0 punpckldq %mm1, %mm1 punpckldq %mm2, %mm2 punpckldq %mm3, %mm3 punpckldq %mm4, %mm4 punpckldq %mm5, %mm5 punpckldq %mm6, %mm6 punpckldq %mm7, %mm7 movq %mm0, 0 * SIZE(BO) movq %mm0, 2 * SIZE(BO) movq %mm1, 4 * SIZE(BO) movq %mm1, 6 * SIZE(BO) movq %mm2, 8 * SIZE(BO) movq %mm2, 10 * SIZE(BO) movq %mm3, 12 * SIZE(BO) movq %mm3, 14 * SIZE(BO) movq %mm4, 16 * SIZE(BO) movq %mm4, 18 * SIZE(BO) movq %mm5, 20 * SIZE(BO) movq %mm5, 22 * SIZE(BO) movq %mm6, 24 * SIZE(BO) movq %mm6, 26 * SIZE(BO) movq %mm7, 28 * SIZE(BO) movq %mm7, 30 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO #endif decq %rax jne .L102 ALIGN_4 .L103: movq K, %rax andq $7, %rax BRANCH jle .L110 ALIGN_4 .L104: #if defined(PENTIUM4) || defined(GENERIC) movss 0 * SIZE(B), %xmm0 shufps $0, %xmm0, %xmm0 movaps %xmm0, 0 * SIZE(BO) #endif #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) movd 0 * SIZE(B), %mm0 punpckldq %mm0, %mm0 movq %mm0, 0 * SIZE(BO) movq %mm0, 2 * SIZE(BO) #endif addq $ 1 * SIZE, B addq $ 4 * SIZE, BO decq %rax jne .L104 ALIGN_4 .L110: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a movq M, I sarq $3, I # i = (m >> 3) jle .L120 ALIGN_4 .L111: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(AO), %xmm12 movaps 16 * SIZE(AO), %xmm14 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 PREFETCHW 7 * SIZE(CO1) xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L115 ALIGN_4 .L112: mulps %xmm9, %xmm8 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps -28 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps -24 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 mulps %xmm9, %xmm8 mulps -20 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps 32 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm9, %xmm10 mulps -12 * SIZE(AO), %xmm9 addps %xmm10, %xmm0 movaps -8 * SIZE(AO), %xmm10 addps %xmm9, %xmm4 movaps 12 * SIZE(BO), %xmm9 mulps %xmm9, %xmm10 mulps -4 * SIZE(AO), %xmm9 addps %xmm10, %xmm0 movaps 48 * SIZE(AO), %xmm10 addps %xmm9, %xmm4 movaps 32 * SIZE(BO), %xmm9 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm11, %xmm12 mulps 4 * SIZE(AO), %xmm11 addps %xmm12, %xmm0 movaps 8 * SIZE(AO), %xmm12 addps %xmm11, %xmm4 movaps 20 * SIZE(BO), %xmm11 mulps %xmm11, %xmm12 mulps 12 * SIZE(AO), %xmm11 addps %xmm12, %xmm0 movaps 64 * SIZE(AO), %xmm12 addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm11, %xmm14 mulps 20 * SIZE(AO), 
%xmm11 addps %xmm14, %xmm0 movaps 24 * SIZE(AO), %xmm14 addps %xmm11, %xmm4 movaps 28 * SIZE(BO), %xmm11 mulps %xmm11, %xmm14 mulps 28 * SIZE(AO), %xmm11 addps %xmm14, %xmm0 movaps 80 * SIZE(AO), %xmm14 addps %xmm11, %xmm4 movaps 48 * SIZE(BO), %xmm11 addq $64 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L112 ALIGN_4 .L115: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulps %xmm9, %xmm8 mulps -28 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps -24 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 addq $8 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L116 ALIGN_4 .L118: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 4 * SIZE(CO1), %xmm9 movhps 6 * SIZE(CO1), %xmm9 #endif mulps %xmm15, %xmm0 mulps %xmm15, %xmm4 #ifndef TRMMKERNEL addps %xmm8, %xmm0 addps %xmm9, %xmm4 #endif movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm4, 4 * SIZE(CO1) movhps %xmm4, 6 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L111 ALIGN_4 .L120: testq $4, M je .L130 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L125 ALIGN_4 .L122: mulps %xmm8, %xmm9 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps -28 * SIZE(AO), %xmm8 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 32 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -24 * SIZE(AO), %xmm8 mulps 8 * SIZE(BO), %xmm8 addps %xmm8, %xmm2 movaps -20 * SIZE(AO), %xmm8 mulps 12 * SIZE(BO), %xmm8 addps %xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm11 movaps -12 * SIZE(AO), %xmm10 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 48 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps -8 * SIZE(AO), %xmm10 mulps 24 * SIZE(BO), %xmm10 addps %xmm10, %xmm2 movaps -4 * SIZE(AO), %xmm10 mulps 28 * SIZE(BO), %xmm10 addps %xmm10, %xmm3 movaps 16 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L122 ALIGN_4 .L125: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L128 ALIGN_4 .L126: mulps %xmm8, %xmm9 movaps -28 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L126 ALIGN_4 .L128: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), 
%xmm8 #endif addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm2, %xmm0 mulps %xmm15, %xmm0 #ifndef TRMMKERNEL addps %xmm8, %xmm0 #endif movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L130: testq $2, M je .L140 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L135 ALIGN_4 .L132: mulps %xmm8, %xmm9 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -28 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -26 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -16 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 32 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 movsd -22 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd -20 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd -18 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movaps 28 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd -8 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movaps 48 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L132 ALIGN_4 .L135: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L138 ALIGN_4 .L136: mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L136 ALIGN_4 .L138: addps %xmm1, %xmm0 mulps %xmm15, %xmm0 #ifndef TRMMKERNEL #ifdef movsd xorps %xmm8, %xmm8 #endif movsd 0 * SIZE(CO1), %xmm8 addps %xmm8, %xmm0 #endif movlps %xmm0, 0 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 4 ALIGN_4 .L140: testq $1, M je .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 xorps %xmm0, %xmm0 
xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L145 ALIGN_4 .L142: mulss %xmm8, %xmm9 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss -31 * SIZE(AO), %xmm8 mulss 4 * SIZE(BO), %xmm8 addss %xmm9, %xmm0 movss 32 * SIZE(BO), %xmm9 addss %xmm8, %xmm1 movss -30 * SIZE(AO), %xmm8 mulss 8 * SIZE(BO), %xmm8 addss %xmm8, %xmm2 movss -29 * SIZE(AO), %xmm8 mulss 12 * SIZE(BO), %xmm8 addss %xmm8, %xmm3 movss -24 * SIZE(AO), %xmm8 mulss %xmm10, %xmm11 movss -27 * SIZE(AO), %xmm10 mulss 20 * SIZE(BO), %xmm10 addss %xmm11, %xmm0 movss 48 * SIZE(BO), %xmm11 addss %xmm10, %xmm1 movss -26 * SIZE(AO), %xmm10 mulss 24 * SIZE(BO), %xmm10 addss %xmm10, %xmm2 movss -25 * SIZE(AO), %xmm10 mulss 28 * SIZE(BO), %xmm10 addss %xmm10, %xmm3 movss -20 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L142 ALIGN_4 .L145: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movss ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L148 ALIGN_4 .L146: mulss %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addss %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 addq $1 * SIZE, AO addq $4 * SIZE, BO decq %rax jg .L146 ALIGN_4 .L148: addss %xmm1, %xmm0 addss %xmm3, %xmm2 addss %xmm2, %xmm0 mulss %xmm15, %xmm0 #ifndef TRMMKERNEL movss 0 * SIZE(CO1), %xmm8 addss %xmm8, %xmm0 #endif movss %xmm0, 0 * SIZE(CO1) ALIGN_4 .L999: movq %rbx, %rsp EMMS movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_kernel_8x4_sse3.S000066400000000000000000001770561313527062700214530ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %r12 #define BO %r13 #define CO1 %r14 #define CO2 %r15 #define BB %rbp #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define ALPHA 0(%rsp) #define J 16(%rsp) #define OFFSET 24(%rsp) #define KK 32(%rsp) #define KKK 40(%rsp) #define BUFFER 128(%rsp) #define PREFETCH prefetcht0 #define PREFETCHSIZE 320 #define KERNEL1(address) \ mulps %xmm8, %xmm9; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AO); \ addps %xmm9, %xmm0; \ movshdup 0 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm1; \ movsldup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm2; \ movshdup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ movaps 4 * SIZE + (address) * SIZE(AO), %xmm8; \ addps %xmm9, %xmm3; \ movsldup 0 * SIZE + (address) * SIZE(BO), %xmm9 #define KERNEL2(address) \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm4; \ movshdup 0 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm5; \ movsldup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm6; \ movshdup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ movaps 8 * SIZE + (address) * SIZE(AO), %xmm8; \ addps %xmm9, %xmm7; \ movsldup 8 * SIZE + (address) * SIZE(BO), %xmm9 #define KERNEL3(address) \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm0; \ movshdup 8 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm1; \ movsldup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm2; \ movshdup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ movaps 12 * SIZE + (address) * SIZE(AO), %xmm8; \ addps %xmm9, %xmm3; \ movsldup 8 * SIZE + (address) * SIZE(BO), %xmm9 #define KERNEL4(address) \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm4; \ movshdup 8 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm5; \ movsldup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm6; \ movshdup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ movaps 64 * SIZE + (address) * SIZE(AO), %xmm8; \ addps %xmm9, %xmm7; \ movsldup 64 * SIZE + (address) * SIZE(BO), %xmm9 
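/* Descriptive summary of the KERNEL1..KERNEL16 macros (the macro bodies above and below remain authoritative): each macro performs half of one unrolled k-iteration of the 8x4 single-precision micro-kernel. B has been repacked into BUFFER as duplicated pairs (see the movddup copy loop at .L02 below), so movsldup/movshdup of a BUFFER entry yields a 4-wide broadcast of a single B element. An odd-numbered macro multiplies four packed A values (rows 0-3 of the current k, held in %xmm8/%xmm10/%xmm12/%xmm14) by the broadcast B values and accumulates into %xmm0-%xmm3; the following even-numbered macro reuses the same B values against the next four A values (rows 4-7) and accumulates into %xmm4-%xmm7. Sixteen macros therefore cover eight k-iterations, consuming 64*SIZE bytes each of AO and BO, which matches the pointer advances in the .L12 loop. The accumulators are scaled by ALPHA and added into the four C columns (CO1, CO2, CO1+2*LDC, CO2+2*LDC) at .L18. */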
#define KERNEL5(address) \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm0; \ movshdup 16 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm1; \ movsldup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm2; \ movshdup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ movaps 20 * SIZE + (address) * SIZE(AO), %xmm10; \ addps %xmm11, %xmm3; \ movsldup 16 * SIZE + (address) * SIZE(BO), %xmm11 #define KERNEL6(address) \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm4; \ movshdup 16 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm5; \ movsldup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm6; \ movshdup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ movaps 24 * SIZE + (address) * SIZE(AO), %xmm10; \ addps %xmm11, %xmm7; \ movsldup 24 * SIZE + (address) * SIZE(BO), %xmm11 #define KERNEL7(address) \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm0; \ movshdup 24 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm1; \ movsldup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm2; \ movshdup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ movaps 28 * SIZE + (address) * SIZE(AO), %xmm10; \ addps %xmm11, %xmm3; \ movsldup 24 * SIZE + (address) * SIZE(BO), %xmm11 #define KERNEL8(address) \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm4; \ movshdup 24 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm5; \ movsldup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm6; \ movshdup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ movaps 80 * SIZE + (address) * SIZE(AO), %xmm10; \ addps %xmm11, %xmm7; \ movsldup 80 * SIZE + (address) * SIZE(BO), %xmm11 #define KERNEL9(address) \ mulps %xmm12, %xmm13; \ PREFETCH (PREFETCHSIZE + 32) * SIZE + (address) * SIZE(AO); \ addps %xmm13, %xmm0; \ movshdup 32 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm1; \ movsldup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm2; \ movshdup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ movaps 36 * SIZE + (address) * SIZE(AO), %xmm12; \ addps %xmm13, %xmm3; \ movsldup 32 * SIZE + (address) * SIZE(BO), %xmm13 #define KERNEL10(address) \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm4; \ movshdup 32 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm5; \ movsldup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm6; \ movshdup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ movaps 40 * SIZE + (address) * SIZE(AO), %xmm12; \ addps %xmm13, %xmm7; \ movsldup 40 * SIZE + (address) * SIZE(BO), %xmm13 #define KERNEL11(address) \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm0; \ movshdup 40 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm1; \ movsldup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm2; \ movshdup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ movaps 44 * SIZE + (address) * SIZE(AO), %xmm12; \ addps %xmm13, %xmm3; \ movsldup 40 * SIZE + (address) * SIZE(BO), %xmm13 #define KERNEL12(address) \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm4; \ movshdup 40 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ addps %xmm13, 
%xmm5; \ movsldup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm6; \ movshdup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ movaps 96 * SIZE + (address) * SIZE(AO), %xmm12; \ addps %xmm13, %xmm7; \ movsldup 96 * SIZE + (address) * SIZE(BO), %xmm13 #define KERNEL13(address) \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm0; \ movshdup 48 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm1; \ movsldup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm2; \ movshdup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ movaps 52 * SIZE + (address) * SIZE(AO), %xmm14; \ addps %xmm15, %xmm3; \ movsldup 48 * SIZE + (address) * SIZE(BO), %xmm15 #define KERNEL14(address) \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm4; \ movshdup 48 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm5; \ movsldup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm6; \ movshdup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ movaps 56 * SIZE + (address) * SIZE(AO), %xmm14; \ addps %xmm15, %xmm7; \ movsldup 56 * SIZE + (address) * SIZE(BO), %xmm15 #define KERNEL15(address) \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm0; \ movshdup 56 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm1; \ movsldup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm2; \ movshdup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ movaps 60 * SIZE + (address) * SIZE(AO), %xmm14; \ addps %xmm15, %xmm3; \ movsldup 56 * SIZE + (address) * SIZE(BO), %xmm15 #define KERNEL16(address) \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm4; \ movshdup 56 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm5; \ movsldup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm6; \ movshdup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ movaps 112 * SIZE + (address) * SIZE(AO), %xmm14; \ addps %xmm15, %xmm7; \ movsldup 112 * SIZE + (address) * SIZE(BO), %xmm15 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm4 #endif movaps %xmm3, %xmm0 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm4 #endif #endif movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING shufps $0, %xmm0, %xmm0 movaps %xmm0, ALPHA #ifdef TRMMKERNEL movsd %xmm4, OFFSET movsd %xmm4, KK #ifndef LEFT negq KK #endif #endif leaq (, LDC, SIZE), LDC movq N, J sarq $2, J # j = (n >> 2) jle .L50 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $2, %rax jle .L03 ALIGN_4 .L02: movddup 0 * SIZE(B), %xmm0 movddup 2 * SIZE(B), %xmm1 movddup 4 * SIZE(B), %xmm2 movddup 6 * SIZE(B), %xmm3 movddup 8 * SIZE(B), 
%xmm4 movddup 10 * SIZE(B), %xmm5 movddup 12 * SIZE(B), %xmm6 movddup 14 * SIZE(B), %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) prefetcht1 128 * SIZE(BO) prefetcht0 112 * SIZE(B) addq $16 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L02 ALIGN_4 .L03: movq K, %rax andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movddup 0 * SIZE(B), %xmm0 movddup 2 * SIZE(B), %xmm1 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) addq $4 * SIZE, B addq $8 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L10: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a leaq 112 * SIZE(B), BB movq M, I sarq $3, I # i = (m >> 3) jle .L20 ALIGN_4 .L11: prefetcht0 0 * SIZE(BB) prefetcht0 8 * SIZE(BB) subq $-16 * SIZE, BB #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 32 * SIZE(AO), %xmm12 movaps 48 * SIZE(AO), %xmm14 movsldup 0 * SIZE(BO), %xmm9 movsldup 16 * SIZE(BO), %xmm11 movsldup 32 * SIZE(BO), %xmm13 movsldup 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 prefetchnta 8 * SIZE(CO1) pxor %xmm4, %xmm4 prefetchnta 8 * SIZE(CO2) pxor %xmm5, %xmm5 prefetchnta 8 * SIZE(CO1, LDC, 2) pxor %xmm6, %xmm6 prefetchnta 8 * SIZE(CO2, LDC, 2) pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $4, %rax #endif movq %rax, KKK #endif #if 1 andq $-8, %rax salq $4, %rax je .L15 .L1X: KERNEL1 (64 * 0) KERNEL2 (64 * 0) KERNEL3 (64 * 0) KERNEL4 (64 * 0) KERNEL5 (64 * 0) KERNEL6 (64 * 0) KERNEL7 (64 * 0) KERNEL8 (64 * 0) KERNEL9 (64 * 0) KERNEL10(64 * 0) KERNEL11(64 * 0) KERNEL12(64 * 0) KERNEL13(64 * 0) KERNEL14(64 * 0) KERNEL15(64 * 0) KERNEL16(64 * 0) cmpq $128 * 1, %rax NOBRANCH jle .L12 KERNEL1 (64 * 1) KERNEL2 (64 * 1) KERNEL3 (64 * 1) KERNEL4 (64 * 1) KERNEL5 (64 * 1) KERNEL6 (64 * 1) KERNEL7 (64 * 1) KERNEL8 (64 * 1) KERNEL9 (64 * 1) KERNEL10(64 * 1) KERNEL11(64 * 1) KERNEL12(64 * 1) KERNEL13(64 * 1) KERNEL14(64 * 1) KERNEL15(64 * 1) KERNEL16(64 * 1) cmpq $128 * 2, %rax NOBRANCH jle .L12 KERNEL1 (64 * 2) KERNEL2 (64 * 2) KERNEL3 (64 * 2) KERNEL4 (64 * 2) KERNEL5 (64 * 2) KERNEL6 (64 * 2) KERNEL7 (64 * 2) KERNEL8 (64 * 2) KERNEL9 (64 * 2) KERNEL10(64 * 2) KERNEL11(64 * 2) KERNEL12(64 * 2) KERNEL13(64 * 2) KERNEL14(64 * 2) KERNEL15(64 * 2) KERNEL16(64 * 2) cmpq $128 * 3, %rax NOBRANCH jle .L12 KERNEL1 (64 * 3) KERNEL2 (64 * 3) KERNEL3 (64 * 3) KERNEL4 (64 * 3) KERNEL5 (64 * 3) KERNEL6 (64 * 3) KERNEL7 (64 * 3) KERNEL8 (64 * 3) KERNEL9 (64 * 3) KERNEL10(64 * 3) KERNEL11(64 * 3) KERNEL12(64 * 3) KERNEL13(64 * 3) KERNEL14(64 * 3) KERNEL15(64 * 3) KERNEL16(64 * 3) cmpq $128 * 4, %rax NOBRANCH jle .L12 KERNEL1 (64 * 4) KERNEL2 (64 * 4) KERNEL3 (64 * 4) KERNEL4 (64 * 4) KERNEL5 (64 * 4) KERNEL6 (64 * 4) KERNEL7 (64 * 4) KERNEL8 (64 * 4) KERNEL9 (64 * 4) KERNEL10(64 * 4) KERNEL11(64 * 4) KERNEL12(64 * 4) KERNEL13(64 * 4) KERNEL14(64 * 4) KERNEL15(64 * 4) KERNEL16(64 * 4) cmpq $128 * 5, %rax NOBRANCH jle .L12 KERNEL1 
(64 * 5) KERNEL2 (64 * 5) KERNEL3 (64 * 5) KERNEL4 (64 * 5) KERNEL5 (64 * 5) KERNEL6 (64 * 5) KERNEL7 (64 * 5) KERNEL8 (64 * 5) KERNEL9 (64 * 5) KERNEL10(64 * 5) KERNEL11(64 * 5) KERNEL12(64 * 5) KERNEL13(64 * 5) KERNEL14(64 * 5) KERNEL15(64 * 5) KERNEL16(64 * 5) cmpq $128 * 6, %rax NOBRANCH jle .L12 KERNEL1 (64 * 6) KERNEL2 (64 * 6) KERNEL3 (64 * 6) KERNEL4 (64 * 6) KERNEL5 (64 * 6) KERNEL6 (64 * 6) KERNEL7 (64 * 6) KERNEL8 (64 * 6) KERNEL9 (64 * 6) KERNEL10(64 * 6) KERNEL11(64 * 6) KERNEL12(64 * 6) KERNEL13(64 * 6) KERNEL14(64 * 6) KERNEL15(64 * 6) KERNEL16(64 * 6) cmpq $128 * 7, %rax NOBRANCH jle .L12 KERNEL1 (64 * 7) KERNEL2 (64 * 7) KERNEL3 (64 * 7) KERNEL4 (64 * 7) KERNEL5 (64 * 7) KERNEL6 (64 * 7) KERNEL7 (64 * 7) KERNEL8 (64 * 7) KERNEL9 (64 * 7) KERNEL10(64 * 7) KERNEL11(64 * 7) KERNEL12(64 * 7) KERNEL13(64 * 7) KERNEL14(64 * 7) KERNEL15(64 * 7) KERNEL16(64 * 7) addq $64 * 8 * SIZE, AO addq $64 * 8 * SIZE, BO subq $128 * 8, %rax jg .L1X .L12: leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #else sarq $3, %rax je .L15 ALIGN_4 .L12: KERNEL1 (64 * 0) KERNEL2 (64 * 0) KERNEL3 (64 * 0) KERNEL4 (64 * 0) KERNEL5 (64 * 0) KERNEL6 (64 * 0) KERNEL7 (64 * 0) KERNEL8 (64 * 0) KERNEL9 (64 * 0) KERNEL10(64 * 0) KERNEL11(64 * 0) KERNEL12(64 * 0) KERNEL13(64 * 0) KERNEL14(64 * 0) KERNEL15(64 * 0) KERNEL16(64 * 0) addq $64 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L12 #endif ALIGN_4 .L15: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsldup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm4 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm5 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm6 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm7 movsldup 8 * SIZE(BO), %xmm9 addq $8 * SIZE, AO addq $8 * SIZE, BO decq %rax jg .L16 ALIGN_4 .L18: #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 mulps %xmm15, %xmm0 movhps 2 * SIZE(CO1), %xmm8 mulps %xmm15, %xmm1 movsd 4 * SIZE(CO1), %xmm9 mulps %xmm15, %xmm2 movhps 6 * SIZE(CO1), %xmm9 mulps %xmm15, %xmm3 movsd 0 * SIZE(CO2), %xmm10 mulps %xmm15, %xmm4 movhps 2 * SIZE(CO2), %xmm10 mulps %xmm15, %xmm5 movsd 4 * SIZE(CO2), %xmm11 mulps %xmm15, %xmm6 movhps 6 * SIZE(CO2), %xmm11 mulps %xmm15, %xmm7 movsd 0 * SIZE(CO1, LDC, 2), %xmm12 movhps 2 * SIZE(CO1, LDC, 2), %xmm12 movsd 4 * SIZE(CO1, LDC, 2), %xmm13 movhps 6 * SIZE(CO1, LDC, 2), %xmm13 movsd 0 * SIZE(CO2, LDC, 2), %xmm14 movhps 2 * SIZE(CO2, LDC, 2), %xmm14 movsd 4 * SIZE(CO2, LDC, 2), %xmm15 movhps 6 * SIZE(CO2, LDC, 2), %xmm15 addps %xmm8, %xmm0 addps %xmm9, %xmm4 addps %xmm10, %xmm1 addps %xmm11, %xmm5 addps %xmm12, %xmm2 movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) addps %xmm13, %xmm6 movsd %xmm4, 4 * SIZE(CO1) movhps %xmm4, 6 * SIZE(CO1) addps %xmm14, %xmm3 movsd %xmm1, 0 * SIZE(CO2) movhps %xmm1, 2 * SIZE(CO2) addps %xmm15, %xmm7 movsd %xmm5, 4 * SIZE(CO2) movhps %xmm5, 6 * SIZE(CO2) #else mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 mulps %xmm15, %xmm2 mulps %xmm15, %xmm3 mulps %xmm15, %xmm4 mulps %xmm15, %xmm5 mulps %xmm15, %xmm6 mulps %xmm15, %xmm7 movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movsd %xmm4, 4 * SIZE(CO1) movhps %xmm4, 6 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movhps %xmm1, 2 * SIZE(CO2) movsd %xmm5, 4 * SIZE(CO2) movhps %xmm5, 6 * SIZE(CO2) #endif movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movhps %xmm2, 2 * SIZE(CO1, LDC, 2) movsd %xmm6, 4 * SIZE(CO1, LDC, 2) movhps %xmm6, 6 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO2, LDC, 2) movhps %xmm3, 2 * SIZE(CO2, LDC, 2) movsd %xmm7, 4 * SIZE(CO2, LDC, 2) movhps %xmm7, 6 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 ALIGN_4 .L20: testq $4, M je .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movsldup 0 * SIZE(BO), %xmm9 movsldup 16 * SIZE(BO), %xmm11 movsldup 32 * SIZE(BO), %xmm13 movsldup 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulps %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsldup 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movsldup 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movshdup 12 * SIZE(BO), %xmm9 mulps 
%xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsldup 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movshdup 16 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movsldup 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movshdup 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movsldup 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movshdup 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movsldup 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movshdup 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movaps 32 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movsldup 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movshdup 32 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movsldup 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movshdup 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movaps 20 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movsldup 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movshdup 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movsldup 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movshdup 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movaps 24 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movsldup 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movshdup 48 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movsldup 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movshdup 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movaps 28 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movsldup 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movshdup 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movsldup 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movshdup 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movaps 48 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movsldup 112 * SIZE(BO), %xmm15 addq $32 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsldup 8 * SIZE(BO), %xmm9 addq $4 * SIZE, AO addq $8 * SIZE, BO decq %rax jg .L26 ALIGN_4 .L28: #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 0 * SIZE(CO2), %xmm10 movhps 2 * SIZE(CO2), %xmm10 mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 mulps %xmm15, %xmm2 mulps %xmm15, %xmm3 movsd 0 * SIZE(CO1, LDC, 2), %xmm12 movhps 2 * SIZE(CO1, LDC, 2), %xmm12 movsd 0 * SIZE(CO2, LDC, 2), %xmm14 movhps 2 * SIZE(CO2, LDC, 2), %xmm14 addps %xmm8, %xmm0 addps %xmm10, %xmm1 addps %xmm12, %xmm2 addps %xmm14, %xmm3 #else mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 mulps %xmm15, %xmm2 mulps %xmm15, %xmm3 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movhps %xmm1, 2 * SIZE(CO2) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movhps %xmm2, 2 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO2, LDC, 2) movhps %xmm3, 2 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L30: testq $2, M je .L40 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movddup 0 * SIZE(AO), %xmm8 movddup 8 * SIZE(AO), %xmm10 movsd 0 * SIZE(BO), %xmm9 movsd 32 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L35 ALIGN_4 .L32: shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movsd 12 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsd 16 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movsd 20 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 6 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 24 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movsd 28 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 16 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movsd 64 * SIZE(BO), %xmm9 addps %xmm11, %xmm0 movsd 36 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 10 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 40 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm2 movsd 44 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 12 * SIZE(AO), %xmm10 addps %xmm11, %xmm3 movsd 48 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movsd 52 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 14 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 56 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, 
%xmm11 addps %xmm11, %xmm2 movsd 60 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 24 * SIZE(AO), %xmm10 addps %xmm11, %xmm3 movsd 96 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 addq $2 * SIZE, AO addq $8 * SIZE, BO decq %rax jg .L36 ALIGN_4 .L38: #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhps 0 * SIZE(CO2), %xmm8 movsd 0 * SIZE(CO1, LDC, 2), %xmm9 movhps 0 * SIZE(CO2, LDC, 2), %xmm9 #endif addps %xmm2, %xmm0 addps %xmm3, %xmm1 mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addps %xmm8, %xmm0 addps %xmm9, %xmm1 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO2) movsd %xmm1, 0 * SIZE(CO1, LDC, 2) movhps %xmm1, 0 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 ALIGN_4 .L40: testq $1, M je .L49 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 movsd 0 * SIZE(BO), %xmm9 movsd 32 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L45 ALIGN_4 .L42: shufps $0, %xmm8, %xmm8 movhps 4 * SIZE(BO), %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulps %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 8 * SIZE(BO), %xmm9 shufps $0, %xmm8, %xmm8 movhps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 16 * SIZE(BO), %xmm9 shufps $0, %xmm8, %xmm8 movhps 20 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss 3 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 24 * SIZE(BO), %xmm9 shufps $0, %xmm8, %xmm8 movhps 28 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 64 * SIZE(BO), %xmm9 shufps $0, %xmm10, %xmm10 movhps 36 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movss 5 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movsd 40 * SIZE(BO), %xmm11 shufps $0, %xmm10, %xmm10 movhps 44 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movss 6 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 48 * SIZE(BO), %xmm11 shufps $0, %xmm10, %xmm10 movhps 52 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movss 7 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movsd 56 * SIZE(BO), %xmm11 shufps $0, %xmm10, %xmm10 movhps 60 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movss 12 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 96 * SIZE(BO), %xmm11 addq $ 8 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L42 ALIGN_4 .L45: #ifndef 
TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L48 ALIGN_4 .L46: shufps $0, %xmm8, %xmm8 movhps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 8 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L46 ALIGN_4 .L48: #if! defined(TRMMKERNEL) && !defined(BETAZERO) movss 0 * SIZE(CO1), %xmm8 movss 0 * SIZE(CO2), %xmm9 movss 0 * SIZE(CO1, LDC, 2), %xmm10 movss 0 * SIZE(CO2, LDC, 2), %xmm11 #endif addps %xmm1, %xmm0 mulps %xmm15, %xmm0 movhlps %xmm0, %xmm1 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addss %xmm0, %xmm8 psrlq $32, %xmm0 addss %xmm0, %xmm9 addss %xmm1, %xmm10 psrlq $32, %xmm1 addss %xmm1, %xmm11 movss %xmm8, 0 * SIZE(CO1) movss %xmm9, 0 * SIZE(CO2) movss %xmm10, 0 * SIZE(CO1, LDC, 2) movss %xmm11, 0 * SIZE(CO2, LDC, 2) #else movss %xmm0, 0 * SIZE(CO1) psrlq $32, %xmm0 movss %xmm0, 0 * SIZE(CO2) movss %xmm1, 0 * SIZE(CO1, LDC, 2) psrlq $32, %xmm1 movss %xmm1, 0 * SIZE(CO2, LDC, 2) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leaq (C, LDC, 4), C # c += 4 * ldc decq J # j -- jg .L01 .L50: testq $2, N je .L100 .L51: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $3, %rax jle .L53 ALIGN_4 .L52: movddup 0 * SIZE(B), %xmm0 movddup 2 * SIZE(B), %xmm1 movddup 4 * SIZE(B), %xmm2 movddup 6 * SIZE(B), %xmm3 movddup 8 * SIZE(B), %xmm4 movddup 10 * SIZE(B), %xmm5 movddup 12 * SIZE(B), %xmm6 movddup 14 * SIZE(B), %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) prefetcht1 128 * SIZE(BO) prefetcht0 112 * SIZE(B) addq $16 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L53: movq K, %rax andq $7, %rax BRANCH jle .L60 ALIGN_4 .L54: movddup 0 * SIZE(B), %xmm0 movaps %xmm0, 0 * SIZE(BO) addq $ 2 * SIZE, B addq $ 4 * SIZE, BO decq %rax jne .L54 ALIGN_4 .L60: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a movq M, I sarq $3, I # i = (m >> 3) jle .L70 ALIGN_4 .L61: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 32 * SIZE(AO), %xmm12 movaps 48 * SIZE(AO), %xmm14 movsldup 0 * SIZE(BO), %xmm9 movsldup 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 prefetcht2 4 * SIZE(CO1) pxor %xmm4, %xmm4 prefetcht2 4 * SIZE(CO2) pxor %xmm5, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulps %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm9, %xmm0 movshdup 0 * 
SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsldup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm4 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm5 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 12 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm4 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 64 * SIZE(AO), %xmm8 addps %xmm9, %xmm5 movsldup 8 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movshdup 8 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movaps 20 * SIZE(AO), %xmm10 addps %xmm9, %xmm1 movsldup 8 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm4 movshdup 8 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movaps 24 * SIZE(AO), %xmm10 addps %xmm9, %xmm5 movsldup 12 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movshdup 12 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movaps 28 * SIZE(AO), %xmm10 addps %xmm9, %xmm1 movsldup 12 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm4 movshdup 12 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movaps 80 * SIZE(AO), %xmm10 addps %xmm9, %xmm5 movsldup 32 * SIZE(BO), %xmm9 mulps %xmm12, %xmm11 PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) addps %xmm11, %xmm0 movshdup 16 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 movaps 36 * SIZE(AO), %xmm12 addps %xmm11, %xmm1 movsldup 16 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 addps %xmm11, %xmm4 movshdup 16 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 movaps 40 * SIZE(AO), %xmm12 addps %xmm11, %xmm5 movsldup 20 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 addps %xmm11, %xmm0 movshdup 20 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 movaps 44 * SIZE(AO), %xmm12 addps %xmm11, %xmm1 movsldup 20 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 addps %xmm11, %xmm4 movshdup 20 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 movaps 96 * SIZE(AO), %xmm12 addps %xmm11, %xmm5 movsldup 24 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 addps %xmm11, %xmm0 movshdup 24 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 movaps 52 * SIZE(AO), %xmm14 addps %xmm11, %xmm1 movsldup 24 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 addps %xmm11, %xmm4 movshdup 24 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 movaps 56 * SIZE(AO), %xmm14 addps %xmm11, %xmm5 movsldup 28 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 addps %xmm11, %xmm0 movshdup 28 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 movaps 60 * SIZE(AO), %xmm14 addps %xmm11, %xmm1 movsldup 28 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 addps %xmm11, %xmm4 movshdup 28 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 movaps 112 * SIZE(AO), %xmm14 addps %xmm11, %xmm5 movsldup 48 * SIZE(BO), %xmm11 addq $64 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsldup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm4 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm5 movsldup 4 * SIZE(BO), %xmm9 addq $8 * SIZE, AO addq $4 * SIZE, BO decq %rax jg .L66 ALIGN_4 .L68: #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 4 * SIZE(CO1), %xmm9 movhps 6 * SIZE(CO1), %xmm9 movsd 0 * SIZE(CO2), %xmm10 movhps 2 * SIZE(CO2), %xmm10 movsd 4 * SIZE(CO2), %xmm11 movhps 6 * SIZE(CO2), %xmm11 #endif mulps %xmm15, %xmm0 mulps %xmm15, %xmm4 mulps %xmm15, %xmm1 mulps %xmm15, %xmm5 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addps %xmm8, %xmm0 addps %xmm9, %xmm4 addps %xmm10, %xmm1 addps %xmm11, %xmm5 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movsd %xmm4, 4 * SIZE(CO1) movhps %xmm4, 6 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movhps %xmm1, 2 * SIZE(CO2) movsd %xmm5, 4 * SIZE(CO2) movhps %xmm5, 6 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L61 ALIGN_4 .L70: testq $4, M je .L80 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif movaps 0 * SIZE(AO), %xmm8 movsldup 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(AO), %xmm10 movsldup 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulps %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsldup 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 12 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsldup 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movshdup 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 32 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsldup 32 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movshdup 16 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movaps 20 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsldup 20 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm2 movshdup 20 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movaps 24 * SIZE(AO), %xmm10 addps %xmm11, %xmm3 movsldup 24 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movshdup 24 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movaps 28 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsldup 28 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm2 movshdup 28 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movaps 48 * SIZE(AO), %xmm10 addps %xmm11, %xmm3 movsldup 48 * SIZE(BO), %xmm11 addq $32 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 
movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsldup 4 * SIZE(BO), %xmm9 addq $4 * SIZE, AO addq $4 * SIZE, BO decq %rax jg .L76 ALIGN_4 .L78: #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 0 * SIZE(CO2), %xmm10 movhps 2 * SIZE(CO2), %xmm10 #endif addps %xmm2, %xmm0 addps %xmm3, %xmm1 mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addps %xmm8, %xmm0 addps %xmm10, %xmm1 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movhps %xmm1, 2 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L80: testq $2, M je .L90 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif movddup 0 * SIZE(AO), %xmm8 movddup 8 * SIZE(AO), %xmm10 movsd 0 * SIZE(BO), %xmm9 movsd 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L85 ALIGN_4 .L82: shufps $0x50, %xmm9, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulps %xmm8, %xmm9 movddup 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 6 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 12 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 16 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 32 * SIZE(BO), %xmm9 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 10 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movsd 20 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 12 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 24 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 14 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movsd 28 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 24 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 48 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L82 ALIGN_4 .L85: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L88 ALIGN_4 .L86: shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 addq $2 * SIZE, AO addq $4 * SIZE, BO decq %rax jg .L86 ALIGN_4 .L88: #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhps 0 * SIZE(CO2), %xmm8 #endif addps %xmm1, %xmm0 mulps %xmm15, %xmm0 #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) addps %xmm8, %xmm0 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 ALIGN_4 .L90: testq $1, M je .L99 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 movsd 0 * SIZE(BO), %xmm9 movsd 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L95 ALIGN_4 .L92: shufps $0, %xmm8, %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulps %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 shufps $0, %xmm8, %xmm8 mulps %xmm8, %xmm9 movss 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 shufps $0, %xmm8, %xmm8 mulps %xmm8, %xmm9 movss 3 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 12 * SIZE(BO), %xmm9 shufps $0, %xmm8, %xmm8 mulps %xmm8, %xmm9 movss 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 32 * SIZE(BO), %xmm9 shufps $0, %xmm10, %xmm10 mulps %xmm10, %xmm11 movss 5 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movsd 20 * SIZE(BO), %xmm11 shufps $0, %xmm10, %xmm10 mulps %xmm10, %xmm11 movss 6 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 24 * SIZE(BO), %xmm11 shufps $0, %xmm10, %xmm10 mulps %xmm10, %xmm11 movss 7 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movsd 28 * SIZE(BO), %xmm11 shufps $0, %xmm10, %xmm10 mulps %xmm10, %xmm11 movss 12 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 48 * SIZE(BO), %xmm11 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L92 ALIGN_4 .L95: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: shufps $0, %xmm8, %xmm8 mulps %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 addq $1 * SIZE, AO addq $4 * SIZE, BO decq %rax jg .L96 ALIGN_4 .L98: #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) movss 0 * SIZE(CO1), %xmm8 movss 0 * SIZE(CO2), %xmm9 addps %xmm1, %xmm0 mulps %xmm15, %xmm0 addss %xmm0, %xmm8 psrlq $32, %xmm0 addss %xmm0, %xmm9 movss %xmm8, 0 * SIZE(CO1) movss %xmm9, 0 * SIZE(CO2) #else addps %xmm1, %xmm0 mulps %xmm15, %xmm0 movss %xmm0, 0 * SIZE(CO1) psrlq $32, %xmm0 movss %xmm0, 0 * SIZE(CO2) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leaq (C, LDC, 2), C # c += 4 * ldc ALIGN_4 .L100: testq $1, N je .L999 .L101: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $3, %rax jle .L103 ALIGN_4 .L102: movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 movss 2 * SIZE(B), %xmm2 movss 3 * SIZE(B), %xmm3 movss 4 * SIZE(B), %xmm4 movss 5 * SIZE(B), %xmm5 movss 6 * SIZE(B), %xmm6 movss 7 * SIZE(B), %xmm7 movss %xmm0, 0 * SIZE(BO) movss %xmm0, 1 * SIZE(BO) movss %xmm1, 2 * SIZE(BO) movss %xmm1, 3 * SIZE(BO) movss %xmm2, 4 * SIZE(BO) movss %xmm2, 5 * SIZE(BO) movss %xmm3, 6 * SIZE(BO) movss %xmm3, 7 * SIZE(BO) movss %xmm4, 8 * SIZE(BO) movss %xmm4, 9 * SIZE(BO) movss %xmm5, 10 * SIZE(BO) movss %xmm5, 11 * SIZE(BO) movss %xmm6, 12 * SIZE(BO) movss %xmm6, 13 * SIZE(BO) movss %xmm7, 14 * SIZE(BO) movss %xmm7, 15 * SIZE(BO) addq $ 8 * SIZE, B addq $16 * SIZE, BO decq %rax jne .L102 ALIGN_4 .L103: movq K, %rax andq $7, %rax BRANCH jle .L110 ALIGN_4 .L104: movss 0 * SIZE(B), %xmm0 movss %xmm0, 0 * SIZE(BO) movss %xmm0, 1 * SIZE(BO) addq $ 1 * SIZE, B addq $ 2 * SIZE, BO decq %rax jne .L104 ALIGN_4 .L110: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a movq M, I sarq $3, I # i = (m >> 3) jle .L120 ALIGN_4 .L111: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 32 * SIZE(AO), %xmm12 movaps 48 * SIZE(AO), %xmm14 movddup 0 * SIZE(BO), %xmm9 movddup 8 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 prefetchnta 8 * SIZE(CO1) pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L115 ALIGN_4 .L112: mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm9, %xmm0 movddup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movddup 2 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 12 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 64 * SIZE(AO), %xmm8 addps %xmm9, %xmm5 movddup 4 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movaps 20 * SIZE(AO), %xmm10 addps %xmm9, %xmm0 movddup 4 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movaps 24 * SIZE(AO), %xmm10 addps %xmm9, %xmm4 movddup 6 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movaps 28 * SIZE(AO), %xmm10 addps %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulps 
%xmm10, %xmm9 movaps 80 * SIZE(AO), %xmm10 addps %xmm9, %xmm5 PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) movddup 8 * SIZE(BO), %xmm9 mulps %xmm12, %xmm9 movaps 36 * SIZE(AO), %xmm12 addps %xmm9, %xmm0 movddup 16 * SIZE(BO), %xmm9 mulps %xmm12, %xmm11 movaps 40 * SIZE(AO), %xmm12 addps %xmm11, %xmm4 movddup 10 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 movaps 44 * SIZE(AO), %xmm12 addps %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 movaps 96 * SIZE(AO), %xmm12 addps %xmm11, %xmm5 movddup 12 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 movaps 52 * SIZE(AO), %xmm14 addps %xmm11, %xmm0 movddup 12 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 movaps 56 * SIZE(AO), %xmm14 addps %xmm11, %xmm4 movddup 14 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 movaps 60 * SIZE(AO), %xmm14 addps %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 movaps 112 * SIZE(AO), %xmm14 addps %xmm11, %xmm5 movddup 24 * SIZE(BO), %xmm11 addq $64 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L112 ALIGN_4 .L115: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movddup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movddup 2 * SIZE(BO), %xmm9 addq $8 * SIZE, AO addq $2 * SIZE, BO decq %rax jg .L116 ALIGN_4 .L118: #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 4 * SIZE(CO1), %xmm9 movhps 6 * SIZE(CO1), %xmm9 #endif addps %xmm1, %xmm0 addps %xmm5, %xmm4 mulps %xmm15, %xmm0 mulps %xmm15, %xmm4 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addps %xmm8, %xmm0 addps %xmm9, %xmm4 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movsd %xmm4, 4 * SIZE(CO1) movhps %xmm4, 6 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L111 ALIGN_4 .L120: testq $4, M je .L130 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 0 * SIZE(AO), %xmm8 movddup 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(AO), %xmm10 movddup 8 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L125 ALIGN_4 .L122: mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm9, %xmm0 movddup 2 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movddup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 12 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movddup 6 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 32 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movddup 16 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 movaps 20 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movddup 10 * 
SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movaps 24 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movddup 12 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movaps 28 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movddup 14 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movaps 48 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movddup 24 * SIZE(BO), %xmm11 addq $32 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L122 ALIGN_4 .L125: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L128 ALIGN_4 .L126: mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movddup 2 * SIZE(BO), %xmm9 addq $4 * SIZE, AO addq $2 * SIZE, BO decq %rax jg .L126 ALIGN_4 .L128: #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 #endif addps %xmm1, %xmm0 mulps %xmm15, %xmm0 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addps %xmm8, %xmm0 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L130: testq $2, M je .L140 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(AO), %xmm10 movaps 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $4, %rax je .L135 ALIGN_4 .L132: mulps %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 mulps 4 * SIZE(BO), %xmm8 addps %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm8 mulps 8 * SIZE(BO), %xmm8 addps %xmm8, %xmm2 movaps 12 * SIZE(AO), %xmm8 mulps 12 * SIZE(BO), %xmm8 addps %xmm8, %xmm3 movaps 32 * SIZE(AO), %xmm8 movaps 32 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 movaps 20 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movaps 48 * SIZE(BO), %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm10, %xmm1 movaps 24 * SIZE(AO), %xmm10 mulps 24 * SIZE(BO), %xmm10 addps %xmm10, %xmm2 movaps 28 * SIZE(AO), %xmm10 mulps 28 * SIZE(BO), %xmm10 addps %xmm10, %xmm3 movaps 48 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L132 ALIGN_4 .L135: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $15, %rax # if (k & 1) BRANCH je .L138 ALIGN_4 .L136: movsd 0 * SIZE(AO), %xmm8 movsd 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 addq $2 * SIZE, AO addq $2 * SIZE, BO decq %rax jg .L136 ALIGN_4 .L138: #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 #endif addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm2, %xmm0 movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 mulps %xmm15, %xmm0 #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) addps %xmm8, %xmm0 #endif movsd %xmm0, 0 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 4 ALIGN_4 .L140: testq $1, M je .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 8 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L145 ALIGN_4 .L142: mulss %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movss 1 * SIZE(AO), %xmm8 mulss 2 * SIZE(BO), %xmm8 addss %xmm9, %xmm0 movss 16 * SIZE(BO), %xmm9 addss %xmm8, %xmm1 movss 2 * SIZE(AO), %xmm8 mulss 4 * SIZE(BO), %xmm8 addss %xmm8, %xmm2 movss 3 * SIZE(AO), %xmm8 mulss 6 * SIZE(BO), %xmm8 addss %xmm8, %xmm3 movss 8 * SIZE(AO), %xmm8 mulss %xmm10, %xmm11 movss 5 * SIZE(AO), %xmm10 mulss 10 * SIZE(BO), %xmm10 addss %xmm11, %xmm0 movss 24 * SIZE(BO), %xmm11 addss %xmm10, %xmm1 movss 6 * SIZE(AO), %xmm10 mulss 12 * SIZE(BO), %xmm10 addss %xmm10, %xmm2 movss 7 * SIZE(AO), %xmm10 mulss 14 * SIZE(BO), %xmm10 addss %xmm10, %xmm3 movss 12 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L142 ALIGN_4 .L145: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movss ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L148 ALIGN_4 .L146: movss 0 * SIZE(AO), %xmm8 movss 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 addq $1 * SIZE, AO addq $2 * SIZE, BO decq %rax jg .L146 ALIGN_4 .L148: #if! defined(TRMMKERNEL) && !defined(BETAZERO) movss 0 * SIZE(CO1), %xmm8 #endif addss %xmm1, %xmm0 addss %xmm3, %xmm2 addss %xmm2, %xmm0 mulss %xmm15, %xmm0 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addss %xmm8, %xmm0 #endif movss %xmm0, 0 * SIZE(CO1) ALIGN_4 .L999: movq %rbx, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_ncopy_2.S000066400000000000000000000152031313527062700200650ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if defined(NEHALEM) #define RPREFETCHSIZE 12 #define WPREFETCHSIZE (RPREFETCHSIZE * 2) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #if defined(SANDYBRIDGE) #define RPREFETCHSIZE 12 #define WPREFETCHSIZE (RPREFETCHSIZE * 2) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ #define N ARG2 /* rsi */ #define A ARG3 /* rdx */ #define LDA ARG4 /* rcx */ #define B ARG5 /* r8 */ #define I %r9 #else #define STACKSIZE 256 #define M ARG1 /* rcx */ #define N ARG2 /* rdx */ #define A ARG3 /* r8 */ #define LDA ARG4 /* r9 */ #define OLD_B 40 + 32 + STACKSIZE(%rsp) #define B %r14 #define I %r15 #endif #define J %r10 #define AO1 %r11 #define AO2 %r12 #define AO3 %r13 #define AO4 %rax PROLOGUE PROFCODE #ifdef WINDOWS_ABI pushq %r15 pushq %r14 #endif pushq %r13 pushq %r12 #ifdef WINDOWS_ABI subq $STACKSIZE, %rsp movups %xmm6, 0(%rsp) movups %xmm7, 16(%rsp) movups %xmm8, 32(%rsp) movups %xmm9, 48(%rsp) movups %xmm10, 64(%rsp) movups %xmm11, 80(%rsp) movups %xmm12, 96(%rsp) movups %xmm13, 112(%rsp) movups %xmm14, 128(%rsp) movups %xmm15, 144(%rsp) movq OLD_B, B #endif leaq (,LDA, SIZE), LDA # Scaling movq N, J sarq $1, J jle .L20 ALIGN_4 .L12: movq A, AO1 leaq (A, LDA), AO2 leaq (A, LDA, 2), A movq M, I sarq $2, I jle .L14 ALIGN_4 .L13: #ifndef DOUBLE movss 0 * SIZE(AO1), %xmm0 movss 0 * SIZE(AO2), %xmm1 movss 1 * SIZE(AO1), %xmm2 movss 1 * SIZE(AO2), %xmm3 movss 2 * SIZE(AO1), %xmm4 movss 2 * SIZE(AO2), %xmm5 movss 3 * SIZE(AO1), %xmm6 movss 3 * SIZE(AO2), %xmm7 movss %xmm0, 0 * SIZE(B) movss %xmm1, 1 * SIZE(B) movss %xmm2, 2 * SIZE(B) movss %xmm3, 3 * SIZE(B) movss %xmm4, 4 * SIZE(B) movss %xmm5, 5 * SIZE(B) movss %xmm6, 6 * SIZE(B) movss %xmm7, 7 * SIZE(B) #else PREFETCH RPREFETCHSIZE * SIZE(AO1) movsd 0 * SIZE(AO1), %xmm0 movhpd 0 * SIZE(AO2), %xmm0 movsd 1 * SIZE(AO1), %xmm1 movhpd 1 * SIZE(AO2), %xmm1 PREFETCH RPREFETCHSIZE * SIZE(AO2) movsd 2 * SIZE(AO1), 
%xmm2 movhpd 2 * SIZE(AO2), %xmm2 movsd 3 * SIZE(AO1), %xmm3 movhpd 3 * SIZE(AO2), %xmm3 PREFETCHW WPREFETCHSIZE * SIZE(B) movapd %xmm0, 0 * SIZE(B) movapd %xmm1, 2 * SIZE(B) movapd %xmm2, 4 * SIZE(B) movapd %xmm3, 6 * SIZE(B) #endif addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-8 * SIZE, B decq I jg .L13 ALIGN_4 .L14: movq M, I andq $3, I jle .L16 ALIGN_4 .L15: #ifndef DOUBLE movss 0 * SIZE(AO1), %xmm0 movss 0 * SIZE(AO2), %xmm1 movss %xmm0, 0 * SIZE(B) movss %xmm1, 1 * SIZE(B) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 0 * SIZE(AO2), %xmm0 movapd %xmm0, 0 * SIZE(B) #endif addq $SIZE, AO1 addq $SIZE, AO2 addq $2 * SIZE, B decq I jg .L15 ALIGN_4 .L16: decq J jg .L12 ALIGN_4 .L20: testq $1, N jle .L999 movq A, AO1 movq M, I sarq $2, I jle .L34 ALIGN_4 .L33: #ifndef DOUBLE movss 0 * SIZE(AO1), %xmm0 movss 1 * SIZE(AO1), %xmm1 movss 2 * SIZE(AO1), %xmm2 movss 3 * SIZE(AO1), %xmm3 movss %xmm0, 0 * SIZE(B) movss %xmm1, 1 * SIZE(B) movss %xmm2, 2 * SIZE(B) movss %xmm3, 3 * SIZE(B) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movsd 2 * SIZE(AO1), %xmm1 movhpd 3 * SIZE(AO1), %xmm1 movapd %xmm0, 0 * SIZE(B) movapd %xmm1, 2 * SIZE(B) #endif addq $4 * SIZE, AO1 subq $-4 * SIZE, B decq I jg .L33 ALIGN_4 .L34: movq M, I andq $3, I jle .L999 ALIGN_4 .L35: #ifndef DOUBLE movss 0 * SIZE(AO1), %xmm0 movss %xmm0, 0 * SIZE(B) #else movsd 0 * SIZE(AO1), %xmm0 movsd %xmm0, 0 * SIZE(B) #endif addq $SIZE, AO1 addq $1 * SIZE, B decq I jg .L35 ALIGN_4 .L999: #ifdef WINDOWS_ABI movups 0(%rsp), %xmm6 movups 16(%rsp), %xmm7 movups 32(%rsp), %xmm8 movups 48(%rsp), %xmm9 movups 64(%rsp), %xmm10 movups 80(%rsp), %xmm11 movups 96(%rsp), %xmm12 movups 112(%rsp), %xmm13 movups 128(%rsp), %xmm14 movups 144(%rsp), %xmm15 addq $STACKSIZE, %rsp #endif popq %r12 popq %r13 #ifdef WINDOWS_ABI popq %r14 popq %r15 #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_ncopy_2_bulldozer.S000066400000000000000000000177741313527062700221660ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ #define N ARG2 /* rsi */ #define A ARG3 /* rdx */ #define LDA ARG4 /* rcx */ #define B ARG5 /* r8 */ #define I %r9 #else #define STACKSIZE 256 #define M ARG1 /* rcx */ #define N ARG2 /* rdx */ #define A ARG3 /* r8 */ #define LDA ARG4 /* r9 */ #define OLD_B 40 + 32 + STACKSIZE(%rsp) #define B %r14 #define I %r15 #endif #define J %r10 #define AO1 %r11 #define AO2 %r12 #define AO3 %r13 #define AO4 %rax PROLOGUE PROFCODE #ifdef WINDOWS_ABI pushq %r15 pushq %r14 #endif pushq %r13 pushq %r12 #ifdef WINDOWS_ABI subq $STACKSIZE, %rsp vmovups %xmm6, 0(%rsp) vmovups %xmm7, 16(%rsp) vmovups %xmm8, 32(%rsp) vmovups %xmm9, 48(%rsp) vmovups %xmm10, 64(%rsp) vmovups %xmm11, 80(%rsp) vmovups %xmm12, 96(%rsp) vmovups %xmm13, 112(%rsp) vmovups %xmm14, 128(%rsp) vmovups %xmm15, 144(%rsp) movq OLD_B, B #endif leaq (,LDA, SIZE), LDA # Scaling movq N, J sarq $1, J jle .L20 ALIGN_4 .L01: movq A, AO1 leaq (A, LDA), AO2 leaq (A, LDA, 2), A movq M, I sarq $3, I jle .L08 ALIGN_4 .L03: #ifndef DOUBLE vmovss 0 * SIZE(AO1), %xmm0 vmovss 0 * SIZE(AO2), %xmm1 vmovss 1 * SIZE(AO1), %xmm2 vmovss 1 * SIZE(AO2), %xmm3 vmovss 2 * SIZE(AO1), %xmm4 vmovss 2 * SIZE(AO2), %xmm5 vmovss 3 * SIZE(AO1), %xmm6 vmovss 3 * SIZE(AO2), %xmm7 vmovss 4 * SIZE(AO1), %xmm8 vmovss 4 * SIZE(AO2), %xmm9 vmovss 5 * SIZE(AO1), %xmm10 vmovss 5 * SIZE(AO2), %xmm11 vmovss 6 * SIZE(AO1), %xmm12 vmovss 6 * SIZE(AO2), %xmm13 vmovss 7 * SIZE(AO1), %xmm14 vmovss 7 * SIZE(AO2), %xmm15 vmovss %xmm0, 0 * SIZE(B) vmovss %xmm1, 1 * SIZE(B) vmovss %xmm2, 2 * SIZE(B) vmovss %xmm3, 3 * SIZE(B) vmovss %xmm4, 4 * SIZE(B) vmovss %xmm5, 5 * SIZE(B) vmovss %xmm6, 6 * SIZE(B) vmovss %xmm7, 7 * SIZE(B) vmovss %xmm8, 8 * SIZE(B) vmovss %xmm9, 9 * SIZE(B) vmovss %xmm10, 10 * SIZE(B) vmovss %xmm11, 11 * SIZE(B) vmovss %xmm12, 12 * SIZE(B) vmovss %xmm13, 13 * SIZE(B) vmovss %xmm14, 14 * SIZE(B) vmovss %xmm15, 15 * SIZE(B) #else prefetchw 256(B) prefetchnta 256(AO1) vmovsd 0 * SIZE(AO1), %xmm0 vmovsd 1 * SIZE(AO1), %xmm1 vmovsd 2 * SIZE(AO1), %xmm2 vmovsd 3 * SIZE(AO1), %xmm3 vmovsd 4 * SIZE(AO1), %xmm4 vmovsd 5 * SIZE(AO1), %xmm5 vmovsd 6 * SIZE(AO1), %xmm6 vmovsd 7 * SIZE(AO1), %xmm7 prefetchnta 256(AO2) vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0 vmovhpd 1 * SIZE(AO2), %xmm1 , %xmm1 vmovhpd 2 * SIZE(AO2), %xmm2 , %xmm2 vmovhpd 3 * SIZE(AO2), %xmm3 , %xmm3 vmovhpd 4 * SIZE(AO2), %xmm4 , %xmm4 vmovhpd 5 * SIZE(AO2), %xmm5 , %xmm5 vmovhpd 6 * SIZE(AO2), %xmm6 , %xmm6 vmovhpd 7 * SIZE(AO2), %xmm7 , %xmm7 prefetchw 256+64(B) vmovups %xmm0, 0 * SIZE(B) vmovups %xmm1, 2 * SIZE(B) vmovups %xmm2, 4 * SIZE(B) vmovups %xmm3, 6 * SIZE(B) vmovups %xmm4, 8 * SIZE(B) vmovups %xmm5, 10 * SIZE(B) vmovups %xmm6, 12 * SIZE(B) vmovups %xmm7, 14 * SIZE(B) #endif addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 subq $-16 * SIZE, B decq I jg .L03 ALIGN_4 .L08: testq $4 , M je .L14 ALIGN_4 .L13: #ifndef DOUBLE vmovss 0 * SIZE(AO1), %xmm0 vmovss 0 * SIZE(AO2), %xmm1 vmovss 1 * SIZE(AO1), %xmm2 vmovss 1 * SIZE(AO2), %xmm3 vmovss 2 * SIZE(AO1), %xmm4 vmovss 2 * SIZE(AO2), %xmm5 vmovss 3 * SIZE(AO1), %xmm6 vmovss 3 * SIZE(AO2), %xmm7 vmovss %xmm0, 0 * SIZE(B) vmovss %xmm1, 1 * SIZE(B) vmovss %xmm2, 2 * 
SIZE(B) vmovss %xmm3, 3 * SIZE(B) vmovss %xmm4, 4 * SIZE(B) vmovss %xmm5, 5 * SIZE(B) vmovss %xmm6, 6 * SIZE(B) vmovss %xmm7, 7 * SIZE(B) #else vmovsd 0 * SIZE(AO1), %xmm0 vmovsd 1 * SIZE(AO1), %xmm1 vmovsd 2 * SIZE(AO1), %xmm2 vmovsd 3 * SIZE(AO1), %xmm3 vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0 vmovhpd 1 * SIZE(AO2), %xmm1 , %xmm1 vmovhpd 2 * SIZE(AO2), %xmm2 , %xmm2 vmovhpd 3 * SIZE(AO2), %xmm3 , %xmm3 vmovups %xmm0, 0 * SIZE(B) vmovups %xmm1, 2 * SIZE(B) vmovups %xmm2, 4 * SIZE(B) vmovups %xmm3, 6 * SIZE(B) #endif addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-8 * SIZE, B ALIGN_4 .L14: movq M, I andq $3, I jle .L16 ALIGN_4 .L15: #ifndef DOUBLE vmovss 0 * SIZE(AO1), %xmm0 vmovss 0 * SIZE(AO2), %xmm1 vmovss %xmm0, 0 * SIZE(B) vmovss %xmm1, 1 * SIZE(B) #else vmovsd 0 * SIZE(AO1), %xmm0 vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0 vmovups %xmm0, 0 * SIZE(B) #endif addq $SIZE, AO1 addq $SIZE, AO2 addq $2 * SIZE, B decq I jg .L15 ALIGN_4 .L16: decq J jg .L01 ALIGN_4 .L20: testq $1, N jle .L999 movq A, AO1 movq M, I sarq $2, I jle .L34 ALIGN_4 .L33: #ifndef DOUBLE vmovups 0 * SIZE(AO1), %xmm0 vmovups %xmm0, 0 * SIZE(B) #else vmovups 0 * SIZE(AO1), %xmm0 vmovups 2 * SIZE(AO1), %xmm1 vmovups %xmm0, 0 * SIZE(B) vmovups %xmm1, 2 * SIZE(B) #endif addq $4 * SIZE, AO1 subq $-4 * SIZE, B decq I jg .L33 ALIGN_4 .L34: movq M, I andq $3, I jle .L999 ALIGN_4 .L35: #ifndef DOUBLE vmovss 0 * SIZE(AO1), %xmm0 vmovss %xmm0, 0 * SIZE(B) #else vmovsd 0 * SIZE(AO1), %xmm0 vmovsd %xmm0, 0 * SIZE(B) #endif addq $SIZE, AO1 addq $1 * SIZE, B decq I jg .L35 ALIGN_4 .L999: #ifdef WINDOWS_ABI vmovups 0(%rsp), %xmm6 vmovups 16(%rsp), %xmm7 vmovups 32(%rsp), %xmm8 vmovups 48(%rsp), %xmm9 vmovups 64(%rsp), %xmm10 vmovups 80(%rsp), %xmm11 vmovups 96(%rsp), %xmm12 vmovups 112(%rsp), %xmm13 vmovups 128(%rsp), %xmm14 vmovups 144(%rsp), %xmm15 addq $STACKSIZE, %rsp #endif popq %r12 popq %r13 #ifdef WINDOWS_ABI popq %r14 popq %r15 #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_ncopy_4.S000066400000000000000000000244321313527062700200730ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if defined(PENTIUM4) || defined(GENERIC) #define RPREFETCHSIZE 16 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define RPREFETCHSIZE 12 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht2 #endif #ifdef ATOM #define RPREFETCHSIZE 16 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifdef NANO #define RPREFETCHSIZE 16 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifdef BARCELONA #define RPREFETCHSIZE 16 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetch #define PREFETCHW prefetchw #endif #ifdef GENERIC #define RPREFETCHSIZE 16 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ #define N ARG2 /* rsi */ #define A ARG3 /* rdx */ #define LDA ARG4 /* rcx */ #define B ARG5 /* r8 */ #define I %r9 #else #define STACKSIZE 256 #define M ARG1 /* rcx */ #define N ARG2 /* rdx */ #define A ARG3 /* r8 */ #define LDA ARG4 /* r9 */ #define OLD_B 40 + 32 + STACKSIZE(%rsp) #define B %r14 #define I %r15 #endif #define J %r10 #define AO1 %r11 #define AO2 %r12 #define AO3 %r13 #define AO4 %rax PROLOGUE PROFCODE #ifdef WINDOWS_ABI pushq %r15 pushq %r14 #endif pushq %r13 pushq %r12 #ifdef WINDOWS_ABI subq $STACKSIZE, %rsp movups %xmm6, 0(%rsp) movups %xmm7, 16(%rsp) movups %xmm8, 32(%rsp) movups %xmm9, 48(%rsp) movups %xmm10, 64(%rsp) movups %xmm11, 80(%rsp) movups %xmm12, 96(%rsp) movups %xmm13, 112(%rsp) movups %xmm14, 128(%rsp) movups %xmm15, 144(%rsp) movq OLD_B, B #endif leaq (,LDA, SIZE), LDA # Scaling movq N, J sarq $2, J jle .L20 ALIGN_4 .L12: movq A, AO1 leaq (A, LDA), AO2 leaq (A, LDA, 2), AO3 leaq (AO2, LDA, 2), AO4 leaq (A, LDA, 4), A movq M, I sarq $2, I jle .L14 ALIGN_4 .L13: #ifndef DOUBLE movss 0 * SIZE(AO1), %xmm0 movss 0 * SIZE(AO2), %xmm1 movss 0 * SIZE(AO3), %xmm2 movss 0 * SIZE(AO4), %xmm3 movss 1 * SIZE(AO1), %xmm4 movss 1 * SIZE(AO2), %xmm5 movss 1 * SIZE(AO3), %xmm6 movss 1 * SIZE(AO4), %xmm7 movss 2 * SIZE(AO1), %xmm8 movss 2 * SIZE(AO2), %xmm9 movss 2 * SIZE(AO3), %xmm10 movss 2 * SIZE(AO4), %xmm11 movss 3 * SIZE(AO1), %xmm12 movss 3 * SIZE(AO2), %xmm13 movss 3 * SIZE(AO3), %xmm14 movss 3 * SIZE(AO4), %xmm15 movss %xmm0, 0 * SIZE(B) movss %xmm1, 1 * SIZE(B) movss %xmm2, 2 * SIZE(B) movss %xmm3, 3 * SIZE(B) movss 
%xmm4, 4 * SIZE(B) movss %xmm5, 5 * SIZE(B) movss %xmm6, 6 * SIZE(B) movss %xmm7, 7 * SIZE(B) PREFETCH RPREFETCHSIZE * SIZE(AO1) PREFETCH RPREFETCHSIZE * SIZE(AO2) PREFETCH RPREFETCHSIZE * SIZE(AO3) PREFETCH RPREFETCHSIZE * SIZE(AO4) PREFETCHW WPREFETCHSIZE * SIZE(B) movss %xmm8, 8 * SIZE(B) movss %xmm9, 9 * SIZE(B) movss %xmm10, 10 * SIZE(B) movss %xmm11, 11 * SIZE(B) movss %xmm12, 12 * SIZE(B) movss %xmm13, 13 * SIZE(B) movss %xmm14, 14 * SIZE(B) movss %xmm15, 15 * SIZE(B) #else PREFETCH RPREFETCHSIZE * SIZE(AO1) movsd 0 * SIZE(AO1), %xmm0 movhpd 0 * SIZE(AO2), %xmm0 movsd 1 * SIZE(AO1), %xmm2 movhpd 1 * SIZE(AO2), %xmm2 PREFETCH RPREFETCHSIZE * SIZE(AO2) movsd 2 * SIZE(AO1), %xmm4 movhpd 2 * SIZE(AO2), %xmm4 movsd 3 * SIZE(AO1), %xmm6 movhpd 3 * SIZE(AO2), %xmm6 PREFETCH RPREFETCHSIZE * SIZE(AO3) movsd 0 * SIZE(AO3), %xmm1 movhpd 0 * SIZE(AO4), %xmm1 movsd 1 * SIZE(AO3), %xmm3 movhpd 1 * SIZE(AO4), %xmm3 PREFETCH RPREFETCHSIZE * SIZE(AO4) movsd 2 * SIZE(AO3), %xmm5 movhpd 2 * SIZE(AO4), %xmm5 movsd 3 * SIZE(AO3), %xmm7 movhpd 3 * SIZE(AO4), %xmm7 PREFETCHW WPREFETCHSIZE * SIZE(B) movapd %xmm0, 0 * SIZE(B) movapd %xmm1, 2 * SIZE(B) movapd %xmm2, 4 * SIZE(B) movapd %xmm3, 6 * SIZE(B) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) PREFETCHW (WPREFETCHSIZE + 8) * SIZE(B) #endif movapd %xmm4, 8 * SIZE(B) movapd %xmm5, 10 * SIZE(B) movapd %xmm6, 12 * SIZE(B) movapd %xmm7, 14 * SIZE(B) #endif addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 addq $4 * SIZE, AO3 addq $4 * SIZE, AO4 subq $-16 * SIZE, B decq I jg .L13 ALIGN_4 .L14: movq M, I andq $3, I jle .L16 ALIGN_4 .L15: #ifndef DOUBLE movss 0 * SIZE(AO1), %xmm0 movss 0 * SIZE(AO2), %xmm1 movss 0 * SIZE(AO3), %xmm2 movss 0 * SIZE(AO4), %xmm3 movss %xmm0, 0 * SIZE(B) movss %xmm1, 1 * SIZE(B) movss %xmm2, 2 * SIZE(B) movss %xmm3, 3 * SIZE(B) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 0 * SIZE(AO2), %xmm0 movsd 0 * SIZE(AO3), %xmm1 movhpd 0 * SIZE(AO4), %xmm1 movapd %xmm0, 0 * SIZE(B) movapd %xmm1, 2 * SIZE(B) #endif addq $SIZE, AO1 addq $SIZE, AO2 addq $SIZE, AO3 addq $SIZE, AO4 addq $4 * SIZE, B decq I jg .L15 ALIGN_4 .L16: decq J jg .L12 ALIGN_4 .L20: testq $2, N jle .L30 movq A, AO1 leaq (A, LDA), AO2 leaq (A, LDA, 2), A movq M, I sarq $2, I jle .L24 ALIGN_4 .L23: #ifndef DOUBLE movss 0 * SIZE(AO1), %xmm0 movss 0 * SIZE(AO2), %xmm1 movss 1 * SIZE(AO1), %xmm2 movss 1 * SIZE(AO2), %xmm3 movss 2 * SIZE(AO1), %xmm4 movss 2 * SIZE(AO2), %xmm5 movss 3 * SIZE(AO1), %xmm6 movss 3 * SIZE(AO2), %xmm7 movss %xmm0, 0 * SIZE(B) movss %xmm1, 1 * SIZE(B) movss %xmm2, 2 * SIZE(B) movss %xmm3, 3 * SIZE(B) movss %xmm4, 4 * SIZE(B) movss %xmm5, 5 * SIZE(B) movss %xmm6, 6 * SIZE(B) movss %xmm7, 7 * SIZE(B) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 0 * SIZE(AO2), %xmm0 movsd 1 * SIZE(AO1), %xmm1 movhpd 1 * SIZE(AO2), %xmm1 movsd 2 * SIZE(AO1), %xmm2 movhpd 2 * SIZE(AO2), %xmm2 movsd 3 * SIZE(AO1), %xmm3 movhpd 3 * SIZE(AO2), %xmm3 movapd %xmm0, 0 * SIZE(B) movapd %xmm1, 2 * SIZE(B) movapd %xmm2, 4 * SIZE(B) movapd %xmm3, 6 * SIZE(B) #endif PREFETCH RPREFETCHSIZE * SIZE(AO1) PREFETCH RPREFETCHSIZE * SIZE(AO2) PREFETCHW WPREFETCHSIZE * SIZE(B) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-8 * SIZE, B decq I jg .L23 ALIGN_4 .L24: movq M, I andq $3, I jle .L30 ALIGN_4 .L25: #ifndef DOUBLE movss 0 * SIZE(AO1), %xmm0 movss 0 * SIZE(AO2), %xmm1 movss %xmm0, 0 * SIZE(B) movss %xmm1, 1 * SIZE(B) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 0 * SIZE(AO2), %xmm0 movapd %xmm0, 0 * SIZE(B) #endif addq $SIZE, AO1 addq $SIZE, AO2 addq $2 * SIZE, B decq I jg .L25 ALIGN_4 .L30: testq 
$1, N jle .L999 movq A, AO1 movq M, I sarq $2, I jle .L34 ALIGN_4 .L33: #ifndef DOUBLE movss 0 * SIZE(AO1), %xmm0 movss 1 * SIZE(AO1), %xmm1 movss 2 * SIZE(AO1), %xmm2 movss 3 * SIZE(AO1), %xmm3 movss %xmm0, 0 * SIZE(B) movss %xmm1, 1 * SIZE(B) movss %xmm2, 2 * SIZE(B) movss %xmm3, 3 * SIZE(B) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movsd 2 * SIZE(AO1), %xmm1 movhpd 3 * SIZE(AO1), %xmm1 movapd %xmm0, 0 * SIZE(B) movapd %xmm1, 2 * SIZE(B) #endif addq $4 * SIZE, AO1 subq $-4 * SIZE, B decq I jg .L33 ALIGN_4 .L34: movq M, I andq $3, I jle .L999 ALIGN_4 .L35: #ifndef DOUBLE movss 0 * SIZE(AO1), %xmm0 movss %xmm0, 0 * SIZE(B) #else movsd 0 * SIZE(AO1), %xmm0 movsd %xmm0, 0 * SIZE(B) #endif addq $SIZE, AO1 addq $1 * SIZE, B decq I jg .L35 ALIGN_4 .L999: #ifdef WINDOWS_ABI movups 0(%rsp), %xmm6 movups 16(%rsp), %xmm7 movups 32(%rsp), %xmm8 movups 48(%rsp), %xmm9 movups 64(%rsp), %xmm10 movups 80(%rsp), %xmm11 movups 96(%rsp), %xmm12 movups 112(%rsp), %xmm13 movups 128(%rsp), %xmm14 movups 144(%rsp), %xmm15 addq $STACKSIZE, %rsp #endif popq %r12 popq %r13 #ifdef WINDOWS_ABI popq %r14 popq %r15 #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_ncopy_4_opteron.S000066400000000000000000000200771313527062700216420ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCHSIZE (12 + 4) #define WPREFETCHSIZE (48 + 4) #define MOVNTQ MOVQ #else #define RPREFETCHSIZE (12 + 4) #define WPREFETCHSIZE (24 + 4) #define MOVNTQ MOVQ #endif #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ #define N ARG2 /* rsi */ #define A ARG3 /* rdx */ #define LDA ARG4 /* rcx */ #define B ARG5 /* r8 */ #define I %r9 #else #define STACKSIZE 256 #define M ARG1 /* rcx */ #define N ARG2 /* rdx */ #define A ARG3 /* r8 */ #define LDA ARG4 /* r9 */ #define OLD_B 40 + 32 + STACKSIZE(%rsp) #define B %r14 #define I %r15 #endif #define J %r10 #define AO1 %r11 #define AO2 %r12 #define AO3 %r13 #define AO4 %rax #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCH prefetch #else #define RPREFETCH prefetch #endif PROLOGUE PROFCODE #ifdef WINDOWS_ABI pushq %r15 pushq %r14 #endif pushq %r13 pushq %r12 #ifdef WINDOWS_ABI subq $STACKSIZE, %rsp movups %xmm6, 0(%rsp) movups %xmm7, 16(%rsp) movups %xmm8, 32(%rsp) movups %xmm9, 48(%rsp) movups %xmm10, 64(%rsp) movups %xmm11, 80(%rsp) movups %xmm12, 96(%rsp) movups %xmm13, 112(%rsp) movups %xmm14, 128(%rsp) movups %xmm15, 144(%rsp) movq OLD_B, B #endif EMMS leaq (,LDA, SIZE), LDA # Scaling movq N, J sarq $2, J jle .L20 ALIGN_4 .L11: #if 0 movq A, AO1 leaq (A, LDA, 1), AO2 leaq (A, LDA, 2), AO3 leaq (AO2, LDA, 2), AO4 movq M, I sarq $4, I jle .L13 ALIGN_4 .L12: MOVQ 0 * SIZE(AO1), %mm0 addq $8 * SIZE, AO1 MOVQ 0 * SIZE(AO2), %mm1 addq $8 * SIZE, AO2 MOVQ 0 * SIZE(AO3), %mm2 addq $8 * SIZE, AO3 MOVQ 0 * SIZE(AO4), %mm3 addq $8 * SIZE, AO4 decq I jg .L12 ALIGN_4 .L13: #endif movq A, AO1 leaq (A, LDA), AO2 leaq (A, LDA, 2), AO3 leaq (AO2, LDA, 2), AO4 leaq (A, LDA, 4), A movq M, I sarq $2, I jle .L15 ALIGN_4 .L14: RPREFETCH (RPREFETCHSIZE) * SIZE(AO1) MOVQ 0 * SIZE(AO1), %mm0 MOVNTQ %mm0, 0 * SIZE(B) MOVQ 0 * SIZE(AO2), %mm1 MOVNTQ %mm1, 1 * SIZE(B) RPREFETCH (RPREFETCHSIZE) * SIZE(AO2) MOVQ 0 * SIZE(AO3), %mm2 MOVNTQ %mm2, 2 * SIZE(B) MOVQ 0 * SIZE(AO4), %mm3 MOVNTQ %mm3, 3 * SIZE(B) prefetchw (WPREFETCHSIZE + 0) * SIZE(B) MOVQ 1 * SIZE(AO1), %mm4 MOVNTQ %mm4, 4 * SIZE(B) MOVQ 1 * SIZE(AO2), %mm5 MOVNTQ %mm5, 5 * SIZE(B) MOVQ 1 * SIZE(AO3), %mm6 MOVNTQ %mm6, 6 * SIZE(B) MOVQ 1 * SIZE(AO4), %mm7 MOVNTQ %mm7, 7 * SIZE(B) RPREFETCH (RPREFETCHSIZE) * SIZE(AO3) MOVQ 2 * SIZE(AO1), %mm0 MOVNTQ %mm0, 8 * SIZE(B) MOVQ 2 * SIZE(AO2), %mm1 MOVNTQ %mm1, 9 * SIZE(B) RPREFETCH (RPREFETCHSIZE) * SIZE(AO4) MOVQ 2 * SIZE(AO3), %mm2 MOVNTQ %mm2, 10 * SIZE(B) MOVQ 2 * SIZE(AO4), %mm3 MOVNTQ %mm3, 11 * SIZE(B) prefetchw (WPREFETCHSIZE + 8) * SIZE(B) MOVQ 3 * SIZE(AO1), %mm4 MOVNTQ %mm4, 12 * SIZE(B) MOVQ 3 * SIZE(AO2), %mm5 MOVNTQ %mm5, 13 * SIZE(B) MOVQ 3 * SIZE(AO3), %mm6 MOVNTQ %mm6, 14 * SIZE(B) MOVQ 3 * SIZE(AO4), %mm7 MOVNTQ %mm7, 15 * SIZE(B) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 addq $4 * SIZE, AO3 addq $4 * SIZE, AO4 subq $-16 * SIZE, B decq I jg .L14 ALIGN_4 .L15: movq M, I andq $3, I jle .L17 ALIGN_4 .L16: MOVQ 0 * SIZE(AO1), %mm0 MOVQ 0 * SIZE(AO2), %mm1 MOVQ 0 * SIZE(AO3), %mm2 MOVQ 0 * SIZE(AO4), %mm3 MOVNTQ %mm0, 0 * SIZE(B) MOVNTQ %mm1, 1 * SIZE(B) MOVNTQ %mm2, 2 * SIZE(B) MOVNTQ %mm3, 3 * SIZE(B) addq $SIZE, AO1 addq $SIZE, AO2 addq $SIZE, AO3 addq $SIZE, AO4 addq $4 * SIZE, B decq I jg .L16 ALIGN_4 .L17: decq J jg .L11 ALIGN_4 .L20: testq $2, N jle .L30 movq A, AO1 leaq (A, LDA), AO2 leaq (A, LDA, 2), A movq M, I sarq $2, I jle .L24 ALIGN_4 
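/* Remaining-two-columns path (N & 2): the .L23 loop below reads four   */
/* entries from each of the two column pointers AO1 and AO2 and writes  */
/* them pairwise interleaved (AO1[i], AO2[i]) into eight consecutive    */
/* slots of the packed buffer B, advancing B by 8 * SIZE per pass.      */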
.L23: prefetch (RPREFETCHSIZE) * SIZE(AO1) MOVQ 0 * SIZE(AO1), %mm0 prefetch (RPREFETCHSIZE) * SIZE(AO2) MOVQ 0 * SIZE(AO2), %mm1 MOVQ 1 * SIZE(AO1), %mm2 MOVQ 1 * SIZE(AO2), %mm3 MOVQ 2 * SIZE(AO1), %mm4 MOVQ 2 * SIZE(AO2), %mm5 MOVQ 3 * SIZE(AO1), %mm6 MOVQ 3 * SIZE(AO2), %mm7 prefetchw (WPREFETCHSIZE + 0) * SIZE(B) MOVNTQ %mm0, 0 * SIZE(B) MOVNTQ %mm1, 1 * SIZE(B) MOVNTQ %mm2, 2 * SIZE(B) MOVNTQ %mm3, 3 * SIZE(B) MOVNTQ %mm4, 4 * SIZE(B) MOVNTQ %mm5, 5 * SIZE(B) MOVNTQ %mm6, 6 * SIZE(B) MOVNTQ %mm7, 7 * SIZE(B) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 subq $-8 * SIZE, B decq I jg .L23 ALIGN_4 .L24: movq M, I andq $3, I jle .L30 ALIGN_4 .L25: MOVQ 0 * SIZE(AO1), %mm0 MOVQ 0 * SIZE(AO2), %mm1 MOVNTQ %mm0, 0 * SIZE(B) MOVNTQ %mm1, 1 * SIZE(B) addq $SIZE, AO1 addq $SIZE, AO2 addq $2 * SIZE, B decq I jg .L25 ALIGN_4 .L30: testq $1, N jle .L999 movq A, AO1 movq M, I sarq $2, I jle .L34 ALIGN_4 .L33: MOVQ 0 * SIZE(AO1), %mm0 MOVQ 1 * SIZE(AO1), %mm1 MOVQ 2 * SIZE(AO1), %mm2 MOVQ 3 * SIZE(AO1), %mm3 MOVNTQ %mm0, 0 * SIZE(B) MOVNTQ %mm1, 1 * SIZE(B) MOVNTQ %mm2, 2 * SIZE(B) MOVNTQ %mm3, 3 * SIZE(B) addq $4 * SIZE, AO1 subq $-4 * SIZE, B decq I jg .L33 ALIGN_4 .L34: movq M, I andq $3, I jle .L999 ALIGN_4 .L35: MOVQ 0 * SIZE(AO1), %mm0 addq $SIZE, AO1 MOVNTQ %mm0, 0 * SIZE(B) addq $1 * SIZE, B decq I jg .L35 ALIGN_4 .L999: EMMS #ifdef WINDOWS_ABI movups 0(%rsp), %xmm6 movups 16(%rsp), %xmm7 movups 32(%rsp), %xmm8 movups 48(%rsp), %xmm9 movups 64(%rsp), %xmm10 movups 80(%rsp), %xmm11 movups 96(%rsp), %xmm12 movups 112(%rsp), %xmm13 movups 128(%rsp), %xmm14 movups 144(%rsp), %xmm15 addq $STACKSIZE, %rsp #endif popq %r12 popq %r13 #ifdef WINDOWS_ABI popq %r14 popq %r15 #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_tcopy_2.S000066400000000000000000000142701313527062700200760ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if defined(NEHALEM) #define RPREFETCHSIZE 12 #define WPREFETCHSIZE (RPREFETCHSIZE * 2) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #if defined(SANDYBRIDGE) #define RPREFETCHSIZE 12 #define WPREFETCHSIZE (RPREFETCHSIZE * 2) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ #define N ARG2 /* rsi */ #define A ARG3 /* rdx */ #define LDA ARG4 /* rcx */ #define B ARG5 /* r8 */ #define I %r10 #define J %rbp #define AO1 %r9 #define AO2 %r15 #define AO3 %r11 #define AO4 %r14 #define BO1 %r13 #define M8 %rbx #define BO %rax #else #define STACKSIZE 256 #define M ARG1 /* rcx */ #define N ARG2 /* rdx */ #define A ARG3 /* r8 */ #define LDA ARG4 /* r9 */ #define OLD_B 40 + 64 + STACKSIZE(%rsp) #define B %rdi #define I %r10 #define J %r11 #define AO1 %r12 #define AO2 %r13 #define AO3 %r14 #define AO4 %r15 #define BO1 %rsi #define M8 %rbp #define BO %rax #endif PROLOGUE PROFCODE #ifdef WINDOWS_ABI pushq %rdi pushq %rsi #endif pushq %r15 pushq %r14 pushq %r13 pushq %r12 pushq %rbp pushq %rbx #ifdef WINDOWS_ABI subq $STACKSIZE, %rsp movups %xmm6, 0(%rsp) movups %xmm7, 16(%rsp) movups %xmm8, 32(%rsp) movups %xmm9, 48(%rsp) movups %xmm10, 64(%rsp) movups %xmm11, 80(%rsp) movups %xmm12, 96(%rsp) movups %xmm13, 112(%rsp) movups %xmm14, 128(%rsp) movups %xmm15, 144(%rsp) movq OLD_B, B #endif movq N, %rax andq $-2, %rax imulq M, %rax leaq (B, %rax, SIZE), BO1 leaq (, LDA, SIZE), LDA leaq (, M, SIZE), M8 movq M, J sarq $1, J jle .L20 ALIGN_4 .L11: movq A, AO1 leaq (A, LDA ), AO2 leaq (A, LDA, 2), A movq B, BO addq $4 * SIZE, B movq N, I sarq $1, I jle .L14 ALIGN_4 .L12: #ifndef DOUBLE movlps 0 * SIZE(AO1), %xmm0 movhps 0 * SIZE(AO2), %xmm0 movaps %xmm0, 0 * SIZE(BO) #else PREFETCH RPREFETCHSIZE * SIZE(AO1) movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 PREFETCH RPREFETCHSIZE * SIZE(AO2) movsd 0 * SIZE(AO2), %xmm1 movhpd 1 * SIZE(AO2), %xmm1 PREFETCHW WPREFETCHSIZE * SIZE(BO) movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) #endif leaq (BO, M8, 2), BO addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 decq I jg .L12 ALIGN_4 .L14: testq $1, N jle .L19 #ifndef DOUBLE movss 0 * SIZE(AO1), %xmm0 movss 0 * SIZE(AO2), %xmm1 movss %xmm0, 0 * SIZE(BO1) movss %xmm1, 1 * SIZE(BO1) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 0 * SIZE(AO2), %xmm0 movapd %xmm0, 0 * SIZE(BO1) #endif addq $2 * SIZE, BO1 ALIGN_4 .L19: decq J jg .L11 ALIGN_4 .L20: testq $1, M jle .L999 ALIGN_4 .L31: movq A, AO1 movq B, BO movq N, I sarq $1, I jle .L33 ALIGN_4 .L32: #ifndef DOUBLE movsd 0 * SIZE(AO1), %xmm0 movsd %xmm0, 0 * SIZE(BO) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movapd %xmm0, 0 * SIZE(BO) #endif addq $2 * SIZE, AO1 leaq (BO, M8, 2), BO decq I jg .L32 ALIGN_4 .L33: testq $1, N jle .L999 #ifndef DOUBLE movss 0 * SIZE(AO1), %xmm0 movss %xmm0, 0 * SIZE(BO1) #else movsd 0 * SIZE(AO1), %xmm0 movsd %xmm0, 0 * SIZE(BO1) #endif addq $1 * SIZE, BO1 ALIGN_4 .L999: #ifdef WINDOWS_ABI movups 0(%rsp), %xmm6 movups 16(%rsp), %xmm7 movups 32(%rsp), %xmm8 movups 48(%rsp), %xmm9 movups 64(%rsp), %xmm10 movups 80(%rsp), %xmm11 movups 96(%rsp), %xmm12 movups 112(%rsp), %xmm13 movups 128(%rsp), %xmm14 
movups 144(%rsp), %xmm15 addq $STACKSIZE, %rsp #endif popq %rbx popq %rbp popq %r12 popq %r13 popq %r14 popq %r15 #ifdef WINDOWS_ABI popq %rsi popq %rdi #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_tcopy_2_bulldozer.S000066400000000000000000000173401313527062700221610ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ #define N ARG2 /* rsi */ #define A ARG3 /* rdx */ #define LDA ARG4 /* rcx */ #define B ARG5 /* r8 */ #define I %r10 #define J %rbp #define AO1 %r9 #define AO2 %r15 #define AO3 %r11 #define AO4 %r14 #define BO1 %r13 #define M8 %rbx #define BO %rax #else #define STACKSIZE 256 #define M ARG1 /* rcx */ #define N ARG2 /* rdx */ #define A ARG3 /* r8 */ #define LDA ARG4 /* r9 */ #define OLD_B 40 + 64 + STACKSIZE(%rsp) #define B %rdi #define I %r10 #define J %r11 #define AO1 %r12 #define AO2 %r13 #define AO3 %r14 #define AO4 %r15 #define BO1 %rsi #define M8 %rbp #define BO %rax #endif PROLOGUE PROFCODE #ifdef WINDOWS_ABI pushq %rdi pushq %rsi #endif pushq %r15 pushq %r14 pushq %r13 pushq %r12 pushq %rbp pushq %rbx #ifdef WINDOWS_ABI subq $STACKSIZE, %rsp vmovups %xmm6, 0(%rsp) vmovups %xmm7, 16(%rsp) vmovups %xmm8, 32(%rsp) vmovups %xmm9, 48(%rsp) vmovups %xmm10, 64(%rsp) vmovups %xmm11, 80(%rsp) vmovups %xmm12, 96(%rsp) vmovups %xmm13, 112(%rsp) vmovups %xmm14, 128(%rsp) vmovups %xmm15, 144(%rsp) movq OLD_B, B #endif movq N, %rax andq $-2, %rax imulq M, %rax leaq (B, %rax, SIZE), BO1 leaq (, LDA, SIZE), LDA leaq (, M, SIZE), M8 movq M, J sarq $1, J jle .L20 ALIGN_4 .L01: movq A, AO1 leaq (A, LDA ), AO2 leaq (A, LDA, 2), A movq B, BO addq $4 * SIZE, B movq N, I sarq $3, I jle .L10 ALIGN_4 .L08: #ifndef DOUBLE vmovsd 0 * SIZE(AO1), %xmm0 vmovsd 2 * SIZE(AO1), %xmm2 vmovsd 4 * SIZE(AO1), %xmm4 vmovsd 6 * SIZE(AO1), %xmm6 vmovsd 0 * SIZE(AO2), %xmm1 vmovsd 2 * SIZE(AO2), %xmm3 vmovsd 4 * SIZE(AO2), %xmm5 vmovsd 6 * SIZE(AO2), %xmm7 vmovsd %xmm0, 0 * SIZE(BO) vmovsd %xmm1, 2 * SIZE(BO) leaq (BO, M8, 2), BO vmovsd %xmm2, 0 * SIZE(BO) vmovsd %xmm3, 2 * SIZE(BO) leaq (BO, M8, 2), BO vmovsd %xmm4, 0 * SIZE(BO) vmovsd %xmm5, 2 * SIZE(BO) leaq (BO, M8, 2), BO vmovsd %xmm6, 0 * SIZE(BO) vmovsd %xmm7, 2 * SIZE(BO) leaq (BO, M8, 2), BO #else prefetchnta 256(AO1) prefetchnta 256(AO2) vmovups 0 * SIZE(AO1), %xmm0 vmovups 2 * SIZE(AO1), %xmm2 vmovups 4 * SIZE(AO1), %xmm4 vmovups 6 * SIZE(AO1), %xmm6 vmovups 0 * SIZE(AO2), %xmm1 vmovups 2 * SIZE(AO2), %xmm3 vmovups 4 * SIZE(AO2), %xmm5 vmovups 6 * SIZE(AO2), %xmm7 vmovups %xmm0, 0 * SIZE(BO) vmovups %xmm1, 2 * SIZE(BO) leaq (BO, M8, 2), BO vmovups %xmm2, 0 * SIZE(BO) vmovups %xmm3, 2 * SIZE(BO) leaq (BO, M8, 2), BO vmovups %xmm4, 0 * SIZE(BO) vmovups %xmm5, 2 * SIZE(BO) leaq (BO, M8, 2), BO vmovups %xmm6, 0 * SIZE(BO) vmovups %xmm7, 2 * SIZE(BO) leaq (BO, M8, 2), BO #endif addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 decq I jg .L08 ALIGN_4 .L10: testq $4, N jle .L12 #ifndef DOUBLE vmovsd 0 * SIZE(AO1), %xmm0 vmovsd 2 * SIZE(AO1), %xmm2 vmovsd 0 * SIZE(AO2), %xmm1 vmovsd 2 * SIZE(AO2), %xmm3 vmovsd %xmm0, 0 * SIZE(BO) vmovsd %xmm1, 2 * SIZE(BO) leaq (BO, M8, 2), BO vmovsd %xmm2, 0 * SIZE(BO) vmovsd %xmm3, 2 * SIZE(BO) leaq (BO, M8, 2), BO #else vmovups 0 * SIZE(AO1), %xmm0 vmovups 2 * SIZE(AO1), %xmm2 vmovups 0 * SIZE(AO2), %xmm1 vmovups 2 * SIZE(AO2), %xmm3 vmovups %xmm0, 0 * SIZE(BO) vmovups %xmm1, 2 * SIZE(BO) leaq (BO, M8, 2), BO vmovups %xmm2, 0 * SIZE(BO) vmovups %xmm3, 2 * SIZE(BO) leaq (BO, M8, 2), BO #endif addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 ALIGN_4 .L12: testq $2, N jle .L14 #ifndef DOUBLE vmovsd 0 * SIZE(AO1), %xmm0 vmovsd 0 * SIZE(AO2), %xmm1 vmovsd %xmm0, 0 * SIZE(BO) vmovsd %xmm1, 2 * SIZE(BO) #else vmovups 0 * SIZE(AO1), %xmm0 vmovups 0 * SIZE(AO2), %xmm1 vmovups %xmm0, 0 * SIZE(BO) vmovups 
%xmm1, 2 * SIZE(BO) #endif leaq (BO, M8, 2), BO addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 ALIGN_4 .L14: testq $1, N jle .L19 #ifndef DOUBLE vmovss 0 * SIZE(AO1), %xmm0 vmovss 0 * SIZE(AO2), %xmm1 vmovss %xmm0, 0 * SIZE(BO1) vmovss %xmm1, 1 * SIZE(BO1) #else vmovsd 0 * SIZE(AO1), %xmm0 vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0 vmovups %xmm0, 0 * SIZE(BO1) #endif addq $2 * SIZE, BO1 ALIGN_4 .L19: decq J jg .L01 ALIGN_4 .L20: testq $1, M jle .L999 ALIGN_4 .L31: movq A, AO1 movq B, BO movq N, I sarq $1, I jle .L33 ALIGN_4 .L32: #ifndef DOUBLE vmovsd 0 * SIZE(AO1), %xmm0 vmovsd %xmm0, 0 * SIZE(BO) #else vmovups 0 * SIZE(AO1), %xmm0 vmovups %xmm0, 0 * SIZE(BO) #endif addq $2 * SIZE, AO1 leaq (BO, M8, 2), BO decq I jg .L32 ALIGN_4 .L33: testq $1, N jle .L999 #ifndef DOUBLE vmovss 0 * SIZE(AO1), %xmm0 vmovss %xmm0, 0 * SIZE(BO1) #else vmovsd 0 * SIZE(AO1), %xmm0 vmovsd %xmm0, 0 * SIZE(BO1) #endif addq $1 * SIZE, BO1 ALIGN_4 .L999: #ifdef WINDOWS_ABI vmovups 0(%rsp), %xmm6 vmovups 16(%rsp), %xmm7 vmovups 32(%rsp), %xmm8 vmovups 48(%rsp), %xmm9 vmovups 64(%rsp), %xmm10 vmovups 80(%rsp), %xmm11 vmovups 96(%rsp), %xmm12 vmovups 112(%rsp), %xmm13 vmovups 128(%rsp), %xmm14 vmovups 144(%rsp), %xmm15 addq $STACKSIZE, %rsp #endif popq %rbx popq %rbp popq %r12 popq %r13 popq %r14 popq %r15 #ifdef WINDOWS_ABI popq %rsi popq %rdi #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_tcopy_4.S000066400000000000000000000265531313527062700201070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if defined(PENTIUM4) || defined(GENERIC) #define RPREFETCHSIZE 16 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define RPREFETCHSIZE 12 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht2 #endif #ifdef ATOM #define RPREFETCHSIZE 16 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifdef NANO #define RPREFETCHSIZE 8 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifdef BARCELONA #define RPREFETCHSIZE 8 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetch #define PREFETCHW prefetchw #endif #ifdef GENERIC #define RPREFETCHSIZE 16 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ #define N ARG2 /* rsi */ #define A ARG3 /* rdx */ #define LDA ARG4 /* rcx */ #define B ARG5 /* r8 */ #define I %r10 #define J %rbp #define AO1 %r9 #define AO2 %r15 #define AO3 %r11 #define AO4 %r14 #define BO1 %r13 #define BO2 %r12 #define M8 %rbx #define BO %rax #else #define STACKSIZE 256 #define M ARG1 /* rcx */ #define N ARG2 /* rdx */ #define A ARG3 /* r8 */ #define LDA ARG4 /* r9 */ #define OLD_B 40 + 64 + STACKSIZE(%rsp) #define B %rdi #define I %r10 #define J %r11 #define AO1 %r12 #define AO2 %r13 #define AO3 %r14 #define AO4 %r15 #define BO1 %rsi #define BO2 %rbx #define M8 %rbp #define BO %rax #endif PROLOGUE PROFCODE #ifdef WINDOWS_ABI pushq %rdi pushq %rsi #endif pushq %r15 pushq %r14 pushq %r13 pushq %r12 pushq %rbp pushq %rbx #ifdef WINDOWS_ABI subq $STACKSIZE, %rsp movups %xmm6, 0(%rsp) movups %xmm7, 16(%rsp) movups %xmm8, 32(%rsp) movups %xmm9, 48(%rsp) movups %xmm10, 64(%rsp) movups %xmm11, 80(%rsp) movups %xmm12, 96(%rsp) movups %xmm13, 112(%rsp) movups %xmm14, 128(%rsp) movups %xmm15, 144(%rsp) movq OLD_B, B #endif movq N, %rax movq N, %rbx andq $-4, %rax andq $-2, %rbx imulq M, %rax imulq M, %rbx leaq (B, %rax, SIZE), BO1 leaq (B, %rbx, SIZE), BO2 leaq (, LDA, SIZE), LDA leaq (, M, SIZE), M8 movq M, J sarq $2, J jle .L20 ALIGN_4 .L11: movq A, AO1 leaq (A, LDA ), AO2 leaq (A, LDA, 2), AO3 leaq (AO2, LDA, 2), AO4 leaq (A, LDA, 4), A movq B, BO addq $16 * SIZE, B movq N, I sarq $2, I jle .L13 ALIGN_4 .L12: #ifndef DOUBLE movlps 0 * SIZE(AO1), %xmm0 movhps 2 * SIZE(AO1), %xmm0 movlps 0 * SIZE(AO2), %xmm1 movhps 2 * SIZE(AO2), %xmm1 movlps 0 * SIZE(AO3), %xmm2 movhps 2 * SIZE(AO3), %xmm2 movlps 0 * SIZE(AO4), %xmm3 movhps 2 * SIZE(AO4), %xmm3 #if defined(PENTIUM4) || defined(GENERIC) PREFETCH RPREFETCHSIZE * SIZE(AO1) PREFETCH RPREFETCHSIZE * SIZE(AO2) PREFETCH RPREFETCHSIZE * SIZE(AO3) PREFETCH RPREFETCHSIZE * SIZE(AO4) PREFETCHW WPREFETCHSIZE * SIZE(BO) #endif movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) #else PREFETCH RPREFETCHSIZE * SIZE(AO1) movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movsd 2 * SIZE(AO1), %xmm1 movhpd 3 * SIZE(AO1), %xmm1 PREFETCH RPREFETCHSIZE * SIZE(AO2) movsd 0 * SIZE(AO2), %xmm2 movhpd 1 * SIZE(AO2), %xmm2 movsd 2 * SIZE(AO2), %xmm3 movhpd 3 * SIZE(AO2), %xmm3 PREFETCH RPREFETCHSIZE * SIZE(AO3) movsd 0 * SIZE(AO3), %xmm4 movhpd 1 * SIZE(AO3), %xmm4 movsd 2 
* SIZE(AO3), %xmm5 movhpd 3 * SIZE(AO3), %xmm5 PREFETCH RPREFETCHSIZE * SIZE(AO4) movsd 0 * SIZE(AO4), %xmm6 movhpd 1 * SIZE(AO4), %xmm6 movsd 2 * SIZE(AO4), %xmm7 movhpd 3 * SIZE(AO4), %xmm7 PREFETCHW WPREFETCHSIZE * SIZE(BO) movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) movapd %xmm2, 4 * SIZE(BO) movapd %xmm3, 6 * SIZE(BO) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) PREFETCHW (WPREFETCHSIZE + 8) * SIZE(B) #endif movapd %xmm4, 8 * SIZE(BO) movapd %xmm5, 10 * SIZE(BO) movapd %xmm6, 12 * SIZE(BO) movapd %xmm7, 14 * SIZE(BO) #endif leaq (BO, M8, 4), BO addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 addq $4 * SIZE, AO3 addq $4 * SIZE, AO4 decq I jg .L12 ALIGN_4 .L13: testq $2, N jle .L14 #ifndef DOUBLE movlps 0 * SIZE(AO1), %xmm0 movhps 0 * SIZE(AO2), %xmm0 movlps 0 * SIZE(AO3), %xmm1 movhps 0 * SIZE(AO4), %xmm1 movaps %xmm0, 0 * SIZE(BO1) movaps %xmm1, 4 * SIZE(BO1) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO2), %xmm1 movhpd 1 * SIZE(AO2), %xmm1 movsd 0 * SIZE(AO3), %xmm2 movhpd 1 * SIZE(AO3), %xmm2 movsd 0 * SIZE(AO4), %xmm3 movhpd 1 * SIZE(AO4), %xmm3 movapd %xmm0, 0 * SIZE(BO1) movapd %xmm1, 2 * SIZE(BO1) movapd %xmm2, 4 * SIZE(BO1) movapd %xmm3, 6 * SIZE(BO1) #endif addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 addq $2 * SIZE, AO3 addq $2 * SIZE, AO4 addq $8 * SIZE, BO1 ALIGN_4 .L14: testq $1, N jle .L19 #ifndef DOUBLE movss 0 * SIZE(AO1), %xmm0 movss 0 * SIZE(AO2), %xmm1 movss 0 * SIZE(AO3), %xmm2 movss 0 * SIZE(AO4), %xmm3 movss %xmm0, 0 * SIZE(BO2) movss %xmm1, 1 * SIZE(BO2) movss %xmm2, 2 * SIZE(BO2) movss %xmm3, 3 * SIZE(BO2) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 0 * SIZE(AO2), %xmm0 movsd 0 * SIZE(AO3), %xmm1 movhpd 0 * SIZE(AO4), %xmm1 movapd %xmm0, 0 * SIZE(BO2) movapd %xmm1, 2 * SIZE(BO2) #endif addq $4 * SIZE, BO2 ALIGN_4 .L19: decq J jg .L11 ALIGN_4 .L20: testq $2, M jle .L30 ALIGN_4 .L21: movq A, AO1 leaq (A, LDA ), AO2 leaq (A, LDA, 2), A movq B, BO addq $8 * SIZE, B movq N, I sarq $2, I jle .L23 ALIGN_4 .L22: #ifndef DOUBLE movlps 0 * SIZE(AO1), %xmm0 movhps 2 * SIZE(AO1), %xmm0 movlps 0 * SIZE(AO2), %xmm1 movhps 2 * SIZE(AO2), %xmm1 #if defined(PENTIUM4) || defined(GENERIC) PREFETCH RPREFETCHSIZE * SIZE(AO1) PREFETCH RPREFETCHSIZE * SIZE(AO2) PREFETCHW WPREFETCHSIZE * SIZE(BO) #endif movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movsd 2 * SIZE(AO1), %xmm1 movhpd 3 * SIZE(AO1), %xmm1 movsd 0 * SIZE(AO2), %xmm2 movhpd 1 * SIZE(AO2), %xmm2 movsd 2 * SIZE(AO2), %xmm3 movhpd 3 * SIZE(AO2), %xmm3 #if defined(PENTIUM4) || defined(GENERIC) PREFETCH RPREFETCHSIZE * SIZE(AO1) PREFETCH RPREFETCHSIZE * SIZE(AO2) PREFETCHW WPREFETCHSIZE * SIZE(BO) #endif movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) movapd %xmm2, 4 * SIZE(BO) movapd %xmm3, 6 * SIZE(BO) #endif addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 leaq (BO, M8, 4), BO decq I jg .L22 ALIGN_4 .L23: testq $2, N jle .L24 #ifndef DOUBLE movlps 0 * SIZE(AO1), %xmm0 movhps 0 * SIZE(AO2), %xmm0 movaps %xmm0, 0 * SIZE(BO1) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO2), %xmm1 movhpd 1 * SIZE(AO2), %xmm1 movapd %xmm0, 0 * SIZE(BO1) movapd %xmm1, 2 * SIZE(BO1) #endif addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 addq $4 * SIZE, BO1 ALIGN_4 .L24: testq $1, N jle .L30 #ifndef DOUBLE movss 0 * SIZE(AO1), %xmm0 movss 0 * SIZE(AO2), %xmm1 movss %xmm0, 0 * SIZE(BO2) movss %xmm1, 1 * SIZE(BO2) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 0 * SIZE(AO2), %xmm0 movapd %xmm0, 0 * SIZE(BO2) #endif addq $2 * 
SIZE, BO2 ALIGN_4 .L30: testq $1, M jle .L999 ALIGN_4 .L31: movq A, AO1 movq B, BO movq N, I sarq $2, I jle .L33 ALIGN_4 .L32: #ifndef DOUBLE movlps 0 * SIZE(AO1), %xmm0 movhps 2 * SIZE(AO1), %xmm0 movaps %xmm0, 0 * SIZE(BO) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movsd 2 * SIZE(AO1), %xmm1 movhpd 3 * SIZE(AO1), %xmm1 movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) #endif addq $4 * SIZE, AO1 leaq (BO, M8, 4), BO decq I jg .L32 ALIGN_4 .L33: testq $2, N jle .L34 #ifndef DOUBLE movlps 0 * SIZE(AO1), %xmm0 movlps %xmm0, 0 * SIZE(BO1) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movapd %xmm0, 0 * SIZE(BO1) #endif addq $2 * SIZE, AO1 addq $2 * SIZE, BO1 ALIGN_4 .L34: testq $1, N jle .L999 #ifndef DOUBLE movss 0 * SIZE(AO1), %xmm0 movss %xmm0, 0 * SIZE(BO2) #else movsd 0 * SIZE(AO1), %xmm0 movsd %xmm0, 0 * SIZE(BO2) #endif addq $1 * SIZE, BO2 ALIGN_4 .L999: #ifdef WINDOWS_ABI movups 0(%rsp), %xmm6 movups 16(%rsp), %xmm7 movups 32(%rsp), %xmm8 movups 48(%rsp), %xmm9 movups 64(%rsp), %xmm10 movups 80(%rsp), %xmm11 movups 96(%rsp), %xmm12 movups 112(%rsp), %xmm13 movups 128(%rsp), %xmm14 movups 144(%rsp), %xmm15 addq $STACKSIZE, %rsp #endif popq %rbx popq %rbp popq %r12 popq %r13 popq %r14 popq %r15 #ifdef WINDOWS_ABI popq %rsi popq %rdi #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/gemm_tcopy_4_opteron.S000066400000000000000000000230751313527062700216510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCHSIZE (12 + 4) #define WPREFETCHSIZE (12 + 4) #define MOVNTQ MOVQ #else #define RPREFETCHSIZE (12 + 4) #define WPREFETCHSIZE (12 + 4) #define MOVNTQ MOVQ #endif #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ #define N ARG2 /* rsi */ #define A ARG3 /* rdx */ #define LDA ARG4 /* rcx */ #define B ARG5 /* r8 */ #define I %r10 #define J %rbp #define AO1 %r9 #define AO2 %r15 #define AO3 %r11 #define AO4 %r14 #define BO1 %r13 #define BO2 %r12 #define M8 %rbx #define BO %rax #else #define STACKSIZE 256 #define M ARG1 /* rcx */ #define N ARG2 /* rdx */ #define A ARG3 /* r8 */ #define LDA ARG4 /* r9 */ #define OLD_B 40 + 64 + STACKSIZE(%rsp) #define B %rdi #define I %r10 #define J %r11 #define AO1 %r12 #define AO2 %r13 #define AO3 %r14 #define AO4 %r15 #define BO1 %rsi #define BO2 %rbx #define M8 %rbp #define BO %rax #endif #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCH prefetch #else #define RPREFETCH prefetch #endif PROLOGUE PROFCODE #ifdef WINDOWS_ABI pushq %rdi pushq %rsi #endif pushq %r15 pushq %r14 pushq %r13 pushq %r12 pushq %rbp pushq %rbx #ifdef WINDOWS_ABI subq $STACKSIZE, %rsp movups %xmm6, 0(%rsp) movups %xmm7, 16(%rsp) movups %xmm8, 32(%rsp) movups %xmm9, 48(%rsp) movups %xmm10, 64(%rsp) movups %xmm11, 80(%rsp) movups %xmm12, 96(%rsp) movups %xmm13, 112(%rsp) movups %xmm14, 128(%rsp) movups %xmm15, 144(%rsp) movq OLD_B, B #endif movq N, %rax movq N, %rbx andq $-4, %rax andq $-2, %rbx imulq M, %rax imulq M, %rbx EMMS leaq (B, %rax, SIZE), BO1 leaq (B, %rbx, SIZE), BO2 leaq (, LDA, SIZE), LDA leaq (, M, SIZE), M8 movq M, J sarq $2, J jle .L20 ALIGN_4 .L11: #if 0 movq A, AO1 leaq (A, LDA, 1), AO2 leaq (A, LDA, 2), AO3 leaq (AO2, LDA, 2), AO4 movq N, I sarq $3, I jle .L13 ALIGN_4 .L12: MOVQ 0 * SIZE(AO1), %mm0 addq $8 * SIZE, AO1 MOVQ 0 * SIZE(AO2), %mm1 addq $8 * SIZE, AO2 MOVQ 0 * SIZE(AO3), %mm2 addq $8 * SIZE, AO3 MOVQ 0 * SIZE(AO4), %mm3 addq $8 * SIZE, AO4 decq I jg .L12 ALIGN_4 .L13: #endif movq A, AO1 leaq (A, LDA ), AO2 leaq (A, LDA, 2), AO3 leaq (AO2, LDA, 2), AO4 leaq (A, LDA, 4), A movq B, BO addq $16 * SIZE, B movq N, I sarq $2, I jle .L15 ALIGN_4 .L14: RPREFETCH (RPREFETCHSIZE) * SIZE(AO1) MOVQ 0 * SIZE(AO1), %mm0 MOVNTQ %mm0, 0 * SIZE(BO) MOVQ 1 * SIZE(AO1), %mm1 MOVNTQ %mm1, 1 * SIZE(BO) RPREFETCH (RPREFETCHSIZE) * SIZE(AO2) MOVQ 2 * SIZE(AO1), %mm2 MOVNTQ %mm2, 2 * SIZE(BO) MOVQ 3 * SIZE(AO1), %mm3 MOVNTQ %mm3, 3 * SIZE(BO) prefetchw (WPREFETCHSIZE + 0) * SIZE(B) MOVQ 0 * SIZE(AO2), %mm4 MOVNTQ %mm4, 4 * SIZE(BO) MOVQ 1 * SIZE(AO2), %mm5 MOVNTQ %mm5, 5 * SIZE(BO) MOVQ 2 * SIZE(AO2), %mm6 MOVNTQ %mm6, 6 * SIZE(BO) MOVQ 3 * SIZE(AO2), %mm7 MOVNTQ %mm7, 7 * SIZE(BO) RPREFETCH (RPREFETCHSIZE) * SIZE(AO3) MOVQ 0 * SIZE(AO3), %mm0 MOVNTQ %mm0, 8 * SIZE(BO) MOVQ 1 * SIZE(AO3), %mm1 MOVNTQ %mm1, 9 * SIZE(BO) RPREFETCH (RPREFETCHSIZE) * SIZE(AO4) MOVQ 2 * SIZE(AO3), %mm2 MOVNTQ %mm2, 10 * SIZE(BO) MOVQ 3 * SIZE(AO3), %mm3 MOVNTQ %mm3, 11 * SIZE(BO) prefetchw (WPREFETCHSIZE + 8) * SIZE(B) MOVQ 0 * SIZE(AO4), %mm4 MOVNTQ %mm4, 12 * SIZE(BO) MOVQ 1 * SIZE(AO4), %mm5 MOVNTQ %mm5, 13 * SIZE(BO) MOVQ 2 * SIZE(AO4), %mm6 MOVNTQ %mm6, 14 * SIZE(BO) MOVQ 3 * SIZE(AO4), %mm7 MOVNTQ %mm7, 15 * SIZE(BO) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 addq $4 * SIZE, AO3 addq $4 * SIZE, AO4 leaq (BO, M8, 4), BO decq I jg .L14 ALIGN_4 .L15: testq $2, N jle .L16 MOVQ 0 * 
SIZE(AO1), %mm0 MOVQ 1 * SIZE(AO1), %mm1 MOVQ 0 * SIZE(AO2), %mm2 MOVQ 1 * SIZE(AO2), %mm3 MOVQ 0 * SIZE(AO3), %mm4 MOVQ 1 * SIZE(AO3), %mm5 MOVQ 0 * SIZE(AO4), %mm6 MOVQ 1 * SIZE(AO4), %mm7 MOVNTQ %mm0, 0 * SIZE(BO1) MOVNTQ %mm1, 1 * SIZE(BO1) MOVNTQ %mm2, 2 * SIZE(BO1) MOVNTQ %mm3, 3 * SIZE(BO1) MOVNTQ %mm4, 4 * SIZE(BO1) MOVNTQ %mm5, 5 * SIZE(BO1) MOVNTQ %mm6, 6 * SIZE(BO1) MOVNTQ %mm7, 7 * SIZE(BO1) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 addq $2 * SIZE, AO3 addq $2 * SIZE, AO4 addq $8 * SIZE, BO1 ALIGN_4 .L16: testq $1, N jle .L19 MOVQ 0 * SIZE(AO1), %mm0 MOVQ 0 * SIZE(AO2), %mm1 MOVQ 0 * SIZE(AO3), %mm2 MOVQ 0 * SIZE(AO4), %mm3 MOVNTQ %mm0, 0 * SIZE(BO2) MOVNTQ %mm1, 1 * SIZE(BO2) MOVNTQ %mm2, 2 * SIZE(BO2) MOVNTQ %mm3, 3 * SIZE(BO2) addq $4 * SIZE, BO2 ALIGN_4 .L19: decq J jg .L11 ALIGN_4 .L20: testq $2, M jle .L30 ALIGN_4 .L21: movq A, AO1 leaq (A, LDA ), AO2 leaq (A, LDA, 2), A movq B, BO addq $8 * SIZE, B movq N, I sarq $2, I jle .L23 ALIGN_4 .L22: RPREFETCH (RPREFETCHSIZE) * SIZE(AO1) MOVQ 0 * SIZE(AO1), %mm0 MOVQ 1 * SIZE(AO1), %mm1 MOVQ 2 * SIZE(AO1), %mm2 MOVQ 3 * SIZE(AO1), %mm3 RPREFETCH (RPREFETCHSIZE) * SIZE(AO2) MOVQ 0 * SIZE(AO2), %mm4 MOVQ 1 * SIZE(AO2), %mm5 MOVQ 2 * SIZE(AO2), %mm6 MOVQ 3 * SIZE(AO2), %mm7 prefetchw (WPREFETCHSIZE + 0) * SIZE(B) MOVNTQ %mm0, 0 * SIZE(BO) MOVNTQ %mm1, 1 * SIZE(BO) MOVNTQ %mm2, 2 * SIZE(BO) MOVNTQ %mm3, 3 * SIZE(BO) MOVNTQ %mm4, 4 * SIZE(BO) MOVNTQ %mm5, 5 * SIZE(BO) MOVNTQ %mm6, 6 * SIZE(BO) MOVNTQ %mm7, 7 * SIZE(BO) addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 leaq (BO, M8, 4), BO decq I jg .L22 ALIGN_4 .L23: testq $2, N jle .L24 MOVQ 0 * SIZE(AO1), %mm0 MOVQ 1 * SIZE(AO1), %mm1 MOVQ 0 * SIZE(AO2), %mm2 MOVQ 1 * SIZE(AO2), %mm3 MOVNTQ %mm0, 0 * SIZE(BO1) MOVNTQ %mm1, 1 * SIZE(BO1) MOVNTQ %mm2, 2 * SIZE(BO1) MOVNTQ %mm3, 3 * SIZE(BO1) addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 addq $4 * SIZE, BO1 ALIGN_4 .L24: testq $1, N jle .L30 MOVQ 0 * SIZE(AO1), %mm0 MOVQ 0 * SIZE(AO2), %mm1 MOVNTQ %mm0, 0 * SIZE(BO2) MOVNTQ %mm1, 1 * SIZE(BO2) addq $2 * SIZE, BO2 ALIGN_4 .L30: testq $1, M jle .L999 ALIGN_4 .L31: movq A, AO1 movq B, BO movq N, I sarq $2, I jle .L33 ALIGN_4 .L32: MOVQ 0 * SIZE(AO1), %mm0 MOVQ 1 * SIZE(AO1), %mm1 MOVQ 2 * SIZE(AO1), %mm2 MOVQ 3 * SIZE(AO1), %mm3 MOVNTQ %mm0, 0 * SIZE(BO) MOVNTQ %mm1, 1 * SIZE(BO) MOVNTQ %mm2, 2 * SIZE(BO) MOVNTQ %mm3, 3 * SIZE(BO) addq $4 * SIZE, AO1 leaq (BO, M8, 4), BO decq I jg .L32 ALIGN_4 .L33: testq $2, N jle .L34 MOVQ 0 * SIZE(AO1), %mm0 MOVQ 1 * SIZE(AO1), %mm1 MOVNTQ %mm0, 0 * SIZE(BO1) MOVNTQ %mm1, 1 * SIZE(BO1) addq $2 * SIZE, AO1 addq $2 * SIZE, BO1 ALIGN_4 .L34: testq $1, N jle .L999 MOVQ 0 * SIZE(AO1), %mm0 MOVNTQ %mm0, 0 * SIZE(BO2) addq $1 * SIZE, BO2 ALIGN_4 .L999: EMMS #ifdef WINDOWS_ABI movups 0(%rsp), %xmm6 movups 16(%rsp), %xmm7 movups 32(%rsp), %xmm8 movups 48(%rsp), %xmm9 movups 64(%rsp), %xmm10 movups 80(%rsp), %xmm11 movups 96(%rsp), %xmm12 movups 112(%rsp), %xmm13 movups 128(%rsp), %xmm14 movups 144(%rsp), %xmm15 addq $STACKSIZE, %rsp #endif popq %rbx popq %rbp popq %r12 popq %r13 popq %r14 popq %r15 #ifdef WINDOWS_ABI popq %rsi popq %rdi #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/iamax.S000066400000000000000000000136341313527062700166140ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 #define X ARG2 #define INCX ARG3 #define RET %rax #define I ARG4 #define NUM %r10 #ifndef USE_MIN #define FMOV fcmovbe #define IMOV cmovnbe #else #define FMOV fcmovnbe #define IMOV cmovb #endif #include "l1param.h" PROLOGUE PROFCODE salq $BASE_SHIFT, INCX fldz xorq RET, RET testq M, M jle .L999 testq INCX, INCX jle .L999 ffreep %st movq $2, NUM movq $1, RET FLD (X) #ifdef USE_ABS fabs #endif addq INCX, X decq M jle .L999 cmpq $SIZE, INCX jne .L40 movq M, I sarq $3, I jle .L20 ALIGN_4 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM FLD 1 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM FLD 2 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM FLD 3 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM FLD 4 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM FLD 5 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM FLD 6 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM FLD 7 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM addq $8 * SIZE, X decq I jg .L10 ALIGN_4 .L20: movq M, I andq $7, I jle .L999 ALIGN_4 .L21: FLD 0 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st addq $1 * SIZE, X incq NUM decq I jg .L21 jmp .L999 ALIGN_4 .L40: movq M, I 
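/* Strided path (INCX != SIZE): .L50 below handles eight elements per   */
/* iteration. Each candidate is loaded onto the x87 stack, optionally   */
/* made absolute, and compared against the running extreme; IMOV        */
/* copies the element counter NUM into RET whenever the new value wins, */
/* so RET ends up holding the 1-based index of the selected element.    */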
sarq $3, I jle .L60 ALIGN_4 .L50: FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM decq I jg .L50 ALIGN_4 .L60: movq M, I andq $7, I jle .L999 ALIGN_4 .L61: FLD 0 * SIZE(X) #ifdef USE_ABS fabs #endif fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM addq INCX, X decq I jg .L61 ALIGN_4 .L999: ffreep %st ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/iamax_sse.S000066400000000000000000000401231313527062700174570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define RET %rax #define I ARG4 #define XX %r10 #define MM %r11 #ifdef USE_MIN #define maxps minps #define maxss minss #endif #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS pxor %xmm0, %xmm0 /* Return Value(Float) */ xor RET, RET /* Return Value(Int) */ testq M, M jle .L999 leaq (, INCX, SIZE), INCX testq INCX, INCX jle .L999 movq M, MM movq X, XX #ifdef USE_ABS pcmpeqb %xmm15, %xmm15 psrld $1, %xmm15 /* Generate USE_ABS */ #endif movss (X), %xmm0 addq INCX, X decq M shufps $0, %xmm0, %xmm0 #ifdef USE_ABS andps %xmm15, %xmm0 #endif movaps %xmm0, %xmm1 movaps %xmm0, %xmm2 movaps %xmm0, %xmm3 /* Generating "seed value" */ cmpq $SIZE, INCX jne .L80 /* Incx != 1 goto L80 */ /* Analigned Check */ testq $3, X /* 00000011 */ jne .L30 /* Purely Unaligned Mode */ cmpq $8, M jle .L30 /* if M <= 8 goto Unaligned mode */ testq $4, X /* bit test 000100 */ je .L05 movss 0 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxss %xmm4, %xmm0 decq M addq $SIZE, X ALIGN_3 .L05: testq $8, X je .L06 movsd 0 * SIZE(X), %xmm4 unpcklps %xmm4, %xmm4 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxps %xmm4, %xmm1 subq $2, M addq $2 * SIZE, X ALIGN_3 .L06: movq M, I sarq $4, I jle .L15 ALIGN_4 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps 0 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxps %xmm4, %xmm0 movaps 4 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxps %xmm5, %xmm1 movaps 8 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxps %xmm6, %xmm2 movaps 12 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxps %xmm7, %xmm3 addq $16 * SIZE, X decq I jg .L11 ALIGN_4 .L15: andq $15, M jle .L20 testq $8, M je .L16 movaps 0 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxps %xmm4, %xmm0 movaps 4 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxps %xmm5, %xmm1 addq $8 * SIZE, X ALIGN_3 .L16: testq $4, M je .L17 movaps 0 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxps %xmm6, %xmm2 addq $4 * SIZE, X ALIGN_3 .L17: testq $2, M je .L18 movsd 0 * SIZE(X), %xmm7 unpcklps %xmm7, %xmm7 #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxps %xmm7, %xmm3 addq $2 * SIZE, X .L18: testq $1, M je .L20 movss 0 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxss %xmm4, %xmm0 ALIGN_3 .L20: movq XX, X movq MM, M maxps %xmm1, %xmm0 maxps %xmm3, %xmm2 maxps %xmm2, %xmm0 movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 maxps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 maxss %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 testq $4, X je .L21 movss 0 * SIZE(X), %xmm1 decq M addq $SIZE, X #ifdef USE_ABS andps %xmm15, %xmm1 #endif incq RET comiss %xmm0, %xmm1 je .L999 ALIGN_3 .L21: testq $8, X je .L22 movss 0 * SIZE(X), %xmm1 movss 1 * SIZE(X), %xmm2 subq $2, M addq $2 * SIZE, X #ifdef USE_ABS andps %xmm15, %xmm1 andps %xmm15, %xmm2 #endif incq RET comiss %xmm0, %xmm1 je .L999 incq RET comiss %xmm0, %xmm2 je .L999 ALIGN_3 .L22: movq M, I sarq $3, I jle .L25 ALIGN_4 .L23: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps 0 * SIZE(X), %xmm1 #ifdef USE_ABS andps %xmm15, %xmm1 #endif cmpeqps %xmm0, %xmm1 movaps 4 * SIZE(X), %xmm3 #ifdef USE_ABS andps %xmm15, %xmm3 #endif cmpeqps %xmm0, %xmm3 orps %xmm3, %xmm1 #ifndef C_SUN movmskps %xmm1, %r11 #else .long 0xd9500f4c #endif testq $15, %r11 jne .L24 addq 
$8 * SIZE, X addq $8, RET decq I jg .L23 jmp .L25 ALIGN_3 .L24: movss 0 * SIZE(X), %xmm1 movss 1 * SIZE(X), %xmm2 movss 2 * SIZE(X), %xmm3 movss 3 * SIZE(X), %xmm4 movss 4 * SIZE(X), %xmm5 movss 5 * SIZE(X), %xmm6 movss 6 * SIZE(X), %xmm7 movss 7 * SIZE(X), %xmm8 #ifdef USE_ABS andps %xmm15, %xmm1 andps %xmm15, %xmm2 andps %xmm15, %xmm3 andps %xmm15, %xmm4 andps %xmm15, %xmm5 andps %xmm15, %xmm6 andps %xmm15, %xmm7 andps %xmm15, %xmm8 #endif addq $8 * SIZE, X incq RET comiss %xmm0, %xmm1 je .L999 incq RET comiss %xmm0, %xmm2 je .L999 incq RET comiss %xmm0, %xmm3 je .L999 incq RET comiss %xmm0, %xmm4 je .L999 incq RET comiss %xmm0, %xmm5 je .L999 incq RET comiss %xmm0, %xmm6 je .L999 incq RET comiss %xmm0, %xmm7 je .L999 incq RET jmp .L999 ALIGN_4 .L25: testq $4, M je .L26 movss 0 * SIZE(X), %xmm1 movss 1 * SIZE(X), %xmm2 movss 2 * SIZE(X), %xmm3 movss 3 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm1 andps %xmm15, %xmm2 andps %xmm15, %xmm3 andps %xmm15, %xmm4 #endif addq $4 * SIZE, X incq RET comiss %xmm0, %xmm1 je .L999 incq RET comiss %xmm0, %xmm2 je .L999 incq RET comiss %xmm0, %xmm3 je .L999 incq RET comiss %xmm0, %xmm4 je .L999 ALIGN_3 .L26: testq $2, M je .L27 movss 0 * SIZE(X), %xmm1 movss 1 * SIZE(X), %xmm2 #ifdef USE_ABS andps %xmm15, %xmm1 andps %xmm15, %xmm2 #endif addq $2 * SIZE, X incq RET comiss %xmm0, %xmm1 je .L999 incq RET comiss %xmm0, %xmm2 je .L999 ALIGN_3 .L27: incq RET jmp .L999 ALIGN_3 /* Unaligned Mode */ .L30: movq M, I sarq $4, I jle .L35 ALIGN_4 .L31: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(X), %xmm4 movhps 2 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxps %xmm4, %xmm0 movsd 4 * SIZE(X), %xmm5 movhps 6 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxps %xmm5, %xmm1 movsd 8 * SIZE(X), %xmm6 movhps 10 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxps %xmm6, %xmm2 movsd 12 * SIZE(X), %xmm7 movhps 14 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxps %xmm7, %xmm3 addq $16 * SIZE, X decq I jg .L31 ALIGN_4 .L35: andq $15, M jle .L40 testq $8, M je .L36 movsd 0 * SIZE(X), %xmm4 movhps 2 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxps %xmm4, %xmm0 movsd 4 * SIZE(X), %xmm5 movhps 6 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxps %xmm5, %xmm1 addq $8 * SIZE, X ALIGN_3 .L36: testq $4, M je .L37 movsd 0 * SIZE(X), %xmm6 movhps 2 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxps %xmm6, %xmm2 addq $4 * SIZE, X ALIGN_3 .L37: testq $2, M je .L38 movsd 0 * SIZE(X), %xmm7 unpcklps %xmm7, %xmm7 #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxps %xmm7, %xmm3 addq $2 * SIZE, X .L38: testq $1, M je .L40 movss 0 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxss %xmm4, %xmm0 jmp .L40 ALIGN_4 .L40: movq XX, X movq MM, M maxps %xmm1, %xmm0 maxps %xmm3, %xmm2 maxps %xmm2, %xmm0 movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 maxps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 maxss %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 movq M, I sarq $3, I jle .L45 ALIGN_4 .L43: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(X), %xmm1 movhps 2 * SIZE(X), %xmm1 #ifdef USE_ABS andps %xmm15, %xmm1 #endif cmpeqps %xmm0, %xmm1 movsd 4 * SIZE(X), %xmm3 movhps 6 * SIZE(X), %xmm3 #ifdef USE_ABS andps %xmm15, %xmm3 #endif cmpeqps %xmm0, %xmm3 orps %xmm3, %xmm1 #ifndef C_SUN movmskps %xmm1, %r11 #else .long 0xd9500f4c #endif testq $15, %r11 jne .L44 addq $8 * SIZE, X addq $8, RET decq I jg .L43 jmp .L45 ALIGN_3 .L44: 
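/* The packed compare in .L43 flagged a match somewhere in the last     */
/* group of eight values: reload them as scalars and bump RET one       */
/* element at a time until the entry equal to the extreme is found.     */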
movss 0 * SIZE(X), %xmm1 movss 1 * SIZE(X), %xmm2 movss 2 * SIZE(X), %xmm3 movss 3 * SIZE(X), %xmm4 movss 4 * SIZE(X), %xmm5 movss 5 * SIZE(X), %xmm6 movss 6 * SIZE(X), %xmm7 movss 7 * SIZE(X), %xmm8 #ifdef USE_ABS andps %xmm15, %xmm1 andps %xmm15, %xmm2 andps %xmm15, %xmm3 andps %xmm15, %xmm4 andps %xmm15, %xmm5 andps %xmm15, %xmm6 andps %xmm15, %xmm7 andps %xmm15, %xmm8 #endif addq $8 * SIZE, X incq RET comiss %xmm0, %xmm1 je .L999 incq RET comiss %xmm0, %xmm2 je .L999 incq RET comiss %xmm0, %xmm3 je .L999 incq RET comiss %xmm0, %xmm4 je .L999 incq RET comiss %xmm0, %xmm5 je .L999 incq RET comiss %xmm0, %xmm6 je .L999 incq RET comiss %xmm0, %xmm7 je .L999 incq RET jmp .L999 ALIGN_4 .L45: testq $4, M je .L46 movss 0 * SIZE(X), %xmm1 movss 1 * SIZE(X), %xmm2 movss 2 * SIZE(X), %xmm3 movss 3 * SIZE(X), %xmm4 #ifdef USE_ABS andps %xmm15, %xmm1 andps %xmm15, %xmm2 andps %xmm15, %xmm3 andps %xmm15, %xmm4 #endif addq $4 * SIZE, X incq RET comiss %xmm0, %xmm1 je .L999 incq RET comiss %xmm0, %xmm2 je .L999 incq RET comiss %xmm0, %xmm3 je .L999 incq RET comiss %xmm0, %xmm4 je .L999 ALIGN_3 .L46: testq $2, M je .L47 movss 0 * SIZE(X), %xmm1 movss 1 * SIZE(X), %xmm2 #ifdef USE_ABS andps %xmm15, %xmm1 andps %xmm15, %xmm2 #endif addq $2 * SIZE, X incq RET comiss %xmm0, %xmm1 je .L999 incq RET comiss %xmm0, %xmm2 je .L999 ALIGN_3 .L47: incq RET jmp .L999 ALIGN_3 .L80: movq M, I sarq $3, I jle .L85 ALIGN_4 .L81: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss 0 * SIZE(X), %xmm4 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxss %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxss %xmm5, %xmm1 movss 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxss %xmm6, %xmm2 movss 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxss %xmm7, %xmm3 movss 0 * SIZE(X), %xmm4 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxss %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxss %xmm5, %xmm1 movss 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxss %xmm6, %xmm2 movss 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxss %xmm7, %xmm3 decq I jg .L81 ALIGN_4 .L85: andq $7, M jle .L90 testq $4, M je .L86 movss 0 * SIZE(X), %xmm4 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxss %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxss %xmm5, %xmm1 movss 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxss %xmm6, %xmm2 movss 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif maxss %xmm7, %xmm3 ALIGN_3 .L86: testq $2, M je .L87 movss 0 * SIZE(X), %xmm4 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif maxss %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif maxss %xmm5, %xmm1 ALIGN_3 .L87: testq $1, M je .L90 movss 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif maxss %xmm6, %xmm2 ALIGN_4 .L90: movq XX, X movq MM, M maxss %xmm1, %xmm0 maxss %xmm3, %xmm2 maxss %xmm2, %xmm0 shufps $0, %xmm0, %xmm0 movq M, I sarq $3, I jle .L95 ALIGN_4 .L93: movss 0 * SIZE(X), %xmm1 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm1 #endif cmpeqss %xmm0, %xmm1 movss 0 * SIZE(X), %xmm2 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm2 #endif cmpeqss %xmm0, %xmm2 movss 0 * SIZE(X), %xmm3 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm3 #endif 
cmpeqss %xmm0, %xmm3 movss 0 * SIZE(X), %xmm4 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif cmpeqss %xmm0, %xmm4 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif cmpeqss %xmm0, %xmm5 movss 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif cmpeqss %xmm0, %xmm6 movss 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif cmpeqss %xmm0, %xmm7 movss 0 * SIZE(X), %xmm8 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm8 #endif cmpeqss %xmm0, %xmm8 orps %xmm2, %xmm1 orps %xmm4, %xmm3 orps %xmm6, %xmm5 orps %xmm8, %xmm7 orps %xmm3, %xmm1 orps %xmm7, %xmm5 orps %xmm5, %xmm1 #ifndef C_SUN movmskps %xmm1, %r11 #else .long 0xd9500f4c #endif testq $15, %r11 jne .L94 addq $8, RET decq I jg .L93 jmp .L95 ALIGN_3 .L94: subq INCX, X movss 0 * SIZE(X), %xmm8 subq INCX, X movss 0 * SIZE(X), %xmm7 subq INCX, X movss 0 * SIZE(X), %xmm6 subq INCX, X movss 0 * SIZE(X), %xmm5 subq INCX, X movss 0 * SIZE(X), %xmm4 subq INCX, X movss 0 * SIZE(X), %xmm3 subq INCX, X movss 0 * SIZE(X), %xmm2 subq INCX, X movss 0 * SIZE(X), %xmm1 #ifdef USE_ABS andps %xmm15, %xmm1 andps %xmm15, %xmm2 andps %xmm15, %xmm3 andps %xmm15, %xmm4 andps %xmm15, %xmm5 andps %xmm15, %xmm6 andps %xmm15, %xmm7 andps %xmm15, %xmm8 #endif incq RET comiss %xmm0, %xmm1 je .L999 incq RET comiss %xmm0, %xmm2 je .L999 incq RET comiss %xmm0, %xmm3 je .L999 incq RET comiss %xmm0, %xmm4 je .L999 incq RET comiss %xmm0, %xmm5 je .L999 incq RET comiss %xmm0, %xmm6 je .L999 incq RET comiss %xmm0, %xmm7 je .L999 incq RET jmp .L999 ALIGN_4 .L95: testq $4, M je .L96 movss 0 * SIZE(X), %xmm1 addq INCX, X movss 0 * SIZE(X), %xmm2 addq INCX, X movss 0 * SIZE(X), %xmm3 addq INCX, X movss 0 * SIZE(X), %xmm4 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm1 andps %xmm15, %xmm2 andps %xmm15, %xmm3 andps %xmm15, %xmm4 #endif incq RET comiss %xmm0, %xmm1 je .L999 incq RET comiss %xmm0, %xmm2 je .L999 incq RET comiss %xmm0, %xmm3 je .L999 incq RET comiss %xmm0, %xmm4 je .L999 ALIGN_3 .L96: testq $2, M je .L97 movss 0 * SIZE(X), %xmm1 addq INCX, X movss 0 * SIZE(X), %xmm2 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm1 andps %xmm15, %xmm2 #endif incq RET comiss %xmm0, %xmm1 je .L999 incq RET comiss %xmm0, %xmm2 je .L999 ALIGN_3 .L97: incq RET ALIGN_3 .L999: RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/iamax_sse2.S000066400000000000000000000442541313527062700175500ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define RET %rax #define I ARG4 #define XX %r10 #define MM %r11 #ifdef USE_MIN #define maxpd minpd #define maxsd minsd #endif #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS pxor %xmm0, %xmm0 xor RET, RET testq M, M jle .L999 leaq (, INCX, SIZE), INCX testq INCX, INCX jle .L999 movq M, MM movq X, XX #ifdef USE_ABS pcmpeqb %xmm15, %xmm15 psrlq $1, %xmm15 #endif movsd (X), %xmm0 addq INCX, X decq M #ifdef USE_ABS andpd %xmm15, %xmm0 #endif unpcklpd %xmm0, %xmm0 movapd %xmm0, %xmm1 movapd %xmm0, %xmm2 movapd %xmm0, %xmm3 cmpq $SIZE, INCX jne .L80 /* Alignment Check */ cmpq $7, M jle .L50 testq $7, X jne .L50 # Purely Unaligned Mode testq $15, X # Checking for 128bit align je .L05 movsd 0 * SIZE(X), %xmm4 #ifdef USE_ABS andpd %xmm15, %xmm4 #endif unpcklpd %xmm4, %xmm4 maxpd %xmm4, %xmm3 decq M addq $SIZE, X ALIGN_3 .L05: movq M, I sarq $4, I jle .L15 ALIGN_4 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movapd 0 * SIZE(X), %xmm4 #ifdef USE_ABS andpd %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 movapd 2 * SIZE(X), %xmm5 #ifdef USE_ABS andpd %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 movapd 4 * SIZE(X), %xmm6 #ifdef USE_ABS andpd %xmm15, %xmm6 #endif maxpd %xmm6, %xmm2 movapd 6 * SIZE(X), %xmm7 #ifdef USE_ABS andpd %xmm15, %xmm7 #endif maxpd %xmm7, %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movapd 8 * SIZE(X), %xmm4 #ifdef USE_ABS andpd %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 movapd 10 * SIZE(X), %xmm5 #ifdef USE_ABS andpd %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 movapd 12 * SIZE(X), %xmm6 #ifdef USE_ABS andpd %xmm15, %xmm6 #endif maxpd %xmm6, %xmm2 movapd 14 * SIZE(X), %xmm7 #ifdef USE_ABS andpd %xmm15, %xmm7 #endif maxpd %xmm7, %xmm3 addq $16 * SIZE, X decq I jg .L11 ALIGN_4 .L15: andq $15, M jle .L20 testq $8, M je .L16 movapd 0 * SIZE(X), %xmm4 #ifdef USE_ABS andpd %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 movapd 2 * SIZE(X), %xmm5 #ifdef USE_ABS andpd %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 movapd 4 * SIZE(X), %xmm6 #ifdef USE_ABS andpd %xmm15, %xmm6 #endif maxpd %xmm6, %xmm2 movapd 6 * SIZE(X), %xmm7 #ifdef USE_ABS andpd %xmm15, %xmm7 #endif maxpd %xmm7, %xmm3 addq $8 * SIZE, X ALIGN_3 .L16: testq $4, M je .L17 movapd 0 * SIZE(X), %xmm4 #ifdef USE_ABS andpd %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 movapd 2 * SIZE(X), %xmm5 #ifdef USE_ABS andpd %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 addq $4 * SIZE, X ALIGN_3 .L17: testq $2, M je .L18 movapd 0 * SIZE(X), %xmm6 #ifdef USE_ABS andpd %xmm15, %xmm6 #endif 
maxpd %xmm6, %xmm2 addq $2 * SIZE, X .L18: testq $1, M je .L20 movsd 0 * SIZE(X), %xmm7 #ifdef USE_ABS andpd %xmm15, %xmm7 #endif unpcklpd %xmm7, %xmm7 maxpd %xmm7, %xmm3 ALIGN_3 /* Finding Index */ .L20: movq XX, X movq MM, M maxpd %xmm1, %xmm0 maxpd %xmm3, %xmm2 maxpd %xmm2, %xmm0 movapd %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 maxsd %xmm1, %xmm0 unpcklpd %xmm0, %xmm0 ALIGN_3 testq $15, X # Checking for 128bit align je .L21 movsd 0 * SIZE(X), %xmm1 #ifdef USE_ABS andpd %xmm15, %xmm1 #endif incq RET comisd %xmm0, %xmm1 je .L999 addq $SIZE, X decq M ALIGN_3 .L21: movq M, I sarq $3, I jle .L25 ALIGN_4 .L22: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movapd 0 * SIZE(X), %xmm1 #ifdef USE_ABS andpd %xmm15, %xmm1 #endif cmpeqpd %xmm0, %xmm1 movapd 2 * SIZE(X), %xmm3 #ifdef USE_ABS andpd %xmm15, %xmm3 #endif cmpeqpd %xmm0, %xmm3 movapd 4 * SIZE(X), %xmm5 #ifdef USE_ABS andpd %xmm15, %xmm5 #endif cmpeqpd %xmm0, %xmm5 movapd 6 * SIZE(X), %xmm7 #ifdef USE_ABS andpd %xmm15, %xmm7 #endif cmpeqpd %xmm0, %xmm7 orpd %xmm3, %xmm1 orpd %xmm7, %xmm5 orpd %xmm5, %xmm1 #ifndef C_SUN movmskpd %xmm1, %r11 #else .byte 0x66 .long 0xd9500f4c #endif testq $3, %r11 jne .L23 addq $8 * SIZE, X addq $8, RET decq I jg .L22 jmp .L25 ALIGN_4 .L23: movsd 0 * SIZE(X), %xmm1 movsd 1 * SIZE(X), %xmm2 movsd 2 * SIZE(X), %xmm3 movsd 3 * SIZE(X), %xmm4 movsd 4 * SIZE(X), %xmm5 movsd 5 * SIZE(X), %xmm6 movsd 6 * SIZE(X), %xmm7 movsd 7 * SIZE(X), %xmm8 #ifdef USE_ABS andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 andpd %xmm15, %xmm3 andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 andpd %xmm15, %xmm6 andpd %xmm15, %xmm7 andpd %xmm15, %xmm8 #endif addq $8 * SIZE, X incq RET comisd %xmm0, %xmm1 je .L999 incq RET comisd %xmm0, %xmm2 je .L999 incq RET comisd %xmm0, %xmm3 je .L999 incq RET comisd %xmm0, %xmm4 je .L999 incq RET comisd %xmm0, %xmm5 je .L999 incq RET comisd %xmm0, %xmm6 je .L999 incq RET comisd %xmm0, %xmm7 je .L999 incq RET jmp .L999 ALIGN_3 .L25: testq $4, M je .L27 movsd 0 * SIZE(X), %xmm1 movsd 1 * SIZE(X), %xmm2 movsd 2 * SIZE(X), %xmm3 movsd 3 * SIZE(X), %xmm4 #ifdef USE_ABS andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 andpd %xmm15, %xmm3 andpd %xmm15, %xmm4 #endif addq $4 * SIZE, X incq RET comisd %xmm0, %xmm1 je .L999 incq RET comisd %xmm0, %xmm2 je .L999 incq RET comisd %xmm0, %xmm3 je .L999 incq RET comisd %xmm0, %xmm4 je .L999 ALIGN_3 .L27: testq $2, M je .L28 movsd 0 * SIZE(X), %xmm1 movsd 1 * SIZE(X), %xmm2 #ifdef USE_ABS andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 #endif addq $2 * SIZE, X incq RET comisd %xmm0, %xmm1 je .L999 incq RET comisd %xmm0, %xmm2 je .L999 ALIGN_3 .L28: incq RET jmp .L999 ALIGN_3 /* Unaligned Mode */ .L50: movq M, I sarq $4, I jle .L55 ALIGN_4 .L51: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(X), %xmm4 movhpd 1 * SIZE(X), %xmm4 #ifdef USE_ABS andpd %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 movsd 2 * SIZE(X), %xmm5 movhpd 3 * SIZE(X), %xmm5 #ifdef USE_ABS andpd %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 movsd 4 * SIZE(X), %xmm6 movhpd 5 * SIZE(X), %xmm6 #ifdef USE_ABS andpd %xmm15, %xmm6 #endif maxpd %xmm6, %xmm2 movsd 6 * SIZE(X), %xmm7 movhpd 7 * SIZE(X), %xmm7 #ifdef USE_ABS andpd %xmm15, %xmm7 #endif maxpd %xmm7, %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movsd 8 * SIZE(X), %xmm4 movhpd 9 * SIZE(X), %xmm4 #ifdef USE_ABS andpd %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 movsd 10 * SIZE(X), %xmm5 movhpd 11 * SIZE(X), %xmm5 #ifdef USE_ABS andpd %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 movsd 12 * SIZE(X), 
%xmm6 movhpd 13 * SIZE(X), %xmm6 #ifdef USE_ABS andpd %xmm15, %xmm6 #endif maxpd %xmm6, %xmm2 movsd 14 * SIZE(X), %xmm7 movhpd 15 * SIZE(X), %xmm7 #ifdef USE_ABS andpd %xmm15, %xmm7 #endif maxpd %xmm7, %xmm3 addq $16 * SIZE, X decq I jg .L51 ALIGN_4 .L55: andq $15, M jle .L60 testq $8, M je .L56 movsd 0 * SIZE(X), %xmm4 movhpd 1 * SIZE(X), %xmm4 #ifdef USE_ABS andpd %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 movsd 2 * SIZE(X), %xmm5 movhpd 3 * SIZE(X), %xmm5 #ifdef USE_ABS andpd %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 movsd 4 * SIZE(X), %xmm6 movhpd 5 * SIZE(X), %xmm6 #ifdef USE_ABS andpd %xmm15, %xmm6 #endif maxpd %xmm6, %xmm2 movsd 6 * SIZE(X), %xmm7 movhpd 7 * SIZE(X), %xmm7 #ifdef USE_ABS andpd %xmm15, %xmm7 #endif maxpd %xmm7, %xmm3 addq $8 * SIZE, X ALIGN_3 .L56: testq $4, M je .L57 movsd 0 * SIZE(X), %xmm4 movhpd 1 * SIZE(X), %xmm4 #ifdef USE_ABS andpd %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 movsd 2 * SIZE(X), %xmm5 movhpd 3 * SIZE(X), %xmm5 #ifdef USE_ABS andpd %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 addq $4 * SIZE, X ALIGN_3 .L57: testq $2, M je .L58 movsd 0 * SIZE(X), %xmm6 movhpd 1 * SIZE(X), %xmm6 #ifdef USE_ABS andpd %xmm15, %xmm6 #endif maxpd %xmm6, %xmm2 addq $2 * SIZE, X .L58: testq $1, M je .L60 movsd 0 * SIZE(X), %xmm7 unpcklpd %xmm7, %xmm7 #ifdef USE_ABS andpd %xmm15, %xmm7 #endif maxpd %xmm7, %xmm3 ALIGN_3 .L60: movq XX, X movq MM, M maxpd %xmm1, %xmm0 maxpd %xmm3, %xmm2 maxpd %xmm2, %xmm0 movapd %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 maxsd %xmm1, %xmm0 unpcklpd %xmm0, %xmm0 movq M, I sarq $3, I jle .L65 ALIGN_4 .L62: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(X), %xmm1 movhpd 1 * SIZE(X), %xmm1 #ifdef USE_ABS andpd %xmm15, %xmm1 #endif cmpeqpd %xmm0, %xmm1 movsd 2 * SIZE(X), %xmm3 movhpd 3 * SIZE(X), %xmm3 #ifdef USE_ABS andpd %xmm15, %xmm3 #endif cmpeqpd %xmm0, %xmm3 movsd 4 * SIZE(X), %xmm5 movhpd 5 * SIZE(X), %xmm5 #ifdef USE_ABS andpd %xmm15, %xmm5 #endif cmpeqpd %xmm0, %xmm5 movsd 6 * SIZE(X), %xmm7 movhpd 7 * SIZE(X), %xmm7 #ifdef USE_ABS andpd %xmm15, %xmm7 #endif cmpeqpd %xmm0, %xmm7 orpd %xmm3, %xmm1 orpd %xmm7, %xmm5 orpd %xmm5, %xmm1 #ifndef C_SUN movmskpd %xmm1, %r11 #else .byte 0x66 .long 0xd9500f4c #endif testq $3, %r11 jne .L63 addq $8 * SIZE, X addq $8, RET decq I jg .L62 jmp .L65 ALIGN_4 .L63: movsd 0 * SIZE(X), %xmm1 movsd 1 * SIZE(X), %xmm2 movsd 2 * SIZE(X), %xmm3 movsd 3 * SIZE(X), %xmm4 movsd 4 * SIZE(X), %xmm5 movsd 5 * SIZE(X), %xmm6 movsd 6 * SIZE(X), %xmm7 movsd 7 * SIZE(X), %xmm8 #ifdef USE_ABS andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 andpd %xmm15, %xmm3 andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 andpd %xmm15, %xmm6 andpd %xmm15, %xmm7 andpd %xmm15, %xmm8 #endif addq $8 * SIZE, X incq RET comisd %xmm0, %xmm1 je .L999 incq RET comisd %xmm0, %xmm2 je .L999 incq RET comisd %xmm0, %xmm3 je .L999 incq RET comisd %xmm0, %xmm4 je .L999 incq RET comisd %xmm0, %xmm5 je .L999 incq RET comisd %xmm0, %xmm6 je .L999 incq RET comisd %xmm0, %xmm7 je .L999 incq RET jmp .L999 ALIGN_3 .L65: testq $4, M je .L67 movsd 0 * SIZE(X), %xmm1 movsd 1 * SIZE(X), %xmm2 movsd 2 * SIZE(X), %xmm3 movsd 3 * SIZE(X), %xmm4 #ifdef USE_ABS andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 andpd %xmm15, %xmm3 andpd %xmm15, %xmm4 #endif addq $4 * SIZE, X incq RET comisd %xmm0, %xmm1 je .L999 incq RET comisd %xmm0, %xmm2 je .L999 incq RET comisd %xmm0, %xmm3 je .L999 incq RET comisd %xmm0, %xmm4 je .L999 ALIGN_3 .L67: testq $2, M je .L68 movsd 0 * SIZE(X), %xmm1 movsd 1 * SIZE(X), %xmm2 #ifdef USE_ABS andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 #endif addq $2 * SIZE, 
X incq RET comisd %xmm0, %xmm1 je .L999 incq RET comisd %xmm0, %xmm2 je .L999 ALIGN_3 .L68: incq RET jmp .L999 ALIGN_4 .L80: movq M, I sarq $4, I jle .L85 ALIGN_4 .L81: movsd 0 * SIZE(X), %xmm4 addq INCX, X movhpd 0 * SIZE(X), %xmm4 addq INCX, X #ifdef USE_ABS andpd %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 movsd 0 * SIZE(X), %xmm5 addq INCX, X movhpd 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andpd %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 movsd 0 * SIZE(X), %xmm6 addq INCX, X movhpd 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andpd %xmm15, %xmm6 #endif maxpd %xmm6, %xmm2 movsd 0 * SIZE(X), %xmm7 addq INCX, X movhpd 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andpd %xmm15, %xmm7 #endif maxpd %xmm7, %xmm3 movsd 0 * SIZE(X), %xmm4 addq INCX, X movhpd 0 * SIZE(X), %xmm4 addq INCX, X #ifdef USE_ABS andpd %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 movsd 0 * SIZE(X), %xmm5 addq INCX, X movhpd 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andpd %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 movsd 0 * SIZE(X), %xmm6 addq INCX, X movhpd 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andpd %xmm15, %xmm6 #endif maxpd %xmm6, %xmm2 movsd 0 * SIZE(X), %xmm7 addq INCX, X movhpd 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andpd %xmm15, %xmm7 #endif maxpd %xmm7, %xmm3 decq I jg .L81 ALIGN_4 .L85: andq $15, M jle .L90 testq $8, M je .L86 movsd 0 * SIZE(X), %xmm4 addq INCX, X movhpd 0 * SIZE(X), %xmm4 addq INCX, X #ifdef USE_ABS andpd %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 movsd 0 * SIZE(X), %xmm5 addq INCX, X movhpd 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andpd %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 movsd 0 * SIZE(X), %xmm6 addq INCX, X movhpd 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andpd %xmm15, %xmm6 #endif maxpd %xmm6, %xmm2 movsd 0 * SIZE(X), %xmm7 addq INCX, X movhpd 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andpd %xmm15, %xmm7 #endif maxpd %xmm7, %xmm3 ALIGN_3 .L86: testq $4, M je .L87 movsd 0 * SIZE(X), %xmm4 addq INCX, X movhpd 0 * SIZE(X), %xmm4 addq INCX, X #ifdef USE_ABS andpd %xmm15, %xmm4 #endif maxpd %xmm4, %xmm0 movsd 0 * SIZE(X), %xmm5 addq INCX, X movhpd 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andpd %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 ALIGN_3 .L87: testq $2, M je .L88 movsd 0 * SIZE(X), %xmm6 addq INCX, X movhpd 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andpd %xmm15, %xmm6 #endif maxpd %xmm6, %xmm2 ALIGN_3 .L88: testq $1, M je .L90 movsd 0 * SIZE(X), %xmm7 unpcklpd %xmm7, %xmm7 #ifdef USE_ABS andpd %xmm15, %xmm7 #endif maxpd %xmm7, %xmm3 maxpd %xmm1, %xmm0 maxpd %xmm3, %xmm2 maxpd %xmm2, %xmm0 movapd %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 maxsd %xmm1, %xmm0 ALIGN_4 .L90: movq XX, X movq MM, M maxpd %xmm1, %xmm0 maxpd %xmm3, %xmm2 maxpd %xmm2, %xmm0 movapd %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 maxsd %xmm1, %xmm0 unpcklpd %xmm0, %xmm0 movq M, I sarq $3, I jle .L95 ALIGN_4 .L92: movsd 0 * SIZE(X), %xmm1 addq INCX, X movhpd 0 * SIZE(X), %xmm1 addq INCX, X #ifdef USE_ABS andpd %xmm15, %xmm1 #endif cmpeqpd %xmm0, %xmm1 movsd 0 * SIZE(X), %xmm3 addq INCX, X movhpd 0 * SIZE(X), %xmm3 addq INCX, X #ifdef USE_ABS andpd %xmm15, %xmm3 #endif cmpeqpd %xmm0, %xmm3 movsd 0 * SIZE(X), %xmm5 addq INCX, X movhpd 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andpd %xmm15, %xmm5 #endif cmpeqpd %xmm0, %xmm5 movsd 0 * SIZE(X), %xmm7 addq INCX, X movhpd 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andpd %xmm15, %xmm7 #endif cmpeqpd %xmm0, %xmm7 orpd %xmm3, %xmm1 orpd %xmm7, %xmm5 orpd %xmm5, %xmm1 #ifndef C_SUN movmskpd %xmm1, %r11 #else .byte 0x66 .long 0xd9500f4c #endif testq 
$3, %r11 jne .L93 addq $8, RET decq I jg .L92 jmp .L95 ALIGN_4 .L93: subq INCX, X movsd 0 * SIZE(X), %xmm8 subq INCX, X movsd 0 * SIZE(X), %xmm7 subq INCX, X movsd 0 * SIZE(X), %xmm6 subq INCX, X movsd 0 * SIZE(X), %xmm5 subq INCX, X movsd 0 * SIZE(X), %xmm4 subq INCX, X movsd 0 * SIZE(X), %xmm3 subq INCX, X movsd 0 * SIZE(X), %xmm2 subq INCX, X movsd 0 * SIZE(X), %xmm1 #ifdef USE_ABS andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 andpd %xmm15, %xmm3 andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 andpd %xmm15, %xmm6 andpd %xmm15, %xmm7 andpd %xmm15, %xmm8 #endif addq $8 * SIZE, X incq RET comisd %xmm0, %xmm1 je .L999 incq RET comisd %xmm0, %xmm2 je .L999 incq RET comisd %xmm0, %xmm3 je .L999 incq RET comisd %xmm0, %xmm4 je .L999 incq RET comisd %xmm0, %xmm5 je .L999 incq RET comisd %xmm0, %xmm6 je .L999 incq RET comisd %xmm0, %xmm7 je .L999 incq RET jmp .L999 ALIGN_3 .L95: testq $4, M je .L97 movsd 0 * SIZE(X), %xmm1 addq INCX, X movsd 0 * SIZE(X), %xmm2 addq INCX, X movsd 0 * SIZE(X), %xmm3 addq INCX, X movsd 0 * SIZE(X), %xmm4 addq INCX, X #ifdef USE_ABS andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 andpd %xmm15, %xmm3 andpd %xmm15, %xmm4 #endif incq RET comisd %xmm0, %xmm1 je .L999 incq RET comisd %xmm0, %xmm2 je .L999 incq RET comisd %xmm0, %xmm3 je .L999 incq RET comisd %xmm0, %xmm4 je .L999 ALIGN_3 .L97: testq $2, M je .L98 movsd 0 * SIZE(X), %xmm1 addq INCX, X movsd 0 * SIZE(X), %xmm2 addq INCX, X #ifdef USE_ABS andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 #endif incq RET comisd %xmm0, %xmm1 je .L999 incq RET comisd %xmm0, %xmm2 je .L999 ALIGN_3 .L98: incq RET ALIGN_3 .L999: RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/izamax.S000066400000000000000000000121511313527062700167770ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 #define X ARG2 #define INCX ARG3 #define I ARG4 #define NUM %r10 #define RET %rax #ifndef USE_MIN #define FMOV fcmovbe #define IMOV cmovnbe #else #define FMOV fcmovnb #define IMOV cmovb #endif #include "l1param.h" PROLOGUE PROFCODE salq $ZBASE_SHIFT, INCX fldz xorq RET, RET testq M, M jle .L999 testq INCX, INCX jle .L999 ffreep %st movq $2, NUM movq $1, RET FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs faddp %st, %st(1) addq INCX, X decq M jle .L999 cmpq $2 * SIZE, INCX jne .L40 movq M, I sarq $2, I jle .L20 ALIGN_4 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM FLD 2 * SIZE(X) fabs FLD 3 * SIZE(X) fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM FLD 4 * SIZE(X) fabs FLD 5 * SIZE(X) fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM FLD 6 * SIZE(X) fabs FLD 7 * SIZE(X) fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM addq $8 * SIZE, X decq I jg .L10 ALIGN_4 .L20: movq M, I andq $3, I jle .L999 ALIGN_4 .L21: FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM addq $2 * SIZE, X decq I jg .L21 jmp .L999 ALIGN_4 .L40: movq M, I sarq $2, I jle .L60 ALIGN_4 .L50: FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs addq INCX, X faddp %st, %st(1) fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs addq INCX, X faddp %st, %st(1) fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs addq INCX, X faddp %st, %st(1) fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs addq INCX, X faddp %st, %st(1) fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM decq I jg .L50 ALIGN_4 .L60: movq M, I andq $3, I jle .L999 ALIGN_4 .L61: FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) IMOV NUM, RET fxch %st(1) ffreep %st incq NUM addq INCX, X decq I jg .L61 ALIGN_4 .L999: ffreep %st ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/izamax_sse.S000066400000000000000000000243021313527062700176520ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define RET %rax #define I ARG4 #define XX %r10 #define MM %r11 #ifdef USE_MIN #define maxps minps #define maxss minss #endif #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS pxor %xmm0, %xmm0 xor RET, RET testq M, M jle .L999 testq INCX, INCX jle .L999 salq $ZBASE_SHIFT, INCX movq M, MM movq X, XX pcmpeqb %xmm15, %xmm15 psrld $1, %xmm15 movss 0 * SIZE(X), %xmm0 movss 1 * SIZE(X), %xmm1 addq INCX, X decq M andps %xmm15, %xmm0 andps %xmm15, %xmm1 addps %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 movaps %xmm0, %xmm1 cmpq $2 * SIZE, INCX jne .L70 .L30: movq M, I sarq $3, I jle .L35 ALIGN_4 .L31: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(X), %xmm4 movhps 2 * SIZE(X), %xmm4 movsd 4 * SIZE(X), %xmm5 movhps 6 * SIZE(X), %xmm5 movaps %xmm4, %xmm6 shufps $0x88, %xmm5, %xmm4 shufps $0xdd, %xmm5, %xmm6 andps %xmm15, %xmm4 andps %xmm15, %xmm6 addps %xmm6, %xmm4 maxps %xmm4, %xmm0 movsd 8 * SIZE(X), %xmm7 movhps 10 * SIZE(X), %xmm7 movsd 12 * SIZE(X), %xmm8 movhps 14 * SIZE(X), %xmm8 movaps %xmm7, %xmm9 shufps $0x88, %xmm8, %xmm7 shufps $0xdd, %xmm8, %xmm9 andps %xmm15, %xmm7 andps %xmm15, %xmm9 addps %xmm9, %xmm7 maxps %xmm7, %xmm0 addq $16 * SIZE, X decq I jg .L31 ALIGN_4 .L35: andq $7, M jle .L40 testq $4, M je .L36 movsd 0 * SIZE(X), %xmm4 movhps 2 * SIZE(X), %xmm4 movsd 4 * SIZE(X), %xmm5 movhps 6 * SIZE(X), %xmm5 movaps %xmm4, %xmm6 shufps $0x88, %xmm5, %xmm4 shufps $0xdd, %xmm5, %xmm6 andps %xmm15, %xmm4 andps %xmm15, %xmm6 addps %xmm6, %xmm4 maxps %xmm4, %xmm0 addq $8 * SIZE, X ALIGN_3 .L36: testq $2, M je .L37 movss 0 * SIZE(X), %xmm4 movss 1 * SIZE(X), %xmm5 movss 2 * SIZE(X), %xmm6 movss 3 * SIZE(X), %xmm7 andps %xmm15, %xmm4 andps %xmm15, %xmm5 andps %xmm15, %xmm6 andps %xmm15, %xmm7 addps %xmm5, %xmm4 addps %xmm7, %xmm6 maxss %xmm4, %xmm0 maxss %xmm6, %xmm1 addq $4 * SIZE, X ALIGN_3 .L37: testq $1, M je .L40 movss 0 * SIZE(X), %xmm4 movss 1 * SIZE(X), %xmm5 andps %xmm15, %xmm4 andps %xmm15, %xmm5 addps %xmm5, %xmm4 maxss %xmm4, %xmm0 ALIGN_4 .L40: movq XX, X movq MM, M maxps %xmm1, %xmm0 movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 maxps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 maxss %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 movq M, I sarq $2, I jle .L45 ALIGN_4 .L41: 
#ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(X), %xmm1 movhps 2 * SIZE(X), %xmm1 movsd 4 * SIZE(X), %xmm2 movhps 6 * SIZE(X), %xmm2 movaps %xmm1, %xmm3 shufps $0x88, %xmm2, %xmm1 shufps $0xdd, %xmm2, %xmm3 andps %xmm15, %xmm1 andps %xmm15, %xmm3 addps %xmm3, %xmm1 cmpeqps %xmm0, %xmm1 #ifndef C_SUN movmskps %xmm1, %r11 #else .long 0xd9500f4c #endif testq $15, %r11 jne .L43 addq $8 * SIZE, X addq $4, RET decq I jg .L41 jmp .L45 ALIGN_4 .L43: movss 0 * SIZE(X), %xmm1 movss 1 * SIZE(X), %xmm2 movss 2 * SIZE(X), %xmm3 movss 3 * SIZE(X), %xmm4 movss 4 * SIZE(X), %xmm5 movss 5 * SIZE(X), %xmm6 movss 6 * SIZE(X), %xmm7 movss 7 * SIZE(X), %xmm8 addq $8 * SIZE, X andps %xmm15, %xmm1 andps %xmm15, %xmm2 andps %xmm15, %xmm3 andps %xmm15, %xmm4 andps %xmm15, %xmm5 andps %xmm15, %xmm6 andps %xmm15, %xmm7 andps %xmm15, %xmm8 addps %xmm2, %xmm1 addps %xmm4, %xmm3 addps %xmm6, %xmm5 addps %xmm8, %xmm7 incq RET comiss %xmm0, %xmm1 je .L999 incq RET comiss %xmm0, %xmm3 je .L999 incq RET comiss %xmm0, %xmm5 je .L999 incq RET comiss %xmm0, %xmm7 je .L999 ALIGN_3 .L45: testq $2, M je .L47 movss 0 * SIZE(X), %xmm1 movss 1 * SIZE(X), %xmm2 movss 2 * SIZE(X), %xmm3 movss 3 * SIZE(X), %xmm4 addq $4 * SIZE, X andps %xmm15, %xmm1 andps %xmm15, %xmm2 andps %xmm15, %xmm3 andps %xmm15, %xmm4 addps %xmm2, %xmm1 addps %xmm4, %xmm3 incq RET comiss %xmm0, %xmm1 je .L999 incq RET comiss %xmm0, %xmm3 je .L999 ALIGN_3 .L47: incq RET jmp .L999 ALIGN_3 .L70: movq M, I sarq $3, I jle .L75 ALIGN_4 .L71: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(X), %xmm4 addq INCX, X movhps 0 * SIZE(X), %xmm4 addq INCX, X movsd 0 * SIZE(X), %xmm5 addq INCX, X movhps 0 * SIZE(X), %xmm5 addq INCX, X movaps %xmm4, %xmm6 shufps $0x88, %xmm5, %xmm4 shufps $0xdd, %xmm5, %xmm6 andps %xmm15, %xmm4 andps %xmm15, %xmm6 addps %xmm6, %xmm4 maxps %xmm4, %xmm0 movsd 0 * SIZE(X), %xmm7 addq INCX, X movhps 0 * SIZE(X), %xmm7 addq INCX, X movsd 0 * SIZE(X), %xmm8 addq INCX, X movhps 0 * SIZE(X), %xmm8 addq INCX, X movaps %xmm7, %xmm9 shufps $0x88, %xmm8, %xmm7 shufps $0xdd, %xmm8, %xmm9 andps %xmm15, %xmm7 andps %xmm15, %xmm9 addps %xmm9, %xmm7 maxps %xmm7, %xmm0 decq I jg .L71 ALIGN_4 .L75: andq $7, M jle .L80 testq $4, M je .L76 movsd 0 * SIZE(X), %xmm4 addq INCX, X movhps 0 * SIZE(X), %xmm4 addq INCX, X movsd 0 * SIZE(X), %xmm5 addq INCX, X movhps 0 * SIZE(X), %xmm5 addq INCX, X movaps %xmm4, %xmm6 shufps $0x88, %xmm5, %xmm4 shufps $0xdd, %xmm5, %xmm6 andps %xmm15, %xmm4 andps %xmm15, %xmm6 addps %xmm6, %xmm4 maxps %xmm4, %xmm0 ALIGN_3 .L76: testq $2, M je .L77 movss 0 * SIZE(X), %xmm4 movss 1 * SIZE(X), %xmm5 addq INCX, X movss 0 * SIZE(X), %xmm6 movss 1 * SIZE(X), %xmm7 addq INCX, X andps %xmm15, %xmm4 andps %xmm15, %xmm5 andps %xmm15, %xmm6 andps %xmm15, %xmm7 addps %xmm5, %xmm4 addps %xmm7, %xmm6 maxss %xmm4, %xmm0 maxss %xmm6, %xmm1 ALIGN_3 .L77: testq $1, M je .L80 movss 0 * SIZE(X), %xmm4 movss 1 * SIZE(X), %xmm5 andps %xmm15, %xmm4 andps %xmm15, %xmm5 addps %xmm5, %xmm4 maxss %xmm4, %xmm0 ALIGN_4 .L80: movq XX, X movq MM, M maxps %xmm1, %xmm0 movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 maxps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 maxss %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 movq M, I sarq $2, I jle .L85 ALIGN_4 .L81: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(X), %xmm1 addq INCX, X movhps 0 * SIZE(X), %xmm1 addq INCX, X movsd 0 * SIZE(X), %xmm2 addq INCX, X movhps 0 * SIZE(X), %xmm2 addq INCX, X movaps %xmm1, %xmm3 shufps $0x88, 
%xmm2, %xmm1 shufps $0xdd, %xmm2, %xmm3 andps %xmm15, %xmm1 andps %xmm15, %xmm3 addps %xmm3, %xmm1 cmpeqps %xmm0, %xmm1 #ifndef C_SUN movmskps %xmm1, %r11 #else .long 0xd9500f4c #endif testq $15, %r11 jne .L83 addq $4, RET decq I jg .L81 jmp .L85 ALIGN_4 .L83: subq INCX, X movss 0 * SIZE(X), %xmm7 movss 1 * SIZE(X), %xmm8 subq INCX, X movss 0 * SIZE(X), %xmm5 movss 1 * SIZE(X), %xmm6 subq INCX, X movss 0 * SIZE(X), %xmm3 movss 1 * SIZE(X), %xmm4 subq INCX, X movss 0 * SIZE(X), %xmm1 movss 1 * SIZE(X), %xmm2 andps %xmm15, %xmm1 andps %xmm15, %xmm2 andps %xmm15, %xmm3 andps %xmm15, %xmm4 andps %xmm15, %xmm5 andps %xmm15, %xmm6 andps %xmm15, %xmm7 andps %xmm15, %xmm8 addps %xmm2, %xmm1 addps %xmm4, %xmm3 addps %xmm6, %xmm5 addps %xmm8, %xmm7 incq RET comiss %xmm0, %xmm1 je .L999 incq RET comiss %xmm0, %xmm3 je .L999 incq RET comiss %xmm0, %xmm5 je .L999 incq RET comiss %xmm0, %xmm7 je .L999 ALIGN_3 .L85: testq $2, M je .L87 movss 0 * SIZE(X), %xmm1 movss 1 * SIZE(X), %xmm2 addq INCX, X movss 0 * SIZE(X), %xmm3 movss 1 * SIZE(X), %xmm4 addq INCX, X andps %xmm15, %xmm1 andps %xmm15, %xmm2 andps %xmm15, %xmm3 andps %xmm15, %xmm4 addps %xmm2, %xmm1 addps %xmm4, %xmm3 incq RET comiss %xmm0, %xmm1 je .L999 incq RET comiss %xmm0, %xmm3 je .L999 ALIGN_3 .L87: incq RET ALIGN_4 .L999: RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/izamax_sse2.S000066400000000000000000000262701313527062700177420ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define RET %rax #define I ARG4 #define XX %r10 #define MM %r11 #ifdef USE_MIN #define maxpd minpd #define maxsd minsd #endif #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS pxor %xmm0, %xmm0 xor RET, RET testq M, M jle .L999 testq INCX, INCX jle .L999 salq $ZBASE_SHIFT, INCX movq M, MM movq X, XX pcmpeqb %xmm15, %xmm15 psrlq $1, %xmm15 movsd 0 * SIZE(X), %xmm0 movsd 1 * SIZE(X), %xmm1 addq INCX, X decq M andpd %xmm15, %xmm0 andpd %xmm15, %xmm1 addpd %xmm1, %xmm0 unpcklpd %xmm0, %xmm0 movapd %xmm0, %xmm1 movapd %xmm0, %xmm2 movapd %xmm0, %xmm3 cmpq $2 * SIZE, INCX jne .L60 movq M, I sarq $3, I jle .L25 ALIGN_4 .L21: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 movhpd 2 * SIZE(X), %xmm4 movhpd 3 * SIZE(X), %xmm5 movsd 4 * SIZE(X), %xmm6 movsd 5 * SIZE(X), %xmm7 movhpd 6 * SIZE(X), %xmm6 movhpd 7 * SIZE(X), %xmm7 andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 andpd %xmm15, %xmm6 andpd %xmm15, %xmm7 addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 maxpd %xmm4, %xmm0 maxpd %xmm6, %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movsd 8 * SIZE(X), %xmm4 movsd 9 * SIZE(X), %xmm5 movhpd 10 * SIZE(X), %xmm4 movhpd 11 * SIZE(X), %xmm5 movsd 12 * SIZE(X), %xmm6 movsd 13 * SIZE(X), %xmm7 movhpd 14 * SIZE(X), %xmm6 movhpd 15 * SIZE(X), %xmm7 andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 andpd %xmm15, %xmm6 andpd %xmm15, %xmm7 addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 maxpd %xmm4, %xmm2 maxpd %xmm6, %xmm3 addq $16 * SIZE, X decq I jg .L21 ALIGN_4 .L25: andq $7, M jle .L30 testq $4, M je .L26 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 movhpd 2 * SIZE(X), %xmm4 movhpd 3 * SIZE(X), %xmm5 movsd 4 * SIZE(X), %xmm6 movsd 5 * SIZE(X), %xmm7 movhpd 6 * SIZE(X), %xmm6 movhpd 7 * SIZE(X), %xmm7 andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 andpd %xmm15, %xmm6 andpd %xmm15, %xmm7 addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 maxpd %xmm4, %xmm0 maxpd %xmm6, %xmm1 addq $8 * SIZE, X ALIGN_3 .L26: testq $2, M je .L27 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 movhpd 2 * SIZE(X), %xmm4 movhpd 3 * SIZE(X), %xmm5 addq $4 * SIZE, X andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 addpd %xmm5, %xmm4 maxpd %xmm4, %xmm0 ALIGN_3 .L27: testq $1, M je .L30 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 addpd %xmm5, %xmm4 maxsd %xmm4, %xmm2 ALIGN_4 .L30: movq XX, X movq MM, M maxpd %xmm1, %xmm0 maxpd %xmm3, %xmm2 maxpd %xmm2, %xmm0 movapd %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 maxsd %xmm1, %xmm0 unpcklpd %xmm0, %xmm0 movq M, I sarq $2, I jle .L35 ALIGN_4 .L31: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(X), %xmm1 movsd 1 * SIZE(X), %xmm2 movhpd 2 * SIZE(X), %xmm1 movhpd 3 * SIZE(X), %xmm2 movsd 4 * SIZE(X), %xmm3 movsd 5 * SIZE(X), %xmm4 movhpd 6 * SIZE(X), %xmm3 movhpd 7 * SIZE(X), %xmm4 andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 andpd %xmm15, %xmm3 andpd %xmm15, %xmm4 addpd %xmm2, %xmm1 addpd %xmm4, %xmm3 cmpeqpd %xmm0, %xmm1 cmpeqpd %xmm0, %xmm3 orpd %xmm3, %xmm1 #ifndef C_SUN movmskpd %xmm1, %r11 #else .byte 0x66 .long 0xd9500f4c #endif testq $3, %r11 jne .L33 addq $8 * SIZE, X addq $4, RET decq I jg .L31 jmp .L35 ALIGN_4 .L33: movsd 0 * SIZE(X), %xmm1 movsd 1 * SIZE(X), %xmm2 movsd 2 * SIZE(X), %xmm3 movsd 3 * SIZE(X), %xmm4 movsd 4 * SIZE(X), %xmm5 movsd 5 * SIZE(X), %xmm6 movsd 6 * SIZE(X), %xmm7 
movsd 7 * SIZE(X), %xmm8 addq $8 * SIZE, X andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 andpd %xmm15, %xmm3 andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 andpd %xmm15, %xmm6 andpd %xmm15, %xmm7 andpd %xmm15, %xmm8 addpd %xmm2, %xmm1 addpd %xmm4, %xmm3 addpd %xmm6, %xmm5 addpd %xmm8, %xmm7 incq RET comisd %xmm0, %xmm1 je .L999 incq RET comisd %xmm0, %xmm3 je .L999 incq RET comisd %xmm0, %xmm5 je .L999 incq RET comisd %xmm0, %xmm7 je .L999 ALIGN_3 .L35: testq $2, M je .L36 movsd 0 * SIZE(X), %xmm1 movsd 1 * SIZE(X), %xmm2 movsd 2 * SIZE(X), %xmm3 movsd 3 * SIZE(X), %xmm4 addq $4 * SIZE, X andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 andpd %xmm15, %xmm3 andpd %xmm15, %xmm4 addpd %xmm2, %xmm1 addpd %xmm4, %xmm3 incq RET comisd %xmm0, %xmm1 je .L999 incq RET comisd %xmm0, %xmm3 je .L999 ALIGN_3 .L36: incq RET jmp .L999 ALIGN_3 .L60: movq M, I sarq $3, I jle .L65 ALIGN_4 .L61: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 addq INCX, X movhpd 0 * SIZE(X), %xmm4 movhpd 1 * SIZE(X), %xmm5 addq INCX, X movsd 0 * SIZE(X), %xmm6 movsd 1 * SIZE(X), %xmm7 addq INCX, X movhpd 0 * SIZE(X), %xmm6 movhpd 1 * SIZE(X), %xmm7 addq INCX, X andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 andpd %xmm15, %xmm6 andpd %xmm15, %xmm7 addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 maxpd %xmm4, %xmm0 maxpd %xmm6, %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 addq INCX, X movhpd 0 * SIZE(X), %xmm4 movhpd 1 * SIZE(X), %xmm5 addq INCX, X movsd 0 * SIZE(X), %xmm6 movsd 1 * SIZE(X), %xmm7 addq INCX, X movhpd 0 * SIZE(X), %xmm6 movhpd 1 * SIZE(X), %xmm7 addq INCX, X andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 andpd %xmm15, %xmm6 andpd %xmm15, %xmm7 addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 maxpd %xmm4, %xmm2 maxpd %xmm6, %xmm3 decq I jg .L61 ALIGN_4 .L65: andq $7, M jle .L70 testq $4, M je .L66 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 addq INCX, X movhpd 0 * SIZE(X), %xmm4 movhpd 1 * SIZE(X), %xmm5 addq INCX, X movsd 0 * SIZE(X), %xmm6 movsd 1 * SIZE(X), %xmm7 addq INCX, X movhpd 0 * SIZE(X), %xmm6 movhpd 1 * SIZE(X), %xmm7 addq INCX, X andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 andpd %xmm15, %xmm6 andpd %xmm15, %xmm7 addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 maxpd %xmm4, %xmm0 maxpd %xmm6, %xmm1 ALIGN_3 .L66: testq $2, M je .L67 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 addq INCX, X movhpd 0 * SIZE(X), %xmm4 movhpd 1 * SIZE(X), %xmm5 addq INCX, X andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 addpd %xmm5, %xmm4 maxpd %xmm4, %xmm2 ALIGN_3 .L67: testq $1, M je .L70 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 addpd %xmm5, %xmm4 maxsd %xmm4, %xmm3 ALIGN_3 .L70: movq XX, X movq MM, M maxpd %xmm1, %xmm0 maxpd %xmm3, %xmm2 maxpd %xmm2, %xmm0 movapd %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 maxsd %xmm1, %xmm0 unpcklpd %xmm0, %xmm0 movq M, I sarq $2, I jle .L75 ALIGN_4 .L71: #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) prefetch PREFETCHSIZE * SIZE(X) #endif #ifdef PENTIUM4 prefetchnta PREFETCHSIZE * SIZE(X) #endif movsd 0 * SIZE(X), %xmm1 movsd 1 * SIZE(X), %xmm2 addq INCX, X movhpd 0 * SIZE(X), %xmm1 movhpd 1 * SIZE(X), %xmm2 addq INCX, X movsd 0 * SIZE(X), %xmm3 movsd 1 * SIZE(X), %xmm4 addq INCX, X movhpd 0 * SIZE(X), %xmm3 movhpd 1 * SIZE(X), %xmm4 addq INCX, X andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 andpd %xmm15, %xmm3 andpd %xmm15, %xmm4 addpd %xmm2, %xmm1 addpd %xmm4, %xmm3 cmpeqpd %xmm0, %xmm1 cmpeqpd %xmm0, %xmm3 orpd %xmm3, %xmm1 #ifndef 
C_SUN movmskpd %xmm1, %r11 #else .byte 0x66 .long 0xd9500f4c #endif testq $3, %r11 jne .L73 addq $4, RET decq I jg .L71 jmp .L75 ALIGN_4 .L73: subq INCX, X movsd 0 * SIZE(X), %xmm7 movsd 1 * SIZE(X), %xmm8 subq INCX, X movsd 0 * SIZE(X), %xmm5 movsd 1 * SIZE(X), %xmm6 subq INCX, X movsd 0 * SIZE(X), %xmm3 movsd 1 * SIZE(X), %xmm4 subq INCX, X movsd 0 * SIZE(X), %xmm1 movsd 1 * SIZE(X), %xmm2 andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 andpd %xmm15, %xmm3 andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 andpd %xmm15, %xmm6 andpd %xmm15, %xmm7 andpd %xmm15, %xmm8 addpd %xmm2, %xmm1 addpd %xmm4, %xmm3 addpd %xmm6, %xmm5 addpd %xmm8, %xmm7 incq RET comisd %xmm0, %xmm1 je .L999 incq RET comisd %xmm0, %xmm3 je .L999 incq RET comisd %xmm0, %xmm5 je .L999 incq RET comisd %xmm0, %xmm7 je .L999 ALIGN_3 .L75: testq $2, M je .L76 movsd 0 * SIZE(X), %xmm1 movsd 1 * SIZE(X), %xmm2 addq INCX, X movsd 0 * SIZE(X), %xmm3 movsd 1 * SIZE(X), %xmm4 addq INCX, X andpd %xmm15, %xmm1 andpd %xmm15, %xmm2 andpd %xmm15, %xmm3 andpd %xmm15, %xmm4 addpd %xmm2, %xmm1 addpd %xmm4, %xmm3 incq RET comisd %xmm0, %xmm1 je .L999 incq RET comisd %xmm0, %xmm3 je .L999 ALIGN_3 .L76: incq RET ALIGN_4 .L999: RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/lsame.S000066400000000000000000000057321313527062700166160ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define X ARG1 /* rdi */ #define Y ARG2 /* rsi */ #define XX ARG3 #define YY ARG4 PROLOGUE PROFCODE movzbq (X), X movzbq (Y), Y andq $255, X andq $255, Y leaq -32(X), XX leaq -32(Y), YY cmpq $97, X cmovge XX, X cmpq $97,Y cmovge YY, Y movq $0, %rax movq $1, %r8 cmpq X, Y cmoveq %r8, %rax ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/mcount.S000066400000000000000000000052631313527062700170210ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" PROLOGUE jmp _mcount EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/nrm2.S000066400000000000000000000110211313527062700163570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax #include "l1param.h" PROLOGUE PROFCODE fldz testq M, M jle .L999 testq INCX, INCX jle .L999 salq $BASE_SHIFT, INCX fldz fldz fldz cmpq $SIZE, INCX jne .L40 movq M, I sarq $3, I jle .L20 ALIGN_4 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) fmul %st(0), %st FLD 1 * SIZE(X) fmul %st(0), %st FLD 2 * SIZE(X) fmul %st(0), %st FLD 3 * SIZE(X) fmul %st(0), %st faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) FLD 4 * SIZE(X) fmul %st(0), %st FLD 5 * SIZE(X) fmul %st(0), %st FLD 6 * SIZE(X) fmul %st(0), %st FLD 7 * SIZE(X) fmul %st(0), %st addq $8 * SIZE, X faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) decq I jg .L10 ALIGN_4 .L20: andq $7, M jle .L998 ALIGN_4 .L21: FLD (X) fmul %st(0), %st faddp %st,%st(1) addq $1 * SIZE, X decq M jg .L21 jmp .L998 ALIGN_4 .L40: movq M, I sarq $3, I jle .L60 ALIGN_4 .L50: FLD (X) addq INCX, X fmul %st(0), %st FLD (X) addq INCX, X fmul %st(0), %st FLD (X) addq INCX, X fmul %st(0), %st FLD (X) addq INCX, X fmul %st(0), %st faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) FLD (X) addq INCX, X fmul %st(0), %st FLD (X) addq INCX, X fmul %st(0), %st FLD (X) addq INCX, X fmul %st(0), %st FLD (X) addq INCX, X fmul %st(0), %st faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) decq I jg .L50 ALIGN_4 .L60: andq $7, M jle .L998 ALIGN_4 .L61: FLD (X) addq INCX, X fmul %st(0), %st faddp %st,%st(1) decq M jg .L61 ALIGN_4 .L998: faddp %st,%st(2) faddp %st,%st(1) faddp %st,%st(1) ALIGN_4 .L999: fsqrt #ifndef XDOUBLE sub $2 * SIZE, %rsp FST (%rsp) MOVSD (%rsp), %xmm0 add $2 * SIZE, %rsp #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/nrm2_sse.S000066400000000000000000000150561313527062700172450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS pxor %xmm0, %xmm0 testq M, M jle .L999 pxor %xmm1, %xmm1 testq INCX, INCX jle .L999 pxor %xmm2, %xmm2 leaq (, INCX, SIZE), INCX pxor %xmm3, %xmm3 cmpq $SIZE, INCX jne .L40 testq $SIZE, X je .L05 movss 0 * SIZE(X), %xmm4 cvtss2sd %xmm4, %xmm6 mulsd %xmm6, %xmm6 addsd %xmm6, %xmm3 addq INCX, X decq M jle .L998 ALIGN_3 .L05: movq M, I sarq $3, I jle .L14 movsd 0 * SIZE(X), %xmm4 movsd 2 * SIZE(X), %xmm5 movsd 4 * SIZE(X), %xmm6 movsd 6 * SIZE(X), %xmm7 addq $8 * SIZE, X decq I jle .L12 ALIGN_3 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif cvtps2pd %xmm4, %xmm8 cvtps2pd %xmm5, %xmm9 cvtps2pd %xmm6, %xmm10 cvtps2pd %xmm7, %xmm11 movsd 0 * SIZE(X), %xmm4 movsd 2 * SIZE(X), %xmm5 movsd 4 * SIZE(X), %xmm6 movsd 6 * SIZE(X), %xmm7 mulpd %xmm8, %xmm8 mulpd %xmm9, %xmm9 mulpd %xmm10, %xmm10 mulpd %xmm11, %xmm11 addpd %xmm8, %xmm0 addpd %xmm9, %xmm1 addpd %xmm10, %xmm2 addpd %xmm11, %xmm3 addq $8 * SIZE, X decq I jg .L10 ALIGN_3 .L12: cvtps2pd %xmm4, %xmm8 cvtps2pd %xmm5, %xmm9 cvtps2pd %xmm6, %xmm10 cvtps2pd %xmm7, %xmm11 mulpd %xmm8, %xmm8 mulpd %xmm9, %xmm9 mulpd %xmm10, %xmm10 mulpd %xmm11, %xmm11 addpd %xmm8, %xmm0 addpd %xmm9, %xmm1 addpd %xmm10, %xmm2 addpd %xmm11, %xmm3 ALIGN_3 .L14: testq $4, M je .L15 movsd 0 * SIZE(X), %xmm4 movsd 2 * SIZE(X), %xmm5 cvtps2pd %xmm4, %xmm6 cvtps2pd %xmm5, %xmm7 mulpd %xmm6, %xmm6 mulpd %xmm7, %xmm7 addpd %xmm6, %xmm0 addpd %xmm7, %xmm1 addq $4 * SIZE, X ALIGN_3 .L15: testq $2, M je .L16 movsd 0 * SIZE(X), %xmm4 cvtps2pd %xmm4, %xmm6 mulpd %xmm6, %xmm6 addpd %xmm6, %xmm2 addq $2 * SIZE, X ALIGN_3 .L16: testq $1, M je .L998 movss 0 * SIZE(X), %xmm4 cvtss2sd %xmm4, %xmm6 mulsd %xmm6, %xmm6 addsd %xmm6, %xmm3 jmp .L998 ALIGN_4 .L40: movq M, I sarq $3, I jle .L44 ALIGN_4 .L41: movss (X), %xmm4 addq INCX, X movss (X), %xmm5 addq 
INCX, X movss (X), %xmm6 addq INCX, X movss (X), %xmm7 addq INCX, X movss (X), %xmm8 addq INCX, X movss (X), %xmm9 addq INCX, X movss (X), %xmm10 addq INCX, X movss (X), %xmm11 addq INCX, X cvtss2sd %xmm4, %xmm4 cvtss2sd %xmm5, %xmm5 cvtss2sd %xmm6, %xmm6 cvtss2sd %xmm7, %xmm7 cvtss2sd %xmm8, %xmm8 cvtss2sd %xmm9, %xmm9 cvtss2sd %xmm10, %xmm10 cvtss2sd %xmm11, %xmm11 mulsd %xmm4, %xmm4 mulsd %xmm5, %xmm5 mulsd %xmm6, %xmm6 mulsd %xmm7, %xmm7 addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 addsd %xmm6, %xmm2 addsd %xmm7, %xmm3 mulsd %xmm8, %xmm8 mulsd %xmm9, %xmm9 mulsd %xmm10, %xmm10 mulsd %xmm11, %xmm11 addsd %xmm8, %xmm0 addsd %xmm9, %xmm1 addsd %xmm10, %xmm2 addsd %xmm11, %xmm3 decq I jg .L41 ALIGN_3 .L44: testq $4, M je .L45 movss (X), %xmm4 addq INCX, X movss (X), %xmm5 addq INCX, X movss (X), %xmm6 addq INCX, X movss (X), %xmm7 addq INCX, X cvtss2sd %xmm4, %xmm8 cvtss2sd %xmm5, %xmm9 cvtss2sd %xmm6, %xmm10 cvtss2sd %xmm7, %xmm11 mulsd %xmm8, %xmm8 mulsd %xmm9, %xmm9 mulsd %xmm10, %xmm10 mulsd %xmm11, %xmm11 addsd %xmm8, %xmm0 addsd %xmm9, %xmm1 addsd %xmm10, %xmm2 addsd %xmm11, %xmm3 ALIGN_3 .L45: testq $2, M je .L46 movss (X), %xmm4 addq INCX, X movss (X), %xmm5 addq INCX, X cvtss2sd %xmm4, %xmm6 cvtss2sd %xmm5, %xmm7 mulsd %xmm6, %xmm6 mulsd %xmm7, %xmm7 addsd %xmm6, %xmm1 addsd %xmm7, %xmm2 ALIGN_3 .L46: testq $1, M je .L998 movss (X), %xmm4 cvtss2sd %xmm4, %xmm6 mulsd %xmm6, %xmm6 addsd %xmm6, %xmm3 ALIGN_4 .L998: addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm2, %xmm0 #ifndef HAVE_SSE3 movapd %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 addsd %xmm1, %xmm0 #else haddpd %xmm0, %xmm0 #endif ALIGN_4 .L999: sqrtsd %xmm0, %xmm0 cvtsd2ss %xmm0, %xmm0 RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/qconjg.S000066400000000000000000000054401313527062700167720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" PROLOGUE PROFCODE fldz FLD 1 * SIZE(ARG1) fsubrp %st, %st(1) FLD 0 * SIZE(ARG1) FST 0 * SIZE(ARG2) FST 1 * SIZE(ARG2) ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/qdot.S000066400000000000000000000113161313527062700164570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) #define STACK_Y 16 + STACK + ARGS(%esp) #define STACK_INCY 20 + STACK + ARGS(%esp) #define N %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx #include "l1param.h" PROLOGUE pushl %edi pushl %esi pushl %ebx PROFCODE movl STACK_N, N movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY sall $BASE_SHIFT, INCX sall $BASE_SHIFT, INCY fldz fldz fldz fldz cmpl $SIZE, INCX jne .L14 cmpl $SIZE, INCY jne .L14 movl N, %eax sarl $2, %eax jle .L15 ALIGN_3 .L16: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmulp %st, %st(1) faddp %st,%st(1) FLD 1 * SIZE(X) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st,%st(2) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif FLD 2 * SIZE(X) FLD 2 * SIZE(Y) fmulp %st, %st(1) faddp %st,%st(3) FLD 3 * SIZE(X) FLD 3 * SIZE(Y) fmulp %st, %st(1) faddp %st,%st(4) addl $4 * SIZE, X addl $4 * SIZE, Y decl %eax jg .L16 ALIGN_3 .L15: movl N, %eax andl $3, %eax jle .L27 ALIGN_3 .L22: FLD (X) addl $SIZE, X FLD (Y) fmulp %st, %st(1) addl $SIZE, Y faddp %st,%st(1) decl %eax jg .L22 jmp .L27 ALIGN_3 .L14: movl N, %eax sarl $2, %eax jle .L30 ALIGN_3 .L31: FLD (X) addl INCX, X FLD (Y) fmulp %st, %st(1) addl INCY, Y faddp %st,%st(1) FLD (X) addl INCX, X FLD (Y) fmulp %st, %st(1) addl INCY, Y faddp %st,%st(2) FLD (X) addl INCX, X FLD (Y) fmulp %st, %st(1) addl INCY, Y faddp %st,%st(3) FLD (X) addl INCX, X FLD (Y) fmulp %st, %st(1) addl INCY, Y faddp %st,%st(4) decl %eax jg .L31 ALIGN_3 .L30: movl N, %eax andl $3, %eax jle .L27 ALIGN_3 .L37: FLD (X) addl INCX, X FLD (Y) fmulp %st, %st(1) addl INCY, Y faddp %st, %st(1) decl %eax jg .L37 ALIGN_3 .L27: faddp %st,%st(2) faddp %st,%st(2) faddp %st,%st(1) popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/qgemm_kernel_2x2.S000066400000000000000000000331561313527062700206570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 #define N ARG2 #define K ARG3 #define A ARG4 #define B ARG5 #define C ARG6 #define LDC %r10 #define I %r12 #define J %r13 #define AO %r14 #define BO %r15 #define CO %rbp #define KK %r11 #define KKK 48(%rsp) #define STACKSIZE 64 #define ALPHA 8 + STACKSIZE(%rsp) #define OFFSET 32 + STACKSIZE(%rsp) #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #else #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #define PREFETCHSIZE (5 + 4 * 10) PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) movq 24 + STACKSIZE(%rsp), LDC #if defined(TRMMKERNEL) && !defined(LEFT) movq OFFSET, %rax negq %rax movq %rax, KK #endif addq $8 * SIZE, A addq $8 * SIZE, B salq $BASE_SHIFT, LDC movq N, %rax sarq $1, %rax movq %rax, J je .L30 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO movq C, CO leaq (, LDC, 2), %rax addq %rax, C movq M, I sarq $1, I je .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif fldz fldz fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) prefetchw 2 * SIZE(CO, LDC, 1) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) prefetchnta 2 * SIZE(CO, LDC, 1) #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(3) faddp %st, %st(3) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -5 * SIZE(BO) fmul %st, %st(2) FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(3) faddp %st, %st(3) PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) FLD -4 * SIZE(AO) FLD -4 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -3 * SIZE(BO) fmul %st, %st(2) FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(3) faddp %st, %st(3) FLD -2 * SIZE(AO) FLD -2 * SIZE(BO) fld %st(1) fmul 
%st(1), %st faddp %st, %st(3) FLD -1 * SIZE(BO) fmul %st, %st(2) FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(3) faddp %st, %st(3) addq $8 * SIZE,AO addq $8 * SIZE,BO decq %rax jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif and $3, %rax je .L18 ALIGN_4 .L16: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(3) faddp %st, %st(3) addq $2 * SIZE,AO addq $2 * SIZE,BO decq %rax jne .L16 ALIGN_4 .L18: #ifndef TRMMKERNEL FLD ALPHA fmul %st, %st(1) fmul %st, %st(2) fmul %st, %st(3) fmulp %st, %st(4) FLD 0 * SIZE(CO) faddp %st, %st(1) FST 0 * SIZE(CO) FLD 1 * SIZE(CO) faddp %st, %st(1) FST 1 * SIZE(CO) FLD 0 * SIZE(CO, LDC) faddp %st, %st(1) FST 0 * SIZE(CO, LDC) FLD 1 * SIZE(CO, LDC) faddp %st, %st(1) FST 1 * SIZE(CO, LDC) #else FST 0 * SIZE(CO) FST 1 * SIZE(CO) FST 0 * SIZE(CO, LDC) FST 1 * SIZE(CO, LDC) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO decq I jne .L11 ALIGN_4 .L20: movq M, %rax andq $1, %rax je .L29 ALIGN_4 .L21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq ( B, %rax, 2), BO #endif fldz fldz #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -7 * SIZE(AO) FLD -6 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -5 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -6 * SIZE(AO) FLD -4 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -3 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -5 * SIZE(AO) FLD -2 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -1 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) addq $4 * SIZE,AO addq $8 * SIZE,BO decq %rax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif and $3, %rax je .L28 ALIGN_4 .L26: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) addq $1 * SIZE,AO addq $2 * SIZE,BO decq %rax jne .L26 ALIGN_4 .L28: #ifndef TRMMKERNEL FLD ALPHA fmul %st, %st(1) fmulp %st, %st(2) FLD 0 * SIZE(CO) faddp %st, %st(1) FST 0 * SIZE(CO) FLD 0 * SIZE(CO, LDC) faddp %st, %st(1) FST 0 * SIZE(CO, LDC) #else FST 0 * SIZE(CO) FST 0 * SIZE(CO, LDC) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO ALIGN_4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif movq BO, B decq J jne .L01 ALIGN_4 .L30: movq N, 
%rax testq $1, %rax je .L999 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO movq C, CO addq LDC, C movq M, I sarq $1, I je .L40 ALIGN_4 .L31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq ( B, %rax, 1), BO #endif fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(BO) FLD -8 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -7 * SIZE(BO) FLD -6 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -5 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -6 * SIZE(BO) FLD -4 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -3 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -5 * SIZE(BO) FLD -2 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -1 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) addq $8 * SIZE,AO addq $4 * SIZE,BO decq %rax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif and $3, %rax je .L38 ALIGN_4 .L36: FLD -8 * SIZE(BO) FLD -8 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) addq $2 * SIZE,AO addq $1 * SIZE,BO decq %rax jne .L36 ALIGN_4 .L38: #ifndef TRMMKERNEL FLD ALPHA fmul %st, %st(1) fmulp %st, %st(2) FLD 0 * SIZE(CO) faddp %st, %st(1) FST 0 * SIZE(CO) FLD 1 * SIZE(CO) faddp %st, %st(1) FST 1 * SIZE(CO) #else FST 0 * SIZE(CO) FST 1 * SIZE(CO) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO decq I jne .L31 ALIGN_4 .L40: movq M, %rax andq $1, %rax je .L49 ALIGN_4 .L41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq ( B, %rax, 1), BO #endif fldz #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -7 * SIZE(AO) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -5 * SIZE(AO) FLD -5 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) addq $4 * SIZE,AO addq $4 * SIZE,BO decq %rax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif and $3, %rax je .L48 ALIGN_4 .L46: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) addq $1 * SIZE,AO addq $1 * SIZE,BO decq %rax jne .L46 ALIGN_4 .L48: #ifndef TRMMKERNEL FLD 
ALPHA fmulp %st, %st(1) FLD 0 * SIZE(CO) faddp %st, %st(1) FST 0 * SIZE(CO) #else FST 0 * SIZE(CO) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addq $1, KK #endif movq BO, B ALIGN_4 .L999: EMMS movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/qgemv_n.S000066400000000000000000000177551313527062700171610ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "l2param.h" #define P 32 #define STACKSIZE 80 #define ALPHA 8 + STACKSIZE(%rsp) #define OLD_INCX 24 + STACKSIZE(%rsp) #define OLD_Y 32 + STACKSIZE(%rsp) #define OLD_INCY 40 + STACKSIZE(%rsp) #define BUFFER 48 + STACKSIZE(%rsp) #define PLDA_M 56 (%rsp) #define IS 64 (%rsp) #define M %rdi #define N %rsi #define A %rcx #define LDA %r8 #define X %r9 #define INCX %rdx #define Y %rbp #define INCY %r10 #define TEMP %rax #define I %rax #define J %r11 #define A1 %r12 #define X1 %r13 #define Y1 %r14 #define XP %r15 /* #define BUFFER %r15 */ #define MIN_N %rbx PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) movq OLD_INCX, INCX movq OLD_Y, Y movq OLD_INCY, INCY FLD ALPHA salq $BASE_SHIFT, INCX salq $BASE_SHIFT, INCY movq $0, IS test M, M jle .L79 # goto END test N, N jle .L79 # goto END movq LDA, %rax imulq $P, %rax # P * lda subq M ,%rax # P * lda - m salq $BASE_SHIFT, %rax movq %rax, PLDA_M salq $BASE_SHIFT, LDA ALIGN_2 .L32: movq $P, %rax movq N, MIN_N subq IS, MIN_N cmpq %rax, MIN_N cmovg %rax, MIN_N movq IS, XP salq $BASE_SHIFT, XP leaq (X,XP, 1), XP cmpq $SIZE, INCX je .L34 # if incx == 1 goto L34 movq BUFFER, XP movq XP, X1 movq MIN_N, I sarq $2,I jle .L35 ALIGN_2 .L36: FLD (X) addq INCX,X FLD (X) addq INCX,X FLD (X) addq INCX,X FLD (X) addq INCX,X FST 3 * SIZE(X1) FST 2 * SIZE(X1) FST 1 * SIZE(X1) FST 0 * SIZE(X1) addq $4 * SIZE, X1 decq I jg .L36 ALIGN_3 .L35: movq MIN_N, I andq $3, I jle .L34 ALIGN_2 .L42: FLD (X) addq INCX, X FST (X1) addq $SIZE, X1 decq I jg .L42 ALIGN_3 /* Main Routine */ .L34: movq Y, Y1 movq M, J sarq $2, J jle .L47 ALIGN_2 .L48: movq A, A1 # a_offset = a fldz addq $4 * SIZE, A # a += 4 fldz movq XP, X1 # b_offset = xp fldz movq MIN_N, I # i = min_n fldz FLD (X1) # bt1 = b_offset sarq $1, I jle .L51 ALIGN_2 .L80: FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(A1) # at1 = *(a_offset + 1) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(3) # ct2 += at1 FLD 2 * SIZE(A1) # at1 = *(a_offset + 2) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FLD 3 * SIZE(A1) # bt1 *= *(a_offset + 3) fmulp %st, %st(1) faddp %st, %st(4) # ct4 += at1 FLD 1 * SIZE(X1) # bt1 = b_offset addq LDA, A1 # a_offset += lda FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(A1) # at1 = *(a_offset + 1) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(3) # ct2 += at1 FLD 2 * SIZE(A1) # at1 = *(a_offset + 2) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FLD 3 * SIZE(A1) # bt1 *= *(a_offset + 3) fmulp %st, %st(1) addq LDA, A1 faddp %st, %st(4) # ct4 += at1 FLD 2 * SIZE(X1) # bt1 = b_offset addq $2 * SIZE, X1 # b_offset += 2 decq I jg .L80 .L51: movq MIN_N, I andq $1, I je .L57 FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(A1) # at1 = *(a_offset + 1) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(3) # ct2 += at1 FLD 2 * SIZE(A1) # at1 = *(a_offset + 2) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FLD 3 * SIZE(A1) # bt1 *= *(a_offset + 3) fmulp %st, %st(1) faddp %st, %st(4) # ct4 += at1 fldz ALIGN_2 .L57: ffreep %st(0) fxch %st(4) fmul %st, %st(4) fmul %st, %st(1) fmul %st, %st(2) fmul %st, %st(3) fxch %st(4) FLD (Y1) faddp %st, %st(1) FST (Y1) addq 
INCY, Y1 FLD (Y1) faddp %st, %st(1) FST (Y1) addq INCY, Y1 FLD (Y1) faddp %st, %st(1) FST (Y1) addq INCY, Y1 FLD (Y1) faddp %st, %st(1) FST (Y1) addq INCY, Y1 decq J # j -- jg .L48 ALIGN_3 .L47: movq M, J andq $3, J # j = (m & 3) jle .L60 ALIGN_2 .L61: movq A, A1 # a_offset = a fldz addq $SIZE, A # a++ fldz movq XP, X1 fldz fldz movq MIN_N, I sarq $3, I jle .L64 ALIGN_2 .L65: FLD 0 * SIZE(X1) FLD (A1) fmulp %st, %st(1) faddp %st, %st(1) addq LDA, A1 FLD 1 * SIZE(X1) FLD (A1) fmulp %st, %st(1) faddp %st, %st(2) addq LDA ,A1 FLD 2 * SIZE(X1) FLD (A1) fmulp %st, %st(1) faddp %st, %st(3) addq LDA, A1 FLD 3 * SIZE(X1) FLD (A1) fmulp %st, %st(1) faddp %st, %st(4) addq LDA, A1 FLD 4 * SIZE(X1) FLD (A1) fmulp %st, %st(1) faddp %st,%st(1) addq LDA, A1 FLD 5 * SIZE(X1) FLD (A1) fmulp %st, %st(1) faddp %st, %st(2) addq LDA, A1 FLD 6 * SIZE(X1) FLD (A1) fmulp %st, %st(1) faddp %st,%st(3) addq LDA, A1 FLD 7 * SIZE(X1) FLD (A1) fmulp %st, %st(1) faddp %st,%st(4) addq LDA, A1 addq $8 * SIZE, X1 decq I jg .L65 .L64: movq MIN_N,I andq $7, I jle .L70 ALIGN_2 .L71: FLD (X1) addq $SIZE, X1 FLD (A1) fmulp %st, %st(1) addq LDA, A1 # a_offset += lda faddp %st, %st(1) decq I jg .L71 ALIGN_2 .L70: faddp %st, %st(1) faddp %st, %st(1) faddp %st, %st(1) fmul %st(1), %st FLD (Y1) faddp %st, %st(1) FST (Y1) addq INCY, Y1 decq J jg .L61 .L60: addq PLDA_M, A addq $P, IS cmpq N, IS jl .L32 .L79: EMMS movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/qgemv_t.S000066400000000000000000000214241313527062700171530ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "l2param.h" #define STACKSIZE 80 #define P 4096 #define ALPHA 8 + STACKSIZE(%rsp) #define OLD_INCX 24 + STACKSIZE(%rsp) #define OLD_Y 32 + STACKSIZE(%rsp) #define OLD_INCY 40 + STACKSIZE(%rsp) #define BUFFER 48 + STACKSIZE(%rsp) #define NLDA 56 (%rsp) #define IS 64 (%rsp) #define XP 72 (%rsp) #define M %rdi #define N %rsi #define A %rcx #define LDA %r8 #define X %r9 #define INCX %rdx #define Y %rbp #define INCY %r10 #define TEMP %rax #define I %rax #define J %r11 #define A1 %r12 #define A2 %r15 #define X1 %r13 #define Y1 %r14 #define MIN_M %rbx PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) movq OLD_INCX, INCX movq OLD_Y, Y movq OLD_INCY, INCY FLD ALPHA salq $BASE_SHIFT, INCX salq $BASE_SHIFT, INCY movq $0, IS test M, M jle .L79 # goto END test N, N jle .L79 # goto END movq N, %rax imulq LDA, %rax movq $P, NLDA subq %rax, NLDA salq $BASE_SHIFT, NLDA salq $BASE_SHIFT, LDA ALIGN_2 .L32: movq $P, %rax movq M, MIN_M subq IS , MIN_M cmpq %rax, MIN_M cmovg %rax, MIN_M movq IS, X1 salq $BASE_SHIFT, X1 leaq (X,X1, 1), X1 movq X1, XP cmpq $SIZE, INCX je .L34 movq BUFFER, X1 movq X1, XP movq MIN_M, I sarq $2, I jle .L35 ALIGN_3 .L36: FLD (X) addq INCX, X FST 0 * SIZE(X1) FLD (X) addq INCX, X FST 1 * SIZE(X1) FLD (X) addq INCX, X FST 2 * SIZE(X1) FLD (X) addq INCX, X FST 3 * SIZE(X1) addq $4 * SIZE, X1 decq I jg .L36 ALIGN_3 .L35: movq MIN_M, I andq $3,I jle .L34 ALIGN_2 .L42: FLD (X) addq INCX, X FST (X1) addq $SIZE, X1 decq I jg .L42 ALIGN_3 /* Main Routine */ .L34: movq Y, Y1 # coffset = y movq N, J sarq $2, J jle .L47 ALIGN_3 .L48: movq A, A1 leaq (A, LDA), A2 leaq (A, LDA, 4), A fldz fldz fldz fldz movq XP, X1 FLD (X1) movq MIN_M, I sarq $2,I jle .L51 ALIGN_3 .L80: FLD 0 * SIZE(A1) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(2) # ct1 += at1 FLD 0 * SIZE(A2) # at1 = *(a_offset2 + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 FLD 0 * SIZE(A1, LDA, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FLD 0 * SIZE(A2, LDA, 2) # at1 = *(a_offset2 + 2 * lda) fmulp %st, %st(1) faddp %st,%st(4) FLD 1 * SIZE(X1) FLD 1 * SIZE(A1) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(2) # ct1 += at1 FLD 1 * SIZE(A2) # at1 = *(a_offset2 + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 FLD 1 * SIZE(A1, LDA, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FLD 1 * SIZE(A2, LDA, 2) # at1 = *(a_offset2 + 2 * lda) fmulp %st, %st(1) faddp %st,%st(4) FLD 2 * SIZE(X1) FLD 2 * SIZE(A1) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(2) # ct1 += at1 FLD 2 * SIZE(A2) # at1 = *(a_offset2 + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 FLD 2 * SIZE(A1, LDA, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FLD 2 * SIZE(A2, LDA, 2) # at1 = *(a_offset2 + 2 * lda) fmulp %st, %st(1) faddp %st,%st(4) FLD 3 * SIZE(X1) FLD 3 * SIZE(A1) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(2) # ct1 += at1 FLD 3 * SIZE(A2) # at1 = *(a_offset2 + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 FLD 3 * SIZE(A1, LDA, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FLD 3 * SIZE(A2, LDA, 2) # at1 = *(a_offset2 + 2 * lda) fmulp %st, %st(1) addq $4 * SIZE, A1 faddp %st,%st(4) 
addq $4 * SIZE, A2 FLD 4 * SIZE(X1) addq $4 * SIZE, X1 decq I jg .L80 ALIGN_3 .L51: movq MIN_M, I andq $3, I je .L81 ALIGN_3 .L52: FLD (A1) # at = *(a_offset + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(2) # ct1 += at1 FLD (A2) # at1 = *(a_offset2 + 0 * lda) fmul %st(1),%st # at1 *= bt1 faddp %st,%st(3) # ct2 += at1 FLD (A1, LDA, 2) # at = *(a_offset + 2 * lda) fmul %st(1),%st faddp %st,%st(4) FLD (A2, LDA, 2) # at1 = *(a_offset2 + 2 * lda) fmulp %st, %st(1) faddp %st,%st(4) FLD 1 * SIZE(X1) addq $SIZE, A1 addq $SIZE, A2 addq $SIZE, X1 decq I jg .L52 ALIGN_3 .L81: ffreep %st(0) fxch %st(4) fmul %st, %st(4) fmul %st, %st(1) fmul %st, %st(2) fmul %st, %st(3) fxch %st(4) FLD (Y1) faddp %st, %st(1) FST (Y1) addq INCY, Y1 FLD (Y1) faddp %st, %st(1) FST (Y1) addq INCY, Y1 FLD (Y1) faddp %st, %st(1) FST (Y1) addq INCY, Y1 FLD (Y1) faddp %st, %st(1) FST (Y1) addq INCY, Y1 decq J jg .L48 ALIGN_3 .L47: movq N, J andq $3, J jle .L60 ALIGN_2 .L61: movq A, A1 # a_offset = a fldz # ct1 = ZERO fldz # ct1 = ZERO addq LDA, A fldz # ct1 = ZERO fldz # ct1 = ZERO movq XP, X1 movq MIN_M, I sarq $3,I jle .L64 ALIGN_3 .L65: FLD 0 * SIZE(X1) FLD 0 * SIZE(A1) fmulp %st, %st(1) faddp %st,%st(1) FLD 1 * SIZE(X1) FLD 1 * SIZE(A1) fmulp %st, %st(1) faddp %st,%st(2) FLD 2 * SIZE(X1) FLD 2 * SIZE(A1) fmulp %st, %st(1) faddp %st,%st(3) FLD 3 * SIZE(X1) FLD 3 * SIZE(A1) fmulp %st, %st(1) faddp %st,%st(4) FLD 4 * SIZE(X1) FLD 4 * SIZE(A1) fmulp %st, %st(1) faddp %st,%st(1) FLD 5 * SIZE(X1) FLD 5 * SIZE(A1) fmulp %st, %st(1) faddp %st,%st(2) FLD 6 * SIZE(X1) FLD 6 * SIZE(A1) fmulp %st, %st(1) faddp %st,%st(3) FLD 7 * SIZE(X1) FLD 7 * SIZE(A1) fmulp %st, %st(1) faddp %st,%st(4) addq $8 * SIZE, X1 addq $8 * SIZE, A1 decq I jg .L65 ALIGN_3 .L64: movq MIN_M, I andq $7, I jle .L70 ALIGN_3 .L71: FLD (X1) FLD (A1) fmulp %st, %st(1) faddp %st,%st(1) addq $SIZE, X1 addq $SIZE, A1 decq I jg .L71 ALIGN_3 .L70: faddp %st, %st(1) faddp %st, %st(1) faddp %st, %st(1) fmul %st(1),%st FLD (Y1) faddp %st, %st(1) FST (Y1) addq INCY, Y1 decq J jg .L61 ALIGN_3 .L60: addq NLDA, A addq $P, IS cmpq M, IS jl .L32 ALIGN_3 .L79: EMMS movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/qtrsm_kernel_LN_2x2.S000066400000000000000000000450731313527062700213110ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 #define N ARG2 #define K ARG3 #define A ARG4 #define B ARG5 #define C ARG6 #define LDC %r10 #define I %r12 #define J %r13 #define AO %r14 #define BO %r15 #define CO %rbp #define KK %r11 #define AORIG 48(%rsp) #define STACKSIZE 64 #define ALPHA 8 + STACKSIZE(%rsp) #define OFFSET 32 + STACKSIZE(%rsp) #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #else #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #define PREFETCHSIZE (5 + 4 * 10) PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) movq 24 + STACKSIZE(%rsp), LDC #if defined(TRMMKERNEL) && !defined(LEFT) movq OFFSET, %rax negq %rax movq %rax, KK #endif addq $8 * SIZE, A addq $8 * SIZE, B salq $BASE_SHIFT, LDC #ifdef LN movq M, %rax salq $BASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $BASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN movq OFFSET, %rax negq %rax movq %rax, KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, %rax sarq $1, %rax movq %rax, J je .L30 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, %rax movq %rax, AORIG #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B #endif lea (, LDC, 2), %rax #ifdef RT subq %rax, C #endif movq C, CO #ifndef RT addq %rax, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, %rax andq $1, %rax je .L20 ALIGN_4 .L21: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif fldz fldz #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -7 * SIZE(AO) FLD -6 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -5 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -6 * SIZE(AO) FLD -4 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -3 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -5 * SIZE(AO) FLD -2 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -1 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) addq $4 * SIZE,AO addq $8 * SIZE,BO decq %rax jne .L22 ALIGN_4 .L25: #if 
defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif and $3, %rax je .L28 ALIGN_4 .L26: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) addq $1 * SIZE,AO addq $2 * SIZE,BO decq %rax jne .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) FLD -7 * SIZE(BO) fsubp %st, %st(2) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) FLD -7 * SIZE(AO) fsubp %st, %st(2) #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(AO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef RN FLD -8 * SIZE(BO) fmulp %st, %st(1) FLD -7 * SIZE(BO) fmul %st(1), %st fsubrp %st, %st(2) FLD -5 * SIZE(BO) fmulp %st, %st(2) #endif #ifdef RT FLD -5 * SIZE(BO) fmulp %st, %st(2) FLD -6 * SIZE(BO) fmul %st(2), %st fsubrp %st, %st(1) FLD -8 * SIZE(BO) fmulp %st, %st(1) #endif #ifdef LN subq $1 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) fxch %st(1) fld %st FST -7 * SIZE(BO) #else fld %st FST -8 * SIZE(AO) fxch %st(1) fld %st FST -7 * SIZE(AO) #endif FST 0 * SIZE(CO, LDC) FST 0 * SIZE(CO) #ifndef LN addq $1 * SIZE, CO #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L20: movq M, I sarq $1, I je .L29 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif fldz fldz fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) prefetchw 2 * SIZE(CO, LDC, 1) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) prefetchnta 2 * SIZE(CO, LDC, 1) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -5 * SIZE(BO) fmul %st, %st(2) FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) FLD -4 * SIZE(AO) FLD -4 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -3 * SIZE(BO) fmul %st, %st(2) FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) FLD -2 * SIZE(AO) FLD -2 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -1 * SIZE(BO) fmul %st, %st(2) FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) addq $8 * SIZE,AO addq $8 * SIZE,BO decq %rax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif and $3, %rax je .L18 ALIGN_4 .L16: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) 
faddp %st, %st(4) faddp %st, %st(2) addq $2 * SIZE,AO addq $2 * SIZE,BO decq %rax jne .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) FLD -7 * SIZE(BO) fsubp %st, %st(2) FLD -6 * SIZE(BO) fsubp %st, %st(3) FLD -5 * SIZE(BO) fsubp %st, %st(4) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) FLD -7 * SIZE(AO) fsubp %st, %st(3) FLD -6 * SIZE(AO) fsubp %st, %st(2) FLD -5 * SIZE(AO) fsubp %st, %st(4) #endif #ifdef LN FLD -5 * SIZE(AO) fmul %st, %st(3) fmulp %st, %st(4) FLD -6 * SIZE(AO) fmul %st(3), %st FLD -6 * SIZE(AO) fmul %st(5), %st fsubrp %st, %st(3) fsubrp %st, %st(1) FLD -8 * SIZE(AO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef LT FLD -8 * SIZE(AO) fmul %st, %st(1) fmulp %st, %st(2) FLD -7 * SIZE(AO) fmul %st(1), %st FLD -7 * SIZE(AO) fmul %st(3), %st fsubrp %st, %st(5) fsubrp %st, %st(3) FLD -5 * SIZE(AO) fmul %st, %st(3) fmulp %st, %st(4) #endif #ifdef RN FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(3) FLD -7 * SIZE(BO) fmul %st(1), %st FLD -7 * SIZE(BO) fmul %st(4), %st fsubrp %st, %st(5) fsubrp %st, %st(2) FLD -5 * SIZE(BO) fmul %st, %st(2) fmulp %st, %st(4) #endif #ifdef RT FLD -5 * SIZE(BO) fmul %st, %st(2) fmulp %st, %st(4) FLD -6 * SIZE(BO) fmul %st(2), %st FLD -6 * SIZE(BO) fmul %st(5), %st fsubrp %st, %st(4) fsubrp %st, %st(1) FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(3) #endif #ifdef LN subq $2 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) fxch %st(1) fld %st FST -7 * SIZE(BO) fxch %st(2) fld %st FST -6 * SIZE(BO) fxch %st(3) fld %st FST -5 * SIZE(BO) FST 1 * SIZE(CO, LDC) FST 0 * SIZE(CO) FST 0 * SIZE(CO, LDC) FST 1 * SIZE(CO) #else fld %st FST -8 * SIZE(AO) fxch %st(2) fld %st FST -7 * SIZE(AO) fxch %st(1) fld %st FST -6 * SIZE(AO) fxch %st(3) fld %st FST -5 * SIZE(AO) FST 1 * SIZE(CO, LDC) FST 1 * SIZE(CO) FST 0 * SIZE(CO) FST 0 * SIZE(CO, LDC) #endif #ifndef LN addq $2 * SIZE, CO #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I jne .L11 ALIGN_4 .L29: #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif decq J jne .L01 ALIGN_4 .L30: movq N, %rax testq $1, %rax je .L999 #if defined(LT) || defined(RN) movq A, AO #else movq A, %rax movq %rax, AORIG #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B #endif #ifdef RT subq LDC, C #endif movq C, CO #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, %rax andq $1, %rax je .L40 ALIGN_4 .L41: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif fldz #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -7 * SIZE(AO) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp 
%st, %st(1) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -5 * SIZE(AO) FLD -5 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) addq $4 * SIZE,AO addq $4 * SIZE,BO decq %rax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif and $3, %rax je .L48 ALIGN_4 .L46: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) addq $1 * SIZE,AO addq $1 * SIZE,BO decq %rax jne .L46 ALIGN_4 .L48: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) #endif #ifdef LN FLD -8 * SIZE(AO) fmulp %st, %st(1) #endif #ifdef LT FLD -8 * SIZE(AO) fmulp %st, %st(1) #endif #ifdef RN FLD -8 * SIZE(BO) fmulp %st, %st(1) #endif #ifdef RT FLD -8 * SIZE(BO) fmulp %st, %st(1) #endif #ifdef LN subq $1 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) #else fld %st FST -8 * SIZE(AO) #endif FST 0 * SIZE(CO) #ifndef LN addq $1 * SIZE, CO #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L40: movq M, I sarq $1, I je .L49 ALIGN_4 .L31: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #else movq B, BO #endif fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(BO) FLD -8 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -7 * SIZE(BO) FLD -6 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -5 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -6 * SIZE(BO) FLD -4 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -3 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -5 * SIZE(BO) FLD -2 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -1 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) addq $8 * SIZE,AO addq $4 * SIZE,BO decq %rax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif and $3, %rax je .L38 ALIGN_4 .L36: FLD -8 * SIZE(BO) FLD -8 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) addq $2 * SIZE,AO addq $1 * SIZE,BO decq %rax jne .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) FLD -7 * SIZE(BO) fsubp %st, %st(2) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) FLD -7 * SIZE(AO) fsubp %st, %st(2) #endif #ifdef LN FLD -5 * SIZE(AO) fmulp %st, %st(2) FLD -6 * SIZE(AO) fmul %st(2), %st fsubrp %st, %st(1) FLD -8 * SIZE(AO) fmulp %st, %st(1) #endif #ifdef LT FLD -8 * SIZE(AO) fmulp %st, %st(1) FLD -7 * SIZE(AO) fmul %st(1), %st fsubrp %st, %st(2) FLD -5 * SIZE(AO) fmulp %st, %st(2) #endif 
#ifdef RN FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef RT FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef LN subq $2 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) fxch %st(1) fld %st FST -7 * SIZE(BO) #else fld %st FST -8 * SIZE(AO) fxch %st(1) fld %st FST -7 * SIZE(AO) #endif FST 1 * SIZE(CO) FST 0 * SIZE(CO) #ifndef LN addq $2 * SIZE, CO #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I jne .L31 ALIGN_4 .L49: #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/qtrsm_kernel_LT_2x2.S000066400000000000000000000450731313527062700213170ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 #define N ARG2 #define K ARG3 #define A ARG4 #define B ARG5 #define C ARG6 #define LDC %r10 #define I %r12 #define J %r13 #define AO %r14 #define BO %r15 #define CO %rbp #define KK %r11 #define AORIG 48(%rsp) #define STACKSIZE 64 #define ALPHA 8 + STACKSIZE(%rsp) #define OFFSET 32 + STACKSIZE(%rsp) #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #else #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #define PREFETCHSIZE (5 + 4 * 10) PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) movq 24 + STACKSIZE(%rsp), LDC #if defined(TRMMKERNEL) && !defined(LEFT) movq OFFSET, %rax negq %rax movq %rax, KK #endif addq $8 * SIZE, A addq $8 * SIZE, B salq $BASE_SHIFT, LDC #ifdef LN movq M, %rax salq $BASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $BASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN movq OFFSET, %rax negq %rax movq %rax, KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, %rax sarq $1, %rax movq %rax, J je .L30 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, %rax movq %rax, AORIG #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B #endif lea (, LDC, 2), %rax #ifdef RT subq %rax, C #endif movq C, CO #ifndef RT addq %rax, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $1, I je .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif fldz fldz fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) prefetchw 2 * SIZE(CO, LDC, 1) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) prefetchnta 2 * SIZE(CO, LDC, 1) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -5 * SIZE(BO) fmul %st, %st(2) FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) FLD -4 * SIZE(AO) FLD -4 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -3 * SIZE(BO) fmul %st, %st(2) FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) FLD -2 * SIZE(AO) FLD -2 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -1 * SIZE(BO) fmul %st, %st(2) FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) addq $8 * SIZE,AO addq $8 * SIZE,BO decq %rax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif and $3, %rax je .L18 ALIGN_4 .L16: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) 
FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) addq $2 * SIZE,AO addq $2 * SIZE,BO decq %rax jne .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) FLD -7 * SIZE(BO) fsubp %st, %st(2) FLD -6 * SIZE(BO) fsubp %st, %st(3) FLD -5 * SIZE(BO) fsubp %st, %st(4) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) FLD -7 * SIZE(AO) fsubp %st, %st(3) FLD -6 * SIZE(AO) fsubp %st, %st(2) FLD -5 * SIZE(AO) fsubp %st, %st(4) #endif #ifdef LN FLD -5 * SIZE(AO) fmul %st, %st(3) fmulp %st, %st(4) FLD -6 * SIZE(AO) fmul %st(3), %st FLD -6 * SIZE(AO) fmul %st(5), %st fsubrp %st, %st(3) fsubrp %st, %st(1) FLD -8 * SIZE(AO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef LT FLD -8 * SIZE(AO) fmul %st, %st(1) fmulp %st, %st(2) FLD -7 * SIZE(AO) fmul %st(1), %st FLD -7 * SIZE(AO) fmul %st(3), %st fsubrp %st, %st(5) fsubrp %st, %st(3) FLD -5 * SIZE(AO) fmul %st, %st(3) fmulp %st, %st(4) #endif #ifdef RN FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(3) FLD -7 * SIZE(BO) fmul %st(1), %st FLD -7 * SIZE(BO) fmul %st(4), %st fsubrp %st, %st(5) fsubrp %st, %st(2) FLD -5 * SIZE(BO) fmul %st, %st(2) fmulp %st, %st(4) #endif #ifdef RT FLD -5 * SIZE(BO) fmul %st, %st(2) fmulp %st, %st(4) FLD -6 * SIZE(BO) fmul %st(2), %st FLD -6 * SIZE(BO) fmul %st(5), %st fsubrp %st, %st(4) fsubrp %st, %st(1) FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(3) #endif #ifdef LN subq $2 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) fxch %st(1) fld %st FST -7 * SIZE(BO) fxch %st(2) fld %st FST -6 * SIZE(BO) fxch %st(3) fld %st FST -5 * SIZE(BO) FST 1 * SIZE(CO, LDC) FST 0 * SIZE(CO) FST 0 * SIZE(CO, LDC) FST 1 * SIZE(CO) #else fld %st FST -8 * SIZE(AO) fxch %st(2) fld %st FST -7 * SIZE(AO) fxch %st(1) fld %st FST -6 * SIZE(AO) fxch %st(3) fld %st FST -5 * SIZE(AO) FST 1 * SIZE(CO, LDC) FST 1 * SIZE(CO) FST 0 * SIZE(CO) FST 0 * SIZE(CO, LDC) #endif #ifndef LN addq $2 * SIZE, CO #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I jne .L11 ALIGN_4 .L20: movq M, %rax andq $1, %rax je .L29 ALIGN_4 .L21: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif fldz fldz #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -7 * SIZE(AO) FLD -6 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -5 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -6 * SIZE(AO) FLD -4 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -3 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -5 * SIZE(AO) FLD -2 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -1 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) addq $4 * SIZE,AO addq $8 * SIZE,BO decq %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, 
%rax #endif and $3, %rax je .L28 ALIGN_4 .L26: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) addq $1 * SIZE,AO addq $2 * SIZE,BO decq %rax jne .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) FLD -7 * SIZE(BO) fsubp %st, %st(2) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) FLD -7 * SIZE(AO) fsubp %st, %st(2) #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(AO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef RN FLD -8 * SIZE(BO) fmulp %st, %st(1) FLD -7 * SIZE(BO) fmul %st(1), %st fsubrp %st, %st(2) FLD -5 * SIZE(BO) fmulp %st, %st(2) #endif #ifdef RT FLD -5 * SIZE(BO) fmulp %st, %st(2) FLD -6 * SIZE(BO) fmul %st(2), %st fsubrp %st, %st(1) FLD -8 * SIZE(BO) fmulp %st, %st(1) #endif #ifdef LN subq $1 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) fxch %st(1) fld %st FST -7 * SIZE(BO) #else fld %st FST -8 * SIZE(AO) fxch %st(1) fld %st FST -7 * SIZE(AO) #endif FST 0 * SIZE(CO, LDC) FST 0 * SIZE(CO) #ifndef LN addq $1 * SIZE, CO #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L29: #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif decq J jne .L01 ALIGN_4 .L30: movq N, %rax testq $1, %rax je .L999 #if defined(LT) || defined(RN) movq A, AO #else movq A, %rax movq %rax, AORIG #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B #endif #ifdef RT subq LDC, C #endif movq C, CO #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $1, I je .L40 ALIGN_4 .L31: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #else movq B, BO #endif fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(BO) FLD -8 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -7 * SIZE(BO) FLD -6 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -5 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -6 * SIZE(BO) FLD -4 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -3 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -5 * SIZE(BO) FLD -2 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -1 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) addq $8 * SIZE,AO addq $4 * SIZE,BO decq %rax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif and $3, %rax je .L38 ALIGN_4 .L36: FLD -8 * SIZE(BO) FLD -8 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) addq $2 * SIZE,AO addq $1 * SIZE,BO decq %rax jne .L36 
ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) FLD -7 * SIZE(BO) fsubp %st, %st(2) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) FLD -7 * SIZE(AO) fsubp %st, %st(2) #endif #ifdef LN FLD -5 * SIZE(AO) fmulp %st, %st(2) FLD -6 * SIZE(AO) fmul %st(2), %st fsubrp %st, %st(1) FLD -8 * SIZE(AO) fmulp %st, %st(1) #endif #ifdef LT FLD -8 * SIZE(AO) fmulp %st, %st(1) FLD -7 * SIZE(AO) fmul %st(1), %st fsubrp %st, %st(2) FLD -5 * SIZE(AO) fmulp %st, %st(2) #endif #ifdef RN FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef RT FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef LN subq $2 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) fxch %st(1) fld %st FST -7 * SIZE(BO) #else fld %st FST -8 * SIZE(AO) fxch %st(1) fld %st FST -7 * SIZE(AO) #endif FST 1 * SIZE(CO) FST 0 * SIZE(CO) #ifndef LN addq $2 * SIZE, CO #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I jne .L31 ALIGN_4 .L40: movq M, %rax andq $1, %rax je .L49 ALIGN_4 .L41: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif fldz #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -7 * SIZE(AO) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -5 * SIZE(AO) FLD -5 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) addq $4 * SIZE,AO addq $4 * SIZE,BO decq %rax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif and $3, %rax je .L48 ALIGN_4 .L46: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) addq $1 * SIZE,AO addq $1 * SIZE,BO decq %rax jne .L46 ALIGN_4 .L48: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) #endif #ifdef LN FLD -8 * SIZE(AO) fmulp %st, %st(1) #endif #ifdef LT FLD -8 * SIZE(AO) fmulp %st, %st(1) #endif #ifdef RN FLD -8 * SIZE(BO) fmulp %st, %st(1) #endif #ifdef RT FLD -8 * SIZE(BO) fmulp %st, %st(1) #endif #ifdef LN subq $1 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) #else fld %st FST -8 * SIZE(AO) #endif FST 0 * SIZE(CO) #ifndef LN addq $1 * SIZE, CO #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L49: #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq BO, B #endif 
#ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/qtrsm_kernel_RT_2x2.S000066400000000000000000000450731313527062700213250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
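After the accumulation, each #if LN/LT/RN/RT branch above subtracts the accumulated product from the packed right-hand-side block (the FLD/fsubp group) and then applies the inverse of the 2x2 diagonal block before storing back to the packed buffer and to C. The solve uses only multiplies against the stored diagonal entries, which is consistent with the packed triangle holding reciprocal diagonals; treat that, and the a[] offsets below, as assumptions read off the LT branch (offsets -8, -7, -5). A minimal C sketch of that 2x2 forward substitution:

// Sketch of the LT-style 2x2 solve, assuming a[0] = 1/a11, a[1] = off-diagonal,
// a[3] = 1/a22, and that c[][] already holds rhs minus the accumulated product.
static void qtrsm_2x2_solve(long double c[2][2], const long double *a)
{
    c[0][0] *= a[0];             // scale the first solved row
    c[0][1] *= a[0];
    c[1][0] -= a[1] * c[0][0];   // eliminate it from the second row
    c[1][1] -= a[1] * c[0][1];
    c[1][0] *= a[3];             // scale the second row
    c[1][1] *= a[3];
}

The qtrsm_kernel_RT_2x2.S source that follows is the same micro-kernel with the order of the column-block loops swapped: it handles the odd-N remainder column before the two-column blocks, where the LT file does the opposite.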
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 #define N ARG2 #define K ARG3 #define A ARG4 #define B ARG5 #define C ARG6 #define LDC %r10 #define I %r12 #define J %r13 #define AO %r14 #define BO %r15 #define CO %rbp #define KK %r11 #define AORIG 48(%rsp) #define STACKSIZE 64 #define ALPHA 8 + STACKSIZE(%rsp) #define OFFSET 32 + STACKSIZE(%rsp) #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #else #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #define PREFETCHSIZE (5 + 4 * 10) PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) movq 24 + STACKSIZE(%rsp), LDC #if defined(TRMMKERNEL) && !defined(LEFT) movq OFFSET, %rax negq %rax movq %rax, KK #endif addq $8 * SIZE, A addq $8 * SIZE, B salq $BASE_SHIFT, LDC #ifdef LN movq M, %rax salq $BASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $BASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN movq OFFSET, %rax negq %rax movq %rax, KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, %rax testq $1, %rax je .L30 #if defined(LT) || defined(RN) movq A, AO #else movq A, %rax movq %rax, AORIG #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B #endif #ifdef RT subq LDC, C #endif movq C, CO #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $1, I je .L40 ALIGN_4 .L31: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #else movq B, BO #endif fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(BO) FLD -8 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -7 * SIZE(BO) FLD -6 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -5 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -6 * SIZE(BO) FLD -4 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -3 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -5 * SIZE(BO) FLD -2 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -1 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) addq $8 * SIZE,AO addq $4 * SIZE,BO decq %rax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif and $3, %rax je .L38 ALIGN_4 .L36: FLD -8 * SIZE(BO) FLD -8 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) addq $2 * SIZE,AO addq $1 * SIZE,BO decq %rax jne .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) FLD -7 * SIZE(BO) fsubp %st, %st(2) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) FLD -7 * SIZE(AO) fsubp %st, %st(2) #endif #ifdef LN FLD -5 * SIZE(AO) fmulp %st, %st(2) FLD -6 * SIZE(AO) fmul %st(2), %st fsubrp %st, %st(1) FLD -8 * 
SIZE(AO) fmulp %st, %st(1) #endif #ifdef LT FLD -8 * SIZE(AO) fmulp %st, %st(1) FLD -7 * SIZE(AO) fmul %st(1), %st fsubrp %st, %st(2) FLD -5 * SIZE(AO) fmulp %st, %st(2) #endif #ifdef RN FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef RT FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef LN subq $2 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) fxch %st(1) fld %st FST -7 * SIZE(BO) #else fld %st FST -8 * SIZE(AO) fxch %st(1) fld %st FST -7 * SIZE(AO) #endif FST 1 * SIZE(CO) FST 0 * SIZE(CO) #ifndef LN addq $2 * SIZE, CO #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I jne .L31 ALIGN_4 .L40: movq M, %rax andq $1, %rax je .L49 ALIGN_4 .L41: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif fldz #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -7 * SIZE(AO) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -5 * SIZE(AO) FLD -5 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) addq $4 * SIZE,AO addq $4 * SIZE,BO decq %rax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif and $3, %rax je .L48 ALIGN_4 .L46: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) addq $1 * SIZE,AO addq $1 * SIZE,BO decq %rax jne .L46 ALIGN_4 .L48: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) #endif #ifdef LN FLD -8 * SIZE(AO) fmulp %st, %st(1) #endif #ifdef LT FLD -8 * SIZE(AO) fmulp %st, %st(1) #endif #ifdef RN FLD -8 * SIZE(BO) fmulp %st, %st(1) #endif #ifdef RT FLD -8 * SIZE(BO) fmulp %st, %st(1) #endif #ifdef LN subq $1 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) #else fld %st FST -8 * SIZE(AO) #endif FST 0 * SIZE(CO) #ifndef LN addq $1 * SIZE, CO #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L49: #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L30: movq N, %rax sarq $1, %rax movq %rax, J je .L999 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, %rax movq %rax, AORIG #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B #endif lea (, LDC, 2), %rax #ifdef RT subq %rax, C #endif movq C, CO #ifndef RT addq %rax, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, 
I sarq $1, I je .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif fldz fldz fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) prefetchw 2 * SIZE(CO, LDC, 1) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) prefetchnta 2 * SIZE(CO, LDC, 1) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -5 * SIZE(BO) fmul %st, %st(2) FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) FLD -4 * SIZE(AO) FLD -4 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -3 * SIZE(BO) fmul %st, %st(2) FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) FLD -2 * SIZE(AO) FLD -2 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -1 * SIZE(BO) fmul %st, %st(2) FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) addq $8 * SIZE,AO addq $8 * SIZE,BO decq %rax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif and $3, %rax je .L18 ALIGN_4 .L16: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(4) faddp %st, %st(2) addq $2 * SIZE,AO addq $2 * SIZE,BO decq %rax jne .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) FLD -7 * SIZE(BO) fsubp %st, %st(2) FLD -6 * SIZE(BO) fsubp %st, %st(3) FLD -5 * SIZE(BO) fsubp %st, %st(4) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) FLD -7 * SIZE(AO) fsubp %st, %st(3) FLD -6 * SIZE(AO) fsubp %st, %st(2) FLD -5 * SIZE(AO) fsubp %st, %st(4) #endif #ifdef LN FLD -5 * SIZE(AO) fmul %st, %st(3) fmulp %st, %st(4) FLD -6 * SIZE(AO) fmul %st(3), %st FLD -6 * SIZE(AO) fmul %st(5), %st fsubrp %st, %st(3) fsubrp %st, %st(1) FLD -8 * SIZE(AO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef LT FLD -8 * SIZE(AO) fmul %st, %st(1) fmulp %st, %st(2) FLD -7 * SIZE(AO) fmul %st(1), %st FLD -7 * SIZE(AO) fmul %st(3), %st fsubrp %st, %st(5) fsubrp %st, %st(3) FLD -5 * SIZE(AO) fmul %st, %st(3) fmulp %st, %st(4) #endif #ifdef RN FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(3) FLD -7 * SIZE(BO) fmul %st(1), %st FLD -7 * SIZE(BO) fmul %st(4), %st fsubrp %st, %st(5) fsubrp %st, %st(2) FLD -5 * SIZE(BO) fmul %st, %st(2) fmulp %st, %st(4) #endif #ifdef RT FLD -5 * SIZE(BO) fmul %st, %st(2) fmulp %st, %st(4) FLD -6 * SIZE(BO) fmul %st(2), %st FLD -6 * SIZE(BO) fmul %st(5), %st fsubrp %st, %st(4) fsubrp %st, %st(1) FLD -8 * SIZE(BO) fmul %st, %st(1) fmulp %st, %st(3) #endif #ifdef LN subq $2 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * 
SIZE(BO) fxch %st(1) fld %st FST -7 * SIZE(BO) fxch %st(2) fld %st FST -6 * SIZE(BO) fxch %st(3) fld %st FST -5 * SIZE(BO) FST 1 * SIZE(CO, LDC) FST 0 * SIZE(CO) FST 0 * SIZE(CO, LDC) FST 1 * SIZE(CO) #else fld %st FST -8 * SIZE(AO) fxch %st(2) fld %st FST -7 * SIZE(AO) fxch %st(1) fld %st FST -6 * SIZE(AO) fxch %st(3) fld %st FST -5 * SIZE(AO) FST 1 * SIZE(CO, LDC) FST 1 * SIZE(CO) FST 0 * SIZE(CO) FST 0 * SIZE(CO, LDC) #endif #ifndef LN addq $2 * SIZE, CO #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I jne .L11 ALIGN_4 .L20: movq M, %rax andq $1, %rax je .L29 ALIGN_4 .L21: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif fldz fldz #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -7 * SIZE(AO) FLD -6 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -5 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -6 * SIZE(AO) FLD -4 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -3 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -5 * SIZE(AO) FLD -2 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -1 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) addq $4 * SIZE,AO addq $8 * SIZE,BO decq %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif and $3, %rax je .L28 ALIGN_4 .L26: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) addq $1 * SIZE,AO addq $2 * SIZE,BO decq %rax jne .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif salq $BASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) FLD -7 * SIZE(BO) fsubp %st, %st(2) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) FLD -7 * SIZE(AO) fsubp %st, %st(2) #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(AO) fmul %st, %st(1) fmulp %st, %st(2) #endif #ifdef RN FLD -8 * SIZE(BO) fmulp %st, %st(1) FLD -7 * SIZE(BO) fmul %st(1), %st fsubrp %st, %st(2) FLD -5 * SIZE(BO) fmulp %st, %st(2) #endif #ifdef RT FLD -5 * SIZE(BO) fmulp %st, %st(2) FLD -6 * SIZE(BO) fmul %st(2), %st fsubrp %st, %st(1) FLD -8 * SIZE(BO) fmulp %st, %st(1) #endif #ifdef LN subq $1 * SIZE, CO #endif #if defined(LN) || defined(LT) fld %st FST -8 * SIZE(BO) fxch %st(1) fld %st FST -7 * SIZE(BO) #else fld %st FST -8 * SIZE(AO) fxch %st(1) fld %st FST -7 * SIZE(AO) #endif FST 0 * SIZE(CO, LDC) FST 0 * SIZE(CO) #ifndef LN addq $1 * SIZE, CO #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L29: #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK 
#endif #ifdef RT subq $2, KK #endif decq J jne .L01 ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/rot.S000066400000000000000000000137361313527062700163240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
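rot.S below is the x87 plane-rotation kernel; both its unit-stride unrolled path (.L10) and its strided path (.L50) apply the usual BLAS Givens rotation to each element pair. For reference, here is a plain C model of the semantics (a sketch, not the shipped code); it walks the pointers by the given element increments exactly as the kernel does, while the kernel itself carries the values in x87 extended precision.

#include <stddef.h>

// Reference semantics of the plane rotation applied by the rot kernels:
//   x[i] <- c*x[i] + s*y[i],   y[i] <- c*y[i] - s*x[i]
static void rot_ref(size_t n, double *x, ptrdiff_t incx,
                    double *y, ptrdiff_t incy, double c, double s)
{
    for (size_t i = 0; i < n; i++) {
        double xi = *x, yi = *y;
        *x = c * xi + s * yi;
        *y = c * yi - s * xi;
        x += incx;   // increments are in elements, matching the kernel loop
        y += incy;
    }
}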
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N ARG1 #define X ARG2 #define INCX ARG3 #define Y ARG4 #ifndef WINDOWS_ABI #define INCY ARG5 /* r8 */ #else #define INCY %r10 #endif #define I %rax #include "l1param.h" PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 48(%rsp), INCY FLD 72(%rsp) FLD 56(%rsp) #else FLD 24(%rsp) FLD 8(%rsp) #endif salq $BASE_SHIFT, INCX salq $BASE_SHIFT, INCY testq N, N jle .L999 cmpq $SIZE, INCX jne .L50 cmpq $SIZE, INCY jne .L50 movq N, I sarq $2, I jle .L15 ALIGN_4 .L10: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) FLD 1 * SIZE(X) FLD 1 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 1 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 1 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif FLD 2 * SIZE(X) FLD 2 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 2 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 2 * SIZE(Y) FLD 3 * SIZE(X) FLD 3 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 3 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 3 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y decq I jg .L10 ALIGN_4 .L15: movq N, I andq $3, I jle .L999 ALIGN_4 .L16: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) addq $SIZE, X addq $SIZE, Y decq I jg .L16 jmp .L999 ALIGN_4 .L50: movq N, I sarq $2, I jle .L55 ALIGN_4 .L51: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) addq INCX, X addq INCY, Y FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) addq INCX, X addq INCY, Y FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) addq INCX, X addq INCY, Y FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) addq INCX, X addq INCY, Y decq I jg .L51 ALIGN_4 .L55: movq N, I andq $3, I jle .L999 ALIGN_4 .L56: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) addq INCX, X addq INCY, Y decq I jg .L56 ALIGN_4 .L999: ffreep %st ffreep %st ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/rot_sse.S000066400000000000000000000450071313527062700171720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
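rot_sse.S below is the single-precision SSE version of the same rotation. It first peels up to three leading elements so that X becomes 16-byte aligned, then selects either an all-aligned inner loop or a movsd/movhps path when Y cannot also be aligned, unrolling by 32 elements with software prefetch. The intrinsics sketch below illustrates the arithmetic of one 4-wide step only; it is not the shipped kernel, and it uses unaligned loads where the kernel, after its alignment peel, uses aligned ones.

#include <xmmintrin.h>  // SSE intrinsics

// One 4-wide step of the single-precision rotation: x <- c*x + s*y, y <- c*y - s*x.
static void srot_step4(float *x, float *y, __m128 vc, __m128 vs)
{
    __m128 vx = _mm_loadu_ps(x);
    __m128 vy = _mm_loadu_ps(y);
    __m128 nx = _mm_add_ps(_mm_mul_ps(vc, vx), _mm_mul_ps(vs, vy));
    __m128 ny = _mm_sub_ps(_mm_mul_ps(vc, vy), _mm_mul_ps(vs, vx));
    _mm_storeu_ps(x, nx);
    _mm_storeu_ps(y, ny);
}

In the kernel, c and s are broadcast once before the loop (pshufd $0x0 into the C and S registers). The double-precision rot_sse2.S further down follows the same structure with the pd instruction forms and, when Y is misaligned relative to X, synthesizes each Y pair from two aligned loads with SHUFPD instead of issuing unaligned loads.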
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define Y ARG4 /* rcx */ #ifndef WINDOWS_ABI #define INCY ARG5 /* r8 */ #else #define INCY %r10 #endif #define C %xmm14 #define S %xmm15 #include "l1param.h" PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), INCY movss 48(%rsp), %xmm0 movss 56(%rsp), %xmm1 #endif SAVEREGISTERS leaq (, INCX, SIZE), INCX leaq (, INCY, SIZE), INCY pshufd $0x0, %xmm0, C pshufd $0x0, %xmm1, S cmpq $0, N jle .L999 cmpq $SIZE, INCX jne .L50 cmpq $SIZE, INCY jne .L50 testq $SIZE, X je .L05 movss 0 * SIZE(Y), %xmm1 movss 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulss C, %xmm0 mulss S, %xmm1 mulss C, %xmm2 mulss S, %xmm3 addss %xmm1, %xmm0 subss %xmm3, %xmm2 movss %xmm0, 0 * SIZE(X) movss %xmm2, 0 * SIZE(Y) addq $1 * SIZE, X addq $1 * SIZE, Y decq N jle .L999 .L05: testq $2 * SIZE, X je .L10 cmpq $1, N je .L17 movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y subq $2, N jle .L999 ALIGN_2 .L10: testq $3 * SIZE, Y jne .L20 movq N, %rax sarq $5, %rax jle .L14 movaps 0 * SIZE(Y), %xmm1 movaps 4 * SIZE(Y), %xmm3 movaps 8 * SIZE(Y), %xmm9 movaps 12 * SIZE(Y), %xmm11 movaps 0 * SIZE(X), %xmm0 movaps 4 * SIZE(X), %xmm2 movaps 8 * SIZE(X), %xmm8 movaps 12 * SIZE(X), %xmm10 decq %rax jle .L12 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm1, %xmm4 mulps S, %xmm1 movaps %xmm3, %xmm6 mulps S, %xmm3 movaps %xmm0, %xmm5 mulps C, %xmm0 movaps %xmm2, %xmm7 mulps C, %xmm2 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps 
S, %xmm7 addps %xmm1, %xmm0 movaps 16 * SIZE(Y), %xmm1 addps %xmm3, %xmm2 movaps 20 * SIZE(Y), %xmm3 subps %xmm5, %xmm4 subps %xmm7, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps %xmm0, 0 * SIZE(X) movaps 16 * SIZE(X), %xmm0 movaps %xmm2, 4 * SIZE(X) movaps 20 * SIZE(X), %xmm2 movaps %xmm4, 0 * SIZE(Y) movaps %xmm6, 4 * SIZE(Y) movaps %xmm9, %xmm4 mulps S, %xmm9 movaps %xmm8, %xmm5 mulps C, %xmm8 movaps %xmm11, %xmm6 mulps S, %xmm11 movaps %xmm10, %xmm7 mulps C, %xmm10 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm9, %xmm8 movaps 24 * SIZE(Y), %xmm9 addps %xmm11, %xmm10 movaps 28 * SIZE(Y), %xmm11 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm8, 8 * SIZE(X) movaps 24 * SIZE(X), %xmm8 movaps %xmm10,12 * SIZE(X) movaps 28 * SIZE(X), %xmm10 movaps %xmm4, 8 * SIZE(Y) movaps %xmm6, 12 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm1, %xmm4 mulps S, %xmm1 movaps %xmm3, %xmm6 mulps S, %xmm3 movaps %xmm0, %xmm5 mulps C, %xmm0 movaps %xmm2, %xmm7 mulps C, %xmm2 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 movaps 32 * SIZE(Y), %xmm1 addps %xmm3, %xmm2 movaps 36 * SIZE(Y), %xmm3 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 16 * SIZE(X) movaps 32 * SIZE(X), %xmm0 movaps %xmm2, 20 * SIZE(X) movaps 36 * SIZE(X), %xmm2 movaps %xmm4, 16 * SIZE(Y) movaps %xmm6, 20 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps %xmm9, %xmm4 mulps S, %xmm9 movaps %xmm8, %xmm5 mulps C, %xmm8 movaps %xmm11, %xmm6 mulps S, %xmm11 movaps %xmm10, %xmm7 mulps C, %xmm10 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm9, %xmm8 movaps 40 * SIZE(Y), %xmm9 addps %xmm11, %xmm10 movaps 44 * SIZE(Y), %xmm11 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm8, 24 * SIZE(X) movaps 40 * SIZE(X), %xmm8 movaps %xmm10, 28 * SIZE(X) movaps 44 * SIZE(X), %xmm10 movaps %xmm4, 24 * SIZE(Y) movaps %xmm6, 28 * SIZE(Y) addq $32 * SIZE, X addq $32 * SIZE, Y decq %rax jg .L11 ALIGN_3 .L12: movaps %xmm1, %xmm4 mulps S, %xmm1 movaps %xmm3, %xmm6 mulps S, %xmm3 movaps %xmm0, %xmm5 mulps C, %xmm0 movaps %xmm2, %xmm7 mulps C, %xmm2 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 movaps 16 * SIZE(Y), %xmm1 addps %xmm3, %xmm2 movaps 20 * SIZE(Y), %xmm3 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 0 * SIZE(X) movaps 16 * SIZE(X), %xmm0 movaps %xmm2, 4 * SIZE(X) movaps 20 * SIZE(X), %xmm2 movaps %xmm4, 0 * SIZE(Y) movaps %xmm6, 4 * SIZE(Y) movaps %xmm9, %xmm4 mulps S, %xmm9 movaps %xmm8, %xmm5 mulps C, %xmm8 movaps %xmm11, %xmm6 mulps S, %xmm11 movaps %xmm10, %xmm7 mulps C, %xmm10 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm9, %xmm8 movaps 24 * SIZE(Y), %xmm9 addps %xmm11, %xmm10 movaps 28 * SIZE(Y), %xmm11 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm8, 8 * SIZE(X) movaps 24 * SIZE(X), %xmm8 movaps %xmm10,12 * SIZE(X) movaps 28 * SIZE(X), %xmm10 movaps %xmm4, 8 * SIZE(Y) movaps %xmm6, 12 * SIZE(Y) movaps %xmm1, %xmm4 mulps S, %xmm1 movaps %xmm3, %xmm6 mulps S, %xmm3 movaps %xmm0, %xmm5 mulps C, %xmm0 movaps %xmm2, %xmm7 mulps C, %xmm2 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 16 * SIZE(X) movaps %xmm2, 20 * SIZE(X) movaps %xmm4, 16 * SIZE(Y) movaps %xmm6, 20 * SIZE(Y) movaps %xmm9, %xmm4 mulps S, %xmm9 movaps %xmm8, %xmm5 mulps C, %xmm8 movaps %xmm11, %xmm6 mulps S, %xmm11 movaps 
%xmm10, %xmm7 mulps C, %xmm10 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm9, %xmm8 addps %xmm11, %xmm10 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm8, 24 * SIZE(X) movaps %xmm10, 28 * SIZE(X) movaps %xmm4, 24 * SIZE(Y) movaps %xmm6, 28 * SIZE(Y) addq $32 * SIZE, X addq $32 * SIZE, Y ALIGN_3 .L14: testq $31, N jle .L999 testq $16, N jle .L15 movaps 0 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps 4 * SIZE(Y), %xmm3 movaps 4 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 0 * SIZE(X) movaps %xmm2, 4 * SIZE(X) movaps %xmm4, 0 * SIZE(Y) movaps %xmm6, 4 * SIZE(Y) movaps 8 * SIZE(Y), %xmm1 movaps 8 * SIZE(X), %xmm0 movaps 12 * SIZE(Y), %xmm3 movaps 12 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 8 * SIZE(X) movaps %xmm2, 12 * SIZE(X) movaps %xmm4, 8 * SIZE(Y) movaps %xmm6, 12 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L15: testq $8, N jle .L16 movaps 0 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps 4 * SIZE(Y), %xmm3 movaps 4 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 0 * SIZE(X) movaps %xmm2, 4 * SIZE(X) movaps %xmm4, 0 * SIZE(Y) movaps %xmm6, 4 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L16: testq $4, N jle .L17 movaps 0 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movaps %xmm2, 0 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L17: testq $2, N jle .L18 movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L18: testq $1, N jle .L999 movss 0 * SIZE(Y), %xmm1 movss 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulss C, %xmm0 mulss S, %xmm1 mulss C, %xmm2 mulss S, %xmm3 addss %xmm1, %xmm0 subss %xmm3, %xmm2 movss %xmm0, 0 * SIZE(X) movss %xmm2, 0 * SIZE(Y) jmp .L999 ALIGN_3 .L20: movq N, %rax sarq $5, %rax jle .L24 ALIGN_3 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movsd 4 * SIZE(Y), %xmm3 movhps 6 * SIZE(Y), %xmm3 movaps 0 * SIZE(X), %xmm0 movaps 4 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 0 * SIZE(X) movaps %xmm2, 4 * SIZE(X) movlps %xmm4, 0 * SIZE(Y) movhps %xmm4, 2 * SIZE(Y) movlps %xmm6, 4 * SIZE(Y) movhps %xmm6, 6 
* SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movsd 8 * SIZE(Y), %xmm1 movhps 10 * SIZE(Y), %xmm1 movsd 12 * SIZE(Y), %xmm3 movhps 14 * SIZE(Y), %xmm3 movaps 8 * SIZE(X), %xmm0 movaps 12 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 8 * SIZE(X) movaps %xmm2, 12 * SIZE(X) movlps %xmm4, 8 * SIZE(Y) movhps %xmm4, 10 * SIZE(Y) movlps %xmm6, 12 * SIZE(Y) movhps %xmm6, 14 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movsd 16 * SIZE(Y), %xmm1 movhps 18 * SIZE(Y), %xmm1 movsd 20 * SIZE(Y), %xmm3 movhps 22 * SIZE(Y), %xmm3 movaps 16 * SIZE(X), %xmm0 movaps 20 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 16 * SIZE(X) movaps %xmm2, 20 * SIZE(X) movlps %xmm4, 16 * SIZE(Y) movhps %xmm4, 18 * SIZE(Y) movlps %xmm6, 20 * SIZE(Y) movhps %xmm6, 22 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movsd 24 * SIZE(Y), %xmm1 movhps 26 * SIZE(Y), %xmm1 movsd 28 * SIZE(Y), %xmm3 movhps 30 * SIZE(Y), %xmm3 movaps 24 * SIZE(X), %xmm0 movaps 28 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 24 * SIZE(X) movaps %xmm2, 28 * SIZE(X) movlps %xmm4, 24 * SIZE(Y) movhps %xmm4, 26 * SIZE(Y) movlps %xmm6, 28 * SIZE(Y) movhps %xmm6, 30 * SIZE(Y) addq $32 * SIZE, X addq $32 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L24: testq $31, N jle .L999 testq $16, N jle .L25 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movsd 4 * SIZE(Y), %xmm3 movhps 6 * SIZE(Y), %xmm3 movaps 0 * SIZE(X), %xmm0 movaps 4 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 0 * SIZE(X) movaps %xmm2, 4 * SIZE(X) movlps %xmm4, 0 * SIZE(Y) movhps %xmm4, 2 * SIZE(Y) movlps %xmm6, 4 * SIZE(Y) movhps %xmm6, 6 * SIZE(Y) movsd 8 * SIZE(Y), %xmm1 movhps 10 * SIZE(Y), %xmm1 movsd 12 * SIZE(Y), %xmm3 movhps 14 * SIZE(Y), %xmm3 movaps 8 * SIZE(X), %xmm0 movaps 12 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 8 * SIZE(X) movaps %xmm2, 12 * SIZE(X) movlps %xmm4, 8 * SIZE(Y) movhps %xmm4, 10 * SIZE(Y) movlps %xmm6, 12 * SIZE(Y) movhps %xmm6, 14 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L25: testq $8, N jle .L26 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movsd 4 * SIZE(Y), %xmm3 movhps 6 * SIZE(Y), %xmm3 movaps 0 * SIZE(X), %xmm0 movaps 4 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps 
%xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 0 * SIZE(X) movaps %xmm2, 4 * SIZE(X) movlps %xmm4, 0 * SIZE(Y) movhps %xmm4, 2 * SIZE(Y) movlps %xmm6, 4 * SIZE(Y) movhps %xmm6, 6 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L26: testq $4, N jle .L27 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 2 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L27: testq $2, N jle .L28 movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L28: testq $1, N jle .L999 movss 0 * SIZE(Y), %xmm1 movss 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulss C, %xmm0 mulss S, %xmm1 mulss C, %xmm2 mulss S, %xmm3 addss %xmm1, %xmm0 subss %xmm3, %xmm2 movss %xmm0, 0 * SIZE(X) movss %xmm2, 0 * SIZE(Y) jmp .L999 ALIGN_3 .L50: movq N, %rax sarq $2, %rax jle .L55 ALIGN_3 .L53: movss (Y), %xmm1 movss (X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulss C, %xmm0 mulss S, %xmm1 mulss C, %xmm2 mulss S, %xmm3 addss %xmm1, %xmm0 subss %xmm3, %xmm2 movss %xmm0, (X) movss %xmm2, (Y) addq INCX, X addq INCY, Y movss (Y), %xmm1 movss (X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulss C, %xmm0 mulss S, %xmm1 mulss C, %xmm2 mulss S, %xmm3 addss %xmm1, %xmm0 subss %xmm3, %xmm2 movss %xmm0, (X) movss %xmm2, (Y) addq INCX, X addq INCY, Y movss (Y), %xmm1 movss (X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulss C, %xmm0 mulss S, %xmm1 mulss C, %xmm2 mulss S, %xmm3 addss %xmm1, %xmm0 subss %xmm3, %xmm2 movss %xmm0, (X) movss %xmm2, (Y) addq INCX, X addq INCY, Y movss (Y), %xmm1 movss (X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulss C, %xmm0 mulss S, %xmm1 mulss C, %xmm2 mulss S, %xmm3 addss %xmm1, %xmm0 subss %xmm3, %xmm2 movss %xmm0, (X) movss %xmm2, (Y) addq INCX, X addq INCY, Y decq %rax jg .L53 ALIGN_3 .L55: movq N, %rax andq $3, %rax jle .L999 ALIGN_3 .L56: movss (Y), %xmm1 movss (X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulss C, %xmm0 mulss S, %xmm1 mulss C, %xmm2 mulss S, %xmm3 addss %xmm1, %xmm0 subss %xmm3, %xmm2 movss %xmm0, (X) movss %xmm2, (Y) addq INCX, X addq INCY, Y decq %rax jg .L56 ALIGN_3 .L999: RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/rot_sse2.S000066400000000000000000000423641313527062700172570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define Y ARG4 /* rcx */ #ifndef WINDOWS_ABI #define INCY ARG5 /* r8 */ #else #define INCY %r10 #endif #define C %xmm14 #define S %xmm15 #include "l1param.h" PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), INCY movsd 48(%rsp), %xmm0 movsd 56(%rsp), %xmm1 #endif SAVEREGISTERS leaq (, INCX, SIZE), INCX leaq (, INCY, SIZE), INCY pshufd $0x44, %xmm0, C pshufd $0x44, %xmm1, S cmpq $0, N jle .L999 cmpq $SIZE, INCX jne .L50 cmpq $SIZE, INCY jne .L50 testq $SIZE, X je .L10 movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulsd C, %xmm0 mulsd S, %xmm1 mulsd C, %xmm2 mulsd S, %xmm3 addsd %xmm1, %xmm0 subsd %xmm3, %xmm2 movsd %xmm0, 0 * SIZE(X) movsd %xmm2, 0 * SIZE(Y) addq $1 * SIZE, X addq $1 * SIZE, Y decq N jle .L999 ALIGN_2 .L10: testq $SIZE, Y jne .L20 movq N, %rax sarq $4, %rax jle .L14 movaps 0 * SIZE(Y), %xmm1 movaps 2 * SIZE(Y), %xmm3 movaps 4 * SIZE(Y), %xmm9 movaps 6 * SIZE(Y), %xmm11 movaps 0 * SIZE(X), %xmm0 movaps 2 * SIZE(X), %xmm2 movaps 4 * SIZE(X), %xmm8 movaps 6 * SIZE(X), %xmm10 decq %rax jle .L12 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm1, %xmm4 mulpd S, %xmm1 movaps %xmm3, %xmm6 mulpd S, %xmm3 movaps %xmm0, %xmm5 mulpd C, %xmm0 movaps %xmm2, %xmm7 mulpd C, %xmm2 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 movaps 8 * SIZE(Y), %xmm1 addpd %xmm3, %xmm2 movaps 10 * SIZE(Y), %xmm3 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps %xmm0, 0 * SIZE(X) movaps 8 * SIZE(X), %xmm0 movaps %xmm2, 2 * SIZE(X) movaps 10 * SIZE(X), %xmm2 movaps %xmm4, 0 * SIZE(Y) movaps %xmm6, 2 * SIZE(Y) movaps %xmm9, %xmm4 mulpd S, %xmm9 movaps %xmm8, %xmm5 mulpd C, %xmm8 movaps %xmm11, %xmm6 mulpd S, %xmm11 movaps %xmm10, %xmm7 mulpd C, %xmm10 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm9, %xmm8 movaps 12 * SIZE(Y), %xmm9 addpd %xmm11, %xmm10 movaps 14 * SIZE(Y), %xmm11 subpd %xmm5, %xmm4 subpd 
%xmm7, %xmm6 movaps %xmm8, 4 * SIZE(X) movaps 12 * SIZE(X), %xmm8 movaps %xmm10,6 * SIZE(X) movaps 14 * SIZE(X), %xmm10 movaps %xmm4, 4 * SIZE(Y) movaps %xmm6, 6 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm1, %xmm4 mulpd S, %xmm1 movaps %xmm3, %xmm6 mulpd S, %xmm3 movaps %xmm0, %xmm5 mulpd C, %xmm0 movaps %xmm2, %xmm7 mulpd C, %xmm2 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 movaps 16 * SIZE(Y), %xmm1 addpd %xmm3, %xmm2 movaps 18 * SIZE(Y), %xmm3 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movaps %xmm0, 8 * SIZE(X) movaps 16 * SIZE(X), %xmm0 movaps %xmm2, 10 * SIZE(X) movaps 18 * SIZE(X), %xmm2 movaps %xmm4, 8 * SIZE(Y) movaps %xmm6, 10 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps %xmm9, %xmm4 mulpd S, %xmm9 movaps %xmm8, %xmm5 mulpd C, %xmm8 movaps %xmm11, %xmm6 mulpd S, %xmm11 movaps %xmm10, %xmm7 mulpd C, %xmm10 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm9, %xmm8 movaps 20 * SIZE(Y), %xmm9 addpd %xmm11, %xmm10 movaps 22 * SIZE(Y), %xmm11 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movaps %xmm8, 12 * SIZE(X) movaps 20 * SIZE(X), %xmm8 movaps %xmm10, 14 * SIZE(X) movaps 22 * SIZE(X), %xmm10 movaps %xmm4, 12 * SIZE(Y) movaps %xmm6, 14 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y decq %rax jg .L11 ALIGN_3 .L12: movaps %xmm1, %xmm4 mulpd S, %xmm1 movaps %xmm3, %xmm6 mulpd S, %xmm3 movaps %xmm0, %xmm5 mulpd C, %xmm0 movaps %xmm2, %xmm7 mulpd C, %xmm2 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 movaps 8 * SIZE(Y), %xmm1 addpd %xmm3, %xmm2 movaps 10 * SIZE(Y), %xmm3 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movaps %xmm0, 0 * SIZE(X) movaps 8 * SIZE(X), %xmm0 movaps %xmm2, 2 * SIZE(X) movaps 10 * SIZE(X), %xmm2 movaps %xmm4, 0 * SIZE(Y) movaps %xmm6, 2 * SIZE(Y) movaps %xmm9, %xmm4 mulpd S, %xmm9 movaps %xmm8, %xmm5 mulpd C, %xmm8 movaps %xmm11, %xmm6 mulpd S, %xmm11 movaps %xmm10, %xmm7 mulpd C, %xmm10 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm9, %xmm8 movaps 12 * SIZE(Y), %xmm9 addpd %xmm11, %xmm10 movaps 14 * SIZE(Y), %xmm11 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movaps %xmm8, 4 * SIZE(X) movaps 12 * SIZE(X), %xmm8 movaps %xmm10,6 * SIZE(X) movaps 14 * SIZE(X), %xmm10 movaps %xmm4, 4 * SIZE(Y) movaps %xmm6, 6 * SIZE(Y) movaps %xmm1, %xmm4 mulpd S, %xmm1 movaps %xmm3, %xmm6 mulpd S, %xmm3 movaps %xmm0, %xmm5 mulpd C, %xmm0 movaps %xmm2, %xmm7 mulpd C, %xmm2 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movaps %xmm0, 8 * SIZE(X) movaps %xmm2, 10 * SIZE(X) movaps %xmm4, 8 * SIZE(Y) movaps %xmm6, 10 * SIZE(Y) movaps %xmm9, %xmm4 mulpd S, %xmm9 movaps %xmm8, %xmm5 mulpd C, %xmm8 movaps %xmm11, %xmm6 mulpd S, %xmm11 movaps %xmm10, %xmm7 mulpd C, %xmm10 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movaps %xmm8, 12 * SIZE(X) movaps %xmm10, 14 * SIZE(X) movaps %xmm4, 12 * SIZE(Y) movaps %xmm6, 14 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L14: testq $15, N jle .L999 testq $8, N jle .L15 movaps 0 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps 2 * SIZE(Y), %xmm3 movaps 2 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd 
%xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movaps %xmm0, 0 * SIZE(X) movaps %xmm2, 2 * SIZE(X) movaps %xmm4, 0 * SIZE(Y) movaps %xmm6, 2 * SIZE(Y) movaps 4 * SIZE(Y), %xmm1 movaps 4 * SIZE(X), %xmm0 movaps 6 * SIZE(Y), %xmm3 movaps 6 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movaps %xmm0, 4 * SIZE(X) movaps %xmm2, 6 * SIZE(X) movaps %xmm4, 4 * SIZE(Y) movaps %xmm6, 6 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L15: testq $4, N jle .L16 movaps 0 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps 2 * SIZE(Y), %xmm3 movaps 2 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movaps %xmm0, 0 * SIZE(X) movaps %xmm2, 2 * SIZE(X) movaps %xmm4, 0 * SIZE(Y) movaps %xmm6, 2 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L16: testq $2, N jle .L17 movaps 0 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movaps %xmm2, 0 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L17: testq $1, N jle .L999 movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulsd C, %xmm0 mulsd S, %xmm1 mulsd C, %xmm2 mulsd S, %xmm3 addsd %xmm1, %xmm0 subsd %xmm3, %xmm2 movsd %xmm0, 0 * SIZE(X) movsd %xmm2, 0 * SIZE(Y) jmp .L999 ALIGN_3 .L20: movaps -1 * SIZE(Y), %xmm1 movq N, %rax sarq $4, %rax jle .L24 ALIGN_3 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps 1 * SIZE(Y), %xmm3 movaps 3 * SIZE(Y), %xmm8 movaps 0 * SIZE(X), %xmm0 movaps 2 * SIZE(X), %xmm2 SHUFPD_1 %xmm3, %xmm1 SHUFPD_1 %xmm8, %xmm3 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movaps %xmm0, 0 * SIZE(X) movaps %xmm2, 2 * SIZE(X) movlpd %xmm4, 0 * SIZE(Y) movhps %xmm4, 1 * SIZE(Y) movlpd %xmm6, 2 * SIZE(Y) movhps %xmm6, 3 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps 5 * SIZE(Y), %xmm9 movaps 7 * SIZE(Y), %xmm1 movaps 4 * SIZE(X), %xmm0 movaps 6 * SIZE(X), %xmm2 SHUFPD_1 %xmm9, %xmm8 SHUFPD_1 %xmm1, %xmm9 movaps %xmm8, %xmm4 movaps %xmm0, %xmm5 movaps %xmm9, %xmm6 movaps %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm8 mulpd C, %xmm2 mulpd S, %xmm9 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm8, %xmm0 addpd %xmm9, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movaps %xmm0, 4 * SIZE(X) movaps %xmm2, 6 * SIZE(X) movlpd %xmm4, 4 * SIZE(Y) movhps %xmm4, 5 * SIZE(Y) movlpd %xmm6, 6 * SIZE(Y) movhps %xmm6, 7 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps 9 * SIZE(Y), %xmm3 movaps 11 * SIZE(Y), %xmm8 movaps 8 * SIZE(X), %xmm0 movaps 10 * SIZE(X), %xmm2 SHUFPD_1 %xmm3, %xmm1 SHUFPD_1 %xmm8, %xmm3 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, 
%xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movaps %xmm0, 8 * SIZE(X) movaps %xmm2, 10 * SIZE(X) movlpd %xmm4, 8 * SIZE(Y) movhps %xmm4, 9 * SIZE(Y) movlpd %xmm6, 10 * SIZE(Y) movhps %xmm6, 11 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps 13 * SIZE(Y), %xmm9 movaps 15 * SIZE(Y), %xmm1 movaps 12 * SIZE(X), %xmm0 movaps 14 * SIZE(X), %xmm2 SHUFPD_1 %xmm9, %xmm8 SHUFPD_1 %xmm1, %xmm9 movaps %xmm8, %xmm4 movaps %xmm0, %xmm5 movaps %xmm9, %xmm6 movaps %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm8 mulpd C, %xmm2 mulpd S, %xmm9 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm8, %xmm0 addpd %xmm9, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movaps %xmm0, 12 * SIZE(X) movaps %xmm2, 14 * SIZE(X) movlpd %xmm4, 12 * SIZE(Y) movhps %xmm4, 13 * SIZE(Y) movlpd %xmm6, 14 * SIZE(Y) movhps %xmm6, 15 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L24: testq $15, N jle .L999 testq $8, N jle .L25 movaps 1 * SIZE(Y), %xmm3 movaps 3 * SIZE(Y), %xmm8 movaps 0 * SIZE(X), %xmm0 movaps 2 * SIZE(X), %xmm2 SHUFPD_1 %xmm3, %xmm1 SHUFPD_1 %xmm8, %xmm3 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movaps %xmm0, 0 * SIZE(X) movaps %xmm2, 2 * SIZE(X) movlpd %xmm4, 0 * SIZE(Y) movhps %xmm4, 1 * SIZE(Y) movlpd %xmm6, 2 * SIZE(Y) movhps %xmm6, 3 * SIZE(Y) movaps 5 * SIZE(Y), %xmm9 movaps 7 * SIZE(Y), %xmm1 movaps 4 * SIZE(X), %xmm0 movaps 6 * SIZE(X), %xmm2 SHUFPD_1 %xmm9, %xmm8 SHUFPD_1 %xmm1, %xmm9 movaps %xmm8, %xmm4 movaps %xmm0, %xmm5 movaps %xmm9, %xmm6 movaps %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm8 mulpd C, %xmm2 mulpd S, %xmm9 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm8, %xmm0 addpd %xmm9, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movaps %xmm0, 4 * SIZE(X) movaps %xmm2, 6 * SIZE(X) movlpd %xmm4, 4 * SIZE(Y) movhps %xmm4, 5 * SIZE(Y) movlpd %xmm6, 6 * SIZE(Y) movhps %xmm6, 7 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L25: testq $4, N jle .L26 movaps 1 * SIZE(Y), %xmm3 movaps 3 * SIZE(Y), %xmm8 movaps 0 * SIZE(X), %xmm0 movaps 2 * SIZE(X), %xmm2 SHUFPD_1 %xmm3, %xmm1 SHUFPD_1 %xmm8, %xmm3 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movaps %xmm0, 0 * SIZE(X) movaps %xmm2, 2 * SIZE(X) movlpd %xmm4, 0 * SIZE(Y) movhps %xmm4, 1 * SIZE(Y) movlpd %xmm6, 2 * SIZE(Y) movhps %xmm6, 3 * SIZE(Y) movaps %xmm8, %xmm1 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L26: testq $2, N jle .L27 movaps 1 * SIZE(Y), %xmm4 movaps 0 * SIZE(X), %xmm0 SHUFPD_1 %xmm4, %xmm1 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movlpd %xmm2, 0 * SIZE(Y) movhps %xmm2, 1 * SIZE(Y) movaps %xmm4, %xmm1 addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L27: testq $1, N jle .L999 unpckhpd %xmm1, %xmm1 movsd 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulsd C, %xmm0 mulsd S, %xmm1 mulsd C, %xmm2 mulsd S, %xmm3 addsd %xmm1, %xmm0 subsd %xmm3, 
%xmm2 movsd %xmm0, 0 * SIZE(X) movsd %xmm2, 0 * SIZE(Y) jmp .L999 ALIGN_3 .L50: movq N, %rax cmpq $0, INCX je .L56 cmpq $0, INCY je .L56 sarq $2, %rax jle .L55 ALIGN_3 .L53: movsd (Y), %xmm1 movhps (Y, INCY), %xmm1 movsd (X), %xmm0 movhps (X, INCX), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlpd %xmm0, (X) movhps %xmm0, (X, INCX) movlpd %xmm2, (Y) movhps %xmm2, (Y, INCY) leaq (X, INCX, 2), X leaq (Y, INCY, 2), Y movsd (Y), %xmm1 movhps (Y, INCY), %xmm1 movsd (X), %xmm0 movhps (X, INCX), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlpd %xmm0, (X) movhps %xmm0, (X, INCX) movlpd %xmm2, (Y) movhps %xmm2, (Y, INCY) leaq (X, INCX, 2), X leaq (Y, INCY, 2), Y decq %rax jg .L53 ALIGN_3 .L55: movq N, %rax andq $3, %rax jle .L999 ALIGN_3 .L56: movsd (Y), %xmm1 movsd (X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulsd C, %xmm0 mulsd S, %xmm1 mulsd C, %xmm2 mulsd S, %xmm3 addsd %xmm1, %xmm0 subsd %xmm3, %xmm2 movsd %xmm0, (X) movsd %xmm2, (Y) addq INCX, X addq INCY, Y decq %rax jg .L56 ALIGN_3 .L999: RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/saxpy.c000066400000000000000000000066511313527062700167020ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" #if defined(NEHALEM) #include "saxpy_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "saxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "saxpy_microk_sandy-2.c" #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "saxpy_microk_piledriver-2.c" #endif #ifndef HAVE_KERNEL_16 static void saxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; FLOAT a = *alpha; while(i < n) { y[i] += a * x[i]; y[i+1] += a * x[i+1]; y[i+2] += a * x[i+2]; y[i+3] += a * x[i+3]; y[i+4] += a * x[i+4]; y[i+5] += a * x[i+5]; y[i+6] += a * x[i+6]; y[i+7] += a * x[i+7]; i+=8 ; } } #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; BLASLONG ix=0,iy=0; if ( n <= 0 ) return(0); if ( (inc_x == 1) && (inc_y == 1) ) { BLASLONG n1 = n & -32; if ( n1 ) saxpy_kernel_16(n1, x, y , &da ); i = n1; while(i < n) { y[i] += da * x[i] ; i++ ; } return(0); } BLASLONG n1 = n & -4; while(i < n1) { FLOAT m1 = da * x[ix] ; FLOAT m2 = da * x[ix+inc_x] ; FLOAT m3 = da * x[ix+2*inc_x] ; FLOAT m4 = da * x[ix+3*inc_x] ; y[iy] += m1 ; y[iy+inc_y] += m2 ; y[iy+2*inc_y] += m3 ; y[iy+3*inc_y] += m4 ; ix += inc_x*4 ; iy += inc_y*4 ; i+=4 ; } while(i < n) { y[iy] += da * x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } return(0); } OpenBLAS-0.2.20/kernel/x86_64/saxpy_microk_haswell-2.c000066400000000000000000000062551313527062700221240ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_16 1 static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vbroadcastss (%4), %%ymm0 \n\t" // alpha ".align 16 \n\t" "1: \n\t" "vmovups (%3,%0,4), %%ymm12 \n\t" // 8 * y "vmovups 32(%3,%0,4), %%ymm13 \n\t" // 8 * y "vmovups 64(%3,%0,4), %%ymm14 \n\t" // 8 * y "vmovups 96(%3,%0,4), %%ymm15 \n\t" // 8 * y "vfmadd231ps (%2,%0,4), %%ymm0 , %%ymm12 \n\t" // y += alpha * x "vfmadd231ps 32(%2,%0,4), %%ymm0 , %%ymm13 \n\t" // y += alpha * x "vfmadd231ps 64(%2,%0,4), %%ymm0 , %%ymm14 \n\t" // y += alpha * x "vfmadd231ps 96(%2,%0,4), %%ymm0 , %%ymm15 \n\t" // y += alpha * x "vmovups %%ymm12, (%3,%0,4) \n\t" "vmovups %%ymm13, 32(%3,%0,4) \n\t" "vmovups %%ymm14, 64(%3,%0,4) \n\t" "vmovups %%ymm15, 96(%3,%0,4) \n\t" "addq $32, %0 \n\t" "subq $32, %1 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 : "cc", "%xmm0", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/saxpy_microk_nehalem-2.c000066400000000000000000000073011313527062700220670ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_16 1 static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "movss (%4), %%xmm0 \n\t" // alpha "shufps $0, %%xmm0, %%xmm0 \n\t" ".align 16 \n\t" "1: \n\t" // "prefetcht0 192(%2,%0,4) \n\t" // "prefetcht0 192(%3,%0,4) \n\t" "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x "movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x "movups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x "movups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x "movups (%3,%0,4), %%xmm8 \n\t" // 4 * y "movups 16(%3,%0,4), %%xmm9 \n\t" // 4 * y "movups 32(%3,%0,4), %%xmm10 \n\t" // 4 * y "movups 48(%3,%0,4), %%xmm11 \n\t" // 4 * y "mulps %%xmm0 , %%xmm12 \n\t" // alpha * x "mulps %%xmm0 , %%xmm13 \n\t" "mulps %%xmm0 , %%xmm14 \n\t" "mulps %%xmm0 , %%xmm15 \n\t" "addps %%xmm12, %%xmm8 \n\t" // y += alpha *x "addps %%xmm13, %%xmm9 \n\t" "addps %%xmm14, %%xmm10 \n\t" "addps %%xmm15, %%xmm11 \n\t" "movups %%xmm8 , (%3,%0,4) \n\t" "movups %%xmm9 , 16(%3,%0,4) \n\t" "movups %%xmm10, 32(%3,%0,4) \n\t" "movups %%xmm11, 48(%3,%0,4) \n\t" "addq $16, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 : "cc", "%xmm0", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/saxpy_microk_piledriver-2.c000066400000000000000000000145471313527062700226350ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_16 1 static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; if ( n < 1024 ) { __asm__ __volatile__ ( "vzeroupper \n\t" "vbroadcastss (%4), %%xmm0 \n\t" // alpha ".align 16 \n\t" "1: \n\t" "vmovups (%3,%0,4), %%xmm8 \n\t" // 4 * y "vmovups 16(%3,%0,4), %%xmm9 \n\t" // 4 * y "vmovups 32(%3,%0,4), %%xmm10 \n\t" // 4 * y "vmovups 48(%3,%0,4), %%xmm11 \n\t" // 4 * y "vmovups 64(%3,%0,4), %%xmm12 \n\t" // 4 * y "vmovups 80(%3,%0,4), %%xmm13 \n\t" // 4 * y "vmovups 96(%3,%0,4), %%xmm14 \n\t" // 4 * y "vmovups 112(%3,%0,4), %%xmm15 \n\t" // 4 * y "vfmadd231ps (%2,%0,4), %%xmm0 , %%xmm8 \n\t" // y += alpha * x "vfmadd231ps 16(%2,%0,4), %%xmm0 , %%xmm9 \n\t" // y += alpha * x "vfmadd231ps 32(%2,%0,4), %%xmm0 , %%xmm10 \n\t" // y += alpha * x "vfmadd231ps 48(%2,%0,4), %%xmm0 , %%xmm11 \n\t" // y += alpha * x "vfmadd231ps 64(%2,%0,4), %%xmm0 , %%xmm12 \n\t" // y += alpha * x "vfmadd231ps 80(%2,%0,4), %%xmm0 , %%xmm13 \n\t" // y += alpha * x "vfmadd231ps 96(%2,%0,4), %%xmm0 , %%xmm14 \n\t" // y += alpha * x "vfmadd231ps 112(%2,%0,4), %%xmm0 , %%xmm15 \n\t" // y += alpha * x "vmovups %%xmm8 , (%3,%0,4) \n\t" "vmovups %%xmm9 , 16(%3,%0,4) \n\t" "vmovups %%xmm10, 32(%3,%0,4) \n\t" "vmovups %%xmm11, 48(%3,%0,4) \n\t" "vmovups %%xmm12, 64(%3,%0,4) \n\t" "vmovups %%xmm13, 80(%3,%0,4) \n\t" "vmovups %%xmm14, 96(%3,%0,4) \n\t" "vmovups %%xmm15,112(%3,%0,4) \n\t" "addq $32, %0 \n\t" "subq $32, %1 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 : "cc", "%xmm0", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); return; } __asm__ __volatile__ ( "vzeroupper \n\t" "vbroadcastss (%4), %%xmm0 \n\t" // alpha ".align 16 \n\t" "1: \n\t" "prefetcht0 512(%3,%0,4) \n\t" "prefetcht0 576(%3,%0,4) \n\t" "vmovups (%3,%0,4), %%xmm8 \n\t" // 4 * y "vmovups 16(%3,%0,4), %%xmm9 \n\t" // 4 * y "vmovups 32(%3,%0,4), %%xmm10 \n\t" // 4 * y "vmovups 48(%3,%0,4), %%xmm11 \n\t" // 4 * y "vmovups 64(%3,%0,4), %%xmm12 \n\t" // 4 * y "vmovups 80(%3,%0,4), %%xmm13 \n\t" // 4 * y "vmovups 96(%3,%0,4), %%xmm14 \n\t" // 4 * y "vmovups 112(%3,%0,4), %%xmm15 \n\t" // 4 * y "prefetcht0 512(%2,%0,4) \n\t" "prefetcht0 576(%2,%0,4) \n\t" "vfmadd231ps (%2,%0,4), %%xmm0 , %%xmm8 \n\t" // y += alpha * x "vfmadd231ps 16(%2,%0,4), %%xmm0 , %%xmm9 \n\t" // y += alpha * x "vfmadd231ps 32(%2,%0,4), %%xmm0 , %%xmm10 \n\t" // y += alpha * x "vfmadd231ps 48(%2,%0,4), %%xmm0 , %%xmm11 \n\t" // y += alpha * x "vfmadd231ps 64(%2,%0,4), %%xmm0 , %%xmm12 \n\t" // y += alpha * x "vfmadd231ps 80(%2,%0,4), %%xmm0 , %%xmm13 \n\t" // y += alpha * x "vfmadd231ps 96(%2,%0,4), %%xmm0 , %%xmm14 \n\t" // y += alpha * x "vfmadd231ps 112(%2,%0,4), %%xmm0 , %%xmm15 \n\t" // y += alpha * x "vmovups %%xmm8 , (%3,%0,4) \n\t" "vmovups %%xmm9 , 16(%3,%0,4) \n\t" "vmovups %%xmm10, 32(%3,%0,4) \n\t" "vmovups %%xmm11, 48(%3,%0,4) \n\t" "vmovups %%xmm12, 64(%3,%0,4) \n\t" "vmovups %%xmm13, 80(%3,%0,4) \n\t" "vmovups %%xmm14, 96(%3,%0,4) \n\t" "vmovups %%xmm15,112(%3,%0,4) \n\t" "addq $32, %0 \n\t" "subq $32, %1 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 : "cc", "%xmm0", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } 
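The SSE, AVX and FMA saxpy microkernels above all honour the same contract as the generic C fallback in saxpy.c: the dispatcher rounds n down to a multiple of 32 (n & -32), hands that block to saxpy_kernel_16, and finishes the remainder with a scalar loop. The following is a minimal C sketch of that contract for orientation only; the names saxpy_block_ref and saxpy_ref are illustrative and are not part of OpenBLAS.

#include <stddef.h>

/* Illustrative sketch of the unit-stride SAXPY fast path (not OpenBLAS code).
 * The asm kernels consume 16 or 32 floats per iteration, so the dispatcher
 * only ever passes them a block length that is a multiple of 32. */
static void saxpy_block_ref(size_t n_block, const float *x, float *y, float alpha)
{
    for (size_t i = 0; i < n_block; i++)   /* y += alpha * x over the block */
        y[i] += alpha * x[i];
}

void saxpy_ref(size_t n, float alpha, const float *x, float *y)
{
    size_t n_block = n & ~(size_t)31;      /* same rounding as n & -32 in saxpy.c */
    saxpy_block_ref(n_block, x, y, alpha);
    for (size_t i = n_block; i < n; i++)   /* scalar tail, as in the C dispatcher */
        y[i] += alpha * x[i];
}
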
OpenBLAS-0.2.20/kernel/x86_64/saxpy_microk_sandy-2.c000066400000000000000000000104631313527062700215770ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16 1 static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vbroadcastss (%4), %%ymm0 \n\t" // alpha "vmovups (%3,%0,4), %%ymm8 \n\t" "vmovups 32(%3,%0,4), %%ymm9 \n\t" "vmovups 64(%3,%0,4), %%ymm10 \n\t" "vmovups 96(%3,%0,4), %%ymm11 \n\t" "vmovups (%2,%0,4), %%ymm4 \n\t" "vmovups 32(%2,%0,4), %%ymm5 \n\t" "vmovups 64(%2,%0,4), %%ymm6 \n\t" "vmovups 96(%2,%0,4), %%ymm7 \n\t" "addq $32, %0 \n\t" "subq $32, %1 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" "vmulps %%ymm4, %%ymm0, %%ymm4 \n\t" "vaddps %%ymm8 , %%ymm4, %%ymm12 \n\t" "vmulps %%ymm5, %%ymm0, %%ymm5 \n\t" "vaddps %%ymm9 , %%ymm5, %%ymm13 \n\t" "vmulps %%ymm6, %%ymm0, %%ymm6 \n\t" "vaddps %%ymm10, %%ymm6, %%ymm14 \n\t" "vmulps %%ymm7, %%ymm0, %%ymm7 \n\t" "vaddps %%ymm11, %%ymm7, %%ymm15 \n\t" "vmovups (%3,%0,4), %%ymm8 \n\t" "vmovups 32(%3,%0,4), %%ymm9 \n\t" "vmovups 64(%3,%0,4), %%ymm10 \n\t" "vmovups 96(%3,%0,4), %%ymm11 \n\t" "vmovups (%2,%0,4), %%ymm4 \n\t" "vmovups 32(%2,%0,4), %%ymm5 \n\t" "vmovups 64(%2,%0,4), %%ymm6 \n\t" "vmovups 96(%2,%0,4), %%ymm7 \n\t" "vmovups %%ymm12, -128(%3,%0,4) \n\t" "vmovups %%ymm13, -96(%3,%0,4) \n\t" "vmovups %%ymm14, -64(%3,%0,4) \n\t" "vmovups %%ymm15, -32(%3,%0,4) \n\t" "addq $32, %0 \n\t" "subq $32, %1 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulps %%ymm4, %%ymm0, %%ymm4 \n\t" "vmulps %%ymm5, %%ymm0, %%ymm5 \n\t" "vmulps %%ymm6, %%ymm0, %%ymm6 \n\t" "vmulps %%ymm7, %%ymm0, %%ymm7 \n\t" "vaddps %%ymm8 , %%ymm4, %%ymm12 \n\t" "vaddps %%ymm9 , %%ymm5, %%ymm13 \n\t" "vaddps %%ymm10, %%ymm6, %%ymm14 \n\t" "vaddps %%ymm11, %%ymm7, %%ymm15 \n\t" "vmovups %%ymm12, 
-128(%3,%0,4) \n\t" "vmovups %%ymm13, -96(%3,%0,4) \n\t" "vmovups %%ymm14, -64(%3,%0,4) \n\t" "vmovups %%ymm15, -32(%3,%0,4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/scal.S000066400000000000000000000127251313527062700164370ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 #define X ARG4 #define INCX ARG5 #define I %rax #include "l1param.h" PROLOGUE PROFCODE FLD 8(%rsp) ftst fnstsw %ax andb $68, %ah je .L300 /* Alpha == ZERO */ cmpq $1, INCX jne .L104 movq M, I sarq $3, I jle .L102 ALIGN_4 .L101: fld %st FST 0 * SIZE(X) fld %st FST 1 * SIZE(X) fld %st FST 2 * SIZE(X) fld %st FST 3 * SIZE(X) fld %st FST 4 * SIZE(X) fld %st FST 5 * SIZE(X) fld %st FST 6 * SIZE(X) fld %st FST 7 * SIZE(X) addq $8 * SIZE, X decq I jg .L101 ALIGN_4 .L102: movq M, I andq $7, I jle .L999 ALIGN_4 .L103: fld %st FST 0 * SIZE(X) addq $SIZE, X decq I jg .L103 jmp .L999 ALIGN_4 .L104: salq $BASE_SHIFT, INCX movq M, I sarq $3, I jle .L106 ALIGN_4 .L105: fld %st FST 0 * SIZE(X) addq INCX, X fld %st FST 0 * SIZE(X) addq INCX, X fld %st FST 0 * SIZE(X) addq INCX, X fld %st FST 0 * SIZE(X) addq INCX, X fld %st FST 0 * SIZE(X) addq INCX, X fld %st FST 0 * SIZE(X) addq INCX, X fld %st FST 0 * SIZE(X) addq INCX, X fld %st FST 0 * SIZE(X) addq INCX, X decq I jg .L105 ALIGN_4 .L106: movq M, I andq $7, I jle .L999 ALIGN_4 .L107: fld %st FST 0 * SIZE(X) addq INCX, X decq I jg .L107 jmp .L999 ALIGN_4 /* Alpha != ZERO */ .L300: cmpq $1,INCX jne .L304 movq M, I sarq $3, I jle .L302 ALIGN_4 .L301: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) fmul %st(1), %st FST 0 * SIZE(X) FLD 1 * SIZE(X) fmul %st(1), %st FST 1 * SIZE(X) FLD 2 * SIZE(X) fmul %st(1), %st FST 2 * SIZE(X) FLD 3 * SIZE(X) fmul %st(1), %st FST 3 * SIZE(X) FLD 4 * SIZE(X) fmul %st(1), %st FST 4 * SIZE(X) FLD 5 * SIZE(X) fmul %st(1), %st FST 5 * SIZE(X) FLD 6 * SIZE(X) fmul %st(1), %st FST 6 * SIZE(X) FLD 7 * SIZE(X) fmul %st(1), %st FST 7 * SIZE(X) addq $8 * SIZE, X decq I jg .L301 ALIGN_4 .L302: movq M, I andq $7, I jle .L999 ALIGN_4 .L303: FLD 0 * SIZE(X) fmul %st(1), %st FST 0 * SIZE(X) addq $SIZE, X decq I jg .L303 jmp .L999 ALIGN_4 .L304: salq $BASE_SHIFT, INCX movq M, I sarq $3, I jle .L306 ALIGN_4 .L305: FLD 0 * SIZE(X) fmul %st(1), %st FST 0 * SIZE(X) addq INCX, X FLD 0 * SIZE(X) fmul %st(1), %st FST 0 * SIZE(X) addq INCX, X FLD 0 * SIZE(X) fmul %st(1), %st FST 0 * SIZE(X) addq INCX, X FLD 0 * SIZE(X) fmul %st(1), %st FST 0 * SIZE(X) addq INCX, X FLD 0 * SIZE(X) fmul %st(1), %st FST 0 * SIZE(X) addq INCX, X FLD 0 * SIZE(X) fmul %st(1), %st FST 0 * SIZE(X) addq INCX, X FLD 0 * SIZE(X) fmul %st(1), %st FST 0 * SIZE(X) addq INCX, X FLD 0 * SIZE(X) fmul %st(1), %st FST 0 * SIZE(X) addq INCX, X decq I jg .L305 ALIGN_4 .L306: movq M, I andq $7, I jle .L999 ALIGN_4 .L307: FLD 0 * SIZE(X) fmul %st(1), %st FST 0 * SIZE(X) addq INCX, X decq I jg .L307 ALIGN_4 .L999: ffreep %st(0) ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/scal_atom.S000066400000000000000000000210311313527062700174450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 #define X ARG4 #define INCX ARG5 #else #define M ARG1 #define X ARG2 #define INCX ARG3 #endif #define XX %r10 #define I %rax #include "l1param.h" PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), X movq 48(%rsp), INCX movaps %xmm3, %xmm0 #endif SAVEREGISTERS testq M, M jle .L999 pxor %xmm1, %xmm1 lea (, INCX, SIZE), INCX comisd %xmm0, %xmm1 jne .L100 /* Alpha == ZERO */ cmpq $SIZE, INCX jne .L50 movq M, I sarq $3, I jle .L12 ALIGN_4 .L11: movsd %xmm1, 0 * SIZE(X) movsd %xmm1, 1 * SIZE(X) movsd %xmm1, 2 * SIZE(X) movsd %xmm1, 3 * SIZE(X) movsd %xmm1, 4 * SIZE(X) movsd %xmm1, 5 * SIZE(X) movsd %xmm1, 6 * SIZE(X) movsd %xmm1, 7 * SIZE(X) addq $8 * SIZE, X decq I jg .L11 ALIGN_4 .L12: testq $4, M je .L14 movsd %xmm1, 0 * SIZE(X) movsd %xmm1, 1 * SIZE(X) movsd %xmm1, 2 * SIZE(X) movsd %xmm1, 3 * SIZE(X) addq $4 * SIZE, X ALIGN_3 .L14: testq $2, M je .L15 movsd %xmm1, 0 * SIZE(X) movsd %xmm1, 1 * SIZE(X) addq $2 * SIZE, X ALIGN_3 .L15: testq $1, M je .L999 movsd %xmm1, 0 * SIZE(X) jmp .L999 ALIGN_4 .L50: movq M, I sarq $3, I jle .L52 ALIGN_4 .L51: movsd %xmm1, 0 * SIZE(X) addq INCX, X movsd %xmm1, 0 * SIZE(X) addq INCX, X movsd %xmm1, 0 * SIZE(X) addq INCX, X movsd %xmm1, 0 * SIZE(X) addq INCX, X movsd %xmm1, 0 * SIZE(X) addq INCX, X movsd %xmm1, 0 * SIZE(X) addq INCX, X movsd %xmm1, 0 * SIZE(X) addq INCX, X movsd %xmm1, 0 * SIZE(X) addq INCX, X decq I jg .L51 ALIGN_4 .L52: testq $7, M je .L999 testq $4, M je .L53 movsd %xmm1, 0 * SIZE(X) addq INCX, X movsd %xmm1, 0 * SIZE(X) addq INCX, X movsd %xmm1, 0 * SIZE(X) addq INCX, X movsd %xmm1, 0 * SIZE(X) addq INCX, X ALIGN_3 .L53: testq $2, M je .L54 movsd %xmm1, 0 * SIZE(X) addq INCX, X movsd %xmm1, 0 * SIZE(X) addq INCX, X ALIGN_3 .L54: testq $1, M je .L999 movsd %xmm1, 0 * SIZE(X) jmp .L999 ALIGN_4 /* Alpha != ZERO */ .L100: cmpq $SIZE, INCX jne .L150 unpcklpd %xmm0, %xmm0 movq M, I sarq $3, I jle .L113 movsd 0 * SIZE(X), %xmm1 movsd 1 * SIZE(X), %xmm2 movsd 2 * SIZE(X), %xmm3 movsd 3 * SIZE(X), %xmm4 movsd 4 * SIZE(X), %xmm5 movsd 5 * SIZE(X), %xmm6 movsd 6 * SIZE(X), %xmm7 movsd 7 * 
SIZE(X), %xmm8 mulsd %xmm0, %xmm1 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm0, %xmm4 decq I jle .L112 ALIGN_4 .L111: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd %xmm1, 0 * SIZE(X) movsd %xmm2, 1 * SIZE(X) movsd %xmm3, 2 * SIZE(X) movsd %xmm4, 3 * SIZE(X) movsd 8 * SIZE(X), %xmm1 mulsd %xmm0, %xmm5 movsd 9 * SIZE(X), %xmm2 mulsd %xmm0, %xmm6 movsd 10 * SIZE(X), %xmm3 mulsd %xmm0, %xmm7 movsd 11 * SIZE(X), %xmm4 mulsd %xmm0, %xmm8 movsd %xmm5, 4 * SIZE(X) movsd %xmm6, 5 * SIZE(X) movsd %xmm7, 6 * SIZE(X) movsd %xmm8, 7 * SIZE(X) movsd 12 * SIZE(X), %xmm5 mulsd %xmm0, %xmm1 movsd 13 * SIZE(X), %xmm6 mulsd %xmm0, %xmm2 movsd 14 * SIZE(X), %xmm7 mulsd %xmm0, %xmm3 movsd 15 * SIZE(X), %xmm8 mulsd %xmm0, %xmm4 addq $8 * SIZE, X decq I jg .L111 ALIGN_4 .L112: movsd %xmm1, 0 * SIZE(X) mulsd %xmm0, %xmm5 movsd %xmm2, 1 * SIZE(X) mulsd %xmm0, %xmm6 movsd %xmm3, 2 * SIZE(X) mulsd %xmm0, %xmm7 movsd %xmm4, 3 * SIZE(X) mulsd %xmm0, %xmm8 movsd %xmm5, 4 * SIZE(X) movsd %xmm6, 5 * SIZE(X) movsd %xmm7, 6 * SIZE(X) movsd %xmm8, 7 * SIZE(X) addq $8 * SIZE, X ALIGN_3 .L113: testq $4, M je .L115 movsd 0 * SIZE(X), %xmm1 movsd 1 * SIZE(X), %xmm2 movsd 2 * SIZE(X), %xmm3 movsd 3 * SIZE(X), %xmm4 mulsd %xmm0, %xmm1 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm0, %xmm4 movsd %xmm1, 0 * SIZE(X) movsd %xmm2, 1 * SIZE(X) movsd %xmm3, 2 * SIZE(X) movsd %xmm4, 3 * SIZE(X) addq $4 * SIZE, X ALIGN_3 .L115: testq $2, M je .L116 movsd 0 * SIZE(X), %xmm1 movsd 1 * SIZE(X), %xmm2 mulsd %xmm0, %xmm1 mulsd %xmm0, %xmm2 movsd %xmm1, 0 * SIZE(X) movsd %xmm2, 1 * SIZE(X) addq $2 * SIZE, X ALIGN_3 .L116: testq $1, M je .L999 movsd 0 * SIZE(X), %xmm1 mulsd %xmm0, %xmm1 movsd %xmm1, 0 * SIZE(X) jmp .L999 ALIGN_3 /* incx != 1 */ .L150: movq X, XX movq M, I # rcx = n sarq $3, I # (n >> 3) jle .L152 ALIGN_4 .L151: movsd 0 * SIZE(X), %xmm1 addq INCX, X movsd 0 * SIZE(X), %xmm2 addq INCX, X mulsd %xmm0, %xmm1 movsd 0 * SIZE(X), %xmm3 addq INCX, X mulsd %xmm0, %xmm2 movsd 0 * SIZE(X), %xmm4 addq INCX, X mulsd %xmm0, %xmm3 movsd 0 * SIZE(X), %xmm5 addq INCX, X mulsd %xmm0, %xmm4 movsd 0 * SIZE(X), %xmm6 addq INCX, X mulsd %xmm0, %xmm5 movsd 0 * SIZE(X), %xmm7 addq INCX, X mulsd %xmm0, %xmm6 movsd 0 * SIZE(X), %xmm8 addq INCX, X mulsd %xmm0, %xmm7 movsd %xmm1, 0 * SIZE(XX) addq INCX, XX mulsd %xmm0, %xmm8 movsd %xmm2, 0 * SIZE(XX) addq INCX, XX movsd %xmm3, 0 * SIZE(XX) addq INCX, XX movsd %xmm4, 0 * SIZE(XX) addq INCX, XX movsd %xmm5, 0 * SIZE(XX) addq INCX, XX movsd %xmm6, 0 * SIZE(XX) addq INCX, XX movsd %xmm7, 0 * SIZE(XX) addq INCX, XX movsd %xmm8, 0 * SIZE(XX) addq INCX, XX decq I jg .L151 ALIGN_4 .L152: testq $7, M je .L999 testq $4, M je .L153 movsd 0 * SIZE(X), %xmm1 addq INCX, X movsd 0 * SIZE(X), %xmm2 addq INCX, X mulsd %xmm0, %xmm1 movsd 0 * SIZE(X), %xmm3 addq INCX, X mulsd %xmm0, %xmm2 movsd 0 * SIZE(X), %xmm4 addq INCX, X mulsd %xmm0, %xmm3 movsd %xmm1, 0 * SIZE(XX) addq INCX, XX mulsd %xmm0, %xmm4 movsd %xmm2, 0 * SIZE(XX) addq INCX, XX movsd %xmm3, 0 * SIZE(XX) addq INCX, XX movsd %xmm4, 0 * SIZE(XX) addq INCX, XX ALIGN_3 .L153: testq $2, M je .L154 movsd 0 * SIZE(X), %xmm1 addq INCX, X mulsd %xmm0, %xmm1 movsd 0 * SIZE(X), %xmm2 addq INCX, X mulsd %xmm0, %xmm2 movsd %xmm1, 0 * SIZE(XX) addq INCX, XX movsd %xmm2, 0 * SIZE(XX) addq INCX, XX ALIGN_3 .L154: testq $1, M je .L999 movsd 0 * SIZE(X), %xmm1 mulsd %xmm0, %xmm1 movsd %xmm1, 0 * SIZE(X) ALIGN_4 .L999: xorq %rax, %rax RESTOREREGISTERS ret EPILOGUE 
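Both scal.S (x87) and scal_atom.S above, like the scal_sse.S / scal_sse2.S variants that follow, split SCAL into two paths: when alpha compares equal to zero the vector is simply overwritten with zeros without ever loading x, otherwise every element is multiplied in place, each path with separate unit-stride and strided loops. (scal_sse2.S additionally takes the multiply path when alpha is NaN, via the jp after comisd, so NaN propagates instead of the vector being flushed to zero.) Below is a hedged C sketch of that overall structure, with an illustrative name and a positive stride assumed; it is not the OpenBLAS kernel interface.

#include <stddef.h>

/* Structural sketch of the DSCAL kernels above (illustrative only):
 * zero-fill fast path for alpha == 0, in-place multiply otherwise. */
void dscal_ref(size_t n, double alpha, double *x, size_t incx)
{
    if (n == 0)                      /* the SSE/SSE2 kernels return immediately for n <= 0 */
        return;

    if (alpha == 0.0) {              /* fast path: store zeros, never load x */
        for (size_t i = 0; i < n; i++)
            x[i * incx] = 0.0;
        return;
    }

    for (size_t i = 0; i < n; i++)   /* general path: x := alpha * x
                                        (also taken when alpha is NaN, since
                                        alpha == 0.0 is then false) */
        x[i * incx] *= alpha;
}
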
OpenBLAS-0.2.20/kernel/x86_64/scal_sse.S000066400000000000000000000256631313527062700173160ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 #define X ARG4 #define INCX ARG5 #else #define M ARG1 #define X ARG2 #define INCX ARG3 #endif #define XX %r10 #define I %rax #include "l1param.h" PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), X movq 48(%rsp), INCX movaps %xmm3, %xmm0 #endif SAVEREGISTERS testq M, M jle .L999 lea (, INCX, SIZE), INCX pxor %xmm1, %xmm1 comiss %xmm0, %xmm1 shufps $0, %xmm0, %xmm0 jne .L100 # Alpha != ZERO /* Alpha == ZERO */ cmpq $SIZE, INCX jne .L50 /* INCX == 1 */ cmpq $3, M jle .L14 testq $4, X # aligned for double word? je .L05 movss %xmm1, 0 * SIZE(X) addq $SIZE, X decq M jle .L999 ALIGN_3 .L05: testq $8, X # aligned for quad word? 
je .L06 movsd %xmm1, 0 * SIZE(X) addq $2 * SIZE, X subq $2, M jle .L999 ALIGN_3 .L06: movq M, I sarq $4, I jle .L12 ALIGN_4 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm1, 0 * SIZE(X) movaps %xmm1, 4 * SIZE(X) movaps %xmm1, 8 * SIZE(X) movaps %xmm1, 12 * SIZE(X) addq $16 * SIZE, X decq I jg .L11 ALIGN_4 .L12: testq $15, M je .L999 testq $8, M je .L13 movaps %xmm1, 0 * SIZE(X) movaps %xmm1, 4 * SIZE(X) addq $8 * SIZE, X ALIGN_3 .L13: testq $4, M je .L14 movaps %xmm1, 0 * SIZE(X) addq $4 * SIZE, X ALIGN_3 .L14: testq $2, M je .L15 movsd %xmm1, 0 * SIZE(X) addq $2 * SIZE, X ALIGN_3 .L15: testq $1, M je .L999 movss %xmm1, 0 * SIZE(X) jmp .L999 ALIGN_4 /* incx != 1 */ .L50: movq M, I # rcx = n sarq $3, I # (n >> 3) jle .L52 ALIGN_4 .L51: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm1, (X) addq INCX, X movss %xmm1, (X) addq INCX, X movss %xmm1, (X) addq INCX, X movss %xmm1, (X) addq INCX, X movss %xmm1, (X) addq INCX, X movss %xmm1, (X) addq INCX, X movss %xmm1, (X) addq INCX, X movss %xmm1, (X) addq INCX, X decq I jg .L51 ALIGN_4 .L52: testq $7, M je .L999 testq $4, M je .L53 movss %xmm1, (X) addq INCX, X movss %xmm1, (X) addq INCX, X movss %xmm1, (X) addq INCX, X movss %xmm1, (X) addq INCX, X ALIGN_3 .L53: testq $2, M je .L54 movss %xmm1, (X) addq INCX, X movss %xmm1, (X) addq INCX, X ALIGN_3 .L54: testq $1, M je .L999 movss %xmm1, (X) jmp .L999 ALIGN_4 /* Alpha != ZERO */ .L100: cmpq $SIZE, INCX jne .L150 subq $-32 * SIZE, X cmpq $3, M jle .L116 testq $SIZE, X je .L105 movss -32 * SIZE(X), %xmm1 mulss %xmm0, %xmm1 movss %xmm1, -32 * SIZE(X) addq $SIZE, X decq M jle .L999 ALIGN_3 .L105: testq $2 * SIZE, X je .L110 movsd -32 * SIZE(X), %xmm1 mulps %xmm0, %xmm1 movsd %xmm1, -32 * SIZE(X) addq $2 * SIZE, X subq $2, M jle .L999 ALIGN_3 .L110: movq M, I sarq $5, I jle .L113 #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) movaps %xmm0, %xmm1 mulps -32 * SIZE(X), %xmm1 movaps %xmm0, %xmm2 mulps -28 * SIZE(X), %xmm2 movaps %xmm0, %xmm3 mulps -24 * SIZE(X), %xmm3 movaps %xmm0, %xmm4 mulps -20 * SIZE(X), %xmm4 movaps %xmm0, %xmm5 mulps -16 * SIZE(X), %xmm5 movaps %xmm0, %xmm6 mulps -12 * SIZE(X), %xmm6 movaps %xmm0, %xmm7 mulps -8 * SIZE(X), %xmm7 movaps %xmm0, %xmm8 mulps -4 * SIZE(X), %xmm8 decq I jle .L112 ALIGN_4 .L111: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm1, -32 * SIZE(X) movaps %xmm2, -28 * SIZE(X) movaps %xmm3, -24 * SIZE(X) movaps %xmm4, -20 * SIZE(X) movaps %xmm0, %xmm1 mulps 0 * SIZE(X), %xmm1 movaps %xmm0, %xmm2 mulps 4 * SIZE(X), %xmm2 movaps %xmm0, %xmm3 mulps 8 * SIZE(X), %xmm3 movaps %xmm0, %xmm4 mulps 12 * SIZE(X), %xmm4 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm5, -16 * SIZE(X) movaps %xmm6, -12 * SIZE(X) movaps %xmm7, -8 * SIZE(X) movaps %xmm8, -4 * SIZE(X) movaps %xmm0, %xmm5 mulps 16 * SIZE(X), %xmm5 movaps %xmm0, %xmm6 mulps 20 * SIZE(X), %xmm6 movaps %xmm0, %xmm7 mulps 24 * SIZE(X), %xmm7 movaps %xmm0, %xmm8 mulps 28 * SIZE(X), %xmm8 subq $-32 * SIZE, X decq I jg .L111 ALIGN_4 .L112: movaps %xmm1, -32 * SIZE(X) movaps %xmm2, -28 * SIZE(X) movaps %xmm3, -24 * SIZE(X) movaps %xmm4, -20 * SIZE(X) movaps %xmm5, -16 * SIZE(X) movaps %xmm6, -12 * SIZE(X) movaps %xmm7, -8 * SIZE(X) movaps %xmm8, -4 * SIZE(X) #else movaps -32 * SIZE(X), %xmm1 movaps -28 * SIZE(X), %xmm2 movaps -24 * SIZE(X), %xmm3 movaps -20 * SIZE(X), %xmm4 movaps -16 * SIZE(X), %xmm5 movaps -12 * SIZE(X), %xmm6 movaps -8 * SIZE(X), %xmm7 movaps -4 * 
SIZE(X), %xmm8 decq I jle .L112 ALIGN_4 .L111: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif mulps %xmm0, %xmm1 movaps %xmm1, -32 * SIZE(X) movaps 0 * SIZE(X), %xmm1 mulps %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(X) movaps 4 * SIZE(X), %xmm2 mulps %xmm0, %xmm3 movaps %xmm3, -24 * SIZE(X) movaps 8 * SIZE(X), %xmm3 mulps %xmm0, %xmm4 movaps %xmm4, -20 * SIZE(X) movaps 12 * SIZE(X), %xmm4 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif mulps %xmm0, %xmm5 movaps %xmm5, -16 * SIZE(X) movaps 16 * SIZE(X), %xmm5 mulps %xmm0, %xmm6 movaps %xmm6, -12 * SIZE(X) movaps 20 * SIZE(X), %xmm6 mulps %xmm0, %xmm7 movaps %xmm7, -8 * SIZE(X) movaps 24 * SIZE(X), %xmm7 mulps %xmm0, %xmm8 movaps %xmm8, -4 * SIZE(X) movaps 28 * SIZE(X), %xmm8 subq $-32 * SIZE, X decq I jg .L111 ALIGN_4 .L112: mulps %xmm0, %xmm1 movaps %xmm1, -32 * SIZE(X) mulps %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(X) mulps %xmm0, %xmm3 movaps %xmm3, -24 * SIZE(X) mulps %xmm0, %xmm4 movaps %xmm4, -20 * SIZE(X) mulps %xmm0, %xmm5 movaps %xmm5, -16 * SIZE(X) mulps %xmm0, %xmm6 movaps %xmm6, -12 * SIZE(X) mulps %xmm0, %xmm7 movaps %xmm7, -8 * SIZE(X) mulps %xmm0, %xmm8 movaps %xmm8, -4 * SIZE(X) #endif subq $-32 * SIZE, X ALIGN_3 .L113: testq $31, M je .L999 testq $16, M je .L114 movaps -32 * SIZE(X), %xmm1 movaps -28 * SIZE(X), %xmm3 movaps -24 * SIZE(X), %xmm5 movaps -20 * SIZE(X), %xmm7 mulps %xmm0, %xmm1 movaps %xmm1, -32 * SIZE(X) mulps %xmm0, %xmm3 movaps %xmm3, -28 * SIZE(X) mulps %xmm0, %xmm5 movaps %xmm5, -24 * SIZE(X) mulps %xmm0, %xmm7 movaps %xmm7, -20 * SIZE(X) addq $16 * SIZE, X ALIGN_3 .L114: testq $8, M je .L115 movaps -32 * SIZE(X), %xmm1 movaps -28 * SIZE(X), %xmm3 mulps %xmm0, %xmm1 movaps %xmm1, -32 * SIZE(X) mulps %xmm0, %xmm3 movaps %xmm3, -28 * SIZE(X) addq $8 * SIZE, X ALIGN_3 .L115: testq $4, M je .L116 movaps -32 * SIZE(X), %xmm1 mulps %xmm0, %xmm1 movaps %xmm1, -32 * SIZE(X) addq $4 * SIZE, X ALIGN_3 .L116: testq $2, M je .L117 movsd -32 * SIZE(X), %xmm1 mulps %xmm0, %xmm1 movsd %xmm1, -32 * SIZE(X) addq $2 * SIZE, X ALIGN_3 .L117: testq $1, M je .L999 movss -32 * SIZE(X), %xmm1 mulss %xmm0, %xmm1 movss %xmm1, -32 * SIZE(X) jmp .L999 ALIGN_3 /* incx != 1 */ .L150: movq X, XX movq M, I # rcx = n sarq $3, I # (n >> 3) jle .L152 ALIGN_4 .L151: movss (X), %xmm1 addq INCX, X movss (X), %xmm2 addq INCX, X movss (X), %xmm3 addq INCX, X movss (X), %xmm4 addq INCX, X movss (X), %xmm5 addq INCX, X movss (X), %xmm6 addq INCX, X movss (X), %xmm7 addq INCX, X movss (X), %xmm8 addq INCX, X mulss %xmm0, %xmm1 mulss %xmm0, %xmm2 mulss %xmm0, %xmm3 mulss %xmm0, %xmm4 mulss %xmm0, %xmm5 mulss %xmm0, %xmm6 mulss %xmm0, %xmm7 mulss %xmm0, %xmm8 movss %xmm1, (XX) addq INCX, XX movss %xmm2, (XX) addq INCX, XX movss %xmm3, (XX) addq INCX, XX movss %xmm4, (XX) addq INCX, XX movss %xmm5, (XX) addq INCX, XX movss %xmm6, (XX) addq INCX, XX movss %xmm7, (XX) addq INCX, XX movss %xmm8, (XX) addq INCX, XX decq I jg .L151 ALIGN_4 .L152: testq $7, M je .L999 testq $4, M je .L153 movss (X), %xmm1 addq INCX, X movss (X), %xmm2 addq INCX, X movss (X), %xmm3 addq INCX, X movss (X), %xmm4 addq INCX, X mulss %xmm0, %xmm1 mulss %xmm0, %xmm2 mulss %xmm0, %xmm3 mulss %xmm0, %xmm4 movss %xmm1, (XX) addq INCX, XX movss %xmm2, (XX) addq INCX, XX movss %xmm3, (XX) addq INCX, XX movss %xmm4, (XX) addq INCX, XX ALIGN_3 .L153: testq $2, M je .L154 movss (X), %xmm1 addq INCX, X movss (X), %xmm2 addq INCX, X mulss %xmm0, %xmm1 mulss %xmm0, %xmm2 movss %xmm1, (XX) addq INCX, XX movss %xmm2, (XX) addq INCX, XX ALIGN_3 .L154: testq 
$1, M je .L999 movss (X), %xmm1 mulss %xmm0, %xmm1 movss %xmm1, (X) ALIGN_4 .L999: xorq %rax, %rax RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/scal_sse2.S000066400000000000000000000255051313527062700173730ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 #define X ARG4 #define INCX ARG5 #else #define M ARG1 #define X ARG2 #define INCX ARG3 #endif #define XX %r10 #define I %rax #include "l1param.h" PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), X movq 48(%rsp), INCX movaps %xmm3, %xmm0 #endif SAVEREGISTERS testq M, M jle .L999 leaq (, INCX, SIZE), INCX xorps %xmm1, %xmm1 comisd %xmm0, %xmm1 jne .L100 # Alpha != ZERO jp .L100 # For Alpha = NaN /* Alpha == ZERO */ cmpq $SIZE, INCX jne .L50 /* INCX == 1 */ testq $15, X # aligned for quad word? 
je .L05 movsd %xmm1, 0 * SIZE(X) addq $SIZE, X decq M jle .L999 ALIGN_3 .L05: /* Aligned Mode */ movq M, I # rcx = n sarq $4, I jle .L12 ALIGN_4 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm1, 0 * SIZE(X) movaps %xmm1, 2 * SIZE(X) movaps %xmm1, 4 * SIZE(X) movaps %xmm1, 6 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm1, 8 * SIZE(X) movaps %xmm1, 10 * SIZE(X) movaps %xmm1, 12 * SIZE(X) movaps %xmm1, 14 * SIZE(X) addq $16 * SIZE, X decq I jg .L11 ALIGN_4 .L12: testq $15, M je .L999 testq $8, M je .L13 movaps %xmm1, 0 * SIZE(X) movaps %xmm1, 2 * SIZE(X) movaps %xmm1, 4 * SIZE(X) movaps %xmm1, 6 * SIZE(X) addq $8 * SIZE, X ALIGN_3 .L13: testq $4, M je .L14 movaps %xmm1, 0 * SIZE(X) movaps %xmm1, 2 * SIZE(X) addq $4 * SIZE, X ALIGN_3 .L14: testq $2, M je .L15 movaps %xmm1, 0 * SIZE(X) addq $2 * SIZE, X ALIGN_3 .L15: testq $1, M je .L999 movsd %xmm1, 0 * SIZE(X) jmp .L999 ALIGN_4 .L50: movq M, I sarq $3, I jle .L52 ALIGN_4 .L51: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd %xmm1, (X) addq INCX, X movsd %xmm1, (X) addq INCX, X movsd %xmm1, (X) addq INCX, X movsd %xmm1, (X) addq INCX, X movsd %xmm1, (X) addq INCX, X movsd %xmm1, (X) addq INCX, X movsd %xmm1, (X) addq INCX, X movsd %xmm1, (X) addq INCX, X decq I jg .L51 ALIGN_4 .L52: testq $7, M je .L999 testq $4, M je .L53 movsd %xmm1, (X) addq INCX, X movsd %xmm1, (X) addq INCX, X movsd %xmm1, (X) addq INCX, X movsd %xmm1, (X) addq INCX, X ALIGN_3 .L53: testq $2, M je .L54 movsd %xmm1, (X) addq INCX, X movsd %xmm1, (X) addq INCX, X ALIGN_3 .L54: testq $1, M je .L999 movsd %xmm1, (X) jmp .L999 ALIGN_4 /* Alpha != ZERO */ .L100: unpcklpd %xmm0, %xmm0 cmpq $SIZE, INCX jne .L150 testq $SIZE, X je .L105 movsd 0 * SIZE(X), %xmm1 mulsd %xmm0, %xmm1 movsd %xmm1, 0 * SIZE(X) addq $SIZE, X decq M jle .L999 ALIGN_3 .L105: subq $-16 * SIZE, X movq M, I # rcx = n sarq $4, I jle .L113 #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) movaps %xmm0, %xmm1 mulpd -16 * SIZE(X), %xmm1 movaps %xmm0, %xmm2 mulpd -14 * SIZE(X), %xmm2 movaps %xmm0, %xmm3 mulpd -12 * SIZE(X), %xmm3 movaps %xmm0, %xmm4 mulpd -10 * SIZE(X), %xmm4 movaps %xmm0, %xmm5 mulpd -8 * SIZE(X), %xmm5 movaps %xmm0, %xmm6 mulpd -6 * SIZE(X), %xmm6 movaps %xmm0, %xmm7 mulpd -4 * SIZE(X), %xmm7 movaps %xmm0, %xmm8 mulpd -2 * SIZE(X), %xmm8 decq I jle .L112 ALIGN_4 .L111: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm1, -16 * SIZE(X) movaps %xmm2, -14 * SIZE(X) movaps %xmm3, -12 * SIZE(X) movaps %xmm4, -10 * SIZE(X) movaps %xmm0, %xmm1 mulpd 0 * SIZE(X), %xmm1 movaps %xmm0, %xmm2 mulpd 2 * SIZE(X), %xmm2 movaps %xmm0, %xmm3 mulpd 4 * SIZE(X), %xmm3 movaps %xmm0, %xmm4 mulpd 6 * SIZE(X), %xmm4 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm5, -8 * SIZE(X) movaps %xmm6, -6 * SIZE(X) movaps %xmm7, -4 * SIZE(X) movaps %xmm8, -2 * SIZE(X) movaps %xmm0, %xmm5 mulpd 8 * SIZE(X), %xmm5 movaps %xmm0, %xmm6 mulpd 10 * SIZE(X), %xmm6 movaps %xmm0, %xmm7 mulpd 12 * SIZE(X), %xmm7 movaps %xmm0, %xmm8 mulpd 14 * SIZE(X), %xmm8 subq $-16 * SIZE, X decq I jg .L111 ALIGN_4 .L112: movaps %xmm1, -16 * SIZE(X) movaps %xmm2, -14 * SIZE(X) movaps %xmm3, -12 * SIZE(X) movaps %xmm4, -10 * SIZE(X) movaps %xmm5, -8 * SIZE(X) movaps %xmm6, -6 * SIZE(X) movaps %xmm7, -4 * SIZE(X) movaps %xmm8, -2 * SIZE(X) #else movaps -16 * SIZE(X), %xmm1 movaps -14 * SIZE(X), %xmm2 movaps -12 * SIZE(X), %xmm3 movaps -10 * SIZE(X), %xmm4 movaps -8 * 
SIZE(X), %xmm5 movaps -6 * SIZE(X), %xmm6 movaps -4 * SIZE(X), %xmm7 movaps -2 * SIZE(X), %xmm8 decq I jle .L112 ALIGN_4 .L111: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif mulpd %xmm0, %xmm1 movaps %xmm1, -16 * SIZE(X) movaps 0 * SIZE(X), %xmm1 mulpd %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(X) movaps 2 * SIZE(X), %xmm2 mulpd %xmm0, %xmm3 movaps %xmm3, -12 * SIZE(X) movaps 4 * SIZE(X), %xmm3 mulpd %xmm0, %xmm4 movaps %xmm4, -10 * SIZE(X) movaps 6 * SIZE(X), %xmm4 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif mulpd %xmm0, %xmm5 movaps %xmm5, -8 * SIZE(X) movaps 8 * SIZE(X), %xmm5 mulpd %xmm0, %xmm6 movaps %xmm6, -6 * SIZE(X) movaps 10 * SIZE(X), %xmm6 mulpd %xmm0, %xmm7 movaps %xmm7, -4 * SIZE(X) movaps 12 * SIZE(X), %xmm7 mulpd %xmm0, %xmm8 movaps %xmm8, -2 * SIZE(X) movaps 14 * SIZE(X), %xmm8 subq $-16 * SIZE, X decq I jg .L111 ALIGN_4 .L112: mulpd %xmm0, %xmm1 movaps %xmm1, -16 * SIZE(X) mulpd %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(X) mulpd %xmm0, %xmm3 movaps %xmm3, -12 * SIZE(X) mulpd %xmm0, %xmm4 movaps %xmm4, -10 * SIZE(X) mulpd %xmm0, %xmm5 movaps %xmm5, -8 * SIZE(X) mulpd %xmm0, %xmm6 movaps %xmm6, -6 * SIZE(X) mulpd %xmm0, %xmm7 movaps %xmm7, -4 * SIZE(X) mulpd %xmm0, %xmm8 movaps %xmm8, -2 * SIZE(X) #endif subq $-16 * SIZE, X ALIGN_3 .L113: testq $15, M je .L999 testq $8, M je .L114 movaps -16 * SIZE(X), %xmm1 movaps -14 * SIZE(X), %xmm2 movaps -12 * SIZE(X), %xmm3 movaps -10 * SIZE(X), %xmm4 mulpd %xmm0, %xmm1 movaps %xmm1, -16 * SIZE(X) mulpd %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(X) mulpd %xmm0, %xmm3 movaps %xmm3, -12 * SIZE(X) mulpd %xmm0, %xmm4 movaps %xmm4, -10 * SIZE(X) addq $8 * SIZE, X ALIGN_3 .L114: testq $4, M je .L115 movaps -16 * SIZE(X), %xmm1 movaps -14 * SIZE(X), %xmm2 mulpd %xmm0, %xmm1 movaps %xmm1, -16 * SIZE(X) mulpd %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(X) addq $4 * SIZE, X ALIGN_3 .L115: testq $2, M je .L116 movaps -16 * SIZE(X), %xmm1 mulpd %xmm0, %xmm1 movaps %xmm1, -16 * SIZE(X) addq $2 * SIZE, X ALIGN_3 .L116: testq $1, M je .L999 movsd -16 * SIZE(X), %xmm1 mulsd %xmm0, %xmm1 movsd %xmm1, -16 * SIZE(X) jmp .L999 ALIGN_3 /* incx != 1 */ .L150: movq X, XX movq M, I # rcx = n sarq $3, I # (n >> 3) jle .L152 ALIGN_4 .L151: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movsd (X), %xmm1 addq INCX, X movsd (X), %xmm2 addq INCX, X movsd (X), %xmm3 addq INCX, X movsd (X), %xmm4 addq INCX, X movsd (X), %xmm5 addq INCX, X movsd (X), %xmm6 addq INCX, X movsd (X), %xmm7 addq INCX, X movsd (X), %xmm8 addq INCX, X mulsd %xmm0, %xmm1 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 mulsd %xmm0, %xmm6 mulsd %xmm0, %xmm7 mulsd %xmm0, %xmm8 movsd %xmm1, (XX) addq INCX, XX movsd %xmm2, (XX) addq INCX, XX movsd %xmm3, (XX) addq INCX, XX movsd %xmm4, (XX) addq INCX, XX movsd %xmm5, (XX) addq INCX, XX movsd %xmm6, (XX) addq INCX, XX movsd %xmm7, (XX) addq INCX, XX movsd %xmm8, (XX) addq INCX, XX decq I jg .L151 ALIGN_4 .L152: testq $7, M je .L999 testq $4, M je .L153 movsd (X), %xmm1 addq INCX, X movsd (X), %xmm2 addq INCX, X movsd (X), %xmm3 addq INCX, X movsd (X), %xmm4 addq INCX, X mulsd %xmm0, %xmm1 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm0, %xmm4 movsd %xmm1, (XX) addq INCX, XX movsd %xmm2, (XX) addq INCX, XX movsd %xmm3, (XX) addq INCX, XX movsd %xmm4, (XX) addq INCX, XX ALIGN_3 .L153: testq $2, M je .L154 movsd (X), %xmm1 addq INCX, X movsd (X), %xmm2 addq INCX, X mulsd %xmm0, %xmm1 mulsd %xmm0, %xmm2 movsd %xmm1, (XX) addq INCX, XX movsd %xmm2, (XX) addq INCX, XX ALIGN_3 
.L154: testq $1, M je .L999 movsd (X), %xmm1 mulsd %xmm0, %xmm1 movsd %xmm1, (X) ALIGN_4 .L999: xorq %rax, %rax RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/sdot.c000066400000000000000000000062511313527062700165030ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #if defined(BULLDOZER) #include "sdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "sdot_microk_steamroller-2.c" #elif defined(NEHALEM) #include "sdot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "sdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "sdot_microk_sandy-2.c" #endif #ifndef HAVE_KERNEL_16 static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { BLASLONG register i = 0; FLOAT dot = 0.0; while(i < n) { dot += y[i] * x[i] + y[i+1] * x[i+1] + y[i+2] * x[i+2] + y[i+3] * x[i+3] + y[i+4] * x[i+4] + y[i+5] * x[i+5] + y[i+6] * x[i+6] + y[i+7] * x[i+7] ; i+=8 ; } *d += dot; } #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix=0,iy=0; double dot = 0.0 ; FLOAT mydot=0.0; BLASLONG n1; if ( n <= 0 ) return(dot); if ( (inc_x == 1) && (inc_y == 1) ) { n1 = n & (BLASLONG)(-32); if ( n1 ) sdot_kernel_16(n1, x, y , &mydot ); i = n1; while(i < n) { dot += y[i] * x[i] ; i++ ; } dot+=mydot; return(dot); } n1 = n & (BLASLONG)(-2); while(i < n1) { dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; ix += inc_x*2 ; iy += inc_y*2 ; i+=2 ; } while(i < n) { dot += y[iy] * x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; } return(dot); } OpenBLAS-0.2.20/kernel/x86_64/sdot_microk_bulldozer-2.c000066400000000000000000000065131313527062700222710ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16 1 static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) { BLASLONG register i = 0; __asm__ __volatile__ ( "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x "vmovups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x "vfmaddps %%xmm4, (%3,%0,4), %%xmm12, %%xmm4 \n\t" // 4 * y "vfmaddps %%xmm5, 16(%3,%0,4), %%xmm13, %%xmm5 \n\t" // 4 * y "vfmaddps %%xmm6, 32(%3,%0,4), %%xmm14, %%xmm6 \n\t" // 4 * y "vfmaddps %%xmm7, 48(%3,%0,4), %%xmm15, %%xmm7 \n\t" // 4 * y "addq $16, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" "vmovss %%xmm4, (%4) \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/sdot_microk_haswell-2.c000066400000000000000000000072521313527062700217270ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16 1 static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) { BLASLONG register i = 0; __asm__ __volatile__ ( "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,4), %%ymm12 \n\t" // 2 * x "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 2 * x "vmovups 64(%2,%0,4), %%ymm14 \n\t" // 2 * x "vmovups 96(%2,%0,4), %%ymm15 \n\t" // 2 * x "vfmadd231ps (%3,%0,4), %%ymm12, %%ymm4 \n\t" // 2 * y "vfmadd231ps 32(%3,%0,4), %%ymm13, %%ymm5 \n\t" // 2 * y "vfmadd231ps 64(%3,%0,4), %%ymm14, %%ymm6 \n\t" // 2 * y "vfmadd231ps 96(%3,%0,4), %%ymm15, %%ymm7 \n\t" // 2 * y "addq $32 , %0 \n\t" "subq $32 , %1 \n\t" "jnz 1b \n\t" "vextractf128 $1 , %%ymm4 , %%xmm12 \n\t" "vextractf128 $1 , %%ymm5 , %%xmm13 \n\t" "vextractf128 $1 , %%ymm6 , %%xmm14 \n\t" "vextractf128 $1 , %%ymm7 , %%xmm15 \n\t" "vaddps %%xmm4, %%xmm12, %%xmm4 \n\t" "vaddps %%xmm5, %%xmm13, %%xmm5 \n\t" "vaddps %%xmm6, %%xmm14, %%xmm6 \n\t" "vaddps %%xmm7, %%xmm15, %%xmm7 \n\t" "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" "vmovss %%xmm4, (%4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/sdot_microk_nehalem-2.c000066400000000000000000000071731313527062700217030ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16 1 static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) { BLASLONG register i = 0; __asm__ __volatile__ ( "xorps %%xmm4, %%xmm4 \n\t" "xorps %%xmm5, %%xmm5 \n\t" "xorps %%xmm6, %%xmm6 \n\t" "xorps %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" "1: \n\t" "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x "movups (%3,%0,4), %%xmm8 \n\t" // 4 * x "movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x "movups 16(%3,%0,4), %%xmm9 \n\t" // 4 * x "movups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x "movups 32(%3,%0,4), %%xmm10 \n\t" // 4 * x "movups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x "movups 48(%3,%0,4), %%xmm11 \n\t" // 4 * x "mulps %%xmm8 , %%xmm12 \n\t" "mulps %%xmm9 , %%xmm13 \n\t" "mulps %%xmm10, %%xmm14 \n\t" "mulps %%xmm11, %%xmm15 \n\t" "addps %%xmm12, %%xmm4 \n\t" "addps %%xmm13, %%xmm5 \n\t" "addps %%xmm14, %%xmm6 \n\t" "addps %%xmm15, %%xmm7 \n\t" "addq $16, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" "addps %%xmm5, %%xmm4 \n\t" "addps %%xmm7, %%xmm6 \n\t" "addps %%xmm6, %%xmm4 \n\t" "haddps %%xmm4, %%xmm4 \n\t" "haddps %%xmm4, %%xmm4 \n\t" "movss %%xmm4, (%4) \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/sdot_microk_sandy-2.c000066400000000000000000000075571313527062700214160ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16 1 static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) { BLASLONG register i = 0; __asm__ __volatile__ ( "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,4), %%ymm12 \n\t" // 2 * x "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 2 * x "vmovups 64(%2,%0,4), %%ymm14 \n\t" // 2 * x "vmovups 96(%2,%0,4), %%ymm15 \n\t" // 2 * x "vmulps (%3,%0,4), %%ymm12, %%ymm12 \n\t" // 2 * y "vmulps 32(%3,%0,4), %%ymm13, %%ymm13 \n\t" // 2 * y "vmulps 64(%3,%0,4), %%ymm14, %%ymm14 \n\t" // 2 * y "vmulps 96(%3,%0,4), %%ymm15, %%ymm15 \n\t" // 2 * y "vaddps %%ymm4 , %%ymm12, %%ymm4 \n\t" // 2 * y "vaddps %%ymm5 , %%ymm13, %%ymm5 \n\t" // 2 * y "vaddps %%ymm6 , %%ymm14, %%ymm6 \n\t" // 2 * y "vaddps %%ymm7 , %%ymm15, %%ymm7 \n\t" // 2 * y "addq $32 , %0 \n\t" "subq $32 , %1 \n\t" "jnz 1b \n\t" "vextractf128 $1 , %%ymm4 , %%xmm12 \n\t" "vextractf128 $1 , %%ymm5 , %%xmm13 \n\t" "vextractf128 $1 , %%ymm6 , %%xmm14 \n\t" "vextractf128 $1 , %%ymm7 , %%xmm15 \n\t" "vaddps %%xmm4, %%xmm12, %%xmm4 \n\t" "vaddps %%xmm5, %%xmm13, %%xmm5 \n\t" "vaddps %%xmm6, %%xmm14, %%xmm6 \n\t" "vaddps %%xmm7, %%xmm15, %%xmm7 \n\t" "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" "vmovss %%xmm4, (%4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/sdot_microk_steamroller-2.c000066400000000000000000000141631313527062700226200ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16 1 static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) { BLASLONG register i = 0; if ( n < 4096 ) { __asm__ __volatile__ ( "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x "vmovups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x "vfmadd231ps (%3,%0,4), %%xmm12, %%xmm4 \n\t" // 4 * y "vfmadd231ps 16(%3,%0,4), %%xmm13, %%xmm5 \n\t" // 4 * y "vmovups 64(%2,%0,4), %%xmm0 \n\t" // 4 * x "vmovups 80(%2,%0,4), %%xmm1 \n\t" // 4 * x "vfmadd231ps 32(%3,%0,4), %%xmm14, %%xmm6 \n\t" // 4 * y "vfmadd231ps 48(%3,%0,4), %%xmm15, %%xmm7 \n\t" // 4 * y "vmovups 96(%2,%0,4), %%xmm2 \n\t" // 4 * x "vmovups 112(%2,%0,4), %%xmm3 \n\t" // 4 * x "vfmadd231ps 64(%3,%0,4), %%xmm0 , %%xmm4 \n\t" // 4 * y "vfmadd231ps 80(%3,%0,4), %%xmm1 , %%xmm5 \n\t" // 4 * y "vfmadd231ps 96(%3,%0,4), %%xmm2 , %%xmm6 \n\t" // 4 * y "vfmadd231ps 112(%3,%0,4), %%xmm3 , %%xmm7 \n\t" // 4 * y "addq $32, %0 \n\t" "subq $32, %1 \n\t" "jnz 1b \n\t" "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" "vmovss %%xmm4, (%4) \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); return; } __asm__ __volatile__ ( "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" "1: \n\t" "prefetcht0 512(%2,%0,4) \n\t" "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x "vmovups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x "prefetcht0 512(%3,%0,4) \n\t" "vfmadd231ps (%3,%0,4), %%xmm12, %%xmm4 \n\t" // 4 * y "vfmadd231ps 16(%3,%0,4), %%xmm13, %%xmm5 \n\t" // 4 * y "prefetcht0 576(%2,%0,4) \n\t" "vmovups 64(%2,%0,4), %%xmm0 \n\t" // 4 * x "vmovups 80(%2,%0,4), %%xmm1 \n\t" // 4 * x "prefetcht0 576(%3,%0,4) \n\t" "vfmadd231ps 32(%3,%0,4), %%xmm14, %%xmm6 \n\t" // 4 * y "vfmadd231ps 48(%3,%0,4), %%xmm15, 
%%xmm7 \n\t" // 4 * y "vmovups 96(%2,%0,4), %%xmm2 \n\t" // 4 * x "vmovups 112(%2,%0,4), %%xmm3 \n\t" // 4 * x "vfmadd231ps 64(%3,%0,4), %%xmm0 , %%xmm4 \n\t" // 4 * y "vfmadd231ps 80(%3,%0,4), %%xmm1 , %%xmm5 \n\t" // 4 * y "vfmadd231ps 96(%3,%0,4), %%xmm2 , %%xmm6 \n\t" // 4 * y "vfmadd231ps 112(%3,%0,4), %%xmm3 , %%xmm7 \n\t" // 4 * y "addq $32, %0 \n\t" "subq $32, %1 \n\t" "jnz 1b \n\t" "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" "vmovss %%xmm4, (%4) \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S000066400000000000000000003522541313527062700230330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define J %r14 #define OLD_K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define K %r12 #define BI %rbp #define SP %rbx #define BO1 %rdi #define BO2 %r15 #ifndef WINDOWS_ABI #define STACKSIZE 96 #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define L_BUFFER_SIZE 8192 #define LB2_OFFSET 4096 #define Ndiv6 24(%rsp) #define Nmod6 32(%rsp) #define N 40(%rsp) #define ALPHA 48(%rsp) #define OFFSET 56(%rsp) #define KK 64(%rsp) #define KKK 72(%rsp) #define BUFFER1 128(%rsp) #define BUFFER2 LB2_OFFSET+128(%rsp) #if defined(OS_WINDOWS) #if L_BUFFER_SIZE > 16384 #define STACK_TOUCH \ movl $0, 4096 * 4(%rsp);\ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 12288 #define STACK_TOUCH \ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 8192 #define STACK_TOUCH \ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 4096 #define STACK_TOUCH \ movl $0, 4096 * 1(%rsp); #else #define STACK_TOUCH #endif #else #define STACK_TOUCH #endif #define A_PR1 384 #define B_PR1 192 /******************************************************************************************* * 3 lines of N *******************************************************************************************/ #define KERNEL16x3_1(xx) \ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ #define KERNEL16x3_2(xx) \ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ #define KERNEL16x3_3(xx) \ vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ #define KERNEL16x3_4(xx) \ vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ addq $12, BI ;\ vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ addq $64, %rax ;\ vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ #define KERNEL16x3_SUB(xx) \ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ /*******************************************************************************************/ #define KERNEL8x3_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ #define KERNEL8x3_2(xx) \ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ #define KERNEL8x3_3(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, SIZE), 
%xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ #define KERNEL8x3_4(xx) \ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ addq $12, BI ;\ addq $32, %rax ;\ #define KERNEL8x3_SUB(xx) \ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ /*******************************************************************************************/ #define KERNEL4x3_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL4x3_2(xx) \ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL4x3_3(xx) \ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL4x3_4(xx) \ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ addq $12, BI ;\ addq $16, %rax ;\ #define KERNEL4x3_SUB(xx) \ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ /*******************************************************************************************/ #define KERNEL2x3_1(xx) \ vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 
;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ #define KERNEL2x3_2(xx) \ vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ #define KERNEL2x3_3(xx) \ vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ #define KERNEL2x3_4(xx) \ vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ addq $12, BI ;\ addq $8, %rax ;\ #define KERNEL2x3_SUB(xx) \ vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ /*******************************************************************************************/ #define KERNEL1x3_1(xx) \ vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL1x3_2(xx) \ vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL1x3_3(xx) \ vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL1x3_4(xx) \ vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ addq $12, BI ;\ addq $4, %rax ;\ #define KERNEL1x3_SUB(xx) \ vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -4 * SIZE(BO, BI, SIZE), 
%xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ /*******************************************************************************************/ /******************************************************************************************* * 2 lines of N *******************************************************************************************/ #define KERNEL16x2_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ #define KERNEL16x2_2(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ #define KERNEL16x2_3(xx) \ prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ #define KERNEL16x2_4(xx) \ prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ addq $8, BI ;\ addq $64, %rax ;\ #define KERNEL16x2_SUB(xx) \ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ 
/*******************************************************************************************/ #define KERNEL8x2_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ #define KERNEL8x2_2(xx) \ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ #define KERNEL8x2_3(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ #define KERNEL8x2_4(xx) \ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ addq $8, BI ;\ addq $32, %rax ;\ #define KERNEL8x2_SUB(xx) \ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ /*******************************************************************************************/ #define KERNEL4x2_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL4x2_2(xx) \ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL4x2_3(xx) \ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL4x2_4(xx) \ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ addq $8, BI ;\ addq $16, %rax ;\ #define KERNEL4x2_SUB(xx) \ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ /*******************************************************************************************/ #define KERNEL2x2_1(xx) \ vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ 
vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ #define KERNEL2x2_2(xx) \ vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ #define KERNEL2x2_3(xx) \ vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ #define KERNEL2x2_4(xx) \ vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ addq $8, BI ;\ addq $8, %rax ;\ #define KERNEL2x2_SUB(xx) \ vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ /*******************************************************************************************/ #define KERNEL1x2_1(xx) \ vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL1x2_2(xx) \ vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL1x2_3(xx) \ vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL1x2_4(xx) \ vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ addq $8, BI ;\ addq $4, %rax ;\ #define KERNEL1x2_SUB(xx) \ vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ /*******************************************************************************************/ /******************************************************************************************* * 1 line of N *******************************************************************************************/ #define KERNEL16x1_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps 
%xmm13,%xmm1,%xmm0,%xmm13 ;\ #define KERNEL16x1_2(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ #define KERNEL16x1_3(xx) \ prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ #define KERNEL16x1_4(xx) \ prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ addq $4, BI ;\ addq $64, %rax ;\ #define KERNEL16x1_SUB(xx) \ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ /*******************************************************************************************/ #define KERNEL8x1_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ #define KERNEL8x1_2(xx) \ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ #define KERNEL8x1_3(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ #define KERNEL8x1_4(xx) \ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ addq $4, BI ;\ addq $32, %rax ;\ #define KERNEL8x1_SUB(xx) \ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ /*******************************************************************************************/ #define KERNEL4x1_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL4x1_2(xx) \ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL4x1_3(xx) \ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL4x1_4(xx) \ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ addq $4, BI ;\ addq $16, %rax ;\ #define KERNEL4x1_SUB(xx) \ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ /*******************************************************************************************/ #define KERNEL2x1_1(xx) \ vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ #define KERNEL2x1_2(xx) \ vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ #define KERNEL2x1_3(xx) \ vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ #define KERNEL2x1_4(xx) \ vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ addq $4, BI ;\ addq $8, %rax ;\ #define KERNEL2x1_SUB(xx) \ vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ /*******************************************************************************************/ #define KERNEL1x1_1(xx) \ vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL1x1_2(xx) \ vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL1x1_3(xx) \ vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL1x1_4(xx) \ vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ addq $4, BI ;\ addq $4, %rax ;\ #define KERNEL1x1_SUB(xx) \ vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ /*******************************************************************************************/ #if !defined(TRMMKERNEL) PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC vmovaps %xmm3, %xmm0 #else movq STACKSIZE + 8(%rsp), LDC #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $0, OLD_M je .L999 cmpq $0, OLD_N 
je .L999 cmpq $0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovsd %xmm0, ALPHA salq $BASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $6, %rdi divq %rdi // N / 6 movq %rax, Ndiv6 // N / 6 movq %rdx, Nmod6 // N % 6 movq Ndiv6, J cmpq $0, J je .L2_0 ALIGN_4 .L6_01: // copy to sub buffer movq K, %rax salq $1,%rax // K * 2 ; read 2 values movq B, BO1 leaq (B,%rax, SIZE), BO2 // next offset to BO2 leaq BUFFER1, BO // first buffer to BO movq K, %rax sarq $3 , %rax // K / 8 jz .L6_01a_2 ALIGN_4 .L6_01a_1: prefetcht0 512(BO1) prefetcht0 512(BO2) prefetchw 512(BO) vmovsd 0 * SIZE(BO1), %xmm0 vmovsd 2 * SIZE(BO1), %xmm2 vmovsd 4 * SIZE(BO1), %xmm4 vmovsd 6 * SIZE(BO1), %xmm6 vmovss 0 * SIZE(BO2), %xmm1 vmovss 2 * SIZE(BO2), %xmm3 vmovss 4 * SIZE(BO2), %xmm5 vmovss 6 * SIZE(BO2), %xmm7 vmovsd %xmm0, 0*SIZE(BO) vmovss %xmm1, 2*SIZE(BO) vmovsd %xmm2, 3*SIZE(BO) vmovss %xmm3, 5*SIZE(BO) vmovsd %xmm4, 6*SIZE(BO) vmovss %xmm5, 8*SIZE(BO) vmovsd %xmm6, 9*SIZE(BO) vmovss %xmm7,11*SIZE(BO) addq $8*SIZE,BO1 addq $8*SIZE,BO2 addq $12*SIZE,BO vmovsd 0 * SIZE(BO1), %xmm0 vmovsd 2 * SIZE(BO1), %xmm2 vmovsd 4 * SIZE(BO1), %xmm4 vmovsd 6 * SIZE(BO1), %xmm6 vmovss 0 * SIZE(BO2), %xmm1 vmovss 2 * SIZE(BO2), %xmm3 vmovss 4 * SIZE(BO2), %xmm5 vmovss 6 * SIZE(BO2), %xmm7 vmovsd %xmm0, 0*SIZE(BO) vmovss %xmm1, 2*SIZE(BO) vmovsd %xmm2, 3*SIZE(BO) vmovss %xmm3, 5*SIZE(BO) vmovsd %xmm4, 6*SIZE(BO) vmovss %xmm5, 8*SIZE(BO) vmovsd %xmm6, 9*SIZE(BO) vmovss %xmm7,11*SIZE(BO) addq $8*SIZE,BO1 addq $8*SIZE,BO2 addq $12*SIZE,BO decq %rax jnz .L6_01a_1 .L6_01a_2: movq K, %rax andq $7, %rax // K % 8 jz .L6_02c ALIGN_4 .L6_02b: vmovsd 0 * SIZE(BO1), %xmm0 vmovss 0 * SIZE(BO2), %xmm2 vmovsd %xmm0, 0*SIZE(BO) vmovss %xmm2, 2*SIZE(BO) addq $2*SIZE,BO1 addq $2*SIZE,BO2 addq $3*SIZE,BO decq %rax jnz .L6_02b .L6_02c: movq K, %rax salq $1,%rax // K * 2 leaq (B,%rax, SIZE), BO1 // next offset to BO1 leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 leaq BUFFER2, BO // second buffer to BO movq K, %rax sarq $3 , %rax // K / 8 jz .L6_02c_2 ALIGN_4 .L6_02c_1: prefetcht0 512(BO2) prefetchw 512(BO) vmovsd 0 * SIZE(BO2), %xmm0 vmovsd 2 * SIZE(BO2), %xmm2 vmovsd 4 * SIZE(BO2), %xmm4 vmovsd 6 * SIZE(BO2), %xmm6 vmovss 1 * SIZE(BO1), %xmm1 vmovss 3 * SIZE(BO1), %xmm3 vmovss 5 * SIZE(BO1), %xmm5 vmovss 7 * SIZE(BO1), %xmm7 vmovss %xmm1, 0*SIZE(BO) vmovsd %xmm0, 1*SIZE(BO) vmovss %xmm3, 3*SIZE(BO) vmovsd %xmm2, 4*SIZE(BO) vmovss %xmm5, 6*SIZE(BO) vmovsd %xmm4, 7*SIZE(BO) vmovss %xmm7, 9*SIZE(BO) vmovsd %xmm6,10*SIZE(BO) addq $8*SIZE,BO1 addq $8*SIZE,BO2 addq $12*SIZE,BO vmovsd 0 * SIZE(BO2), %xmm0 vmovsd 2 * SIZE(BO2), %xmm2 vmovsd 4 * SIZE(BO2), %xmm4 vmovsd 6 * SIZE(BO2), %xmm6 vmovss 1 * SIZE(BO1), %xmm1 vmovss 3 * SIZE(BO1), %xmm3 vmovss 5 * SIZE(BO1), %xmm5 vmovss 7 * SIZE(BO1), %xmm7 vmovss %xmm1, 0*SIZE(BO) vmovsd %xmm0, 1*SIZE(BO) vmovss %xmm3, 3*SIZE(BO) vmovsd %xmm2, 4*SIZE(BO) vmovss %xmm5, 6*SIZE(BO) vmovsd %xmm4, 7*SIZE(BO) vmovss %xmm7, 9*SIZE(BO) vmovsd %xmm6,10*SIZE(BO) addq $8*SIZE,BO1 addq $8*SIZE,BO2 addq $12*SIZE,BO decq %rax jnz .L6_02c_1 .L6_02c_2: movq K, %rax andq $7, %rax // K % 8 jz .L6_03c ALIGN_4 .L6_03b: vmovss 1*SIZE(BO1), %xmm0 vmovsd 0*SIZE(BO2), %xmm1 vmovss %xmm0, 0*SIZE(BO) vmovsd %xmm1, 1*SIZE(BO) addq $2*SIZE,BO1 addq $2*SIZE,BO2 addq $3*SIZE,BO decq %rax jnz .L6_03b .L6_03c: movq BO2, B // next offset of B .L6_10: movq C, CO1 leaq (C, LDC, 2), C leaq (C, LDC, 1), C // c += 3 * ldc movq A, AO // aoffset = a addq $32 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L6_20 ALIGN_4 .L6_11: leaq 
BUFFER1, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax // K = K - ( K % 8 ) je .L6_16 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_12: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x3_1(xxx) KERNEL16x3_2(xxx) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) KERNEL16x3_1(xxx) prefetcht0 B_PR1+16(BO,BI, SIZE) KERNEL16x3_2(xxx) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) je .L6_16 KERNEL16x3_1(xxx) KERNEL16x3_2(xxx) prefetcht0 B_PR1+32(BO,BI, SIZE) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) KERNEL16x3_1(xxx) KERNEL16x3_2(xxx) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) je .L6_16 jmp .L6_12 ALIGN_4 .L6_16: movq K, %rax andq $7, %rax # if (k & 1) je .L6_19 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_17: KERNEL16x3_SUB(xxx) addq $3, BI addq $16, %rax jl .L6_17 ALIGN_4 .L6_19: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm10, 8 * SIZE(CO1) vmovups %xmm13,12 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) vmovups %xmm11, 8 * SIZE(CO1, LDC) vmovups %xmm14,12 * SIZE(CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) vmovups %xmm15,12 * SIZE(CO1, LDC, 2) addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L6_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L6_20: // Test rest of M testq $15, M jz .L7_10 // to next 3 lines of N testq $8, M jz .L6_21pre ALIGN_4 /**************************************************************************/ .L6_20_1: leaq BUFFER1, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L6_20_6 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_20_2: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) KERNEL8x3_1(xxx) prefetcht0 B_PR1+16(BO,BI, SIZE) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) je .L6_20_6 KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) prefetcht0 B_PR1+32(BO,BI, SIZE) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) je .L6_20_6 jmp .L6_20_2 ALIGN_4 .L6_20_6: movq K, %rax andq $7, %rax # if (k & 1) je .L6_20_9 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_20_7: KERNEL8x3_SUB(xxx) addq $3, BI addq 
$8, %rax jl .L6_20_7 ALIGN_4 .L6_20_9: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L6_21pre: testq $4, M jz .L6_30 ALIGN_4 .L6_21: leaq BUFFER1, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L6_26 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_22: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) prefetcht0 B_PR1+16(BO,BI, SIZE) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L6_26 KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) prefetcht0 B_PR1+32(BO,BI, SIZE) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L6_26 jmp .L6_22 ALIGN_4 .L6_26: movq K, %rax andq $7, %rax # if (k & 1) je .L6_29 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_27: KERNEL4x3_SUB(xxx) addq $3, BI addq $4, %rax jl .L6_27 ALIGN_4 .L6_29: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L6_30: testq $2, M jz .L6_40 ALIGN_4 .L6_31: leaq BUFFER1, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L6_36 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_32: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) prefetcht0 B_PR1+16(BO,BI,SIZE) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L6_36 KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) prefetcht0 B_PR1+32(BO,BI,SIZE) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L6_36 jmp .L6_32 ALIGN_4 .L6_36: movq K, %rax andq $7, %rax # if (k & 1) je .L6_39 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_37: KERNEL2x3_SUB(xxx) addq $3, BI addq $2, %rax jl .L6_37 ALIGN_4 .L6_39: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 vmovss %xmm4 , (CO1) vmovss %xmm8 , 1 * SIZE(CO1) vmovss %xmm5 , (CO1, LDC) vmovss %xmm10, 1 * SIZE(CO1, LDC) vmovss %xmm6 , (CO1, LDC, 2) vmovss 
%xmm12, 1 * SIZE(CO1, LDC, 2) addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L6_40: testq $1, M jz .L7_10 // to next 3 lines of N ALIGN_4 .L6_41: leaq BUFFER1, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L6_46 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_42: KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L6_46 KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L6_46 jmp .L6_42 ALIGN_4 .L6_46: movq K, %rax andq $7, %rax # if (k & 1) je .L6_49 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_47: KERNEL1x3_SUB(xxx) addq $3, BI addq $1, %rax jl .L6_47 ALIGN_4 .L6_49: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vmovss %xmm4 , (CO1) vmovss %xmm5 , (CO1, LDC) vmovss %xmm6 , (CO1, LDC, 2) addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 /***************************************************************************************************************/ .L7_10: movq C, CO1 leaq (C, LDC, 2), C leaq (C, LDC, 1), C // c += 3 * ldc movq A, AO // aoffset = a addq $32 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L7_20 ALIGN_4 .L7_11: leaq BUFFER2, BO // second buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax // K = K - ( K % 8 ) je .L7_16 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_12: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x3_1(xxx) KERNEL16x3_2(xxx) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) KERNEL16x3_1(xxx) prefetcht0 B_PR1+16(BO,BI, SIZE) KERNEL16x3_2(xxx) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) je .L7_16 KERNEL16x3_1(xxx) KERNEL16x3_2(xxx) prefetcht0 B_PR1+32(BO,BI, SIZE) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) KERNEL16x3_1(xxx) KERNEL16x3_2(xxx) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) je .L7_16 jmp .L7_12 ALIGN_4 .L7_16: movq K, %rax andq $7, %rax # if (k & 1) je .L7_19 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_17: KERNEL16x3_SUB(xxx) addq $3, BI addq $16, %rax jl .L7_17 ALIGN_4 .L7_19: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm10, 8 * SIZE(CO1) vmovups %xmm13,12 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) vmovups %xmm11, 8 * SIZE(CO1, LDC) vmovups %xmm14,12 * SIZE(CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) vmovups %xmm9 , 
4 * SIZE(CO1, LDC, 2) vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) vmovups %xmm15,12 * SIZE(CO1, LDC, 2) addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L7_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L7_20: // Test rest of M testq $15, M jz .L7_60 // to next 3 lines of N testq $8, M jz .L7_21pre ALIGN_4 /**************************************************************************/ .L7_20_1: leaq BUFFER2, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_20_6 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_20_2: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) KERNEL8x3_1(xxx) prefetcht0 B_PR1+16(BO,BI, SIZE) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) je .L7_20_6 KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) prefetcht0 B_PR1+32(BO,BI, SIZE) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) je .L7_20_6 jmp .L7_20_2 ALIGN_4 .L7_20_6: movq K, %rax andq $7, %rax # if (k & 1) je .L7_20_9 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_20_7: KERNEL8x3_SUB(xxx) addq $3, BI addq $8, %rax jl .L7_20_7 ALIGN_4 .L7_20_9: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L7_21pre: testq $4, M jz .L7_30 ALIGN_4 .L7_21: leaq BUFFER2, BO // second buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_26 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_22: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) prefetcht0 B_PR1+16(BO,BI, SIZE) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L7_26 KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) prefetcht0 B_PR1+32(BO,BI, SIZE) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L7_26 jmp .L7_22 ALIGN_4 .L7_26: movq K, %rax andq $7, %rax # if (k & 1) je .L7_29 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_27: KERNEL4x3_SUB(xxx) addq $3, BI addq $4, %rax jl .L7_27 ALIGN_4 .L7_29: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) addq 
$4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L7_30: testq $2, M jz .L7_40 ALIGN_4 .L7_31: leaq BUFFER2, BO // second buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_36 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_32: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) prefetcht0 B_PR1+16(BO,BI,SIZE) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L7_36 KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) prefetcht0 B_PR1+32(BO,BI,SIZE) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L7_36 jmp .L7_32 ALIGN_4 .L7_36: movq K, %rax andq $7, %rax # if (k & 1) je .L7_39 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_37: KERNEL2x3_SUB(xxx) addq $3, BI addq $2, %rax jl .L7_37 ALIGN_4 .L7_39: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 vmovss %xmm4 , (CO1) vmovss %xmm8 , 1 * SIZE(CO1) vmovss %xmm5 , (CO1, LDC) vmovss %xmm10, 1 * SIZE(CO1, LDC) vmovss %xmm6 , (CO1, LDC, 2) vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L7_40: testq $1, M jz .L7_60 // to next 3 lines of N ALIGN_4 .L7_41: leaq BUFFER2, BO // second buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_46 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_42: KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L7_46 KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L7_46 jmp .L7_42 ALIGN_4 .L7_46: movq K, %rax andq $7, %rax # if (k & 1) je .L7_49 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_47: KERNEL1x3_SUB(xxx) addq $3, BI addq $1, %rax jl .L7_47 ALIGN_4 .L7_49: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vmovss %xmm4 , (CO1) vmovss %xmm5 , (CO1, LDC) vmovss %xmm6 , (CO1, LDC, 2) addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L7_60: decq J // j -- jg .L6_01 .L2_0: cmpq $0, Nmod6 // N % 6 == 0 je .L999 /************************************************************************************************ * Loop for Nmod6 / 2 > 0 *************************************************************************************************/ movq Nmod6, J sarq $1, J // j = j / 2 je .L1_0 ALIGN_4 .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L2_02b: vmovsd (BO1), %xmm0 vmovsd %xmm0, (BO) addq $2*SIZE,BO1 addq $2*SIZE,BO decq %rax jnz .L2_02b .L2_02c: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc movq A, AO // aoffset = a addq $32 * 
SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L2_20 ALIGN_4 .L2_11: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax // K = K - ( K % 8 ) je .L2_16 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_12: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) je .L2_16 prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) je .L2_16 jmp .L2_12 ALIGN_4 .L2_16: movq K, %rax andq $7, %rax # if (k & 1) je .L2_19 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_17: KERNEL16x2_SUB(xxx) addq $2, BI addq $16, %rax jl .L2_17 ALIGN_4 .L2_19: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm10, 8 * SIZE(CO1) vmovups %xmm13,12 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) vmovups %xmm11, 8 * SIZE(CO1, LDC) vmovups %xmm14,12 * SIZE(CO1, LDC) addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_20: // Test rest of M testq $15, M jz .L2_60 // to next 3 lines of N testq $8, M jz .L2_21pre ALIGN_4 /**************************************************************************/ .L2_20_1: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_20_6 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_20_2: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) je .L2_20_6 prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) je .L2_20_6 jmp .L2_20_2 ALIGN_4 .L2_20_6: movq K, %rax andq $7, %rax # if (k & 1) je .L2_20_9 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_20_7: KERNEL8x2_SUB(xxx) addq $2, BI addq $8, %rax jl .L2_20_7 ALIGN_4 .L2_20_9: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, 
LDC) addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L2_21pre: testq $4, M jz .L2_30 ALIGN_4 .L2_21: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_26 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 1 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_22: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_26 prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_26 jmp .L2_22 ALIGN_4 .L2_26: movq K, %rax andq $7, %rax # if (k & 1) je .L2_29 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_27: KERNEL4x2_SUB(xxx) addq $2, BI addq $4, %rax jl .L2_27 ALIGN_4 .L2_29: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L2_30: testq $2, M jz .L2_40 ALIGN_4 .L2_31: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_36 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_32: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_36 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_36 jmp .L2_32 ALIGN_4 .L2_36: movq K, %rax andq $7, %rax # if (k & 1) je .L2_39 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_37: KERNEL2x2_SUB(xxx) addq $2, BI addq $2, %rax jl .L2_37 ALIGN_4 .L2_39: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 vmovss %xmm4 , (CO1) vmovss %xmm8 , 1 * SIZE(CO1) vmovss %xmm5 , (CO1, LDC) vmovss %xmm10, 1 * SIZE(CO1, LDC) addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L2_40: testq $1, M jz .L2_60 // to next 2 lines of N ALIGN_4 .L2_41: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_46 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_42: KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 jmp .L2_42 ALIGN_4 .L2_46: movq K, %rax andq $7, %rax # if (k & 1) je .L2_49 movq %rax, BI // 
Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_47: KERNEL1x2_SUB(xxx) addq $2, BI addq $1, %rax jl .L2_47 ALIGN_4 .L2_49: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 vmovss %xmm4 , (CO1) vmovss %xmm5 , (CO1, LDC) addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L2_60: decq J // j -- jg .L2_01 // next 2 lines of N .L1_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovss (BO1), %xmm0 vmovss %xmm0, (BO) addq $1*SIZE,BO1 addq $1*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc movq A, AO // aoffset = a addq $32 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L1_20 ALIGN_4 .L1_11: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax // K = K - ( K % 8 ) je .L1_16 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_12: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) je .L1_16 KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) je .L1_16 jmp .L1_12 ALIGN_4 .L1_16: movq K, %rax andq $7, %rax # if (k & 1) je .L1_19 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_17: KERNEL16x1_SUB(xxx) addq $1, BI addq $16, %rax jl .L1_17 ALIGN_4 .L1_19: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm10, 8 * SIZE(CO1) vmovups %xmm13,12 * SIZE(CO1) addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L1_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_20: // Test rest of M testq $15, M jz .L999 testq $8, M jz .L1_21pre ALIGN_4 /**************************************************************************/ .L1_20_1: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_20_6 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_2: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) je .L1_20_6 KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) je .L1_20_6 jmp .L1_20_2 ALIGN_4 .L1_20_6: movq K, %rax andq $7, %rax # if (k & 1) je .L1_20_9 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of 
values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_7: KERNEL8x1_SUB(xxx) addq $1, BI addq $8, %rax jl .L1_20_7 ALIGN_4 .L1_20_9: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L1_21pre: testq $4, M jz .L1_30 ALIGN_4 .L1_21: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_26 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_22: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_26 KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_26 jmp .L1_22 ALIGN_4 .L1_26: movq K, %rax andq $7, %rax # if (k & 1) je .L1_29 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_27: KERNEL4x1_SUB(xxx) addq $1, BI addq $4, %rax jl .L1_27 ALIGN_4 .L1_29: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vmovups %xmm4 , (CO1) addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L1_30: testq $2, M jz .L1_40 ALIGN_4 .L1_31: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_36 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_32: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_36 KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_36 jmp .L1_32 ALIGN_4 .L1_36: movq K, %rax andq $7, %rax # if (k & 1) je .L1_39 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_37: KERNEL2x1_SUB(xxx) addq $1, BI addq $2, %rax jl .L1_37 ALIGN_4 .L1_39: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 vmovss %xmm4 , (CO1) vmovss %xmm8 , 1 * SIZE(CO1) addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L1_40: testq $1, M jz .L999 ALIGN_4 .L1_41: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_46 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_42: KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 jmp .L1_42 ALIGN_4 .L1_46: movq K, %rax andq $7, %rax # if (k & 1) je .L1_49 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_47: KERNEL1x1_SUB(xxx) addq $1, BI addq $1, %rax jl .L1_47 ALIGN_4 .L1_49: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vmovss %xmm4 , (CO1) addq $1 
* SIZE, CO1 # coffset += 1 ALIGN_4 .L999: movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE #else /************************************************************************************* * TRMM Kernel *************************************************************************************/ PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif vmovaps %xmm3, %xmm0 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL movsd STACKSIZE + 16(%rsp), %xmm12 #endif #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $0, OLD_M je .L999 cmpq $0, OLD_N je .L999 cmpq $0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovsd %xmm0, ALPHA salq $BASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $2, %rdi divq %rdi // N / 6 movq %rax, Ndiv6 // N / 6 movq %rdx, Nmod6 // N % 6 #ifdef TRMMKERNEL vmovsd %xmm12, OFFSET vmovsd %xmm12, KK #ifndef LEFT negq KK #endif #endif movq Ndiv6, J cmpq $0, J je .L1_0 ALIGN_4 .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L2_02b: vmovsd (BO1), %xmm0 vmovsd %xmm0, (BO) addq $2*SIZE,BO1 addq $2*SIZE,BO decq %rax jnz .L2_02b .L2_02c: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $32 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L2_20 ALIGN_4 .L2_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $16, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L2_16 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_12: prefetcht0 B_PR1(BO,BI, SIZE) 
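	// Note on the unrolled loop body that follows: each KERNEL16x2_x macro
	// performs the FMA4 (vfmaddps) updates for one k step of a 16x2 tile of C,
	// reading packed A through (AO,%rax,SIZE) and packed B through (BO,BI,SIZE).
	// %rax and BI were negated above and are advanced inside the kernel macros,
	// so the "je .L2_16" drops to the k%8 tail once the counter in %rax wraps
	// back to zero; the prefetcht0 above stages the next chunk of the packed B
	// buffer ahead of use.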
KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) je .L2_16 prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) je .L2_16 jmp .L2_12 ALIGN_4 .L2_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_19 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_17: KERNEL16x2_SUB(xxx) addq $2, BI addq $16, %rax jl .L2_17 ALIGN_4 .L2_19: vbroadcastss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 #else vmulps %xmm0, %xmm4,%xmm4 vmulps %xmm0, %xmm7,%xmm7 vmulps %xmm0, %xmm10,%xmm10 vmulps %xmm0, %xmm13,%xmm13 vmulps %xmm0, %xmm5,%xmm5 vmulps %xmm0, %xmm8,%xmm8 vmulps %xmm0, %xmm11,%xmm11 vmulps %xmm0, %xmm14,%xmm14 #endif vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm10, 8 * SIZE(CO1) vmovups %xmm13,12 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) vmovups %xmm11, 8 * SIZE(CO1, LDC) vmovups %xmm14,12 * SIZE(CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $16, KK #endif addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_20: // Test rest of M testq $15, M jz .L2_60 // to next 3 lines of N testq $8, M jz .L2_21pre ALIGN_4 /**************************************************************************/ .L2_20_1: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax // number of values in A #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_20_6 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_20_2: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) 
KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) je .L2_20_6 prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) je .L2_20_6 jmp .L2_20_2 ALIGN_4 .L2_20_6: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_20_9 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_20_7: KERNEL8x2_SUB(xxx) addq $2, BI addq $8, %rax jl .L2_20_7 ALIGN_4 .L2_20_9: vbroadcastss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 #else vmulps %xmm0, %xmm4,%xmm4 vmulps %xmm0, %xmm7,%xmm7 vmulps %xmm0, %xmm5,%xmm5 vmulps %xmm0, %xmm8,%xmm8 #endif vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L2_21pre: testq $4, M jz .L2_30 ALIGN_4 .L2_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in A #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_26 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 1 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_22: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_26 prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_26 jmp .L2_22 ALIGN_4 .L2_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_29 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_27: KERNEL4x2_SUB(xxx) addq $2, BI addq $4, %rax jl .L2_27 ALIGN_4 .L2_29: vbroadcastss ALPHA, %xmm0 
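	// Note: %xmm0 now holds the broadcast ALPHA. The GEMM build below uses the
	// FMA4 form vfmaddps, i.e. C = alpha * acc + C in a single instruction,
	// while the TRMM build only scales the accumulators with vmulps and does
	// not read the previous contents of C.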
#ifndef TRMMKERNEL vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 #else vmulps %xmm0, %xmm4,%xmm4 vmulps %xmm0, %xmm5,%xmm5 #endif vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L2_30: testq $2, M jz .L2_40 ALIGN_4 .L2_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_36 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_32: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_36 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_36 jmp .L2_32 ALIGN_4 .L2_36: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_39 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_37: KERNEL2x2_SUB(xxx) addq $2, BI addq $2, %rax jl .L2_37 ALIGN_4 .L2_39: vmovss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 #else vmulss %xmm0, %xmm4,%xmm4 vmulss %xmm0, %xmm8,%xmm8 vmulss %xmm0, %xmm5,%xmm5 vmulss %xmm0, %xmm10,%xmm10 #endif vmovss %xmm4 , (CO1) vmovss %xmm8 , 1 * SIZE(CO1) vmovss %xmm5 , (CO1, LDC) vmovss %xmm10, 1 * SIZE(CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L2_40: testq $1, M jz .L2_60 // to next 2 lines of N ALIGN_4 .L2_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_46 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_42: KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 jmp .L2_42 ALIGN_4 .L2_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_49 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_47: KERNEL1x2_SUB(xxx) addq $2, BI addq $1, %rax jl .L2_47 ALIGN_4 .L2_49: vmovss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 #else vmulss %xmm0, %xmm4,%xmm4 vmulss %xmm0, %xmm5,%xmm5 #endif vmovss %xmm4 , (CO1) vmovss %xmm5 , (CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L2_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif decq J // j -- jg .L2_01 // next 2 lines of N .L1_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovss (BO1), %xmm0 vmovss %xmm0, (BO) addq $1*SIZE,BO1 addq $1*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $32 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L1_20 ALIGN_4 .L1_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq 
K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $16, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L1_16 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_12: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) je .L1_16 KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) je .L1_16 jmp .L1_12 ALIGN_4 .L1_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_19 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_17: KERNEL16x1_SUB(xxx) addq $1, BI addq $16, %rax jl .L1_17 ALIGN_4 .L1_19: vbroadcastss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 #else vmulps %xmm0, %xmm4,%xmm4 vmulps %xmm0, %xmm7,%xmm7 vmulps %xmm0, %xmm10,%xmm10 vmulps %xmm0, %xmm13,%xmm13 #endif vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm10, 8 * SIZE(CO1) vmovups %xmm13,12 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $16, KK #endif addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L1_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_20: // Test rest of M testq $15, M jz .L999 testq $8, M jz .L1_21pre ALIGN_4 /**************************************************************************/ .L1_20_1: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax // number of values in A #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_20_6 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_2: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) je .L1_20_6 KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) je .L1_20_6 jmp 
.L1_20_2 ALIGN_4 .L1_20_6: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_20_9 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_7: KERNEL8x1_SUB(xxx) addq $1, BI addq $8, %rax jl .L1_20_7 ALIGN_4 .L1_20_9: vbroadcastss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 #else vmulps %xmm0, %xmm4,%xmm4 vmulps %xmm0, %xmm7,%xmm7 #endif vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L1_21pre: testq $4, M jz .L1_30 ALIGN_4 .L1_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in A #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_26 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_22: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_26 KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_26 jmp .L1_22 ALIGN_4 .L1_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_29 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_27: KERNEL4x1_SUB(xxx) addq $1, BI addq $4, %rax jl .L1_27 ALIGN_4 .L1_29: vbroadcastss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddps (CO1),%xmm0, %xmm4,%xmm4 #else vmulps %xmm0, %xmm4,%xmm4 #endif vmovups %xmm4 , (CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L1_30: testq $2, M jz .L1_40 ALIGN_4 .L1_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq 
BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_36 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_32: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_36 KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_36 jmp .L1_32 ALIGN_4 .L1_36: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_39 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_37: KERNEL2x1_SUB(xxx) addq $1, BI addq $2, %rax jl .L1_37 ALIGN_4 .L1_39: vmovss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 #else vmulss %xmm0, %xmm4,%xmm4 vmulss %xmm0, %xmm8,%xmm8 #endif vmovss %xmm4 , (CO1) vmovss %xmm8 , 1 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L1_40: testq $1, M jz .L999 ALIGN_4 .L1_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_46 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_42: KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 jmp .L1_42 ALIGN_4 .L1_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_49 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_47: KERNEL1x1_SUB(xxx) addq $1, BI addq $1, %rax jl .L1_47 ALIGN_4 .L1_49: vmovss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddss (CO1),%xmm0, %xmm4,%xmm4 #else vmulss 
%xmm0, %xmm4,%xmm4 #endif vmovss %xmm4 , (CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L999: movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE #endif OpenBLAS-0.2.20/kernel/x86_64/sgemm_kernel_16x2_piledriver.S000066400000000000000000003533751313527062700232030ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /********************************************************************* * * 2013/10/18 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * * 2013/10/29 Saar * * Parameter: * UNROLL_M 16 * UNROLL_N 2 * SGEMM_P 768 * SGEMM_Q 192 * SGEMM_R 12288 * A_PR1 384 * B_PR1 192 * * Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): * * 6144x6144 168.2 GFLOPS with 8 threads on 4 modules (ACML: 158.0 ) (BULLDOZER: 167.4 ) * 6144x6144 162.7 GFLOPS with 4 threads on 4 modules (ACML: 157.6 ) (BULLDOZER: 159.0 ) * 6144x6144 82.0 GFLOPS with 2 threads on 2 modules (ACML: 81.4 ) (BULLDOZER: 80.3 ) * 6144x6144 41.3 GFLOPS with 1 threads on 1 modules (ACML: 41.1 ) (BULLDOZER: 40.4 ) * * Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): * * 12288x12288 469.5 GFLOPS with 32 threads on 16 modules (ACML: 375.3 ) (BULLDOZER: 445.5 ) * 12288x12288 442.9 GFLOPS with 16 threads on 16 modules (ACML: 378.5 ) (BULLDOZER: 416.3 ) * 12288x12288 265.1 GFLOPS with 8 threads on 8 modules (ACML: 218.5 ) (BULLDOZER: 261.5 ) * 6144x6144 139.7 GFLOPS with 4 threads on 4 modules (ACML: 116.0 ) (BULLDOZER: 137.7 ) * 6144x6144 70.9 GFLOPS with 2 threads on 2 modules (ACML: 67.4 ) (BULLDOZER: 69.5 ) * 6144x6144 35.6 GFLOPS with 1 threads on 1 modules (ACML: 36.1 ) (BULLDOZER: 35.1 ) * *********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define J %r14 #define OLD_K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define K %r12 #define BI %rbp #define SP %rbx #define BO1 %rdi #define BO2 %r15 #ifndef WINDOWS_ABI #define STACKSIZE 96 #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define L_BUFFER_SIZE 8192 #define LB2_OFFSET 4096 #define Ndiv6 24(%rsp) #define Nmod6 32(%rsp) #define N 40(%rsp) #define ALPHA 48(%rsp) #define OFFSET 56(%rsp) #define KK 64(%rsp) #define KKK 72(%rsp) #define BUFFER1 128(%rsp) #define BUFFER2 LB2_OFFSET+128(%rsp) #if defined(OS_WINDOWS) #if L_BUFFER_SIZE > 16384 #define STACK_TOUCH \ movl $0, 4096 * 4(%rsp);\ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 12288 #define STACK_TOUCH \ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 8192 #define STACK_TOUCH \ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 4096 #define STACK_TOUCH \ movl $0, 4096 * 1(%rsp); #else #define STACK_TOUCH #endif #else #define STACK_TOUCH #endif #define A_PR1 384 #define B_PR1 192 /******************************************************************************************* * 3 lines of N *******************************************************************************************/ #define KERNEL16x3_1(xx) \ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vfmaddps 
%xmm9,%xmm3,%xmm0,%xmm9 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ #define KERNEL16x3_2(xx) \ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ #define KERNEL16x3_3(xx) \ vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ #define KERNEL16x3_4(xx) \ vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ addq $12, BI ;\ vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ addq $64, %rax ;\ vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ #define KERNEL16x3_SUB(xx) \ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 
;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ /*******************************************************************************************/ #define KERNEL8x3_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ #define KERNEL8x3_2(xx) \ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ #define KERNEL8x3_3(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ #define KERNEL8x3_4(xx) \ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ addq $12, BI ;\ addq $32, %rax ;\ #define KERNEL8x3_SUB(xx) \ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ /*******************************************************************************************/ #define KERNEL4x3_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL4x3_2(xx) \ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL4x3_3(xx) \ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ 
vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL4x3_4(xx) \ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ addq $12, BI ;\ addq $16, %rax ;\ #define KERNEL4x3_SUB(xx) \ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ /*******************************************************************************************/ #define KERNEL2x3_1(xx) \ vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ #define KERNEL2x3_2(xx) \ vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ #define KERNEL2x3_3(xx) \ vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ #define KERNEL2x3_4(xx) \ vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ addq $12, BI ;\ addq $8, %rax ;\ #define KERNEL2x3_SUB(xx) \ vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ /*******************************************************************************************/ #define KERNEL1x3_1(xx) \ vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL1x3_2(xx) \ vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL1x3_3(xx) \ vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL1x3_4(xx) \ vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ addq $12, BI ;\ addq $4, %rax ;\ #define KERNEL1x3_SUB(xx) \ vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ /*******************************************************************************************/ /******************************************************************************************* * 2 lines of N *******************************************************************************************/ #define KERNEL16x2_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ #define KERNEL16x2_2(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ #define KERNEL16x2_3(xx) \ prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps 
%xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ #define KERNEL16x2_4(xx) \ prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ addq $8, BI ;\ addq $64, %rax ;\ #define KERNEL16x2_SUB(xx) \ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ /*******************************************************************************************/ #define KERNEL8x2_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ #define KERNEL8x2_2(xx) \ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ #define KERNEL8x2_3(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ #define KERNEL8x2_4(xx) \ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ addq $8, BI ;\ addq $32, %rax ;\ #define KERNEL8x2_SUB(xx) \ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ /*******************************************************************************************/ #define KERNEL4x2_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL4x2_2(xx) \ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL4x2_3(xx) \ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL4x2_4(xx) \ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ addq $8, BI ;\ addq $16, %rax ;\ #define KERNEL4x2_SUB(xx) \ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ /*******************************************************************************************/ #define KERNEL2x2_1(xx) \ vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ #define KERNEL2x2_2(xx) \ vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ #define KERNEL2x2_3(xx) \ vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ #define KERNEL2x2_4(xx) \ vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ addq $8, BI ;\ addq $8, %rax ;\ #define KERNEL2x2_SUB(xx) \ vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ /*******************************************************************************************/ #define KERNEL1x2_1(xx) \ vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL1x2_2(xx) \ vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL1x2_3(xx) \ vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -30 * SIZE(AO, 
%rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL1x2_4(xx) \ vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ addq $8, BI ;\ addq $4, %rax ;\ #define KERNEL1x2_SUB(xx) \ vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ /*******************************************************************************************/ /******************************************************************************************* * 1 line of N *******************************************************************************************/ #define KERNEL16x1_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ #define KERNEL16x1_2(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ #define KERNEL16x1_3(xx) \ prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ #define KERNEL16x1_4(xx) \ prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ addq $4, BI ;\ addq $64, %rax ;\ #define KERNEL16x1_SUB(xx) \ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ /*******************************************************************************************/ #define KERNEL8x1_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ #define KERNEL8x1_2(xx) \ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -24 * SIZE(AO, %rax, 
SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ #define KERNEL8x1_3(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ #define KERNEL8x1_4(xx) \ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ addq $4, BI ;\ addq $32, %rax ;\ #define KERNEL8x1_SUB(xx) \ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ /*******************************************************************************************/ #define KERNEL4x1_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL4x1_2(xx) \ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL4x1_3(xx) \ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL4x1_4(xx) \ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ addq $4, BI ;\ addq $16, %rax ;\ #define KERNEL4x1_SUB(xx) \ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ /*******************************************************************************************/ #define KERNEL2x1_1(xx) \ vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ #define KERNEL2x1_2(xx) \ vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ #define KERNEL2x1_3(xx) \ vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ #define KERNEL2x1_4(xx) \ vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ addq $4, BI ;\ addq $8, %rax ;\ #define KERNEL2x1_SUB(xx) \ vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ /*******************************************************************************************/ #define KERNEL1x1_1(xx) \ vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL1x1_2(xx) \ vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL1x1_3(xx) \ vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ 
vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL1x1_4(xx) \ vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ addq $4, BI ;\ addq $4, %rax ;\ #define KERNEL1x1_SUB(xx) \ vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ /*******************************************************************************************/ #if !defined(TRMMKERNEL) PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC vmovaps %xmm3, %xmm0 #else movq STACKSIZE + 8(%rsp), LDC #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $0, OLD_M je .L999 cmpq $0, OLD_N je .L999 cmpq $0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovsd %xmm0, ALPHA salq $BASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $6, %rdi divq %rdi // N / 6 movq %rax, Ndiv6 // N / 6 movq %rdx, Nmod6 // N % 6 movq Ndiv6, J cmpq $0, J je .L2_0 ALIGN_4 .L6_01: // copy to sub buffer movq K, %rax salq $1,%rax // K * 2 ; read 2 values movq B, BO1 leaq (B,%rax, SIZE), BO2 // next offset to BO2 leaq BUFFER1, BO // first buffer to BO movq K, %rax sarq $3 , %rax // K / 8 jz .L6_01a_2 ALIGN_4 .L6_01a_1: prefetcht0 512(BO1) prefetcht0 512(BO2) prefetchw 512(BO) vmovsd 0 * SIZE(BO1), %xmm0 vmovsd 2 * SIZE(BO1), %xmm2 vmovsd 4 * SIZE(BO1), %xmm4 vmovsd 6 * SIZE(BO1), %xmm6 vmovss 0 * SIZE(BO2), %xmm1 vmovss 2 * SIZE(BO2), %xmm3 vmovss 4 * SIZE(BO2), %xmm5 vmovss 6 * SIZE(BO2), %xmm7 vmovsd %xmm0, 0*SIZE(BO) vmovss %xmm1, 2*SIZE(BO) vmovsd %xmm2, 3*SIZE(BO) vmovss %xmm3, 5*SIZE(BO) vmovsd %xmm4, 6*SIZE(BO) vmovss %xmm5, 8*SIZE(BO) vmovsd %xmm6, 9*SIZE(BO) vmovss %xmm7,11*SIZE(BO) addq $8*SIZE,BO1 addq $8*SIZE,BO2 addq $12*SIZE,BO vmovsd 0 * SIZE(BO1), %xmm0 vmovsd 2 * SIZE(BO1), %xmm2 vmovsd 4 * SIZE(BO1), %xmm4 vmovsd 6 * SIZE(BO1), %xmm6 vmovss 0 * SIZE(BO2), %xmm1 vmovss 2 * SIZE(BO2), %xmm3 vmovss 4 * SIZE(BO2), %xmm5 vmovss 6 * SIZE(BO2), %xmm7 vmovsd %xmm0, 0*SIZE(BO) vmovss %xmm1, 2*SIZE(BO) vmovsd %xmm2, 3*SIZE(BO) vmovss %xmm3, 5*SIZE(BO) vmovsd %xmm4, 6*SIZE(BO) vmovss %xmm5, 8*SIZE(BO) vmovsd %xmm6, 9*SIZE(BO) vmovss %xmm7,11*SIZE(BO) addq $8*SIZE,BO1 addq $8*SIZE,BO2 addq $12*SIZE,BO decq %rax jnz .L6_01a_1 .L6_01a_2: movq K, %rax andq $7, %rax // K % 8 jz .L6_02c ALIGN_4 .L6_02b: vmovsd 0 * SIZE(BO1), %xmm0 vmovss 0 * SIZE(BO2), %xmm2 vmovsd %xmm0, 0*SIZE(BO) vmovss %xmm2, 2*SIZE(BO) addq $2*SIZE,BO1 addq $2*SIZE,BO2 addq $3*SIZE,BO decq %rax jnz .L6_02b .L6_02c: movq K, %rax salq $1,%rax // K * 2 leaq (B,%rax, SIZE), BO1 // next offset to BO1 leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 leaq BUFFER2, BO // second buffer to BO movq K, %rax sarq $3 , %rax // K / 8 jz .L6_02c_2 ALIGN_4 .L6_02c_1: prefetcht0 512(BO2) prefetchw 512(BO) vmovsd 0 * SIZE(BO2), %xmm0 vmovsd 2 * SIZE(BO2), %xmm2 vmovsd 4 * SIZE(BO2), %xmm4 vmovsd 6 * SIZE(BO2), %xmm6 vmovss 1 * SIZE(BO1), %xmm1 
vmovss 3 * SIZE(BO1), %xmm3 vmovss 5 * SIZE(BO1), %xmm5 vmovss 7 * SIZE(BO1), %xmm7 vmovss %xmm1, 0*SIZE(BO) vmovsd %xmm0, 1*SIZE(BO) vmovss %xmm3, 3*SIZE(BO) vmovsd %xmm2, 4*SIZE(BO) vmovss %xmm5, 6*SIZE(BO) vmovsd %xmm4, 7*SIZE(BO) vmovss %xmm7, 9*SIZE(BO) vmovsd %xmm6,10*SIZE(BO) addq $8*SIZE,BO1 addq $8*SIZE,BO2 addq $12*SIZE,BO vmovsd 0 * SIZE(BO2), %xmm0 vmovsd 2 * SIZE(BO2), %xmm2 vmovsd 4 * SIZE(BO2), %xmm4 vmovsd 6 * SIZE(BO2), %xmm6 vmovss 1 * SIZE(BO1), %xmm1 vmovss 3 * SIZE(BO1), %xmm3 vmovss 5 * SIZE(BO1), %xmm5 vmovss 7 * SIZE(BO1), %xmm7 vmovss %xmm1, 0*SIZE(BO) vmovsd %xmm0, 1*SIZE(BO) vmovss %xmm3, 3*SIZE(BO) vmovsd %xmm2, 4*SIZE(BO) vmovss %xmm5, 6*SIZE(BO) vmovsd %xmm4, 7*SIZE(BO) vmovss %xmm7, 9*SIZE(BO) vmovsd %xmm6,10*SIZE(BO) addq $8*SIZE,BO1 addq $8*SIZE,BO2 addq $12*SIZE,BO decq %rax jnz .L6_02c_1 .L6_02c_2: movq K, %rax andq $7, %rax // K % 8 jz .L6_03c ALIGN_4 .L6_03b: vmovss 1*SIZE(BO1), %xmm0 vmovsd 0*SIZE(BO2), %xmm1 vmovss %xmm0, 0*SIZE(BO) vmovsd %xmm1, 1*SIZE(BO) addq $2*SIZE,BO1 addq $2*SIZE,BO2 addq $3*SIZE,BO decq %rax jnz .L6_03b .L6_03c: movq BO2, B // next offset of B .L6_10: movq C, CO1 leaq (C, LDC, 2), C leaq (C, LDC, 1), C // c += 3 * ldc movq A, AO // aoffset = a addq $32 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L6_20 ALIGN_4 .L6_11: leaq BUFFER1, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax // K = K - ( K % 8 ) je .L6_16 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_12: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x3_1(xxx) KERNEL16x3_2(xxx) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) KERNEL16x3_1(xxx) prefetcht0 B_PR1+16(BO,BI, SIZE) KERNEL16x3_2(xxx) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) je .L6_16 KERNEL16x3_1(xxx) KERNEL16x3_2(xxx) prefetcht0 B_PR1+32(BO,BI, SIZE) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) KERNEL16x3_1(xxx) KERNEL16x3_2(xxx) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) je .L6_16 jmp .L6_12 ALIGN_4 .L6_16: movq K, %rax andq $7, %rax # if (k & 1) je .L6_19 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_17: KERNEL16x3_SUB(xxx) addq $3, BI addq $16, %rax jl .L6_17 ALIGN_4 .L6_19: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm10, 8 * SIZE(CO1) vmovups %xmm13,12 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) vmovups %xmm11, 8 * SIZE(CO1, LDC) vmovups %xmm14,12 * SIZE(CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) vmovups %xmm15,12 * SIZE(CO1, LDC, 2) addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L6_11 ALIGN_4 /************************************************************************** * Rest of M 
***************************************************************************/ .L6_20: // Test rest of M testq $15, M jz .L7_10 // to next 3 lines of N testq $8, M jz .L6_21pre ALIGN_4 /**************************************************************************/ .L6_20_1: leaq BUFFER1, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L6_20_6 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_20_2: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) KERNEL8x3_1(xxx) prefetcht0 B_PR1+16(BO,BI, SIZE) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) je .L6_20_6 KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) prefetcht0 B_PR1+32(BO,BI, SIZE) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) je .L6_20_6 jmp .L6_20_2 ALIGN_4 .L6_20_6: movq K, %rax andq $7, %rax # if (k & 1) je .L6_20_9 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_20_7: KERNEL8x3_SUB(xxx) addq $3, BI addq $8, %rax jl .L6_20_7 ALIGN_4 .L6_20_9: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L6_21pre: testq $4, M jz .L6_30 ALIGN_4 .L6_21: leaq BUFFER1, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L6_26 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_22: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) prefetcht0 B_PR1+16(BO,BI, SIZE) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L6_26 KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) prefetcht0 B_PR1+32(BO,BI, SIZE) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L6_26 jmp .L6_22 ALIGN_4 .L6_26: movq K, %rax andq $7, %rax # if (k & 1) je .L6_29 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_27: KERNEL4x3_SUB(xxx) addq $3, BI addq $4, %rax jl .L6_27 ALIGN_4 .L6_29: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L6_30: testq $2, M jz .L6_40 ALIGN_4 .L6_31: leaq BUFFER1, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L6_36 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; 
number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_32: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) prefetcht0 B_PR1+16(BO,BI,SIZE) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L6_36 KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) prefetcht0 B_PR1+32(BO,BI,SIZE) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L6_36 jmp .L6_32 ALIGN_4 .L6_36: movq K, %rax andq $7, %rax # if (k & 1) je .L6_39 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_37: KERNEL2x3_SUB(xxx) addq $3, BI addq $2, %rax jl .L6_37 ALIGN_4 .L6_39: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 vmovss %xmm4 , (CO1) vmovss %xmm8 , 1 * SIZE(CO1) vmovss %xmm5 , (CO1, LDC) vmovss %xmm10, 1 * SIZE(CO1, LDC) vmovss %xmm6 , (CO1, LDC, 2) vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L6_40: testq $1, M jz .L7_10 // to next 3 lines of N ALIGN_4 .L6_41: leaq BUFFER1, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L6_46 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_42: KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L6_46 KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L6_46 jmp .L6_42 ALIGN_4 .L6_46: movq K, %rax andq $7, %rax # if (k & 1) je .L6_49 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_47: KERNEL1x3_SUB(xxx) addq $3, BI addq $1, %rax jl .L6_47 ALIGN_4 .L6_49: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vmovss %xmm4 , (CO1) vmovss %xmm5 , (CO1, LDC) vmovss %xmm6 , (CO1, LDC, 2) addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 /***************************************************************************************************************/ .L7_10: movq C, CO1 leaq (C, LDC, 2), C leaq (C, LDC, 1), C // c += 3 * ldc movq A, AO // aoffset = a addq $32 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L7_20 ALIGN_4 .L7_11: leaq BUFFER2, BO // second buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax // K = K - ( K % 8 ) je .L7_16 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_12: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x3_1(xxx) KERNEL16x3_2(xxx) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) KERNEL16x3_1(xxx) prefetcht0 B_PR1+16(BO,BI, SIZE) KERNEL16x3_2(xxx) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) je .L7_16 KERNEL16x3_1(xxx) KERNEL16x3_2(xxx) prefetcht0 B_PR1+32(BO,BI, SIZE) KERNEL16x3_3(xxx) 
KERNEL16x3_4(xxx) KERNEL16x3_1(xxx) KERNEL16x3_2(xxx) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) je .L7_16 jmp .L7_12 ALIGN_4 .L7_16: movq K, %rax andq $7, %rax # if (k & 1) je .L7_19 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_17: KERNEL16x3_SUB(xxx) addq $3, BI addq $16, %rax jl .L7_17 ALIGN_4 .L7_19: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm10, 8 * SIZE(CO1) vmovups %xmm13,12 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) vmovups %xmm11, 8 * SIZE(CO1, LDC) vmovups %xmm14,12 * SIZE(CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) vmovups %xmm15,12 * SIZE(CO1, LDC, 2) addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L7_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L7_20: // Test rest of M testq $15, M jz .L7_60 // to next 3 lines of N testq $8, M jz .L7_21pre ALIGN_4 /**************************************************************************/ .L7_20_1: leaq BUFFER2, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_20_6 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_20_2: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) KERNEL8x3_1(xxx) prefetcht0 B_PR1+16(BO,BI, SIZE) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) je .L7_20_6 KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) prefetcht0 B_PR1+32(BO,BI, SIZE) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) je .L7_20_6 jmp .L7_20_2 ALIGN_4 .L7_20_6: movq K, %rax andq $7, %rax # if (k & 1) je .L7_20_9 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_20_7: KERNEL8x3_SUB(xxx) addq $3, BI addq $8, %rax jl .L7_20_7 ALIGN_4 .L7_20_9: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L7_21pre: testq $4, 
M jz .L7_30 ALIGN_4 .L7_21: leaq BUFFER2, BO // second buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_26 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_22: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) prefetcht0 B_PR1+16(BO,BI, SIZE) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L7_26 KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) prefetcht0 B_PR1+32(BO,BI, SIZE) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L7_26 jmp .L7_22 ALIGN_4 .L7_26: movq K, %rax andq $7, %rax # if (k & 1) je .L7_29 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_27: KERNEL4x3_SUB(xxx) addq $3, BI addq $4, %rax jl .L7_27 ALIGN_4 .L7_29: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L7_30: testq $2, M jz .L7_40 ALIGN_4 .L7_31: leaq BUFFER2, BO // second buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_36 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_32: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) prefetcht0 B_PR1+16(BO,BI,SIZE) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L7_36 KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) prefetcht0 B_PR1+32(BO,BI,SIZE) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L7_36 jmp .L7_32 ALIGN_4 .L7_36: movq K, %rax andq $7, %rax # if (k & 1) je .L7_39 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_37: KERNEL2x3_SUB(xxx) addq $3, BI addq $2, %rax jl .L7_37 ALIGN_4 .L7_39: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 vmovss %xmm4 , (CO1) vmovss %xmm8 , 1 * SIZE(CO1) vmovss %xmm5 , (CO1, LDC) vmovss %xmm10, 1 * SIZE(CO1, LDC) vmovss %xmm6 , (CO1, LDC, 2) vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L7_40: testq $1, M jz .L7_60 // to next 3 lines of N ALIGN_4 .L7_41: leaq BUFFER2, BO // second buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_46 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_42: KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L7_46 KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) 
KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L7_46 jmp .L7_42 ALIGN_4 .L7_46: movq K, %rax andq $7, %rax # if (k & 1) je .L7_49 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_47: KERNEL1x3_SUB(xxx) addq $3, BI addq $1, %rax jl .L7_47 ALIGN_4 .L7_49: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vmovss %xmm4 , (CO1) vmovss %xmm5 , (CO1, LDC) vmovss %xmm6 , (CO1, LDC, 2) addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L7_60: decq J // j -- jg .L6_01 .L2_0: cmpq $0, Nmod6 // N % 6 == 0 je .L999 /************************************************************************************************ * Loop for Nmod6 / 2 > 0 *************************************************************************************************/ movq Nmod6, J sarq $1, J // j = j / 2 je .L1_0 ALIGN_4 .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L2_02b: vmovsd (BO1), %xmm0 vmovsd %xmm0, (BO) addq $2*SIZE,BO1 addq $2*SIZE,BO decq %rax jnz .L2_02b .L2_02c: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc movq A, AO // aoffset = a addq $32 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L2_20 ALIGN_4 .L2_11: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax // K = K - ( K % 8 ) je .L2_16 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_12: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) je .L2_16 prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) je .L2_16 jmp .L2_12 ALIGN_4 .L2_16: movq K, %rax andq $7, %rax # if (k & 1) je .L2_19 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_17: KERNEL16x2_SUB(xxx) addq $2, BI addq $16, %rax jl .L2_17 ALIGN_4 .L2_19: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm10, 8 * SIZE(CO1) vmovups %xmm13,12 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) vmovups %xmm11, 8 * SIZE(CO1, LDC) vmovups %xmm14,12 * SIZE(CO1, LDC) addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_20: // Test rest of M testq $15, M jz .L2_60 // to next 3 lines of N testq $8, M jz .L2_21pre ALIGN_4 /**************************************************************************/ .L2_20_1: leaq BUFFER1, 
BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_20_6 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_20_2: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) je .L2_20_6 prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) je .L2_20_6 jmp .L2_20_2 ALIGN_4 .L2_20_6: movq K, %rax andq $7, %rax # if (k & 1) je .L2_20_9 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_20_7: KERNEL8x2_SUB(xxx) addq $2, BI addq $8, %rax jl .L2_20_7 ALIGN_4 .L2_20_9: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L2_21pre: testq $4, M jz .L2_30 ALIGN_4 .L2_21: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_26 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 1 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_22: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_26 prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_26 jmp .L2_22 ALIGN_4 .L2_26: movq K, %rax andq $7, %rax # if (k & 1) je .L2_29 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_27: KERNEL4x2_SUB(xxx) addq $2, BI addq $4, %rax jl .L2_27 ALIGN_4 .L2_29: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L2_30: testq $2, M jz .L2_40 ALIGN_4 .L2_31: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_36 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_32: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_36 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_36 jmp .L2_32 ALIGN_4 .L2_36: movq K, %rax andq $7, %rax # if (k & 1) je .L2_39 movq %rax, BI 
// Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_37: KERNEL2x2_SUB(xxx) addq $2, BI addq $2, %rax jl .L2_37 ALIGN_4 .L2_39: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 vmovss %xmm4 , (CO1) vmovss %xmm8 , 1 * SIZE(CO1) vmovss %xmm5 , (CO1, LDC) vmovss %xmm10, 1 * SIZE(CO1, LDC) addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L2_40: testq $1, M jz .L2_60 // to next 2 lines of N ALIGN_4 .L2_41: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_46 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_42: KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 jmp .L2_42 ALIGN_4 .L2_46: movq K, %rax andq $7, %rax # if (k & 1) je .L2_49 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_47: KERNEL1x2_SUB(xxx) addq $2, BI addq $1, %rax jl .L2_47 ALIGN_4 .L2_49: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 vmovss %xmm4 , (CO1) vmovss %xmm5 , (CO1, LDC) addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L2_60: decq J // j -- jg .L2_01 // next 2 lines of N .L1_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovss (BO1), %xmm0 vmovss %xmm0, (BO) addq $1*SIZE,BO1 addq $1*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc movq A, AO // aoffset = a addq $32 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L1_20 ALIGN_4 .L1_11: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax // K = K - ( K % 8 ) je .L1_16 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_12: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) je .L1_16 KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) je .L1_16 jmp .L1_12 ALIGN_4 .L1_16: movq K, %rax andq $7, %rax # if (k & 1) je .L1_19 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_17: KERNEL16x1_SUB(xxx) addq $1, BI addq $16, %rax jl .L1_17 ALIGN_4 .L1_19: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps 8 * 
SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm10, 8 * SIZE(CO1) vmovups %xmm13,12 * SIZE(CO1) addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L1_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_20: // Test rest of M testq $15, M jz .L999 testq $8, M jz .L1_21pre ALIGN_4 /**************************************************************************/ .L1_20_1: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_20_6 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_2: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) je .L1_20_6 KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) je .L1_20_6 jmp .L1_20_2 ALIGN_4 .L1_20_6: movq K, %rax andq $7, %rax # if (k & 1) je .L1_20_9 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_7: KERNEL8x1_SUB(xxx) addq $1, BI addq $8, %rax jl .L1_20_7 ALIGN_4 .L1_20_9: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L1_21pre: testq $4, M jz .L1_30 ALIGN_4 .L1_21: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_26 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_22: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_26 KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_26 jmp .L1_22 ALIGN_4 .L1_26: movq K, %rax andq $7, %rax # if (k & 1) je .L1_29 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_27: KERNEL4x1_SUB(xxx) addq $1, BI addq $4, %rax jl .L1_27 ALIGN_4 .L1_29: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vmovups %xmm4 , (CO1) addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L1_30: testq $2, M jz .L1_40 ALIGN_4 .L1_31: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_36 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_32: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_36 KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_36 jmp .L1_32 ALIGN_4 .L1_36: movq K, %rax andq $7, %rax # 
if (k & 1) je .L1_39 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_37: KERNEL2x1_SUB(xxx) addq $1, BI addq $2, %rax jl .L1_37 ALIGN_4 .L1_39: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 vmovss %xmm4 , (CO1) vmovss %xmm8 , 1 * SIZE(CO1) addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L1_40: testq $1, M jz .L999 ALIGN_4 .L1_41: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_46 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_42: KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 jmp .L1_42 ALIGN_4 .L1_46: movq K, %rax andq $7, %rax # if (k & 1) je .L1_49 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_47: KERNEL1x1_SUB(xxx) addq $1, BI addq $1, %rax jl .L1_47 ALIGN_4 .L1_49: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vmovss %xmm4 , (CO1) addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L999: movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE #else /************************************************************************************* * TRMM Kernel *************************************************************************************/ PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif vmovaps %xmm3, %xmm0 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL movsd STACKSIZE + 16(%rsp), %xmm12 #endif #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $0, OLD_M je .L999 cmpq $0, OLD_N je .L999 cmpq $0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovsd %xmm0, ALPHA salq $BASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $2, %rdi divq %rdi // N / 6 movq %rax, Ndiv6 // N / 6 movq %rdx, Nmod6 // N % 6 #ifdef TRMMKERNEL vmovsd %xmm12, OFFSET vmovsd %xmm12, KK #ifndef LEFT negq KK #endif #endif movq Ndiv6, J cmpq $0, J je .L1_0 ALIGN_4 .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L2_02b: vmovsd (BO1), %xmm0 vmovsd %xmm0, (BO) addq $2*SIZE,BO1 addq $2*SIZE,BO decq %rax jnz .L2_02b .L2_02c: movq BO1, B // 
next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $32 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L2_20 ALIGN_4 .L2_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $16, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L2_16 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_12: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) je .L2_16 prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) je .L2_16 jmp .L2_12 ALIGN_4 .L2_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_19 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_17: KERNEL16x2_SUB(xxx) addq $2, BI addq $16, %rax jl .L2_17 ALIGN_4 .L2_19: vbroadcastss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 #else vmulps %xmm0, %xmm4,%xmm4 vmulps %xmm0, %xmm7,%xmm7 vmulps %xmm0, %xmm10,%xmm10 vmulps %xmm0, %xmm13,%xmm13 vmulps %xmm0, %xmm5,%xmm5 vmulps %xmm0, %xmm8,%xmm8 vmulps %xmm0, %xmm11,%xmm11 vmulps %xmm0, %xmm14,%xmm14 #endif vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm10, 8 * SIZE(CO1) vmovups %xmm13,12 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) vmovups %xmm11, 8 * SIZE(CO1, LDC) vmovups %xmm14,12 * SIZE(CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $16, KK #endif addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M 
***************************************************************************/ .L2_20: // Test rest of M testq $15, M jz .L2_60 // to next 3 lines of N testq $8, M jz .L2_21pre ALIGN_4 /**************************************************************************/ .L2_20_1: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax // number of values in A #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_20_6 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_20_2: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) je .L2_20_6 prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) je .L2_20_6 jmp .L2_20_2 ALIGN_4 .L2_20_6: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_20_9 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_20_7: KERNEL8x2_SUB(xxx) addq $2, BI addq $8, %rax jl .L2_20_7 ALIGN_4 .L2_20_9: vbroadcastss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 #else vmulps %xmm0, %xmm4,%xmm4 vmulps %xmm0, %xmm7,%xmm7 vmulps %xmm0, %xmm5,%xmm5 vmulps %xmm0, %xmm8,%xmm8 #endif vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L2_21pre: testq $4, M jz .L2_30 ALIGN_4 .L2_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, 
%rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in A #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_26 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 1 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_22: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_26 prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_26 jmp .L2_22 ALIGN_4 .L2_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_29 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_27: KERNEL4x2_SUB(xxx) addq $2, BI addq $4, %rax jl .L2_27 ALIGN_4 .L2_29: vbroadcastss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 #else vmulps %xmm0, %xmm4,%xmm4 vmulps %xmm0, %xmm5,%xmm5 #endif vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L2_30: testq $2, M jz .L2_40 ALIGN_4 .L2_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_36 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_32: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_36 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_36 jmp .L2_32 ALIGN_4 .L2_36: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k 
& 1) je .L2_39 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_37: KERNEL2x2_SUB(xxx) addq $2, BI addq $2, %rax jl .L2_37 ALIGN_4 .L2_39: vmovss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 #else vmulss %xmm0, %xmm4,%xmm4 vmulss %xmm0, %xmm8,%xmm8 vmulss %xmm0, %xmm5,%xmm5 vmulss %xmm0, %xmm10,%xmm10 #endif vmovss %xmm4 , (CO1) vmovss %xmm8 , 1 * SIZE(CO1) vmovss %xmm5 , (CO1, LDC) vmovss %xmm10, 1 * SIZE(CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L2_40: testq $1, M jz .L2_60 // to next 2 lines of N ALIGN_4 .L2_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_46 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_42: KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 jmp .L2_42 ALIGN_4 .L2_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_49 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_47: KERNEL1x2_SUB(xxx) addq $2, BI addq $1, %rax jl .L2_47 ALIGN_4 .L2_49: vmovss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 #else vmulss %xmm0, %xmm4,%xmm4 vmulss %xmm0, %xmm5,%xmm5 #endif vmovss %xmm4 , (CO1) vmovss %xmm5 , (CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L2_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif decq J // j -- jg .L2_01 
// next 2 lines of N .L1_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovss (BO1), %xmm0 vmovss %xmm0, (BO) addq $1*SIZE,BO1 addq $1*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $32 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L1_20 ALIGN_4 .L1_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $16, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L1_16 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_12: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) je .L1_16 KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) je .L1_16 jmp .L1_12 ALIGN_4 .L1_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_19 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_17: KERNEL16x1_SUB(xxx) addq $1, BI addq $16, %rax jl .L1_17 ALIGN_4 .L1_19: vbroadcastss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 #else vmulps %xmm0, %xmm4,%xmm4 vmulps %xmm0, %xmm7,%xmm7 vmulps %xmm0, %xmm10,%xmm10 vmulps %xmm0, %xmm13,%xmm13 #endif vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm10, 8 * SIZE(CO1) vmovups %xmm13,12 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $16, KK #endif addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L1_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_20: // Test rest of M testq $15, M jz .L999 testq $8, M jz .L1_21pre ALIGN_4 
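/**********************************************************************
* Editorial note (added sketch, not part of the original kernel): the
* #ifndef TRMMKERNEL / #elif / #else blocks in each micro-kernel of this
* TRMM path select the effective inner-loop length before the K loop is
* split into its unrolled and remainder parts.  A minimal C rendition of
* that selection is sketched below; the function name trmm_kloop_len is
* invented for illustration, the compile-time LEFT/TRANSA switches are
* shown as runtime flags, and mr/nr stand for the $16/$8/$4/$2/$1 (AO)
* and $2/$1 (BO) increments used in the corresponding branches.
*
*   static long trmm_kloop_len(long K, long KK, long mr, long nr,
*                              int trmm, int left, int transa)
*   {
*       if (!trmm)
*           return K;                    // plain GEMM: run the full K loop
*       if ((left && !transa) || (!left && transa))
*           return K - KK;               // "movq K,%rax; subq KK,%rax"
*       return KK + (left ? mr : nr);    // "movq KK,%rax; addq $MR / $NR"
*   }
*
* Only the K-range that overlaps the triangular operand contributes to a
* given output tile, which is why the trip count depends on LEFT/TRANSA
* and on the running KK offset maintained by the "addq $..., KK" lines.
**********************************************************************/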
/**************************************************************************/ .L1_20_1: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax // number of values in A #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_20_6 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_2: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) je .L1_20_6 KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) je .L1_20_6 jmp .L1_20_2 ALIGN_4 .L1_20_6: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_20_9 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_7: KERNEL8x1_SUB(xxx) addq $1, BI addq $8, %rax jl .L1_20_7 ALIGN_4 .L1_20_9: vbroadcastss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 #else vmulps %xmm0, %xmm4,%xmm4 vmulps %xmm0, %xmm7,%xmm7 #endif vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L1_21pre: testq $4, M jz .L1_30 ALIGN_4 .L1_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in A #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_26 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_22: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) 
KERNEL4x1_4(xxx) je .L1_26 KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_26 jmp .L1_22 ALIGN_4 .L1_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_29 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_27: KERNEL4x1_SUB(xxx) addq $1, BI addq $4, %rax jl .L1_27 ALIGN_4 .L1_29: vbroadcastss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddps (CO1),%xmm0, %xmm4,%xmm4 #else vmulps %xmm0, %xmm4,%xmm4 #endif vmovups %xmm4 , (CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L1_30: testq $2, M jz .L1_40 ALIGN_4 .L1_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_36 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_32: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_36 KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_36 jmp .L1_32 ALIGN_4 .L1_36: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_39 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_37: KERNEL2x1_SUB(xxx) addq $1, BI addq $2, %rax jl .L1_37 ALIGN_4 .L1_39: vmovss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 #else vmulss %xmm0, %xmm4,%xmm4 vmulss %xmm0, %xmm8,%xmm8 #endif vmovss %xmm4 , (CO1) vmovss %xmm8 , 1 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L1_40: testq $1, M jz .L999 ALIGN_4 .L1_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, 
BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_46 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_42: KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 jmp .L1_42 ALIGN_4 .L1_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_49 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_47: KERNEL1x1_SUB(xxx) addq $1, BI addq $1, %rax jl .L1_47 ALIGN_4 .L1_49: vmovss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddss (CO1),%xmm0, %xmm4,%xmm4 #else vmulss %xmm0, %xmm4,%xmm4 #endif vmovss %xmm4 , (CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L999: movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE #endif OpenBLAS-0.2.20/kernel/x86_64/sgemm_kernel_16x4_haswell.S000066400000000000000000004225741313527062700224750ustar00rootroot00000000000000/********************************************************************************* Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ /********************************************************************* * 2014/07/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * 2013/10/28 Saar * Parameter: * SGEMM_DEFAULT_UNROLL_N 4 * SGEMM_DEFAULT_UNROLL_M 16 * SGEMM_DEFAULT_P 768 * SGEMM_DEFAULT_Q 384 * A_PR1 512 * B_PR1 512 * * * 2014/07/28 Saar * Performance at 9216x9216x9216: * 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) * 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) * 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) * 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) * *********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define J %r14 #define OLD_K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define K %r12 #define BI %rbp #define BO2 %rbp #define SP %rbx #define BO1 %rdi #define CO2 %rdx #ifndef WINDOWS_ABI #define STACKSIZE 96 #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #if defined(OS_WINDOWS) #define L_BUFFER_SIZE 8192 #else #define L_BUFFER_SIZE 12288 #endif #define Ndiv6 24(%rsp) #define Nmod6 32(%rsp) #define N 40(%rsp) #define ALPHA 48(%rsp) #define OFFSET 56(%rsp) #define KK 64(%rsp) #define KKK 72(%rsp) #define BUFFER1 128(%rsp) #if defined(OS_WINDOWS) #if L_BUFFER_SIZE > 16384 #define STACK_TOUCH \ movl $0, 4096 * 4(%rsp);\ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 12288 #define STACK_TOUCH \ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 8192 #define STACK_TOUCH \ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 4096 #define STACK_TOUCH \ movl $0, 4096 * 1(%rsp); #else #define STACK_TOUCH #endif #else #define STACK_TOUCH #endif #if defined(BULLDOZER) #define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 #define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0 #else #define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0 #define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0 #endif #define A_PR1 512 #define B_PR1 512 /******************************************************************************************* * 6 lines of N *******************************************************************************************/ .macro KERNEL16x6_SUB vmovups -16 * SIZE(AO), %ymm0 vmovups -8 * SIZE(AO), %ymm1 vbroadcastss -4 * SIZE(BO), %ymm2 vbroadcastss -3 * SIZE(BO), %ymm3 prefetcht0 A_PR1(AO) VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) vbroadcastss -2 * SIZE(BO), %ymm2 vbroadcastss -1 * SIZE(BO), %ymm3 VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) VFMADD231PS_( 
%ymm9,%ymm2,%ymm1 ) VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) vbroadcastss 0 * SIZE(BO), %ymm2 vbroadcastss 1 * SIZE(BO), %ymm3 VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) VFMADD231PS_( %ymm13,%ymm2,%ymm1 ) VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) VFMADD231PS_( %ymm15,%ymm3,%ymm1 ) addq $ 6*SIZE, BO addq $ 16*SIZE, AO decq %rax .endm .macro SAVE16x6 vbroadcastss ALPHA, %ymm0 vmulps %ymm0 , %ymm4 , %ymm4 vmulps %ymm0 , %ymm5 , %ymm5 vmulps %ymm0 , %ymm6 , %ymm6 vmulps %ymm0 , %ymm7 , %ymm7 vmulps %ymm0 , %ymm8 , %ymm8 vmulps %ymm0 , %ymm9 , %ymm9 vmulps %ymm0 , %ymm10, %ymm10 vmulps %ymm0 , %ymm11, %ymm11 vmulps %ymm0 , %ymm12, %ymm12 vmulps %ymm0 , %ymm13, %ymm13 vmulps %ymm0 , %ymm14, %ymm14 vmulps %ymm0 , %ymm15, %ymm15 #if !defined(TRMMKERNEL) vaddps (CO1), %ymm4,%ymm4 vaddps 8 * SIZE(CO1), %ymm5,%ymm5 vaddps (CO1, LDC), %ymm6,%ymm6 vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 vaddps (CO1, LDC,2), %ymm8,%ymm8 vaddps 8 * SIZE(CO1, LDC,2), %ymm9,%ymm9 vaddps (CO2), %ymm10,%ymm10 vaddps 8 * SIZE(CO2), %ymm11,%ymm11 vaddps (CO2, LDC), %ymm12,%ymm12 vaddps 8 * SIZE(CO2, LDC), %ymm13,%ymm13 vaddps (CO2, LDC,2), %ymm14,%ymm14 vaddps 8 * SIZE(CO2, LDC,2), %ymm15,%ymm15 #endif vmovups %ymm4 , (CO1) vmovups %ymm5 , 8 * SIZE(CO1) vmovups %ymm6 , (CO1, LDC) vmovups %ymm7 , 8 * SIZE(CO1, LDC) vmovups %ymm8 , (CO1, LDC,2) vmovups %ymm9 , 8 * SIZE(CO1, LDC,2) vmovups %ymm10, (CO2) vmovups %ymm11, 8 * SIZE(CO2) vmovups %ymm12, (CO2, LDC) vmovups %ymm13, 8 * SIZE(CO2, LDC) vmovups %ymm14, (CO2, LDC,2) vmovups %ymm15, 8 * SIZE(CO2, LDC,2) .endm /*******************************************************************************************/ .macro KERNEL8x6_SUB vmovups -16 * SIZE(AO), %ymm0 vbroadcastss -4 * SIZE(BO), %ymm2 vbroadcastss -3 * SIZE(BO), %ymm3 VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) vbroadcastss -2 * SIZE(BO), %ymm2 vbroadcastss -1 * SIZE(BO), %ymm3 VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) vbroadcastss 0 * SIZE(BO), %ymm2 vbroadcastss 1 * SIZE(BO), %ymm3 VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) addq $ 6*SIZE, BO addq $ 8*SIZE, AO decq %rax .endm .macro SAVE8x6 vbroadcastss ALPHA, %ymm0 vmulps %ymm0 , %ymm4 , %ymm4 vmulps %ymm0 , %ymm6 , %ymm6 vmulps %ymm0 , %ymm8 , %ymm8 vmulps %ymm0 , %ymm10, %ymm10 vmulps %ymm0 , %ymm12, %ymm12 vmulps %ymm0 , %ymm14, %ymm14 #if !defined(TRMMKERNEL) vaddps (CO1), %ymm4,%ymm4 vaddps (CO1, LDC), %ymm6,%ymm6 vaddps (CO1, LDC,2), %ymm8,%ymm8 vaddps (CO2), %ymm10,%ymm10 vaddps (CO2, LDC), %ymm12,%ymm12 vaddps (CO2, LDC,2), %ymm14,%ymm14 #endif vmovups %ymm4 , (CO1) vmovups %ymm6 , (CO1, LDC) vmovups %ymm8 , (CO1, LDC,2) vmovups %ymm10, (CO2) vmovups %ymm12, (CO2, LDC) vmovups %ymm14, (CO2, LDC,2) .endm /*******************************************************************************************/ .macro KERNEL4x6_SUB vmovups -16 * SIZE(AO), %xmm0 vbroadcastss -4 * SIZE(BO), %xmm2 vbroadcastss -3 * SIZE(BO), %xmm3 VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) vbroadcastss -2 * SIZE(BO), %xmm2 vbroadcastss -1 * SIZE(BO), %xmm3 VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) vbroadcastss 0 * SIZE(BO), %xmm2 vbroadcastss 1 * SIZE(BO), %xmm3 VFMADD231PS_( %xmm12,%xmm2,%xmm0 ) VFMADD231PS_( %xmm14,%xmm3,%xmm0 ) addq $ 6*SIZE, BO addq $ 4*SIZE, AO decq %rax .endm .macro SAVE4x6 vbroadcastss ALPHA, %xmm0 vmulps %xmm0 , %xmm4 , %xmm4 vmulps %xmm0 , %xmm6 , %xmm6 vmulps %xmm0 , %xmm8 , %xmm8 vmulps %xmm0 , %xmm10, %xmm10 vmulps %xmm0 , %xmm12, 
%xmm12 vmulps %xmm0 , %xmm14, %xmm14 #if !defined(TRMMKERNEL) vaddps (CO1), %xmm4,%xmm4 vaddps (CO1, LDC), %xmm6,%xmm6 vaddps (CO1, LDC,2), %xmm8,%xmm8 vaddps (CO2), %xmm10,%xmm10 vaddps (CO2, LDC), %xmm12,%xmm12 vaddps (CO2, LDC,2), %xmm14,%xmm14 #endif vmovups %xmm4 , (CO1) vmovups %xmm6 , (CO1, LDC) vmovups %xmm8 , (CO1, LDC,2) vmovups %xmm10, (CO2) vmovups %xmm12, (CO2, LDC) vmovups %xmm14, (CO2, LDC,2) .endm /*******************************************************************************************/ .macro KERNEL2x6_SUB vmovss -16 * SIZE(AO), %xmm0 vmovss -15 * SIZE(AO), %xmm1 vmovss -4 * SIZE(BO), %xmm2 vmovss -3 * SIZE(BO), %xmm3 VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) vmovss -2 * SIZE(BO), %xmm2 vmovss -1 * SIZE(BO), %xmm3 VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) vmovss 0 * SIZE(BO), %xmm2 vmovss 1 * SIZE(BO), %xmm3 VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) VFMADD231SS_( %xmm13,%xmm2,%xmm1 ) VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) VFMADD231SS_( %xmm15,%xmm3,%xmm1 ) addq $ 6*SIZE, BO addq $ 2*SIZE, AO decq %rax .endm .macro SAVE2x6 vmovss ALPHA, %xmm0 vmulss %xmm0 , %xmm4 , %xmm4 vmulss %xmm0 , %xmm5 , %xmm5 vmulss %xmm0 , %xmm6 , %xmm6 vmulss %xmm0 , %xmm7 , %xmm7 vmulss %xmm0 , %xmm8 , %xmm8 vmulss %xmm0 , %xmm9 , %xmm9 vmulss %xmm0 , %xmm10, %xmm10 vmulss %xmm0 , %xmm11, %xmm11 vmulss %xmm0 , %xmm12, %xmm12 vmulss %xmm0 , %xmm13, %xmm13 vmulss %xmm0 , %xmm14, %xmm14 vmulss %xmm0 , %xmm15, %xmm15 #if !defined(TRMMKERNEL) vaddss (CO1), %xmm4,%xmm4 vaddss 1 * SIZE(CO1), %xmm5,%xmm5 vaddss (CO1, LDC), %xmm6,%xmm6 vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 vaddss (CO1, LDC,2), %xmm8,%xmm8 vaddss 1 * SIZE(CO1, LDC,2), %xmm9,%xmm9 vaddss (CO2), %xmm10,%xmm10 vaddss 1 * SIZE(CO2), %xmm11,%xmm11 vaddss (CO2, LDC), %xmm12,%xmm12 vaddss 1 * SIZE(CO2, LDC), %xmm13,%xmm13 vaddss (CO2, LDC,2), %xmm14,%xmm14 vaddss 1 * SIZE(CO2, LDC,2), %xmm15,%xmm15 #endif vmovss %xmm4 , (CO1) vmovss %xmm5 , 1 * SIZE(CO1) vmovss %xmm6 , (CO1, LDC) vmovss %xmm7 , 1 * SIZE(CO1, LDC) vmovss %xmm8 , (CO1, LDC,2) vmovss %xmm9 , 1 * SIZE(CO1, LDC,2) vmovss %xmm10, (CO2) vmovss %xmm11, 1 * SIZE(CO2) vmovss %xmm12, (CO2, LDC) vmovss %xmm13, 1 * SIZE(CO2, LDC) vmovss %xmm14, (CO2, LDC,2) vmovss %xmm15, 1 * SIZE(CO2, LDC,2) .endm /*******************************************************************************************/ .macro KERNEL1x6_SUB vmovss -16 * SIZE(AO), %xmm0 vmovss -4 * SIZE(BO), %xmm2 vmovss -3 * SIZE(BO), %xmm3 VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) vmovss -2 * SIZE(BO), %xmm2 vmovss -1 * SIZE(BO), %xmm3 VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) vmovss 0 * SIZE(BO), %xmm2 vmovss 1 * SIZE(BO), %xmm3 VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) addq $ 6*SIZE, BO addq $ 1*SIZE, AO decq %rax .endm .macro SAVE1x6 vmovss ALPHA, %xmm0 vmulss %xmm0 , %xmm4 , %xmm4 vmulss %xmm0 , %xmm6 , %xmm6 vmulss %xmm0 , %xmm8 , %xmm8 vmulss %xmm0 , %xmm10, %xmm10 vmulss %xmm0 , %xmm12, %xmm12 vmulss %xmm0 , %xmm14, %xmm14 #if !defined(TRMMKERNEL) vaddss (CO1), %xmm4,%xmm4 vaddss (CO1, LDC), %xmm6,%xmm6 vaddss (CO1, LDC,2), %xmm8,%xmm8 vaddss (CO2), %xmm10,%xmm10 vaddss (CO2, LDC), %xmm12,%xmm12 vaddss (CO2, LDC,2), %xmm14,%xmm14 #endif vmovss %xmm4 , (CO1) vmovss %xmm6 , (CO1, LDC) vmovss %xmm8 , (CO1, LDC,2) vmovss %xmm10, (CO2) vmovss %xmm12, (CO2, LDC) vmovss %xmm14, 
(CO2, LDC,2) .endm /*******************************************************************************************/ /******************************************************************************************* * 4 lines of N *******************************************************************************************/ .macro KERNEL16x4_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) addq $ 4 , BI addq $ 16, %rax .endm .macro SAVE16x4 vbroadcastss ALPHA, %ymm0 vmulps %ymm0 , %ymm4 , %ymm4 vmulps %ymm0 , %ymm5 , %ymm5 vmulps %ymm0 , %ymm6 , %ymm6 vmulps %ymm0 , %ymm7 , %ymm7 vmulps %ymm0 , %ymm8 , %ymm8 vmulps %ymm0 , %ymm9 , %ymm9 vmulps %ymm0 , %ymm10, %ymm10 vmulps %ymm0 , %ymm11, %ymm11 #if !defined(TRMMKERNEL) vaddps (CO1), %ymm4,%ymm4 vaddps 8 * SIZE(CO1), %ymm5,%ymm5 vaddps (CO1, LDC), %ymm6,%ymm6 vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 vaddps (CO2), %ymm8,%ymm8 vaddps 8 * SIZE(CO2), %ymm9,%ymm9 vaddps (CO2, LDC), %ymm10,%ymm10 vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 #endif vmovups %ymm4 , (CO1) vmovups %ymm5 , 8 * SIZE(CO1) vmovups %ymm6 , (CO1, LDC) vmovups %ymm7 , 8 * SIZE(CO1, LDC) vmovups %ymm8 , (CO2) vmovups %ymm9 , 8 * SIZE(CO2) vmovups %ymm10, (CO2, LDC) vmovups %ymm11, 8 * SIZE(CO2, LDC) prefetcht0 64(CO1) prefetcht0 64(CO1, LDC) prefetcht0 64(CO2) prefetcht0 64(CO2, LDC) .endm /*******************************************************************************************/ .macro KERNEL8x4_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) addq $ 4 , BI addq $ 8 , %rax .endm .macro SAVE8x4 vbroadcastss ALPHA, %ymm0 vmulps %ymm0 , %ymm4 , %ymm4 vmulps %ymm0 , %ymm6 , %ymm6 vmulps %ymm0 , %ymm8 , %ymm8 vmulps %ymm0 , %ymm10, %ymm10 #if !defined(TRMMKERNEL) vaddps (CO1), %ymm4,%ymm4 vaddps (CO1, LDC), %ymm6,%ymm6 vaddps (CO2), %ymm8,%ymm8 vaddps (CO2, LDC), %ymm10,%ymm10 #endif vmovups %ymm4 , (CO1) vmovups %ymm6 , (CO1, LDC) vmovups %ymm8 , (CO2) vmovups %ymm10, (CO2, LDC) .endm /*******************************************************************************************/ .macro KERNEL4x4_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) addq $ 4 , BI addq $ 4 , %rax .endm .macro SAVE4x4 vbroadcastss ALPHA, %xmm0 vmulps %xmm0 , %xmm4 , %xmm4 vmulps %xmm0 , %xmm6 , %xmm6 vmulps %xmm0 , %xmm8 , %xmm8 vmulps %xmm0 , %xmm10, %xmm10 #if !defined(TRMMKERNEL) vaddps (CO1), %xmm4,%xmm4 vaddps (CO1, LDC), %xmm6,%xmm6 vaddps (CO2), %xmm8,%xmm8 vaddps (CO2, LDC), %xmm10,%xmm10 #endif vmovups %xmm4 , (CO1) vmovups 
%xmm6 , (CO1, LDC) vmovups %xmm8 , (CO2) vmovups %xmm10, (CO2, LDC) .endm /*******************************************************************************************/ .macro KERNEL2x4_SUB vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) addq $ 4 , BI addq $ 2, %rax .endm .macro SAVE2x4 vmovss ALPHA, %xmm0 vmulss %xmm0 , %xmm4 , %xmm4 vmulss %xmm0 , %xmm5 , %xmm5 vmulss %xmm0 , %xmm6 , %xmm6 vmulss %xmm0 , %xmm7 , %xmm7 vmulss %xmm0 , %xmm8 , %xmm8 vmulss %xmm0 , %xmm9 , %xmm9 vmulss %xmm0 , %xmm10, %xmm10 vmulss %xmm0 , %xmm11, %xmm11 #if !defined(TRMMKERNEL) vaddss (CO1), %xmm4,%xmm4 vaddss 1 * SIZE(CO1), %xmm5,%xmm5 vaddss (CO1, LDC), %xmm6,%xmm6 vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 vaddss (CO2), %xmm8,%xmm8 vaddss 1 * SIZE(CO2), %xmm9,%xmm9 vaddss (CO2, LDC), %xmm10,%xmm10 vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 #endif vmovss %xmm4 , (CO1) vmovss %xmm5 , 1 * SIZE(CO1) vmovss %xmm6 , (CO1, LDC) vmovss %xmm7 , 1 * SIZE(CO1, LDC) vmovss %xmm8 , (CO2) vmovss %xmm9 , 1 * SIZE(CO2) vmovss %xmm10, (CO2, LDC) vmovss %xmm11, 1 * SIZE(CO2, LDC) .endm /*******************************************************************************************/ .macro KERNEL1x4_SUB vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) addq $ 4 , BI addq $ 1, %rax .endm .macro SAVE1x4 vmovss ALPHA, %xmm0 vmulss %xmm0 , %xmm4 , %xmm4 vmulss %xmm0 , %xmm6 , %xmm6 vmulss %xmm0 , %xmm8 , %xmm8 vmulss %xmm0 , %xmm10, %xmm10 #if !defined(TRMMKERNEL) vaddss (CO1), %xmm4,%xmm4 vaddss (CO1, LDC), %xmm6,%xmm6 vaddss (CO2), %xmm8,%xmm8 vaddss (CO2, LDC), %xmm10,%xmm10 #endif vmovss %xmm4 , (CO1) vmovss %xmm6 , (CO1, LDC) vmovss %xmm8 , (CO2) vmovss %xmm10, (CO2, LDC) .endm /*******************************************************************************************/ /******************************************************************************************* * 2 lines of N *******************************************************************************************/ .macro KERNEL16x2_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) addq $ 2 , BI addq $ 16, %rax .endm .macro SAVE16x2 vbroadcastss ALPHA, %ymm0 vmulps %ymm0 , %ymm4 , %ymm4 vmulps %ymm0 , %ymm5 , %ymm5 vmulps %ymm0 , %ymm6 , %ymm6 vmulps %ymm0 , %ymm7 , %ymm7 #if !defined(TRMMKERNEL) vaddps (CO1), %ymm4,%ymm4 vaddps 8 * SIZE(CO1), %ymm5,%ymm5 vaddps (CO1, LDC), %ymm6,%ymm6 vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 #endif vmovups %ymm4 , (CO1) vmovups %ymm5 , 8 * SIZE(CO1) vmovups %ymm6 , (CO1, LDC) vmovups %ymm7 , 8 * SIZE(CO1, LDC) .endm 
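/**********************************************************************
* Editorial note (added sketch, not part of the original kernel): every
* SAVEmxn macro in this file applies the same tile update, shown below
* as plain C for reference.  The helper name save_block and the
* column-major layout assumed for acc are illustrative only; trmm
* mirrors the TRMMKERNEL conditional in the macros.
*
*   static void save_block(long m, long n, float alpha, const float *acc,
*                          float *C, long ldc, int trmm)
*   {
*       for (long j = 0; j < n; j++)
*           for (long i = 0; i < m; i++) {
*               float v = alpha * acc[i + j * m];
*               // GEMM build: C += alpha*acc;  TRMM build: C = alpha*acc
*               C[i + j * ldc] = trmm ? v : v + C[i + j * ldc];
*           }
*   }
*
* This matches the macros above: the #if !defined(TRMMKERNEL) vaddps /
* vaddss blocks fold the existing C tile in only for the GEMM build,
* while the TRMM build scales the accumulators by alpha and overwrites
* C without reading it first.
**********************************************************************/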
/*******************************************************************************************/ .macro KERNEL8x2_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) addq $ 2 , BI addq $ 8 , %rax .endm .macro SAVE8x2 vbroadcastss ALPHA, %ymm0 vmulps %ymm0 , %ymm4 , %ymm4 vmulps %ymm0 , %ymm6 , %ymm6 #if !defined(TRMMKERNEL) vaddps (CO1), %ymm4,%ymm4 vaddps (CO1, LDC), %ymm6,%ymm6 #endif vmovups %ymm4 , (CO1) vmovups %ymm6 , (CO1, LDC) .endm /*******************************************************************************************/ .macro KERNEL4x2_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) addq $ 2 , BI addq $ 4 , %rax .endm .macro SAVE4x2 vbroadcastss ALPHA, %xmm0 vmulps %xmm0 , %xmm4 , %xmm4 vmulps %xmm0 , %xmm6 , %xmm6 #if !defined(TRMMKERNEL) vaddps (CO1), %xmm4,%xmm4 vaddps (CO1, LDC), %xmm6,%xmm6 #endif vmovups %xmm4 , (CO1) vmovups %xmm6 , (CO1, LDC) .endm /*******************************************************************************************/ .macro KERNEL2x2_SUB vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) addq $ 2 , BI addq $ 2, %rax .endm .macro SAVE2x2 vmovss ALPHA, %xmm0 vmulss %xmm0 , %xmm4 , %xmm4 vmulss %xmm0 , %xmm5 , %xmm5 vmulss %xmm0 , %xmm6 , %xmm6 vmulss %xmm0 , %xmm7 , %xmm7 #if !defined(TRMMKERNEL) vaddss (CO1), %xmm4,%xmm4 vaddss 1 * SIZE(CO1), %xmm5,%xmm5 vaddss (CO1, LDC), %xmm6,%xmm6 vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 #endif vmovss %xmm4 , (CO1) vmovss %xmm5 , 1 * SIZE(CO1) vmovss %xmm6 , (CO1, LDC) vmovss %xmm7 , 1 * SIZE(CO1, LDC) .endm /*******************************************************************************************/ .macro KERNEL1x2_SUB vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) addq $ 2 , BI addq $ 1, %rax .endm .macro SAVE1x2 vmovss ALPHA, %xmm0 vmulss %xmm0 , %xmm4 , %xmm4 vmulss %xmm0 , %xmm6 , %xmm6 #if !defined(TRMMKERNEL) vaddss (CO1), %xmm4,%xmm4 vaddss (CO1, LDC), %xmm6,%xmm6 #endif vmovss %xmm4 , (CO1) vmovss %xmm6 , (CO1, LDC) .endm /*******************************************************************************************/ /******************************************************************************************* * 1 line of N *******************************************************************************************/ .macro KERNEL16x1_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) addq $ 1 , BI addq $ 16, %rax .endm .macro SAVE16x1 vbroadcastss ALPHA, %ymm0 vmulps %ymm0 , %ymm4 , %ymm4 vmulps %ymm0 , %ymm5 , %ymm5 #if !defined(TRMMKERNEL) vaddps (CO1), %ymm4,%ymm4 vaddps 8 * SIZE(CO1), %ymm5,%ymm5 #endif vmovups %ymm4 , (CO1) vmovups %ymm5 , 8 * SIZE(CO1) .endm /*******************************************************************************************/ .macro KERNEL8x1_SUB vmovups -16 * 
SIZE(AO, %rax, SIZE), %ymm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) addq $ 1 , BI addq $ 8 , %rax .endm .macro SAVE8x1 vbroadcastss ALPHA, %ymm0 vmulps %ymm0 , %ymm4 , %ymm4 #if !defined(TRMMKERNEL) vaddps (CO1), %ymm4,%ymm4 #endif vmovups %ymm4 , (CO1) .endm /*******************************************************************************************/ .macro KERNEL4x1_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) addq $ 1 , BI addq $ 4 , %rax .endm .macro SAVE4x1 vbroadcastss ALPHA, %xmm0 vmulps %xmm0 , %xmm4 , %xmm4 #if !defined(TRMMKERNEL) vaddps (CO1), %xmm4,%xmm4 #endif vmovups %xmm4 , (CO1) .endm /*******************************************************************************************/ .macro KERNEL2x1_SUB vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) addq $ 1 , BI addq $ 2 , %rax .endm .macro SAVE2x1 vmovss ALPHA, %xmm0 vmulss %xmm0 , %xmm4 , %xmm4 vmulss %xmm0 , %xmm5 , %xmm5 #if !defined(TRMMKERNEL) vaddss (CO1), %xmm4,%xmm4 vaddss 1 * SIZE(CO1), %xmm5,%xmm5 #endif vmovss %xmm4 , (CO1) vmovss %xmm5 , 1 * SIZE(CO1) .endm /*******************************************************************************************/ .macro KERNEL1x1_SUB vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) addq $ 1 , BI addq $ 1 , %rax .endm .macro SAVE1x1 vmovss ALPHA, %xmm0 vmulss %xmm0 , %xmm4 , %xmm4 #if !defined(TRMMKERNEL) vaddss (CO1), %xmm4,%xmm4 #endif vmovss %xmm4 , (CO1) .endm /*******************************************************************************************/ #if !defined(TRMMKERNEL) /************************************************************************************* * GEMM Kernel *************************************************************************************/ PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL vmovsd OLD_OFFSET, %xmm12 #endif vmovaps %xmm3, %xmm0 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL movsd STACKSIZE + 16(%rsp), %xmm12 #endif #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $0, OLD_M je .L999 cmpq $0, OLD_N je .L999 cmpq $0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovss %xmm0, ALPHA salq $BASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $12, %rdi divq %rdi // N / 12 movq %rax, Ndiv6 // N / 12 movq %rdx, Nmod6 // N % 12 movq Ndiv6, J cmpq $0, J je .L4_00 ALIGN_4 /*******************************************************************************************/ .L6_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax salq $2, %rax // 4 values of B leaq (B, %rax,4), BO2 movq BO2, B // next offset of B movq K, %rax ALIGN_4 .L6_02c: vmovups (BO1), %xmm0 vmovsd (BO2), %xmm1 
vmovups %xmm0, (BO) vmovsd %xmm1, 4*SIZE(BO) addq $ 4*SIZE,BO1 addq $ 4*SIZE,BO2 addq $ 6*SIZE,BO decq %rax jnz .L6_02c .L6_10: movq C, CO1 leaq (C, LDC, 2), CO2 leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc leaq (C, LDC, 4), C leaq (C, LDC, 2), C // c = c + 6 * ldc movq A, AO // aoffset = a addq $ 16 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L6_20 ALIGN_4 .L6_11: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax // K = K - ( K % 8 ) je .L6_16 ALIGN_4 .L6_12: KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB je .L6_16 KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB je .L6_16 jmp .L6_12 ALIGN_4 .L6_16: movq K, %rax andq $7, %rax # if (k & 1) je .L6_19 ALIGN_4 .L6_17: KERNEL16x6_SUB jnz .L6_17 ALIGN_4 .L6_19: SAVE16x6 addq $16 * SIZE, CO1 # coffset += 16 addq $16 * SIZE, CO2 # coffset += 16 decq I # i -- jg .L6_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L6_20: // Test rest of M testq $15, M jz .L6_60 // to next 6 lines of N testq $8, M jz .L6_21pre ALIGN_4 /**************************************************************************/ .L6_20_1: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L6_20_6 ALIGN_4 .L6_20_2: prefetcht0 A_PR1(AO) KERNEL8x6_SUB KERNEL8x6_SUB prefetcht0 A_PR1(AO) KERNEL8x6_SUB KERNEL8x6_SUB prefetcht0 A_PR1(AO) KERNEL8x6_SUB KERNEL8x6_SUB prefetcht0 A_PR1(AO) KERNEL8x6_SUB KERNEL8x6_SUB je .L6_20_6 prefetcht0 A_PR1(AO) KERNEL8x6_SUB KERNEL8x6_SUB prefetcht0 A_PR1(AO) KERNEL8x6_SUB KERNEL8x6_SUB prefetcht0 A_PR1(AO) KERNEL8x6_SUB KERNEL8x6_SUB prefetcht0 A_PR1(AO) KERNEL8x6_SUB KERNEL8x6_SUB je .L6_20_6 jmp .L6_20_2 ALIGN_4 .L6_20_6: movq K, %rax andq $7, %rax # if (k & 1) je .L6_20_9 ALIGN_4 .L6_20_7: KERNEL8x6_SUB jnz .L6_20_7 ALIGN_4 .L6_20_9: SAVE8x6 addq $8 * SIZE, CO1 # coffset += 8 addq $8 * SIZE, CO2 # coffset += 8 ALIGN_4 /**************************************************************************/ .L6_21pre: testq $4, M jz .L6_30 ALIGN_4 .L6_21: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L6_26 ALIGN_4 .L6_22: prefetcht0 A_PR1(AO) KERNEL4x6_SUB KERNEL4x6_SUB KERNEL4x6_SUB KERNEL4x6_SUB prefetcht0 A_PR1(AO) KERNEL4x6_SUB KERNEL4x6_SUB KERNEL4x6_SUB KERNEL4x6_SUB je .L6_26 prefetcht0 A_PR1(AO) KERNEL4x6_SUB KERNEL4x6_SUB KERNEL4x6_SUB KERNEL4x6_SUB prefetcht0 A_PR1(AO) KERNEL4x6_SUB KERNEL4x6_SUB KERNEL4x6_SUB KERNEL4x6_SUB je .L6_26 jmp .L6_22 ALIGN_4 .L6_26: movq K, %rax andq $7, %rax # if (k & 1) je .L6_29 ALIGN_4 .L6_27: KERNEL4x6_SUB jnz .L6_27 ALIGN_4 .L6_29: SAVE4x6 addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L6_30: testq $2, M jz .L6_40 ALIGN_4 .L6_31: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L6_36 ALIGN_4 .L6_32: prefetcht0 A_PR1(AO) KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB je .L6_36 prefetcht0 A_PR1(AO) KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB je .L6_36 jmp .L6_32 ALIGN_4 .L6_36: movq K, %rax andq $7, %rax # if (k & 1) je .L6_39 ALIGN_4 .L6_37: KERNEL2x6_SUB jnz .L6_37 ALIGN_4 .L6_39: SAVE2x6 addq 
$2 * SIZE, CO1 # coffset += 2 addq $2 * SIZE, CO2 # coffset += 2 ALIGN_4 .L6_40: testq $1, M jz .L6_60 // to next 4 lines of N ALIGN_4 .L6_41: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L6_46 ALIGN_4 .L6_42: prefetcht0 A_PR1(AO) KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB je .L6_46 KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB je .L6_46 jmp .L6_42 ALIGN_4 .L6_46: movq K, %rax andq $7, %rax # if (k & 1) je .L6_49 ALIGN_4 .L6_47: KERNEL1x6_SUB jnz .L6_47 ALIGN_4 .L6_49: SAVE1x6 addq $1 * SIZE, CO1 # coffset += 1 addq $1 * SIZE, CO2 # coffset += 1 ALIGN_4 .L6_60: /*******************************************************************************************/ .L7_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax salq $2, %rax // 4 values of B leaq (B, %rax,4), BO2 movq K, %rax ALIGN_4 .L7_02c: vmovsd 2*SIZE(BO1), %xmm0 vmovups (BO2), %xmm1 vmovsd %xmm0, (BO) vmovups %xmm1, 2*SIZE(BO) addq $ 4*SIZE,BO1 addq $ 4*SIZE,BO2 addq $ 6*SIZE,BO decq %rax jnz .L7_02c movq BO2, B // next offset of B .L7_10: movq C, CO1 leaq (C, LDC, 2), CO2 leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc leaq (C, LDC, 4), C leaq (C, LDC, 2), C // c = c + 6 * ldc movq A, AO // aoffset = a addq $ 16 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L7_20 ALIGN_4 .L7_11: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax // K = K - ( K % 8 ) je .L7_16 ALIGN_4 .L7_12: KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB je .L7_16 KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB KERNEL16x6_SUB je .L7_16 jmp .L7_12 ALIGN_4 .L7_16: movq K, %rax andq $7, %rax # if (k & 1) je .L7_19 ALIGN_4 .L7_17: KERNEL16x6_SUB jnz .L7_17 ALIGN_4 .L7_19: SAVE16x6 addq $16 * SIZE, CO1 # coffset += 16 addq $16 * SIZE, CO2 # coffset += 16 decq I # i -- jg .L7_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L7_20: // Test rest of M testq $15, M jz .L7_60 // to next 6 lines of N testq $8, M jz .L7_21pre ALIGN_4 /**************************************************************************/ .L7_20_1: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_20_6 ALIGN_4 .L7_20_2: prefetcht0 A_PR1(AO) KERNEL8x6_SUB KERNEL8x6_SUB prefetcht0 A_PR1(AO) KERNEL8x6_SUB KERNEL8x6_SUB prefetcht0 A_PR1(AO) KERNEL8x6_SUB KERNEL8x6_SUB prefetcht0 A_PR1(AO) KERNEL8x6_SUB KERNEL8x6_SUB je .L7_20_6 prefetcht0 A_PR1(AO) KERNEL8x6_SUB KERNEL8x6_SUB prefetcht0 A_PR1(AO) KERNEL8x6_SUB KERNEL8x6_SUB prefetcht0 A_PR1(AO) KERNEL8x6_SUB KERNEL8x6_SUB prefetcht0 A_PR1(AO) KERNEL8x6_SUB KERNEL8x6_SUB je .L7_20_6 jmp .L7_20_2 ALIGN_4 .L7_20_6: movq K, %rax andq $7, %rax # if (k & 1) je .L7_20_9 ALIGN_4 .L7_20_7: KERNEL8x6_SUB jnz .L7_20_7 ALIGN_4 .L7_20_9: SAVE8x6 addq $8 * SIZE, CO1 # coffset += 8 addq $8 * SIZE, CO2 # coffset += 8 ALIGN_4 /**************************************************************************/ .L7_21pre: testq $4, M jz .L7_30 ALIGN_4 .L7_21: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_26 ALIGN_4 .L7_22: prefetcht0 A_PR1(AO) KERNEL4x6_SUB 
KERNEL4x6_SUB KERNEL4x6_SUB KERNEL4x6_SUB prefetcht0 A_PR1(AO) KERNEL4x6_SUB KERNEL4x6_SUB KERNEL4x6_SUB KERNEL4x6_SUB je .L7_26 prefetcht0 A_PR1(AO) KERNEL4x6_SUB KERNEL4x6_SUB KERNEL4x6_SUB KERNEL4x6_SUB prefetcht0 A_PR1(AO) KERNEL4x6_SUB KERNEL4x6_SUB KERNEL4x6_SUB KERNEL4x6_SUB je .L7_26 jmp .L7_22 ALIGN_4 .L7_26: movq K, %rax andq $7, %rax # if (k & 1) je .L7_29 ALIGN_4 .L7_27: KERNEL4x6_SUB jnz .L7_27 ALIGN_4 .L7_29: SAVE4x6 addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L7_30: testq $2, M jz .L7_40 ALIGN_4 .L7_31: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_36 ALIGN_4 .L7_32: prefetcht0 A_PR1(AO) KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB je .L7_36 prefetcht0 A_PR1(AO) KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB KERNEL2x6_SUB je .L7_36 jmp .L7_32 ALIGN_4 .L7_36: movq K, %rax andq $7, %rax # if (k & 1) je .L7_39 ALIGN_4 .L7_37: KERNEL2x6_SUB jnz .L7_37 ALIGN_4 .L7_39: SAVE2x6 addq $2 * SIZE, CO1 # coffset += 2 addq $2 * SIZE, CO2 # coffset += 2 ALIGN_4 .L7_40: testq $1, M jz .L7_60 // to next 4 lines of N ALIGN_4 .L7_41: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_46 ALIGN_4 .L7_42: prefetcht0 A_PR1(AO) KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB je .L7_46 KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB KERNEL1x6_SUB je .L7_46 jmp .L7_42 ALIGN_4 .L7_46: movq K, %rax andq $7, %rax # if (k & 1) je .L7_49 ALIGN_4 .L7_47: KERNEL1x6_SUB jnz .L7_47 ALIGN_4 .L7_49: SAVE1x6 addq $1 * SIZE, CO1 # coffset += 1 addq $1 * SIZE, CO2 # coffset += 1 ALIGN_4 .L7_60: decq J // j -- jg .L6_01 // next 12 lines of N /*******************************************************************************************/ .L4_00: movq Nmod6, J sarq $2, J // j = j / 4 cmpq $ 0, J je .L2_00 ALIGN_4 .L4_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax sarq $2, %rax // K / 4 jz .L4_01b ALIGN_4 .L4_01a: prefetcht0 512(BO1) prefetchw 512(BO) vmovups (BO1), %xmm0 vmovups 4*SIZE(BO1), %xmm1 vmovups 8*SIZE(BO1), %xmm2 vmovups 12*SIZE(BO1), %xmm3 vmovups %xmm0, (BO) vmovups %xmm1, 4*SIZE(BO) vmovups %xmm2, 8*SIZE(BO) vmovups %xmm3,12*SIZE(BO) addq $ 16*SIZE,BO1 addq $ 16*SIZE,BO decq %rax jnz .L4_01a .L4_01b: movq K, %rax andq $3, %rax // K % 4 jz .L4_02d ALIGN_4 .L4_02c: vmovups (BO1), %xmm0 vmovups %xmm0, (BO) addq $ 4*SIZE,BO1 addq $ 4*SIZE,BO decq %rax jnz .L4_02c .L4_02d: movq BO1, B // next offset of B .L4_10: movq C, CO1 leaq (C, LDC, 2), CO2 leaq (C, LDC, 4), C // c += 4 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $ 16 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L4_20 ALIGN_4 .L4_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $16, %rax // number of values in AO #else addq $4, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L4_16 movq %rax, BI // Index for BO leaq (,BI,4) , BI // BI = BI * 4 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_12: prefetcht0 A_PR1(AO, %rax, SIZE) prefetcht0 B_PR1(BO, BI , SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) prefetcht0 B_PR1(BO, BI , SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB je .L4_16 prefetcht0 A_PR1(AO, %rax, SIZE) prefetcht0 B_PR1(BO, BI , SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) prefetcht0 B_PR1(BO, BI , SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB je .L4_16 jmp .L4_12 ALIGN_4 .L4_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L4_19 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_17: KERNEL16x4_SUB jl .L4_17 ALIGN_4 .L4_19: SAVE16x4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $16, KK #endif addq $16 * SIZE, CO1 # coffset += 16 addq $16 * SIZE, CO2 # coffset += 16 decq I # i -- jg .L4_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L4_20: // Test rest of M testq $15, M jz .L4_60 // to next 3 lines of N testq $8, M jz .L4_21pre ALIGN_4 /**************************************************************************/ .L4_20_1: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax // number of values in A #else addq $4, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L4_20_6 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI 
= BI * 4 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_20_2: KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB je .L4_20_6 KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB je .L4_20_6 jmp .L4_20_2 ALIGN_4 .L4_20_6: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L4_20_9 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_20_7: KERNEL8x4_SUB jl .L4_20_7 ALIGN_4 .L4_20_9: SAVE8x4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 8 addq $8 * SIZE, CO2 # coffset += 8 ALIGN_4 /**************************************************************************/ .L4_21pre: testq $4, M jz .L4_30 ALIGN_4 .L4_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in A #else addq $4, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L4_26 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_22: KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB je .L4_26 KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB je .L4_26 jmp .L4_22 ALIGN_4 .L4_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L4_29 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_27: KERNEL4x4_SUB jl .L4_27 ALIGN_4 .L4_29: SAVE4x4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 
ALIGN_4 .L4_30: testq $2, M jz .L4_40 ALIGN_4 .L4_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $4, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L4_36 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_32: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB je .L4_36 KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB je .L4_36 jmp .L4_32 ALIGN_4 .L4_36: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L4_39 movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_37: KERNEL2x4_SUB jl .L4_37 ALIGN_4 .L4_39: SAVE2x4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 2 addq $2 * SIZE, CO2 # coffset += 2 ALIGN_4 .L4_40: testq $1, M jz .L4_60 // to next 4 lines of N ALIGN_4 .L4_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $4, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L4_46 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_42: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB je .L4_46 KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB je .L4_46 jmp .L4_42 ALIGN_4 .L4_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je 
.L4_49 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_47: KERNEL1x4_SUB jl .L4_47 ALIGN_4 .L4_49: SAVE1x4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO1 # coffset += 1 addq $1 * SIZE, CO2 # coffset += 1 ALIGN_4 .L4_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK #endif decq J // j -- jg .L4_01 // next 4 lines of N /*******************************************************************************************/ .L2_00: movq Nmod6, J andq $3, J // j % 4 je .L999 movq Nmod6, J andq $2, J // j % 4 je .L1_0 .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax sarq $2, %rax // K / 4 jz .L2_01b ALIGN_4 .L2_01a: vmovsd (BO1), %xmm0 vmovsd 2*SIZE(BO1), %xmm1 vmovsd 4*SIZE(BO1), %xmm2 vmovsd 6*SIZE(BO1), %xmm3 vmovsd %xmm0, (BO) vmovsd %xmm1, 2*SIZE(BO) vmovsd %xmm2, 4*SIZE(BO) vmovsd %xmm3, 6*SIZE(BO) addq $8*SIZE,BO1 addq $8*SIZE,BO decq %rax jnz .L2_01a .L2_01b: movq K, %rax andq $3, %rax // K % 4 jz .L2_02d ALIGN_4 .L2_02c: vmovsd (BO1), %xmm0 vmovsd %xmm0, (BO) addq $2*SIZE,BO1 addq $2*SIZE,BO decq %rax jnz .L2_02c .L2_02d: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L2_20 ALIGN_4 .L2_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $16, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L2_16 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_12: KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB je .L2_16 KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB je .L2_16 jmp .L2_12 ALIGN_4 .L2_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_19 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_17: KERNEL16x2_SUB jl .L2_17 ALIGN_4 .L2_19: SAVE16x2 #if (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $16, KK #endif addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_20: // Test rest of M testq $15, M jz .L2_60 // to next 2 lines of N testq $8, M jz .L2_21pre ALIGN_4 /**************************************************************************/ .L2_20_1: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax // number of values in A #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_20_6 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_20_2: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB je .L2_20_6 KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB je .L2_20_6 jmp .L2_20_2 ALIGN_4 .L2_20_6: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_20_9 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_20_7: KERNEL8x2_SUB jl .L2_20_7 ALIGN_4 .L2_20_9: SAVE8x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L2_21pre: testq $4, M jz .L2_30 ALIGN_4 .L2_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL 
movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in A #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_26 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 1 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB je .L2_26 KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB je .L2_26 jmp .L2_22 ALIGN_4 .L2_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_29 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_27: KERNEL4x2_SUB jl .L2_27 ALIGN_4 .L2_29: SAVE4x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L2_30: testq $2, M jz .L2_40 ALIGN_4 .L2_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_36 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_32: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB je .L2_36 KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB je .L2_36 jmp .L2_32 ALIGN_4 .L2_36: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_39 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_37: KERNEL2x2_SUB jl .L2_37 ALIGN_4 .L2_39: SAVE2x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 
leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L2_40: testq $1, M jz .L2_60 // to next 2 lines of N ALIGN_4 .L2_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_46 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_42: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB je .L2_46 KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB je .L2_46 jmp .L2_42 ALIGN_4 .L2_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_49 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_47: KERNEL1x2_SUB jl .L2_47 ALIGN_4 .L2_49: SAVE1x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L2_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif .L1_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovss (BO1), %xmm0 vmovss %xmm0, (BO) addq $1*SIZE,BO1 addq $1*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L1_20 ALIGN_4 .L1_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $16, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L1_16 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_12: KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB je .L1_16 KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB je .L1_16 jmp .L1_12 ALIGN_4 .L1_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_19 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_17: KERNEL16x1_SUB jl .L1_17 ALIGN_4 .L1_19: SAVE16x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $16, KK #endif addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L1_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_20: // Test rest of M testq $15, M jz .L999 testq $8, M jz .L1_21pre ALIGN_4 /**************************************************************************/ .L1_20_1: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax // number of values in A #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_20_6 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_2: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB je .L1_20_6 KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB je .L1_20_6 jmp .L1_20_2 ALIGN_4 .L1_20_6: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_20_9 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_7: KERNEL8x1_SUB jl .L1_20_7 ALIGN_4 .L1_20_9: SAVE8x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 
8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L1_21pre: testq $4, M jz .L1_30 ALIGN_4 .L1_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in A #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_26 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB je .L1_26 KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB je .L1_26 jmp .L1_22 ALIGN_4 .L1_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_29 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_27: KERNEL4x1_SUB jl .L1_27 ALIGN_4 .L1_29: SAVE4x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L1_30: testq $2, M jz .L1_40 ALIGN_4 .L1_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_36 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_32: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB je .L1_36 KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB je .L1_36 jmp .L1_32 ALIGN_4 .L1_36: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_39 movq %rax, BI // Index for BO 
salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_37: KERNEL2x1_SUB jl .L1_37 ALIGN_4 .L1_39: SAVE2x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L1_40: testq $1, M jz .L999 ALIGN_4 .L1_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_46 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_42: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB je .L1_46 KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB je .L1_46 jmp .L1_42 ALIGN_4 .L1_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_49 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_47: KERNEL1x1_SUB jl .L1_47 ALIGN_4 .L1_49: SAVE1x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L999: movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE #else /************************************************************************************* * TRMM Kernel *************************************************************************************/ PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq 
OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL vmovsd OLD_OFFSET, %xmm12 #endif vmovaps %xmm3, %xmm0 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL movsd STACKSIZE + 16(%rsp), %xmm12 #endif #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $0, OLD_M je .L999 cmpq $0, OLD_N je .L999 cmpq $0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovss %xmm0, ALPHA salq $BASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $4, %rdi divq %rdi // N / 4 movq %rax, Ndiv6 // N / 4 movq %rdx, Nmod6 // N % 4 #ifdef TRMMKERNEL vmovsd %xmm12, OFFSET vmovsd %xmm12, KK #ifndef LEFT negq KK #endif #endif movq Ndiv6, J cmpq $0, J je .L2_0 ALIGN_4 /*******************************************************************************************/ .L4_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax sarq $2, %rax // K / 4 jz .L4_01b ALIGN_4 .L4_01a: prefetcht0 512(BO1) prefetchw 512(BO) vmovups (BO1), %xmm0 vmovups 4*SIZE(BO1), %xmm1 vmovups 8*SIZE(BO1), %xmm2 vmovups 12*SIZE(BO1), %xmm3 vmovups %xmm0, (BO) vmovups %xmm1, 4*SIZE(BO) vmovups %xmm2, 8*SIZE(BO) vmovups %xmm3,12*SIZE(BO) addq $ 16*SIZE,BO1 addq $ 16*SIZE,BO decq %rax jnz .L4_01a .L4_01b: movq K, %rax andq $3, %rax // K % 4 jz .L4_02d ALIGN_4 .L4_02c: vmovups (BO1), %xmm0 vmovups %xmm0, (BO) addq $ 4*SIZE,BO1 addq $ 4*SIZE,BO decq %rax jnz .L4_02c .L4_02d: movq BO1, B // next offset of B .L4_10: movq C, CO1 leaq (C, LDC, 2), CO2 leaq (C, LDC, 4), C // c += 4 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $ 16 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L4_20 ALIGN_4 .L4_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $16, %rax // number of values in AO #else addq $4, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L4_16 movq %rax, BI // Index for BO leaq (,BI,4) , BI // BI = BI * 4 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_12: prefetcht0 A_PR1(AO, %rax, SIZE) prefetcht0 B_PR1(BO, BI , SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) prefetcht0 B_PR1(BO, BI , SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB je .L4_16 prefetcht0 A_PR1(AO, %rax, SIZE) prefetcht0 B_PR1(BO, BI , SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) prefetcht0 B_PR1(BO, BI , SIZE) KERNEL16x4_SUB 
prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB je .L4_16 jmp .L4_12 ALIGN_4 .L4_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L4_19 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_17: KERNEL16x4_SUB jl .L4_17 ALIGN_4 .L4_19: SAVE16x4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $16, KK #endif addq $16 * SIZE, CO1 # coffset += 16 addq $16 * SIZE, CO2 # coffset += 16 decq I # i -- jg .L4_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L4_20: // Test rest of M testq $15, M jz .L4_60 // to next 3 lines of N testq $8, M jz .L4_21pre ALIGN_4 /**************************************************************************/ .L4_20_1: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax // number of values in A #else addq $4, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L4_20_6 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_20_2: KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB je .L4_20_6 KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB je .L4_20_6 jmp .L4_20_2 ALIGN_4 .L4_20_6: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L4_20_9 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_20_7: KERNEL8x4_SUB jl .L4_20_7 ALIGN_4 .L4_20_9: SAVE8x4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 8 addq $8 * SIZE, CO2 # 
coffset += 8 ALIGN_4 /**************************************************************************/ .L4_21pre: testq $4, M jz .L4_30 ALIGN_4 .L4_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in A #else addq $4, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L4_26 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_22: KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB je .L4_26 KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB je .L4_26 jmp .L4_22 ALIGN_4 .L4_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L4_29 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_27: KERNEL4x4_SUB jl .L4_27 ALIGN_4 .L4_29: SAVE4x4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L4_30: testq $2, M jz .L4_40 ALIGN_4 .L4_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $4, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L4_36 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_32: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB je .L4_36 KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB 
KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB je .L4_36 jmp .L4_32 ALIGN_4 .L4_36: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L4_39 movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_37: KERNEL2x4_SUB jl .L4_37 ALIGN_4 .L4_39: SAVE2x4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 2 addq $2 * SIZE, CO2 # coffset += 2 ALIGN_4 .L4_40: testq $1, M jz .L4_60 // to next 4 lines of N ALIGN_4 .L4_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $4, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L4_46 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_42: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB je .L4_46 KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB je .L4_46 jmp .L4_42 ALIGN_4 .L4_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L4_49 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_47: KERNEL1x4_SUB jl .L4_47 ALIGN_4 .L4_49: SAVE1x4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO1 # coffset += 1 addq $1 * SIZE, CO2 # coffset += 1 ALIGN_4 .L4_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK #endif decq J // j -- jg .L4_01 // next 4 lines of N /*******************************************************************************************/ .L2_0: movq Nmod6, J andq $3, J // j % 4 je .L999 movq Nmod6, J andq $2, J // j % 4 je .L1_0 .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax sarq $2, %rax // K / 4 jz .L2_01b ALIGN_4 .L2_01a: vmovsd (BO1), %xmm0 vmovsd 2*SIZE(BO1), %xmm1 vmovsd 4*SIZE(BO1), %xmm2 vmovsd 6*SIZE(BO1), %xmm3 vmovsd %xmm0, (BO) vmovsd %xmm1, 2*SIZE(BO) vmovsd %xmm2, 4*SIZE(BO) vmovsd 
%xmm3, 6*SIZE(BO) addq $8*SIZE,BO1 addq $8*SIZE,BO decq %rax jnz .L2_01a .L2_01b: movq K, %rax andq $3, %rax // K % 4 jz .L2_02d ALIGN_4 .L2_02c: vmovsd (BO1), %xmm0 vmovsd %xmm0, (BO) addq $2*SIZE,BO1 addq $2*SIZE,BO decq %rax jnz .L2_02c .L2_02d: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L2_20 ALIGN_4 .L2_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $16, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L2_16 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_12: KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB je .L2_16 KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB je .L2_16 jmp .L2_12 ALIGN_4 .L2_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_19 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_17: KERNEL16x2_SUB jl .L2_17 ALIGN_4 .L2_19: SAVE16x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $16, KK #endif addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_20: // Test rest of M testq $15, M jz .L2_60 // to next 2 lines of N testq $8, M jz .L2_21pre ALIGN_4 /**************************************************************************/ .L2_20_1: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif 
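/* Note on the TRMM entry above (a rough sketch, not code from this file):
   when TRMMKERNEL is defined, KK counts the k iterations already covered
   by the triangular offset, so before the accumulators are cleared both
   packed panels are advanced past them.  In C-like terms, with
   illustrative names:

       bo  = buffer1 + 4;     // addq $4*SIZE, BO  (bias matching the
                              //  -4*SIZE displacements in the kernel macros)
       bo += 2 * kk;          // leaq (BI,BI,1): 2 packed B values per k step
       ao += 8 * kk;          // salq $3, %rax : 8 packed A values per k step

   The non-TRMM branch applies only the fixed 4-element bias. */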
vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax // number of values in A #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_20_6 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_20_2: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB je .L2_20_6 KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB je .L2_20_6 jmp .L2_20_2 ALIGN_4 .L2_20_6: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_20_9 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_20_7: KERNEL8x2_SUB jl .L2_20_7 ALIGN_4 .L2_20_9: SAVE8x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L2_21pre: testq $4, M jz .L2_30 ALIGN_4 .L2_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in A #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_26 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 1 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB je .L2_26 KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB je .L2_26 jmp .L2_22 ALIGN_4 .L2_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_29 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_27: KERNEL4x2_SUB jl .L2_27 ALIGN_4 .L2_29: SAVE4x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L2_30: testq $2, M jz .L2_40 ALIGN_4 .L2_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_36 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_32: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB je .L2_36 KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB je .L2_36 jmp .L2_32 ALIGN_4 .L2_36: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_39 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_37: KERNEL2x2_SUB jl .L2_37 ALIGN_4 .L2_39: SAVE2x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L2_40: testq $1, M jz .L2_60 // to next 2 lines of N ALIGN_4 .L2_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_46 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_42: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB 
KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB je .L2_46 KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB je .L2_46 jmp .L2_42 ALIGN_4 .L2_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_49 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_47: KERNEL1x2_SUB jl .L2_47 ALIGN_4 .L2_49: SAVE1x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L2_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif .L1_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovss (BO1), %xmm0 vmovss %xmm0, (BO) addq $1*SIZE,BO1 addq $1*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L1_20 ALIGN_4 .L1_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $16, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L1_16 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_12: KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB je .L1_16 KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB je .L1_16 jmp .L1_12 ALIGN_4 .L1_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_19 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_17: KERNEL16x1_SUB jl .L1_17 ALIGN_4 .L1_19: SAVE16x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 
; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $16, KK #endif addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L1_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_20: // Test rest of M testq $15, M jz .L999 testq $8, M jz .L1_21pre ALIGN_4 /**************************************************************************/ .L1_20_1: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax // number of values in A #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_20_6 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_2: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB je .L1_20_6 KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB je .L1_20_6 jmp .L1_20_2 ALIGN_4 .L1_20_6: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_20_9 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_7: KERNEL8x1_SUB jl .L1_20_7 ALIGN_4 .L1_20_9: SAVE8x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L1_21pre: testq $4, M jz .L1_30 ALIGN_4 .L1_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in A #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_26 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB 
KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB je .L1_26 KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB je .L1_26 jmp .L1_22 ALIGN_4 .L1_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_29 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_27: KERNEL4x1_SUB jl .L1_27 ALIGN_4 .L1_29: SAVE4x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L1_30: testq $2, M jz .L1_40 ALIGN_4 .L1_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_36 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_32: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB je .L1_36 KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB je .L1_36 jmp .L1_32 ALIGN_4 .L1_36: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_39 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_37: KERNEL2x1_SUB jl .L1_37 ALIGN_4 .L1_39: SAVE2x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L1_40: testq $1, M jz .L999 ALIGN_4 .L1_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $1, %rax // number of 
values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_46 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_42: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB je .L1_46 KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB je .L1_46 jmp .L1_42 ALIGN_4 .L1_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_49 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_47: KERNEL1x1_SUB jl .L1_47 ALIGN_4 .L1_49: SAVE1x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L999: movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE #endif OpenBLAS-0.2.20/kernel/x86_64/sgemm_kernel_16x4_sandy.S000066400000000000000000002100251313527062700221360ustar00rootroot00000000000000/********************************************************************************* Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define J %r14 #define OLD_K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define K %r12 #define BI %rbp #define SP %rbx #define BO1 %rdi #define CO2 %rdx #ifndef WINDOWS_ABI #define STACKSIZE 96 #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define L_BUFFER_SIZE 8192 #define Ndiv6 24(%rsp) #define Nmod6 32(%rsp) #define N 40(%rsp) #define ALPHA 48(%rsp) #define OFFSET 56(%rsp) #define KK 64(%rsp) #define KKK 72(%rsp) #define BUFFER1 128(%rsp) #if defined(OS_WINDOWS) #if L_BUFFER_SIZE > 16384 #define STACK_TOUCH \ movl $0, 4096 * 4(%rsp);\ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 12288 #define STACK_TOUCH \ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 8192 #define STACK_TOUCH \ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 4096 #define STACK_TOUCH \ movl $0, 4096 * 1(%rsp); #else #define STACK_TOUCH #endif #else #define STACK_TOUCH #endif #define A_PR1 512 #define B_PR1 512 /******************************************************************************************* * 4 lines of N *******************************************************************************************/ .macro KERNEL16x4_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 vmulps %ymm2 , %ymm0 , %ymm12 vmulps %ymm2 , %ymm1 , %ymm13 vmulps %ymm3 , %ymm0 , %ymm14 vmulps %ymm3 , %ymm1 , %ymm15 vaddps %ymm12, %ymm4 , %ymm4 vaddps %ymm13, %ymm5 , %ymm5 vaddps %ymm14, %ymm6 , %ymm6 vaddps %ymm15, %ymm7 , %ymm7 vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 vmulps %ymm2 , %ymm0 , %ymm12 vmulps %ymm2 , %ymm1 , %ymm13 vmulps %ymm3 , %ymm0 , %ymm14 vmulps %ymm3 , %ymm1 , %ymm15 vaddps %ymm12, %ymm8 , %ymm8 vaddps %ymm13, %ymm9 , %ymm9 vaddps %ymm14, %ymm10, %ymm10 vaddps %ymm15, %ymm11, %ymm11 addq $ 4 , BI addq $ 16, %rax .endm .macro SAVE16x4 vbroadcastss ALPHA, %ymm0 vmulps %ymm0 , %ymm4 , %ymm4 vmulps %ymm0 , %ymm5 , %ymm5 vmulps %ymm0 , %ymm6 , %ymm6 vmulps %ymm0 , %ymm7 , %ymm7 vmulps %ymm0 , %ymm8 , %ymm8 vmulps %ymm0 , %ymm9 , %ymm9 vmulps %ymm0 , %ymm10, %ymm10 vmulps %ymm0 , %ymm11, %ymm11 #if !defined(TRMMKERNEL) vaddps (CO1), %ymm4,%ymm4 vaddps 8 * SIZE(CO1), %ymm5,%ymm5 vaddps (CO1, LDC), %ymm6,%ymm6 vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 vaddps (CO2), %ymm8,%ymm8 vaddps 8 * SIZE(CO2), %ymm9,%ymm9 vaddps (CO2, LDC), %ymm10,%ymm10 vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 #endif vmovups %ymm4 , (CO1) vmovups %ymm5 , 8 * SIZE(CO1) vmovups %ymm6 , (CO1, LDC) vmovups %ymm7 , 8 * SIZE(CO1, LDC) vmovups %ymm8 , (CO2) vmovups %ymm9 , 8 * SIZE(CO2) vmovups %ymm10, (CO2, LDC) vmovups %ymm11, 8 * SIZE(CO2, LDC) prefetcht0 64(CO1) prefetcht0 64(CO1, LDC) prefetcht0 64(CO2) prefetcht0 64(CO2, LDC) .endm /*******************************************************************************************/ .macro KERNEL8x4_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 vbroadcastss 
-3 * SIZE(BO, BI, SIZE), %ymm3 vmulps %ymm2 , %ymm0 , %ymm12 vmulps %ymm3 , %ymm0 , %ymm14 vaddps %ymm12, %ymm4 , %ymm4 vaddps %ymm14, %ymm6 , %ymm6 vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 vmulps %ymm2 , %ymm0 , %ymm12 vmulps %ymm3 , %ymm0 , %ymm14 vaddps %ymm12, %ymm8 , %ymm8 vaddps %ymm14, %ymm10, %ymm10 addq $ 4 , BI addq $ 8 , %rax .endm .macro SAVE8x4 vbroadcastss ALPHA, %ymm0 vmulps %ymm0 , %ymm4 , %ymm4 vmulps %ymm0 , %ymm6 , %ymm6 vmulps %ymm0 , %ymm8 , %ymm8 vmulps %ymm0 , %ymm10, %ymm10 #if !defined(TRMMKERNEL) vaddps (CO1), %ymm4,%ymm4 vaddps (CO1, LDC), %ymm6,%ymm6 vaddps (CO2), %ymm8,%ymm8 vaddps (CO2, LDC), %ymm10,%ymm10 #endif vmovups %ymm4 , (CO1) vmovups %ymm6 , (CO1, LDC) vmovups %ymm8 , (CO2) vmovups %ymm10, (CO2, LDC) .endm /*******************************************************************************************/ .macro KERNEL4x4_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 vmulps %xmm2 , %xmm0 , %xmm12 vmulps %xmm3 , %xmm0 , %xmm14 vaddps %xmm12, %xmm4 , %xmm4 vaddps %xmm14, %xmm6 , %xmm6 vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 vmulps %xmm2 , %xmm0 , %xmm12 vmulps %xmm3 , %xmm0 , %xmm14 vaddps %xmm12, %xmm8 , %xmm8 vaddps %xmm14, %xmm10, %xmm10 addq $ 4 , BI addq $ 4 , %rax .endm .macro SAVE4x4 vbroadcastss ALPHA, %xmm0 vmulps %xmm0 , %xmm4 , %xmm4 vmulps %xmm0 , %xmm6 , %xmm6 vmulps %xmm0 , %xmm8 , %xmm8 vmulps %xmm0 , %xmm10, %xmm10 #if !defined(TRMMKERNEL) vaddps (CO1), %xmm4,%xmm4 vaddps (CO1, LDC), %xmm6,%xmm6 vaddps (CO2), %xmm8,%xmm8 vaddps (CO2, LDC), %xmm10,%xmm10 #endif vmovups %xmm4 , (CO1) vmovups %xmm6 , (CO1, LDC) vmovups %xmm8 , (CO2) vmovups %xmm10, (CO2, LDC) .endm /*******************************************************************************************/ .macro KERNEL2x4_SUB vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 vmulss %xmm2 , %xmm0 , %xmm12 vmulss %xmm2 , %xmm1 , %xmm13 vmulss %xmm3 , %xmm0 , %xmm14 vmulss %xmm3 , %xmm1 , %xmm15 vaddss %xmm12, %xmm4 , %xmm4 vaddss %xmm13, %xmm5 , %xmm5 vaddss %xmm14, %xmm6 , %xmm6 vaddss %xmm15, %xmm7 , %xmm7 vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 vmulss %xmm2 , %xmm0 , %xmm12 vmulss %xmm2 , %xmm1 , %xmm13 vmulss %xmm3 , %xmm0 , %xmm14 vmulss %xmm3 , %xmm1 , %xmm15 vaddss %xmm12, %xmm8 , %xmm8 vaddss %xmm13, %xmm9 , %xmm9 vaddss %xmm14, %xmm10, %xmm10 vaddss %xmm15, %xmm11, %xmm11 addq $ 4 , BI addq $ 2, %rax .endm .macro SAVE2x4 vmovss ALPHA, %xmm0 vmulss %xmm0 , %xmm4 , %xmm4 vmulss %xmm0 , %xmm5 , %xmm5 vmulss %xmm0 , %xmm6 , %xmm6 vmulss %xmm0 , %xmm7 , %xmm7 vmulss %xmm0 , %xmm8 , %xmm8 vmulss %xmm0 , %xmm9 , %xmm9 vmulss %xmm0 , %xmm10, %xmm10 vmulss %xmm0 , %xmm11, %xmm11 #if !defined(TRMMKERNEL) vaddss (CO1), %xmm4,%xmm4 vaddss 1 * SIZE(CO1), %xmm5,%xmm5 vaddss (CO1, LDC), %xmm6,%xmm6 vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 vaddss (CO2), %xmm8,%xmm8 vaddss 1 * SIZE(CO2), %xmm9,%xmm9 vaddss (CO2, LDC), %xmm10,%xmm10 vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 #endif vmovss %xmm4 , (CO1) vmovss %xmm5 , 1 * SIZE(CO1) vmovss %xmm6 , (CO1, LDC) vmovss %xmm7 , 1 * SIZE(CO1, LDC) vmovss %xmm8 , (CO2) vmovss %xmm9 , 1 * SIZE(CO2) vmovss %xmm10, (CO2, LDC) vmovss %xmm11, 1 * SIZE(CO2, LDC) .endm 
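/* Overview of the m x 4 micro-kernels above (a rough scalar sketch with
   illustrative names, not code from this file): each KERNEL<m>x4_SUB
   performs one k step of a rank-1 update on an m x 4 tile of C held in
   registers, and the matching SAVE<m>x4 scales by ALPHA and, unless
   TRMMKERNEL is defined, adds the existing C values before storing:

       for (k = 0; k < K; k++)                  // KERNEL<m>x4_SUB, once per k
           for (i = 0; i < m; i++)
               for (j = 0; j < 4; j++)
                   acc[i][j] += a[k*m + i] * b[k*4 + j];

       for (i = 0; i < m; i++)                  // SAVE<m>x4
           for (j = 0; j < 4; j++)
               c[i + j*ldc] = alpha * acc[i][j]
                            + (trmm ? 0.0f : c[i + j*ldc]);

   In the assembly the i loop is vectorized (ymm registers for 16 and 8
   rows, xmm or scalar moves for the smaller tiles) and the j loop is
   unrolled across the CO1/CO2 and LDC column pointers. */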
/*******************************************************************************************/ .macro KERNEL1x4_SUB vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 vmulss %xmm2 , %xmm0 , %xmm12 vmulss %xmm3 , %xmm0 , %xmm14 vaddss %xmm12, %xmm4 , %xmm4 vaddss %xmm14, %xmm6 , %xmm6 vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 vmulss %xmm2 , %xmm0 , %xmm12 vmulss %xmm3 , %xmm0 , %xmm14 vaddss %xmm12, %xmm8 , %xmm8 vaddss %xmm14, %xmm10, %xmm10 addq $ 4 , BI addq $ 1, %rax .endm .macro SAVE1x4 vmovss ALPHA, %xmm0 vmulss %xmm0 , %xmm4 , %xmm4 vmulss %xmm0 , %xmm6 , %xmm6 vmulss %xmm0 , %xmm8 , %xmm8 vmulss %xmm0 , %xmm10, %xmm10 #if !defined(TRMMKERNEL) vaddss (CO1), %xmm4,%xmm4 vaddss (CO1, LDC), %xmm6,%xmm6 vaddss (CO2), %xmm8,%xmm8 vaddss (CO2, LDC), %xmm10,%xmm10 #endif vmovss %xmm4 , (CO1) vmovss %xmm6 , (CO1, LDC) vmovss %xmm8 , (CO2) vmovss %xmm10, (CO2, LDC) .endm /*******************************************************************************************/ /******************************************************************************************* * 2 lines of N *******************************************************************************************/ .macro KERNEL16x2_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 vmulps %ymm2 , %ymm0 , %ymm12 vmulps %ymm2 , %ymm1 , %ymm13 vmulps %ymm3 , %ymm0 , %ymm14 vmulps %ymm3 , %ymm1 , %ymm15 vaddps %ymm12, %ymm4 , %ymm4 vaddps %ymm13, %ymm5 , %ymm5 vaddps %ymm14, %ymm6 , %ymm6 vaddps %ymm15, %ymm7 , %ymm7 addq $ 2 , BI addq $ 16, %rax .endm .macro SAVE16x2 vbroadcastss ALPHA, %ymm0 vmulps %ymm0 , %ymm4 , %ymm4 vmulps %ymm0 , %ymm5 , %ymm5 vmulps %ymm0 , %ymm6 , %ymm6 vmulps %ymm0 , %ymm7 , %ymm7 #if !defined(TRMMKERNEL) vaddps (CO1), %ymm4,%ymm4 vaddps 8 * SIZE(CO1), %ymm5,%ymm5 vaddps (CO1, LDC), %ymm6,%ymm6 vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 #endif vmovups %ymm4 , (CO1) vmovups %ymm5 , 8 * SIZE(CO1) vmovups %ymm6 , (CO1, LDC) vmovups %ymm7 , 8 * SIZE(CO1, LDC) .endm /*******************************************************************************************/ .macro KERNEL8x2_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 vmulps %ymm2 , %ymm0 , %ymm12 vmulps %ymm3 , %ymm0 , %ymm14 vaddps %ymm12, %ymm4 , %ymm4 vaddps %ymm14, %ymm6 , %ymm6 addq $ 2 , BI addq $ 8 , %rax .endm .macro SAVE8x2 vbroadcastss ALPHA, %ymm0 vmulps %ymm0 , %ymm4 , %ymm4 vmulps %ymm0 , %ymm6 , %ymm6 #if !defined(TRMMKERNEL) vaddps (CO1), %ymm4,%ymm4 vaddps (CO1, LDC), %ymm6,%ymm6 #endif vmovups %ymm4 , (CO1) vmovups %ymm6 , (CO1, LDC) .endm /*******************************************************************************************/ .macro KERNEL4x2_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 vmulps %xmm2 , %xmm0 , %xmm12 vmulps %xmm3 , %xmm0 , %xmm14 vaddps %xmm12, %xmm4 , %xmm4 vaddps %xmm14, %xmm6 , %xmm6 addq $ 2 , BI addq $ 4 , %rax .endm .macro SAVE4x2 vbroadcastss ALPHA, %xmm0 vmulps %xmm0 , %xmm4 , %xmm4 vmulps %xmm0 , %xmm6 , %xmm6 #if !defined(TRMMKERNEL) vaddps (CO1), %xmm4,%xmm4 vaddps (CO1, LDC), %xmm6,%xmm6 #endif vmovups %xmm4 , (CO1) vmovups %xmm6 , (CO1, LDC) .endm 
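/* The <m>x2 kernels follow the same pattern with only two live B columns:
   BI advances by 2 per k step instead of 4 and the SAVE<m>x2 macros write
   only CO1 and CO1 + LDC.  The addressing runs on negated counters: AO and
   BO are pre-biased (addq $16*SIZE / $4*SIZE) and then advanced to the end
   of the packed panels, %rax and BI are negated, and the fixed -16*SIZE and
   -4*SIZE displacements cancel the bias, so incrementing the counters
   toward zero walks forward through A and B.  Rough index sketch of one
   8x2 k step (illustrative names only):

       a0..a7 = a_panel[ra + 0 .. ra + 7];  // vmovups -16*SIZE(AO,%rax,SIZE)
       b0     = b_panel[bi + 0];            // vbroadcastss -4*SIZE(BO,BI,SIZE)
       b1     = b_panel[bi + 1];            // vbroadcastss -3*SIZE(BO,BI,SIZE)
       acc0  += a * b0;   acc1 += a * b1;
       bi    += 2;        ra   += 8;
*/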
/*******************************************************************************************/ .macro KERNEL2x2_SUB vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 vmulss %xmm2 , %xmm0 , %xmm12 vmulss %xmm2 , %xmm1 , %xmm13 vmulss %xmm3 , %xmm0 , %xmm14 vmulss %xmm3 , %xmm1 , %xmm15 vaddss %xmm12, %xmm4 , %xmm4 vaddss %xmm13, %xmm5 , %xmm5 vaddss %xmm14, %xmm6 , %xmm6 vaddss %xmm15, %xmm7 , %xmm7 addq $ 2 , BI addq $ 2, %rax .endm .macro SAVE2x2 vmovss ALPHA, %xmm0 vmulss %xmm0 , %xmm4 , %xmm4 vmulss %xmm0 , %xmm5 , %xmm5 vmulss %xmm0 , %xmm6 , %xmm6 vmulss %xmm0 , %xmm7 , %xmm7 #if !defined(TRMMKERNEL) vaddss (CO1), %xmm4,%xmm4 vaddss 1 * SIZE(CO1), %xmm5,%xmm5 vaddss (CO1, LDC), %xmm6,%xmm6 vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 #endif vmovss %xmm4 , (CO1) vmovss %xmm5 , 1 * SIZE(CO1) vmovss %xmm6 , (CO1, LDC) vmovss %xmm7 , 1 * SIZE(CO1, LDC) .endm /*******************************************************************************************/ .macro KERNEL1x2_SUB vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 vmulss %xmm2 , %xmm0 , %xmm12 vmulss %xmm3 , %xmm0 , %xmm14 vaddss %xmm12, %xmm4 , %xmm4 vaddss %xmm14, %xmm6 , %xmm6 addq $ 2 , BI addq $ 1, %rax .endm .macro SAVE1x2 vmovss ALPHA, %xmm0 vmulss %xmm0 , %xmm4 , %xmm4 vmulss %xmm0 , %xmm6 , %xmm6 #if !defined(TRMMKERNEL) vaddss (CO1), %xmm4,%xmm4 vaddss (CO1, LDC), %xmm6,%xmm6 #endif vmovss %xmm4 , (CO1) vmovss %xmm6 , (CO1, LDC) .endm /*******************************************************************************************/ /******************************************************************************************* * 1 line of N *******************************************************************************************/ .macro KERNEL16x1_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 vmulps %ymm2 , %ymm0 , %ymm12 vmulps %ymm2 , %ymm1 , %ymm13 vaddps %ymm12, %ymm4 , %ymm4 vaddps %ymm13, %ymm5 , %ymm5 addq $ 1 , BI addq $ 16, %rax .endm .macro SAVE16x1 vbroadcastss ALPHA, %ymm0 vmulps %ymm0 , %ymm4 , %ymm4 vmulps %ymm0 , %ymm5 , %ymm5 #if !defined(TRMMKERNEL) vaddps (CO1), %ymm4,%ymm4 vaddps 8 * SIZE(CO1), %ymm5,%ymm5 #endif vmovups %ymm4 , (CO1) vmovups %ymm5 , 8 * SIZE(CO1) .endm /*******************************************************************************************/ .macro KERNEL8x1_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 vmulps %ymm2 , %ymm0 , %ymm12 vaddps %ymm12, %ymm4 , %ymm4 addq $ 1 , BI addq $ 8 , %rax .endm .macro SAVE8x1 vbroadcastss ALPHA, %ymm0 vmulps %ymm0 , %ymm4 , %ymm4 #if !defined(TRMMKERNEL) vaddps (CO1), %ymm4,%ymm4 #endif vmovups %ymm4 , (CO1) .endm /*******************************************************************************************/ .macro KERNEL4x1_SUB vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 vmulps %xmm2 , %xmm0 , %xmm12 vaddps %xmm12, %xmm4 , %xmm4 addq $ 1 , BI addq $ 4 , %rax .endm .macro SAVE4x1 vbroadcastss ALPHA, %xmm0 vmulps %xmm0 , %xmm4 , %xmm4 #if !defined(TRMMKERNEL) vaddps (CO1), %xmm4,%xmm4 #endif vmovups %xmm4 , (CO1) .endm /*******************************************************************************************/ .macro KERNEL2x1_SUB vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 vmovss -4 * 
SIZE(BO, BI, SIZE), %xmm2 vmulss %xmm2 , %xmm0 , %xmm12 vmulss %xmm2 , %xmm1 , %xmm13 vaddss %xmm12, %xmm4 , %xmm4 vaddss %xmm13, %xmm5 , %xmm5 addq $ 1 , BI addq $ 2 , %rax .endm .macro SAVE2x1 vmovss ALPHA, %xmm0 vmulss %xmm0 , %xmm4 , %xmm4 vmulss %xmm0 , %xmm5 , %xmm5 #if !defined(TRMMKERNEL) vaddss (CO1), %xmm4,%xmm4 vaddss 1 * SIZE(CO1), %xmm5,%xmm5 #endif vmovss %xmm4 , (CO1) vmovss %xmm5 , 1 * SIZE(CO1) .endm /*******************************************************************************************/ .macro KERNEL1x1_SUB vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 vmulss %xmm2 , %xmm0 , %xmm12 vaddss %xmm12, %xmm4 , %xmm4 addq $ 1 , BI addq $ 1 , %rax .endm .macro SAVE1x1 vmovss ALPHA, %xmm0 vmulss %xmm0 , %xmm4 , %xmm4 #if !defined(TRMMKERNEL) vaddss (CO1), %xmm4,%xmm4 #endif vmovss %xmm4 , (CO1) .endm /*******************************************************************************************/ /************************************************************************************* * TRMM Kernel *************************************************************************************/ PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL vmovsd OLD_OFFSET, %xmm12 #endif vmovaps %xmm3, %xmm0 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL movsd STACKSIZE + 16(%rsp), %xmm12 #endif #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $0, OLD_M je .L999 cmpq $0, OLD_N je .L999 cmpq $0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovss %xmm0, ALPHA salq $BASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $4, %rdi divq %rdi // N / 4 movq %rax, Ndiv6 // N / 4 movq %rdx, Nmod6 // N % 4 #ifdef TRMMKERNEL vmovsd %xmm12, OFFSET vmovsd %xmm12, KK #ifndef LEFT negq KK #endif #endif movq Ndiv6, J cmpq $0, J je .L2_0 ALIGN_4 /*******************************************************************************************/ .L4_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax sarq $2, %rax // K / 4 jz .L4_01b ALIGN_4 .L4_01a: prefetcht0 512(BO1) prefetchw 512(BO) vmovups (BO1), %xmm0 vmovups 4*SIZE(BO1), %xmm1 vmovups 8*SIZE(BO1), %xmm2 vmovups 12*SIZE(BO1), %xmm3 vmovups %xmm0, (BO) vmovups %xmm1, 4*SIZE(BO) vmovups %xmm2, 8*SIZE(BO) vmovups %xmm3,12*SIZE(BO) addq $ 16*SIZE,BO1 addq $ 16*SIZE,BO decq %rax jnz .L4_01a .L4_01b: movq K, %rax andq $3, %rax // K % 4 jz .L4_02d ALIGN_4 .L4_02c: vmovups (BO1), %xmm0 vmovups %xmm0, (BO) addq $ 4*SIZE,BO1 addq $ 4*SIZE,BO decq %rax jnz .L4_02c .L4_02d: movq BO1, B // next offset of B .L4_10: movq C, CO1 leaq (C, LDC, 2), CO2 leaq (C, LDC, 4), C // c += 4 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $ 16 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L4_20 ALIGN_4 .L4_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $16, %rax // number of values in AO #else addq $4, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L4_16 movq %rax, BI // Index for BO leaq (,BI,4) , BI // BI = BI * 4 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_12: prefetcht0 A_PR1(AO, %rax, SIZE) prefetcht0 B_PR1(BO, BI , SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) prefetcht0 B_PR1(BO, BI , SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB je .L4_16 prefetcht0 A_PR1(AO, %rax, SIZE) prefetcht0 B_PR1(BO, BI , SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) prefetcht0 B_PR1(BO, BI , SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB prefetcht0 A_PR1(AO, %rax, SIZE) KERNEL16x4_SUB je .L4_16 jmp .L4_12 ALIGN_4 .L4_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L4_19 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_17: KERNEL16x4_SUB jl .L4_17 ALIGN_4 .L4_19: SAVE16x4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $16, KK #endif addq $16 * SIZE, CO1 # coffset += 16 addq $16 * SIZE, CO2 # coffset += 16 decq I # i -- jg .L4_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L4_20: // Test rest of M testq $15, M jz .L4_60 // to next 3 lines of N testq $8, M jz .L4_21pre ALIGN_4 /**************************************************************************/ .L4_20_1: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of 
values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax // number of values in A #else addq $4, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L4_20_6 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_20_2: KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB je .L4_20_6 KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB KERNEL8x4_SUB je .L4_20_6 jmp .L4_20_2 ALIGN_4 .L4_20_6: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L4_20_9 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_20_7: KERNEL8x4_SUB jl .L4_20_7 ALIGN_4 .L4_20_9: SAVE8x4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 8 addq $8 * SIZE, CO2 # coffset += 8 ALIGN_4 /**************************************************************************/ .L4_21pre: testq $4, M jz .L4_30 ALIGN_4 .L4_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in A #else addq $4, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L4_26 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_22: KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB je .L4_26 KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB KERNEL4x4_SUB je .L4_26 jmp .L4_22 ALIGN_4 .L4_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L4_29 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_27: KERNEL4x4_SUB jl .L4_27 ALIGN_4 .L4_29: SAVE4x4 #if (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L4_30: testq $2, M jz .L4_40 ALIGN_4 .L4_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $4, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L4_36 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_32: KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB je .L4_36 KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB KERNEL2x4_SUB je .L4_36 jmp .L4_32 ALIGN_4 .L4_36: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L4_39 movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_37: KERNEL2x4_SUB jl .L4_37 ALIGN_4 .L4_39: SAVE2x4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 2 addq $2 * SIZE, CO2 # coffset += 2 ALIGN_4 .L4_40: testq $1, M jz .L4_60 // to next 4 lines of N ALIGN_4 .L4_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $4, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L4_46 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (AO, %rax, 
SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_42: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB je .L4_46 KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB je .L4_46 jmp .L4_42 ALIGN_4 .L4_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L4_49 movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L4_47: KERNEL1x4_SUB jl .L4_47 ALIGN_4 .L4_49: SAVE1x4 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (,BI, 4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO1 # coffset += 1 addq $1 * SIZE, CO2 # coffset += 1 ALIGN_4 .L4_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK #endif decq J // j -- jg .L4_01 // next 4 lines of N /*******************************************************************************************/ .L2_0: movq Nmod6, J andq $3, J // j % 4 je .L999 movq Nmod6, J andq $2, J // j % 4 je .L1_0 .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax sarq $2, %rax // K / 4 jz .L2_01b ALIGN_4 .L2_01a: vmovsd (BO1), %xmm0 vmovsd 2*SIZE(BO1), %xmm1 vmovsd 4*SIZE(BO1), %xmm2 vmovsd 6*SIZE(BO1), %xmm3 vmovsd %xmm0, (BO) vmovsd %xmm1, 2*SIZE(BO) vmovsd %xmm2, 4*SIZE(BO) vmovsd %xmm3, 6*SIZE(BO) addq $8*SIZE,BO1 addq $8*SIZE,BO decq %rax jnz .L2_01a .L2_01b: movq K, %rax andq $3, %rax // K % 4 jz .L2_02d ALIGN_4 .L2_02c: vmovsd (BO1), %xmm0 vmovsd %xmm0, (BO) addq $2*SIZE,BO1 addq $2*SIZE,BO decq %rax jnz .L2_02c .L2_02d: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L2_20 ALIGN_4 .L2_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $16, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L2_16 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_12: KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB je .L2_16 KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB je .L2_16 jmp .L2_12 
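/*
 * .L2_16 below: tail of the 16x2 block. The unrolled loop above (.L2_12)
 * issues KERNEL16x2_SUB in batches of eight with an early exit; the loop at
 * .L2_17 finishes the remaining k % 8 steps one at a time, then SAVE16x2
 * writes the 16x2 tile and CO1 advances by 16 elements. Roughly, each
 * KERNEL16x2_SUB accumulates (a sketch only, not the exact register use):
 *
 *   for (j = 0; j < 2; j++)            // the two packed B columns
 *       for (i = 0; i < 16; i++)       // the sixteen packed A rows
 *           acc[j][i] += a[16*l + i] * b[2*l + j];
 */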
ALIGN_4 .L2_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_19 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_17: KERNEL16x2_SUB jl .L2_17 ALIGN_4 .L2_19: SAVE16x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $16, KK #endif addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_20: // Test rest of M testq $15, M jz .L2_60 // to next 2 lines of N testq $8, M jz .L2_21pre ALIGN_4 /**************************************************************************/ .L2_20_1: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax // number of values in A #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_20_6 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_20_2: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB je .L2_20_6 KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB je .L2_20_6 jmp .L2_20_2 ALIGN_4 .L2_20_6: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_20_9 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_20_7: KERNEL8x2_SUB jl .L2_20_7 ALIGN_4 .L2_20_9: SAVE8x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L2_21pre: testq $4, M jz .L2_30 ALIGN_4 .L2_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in A #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_26 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 1 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB je .L2_26 KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB je .L2_26 jmp .L2_22 ALIGN_4 .L2_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_29 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_27: KERNEL4x2_SUB jl .L2_27 ALIGN_4 .L2_29: SAVE4x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L2_30: testq $2, M jz .L2_40 ALIGN_4 .L2_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_36 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_32: KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB je .L2_36 KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB KERNEL2x2_SUB je .L2_36 jmp .L2_32 ALIGN_4 .L2_36: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_39 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; 
number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_37: KERNEL2x2_SUB jl .L2_37 ALIGN_4 .L2_39: SAVE2x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L2_40: testq $1, M jz .L2_60 // to next 2 lines of N ALIGN_4 .L2_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_46 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_42: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB je .L2_46 KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB je .L2_46 jmp .L2_42 ALIGN_4 .L2_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_49 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_47: KERNEL1x2_SUB jl .L2_47 ALIGN_4 .L2_49: SAVE1x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L2_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif .L1_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovss (BO1), %xmm0 vmovss %xmm0, (BO) addq $1*SIZE,BO1 addq $1*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $16 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L1_20 ALIGN_4 .L1_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $16, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L1_16 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_12: KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB je .L1_16 KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB je .L1_16 jmp .L1_12 ALIGN_4 .L1_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_19 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_17: KERNEL16x1_SUB jl .L1_17 ALIGN_4 .L1_19: SAVE16x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $16, KK #endif addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L1_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_20: // Test rest of M testq $15, M jz .L999 testq $8, M jz .L1_21pre ALIGN_4 /**************************************************************************/ .L1_20_1: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax // number of values in A #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_20_6 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_2: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB je .L1_20_6 KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB je .L1_20_6 jmp .L1_20_2 ALIGN_4 .L1_20_6: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_20_9 movq %rax, BI 
// Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_7: KERNEL8x1_SUB jl .L1_20_7 ALIGN_4 .L1_20_9: SAVE8x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L1_21pre: testq $4, M jz .L1_30 ALIGN_4 .L1_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in A #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_26 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB je .L1_26 KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB je .L1_26 jmp .L1_22 ALIGN_4 .L1_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_29 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_27: KERNEL4x1_SUB jl .L1_27 ALIGN_4 .L1_29: SAVE4x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L1_30: testq $2, M jz .L1_40 ALIGN_4 .L1_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_36 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, 
SIZE), BO negq BI negq %rax ALIGN_4 .L1_32: KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB je .L1_36 KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB KERNEL2x1_SUB je .L1_36 jmp .L1_32 ALIGN_4 .L1_36: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_39 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_37: KERNEL2x1_SUB jl .L1_37 ALIGN_4 .L1_39: SAVE2x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L1_40: testq $1, M jz .L999 ALIGN_4 .L1_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_46 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_42: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB je .L1_46 KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB je .L1_46 jmp .L1_42 ALIGN_4 .L1_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_49 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_47: KERNEL1x1_SUB jl .L1_47 ALIGN_4 .L1_49: SAVE1x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L999: movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/sgemm_kernel_8x4_bulldozer.S000066400000000000000000001743101313527062700227510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define ALPHA 0(%rsp) #define J 16(%rsp) #define OFFSET 24(%rsp) #define KK 32(%rsp) #define KKK 40(%rsp) #define BUFFER 128(%rsp) #define PREFETCH prefetch #define PREFETCHSIZE (16 * 17 + 0) #define RPREFETCHSIZE (16 * 4 + 0) #define WPREFETCHSIZE (16 * 9 + 0) #define KERNEL1(xx) \ vfmaddps %xmm8,%xmm1,%xmm0,%xmm8 ;\ vmovaps %xmm2, %xmm0 ;\ vmovups -28 * SIZE(AO, %rax, 4),%xmm2 ;\ vfmaddps %xmm12,%xmm2, %xmm1, %xmm12 ;\ vmovups -24 * SIZE(BO, %rax, 8), %xmm1 ;\ vfmaddps %xmm9,%xmm3, %xmm0, %xmm9 ;\ vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\ vmovups -20 * SIZE(BO, %rax, 8), %xmm3 ;\ vfmaddps %xmm10,%xmm1, %xmm0, %xmm10 ;\ vfmaddps %xmm14,%xmm2, %xmm1, %xmm14 ;\ vfmaddps %xmm11,%xmm3, %xmm0, %xmm11 ;\ vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\ vmovups -24 * SIZE(AO, %rax, 4), %xmm0 ;\ vmovups -16 * SIZE(BO, %rax, 8), %xmm1 ;\ vmovups -12 * SIZE(BO, %rax, 8), %xmm3 ;\ vmovaps %xmm0, %xmm2 #define KERNEL2(xx) \ vfmaddps %xmm8,%xmm1,%xmm0,%xmm8 ;\ vmovaps %xmm2, %xmm0 ;\ vmovups -20 * SIZE(AO, %rax, 4),%xmm2 ;\ vfmaddps %xmm12,%xmm2, %xmm1, %xmm12 ;\ vmovups -8 * SIZE(BO, %rax, 8), %xmm1 ;\ vfmaddps %xmm9,%xmm3, %xmm0, %xmm9 ;\ vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\ vmovups -4 * SIZE(BO, %rax, 8), %xmm3 ;\ vfmaddps %xmm10,%xmm1, %xmm0, %xmm10 ;\ vfmaddps %xmm14,%xmm2, %xmm1, 
%xmm14 ;\ vfmaddps %xmm11,%xmm3, %xmm0, %xmm11 ;\ vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\ vmovups 4 * SIZE(BO, %rax, 8), %xmm3 ;\ vmovaps %xmm4, %xmm2 #define KERNEL3(xx) \ vfmaddps %xmm8,%xmm5,%xmm4,%xmm8 ;\ vmovups -12 * SIZE(AO, %rax, 4),%xmm2 ;\ vfmaddps %xmm12,%xmm2, %xmm5, %xmm12 ;\ vmovups 32 * SIZE(BO, %rax, 8), %xmm1 ;\ vmovups 8 * SIZE(BO, %rax, 8), %xmm5 ;\ vfmaddps %xmm9,%xmm3, %xmm4, %xmm9 ;\ vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\ vmovups 12 * SIZE(BO, %rax, 8), %xmm3 ;\ vfmaddps %xmm10,%xmm5, %xmm4, %xmm10 ;\ vfmaddps %xmm14,%xmm2, %xmm5, %xmm14 ;\ vfmaddps %xmm11,%xmm3, %xmm4, %xmm11 ;\ vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\ vmovups -8 * SIZE(AO, %rax, 4), %xmm4 ;\ vmovups 16 * SIZE(BO, %rax, 8), %xmm5 ;\ vmovups 20 * SIZE(BO, %rax, 8), %xmm3 ;\ vmovaps %xmm4, %xmm2 #define KERNEL4(xx) \ vfmaddps %xmm8,%xmm5, %xmm4, %xmm8 ;\ vmovups -4 * SIZE(AO, %rax, 4),%xmm2 ;\ vfmaddps %xmm12,%xmm2, %xmm5, %xmm12 ;\ vmovups 24 * SIZE(BO, %rax, 8), %xmm5 ;\ vfmaddps %xmm9,%xmm3, %xmm4, %xmm9 ;\ vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\ vmovups 28 * SIZE(BO, %rax, 8), %xmm3 ;\ vfmaddps %xmm10,%xmm5, %xmm4, %xmm10 ;\ vfmaddps %xmm14,%xmm2, %xmm5, %xmm14 ;\ vmovups 64 * SIZE(BO, %rax, 8), %xmm5 ;\ vfmaddps %xmm11,%xmm3, %xmm4, %xmm11 ;\ vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\ vmovups (AO, %rax, 4), %xmm6 ;\ vmovups 36 * SIZE(BO, %rax, 8), %xmm3 ;\ vmovaps %xmm6, %xmm2 #define KERNEL5(xx) \ vfmaddps %xmm8,%xmm1, %xmm6, %xmm8 ;\ vmovups 4 * SIZE(AO, %rax, 4),%xmm2 ;\ vfmaddps %xmm12,%xmm2, %xmm1, %xmm12 ;\ vmovups 40 * SIZE(BO, %rax, 8), %xmm1 ;\ vfmaddps %xmm9,%xmm3, %xmm6, %xmm9 ;\ vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\ vmovups 16 * SIZE(AO, %rax, 4), %xmm7 ;\ vmovups 44 * SIZE(BO, %rax, 8), %xmm3 ;\ vfmaddps %xmm10,%xmm1, %xmm6, %xmm10 ;\ vfmaddps %xmm14,%xmm2, %xmm1, %xmm14 ;\ vfmaddps %xmm11,%xmm3, %xmm6, %xmm11 ;\ vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\ vmovups 8 * SIZE(AO, %rax, 4), %xmm6 ;\ vmovups 48 * SIZE(BO, %rax, 8), %xmm1 ;\ vmovups 52 * SIZE(BO, %rax, 8), %xmm3 ;\ vmovaps %xmm6, %xmm2 #define KERNEL6(xx) \ vfmaddps %xmm8,%xmm1, %xmm6, %xmm8 ;\ vmovups 12 * SIZE(AO, %rax, 4),%xmm2 ;\ vfmaddps %xmm12,%xmm2, %xmm1, %xmm12 ;\ vmovups 56 * SIZE(BO, %rax, 8), %xmm1 ;\ vfmaddps %xmm9,%xmm3, %xmm6, %xmm9 ;\ vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\ vmovups 60 * SIZE(BO, %rax, 8), %xmm3 ;\ vfmaddps %xmm10,%xmm1, %xmm6, %xmm10 ;\ vfmaddps %xmm14,%xmm2, %xmm1, %xmm14 ;\ vfmaddps %xmm11,%xmm3, %xmm6, %xmm11 ;\ vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\ vmovups 32 * SIZE(AO, %rax, 4), %xmm0 ;\ vmovups 68 * SIZE(BO, %rax, 8), %xmm3 ;\ vmovaps %xmm7, %xmm2 #define KERNEL7(xx) \ vfmaddps %xmm8,%xmm5, %xmm7, %xmm8 ;\ vmovups 20 * SIZE(AO, %rax, 4),%xmm2 ;\ vfmaddps %xmm12,%xmm2, %xmm5, %xmm12 ;\ vmovups 96 * SIZE(BO, %rax, 8), %xmm1 ;\ vmovups 72 * SIZE(BO, %rax, 8), %xmm5 ;\ vfmaddps %xmm9,%xmm3, %xmm7, %xmm9 ;\ vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\ vmovups 76 * SIZE(BO, %rax, 8), %xmm3 ;\ vfmaddps %xmm10,%xmm5, %xmm7, %xmm10 ;\ vfmaddps %xmm14,%xmm2, %xmm5, %xmm14 ;\ vfmaddps %xmm11,%xmm3, %xmm7, %xmm11 ;\ vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\ vmovups 24 * SIZE(AO, %rax, 4), %xmm7 ;\ vmovups 80 * SIZE(BO, %rax, 8), %xmm5 ;\ vmovups 84 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm7, %xmm2 #define KERNEL8(xx) \ vfmaddps %xmm8,%xmm5, %xmm7, %xmm8 ;\ vmovups 28 * SIZE(AO, %rax, 4),%xmm2 ;\ vfmaddps %xmm12,%xmm2, %xmm5, %xmm12 ;\ vmovups 88 * SIZE(BO, %rax, 8), %xmm5 ;\ vfmaddps %xmm9, %xmm3, %xmm7, %xmm9 ;\ vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\ vmovups 92 * SIZE(BO, %rax, 8), %xmm3 ;\ vfmaddps 
%xmm10,%xmm5, %xmm7, %xmm10 ;\ vfmaddps %xmm14,%xmm2, %xmm5, %xmm14 ;\ vmovups 48 * SIZE(AO, %rax, 4), %xmm4 ;\ vmovups 128 * SIZE(BO, %rax, 8), %xmm5 ;\ vfmaddps %xmm11,%xmm3, %xmm7, %xmm11 ;\ vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\ vmovups 100 * SIZE(BO, %rax, 8), %xmm3 ;\ vmovaps %xmm0, %xmm2 ;\ addq $16 * SIZE, %rax #define KERNEL_SUB1(xx) \ mulps %xmm1, %xmm0 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm8 ;\ movaps %xmm2, %xmm0 ;\ addps %xmm1, %xmm12 ;\ movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm0, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm1, %xmm0 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm10 ;\ movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\ addps %xmm1, %xmm14 ;\ movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm0, %xmm2 #define KERNEL_SUB2(xx) \ mulps %xmm1, %xmm0 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm8 ;\ movaps %xmm2, %xmm0 ;\ addps %xmm1, %xmm12 ;\ movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm0, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm1, %xmm0 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm10 ;\ movaps (AO, %rax, 4), %xmm0 ;\ addps %xmm1, %xmm14 ;\ movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm4, %xmm2 #define KERNEL_SUB3(xx) \ mulps %xmm5, %xmm4 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm8 ;\ movaps %xmm2, %xmm4 ;\ addps %xmm5, %xmm12 ;\ movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm4, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm5, %xmm4 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm10 ;\ movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\ addps %xmm5, %xmm14 ;\ movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 20 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm4, %xmm2 #define KERNEL_SUB4(xx) \ mulps %xmm5, %xmm4 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm8 ;\ movaps %xmm2, %xmm4 ;\ addps %xmm5, %xmm12 ;\ movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm4, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm5, %xmm4 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm10 ;\ addps %xmm5, %xmm14 ;\ mulps %xmm3, %xmm2 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm0, %xmm2 #if defined(OS_LINUX) && defined(CORE_BULLDOZER) && !defined(TRMMKERNEL) .align 32768 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups 
%xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif movaps %xmm3, %xmm0 #else movq 72(%rsp), LDC #ifdef TRMMKERNEL movsd 80(%rsp), %xmm12 #endif #endif movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq OLD_M, M movq OLD_N, N shufps $0, %xmm0, %xmm0 movaps %xmm0, ALPHA #ifdef TRMMKERNEL movsd %xmm12, OFFSET movsd %xmm12, KK #ifndef LEFT negq KK #endif #endif subq $-32 * SIZE, A leaq (, LDC, SIZE), LDC movq N, J sarq $2, J # j = (n >> 2) jle .L50 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $2, %rax jle .L03 ALIGN_4 .L02: prefetcht0 192(B) prefetcht0 256(B) prefetcht0 192(BO) prefetcht0 256(BO) movaps 0 * SIZE(B), %xmm3 movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 movaps 8 * SIZE(B), %xmm11 movaps 12 * SIZE(B), %xmm15 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) pshufd $0x00, %xmm11, %xmm0 pshufd $0x55, %xmm11, %xmm1 pshufd $0xaa, %xmm11, %xmm2 pshufd $0xff, %xmm11, %xmm3 pshufd $0x00, %xmm15, %xmm4 pshufd $0x55, %xmm15, %xmm5 pshufd $0xaa, %xmm15, %xmm6 pshufd $0xff, %xmm15, %xmm7 movaps %xmm0, 32 * SIZE(BO) movaps %xmm1, 36 * SIZE(BO) movaps %xmm2, 40 * SIZE(BO) movaps %xmm3, 44 * SIZE(BO) movaps %xmm4, 48 * SIZE(BO) movaps %xmm5, 52 * SIZE(BO) movaps %xmm6, 56 * SIZE(BO) movaps %xmm7, 60 * SIZE(BO) addq $16 * SIZE, B addq $64 * SIZE, BO decq %rax jne .L02 ALIGN_4 .L03: movq K, %rax andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movaps 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) addq $ 4 * SIZE, B addq $16 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L10: movq C, CO1 leaq (C, LDC, 1), CO2 movq A, AO movq M, I sarq $3, I # i = (m >> 3) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 movaps -28 * SIZE(BO), %xmm3 xorps %xmm9, %xmm9 movaps -16 * SIZE(AO), %xmm4 xorps %xmm10, %xmm10 movaps 0 * SIZE(BO), %xmm5 xorps %xmm11, %xmm11 xorps %xmm12, %xmm12 xorps %xmm13, %xmm13 xorps %xmm14, %xmm14 xorps %xmm15, %xmm15 movaps %xmm0, %xmm2 prefetcht0 (CO1) prefetcht0 (CO1,LDC, 2) prefetcht0 (CO2) prefetcht0 (CO2,LDC, 2) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $4, %rax #endif movq %rax, KKK #endif andq $-8, %rax leaq (, 
%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO negq %rax NOBRANCH je .L15 ALIGN_3 .L12: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) BRANCH jl .L12 ALIGN_4 .L15: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif testq $4, %rax je .L16 xorq %rax, %rax ALIGN_3 KERNEL_SUB1(32 * 0) KERNEL_SUB2(32 * 0) KERNEL_SUB3(32 * 0) KERNEL_SUB4(32 * 0) addq $32 * SIZE, AO addq $64 * SIZE, BO ALIGN_3 .L16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L18 leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO negq %rax ALIGN_4 .L17: mulps %xmm1, %xmm0 mulps -28 * SIZE(AO, %rax, 4), %xmm1 addps %xmm0, %xmm8 movaps %xmm2, %xmm0 addps %xmm1, %xmm12 movaps -24 * SIZE(BO, %rax, 8), %xmm1 mulps %xmm3, %xmm2 mulps -28 * SIZE(AO, %rax, 4), %xmm3 addps %xmm2, %xmm9 movaps %xmm0, %xmm2 addps %xmm3, %xmm13 movaps -20 * SIZE(BO, %rax, 8), %xmm3 mulps %xmm1, %xmm0 mulps -28 * SIZE(AO, %rax, 4), %xmm1 addps %xmm0, %xmm10 movaps -24 * SIZE(AO, %rax, 4), %xmm0 addps %xmm1, %xmm14 movaps -16 * SIZE(BO, %rax, 8), %xmm1 mulps %xmm3, %xmm2 mulps -28 * SIZE(AO, %rax, 4), %xmm3 addps %xmm2, %xmm11 addps %xmm3, %xmm15 movaps -12 * SIZE(BO, %rax, 8), %xmm3 movaps %xmm0, %xmm2 addq $SIZE * 2, %rax jl .L17 ALIGN_4 .L18: #ifndef TRMMKERNEL vfmaddps 0 * SIZE(CO1),%xmm7, %xmm8, %xmm8 vfmaddps 4 * SIZE(CO1),%xmm7, %xmm12, %xmm12 vfmaddps 0 * SIZE(CO2),%xmm7, %xmm9, %xmm9 vfmaddps 4 * SIZE(CO2),%xmm7, %xmm13, %xmm13 vfmaddps 0 * SIZE(CO1, LDC, 2),%xmm7, %xmm10, %xmm10 vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm7, %xmm14, %xmm14 vfmaddps 0 * SIZE(CO2, LDC, 2),%xmm7, %xmm11, %xmm11 vfmaddps 4 * SIZE(CO2, LDC, 2),%xmm7, %xmm15, %xmm15 #else vmulps %xmm7, %xmm8, %xmm8 vmulps %xmm7, %xmm9, %xmm9 vmulps %xmm7, %xmm10, %xmm10 vmulps %xmm7, %xmm11, %xmm11 vmulps %xmm7, %xmm12,%xmm12 vmulps %xmm7, %xmm13,%xmm13 vmulps %xmm7, %xmm14,%xmm14 vmulps %xmm7, %xmm15,%xmm15 #endif vmovups %xmm8, 0 * SIZE(CO1) vmovups %xmm12, 4 * SIZE(CO1) vmovups %xmm9, 0 * SIZE(CO2) vmovups %xmm13, 4 * SIZE(CO2) vmovups %xmm10, 0 * SIZE(CO1, LDC, 2) vmovups %xmm14, 4 * SIZE(CO1, LDC, 2) vmovups %xmm11, 0 * SIZE(CO2, LDC, 2) vmovups %xmm15, 4 * SIZE(CO2, LDC, 2) prefetcht0 64(CO1) prefetcht0 64(CO1,LDC, 2) prefetcht0 64(CO2) prefetcht0 64(CO2,LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, 
%rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 ALIGN_4 .L20: testq $4, M je .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps -28 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps -24 * SIZE(AO), %xmm8 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 mulps 44 * SIZE(BO), %xmm8 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm8, %xmm3 movaps -20 * SIZE(AO), %xmm8 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 mulps 60 * SIZE(BO), %xmm8 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movaps 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movaps 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 mulps 76 * SIZE(BO), %xmm10 addps %xmm9, %xmm2 movaps 128 * SIZE(BO), %xmm9 addps %xmm10, %xmm3 movaps -12 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movaps 84 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movaps 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 mulps 92 * SIZE(BO), %xmm10 addps %xmm11, %xmm2 movaps 144 * SIZE(BO), %xmm11 addps %xmm10, %xmm3 movaps -8 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movaps 104 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 mulps 108 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 160 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps -4 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movaps 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 mulps 124 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 176 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 16 * SIZE(AO), %xmm10 addq $ 32 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 
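/*
 * .L26 below: k % 8 remainder for the 4-row x 4-column tile. Each pass reads
 * four A values (one xmm) and the four B values of this k step (pre-broadcast
 * across the lanes of BO by the copy loop at .L02/.L04), accumulating one
 * column per register in xmm0..xmm3; .L28 then scales by ALPHA and, in the
 * non-TRMMKERNEL case, adds the existing C values. A sketch of the update:
 *
 *   for (l = 0; l < k_left; l++)       // one k step per pass
 *       for (j = 0; j < 4; j++)        // xmm0..xmm3, one per column of C
 *           for (i = 0; i < 4; i++)
 *               acc[j][i] += a[4*l + i] * b[4*l + j];
 *   // .L28:  C[j][i] = alpha * acc[j][i] (+ C[j][i] unless TRMMKERNEL)
 */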
.L26: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 16 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps -28 * SIZE(AO), %xmm8 addq $ 4 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L26 ALIGN_4 .L28: mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 mulps %xmm15, %xmm2 mulps %xmm15, %xmm3 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 0 * SIZE(CO2), %xmm10 movhps 2 * SIZE(CO2), %xmm10 movsd 0 * SIZE(CO1, LDC, 2), %xmm12 movhps 2 * SIZE(CO1, LDC, 2), %xmm12 movsd 0 * SIZE(CO2, LDC, 2), %xmm14 movhps 2 * SIZE(CO2, LDC, 2), %xmm14 addps %xmm8, %xmm0 addps %xmm10, %xmm1 addps %xmm12, %xmm2 addps %xmm14, %xmm3 #endif vmovups %xmm0, 0 * SIZE(CO1) vmovups %xmm1, 0 * SIZE(CO2) vmovups %xmm2, 0 * SIZE(CO1, LDC, 2) vmovups %xmm3, 0 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L30: testq $2, M je .L40 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L35 ALIGN_4 .L32: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movsd 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsd 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movsd 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movsd 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movsd 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd -28 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movsd 80 * SIZE(BO), %xmm11 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movsd 36 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movsd 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm2 movsd 44 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 movsd -26 * SIZE(AO), %xmm8 addps %xmm13, %xmm3 movsd 96 * SIZE(BO), %xmm13 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movsd 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movsd 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm2 movsd 60 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 movsd -16 * SIZE(AO), %xmm8 addps %xmm15, %xmm3 movsd 112 * SIZE(BO), %xmm15 mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movsd 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movsd 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm2 movsd 76 * SIZE(BO), %xmm9 
mulps %xmm10, %xmm9 movsd -22 * SIZE(AO), %xmm10 addps %xmm9, %xmm3 movsd 128 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movsd 84 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movsd 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm2 movsd 92 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd -20 * SIZE(AO), %xmm10 addps %xmm11, %xmm3 movsd 144 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movsd 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movsd 104 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movsd 108 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd -18 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movsd 160 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movsd 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movsd 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movsd 124 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd -8 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movsd 176 * SIZE(BO), %xmm15 addq $ 16 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movsd 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsd 16 * SIZE(BO), %xmm9 addq $ 2 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L36 ALIGN_4 .L38: mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 mulps %xmm15, %xmm2 mulps %xmm15, %xmm3 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm8 movsd 0 * SIZE(CO2), %xmm10 movsd 0 * SIZE(CO1, LDC, 2), %xmm12 movsd 0 * SIZE(CO2, LDC, 2), %xmm14 addps %xmm8, %xmm0 addps %xmm10, %xmm1 addps %xmm12, %xmm2 addps %xmm14, %xmm3 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 ALIGN_4 .L40: testq $1, M je .L49 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO leaq (BO, %rax, 8), BO #endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 movss 32 * SIZE(BO), %xmm13 movss 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L45 ALIGN_4 .L42: mulss %xmm8, %xmm9 addss %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 addss %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 addss %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addss %xmm9, %xmm3 movss 64 * 
SIZE(BO), %xmm9 mulss %xmm8, %xmm11 addss %xmm11, %xmm0 movss 20 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 addss %xmm11, %xmm1 movss 24 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 addss %xmm11, %xmm2 movss 28 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 movss -30 * SIZE(AO), %xmm8 addss %xmm11, %xmm3 movss 80 * SIZE(BO), %xmm11 mulss %xmm8, %xmm13 addss %xmm13, %xmm0 movss 36 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 addss %xmm13, %xmm1 movss 40 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 addss %xmm13, %xmm2 movss 44 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 movss -29 * SIZE(AO), %xmm8 addss %xmm13, %xmm3 movss 96 * SIZE(BO), %xmm13 mulss %xmm8, %xmm15 addss %xmm15, %xmm0 movss 52 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 addss %xmm15, %xmm1 movss 56 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 addss %xmm15, %xmm2 movss 60 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 movss -24 * SIZE(AO), %xmm8 addss %xmm15, %xmm3 movss 112 * SIZE(BO), %xmm15 mulss %xmm10, %xmm9 addss %xmm9, %xmm0 movss 68 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 addss %xmm9, %xmm1 movss 72 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 addss %xmm9, %xmm2 movss 76 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 movss -27 * SIZE(AO), %xmm10 addss %xmm9, %xmm3 movss 128 * SIZE(BO), %xmm9 mulss %xmm10, %xmm11 addss %xmm11, %xmm0 movss 84 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 addss %xmm11, %xmm1 movss 88 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 addss %xmm11, %xmm2 movss 92 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 movss -26 * SIZE(AO), %xmm10 addss %xmm11, %xmm3 movss 144 * SIZE(BO), %xmm11 mulss %xmm10, %xmm13 addss %xmm13, %xmm0 movss 100 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 addss %xmm13, %xmm1 movss 104 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 addss %xmm13, %xmm2 movss 108 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 movss -25 * SIZE(AO), %xmm10 addss %xmm13, %xmm3 movss 160 * SIZE(BO), %xmm13 mulss %xmm10, %xmm15 addss %xmm15, %xmm0 movss 116 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 addss %xmm15, %xmm1 movss 120 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 addss %xmm15, %xmm2 movss 124 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 movss -20 * SIZE(AO), %xmm10 addss %xmm15, %xmm3 movss 176 * SIZE(BO), %xmm15 addq $ 8 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L48 ALIGN_4 .L46: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movss 16 * SIZE(BO), %xmm9 addq $ 1 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L46 ALIGN_4 .L48: mulss %xmm15, %xmm0 mulss %xmm15, %xmm1 mulss %xmm15, %xmm2 mulss %xmm15, %xmm3 #ifndef TRMMKERNEL movss 0 * SIZE(CO1), %xmm8 movss 0 * SIZE(CO2), %xmm10 movss 0 * SIZE(CO1, LDC, 2), %xmm12 movss 0 * SIZE(CO2, LDC, 2), %xmm14 addss %xmm8, %xmm0 addss %xmm10, %xmm1 addss %xmm12, %xmm2 addss %xmm14, %xmm3 #endif movss %xmm0, 0 * SIZE(CO1) movss %xmm1, 0 * SIZE(CO2) movss %xmm2, 0 * SIZE(CO1, LDC, 2) movss %xmm3, 0 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK 
#endif leaq (C, LDC, 4), C # c += 4 * ldc decq J # j -- jg .L01 .L50: testq $2, N je .L100 .L51: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $2, %rax jle .L53 ALIGN_4 .L52: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L53: movq K, %rax andq $3, %rax BRANCH jle .L60 ALIGN_4 .L54: movsd 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) addq $ 2 * SIZE, B addq $ 8 * SIZE, BO decq %rax jne .L54 ALIGN_4 .L60: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a movq M, I sarq $3, I # i = (m >> 3) jle .L70 ALIGN_4 .L61: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(AO), %xmm12 movaps 16 * SIZE(AO), %xmm14 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 prefetchw 4 * SIZE(CO1) xorps %xmm4, %xmm4 prefetchw 4 * SIZE(CO2) xorps %xmm5, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -28 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps -24 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -20 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 32 * SIZE(AO), %xmm8 mulps %xmm10, %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 16 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps -12 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps -8 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps -4 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 80 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps 48 * SIZE(AO), %xmm10 mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 32 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 4 * 
SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm5 movaps 8 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 12 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 movaps 96 * SIZE(BO), %xmm13 addps %xmm12, %xmm5 movaps 64 * SIZE(AO), %xmm12 mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 48 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 20 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 24 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 28 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 movaps 112 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 80 * SIZE(AO), %xmm14 addq $64 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -28 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps -24 * SIZE(AO), %xmm8 addq $8 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L68: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 4 * SIZE(CO1), %xmm9 movhps 6 * SIZE(CO1), %xmm9 movsd 0 * SIZE(CO2), %xmm10 movhps 2 * SIZE(CO2), %xmm10 movsd 4 * SIZE(CO2), %xmm11 movhps 6 * SIZE(CO2), %xmm11 #endif mulps %xmm15, %xmm0 mulps %xmm15, %xmm4 mulps %xmm15, %xmm1 mulps %xmm15, %xmm5 #ifndef TRMMKERNEL addps %xmm8, %xmm0 addps %xmm9, %xmm4 addps %xmm10, %xmm1 addps %xmm11, %xmm5 #endif vmovups %xmm0, 0 * SIZE(CO1) vmovups %xmm4, 4 * SIZE(CO1) vmovups %xmm1, 0 * SIZE(CO2) vmovups %xmm5, 4 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L61 ALIGN_4 .L70: testq $4, M je .L80 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -28 * SIZE(AO), 
%xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps -24 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 20 * SIZE(BO), %xmm8 addps %xmm11, %xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm8, %xmm1 movaps -20 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 mulps %xmm10, %xmm13 mulps 36 * SIZE(BO), %xmm10 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm10, %xmm1 movaps -12 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 mulps 44 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps -8 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 52 * SIZE(BO), %xmm10 addps %xmm15, %xmm0 movaps 56 * SIZE(BO), %xmm15 addps %xmm10, %xmm1 movaps -4 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 60 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 16 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -28 * SIZE(AO), %xmm8 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L76 ALIGN_4 .L78: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 0 * SIZE(CO2), %xmm10 movhps 2 * SIZE(CO2), %xmm10 #endif addps %xmm2, %xmm0 addps %xmm3, %xmm1 mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 #ifndef TRMMKERNEL addps %xmm8, %xmm0 addps %xmm10, %xmm1 #endif vmovups %xmm0, 0 * SIZE(CO1) vmovups %xmm1, 0 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L80: testq $2, M je .L90 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L85 ALIGN_4 .L82: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movsd 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -28 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsd 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movsd 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd -26 * SIZE(AO), %xmm8 addps %xmm11, %xmm1 movsd 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movsd 28 * 
SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd -16 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movsd 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movsd 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd -22 * SIZE(AO), %xmm10 addps %xmm13, %xmm1 movsd 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movsd 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd -20 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movsd 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movsd 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd -18 * SIZE(AO), %xmm10 addps %xmm15, %xmm1 movsd 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movsd 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd -8 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movsd 112 * SIZE(BO), %xmm15 addq $16 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L82 ALIGN_4 .L85: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L88 ALIGN_4 .L86: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L86 ALIGN_4 .L88: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm8 movsd 0 * SIZE(CO2), %xmm10 #endif addps %xmm2, %xmm0 addps %xmm3, %xmm1 mulps %xmm15, %xmm0 mulps %xmm15, %xmm1 #ifndef TRMMKERNEL addps %xmm8, %xmm0 addps %xmm10, %xmm1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 ALIGN_4 .L90: testq $1, M je .L99 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 movss 32 * SIZE(BO), %xmm13 movss 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L95 ALIGN_4 .L92: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movss 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movss 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movss -29 * SIZE(AO), %xmm8 addps %xmm11, %xmm1 movss 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movss 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movss -24 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movss 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movss 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movss -27 * SIZE(AO), %xmm10 addps %xmm13, %xmm1 movss 40 * SIZE(BO), %xmm13 mulps %xmm10, 
%xmm13 addps %xmm13, %xmm2 movss 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movss -26 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movss 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movss 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movss -25 * SIZE(AO), %xmm10 addps %xmm15, %xmm1 movss 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movss 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movss -20 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movss 112 * SIZE(BO), %xmm15 addq $ 8 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L92 ALIGN_4 .L95: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L96 ALIGN_4 .L98: #ifndef TRMMKERNEL movss 0 * SIZE(CO1), %xmm8 movss 0 * SIZE(CO2), %xmm10 #endif addss %xmm2, %xmm0 addss %xmm3, %xmm1 mulss %xmm15, %xmm0 mulss %xmm15, %xmm1 #ifndef TRMMKERNEL addss %xmm8, %xmm0 addss %xmm10, %xmm1 #endif movss %xmm0, 0 * SIZE(CO1) movss %xmm1, 0 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leaq (C, LDC, 2), C # c += 4 * ldc ALIGN_4 .L100: testq $1, N je .L999 .L101: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $3, %rax jle .L103 ALIGN_4 .L102: movups 0 * SIZE(B), %xmm3 movups 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L102 ALIGN_4 .L103: movq K, %rax andq $7, %rax BRANCH jle .L110 ALIGN_4 .L104: movss 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 movaps %xmm0, 0 * SIZE(BO) addq $ 1 * SIZE, B addq $ 4 * SIZE, BO decq %rax jne .L104 ALIGN_4 .L110: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a movq M, I sarq $3, I # i = (m >> 3) jle .L120 ALIGN_4 .L111: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(AO), %xmm12 movaps 16 * SIZE(AO), %xmm14 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 prefetchw 4 * SIZE(CO1) xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax 
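# descriptive note: LEFT TRMM — the K extent (KKK) for this block is KK plus the 8 rows of the M tile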
#else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L115 ALIGN_4 .L112: mulps %xmm9, %xmm8 mulps -28 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps -24 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 mulps %xmm9, %xmm8 mulps -20 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps 32 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 mulps %xmm9, %xmm10 mulps -12 * SIZE(AO), %xmm9 addps %xmm10, %xmm0 movaps -8 * SIZE(AO), %xmm10 addps %xmm9, %xmm4 movaps 12 * SIZE(BO), %xmm9 mulps %xmm9, %xmm10 mulps -4 * SIZE(AO), %xmm9 addps %xmm10, %xmm0 movaps 48 * SIZE(AO), %xmm10 addps %xmm9, %xmm4 movaps 32 * SIZE(BO), %xmm9 mulps %xmm11, %xmm12 mulps 4 * SIZE(AO), %xmm11 addps %xmm12, %xmm0 movaps 8 * SIZE(AO), %xmm12 addps %xmm11, %xmm4 movaps 20 * SIZE(BO), %xmm11 mulps %xmm11, %xmm12 mulps 12 * SIZE(AO), %xmm11 addps %xmm12, %xmm0 movaps 64 * SIZE(AO), %xmm12 addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 mulps %xmm11, %xmm14 mulps 20 * SIZE(AO), %xmm11 addps %xmm14, %xmm0 movaps 24 * SIZE(AO), %xmm14 addps %xmm11, %xmm4 movaps 28 * SIZE(BO), %xmm11 mulps %xmm11, %xmm14 mulps 28 * SIZE(AO), %xmm11 addps %xmm14, %xmm0 movaps 80 * SIZE(AO), %xmm14 addps %xmm11, %xmm4 movaps 48 * SIZE(BO), %xmm11 addq $64 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L112 ALIGN_4 .L115: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulps %xmm9, %xmm8 mulps -28 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps -24 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 addq $8 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L116 ALIGN_4 .L118: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 4 * SIZE(CO1), %xmm9 movhps 6 * SIZE(CO1), %xmm9 #endif mulps %xmm15, %xmm0 mulps %xmm15, %xmm4 #ifndef TRMMKERNEL addps %xmm8, %xmm0 addps %xmm9, %xmm4 #endif vmovups %xmm0, 0 * SIZE(CO1) vmovups %xmm4, 4 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L111 ALIGN_4 .L120: testq $4, M je .L130 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L125 ALIGN_4 .L122: mulps %xmm8, %xmm9 movaps -28 * SIZE(AO), %xmm8 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 32 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -24 * SIZE(AO), %xmm8 mulps 8 * SIZE(BO), %xmm8 addps %xmm8, %xmm2 movaps -20 * SIZE(AO), %xmm8 mulps 12 * SIZE(BO), %xmm8 addps %xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 mulps %xmm10, %xmm11 movaps -12 * SIZE(AO), %xmm10 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, 
%xmm0 movaps 48 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps -8 * SIZE(AO), %xmm10 mulps 24 * SIZE(BO), %xmm10 addps %xmm10, %xmm2 movaps -4 * SIZE(AO), %xmm10 mulps 28 * SIZE(BO), %xmm10 addps %xmm10, %xmm3 movaps 16 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L122 ALIGN_4 .L125: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L128 ALIGN_4 .L126: mulps %xmm8, %xmm9 movaps -28 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L126 ALIGN_4 .L128: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 #endif addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm2, %xmm0 mulps %xmm15, %xmm0 #ifndef TRMMKERNEL addps %xmm8, %xmm0 #endif vmovups %xmm0, 0 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L130: testq $2, M je .L140 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L135 ALIGN_4 .L132: mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -28 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -26 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -16 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 32 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 movsd -22 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movsd 20 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd -20 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 24 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd -18 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movsd 28 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd -8 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 48 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L132 ALIGN_4 .L135: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L138 ALIGN_4 .L136: mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L136 ALIGN_4 .L138: addps %xmm1, %xmm0 mulps %xmm15, %xmm0 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm8 addps %xmm8, %xmm0 #endif movsd %xmm0, 0 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #if 
defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 4 ALIGN_4 .L140: testq $1, M je .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L145 ALIGN_4 .L142: mulss %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 mulss 4 * SIZE(BO), %xmm8 addss %xmm9, %xmm0 movss 32 * SIZE(BO), %xmm9 addss %xmm8, %xmm1 movss -30 * SIZE(AO), %xmm8 mulss 8 * SIZE(BO), %xmm8 addss %xmm8, %xmm2 movss -29 * SIZE(AO), %xmm8 mulss 12 * SIZE(BO), %xmm8 addss %xmm8, %xmm3 movss -24 * SIZE(AO), %xmm8 mulss %xmm10, %xmm11 movss -27 * SIZE(AO), %xmm10 mulss 20 * SIZE(BO), %xmm10 addss %xmm11, %xmm0 movss 48 * SIZE(BO), %xmm11 addss %xmm10, %xmm1 movss -26 * SIZE(AO), %xmm10 mulss 24 * SIZE(BO), %xmm10 addss %xmm10, %xmm2 movss -25 * SIZE(AO), %xmm10 mulss 28 * SIZE(BO), %xmm10 addss %xmm10, %xmm3 movss -20 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L142 ALIGN_4 .L145: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movss ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L148 ALIGN_4 .L146: mulss %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addss %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 addq $1 * SIZE, AO addq $4 * SIZE, BO decq %rax jg .L146 ALIGN_4 .L148: addss %xmm1, %xmm0 addss %xmm3, %xmm2 addss %xmm2, %xmm0 mulss %xmm15, %xmm0 #ifndef TRMMKERNEL movss 0 * SIZE(CO1), %xmm8 addss %xmm8, %xmm0 #endif movss %xmm0, 0 * SIZE(CO1) ALIGN_4 .L999: movq %rbx, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/sgemm_kernel_8x8_sandy.S000066400000000000000000002471471313527062700221020ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #define ASSEMBLER #include "common.h" #define old_bm %rdi #define old_bn %rsi #define old_bk %rdx #define bm %r13 #define bn %r14 #define bk %r15 #define ALPHA %xmm0 #define ba %rcx #define bb %r8 #define C %r9 #define ldc %r10 #define i %r11 #define k %rax #define ptrba %rdi #define ptrbb %rsi #define C0 %rbx #define C1 %rbp #define prebb %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define old_ldc 8+STACKSIZE(%rsp) #define old_offset 16+STACKSIZE(%rsp) #define MEMALPHA 48(%rsp) #define j 56(%rsp) #define OFFSET 64(%rsp) #define kk 72(%rsp) #define kkk 80(%rsp) #else #define STACKSIZE 512 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define old_ldc 64 + STACKSIZE(%rsp) #define old_offset 72 + STACKSIZE(%rsp) #define MEMALPHA 224(%rsp) #define j 232(%rsp) #define OFFSET 240(%rsp) #define kk 248(%rsp) #define kkk 256(%rsp) #endif #define PREFETCH0 prefetcht0 #define PREFETCH1 prefetcht0 #define PREFETCH2 prefetcht2 #define PRESIZE 80 #define xvec0 %xmm0 #define xvec1 %xmm1 #define xvec2 %xmm2 #define xvec3 %xmm3 #define xvec4 %xmm4 #define xvec5 %xmm5 #define xvec6 %xmm6 #define xvec7 %xmm7 #define xvec8 %xmm8 #define xvec9 %xmm9 #define xvec10 %xmm10 #define xvec11 %xmm11 #define xvec12 %xmm12 #define xvec13 %xmm13 #define xvec14 %xmm14 #define xvec15 %xmm15 #define yvec0 %ymm0 #define yvec1 %ymm1 #define yvec2 %ymm2 #define yvec3 %ymm3 #define yvec4 %ymm4 #define yvec5 %ymm5 #define yvec6 %ymm6 #define yvec7 %ymm7 #define yvec8 %ymm8 #define yvec9 %ymm9 #define yvec10 %ymm10 #define yvec11 %ymm11 #define yvec12 %ymm12 #define yvec13 %ymm13 #define yvec14 %ymm14 #define yvec15 %ymm15 #define LEAQ leaq #define ADDQ addq #define MULQ imulq #define SARQ sarq #define SALQ salq #define ANDQ andq #define SUBQ subq #define DECQ decq #define JG jg #define JLE jle #define TEST testq #define OR orq #define JNE jne #define JMP jmp #define NOP #define XOR xorpd #undef MOVQ #define MOVQ movq #define XOR_SY vxorps #define XOR_SX vxorps #define LD_SY vmovaps #define LD_SX vmovaps #define LDL_SX vmovlps #define LDL_SY vmovlps #define LDH_SX vmovhps #define LDH_SY vmovhps #define ST_SY vmovaps #define ST_SX vmovaps #define STL_SX vmovlps #define STL_SY vmovlps #define STH_SX vmovhps #define STH_SY vmovhps #define EDUP_SY vmovsldup #define ODUP_SY vmovshdup #define EDUP_SX vmovsldup #define ODUP_SX vmovshdup #define ADD_SY vaddps #define ADD_SX vaddps #define ADD1_DY vaddpd #define ADDSUB_SY vaddsubps #define MUL_SY vmulps #define MUL_SX vmulps #define SHUF_SY vperm2f128 #define SHUF_DY vperm2f128 #define SHUF_SX vpshufd #define VPERMILP_SY vpermilps #define VPERMILP_SX vpermilps #define BROAD_SY vbroadcastss #define 
BROAD_SX vbroadcastss #define MOV_SY vmovaps #define MOV_SX vmovaps #define REVS_SY vshufps #define REVS_SX vshufps #define EXTRA_SY vextractf128 PROLOGUE subq $STACKSIZE, %rsp; movq %rbx, 0(%rsp); movq %rbp, 8(%rsp); movq %r12, 16(%rsp); movq %r13, 24(%rsp); movq %r14, 32(%rsp); movq %r15, 40(%rsp); #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, old_bm movq ARG2, old_bn movq ARG3, old_bk movq OLD_A, ba movq OLD_B, bb movq OLD_C, C movq old_ldc, ldc #ifdef TRMMKERNEL movq old_offset, %r11 #endif movaps %xmm3, %xmm0 #else movq old_ldc, ldc #ifdef TRMMKERNEL movq old_offset, %r11 #endif #endif vzeroupper vmovlps ALPHA, MEMALPHA movq old_bm, bm movq old_bn, bn movq old_bk, bk leaq (, ldc, SIZE), ldc #ifdef TRMMKERNEL movq %r11, OFFSET #ifndef LEFT negq %r11; #endif movq %r11, kk #endif MOVQ bn,j; SARQ $3,j; JLE .L0_loopE; ALIGN_4; .L0_bodyB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; MOVQ %rax, kk; #endif MOVQ C,C0; LEAQ (C,ldc,4),C1; MOVQ bk, k; SALQ $5, k; LEAQ (bb, k, 1), prebb; MOVQ ba,ptrba; MOVQ bm,i; SARQ $3,i; JLE .L1_loopE; ALIGN_4; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 8), ptrbb; #endif #### Initial Results Register #### XOR_SY yvec15, yvec15, yvec15; PREFETCH0 0*SIZE(prebb); XOR_SY yvec14, yvec14, yvec14; PREFETCH0 16*SIZE(prebb); XOR_SY yvec13, yvec13, yvec13; PREFETCH0 32*SIZE(prebb); XOR_SY yvec12, yvec12, yvec12; ADDQ $48*SIZE, prebb; EDUP_SY 0*SIZE(ptrbb), yvec2; LEAQ (ldc, ldc, 2), %rax; PREFETCH2 7*SIZE(C0); PREFETCH2 7*SIZE(C1); XOR_SY yvec11, yvec11, yvec11; XOR_SY yvec10, yvec10, yvec10; PREFETCH2 7*SIZE(C0, ldc, 1); PREFETCH2 7*SIZE(C1, ldc, 1); LD_SY 0*SIZE(ptrba), yvec0; XOR_SY yvec9, yvec9, yvec9; PREFETCH2 7*SIZE(C0, ldc, 2); PREFETCH2 7*SIZE(C1, ldc, 2); XOR_SY yvec8, yvec8, yvec8; VPERMILP_SY $0x4e, yvec2, yvec3; PREFETCH2 7*SIZE(C0, %rax, 1); PREFETCH2 7*SIZE(C1, %rax, 1); #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; #else ADDQ $8, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2,k; JLE .L2_loopE; ALIGN_4; .L2_bodyB:; # Computing kernel #### Unroll times 1 #### PREFETCH0 PRESIZE*SIZE(ptrba); MUL_SY yvec0, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; ODUP_SY 0*SIZE(ptrbb), yvec2 MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5 ADD_SY yvec15, yvec6, yvec15 ADD_SY yvec13, yvec7, yvec13; LD_SY 8*SIZE(ptrba), yvec1; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; ADD_SY yvec11, yvec6, yvec11; ADD_SY yvec9, yvec7, yvec9; MUL_SY yvec0, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; EDUP_SY 8*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; ADD_SY yvec14, yvec6, yvec14; ADD_SY yvec12, yvec7, yvec12; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; ADD_SY yvec10, yvec6, yvec10; ADD_SY yvec8, yvec7, yvec8; #### Unroll times 2 #### MUL_SY yvec1, yvec2, yvec6; SHUF_SY $0x03, 
yvec2, yvec2, yvec4; ODUP_SY 8*SIZE(ptrbb), yvec2 MUL_SY yvec1, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5 ADD_SY yvec15, yvec6, yvec15 ADD_SY yvec13, yvec7, yvec13; LD_SY 16*SIZE(ptrba), yvec0; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; ADD_SY yvec11, yvec6, yvec11; ADD_SY yvec9, yvec7, yvec9; MUL_SY yvec1, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; EDUP_SY 16*SIZE(ptrbb), yvec2; MUL_SY yvec1, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; ADD_SY yvec14, yvec6, yvec14; ADD_SY yvec12, yvec7, yvec12; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; ADD_SY yvec10, yvec6, yvec10; ADD_SY yvec8, yvec7, yvec8; #### Unroll times 3 #### PREFETCH0 (PRESIZE+16)*SIZE(ptrba); MUL_SY yvec0, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; ODUP_SY 16*SIZE(ptrbb), yvec2 MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5 ADD_SY yvec15, yvec6, yvec15 ADD_SY yvec13, yvec7, yvec13; LD_SY 24*SIZE(ptrba), yvec1; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; ADD_SY yvec11, yvec6, yvec11; ADD_SY yvec9, yvec7, yvec9; ADDQ $32*SIZE, ptrba; MUL_SY yvec0, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; EDUP_SY 24*SIZE(ptrbb), yvec2; ADD_SY yvec14, yvec6, yvec14; ADD_SY yvec12, yvec7, yvec12; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; ADD_SY yvec10, yvec6, yvec10; ADD_SY yvec8, yvec7, yvec8; #### Unroll times 4 #### MUL_SY yvec1, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; ODUP_SY 24*SIZE(ptrbb), yvec2 MUL_SY yvec1, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5 ADDQ $32*SIZE, ptrbb; ADD_SY yvec15, yvec6, yvec15 ADD_SY yvec13, yvec7, yvec13; LD_SY 0*SIZE(ptrba), yvec0; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; ADD_SY yvec11, yvec6, yvec11; ADD_SY yvec9, yvec7, yvec9; MUL_SY yvec1, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; EDUP_SY 0*SIZE(ptrbb), yvec2; MUL_SY yvec1, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; ADD_SY yvec14, yvec6, yvec14; ADD_SY yvec12, yvec7, yvec12; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; ADD_SY yvec10, yvec6, yvec10; ADD_SY yvec8, yvec7, yvec8; .L2_bodyE:; DECQ k; JG .L2_bodyB; ALIGN_4 .L2_loopE:; #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L3_loopE; ALIGN_4 .L3_loobB: #### Unroll times 1 #### MUL_SY yvec0, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; ODUP_SY 0*SIZE(ptrbb), yvec2 MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5 ADD_SY yvec15, yvec6, yvec15 ADD_SY yvec13, yvec7, yvec13; LD_SY 8*SIZE(ptrba), yvec1; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; ADDQ $16*SIZE, ptrba; ADD_SY yvec11, yvec6, yvec11; ADD_SY yvec9, yvec7, yvec9; MUL_SY yvec0, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; EDUP_SY 8*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; ADD_SY yvec14, yvec6, yvec14; ADD_SY yvec12, yvec7, yvec12; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; ADD_SY yvec10, yvec6, yvec10; ADD_SY yvec8, yvec7, yvec8; #### Unroll times 2 #### MUL_SY yvec1, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; ODUP_SY 8*SIZE(ptrbb), yvec2 MUL_SY yvec1, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5 ADDQ $16*SIZE, ptrbb ADD_SY yvec15, yvec6, yvec15 ADD_SY yvec13, 
yvec7, yvec13; LD_SY 0*SIZE(ptrba), yvec0; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; ADD_SY yvec11, yvec6, yvec11; ADD_SY yvec9, yvec7, yvec9; MUL_SY yvec1, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; EDUP_SY 0*SIZE(ptrbb), yvec2; MUL_SY yvec1, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; ADD_SY yvec14, yvec6, yvec14; ADD_SY yvec12, yvec7, yvec12; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; ADD_SY yvec10, yvec6, yvec10; ADD_SY yvec8, yvec7, yvec8; .L3_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L4_loopE; ALIGN_4 .L4_loopB:; #### Unroll times 1 #### MUL_SY yvec0, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; ODUP_SY 0*SIZE(ptrbb), yvec2 MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5 ADDQ $8*SIZE, ptrba; ADD_SY yvec15, yvec6, yvec15 ADD_SY yvec13, yvec7, yvec13; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; ADDQ $8*SIZE, ptrbb; ADD_SY yvec11, yvec6, yvec11; ADD_SY yvec9, yvec7, yvec9; MUL_SY yvec0, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; ADD_SY yvec14, yvec6, yvec14; ADD_SY yvec12, yvec7, yvec12; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; ADD_SY yvec10, yvec6, yvec10; ADD_SY yvec8, yvec7, yvec8; .L4_loopE:; #### Load Alpha #### BROAD_SY MEMALPHA,yvec7; MUL_SY yvec7,yvec15,yvec15; MUL_SY yvec7,yvec14,yvec14; MUL_SY yvec7,yvec13,yvec13; MUL_SY yvec7,yvec12,yvec12; MUL_SY yvec7,yvec11,yvec11; MUL_SY yvec7,yvec10,yvec10; MUL_SY yvec7,yvec9,yvec9; MUL_SY yvec7,yvec8,yvec8; MOV_SY yvec15,yvec7; REVS_SY $0xe4,yvec13,yvec15,yvec15; REVS_SY $0xe4,yvec7,yvec13,yvec13; MOV_SY yvec14,yvec7; REVS_SY $0xe4,yvec12,yvec14,yvec14; REVS_SY $0xe4,yvec7,yvec12,yvec12; MOV_SY yvec11,yvec7; REVS_SY $0xe4,yvec9,yvec11,yvec11; REVS_SY $0xe4,yvec7,yvec9,yvec9; MOV_SY yvec10,yvec7; REVS_SY $0xe4,yvec8,yvec10,yvec10; REVS_SY $0xe4,yvec7,yvec8,yvec8; ##### Testing alignment ##### MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L4_loopEx; ALIGN_4 LEAQ (ldc,ldc,2),%rax; EXTRA_SY $1,yvec15,xvec7; EXTRA_SY $1,yvec14,xvec6; EXTRA_SY $1,yvec13,xvec5; EXTRA_SY $1,yvec12,xvec4; EXTRA_SY $1,yvec11,xvec3; EXTRA_SY $1,yvec10,xvec2; EXTRA_SY $1,yvec9,xvec1; EXTRA_SY $1,yvec8,xvec0; #ifndef TRMMKERNEL ADD_SY 0*SIZE(C0), xvec15, xvec15; ADD_SY 4*SIZE(C1), xvec7, xvec7; ADD_SY 0*SIZE(C0,ldc,1), xvec14, xvec14; ADD_SY 4*SIZE(C1,ldc,1), xvec6, xvec6; ADD_SY 0*SIZE(C0,ldc,2), xvec13, xvec13; ADD_SY 4*SIZE(C1,ldc,2), xvec5, xvec5; ADD_SY 0*SIZE(C0,%rax,1), xvec12, xvec12; ADD_SY 4*SIZE(C1,%rax,1), xvec4, xvec4; ADD_SY 0*SIZE(C1), xvec11, xvec11; ADD_SY 4*SIZE(C0), xvec3, xvec3; ADD_SY 0*SIZE(C1,ldc,1), xvec10, xvec10; ADD_SY 4*SIZE(C0,ldc,1), xvec2, xvec2; ADD_SY 0*SIZE(C1,ldc,2), xvec9, xvec9; ADD_SY 4*SIZE(C0,ldc,2), xvec1, xvec1; ADD_SY 0*SIZE(C1,%rax,1), xvec8, xvec8; ADD_SY 4*SIZE(C0,%rax,1), xvec0, xvec0; #endif ST_SY xvec15,0*SIZE(C0); ST_SY xvec7,4*SIZE(C1); ST_SY xvec14,0*SIZE(C0,ldc,1); ST_SY xvec6,4*SIZE(C1,ldc,1); ST_SY xvec13,0*SIZE(C0,ldc,2); ST_SY xvec5,4*SIZE(C1,ldc,2); ST_SY xvec12,0*SIZE(C0,%rax,1); ST_SY xvec4,4*SIZE(C1,%rax,1); ST_SY xvec11,0*SIZE(C1); ST_SY xvec3,4*SIZE(C0); ST_SY xvec10,0*SIZE(C1,ldc,1); ST_SY xvec2,4*SIZE(C0,ldc,1); ST_SY xvec9,0*SIZE(C1,ldc,2); ST_SY xvec1,4*SIZE(C0,ldc,2); ST_SY xvec8,0*SIZE(C1,%rax,1); ST_SY xvec0,4*SIZE(C0,%rax,1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 8), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $8, kk #endif ADDQ $8*SIZE,C0; ADDQ $8*SIZE,C1; .L1_bodyE:; DECQ i; JG .L1_bodyB; JMP .L1_loopE; ALIGN_4; .L4_loopEx: LEAQ (ldc,ldc,2),%rax; EXTRA_SY $1, yvec15, xvec7; #ifndef TRMMKERNEL LDL_SY 0*SIZE(C0), xvec6, xvec6; LDH_SY 2*SIZE(C0), xvec6, xvec6; ADD_SY xvec6, xvec15, xvec15; #endif STL_SY xvec15, 0*SIZE(C0); STH_SY xvec15, 2*SIZE(C0); #ifndef TRMMKERNEL LDL_SY 4*SIZE(C1), xvec5, xvec5; LDH_SY 6*SIZE(C1), xvec5, xvec5; ADD_SY xvec5, xvec7, xvec7; #endif STL_SY xvec7, 4*SIZE(C1); STH_SY xvec7, 6*SIZE(C1); EXTRA_SY $1, yvec14, xvec6; #ifndef TRMMKERNEL LDL_SY 0*SIZE(C0, ldc, 1), xvec5, xvec5; LDH_SY 2*SIZE(C0, ldc, 1), xvec5, xvec5; ADD_SY xvec5, xvec14, xvec14; #endif STL_SY xvec14, 0*SIZE(C0, ldc, 1); STH_SY xvec14, 2*SIZE(C0, ldc, 1); #ifndef TRMMKERNEL LDL_SY 4*SIZE(C1, ldc, 1), xvec4, xvec4; LDH_SY 6*SIZE(C1, ldc, 1), xvec4, xvec4; ADD_SY xvec4, xvec6, xvec6; #endif STL_SY xvec6, 4*SIZE(C1, ldc, 1); STH_SY xvec6, 6*SIZE(C1, ldc, 1); EXTRA_SY $1, yvec13, xvec5; #ifndef TRMMKERNEL LDL_SY 0*SIZE(C0, ldc, 2), xvec4, xvec4; LDH_SY 2*SIZE(C0, ldc, 2), xvec4, xvec4; ADD_SY xvec4, xvec13, xvec13; #endif STL_SY xvec13, 0*SIZE(C0, ldc, 2); STH_SY xvec13, 2*SIZE(C0, ldc, 2); #ifndef TRMMKERNEL LDL_SY 4*SIZE(C1, ldc, 2), xvec3, xvec3; LDH_SY 6*SIZE(C1, ldc, 2), xvec3, xvec3; ADD_SY xvec3, xvec5, xvec5; #endif STL_SY xvec5, 4*SIZE(C1, ldc, 2); STH_SY xvec5, 6*SIZE(C1, ldc, 2); EXTRA_SY $1, yvec12, xvec4; #ifndef TRMMKERNEL LDL_SY 0*SIZE(C0, %rax, 1), xvec3, xvec3; LDH_SY 2*SIZE(C0, %rax, 1), xvec3, xvec3; ADD_SY xvec3, xvec12, xvec12; #endif STL_SY xvec12, 0*SIZE(C0, %rax, 1); STH_SY xvec12, 2*SIZE(C0, %rax, 1); #ifndef TRMMKERNEL LDL_SY 4*SIZE(C1, %rax, 1), xvec2, xvec2; LDH_SY 6*SIZE(C1, %rax, 1), xvec2, xvec2; ADD_SY xvec2, xvec4, xvec4; #endif STL_SY xvec4, 4*SIZE(C1, %rax, 1); STH_SY xvec4, 6*SIZE(C1, %rax, 1); EXTRA_SY $1, yvec11, xvec3; #ifndef TRMMKERNEL LDL_SY 0*SIZE(C1), xvec2, xvec2; LDH_SY 2*SIZE(C1), xvec2, xvec2; ADD_SY xvec2, xvec11, xvec11; #endif STL_SY xvec11, 0*SIZE(C1); STH_SY xvec11, 2*SIZE(C1); #ifndef TRMMKERNEL LDL_SY 4*SIZE(C0), xvec1, xvec1; LDH_SY 6*SIZE(C0), xvec1, xvec1; ADD_SY xvec1, xvec3, xvec3; #endif STL_SY xvec3, 4*SIZE(C0); STH_SY xvec3, 6*SIZE(C0); EXTRA_SY $1, yvec10, xvec2; #ifndef TRMMKERNEL LDL_SY 0*SIZE(C1, ldc, 1), xvec1, xvec1; LDH_SY 2*SIZE(C1, ldc, 1), xvec1, xvec1; ADD_SY xvec1, xvec10, xvec10; #endif STL_SY xvec10, 0*SIZE(C1, ldc, 1); STH_SY xvec10, 2*SIZE(C1, ldc, 1); #ifndef TRMMKERNEL LDL_SY 4*SIZE(C0, ldc, 1), xvec0, xvec0; LDH_SY 6*SIZE(C0, ldc, 1), xvec0, xvec0; ADD_SY xvec0, xvec2, xvec2; #endif STL_SY xvec2, 4*SIZE(C0, ldc, 1); STH_SY xvec2, 6*SIZE(C0, ldc, 1); EXTRA_SY $1, yvec9, xvec1; #ifndef TRMMKERNEL LDL_SY 0*SIZE(C1, ldc, 2), xvec0, xvec0; LDH_SY 2*SIZE(C1, ldc, 2), xvec0, xvec0; ADD_SY xvec0, xvec9, xvec9; #endif STL_SY xvec9, 0*SIZE(C1, ldc, 2); STH_SY xvec9, 2*SIZE(C1, ldc, 2); #ifndef TRMMKERNEL LDL_SY 4*SIZE(C0, ldc, 2), xvec7, xvec7; LDH_SY 6*SIZE(C0, ldc, 2), xvec7, xvec7; ADD_SY xvec7, xvec1, xvec1; #endif STL_SY xvec1, 4*SIZE(C0, ldc, 2); STH_SY xvec1, 6*SIZE(C0, ldc, 2); EXTRA_SY $1, yvec8, xvec0; #ifndef TRMMKERNEL LDL_SY 0*SIZE(C1, %rax, 1), xvec6, xvec6; LDH_SY 2*SIZE(C1, %rax, 1), xvec6, xvec6; ADD_SY xvec6, xvec8, xvec8; #endif STL_SY xvec8, 0*SIZE(C1, %rax, 1); STH_SY xvec8, 2*SIZE(C1, %rax, 1); #ifndef 
TRMMKERNEL LDL_SY 4*SIZE(C0, %rax, 1), xvec5, xvec5; LDH_SY 6*SIZE(C0, %rax, 1), xvec5, xvec5; ADD_SY xvec5, xvec0, xvec0; #endif STL_SY xvec0, 4*SIZE(C0, %rax, 1); STH_SY xvec0, 6*SIZE(C0, %rax, 1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 8), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $8, kk #endif ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L1_bodyB; ALIGN_4 .L1_loopE:; TEST $4, bm; JLE .L5_loopE; ALIGN_4 .L5_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 8), ptrbb; #endif #### Initial Results Register #### XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec14, yvec14, yvec14; XOR_SY yvec13, yvec13, yvec13; XOR_SY yvec12, yvec12, yvec12; LD_SX 0*SIZE(ptrba), xvec0; XOR_SY yvec11, yvec11, yvec11; XOR_SY yvec10, yvec10, yvec10; EDUP_SX 0*SIZE(ptrbb), xvec2; XOR_SY yvec9, yvec9, yvec9; XOR_SY yvec8, yvec8, yvec8; ODUP_SX 0*SIZE(ptrbb), xvec3; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; #else ADDQ $8, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L8_loopE; ALIGN_4 .L8_bodyB: #### Unroll time 1 #### SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; ODUP_SX 4*SIZE(ptrbb), xvec3; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec1; SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec10, xvec10; EDUP_SX 8*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec9, xvec9; ODUP_SX 8*SIZE(ptrbb), xvec3; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec8, xvec8; #### Unroll time 2 #### SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec1, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec1, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec2; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; ODUP_SX 12*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec5, xvec5; ADD_SX xvec5, xvec12, xvec12; LD_SX 8*SIZE(ptrba), xvec0; SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec1, xvec2, xvec2; ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec1, xvec3, xvec3; ADD_SX xvec3, xvec10, xvec10; EDUP_SX 16*SIZE(ptrbb), xvec2; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec9, xvec9; ODUP_SX 16*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec5, xvec5; ADD_SX xvec5, xvec8, xvec8; #### Unroll time 3 #### SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; EDUP_SX 20*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; ODUP_SX 20*SIZE(ptrbb), xvec3; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec12, xvec12; LD_SX 12*SIZE(ptrba), xvec1; SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec0, xvec2, xvec2; ADD_SX 
xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec10, xvec10; EDUP_SX 24*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec9, xvec9; ODUP_SX 24*SIZE(ptrbb), xvec3; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec8, xvec8; ADDQ $16*SIZE, ptrba; #### Unroll time 4 #### SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec1, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec1, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; EDUP_SX 28*SIZE(ptrbb), xvec2; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; ODUP_SX 28*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec5, xvec5; ADD_SX xvec5, xvec12, xvec12; ADDQ $32*SIZE, ptrbb; LD_SX 0*SIZE(ptrba), xvec0; SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec1, xvec2, xvec2; ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec1, xvec3, xvec3; ADD_SX xvec3, xvec10, xvec10; EDUP_SX 0*SIZE(ptrbb), xvec2; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec9, xvec9; ODUP_SX 0*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec5, xvec5; ADD_SX xvec5, xvec8, xvec8; DECQ k; JG .L8_bodyB; ALIGN_4 .L8_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L9_loopE; ALIGN_4 .L9_bodyB: #### Unroll time 1 #### SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; ODUP_SX 4*SIZE(ptrbb), xvec3; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec1; SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec10, xvec10; EDUP_SX 8*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec9, xvec9; ODUP_SX 8*SIZE(ptrbb), xvec3; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec8, xvec8; #### Unroll time 2 #### ADDQ $8*SIZE, ptrba; SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec1, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec1, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec2; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; ODUP_SX 12*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec5, xvec5; ADD_SX xvec5, xvec12, xvec12; ADDQ $16*SIZE, ptrbb; LD_SX 0*SIZE(ptrba), xvec0; SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec1, xvec2, xvec2; ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec1, xvec3, xvec3; ADD_SX xvec3, xvec10, xvec10; EDUP_SX 0*SIZE(ptrbb), xvec2; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec9, xvec9; ODUP_SX 0*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec5, xvec5; ADD_SX xvec5, xvec8, xvec8; .L9_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L10_loopE; ALIGN_4 .L10_bodyB: #### Unroll time 1 #### SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; ADDQ $4*SIZE, ptrba; EDUP_SX 4*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; ODUP_SX 4*SIZE(ptrbb), xvec3; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec12, xvec12; ADDQ $8*SIZE, ptrbb; SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec10, xvec10; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec9, xvec9; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec8, 
xvec8; .L10_loopE: #### Multiply Alpha #### BROAD_SX MEMALPHA, xvec7; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec7, xvec14, xvec14; MUL_SX xvec7, xvec13, xvec13; MUL_SX xvec7, xvec12, xvec12; MUL_SX xvec7, xvec11, xvec11; MUL_SX xvec7, xvec10, xvec10; MUL_SX xvec7, xvec9, xvec9; MUL_SX xvec7, xvec8, xvec8; #### Reverse Result #### MOV_SX xvec15, xvec7; REVS_SX $0xe4, xvec13, xvec15, xvec15; REVS_SX $0xe4, xvec7, xvec13, xvec13; MOV_SX xvec14, xvec7; REVS_SX $0xe4, xvec12, xvec14, xvec14; REVS_SX $0xe4, xvec7, xvec12, xvec12; MOV_SX xvec11, xvec7; REVS_SX $0xe4, xvec9, xvec11, xvec11; REVS_SX $0xe4, xvec7, xvec9, xvec9; MOV_SX xvec10, xvec7; REVS_SX $0xe4, xvec8, xvec10, xvec10; REVS_SX $0xe4, xvec7, xvec8, xvec8; #### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L10_loopEx; ALIGN_4 LEAQ (ldc,ldc,2),%rax; #ifndef TRMMKERNEL ADD_SX 0*SIZE(C0), xvec15, xvec15; ADD_SX 0*SIZE(C0, ldc,1), xvec14, xvec14; ADD_SX 0*SIZE(C0, ldc,2), xvec13, xvec13; ADD_SX 0*SIZE(C0, %rax,1), xvec12, xvec12; ADD_SX 0*SIZE(C1), xvec11, xvec11; ADD_SX 0*SIZE(C1, ldc,1), xvec10, xvec10; ADD_SX 0*SIZE(C1, ldc,2), xvec9, xvec9; ADD_SX 0*SIZE(C1, %rax,1), xvec8, xvec8; #endif ST_SX xvec15, 0*SIZE(C0); ST_SX xvec14, 0*SIZE(C0, ldc, 1); ST_SX xvec13, 0*SIZE(C0, ldc, 2); ST_SX xvec12, 0*SIZE(C0, %rax, 1); ST_SX xvec11, 0*SIZE(C1); ST_SX xvec10, 0*SIZE(C1, ldc, 1); ST_SX xvec9, 0*SIZE(C1, ldc, 2); ST_SX xvec8, 0*SIZE(C1, %rax, 1); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 8), ptrbb; #endif #if defined(TRMMKERNEL)&&defined(LEFT) ADDQ $4, kk #endif ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; JMP .L5_loopE; ALIGN_4 .L10_loopEx: LEAQ (ldc,ldc,2),%rax; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec7, xvec7; LDH_SX 2*SIZE(C0), xvec7, xvec7; LDL_SX 0*SIZE(C0, ldc, 1), xvec6, xvec6; LDH_SX 2*SIZE(C0, ldc, 1), xvec6, xvec6; LDL_SX 0*SIZE(C0, ldc, 2), xvec5, xvec5; LDH_SX 2*SIZE(C0, ldc, 2), xvec5, xvec5; LDL_SX 0*SIZE(C0, %rax, 1), xvec4, xvec4; LDH_SX 2*SIZE(C0, %rax, 1), xvec4, xvec4; LDL_SX 0*SIZE(C1), xvec3, xvec3; LDH_SX 2*SIZE(C1), xvec3, xvec3; LDL_SX 0*SIZE(C1, ldc, 1), xvec2, xvec2; LDH_SX 2*SIZE(C1, ldc, 1), xvec2, xvec2; LDL_SX 0*SIZE(C1, ldc, 2), xvec1, xvec1; LDH_SX 2*SIZE(C1, ldc, 2), xvec1, xvec1; LDL_SX 0*SIZE(C1, %rax, 1), xvec0, xvec0; LDH_SX 2*SIZE(C1, %rax, 1), xvec0, xvec0; ADD_SX xvec7, xvec15, xvec15; ADD_SX xvec6, xvec14, xvec14; ADD_SX xvec5, xvec13, xvec13; ADD_SX xvec4, xvec12, xvec12; ADD_SX xvec3, xvec11, xvec11; ADD_SX xvec2, xvec10, xvec10; ADD_SX xvec1, xvec9, xvec9; ADD_SX xvec0, xvec8, xvec8; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); STL_SX xvec14, 0*SIZE(C0, ldc, 1); STH_SX xvec14, 2*SIZE(C0, ldc, 1); STL_SX xvec13, 0*SIZE(C0, ldc, 2); STH_SX xvec13, 2*SIZE(C0, ldc, 2); STL_SX xvec12, 0*SIZE(C0, %rax, 1); STH_SX xvec12, 2*SIZE(C0, %rax, 1); STL_SX xvec11, 0*SIZE(C1); STH_SX xvec11, 2*SIZE(C1); STL_SX xvec10, 0*SIZE(C1, ldc, 1); STH_SX xvec10, 2*SIZE(C1, ldc, 1); STL_SX xvec9, 0*SIZE(C1, ldc, 2); STH_SX xvec9, 2*SIZE(C1, ldc, 2); STL_SX xvec8, 0*SIZE(C1, %rax, 1); STH_SX xvec8, 2*SIZE(C1, %rax, 1); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 8), ptrbb; #endif #if defined(TRMMKERNEL)&&defined(LEFT) ADDQ $4, kk #endif 
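#### 4x8 tile written back through the unaligned movlps/movhps path; step C0/C1 past the 4 elements just stored ####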
ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; .L5_loopE: TEST $2, bm; JLE .L6_loopE; ALIGN_4 .L6_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 8), ptrbb #endif #### Initial Results Register #### XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec14, yvec14, yvec14; XOR_SY yvec13, yvec13, yvec13; XOR_SY yvec12, yvec12, yvec12; MOVQ bk, k; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; #else ADDQ $8, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L11_loopE; ALIGN_4 .L11_bodyB: #### Computing kernel LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; MUL_SX xvec1, xvec5, xvec5; ADD_SX xvec5, xvec12, xvec12; SHUF_SX $0xee, xvec0, xvec6; EDUP_SX 8*SIZE(ptrbb), xvec2; ODUP_SX 8*SIZE(ptrbb), xvec3; MUL_SX xvec6, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; MUL_SX xvec6, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec4; ODUP_SX 12*SIZE(ptrbb), xvec5; MUL_SX xvec6, xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; MUL_SX xvec6, xvec5, xvec5; ADD_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec0; SHUF_SX $0x44, xvec0, xvec1; EDUP_SX 16*SIZE(ptrbb), xvec2; ODUP_SX 16*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; EDUP_SX 20*SIZE(ptrbb), xvec4; ODUP_SX 20*SIZE(ptrbb), xvec5; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; MUL_SX xvec1, xvec5, xvec5; ADD_SX xvec5, xvec12, xvec12; SHUF_SX $0xee, xvec0, xvec6; EDUP_SX 24*SIZE(ptrbb), xvec2; ODUP_SX 24*SIZE(ptrbb), xvec3; MUL_SX xvec6, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; MUL_SX xvec6, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; EDUP_SX 28*SIZE(ptrbb), xvec4; ODUP_SX 28*SIZE(ptrbb), xvec5; MUL_SX xvec6, xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; MUL_SX xvec6, xvec5, xvec5; ADD_SX xvec5, xvec12, xvec12; ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L11_bodyB; ALIGN_4 .L11_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L12_loopE; ALIGN_4 .L12_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; MUL_SX xvec1, xvec5, xvec5; ADD_SX xvec5, xvec12, xvec12; SHUF_SX $0xee, xvec0, xvec6; EDUP_SX 8*SIZE(ptrbb), xvec2; ODUP_SX 8*SIZE(ptrbb), xvec3; MUL_SX xvec6, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; MUL_SX xvec6, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec4; ODUP_SX 12*SIZE(ptrbb), xvec5; MUL_SX xvec6, xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; MUL_SX xvec6, xvec5, xvec5; ADD_SX xvec5, xvec12, xvec12; ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; .L12_loopE: 
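#### Tail of the K loop for the 2x8 tile: one more iteration below when (k & 1) is set ####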
#ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L13_loopE; ALIGN_4 .L13_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; MUL_SX xvec1, xvec5, xvec5; ADD_SX xvec5, xvec12, xvec12; ADDQ $2*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L13_loopE: LEAQ (ldc,ldc,2),%rax; #### Multiply Alpha #### BROAD_SX MEMALPHA, xvec7; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec7, xvec14, xvec14; MUL_SX xvec7, xvec13, xvec13; MUL_SX xvec7, xvec12, xvec12; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec11, xvec11; LDH_SX 0*SIZE(C0, ldc, 2), xvec11, xvec11; LDL_SX 0*SIZE(C0, ldc, 1), xvec10, xvec10; LDH_SX 0*SIZE(C0, %rax, 1), xvec10, xvec10; LDL_SX 0*SIZE(C1), xvec9, xvec9; LDH_SX 0*SIZE(C1, ldc, 2), xvec9, xvec9; LDL_SX 0*SIZE(C1, ldc, 1), xvec8, xvec8; LDH_SX 0*SIZE(C1, %rax,1), xvec8, xvec8; ADD_SX xvec11, xvec15, xvec15; ADD_SX xvec10, xvec14, xvec14; ADD_SX xvec9, xvec13, xvec13; ADD_SX xvec8, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 0*SIZE(C0, ldc, 2); STL_SX xvec14, 0*SIZE(C0, ldc, 1); STH_SX xvec14, 0*SIZE(C0, %rax, 1); STL_SX xvec13, 0*SIZE(C1); STH_SX xvec13, 0*SIZE(C1, ldc, 2); STL_SX xvec12, 0*SIZE(C1, ldc, 1); STH_SX xvec12, 0*SIZE(C1, %rax, 1); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 8), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk #endif ADDQ $2*SIZE, C0; ADDQ $2*SIZE, C1; #### Writing Back #### .L6_loopE: TEST $1, bm; JLE .L7_loopE; ALIGN_4 .L7_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (,%rax, SIZE), %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 8), ptrbb; #endif #### intitial #### XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec14, yvec14, yvec14; MOVQ bk, k; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; #else ADDQ $8, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L14_loopE; ALIGN_4 .L14_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec3; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; LD_SX 8*SIZE(ptrbb), xvec4; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec15, xvec15; LD_SX 12*SIZE(ptrbb), xvec5; MUL_SX xvec1, xvec5, xvec5; ADD_SX xvec5, xvec14, xvec14; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 16*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; LD_SX 20*SIZE(ptrbb), xvec3; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; BROAD_SX 3*SIZE(ptrba), xvec1; LD_SX 24*SIZE(ptrbb), xvec4; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec15, xvec15; LD_SX 28*SIZE(ptrbb), xvec5; MUL_SX xvec1, xvec5, xvec5; ADD_SX xvec5, xvec14, xvec14; ADDQ $4*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L14_bodyB; ALIGN_4 .L14_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; 
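# descriptive note: b is kept even/odd-duplicated (EDUP/ODUP) and half-swapped with SHUF so the two 4-wide a vectors meet all four columns of this unroll step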
#endif JLE .L15_loopE; ALIGN_4 .L15_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec3; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; LD_SX 8*SIZE(ptrbb), xvec4; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec15, xvec15; LD_SX 12*SIZE(ptrbb), xvec5; MUL_SX xvec1, xvec5, xvec5; ADD_SX xvec5, xvec14, xvec14; ADDQ $2*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; .L15_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L16_loopE; ALIGN_4 .L16_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec3; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; ADDQ $1, ptrba; ADDQ $4, ptrbb; .L16_loopE: BROAD_SX MEMALPHA, xvec7; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec7, xvec14, xvec14; LEAQ (ldc,ldc,2),%rax; SHUF_SX $0xff, xvec15, xvec13; SHUF_SX $0xaa, xvec15, xvec12; SHUF_SX $0x55, xvec15, xvec11; SHUF_SX $0x00, xvec15, xvec10; #ifndef TRMMKERNEL addss 0*SIZE(C0), xvec10; addss 0*SIZE(C0, ldc, 1), xvec11; addss 0*SIZE(C0, ldc, 2), xvec12; addss 0*SIZE(C0, %rax, 1), xvec13; #endif movss xvec10, 0*SIZE(C0); movss xvec11, 0*SIZE(C0, ldc, 1); movss xvec12, 0*SIZE(C0, ldc, 2); movss xvec13, 0*SIZE(C0, %rax, 1); SHUF_SX $0xff, xvec14, xvec9; SHUF_SX $0xaa, xvec14, xvec8; SHUF_SX $0x55, xvec14, xvec7; SHUF_SX $0x00, xvec14, xvec6; #ifndef TRMMKERNEL addss 0*SIZE(C1), xvec6; addss 0*SIZE(C1, ldc, 1), xvec7; addss 0*SIZE(C1, ldc, 2), xvec8; addss 0*SIZE(C1, %rax, 1), xvec9; #endif movss xvec6, 0*SIZE(C1); movss xvec7, 0*SIZE(C1, ldc, 1); movss xvec8, 0*SIZE(C1, ldc, 2); movss xvec9, 0*SIZE(C1, %rax, 1); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 8), ptrbb; #endif #if defined(TRMMKERNEL)&&defined(LEFT) ADDQ $1, kk #endif ADDQ $1*SIZE, C0; ADDQ $1*SIZE, C1; #### Writing Back #### .L7_loopE: #if defined(TRMMKERNEL)&&!defined(LEFT) ADDQ $8, kk #endif MOVQ bk,k; SALQ $5,k; ADDQ k,bb; LEAQ (C,ldc,8),C; .L0_bodyE:; DECQ j; JG .L0_bodyB; ALIGN_4; .L0_loopE:; TEST $4, bn; # Rn = 4 JLE .L20_loopE; ALIGN_4; .L20_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; MOVQ %rax, kk; #endif MOVQ C, C0; LEAQ (C, ldc, 2), C1; MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L21_loopE; ALIGN_4 .L21_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #### Initial #### XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec14, yvec14, yvec14; XOR_SY yvec13, yvec13, yvec13; XOR_SY yvec12, yvec12, yvec12; EDUP_SX 0*SIZE(ptrbb), xvec2; XOR_SY yvec11, yvec11, yvec11; XOR_SY yvec10, yvec10, yvec10; LD_SX 0*SIZE(ptrba), xvec0; XOR_SY yvec9, yvec9, yvec9; XOR_SY yvec8, yvec8, yvec8; LD_SX 4*SIZE(ptrba), xvec1; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; #else ADDQ $4, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2,k; JLE .L211_loopE; ALIGN_4 .L211_bodyB: #### Unroll time 1 #### ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; 
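/* Rough sketch of the .L211 inner loop (register roles inferred from the
   loads and pointer strides): xvec15..xvec8 are eight 4-float accumulators
   covering an 8x4 block of C, and each of the four unrolled k steps
   effectively performs
       for (j = 0; j < 4; j++)
         for (i = 0; i < 8; i++)
           c[i][j] += a[8*k + i] * b[4*k + j];
   with EDUP/ODUP duplicating the even/odd entries of b and SHUF $0x4e
   swapping halves, so the sums land in a permuted layout that the writeback
   code untangles with the crossed LDL/LDH and STL/STH addressing. */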
MOV_SX xvec2, xvec6; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec6, xvec6; ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec13, xvec13; MUL_SX xvec1, xvec7, xvec7; ADD_SX xvec7, xvec12, xvec12; EDUP_SX 4*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec11, xvec11; MUL_SX xvec1, xvec6, xvec6; ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec9, xvec9; LD_SX 8*SIZE(ptrba), xvec0; MUL_SX xvec1, xvec7, xvec7; ADD_SX xvec7, xvec8, xvec8; LD_SX 12*SIZE(ptrba), xvec1; #### Unroll time 2 #### ODUP_SX 4*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec6, xvec6; ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec13, xvec13; MUL_SX xvec1, xvec7, xvec7; ADD_SX xvec7, xvec12, xvec12; EDUP_SX 8*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec11, xvec11; MUL_SX xvec1, xvec6, xvec6; ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec9, xvec9; LD_SX 16*SIZE(ptrba), xvec0; MUL_SX xvec1, xvec7, xvec7; ADD_SX xvec7, xvec8, xvec8; LD_SX 20*SIZE(ptrba), xvec1; #### Unroll time 3 #### ODUP_SX 8*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec6, xvec6; ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec13, xvec13; MUL_SX xvec1, xvec7, xvec7; ADD_SX xvec7, xvec12, xvec12; EDUP_SX 12*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; ADDQ $16*SIZE, ptrbb; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec11, xvec11; MUL_SX xvec1, xvec6, xvec6; ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec9, xvec9; LD_SX 24*SIZE(ptrba), xvec0; MUL_SX xvec1, xvec7, xvec7; ADD_SX xvec7, xvec8, xvec8; LD_SX 28*SIZE(ptrba), xvec1; ADDQ $32*SIZE, ptrba; #### Unroll time 4 #### ODUP_SX -4*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec6, xvec6; ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec13, xvec13; MUL_SX xvec1, xvec7, xvec7; ADD_SX xvec7, xvec12, xvec12; EDUP_SX 0*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec11, xvec11; MUL_SX xvec1, xvec6, xvec6; ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec9, xvec9; LD_SX 0*SIZE(ptrba), xvec0; MUL_SX xvec1, xvec7, xvec7; ADD_SX xvec7, xvec8, xvec8; LD_SX 4*SIZE(ptrba), xvec1; DECQ k; JG .L211_bodyB; ALIGN_4 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk #else TEST $2, kkk; #endif JLE .L212_loopE; ALIGN_4 .L212_bodyB: #### Unroll time 1 #### ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec6, xvec6; ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec13, xvec13; MUL_SX xvec1, xvec7, xvec7; ADD_SX xvec7, xvec12, xvec12; EDUP_SX 4*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; MUL_SX xvec0, xvec4, xvec4; ADD_SX 
xvec4, xvec11, xvec11; ADDQ $8*SIZE, ptrbb; MUL_SX xvec1, xvec6, xvec6; ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec9, xvec9; LD_SX 8*SIZE(ptrba), xvec0; MUL_SX xvec1, xvec7, xvec7; ADD_SX xvec7, xvec8, xvec8; LD_SX 12*SIZE(ptrba), xvec1; ADDQ $16*SIZE, ptrba; #### Unroll time 2 #### ODUP_SX -4*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec6, xvec6; ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec13, xvec13; MUL_SX xvec1, xvec7, xvec7; ADD_SX xvec7, xvec12, xvec12; EDUP_SX 0*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec11, xvec11; MUL_SX xvec1, xvec6, xvec6; ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec9, xvec9; LD_SX 0*SIZE(ptrba), xvec0; MUL_SX xvec1, xvec7, xvec7; ADD_SX xvec7, xvec8, xvec8; LD_SX 4*SIZE(ptrba), xvec1; .L212_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L213_loopE; ALIGN_4 .L213_bodyB: ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; ADDQ $4*SIZE, ptrbb; SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec1, xvec6, xvec6; ADD_SX xvec6, xvec14, xvec14; MOV_SX xvec3, xvec7; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec13, xvec13; MUL_SX xvec1, xvec7, xvec7; ADD_SX xvec7, xvec12, xvec12; MOV_SX xvec4, xvec6; ADDQ $8*SIZE, ptrba; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec11, xvec11; MUL_SX xvec1, xvec6, xvec6; ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec9, xvec9; MUL_SX xvec1, xvec7, xvec7; ADD_SX xvec7, xvec8, xvec8; .L213_loopE: #### Multiply Alpha #### BROAD_SX MEMALPHA, xvec7; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec7, xvec14, xvec14; MUL_SX xvec7, xvec13, xvec13; MUL_SX xvec7, xvec12, xvec12; MUL_SX xvec7, xvec11, xvec11; MUL_SX xvec7, xvec10, xvec10; MUL_SX xvec7, xvec9, xvec9; MUL_SX xvec7, xvec8, xvec8; #### Writing Back #### #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 2*SIZE(C1), xvec0, xvec0; LDL_SX 4*SIZE(C0), xvec1, xvec1; LDH_SX 6*SIZE(C1), xvec1, xvec1; LDL_SX 0*SIZE(C0, ldc, 1), xvec2, xvec2; LDH_SX 2*SIZE(C1, ldc, 1), xvec2, xvec2; LDL_SX 4*SIZE(C0, ldc, 1), xvec3, xvec3; LDH_SX 6*SIZE(C1, ldc, 1), xvec3, xvec3; LDL_SX 0*SIZE(C1), xvec4, xvec4; LDH_SX 2*SIZE(C0), xvec4, xvec4; LDL_SX 4*SIZE(C1), xvec5, xvec5; LDH_SX 6*SIZE(C0), xvec5, xvec5; LDL_SX 0*SIZE(C1, ldc, 1), xvec6, xvec6; LDH_SX 2*SIZE(C0, ldc, 1), xvec6, xvec6; LDL_SX 4*SIZE(C1, ldc, 1), xvec7, xvec7; LDH_SX 6*SIZE(C0, ldc, 1), xvec7, xvec7; ADD_SX xvec0, xvec15, xvec15; ADD_SX xvec1, xvec14, xvec14; ADD_SX xvec2, xvec13, xvec13; ADD_SX xvec3, xvec12, xvec12; ADD_SX xvec4, xvec11, xvec11; ADD_SX xvec5, xvec10, xvec10; ADD_SX xvec6, xvec9, xvec9; ADD_SX xvec7, xvec8, xvec8; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); STL_SX xvec14, 4*SIZE(C0); STH_SX xvec14, 6*SIZE(C1); STL_SX xvec13, 0*SIZE(C0, ldc, 1); STH_SX xvec13, 2*SIZE(C1, ldc, 1); STL_SX xvec12, 4*SIZE(C0, ldc, 1); STH_SX xvec12, 6*SIZE(C1, ldc, 1); STL_SX xvec11, 0*SIZE(C1); STH_SX xvec11, 2*SIZE(C0); STL_SX xvec10, 4*SIZE(C1); STH_SX xvec10, 6*SIZE(C0); STL_SX xvec9, 0*SIZE(C1, ldc, 1); STH_SX xvec9, 2*SIZE(C0, ldc, 1); STL_SX xvec8, 4*SIZE(C1, ldc, 1); STH_SX xvec8, 6*SIZE(C0, ldc, 1); #if (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $8, kk #endif ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L21_bodyB; ALIGN_4 .L21_loopE: TEST $4, bm; JLE .L22_loopE; ALIGN_4 .L22_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #### Initial Results #### XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec14, yvec14, yvec14; XOR_SY yvec13, yvec13, yvec13; XOR_SY yvec12, yvec12, yvec12; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; #else ADDQ $4, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L221_loopE; ALIGN_4 .L221_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec1; EDUP_SX 4*SIZE(ptrbb), xvec2; ODUP_SX 4*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec1, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec1, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; MUL_SX xvec1, xvec5, xvec5; ADD_SX xvec5, xvec12, xvec12; LD_SX 8*SIZE(ptrba), xvec0; EDUP_SX 8*SIZE(ptrbb), xvec2; ODUP_SX 8*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec12, xvec12; LD_SX 12*SIZE(ptrba), xvec1; EDUP_SX 12*SIZE(ptrbb), xvec2; ODUP_SX 12*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec1, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15 SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec1, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; MUL_SX xvec1, xvec5, xvec5; ADD_SX xvec5, xvec12, xvec12; ADDQ $16*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L221_bodyB; ALIGN_4 .L221_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L222_loopE; ALIGN_4 .L222_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec1; EDUP_SX 4*SIZE(ptrbb), xvec2; ODUP_SX 4*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec1, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec1, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13 MUL_SX xvec1, xvec5, xvec5; 
ADD_SX xvec5, xvec12, xvec12; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L222_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L223_loopE; ALIGN_4 .L223_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec12, xvec12; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L223_loopE: #### Multiply Alpha #### BROAD_SX MEMALPHA, xvec7; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec7, xvec14, xvec14; MUL_SX xvec7, xvec13, xvec13; MUL_SX xvec7, xvec12, xvec12; #### Writing back #### #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 2*SIZE(C1), xvec0, xvec0; LDL_SX 0*SIZE(C0, ldc, 1), xvec1, xvec1; LDH_SX 2*SIZE(C1, ldc, 1), xvec1, xvec1; LDL_SX 0*SIZE(C1), xvec2, xvec2; LDH_SX 2*SIZE(C0), xvec2, xvec2; LDL_SX 0*SIZE(C1, ldc, 1), xvec3, xvec3; LDH_SX 2*SIZE(C0, ldc, 1), xvec3, xvec3; ADD_SX xvec0, xvec15, xvec15; ADD_SX xvec1, xvec14, xvec14; ADD_SX xvec2, xvec13, xvec13; ADD_SX xvec3, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); STL_SX xvec14, 0*SIZE(C0, ldc, 1); STH_SX xvec14, 2*SIZE(C1, ldc, 1); STL_SX xvec13, 0*SIZE(C1); STH_SX xvec13, 2*SIZE(C0); STL_SX xvec12, 0*SIZE(C1, ldc, 1); STH_SX xvec12, 2*SIZE(C0, ldc, 1); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL)&&defined(LEFT) ADDQ $4, kk #endif ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; .L22_loopE: TEST $2, bm; JLE .L23_loopE; ALIGN_4 .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb #endif #### Initial #### XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec14, yvec14, yvec14; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; #else ADDQ $4, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L231_loopE; ALIGN_4 .L231_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x44, xvec0, xvec1; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec15, xvec15; MUL_SX xvec1, xvec5, xvec5; ADD_SX xvec5, xvec14, xvec14; SHUF_SX $0xee, xvec0, xvec2; EDUP_SX 4*SIZE(ptrbb), xvec6; ODUP_SX 4*SIZE(ptrbb), xvec7; MUL_SX xvec2, xvec6, xvec6; ADD_SX xvec6, xvec15, xvec15; MUL_SX xvec2, xvec7, xvec7; ADD_SX xvec7, xvec14, xvec14; LD_SX 4*SIZE(ptrba), xvec0; EDUP_SX 8*SIZE(ptrbb), xvec4; ODUP_SX 8*SIZE(ptrbb), xvec5; SHUF_SX $0x44, xvec0, xvec1; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec15, xvec15; MUL_SX xvec1, xvec5, xvec5; ADD_SX xvec5, xvec14, xvec14; SHUF_SX $0xee, xvec0, xvec2; EDUP_SX 12*SIZE(ptrbb), xvec6; ODUP_SX 12*SIZE(ptrbb), xvec7; MUL_SX xvec2, xvec6, xvec6; ADD_SX xvec6, xvec15, xvec15; MUL_SX xvec2, xvec7, xvec7; ADD_SX xvec7, xvec14, xvec14; ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L231_bodyB; ALIGN_4 .L231_loopE: #ifndef TRMMKERNEL TEST $2, 
bk; #else TEST $2, kkk; #endif JLE .L232_loopE; ALIGN_4 .L232_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x44, xvec0, xvec1; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec15, xvec15; MUL_SX xvec1, xvec5, xvec5; ADD_SX xvec5, xvec14, xvec14; SHUF_SX $0xee, xvec0, xvec2; EDUP_SX 4*SIZE(ptrbb), xvec6; ODUP_SX 4*SIZE(ptrbb), xvec7; MUL_SX xvec2, xvec6, xvec6; ADD_SX xvec6, xvec15, xvec15; MUL_SX xvec2, xvec7, xvec7; ADD_SX xvec7, xvec14, xvec14; ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L232_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L233_loopE; ALIGN_4 .L233_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x44, xvec0, xvec1; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec15, xvec15; MUL_SX xvec1, xvec5, xvec5; ADD_SX xvec5, xvec14, xvec14; ADDQ $2*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L233_loopE: #### Multiply Alpha #### BROAD_SY MEMALPHA, yvec7; MUL_SY xvec7, xvec15, xvec15; MUL_SY xvec7, xvec14, xvec14; #### Writing Back #### SHUF_SX $0xee, xvec15, xvec13; SHUF_SX $0xee, xvec14, xvec12; #ifndef TRMMKERNEL ADD_SY 0*SIZE(C0), xvec15, xvec15; ADD_SY 0*SIZE(C0, ldc, 1), xvec14, xvec14; ADD_SY 0*SIZE(C1), xvec13, xvec13; ADD_SY 0*SIZE(C1, ldc, 1), xvec12, xvec12; #endif STL_SY xvec15, 0*SIZE(C0); STL_SY xvec14, 0*SIZE(C0, ldc, 1); STL_SY xvec13, 0*SIZE(C1); STL_SY xvec12, 0*SIZE(C1, ldc, 1); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk #endif ADDQ $2*SIZE, C0; ADDQ $2*SIZE, C1; .L23_loopE: TEST $1, bm; JLE .L24_loopE; ALIGN_4 .L24_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (,%rax, SIZE), %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #### Initial #### XOR_SY yvec15, yvec15, yvec15; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; #else ADDQ $4, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L241_loopE; ALIGN_4 .L241_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; MUL_SX xvec0, xvec1, xvec1; ADD_SX xvec1, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec2; LD_SX 4*SIZE(ptrbb), xvec3; MUL_SX xvec2, xvec3, xvec3; ADD_SX xvec3, xvec15, xvec15; BROAD_SX 2*SIZE(ptrba), xvec4; LD_SX 8*SIZE(ptrbb), xvec5; MUL_SX xvec4, xvec5, xvec5; ADD_SX xvec5, xvec15, xvec15; BROAD_SX 3*SIZE(ptrba), xvec6; LD_SX 12*SIZE(ptrbb), xvec7; MUL_SX xvec6, xvec7, xvec7; ADD_SX xvec7, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L241_bodyB; ALIGN_4 .L241_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L242_loopE; ALIGN_4 .L242_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; MUL_SX xvec0, xvec1, xvec1; ADD_SX xvec1, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec2; LD_SX 4*SIZE(ptrbb), xvec3; MUL_SX xvec2, xvec3, xvec3; ADD_SX xvec3, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L242_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L243_loopE; ALIGN_4; .L243_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 
0*SIZE(ptrbb), xvec1; MUL_SX xvec0, xvec1, xvec1; ADD_SX xvec1, xvec15, xvec15; ADDQ $1*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L243_loopE: #### Multiply Alpha #### BROAD_SX MEMALPHA, xvec7; MUL_SX xvec7, xvec15, xvec15; SHUF_SX $0xff, xvec15, xvec14; SHUF_SX $0xaa, xvec15, xvec13; SHUF_SX $0x55, xvec15, xvec12; SHUF_SX $0x00, xvec15, xvec11; #ifndef TRMMKERNEL addss 0*SIZE(C0), xvec11; addss 0*SIZE(C0, ldc, 1), xvec12; addss 0*SIZE(C1), xvec13; addss 0*SIZE(C1, ldc, 1), xvec14; #endif movss xvec11, 0*SIZE(C0); movss xvec12, 0*SIZE(C0, ldc, 1); movss xvec13, 0*SIZE(C1); movss xvec14, 0*SIZE(C1, ldc, 1); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL)&&defined(LEFT) ADDQ $1, kk #endif ADDQ $1*SIZE, C0; ADDQ $1*SIZE, C1; .L24_loopE: #if defined(TRMMKERNEL)&&!defined(LEFT) ADDQ $4, kk #endif MOVQ bk, k; SALQ $4, k; ADDQ k, bb; LEAQ (C, ldc, 4), C; .L20_loopE: TEST $2, bn; JLE .L30_loopE; ALIGN_4 .L30_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; MOVQ %rax, kk #endif MOVQ C, C0; LEAQ (C, ldc, 1), C1; MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L31_loopE; ALIGN_4 .L31_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #### Initial #### XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec14, yvec14, yvec14; XOR_SY yvec13, yvec13, yvec13; XOR_SY yvec12, yvec12, yvec12; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; #else ADDQ $2, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L311_loopE; ALIGN_4 .L311_bodyB: LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; LD_SX 0*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec13, xvec13; MUL_SX xvec1, xvec6, xvec6; ADD_SX xvec6, xvec12, xvec12; SHUF_SX $0xfa, xvec2, xvec3; LD_SX 8*SIZE(ptrba), xvec0; LD_SX 12*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec13, xvec13; MUL_SX xvec1, xvec6, xvec6; ADD_SX xvec6, xvec12, xvec12; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; LD_SX 16*SIZE(ptrba), xvec0; LD_SX 20*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec13, xvec13; MUL_SX xvec1, xvec6, xvec6; ADD_SX xvec6, xvec12, xvec12; SHUF_SX $0xfa, xvec2, xvec3; LD_SX 24*SIZE(ptrba), xvec0; LD_SX 28*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; MUL_SX xvec0, xvec5, 
xvec5; ADD_SX xvec5, xvec13, xvec13; MUL_SX xvec1, xvec6, xvec6; ADD_SX xvec6, xvec12, xvec12; ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L311_bodyB; ALIGN_4 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L312_loopE; ALIGN_4 .L312_bodyB: LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; LD_SX 0*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec13, xvec13; MUL_SX xvec1, xvec6, xvec6; ADD_SX xvec6, xvec12, xvec12; SHUF_SX $0xfa, xvec2, xvec3; LD_SX 8*SIZE(ptrba), xvec0; LD_SX 12*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec13, xvec13; MUL_SX xvec1, xvec6, xvec6; ADD_SX xvec6, xvec12, xvec12; ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L312_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L313_loopE; ALIGN_4 .L313_bodyB: LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; LD_SX 0*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec1, xvec4, xvec4; ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec13, xvec13; MUL_SX xvec1, xvec6, xvec6; ADD_SX xvec6, xvec12, xvec12; ADDQ $8*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L313_loopE: BROAD_SX MEMALPHA, xvec7; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec7, xvec14, xvec14; MUL_SX xvec7, xvec13, xvec13; MUL_SX xvec7, xvec12, xvec12; #### Writing Back #### #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 2*SIZE(C1), xvec0, xvec0; LDL_SX 4*SIZE(C0), xvec1, xvec1; LDH_SX 6*SIZE(C1), xvec1, xvec1; LDL_SX 0*SIZE(C1), xvec2, xvec2; LDH_SX 2*SIZE(C0), xvec2, xvec2; LDL_SX 4*SIZE(C1), xvec3, xvec3; LDH_SX 6*SIZE(C0), xvec3, xvec3; ADD_SX xvec0, xvec15, xvec15; ADD_SX xvec1, xvec14, xvec14; ADD_SX xvec2, xvec13, xvec13; ADD_SX xvec3, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); STL_SX xvec14, 4*SIZE(C0); STH_SX xvec14, 6*SIZE(C1); STL_SX xvec13, 0*SIZE(C1); STH_SX xvec13, 2*SIZE(C0); STL_SX xvec12, 4*SIZE(C1); STH_SX xvec12, 6*SIZE(C0); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $8, kk #endif ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L31_bodyB; ALIGN_4 .L31_loopE: TEST $4, bm; JLE .L32_loopE; ALIGN_4 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #### Initial #### XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec14, yvec14, yvec14; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; #else ADDQ $2, %rax; #endif MOVQ %rax, kkk; #endif SARQ 
$2, k; JLE .L321_loopE; ALIGN_4 .L321_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; SHUF_SX $0x05, xvec2, xvec4; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec15, xvec15; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec14, xvec14; LD_SX 4*SIZE(ptrba), xvec0; SHUF_SX $0xfa, xvec2, xvec5; SHUF_SX $0xaf, xvec2, xvec6; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec15, xvec15; MUL_SX xvec0, xvec6, xvec6; ADD_SX xvec6, xvec14, xvec14; LD_SX 8*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; SHUF_SX $0x05, xvec2, xvec4; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec15, xvec15; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec14, xvec14; LD_SX 12*SIZE(ptrba), xvec0; SHUF_SX $0xfa, xvec2, xvec5; SHUF_SX $0xaf, xvec2, xvec6; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec15, xvec15; MUL_SX xvec0, xvec6, xvec6; ADD_SX xvec6, xvec14, xvec14; ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L321_bodyB; ALIGN_4 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L322_loopE; ALIGN_4 .L322_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; SHUF_SX $0x05, xvec2, xvec4; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec15, xvec15; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec14, xvec14; LD_SX 4*SIZE(ptrba), xvec0; SHUF_SX $0xfa, xvec2, xvec5; SHUF_SX $0xaf, xvec2, xvec6; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec15, xvec15; MUL_SX xvec0, xvec6, xvec6; ADD_SX xvec6, xvec14, xvec14; ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L322_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L323_loopE; ALIGN_4 .L323_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; SHUF_SX $0x05, xvec2, xvec4; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec15, xvec15; MUL_SX xvec0, xvec4, xvec4; ADD_SX xvec4, xvec14, xvec14; ADDQ $4*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L323_loopE: BROAD_SX MEMALPHA, xvec7; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec7, xvec14, xvec14; #### Writing back #### #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 2*SIZE(C1), xvec0, xvec0; LDL_SX 0*SIZE(C1), xvec1, xvec1; LDH_SX 2*SIZE(C0), xvec1, xvec1; ADD_SX xvec0, xvec15, xvec15; ADD_SX xvec1, xvec14, xvec14; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); STL_SX xvec14, 0*SIZE(C1); STH_SX xvec14, 2*SIZE(C0); #if (defined(TRMMKERNEL)&& defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&& !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk #endif ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; .L32_loopE: TEST $2, bm; JLE .L33_loopE; ALIGN_4 .L33_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #### Initial #### XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec14, yvec14, yvec14; XOR_SY yvec13, yvec13, yvec13; XOR_SY yvec12, yvec12, yvec12; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; #else ADDQ $2, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L331_loopE; ALIGN_4 .L331_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # 
a0, a1, a2, a3 EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 ODUP_SX 0*SIZE(ptrbb), xvec3; # b1, b1, b3, b3 MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; LD_SX 4*SIZE(ptrba), xvec0; EDUP_SX 4*SIZE(ptrbb), xvec2; ODUP_SX 4*SIZE(ptrbb), xvec3; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L331_bodyB; ALIGN_4 .L331_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L332_loopE; ALIGN_4 .L332_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 ODUP_SX 0*SIZE(ptrbb), xvec3; # b1, b1, b3, b3 MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; MUL_SX xvec0, xvec3, xvec3; ADD_SX xvec3, xvec14, xvec14; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L332_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L333_loopE; ALIGN_4 .L333_bodyB: movss 0*SIZE(ptrba), xvec0; movss 1*SIZE(ptrba), xvec1; movss 0*SIZE(ptrbb), xvec2; XOR_SY yvec3, yvec3, yvec3; movss xvec2, xvec3; mulss xvec0, xvec2; addss xvec2, xvec15; mulss xvec1, xvec3; SHUF_SX $0xe1, xvec3, xvec4; ADD_SX xvec4, xvec15, xvec15; movss 1*SIZE(ptrbb), xvec5; XOR_SY yvec6, yvec6, yvec6; movss xvec5, xvec6; mulss xvec0, xvec5; addss xvec5, xvec14; mulss xvec1, xvec6; SHUF_SX $0xe1, xvec6, xvec7; ADD_SX xvec7, xvec14, xvec14 ADDQ $2*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L333_loopE: BROAD_SX MEMALPHA, xvec7; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec7, xvec14, xvec14; SHUF_SX $0xee, xvec15, xvec13; SHUF_SX $0xee, xvec14, xvec12; SHUF_SX $0x44, xvec15, xvec11; SHUF_SX $0x44, xvec14, xvec10; ADD_SX xvec13, xvec11, xvec11; ADD_SX xvec12, xvec10, xvec10; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDL_SX 0*SIZE(C1), xvec1, xvec1; ADD_SX xvec0, xvec11, xvec11; ADD_SX xvec1, xvec10, xvec10; #endif STL_SX xvec11, 0*SIZE(C0); STL_SX xvec10, 0*SIZE(C1); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk; #endif ADDQ $2*SIZE, C0; ADDQ $2*SIZE, C1; #### Writing Back #### .L33_loopE: TEST $1, bm; JLE .L34_loopE; ALIGN_4 .L34_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #### Initial #### XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec14, yvec14, yvec14; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; #else ADDQ $2, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L341_loopE; ALIGN_4 .L341_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; mulss xvec0, xvec1; addss xvec1, xvec15; movss 1*SIZE(ptrbb), xvec2; mulss xvec0, xvec2; addss xvec2, xvec14; movss 1*SIZE(ptrba), xvec0; movss 2*SIZE(ptrbb), xvec1; mulss xvec0, xvec1; addss xvec1, xvec15; movss 3*SIZE(ptrbb), xvec2; mulss xvec0, xvec2; addss xvec2, xvec14; movss 2*SIZE(ptrba), xvec0; movss 4*SIZE(ptrbb), xvec1; mulss xvec0, xvec1; addss xvec1, xvec15; movss 5*SIZE(ptrbb), xvec2; mulss xvec0, xvec2; 
addss xvec2, xvec14; movss 3*SIZE(ptrba), xvec0; movss 6*SIZE(ptrbb), xvec1; mulss xvec0, xvec1; addss xvec1, xvec15; movss 7*SIZE(ptrbb), xvec2; mulss xvec0, xvec2; addss xvec2, xvec14; addq $4*SIZE, ptrba; addq $8*SIZE, ptrbb; decq k; jg .L341_bodyB; ALIGN_4 .L341_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L342_loopE; ALIGN_4 .L342_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; mulss xvec0, xvec1; addss xvec1, xvec15; movss 1*SIZE(ptrbb), xvec2; mulss xvec0, xvec2; addss xvec2, xvec14; movss 1*SIZE(ptrba), xvec0; movss 2*SIZE(ptrbb), xvec1; mulss xvec0, xvec1; addss xvec1, xvec15; movss 3*SIZE(ptrbb), xvec2; mulss xvec0, xvec2; addss xvec2, xvec14; addq $2*SIZE, ptrba; addq $4*SIZE, ptrbb; .L342_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L343_loopE; ALIGN_4 .L343_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; mulss xvec0, xvec1; addss xvec1, xvec15; movss 1*SIZE(ptrbb), xvec2; mulss xvec0, xvec2; addss xvec2, xvec14; addq $1*SIZE, ptrba; addq $2*SIZE, ptrbb .L343_loopE: #### Writing back #### movss MEMALPHA, xvec7; mulss xvec7, xvec15; mulss xvec7, xvec14; movss 0*SIZE(C0), xvec0; movss 0*SIZE(C1), xvec1; #ifndef TRMMKERNEL addss xvec0, xvec15; addss xvec1, xvec14; #endif movss xvec15, 0*SIZE(C0); movss xvec14, 0*SIZE(C1); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $1, kk; #endif addq $1*SIZE, C0; addq $1*SIZE, C1; .L34_loopE: #if defined(TRMMKERNEL) && !defined(LEFT) ADDQ $2, kk; #endif MOVQ bk, k; SALQ $3, k; ADDQ k, bb; LEAQ (C, ldc, 2), C; .L30_loopE: TEST $1, bn; JLE .L40_loopE; ALIGN_4 .L40_bodyB: #if defined(TRMMKERNEL)&&defined(LEFT) MOVQ OFFSET, %rax; MOVQ %rax, kk; #endif MOVQ C, C0; MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L41_loopE; ALIGN_4 .L41_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax LEAQ (, %rax, SIZE), %rax; LEAQ (ptrba, %rax, 8), ptrba; ADDQ %rax, ptrbb; #endif #### initial #### XOR_SY yvec15, yvec15, yvec15; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; #else ADDQ $1, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L411_loopE; ALIGN_4 .L411_bodyB: LD_SY 0*SIZE(ptrba), yvec0; BROAD_SY 0*SIZE(ptrbb), yvec1; MUL_SY yvec0, yvec1, yvec2; ADD_SY yvec2, yvec15, yvec15; LD_SY 8*SIZE(ptrba), yvec0; BROAD_SY 1*SIZE(ptrbb), yvec1; MUL_SY yvec0, yvec1, yvec2; ADD_SY yvec2, yvec15, yvec15; LD_SY 16*SIZE(ptrba), yvec0; BROAD_SY 2*SIZE(ptrbb), yvec1; MUL_SY yvec0, yvec1, yvec2; ADD_SY yvec2, yvec15, yvec15; LD_SY 24*SIZE(ptrba), yvec0; BROAD_SY 3*SIZE(ptrbb), yvec1; MUL_SY yvec0, yvec1, yvec2; ADD_SY yvec2, yvec15, yvec15; ADDQ $32*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L411_bodyB; ALIGN_4 .L411_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L412_loopE; ALIGN_4 .L412_bodyB: LD_SY 0*SIZE(ptrba), yvec0; BROAD_SY 0*SIZE(ptrbb), yvec1; MUL_SY yvec0, yvec1, yvec2; ADD_SY yvec2, yvec15, yvec15; LD_SY 8*SIZE(ptrba), yvec0; BROAD_SY 1*SIZE(ptrbb), yvec1; MUL_SY yvec0, yvec1, yvec2; ADD_SY yvec2, yvec15, yvec15; ADDQ $16*SIZE, ptrba; ADDQ 
$2*SIZE, ptrbb; .L412_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L413_loopE; ALIGN_4 .L413_bodyB: LD_SY 0*SIZE(ptrba), yvec0; BROAD_SY 0*SIZE(ptrbb), yvec1; MUL_SY yvec0, yvec1, yvec2; ADD_SY yvec2, yvec15, yvec15; ADDQ $8*SIZE, ptrba; ADDQ $1*SIZE, ptrbb; .L413_loopE: #### Writing #### BROAD_SY MEMALPHA, yvec7; MUL_SY yvec7, yvec15, yvec15; EXTRA_SY $1, yvec15, xvec14; SHUF_SX $0x44, xvec15, xvec13; SHUF_SX $0xee, xvec15, xvec12; SHUF_SX $0x44, xvec14, xvec11; SHUF_SX $0xee, xvec14, xvec10; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDL_SX 2*SIZE(C0), xvec1, xvec1; LDL_SX 4*SIZE(C0), xvec2, xvec2; LDL_SX 6*SIZE(C0), xvec3, xvec3; ADD_SX xvec0, xvec13, xvec13; ADD_SX xvec1, xvec12, xvec12; ADD_SX xvec2, xvec11, xvec11; ADD_SX xvec3, xvec10, xvec10; #endif STL_SX xvec13, 0*SIZE(C0); STL_SX xvec12, 2*SIZE(C0); STL_SX xvec11, 4*SIZE(C0); STL_SX xvec10, 6*SIZE(C0); #if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 8), ptrba; ADDQ %rax, ptrbb; #endif #if defined(TRMMKERNEL)&&defined(LEFT) ADDQ $8, kk; #endif ADDQ $8*SIZE, C0; DECQ i; JG .L41_bodyB; ALIGN_4 .L41_loopE: TEST $4, bm; JLE .L42_loopE; ALIGN_4 .L42_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 4), ptrba; ADDQ %rax, ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; #else ADDQ $1, %rax; #endif MOVQ %rax, kkk #endif SARQ $2, k; JLE .L421_loopE; ALIGN_4 .L421_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; MUL_SX xvec0, xvec1, xvec1; ADD_SX xvec1, xvec15, xvec15; LD_SX 4*SIZE(ptrba), xvec0; BROAD_SX 1*SIZE(ptrbb), xvec1; MUL_SX xvec0, xvec1, xvec1; ADD_SX xvec1, xvec15, xvec15; LD_SX 8*SIZE(ptrba), xvec0; BROAD_SX 2*SIZE(ptrbb), xvec1; MUL_SX xvec0, xvec1, xvec1; ADD_SX xvec1, xvec15, xvec15; LD_SX 12*SIZE(ptrba), xvec0; BROAD_SX 3*SIZE(ptrbb), xvec1; MUL_SX xvec0, xvec1, xvec1; ADD_SX xvec1, xvec15, xvec15; ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L421_bodyB; ALIGN_4 .L421_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L422_loopE; ALIGN_4 .L422_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; MUL_SX xvec0, xvec1, xvec1; ADD_SX xvec1, xvec15, xvec15; LD_SX 4*SIZE(ptrba), xvec0; BROAD_SX 1*SIZE(ptrbb), xvec1; MUL_SX xvec0, xvec1, xvec1; ADD_SX xvec1, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L422_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L423_loopE; ALIGN_4 .L423_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; MUL_SX xvec0, xvec1, xvec1; ADD_SX xvec1, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $1*SIZE, ptrbb; .L423_loopE: #### Writing back #### BROAD_SX MEMALPHA, xvec7; MUL_SX xvec7, xvec15, xvec15; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 2*SIZE(C0), xvec0, xvec0; ADD_SX xvec0, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (, %rax, SIZE), 
%rax; LEAQ (ptrba, %rax, 4), ptrba; ADDQ %rax, ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk #endif ADDQ $4*SIZE, C0; .L42_loopE: TEST $2, bm; JLE .L43_loopE; ALIGN_4 .L43_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax LEAQ (, %rax, SIZE), %rax LEAQ (ptrba, %rax, 2), ptrba ADDQ %rax, ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec14, yvec14, yvec14; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; #else ADDQ $1, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L431_loopE; ALIGN_4 .L431_bodyB: vmovss 0*SIZE(ptrba), xvec0; vmovss 1*SIZE(ptrba), xvec1; vmovss 0*SIZE(ptrbb), xvec2; vmulss xvec2, xvec0, xvec0; vaddss xvec0, xvec15, xvec15; vmulss xvec2, xvec1, xvec1; vaddss xvec1, xvec14, xvec14; vmovss 2*SIZE(ptrba), xvec3; vmovss 3*SIZE(ptrba), xvec4; vmovss 1*SIZE(ptrbb), xvec5; vmulss xvec5, xvec3, xvec3; vaddss xvec3, xvec15, xvec15; vmulss xvec5, xvec4, xvec4; vaddss xvec4, xvec14, xvec14; vmovss 4*SIZE(ptrba), xvec0; vmovss 5*SIZE(ptrba), xvec1; vmovss 2*SIZE(ptrbb), xvec2; vmulss xvec2, xvec0, xvec0; vaddss xvec0, xvec15, xvec15; vmulss xvec2, xvec1, xvec1; vaddss xvec1, xvec14, xvec14; vmovss 6*SIZE(ptrba), xvec3; vmovss 7*SIZE(ptrba), xvec4; vmovss 3*SIZE(ptrbb), xvec5; vmulss xvec5, xvec3, xvec3; vaddss xvec3, xvec15, xvec15; vmulss xvec5, xvec4, xvec4; vaddss xvec4, xvec14, xvec14; addq $8*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; JG .L431_bodyB; ALIGN_4 .L431_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L432_loopE; ALIGN_4 .L432_bodyB: vmovss 0*SIZE(ptrba), xvec0; vmovss 1*SIZE(ptrba), xvec1; vmovss 0*SIZE(ptrbb), xvec2; vmulss xvec2, xvec0, xvec0; vaddss xvec0, xvec15, xvec15; vmulss xvec2, xvec1, xvec1; vaddss xvec1, xvec14, xvec14; vmovss 2*SIZE(ptrba), xvec3; vmovss 3*SIZE(ptrba), xvec4; vmovss 1*SIZE(ptrbb), xvec5; vmulss xvec5, xvec3, xvec3; vaddss xvec3, xvec15, xvec15; vmulss xvec5, xvec4, xvec4; vaddss xvec4, xvec14, xvec14; addq $4*SIZE, ptrba; addq $2*SIZE, ptrbb; .L432_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L433_loopE; ALIGN_4 .L433_bodyB: vmovss 0*SIZE(ptrba), xvec0; vmovss 1*SIZE(ptrba), xvec1; vmovss 0*SIZE(ptrbb), xvec2; vmulss xvec2, xvec0, xvec0; vaddss xvec0, xvec15, xvec15; vmulss xvec2, xvec1, xvec1; vaddss xvec1, xvec14, xvec14; addq $2*SIZE, ptrba; addq $1*SIZE, ptrbb; .L433_loopE: #### Writing Back #### vmovss MEMALPHA, xvec7; vmulss xvec7, xvec15, xvec15; vmulss xvec7, xvec14, xvec14; #ifndef TRMMKERNEL vaddss 0*SIZE(C0), xvec15, xvec15; vaddss 1*SIZE(C0), xvec14, xvec14; #endif vmovss xvec15, 0*SIZE(C0); vmovss xvec14, 1*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; LEAQ (ptrba, %rax, 2), ptrba; ADDQ %rax, ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, kk #endif addq $2*SIZE, C0; .L43_loopE: TEST $1, bm; JLE .L44_loopE; ALIGN_4 .L44_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; ADDQ %rax, ptrba; ADDQ %rax, ptrbb; 
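/* TRMM build: advance both packed pointers by kk elements so this 1x1 tail
   skips the part of the A column and B row that the triangular offset
   excludes from the update. */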
#endif XOR_SY yvec15, yvec15, yvec15; #ifndef TRMMKERNEL MOVQ bk, k; #elif (defined(LEFT)&& !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; #else ADDQ $1, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L441_loopE; ALIGN_4 .L441_bodyB: vmovss 0*SIZE(ptrba), xvec0; vmovss 0*SIZE(ptrbb), xvec1; vmulss xvec0, xvec1, xvec1; vaddss xvec1, xvec15, xvec15; vmovss 1*SIZE(ptrba), xvec0; vmovss 1*SIZE(ptrbb), xvec1; vmulss xvec0, xvec1, xvec1; vaddss xvec1, xvec15, xvec15; vmovss 2*SIZE(ptrba), xvec0; vmovss 2*SIZE(ptrbb), xvec1; vmulss xvec0, xvec1, xvec1; vaddss xvec1, xvec15, xvec15; vmovss 3*SIZE(ptrba), xvec0; vmovss 3*SIZE(ptrbb), xvec1; vmulss xvec0, xvec1, xvec1; vaddss xvec1, xvec15, xvec15; addq $4*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; JG .L441_bodyB; ALIGN_4 .L441_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L442_loopE; ALIGN_4 .L442_bodyB: vmovss 0*SIZE(ptrba), xvec0; vmovss 0*SIZE(ptrbb), xvec1; vmulss xvec0, xvec1, xvec1; vaddss xvec1, xvec15, xvec15; vmovss 1*SIZE(ptrba), xvec0; vmovss 1*SIZE(ptrbb), xvec1; vmulss xvec0, xvec1, xvec1; vaddss xvec1, xvec15, xvec15; addq $2*SIZE, ptrba; addq $2*SIZE, ptrbb; .L442_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L443_loopE; ALIGN_4 .L443_bodyB: vmovss 0*SIZE(ptrba), xvec0; vmovss 0*SIZE(ptrbb), xvec1; vmulss xvec0, xvec1, xvec1; vaddss xvec1, xvec15, xvec15; addq $1*SIZE, ptrba; addq $1*SIZE, ptrbb; .L443_loopE: #### Writing Back #### vmovss MEMALPHA, xvec7; vmulss xvec7, xvec15, xvec15; #ifndef TRMMKERNEL vaddss 0*SIZE(C0), xvec15, xvec15; #endif vmovss xvec15, 0*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; LEAQ (,%rax, SIZE), %rax; ADDQ %rax, ptrba; ADDQ %rax, ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, kk #endif addq $1*SIZE, C0; .L44_loopE: MOV bk, k; SALQ $2, k; ADDQ k, bb; ADDQ ldc, C; .L40_loopE: movq 0(%rsp), %rbx; movq 8(%rsp), %rbp; movq 16(%rsp), %r12; movq 24(%rsp), %r13; movq 32(%rsp), %r14; movq 40(%rsp), %r15; vzeroupper #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp; ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/sgemv_n.S000066400000000000000000003441661313527062700171620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "l2param.h" #if GEMV_UNROLL < 4 #undef GEMV_UNROLL #define GEMV_UNROLL 4 #endif #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_M %rdi #define OLD_N %rsi #define OLD_A %rcx #define OLD_LDA %r8 #define STACK_INCX 8 + STACKSIZE(%rsp) #define STACK_Y 16 + STACKSIZE(%rsp) #define STACK_INCY 24 + STACKSIZE(%rsp) #define STACK_BUFFER 32 + STACKSIZE(%rsp) #define ALPHA 48 (%rsp) #define MMM 56(%rsp) #define NN 64(%rsp) #define AA 72(%rsp) #define LDAX 80(%rsp) #define XX 96(%rsp) #else #define STACKSIZE 288 #define OLD_M %rcx #define OLD_N %rdx #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_LDA 48 + STACKSIZE(%rsp) #define OLD_X 56 + STACKSIZE(%rsp) #define STACK_INCX 64 + STACKSIZE(%rsp) #define STACK_Y 72 + STACKSIZE(%rsp) #define STACK_INCY 80 + STACKSIZE(%rsp) #define STACK_BUFFER 88 + STACKSIZE(%rsp) #define ALPHA 224 (%rsp) #define MMM 232(%rsp) #define NN 240(%rsp) #define AA 248(%rsp) #define LDAX 256(%rsp) #define XX 264(%rsp) #endif #define LDA %r8 #define X %r9 #define INCX %rsi #define INCY %rdi #define M %r10 #define N %r11 #define A %r12 #define Y %r14 #define BUFFER %r13 #define I %rax #define A1 %rbx #define A2 %rcx #define LDA3 %rdx #define Y1 %rbp #ifdef ALIGNED_ACCESS #define MM %r15 #else #define MM M #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq OLD_M, M movq OLD_N, N movq OLD_A, A movq OLD_LDA, LDA movq OLD_X, X #else movq OLD_M, M movq OLD_N, N movq OLD_A, A movq OLD_LDA, LDA #endif #ifndef WINDOWS_ABI movss %xmm0, ALPHA #else movss %xmm3, ALPHA #endif movq M,MMM movq A,AA movq N,NN movq LDA,LDAX movq X,XX movq STACK_Y, Y .L0t: xorq I,I addq $1,I salq $22,I subq I,MMM movq I,M jge .L00t movq MMM,M addq I,M jle .L999x .L00t: movq AA,A movq NN,N movq LDAX,LDA movq XX,X movq STACK_INCX, INCX movq STACK_INCY, INCY movq STACK_BUFFER, BUFFER leaq (,INCX, SIZE), INCX leaq (,INCY, SIZE), INCY leaq (,LDA, SIZE), LDA leaq (LDA, LDA, 2), LDA3 #ifdef 
ALIGNED_ACCESS movq M, MM testq $4 * SIZE - 1, A je .L0X cmpq $3, M jle .L0X movq A, MM sarq $BASE_SHIFT, MM andq $3, MM subq $4, MM addq M, MM .L0X: #endif testq N, N # if n <= 0 goto END jle .L999 testq M, M # if n <= 0 goto END jle .L999 subq $-32 * SIZE, A movq BUFFER, Y1 pxor %xmm0, %xmm0 movq M, %rax #ifdef ALIGNED_ACCESS addq $19, %rax #else addq $16, %rax #endif sarq $4, %rax ALIGN_3 .L01: movaps %xmm0, 0 * SIZE(Y1) movaps %xmm0, 4 * SIZE(Y1) movaps %xmm0, 8 * SIZE(Y1) movaps %xmm0, 12 * SIZE(Y1) addq $16 * SIZE, Y1 decq %rax jg .L01 ALIGN_3 .L10: #ifdef ALIGNED_ACCESS movq A, %rax andq $4 * SIZE - 1, %rax addq %rax, BUFFER testq $4 * SIZE - 1, LDA jne .L100 #endif #if GEMV_UNROLL >= 8 cmpq $8, N jl .L20 ALIGN_3 .L11: subq $8, N leaq 32 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 4), A2 leaq (A, LDA, 8), A movss (X), %xmm8 addq INCX, X movss (X), %xmm9 addq INCX, X movss (X), %xmm10 addq INCX, X movss (X), %xmm11 addq INCX, X movss (X), %xmm12 addq INCX, X movss (X), %xmm13 addq INCX, X movss (X), %xmm14 addq INCX, X movss (X), %xmm15 addq INCX, X movss ALPHA, %xmm0 mulss %xmm0, %xmm8 shufps $0, %xmm8, %xmm8 mulss %xmm0, %xmm9 shufps $0, %xmm9, %xmm9 mulss %xmm0, %xmm10 shufps $0, %xmm10, %xmm10 mulss %xmm0, %xmm11 shufps $0, %xmm11, %xmm11 mulss %xmm0, %xmm12 shufps $0, %xmm12, %xmm12 mulss %xmm0, %xmm13 shufps $0, %xmm13, %xmm13 mulss %xmm0, %xmm14 shufps $0, %xmm14, %xmm14 mulss %xmm0, %xmm15 shufps $0, %xmm15, %xmm15 #ifdef ALIGNED_ACCESS cmpq $3, M jle .L17 testq $SIZE, A1 je .L1X movss -32 * SIZE(A1), %xmm4 movss -32 * SIZE(A1, LDA, 1), %xmm5 movss -32 * SIZE(A1, LDA, 2), %xmm6 movss -32 * SIZE(A1, LDA3, 1), %xmm7 movss -32 * SIZE(Y1), %xmm0 mulss %xmm8, %xmm4 addss %xmm4, %xmm0 movss -32 * SIZE(A2), %xmm4 mulss %xmm9, %xmm5 addss %xmm5, %xmm0 movss -32 * SIZE(A2, LDA, 1), %xmm5 mulss %xmm10, %xmm6 addss %xmm6, %xmm0 movss -32 * SIZE(A2, LDA, 2), %xmm6 mulss %xmm11, %xmm7 addss %xmm7, %xmm0 movss -32 * SIZE(A2, LDA3, 1), %xmm7 mulss %xmm12, %xmm4 addss %xmm4, %xmm0 mulss %xmm13, %xmm5 addss %xmm5, %xmm0 mulss %xmm14, %xmm6 addss %xmm6, %xmm0 mulss %xmm15, %xmm7 addss %xmm7, %xmm0 movss %xmm0, -32 * SIZE(Y1) addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, Y1 ALIGN_3 .L1X: testq $2 * SIZE, A1 je .L1XX movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(A1, LDA, 1), %xmm5 movsd -32 * SIZE(A1, LDA, 2), %xmm6 movsd -32 * SIZE(A1, LDA3, 1), %xmm7 movsd -32 * SIZE(Y1), %xmm0 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movsd -32 * SIZE(A2), %xmm4 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 movsd -32 * SIZE(A2, LDA, 1), %xmm5 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movsd -32 * SIZE(A2, LDA, 2), %xmm6 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 movsd -32 * SIZE(A2, LDA3, 1), %xmm7 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 addps %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 addps %xmm7, %xmm0 movlps %xmm0, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L1XX: #endif movq MM, I sarq $4, I jle .L15 MOVUPS_A1 (-32 * SIZE, A1, %xmm4) MOVUPS_A1 (-28 * SIZE, A1, %xmm5) MOVUPS_A1 (-24 * SIZE, A1, %xmm6) MOVUPS_A1 (-20 * SIZE, A1, %xmm7) MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) decq I jle .L14 ALIGN_3 .L13: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) #endif mulps %xmm8, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm4) mulps %xmm8, %xmm5 addps %xmm5, %xmm1 MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm5) mulps %xmm8, 
%xmm6 addps %xmm6, %xmm2 MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm6) mulps %xmm8, %xmm7 addps %xmm7, %xmm3 MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm7) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) #endif mulps %xmm9, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm4) mulps %xmm9, %xmm5 addps %xmm5, %xmm1 MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm5) mulps %xmm9, %xmm6 addps %xmm6, %xmm2 MOVUPS_A2 (-24 * SIZE, A1, LDA, 2, %xmm6) mulps %xmm9, %xmm7 addps %xmm7, %xmm3 MOVUPS_A2 (-20 * SIZE, A1, LDA, 2, %xmm7) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 2) #endif mulps %xmm10, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm4) mulps %xmm10, %xmm5 addps %xmm5, %xmm1 MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm5) mulps %xmm10, %xmm6 addps %xmm6, %xmm2 MOVUPS_A2 (-24 * SIZE, A1, LDA3, 1, %xmm6) mulps %xmm10, %xmm7 addps %xmm7, %xmm3 MOVUPS_A2 (-20 * SIZE, A1, LDA3, 1, %xmm7) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA3) #endif mulps %xmm11, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1 (-32 * SIZE, A2, %xmm4) mulps %xmm11, %xmm5 addps %xmm5, %xmm1 MOVUPS_A1 (-28 * SIZE, A2, %xmm5) mulps %xmm11, %xmm6 addps %xmm6, %xmm2 MOVUPS_A1 (-24 * SIZE, A2, %xmm6) mulps %xmm11, %xmm7 addps %xmm7, %xmm3 MOVUPS_A1 (-20 * SIZE, A2, %xmm7) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) #endif mulps %xmm12, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm4) mulps %xmm12, %xmm5 addps %xmm5, %xmm1 MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm5) mulps %xmm12, %xmm6 addps %xmm6, %xmm2 MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm6) mulps %xmm12, %xmm7 addps %xmm7, %xmm3 MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm7) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) #endif mulps %xmm13, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm4) mulps %xmm13, %xmm5 addps %xmm5, %xmm1 MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm5) mulps %xmm13, %xmm6 addps %xmm6, %xmm2 MOVUPS_A2 (-24 * SIZE, A2, LDA, 2, %xmm6) mulps %xmm13, %xmm7 addps %xmm7, %xmm3 MOVUPS_A2 (-20 * SIZE, A2, LDA, 2, %xmm7) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 2) #endif mulps %xmm14, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm4) mulps %xmm14, %xmm5 addps %xmm5, %xmm1 MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm5) mulps %xmm14, %xmm6 addps %xmm6, %xmm2 MOVUPS_A2 (-24 * SIZE, A2, LDA3, 1, %xmm6) mulps %xmm14, %xmm7 addps %xmm7, %xmm3 MOVUPS_A2 (-20 * SIZE, A2, LDA3, 1, %xmm7) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA3) #endif mulps %xmm15, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1 (-16 * SIZE, A1, %xmm4) mulps %xmm15, %xmm5 addps %xmm5, %xmm1 MOVUPS_A1 (-12 * SIZE, A1, %xmm5) mulps %xmm15, %xmm6 addps %xmm6, %xmm2 MOVUPS_A1 ( -8 * SIZE, A1, %xmm6) mulps %xmm15, %xmm7 addps %xmm7, %xmm3 MOVUPS_A1 ( -4 * SIZE, A1, %xmm7) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L13 ALIGN_3 .L14: mulps %xmm8, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm4) mulps %xmm8, %xmm5 addps %xmm5, %xmm1 MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm5) mulps %xmm8, %xmm6 addps %xmm6, %xmm2 MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, 
%xmm6) mulps %xmm8, %xmm7 addps %xmm7, %xmm3 MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm7) mulps %xmm9, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm4) mulps %xmm9, %xmm5 addps %xmm5, %xmm1 MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm5) mulps %xmm9, %xmm6 addps %xmm6, %xmm2 MOVUPS_A2 (-24 * SIZE, A1, LDA, 2, %xmm6) mulps %xmm9, %xmm7 addps %xmm7, %xmm3 MOVUPS_A2 (-20 * SIZE, A1, LDA, 2, %xmm7) mulps %xmm10, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm4) mulps %xmm10, %xmm5 addps %xmm5, %xmm1 MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm5) mulps %xmm10, %xmm6 addps %xmm6, %xmm2 MOVUPS_A2 (-24 * SIZE, A1, LDA3, 1, %xmm6) mulps %xmm10, %xmm7 addps %xmm7, %xmm3 MOVUPS_A2 (-20 * SIZE, A1, LDA3, 1, %xmm7) mulps %xmm11, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1 (-32 * SIZE, A2, %xmm4) mulps %xmm11, %xmm5 addps %xmm5, %xmm1 MOVUPS_A1 (-28 * SIZE, A2, %xmm5) mulps %xmm11, %xmm6 addps %xmm6, %xmm2 MOVUPS_A1 (-24 * SIZE, A2, %xmm6) mulps %xmm11, %xmm7 addps %xmm7, %xmm3 MOVUPS_A1 (-20 * SIZE, A2, %xmm7) mulps %xmm12, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm4) mulps %xmm12, %xmm5 addps %xmm5, %xmm1 MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm5) mulps %xmm12, %xmm6 addps %xmm6, %xmm2 MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm6) mulps %xmm12, %xmm7 addps %xmm7, %xmm3 MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm7) mulps %xmm13, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm4) mulps %xmm13, %xmm5 addps %xmm5, %xmm1 MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm5) mulps %xmm13, %xmm6 addps %xmm6, %xmm2 MOVUPS_A2 (-24 * SIZE, A2, LDA, 2, %xmm6) mulps %xmm13, %xmm7 addps %xmm7, %xmm3 MOVUPS_A2 (-20 * SIZE, A2, LDA, 2, %xmm7) mulps %xmm14, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm4) mulps %xmm14, %xmm5 addps %xmm5, %xmm1 MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm5) mulps %xmm14, %xmm6 addps %xmm6, %xmm2 MOVUPS_A2 (-24 * SIZE, A2, LDA3, 1, %xmm6) mulps %xmm14, %xmm7 addps %xmm7, %xmm3 MOVUPS_A2 (-20 * SIZE, A2, LDA3, 1, %xmm7) mulps %xmm15, %xmm4 addps %xmm4, %xmm0 mulps %xmm15, %xmm5 addps %xmm5, %xmm1 mulps %xmm15, %xmm6 addps %xmm6, %xmm2 mulps %xmm15, %xmm7 addps %xmm7, %xmm3 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 ALIGN_3 .L15: testq $8, MM je .L16 MOVUPS_A1 (-32 * SIZE, A1, %xmm4) MOVUPS_A1 (-28 * SIZE, A1, %xmm5) MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm6) MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm7) MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) mulps %xmm8, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm4) mulps %xmm8, %xmm5 addps %xmm5, %xmm1 MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm5) mulps %xmm9, %xmm6 addps %xmm6, %xmm0 MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm6) mulps %xmm9, %xmm7 addps %xmm7, %xmm1 MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm7) mulps %xmm10, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1 (-32 * SIZE, A2, %xmm4) mulps %xmm10, %xmm5 addps %xmm5, %xmm1 MOVUPS_A1 (-28 * SIZE, A2, %xmm5) mulps %xmm11, %xmm6 addps %xmm6, %xmm0 MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm6) mulps %xmm11, %xmm7 addps %xmm7, %xmm1 MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm7) mulps %xmm12, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm4) mulps %xmm12, %xmm5 addps %xmm5, %xmm1 MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm5) mulps %xmm13, %xmm6 addps %xmm6, %xmm0 MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm6) mulps %xmm13, %xmm7 addps %xmm7, %xmm1 MOVUPS_A2 (-28 * SIZE, A2, 
LDA3, 1, %xmm7) mulps %xmm14, %xmm4 addps %xmm4, %xmm0 mulps %xmm14, %xmm5 addps %xmm5, %xmm1 mulps %xmm15, %xmm6 addps %xmm6, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) mulps %xmm15, %xmm7 addps %xmm7, %xmm1 MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, Y1 ALIGN_3 .L16: testq $4, MM je .L17 MOVUPS_A1 (-32 * SIZE, A1, %xmm4) MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm5) MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm6) MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm7) MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) mulps %xmm8, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1 (-32 * SIZE, A2, %xmm4) mulps %xmm9, %xmm5 addps %xmm5, %xmm0 MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm5) mulps %xmm10, %xmm6 addps %xmm6, %xmm0 MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm6) mulps %xmm11, %xmm7 addps %xmm7, %xmm0 MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm7) mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 addps %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 addps %xmm7, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L17: testq $2, MM je .L18 movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(A1, LDA, 1), %xmm5 movsd -32 * SIZE(A1, LDA, 2), %xmm6 movsd -32 * SIZE(A1, LDA3, 1), %xmm7 movsd -32 * SIZE(Y1), %xmm0 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movsd -32 * SIZE(A2), %xmm4 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 movsd -32 * SIZE(A2, LDA, 1), %xmm5 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movsd -32 * SIZE(A2, LDA, 2), %xmm6 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 movsd -32 * SIZE(A2, LDA3, 1), %xmm7 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 addps %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 addps %xmm7, %xmm0 movlps %xmm0, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L18: testq $1, MM je .L19 movss -32 * SIZE(A1), %xmm4 movss -32 * SIZE(A1, LDA, 1), %xmm5 movss -32 * SIZE(A1, LDA, 2), %xmm6 movss -32 * SIZE(A1, LDA3, 1), %xmm7 movss -32 * SIZE(Y1), %xmm0 mulss %xmm8, %xmm4 addss %xmm4, %xmm0 movss -32 * SIZE(A2), %xmm4 mulss %xmm9, %xmm5 addss %xmm5, %xmm0 movss -32 * SIZE(A2, LDA, 1), %xmm5 mulss %xmm10, %xmm6 addss %xmm6, %xmm0 movss -32 * SIZE(A2, LDA, 2), %xmm6 mulss %xmm11, %xmm7 addss %xmm7, %xmm0 movss -32 * SIZE(A2, LDA3, 1), %xmm7 mulss %xmm12, %xmm4 addss %xmm4, %xmm0 mulss %xmm13, %xmm5 addss %xmm5, %xmm0 mulss %xmm14, %xmm6 addss %xmm6, %xmm0 mulss %xmm15, %xmm7 addss %xmm7, %xmm0 movss %xmm0, -32 * SIZE(Y1) ALIGN_3 .L19: cmpq $8, N jge .L11 ALIGN_3 .L20: #endif cmpq $4, N jl .L30 #if GEMV_UNROLL == 4 ALIGN_3 .L21: #endif subq $4, N leaq 32 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 2), A2 leaq (A, LDA, 4), A movss (X), %xmm12 addq INCX, X movss (X), %xmm13 addq INCX, X movss (X), %xmm14 addq INCX, X movss (X), %xmm15 addq INCX, X movss ALPHA, %xmm0 mulss %xmm0, %xmm12 mulss %xmm0, %xmm13 mulss %xmm0, %xmm14 mulss %xmm0, %xmm15 shufps $0, %xmm12, %xmm12 shufps $0, %xmm13, %xmm13 shufps $0, %xmm14, %xmm14 shufps $0, %xmm15, %xmm15 #ifdef ALIGNED_ACCESS cmpq $3, M jle .L27 testq $SIZE, A1 je .L2X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A1, LDA), %xmm1 movss -32 * SIZE(A2), %xmm2 movss -32 * SIZE(A2, LDA), %xmm3 movss -32 * SIZE(Y1), %xmm8 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 mulss %xmm13, %xmm1 addss %xmm1, %xmm8 mulss %xmm14, %xmm2 addss %xmm2, %xmm8 mulss %xmm15, %xmm3 addss %xmm3, %xmm8 movss %xmm8, -32 * SIZE(Y1) addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, Y1 ALIGN_3 .L2X: testq $2 * SIZE, A1 je .L2XX movsd -32 * SIZE(A1), 
%xmm0 movsd -32 * SIZE(A1, LDA), %xmm1 movsd -32 * SIZE(A2), %xmm2 movsd -32 * SIZE(A2, LDA), %xmm3 movsd -32 * SIZE(Y1), %xmm8 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 mulps %xmm13, %xmm1 addps %xmm1, %xmm8 mulps %xmm14, %xmm2 addps %xmm2, %xmm8 mulps %xmm15, %xmm3 addps %xmm3, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L2XX: #endif movq MM, I sarq $4, I jle .L25 MOVUPS_A1 (-32 * SIZE, A1, %xmm8) MOVUPS_A1 (-28 * SIZE, A1, %xmm9) MOVUPS_A1 (-24 * SIZE, A1, %xmm10) MOVUPS_A1 (-20 * SIZE, A1, %xmm11) MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm5) MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm6) MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm7) decq I jle .L24 ALIGN_3 .L23: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-32 * SIZE, A2, %xmm8) mulps %xmm12, %xmm9 addps %xmm9, %xmm1 MOVUPS_A1(-28 * SIZE, A2, %xmm9) mulps %xmm12, %xmm10 addps %xmm10, %xmm2 MOVUPS_A1(-24 * SIZE, A2, %xmm10) mulps %xmm12, %xmm11 addps %xmm11, %xmm3 MOVUPS_A1(-20 * SIZE, A2, %xmm11) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) #endif mulps %xmm13, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4) mulps %xmm13, %xmm5 addps %xmm5, %xmm1 MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm5) mulps %xmm13, %xmm6 addps %xmm6, %xmm2 MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm6) mulps %xmm13, %xmm7 addps %xmm7, %xmm3 MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm7) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif mulps %xmm14, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1 (-16 * SIZE, A1, %xmm8) mulps %xmm14, %xmm9 addps %xmm9, %xmm1 MOVUPS_A1 (-12 * SIZE, A1, %xmm9) mulps %xmm14, %xmm10 addps %xmm10, %xmm2 MOVUPS_A1 ( -8 * SIZE, A1, %xmm10) mulps %xmm14, %xmm11 addps %xmm11, %xmm3 MOVUPS_A1 ( -4 * SIZE, A1, %xmm11) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) #endif mulps %xmm15, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) mulps %xmm15, %xmm5 addps %xmm5, %xmm1 MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm5) mulps %xmm15, %xmm6 addps %xmm6, %xmm2 MOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm6) mulps %xmm15, %xmm7 addps %xmm7, %xmm3 MOVUPS_A2( -4 * SIZE, A1, LDA, 1, %xmm7) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L23 ALIGN_3 .L24: mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-32 * SIZE, A2, %xmm8) mulps %xmm12, %xmm9 addps %xmm9, %xmm1 MOVUPS_A1(-28 * SIZE, A2, %xmm9) mulps %xmm12, %xmm10 addps %xmm10, %xmm2 MOVUPS_A1(-24 * SIZE, A2, %xmm10) mulps %xmm12, %xmm11 addps %xmm11, %xmm3 MOVUPS_A1(-20 * SIZE, A2, %xmm11) mulps %xmm13, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4) mulps %xmm13, %xmm5 addps %xmm5, %xmm1 MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm5) mulps %xmm13, %xmm6 addps %xmm6, %xmm2 MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm6) mulps %xmm13, %xmm7 addps %xmm7, %xmm3 MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm7) mulps %xmm14, %xmm8 addps %xmm8, %xmm0 mulps %xmm14, %xmm9 addps 
%xmm9, %xmm1 mulps %xmm14, %xmm10 addps %xmm10, %xmm2 mulps %xmm14, %xmm11 addps %xmm11, %xmm3 mulps %xmm15, %xmm4 addps %xmm4, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) mulps %xmm15, %xmm5 addps %xmm5, %xmm1 MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) mulps %xmm15, %xmm6 addps %xmm6, %xmm2 MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) mulps %xmm15, %xmm7 addps %xmm7, %xmm3 MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 ALIGN_3 .L25: testq $8, MM je .L26 MOVUPS_A1 (-32 * SIZE, A1, %xmm8) MOVUPS_A1 (-28 * SIZE, A1, %xmm9) MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm5) mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-32 * SIZE, A2, %xmm8) mulps %xmm12, %xmm9 addps %xmm9, %xmm1 MOVUPS_A1(-28 * SIZE, A2, %xmm9) mulps %xmm13, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4) mulps %xmm13, %xmm5 addps %xmm5, %xmm1 MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm5) mulps %xmm13, %xmm6 addps %xmm6, %xmm2 mulps %xmm14, %xmm8 addps %xmm8, %xmm0 mulps %xmm14, %xmm9 addps %xmm9, %xmm1 mulps %xmm15, %xmm4 addps %xmm4, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) mulps %xmm15, %xmm5 addps %xmm5, %xmm1 MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, Y1 ALIGN_3 .L26: testq $4, MM je .L27 MOVUPS_A1 (-32 * SIZE, A1, %xmm8) MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-32 * SIZE, A2, %xmm8) mulps %xmm13, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4) mulps %xmm14, %xmm8 addps %xmm8, %xmm0 mulps %xmm15, %xmm4 addps %xmm4, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L27: testq $2, MM je .L28 movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(A1, LDA), %xmm1 movsd -32 * SIZE(A2), %xmm2 movsd -32 * SIZE(A2, LDA), %xmm3 movsd -32 * SIZE(Y1), %xmm8 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 mulps %xmm13, %xmm1 addps %xmm1, %xmm8 mulps %xmm14, %xmm2 addps %xmm2, %xmm8 mulps %xmm15, %xmm3 addps %xmm3, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L28: testq $1, MM #if GEMV_UNROLL == 4 je .L29 #else je .L30 #endif movss -32 * SIZE(Y1), %xmm8 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A1, LDA), %xmm1 movss -32 * SIZE(A2), %xmm2 movss -32 * SIZE(A2, LDA), %xmm3 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 mulss %xmm13, %xmm1 addss %xmm1, %xmm8 mulss %xmm14, %xmm2 addss %xmm2, %xmm8 mulss %xmm15, %xmm3 addss %xmm3, %xmm8 movss %xmm8, -32 * SIZE(Y1) ALIGN_3 #if GEMV_UNROLL == 4 .L29: cmpq $4, N jge .L21 #endif ALIGN_3 .L30: testq N, N jle .L990 cmpq $3, N jne .L40 leaq 32 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 2), A2 movss (X), %xmm12 addq INCX, X movss (X), %xmm13 addq INCX, X movss (X), %xmm14 addq INCX, X movss ALPHA, %xmm0 mulss %xmm0, %xmm12 mulss %xmm0, %xmm13 mulss %xmm0, %xmm14 shufps $0, %xmm12, %xmm12 shufps $0, %xmm13, %xmm13 shufps $0, %xmm14, %xmm14 #ifdef ALIGNED_ACCESS cmpq $3, M jle .L37 testq $SIZE, A1 je .L3X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A1, LDA), %xmm1 movss -32 * SIZE(A2), %xmm2 movss -32 * SIZE(Y1), %xmm8 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 mulss %xmm13, %xmm1 addss %xmm1, %xmm8 mulss %xmm14, %xmm2 addss %xmm2, %xmm8 movss %xmm8, -32 * SIZE(Y1) addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, Y1 ALIGN_3 .L3X: testq $2 * SIZE, A1 je .L3XX movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(A1, LDA), %xmm1 movsd -32 
* SIZE(A2), %xmm2 movsd -32 * SIZE(Y1), %xmm8 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 mulps %xmm13, %xmm1 addps %xmm1, %xmm8 mulps %xmm14, %xmm2 addps %xmm2, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L3XX: #endif movq MM, I sarq $4, I jle .L35 MOVUPS_A1 (-32 * SIZE, A1, %xmm8) MOVUPS_A1 (-28 * SIZE, A1, %xmm9) MOVUPS_A1 (-24 * SIZE, A1, %xmm10) MOVUPS_A1 (-20 * SIZE, A1, %xmm11) MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm5) MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm6) MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm7) decq I jle .L34 ALIGN_3 .L33: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) #endif mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-32 * SIZE, A2, %xmm8) mulps %xmm12, %xmm9 addps %xmm9, %xmm1 MOVUPS_A1(-28 * SIZE, A2, %xmm9) mulps %xmm12, %xmm10 addps %xmm10, %xmm2 MOVUPS_A1(-24 * SIZE, A2, %xmm10) mulps %xmm12, %xmm11 addps %xmm11, %xmm3 MOVUPS_A1(-20 * SIZE, A2, %xmm11) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) #endif mulps %xmm13, %xmm4 addps %xmm4, %xmm0 MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) mulps %xmm13, %xmm5 addps %xmm5, %xmm1 MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm5) mulps %xmm13, %xmm6 addps %xmm6, %xmm2 MOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm6) mulps %xmm13, %xmm7 addps %xmm7, %xmm3 MOVUPS_A2( -4 * SIZE, A1, LDA, 1, %xmm7) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) #endif mulps %xmm14, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1 (-16 * SIZE, A1, %xmm8) mulps %xmm14, %xmm9 addps %xmm9, %xmm1 MOVUPS_A1 (-12 * SIZE, A1, %xmm9) mulps %xmm14, %xmm10 addps %xmm10, %xmm2 MOVUPS_A1 ( -8 * SIZE, A1, %xmm10) mulps %xmm14, %xmm11 addps %xmm11, %xmm3 MOVUPS_A1 ( -4 * SIZE, A1, %xmm11) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 3 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L33 ALIGN_3 .L34: mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-32 * SIZE, A2, %xmm8) mulps %xmm12, %xmm9 addps %xmm9, %xmm1 MOVUPS_A1(-28 * SIZE, A2, %xmm9) mulps %xmm12, %xmm10 addps %xmm10, %xmm2 MOVUPS_A1(-24 * SIZE, A2, %xmm10) mulps %xmm12, %xmm11 addps %xmm11, %xmm3 MOVUPS_A1(-20 * SIZE, A2, %xmm11) mulps %xmm13, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 addps %xmm5, %xmm1 mulps %xmm13, %xmm6 addps %xmm6, %xmm2 mulps %xmm13, %xmm7 addps %xmm7, %xmm3 mulps %xmm14, %xmm8 addps %xmm8, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) mulps %xmm14, %xmm9 addps %xmm9, %xmm1 MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) mulps %xmm14, %xmm10 addps %xmm10, %xmm2 MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) mulps %xmm14, %xmm11 addps %xmm11, %xmm3 MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 ALIGN_3 .L35: testq $8, MM je .L36 MOVUPS_A1 (-32 * SIZE, A1, %xmm8) MOVUPS_A1 (-28 * SIZE, A1, %xmm9) MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm5) mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-32 * SIZE, A2, %xmm8) mulps %xmm12, %xmm9 addps %xmm9, %xmm1 MOVUPS_A1(-28 * SIZE, A2, 
%xmm9) mulps %xmm13, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 addps %xmm5, %xmm1 mulps %xmm13, %xmm6 addps %xmm6, %xmm2 mulps %xmm14, %xmm8 addps %xmm8, %xmm0 mulps %xmm14, %xmm9 addps %xmm9, %xmm1 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, Y1 ALIGN_3 .L36: testq $4, MM je .L37 MOVUPS_A1 (-32 * SIZE, A1, %xmm8) MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-32 * SIZE, A2, %xmm8) mulps %xmm13, %xmm4 addps %xmm4, %xmm0 mulps %xmm14, %xmm8 addps %xmm8, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L37: testq $2, MM je .L38 movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(A1, LDA), %xmm1 movsd -32 * SIZE(A2), %xmm2 movsd -32 * SIZE(Y1), %xmm8 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 mulps %xmm13, %xmm1 addps %xmm1, %xmm8 mulps %xmm14, %xmm2 addps %xmm2, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L38: testq $1, MM je .L990 movss -32 * SIZE(Y1), %xmm8 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A1, LDA), %xmm1 movss -32 * SIZE(A2), %xmm2 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 mulss %xmm13, %xmm1 addss %xmm1, %xmm8 mulss %xmm14, %xmm2 addss %xmm2, %xmm8 movss %xmm8, -32 * SIZE(Y1) jmp .L990 ALIGN_3 .L40: cmpq $2, N jne .L50 leaq 32 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 1), A2 leaq (A, LDA, 2), A movss (X), %xmm12 addq INCX, X movss (X), %xmm13 addq INCX, X movss ALPHA, %xmm0 mulss %xmm0, %xmm12 mulss %xmm0, %xmm13 shufps $0, %xmm12, %xmm12 shufps $0, %xmm13, %xmm13 #ifdef ALIGNED_ACCESS cmpq $3, M jle .L47 testq $SIZE, A1 je .L4X movss -32 * SIZE(Y1), %xmm8 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A2), %xmm1 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 mulss %xmm13, %xmm1 addss %xmm1, %xmm8 movss %xmm8, -32 * SIZE(Y1) addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, Y1 ALIGN_3 .L4X: testq $2 * SIZE, A1 je .L4XX movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(A2), %xmm1 movsd -32 * SIZE(Y1), %xmm8 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 mulps %xmm13, %xmm1 addps %xmm1, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L4XX: #endif movq MM, I sarq $4, I jle .L45 MOVUPS_A1 (-32 * SIZE, A1, %xmm8) MOVUPS_A1 (-28 * SIZE, A1, %xmm9) MOVUPS_A1 (-24 * SIZE, A1, %xmm10) MOVUPS_A1 (-20 * SIZE, A1, %xmm11) MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) MOVUPS_A1(-32 * SIZE, A2, %xmm4) MOVUPS_A1(-28 * SIZE, A2, %xmm5) MOVUPS_A1(-24 * SIZE, A2, %xmm6) MOVUPS_A1(-20 * SIZE, A2, %xmm7) decq I jle .L44 ALIGN_3 .L43: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1 (-16 * SIZE, A1, %xmm8) mulps %xmm12, %xmm9 addps %xmm9, %xmm1 MOVUPS_A1 (-12 * SIZE, A1, %xmm9) mulps %xmm12, %xmm10 addps %xmm10, %xmm2 MOVUPS_A1 ( -8 * SIZE, A1, %xmm10) mulps %xmm12, %xmm11 addps %xmm11, %xmm3 MOVUPS_A1 ( -4 * SIZE, A1, %xmm11) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) #endif mulps %xmm13, %xmm4 addps %xmm4, %xmm0 MOVUPS_A1(-16 * SIZE, A2, %xmm4) mulps %xmm13, %xmm5 addps %xmm5, %xmm1 MOVUPS_A1(-12 * SIZE, A2, %xmm5) mulps %xmm13, %xmm6 addps %xmm6, %xmm2 MOVUPS_A1( -8 * SIZE, A2, %xmm6) mulps %xmm13, %xmm7 addps %xmm7, %xmm3 MOVUPS_A1( -4 * SIZE, A2, %xmm7) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) 
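/* Y1 is read, updated and written back just below, so when PREFETCHW is available the cache line is requested with write intent rather than with a plain read prefetch. */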
#endif MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L43 ALIGN_3 .L44: mulps %xmm12, %xmm8 addps %xmm8, %xmm0 mulps %xmm12, %xmm9 addps %xmm9, %xmm1 mulps %xmm12, %xmm10 addps %xmm10, %xmm2 mulps %xmm12, %xmm11 addps %xmm11, %xmm3 mulps %xmm13, %xmm4 addps %xmm4, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) mulps %xmm13, %xmm5 addps %xmm5, %xmm1 MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) mulps %xmm13, %xmm6 addps %xmm6, %xmm2 MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) mulps %xmm13, %xmm7 addps %xmm7, %xmm3 MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 ALIGN_3 .L45: testq $8, MM je .L46 MOVUPS_A1 (-32 * SIZE, A1, %xmm8) MOVUPS_A1 (-28 * SIZE, A1, %xmm9) MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-32 * SIZE, A2, %xmm4) mulps %xmm12, %xmm9 addps %xmm9, %xmm1 MOVUPS_A1(-28 * SIZE, A2, %xmm5) mulps %xmm13, %xmm4 addps %xmm4, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) mulps %xmm13, %xmm5 addps %xmm5, %xmm1 MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, Y1 ALIGN_3 .L46: testq $4, MM je .L47 MOVUPS_A1 (-32 * SIZE, A1, %xmm8) MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1(-32 * SIZE, A2, %xmm4) mulps %xmm13, %xmm4 addps %xmm4, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L47: testq $2, MM je .L48 movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(A2), %xmm1 movsd -32 * SIZE(Y1), %xmm8 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 mulps %xmm13, %xmm1 addps %xmm1, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L48: testq $1, MM je .L990 movss -32 * SIZE(Y1), %xmm8 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A2), %xmm1 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 mulss %xmm13, %xmm1 addss %xmm1, %xmm8 movss %xmm8, -32 * SIZE(Y1) jmp .L990 ALIGN_3 .L50: cmpq $1, N jne .L990 leaq 32 * SIZE(BUFFER), Y1 movq A, A1 movss (X), %xmm12 mulss ALPHA, %xmm12 shufps $0, %xmm12, %xmm12 #ifdef ALIGNED_ACCESS cmpq $3, M jle .L57 testq $SIZE, A1 je .L5X movss -32 * SIZE(Y1), %xmm8 movss -32 * SIZE(A1), %xmm0 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 movss %xmm8, -32 * SIZE(Y1) addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, Y1 ALIGN_3 .L5X: testq $2 * SIZE, A1 je .L5XX movsd -32 * SIZE(Y1), %xmm8 movsd -32 * SIZE(A1), %xmm0 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L5XX: #endif movq MM, I sarq $4, I jle .L55 MOVUPS_A1 (-32 * SIZE, A1, %xmm8) MOVUPS_A1 (-28 * SIZE, A1, %xmm9) MOVUPS_A1 (-24 * SIZE, A1, %xmm10) MOVUPS_A1 (-20 * SIZE, A1, %xmm11) MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) decq I jle .L54 ALIGN_3 .L53: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) #endif mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_A1 (-16 * SIZE, A1, %xmm8) mulps %xmm12, %xmm9 addps %xmm9, %xmm1 MOVUPS_A1 (-12 * SIZE, A1, %xmm9) mulps %xmm12, %xmm10 addps %xmm10, %xmm2 MOVUPS_A1 ( -8 * SIZE, A1, %xmm10) mulps %xmm12, %xmm11 addps %xmm11, %xmm3 MOVUPS_A1 ( -4 
* SIZE, A1, %xmm11) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 8 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L53 ALIGN_3 .L54: mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) mulps %xmm12, %xmm9 addps %xmm9, %xmm1 MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) mulps %xmm12, %xmm10 addps %xmm10, %xmm2 MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) mulps %xmm12, %xmm11 addps %xmm11, %xmm3 MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, Y1 ALIGN_3 .L55: testq $8, MM je .L56 MOVUPS_A1 (-32 * SIZE, A1, %xmm8) MOVUPS_A1 (-28 * SIZE, A1, %xmm9) MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) mulps %xmm12, %xmm9 addps %xmm9, %xmm1 MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) addq $8 * SIZE, A1 addq $8 * SIZE, Y1 ALIGN_3 .L56: testq $4, MM je .L57 MOVUPS_A1 (-32 * SIZE, A1, %xmm8) MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) addq $4 * SIZE, A1 addq $4 * SIZE, Y1 ALIGN_3 .L57: testq $2, MM je .L58 movsd -32 * SIZE(Y1), %xmm8 movsd -32 * SIZE(A1), %xmm0 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, Y1 ALIGN_3 .L58: testq $1, MM je .L990 movss -32 * SIZE(Y1), %xmm8 movss -32 * SIZE(A1), %xmm0 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 movss %xmm8, -32 * SIZE(Y1) #ifdef ALIGNED_ACCESS jmp .L990 ALIGN_3 .L100: testq $2 * SIZE - 1, LDA jne .L200 cmpq $4, N jl .L110 ALIGN_3 .L101: subq $4, N leaq 32 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 2), A2 leaq (A, LDA, 4), A movss (X), %xmm12 addq INCX, X movss (X), %xmm13 addq INCX, X movss (X), %xmm14 addq INCX, X movss (X), %xmm15 addq INCX, X movss ALPHA, %xmm0 mulss %xmm0, %xmm12 mulss %xmm0, %xmm13 mulss %xmm0, %xmm14 mulss %xmm0, %xmm15 shufps $0, %xmm12, %xmm12 shufps $0, %xmm13, %xmm13 shufps $0, %xmm14, %xmm14 shufps $0, %xmm15, %xmm15 cmpq $3, M jle .L107 testq $SIZE, A1 je .L10X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A1, LDA), %xmm1 movss -32 * SIZE(A2), %xmm2 movss -32 * SIZE(A2, LDA), %xmm3 movss -32 * SIZE(Y1), %xmm8 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 mulss %xmm13, %xmm1 addss %xmm1, %xmm8 mulss %xmm14, %xmm2 addss %xmm2, %xmm8 mulss %xmm15, %xmm3 addss %xmm3, %xmm8 movss %xmm8, -32 * SIZE(Y1) addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, Y1 ALIGN_3 .L10X: testq $2 * SIZE, A1 je .L10XX movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(A1, LDA), %xmm1 movsd -32 * SIZE(A2), %xmm2 movsd -32 * SIZE(A2, LDA), %xmm3 movsd -32 * SIZE(Y1), %xmm8 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 mulps %xmm13, %xmm1 addps %xmm1, %xmm8 mulps %xmm14, %xmm2 addps %xmm2, %xmm8 mulps %xmm15, %xmm3 addps %xmm3, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L10XX: movhps -32 * SIZE(A1, LDA), %xmm8 movhps -32 * SIZE(A2, LDA), %xmm9 movq MM, I sarq $4, I jle .L105 movaps -32 * SIZE(A1), %xmm4 movaps -28 * SIZE(A1), %xmm5 movaps -24 * SIZE(A1), %xmm6 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) decq I jle .L104 ALIGN_3 .L103: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + 
PREOFFSET(A1) #endif mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(A1), %xmm7 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -30 * SIZE(A1, LDA), %xmm4 mulps %xmm12, %xmm6 addps %xmm6, %xmm2 movaps -26 * SIZE(A1, LDA), %xmm5 mulps %xmm12, %xmm7 addps %xmm7, %xmm3 movaps -22 * SIZE(A1, LDA), %xmm6 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) #endif shufps $0x4e, %xmm4, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -18 * SIZE(A1, LDA), %xmm8 shufps $0x4e, %xmm5, %xmm4 mulps %xmm13, %xmm4 addps %xmm4, %xmm1 movaps -32 * SIZE(A2), %xmm4 shufps $0x4e, %xmm6, %xmm5 mulps %xmm13, %xmm5 addps %xmm5, %xmm2 movaps -28 * SIZE(A2), %xmm5 shufps $0x4e, %xmm8, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm3 movaps -24 * SIZE(A2), %xmm6 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif mulps %xmm14, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(A2), %xmm7 mulps %xmm14, %xmm5 addps %xmm5, %xmm1 movaps -30 * SIZE(A2, LDA), %xmm4 mulps %xmm14, %xmm6 addps %xmm6, %xmm2 movaps -26 * SIZE(A2, LDA), %xmm5 mulps %xmm14, %xmm7 addps %xmm7, %xmm3 movaps -22 * SIZE(A2, LDA), %xmm6 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) #endif shufps $0x4e, %xmm4, %xmm9 mulps %xmm15, %xmm9 addps %xmm9, %xmm0 movaps -18 * SIZE(A2, LDA), %xmm9 shufps $0x4e, %xmm5, %xmm4 mulps %xmm15, %xmm4 addps %xmm4, %xmm1 movaps -16 * SIZE(A1), %xmm4 shufps $0x4e, %xmm6, %xmm5 mulps %xmm15, %xmm5 addps %xmm5, %xmm2 movaps -12 * SIZE(A1), %xmm5 shufps $0x4e, %xmm9, %xmm6 mulps %xmm15, %xmm6 addps %xmm6, %xmm3 movaps -8 * SIZE(A1), %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L103 ALIGN_3 .L104: mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(A1), %xmm7 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -30 * SIZE(A1, LDA), %xmm4 mulps %xmm12, %xmm6 addps %xmm6, %xmm2 movaps -26 * SIZE(A1, LDA), %xmm5 mulps %xmm12, %xmm7 addps %xmm7, %xmm3 movaps -22 * SIZE(A1, LDA), %xmm6 shufps $0x4e, %xmm4, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -18 * SIZE(A1, LDA), %xmm8 shufps $0x4e, %xmm5, %xmm4 mulps %xmm13, %xmm4 addps %xmm4, %xmm1 movaps -32 * SIZE(A2), %xmm4 shufps $0x4e, %xmm6, %xmm5 mulps %xmm13, %xmm5 addps %xmm5, %xmm2 movaps -28 * SIZE(A2), %xmm5 shufps $0x4e, %xmm8, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm3 movaps -24 * SIZE(A2), %xmm6 mulps %xmm14, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(A2), %xmm7 mulps %xmm14, %xmm5 addps %xmm5, %xmm1 movaps -30 * SIZE(A2, LDA), %xmm4 mulps %xmm14, %xmm6 addps %xmm6, %xmm2 movaps -26 * SIZE(A2, LDA), %xmm5 mulps %xmm14, %xmm7 addps %xmm7, %xmm3 movaps -22 * SIZE(A2, LDA), %xmm6 shufps $0x4e, %xmm4, %xmm9 mulps %xmm15, %xmm9 addps %xmm9, %xmm0 movaps -18 * SIZE(A2, LDA), %xmm9 shufps $0x4e, %xmm5, %xmm4 mulps %xmm15, %xmm4 addps %xmm4, %xmm1 shufps $0x4e, %xmm6, %xmm5 mulps %xmm15, %xmm5 addps %xmm5, %xmm2 shufps $0x4e, %xmm9, %xmm6 mulps %xmm15, %xmm6 addps %xmm6, %xmm3 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 ALIGN_3 .L105: testq $8, MM je .L106 movaps -32 * 
SIZE(A1), %xmm4 movaps -28 * SIZE(A1), %xmm5 movaps -30 * SIZE(A1, LDA), %xmm6 movaps -26 * SIZE(A1, LDA), %xmm7 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -32 * SIZE(A2), %xmm4 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -28 * SIZE(A2), %xmm10 shufps $0x4e, %xmm6, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -30 * SIZE(A2, LDA), %xmm11 shufps $0x4e, %xmm7, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm1 movaps %xmm7, %xmm8 movaps -26 * SIZE(A2, LDA), %xmm7 mulps %xmm14, %xmm4 addps %xmm4, %xmm0 mulps %xmm14, %xmm10 addps %xmm10, %xmm1 shufps $0x4e, %xmm11, %xmm9 mulps %xmm15, %xmm9 addps %xmm9, %xmm0 shufps $0x4e, %xmm7, %xmm11 mulps %xmm15, %xmm11 addps %xmm11, %xmm1 movaps %xmm7, %xmm9 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, Y1 ALIGN_3 .L106: testq $4, MM je .L107 movaps -32 * SIZE(A1), %xmm4 movaps -30 * SIZE(A1, LDA), %xmm5 movaps -32 * SIZE(A2), %xmm6 movaps -30 * SIZE(A2, LDA), %xmm7 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) mulps %xmm12, %xmm4 addps %xmm4, %xmm0 shufps $0x4e, %xmm5, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 shufps $0x4e, %xmm7, %xmm9 mulps %xmm15, %xmm9 addps %xmm9, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L107: testq $2, MM je .L108 movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(A1, LDA), %xmm5 movsd -32 * SIZE(A2), %xmm6 movsd -32 * SIZE(A2, LDA), %xmm7 movsd -32 * SIZE(Y1), %xmm0 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 addps %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 addps %xmm7, %xmm0 movlps %xmm0, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L108: testq $1, MM je .L109 movss -32 * SIZE(Y1), %xmm0 movss -32 * SIZE(A1), %xmm4 movss -32 * SIZE(A1, LDA), %xmm5 movss -32 * SIZE(A2), %xmm6 movss -32 * SIZE(A2, LDA), %xmm7 mulss %xmm12, %xmm4 addss %xmm4, %xmm0 mulss %xmm13, %xmm5 addss %xmm5, %xmm0 mulss %xmm14, %xmm6 addss %xmm6, %xmm0 mulss %xmm15, %xmm7 addss %xmm7, %xmm0 movss %xmm0, -32 * SIZE(Y1) ALIGN_3 .L109: cmpq $4, N jge .L101 ALIGN_3 .L110: testq N, N jle .L990 cmpq $3, N jne .L120 leaq 32 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 2), A2 leaq (A, LDA, 4), A movss (X), %xmm12 addq INCX, X movss (X), %xmm13 addq INCX, X movss (X), %xmm14 addq INCX, X movss ALPHA, %xmm0 mulss %xmm0, %xmm12 mulss %xmm0, %xmm13 mulss %xmm0, %xmm14 shufps $0, %xmm12, %xmm12 shufps $0, %xmm13, %xmm13 shufps $0, %xmm14, %xmm14 cmpq $3, M jle .L117 testq $SIZE, A1 je .L11X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A1, LDA), %xmm1 movss -32 * SIZE(A2), %xmm2 movss -32 * SIZE(Y1), %xmm8 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 mulss %xmm13, %xmm1 addss %xmm1, %xmm8 mulss %xmm14, %xmm2 addss %xmm2, %xmm8 movss %xmm8, -32 * SIZE(Y1) addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, Y1 ALIGN_3 .L11X: testq $2 * SIZE, A1 je .L11XX movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(A1, LDA), %xmm1 movsd -32 * SIZE(A2), %xmm2 movsd -32 * SIZE(Y1), %xmm8 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 mulps %xmm13, %xmm1 addps %xmm1, %xmm8 mulps %xmm14, %xmm2 addps %xmm2, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L11XX: movhps -32 * SIZE(A1, LDA), %xmm8 movhps -32 * SIZE(A2, LDA), %xmm9 movq MM, I sarq $4, I jle .L115 movaps -32 * SIZE(A1), %xmm4 movaps -28 * SIZE(A1), %xmm5 movaps -24 * SIZE(A1), %xmm6 
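/* The movhps loads at .L11XX prime the shift registers with the two column elements that sit just before the next 16-byte boundary of the LDA-shifted column; the loop below splices them with each freshly loaded aligned quad via shufps $0x4e, so the half-aligned column is consumed without unaligned loads. The first block of y is preloaded next. */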
MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) decq I jle .L114 ALIGN_3 .L113: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) #endif mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(A1), %xmm7 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -30 * SIZE(A1, LDA), %xmm4 mulps %xmm12, %xmm6 addps %xmm6, %xmm2 movaps -26 * SIZE(A1, LDA), %xmm5 mulps %xmm12, %xmm7 addps %xmm7, %xmm3 movaps -22 * SIZE(A1, LDA), %xmm6 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) #endif shufps $0x4e, %xmm4, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -18 * SIZE(A1, LDA), %xmm8 shufps $0x4e, %xmm5, %xmm4 mulps %xmm13, %xmm4 addps %xmm4, %xmm1 movaps -32 * SIZE(A2), %xmm4 shufps $0x4e, %xmm6, %xmm5 mulps %xmm13, %xmm5 addps %xmm5, %xmm2 movaps -28 * SIZE(A2), %xmm5 shufps $0x4e, %xmm8, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm3 movaps -24 * SIZE(A2), %xmm6 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) #endif mulps %xmm14, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(A2), %xmm7 mulps %xmm14, %xmm5 addps %xmm5, %xmm1 movaps -16 * SIZE(A1), %xmm4 mulps %xmm14, %xmm6 addps %xmm6, %xmm2 movaps -12 * SIZE(A1), %xmm5 mulps %xmm14, %xmm7 addps %xmm7, %xmm3 movaps -8 * SIZE(A1), %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 3 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L113 ALIGN_3 .L114: mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(A1), %xmm7 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -30 * SIZE(A1, LDA), %xmm4 mulps %xmm12, %xmm6 addps %xmm6, %xmm2 movaps -26 * SIZE(A1, LDA), %xmm5 mulps %xmm12, %xmm7 addps %xmm7, %xmm3 movaps -22 * SIZE(A1, LDA), %xmm6 shufps $0x4e, %xmm4, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -18 * SIZE(A1, LDA), %xmm8 shufps $0x4e, %xmm5, %xmm4 mulps %xmm13, %xmm4 addps %xmm4, %xmm1 movaps -32 * SIZE(A2), %xmm4 shufps $0x4e, %xmm6, %xmm5 mulps %xmm13, %xmm5 addps %xmm5, %xmm2 movaps -28 * SIZE(A2), %xmm5 shufps $0x4e, %xmm8, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm3 movaps -24 * SIZE(A2), %xmm6 mulps %xmm14, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(A2), %xmm7 mulps %xmm14, %xmm5 addps %xmm5, %xmm1 mulps %xmm14, %xmm6 addps %xmm6, %xmm2 mulps %xmm14, %xmm7 addps %xmm7, %xmm3 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 ALIGN_3 .L115: testq $8, MM je .L116 movaps -32 * SIZE(A1), %xmm4 movaps -28 * SIZE(A1), %xmm5 movaps -30 * SIZE(A1, LDA), %xmm6 movaps -26 * SIZE(A1, LDA), %xmm7 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -32 * SIZE(A2), %xmm4 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -28 * SIZE(A2), %xmm10 shufps $0x4e, %xmm6, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 shufps $0x4e, %xmm7, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm1 movaps %xmm7, %xmm8 mulps %xmm14, %xmm4 addps %xmm4, %xmm0 mulps %xmm14, %xmm10 addps %xmm10, %xmm1 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq 
$8 * SIZE, Y1 ALIGN_3 .L116: testq $4, MM je .L117 movaps -32 * SIZE(A1), %xmm4 movaps -30 * SIZE(A1, LDA), %xmm5 movaps -32 * SIZE(A2), %xmm6 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) mulps %xmm12, %xmm4 addps %xmm4, %xmm0 shufps $0x4e, %xmm5, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L117: testq $2, MM je .L118 movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(A1, LDA), %xmm5 movsd -32 * SIZE(A2), %xmm6 movsd -32 * SIZE(Y1), %xmm0 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 addps %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 movlps %xmm0, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L118: testq $1, MM je .L990 movss -32 * SIZE(Y1), %xmm0 movss -32 * SIZE(A1), %xmm4 movss -32 * SIZE(A1, LDA), %xmm5 movss -32 * SIZE(A2), %xmm6 mulss %xmm12, %xmm4 addss %xmm4, %xmm0 mulss %xmm13, %xmm5 addss %xmm5, %xmm0 mulss %xmm14, %xmm6 addss %xmm6, %xmm0 movss %xmm0, -32 * SIZE(Y1) jmp .L990 ALIGN_3 .L120: cmpq $2, N jl .L130 leaq 32 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 1), A2 leaq (A, LDA, 2), A movss (X), %xmm12 addq INCX, X movss (X), %xmm13 addq INCX, X movss ALPHA, %xmm0 mulss %xmm0, %xmm12 mulss %xmm0, %xmm13 shufps $0, %xmm12, %xmm12 shufps $0, %xmm13, %xmm13 cmpq $3, M jle .L127 testq $SIZE, A1 je .L12X movss -32 * SIZE(Y1), %xmm8 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A2), %xmm1 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 mulss %xmm13, %xmm1 addss %xmm1, %xmm8 movss %xmm8, -32 * SIZE(Y1) addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, Y1 ALIGN_3 .L12X: testq $2 * SIZE, A1 je .L12XX movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(A2), %xmm1 movsd -32 * SIZE(Y1), %xmm8 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 mulps %xmm13, %xmm1 addps %xmm1, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L12XX: movhps -32 * SIZE(A2), %xmm8 movq MM, I sarq $4, I jle .L125 movaps -32 * SIZE(A1), %xmm4 movaps -28 * SIZE(A1), %xmm5 movaps -24 * SIZE(A1), %xmm6 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) decq I jle .L124 ALIGN_3 .L123: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(A1), %xmm7 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -30 * SIZE(A1, LDA), %xmm4 mulps %xmm12, %xmm6 addps %xmm6, %xmm2 movaps -26 * SIZE(A1, LDA), %xmm5 mulps %xmm12, %xmm7 addps %xmm7, %xmm3 movaps -22 * SIZE(A1, LDA), %xmm6 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) #endif shufps $0x4e, %xmm4, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -18 * SIZE(A1, LDA), %xmm8 shufps $0x4e, %xmm5, %xmm4 mulps %xmm13, %xmm4 addps %xmm4, %xmm1 movaps -16 * SIZE(A1), %xmm4 shufps $0x4e, %xmm6, %xmm5 mulps %xmm13, %xmm5 addps %xmm5, %xmm2 movaps -12 * SIZE(A1), %xmm5 shufps $0x4e, %xmm8, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm3 movaps -8 * SIZE(A1), %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L123 ALIGN_3 .L124: 
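/* .L124 drains the software pipeline: the final 16-row block was partially preloaded by the last pass of .L123, so this tail finishes its loads, accumulates, and stores y without reading ahead into the next block. */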
mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(A1), %xmm7 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -30 * SIZE(A1, LDA), %xmm4 mulps %xmm12, %xmm6 addps %xmm6, %xmm2 movaps -26 * SIZE(A1, LDA), %xmm5 mulps %xmm12, %xmm7 addps %xmm7, %xmm3 movaps -22 * SIZE(A1, LDA), %xmm6 shufps $0x4e, %xmm4, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -18 * SIZE(A1, LDA), %xmm8 shufps $0x4e, %xmm5, %xmm4 mulps %xmm13, %xmm4 addps %xmm4, %xmm1 shufps $0x4e, %xmm6, %xmm5 mulps %xmm13, %xmm5 addps %xmm5, %xmm2 shufps $0x4e, %xmm8, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm3 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 ALIGN_3 .L125: testq $8, MM je .L126 movaps -32 * SIZE(A1), %xmm4 movaps -28 * SIZE(A1), %xmm5 movaps -30 * SIZE(A2), %xmm6 movaps -26 * SIZE(A2), %xmm7 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 shufps $0x4e, %xmm6, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 shufps $0x4e, %xmm7, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm1 movaps %xmm7, %xmm8 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, Y1 ALIGN_3 .L126: testq $4, MM je .L127 movaps -32 * SIZE(A1), %xmm4 movaps -30 * SIZE(A2), %xmm5 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) mulps %xmm12, %xmm4 addps %xmm4, %xmm0 shufps $0x4e, %xmm5, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L127: testq $2, MM je .L128 movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(A2), %xmm1 movsd -32 * SIZE(Y1), %xmm8 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 mulps %xmm13, %xmm1 addps %xmm1, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L128: testq $1, MM je .L990 movss -32 * SIZE(Y1), %xmm8 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A2), %xmm1 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 mulss %xmm13, %xmm1 addss %xmm1, %xmm8 movss %xmm8, -32 * SIZE(Y1) jmp .L990 ALIGN_3 .L130: cmpq $1, N jne .L990 leaq 32 * SIZE(BUFFER), Y1 movq A, A1 movss (X), %xmm12 mulss ALPHA, %xmm12 shufps $0, %xmm12, %xmm12 cmpq $3, M jle .L137 testq $SIZE, A1 je .L13X movss -32 * SIZE(Y1), %xmm8 movss -32 * SIZE(A1), %xmm0 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 movss %xmm8, -32 * SIZE(Y1) addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, Y1 ALIGN_3 .L13X: testq $2 * SIZE, A1 je .L13XX movsd -32 * SIZE(Y1), %xmm8 movsd -32 * SIZE(A1), %xmm0 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L13XX: movq MM, I sarq $4, I jle .L135 movaps -32 * SIZE(A1), %xmm8 movaps -28 * SIZE(A1), %xmm9 movaps -24 * SIZE(A1), %xmm10 movaps -20 * SIZE(A1), %xmm11 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) decq I jle .L134 ALIGN_3 .L133: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) #endif mulps %xmm12, %xmm8 addps %xmm8, %xmm0 movaps -16 * SIZE(A1), %xmm8 mulps %xmm12, %xmm9 addps %xmm9, %xmm1 movaps -12 * SIZE(A1), %xmm9 mulps %xmm12, %xmm10 addps %xmm10, %xmm2 movaps -8 * SIZE(A1), %xmm10 mulps %xmm12, %xmm11 addps %xmm11, %xmm3 movaps -4 * SIZE(A1), %xmm11 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 8 - 128 + PREOFFSET(Y1) #endif 
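/* Store the four updated y vectors of the current 16-element block, then immediately reload the next block of y so its accumulation can overlap the outstanding stores on the following iteration. */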
MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L133 ALIGN_3 .L134: mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) mulps %xmm12, %xmm9 addps %xmm9, %xmm1 MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) mulps %xmm12, %xmm10 addps %xmm10, %xmm2 MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) mulps %xmm12, %xmm11 addps %xmm11, %xmm3 MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, Y1 ALIGN_3 .L135: testq $8, MM je .L136 movaps -32 * SIZE(A1), %xmm8 movaps -28 * SIZE(A1), %xmm9 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) mulps %xmm12, %xmm9 addps %xmm9, %xmm1 MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) addq $8 * SIZE, A1 addq $8 * SIZE, Y1 ALIGN_3 .L136: testq $4, MM je .L137 movaps -32 * SIZE(A1), %xmm8 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) addq $4 * SIZE, A1 addq $4 * SIZE, Y1 ALIGN_3 .L137: testq $2, MM je .L138 movsd -32 * SIZE(Y1), %xmm8 movsd -32 * SIZE(A1), %xmm0 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, Y1 ALIGN_3 .L138: testq $1, MM je .L990 movss -32 * SIZE(Y1), %xmm8 movss -32 * SIZE(A1), %xmm0 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 movss %xmm8, -32 * SIZE(Y1) jmp .L990 ALIGN_3 .L200: testq $2 * SIZE, LDA jne .L300 cmpq $4, N jl .L210 ALIGN_3 .L201: subq $4, N leaq 32 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 2), A2 leaq (A, LDA, 4), A movss (X), %xmm12 addq INCX, X movss (X), %xmm13 addq INCX, X movss (X), %xmm14 addq INCX, X movss (X), %xmm15 addq INCX, X movss ALPHA, %xmm0 mulss %xmm0, %xmm12 mulss %xmm0, %xmm13 mulss %xmm0, %xmm14 mulss %xmm0, %xmm15 shufps $0, %xmm12, %xmm12 shufps $0, %xmm13, %xmm13 shufps $0, %xmm14, %xmm14 shufps $0, %xmm15, %xmm15 cmpq $3, M jle .L207 testq $SIZE, A1 je .L20X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A1, LDA), %xmm1 movss -32 * SIZE(A2), %xmm2 movss -32 * SIZE(A2, LDA), %xmm3 movss -32 * SIZE(Y1), %xmm8 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 mulss %xmm13, %xmm1 addss %xmm1, %xmm8 mulss %xmm14, %xmm2 addss %xmm2, %xmm8 mulss %xmm15, %xmm3 addss %xmm3, %xmm8 movss %xmm8, -32 * SIZE(Y1) addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, Y1 ALIGN_3 .L20X: testq $2 * SIZE, A1 je .L20XX movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(A1, LDA), %xmm1 movsd -32 * SIZE(A2), %xmm2 movsd -32 * SIZE(A2, LDA), %xmm3 movsd -32 * SIZE(Y1), %xmm8 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 mulps %xmm13, %xmm1 addps %xmm1, %xmm8 mulps %xmm14, %xmm2 addps %xmm2, %xmm8 mulps %xmm15, %xmm3 addps %xmm3, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L20XX: movaps -33 * SIZE(A1, LDA), %xmm8 movaps -34 * SIZE(A2), %xmm9 movaps -35 * SIZE(A2, LDA), %xmm10 movq MM, I sarq $4, I jle .L205 movaps -32 * SIZE(A1), %xmm4 movaps -28 * SIZE(A1), %xmm5 movaps -24 * SIZE(A1), %xmm6 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) decq I jle .L204 ALIGN_3 .L203: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(A1), %xmm7 mulps %xmm12, %xmm5 
addps %xmm5, %xmm1 movaps -29 * SIZE(A1, LDA), %xmm4 mulps %xmm12, %xmm6 addps %xmm6, %xmm2 movaps -25 * SIZE(A1, LDA), %xmm5 mulps %xmm12, %xmm7 addps %xmm7, %xmm3 movaps -21 * SIZE(A1, LDA), %xmm6 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) #endif movss %xmm4, %xmm8 shufps $0x39, %xmm8, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -17 * SIZE(A1, LDA), %xmm8 movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 mulps %xmm13, %xmm4 addps %xmm4, %xmm1 movaps -30 * SIZE(A2), %xmm4 movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 mulps %xmm13, %xmm5 addps %xmm5, %xmm2 movaps -26 * SIZE(A2), %xmm5 movss %xmm8, %xmm6 shufps $0x39, %xmm6, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm3 movaps -22 * SIZE(A2), %xmm6 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif shufps $0x4e, %xmm4, %xmm9 mulps %xmm14, %xmm9 addps %xmm9, %xmm0 movaps -18 * SIZE(A2), %xmm9 shufps $0x4e, %xmm5, %xmm4 mulps %xmm14, %xmm4 addps %xmm4, %xmm1 movaps -31 * SIZE(A2, LDA), %xmm4 shufps $0x4e, %xmm6, %xmm5 mulps %xmm14, %xmm5 addps %xmm5, %xmm2 movaps -27 * SIZE(A2, LDA), %xmm5 shufps $0x4e, %xmm9, %xmm6 mulps %xmm14, %xmm6 addps %xmm6, %xmm3 movaps -23 * SIZE(A2, LDA), %xmm6 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) #endif movss %xmm4, %xmm10 shufps $0x93, %xmm4, %xmm10 mulps %xmm15, %xmm10 addps %xmm10, %xmm0 movaps -19 * SIZE(A2, LDA), %xmm10 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps %xmm15, %xmm4 addps %xmm4, %xmm1 movaps -16 * SIZE(A1), %xmm4 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 mulps %xmm15, %xmm5 addps %xmm5, %xmm2 movaps -12 * SIZE(A1), %xmm5 movss %xmm10, %xmm6 shufps $0x93, %xmm10, %xmm6 mulps %xmm15, %xmm6 addps %xmm6, %xmm3 movaps -8 * SIZE(A1), %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L203 ALIGN_3 .L204: mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(A1), %xmm7 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -29 * SIZE(A1, LDA), %xmm4 mulps %xmm12, %xmm6 addps %xmm6, %xmm2 movaps -25 * SIZE(A1, LDA), %xmm5 mulps %xmm12, %xmm7 addps %xmm7, %xmm3 movaps -21 * SIZE(A1, LDA), %xmm6 movss %xmm4, %xmm8 shufps $0x39, %xmm8, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -17 * SIZE(A1, LDA), %xmm8 movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 mulps %xmm13, %xmm4 addps %xmm4, %xmm1 movaps -30 * SIZE(A2), %xmm4 movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 mulps %xmm13, %xmm5 addps %xmm5, %xmm2 movaps -26 * SIZE(A2), %xmm5 movss %xmm8, %xmm6 shufps $0x39, %xmm6, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm3 movaps -22 * SIZE(A2), %xmm6 shufps $0x4e, %xmm4, %xmm9 mulps %xmm14, %xmm9 addps %xmm9, %xmm0 movaps -18 * SIZE(A2), %xmm9 shufps $0x4e, %xmm5, %xmm4 mulps %xmm14, %xmm4 addps %xmm4, %xmm1 movaps -31 * SIZE(A2, LDA), %xmm4 shufps $0x4e, %xmm6, %xmm5 mulps %xmm14, %xmm5 addps %xmm5, %xmm2 movaps -27 * SIZE(A2, LDA), %xmm5 shufps $0x4e, %xmm9, %xmm6 mulps %xmm14, %xmm6 addps %xmm6, %xmm3 movaps -23 * SIZE(A2, LDA), %xmm6 movss %xmm4, %xmm10 shufps $0x93, %xmm4, %xmm10 mulps %xmm15, %xmm10 addps %xmm10, %xmm0 movaps -19 * SIZE(A2, LDA), %xmm10 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps %xmm15, %xmm4 addps %xmm4, %xmm1 
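/* Realignment idiom for a column offset by three floats from a 16-byte boundary: movss injects the first element of the newer aligned quad into the older one, and shufps $0x93 rotates the merged register into column order, so A is only ever touched with aligned movaps. */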
movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 mulps %xmm15, %xmm5 addps %xmm5, %xmm2 movss %xmm10, %xmm6 shufps $0x93, %xmm10, %xmm6 mulps %xmm15, %xmm6 addps %xmm6, %xmm3 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 ALIGN_3 .L205: testq $8, MM je .L206 movaps -32 * SIZE(A1), %xmm4 movaps -28 * SIZE(A1), %xmm5 movaps -29 * SIZE(A1, LDA), %xmm6 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -30 * SIZE(A2), %xmm4 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -26 * SIZE(A2), %xmm5 movss %xmm6, %xmm8 shufps $0x39, %xmm8, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -25 * SIZE(A1, LDA), %xmm8 movss %xmm8, %xmm6 shufps $0x39, %xmm6, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm1 movaps -31 * SIZE(A2, LDA), %xmm6 shufps $0x4e, %xmm4, %xmm9 mulps %xmm14, %xmm9 addps %xmm9, %xmm0 movaps -27 * SIZE(A2, LDA), %xmm7 shufps $0x4e, %xmm5, %xmm4 mulps %xmm14, %xmm4 addps %xmm4, %xmm1 movaps %xmm5, %xmm9 movss %xmm6, %xmm10 shufps $0x93, %xmm6, %xmm10 mulps %xmm15, %xmm10 addps %xmm10, %xmm0 movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 mulps %xmm15, %xmm6 addps %xmm6, %xmm1 movaps %xmm7, %xmm10 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, Y1 ALIGN_3 .L206: testq $4, MM je .L207 movaps -32 * SIZE(A1), %xmm4 movaps -29 * SIZE(A1, LDA), %xmm5 movaps -30 * SIZE(A2), %xmm6 movaps -31 * SIZE(A2, LDA), %xmm7 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movss %xmm5, %xmm8 shufps $0x39, %xmm8, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 shufps $0x4e, %xmm6, %xmm9 mulps %xmm14, %xmm9 addps %xmm9, %xmm0 movss %xmm7, %xmm10 shufps $0x93, %xmm7, %xmm10 mulps %xmm15, %xmm10 addps %xmm10, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L207: testq $2, MM je .L208 movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(A1, LDA), %xmm5 movsd -32 * SIZE(A2), %xmm6 movsd -32 * SIZE(A2, LDA), %xmm7 movsd -32 * SIZE(Y1), %xmm0 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 addps %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 addps %xmm7, %xmm0 movlps %xmm0, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L208: testq $1, MM je .L209 movss -32 * SIZE(Y1), %xmm0 movss -32 * SIZE(A1), %xmm4 movss -32 * SIZE(A1, LDA), %xmm5 movss -32 * SIZE(A2), %xmm6 movss -32 * SIZE(A2, LDA), %xmm7 mulss %xmm12, %xmm4 addss %xmm4, %xmm0 mulss %xmm13, %xmm5 addss %xmm5, %xmm0 mulss %xmm14, %xmm6 addss %xmm6, %xmm0 mulss %xmm15, %xmm7 addss %xmm7, %xmm0 movss %xmm0, -32 * SIZE(Y1) ALIGN_3 .L209: cmpq $4, N jge .L201 ALIGN_3 .L210: cmpq $3, N jne .L220 leaq 32 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 2), A2 leaq (A, LDA, 4), A movss (X), %xmm12 addq INCX, X movss (X), %xmm13 addq INCX, X movss (X), %xmm14 addq INCX, X movss ALPHA, %xmm0 mulss %xmm0, %xmm12 mulss %xmm0, %xmm13 mulss %xmm0, %xmm14 shufps $0, %xmm12, %xmm12 shufps $0, %xmm13, %xmm13 shufps $0, %xmm14, %xmm14 cmpq $3, M jle .L217 testq $SIZE, A1 je .L21X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A1, LDA), %xmm1 movss -32 * SIZE(A2), %xmm2 movss -32 * SIZE(Y1), %xmm8 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 mulss %xmm13, %xmm1 addss %xmm1, %xmm8 mulss %xmm14, %xmm2 addss %xmm2, %xmm8 movss %xmm8, -32 * SIZE(Y1) addq $1 * SIZE, A1 addq $1 * 
SIZE, A2 addq $1 * SIZE, Y1 ALIGN_3 .L21X: testq $2 * SIZE, A1 je .L21XX movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(A1, LDA), %xmm1 movsd -32 * SIZE(A2), %xmm2 movsd -32 * SIZE(Y1), %xmm8 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 mulps %xmm13, %xmm1 addps %xmm1, %xmm8 mulps %xmm14, %xmm2 addps %xmm2, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L21XX: movaps -33 * SIZE(A1, LDA), %xmm8 movaps -34 * SIZE(A2), %xmm9 movaps -35 * SIZE(A2, LDA), %xmm10 movq MM, I sarq $4, I jle .L215 movaps -32 * SIZE(A1), %xmm4 movaps -28 * SIZE(A1), %xmm5 movaps -24 * SIZE(A1), %xmm6 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) decq I jle .L214 ALIGN_3 .L213: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) #endif mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(A1), %xmm7 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -29 * SIZE(A1, LDA), %xmm4 mulps %xmm12, %xmm6 addps %xmm6, %xmm2 movaps -25 * SIZE(A1, LDA), %xmm5 mulps %xmm12, %xmm7 addps %xmm7, %xmm3 movaps -21 * SIZE(A1, LDA), %xmm6 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) #endif movss %xmm4, %xmm8 shufps $0x39, %xmm8, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -17 * SIZE(A1, LDA), %xmm8 movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 mulps %xmm13, %xmm4 addps %xmm4, %xmm1 movaps -30 * SIZE(A2), %xmm4 movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 mulps %xmm13, %xmm5 addps %xmm5, %xmm2 movaps -26 * SIZE(A2), %xmm5 movss %xmm8, %xmm6 shufps $0x39, %xmm6, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm3 movaps -22 * SIZE(A2), %xmm6 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) #endif shufps $0x4e, %xmm4, %xmm9 mulps %xmm14, %xmm9 addps %xmm9, %xmm0 movaps -18 * SIZE(A2), %xmm9 shufps $0x4e, %xmm5, %xmm4 mulps %xmm14, %xmm4 addps %xmm4, %xmm1 movaps -16 * SIZE(A1), %xmm4 shufps $0x4e, %xmm6, %xmm5 mulps %xmm14, %xmm5 addps %xmm5, %xmm2 movaps -12 * SIZE(A1), %xmm5 shufps $0x4e, %xmm9, %xmm6 mulps %xmm14, %xmm6 addps %xmm6, %xmm3 movaps -8 * SIZE(A1), %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 3 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L213 ALIGN_3 .L214: mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(A1), %xmm7 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -29 * SIZE(A1, LDA), %xmm4 mulps %xmm12, %xmm6 addps %xmm6, %xmm2 movaps -25 * SIZE(A1, LDA), %xmm5 mulps %xmm12, %xmm7 addps %xmm7, %xmm3 movaps -21 * SIZE(A1, LDA), %xmm6 movss %xmm4, %xmm8 shufps $0x39, %xmm8, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -17 * SIZE(A1, LDA), %xmm8 movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 mulps %xmm13, %xmm4 addps %xmm4, %xmm1 movaps -30 * SIZE(A2), %xmm4 movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 mulps %xmm13, %xmm5 addps %xmm5, %xmm2 movaps -26 * SIZE(A2), %xmm5 movss %xmm8, %xmm6 shufps $0x39, %xmm6, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm3 movaps -22 * SIZE(A2), %xmm6 shufps $0x4e, %xmm4, %xmm9 mulps %xmm14, %xmm9 addps %xmm9, %xmm0 movaps -18 * SIZE(A2), %xmm9 shufps $0x4e, %xmm5, %xmm4 mulps %xmm14, %xmm4 addps %xmm4, %xmm1 shufps $0x4e, %xmm6, %xmm5 mulps %xmm14, %xmm5 
addps %xmm5, %xmm2 shufps $0x4e, %xmm9, %xmm6 mulps %xmm14, %xmm6 addps %xmm6, %xmm3 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 ALIGN_3 .L215: testq $8, MM je .L216 movaps -32 * SIZE(A1), %xmm4 movaps -28 * SIZE(A1), %xmm5 movaps -29 * SIZE(A1, LDA), %xmm6 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -30 * SIZE(A2), %xmm4 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -26 * SIZE(A2), %xmm5 movss %xmm6, %xmm8 shufps $0x39, %xmm8, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -25 * SIZE(A1, LDA), %xmm8 movss %xmm8, %xmm6 shufps $0x39, %xmm6, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm1 shufps $0x4e, %xmm4, %xmm9 mulps %xmm14, %xmm9 addps %xmm9, %xmm0 shufps $0x4e, %xmm5, %xmm4 mulps %xmm14, %xmm4 addps %xmm4, %xmm1 movaps %xmm5, %xmm9 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, Y1 ALIGN_3 .L216: testq $4, MM je .L217 movaps -32 * SIZE(A1), %xmm4 movaps -29 * SIZE(A1, LDA), %xmm5 movaps -30 * SIZE(A2), %xmm6 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movss %xmm5, %xmm8 shufps $0x39, %xmm8, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 shufps $0x4e, %xmm6, %xmm9 mulps %xmm14, %xmm9 addps %xmm9, %xmm0 movss %xmm7, %xmm10 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L217: testq $2, MM je .L218 movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(A1, LDA), %xmm5 movsd -32 * SIZE(A2), %xmm6 movsd -32 * SIZE(Y1), %xmm0 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 addps %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 movlps %xmm0, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L218: testq $1, MM je .L990 movss -32 * SIZE(Y1), %xmm0 movss -32 * SIZE(A1), %xmm4 movss -32 * SIZE(A1, LDA), %xmm5 movss -32 * SIZE(A2), %xmm6 mulss %xmm12, %xmm4 addss %xmm4, %xmm0 mulss %xmm13, %xmm5 addss %xmm5, %xmm0 mulss %xmm14, %xmm6 addss %xmm6, %xmm0 movss %xmm0, -32 * SIZE(Y1) jmp .L990 ALIGN_4 .L220: testq N, N jle .L990 cmpq $2, N jne .L230 leaq 32 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 1), A2 leaq (A, LDA, 2), A movss (X), %xmm12 addq INCX, X movss (X), %xmm13 addq INCX, X movss ALPHA, %xmm0 mulss %xmm0, %xmm12 mulss %xmm0, %xmm13 shufps $0, %xmm12, %xmm12 shufps $0, %xmm13, %xmm13 cmpq $3, M jle .L227 testq $SIZE, A1 je .L22X movss -32 * SIZE(Y1), %xmm9 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A2), %xmm1 mulss %xmm12, %xmm0 addss %xmm0, %xmm9 mulss %xmm13, %xmm1 addss %xmm1, %xmm9 movss %xmm9, -32 * SIZE(Y1) addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, Y1 ALIGN_3 .L22X: testq $2 * SIZE, A1 je .L22XX movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(A2), %xmm1 movsd -32 * SIZE(Y1), %xmm9 mulps %xmm12, %xmm0 addps %xmm0, %xmm9 mulps %xmm13, %xmm1 addps %xmm1, %xmm9 movlps %xmm9, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L22XX: movaps -33 * SIZE(A1, LDA), %xmm8 movq MM, I sarq $4, I jle .L225 movaps -32 * SIZE(A1), %xmm4 movaps -28 * SIZE(A1), %xmm5 movaps -24 * SIZE(A1), %xmm6 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) decq I jle .L224 ALIGN_3 .L223: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif mulps %xmm12, %xmm4 
addps %xmm4, %xmm0 movaps -20 * SIZE(A1), %xmm7 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -29 * SIZE(A2), %xmm4 mulps %xmm12, %xmm6 addps %xmm6, %xmm2 movaps -25 * SIZE(A2), %xmm5 mulps %xmm12, %xmm7 addps %xmm7, %xmm3 movaps -21 * SIZE(A2), %xmm6 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) #endif movss %xmm4, %xmm8 shufps $0x39, %xmm8, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -17 * SIZE(A2), %xmm8 movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 mulps %xmm13, %xmm4 addps %xmm4, %xmm1 movaps -16 * SIZE(A1), %xmm4 movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 mulps %xmm13, %xmm5 addps %xmm5, %xmm2 movaps -12 * SIZE(A1), %xmm5 movss %xmm8, %xmm6 shufps $0x39, %xmm6, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm3 movaps -8 * SIZE(A1), %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L223 ALIGN_3 .L224: mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(A1), %xmm7 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -29 * SIZE(A2), %xmm4 mulps %xmm12, %xmm6 addps %xmm6, %xmm2 movaps -25 * SIZE(A2), %xmm5 mulps %xmm12, %xmm7 addps %xmm7, %xmm3 movaps -21 * SIZE(A2), %xmm6 movss %xmm4, %xmm8 shufps $0x39, %xmm8, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -17 * SIZE(A2), %xmm8 movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 mulps %xmm13, %xmm4 addps %xmm4, %xmm1 movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 mulps %xmm13, %xmm5 addps %xmm5, %xmm2 movss %xmm8, %xmm6 shufps $0x39, %xmm6, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm3 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 ALIGN_3 .L225: testq $8, MM je .L226 movaps -32 * SIZE(A1), %xmm4 movaps -28 * SIZE(A1), %xmm5 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -29 * SIZE(A2), %xmm6 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -25 * SIZE(A2), %xmm7 movss %xmm6, %xmm8 shufps $0x39, %xmm8, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movss %xmm7, %xmm6 shufps $0x39, %xmm6, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm1 movaps %xmm7, %xmm8 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, Y1 ALIGN_3 .L226: testq $4, MM je .L227 movaps -32 * SIZE(A1), %xmm4 movaps -29 * SIZE(A2), %xmm5 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movss %xmm5, %xmm8 shufps $0x39, %xmm8, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L227: testq $2, MM je .L228 movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(A2), %xmm1 movsd -32 * SIZE(Y1), %xmm9 mulps %xmm12, %xmm0 addps %xmm0, %xmm9 mulps %xmm13, %xmm1 addps %xmm1, %xmm9 movlps %xmm9, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L228: testq $1, MM je .L990 movss -32 * SIZE(Y1), %xmm9 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A2), %xmm1 mulss %xmm12, %xmm0 addss %xmm0, %xmm9 mulss %xmm13, %xmm1 addss %xmm1, %xmm9 movss %xmm9, -32 * SIZE(Y1) jmp .L990 
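/* Note added for readability; it is not part of the original OpenBLAS source.
   The .L230 block below handles the last remaining column (N == 1) for this
   alignment case: alpha*x[0] is broadcast into xmm12 and y += (alpha*x[0]) *
   (current column of A) is computed 16 floats per unrolled iteration (.L233),
   with 8/4/2/1-element tails (.L235-.L238), before falling through to .L990,
   where the accumulated BUFFER contents are added back into the strided y. */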
ALIGN_3 .L230: cmpq $1, N jne .L990 leaq 32 * SIZE(BUFFER), Y1 movq A, A1 movss (X), %xmm12 mulss ALPHA, %xmm12 shufps $0, %xmm12, %xmm12 cmpq $3, M jle .L237 testq $SIZE, A1 je .L23X movss -32 * SIZE(Y1), %xmm8 movss -32 * SIZE(A1), %xmm0 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 movss %xmm8, -32 * SIZE(Y1) addq $1 * SIZE, A1 addq $1 * SIZE, Y1 ALIGN_3 .L23X: testq $2 * SIZE, A1 je .L23XX movsd -32 * SIZE(Y1), %xmm8 movsd -32 * SIZE(A1), %xmm0 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, Y1 ALIGN_3 .L23XX: testq $2 * SIZE, A1 jne .L230 movq MM, I sarq $4, I jle .L235 movaps -32 * SIZE(A1), %xmm8 movaps -28 * SIZE(A1), %xmm9 movaps -24 * SIZE(A1), %xmm10 movaps -20 * SIZE(A1), %xmm11 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) decq I jle .L234 ALIGN_3 .L233: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) #endif mulps %xmm12, %xmm8 addps %xmm8, %xmm0 movaps -16 * SIZE(A1), %xmm8 mulps %xmm12, %xmm9 addps %xmm9, %xmm1 movaps -12 * SIZE(A1), %xmm9 mulps %xmm12, %xmm10 addps %xmm10, %xmm2 movaps -8 * SIZE(A1), %xmm10 mulps %xmm12, %xmm11 addps %xmm11, %xmm3 movaps -4 * SIZE(A1), %xmm11 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 8 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L233 ALIGN_3 .L234: mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) mulps %xmm12, %xmm9 addps %xmm9, %xmm1 MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) mulps %xmm12, %xmm10 addps %xmm10, %xmm2 MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) mulps %xmm12, %xmm11 addps %xmm11, %xmm3 MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, Y1 ALIGN_3 .L235: testq $8, MM je .L236 movaps -32 * SIZE(A1), %xmm8 movaps -28 * SIZE(A1), %xmm9 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) mulps %xmm12, %xmm9 addps %xmm9, %xmm1 MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) addq $8 * SIZE, A1 addq $8 * SIZE, Y1 ALIGN_3 .L236: testq $4, MM je .L237 movaps -32 * SIZE(A1), %xmm8 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) addq $4 * SIZE, A1 addq $4 * SIZE, Y1 ALIGN_3 .L237: testq $2, MM je .L238 movsd -32 * SIZE(Y1), %xmm8 movsd -32 * SIZE(A1), %xmm0 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, Y1 ALIGN_3 .L238: testq $1, MM je .L990 movss -32 * SIZE(Y1), %xmm8 movss -32 * SIZE(A1), %xmm0 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 movss %xmm8, -32 * SIZE(Y1) jmp .L990 ALIGN_4 .L300: cmpq $4, N jl .L310 ALIGN_3 .L301: subq $4, N leaq 32 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 2), A2 leaq (A, LDA, 4), A movss (X), %xmm12 addq INCX, X movss (X), %xmm13 addq INCX, X movss (X), %xmm14 addq INCX, X movss (X), %xmm15 addq INCX, X movss ALPHA, %xmm0 mulss %xmm0, %xmm12 mulss %xmm0, %xmm13 mulss %xmm0, %xmm14 mulss %xmm0, %xmm15 shufps $0, %xmm12, %xmm12 shufps $0, %xmm13, %xmm13 shufps $0, %xmm14, %xmm14 shufps $0, %xmm15, %xmm15 cmpq $3, M jle .L307 testq $SIZE, A1 je .L30X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A1, LDA), %xmm1 movss -32 * 
SIZE(A2), %xmm2 movss -32 * SIZE(A2, LDA), %xmm3 movss -32 * SIZE(Y1), %xmm8 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 mulss %xmm13, %xmm1 addss %xmm1, %xmm8 mulss %xmm14, %xmm2 addss %xmm2, %xmm8 mulss %xmm15, %xmm3 addss %xmm3, %xmm8 movss %xmm8, -32 * SIZE(Y1) addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, Y1 ALIGN_3 .L30X: testq $2 * SIZE, A1 je .L30XX movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(A1, LDA), %xmm1 movsd -32 * SIZE(A2), %xmm2 movsd -32 * SIZE(A2, LDA), %xmm3 movsd -32 * SIZE(Y1), %xmm8 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 mulps %xmm13, %xmm1 addps %xmm1, %xmm8 mulps %xmm14, %xmm2 addps %xmm2, %xmm8 mulps %xmm15, %xmm3 addps %xmm3, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L30XX: movaps -35 * SIZE(A1, LDA), %xmm8 movaps -34 * SIZE(A2), %xmm9 movaps -33 * SIZE(A2, LDA), %xmm10 movq MM, I sarq $4, I jle .L305 movaps -32 * SIZE(A1), %xmm4 movaps -28 * SIZE(A1), %xmm5 movaps -24 * SIZE(A1), %xmm6 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) decq I jle .L304 ALIGN_3 .L303: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(A1), %xmm7 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -31 * SIZE(A1, LDA), %xmm4 mulps %xmm12, %xmm6 addps %xmm6, %xmm2 movaps -27 * SIZE(A1, LDA), %xmm5 mulps %xmm12, %xmm7 addps %xmm7, %xmm3 movaps -23 * SIZE(A1, LDA), %xmm6 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) #endif movss %xmm4, %xmm8 shufps $0x93, %xmm4, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -19 * SIZE(A1, LDA), %xmm8 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps %xmm13, %xmm4 addps %xmm4, %xmm1 movaps -30 * SIZE(A2), %xmm4 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 mulps %xmm13, %xmm5 addps %xmm5, %xmm2 movaps -26 * SIZE(A2), %xmm5 movss %xmm8, %xmm6 shufps $0x93, %xmm8, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm3 movaps -22 * SIZE(A2), %xmm6 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif shufps $0x4e, %xmm4, %xmm9 mulps %xmm14, %xmm9 addps %xmm9, %xmm0 movaps -18 * SIZE(A2), %xmm9 shufps $0x4e, %xmm5, %xmm4 mulps %xmm14, %xmm4 addps %xmm4, %xmm1 movaps -29 * SIZE(A2, LDA), %xmm4 shufps $0x4e, %xmm6, %xmm5 mulps %xmm14, %xmm5 addps %xmm5, %xmm2 movaps -25 * SIZE(A2, LDA), %xmm5 shufps $0x4e, %xmm9, %xmm6 mulps %xmm14, %xmm6 addps %xmm6, %xmm3 movaps -21 * SIZE(A2, LDA), %xmm6 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) #endif movss %xmm4, %xmm10 shufps $0x39, %xmm10, %xmm10 mulps %xmm15, %xmm10 addps %xmm10, %xmm0 movaps -17 * SIZE(A2, LDA), %xmm10 movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 mulps %xmm15, %xmm4 addps %xmm4, %xmm1 movaps -16 * SIZE(A1), %xmm4 movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 mulps %xmm15, %xmm5 addps %xmm5, %xmm2 movaps -12 * SIZE(A1), %xmm5 movss %xmm10, %xmm6 shufps $0x39, %xmm6, %xmm6 mulps %xmm15, %xmm6 addps %xmm6, %xmm3 movaps -8 * SIZE(A1), %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L303 ALIGN_3 .L304: mulps %xmm12, %xmm4 addps %xmm4, 
%xmm0 movaps -20 * SIZE(A1), %xmm7 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -31 * SIZE(A1, LDA), %xmm4 mulps %xmm12, %xmm6 addps %xmm6, %xmm2 movaps -27 * SIZE(A1, LDA), %xmm5 mulps %xmm12, %xmm7 addps %xmm7, %xmm3 movaps -23 * SIZE(A1, LDA), %xmm6 movss %xmm4, %xmm8 shufps $0x93, %xmm4, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -19 * SIZE(A1, LDA), %xmm8 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps %xmm13, %xmm4 addps %xmm4, %xmm1 movaps -30 * SIZE(A2), %xmm4 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 mulps %xmm13, %xmm5 addps %xmm5, %xmm2 movaps -26 * SIZE(A2), %xmm5 movss %xmm8, %xmm6 shufps $0x93, %xmm8, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm3 movaps -22 * SIZE(A2), %xmm6 shufps $0x4e, %xmm4, %xmm9 mulps %xmm14, %xmm9 addps %xmm9, %xmm0 movaps -18 * SIZE(A2), %xmm9 shufps $0x4e, %xmm5, %xmm4 mulps %xmm14, %xmm4 addps %xmm4, %xmm1 movaps -29 * SIZE(A2, LDA), %xmm4 shufps $0x4e, %xmm6, %xmm5 mulps %xmm14, %xmm5 addps %xmm5, %xmm2 movaps -25 * SIZE(A2, LDA), %xmm5 shufps $0x4e, %xmm9, %xmm6 mulps %xmm14, %xmm6 addps %xmm6, %xmm3 movaps -21 * SIZE(A2, LDA), %xmm6 movss %xmm4, %xmm10 shufps $0x39, %xmm10, %xmm10 mulps %xmm15, %xmm10 addps %xmm10, %xmm0 movaps -17 * SIZE(A2, LDA), %xmm10 movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 mulps %xmm15, %xmm4 addps %xmm4, %xmm1 movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 mulps %xmm15, %xmm5 addps %xmm5, %xmm2 movss %xmm10, %xmm6 shufps $0x39, %xmm6, %xmm6 mulps %xmm15, %xmm6 addps %xmm6, %xmm3 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 ALIGN_3 .L305: testq $8, MM je .L306 movaps -32 * SIZE(A1), %xmm4 movaps -28 * SIZE(A1), %xmm5 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -31 * SIZE(A1, LDA), %xmm6 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -27 * SIZE(A1, LDA), %xmm7 movss %xmm6, %xmm8 shufps $0x93, %xmm6, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -30 * SIZE(A2), %xmm4 movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm1 movaps %xmm7, %xmm8 movaps -26 * SIZE(A2), %xmm5 shufps $0x4e, %xmm4, %xmm9 mulps %xmm14, %xmm9 addps %xmm9, %xmm0 movaps -29 * SIZE(A2, LDA), %xmm6 shufps $0x4e, %xmm5, %xmm4 mulps %xmm14, %xmm4 addps %xmm4, %xmm1 movaps %xmm5, %xmm9 movaps -25 * SIZE(A2, LDA), %xmm7 movss %xmm6, %xmm10 shufps $0x39, %xmm10, %xmm10 mulps %xmm15, %xmm10 addps %xmm10, %xmm0 movss %xmm7, %xmm6 shufps $0x39, %xmm6, %xmm6 mulps %xmm15, %xmm6 addps %xmm6, %xmm1 movaps %xmm7, %xmm10 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, Y1 ALIGN_3 .L306: testq $4, MM je .L307 movaps -32 * SIZE(A1), %xmm4 movaps -31 * SIZE(A1, LDA), %xmm5 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -30 * SIZE(A2), %xmm6 movss %xmm5, %xmm8 shufps $0x93, %xmm5, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -29 * SIZE(A2, LDA), %xmm7 shufps $0x4e, %xmm6, %xmm9 mulps %xmm14, %xmm9 addps %xmm9, %xmm0 movss %xmm7, %xmm10 shufps $0x39, %xmm10, %xmm10 mulps %xmm15, %xmm10 addps %xmm10, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L307: testq $2, MM je .L308 movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(A1, LDA), %xmm5 movsd -32 * SIZE(A2), %xmm6 movsd -32 * SIZE(A2, LDA), %xmm7 movsd -32 * SIZE(Y1), %xmm0 mulps 
%xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 addps %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 mulps %xmm15, %xmm7 addps %xmm7, %xmm0 movlps %xmm0, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L308: testq $1, MM je .L309 movss -32 * SIZE(Y1), %xmm0 movss -32 * SIZE(A1), %xmm4 movss -32 * SIZE(A1, LDA), %xmm5 movss -32 * SIZE(A2), %xmm6 movss -32 * SIZE(A2, LDA), %xmm7 mulss %xmm12, %xmm4 addss %xmm4, %xmm0 mulss %xmm13, %xmm5 addss %xmm5, %xmm0 mulss %xmm14, %xmm6 addss %xmm6, %xmm0 mulss %xmm15, %xmm7 addss %xmm7, %xmm0 movss %xmm0, -32 * SIZE(Y1) ALIGN_3 .L309: cmpq $4, N jge .L301 ALIGN_3 .L310: cmpq $3, N jne .L320 leaq 32 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 2), A2 leaq (A, LDA, 4), A movss (X), %xmm12 addq INCX, X movss (X), %xmm13 addq INCX, X movss (X), %xmm14 addq INCX, X movss ALPHA, %xmm0 mulss %xmm0, %xmm12 mulss %xmm0, %xmm13 mulss %xmm0, %xmm14 shufps $0, %xmm12, %xmm12 shufps $0, %xmm13, %xmm13 shufps $0, %xmm14, %xmm14 cmpq $3, M jle .L317 testq $SIZE, A1 je .L31X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A1, LDA), %xmm1 movss -32 * SIZE(A2), %xmm2 movss -32 * SIZE(Y1), %xmm8 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 mulss %xmm13, %xmm1 addss %xmm1, %xmm8 mulss %xmm14, %xmm2 addss %xmm2, %xmm8 movss %xmm8, -32 * SIZE(Y1) addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, Y1 ALIGN_3 .L31X: testq $2 * SIZE, A1 je .L31XX movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(A1, LDA), %xmm1 movsd -32 * SIZE(A2), %xmm2 movsd -32 * SIZE(Y1), %xmm8 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 mulps %xmm13, %xmm1 addps %xmm1, %xmm8 mulps %xmm14, %xmm2 addps %xmm2, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L31XX: movaps -35 * SIZE(A1, LDA), %xmm8 movaps -34 * SIZE(A2), %xmm9 movaps -33 * SIZE(A2, LDA), %xmm10 movq MM, I sarq $4, I jle .L315 movaps -32 * SIZE(A1), %xmm4 movaps -28 * SIZE(A1), %xmm5 movaps -24 * SIZE(A1), %xmm6 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) decq I jle .L314 ALIGN_3 .L313: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) #endif mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(A1), %xmm7 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -31 * SIZE(A1, LDA), %xmm4 mulps %xmm12, %xmm6 addps %xmm6, %xmm2 movaps -27 * SIZE(A1, LDA), %xmm5 mulps %xmm12, %xmm7 addps %xmm7, %xmm3 movaps -23 * SIZE(A1, LDA), %xmm6 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) #endif movss %xmm4, %xmm8 shufps $0x93, %xmm4, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -19 * SIZE(A1, LDA), %xmm8 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps %xmm13, %xmm4 addps %xmm4, %xmm1 movaps -30 * SIZE(A2), %xmm4 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 mulps %xmm13, %xmm5 addps %xmm5, %xmm2 movaps -26 * SIZE(A2), %xmm5 movss %xmm8, %xmm6 shufps $0x93, %xmm8, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm3 movaps -22 * SIZE(A2), %xmm6 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) #endif shufps $0x4e, %xmm4, %xmm9 mulps %xmm14, %xmm9 addps %xmm9, %xmm0 movaps -18 * SIZE(A2), %xmm9 shufps $0x4e, %xmm5, %xmm4 mulps %xmm14, %xmm4 addps %xmm4, %xmm1 movaps -16 * SIZE(A1), %xmm4 shufps $0x4e, %xmm6, %xmm5 mulps %xmm14, %xmm5 addps %xmm5, %xmm2 movaps -12 * SIZE(A1), %xmm5 shufps $0x4e, %xmm9, %xmm6 mulps %xmm14, %xmm6 addps %xmm6, %xmm3 movaps -8 * SIZE(A1), %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 3 - 128 + 
PREOFFSET(Y1) #endif MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L313 ALIGN_3 .L314: mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(A1), %xmm7 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -31 * SIZE(A1, LDA), %xmm4 mulps %xmm12, %xmm6 addps %xmm6, %xmm2 movaps -27 * SIZE(A1, LDA), %xmm5 mulps %xmm12, %xmm7 addps %xmm7, %xmm3 movaps -23 * SIZE(A1, LDA), %xmm6 movss %xmm4, %xmm8 shufps $0x93, %xmm4, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -19 * SIZE(A1, LDA), %xmm8 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps %xmm13, %xmm4 addps %xmm4, %xmm1 movaps -30 * SIZE(A2), %xmm4 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 mulps %xmm13, %xmm5 addps %xmm5, %xmm2 movaps -26 * SIZE(A2), %xmm5 movss %xmm8, %xmm6 shufps $0x93, %xmm8, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm3 movaps -22 * SIZE(A2), %xmm6 shufps $0x4e, %xmm4, %xmm9 mulps %xmm14, %xmm9 addps %xmm9, %xmm0 movaps -18 * SIZE(A2), %xmm9 shufps $0x4e, %xmm5, %xmm4 mulps %xmm14, %xmm4 addps %xmm4, %xmm1 shufps $0x4e, %xmm6, %xmm5 mulps %xmm14, %xmm5 addps %xmm5, %xmm2 shufps $0x4e, %xmm9, %xmm6 mulps %xmm14, %xmm6 addps %xmm6, %xmm3 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 ALIGN_3 .L315: testq $8, MM je .L316 movaps -32 * SIZE(A1), %xmm4 movaps -28 * SIZE(A1), %xmm5 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -31 * SIZE(A1, LDA), %xmm6 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -27 * SIZE(A1, LDA), %xmm7 movss %xmm6, %xmm8 shufps $0x93, %xmm6, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -30 * SIZE(A2), %xmm4 movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm1 movaps %xmm7, %xmm8 movaps -26 * SIZE(A2), %xmm5 shufps $0x4e, %xmm4, %xmm9 mulps %xmm14, %xmm9 addps %xmm9, %xmm0 shufps $0x4e, %xmm5, %xmm4 mulps %xmm14, %xmm4 addps %xmm4, %xmm1 movaps %xmm5, %xmm9 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, Y1 ALIGN_3 .L316: testq $4, MM je .L317 movaps -32 * SIZE(A1), %xmm4 movaps -31 * SIZE(A1, LDA), %xmm5 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -30 * SIZE(A2), %xmm6 movss %xmm5, %xmm8 shufps $0x93, %xmm5, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 shufps $0x4e, %xmm6, %xmm9 mulps %xmm14, %xmm9 addps %xmm9, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L317: testq $2, MM je .L318 movsd -32 * SIZE(A1), %xmm4 movsd -32 * SIZE(A1, LDA), %xmm5 movsd -32 * SIZE(A2), %xmm6 movsd -32 * SIZE(Y1), %xmm0 mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 addps %xmm5, %xmm0 mulps %xmm14, %xmm6 addps %xmm6, %xmm0 movlps %xmm0, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L318: testq $1, MM je .L990 movss -32 * SIZE(Y1), %xmm0 movss -32 * SIZE(A1), %xmm4 movss -32 * SIZE(A1, LDA), %xmm5 movss -32 * SIZE(A2), %xmm6 mulss %xmm12, %xmm4 addss %xmm4, %xmm0 mulss %xmm13, %xmm5 addss %xmm5, %xmm0 mulss %xmm14, %xmm6 addss %xmm6, %xmm0 movss 
%xmm0, -32 * SIZE(Y1) jmp .L990 ALIGN_3 .L320: cmpq $2, N jne .L330 leaq 32 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 1), A2 movss (X), %xmm12 addq INCX, X movss (X), %xmm13 addq INCX, X movss ALPHA, %xmm0 mulss %xmm0, %xmm12 mulss %xmm0, %xmm13 shufps $0, %xmm12, %xmm12 shufps $0, %xmm13, %xmm13 cmpq $3, M jle .L327 testq $SIZE, A1 je .L32X movss -32 * SIZE(Y1), %xmm9 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A2), %xmm1 mulss %xmm12, %xmm0 addss %xmm0, %xmm9 mulss %xmm13, %xmm1 addss %xmm1, %xmm9 movss %xmm9, -32 * SIZE(Y1) addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, Y1 ALIGN_3 .L32X: testq $2 * SIZE, A1 je .L32XX movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(A2), %xmm1 movsd -32 * SIZE(Y1), %xmm8 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 mulps %xmm13, %xmm1 addps %xmm1, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L32XX: movaps -35 * SIZE(A1, LDA), %xmm8 movq MM, I sarq $4, I jle .L325 movaps -32 * SIZE(A1), %xmm4 movaps -28 * SIZE(A1), %xmm5 movaps -24 * SIZE(A1), %xmm6 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) decq I jle .L324 ALIGN_3 .L323: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(A1), %xmm7 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -31 * SIZE(A2), %xmm4 mulps %xmm12, %xmm6 addps %xmm6, %xmm2 movaps -27 * SIZE(A2), %xmm5 mulps %xmm12, %xmm7 addps %xmm7, %xmm3 movaps -23 * SIZE(A2), %xmm6 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) #endif movss %xmm4, %xmm8 shufps $0x93, %xmm4, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -19 * SIZE(A2), %xmm8 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps %xmm13, %xmm4 addps %xmm4, %xmm1 movaps -16 * SIZE(A1), %xmm4 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 mulps %xmm13, %xmm5 addps %xmm5, %xmm2 movaps -12 * SIZE(A1), %xmm5 movss %xmm8, %xmm6 shufps $0x93, %xmm8, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm3 movaps -8 * SIZE(A1), %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L323 ALIGN_3 .L324: mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(A1), %xmm7 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -31 * SIZE(A2), %xmm4 mulps %xmm12, %xmm6 addps %xmm6, %xmm2 movaps -27 * SIZE(A2), %xmm5 mulps %xmm12, %xmm7 addps %xmm7, %xmm3 movaps -23 * SIZE(A2), %xmm6 movss %xmm4, %xmm8 shufps $0x93, %xmm4, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movaps -19 * SIZE(A2), %xmm8 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 mulps %xmm13, %xmm4 addps %xmm4, %xmm1 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 mulps %xmm13, %xmm5 addps %xmm5, %xmm2 movss %xmm8, %xmm6 shufps $0x93, %xmm8, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm3 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, A2 subq $-16 * SIZE, Y1 ALIGN_3 .L325: testq $8, MM je .L326 movaps -32 * SIZE(A1), %xmm4 movaps -28 * SIZE(A1), %xmm5 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) 
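/* Note added for readability; it is not part of the original OpenBLAS source.
   The MOVUPS_YL1 / MOVUPS_YS1 macros used throughout this kernel are, judging
   by their use, 4-float loads from and stores to the y work buffer; their
   definition appears earlier in this file (outside this excerpt) and presumably
   selects aligned or unaligned moves depending on the build configuration. */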
mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movaps -31 * SIZE(A2), %xmm6 mulps %xmm12, %xmm5 addps %xmm5, %xmm1 movaps -27 * SIZE(A2), %xmm7 movss %xmm6, %xmm8 shufps $0x93, %xmm6, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 mulps %xmm13, %xmm6 addps %xmm6, %xmm1 movaps %xmm7, %xmm8 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, Y1 ALIGN_3 .L326: testq $4, MM je .L327 movaps -32 * SIZE(A1), %xmm4 movaps -31 * SIZE(A2), %xmm5 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) mulps %xmm12, %xmm4 addps %xmm4, %xmm0 movss %xmm5, %xmm8 shufps $0x93, %xmm5, %xmm8 mulps %xmm13, %xmm8 addps %xmm8, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L327: testq $2, MM je .L328 movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(A2), %xmm1 movsd -32 * SIZE(Y1), %xmm8 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 mulps %xmm13, %xmm1 addps %xmm1, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, Y1 ALIGN_3 .L328: testq $1, MM je .L990 movss -32 * SIZE(Y1), %xmm8 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A2), %xmm1 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 mulss %xmm13, %xmm1 addss %xmm1, %xmm8 movss %xmm8, -32 * SIZE(Y1) jmp .L990 ALIGN_3 .L330: cmpq $1, N jne .L990 leaq 32 * SIZE(BUFFER), Y1 movq A, A1 movss (X), %xmm12 mulss ALPHA, %xmm12 shufps $0, %xmm12, %xmm12 cmpq $3, M jle .L337 testq $SIZE, A1 je .L33X movss -32 * SIZE(Y1), %xmm8 movss -32 * SIZE(A1), %xmm0 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 movss %xmm8, -32 * SIZE(Y1) addq $1 * SIZE, A1 addq $1 * SIZE, Y1 ALIGN_3 .L33X: testq $2 * SIZE, A1 je .L33XX movsd -32 * SIZE(Y1), %xmm8 movsd -32 * SIZE(A1), %xmm0 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, Y1 ALIGN_3 .L33XX: movq MM, I sarq $4, I jle .L335 movaps -32 * SIZE(A1), %xmm8 movaps -28 * SIZE(A1), %xmm9 movaps -24 * SIZE(A1), %xmm10 movaps -20 * SIZE(A1), %xmm11 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) decq I jle .L334 ALIGN_3 .L333: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) #endif mulps %xmm12, %xmm8 addps %xmm8, %xmm0 movaps -16 * SIZE(A1), %xmm8 mulps %xmm12, %xmm9 addps %xmm9, %xmm1 movaps -12 * SIZE(A1), %xmm9 mulps %xmm12, %xmm10 addps %xmm10, %xmm2 movaps -8 * SIZE(A1), %xmm10 mulps %xmm12, %xmm11 addps %xmm11, %xmm3 movaps -4 * SIZE(A1), %xmm11 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 8 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, Y1 subq $1, I BRANCH jg .L333 ALIGN_3 .L334: mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) mulps %xmm12, %xmm9 addps %xmm9, %xmm1 MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) mulps %xmm12, %xmm10 addps %xmm10, %xmm2 MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) mulps %xmm12, %xmm11 addps %xmm11, %xmm3 MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) subq $-16 * SIZE, A1 subq $-16 * SIZE, Y1 ALIGN_3 .L335: testq $8, MM je .L336 movaps -32 * SIZE(A1), %xmm8 movaps -28 * SIZE(A1), %xmm9 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, 
%xmm0) mulps %xmm12, %xmm9 addps %xmm9, %xmm1 MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) addq $8 * SIZE, A1 addq $8 * SIZE, Y1 ALIGN_3 .L336: testq $4, MM je .L337 movaps -32 * SIZE(A1), %xmm8 MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) mulps %xmm12, %xmm8 addps %xmm8, %xmm0 MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) addq $4 * SIZE, A1 addq $4 * SIZE, Y1 ALIGN_3 .L337: testq $2, MM je .L338 movsd -32 * SIZE(Y1), %xmm8 movsd -32 * SIZE(A1), %xmm0 mulps %xmm12, %xmm0 addps %xmm0, %xmm8 movlps %xmm8, -32 * SIZE(Y1) addq $2 * SIZE, A1 addq $2 * SIZE, Y1 ALIGN_3 .L338: testq $1, MM je .L990 movss -32 * SIZE(Y1), %xmm8 movss -32 * SIZE(A1), %xmm0 mulss %xmm12, %xmm0 addss %xmm0, %xmm8 movss %xmm8, -32 * SIZE(Y1) jmp .L990 #endif ALIGN_4 .L990: movq Y, Y1 movq M, %rax sarq $3, %rax jle .L994 ALIGN_3 .L992: movsd 0 * SIZE(BUFFER), %xmm0 movhps 2 * SIZE(BUFFER), %xmm0 movsd 4 * SIZE(BUFFER), %xmm4 movhps 6 * SIZE(BUFFER), %xmm4 pshufd $0x01, %xmm0, %xmm1 pshufd $0x02, %xmm0, %xmm2 pshufd $0x03, %xmm0, %xmm3 pshufd $0x01, %xmm4, %xmm5 pshufd $0x02, %xmm4, %xmm6 pshufd $0x03, %xmm4, %xmm7 addss (Y), %xmm0 addq INCY, Y addss (Y), %xmm1 addq INCY, Y addss (Y), %xmm2 addq INCY, Y addss (Y), %xmm3 addq INCY, Y addss (Y), %xmm4 addq INCY, Y addss (Y), %xmm5 addq INCY, Y addss (Y), %xmm6 addq INCY, Y addss (Y), %xmm7 addq INCY, Y movss %xmm0, (Y1) addq INCY, Y1 movss %xmm1, (Y1) addq INCY, Y1 movss %xmm2, (Y1) addq INCY, Y1 movss %xmm3, (Y1) addq INCY, Y1 movss %xmm4, (Y1) addq INCY, Y1 movss %xmm5, (Y1) addq INCY, Y1 movss %xmm6, (Y1) addq INCY, Y1 movss %xmm7, (Y1) addq INCY, Y1 addq $8 * SIZE, BUFFER decq %rax jg .L992 ALIGN_3 .L994: testq $7, M jle .L999 testq $4, M jle .L995 movsd 0 * SIZE(BUFFER), %xmm0 movhps 2 * SIZE(BUFFER), %xmm0 pshufd $0x01, %xmm0, %xmm1 pshufd $0x02, %xmm0, %xmm2 pshufd $0x03, %xmm0, %xmm3 addss (Y), %xmm0 addq INCY, Y addss (Y), %xmm1 addq INCY, Y addss (Y), %xmm2 addq INCY, Y addss (Y), %xmm3 addq INCY, Y movss %xmm0, (Y1) addq INCY, Y1 movss %xmm1, (Y1) addq INCY, Y1 movss %xmm2, (Y1) addq INCY, Y1 movss %xmm3, (Y1) addq INCY, Y1 addq $4 * SIZE, BUFFER ALIGN_3 .L995: testq $2, M jle .L996 movsd (BUFFER), %xmm0 pshufd $0x01, %xmm0, %xmm1 addss (Y), %xmm0 addq INCY, Y addss (Y), %xmm1 addq INCY, Y movss %xmm0, (Y1) addq INCY, Y1 movss %xmm1, (Y1) addq INCY, Y1 addq $2 * SIZE, BUFFER ALIGN_3 .L996: testq $1, M jle .L999 movss (BUFFER), %xmm0 addss (Y), %xmm0 movss %xmm0, (Y1) ALIGN_3 .L999: leaq (,M,SIZE),%rax addq %rax,AA jmp .L0t ALIGN_4 .L999x: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/sgemv_n.c000066400000000000000000000106421313527062700171670ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #define NBMAX 4096 #ifndef HAVE_KERNEL_16x4 static void sgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG i; FLOAT *a0,*a1,*a2,*a3; a0 = ap[0]; a1 = ap[1]; a2 = ap[2]; a3 = ap[3]; for ( i=0; i< n; i+=4 ) { y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; } } #endif static void sgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { BLASLONG i; FLOAT *a0; a0 = ap; for ( i=0; i< n; i+=4 ) { y[i] += a0[i]*x[0]; y[i+1] += a0[i+1]*x[0]; y[i+2] += a0[i+2]*x[0]; y[i+3] += a0[i+3]*x[0]; } } static void zero_y(BLASLONG n, FLOAT *dest) { BLASLONG i; for ( i=0; i> 3 ; n2 = n & 7 ; } else { n1 = n >> 2 ; n2 = n & 3 ; } m3 = m & 3 ; m1 = m & -4 ; m2 = (m & (NBMAX-1)) - m3 ; y_ptr = y; BLASLONG NB = NBMAX; while ( NB == NBMAX ) { m1 -= NB; if ( m1 < 0) { if ( m2 == 0 ) break; NB = m2; } a_ptr = a; x_ptr = x; ap[0] = a_ptr; ap[1] = a_ptr + lda; ap[2] = ap[1] + lda; ap[3] = ap[2] + lda; if ( inc_y != 1 ) memset(ybuffer,0,NB*4); else ybuffer = y_ptr; if ( inc_x == 1 ) { for( i = 0; i < n1 ; i++) { sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); ap[0] += lda8; ap[1] += lda8; ap[2] += lda8; ap[3] += lda8; a_ptr += lda8; x_ptr += 8; } if ( n2 & 4 ) { sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); ap[0] += lda4; ap[1] += lda4; ap[2] += lda4; ap[3] += lda4; a_ptr += lda4; x_ptr += 4; } if ( n2 & 2 ) { sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); a_ptr += lda*2; x_ptr += 2; } if ( n2 & 1 ) { sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); a_ptr += lda; x_ptr += 1; } } else { for( i = 0; i < n1 ; i++) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; xbuffer[1] = x_ptr[0]; x_ptr += inc_x; xbuffer[2] = x_ptr[0]; x_ptr += inc_x; xbuffer[3] = x_ptr[0]; x_ptr += inc_x; sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); ap[0] += lda4; ap[1] += lda4; ap[2] += lda4; ap[3] += lda4; a_ptr += lda4; } for( i = 0; i < n2 ; i++) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); a_ptr += lda; } } a += NB; if ( inc_y != 1 ) { add_y(NB,ybuffer,y_ptr,inc_y); y_ptr += NB * inc_y; } else y_ptr += NB ; } if ( m3 == 0 ) return(0); if 
( m3 == 3 ) { a_ptr = a; x_ptr = x; FLOAT temp0 = 0.0; FLOAT temp1 = 0.0; FLOAT temp2 = 0.0; if ( lda == 3 && inc_x ==1 ) { for( i = 0; i < ( n & -4 ); i+=4 ) { temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; a_ptr += 12; x_ptr += 4; } for( ; i < n; i++ ) { temp0 += a_ptr[0] * x_ptr[0]; temp1 += a_ptr[1] * x_ptr[0]; temp2 += a_ptr[2] * x_ptr[0]; a_ptr += 3; x_ptr ++; } } else { for( i = 0; i < n; i++ ) { temp0 += a_ptr[0] * x_ptr[0]; temp1 += a_ptr[1] * x_ptr[0]; temp2 += a_ptr[2] * x_ptr[0]; a_ptr += lda; x_ptr += inc_x; } } y_ptr[0] += alpha * temp0; y_ptr += inc_y; y_ptr[0] += alpha * temp1; y_ptr += inc_y; y_ptr[0] += alpha * temp2; return(0); } if ( m3 == 2 ) { a_ptr = a; x_ptr = x; FLOAT temp0 = 0.0; FLOAT temp1 = 0.0; if ( lda == 2 && inc_x ==1 ) { for( i = 0; i < (n & -4) ; i+=4 ) { temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; a_ptr += 8; x_ptr += 4; } for( ; i < n; i++ ) { temp0 += a_ptr[0] * x_ptr[0]; temp1 += a_ptr[1] * x_ptr[0]; a_ptr += 2; x_ptr ++; } } else { for( i = 0; i < n; i++ ) { temp0 += a_ptr[0] * x_ptr[0]; temp1 += a_ptr[1] * x_ptr[0]; a_ptr += lda; x_ptr += inc_x; } } y_ptr[0] += alpha * temp0; y_ptr += inc_y; y_ptr[0] += alpha * temp1; return(0); } if ( m3 == 1 ) { a_ptr = a; x_ptr = x; FLOAT temp = 0.0; if ( lda == 1 && inc_x ==1 ) { for( i = 0; i < (n & -4); i+=4 ) { temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; } for( ; i < n; i++ ) { temp += a_ptr[i] * x_ptr[i]; } } else { for( i = 0; i < n; i++ ) { temp += a_ptr[0] * x_ptr[0]; a_ptr += lda; x_ptr += inc_x; } } y_ptr[0] += alpha * temp; return(0); } return(0); } OpenBLAS-0.2.20/kernel/x86_64/sgemv_n_microk_bulldozer-4.c000066400000000000000000000234651313527062700227650ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_4x8 1 static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vbroadcastss (%2), %%xmm12 \n\t" // x0 "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 "vbroadcastss 16(%2), %%xmm0 \n\t" // x4 "vbroadcastss 20(%2), %%xmm1 \n\t" // x5 "vbroadcastss 24(%2), %%xmm2 \n\t" // x6 "vbroadcastss 28(%2), %%xmm3 \n\t" // x7 "vbroadcastss (%9), %%xmm8 \n\t" // alpha "testq $0x04, %1 \n\t" "jz 2f \n\t" "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" "addq $4 , %0 \n\t" "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t" "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t" "addq $4 , %8 \n\t" "vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t" "vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" "subq $4 , %1 \n\t" "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y "2: \n\t" "testq $0x08, %1 \n\t" "jz 3f \n\t" "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y "addq $8 , %0 \n\t" "addq $8 , %8 \n\t" "subq $8 , %1 \n\t" "3: \n\t" "cmpq $0, %1 \n\t" "je 4f \n\t" ".align 16 \n\t" "1: \n\t" "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" "vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t" "vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t" "prefetcht0 192(%4,%0,4) \n\t" "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" 
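/* Note added for readability; it is not part of the original OpenBLAS source.
   This "1:" loop updates 16 elements of y per iteration: columns 0-3 are read
   through (%4..%7,%0,4) and columns 4-7 through (%4..%7,%8,4), where %8 runs
   lda4 ahead of the element index %0; the FMA4 vfmaddps instructions accumulate
   a[i]*x[j] into xmm4-xmm7, which are then multiplied by alpha (xmm8) and added
   to the existing y values before being stored back. */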
"prefetcht0 192(%5,%0,4) \n\t" "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" "prefetcht0 192(%6,%0,4) \n\t" "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" "prefetcht0 192(%7,%0,4) \n\t" "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" ".align 2 \n\t" "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t" "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t" "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t" "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t" "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t" "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t" "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t" "prefetcht0 192(%4,%8,4) \n\t" "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" "prefetcht0 192(%5,%8,4) \n\t" "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" "prefetcht0 192(%6,%8,4) \n\t" "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" "prefetcht0 192(%7,%8,4) \n\t" "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t" "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t" "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t" "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t" "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t" "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t" "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t" "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t" "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" "vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" "vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" "addq $16, %0 \n\t" "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y "addq $16, %8 \n\t" "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y "subq $16, %1 \n\t" "jnz 1b \n\t" "4: \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 "r" (lda4), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #define HAVE_KERNEL_4x4 1 static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vbroadcastss (%2), %%xmm12 \n\t" // x0 "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 "vbroadcastss (%8), %%xmm8 \n\t" // alpha ".align 16 \n\t" "1: \n\t" "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm6 \n\t" "vmovups %%xmm6, (%3,%0,4) \n\t" // 4 * y "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" "jnz 
1b \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/sgemv_n_microk_haswell-4.c000066400000000000000000000243201313527062700224110ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_4x8 1 static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vbroadcastss (%2), %%ymm12 \n\t" // x0 "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 "vbroadcastss (%9), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" "jz 2f \n\t" "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t" "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t" "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t" "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t" "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y "addq $4 , %8 \n\t" "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" "2: \n\t" "testq $0x08, %1 \n\t" "jz 3f \n\t" "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t" "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y "addq $8 , %8 \n\t" "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" "3: \n\t" "cmpq $0, %1 \n\t" "je 4f \n\t" // ".align 16 \n\t" "1: \n\t" "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" "addq $16, %0 \n\t" "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" "addq $16, %8 \n\t" "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y "subq $16, %1 \n\t" "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y "jnz 1b \n\t" "4: \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" 
(n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 "r" (lda4), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #define HAVE_KERNEL_4x4 1 static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vbroadcastss (%2), %%ymm12 \n\t" // x0 "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 "vbroadcastss (%8), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" "jz 2f \n\t" "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" "2: \n\t" "testq $0x08, %1 \n\t" "jz 3f \n\t" "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" "3: \n\t" "cmpq $0, %1 \n\t" "je 4f \n\t" // ".align 16 \n\t" "1: \n\t" "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" "vmovups %%ymm8, (%3,%0,4) \n\t" // 8 * y "vmovups %%ymm9, 32(%3,%0,4) \n\t" // 8 * y "addq $16, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" "4: \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/sgemv_n_microk_nehalem-4.c000066400000000000000000000152361313527062700223710ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_4x8 1 static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "movss (%2), %%xmm12 \n\t" // x0 "movss 4(%2), %%xmm13 \n\t" // x1 "movss 8(%2), %%xmm14 \n\t" // x2 "movss 12(%2), %%xmm15 \n\t" // x3 "shufps $0, %%xmm12, %%xmm12\n\t" "shufps $0, %%xmm13, %%xmm13\n\t" "shufps $0, %%xmm14, %%xmm14\n\t" "shufps $0, %%xmm15, %%xmm15\n\t" "movss 16(%2), %%xmm0 \n\t" // x4 "movss 20(%2), %%xmm1 \n\t" // x5 "movss 24(%2), %%xmm2 \n\t" // x6 "movss 28(%2), %%xmm3 \n\t" // x7 "shufps $0, %%xmm0 , %%xmm0 \n\t" "shufps $0, %%xmm1 , %%xmm1 \n\t" "shufps $0, %%xmm2 , %%xmm2 \n\t" "shufps $0, %%xmm3 , %%xmm3 \n\t" "movss (%9), %%xmm6 \n\t" // alpha "shufps $0, %%xmm6 , %%xmm6 \n\t" ".align 16 \n\t" "1: \n\t" "xorps %%xmm4 , %%xmm4 \n\t" "xorps %%xmm5 , %%xmm5 \n\t" "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y ".align 2 \n\t" "movups (%4,%0,4), %%xmm8 \n\t" "movups (%5,%0,4), %%xmm9 \n\t" "movups (%6,%0,4), %%xmm10 \n\t" "movups (%7,%0,4), %%xmm11 \n\t" ".align 2 \n\t" "mulps %%xmm12, %%xmm8 \n\t" "mulps %%xmm13, %%xmm9 \n\t" "mulps %%xmm14, %%xmm10 \n\t" "mulps %%xmm15, %%xmm11 \n\t" "addps %%xmm8 , %%xmm4 \n\t" "addps %%xmm9 , %%xmm5 \n\t" "addps %%xmm10, %%xmm4 \n\t" "addps %%xmm11, %%xmm5 \n\t" "movups (%4,%8,4), %%xmm8 \n\t" "movups (%5,%8,4), %%xmm9 \n\t" "movups (%6,%8,4), %%xmm10 \n\t" "movups (%7,%8,4), %%xmm11 \n\t" ".align 2 \n\t" "mulps %%xmm0 , %%xmm8 \n\t" "mulps %%xmm1 , %%xmm9 \n\t" "mulps %%xmm2 , %%xmm10 \n\t" "mulps %%xmm3 , %%xmm11 \n\t" "addps %%xmm8 , %%xmm4 \n\t" "addps %%xmm9 , %%xmm5 \n\t" "addps %%xmm10, %%xmm4 \n\t" "addps %%xmm11, %%xmm5 \n\t" "addq $4 , %8 \n\t" "addps %%xmm5 , %%xmm4 \n\t" "addq $4 , %0 \n\t" "mulps %%xmm6 , %%xmm4 \n\t" "subq $4 , %1 \n\t" "addps %%xmm4 , %%xmm7 \n\t" "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y "jnz 1b \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 "r" (lda4), // 8 "r" (alpha) // 9 
: "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #define HAVE_KERNEL_4x4 1 static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "movss (%2), %%xmm12 \n\t" // x0 "movss 4(%2), %%xmm13 \n\t" // x1 "movss 8(%2), %%xmm14 \n\t" // x2 "movss 12(%2), %%xmm15 \n\t" // x3 "shufps $0, %%xmm12, %%xmm12\n\t" "shufps $0, %%xmm13, %%xmm13\n\t" "shufps $0, %%xmm14, %%xmm14\n\t" "shufps $0, %%xmm15, %%xmm15\n\t" "movss (%8), %%xmm6 \n\t" // alpha "shufps $0, %%xmm6 , %%xmm6 \n\t" ".align 16 \n\t" "1: \n\t" "xorps %%xmm4 , %%xmm4 \n\t" "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y "movups (%4,%0,4), %%xmm8 \n\t" "movups (%5,%0,4), %%xmm9 \n\t" "movups (%6,%0,4), %%xmm10 \n\t" "movups (%7,%0,4), %%xmm11 \n\t" "mulps %%xmm12, %%xmm8 \n\t" "mulps %%xmm13, %%xmm9 \n\t" "mulps %%xmm14, %%xmm10 \n\t" "mulps %%xmm15, %%xmm11 \n\t" "addps %%xmm8 , %%xmm4 \n\t" "addq $4 , %0 \n\t" "addps %%xmm9 , %%xmm4 \n\t" "subq $4 , %1 \n\t" "addps %%xmm10 , %%xmm4 \n\t" "addps %%xmm4 , %%xmm11 \n\t" "mulps %%xmm6 , %%xmm11 \n\t" "addps %%xmm7 , %%xmm11 \n\t" "movups %%xmm11, -16(%3,%0,4) \n\t" // 4 * y "jnz 1b \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/sgemv_n_microk_sandy-4.c000066400000000000000000000304361313527062700220750ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_4x8 1 static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vbroadcastss (%2), %%ymm12 \n\t" // x0 "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 "vbroadcastss (%9), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" "jz 2f \n\t" "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t" "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t" "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" "vaddps %%xmm5, %%xmm4 , %%xmm4 \n\t" "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y "addq $4, %8 \n\t" "addq $4, %0 \n\t" "subq $4, %1 \n\t" "2: \n\t" "testq $0x08, %1 \n\t" "jz 3f \n\t" "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t" "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" "vaddps %%ymm5, %%ymm4 , %%ymm4 \n\t" "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y "addq $8, %8 \n\t" "addq $8, %0 \n\t" "subq $8, %1 \n\t" "3: \n\t" "cmpq $0, %1 \n\t" "je 4f \n\t" ".align 16 \n\t" "1: \n\t" "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" "prefetcht0 192(%4,%0,4) \n\t" "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" "prefetcht0 192(%5,%0,4) \n\t" "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" "prefetcht0 192(%6,%0,4) \n\t" "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" "prefetcht0 192(%7,%0,4) \n\t" "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 
\n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" "prefetcht0 192(%4,%8,4) \n\t" "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t" "prefetcht0 192(%5,%8,4) \n\t" "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" "prefetcht0 192(%6,%8,4) \n\t" "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t" "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t" "prefetcht0 192(%7,%8,4) \n\t" "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t" "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y "addq $16, %8 \n\t" "addq $16, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" "4: \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 "r" (lda4), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #define HAVE_KERNEL_4x4 1 static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vbroadcastss (%2), %%ymm12 \n\t" // x0 "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 "vbroadcastss (%8), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" "jz 2f \n\t" "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" "vaddps %%xmm5, %%xmm4 , %%xmm4 \n\t" "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y "addq $4, %0 \n\t" "subq $4, %1 \n\t" "2: \n\t" "testq $0x08, %1 \n\t" "jz 3f \n\t" "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" "vaddps %%ymm5, %%ymm4 , %%ymm4 \n\t" "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y "addq $8, %0 \n\t" "subq $8, %1 \n\t" "3: \n\t" "cmpq $0, %1 \n\t" "je 4f \n\t" ".align 16 \n\t" "1: \n\t" 
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" "vmovups (%3,%0,4), %%ymm0 \n\t" // 8 * y "vmovups 32(%3,%0,4), %%ymm1 \n\t" // 8 * y "prefetcht0 192(%4,%0,4) \n\t" "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" "prefetcht0 192(%5,%0,4) \n\t" "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" "prefetcht0 192(%6,%0,4) \n\t" "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" "prefetcht0 192(%7,%0,4) \n\t" "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm0 , %%ymm0 \n\t" "vaddps %%ymm5, %%ymm1 , %%ymm1 \n\t" "vmovups %%ymm0, (%3,%0,4) \n\t" // 8 * y "vmovups %%ymm1, 32(%3,%0,4) \n\t" // 8 * y "addq $16, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" "4: \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/sgemv_t.S000066400000000000000000003536051313527062700171660ustar00rootroot00000000000000/* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "l2param.h" #if GEMV_UNROLL < 4 #undef GEMV_UNROLL #define GEMV_UNROLL 4 #endif #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_M %rdi #define OLD_N %rsi #define OLD_A %rcx #define OLD_LDA %r8 #define STACK_INCX 8 + STACKSIZE(%rsp) #define STACK_Y 16 + STACKSIZE(%rsp) #define STACK_INCY 24 + STACKSIZE(%rsp) #define STACK_BUFFER 32 + STACKSIZE(%rsp) #define MMM 56(%rsp) #define NN 64(%rsp) #define AA 72(%rsp) #define LDAX 80(%rsp) #else #define STACKSIZE 288 #define OLD_M %rcx #define OLD_N %rdx #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_LDA 48 + STACKSIZE(%rsp) #define OLD_X 56 + STACKSIZE(%rsp) #define STACK_INCX 64 + STACKSIZE(%rsp) #define STACK_Y 72 + STACKSIZE(%rsp) #define STACK_INCY 80 + STACKSIZE(%rsp) #define STACK_BUFFER 88 + STACKSIZE(%rsp) #define MMM 232(%rsp) #define NN 240(%rsp) #define AA 248(%rsp) #define LDAX 256(%rsp) #endif #define LDA %r8 #define X %r9 #define INCX %rsi #define INCY %rdi #define M %r10 #define N %r11 #define A %r12 #define Y %r14 #define BUFFER %r13 #define I %rax #define A1 %rbx #define A2 %rcx #define LDA3 %rdx #define X1 %rbp #define Y1 INCX #ifdef ALIGNED_ACCESS #define MM %r15 #else #define MM M #endif #define ALPHA %xmm7 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq OLD_M, MMM movq OLD_N, NN movq OLD_A, X movq X, AA movq OLD_LDA, X movq X, LDAX movq OLD_X, X #else movq OLD_M, MMM movq OLD_N, NN movq OLD_A, AA movq OLD_LDA, LDAX #endif #ifndef WINDOWS_ABI pshufd $0, %xmm0, ALPHA #else pshufd $0, %xmm3, ALPHA #endif .L0t: xorq M,M addq $1,M salq $22,M subq M,MMM jge .L00t ALIGN_4 movq MMM,%rax addq M,%rax jle .L999x movq %rax,M .L00t: movq LDAX,LDA movq NN,N movq AA,A movq STACK_INCX, INCX movq STACK_Y, Y movq STACK_INCY, INCY movq STACK_BUFFER, BUFFER leaq (,INCX, SIZE), INCX leaq (,INCY, SIZE), INCY leaq (,LDA, SIZE), LDA leaq (LDA, LDA, 2), LDA3 #ifdef ALIGNED_ACCESS movq M, MM testq $4 * SIZE - 1, A je .L0X cmpq $3, M jle .L0X movq A, MM sarq $BASE_SHIFT, MM andq $3, MM subq $4, MM addq M, MM .L0X: #endif testq M, M jle .L999 testq N, N jle .L999 ALIGN_4 subq $-32 * SIZE, A #ifdef ALIGNED_ACCESS movq A, %rax andq $4 * SIZE - 1, %rax addq %rax, BUFFER #endif movq BUFFER, X1 movq M, I sarq $3, I jle .L05 ALIGN_4 .L02: movss (X), %xmm0 addq INCX, X movss (X), %xmm1 addq INCX, X movss (X), %xmm2 addq INCX, X movss (X), %xmm3 addq INCX, X movss (X), %xmm4 addq INCX, X movss (X), %xmm5 addq INCX, X movss (X), %xmm6 addq INCX, X movss (X), %xmm8 addq INCX, X movss %xmm0, 0 * SIZE(X1) movss %xmm1, 1 * SIZE(X1) movss %xmm2, 2 * SIZE(X1) movss %xmm3, 3 * SIZE(X1) movss %xmm4, 4 * SIZE(X1) movss %xmm5, 5 * SIZE(X1) movss %xmm6, 6 * SIZE(X1) movss %xmm8, 7 * SIZE(X1) addq $8 * SIZE, X1 decq I jg .L02 ALIGN_4 .L05: movq M, I andq $7, I jle .L10 ALIGN_2 .L06: movss (X), %xmm0 addq INCX, X movss %xmm0, 0 * SIZE(X1) addq $SIZE, X1 decq I jg .L06 ALIGN_4 .L10: movq Y, Y1 #ifdef ALIGNED_ACCESS testq $4 * SIZE - 1, LDA jne .L100 #endif #if GEMV_UNROLL >= 8 cmpq $8, N jl .L20 ALIGN_3 .L11: subq $8, N leaq 32 * SIZE(BUFFER), 
X1 movq A, A1 leaq (A1, LDA, 4), A2 leaq (A1, LDA, 8), A xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 xorps %xmm12, %xmm12 xorps %xmm13, %xmm13 xorps %xmm14, %xmm14 xorps %xmm15, %xmm15 #ifdef ALIGNED_ACCESS cmpq $3, M jle .L17 testq $SIZE, A1 je .L1X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A1, LDA, 1), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 movss -32 * SIZE(A1, LDA, 2), %xmm2 mulss %xmm4, %xmm2 addss %xmm2, %xmm10 movss -32 * SIZE(A1, LDA3, 1), %xmm3 mulss %xmm4, %xmm3 addss %xmm3, %xmm11 movss -32 * SIZE(A2), %xmm0 mulss %xmm4, %xmm0 addss %xmm0, %xmm12 movss -32 * SIZE(A2, LDA, 1), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm13 movss -32 * SIZE(A2, LDA, 2), %xmm2 mulss %xmm4, %xmm2 addss %xmm2, %xmm14 movss -32 * SIZE(A2, LDA3, 1), %xmm3 mulss %xmm4, %xmm3 addss %xmm3, %xmm15 addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, X1 ALIGN_3 .L1X: testq $2 * SIZE, A1 je .L1XX #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A1, LDA, 1), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd -32 * SIZE(A1, LDA, 2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd -32 * SIZE(A1, LDA3, 1), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm11 movsd -32 * SIZE(A2), %xmm0 mulps %xmm4, %xmm0 addps %xmm0, %xmm12 movsd -32 * SIZE(A2, LDA, 1), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm13 movsd -32 * SIZE(A2, LDA, 2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm14 movsd -32 * SIZE(A2, LDA3, 1), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm15 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_3 .L1XX: #endif MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) #ifdef PREFETCHW PREFETCHW 8 * SIZE(Y1) #endif movq MM, I sarq $4, I jle .L15 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm2) MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm3) decq I jle .L13 ALIGN_4 .L12: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-32 * SIZE, A2, %xmm0) mulps %xmm4, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm1) mulps %xmm4, %xmm2 addps %xmm2, %xmm10 MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm2) mulps %xmm4, %xmm3 addps %xmm3, %xmm11 MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm3) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 1) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm12 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) mulps %xmm4, %xmm1 addps %xmm1, %xmm13 MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm4, %xmm2 addps %xmm2, %xmm14 MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm2) mulps %xmm4, %xmm3 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm3, %xmm15 MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm3) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 2) #endif mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A2, %xmm0) mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm1) mulps %xmm5, %xmm2 addps %xmm2, %xmm10 MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm2) mulps %xmm5, %xmm3 addps %xmm3, %xmm11 MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm3) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA3) #endif mulps %xmm5, %xmm0 addps %xmm0, %xmm12 MOVUPS_A1 (-24 * SIZE, A1, 
%xmm0) mulps %xmm5, %xmm1 addps %xmm1, %xmm13 MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm5, %xmm2 addps %xmm2, %xmm14 MOVUPS_A2 (-24 * SIZE, A1, LDA, 2, %xmm2) mulps %xmm5, %xmm3 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm3, %xmm15 MOVUPS_A2 (-24 * SIZE, A1, LDA3, 1, %xmm3) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A2, %xmm0) mulps %xmm4, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm1) mulps %xmm4, %xmm2 addps %xmm2, %xmm10 MOVUPS_A2 (-24 * SIZE, A2, LDA, 2, %xmm2) mulps %xmm4, %xmm3 addps %xmm3, %xmm11 MOVUPS_A2 (-24 * SIZE, A2, LDA3, 1, %xmm3) mulps %xmm4, %xmm0 addps %xmm0, %xmm12 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 1) #endif MOVUPS_A1 (-20 * SIZE, A1, %xmm0) mulps %xmm4, %xmm1 addps %xmm1, %xmm13 MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm4, %xmm2 addps %xmm2, %xmm14 MOVUPS_A2 (-20 * SIZE, A1, LDA, 2, %xmm2) mulps %xmm4, %xmm3 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm3, %xmm15 MOVUPS_A2 (-20 * SIZE, A1, LDA3, 1, %xmm3) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 2) #endif mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-20 * SIZE, A2, %xmm0) mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm1) mulps %xmm5, %xmm2 addps %xmm2, %xmm10 MOVUPS_A2 (-20 * SIZE, A2, LDA, 2, %xmm2) mulps %xmm5, %xmm3 addps %xmm3, %xmm11 MOVUPS_A2 (-20 * SIZE, A2, LDA3, 1, %xmm3) mulps %xmm5, %xmm0 addps %xmm0, %xmm12 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA3) #endif MOVUPS_A1 (-16 * SIZE, A1, %xmm0) mulps %xmm5, %xmm1 addps %xmm1, %xmm13 MOVUPS_A2 (-16 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm5, %xmm2 addps %xmm2, %xmm14 #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) #endif MOVUPS_A2 (-16 * SIZE, A1, LDA, 2, %xmm2) mulps %xmm5, %xmm3 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm3, %xmm15 MOVUPS_A2 (-16 * SIZE, A1, LDA3, 1, %xmm3) addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 decq I jg .L12 ALIGN_4 .L13: mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-32 * SIZE, A2, %xmm0) mulps %xmm4, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm1) mulps %xmm4, %xmm2 addps %xmm2, %xmm10 MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm2) mulps %xmm4, %xmm3 addps %xmm3, %xmm11 MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm3) mulps %xmm4, %xmm0 addps %xmm0, %xmm12 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) mulps %xmm4, %xmm1 addps %xmm1, %xmm13 MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm4, %xmm2 addps %xmm2, %xmm14 MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm2) mulps %xmm4, %xmm3 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm3, %xmm15 MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm3) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A2, %xmm0) mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm1) mulps %xmm5, %xmm2 addps %xmm2, %xmm10 MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm2) mulps %xmm5, %xmm3 addps %xmm3, %xmm11 MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm3) mulps %xmm5, %xmm0 addps %xmm0, %xmm12 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) mulps %xmm5, %xmm1 addps %xmm1, %xmm13 MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm5, %xmm2 addps %xmm2, %xmm14 MOVUPS_A2 (-24 * SIZE, A1, LDA, 2, %xmm2) mulps %xmm5, %xmm3 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm3, %xmm15 MOVUPS_A2 (-24 * SIZE, A1, LDA3, 1, %xmm3) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A2, %xmm0) mulps %xmm4, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-24 * SIZE, A2, 
LDA, 1, %xmm1) mulps %xmm4, %xmm2 addps %xmm2, %xmm10 MOVUPS_A2 (-24 * SIZE, A2, LDA, 2, %xmm2) mulps %xmm4, %xmm3 addps %xmm3, %xmm11 MOVUPS_A2 (-24 * SIZE, A2, LDA3, 1, %xmm3) mulps %xmm4, %xmm0 addps %xmm0, %xmm12 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) mulps %xmm4, %xmm1 addps %xmm1, %xmm13 MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm4, %xmm2 addps %xmm2, %xmm14 MOVUPS_A2 (-20 * SIZE, A1, LDA, 2, %xmm2) mulps %xmm4, %xmm3 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm3, %xmm15 MOVUPS_A2 (-20 * SIZE, A1, LDA3, 1, %xmm3) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-20 * SIZE, A2, %xmm0) mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm1) mulps %xmm5, %xmm2 addps %xmm2, %xmm10 MOVUPS_A2 (-20 * SIZE, A2, LDA, 2, %xmm2) mulps %xmm5, %xmm3 addps %xmm3, %xmm11 MOVUPS_A2 (-20 * SIZE, A2, LDA3, 1, %xmm3) mulps %xmm5, %xmm0 addps %xmm0, %xmm12 mulps %xmm5, %xmm1 addps %xmm1, %xmm13 mulps %xmm5, %xmm2 addps %xmm2, %xmm14 mulps %xmm5, %xmm3 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm3, %xmm15 addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 ALIGN_4 .L15: testq $8, MM jle .L16 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm2) MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm3) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-32 * SIZE, A2, %xmm0) mulps %xmm4, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm1) mulps %xmm4, %xmm2 addps %xmm2, %xmm10 MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm2) mulps %xmm4, %xmm3 addps %xmm3, %xmm11 MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm3) mulps %xmm4, %xmm0 addps %xmm0, %xmm12 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) mulps %xmm4, %xmm1 addps %xmm1, %xmm13 MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm4, %xmm2 addps %xmm2, %xmm14 MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm2) mulps %xmm4, %xmm3 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm3, %xmm15 MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm3) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A2, %xmm0) mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm1) mulps %xmm5, %xmm2 addps %xmm2, %xmm10 MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm2) mulps %xmm5, %xmm3 addps %xmm3, %xmm11 MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm3) mulps %xmm5, %xmm0 addps %xmm0, %xmm12 mulps %xmm5, %xmm1 addps %xmm1, %xmm13 mulps %xmm5, %xmm2 addps %xmm2, %xmm14 mulps %xmm5, %xmm3 addps %xmm3, %xmm15 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_4 .L16: testq $4, MM jle .L17 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm2) MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm3) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-32 * SIZE, A2, %xmm0) mulps %xmm4, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm1) mulps %xmm4, %xmm2 addps %xmm2, %xmm10 MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm2) mulps %xmm4, %xmm3 addps %xmm3, %xmm11 MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm3) mulps %xmm4, %xmm0 addps %xmm0, %xmm12 mulps %xmm4, %xmm1 addps %xmm1, %xmm13 mulps %xmm4, %xmm2 addps %xmm2, %xmm14 mulps %xmm4, %xmm3 addps %xmm3, %xmm15 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, X1 ALIGN_4 .L17: testq $2, MM jle .L18 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A1, LDA, 1), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps 
%xmm2, %xmm2 #endif movsd -32 * SIZE(A1, LDA, 2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd -32 * SIZE(A1, LDA3, 1), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm11 movsd -32 * SIZE(A2), %xmm0 mulps %xmm4, %xmm0 addps %xmm0, %xmm12 movsd -32 * SIZE(A2, LDA, 1), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm13 movsd -32 * SIZE(A2, LDA, 2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm14 movsd -32 * SIZE(A2, LDA3, 1), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm15 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_4 .L18: testq $1, MM jle .L19 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A1, LDA, 1), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 movss -32 * SIZE(A1, LDA, 2), %xmm2 mulss %xmm4, %xmm2 addss %xmm2, %xmm10 movss -32 * SIZE(A1, LDA3, 1), %xmm3 mulss %xmm4, %xmm3 addss %xmm3, %xmm11 movss -32 * SIZE(A2), %xmm0 mulss %xmm4, %xmm0 addss %xmm0, %xmm12 movss -32 * SIZE(A2, LDA, 1), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm13 movss -32 * SIZE(A2, LDA, 2), %xmm2 mulss %xmm4, %xmm2 addss %xmm2, %xmm14 movss -32 * SIZE(A2, LDA3, 1), %xmm3 mulss %xmm4, %xmm3 addss %xmm3, %xmm15 ALIGN_4 .L19: #ifdef HAVE_SSE3 haddps %xmm9, %xmm8 haddps %xmm11, %xmm10 haddps %xmm10, %xmm8 pshufd $0x1, %xmm8, %xmm9 pshufd $0x2, %xmm8, %xmm10 pshufd $0x3, %xmm8, %xmm11 haddps %xmm13, %xmm12 haddps %xmm15, %xmm14 haddps %xmm14, %xmm12 pshufd $0x1, %xmm12, %xmm13 pshufd $0x2, %xmm12, %xmm14 pshufd $0x3, %xmm12, %xmm15 #else movaps %xmm8, %xmm0 unpcklps %xmm9, %xmm8 unpckhps %xmm9, %xmm0 movaps %xmm10, %xmm1 unpcklps %xmm11, %xmm10 unpckhps %xmm11, %xmm1 movaps %xmm8, %xmm9 unpcklps %xmm10, %xmm8 unpckhps %xmm10, %xmm9 movaps %xmm0, %xmm10 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm10 addps %xmm9, %xmm8 addps %xmm0, %xmm10 addps %xmm10, %xmm8 pshufd $0x2, %xmm8, %xmm9 pshufd $0x1, %xmm8, %xmm10 pshufd $0x3, %xmm8, %xmm11 movaps %xmm12, %xmm0 unpcklps %xmm13, %xmm12 unpckhps %xmm13, %xmm0 movaps %xmm14, %xmm1 unpcklps %xmm15, %xmm14 unpckhps %xmm15, %xmm1 movaps %xmm12, %xmm13 unpcklps %xmm14, %xmm12 unpckhps %xmm14, %xmm13 movaps %xmm0, %xmm14 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm14 addps %xmm13, %xmm12 addps %xmm0, %xmm14 addps %xmm14, %xmm12 pshufd $0x2, %xmm12, %xmm13 pshufd $0x1, %xmm12, %xmm14 pshufd $0x3, %xmm12, %xmm15 #endif mulss ALPHA, %xmm8 mulss ALPHA, %xmm9 mulss ALPHA, %xmm10 mulss ALPHA, %xmm11 mulss ALPHA, %xmm12 mulss ALPHA, %xmm13 mulss ALPHA, %xmm14 mulss ALPHA, %xmm15 addss (Y), %xmm8 addq INCY, Y addss (Y), %xmm9 addq INCY, Y addss (Y), %xmm10 addq INCY, Y addss (Y), %xmm11 addq INCY, Y addss (Y), %xmm12 addq INCY, Y addss (Y), %xmm13 addq INCY, Y addss (Y), %xmm14 addq INCY, Y addss (Y), %xmm15 addq INCY, Y movss %xmm8, (Y1) addq INCY, Y1 movss %xmm9, (Y1) addq INCY, Y1 movss %xmm10, (Y1) addq INCY, Y1 movss %xmm11, (Y1) addq INCY, Y1 movss %xmm12, (Y1) addq INCY, Y1 movss %xmm13, (Y1) addq INCY, Y1 movss %xmm14, (Y1) addq INCY, Y1 movss %xmm15, (Y1) addq INCY, Y1 cmpq $8, N jge .L11 ALIGN_4 .L20: #endif cmpq $4, N jl .L30 #if GEMV_UNROLL == 4 ALIGN_3 .L21: #endif subq $4, N leaq 32 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA, 2), A2 leaq (A1, LDA, 4), A xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifdef ALIGNED_ACCESS cmpq $3, M jle .L27 testq $SIZE, A1 je .L2X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A1, LDA), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 movss -32 * 
SIZE(A2), %xmm2 mulss %xmm4, %xmm2 addss %xmm2, %xmm10 movss -32 * SIZE(A2, LDA), %xmm3 mulss %xmm4, %xmm3 addss %xmm3, %xmm11 addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, X1 ALIGN_3 .L2X: testq $2 * SIZE, A1 je .L2XX #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd -32 * SIZE(A2, LDA), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm11 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_3 .L2XX: #endif MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) #if (GEMV_UNROLL == 4) && defined(PREFETCHW) PREFETCHW 4 * SIZE(Y1) #endif movq MM, I sarq $4, I jle .L25 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) MOVUPS_A1 (-32 * SIZE, A2, %xmm2) MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm3) MOVUPS_A1 (-28 * SIZE, A1, %xmm12) MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm13) MOVUPS_A1 (-28 * SIZE, A2, %xmm14) MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm15) decq I jle .L23 ALIGN_4 .L22: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) mulps %xmm4, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm4, %xmm2 addps %xmm2, %xmm10 MOVUPS_A1 (-24 * SIZE, A2, %xmm2) mulps %xmm4, %xmm3 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm3, %xmm11 MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm3) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) #endif mulps %xmm5, %xmm12 addps %xmm12, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm12) mulps %xmm5, %xmm13 addps %xmm13, %xmm9 MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm13) mulps %xmm5, %xmm14 addps %xmm14, %xmm10 MOVUPS_A1 (-20 * SIZE, A2, %xmm14) mulps %xmm5, %xmm15 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm15, %xmm11 MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm15) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) mulps %xmm4, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-16 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm4, %xmm2 addps %xmm2, %xmm10 MOVUPS_A1 (-16 * SIZE, A2, %xmm2) mulps %xmm4, %xmm3 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm3, %xmm11 MOVUPS_A2 (-16 * SIZE, A2, LDA, 1, %xmm3) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) #endif mulps %xmm5, %xmm12 addps %xmm12, %xmm8 MOVUPS_A1 (-12 * SIZE, A1, %xmm12) mulps %xmm5, %xmm13 addps %xmm13, %xmm9 MOVUPS_A2 (-12 * SIZE, A1, LDA, 1, %xmm13) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) #endif mulps %xmm5, %xmm14 addps %xmm14, %xmm10 MOVUPS_A1 (-12 * SIZE, A2, %xmm14) mulps %xmm5, %xmm15 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm15, %xmm11 MOVUPS_A2 (-12 * SIZE, A2, LDA, 1, %xmm15) addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 decq I jg .L22 ALIGN_4 .L23: mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) mulps %xmm4, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm4, %xmm2 addps %xmm2, %xmm10 MOVUPS_A1 (-24 * SIZE, A2, %xmm2) mulps %xmm4, %xmm3 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm3, %xmm11 MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm3) mulps %xmm5, %xmm12 addps %xmm12, %xmm8 MOVUPS_A1 (-20 * 
SIZE, A1, %xmm12) mulps %xmm5, %xmm13 addps %xmm13, %xmm9 MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm13) mulps %xmm5, %xmm14 addps %xmm14, %xmm10 MOVUPS_A1 (-20 * SIZE, A2, %xmm14) mulps %xmm5, %xmm15 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm15, %xmm11 MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm15) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 mulps %xmm4, %xmm3 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm3, %xmm11 mulps %xmm5, %xmm12 addps %xmm12, %xmm8 mulps %xmm5, %xmm13 addps %xmm13, %xmm9 mulps %xmm5, %xmm14 addps %xmm14, %xmm10 mulps %xmm5, %xmm15 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm15, %xmm11 addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 ALIGN_4 .L25: testq $8, MM jle .L26 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm4, %xmm1 addps %xmm1, %xmm9 MOVUPS_A1 (-32 * SIZE, A2, %xmm2) mulps %xmm4, %xmm2 addps %xmm2, %xmm10 MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm3) mulps %xmm4, %xmm3 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm3, %xmm11 MOVUPS_A1 (-28 * SIZE, A1, %xmm12) mulps %xmm5, %xmm12 addps %xmm12, %xmm8 MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm13) mulps %xmm5, %xmm13 addps %xmm13, %xmm9 MOVUPS_A1 (-28 * SIZE, A2, %xmm14) mulps %xmm5, %xmm14 addps %xmm14, %xmm10 MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm15) mulps %xmm5, %xmm15 addps %xmm15, %xmm11 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_4 .L26: testq $4, MM jle .L27 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm4, %xmm1 addps %xmm1, %xmm9 MOVUPS_A1 (-32 * SIZE, A2, %xmm2) mulps %xmm4, %xmm2 addps %xmm2, %xmm10 MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm3) mulps %xmm4, %xmm3 addps %xmm3, %xmm11 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, X1 ALIGN_4 .L27: testq $2, MM jle .L28 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd -32 * SIZE(A2, LDA), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm11 shufps $0xe, %xmm4, %xmm4 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_4 .L28: testq $1, MM jle .L29 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A1, LDA), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 movss -32 * SIZE(A2), %xmm2 mulss %xmm4, %xmm2 addss %xmm2, %xmm10 movss -32 * SIZE(A2, LDA), %xmm3 mulss %xmm4, %xmm3 addss %xmm3, %xmm11 ALIGN_4 .L29: #ifdef HAVE_SSE3 haddps %xmm9, %xmm8 haddps %xmm11, %xmm10 haddps %xmm10, %xmm8 pshufd $0x1, %xmm8, %xmm9 pshufd $0x2, %xmm8, %xmm10 pshufd $0x3, %xmm8, %xmm11 #else movaps %xmm8, %xmm0 unpcklps %xmm9, %xmm8 unpckhps %xmm9, %xmm0 movaps %xmm10, %xmm1 unpcklps %xmm11, %xmm10 unpckhps %xmm11, %xmm1 movaps %xmm8, %xmm9 unpcklps %xmm10, %xmm8 unpckhps %xmm10, %xmm9 movaps %xmm0, %xmm10 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm10 addps %xmm9, %xmm8 addps %xmm0, %xmm10 addps %xmm10, %xmm8 pshufd $0x2, %xmm8, %xmm9 pshufd $0x1, %xmm8, %xmm10 pshufd $0x3, %xmm8, %xmm11 #endif mulss ALPHA, %xmm8 mulss ALPHA, %xmm9 mulss ALPHA, %xmm10 mulss ALPHA, %xmm11 addss (Y), %xmm8 addq INCY, Y addss 
(Y), %xmm9 addq INCY, Y addss (Y), %xmm10 addq INCY, Y addss (Y), %xmm11 addq INCY, Y movss %xmm8, (Y1) addq INCY, Y1 movss %xmm9, (Y1) addq INCY, Y1 movss %xmm10, (Y1) addq INCY, Y1 movss %xmm11, (Y1) addq INCY, Y1 #if GEMV_UNROLL == 4 cmpq $4, N jge .L21 #endif ALIGN_4 .L30: cmpq $3, N jne .L40 leaq 32 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA, 2), A2 leaq (A1, LDA, 4), A xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 #ifdef ALIGNED_ACCESS cmpq $3, M jle .L37 testq $SIZE, A1 je .L3X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A1, LDA), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 movss -32 * SIZE(A2), %xmm2 mulss %xmm4, %xmm2 addss %xmm2, %xmm10 movss -32 * SIZE(A2, LDA), %xmm3 mulss %xmm4, %xmm3 addss %xmm3, %xmm11 addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, X1 ALIGN_3 .L3X: testq $2 * SIZE, A1 je .L3XX #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd -32 * SIZE(A2, LDA), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm11 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_3 .L3XX: #endif MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) #if (GEMV_UNROLL == 4) && defined(PREFETCHW) PREFETCHW 4 * SIZE(Y1) #endif movq MM, I sarq $4, I jle .L35 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) MOVUPS_A1 (-32 * SIZE, A2, %xmm2) MOVUPS_A1 (-28 * SIZE, A1, %xmm12) MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm13) MOVUPS_A1 (-28 * SIZE, A2, %xmm14) decq I jle .L33 ALIGN_4 .L32: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) mulps %xmm4, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm4, %xmm2 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm2, %xmm10 MOVUPS_A1 (-24 * SIZE, A2, %xmm2) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) #endif mulps %xmm5, %xmm12 addps %xmm12, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm12) mulps %xmm5, %xmm13 addps %xmm13, %xmm9 MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm13) mulps %xmm5, %xmm14 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm14, %xmm10 MOVUPS_A1 (-20 * SIZE, A2, %xmm14) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) mulps %xmm4, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-16 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm4, %xmm2 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm2, %xmm10 MOVUPS_A1 (-16 * SIZE, A2, %xmm2) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(X1) #endif mulps %xmm5, %xmm12 addps %xmm12, %xmm8 MOVUPS_A1 (-12 * SIZE, A1, %xmm12) mulps %xmm5, %xmm13 addps %xmm13, %xmm9 MOVUPS_A2 (-12 * SIZE, A1, LDA, 1, %xmm13) mulps %xmm5, %xmm14 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm14, %xmm10 MOVUPS_A1 (-12 * SIZE, A2, %xmm14) addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 decq I jg .L32 ALIGN_4 .L33: mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) mulps %xmm4, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm4, %xmm2 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps 
%xmm2, %xmm10 MOVUPS_A1 (-24 * SIZE, A2, %xmm2) mulps %xmm5, %xmm12 addps %xmm12, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm12) mulps %xmm5, %xmm13 addps %xmm13, %xmm9 MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm13) mulps %xmm5, %xmm14 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm14, %xmm10 MOVUPS_A1 (-20 * SIZE, A2, %xmm14) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) mulps %xmm5, %xmm12 addps %xmm12, %xmm8 mulps %xmm5, %xmm13 addps %xmm13, %xmm9 mulps %xmm5, %xmm14 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm14, %xmm10 addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 ALIGN_4 .L35: testq $8, MM jle .L36 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm4, %xmm1 addps %xmm1, %xmm9 MOVUPS_A1 (-32 * SIZE, A2, %xmm2) mulps %xmm4, %xmm2 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm2, %xmm10 MOVUPS_A1 (-28 * SIZE, A1, %xmm12) mulps %xmm5, %xmm12 addps %xmm12, %xmm8 MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm13) mulps %xmm5, %xmm13 addps %xmm13, %xmm9 MOVUPS_A1 (-28 * SIZE, A2, %xmm14) mulps %xmm5, %xmm14 addps %xmm14, %xmm10 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_4 .L36: testq $4, MM jle .L37 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm4, %xmm1 addps %xmm1, %xmm9 MOVUPS_A1 (-32 * SIZE, A2, %xmm2) mulps %xmm4, %xmm2 addps %xmm2, %xmm10 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, X1 ALIGN_4 .L37: testq $2, MM jle .L38 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 #endif addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_4 .L38: testq $1, MM jle .L39 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A1, LDA), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 movss -32 * SIZE(A2), %xmm2 mulss %xmm4, %xmm2 addss %xmm2, %xmm10 ALIGN_4 .L39: #ifdef HAVE_SSE3 haddps %xmm9, %xmm8 haddps %xmm11, %xmm10 haddps %xmm10, %xmm8 pshufd $0x1, %xmm8, %xmm9 pshufd $0x2, %xmm8, %xmm10 #else movaps %xmm8, %xmm0 unpcklps %xmm9, %xmm8 unpckhps %xmm9, %xmm0 movaps %xmm10, %xmm1 unpcklps %xmm11, %xmm10 unpckhps %xmm11, %xmm1 movaps %xmm8, %xmm9 unpcklps %xmm10, %xmm8 unpckhps %xmm10, %xmm9 movaps %xmm0, %xmm10 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm10 addps %xmm9, %xmm8 addps %xmm0, %xmm10 addps %xmm10, %xmm8 pshufd $0x2, %xmm8, %xmm9 pshufd $0x1, %xmm8, %xmm10 #endif mulss ALPHA, %xmm8 mulss ALPHA, %xmm9 mulss ALPHA, %xmm10 addss (Y), %xmm8 addq INCY, Y addss (Y), %xmm9 addq INCY, Y addss (Y), %xmm10 addq INCY, Y movss %xmm8, (Y1) addq INCY, Y1 movss %xmm9, (Y1) addq INCY, Y1 movss %xmm10, (Y1) addq INCY, Y1 jmp .L999 ALIGN_4 .L40: cmpq $2, N jne .L50 leaq 32 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA), A2 leaq (A1, LDA, 2), A xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #ifdef ALIGNED_ACCESS cmpq $3, M jle .L47 testq $SIZE, A1 je .L4X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A2), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 addq $1 * 
SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, X1 ALIGN_3 .L4X: testq $2 * SIZE, A1 je .L4XX #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A2), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_3 .L4XX: #endif MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) movq MM, I sarq $4, I jle .L45 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A1 (-32 * SIZE, A2, %xmm1) MOVUPS_A1 (-28 * SIZE, A1, %xmm12) MOVUPS_A1 (-28 * SIZE, A2, %xmm13) decq I jle .L43 ALIGN_4 .L42: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) mulps %xmm4, %xmm1 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm1, %xmm9 MOVUPS_A1 (-24 * SIZE, A2, %xmm1) mulps %xmm5, %xmm12 addps %xmm12, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm12) mulps %xmm5, %xmm13 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm13, %xmm9 MOVUPS_A1 (-20 * SIZE, A2, %xmm13) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) mulps %xmm4, %xmm1 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm1, %xmm9 MOVUPS_A1 (-16 * SIZE, A2, %xmm1) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) #endif mulps %xmm5, %xmm12 addps %xmm12, %xmm8 MOVUPS_A1 (-12 * SIZE, A1, %xmm12) mulps %xmm5, %xmm13 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm13, %xmm9 MOVUPS_A1 (-12 * SIZE, A2, %xmm13) addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 decq I jg .L42 ALIGN_4 .L43: mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) mulps %xmm4, %xmm1 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm1, %xmm9 MOVUPS_A1 (-24 * SIZE, A2, %xmm1) mulps %xmm5, %xmm12 addps %xmm12, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm12) mulps %xmm5, %xmm13 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm13, %xmm9 MOVUPS_A1 (-20 * SIZE, A2, %xmm13) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 mulps %xmm4, %xmm1 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm1, %xmm9 mulps %xmm5, %xmm12 addps %xmm12, %xmm8 mulps %xmm5, %xmm13 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm13, %xmm9 addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 ALIGN_4 .L45: testq $8, MM jle .L46 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-32 * SIZE, A2, %xmm1) mulps %xmm4, %xmm1 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm1, %xmm9 MOVUPS_A1 (-28 * SIZE, A1, %xmm12) mulps %xmm5, %xmm12 addps %xmm12, %xmm8 MOVUPS_A1 (-28 * SIZE, A2, %xmm13) mulps %xmm5, %xmm13 addps %xmm13, %xmm9 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_4 .L46: testq $4, MM jle .L47 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-32 * SIZE, A2, %xmm1) mulps %xmm4, %xmm1 addps %xmm1, %xmm9 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, X1 ALIGN_4 .L47: testq $2, MM jle .L48 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A2), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 shufps $0xe, %xmm4, %xmm4 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_4 .L48: testq $1, MM jle .L49 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss 
%xmm0, %xmm8 movss -32 * SIZE(A2), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 ALIGN_4 .L49: #ifdef HAVE_SSE3 haddps %xmm9, %xmm8 haddps %xmm8, %xmm8 #else movaps %xmm8, %xmm10 unpcklps %xmm9, %xmm8 unpckhps %xmm9, %xmm10 addps %xmm10, %xmm8 movhlps %xmm8, %xmm9 addps %xmm9, %xmm8 #endif pshufd $0x1, %xmm8, %xmm9 mulss ALPHA, %xmm8 mulss ALPHA, %xmm9 addss (Y), %xmm8 addq INCY, Y addss (Y), %xmm9 addq INCY, Y movss %xmm8, (Y1) addq INCY, Y1 movss %xmm9, (Y1) addq INCY, Y1 jmp .L999 ALIGN_4 .L50: cmpq $1, N jne .L999 leaq 32 * SIZE(BUFFER), X1 movq A, A1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #ifdef ALIGNED_ACCESS cmpq $3, M jle .L57 testq $SIZE, A1 je .L5X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 addq $1 * SIZE, A1 addq $1 * SIZE, X1 ALIGN_3 .L5X: testq $2 * SIZE, A1 je .L5XX #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 shufps $0xe, %xmm4, %xmm4 addq $2 * SIZE, A1 addq $2 * SIZE, X1 ALIGN_3 .L5XX: #endif MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) movq MM, I sarq $4, I jle .L55 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A1 (-28 * SIZE, A1, %xmm12) decq I jle .L53 ALIGN_4 .L52: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) #endif mulps %xmm4, %xmm0 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) mulps %xmm5, %xmm12 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm12, %xmm9 MOVUPS_A1 (-20 * SIZE, A1, %xmm12) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1) #endif mulps %xmm4, %xmm0 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm0, %xmm8 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) mulps %xmm5, %xmm12 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm12, %xmm9 MOVUPS_A1 (-12 * SIZE, A1, %xmm12) addq $16 * SIZE, A1 addq $16 * SIZE, X1 decq I jg .L52 ALIGN_4 .L53: mulps %xmm4, %xmm0 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) mulps %xmm5, %xmm12 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm12, %xmm9 MOVUPS_A1 (-20 * SIZE, A1, %xmm12) mulps %xmm4, %xmm0 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm0, %xmm8 mulps %xmm5, %xmm12 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm12, %xmm9 addq $16 * SIZE, A1 addq $16 * SIZE, X1 ALIGN_4 .L55: testq $8, MM jle .L56 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) mulps %xmm4, %xmm0 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm12) mulps %xmm5, %xmm12 addps %xmm12, %xmm9 addq $8 * SIZE, A1 addq $8 * SIZE, X1 ALIGN_4 .L56: testq $4, MM jle .L57 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 addq $4 * SIZE, A1 addq $4 * SIZE, X1 ALIGN_4 .L57: testq $2, MM jle .L58 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 shufps $0xe, %xmm4, %xmm4 addq $2 * SIZE, A1 addq $2 * SIZE, X1 ALIGN_4 .L58: testq $1, MM jle .L59 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 ALIGN_4 .L59: addps %xmm9, %xmm8 #ifdef HAVE_SSE3 haddps %xmm8, %xmm8 haddps %xmm8, %xmm8 #else pshufd $1, %xmm8, %xmm9 pshufd $2, %xmm8, %xmm10 pshufd $3, %xmm8, %xmm11 addss %xmm9, %xmm8 addss %xmm11, %xmm10 addss %xmm10, %xmm8 #endif mulss ALPHA, %xmm8 addss (Y), %xmm8 movss %xmm8, (Y1) #ifdef ALIGNED_ACCESS jmp .L999 ALIGN_4 .L100: testq $2 * SIZE - 1, LDA jne .L200 cmpq $4, N jl .L110 ALIGN_3 .L101: subq $4, N 
leaq 32 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA, 2), A2 leaq (A1, LDA, 4), A xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 cmpq $3, M jle .L107 testq $SIZE, A1 je .L10X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A1, LDA), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 movss -32 * SIZE(A2), %xmm2 mulss %xmm4, %xmm2 addss %xmm2, %xmm10 movss -32 * SIZE(A2, LDA), %xmm3 mulss %xmm4, %xmm3 addss %xmm3, %xmm11 addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, X1 ALIGN_3 .L10X: testq $2 * SIZE, A1 je .L10XX #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd -32 * SIZE(A2, LDA), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm11 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_3 .L10XX: MOVUPS_A2 (-34 * SIZE, A1, LDA, 1, %xmm12) MOVUPS_A2 (-34 * SIZE, A2, LDA, 1, %xmm13) MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) #ifdef PREFETCHW PREFETCHW 4 * SIZE(Y1) #endif movq MM, I sarq $4, I jle .L105 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1) MOVUPS_A1 (-32 * SIZE, A2, %xmm2) MOVUPS_A2 (-30 * SIZE, A2, LDA, 1, %xmm3) decq I jle .L103 ALIGN_4 .L102: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) shufps $0x4e, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12) mulps %xmm4, %xmm2 addps %xmm2, %xmm10 MOVUPS_A1 (-28 * SIZE, A2, %xmm2) shufps $0x4e, %xmm3, %xmm13 mulps %xmm4, %xmm13 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm13, %xmm11 MOVUPS_A2 (-26 * SIZE, A2, LDA, 1, %xmm13) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) #endif mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) shufps $0x4e, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-22 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm5, %xmm2 addps %xmm2, %xmm10 MOVUPS_A1 (-24 * SIZE, A2, %xmm2) shufps $0x4e, %xmm13, %xmm3 mulps %xmm5, %xmm3 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm3, %xmm11 MOVUPS_A2 (-22 * SIZE, A2, LDA, 1, %xmm3) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) shufps $0x4e, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-18 * SIZE, A1, LDA, 1, %xmm12) mulps %xmm4, %xmm2 addps %xmm2, %xmm10 MOVUPS_A1 (-20 * SIZE, A2, %xmm2) shufps $0x4e, %xmm3, %xmm13 mulps %xmm4, %xmm13 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm13, %xmm11 MOVUPS_A2 (-18 * SIZE, A2, LDA, 1, %xmm13) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) #endif mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) shufps $0x4e, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-14 * SIZE, A1, LDA, 1, %xmm1) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) #endif mulps %xmm5, %xmm2 addps %xmm2, %xmm10 MOVUPS_A1 (-16 * SIZE, A2, %xmm2) shufps $0x4e, %xmm13, %xmm3 mulps %xmm5, %xmm3 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm3, %xmm11 MOVUPS_A2 (-14 * SIZE, A2, LDA, 1, 
%xmm3) addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 decq I jg .L102 ALIGN_4 .L103: mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) shufps $0x4e, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12) mulps %xmm4, %xmm2 addps %xmm2, %xmm10 MOVUPS_A1 (-28 * SIZE, A2, %xmm2) shufps $0x4e, %xmm3, %xmm13 mulps %xmm4, %xmm13 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm13, %xmm11 MOVUPS_A2 (-26 * SIZE, A2, LDA, 1, %xmm13) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) shufps $0x4e, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-22 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm5, %xmm2 addps %xmm2, %xmm10 MOVUPS_A1 (-24 * SIZE, A2, %xmm2) shufps $0x4e, %xmm13, %xmm3 mulps %xmm5, %xmm3 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm3, %xmm11 MOVUPS_A2 (-22 * SIZE, A2, LDA, 1, %xmm3) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) shufps $0x4e, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-18 * SIZE, A1, LDA, 1, %xmm12) mulps %xmm4, %xmm2 addps %xmm2, %xmm10 MOVUPS_A1 (-20 * SIZE, A2, %xmm2) shufps $0x4e, %xmm3, %xmm13 mulps %xmm4, %xmm13 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm13, %xmm11 MOVUPS_A2 (-18 * SIZE, A2, LDA, 1, %xmm13) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 shufps $0x4e, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 mulps %xmm5, %xmm2 addps %xmm2, %xmm10 shufps $0x4e, %xmm13, %xmm3 mulps %xmm5, %xmm3 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm3, %xmm11 addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 ALIGN_4 .L105: testq $8, MM jle .L106 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1) MOVUPS_A1 (-32 * SIZE, A2, %xmm2) MOVUPS_A2 (-30 * SIZE, A2, LDA, 1, %xmm3) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) shufps $0x4e, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12) mulps %xmm4, %xmm2 addps %xmm2, %xmm10 MOVUPS_A1 (-28 * SIZE, A2, %xmm2) shufps $0x4e, %xmm3, %xmm13 mulps %xmm4, %xmm13 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm13, %xmm11 MOVUPS_A2 (-26 * SIZE, A2, LDA, 1, %xmm13) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 shufps $0x4e, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 mulps %xmm5, %xmm2 addps %xmm2, %xmm10 shufps $0x4e, %xmm13, %xmm3 mulps %xmm5, %xmm3 addps %xmm3, %xmm11 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_4 .L106: testq $4, MM jle .L107 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1) shufps $0x4e, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A1 (-32 * SIZE, A2, %xmm2) mulps %xmm4, %xmm2 addps %xmm2, %xmm10 MOVUPS_A2 (-30 * SIZE, A2, LDA, 1, %xmm3) shufps $0x4e, %xmm3, %xmm13 mulps %xmm4, %xmm13 addps %xmm13, %xmm11 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, X1 ALIGN_4 .L107: testq $2, MM jle .L108 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd -32 * SIZE(A2, LDA), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm11 shufps $0xe, %xmm4, %xmm4 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_4 
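/* .L108: final element when M is odd - scalar multiply-add of the last X value into each of the four column accumulators */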
.L108: testq $1, MM jle .L109 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A1, LDA), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 movss -32 * SIZE(A2), %xmm2 mulss %xmm4, %xmm2 addss %xmm2, %xmm10 movss -32 * SIZE(A2, LDA), %xmm3 mulss %xmm4, %xmm3 addss %xmm3, %xmm11 ALIGN_4 .L109: #ifdef HAVE_SSE3 haddps %xmm9, %xmm8 haddps %xmm11, %xmm10 haddps %xmm10, %xmm8 pshufd $0x1, %xmm8, %xmm9 pshufd $0x2, %xmm8, %xmm10 pshufd $0x3, %xmm8, %xmm11 #else movaps %xmm8, %xmm0 unpcklps %xmm9, %xmm8 unpckhps %xmm9, %xmm0 movaps %xmm10, %xmm1 unpcklps %xmm11, %xmm10 unpckhps %xmm11, %xmm1 movaps %xmm8, %xmm9 unpcklps %xmm10, %xmm8 unpckhps %xmm10, %xmm9 movaps %xmm0, %xmm10 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm10 addps %xmm9, %xmm8 addps %xmm0, %xmm10 addps %xmm10, %xmm8 pshufd $0x2, %xmm8, %xmm9 pshufd $0x1, %xmm8, %xmm10 pshufd $0x3, %xmm8, %xmm11 #endif mulss ALPHA, %xmm8 mulss ALPHA, %xmm9 mulss ALPHA, %xmm10 mulss ALPHA, %xmm11 addss (Y), %xmm8 addq INCY, Y addss (Y), %xmm9 addq INCY, Y addss (Y), %xmm10 addq INCY, Y addss (Y), %xmm11 addq INCY, Y movss %xmm8, (Y1) addq INCY, Y1 movss %xmm9, (Y1) addq INCY, Y1 movss %xmm10, (Y1) addq INCY, Y1 movss %xmm11, (Y1) addq INCY, Y1 cmpq $4, N jge .L101 ALIGN_4 .L110: cmpq $3, N jne .L120 leaq 32 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA, 2), A2 leaq (A1, LDA, 4), A xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 cmpq $3, M jle .L117 testq $SIZE, A1 je .L11X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A1, LDA), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 movss -32 * SIZE(A2), %xmm2 mulss %xmm4, %xmm2 addss %xmm2, %xmm10 addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, X1 ALIGN_3 .L11X: testq $2 * SIZE, A1 je .L11XX #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_3 .L11XX: MOVUPS_A2 (-34 * SIZE, A1, LDA, 1, %xmm12) MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) movq MM, I sarq $4, I jle .L115 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1) MOVUPS_A1 (-32 * SIZE, A2, %xmm2) decq I jle .L113 ALIGN_4 .L112: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) shufps $0x4e, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12) mulps %xmm4, %xmm2 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm2, %xmm10 MOVUPS_A1 (-28 * SIZE, A2, %xmm2) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) #endif mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) shufps $0x4e, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-22 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm5, %xmm2 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm2, %xmm10 MOVUPS_A1 (-24 * SIZE, A2, %xmm2) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) shufps $0x4e, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-18 * SIZE, A1, LDA, 1, %xmm12) mulps 
%xmm4, %xmm2 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm2, %xmm10 MOVUPS_A1 (-20 * SIZE, A2, %xmm2) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(X1) #endif mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) shufps $0x4e, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-14 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm5, %xmm2 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm2, %xmm10 MOVUPS_A1 (-16 * SIZE, A2, %xmm2) addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 decq I jg .L112 ALIGN_4 .L113: mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) shufps $0x4e, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12) mulps %xmm4, %xmm2 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm2, %xmm10 MOVUPS_A1 (-28 * SIZE, A2, %xmm2) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) shufps $0x4e, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-22 * SIZE, A1, LDA, 1, %xmm1) mulps %xmm5, %xmm2 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm2, %xmm10 MOVUPS_A1 (-24 * SIZE, A2, %xmm2) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) shufps $0x4e, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-18 * SIZE, A1, LDA, 1, %xmm12) mulps %xmm4, %xmm2 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm2, %xmm10 MOVUPS_A1 (-20 * SIZE, A2, %xmm2) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 shufps $0x4e, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 mulps %xmm5, %xmm2 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm2, %xmm10 addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 ALIGN_4 .L115: testq $8, MM jle .L116 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1) MOVUPS_A1 (-32 * SIZE, A2, %xmm2) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) shufps $0x4e, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12) mulps %xmm4, %xmm2 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm2, %xmm10 MOVUPS_A1 (-28 * SIZE, A2, %xmm2) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 shufps $0x4e, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 mulps %xmm5, %xmm2 addps %xmm2, %xmm10 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_4 .L116: testq $4, MM jle .L117 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1) shufps $0x4e, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A1 (-32 * SIZE, A2, %xmm2) mulps %xmm4, %xmm2 addps %xmm2, %xmm10 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, X1 ALIGN_4 .L117: testq $2, MM jle .L118 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_4 .L118: testq $1, MM jle .L119 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A1, LDA), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 movss -32 * SIZE(A2), %xmm2 mulss %xmm4, %xmm2 addss %xmm2, %xmm10 ALIGN_4 .L119: #ifdef HAVE_SSE3 haddps %xmm9, %xmm8 haddps %xmm11, %xmm10 haddps %xmm10, %xmm8 pshufd $0x1, %xmm8, %xmm9 pshufd $0x2, %xmm8, %xmm10 
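/* without SSE3, reduce the partial sums by unpack/add instead of haddps */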
#else movaps %xmm8, %xmm0 unpcklps %xmm9, %xmm8 unpckhps %xmm9, %xmm0 movaps %xmm10, %xmm1 unpcklps %xmm11, %xmm10 unpckhps %xmm11, %xmm1 movaps %xmm8, %xmm9 unpcklps %xmm10, %xmm8 unpckhps %xmm10, %xmm9 movaps %xmm0, %xmm10 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm10 addps %xmm9, %xmm8 addps %xmm0, %xmm10 addps %xmm10, %xmm8 pshufd $0x2, %xmm8, %xmm9 pshufd $0x1, %xmm8, %xmm10 #endif mulss ALPHA, %xmm8 mulss ALPHA, %xmm9 mulss ALPHA, %xmm10 addss (Y), %xmm8 addq INCY, Y addss (Y), %xmm9 addq INCY, Y addss (Y), %xmm10 movss %xmm8, (Y1) addq INCY, Y1 movss %xmm9, (Y1) addq INCY, Y1 movss %xmm10, (Y1) jmp .L999 ALIGN_4 .L120: cmpq $2, N jne .L130 leaq 32 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA), A2 leaq (A1, LDA, 2), A xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 cmpq $3, M jle .L127 testq $SIZE, A1 je .L12X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A2), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, X1 ALIGN_3 .L12X: testq $2 * SIZE, A1 je .L12XX #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A2), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_3 .L12XX: MOVUPS_A1 (-34 * SIZE, A2, %xmm12) MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) movq MM, I sarq $4, I jle .L125 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A1 (-30 * SIZE, A2, %xmm1) decq I jle .L123 ALIGN_4 .L122: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) shufps $0x4e, %xmm1, %xmm12 mulps %xmm4, %xmm12 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm12, %xmm9 MOVUPS_A1 (-26 * SIZE, A2, %xmm12) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) shufps $0x4e, %xmm12, %xmm1 mulps %xmm5, %xmm1 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm1, %xmm9 MOVUPS_A1 (-22 * SIZE, A2, %xmm1) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) shufps $0x4e, %xmm1, %xmm12 mulps %xmm4, %xmm12 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm12, %xmm9 MOVUPS_A1 (-18 * SIZE, A2, %xmm12) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) #endif mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) shufps $0x4e, %xmm12, %xmm1 mulps %xmm5, %xmm1 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm1, %xmm9 MOVUPS_A1 (-14 * SIZE, A2, %xmm1) addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 decq I jg .L122 ALIGN_4 .L123: mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) shufps $0x4e, %xmm1, %xmm12 mulps %xmm4, %xmm12 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm12, %xmm9 MOVUPS_A1 (-26 * SIZE, A2, %xmm12) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) shufps $0x4e, %xmm12, %xmm1 mulps %xmm5, %xmm1 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm1, %xmm9 MOVUPS_A1 (-22 * SIZE, A2, %xmm1) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) shufps $0x4e, %xmm1, %xmm12 mulps %xmm4, %xmm12 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm12, %xmm9 MOVUPS_A1 (-18 * SIZE, A2, %xmm12) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 shufps $0x4e, %xmm12, %xmm1 mulps %xmm5, %xmm1 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm1, 
%xmm9 addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 ALIGN_4 .L125: testq $8, MM jle .L126 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A1 (-30 * SIZE, A2, %xmm1) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) shufps $0x4e, %xmm1, %xmm12 mulps %xmm4, %xmm12 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm12, %xmm9 MOVUPS_A1 (-26 * SIZE, A2, %xmm12) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 shufps $0x4e, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_4 .L126: testq $4, MM jle .L127 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-30 * SIZE, A2, %xmm1) shufps $0x4e, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, X1 ALIGN_4 .L127: testq $2, MM jle .L128 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A2), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 shufps $0xe, %xmm4, %xmm4 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_4 .L128: testq $1, MM jle .L129 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A2), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 ALIGN_4 .L129: #ifdef HAVE_SSE3 haddps %xmm9, %xmm8 haddps %xmm8, %xmm8 #else movaps %xmm8, %xmm10 unpcklps %xmm9, %xmm8 unpckhps %xmm9, %xmm10 addps %xmm10, %xmm8 movhlps %xmm8, %xmm9 addps %xmm9, %xmm8 #endif pshufd $0x1, %xmm8, %xmm9 mulss ALPHA, %xmm8 mulss ALPHA, %xmm9 addss (Y), %xmm8 addq INCY, Y addss (Y), %xmm9 addq INCY, Y movss %xmm8, (Y1) addq INCY, Y1 movss %xmm9, (Y1) addq INCY, Y1 jmp .L999 ALIGN_4 .L130: cmpq $1, N jne .L999 leaq 32 * SIZE(BUFFER), X1 movq A, A1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 cmpq $3, M jle .L137 testq $SIZE, A1 je .L13X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 addq $1 * SIZE, A1 addq $1 * SIZE, X1 ALIGN_3 .L13X: testq $2 * SIZE, A1 je .L13XX #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 shufps $0xe, %xmm4, %xmm4 addq $2 * SIZE, A1 addq $2 * SIZE, X1 ALIGN_3 .L13XX: MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) movq MM, I sarq $4, I jle .L135 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A1 (-28 * SIZE, A1, %xmm12) decq I jle .L133 ALIGN_4 .L132: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) #endif mulps %xmm4, %xmm0 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) mulps %xmm5, %xmm12 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm12, %xmm9 MOVUPS_A1 (-20 * SIZE, A1, %xmm12) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1) #endif mulps %xmm4, %xmm0 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm0, %xmm8 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) mulps %xmm5, %xmm12 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm12, %xmm9 MOVUPS_A1 (-12 * SIZE, A1, %xmm12) addq $16 * SIZE, A1 addq $16 * SIZE, X1 decq I jg .L132 ALIGN_4 .L133: mulps %xmm4, %xmm0 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) mulps %xmm5, %xmm12 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm12, %xmm9 MOVUPS_A1 (-20 * SIZE, A1, %xmm12) mulps %xmm4, %xmm0 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm0, %xmm8 mulps %xmm5, %xmm12 
MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm12, %xmm9 addq $16 * SIZE, A1 addq $16 * SIZE, X1 ALIGN_4 .L135: testq $8, MM jle .L136 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) mulps %xmm4, %xmm0 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm12) mulps %xmm5, %xmm12 addps %xmm12, %xmm9 addq $8 * SIZE, A1 addq $8 * SIZE, X1 ALIGN_4 .L136: testq $4, MM jle .L137 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 addq $4 * SIZE, A1 addq $4 * SIZE, X1 ALIGN_4 .L137: testq $2, MM jle .L138 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 shufps $0xe, %xmm4, %xmm4 addq $2 * SIZE, A1 addq $2 * SIZE, X1 ALIGN_4 .L138: testq $1, MM jle .L139 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 ALIGN_4 .L139: addps %xmm9, %xmm8 #ifdef HAVE_SSE3 haddps %xmm8, %xmm8 haddps %xmm8, %xmm8 #else pshufd $1, %xmm8, %xmm9 pshufd $2, %xmm8, %xmm10 pshufd $3, %xmm8, %xmm11 addss %xmm9, %xmm8 addss %xmm11, %xmm10 addss %xmm10, %xmm8 #endif mulss ALPHA, %xmm8 addss (Y), %xmm8 movss %xmm8, (Y1) jmp .L999 ALIGN_4 .L200: testq $2 * SIZE, LDA jne .L300 cmpq $4, N jl .L210 ALIGN_3 .L201: subq $4, N leaq 32 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA, 2), A2 leaq (A1, LDA, 4), A xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 cmpq $3, M jle .L207 testq $SIZE, A1 je .L20X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A1, LDA), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 movss -32 * SIZE(A2), %xmm2 mulss %xmm4, %xmm2 addss %xmm2, %xmm10 movss -32 * SIZE(A2, LDA), %xmm3 mulss %xmm4, %xmm3 addss %xmm3, %xmm11 addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, X1 ALIGN_3 .L20X: testq $2 * SIZE, A1 je .L20XX #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd -32 * SIZE(A2, LDA), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm11 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_3 .L20XX: movaps -33 * SIZE(A1, LDA), %xmm12 movaps -34 * SIZE(A2), %xmm13 movaps -35 * SIZE(A2, LDA), %xmm14 MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) #ifdef PREFETCHW PREFETCHW 4 * SIZE(Y1) #endif movq MM, I sarq $4, I jle .L205 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) MOVUPS_A1 (-30 * SIZE, A2, %xmm2) MOVUPS_A2 (-31 * SIZE, A2, LDA, 1, %xmm3) decq I jle .L203 ALIGN_4 .L202: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x39, %xmm12, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 addps %xmm13, %xmm10 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) movss %xmm3, %xmm14 shufps $0x93, %xmm3, %xmm14 mulps %xmm4, %xmm14 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm14, %xmm11 MOVUPS_A2 (-27 * SIZE, A2, LDA, 1, %xmm14) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) #endif mulps %xmm5, %xmm0 addps 
%xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) movss %xmm12, %xmm1 shufps $0x39, %xmm1, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-21 * SIZE, A1, LDA, 1, %xmm1) shufps $0x4e, %xmm13, %xmm2 mulps %xmm5, %xmm2 addps %xmm2, %xmm10 MOVUPS_A1 (-22 * SIZE, A2, %xmm2) movss %xmm14, %xmm3 shufps $0x93, %xmm14, %xmm3 mulps %xmm5, %xmm3 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm3, %xmm11 MOVUPS_A2 (-23 * SIZE, A2, LDA, 1, %xmm3) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x39, %xmm12, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-17 * SIZE, A1, LDA, 1, %xmm12) shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 addps %xmm13, %xmm10 MOVUPS_A1 (-18 * SIZE, A2, %xmm13) movss %xmm3, %xmm14 shufps $0x93, %xmm3, %xmm14 mulps %xmm4, %xmm14 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm14, %xmm11 MOVUPS_A2 (-19 * SIZE, A2, LDA, 1, %xmm14) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) #endif mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) movss %xmm12, %xmm1 shufps $0x39, %xmm1, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-13 * SIZE, A1, LDA, 1, %xmm1) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) #endif shufps $0x4e, %xmm13, %xmm2 mulps %xmm5, %xmm2 addps %xmm2, %xmm10 MOVUPS_A1 (-14 * SIZE, A2, %xmm2) movss %xmm14, %xmm3 shufps $0x93, %xmm14, %xmm3 mulps %xmm5, %xmm3 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm3, %xmm11 MOVUPS_A2 (-15 * SIZE, A2, LDA, 1, %xmm3) addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 decq I jg .L202 ALIGN_4 .L203: mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x39, %xmm12, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 addps %xmm13, %xmm10 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) movss %xmm3, %xmm14 shufps $0x93, %xmm3, %xmm14 mulps %xmm4, %xmm14 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm14, %xmm11 MOVUPS_A2 (-27 * SIZE, A2, LDA, 1, %xmm14) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) movss %xmm12, %xmm1 shufps $0x39, %xmm1, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-21 * SIZE, A1, LDA, 1, %xmm1) shufps $0x4e, %xmm13, %xmm2 mulps %xmm5, %xmm2 addps %xmm2, %xmm10 MOVUPS_A1 (-22 * SIZE, A2, %xmm2) movss %xmm14, %xmm3 shufps $0x93, %xmm14, %xmm3 mulps %xmm5, %xmm3 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm3, %xmm11 MOVUPS_A2 (-23 * SIZE, A2, LDA, 1, %xmm3) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x39, %xmm12, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-17 * SIZE, A1, LDA, 1, %xmm12) shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 addps %xmm13, %xmm10 MOVUPS_A1 (-18 * SIZE, A2, %xmm13) movss %xmm3, %xmm14 shufps $0x93, %xmm3, %xmm14 mulps %xmm4, %xmm14 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm14, %xmm11 MOVUPS_A2 (-19 * SIZE, A2, LDA, 1, %xmm14) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 movss %xmm12, %xmm1 shufps $0x39, %xmm1, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 shufps $0x4e, %xmm13, %xmm2 mulps %xmm5, %xmm2 addps %xmm2, %xmm10 movss %xmm14, %xmm3 shufps $0x93, %xmm14, %xmm3 mulps %xmm5, %xmm3 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm3, %xmm11 addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 ALIGN_4 .L205: testq $8, MM jle .L206 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) 
MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) MOVUPS_A1 (-30 * SIZE, A2, %xmm2) MOVUPS_A2 (-31 * SIZE, A2, LDA, 1, %xmm3) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x39, %xmm12, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 addps %xmm13, %xmm10 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) movss %xmm3, %xmm14 shufps $0x93, %xmm3, %xmm14 mulps %xmm4, %xmm14 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm14, %xmm11 MOVUPS_A2 (-27 * SIZE, A2, LDA, 1, %xmm14) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 movss %xmm12, %xmm1 shufps $0x39, %xmm1, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 shufps $0x4e, %xmm13, %xmm2 mulps %xmm5, %xmm2 addps %xmm2, %xmm10 movss %xmm14, %xmm3 shufps $0x93, %xmm14, %xmm3 mulps %xmm5, %xmm3 addps %xmm3, %xmm11 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_4 .L206: testq $4, MM jle .L207 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) MOVUPS_A1 (-30 * SIZE, A2, %xmm2) MOVUPS_A2 (-31 * SIZE, A2, LDA, 1, %xmm3) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 movss %xmm1, %xmm12 shufps $0x39, %xmm12, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 addps %xmm13, %xmm10 movss %xmm3, %xmm14 shufps $0x93, %xmm3, %xmm14 mulps %xmm4, %xmm14 addps %xmm14, %xmm11 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, X1 ALIGN_4 .L207: testq $2, MM jle .L208 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd -32 * SIZE(A2, LDA), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm11 shufps $0xe, %xmm4, %xmm4 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_4 .L208: testq $1, MM jle .L209 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A1, LDA), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 movss -32 * SIZE(A2), %xmm2 mulss %xmm4, %xmm2 addss %xmm2, %xmm10 movss -32 * SIZE(A2, LDA), %xmm3 mulss %xmm4, %xmm3 addss %xmm3, %xmm11 ALIGN_4 .L209: #ifdef HAVE_SSE3 haddps %xmm9, %xmm8 haddps %xmm11, %xmm10 haddps %xmm10, %xmm8 pshufd $0x1, %xmm8, %xmm9 pshufd $0x2, %xmm8, %xmm10 pshufd $0x3, %xmm8, %xmm11 #else movaps %xmm8, %xmm0 unpcklps %xmm9, %xmm8 unpckhps %xmm9, %xmm0 movaps %xmm10, %xmm1 unpcklps %xmm11, %xmm10 unpckhps %xmm11, %xmm1 movaps %xmm8, %xmm9 unpcklps %xmm10, %xmm8 unpckhps %xmm10, %xmm9 movaps %xmm0, %xmm10 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm10 addps %xmm9, %xmm8 addps %xmm0, %xmm10 addps %xmm10, %xmm8 pshufd $0x2, %xmm8, %xmm9 pshufd $0x1, %xmm8, %xmm10 pshufd $0x3, %xmm8, %xmm11 #endif mulss ALPHA, %xmm8 mulss ALPHA, %xmm9 mulss ALPHA, %xmm10 mulss ALPHA, %xmm11 addss (Y), %xmm8 addq INCY, Y addss (Y), %xmm9 addq INCY, Y addss (Y), %xmm10 addq INCY, Y addss (Y), %xmm11 addq INCY, Y movss %xmm8, (Y1) addq INCY, Y1 movss %xmm9, (Y1) addq INCY, Y1 movss %xmm10, (Y1) addq INCY, Y1 movss %xmm11, (Y1) addq INCY, Y1 cmpq $4, N jge .L201 ALIGN_4 .L210: cmpq $3, N jne .L220 leaq 32 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA, 2), A2 leaq (A1, LDA, 4), A xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 cmpq $3, M jle 
.L217 testq $SIZE, A1 je .L21X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A1, LDA), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 movss -32 * SIZE(A2), %xmm2 mulss %xmm4, %xmm2 addss %xmm2, %xmm10 addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, X1 ALIGN_3 .L21X: testq $2 * SIZE, A1 je .L21XX #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_3 .L21XX: movaps -33 * SIZE(A1, LDA), %xmm12 movaps -34 * SIZE(A2), %xmm13 MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) #ifdef PREFETCHW PREFETCHW 4 * SIZE(Y1) #endif movq MM, I sarq $4, I jle .L215 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) MOVUPS_A1 (-30 * SIZE, A2, %xmm2) decq I jle .L213 ALIGN_4 .L212: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x39, %xmm12, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 addps %xmm13, %xmm10 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) #endif mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) movss %xmm12, %xmm1 shufps $0x39, %xmm1, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-21 * SIZE, A1, LDA, 1, %xmm1) shufps $0x4e, %xmm13, %xmm2 mulps %xmm5, %xmm2 addps %xmm2, %xmm10 MOVUPS_A1 (-22 * SIZE, A2, %xmm2) MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x39, %xmm12, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-17 * SIZE, A1, LDA, 1, %xmm12) shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm13, %xmm10 MOVUPS_A1 (-18 * SIZE, A2, %xmm13) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(X1) #endif mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) movss %xmm12, %xmm1 shufps $0x39, %xmm1, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-13 * SIZE, A1, LDA, 1, %xmm1) shufps $0x4e, %xmm13, %xmm2 mulps %xmm5, %xmm2 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm2, %xmm10 MOVUPS_A1 (-14 * SIZE, A2, %xmm2) addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 decq I jg .L212 ALIGN_4 .L213: mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x39, %xmm12, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 addps %xmm13, %xmm10 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) movss %xmm3, %xmm14 shufps $0x93, %xmm3, %xmm14 mulps %xmm4, %xmm14 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm14, %xmm11 MOVUPS_A2 (-27 * SIZE, A2, LDA, 1, %xmm14) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) movss %xmm12, %xmm1 shufps $0x39, %xmm1, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-21 
* SIZE, A1, LDA, 1, %xmm1) shufps $0x4e, %xmm13, %xmm2 mulps %xmm5, %xmm2 addps %xmm2, %xmm10 MOVUPS_A1 (-22 * SIZE, A2, %xmm2) movss %xmm14, %xmm3 shufps $0x93, %xmm14, %xmm3 mulps %xmm5, %xmm3 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm3, %xmm11 MOVUPS_A2 (-23 * SIZE, A2, LDA, 1, %xmm3) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x39, %xmm12, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-17 * SIZE, A1, LDA, 1, %xmm12) shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 addps %xmm13, %xmm10 MOVUPS_A1 (-18 * SIZE, A2, %xmm13) movss %xmm3, %xmm14 shufps $0x93, %xmm3, %xmm14 mulps %xmm4, %xmm14 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm14, %xmm11 MOVUPS_A2 (-19 * SIZE, A2, LDA, 1, %xmm14) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 movss %xmm12, %xmm1 shufps $0x39, %xmm1, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 shufps $0x4e, %xmm13, %xmm2 mulps %xmm5, %xmm2 addps %xmm2, %xmm10 movss %xmm14, %xmm3 shufps $0x93, %xmm14, %xmm3 mulps %xmm5, %xmm3 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm3, %xmm11 addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 ALIGN_4 .L215: testq $8, MM jle .L216 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) MOVUPS_A1 (-30 * SIZE, A2, %xmm2) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x39, %xmm12, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm13, %xmm10 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 movss %xmm12, %xmm1 shufps $0x39, %xmm1, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 shufps $0x4e, %xmm13, %xmm2 mulps %xmm5, %xmm2 addps %xmm2, %xmm10 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_4 .L216: testq $4, MM jle .L217 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) MOVUPS_A1 (-30 * SIZE, A2, %xmm2) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 movss %xmm1, %xmm12 shufps $0x39, %xmm12, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 addps %xmm13, %xmm10 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, X1 ALIGN_4 .L217: testq $2, MM jle .L218 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_4 .L218: testq $1, MM jle .L219 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A1, LDA), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 movss -32 * SIZE(A2), %xmm2 mulss %xmm4, %xmm2 addss %xmm2, %xmm10 ALIGN_4 .L219: #ifdef HAVE_SSE3 haddps %xmm9, %xmm8 haddps %xmm11, %xmm10 haddps %xmm10, %xmm8 pshufd $0x1, %xmm8, %xmm9 pshufd $0x2, %xmm8, %xmm10 #else movaps %xmm8, %xmm0 unpcklps %xmm9, %xmm8 unpckhps %xmm9, %xmm0 movaps %xmm10, %xmm1 unpcklps %xmm11, %xmm10 unpckhps %xmm11, %xmm1 movaps %xmm8, %xmm9 unpcklps %xmm10, %xmm8 unpckhps %xmm10, %xmm9 movaps %xmm0, %xmm10 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm10 addps %xmm9, %xmm8 addps %xmm0, %xmm10 addps %xmm10, %xmm8 pshufd $0x2, %xmm8, %xmm9 pshufd $0x1, %xmm8, 
%xmm10 #endif mulss ALPHA, %xmm8 mulss ALPHA, %xmm9 mulss ALPHA, %xmm10 addss (Y), %xmm8 addq INCY, Y addss (Y), %xmm9 addq INCY, Y addss (Y), %xmm10 movss %xmm8, (Y1) addq INCY, Y1 movss %xmm9, (Y1) addq INCY, Y1 movss %xmm10, (Y1) jmp .L999 ALIGN_4 .L220: testq N, N jle .L999 cmpq $2, N jne .L230 leaq 32 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA), A2 leaq (A1, LDA, 2), A xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 cmpq $3, M jle .L227 testq $SIZE, A1 je .L22X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A2), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, X1 ALIGN_3 .L22X: testq $2 * SIZE, A1 je .L22XX #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A2), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_3 .L22XX: movaps -33 * SIZE(A2), %xmm12 MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) movq MM, I sarq $4, I jle .L225 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A1 (-29 * SIZE, A2, %xmm1) decq I jle .L223 ALIGN_4 .L222: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm2) movss %xmm1, %xmm12 shufps $0x39, %xmm12, %xmm12 mulps %xmm4, %xmm12 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm12, %xmm9 MOVUPS_A1 (-25 * SIZE, A2, %xmm12) mulps %xmm5, %xmm2 addps %xmm2, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) movss %xmm12, %xmm1 shufps $0x39, %xmm1, %xmm1 mulps %xmm5, %xmm1 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm1, %xmm9 MOVUPS_A1 (-21 * SIZE, A2, %xmm1) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm2) movss %xmm1, %xmm12 shufps $0x39, %xmm12, %xmm12 mulps %xmm4, %xmm12 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm12, %xmm9 MOVUPS_A1 (-17 * SIZE, A2, %xmm12) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) #endif mulps %xmm5, %xmm2 addps %xmm2, %xmm8 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) movss %xmm12, %xmm1 shufps $0x39, %xmm1, %xmm1 mulps %xmm5, %xmm1 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm1, %xmm9 MOVUPS_A1 (-13 * SIZE, A2, %xmm1) addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 decq I jg .L222 ALIGN_4 .L223: mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm2) movss %xmm1, %xmm12 shufps $0x39, %xmm12, %xmm12 mulps %xmm4, %xmm12 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm12, %xmm9 MOVUPS_A1 (-25 * SIZE, A2, %xmm12) mulps %xmm5, %xmm2 addps %xmm2, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) movss %xmm12, %xmm1 shufps $0x39, %xmm1, %xmm1 mulps %xmm5, %xmm1 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm1, %xmm9 MOVUPS_A1 (-21 * SIZE, A2, %xmm1) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm2) movss %xmm1, %xmm12 shufps $0x39, %xmm12, %xmm12 mulps %xmm4, %xmm12 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm12, %xmm9 MOVUPS_A1 (-17 * SIZE, A2, %xmm12) mulps %xmm5, %xmm2 addps %xmm2, %xmm8 movss %xmm12, %xmm1 shufps $0x39, %xmm1, %xmm1 mulps %xmm5, %xmm1 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm1, %xmm9 addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 ALIGN_4 .L225: testq $8, MM jle .L226 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A1 (-29 * SIZE, A2, %xmm1) mulps %xmm4, %xmm0 addps %xmm0, 
%xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm2) movss %xmm1, %xmm12 shufps $0x39, %xmm12, %xmm12 mulps %xmm4, %xmm12 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm12, %xmm9 MOVUPS_A1 (-25 * SIZE, A2, %xmm12) mulps %xmm5, %xmm2 addps %xmm2, %xmm8 movss %xmm12, %xmm1 shufps $0x39, %xmm1, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_4 .L226: testq $4, MM jle .L227 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A1 (-29 * SIZE, A2, %xmm1) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 movss %xmm1, %xmm12 shufps $0x39, %xmm12, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, X1 ALIGN_4 .L227: testq $2, MM jle .L228 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A2), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 shufps $0xe, %xmm4, %xmm4 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_4 .L228: testq $1, MM jle .L229 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A2), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 ALIGN_4 .L229: #ifdef HAVE_SSE3 haddps %xmm9, %xmm8 haddps %xmm8, %xmm8 #else movaps %xmm8, %xmm10 unpcklps %xmm9, %xmm8 unpckhps %xmm9, %xmm10 addps %xmm10, %xmm8 movhlps %xmm8, %xmm9 addps %xmm9, %xmm8 #endif pshufd $0x1, %xmm8, %xmm9 mulss ALPHA, %xmm8 mulss ALPHA, %xmm9 addss (Y), %xmm8 addq INCY, Y addss (Y), %xmm9 addq INCY, Y movss %xmm8, (Y1) addq INCY, Y1 movss %xmm9, (Y1) addq INCY, Y1 jmp .L999 ALIGN_4 .L230: cmpq $1, N jne .L999 leaq 32 * SIZE(BUFFER), X1 movq A, A1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 cmpq $3, M jle .L237 testq $SIZE, A1 je .L23X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 addq $1 * SIZE, A1 addq $1 * SIZE, X1 ALIGN_3 .L23X: testq $2 * SIZE, A1 je .L23XX #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 shufps $0xe, %xmm4, %xmm4 addq $2 * SIZE, A1 addq $2 * SIZE, X1 ALIGN_3 .L23XX: MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) movq MM, I sarq $4, I jle .L235 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A1 (-28 * SIZE, A1, %xmm12) decq I jle .L233 ALIGN_4 .L232: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) #endif mulps %xmm4, %xmm0 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) mulps %xmm5, %xmm12 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm12, %xmm9 MOVUPS_A1 (-20 * SIZE, A1, %xmm12) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1) #endif mulps %xmm4, %xmm0 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm0, %xmm8 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) mulps %xmm5, %xmm12 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm12, %xmm9 MOVUPS_A1 (-12 * SIZE, A1, %xmm12) addq $16 * SIZE, A1 addq $16 * SIZE, X1 decq I jg .L232 ALIGN_4 .L233: mulps %xmm4, %xmm0 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) mulps %xmm5, %xmm12 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm12, %xmm9 MOVUPS_A1 (-20 * SIZE, A1, %xmm12) mulps %xmm4, %xmm0 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm0, %xmm8 mulps %xmm5, %xmm12 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm12, %xmm9 addq $16 * SIZE, A1 addq $16 * SIZE, X1 ALIGN_4 .L235: testq $8, MM jle .L236 MOVUPS_A1 (-32 
* SIZE, A1, %xmm0) mulps %xmm4, %xmm0 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm12) mulps %xmm5, %xmm12 addps %xmm12, %xmm9 addq $8 * SIZE, A1 addq $8 * SIZE, X1 ALIGN_4 .L236: testq $4, MM jle .L237 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 addq $4 * SIZE, A1 addq $4 * SIZE, X1 ALIGN_4 .L237: testq $2, MM jle .L238 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 shufps $0xe, %xmm4, %xmm4 addq $2 * SIZE, A1 addq $2 * SIZE, X1 ALIGN_4 .L238: testq $1, MM jle .L239 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 ALIGN_4 .L239: addps %xmm9, %xmm8 #ifdef HAVE_SSE3 haddps %xmm8, %xmm8 haddps %xmm8, %xmm8 #else pshufd $1, %xmm8, %xmm9 pshufd $2, %xmm8, %xmm10 pshufd $3, %xmm8, %xmm11 addss %xmm9, %xmm8 addss %xmm11, %xmm10 addss %xmm10, %xmm8 #endif mulss ALPHA, %xmm8 addss (Y), %xmm8 movss %xmm8, (Y1) jmp .L999 ALIGN_4 .L300: cmpq $4, N jl .L310 ALIGN_3 .L301: subq $4, N leaq 32 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA, 2), A2 leaq (A1, LDA, 4), A xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 cmpq $3, M jle .L307 testq $SIZE, A1 je .L30X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A1, LDA), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 movss -32 * SIZE(A2), %xmm2 mulss %xmm4, %xmm2 addss %xmm2, %xmm10 movss -32 * SIZE(A2, LDA), %xmm3 mulss %xmm4, %xmm3 addss %xmm3, %xmm11 addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, X1 ALIGN_3 .L30X: testq $2 * SIZE, A1 je .L30XX #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd -32 * SIZE(A2, LDA), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm11 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_3 .L30XX: movaps -35 * SIZE(A1, LDA), %xmm12 movaps -34 * SIZE(A2), %xmm13 movaps -33 * SIZE(A2, LDA), %xmm14 MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) #ifdef PREFETCHW PREFETCHW 4 * SIZE(Y1) #endif movq MM, I sarq $4, I jle .L305 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) MOVUPS_A1 (-30 * SIZE, A2, %xmm2) MOVUPS_A2 (-29 * SIZE, A2, LDA, 1, %xmm3) decq I jle .L303 ALIGN_4 .L302: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x93, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 addps %xmm13, %xmm10 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) movss %xmm3, %xmm14 shufps $0x39, %xmm14, %xmm14 mulps %xmm4, %xmm14 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm14, %xmm11 MOVUPS_A2 (-25 * SIZE, A2, LDA, 1, %xmm14) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) #endif mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) movss %xmm12, %xmm1 shufps $0x93, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-23 * SIZE, A1, LDA, 1, %xmm1) shufps 
$0x4e, %xmm13, %xmm2 mulps %xmm5, %xmm2 addps %xmm2, %xmm10 MOVUPS_A1 (-22 * SIZE, A2, %xmm2) movss %xmm14, %xmm3 shufps $0x39, %xmm3, %xmm3 mulps %xmm5, %xmm3 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm3, %xmm11 MOVUPS_A2 (-21 * SIZE, A2, LDA, 1, %xmm3) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x93, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-19 * SIZE, A1, LDA, 1, %xmm12) shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 addps %xmm13, %xmm10 MOVUPS_A1 (-18 * SIZE, A2, %xmm13) movss %xmm3, %xmm14 shufps $0x39, %xmm14, %xmm14 mulps %xmm4, %xmm14 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm14, %xmm11 MOVUPS_A2 (-17 * SIZE, A2, LDA, 1, %xmm14) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) #endif mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) movss %xmm12, %xmm1 shufps $0x93, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-15 * SIZE, A1, LDA, 1, %xmm1) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) #endif shufps $0x4e, %xmm13, %xmm2 mulps %xmm5, %xmm2 addps %xmm2, %xmm10 MOVUPS_A1 (-14 * SIZE, A2, %xmm2) movss %xmm14, %xmm3 shufps $0x39, %xmm3, %xmm3 mulps %xmm5, %xmm3 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm3, %xmm11 MOVUPS_A2 (-13 * SIZE, A2, LDA, 1, %xmm3) addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 decq I jg .L302 ALIGN_4 .L303: mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x93, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 addps %xmm13, %xmm10 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) movss %xmm3, %xmm14 shufps $0x39, %xmm14, %xmm14 mulps %xmm4, %xmm14 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm14, %xmm11 MOVUPS_A2 (-25 * SIZE, A2, LDA, 1, %xmm14) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) movss %xmm12, %xmm1 shufps $0x93, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-23 * SIZE, A1, LDA, 1, %xmm1) shufps $0x4e, %xmm13, %xmm2 mulps %xmm5, %xmm2 addps %xmm2, %xmm10 MOVUPS_A1 (-22 * SIZE, A2, %xmm2) movss %xmm14, %xmm3 shufps $0x39, %xmm3, %xmm3 mulps %xmm5, %xmm3 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm3, %xmm11 MOVUPS_A2 (-21 * SIZE, A2, LDA, 1, %xmm3) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x93, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-19 * SIZE, A1, LDA, 1, %xmm12) shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 addps %xmm13, %xmm10 MOVUPS_A1 (-18 * SIZE, A2, %xmm13) movss %xmm3, %xmm14 shufps $0x39, %xmm14, %xmm14 mulps %xmm4, %xmm14 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm14, %xmm11 MOVUPS_A2 (-17 * SIZE, A2, LDA, 1, %xmm14) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 movss %xmm12, %xmm1 shufps $0x93, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 shufps $0x4e, %xmm13, %xmm2 mulps %xmm5, %xmm2 addps %xmm2, %xmm10 movss %xmm14, %xmm3 shufps $0x39, %xmm3, %xmm3 mulps %xmm5, %xmm3 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm3, %xmm11 addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 ALIGN_4 .L305: testq $8, MM jle .L306 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) MOVUPS_A1 (-30 * SIZE, A2, %xmm2) MOVUPS_A2 (-29 * SIZE, A2, LDA, 1, %xmm3) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, 
%xmm0) movss %xmm1, %xmm12 shufps $0x93, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 addps %xmm13, %xmm10 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) movss %xmm3, %xmm14 shufps $0x39, %xmm14, %xmm14 mulps %xmm4, %xmm14 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm14, %xmm11 MOVUPS_A2 (-25 * SIZE, A2, LDA, 1, %xmm14) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 movss %xmm12, %xmm1 shufps $0x93, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 shufps $0x4e, %xmm13, %xmm2 mulps %xmm5, %xmm2 addps %xmm2, %xmm10 movss %xmm14, %xmm3 shufps $0x39, %xmm3, %xmm3 mulps %xmm5, %xmm3 addps %xmm3, %xmm11 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_4 .L306: testq $4, MM jle .L307 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) MOVUPS_A1 (-30 * SIZE, A2, %xmm2) MOVUPS_A2 (-29 * SIZE, A2, LDA, 1, %xmm3) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 movss %xmm1, %xmm12 shufps $0x93, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 addps %xmm13, %xmm10 movss %xmm3, %xmm14 shufps $0x39, %xmm14, %xmm14 mulps %xmm4, %xmm14 addps %xmm14, %xmm11 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, X1 ALIGN_4 .L307: testq $2, MM jle .L308 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd -32 * SIZE(A2, LDA), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm11 shufps $0xe, %xmm4, %xmm4 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_4 .L308: testq $1, MM jle .L309 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A1, LDA), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 movss -32 * SIZE(A2), %xmm2 mulss %xmm4, %xmm2 addss %xmm2, %xmm10 movss -32 * SIZE(A2, LDA), %xmm3 mulss %xmm4, %xmm3 addss %xmm3, %xmm11 ALIGN_4 .L309: #ifdef HAVE_SSE3 haddps %xmm9, %xmm8 haddps %xmm11, %xmm10 haddps %xmm10, %xmm8 pshufd $0x1, %xmm8, %xmm9 pshufd $0x2, %xmm8, %xmm10 pshufd $0x3, %xmm8, %xmm11 #else movaps %xmm8, %xmm0 unpcklps %xmm9, %xmm8 unpckhps %xmm9, %xmm0 movaps %xmm10, %xmm1 unpcklps %xmm11, %xmm10 unpckhps %xmm11, %xmm1 movaps %xmm8, %xmm9 unpcklps %xmm10, %xmm8 unpckhps %xmm10, %xmm9 movaps %xmm0, %xmm10 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm10 addps %xmm9, %xmm8 addps %xmm0, %xmm10 addps %xmm10, %xmm8 pshufd $0x2, %xmm8, %xmm9 pshufd $0x1, %xmm8, %xmm10 pshufd $0x3, %xmm8, %xmm11 #endif mulss ALPHA, %xmm8 mulss ALPHA, %xmm9 mulss ALPHA, %xmm10 mulss ALPHA, %xmm11 addss (Y), %xmm8 addq INCY, Y addss (Y), %xmm9 addq INCY, Y addss (Y), %xmm10 addq INCY, Y addss (Y), %xmm11 addq INCY, Y movss %xmm8, (Y1) addq INCY, Y1 movss %xmm9, (Y1) addq INCY, Y1 movss %xmm10, (Y1) addq INCY, Y1 movss %xmm11, (Y1) addq INCY, Y1 cmpq $4, N jge .L301 ALIGN_4 .L310: testq N, N jle .L999 cmpq $3, N jne .L320 leaq 32 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA, 2), A2 leaq (A1, LDA, 4), A xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 cmpq $3, M jle .L317 testq $SIZE, A1 je .L31X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A1, LDA), %xmm1 
mulss %xmm4, %xmm1 addss %xmm1, %xmm9 movss -32 * SIZE(A2), %xmm2 mulss %xmm4, %xmm2 addss %xmm2, %xmm10 addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, X1 ALIGN_3 .L31X: testq $2 * SIZE, A1 je .L31XX #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_3 .L31XX: movaps -35 * SIZE(A1, LDA), %xmm12 movaps -34 * SIZE(A2), %xmm13 MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) movq MM, I sarq $4, I jle .L315 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) MOVUPS_A1 (-30 * SIZE, A2, %xmm2) decq I jle .L313 ALIGN_4 .L312: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x93, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm13, %xmm10 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) #endif mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) movss %xmm12, %xmm1 shufps $0x93, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-23 * SIZE, A1, LDA, 1, %xmm1) shufps $0x4e, %xmm13, %xmm2 mulps %xmm5, %xmm2 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm2, %xmm10 MOVUPS_A1 (-22 * SIZE, A2, %xmm2) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x93, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-19 * SIZE, A1, LDA, 1, %xmm12) shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm13, %xmm10 MOVUPS_A1 (-18 * SIZE, A2, %xmm13) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(X1) #endif mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) movss %xmm12, %xmm1 shufps $0x93, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-15 * SIZE, A1, LDA, 1, %xmm1) shufps $0x4e, %xmm13, %xmm2 mulps %xmm5, %xmm2 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm2, %xmm10 MOVUPS_A1 (-14 * SIZE, A2, %xmm2) addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 decq I jg .L312 ALIGN_4 .L313: mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x93, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 addps %xmm13, %xmm10 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) movss %xmm3, %xmm14 shufps $0x39, %xmm14, %xmm14 mulps %xmm4, %xmm14 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm14, %xmm11 MOVUPS_A2 (-25 * SIZE, A2, LDA, 1, %xmm14) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) movss %xmm12, %xmm1 shufps $0x93, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 MOVUPS_A2 (-23 * SIZE, A1, LDA, 1, %xmm1) shufps $0x4e, %xmm13, %xmm2 mulps %xmm5, %xmm2 addps %xmm2, %xmm10 MOVUPS_A1 (-22 * SIZE, A2, %xmm2) movss %xmm14, %xmm3 shufps $0x39, %xmm3, %xmm3 mulps %xmm5, %xmm3 MOVUPS_XL1 
(-20 * SIZE, X1, %xmm5) addps %xmm3, %xmm11 MOVUPS_A2 (-21 * SIZE, A2, LDA, 1, %xmm3) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x93, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-19 * SIZE, A1, LDA, 1, %xmm12) shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 addps %xmm13, %xmm10 MOVUPS_A1 (-18 * SIZE, A2, %xmm13) movss %xmm3, %xmm14 shufps $0x39, %xmm14, %xmm14 mulps %xmm4, %xmm14 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm14, %xmm11 MOVUPS_A2 (-17 * SIZE, A2, LDA, 1, %xmm14) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 movss %xmm12, %xmm1 shufps $0x93, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 shufps $0x4e, %xmm13, %xmm2 mulps %xmm5, %xmm2 addps %xmm2, %xmm10 movss %xmm14, %xmm3 shufps $0x39, %xmm3, %xmm3 mulps %xmm5, %xmm3 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm3, %xmm11 addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 ALIGN_4 .L315: testq $8, MM jle .L316 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) MOVUPS_A1 (-30 * SIZE, A2, %xmm2) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x93, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm13, %xmm10 MOVUPS_A1 (-26 * SIZE, A2, %xmm13) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 movss %xmm12, %xmm1 shufps $0x93, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 shufps $0x4e, %xmm13, %xmm2 mulps %xmm5, %xmm2 addps %xmm2, %xmm10 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_4 .L316: testq $4, MM jle .L317 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) MOVUPS_A1 (-30 * SIZE, A2, %xmm2) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 movss %xmm1, %xmm12 shufps $0x93, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 shufps $0x4e, %xmm2, %xmm13 mulps %xmm4, %xmm13 addps %xmm13, %xmm10 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, X1 ALIGN_4 .L317: testq $2, MM jle .L318 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_4 .L318: testq $1, MM jle .L319 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A1, LDA), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 movss -32 * SIZE(A2), %xmm2 mulss %xmm4, %xmm2 addss %xmm2, %xmm10 ALIGN_4 .L319: #ifdef HAVE_SSE3 haddps %xmm9, %xmm8 haddps %xmm11, %xmm10 haddps %xmm10, %xmm8 pshufd $0x1, %xmm8, %xmm9 pshufd $0x2, %xmm8, %xmm10 #else movaps %xmm8, %xmm0 unpcklps %xmm9, %xmm8 unpckhps %xmm9, %xmm0 movaps %xmm10, %xmm1 unpcklps %xmm11, %xmm10 unpckhps %xmm11, %xmm1 movaps %xmm8, %xmm9 unpcklps %xmm10, %xmm8 unpckhps %xmm10, %xmm9 movaps %xmm0, %xmm10 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm10 addps %xmm9, %xmm8 addps %xmm0, %xmm10 addps %xmm10, %xmm8 pshufd $0x2, %xmm8, %xmm9 pshufd $0x1, %xmm8, %xmm10 #endif mulss ALPHA, %xmm8 mulss ALPHA, %xmm9 mulss ALPHA, %xmm10 addss (Y), %xmm8 addq INCY, Y addss (Y), %xmm9 addq INCY, Y addss (Y), %xmm10 movss %xmm8, (Y1) addq INCY, Y1 movss %xmm9, (Y1) addq 
INCY, Y1 movss %xmm10, (Y1) jmp .L999 ALIGN_3 .L320: cmpq $2, N jne .L330 leaq 32 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA), A2 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 cmpq $3, M jle .L327 testq $SIZE, A1 je .L32X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A2), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 addq $1 * SIZE, A1 addq $1 * SIZE, A2 addq $1 * SIZE, X1 ALIGN_3 .L32X: testq $2 * SIZE, A1 je .L32XX #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A2), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_3 .L32XX: movaps -35 * SIZE(A2), %xmm12 MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) movq MM, I sarq $4, I jle .L325 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A1 (-31 * SIZE, A2, %xmm1) decq I jle .L323 ALIGN_4 .L322: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x93, %xmm1, %xmm12 mulps %xmm4, %xmm12 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm12, %xmm9 MOVUPS_A1 (-27 * SIZE, A2, %xmm12) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) movss %xmm12, %xmm1 shufps $0x93, %xmm12, %xmm1 mulps %xmm5, %xmm1 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm1, %xmm9 MOVUPS_A1 (-23 * SIZE, A2, %xmm1) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) #endif mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x93, %xmm1, %xmm12 mulps %xmm4, %xmm12 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm12, %xmm9 MOVUPS_A1 (-19 * SIZE, A2, %xmm12) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) #endif mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) movss %xmm12, %xmm1 shufps $0x93, %xmm12, %xmm1 mulps %xmm5, %xmm1 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm1, %xmm9 MOVUPS_A1 (-15 * SIZE, A2, %xmm1) addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 decq I jg .L322 ALIGN_4 .L323: mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x93, %xmm1, %xmm12 mulps %xmm4, %xmm12 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm12, %xmm9 MOVUPS_A1 (-27 * SIZE, A2, %xmm12) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) movss %xmm12, %xmm1 shufps $0x93, %xmm12, %xmm1 mulps %xmm5, %xmm1 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm1, %xmm9 MOVUPS_A1 (-23 * SIZE, A2, %xmm1) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-20 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x93, %xmm1, %xmm12 mulps %xmm4, %xmm12 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm12, %xmm9 MOVUPS_A1 (-19 * SIZE, A2, %xmm12) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 movss %xmm12, %xmm1 shufps $0x93, %xmm12, %xmm1 mulps %xmm5, %xmm1 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm1, %xmm9 addq $16 * SIZE, A1 addq $16 * SIZE, A2 addq $16 * SIZE, X1 ALIGN_4 .L325: testq $8, MM jle .L326 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A1 (-31 * SIZE, A2, %xmm1) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm0) movss %xmm1, %xmm12 shufps $0x93, %xmm1, %xmm12 mulps %xmm4, %xmm12 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm12, %xmm9 MOVUPS_A1 (-27 * SIZE, A2, %xmm12) mulps %xmm5, %xmm0 addps %xmm0, %xmm8 movss 
%xmm12, %xmm1 shufps $0x93, %xmm12, %xmm1 mulps %xmm5, %xmm1 addps %xmm1, %xmm9 addq $8 * SIZE, A1 addq $8 * SIZE, A2 addq $8 * SIZE, X1 ALIGN_4 .L326: testq $4, MM jle .L327 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A1 (-31 * SIZE, A2, %xmm1) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 movss %xmm1, %xmm12 shufps $0x93, %xmm1, %xmm12 mulps %xmm4, %xmm12 addps %xmm12, %xmm9 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, X1 ALIGN_4 .L327: testq $2, MM jle .L328 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(A2), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 shufps $0xe, %xmm4, %xmm4 addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 ALIGN_4 .L328: testq $1, MM jle .L329 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 movss -32 * SIZE(A2), %xmm1 mulss %xmm4, %xmm1 addss %xmm1, %xmm9 ALIGN_4 .L329: #ifdef HAVE_SSE3 haddps %xmm9, %xmm8 haddps %xmm8, %xmm8 #else movaps %xmm8, %xmm10 unpcklps %xmm9, %xmm8 unpckhps %xmm9, %xmm10 addps %xmm10, %xmm8 movhlps %xmm8, %xmm9 addps %xmm9, %xmm8 #endif pshufd $0x1, %xmm8, %xmm9 mulss ALPHA, %xmm8 mulss ALPHA, %xmm9 addss (Y), %xmm8 addq INCY, Y addss (Y), %xmm9 addq INCY, Y movss %xmm8, (Y1) addq INCY, Y1 movss %xmm9, (Y1) addq INCY, Y1 jmp .L999 ALIGN_4 .L330: cmpq $1, N jne .L999 leaq 32 * SIZE(BUFFER), X1 movq A, A1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 cmpq $3, M jle .L337 testq $SIZE, A1 je .L33X movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 addq $1 * SIZE, A1 addq $1 * SIZE, X1 ALIGN_3 .L33X: testq $2 * SIZE, A1 je .L33XX #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 shufps $0xe, %xmm4, %xmm4 addq $2 * SIZE, A1 addq $2 * SIZE, X1 ALIGN_3 .L33XX: MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) movq MM, I sarq $4, I jle .L335 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) MOVUPS_A1 (-28 * SIZE, A1, %xmm12) decq I jle .L333 ALIGN_4 .L332: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) #endif mulps %xmm4, %xmm0 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) mulps %xmm5, %xmm12 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm12, %xmm9 MOVUPS_A1 (-20 * SIZE, A1, %xmm12) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1) #endif mulps %xmm4, %xmm0 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm0, %xmm8 MOVUPS_A1 (-16 * SIZE, A1, %xmm0) mulps %xmm5, %xmm12 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm12, %xmm9 MOVUPS_A1 (-12 * SIZE, A1, %xmm12) addq $16 * SIZE, A1 addq $16 * SIZE, X1 decq I jg .L332 ALIGN_4 .L333: mulps %xmm4, %xmm0 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm0, %xmm8 MOVUPS_A1 (-24 * SIZE, A1, %xmm0) mulps %xmm5, %xmm12 MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) addps %xmm12, %xmm9 MOVUPS_A1 (-20 * SIZE, A1, %xmm12) mulps %xmm4, %xmm0 MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) addps %xmm0, %xmm8 mulps %xmm5, %xmm12 MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) addps %xmm12, %xmm9 addq $16 * SIZE, A1 addq $16 * SIZE, X1 ALIGN_4 .L335: testq $8, MM jle .L336 MOVUPS_A1 (-32 * SIZE, A1, %xmm0) mulps %xmm4, %xmm0 MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) addps %xmm0, %xmm8 MOVUPS_A1 (-28 * SIZE, A1, %xmm12) mulps %xmm5, %xmm12 addps %xmm12, %xmm9 addq $8 * SIZE, A1 addq $8 * SIZE, X1 ALIGN_4 .L336: testq $4, MM jle .L337 
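/* Descriptive note (added comment, not in the original source): this is the
   tail handling of the single-column (N == 1) dot-product path that starts at
   .L330. After the 16-wide unrolled loop, any rows left in MM are peeled off
   in blocks of 4 (.L336, below), 2 (.L337) and 1 (.L338); the partial sums
   held in xmm8/xmm9 are then reduced to a single scalar at .L339, scaled by
   ALPHA and accumulated into Y. */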
MOVUPS_A1 (-32 * SIZE, A1, %xmm0) mulps %xmm4, %xmm0 addps %xmm0, %xmm8 addq $4 * SIZE, A1 addq $4 * SIZE, X1 ALIGN_4 .L337: testq $2, MM jle .L338 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 shufps $0xe, %xmm4, %xmm4 addq $2 * SIZE, A1 addq $2 * SIZE, X1 ALIGN_4 .L338: testq $1, MM jle .L339 movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 addss %xmm0, %xmm8 ALIGN_4 .L339: addps %xmm9, %xmm8 #ifdef HAVE_SSE3 haddps %xmm8, %xmm8 haddps %xmm8, %xmm8 #else pshufd $1, %xmm8, %xmm9 pshufd $2, %xmm8, %xmm10 pshufd $3, %xmm8, %xmm11 addss %xmm9, %xmm8 addss %xmm11, %xmm10 addss %xmm10, %xmm8 #endif mulss ALPHA, %xmm8 addss (Y), %xmm8 movss %xmm8, (Y1) jmp .L999 #endif ALIGN_4 .L999: leaq (,M,SIZE),%rax addq %rax,AA jmp .L0t ALIGN_4 .L999x: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret ALIGN_4 EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/sgemv_t.c000066400000000000000000000106271313527062700172000ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" #define NBMAX 4096 #ifndef HAVE_KERNEL_16x4 static void sgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG i; FLOAT *a0,*a1,*a2,*a3; a0 = ap[0]; a1 = ap[1]; a2 = ap[2]; a3 = ap[3]; FLOAT temp0 = 0.0; FLOAT temp1 = 0.0; FLOAT temp2 = 0.0; FLOAT temp3 = 0.0; for ( i=0; i< n; i+=4 ) { temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; } y[0] = temp0; y[1] = temp1; y[2] = temp2; y[3] = temp3; } #endif static void sgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { BLASLONG i; FLOAT *a0; a0 = ap; FLOAT temp = 0.0; for ( i=0; i< n; i+=4 ) { temp += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; } *y = temp; } static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { BLASLONG i; for ( i=0; i> 2 ; n2 = n & 3 ; m3 = m & 3 ; m1 = m & -4 ; m2 = (m & (NBMAX-1)) - m3 ; BLASLONG NB = NBMAX; while ( NB == NBMAX ) { m1 -= NB; if ( m1 < 0) { if ( m2 == 0 ) break; NB = m2; } y_ptr = y; a_ptr = a; x_ptr = x; if ( inc_x == 1 ) xbuffer = x_ptr; else copy_x(NB,x_ptr,xbuffer,inc_x); FLOAT *ap[4]; FLOAT *yp; BLASLONG register lda4 = 4 * lda; ap[0] = a_ptr; ap[1] = a_ptr + lda; ap[2] = ap[1] + lda; ap[3] = ap[2] + lda; if ( n0 > 0 ) { BLASLONG nb1 = NBMAX / 4; for( j=0; j 0 ) { add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); y_ptr += n1 * inc_y * 4; a_ptr += n1 * lda4 ; } if ( n2 & 2 ) { sgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer); a_ptr += lda * 2; *y_ptr += ybuffer[0] * alpha; y_ptr += inc_y; *y_ptr += ybuffer[1] * alpha; y_ptr += inc_y; } if ( n2 & 1 ) { sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); a_ptr += lda; *y_ptr += ybuffer[0] * alpha; y_ptr += inc_y; } a += NB; x += NB * inc_x; } if ( m3 == 0 ) return(0); x_ptr = x; a_ptr = a; if ( m3 == 3 ) { FLOAT xtemp0 = *x_ptr * alpha; x_ptr += inc_x; FLOAT xtemp1 = *x_ptr * alpha; x_ptr += inc_x; FLOAT xtemp2 = *x_ptr * alpha; FLOAT *aj = a_ptr; y_ptr = y; if ( lda == 3 && inc_y == 1 ) { for ( j=0; j< ( n & -4) ; j+=4 ) { y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; aj += 12; } for ( ; j #include "common.h" #if defined(SANDYBRIDGE) #include "sger_microk_sandy-2.c" #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){ FLOAT *X = x; if (incx != 1) { X = buffer; COPY_K(m, x, incx, X, 1); } BLASLONG m1 = m & -16; while (n > 0) { FLOAT y0 = alpha * *y; if ( m1 > 0 ) { #ifdef HAVE_KERNEL_16 sger_kernel_16(m1, X, a, &y0); #else AXPYU_K(m1, 0, 0, y0, X, 1, a, 1, NULL, 0); #endif } if ( m > m1 ) { AXPYU_K(m-m1, 0, 0, y0, X+m1 , 1, a+m1, 1, NULL, 0); } a += lda; y += incy; n --; } return 0; } OpenBLAS-0.2.20/kernel/x86_64/sger_microk_sandy-2.c000066400000000000000000000107121313527062700213700ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_16 1 static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vbroadcastss (%4), %%xmm0 \n\t" // alpha "prefetcht0 256(%3,%0,4) \n\t" "vmovups (%3,%0,4), %%xmm8 \n\t" "vmovups 16(%3,%0,4), %%xmm9 \n\t" "vmovups 32(%3,%0,4), %%xmm10 \n\t" "vmovups 48(%3,%0,4), %%xmm11 \n\t" "prefetcht0 256(%2,%0,4) \n\t" "vmovups (%2,%0,4), %%xmm4 \n\t" "vmovups 16(%2,%0,4), %%xmm5 \n\t" "vmovups 32(%2,%0,4), %%xmm6 \n\t" "vmovups 48(%2,%0,4), %%xmm7 \n\t" "addq $16, %0 \n\t" "subq $16, %1 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" "vmulps %%xmm4, %%xmm0, %%xmm4 \n\t" "vaddps %%xmm8 , %%xmm4, %%xmm12 \n\t" "vmulps %%xmm5, %%xmm0, %%xmm5 \n\t" "vaddps %%xmm9 , %%xmm5, %%xmm13 \n\t" "vmulps %%xmm6, %%xmm0, %%xmm6 \n\t" "vaddps %%xmm10, %%xmm6, %%xmm14 \n\t" "vmulps %%xmm7, %%xmm0, %%xmm7 \n\t" "vaddps %%xmm11, %%xmm7, %%xmm15 \n\t" "prefetcht0 256(%3,%0,4) \n\t" "vmovups (%3,%0,4), %%xmm8 \n\t" "vmovups 16(%3,%0,4), %%xmm9 \n\t" "vmovups 32(%3,%0,4), %%xmm10 \n\t" "vmovups 48(%3,%0,4), %%xmm11 \n\t" "prefetcht0 256(%2,%0,4) \n\t" "vmovups (%2,%0,4), %%xmm4 \n\t" "vmovups 16(%2,%0,4), %%xmm5 \n\t" "vmovups 32(%2,%0,4), %%xmm6 \n\t" "vmovups 48(%2,%0,4), %%xmm7 \n\t" "vmovups %%xmm12, -64(%3,%0,4) \n\t" "vmovups %%xmm13, -48(%3,%0,4) \n\t" "vmovups %%xmm14, -32(%3,%0,4) \n\t" "vmovups %%xmm15, -16(%3,%0,4) \n\t" "addq $16, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulps %%xmm4, %%xmm0, %%xmm4 \n\t" "vmulps %%xmm5, %%xmm0, %%xmm5 \n\t" "vmulps %%xmm6, %%xmm0, %%xmm6 \n\t" "vmulps %%xmm7, %%xmm0, %%xmm7 \n\t" "vaddps %%xmm8 , %%xmm4, %%xmm12 \n\t" "vaddps %%xmm9 , %%xmm5, %%xmm13 \n\t" "vaddps %%xmm10, %%xmm6, %%xmm14 \n\t" "vaddps %%xmm11, %%xmm7, %%xmm15 \n\t" "vmovups %%xmm12, -64(%3,%0,4) \n\t" "vmovups %%xmm13, -48(%3,%0,4) \n\t" "vmovups %%xmm14, -32(%3,%0,4) \n\t" "vmovups %%xmm15, -16(%3,%0,4) \n\t" "vzeroupper \n\t" : : "r" 
(i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/ssymv_L.c000066400000000000000000000154201313527062700171640ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_L_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "ssymv_L_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_L_microk_sandy-2.c" #endif #ifndef HAVE_KERNEL_4x4 static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *tmp1, FLOAT *temp2) { FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 }; BLASLONG i; for (i=from; i=12 ) { BLASLONG m2 = (m/4)*4; for (i=j+1; i j+4 ) ssymv_kernel_4x4(j+4,m2,ap,x,y,tmp1,tmp2); for (i=m2; i=8 ) { BLASLONG j1 = ((from + 4)/4)*4; BLASLONG j2 = (m/4)*4; for (i=from; i= 0; i--) { aa = *(a + i); for (j = 0; j < n; j ++) { bb = *(c + i + j * ldc); bb *= aa; *b = bb; *(c + i + j * ldc) = bb; b ++; for (k = 0; k < i; k ++){ *(c + k + j * ldc) -= bb * *(a + k); } } a -= m; b -= 2 * n; } } #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa1, aa2; FLOAT bb1, bb2; FLOAT cc1, cc2; int i, j, k; ldc *= 2; a += (m - 1) * m * 2; b += (m - 1) * n * 2; for (i = m - 1; i >= 0; i--) { aa1 = *(a + i * 2 + 0); aa2 = *(a + i * 2 + 1); for (j = 0; j < n; j ++) { bb1 = *(c + i * 2 + 0 + j * ldc); bb2 = *(c + i * 2 + 1 + j * ldc); #ifndef CONJ cc1 = aa1 * bb1 - aa2 * bb2; cc2 = aa1 * bb2 + aa2 * bb1; #else cc1 = aa1 * bb1 + aa2 * bb2; cc2 = aa1 * bb2 - aa2 * bb1; #endif *(b + 0) = cc1; *(b + 1) = cc2; *(c + i * 2 + 0 + j * ldc) = cc1; *(c + i * 2 + 1 + j * ldc) = cc2; b += 2; for (k = 0; k < i; k ++){ #ifndef CONJ *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); #else *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); #endif } } a -= m * 2; b -= 4 * n; } } #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #ifdef COMPLEX FLOAT dummy2, #endif FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ BLASLONG i, j; FLOAT *aa, *cc; BLASLONG kk; #if 0 fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", m, n, k, offset); #endif j = (n >> GEMM_UNROLL_N_SHIFT); while (j > 0) { kk = m + offset; if (m & (GEMM_UNROLL_M - 1)) { for (i = 1; i < GEMM_UNROLL_M; i *= 2){ if (m & i) { aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; if (k - kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } solve(i, GEMM_UNROLL_N, aa + (kk - i) * i * COMPSIZE, b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); kk -= i; } } } i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; do { strsm_LN_solve_opt(k-kk, aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc, aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE,b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE); aa -= GEMM_UNROLL_M * k * COMPSIZE; cc -= GEMM_UNROLL_M * COMPSIZE; kk -= GEMM_UNROLL_M; i --; } while (i > 0); } b += GEMM_UNROLL_N * k * COMPSIZE; c += GEMM_UNROLL_N * ldc * COMPSIZE; j --; } if (n & (GEMM_UNROLL_N - 1)) { j = 
(GEMM_UNROLL_N >> 1); while (j > 0) { if (n & j) { kk = m + offset; if (m & (GEMM_UNROLL_M - 1)) { for (i = 1; i < GEMM_UNROLL_M; i *= 2){ if (m & i) { aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; if (k - kk > 0) { GEMM_KERNEL(i, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, b + j * kk * COMPSIZE, cc, ldc); } solve(i, j, aa + (kk - i) * i * COMPSIZE, b + (kk - i) * j * COMPSIZE, cc, ldc); kk -= i; } } } i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; do { if (k - kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + GEMM_UNROLL_M * kk * COMPSIZE, b + j * kk * COMPSIZE, cc, ldc); } solve(GEMM_UNROLL_M, j, aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_M) * j * COMPSIZE, cc, ldc); aa -= GEMM_UNROLL_M * k * COMPSIZE; cc -= GEMM_UNROLL_M * COMPSIZE; kk -= GEMM_UNROLL_M; i --; } while (i > 0); } b += j * k * COMPSIZE; c += j * ldc * COMPSIZE; } j >>= 1; } } return 0; } OpenBLAS-0.2.20/kernel/x86_64/strsm_kernel_LT_bulldozer.c000066400000000000000000001026031313527062700227210ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include "common.h" static FLOAT dm1 = -1.; #ifdef CONJ #define GEMM_KERNEL GEMM_KERNEL_L #else #define GEMM_KERNEL GEMM_KERNEL_N #endif #if GEMM_DEFAULT_UNROLL_M == 1 #define GEMM_UNROLL_M_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_M == 2 #define GEMM_UNROLL_M_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_M == 4 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 6 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_M == 16 #define GEMM_UNROLL_M_SHIFT 4 #endif #if GEMM_DEFAULT_UNROLL_N == 1 #define GEMM_UNROLL_N_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_N == 2 #define GEMM_UNROLL_N_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_N == 4 #define GEMM_UNROLL_N_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_N == 8 #define GEMM_UNROLL_N_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_N == 16 #define GEMM_UNROLL_N_SHIFT 4 #endif static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) { FLOAT *c1 = c + ldc ; BLASLONG n1 = n * 8; BLASLONG i=0; __asm__ __volatile__ ( " vzeroupper \n\t" " prefetcht0 (%4) \n\t" " prefetcht0 (%5) \n\t" " vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" " vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" " vxorps %%xmm10, %%xmm10, %%xmm10 \n\t" " vxorps %%xmm11, %%xmm11, %%xmm11 \n\t" " vxorps %%xmm12, %%xmm12, %%xmm12 \n\t" " vxorps %%xmm13, %%xmm13, %%xmm13 \n\t" " vxorps %%xmm14, %%xmm14, %%xmm14 \n\t" " vxorps %%xmm15, %%xmm15, %%xmm15 \n\t" " cmpq $0, %0 \n\t" " je 2f \n\t" " .align 16 \n\t" "1: \n\t" " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b " vmovups (%2,%1,8), %%xmm4 \n\t" " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" " vmovups 16(%2,%1,8), %%xmm5 \n\t" " vmovups 32(%2,%1,8), %%xmm6 \n\t" " vmovups 48(%2,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" " vfmaddps %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t" " vfmaddps %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t" " vfmaddps %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t" " vfmaddps %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t" " addq $8, %1 \n\t" " vfmaddps %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t" " vfmaddps %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t" " cmpq %1, %0 \n\t" " jnz 1b \n\t" "2: \n\t" " vmovups (%4) , %%xmm0 \n\t" " vmovups 16(%4) , %%xmm1 \n\t" " vmovups 32(%4) , %%xmm2 \n\t" " vmovups 48(%4) , %%xmm3 \n\t" " vmovups (%5) , %%xmm4 \n\t" " vmovups 16(%5) , %%xmm5 \n\t" " vmovups 32(%5) , %%xmm6 \n\t" " vmovups 48(%5) , %%xmm7 \n\t" " vsubps %%xmm8 , %%xmm0 , %%xmm8 \n\t" " vsubps %%xmm9 , %%xmm1 , %%xmm9 \n\t" " vsubps %%xmm10, %%xmm2 , %%xmm10 \n\t" " vsubps %%xmm11, %%xmm3 , %%xmm11 \n\t" " vsubps %%xmm12, %%xmm4 , %%xmm12 \n\t" " vsubps %%xmm13, %%xmm5 , %%xmm13 \n\t" " vsubps %%xmm14, %%xmm6 , %%xmm14 \n\t" " vsubps %%xmm15, %%xmm7 , %%xmm15 \n\t" "3: \n\t" " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0, read aa[i] " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa " vmovups 0(%6) , %%xmm4 
\n\t" // read a[k] " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" " addq $64 , %6 \n\t" // a -= m " addq $8 , %7 \n\t" // b -= n " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1, read aa[i] " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" " addq $64 , %6 \n\t" // a -= m " addq $8 , %7 \n\t" // b -= n " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2, read aa[i] " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" " addq $64 , %6 \n\t" // a -= m " addq $8 , %7 \n\t" // b -= n " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3, read aa[i] " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] " vmovups 48(%6) , 
%%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" " addq $64 , %6 \n\t" // a -= m " addq $8 , %7 \n\t" // b -= n " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4, read aa[i] " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" " addq $64 , %6 \n\t" // a -= m " addq $8 , %7 \n\t" // b -= n " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5, read aa[i] " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" " addq $64 , %6 \n\t" // a -= m " addq $8 , %7 \n\t" // b -= n " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6, read aa[i] " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" " addq $64 , %6 \n\t" // a -= m " addq $8 , %7 \n\t" // b -= n " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7, read aa[i] " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " 
vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" " addq $64 , %6 \n\t" // a -= m " addq $8 , %7 \n\t" // b -= n " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8, read aa[i] " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" " addq $64 , %6 \n\t" // a -= m " addq $8 , %7 \n\t" // b -= n " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9, read aa[i] " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" " addq $64 , %6 \n\t" // a -= m " addq $8 , %7 \n\t" // b -= n " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i] " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" " addq $64 , %6 \n\t" // a -= m " addq $8 , %7 \n\t" // b -= n " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " 
vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" " addq $64 , %6 \n\t" // a -= m " addq $8 , %7 \n\t" // b -= n " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" " addq $64 , %6 \n\t" // a -= m " addq $8 , %7 \n\t" // b -= n " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" " addq $64 , %6 \n\t" // a -= m " addq $8 , %7 \n\t" // b -= n " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" " addq $64 , %6 \n\t" // a -= m " addq $8 , %7 \n\t" // b -= n " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa " vzeroupper \n\t" : : "r" (n1), // 0 "a" (i), // 1 "r" (a), // 2 "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 "r" (as), // 6 "r" (bs) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #ifndef COMPLEX static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa, bb; int i, j, k; for (i = 0; i < m; i++) { aa = *(a 
+ i); for (j = 0; j < n; j ++) { bb = *(c + i + j * ldc); bb *= aa; *b = bb; *(c + i + j * ldc) = bb; b ++; for (k = i + 1; k < m; k ++){ *(c + k + j * ldc) -= bb * *(a + k); } } a += m; } } #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa1, aa2; FLOAT bb1, bb2; FLOAT cc1, cc2; int i, j, k; ldc *= 2; for (i = 0; i < m; i++) { aa1 = *(a + i * 2 + 0); aa2 = *(a + i * 2 + 1); for (j = 0; j < n; j ++) { bb1 = *(c + i * 2 + 0 + j * ldc); bb2 = *(c + i * 2 + 1 + j * ldc); #ifndef CONJ cc1 = aa1 * bb1 - aa2 * bb2; cc2 = aa1 * bb2 + aa2 * bb1; #else cc1 = aa1 * bb1 + aa2 * bb2; cc2 = aa1 * bb2 - aa2 * bb1; #endif *(b + 0) = cc1; *(b + 1) = cc2; *(c + i * 2 + 0 + j * ldc) = cc1; *(c + i * 2 + 1 + j * ldc) = cc2; b += 2; for (k = i + 1; k < m; k ++){ #ifndef CONJ *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); #else *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); #endif } } a += m * 2; } } #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #ifdef COMPLEX FLOAT dummy2, #endif FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ FLOAT *aa, *cc; BLASLONG kk; BLASLONG i, j, jj; #if 0 fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", m, n, k, offset); #endif jj = 0; j = (n >> GEMM_UNROLL_N_SHIFT); while (j > 0) { kk = offset; aa = a; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); while (i > 0) { strsm_LT_solve_opt(kk , aa , b , cc, ldc, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE); aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; kk += GEMM_UNROLL_M; i --; } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(i, GEMM_UNROLL_N, aa + kk * i * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; kk += i; } i >>= 1; } } b += GEMM_UNROLL_N * k * COMPSIZE; c += GEMM_UNROLL_N * ldc * COMPSIZE; j --; jj += GEMM_UNROLL_M; } if (n & (GEMM_UNROLL_N - 1)) { j = (GEMM_UNROLL_N >> 1); while (j > 0) { if (n & j) { kk = offset; aa = a; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); while (i > 0) { if (kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(GEMM_UNROLL_M, j, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; kk += GEMM_UNROLL_M; i --; } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { GEMM_KERNEL(i, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(i, j, aa + kk * i * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; kk += i; } i >>= 1; } } b += j * k * COMPSIZE; c += j * ldc * COMPSIZE; } j >>= 1; } } return 0; } OpenBLAS-0.2.20/kernel/x86_64/strsm_kernel_RN_bulldozer.c000066400000000000000000000275341313527062700227320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include "common.h" static FLOAT dm1 = -1.; #ifdef CONJ #define GEMM_KERNEL GEMM_KERNEL_R #else #define GEMM_KERNEL GEMM_KERNEL_N #endif #if GEMM_DEFAULT_UNROLL_M == 1 #define GEMM_UNROLL_M_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_M == 2 #define GEMM_UNROLL_M_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_M == 4 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 6 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_M == 16 #define GEMM_UNROLL_M_SHIFT 4 #endif #if GEMM_DEFAULT_UNROLL_N == 1 #define GEMM_UNROLL_N_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_N == 2 #define GEMM_UNROLL_N_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_N == 4 #define GEMM_UNROLL_N_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_N == 8 #define GEMM_UNROLL_N_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_N == 16 #define GEMM_UNROLL_N_SHIFT 4 #endif static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) { FLOAT *c1 = c + ldc ; BLASLONG n1 = n * 8; BLASLONG i=0; __asm__ __volatile__ ( " vzeroupper \n\t" " prefetcht0 (%4) \n\t" " prefetcht0 (%5) \n\t" " vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" " vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" " vxorps %%xmm10, %%xmm10, %%xmm10 \n\t" " vxorps %%xmm11, %%xmm11, %%xmm11 \n\t" " vxorps %%xmm12, %%xmm12, %%xmm12 \n\t" " vxorps %%xmm13, %%xmm13, %%xmm13 \n\t" " vxorps %%xmm14, %%xmm14, %%xmm14 \n\t" " vxorps %%xmm15, %%xmm15, %%xmm15 \n\t" " cmpq $0, %0 \n\t" " je 2f \n\t" " .align 16 \n\t" "1: \n\t" " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b " vmovups (%2,%1,8), %%xmm4 \n\t" " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" " vmovups 
16(%2,%1,8), %%xmm5 \n\t" " vmovups 32(%2,%1,8), %%xmm6 \n\t" " vmovups 48(%2,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" " vfmaddps %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t" " vfmaddps %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t" " vfmaddps %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t" " vfmaddps %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t" " addq $8, %1 \n\t" " vfmaddps %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t" " vfmaddps %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t" " cmpq %1, %0 \n\t" " jnz 1b \n\t" "2: \n\t" " vmovups (%4) , %%xmm0 \n\t" " vmovups 16(%4) , %%xmm1 \n\t" " vmovups 32(%4) , %%xmm2 \n\t" " vmovups 48(%4) , %%xmm3 \n\t" " vmovups (%5) , %%xmm4 \n\t" " vmovups 16(%5) , %%xmm5 \n\t" " vmovups 32(%5) , %%xmm6 \n\t" " vmovups 48(%5) , %%xmm7 \n\t" " vsubps %%xmm8 , %%xmm0 , %%xmm8 \n\t" " vsubps %%xmm9 , %%xmm1 , %%xmm9 \n\t" " vsubps %%xmm10, %%xmm2 , %%xmm10 \n\t" " vsubps %%xmm11, %%xmm3 , %%xmm11 \n\t" " vsubps %%xmm12, %%xmm4 , %%xmm12 \n\t" " vsubps %%xmm13, %%xmm5 , %%xmm13 \n\t" " vsubps %%xmm14, %%xmm6 , %%xmm14 \n\t" " vsubps %%xmm15, %%xmm7 , %%xmm15 \n\t" "3: \n\t" // i = 0 " vbroadcastss (%7), %%xmm0 \n\t" // read bb " vbroadcastss 4(%7), %%xmm1 \n\t" // read b " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" " vmovups %%xmm8 , (%6) \n\t" // write a " vmovups %%xmm9 , 16(%6) \n\t" " vmovups %%xmm10 , 32(%6) \n\t" " vmovups %%xmm11 , 48(%6) \n\t" " vmovups %%xmm8 , (%4) \n\t" // write c0 " vmovups %%xmm9 , 16(%4) \n\t" " vmovups %%xmm10 , 32(%4) \n\t" " vmovups %%xmm11 , 48(%4) \n\t" " vfnmaddps %%xmm12 , %%xmm8 , %%xmm1 , %%xmm12 \n\t" // c = c - aa * b " vfnmaddps %%xmm13 , %%xmm9 , %%xmm1 , %%xmm13 \n\t" " vfnmaddps %%xmm14 , %%xmm10 , %%xmm1 , %%xmm14 \n\t" " vfnmaddps %%xmm15 , %%xmm11 , %%xmm1 , %%xmm15 \n\t" " \n\t" // i = 1 " addq $8 , %7 \n\t" // b = b + 2 " addq $64 , %6 \n\t" // a = a + 16 " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb " vmovups %%xmm12 , (%6) \n\t" // write a " vmovups %%xmm13 , 16(%6) \n\t" // write a " vmovups %%xmm14 , 32(%6) \n\t" // write a " vmovups %%xmm15 , 48(%6) \n\t" // write a " vmovups %%xmm12 , (%5) \n\t" // write c1 " vmovups %%xmm13 , 16(%5) \n\t" " vmovups %%xmm14 , 32(%5) \n\t" " vmovups %%xmm15 , 48(%5) \n\t" " vzeroupper \n\t" : : "r" (n1), // 0 "a" (i), // 1 "r" (a), // 2 "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 "r" (as), // 6 "r" (bs) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #ifndef COMPLEX static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa, bb; int i, j, k; for (i = 0; i < n; i++) { bb = *(b + i); for (j = 0; j < m; j ++) { aa = *(c + j + i * ldc); aa *= bb; *a = aa; *(c + j + i * ldc) = aa; a ++; for (k = i + 1; k < n; k ++){ *(c + j + k * ldc) -= aa * *(b + k); } } b += n; } } #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa1, aa2; FLOAT bb1, bb2; FLOAT cc1, cc2; int i, j, k; ldc *= 2; for (i = 0; i < n; i++) { bb1 = *(b + i * 2 + 0); bb2 = *(b + i * 2 + 1); for (j = 0; j < m; 
j ++) { aa1 = *(c + j * 2 + 0 + i * ldc); aa2 = *(c + j * 2 + 1 + i * ldc); #ifndef CONJ cc1 = aa1 * bb1 - aa2 * bb2; cc2 = aa1 * bb2 + aa2 * bb1; #else cc1 = aa1 * bb1 + aa2 * bb2; cc2 = -aa1 * bb2 + aa2 * bb1; #endif *(a + 0) = cc1; *(a + 1) = cc2; *(c + j * 2 + 0 + i * ldc) = cc1; *(c + j * 2 + 1 + i * ldc) = cc2; a += 2; for (k = i + 1; k < n; k ++){ #ifndef CONJ *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); #else *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); #endif } } b += n * 2; } } #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #ifdef COMPLEX FLOAT dummy2, #endif FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ FLOAT *aa, *cc; BLASLONG kk; BLASLONG i, j, jj; #if 0 fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", m, n, k, offset); #endif jj = 0; j = (n >> GEMM_UNROLL_N_SHIFT); kk = -offset; while (j > 0) { aa = a; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { do { strsm_RN_solve_opt(kk, aa, b, cc, ldc, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE); aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } while (i > 0); } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(i, GEMM_UNROLL_N, aa + kk * i * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; } i >>= 1; } } kk += GEMM_UNROLL_N; b += GEMM_UNROLL_N * k * COMPSIZE; c += GEMM_UNROLL_N * ldc * COMPSIZE; j --; jj += GEMM_UNROLL_M; } if (n & (GEMM_UNROLL_N - 1)) { j = (GEMM_UNROLL_N >> 1); while (j > 0) { if (n & j) { aa = a; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); while (i > 0) { if (kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(GEMM_UNROLL_M, j, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { GEMM_KERNEL(i, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(i, j, aa + kk * i * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; } i >>= 1; } } b += j * k * COMPSIZE; c += j * ldc * COMPSIZE; kk += j; } j >>= 1; } } return 0; } OpenBLAS-0.2.20/kernel/x86_64/strsm_kernel_RT_bulldozer.c000066400000000000000000000310431313527062700227260ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include "common.h" static FLOAT dm1 = -1.; #ifdef CONJ #define GEMM_KERNEL GEMM_KERNEL_R #else #define GEMM_KERNEL GEMM_KERNEL_N #endif #if GEMM_DEFAULT_UNROLL_M == 1 #define GEMM_UNROLL_M_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_M == 2 #define GEMM_UNROLL_M_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_M == 4 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 6 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_M == 16 #define GEMM_UNROLL_M_SHIFT 4 #endif #if GEMM_DEFAULT_UNROLL_N == 1 #define GEMM_UNROLL_N_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_N == 2 #define GEMM_UNROLL_N_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_N == 4 #define GEMM_UNROLL_N_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_N == 8 #define GEMM_UNROLL_N_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_N == 16 #define GEMM_UNROLL_N_SHIFT 4 #endif static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) { FLOAT *c1 = c + ldc ; BLASLONG n1 = n * 8; BLASLONG i=0; as += (2 - 1) * 16; bs += (2 - 1) * 2; __asm__ __volatile__ ( " vzeroupper \n\t" " prefetcht0 (%4) \n\t" " prefetcht0 (%5) \n\t" " vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" " vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" " vxorps %%xmm10, %%xmm10, %%xmm10 \n\t" " vxorps %%xmm11, %%xmm11, %%xmm11 \n\t" " vxorps %%xmm12, %%xmm12, %%xmm12 \n\t" " vxorps %%xmm13, %%xmm13, %%xmm13 \n\t" " vxorps %%xmm14, %%xmm14, %%xmm14 \n\t" " vxorps %%xmm15, %%xmm15, %%xmm15 \n\t" " cmpq $0, %0 \n\t" " je 2f \n\t" " .align 16 \n\t" "1: \n\t" " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b " vmovups (%2,%1,8), %%xmm4 \n\t" " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" " vmovups 16(%2,%1,8), %%xmm5 \n\t" " vmovups 32(%2,%1,8), %%xmm6 \n\t" " vmovups 48(%2,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" " vfmaddps %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t" " vfmaddps %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t" " vfmaddps %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t" " vfmaddps %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t" " addq $8, %1 \n\t" " vfmaddps %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t" " vfmaddps %%xmm15, %%xmm1 , %%xmm7 , 
%%xmm15 \n\t" " cmpq %1, %0 \n\t" " jnz 1b \n\t" "2: \n\t" " vmovups (%4) , %%xmm0 \n\t" " vmovups 16(%4) , %%xmm1 \n\t" " vmovups 32(%4) , %%xmm2 \n\t" " vmovups 48(%4) , %%xmm3 \n\t" " vmovups (%5) , %%xmm4 \n\t" " vmovups 16(%5) , %%xmm5 \n\t" " vmovups 32(%5) , %%xmm6 \n\t" " vmovups 48(%5) , %%xmm7 \n\t" " vsubps %%xmm8 , %%xmm0 , %%xmm8 \n\t" " vsubps %%xmm9 , %%xmm1 , %%xmm9 \n\t" " vsubps %%xmm10, %%xmm2 , %%xmm10 \n\t" " vsubps %%xmm11, %%xmm3 , %%xmm11 \n\t" " vsubps %%xmm12, %%xmm4 , %%xmm12 \n\t" " vsubps %%xmm13, %%xmm5 , %%xmm13 \n\t" " vsubps %%xmm14, %%xmm6 , %%xmm14 \n\t" " vsubps %%xmm15, %%xmm7 , %%xmm15 \n\t" "3: \n\t" // i = 1 " vbroadcastss (%7), %%xmm1 \n\t" // read b " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb " vmovups %%xmm12 , (%6) \n\t" // write a " vmovups %%xmm13 , 16(%6) \n\t" // write a " vmovups %%xmm14 , 32(%6) \n\t" // write a " vmovups %%xmm15 , 48(%6) \n\t" // write a " vmovups %%xmm12 , (%5) \n\t" // write c1 " vmovups %%xmm13 , 16(%5) \n\t" " vmovups %%xmm14 , 32(%5) \n\t" " vmovups %%xmm15 , 48(%5) \n\t" " vfnmaddps %%xmm8 , %%xmm12 , %%xmm1 , %%xmm8 \n\t" // c = c - aa * b " vfnmaddps %%xmm9 , %%xmm13 , %%xmm1 , %%xmm9 \n\t" " vfnmaddps %%xmm10 , %%xmm14 , %%xmm1 , %%xmm10 \n\t" " vfnmaddps %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" " \n\t" // i = 0 " subq $8 , %7 \n\t" // b = b - 2 " subq $64 , %6 \n\t" // a = a - 16 " vbroadcastss (%7), %%xmm0 \n\t" // read bb " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" " vmovups %%xmm8 , (%6) \n\t" // write a " vmovups %%xmm9 , 16(%6) \n\t" " vmovups %%xmm10 , 32(%6) \n\t" " vmovups %%xmm11 , 48(%6) \n\t" " vmovups %%xmm8 , (%4) \n\t" // write c0 " vmovups %%xmm9 , 16(%4) \n\t" " vmovups %%xmm10 , 32(%4) \n\t" " vmovups %%xmm11 , 48(%4) \n\t" " vzeroupper \n\t" : : "r" (n1), // 0 "a" (i), // 1 "r" (a), // 2 "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 "r" (as), // 6 "r" (bs) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #ifndef COMPLEX static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa, bb; int i, j, k; a += (n - 1) * m; b += (n - 1) * n; for (i = n - 1; i >= 0; i--) { bb = *(b + i); for (j = 0; j < m; j ++) { aa = *(c + j + i * ldc); aa *= bb; *a = aa; *(c + j + i * ldc) = aa; a ++; for (k = 0; k < i; k ++){ *(c + j + k * ldc) -= aa * *(b + k); } } b -= n; a -= 2 * m; } } #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa1, aa2; FLOAT bb1, bb2; FLOAT cc1, cc2; int i, j, k; ldc *= 2; a += (n - 1) * m * 2; b += (n - 1) * n * 2; for (i = n - 1; i >= 0; i--) { bb1 = *(b + i * 2 + 0); bb2 = *(b + i * 2 + 1); for (j = 0; j < m; j ++) { aa1 = *(c + j * 2 + 0 + i * ldc); aa2 = *(c + j * 2 + 1 + i * ldc); #ifndef CONJ cc1 = aa1 * bb1 - aa2 * bb2; cc2 = aa1 * bb2 + aa2 * bb1; #else cc1 = aa1 * bb1 + aa2 * bb2; cc2 = - aa1 * bb2 + aa2 * bb1; #endif *(a + 0) = cc1; *(a + 1) = cc2; *(c + j * 2 + 0 + i * ldc) = cc1; *(c + j * 2 + 1 + i * ldc) = cc2; a += 2; for (k = 0; k < i; k ++){ #ifndef CONJ *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - 
cc2 * *(b + k * 2 + 1); *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); #else *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); #endif } } b -= n * 2; a -= 4 * m; } } #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #ifdef COMPLEX FLOAT dummy2, #endif FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ BLASLONG i, j; FLOAT *aa, *cc; BLASLONG kk; #if 0 fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", m, n, k, offset); #endif kk = n - offset; c += n * ldc * COMPSIZE; b += n * k * COMPSIZE; if (n & (GEMM_UNROLL_N - 1)) { j = 1; while (j < GEMM_UNROLL_N) { if (n & j) { aa = a; b -= j * k * COMPSIZE; c -= j * ldc* COMPSIZE; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { do { if (k - kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + GEMM_UNROLL_M * kk * COMPSIZE, b + j * kk * COMPSIZE, cc, ldc); } solve(GEMM_UNROLL_M, j, aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, b + (kk - j) * j * COMPSIZE, cc, ldc); aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } while (i > 0); } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); do { if (m & i) { if (k - kk > 0) { GEMM_KERNEL(i, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, b + j * kk * COMPSIZE, cc, ldc); } solve(i, j, aa + (kk - j) * i * COMPSIZE, b + (kk - j) * j * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; } i >>= 1; } while (i > 0); } kk -= j; } j <<= 1; } } j = (n >> GEMM_UNROLL_N_SHIFT); if (j > 0) { do { aa = a; b -= GEMM_UNROLL_N * k * COMPSIZE; c -= GEMM_UNROLL_N * ldc * COMPSIZE; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { do { strsm_RT_solve_opt(k - kk, aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc, aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE , b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE ); aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } while (i > 0); } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); do { if (m & i) { if (k - kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } solve(i, GEMM_UNROLL_N, aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; } i >>= 1; } while (i > 0); } kk -= GEMM_UNROLL_N; j --; } while (j > 0); } return 0; } OpenBLAS-0.2.20/kernel/x86_64/swap.S000066400000000000000000000207361313527062700164700ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
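The next three files (swap.S, swap_sse.S, swap_sse2.S) are successive implementations of the same BLAS-1 operation: exchange the vectors x and y element by element, honouring arbitrary strides. All the assembly complexity is in the unit-stride fast paths (block loads, prefetch, alignment handling); the semantics are just the scalar loop below, shown for single precision as an illustrative helper that is not part of OpenBLAS. Note that the assembly pre-scales INCX/INCY to bytes with salq/leaq, which C pointer arithmetic does implicitly.

#include <stddef.h>

static void swap_ref(ptrdiff_t n, float *x, ptrdiff_t incx,
                     float *y, ptrdiff_t incy)
{
    for (ptrdiff_t i = 0; i < n; i++) {
        float t = *x;      /* plain three-move exchange */
        *x = *y;
        *y = t;
        x += incx;
        y += incy;
    }
}

The strided fallback paths near the bottom of each file (.L14, .L50, .L40 and friends) are essentially this loop unrolled by four or eight.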
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define N ARG1 #define X ARG4 #define INCX ARG5 #define Y ARG6 #define INCY ARG2 #else #define N ARG1 #define X ARG2 #define INCX ARG3 #define Y ARG4 #define INCY %rbx #endif #define XX %r10 #define YY %r11 #include "l1param.h" PROLOGUE PROFCODE #ifndef WINDOWS_ABI #ifndef XDOUBLE movq 8(%rsp), INCY #else movq 24(%rsp), INCY #endif #else pushq %rbx movq 48(%rsp), X movq 56(%rsp), INCX movq 64(%rsp), Y movq 72(%rsp), INCY #endif EMMS salq $BASE_SHIFT, INCX salq $BASE_SHIFT, INCY cmpq $SIZE, INCX jne .L14 cmpq $SIZE, INCY jne .L14 movq N, %rax sarq $3, %rax jle .L15 ALIGN_3 .L16: #ifdef XDOUBLE movq 0(X), %mm0 movq 8(X), %mm1 movq 16(X), %mm2 movq 24(X), %mm3 movq 0(Y), %mm4 movq 8(Y), %mm5 movq 16(Y), %mm6 movq 24(Y), %mm7 movq %mm4, 0(X) movq %mm5, 8(X) movq %mm6, 16(X) movq %mm7, 24(X) movq %mm0, 0(Y) movq %mm1, 8(Y) movq %mm2, 16(Y) movq %mm3, 24(Y) movq 32(X), %mm0 movq 40(X), %mm1 movq 48(X), %mm2 movq 56(X), %mm3 movq 32(Y), %mm4 movq 40(Y), %mm5 movq 48(Y), %mm6 movq 56(Y), %mm7 movq %mm4, 32(X) movq %mm5, 40(X) movq %mm6, 48(X) movq %mm7, 56(X) movq %mm0, 32(Y) movq %mm1, 40(Y) movq %mm2, 48(Y) movq %mm3, 56(Y) movq 64(X), %mm0 movq 72(X), %mm1 movq 80(X), %mm2 movq 88(X), %mm3 movq 64(Y), %mm4 movq 72(Y), %mm5 movq 80(Y), %mm6 movq 88(Y), %mm7 movq %mm4, 64(X) movq %mm5, 72(X) movq %mm6, 80(X) movq %mm7, 88(X) movq %mm0, 64(Y) movq %mm1, 72(Y) movq %mm2, 80(Y) movq %mm3, 88(Y) movq 96(X), %mm0 movq 104(X), %mm1 movq 112(X), %mm2 movq 120(X), %mm3 movq 96(Y), %mm4 movq 104(Y), %mm5 movq 112(Y), %mm6 movq 120(Y), %mm7 movq %mm4, 96(X) movq %mm5, 104(X) movq %mm6, 112(X) movq %mm7, 120(X) movq %mm0, 96(Y) movq %mm1, 104(Y) movq %mm2, 112(Y) movq %mm3, 120(Y) #elif defined(DOUBLE) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movq 0 * SIZE(X), %mm0 movq 1 * SIZE(X), %mm1 movq 2 * SIZE(X), %mm2 movq 3 * SIZE(X), %mm3 movq 0 * SIZE(Y), %mm4 movq 1 * SIZE(Y), %mm5 movq 2 * SIZE(Y), %mm6 movq 3 * SIZE(Y), %mm7 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movq %mm4, 0 * SIZE(X) movq %mm5, 1 * SIZE(X) movq %mm6, 2 * SIZE(X) movq %mm7, 3 * SIZE(X) movq %mm0, 0 * SIZE(Y) movq %mm1, 1 * SIZE(Y) movq %mm2, 2 * SIZE(Y) movq %mm3, 3 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movq 4 * SIZE(X), 
%mm0 movq 5 * SIZE(X), %mm1 movq 6 * SIZE(X), %mm2 movq 7 * SIZE(X), %mm3 movq 4 * SIZE(Y), %mm4 movq 5 * SIZE(Y), %mm5 movq 6 * SIZE(Y), %mm6 movq 7 * SIZE(Y), %mm7 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movq %mm4, 4 * SIZE(X) movq %mm5, 5 * SIZE(X) movq %mm6, 6 * SIZE(X) movq %mm7, 7 * SIZE(X) movq %mm0, 4 * SIZE(Y) movq %mm1, 5 * SIZE(Y) movq %mm2, 6 * SIZE(Y) movq %mm3, 7 * SIZE(Y) #else #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movq 0 * SIZE(X), %mm0 movq 2 * SIZE(X), %mm1 movq 4 * SIZE(X), %mm2 movq 6 * SIZE(X), %mm3 movq 0 * SIZE(Y), %mm4 movq 2 * SIZE(Y), %mm5 movq 4 * SIZE(Y), %mm6 movq 6 * SIZE(Y), %mm7 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movq %mm4, 0 * SIZE(X) movq %mm5, 2 * SIZE(X) movq %mm6, 4 * SIZE(X) movq %mm7, 6 * SIZE(X) movq %mm0, 0 * SIZE(Y) movq %mm1, 2 * SIZE(Y) movq %mm2, 4 * SIZE(Y) movq %mm3, 6 * SIZE(Y) #endif addq $8 * SIZE, X addq $8 * SIZE, Y decq %rax jg .L16 ALIGN_3 .L15: movq N, %rax andq $7, %rax jle .L27 ALIGN_3 .L22: #ifdef XDOUBLE movq 0(X), %mm0 movq 8(X), %mm1 movq 0(Y), %mm4 movq 8(Y), %mm5 movq %mm4, 0(X) movq %mm5, 8(X) movq %mm0, 0(Y) movq %mm1, 8(Y) #else MOVQ 0 * SIZE(X), %mm0 MOVQ 0 * SIZE(Y), %mm4 MOVQ %mm4, 0 * SIZE(X) MOVQ %mm0, 0 * SIZE(Y) #endif addq $SIZE, X addq $SIZE, Y decq %rax jg .L22 jmp .L27 ALIGN_3 /* INCX != 1 or INCY != 1 */ .L14: movq N, %rax movq X, XX movq Y, YY sarq $2, %rax jle .L28 ALIGN_2 .L29: #ifdef XDOUBLE movq 0(X), %mm0 movq 8(X), %mm1 addq INCX, X movq 0(Y), %mm4 movq 8(Y), %mm5 addq INCY, Y movq %mm4, 0(XX) movq %mm5, 8(XX) addq INCX, XX movq %mm0, 0(YY) movq %mm1, 8(YY) addq INCY, YY movq 0(X), %mm0 movq 8(X), %mm1 addq INCX, X movq 0(Y), %mm4 movq 8(Y), %mm5 addq INCY, Y movq %mm4, 0(XX) movq %mm5, 8(XX) addq INCX, XX movq %mm0, 0(YY) movq %mm1, 8(YY) addq INCY, YY movq 0(X), %mm0 movq 8(X), %mm1 addq INCX, X movq 0(Y), %mm4 movq 8(Y), %mm5 addq INCY, Y movq %mm4, 0(XX) movq %mm5, 8(XX) addq INCX, XX movq %mm0, 0(YY) movq %mm1, 8(YY) addq INCY, YY movq 0(X), %mm0 movq 8(X), %mm1 addq INCX, X movq 0(Y), %mm4 movq 8(Y), %mm5 addq INCY, Y movq %mm4, 0(XX) movq %mm5, 8(XX) addq INCX, XX movq %mm0, 0(YY) movq %mm1, 8(YY) addq INCY, YY #else MOVQ (X), %mm0 addq INCX, X MOVQ (X), %mm1 addq INCX, X MOVQ (X), %mm2 addq INCX, X MOVQ (X), %mm3 addq INCX, X MOVQ (Y), %mm4 addq INCY, Y MOVQ (Y), %mm5 addq INCY, Y MOVQ (Y), %mm6 addq INCY, Y MOVQ (Y), %mm7 addq INCY, Y MOVQ %mm4, (XX) addq INCX, XX MOVQ %mm5, (XX) addq INCX, XX MOVQ %mm6, (XX) addq INCX, XX MOVQ %mm7, (XX) addq INCX, XX MOVQ %mm0, (YY) addq INCY, YY MOVQ %mm1, (YY) addq INCY, YY MOVQ %mm2, (YY) addq INCY, YY MOVQ %mm3, (YY) addq INCY, YY #endif decq %rax jg .L29 ALIGN_3 .L28: movq N, %rax andq $3, %rax jle .L27 ALIGN_3 .L35: #ifdef XDOUBLE movq 0(X), %mm0 movq 8(X), %mm1 movq 0(Y), %mm4 movq 8(Y), %mm5 movq %mm4, 0(X) movq %mm5, 8(X) movq %mm0, 0(Y) movq %mm1, 8(Y) #else MOVQ (X), %mm0 MOVQ (Y), %mm4 MOVQ %mm4, (X) MOVQ %mm0, (Y) #endif addq INCX, X addq INCY, Y decq %rax jg .L35 ALIGN_3 .L27: EMMS xorq %rax,%rax #ifdef WINDOWS_ABI popq %rbx #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/swap_sse.S000066400000000000000000000507561313527062700173470ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 #define X ARG4 #define INCX ARG5 #define Y ARG6 #define INCY ARG2 #else #define M ARG1 #define X ARG2 #define INCX ARG3 #define Y ARG4 #define INCY %rbx #endif #include "l1param.h" PROLOGUE PROFCODE #ifndef WINDOWS_ABI movq 8(%rsp), INCY #else pushq %rbx movq 48(%rsp), X movq 56(%rsp), INCX movq 64(%rsp), Y movq 72(%rsp), INCY #endif SAVEREGISTERS leaq (, INCX, SIZE), INCX leaq (, INCY, SIZE), INCY cmpq $SIZE, INCX jne .L50 cmpq $SIZE, INCY jne .L50 subq $-32 * SIZE, X subq $-32 * SIZE, Y cmpq $3, M jle .L16 testq $SIZE, Y je .L05 movss -32 * SIZE(X), %xmm0 movss -32 * SIZE(Y), %xmm1 movss %xmm1, -32 * SIZE(X) movss %xmm0, -32 * SIZE(Y) addq $1 * SIZE, X addq $1 * SIZE, Y decq M ALIGN_3 .L05: testq $2 * SIZE, Y je .L10 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm1 movlps %xmm1, -32 * SIZE(X) movlps %xmm0, -32 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y subq $2, M jle .L19 ALIGN_3 .L10: cmpq $3, M jle .L16 testq $2 * SIZE, X jne .L30 testq $1 * SIZE, X jne .L20 movq M, %rax sarq $5, %rax jle .L13 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -32 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -32 * SIZE(X) movaps -28 * SIZE(X), %xmm0 movaps -28 * SIZE(Y), %xmm1 movaps %xmm0, -28 * SIZE(Y) movaps %xmm1, -28 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -24 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movaps %xmm0, -24 * SIZE(Y) movaps %xmm1, -24 * SIZE(X) movaps -20 * SIZE(X), %xmm0 movaps -20 * SIZE(Y), %xmm1 movaps %xmm0, -20 * SIZE(Y) movaps %xmm1, -20 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), 
%xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -12 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 movaps %xmm0, -12 * SIZE(Y) movaps %xmm1, -12 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -8 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 movaps %xmm0, -8 * SIZE(Y) movaps %xmm1, -8 * SIZE(X) movaps -4 * SIZE(X), %xmm0 movaps -4 * SIZE(Y), %xmm1 movaps %xmm0, -4 * SIZE(Y) movaps %xmm1, -4 * SIZE(X) subq $-32 * SIZE, Y subq $-32 * SIZE, X decq %rax jg .L11 ALIGN_3 .L13: testq $16, M jle .L14 movaps -32 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -32 * SIZE(X) movaps -28 * SIZE(X), %xmm0 movaps -28 * SIZE(Y), %xmm1 movaps %xmm0, -28 * SIZE(Y) movaps %xmm1, -28 * SIZE(X) movaps -24 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movaps %xmm0, -24 * SIZE(Y) movaps %xmm1, -24 * SIZE(X) movaps -20 * SIZE(X), %xmm0 movaps -20 * SIZE(Y), %xmm1 movaps %xmm0, -20 * SIZE(Y) movaps %xmm1, -20 * SIZE(X) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L14: testq $8, M jle .L15 movaps -32 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -32 * SIZE(X) movaps -28 * SIZE(X), %xmm0 movaps -28 * SIZE(Y), %xmm1 movaps %xmm0, -28 * SIZE(Y) movaps %xmm1, -28 * SIZE(X) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L15: testq $4, M jle .L16 movaps -32 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -32 * SIZE(X) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L16: testq $2, M jle .L17 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm1 movlps %xmm1, -32 * SIZE(X) addq $2 * SIZE, X movlps %xmm0, -32 * SIZE(Y) addq $2 * SIZE, Y ALIGN_3 .L17: testq $1, M jle .L19 movss -32 * SIZE(X), %xmm0 movss -32 * SIZE(Y), %xmm1 movss %xmm1, -32 * SIZE(X) movss %xmm0, -32 * SIZE(Y) ALIGN_3 .L19: xorq %rax,%rax RESTOREREGISTERS #ifdef WINDOWS_ABI popq %rbx #endif ret ALIGN_3 .L20: movaps -33 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movss %xmm1, -32 * SIZE(X) pshufd $0x39, %xmm1, %xmm3 movlps %xmm3, -31 * SIZE(X) subq $3, M movq M, %rax sarq $5, %rax jle .L23 ALIGN_4 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -29 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -29 * SIZE(X) movaps -25 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -25 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -21 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -24 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -21 * SIZE(X) movaps -17 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -20 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -17 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -13 * SIZE(X), %xmm2 movaps -12 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -16 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -13 * SIZE(X) movaps -9 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -12 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, 
-9 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -5 * SIZE(X), %xmm2 movaps -4 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -8 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -5 * SIZE(X) movaps -1 * SIZE(X), %xmm0 movaps 0 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -4 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -1 * SIZE(X) subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L23: testq $16, M jle .L24 movaps -29 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -29 * SIZE(X) movaps -25 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -25 * SIZE(X) movaps -21 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -24 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -21 * SIZE(X) movaps -17 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -20 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -17 * SIZE(X) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L24: testq $8, M jle .L25 movaps -29 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -29 * SIZE(X) movaps -25 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -25 * SIZE(X) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L25: testq $4, M jle .L26 movaps -29 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -29 * SIZE(X) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L26: pshufd $0x39, %xmm0, %xmm2 pshufd $0xff, %xmm0, %xmm0 movlps %xmm2, -32 * SIZE(Y) movss %xmm0, -30 * SIZE(Y) testq $2, M jle .L27 movsd -29 * SIZE(X), %xmm0 movsd -29 * SIZE(Y), %xmm1 movlps %xmm0, -29 * SIZE(Y) movlps %xmm1, -29 * SIZE(X) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L27: testq $1, M jle .L29 movss -29 * SIZE(X), %xmm0 movss -29 * SIZE(Y), %xmm1 movss %xmm0, -29 * SIZE(Y) movss %xmm1, -29 * SIZE(X) ALIGN_3 .L29: xorq %rax,%rax RESTOREREGISTERS #ifdef WINDOWS_ABI popq %rbx #endif ret ALIGN_3 .L30: testq $1 * SIZE, X jne .L40 movhps -32 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movlps %xmm1, -32 * SIZE(X) subq $2, M movq M, %rax sarq $5, %rax jle .L33 ALIGN_4 .L31: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -30 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -30 * SIZE(X) movaps -26 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -26 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -22 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -24 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -22 * SIZE(X) movaps -18 * SIZE(X), 
%xmm0 movaps -16 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -20 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -18 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -14 * SIZE(X), %xmm2 movaps -12 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -14 * SIZE(X) movaps -10 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -12 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -10 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -6 * SIZE(X), %xmm2 movaps -4 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -8 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -6 * SIZE(X) movaps -2 * SIZE(X), %xmm0 movaps 0 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -4 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -2 * SIZE(X) subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L31 ALIGN_3 .L33: testq $16, M jle .L34 movaps -30 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -30 * SIZE(X) movaps -26 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -26 * SIZE(X) movaps -22 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -24 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -22 * SIZE(X) movaps -18 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -20 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -18 * SIZE(X) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L34: testq $8, M jle .L35 movaps -30 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -30 * SIZE(X) movaps -26 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -26 * SIZE(X) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L35: testq $4, M jle .L36 movaps -30 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -30 * SIZE(X) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L36: movhps %xmm0, -32 * SIZE(Y) testq $2, M jle .L37 movsd -30 * SIZE(X), %xmm0 movsd -30 * SIZE(Y), %xmm1 movlps %xmm0, -30 * SIZE(Y) movlps %xmm1, -30 * SIZE(X) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L37: testq $1, M jle .L39 movss -30 * SIZE(X), %xmm0 movss -30 * SIZE(Y), %xmm1 movss %xmm0, -30 * SIZE(Y) movss %xmm1, -30 * SIZE(X) ALIGN_3 .L39: xorq %rax,%rax RESTOREREGISTERS #ifdef WINDOWS_ABI popq %rbx #endif ret ALIGN_3 .L40: movaps -35 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movss %xmm1, -32 * SIZE(X) subq $3, M movq M, %rax sarq $5, %rax jle .L43 ALIGN_4 .L41: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -31 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -31 * SIZE(X) movaps -27 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -27 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -23 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -24 * 
SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -23 * SIZE(X) movaps -19 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -20 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -19 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -15 * SIZE(X), %xmm2 movaps -12 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -15 * SIZE(X) movaps -11 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -12 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -11 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -7 * SIZE(X), %xmm2 movaps -4 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -8 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -7 * SIZE(X) movaps -3 * SIZE(X), %xmm0 movaps 0 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -4 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -3 * SIZE(X) subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L41 ALIGN_3 .L43: testq $16, M jle .L44 movaps -31 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -31 * SIZE(X) movaps -27 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -27 * SIZE(X) movaps -23 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -24 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -23 * SIZE(X) movaps -19 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -20 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -19 * SIZE(X) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L44: testq $8, M jle .L45 movaps -31 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -31 * SIZE(X) movaps -27 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -27 * SIZE(X) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L45: testq $4, M jle .L46 movaps -31 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -31 * SIZE(X) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L46: movsd -31 * SIZE(X), %xmm2 pshufd $0x39, %xmm1, %xmm1 movlps %xmm1, -31 * SIZE(X) pshufd $0xff, %xmm0, %xmm0 movss %xmm0, -32 * SIZE(Y) movlps %xmm2, -31 * SIZE(Y) addq $3 * SIZE, X addq $3 * SIZE, Y testq $2, M jle .L47 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm1 movlps %xmm0, -32 * SIZE(Y) movlps %xmm1, -32 * SIZE(X) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L47: testq $1, M jle .L49 movss -32 * SIZE(X), %xmm0 movss -32 * SIZE(Y), %xmm1 movss %xmm0, -32 * SIZE(Y) movss %xmm1, -32 * SIZE(X) ALIGN_3 .L49: xorq %rax,%rax RESTOREREGISTERS #ifdef 
WINDOWS_ABI popq %rbx #endif ret ALIGN_3 .L50: movq M, %rax sarq $3, %rax jle .L55 ALIGN_3 .L51: movss (X), %xmm0 movss (Y), %xmm1 movss %xmm1, (X) addq INCX, X movss %xmm0, (Y) addq INCY, Y movss (X), %xmm0 movss (Y), %xmm1 movss %xmm1, (X) addq INCX, X movss %xmm0, (Y) addq INCY, Y movss (X), %xmm0 movss (Y), %xmm1 movss %xmm1, (X) addq INCX, X movss %xmm0, (Y) addq INCY, Y movss (X), %xmm0 movss (Y), %xmm1 movss %xmm1, (X) addq INCX, X movss %xmm0, (Y) addq INCY, Y movss (X), %xmm0 movss (Y), %xmm1 movss %xmm1, (X) addq INCX, X movss %xmm0, (Y) addq INCY, Y movss (X), %xmm0 movss (Y), %xmm1 movss %xmm1, (X) addq INCX, X movss %xmm0, (Y) addq INCY, Y movss (X), %xmm0 movss (Y), %xmm1 movss %xmm1, (X) addq INCX, X movss %xmm0, (Y) addq INCY, Y movss (X), %xmm0 movss (Y), %xmm1 movss %xmm1, (X) addq INCX, X movss %xmm0, (Y) addq INCY, Y decq %rax jg .L51 ALIGN_3 .L55: movq M, %rax andq $7, %rax jle .L57 ALIGN_3 .L56: movss (X), %xmm0 movss (Y), %xmm1 movss %xmm1, (X) movss %xmm0, (Y) addq INCX, X addq INCY, Y decq %rax jg .L56 ALIGN_3 .L57: xorq %rax, %rax RESTOREREGISTERS #ifdef WINDOWS_ABI popq %rbx #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/swap_sse2.S000066400000000000000000000253271313527062700174250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
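A recurring trick in swap_sse.S above and in the swap_sse2.S file beginning here: when x and y cannot both be made 16-byte aligned, the kernel keeps only aligned movaps accesses and stitches neighbouring aligned vectors together with shufps/SHUFPD_1 to synthesize the misaligned data (the .L20/.L30/.L40 paths). The sketch below shows the two-element double-precision case with SSE2 intrinsics; load_off_by_one is an illustrative name, and it assumes p - 1 and p + 2 are readable, which the kernel arranges by handling the boundary elements outside the stitched loop:

#include <emmintrin.h>

/* p is 8-byte aligned but not 16-byte aligned, so p - 1 is 16-byte aligned. */
static __m128d load_off_by_one(const double *p)
{
    __m128d lo = _mm_load_pd(p - 1);      /* { p[-1], p[0] }, aligned load */
    __m128d hi = _mm_load_pd(p + 1);      /* { p[1],  p[2] }, aligned load */
    return _mm_shuffle_pd(lo, hi, 1);     /* { p[0],  p[1] }               */
}

This mirrors what SHUFPD_1 does in the loop bodies: one register holds the previous aligned block, another the next one, and the shuffle yields the pair that straddles them, so every memory access in the hot loop stays aligned.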
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 #define X ARG4 #define INCX ARG5 #define Y ARG6 #define INCY ARG2 #else #define M ARG1 #define X ARG2 #define INCX ARG3 #define Y ARG4 #define INCY %rbx #endif #include "l1param.h" PROLOGUE PROFCODE #ifndef WINDOWS_ABI movq 8(%rsp), INCY #else pushq %rbx movq 48(%rsp), X movq 56(%rsp), INCX movq 64(%rsp), Y movq 72(%rsp), INCY #endif SAVEREGISTERS leaq (, INCX, SIZE), INCX leaq (, INCY, SIZE), INCY cmpq $SIZE, INCX jne .L40 cmpq $SIZE, INCY jne .L40 testq $SIZE, Y je .L10 movsd 0 * SIZE(X), %xmm0 movsd 0 * SIZE(Y), %xmm8 movsd %xmm8, 0 * SIZE(X) movsd %xmm0, 0 * SIZE(Y) addq $1 * SIZE, X addq $1 * SIZE, Y decq M jle .L19 ALIGN_4 .L10: subq $-16 * SIZE, X subq $-16 * SIZE, Y testq $SIZE, X jne .L20 movq M, %rax sarq $4, %rax jle .L13 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -14 * SIZE(X), %xmm0 movaps -14 * SIZE(Y), %xmm1 movaps %xmm0, -14 * SIZE(Y) movaps %xmm1, -14 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -12 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 movaps %xmm0, -12 * SIZE(Y) movaps %xmm1, -12 * SIZE(X) movaps -10 * SIZE(X), %xmm0 movaps -10 * SIZE(Y), %xmm1 movaps %xmm0, -10 * SIZE(Y) movaps %xmm1, -10 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -8 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 movaps %xmm0, -8 * SIZE(Y) movaps %xmm1, -8 * SIZE(X) movaps -6 * SIZE(X), %xmm0 movaps -6 * SIZE(Y), %xmm1 movaps %xmm0, -6 * SIZE(Y) movaps %xmm1, -6 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -4 * SIZE(X), %xmm0 movaps -4 * SIZE(Y), %xmm1 movaps %xmm0, -4 * SIZE(Y) movaps %xmm1, -4 * SIZE(X) movaps -2 * SIZE(X), %xmm0 movaps -2 * SIZE(Y), %xmm1 movaps %xmm0, -2 * SIZE(Y) movaps %xmm1, -2 * SIZE(X) subq $-16 * SIZE, Y subq $-16 * SIZE, X decq %rax jg .L11 ALIGN_3 .L13: testq $8, M jle .L14 movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -14 * SIZE(X), %xmm0 movaps -14 * SIZE(Y), %xmm1 movaps %xmm0, -14 * SIZE(Y) movaps %xmm1, -14 * SIZE(X) movaps -12 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 movaps %xmm0, -12 * SIZE(Y) movaps %xmm1, -12 * SIZE(X) movaps -10 * SIZE(X), %xmm0 movaps -10 * SIZE(Y), %xmm1 movaps %xmm0, -10 * SIZE(Y) movaps %xmm1, -10 * SIZE(X) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L14: testq $4, M jle .L15 movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -14 * SIZE(X), %xmm0 movaps -14 * SIZE(Y), %xmm1 movaps %xmm0, -14 * SIZE(Y) movaps %xmm1, -14 * SIZE(X) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L15: testq $2, M jle .L16 movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L16: testq $1, M jle .L19 movsd -16 * SIZE(X), %xmm0 movsd -16 * SIZE(Y), %xmm1 movlps %xmm1, -16 * SIZE(X) movlps %xmm0, -16 * SIZE(Y) ALIGN_3 .L19: xorq %rax,%rax RESTOREREGISTERS #ifdef WINDOWS_ABI popq %rbx #endif ret ALIGN_3 .L20: movhps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movlps %xmm1, -16 * SIZE(X) decq M jle .L29 movq M, %rax sarq $4, %rax jle .L23 ALIGN_4 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) 
- PREOFFSET(X) #endif movaps -15 * SIZE(X), %xmm2 movaps -14 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(X) movaps -13 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -13 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -11 * SIZE(X), %xmm2 movaps -10 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -12 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -11 * SIZE(X) movaps -9 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -10 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -9 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -7 * SIZE(X), %xmm2 movaps -6 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -8 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -7 * SIZE(X) movaps -5 * SIZE(X), %xmm0 movaps -4 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -6 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -5 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -3 * SIZE(X), %xmm2 movaps -2 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -4 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -3 * SIZE(X) movaps -1 * SIZE(X), %xmm0 movaps 0 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -2 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -1 * SIZE(X) subq $-16 * SIZE, X subq $-16 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L23: testq $8, M jle .L24 movaps -15 * SIZE(X), %xmm2 movaps -14 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(X) movaps -13 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -13 * SIZE(X) movaps -11 * SIZE(X), %xmm2 movaps -10 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -12 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -11 * SIZE(X) movaps -9 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -10 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -9 * SIZE(X) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L24: testq $4, M jle .L25 movaps -15 * SIZE(X), %xmm2 movaps -14 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(X) movaps -13 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -13 * SIZE(X) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L25: testq $2, M jle .L26 movaps -15 * SIZE(X), %xmm2 movaps -14 * SIZE(Y), %xmm3 SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(X) SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L26: testq $1, M jle .L29 movhps %xmm0, -16 * SIZE(Y) movhps -15 * SIZE(X), %xmm0 movhps %xmm1, -15 * SIZE(X) addq $SIZE, X addq $SIZE, Y ALIGN_3 .L29: movhps %xmm0, -16 * SIZE(Y) xorq %rax,%rax RESTOREREGISTERS #ifdef WINDOWS_ABI popq %rbx #endif ret ALIGN_3 .L40: movq M, %rax sarq $3, %rax jle .L45 ALIGN_3 .L41: movsd (X), %xmm0 movsd (Y), %xmm1 movsd %xmm1, (X) addq INCX, X movsd %xmm0, (Y) addq INCY, Y movsd (X), %xmm0 movsd (Y), %xmm1 movsd %xmm1, (X) addq INCX, X movsd %xmm0, (Y) addq INCY, Y movsd (X), %xmm0 movsd (Y), %xmm1 movsd %xmm1, (X) addq INCX, X movsd %xmm0, (Y) addq INCY, Y movsd (X), %xmm0 movsd (Y), %xmm1 movsd %xmm1, (X) addq INCX, X movsd %xmm0, (Y) addq INCY, 
Y movsd (X), %xmm0 movsd (Y), %xmm1 movsd %xmm1, (X) addq INCX, X movsd %xmm0, (Y) addq INCY, Y movsd (X), %xmm0 movsd (Y), %xmm1 movsd %xmm1, (X) addq INCX, X movsd %xmm0, (Y) addq INCY, Y movsd (X), %xmm0 movsd (Y), %xmm1 movsd %xmm1, (X) addq INCX, X movsd %xmm0, (Y) addq INCY, Y movsd (X), %xmm0 movsd (Y), %xmm1 movsd %xmm1, (X) addq INCX, X movsd %xmm0, (Y) addq INCY, Y decq %rax jg .L41 ALIGN_3 .L45: movq M, %rax andq $7, %rax jle .L47 ALIGN_3 .L46: movsd (X), %xmm0 movsd (Y), %xmm1 movsd %xmm1, (X) movsd %xmm0, (Y) addq INCX, X addq INCY, Y decq %rax jg .L46 ALIGN_3 .L47: xorq %rax, %rax RESTOREREGISTERS #ifdef WINDOWS_ABI popq %rbx #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/symv_L_sse.S000066400000000000000000000477251313527062700176500ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
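symv_L_sse.S, which begins here, computes the single-precision symmetric matrix-vector product using only the lower triangle of A. The preamble visible below first stores alpha*x into the caller-supplied workspace (BUFFER) and, when incy is not 1, also stages y contiguously there, so the main four-column blocks run on unit-stride, mostly aligned data. The underlying arithmetic is the textbook two-sided per-column update, sketched here as an illustrative reference that is not part of OpenBLAS; any beta scaling of y is assumed to have been applied by the caller, since this kernel only accumulates:

/* y += alpha * A * x, A symmetric, lower triangle stored, column-major. */
static void symv_lower_ref(int n, float alpha, const float *a, int lda,
                           const float *x, float *y)
{
    for (int j = 0; j < n; j++) {
        float t1 = alpha * x[j];   /* walks down column j                 */
        float t2 = 0.0f;           /* gathers the mirrored (row) half     */
        y[j] += t1 * a[j + j * lda];
        for (int i = j + 1; i < n; i++) {
            y[i] += t1 * a[i + j * lda];    /* lower-triangle column part */
            t2   += a[i + j * lda] * x[i];  /* same element reused as row */
        }
        y[j] += alpha * t2;
    }
}

Handling four columns per pass, as the xsum1..xsum4 and atemp1..atemp4 register groups in the main loop do, lets each loaded element of A serve both the column update and the row accumulation while it is still in registers.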
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef ATOM #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) #endif #ifdef CORE2 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) #endif #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) #endif #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) #endif #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 20) #endif #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 8) #define movsd movlps #endif #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 20) #endif #ifndef WINDOWS_ABI #define STACKSIZE 80 #define OLD_Y 8 + STACKSIZE(%rsp) #define OLD_INCY 16 + STACKSIZE(%rsp) #define OLD_BUFFER 24 + STACKSIZE(%rsp) #define M ARG1 #define N ARG2 #define A ARG3 #define LDA ARG4 #define X ARG5 #define INCX ARG6 #else #define STACKSIZE 256 #define OLD_LDA 40 + STACKSIZE(%rsp) #define OLD_X 48 + STACKSIZE(%rsp) #define OLD_INCX 56 + STACKSIZE(%rsp) #define OLD_Y 64 + STACKSIZE(%rsp) #define OLD_INCY 72 + STACKSIZE(%rsp) #define OLD_BUFFER 80 + STACKSIZE(%rsp) #define M ARG1 #define N ARG2 #define A ARG4 #define LDA ARG3 #define X %rdi #define INCX %rsi #endif #define Y %r10 #define INCY %r11 #define BUFFER %r12 #define TEMP %rax #define I %rax #define A1 %rbx #define A2 %rbp #define XX %r13 #define YY %r14 #define IS %r15 #define NEW_X BUFFER #define NEW_Y X #define ALPHA %xmm0 #define atemp1 %xmm0 #define atemp2 %xmm1 #define atemp3 %xmm2 #define atemp4 %xmm3 #define xsum1 %xmm4 #define xsum2 %xmm5 #define xsum3 %xmm6 #define xsum4 %xmm7 #define xtemp1 %xmm8 #define xtemp2 %xmm9 #define yy1 %xmm10 #define xt1 %xmm11 #define a1 %xmm12 #define a2 %xmm13 #define a3 %xmm14 #define a4 %xmm15 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq OLD_LDA, LDA movq OLD_X, X movq OLD_INCX, INCX movaps %xmm2, %xmm0 #endif movq OLD_Y, Y movq OLD_INCY, INCY movq OLD_BUFFER, BUFFER leaq (,INCX, SIZE), INCX leaq (,INCY, SIZE), INCY leaq (,LDA, SIZE), LDA testq M, M jle .L999 shufps $0, ALPHA, ALPHA movq BUFFER, XX movq M, %rax sarq $3, %rax jle .L02 ALIGN_3 .L01: movss 0 * SIZE(X), %xmm1 addq INCX, X movss 0 * SIZE(X), %xmm2 addq INCX, X movss 0 * SIZE(X), %xmm3 addq INCX, X movss 0 * SIZE(X), %xmm4 addq INCX, X movss 0 * SIZE(X), %xmm5 addq INCX, X movss 0 * SIZE(X), %xmm6 addq INCX, X movss 0 * SIZE(X), %xmm7 addq INCX, X movss 0 * SIZE(X), %xmm8 addq INCX, X mulss ALPHA, %xmm1 mulss ALPHA, %xmm2 
mulss ALPHA, %xmm3 mulss ALPHA, %xmm4 mulss ALPHA, %xmm5 mulss ALPHA, %xmm6 mulss ALPHA, %xmm7 mulss ALPHA, %xmm8 movss %xmm1, 0 * SIZE(XX) movss %xmm2, 1 * SIZE(XX) movss %xmm3, 2 * SIZE(XX) movss %xmm4, 3 * SIZE(XX) movss %xmm5, 4 * SIZE(XX) movss %xmm6, 5 * SIZE(XX) movss %xmm7, 6 * SIZE(XX) movss %xmm8, 7 * SIZE(XX) addq $8 * SIZE, XX decq %rax jg .L01 ALIGN_3 .L02: movq M, %rax andq $7, %rax jle .L05 ALIGN_3 .L03: movss 0 * SIZE(X), %xmm1 addq INCX, X mulss ALPHA, %xmm1 movss %xmm1, 0 * SIZE(XX) addq $1 * SIZE, XX decq %rax jg .L03 ALIGN_3 .L05: /* now we don't need original X */ movq Y, NEW_Y addq $512, XX andq $-512, XX cmpq $SIZE, INCY je .L10 movq Y, YY movq XX, NEW_Y movq M, %rax sarq $3, %rax jle .L07 ALIGN_3 .L06: movss 0 * SIZE(YY), %xmm0 addq INCY, YY movss 0 * SIZE(YY), %xmm1 addq INCY, YY movss 0 * SIZE(YY), %xmm2 addq INCY, YY movss 0 * SIZE(YY), %xmm3 addq INCY, YY movss 0 * SIZE(YY), %xmm4 addq INCY, YY movss 0 * SIZE(YY), %xmm5 addq INCY, YY movss 0 * SIZE(YY), %xmm6 addq INCY, YY movss 0 * SIZE(YY), %xmm7 addq INCY, YY movss %xmm0, 0 * SIZE(XX) movss %xmm1, 1 * SIZE(XX) movss %xmm2, 2 * SIZE(XX) movss %xmm3, 3 * SIZE(XX) movss %xmm4, 4 * SIZE(XX) movss %xmm5, 5 * SIZE(XX) movss %xmm6, 6 * SIZE(XX) movss %xmm7, 7 * SIZE(XX) addq $8 * SIZE, XX decq %rax jg .L06 ALIGN_3 .L07: movq M, %rax andq $7, %rax jle .L10 ALIGN_3 .L08: movss 0 * SIZE(YY), %xmm0 addq INCY, YY movss %xmm0, 0 * SIZE(XX) addq $1 * SIZE, XX decq %rax jg .L08 ALIGN_3 .L10: xorq IS, IS # is = 0 cmpq $4, N jl .L20 ALIGN_3 .L11: movq A, A1 leaq (A, LDA, 2), A2 leaq 4 * SIZE(A, LDA, 4), A leaq (NEW_X, IS, SIZE), XX leaq 4 * SIZE(NEW_Y, IS, SIZE), YY movaps 0 * SIZE(XX), atemp4 movsd 0 * SIZE(A1), xsum1 movhps 2 * SIZE(A1), xsum1 mulps atemp4, xsum1 movss 1 * SIZE(A1), xsum2 movss 1 * SIZE(A1, LDA, 1), a2 movss 2 * SIZE(A1, LDA, 1), a3 movss 3 * SIZE(A1, LDA, 1), a4 unpcklps a3, xsum2 unpcklps a4, a2 unpcklps a2, xsum2 mulps atemp4, xsum2 movss 2 * SIZE(A1), xsum3 movss 2 * SIZE(A1, LDA, 1), a2 movss 2 * SIZE(A2), a3 movss 3 * SIZE(A2), a4 unpcklps a3, xsum3 unpcklps a4, a2 unpcklps a2, xsum3 mulps atemp4, xsum3 movss 3 * SIZE(A1), xsum4 movss 3 * SIZE(A1, LDA, 1), a2 movss 3 * SIZE(A2), a3 movss 3 * SIZE(A2, LDA, 1), a4 unpcklps a3, xsum4 unpcklps a4, a2 unpcklps a2, xsum4 mulps atemp4, xsum4 pshufd $0x00, atemp4, atemp1 pshufd $0x55, atemp4, atemp2 pshufd $0xaa, atemp4, atemp3 pshufd $0xff, atemp4, atemp4 movaps 4 * SIZE(XX), xtemp1 movaps 8 * SIZE(XX), xtemp2 movsd 0 * SIZE(YY), yy1 movhps 2 * SIZE(YY), yy1 movsd 4 * SIZE(A1), a1 movhps 6 * SIZE(A1), a1 movsd 4 * SIZE(A1, LDA, 1), a2 movhps 6 * SIZE(A1, LDA, 1), a2 movsd 4 * SIZE(A2), a3 movhps 6 * SIZE(A2), a3 movsd 4 * SIZE(A2, LDA, 1), a4 movhps 6 * SIZE(A2, LDA, 1), a4 addq $4 * SIZE, XX addq $4 * SIZE, A1 addq $4 * SIZE, A2 movq M, I subq IS, I subq $4, I sarq $4, I jle .L14 ALIGN_3 .L12: movaps xtemp1, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movsd 4 * SIZE(A1), a1 movhps 6 * SIZE(A1), a1 PREFETCH PREFETCHSIZE(A1) movaps xtemp1, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movsd 4 * SIZE(A1, LDA, 1), a2 movhps 6 * SIZE(A1, LDA, 1), a2 movaps xtemp1, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movsd 4 * SIZE(A2), a3 movhps 6 * SIZE(A2), a3 #if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) PREFETCH PREFETCHSIZE(XX) #endif movaps xtemp1, xt1 movaps 8 * SIZE(XX), xtemp1 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movsd 4 * SIZE(A2, LDA, 1), a4 movhps 6 * 
SIZE(A2, LDA, 1), a4 movlps yy1, 0 * SIZE(YY) movhps yy1, 2 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhps 6 * SIZE(YY), yy1 movaps xtemp2, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movsd 8 * SIZE(A1), a1 movhps 10 * SIZE(A1), a1 PREFETCH PREFETCHSIZE(A1, LDA, 1) movaps xtemp2, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movsd 8 * SIZE(A1, LDA, 1), a2 movhps 10 * SIZE(A1, LDA, 1), a2 movaps xtemp2, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movsd 8 * SIZE(A2), a3 movhps 10 * SIZE(A2), a3 movaps xtemp2, xt1 movaps 12 * SIZE(XX), xtemp2 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movsd 8 * SIZE(A2, LDA, 1), a4 movhps 10 * SIZE(A2, LDA, 1), a4 movlps yy1, 4 * SIZE(YY) movhps yy1, 6 * SIZE(YY) movsd 8 * SIZE(YY), yy1 movhps 10 * SIZE(YY), yy1 movaps xtemp1, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movsd 12 * SIZE(A1), a1 movhps 14 * SIZE(A1), a1 PREFETCH PREFETCHSIZE(A2) movaps xtemp1, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movsd 12 * SIZE(A1, LDA, 1), a2 movhps 14 * SIZE(A1, LDA, 1), a2 movaps xtemp1, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movsd 12 * SIZE(A2), a3 movhps 14 * SIZE(A2), a3 #if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) PREFETCHW PREFETCHSIZE(YY) #endif movaps xtemp1, xt1 movaps 16 * SIZE(XX), xtemp1 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movsd 12 * SIZE(A2, LDA, 1), a4 movhps 14 * SIZE(A2, LDA, 1), a4 movlps yy1, 8 * SIZE(YY) movhps yy1, 10 * SIZE(YY) movsd 12 * SIZE(YY), yy1 movhps 14 * SIZE(YY), yy1 movaps xtemp2, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movsd 16 * SIZE(A1), a1 movhps 18 * SIZE(A1), a1 PREFETCH PREFETCHSIZE(A2, LDA, 1) movaps xtemp2, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movsd 16 * SIZE(A1, LDA, 1), a2 movhps 18 * SIZE(A1, LDA, 1), a2 movaps xtemp2, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movsd 16 * SIZE(A2), a3 movhps 18 * SIZE(A2), a3 movaps xtemp2, xt1 movaps 20 * SIZE(XX), xtemp2 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movsd 16 * SIZE(A2, LDA, 1), a4 movhps 18 * SIZE(A2, LDA, 1), a4 movlps yy1, 12 * SIZE(YY) movhps yy1, 14 * SIZE(YY) movsd 16 * SIZE(YY), yy1 movhps 18 * SIZE(YY), yy1 addq $16 * SIZE, XX addq $16 * SIZE, YY addq $16 * SIZE, A1 addq $16 * SIZE, A2 decq I jg .L12 ALIGN_3 .L14: movq M, I subq IS, I subq $4, I test $8, I jle .L15 movaps xtemp1, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movsd 4 * SIZE(A1), a1 movhps 6 * SIZE(A1), a1 movaps xtemp1, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movsd 4 * SIZE(A1, LDA, 1), a2 movhps 6 * SIZE(A1, LDA, 1), a2 movaps xtemp1, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movsd 4 * SIZE(A2), a3 movhps 6 * SIZE(A2), a3 movaps xtemp1, xt1 movaps 8 * SIZE(XX), xtemp1 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movsd 4 * SIZE(A2, LDA, 1), a4 movhps 6 * SIZE(A2, LDA, 1), a4 movlps yy1, 0 * SIZE(YY) movhps yy1, 2 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhps 6 * SIZE(YY), yy1 movaps xtemp2, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movsd 8 * SIZE(A1), a1 movhps 10 * SIZE(A1), a1 movaps xtemp2, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movsd 8 * SIZE(A1, LDA, 1), a2 movhps 10 * SIZE(A1, LDA, 1), a2 movaps xtemp2, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movsd 8 * 
SIZE(A2), a3 movhps 10 * SIZE(A2), a3 movaps xtemp2, xt1 movaps 12 * SIZE(XX), xtemp2 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movsd 8 * SIZE(A2, LDA, 1), a4 movhps 10 * SIZE(A2, LDA, 1), a4 movlps yy1, 4 * SIZE(YY) movhps yy1, 6 * SIZE(YY) movsd 8 * SIZE(YY), yy1 movhps 10 * SIZE(YY), yy1 addq $8 * SIZE, XX addq $8 * SIZE, YY addq $8 * SIZE, A1 addq $8 * SIZE, A2 ALIGN_3 .L15: test $4, I jle .L17 movaps xtemp1, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movsd 4 * SIZE(A1), a1 movaps xtemp1, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movsd 4 * SIZE(A1, LDA, 1), a2 movaps xtemp1, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movsd 4 * SIZE(A2), a3 movaps xtemp1, xt1 movsd 4 * SIZE(XX), xtemp1 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movsd 4 * SIZE(A2, LDA, 1), a4 movlps yy1, 0 * SIZE(YY) movhps yy1, 2 * SIZE(YY) movsd 4 * SIZE(YY), yy1 addq $4 * SIZE, XX addq $4 * SIZE, YY addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L17: testq $2, M jle .L18 pxor xtemp2, xtemp2 movlhps xtemp2, a1 movaps xtemp1, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movss 2 * SIZE(A1), a1 movlhps xtemp2, a2 movaps xtemp1, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movss 2 * SIZE(A1, LDA, 1), a2 movlhps xtemp2, a3 movaps xtemp1, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movss 2 * SIZE(A2), a3 movlhps xtemp2, a4 movaps xtemp1, xt1 movss 2 * SIZE(XX), xtemp1 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movss 2 * SIZE(A2, LDA, 1), a4 movlps yy1, 0 * SIZE(YY) movss 2 * SIZE(YY), yy1 addq $2 * SIZE, XX addq $2 * SIZE, YY addq $2 * SIZE, A1 addq $2 * SIZE, A2 ALIGN_3 .L18: testq $1, M jle .L19 movss 0 * SIZE(XX), xtemp1 movss 0 * SIZE(YY), yy1 movss 0 * SIZE(A1), a1 movss 0 * SIZE(A1, LDA, 1), a2 movss 0 * SIZE(A2), a3 movss 0 * SIZE(A2, LDA, 1), a4 movaps xtemp1, xt1 mulss a1, xt1 mulss atemp1, a1 addss xt1, xsum1 addss a1, yy1 movaps xtemp1, xt1 mulss a2, xt1 mulss atemp2, a2 addss xt1, xsum2 addss a2, yy1 movaps xtemp1, xt1 mulss a3, xt1 mulss atemp3, a3 addss xt1, xsum3 addss a3, yy1 movaps xtemp1, xt1 mulss a4, xt1 mulss atemp4, a4 addss xt1, xsum4 addss a4, yy1 movss yy1, 0 * SIZE(YY) ALIGN_3 .L19: #ifndef HAVE_SSE3 movaps xsum1, xtemp1 unpcklps xsum3, xsum1 unpckhps xsum3, xtemp1 movaps xsum2, xtemp2 unpcklps xsum4, xsum2 unpckhps xsum4, xtemp2 movaps xsum1, xsum3 unpcklps xsum2, xsum1 unpckhps xsum2, xsum3 movaps xtemp1, xsum4 unpcklps xtemp2, xtemp1 unpckhps xtemp2, xsum4 addps xsum3, xsum1 addps xtemp1, xsum4 addps xsum4, xsum1 #else haddps xsum2, xsum1 haddps xsum4, xsum3 haddps xsum3, xsum1 #endif movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1 movhps 2 * SIZE(NEW_Y, IS, SIZE), yy1 addps xsum1, yy1 movsd yy1, 0 * SIZE(NEW_Y, IS, SIZE) movhps yy1, 2 * SIZE(NEW_Y, IS, SIZE) addq $4, IS movq IS, I addq $4, I cmpq N, I jle .L11 ALIGN_3 .L20: testq $2, N jle .L30 movq A, A1 leaq 2 * SIZE(A, LDA, 2), A movaps 0 * SIZE(NEW_X, IS, SIZE), atemp4 #if defined(OPTERON) pxor xsum1, xsum1 #endif movsd 0 * SIZE(A1), xsum1 mulps atemp4, xsum1 movss 1 * SIZE(A1), xsum2 movss 1 * SIZE(A1, LDA, 1), a2 unpcklps a2, xsum2 mulps atemp4, xsum2 pshufd $0x00, atemp4, atemp1 pshufd $0x55, atemp4, atemp2 testq $1, M jle .L29 movss 2 * SIZE(A1), a1 movss 2 * SIZE(A1, LDA, 1), a2 movss 2 * SIZE(NEW_X, IS, SIZE), xtemp1 movss 2 * SIZE(NEW_Y, IS, SIZE), yy1 movaps xtemp1, xt1 mulss a1, xt1 mulss atemp1, a1 addss xt1, xsum1 addps a1, yy1 movaps xtemp1, xt1 mulss 
a2, xt1 mulss atemp2, a2 addss xt1, xsum2 addss a2, yy1 movss yy1, 2 * SIZE(NEW_Y, IS, SIZE) ALIGN_3 .L29: #ifndef HAVE_SSE3 unpcklps xsum2, xsum1 movhlps xsum1, xsum2 addps xsum2, xsum1 #else haddps xsum2, xsum1 haddps xsum1, xsum1 #endif movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1 addps xsum1, yy1 movlps yy1, 0 * SIZE(NEW_Y, IS, SIZE) addq $2, IS ALIGN_3 .L30: testq $1, N jle .L990 movss 0 * SIZE(NEW_X, IS, SIZE), xsum1 mulss 0 * SIZE(A), xsum1 addss 0 * SIZE(NEW_Y, IS, SIZE), xsum1 movss xsum1, 0 * SIZE(NEW_Y, IS, SIZE) ALIGN_3 .L990: cmpq $SIZE, INCY je .L999 movq M, %rax sarq $3, %rax jle .L997 ALIGN_3 .L996: movss 0 * SIZE(NEW_Y), %xmm0 movss 1 * SIZE(NEW_Y), %xmm1 movss 2 * SIZE(NEW_Y), %xmm2 movss 3 * SIZE(NEW_Y), %xmm3 movss 4 * SIZE(NEW_Y), %xmm4 movss 5 * SIZE(NEW_Y), %xmm5 movss 6 * SIZE(NEW_Y), %xmm6 movss 7 * SIZE(NEW_Y), %xmm7 movss %xmm0, 0 * SIZE(Y) addq INCY, Y movss %xmm1, 0 * SIZE(Y) addq INCY, Y movss %xmm2, 0 * SIZE(Y) addq INCY, Y movss %xmm3, 0 * SIZE(Y) addq INCY, Y movss %xmm4, 0 * SIZE(Y) addq INCY, Y movss %xmm5, 0 * SIZE(Y) addq INCY, Y movss %xmm6, 0 * SIZE(Y) addq INCY, Y movss %xmm7, 0 * SIZE(Y) addq INCY, Y addq $8 * SIZE, NEW_Y decq %rax jg .L996 ALIGN_3 .L997: movq M, %rax andq $7, %rax jle .L999 ALIGN_3 .L998: movss 0 * SIZE(NEW_Y), %xmm0 movss %xmm0, 0 * SIZE(Y) addq INCY, Y addq $1 * SIZE, NEW_Y decq %rax jg .L998 ALIGN_3 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/symv_L_sse2.S000066400000000000000000000461401313527062700177200ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef ATOM #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) #endif #ifdef CORE2 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) #endif #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) #endif #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) #endif #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 20) #endif #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 8) #define movsd movlpd #endif #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 24) #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 20) #endif #ifndef WINDOWS_ABI #define STACKSIZE 80 #define OLD_Y 8 + STACKSIZE(%rsp) #define OLD_INCY 16 + STACKSIZE(%rsp) #define OLD_BUFFER 24 + STACKSIZE(%rsp) #define M ARG1 #define N ARG2 #define A ARG3 #define LDA ARG4 #define X ARG5 #define INCX ARG6 #else #define STACKSIZE 256 #define OLD_LDA 40 + STACKSIZE(%rsp) #define OLD_X 48 + STACKSIZE(%rsp) #define OLD_INCX 56 + STACKSIZE(%rsp) #define OLD_Y 64 + STACKSIZE(%rsp) #define OLD_INCY 72 + STACKSIZE(%rsp) #define OLD_BUFFER 80 + STACKSIZE(%rsp) #define M ARG1 #define N ARG2 #define A ARG4 #define LDA ARG3 #define X %rdi #define INCX %rsi #endif #define Y %r10 #define INCY %r11 #define BUFFER %r12 #define TEMP %rax #define I %rax #define A1 %rbx #define A2 %rbp #define XX %r13 #define YY %r14 #define IS %r15 #define NEW_X BUFFER #define NEW_Y X #define ALPHA %xmm0 #define xtemp1 %xmm0 #define xtemp2 %xmm1 #define yy1 %xmm2 #define yy2 %xmm3 #define atemp1 %xmm4 #define atemp2 %xmm5 #define atemp3 %xmm6 #define atemp4 %xmm7 #define xsum1 %xmm8 #define xsum2 %xmm9 #define xsum3 %xmm10 #define xsum4 %xmm11 #define a1 %xmm12 #define a2 %xmm13 #define a3 %xmm14 #define xt1 %xmm15 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq OLD_LDA, LDA movq OLD_X, X movq OLD_INCX, INCX movaps %xmm2, %xmm0 #endif movq OLD_Y, Y movq OLD_INCY, INCY movq OLD_BUFFER, BUFFER leaq (,INCX, SIZE), INCX leaq (,INCY, SIZE), INCY leaq (,LDA, SIZE), LDA testq M, M jle .L999 unpcklpd ALPHA, ALPHA movq BUFFER, XX movq M, %rax sarq $3, %rax jle .L02 ALIGN_3 .L01: movsd 0 * SIZE(X), %xmm1 addq INCX, X movhpd 0 * SIZE(X), %xmm1 addq INCX, X movsd 0 * 
SIZE(X), %xmm2 addq INCX, X movhpd 0 * SIZE(X), %xmm2 addq INCX, X movsd 0 * SIZE(X), %xmm3 addq INCX, X movhpd 0 * SIZE(X), %xmm3 addq INCX, X movsd 0 * SIZE(X), %xmm4 addq INCX, X movhpd 0 * SIZE(X), %xmm4 addq INCX, X mulpd ALPHA, %xmm1 mulpd ALPHA, %xmm2 mulpd ALPHA, %xmm3 mulpd ALPHA, %xmm4 movapd %xmm1, 0 * SIZE(XX) movapd %xmm2, 2 * SIZE(XX) movapd %xmm3, 4 * SIZE(XX) movapd %xmm4, 6 * SIZE(XX) addq $8 * SIZE, XX decq %rax jg .L01 ALIGN_3 .L02: movq M, %rax andq $7, %rax jle .L05 ALIGN_3 .L03: movsd 0 * SIZE(X), %xmm1 addq INCX, X mulsd ALPHA, %xmm1 movlpd %xmm1, 0 * SIZE(XX) addq $1 * SIZE, XX decq %rax jg .L03 ALIGN_3 .L05: /* now we don't need original X */ movq Y, NEW_Y addq $512, XX andq $-512, XX cmpq $SIZE, INCY je .L10 movq Y, YY movq XX, NEW_Y movq M, %rax sarq $3, %rax jle .L07 ALIGN_3 .L06: movsd 0 * SIZE(YY), %xmm0 addq INCY, YY movhpd 0 * SIZE(YY), %xmm0 addq INCY, YY movsd 0 * SIZE(YY), %xmm1 addq INCY, YY movhpd 0 * SIZE(YY), %xmm1 addq INCY, YY movsd 0 * SIZE(YY), %xmm2 addq INCY, YY movhpd 0 * SIZE(YY), %xmm2 addq INCY, YY movsd 0 * SIZE(YY), %xmm3 addq INCY, YY movhpd 0 * SIZE(YY), %xmm3 addq INCY, YY movapd %xmm0, 0 * SIZE(XX) movapd %xmm1, 2 * SIZE(XX) movapd %xmm2, 4 * SIZE(XX) movapd %xmm3, 6 * SIZE(XX) addq $8 * SIZE, XX decq %rax jg .L06 ALIGN_3 .L07: movq M, %rax andq $7, %rax jle .L10 ALIGN_3 .L08: movsd 0 * SIZE(YY), %xmm0 addq INCY, YY movsd %xmm0, 0 * SIZE(XX) addq $1 * SIZE, XX decq %rax jg .L08 ALIGN_3 .L10: xorq IS, IS # is = 0 cmpq $4, N jl .L20 ALIGN_3 .L11: movq A, A1 leaq (A, LDA, 2), A2 leaq 4 * SIZE(A, LDA, 4), A leaq (NEW_X, IS, SIZE), XX leaq 4 * SIZE(NEW_Y, IS, SIZE), YY movapd 0 * SIZE(XX), atemp2 movapd 2 * SIZE(XX), atemp4 movsd 0 * SIZE(A1), xsum1 movhpd 1 * SIZE(A1), xsum1 mulpd atemp2, xsum1 movsd 1 * SIZE(A1), xsum2 movhpd 1 * SIZE(A1, LDA, 1), xsum2 mulpd atemp2, xsum2 movsd 2 * SIZE(A1), xsum3 movhpd 2 * SIZE(A1, LDA, 1), xsum3 mulpd atemp2, xsum3 movsd 3 * SIZE(A1), xsum4 movhpd 3 * SIZE(A1, LDA, 1), xsum4 mulpd atemp2, xsum4 movsd 2 * SIZE(A1), a1 movhpd 3 * SIZE(A1), a1 mulpd atemp4, a1 addpd a1, xsum1 movsd 2 * SIZE(A1, LDA, 1), a1 movhpd 3 * SIZE(A1, LDA, 1), a1 mulpd atemp4, a1 addpd a1, xsum2 movsd 2 * SIZE(A2), a1 movhpd 3 * SIZE(A2), a1 mulpd atemp4, a1 addpd a1, xsum3 movsd 3 * SIZE(A2), a1 movhpd 3 * SIZE(A2, LDA, 1), a1 mulpd atemp4, a1 addpd a1, xsum4 movapd 4 * SIZE(XX), xtemp1 movapd 6 * SIZE(XX), xtemp2 movsd 4 * SIZE(A1), a1 movhpd 5 * SIZE(A1), a1 movsd 6 * SIZE(A1), a2 movhpd 7 * SIZE(A1), a2 movsd 4 * SIZE(A1, LDA, 1), a3 movhpd 5 * SIZE(A1, LDA, 1), a3 movsd 0 * SIZE(YY), yy1 movhpd 1 * SIZE(YY), yy1 movsd 2 * SIZE(YY), yy2 movhpd 3 * SIZE(YY), yy2 #ifndef HAVE_SSE3 movapd atemp2, atemp1 unpcklpd atemp1, atemp1 unpckhpd atemp2, atemp2 movapd atemp4, atemp3 unpcklpd atemp3, atemp3 unpckhpd atemp4, atemp4 #else movddup atemp2, atemp1 unpckhpd atemp2, atemp2 movddup atemp4, atemp3 unpckhpd atemp4, atemp4 #endif addq $4 * SIZE, XX addq $4 * SIZE, A1 addq $4 * SIZE, A2 movq M, I subq IS, I subq $4, I sarq $3, I jle .L15 ALIGN_3 .L12: movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy1 movsd 2 * SIZE(A1, LDA, 1), a1 movhpd 3 * SIZE(A1, LDA, 1), a1 PREFETCH PREFETCHSIZE(A1) movapd xtemp2, xt1 mulpd a2, xt1 mulpd atemp1, a2 addpd xt1, xsum1 addpd a2, yy2 movsd 0 * SIZE(A2), a2 movhpd 1 * SIZE(A2), a2 movapd xtemp1, xt1 mulpd a3, xt1 mulpd atemp2, a3 addpd xt1, xsum2 addpd a3, yy1 movsd 2 * SIZE(A2), a3 movhpd 3 * SIZE(A2), a3 #if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) 
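/* descriptive note: prefetch the alpha-scaled x work buffer (XX); the #if guard above leaves this prefetch out on CORE2/PENRYN/DUNNINGTON builds */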
PREFETCH PREFETCHSIZE(XX) #endif movapd xtemp2, xt1 mulpd a1, xt1 mulpd atemp2, a1 addpd xt1, xsum2 addpd a1, yy2 movsd 0 * SIZE(A2, LDA, 1), a1 movhpd 1 * SIZE(A2, LDA, 1), a1 movapd xtemp1, xt1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum3 addpd a2, yy1 movsd 2 * SIZE(A2, LDA, 1), a2 movhpd 3 * SIZE(A2, LDA, 1), a2 PREFETCH PREFETCHSIZE(A1, LDA, 1) movapd xtemp2, xt1 mulpd a3, xt1 mulpd atemp3, a3 addpd xt1, xsum3 addpd a3, yy2 movsd 4 * SIZE(A1), a3 movhpd 5 * SIZE(A1), a3 movapd xtemp1, xt1 movapd 4 * SIZE(XX), xtemp1 mulpd a1, xt1 mulpd atemp4, a1 addpd xt1, xsum4 addpd a1, yy1 movsd 6 * SIZE(A1), a1 movhpd 7 * SIZE(A1), a1 movapd xtemp2, xt1 movapd 6 * SIZE(XX), xtemp2 mulpd a2, xt1 mulpd atemp4, a2 addpd xt1, xsum4 addpd a2, yy2 movsd 4 * SIZE(A1, LDA, 1), a2 movhpd 5 * SIZE(A1, LDA, 1), a2 movsd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhpd 5 * SIZE(YY), yy1 movsd yy2, 2 * SIZE(YY) movhpd yy2, 3 * SIZE(YY) movsd 6 * SIZE(YY), yy2 movhpd 7 * SIZE(YY), yy2 movapd xtemp1, xt1 mulpd a3, xt1 mulpd atemp1, a3 addpd xt1, xsum1 addpd a3, yy1 movsd 6 * SIZE(A1, LDA, 1), a3 movhpd 7 * SIZE(A1, LDA, 1), a3 PREFETCH PREFETCHSIZE(A2) movapd xtemp2, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy2 movsd 4 * SIZE(A2), a1 movhpd 5 * SIZE(A2), a1 movapd xtemp1, xt1 mulpd a2, xt1 mulpd atemp2, a2 addpd xt1, xsum2 addpd a2, yy1 movsd 6 * SIZE(A2), a2 movhpd 7 * SIZE(A2), a2 #if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) PREFETCHW PREFETCHSIZE(YY) #endif movapd xtemp2, xt1 mulpd a3, xt1 mulpd atemp2, a3 addpd xt1, xsum2 addpd a3, yy2 movsd 4 * SIZE(A2, LDA, 1), a3 movhpd 5 * SIZE(A2, LDA, 1), a3 movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp3, a1 addpd xt1, xsum3 addpd a1, yy1 movsd 6 * SIZE(A2, LDA, 1), a1 movhpd 7 * SIZE(A2, LDA, 1), a1 PREFETCH PREFETCHSIZE(A2, LDA, 1) movapd xtemp2, xt1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum3 addpd a2, yy2 movsd 10 * SIZE(A1), a2 movhpd 11 * SIZE(A1), a2 movapd xtemp1, xt1 movapd 8 * SIZE(XX), xtemp1 mulpd a3, xt1 mulpd atemp4, a3 addpd xt1, xsum4 addpd a3, yy1 movsd 8 * SIZE(A1, LDA, 1), a3 movhpd 9 * SIZE(A1, LDA, 1), a3 movapd xtemp2, xt1 movapd 10 * SIZE(XX), xtemp2 mulpd a1, xt1 mulpd atemp4, a1 addpd xt1, xsum4 addpd a1, yy2 movsd 8 * SIZE(A1), a1 movhpd 9 * SIZE(A1), a1 movsd yy1, 4 * SIZE(YY) movhpd yy1, 5 * SIZE(YY) movsd 8 * SIZE(YY), yy1 movhpd 9 * SIZE(YY), yy1 movsd yy2, 6 * SIZE(YY) movhpd yy2, 7 * SIZE(YY) movsd 10 * SIZE(YY), yy2 movhpd 11 * SIZE(YY), yy2 addq $8 * SIZE, XX addq $8 * SIZE, YY addq $8 * SIZE, A1 addq $8 * SIZE, A2 decq I jg .L12 ALIGN_3 .L15: movq M, I subq IS, I subq $4, I test $4, I jle .L17 movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy1 movsd 2 * SIZE(A1, LDA, 1), a1 movhpd 3 * SIZE(A1, LDA, 1), a1 movapd xtemp2, xt1 mulpd a2, xt1 mulpd atemp1, a2 addpd xt1, xsum1 addpd a2, yy2 movsd 0 * SIZE(A2), a2 movhpd 1 * SIZE(A2), a2 movapd xtemp1, xt1 mulpd a3, xt1 mulpd atemp2, a3 addpd xt1, xsum2 addpd a3, yy1 movsd 2 * SIZE(A2), a3 movhpd 3 * SIZE(A2), a3 movapd xtemp2, xt1 mulpd a1, xt1 mulpd atemp2, a1 addpd xt1, xsum2 addpd a1, yy2 movsd 0 * SIZE(A2, LDA, 1), a1 movhpd 1 * SIZE(A2, LDA, 1), a1 movapd xtemp1, xt1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum3 addpd a2, yy1 movsd 2 * SIZE(A2, LDA, 1), a2 movhpd 3 * SIZE(A2, LDA, 1), a2 movapd xtemp2, xt1 mulpd a3, xt1 mulpd atemp3, a3 addpd xt1, xsum3 addpd a3, yy2 movsd 4 * SIZE(A1, LDA, 1), a3 movhpd 5 * SIZE(A1, LDA, 1), a3 movapd xtemp1, xt1 movapd 4 * SIZE(XX), xtemp1 mulpd 
a1, xt1 mulpd atemp4, a1 addpd xt1, xsum4 addpd a1, yy1 movsd 4 * SIZE(A1), a1 movhpd 5 * SIZE(A1), a1 movapd xtemp2, xt1 movapd 6 * SIZE(XX), xtemp2 mulpd a2, xt1 mulpd atemp4, a2 addpd xt1, xsum4 addpd a2, yy2 movsd 6 * SIZE(A1), a2 movhpd 7 * SIZE(A1), a2 movsd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhpd 5 * SIZE(YY), yy1 movsd yy2, 2 * SIZE(YY) movhpd yy2, 3 * SIZE(YY) movsd 6 * SIZE(YY), yy2 movhpd 7 * SIZE(YY), yy2 addq $4 * SIZE, XX addq $4 * SIZE, YY addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L17: testq $2, M jle .L18 movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy1 movsd 0 * SIZE(A1, LDA, 1), a1 movhpd 1 * SIZE(A1, LDA, 1), a1 movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp2, a1 addpd xt1, xsum2 addpd a1, yy1 movsd 0 * SIZE(A2), a1 movhpd 1 * SIZE(A2), a1 movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp3, a1 addpd xt1, xsum3 addpd a1, yy1 movsd 0 * SIZE(A2, LDA, 1), a1 movhpd 1 * SIZE(A2, LDA, 1), a1 movapd xtemp1, xt1 movapd 2 * SIZE(XX), xtemp1 mulpd a1, xt1 mulpd atemp4, a1 addpd xt1, xsum4 addpd a1, yy1 movsd 2 * SIZE(A1), a1 movsd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) movsd 2 * SIZE(YY), yy1 addq $2 * SIZE, XX addq $2 * SIZE, YY addq $2 * SIZE, A1 addq $2 * SIZE, A2 ALIGN_3 .L18: testq $1, M jle .L19 movapd xtemp1, xt1 mulsd a1, xt1 mulsd atemp1, a1 addsd xt1, xsum1 addpd a1, yy1 movsd 0 * SIZE(A1, LDA, 1), a1 movapd xtemp1, xt1 mulsd a1, xt1 mulsd atemp2, a1 addsd xt1, xsum2 addsd a1, yy1 movsd 0 * SIZE(A2), a1 movapd xtemp1, xt1 mulsd a1, xt1 mulsd atemp3, a1 addsd xt1, xsum3 addsd a1, yy1 movsd 0 * SIZE(A2, LDA, 1), a1 movapd xtemp1, xt1 mulsd a1, xt1 mulsd atemp4, a1 addsd xt1, xsum4 addsd a1, yy1 movsd yy1, 0 * SIZE(YY) ALIGN_3 .L19: #ifndef HAVE_SSE3 movapd xsum1, atemp1 movapd xsum3, atemp3 unpcklpd xsum2, xsum1 unpcklpd xsum4, xsum3 unpckhpd xsum2, atemp1 unpckhpd xsum4, atemp3 addpd atemp1, xsum1 addpd atemp3, xsum3 #else haddpd xsum2, xsum1 haddpd xsum4, xsum3 #endif movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1 movhpd 1 * SIZE(NEW_Y, IS, SIZE), yy1 movsd 2 * SIZE(NEW_Y, IS, SIZE), yy2 movhpd 3 * SIZE(NEW_Y, IS, SIZE), yy2 addpd xsum1, yy1 addpd xsum3, yy2 movsd yy1, 0 * SIZE(NEW_Y, IS, SIZE) movhpd yy1, 1 * SIZE(NEW_Y, IS, SIZE) movsd yy2, 2 * SIZE(NEW_Y, IS, SIZE) movhpd yy2, 3 * SIZE(NEW_Y, IS, SIZE) addq $4, IS movq IS, I addq $4, I cmpq N, I jle .L11 ALIGN_3 .L20: testq $2, N jle .L30 movq A, A1 leaq 2 * SIZE(A, LDA, 2), A movapd 0 * SIZE(NEW_X, IS, SIZE), atemp2 movsd 0 * SIZE(A1), xsum1 movhpd 1 * SIZE(A1), xsum1 mulpd atemp2, xsum1 movsd 1 * SIZE(A1), xsum2 movhpd 1 * SIZE(A1, LDA, 1), xsum2 mulpd atemp2, xsum2 #ifndef HAVE_SSE3 movapd atemp2, atemp1 unpcklpd atemp1, atemp1 #else movddup atemp2, atemp1 #endif unpckhpd atemp2, atemp2 testq $1, M jle .L29 movsd 2 * SIZE(A1), a1 movsd 2 * SIZE(A1, LDA, 1), a2 movsd 2 * SIZE(NEW_X, IS, SIZE), xtemp1 movsd 2 * SIZE(NEW_Y, IS, SIZE), yy1 movapd xtemp1, xt1 mulsd a1, xt1 mulsd atemp1, a1 addsd xt1, xsum1 addpd a1, yy1 movapd xtemp1, xt1 mulsd a2, xt1 mulsd atemp2, a2 addsd xt1, xsum2 addsd a2, yy1 movsd yy1, 2 * SIZE(NEW_Y, IS, SIZE) ALIGN_3 .L29: #ifndef HAVE_SSE3 movapd xsum1, atemp1 unpcklpd xsum2, xsum1 unpckhpd xsum2, atemp1 addpd atemp1, xsum1 #else haddpd xsum2, xsum1 #endif movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1 movhpd 1 * SIZE(NEW_Y, IS, SIZE), yy1 addpd xsum1, yy1 movsd yy1, 0 * SIZE(NEW_Y, IS, SIZE) movhpd yy1, 1 * SIZE(NEW_Y, IS, SIZE) addq $2, IS ALIGN_3 .L30: testq $1, N jle .L990 movsd 0 * SIZE(A), xsum1 movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 
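/* odd-N tail: a single 1x1 diagonal block remains, so y[is] += a(0,0) * x[is]; x in the work buffer was already scaled by alpha during the copy-in loop */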
movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1 mulsd atemp1, xsum1 addsd xsum1, yy1 movsd yy1, 0 * SIZE(NEW_Y, IS, SIZE) ALIGN_3 .L990: cmpq $SIZE, INCY je .L999 movq M, %rax sarq $3, %rax jle .L997 ALIGN_3 .L996: movapd 0 * SIZE(NEW_Y), %xmm0 movapd 2 * SIZE(NEW_Y), %xmm1 movapd 4 * SIZE(NEW_Y), %xmm2 movapd 6 * SIZE(NEW_Y), %xmm3 movsd %xmm0, 0 * SIZE(Y) addq INCY, Y movhpd %xmm0, 0 * SIZE(Y) addq INCY, Y movsd %xmm1, 0 * SIZE(Y) addq INCY, Y movhpd %xmm1, 0 * SIZE(Y) addq INCY, Y movsd %xmm2, 0 * SIZE(Y) addq INCY, Y movhpd %xmm2, 0 * SIZE(Y) addq INCY, Y movsd %xmm3, 0 * SIZE(Y) addq INCY, Y movhpd %xmm3, 0 * SIZE(Y) addq INCY, Y addq $8 * SIZE, NEW_Y decq %rax jg .L996 ALIGN_3 .L997: movq M, %rax andq $7, %rax jle .L999 ALIGN_3 .L998: movsd 0 * SIZE(NEW_Y), %xmm0 movsd %xmm0, 0 * SIZE(Y) addq INCY, Y addq $1 * SIZE, NEW_Y decq %rax jg .L998 ALIGN_3 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/symv_U_sse.S000066400000000000000000000475731313527062700176620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef ATOM #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) #endif #ifdef CORE2 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) #endif #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) #endif #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) #endif #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 20) #endif #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 8) #define movsd movlps #endif #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 20) #endif #ifndef WINDOWS_ABI #define STACKSIZE 80 #define OLD_Y 8 + STACKSIZE(%rsp) #define OLD_INCY 16 + STACKSIZE(%rsp) #define OLD_BUFFER 24 + STACKSIZE(%rsp) #define M ARG1 #define IS ARG2 #define A ARG3 #define LDA ARG4 #define X ARG5 #define INCX ARG6 #else #define STACKSIZE 256 #define OLD_LDA 40 + STACKSIZE(%rsp) #define OLD_X 48 + STACKSIZE(%rsp) #define OLD_INCX 56 + STACKSIZE(%rsp) #define OLD_Y 64 + STACKSIZE(%rsp) #define OLD_INCY 72 + STACKSIZE(%rsp) #define OLD_BUFFER 80 + STACKSIZE(%rsp) #define M ARG1 #define IS ARG2 #define A ARG4 #define LDA ARG3 #define X %rdi #define INCX %rsi #endif #define Y %r10 #define INCY %r11 #define BUFFER %r12 #define TEMP %rax #define I %rax #define A1 %rbx #define A2 %rbp #define XX %r13 #define YY %r14 #define NEW_X BUFFER #define NEW_Y X #define ALPHA %xmm0 #define atemp1 %xmm0 #define atemp2 %xmm1 #define atemp3 %xmm2 #define atemp4 %xmm3 #define xsum1 %xmm4 #define xsum2 %xmm5 #define xsum3 %xmm6 #define xsum4 %xmm7 #define xtemp1 %xmm8 #define xtemp2 %xmm9 #define yy1 %xmm10 #define xt1 %xmm11 #define a1 %xmm12 #define a2 %xmm13 #define a3 %xmm14 #define a4 %xmm15 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq OLD_LDA, LDA movq OLD_X, X movq OLD_INCX, INCX movaps %xmm2, %xmm0 #endif movq OLD_Y, Y movq OLD_INCY, INCY movq OLD_BUFFER, BUFFER leaq (,INCX, SIZE), INCX leaq (,INCY, SIZE), INCY leaq (,LDA, SIZE), LDA testq M, M jle .L999 negq IS addq M, IS movq IS, TEMP imulq LDA, TEMP addq TEMP, A shufps $0, ALPHA, ALPHA movq BUFFER, XX movq M, %rax sarq $3, %rax jle .L02 ALIGN_3 .L01: movss 0 * SIZE(X), %xmm1 addq INCX, X movss 0 * SIZE(X), %xmm2 addq INCX, X movss 0 * SIZE(X), %xmm3 addq INCX, X movss 0 * SIZE(X), %xmm4 addq INCX, X movss 0 * SIZE(X), %xmm5 addq INCX, X movss 0 * SIZE(X), %xmm6 addq INCX, X movss 0 * SIZE(X), %xmm7 addq INCX, X movss 0 * SIZE(X), %xmm8 addq 
INCX, X mulss ALPHA, %xmm1 mulss ALPHA, %xmm2 mulss ALPHA, %xmm3 mulss ALPHA, %xmm4 mulss ALPHA, %xmm5 mulss ALPHA, %xmm6 mulss ALPHA, %xmm7 mulss ALPHA, %xmm8 movss %xmm1, 0 * SIZE(XX) movss %xmm2, 1 * SIZE(XX) movss %xmm3, 2 * SIZE(XX) movss %xmm4, 3 * SIZE(XX) movss %xmm5, 4 * SIZE(XX) movss %xmm6, 5 * SIZE(XX) movss %xmm7, 6 * SIZE(XX) movss %xmm8, 7 * SIZE(XX) addq $8 * SIZE, XX decq %rax jg .L01 ALIGN_3 .L02: movq M, %rax andq $7, %rax jle .L05 ALIGN_3 .L03: movss 0 * SIZE(X), %xmm1 addq INCX, X mulss ALPHA, %xmm1 movss %xmm1, 0 * SIZE(XX) addq $1 * SIZE, XX decq %rax jg .L03 ALIGN_3 .L05: /* now we don't need original X */ movq Y, NEW_Y addq $512, XX andq $-512, XX cmpq $SIZE, INCY je .L10 movq Y, YY movq XX, NEW_Y movq M, %rax sarq $3, %rax jle .L07 ALIGN_3 .L06: movss 0 * SIZE(YY), %xmm0 addq INCY, YY movss 0 * SIZE(YY), %xmm1 addq INCY, YY movss 0 * SIZE(YY), %xmm2 addq INCY, YY movss 0 * SIZE(YY), %xmm3 addq INCY, YY movss 0 * SIZE(YY), %xmm4 addq INCY, YY movss 0 * SIZE(YY), %xmm5 addq INCY, YY movss 0 * SIZE(YY), %xmm6 addq INCY, YY movss 0 * SIZE(YY), %xmm7 addq INCY, YY movss %xmm0, 0 * SIZE(XX) movss %xmm1, 1 * SIZE(XX) movss %xmm2, 2 * SIZE(XX) movss %xmm3, 3 * SIZE(XX) movss %xmm4, 4 * SIZE(XX) movss %xmm5, 5 * SIZE(XX) movss %xmm6, 6 * SIZE(XX) movss %xmm7, 7 * SIZE(XX) addq $8 * SIZE, XX decq %rax jg .L06 ALIGN_3 .L07: movq M, %rax andq $7, %rax jle .L10 ALIGN_3 .L08: movss 0 * SIZE(YY), %xmm0 addq INCY, YY movss %xmm0, 0 * SIZE(XX) addq $1 * SIZE, XX decq %rax jg .L08 ALIGN_3 .L10: movq IS, I addq $4, I cmpq M, I jg .L20 ALIGN_3 .L11: movq A, A1 leaq (A, LDA, 2), A2 leaq (A, LDA, 4), A movaps 0 * SIZE(NEW_X, IS, SIZE), atemp4 pshufd $0x00, atemp4, atemp1 pshufd $0x55, atemp4, atemp2 pshufd $0xaa, atemp4, atemp3 pshufd $0xff, atemp4, atemp4 pxor xsum1, xsum1 pxor xsum2, xsum2 pxor xsum3, xsum3 pxor xsum4, xsum4 movaps 0 * SIZE(NEW_X), xtemp1 movaps 4 * SIZE(NEW_X), xtemp2 movsd 0 * SIZE(A1), a1 movhps 2 * SIZE(A1), a1 movsd 0 * SIZE(A1, LDA, 1), a2 movhps 2 * SIZE(A1, LDA, 1), a2 movsd 0 * SIZE(A2), a3 movhps 2 * SIZE(A2), a3 movsd 0 * SIZE(A2, LDA, 1), a4 movhps 2 * SIZE(A2, LDA, 1), a4 movsd 0 * SIZE(NEW_Y), yy1 movhps 2 * SIZE(NEW_Y), yy1 movq NEW_X, XX movq NEW_Y, YY movq IS, I sarq $4, I jle .L14 ALIGN_3 .L12: movaps xtemp1, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movsd 4 * SIZE(A1), a1 movhps 6 * SIZE(A1), a1 PREFETCH PREFETCHSIZE(A1) movaps xtemp1, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movsd 4 * SIZE(A1, LDA, 1), a2 movhps 6 * SIZE(A1, LDA, 1), a2 movaps xtemp1, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movsd 4 * SIZE(A2), a3 movhps 6 * SIZE(A2), a3 #if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) PREFETCH PREFETCHSIZE(XX) #endif movaps xtemp1, xt1 movaps 8 * SIZE(XX), xtemp1 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movsd 4 * SIZE(A2, LDA, 1), a4 movhps 6 * SIZE(A2, LDA, 1), a4 movlps yy1, 0 * SIZE(YY) movhps yy1, 2 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhps 6 * SIZE(YY), yy1 movaps xtemp2, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movsd 8 * SIZE(A1), a1 movhps 10 * SIZE(A1), a1 PREFETCH PREFETCHSIZE(A1, LDA, 1) movaps xtemp2, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movsd 8 * SIZE(A1, LDA, 1), a2 movhps 10 * SIZE(A1, LDA, 1), a2 movaps xtemp2, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movsd 8 * SIZE(A2), a3 movhps 10 * SIZE(A2), a3 movaps xtemp2, xt1 movaps 12 * SIZE(XX), xtemp2 
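/* pattern used throughout this unrolled loop: xt1 (a copy of x from the buffer) times a column of A
   feeds the accumulators xsum1..xsum4, which are folded into y[is..is+3] after the loop, while
   atemp1..atemp4 (broadcasts of x[is..is+3]) times the same column updates the running y values in yy1 */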
mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movsd 8 * SIZE(A2, LDA, 1), a4 movhps 10 * SIZE(A2, LDA, 1), a4 movlps yy1, 4 * SIZE(YY) movhps yy1, 6 * SIZE(YY) movsd 8 * SIZE(YY), yy1 movhps 10 * SIZE(YY), yy1 movaps xtemp1, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movsd 12 * SIZE(A1), a1 movhps 14 * SIZE(A1), a1 PREFETCH PREFETCHSIZE(A2) movaps xtemp1, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movsd 12 * SIZE(A1, LDA, 1), a2 movhps 14 * SIZE(A1, LDA, 1), a2 movaps xtemp1, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movsd 12 * SIZE(A2), a3 movhps 14 * SIZE(A2), a3 #if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) PREFETCHW PREFETCHSIZE(YY) #endif movaps xtemp1, xt1 movaps 16 * SIZE(XX), xtemp1 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movsd 12 * SIZE(A2, LDA, 1), a4 movhps 14 * SIZE(A2, LDA, 1), a4 movlps yy1, 8 * SIZE(YY) movhps yy1, 10 * SIZE(YY) movsd 12 * SIZE(YY), yy1 movhps 14 * SIZE(YY), yy1 movaps xtemp2, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movsd 16 * SIZE(A1), a1 movhps 18 * SIZE(A1), a1 PREFETCH PREFETCHSIZE(A2, LDA, 1) movaps xtemp2, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movsd 16 * SIZE(A1, LDA, 1), a2 movhps 18 * SIZE(A1, LDA, 1), a2 movaps xtemp2, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movsd 16 * SIZE(A2), a3 movhps 18 * SIZE(A2), a3 movaps xtemp2, xt1 movaps 20 * SIZE(XX), xtemp2 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movsd 16 * SIZE(A2, LDA, 1), a4 movhps 18 * SIZE(A2, LDA, 1), a4 movlps yy1, 12 * SIZE(YY) movhps yy1, 14 * SIZE(YY) movsd 16 * SIZE(YY), yy1 movhps 18 * SIZE(YY), yy1 addq $16 * SIZE, XX addq $16 * SIZE, YY addq $16 * SIZE, A1 addq $16 * SIZE, A2 decq I jg .L12 ALIGN_3 .L14: testq $8, IS jle .L15 movaps xtemp1, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movsd 4 * SIZE(A1), a1 movhps 6 * SIZE(A1), a1 movaps xtemp1, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movsd 4 * SIZE(A1, LDA, 1), a2 movhps 6 * SIZE(A1, LDA, 1), a2 movaps xtemp1, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movsd 4 * SIZE(A2), a3 movhps 6 * SIZE(A2), a3 movaps xtemp1, xt1 movaps 8 * SIZE(XX), xtemp1 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movsd 4 * SIZE(A2, LDA, 1), a4 movhps 6 * SIZE(A2, LDA, 1), a4 movlps yy1, 0 * SIZE(YY) movhps yy1, 2 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhps 6 * SIZE(YY), yy1 movaps xtemp2, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movsd 8 * SIZE(A1), a1 movhps 10 * SIZE(A1), a1 movaps xtemp2, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movsd 8 * SIZE(A1, LDA, 1), a2 movhps 10 * SIZE(A1, LDA, 1), a2 movaps xtemp2, xt1 mulps a3, xt1 mulps atemp3, a3 addps xt1, xsum3 addps a3, yy1 movsd 8 * SIZE(A2), a3 movhps 10 * SIZE(A2), a3 movaps xtemp2, xt1 movaps 12 * SIZE(XX), xtemp2 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movsd 8 * SIZE(A2, LDA, 1), a4 movhps 10 * SIZE(A2, LDA, 1), a4 movlps yy1, 4 * SIZE(YY) movhps yy1, 6 * SIZE(YY) movsd 8 * SIZE(YY), yy1 movhps 10 * SIZE(YY), yy1 addq $8 * SIZE, XX addq $8 * SIZE, YY addq $8 * SIZE, A1 addq $8 * SIZE, A2 ALIGN_3 .L15: testq $4, IS jle .L18 movaps xtemp1, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movaps xtemp1, xt1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movaps xtemp1, xt1 mulps a3, xt1 mulps atemp3, a3 
addps xt1, xsum3 addps a3, yy1 movaps xtemp1, xt1 mulps a4, xt1 mulps atemp4, a4 addps xt1, xsum4 addps a4, yy1 movlps yy1, 0 * SIZE(YY) movhps yy1, 2 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhps 6 * SIZE(YY), yy1 addq $4 * SIZE, XX addq $4 * SIZE, YY addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L18: movaps 0 * SIZE(NEW_X, IS, SIZE), atemp1 movss 0 * SIZE(A1), a1 movss 0 * SIZE(A1, LDA, 1), a2 movss 0 * SIZE(A2), a3 movss 0 * SIZE(A2, LDA, 1), a4 unpcklps a3, a1 unpcklps a4, a2 unpcklps a2, a1 mulps atemp1, a1 addps a1, xsum1 movsd 0 * SIZE(A1, LDA, 1), a1 movss 1 * SIZE(A2), a2 movhps 1 * SIZE(A2, LDA, 1), a2 shufps $0x84, a2, a1 mulps atemp1, a1 addps a1, xsum2 movsd 0 * SIZE(A2), a1 movss 2 * SIZE(A2), a2 movhps 2 * SIZE(A2, LDA, 1), a2 shufps $0x84, a2, a1 mulps atemp1, a1 addps a1, xsum3 movsd 0 * SIZE(A2, LDA, 1), a1 movhps 2 * SIZE(A2, LDA, 1), a1 mulps atemp1, a1 addps a1, xsum4 #ifndef HAVE_SSE3 movaps xsum1, xtemp1 unpcklps xsum3, xsum1 unpckhps xsum3, xtemp1 movaps xsum2, xtemp2 unpcklps xsum4, xsum2 unpckhps xsum4, xtemp2 movaps xsum1, xsum3 unpcklps xsum2, xsum1 unpckhps xsum2, xsum3 movaps xtemp1, xsum4 unpcklps xtemp2, xtemp1 unpckhps xtemp2, xsum4 addps xsum3, xsum1 addps xtemp1, xsum4 addps xsum4, xsum1 #else haddps xsum2, xsum1 haddps xsum4, xsum3 haddps xsum3, xsum1 #endif addps xsum1, yy1 movlps yy1, 0 * SIZE(YY) movhps yy1, 2 * SIZE(YY) addq $4, IS movq IS, I addq $4, I cmpq M, I jle .L11 ALIGN_3 .L20: testq $2, M jle .L30 movq A, A1 leaq (A, LDA, 2), A movsd 0 * SIZE(NEW_X, IS, SIZE), atemp4 pshufd $0x00, atemp4, atemp1 pshufd $0x55, atemp4, atemp2 pxor xsum1, xsum1 pxor xsum2, xsum2 movaps 0 * SIZE(NEW_X), xtemp1 movsd 0 * SIZE(A1), a1 movhps 2 * SIZE(A1), a1 movsd 0 * SIZE(A1, LDA, 1), a2 movhps 2 * SIZE(A1, LDA, 1), a2 movsd 0 * SIZE(NEW_Y), yy1 movhps 2 * SIZE(NEW_Y), yy1 movq NEW_X, XX movq NEW_Y, YY movq IS, I sarq $2, I jle .L28 ALIGN_3 .L22: movaps xtemp1, xt1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movsd 4 * SIZE(A1), a1 movhps 6 * SIZE(A1), a1 movaps xtemp1, xt1 movaps 4 * SIZE(XX), xtemp1 mulps a2, xt1 mulps atemp2, a2 addps xt1, xsum2 addps a2, yy1 movsd 4 * SIZE(A1, LDA, 1), a2 movhps 6 * SIZE(A1, LDA, 1), a2 movlps yy1, 0 * SIZE(YY) movhps yy1, 2 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhps 6 * SIZE(YY), yy1 addq $4 * SIZE, XX addq $4 * SIZE, YY addq $4 * SIZE, A1 decq I jg .L22 ALIGN_3 .L28: movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 movss 0 * SIZE(A1), a1 movss 0 * SIZE(A1, LDA, 1), a2 unpcklps a2, a1 mulps atemp1, a1 addps a1, xsum1 movsd 0 * SIZE(A1, LDA, 1), a1 mulps atemp1, a1 addps a1, xsum2 #ifndef HAVE_SSE3 movhlps xsum1, xsum3 movhlps xsum2, xsum4 addps xsum3, xsum1 addps xsum4, xsum2 unpcklps xsum2, xsum1 movhlps xsum1, xsum2 addps xsum2, xsum1 #else haddps xsum2, xsum1 haddps xsum1, xsum1 #endif addps xsum1, yy1 movlps yy1, 0 * SIZE(YY) addq $2, IS ALIGN_3 .L30: testq $1, M jle .L990 movq A, A1 movss 0 * SIZE(NEW_X, IS, SIZE), atemp1 pshufd $0x00, atemp1, atemp1 pxor xsum1, xsum1 pxor xsum2, xsum2 movss 0 * SIZE(NEW_Y), yy1 movss 0 * SIZE(NEW_X), xtemp1 movss 1 * SIZE(NEW_X), xtemp2 movss 0 * SIZE(A1), a1 movss 1 * SIZE(A1), a2 movq NEW_X, XX movq NEW_Y, YY movq IS, I sarq $1, I jle .L38 ALIGN_3 .L32: movaps xtemp1, xt1 movss 2 * SIZE(XX), xtemp1 mulps a1, xt1 mulps atemp1, a1 addps xt1, xsum1 addps a1, yy1 movss 2 * SIZE(A1), a1 movss yy1, 0 * SIZE(YY) movss 1 * SIZE(YY), yy1 movaps xtemp2, xt1 movss 3 * SIZE(XX), xtemp2 mulps a2, xt1 mulps atemp1, a2 addps xt1, xsum1 addps a2, yy1 movss 3 * SIZE(A1), a2 movss yy1, 1 * 
SIZE(YY) movss 2 * SIZE(YY), yy1 addq $2 * SIZE, XX addq $2 * SIZE, YY addq $2 * SIZE, A1 decq I jg .L32 ALIGN_3 .L38: movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 movss 0 * SIZE(A1), a1 mulss atemp1, a1 addss a1, xsum1 #ifndef HAVE_SSE3 movhlps xsum1, xsum3 movhlps xsum2, xsum4 addps xsum3, xsum1 addps xsum4, xsum2 unpcklps xsum2, xsum1 movhlps xsum1, xsum2 addps xsum2, xsum1 #else addss xsum2, xsum1 #endif addss xsum1, yy1 movss yy1, 0 * SIZE(YY) addq $2, IS ALIGN_3 .L990: cmpq $SIZE, INCY je .L999 movq M, %rax sarq $3, %rax jle .L997 ALIGN_3 .L996: movss 0 * SIZE(NEW_Y), %xmm0 movss 1 * SIZE(NEW_Y), %xmm1 movss 2 * SIZE(NEW_Y), %xmm2 movss 3 * SIZE(NEW_Y), %xmm3 movss 4 * SIZE(NEW_Y), %xmm4 movss 5 * SIZE(NEW_Y), %xmm5 movss 6 * SIZE(NEW_Y), %xmm6 movss 7 * SIZE(NEW_Y), %xmm7 movss %xmm0, 0 * SIZE(Y) addq INCY, Y movss %xmm1, 0 * SIZE(Y) addq INCY, Y movss %xmm2, 0 * SIZE(Y) addq INCY, Y movss %xmm3, 0 * SIZE(Y) addq INCY, Y movss %xmm4, 0 * SIZE(Y) addq INCY, Y movss %xmm5, 0 * SIZE(Y) addq INCY, Y movss %xmm6, 0 * SIZE(Y) addq INCY, Y movss %xmm7, 0 * SIZE(Y) addq INCY, Y addq $8 * SIZE, NEW_Y decq %rax jg .L996 ALIGN_3 .L997: movq M, %rax andq $7, %rax jle .L999 ALIGN_3 .L998: movss 0 * SIZE(NEW_Y), %xmm0 movss %xmm0, 0 * SIZE(Y) addq INCY, Y addq $1 * SIZE, NEW_Y decq %rax jg .L998 ALIGN_3 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/symv_U_sse2.S000066400000000000000000000454371313527062700177410ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef ATOM #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) #endif #ifdef CORE2 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) #endif #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) #endif #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) #endif #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 20) #endif #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 8) #define movsd movlpd #endif #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 24) #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 20) #endif #ifndef WINDOWS_ABI #define STACKSIZE 80 #define OLD_Y 8 + STACKSIZE(%rsp) #define OLD_INCY 16 + STACKSIZE(%rsp) #define OLD_BUFFER 24 + STACKSIZE(%rsp) #define M ARG1 #define IS ARG2 #define A ARG3 #define LDA ARG4 #define X ARG5 #define INCX ARG6 #else #define STACKSIZE 256 #define OLD_LDA 40 + STACKSIZE(%rsp) #define OLD_X 48 + STACKSIZE(%rsp) #define OLD_INCX 56 + STACKSIZE(%rsp) #define OLD_Y 64 + STACKSIZE(%rsp) #define OLD_INCY 72 + STACKSIZE(%rsp) #define OLD_BUFFER 80 + STACKSIZE(%rsp) #define M ARG1 #define IS ARG2 #define A ARG4 #define LDA ARG3 #define X %rdi #define INCX %rsi #endif #define Y %r10 #define INCY %r11 #define BUFFER %r12 #define TEMP %rax #define I %rax #define A1 %rbx #define A2 %rbp #define XX %r13 #define YY %r14 #define NEW_X BUFFER #define NEW_Y X #define ALPHA %xmm0 #define xtemp1 %xmm0 #define xtemp2 %xmm1 #define yy1 %xmm2 #define yy2 %xmm3 #define atemp1 %xmm4 #define atemp2 %xmm5 #define atemp3 %xmm6 #define atemp4 %xmm7 #define xsum1 %xmm8 #define xsum2 %xmm9 #define xsum3 %xmm10 #define xsum4 %xmm11 #define a1 %xmm12 #define a2 %xmm13 #define a3 %xmm14 #define xt1 %xmm15 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq OLD_LDA, LDA movq OLD_X, X movq OLD_INCX, INCX movaps %xmm2, %xmm0 #endif movq OLD_Y, Y movq OLD_INCY, INCY movq OLD_BUFFER, BUFFER leaq (,INCX, SIZE), INCX leaq (,INCY, SIZE), INCY leaq (,LDA, SIZE), LDA testq M, M jle .L999 negq IS addq M, IS movq IS, TEMP imulq LDA, TEMP addq TEMP, A unpcklpd ALPHA, ALPHA movq BUFFER, XX movq M, %rax sarq $3, %rax jle .L02 ALIGN_3 .L01: movsd 0 * SIZE(X), %xmm1 addq INCX, X 
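/* copy-in loop: gather eight elements of x with stride INCX, scale them by alpha, and store them contiguously into the work buffer */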
movhpd 0 * SIZE(X), %xmm1 addq INCX, X movsd 0 * SIZE(X), %xmm2 addq INCX, X movhpd 0 * SIZE(X), %xmm2 addq INCX, X movsd 0 * SIZE(X), %xmm3 addq INCX, X movhpd 0 * SIZE(X), %xmm3 addq INCX, X movsd 0 * SIZE(X), %xmm4 addq INCX, X movhpd 0 * SIZE(X), %xmm4 addq INCX, X mulpd ALPHA, %xmm1 mulpd ALPHA, %xmm2 mulpd ALPHA, %xmm3 mulpd ALPHA, %xmm4 movapd %xmm1, 0 * SIZE(XX) movapd %xmm2, 2 * SIZE(XX) movapd %xmm3, 4 * SIZE(XX) movapd %xmm4, 6 * SIZE(XX) addq $8 * SIZE, XX decq %rax jg .L01 ALIGN_3 .L02: movq M, %rax andq $7, %rax jle .L05 ALIGN_3 .L03: movsd 0 * SIZE(X), %xmm1 addq INCX, X mulsd ALPHA, %xmm1 movlpd %xmm1, 0 * SIZE(XX) addq $1 * SIZE, XX decq %rax jg .L03 ALIGN_3 .L05: /* now we don't need original X */ movq Y, NEW_Y addq $512, XX andq $-512, XX cmpq $SIZE, INCY je .L10 movq Y, YY movq XX, NEW_Y movq M, %rax sarq $3, %rax jle .L07 ALIGN_3 .L06: movsd 0 * SIZE(YY), %xmm0 addq INCY, YY movhpd 0 * SIZE(YY), %xmm0 addq INCY, YY movsd 0 * SIZE(YY), %xmm1 addq INCY, YY movhpd 0 * SIZE(YY), %xmm1 addq INCY, YY movsd 0 * SIZE(YY), %xmm2 addq INCY, YY movhpd 0 * SIZE(YY), %xmm2 addq INCY, YY movsd 0 * SIZE(YY), %xmm3 addq INCY, YY movhpd 0 * SIZE(YY), %xmm3 addq INCY, YY movapd %xmm0, 0 * SIZE(XX) movapd %xmm1, 2 * SIZE(XX) movapd %xmm2, 4 * SIZE(XX) movapd %xmm3, 6 * SIZE(XX) addq $8 * SIZE, XX decq %rax jg .L06 ALIGN_3 .L07: movq M, %rax andq $7, %rax jle .L10 ALIGN_3 .L08: movsd 0 * SIZE(YY), %xmm0 addq INCY, YY movsd %xmm0, 0 * SIZE(XX) addq $1 * SIZE, XX decq %rax jg .L08 ALIGN_3 .L10: movq IS, I addq $4, I cmpq M, I jg .L20 ALIGN_3 .L11: movq A, A1 leaq (A, LDA, 2), A2 leaq (A, LDA, 4), A #ifdef HAVE_SSE3 movddup 0 * SIZE(NEW_X, IS, SIZE), atemp1 movddup 1 * SIZE(NEW_X, IS, SIZE), atemp2 movddup 2 * SIZE(NEW_X, IS, SIZE), atemp3 movddup 3 * SIZE(NEW_X, IS, SIZE), atemp4 #else movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 movhpd 0 * SIZE(NEW_X, IS, SIZE), atemp1 movsd 1 * SIZE(NEW_X, IS, SIZE), atemp2 movhpd 1 * SIZE(NEW_X, IS, SIZE), atemp2 movsd 2 * SIZE(NEW_X, IS, SIZE), atemp3 movhpd 2 * SIZE(NEW_X, IS, SIZE), atemp3 movsd 3 * SIZE(NEW_X, IS, SIZE), atemp4 movhpd 3 * SIZE(NEW_X, IS, SIZE), atemp4 #endif pxor xsum1, xsum1 pxor xsum2, xsum2 pxor xsum3, xsum3 pxor xsum4, xsum4 movapd 0 * SIZE(NEW_X), xtemp1 movapd 2 * SIZE(NEW_X), xtemp2 movsd 0 * SIZE(A1), a1 movhpd 1 * SIZE(A1), a1 movsd 2 * SIZE(A1), a2 movhpd 3 * SIZE(A1), a2 movsd 0 * SIZE(A1, LDA, 1), a3 movhpd 1 * SIZE(A1, LDA, 1), a3 movsd 0 * SIZE(NEW_Y), yy1 movhpd 1 * SIZE(NEW_Y), yy1 movsd 2 * SIZE(NEW_Y), yy2 movhpd 3 * SIZE(NEW_Y), yy2 movq NEW_X, XX movq NEW_Y, YY movq IS, I sarq $3, I jle .L15 ALIGN_3 .L12: movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy1 movsd 2 * SIZE(A1, LDA, 1), a1 movhpd 3 * SIZE(A1, LDA, 1), a1 PREFETCH PREFETCHSIZE(A1) movapd xtemp2, xt1 mulpd a2, xt1 mulpd atemp1, a2 addpd xt1, xsum1 addpd a2, yy2 movsd 0 * SIZE(A2), a2 movhpd 1 * SIZE(A2), a2 movapd xtemp1, xt1 mulpd a3, xt1 mulpd atemp2, a3 addpd xt1, xsum2 addpd a3, yy1 movsd 2 * SIZE(A2), a3 movhpd 3 * SIZE(A2), a3 #if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) PREFETCH PREFETCHSIZE(XX) #endif movapd xtemp2, xt1 mulpd a1, xt1 mulpd atemp2, a1 addpd xt1, xsum2 addpd a1, yy2 movsd 0 * SIZE(A2, LDA, 1), a1 movhpd 1 * SIZE(A2, LDA, 1), a1 movapd xtemp1, xt1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum3 addpd a2, yy1 movsd 2 * SIZE(A2, LDA, 1), a2 movhpd 3 * SIZE(A2, LDA, 1), a2 PREFETCH PREFETCHSIZE(A1, LDA, 1) movapd xtemp2, xt1 mulpd a3, xt1 mulpd atemp3, a3 addpd xt1, xsum3 addpd a3, yy2 movsd 
4 * SIZE(A1), a3 movhpd 5 * SIZE(A1), a3 movapd xtemp1, xt1 movapd 4 * SIZE(XX), xtemp1 mulpd a1, xt1 mulpd atemp4, a1 addpd xt1, xsum4 addpd a1, yy1 movsd 6 * SIZE(A1), a1 movhpd 7 * SIZE(A1), a1 movapd xtemp2, xt1 movapd 6 * SIZE(XX), xtemp2 mulpd a2, xt1 mulpd atemp4, a2 addpd xt1, xsum4 addpd a2, yy2 movsd 4 * SIZE(A1, LDA, 1), a2 movhpd 5 * SIZE(A1, LDA, 1), a2 movsd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhpd 5 * SIZE(YY), yy1 movsd yy2, 2 * SIZE(YY) movhpd yy2, 3 * SIZE(YY) movsd 6 * SIZE(YY), yy2 movhpd 7 * SIZE(YY), yy2 movapd xtemp1, xt1 mulpd a3, xt1 mulpd atemp1, a3 addpd xt1, xsum1 addpd a3, yy1 movsd 6 * SIZE(A1, LDA, 1), a3 movhpd 7 * SIZE(A1, LDA, 1), a3 PREFETCH PREFETCHSIZE(A2) movapd xtemp2, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy2 movsd 4 * SIZE(A2), a1 movhpd 5 * SIZE(A2), a1 movapd xtemp1, xt1 mulpd a2, xt1 mulpd atemp2, a2 addpd xt1, xsum2 addpd a2, yy1 movsd 6 * SIZE(A2), a2 movhpd 7 * SIZE(A2), a2 #if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) PREFETCHW PREFETCHSIZE(YY) #endif movapd xtemp2, xt1 mulpd a3, xt1 mulpd atemp2, a3 addpd xt1, xsum2 addpd a3, yy2 movsd 4 * SIZE(A2, LDA, 1), a3 movhpd 5 * SIZE(A2, LDA, 1), a3 movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp3, a1 addpd xt1, xsum3 addpd a1, yy1 movsd 6 * SIZE(A2, LDA, 1), a1 movhpd 7 * SIZE(A2, LDA, 1), a1 PREFETCH PREFETCHSIZE(A2, LDA, 1) movapd xtemp2, xt1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum3 addpd a2, yy2 movsd 10 * SIZE(A1), a2 movhpd 11 * SIZE(A1), a2 movapd xtemp1, xt1 movapd 8 * SIZE(XX), xtemp1 mulpd a3, xt1 mulpd atemp4, a3 addpd xt1, xsum4 addpd a3, yy1 movsd 8 * SIZE(A1, LDA, 1), a3 movhpd 9 * SIZE(A1, LDA, 1), a3 movapd xtemp2, xt1 movapd 10 * SIZE(XX), xtemp2 mulpd a1, xt1 mulpd atemp4, a1 addpd xt1, xsum4 addpd a1, yy2 movsd 8 * SIZE(A1), a1 movhpd 9 * SIZE(A1), a1 movsd yy1, 4 * SIZE(YY) movhpd yy1, 5 * SIZE(YY) movsd 8 * SIZE(YY), yy1 movhpd 9 * SIZE(YY), yy1 movsd yy2, 6 * SIZE(YY) movhpd yy2, 7 * SIZE(YY) movsd 10 * SIZE(YY), yy2 movhpd 11 * SIZE(YY), yy2 addq $8 * SIZE, XX addq $8 * SIZE, YY addq $8 * SIZE, A1 addq $8 * SIZE, A2 decq I jg .L12 ALIGN_3 .L15: testq $4, IS jle .L18 movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy1 movsd 2 * SIZE(A1, LDA, 1), a1 movhpd 3 * SIZE(A1, LDA, 1), a1 movapd xtemp2, xt1 mulpd a2, xt1 mulpd atemp1, a2 addpd xt1, xsum1 addpd a2, yy2 movsd 0 * SIZE(A2), a2 movhpd 1 * SIZE(A2), a2 movapd xtemp1, xt1 mulpd a3, xt1 mulpd atemp2, a3 addpd xt1, xsum2 addpd a3, yy1 movsd 2 * SIZE(A2), a3 movhpd 3 * SIZE(A2), a3 movapd xtemp2, xt1 mulpd a1, xt1 mulpd atemp2, a1 addpd xt1, xsum2 addpd a1, yy2 movsd 0 * SIZE(A2, LDA, 1), a1 movhpd 1 * SIZE(A2, LDA, 1), a1 movapd xtemp1, xt1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum3 addpd a2, yy1 movsd 2 * SIZE(A2, LDA, 1), a2 movhpd 3 * SIZE(A2, LDA, 1), a2 movapd xtemp2, xt1 mulpd a3, xt1 mulpd atemp3, a3 addpd xt1, xsum3 addpd a3, yy2 movapd xtemp1, xt1 movapd 4 * SIZE(XX), xtemp1 mulpd a1, xt1 mulpd atemp4, a1 addpd xt1, xsum4 addpd a1, yy1 movapd xtemp2, xt1 movapd 6 * SIZE(XX), xtemp2 mulpd a2, xt1 mulpd atemp4, a2 addpd xt1, xsum4 addpd a2, yy2 movsd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhpd 5 * SIZE(YY), yy1 movsd yy2, 2 * SIZE(YY) movhpd yy2, 3 * SIZE(YY) movsd 6 * SIZE(YY), yy2 movhpd 7 * SIZE(YY), yy2 addq $4 * SIZE, XX addq $4 * SIZE, YY addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L18: unpckhpd atemp2, atemp1 unpckhpd atemp4, atemp3 movsd 0 * SIZE(A1), a1 movhpd 0 * 
SIZE(A1, LDA, 1), a1 mulpd atemp1, a1 addpd a1, xsum1 movsd 0 * SIZE(A1, LDA, 1), a1 movhpd 1 * SIZE(A1, LDA, 1), a1 mulpd atemp1, a1 addpd a1, xsum2 movsd 0 * SIZE(A2), a1 movhpd 1 * SIZE(A2), a1 mulpd atemp1, a1 addpd a1, xsum3 movsd 0 * SIZE(A2, LDA, 1), a1 movhpd 1 * SIZE(A2, LDA, 1), a1 mulpd atemp1, a1 addpd a1, xsum4 movsd 0 * SIZE(A2), a1 movhpd 0 * SIZE(A2, LDA, 1), a1 mulpd atemp3, a1 addpd a1, xsum1 movsd 1 * SIZE(A2), a1 movhpd 1 * SIZE(A2, LDA, 1), a1 mulpd atemp3, a1 addpd a1, xsum2 movsd 2 * SIZE(A2), a1 movhpd 2 * SIZE(A2, LDA, 1), a1 mulpd atemp3, a1 addpd a1, xsum3 movsd 2 * SIZE(A2, LDA, 1), a1 movhpd 3 * SIZE(A2, LDA, 1), a1 mulpd atemp3, a1 addpd a1, xsum4 #ifndef HAVE_SSE3 movapd xsum1, atemp1 movapd xsum3, atemp3 unpcklpd xsum2, xsum1 unpcklpd xsum4, xsum3 unpckhpd xsum2, atemp1 unpckhpd xsum4, atemp3 addpd atemp1, xsum1 addpd atemp3, xsum3 #else haddpd xsum2, xsum1 haddpd xsum4, xsum3 #endif addpd xsum1, yy1 addpd xsum3, yy2 movsd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) movsd yy2, 2 * SIZE(YY) movhpd yy2, 3 * SIZE(YY) addq $4, IS movq IS, I addq $4, I cmpq M, I jle .L11 ALIGN_3 .L20: testq $2, M je .L30 ALIGN_3 .L21: movq A, A1 leaq (A, LDA, 2), A #ifdef HAVE_SSE3 movddup 0 * SIZE(NEW_X, IS, SIZE), atemp1 movddup 1 * SIZE(NEW_X, IS, SIZE), atemp2 #else movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 movhpd 0 * SIZE(NEW_X, IS, SIZE), atemp1 movsd 1 * SIZE(NEW_X, IS, SIZE), atemp2 movhpd 1 * SIZE(NEW_X, IS, SIZE), atemp2 #endif pxor xsum1, xsum1 pxor xsum2, xsum2 movapd 0 * SIZE(NEW_X), xtemp1 movsd 0 * SIZE(NEW_Y), yy1 movhpd 1 * SIZE(NEW_Y), yy1 movsd 0 * SIZE(A1), a1 movhpd 1 * SIZE(A1), a1 movsd 0 * SIZE(A1, LDA, 1), a2 movhpd 1 * SIZE(A1, LDA, 1), a2 movq NEW_X, XX movq NEW_Y, YY movq IS, I sarq $1, I jle .L28 ALIGN_3 .L22: movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy1 movsd 2 * SIZE(A1), a1 movhpd 3 * SIZE(A1), a1 movapd xtemp1, xt1 movapd 2 * SIZE(XX), xtemp1 mulpd a2, xt1 mulpd atemp2, a2 addpd xt1, xsum2 addpd a2, yy1 movsd 2 * SIZE(A1, LDA, 1), a2 movhpd 3 * SIZE(A1, LDA, 1), a2 movsd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) movsd 2 * SIZE(YY), yy1 movhpd 3 * SIZE(YY), yy1 addq $2 * SIZE, XX addq $2 * SIZE, YY addq $2 * SIZE, A1 decq I jg .L22 ALIGN_3 .L28: unpckhpd atemp2, atemp1 movsd 0 * SIZE(A1), a1 movhpd 0 * SIZE(A1, LDA, 1), a1 mulpd atemp1, a1 addpd a1, xsum1 movsd 0 * SIZE(A1, LDA, 1), a1 movhpd 1 * SIZE(A1, LDA, 1), a1 mulpd atemp1, a1 addpd a1, xsum2 #ifndef HAVE_SSE3 movapd xsum1, atemp1 unpcklpd xsum2, xsum1 unpckhpd xsum2, atemp1 addpd atemp1, xsum1 #else haddpd xsum2, xsum1 #endif addpd xsum1, yy1 movsd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) addq $2, IS ALIGN_3 .L30: testq $1, M je .L990 ALIGN_3 .L31: movq A, A1 #ifdef HAVE_SSE3 movddup 0 * SIZE(NEW_X, IS, SIZE), atemp1 #else movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 movhpd 0 * SIZE(NEW_X, IS, SIZE), atemp1 #endif pxor xsum1, xsum1 movsd 0 * SIZE(NEW_X), xtemp1 movsd 0 * SIZE(NEW_Y), yy1 movsd 0 * SIZE(A1), a1 movq NEW_X, XX movq NEW_Y, YY movq IS, I testq I, I jle .L38 ALIGN_3 .L32: movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy1 movsd 1 * SIZE(A1), a1 movsd 1 * SIZE(XX), xtemp1 movsd yy1, 0 * SIZE(YY) movsd 1 * SIZE(YY), yy1 addq $1 * SIZE, XX addq $1 * SIZE, YY addq $1 * SIZE, A1 decq I jg .L32 ALIGN_3 .L38: movsd 0 * SIZE(A1), a1 mulsd atemp1, a1 addsd a1, xsum1 addsd xsum1, yy1 movsd yy1, 0 * SIZE(YY) ALIGN_3 .L990: cmpq $SIZE, INCY je .L999 movq M, %rax sarq $3, %rax jle .L997 ALIGN_3 .L996: movapd 0 * SIZE(NEW_Y), %xmm0 
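/* non-unit INCY: copy the contiguous result buffer back to y using the caller's stride, eight values per iteration */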
movapd 2 * SIZE(NEW_Y), %xmm1 movapd 4 * SIZE(NEW_Y), %xmm2 movapd 6 * SIZE(NEW_Y), %xmm3 movsd %xmm0, 0 * SIZE(Y) addq INCY, Y movhpd %xmm0, 0 * SIZE(Y) addq INCY, Y movsd %xmm1, 0 * SIZE(Y) addq INCY, Y movhpd %xmm1, 0 * SIZE(Y) addq INCY, Y movsd %xmm2, 0 * SIZE(Y) addq INCY, Y movhpd %xmm2, 0 * SIZE(Y) addq INCY, Y movsd %xmm3, 0 * SIZE(Y) addq INCY, Y movhpd %xmm3, 0 * SIZE(Y) addq INCY, Y addq $8 * SIZE, NEW_Y decq %rax jg .L996 ALIGN_3 .L997: movq M, %rax andq $7, %rax jle .L999 ALIGN_3 .L998: movsd 0 * SIZE(NEW_Y), %xmm0 movsd %xmm0, 0 * SIZE(Y) addq INCY, Y addq $1 * SIZE, NEW_Y decq %rax jg .L998 ALIGN_3 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_LN_2x8_nehalem.S000066400000000000000000001552161313527062700226300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define KK %rdx #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 512 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCHSIZE (8 * 1 - 4) #define PREFETCH prefetcht0 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movaps %xmm3, %xmm0 #endif subq $-16 * SIZE, A subq $-16 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K movq OLD_LDC, LDC movq OLD_OFFSET, KK leaq (, LDC, SIZE), LDC movq KK, OFFSET negq KK #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $3, J NOBRANCH jle .L30 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $3 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 8), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 4), CO2 #ifndef RT leaq (C, LDC, 8), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 3, %rax leaq (B, %rax), BB testq $1, M BRANCH jle .L20 ALIGN_4 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 8), BO #else movq B, BO #endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -16 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_3 .L22: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps -8 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -6 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps -4 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -2 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps 0 * SIZE(BO), %xmm1 
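/* Remaining two of the four unrolled k steps for the 1x8 tail (taken when
   M is odd): each step broadcasts one element of A into %xmm0 and
   multiplies it against eight packed values of B, accumulating the
   partial row of C in %xmm8-%xmm11. */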
mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps 2 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps 4 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps 6 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -13 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps 8 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps 10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps 12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps 14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps 16 * SIZE(BO), %xmm1 subq $ -4 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax BRANCH jg .L22 ALIGN_3 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps -8 * SIZE(BO), %xmm1 addq $1 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $8, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 8), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm0 movapd -14 * SIZE(BO), %xmm1 movapd -12 * SIZE(BO), %xmm2 movapd -10 * SIZE(BO), %xmm3 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 #endif subpd %xmm8, %xmm0 subpd %xmm9, %xmm1 subpd %xmm10, %xmm2 subpd %xmm11, %xmm3 #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 #endif #if defined(RN) || defined(RT) pshufd $0xe, %xmm3, %xmm7 movaps %xmm3, %xmm6 pshufd $0xe, %xmm2, %xmm5 movaps %xmm2, %xmm4 pshufd $0xe, %xmm1, %xmm3 movaps %xmm1, %xmm2 pshufd $0xe, %xmm0, %xmm1 #endif #ifdef RN movsd -16 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 movsd -15 * SIZE(BO), %xmm9 mulsd %xmm0, %xmm9 subsd %xmm9, %xmm1 movsd -14 * SIZE(BO), %xmm10 mulsd %xmm0, %xmm10 subsd %xmm10, %xmm2 movsd -13 * SIZE(BO), %xmm11 mulsd %xmm0, %xmm11 subsd %xmm11, %xmm3 movsd -12 * SIZE(BO), %xmm12 mulsd %xmm0, %xmm12 subsd %xmm12, %xmm4 movsd -11 * SIZE(BO), %xmm13 mulsd %xmm0, %xmm13 subsd %xmm13, %xmm5 movsd -10 * SIZE(BO), %xmm14 mulsd %xmm0, %xmm14 subsd %xmm14, %xmm6 movsd -9 * SIZE(BO), %xmm15 mulsd %xmm0, %xmm15 subsd %xmm15, %xmm7 movsd -7 * SIZE(BO), %xmm9 mulsd %xmm9, %xmm1 movsd -6 * SIZE(BO), %xmm10 mulsd %xmm1, %xmm10 subsd %xmm10, %xmm2 movsd -5 * SIZE(BO), %xmm11 mulsd %xmm1, %xmm11 subsd %xmm11, %xmm3 movsd -4 * SIZE(BO), %xmm12 mulsd %xmm1, %xmm12 subsd %xmm12, %xmm4 movsd -3 * SIZE(BO), %xmm13 mulsd %xmm1, %xmm13 subsd %xmm13, %xmm5 movsd -2 * SIZE(BO), %xmm14 mulsd %xmm1, %xmm14 subsd %xmm14, %xmm6 movsd -1 * SIZE(BO), %xmm15 mulsd %xmm1, %xmm15 subsd %xmm15, %xmm7 movsd 2 * SIZE(BO), %xmm10 mulsd %xmm10, %xmm2 movsd 3 * SIZE(BO), %xmm11 mulsd %xmm2, %xmm11 subsd %xmm11, %xmm3 movsd 4 * SIZE(BO), %xmm12 mulsd %xmm2, %xmm12 subsd %xmm12, %xmm4 movsd 5 * SIZE(BO), %xmm13 mulsd %xmm2, %xmm13 subsd %xmm13, %xmm5 movsd 6 * SIZE(BO), %xmm14 mulsd %xmm2, %xmm14 subsd %xmm14, %xmm6 movsd 7 * SIZE(BO), %xmm15 mulsd %xmm2, %xmm15 subsd %xmm15, %xmm7 movsd 11 * SIZE(BO), %xmm11 mulsd %xmm11, %xmm3 movsd 12 * SIZE(BO), %xmm12 mulsd %xmm3, %xmm12 
subsd %xmm12, %xmm4 movsd 13 * SIZE(BO), %xmm13 mulsd %xmm3, %xmm13 subsd %xmm13, %xmm5 movsd 14 * SIZE(BO), %xmm14 mulsd %xmm3, %xmm14 subsd %xmm14, %xmm6 movsd 15 * SIZE(BO), %xmm15 mulsd %xmm3, %xmm15 subsd %xmm15, %xmm7 movsd 20 * SIZE(BO), %xmm12 mulsd %xmm12, %xmm4 movsd 21 * SIZE(BO), %xmm13 mulsd %xmm4, %xmm13 subsd %xmm13, %xmm5 movsd 22 * SIZE(BO), %xmm14 mulsd %xmm4, %xmm14 subsd %xmm14, %xmm6 movsd 23 * SIZE(BO), %xmm15 mulsd %xmm4, %xmm15 subsd %xmm15, %xmm7 movsd 29 * SIZE(BO), %xmm13 mulsd %xmm13, %xmm5 movsd 30 * SIZE(BO), %xmm14 mulsd %xmm5, %xmm14 subsd %xmm14, %xmm6 movsd 31 * SIZE(BO), %xmm15 mulsd %xmm5, %xmm15 subsd %xmm15, %xmm7 movsd 38 * SIZE(BO), %xmm14 mulsd %xmm14, %xmm6 movsd 39 * SIZE(BO), %xmm15 mulsd %xmm6, %xmm15 subsd %xmm15, %xmm7 movsd 47 * SIZE(BO), %xmm15 mulsd %xmm15, %xmm7 #endif #ifdef RT movsd 47 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm7 movsd 46 * SIZE(BO), %xmm9 mulsd %xmm7, %xmm9 subsd %xmm9, %xmm6 movsd 45 * SIZE(BO), %xmm10 mulsd %xmm7, %xmm10 subsd %xmm10, %xmm5 movsd 44 * SIZE(BO), %xmm11 mulsd %xmm7, %xmm11 subsd %xmm11, %xmm4 movsd 43 * SIZE(BO), %xmm12 mulsd %xmm7, %xmm12 subsd %xmm12, %xmm3 movsd 42 * SIZE(BO), %xmm13 mulsd %xmm7, %xmm13 subsd %xmm13, %xmm2 movsd 41 * SIZE(BO), %xmm14 mulsd %xmm7, %xmm14 subsd %xmm14, %xmm1 movsd 40 * SIZE(BO), %xmm15 mulsd %xmm7, %xmm15 subsd %xmm15, %xmm0 movsd 38 * SIZE(BO), %xmm9 mulsd %xmm9, %xmm6 movsd 37 * SIZE(BO), %xmm10 mulsd %xmm6, %xmm10 subsd %xmm10, %xmm5 movsd 36 * SIZE(BO), %xmm11 mulsd %xmm6, %xmm11 subsd %xmm11, %xmm4 movsd 35 * SIZE(BO), %xmm12 mulsd %xmm6, %xmm12 subsd %xmm12, %xmm3 movsd 34 * SIZE(BO), %xmm13 mulsd %xmm6, %xmm13 subsd %xmm13, %xmm2 movsd 33 * SIZE(BO), %xmm14 mulsd %xmm6, %xmm14 subsd %xmm14, %xmm1 movsd 32 * SIZE(BO), %xmm15 mulsd %xmm6, %xmm15 subsd %xmm15, %xmm0 movsd 29 * SIZE(BO), %xmm10 mulsd %xmm10, %xmm5 movsd 28 * SIZE(BO), %xmm11 mulsd %xmm5, %xmm11 subsd %xmm11, %xmm4 movsd 27 * SIZE(BO), %xmm12 mulsd %xmm5, %xmm12 subsd %xmm12, %xmm3 movsd 26 * SIZE(BO), %xmm13 mulsd %xmm5, %xmm13 subsd %xmm13, %xmm2 movsd 25 * SIZE(BO), %xmm14 mulsd %xmm5, %xmm14 subsd %xmm14, %xmm1 movsd 24 * SIZE(BO), %xmm15 mulsd %xmm5, %xmm15 subsd %xmm15, %xmm0 movsd 20 * SIZE(BO), %xmm11 mulsd %xmm11, %xmm4 movsd 19 * SIZE(BO), %xmm12 mulsd %xmm4, %xmm12 subsd %xmm12, %xmm3 movsd 18 * SIZE(BO), %xmm13 mulsd %xmm4, %xmm13 subsd %xmm13, %xmm2 movsd 17 * SIZE(BO), %xmm14 mulsd %xmm4, %xmm14 subsd %xmm14, %xmm1 movsd 16 * SIZE(BO), %xmm15 mulsd %xmm4, %xmm15 subsd %xmm15, %xmm0 movsd 11 * SIZE(BO), %xmm12 mulsd %xmm12, %xmm3 movsd 10 * SIZE(BO), %xmm13 mulsd %xmm3, %xmm13 subsd %xmm13, %xmm2 movsd 9 * SIZE(BO), %xmm14 mulsd %xmm3, %xmm14 subsd %xmm14, %xmm1 movsd 8 * SIZE(BO), %xmm15 mulsd %xmm3, %xmm15 subsd %xmm15, %xmm0 movsd 2 * SIZE(BO), %xmm13 mulsd %xmm13, %xmm2 movsd 1 * SIZE(BO), %xmm14 mulsd %xmm2, %xmm14 subsd %xmm14, %xmm1 movsd 0 * SIZE(BO), %xmm15 mulsd %xmm2, %xmm15 subsd %xmm15, %xmm0 movsd -7 * SIZE(BO), %xmm14 mulsd %xmm14, %xmm1 movsd -8 * SIZE(BO), %xmm15 mulsd %xmm1, %xmm15 subsd %xmm15, %xmm0 movsd -16 * SIZE(BO), %xmm15 mulsd %xmm15, %xmm0 #endif #if defined(RN) || defined(RT) unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm1 unpcklpd %xmm3, %xmm1 movaps %xmm4, %xmm2 unpcklpd %xmm5, %xmm2 movaps %xmm6, %xmm3 unpcklpd %xmm7, %xmm3 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif leaq (LDC, LDC, 2), %rax movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO1, LDC, 1) movsd %xmm1, 0 * SIZE(CO1, LDC, 2) movhps %xmm1, 0 * SIZE(CO1, %rax, 1) movsd %xmm2, 0 * SIZE(CO2) 
movhps %xmm2, 0 * SIZE(CO2, LDC, 1) movsd %xmm3, 0 * SIZE(CO2, LDC, 2) movhps %xmm3, 0 * SIZE(CO2, %rax, 1) #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(BO) movapd %xmm1, -14 * SIZE(BO) movapd %xmm2, -12 * SIZE(BO) movapd %xmm3, -10 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm1, -14 * SIZE(AO) movapd %xmm2, -12 * SIZE(AO) movapd %xmm3, -10 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L20: movq M, I sarq $1, I NOBRANCH jle .L29 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 8), BO #else movq B, BO #endif prefetcht0 -16 * SIZE(BB) subq $-8 * SIZE, BB xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 leaq (LDC, LDC, 2), %rax xorps %xmm8, %xmm8 prefetcht0 -2 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 -3 * SIZE(CO1, LDC, 1) xorps %xmm10, %xmm10 prefetcht0 -2 * SIZE(CO1, LDC, 2) xorps %xmm11, %xmm11 prefetcht0 -3 * SIZE(CO1, %rax, 1) xorps %xmm12, %xmm12 prefetcht0 -2 * SIZE(CO2) xorps %xmm13, %xmm13 prefetcht0 -3 * SIZE(CO2, LDC, 1) xorps %xmm14, %xmm14 prefetcht0 -2 * SIZE(CO2, LDC, 2) xorps %xmm15, %xmm15 prefetcht0 -3 * SIZE(CO2, %rax, 1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm6 addpd %xmm2, %xmm13 pshufd $0x4e, %xmm6, %xmm2 mulpd %xmm0, %xmm6 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm14 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm6, %xmm8 movaps -12 * SIZE(BO), %xmm6 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm6, %xmm2 mulpd %xmm0, %xmm6 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm6, %xmm12 movaps -8 * SIZE(BO), %xmm1 addpd %xmm2, %xmm13 movaps -14 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 mulpd %xmm5, %xmm2 addpd %xmm3, %xmm14 movaps -6 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm5, %xmm3 mulpd %xmm5, %xmm4 addpd %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 mulpd %xmm5, %xmm2 addpd %xmm3, %xmm10 movaps -2 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm5, %xmm3 mulpd %xmm5, %xmm4 addpd %xmm1, %xmm12 movaps 0 * SIZE(BO), %xmm6 addpd %xmm2, %xmm13 pshufd $0x4e, %xmm6, %xmm2 mulpd %xmm0, %xmm6 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm14 movaps 2 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm6, %xmm8 movaps 4 * SIZE(BO), %xmm6 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm6, %xmm2 mulpd %xmm0, %xmm6 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps 6 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm6, %xmm12 movaps 8 * SIZE(BO), %xmm1 addpd %xmm2, %xmm13 movaps -10 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, 
%xmm1 mulpd %xmm5, %xmm2 addpd %xmm3, %xmm14 movaps 10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm5, %xmm3 mulpd %xmm5, %xmm4 addpd %xmm1, %xmm8 movaps 12 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 mulpd %xmm5, %xmm2 addpd %xmm3, %xmm10 movaps 14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm5, %xmm3 mulpd %xmm5, %xmm4 addq $32 * SIZE, BO subq $-8 * SIZE, AO decq %rax BRANCH jg .L12 ALIGN_3 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addpd %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm13 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm14 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $8, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 8), BO #endif addpd %xmm1, %xmm12 addpd %xmm2, %xmm13 addpd %xmm3, %xmm14 addpd %xmm4, %xmm15 #if defined(LN) || defined(LT) movaps %xmm8, %xmm0 shufpd $0, %xmm9, %xmm8 shufpd $3, %xmm0, %xmm9 movaps %xmm10, %xmm0 shufpd $0, %xmm11, %xmm10 shufpd $3, %xmm0, %xmm11 movaps %xmm12, %xmm0 shufpd $0, %xmm13, %xmm12 shufpd $3, %xmm0, %xmm13 movaps %xmm14, %xmm0 shufpd $0, %xmm15, %xmm14 shufpd $3, %xmm0, %xmm15 movapd -16 * SIZE(BO), %xmm0 movapd -14 * SIZE(BO), %xmm2 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm6 movapd -8 * SIZE(BO), %xmm1 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm5 movapd -2 * SIZE(BO), %xmm7 #else movaps %xmm8, %xmm0 shufpd $2, %xmm9, %xmm8 shufpd $2, %xmm0, %xmm9 movaps %xmm10, %xmm0 shufpd $2, %xmm11, %xmm10 shufpd $2, %xmm0, %xmm11 movaps %xmm12, %xmm0 shufpd $2, %xmm13, %xmm12 shufpd $2, %xmm0, %xmm13 movaps %xmm14, %xmm0 shufpd $2, %xmm15, %xmm14 shufpd $2, %xmm0, %xmm15 movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 movapd -8 * SIZE(AO), %xmm4 movapd -6 * SIZE(AO), %xmm5 movapd -4 * SIZE(AO), %xmm6 movapd -2 * SIZE(AO), %xmm7 #endif subpd %xmm8, %xmm0 subpd %xmm9, %xmm1 subpd %xmm10, %xmm2 subpd %xmm11, %xmm3 subpd %xmm12, %xmm4 subpd %xmm13, %xmm5 subpd %xmm14, %xmm6 subpd %xmm15, %xmm7 #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 movddup -14 * SIZE(AO), %xmm12 movapd %xmm12, %xmm13 movapd %xmm12, %xmm14 movapd %xmm12, %xmm15 mulpd %xmm1, %xmm12 mulpd %xmm3, %xmm13 mulpd %xmm5, %xmm14 mulpd %xmm7, %xmm15 subpd %xmm12, %xmm0 subpd %xmm13, %xmm2 subpd %xmm14, %xmm4 subpd %xmm15, %xmm6 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm6 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm6 movddup -15 * SIZE(AO), %xmm12 movapd %xmm12, %xmm13 movapd %xmm12, %xmm14 movapd %xmm12, %xmm15 mulpd %xmm0, %xmm12 mulpd %xmm2, %xmm13 mulpd 
%xmm4, %xmm14 mulpd %xmm6, %xmm15 subpd %xmm12, %xmm1 subpd %xmm13, %xmm3 subpd %xmm14, %xmm5 subpd %xmm15, %xmm7 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm1 movddup -14 * SIZE(BO), %xmm10 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm2 movddup -13 * SIZE(BO), %xmm11 mulpd %xmm0, %xmm11 subpd %xmm11, %xmm3 movddup -12 * SIZE(BO), %xmm12 mulpd %xmm0, %xmm12 subpd %xmm12, %xmm4 movddup -11 * SIZE(BO), %xmm13 mulpd %xmm0, %xmm13 subpd %xmm13, %xmm5 movddup -10 * SIZE(BO), %xmm14 mulpd %xmm0, %xmm14 subpd %xmm14, %xmm6 movddup -9 * SIZE(BO), %xmm15 mulpd %xmm0, %xmm15 subpd %xmm15, %xmm7 movddup -7 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm1 movddup -6 * SIZE(BO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm2 movddup -5 * SIZE(BO), %xmm11 mulpd %xmm1, %xmm11 subpd %xmm11, %xmm3 movddup -4 * SIZE(BO), %xmm12 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm4 movddup -3 * SIZE(BO), %xmm13 mulpd %xmm1, %xmm13 subpd %xmm13, %xmm5 movddup -2 * SIZE(BO), %xmm14 mulpd %xmm1, %xmm14 subpd %xmm14, %xmm6 movddup -1 * SIZE(BO), %xmm15 mulpd %xmm1, %xmm15 subpd %xmm15, %xmm7 movddup 2 * SIZE(BO), %xmm10 mulpd %xmm10, %xmm2 movddup 3 * SIZE(BO), %xmm11 mulpd %xmm2, %xmm11 subpd %xmm11, %xmm3 movddup 4 * SIZE(BO), %xmm12 mulpd %xmm2, %xmm12 subpd %xmm12, %xmm4 movddup 5 * SIZE(BO), %xmm13 mulpd %xmm2, %xmm13 subpd %xmm13, %xmm5 movddup 6 * SIZE(BO), %xmm14 mulpd %xmm2, %xmm14 subpd %xmm14, %xmm6 movddup 7 * SIZE(BO), %xmm15 mulpd %xmm2, %xmm15 subpd %xmm15, %xmm7 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm12 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm4 movddup 13 * SIZE(BO), %xmm13 mulpd %xmm3, %xmm13 subpd %xmm13, %xmm5 movddup 14 * SIZE(BO), %xmm14 mulpd %xmm3, %xmm14 subpd %xmm14, %xmm6 movddup 15 * SIZE(BO), %xmm15 mulpd %xmm3, %xmm15 subpd %xmm15, %xmm7 movddup 20 * SIZE(BO), %xmm12 mulpd %xmm12, %xmm4 movddup 21 * SIZE(BO), %xmm13 mulpd %xmm4, %xmm13 subpd %xmm13, %xmm5 movddup 22 * SIZE(BO), %xmm14 mulpd %xmm4, %xmm14 subpd %xmm14, %xmm6 movddup 23 * SIZE(BO), %xmm15 mulpd %xmm4, %xmm15 subpd %xmm15, %xmm7 movddup 29 * SIZE(BO), %xmm13 mulpd %xmm13, %xmm5 movddup 30 * SIZE(BO), %xmm14 mulpd %xmm5, %xmm14 subpd %xmm14, %xmm6 movddup 31 * SIZE(BO), %xmm15 mulpd %xmm5, %xmm15 subpd %xmm15, %xmm7 movddup 38 * SIZE(BO), %xmm14 mulpd %xmm14, %xmm6 movddup 39 * SIZE(BO), %xmm15 mulpd %xmm6, %xmm15 subpd %xmm15, %xmm7 movddup 47 * SIZE(BO), %xmm15 mulpd %xmm15, %xmm7 #endif #ifdef RT movddup 47 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm7 movddup 46 * SIZE(BO), %xmm9 mulpd %xmm7, %xmm9 subpd %xmm9, %xmm6 movddup 45 * SIZE(BO), %xmm10 mulpd %xmm7, %xmm10 subpd %xmm10, %xmm5 movddup 44 * SIZE(BO), %xmm11 mulpd %xmm7, %xmm11 subpd %xmm11, %xmm4 movddup 43 * SIZE(BO), %xmm12 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm3 movddup 42 * SIZE(BO), %xmm13 mulpd %xmm7, %xmm13 subpd %xmm13, %xmm2 movddup 41 * SIZE(BO), %xmm14 mulpd %xmm7, %xmm14 subpd %xmm14, %xmm1 movddup 40 * SIZE(BO), %xmm15 mulpd %xmm7, %xmm15 subpd %xmm15, %xmm0 movddup 38 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm6 movddup 37 * SIZE(BO), %xmm10 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm5 movddup 36 * SIZE(BO), %xmm11 mulpd %xmm6, %xmm11 subpd %xmm11, %xmm4 movddup 35 * SIZE(BO), %xmm12 mulpd %xmm6, %xmm12 subpd %xmm12, %xmm3 movddup 34 * SIZE(BO), %xmm13 mulpd %xmm6, %xmm13 subpd %xmm13, %xmm2 movddup 33 * SIZE(BO), %xmm14 mulpd %xmm6, %xmm14 subpd %xmm14, %xmm1 movddup 32 * 
SIZE(BO), %xmm15 mulpd %xmm6, %xmm15 subpd %xmm15, %xmm0 movddup 29 * SIZE(BO), %xmm10 mulpd %xmm10, %xmm5 movddup 28 * SIZE(BO), %xmm11 mulpd %xmm5, %xmm11 subpd %xmm11, %xmm4 movddup 27 * SIZE(BO), %xmm12 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm3 movddup 26 * SIZE(BO), %xmm13 mulpd %xmm5, %xmm13 subpd %xmm13, %xmm2 movddup 25 * SIZE(BO), %xmm14 mulpd %xmm5, %xmm14 subpd %xmm14, %xmm1 movddup 24 * SIZE(BO), %xmm15 mulpd %xmm5, %xmm15 subpd %xmm15, %xmm0 movddup 20 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm4 movddup 19 * SIZE(BO), %xmm12 mulpd %xmm4, %xmm12 subpd %xmm12, %xmm3 movddup 18 * SIZE(BO), %xmm13 mulpd %xmm4, %xmm13 subpd %xmm13, %xmm2 movddup 17 * SIZE(BO), %xmm14 mulpd %xmm4, %xmm14 subpd %xmm14, %xmm1 movddup 16 * SIZE(BO), %xmm15 mulpd %xmm4, %xmm15 subpd %xmm15, %xmm0 movddup 11 * SIZE(BO), %xmm12 mulpd %xmm12, %xmm3 movddup 10 * SIZE(BO), %xmm13 mulpd %xmm3, %xmm13 subpd %xmm13, %xmm2 movddup 9 * SIZE(BO), %xmm14 mulpd %xmm3, %xmm14 subpd %xmm14, %xmm1 movddup 8 * SIZE(BO), %xmm15 mulpd %xmm3, %xmm15 subpd %xmm15, %xmm0 movddup 2 * SIZE(BO), %xmm13 mulpd %xmm13, %xmm2 movddup 1 * SIZE(BO), %xmm14 mulpd %xmm2, %xmm14 subpd %xmm14, %xmm1 movddup 0 * SIZE(BO), %xmm15 mulpd %xmm2, %xmm15 subpd %xmm15, %xmm0 movddup -7 * SIZE(BO), %xmm14 mulpd %xmm14, %xmm1 movddup -8 * SIZE(BO), %xmm15 mulpd %xmm1, %xmm15 subpd %xmm15, %xmm0 movddup -16 * SIZE(BO), %xmm15 mulpd %xmm15, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(BO) movapd %xmm2, -14 * SIZE(BO) movapd %xmm4, -12 * SIZE(BO) movapd %xmm6, -10 * SIZE(BO) movapd %xmm1, -8 * SIZE(BO) movapd %xmm3, -6 * SIZE(BO) movapd %xmm5, -4 * SIZE(BO) movapd %xmm7, -2 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm1, -14 * SIZE(AO) movapd %xmm2, -12 * SIZE(AO) movapd %xmm3, -10 * SIZE(AO) movapd %xmm4, -8 * SIZE(AO) movapd %xmm5 , -6 * SIZE(AO) movapd %xmm6, -4 * SIZE(AO) movapd %xmm7, -2 * SIZE(AO) #endif leaq (LDC, LDC, 2), %rax #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 1 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 1 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movsd %xmm3, 1 * SIZE(CO1, LDC, 2) movhps %xmm2, 0 * SIZE(CO1, %rax, 1) movhps %xmm3, 1 * SIZE(CO1, %rax, 1) movsd %xmm4, 0 * SIZE(CO2) movsd %xmm5, 1 * SIZE(CO2) movhps %xmm4, 0 * SIZE(CO2, LDC, 1) movhps %xmm5, 1 * SIZE(CO2, LDC, 1) movsd %xmm6, 0 * SIZE(CO2, LDC, 2) movsd %xmm7, 1 * SIZE(CO2, LDC, 2) movhps %xmm6, 0 * SIZE(CO2, %rax, 1) movhps %xmm7, 1 * SIZE(CO2, %rax, 1) #else movups %xmm0, 0 * SIZE(CO1) movups %xmm1, 0 * SIZE(CO1, LDC, 1) movups %xmm2, 0 * SIZE(CO1, LDC, 2) movups %xmm3, 0 * SIZE(CO1, %rax, 1) movups %xmm4, 0 * SIZE(CO2) movups %xmm5, 0 * SIZE(CO2, LDC, 1) movups %xmm6, 0 * SIZE(CO2, LDC, 2) movups %xmm7, 0 * SIZE(CO2, %rax, 1) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L11 ALIGN_4 .L29: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 8), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $8, KK #endif #ifdef RT subq $8, KK #endif subq $1, J BRANCH jg .L01 ALIGN_4 .L30: testq $4, N jle .L50 ALIGN_4 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $2 + 
BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 2), CO2 #ifndef RT leaq (C, LDC, 4), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif testq $1, M BRANCH jle .L40 ALIGN_4 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -16 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L45 ALIGN_3 .L42: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps -8 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -6 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -13 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -4 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -2 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps 0 * SIZE(BO), %xmm1 subq $ -4 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L42 ALIGN_3 .L45: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L46 ALIGN_4 .L48: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #endif addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm0 movapd -14 * SIZE(BO), %xmm1 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 #endif subpd %xmm8, %xmm0 subpd %xmm9, %xmm1 #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 #endif #if defined(RN) || defined(RT) pshufd $0xe, %xmm1, %xmm3 movaps %xmm1, %xmm2 pshufd $0xe, %xmm0, %xmm1 #endif #ifdef RN movsd -16 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 movsd -15 * SIZE(BO), %xmm9 mulsd %xmm0, %xmm9 subsd %xmm9, %xmm1 movsd -14 * SIZE(BO), %xmm10 mulsd %xmm0, %xmm10 subsd %xmm10, %xmm2 movsd -13 * SIZE(BO), %xmm11 mulsd %xmm0, %xmm11 subsd %xmm11, %xmm3 movsd -11 * SIZE(BO), %xmm9 mulsd %xmm9, %xmm1 movsd -10 * SIZE(BO), %xmm10 mulsd %xmm1, %xmm10 subsd %xmm10, %xmm2 movsd -9 * SIZE(BO), %xmm11 mulsd %xmm1, %xmm11 subsd %xmm11, %xmm3 movsd -6 * SIZE(BO), %xmm10 mulsd %xmm10, %xmm2 movsd -5 * SIZE(BO), %xmm11 mulsd %xmm2, %xmm11 subsd %xmm11, %xmm3 movsd -1 * SIZE(BO), %xmm11 mulsd %xmm11, %xmm3 #endif #ifdef RT movsd -1 * SIZE(BO), %xmm12 mulsd %xmm12, %xmm3 movsd -2 * SIZE(BO), %xmm13 mulsd %xmm3, %xmm13 subsd %xmm13, %xmm2 movsd -3 * SIZE(BO), %xmm14 mulsd %xmm3, %xmm14 subsd %xmm14, %xmm1 movsd -4 * SIZE(BO), %xmm15 mulsd %xmm3, %xmm15 subsd %xmm15, %xmm0 movsd -6 * SIZE(BO), %xmm13 mulsd %xmm13, %xmm2 movsd -7 * SIZE(BO), %xmm14 mulsd %xmm2, %xmm14 subsd %xmm14, %xmm1 movsd 
-8 * SIZE(BO), %xmm15 mulsd %xmm2, %xmm15 subsd %xmm15, %xmm0 movsd -11 * SIZE(BO), %xmm14 mulsd %xmm14, %xmm1 movsd -12 * SIZE(BO), %xmm15 mulsd %xmm1, %xmm15 subsd %xmm15, %xmm0 movsd -16 * SIZE(BO), %xmm15 mulsd %xmm15, %xmm0 #endif #if defined(RN) || defined(RT) unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm1 unpcklpd %xmm3, %xmm1 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO1, LDC, 1) movsd %xmm1, 0 * SIZE(CO2) movhps %xmm1, 0 * SIZE(CO2, LDC, 1) #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(BO) movapd %xmm1, -14 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm1, -14 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L40: movq M, I sarq $1, I NOBRANCH jle .L49 ALIGN_4 .L31: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht0 2 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 2 * SIZE(CO1, LDC, 1) xorps %xmm10, %xmm10 prefetcht0 2 * SIZE(CO2) xorps %xmm11, %xmm11 prefetcht0 2 * SIZE(CO2, LDC, 1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_3 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -8 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -6 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -10 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -2 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L32 ALIGN_3 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: addpd %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_4 .L38: #if 
defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif addpd %xmm1, %xmm8 addpd %xmm2, %xmm9 addpd %xmm3, %xmm10 addpd %xmm4, %xmm11 #if defined(LN) || defined(LT) movaps %xmm8, %xmm0 shufpd $0, %xmm9, %xmm8 shufpd $3, %xmm0, %xmm9 movaps %xmm10, %xmm0 shufpd $0, %xmm11, %xmm10 shufpd $3, %xmm0, %xmm11 movapd -16 * SIZE(BO), %xmm0 movapd -14 * SIZE(BO), %xmm2 movapd -12 * SIZE(BO), %xmm1 movapd -10 * SIZE(BO), %xmm3 #else movaps %xmm8, %xmm0 shufpd $2, %xmm9, %xmm8 shufpd $2, %xmm0, %xmm9 movaps %xmm10, %xmm0 shufpd $2, %xmm11, %xmm10 shufpd $2, %xmm0, %xmm11 movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 #endif subpd %xmm8, %xmm0 subpd %xmm9, %xmm1 subpd %xmm10, %xmm2 subpd %xmm11, %xmm3 #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 movddup -14 * SIZE(AO), %xmm12 movapd %xmm12, %xmm13 mulpd %xmm1, %xmm12 mulpd %xmm3, %xmm13 subpd %xmm12, %xmm0 subpd %xmm13, %xmm2 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm2 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm2 movddup -15 * SIZE(AO), %xmm12 movapd %xmm12, %xmm13 mulpd %xmm0, %xmm12 mulpd %xmm2, %xmm13 subpd %xmm12, %xmm1 subpd %xmm13, %xmm3 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm1 movddup -14 * SIZE(BO), %xmm10 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm2 movddup -13 * SIZE(BO), %xmm11 mulpd %xmm0, %xmm11 subpd %xmm11, %xmm3 movddup -11 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm1 movddup -10 * SIZE(BO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm2 movddup -9 * SIZE(BO), %xmm11 mulpd %xmm1, %xmm11 subpd %xmm11, %xmm3 movddup -6 * SIZE(BO), %xmm10 mulpd %xmm10, %xmm2 movddup -5 * SIZE(BO), %xmm11 mulpd %xmm2, %xmm11 subpd %xmm11, %xmm3 movddup -1 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm3 #endif #ifdef RT movddup -1 * SIZE(BO), %xmm12 mulpd %xmm12, %xmm3 movddup -2 * SIZE(BO), %xmm13 mulpd %xmm3, %xmm13 subpd %xmm13, %xmm2 movddup -3 * SIZE(BO), %xmm14 mulpd %xmm3, %xmm14 subpd %xmm14, %xmm1 movddup -4 * SIZE(BO), %xmm15 mulpd %xmm3, %xmm15 subpd %xmm15, %xmm0 movddup -6 * SIZE(BO), %xmm13 mulpd %xmm13, %xmm2 movddup -7 * SIZE(BO), %xmm14 mulpd %xmm2, %xmm14 subpd %xmm14, %xmm1 movddup -8 * SIZE(BO), %xmm15 mulpd %xmm2, %xmm15 subpd %xmm15, %xmm0 movddup -11 * SIZE(BO), %xmm14 mulpd %xmm14, %xmm1 movddup -12 * SIZE(BO), %xmm15 mulpd %xmm1, %xmm15 subpd %xmm15, %xmm0 movddup -16 * SIZE(BO), %xmm15 mulpd %xmm15, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif leaq (LDC, LDC, 2), %rax #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 1 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 1 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO2) movsd %xmm3, 1 * SIZE(CO2) movhps %xmm2, 0 * SIZE(CO2, LDC, 1) movhps %xmm3, 1 * SIZE(CO2, LDC, 1) #else movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 1 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO2) movhps %xmm2, 1 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO2, LDC, 1) movhps %xmm3, 1 * SIZE(CO2, LDC, 1) #endif #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(BO) movapd %xmm2, -14 * SIZE(BO) movapd %xmm1, -12 * SIZE(BO) movapd %xmm3, -10 * SIZE(BO) #else movapd %xmm0, -16 * 
SIZE(AO) movapd %xmm1, -14 * SIZE(AO) movapd %xmm2, -12 * SIZE(AO) movapd %xmm3, -10 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L31 ALIGN_4 .L49: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif ALIGN_4 .L50: testq $2, N jle .L70 ALIGN_4 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif testq $1, M BRANCH jle .L60 ALIGN_4 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -16 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_3 .L62: mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -13 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -8 * SIZE(BO), %xmm1 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L62 ALIGN_3 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_3 .L66: mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_4 .L68: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif addpd %xmm9, %xmm8 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm0 #else movapd -16 * SIZE(AO), %xmm0 #endif subpd %xmm8, %xmm0 #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 #endif #if defined(RN) || defined(RT) pshufd $0xe, %xmm0, %xmm1 #endif #ifdef RN movsd -16 * SIZE(BO), %xmm10 mulsd %xmm10, %xmm0 movsd -15 * SIZE(BO), %xmm11 mulsd %xmm0, %xmm11 subsd %xmm11, %xmm1 movsd -13 * SIZE(BO), %xmm11 mulsd %xmm11, %xmm1 #endif #ifdef RT movsd -13 * SIZE(BO), %xmm14 mulsd %xmm14, %xmm1 movsd -14 * SIZE(BO), %xmm15 mulsd %xmm1, %xmm15 subsd %xmm15, %xmm0 movsd -16 * SIZE(BO), %xmm15 mulsd %xmm15, %xmm0 #endif #if defined(RN) || defined(RT) unpcklpd %xmm1, %xmm0 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq 
(,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L60: movq M, I sarq $1, I NOBRANCH jle .L69 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm8, %xmm8 prefetcht0 2 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 2 * SIZE(CO2) xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_3 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm10 movaps -14 * SIZE(BO), %xmm1 addpd %xmm2, %xmm11 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AO), %xmm0 addpd %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 addpd %xmm2, %xmm11 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L52 addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 ALIGN_3 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_3 .L56: addpd %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_4 .L58: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif addpd %xmm1, %xmm8 addpd %xmm2, %xmm9 #if defined(LN) || defined(LT) movaps %xmm8, %xmm0 shufpd $0, %xmm9, %xmm8 shufpd $3, %xmm0, %xmm9 movapd -16 * SIZE(BO), %xmm0 movapd -14 * SIZE(BO), %xmm1 #else movaps %xmm8, %xmm0 shufpd $2, %xmm9, %xmm8 shufpd $2, %xmm0, %xmm9 movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 #endif subpd %xmm8, %xmm0 subpd %xmm9, %xmm1 #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 movddup -14 * SIZE(AO), %xmm12 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm0 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(AO), %xmm12 mulpd %xmm0, %xmm12 subpd %xmm12, %xmm1 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm10 mulpd %xmm10, %xmm0 movddup -15 * SIZE(BO), %xmm11 mulpd %xmm0, %xmm11 subpd %xmm11, %xmm1 movddup -13 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm1 #endif #ifdef RT movddup -13 * SIZE(BO), %xmm14 mulpd %xmm14, %xmm1 movddup -14 * SIZE(BO), %xmm15 mulpd %xmm1, %xmm15 subpd %xmm15, %xmm0 movddup -16 * SIZE(BO), %xmm15 mulpd %xmm15, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 1 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO2) movhps %xmm1, 1 * SIZE(CO2) #else movsd %xmm0, 0 * 
SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movhps %xmm1, 1 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(BO) movapd %xmm1, -14 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm1, -14 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L51 ALIGN_4 .L69: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L70: testq $1, N jle .L999 ALIGN_4 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif testq $1, M BRANCH jle .L80 ALIGN_4 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movsd -16 * SIZE(AO), %xmm0 movhps -15 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movsd -16 * SIZE(BO), %xmm1 movhps -15 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L85 ALIGN_3 .L82: mulpd %xmm0, %xmm1 movsd -14 * SIZE(AO), %xmm0 movhps -13 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movsd -14 * SIZE(BO), %xmm1 movhps -13 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movsd -12 * SIZE(AO), %xmm0 movhps -11 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movsd -12 * SIZE(BO), %xmm1 movhps -11 * SIZE(BO), %xmm1 subq $-4 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L82 addpd %xmm9, %xmm8 ALIGN_3 .L85: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L88 ALIGN_3 .L86: mulsd %xmm0, %xmm1 movsd -15 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd -15 * SIZE(BO), %xmm1 addq $1 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L86 ALIGN_4 .L88: #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif haddpd %xmm8, %xmm8 #if defined(LN) || defined(LT) movsd -16 * SIZE(BO), %xmm0 #else movsd -16 * SIZE(AO), %xmm0 #endif subsd %xmm8, %xmm0 #if defined(LN) || defined(LT) movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 #endif #if defined(RN) || defined(RT) movsd -16 * SIZE(BO), %xmm10 mulsd %xmm10, %xmm0 #endif #ifdef LN subq $1 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) #if defined(LN) || defined(LT) movsd %xmm0, -16 * SIZE(BO) #else movsd %xmm0, -16 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L80: movq M, I sarq $1, I NOBRANCH jle .L89 ALIGN_4 .L71: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, 
AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm8, %xmm8 prefetcht0 2 * SIZE(CO1) xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L75 ALIGN_3 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 movddup -16 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movddup -15 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -10 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movddup -13 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L72 addpd %xmm9, %xmm8 ALIGN_3 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: addpd %xmm1, %xmm8 movddup -16 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L76 ALIGN_4 .L78: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif addpd %xmm1, %xmm8 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm0 #else movapd -16 * SIZE(AO), %xmm0 #endif subpd %xmm8, %xmm0 #if defined(LN) || defined(LT) pshufd $0xe, %xmm0, %xmm1 #endif #ifdef LN movsd -13 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm1 movsd -14 * SIZE(AO), %xmm12 mulsd %xmm1, %xmm12 subsd %xmm12, %xmm0 movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 #endif #ifdef LT movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 movsd -15 * SIZE(AO), %xmm12 mulsd %xmm0, %xmm12 subsd %xmm12, %xmm1 movsd -13 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm1 #endif #if defined(LN) || defined(LT) unpcklpd %xmm1, %xmm0 #endif #if defined(RN) || defined(RT) movddup -16 * SIZE(BO), %xmm10 mulpd %xmm10, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L71 ALIGN_4 .L89: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_LN_4x2_atom.S000066400000000000000000001103441313527062700221440ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of 
Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %r13 #define BO %r14 #define CO1 %r15 #define CO2 %rbx #define KK %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 8 + 3) PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif movq OLD_LDC, LDC movq OLD_OFFSET, KK movq KK, OFFSET leaq (, LDC, SIZE), LDC #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $1, J jle .L40 
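/* Outer loop over N in blocks of two columns (J = N >> 1).  Each .L10 pass
   points CO1/CO2 at the two output columns (in the RT variant C is stepped
   back by 2*LDC at the top of the pass instead of advanced at the bottom)
   and then walks M in pieces of one, two and four rows; control falls to
   .L40 for any remaining odd column. */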
ALIGN_4 .L10: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 1, %rax leaq (B, %rax), BB #ifdef LT movq OFFSET, %rax movq %rax, KK #endif testq $1, M je .L20 #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm7, %xmm7 movsd 1 * SIZE(AO), %xmm2 xorps %xmm5, %xmm5 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movsd 1 * SIZE(BO), %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L35 ALIGN_4 .L32: addsd %xmm5, %xmm8 movsd 2 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm7, %xmm9 movsd 3 * SIZE(BO), %xmm7 mulsd %xmm0, %xmm3 movsd 2 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd 4 * SIZE(BO), %xmm1 mulsd %xmm2, %xmm5 addsd %xmm3, %xmm9 movsd 5 * SIZE(BO), %xmm3 mulsd %xmm2, %xmm7 movsd 3 * SIZE(AO), %xmm2 addsd %xmm5, %xmm8 movsd 6 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm1 addsd %xmm7, %xmm9 movsd 7 * SIZE(BO), %xmm7 mulsd %xmm0, %xmm3 movsd 4 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd 8 * SIZE(BO), %xmm1 mulsd %xmm2, %xmm5 addsd %xmm3, %xmm9 movsd 9 * SIZE(BO), %xmm3 mulsd %xmm2, %xmm7 movsd 5 * SIZE(AO), %xmm2 addq $4 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif addsd %xmm5, %xmm8 addsd %xmm7, %xmm9 andq $3, %rax BRANCH BRANCH je .L38 ALIGN_4 .L36: mulsd %xmm0, %xmm1 addq $2 * SIZE, BO mulsd %xmm0, %xmm3 movsd 1 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd 0 * SIZE(BO), %xmm1 addsd %xmm3, %xmm9 movsd 1 * SIZE(BO), %xmm3 addq $1 * SIZE, AO decq %rax BRANCH jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 movsd 1 * SIZE(BO), %xmm1 subsd %xmm8, %xmm0 subsd %xmm9, %xmm1 #else movsd 0 * SIZE(AO), %xmm0 movsd 1 * SIZE(AO), %xmm1 subsd %xmm8, %xmm0 subsd %xmm9, %xmm1 #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 mulsd %xmm8, %xmm1 #endif #ifdef RN movsd 0 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 movsd 1 * SIZE(BO), %xmm9 mulsd %xmm0, %xmm9 movsd 3 * SIZE(BO), %xmm13 subsd %xmm9, %xmm1 mulsd %xmm13, %xmm1 #endif #ifdef RT movsd 3 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm1 movsd 2 * SIZE(BO), %xmm9 mulsd %xmm1, %xmm9 movsd 0 * SIZE(BO), %xmm13 subsd %xmm9, %xmm0 mulsd %xmm13, %xmm0 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BO) movsd %xmm1, 1 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) movsd %xmm1, 1 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG 
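/* Next, .L20 handles two remaining rows (M & 2) against the current column
   pair: a 2x2 tile is accumulated with scalar SSE2 (movsd/mulsd/addsd)
   into %xmm8-%xmm11, then solved with the LN/LT/RN/RT-specific triangular
   update and written back to CO1/CO2. */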
#endif ALIGN_4 .L20: testq $2, M BRANCH je .L30 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd 1 * SIZE(AO), %xmm4 xorps %xmm5, %xmm5 movsd 2 * SIZE(AO), %xmm5 xorps %xmm6, %xmm6 movsd 3 * SIZE(AO), %xmm7 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movsd 1 * SIZE(BO), %xmm3 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 2 * SIZE(BO), %xmm1 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm2 addsd %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 4 * SIZE(BO), %xmm1 addsd %xmm5, %xmm8 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm2 addsd %xmm7, %xmm10 movsd 7 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm6 movsd 5 * SIZE(BO), %xmm3 addsd %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 6 * SIZE(BO), %xmm1 addsd %xmm0, %xmm8 movsd 8 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm2 addsd %xmm4, %xmm10 movsd 9 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm6 movsd 7 * SIZE(BO), %xmm3 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 8 * SIZE(BO), %xmm1 addsd %xmm5, %xmm8 movsd 10 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm2 addsd %xmm7, %xmm10 movsd 11 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm6 movsd 9 * SIZE(BO), %xmm3 addq $8 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH je .L29 ALIGN_4 .L26: addsd %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 2 * SIZE(BO), %xmm1 mulsd %xmm3, %xmm2 addsd %xmm0, %xmm8 movsd 2 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addsd %xmm4, %xmm10 movsd 3 * SIZE(AO), %xmm4 addq $2 * SIZE, AO addq $2 * SIZE, BO decq %rax BRANCH jg .L26 ALIGN_4 .L29: addsd %xmm2, %xmm9 addsd %xmm6, %xmm11 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 movsd 1 * SIZE(BO), %xmm1 movsd 2 * SIZE(BO), %xmm2 movsd 3 * SIZE(BO), %xmm3 subsd %xmm8, %xmm0 subsd %xmm9, %xmm1 subsd %xmm10, %xmm2 subsd %xmm11, %xmm3 #else movsd 0 * SIZE(AO), %xmm0 movsd 1 * SIZE(AO), %xmm2 movsd 2 * SIZE(AO), %xmm1 movsd 3 * SIZE(AO), %xmm3 subsd %xmm8, %xmm0 subsd %xmm10, %xmm2 subsd %xmm9, %xmm1 subsd %xmm11, %xmm3 #endif #ifdef LN movsd 3 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm2 movsd 2 * SIZE(AO), %xmm9 mulsd %xmm8, %xmm3 movsd 0 * SIZE(AO), %xmm13 movaps %xmm9, %xmm10 mulsd %xmm2, %xmm9 mulsd %xmm3, %xmm10 subsd %xmm9, %xmm0 subsd %xmm10, %xmm1 mulsd %xmm13, %xmm0 mulsd %xmm13, %xmm1 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 movsd 1 * SIZE(AO), %xmm9 mulsd %xmm8, %xmm1 movsd 3 * SIZE(AO), %xmm13 movaps %xmm9, %xmm10 
mulsd %xmm0, %xmm9 mulsd %xmm1, %xmm10 subsd %xmm9, %xmm2 subsd %xmm10, %xmm3 mulsd %xmm13, %xmm2 mulsd %xmm13, %xmm3 #endif #ifdef RN movsd 0 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 movsd 1 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm2 movsd 3 * SIZE(BO), %xmm13 movaps %xmm9, %xmm10 mulsd %xmm0, %xmm9 mulsd %xmm2, %xmm10 subsd %xmm9, %xmm1 subsd %xmm10, %xmm3 mulsd %xmm13, %xmm1 mulsd %xmm13, %xmm3 #endif #ifdef RT movsd 3 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm1 movsd 2 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm3 movsd 0 * SIZE(BO), %xmm13 movaps %xmm9, %xmm10 mulsd %xmm1, %xmm9 mulsd %xmm3, %xmm10 subsd %xmm9, %xmm0 subsd %xmm10, %xmm2 mulsd %xmm13, %xmm0 mulsd %xmm13, %xmm2 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm2, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movsd %xmm3, 1 * SIZE(CO2) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BO) movsd %xmm1, 1 * SIZE(BO) movsd %xmm2, 2 * SIZE(BO) movsd %xmm3, 3 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) movsd %xmm2, 1 * SIZE(AO) movsd %xmm1, 2 * SIZE(AO) movsd %xmm3, 3 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: movq M, I sarq $2, I jle .L39 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #else movq B, BO #endif prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd 1 * SIZE(AO), %xmm4 xorps %xmm5, %xmm5 movsd 2 * SIZE(AO), %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movsd 1 * SIZE(BO), %xmm3 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 prefetcht0 3 * SIZE(CO1) xorps %xmm12, %xmm12 xorps %xmm13, %xmm13 prefetcht0 3 * SIZE(CO2) xorps %xmm14, %xmm14 xorps %xmm15, %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L15 ALIGN_4 .L12: addsd %xmm2, %xmm13 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm15 PREFETCH (PREFETCHSIZE + 0) * SIZE(BO) movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 2 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addsd %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 7 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 8 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm4, %xmm10 movsd 9 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 4 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 10 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 5 * SIZE(BO), %xmm3 addsd %xmm2, %xmm13 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 
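	/* Second half of the 4x-unrolled k loop (.L12) for the 4x2 tile: the
	   same scalar SSE2 multiply/accumulate pattern repeats for the remaining
	   k values of this unroll, with the A stream prefetched
	   (PREFETCHSIZE + 8) elements ahead. */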
addsd %xmm7, %xmm14 movsd 11 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 12 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm4, %xmm10 movsd 13 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 6 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 14 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 7 * SIZE(BO), %xmm3 addsd %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 15 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 subq $-16 * SIZE, AO addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 0 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addq $ 8 * SIZE, BO addsd %xmm4, %xmm10 movsd 1 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 decq %rax addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 0 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 2 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 1 * SIZE(BO), %xmm3 jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH je .L19 ALIGN_4 .L16: addsd %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 2 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addq $4 * SIZE, AO addq $2 * SIZE, BO decq %rax BRANCH jg .L16 ALIGN_4 .L19: addsd %xmm2, %xmm13 addsd %xmm7, %xmm14 addsd %xmm6, %xmm15 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 movsd 1 * SIZE(BO), %xmm1 movsd 2 * SIZE(BO), %xmm2 movsd 3 * SIZE(BO), %xmm3 movsd 4 * SIZE(BO), %xmm4 movsd 5 * SIZE(BO), %xmm5 movsd 6 * SIZE(BO), %xmm6 movsd 7 * SIZE(BO), %xmm7 subsd %xmm8, %xmm0 subsd %xmm9, %xmm1 subsd %xmm10, %xmm2 subsd %xmm11, %xmm3 subsd %xmm12, %xmm4 subsd %xmm13, %xmm5 subsd %xmm14, %xmm6 subsd %xmm15, %xmm7 #else movsd 0 * SIZE(AO), %xmm0 movsd 1 * SIZE(AO), %xmm2 movsd 2 * SIZE(AO), %xmm4 movsd 3 * SIZE(AO), %xmm6 movsd 4 * SIZE(AO), %xmm1 movsd 5 * SIZE(AO), %xmm3 movsd 6 * SIZE(AO), %xmm5 movsd 7 * SIZE(AO), %xmm7 subsd %xmm8, %xmm0 subsd %xmm10, %xmm2 subsd %xmm12, %xmm4 subsd %xmm14, %xmm6 subsd %xmm9, %xmm1 subsd %xmm11, %xmm3 subsd %xmm13, %xmm5 subsd %xmm15, %xmm7 #endif #ifdef LN movsd 15 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm6 movsd 14 * SIZE(AO), %xmm9 mulsd %xmm8, %xmm7 movsd 13 * SIZE(AO), %xmm11 movaps %xmm9, %xmm10 movsd 12 * SIZE(AO), %xmm13 mulsd %xmm6, %xmm9 movsd 10 * SIZE(AO), %xmm8 mulsd %xmm7, %xmm10 subsd %xmm9, %xmm4 movsd 9 * SIZE(AO), %xmm9 subsd %xmm10, %xmm5 movaps %xmm11, %xmm12 mulsd %xmm6, %xmm11 mulsd %xmm7, %xmm12 subsd %xmm11, %xmm2 movsd 8 * SIZE(AO), %xmm11 subsd %xmm12, %xmm3 movaps %xmm13, %xmm14 mulsd %xmm6, %xmm13 mulsd %xmm7, %xmm14 subsd %xmm13, %xmm0 subsd %xmm14, %xmm1 mulsd %xmm8, %xmm4 mulsd %xmm8, %xmm5 movsd 5 * SIZE(AO), %xmm8 movaps %xmm9, %xmm10 mulsd %xmm4, %xmm9 mulsd %xmm5, %xmm10 
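	/* LN back-substitution over the 4x4 triangular block of A, bottom row
	   first: each solved row pair (two B columns) is scaled by the stored
	   reciprocal diagonal and then eliminated from the rows above it. */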
subsd %xmm9, %xmm2 movsd 4 * SIZE(AO), %xmm9 subsd %xmm10, %xmm3 movaps %xmm11, %xmm12 mulsd %xmm4, %xmm11 mulsd %xmm5, %xmm12 subsd %xmm11, %xmm0 movsd 0 * SIZE(AO), %xmm11 subsd %xmm12, %xmm1 mulsd %xmm8, %xmm2 mulsd %xmm8, %xmm3 movaps %xmm9, %xmm10 mulsd %xmm2, %xmm9 mulsd %xmm3, %xmm10 subsd %xmm9, %xmm0 subsd %xmm10, %xmm1 mulsd %xmm11, %xmm0 mulsd %xmm11, %xmm1 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 movsd 1 * SIZE(AO), %xmm9 mulsd %xmm8, %xmm1 movsd 2 * SIZE(AO), %xmm11 movaps %xmm9, %xmm10 movsd 3 * SIZE(AO), %xmm13 mulsd %xmm0, %xmm9 movsd 5 * SIZE(AO), %xmm8 mulsd %xmm1, %xmm10 subsd %xmm9, %xmm2 movsd 6 * SIZE(AO), %xmm9 subsd %xmm10, %xmm3 movaps %xmm11, %xmm12 mulsd %xmm0, %xmm11 mulsd %xmm1, %xmm12 subsd %xmm11, %xmm4 movsd 7 * SIZE(AO), %xmm11 subsd %xmm12, %xmm5 movaps %xmm13, %xmm14 mulsd %xmm0, %xmm13 mulsd %xmm1, %xmm14 subsd %xmm13, %xmm6 subsd %xmm14, %xmm7 mulsd %xmm8, %xmm2 mulsd %xmm8, %xmm3 movsd 10 * SIZE(AO), %xmm8 movaps %xmm9, %xmm10 mulsd %xmm2, %xmm9 mulsd %xmm3, %xmm10 subsd %xmm9, %xmm4 movsd 11 * SIZE(AO), %xmm9 subsd %xmm10, %xmm5 movaps %xmm11, %xmm12 mulsd %xmm2, %xmm11 mulsd %xmm3, %xmm12 subsd %xmm11, %xmm6 subsd %xmm12, %xmm7 mulsd %xmm8, %xmm4 mulsd %xmm8, %xmm5 movsd 15 * SIZE(AO), %xmm8 movaps %xmm9, %xmm10 mulsd %xmm4, %xmm9 mulsd %xmm5, %xmm10 subsd %xmm9, %xmm6 subsd %xmm10, %xmm7 mulsd %xmm8, %xmm6 mulsd %xmm8, %xmm7 #endif #ifdef RN movsd 0 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 movsd 1 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm2 movsd 3 * SIZE(BO), %xmm13 mulsd %xmm8, %xmm4 mulsd %xmm8, %xmm6 movaps %xmm9, %xmm10 movaps %xmm9, %xmm11 movaps %xmm9, %xmm12 mulsd %xmm0, %xmm9 mulsd %xmm2, %xmm10 mulsd %xmm4, %xmm11 mulsd %xmm6, %xmm12 subsd %xmm9, %xmm1 subsd %xmm10, %xmm3 subsd %xmm11, %xmm5 subsd %xmm12, %xmm7 mulsd %xmm13, %xmm1 mulsd %xmm13, %xmm3 mulsd %xmm13, %xmm5 mulsd %xmm13, %xmm7 #endif #ifdef RT movsd 3 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm1 movsd 2 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm3 movsd 0 * SIZE(BO), %xmm13 mulsd %xmm8, %xmm5 mulsd %xmm8, %xmm7 movaps %xmm9, %xmm10 movaps %xmm9, %xmm11 movaps %xmm9, %xmm12 mulsd %xmm1, %xmm9 mulsd %xmm3, %xmm10 mulsd %xmm5, %xmm11 mulsd %xmm7, %xmm12 subsd %xmm9, %xmm0 subsd %xmm10, %xmm2 subsd %xmm11, %xmm4 subsd %xmm12, %xmm6 mulsd %xmm13, %xmm0 mulsd %xmm13, %xmm2 mulsd %xmm13, %xmm4 mulsd %xmm13, %xmm6 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm2, 1 * SIZE(CO1) movsd %xmm4, 2 * SIZE(CO1) movsd %xmm6, 3 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movsd %xmm3, 1 * SIZE(CO2) movsd %xmm5, 2 * SIZE(CO2) movsd %xmm7, 3 * SIZE(CO2) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BO) movsd %xmm1, 1 * SIZE(BO) movsd %xmm2, 2 * SIZE(BO) movsd %xmm3, 3 * SIZE(BO) movsd %xmm4, 4 * SIZE(BO) movsd %xmm5, 5 * SIZE(BO) movsd %xmm6, 6 * SIZE(BO) movsd %xmm7, 7 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) movsd %xmm2, 1 * SIZE(AO) movsd %xmm4, 2 * SIZE(AO) movsd %xmm6, 3 * SIZE(AO) movsd %xmm1, 4 * SIZE(AO) movsd %xmm3, 5 * SIZE(AO) movsd %xmm5, 6 * SIZE(AO) movsd %xmm7, 7 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L11 ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq 
BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif decq J # j -- jg .L10 ALIGN_4 .L40: testq $1, N je .L999 ALIGN_4 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif testq $1, M je .L50 #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm5, %xmm5 movsd 1 * SIZE(AO), %xmm2 xorps %xmm7, %xmm7 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 movsd 1 * SIZE(BO), %xmm3 xorps %xmm9, %xmm9 movsd 2 * SIZE(AO), %xmm4 movsd 3 * SIZE(AO), %xmm6 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L65 ALIGN_4 .L62: addsd %xmm5, %xmm8 movsd 2 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm1 movsd 4 * SIZE(AO), %xmm0 addsd %xmm7, %xmm9 movsd 3 * SIZE(BO), %xmm7 mulsd %xmm2, %xmm3 movsd 5 * SIZE(AO), %xmm2 addsd %xmm1, %xmm8 movsd 4 * SIZE(BO), %xmm1 mulsd %xmm4, %xmm5 movsd 6 * SIZE(AO), %xmm4 addsd %xmm3, %xmm9 movsd 5 * SIZE(BO), %xmm3 mulsd %xmm6, %xmm7 movsd 7 * SIZE(AO), %xmm6 addq $4 * SIZE, AO addq $4 * SIZE, BO decq %rax jne .L62 addsd %xmm5, %xmm8 addsd %xmm7, %xmm9 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH je .L68 ALIGN_4 .L66: movsd 0 * SIZE(AO), %xmm0 movsd 0 * SIZE(BO), %xmm1 mulsd %xmm0, %xmm1 addsd %xmm1, %xmm8 addq $1 * SIZE, AO addq $1 * SIZE, BO decq %rax BRANCH jg .L66 ALIGN_4 .L68: addsd %xmm9, %xmm8 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 subsd %xmm8, %xmm0 #else movsd 0 * SIZE(AO), %xmm0 subsd %xmm8, %xmm0 #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 #endif #ifdef LN subq $1 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L50: testq $2, M je .L60 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd 1 * SIZE(AO), %xmm1 xorps %xmm3, %xmm3 movsd 0 * SIZE(BO), %xmm4 xorps %xmm8, %xmm8 movsd 1 * SIZE(BO), %xmm5 xorps %xmm10, %xmm10 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L55 ALIGN_4 .L52: addsd %xmm2, %xmm8 movsd 2 * SIZE(AO), %xmm2 mulsd %xmm4, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm3, %xmm10 movsd 3 * SIZE(AO), %xmm3 mulsd %xmm4, %xmm1 movsd 2 * SIZE(BO), %xmm4 addsd %xmm0, 
%xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm5, %xmm2 addq $8 * SIZE, AO addsd %xmm1, %xmm10 movsd -3 * SIZE(AO), %xmm1 mulsd %xmm5, %xmm3 movsd 3 * SIZE(BO), %xmm5 addsd %xmm2, %xmm8 movsd -2 * SIZE(AO), %xmm2 mulsd %xmm4, %xmm0 addq $4 * SIZE, BO addsd %xmm3, %xmm10 movsd -1 * SIZE(AO), %xmm3 mulsd %xmm4, %xmm1 movsd 0 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 0 * SIZE(AO), %xmm0 mulsd %xmm5, %xmm2 decq %rax addsd %xmm1, %xmm10 movsd 1 * SIZE(AO), %xmm1 mulsd %xmm5, %xmm3 movsd 1 * SIZE(BO), %xmm5 jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif addsd %xmm2, %xmm8 addsd %xmm3, %xmm10 andq $3, %rax BRANCH je .L59 ALIGN_4 .L56: mulsd %xmm4, %xmm0 mulsd %xmm4, %xmm1 movsd 1 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 2 * SIZE(AO), %xmm0 addsd %xmm1, %xmm10 movsd 3 * SIZE(AO), %xmm1 addq $2 * SIZE, AO addq $1 * SIZE, BO decq %rax BRANCH jg .L56 ALIGN_4 .L59: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 movsd 1 * SIZE(BO), %xmm2 subsd %xmm8, %xmm0 subsd %xmm10, %xmm2 #else movsd 0 * SIZE(AO), %xmm0 movsd 1 * SIZE(AO), %xmm2 subsd %xmm8, %xmm0 subsd %xmm10, %xmm2 #endif #ifdef LN movsd 3 * SIZE(AO), %xmm8 movsd 2 * SIZE(AO), %xmm9 movsd 0 * SIZE(AO), %xmm11 mulsd %xmm8, %xmm2 mulsd %xmm2, %xmm9 subsd %xmm9, %xmm0 mulsd %xmm11,%xmm0 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm8 movsd 1 * SIZE(AO), %xmm9 movsd 3 * SIZE(AO), %xmm11 mulsd %xmm8, %xmm0 mulsd %xmm0, %xmm9 subsd %xmm9, %xmm2 mulsd %xmm11,%xmm2 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 mulsd %xmm8, %xmm2 #endif #ifdef LN subq $2 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm2, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BO) movsd %xmm2, 1 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) movsd %xmm2, 1 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L60: movq M, I sarq $2, I jle .L69 ALIGN_4 .L41: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm9, %xmm9 movsd 1 * SIZE(AO), %xmm1 xorps %xmm11, %xmm11 movsd 2 * SIZE(AO), %xmm2 xorps %xmm13, %xmm13 movsd 3 * SIZE(AO), %xmm3 xorps %xmm15, %xmm15 movsd 0 * SIZE(BO), %xmm4 xorps %xmm8, %xmm8 movsd 1 * SIZE(BO), %xmm5 xorps %xmm10, %xmm10 prefetcht0 3 * SIZE(CO1) xorps %xmm12, %xmm12 xorps %xmm14, %xmm14 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L45 ALIGN_4 .L42: addsd %xmm9, %xmm8 movsd 4 * SIZE(AO), %xmm9 mulsd %xmm4, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm11, %xmm10 movsd 5 * SIZE(AO), %xmm11 mulsd %xmm4, %xmm1 addsd %xmm13, %xmm12 movsd 6 * SIZE(AO), %xmm13 mulsd %xmm4, %xmm2 addsd %xmm15, %xmm14 movsd 7 * SIZE(AO), %xmm15 mulsd %xmm4, %xmm3 movsd 2 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 8 * SIZE(AO), %xmm0 mulsd %xmm5, %xmm9 addsd %xmm1, %xmm10 movsd 9 * SIZE(AO), %xmm1 mulsd %xmm5, %xmm11 addsd 
%xmm2, %xmm12 movsd 10 * SIZE(AO), %xmm2 mulsd %xmm5, %xmm13 addsd %xmm3, %xmm14 movsd 11 * SIZE(AO), %xmm3 mulsd %xmm5, %xmm15 movsd 3 * SIZE(BO), %xmm5 addsd %xmm9, %xmm8 movsd 12 * SIZE(AO), %xmm9 mulsd %xmm4, %xmm0 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) addsd %xmm11, %xmm10 movsd 13 * SIZE(AO), %xmm11 mulsd %xmm4, %xmm1 addsd %xmm13, %xmm12 movsd 14 * SIZE(AO), %xmm13 mulsd %xmm4, %xmm2 addsd %xmm15, %xmm14 movsd 15 * SIZE(AO), %xmm15 mulsd %xmm4, %xmm3 movsd 4 * SIZE(BO), %xmm4 subq $-16 * SIZE, AO addsd %xmm0, %xmm8 movsd 0 * SIZE(AO), %xmm0 mulsd %xmm5, %xmm9 addsd %xmm1, %xmm10 movsd 1 * SIZE(AO), %xmm1 mulsd %xmm5, %xmm11 addq $ 4 * SIZE, BO addsd %xmm2, %xmm12 movsd 2 * SIZE(AO), %xmm2 mulsd %xmm5, %xmm13 decq %rax addsd %xmm3, %xmm14 movsd 3 * SIZE(AO), %xmm3 mulsd %xmm5, %xmm15 movsd 1 * SIZE(BO), %xmm5 jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif addsd %xmm9, %xmm8 addsd %xmm11, %xmm10 addsd %xmm13, %xmm12 addsd %xmm15, %xmm14 andq $3, %rax BRANCH BRANCH je .L49 ALIGN_4 .L46: mulsd %xmm4, %xmm0 mulsd %xmm4, %xmm1 mulsd %xmm4, %xmm2 mulsd %xmm4, %xmm3 movsd 1 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 addsd %xmm1, %xmm10 movsd 5 * SIZE(AO), %xmm1 addsd %xmm2, %xmm12 movsd 6 * SIZE(AO), %xmm2 addsd %xmm3, %xmm14 movsd 7 * SIZE(AO), %xmm3 addq $4 * SIZE, AO addq $1 * SIZE, BO decq %rax BRANCH jg .L46 ALIGN_4 .L49: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 movsd 1 * SIZE(BO), %xmm2 movsd 2 * SIZE(BO), %xmm4 movsd 3 * SIZE(BO), %xmm6 subsd %xmm8, %xmm0 subsd %xmm10, %xmm2 subsd %xmm12, %xmm4 subsd %xmm14, %xmm6 #else movsd 0 * SIZE(AO), %xmm0 movsd 1 * SIZE(AO), %xmm2 movsd 2 * SIZE(AO), %xmm4 movsd 3 * SIZE(AO), %xmm6 subsd %xmm8, %xmm0 subsd %xmm10, %xmm2 subsd %xmm12, %xmm4 subsd %xmm14, %xmm6 #endif #ifdef LN movsd 15 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm6 movsd 14 * SIZE(AO), %xmm9 mulsd %xmm6, %xmm9 movsd 13 * SIZE(AO), %xmm11 subsd %xmm9, %xmm4 movsd 12 * SIZE(AO), %xmm13 mulsd %xmm6, %xmm11 movsd 10 * SIZE(AO), %xmm8 subsd %xmm11, %xmm2 movsd 9 * SIZE(AO), %xmm9 mulsd %xmm6, %xmm13 movsd 8 * SIZE(AO), %xmm11 subsd %xmm13, %xmm0 mulsd %xmm8, %xmm4 movsd 5 * SIZE(AO), %xmm8 mulsd %xmm4, %xmm9 subsd %xmm9, %xmm2 movsd 4 * SIZE(AO), %xmm9 mulsd %xmm4, %xmm11 subsd %xmm11, %xmm0 movsd 0 * SIZE(AO), %xmm11 mulsd %xmm8, %xmm2 mulsd %xmm2, %xmm9 subsd %xmm9, %xmm0 mulsd %xmm11, %xmm0 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 movsd 1 * SIZE(AO), %xmm9 mulsd %xmm0, %xmm9 movsd 2 * SIZE(AO), %xmm11 subsd %xmm9, %xmm2 movsd 3 * SIZE(AO), %xmm13 mulsd %xmm0, %xmm11 movsd 5 * SIZE(AO), %xmm8 subsd %xmm11, %xmm4 movsd 6 * SIZE(AO), %xmm9 mulsd %xmm0, %xmm13 movsd 7 * SIZE(AO), %xmm11 subsd %xmm13, %xmm6 mulsd %xmm8, %xmm2 movsd 10 * SIZE(AO), %xmm8 mulsd %xmm2, %xmm9 subsd %xmm9, %xmm4 movsd 11 * SIZE(AO), %xmm9 mulsd %xmm2, %xmm11 subsd %xmm11, %xmm6 mulsd %xmm8, %xmm4 movsd 15 * SIZE(AO), %xmm8 mulsd %xmm4, %xmm9 subsd %xmm9, %xmm6 mulsd %xmm8, %xmm6 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 mulsd %xmm8, %xmm2 mulsd %xmm8, %xmm4 mulsd %xmm8, %xmm6 #endif #ifdef LN subq $4 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm2, 1 * SIZE(CO1) movsd %xmm4, 2 * SIZE(CO1) movsd %xmm6, 3 * SIZE(CO1) #if defined(LN) || defined(LT) movsd %xmm0, 0 * 
SIZE(BO) movsd %xmm2, 1 * SIZE(BO) movsd %xmm4, 2 * SIZE(BO) movsd %xmm6, 3 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) movsd %xmm2, 1 * SIZE(AO) movsd %xmm4, 2 * SIZE(AO) movsd %xmm6, 3 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L41 ALIGN_4 .L69: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_2 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_LN_4x4_barcelona.S000066400000000000000000001776531313527062700231540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %r12 #define BB %rbp #define J %rbx #ifndef WINDOWS_ABI #define STACKSIZE 96 #define OFFSET 48(%rsp) #define AORIG 56(%rsp) #define KK 64(%rsp) #define KKK 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define AORIG 232(%rsp) #define KK 240(%rsp) #define KKK 248(%rsp) #endif #define PREFETCH prefetch #define PREFETCHSIZE (8 * 7 + 0) #define movlpd movsd #define movapd movups #define movupd movups #define KERNEL1(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm8 ;\ movapd %xmm2, %xmm0 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %rax, 4) ;\ addpd %xmm1, %xmm12 ;\ movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm10 ;\ movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ addpd %xmm1, %xmm14 ;\ movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 #define KERNEL2(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm8 ;\ movapd %xmm2, %xmm0 ;\ addpd %xmm1, %xmm12 ;\ movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm10 ;\ addpd %xmm1, %xmm14 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ /**/ movddup (BO, %rax, 4), %xmm1 ;\ movapd %xmm4, %xmm2 #define KERNEL3(xx) \ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm8 ;\ movapd %xmm2, %xmm4 ;\ addpd %xmm5, %xmm12 ;\ movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm10 ;\ movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ addpd %xmm5, %xmm14 ;\ movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL4(xx) \ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm8 ;\ movapd %xmm2, %xmm4 ;\ addpd %xmm5, %xmm12 ;\ movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ /***/ movapd (AO, %rax, 
4), %xmm6 ;\ addpd %xmm4, %xmm10 ;\ addpd %xmm5, %xmm14 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ movddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ movapd %xmm6, %xmm2 #define KERNEL5(xx) \ mulpd %xmm1, %xmm6 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm6, %xmm8 ;\ movapd %xmm2, %xmm6 ;\ addpd %xmm1, %xmm12 ;\ movddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ /**/ movapd 8 * SIZE(AO, %rax, 4), %xmm7 ;\ movapd %xmm6, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm6 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm6, %xmm10 ;\ movapd 4 * SIZE(AO, %rax, 4), %xmm6 ;\ addpd %xmm1, %xmm14 ;\ movddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm6, %xmm2 #define KERNEL6(xx) \ mulpd %xmm1, %xmm6 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm6, %xmm8 ;\ movapd %xmm2, %xmm6 ;\ addpd %xmm1, %xmm12 ;\ movddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm6, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm6 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm6, %xmm10 ;\ /***/ movapd 16 * SIZE(AO, %rax, 4), %xmm0 ;\ addpd %xmm1, %xmm14 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ movddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ movapd %xmm7, %xmm2 #define KERNEL7(xx) \ mulpd %xmm5, %xmm7 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm7, %xmm8 ;\ movapd %xmm2, %xmm7 ;\ addpd %xmm5, %xmm12 ;\ movddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm7, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm7 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm7, %xmm10 ;\ movapd 12 * SIZE(AO, %rax, 4), %xmm7 ;\ addpd %xmm5, %xmm14 ;\ movddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm7, %xmm2 #define KERNEL8(xx) \ mulpd %xmm5, %xmm7 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm7, %xmm8 ;\ movapd %xmm2, %xmm7 ;\ addpd %xmm5, %xmm12 ;\ movddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm7, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm7 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm7, %xmm10 ;\ addpd %xmm5, %xmm14 ;\ /**/ movapd 24 * SIZE(AO, %rax, 4), %xmm4 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm3, %xmm15 ;\ movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ movapd %xmm0, %xmm2 ;\ addq $8 * SIZE, %rax #define KERNEL_SUB1(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm8 ;\ movapd %xmm2, %xmm0 ;\ addpd %xmm1, %xmm12 ;\ movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -13 * SIZE(BO, 
%rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm10 ;\ movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ addpd %xmm1, %xmm14 ;\ movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 #define KERNEL_SUB2(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm8 ;\ movapd %xmm2, %xmm0 ;\ addpd %xmm1, %xmm12 ;\ movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm10 ;\ movapd (AO, %rax, 4), %xmm0 ;\ addpd %xmm1, %xmm14 ;\ movddup (BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL_SUB3(xx) \ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm8 ;\ movapd %xmm2, %xmm4 ;\ addpd %xmm5, %xmm12 ;\ movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm10 ;\ movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ addpd %xmm5, %xmm14 ;\ movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL_SUB4(xx) \ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm8 ;\ movapd %xmm2, %xmm4 ;\ addpd %xmm5, %xmm12 ;\ movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm10 ;\ addpd %xmm5, %xmm14 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm12 #else movq STACKSIZE + 8(%rsp), LDC movsd STACKSIZE + 16(%rsp), %xmm12 #endif movq OLD_M, M movq OLD_N, N subq $-16 * SIZE, A subq $-16 * SIZE, B movsd %xmm12, OFFSET movsd %xmm12, KK leaq (, LDC, SIZE), LDC #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $2, J # j = 
(n >> 2) jle .L40 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 4), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 2, %rax leaq (B, %rax), BB #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif testq $1, M je .L20 #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 4), BO #endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movddup -14 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movddup -15 * SIZE(AO), %xmm4 pxor %xmm10, %xmm10 movapd -16 * SIZE(BO), %xmm1 pxor %xmm11, %xmm11 movapd -8 * SIZE(BO), %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO negq %rax NOBRANCH je .L36 ALIGN_4 .L32: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BO, %rax, 4), %xmm0 addpd %xmm1, %xmm8 movapd -12 * SIZE(BO, %rax, 4), %xmm1 addpd %xmm0, %xmm9 movddup -12 * SIZE(AO, %rax, 1), %xmm0 mulpd %xmm4, %xmm1 mulpd -10 * SIZE(BO, %rax, 4), %xmm4 addpd %xmm1, %xmm10 movapd (BO, %rax, 4), %xmm1 addpd %xmm4, %xmm11 movddup -11 * SIZE(AO, %rax, 1), %xmm4 mulpd %xmm2, %xmm3 mulpd -6 * SIZE(BO, %rax, 4), %xmm2 addpd %xmm3, %xmm8 movapd -4 * SIZE(BO, %rax, 4), %xmm3 addpd %xmm2, %xmm9 movddup -13 * SIZE(AO, %rax, 1), %xmm2 mulpd %xmm2, %xmm3 mulpd -2 * SIZE(BO, %rax, 4), %xmm2 addpd %xmm3, %xmm10 movapd 8 * SIZE(BO, %rax, 4), %xmm3 addpd %xmm2, %xmm11 movddup -10 * SIZE(AO, %rax, 1), %xmm2 addq $4 * SIZE, %rax BRANCH jl .L32 ALIGN_4 .L36: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L38 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO negq %rax ALIGN_4 .L37: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BO, %rax, 4), %xmm0 addpd %xmm1, %xmm8 movapd -12 * SIZE(BO, %rax, 4), %xmm1 addpd %xmm0, %xmm9 movddup -15 * SIZE(AO, %rax, 1), %xmm0 addq $SIZE, %rax jl .L37 ALIGN_4 .L38: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 subpd %xmm8, %xmm2 subpd %xmm9, %xmm3 #else movapd -16 * SIZE(AO), %xmm2 movapd -14 * SIZE(AO), %xmm3 subpd %xmm8, %xmm2 subpd %xmm9, %xmm3 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 #endif #ifdef RN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd -16 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm2 movsd -15 * SIZE(BO), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd -14 * SIZE(BO), %xmm6 mulsd %xmm2, %xmm6 subsd %xmm6, %xmm3 movsd -13 * SIZE(BO), %xmm7 mulsd %xmm2, %xmm7 subsd %xmm7, %xmm1 movsd -11 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm0 movsd -10 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm3 movsd -9 * SIZE(BO), %xmm6 mulsd %xmm0, %xmm6 subsd %xmm6, %xmm1 movsd -6 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm3 movsd -5 * SIZE(BO), %xmm5 
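	/* RN solve for the 1x4 tile: forward substitution through the 4x4
	   triangular block of B, handled one scalar at a time (the packed pairs
	   were split with unpckhpd above and are re-combined with unpcklpd
	   below); diagonal entries are multiplied in, i.e. stored as
	   reciprocals. */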
mulsd %xmm3, %xmm5 subsd %xmm5, %xmm1 movsd -1 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm1 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef RT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd -1 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm1 movsd -2 * SIZE(BO), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm3 movsd -3 * SIZE(BO), %xmm6 mulsd %xmm1, %xmm6 subsd %xmm6, %xmm0 movsd -4 * SIZE(BO), %xmm7 mulsd %xmm1, %xmm7 subsd %xmm7, %xmm2 movsd -6 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm3 movsd -7 * SIZE(BO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm0 movsd -8 * SIZE(BO), %xmm6 mulsd %xmm3, %xmm6 subsd %xmm6, %xmm2 movsd -11 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm0 movsd -12 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd -16 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO2) movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) #else movlpd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO2) movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movaps %xmm2, -16 * SIZE(BO) movaps %xmm3, -14 * SIZE(BO) #else movaps %xmm2, -16 * SIZE(AO) movaps %xmm3, -14 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L20: testq $2, M je .L30 ALIGN_4 .L21: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 4), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd -12 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movddup -16 * SIZE(BO), %xmm1 pxor %xmm10, %xmm10 movddup -15 * SIZE(BO), %xmm5 pxor %xmm11, %xmm11 movddup -8 * SIZE(BO), %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO negq %rax NOBRANCH je .L26 ALIGN_4 .L22: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 addpd %xmm5, %xmm9 movddup -13 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movddup -12 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 movapd -14 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm5, %xmm11 movddup -11 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -10 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 addpd %xmm5, %xmm9 movddup -9 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movddup (BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 movapd -8 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm5, %xmm11 movddup -7 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm8 movddup -6 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 addpd %xmm5, %xmm9 movddup -5 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm10 movddup -4 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 movapd -10 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm5, %xmm11 movddup -3 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm8 
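	/* 2x4 micro-kernel k loop (.L22): the two A values for each k stay
	   packed in one register while the four B values are broadcast with
	   movddup, giving four packed accumulators (xmm8-xmm11), one per
	   column of the 4-wide N block. */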
movddup -2 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 addpd %xmm5, %xmm9 movddup -1 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm10 movddup 8 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 movapd -4 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm5, %xmm11 movddup 1 * SIZE(BO, %rax, 4), %xmm5 addq $4 * SIZE, %rax BRANCH jl .L22 ALIGN_4 .L26: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L29 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO negq %rax ALIGN_4 .L27: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 addpd %xmm5, %xmm9 movddup -13 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movddup -12 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 movapd -14 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm5, %xmm11 movddup -11 * SIZE(BO, %rax, 4), %xmm5 addq $SIZE, %rax jl .L27 ALIGN_4 .L29: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm10, %xmm2 unpcklpd %xmm11, %xmm10 unpckhpd %xmm11, %xmm2 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 movapd -12 * SIZE(BO), %xmm13 movapd -10 * SIZE(BO), %xmm15 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm0, %xmm13 subpd %xmm2, %xmm15 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm2 movapd -12 * SIZE(AO), %xmm4 movapd -10 * SIZE(AO), %xmm6 subpd %xmm8, %xmm0 subpd %xmm9, %xmm2 subpd %xmm10, %xmm4 subpd %xmm11, %xmm6 #endif #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -14 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -14 * SIZE(AO), %xmm10 mulpd %xmm15, %xmm10 subpd %xmm10, %xmm11 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm11, %xmm10 subpd %xmm10, %xmm15 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -14 * SIZE(BO), %xmm10 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm4 movddup -13 * SIZE(BO), %xmm11 mulpd %xmm0, %xmm11 subpd %xmm11, %xmm6 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 movddup -10 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm4 movddup -9 * SIZE(BO), %xmm10 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm6 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 movddup -5 * SIZE(BO), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm6 movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 #endif #ifdef RT movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 movddup -2 * SIZE(BO), %xmm9 mulpd %xmm6, %xmm9 subpd %xmm9, %xmm4 movddup -3 * SIZE(BO), %xmm10 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm2 movddup -4 * SIZE(BO), %xmm11 mulpd %xmm6, %xmm11 subpd %xmm11, %xmm0 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 movddup -7 * SIZE(BO), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm2 movddup -8 * SIZE(BO), %xmm10 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm0 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 movddup -12 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, 
%xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movlpd %xmm9, 0 * SIZE(CO1) movlpd %xmm13, 1 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) #else movlpd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movlpd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movaps %xmm9, -16 * SIZE(BO) movaps %xmm11, -14 * SIZE(BO) movaps %xmm13, -12 * SIZE(BO) movaps %xmm15, -10 * SIZE(BO) #else movaps %xmm0, -16 * SIZE(AO) movaps %xmm2, -14 * SIZE(AO) movaps %xmm4, -12 * SIZE(AO) movaps %xmm6, -10 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: movq M, I sarq $2, I # i = (m >> 2) jle .L39 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 4), BO #endif movapd -16 * SIZE(AO), %xmm0 movddup -16 * SIZE(BO), %xmm1 pxor %xmm8, %xmm8 movddup -15 * SIZE(BO), %xmm3 pxor %xmm9, %xmm9 movapd -8 * SIZE(AO), %xmm4 pxor %xmm10, %xmm10 movddup -8 * SIZE(BO), %xmm5 pxor %xmm11, %xmm11 #ifndef LN prefetchw 3 * SIZE(CO1) pxor %xmm12, %xmm12 prefetchw 3 * SIZE(CO2) pxor %xmm13, %xmm13 prefetchw 3 * SIZE(CO1, LDC, 2) pxor %xmm14, %xmm14 prefetchw 3 * SIZE(CO2, LDC, 2) pxor %xmm15, %xmm15 movapd %xmm0, %xmm2 #else prefetchw -8 * SIZE(CO1) pxor %xmm12, %xmm12 prefetchw -8 * SIZE(CO2) pxor %xmm13, %xmm13 prefetchw -8 * SIZE(CO1, LDC, 2) pxor %xmm14, %xmm14 prefetchw -8 * SIZE(CO2, LDC, 2) pxor %xmm15, %xmm15 movapd %xmm0, %xmm2 #endif prefetch -10 * SIZE(BB) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-8, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO negq %rax NOBRANCH je .L15 ALIGN_4 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) BRANCH jl .L12 ALIGN_4 .L15: prefetch 14 * SIZE(BB) subq $-16 * SIZE, BB #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif testq $4, %rax je .L16 xorq %rax, %rax ALIGN_4 KERNEL_SUB1(16 * 0) KERNEL_SUB2(16 * 0) KERNEL_SUB3(16 * 0) KERNEL_SUB4(16 * 0) subq $-16 * SIZE, BO subq $-16 * SIZE, AO ALIGN_4 .L16: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L19 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO negq %rax ALIGN_4 .L17: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd %xmm2, %xmm0 addpd %xmm1, %xmm12 movddup -14 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm3, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm2, %xmm9 movapd %xmm0, %xmm2 addpd %xmm3, %xmm13 movddup -13 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm1, %xmm0 mulpd -14 * 
SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm10 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm14 movddup -12 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm3, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm2, %xmm11 addpd %xmm3, %xmm15 movddup -11 * SIZE(BO, %rax, 4), %xmm3 movapd %xmm0, %xmm2 addq $SIZE, %rax jl .L17 ALIGN_4 .L19: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm10, %xmm2 unpcklpd %xmm11, %xmm10 unpckhpd %xmm11, %xmm2 movapd %xmm12, %xmm4 unpcklpd %xmm13, %xmm12 unpckhpd %xmm13, %xmm4 movapd %xmm14, %xmm6 unpcklpd %xmm15, %xmm14 unpckhpd %xmm15, %xmm6 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 movapd -12 * SIZE(BO), %xmm13 movapd -10 * SIZE(BO), %xmm15 movapd -8 * SIZE(BO), %xmm1 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm5 movapd -2 * SIZE(BO), %xmm7 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm0, %xmm13 subpd %xmm2, %xmm15 subpd %xmm12, %xmm1 subpd %xmm14, %xmm3 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 movapd -8 * SIZE(AO), %xmm4 movapd -6 * SIZE(AO), %xmm5 movapd -4 * SIZE(AO), %xmm6 movapd -2 * SIZE(AO), %xmm7 subpd %xmm8, %xmm0 subpd %xmm12, %xmm1 subpd %xmm9, %xmm2 subpd %xmm13, %xmm3 subpd %xmm10, %xmm4 subpd %xmm14, %xmm5 subpd %xmm11, %xmm6 subpd %xmm15, %xmm7 #endif #ifdef LN movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 movddup -2 * SIZE(AO), %xmm10 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm1 movddup -2 * SIZE(AO), %xmm10 mulpd %xmm7, %xmm10 subpd %xmm10, %xmm3 movddup -3 * SIZE(AO), %xmm12 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm13 movddup -3 * SIZE(AO), %xmm12 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm15 movddup -4 * SIZE(AO), %xmm14 mulpd %xmm5, %xmm14 subpd %xmm14, %xmm9 movddup -4 * SIZE(AO), %xmm14 mulpd %xmm7, %xmm14 subpd %xmm14, %xmm11 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 movddup -7 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm13 movddup -7 * SIZE(AO), %xmm10 mulpd %xmm3, %xmm10 subpd %xmm10, %xmm15 movddup -8 * SIZE(AO), %xmm12 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm9 movddup -8 * SIZE(AO), %xmm12 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm11 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -12 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -12 * SIZE(AO), %xmm10 mulpd %xmm15, %xmm10 subpd %xmm10, %xmm11 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm11, %xmm10 subpd %xmm10, %xmm15 movddup -14 * SIZE(AO), %xmm12 mulpd %xmm9, %xmm12 subpd %xmm12, %xmm1 movddup -14 * SIZE(AO), %xmm12 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm3 movddup -13 * SIZE(AO), %xmm14 mulpd %xmm9, %xmm14 subpd %xmm14, %xmm5 movddup -13 * SIZE(AO), %xmm14 mulpd %xmm11, %xmm14 subpd %xmm14, %xmm7 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -10 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm1 movddup -10 * SIZE(AO), %xmm10 mulpd %xmm15, %xmm10 subpd %xmm10, %xmm3 movddup -9 * SIZE(AO), %xmm12 mulpd %xmm13, %xmm12 subpd %xmm12, %xmm5 
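	/* LT solve on the 4x4 block of A, two right-hand-side columns per
	   register: each A entry is broadcast with movddup, the solved row is
	   scaled by the stored reciprocal diagonal and then subtracted from the
	   rows below it. */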
movddup -9 * SIZE(AO), %xmm12 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm7 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 movddup -5 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm5 movddup -5 * SIZE(AO), %xmm10 mulpd %xmm3, %xmm10 subpd %xmm10, %xmm7 movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm1, %xmm9 subpd %xmm9, %xmm3 movddup -14 * SIZE(BO), %xmm10 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm4 movddup -14 * SIZE(BO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm5 movddup -13 * SIZE(BO), %xmm11 mulpd %xmm0, %xmm11 subpd %xmm11, %xmm6 movddup -13 * SIZE(BO), %xmm11 mulpd %xmm1, %xmm11 subpd %xmm11, %xmm7 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -10 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm4 movddup -10 * SIZE(BO), %xmm9 mulpd %xmm3, %xmm9 subpd %xmm9, %xmm5 movddup -9 * SIZE(BO), %xmm10 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm6 movddup -9 * SIZE(BO), %xmm10 mulpd %xmm3, %xmm10 subpd %xmm10, %xmm7 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm5 movddup -5 * SIZE(BO), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm6 movddup -5 * SIZE(BO), %xmm9 mulpd %xmm5, %xmm9 subpd %xmm9, %xmm7 movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 mulpd %xmm8, %xmm7 #endif #ifdef RT movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 mulpd %xmm8, %xmm7 movddup -2 * SIZE(BO), %xmm9 mulpd %xmm6, %xmm9 subpd %xmm9, %xmm4 movddup -2 * SIZE(BO), %xmm9 mulpd %xmm7, %xmm9 subpd %xmm9, %xmm5 movddup -3 * SIZE(BO), %xmm10 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm2 movddup -3 * SIZE(BO), %xmm10 mulpd %xmm7, %xmm10 subpd %xmm10, %xmm3 movddup -4 * SIZE(BO), %xmm11 mulpd %xmm6, %xmm11 subpd %xmm11, %xmm0 movddup -4 * SIZE(BO), %xmm11 mulpd %xmm7, %xmm11 subpd %xmm11, %xmm1 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm5 movddup -7 * SIZE(BO), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm2 movddup -7 * SIZE(BO), %xmm9 mulpd %xmm5, %xmm9 subpd %xmm9, %xmm3 movddup -8 * SIZE(BO), %xmm10 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm0 movddup -8 * SIZE(BO), %xmm10 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm1 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -12 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -12 * SIZE(BO), %xmm9 mulpd %xmm3, %xmm9 subpd %xmm9, %xmm1 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movlpd %xmm9, 0 * SIZE(CO1) movlpd %xmm13, 1 * SIZE(CO1) movlpd %xmm1, 2 * SIZE(CO1) movlpd %xmm5, 3 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movhpd %xmm1, 2 * SIZE(CO2) movhpd %xmm5, 3 * SIZE(CO2) movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) movlpd %xmm3, 2 * SIZE(CO1, LDC, 2) movlpd %xmm7, 3 * SIZE(CO1, LDC, 2) movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) #else movlpd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movlpd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movlpd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movlpd %xmm3, 2 * SIZE(CO2) movhpd %xmm3, 3 * SIZE(CO2) movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) movlpd %xmm5, 2 * SIZE(CO1, LDC, 2) movhpd %xmm5, 3 * SIZE(CO1, 
LDC, 2) movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) movlpd %xmm7, 2 * SIZE(CO2, LDC, 2) movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movaps %xmm9, -16 * SIZE(BO) movaps %xmm11, -14 * SIZE(BO) movaps %xmm13, -12 * SIZE(BO) movaps %xmm15, -10 * SIZE(BO) movaps %xmm1, -8 * SIZE(BO) movaps %xmm3, -6 * SIZE(BO) movaps %xmm5, -4 * SIZE(BO) movaps %xmm7, -2 * SIZE(BO) #else movaps %xmm0, -16 * SIZE(AO) movaps %xmm1, -14 * SIZE(AO) movaps %xmm2, -12 * SIZE(AO) movaps %xmm3, -10 * SIZE(AO) movaps %xmm4, -8 * SIZE(AO) movaps %xmm5, -6 * SIZE(AO) movaps %xmm6, -4 * SIZE(AO) movaps %xmm7, -2 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L11 ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif decq J # j -- jg .L01 ALIGN_4 .L40: testq $2, N je .L80 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif testq $1, M je .L60 ALIGN_4 .L71: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 1), BO #endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movddup -15 * SIZE(AO), %xmm1 pxor %xmm9, %xmm9 movddup -14 * SIZE(AO), %xmm2 pxor %xmm10, %xmm10 movddup -13 * SIZE(AO), %xmm3 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO negq %rax NOBRANCH je .L76 ALIGN_4 .L72: mulpd -16 * SIZE(BO, %rax, 2), %xmm0 addpd %xmm0, %xmm8 movddup -12 * SIZE(AO, %rax, 1), %xmm0 mulpd -14 * SIZE(BO, %rax, 2), %xmm1 addpd %xmm1, %xmm9 movddup -11 * SIZE(AO, %rax, 1), %xmm1 mulpd -12 * SIZE(BO, %rax, 2), %xmm2 addpd %xmm2, %xmm10 movddup -10 * SIZE(AO, %rax, 1), %xmm2 mulpd -10 * SIZE(BO, %rax, 2), %xmm3 addpd %xmm3, %xmm11 movddup -9 * SIZE(AO, %rax, 1), %xmm3 addq $4 * SIZE, %rax BRANCH jl .L72 ALIGN_4 .L76: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L78 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L77: mulpd -16 * SIZE(BO, %rax, 2), %xmm0 addpd %xmm0, %xmm8 movddup -15 * SIZE(AO, %rax, 1), %xmm0 addq $SIZE, %rax jl .L77 ALIGN_4 .L78: addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm10, %xmm8 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm2 #else movapd -16 * SIZE(AO), %xmm2 #endif subpd %xmm8, %xmm2 #if 
defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef RN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 mulsd -16 * SIZE(BO), %xmm2 movsd -15 * SIZE(BO), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm0 mulsd -13 * SIZE(BO), %xmm0 unpcklpd %xmm0, %xmm2 #endif #ifdef RT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 mulsd -13 * SIZE(BO), %xmm0 movlpd -14 * SIZE(BO), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm2 mulsd -16 * SIZE(BO), %xmm2 unpcklpd %xmm0, %xmm2 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movlpd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO2) #if defined(LN) || defined(LT) movaps %xmm2, -16 * SIZE(BO) #else movaps %xmm2, -16 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L60: testq $2, M je .L70 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 2), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd -12 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movddup -16 * SIZE(BO), %xmm1 pxor %xmm10, %xmm10 movddup -15 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO negq %rax NOBRANCH je .L66 ALIGN_4 .L62: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm0, %xmm3 movapd -14 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm3, %xmm9 movddup -13 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movddup -12 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm0, %xmm3 movapd -8 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm3, %xmm11 movddup -11 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm8 movddup -10 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm2, %xmm3 movapd -10 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm3, %xmm9 movddup -9 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm10 movddup -8 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm2, %xmm3 movapd -4 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm3, %xmm11 movddup -7 * SIZE(BO, %rax, 2), %xmm3 addq $4 * SIZE, %rax BRANCH jl .L62 ALIGN_4 .L66: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L69 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L67: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm0, %xmm3 movapd -14 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm3, %xmm9 movddup -13 * SIZE(BO, %rax, 2), %xmm3 addq $SIZE, %rax jl .L67 ALIGN_4 .L69: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm13 subpd %xmm8, %xmm9 subpd %xmm0, %xmm13 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm2 subpd %xmm8, %xmm0 subpd 
%xmm9, %xmm2 #endif #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -14 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 #endif #ifdef RT movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 movddup -14 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movlpd %xmm9, 0 * SIZE(CO1) movlpd %xmm13, 1 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) #else movlpd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movlpd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movaps %xmm9, -16 * SIZE(BO) movaps %xmm13, -14 * SIZE(BO) #else movaps %xmm0, -16 * SIZE(AO) movaps %xmm2, -14 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L70: movq M, I sarq $2, I # i = (m >> 2) jle .L79 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 2), BO #endif movddup -16 * SIZE(BO), %xmm1 movddup -15 * SIZE(BO), %xmm5 pxor %xmm8, %xmm8 movddup -12 * SIZE(BO), %xmm3 pxor %xmm9, %xmm9 movapd -16 * SIZE(AO), %xmm0 pxor %xmm12, %xmm12 movapd -8 * SIZE(AO), %xmm4 pxor %xmm13, %xmm13 #ifndef LN prefetchw 3 * SIZE(CO1) movapd %xmm0, %xmm2 prefetchw 3 * SIZE(CO2) #else prefetchw -8 * SIZE(CO1) movapd %xmm0, %xmm2 prefetchw -8 * SIZE(CO2) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO negq %rax NOBRANCH je .L56 ALIGN_4 .L52: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm12 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm5, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -13 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm0, %xmm2 mulpd %xmm1, %xmm0 mulpd -10 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd (AO, %rax, 4), %xmm0 addpd %xmm1, %xmm12 movddup -8 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm5, %xmm2 mulpd -10 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -11 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm4, %xmm2 mulpd %xmm3, %xmm4 mulpd -6 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm4, %xmm8 movapd -4 * SIZE(AO, %rax, 4), %xmm4 addpd %xmm3, %xmm12 movddup -10 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm5, %xmm2 mulpd -6 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -9 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm4, %xmm2 mulpd %xmm3, %xmm4 mulpd -2 * SIZE(AO, 
%rax, 4), %xmm3 addpd %xmm4, %xmm8 movapd 8 * SIZE(AO, %rax, 4), %xmm4 addpd %xmm3, %xmm12 movddup -4 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm5, %xmm2 mulpd -2 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -7 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm0, %xmm2 addq $4 * SIZE, %rax BRANCH jl .L52 ALIGN_4 .L56: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L59 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L57: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm12 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm5, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -13 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm0, %xmm2 addq $SIZE, %rax jl .L57 ALIGN_4 .L59: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm12, %xmm4 unpcklpd %xmm13, %xmm12 unpckhpd %xmm13, %xmm4 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm13 movapd -12 * SIZE(BO), %xmm1 movapd -10 * SIZE(BO), %xmm5 subpd %xmm8, %xmm9 subpd %xmm0, %xmm13 subpd %xmm12, %xmm1 subpd %xmm4, %xmm5 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 subpd %xmm8, %xmm0 subpd %xmm12, %xmm1 subpd %xmm9, %xmm2 subpd %xmm13, %xmm3 #endif #ifdef LN movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 movddup -2 * SIZE(AO), %xmm10 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm1 movddup -3 * SIZE(AO), %xmm12 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm13 movddup -4 * SIZE(AO), %xmm14 mulpd %xmm5, %xmm14 subpd %xmm14, %xmm9 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 movddup -7 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm13 movddup -8 * SIZE(AO), %xmm12 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm9 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -12 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -14 * SIZE(AO), %xmm12 mulpd %xmm9, %xmm12 subpd %xmm12, %xmm1 movddup -13 * SIZE(AO), %xmm14 mulpd %xmm9, %xmm14 subpd %xmm14, %xmm5 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -10 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm1 movddup -9 * SIZE(AO), %xmm12 mulpd %xmm13, %xmm12 subpd %xmm12, %xmm5 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 movddup -5 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm5 movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm1, %xmm9 subpd %xmm9, %xmm3 movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 #endif #ifdef RT movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -14 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -14 * SIZE(BO), %xmm9 mulpd %xmm3, %xmm9 subpd %xmm9, %xmm1 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 subq 
$4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movlpd %xmm9, 0 * SIZE(CO1) movlpd %xmm13, 1 * SIZE(CO1) movlpd %xmm1, 2 * SIZE(CO1) movlpd %xmm5, 3 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movhpd %xmm1, 2 * SIZE(CO2) movhpd %xmm5, 3 * SIZE(CO2) #else movlpd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movlpd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movlpd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movlpd %xmm3, 2 * SIZE(CO2) movhpd %xmm3, 3 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movaps %xmm9, -16 * SIZE(BO) movaps %xmm13,-14 * SIZE(BO) movaps %xmm1, -12 * SIZE(BO) movaps %xmm5, -10 * SIZE(BO) #else movaps %xmm0, -16 * SIZE(AO) movaps %xmm1, -14 * SIZE(AO) movaps %xmm2, -12 * SIZE(AO) movaps %xmm3, -10 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L51 ALIGN_4 .L79: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L80: testq $1, N je .L999 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 # coffset1 = c #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif testq $1, M je .L100 #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (BO, %rax, SIZE), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd -14 * SIZE(AO), %xmm1 pxor %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO negq %rax NOBRANCH je .L116 ALIGN_4 .L112: mulpd -16 * SIZE(BO, %rax, 1), %xmm0 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 1), %xmm0 mulpd -14 * SIZE(BO, %rax, 1), %xmm1 addpd %xmm1, %xmm9 movapd -10 * SIZE(AO, %rax, 1), %xmm1 addq $4 * SIZE, %rax BRANCH jl .L112 ALIGN_4 .L116: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L118 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L117: mulsd -16 * SIZE(BO, %rax, 1), %xmm0 addsd %xmm0, %xmm8 movsd -15 * SIZE(AO, %rax, 1), %xmm0 addq $SIZE, %rax jl .L117 ALIGN_4 .L118: addpd %xmm9, %xmm8 haddpd %xmm8, %xmm8 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(BO), %xmm10 subsd %xmm8, %xmm10 #else movsd -16 * SIZE(AO), %xmm10 subsd %xmm8, %xmm10 #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 #endif #if defined(RN) || defined(RT) movsd -16 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm10 #endif #ifdef LN subq $1 * SIZE, CO1 #endif movsd %xmm10, 0 * SIZE(CO1) #if defined(LN) || defined(LT) movlpd %xmm10, -16 * SIZE(BO) #else 
movlpd %xmm10, -16 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax addq %rax, AO addq %rax, BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L100: testq $2, M je .L110 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (BO, %rax, SIZE), BO #endif movddup -16 * SIZE(BO), %xmm0 pxor %xmm8, %xmm8 movddup -15 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 movddup -14 * SIZE(BO), %xmm2 pxor %xmm10, %xmm10 movddup -13 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO negq %rax NOBRANCH je .L106 ALIGN_4 .L102: mulpd -16 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm0, %xmm8 movddup -12 * SIZE(BO, %rax, 1), %xmm0 mulpd -14 * SIZE(AO, %rax, 2), %xmm1 addpd %xmm1, %xmm9 movddup -11 * SIZE(BO, %rax, 1), %xmm1 mulpd -12 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm2, %xmm10 movddup -10 * SIZE(BO, %rax, 1), %xmm2 mulpd -10 * SIZE(AO, %rax, 2), %xmm3 addpd %xmm3, %xmm11 movddup -9 * SIZE(BO, %rax, 1), %xmm3 addq $4 * SIZE, %rax BRANCH jl .L102 ALIGN_4 .L106: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L109 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L107: movddup -16 * SIZE(BO, %rax, 1), %xmm0 mulpd -16 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm0, %xmm8 addq $SIZE, %rax jl .L107 ALIGN_4 .L109: addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm10, %xmm8 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm10 subpd %xmm8, %xmm10 #else movapd -16 * SIZE(AO), %xmm10 subpd %xmm8, %xmm10 #endif #ifdef LN movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movsd -13 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -14 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm10 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 unpcklpd %xmm8, %xmm10 #endif #ifdef LT movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 movsd -15 * SIZE(AO), %xmm13 mulsd %xmm10, %xmm13 subsd %xmm13, %xmm8 movsd -13 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 unpcklpd %xmm8, %xmm10 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 #endif #ifdef RT movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlpd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) #else movlpd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movaps %xmm10, -16 * SIZE(BO) #else movaps %xmm10, -16 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO addq %rax, BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L110: movq M, I sarq $2, I # i = (m >> 2) jle .L119 ALIGN_4 .L91: #ifdef LN 
movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (BO, %rax, SIZE), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd -8 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movddup -16 * SIZE(BO), %xmm1 pxor %xmm10, %xmm10 movddup -15 * SIZE(BO), %xmm5 pxor %xmm11, %xmm11 movddup -14 * SIZE(BO), %xmm3 #ifndef LN prefetchw 3 * SIZE(CO1) #else prefetchw -8 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO negq %rax NOBRANCH je .L96 ALIGN_4 .L92: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm9 movddup -12 * SIZE(BO, %rax, 1), %xmm1 mulpd %xmm5, %xmm0 mulpd -10 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm0, %xmm10 movapd (AO, %rax, 4), %xmm0 addpd %xmm5, %xmm11 movddup -13 * SIZE(BO, %rax, 1), %xmm5 mulpd %xmm3, %xmm2 mulpd -6 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm2, %xmm8 movapd -4 * SIZE(AO, %rax, 4), %xmm2 addpd %xmm3, %xmm9 movddup -10 * SIZE(BO, %rax, 1), %xmm3 mulpd %xmm5, %xmm2 mulpd -2 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm10 movapd 8 * SIZE(AO, %rax, 4), %xmm2 addpd %xmm5, %xmm11 movddup -11 * SIZE(BO, %rax, 1), %xmm5 addq $4 * SIZE, %rax BRANCH jl .L92 ALIGN_4 .L96: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L99 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L97: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm9 movddup -15 * SIZE(BO, %rax, 1), %xmm1 addq $SIZE, %rax jl .L97 ALIGN_4 .L99: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm10 movapd -14 * SIZE(BO), %xmm11 subpd %xmm8, %xmm10 subpd %xmm9, %xmm11 #else movapd -16 * SIZE(AO), %xmm10 movapd -14 * SIZE(AO), %xmm11 subpd %xmm8, %xmm10 subpd %xmm9, %xmm11 #endif #ifdef LN movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movapd %xmm11, %xmm9 unpckhpd %xmm9, %xmm9 movsd -1 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm9 movsd -2 * SIZE(AO), %xmm13 mulsd %xmm9, %xmm13 subsd %xmm13, %xmm11 movsd -3 * SIZE(AO), %xmm14 mulsd %xmm9, %xmm14 subsd %xmm14, %xmm8 movsd -4 * SIZE(AO), %xmm15 mulsd %xmm9, %xmm15 subsd %xmm15, %xmm10 movsd -6 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm11 movsd -7 * SIZE(AO), %xmm13 mulsd %xmm11, %xmm13 subsd %xmm13, %xmm8 movsd -8 * SIZE(AO), %xmm14 mulsd %xmm11, %xmm14 subsd %xmm14, %xmm10 movsd -11 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -12 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm10 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 unpcklpd %xmm8, %xmm10 unpcklpd %xmm9, %xmm11 #endif #ifdef LT movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movapd %xmm11, %xmm9 unpckhpd %xmm9, %xmm9 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 movsd -15 * SIZE(AO), %xmm13 mulsd %xmm10, %xmm13 subsd %xmm13, %xmm8 movsd -14 * SIZE(AO), %xmm14 mulsd %xmm10, %xmm14 subsd %xmm14, %xmm11 movsd -13 * SIZE(AO), %xmm15 mulsd %xmm10, %xmm15 subsd %xmm15, %xmm9 movsd -11 * SIZE(AO), %xmm12 mulsd 
%xmm12, %xmm8 movsd -10 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm11 movsd -9 * SIZE(AO), %xmm14 mulsd %xmm8, %xmm14 subsd %xmm14, %xmm9 movsd -6 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm11 movsd -5 * SIZE(AO), %xmm13 mulsd %xmm11, %xmm13 subsd %xmm13, %xmm9 movsd -1 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm9 unpcklpd %xmm8, %xmm10 unpcklpd %xmm9, %xmm11 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 mulpd %xmm8, %xmm11 #endif #ifdef RT movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 mulpd %xmm8, %xmm11 #endif #ifdef LN subq $4 * SIZE, CO1 #endif movlpd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) movlpd %xmm11, 2 * SIZE(CO1) movhpd %xmm11, 3 * SIZE(CO1) #if defined(LN) || defined(LT) movaps %xmm10, -16 * SIZE(BO) movaps %xmm11, -14 * SIZE(BO) #else movaps %xmm10, -16 * SIZE(AO) movaps %xmm11, -14 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO addq %rax, BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L91 ALIGN_4 .L119: #ifdef LN leaq (B, K, SIZE), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_LN_4x4_core2.S000066400000000000000000002023041313527062700222160ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define J 0(%rsp) #define OFFSET 8(%rsp) #define KK 16(%rsp) #define KKK 24(%rsp) #define AORIG 32(%rsp) #define BORIG 40(%rsp) #define BUFFER 128(%rsp) #define PREFETCH_R (8 * 4 + 0) #define PREFETCH_W (PREFETCH_R) #define PREFETCHSIZE (8 * 17 + 2) #define PREFETCH prefetcht0 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif movq OLD_LDC, LDC movq OLD_OFFSET, %rax movq %rsp, %r15 # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq %rax, KK movq %rax, OFFSET movq OLD_M, M movq OLD_N, N subq $-16 * SIZE, A subq $-16 * SIZE, B leaq (, LDC, SIZE), LDC #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $2, J # j = (n >> 2) jle .L40 .L01: /* Copying to Sub Buffer */ #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq 16 * SIZE + BUFFER, BO #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG leaq (, %rax, SIZE), %rax leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L03 ALIGN_4 .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) movapd -16 * SIZE(B), %xmm0 movapd -14 * SIZE(B), %xmm1 movapd -12 * SIZE(B), %xmm2 movapd -10 * SIZE(B), %xmm3 movapd -8 * SIZE(B), %xmm4 movapd -6 * SIZE(B), %xmm5 movapd -4 * SIZE(B), %xmm6 movapd -2 * SIZE(B), %xmm7 prefetcht0 (PREFETCH_R + 8) * SIZE(B) movddup %xmm0, %xmm8 unpckhpd %xmm0, %xmm0 movddup %xmm1, %xmm9 unpckhpd %xmm1, %xmm1 movddup %xmm2, %xmm10 unpckhpd %xmm2, %xmm2 movddup %xmm3, %xmm11 unpckhpd %xmm3, %xmm3 movddup %xmm4, %xmm12 unpckhpd %xmm4, %xmm4 movddup %xmm5, %xmm13 unpckhpd %xmm5, %xmm5 movddup %xmm6, %xmm14 unpckhpd %xmm6, %xmm6 movddup %xmm7, %xmm15 unpckhpd %xmm7, %xmm7 prefetcht0 (PREFETCH_W 
+ 0) * SIZE(BO) movapd %xmm8, -16 * SIZE(BO) movapd %xmm0, -14 * SIZE(BO) movapd %xmm9, -12 * SIZE(BO) movapd %xmm1, -10 * SIZE(BO) prefetcht0 (PREFETCH_W + 8) * SIZE(BO) movapd %xmm10, -8 * SIZE(BO) movapd %xmm2, -6 * SIZE(BO) movapd %xmm11, -4 * SIZE(BO) movapd %xmm3, -2 * SIZE(BO) prefetcht0 (PREFETCH_W + 16) * SIZE(BO) movapd %xmm12, 0 * SIZE(BO) movapd %xmm4, 2 * SIZE(BO) movapd %xmm13, 4 * SIZE(BO) movapd %xmm5, 6 * SIZE(BO) prefetcht0 (PREFETCH_W + 24) * SIZE(BO) movapd %xmm14, 8 * SIZE(BO) movapd %xmm6, 10 * SIZE(BO) movapd %xmm15, 12 * SIZE(BO) movapd %xmm7, 14 * SIZE(BO) subq $-16 * SIZE, B subq $-32 * SIZE, BO subq $1, %rax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movapd -16 * SIZE(B), %xmm0 movapd -14 * SIZE(B), %xmm1 movddup %xmm0, %xmm8 unpckhpd %xmm0, %xmm0 movddup %xmm1, %xmm9 unpckhpd %xmm1, %xmm1 movapd %xmm8, -16 * SIZE(BO) movapd %xmm0, -14 * SIZE(BO) movapd %xmm9, -12 * SIZE(BO) movapd %xmm1, -10 * SIZE(BO) addq $4 * SIZE, B addq $8 * SIZE, BO subq $1, %rax jne .L04 ALIGN_4 .L10: leaq (PREFETCH_R + 0) * SIZE(B), BB #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 4), C #endif testq $1, M je .L20 ALIGN_4 .L31: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -16 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 movsd -14 * SIZE(BO), %xmm3 movsd -12 * SIZE(BO), %xmm4 movsd -10 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 movsd -15 * SIZE(AO), %xmm0 movsd -8 * SIZE(BO), %xmm2 movsd -6 * SIZE(BO), %xmm3 movsd -4 * SIZE(BO), %xmm4 movsd -2 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 movsd -14 * SIZE(AO), %xmm0 movsd 0 * SIZE(BO), %xmm2 movsd 2 * SIZE(BO), %xmm3 movsd 4 * SIZE(BO), %xmm4 movsd 6 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 movsd -13 * SIZE(AO), %xmm0 movsd 8 * SIZE(BO), %xmm2 movsd 10 * SIZE(BO), %xmm3 movsd 12 * SIZE(BO), %xmm4 movsd 14 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 subq $ -4 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: movsd -16 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 movsd -14 * SIZE(BO), %xmm3 movsd -12 * SIZE(BO), %xmm4 movsd -10 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm0, 
%xmm4 mulsd %xmm0, %xmm5 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 addq $1 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(B), %xmm12 movsd -15 * SIZE(B), %xmm13 movsd -14 * SIZE(B), %xmm14 movsd -13 * SIZE(B), %xmm15 #else movsd -16 * SIZE(AO), %xmm12 movsd -15 * SIZE(AO), %xmm13 movsd -14 * SIZE(AO), %xmm14 movsd -13 * SIZE(AO), %xmm15 #endif subsd %xmm8, %xmm12 subsd %xmm9, %xmm13 subsd %xmm10, %xmm14 subsd %xmm11, %xmm15 #ifdef LN movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm12 mulsd %xmm8, %xmm13 mulsd %xmm8, %xmm14 mulsd %xmm8, %xmm15 #endif #ifdef LT movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm12 mulsd %xmm8, %xmm13 mulsd %xmm8, %xmm14 mulsd %xmm8, %xmm15 #endif #ifdef RN mulsd -16 * SIZE(B), %xmm12 movlpd -15 * SIZE(B), %xmm9 mulsd %xmm12, %xmm9 subsd %xmm9, %xmm13 movlpd -14 * SIZE(B), %xmm10 mulsd %xmm12, %xmm10 subsd %xmm10, %xmm14 movlpd -13 * SIZE(B), %xmm11 mulsd %xmm12, %xmm11 subsd %xmm11, %xmm15 mulsd -11 * SIZE(B), %xmm13 movlpd -10 * SIZE(B), %xmm9 mulsd %xmm13, %xmm9 subsd %xmm9, %xmm14 movlpd -9 * SIZE(B), %xmm10 mulsd %xmm13, %xmm10 subsd %xmm10, %xmm15 mulsd -6 * SIZE(B), %xmm14 movlpd -5 * SIZE(B), %xmm9 mulsd %xmm14, %xmm9 subsd %xmm9, %xmm15 mulsd -1 * SIZE(B), %xmm15 #endif #ifdef RT mulsd -1 * SIZE(B), %xmm15 movlpd -2 * SIZE(B), %xmm9 mulsd %xmm15, %xmm9 subsd %xmm9, %xmm14 movlpd -3 * SIZE(B), %xmm10 mulsd %xmm15, %xmm10 subsd %xmm10, %xmm13 movlpd -4 * SIZE(B), %xmm11 mulsd %xmm15, %xmm11 subsd %xmm11, %xmm12 mulsd -6 * SIZE(B), %xmm14 movlpd -7 * SIZE(B), %xmm9 mulsd %xmm14, %xmm9 subsd %xmm9, %xmm13 movlpd -8 * SIZE(B), %xmm10 mulsd %xmm14, %xmm10 subsd %xmm10, %xmm12 mulsd -11 * SIZE(B), %xmm13 movlpd -12 * SIZE(B), %xmm9 mulsd %xmm13, %xmm9 subsd %xmm9, %xmm12 mulsd -16 * SIZE(B), %xmm12 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm12, 0 * SIZE(CO1) movsd %xmm13, 0 * SIZE(CO2) movsd %xmm14, 0 * SIZE(CO1, LDC, 2) movsd %xmm15, 0 * SIZE(CO2, LDC, 2) #if defined(LN) || defined(LT) movsd %xmm12, -16 * SIZE(B) movsd %xmm13, -15 * SIZE(B) movsd %xmm14, -14 * SIZE(B) movsd %xmm15, -13 * SIZE(B) movsd %xmm12, -16 * SIZE(BO) movsd %xmm12, -15 * SIZE(BO) movsd %xmm13, -14 * SIZE(BO) movsd %xmm13, -13 * SIZE(BO) movsd %xmm14, -12 * SIZE(BO) movsd %xmm14, -11 * SIZE(BO) movsd %xmm15, -10 * SIZE(BO) movsd %xmm15, -9 * SIZE(BO) #else movsd %xmm12, -16 * SIZE(AO) movsd %xmm13, -15 * SIZE(AO) movsd %xmm14, -14 * SIZE(AO) movsd %xmm15, -13 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L20: testq $2, M je .L30 ALIGN_4 .L21: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 
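/* xmm8-xmm11 accumulate the four columns of this 2x4 tile (two rows per register); they are cleared here before the K loop */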
pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 movapd -14 * SIZE(AO), %xmm0 movapd -8 * SIZE(BO), %xmm2 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm4 movapd -2 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 movapd -12 * SIZE(AO), %xmm0 movapd 0 * SIZE(BO), %xmm2 movapd 2 * SIZE(BO), %xmm3 movapd 4 * SIZE(BO), %xmm4 movapd 6 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 movapd -10 * SIZE(AO), %xmm0 movapd 8 * SIZE(BO), %xmm2 movapd 10 * SIZE(BO), %xmm3 movapd 12 * SIZE(BO), %xmm4 movapd 14 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 subq $ -8 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L29 ALIGN_4 .L26: movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax jne .L26 ALIGN_4 .L29: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm10, %xmm2 unpcklpd %xmm11, %xmm10 unpckhpd %xmm11, %xmm2 movapd -16 * SIZE(B), %xmm9 movapd -14 * SIZE(B), %xmm11 movapd -12 * SIZE(B), %xmm13 movapd -10 * SIZE(B), %xmm15 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm0, %xmm13 subpd %xmm2, %xmm15 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm2 movapd -12 * SIZE(AO), %xmm4 movapd -10 * SIZE(AO), %xmm6 subpd %xmm8, %xmm0 subpd %xmm9, %xmm2 subpd %xmm10, %xmm4 subpd %xmm11, %xmm6 #endif #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -14 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm11 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 movddup -15 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm15 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 #endif #ifdef RN movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(B), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -14 * SIZE(B), %xmm10 mulpd 
%xmm0, %xmm10 subpd %xmm10, %xmm4 movddup -13 * SIZE(B), %xmm11 mulpd %xmm0, %xmm11 subpd %xmm11, %xmm6 movddup -11 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 movddup -10 * SIZE(B), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm4 movddup -9 * SIZE(B), %xmm10 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm6 movddup -6 * SIZE(B), %xmm8 mulpd %xmm8, %xmm4 movddup -5 * SIZE(B), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm6 movddup -1 * SIZE(B), %xmm8 mulpd %xmm8, %xmm6 #endif #ifdef RT movddup -1 * SIZE(B), %xmm8 mulpd %xmm8, %xmm6 movddup -2 * SIZE(B), %xmm9 mulpd %xmm6, %xmm9 subpd %xmm9, %xmm4 movddup -3 * SIZE(B), %xmm10 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm2 movddup -4 * SIZE(B), %xmm11 mulpd %xmm6, %xmm11 subpd %xmm11, %xmm0 movddup -6 * SIZE(B), %xmm8 mulpd %xmm8, %xmm4 movddup -7 * SIZE(B), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm2 movddup -8 * SIZE(B), %xmm10 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm0 movddup -11 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 movddup -12 * SIZE(B), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movsd %xmm11, 0 * SIZE(CO1, LDC, 2) movsd %xmm15, 1 * SIZE(CO1, LDC, 2) movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movsd %xmm4, 0 * SIZE(CO1, LDC, 2) movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) movsd %xmm6, 0 * SIZE(CO2, LDC, 2) movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movapd %xmm11, -14 * SIZE(B) movapd %xmm13, -12 * SIZE(B) movapd %xmm15, -10 * SIZE(B) movddup %xmm9, %xmm8 SHUFPD_3 %xmm9, %xmm9 movddup %xmm11, %xmm10 SHUFPD_3 %xmm11, %xmm11 movddup %xmm13, %xmm12 SHUFPD_3 %xmm13, %xmm13 movddup %xmm15, %xmm14 SHUFPD_3 %xmm15, %xmm15 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm10, -12 * SIZE(BO) movapd %xmm11, -10 * SIZE(BO) movapd %xmm12, -8 * SIZE(BO) movapd %xmm13, -6 * SIZE(BO) movapd %xmm14, -4 * SIZE(BO) movapd %xmm15, -2 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm2, -14 * SIZE(AO) movapd %xmm4, -12 * SIZE(AO) movapd %xmm6, -10 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: movq M, I sarq $2, I # i = (m >> 2) jle .L39 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif prefetcht2 0 * SIZE(BB) pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifdef LN prefetcht2 -3 * SIZE(CO1) pxor %xmm12, %xmm12 prefetcht2 -3 * SIZE(CO2) pxor %xmm13, %xmm13 prefetcht2 -3 * SIZE(CO1, LDC, 2) pxor %xmm14, %xmm14 prefetcht2 -3 * SIZE(CO2, LDC, 2) pxor %xmm15, %xmm15 #else prefetcht2 3 * SIZE(CO1) pxor %xmm12, %xmm12 prefetcht2 3 * SIZE(CO2) pxor %xmm13, %xmm13 prefetcht2 3 * 
SIZE(CO1, LDC, 2) pxor %xmm14, %xmm14 prefetcht2 3 * SIZE(CO2, LDC, 2) pxor %xmm15, %xmm15 #endif pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 subq $-8 * SIZE, BB #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm2, %xmm10 movapd -16 * SIZE(AO), %xmm0 addpd %xmm3, %xmm14 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -14 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 movapd -14 * SIZE(BO), %xmm4 addpd %xmm5, %xmm15 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 movapd -12 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 movapd -10 * SIZE(BO), %xmm4 addpd %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 movapd -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm10 addpd %xmm3, %xmm14 movapd -8 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -10 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 addpd %xmm5, %xmm15 movapd -6 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 movapd -4 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 movapd -2 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) movapd -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm10 addpd %xmm3, %xmm14 movapd 0 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -6 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 addpd %xmm5, %xmm15 movapd 2 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 movapd 4 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 movapd 6 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 movapd -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm10 addpd %xmm3, %xmm14 movapd 8 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -2 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 addpd %xmm5, %xmm15 movapd 10 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 addq $32 * SIZE, BO mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 movapd -20 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 subq $-16 * SIZE, AO mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 movapd -18 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 subq $1, %rax mulpd %xmm1, %xmm5 BRANCH jg .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L19 ALIGN_4 .L16: movapd -16 * SIZE(AO), %xmm0 addpd %xmm2, %xmm10 movapd -16 * SIZE(BO), %xmm2 addpd %xmm3, %xmm14 movapd %xmm2, %xmm3 movapd -14 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 movapd -14 * SIZE(BO), %xmm4 addpd %xmm5, %xmm15 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 movapd -12 * SIZE(BO), %xmm2 addpd %xmm3, %xmm12 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 movapd -10 * SIZE(BO), %xmm4 addpd %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 subq $1, %rax BRANCH jg .L16 ALIGN_4 .L19: addpd %xmm2, %xmm10 addpd 
%xmm3, %xmm14 addpd %xmm4, %xmm11 addpd %xmm5, %xmm15 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm10, %xmm2 unpcklpd %xmm11, %xmm10 unpckhpd %xmm11, %xmm2 movapd %xmm12, %xmm4 unpcklpd %xmm13, %xmm12 unpckhpd %xmm13, %xmm4 movapd %xmm14, %xmm6 unpcklpd %xmm15, %xmm14 unpckhpd %xmm15, %xmm6 movapd -16 * SIZE(B), %xmm9 movapd -14 * SIZE(B), %xmm11 movapd -12 * SIZE(B), %xmm13 movapd -10 * SIZE(B), %xmm15 movapd -8 * SIZE(B), %xmm1 movapd -6 * SIZE(B), %xmm3 movapd -4 * SIZE(B), %xmm5 movapd -2 * SIZE(B), %xmm7 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm0, %xmm13 subpd %xmm2, %xmm15 subpd %xmm12, %xmm1 subpd %xmm14, %xmm3 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 movapd -8 * SIZE(AO), %xmm4 movapd -6 * SIZE(AO), %xmm5 movapd -4 * SIZE(AO), %xmm6 movapd -2 * SIZE(AO), %xmm7 subpd %xmm8, %xmm0 subpd %xmm12, %xmm1 subpd %xmm9, %xmm2 subpd %xmm13, %xmm3 subpd %xmm10, %xmm4 subpd %xmm14, %xmm5 subpd %xmm11, %xmm6 subpd %xmm15, %xmm7 #endif #ifdef LN movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 movddup -2 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm1 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm3 movddup -3 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm15 movddup -4 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm11 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 movddup -7 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm15 movddup -8 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm11 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -12 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm11 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 movddup -15 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm15 movddup -14 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm1 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm3 movddup -13 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm5 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm7 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -10 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm1 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm3 movddup -9 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm5 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm7 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 movddup -5 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm5 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm7 movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 mulpd 
%xmm8, %xmm7 #endif #ifdef RN movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 movddup -15 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm2 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm3 movddup -14 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm4 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm5 movddup -13 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm6 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm7 movddup -11 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -10 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm4 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm5 movddup -9 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm6 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm7 movddup -6 * SIZE(B), %xmm8 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm5 movddup -5 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm6 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm7 movddup -1 * SIZE(B), %xmm8 mulpd %xmm8, %xmm6 mulpd %xmm8, %xmm7 #endif #ifdef RT movddup -1 * SIZE(B), %xmm8 mulpd %xmm8, %xmm6 mulpd %xmm8, %xmm7 movddup -2 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm4 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm5 movddup -3 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm2 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm3 movddup -4 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm0 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm1 movddup -6 * SIZE(B), %xmm8 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm5 movddup -7 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm2 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm3 movddup -8 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm0 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm1 movddup -11 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -12 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm0 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm1 movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movsd %xmm5, 3 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movhpd %xmm1, 2 * SIZE(CO2) movhpd %xmm5, 3 * SIZE(CO2) movsd %xmm11, 0 * SIZE(CO1, LDC, 2) movsd %xmm15, 1 * SIZE(CO1, LDC, 2) movsd %xmm3, 2 * SIZE(CO1, LDC, 2) movsd %xmm7, 3 * SIZE(CO1, LDC, 2) movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movsd %xmm3, 2 * SIZE(CO2) movhpd %xmm3, 3 * SIZE(CO2) movsd %xmm4, 0 * SIZE(CO1, LDC, 2) movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) movsd %xmm5, 2 * SIZE(CO1, LDC, 2) movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) movsd %xmm6, 0 * SIZE(CO2, LDC, 2) movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) movsd %xmm7, 2 * SIZE(CO2, LDC, 2) movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movapd %xmm11, -14 * SIZE(B) movapd %xmm13, -12 * SIZE(B) movapd %xmm15, -10 * SIZE(B) movapd %xmm1, -8 * SIZE(B) movapd %xmm3, -6 * SIZE(B) movapd %xmm5, -4 * SIZE(B) movapd %xmm7, -2 * SIZE(B) movddup %xmm9, %xmm8 
SHUFPD_3 %xmm9, %xmm9 movddup %xmm11, %xmm10 SHUFPD_3 %xmm11, %xmm11 movddup %xmm13, %xmm12 SHUFPD_3 %xmm13, %xmm13 movddup %xmm15, %xmm14 SHUFPD_3 %xmm15, %xmm15 movddup %xmm1, %xmm0 SHUFPD_3 %xmm1, %xmm1 movddup %xmm3, %xmm2 SHUFPD_3 %xmm3, %xmm3 movddup %xmm5, %xmm4 SHUFPD_3 %xmm5, %xmm5 movddup %xmm7, %xmm6 SHUFPD_3 %xmm7, %xmm7 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm10, -12 * SIZE(BO) movapd %xmm11, -10 * SIZE(BO) movapd %xmm12, -8 * SIZE(BO) movapd %xmm13, -6 * SIZE(BO) movapd %xmm14, -4 * SIZE(BO) movapd %xmm15, -2 * SIZE(BO) movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) movapd %xmm2, 4 * SIZE(BO) movapd %xmm3, 6 * SIZE(BO) movapd %xmm4, 8 * SIZE(BO) movapd %xmm5, 10 * SIZE(BO) movapd %xmm6, 12 * SIZE(BO) movapd %xmm7, 14 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm1, -14 * SIZE(AO) movapd %xmm2, -12 * SIZE(AO) movapd %xmm3, -10 * SIZE(AO) movapd %xmm4, -8 * SIZE(AO) movapd %xmm5, -6 * SIZE(AO) movapd %xmm6, -4 * SIZE(AO) movapd %xmm7, -2 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $16 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L11 ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 4), B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif decq J # j -- jg .L01 ALIGN_4 .L40: testq $3, N je .L999 testq $2, N je .L80 ALIGN_4 .L41: /* Copying to Sub Buffer */ #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG leaq (, %rax, SIZE), %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L43 ALIGN_4 .L42: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -12 * SIZE(B), %xmm4 movddup -11 * SIZE(B), %xmm5 movddup -10 * SIZE(B), %xmm6 movddup -9 * SIZE(B), %xmm7 movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) movapd %xmm2, 4 * SIZE(BO) movapd %xmm3, 6 * SIZE(BO) movapd %xmm4, 8 * SIZE(BO) movapd %xmm5, 10 * SIZE(BO) movapd %xmm6, 12 * SIZE(BO) movapd %xmm7, 14 * SIZE(BO) addq $8 * SIZE, B addq $16 * SIZE, BO subq $1, %rax jne .L42 ALIGN_4 .L43: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L50 ALIGN_4 .L44: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) addq $2 * SIZE, B addq $4 * SIZE, BO decq %rax jne .L44 ALIGN_4 .L50: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 2), C #endif testq $1, M je .L60 ALIGN_4 .L71: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif leaq 16 * SIZE + BUFFER, BO 
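/* BO points into the packed copy of B held in BUFFER; for LN/RT the block that follows advances it past the first KK iterations */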
#if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -16 * SIZE(AO), %xmm0 movsd -15 * SIZE(AO), %xmm1 movsd -16 * SIZE(BO), %xmm2 movsd -14 * SIZE(BO), %xmm3 movsd -12 * SIZE(BO), %xmm4 movsd -10 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm1, %xmm4 mulsd %xmm1, %xmm5 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 movsd -14 * SIZE(AO), %xmm0 movsd -13 * SIZE(AO), %xmm1 movsd -8 * SIZE(BO), %xmm2 movsd -6 * SIZE(BO), %xmm3 movsd -4 * SIZE(BO), %xmm4 movsd -2 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm1, %xmm4 mulsd %xmm1, %xmm5 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 subq $ -4 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: movsd -16 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 movsd -14 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L76 ALIGN_4 .L78: addsd %xmm10, %xmm8 addsd %xmm11, %xmm9 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(B), %xmm12 movsd -15 * SIZE(B), %xmm13 #else movsd -16 * SIZE(AO), %xmm12 movsd -15 * SIZE(AO), %xmm13 #endif subsd %xmm8, %xmm12 subsd %xmm9, %xmm13 #ifdef LN movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm12 mulsd %xmm8, %xmm13 #endif #ifdef LT movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm12 mulsd %xmm8, %xmm13 #endif #ifdef RN mulsd -16 * SIZE(B), %xmm12 movsd -15 * SIZE(B), %xmm9 mulsd %xmm12, %xmm9 subsd %xmm9, %xmm13 mulsd -13 * SIZE(B), %xmm13 #endif #ifdef RT mulsd -13 * SIZE(B), %xmm13 movlpd -14 * SIZE(B), %xmm9 mulsd %xmm13, %xmm9 subsd %xmm9, %xmm12 mulsd -16 * SIZE(B), %xmm12 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm12, 0 * SIZE(CO1) movsd %xmm13, 0 * SIZE(CO2) #if defined(LN) || defined(LT) movsd %xmm12, -16 * SIZE(B) movsd %xmm13, -15 * SIZE(B) movsd %xmm12, -16 * SIZE(BO) movsd %xmm12, -15 * SIZE(BO) movsd %xmm13, -14 * SIZE(BO) movsd %xmm13, -13 * SIZE(BO) #else movsd %xmm12, -16 * SIZE(AO) movsd %xmm13, -15 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L60: testq $2, M je .L70 ALIGN_4 .L61: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 
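/* xmm8-xmm11 hold partial sums for this 2x2 tile; the second pair is folded into xmm8/xmm9 after the K loop */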
pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L65 ALIGN_4 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 movapd -12 * SIZE(AO), %xmm0 movapd -10 * SIZE(AO), %xmm1 movapd -8 * SIZE(BO), %xmm2 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm4 movapd -2 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 subq $ -8 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jne .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L69 ALIGN_4 .L66: movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L66 ALIGN_4 .L69: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd -16 * SIZE(B), %xmm9 movapd -14 * SIZE(B), %xmm13 subpd %xmm8, %xmm9 subpd %xmm0, %xmm13 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm2 subpd %xmm8, %xmm0 subpd %xmm9, %xmm2 #endif #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -14 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 #endif #ifdef RN movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(B), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -13 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 #endif #ifdef RT movddup -13 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 movddup -14 * SIZE(B), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movapd %xmm13, -14 * SIZE(B) movddup %xmm9, %xmm8 SHUFPD_3 %xmm9, %xmm9 movddup %xmm13, %xmm12 SHUFPD_3 %xmm13, %xmm13 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm12, -12 * SIZE(BO) movapd %xmm13, -10 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm2, -14 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B 
#endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L70: movq M, I sarq $2, I # i = (m >> 2) jle .L79 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 #ifdef LN prefetcht2 -3 * SIZE(CO1) pxor %xmm12, %xmm12 prefetcht2 -3 * SIZE(CO2) pxor %xmm13, %xmm13 #else prefetcht2 3 * SIZE(CO1) pxor %xmm12, %xmm12 prefetcht2 3 * SIZE(CO2) pxor %xmm13, %xmm13 #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L55 ALIGN_4 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -14 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 movapd -12 * SIZE(AO), %xmm0 movapd -10 * SIZE(AO), %xmm1 movapd -12 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -10 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 movapd -8 * SIZE(AO), %xmm0 movapd -6 * SIZE(AO), %xmm1 movapd -8 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -6 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 movapd -4 * SIZE(AO), %xmm0 movapd -2 * SIZE(AO), %xmm1 movapd -4 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -2 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 subq $-16 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L59 ALIGN_4 .L56: movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -14 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L56 ALIGN_4 .L59: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm12, %xmm4 unpcklpd %xmm13, %xmm12 unpckhpd %xmm13, %xmm4 movapd -16 * SIZE(B), %xmm9 movapd -14 * SIZE(B), %xmm13 movapd -12 * SIZE(B), %xmm1 movapd -10 * SIZE(B), %xmm5 subpd %xmm8, %xmm9 subpd %xmm0, %xmm13 subpd %xmm12, %xmm1 subpd %xmm4, %xmm5 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 subpd %xmm8, %xmm0 subpd %xmm12, %xmm1 subpd %xmm9, %xmm2 subpd %xmm13, 
%xmm3 #endif #ifdef LN movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 movddup -2 * SIZE(AO), %xmm10 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm1 movddup -3 * SIZE(AO), %xmm12 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm13 movddup -4 * SIZE(AO), %xmm14 mulpd %xmm5, %xmm14 subpd %xmm14, %xmm9 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 movddup -7 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm13 movddup -8 * SIZE(AO), %xmm12 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm9 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -12 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -14 * SIZE(AO), %xmm12 mulpd %xmm9, %xmm12 subpd %xmm12, %xmm1 movddup -13 * SIZE(AO), %xmm14 mulpd %xmm9, %xmm14 subpd %xmm14, %xmm5 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -10 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm1 movddup -9 * SIZE(AO), %xmm12 mulpd %xmm13, %xmm12 subpd %xmm12, %xmm5 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 movddup -5 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm5 movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 #endif #ifdef RN movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 movddup -15 * SIZE(B), %xmm9 movapd %xmm9, %xmm10 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm3 movddup -13 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 #endif #ifdef RT movddup -13 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -14 * SIZE(B), %xmm9 movapd %xmm9, %xmm10 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 mulpd %xmm3, %xmm10 subpd %xmm10, %xmm1 movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movsd %xmm5, 3 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movhpd %xmm1, 2 * SIZE(CO2) movhpd %xmm5, 3 * SIZE(CO2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movsd %xmm3, 2 * SIZE(CO2) movhpd %xmm3, 3 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movapd %xmm13, -14 * SIZE(B) movapd %xmm1, -12 * SIZE(B) movapd %xmm5, -10 * SIZE(B) movddup %xmm9, %xmm8 SHUFPD_3 %xmm9, %xmm9 movddup %xmm13, %xmm12 SHUFPD_3 %xmm13, %xmm13 movddup %xmm1, %xmm0 SHUFPD_3 %xmm1, %xmm1 movddup %xmm5, %xmm4 SHUFPD_3 %xmm5, %xmm5 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm12, -12 * SIZE(BO) movapd %xmm13, -10 * SIZE(BO) movapd %xmm0, -8 * SIZE(BO) movapd %xmm1, -6 * SIZE(BO) movapd %xmm4, -4 * SIZE(BO) movapd %xmm5, -2 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm1, -14 * SIZE(AO) movapd %xmm2, -12 * SIZE(AO) movapd %xmm3, -10 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L51 ALIGN_4 .L79: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || 
defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 2), B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L80: testq $1, N je .L999 ALIGN_4 .L81: /* Copying to Sub Buffer */ #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG leaq (, %rax, SIZE), %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax jle .L83 ALIGN_4 .L82: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -12 * SIZE(B), %xmm4 movddup -11 * SIZE(B), %xmm5 movddup -10 * SIZE(B), %xmm6 movddup -9 * SIZE(B), %xmm7 movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) movapd %xmm2, 4 * SIZE(BO) movapd %xmm3, 6 * SIZE(BO) movapd %xmm4, 8 * SIZE(BO) movapd %xmm5, 10 * SIZE(BO) movapd %xmm6, 12 * SIZE(BO) movapd %xmm7, 14 * SIZE(BO) addq $ 8 * SIZE, B subq $-16 * SIZE, BO subq $1, %rax jne .L82 ALIGN_4 .L83: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax BRANCH jle .L90 ALIGN_4 .L84: movddup -16 * SIZE(B), %xmm0 movapd %xmm0, 0 * SIZE(BO) addq $1 * SIZE, B addq $2 * SIZE, BO subq $1, %rax jne .L84 ALIGN_4 .L90: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT subq LDC, C #endif movq C, CO1 # coffset1 = c #ifndef RT addq LDC, C #endif testq $1, M je .L100 ALIGN_4 .L111: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L115 ALIGN_4 .L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -16 * SIZE(AO), %xmm0 movsd -15 * SIZE(AO), %xmm1 movsd -16 * SIZE(BO), %xmm2 movsd -14 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm1, %xmm3 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 movsd -14 * SIZE(AO), %xmm0 movsd -13 * SIZE(AO), %xmm1 movsd -12 * SIZE(BO), %xmm2 movsd -10 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm1, %xmm3 addsd %xmm2, %xmm10 addsd %xmm3, %xmm11 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax jne .L112 ALIGN_4 .L115: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: movsd -16 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax jg .L116 ALIGN_4 .L118: addsd %xmm10, %xmm8 addsd %xmm11, %xmm9 addsd %xmm9, %xmm8 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(B), %xmm10 subsd %xmm8, %xmm10 #else movsd -16 * SIZE(AO), %xmm10 subsd %xmm8, %xmm10 #endif #ifdef LN movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 #endif #ifdef LT movsd -16 * SIZE(AO), %xmm12 mulsd 
%xmm12, %xmm10 #endif #ifdef RN movsd -16 * SIZE(B), %xmm8 mulsd %xmm8, %xmm10 #endif #ifdef RT movsd -16 * SIZE(B), %xmm8 mulsd %xmm8, %xmm10 #endif #ifdef LN subq $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm10, 0 * SIZE(CO1) #else movsd %xmm10, 0 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movsd %xmm10, -16 * SIZE(B) movlpd %xmm10, -16 * SIZE(BO) movlpd %xmm10, -15 * SIZE(BO) #else movsd %xmm10, -16 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $1 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L100: testq $2, M je .L110 ALIGN_4 .L101: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L105 ALIGN_4 .L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 movapd -12 * SIZE(AO), %xmm0 movapd -10 * SIZE(AO), %xmm1 movapd -12 * SIZE(BO), %xmm2 movapd -10 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm10 addpd %xmm3, %xmm11 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax jne .L102 ALIGN_4 .L105: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L109 ALIGN_4 .L106: movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm8 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax jg .L106 ALIGN_4 .L109: addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm10, %xmm8 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm10 subpd %xmm8, %xmm10 #else movapd -16 * SIZE(AO), %xmm10 subpd %xmm8, %xmm10 #endif #ifdef LN movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movsd -13 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -14 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm10 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 unpcklpd %xmm8, %xmm10 #endif #ifdef LT movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 movsd -15 * SIZE(AO), %xmm13 mulsd %xmm10, %xmm13 subsd %xmm13, %xmm8 movsd -13 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 unpcklpd %xmm8, %xmm10 #endif #ifdef RN movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm10 #endif #ifdef RT movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm10 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) #else movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm10, -16 * SIZE(B) movddup %xmm10, %xmm8 
SHUFPD_3 %xmm10, %xmm10 movapd %xmm8, -16 * SIZE(BO) movapd %xmm10, -14 * SIZE(BO) #else movapd %xmm10, -16 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L110: movq M, I sarq $2, I # i = (m >> 2) jle .L119 ALIGN_4 .L91: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 #ifdef LN prefetcht2 -3 * SIZE(CO1) #else prefetcht2 3 * SIZE(CO1) #endif pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L95 ALIGN_4 .L92: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 movapd -12 * SIZE(AO), %xmm0 movapd -10 * SIZE(AO), %xmm1 movapd -14 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm10 addpd %xmm3, %xmm11 movapd -8 * SIZE(AO), %xmm0 movapd -6 * SIZE(AO), %xmm1 movapd -12 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 movapd -4 * SIZE(AO), %xmm0 movapd -2 * SIZE(AO), %xmm1 movapd -10 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm10 addpd %xmm3, %xmm11 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax jne .L92 ALIGN_4 .L95: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L99 ALIGN_4 .L96: movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax jg .L96 ALIGN_4 .L99: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm10 movapd -14 * SIZE(B), %xmm11 subpd %xmm8, %xmm10 subpd %xmm9, %xmm11 #else movapd -16 * SIZE(AO), %xmm10 movapd -14 * SIZE(AO), %xmm11 subpd %xmm8, %xmm10 subpd %xmm9, %xmm11 #endif #ifdef LN movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movapd %xmm11, %xmm9 unpckhpd %xmm9, %xmm9 movsd -1 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm9 movsd -2 * SIZE(AO), %xmm13 mulsd %xmm9, %xmm13 subsd %xmm13, %xmm11 movsd -3 * SIZE(AO), %xmm14 mulsd %xmm9, %xmm14 subsd %xmm14, %xmm8 movsd -4 * SIZE(AO), %xmm15 mulsd %xmm9, %xmm15 subsd %xmm15, %xmm10 movsd -6 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm11 movsd -7 * SIZE(AO), %xmm13 mulsd %xmm11, %xmm13 subsd %xmm13, %xmm8 movsd -8 * SIZE(AO), %xmm14 mulsd %xmm11, %xmm14 subsd %xmm14, %xmm10 movsd -11 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -12 * SIZE(AO), %xmm13 mulsd %xmm8, 
%xmm13 subsd %xmm13, %xmm10 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 unpcklpd %xmm8, %xmm10 unpcklpd %xmm9, %xmm11 #endif #ifdef LT movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movapd %xmm11, %xmm9 unpckhpd %xmm9, %xmm9 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 movsd -15 * SIZE(AO), %xmm13 mulsd %xmm10, %xmm13 subsd %xmm13, %xmm8 movsd -14 * SIZE(AO), %xmm14 mulsd %xmm10, %xmm14 subsd %xmm14, %xmm11 movsd -13 * SIZE(AO), %xmm15 mulsd %xmm10, %xmm15 subsd %xmm15, %xmm9 movsd -11 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -10 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm11 movsd -9 * SIZE(AO), %xmm14 mulsd %xmm8, %xmm14 subsd %xmm14, %xmm9 movsd -6 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm11 movsd -5 * SIZE(AO), %xmm13 mulsd %xmm11, %xmm13 subsd %xmm13, %xmm9 movsd -1 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm9 unpcklpd %xmm8, %xmm10 unpcklpd %xmm9, %xmm11 #endif #ifdef RN movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm10 mulpd %xmm8, %xmm11 #endif #ifdef RT movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm10 mulpd %xmm8, %xmm11 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) movsd %xmm11, 2 * SIZE(CO1) movhpd %xmm11, 3 * SIZE(CO1) #else movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) movsd %xmm11, 2 * SIZE(CO1) movhpd %xmm11, 3 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm10, -16 * SIZE(B) movapd %xmm11, -14 * SIZE(B) movddup %xmm10, %xmm8 SHUFPD_3 %xmm10, %xmm10 movddup %xmm11, %xmm9 SHUFPD_3 %xmm11, %xmm11 movapd %xmm8, -16 * SIZE(BO) movapd %xmm10, -14 * SIZE(BO) movapd %xmm9, -12 * SIZE(BO) movapd %xmm11, -10 * SIZE(BO) #else movapd %xmm10, -16 * SIZE(AO) movapd %xmm11, -14 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L91 ALIGN_4 .L119: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 1), B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq %r15, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_LN_4x4_penryn.S000066400000000000000000001663141313527062700225310ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define KK %rdx #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCH_R (8 * 4 + 0) #define PREFETCHSIZE (8 * 21 + 6) #define PREFETCH prefetcht0 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif movq OLD_M, M movq OLD_N, N movq OLD_K, K movq OLD_LDC, LDC movq OLD_OFFSET, KK subq $-16 * SIZE, A subq $-16 * SIZE, B leaq (, LDC, SIZE), LDC movq KK, OFFSET negq KK #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $2, J NOBRANCH jle .L40 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 leaq (C, 
LDC, 1), CO2 #ifndef RT leaq (C, LDC, 4), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 2, %rax leaq (B, %rax), BB #ifdef LT movq OFFSET, %rax movq %rax, KK #endif testq $1, M BRANCH jle .L20 ALIGN_4 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movsd -16 * SIZE(AO), %xmm0 movaps -16 * SIZE(BO), %xmm2 movaps -14 * SIZE(BO), %xmm3 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -15 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 addpd %xmm3, %xmm9 movaps -10 * SIZE(BO), %xmm3 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm10 movaps -8 * SIZE(BO), %xmm2 addpd %xmm3, %xmm11 movaps -6 * SIZE(BO), %xmm3 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -13 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -4 * SIZE(BO), %xmm2 addpd %xmm3, %xmm9 movaps -2 * SIZE(BO), %xmm3 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -12 * SIZE(AO), %xmm0 addpd %xmm2, %xmm10 movaps 0 * SIZE(BO), %xmm2 addpd %xmm3, %xmm11 movaps 2 * SIZE(BO), %xmm3 subq $ -4 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -15 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 addpd %xmm3, %xmm9 movaps -10 * SIZE(BO), %xmm3 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #endif addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(LT) movaps -16 * SIZE(BO), %xmm12 movaps -14 * SIZE(BO), %xmm13 #else movaps -16 * SIZE(AO), %xmm12 movaps -14 * SIZE(AO), %xmm13 #endif subpd %xmm8, %xmm12 subpd %xmm9, %xmm13 #if defined(RN) || defined(RT) movhlps %xmm13, %xmm15 movsd %xmm13, %xmm14 movhlps %xmm12, %xmm13 movsd %xmm12, %xmm12 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm12 mulpd %xmm8, %xmm13 #endif #ifdef RN mulsd -16 * SIZE(BO), %xmm12 movlpd -15 * SIZE(BO), %xmm9 mulsd %xmm12, %xmm9 subsd %xmm9, %xmm13 movlpd -14 * SIZE(BO), %xmm10 mulsd %xmm12, %xmm10 subsd %xmm10, %xmm14 movlpd -13 * SIZE(BO), %xmm11 mulsd %xmm12, %xmm11 subsd %xmm11, %xmm15 mulsd -11 * SIZE(BO), %xmm13 movlpd -10 * SIZE(BO), %xmm9 mulsd %xmm13, %xmm9 subsd %xmm9, %xmm14 movlpd -9 * SIZE(BO), %xmm10 mulsd %xmm13, %xmm10 subsd %xmm10, %xmm15 mulsd -6 * SIZE(BO), %xmm14 movlpd -5 * SIZE(BO), %xmm9 mulsd %xmm14, %xmm9 subsd %xmm9, %xmm15 mulsd -1 * SIZE(BO), %xmm15 #endif #ifdef RT mulsd -1 * SIZE(BO), %xmm15 movlpd -2 * SIZE(BO), %xmm9 mulsd %xmm15, %xmm9 subsd %xmm9, %xmm14 movlpd -3 * SIZE(BO), %xmm10 mulsd %xmm15, %xmm10 subsd %xmm10, %xmm13 movlpd -4 * SIZE(BO), %xmm11 mulsd %xmm15, %xmm11 subsd %xmm11, %xmm12 mulsd -6 * SIZE(BO), 
%xmm14 movlpd -7 * SIZE(BO), %xmm9 mulsd %xmm14, %xmm9 subsd %xmm9, %xmm13 movlpd -8 * SIZE(BO), %xmm10 mulsd %xmm14, %xmm10 subsd %xmm10, %xmm12 mulsd -11 * SIZE(BO), %xmm13 movlpd -12 * SIZE(BO), %xmm9 mulsd %xmm13, %xmm9 subsd %xmm9, %xmm12 mulsd -16 * SIZE(BO), %xmm12 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm12, 0 * SIZE(CO1) movhps %xmm12, 0 * SIZE(CO2) movsd %xmm13, 0 * SIZE(CO1, LDC, 2) movhps %xmm13, 0 * SIZE(CO2, LDC, 2) movaps %xmm12, -16 * SIZE(BO) movaps %xmm13, -14 * SIZE(BO) #else movsd %xmm12, 0 * SIZE(CO1) movsd %xmm13, 0 * SIZE(CO2) movsd %xmm14, 0 * SIZE(CO1, LDC, 2) movsd %xmm15, 0 * SIZE(CO2, LDC, 2) movsd %xmm12, -16 * SIZE(AO) movsd %xmm13, -15 * SIZE(AO) movsd %xmm14, -14 * SIZE(AO) movsd %xmm15, -13 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L20: testq $2, M BRANCH jle .L30 ALIGN_4 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm3, %xmm3 movaps -16 * SIZE(BO), %xmm2 pxor %xmm5, %xmm5 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_4 .L22: addpd %xmm3, %xmm11 movaps -14 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 addpd %xmm2, %xmm9 movaps -12 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -14 * SIZE(AO), %xmm0 addpd %xmm3, %xmm11 movaps -10 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 addpd %xmm2, %xmm9 movaps -8 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 addpd %xmm3, %xmm11 movaps -6 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 addpd %xmm2, %xmm9 movaps -4 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -10 * SIZE(AO), %xmm0 addpd %xmm3, %xmm11 movaps -2 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 subq $ -8 * SIZE, AO addpd %xmm2, %xmm9 movaps 0 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: addpd %xmm3, %xmm11 movaps -14 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 addpd %xmm2, %xmm9 movaps -12 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_4 .L28: #if defined(LN) || 
defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif addpd %xmm3, %xmm11 addpd %xmm5, %xmm10 movapd %xmm8, %xmm0 movsd %xmm9, %xmm8 movsd %xmm0, %xmm9 movapd %xmm10, %xmm0 movsd %xmm11, %xmm10 movsd %xmm0, %xmm11 #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm10, %xmm2 unpcklpd %xmm11, %xmm10 unpckhpd %xmm11, %xmm2 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 movapd -12 * SIZE(BO), %xmm13 movapd -10 * SIZE(BO), %xmm15 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm0, %xmm13 subpd %xmm2, %xmm15 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm2 movapd -12 * SIZE(AO), %xmm4 movapd -10 * SIZE(AO), %xmm6 subpd %xmm8, %xmm0 subpd %xmm9, %xmm2 subpd %xmm10, %xmm4 subpd %xmm11, %xmm6 #endif #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -14 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm11 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 movddup -15 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm15 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -14 * SIZE(BO), %xmm10 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm4 movddup -13 * SIZE(BO), %xmm11 mulpd %xmm0, %xmm11 subpd %xmm11, %xmm6 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 movddup -10 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm4 movddup -9 * SIZE(BO), %xmm10 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm6 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 movddup -5 * SIZE(BO), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm6 movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 #endif #ifdef RT movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 movddup -2 * SIZE(BO), %xmm9 mulpd %xmm6, %xmm9 subpd %xmm9, %xmm4 movddup -3 * SIZE(BO), %xmm10 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm2 movddup -4 * SIZE(BO), %xmm11 mulpd %xmm6, %xmm11 subpd %xmm11, %xmm0 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 movddup -7 * SIZE(BO), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm2 movddup -8 * SIZE(BO), %xmm10 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm0 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 movddup -12 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movsd %xmm11, 0 * SIZE(CO1, LDC, 2) movsd %xmm15, 1 * SIZE(CO1, LDC, 2) movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movsd %xmm4, 0 * SIZE(CO1, LDC, 2) movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) movsd %xmm6, 0 * SIZE(CO2, LDC, 2) movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm11, -14 * SIZE(BO) movapd %xmm13, -12 * SIZE(BO) movapd %xmm15, -10 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm2, -14 * 
SIZE(AO) movapd %xmm4, -12 * SIZE(AO) movapd %xmm6, -10 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: movq M, I sarq $2, I # i = (m >> 2) NOBRANCH jle .L39 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #else movq B, BO #endif prefetcht2 -16 * SIZE(BB) subq $-8 * SIZE, BB movaps -16 * SIZE(AO), %xmm0 pxor %xmm3, %xmm3 movaps -14 * SIZE(AO), %xmm1 pxor %xmm4, %xmm4 movaps -16 * SIZE(BO), %xmm2 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 #ifdef LN prefetcht0 -4 * SIZE(CO1) movapd %xmm4, %xmm8 movapd %xmm4, %xmm9 prefetcht0 -4 * SIZE(CO2) movapd %xmm4, %xmm10 movapd %xmm4, %xmm11 prefetcht0 -4 * SIZE(CO1, LDC, 2) movapd %xmm4, %xmm12 movapd %xmm4, %xmm13 prefetcht0 -4 * SIZE(CO2, LDC, 2) movapd %xmm4, %xmm14 movapd %xmm4, %xmm15 #else prefetcht0 3 * SIZE(CO1) movapd %xmm4, %xmm8 movapd %xmm4, %xmm9 prefetcht0 3 * SIZE(CO2) movapd %xmm4, %xmm10 movapd %xmm4, %xmm11 prefetcht0 3 * SIZE(CO1, LDC, 2) movapd %xmm4, %xmm12 movapd %xmm4, %xmm13 prefetcht0 3 * SIZE(CO2, LDC, 2) movapd %xmm4, %xmm14 movapd %xmm4, %xmm15 #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax NOBRANCH jle .L15 ALIGN_3 .L12: addpd %xmm3, %xmm11 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps -12 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps -10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps -8 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -6 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps -6 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps -4 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -2 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps -2 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, 
%xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps 0 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 2 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps 2 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps 4 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 6 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps 6 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps 8 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 10 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps 10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps 12 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 14 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps 14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps 16 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 subq $-32 * SIZE, AO addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -16 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -14 * SIZE(AO), %xmm1 subq $-32 * SIZE, BO subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addpd %xmm3, %xmm11 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps -12 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: 
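/*
 * At .L18 the accumulation loop for this 4x4 tile is finished.  For LN/RT builds
 * the code below first repositions AO/BO to the sub-panels that contain the
 * diagonal block; all variants then subtract the accumulated products from the
 * packed operand, perform the 4x4 forward/backward substitution selected by
 * LN/LT/RN/RT, and store the solved tile both to C (CO1/CO2 and the LDC*2
 * offsets) and back into the packed panel so later tiles see the updated values.
 *
 * A minimal scalar sketch of the substitution step, kept here purely as a
 * reading aid (hypothetical reference code, not part of OpenBLAS; it assumes, as
 * the packing routines are expected to guarantee, that the diagonal entries
 * already hold their reciprocals).  It mirrors the LT-style path; the other
 * variants traverse the same recurrence in the opposite direction or apply it
 * from the right-hand side.
 *
 *   static void trsm_tile_ref(int nr, const double a[4][4], double x[4][4])
 *   {
 *       // forward substitution of a lower-triangular 4x4 block against nr columns
 *       for (int i = 0; i < 4; i++)
 *           for (int j = 0; j < nr; j++) {
 *               double t = x[i][j] * a[i][i];   // a[i][i] = 1 / L(i,i), pre-inverted
 *               x[i][j] = t;
 *               for (int k = i + 1; k < 4; k++)
 *                   x[k][j] -= a[k][i] * t;     // eliminate entries below the diagonal
 *           }
 *   }
 */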
#if defined(LN) || defined(RT) movq KK, %rax subq $4, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif addpd %xmm3, %xmm11 addpd %xmm4, %xmm15 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movapd %xmm8, %xmm0 movsd %xmm9, %xmm8 movsd %xmm0, %xmm9 movapd %xmm10, %xmm0 movsd %xmm11, %xmm10 movsd %xmm0, %xmm11 movapd %xmm12, %xmm0 movsd %xmm13, %xmm12 movsd %xmm0, %xmm13 movapd %xmm14, %xmm0 movsd %xmm15, %xmm14 movsd %xmm0, %xmm15 #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm10, %xmm2 unpcklpd %xmm11, %xmm10 unpckhpd %xmm11, %xmm2 movapd %xmm12, %xmm4 unpcklpd %xmm13, %xmm12 unpckhpd %xmm13, %xmm4 movapd %xmm14, %xmm6 unpcklpd %xmm15, %xmm14 unpckhpd %xmm15, %xmm6 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 movapd -12 * SIZE(BO), %xmm13 movapd -10 * SIZE(BO), %xmm15 movapd -8 * SIZE(BO), %xmm1 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm5 movapd -2 * SIZE(BO), %xmm7 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm0, %xmm13 subpd %xmm2, %xmm15 subpd %xmm12, %xmm1 subpd %xmm14, %xmm3 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 movapd -8 * SIZE(AO), %xmm4 movapd -6 * SIZE(AO), %xmm5 movapd -4 * SIZE(AO), %xmm6 movapd -2 * SIZE(AO), %xmm7 subpd %xmm8, %xmm0 subpd %xmm12, %xmm1 subpd %xmm9, %xmm2 subpd %xmm13, %xmm3 subpd %xmm10, %xmm4 subpd %xmm14, %xmm5 subpd %xmm11, %xmm6 subpd %xmm15, %xmm7 #endif #ifdef LN movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 movddup -2 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm1 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm3 movddup -3 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm15 movddup -4 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm11 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 movddup -7 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm15 movddup -8 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm11 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -12 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm11 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 movddup -15 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm15 movddup -14 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm1 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm3 movddup -13 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm5 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm7 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -10 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm1 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm3 movddup -9 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm5 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm7 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 movddup -5 
* SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm5 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm7 movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 movddup -15 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm2 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm3 movddup -14 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm4 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm5 movddup -13 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm6 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm7 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -10 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm4 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm5 movddup -9 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm6 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm7 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm5 movddup -5 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm6 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm7 movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 mulpd %xmm8, %xmm7 #endif #ifdef RT movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 mulpd %xmm8, %xmm7 movddup -2 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm4 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm5 movddup -3 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm2 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm3 movddup -4 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm0 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm1 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm5 movddup -7 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm2 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm3 movddup -8 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm0 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm1 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -12 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm0 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm1 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movsd %xmm5, 3 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movhpd %xmm1, 2 * SIZE(CO2) movhpd %xmm5, 3 * SIZE(CO2) movsd %xmm11, 0 * SIZE(CO1, LDC, 2) movsd %xmm15, 1 * SIZE(CO1, LDC, 2) movsd %xmm3, 2 * SIZE(CO1, LDC, 2) movsd %xmm7, 3 * SIZE(CO1, LDC, 2) movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movsd %xmm3, 2 * SIZE(CO2) movhpd %xmm3, 3 * SIZE(CO2) movsd %xmm4, 0 * SIZE(CO1, LDC, 2) movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) movsd %xmm5, 2 * SIZE(CO1, LDC, 2) movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) movsd %xmm6, 0 * SIZE(CO2, LDC, 2) movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) movsd %xmm7, 2 * SIZE(CO2, LDC, 2) movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm11, -14 * 
SIZE(BO) movapd %xmm13, -12 * SIZE(BO) movapd %xmm15, -10 * SIZE(BO) movapd %xmm1, -8 * SIZE(BO) movapd %xmm3, -6 * SIZE(BO) movapd %xmm5, -4 * SIZE(BO) movapd %xmm7, -2 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm1, -14 * SIZE(AO) movapd %xmm2, -12 * SIZE(AO) movapd %xmm3, -10 * SIZE(AO) movapd %xmm4, -8 * SIZE(AO) movapd %xmm5, -6 * SIZE(AO) movapd %xmm6, -4 * SIZE(AO) movapd %xmm7, -2 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- BRANCH jg .L11 ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif subq $1, J BRANCH jg .L01 ALIGN_4 .L40: testq $2, N BRANCH jle .L80 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 1, %rax leaq (B, %rax), BB #ifdef LT movq OFFSET, %rax movq %rax, KK #endif testq $1, M BRANCH jle .L60 ALIGN_4 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movsd -16 * SIZE(AO), %xmm0 movaps -16 * SIZE(BO), %xmm2 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -15 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -14 * SIZE(BO), %xmm2 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 movaps -12 * SIZE(BO), %xmm2 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -13 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -10 * SIZE(BO), %xmm2 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -12 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 movaps -8 * SIZE(BO), %xmm2 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -15 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -14 * SIZE(BO), %xmm2 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L76 ALIGN_4 .L78: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif addpd %xmm9, %xmm8 movhlps %xmm8, %xmm9 #if defined(LN) || defined(LT) movsd -16 * SIZE(BO), %xmm12 movsd -15 * SIZE(BO), %xmm13 #else movsd -16 * SIZE(AO), %xmm12 movsd -15 * SIZE(AO), %xmm13 #endif subsd %xmm8, %xmm12 subsd %xmm9, %xmm13 #ifdef LN movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm12 mulsd %xmm8, %xmm13 #endif #ifdef LT movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm12 mulsd %xmm8, %xmm13 #endif #ifdef RN mulsd -16 * SIZE(BO), %xmm12 movsd -15 * SIZE(BO), %xmm9 mulsd 
%xmm12, %xmm9 subsd %xmm9, %xmm13 mulsd -13 * SIZE(BO), %xmm13 #endif #ifdef RT mulsd -13 * SIZE(BO), %xmm13 movlpd -14 * SIZE(BO), %xmm9 mulsd %xmm13, %xmm9 subsd %xmm9, %xmm12 mulsd -16 * SIZE(BO), %xmm12 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm12, 0 * SIZE(CO1) movsd %xmm13, 0 * SIZE(CO2) #if defined(LN) || defined(LT) movsd %xmm12, -16 * SIZE(BO) movsd %xmm13, -15 * SIZE(BO) #else movsd %xmm12, -16 * SIZE(AO) movsd %xmm13, -15 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L60: testq $2, M BRANCH jle .L70 ALIGN_4 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 movaps -16 * SIZE(BO), %xmm2 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_4 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 addpd %xmm7, %xmm8 movaps -14 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 addpd %xmm2, %xmm11 addpd %xmm7, %xmm10 movaps -12 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -10 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 addpd %xmm7, %xmm8 movaps -10 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -8 * SIZE(AO), %xmm0 addpd %xmm2, %xmm11 addpd %xmm7, %xmm10 movaps -8 * SIZE(BO), %xmm2 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 addpd %xmm7, %xmm8 movaps -14 * SIZE(BO), %xmm2 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_4 .L68: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 movapd %xmm8, %xmm0 movsd %xmm9, %xmm8 movsd %xmm0, %xmm9 #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm13 subpd %xmm8, %xmm9 subpd %xmm0, %xmm13 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm2 subpd %xmm8, %xmm0 subpd %xmm9, %xmm2 #endif #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -14 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(BO), 
%xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 #endif #ifdef RT movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 movddup -14 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm13, -14 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm2, -14 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L70: movq M, I sarq $2, I # i = (m >> 2) NOBRANCH jle .L79 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #else movq B, BO #endif prefetcht2 -16 * SIZE(BB) subq $-4 * SIZE, BB movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 movaps -16 * SIZE(BO), %xmm2 #ifdef LN prefetcht0 -4 * SIZE(CO1) pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 prefetcht0 -4 * SIZE(CO2) pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 #else prefetcht0 3 * SIZE(CO1) pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 prefetcht0 3 * SIZE(CO2) pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_4 .L52: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -14 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -6 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -12 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -2 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -10 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 2 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -8 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax BRANCH jg .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, 
%xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -14 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_4 .L58: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif movapd %xmm8, %xmm0 movsd %xmm9, %xmm8 movsd %xmm0, %xmm9 movapd %xmm12, %xmm0 movsd %xmm13, %xmm12 movsd %xmm0, %xmm13 #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm12, %xmm4 unpcklpd %xmm13, %xmm12 unpckhpd %xmm13, %xmm4 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm13 movapd -12 * SIZE(BO), %xmm1 movapd -10 * SIZE(BO), %xmm5 subpd %xmm8, %xmm9 subpd %xmm0, %xmm13 subpd %xmm12, %xmm1 subpd %xmm4, %xmm5 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 subpd %xmm8, %xmm0 subpd %xmm12, %xmm1 subpd %xmm9, %xmm2 subpd %xmm13, %xmm3 #endif #ifdef LN movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 movddup -2 * SIZE(AO), %xmm10 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm1 movddup -3 * SIZE(AO), %xmm12 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm13 movddup -4 * SIZE(AO), %xmm14 mulpd %xmm5, %xmm14 subpd %xmm14, %xmm9 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 movddup -7 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm13 movddup -8 * SIZE(AO), %xmm12 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm9 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -12 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -14 * SIZE(AO), %xmm12 mulpd %xmm9, %xmm12 subpd %xmm12, %xmm1 movddup -13 * SIZE(AO), %xmm14 mulpd %xmm9, %xmm14 subpd %xmm14, %xmm5 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -10 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm1 movddup -9 * SIZE(AO), %xmm12 mulpd %xmm13, %xmm12 subpd %xmm12, %xmm5 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 movddup -5 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm5 movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 movddup -15 * SIZE(BO), %xmm9 movapd %xmm9, %xmm10 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm3 movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 #endif #ifdef RT movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -14 * SIZE(BO), %xmm9 movapd %xmm9, %xmm10 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 mulpd %xmm3, %xmm10 subpd %xmm10, %xmm1 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movsd %xmm5, 3 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movhpd %xmm1, 2 * SIZE(CO2) movhpd %xmm5, 3 * SIZE(CO2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movsd %xmm3, 2 * SIZE(CO2) movhpd %xmm3, 3 * 
SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm13, -14 * SIZE(BO) movapd %xmm1, -12 * SIZE(BO) movapd %xmm5, -10 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm1, -14 * SIZE(AO) movapd %xmm2, -12 * SIZE(AO) movapd %xmm3, -10 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L51 ALIGN_4 .L79: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L80: testq $1, N BRANCH jle .L999 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif testq $1, M BRANCH jle .L90 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movsd -16 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L115 ALIGN_4 .L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -15 * SIZE(AO), %xmm0 movsd -15 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -14 * SIZE(AO), %xmm0 movsd -14 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -13 * SIZE(AO), %xmm0 movsd -13 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -12 * SIZE(AO), %xmm0 movsd -12 * SIZE(BO), %xmm2 subq $-4 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L112 ALIGN_4 .L115: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -15 * SIZE(AO), %xmm0 movsd -15 * SIZE(BO), %xmm2 addq $1 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L116 ALIGN_4 .L118: #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif addpd %xmm9, %xmm8 #if defined(LN) || defined(LT) movsd -16 * SIZE(BO), %xmm10 subsd %xmm8, %xmm10 #else movsd -16 * SIZE(AO), %xmm10 subsd %xmm8, %xmm10 #endif #ifdef LN movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 #endif #ifdef LT movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 #endif #ifdef RN movsd -16 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm10 #endif #ifdef RT movsd -16 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm10 #endif #ifdef LN subq $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm10, 0 * SIZE(CO1) #else movsd %xmm10, 0 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movsd %xmm10, -16 * SIZE(BO) #else movsd %xmm10, -16 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, 
%rax addq %rax, AORIG #endif ALIGN_4 .L90: testq $2, M BRANCH jle .L110 ALIGN_4 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movsd -16 * SIZE(BO), %xmm2 pxor %xmm9, %xmm9 movhps -15 * SIZE(BO), %xmm2 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L105 ALIGN_4 .L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 movsd -15 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -14 * SIZE(AO), %xmm0 addpd %xmm3, %xmm8 pshufd $0x44, %xmm2, %xmm3 movsd -14 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -12 * SIZE(AO), %xmm0 addpd %xmm3, %xmm9 pshufd $0x44, %xmm2, %xmm3 movsd -13 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -10 * SIZE(AO), %xmm0 addpd %xmm3, %xmm8 pshufd $0x44, %xmm2, %xmm3 movsd -12 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -8 * SIZE(AO), %xmm0 addpd %xmm3, %xmm9 subq $-8 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L102 ALIGN_4 .L105: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L108 ALIGN_4 .L106: pshufd $0x44, %xmm2, %xmm3 movsd -15 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -14 * SIZE(AO), %xmm0 addpd %xmm3, %xmm8 addq $2 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L106 ALIGN_4 .L108: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif addpd %xmm9, %xmm8 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm10 subpd %xmm8, %xmm10 #else movapd -16 * SIZE(AO), %xmm10 subpd %xmm8, %xmm10 #endif #ifdef LN movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movsd -13 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -14 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm10 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 unpcklpd %xmm8, %xmm10 #endif #ifdef LT movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 movsd -15 * SIZE(AO), %xmm13 mulsd %xmm10, %xmm13 subsd %xmm13, %xmm8 movsd -13 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 unpcklpd %xmm8, %xmm10 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 #endif #ifdef RT movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) #else movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm10, -16 * SIZE(BO) #else movapd %xmm10, -16 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L110: movq M, I sarq $2, I # i = (m >> 2) NOBRANCH jle .L119 ALIGN_4 .L91: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 movsd -16 * SIZE(BO), %xmm2 #ifdef LN prefetcht0 -4 * SIZE(CO1) 
#else prefetcht0 3 * SIZE(CO1) #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L95 ALIGN_4 .L92: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -15 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps -10 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -14 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps -6 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -13 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps -2 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -12 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps 2 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 subq $-16 * SIZE, AO subq $ -4 * SIZE, BO subq $1, %rax BRANCH jg .L92 ALIGN_4 .L95: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -15 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps -10 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 addq $4 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L96 ALIGN_4 .L98: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm10 movapd -14 * SIZE(BO), %xmm11 subpd %xmm8, %xmm10 subpd %xmm12, %xmm11 #else movapd -16 * SIZE(AO), %xmm10 movapd -14 * SIZE(AO), %xmm11 subpd %xmm8, %xmm10 subpd %xmm12, %xmm11 #endif #ifdef LN movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movapd %xmm11, %xmm9 unpckhpd %xmm9, %xmm9 movsd -1 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm9 movsd -2 * SIZE(AO), %xmm13 mulsd %xmm9, %xmm13 subsd %xmm13, %xmm11 movsd -3 * SIZE(AO), %xmm14 mulsd %xmm9, %xmm14 subsd %xmm14, %xmm8 movsd -4 * SIZE(AO), %xmm15 mulsd %xmm9, %xmm15 subsd %xmm15, %xmm10 movsd -6 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm11 movsd -7 * SIZE(AO), %xmm13 mulsd %xmm11, %xmm13 subsd %xmm13, %xmm8 movsd -8 * SIZE(AO), %xmm14 mulsd %xmm11, %xmm14 subsd %xmm14, %xmm10 movsd -11 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -12 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm10 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 unpcklpd %xmm8, %xmm10 unpcklpd %xmm9, %xmm11 #endif #ifdef LT movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movapd %xmm11, %xmm9 unpckhpd %xmm9, %xmm9 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 movsd -15 * SIZE(AO), %xmm13 mulsd %xmm10, %xmm13 subsd %xmm13, %xmm8 movsd -14 * SIZE(AO), %xmm14 mulsd %xmm10, %xmm14 subsd %xmm14, %xmm11 movsd -13 * SIZE(AO), %xmm15 mulsd %xmm10, %xmm15 subsd %xmm15, %xmm9 movsd -11 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -10 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm11 movsd -9 * SIZE(AO), %xmm14 mulsd %xmm8, %xmm14 subsd %xmm14, %xmm9 movsd -6 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm11 movsd -5 * SIZE(AO), %xmm13 mulsd %xmm11, %xmm13 subsd 
%xmm13, %xmm9 movsd -1 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm9 unpcklpd %xmm8, %xmm10 unpcklpd %xmm9, %xmm11 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 mulpd %xmm8, %xmm11 #endif #ifdef RT movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 mulpd %xmm8, %xmm11 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) movsd %xmm11, 2 * SIZE(CO1) movhpd %xmm11, 3 * SIZE(CO1) #else movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) movsd %xmm11, 2 * SIZE(CO1) movhpd %xmm11, 3 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm10, -16 * SIZE(BO) movapd %xmm11, -14 * SIZE(BO) #else movapd %xmm10, -16 * SIZE(AO) movapd %xmm11, -14 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L91 ALIGN_4 .L119: #ifdef LN leaq (B, K, SIZE), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_LN_4x4_sse2.S000066400000000000000000002332641313527062700220710ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %r13 #define BO %r14 #define CO1 %r15 #define CO2 %rbp #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define ALPHA 0(%rsp) #define OFFSET 16(%rsp) #define KK 24(%rsp) #define KKK 32(%rsp) #define AORIG 40(%rsp) #define BORIG 48(%rsp) #define BUFFER 128(%rsp) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta #ifndef ALLOC_HUGETLB #define PREFETCHSIZE (8 * 4 + 4) #else #define PREFETCHSIZE (8 * 2 + 4) #endif #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHNTA prefetchnta #define PREFETCHSIZE (8 * 4 + 4) #endif #define KERNEL1(xx) \ mulpd %xmm8, %xmm9 ;\ addpd %xmm9, %xmm0 ;\ movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm8, %xmm11 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ addpd %xmm11, %xmm1 ;\ movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm8, %xmm13 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ addpd %xmm13, %xmm2 ;\ movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm8, %xmm3 ;\ movapd 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 #define KERNEL2(xx) \ mulpd %xmm10, %xmm9 ;\ addpd %xmm9, %xmm4 ;\ movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm10, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm10, %xmm13 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ addpd %xmm13, %xmm6 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm10, %xmm7 ;\ movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 #define KERNEL3(xx) \ mulpd %xmm12, %xmm15 ;\ addpd %xmm15, %xmm0 ;\ movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm12, %xmm11 ;\ addpd %xmm11, %xmm1 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm12, %xmm13 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ addpd %xmm13, %xmm2 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm12, %xmm3 ;\ movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 #define KERNEL4(xx) \ mulpd %xmm14, %xmm15 ;\ addpd %xmm15, %xmm4 ;\ movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm14, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm14, %xmm13 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ addpd %xmm13, %xmm6 ;\ movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm14, %xmm7 ;\ movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 #define KERNEL5(xx) \ mulpd %xmm8, %xmm9 ;\ addpd %xmm9, %xmm0 ;\ movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm8, %xmm11 ;\ PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ addpd %xmm11, %xmm1 ;\ movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm8, %xmm13 ;\ mulpd 22 * SIZE + 2 * (xx) 
* SIZE(BO), %xmm8 ;\ addpd %xmm13, %xmm2 ;\ movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm8, %xmm3 ;\ movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 #define KERNEL6(xx) \ mulpd %xmm10, %xmm9 ;\ addpd %xmm9, %xmm4 ;\ movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm10, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm10, %xmm13 ;\ mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ addpd %xmm13, %xmm6 ;\ movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm10, %xmm7 ;\ movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 #define KERNEL7(xx) \ mulpd %xmm12, %xmm15 ;\ addpd %xmm15, %xmm0 ;\ movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm12, %xmm11 ;\ addpd %xmm11, %xmm1 ;\ movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm12, %xmm13 ;\ mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ addpd %xmm13, %xmm2 ;\ movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm12, %xmm3 ;\ movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 #define KERNEL8(xx) \ mulpd %xmm14, %xmm15 ;\ addpd %xmm15, %xmm4 ;\ movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm14, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm14, %xmm13 ;\ mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ addpd %xmm13, %xmm6 ;\ movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm14, %xmm7 ;\ movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 movaps %xmm3, %xmm0 #else movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 #endif movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movsd %xmm4, OFFSET movsd %xmm4, KK leaq (, LDC, SIZE), LDC #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $2, J # j = (n >> 2) jle .L40 .L01: /* Copying to Sub Buffer */ #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG leaq (, %rax, SIZE), %rax leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L03 addq %rax, %rax ALIGN_4 .L02: PREFETCHNTA 40 * SIZE(B) movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 addq $16 * SIZE, BO addq $ 8 * SIZE, B movsd %xmm0, -16 * SIZE(BO) movsd %xmm0, -15 * SIZE(BO) movsd %xmm1, -14 * SIZE(BO) movsd %xmm1, -13 * SIZE(BO) movsd %xmm2, -12 * SIZE(BO) movsd %xmm2, -11 * SIZE(BO) movsd 
%xmm3, -10 * SIZE(BO) movsd %xmm3, -9 * SIZE(BO) movsd %xmm4, -8 * SIZE(BO) movsd %xmm4, -7 * SIZE(BO) movsd %xmm5, -6 * SIZE(BO) movsd %xmm5, -5 * SIZE(BO) movsd %xmm6, -4 * SIZE(BO) movsd %xmm6, -3 * SIZE(BO) movsd %xmm7, -2 * SIZE(BO) movsd %xmm7, -1 * SIZE(BO) decq %rax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd %xmm0, 0 * SIZE(BO) movsd %xmm0, 1 * SIZE(BO) movsd %xmm1, 2 * SIZE(BO) movsd %xmm1, 3 * SIZE(BO) movsd %xmm2, 4 * SIZE(BO) movsd %xmm2, 5 * SIZE(BO) movsd %xmm3, 6 * SIZE(BO) movsd %xmm3, 7 * SIZE(BO) addq $4 * SIZE, B addq $8 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L10: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 4), C #endif testq $1, M je .L20 ALIGN_4 .L31: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movsd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movsd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movsd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movsd 16 * SIZE(BO), %xmm13 movsd 24 * SIZE(BO), %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L35 ALIGN_4 .L32: mulsd %xmm8, %xmm9 addsd %xmm9, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd 2 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm9 addsd %xmm9, %xmm1 movsd 4 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm9 mulsd 6 * SIZE(BO), %xmm8 addsd %xmm9, %xmm2 movsd 32 * SIZE(BO), %xmm9 addsd %xmm8, %xmm3 movsd 1 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm11 addsd %xmm11, %xmm0 movsd 10 * SIZE(BO), %xmm11 mulsd %xmm8, %xmm11 addsd %xmm11, %xmm1 movsd 12 * SIZE(BO), %xmm11 mulsd %xmm8, %xmm11 mulsd 14 * SIZE(BO), %xmm8 addsd %xmm11, %xmm2 movsd 40 * SIZE(BO), %xmm11 addsd %xmm8, %xmm3 movsd 2 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm13 addsd %xmm13, %xmm0 movsd 18 * SIZE(BO), %xmm13 mulsd %xmm8, %xmm13 addsd %xmm13, %xmm1 movsd 20 * SIZE(BO), %xmm13 mulsd %xmm8, %xmm13 mulsd 22 * SIZE(BO), %xmm8 addsd %xmm13, %xmm2 movsd 48 * SIZE(BO), %xmm13 addsd %xmm8, %xmm3 movsd 3 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm15 addsd %xmm15, %xmm0 movsd 26 * SIZE(BO), %xmm15 mulsd %xmm8, %xmm15 addsd %xmm15, %xmm1 movsd 28 * SIZE(BO), %xmm15 mulsd %xmm8, %xmm15 mulsd 30 * SIZE(BO), %xmm8 addsd %xmm15, %xmm2 movsd 56 * SIZE(BO), %xmm15 addsd %xmm8, %xmm3 movsd 4 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm9 addsd %xmm9, %xmm0 movsd 34 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm9 addsd %xmm9, %xmm1 movsd 36 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm9 mulsd 38 * SIZE(BO), %xmm8 addsd %xmm9, %xmm2 movsd 64 * SIZE(BO), %xmm9 addsd %xmm8, %xmm3 movsd 5 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm11 addsd %xmm11, %xmm0 movsd 42 * SIZE(BO), %xmm11 mulsd %xmm8, %xmm11 addsd %xmm11, %xmm1 movsd 44 * SIZE(BO), %xmm11 mulsd %xmm8, %xmm11 mulsd 46 * SIZE(BO), %xmm8 addsd %xmm11, %xmm2 movsd 72 * SIZE(BO), %xmm11 addsd %xmm8, %xmm3 movsd 6 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm13 addsd %xmm13, %xmm0 movsd 50 * SIZE(BO), %xmm13 mulsd %xmm8, %xmm13 addsd %xmm13, %xmm1 movsd 52 * SIZE(BO), %xmm13 mulsd %xmm8, 
%xmm13 mulsd 54 * SIZE(BO), %xmm8 addsd %xmm13, %xmm2 movsd 80 * SIZE(BO), %xmm13 addsd %xmm8, %xmm3 movsd 7 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm15 addsd %xmm15, %xmm0 movsd 58 * SIZE(BO), %xmm15 mulsd %xmm8, %xmm15 addsd %xmm15, %xmm1 movsd 60 * SIZE(BO), %xmm15 mulsd %xmm8, %xmm15 mulsd 62 * SIZE(BO), %xmm8 addsd %xmm15, %xmm2 movsd 88 * SIZE(BO), %xmm15 addsd %xmm8, %xmm3 movsd 8 * SIZE(AO), %xmm8 addq $ 8 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulsd %xmm8, %xmm9 addsd %xmm9, %xmm0 movsd 2 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm9 addsd %xmm9, %xmm1 movsd 4 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm9 mulsd 6 * SIZE(BO), %xmm8 addsd %xmm9, %xmm2 movsd 8 * SIZE(BO), %xmm9 addsd %xmm8, %xmm3 movsd 1 * SIZE(AO), %xmm8 addq $1 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm4 movsd 1 * SIZE(B), %xmm5 movsd 2 * SIZE(B), %xmm6 movsd 3 * SIZE(B), %xmm7 #else movsd 0 * SIZE(AO), %xmm4 movsd 1 * SIZE(AO), %xmm5 movsd 2 * SIZE(AO), %xmm6 movsd 3 * SIZE(AO), %xmm7 #endif subsd %xmm0, %xmm4 subsd %xmm1, %xmm5 subsd %xmm2, %xmm6 subsd %xmm3, %xmm7 #ifdef LN movsd 0 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 mulsd %xmm0, %xmm6 mulsd %xmm0, %xmm7 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 mulsd %xmm0, %xmm6 mulsd %xmm0, %xmm7 #endif #ifdef RN mulsd 0 * SIZE(B), %xmm4 movlpd 1 * SIZE(B), %xmm1 mulsd %xmm4, %xmm1 subsd %xmm1, %xmm5 movlpd 2 * SIZE(B), %xmm2 mulsd %xmm4, %xmm2 subsd %xmm2, %xmm6 movlpd 3 * SIZE(B), %xmm3 mulsd %xmm4, %xmm3 subsd %xmm3, %xmm7 mulsd 5 * SIZE(B), %xmm5 movlpd 6 * SIZE(B), %xmm1 mulsd %xmm5, %xmm1 subsd %xmm1, %xmm6 movlpd 7 * SIZE(B), %xmm2 mulsd %xmm5, %xmm2 subsd %xmm2, %xmm7 mulsd 10 * SIZE(B), %xmm6 movlpd 11 * SIZE(B), %xmm1 mulsd %xmm6, %xmm1 subsd %xmm1, %xmm7 mulsd 15 * SIZE(B), %xmm7 #endif #ifdef RT mulsd 15 * SIZE(B), %xmm7 movlpd 14 * SIZE(B), %xmm1 mulsd %xmm7, %xmm1 subsd %xmm1, %xmm6 movlpd 13 * SIZE(B), %xmm2 mulsd %xmm7, %xmm2 subsd %xmm2, %xmm5 movlpd 12 * SIZE(B), %xmm3 mulsd %xmm7, %xmm3 subsd %xmm3, %xmm4 mulsd 10 * SIZE(B), %xmm6 movlpd 9 * SIZE(B), %xmm1 mulsd %xmm6, %xmm1 subsd %xmm1, %xmm5 movlpd 8 * SIZE(B), %xmm2 mulsd %xmm6, %xmm2 subsd %xmm2, %xmm4 mulsd 5 * SIZE(B), %xmm5 movlpd 4 * SIZE(B), %xmm1 mulsd %xmm5, %xmm1 subsd %xmm1, %xmm4 mulsd 0 * SIZE(B), %xmm4 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm4, 0 * SIZE(CO1) movsd %xmm5, 0 * SIZE(CO2) movsd %xmm6, 0 * SIZE(CO1, LDC, 2) movsd %xmm7, 0 * SIZE(CO2, LDC, 2) #if defined(LN) || defined(LT) movsd %xmm4, 0 * SIZE(B) movsd %xmm5, 1 * SIZE(B) movsd %xmm6, 2 * SIZE(B) movsd %xmm7, 3 * SIZE(B) movsd %xmm4, 0 * SIZE(BO) movsd %xmm4, 1 * SIZE(BO) movsd %xmm5, 2 * SIZE(BO) movsd %xmm5, 3 * SIZE(BO) movsd %xmm6, 4 * SIZE(BO) movsd %xmm6, 5 * SIZE(BO) movsd %xmm7, 6 * SIZE(BO) movsd %xmm7, 7 * SIZE(BO) #else movsd %xmm4, 0 * SIZE(AO) movsd %xmm5, 1 * SIZE(AO) movsd %xmm6, 2 * SIZE(AO) movsd %xmm7, 3 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq 
(,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L20: testq $2, M je .L30 ALIGN_4 .L21: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movapd 16 * SIZE(BO), %xmm13 movapd 24 * SIZE(BO), %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm9, %xmm2 movapd 32 * SIZE(BO), %xmm9 addpd %xmm8, %xmm3 movapd 2 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movapd 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm1 movapd 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 mulpd 14 * SIZE(BO), %xmm8 addpd %xmm11, %xmm2 movapd 40 * SIZE(BO), %xmm11 addpd %xmm8, %xmm3 movapd 4 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 addpd %xmm13, %xmm0 movapd 18 * SIZE(BO), %xmm13 mulpd %xmm8, %xmm13 addpd %xmm13, %xmm1 movapd 20 * SIZE(BO), %xmm13 mulpd %xmm8, %xmm13 mulpd 22 * SIZE(BO), %xmm8 addpd %xmm13, %xmm2 movapd 48 * SIZE(BO), %xmm13 addpd %xmm8, %xmm3 movapd 6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm15 addpd %xmm15, %xmm0 movapd 26 * SIZE(BO), %xmm15 mulpd %xmm8, %xmm15 addpd %xmm15, %xmm1 movapd 28 * SIZE(BO), %xmm15 mulpd %xmm8, %xmm15 mulpd 30 * SIZE(BO), %xmm8 addpd %xmm15, %xmm2 movapd 56 * SIZE(BO), %xmm15 addpd %xmm8, %xmm3 movapd 16 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movapd 34 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm1 movapd 36 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 mulpd 38 * SIZE(BO), %xmm10 addpd %xmm9, %xmm2 movapd 64 * SIZE(BO), %xmm9 addpd %xmm10, %xmm3 movapd 10 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movapd 42 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movapd 44 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 mulpd 46 * SIZE(BO), %xmm10 addpd %xmm11, %xmm2 movapd 72 * SIZE(BO), %xmm11 addpd %xmm10, %xmm3 movapd 12 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm13 addpd %xmm13, %xmm0 movapd 50 * SIZE(BO), %xmm13 mulpd %xmm10, %xmm13 addpd %xmm13, %xmm1 movapd 52 * SIZE(BO), %xmm13 mulpd %xmm10, %xmm13 mulpd 54 * SIZE(BO), %xmm10 addpd %xmm13, %xmm2 movapd 80 * SIZE(BO), %xmm13 addpd %xmm10, %xmm3 movapd 14 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm15 addpd %xmm15, %xmm0 movapd 58 * SIZE(BO), %xmm15 mulpd %xmm10, %xmm15 addpd %xmm15, %xmm1 movapd 60 * SIZE(BO), %xmm15 mulpd %xmm10, %xmm15 mulpd 62 * SIZE(BO), %xmm10 addpd %xmm15, %xmm2 movapd 88 * SIZE(BO), %xmm15 addpd %xmm10, %xmm3 movapd 24 * SIZE(AO), %xmm10 addq $16 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L29 ALIGN_4 .L26: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * 
SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm9, %xmm2 movapd 8 * SIZE(BO), %xmm9 addpd %xmm8, %xmm3 movapd 2 * SIZE(AO), %xmm8 addq $2 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L26 ALIGN_4 .L29: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm2, %xmm10 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm10 movapd 0 * SIZE(B), %xmm1 movapd 2 * SIZE(B), %xmm3 movapd 4 * SIZE(B), %xmm5 movapd 6 * SIZE(B), %xmm7 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 subpd %xmm8, %xmm5 subpd %xmm10, %xmm7 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm10 movapd 4 * SIZE(AO), %xmm12 movapd 6 * SIZE(AO), %xmm14 subpd %xmm0, %xmm8 subpd %xmm1, %xmm10 subpd %xmm2, %xmm12 subpd %xmm3, %xmm14 #endif #ifdef LN movlpd 3 * SIZE(AO), %xmm0 movhpd 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 movlpd 2 * SIZE(AO), %xmm2 movhpd 2 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movlpd 2 * SIZE(AO), %xmm2 movhpd 2 * SIZE(AO), %xmm2 mulpd %xmm7, %xmm2 subpd %xmm2, %xmm3 movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 #endif #ifdef LT movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 movlpd 1 * SIZE(AO), %xmm2 movhpd 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movlpd 1 * SIZE(AO), %xmm2 movhpd 1 * SIZE(AO), %xmm2 mulpd %xmm3, %xmm2 subpd %xmm2, %xmm7 movlpd 3 * SIZE(AO), %xmm0 movhpd 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 movlpd 1 * SIZE(B), %xmm1 movhpd 1 * SIZE(B), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movlpd 2 * SIZE(B), %xmm2 movhpd 2 * SIZE(B), %xmm2 mulpd %xmm8, %xmm2 subpd %xmm2, %xmm12 movlpd 3 * SIZE(B), %xmm3 movhpd 3 * SIZE(B), %xmm3 mulpd %xmm8, %xmm3 subpd %xmm3, %xmm14 movlpd 5 * SIZE(B), %xmm0 movhpd 5 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 movlpd 6 * SIZE(B), %xmm1 movhpd 6 * SIZE(B), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm12 movlpd 7 * SIZE(B), %xmm2 movhpd 7 * SIZE(B), %xmm2 mulpd %xmm10, %xmm2 subpd %xmm2, %xmm14 movlpd 10 * SIZE(B), %xmm0 movhpd 10 * SIZE(B), %xmm0 mulpd %xmm0, %xmm12 movlpd 11 * SIZE(B), %xmm1 movhpd 11 * SIZE(B), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm14 movlpd 15 * SIZE(B), %xmm0 movhpd 15 * SIZE(B), %xmm0 mulpd %xmm0, %xmm14 #endif #ifdef RT movlpd 15 * SIZE(B), %xmm0 movhpd 15 * SIZE(B), %xmm0 mulpd %xmm0, %xmm14 movlpd 14 * SIZE(B), %xmm1 movhpd 14 * SIZE(B), %xmm1 mulpd %xmm14, %xmm1 subpd %xmm1, %xmm12 movlpd 13 * SIZE(B), %xmm2 movhpd 13 * SIZE(B), %xmm2 mulpd %xmm14, %xmm2 subpd %xmm2, %xmm10 movlpd 12 * SIZE(B), %xmm3 movhpd 12 * SIZE(B), %xmm3 mulpd %xmm14, %xmm3 subpd %xmm3, %xmm8 movlpd 10 * SIZE(B), %xmm0 movhpd 10 * SIZE(B), %xmm0 mulpd %xmm0, %xmm12 movlpd 9 * SIZE(B), %xmm1 movhpd 9 * SIZE(B), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm10 movlpd 8 * SIZE(B), %xmm2 movhpd 8 * SIZE(B), %xmm2 mulpd %xmm12, %xmm2 subpd %xmm2, %xmm8 movlpd 5 * SIZE(B), %xmm0 movhpd 5 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 movlpd 4 * SIZE(B), %xmm1 movhpd 4 * SIZE(B), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movlpd 0 * SIZE(B), %xmm0 movhpd 0 
* SIZE(B), %xmm0 mulpd %xmm0, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO1, LDC, 2) movsd %xmm7, 1 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd %xmm12, 0 * SIZE(CO1, LDC, 2) movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) movsd %xmm14, 0 * SIZE(CO2, LDC, 2) movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movapd %xmm5, 4 * SIZE(B) movapd %xmm7, 6 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) movlpd %xmm3, 4 * SIZE(BO) movlpd %xmm3, 5 * SIZE(BO) movhpd %xmm3, 6 * SIZE(BO) movhpd %xmm3, 7 * SIZE(BO) movlpd %xmm5, 8 * SIZE(BO) movlpd %xmm5, 9 * SIZE(BO) movhpd %xmm5, 10 * SIZE(BO) movhpd %xmm5, 11 * SIZE(BO) movlpd %xmm7, 12 * SIZE(BO) movlpd %xmm7, 13 * SIZE(BO) movhpd %xmm7, 14 * SIZE(BO) movhpd %xmm7, 15 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm10, 2 * SIZE(AO) movapd %xmm12, 4 * SIZE(AO) movapd %xmm14, 6 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: movq M, I sarq $2, I # i = (m >> 2) jle .L39 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movapd 0 * SIZE(BO), %xmm9 movapd 2 * SIZE(BO), %xmm11 movapd 4 * SIZE(BO), %xmm13 movapd 8 * SIZE(BO), %xmm15 movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 2 * SIZE(AO), %xmm10 pxor %xmm1, %xmm1 movapd 4 * SIZE(AO), %xmm12 pxor %xmm2, %xmm2 movapd 6 * SIZE(AO), %xmm14 pxor %xmm3, %xmm3 #ifdef LN PREFETCHW -4 * SIZE(CO1) pxor %xmm4, %xmm4 PREFETCHW -4 * SIZE(CO2) pxor %xmm5, %xmm5 PREFETCHW -4 * SIZE(CO1, LDC, 2) pxor %xmm6, %xmm6 PREFETCHW -4 * SIZE(CO2, LDC, 2) pxor %xmm7, %xmm7 #else PREFETCHW 4 * SIZE(CO1) pxor %xmm4, %xmm4 PREFETCHW 4 * SIZE(CO2) pxor %xmm5, %xmm5 PREFETCHW 4 * SIZE(CO1, LDC, 2) pxor %xmm6, %xmm6 PREFETCHW 4 * SIZE(CO2, LDC, 2) pxor %xmm7, %xmm7 #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-8, %rax salq $4, %rax je .L15 .L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) cmpq $64 * 2, %rax jle .L12 KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) KERNEL1(16 * 3) KERNEL2(16 * 3) KERNEL3(16 * 3) KERNEL4(16 * 3) KERNEL5(16 * 3) KERNEL6(16 * 3) KERNEL7(16 * 3) KERNEL8(16 * 3) cmpq $64 * 4, %rax jle .L12 KERNEL1(16 * 4) KERNEL2(16 * 4) KERNEL3(16 * 4) KERNEL4(16 * 4) 
KERNEL5(16 * 4) KERNEL6(16 * 4) KERNEL7(16 * 4) KERNEL8(16 * 4) KERNEL1(16 * 5) KERNEL2(16 * 5) KERNEL3(16 * 5) KERNEL4(16 * 5) KERNEL5(16 * 5) KERNEL6(16 * 5) KERNEL7(16 * 5) KERNEL8(16 * 5) cmpq $64 * 6, %rax jle .L12 KERNEL1(16 * 6) KERNEL2(16 * 6) KERNEL3(16 * 6) KERNEL4(16 * 6) KERNEL5(16 * 6) KERNEL6(16 * 6) KERNEL7(16 * 6) KERNEL8(16 * 6) KERNEL1(16 * 7) KERNEL2(16 * 7) KERNEL3(16 * 7) KERNEL4(16 * 7) KERNEL5(16 * 7) KERNEL6(16 * 7) KERNEL7(16 * 7) KERNEL8(16 * 7) addq $16 * 8 * SIZE, AO addq $32 * 8 * SIZE, BO subq $64 * 8, %rax jg .L1X .L12: leaq (AO, %rax, 2), AO # * 16 leaq (BO, %rax, 4), BO # * 64 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L19 ALIGN_4 .L16: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm9, %xmm2 movapd 0 * SIZE(BO), %xmm9 addpd %xmm8, %xmm3 movapd 4 * SIZE(AO), %xmm8 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm4 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm5 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 mulpd 6 * SIZE(BO), %xmm10 addpd %xmm9, %xmm6 movapd 8 * SIZE(BO), %xmm9 addpd %xmm10, %xmm7 movapd 6 * SIZE(AO), %xmm10 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L16 ALIGN_4 .L19: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm2, %xmm10 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm10 movapd %xmm4, %xmm12 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm12 movapd %xmm6, %xmm14 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm14 movapd 0 * SIZE(B), %xmm1 movapd 2 * SIZE(B), %xmm3 movapd 4 * SIZE(B), %xmm5 movapd 6 * SIZE(B), %xmm7 movapd 8 * SIZE(B), %xmm9 movapd 10 * SIZE(B), %xmm11 movapd 12 * SIZE(B), %xmm13 movapd 14 * SIZE(B), %xmm15 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 subpd %xmm8, %xmm5 subpd %xmm10, %xmm7 subpd %xmm4, %xmm9 subpd %xmm6, %xmm11 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm9 movapd 4 * SIZE(AO), %xmm10 movapd 6 * SIZE(AO), %xmm11 movapd 8 * SIZE(AO), %xmm12 movapd 10 * SIZE(AO), %xmm13 movapd 12 * SIZE(AO), %xmm14 movapd 14 * SIZE(AO), %xmm15 subpd %xmm0, %xmm8 subpd %xmm4, %xmm9 subpd %xmm1, %xmm10 subpd %xmm5, %xmm11 subpd %xmm2, %xmm12 subpd %xmm6, %xmm13 subpd %xmm3, %xmm14 subpd %xmm7, %xmm15 #endif #ifdef LN movlpd 15 * SIZE(AO), %xmm0 movhpd 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 mulpd %xmm0, %xmm15 movlpd 14 * SIZE(AO), %xmm2 movhpd 14 * SIZE(AO), %xmm2 mulpd %xmm13, %xmm2 subpd %xmm2, %xmm9 movlpd 14 * SIZE(AO), %xmm2 movhpd 14 * SIZE(AO), %xmm2 mulpd %xmm15, %xmm2 subpd %xmm2, %xmm11 movlpd 13 * SIZE(AO), %xmm4 movhpd 13 * SIZE(AO), %xmm4 mulpd %xmm13, %xmm4 subpd %xmm4, %xmm5 movlpd 13 * SIZE(AO), %xmm4 movhpd 13 * SIZE(AO), %xmm4 mulpd %xmm15, %xmm4 subpd %xmm4, %xmm7 movlpd 12 * SIZE(AO), %xmm6 movhpd 12 * SIZE(AO), %xmm6 mulpd %xmm13, %xmm6 subpd %xmm6, %xmm1 movlpd 12 * SIZE(AO), %xmm6 movhpd 12 * SIZE(AO), %xmm6 mulpd %xmm15, %xmm6 subpd %xmm6, %xmm3 movlpd 10 * SIZE(AO), %xmm0 movhpd 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 mulpd %xmm0, %xmm11 movlpd 9 * SIZE(AO), %xmm2 movhpd 9 * SIZE(AO), %xmm2 mulpd %xmm9, 
%xmm2 subpd %xmm2, %xmm5 movlpd 9 * SIZE(AO), %xmm2 movhpd 9 * SIZE(AO), %xmm2 mulpd %xmm11, %xmm2 subpd %xmm2, %xmm7 movlpd 8 * SIZE(AO), %xmm4 movhpd 8 * SIZE(AO), %xmm4 mulpd %xmm9, %xmm4 subpd %xmm4, %xmm1 movlpd 8 * SIZE(AO), %xmm4 movhpd 8 * SIZE(AO), %xmm4 mulpd %xmm11, %xmm4 subpd %xmm4, %xmm3 movlpd 5 * SIZE(AO), %xmm0 movhpd 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 movlpd 4 * SIZE(AO), %xmm2 movhpd 4 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movlpd 4 * SIZE(AO), %xmm2 movhpd 4 * SIZE(AO), %xmm2 mulpd %xmm7, %xmm2 subpd %xmm2, %xmm3 movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 #endif #ifdef LT movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 movlpd 1 * SIZE(AO), %xmm2 movhpd 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movlpd 1 * SIZE(AO), %xmm2 movhpd 1 * SIZE(AO), %xmm2 mulpd %xmm3, %xmm2 subpd %xmm2, %xmm7 movlpd 2 * SIZE(AO), %xmm4 movhpd 2 * SIZE(AO), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm9 movlpd 2 * SIZE(AO), %xmm4 movhpd 2 * SIZE(AO), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm11 movlpd 3 * SIZE(AO), %xmm6 movhpd 3 * SIZE(AO), %xmm6 mulpd %xmm1, %xmm6 subpd %xmm6, %xmm13 movlpd 3 * SIZE(AO), %xmm6 movhpd 3 * SIZE(AO), %xmm6 mulpd %xmm3, %xmm6 subpd %xmm6, %xmm15 movlpd 5 * SIZE(AO), %xmm0 movhpd 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 movlpd 6 * SIZE(AO), %xmm2 movhpd 6 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm9 movlpd 6 * SIZE(AO), %xmm2 movhpd 6 * SIZE(AO), %xmm2 mulpd %xmm7, %xmm2 subpd %xmm2, %xmm11 movlpd 7 * SIZE(AO), %xmm4 movhpd 7 * SIZE(AO), %xmm4 mulpd %xmm5, %xmm4 subpd %xmm4, %xmm13 movlpd 7 * SIZE(AO), %xmm4 movhpd 7 * SIZE(AO), %xmm4 mulpd %xmm7, %xmm4 subpd %xmm4, %xmm15 movlpd 10 * SIZE(AO), %xmm0 movhpd 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 mulpd %xmm0, %xmm11 movlpd 11 * SIZE(AO), %xmm2 movhpd 11 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm13 movlpd 11 * SIZE(AO), %xmm2 movhpd 11 * SIZE(AO), %xmm2 mulpd %xmm11, %xmm2 subpd %xmm2, %xmm15 movlpd 15 * SIZE(AO), %xmm0 movhpd 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 mulpd %xmm0, %xmm15 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 movlpd 1 * SIZE(B), %xmm1 movhpd 1 * SIZE(B), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movlpd 1 * SIZE(B), %xmm1 movhpd 1 * SIZE(B), %xmm1 mulpd %xmm9, %xmm1 subpd %xmm1, %xmm11 movlpd 2 * SIZE(B), %xmm2 movhpd 2 * SIZE(B), %xmm2 mulpd %xmm8, %xmm2 subpd %xmm2, %xmm12 movlpd 2 * SIZE(B), %xmm2 movhpd 2 * SIZE(B), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm13 movlpd 3 * SIZE(B), %xmm3 movhpd 3 * SIZE(B), %xmm3 mulpd %xmm8, %xmm3 subpd %xmm3, %xmm14 movlpd 3 * SIZE(B), %xmm3 movhpd 3 * SIZE(B), %xmm3 mulpd %xmm9, %xmm3 subpd %xmm3, %xmm15 movlpd 5 * SIZE(B), %xmm0 movhpd 5 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 movlpd 6 * SIZE(B), %xmm1 movhpd 6 * SIZE(B), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm12 movlpd 6 * SIZE(B), %xmm1 movhpd 6 * SIZE(B), %xmm1 mulpd %xmm11, %xmm1 subpd %xmm1, %xmm13 movlpd 7 * SIZE(B), %xmm2 movhpd 7 * SIZE(B), %xmm2 mulpd %xmm10, %xmm2 subpd %xmm2, %xmm14 movlpd 7 * SIZE(B), %xmm2 movhpd 7 * SIZE(B), %xmm2 mulpd %xmm11, %xmm2 subpd %xmm2, %xmm15 movlpd 10 * SIZE(B), %xmm0 movhpd 10 * SIZE(B), %xmm0 mulpd %xmm0, %xmm12 mulpd %xmm0, %xmm13 movlpd 11 * SIZE(B), %xmm1 movhpd 11 * SIZE(B), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm14 movlpd 11 * SIZE(B), %xmm1 movhpd 11 * SIZE(B), %xmm1 mulpd %xmm13, %xmm1 
subpd %xmm1, %xmm15 movlpd 15 * SIZE(B), %xmm0 movhpd 15 * SIZE(B), %xmm0 mulpd %xmm0, %xmm14 mulpd %xmm0, %xmm15 #endif #ifdef RT movlpd 15 * SIZE(B), %xmm0 movhpd 15 * SIZE(B), %xmm0 mulpd %xmm0, %xmm14 mulpd %xmm0, %xmm15 movlpd 14 * SIZE(B), %xmm1 movhpd 14 * SIZE(B), %xmm1 mulpd %xmm14, %xmm1 subpd %xmm1, %xmm12 movlpd 14 * SIZE(B), %xmm1 movhpd 14 * SIZE(B), %xmm1 mulpd %xmm15, %xmm1 subpd %xmm1, %xmm13 movlpd 13 * SIZE(B), %xmm2 movhpd 13 * SIZE(B), %xmm2 mulpd %xmm14, %xmm2 subpd %xmm2, %xmm10 movlpd 13 * SIZE(B), %xmm2 movhpd 13 * SIZE(B), %xmm2 mulpd %xmm15, %xmm2 subpd %xmm2, %xmm11 movlpd 12 * SIZE(B), %xmm3 movhpd 12 * SIZE(B), %xmm3 mulpd %xmm14, %xmm3 subpd %xmm3, %xmm8 movlpd 12 * SIZE(B), %xmm3 movhpd 12 * SIZE(B), %xmm3 mulpd %xmm15, %xmm3 subpd %xmm3, %xmm9 movlpd 10 * SIZE(B), %xmm0 movhpd 10 * SIZE(B), %xmm0 mulpd %xmm0, %xmm12 mulpd %xmm0, %xmm13 movlpd 9 * SIZE(B), %xmm1 movhpd 9 * SIZE(B), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm10 movlpd 9 * SIZE(B), %xmm1 movhpd 9 * SIZE(B), %xmm1 mulpd %xmm13, %xmm1 subpd %xmm1, %xmm11 movlpd 8 * SIZE(B), %xmm2 movhpd 8 * SIZE(B), %xmm2 mulpd %xmm12, %xmm2 subpd %xmm2, %xmm8 movlpd 8 * SIZE(B), %xmm2 movhpd 8 * SIZE(B), %xmm2 mulpd %xmm13, %xmm2 subpd %xmm2, %xmm9 movlpd 5 * SIZE(B), %xmm0 movhpd 5 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 movlpd 4 * SIZE(B), %xmm1 movhpd 4 * SIZE(B), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movlpd 4 * SIZE(B), %xmm1 movhpd 4 * SIZE(B), %xmm1 mulpd %xmm11, %xmm1 subpd %xmm1, %xmm9 movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movsd %xmm13, 3 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) movhpd %xmm9, 2 * SIZE(CO2) movhpd %xmm13, 3 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO1, LDC, 2) movsd %xmm7, 1 * SIZE(CO1, LDC, 2) movsd %xmm11, 2 * SIZE(CO1, LDC, 2) movsd %xmm15, 3 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) movhpd %xmm11, 2 * SIZE(CO2, LDC, 2) movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd %xmm11, 2 * SIZE(CO2) movhpd %xmm11, 3 * SIZE(CO2) movsd %xmm12, 0 * SIZE(CO1, LDC, 2) movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) movsd %xmm13, 2 * SIZE(CO1, LDC, 2) movhpd %xmm13, 3 * SIZE(CO1, LDC, 2) movsd %xmm14, 0 * SIZE(CO2, LDC, 2) movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) movsd %xmm15, 2 * SIZE(CO2, LDC, 2) movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movapd %xmm5, 4 * SIZE(B) movapd %xmm7, 6 * SIZE(B) movapd %xmm9, 8 * SIZE(B) movapd %xmm11, 10 * SIZE(B) movapd %xmm13, 12 * SIZE(B) movapd %xmm15, 14 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) movlpd %xmm3, 4 * SIZE(BO) movlpd %xmm3, 5 * SIZE(BO) movhpd %xmm3, 6 * SIZE(BO) movhpd %xmm3, 7 * SIZE(BO) movlpd %xmm5, 8 * SIZE(BO) movlpd %xmm5, 9 * SIZE(BO) movhpd %xmm5, 10 * SIZE(BO) movhpd %xmm5, 11 * SIZE(BO) movlpd %xmm7, 12 * SIZE(BO) movlpd %xmm7, 13 * SIZE(BO) movhpd %xmm7, 14 * SIZE(BO) movhpd %xmm7, 15 * SIZE(BO) movlpd %xmm9, 16 * SIZE(BO) movlpd %xmm9, 17 * SIZE(BO) movhpd %xmm9, 18 * SIZE(BO) movhpd %xmm9, 19 * SIZE(BO) movlpd %xmm11, 
20 * SIZE(BO) movlpd %xmm11, 21 * SIZE(BO) movhpd %xmm11, 22 * SIZE(BO) movhpd %xmm11, 23 * SIZE(BO) movlpd %xmm13, 24 * SIZE(BO) movlpd %xmm13, 25 * SIZE(BO) movhpd %xmm13, 26 * SIZE(BO) movhpd %xmm13, 27 * SIZE(BO) movlpd %xmm15, 28 * SIZE(BO) movlpd %xmm15, 29 * SIZE(BO) movhpd %xmm15, 30 * SIZE(BO) movhpd %xmm15, 31 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm9, 2 * SIZE(AO) movapd %xmm10, 4 * SIZE(AO) movapd %xmm11, 6 * SIZE(AO) movapd %xmm12, 8 * SIZE(AO) movapd %xmm13, 10 * SIZE(AO) movapd %xmm14, 12 * SIZE(AO) movapd %xmm15, 14 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $16 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L11 ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 4), B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif decq J # j -- jg .L01 ALIGN_4 .L40: testq $3, N je .L999 testq $2, N je .L80 ALIGN_4 .L41: /* Copying to Sub Buffer */ #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG leaq (, %rax, SIZE), %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L43 ALIGN_4 .L42: PREFETCH 56 * SIZE(B) movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 addq $ 8 * SIZE, B addq $16 * SIZE, BO movsd %xmm0, -16 * SIZE(BO) movsd %xmm0, -15 * SIZE(BO) movsd %xmm1, -14 * SIZE(BO) movsd %xmm1, -13 * SIZE(BO) movsd %xmm2, -12 * SIZE(BO) movsd %xmm2, -11 * SIZE(BO) movsd %xmm3, -10 * SIZE(BO) movsd %xmm3, -9 * SIZE(BO) movsd %xmm4, -8 * SIZE(BO) movsd %xmm4, -7 * SIZE(BO) movsd %xmm5, -6 * SIZE(BO) movsd %xmm5, -5 * SIZE(BO) movsd %xmm6, -4 * SIZE(BO) movsd %xmm6, -3 * SIZE(BO) movsd %xmm7, -2 * SIZE(BO) movsd %xmm7, -1 * SIZE(BO) decq %rax jne .L42 ALIGN_4 .L43: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L50 ALIGN_4 .L44: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd %xmm0, 0 * SIZE(BO) movsd %xmm0, 1 * SIZE(BO) movsd %xmm1, 2 * SIZE(BO) movsd %xmm1, 3 * SIZE(BO) addq $2 * SIZE, B addq $4 * SIZE, BO decq %rax jne .L44 ALIGN_4 .L50: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 2), C #endif testq $1, M je .L60 ALIGN_4 .L71: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movsd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movsd 4 * SIZE(AO), %xmm10 pxor 
%xmm2, %xmm2 movsd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movsd 16 * SIZE(BO), %xmm13 movsd 24 * SIZE(BO), %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulsd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulsd 2 * SIZE(BO), %xmm8 addsd %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 addsd %xmm8, %xmm1 movsd 1 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm9 mulsd 6 * SIZE(BO), %xmm8 addsd %xmm9, %xmm2 movsd 32 * SIZE(BO), %xmm9 addsd %xmm8, %xmm3 movsd 2 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm11 mulsd 10 * SIZE(BO), %xmm8 addsd %xmm11, %xmm0 movsd 12 * SIZE(BO), %xmm11 addsd %xmm8, %xmm1 movsd 3 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm11 mulsd 14 * SIZE(BO), %xmm8 addsd %xmm11, %xmm2 movsd 40 * SIZE(BO), %xmm11 addsd %xmm8, %xmm3 movsd 8 * SIZE(AO), %xmm8 mulsd %xmm10, %xmm13 mulsd 18 * SIZE(BO), %xmm10 addsd %xmm13, %xmm0 movsd 20 * SIZE(BO), %xmm13 addsd %xmm10, %xmm1 movsd 5 * SIZE(AO), %xmm10 mulsd %xmm10, %xmm13 mulsd 22 * SIZE(BO), %xmm10 addsd %xmm13, %xmm2 movsd 48 * SIZE(BO), %xmm13 addsd %xmm10, %xmm3 movsd 6 * SIZE(AO), %xmm10 mulsd %xmm10, %xmm15 mulsd 26 * SIZE(BO), %xmm10 addsd %xmm15, %xmm0 movsd 28 * SIZE(BO), %xmm15 addsd %xmm10, %xmm1 movsd 7 * SIZE(AO), %xmm10 mulsd %xmm10, %xmm15 mulsd 30 * SIZE(BO), %xmm10 addsd %xmm15, %xmm2 movsd 56 * SIZE(BO), %xmm15 addsd %xmm10, %xmm3 movsd 12 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulsd %xmm8, %xmm9 mulsd 2 * SIZE(BO), %xmm8 addsd %xmm9, %xmm0 addsd %xmm8, %xmm1 movsd 1 * SIZE(AO), %xmm8 movsd 4 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L76 ALIGN_4 .L78: addsd %xmm2, %xmm0 addsd %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm4 movsd 1 * SIZE(B), %xmm5 #else movsd 0 * SIZE(AO), %xmm4 movsd 1 * SIZE(AO), %xmm5 #endif subsd %xmm0, %xmm4 subsd %xmm1, %xmm5 #ifdef LN movsd 0 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 #endif #ifdef RN mulsd 0 * SIZE(B), %xmm4 movsd 1 * SIZE(B), %xmm1 mulsd %xmm4, %xmm1 subsd %xmm1, %xmm5 mulsd 3 * SIZE(B), %xmm5 #endif #ifdef RT mulsd 3 * SIZE(B), %xmm5 movlpd 2 * SIZE(B), %xmm1 mulsd %xmm5, %xmm1 subsd %xmm1, %xmm4 mulsd 0 * SIZE(B), %xmm4 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm4, 0 * SIZE(CO1) movsd %xmm5, 0 * SIZE(CO2) #if defined(LN) || defined(LT) movsd %xmm4, 0 * SIZE(B) movsd %xmm5, 1 * SIZE(B) movsd %xmm4, 0 * SIZE(BO) movsd %xmm4, 1 * SIZE(BO) movsd %xmm5, 2 * SIZE(BO) movsd %xmm5, 3 * SIZE(BO) #else movsd %xmm4, 0 * SIZE(AO) movsd %xmm5, 1 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L60: testq $2, M je .L70 ALIGN_4 .L61: #ifdef LN movq K, %rax salq $1 + 
BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movapd 16 * SIZE(BO), %xmm13 movapd 24 * SIZE(BO), %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd 2 * SIZE(BO), %xmm8 addpd %xmm9, %xmm0 movapd 4 * SIZE(BO), %xmm9 addpd %xmm8, %xmm1 movapd 2 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm9, %xmm2 movapd 32 * SIZE(BO), %xmm9 addpd %xmm8, %xmm3 movapd 4 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm11 mulpd 10 * SIZE(BO), %xmm8 addpd %xmm11, %xmm0 movapd 12 * SIZE(BO), %xmm11 addpd %xmm8, %xmm1 movapd 6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm11 mulpd 14 * SIZE(BO), %xmm8 addpd %xmm11, %xmm2 movapd 40 * SIZE(BO), %xmm11 addpd %xmm8, %xmm3 movapd 16 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm10, %xmm13 mulpd 18 * SIZE(BO), %xmm10 addpd %xmm13, %xmm0 movapd 20 * SIZE(BO), %xmm13 addpd %xmm10, %xmm1 movapd 10 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm13 mulpd 22 * SIZE(BO), %xmm10 addpd %xmm13, %xmm2 movapd 48 * SIZE(BO), %xmm13 addpd %xmm10, %xmm3 movapd 12 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm15 mulpd 26 * SIZE(BO), %xmm10 addpd %xmm15, %xmm0 movapd 28 * SIZE(BO), %xmm15 addpd %xmm10, %xmm1 movapd 14 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm15 mulpd 30 * SIZE(BO), %xmm10 addpd %xmm15, %xmm2 movapd 56 * SIZE(BO), %xmm15 addpd %xmm10, %xmm3 movapd 24 * SIZE(AO), %xmm10 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L69 ALIGN_4 .L66: mulpd %xmm8, %xmm9 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm9, %xmm0 movapd 4 * SIZE(BO), %xmm9 addpd %xmm8, %xmm1 movapd 2 * SIZE(AO), %xmm8 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L69: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd 0 * SIZE(B), %xmm1 movapd 2 * SIZE(B), %xmm5 subpd %xmm0, %xmm1 subpd %xmm8, %xmm5 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm10 subpd %xmm0, %xmm8 subpd %xmm1, %xmm10 #endif #ifdef LN movlpd 3 * SIZE(AO), %xmm0 movhpd 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 movlpd 2 * SIZE(AO), %xmm2 movhpd 2 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 #endif #ifdef LT movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 movlpd 1 * SIZE(AO), %xmm2 movhpd 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movlpd 3 * SIZE(AO), %xmm0 movhpd 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 movlpd 1 * SIZE(B), %xmm1 movhpd 1 * SIZE(B), %xmm1 mulpd %xmm8, %xmm1 subpd 
%xmm1, %xmm10 movlpd 3 * SIZE(B), %xmm0 movhpd 3 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 #endif #ifdef RT movlpd 3 * SIZE(B), %xmm0 movhpd 3 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 movlpd 2 * SIZE(B), %xmm1 movhpd 2 * SIZE(B), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movapd %xmm5, 2 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) movlpd %xmm5, 4 * SIZE(BO) movlpd %xmm5, 5 * SIZE(BO) movhpd %xmm5, 6 * SIZE(BO) movhpd %xmm5, 7 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm10, 2 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L70: movq M, I sarq $2, I # i = (m >> 2) jle .L79 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm4, %xmm4 movapd 8 * SIZE(BO), %xmm11 pxor %xmm5, %xmm5 movapd 16 * SIZE(AO), %xmm12 movapd 16 * SIZE(BO), %xmm13 movapd 24 * SIZE(AO), %xmm14 movapd 24 * SIZE(BO), %xmm15 #ifdef LN PREFETCHW -4 * SIZE(CO1) PREFETCHW -4 * SIZE(CO2) #else PREFETCHW 4 * SIZE(CO1) PREFETCHW 4 * SIZE(CO2) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L55 ALIGN_4 .L52: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd 2 * SIZE(BO), %xmm8 addpd %xmm9, %xmm0 movapd 0 * SIZE(BO), %xmm9 addpd %xmm8, %xmm1 movapd 2 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm9, %xmm4 movapd 4 * SIZE(BO), %xmm9 addpd %xmm8, %xmm5 movapd 4 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm9, %xmm0 movapd 4 * SIZE(BO), %xmm9 addpd %xmm8, %xmm1 movapd 6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm9, %xmm4 movapd 32 * SIZE(BO), %xmm9 addpd %xmm8, %xmm5 movapd 32 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm10, %xmm11 mulpd 10 * SIZE(BO), %xmm10 addpd %xmm11, %xmm0 movapd 8 * SIZE(BO), %xmm11 addpd %xmm10, %xmm1 movapd 10 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm11 mulpd 10 * SIZE(BO), %xmm10 addpd %xmm11, %xmm4 movapd 12 * SIZE(BO), %xmm11 addpd %xmm10, %xmm5 movapd 12 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm11 mulpd 14 * SIZE(BO), %xmm10 addpd %xmm11, %xmm0 movapd 12 * SIZE(BO), %xmm11 addpd %xmm10, %xmm1 movapd 14 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm11 mulpd 14 * SIZE(BO), %xmm10 addpd %xmm11, %xmm4 movapd 40 * SIZE(BO), %xmm11 addpd %xmm10, %xmm5 movapd 40 * SIZE(AO), %xmm10 PREFETCH 
(PREFETCHSIZE + 16) * SIZE(AO) mulpd %xmm12, %xmm13 mulpd 18 * SIZE(BO), %xmm12 addpd %xmm13, %xmm0 movapd 16 * SIZE(BO), %xmm13 addpd %xmm12, %xmm1 movapd 18 * SIZE(AO), %xmm12 mulpd %xmm12, %xmm13 mulpd 18 * SIZE(BO), %xmm12 addpd %xmm13, %xmm4 movapd 20 * SIZE(BO), %xmm13 addpd %xmm12, %xmm5 movapd 20 * SIZE(AO), %xmm12 mulpd %xmm12, %xmm13 mulpd 22 * SIZE(BO), %xmm12 addpd %xmm13, %xmm0 movapd 20 * SIZE(BO), %xmm13 addpd %xmm12, %xmm1 movapd 22 * SIZE(AO), %xmm12 mulpd %xmm12, %xmm13 mulpd 22 * SIZE(BO), %xmm12 addpd %xmm13, %xmm4 movapd 48 * SIZE(BO), %xmm13 addpd %xmm12, %xmm5 movapd 48 * SIZE(AO), %xmm12 PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) mulpd %xmm14, %xmm15 mulpd 26 * SIZE(BO), %xmm14 addpd %xmm15, %xmm0 movapd 24 * SIZE(BO), %xmm15 addpd %xmm14, %xmm1 movapd 26 * SIZE(AO), %xmm14 mulpd %xmm14, %xmm15 mulpd 26 * SIZE(BO), %xmm14 addpd %xmm15, %xmm4 movapd 28 * SIZE(BO), %xmm15 addpd %xmm14, %xmm5 movapd 28 * SIZE(AO), %xmm14 mulpd %xmm14, %xmm15 mulpd 30 * SIZE(BO), %xmm14 addpd %xmm15, %xmm0 movapd 28 * SIZE(BO), %xmm15 addpd %xmm14, %xmm1 movapd 30 * SIZE(AO), %xmm14 mulpd %xmm14, %xmm15 mulpd 30 * SIZE(BO), %xmm14 addpd %xmm15, %xmm4 movapd 56 * SIZE(BO), %xmm15 addpd %xmm14, %xmm5 movapd 56 * SIZE(AO), %xmm14 addq $32 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L59 ALIGN_4 .L56: movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 2 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm5 movapd 4 * SIZE(AO), %xmm8 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L56 ALIGN_4 .L59: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm4, %xmm12 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm12 movapd 0 * SIZE(B), %xmm1 movapd 2 * SIZE(B), %xmm5 movapd 4 * SIZE(B), %xmm9 movapd 6 * SIZE(B), %xmm13 subpd %xmm0, %xmm1 subpd %xmm8, %xmm5 subpd %xmm4, %xmm9 subpd %xmm12, %xmm13 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm9 movapd 4 * SIZE(AO), %xmm10 movapd 6 * SIZE(AO), %xmm11 subpd %xmm0, %xmm8 subpd %xmm4, %xmm9 subpd %xmm1, %xmm10 subpd %xmm5, %xmm11 #endif #ifdef LN movlpd 15 * SIZE(AO), %xmm0 movhpd 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 movlpd 14 * SIZE(AO), %xmm2 movhpd 14 * SIZE(AO), %xmm2 mulpd %xmm13, %xmm2 subpd %xmm2, %xmm9 movlpd 13 * SIZE(AO), %xmm4 movhpd 13 * SIZE(AO), %xmm4 mulpd %xmm13, %xmm4 subpd %xmm4, %xmm5 movlpd 12 * SIZE(AO), %xmm6 movhpd 12 * SIZE(AO), %xmm6 mulpd %xmm13, %xmm6 subpd %xmm6, %xmm1 movlpd 10 * SIZE(AO), %xmm0 movhpd 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 movlpd 9 * SIZE(AO), %xmm2 movhpd 9 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm5 movlpd 8 * SIZE(AO), %xmm4 movhpd 8 * SIZE(AO), %xmm4 mulpd %xmm9, %xmm4 subpd %xmm4, %xmm1 movlpd 5 * SIZE(AO), %xmm0 movhpd 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 movlpd 4 * SIZE(AO), %xmm2 movhpd 4 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 #endif #ifdef LT movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), 
%xmm0 mulpd %xmm0, %xmm1 movlpd 1 * SIZE(AO), %xmm2 movhpd 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movlpd 2 * SIZE(AO), %xmm4 movhpd 2 * SIZE(AO), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm9 movlpd 3 * SIZE(AO), %xmm6 movhpd 3 * SIZE(AO), %xmm6 mulpd %xmm1, %xmm6 subpd %xmm6, %xmm13 movlpd 5 * SIZE(AO), %xmm0 movhpd 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 movlpd 6 * SIZE(AO), %xmm2 movhpd 6 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm9 movlpd 7 * SIZE(AO), %xmm4 movhpd 7 * SIZE(AO), %xmm4 mulpd %xmm5, %xmm4 subpd %xmm4, %xmm13 movlpd 10 * SIZE(AO), %xmm0 movhpd 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 movlpd 11 * SIZE(AO), %xmm2 movhpd 11 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm13 movlpd 15 * SIZE(AO), %xmm0 movhpd 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 movlpd 1 * SIZE(B), %xmm1 movhpd 1 * SIZE(B), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movlpd 1 * SIZE(B), %xmm1 movhpd 1 * SIZE(B), %xmm1 mulpd %xmm9, %xmm1 subpd %xmm1, %xmm11 movlpd 3 * SIZE(B), %xmm0 movhpd 3 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 #endif #ifdef RT movlpd 3 * SIZE(B), %xmm0 movhpd 3 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 movlpd 2 * SIZE(B), %xmm1 movhpd 2 * SIZE(B), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movlpd 2 * SIZE(B), %xmm1 movhpd 2 * SIZE(B), %xmm1 mulpd %xmm11, %xmm1 subpd %xmm1, %xmm9 movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movsd %xmm13, 3 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) movhpd %xmm9, 2 * SIZE(CO2) movhpd %xmm13, 3 * SIZE(CO2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd %xmm11, 2 * SIZE(CO2) movhpd %xmm11, 3 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movapd %xmm5, 2 * SIZE(B) movapd %xmm9, 4 * SIZE(B) movapd %xmm13, 6 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) movlpd %xmm5, 4 * SIZE(BO) movlpd %xmm5, 5 * SIZE(BO) movhpd %xmm5, 6 * SIZE(BO) movhpd %xmm5, 7 * SIZE(BO) movlpd %xmm9, 8 * SIZE(BO) movlpd %xmm9, 9 * SIZE(BO) movhpd %xmm9, 10 * SIZE(BO) movhpd %xmm9, 11 * SIZE(BO) movlpd %xmm13, 12 * SIZE(BO) movlpd %xmm13, 13 * SIZE(BO) movhpd %xmm13, 14 * SIZE(BO) movhpd %xmm13, 15 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm9, 2 * SIZE(AO) movapd %xmm10, 4 * SIZE(AO) movapd %xmm11, 6 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L51 ALIGN_4 .L79: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 2), B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L80: testq $1, N je .L999 ALIGN_4 .L81: /* Copying to Sub Buffer */ #ifdef LN movq OFFSET, 
%rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG leaq (, %rax, SIZE), %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax jle .L83 ALIGN_4 .L82: PREFETCH 56 * SIZE(B) movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 addq $ 8 * SIZE, B addq $16 * SIZE, BO movsd %xmm0, -16 * SIZE(BO) movsd %xmm0, -15 * SIZE(BO) movsd %xmm1, -14 * SIZE(BO) movsd %xmm1, -13 * SIZE(BO) movsd %xmm2, -12 * SIZE(BO) movsd %xmm2, -11 * SIZE(BO) movsd %xmm3, -10 * SIZE(BO) movsd %xmm3, -9 * SIZE(BO) movsd %xmm4, -8 * SIZE(BO) movsd %xmm4, -7 * SIZE(BO) movsd %xmm5, -6 * SIZE(BO) movsd %xmm5, -5 * SIZE(BO) movsd %xmm6, -4 * SIZE(BO) movsd %xmm6, -3 * SIZE(BO) movsd %xmm7, -2 * SIZE(BO) movsd %xmm7, -1 * SIZE(BO) decq %rax jne .L82 ALIGN_4 .L83: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax BRANCH jle .L90 ALIGN_4 .L84: movsd 0 * SIZE(B), %xmm0 movsd %xmm0, 0 * SIZE(BO) movsd %xmm0, 1 * SIZE(BO) addq $1 * SIZE, B addq $2 * SIZE, BO decq %rax jne .L84 ALIGN_4 .L90: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT subq LDC, C #endif movq C, CO1 # coffset1 = c #ifndef RT addq LDC, C #endif testq $1, M je .L100 ALIGN_4 .L111: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movsd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movsd 4 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movsd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L115 ALIGN_4 .L112: mulsd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd 1 * SIZE(AO), %xmm8 addsd %xmm9, %xmm0 movsd 16 * SIZE(BO), %xmm9 mulsd 2 * SIZE(BO), %xmm8 addsd %xmm8, %xmm1 movsd 2 * SIZE(AO), %xmm8 mulsd 4 * SIZE(BO), %xmm8 addsd %xmm8, %xmm2 movsd 3 * SIZE(AO), %xmm8 mulsd 6 * SIZE(BO), %xmm8 addsd %xmm8, %xmm3 movsd 8 * SIZE(AO), %xmm8 mulsd %xmm10, %xmm11 movsd 5 * SIZE(AO), %xmm10 addsd %xmm11, %xmm0 movsd 24 * SIZE(BO), %xmm11 mulsd 10 * SIZE(BO), %xmm10 addsd %xmm10, %xmm1 movsd 6 * SIZE(AO), %xmm10 mulsd 12 * SIZE(BO), %xmm10 addsd %xmm10, %xmm2 movsd 7 * SIZE(AO), %xmm10 mulsd 14 * SIZE(BO), %xmm10 addsd %xmm10, %xmm3 movsd 12 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L112 ALIGN_4 .L115: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulsd %xmm8, %xmm9 movsd 1 * SIZE(AO), %xmm8 addsd %xmm9, %xmm0 movsd 2 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L116 ALIGN_4 .L118: addsd %xmm2, %xmm0 addsd %xmm3, %xmm1 addsd %xmm1, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, 
%rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm2 subsd %xmm0, %xmm2 #else movsd 0 * SIZE(AO), %xmm2 subsd %xmm0, %xmm2 #endif #ifdef LN movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 #endif #ifdef RN movsd 0 * SIZE(B), %xmm0 mulsd %xmm0, %xmm2 #endif #ifdef RT movsd 0 * SIZE(B), %xmm0 mulsd %xmm0, %xmm2 #endif #ifdef LN subq $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) #else movsd %xmm2, 0 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(B) movlpd %xmm2, 0 * SIZE(BO) movlpd %xmm2, 1 * SIZE(BO) #else movsd %xmm2, 0 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $1 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L100: testq $2, M je .L110 ALIGN_4 .L101: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L105 ALIGN_4 .L102: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd 2 * SIZE(AO), %xmm8 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm9, %xmm0 movapd 16 * SIZE(BO), %xmm9 addpd %xmm8, %xmm1 movapd 4 * SIZE(AO), %xmm8 mulpd 4 * SIZE(BO), %xmm8 addpd %xmm8, %xmm2 movapd 6 * SIZE(AO), %xmm8 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm8, %xmm3 movapd 16 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm10, %xmm11 movapd 10 * SIZE(AO), %xmm10 mulpd 10 * SIZE(BO), %xmm10 addpd %xmm11, %xmm0 movapd 24 * SIZE(BO), %xmm11 addpd %xmm10, %xmm1 movapd 12 * SIZE(AO), %xmm10 mulpd 12 * SIZE(BO), %xmm10 addpd %xmm10, %xmm2 movapd 14 * SIZE(AO), %xmm10 mulpd 14 * SIZE(BO), %xmm10 addpd %xmm10, %xmm3 movapd 24 * SIZE(AO), %xmm10 addq $16 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L102 ALIGN_4 .L105: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L109 ALIGN_4 .L106: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(AO), %xmm8 movapd 2 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L106 ALIGN_4 .L109: addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm2, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm2 subpd %xmm0, %xmm2 #else movapd 0 * SIZE(AO), %xmm2 subpd %xmm0, %xmm2 #endif #ifdef LN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movsd 3 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 movsd 2 * SIZE(AO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm0, %xmm2 #endif #ifdef LT movapd 
%xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 movsd 1 * SIZE(AO), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 3 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 unpcklpd %xmm0, %xmm2 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef RT movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) #else movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movlpd %xmm2, 0 * SIZE(BO) movlpd %xmm2, 1 * SIZE(BO) movhpd %xmm2, 2 * SIZE(BO) movhpd %xmm2, 3 * SIZE(BO) #else movapd %xmm2, 0 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L110: movq M, I sarq $2, I # i = (m >> 2) jle .L119 ALIGN_4 .L91: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movapd 16 * SIZE(AO), %xmm12 movapd 24 * SIZE(AO), %xmm14 #ifdef LN PREFETCHW -4 * SIZE(CO1) #else PREFETCHW 4 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L95 ALIGN_4 .L92: mulpd %xmm9, %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd 2 * SIZE(AO), %xmm9 addpd %xmm8, %xmm0 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm8 mulpd 6 * SIZE(AO), %xmm9 addpd %xmm8, %xmm2 movapd 32 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) addpd %xmm9, %xmm3 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm10 mulpd 10 * SIZE(AO), %xmm9 addpd %xmm10, %xmm0 movapd 12 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movapd 6 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm10 mulpd 14 * SIZE(AO), %xmm9 addpd %xmm10, %xmm2 movapd 40 * SIZE(AO), %xmm10 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addpd %xmm9, %xmm3 movapd 16 * SIZE(BO), %xmm9 mulpd %xmm11, %xmm12 mulpd 18 * SIZE(AO), %xmm11 addpd %xmm12, %xmm0 movapd 20 * SIZE(AO), %xmm12 addpd %xmm11, %xmm1 movapd 10 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm12 mulpd 22 * SIZE(AO), %xmm11 addpd %xmm12, %xmm2 movapd 48 * SIZE(AO), %xmm12 PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) addpd %xmm11, %xmm3 movapd 12 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm14 mulpd 26 * SIZE(AO), %xmm11 addpd %xmm14, %xmm0 movapd 28 * SIZE(AO), %xmm14 addpd %xmm11, %xmm1 movapd 14 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm14 mulpd 30 * SIZE(AO), %xmm11 addpd %xmm14, %xmm2 movapd 56 * SIZE(AO), %xmm14 addpd %xmm11, %xmm3 movapd 24 * SIZE(BO), %xmm11 addq $32 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L92 ALIGN_4 .L95: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L99 ALIGN_4 .L96: mulpd %xmm9, %xmm8 mulpd 2 * SIZE(AO), %xmm9 addpd %xmm8, %xmm0 movapd 4 * 
SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movapd 2 * SIZE(BO), %xmm9 addq $4 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L96 ALIGN_4 .L99: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm2 movapd 2 * SIZE(B), %xmm3 subpd %xmm0, %xmm2 subpd %xmm1, %xmm3 #else movapd 0 * SIZE(AO), %xmm2 movapd 2 * SIZE(AO), %xmm3 subpd %xmm0, %xmm2 subpd %xmm1, %xmm3 #endif #ifdef LN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd 15 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm1 movsd 14 * SIZE(AO), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm3 movsd 13 * SIZE(AO), %xmm6 mulsd %xmm1, %xmm6 subsd %xmm6, %xmm0 movsd 12 * SIZE(AO), %xmm7 mulsd %xmm1, %xmm7 subsd %xmm7, %xmm2 movsd 10 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm3 movsd 9 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm0 movsd 8 * SIZE(AO), %xmm6 mulsd %xmm3, %xmm6 subsd %xmm6, %xmm2 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 movsd 4 * SIZE(AO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef LT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 movsd 1 * SIZE(AO), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 2 * SIZE(AO), %xmm6 mulsd %xmm2, %xmm6 subsd %xmm6, %xmm3 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm2, %xmm7 subsd %xmm7, %xmm1 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm3 movsd 7 * SIZE(AO), %xmm6 mulsd %xmm0, %xmm6 subsd %xmm6, %xmm1 movsd 10 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm3 movsd 11 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm1 movsd 15 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm1 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 #endif #ifdef RT movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) movsd %xmm3, 2 * SIZE(CO1) movhpd %xmm3, 3 * SIZE(CO1) #else movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) movsd %xmm3, 2 * SIZE(CO1) movhpd %xmm3, 3 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movlpd %xmm2, 0 * SIZE(BO) movlpd %xmm2, 1 * SIZE(BO) movhpd %xmm2, 2 * SIZE(BO) movhpd %xmm2, 3 * SIZE(BO) movlpd %xmm3, 4 * SIZE(BO) movlpd %xmm3, 5 * SIZE(BO) movhpd %xmm3, 6 * SIZE(BO) movhpd %xmm3, 7 * SIZE(BO) #else movapd %xmm2, 0 * SIZE(AO) movapd %xmm3, 2 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L91 ALIGN_4 .L119: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 1), B #endif #ifdef RN addq $1, KK 
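/* Descriptive note (not in the original source): end of the N & 1 column tail. At .L119 the B pointer has been advanced past the packed single-column panel, and KK is adjusted for the right-side offset cases (+1 for RN here, -1 for RT just below) before execution falls through to the register-restore epilogue at .L999. */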
#endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq %rbx, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_LN_4x4_sse3.S000066400000000000000000002246431313527062700220730ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %r13 #define BO %r14 #define CO1 %r15 #define CO2 %rbx #define KK %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 272 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCH prefetcht1 #define PREFETCHSIZE (16 * 12 + 3) #define PREFETCH_R (4 * 4 + 0) #define KERNEL1(address) \ mulpd %xmm8, %xmm9 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ addpd %xmm9, %xmm0;\ movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm1;\ movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm2;\ movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ addpd %xmm9, %xmm3;\ movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL2(address) \ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm4;\ movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm5;\ movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm6;\ movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ addpd %xmm9, %xmm7;\ movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL3(address) \ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm0;\ movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm1;\ movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm2;\ movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ addpd %xmm9, %xmm3;\ movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL4(address) \ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm4;\ movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm5;\ movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm6;\ movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ addpd %xmm9, %xmm7;\ movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL5(address) \ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm0;\ movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm1;\ movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm2;\ movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ addpd %xmm11, %xmm3;\ movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL6(address) \ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm4;\ movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm5;\ movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, 
%xmm11;\ addpd %xmm11, %xmm6;\ movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ addpd %xmm11, %xmm7;\ movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL7(address) \ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm0;\ movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm1;\ movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm2;\ movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ addpd %xmm11, %xmm3;\ movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL8(address) \ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm4;\ movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm5;\ movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm6;\ movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ addpd %xmm11, %xmm7;\ movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL9(address) \ mulpd %xmm12, %xmm13;\ PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ addpd %xmm13, %xmm0;\ movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm1;\ movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm2;\ movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ addpd %xmm13, %xmm3;\ movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL10(address) \ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm4;\ movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm5;\ movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm6;\ movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ addpd %xmm13, %xmm7;\ movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL11(address) \ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm0;\ movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm1;\ movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm2;\ movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ addpd %xmm13, %xmm3;\ movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL12(address) \ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm4;\ movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm5;\ movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm6;\ movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ addpd %xmm13, %xmm7;\ movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL13(address) \ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm0;\ movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm1;\ movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm2;\ movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 26 * SIZE + 
(address) * 2 * SIZE(AO), %xmm14;\ addpd %xmm15, %xmm3;\ movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL14(address) \ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm4;\ movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm5;\ movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm6;\ movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ addpd %xmm15, %xmm7;\ movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL15(address) \ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm0;\ movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm1;\ movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm2;\ movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ addpd %xmm15, %xmm3;\ movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL16(address) \ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm4;\ movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm5;\ movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm6;\ movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ addpd %xmm15, %xmm7;\ movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #endif movq OLD_LDC, LDC movq OLD_OFFSET, KK movq KK, OFFSET leaq (, LDC, SIZE), LDC #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $2, J # j = (n >> 2) jle .L40 ALIGN_4 .L10: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 4), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 2, %rax leaq (B, %rax), BB #ifdef LT movq OFFSET, %rax movq %rax, KK #endif testq $1, M je .L30 ALIGN_4 .L31: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movddup 4 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L35 ALIGN_4 .L32: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE 
+ 0) * SIZE(AO) addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 1 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movapd 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movapd 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movddup 3 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movapd 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movapd 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movddup 8 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movapd 24 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movapd 18 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movddup 5 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movapd 20 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movapd 22 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movddup 6 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movapd 32 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movapd 26 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movddup 7 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movapd 28 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movapd 30 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movddup 12 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movapd 40 * SIZE(BO), %xmm11 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 1 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm2 movapd 2 * SIZE(BO), %xmm3 subpd %xmm0, %xmm2 subpd %xmm1, %xmm3 #else movapd 0 * SIZE(AO), %xmm2 movapd 2 * SIZE(AO), %xmm3 subpd %xmm0, %xmm2 subpd %xmm1, %xmm3 #endif #ifdef LN movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 #endif #ifdef RN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd 0 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm2 movsd 1 * SIZE(BO), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 2 * SIZE(BO), %xmm6 mulsd %xmm2, %xmm6 subsd %xmm6, %xmm3 movsd 3 * SIZE(BO), %xmm7 mulsd %xmm2, %xmm7 subsd %xmm7, %xmm1 movsd 5 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm0 movsd 6 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm3 movsd 7 * SIZE(BO), %xmm6 mulsd %xmm0, %xmm6 subsd %xmm6, %xmm1 movsd 10 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm3 movsd 11 * SIZE(BO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm1 movsd 15 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm1 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef RT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd 15 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm1 movsd 14 * SIZE(BO), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm3 movsd 13 * SIZE(BO), %xmm6 mulsd %xmm1, %xmm6 subsd %xmm6, %xmm0 movsd 12 * SIZE(BO), %xmm7 mulsd %xmm1, %xmm7 subsd %xmm7, %xmm2 movsd 10 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm3 movsd 9 * SIZE(BO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm0 movsd 8 * SIZE(BO), %xmm6 
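/* Descriptive note (not in the original source): RT solve for the 1-row tile against the packed 4x4 triangular factor in BO, back-substituting from the last diagonal entry (offset 15) down to offset 0; the diagonal entries are expected to be pre-inverted by the packing stage, so each step uses mulsd rather than a divide. */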
mulsd %xmm3, %xmm6 subsd %xmm6, %xmm2 movsd 5 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm0 movsd 4 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 0 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) #else movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(BO) movapd %xmm3, 2 * SIZE(BO) #else movapd %xmm2, 0 * SIZE(AO) movapd %xmm3, 2 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: testq $2, M BRANCH je .L20 ALIGN_4 .L21: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 6 * SIZE(AO), %xmm8 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 16 * SIZE(AO), %xmm8 addpd %xmm11, %xmm3 movddup 24 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movddup 17 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm1 movddup 18 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm2 movddup 19 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 10 * SIZE(AO), %xmm10 addpd %xmm9, %xmm3 movddup 20 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movddup 21 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm1 movddup 22 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm2 movddup 23 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 12 * SIZE(AO), %xmm10 addpd %xmm9, %xmm3 movddup 32 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 25 * 
SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movddup 26 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 27 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 28 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 29 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movddup 30 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 31 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 24 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 40 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L29 ALIGN_4 .L26: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L26 ALIGN_4 .L29: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm2, %xmm10 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm10 movapd 0 * SIZE(BO), %xmm1 movapd 2 * SIZE(BO), %xmm3 movapd 4 * SIZE(BO), %xmm5 movapd 6 * SIZE(BO), %xmm7 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 subpd %xmm8, %xmm5 subpd %xmm10, %xmm7 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm10 movapd 4 * SIZE(AO), %xmm12 movapd 6 * SIZE(AO), %xmm14 subpd %xmm0, %xmm8 subpd %xmm1, %xmm10 subpd %xmm2, %xmm12 subpd %xmm3, %xmm14 #endif #ifdef LN movddup 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 movddup 2 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movddup 2 * SIZE(AO), %xmm2 mulpd %xmm7, %xmm2 subpd %xmm2, %xmm3 movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 movddup 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movddup 1 * SIZE(AO), %xmm2 mulpd %xmm3, %xmm2 subpd %xmm2, %xmm7 movddup 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 movddup 1 * SIZE(BO), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movddup 2 * SIZE(BO), %xmm2 mulpd %xmm8, %xmm2 subpd %xmm2, %xmm12 movddup 3 * SIZE(BO), %xmm3 mulpd %xmm8, %xmm3 subpd %xmm3, %xmm14 movddup 5 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 movddup 6 * SIZE(BO), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm12 movddup 7 * SIZE(BO), %xmm2 mulpd %xmm10, %xmm2 subpd %xmm2, %xmm14 movddup 10 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm12 movddup 11 * SIZE(BO), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm14 movddup 15 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm14 #endif #ifdef RT movddup 15 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm14 movddup 14 * SIZE(BO), %xmm1 mulpd %xmm14, %xmm1 subpd %xmm1, %xmm12 movddup 13 * SIZE(BO), %xmm2 mulpd %xmm14, %xmm2 subpd %xmm2, %xmm10 movddup 12 * SIZE(BO), %xmm3 mulpd %xmm14, %xmm3 subpd %xmm3, %xmm8 movddup 10 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm12 movddup 9 * SIZE(BO), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm10 movddup 8 * SIZE(BO), %xmm2 mulpd %xmm12, %xmm2 subpd %xmm2, %xmm8 movddup 5 * 
SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 movddup 4 * SIZE(BO), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO1, LDC, 2) movsd %xmm7, 1 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd %xmm12, 0 * SIZE(CO1, LDC, 2) movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) movsd %xmm14, 0 * SIZE(CO2, LDC, 2) movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(BO) movapd %xmm3, 2 * SIZE(BO) movapd %xmm5, 4 * SIZE(BO) movapd %xmm7, 6 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm10, 2 * SIZE(AO) movapd %xmm12, 4 * SIZE(AO) movapd %xmm14, 6 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L20: movq M, I sarq $2, I # i = (m >> 2) jle .L39 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #else movq B, BO #endif prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movapd 16 * SIZE(AO), %xmm12 movddup 16 * SIZE(BO), %xmm13 movapd 24 * SIZE(AO), %xmm14 movddup 24 * SIZE(BO), %xmm15 #ifdef LN prefetchnta -4 * SIZE(CO1) pxor %xmm4, %xmm4 prefetchnta -4 * SIZE(CO2) pxor %xmm5, %xmm5 prefetchnta -4 * SIZE(CO1, LDC, 2) pxor %xmm6, %xmm6 prefetchnta -4 * SIZE(CO2, LDC, 2) pxor %xmm7, %xmm7 #else prefetchnta 4 * SIZE(CO1) pxor %xmm4, %xmm4 prefetchnta 4 * SIZE(CO2) pxor %xmm5, %xmm5 prefetchnta 4 * SIZE(CO1, LDC, 2) pxor %xmm6, %xmm6 prefetchnta 4 * SIZE(CO2, LDC, 2) pxor %xmm7, %xmm7 #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif #if 1 andq $-8, %rax salq $4, %rax je .L15 .L1X: KERNEL1 (16 * 0) KERNEL2 (16 * 0) KERNEL3 (16 * 0) KERNEL4 (16 * 0) KERNEL5 (16 * 0) KERNEL6 (16 * 0) KERNEL7 (16 * 0) KERNEL8 (16 * 0) KERNEL9 (16 * 0) KERNEL10(16 * 0) KERNEL11(16 * 0) KERNEL12(16 * 0) KERNEL13(16 * 0) KERNEL14(16 * 0) KERNEL15(16 * 0) KERNEL16(16 * 0) cmpq $128 * 1, %rax NOBRANCH jle .L12 KERNEL1 (16 * 1) KERNEL2 (16 * 1) KERNEL3 (16 * 1) KERNEL4 (16 * 1) KERNEL5 (16 * 1) KERNEL6 (16 * 1) KERNEL7 (16 * 1) KERNEL8 (16 * 1) KERNEL9 (16 * 1) KERNEL10(16 * 1) KERNEL11(16 * 1) KERNEL12(16 * 1) KERNEL13(16 * 1) KERNEL14(16 * 1) KERNEL15(16 * 1) KERNEL16(16 * 1) cmpq $128 * 2, %rax NOBRANCH jle .L12 KERNEL1 (16 * 2) KERNEL2 (16 * 2) KERNEL3 (16 * 2) KERNEL4 (16 * 2) KERNEL5 (16 * 2) KERNEL6 (16 * 2) KERNEL7 (16 * 2) KERNEL8 (16 * 2) KERNEL9 (16 * 2) KERNEL10(16 * 2) KERNEL11(16 * 2) KERNEL12(16 * 2) KERNEL13(16 * 2) KERNEL14(16 * 2) KERNEL15(16 * 2) KERNEL16(16 * 2) cmpq $128 * 3, %rax NOBRANCH jle .L12 KERNEL1 (16 * 3) KERNEL2 (16 * 3) KERNEL3 (16 * 3) KERNEL4 (16 * 3) 
KERNEL5 (16 * 3) KERNEL6 (16 * 3) KERNEL7 (16 * 3) KERNEL8 (16 * 3) KERNEL9 (16 * 3) KERNEL10(16 * 3) KERNEL11(16 * 3) KERNEL12(16 * 3) KERNEL13(16 * 3) KERNEL14(16 * 3) KERNEL15(16 * 3) KERNEL16(16 * 3) cmpq $128 * 4, %rax NOBRANCH jle .L12 KERNEL1 (16 * 4) KERNEL2 (16 * 4) KERNEL3 (16 * 4) KERNEL4 (16 * 4) KERNEL5 (16 * 4) KERNEL6 (16 * 4) KERNEL7 (16 * 4) KERNEL8 (16 * 4) KERNEL9 (16 * 4) KERNEL10(16 * 4) KERNEL11(16 * 4) KERNEL12(16 * 4) KERNEL13(16 * 4) KERNEL14(16 * 4) KERNEL15(16 * 4) KERNEL16(16 * 4) cmpq $128 * 5, %rax NOBRANCH jle .L12 KERNEL1 (16 * 5) KERNEL2 (16 * 5) KERNEL3 (16 * 5) KERNEL4 (16 * 5) KERNEL5 (16 * 5) KERNEL6 (16 * 5) KERNEL7 (16 * 5) KERNEL8 (16 * 5) KERNEL9 (16 * 5) KERNEL10(16 * 5) KERNEL11(16 * 5) KERNEL12(16 * 5) KERNEL13(16 * 5) KERNEL14(16 * 5) KERNEL15(16 * 5) KERNEL16(16 * 5) cmpq $128 * 6, %rax NOBRANCH jle .L12 KERNEL1 (16 * 6) KERNEL2 (16 * 6) KERNEL3 (16 * 6) KERNEL4 (16 * 6) KERNEL5 (16 * 6) KERNEL6 (16 * 6) KERNEL7 (16 * 6) KERNEL8 (16 * 6) KERNEL9 (16 * 6) KERNEL10(16 * 6) KERNEL11(16 * 6) KERNEL12(16 * 6) KERNEL13(16 * 6) KERNEL14(16 * 6) KERNEL15(16 * 6) KERNEL16(16 * 6) cmpq $128 * 7, %rax NOBRANCH jle .L12 KERNEL1 (16 * 7) KERNEL2 (16 * 7) KERNEL3 (16 * 7) KERNEL4 (16 * 7) KERNEL5 (16 * 7) KERNEL6 (16 * 7) KERNEL7 (16 * 7) KERNEL8 (16 * 7) KERNEL9 (16 * 7) KERNEL10(16 * 7) KERNEL11(16 * 7) KERNEL12(16 * 7) KERNEL13(16 * 7) KERNEL14(16 * 7) KERNEL15(16 * 7) KERNEL16(16 * 7) addq $32 * 8 * SIZE, AO addq $32 * 8 * SIZE, BO subq $128 * 8, %rax jg .L1X .L12: leaq (AO, %rax, 2), AO # * 16 leaq (BO, %rax, 2), BO # * 64 #else sarq $3, %rax je .L15 ALIGN_4 .L12: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm5 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm6 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm7 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm5 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm6 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 32 * SIZE(AO), %xmm8 addpd %xmm9, %xmm7 movddup 32 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 10 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 8 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm4 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm5 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm6 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 12 * SIZE(AO), %xmm10 addpd %xmm11, %xmm7 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd 
%xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm4 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm5 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm6 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 40 * SIZE(AO), %xmm10 addpd %xmm11, %xmm7 movddup 40 * SIZE(BO), %xmm11 mulpd %xmm12, %xmm13 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addpd %xmm13, %xmm0 movddup 17 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm1 movddup 18 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm2 movddup 19 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 movapd 18 * SIZE(AO), %xmm12 addpd %xmm13, %xmm3 movddup 16 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm4 movddup 17 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm5 movddup 18 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm6 movddup 19 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 movapd 20 * SIZE(AO), %xmm12 addpd %xmm13, %xmm7 movddup 20 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm0 movddup 21 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm1 movddup 22 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm2 movddup 23 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 movapd 22 * SIZE(AO), %xmm12 addpd %xmm13, %xmm3 movddup 20 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm4 movddup 21 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm5 movddup 22 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm6 movddup 23 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 movapd 48 * SIZE(AO), %xmm12 addpd %xmm13, %xmm7 movddup 48 * SIZE(BO), %xmm13 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm0 movddup 25 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm1 movddup 26 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm2 movddup 27 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 movapd 26 * SIZE(AO), %xmm14 addpd %xmm15, %xmm3 movddup 24 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm4 movddup 25 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm5 movddup 26 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm6 movddup 27 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 movapd 28 * SIZE(AO), %xmm14 addpd %xmm15, %xmm7 movddup 28 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm0 movddup 29 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm1 movddup 30 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm2 movddup 31 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 movapd 30 * SIZE(AO), %xmm14 addpd %xmm15, %xmm3 movddup 28 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm4 movddup 29 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm5 movddup 30 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm6 movddup 31 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 movapd 56 * SIZE(AO), %xmm14 addpd %xmm15, %xmm7 movddup 56 * SIZE(BO), %xmm15 addq $32 * SIZE, BO addq $32 * SIZE, AO decq %rax BRANCH jne .L12 #endif ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L19 ALIGN_4 .L16: mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm10 addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 0 * SIZE(BO), %xmm11 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 
mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm4 movddup 1 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm5 movddup 2 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm6 movddup 3 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm7 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L16 ALIGN_4 .L19: #if defined(LN) || defined(RT) movq KK, %rax subq $4, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm2, %xmm10 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm10 movapd %xmm4, %xmm12 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm12 movapd %xmm6, %xmm14 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm14 movapd 0 * SIZE(BO), %xmm1 movapd 2 * SIZE(BO), %xmm3 movapd 4 * SIZE(BO), %xmm5 movapd 6 * SIZE(BO), %xmm7 movapd 8 * SIZE(BO), %xmm9 movapd 10 * SIZE(BO), %xmm11 movapd 12 * SIZE(BO), %xmm13 movapd 14 * SIZE(BO), %xmm15 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 subpd %xmm8, %xmm5 subpd %xmm10, %xmm7 subpd %xmm4, %xmm9 subpd %xmm6, %xmm11 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm9 movapd 4 * SIZE(AO), %xmm10 movapd 6 * SIZE(AO), %xmm11 movapd 8 * SIZE(AO), %xmm12 movapd 10 * SIZE(AO), %xmm13 movapd 12 * SIZE(AO), %xmm14 movapd 14 * SIZE(AO), %xmm15 subpd %xmm0, %xmm8 subpd %xmm4, %xmm9 subpd %xmm1, %xmm10 subpd %xmm5, %xmm11 subpd %xmm2, %xmm12 subpd %xmm6, %xmm13 subpd %xmm3, %xmm14 subpd %xmm7, %xmm15 #endif #ifdef LN movddup 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 mulpd %xmm0, %xmm15 movddup 14 * SIZE(AO), %xmm2 mulpd %xmm13, %xmm2 subpd %xmm2, %xmm9 movddup 14 * SIZE(AO), %xmm2 mulpd %xmm15, %xmm2 subpd %xmm2, %xmm11 movddup 13 * SIZE(AO), %xmm4 mulpd %xmm13, %xmm4 subpd %xmm4, %xmm5 movddup 13 * SIZE(AO), %xmm4 mulpd %xmm15, %xmm4 subpd %xmm4, %xmm7 movddup 12 * SIZE(AO), %xmm6 mulpd %xmm13, %xmm6 subpd %xmm6, %xmm1 movddup 12 * SIZE(AO), %xmm6 mulpd %xmm15, %xmm6 subpd %xmm6, %xmm3 movddup 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 mulpd %xmm0, %xmm11 movddup 9 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm5 movddup 9 * SIZE(AO), %xmm2 mulpd %xmm11, %xmm2 subpd %xmm2, %xmm7 movddup 8 * SIZE(AO), %xmm4 mulpd %xmm9, %xmm4 subpd %xmm4, %xmm1 movddup 8 * SIZE(AO), %xmm4 mulpd %xmm11, %xmm4 subpd %xmm4, %xmm3 movddup 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 movddup 4 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movddup 4 * SIZE(AO), %xmm2 mulpd %xmm7, %xmm2 subpd %xmm2, %xmm3 movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 movddup 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movddup 1 * SIZE(AO), %xmm2 mulpd %xmm3, %xmm2 subpd %xmm2, %xmm7 movddup 2 * SIZE(AO), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm9 movddup 2 * SIZE(AO), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm11 movddup 3 * SIZE(AO), %xmm6 mulpd %xmm1, %xmm6 subpd %xmm6, %xmm13 movddup 3 * SIZE(AO), %xmm6 mulpd %xmm3, %xmm6 subpd %xmm6, %xmm15 movddup 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 movddup 6 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm9 movddup 6 * SIZE(AO), %xmm2 mulpd %xmm7, %xmm2 subpd %xmm2, %xmm11 movddup 7 * SIZE(AO), %xmm4 mulpd %xmm5, %xmm4 subpd %xmm4, %xmm13 movddup 7 * SIZE(AO), %xmm4 mulpd %xmm7, %xmm4 subpd 
%xmm4, %xmm15 movddup 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 mulpd %xmm0, %xmm11 movddup 11 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm13 movddup 11 * SIZE(AO), %xmm2 mulpd %xmm11, %xmm2 subpd %xmm2, %xmm15 movddup 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 mulpd %xmm0, %xmm15 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 movddup 1 * SIZE(BO), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movddup 1 * SIZE(BO), %xmm1 mulpd %xmm9, %xmm1 subpd %xmm1, %xmm11 movddup 2 * SIZE(BO), %xmm2 mulpd %xmm8, %xmm2 subpd %xmm2, %xmm12 movddup 2 * SIZE(BO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm13 movddup 3 * SIZE(BO), %xmm3 mulpd %xmm8, %xmm3 subpd %xmm3, %xmm14 movddup 3 * SIZE(BO), %xmm3 mulpd %xmm9, %xmm3 subpd %xmm3, %xmm15 movddup 5 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 movddup 6 * SIZE(BO), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm12 movddup 6 * SIZE(BO), %xmm1 mulpd %xmm11, %xmm1 subpd %xmm1, %xmm13 movddup 7 * SIZE(BO), %xmm2 mulpd %xmm10, %xmm2 subpd %xmm2, %xmm14 movddup 7 * SIZE(BO), %xmm2 mulpd %xmm11, %xmm2 subpd %xmm2, %xmm15 movddup 10 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm12 mulpd %xmm0, %xmm13 movddup 11 * SIZE(BO), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm14 movddup 11 * SIZE(BO), %xmm1 mulpd %xmm13, %xmm1 subpd %xmm1, %xmm15 movddup 15 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm14 mulpd %xmm0, %xmm15 #endif #ifdef RT movddup 15 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm14 mulpd %xmm0, %xmm15 movddup 14 * SIZE(BO), %xmm1 mulpd %xmm14, %xmm1 subpd %xmm1, %xmm12 movddup 14 * SIZE(BO), %xmm1 mulpd %xmm15, %xmm1 subpd %xmm1, %xmm13 movddup 13 * SIZE(BO), %xmm2 mulpd %xmm14, %xmm2 subpd %xmm2, %xmm10 movddup 13 * SIZE(BO), %xmm2 mulpd %xmm15, %xmm2 subpd %xmm2, %xmm11 movddup 12 * SIZE(BO), %xmm3 mulpd %xmm14, %xmm3 subpd %xmm3, %xmm8 movddup 12 * SIZE(BO), %xmm3 mulpd %xmm15, %xmm3 subpd %xmm3, %xmm9 movddup 10 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm12 mulpd %xmm0, %xmm13 movddup 9 * SIZE(BO), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm10 movddup 9 * SIZE(BO), %xmm1 mulpd %xmm13, %xmm1 subpd %xmm1, %xmm11 movddup 8 * SIZE(BO), %xmm2 mulpd %xmm12, %xmm2 subpd %xmm2, %xmm8 movddup 8 * SIZE(BO), %xmm2 mulpd %xmm13, %xmm2 subpd %xmm2, %xmm9 movddup 5 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 movddup 4 * SIZE(BO), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movddup 4 * SIZE(BO), %xmm1 mulpd %xmm11, %xmm1 subpd %xmm1, %xmm9 movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movsd %xmm13, 3 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) movhpd %xmm9, 2 * SIZE(CO2) movhpd %xmm13, 3 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO1, LDC, 2) movsd %xmm7, 1 * SIZE(CO1, LDC, 2) movsd %xmm11, 2 * SIZE(CO1, LDC, 2) movsd %xmm15, 3 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) movhpd %xmm11, 2 * SIZE(CO2, LDC, 2) movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd %xmm11, 2 * SIZE(CO2) movhpd %xmm11, 3 * SIZE(CO2) movsd %xmm12, 0 * SIZE(CO1, LDC, 2) movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) movsd %xmm13, 2 * SIZE(CO1, LDC, 2) movhpd %xmm13, 3 * SIZE(CO1, LDC, 2) movsd %xmm14, 0 * SIZE(CO2, LDC, 2) movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) movsd 
%xmm15, 2 * SIZE(CO2, LDC, 2) movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(BO) movapd %xmm3, 2 * SIZE(BO) movapd %xmm5, 4 * SIZE(BO) movapd %xmm7, 6 * SIZE(BO) movapd %xmm9, 8 * SIZE(BO) movapd %xmm11, 10 * SIZE(BO) movapd %xmm13, 12 * SIZE(BO) movapd %xmm15, 14 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm9, 2 * SIZE(AO) movapd %xmm10, 4 * SIZE(AO) movapd %xmm11, 6 * SIZE(AO) movapd %xmm12, 8 * SIZE(AO) movapd %xmm13, 10 * SIZE(AO) movapd %xmm14, 12 * SIZE(AO) movapd %xmm15, 14 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L11 ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif decq J # j -- jg .L10 ALIGN_4 .L40: testq $2, N je .L80 ALIGN_4 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 1, %rax leaq (B, %rax), BB #ifdef LT movq OFFSET, %rax movq %rax, KK #endif testq $1, M je .L70 ALIGN_4 .L71: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movddup 4 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movddup 1 * SIZE(AO), %xmm8 addpd %xmm9, %xmm0 mulpd 2 * SIZE(BO), %xmm8 movapd 16 * SIZE(BO), %xmm9 addpd %xmm8, %xmm1 movddup 2 * SIZE(AO), %xmm8 mulpd 4 * SIZE(BO), %xmm8 addpd %xmm8, %xmm2 movddup 3 * SIZE(AO), %xmm8 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm8, %xmm3 movddup 8 * SIZE(AO), %xmm8 mulpd %xmm10, %xmm11 movddup 5 * SIZE(AO), %xmm10 addpd %xmm11, %xmm0 mulpd 10 * SIZE(BO), %xmm10 movapd 24 * SIZE(BO), %xmm11 addpd %xmm10, %xmm1 movddup 6 * SIZE(AO), %xmm10 mulpd 12 * SIZE(BO), %xmm10 addpd %xmm10, %xmm2 movddup 7 * SIZE(AO), %xmm10 mulpd 14 * SIZE(BO), %xmm10 addpd %xmm10, %xmm3 movddup 12 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulpd %xmm8, %xmm9 movddup 1 * SIZE(AO), %xmm8 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L76 ALIGN_4 .L78: addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm2, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm2 subpd %xmm0, 
%xmm2 #else movapd 0 * SIZE(AO), %xmm2 subpd %xmm0, %xmm2 #endif #ifdef LN movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef RN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movsd 0 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm2 movsd 1 * SIZE(BO), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 3 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm0 unpcklpd %xmm0, %xmm2 #endif #ifdef RT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movsd 3 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm0 movsd 2 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 0 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm0, %xmm2 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO2) #else movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(BO) #else movapd %xmm2, 0 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L70: testq $2, M je .L60 ALIGN_4 .L61: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 16 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 10 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 12 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 24 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 24 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L69 ALIGN_4 .L66: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * 
SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L69: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd 0 * SIZE(BO), %xmm1 movapd 2 * SIZE(BO), %xmm5 subpd %xmm0, %xmm1 subpd %xmm8, %xmm5 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm10 subpd %xmm0, %xmm8 subpd %xmm1, %xmm10 #endif #ifdef LN movddup 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 movddup 2 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 movddup 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movddup 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 movddup 1 * SIZE(BO), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movddup 3 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 #endif #ifdef RT movddup 3 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 movddup 2 * SIZE(BO), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(BO) movapd %xmm5, 2 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm10, 2 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L60: movq M, I sarq $2, I # i = (m >> 2) jle .L79 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #else movq B, BO #endif prefetcht0 0 * SIZE(BB) subq $-4 * SIZE, BB movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm4, %xmm4 movddup 8 * SIZE(BO), %xmm11 pxor %xmm5, %xmm5 #ifdef LN prefetchnta -4 * SIZE(CO1) prefetchnta -4 * SIZE(CO2) #else prefetchnta 4 * SIZE(CO1) prefetchnta 4 * SIZE(CO2) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L55 ALIGN_4 .L52: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm5 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 movddup 3 * SIZE(BO), 
%xmm9 mulpd %xmm8, %xmm9 movapd 16 * SIZE(AO), %xmm8 addpd %xmm9, %xmm5 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 10 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm4 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 12 * SIZE(AO), %xmm10 addpd %xmm9, %xmm5 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 14 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm4 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 40 * SIZE(AO), %xmm10 addpd %xmm9, %xmm5 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addpd %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 18 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 8 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm4 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 20 * SIZE(AO), %xmm8 addpd %xmm11, %xmm5 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 22 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm4 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 24 * SIZE(AO), %xmm8 addpd %xmm11, %xmm5 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 26 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm4 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 28 * SIZE(AO), %xmm8 addpd %xmm11, %xmm5 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 30 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm4 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 32 * SIZE(AO), %xmm8 addpd %xmm11, %xmm5 movddup 24 * SIZE(BO), %xmm11 addq $32 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L59 ALIGN_4 .L56: mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm10 addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 0 * SIZE(BO), %xmm11 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 movapd 4 * SIZE(AO), %xmm8 addpd %xmm11, %xmm4 movddup 1 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm5 addq $4 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L56 ALIGN_4 .L59: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm4, %xmm12 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm12 movapd 0 * SIZE(BO), %xmm1 movapd 2 * SIZE(BO), %xmm5 movapd 4 * SIZE(BO), %xmm9 movapd 6 * SIZE(BO), %xmm13 subpd %xmm0, %xmm1 subpd %xmm8, %xmm5 subpd %xmm4, %xmm9 subpd %xmm12, %xmm13 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm9 movapd 4 * SIZE(AO), %xmm10 movapd 6 * SIZE(AO), %xmm11 subpd %xmm0, %xmm8 subpd %xmm4, %xmm9 subpd %xmm1, %xmm10 subpd %xmm5, %xmm11 #endif #ifdef LN movddup 15 * SIZE(AO), %xmm0 
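/* LN solve of this 4x2 sub-block: back-substitution over the packed 4x4 lower-triangular tile of A, proceeding from the last row upward. The diagonal entries are applied with mulpd rather than divpd; this assumes the usual trsm packing convention of storing the reciprocals of the diagonal (the copy routines are not shown in this section). */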
mulpd %xmm0, %xmm13 movddup 14 * SIZE(AO), %xmm2 mulpd %xmm13, %xmm2 subpd %xmm2, %xmm9 movddup 13 * SIZE(AO), %xmm4 mulpd %xmm13, %xmm4 subpd %xmm4, %xmm5 movddup 12 * SIZE(AO), %xmm6 mulpd %xmm13, %xmm6 subpd %xmm6, %xmm1 movddup 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 movddup 9 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm5 movddup 8 * SIZE(AO), %xmm4 mulpd %xmm9, %xmm4 subpd %xmm4, %xmm1 movddup 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 movddup 4 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 movddup 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movddup 2 * SIZE(AO), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm9 movddup 3 * SIZE(AO), %xmm6 mulpd %xmm1, %xmm6 subpd %xmm6, %xmm13 movddup 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 movddup 6 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm9 movddup 7 * SIZE(AO), %xmm4 mulpd %xmm5, %xmm4 subpd %xmm4, %xmm13 movddup 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 movddup 11 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm13 movddup 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 movddup 1 * SIZE(BO), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movddup 1 * SIZE(BO), %xmm1 mulpd %xmm9, %xmm1 subpd %xmm1, %xmm11 movddup 3 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 #endif #ifdef RT movddup 3 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 movddup 2 * SIZE(BO), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movddup 2 * SIZE(BO), %xmm1 mulpd %xmm11, %xmm1 subpd %xmm1, %xmm9 movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movsd %xmm13, 3 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) movhpd %xmm9, 2 * SIZE(CO2) movhpd %xmm13, 3 * SIZE(CO2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd %xmm11, 2 * SIZE(CO2) movhpd %xmm11, 3 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(BO) movapd %xmm5, 2 * SIZE(BO) movapd %xmm9, 4 * SIZE(BO) movapd %xmm13, 6 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm9, 2 * SIZE(AO) movapd %xmm10, 4 * SIZE(AO) movapd %xmm11, 6 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L51 ALIGN_4 .L79: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L80: testq $1, N je .L999 ALIGN_4 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif testq $1, M je .L110 ALIGN_4 .L111: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if 
defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movsd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movsd 4 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movsd 4 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L115 ALIGN_4 .L112: mulpd %xmm9, %xmm8 movapd 2 * SIZE(AO), %xmm9 addpd %xmm8, %xmm0 mulpd 2 * SIZE(BO), %xmm9 movapd 8 * SIZE(BO), %xmm8 addpd %xmm9, %xmm1 movapd 8 * SIZE(AO), %xmm9 mulpd %xmm11, %xmm10 movapd 6 * SIZE(AO), %xmm11 addpd %xmm10, %xmm0 mulpd 6 * SIZE(BO), %xmm11 movapd 12 * SIZE(BO), %xmm10 addpd %xmm11, %xmm1 movapd 12 * SIZE(AO), %xmm11 addq $8 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L112 ALIGN_4 .L115: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulsd 0 * SIZE(BO), %xmm9 addsd %xmm9, %xmm0 movsd 1 * SIZE(AO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $1 * SIZE, BO # boffset1 += 8 decq %rax jg .L116 ALIGN_4 .L118: addpd %xmm1, %xmm0 haddpd %xmm0, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm2 subsd %xmm0, %xmm2 #else movsd 0 * SIZE(AO), %xmm2 subsd %xmm0, %xmm2 #endif #ifdef LN movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 #endif #ifdef RN movsd 0 * SIZE(BO), %xmm0 mulsd %xmm0, %xmm2 #endif #ifdef RT movsd 0 * SIZE(BO), %xmm0 mulsd %xmm0, %xmm2 #endif #ifdef LN subq $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) #else movsd %xmm2, 0 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(BO) #else movsd %xmm2, 0 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L110: testq $2, M je .L100 ALIGN_4 .L101: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 4 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L105 ALIGN_4 .L102: mulpd %xmm9, %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movddup 1 * SIZE(BO), %xmm9 addpd %xmm8, %xmm0 mulpd 2 * SIZE(AO), %xmm9 movapd 16 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd 4 * SIZE(AO), %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd 6 * SIZE(AO), %xmm9 addpd %xmm9, %xmm3 movddup 8 * SIZE(BO), %xmm9 mulpd %xmm11, %xmm10 movddup 5 * SIZE(BO), %xmm11 addpd %xmm10, %xmm0 mulpd 10 * SIZE(AO), %xmm11 movapd 24 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movddup 6 * SIZE(BO), %xmm11 mulpd 12 * SIZE(AO), %xmm11 addpd %xmm11, %xmm2 movddup 7 * SIZE(BO), %xmm11 mulpd 14 * SIZE(AO), %xmm11 addpd 
%xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $ 8 * SIZE, BO decq %rax jne .L102 ALIGN_4 .L105: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L109 ALIGN_4 .L106: mulpd %xmm9, %xmm8 movddup 1 * SIZE(BO), %xmm9 addpd %xmm8, %xmm0 movapd 2 * SIZE(AO), %xmm8 addq $2 * SIZE, AO # aoffset += 4 addq $1 * SIZE, BO # boffset1 += 8 decq %rax jg .L106 ALIGN_4 .L109: addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm2, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm2 subpd %xmm0, %xmm2 #else movapd 0 * SIZE(AO), %xmm2 subpd %xmm0, %xmm2 #endif #ifdef LN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movsd 3 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 movsd 2 * SIZE(AO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm0, %xmm2 #endif #ifdef LT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 movsd 1 * SIZE(AO), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 3 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 unpcklpd %xmm0, %xmm2 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef RT movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) #else movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(BO) #else movapd %xmm2, 0 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L100: movq M, I sarq $2, I # i = (m >> 2) jle .L119 ALIGN_4 .L91: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 4 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #ifdef LN prefetchnta -4 * SIZE(CO1) #else prefetchnta 4 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L95 ALIGN_4 .L92: mulpd %xmm9, %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd 2 * SIZE(AO), %xmm9 addpd %xmm8, %xmm0 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm8 mulpd 6 * SIZE(AO), %xmm9 addpd %xmm8, %xmm2 movapd 16 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm10 mulpd 10 * SIZE(AO), %xmm9 addpd %xmm10, %xmm0 movapd 12 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm10 mulpd 14 * SIZE(AO), %xmm9 addpd %xmm10, %xmm2 movapd 24 * SIZE(AO), %xmm10 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addpd %xmm9, %xmm3 movddup 8 * SIZE(BO), %xmm9 mulpd %xmm11, %xmm8 mulpd 18 * SIZE(AO), %xmm11 addpd %xmm8, %xmm0 movapd 20 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 5 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm8 mulpd 22 * SIZE(AO), %xmm11 
addpd %xmm8, %xmm2 movapd 32 * SIZE(AO), %xmm8 addpd %xmm11, %xmm3 movddup 6 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm10 mulpd 26 * SIZE(AO), %xmm11 addpd %xmm10, %xmm0 movapd 28 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movddup 7 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm10 mulpd 30 * SIZE(AO), %xmm11 addpd %xmm10, %xmm2 movapd 40 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 addq $32 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L92 ALIGN_4 .L95: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L99 ALIGN_4 .L96: mulpd %xmm9, %xmm8 mulpd 2 * SIZE(AO), %xmm9 addpd %xmm8, %xmm0 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 1 * SIZE(BO), %xmm9 addq $4 * SIZE, AO # aoffset += 4 addq $1 * SIZE, BO # boffset1 += 8 decq %rax jg .L96 ALIGN_4 .L99: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm2 movapd 2 * SIZE(BO), %xmm3 subpd %xmm0, %xmm2 subpd %xmm1, %xmm3 #else movapd 0 * SIZE(AO), %xmm2 movapd 2 * SIZE(AO), %xmm3 subpd %xmm0, %xmm2 subpd %xmm1, %xmm3 #endif #ifdef LN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd 15 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm1 movsd 14 * SIZE(AO), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm3 movsd 13 * SIZE(AO), %xmm6 mulsd %xmm1, %xmm6 subsd %xmm6, %xmm0 movsd 12 * SIZE(AO), %xmm7 mulsd %xmm1, %xmm7 subsd %xmm7, %xmm2 movsd 10 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm3 movsd 9 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm0 movsd 8 * SIZE(AO), %xmm6 mulsd %xmm3, %xmm6 subsd %xmm6, %xmm2 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 movsd 4 * SIZE(AO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef LT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 movsd 1 * SIZE(AO), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 2 * SIZE(AO), %xmm6 mulsd %xmm2, %xmm6 subsd %xmm6, %xmm3 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm2, %xmm7 subsd %xmm7, %xmm1 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm3 movsd 7 * SIZE(AO), %xmm6 mulsd %xmm0, %xmm6 subsd %xmm6, %xmm1 movsd 10 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm3 movsd 11 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm1 movsd 15 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm1 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 #endif #ifdef RT movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) movsd %xmm3, 2 * SIZE(CO1) movhpd %xmm3, 3 * SIZE(CO1) #else movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) movsd %xmm3, 2 * SIZE(CO1) movhpd %xmm3, 3 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(BO) movapd %xmm3, 2 * SIZE(BO) #else movapd %xmm2, 0 * SIZE(AO) movapd %xmm3, 2 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $4, KK #endif #ifdef 
LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L91 ALIGN_4 .L119: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_2 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_LN_4x8_nehalem.S000066400000000000000000002537641313527062700226410ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define KK %rdx #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCHSIZE (16 * 1 + 4) #define PREFETCH prefetcht0 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif subq $-32 * SIZE, A subq $-32 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K movq OLD_LDC, LDC movq OLD_OFFSET, KK leaq (, LDC, SIZE), LDC movq KK, OFFSET negq KK #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $3, J NOBRANCH jle .L40 ALIGN_4 .L10: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $3 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 8), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 4), CO2 #ifndef RT leaq (C, LDC, 8), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 3, %rax leaq (B, %rax), BB testq $1, M BRANCH jle .L20 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 8), BO #else movq B, BO #endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 xorps %xmm8, %xmm8 xorps %xmm12, %xmm12 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_3 .L32: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -28 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 pshufd $0x55, %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movaps -24 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -20 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -16 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -12 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 pshufd $0x55, %xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movaps -8 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 
movaps -4 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 subq $-32 * SIZE, BO subq $ -4 * SIZE, AO subq $1, %rax BRANCH jg .L32 ALIGN_3 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -28 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 addq $1 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_3 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $8, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 8), BO #endif addps %xmm2, %xmm8 addps %xmm3, %xmm12 #if defined(LN) || defined(LT) movaps -32 * SIZE(BO), %xmm0 movaps -28 * SIZE(BO), %xmm4 subps %xmm8, %xmm0 subps %xmm12, %xmm4 #else movsd -32 * SIZE(AO), %xmm0 movhps -30 * SIZE(AO), %xmm0 movsd -28 * SIZE(AO), %xmm4 movhps -26 * SIZE(AO), %xmm4 subps %xmm8, %xmm0 subps %xmm12, %xmm4 pshufd $0xff, %xmm0, %xmm3 pshufd $0xaa, %xmm0, %xmm2 pshufd $0x55, %xmm0, %xmm1 pshufd $0x00, %xmm0, %xmm0 pshufd $0xff, %xmm4, %xmm7 pshufd $0xaa, %xmm4, %xmm6 pshufd $0x55, %xmm4, %xmm5 pshufd $0x00, %xmm4, %xmm4 #endif #if defined(LN) || defined(LT) movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 mulps %xmm15, %xmm4 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm3 movaps -28 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm7 movaps -24 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm3 movaps -20 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm7 movaps -16 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm3 movaps -12 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm7 movaps -8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm3 movaps -4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm7 movaps 4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulss %xmm4, %xmm15 subss %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm4, %xmm15 subss %xmm15, 
%xmm6 pshufd $0xff, %xmm8, %xmm15 mulss %xmm4, %xmm15 subss %xmm15, %xmm7 movaps 12 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm5, %xmm15 subss %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulss %xmm5, %xmm15 subss %xmm15, %xmm7 movaps 20 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulss %xmm6, %xmm15 subss %xmm15, %xmm7 movaps 28 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm7 #endif #ifdef RT movaps 28 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm7 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm7, %xmm15 subss %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulss %xmm7, %xmm15 subss %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulss %xmm7, %xmm15 subss %xmm15, %xmm4 movaps 24 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm7, %xmm15 subss %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm7, %xmm15 subss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm7, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm7, %xmm15 subss %xmm15, %xmm0 movaps 20 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulss %xmm6, %xmm15 subss %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulss %xmm6, %xmm15 subss %xmm15, %xmm4 movaps 16 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm6, %xmm15 subss %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm6, %xmm15 subss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm6, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm6, %xmm15 subss %xmm15, %xmm0 movaps 12 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulss %xmm5, %xmm15 subss %xmm15, %xmm4 movaps 8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm5, %xmm15 subss %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm5, %xmm15 subss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm5, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm5, %xmm15 subss %xmm15, %xmm0 movaps 4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm4 movaps 0 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm4, %xmm15 subss %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm4, %xmm15 subss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm4, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm4, %xmm15 subss %xmm15, %xmm0 movaps -8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm0 movaps -16 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm0 movaps -24 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm0 movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif leaq (LDC, LDC, 2), %rax #if defined(LN) || defined(LT) movaps %xmm0, -32 * SIZE(BO) movaps %xmm4, -28 * SIZE(BO) pshufd $0xff, %xmm0, %xmm3 pshufd $0xaa, %xmm0, %xmm2 pshufd $0x55, %xmm0, %xmm1 pshufd $0x00, %xmm0, %xmm0 pshufd $0xff, %xmm4, %xmm7 pshufd $0xaa, %xmm4, %xmm6 pshufd $0x55, %xmm4, %xmm5 pshufd $0x00, %xmm4, %xmm4 #else 
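/* RN/RT path for this 1x8 remainder row: interleave the eight solved scalars into pairs and write them back to the packed A buffer at AO, then (after the #endif) store each scalar to its column of C through CO1/CO2 with LDC strides. */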
unpcklps %xmm1, %xmm0 unpcklps %xmm3, %xmm2 unpcklps %xmm5, %xmm4 unpcklps %xmm7, %xmm6 movlps %xmm0, -32 * SIZE(AO) movlps %xmm2, -30 * SIZE(AO) movlps %xmm4, -28 * SIZE(AO) movlps %xmm6, -26 * SIZE(AO) #endif movss %xmm0, (CO1) movss %xmm1, (CO1, LDC, 1) movss %xmm2, (CO1, LDC, 2) movss %xmm3, (CO1, %rax, 1) movss %xmm4, (CO2) movss %xmm5, (CO2, LDC, 1) movss %xmm6, (CO2, LDC, 2) movss %xmm7, (CO2, %rax, 1) #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L20: testq $2, M BRANCH jle .L30 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 8), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movaps -32 * SIZE(BO), %xmm5 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_3 .L22: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -28 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps -24 * SIZE(BO), %xmm5 movddup -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -20 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps -16 * SIZE(BO), %xmm5 movddup -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -12 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps -8 * SIZE(BO), %xmm5 movddup -26 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -4 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps 0 * SIZE(BO), %xmm5 movddup -24 * SIZE(AO), %xmm0 subq $-32 * SIZE, BO subq $ -8 * SIZE, AO subq $1, %rax BRANCH jg .L22 ALIGN_3 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -28 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps -24 * SIZE(BO), %xmm5 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_3 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $8, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, 
%rax, 2), AO leaq (B, %rax, 8), BO #endif addps %xmm1, %xmm8 addps %xmm2, %xmm9 addps %xmm3, %xmm10 addps %xmm4, %xmm11 #if defined(LN) || defined(LT) movaps %xmm8, %xmm4 shufps $0x88, %xmm9, %xmm8 shufps $0xdd, %xmm9, %xmm4 movaps %xmm10, %xmm5 shufps $0x88, %xmm11, %xmm10 shufps $0xdd, %xmm11, %xmm5 movaps -32 * SIZE(BO), %xmm0 movaps -28 * SIZE(BO), %xmm2 movaps -24 * SIZE(BO), %xmm1 movaps -20 * SIZE(BO), %xmm3 subps %xmm8, %xmm0 subps %xmm4, %xmm1 subps %xmm10, %xmm2 subps %xmm5, %xmm3 #else movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm2 movaps -24 * SIZE(AO), %xmm4 movaps -20 * SIZE(AO), %xmm6 subps %xmm8, %xmm0 subps %xmm9, %xmm2 subps %xmm10, %xmm4 subps %xmm11, %xmm6 movhlps %xmm0, %xmm1 movhlps %xmm2, %xmm3 movhlps %xmm4, %xmm5 movhlps %xmm6, %xmm7 #endif #ifdef LN movaps -32 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 mulps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 mulps %xmm15, %xmm2 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 mulps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 mulps %xmm15, %xmm3 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm3 movaps -28 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm7 movaps -24 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm3 movaps -20 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm7 movaps -16 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 movaps -12 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm7 movaps -8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 movaps -4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm7 movaps 4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps 
%xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm7 movaps 12 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm7 movaps 20 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm7 movaps 28 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm7 #endif #ifdef RT movaps 28 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm7 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm4 movaps 24 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm0 movaps 20 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm4 movaps 16 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm0 movaps 12 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm4 movaps 8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm0 movaps 4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm4 movaps 0 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm0 movaps -8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm0 movaps -16 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 movaps -24 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif leaq (LDC, LDC, 2), %rax #if defined(LN) || defined(LT) movaps %xmm0, -32 * SIZE(BO) movaps %xmm2, -28 * SIZE(BO) movaps %xmm1, -24 * SIZE(BO) movaps %xmm3, -20 * SIZE(BO) movaps %xmm0, %xmm4 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm4 movaps %xmm2, %xmm5 unpcklps %xmm3, %xmm2 unpckhps %xmm3, %xmm5 movsd %xmm0, (CO1) movhps %xmm0, 
(CO1, LDC, 1) movsd %xmm4, (CO1, LDC, 2) movhps %xmm4, (CO1, %rax, 1) movsd %xmm2, (CO2) movhps %xmm2, (CO2, LDC, 1) movsd %xmm5, (CO2, LDC, 2) movhps %xmm5, (CO2, %rax, 1) #else movlhps %xmm1, %xmm0 movlhps %xmm3, %xmm2 movlhps %xmm5, %xmm4 movlhps %xmm7, %xmm6 movaps %xmm0, -32 * SIZE(AO) movaps %xmm2, -28 * SIZE(AO) movaps %xmm4, -24 * SIZE(AO) movaps %xmm6, -20 * SIZE(AO) movsd %xmm0, (CO1) movsd %xmm1, (CO1, LDC, 1) movsd %xmm2, (CO1, LDC, 2) movsd %xmm3, (CO1, %rax, 1) movsd %xmm4, (CO2) movsd %xmm5, (CO2, LDC, 1) movsd %xmm6, (CO2, LDC, 2) movsd %xmm7, (CO2, %rax, 1) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: movq M, I sarq $2, I NOBRANCH jle .L39 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 8), BO #else movq B, BO #endif prefetchnta -32 * SIZE(BB) subq $-16 * SIZE, BB xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 leaq (LDC, LDC, 2), %rax xorps %xmm8, %xmm8 prefetcht2 -4 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht2 -4 * SIZE(CO1, LDC, 1) xorps %xmm10, %xmm10 prefetcht2 -4 * SIZE(CO1, LDC, 2) xorps %xmm11, %xmm11 prefetcht2 -4 * SIZE(CO1, %rax, 1) xorps %xmm12, %xmm12 prefetcht2 -4 * SIZE(CO2) xorps %xmm13, %xmm13 prefetcht2 -4 * SIZE(CO2, LDC, 1) xorps %xmm14, %xmm14 prefetcht2 -4 * SIZE(CO2, LDC, 2) xorps %xmm15, %xmm15 prefetcht2 -4 * SIZE(CO2, %rax, 1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm12 movaps -32 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm0, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm12 movaps -24 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm0, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 addps %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 addps %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm0, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 addps %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm5, %xmm10 addps 
%xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm0 addps %xmm1, %xmm12 movaps -8 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm0, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 addps %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 subq $-32 * SIZE, BO pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addps %xmm1, %xmm12 movaps -32 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm0, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $8, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 8), BO #endif addps %xmm1, %xmm12 addps %xmm2, %xmm13 addps %xmm3, %xmm14 addps %xmm4, %xmm15 #if defined(LN) || defined(LT) movaps %xmm8, %xmm4 shufps $0x88, %xmm9, %xmm8 movaps %xmm10, %xmm5 shufps $0x88, %xmm11, %xmm10 shufps $0xdd, %xmm11, %xmm4 shufps $0xdd, %xmm9, %xmm5 movaps %xmm8, %xmm6 shufps $0x88, %xmm10, %xmm8 shufps $0xdd, %xmm6, %xmm10 movaps %xmm4, %xmm9 movaps %xmm5, %xmm11 shufps $0x22, %xmm5, %xmm9 shufps $0x77, %xmm4, %xmm11 movaps %xmm12, %xmm4 shufps $0x88, %xmm13, %xmm12 movaps %xmm14, %xmm5 shufps $0x88, %xmm15, %xmm14 shufps $0xdd, %xmm15, %xmm4 shufps $0xdd, %xmm13, %xmm5 movaps %xmm12, %xmm6 shufps $0x88, %xmm14, %xmm12 shufps $0xdd, %xmm6, %xmm14 movaps %xmm4, %xmm13 movaps %xmm5, %xmm15 shufps $0x22, %xmm5, %xmm13 shufps $0x77, %xmm4, %xmm15 movaps -32 * SIZE(BO), %xmm0 movaps -28 * SIZE(BO), %xmm4 movaps -24 * SIZE(BO), %xmm1 movaps -20 * SIZE(BO), %xmm5 movaps -16 * SIZE(BO), %xmm2 movaps -12 * SIZE(BO), %xmm6 movaps -8 * SIZE(BO), %xmm3 movaps -4 * SIZE(BO), %xmm7 #else movaps %xmm9, %xmm4 shufps $0xd8, %xmm8, %xmm9 shufps $0xd8, %xmm11, %xmm8 shufps $0xd8, %xmm10, %xmm11 shufps $0xd8, %xmm4, %xmm10 movaps %xmm8, %xmm4 shufps $0xd8, %xmm10, %xmm8 shufps $0xd8, %xmm4, %xmm10 movaps %xmm9, %xmm5 shufps $0xd8, %xmm11, %xmm9 shufps $0xd8, %xmm5, %xmm11 movaps %xmm13, %xmm4 shufps $0xd8, %xmm12, %xmm13 shufps $0xd8, %xmm15, %xmm12 shufps $0xd8, %xmm14, %xmm15 shufps $0xd8, %xmm4, %xmm14 movaps %xmm12, %xmm4 shufps $0xd8, %xmm14, %xmm12 shufps $0xd8, %xmm4, %xmm14 movaps %xmm13, %xmm5 shufps $0xd8, %xmm15, %xmm13 shufps $0xd8, %xmm5, %xmm15 movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps -24 * SIZE(AO), %xmm2 movaps -20 * SIZE(AO), %xmm3 movaps -16 * SIZE(AO), %xmm4 movaps -12 * SIZE(AO), %xmm5 movaps -8 * SIZE(AO), %xmm6 movaps -4 * SIZE(AO), %xmm7 #endif subps %xmm8, %xmm0 
subps %xmm9, %xmm1 subps %xmm10, %xmm2 subps %xmm11, %xmm3 subps %xmm12, %xmm4 subps %xmm13, %xmm5 subps %xmm14, %xmm6 subps %xmm15, %xmm7 #ifdef LN movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 mulps %xmm15, %xmm7 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm1 pshufd $0x55, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm4 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 mulps %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm1 pshufd $0x55, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm4 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 mulps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm4 movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 mulps %xmm15, %xmm4 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 mulps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0x55, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm3 pshufd $0xff, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm7 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 mulps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm2 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm3 pshufd $0xff, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm7 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 mulps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 pshufd $0xff, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm7 movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 mulps %xmm15, %xmm7 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm3 movaps -28 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm7 movaps -24 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm3 movaps -20 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm5 
pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm7 movaps -16 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 movaps -12 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm7 movaps -8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 movaps -4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm7 movaps 4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm7 movaps 12 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm7 movaps 20 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm7 movaps 28 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm7 #endif #ifdef RT movaps 28 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm7 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm4 movaps 24 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm0 movaps 20 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm4 movaps 16 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm0 movaps 12 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm4 movaps 8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm0 movaps 4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm4 movaps 0 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 
mulps %xmm4, %xmm15 subps %xmm15, %xmm0 movaps -8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm0 movaps -16 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 movaps -24 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm0, -32 * SIZE(BO) movaps %xmm4, -28 * SIZE(BO) movaps %xmm1, -24 * SIZE(BO) movaps %xmm5, -20 * SIZE(BO) movaps %xmm2, -16 * SIZE(BO) movaps %xmm6, -12 * SIZE(BO) movaps %xmm3, -8 * SIZE(BO) movaps %xmm7, -4 * SIZE(BO) movaps %xmm0, %xmm8 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm8, %xmm1 movaps %xmm2, %xmm9 shufps $0x88, %xmm3, %xmm2 shufps $0xdd, %xmm9, %xmm3 movaps %xmm0, %xmm8 shufps $0x88, %xmm2, %xmm0 movaps %xmm1, %xmm9 shufps $0x22, %xmm3, %xmm1 shufps $0xdd, %xmm2, %xmm8 movaps %xmm8, %xmm2 shufps $0x77, %xmm3, %xmm9 movaps %xmm9, %xmm3 movaps %xmm4, %xmm8 shufps $0x88, %xmm5, %xmm4 shufps $0xdd, %xmm8, %xmm5 movaps %xmm6, %xmm9 shufps $0x88, %xmm7, %xmm6 shufps $0xdd, %xmm9, %xmm7 movaps %xmm4, %xmm8 shufps $0x88, %xmm6, %xmm4 movaps %xmm5, %xmm9 shufps $0x22, %xmm7, %xmm5 shufps $0xdd, %xmm6, %xmm8 movaps %xmm8, %xmm6 shufps $0x77, %xmm7, %xmm9 movaps %xmm9, %xmm7 #else movaps %xmm0, -32 * SIZE(AO) movaps %xmm1, -28 * SIZE(AO) movaps %xmm2, -24 * SIZE(AO) movaps %xmm3, -20 * SIZE(AO) movaps %xmm4, -16 * SIZE(AO) movaps %xmm5, -12 * SIZE(AO) movaps %xmm6, -8 * SIZE(AO) movaps %xmm7, -4 * SIZE(AO) #endif leaq (LDC, LDC, 2), %rax movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 2 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movhps %xmm2, 2 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO1, %rax, 1) movhps %xmm3, 2 * SIZE(CO1, %rax, 1) movsd %xmm4, 0 * SIZE(CO2) movhps %xmm4, 2 * SIZE(CO2) movsd %xmm5, 0 * SIZE(CO2, LDC, 1) movhps %xmm5, 2 * SIZE(CO2, LDC, 1) movsd %xmm6, 0 * SIZE(CO2, LDC, 2) movhps %xmm6, 2 * SIZE(CO2, LDC, 2) movsd %xmm7, 0 * SIZE(CO2, %rax, 1) movhps %xmm7, 2 * SIZE(CO2, %rax, 1) #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L11 ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 8), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $8, KK #endif #ifdef RT subq $8, KK #endif subq $1, J BRANCH jg .L10 ALIGN_4 .L40: testq $4, N jle .L70 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 2), CO2 #ifndef RT leaq (C, LDC, 4), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif testq $1, M BRANCH jle .L50 #ifdef LN movq K, 
%rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #else movq B, BO #endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_3 .L62: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x55, %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm2, %xmm9 movaps -28 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -24 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x55, %xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm2, %xmm9 movaps -20 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 subq $-16 * SIZE, BO subq $ -4 * SIZE, AO subq $1, %rax BRANCH jg .L62 addps %xmm9, %xmm8 ALIGN_3 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_3 .L66: pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_3 .L68: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #endif addps %xmm2, %xmm8 #if defined(LN) || defined(LT) movaps -32 * SIZE(BO), %xmm0 subps %xmm8, %xmm0 #else movsd -32 * SIZE(AO), %xmm0 movhps -30 * SIZE(AO), %xmm0 subps %xmm8, %xmm0 pshufd $0xff, %xmm0, %xmm3 pshufd $0xaa, %xmm0, %xmm2 pshufd $0x55, %xmm0, %xmm1 pshufd $0x00, %xmm0, %xmm0 #endif #if defined(LN) || defined(LT) movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm3 movaps -28 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm3 movaps -24 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm3 movaps -20 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm3 #endif #ifdef RT movaps -20 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm0 movaps -24 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm0 movaps -28 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm0 movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm0, -32 * SIZE(BO) pshufd $0xff, %xmm0, %xmm3 pshufd $0xaa, %xmm0, %xmm2 pshufd $0x55, 
%xmm0, %xmm1 pshufd $0x00, %xmm0, %xmm0 #else unpcklps %xmm1, %xmm0 unpcklps %xmm3, %xmm2 movlps %xmm0, -32 * SIZE(AO) movlps %xmm2, -30 * SIZE(AO) #endif movss %xmm0, (CO1) movss %xmm1, (CO1, LDC, 1) movss %xmm2, (CO2) movss %xmm3, (CO2, LDC, 1) #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L50: testq $2, M BRANCH jle .L60 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movaps -32 * SIZE(BO), %xmm5 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_3 .L52: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -24 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -26 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -16 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -24 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $ -8 * SIZE, AO subq $1, %rax BRANCH jg .L52 ALIGN_3 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_3 .L56: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_3 .L58: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif addps %xmm1, %xmm8 addps %xmm2, %xmm9 #if defined(LN) || defined(LT) movaps %xmm8, %xmm4 shufps $0x88, %xmm9, %xmm8 shufps $0xdd, %xmm9, %xmm4 movaps -32 * SIZE(BO), %xmm0 movaps -28 * SIZE(BO), %xmm1 subps %xmm8, %xmm0 subps %xmm4, %xmm1 #else movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm2 subps %xmm8, %xmm0 subps %xmm9, %xmm2 movhlps %xmm0, %xmm1 movhlps %xmm2, %xmm3 #endif #ifdef LN movaps -32 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, 
%xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm3 movaps -28 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm3 movaps -24 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 movaps -20 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 #endif #ifdef RT movaps -20 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm0 movaps -24 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 movaps -28 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif leaq (LDC, LDC, 2), %rax #if defined(LN) || defined(LT) movaps %xmm0, -32 * SIZE(BO) movaps %xmm1, -28 * SIZE(BO) movaps %xmm0, %xmm4 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm4 movsd %xmm0, (CO1) movhps %xmm0, (CO1, LDC, 1) movsd %xmm4, (CO2) movhps %xmm4, (CO2, LDC, 1) #else movlhps %xmm1, %xmm0 movlhps %xmm3, %xmm2 movaps %xmm0, -32 * SIZE(AO) movaps %xmm2, -28 * SIZE(AO) movsd %xmm0, (CO1) movsd %xmm1, (CO1, LDC, 1) movsd %xmm2, (CO2) movsd %xmm3, (CO2, LDC, 1) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L60: movq M, I sarq $2, I NOBRANCH jle .L69 ALIGN_4 .L41: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht2 -4 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht2 -4 * SIZE(CO1, LDC, 1) xorps %xmm10, %xmm10 prefetcht2 -4 * SIZE(CO2) xorps %xmm11, %xmm11 prefetcht2 -4 * SIZE(CO2, LDC, 1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L45 ALIGN_3 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 
pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L42 ALIGN_3 .L45: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: addps %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L46 ALIGN_3 .L48: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif addps %xmm1, %xmm8 addps %xmm2, %xmm9 addps %xmm3, %xmm10 addps %xmm4, %xmm11 #if defined(LN) || defined(LT) movaps %xmm8, %xmm4 shufps $0x88, %xmm9, %xmm8 movaps %xmm10, %xmm5 shufps $0x88, %xmm11, %xmm10 shufps $0xdd, %xmm11, %xmm4 shufps $0xdd, %xmm9, %xmm5 movaps %xmm8, %xmm6 shufps $0x88, %xmm10, %xmm8 shufps $0xdd, %xmm6, %xmm10 movaps %xmm4, %xmm9 movaps %xmm5, %xmm11 shufps $0x22, %xmm5, %xmm9 shufps $0x77, %xmm4, %xmm11 movaps -32 * SIZE(BO), %xmm0 movaps -28 * SIZE(BO), %xmm1 movaps -24 * SIZE(BO), %xmm2 movaps -20 * SIZE(BO), %xmm3 #else movaps %xmm9, %xmm4 shufps $0xd8, %xmm8, %xmm9 shufps $0xd8, %xmm11, %xmm8 shufps $0xd8, %xmm10, %xmm11 shufps $0xd8, %xmm4, %xmm10 movaps %xmm8, %xmm4 shufps $0xd8, %xmm10, %xmm8 shufps $0xd8, %xmm4, %xmm10 movaps %xmm9, %xmm5 shufps $0xd8, %xmm11, %xmm9 shufps $0xd8, %xmm5, %xmm11 movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps -24 * SIZE(AO), %xmm2 movaps -20 * SIZE(AO), %xmm3 #endif subps %xmm8, %xmm0 subps %xmm9, %xmm1 subps %xmm10, %xmm2 subps %xmm11, %xmm3 #ifdef LN movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm0 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm3 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps 
%xmm1, %xmm15 subps %xmm15, %xmm3 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm3 movaps -28 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm3 movaps -24 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 movaps -20 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 #endif #ifdef RT movaps -20 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm0 movaps -24 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 movaps -28 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm0, -32 * SIZE(BO) movaps %xmm1, -28 * SIZE(BO) movaps %xmm2, -24 * SIZE(BO) movaps %xmm3, -20 * SIZE(BO) movaps %xmm0, %xmm8 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm8, %xmm1 movaps %xmm2, %xmm9 shufps $0x88, %xmm3, %xmm2 shufps $0xdd, %xmm9, %xmm3 movaps %xmm0, %xmm8 shufps $0x88, %xmm2, %xmm0 movaps %xmm1, %xmm9 shufps $0x22, %xmm3, %xmm1 shufps $0xdd, %xmm2, %xmm8 movaps %xmm8, %xmm2 shufps $0x77, %xmm3, %xmm9 movaps %xmm9, %xmm3 #else movaps %xmm0, -32 * SIZE(AO) movaps %xmm1, -28 * SIZE(AO) movaps %xmm2, -24 * SIZE(AO) movaps %xmm3, -20 * SIZE(AO) #endif leaq (LDC, LDC, 2), %rax movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 2 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO2) movhps %xmm2, 2 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO2, LDC, 1) movhps %xmm3, 2 * SIZE(CO2, LDC, 1) #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L41 ALIGN_4 .L69: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif ALIGN_4 .L70: testq $2, N jle .L100 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif 
#ifdef LT movq OFFSET, %rax movq %rax, KK #endif testq $1, M BRANCH jle .L80 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L95 ALIGN_3 .L92: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movsd -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x55, %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm2, %xmm9 movsd -30 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movsd -28 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x55, %xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm2, %xmm9 movsd -26 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L92 addps %xmm9, %xmm8 ALIGN_3 .L95: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L98 ALIGN_3 .L96: pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movsd -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L96 ALIGN_3 .L98: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif addps %xmm2, %xmm8 #if defined(LN) || defined(LT) movsd -32 * SIZE(BO), %xmm0 subps %xmm8, %xmm0 #else movsd -32 * SIZE(AO), %xmm0 subps %xmm8, %xmm0 #endif pshufd $0x55, %xmm0, %xmm1 pshufd $0x00, %xmm0, %xmm0 #if defined(LN) || defined(LT) movss -32 * SIZE(AO), %xmm8 mulss %xmm8, %xmm0 mulss %xmm8, %xmm1 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm1 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm1 #endif #ifdef RT movaps -32 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movss %xmm0, -32 * SIZE(BO) movss %xmm1, -31 * SIZE(BO) #else movss %xmm0, -32 * SIZE(AO) movss %xmm1, -31 * SIZE(AO) #endif movss %xmm0, (CO1) movss %xmm1, (CO2) #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L80: testq $2, M BRANCH jle .L90 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd -32 * SIZE(BO), %xmm5 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L85 ALIGN_3 .L82: addps %xmm1, %xmm8 movsd -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -30 * SIZE(AO), %xmm0 addps %xmm1, 
%xmm8 movsd -30 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movsd -28 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -26 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movsd -26 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -24 * SIZE(AO), %xmm0 subq $-8 * SIZE, BO subq $-8 * SIZE, AO subq $1, %rax BRANCH jg .L82 ALIGN_3 .L85: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L88 ALIGN_3 .L86: addps %xmm1, %xmm8 movsd -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L86 ALIGN_3 .L88: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif addps %xmm1, %xmm8 #if defined(LN) || defined(LT) pshufd $0xd8, %xmm8, %xmm8 movaps -32 * SIZE(BO), %xmm0 #else movaps -32 * SIZE(AO), %xmm0 #endif subps %xmm8, %xmm0 movhlps %xmm0, %xmm1 #ifdef LN movaps -32 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 #endif #ifdef RT movaps -32 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movlps %xmm0, -32 * SIZE(BO) movlps %xmm1, -30 * SIZE(BO) unpcklps %xmm1, %xmm0 movlps %xmm0, (CO1) movhps %xmm0, (CO2) #else movlps %xmm0, -32 * SIZE(AO) movlps %xmm1, -30 * SIZE(AO) movsd %xmm0, (CO1) movsd %xmm1, (CO2) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L90: movq M, I sarq $2, I NOBRANCH jle .L99 ALIGN_4 .L71: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd -32 * SIZE(BO), %xmm3 xorps %xmm8, %xmm8 prefetcht2 -4 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht2 -4 * SIZE(CO2) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L75 ALIGN_3 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -30 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd 
$0x55, %xmm3, %xmm2 movsd -28 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -24 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -26 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -20 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -24 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax BRANCH jg .L72 ALIGN_3 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -30 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L76 ALIGN_3 .L78: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif addps %xmm1, %xmm8 addps %xmm2, %xmm9 #if defined(LN) || defined(LT) movaps %xmm8, %xmm4 unpcklps %xmm9, %xmm8 unpckhps %xmm9, %xmm4 movaps -32 * SIZE(BO), %xmm0 movaps -28 * SIZE(BO), %xmm2 subps %xmm8, %xmm0 subps %xmm4, %xmm2 movhlps %xmm0, %xmm1 movhlps %xmm2, %xmm3 #else movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm2 subps %xmm8, %xmm0 subps %xmm9, %xmm2 #endif #ifdef LN movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm0 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm3 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm3 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm2 #endif #ifdef RT movaps -32 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movlps %xmm0, -32 * SIZE(BO) movlps %xmm1, -30 
* SIZE(BO) movlps %xmm2, -28 * SIZE(BO) movlps %xmm3, -26 * SIZE(BO) unpcklps %xmm1, %xmm0 unpcklps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(CO1) movlps %xmm2, 2 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO2) movhps %xmm2, 2 * SIZE(CO2) #else movaps %xmm0, -32 * SIZE(AO) movaps %xmm2, -28 * SIZE(AO) movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO2) movhps %xmm2, 2 * SIZE(CO2) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L71 ALIGN_4 .L99: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L100: testq $1, N jle .L999 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif testq $1, M BRANCH jle .L110 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif xorps %xmm2, %xmm2 movss -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L125 ALIGN_3 .L122: addss %xmm2, %xmm8 movss -32 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -31 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 movss -31 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -30 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 movss -30 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -29 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 movss -29 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -28 * SIZE(AO), %xmm0 subq $-4 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L122 ALIGN_3 .L125: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L128 ALIGN_3 .L126: addss %xmm2, %xmm8 movss -32 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -31 * SIZE(AO), %xmm0 addq $1 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L126 ALIGN_3 .L128: #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif addss %xmm2, %xmm8 #if defined(LN) || defined(LT) movss -32 * SIZE(BO), %xmm0 subss %xmm8, %xmm0 #else movss -32 * SIZE(AO), %xmm0 subss %xmm8, %xmm0 #endif #if defined(LN) || defined(LT) movss -32 * SIZE(AO), %xmm8 #endif #if defined(RN) || defined(RT) movaps -32 * SIZE(BO), %xmm8 #endif mulss %xmm8, %xmm0 #ifdef LN subq $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm0, -32 * SIZE(BO) #else movss %xmm0, -32 * SIZE(AO) #endif movss %xmm0, (CO1) #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L110: testq $2, M BRANCH jle .L120 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if 
defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L115 ALIGN_3 .L112: addps %xmm1, %xmm8 movss -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movss -31 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movss -30 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -26 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movss -29 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -24 * SIZE(AO), %xmm0 subq $-4 * SIZE, BO subq $-8 * SIZE, AO subq $1, %rax BRANCH jg .L112 ALIGN_3 .L115: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L118 ALIGN_3 .L116: addps %xmm1, %xmm8 movss -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L116 ALIGN_3 .L118: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif addps %xmm1, %xmm8 #if defined(LN) || defined(LT) movsd -32 * SIZE(BO), %xmm0 subps %xmm8, %xmm0 pshufd $0x55, %xmm0, %xmm1 #else movsd -32 * SIZE(AO), %xmm0 subps %xmm8, %xmm0 #endif #ifdef LN movaps -32 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm1 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm1 #endif #if defined(RN) || defined(RT) movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm1, %xmm0 movlps %xmm0, -32 * SIZE(BO) movlps %xmm0, 0 * SIZE(CO1) #else movlps %xmm0, -32 * SIZE(AO) movlps %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L120: movq M, I sarq $2, I NOBRANCH jle .L129 ALIGN_4 .L101: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 movsd -32 * SIZE(BO), %xmm3 xorps %xmm8, %xmm8 prefetcht2 -4 * SIZE(CO1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L105 ALIGN_3 .L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -31 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -30 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -24 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, 
%xmm1 movss -29 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -20 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -28 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $ -4 * SIZE, BO subq $1, %rax BRANCH jg .L102 ALIGN_3 .L105: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L108 ALIGN_3 .L106: addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -31 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L106 ALIGN_3 .L108: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #endif addps %xmm1, %xmm8 #if defined(LN) || defined(LT) movsd -32 * SIZE(BO), %xmm0 movhps -30 * SIZE(BO), %xmm0 subps %xmm8, %xmm0 pshufd $0xff, %xmm0, %xmm3 pshufd $0xaa, %xmm0, %xmm2 pshufd $0x55, %xmm0, %xmm1 #else movaps -32 * SIZE(AO), %xmm0 subps %xmm8, %xmm0 #endif #ifdef LN movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm0 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm0 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm0 movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm3 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm3 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm3 movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm3 #endif #if defined(RN) || defined(RT) movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm1, %xmm0 unpcklps %xmm3, %xmm2 movlps %xmm0, -32 * SIZE(BO) movlps %xmm2, -30 * SIZE(BO) movlps %xmm0, 0 * SIZE(CO1) movlps %xmm2, 2 * SIZE(CO1) #else movaps %xmm0, -32 * SIZE(AO) movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L101 ALIGN_4 .L129: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq 0(%rsp), %rbx movq 
8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_LN_8x4_sse.S000066400000000000000000003414461313527062700220150ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %r13 #define BO %r14 #define CO1 %r15 #define CO2 %rbp #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define ALPHA 0(%rsp) #define OFFSET 16(%rsp) #define KK 24(%rsp) #define KKK 32(%rsp) #define AORIG 40(%rsp) #define BORIG 48(%rsp) #define BUFFER 128(%rsp) #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define movsd movlps #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifndef PREFETCH #define PREFETCH prefetcht0 #endif #ifndef PREFETCHW #define PREFETCHW prefetcht0 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp EMMS movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 movaps %xmm3, %xmm0 #else movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 #endif movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movsd %xmm4, OFFSET movsd %xmm4, KK leaq (, LDC, SIZE), LDC #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $2, J # j = (n >> 2) jle .L50 .L01: /* Copying to Sub Buffer */ #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $2 + BASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L03 ALIGN_4 .L02: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 movaps 8 * SIZE(B), %xmm11 movaps 12 * SIZE(B), %xmm15 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) pshufd $0x00, %xmm11, %xmm8 pshufd $0x55, %xmm11, %xmm9 pshufd $0xaa, %xmm11, %xmm10 pshufd $0xff, %xmm11, %xmm11 pshufd $0x00, %xmm15, %xmm12 
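/*
 * This .L02 copy loop expands the packed B panel into the aligned BUFFER
 * (BO): each scalar of the 16-element tile of B loaded above is broadcast
 * into a full 4-lane SSE vector, so the micro-kernel further down can use
 * aligned movaps loads instead of per-iteration shuffles.  Roughly, one
 * pass does the following (illustrative C sketch only; the identifiers are
 * not the real symbols):
 *
 *     for (i = 0; i < 16; i++)        // 4 k-steps x 4 columns of B
 *         for (j = 0; j < 4; j++)
 *             bo[4*i + j] = b[i];     // then b += 16, bo += 64
 */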
pshufd $0x55, %xmm15, %xmm13 pshufd $0xaa, %xmm15, %xmm14 pshufd $0xff, %xmm15, %xmm15 movaps %xmm8, 32 * SIZE(BO) movaps %xmm9, 36 * SIZE(BO) movaps %xmm10, 40 * SIZE(BO) movaps %xmm11, 44 * SIZE(BO) movaps %xmm12, 48 * SIZE(BO) movaps %xmm13, 52 * SIZE(BO) movaps %xmm14, 56 * SIZE(BO) movaps %xmm15, 60 * SIZE(BO) addq $16 * SIZE, B addq $64 * SIZE, BO decq %rax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movaps 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) addq $ 4 * SIZE, B addq $16 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L10: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 4), C #endif testq $1, M je .L20 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (AO, %rax, SIZE), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 movss 32 * SIZE(BO), %xmm13 movss 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L45 ALIGN_4 .L42: mulss %xmm8, %xmm9 addss %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 4 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 addss %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 addss %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addss %xmm9, %xmm3 movss 64 * SIZE(BO), %xmm9 mulss %xmm8, %xmm11 addss %xmm11, %xmm0 movss 20 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 addss %xmm11, %xmm1 movss 24 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 addss %xmm11, %xmm2 movss 28 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 movss 2 * SIZE(AO), %xmm8 addss %xmm11, %xmm3 movss 80 * SIZE(BO), %xmm11 mulss %xmm8, %xmm13 addss %xmm13, %xmm0 movss 36 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 addss %xmm13, %xmm1 movss 40 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 addss %xmm13, %xmm2 movss 44 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 movss 3 * SIZE(AO), %xmm8 addss %xmm13, %xmm3 movss 96 * SIZE(BO), %xmm13 mulss %xmm8, %xmm15 addss %xmm15, %xmm0 movss 52 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 addss %xmm15, %xmm1 movss 56 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 addss %xmm15, %xmm2 movss 60 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 movss 8 * SIZE(AO), %xmm8 addss %xmm15, %xmm3 movss 112 * SIZE(BO), %xmm15 mulss %xmm10, %xmm9 addss %xmm9, %xmm0 movss 68 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 addss %xmm9, %xmm1 movss 72 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 addss %xmm9, %xmm2 movss 76 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 movss 5 * SIZE(AO), %xmm10 addss %xmm9, %xmm3 movss 128 * SIZE(BO), %xmm9 mulss %xmm10, %xmm11 addss %xmm11, %xmm0 movss 84 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 addss %xmm11, %xmm1 movss 88 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 addss %xmm11, %xmm2 movss 92 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 movss 6 * SIZE(AO), %xmm10 addss %xmm11, 
%xmm3 movss 144 * SIZE(BO), %xmm11 mulss %xmm10, %xmm13 addss %xmm13, %xmm0 movss 100 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 addss %xmm13, %xmm1 movss 104 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 addss %xmm13, %xmm2 movss 108 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 movss 7 * SIZE(AO), %xmm10 addss %xmm13, %xmm3 movss 160 * SIZE(BO), %xmm13 mulss %xmm10, %xmm15 addss %xmm15, %xmm0 movss 116 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 addss %xmm15, %xmm1 movss 120 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 addss %xmm15, %xmm2 movss 124 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 movss 12 * SIZE(AO), %xmm10 addss %xmm15, %xmm3 movss 176 * SIZE(BO), %xmm15 addq $ 8 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L48 ALIGN_4 .L46: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movss 16 * SIZE(BO), %xmm9 addq $ 1 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L46 ALIGN_4 .L48: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) unpcklps %xmm2, %xmm0 unpcklps %xmm3, %xmm1 unpcklps %xmm1, %xmm0 movapd 0 * SIZE(B), %xmm1 subps %xmm0, %xmm1 #else movss 0 * SIZE(AO), %xmm8 movss 1 * SIZE(AO), %xmm10 movss 2 * SIZE(AO), %xmm12 movss 3 * SIZE(AO), %xmm14 subss %xmm0, %xmm8 subss %xmm1, %xmm10 subss %xmm2, %xmm12 subss %xmm3, %xmm14 #endif #if defined(LN) || defined(LT) movss 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulss %xmm2, %xmm8 pshufd $0x55, %xmm0, %xmm2 mulss %xmm8, %xmm2 subss %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulss %xmm8, %xmm2 subss %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulss %xmm8, %xmm2 subss %xmm2, %xmm14 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulss %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulss %xmm10, %xmm2 subss %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulss %xmm10, %xmm2 subss %xmm2, %xmm14 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulss %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulss %xmm12, %xmm2 subss %xmm2, %xmm14 movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulss %xmm2, %xmm14 #endif #ifdef RT movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulss %xmm2, %xmm14 pshufd $0xaa, %xmm0, %xmm2 mulss %xmm14, %xmm2 subss %xmm2, %xmm12 pshufd $0x55, %xmm0, %xmm2 mulss %xmm14, %xmm2 subss %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulss %xmm14, %xmm2 subss %xmm2, %xmm8 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulss %xmm2, %xmm12 pshufd $0x55, %xmm0, %xmm2 mulss %xmm12, %xmm2 subss %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulss %xmm12, %xmm2 subss %xmm2, %xmm8 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulss %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulss %xmm10, %xmm2 subss %xmm2, %xmm8 movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulss %xmm2, %xmm8 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm1, 0 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 pshufd 
$0xaa, %xmm1, %xmm4 pshufd $0xff, %xmm1, %xmm6 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) movaps %xmm4, 8 * SIZE(BO) movaps %xmm6, 12 * SIZE(BO) #else movss %xmm8, 0 * SIZE(AO) movss %xmm10, 1 * SIZE(AO) movss %xmm12, 2 * SIZE(AO) movss %xmm14, 3 * SIZE(AO) #endif #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm10, %xmm1 unpckhps %xmm10, %xmm0 movaps %xmm5, %xmm7 unpcklps %xmm11, %xmm5 unpckhps %xmm11, %xmm7 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 movaps %xmm0, %xmm11 unpcklps %xmm7, %xmm0 unpckhps %xmm7, %xmm11 movss %xmm1, 0 * SIZE(CO1) movss %xmm10, 0 * SIZE(CO2) movss %xmm0, 0 * SIZE(CO1, LDC, 2) movss %xmm11, 0 * SIZE(CO2, LDC, 2) #else movss %xmm8, 0 * SIZE(CO1) movss %xmm10, 0 * SIZE(CO2) movss %xmm12, 0 * SIZE(CO1, LDC, 2) movss %xmm14, 0 * SIZE(CO2, LDC, 2) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L20: testq $2, M je .L30 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 8 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L35 ALIGN_4 .L32: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movaps 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd 4 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movaps 80 * SIZE(BO), %xmm11 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm2 movaps 44 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 movsd 6 * SIZE(AO), %xmm8 addps %xmm13, %xmm3 movaps 96 * SIZE(BO), %xmm13 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm2 movaps 60 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 movsd 16 * SIZE(AO), %xmm8 addps %xmm15, %xmm3 movaps 112 * SIZE(BO), %xmm15 mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movaps 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movaps 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm2 movaps 76 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movsd 10 * SIZE(AO), %xmm10 addps %xmm9, %xmm3 movaps 128 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movaps 84 * SIZE(BO), 
%xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movaps 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm2 movaps 92 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd 12 * SIZE(AO), %xmm10 addps %xmm11, %xmm3 movaps 144 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movaps 104 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movaps 108 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd 14 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movaps 160 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movaps 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movaps 124 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd 24 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movaps 176 * SIZE(BO), %xmm15 addq $ 16 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 16 * SIZE(BO), %xmm9 addq $ 2 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $1 + BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) unpcklps %xmm2, %xmm0 unpcklps %xmm3, %xmm1 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movapd 0 * SIZE(B), %xmm1 movapd 4 * SIZE(B), %xmm5 subps %xmm0, %xmm1 subps %xmm2, %xmm5 #else #ifdef movsd xorps %xmm8, %xmm8 #endif movsd 0 * SIZE(AO), %xmm8 #ifdef movsd xorps %xmm10, %xmm10 #endif movsd 2 * SIZE(AO), %xmm10 #ifdef movsd xorps %xmm12, %xmm12 #endif movsd 4 * SIZE(AO), %xmm12 #ifdef movsd xorps %xmm14, %xmm14 #endif movsd 6 * SIZE(AO), %xmm14 subps %xmm0, %xmm8 subps %xmm1, %xmm10 subps %xmm2, %xmm12 subps %xmm3, %xmm14 #endif #ifdef LN movaps 0 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm1 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm5 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm5 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 pshufd $0x55, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm14 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulps %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm14 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm14 movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm14 #endif #ifdef RT movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm14 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm14, %xmm2 
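/*
 * RT variant of the solve for this 2-row strip: the packed 4x4 diagonal
 * block of B is walked from its last row (12 * SIZE(B)) back to the first,
 * scaling the current right-hand side by the packed diagonal entry and
 * then eliminating it from the rows not yet solved.
 */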
subps %xmm2, %xmm12 pshufd $0x55, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm8 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm2, %xmm12 pshufd $0x55, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm8 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulps %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm8 movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm1, 0 * SIZE(B) movaps %xmm5, 4 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 pshufd $0xaa, %xmm1, %xmm4 pshufd $0xff, %xmm1, %xmm6 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) movaps %xmm4, 8 * SIZE(BO) movaps %xmm6, 12 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 pshufd $0x55, %xmm5, %xmm3 pshufd $0xaa, %xmm5, %xmm4 pshufd $0xff, %xmm5, %xmm6 movaps %xmm2, 16 * SIZE(BO) movaps %xmm3, 20 * SIZE(BO) movaps %xmm4, 24 * SIZE(BO) movaps %xmm6, 28 * SIZE(BO) #else movlps %xmm8, 0 * SIZE(AO) movlps %xmm10, 2 * SIZE(AO) movlps %xmm12, 4 * SIZE(AO) movlps %xmm14, 6 * SIZE(AO) #endif #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm10, %xmm1 unpckhps %xmm10, %xmm0 movaps %xmm5, %xmm7 unpcklps %xmm11, %xmm5 unpckhps %xmm11, %xmm7 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 movaps %xmm0, %xmm11 unpcklps %xmm7, %xmm0 unpckhps %xmm7, %xmm11 movlps %xmm1, 0 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO2) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movlps %xmm11, 0 * SIZE(CO2, LDC, 2) #else movlps %xmm8, 0 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO2) movlps %xmm12, 0 * SIZE(CO1, LDC, 2) movlps %xmm14, 0 * SIZE(CO2, LDC, 2) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: testq $4, M je .L40 #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps 8 * SIZE(AO), 
%xmm8 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 mulps 44 * SIZE(BO), %xmm8 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm8, %xmm3 movaps 12 * SIZE(AO), %xmm8 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 mulps 60 * SIZE(BO), %xmm8 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm8, %xmm3 movaps 32 * SIZE(AO), %xmm8 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movaps 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movaps 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 mulps 76 * SIZE(BO), %xmm10 addps %xmm9, %xmm2 movaps 128 * SIZE(BO), %xmm9 addps %xmm10, %xmm3 movaps 20 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movaps 84 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movaps 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 mulps 92 * SIZE(BO), %xmm10 addps %xmm11, %xmm2 movaps 144 * SIZE(BO), %xmm11 addps %xmm10, %xmm3 movaps 24 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movaps 104 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 mulps 108 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 160 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps 28 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movaps 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 mulps 124 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 176 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 48 * SIZE(AO), %xmm10 addq $ 32 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 16 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 4 * SIZE(AO), %xmm8 addq $ 4 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $2 + BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm8 unpcklps %xmm2, %xmm0 unpckhps %xmm2, %xmm8 movaps %xmm1, %xmm14 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm14 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movaps %xmm8, %xmm3 unpcklps %xmm14, %xmm8 unpckhps %xmm14, %xmm3 movaps 0 * SIZE(B), %xmm1 movaps 4 * SIZE(B), %xmm5 movaps 8 * SIZE(B), %xmm10 movaps 12 * SIZE(B), %xmm11 subps %xmm0, %xmm1 subps %xmm2, %xmm5 subps %xmm8, %xmm10 subps %xmm3, %xmm11 #else movaps 0 * SIZE(AO), %xmm8 movaps 4 * SIZE(AO), %xmm10 movaps 8 * SIZE(AO), %xmm12 movaps 12 * SIZE(AO), %xmm14 subps %xmm0, %xmm8 subps %xmm1, %xmm10 subps %xmm2, %xmm12 subps %xmm3, %xmm14 #endif #ifdef LN movaps 12 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, 
%xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm1 movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm11 movaps 4 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm11 movaps 8 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 pshufd $0x55, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm14 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulps %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm14 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm14 movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm14 #endif #ifdef RT movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm14 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm12 pshufd $0x55, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm8 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm2, %xmm12 pshufd $0x55, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm8 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulps %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm8 movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm1, 0 * SIZE(B) movaps %xmm5, 4 * SIZE(B) movaps %xmm10, 8 * SIZE(B) movaps %xmm11, 12 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 pshufd $0xaa, %xmm1, %xmm4 pshufd $0xff, %xmm1, %xmm6 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) movaps %xmm4, 8 * SIZE(BO) movaps %xmm6, 12 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 pshufd $0x55, %xmm5, %xmm3 pshufd $0xaa, %xmm5, %xmm4 pshufd $0xff, %xmm5, %xmm6 movaps %xmm2, 16 * SIZE(BO) movaps %xmm3, 20 * SIZE(BO) movaps %xmm4, 24 * SIZE(BO) movaps %xmm6, 28 * SIZE(BO) pshufd $0x00, %xmm10, %xmm2 pshufd $0x55, %xmm10, %xmm3 pshufd $0xaa, %xmm10, %xmm4 pshufd $0xff, %xmm10, %xmm6 movaps %xmm2, 32 * SIZE(BO) movaps %xmm3, 36 * SIZE(BO) movaps %xmm4, 40 * SIZE(BO) movaps %xmm6, 44 * SIZE(BO) pshufd $0x00, %xmm11, %xmm2 pshufd $0x55, %xmm11, %xmm3 pshufd $0xaa, %xmm11, %xmm4 pshufd $0xff, 
%xmm11, %xmm6 movaps %xmm2, 48 * SIZE(BO) movaps %xmm3, 52 * SIZE(BO) movaps %xmm4, 56 * SIZE(BO) movaps %xmm6, 60 * SIZE(BO) #else movaps %xmm8, 0 * SIZE(AO) movaps %xmm10, 4 * SIZE(AO) movaps %xmm12, 8 * SIZE(AO) movaps %xmm14, 12 * SIZE(AO) #endif #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm10, %xmm1 unpckhps %xmm10, %xmm0 movaps %xmm5, %xmm7 unpcklps %xmm11, %xmm5 unpckhps %xmm11, %xmm7 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 movaps %xmm0, %xmm11 unpcklps %xmm7, %xmm0 unpckhps %xmm7, %xmm11 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movhps %xmm0, 2 * SIZE(CO1, LDC, 2) movlps %xmm11, 0 * SIZE(CO2, LDC, 2) movhps %xmm11, 2 * SIZE(CO2, LDC, 2) #else movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) movlps %xmm12, 0 * SIZE(CO1, LDC, 2) movhps %xmm12, 2 * SIZE(CO1, LDC, 2) movlps %xmm14, 0 * SIZE(CO2, LDC, 2) movhps %xmm14, 2 * SIZE(CO2, LDC, 2) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $16 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L40: movq M, I sarq $3, I # i = (m >> 3) jle .L49 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $3 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(BO), %xmm9 movaps 4 * SIZE(BO), %xmm11 movaps 8 * SIZE(BO), %xmm13 movaps 16 * SIZE(BO), %xmm15 movaps 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movaps 4 * SIZE(AO), %xmm10 pxor %xmm1, %xmm1 movaps 8 * SIZE(AO), %xmm12 pxor %xmm2, %xmm2 movaps 12 * SIZE(AO), %xmm14 pxor %xmm3, %xmm3 PREFETCHW -8 * SIZE(CO1) pxor %xmm4, %xmm4 PREFETCHW -8 * SIZE(CO2) pxor %xmm5, %xmm5 PREFETCHW -8 * SIZE(CO1, LDC, 2) pxor %xmm6, %xmm6 PREFETCHW -8 * SIZE(CO2, LDC, 2) pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L15 ALIGN_4 .L12: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 4 * SIZE(BO), %xmm11 mulps %xmm8, %xmm13 mulps 12 * SIZE(BO), %xmm8 addps %xmm13, %xmm2 movaps 8 * SIZE(BO), %xmm13 addps %xmm8, %xmm3 movaps 16 * SIZE(AO), %xmm8 mulps %xmm10, %xmm9 addps %xmm9, %xmm4 movaps 32 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 addps %xmm11, %xmm5 movaps 20 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 mulps 12 * SIZE(BO), %xmm10 addps %xmm13, %xmm6 movaps 24 * SIZE(BO), %xmm13 addps %xmm10, %xmm7 movaps 20 * SIZE(AO), %xmm10 mulps %xmm12, %xmm15 addps %xmm15, %xmm0 movaps 16 * SIZE(BO), %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm1 movaps 20 * SIZE(BO), %xmm11 mulps %xmm12, %xmm13 mulps 28 * SIZE(BO), %xmm12 addps %xmm13, %xmm2 movaps 24 * SIZE(BO), %xmm13 addps %xmm12, %xmm3 movaps 24 * SIZE(AO), %xmm12 mulps %xmm14, %xmm15 addps %xmm15, %xmm4 movaps 48 * SIZE(BO), %xmm15 mulps %xmm14, %xmm11 addps %xmm11, %xmm5 movaps 36 * SIZE(BO), %xmm11 mulps %xmm14, %xmm13 mulps 28 * SIZE(BO), %xmm14 addps %xmm13, %xmm6 movaps 40 * SIZE(BO), %xmm13 addps %xmm14, %xmm7 movaps 28 * 
SIZE(AO), %xmm14 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 32 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 36 * SIZE(BO), %xmm11 mulps %xmm8, %xmm13 mulps 44 * SIZE(BO), %xmm8 addps %xmm13, %xmm2 movaps 40 * SIZE(BO), %xmm13 addps %xmm8, %xmm3 movaps 32 * SIZE(AO), %xmm8 mulps %xmm10, %xmm9 addps %xmm9, %xmm4 movaps 64 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 addps %xmm11, %xmm5 movaps 52 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 mulps 44 * SIZE(BO), %xmm10 addps %xmm13, %xmm6 movaps 56 * SIZE(BO), %xmm13 addps %xmm10, %xmm7 movaps 36 * SIZE(AO), %xmm10 mulps %xmm12, %xmm15 addps %xmm15, %xmm0 movaps 48 * SIZE(BO), %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm1 movaps 52 * SIZE(BO), %xmm11 mulps %xmm12, %xmm13 mulps 60 * SIZE(BO), %xmm12 addps %xmm13, %xmm2 movaps 56 * SIZE(BO), %xmm13 addps %xmm12, %xmm3 movaps 40 * SIZE(AO), %xmm12 mulps %xmm14, %xmm15 addps %xmm15, %xmm4 movaps 80 * SIZE(BO), %xmm15 mulps %xmm14, %xmm11 addps %xmm11, %xmm5 movaps 68 * SIZE(BO), %xmm11 mulps %xmm14, %xmm13 mulps 60 * SIZE(BO), %xmm14 addps %xmm13, %xmm6 movaps 72 * SIZE(BO), %xmm13 addps %xmm14, %xmm7 movaps 44 * SIZE(AO), %xmm14 addq $32 * SIZE, AO addq $64 * SIZE, BO decq %rax jg .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 8 * SIZE(AO), %xmm8 mulps %xmm10, %xmm9 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm5 movaps 8 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 mulps 12 * SIZE(BO), %xmm10 addps %xmm9, %xmm6 movaps 16 * SIZE(BO), %xmm9 addps %xmm10, %xmm7 movaps 12 * SIZE(AO), %xmm10 addq $8 * SIZE, AO addq $16 * SIZE, BO decq %rax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $8, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $2 + BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm8 unpcklps %xmm2, %xmm0 unpckhps %xmm2, %xmm8 movaps %xmm1, %xmm14 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm14 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movaps %xmm8, %xmm3 unpcklps %xmm14, %xmm8 unpckhps %xmm14, %xmm3 movaps %xmm4, %xmm9 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm9 movaps %xmm5, %xmm14 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm14 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm9, %xmm7 unpcklps %xmm14, %xmm9 unpckhps %xmm14, %xmm7 movaps 0 * SIZE(B), %xmm1 movaps 4 * SIZE(B), %xmm5 movaps 8 * SIZE(B), %xmm10 movaps 12 * SIZE(B), %xmm11 movaps 16 * SIZE(B), %xmm12 movaps 20 * SIZE(B), %xmm13 movaps 24 * SIZE(B), %xmm14 movaps 28 * SIZE(B), %xmm15 subps %xmm0, %xmm1 subps %xmm2, %xmm5 subps %xmm8, %xmm10 subps %xmm3, %xmm11 subps %xmm4, %xmm12 subps %xmm6, %xmm13 subps %xmm9, %xmm14 subps %xmm7, %xmm15 #else movaps 0 * SIZE(AO), %xmm8 movaps 4 * SIZE(AO), %xmm9 movaps 8 * SIZE(AO), %xmm10 movaps 12 * SIZE(AO), %xmm11 movaps 16 * SIZE(AO), %xmm12 movaps 20 * SIZE(AO), %xmm13 movaps 24 * SIZE(AO), %xmm14 movaps 28 * SIZE(AO), %xmm15 subps %xmm0, %xmm8 subps %xmm4, %xmm9 subps %xmm1, %xmm10 subps %xmm5, %xmm11 subps %xmm2, %xmm12 subps %xmm6, %xmm13 subps %xmm3, %xmm14 subps %xmm7, %xmm15 #endif 
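/*
 * What follows is the triangular solve for this 8-row block.  Under LN the
 * code walks the packed 8x8 diagonal block of A backwards (it starts at
 * 60 * SIZE(AO), the last row); under LT it walks forwards from
 * 0 * SIZE(AO).  Each pivot step scales one row of the right-hand side by
 * the packed diagonal entry (pshufd + mulps; the packing routines are
 * expected to have stored the reciprocal, so no division is needed here)
 * and then eliminates that row from the rows still to be solved.
 */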
#ifdef LN movaps 60 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm15 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm14 pshufd $0x55, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm12 movaps 56 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm1 movaps 52 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm14 pshufd $0x55, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm12 movaps 48 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm1 movaps 44 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm12 movaps 40 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm1 movaps 36 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm12 movaps 32 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm1 movaps 24 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm1 movaps 16 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm1 movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm11 movaps 4 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm15 movaps 8 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 
mulps %xmm5, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm15 movaps 16 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm11 movaps 20 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm15 movaps 24 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 movaps 28 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm15 movaps 36 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm12 pshufd $0x55, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm15 movaps 44 * SIZE(AO), %xmm7 pshufd $0x55, %xmm7, %xmm8 mulps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm15 movaps 52 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm14 pshufd $0xff, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm15 movaps 60 * SIZE(AO), %xmm7 pshufd $0xff, %xmm7, %xmm8 mulps %xmm8, %xmm15 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 mulps %xmm2, %xmm9 pshufd $0x55, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm10 pshufd $0x55, %xmm0, %xmm2 mulps %xmm9, %xmm2 subps %xmm2, %xmm11 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm12 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm9, %xmm2 subps %xmm2, %xmm13 pshufd $0xff, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm14 pshufd $0xff, %xmm0, %xmm2 mulps %xmm9, %xmm2 subps %xmm2, %xmm15 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulps %xmm2, %xmm10 mulps %xmm2, %xmm11 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm12 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm11, %xmm2 subps %xmm2, %xmm13 pshufd $0xff, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm14 pshufd $0xff, %xmm0, %xmm2 mulps %xmm11, %xmm2 subps %xmm2, %xmm15 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm2, %xmm12 mulps %xmm2, %xmm13 pshufd $0xff, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm14 pshufd $0xff, %xmm0, %xmm2 mulps %xmm13, %xmm2 subps %xmm2, %xmm15 movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm14 mulps %xmm2, %xmm15 #endif #ifdef RT movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm14 mulps %xmm2, %xmm15 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm12 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm15, %xmm2 subps %xmm2, %xmm13 pshufd $0x55, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm10 pshufd $0x55, %xmm0, %xmm2 mulps %xmm15, %xmm2 subps %xmm2, %xmm11 pshufd $0x00, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm8 pshufd $0x00, %xmm0, %xmm2 mulps %xmm15, %xmm2 subps %xmm2, %xmm9 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm2, %xmm12 mulps %xmm2, %xmm13 pshufd $0x55, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm10 pshufd $0x55, %xmm0, %xmm2 mulps %xmm13, %xmm2 subps %xmm2, %xmm11 pshufd $0x00, 
%xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm8 pshufd $0x00, %xmm0, %xmm2 mulps %xmm13, %xmm2 subps %xmm2, %xmm9 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulps %xmm2, %xmm10 mulps %xmm2, %xmm11 pshufd $0x00, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm8 pshufd $0x00, %xmm0, %xmm2 mulps %xmm11, %xmm2 subps %xmm2, %xmm9 movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 mulps %xmm2, %xmm9 #endif #ifdef LN subq $8 * SIZE, CO1 subq $8 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm1, 0 * SIZE(B) movaps %xmm5, 4 * SIZE(B) movaps %xmm10, 8 * SIZE(B) movaps %xmm11, 12 * SIZE(B) movaps %xmm12, 16 * SIZE(B) movaps %xmm13, 20 * SIZE(B) movaps %xmm14, 24 * SIZE(B) movaps %xmm15, 28 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 pshufd $0xaa, %xmm1, %xmm4 pshufd $0xff, %xmm1, %xmm6 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) movaps %xmm4, 8 * SIZE(BO) movaps %xmm6, 12 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 pshufd $0x55, %xmm5, %xmm3 pshufd $0xaa, %xmm5, %xmm4 pshufd $0xff, %xmm5, %xmm6 movaps %xmm2, 16 * SIZE(BO) movaps %xmm3, 20 * SIZE(BO) movaps %xmm4, 24 * SIZE(BO) movaps %xmm6, 28 * SIZE(BO) pshufd $0x00, %xmm10, %xmm2 pshufd $0x55, %xmm10, %xmm3 pshufd $0xaa, %xmm10, %xmm4 pshufd $0xff, %xmm10, %xmm6 movaps %xmm2, 32 * SIZE(BO) movaps %xmm3, 36 * SIZE(BO) movaps %xmm4, 40 * SIZE(BO) movaps %xmm6, 44 * SIZE(BO) pshufd $0x00, %xmm11, %xmm2 pshufd $0x55, %xmm11, %xmm3 pshufd $0xaa, %xmm11, %xmm4 pshufd $0xff, %xmm11, %xmm6 movaps %xmm2, 48 * SIZE(BO) movaps %xmm3, 52 * SIZE(BO) movaps %xmm4, 56 * SIZE(BO) movaps %xmm6, 60 * SIZE(BO) pshufd $0x00, %xmm12, %xmm2 pshufd $0x55, %xmm12, %xmm3 pshufd $0xaa, %xmm12, %xmm4 pshufd $0xff, %xmm12, %xmm6 movaps %xmm2, 64 * SIZE(BO) movaps %xmm3, 68 * SIZE(BO) movaps %xmm4, 72 * SIZE(BO) movaps %xmm6, 76 * SIZE(BO) pshufd $0x00, %xmm13, %xmm2 pshufd $0x55, %xmm13, %xmm3 pshufd $0xaa, %xmm13, %xmm4 pshufd $0xff, %xmm13, %xmm6 movaps %xmm2, 80 * SIZE(BO) movaps %xmm3, 84 * SIZE(BO) movaps %xmm4, 88 * SIZE(BO) movaps %xmm6, 92 * SIZE(BO) pshufd $0x00, %xmm14, %xmm2 pshufd $0x55, %xmm14, %xmm3 pshufd $0xaa, %xmm14, %xmm4 pshufd $0xff, %xmm14, %xmm6 movaps %xmm2, 96 * SIZE(BO) movaps %xmm3, 100 * SIZE(BO) movaps %xmm4, 104 * SIZE(BO) movaps %xmm6, 108 * SIZE(BO) pshufd $0x00, %xmm15, %xmm2 pshufd $0x55, %xmm15, %xmm3 pshufd $0xaa, %xmm15, %xmm4 pshufd $0xff, %xmm15, %xmm6 movaps %xmm2, 112 * SIZE(BO) movaps %xmm3, 116 * SIZE(BO) movaps %xmm4, 120 * SIZE(BO) movaps %xmm6, 124 * SIZE(BO) #else movaps %xmm8, 0 * SIZE(AO) movaps %xmm9, 4 * SIZE(AO) movaps %xmm10, 8 * SIZE(AO) movaps %xmm11, 12 * SIZE(AO) movaps %xmm12, 16 * SIZE(AO) movaps %xmm13, 20 * SIZE(AO) movaps %xmm14, 24 * SIZE(AO) movaps %xmm15, 28 * SIZE(AO) #endif #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm10, %xmm1 unpckhps %xmm10, %xmm0 movaps %xmm5, %xmm7 unpcklps %xmm11, %xmm5 unpckhps %xmm11, %xmm7 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 movaps %xmm0, %xmm11 unpcklps %xmm7, %xmm0 unpckhps %xmm7, %xmm11 movaps %xmm12, %xmm2 unpcklps %xmm14, %xmm12 unpckhps %xmm14, %xmm2 movaps %xmm13, %xmm7 unpcklps %xmm15, %xmm13 unpckhps %xmm15, %xmm7 movaps %xmm12, %xmm14 unpcklps %xmm13, %xmm12 unpckhps %xmm13, %xmm14 movaps %xmm2, %xmm15 unpcklps %xmm7, %xmm2 unpckhps %xmm7, %xmm15 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) movlps %xmm14, 4 * SIZE(CO2) movhps 
%xmm14, 6 * SIZE(CO2) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movhps %xmm0, 2 * SIZE(CO1, LDC, 2) movlps %xmm2, 4 * SIZE(CO1, LDC, 2) movhps %xmm2, 6 * SIZE(CO1, LDC, 2) movlps %xmm11, 0 * SIZE(CO2, LDC, 2) movhps %xmm11, 2 * SIZE(CO2, LDC, 2) movlps %xmm15, 4 * SIZE(CO2, LDC, 2) movhps %xmm15, 6 * SIZE(CO2, LDC, 2) #else movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm9, 4 * SIZE(CO1) movhps %xmm9, 6 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) movlps %xmm11, 4 * SIZE(CO2) movhps %xmm11, 6 * SIZE(CO2) movlps %xmm12, 0 * SIZE(CO1, LDC, 2) movhps %xmm12, 2 * SIZE(CO1, LDC, 2) movlps %xmm13, 4 * SIZE(CO1, LDC, 2) movhps %xmm13, 6 * SIZE(CO1, LDC, 2) movlps %xmm14, 0 * SIZE(CO2, LDC, 2) movhps %xmm14, 2 * SIZE(CO2, LDC, 2) movlps %xmm15, 4 * SIZE(CO2, LDC, 2) movhps %xmm15, 6 * SIZE(CO2, LDC, 2) #endif #ifndef LN addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 8), AO #ifdef LT addq $32 * SIZE, B #endif #endif #ifdef LN subq $8, KK movq BORIG, B #endif #ifdef LT addq $8, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $3 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L11 ALIGN_4 .L49: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 4), B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif decq J # j -- jg .L01 .L50: testq $2, N je .L100 #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $1 + BASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L53 ALIGN_4 .L52: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L53: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L60 ALIGN_4 .L54: movsd 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) addq $2 * SIZE, B addq $8 * SIZE, BO decq %rax jne .L54 ALIGN_4 .L60: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c #ifndef RT leaq (C, LDC, 2), C #endif testq $1, M je .L70 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (AO, %rax, SIZE), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 movss 32 * SIZE(BO), %xmm13 movss 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor 
%xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L95 ALIGN_4 .L92: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movss 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movss 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movss 3 * SIZE(AO), %xmm8 addps %xmm11, %xmm1 movss 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movss 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movss 8 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movss 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movss 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movss 5 * SIZE(AO), %xmm10 addps %xmm13, %xmm1 movss 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movss 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movss 6 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movss 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movss 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movss 7 * SIZE(AO), %xmm10 addps %xmm15, %xmm1 movss 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movss 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movss 12 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movss 112 * SIZE(BO), %xmm15 addq $ 8 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L92 ALIGN_4 .L95: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L96 ALIGN_4 .L98: addss %xmm2, %xmm0 addss %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) unpcklps %xmm1, %xmm0 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(B), %xmm1 subps %xmm0, %xmm1 #else movss 0 * SIZE(AO), %xmm8 movss 1 * SIZE(AO), %xmm10 subss %xmm0, %xmm8 subss %xmm1, %xmm10 #endif #if defined(LN) || defined(LT) movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulss %xmm2, %xmm8 pshufd $0x55, %xmm0, %xmm2 mulss %xmm8, %xmm2 subss %xmm2, %xmm10 pshufd $0xff, %xmm0, %xmm2 mulss %xmm2, %xmm10 #endif #ifdef RT movaps 0 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulss %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulss %xmm10, %xmm2 subss %xmm2, %xmm8 pshufd $0x00, %xmm0, %xmm2 mulss %xmm2, %xmm8 #endif #ifdef LN subq $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm1, 0 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) #else movss %xmm8, 0 * SIZE(AO) movss %xmm10, 1 * SIZE(AO) #endif #if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 movss %xmm1, 0 * SIZE(CO1) movss %xmm10, 0 * SIZE(CO1, LDC, 1) #else movss %xmm8, 
0 * SIZE(CO1) movss %xmm10, 0 * SIZE(CO1, LDC, 1) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (AO, %rax, SIZE), AO #ifdef LT addq $ 2 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L70: testq $2, M je .L80 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 8 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L85 ALIGN_4 .L82: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd 6 * SIZE(AO), %xmm8 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movaps 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd 16 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movaps 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd 10 * SIZE(AO), %xmm10 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movaps 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd 12 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movaps 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd 14 * SIZE(AO), %xmm10 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movaps 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd 24 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movaps 112 * SIZE(BO), %xmm15 addq $16 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L82 ALIGN_4 .L85: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L88 ALIGN_4 .L86: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L86 ALIGN_4 .L88: addps %xmm2, %xmm0 addps %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $1 + BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) unpcklps %xmm2, %xmm0 unpcklps %xmm3, %xmm1 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(B), %xmm1 #ifdef movsd xorps %xmm5, %xmm5 #endif movsd 2 * SIZE(B), %xmm5 subps %xmm0, %xmm1 subps %xmm2, %xmm5 #else 
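/*
 * Note: for LN/LT the accumulated 2x2 products were just subtracted from the
 * packed B values above; this #else branch appears to do the analogous step
 * for RN/RT, reloading the tile from the packed A buffer (AO) and subtracting
 * the partial sums held in xmm0/xmm1 before the per-case solve code below.
 * (Orientation comment only; behavior inferred from the surrounding
 * LN/LT/RN/RT conditionals.)
 */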
#ifdef movsd xorps %xmm8, %xmm8 #endif movsd 0 * SIZE(AO), %xmm8 #ifdef movsd xorps %xmm10, %xmm10 #endif movsd 2 * SIZE(AO), %xmm10 subps %xmm0, %xmm8 subps %xmm1, %xmm10 #endif #ifdef LN movaps 0 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm1 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm5 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm5 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 pshufd $0x55, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm10 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm10 #endif #ifdef RT movaps 0 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm8 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm1, 0 * SIZE(B) movlps %xmm5, 2 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 pshufd $0x55, %xmm5, %xmm3 movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) #else movlps %xmm8, 0 * SIZE(AO) movlps %xmm10, 2 * SIZE(AO) #endif #if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 movlps %xmm1, 0 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO1, LDC, 1) #else movlps %xmm8, 0 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO1, LDC, 1) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $ 4 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L80: testq $4, M je .L90 #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulps %xmm8, %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 8 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 20 * SIZE(BO), %xmm8 addps %xmm11, %xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm8, %xmm1 movaps 12 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps 32 * SIZE(AO), %xmm8 mulps %xmm10, %xmm13 mulps 36 * SIZE(BO), %xmm10 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm10, %xmm1 movaps 20 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 
mulps 44 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps 24 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 52 * SIZE(BO), %xmm10 addps %xmm15, %xmm0 movaps 56 * SIZE(BO), %xmm15 addps %xmm10, %xmm1 movaps 28 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 60 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 48 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L76 ALIGN_4 .L78: addps %xmm2, %xmm0 addps %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $1 + BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm8 unpcklps %xmm2, %xmm0 unpckhps %xmm2, %xmm8 movaps %xmm1, %xmm14 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm14 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movaps %xmm8, %xmm3 unpcklps %xmm14, %xmm8 unpckhps %xmm14, %xmm3 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(B), %xmm1 #ifdef movsd xorps %xmm5, %xmm5 #endif movsd 2 * SIZE(B), %xmm5 #ifdef movsd xorps %xmm10, %xmm10 #endif movsd 4 * SIZE(B), %xmm10 #ifdef movsd xorps %xmm11, %xmm11 #endif movsd 6 * SIZE(B), %xmm11 subps %xmm0, %xmm1 subps %xmm2, %xmm5 subps %xmm8, %xmm10 subps %xmm3, %xmm11 #else movaps 0 * SIZE(AO), %xmm8 movaps 4 * SIZE(AO), %xmm10 subps %xmm0, %xmm8 subps %xmm1, %xmm10 #endif #ifdef LN movaps 12 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm1 movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm11 movaps 4 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm11 movaps 8 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 pshufd $0x55, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm10 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm10 #endif #ifdef RT movaps 0 * SIZE(B), %xmm0 pshufd 
$0xff, %xmm0, %xmm2 mulps %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm8 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm1, 0 * SIZE(B) movlps %xmm5, 2 * SIZE(B) movlps %xmm10, 4 * SIZE(B) movlps %xmm11, 6 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 pshufd $0x55, %xmm5, %xmm3 movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) pshufd $0x00, %xmm10, %xmm2 pshufd $0x55, %xmm10, %xmm3 movaps %xmm2, 16 * SIZE(BO) movaps %xmm3, 20 * SIZE(BO) pshufd $0x00, %xmm11, %xmm2 pshufd $0x55, %xmm11, %xmm3 movaps %xmm2, 24 * SIZE(BO) movaps %xmm3, 28 * SIZE(BO) #else movaps %xmm8, 0 * SIZE(AO) movaps %xmm10, 4 * SIZE(AO) #endif #if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO1, LDC, 1) movhps %xmm10, 2 * SIZE(CO1, LDC, 1) #else movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO1, LDC, 1) movhps %xmm10, 2 * SIZE(CO1, LDC, 1) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $ 8 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L90: movq M, I sarq $3, I # i = (m >> 3) jle .L99 ALIGN_4 .L61: #ifdef LN movq K, %rax salq $3 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 32 * SIZE(AO), %xmm12 movaps 48 * SIZE(AO), %xmm14 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 PREFETCHW -8 * SIZE(CO1) pxor %xmm4, %xmm4 PREFETCHW -8 * SIZE(CO2) pxor %xmm5, %xmm5 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulps %xmm8, %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 8 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 12 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 64 * SIZE(AO), %xmm8 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 16 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps 20 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps 24 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, 
%xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps 28 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 80 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps 80 * SIZE(AO), %xmm10 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 32 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 36 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm5 movaps 40 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 44 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 addps %xmm12, %xmm5 movaps 96 * SIZE(BO), %xmm13 movaps 96 * SIZE(AO), %xmm12 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 48 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 52 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 56 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 60 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 movaps 112 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 112 * SIZE(AO), %xmm14 addq $64 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 8 * SIZE(AO), %xmm8 addq $8 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L68: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $8, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $1 + BASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm8 unpcklps %xmm2, %xmm0 unpckhps %xmm2, %xmm8 movaps %xmm1, %xmm14 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm14 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movaps %xmm8, %xmm3 unpcklps %xmm14, %xmm8 unpckhps %xmm14, %xmm3 movaps %xmm4, %xmm9 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm9 movaps %xmm5, %xmm14 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm14 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm9, %xmm7 unpcklps %xmm14, %xmm9 unpckhps %xmm14, %xmm7 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(B), %xmm1 #ifdef movsd xorps %xmm5, %xmm5 #endif movsd 2 * SIZE(B), %xmm5 #ifdef movsd xorps %xmm10, %xmm10 #endif movsd 4 * SIZE(B), %xmm10 #ifdef movsd xorps %xmm11, %xmm11 #endif movsd 6 * SIZE(B), %xmm11 #ifdef movsd xorps %xmm12, %xmm12 #endif movsd 8 * SIZE(B), %xmm12 #ifdef movsd xorps %xmm13, %xmm13 #endif movsd 10 * SIZE(B), %xmm13 #ifdef movsd xorps %xmm14, %xmm14 #endif movsd 12 * SIZE(B), %xmm14 #ifdef movsd xorps %xmm15, %xmm15 #endif movsd 14 * SIZE(B), %xmm15 subps %xmm0, %xmm1 subps %xmm2, %xmm5 
subps %xmm8, %xmm10 subps %xmm3, %xmm11 subps %xmm4, %xmm12 subps %xmm6, %xmm13 subps %xmm9, %xmm14 subps %xmm7, %xmm15 #else movaps 0 * SIZE(AO), %xmm8 movaps 4 * SIZE(AO), %xmm9 movaps 8 * SIZE(AO), %xmm10 movaps 12 * SIZE(AO), %xmm11 subps %xmm0, %xmm8 subps %xmm4, %xmm9 subps %xmm1, %xmm10 subps %xmm5, %xmm11 #endif #ifdef LN movaps 60 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm15 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm14 pshufd $0x55, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm12 movaps 56 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm1 movaps 52 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm14 pshufd $0x55, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm12 movaps 48 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm1 movaps 44 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm12 movaps 40 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm1 movaps 36 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm12 movaps 32 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm1 movaps 24 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm1 movaps 16 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm1 movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm11 movaps 4 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm15 movaps 8 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 
pshufd $0xaa, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm15 movaps 16 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm11 movaps 20 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm15 movaps 24 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 movaps 28 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm15 movaps 36 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm12 pshufd $0x55, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm15 movaps 44 * SIZE(AO), %xmm7 pshufd $0x55, %xmm7, %xmm8 mulps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm15 movaps 52 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm14 pshufd $0xff, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm15 movaps 60 * SIZE(AO), %xmm7 pshufd $0xff, %xmm7, %xmm8 mulps %xmm8, %xmm15 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 mulps %xmm2, %xmm9 pshufd $0x55, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm10 pshufd $0x55, %xmm0, %xmm2 mulps %xmm9, %xmm2 subps %xmm2, %xmm11 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm10 mulps %xmm2, %xmm11 #endif #ifdef RT movaps 0 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm10 mulps %xmm2, %xmm11 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm8 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm11, %xmm2 subps %xmm2, %xmm9 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 mulps %xmm2, %xmm9 #endif #ifdef LN subq $8 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm1, 0 * SIZE(B) movlps %xmm5, 2 * SIZE(B) movlps %xmm10, 4 * SIZE(B) movlps %xmm11, 6 * SIZE(B) movlps %xmm12, 8 * SIZE(B) movlps %xmm13, 10 * SIZE(B) movlps %xmm14, 12 * SIZE(B) movlps %xmm15, 14 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 pshufd $0x55, %xmm5, %xmm3 movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) pshufd $0x00, %xmm10, %xmm2 pshufd $0x55, %xmm10, %xmm3 movaps %xmm2, 16 * SIZE(BO) movaps %xmm3, 20 * SIZE(BO) pshufd $0x00, %xmm11, %xmm2 pshufd $0x55, %xmm11, %xmm3 movaps %xmm2, 24 * SIZE(BO) movaps %xmm3, 28 * SIZE(BO) pshufd $0x00, %xmm12, %xmm2 pshufd $0x55, %xmm12, %xmm3 movaps %xmm2, 32 * SIZE(BO) movaps %xmm3, 36 * SIZE(BO) pshufd $0x00, %xmm13, %xmm2 pshufd $0x55, %xmm13, %xmm3 movaps %xmm2, 40 * SIZE(BO) movaps %xmm3, 44 * SIZE(BO) pshufd $0x00, %xmm14, %xmm2 pshufd $0x55, 
%xmm14, %xmm3 movaps %xmm2, 48 * SIZE(BO) movaps %xmm3, 52 * SIZE(BO) pshufd $0x00, %xmm15, %xmm2 pshufd $0x55, %xmm15, %xmm3 movaps %xmm2, 56 * SIZE(BO) movaps %xmm3, 60 * SIZE(BO) #else movaps %xmm8, 0 * SIZE(AO) movaps %xmm9, 4 * SIZE(AO) movaps %xmm10, 8 * SIZE(AO) movaps %xmm11, 12 * SIZE(AO) #endif #if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 unpcklps %xmm14, %xmm12 unpcklps %xmm15, %xmm13 movaps %xmm12, %xmm14 unpcklps %xmm13, %xmm12 unpckhps %xmm13, %xmm14 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO1, LDC, 1) movhps %xmm10, 2 * SIZE(CO1, LDC, 1) movlps %xmm14, 4 * SIZE(CO1, LDC, 1) movhps %xmm14, 6 * SIZE(CO1, LDC, 1) #else movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm9, 4 * SIZE(CO1) movhps %xmm9, 6 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO1, LDC, 1) movhps %xmm10, 2 * SIZE(CO1, LDC, 1) movlps %xmm11, 4 * SIZE(CO1, LDC, 1) movhps %xmm11, 6 * SIZE(CO1, LDC, 1) #endif #ifndef LN addq $8 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 8), AO #ifdef LT addq $16 * SIZE, B #endif #endif #ifdef LN subq $8, KK movq BORIG, B #endif #ifdef LT addq $8, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $3 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L61 ALIGN_4 .L99: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 2), B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L100: testq $1, N je .L999 #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $BASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax jle .L103 ALIGN_4 .L102: movsd 0 * SIZE(B), %xmm3 movhps 2 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm7 movhps 6 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L102 ALIGN_4 .L103: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax BRANCH jle .L110 ALIGN_4 .L104: movss 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 movaps %xmm0, 0 * SIZE(BO) addq $ 1 * SIZE, B addq $ 4 * SIZE, BO decq %rax jne .L104 ALIGN_4 .L110: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT subq LDC, C #endif movq C, CO1 # coffset1 = c #ifndef RT addq LDC, C #endif testq $1, M je .L120 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (AO, %rax, SIZE), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif 
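/*
 * 1x1 remainder tile (M & 1) of the single-column (N & 1) sweep: the loop
 * below accumulates the scalar dot product of the leftover A row with the
 * expanded B column into xmm0-xmm3 (8 k-steps per iteration), reduces the
 * partial sums at .L148, subtracts the result from the packed B (or A)
 * value, scales by the diagonal entry for the 1x1 solve, and writes the
 * result back to the packed buffer and to C (CO1).
 * (Orientation comment only; description inferred from the surrounding
 * LN/LT/RN/RT conditionals.)
 */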
movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L145 ALIGN_4 .L142: mulss %xmm8, %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 1 * SIZE(AO), %xmm8 mulss 4 * SIZE(BO), %xmm8 addss %xmm9, %xmm0 movss 32 * SIZE(BO), %xmm9 addss %xmm8, %xmm1 movss 2 * SIZE(AO), %xmm8 mulss 8 * SIZE(BO), %xmm8 addss %xmm8, %xmm2 movss 3 * SIZE(AO), %xmm8 mulss 12 * SIZE(BO), %xmm8 addss %xmm8, %xmm3 movss 8 * SIZE(AO), %xmm8 mulss %xmm10, %xmm11 movss 5 * SIZE(AO), %xmm10 mulss 20 * SIZE(BO), %xmm10 addss %xmm11, %xmm0 movss 48 * SIZE(BO), %xmm11 addss %xmm10, %xmm1 movss 6 * SIZE(AO), %xmm10 mulss 24 * SIZE(BO), %xmm10 addss %xmm10, %xmm2 movss 7 * SIZE(AO), %xmm10 mulss 28 * SIZE(BO), %xmm10 addss %xmm10, %xmm3 movss 12 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L142 ALIGN_4 .L145: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L148 ALIGN_4 .L146: mulss %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addss %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 addq $1 * SIZE, AO addq $4 * SIZE, BO decq %rax jg .L146 ALIGN_4 .L148: addss %xmm1, %xmm0 addss %xmm3, %xmm2 addss %xmm2, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movss 0 * SIZE(B), %xmm1 subss %xmm0, %xmm1 #else movss 0 * SIZE(AO), %xmm8 subps %xmm0, %xmm8 #endif #if defined(LN) || defined(LT) mulss 0 * SIZE(AO), %xmm1 #endif #if defined(RN) || defined(RT) mulss 0 * SIZE(B), %xmm8 #endif #ifdef LN subq $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 movaps %xmm2, 0 * SIZE(BO) #else movss %xmm8, 0 * SIZE(AO) #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(CO1) #else movss %xmm8, 0 * SIZE(CO1) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $1 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L120: testq $2, M je .L130 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 8 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L135 ALIGN_4 .L132: mulps %xmm8, %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 6 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 12 * SIZE(BO), 
%xmm9 mulps %xmm8, %xmm9 movsd 16 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 32 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 movsd 10 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd 12 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd 14 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movaps 28 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd 24 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movaps 48 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L132 ALIGN_4 .L135: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L138 ALIGN_4 .L136: mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L136 ALIGN_4 .L138: addps %xmm1, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) unpcklps %xmm2, %xmm0 unpcklps %xmm3, %xmm1 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movss 0 * SIZE(B), %xmm1 movss 1 * SIZE(B), %xmm5 subss %xmm0, %xmm1 subss %xmm2, %xmm5 #else #ifdef movsd xorps %xmm8, %xmm8 #endif movsd 0 * SIZE(AO), %xmm8 subps %xmm0, %xmm8 #endif #ifdef LN movaps 0 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm1 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm5 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm5 #endif #if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) movss %xmm5, 1 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 movaps %xmm2, 0 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 movaps %xmm2, 4 * SIZE(BO) #else movlps %xmm8, 0 * SIZE(AO) #endif #if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 unpcklps %xmm5, %xmm1 movlps %xmm1, 0 * SIZE(CO1) #else movlps %xmm8, 0 * SIZE(CO1) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L130: testq $4, M je .L140 #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L125 ALIGN_4 .L122: mulps %xmm8, %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(AO), 
%xmm8 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 32 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm8 mulps 8 * SIZE(BO), %xmm8 addps %xmm8, %xmm2 movaps 12 * SIZE(AO), %xmm8 mulps 12 * SIZE(BO), %xmm8 addps %xmm8, %xmm3 movaps 32 * SIZE(AO), %xmm8 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm11 movaps 20 * SIZE(AO), %xmm10 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 48 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps 24 * SIZE(AO), %xmm10 mulps 24 * SIZE(BO), %xmm10 addps %xmm10, %xmm2 movaps 28 * SIZE(AO), %xmm10 mulps 28 * SIZE(BO), %xmm10 addps %xmm10, %xmm3 movaps 48 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L122 ALIGN_4 .L125: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L128 ALIGN_4 .L126: mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L126 ALIGN_4 .L128: addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm2, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $BASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm8 unpcklps %xmm2, %xmm0 unpckhps %xmm2, %xmm8 movaps %xmm1, %xmm14 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm14 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movaps %xmm8, %xmm3 unpcklps %xmm14, %xmm8 unpckhps %xmm14, %xmm3 movss 0 * SIZE(B), %xmm1 movss 1 * SIZE(B), %xmm5 movss 2 * SIZE(B), %xmm10 movss 3 * SIZE(B), %xmm11 subss %xmm0, %xmm1 subss %xmm2, %xmm5 subss %xmm8, %xmm10 subss %xmm3, %xmm11 #else movaps 0 * SIZE(AO), %xmm8 subps %xmm0, %xmm8 #endif #ifdef LN movaps 12 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm1 movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm11 movaps 4 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulss %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm11 movaps 8 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm11 #endif #if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) 
movss %xmm5, 1 * SIZE(B) movss %xmm10, 2 * SIZE(B) movss %xmm11, 3 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 movaps %xmm2, 0 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 movaps %xmm2, 4 * SIZE(BO) pshufd $0x00, %xmm10, %xmm2 movaps %xmm2, 8 * SIZE(BO) pshufd $0x00, %xmm11, %xmm2 movaps %xmm2, 12 * SIZE(BO) #else movaps %xmm8, 0 * SIZE(AO) #endif #if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 unpcklps %xmm5, %xmm1 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) #else movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L140: movq M, I sarq $3, I # i = (m >> 3) jle .L149 ALIGN_4 .L111: #ifdef LN movq K, %rax salq $3 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 32 * SIZE(AO), %xmm12 movaps 48 * SIZE(AO), %xmm14 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 PREFETCHW -8 * SIZE(CO1) pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L115 ALIGN_4 .L112: mulps %xmm9, %xmm8 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 mulps %xmm9, %xmm8 mulps 12 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps 64 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm9, %xmm10 mulps 20 * SIZE(AO), %xmm9 addps %xmm10, %xmm0 movaps 24 * SIZE(AO), %xmm10 addps %xmm9, %xmm4 movaps 12 * SIZE(BO), %xmm9 mulps %xmm9, %xmm10 mulps 28 * SIZE(AO), %xmm9 addps %xmm10, %xmm0 movaps 80 * SIZE(AO), %xmm10 addps %xmm9, %xmm4 movaps 32 * SIZE(BO), %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm11, %xmm12 mulps 36 * SIZE(AO), %xmm11 addps %xmm12, %xmm0 movaps 40 * SIZE(AO), %xmm12 addps %xmm11, %xmm4 movaps 20 * SIZE(BO), %xmm11 mulps %xmm11, %xmm12 mulps 44 * SIZE(AO), %xmm11 addps %xmm12, %xmm0 movaps 96 * SIZE(AO), %xmm12 addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm11, %xmm14 mulps 52 * SIZE(AO), %xmm11 addps %xmm14, %xmm0 movaps 56 * SIZE(AO), %xmm14 addps %xmm11, %xmm4 movaps 28 * SIZE(BO), %xmm11 mulps %xmm11, %xmm14 mulps 60 * SIZE(AO), %xmm11 addps %xmm14, %xmm0 movaps 112 * SIZE(AO), %xmm14 addps %xmm11, %xmm4 movaps 48 * SIZE(BO), %xmm11 addq $64 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L112 ALIGN_4 .L115: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulps %xmm9, %xmm8 mulps 4 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps 8 * 
SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 addq $8 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L116 ALIGN_4 .L118: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $8, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $BASE_SHIFT, %rax leaq (AO, %rax, 8), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm8 unpcklps %xmm2, %xmm0 unpckhps %xmm2, %xmm8 movaps %xmm1, %xmm14 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm14 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movaps %xmm8, %xmm3 unpcklps %xmm14, %xmm8 unpckhps %xmm14, %xmm3 movaps %xmm4, %xmm9 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm9 movaps %xmm5, %xmm14 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm14 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm9, %xmm7 unpcklps %xmm14, %xmm9 unpckhps %xmm14, %xmm7 movss 0 * SIZE(B), %xmm1 movss 1 * SIZE(B), %xmm5 movss 2 * SIZE(B), %xmm10 movss 3 * SIZE(B), %xmm11 movss 4 * SIZE(B), %xmm12 movss 5 * SIZE(B), %xmm13 movss 6 * SIZE(B), %xmm14 movss 7 * SIZE(B), %xmm15 subss %xmm0, %xmm1 subss %xmm2, %xmm5 subss %xmm8, %xmm10 subss %xmm3, %xmm11 subss %xmm4, %xmm12 subss %xmm6, %xmm13 subss %xmm9, %xmm14 subss %xmm7, %xmm15 #else movaps 0 * SIZE(AO), %xmm8 movaps 4 * SIZE(AO), %xmm9 subps %xmm0, %xmm8 subps %xmm4, %xmm9 #endif #ifdef LN movaps 60 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm15 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm15, %xmm8 subss %xmm8, %xmm14 pshufd $0x55, %xmm6, %xmm8 mulss %xmm15, %xmm8 subss %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulss %xmm15, %xmm8 subss %xmm8, %xmm12 movaps 56 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm15, %xmm8 subss %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm15, %xmm8 subss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm15, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm15, %xmm8 subss %xmm8, %xmm1 movaps 52 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm8, %xmm14 pshufd $0x55, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm12 movaps 48 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm1 movaps 44 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulss %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm12 movaps 40 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm1 movaps 36 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm12 movaps 32 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm1 movaps 24 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss 
%xmm11, %xmm8 subss %xmm8, %xmm1 movaps 16 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm1 movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm11 movaps 4 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm15 movaps 8 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulss %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm15 movaps 16 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm11 movaps 20 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm15 movaps 24 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm11 movaps 28 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm15 movaps 36 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm12 pshufd $0x55, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm13 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm15 movaps 44 * SIZE(AO), %xmm7 pshufd $0x55, %xmm7, %xmm8 mulss %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm15 movaps 52 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm8, %xmm14 pshufd $0xff, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm15 movaps 60 * SIZE(AO), %xmm7 pshufd $0xff, %xmm7, %xmm8 mulss %xmm8, %xmm15 #endif #if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 mulps %xmm2, %xmm9 #endif #ifdef LN subq $8 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) movss %xmm5, 1 * SIZE(B) movss %xmm10, 2 * SIZE(B) movss %xmm11, 3 * SIZE(B) movss %xmm12, 4 * SIZE(B) movss %xmm13, 5 * SIZE(B) movss %xmm14, 6 * SIZE(B) movss %xmm15, 7 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 movaps %xmm2, 0 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 movaps %xmm2, 4 * SIZE(BO) 
pshufd $0x00, %xmm10, %xmm2 movaps %xmm2, 8 * SIZE(BO) pshufd $0x00, %xmm11, %xmm2 movaps %xmm2, 12 * SIZE(BO) pshufd $0x00, %xmm12, %xmm2 movaps %xmm2, 16 * SIZE(BO) pshufd $0x00, %xmm13, %xmm2 movaps %xmm2, 20 * SIZE(BO) pshufd $0x00, %xmm14, %xmm2 movaps %xmm2, 24 * SIZE(BO) pshufd $0x00, %xmm15, %xmm2 movaps %xmm2, 28 * SIZE(BO) #else movaps %xmm8, 0 * SIZE(AO) movaps %xmm9, 4 * SIZE(AO) #endif #if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 unpcklps %xmm5, %xmm1 unpcklps %xmm14, %xmm12 unpcklps %xmm15, %xmm13 unpcklps %xmm13, %xmm12 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) #else movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm9, 4 * SIZE(CO1) movhps %xmm9, 6 * SIZE(CO1) #endif #ifndef LN addq $8 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 8), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $8, KK movq BORIG, B #endif #ifdef LT addq $8, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $3 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L111 ALIGN_4 .L149: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 1), B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq %rbx, %rsp EMMS movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_LT_2x8_nehalem.S000066400000000000000000001551741313527062700226410ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define KK %rdx #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 512 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCHSIZE (8 * 1 - 4) #define PREFETCH prefetcht0 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif subq $-16 * SIZE, A subq $-16 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K movq OLD_LDC, LDC movq OLD_OFFSET, KK leaq (, LDC, SIZE), LDC movq KK, OFFSET negq KK #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $3, J NOBRANCH jle .L30 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $3 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 8), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 4), CO2 #ifndef RT leaq (C, LDC, 8), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 3, %rax leaq (B, %rax), BB movq M, I sarq $1, I NOBRANCH jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 8), BO #else movq B, BO #endif prefetcht0 -16 * 
SIZE(BB) subq $-8 * SIZE, BB xorps %xmm1, %xmm1 movapd -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 leaq (LDC, LDC, 2), %rax xorps %xmm8, %xmm8 prefetcht0 1 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 2 * SIZE(CO1, LDC, 1) xorps %xmm10, %xmm10 prefetcht0 1 * SIZE(CO1, LDC, 2) xorps %xmm11, %xmm11 prefetcht0 2 * SIZE(CO1, %rax, 1) xorps %xmm12, %xmm12 prefetcht0 1 * SIZE(CO2) xorps %xmm13, %xmm13 prefetcht0 2 * SIZE(CO2, LDC, 1) xorps %xmm14, %xmm14 prefetcht0 1 * SIZE(CO2, LDC, 2) xorps %xmm15, %xmm15 prefetcht0 2 * SIZE(CO2, %rax, 1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm6 addpd %xmm2, %xmm13 pshufd $0x4e, %xmm6, %xmm2 mulpd %xmm0, %xmm6 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm14 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm6, %xmm8 movaps -12 * SIZE(BO), %xmm6 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm6, %xmm2 mulpd %xmm0, %xmm6 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm6, %xmm12 movaps -8 * SIZE(BO), %xmm1 addpd %xmm2, %xmm13 movaps -14 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 mulpd %xmm5, %xmm2 addpd %xmm3, %xmm14 movaps -6 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm5, %xmm3 mulpd %xmm5, %xmm4 addpd %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 mulpd %xmm5, %xmm2 addpd %xmm3, %xmm10 movaps -2 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm5, %xmm3 mulpd %xmm5, %xmm4 addpd %xmm1, %xmm12 movaps 0 * SIZE(BO), %xmm6 addpd %xmm2, %xmm13 pshufd $0x4e, %xmm6, %xmm2 mulpd %xmm0, %xmm6 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm14 movaps 2 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm6, %xmm8 movaps 4 * SIZE(BO), %xmm6 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm6, %xmm2 mulpd %xmm0, %xmm6 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps 6 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm6, %xmm12 movaps 8 * SIZE(BO), %xmm1 addpd %xmm2, %xmm13 movaps -10 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 mulpd %xmm5, %xmm2 addpd %xmm3, %xmm14 movaps 10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm5, %xmm3 mulpd %xmm5, %xmm4 addpd %xmm1, %xmm8 movaps 12 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 mulpd %xmm5, %xmm2 addpd %xmm3, %xmm10 movaps 14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm5, %xmm3 mulpd %xmm5, %xmm4 addq $32 * SIZE, BO subq $-8 * SIZE, AO decq %rax BRANCH jg .L12 ALIGN_3 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addpd %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm13 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm14 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, 
%xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $8, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 8), BO #endif addpd %xmm1, %xmm12 addpd %xmm2, %xmm13 addpd %xmm3, %xmm14 addpd %xmm4, %xmm15 #if defined(LN) || defined(LT) movaps %xmm8, %xmm0 shufpd $0, %xmm9, %xmm8 shufpd $3, %xmm0, %xmm9 movaps %xmm10, %xmm0 shufpd $0, %xmm11, %xmm10 shufpd $3, %xmm0, %xmm11 movaps %xmm12, %xmm0 shufpd $0, %xmm13, %xmm12 shufpd $3, %xmm0, %xmm13 movaps %xmm14, %xmm0 shufpd $0, %xmm15, %xmm14 shufpd $3, %xmm0, %xmm15 movapd -16 * SIZE(BO), %xmm0 movapd -14 * SIZE(BO), %xmm2 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm6 movapd -8 * SIZE(BO), %xmm1 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm5 movapd -2 * SIZE(BO), %xmm7 #else movaps %xmm8, %xmm0 shufpd $2, %xmm9, %xmm8 shufpd $2, %xmm0, %xmm9 movaps %xmm10, %xmm0 shufpd $2, %xmm11, %xmm10 shufpd $2, %xmm0, %xmm11 movaps %xmm12, %xmm0 shufpd $2, %xmm13, %xmm12 shufpd $2, %xmm0, %xmm13 movaps %xmm14, %xmm0 shufpd $2, %xmm15, %xmm14 shufpd $2, %xmm0, %xmm15 movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 movapd -8 * SIZE(AO), %xmm4 movapd -6 * SIZE(AO), %xmm5 movapd -4 * SIZE(AO), %xmm6 movapd -2 * SIZE(AO), %xmm7 #endif subpd %xmm8, %xmm0 subpd %xmm9, %xmm1 subpd %xmm10, %xmm2 subpd %xmm11, %xmm3 subpd %xmm12, %xmm4 subpd %xmm13, %xmm5 subpd %xmm14, %xmm6 subpd %xmm15, %xmm7 #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 movddup -14 * SIZE(AO), %xmm12 movapd %xmm12, %xmm13 movapd %xmm12, %xmm14 movapd %xmm12, %xmm15 mulpd %xmm1, %xmm12 mulpd %xmm3, %xmm13 mulpd %xmm5, %xmm14 mulpd %xmm7, %xmm15 subpd %xmm12, %xmm0 subpd %xmm13, %xmm2 subpd %xmm14, %xmm4 subpd %xmm15, %xmm6 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm6 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm6 movddup -15 * SIZE(AO), %xmm12 movapd %xmm12, %xmm13 movapd %xmm12, %xmm14 movapd %xmm12, %xmm15 mulpd %xmm0, %xmm12 mulpd %xmm2, %xmm13 mulpd %xmm4, %xmm14 mulpd %xmm6, %xmm15 subpd %xmm12, %xmm1 subpd %xmm13, %xmm3 subpd %xmm14, %xmm5 subpd %xmm15, %xmm7 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm1 movddup -14 * SIZE(BO), %xmm10 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm2 movddup -13 * SIZE(BO), %xmm11 mulpd %xmm0, %xmm11 subpd %xmm11, %xmm3 movddup -12 * SIZE(BO), %xmm12 mulpd %xmm0, %xmm12 subpd %xmm12, %xmm4 movddup -11 * SIZE(BO), %xmm13 mulpd %xmm0, %xmm13 subpd %xmm13, %xmm5 movddup -10 * SIZE(BO), %xmm14 mulpd %xmm0, %xmm14 subpd %xmm14, %xmm6 movddup -9 * SIZE(BO), %xmm15 mulpd %xmm0, %xmm15 subpd %xmm15, %xmm7 movddup -7 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm1 movddup -6 * SIZE(BO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm2 movddup -5 * SIZE(BO), %xmm11 mulpd %xmm1, %xmm11 subpd %xmm11, %xmm3 movddup -4 * SIZE(BO), %xmm12 mulpd %xmm1, %xmm12 subpd 
%xmm12, %xmm4 movddup -3 * SIZE(BO), %xmm13 mulpd %xmm1, %xmm13 subpd %xmm13, %xmm5 movddup -2 * SIZE(BO), %xmm14 mulpd %xmm1, %xmm14 subpd %xmm14, %xmm6 movddup -1 * SIZE(BO), %xmm15 mulpd %xmm1, %xmm15 subpd %xmm15, %xmm7 movddup 2 * SIZE(BO), %xmm10 mulpd %xmm10, %xmm2 movddup 3 * SIZE(BO), %xmm11 mulpd %xmm2, %xmm11 subpd %xmm11, %xmm3 movddup 4 * SIZE(BO), %xmm12 mulpd %xmm2, %xmm12 subpd %xmm12, %xmm4 movddup 5 * SIZE(BO), %xmm13 mulpd %xmm2, %xmm13 subpd %xmm13, %xmm5 movddup 6 * SIZE(BO), %xmm14 mulpd %xmm2, %xmm14 subpd %xmm14, %xmm6 movddup 7 * SIZE(BO), %xmm15 mulpd %xmm2, %xmm15 subpd %xmm15, %xmm7 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm12 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm4 movddup 13 * SIZE(BO), %xmm13 mulpd %xmm3, %xmm13 subpd %xmm13, %xmm5 movddup 14 * SIZE(BO), %xmm14 mulpd %xmm3, %xmm14 subpd %xmm14, %xmm6 movddup 15 * SIZE(BO), %xmm15 mulpd %xmm3, %xmm15 subpd %xmm15, %xmm7 movddup 20 * SIZE(BO), %xmm12 mulpd %xmm12, %xmm4 movddup 21 * SIZE(BO), %xmm13 mulpd %xmm4, %xmm13 subpd %xmm13, %xmm5 movddup 22 * SIZE(BO), %xmm14 mulpd %xmm4, %xmm14 subpd %xmm14, %xmm6 movddup 23 * SIZE(BO), %xmm15 mulpd %xmm4, %xmm15 subpd %xmm15, %xmm7 movddup 29 * SIZE(BO), %xmm13 mulpd %xmm13, %xmm5 movddup 30 * SIZE(BO), %xmm14 mulpd %xmm5, %xmm14 subpd %xmm14, %xmm6 movddup 31 * SIZE(BO), %xmm15 mulpd %xmm5, %xmm15 subpd %xmm15, %xmm7 movddup 38 * SIZE(BO), %xmm14 mulpd %xmm14, %xmm6 movddup 39 * SIZE(BO), %xmm15 mulpd %xmm6, %xmm15 subpd %xmm15, %xmm7 movddup 47 * SIZE(BO), %xmm15 mulpd %xmm15, %xmm7 #endif #ifdef RT movddup 47 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm7 movddup 46 * SIZE(BO), %xmm9 mulpd %xmm7, %xmm9 subpd %xmm9, %xmm6 movddup 45 * SIZE(BO), %xmm10 mulpd %xmm7, %xmm10 subpd %xmm10, %xmm5 movddup 44 * SIZE(BO), %xmm11 mulpd %xmm7, %xmm11 subpd %xmm11, %xmm4 movddup 43 * SIZE(BO), %xmm12 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm3 movddup 42 * SIZE(BO), %xmm13 mulpd %xmm7, %xmm13 subpd %xmm13, %xmm2 movddup 41 * SIZE(BO), %xmm14 mulpd %xmm7, %xmm14 subpd %xmm14, %xmm1 movddup 40 * SIZE(BO), %xmm15 mulpd %xmm7, %xmm15 subpd %xmm15, %xmm0 movddup 38 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm6 movddup 37 * SIZE(BO), %xmm10 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm5 movddup 36 * SIZE(BO), %xmm11 mulpd %xmm6, %xmm11 subpd %xmm11, %xmm4 movddup 35 * SIZE(BO), %xmm12 mulpd %xmm6, %xmm12 subpd %xmm12, %xmm3 movddup 34 * SIZE(BO), %xmm13 mulpd %xmm6, %xmm13 subpd %xmm13, %xmm2 movddup 33 * SIZE(BO), %xmm14 mulpd %xmm6, %xmm14 subpd %xmm14, %xmm1 movddup 32 * SIZE(BO), %xmm15 mulpd %xmm6, %xmm15 subpd %xmm15, %xmm0 movddup 29 * SIZE(BO), %xmm10 mulpd %xmm10, %xmm5 movddup 28 * SIZE(BO), %xmm11 mulpd %xmm5, %xmm11 subpd %xmm11, %xmm4 movddup 27 * SIZE(BO), %xmm12 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm3 movddup 26 * SIZE(BO), %xmm13 mulpd %xmm5, %xmm13 subpd %xmm13, %xmm2 movddup 25 * SIZE(BO), %xmm14 mulpd %xmm5, %xmm14 subpd %xmm14, %xmm1 movddup 24 * SIZE(BO), %xmm15 mulpd %xmm5, %xmm15 subpd %xmm15, %xmm0 movddup 20 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm4 movddup 19 * SIZE(BO), %xmm12 mulpd %xmm4, %xmm12 subpd %xmm12, %xmm3 movddup 18 * SIZE(BO), %xmm13 mulpd %xmm4, %xmm13 subpd %xmm13, %xmm2 movddup 17 * SIZE(BO), %xmm14 mulpd %xmm4, %xmm14 subpd %xmm14, %xmm1 movddup 16 * SIZE(BO), %xmm15 mulpd %xmm4, %xmm15 subpd %xmm15, %xmm0 movddup 11 * SIZE(BO), %xmm12 mulpd %xmm12, %xmm3 movddup 10 * SIZE(BO), %xmm13 mulpd %xmm3, %xmm13 subpd %xmm13, %xmm2 movddup 9 * SIZE(BO), %xmm14 mulpd %xmm3, %xmm14 subpd %xmm14, %xmm1 movddup 8 * SIZE(BO), %xmm15 mulpd %xmm3, 
%xmm15 subpd %xmm15, %xmm0 movddup 2 * SIZE(BO), %xmm13 mulpd %xmm13, %xmm2 movddup 1 * SIZE(BO), %xmm14 mulpd %xmm2, %xmm14 subpd %xmm14, %xmm1 movddup 0 * SIZE(BO), %xmm15 mulpd %xmm2, %xmm15 subpd %xmm15, %xmm0 movddup -7 * SIZE(BO), %xmm14 mulpd %xmm14, %xmm1 movddup -8 * SIZE(BO), %xmm15 mulpd %xmm1, %xmm15 subpd %xmm15, %xmm0 movddup -16 * SIZE(BO), %xmm15 mulpd %xmm15, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(BO) movapd %xmm2, -14 * SIZE(BO) movapd %xmm4, -12 * SIZE(BO) movapd %xmm6, -10 * SIZE(BO) movapd %xmm1, -8 * SIZE(BO) movapd %xmm3, -6 * SIZE(BO) movapd %xmm5, -4 * SIZE(BO) movapd %xmm7, -2 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm1, -14 * SIZE(AO) movapd %xmm2, -12 * SIZE(AO) movapd %xmm3, -10 * SIZE(AO) movapd %xmm4, -8 * SIZE(AO) movapd %xmm5 , -6 * SIZE(AO) movapd %xmm6, -4 * SIZE(AO) movapd %xmm7, -2 * SIZE(AO) #endif leaq (LDC, LDC, 2), %rax #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 1 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 1 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movsd %xmm3, 1 * SIZE(CO1, LDC, 2) movhps %xmm2, 0 * SIZE(CO1, %rax, 1) movhps %xmm3, 1 * SIZE(CO1, %rax, 1) movsd %xmm4, 0 * SIZE(CO2) movsd %xmm5, 1 * SIZE(CO2) movhps %xmm4, 0 * SIZE(CO2, LDC, 1) movhps %xmm5, 1 * SIZE(CO2, LDC, 1) movsd %xmm6, 0 * SIZE(CO2, LDC, 2) movsd %xmm7, 1 * SIZE(CO2, LDC, 2) movhps %xmm6, 0 * SIZE(CO2, %rax, 1) movhps %xmm7, 1 * SIZE(CO2, %rax, 1) #else movups %xmm0, 0 * SIZE(CO1) movups %xmm1, 0 * SIZE(CO1, LDC, 1) movups %xmm2, 0 * SIZE(CO1, LDC, 2) movups %xmm3, 0 * SIZE(CO1, %rax, 1) movups %xmm4, 0 * SIZE(CO2) movups %xmm5, 0 * SIZE(CO2, LDC, 1) movups %xmm6, 0 * SIZE(CO2, LDC, 2) movups %xmm7, 0 * SIZE(CO2, %rax, 1) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L11 ALIGN_4 .L20: testq $1, M BRANCH jle .L29 ALIGN_4 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 8), BO #else movq B, BO #endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -16 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_3 .L22: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps -8 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -6 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps -4 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -2 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps 0 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps 2 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps 4 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps 6 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -13 * 
SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps 8 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps 10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps 12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps 14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps 16 * SIZE(BO), %xmm1 subq $ -4 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax BRANCH jg .L22 ALIGN_3 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps -8 * SIZE(BO), %xmm1 addq $1 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $8, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 8), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm0 movapd -14 * SIZE(BO), %xmm1 movapd -12 * SIZE(BO), %xmm2 movapd -10 * SIZE(BO), %xmm3 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 #endif subpd %xmm8, %xmm0 subpd %xmm9, %xmm1 subpd %xmm10, %xmm2 subpd %xmm11, %xmm3 #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 #endif #if defined(RN) || defined(RT) pshufd $0xe, %xmm3, %xmm7 movaps %xmm3, %xmm6 pshufd $0xe, %xmm2, %xmm5 movaps %xmm2, %xmm4 pshufd $0xe, %xmm1, %xmm3 movaps %xmm1, %xmm2 pshufd $0xe, %xmm0, %xmm1 #endif #ifdef RN movsd -16 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 movsd -15 * SIZE(BO), %xmm9 mulsd %xmm0, %xmm9 subsd %xmm9, %xmm1 movsd -14 * SIZE(BO), %xmm10 mulsd %xmm0, %xmm10 subsd %xmm10, %xmm2 movsd -13 * SIZE(BO), %xmm11 mulsd %xmm0, %xmm11 subsd %xmm11, %xmm3 movsd -12 * SIZE(BO), %xmm12 mulsd %xmm0, %xmm12 subsd %xmm12, %xmm4 movsd -11 * SIZE(BO), %xmm13 mulsd %xmm0, %xmm13 subsd %xmm13, %xmm5 movsd -10 * SIZE(BO), %xmm14 mulsd %xmm0, %xmm14 subsd %xmm14, %xmm6 movsd -9 * SIZE(BO), %xmm15 mulsd %xmm0, %xmm15 subsd %xmm15, %xmm7 movsd -7 * SIZE(BO), %xmm9 mulsd %xmm9, %xmm1 movsd -6 * SIZE(BO), %xmm10 mulsd %xmm1, %xmm10 subsd %xmm10, %xmm2 movsd -5 * SIZE(BO), %xmm11 mulsd %xmm1, %xmm11 subsd %xmm11, %xmm3 movsd -4 * SIZE(BO), %xmm12 mulsd %xmm1, %xmm12 subsd %xmm12, %xmm4 movsd -3 * SIZE(BO), %xmm13 mulsd %xmm1, %xmm13 subsd %xmm13, %xmm5 movsd -2 * SIZE(BO), %xmm14 mulsd %xmm1, %xmm14 subsd %xmm14, %xmm6 movsd -1 * SIZE(BO), %xmm15 mulsd %xmm1, %xmm15 subsd %xmm15, %xmm7 movsd 2 * SIZE(BO), %xmm10 mulsd %xmm10, %xmm2 movsd 3 * SIZE(BO), %xmm11 mulsd %xmm2, %xmm11 subsd %xmm11, %xmm3 movsd 4 * SIZE(BO), %xmm12 mulsd %xmm2, %xmm12 subsd %xmm12, %xmm4 movsd 5 * SIZE(BO), %xmm13 mulsd %xmm2, %xmm13 subsd %xmm13, %xmm5 movsd 6 * SIZE(BO), %xmm14 mulsd %xmm2, %xmm14 subsd %xmm14, %xmm6 movsd 7 * SIZE(BO), %xmm15 mulsd %xmm2, %xmm15 subsd %xmm15, %xmm7 movsd 11 * SIZE(BO), %xmm11 mulsd %xmm11, %xmm3 movsd 12 * SIZE(BO), %xmm12 mulsd %xmm3, %xmm12 subsd %xmm12, %xmm4 movsd 13 * SIZE(BO), %xmm13 mulsd %xmm3, %xmm13 subsd %xmm13, %xmm5 movsd 14 * SIZE(BO), %xmm14 mulsd %xmm3, %xmm14 subsd %xmm14, %xmm6 movsd 15 * SIZE(BO), %xmm15 mulsd %xmm3, %xmm15 subsd %xmm15, %xmm7 movsd 20 
* SIZE(BO), %xmm12 mulsd %xmm12, %xmm4 movsd 21 * SIZE(BO), %xmm13 mulsd %xmm4, %xmm13 subsd %xmm13, %xmm5 movsd 22 * SIZE(BO), %xmm14 mulsd %xmm4, %xmm14 subsd %xmm14, %xmm6 movsd 23 * SIZE(BO), %xmm15 mulsd %xmm4, %xmm15 subsd %xmm15, %xmm7 movsd 29 * SIZE(BO), %xmm13 mulsd %xmm13, %xmm5 movsd 30 * SIZE(BO), %xmm14 mulsd %xmm5, %xmm14 subsd %xmm14, %xmm6 movsd 31 * SIZE(BO), %xmm15 mulsd %xmm5, %xmm15 subsd %xmm15, %xmm7 movsd 38 * SIZE(BO), %xmm14 mulsd %xmm14, %xmm6 movsd 39 * SIZE(BO), %xmm15 mulsd %xmm6, %xmm15 subsd %xmm15, %xmm7 movsd 47 * SIZE(BO), %xmm15 mulsd %xmm15, %xmm7 #endif #ifdef RT movsd 47 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm7 movsd 46 * SIZE(BO), %xmm9 mulsd %xmm7, %xmm9 subsd %xmm9, %xmm6 movsd 45 * SIZE(BO), %xmm10 mulsd %xmm7, %xmm10 subsd %xmm10, %xmm5 movsd 44 * SIZE(BO), %xmm11 mulsd %xmm7, %xmm11 subsd %xmm11, %xmm4 movsd 43 * SIZE(BO), %xmm12 mulsd %xmm7, %xmm12 subsd %xmm12, %xmm3 movsd 42 * SIZE(BO), %xmm13 mulsd %xmm7, %xmm13 subsd %xmm13, %xmm2 movsd 41 * SIZE(BO), %xmm14 mulsd %xmm7, %xmm14 subsd %xmm14, %xmm1 movsd 40 * SIZE(BO), %xmm15 mulsd %xmm7, %xmm15 subsd %xmm15, %xmm0 movsd 38 * SIZE(BO), %xmm9 mulsd %xmm9, %xmm6 movsd 37 * SIZE(BO), %xmm10 mulsd %xmm6, %xmm10 subsd %xmm10, %xmm5 movsd 36 * SIZE(BO), %xmm11 mulsd %xmm6, %xmm11 subsd %xmm11, %xmm4 movsd 35 * SIZE(BO), %xmm12 mulsd %xmm6, %xmm12 subsd %xmm12, %xmm3 movsd 34 * SIZE(BO), %xmm13 mulsd %xmm6, %xmm13 subsd %xmm13, %xmm2 movsd 33 * SIZE(BO), %xmm14 mulsd %xmm6, %xmm14 subsd %xmm14, %xmm1 movsd 32 * SIZE(BO), %xmm15 mulsd %xmm6, %xmm15 subsd %xmm15, %xmm0 movsd 29 * SIZE(BO), %xmm10 mulsd %xmm10, %xmm5 movsd 28 * SIZE(BO), %xmm11 mulsd %xmm5, %xmm11 subsd %xmm11, %xmm4 movsd 27 * SIZE(BO), %xmm12 mulsd %xmm5, %xmm12 subsd %xmm12, %xmm3 movsd 26 * SIZE(BO), %xmm13 mulsd %xmm5, %xmm13 subsd %xmm13, %xmm2 movsd 25 * SIZE(BO), %xmm14 mulsd %xmm5, %xmm14 subsd %xmm14, %xmm1 movsd 24 * SIZE(BO), %xmm15 mulsd %xmm5, %xmm15 subsd %xmm15, %xmm0 movsd 20 * SIZE(BO), %xmm11 mulsd %xmm11, %xmm4 movsd 19 * SIZE(BO), %xmm12 mulsd %xmm4, %xmm12 subsd %xmm12, %xmm3 movsd 18 * SIZE(BO), %xmm13 mulsd %xmm4, %xmm13 subsd %xmm13, %xmm2 movsd 17 * SIZE(BO), %xmm14 mulsd %xmm4, %xmm14 subsd %xmm14, %xmm1 movsd 16 * SIZE(BO), %xmm15 mulsd %xmm4, %xmm15 subsd %xmm15, %xmm0 movsd 11 * SIZE(BO), %xmm12 mulsd %xmm12, %xmm3 movsd 10 * SIZE(BO), %xmm13 mulsd %xmm3, %xmm13 subsd %xmm13, %xmm2 movsd 9 * SIZE(BO), %xmm14 mulsd %xmm3, %xmm14 subsd %xmm14, %xmm1 movsd 8 * SIZE(BO), %xmm15 mulsd %xmm3, %xmm15 subsd %xmm15, %xmm0 movsd 2 * SIZE(BO), %xmm13 mulsd %xmm13, %xmm2 movsd 1 * SIZE(BO), %xmm14 mulsd %xmm2, %xmm14 subsd %xmm14, %xmm1 movsd 0 * SIZE(BO), %xmm15 mulsd %xmm2, %xmm15 subsd %xmm15, %xmm0 movsd -7 * SIZE(BO), %xmm14 mulsd %xmm14, %xmm1 movsd -8 * SIZE(BO), %xmm15 mulsd %xmm1, %xmm15 subsd %xmm15, %xmm0 movsd -16 * SIZE(BO), %xmm15 mulsd %xmm15, %xmm0 #endif #if defined(RN) || defined(RT) unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm1 unpcklpd %xmm3, %xmm1 movaps %xmm4, %xmm2 unpcklpd %xmm5, %xmm2 movaps %xmm6, %xmm3 unpcklpd %xmm7, %xmm3 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif leaq (LDC, LDC, 2), %rax movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO1, LDC, 1) movsd %xmm1, 0 * SIZE(CO1, LDC, 2) movhps %xmm1, 0 * SIZE(CO1, %rax, 1) movsd %xmm2, 0 * SIZE(CO2) movhps %xmm2, 0 * SIZE(CO2, LDC, 1) movsd %xmm3, 0 * SIZE(CO2, LDC, 2) movhps %xmm3, 0 * SIZE(CO2, %rax, 1) #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(BO) movapd %xmm1, -14 * SIZE(BO) movapd %xmm2, -12 * SIZE(BO) movapd 
%xmm3, -10 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm1, -14 * SIZE(AO) movapd %xmm2, -12 * SIZE(AO) movapd %xmm3, -10 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L29: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 8), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $8, KK #endif #ifdef RT subq $8, KK #endif subq $1, J BRANCH jg .L01 ALIGN_4 .L30: testq $4, N jle .L50 ALIGN_4 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 2), CO2 #ifndef RT leaq (C, LDC, 4), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $1, I NOBRANCH jle .L40 ALIGN_4 .L31: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht0 2 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 2 * SIZE(CO1, LDC, 1) xorps %xmm10, %xmm10 prefetcht0 2 * SIZE(CO2) xorps %xmm11, %xmm11 prefetcht0 2 * SIZE(CO2, LDC, 1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_3 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -8 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -6 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -10 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -2 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L32 ALIGN_3 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: addpd %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq 
$4 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif addpd %xmm1, %xmm8 addpd %xmm2, %xmm9 addpd %xmm3, %xmm10 addpd %xmm4, %xmm11 #if defined(LN) || defined(LT) movaps %xmm8, %xmm0 shufpd $0, %xmm9, %xmm8 shufpd $3, %xmm0, %xmm9 movaps %xmm10, %xmm0 shufpd $0, %xmm11, %xmm10 shufpd $3, %xmm0, %xmm11 movapd -16 * SIZE(BO), %xmm0 movapd -14 * SIZE(BO), %xmm2 movapd -12 * SIZE(BO), %xmm1 movapd -10 * SIZE(BO), %xmm3 #else movaps %xmm8, %xmm0 shufpd $2, %xmm9, %xmm8 shufpd $2, %xmm0, %xmm9 movaps %xmm10, %xmm0 shufpd $2, %xmm11, %xmm10 shufpd $2, %xmm0, %xmm11 movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 #endif subpd %xmm8, %xmm0 subpd %xmm9, %xmm1 subpd %xmm10, %xmm2 subpd %xmm11, %xmm3 #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 movddup -14 * SIZE(AO), %xmm12 movapd %xmm12, %xmm13 mulpd %xmm1, %xmm12 mulpd %xmm3, %xmm13 subpd %xmm12, %xmm0 subpd %xmm13, %xmm2 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm2 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm2 movddup -15 * SIZE(AO), %xmm12 movapd %xmm12, %xmm13 mulpd %xmm0, %xmm12 mulpd %xmm2, %xmm13 subpd %xmm12, %xmm1 subpd %xmm13, %xmm3 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm1 movddup -14 * SIZE(BO), %xmm10 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm2 movddup -13 * SIZE(BO), %xmm11 mulpd %xmm0, %xmm11 subpd %xmm11, %xmm3 movddup -11 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm1 movddup -10 * SIZE(BO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm2 movddup -9 * SIZE(BO), %xmm11 mulpd %xmm1, %xmm11 subpd %xmm11, %xmm3 movddup -6 * SIZE(BO), %xmm10 mulpd %xmm10, %xmm2 movddup -5 * SIZE(BO), %xmm11 mulpd %xmm2, %xmm11 subpd %xmm11, %xmm3 movddup -1 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm3 #endif #ifdef RT movddup -1 * SIZE(BO), %xmm12 mulpd %xmm12, %xmm3 movddup -2 * SIZE(BO), %xmm13 mulpd %xmm3, %xmm13 subpd %xmm13, %xmm2 movddup -3 * SIZE(BO), %xmm14 mulpd %xmm3, %xmm14 subpd %xmm14, %xmm1 movddup -4 * SIZE(BO), %xmm15 mulpd %xmm3, %xmm15 subpd %xmm15, %xmm0 movddup -6 * SIZE(BO), %xmm13 mulpd %xmm13, %xmm2 movddup -7 * SIZE(BO), %xmm14 mulpd %xmm2, %xmm14 subpd %xmm14, %xmm1 movddup -8 * SIZE(BO), %xmm15 mulpd %xmm2, %xmm15 subpd %xmm15, %xmm0 movddup -11 * SIZE(BO), %xmm14 mulpd %xmm14, %xmm1 movddup -12 * SIZE(BO), %xmm15 mulpd %xmm1, %xmm15 subpd %xmm15, %xmm0 movddup -16 * SIZE(BO), %xmm15 mulpd %xmm15, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif leaq (LDC, LDC, 2), %rax #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 1 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 1 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO2) movsd %xmm3, 1 * SIZE(CO2) movhps %xmm2, 0 * SIZE(CO2, LDC, 1) movhps %xmm3, 1 * SIZE(CO2, LDC, 1) #else movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 1 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO2) movhps %xmm2, 1 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO2, LDC, 1) movhps %xmm3, 1 * SIZE(CO2, LDC, 1) #endif #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(BO) movapd %xmm2, -14 * SIZE(BO) movapd %xmm1, -12 * SIZE(BO) 
movapd %xmm3, -10 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm1, -14 * SIZE(AO) movapd %xmm2, -12 * SIZE(AO) movapd %xmm3, -10 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L31 ALIGN_4 .L40: testq $1, M BRANCH jle .L49 ALIGN_4 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -16 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L45 ALIGN_3 .L42: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps -8 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -6 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -13 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -4 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -2 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps 0 * SIZE(BO), %xmm1 subq $ -4 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L42 ALIGN_3 .L45: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L46 ALIGN_4 .L48: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #endif addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm0 movapd -14 * SIZE(BO), %xmm1 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 #endif subpd %xmm8, %xmm0 subpd %xmm9, %xmm1 #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 #endif #if defined(RN) || defined(RT) pshufd $0xe, %xmm1, %xmm3 movaps %xmm1, %xmm2 pshufd $0xe, %xmm0, %xmm1 #endif #ifdef RN movsd -16 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 movsd -15 * SIZE(BO), %xmm9 mulsd %xmm0, %xmm9 subsd %xmm9, %xmm1 movsd -14 * SIZE(BO), %xmm10 mulsd %xmm0, %xmm10 subsd %xmm10, %xmm2 movsd -13 * SIZE(BO), %xmm11 mulsd %xmm0, %xmm11 subsd %xmm11, %xmm3 movsd -11 * SIZE(BO), %xmm9 mulsd %xmm9, %xmm1 movsd -10 * SIZE(BO), %xmm10 mulsd %xmm1, %xmm10 subsd %xmm10, %xmm2 movsd -9 * SIZE(BO), %xmm11 mulsd %xmm1, %xmm11 subsd %xmm11, %xmm3 movsd -6 * SIZE(BO), %xmm10 mulsd %xmm10, %xmm2 movsd -5 * SIZE(BO), %xmm11 mulsd %xmm2, %xmm11 subsd %xmm11, %xmm3 movsd -1 * SIZE(BO), %xmm11 mulsd %xmm11, %xmm3 #endif #ifdef RT movsd -1 * SIZE(BO), %xmm12 mulsd %xmm12, %xmm3 movsd -2 * SIZE(BO), %xmm13 mulsd %xmm3, %xmm13 subsd %xmm13, 
%xmm2 movsd -3 * SIZE(BO), %xmm14 mulsd %xmm3, %xmm14 subsd %xmm14, %xmm1 movsd -4 * SIZE(BO), %xmm15 mulsd %xmm3, %xmm15 subsd %xmm15, %xmm0 movsd -6 * SIZE(BO), %xmm13 mulsd %xmm13, %xmm2 movsd -7 * SIZE(BO), %xmm14 mulsd %xmm2, %xmm14 subsd %xmm14, %xmm1 movsd -8 * SIZE(BO), %xmm15 mulsd %xmm2, %xmm15 subsd %xmm15, %xmm0 movsd -11 * SIZE(BO), %xmm14 mulsd %xmm14, %xmm1 movsd -12 * SIZE(BO), %xmm15 mulsd %xmm1, %xmm15 subsd %xmm15, %xmm0 movsd -16 * SIZE(BO), %xmm15 mulsd %xmm15, %xmm0 #endif #if defined(RN) || defined(RT) unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm1 unpcklpd %xmm3, %xmm1 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO1, LDC, 1) movsd %xmm1, 0 * SIZE(CO2) movhps %xmm1, 0 * SIZE(CO2, LDC, 1) #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(BO) movapd %xmm1, -14 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm1, -14 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L49: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif ALIGN_4 .L50: testq $2, N jle .L70 ALIGN_4 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $1, I NOBRANCH jle .L60 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm8, %xmm8 prefetcht0 2 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 2 * SIZE(CO2) xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_3 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm10 movaps -14 * SIZE(BO), %xmm1 addpd %xmm2, %xmm11 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AO), %xmm0 addpd %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 addpd %xmm2, %xmm11 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L52 addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 ALIGN_3 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_3 .L56: addpd %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * 
SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_4 .L58: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif addpd %xmm1, %xmm8 addpd %xmm2, %xmm9 #if defined(LN) || defined(LT) movaps %xmm8, %xmm0 shufpd $0, %xmm9, %xmm8 shufpd $3, %xmm0, %xmm9 movapd -16 * SIZE(BO), %xmm0 movapd -14 * SIZE(BO), %xmm1 #else movaps %xmm8, %xmm0 shufpd $2, %xmm9, %xmm8 shufpd $2, %xmm0, %xmm9 movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 #endif subpd %xmm8, %xmm0 subpd %xmm9, %xmm1 #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 movddup -14 * SIZE(AO), %xmm12 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm0 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(AO), %xmm12 mulpd %xmm0, %xmm12 subpd %xmm12, %xmm1 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm10 mulpd %xmm10, %xmm0 movddup -15 * SIZE(BO), %xmm11 mulpd %xmm0, %xmm11 subpd %xmm11, %xmm1 movddup -13 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm1 #endif #ifdef RT movddup -13 * SIZE(BO), %xmm14 mulpd %xmm14, %xmm1 movddup -14 * SIZE(BO), %xmm15 mulpd %xmm1, %xmm15 subpd %xmm15, %xmm0 movddup -16 * SIZE(BO), %xmm15 mulpd %xmm15, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 1 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO2) movhps %xmm1, 1 * SIZE(CO2) #else movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movhps %xmm1, 1 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(BO) movapd %xmm1, -14 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm1, -14 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L51 ALIGN_4 .L60: testq $1, M BRANCH jle .L69 ALIGN_4 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -16 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_3 .L62: mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -13 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -8 * SIZE(BO), %xmm1 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L62 ALIGN_3 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_3 .L66: mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_4 .L68: #if defined(LN) || 
defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif addpd %xmm9, %xmm8 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm0 #else movapd -16 * SIZE(AO), %xmm0 #endif subpd %xmm8, %xmm0 #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 #endif #if defined(RN) || defined(RT) pshufd $0xe, %xmm0, %xmm1 #endif #ifdef RN movsd -16 * SIZE(BO), %xmm10 mulsd %xmm10, %xmm0 movsd -15 * SIZE(BO), %xmm11 mulsd %xmm0, %xmm11 subsd %xmm11, %xmm1 movsd -13 * SIZE(BO), %xmm11 mulsd %xmm11, %xmm1 #endif #ifdef RT movsd -13 * SIZE(BO), %xmm14 mulsd %xmm14, %xmm1 movsd -14 * SIZE(BO), %xmm15 mulsd %xmm1, %xmm15 subsd %xmm15, %xmm0 movsd -16 * SIZE(BO), %xmm15 mulsd %xmm15, %xmm0 #endif #if defined(RN) || defined(RT) unpcklpd %xmm1, %xmm0 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L69: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L70: testq $1, N jle .L999 ALIGN_4 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $1, I NOBRANCH jle .L80 ALIGN_4 .L71: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm8, %xmm8 prefetcht0 2 * SIZE(CO1) xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L75 ALIGN_3 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 movddup -16 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movddup -15 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -10 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movddup -13 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L72 addpd %xmm9, %xmm8 ALIGN_3 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: addpd %xmm1, %xmm8 movddup -16 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L76 ALIGN_4 .L78: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, 
%rax, 1), BO #endif addpd %xmm1, %xmm8 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm0 #else movapd -16 * SIZE(AO), %xmm0 #endif subpd %xmm8, %xmm0 #if defined(LN) || defined(LT) pshufd $0xe, %xmm0, %xmm1 #endif #ifdef LN movsd -13 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm1 movsd -14 * SIZE(AO), %xmm12 mulsd %xmm1, %xmm12 subsd %xmm12, %xmm0 movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 #endif #ifdef LT movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 movsd -15 * SIZE(AO), %xmm12 mulsd %xmm0, %xmm12 subsd %xmm12, %xmm1 movsd -13 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm1 #endif #if defined(LN) || defined(LT) unpcklpd %xmm1, %xmm0 #endif #if defined(RN) || defined(RT) movddup -16 * SIZE(BO), %xmm10 mulpd %xmm10, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm0, -16 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L71 ALIGN_4 .L80: testq $1, M BRANCH jle .L89 ALIGN_4 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movsd -16 * SIZE(AO), %xmm0 movhps -15 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movsd -16 * SIZE(BO), %xmm1 movhps -15 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L85 ALIGN_3 .L82: mulpd %xmm0, %xmm1 movsd -14 * SIZE(AO), %xmm0 movhps -13 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movsd -14 * SIZE(BO), %xmm1 movhps -13 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movsd -12 * SIZE(AO), %xmm0 movhps -11 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movsd -12 * SIZE(BO), %xmm1 movhps -11 * SIZE(BO), %xmm1 subq $-4 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L82 addpd %xmm9, %xmm8 ALIGN_3 .L85: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L88 ALIGN_3 .L86: mulsd %xmm0, %xmm1 movsd -15 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd -15 * SIZE(BO), %xmm1 addq $1 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L86 ALIGN_4 .L88: #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif haddpd %xmm8, %xmm8 #if defined(LN) || defined(LT) movsd -16 * SIZE(BO), %xmm0 #else movsd -16 * SIZE(AO), %xmm0 #endif subsd %xmm8, %xmm0 #if defined(LN) || defined(LT) movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 #endif #if defined(RN) || defined(RT) movsd -16 * SIZE(BO), %xmm10 mulsd %xmm10, %xmm0 #endif #ifdef LN subq $1 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) #if defined(LN) || defined(LT) movsd %xmm0, -16 * SIZE(BO) #else movsd %xmm0, -16 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L89: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 1), B #endif #if defined(LT) 
|| defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$1, KK
#endif

#ifdef RT
	subq	$1, KK
#endif
	ALIGN_4

.L999:
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE
OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_LT_4x2_atom.S000066400000000000000000001103531313527062700221520ustar00rootroot00000000000000/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %r13 #define BO %r14 #define CO1 %r15 #define CO2 %rbx #define KK %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 8 + 3) PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif movq OLD_LDC, LDC movq OLD_OFFSET, KK movq KK, OFFSET leaq (, LDC, SIZE), LDC #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $1, J jle .L40 ALIGN_4 .L10: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 1, %rax leaq (B, %rax), BB #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #else movq B, BO #endif prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd 1 * SIZE(AO), %xmm4 xorps %xmm5, %xmm5 movsd 2 * SIZE(AO), %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movsd 1 * SIZE(BO), %xmm3 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 prefetcht0 3 * SIZE(CO1) xorps %xmm12, %xmm12 xorps %xmm13, %xmm13 prefetcht0 3 * SIZE(CO2) xorps %xmm14, %xmm14 xorps %xmm15, %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L15 ALIGN_4 .L12: addsd %xmm2, %xmm13 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm15 PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO) movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm4, %xmm10 movsd 5 * SIZE(AO), 
%xmm4 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 2 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addsd %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 7 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 8 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm4, %xmm10 movsd 9 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 4 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 10 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 5 * SIZE(BO), %xmm3 addsd %xmm2, %xmm13 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 11 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 12 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm4, %xmm10 movsd 13 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 6 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 14 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 7 * SIZE(BO), %xmm3 addsd %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 15 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 subq $-16 * SIZE, AO addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 0 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addq $ 8 * SIZE, BO addsd %xmm4, %xmm10 movsd 1 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 decq %rax addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 0 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 2 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 1 * SIZE(BO), %xmm3 jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH je .L19 ALIGN_4 .L16: addsd %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 2 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addq $4 * SIZE, AO addq $2 * SIZE, BO decq %rax BRANCH jg .L16 ALIGN_4 .L19: addsd %xmm2, %xmm13 addsd %xmm7, %xmm14 addsd %xmm6, %xmm15 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 movsd 1 * SIZE(BO), %xmm1 movsd 2 * SIZE(BO), %xmm2 movsd 3 * SIZE(BO), %xmm3 movsd 4 * SIZE(BO), %xmm4 movsd 5 * SIZE(BO), %xmm5 movsd 6 * SIZE(BO), %xmm6 movsd 7 * SIZE(BO), %xmm7 subsd %xmm8, %xmm0 subsd %xmm9, %xmm1 subsd %xmm10, %xmm2 subsd %xmm11, %xmm3 subsd %xmm12, %xmm4 subsd %xmm13, %xmm5 subsd %xmm14, %xmm6 subsd %xmm15, %xmm7 #else movsd 0 * SIZE(AO), %xmm0 movsd 1 * SIZE(AO), %xmm2 movsd 2 * SIZE(AO), %xmm4 movsd 3 * SIZE(AO), %xmm6 movsd 4 * SIZE(AO), %xmm1 movsd 5 * SIZE(AO), %xmm3 movsd 6 * SIZE(AO), %xmm5 movsd 7 * 
SIZE(AO), %xmm7 subsd %xmm8, %xmm0 subsd %xmm10, %xmm2 subsd %xmm12, %xmm4 subsd %xmm14, %xmm6 subsd %xmm9, %xmm1 subsd %xmm11, %xmm3 subsd %xmm13, %xmm5 subsd %xmm15, %xmm7 #endif #ifdef LN movsd 15 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm6 movsd 14 * SIZE(AO), %xmm9 mulsd %xmm8, %xmm7 movsd 13 * SIZE(AO), %xmm11 movaps %xmm9, %xmm10 movsd 12 * SIZE(AO), %xmm13 mulsd %xmm6, %xmm9 movsd 10 * SIZE(AO), %xmm8 mulsd %xmm7, %xmm10 subsd %xmm9, %xmm4 movsd 9 * SIZE(AO), %xmm9 subsd %xmm10, %xmm5 movaps %xmm11, %xmm12 mulsd %xmm6, %xmm11 mulsd %xmm7, %xmm12 subsd %xmm11, %xmm2 movsd 8 * SIZE(AO), %xmm11 subsd %xmm12, %xmm3 movaps %xmm13, %xmm14 mulsd %xmm6, %xmm13 mulsd %xmm7, %xmm14 subsd %xmm13, %xmm0 subsd %xmm14, %xmm1 mulsd %xmm8, %xmm4 mulsd %xmm8, %xmm5 movsd 5 * SIZE(AO), %xmm8 movaps %xmm9, %xmm10 mulsd %xmm4, %xmm9 mulsd %xmm5, %xmm10 subsd %xmm9, %xmm2 movsd 4 * SIZE(AO), %xmm9 subsd %xmm10, %xmm3 movaps %xmm11, %xmm12 mulsd %xmm4, %xmm11 mulsd %xmm5, %xmm12 subsd %xmm11, %xmm0 movsd 0 * SIZE(AO), %xmm11 subsd %xmm12, %xmm1 mulsd %xmm8, %xmm2 mulsd %xmm8, %xmm3 movaps %xmm9, %xmm10 mulsd %xmm2, %xmm9 mulsd %xmm3, %xmm10 subsd %xmm9, %xmm0 subsd %xmm10, %xmm1 mulsd %xmm11, %xmm0 mulsd %xmm11, %xmm1 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 movsd 1 * SIZE(AO), %xmm9 mulsd %xmm8, %xmm1 movsd 2 * SIZE(AO), %xmm11 movaps %xmm9, %xmm10 movsd 3 * SIZE(AO), %xmm13 mulsd %xmm0, %xmm9 movsd 5 * SIZE(AO), %xmm8 mulsd %xmm1, %xmm10 subsd %xmm9, %xmm2 movsd 6 * SIZE(AO), %xmm9 subsd %xmm10, %xmm3 movaps %xmm11, %xmm12 mulsd %xmm0, %xmm11 mulsd %xmm1, %xmm12 subsd %xmm11, %xmm4 movsd 7 * SIZE(AO), %xmm11 subsd %xmm12, %xmm5 movaps %xmm13, %xmm14 mulsd %xmm0, %xmm13 mulsd %xmm1, %xmm14 subsd %xmm13, %xmm6 subsd %xmm14, %xmm7 mulsd %xmm8, %xmm2 mulsd %xmm8, %xmm3 movsd 10 * SIZE(AO), %xmm8 movaps %xmm9, %xmm10 mulsd %xmm2, %xmm9 mulsd %xmm3, %xmm10 subsd %xmm9, %xmm4 movsd 11 * SIZE(AO), %xmm9 subsd %xmm10, %xmm5 movaps %xmm11, %xmm12 mulsd %xmm2, %xmm11 mulsd %xmm3, %xmm12 subsd %xmm11, %xmm6 subsd %xmm12, %xmm7 mulsd %xmm8, %xmm4 mulsd %xmm8, %xmm5 movsd 15 * SIZE(AO), %xmm8 movaps %xmm9, %xmm10 mulsd %xmm4, %xmm9 mulsd %xmm5, %xmm10 subsd %xmm9, %xmm6 subsd %xmm10, %xmm7 mulsd %xmm8, %xmm6 mulsd %xmm8, %xmm7 #endif #ifdef RN movsd 0 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 movsd 1 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm2 movsd 3 * SIZE(BO), %xmm13 mulsd %xmm8, %xmm4 mulsd %xmm8, %xmm6 movaps %xmm9, %xmm10 movaps %xmm9, %xmm11 movaps %xmm9, %xmm12 mulsd %xmm0, %xmm9 mulsd %xmm2, %xmm10 mulsd %xmm4, %xmm11 mulsd %xmm6, %xmm12 subsd %xmm9, %xmm1 subsd %xmm10, %xmm3 subsd %xmm11, %xmm5 subsd %xmm12, %xmm7 mulsd %xmm13, %xmm1 mulsd %xmm13, %xmm3 mulsd %xmm13, %xmm5 mulsd %xmm13, %xmm7 #endif #ifdef RT movsd 3 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm1 movsd 2 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm3 movsd 0 * SIZE(BO), %xmm13 mulsd %xmm8, %xmm5 mulsd %xmm8, %xmm7 movaps %xmm9, %xmm10 movaps %xmm9, %xmm11 movaps %xmm9, %xmm12 mulsd %xmm1, %xmm9 mulsd %xmm3, %xmm10 mulsd %xmm5, %xmm11 mulsd %xmm7, %xmm12 subsd %xmm9, %xmm0 subsd %xmm10, %xmm2 subsd %xmm11, %xmm4 subsd %xmm12, %xmm6 mulsd %xmm13, %xmm0 mulsd %xmm13, %xmm2 mulsd %xmm13, %xmm4 mulsd %xmm13, %xmm6 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm2, 1 * SIZE(CO1) movsd %xmm4, 2 * SIZE(CO1) movsd %xmm6, 3 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movsd %xmm3, 1 * SIZE(CO2) movsd %xmm5, 2 * SIZE(CO2) movsd %xmm7, 3 * SIZE(CO2) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BO) movsd %xmm1, 1 * 
SIZE(BO) movsd %xmm2, 2 * SIZE(BO) movsd %xmm3, 3 * SIZE(BO) movsd %xmm4, 4 * SIZE(BO) movsd %xmm5, 5 * SIZE(BO) movsd %xmm6, 6 * SIZE(BO) movsd %xmm7, 7 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) movsd %xmm2, 1 * SIZE(AO) movsd %xmm4, 2 * SIZE(AO) movsd %xmm6, 3 * SIZE(AO) movsd %xmm1, 4 * SIZE(AO) movsd %xmm3, 5 * SIZE(AO) movsd %xmm5, 6 * SIZE(AO) movsd %xmm7, 7 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L11 ALIGN_4 .L20: testq $2, M BRANCH je .L30 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd 1 * SIZE(AO), %xmm4 xorps %xmm5, %xmm5 movsd 2 * SIZE(AO), %xmm5 xorps %xmm6, %xmm6 movsd 3 * SIZE(AO), %xmm7 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movsd 1 * SIZE(BO), %xmm3 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 2 * SIZE(BO), %xmm1 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm2 addsd %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 4 * SIZE(BO), %xmm1 addsd %xmm5, %xmm8 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm2 addsd %xmm7, %xmm10 movsd 7 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm6 movsd 5 * SIZE(BO), %xmm3 addsd %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 6 * SIZE(BO), %xmm1 addsd %xmm0, %xmm8 movsd 8 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm2 addsd %xmm4, %xmm10 movsd 9 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm6 movsd 7 * SIZE(BO), %xmm3 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 8 * SIZE(BO), %xmm1 addsd %xmm5, %xmm8 movsd 10 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm2 addsd %xmm7, %xmm10 movsd 11 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm6 movsd 9 * SIZE(BO), %xmm3 addq $8 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH je .L29 ALIGN_4 .L26: addsd %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 2 * SIZE(BO), %xmm1 mulsd %xmm3, %xmm2 addsd %xmm0, %xmm8 movsd 2 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addsd %xmm4, %xmm10 movsd 3 * SIZE(AO), %xmm4 addq $2 * SIZE, AO addq $2 * SIZE, BO decq %rax BRANCH jg .L26 ALIGN_4 .L29: addsd %xmm2, %xmm9 addsd %xmm6, %xmm11 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 movsd 1 * SIZE(BO), %xmm1 movsd 2 * 
SIZE(BO), %xmm2 movsd 3 * SIZE(BO), %xmm3 subsd %xmm8, %xmm0 subsd %xmm9, %xmm1 subsd %xmm10, %xmm2 subsd %xmm11, %xmm3 #else movsd 0 * SIZE(AO), %xmm0 movsd 1 * SIZE(AO), %xmm2 movsd 2 * SIZE(AO), %xmm1 movsd 3 * SIZE(AO), %xmm3 subsd %xmm8, %xmm0 subsd %xmm10, %xmm2 subsd %xmm9, %xmm1 subsd %xmm11, %xmm3 #endif #ifdef LN movsd 3 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm2 movsd 2 * SIZE(AO), %xmm9 mulsd %xmm8, %xmm3 movsd 0 * SIZE(AO), %xmm13 movaps %xmm9, %xmm10 mulsd %xmm2, %xmm9 mulsd %xmm3, %xmm10 subsd %xmm9, %xmm0 subsd %xmm10, %xmm1 mulsd %xmm13, %xmm0 mulsd %xmm13, %xmm1 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 movsd 1 * SIZE(AO), %xmm9 mulsd %xmm8, %xmm1 movsd 3 * SIZE(AO), %xmm13 movaps %xmm9, %xmm10 mulsd %xmm0, %xmm9 mulsd %xmm1, %xmm10 subsd %xmm9, %xmm2 subsd %xmm10, %xmm3 mulsd %xmm13, %xmm2 mulsd %xmm13, %xmm3 #endif #ifdef RN movsd 0 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 movsd 1 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm2 movsd 3 * SIZE(BO), %xmm13 movaps %xmm9, %xmm10 mulsd %xmm0, %xmm9 mulsd %xmm2, %xmm10 subsd %xmm9, %xmm1 subsd %xmm10, %xmm3 mulsd %xmm13, %xmm1 mulsd %xmm13, %xmm3 #endif #ifdef RT movsd 3 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm1 movsd 2 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm3 movsd 0 * SIZE(BO), %xmm13 movaps %xmm9, %xmm10 mulsd %xmm1, %xmm9 mulsd %xmm3, %xmm10 subsd %xmm9, %xmm0 subsd %xmm10, %xmm2 mulsd %xmm13, %xmm0 mulsd %xmm13, %xmm2 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm2, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movsd %xmm3, 1 * SIZE(CO2) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BO) movsd %xmm1, 1 * SIZE(BO) movsd %xmm2, 2 * SIZE(BO) movsd %xmm3, 3 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) movsd %xmm2, 1 * SIZE(AO) movsd %xmm1, 2 * SIZE(AO) movsd %xmm3, 3 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: testq $1, M je .L39 #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm7, %xmm7 movsd 1 * SIZE(AO), %xmm2 xorps %xmm5, %xmm5 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movsd 1 * SIZE(BO), %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L35 ALIGN_4 .L32: addsd %xmm5, %xmm8 movsd 2 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm7, %xmm9 movsd 3 * SIZE(BO), %xmm7 mulsd %xmm0, %xmm3 movsd 2 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd 4 * SIZE(BO), %xmm1 mulsd %xmm2, %xmm5 addsd %xmm3, %xmm9 movsd 5 * SIZE(BO), %xmm3 mulsd %xmm2, %xmm7 movsd 3 * SIZE(AO), %xmm2 addsd %xmm5, %xmm8 movsd 6 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm1 addsd %xmm7, %xmm9 movsd 7 * SIZE(BO), %xmm7 mulsd %xmm0, %xmm3 movsd 4 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd 8 * SIZE(BO), %xmm1 mulsd %xmm2, %xmm5 addsd %xmm3, %xmm9 movsd 9 * SIZE(BO), %xmm3 mulsd %xmm2, %xmm7 movsd 5 * SIZE(AO), %xmm2 addq $4 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif addsd %xmm5, %xmm8 addsd 
%xmm7, %xmm9 andq $3, %rax BRANCH BRANCH je .L38 ALIGN_4 .L36: mulsd %xmm0, %xmm1 addq $2 * SIZE, BO mulsd %xmm0, %xmm3 movsd 1 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd 0 * SIZE(BO), %xmm1 addsd %xmm3, %xmm9 movsd 1 * SIZE(BO), %xmm3 addq $1 * SIZE, AO decq %rax BRANCH jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 movsd 1 * SIZE(BO), %xmm1 subsd %xmm8, %xmm0 subsd %xmm9, %xmm1 #else movsd 0 * SIZE(AO), %xmm0 movsd 1 * SIZE(AO), %xmm1 subsd %xmm8, %xmm0 subsd %xmm9, %xmm1 #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 mulsd %xmm8, %xmm1 #endif #ifdef RN movsd 0 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 movsd 1 * SIZE(BO), %xmm9 mulsd %xmm0, %xmm9 movsd 3 * SIZE(BO), %xmm13 subsd %xmm9, %xmm1 mulsd %xmm13, %xmm1 #endif #ifdef RT movsd 3 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm1 movsd 2 * SIZE(BO), %xmm9 mulsd %xmm1, %xmm9 movsd 0 * SIZE(BO), %xmm13 subsd %xmm9, %xmm0 mulsd %xmm13, %xmm0 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BO) movsd %xmm1, 1 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) movsd %xmm1, 1 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif decq J # j -- jg .L10 ALIGN_4 .L40: testq $1, N je .L999 ALIGN_4 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I jle .L50 ALIGN_4 .L41: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm9, %xmm9 movsd 1 * SIZE(AO), %xmm1 xorps %xmm11, %xmm11 movsd 2 * SIZE(AO), %xmm2 xorps %xmm13, %xmm13 movsd 3 * SIZE(AO), %xmm3 xorps %xmm15, %xmm15 movsd 0 * SIZE(BO), %xmm4 xorps %xmm8, %xmm8 movsd 1 * SIZE(BO), %xmm5 xorps %xmm10, %xmm10 prefetcht0 3 * SIZE(CO1) xorps %xmm12, %xmm12 xorps %xmm14, %xmm14 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L45 ALIGN_4 .L42: addsd %xmm9, %xmm8 movsd 4 * SIZE(AO), %xmm9 mulsd %xmm4, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm11, %xmm10 movsd 5 * SIZE(AO), %xmm11 mulsd %xmm4, %xmm1 addsd %xmm13, %xmm12 movsd 6 * SIZE(AO), %xmm13 mulsd %xmm4, %xmm2 addsd %xmm15, %xmm14 movsd 7 * SIZE(AO), %xmm15 mulsd %xmm4, %xmm3 movsd 2 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 8 * SIZE(AO), %xmm0 mulsd %xmm5, %xmm9 addsd %xmm1, %xmm10 movsd 9 * SIZE(AO), %xmm1 mulsd %xmm5, %xmm11 addsd %xmm2, %xmm12 movsd 10 * SIZE(AO), %xmm2 mulsd %xmm5, 
%xmm13 addsd %xmm3, %xmm14 movsd 11 * SIZE(AO), %xmm3 mulsd %xmm5, %xmm15 movsd 3 * SIZE(BO), %xmm5 addsd %xmm9, %xmm8 movsd 12 * SIZE(AO), %xmm9 mulsd %xmm4, %xmm0 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) addsd %xmm11, %xmm10 movsd 13 * SIZE(AO), %xmm11 mulsd %xmm4, %xmm1 addsd %xmm13, %xmm12 movsd 14 * SIZE(AO), %xmm13 mulsd %xmm4, %xmm2 addsd %xmm15, %xmm14 movsd 15 * SIZE(AO), %xmm15 mulsd %xmm4, %xmm3 movsd 4 * SIZE(BO), %xmm4 subq $-16 * SIZE, AO addsd %xmm0, %xmm8 movsd 0 * SIZE(AO), %xmm0 mulsd %xmm5, %xmm9 addsd %xmm1, %xmm10 movsd 1 * SIZE(AO), %xmm1 mulsd %xmm5, %xmm11 addq $ 4 * SIZE, BO addsd %xmm2, %xmm12 movsd 2 * SIZE(AO), %xmm2 mulsd %xmm5, %xmm13 decq %rax addsd %xmm3, %xmm14 movsd 3 * SIZE(AO), %xmm3 mulsd %xmm5, %xmm15 movsd 1 * SIZE(BO), %xmm5 jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif addsd %xmm9, %xmm8 addsd %xmm11, %xmm10 addsd %xmm13, %xmm12 addsd %xmm15, %xmm14 andq $3, %rax BRANCH BRANCH je .L49 ALIGN_4 .L46: mulsd %xmm4, %xmm0 mulsd %xmm4, %xmm1 mulsd %xmm4, %xmm2 mulsd %xmm4, %xmm3 movsd 1 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 addsd %xmm1, %xmm10 movsd 5 * SIZE(AO), %xmm1 addsd %xmm2, %xmm12 movsd 6 * SIZE(AO), %xmm2 addsd %xmm3, %xmm14 movsd 7 * SIZE(AO), %xmm3 addq $4 * SIZE, AO addq $1 * SIZE, BO decq %rax BRANCH jg .L46 ALIGN_4 .L49: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 movsd 1 * SIZE(BO), %xmm2 movsd 2 * SIZE(BO), %xmm4 movsd 3 * SIZE(BO), %xmm6 subsd %xmm8, %xmm0 subsd %xmm10, %xmm2 subsd %xmm12, %xmm4 subsd %xmm14, %xmm6 #else movsd 0 * SIZE(AO), %xmm0 movsd 1 * SIZE(AO), %xmm2 movsd 2 * SIZE(AO), %xmm4 movsd 3 * SIZE(AO), %xmm6 subsd %xmm8, %xmm0 subsd %xmm10, %xmm2 subsd %xmm12, %xmm4 subsd %xmm14, %xmm6 #endif #ifdef LN movsd 15 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm6 movsd 14 * SIZE(AO), %xmm9 mulsd %xmm6, %xmm9 movsd 13 * SIZE(AO), %xmm11 subsd %xmm9, %xmm4 movsd 12 * SIZE(AO), %xmm13 mulsd %xmm6, %xmm11 movsd 10 * SIZE(AO), %xmm8 subsd %xmm11, %xmm2 movsd 9 * SIZE(AO), %xmm9 mulsd %xmm6, %xmm13 movsd 8 * SIZE(AO), %xmm11 subsd %xmm13, %xmm0 mulsd %xmm8, %xmm4 movsd 5 * SIZE(AO), %xmm8 mulsd %xmm4, %xmm9 subsd %xmm9, %xmm2 movsd 4 * SIZE(AO), %xmm9 mulsd %xmm4, %xmm11 subsd %xmm11, %xmm0 movsd 0 * SIZE(AO), %xmm11 mulsd %xmm8, %xmm2 mulsd %xmm2, %xmm9 subsd %xmm9, %xmm0 mulsd %xmm11, %xmm0 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 movsd 1 * SIZE(AO), %xmm9 mulsd %xmm0, %xmm9 movsd 2 * SIZE(AO), %xmm11 subsd %xmm9, %xmm2 movsd 3 * SIZE(AO), %xmm13 mulsd %xmm0, %xmm11 movsd 5 * SIZE(AO), %xmm8 subsd %xmm11, %xmm4 movsd 6 * SIZE(AO), %xmm9 mulsd %xmm0, %xmm13 movsd 7 * SIZE(AO), %xmm11 subsd %xmm13, %xmm6 mulsd %xmm8, %xmm2 movsd 10 * SIZE(AO), %xmm8 mulsd %xmm2, %xmm9 subsd %xmm9, %xmm4 movsd 11 * SIZE(AO), %xmm9 mulsd %xmm2, %xmm11 subsd %xmm11, %xmm6 mulsd %xmm8, %xmm4 movsd 15 * SIZE(AO), %xmm8 mulsd %xmm4, %xmm9 subsd %xmm9, %xmm6 mulsd %xmm8, %xmm6 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 mulsd %xmm8, %xmm2 mulsd %xmm8, %xmm4 mulsd %xmm8, %xmm6 #endif #ifdef LN subq $4 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm2, 1 * SIZE(CO1) movsd %xmm4, 2 * SIZE(CO1) movsd %xmm6, 3 * SIZE(CO1) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BO) movsd %xmm2, 1 * SIZE(BO) movsd %xmm4, 2 * 
SIZE(BO) movsd %xmm6, 3 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) movsd %xmm2, 1 * SIZE(AO) movsd %xmm4, 2 * SIZE(AO) movsd %xmm6, 3 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L41 ALIGN_4 .L50: testq $2, M je .L60 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd 1 * SIZE(AO), %xmm1 xorps %xmm3, %xmm3 movsd 0 * SIZE(BO), %xmm4 xorps %xmm8, %xmm8 movsd 1 * SIZE(BO), %xmm5 xorps %xmm10, %xmm10 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L55 ALIGN_4 .L52: addsd %xmm2, %xmm8 movsd 2 * SIZE(AO), %xmm2 mulsd %xmm4, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm3, %xmm10 movsd 3 * SIZE(AO), %xmm3 mulsd %xmm4, %xmm1 movsd 2 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm5, %xmm2 addq $8 * SIZE, AO addsd %xmm1, %xmm10 movsd -3 * SIZE(AO), %xmm1 mulsd %xmm5, %xmm3 movsd 3 * SIZE(BO), %xmm5 addsd %xmm2, %xmm8 movsd -2 * SIZE(AO), %xmm2 mulsd %xmm4, %xmm0 addq $4 * SIZE, BO addsd %xmm3, %xmm10 movsd -1 * SIZE(AO), %xmm3 mulsd %xmm4, %xmm1 movsd 0 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 0 * SIZE(AO), %xmm0 mulsd %xmm5, %xmm2 decq %rax addsd %xmm1, %xmm10 movsd 1 * SIZE(AO), %xmm1 mulsd %xmm5, %xmm3 movsd 1 * SIZE(BO), %xmm5 jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif addsd %xmm2, %xmm8 addsd %xmm3, %xmm10 andq $3, %rax BRANCH je .L59 ALIGN_4 .L56: mulsd %xmm4, %xmm0 mulsd %xmm4, %xmm1 movsd 1 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 2 * SIZE(AO), %xmm0 addsd %xmm1, %xmm10 movsd 3 * SIZE(AO), %xmm1 addq $2 * SIZE, AO addq $1 * SIZE, BO decq %rax BRANCH jg .L56 ALIGN_4 .L59: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 movsd 1 * SIZE(BO), %xmm2 subsd %xmm8, %xmm0 subsd %xmm10, %xmm2 #else movsd 0 * SIZE(AO), %xmm0 movsd 1 * SIZE(AO), %xmm2 subsd %xmm8, %xmm0 subsd %xmm10, %xmm2 #endif #ifdef LN movsd 3 * SIZE(AO), %xmm8 movsd 2 * SIZE(AO), %xmm9 movsd 0 * SIZE(AO), %xmm11 mulsd %xmm8, %xmm2 mulsd %xmm2, %xmm9 subsd %xmm9, %xmm0 mulsd %xmm11,%xmm0 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm8 movsd 1 * SIZE(AO), %xmm9 movsd 3 * SIZE(AO), %xmm11 mulsd %xmm8, %xmm0 mulsd %xmm0, %xmm9 subsd %xmm9, %xmm2 mulsd %xmm11,%xmm2 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 mulsd %xmm8, %xmm2 #endif #ifdef LN subq $2 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm2, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BO) movsd %xmm2, 1 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) movsd %xmm2, 1 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, 
%rax addq %rax, AORIG #endif ALIGN_4 .L60: testq $1, M je .L69 #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm5, %xmm5 movsd 1 * SIZE(AO), %xmm2 xorps %xmm7, %xmm7 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 movsd 1 * SIZE(BO), %xmm3 xorps %xmm9, %xmm9 movsd 2 * SIZE(AO), %xmm4 movsd 3 * SIZE(AO), %xmm6 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L65 ALIGN_4 .L62: addsd %xmm5, %xmm8 movsd 2 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm1 movsd 4 * SIZE(AO), %xmm0 addsd %xmm7, %xmm9 movsd 3 * SIZE(BO), %xmm7 mulsd %xmm2, %xmm3 movsd 5 * SIZE(AO), %xmm2 addsd %xmm1, %xmm8 movsd 4 * SIZE(BO), %xmm1 mulsd %xmm4, %xmm5 movsd 6 * SIZE(AO), %xmm4 addsd %xmm3, %xmm9 movsd 5 * SIZE(BO), %xmm3 mulsd %xmm6, %xmm7 movsd 7 * SIZE(AO), %xmm6 addq $4 * SIZE, AO addq $4 * SIZE, BO decq %rax jne .L62 addsd %xmm5, %xmm8 addsd %xmm7, %xmm9 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH je .L68 ALIGN_4 .L66: movsd 0 * SIZE(AO), %xmm0 movsd 0 * SIZE(BO), %xmm1 mulsd %xmm0, %xmm1 addsd %xmm1, %xmm8 addq $1 * SIZE, AO addq $1 * SIZE, BO decq %rax BRANCH jg .L66 ALIGN_4 .L68: addsd %xmm9, %xmm8 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 subsd %xmm8, %xmm0 #else movsd 0 * SIZE(AO), %xmm0 subsd %xmm8, %xmm0 #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 #endif #ifdef LN subq $1 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L69: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_2 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_LT_4x4_barcelona.S000066400000000000000000001776531313527062700231620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %r12 #define BB %rbp #define J %rbx #ifndef WINDOWS_ABI #define STACKSIZE 96 #define OFFSET 48(%rsp) #define AORIG 56(%rsp) #define KK 64(%rsp) #define KKK 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define AORIG 232(%rsp) #define KK 240(%rsp) #define KKK 248(%rsp) #endif #define PREFETCH prefetch #define PREFETCHSIZE (8 * 7 + 0) #define movlpd movsd #define movapd movups #define movupd movups #define KERNEL1(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm8 ;\ movapd %xmm2, %xmm0 ;\ addpd %xmm1, %xmm12 ;\ movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm10 ;\ movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ addpd %xmm1, %xmm14 ;\ movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 #define KERNEL2(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm8 ;\ movapd %xmm2, %xmm0 ;\ /**/ movapd (AO, %rax, 4), %xmm6 ;\ addpd %xmm1, %xmm12 ;\ movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd 
%xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm10 ;\ addpd %xmm1, %xmm14 ;\ mulpd %xmm3, %xmm2 ;\ /**/ movddup (BO, %rax, 4), %xmm1 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL3(xx) \ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm8 ;\ movapd %xmm2, %xmm4 ;\ addpd %xmm5, %xmm12 ;\ movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm10 ;\ movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ addpd %xmm5, %xmm14 ;\ movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL4(xx) \ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm8 ;\ movapd %xmm2, %xmm4 ;\ /**/ movapd 8 * SIZE(AO, %rax, 4), %xmm7 ;\ addpd %xmm5, %xmm12 ;\ movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm10 ;\ addpd %xmm5, %xmm14 ;\ /**/ movddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm6, %xmm2 #define KERNEL5(xx) \ mulpd %xmm1, %xmm6 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm6, %xmm8 ;\ movapd %xmm2, %xmm6 ;\ addpd %xmm1, %xmm12 ;\ movddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm6, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm6 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm6, %xmm10 ;\ movapd 4 * SIZE(AO, %rax, 4), %xmm6 ;\ addpd %xmm1, %xmm14 ;\ movddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm6, %xmm2 #define KERNEL6(xx) \ mulpd %xmm1, %xmm6 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm6, %xmm8 ;\ movapd %xmm2, %xmm6 ;\ /***/ movapd 16 * SIZE(AO, %rax, 4), %xmm0 ;\ addpd %xmm1, %xmm12 ;\ movddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm6, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm6 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm6, %xmm10 ;\ addpd %xmm1, %xmm14 ;\ /**/ movddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm7, %xmm2 #define KERNEL7(xx) \ mulpd %xmm5, %xmm7 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm7, %xmm8 ;\ movapd %xmm2, %xmm7 ;\ addpd %xmm5, %xmm12 ;\ movddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm7, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd 
%xmm5, %xmm7 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm7, %xmm10 ;\ movapd 12 * SIZE(AO, %rax, 4), %xmm7 ;\ addpd %xmm5, %xmm14 ;\ movddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm7, %xmm2 #define KERNEL8(xx) \ mulpd %xmm5, %xmm7 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm7, %xmm8 ;\ movapd %xmm2, %xmm7 ;\ /**/ movapd 24 * SIZE(AO, %rax, 4), %xmm4 ;\ addpd %xmm5, %xmm12 ;\ movddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm7, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm7 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm7, %xmm10 ;\ addpd %xmm5, %xmm14 ;\ /**/ movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 ;\ addq $8 * SIZE, %rax #define KERNEL_SUB1(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm8 ;\ movapd %xmm2, %xmm0 ;\ addpd %xmm1, %xmm12 ;\ movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm10 ;\ movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ addpd %xmm1, %xmm14 ;\ movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 #define KERNEL_SUB2(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm8 ;\ movapd %xmm2, %xmm0 ;\ addpd %xmm1, %xmm12 ;\ movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm10 ;\ movapd (AO, %rax, 4), %xmm0 ;\ addpd %xmm1, %xmm14 ;\ movddup (BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL_SUB3(xx) \ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm8 ;\ movapd %xmm2, %xmm4 ;\ addpd %xmm5, %xmm12 ;\ movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm10 ;\ movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ addpd %xmm5, %xmm14 ;\ movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL_SUB4(xx) \ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm8 ;\ movapd %xmm2, %xmm4 ;\ addpd %xmm5, %xmm12 ;\ movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd 
%xmm4, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm10 ;\ addpd %xmm5, %xmm14 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm12 #else movq STACKSIZE + 8(%rsp), LDC movsd STACKSIZE + 16(%rsp), %xmm12 #endif movq OLD_M, M movq OLD_N, N subq $-16 * SIZE, A subq $-16 * SIZE, B movsd %xmm12, OFFSET movsd %xmm12, KK leaq (, LDC, SIZE), LDC #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $2, J # j = (n >> 2) jle .L40 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 4), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 2, %rax leaq (B, %rax), BB #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I # i = (m >> 2) jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 4), BO #endif movapd -16 * SIZE(AO), %xmm0 movddup -16 * SIZE(BO), %xmm1 pxor %xmm8, %xmm8 movddup -15 * SIZE(BO), %xmm3 pxor %xmm9, %xmm9 movapd -8 * SIZE(AO), %xmm4 pxor %xmm10, %xmm10 movddup -8 * SIZE(BO), %xmm5 pxor %xmm11, %xmm11 #ifndef LN prefetchw 3 * SIZE(CO1) pxor %xmm12, %xmm12 prefetchw 7 * SIZE(CO2) pxor %xmm13, %xmm13 prefetchw 3 * SIZE(CO1, LDC, 2) pxor %xmm14, %xmm14 prefetchw 7 * SIZE(CO2, LDC, 2) pxor %xmm15, %xmm15 movapd %xmm0, %xmm2 #else prefetchw -8 * SIZE(CO1) pxor %xmm12, %xmm12 prefetchw -8 * SIZE(CO2) pxor %xmm13, %xmm13 prefetchw -8 * SIZE(CO1, LDC, 2) pxor %xmm14, %xmm14 prefetchw -8 * SIZE(CO2, LDC, 2) pxor %xmm15, %xmm15 movapd %xmm0, %xmm2 #endif prefetch -16 * SIZE(BB) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-8, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO negq %rax NOBRANCH je .L15 ALIGN_4 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) BRANCH jl .L12 ALIGN_4 .L15: prefetch -8 * SIZE(BB) subq $-16 * SIZE, BB #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif testq $4, %rax je .L16 xorq %rax, %rax ALIGN_4 
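/* (k & 4) tail of the 4x4 block: a single straight-line pass of
   KERNEL_SUB1..KERNEL_SUB4 covers four k iterations, after which AO and
   BO are advanced past the 16 elements just consumed; any remaining
   (k & 3) iterations are handled one at a time in the .L17 loop below. */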
KERNEL_SUB1(16 * 0) KERNEL_SUB2(16 * 0) KERNEL_SUB3(16 * 0) KERNEL_SUB4(16 * 0) subq $-16 * SIZE, BO subq $-16 * SIZE, AO ALIGN_4 .L16: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L19 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO negq %rax ALIGN_4 .L17: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd %xmm2, %xmm0 addpd %xmm1, %xmm12 movddup -14 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm3, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm2, %xmm9 movapd %xmm0, %xmm2 addpd %xmm3, %xmm13 movddup -13 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm10 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm14 movddup -12 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm3, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm2, %xmm11 addpd %xmm3, %xmm15 movddup -11 * SIZE(BO, %rax, 4), %xmm3 movapd %xmm0, %xmm2 addq $SIZE, %rax jl .L17 ALIGN_4 .L19: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm10, %xmm2 unpcklpd %xmm11, %xmm10 unpckhpd %xmm11, %xmm2 movapd %xmm12, %xmm4 unpcklpd %xmm13, %xmm12 unpckhpd %xmm13, %xmm4 movapd %xmm14, %xmm6 unpcklpd %xmm15, %xmm14 unpckhpd %xmm15, %xmm6 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 movapd -12 * SIZE(BO), %xmm13 movapd -10 * SIZE(BO), %xmm15 movapd -8 * SIZE(BO), %xmm1 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm5 movapd -2 * SIZE(BO), %xmm7 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm0, %xmm13 subpd %xmm2, %xmm15 subpd %xmm12, %xmm1 subpd %xmm14, %xmm3 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 movapd -8 * SIZE(AO), %xmm4 movapd -6 * SIZE(AO), %xmm5 movapd -4 * SIZE(AO), %xmm6 movapd -2 * SIZE(AO), %xmm7 subpd %xmm8, %xmm0 subpd %xmm12, %xmm1 subpd %xmm9, %xmm2 subpd %xmm13, %xmm3 subpd %xmm10, %xmm4 subpd %xmm14, %xmm5 subpd %xmm11, %xmm6 subpd %xmm15, %xmm7 #endif #ifdef LN movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 movddup -2 * SIZE(AO), %xmm10 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm1 movddup -2 * SIZE(AO), %xmm10 mulpd %xmm7, %xmm10 subpd %xmm10, %xmm3 movddup -3 * SIZE(AO), %xmm12 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm13 movddup -3 * SIZE(AO), %xmm12 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm15 movddup -4 * SIZE(AO), %xmm14 mulpd %xmm5, %xmm14 subpd %xmm14, %xmm9 movddup -4 * SIZE(AO), %xmm14 mulpd %xmm7, %xmm14 subpd %xmm14, %xmm11 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 movddup -7 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm13 movddup -7 * SIZE(AO), %xmm10 mulpd %xmm3, %xmm10 subpd %xmm10, %xmm15 movddup -8 * SIZE(AO), %xmm12 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm9 movddup -8 * SIZE(AO), %xmm12 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm11 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -12 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -12 * SIZE(AO), %xmm10 mulpd %xmm15, %xmm10 subpd %xmm10, %xmm11 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 movddup -15 * SIZE(AO), %xmm10 
mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm11, %xmm10 subpd %xmm10, %xmm15 movddup -14 * SIZE(AO), %xmm12 mulpd %xmm9, %xmm12 subpd %xmm12, %xmm1 movddup -14 * SIZE(AO), %xmm12 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm3 movddup -13 * SIZE(AO), %xmm14 mulpd %xmm9, %xmm14 subpd %xmm14, %xmm5 movddup -13 * SIZE(AO), %xmm14 mulpd %xmm11, %xmm14 subpd %xmm14, %xmm7 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -10 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm1 movddup -10 * SIZE(AO), %xmm10 mulpd %xmm15, %xmm10 subpd %xmm10, %xmm3 movddup -9 * SIZE(AO), %xmm12 mulpd %xmm13, %xmm12 subpd %xmm12, %xmm5 movddup -9 * SIZE(AO), %xmm12 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm7 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 movddup -5 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm5 movddup -5 * SIZE(AO), %xmm10 mulpd %xmm3, %xmm10 subpd %xmm10, %xmm7 movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm1, %xmm9 subpd %xmm9, %xmm3 movddup -14 * SIZE(BO), %xmm10 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm4 movddup -14 * SIZE(BO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm5 movddup -13 * SIZE(BO), %xmm11 mulpd %xmm0, %xmm11 subpd %xmm11, %xmm6 movddup -13 * SIZE(BO), %xmm11 mulpd %xmm1, %xmm11 subpd %xmm11, %xmm7 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -10 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm4 movddup -10 * SIZE(BO), %xmm9 mulpd %xmm3, %xmm9 subpd %xmm9, %xmm5 movddup -9 * SIZE(BO), %xmm10 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm6 movddup -9 * SIZE(BO), %xmm10 mulpd %xmm3, %xmm10 subpd %xmm10, %xmm7 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm5 movddup -5 * SIZE(BO), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm6 movddup -5 * SIZE(BO), %xmm9 mulpd %xmm5, %xmm9 subpd %xmm9, %xmm7 movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 mulpd %xmm8, %xmm7 #endif #ifdef RT movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 mulpd %xmm8, %xmm7 movddup -2 * SIZE(BO), %xmm9 mulpd %xmm6, %xmm9 subpd %xmm9, %xmm4 movddup -2 * SIZE(BO), %xmm9 mulpd %xmm7, %xmm9 subpd %xmm9, %xmm5 movddup -3 * SIZE(BO), %xmm10 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm2 movddup -3 * SIZE(BO), %xmm10 mulpd %xmm7, %xmm10 subpd %xmm10, %xmm3 movddup -4 * SIZE(BO), %xmm11 mulpd %xmm6, %xmm11 subpd %xmm11, %xmm0 movddup -4 * SIZE(BO), %xmm11 mulpd %xmm7, %xmm11 subpd %xmm11, %xmm1 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm5 movddup -7 * SIZE(BO), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm2 movddup -7 * SIZE(BO), %xmm9 mulpd %xmm5, %xmm9 subpd %xmm9, %xmm3 movddup -8 * SIZE(BO), %xmm10 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm0 movddup -8 * SIZE(BO), %xmm10 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm1 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -12 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -12 * SIZE(BO), %xmm9 mulpd %xmm3, %xmm9 subpd %xmm9, %xmm1 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movlpd %xmm9, 0 * SIZE(CO1) movlpd %xmm13, 1 * SIZE(CO1) movlpd %xmm1, 2 * SIZE(CO1) movlpd %xmm5, 3 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movhpd %xmm1, 2 * SIZE(CO2) 
movhpd %xmm5, 3 * SIZE(CO2) movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) movlpd %xmm3, 2 * SIZE(CO1, LDC, 2) movlpd %xmm7, 3 * SIZE(CO1, LDC, 2) movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) #else movlpd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movlpd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movlpd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movlpd %xmm3, 2 * SIZE(CO2) movhpd %xmm3, 3 * SIZE(CO2) movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) movlpd %xmm5, 2 * SIZE(CO1, LDC, 2) movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) movlpd %xmm7, 2 * SIZE(CO2, LDC, 2) movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movaps %xmm9, -16 * SIZE(BO) movaps %xmm11, -14 * SIZE(BO) movaps %xmm13, -12 * SIZE(BO) movaps %xmm15, -10 * SIZE(BO) movaps %xmm1, -8 * SIZE(BO) movaps %xmm3, -6 * SIZE(BO) movaps %xmm5, -4 * SIZE(BO) movaps %xmm7, -2 * SIZE(BO) #else movaps %xmm0, -16 * SIZE(AO) movaps %xmm1, -14 * SIZE(AO) movaps %xmm2, -12 * SIZE(AO) movaps %xmm3, -10 * SIZE(AO) movaps %xmm4, -8 * SIZE(AO) movaps %xmm5, -6 * SIZE(AO) movaps %xmm6, -4 * SIZE(AO) movaps %xmm7, -2 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L11 ALIGN_4 .L20: testq $3, M je .L39 testq $2, M je .L30 ALIGN_4 .L21: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 4), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd -12 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movddup -16 * SIZE(BO), %xmm1 pxor %xmm10, %xmm10 movddup -15 * SIZE(BO), %xmm5 pxor %xmm11, %xmm11 movddup -8 * SIZE(BO), %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO negq %rax NOBRANCH je .L26 ALIGN_4 .L22: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 addpd %xmm5, %xmm9 movddup -13 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movddup -12 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 movapd -14 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm5, %xmm11 movddup -11 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -10 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 addpd %xmm5, %xmm9 movddup -9 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movddup (BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 movapd -8 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm5, %xmm11 movddup -7 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm8 movddup -6 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 addpd %xmm5, %xmm9 movddup -5 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm10 movddup -4 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 movapd -10 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm5, %xmm11 movddup -3 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 addpd 
%xmm3, %xmm8 movddup -2 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 addpd %xmm5, %xmm9 movddup -1 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm10 movddup 8 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 movapd -4 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm5, %xmm11 movddup 1 * SIZE(BO, %rax, 4), %xmm5 addq $4 * SIZE, %rax BRANCH jl .L22 ALIGN_4 .L26: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L29 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO negq %rax ALIGN_4 .L27: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 addpd %xmm5, %xmm9 movddup -13 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movddup -12 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 movapd -14 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm5, %xmm11 movddup -11 * SIZE(BO, %rax, 4), %xmm5 addq $SIZE, %rax jl .L27 ALIGN_4 .L29: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm10, %xmm2 unpcklpd %xmm11, %xmm10 unpckhpd %xmm11, %xmm2 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 movapd -12 * SIZE(BO), %xmm13 movapd -10 * SIZE(BO), %xmm15 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm0, %xmm13 subpd %xmm2, %xmm15 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm2 movapd -12 * SIZE(AO), %xmm4 movapd -10 * SIZE(AO), %xmm6 subpd %xmm8, %xmm0 subpd %xmm9, %xmm2 subpd %xmm10, %xmm4 subpd %xmm11, %xmm6 #endif #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -14 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -14 * SIZE(AO), %xmm10 mulpd %xmm15, %xmm10 subpd %xmm10, %xmm11 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm11, %xmm10 subpd %xmm10, %xmm15 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -14 * SIZE(BO), %xmm10 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm4 movddup -13 * SIZE(BO), %xmm11 mulpd %xmm0, %xmm11 subpd %xmm11, %xmm6 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 movddup -10 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm4 movddup -9 * SIZE(BO), %xmm10 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm6 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 movddup -5 * SIZE(BO), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm6 movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 #endif #ifdef RT movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 movddup -2 * SIZE(BO), %xmm9 mulpd %xmm6, %xmm9 subpd %xmm9, %xmm4 movddup -3 * SIZE(BO), %xmm10 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm2 movddup -4 * SIZE(BO), %xmm11 mulpd %xmm6, %xmm11 subpd %xmm11, %xmm0 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 movddup -7 * SIZE(BO), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm2 movddup -8 * SIZE(BO), %xmm10 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm0 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 movddup -12 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -16 * SIZE(BO), %xmm8 
mulpd %xmm8, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movlpd %xmm9, 0 * SIZE(CO1) movlpd %xmm13, 1 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) #else movlpd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movlpd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movaps %xmm9, -16 * SIZE(BO) movaps %xmm11, -14 * SIZE(BO) movaps %xmm13, -12 * SIZE(BO) movaps %xmm15, -10 * SIZE(BO) #else movaps %xmm0, -16 * SIZE(AO) movaps %xmm2, -14 * SIZE(AO) movaps %xmm4, -12 * SIZE(AO) movaps %xmm6, -10 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: testq $1, M je .L39 #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 4), BO #endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movddup -14 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movddup -15 * SIZE(AO), %xmm4 pxor %xmm10, %xmm10 movapd -16 * SIZE(BO), %xmm1 pxor %xmm11, %xmm11 movapd -8 * SIZE(BO), %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO negq %rax NOBRANCH je .L36 ALIGN_4 .L32: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BO, %rax, 4), %xmm0 addpd %xmm1, %xmm8 movapd -12 * SIZE(BO, %rax, 4), %xmm1 addpd %xmm0, %xmm9 movddup -12 * SIZE(AO, %rax, 1), %xmm0 mulpd %xmm4, %xmm1 mulpd -10 * SIZE(BO, %rax, 4), %xmm4 addpd %xmm1, %xmm10 movapd (BO, %rax, 4), %xmm1 addpd %xmm4, %xmm11 movddup -11 * SIZE(AO, %rax, 1), %xmm4 mulpd %xmm2, %xmm3 mulpd -6 * SIZE(BO, %rax, 4), %xmm2 addpd %xmm3, %xmm8 movapd -4 * SIZE(BO, %rax, 4), %xmm3 addpd %xmm2, %xmm9 movddup -13 * SIZE(AO, %rax, 1), %xmm2 mulpd %xmm2, %xmm3 mulpd -2 * SIZE(BO, %rax, 4), %xmm2 addpd %xmm3, %xmm10 movapd 8 * SIZE(BO, %rax, 4), %xmm3 addpd %xmm2, %xmm11 movddup -10 * SIZE(AO, %rax, 1), %xmm2 addq $4 * SIZE, %rax BRANCH jl .L32 ALIGN_4 .L36: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L38 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO negq %rax ALIGN_4 .L37: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BO, %rax, 4), %xmm0 addpd %xmm1, %xmm8 movapd -12 * SIZE(BO, %rax, 4), %xmm1 addpd %xmm0, %xmm9 movddup -15 * SIZE(AO, %rax, 1), %xmm0 addq $SIZE, %rax jl .L37 ALIGN_4 .L38: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 subpd %xmm8, %xmm2 subpd %xmm9, %xmm3 #else movapd -16 * 
SIZE(AO), %xmm2 movapd -14 * SIZE(AO), %xmm3 subpd %xmm8, %xmm2 subpd %xmm9, %xmm3 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 #endif #ifdef RN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd -16 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm2 movsd -15 * SIZE(BO), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd -14 * SIZE(BO), %xmm6 mulsd %xmm2, %xmm6 subsd %xmm6, %xmm3 movsd -13 * SIZE(BO), %xmm7 mulsd %xmm2, %xmm7 subsd %xmm7, %xmm1 movsd -11 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm0 movsd -10 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm3 movsd -9 * SIZE(BO), %xmm6 mulsd %xmm0, %xmm6 subsd %xmm6, %xmm1 movsd -6 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm3 movsd -5 * SIZE(BO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm1 movsd -1 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm1 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef RT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd -1 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm1 movsd -2 * SIZE(BO), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm3 movsd -3 * SIZE(BO), %xmm6 mulsd %xmm1, %xmm6 subsd %xmm6, %xmm0 movsd -4 * SIZE(BO), %xmm7 mulsd %xmm1, %xmm7 subsd %xmm7, %xmm2 movsd -6 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm3 movsd -7 * SIZE(BO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm0 movsd -8 * SIZE(BO), %xmm6 mulsd %xmm3, %xmm6 subsd %xmm6, %xmm2 movsd -11 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm0 movsd -12 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd -16 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO2) movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) #else movlpd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO2) movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movaps %xmm2, -16 * SIZE(BO) movaps %xmm3, -14 * SIZE(BO) #else movaps %xmm2, -16 * SIZE(AO) movaps %xmm3, -14 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif decq J # j -- jg .L01 ALIGN_4 .L40: testq $2, N je .L80 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 2), BO #endif movddup -16 * SIZE(BO), %xmm1 movddup -15 * SIZE(BO), %xmm5 pxor 
%xmm8, %xmm8 movddup -12 * SIZE(BO), %xmm3 pxor %xmm9, %xmm9 movapd -16 * SIZE(AO), %xmm0 pxor %xmm12, %xmm12 movapd -8 * SIZE(AO), %xmm4 pxor %xmm13, %xmm13 #ifndef LN prefetchw 3 * SIZE(CO1) movapd %xmm0, %xmm2 prefetchw 3 * SIZE(CO2) #else prefetchw -8 * SIZE(CO1) movapd %xmm0, %xmm2 prefetchw -8 * SIZE(CO2) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO negq %rax NOBRANCH je .L56 ALIGN_4 .L52: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm12 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm5, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -13 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm0, %xmm2 mulpd %xmm1, %xmm0 mulpd -10 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd (AO, %rax, 4), %xmm0 addpd %xmm1, %xmm12 movddup -8 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm5, %xmm2 mulpd -10 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -11 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm4, %xmm2 mulpd %xmm3, %xmm4 mulpd -6 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm4, %xmm8 movapd -4 * SIZE(AO, %rax, 4), %xmm4 addpd %xmm3, %xmm12 movddup -10 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm5, %xmm2 mulpd -6 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -9 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm4, %xmm2 mulpd %xmm3, %xmm4 mulpd -2 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm4, %xmm8 movapd 8 * SIZE(AO, %rax, 4), %xmm4 addpd %xmm3, %xmm12 movddup -4 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm5, %xmm2 mulpd -2 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -7 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm0, %xmm2 addq $4 * SIZE, %rax BRANCH jl .L52 ALIGN_4 .L56: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L59 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L57: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm12 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm5, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -13 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm0, %xmm2 addq $SIZE, %rax jl .L57 ALIGN_4 .L59: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm12, %xmm4 unpcklpd %xmm13, %xmm12 unpckhpd %xmm13, %xmm4 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm13 movapd -12 * SIZE(BO), %xmm1 movapd -10 * SIZE(BO), %xmm5 subpd %xmm8, %xmm9 subpd %xmm0, %xmm13 subpd %xmm12, %xmm1 subpd %xmm4, %xmm5 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 subpd %xmm8, %xmm0 subpd %xmm12, %xmm1 subpd %xmm9, %xmm2 subpd %xmm13, %xmm3 #endif #ifdef LN movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 movddup -2 * SIZE(AO), %xmm10 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm1 movddup -3 * SIZE(AO), %xmm12 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm13 movddup -4 * SIZE(AO), %xmm14 mulpd %xmm5, %xmm14 subpd %xmm14, %xmm9 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 movddup -7 * SIZE(AO), %xmm10 mulpd 
%xmm1, %xmm10 subpd %xmm10, %xmm13 movddup -8 * SIZE(AO), %xmm12 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm9 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -12 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -14 * SIZE(AO), %xmm12 mulpd %xmm9, %xmm12 subpd %xmm12, %xmm1 movddup -13 * SIZE(AO), %xmm14 mulpd %xmm9, %xmm14 subpd %xmm14, %xmm5 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -10 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm1 movddup -9 * SIZE(AO), %xmm12 mulpd %xmm13, %xmm12 subpd %xmm12, %xmm5 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 movddup -5 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm5 movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm1, %xmm9 subpd %xmm9, %xmm3 movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 #endif #ifdef RT movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -14 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -14 * SIZE(BO), %xmm9 mulpd %xmm3, %xmm9 subpd %xmm9, %xmm1 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movlpd %xmm9, 0 * SIZE(CO1) movlpd %xmm13, 1 * SIZE(CO1) movlpd %xmm1, 2 * SIZE(CO1) movlpd %xmm5, 3 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movhpd %xmm1, 2 * SIZE(CO2) movhpd %xmm5, 3 * SIZE(CO2) #else movlpd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movlpd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movlpd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movlpd %xmm3, 2 * SIZE(CO2) movhpd %xmm3, 3 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movaps %xmm9, -16 * SIZE(BO) movaps %xmm13,-14 * SIZE(BO) movaps %xmm1, -12 * SIZE(BO) movaps %xmm5, -10 * SIZE(BO) #else movaps %xmm0, -16 * SIZE(AO) movaps %xmm1, -14 * SIZE(AO) movaps %xmm2, -12 * SIZE(AO) movaps %xmm3, -10 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L51 ALIGN_4 .L60: testq $2, M je .L70 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 2), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd -12 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movddup -16 * SIZE(BO), %xmm1 pxor %xmm10, %xmm10 movddup -15 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO negq %rax NOBRANCH je .L66 ALIGN_4 .L62: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm0, %xmm3 movapd -14 * SIZE(AO, %rax, 2), %xmm0 
addpd %xmm3, %xmm9 movddup -13 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movddup -12 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm0, %xmm3 movapd -8 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm3, %xmm11 movddup -11 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm8 movddup -10 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm2, %xmm3 movapd -10 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm3, %xmm9 movddup -9 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm10 movddup -8 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm2, %xmm3 movapd -4 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm3, %xmm11 movddup -7 * SIZE(BO, %rax, 2), %xmm3 addq $4 * SIZE, %rax BRANCH jl .L62 ALIGN_4 .L66: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L69 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L67: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm0, %xmm3 movapd -14 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm3, %xmm9 movddup -13 * SIZE(BO, %rax, 2), %xmm3 addq $SIZE, %rax jl .L67 ALIGN_4 .L69: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm13 subpd %xmm8, %xmm9 subpd %xmm0, %xmm13 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm2 subpd %xmm8, %xmm0 subpd %xmm9, %xmm2 #endif #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -14 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 #endif #ifdef RT movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 movddup -14 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movlpd %xmm9, 0 * SIZE(CO1) movlpd %xmm13, 1 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) #else movlpd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movlpd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movaps %xmm9, -16 * SIZE(BO) movaps %xmm13, -14 * SIZE(BO) #else movaps %xmm0, -16 * SIZE(AO) movaps %xmm2, -14 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L70: testq $1, M je .L79 ALIGN_4 .L71: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 
1), BO #endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movddup -15 * SIZE(AO), %xmm1 pxor %xmm9, %xmm9 movddup -14 * SIZE(AO), %xmm2 pxor %xmm10, %xmm10 movddup -13 * SIZE(AO), %xmm3 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO negq %rax NOBRANCH je .L76 ALIGN_4 .L72: mulpd -16 * SIZE(BO, %rax, 2), %xmm0 addpd %xmm0, %xmm8 movddup -12 * SIZE(AO, %rax, 1), %xmm0 mulpd -14 * SIZE(BO, %rax, 2), %xmm1 addpd %xmm1, %xmm9 movddup -11 * SIZE(AO, %rax, 1), %xmm1 mulpd -12 * SIZE(BO, %rax, 2), %xmm2 addpd %xmm2, %xmm10 movddup -10 * SIZE(AO, %rax, 1), %xmm2 mulpd -10 * SIZE(BO, %rax, 2), %xmm3 addpd %xmm3, %xmm11 movddup -9 * SIZE(AO, %rax, 1), %xmm3 addq $4 * SIZE, %rax BRANCH jl .L72 ALIGN_4 .L76: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L78 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L77: mulpd -16 * SIZE(BO, %rax, 2), %xmm0 addpd %xmm0, %xmm8 movddup -15 * SIZE(AO, %rax, 1), %xmm0 addq $SIZE, %rax jl .L77 ALIGN_4 .L78: addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm10, %xmm8 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm2 #else movapd -16 * SIZE(AO), %xmm2 #endif subpd %xmm8, %xmm2 #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef RN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 mulsd -16 * SIZE(BO), %xmm2 movsd -15 * SIZE(BO), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm0 mulsd -13 * SIZE(BO), %xmm0 unpcklpd %xmm0, %xmm2 #endif #ifdef RT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 mulsd -13 * SIZE(BO), %xmm0 movlpd -14 * SIZE(BO), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm2 mulsd -16 * SIZE(BO), %xmm2 unpcklpd %xmm0, %xmm2 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movlpd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO2) #if defined(LN) || defined(LT) movaps %xmm2, -16 * SIZE(BO) #else movaps %xmm2, -16 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L79: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L80: testq $1, N je .L999 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 # coffset1 = c #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I # i = (m >> 2) jle .L100 ALIGN_4 .L91: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (BO, %rax, SIZE), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd -8 * SIZE(AO), 
%xmm2 pxor %xmm9, %xmm9 movddup -16 * SIZE(BO), %xmm1 pxor %xmm10, %xmm10 movddup -15 * SIZE(BO), %xmm5 pxor %xmm11, %xmm11 movddup -14 * SIZE(BO), %xmm3 #ifndef LN prefetchw 3 * SIZE(CO1) #else prefetchw -8 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO negq %rax NOBRANCH je .L96 ALIGN_4 .L92: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm9 movddup -12 * SIZE(BO, %rax, 1), %xmm1 mulpd %xmm5, %xmm0 mulpd -10 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm0, %xmm10 movapd (AO, %rax, 4), %xmm0 addpd %xmm5, %xmm11 movddup -13 * SIZE(BO, %rax, 1), %xmm5 mulpd %xmm3, %xmm2 mulpd -6 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm2, %xmm8 movapd -4 * SIZE(AO, %rax, 4), %xmm2 addpd %xmm3, %xmm9 movddup -10 * SIZE(BO, %rax, 1), %xmm3 mulpd %xmm5, %xmm2 mulpd -2 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm10 movapd 8 * SIZE(AO, %rax, 4), %xmm2 addpd %xmm5, %xmm11 movddup -11 * SIZE(BO, %rax, 1), %xmm5 addq $4 * SIZE, %rax BRANCH jl .L92 ALIGN_4 .L96: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L99 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L97: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm9 movddup -15 * SIZE(BO, %rax, 1), %xmm1 addq $SIZE, %rax jl .L97 ALIGN_4 .L99: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm10 movapd -14 * SIZE(BO), %xmm11 subpd %xmm8, %xmm10 subpd %xmm9, %xmm11 #else movapd -16 * SIZE(AO), %xmm10 movapd -14 * SIZE(AO), %xmm11 subpd %xmm8, %xmm10 subpd %xmm9, %xmm11 #endif #ifdef LN movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movapd %xmm11, %xmm9 unpckhpd %xmm9, %xmm9 movsd -1 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm9 movsd -2 * SIZE(AO), %xmm13 mulsd %xmm9, %xmm13 subsd %xmm13, %xmm11 movsd -3 * SIZE(AO), %xmm14 mulsd %xmm9, %xmm14 subsd %xmm14, %xmm8 movsd -4 * SIZE(AO), %xmm15 mulsd %xmm9, %xmm15 subsd %xmm15, %xmm10 movsd -6 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm11 movsd -7 * SIZE(AO), %xmm13 mulsd %xmm11, %xmm13 subsd %xmm13, %xmm8 movsd -8 * SIZE(AO), %xmm14 mulsd %xmm11, %xmm14 subsd %xmm14, %xmm10 movsd -11 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -12 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm10 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 unpcklpd %xmm8, %xmm10 unpcklpd %xmm9, %xmm11 #endif #ifdef LT movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movapd %xmm11, %xmm9 unpckhpd %xmm9, %xmm9 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 movsd -15 * SIZE(AO), %xmm13 mulsd %xmm10, %xmm13 subsd %xmm13, %xmm8 movsd -14 * SIZE(AO), %xmm14 mulsd %xmm10, %xmm14 subsd %xmm14, %xmm11 movsd -13 * SIZE(AO), %xmm15 mulsd %xmm10, %xmm15 subsd %xmm15, %xmm9 movsd -11 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -10 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm11 movsd -9 * SIZE(AO), %xmm14 mulsd %xmm8, %xmm14 subsd %xmm14, %xmm9 movsd -6 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm11 movsd -5 * SIZE(AO), %xmm13 mulsd %xmm11, %xmm13 subsd %xmm13, %xmm9 movsd -1 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm9 unpcklpd %xmm8, %xmm10 
unpcklpd %xmm9, %xmm11 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 mulpd %xmm8, %xmm11 #endif #ifdef RT movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 mulpd %xmm8, %xmm11 #endif #ifdef LN subq $4 * SIZE, CO1 #endif movlpd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) movlpd %xmm11, 2 * SIZE(CO1) movhpd %xmm11, 3 * SIZE(CO1) #if defined(LN) || defined(LT) movaps %xmm10, -16 * SIZE(BO) movaps %xmm11, -14 * SIZE(BO) #else movaps %xmm10, -16 * SIZE(AO) movaps %xmm11, -14 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO addq %rax, BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L91 ALIGN_4 .L100: testq $2, M je .L110 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (BO, %rax, SIZE), BO #endif movddup -16 * SIZE(BO), %xmm0 pxor %xmm8, %xmm8 movddup -15 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 movddup -14 * SIZE(BO), %xmm2 pxor %xmm10, %xmm10 movddup -13 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO negq %rax NOBRANCH je .L106 ALIGN_4 .L102: mulpd -16 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm0, %xmm8 movddup -12 * SIZE(BO, %rax, 1), %xmm0 mulpd -14 * SIZE(AO, %rax, 2), %xmm1 addpd %xmm1, %xmm9 movddup -11 * SIZE(BO, %rax, 1), %xmm1 mulpd -12 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm2, %xmm10 movddup -10 * SIZE(BO, %rax, 1), %xmm2 mulpd -10 * SIZE(AO, %rax, 2), %xmm3 addpd %xmm3, %xmm11 movddup -9 * SIZE(BO, %rax, 1), %xmm3 addq $4 * SIZE, %rax BRANCH jl .L102 ALIGN_4 .L106: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L109 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L107: movddup -16 * SIZE(BO, %rax, 1), %xmm0 mulpd -16 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm0, %xmm8 addq $SIZE, %rax jl .L107 ALIGN_4 .L109: addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm10, %xmm8 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm10 subpd %xmm8, %xmm10 #else movapd -16 * SIZE(AO), %xmm10 subpd %xmm8, %xmm10 #endif #ifdef LN movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movsd -13 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -14 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm10 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 unpcklpd %xmm8, %xmm10 #endif #ifdef LT movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 movsd -15 * SIZE(AO), %xmm13 mulsd %xmm10, %xmm13 subsd %xmm13, %xmm8 movsd -13 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 unpcklpd %xmm8, %xmm10 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 #endif #ifdef RT movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlpd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) #else movlpd %xmm10, 0 * SIZE(CO1) movhpd 
%xmm10, 1 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movaps %xmm10, -16 * SIZE(BO) #else movaps %xmm10, -16 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO addq %rax, BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L110: testq $1, M je .L119 ALIGN_4 .L111: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (BO, %rax, SIZE), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd -14 * SIZE(AO), %xmm1 pxor %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO negq %rax NOBRANCH je .L116 ALIGN_4 .L112: mulpd -16 * SIZE(BO, %rax, 1), %xmm0 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 1), %xmm0 mulpd -14 * SIZE(BO, %rax, 1), %xmm1 addpd %xmm1, %xmm9 movapd -10 * SIZE(AO, %rax, 1), %xmm1 addq $4 * SIZE, %rax BRANCH jl .L112 ALIGN_4 .L116: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L118 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L117: mulsd -16 * SIZE(BO, %rax, 1), %xmm0 addsd %xmm0, %xmm8 movsd -15 * SIZE(AO, %rax, 1), %xmm0 addq $SIZE, %rax jl .L117 ALIGN_4 .L118: addpd %xmm9, %xmm8 haddpd %xmm8, %xmm8 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(BO), %xmm10 subsd %xmm8, %xmm10 #else movsd -16 * SIZE(AO), %xmm10 subsd %xmm8, %xmm10 #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 #endif #if defined(RN) || defined(RT) movsd -16 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm10 #endif #ifdef LN subq $1 * SIZE, CO1 #endif movsd %xmm10, 0 * SIZE(CO1) #if defined(LN) || defined(LT) movlpd %xmm10, -16 * SIZE(BO) #else movlpd %xmm10, -16 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax addq %rax, AO addq %rax, BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L119: #ifdef LN leaq (B, K, SIZE), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_LT_4x4_core2.S000066400000000000000000002017431313527062700222320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas 
at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define J 0(%rsp) #define OFFSET 8(%rsp) #define KK 16(%rsp) #define KKK 24(%rsp) #define AORIG 32(%rsp) #define BORIG 40(%rsp) #define BUFFER 128(%rsp) #define PREFETCH_R (8 * 4 + 0) #define PREFETCH_W (PREFETCH_R) #define PREFETCHSIZE (8 * 17 + 2) #define PREFETCH prefetcht0 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif movq OLD_LDC, LDC movq OLD_OFFSET, %rax movq %rsp, %r15 # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq %rax, KK movq %rax, OFFSET movq OLD_M, M movq OLD_N, N subq $-16 * SIZE, A subq $-16 * SIZE, B leaq (, LDC, SIZE), LDC #ifdef LN leaq (, M, SIZE), %rax addq %rax, C 
imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $2, J # j = (n >> 2) jle .L40 .L01: /* Copying to Sub Buffer */ #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq 16 * SIZE + BUFFER, BO #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG leaq (, %rax, SIZE), %rax leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L03 ALIGN_4 .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) movapd -16 * SIZE(B), %xmm0 movapd -14 * SIZE(B), %xmm1 movapd -12 * SIZE(B), %xmm2 movapd -10 * SIZE(B), %xmm3 movapd -8 * SIZE(B), %xmm4 movapd -6 * SIZE(B), %xmm5 movapd -4 * SIZE(B), %xmm6 movapd -2 * SIZE(B), %xmm7 prefetcht0 (PREFETCH_R + 8) * SIZE(B) movddup %xmm0, %xmm8 unpckhpd %xmm0, %xmm0 movddup %xmm1, %xmm9 unpckhpd %xmm1, %xmm1 movddup %xmm2, %xmm10 unpckhpd %xmm2, %xmm2 movddup %xmm3, %xmm11 unpckhpd %xmm3, %xmm3 movddup %xmm4, %xmm12 unpckhpd %xmm4, %xmm4 movddup %xmm5, %xmm13 unpckhpd %xmm5, %xmm5 movddup %xmm6, %xmm14 unpckhpd %xmm6, %xmm6 movddup %xmm7, %xmm15 unpckhpd %xmm7, %xmm7 prefetcht0 (PREFETCH_W + 0) * SIZE(BO) movapd %xmm8, -16 * SIZE(BO) movapd %xmm0, -14 * SIZE(BO) movapd %xmm9, -12 * SIZE(BO) movapd %xmm1, -10 * SIZE(BO) prefetcht0 (PREFETCH_W + 8) * SIZE(BO) movapd %xmm10, -8 * SIZE(BO) movapd %xmm2, -6 * SIZE(BO) movapd %xmm11, -4 * SIZE(BO) movapd %xmm3, -2 * SIZE(BO) prefetcht0 (PREFETCH_W + 16) * SIZE(BO) movapd %xmm12, 0 * SIZE(BO) movapd %xmm4, 2 * SIZE(BO) movapd %xmm13, 4 * SIZE(BO) movapd %xmm5, 6 * SIZE(BO) prefetcht0 (PREFETCH_W + 24) * SIZE(BO) movapd %xmm14, 8 * SIZE(BO) movapd %xmm6, 10 * SIZE(BO) movapd %xmm15, 12 * SIZE(BO) movapd %xmm7, 14 * SIZE(BO) subq $-16 * SIZE, B subq $-32 * SIZE, BO subq $1, %rax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movapd -16 * SIZE(B), %xmm0 movapd -14 * SIZE(B), %xmm1 movddup %xmm0, %xmm8 unpckhpd %xmm0, %xmm0 movddup %xmm1, %xmm9 unpckhpd %xmm1, %xmm1 movapd %xmm8, -16 * SIZE(BO) movapd %xmm0, -14 * SIZE(BO) movapd %xmm9, -12 * SIZE(BO) movapd %xmm1, -10 * SIZE(BO) addq $4 * SIZE, B addq $8 * SIZE, BO subq $1, %rax jne .L04 ALIGN_4 .L10: leaq (PREFETCH_R + 0) * SIZE(B), BB #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 4), C #endif movq M, I sarq $2, I # i = (m >> 2) jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif prefetcht2 0 * SIZE(BB) pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 prefetcht2 3 * SIZE(CO1) pxor %xmm12, %xmm12 prefetcht2 3 * SIZE(CO2) pxor %xmm13, %xmm13 prefetcht2 3 * SIZE(CO1, LDC, 2) pxor %xmm14, %xmm14 prefetcht2 3 * SIZE(CO2, LDC, 2) pxor %xmm15, %xmm15 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 
pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 subq $-8 * SIZE, BB #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm2, %xmm10 movapd -16 * SIZE(AO), %xmm0 addpd %xmm3, %xmm14 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -14 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 movapd -14 * SIZE(BO), %xmm4 addpd %xmm5, %xmm15 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 movapd -12 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 movapd -10 * SIZE(BO), %xmm4 addpd %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 movapd -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm10 addpd %xmm3, %xmm14 movapd -8 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -10 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 addpd %xmm5, %xmm15 movapd -6 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 movapd -4 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 movapd -2 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) movapd -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm10 addpd %xmm3, %xmm14 movapd 0 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -6 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 addpd %xmm5, %xmm15 movapd 2 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 movapd 4 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 movapd 6 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 movapd -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm10 addpd %xmm3, %xmm14 movapd 8 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -2 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 addpd %xmm5, %xmm15 movapd 10 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 addq $32 * SIZE, BO mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 movapd -20 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 subq $-16 * SIZE, AO mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 movapd -18 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 subq $1, %rax mulpd %xmm1, %xmm5 BRANCH jg .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L19 ALIGN_4 .L16: movapd -16 * SIZE(AO), %xmm0 addpd %xmm2, %xmm10 movapd -16 * SIZE(BO), %xmm2 addpd %xmm3, %xmm14 movapd %xmm2, %xmm3 movapd -14 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 movapd -14 * SIZE(BO), %xmm4 addpd %xmm5, %xmm15 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 movapd -12 * SIZE(BO), %xmm2 addpd %xmm3, %xmm12 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 movapd -10 * SIZE(BO), %xmm4 addpd %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 subq $1, %rax BRANCH jg .L16 ALIGN_4 .L19: addpd %xmm2, %xmm10 addpd %xmm3, %xmm14 addpd %xmm4, %xmm11 addpd %xmm5, %xmm15 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq 
$4, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm10, %xmm2 unpcklpd %xmm11, %xmm10 unpckhpd %xmm11, %xmm2 movapd %xmm12, %xmm4 unpcklpd %xmm13, %xmm12 unpckhpd %xmm13, %xmm4 movapd %xmm14, %xmm6 unpcklpd %xmm15, %xmm14 unpckhpd %xmm15, %xmm6 movapd -16 * SIZE(B), %xmm9 movapd -14 * SIZE(B), %xmm11 movapd -12 * SIZE(B), %xmm13 movapd -10 * SIZE(B), %xmm15 movapd -8 * SIZE(B), %xmm1 movapd -6 * SIZE(B), %xmm3 movapd -4 * SIZE(B), %xmm5 movapd -2 * SIZE(B), %xmm7 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm0, %xmm13 subpd %xmm2, %xmm15 subpd %xmm12, %xmm1 subpd %xmm14, %xmm3 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 movapd -8 * SIZE(AO), %xmm4 movapd -6 * SIZE(AO), %xmm5 movapd -4 * SIZE(AO), %xmm6 movapd -2 * SIZE(AO), %xmm7 subpd %xmm8, %xmm0 subpd %xmm12, %xmm1 subpd %xmm9, %xmm2 subpd %xmm13, %xmm3 subpd %xmm10, %xmm4 subpd %xmm14, %xmm5 subpd %xmm11, %xmm6 subpd %xmm15, %xmm7 #endif #ifdef LN movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 movddup -2 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm1 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm3 movddup -3 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm15 movddup -4 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm11 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 movddup -7 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm15 movddup -8 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm11 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -12 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm11 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 movddup -15 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm15 movddup -14 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm1 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm3 movddup -13 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm5 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm7 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -10 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm1 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm3 movddup -9 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm5 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm7 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 movddup -5 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm5 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm7 movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 #endif #ifdef RN movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 movddup -15 * SIZE(B), %xmm10 movapd 
%xmm10, %xmm12 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm2 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm3 movddup -14 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm4 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm5 movddup -13 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm6 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm7 movddup -11 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -10 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm4 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm5 movddup -9 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm6 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm7 movddup -6 * SIZE(B), %xmm8 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm5 movddup -5 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm6 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm7 movddup -1 * SIZE(B), %xmm8 mulpd %xmm8, %xmm6 mulpd %xmm8, %xmm7 #endif #ifdef RT movddup -1 * SIZE(B), %xmm8 mulpd %xmm8, %xmm6 mulpd %xmm8, %xmm7 movddup -2 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm4 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm5 movddup -3 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm2 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm3 movddup -4 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm0 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm1 movddup -6 * SIZE(B), %xmm8 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm5 movddup -7 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm2 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm3 movddup -8 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm0 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm1 movddup -11 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -12 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm0 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm1 movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movsd %xmm5, 3 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movhpd %xmm1, 2 * SIZE(CO2) movhpd %xmm5, 3 * SIZE(CO2) movsd %xmm11, 0 * SIZE(CO1, LDC, 2) movsd %xmm15, 1 * SIZE(CO1, LDC, 2) movsd %xmm3, 2 * SIZE(CO1, LDC, 2) movsd %xmm7, 3 * SIZE(CO1, LDC, 2) movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movsd %xmm3, 2 * SIZE(CO2) movhpd %xmm3, 3 * SIZE(CO2) movsd %xmm4, 0 * SIZE(CO1, LDC, 2) movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) movsd %xmm5, 2 * SIZE(CO1, LDC, 2) movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) movsd %xmm6, 0 * SIZE(CO2, LDC, 2) movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) movsd %xmm7, 2 * SIZE(CO2, LDC, 2) movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movapd %xmm11, -14 * SIZE(B) movapd %xmm13, -12 * SIZE(B) movapd %xmm15, -10 * SIZE(B) movapd %xmm1, -8 * SIZE(B) movapd %xmm3, -6 * SIZE(B) movapd %xmm5, -4 * SIZE(B) movapd %xmm7, -2 * SIZE(B) movddup %xmm9, %xmm8 SHUFPD_3 %xmm9, %xmm9 movddup %xmm11, %xmm10 SHUFPD_3 %xmm11, %xmm11 movddup %xmm13, %xmm12 SHUFPD_3 %xmm13, %xmm13 movddup %xmm15, 
%xmm14 SHUFPD_3 %xmm15, %xmm15 movddup %xmm1, %xmm0 SHUFPD_3 %xmm1, %xmm1 movddup %xmm3, %xmm2 SHUFPD_3 %xmm3, %xmm3 movddup %xmm5, %xmm4 SHUFPD_3 %xmm5, %xmm5 movddup %xmm7, %xmm6 SHUFPD_3 %xmm7, %xmm7 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm10, -12 * SIZE(BO) movapd %xmm11, -10 * SIZE(BO) movapd %xmm12, -8 * SIZE(BO) movapd %xmm13, -6 * SIZE(BO) movapd %xmm14, -4 * SIZE(BO) movapd %xmm15, -2 * SIZE(BO) movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) movapd %xmm2, 4 * SIZE(BO) movapd %xmm3, 6 * SIZE(BO) movapd %xmm4, 8 * SIZE(BO) movapd %xmm5, 10 * SIZE(BO) movapd %xmm6, 12 * SIZE(BO) movapd %xmm7, 14 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm1, -14 * SIZE(AO) movapd %xmm2, -12 * SIZE(AO) movapd %xmm3, -10 * SIZE(AO) movapd %xmm4, -8 * SIZE(AO) movapd %xmm5, -6 * SIZE(AO) movapd %xmm6, -4 * SIZE(AO) movapd %xmm7, -2 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $16 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L11 ALIGN_4 .L20: testq $3, M je .L39 testq $2, M je .L30 ALIGN_4 .L21: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 movapd -14 * SIZE(AO), %xmm0 movapd -8 * SIZE(BO), %xmm2 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm4 movapd -2 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 movapd -12 * SIZE(AO), %xmm0 movapd 0 * SIZE(BO), %xmm2 movapd 2 * SIZE(BO), %xmm3 movapd 4 * SIZE(BO), %xmm4 movapd 6 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 movapd -10 * SIZE(AO), %xmm0 movapd 8 * SIZE(BO), %xmm2 movapd 10 * SIZE(BO), %xmm3 movapd 12 * SIZE(BO), %xmm4 movapd 14 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 subq $ -8 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L29 ALIGN_4 .L26: movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 addpd %xmm2, %xmm8 addpd 
%xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax jne .L26 ALIGN_4 .L29: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm10, %xmm2 unpcklpd %xmm11, %xmm10 unpckhpd %xmm11, %xmm2 movapd -16 * SIZE(B), %xmm9 movapd -14 * SIZE(B), %xmm11 movapd -12 * SIZE(B), %xmm13 movapd -10 * SIZE(B), %xmm15 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm0, %xmm13 subpd %xmm2, %xmm15 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm2 movapd -12 * SIZE(AO), %xmm4 movapd -10 * SIZE(AO), %xmm6 subpd %xmm8, %xmm0 subpd %xmm9, %xmm2 subpd %xmm10, %xmm4 subpd %xmm11, %xmm6 #endif #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -14 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm11 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 movddup -15 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm15 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 #endif #ifdef RN movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(B), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -14 * SIZE(B), %xmm10 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm4 movddup -13 * SIZE(B), %xmm11 mulpd %xmm0, %xmm11 subpd %xmm11, %xmm6 movddup -11 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 movddup -10 * SIZE(B), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm4 movddup -9 * SIZE(B), %xmm10 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm6 movddup -6 * SIZE(B), %xmm8 mulpd %xmm8, %xmm4 movddup -5 * SIZE(B), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm6 movddup -1 * SIZE(B), %xmm8 mulpd %xmm8, %xmm6 #endif #ifdef RT movddup -1 * SIZE(B), %xmm8 mulpd %xmm8, %xmm6 movddup -2 * SIZE(B), %xmm9 mulpd %xmm6, %xmm9 subpd %xmm9, %xmm4 movddup -3 * SIZE(B), %xmm10 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm2 movddup -4 * SIZE(B), %xmm11 mulpd %xmm6, %xmm11 subpd %xmm11, %xmm0 movddup -6 * SIZE(B), %xmm8 mulpd %xmm8, %xmm4 movddup -7 * SIZE(B), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm2 movddup -8 * SIZE(B), %xmm10 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm0 movddup -11 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 movddup -12 * SIZE(B), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movsd %xmm11, 0 * SIZE(CO1, LDC, 2) movsd %xmm15, 1 * SIZE(CO1, LDC, 2) movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movsd %xmm4, 0 * SIZE(CO1, LDC, 2) movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) movsd %xmm6, 0 * SIZE(CO2, LDC, 2) movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movapd %xmm11, -14 * SIZE(B) movapd %xmm13, -12 * SIZE(B) movapd %xmm15, -10 * SIZE(B) movddup %xmm9, %xmm8 SHUFPD_3 %xmm9, 
%xmm9 movddup %xmm11, %xmm10 SHUFPD_3 %xmm11, %xmm11 movddup %xmm13, %xmm12 SHUFPD_3 %xmm13, %xmm13 movddup %xmm15, %xmm14 SHUFPD_3 %xmm15, %xmm15 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm10, -12 * SIZE(BO) movapd %xmm11, -10 * SIZE(BO) movapd %xmm12, -8 * SIZE(BO) movapd %xmm13, -6 * SIZE(BO) movapd %xmm14, -4 * SIZE(BO) movapd %xmm15, -2 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm2, -14 * SIZE(AO) movapd %xmm4, -12 * SIZE(AO) movapd %xmm6, -10 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: testq $1, M je .L39 ALIGN_4 .L31: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -16 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 movsd -14 * SIZE(BO), %xmm3 movsd -12 * SIZE(BO), %xmm4 movsd -10 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 movsd -15 * SIZE(AO), %xmm0 movsd -8 * SIZE(BO), %xmm2 movsd -6 * SIZE(BO), %xmm3 movsd -4 * SIZE(BO), %xmm4 movsd -2 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 movsd -14 * SIZE(AO), %xmm0 movsd 0 * SIZE(BO), %xmm2 movsd 2 * SIZE(BO), %xmm3 movsd 4 * SIZE(BO), %xmm4 movsd 6 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 movsd -13 * SIZE(AO), %xmm0 movsd 8 * SIZE(BO), %xmm2 movsd 10 * SIZE(BO), %xmm3 movsd 12 * SIZE(BO), %xmm4 movsd 14 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 subq $ -4 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: movsd -16 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 movsd -14 * SIZE(BO), %xmm3 movsd -12 * SIZE(BO), %xmm4 movsd -10 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 addq $1 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(B), %xmm12 movsd -15 * 
SIZE(B), %xmm13 movsd -14 * SIZE(B), %xmm14 movsd -13 * SIZE(B), %xmm15 #else movsd -16 * SIZE(AO), %xmm12 movsd -15 * SIZE(AO), %xmm13 movsd -14 * SIZE(AO), %xmm14 movsd -13 * SIZE(AO), %xmm15 #endif subsd %xmm8, %xmm12 subsd %xmm9, %xmm13 subsd %xmm10, %xmm14 subsd %xmm11, %xmm15 #ifdef LN movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm12 mulsd %xmm8, %xmm13 mulsd %xmm8, %xmm14 mulsd %xmm8, %xmm15 #endif #ifdef LT movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm12 mulsd %xmm8, %xmm13 mulsd %xmm8, %xmm14 mulsd %xmm8, %xmm15 #endif #ifdef RN mulsd -16 * SIZE(B), %xmm12 movlpd -15 * SIZE(B), %xmm9 mulsd %xmm12, %xmm9 subsd %xmm9, %xmm13 movlpd -14 * SIZE(B), %xmm10 mulsd %xmm12, %xmm10 subsd %xmm10, %xmm14 movlpd -13 * SIZE(B), %xmm11 mulsd %xmm12, %xmm11 subsd %xmm11, %xmm15 mulsd -11 * SIZE(B), %xmm13 movlpd -10 * SIZE(B), %xmm9 mulsd %xmm13, %xmm9 subsd %xmm9, %xmm14 movlpd -9 * SIZE(B), %xmm10 mulsd %xmm13, %xmm10 subsd %xmm10, %xmm15 mulsd -6 * SIZE(B), %xmm14 movlpd -5 * SIZE(B), %xmm9 mulsd %xmm14, %xmm9 subsd %xmm9, %xmm15 mulsd -1 * SIZE(B), %xmm15 #endif #ifdef RT mulsd -1 * SIZE(B), %xmm15 movlpd -2 * SIZE(B), %xmm9 mulsd %xmm15, %xmm9 subsd %xmm9, %xmm14 movlpd -3 * SIZE(B), %xmm10 mulsd %xmm15, %xmm10 subsd %xmm10, %xmm13 movlpd -4 * SIZE(B), %xmm11 mulsd %xmm15, %xmm11 subsd %xmm11, %xmm12 mulsd -6 * SIZE(B), %xmm14 movlpd -7 * SIZE(B), %xmm9 mulsd %xmm14, %xmm9 subsd %xmm9, %xmm13 movlpd -8 * SIZE(B), %xmm10 mulsd %xmm14, %xmm10 subsd %xmm10, %xmm12 mulsd -11 * SIZE(B), %xmm13 movlpd -12 * SIZE(B), %xmm9 mulsd %xmm13, %xmm9 subsd %xmm9, %xmm12 mulsd -16 * SIZE(B), %xmm12 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm12, 0 * SIZE(CO1) movsd %xmm13, 0 * SIZE(CO2) movsd %xmm14, 0 * SIZE(CO1, LDC, 2) movsd %xmm15, 0 * SIZE(CO2, LDC, 2) #if defined(LN) || defined(LT) movsd %xmm12, -16 * SIZE(B) movsd %xmm13, -15 * SIZE(B) movsd %xmm14, -14 * SIZE(B) movsd %xmm15, -13 * SIZE(B) movsd %xmm12, -16 * SIZE(BO) movsd %xmm12, -15 * SIZE(BO) movsd %xmm13, -14 * SIZE(BO) movsd %xmm13, -13 * SIZE(BO) movsd %xmm14, -12 * SIZE(BO) movsd %xmm14, -11 * SIZE(BO) movsd %xmm15, -10 * SIZE(BO) movsd %xmm15, -9 * SIZE(BO) #else movsd %xmm12, -16 * SIZE(AO) movsd %xmm13, -15 * SIZE(AO) movsd %xmm14, -14 * SIZE(AO) movsd %xmm15, -13 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 4), B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif decq J # j -- jg .L01 ALIGN_4 .L40: testq $3, N je .L999 testq $2, N je .L80 ALIGN_4 .L41: /* Copying to Sub Buffer */ #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG leaq (, %rax, SIZE), %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L43 ALIGN_4 .L42: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup 
-14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -12 * SIZE(B), %xmm4 movddup -11 * SIZE(B), %xmm5 movddup -10 * SIZE(B), %xmm6 movddup -9 * SIZE(B), %xmm7 movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) movapd %xmm2, 4 * SIZE(BO) movapd %xmm3, 6 * SIZE(BO) movapd %xmm4, 8 * SIZE(BO) movapd %xmm5, 10 * SIZE(BO) movapd %xmm6, 12 * SIZE(BO) movapd %xmm7, 14 * SIZE(BO) addq $8 * SIZE, B addq $16 * SIZE, BO subq $1, %rax jne .L42 ALIGN_4 .L43: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L50 ALIGN_4 .L44: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) addq $2 * SIZE, B addq $4 * SIZE, BO decq %rax jne .L44 ALIGN_4 .L50: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 2), C #endif movq M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 #ifdef LN prefetcht2 -3 * SIZE(CO1) pxor %xmm12, %xmm12 prefetcht2 -3 * SIZE(CO2) pxor %xmm13, %xmm13 #else prefetcht2 3 * SIZE(CO1) pxor %xmm12, %xmm12 prefetcht2 3 * SIZE(CO2) pxor %xmm13, %xmm13 #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L55 ALIGN_4 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -14 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 movapd -12 * SIZE(AO), %xmm0 movapd -10 * SIZE(AO), %xmm1 movapd -12 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -10 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 movapd -8 * SIZE(AO), %xmm0 movapd -6 * SIZE(AO), %xmm1 movapd -8 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -6 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 movapd -4 * SIZE(AO), %xmm0 movapd -2 * SIZE(AO), %xmm1 movapd -4 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -2 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 subq $-16 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L59 ALIGN_4 .L56: movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -14 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 addq $4 * 
SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L56 ALIGN_4 .L59: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm12, %xmm4 unpcklpd %xmm13, %xmm12 unpckhpd %xmm13, %xmm4 movapd -16 * SIZE(B), %xmm9 movapd -14 * SIZE(B), %xmm13 movapd -12 * SIZE(B), %xmm1 movapd -10 * SIZE(B), %xmm5 subpd %xmm8, %xmm9 subpd %xmm0, %xmm13 subpd %xmm12, %xmm1 subpd %xmm4, %xmm5 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 subpd %xmm8, %xmm0 subpd %xmm12, %xmm1 subpd %xmm9, %xmm2 subpd %xmm13, %xmm3 #endif #ifdef LN movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 movddup -2 * SIZE(AO), %xmm10 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm1 movddup -3 * SIZE(AO), %xmm12 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm13 movddup -4 * SIZE(AO), %xmm14 mulpd %xmm5, %xmm14 subpd %xmm14, %xmm9 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 movddup -7 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm13 movddup -8 * SIZE(AO), %xmm12 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm9 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -12 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -14 * SIZE(AO), %xmm12 mulpd %xmm9, %xmm12 subpd %xmm12, %xmm1 movddup -13 * SIZE(AO), %xmm14 mulpd %xmm9, %xmm14 subpd %xmm14, %xmm5 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -10 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm1 movddup -9 * SIZE(AO), %xmm12 mulpd %xmm13, %xmm12 subpd %xmm12, %xmm5 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 movddup -5 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm5 movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 #endif #ifdef RN movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 movddup -15 * SIZE(B), %xmm9 movapd %xmm9, %xmm10 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm3 movddup -13 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 #endif #ifdef RT movddup -13 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -14 * SIZE(B), %xmm9 movapd %xmm9, %xmm10 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 mulpd %xmm3, %xmm10 subpd %xmm10, %xmm1 movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movsd %xmm5, 3 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movhpd %xmm1, 2 * SIZE(CO2) movhpd %xmm5, 3 * SIZE(CO2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movsd %xmm3, 2 * SIZE(CO2) movhpd %xmm3, 3 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movapd %xmm13, -14 * SIZE(B) movapd %xmm1, -12 * SIZE(B) movapd %xmm5, -10 * SIZE(B) movddup %xmm9, %xmm8 SHUFPD_3 %xmm9, %xmm9 movddup %xmm13, %xmm12 SHUFPD_3 %xmm13, %xmm13 movddup %xmm1, %xmm0 SHUFPD_3 %xmm1, %xmm1 movddup %xmm5, %xmm4 
SHUFPD_3 %xmm5, %xmm5 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm12, -12 * SIZE(BO) movapd %xmm13, -10 * SIZE(BO) movapd %xmm0, -8 * SIZE(BO) movapd %xmm1, -6 * SIZE(BO) movapd %xmm4, -4 * SIZE(BO) movapd %xmm5, -2 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm1, -14 * SIZE(AO) movapd %xmm2, -12 * SIZE(AO) movapd %xmm3, -10 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L51 ALIGN_4 .L60: testq $2, M je .L70 ALIGN_4 .L61: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L65 ALIGN_4 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 movapd -12 * SIZE(AO), %xmm0 movapd -10 * SIZE(AO), %xmm1 movapd -8 * SIZE(BO), %xmm2 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm4 movapd -2 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 subq $ -8 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jne .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L69 ALIGN_4 .L66: movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L66 ALIGN_4 .L69: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd -16 * SIZE(B), %xmm9 movapd -14 * SIZE(B), %xmm13 subpd %xmm8, %xmm9 subpd %xmm0, %xmm13 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm2 subpd %xmm8, %xmm0 subpd %xmm9, %xmm2 #endif #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -14 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 #endif #ifdef RN movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(B), %xmm9 mulpd %xmm0, 
%xmm9 subpd %xmm9, %xmm2 movddup -13 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 #endif #ifdef RT movddup -13 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 movddup -14 * SIZE(B), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movapd %xmm13, -14 * SIZE(B) movddup %xmm9, %xmm8 SHUFPD_3 %xmm9, %xmm9 movddup %xmm13, %xmm12 SHUFPD_3 %xmm13, %xmm13 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm12, -12 * SIZE(BO) movapd %xmm13, -10 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm2, -14 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L70: testq $1, M je .L79 ALIGN_4 .L71: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -16 * SIZE(AO), %xmm0 movsd -15 * SIZE(AO), %xmm1 movsd -16 * SIZE(BO), %xmm2 movsd -14 * SIZE(BO), %xmm3 movsd -12 * SIZE(BO), %xmm4 movsd -10 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm1, %xmm4 mulsd %xmm1, %xmm5 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 movsd -14 * SIZE(AO), %xmm0 movsd -13 * SIZE(AO), %xmm1 movsd -8 * SIZE(BO), %xmm2 movsd -6 * SIZE(BO), %xmm3 movsd -4 * SIZE(BO), %xmm4 movsd -2 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm1, %xmm4 mulsd %xmm1, %xmm5 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 subq $ -4 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: movsd -16 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 movsd -14 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L76 ALIGN_4 .L78: addsd %xmm10, %xmm8 addsd %xmm11, %xmm9 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(B), %xmm12 movsd -15 * SIZE(B), %xmm13 #else movsd -16 * SIZE(AO), %xmm12 movsd -15 * SIZE(AO), %xmm13 #endif subsd %xmm8, %xmm12 subsd %xmm9, %xmm13 #ifdef LN movsd -16 * SIZE(AO), %xmm8 mulsd 
%xmm8, %xmm12 mulsd %xmm8, %xmm13 #endif #ifdef LT movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm12 mulsd %xmm8, %xmm13 #endif #ifdef RN mulsd -16 * SIZE(B), %xmm12 movsd -15 * SIZE(B), %xmm9 mulsd %xmm12, %xmm9 subsd %xmm9, %xmm13 mulsd -13 * SIZE(B), %xmm13 #endif #ifdef RT mulsd -13 * SIZE(B), %xmm13 movlpd -14 * SIZE(B), %xmm9 mulsd %xmm13, %xmm9 subsd %xmm9, %xmm12 mulsd -16 * SIZE(B), %xmm12 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm12, 0 * SIZE(CO1) movsd %xmm13, 0 * SIZE(CO2) #if defined(LN) || defined(LT) movsd %xmm12, -16 * SIZE(B) movsd %xmm13, -15 * SIZE(B) movsd %xmm12, -16 * SIZE(BO) movsd %xmm12, -15 * SIZE(BO) movsd %xmm13, -14 * SIZE(BO) movsd %xmm13, -13 * SIZE(BO) #else movsd %xmm12, -16 * SIZE(AO) movsd %xmm13, -15 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L79: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 2), B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L80: testq $1, N je .L999 ALIGN_4 .L81: /* Copying to Sub Buffer */ #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG leaq (, %rax, SIZE), %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax jle .L83 ALIGN_4 .L82: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -12 * SIZE(B), %xmm4 movddup -11 * SIZE(B), %xmm5 movddup -10 * SIZE(B), %xmm6 movddup -9 * SIZE(B), %xmm7 movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) movapd %xmm2, 4 * SIZE(BO) movapd %xmm3, 6 * SIZE(BO) movapd %xmm4, 8 * SIZE(BO) movapd %xmm5, 10 * SIZE(BO) movapd %xmm6, 12 * SIZE(BO) movapd %xmm7, 14 * SIZE(BO) addq $ 8 * SIZE, B subq $-16 * SIZE, BO subq $1, %rax jne .L82 ALIGN_4 .L83: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax BRANCH jle .L90 ALIGN_4 .L84: movddup -16 * SIZE(B), %xmm0 movapd %xmm0, 0 * SIZE(BO) addq $1 * SIZE, B addq $2 * SIZE, BO subq $1, %rax jne .L84 ALIGN_4 .L90: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT subq LDC, C #endif movq C, CO1 # coffset1 = c #ifndef RT addq LDC, C #endif movq M, I sarq $2, I # i = (m >> 2) jle .L100 ALIGN_4 .L91: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 #ifdef LN prefetcht2 -3 * SIZE(CO1) #else prefetcht2 3 * SIZE(CO1) #endif pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L95 ALIGN_4 .L92: PREFETCH (PREFETCHSIZE + 
0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 movapd -12 * SIZE(AO), %xmm0 movapd -10 * SIZE(AO), %xmm1 movapd -14 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm10 addpd %xmm3, %xmm11 movapd -8 * SIZE(AO), %xmm0 movapd -6 * SIZE(AO), %xmm1 movapd -12 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 movapd -4 * SIZE(AO), %xmm0 movapd -2 * SIZE(AO), %xmm1 movapd -10 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm10 addpd %xmm3, %xmm11 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax jne .L92 ALIGN_4 .L95: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L99 ALIGN_4 .L96: movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax jg .L96 ALIGN_4 .L99: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm10 movapd -14 * SIZE(B), %xmm11 subpd %xmm8, %xmm10 subpd %xmm9, %xmm11 #else movapd -16 * SIZE(AO), %xmm10 movapd -14 * SIZE(AO), %xmm11 subpd %xmm8, %xmm10 subpd %xmm9, %xmm11 #endif #ifdef LN movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movapd %xmm11, %xmm9 unpckhpd %xmm9, %xmm9 movsd -1 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm9 movsd -2 * SIZE(AO), %xmm13 mulsd %xmm9, %xmm13 subsd %xmm13, %xmm11 movsd -3 * SIZE(AO), %xmm14 mulsd %xmm9, %xmm14 subsd %xmm14, %xmm8 movsd -4 * SIZE(AO), %xmm15 mulsd %xmm9, %xmm15 subsd %xmm15, %xmm10 movsd -6 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm11 movsd -7 * SIZE(AO), %xmm13 mulsd %xmm11, %xmm13 subsd %xmm13, %xmm8 movsd -8 * SIZE(AO), %xmm14 mulsd %xmm11, %xmm14 subsd %xmm14, %xmm10 movsd -11 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -12 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm10 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 unpcklpd %xmm8, %xmm10 unpcklpd %xmm9, %xmm11 #endif #ifdef LT movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movapd %xmm11, %xmm9 unpckhpd %xmm9, %xmm9 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 movsd -15 * SIZE(AO), %xmm13 mulsd %xmm10, %xmm13 subsd %xmm13, %xmm8 movsd -14 * SIZE(AO), %xmm14 mulsd %xmm10, %xmm14 subsd %xmm14, %xmm11 movsd -13 * SIZE(AO), %xmm15 mulsd %xmm10, %xmm15 subsd %xmm15, %xmm9 movsd -11 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -10 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm11 movsd -9 * SIZE(AO), %xmm14 mulsd %xmm8, %xmm14 subsd %xmm14, %xmm9 movsd -6 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm11 movsd -5 * SIZE(AO), %xmm13 mulsd %xmm11, %xmm13 subsd %xmm13, %xmm9 movsd -1 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm9 unpcklpd %xmm8, %xmm10 unpcklpd %xmm9, %xmm11 #endif #ifdef RN movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm10 mulpd %xmm8, %xmm11 #endif #ifdef RT movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm10 mulpd %xmm8, %xmm11 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm10, 0 * SIZE(CO1) 
movhpd %xmm10, 1 * SIZE(CO1) movsd %xmm11, 2 * SIZE(CO1) movhpd %xmm11, 3 * SIZE(CO1) #else movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) movsd %xmm11, 2 * SIZE(CO1) movhpd %xmm11, 3 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm10, -16 * SIZE(B) movapd %xmm11, -14 * SIZE(B) movddup %xmm10, %xmm8 SHUFPD_3 %xmm10, %xmm10 movddup %xmm11, %xmm9 SHUFPD_3 %xmm11, %xmm11 movapd %xmm8, -16 * SIZE(BO) movapd %xmm10, -14 * SIZE(BO) movapd %xmm9, -12 * SIZE(BO) movapd %xmm11, -10 * SIZE(BO) #else movapd %xmm10, -16 * SIZE(AO) movapd %xmm11, -14 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L91 ALIGN_4 .L100: testq $2, M je .L110 ALIGN_4 .L101: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L105 ALIGN_4 .L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 movapd -12 * SIZE(AO), %xmm0 movapd -10 * SIZE(AO), %xmm1 movapd -12 * SIZE(BO), %xmm2 movapd -10 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm10 addpd %xmm3, %xmm11 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax jne .L102 ALIGN_4 .L105: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L109 ALIGN_4 .L106: movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm8 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax jg .L106 ALIGN_4 .L109: addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm10, %xmm8 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm10 subpd %xmm8, %xmm10 #else movapd -16 * SIZE(AO), %xmm10 subpd %xmm8, %xmm10 #endif #ifdef LN movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movsd -13 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -14 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm10 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 unpcklpd %xmm8, %xmm10 #endif #ifdef LT movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 movsd -15 * SIZE(AO), %xmm13 mulsd %xmm10, %xmm13 subsd %xmm13, %xmm8 movsd -13 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 unpcklpd %xmm8, %xmm10 #endif #ifdef RN movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm10 #endif #ifdef RT movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm10 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * 
SIZE(CO1) #else movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm10, -16 * SIZE(B) movddup %xmm10, %xmm8 SHUFPD_3 %xmm10, %xmm10 movapd %xmm8, -16 * SIZE(BO) movapd %xmm10, -14 * SIZE(BO) #else movapd %xmm10, -16 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L110: testq $1, M je .L119 ALIGN_4 .L111: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L115 ALIGN_4 .L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -16 * SIZE(AO), %xmm0 movsd -15 * SIZE(AO), %xmm1 movsd -16 * SIZE(BO), %xmm2 movsd -14 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm1, %xmm3 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 movsd -14 * SIZE(AO), %xmm0 movsd -13 * SIZE(AO), %xmm1 movsd -12 * SIZE(BO), %xmm2 movsd -10 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm1, %xmm3 addsd %xmm2, %xmm10 addsd %xmm3, %xmm11 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax jne .L112 ALIGN_4 .L115: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: movsd -16 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax jg .L116 ALIGN_4 .L118: addsd %xmm10, %xmm8 addsd %xmm11, %xmm9 addsd %xmm9, %xmm8 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(B), %xmm10 subsd %xmm8, %xmm10 #else movsd -16 * SIZE(AO), %xmm10 subsd %xmm8, %xmm10 #endif #ifdef LN movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 #endif #ifdef LT movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 #endif #ifdef RN movsd -16 * SIZE(B), %xmm8 mulsd %xmm8, %xmm10 #endif #ifdef RT movsd -16 * SIZE(B), %xmm8 mulsd %xmm8, %xmm10 #endif #ifdef LN subq $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm10, 0 * SIZE(CO1) #else movsd %xmm10, 0 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movsd %xmm10, -16 * SIZE(B) movlpd %xmm10, -16 * SIZE(BO) movlpd %xmm10, -15 * SIZE(BO) #else movsd %xmm10, -16 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $1 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L119: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 1), B 
#endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq %r15, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_LT_4x4_penryn.S000066400000000000000000001663151313527062700225400ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define KK %rdx #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCH_R (8 * 4 + 0) #define PREFETCHSIZE (8 * 21 + 6) #define PREFETCH prefetcht0 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif movq OLD_M, M movq OLD_N, N movq OLD_K, K movq OLD_LDC, LDC movq OLD_OFFSET, KK subq $-16 * SIZE, A subq $-16 * SIZE, B leaq (, LDC, SIZE), LDC movq KK, OFFSET negq KK #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $2, J NOBRANCH jle .L40 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 4), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 2, %rax leaq (B, %rax), BB #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I # i = (m >> 2) NOBRANCH jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #else movq B, BO #endif prefetcht2 -16 * SIZE(BB) subq $-8 * SIZE, BB movaps -16 * SIZE(AO), %xmm0 pxor %xmm3, %xmm3 movaps -14 * SIZE(AO), %xmm1 pxor %xmm4, %xmm4 movaps -16 * SIZE(BO), %xmm2 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 #ifdef LN prefetcht0 -4 * SIZE(CO1) movapd %xmm4, %xmm8 movapd %xmm4, %xmm9 prefetcht0 -4 * SIZE(CO2) movapd %xmm4, %xmm10 movapd %xmm4, %xmm11 prefetcht0 -4 * SIZE(CO1, LDC, 2) movapd %xmm4, %xmm12 movapd %xmm4, %xmm13 prefetcht0 -4 * SIZE(CO2, LDC, 2) movapd %xmm4, %xmm14 movapd %xmm4, %xmm15 #else prefetcht0 3 * SIZE(CO1) movapd %xmm4, %xmm8 movapd %xmm4, %xmm9 prefetcht0 3 * SIZE(CO2) movapd %xmm4, %xmm10 movapd %xmm4, %xmm11 prefetcht0 3 * SIZE(CO1, LDC, 2) movapd %xmm4, %xmm12 movapd %xmm4, %xmm13 prefetcht0 3 * SIZE(CO2, LDC, 2) movapd %xmm4, %xmm14 movapd %xmm4, %xmm15 #endif #if defined(LT) || 
defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax NOBRANCH jle .L15 ALIGN_3 .L12: addpd %xmm3, %xmm11 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps -12 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps -10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps -8 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -6 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps -6 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps -4 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -2 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps -2 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps 0 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 2 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps 2 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps 4 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 6 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps 6 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps 8 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 10 * SIZE(AO), 
%xmm1 addpd %xmm3, %xmm11 movaps 10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps 12 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 14 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps 14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps 16 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 subq $-32 * SIZE, AO addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -16 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -14 * SIZE(AO), %xmm1 subq $-32 * SIZE, BO subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addpd %xmm3, %xmm11 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps -12 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: #if defined(LN) || defined(RT) movq KK, %rax subq $4, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif addpd %xmm3, %xmm11 addpd %xmm4, %xmm15 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movapd %xmm8, %xmm0 movsd %xmm9, %xmm8 movsd %xmm0, %xmm9 movapd %xmm10, %xmm0 movsd %xmm11, %xmm10 movsd %xmm0, %xmm11 movapd %xmm12, %xmm0 movsd %xmm13, %xmm12 movsd %xmm0, %xmm13 movapd %xmm14, %xmm0 movsd %xmm15, %xmm14 movsd %xmm0, %xmm15 #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm10, %xmm2 unpcklpd %xmm11, %xmm10 unpckhpd %xmm11, %xmm2 movapd %xmm12, %xmm4 unpcklpd %xmm13, %xmm12 unpckhpd %xmm13, %xmm4 movapd %xmm14, %xmm6 unpcklpd %xmm15, %xmm14 unpckhpd %xmm15, %xmm6 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 movapd -12 * SIZE(BO), %xmm13 movapd -10 * SIZE(BO), %xmm15 movapd -8 * SIZE(BO), %xmm1 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm5 movapd -2 * SIZE(BO), %xmm7 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm0, %xmm13 subpd %xmm2, %xmm15 subpd %xmm12, %xmm1 subpd %xmm14, %xmm3 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 movapd -8 * SIZE(AO), %xmm4 movapd -6 * SIZE(AO), %xmm5 movapd -4 * SIZE(AO), %xmm6 movapd -2 * SIZE(AO), %xmm7 subpd %xmm8, %xmm0 subpd %xmm12, %xmm1 subpd %xmm9, %xmm2 subpd %xmm13, %xmm3 subpd %xmm10, %xmm4 subpd %xmm14, %xmm5 subpd 
%xmm11, %xmm6 subpd %xmm15, %xmm7 #endif #ifdef LN movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 movddup -2 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm1 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm3 movddup -3 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm15 movddup -4 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm11 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 movddup -7 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm15 movddup -8 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm11 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -12 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm11 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 movddup -15 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm15 movddup -14 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm1 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm3 movddup -13 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm5 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm7 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -10 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm1 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm3 movddup -9 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm5 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm7 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 movddup -5 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm5 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm7 movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 movddup -15 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm2 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm3 movddup -14 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm4 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm5 movddup -13 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm6 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm7 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -10 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm4 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm5 movddup -9 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm6 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm7 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm5 movddup -5 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm6 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm7 movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 mulpd %xmm8, %xmm7 #endif #ifdef RT movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 mulpd %xmm8, %xmm7 movddup -2 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm4 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm5 movddup -3 * SIZE(BO), %xmm10 
movapd %xmm10, %xmm12 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm2 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm3 movddup -4 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm0 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm1 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm5 movddup -7 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm2 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm3 movddup -8 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm0 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm1 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -12 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm0 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm1 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movsd %xmm5, 3 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movhpd %xmm1, 2 * SIZE(CO2) movhpd %xmm5, 3 * SIZE(CO2) movsd %xmm11, 0 * SIZE(CO1, LDC, 2) movsd %xmm15, 1 * SIZE(CO1, LDC, 2) movsd %xmm3, 2 * SIZE(CO1, LDC, 2) movsd %xmm7, 3 * SIZE(CO1, LDC, 2) movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movsd %xmm3, 2 * SIZE(CO2) movhpd %xmm3, 3 * SIZE(CO2) movsd %xmm4, 0 * SIZE(CO1, LDC, 2) movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) movsd %xmm5, 2 * SIZE(CO1, LDC, 2) movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) movsd %xmm6, 0 * SIZE(CO2, LDC, 2) movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) movsd %xmm7, 2 * SIZE(CO2, LDC, 2) movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm11, -14 * SIZE(BO) movapd %xmm13, -12 * SIZE(BO) movapd %xmm15, -10 * SIZE(BO) movapd %xmm1, -8 * SIZE(BO) movapd %xmm3, -6 * SIZE(BO) movapd %xmm5, -4 * SIZE(BO) movapd %xmm7, -2 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm1, -14 * SIZE(AO) movapd %xmm2, -12 * SIZE(AO) movapd %xmm3, -10 * SIZE(AO) movapd %xmm4, -8 * SIZE(AO) movapd %xmm5, -6 * SIZE(AO) movapd %xmm6, -4 * SIZE(AO) movapd %xmm7, -2 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- BRANCH jg .L11 ALIGN_4 .L20: testq $2, M BRANCH jle .L30 ALIGN_4 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm3, %xmm3 movaps -16 * SIZE(BO), %xmm2 pxor %xmm5, %xmm5 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_4 .L22: addpd %xmm3, %xmm11 movaps -14 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd 
%xmm0, %xmm7 addpd %xmm2, %xmm9 movaps -12 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -14 * SIZE(AO), %xmm0 addpd %xmm3, %xmm11 movaps -10 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 addpd %xmm2, %xmm9 movaps -8 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 addpd %xmm3, %xmm11 movaps -6 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 addpd %xmm2, %xmm9 movaps -4 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -10 * SIZE(AO), %xmm0 addpd %xmm3, %xmm11 movaps -2 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 subq $ -8 * SIZE, AO addpd %xmm2, %xmm9 movaps 0 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: addpd %xmm3, %xmm11 movaps -14 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 addpd %xmm2, %xmm9 movaps -12 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif addpd %xmm3, %xmm11 addpd %xmm5, %xmm10 movapd %xmm8, %xmm0 movsd %xmm9, %xmm8 movsd %xmm0, %xmm9 movapd %xmm10, %xmm0 movsd %xmm11, %xmm10 movsd %xmm0, %xmm11 #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm10, %xmm2 unpcklpd %xmm11, %xmm10 unpckhpd %xmm11, %xmm2 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 movapd -12 * SIZE(BO), %xmm13 movapd -10 * SIZE(BO), %xmm15 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm0, %xmm13 subpd %xmm2, %xmm15 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm2 movapd -12 * SIZE(AO), %xmm4 movapd -10 * SIZE(AO), %xmm6 subpd %xmm8, %xmm0 subpd %xmm9, %xmm2 subpd %xmm10, %xmm4 subpd %xmm11, %xmm6 #endif #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -14 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm11 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 movddup -15 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm15 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -14 * SIZE(BO), %xmm10 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm4 movddup -13 * SIZE(BO), %xmm11 mulpd %xmm0, %xmm11 subpd %xmm11, %xmm6 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 movddup -10 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm4 movddup -9 
* SIZE(BO), %xmm10 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm6 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 movddup -5 * SIZE(BO), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm6 movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 #endif #ifdef RT movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 movddup -2 * SIZE(BO), %xmm9 mulpd %xmm6, %xmm9 subpd %xmm9, %xmm4 movddup -3 * SIZE(BO), %xmm10 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm2 movddup -4 * SIZE(BO), %xmm11 mulpd %xmm6, %xmm11 subpd %xmm11, %xmm0 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 movddup -7 * SIZE(BO), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm2 movddup -8 * SIZE(BO), %xmm10 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm0 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 movddup -12 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movsd %xmm11, 0 * SIZE(CO1, LDC, 2) movsd %xmm15, 1 * SIZE(CO1, LDC, 2) movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movsd %xmm4, 0 * SIZE(CO1, LDC, 2) movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) movsd %xmm6, 0 * SIZE(CO2, LDC, 2) movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm11, -14 * SIZE(BO) movapd %xmm13, -12 * SIZE(BO) movapd %xmm15, -10 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm2, -14 * SIZE(AO) movapd %xmm4, -12 * SIZE(AO) movapd %xmm6, -10 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: testq $1, M BRANCH jle .L39 ALIGN_4 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movsd -16 * SIZE(AO), %xmm0 movaps -16 * SIZE(BO), %xmm2 movaps -14 * SIZE(BO), %xmm3 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -15 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 addpd %xmm3, %xmm9 movaps -10 * SIZE(BO), %xmm3 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm10 movaps -8 * SIZE(BO), %xmm2 addpd %xmm3, %xmm11 movaps -6 * SIZE(BO), %xmm3 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -13 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -4 * SIZE(BO), %xmm2 addpd %xmm3, %xmm9 movaps -2 * SIZE(BO), %xmm3 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -12 * SIZE(AO), %xmm0 addpd %xmm2, %xmm10 movaps 0 * SIZE(BO), %xmm2 addpd %xmm3, %xmm11 movaps 2 * SIZE(BO), %xmm3 subq $ -4 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, 
%rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -15 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 addpd %xmm3, %xmm9 movaps -10 * SIZE(BO), %xmm3 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #endif addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(LT) movaps -16 * SIZE(BO), %xmm12 movaps -14 * SIZE(BO), %xmm13 #else movaps -16 * SIZE(AO), %xmm12 movaps -14 * SIZE(AO), %xmm13 #endif subpd %xmm8, %xmm12 subpd %xmm9, %xmm13 #if defined(RN) || defined(RT) movhlps %xmm13, %xmm15 movsd %xmm13, %xmm14 movhlps %xmm12, %xmm13 movsd %xmm12, %xmm12 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm12 mulpd %xmm8, %xmm13 #endif #ifdef RN mulsd -16 * SIZE(BO), %xmm12 movlpd -15 * SIZE(BO), %xmm9 mulsd %xmm12, %xmm9 subsd %xmm9, %xmm13 movlpd -14 * SIZE(BO), %xmm10 mulsd %xmm12, %xmm10 subsd %xmm10, %xmm14 movlpd -13 * SIZE(BO), %xmm11 mulsd %xmm12, %xmm11 subsd %xmm11, %xmm15 mulsd -11 * SIZE(BO), %xmm13 movlpd -10 * SIZE(BO), %xmm9 mulsd %xmm13, %xmm9 subsd %xmm9, %xmm14 movlpd -9 * SIZE(BO), %xmm10 mulsd %xmm13, %xmm10 subsd %xmm10, %xmm15 mulsd -6 * SIZE(BO), %xmm14 movlpd -5 * SIZE(BO), %xmm9 mulsd %xmm14, %xmm9 subsd %xmm9, %xmm15 mulsd -1 * SIZE(BO), %xmm15 #endif #ifdef RT mulsd -1 * SIZE(BO), %xmm15 movlpd -2 * SIZE(BO), %xmm9 mulsd %xmm15, %xmm9 subsd %xmm9, %xmm14 movlpd -3 * SIZE(BO), %xmm10 mulsd %xmm15, %xmm10 subsd %xmm10, %xmm13 movlpd -4 * SIZE(BO), %xmm11 mulsd %xmm15, %xmm11 subsd %xmm11, %xmm12 mulsd -6 * SIZE(BO), %xmm14 movlpd -7 * SIZE(BO), %xmm9 mulsd %xmm14, %xmm9 subsd %xmm9, %xmm13 movlpd -8 * SIZE(BO), %xmm10 mulsd %xmm14, %xmm10 subsd %xmm10, %xmm12 mulsd -11 * SIZE(BO), %xmm13 movlpd -12 * SIZE(BO), %xmm9 mulsd %xmm13, %xmm9 subsd %xmm9, %xmm12 mulsd -16 * SIZE(BO), %xmm12 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm12, 0 * SIZE(CO1) movhps %xmm12, 0 * SIZE(CO2) movsd %xmm13, 0 * SIZE(CO1, LDC, 2) movhps %xmm13, 0 * SIZE(CO2, LDC, 2) movaps %xmm12, -16 * SIZE(BO) movaps %xmm13, -14 * SIZE(BO) #else movsd %xmm12, 0 * SIZE(CO1) movsd %xmm13, 0 * SIZE(CO2) movsd %xmm14, 0 * SIZE(CO1, LDC, 2) movsd %xmm15, 0 * SIZE(CO2, LDC, 2) movsd %xmm12, -16 * SIZE(AO) movsd %xmm13, -15 * SIZE(AO) movsd %xmm14, -14 * SIZE(AO) movsd %xmm15, -13 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif subq $1, J BRANCH jg .L01 ALIGN_4 .L40: testq $2, N BRANCH jle .L80 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif 
movq K, %rax salq $BASE_SHIFT + 1, %rax leaq (B, %rax), BB #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I # i = (m >> 2) NOBRANCH jle .L60 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #else movq B, BO #endif prefetcht2 -16 * SIZE(BB) subq $-4 * SIZE, BB movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 movaps -16 * SIZE(BO), %xmm2 #ifdef LN prefetcht0 -4 * SIZE(CO1) pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 prefetcht0 -4 * SIZE(CO2) pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 #else prefetcht0 3 * SIZE(CO1) pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 prefetcht0 3 * SIZE(CO2) pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_4 .L52: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -14 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -6 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -12 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -2 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -10 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 2 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -8 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax BRANCH jg .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -14 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_4 .L58: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif movapd %xmm8, %xmm0 movsd %xmm9, %xmm8 movsd %xmm0, %xmm9 movapd %xmm12, %xmm0 movsd %xmm13, %xmm12 movsd %xmm0, %xmm13 #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm12, %xmm4 unpcklpd %xmm13, %xmm12 unpckhpd %xmm13, %xmm4 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm13 movapd -12 * SIZE(BO), %xmm1 movapd -10 * SIZE(BO), %xmm5 subpd %xmm8, %xmm9 subpd %xmm0, %xmm13 subpd %xmm12, %xmm1 subpd %xmm4, %xmm5 #else movapd -16 * 
SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 subpd %xmm8, %xmm0 subpd %xmm12, %xmm1 subpd %xmm9, %xmm2 subpd %xmm13, %xmm3 #endif #ifdef LN movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 movddup -2 * SIZE(AO), %xmm10 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm1 movddup -3 * SIZE(AO), %xmm12 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm13 movddup -4 * SIZE(AO), %xmm14 mulpd %xmm5, %xmm14 subpd %xmm14, %xmm9 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 movddup -7 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm13 movddup -8 * SIZE(AO), %xmm12 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm9 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -12 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -14 * SIZE(AO), %xmm12 mulpd %xmm9, %xmm12 subpd %xmm12, %xmm1 movddup -13 * SIZE(AO), %xmm14 mulpd %xmm9, %xmm14 subpd %xmm14, %xmm5 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -10 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm1 movddup -9 * SIZE(AO), %xmm12 mulpd %xmm13, %xmm12 subpd %xmm12, %xmm5 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 movddup -5 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm5 movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 movddup -15 * SIZE(BO), %xmm9 movapd %xmm9, %xmm10 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm3 movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 #endif #ifdef RT movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -14 * SIZE(BO), %xmm9 movapd %xmm9, %xmm10 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 mulpd %xmm3, %xmm10 subpd %xmm10, %xmm1 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movsd %xmm5, 3 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movhpd %xmm1, 2 * SIZE(CO2) movhpd %xmm5, 3 * SIZE(CO2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movsd %xmm3, 2 * SIZE(CO2) movhpd %xmm3, 3 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm13, -14 * SIZE(BO) movapd %xmm1, -12 * SIZE(BO) movapd %xmm5, -10 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm1, -14 * SIZE(AO) movapd %xmm2, -12 * SIZE(AO) movapd %xmm3, -10 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L51 ALIGN_4 .L60: testq $2, M BRANCH jle .L70 ALIGN_4 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 movaps -16 * 
SIZE(BO), %xmm2 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_4 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 addpd %xmm7, %xmm8 movaps -14 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 addpd %xmm2, %xmm11 addpd %xmm7, %xmm10 movaps -12 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -10 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 addpd %xmm7, %xmm8 movaps -10 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -8 * SIZE(AO), %xmm0 addpd %xmm2, %xmm11 addpd %xmm7, %xmm10 movaps -8 * SIZE(BO), %xmm2 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 addpd %xmm7, %xmm8 movaps -14 * SIZE(BO), %xmm2 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_4 .L68: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 movapd %xmm8, %xmm0 movsd %xmm9, %xmm8 movsd %xmm0, %xmm9 #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm13 subpd %xmm8, %xmm9 subpd %xmm0, %xmm13 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm2 subpd %xmm8, %xmm0 subpd %xmm9, %xmm2 #endif #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -14 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 #endif #ifdef RT movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 movddup -14 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm13, -14 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm2, -14 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L70: testq $1, M BRANCH jle .L79 ALIGN_4 #ifdef LN movq K, %rax 
salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movsd -16 * SIZE(AO), %xmm0 movaps -16 * SIZE(BO), %xmm2 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -15 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -14 * SIZE(BO), %xmm2 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 movaps -12 * SIZE(BO), %xmm2 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -13 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -10 * SIZE(BO), %xmm2 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -12 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 movaps -8 * SIZE(BO), %xmm2 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -15 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -14 * SIZE(BO), %xmm2 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L76 ALIGN_4 .L78: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif addpd %xmm9, %xmm8 movhlps %xmm8, %xmm9 #if defined(LN) || defined(LT) movsd -16 * SIZE(BO), %xmm12 movsd -15 * SIZE(BO), %xmm13 #else movsd -16 * SIZE(AO), %xmm12 movsd -15 * SIZE(AO), %xmm13 #endif subsd %xmm8, %xmm12 subsd %xmm9, %xmm13 #ifdef LN movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm12 mulsd %xmm8, %xmm13 #endif #ifdef LT movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm12 mulsd %xmm8, %xmm13 #endif #ifdef RN mulsd -16 * SIZE(BO), %xmm12 movsd -15 * SIZE(BO), %xmm9 mulsd %xmm12, %xmm9 subsd %xmm9, %xmm13 mulsd -13 * SIZE(BO), %xmm13 #endif #ifdef RT mulsd -13 * SIZE(BO), %xmm13 movlpd -14 * SIZE(BO), %xmm9 mulsd %xmm13, %xmm9 subsd %xmm9, %xmm12 mulsd -16 * SIZE(BO), %xmm12 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm12, 0 * SIZE(CO1) movsd %xmm13, 0 * SIZE(CO2) #if defined(LN) || defined(LT) movsd %xmm12, -16 * SIZE(BO) movsd %xmm13, -15 * SIZE(BO) #else movsd %xmm12, -16 * SIZE(AO) movsd %xmm13, -15 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L79: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L80: testq $1, N BRANCH jle .L999 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I # i = (m >> 2) NOBRANCH jle .L100 ALIGN_4 .L91: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || 
defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 movsd -16 * SIZE(BO), %xmm2 #ifdef LN prefetcht0 -4 * SIZE(CO1) #else prefetcht0 3 * SIZE(CO1) #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L95 ALIGN_4 .L92: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -15 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps -10 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -14 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps -6 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -13 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps -2 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -12 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps 2 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 subq $-16 * SIZE, AO subq $ -4 * SIZE, BO subq $1, %rax BRANCH jg .L92 ALIGN_4 .L95: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -15 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps -10 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 addq $4 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L96 ALIGN_4 .L98: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm10 movapd -14 * SIZE(BO), %xmm11 subpd %xmm8, %xmm10 subpd %xmm12, %xmm11 #else movapd -16 * SIZE(AO), %xmm10 movapd -14 * SIZE(AO), %xmm11 subpd %xmm8, %xmm10 subpd %xmm12, %xmm11 #endif #ifdef LN movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movapd %xmm11, %xmm9 unpckhpd %xmm9, %xmm9 movsd -1 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm9 movsd -2 * SIZE(AO), %xmm13 mulsd %xmm9, %xmm13 subsd %xmm13, %xmm11 movsd -3 * SIZE(AO), %xmm14 mulsd %xmm9, %xmm14 subsd %xmm14, %xmm8 movsd -4 * SIZE(AO), %xmm15 mulsd %xmm9, %xmm15 subsd %xmm15, %xmm10 movsd -6 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm11 movsd -7 * SIZE(AO), %xmm13 mulsd %xmm11, %xmm13 subsd %xmm13, %xmm8 movsd -8 * SIZE(AO), %xmm14 mulsd %xmm11, %xmm14 subsd %xmm14, %xmm10 movsd -11 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -12 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm10 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 unpcklpd %xmm8, %xmm10 unpcklpd %xmm9, %xmm11 #endif #ifdef LT movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movapd %xmm11, %xmm9 unpckhpd %xmm9, %xmm9 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 movsd -15 * SIZE(AO), %xmm13 mulsd %xmm10, %xmm13 subsd %xmm13, %xmm8 movsd -14 * SIZE(AO), %xmm14 mulsd %xmm10, %xmm14 subsd %xmm14, %xmm11 movsd -13 * SIZE(AO), %xmm15 mulsd %xmm10, %xmm15 subsd %xmm15, %xmm9 movsd -11 * SIZE(AO), %xmm12 mulsd 
%xmm12, %xmm8 movsd -10 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm11 movsd -9 * SIZE(AO), %xmm14 mulsd %xmm8, %xmm14 subsd %xmm14, %xmm9 movsd -6 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm11 movsd -5 * SIZE(AO), %xmm13 mulsd %xmm11, %xmm13 subsd %xmm13, %xmm9 movsd -1 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm9 unpcklpd %xmm8, %xmm10 unpcklpd %xmm9, %xmm11 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 mulpd %xmm8, %xmm11 #endif #ifdef RT movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 mulpd %xmm8, %xmm11 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) movsd %xmm11, 2 * SIZE(CO1) movhpd %xmm11, 3 * SIZE(CO1) #else movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) movsd %xmm11, 2 * SIZE(CO1) movhpd %xmm11, 3 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm10, -16 * SIZE(BO) movapd %xmm11, -14 * SIZE(BO) #else movapd %xmm10, -16 * SIZE(AO) movapd %xmm11, -14 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L91 ALIGN_4 .L100: testq $2, M BRANCH jle .L110 ALIGN_4 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movsd -16 * SIZE(BO), %xmm2 pxor %xmm9, %xmm9 movhps -15 * SIZE(BO), %xmm2 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L105 ALIGN_4 .L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 movsd -15 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -14 * SIZE(AO), %xmm0 addpd %xmm3, %xmm8 pshufd $0x44, %xmm2, %xmm3 movsd -14 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -12 * SIZE(AO), %xmm0 addpd %xmm3, %xmm9 pshufd $0x44, %xmm2, %xmm3 movsd -13 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -10 * SIZE(AO), %xmm0 addpd %xmm3, %xmm8 pshufd $0x44, %xmm2, %xmm3 movsd -12 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -8 * SIZE(AO), %xmm0 addpd %xmm3, %xmm9 subq $-8 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L102 ALIGN_4 .L105: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L108 ALIGN_4 .L106: pshufd $0x44, %xmm2, %xmm3 movsd -15 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -14 * SIZE(AO), %xmm0 addpd %xmm3, %xmm8 addq $2 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L106 ALIGN_4 .L108: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif addpd %xmm9, %xmm8 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm10 subpd %xmm8, %xmm10 #else movapd -16 * SIZE(AO), %xmm10 subpd %xmm8, %xmm10 #endif #ifdef LN movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movsd -13 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -14 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm10 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 unpcklpd %xmm8, %xmm10 #endif #ifdef LT movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 movsd 
-15 * SIZE(AO), %xmm13 mulsd %xmm10, %xmm13 subsd %xmm13, %xmm8 movsd -13 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 unpcklpd %xmm8, %xmm10 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 #endif #ifdef RT movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) #else movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm10, -16 * SIZE(BO) #else movapd %xmm10, -16 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L110: testq $1, M BRANCH jle .L119 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movsd -16 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L115 ALIGN_4 .L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -15 * SIZE(AO), %xmm0 movsd -15 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -14 * SIZE(AO), %xmm0 movsd -14 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -13 * SIZE(AO), %xmm0 movsd -13 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -12 * SIZE(AO), %xmm0 movsd -12 * SIZE(BO), %xmm2 subq $-4 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L112 ALIGN_4 .L115: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -15 * SIZE(AO), %xmm0 movsd -15 * SIZE(BO), %xmm2 addq $1 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L116 ALIGN_4 .L118: #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif addpd %xmm9, %xmm8 #if defined(LN) || defined(LT) movsd -16 * SIZE(BO), %xmm10 subsd %xmm8, %xmm10 #else movsd -16 * SIZE(AO), %xmm10 subsd %xmm8, %xmm10 #endif #ifdef LN movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 #endif #ifdef LT movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 #endif #ifdef RN movsd -16 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm10 #endif #ifdef RT movsd -16 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm10 #endif #ifdef LN subq $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm10, 0 * SIZE(CO1) #else movsd %xmm10, 0 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movsd %xmm10, -16 * SIZE(BO) #else movsd %xmm10, -16 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L119: #ifdef LN leaq (B, K, SIZE), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 
24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_LT_4x4_sse2.S000066400000000000000000002334471313527062700221020ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %r13 #define BO %r14 #define CO1 %r15 #define CO2 %rbp #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define ALPHA 0(%rsp) #define OFFSET 16(%rsp) #define KK 24(%rsp) #define KKK 32(%rsp) #define AORIG 40(%rsp) #define BORIG 48(%rsp) #define BUFFER 128(%rsp) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta #ifndef ALLOC_HUGETLB #define PREFETCHSIZE (8 * 4 + 4) #else #define PREFETCHSIZE (8 * 2 + 4) #endif #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHNTA prefetchnta #define PREFETCHSIZE (8 * 4 + 4) #endif #ifdef OPTERON #define movsd movlpd #endif #define KERNEL1(xx) \ mulpd %xmm8, %xmm9 ;\ addpd %xmm9, %xmm0 ;\ movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm8, %xmm11 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ addpd %xmm11, %xmm1 ;\ movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm8, %xmm13 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ addpd %xmm13, %xmm2 ;\ movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm8, %xmm3 ;\ movapd 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 #define KERNEL2(xx) \ mulpd %xmm10, %xmm9 ;\ addpd %xmm9, %xmm4 ;\ movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm10, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm10, %xmm13 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ addpd %xmm13, %xmm6 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm10, %xmm7 ;\ movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 #define KERNEL3(xx) \ mulpd %xmm12, %xmm15 ;\ addpd %xmm15, %xmm0 ;\ movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm12, %xmm11 ;\ addpd %xmm11, %xmm1 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm12, %xmm13 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ addpd %xmm13, %xmm2 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm12, %xmm3 ;\ movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 #define KERNEL4(xx) \ mulpd %xmm14, %xmm15 ;\ addpd %xmm15, %xmm4 ;\ movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm14, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm14, %xmm13 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ addpd %xmm13, %xmm6 ;\ movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm14, %xmm7 ;\ movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 #define KERNEL5(xx) \ mulpd %xmm8, %xmm9 ;\ addpd %xmm9, %xmm0 ;\ movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm8, %xmm11 ;\ PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ addpd %xmm11, %xmm1 ;\ movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm8, %xmm13 ;\ mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ addpd %xmm13, %xmm2 ;\ movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm8, %xmm3 ;\ movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 #define KERNEL6(xx) \ mulpd %xmm10, %xmm9 ;\ addpd 
%xmm9, %xmm4 ;\ movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm10, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm10, %xmm13 ;\ mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ addpd %xmm13, %xmm6 ;\ movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm10, %xmm7 ;\ movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 #define KERNEL7(xx) \ mulpd %xmm12, %xmm15 ;\ addpd %xmm15, %xmm0 ;\ movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm12, %xmm11 ;\ addpd %xmm11, %xmm1 ;\ movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm12, %xmm13 ;\ mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ addpd %xmm13, %xmm2 ;\ movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm12, %xmm3 ;\ movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 #define KERNEL8(xx) \ mulpd %xmm14, %xmm15 ;\ addpd %xmm15, %xmm4 ;\ movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm14, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm14, %xmm13 ;\ mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ addpd %xmm13, %xmm6 ;\ movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm14, %xmm7 ;\ movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 movaps %xmm3, %xmm0 #else movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 #endif movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movsd %xmm4, OFFSET movsd %xmm4, KK leaq (, LDC, SIZE), LDC #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $2, J # j = (n >> 2) jle .L40 .L01: /* Copying to Sub Buffer */ #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG leaq (, %rax, SIZE), %rax leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L03 addq %rax, %rax ALIGN_4 .L02: PREFETCHNTA 40 * SIZE(B) movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 addq $16 * SIZE, BO addq $ 8 * SIZE, B movsd %xmm0, -16 * SIZE(BO) movsd %xmm0, -15 * SIZE(BO) movsd %xmm1, -14 * SIZE(BO) movsd %xmm1, -13 * SIZE(BO) movsd %xmm2, -12 * SIZE(BO) movsd %xmm2, -11 * SIZE(BO) movsd %xmm3, -10 * SIZE(BO) movsd %xmm3, -9 * SIZE(BO) movsd %xmm4, -8 * SIZE(BO) movsd %xmm4, -7 * SIZE(BO) movsd %xmm5, -6 * SIZE(BO) movsd %xmm5, -5 * SIZE(BO) movsd %xmm6, -4 * SIZE(BO) movsd %xmm6, -3 * 
SIZE(BO) movsd %xmm7, -2 * SIZE(BO) movsd %xmm7, -1 * SIZE(BO) decq %rax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd %xmm0, 0 * SIZE(BO) movsd %xmm0, 1 * SIZE(BO) movsd %xmm1, 2 * SIZE(BO) movsd %xmm1, 3 * SIZE(BO) movsd %xmm2, 4 * SIZE(BO) movsd %xmm2, 5 * SIZE(BO) movsd %xmm3, 6 * SIZE(BO) movsd %xmm3, 7 * SIZE(BO) addq $4 * SIZE, B addq $8 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L10: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 4), C #endif movq M, I sarq $2, I # i = (m >> 2) jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movapd 0 * SIZE(BO), %xmm9 movapd 2 * SIZE(BO), %xmm11 movapd 4 * SIZE(BO), %xmm13 movapd 8 * SIZE(BO), %xmm15 movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 2 * SIZE(AO), %xmm10 pxor %xmm1, %xmm1 movapd 4 * SIZE(AO), %xmm12 pxor %xmm2, %xmm2 movapd 6 * SIZE(AO), %xmm14 pxor %xmm3, %xmm3 PREFETCHW 4 * SIZE(CO1) pxor %xmm4, %xmm4 PREFETCHW 4 * SIZE(CO2) pxor %xmm5, %xmm5 PREFETCHW 4 * SIZE(CO1, LDC, 2) pxor %xmm6, %xmm6 PREFETCHW 4 * SIZE(CO2, LDC, 2) pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif #if 1 andq $-8, %rax salq $4, %rax je .L15 .L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) cmpq $64 * 2, %rax jle .L12 KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) KERNEL1(16 * 3) KERNEL2(16 * 3) KERNEL3(16 * 3) KERNEL4(16 * 3) KERNEL5(16 * 3) KERNEL6(16 * 3) KERNEL7(16 * 3) KERNEL8(16 * 3) cmpq $64 * 4, %rax jle .L12 KERNEL1(16 * 4) KERNEL2(16 * 4) KERNEL3(16 * 4) KERNEL4(16 * 4) KERNEL5(16 * 4) KERNEL6(16 * 4) KERNEL7(16 * 4) KERNEL8(16 * 4) KERNEL1(16 * 5) KERNEL2(16 * 5) KERNEL3(16 * 5) KERNEL4(16 * 5) KERNEL5(16 * 5) KERNEL6(16 * 5) KERNEL7(16 * 5) KERNEL8(16 * 5) cmpq $64 * 6, %rax jle .L12 KERNEL1(16 * 6) KERNEL2(16 * 6) KERNEL3(16 * 6) KERNEL4(16 * 6) KERNEL5(16 * 6) KERNEL6(16 * 6) KERNEL7(16 * 6) KERNEL8(16 * 6) KERNEL1(16 * 7) KERNEL2(16 * 7) KERNEL3(16 * 7) KERNEL4(16 * 7) KERNEL5(16 * 7) KERNEL6(16 * 7) KERNEL7(16 * 7) KERNEL8(16 * 7) addq $16 * 8 * SIZE, AO addq $32 * 8 * SIZE, BO subq $64 * 8, %rax jg .L1X .L12: leaq (AO, %rax, 2), AO # * 16 leaq (BO, %rax, 4), BO # * 64 ALIGN_4 #else sarq $3, %rax je .L15 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $32 * SIZE, AO addq $64 * SIZE, BO decq %rax jg .L12 ALIGN_4 #endif .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) 
BRANCH je .L19 ALIGN_4 .L16: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm9, %xmm2 movapd 0 * SIZE(BO), %xmm9 addpd %xmm8, %xmm3 movapd 4 * SIZE(AO), %xmm8 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm4 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm5 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 mulpd 6 * SIZE(BO), %xmm10 addpd %xmm9, %xmm6 movapd 8 * SIZE(BO), %xmm9 addpd %xmm10, %xmm7 movapd 6 * SIZE(AO), %xmm10 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L16 ALIGN_4 .L19: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm2, %xmm10 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm10 movapd %xmm4, %xmm12 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm12 movapd %xmm6, %xmm14 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm14 movapd 0 * SIZE(B), %xmm1 movapd 2 * SIZE(B), %xmm3 movapd 4 * SIZE(B), %xmm5 movapd 6 * SIZE(B), %xmm7 movapd 8 * SIZE(B), %xmm9 movapd 10 * SIZE(B), %xmm11 movapd 12 * SIZE(B), %xmm13 movapd 14 * SIZE(B), %xmm15 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 subpd %xmm8, %xmm5 subpd %xmm10, %xmm7 subpd %xmm4, %xmm9 subpd %xmm6, %xmm11 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm9 movapd 4 * SIZE(AO), %xmm10 movapd 6 * SIZE(AO), %xmm11 movapd 8 * SIZE(AO), %xmm12 movapd 10 * SIZE(AO), %xmm13 movapd 12 * SIZE(AO), %xmm14 movapd 14 * SIZE(AO), %xmm15 subpd %xmm0, %xmm8 subpd %xmm4, %xmm9 subpd %xmm1, %xmm10 subpd %xmm5, %xmm11 subpd %xmm2, %xmm12 subpd %xmm6, %xmm13 subpd %xmm3, %xmm14 subpd %xmm7, %xmm15 #endif #ifdef LN movlpd 15 * SIZE(AO), %xmm0 movhpd 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 mulpd %xmm0, %xmm15 movlpd 14 * SIZE(AO), %xmm2 movhpd 14 * SIZE(AO), %xmm2 mulpd %xmm13, %xmm2 subpd %xmm2, %xmm9 movlpd 14 * SIZE(AO), %xmm2 movhpd 14 * SIZE(AO), %xmm2 mulpd %xmm15, %xmm2 subpd %xmm2, %xmm11 movlpd 13 * SIZE(AO), %xmm4 movhpd 13 * SIZE(AO), %xmm4 mulpd %xmm13, %xmm4 subpd %xmm4, %xmm5 movlpd 13 * SIZE(AO), %xmm4 movhpd 13 * SIZE(AO), %xmm4 mulpd %xmm15, %xmm4 subpd %xmm4, %xmm7 movlpd 12 * SIZE(AO), %xmm6 movhpd 12 * SIZE(AO), %xmm6 mulpd %xmm13, %xmm6 subpd %xmm6, %xmm1 movlpd 12 * SIZE(AO), %xmm6 movhpd 12 * SIZE(AO), %xmm6 mulpd %xmm15, %xmm6 subpd %xmm6, %xmm3 movlpd 10 * SIZE(AO), %xmm0 movhpd 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 mulpd %xmm0, %xmm11 movlpd 9 * SIZE(AO), %xmm2 movhpd 9 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm5 movlpd 9 * SIZE(AO), %xmm2 movhpd 9 * SIZE(AO), %xmm2 mulpd %xmm11, %xmm2 subpd %xmm2, %xmm7 movlpd 8 * SIZE(AO), %xmm4 movhpd 8 * SIZE(AO), %xmm4 mulpd %xmm9, %xmm4 subpd %xmm4, %xmm1 movlpd 8 * SIZE(AO), %xmm4 movhpd 8 * SIZE(AO), %xmm4 mulpd %xmm11, %xmm4 subpd %xmm4, %xmm3 movlpd 5 * SIZE(AO), %xmm0 movhpd 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 movlpd 4 * SIZE(AO), %xmm2 movhpd 4 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movlpd 4 * SIZE(AO), %xmm2 movhpd 4 * SIZE(AO), %xmm2 mulpd %xmm7, %xmm2 subpd %xmm2, %xmm3 movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 #endif #ifdef LT movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 
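/* Loop tail for the 4x4 block: the unrolled kernel above consumes k in chunks of 8, so .L16 finishes the remaining (k & 7) rank-1 updates before the triangular solve at .L19. */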
mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 movlpd 1 * SIZE(AO), %xmm2 movhpd 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movlpd 1 * SIZE(AO), %xmm2 movhpd 1 * SIZE(AO), %xmm2 mulpd %xmm3, %xmm2 subpd %xmm2, %xmm7 movlpd 2 * SIZE(AO), %xmm4 movhpd 2 * SIZE(AO), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm9 movlpd 2 * SIZE(AO), %xmm4 movhpd 2 * SIZE(AO), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm11 movlpd 3 * SIZE(AO), %xmm6 movhpd 3 * SIZE(AO), %xmm6 mulpd %xmm1, %xmm6 subpd %xmm6, %xmm13 movlpd 3 * SIZE(AO), %xmm6 movhpd 3 * SIZE(AO), %xmm6 mulpd %xmm3, %xmm6 subpd %xmm6, %xmm15 movlpd 5 * SIZE(AO), %xmm0 movhpd 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 movlpd 6 * SIZE(AO), %xmm2 movhpd 6 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm9 movlpd 6 * SIZE(AO), %xmm2 movhpd 6 * SIZE(AO), %xmm2 mulpd %xmm7, %xmm2 subpd %xmm2, %xmm11 movlpd 7 * SIZE(AO), %xmm4 movhpd 7 * SIZE(AO), %xmm4 mulpd %xmm5, %xmm4 subpd %xmm4, %xmm13 movlpd 7 * SIZE(AO), %xmm4 movhpd 7 * SIZE(AO), %xmm4 mulpd %xmm7, %xmm4 subpd %xmm4, %xmm15 movlpd 10 * SIZE(AO), %xmm0 movhpd 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 mulpd %xmm0, %xmm11 movlpd 11 * SIZE(AO), %xmm2 movhpd 11 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm13 movlpd 11 * SIZE(AO), %xmm2 movhpd 11 * SIZE(AO), %xmm2 mulpd %xmm11, %xmm2 subpd %xmm2, %xmm15 movlpd 15 * SIZE(AO), %xmm0 movhpd 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 mulpd %xmm0, %xmm15 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 movlpd 1 * SIZE(B), %xmm1 movhpd 1 * SIZE(B), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movlpd 1 * SIZE(B), %xmm1 movhpd 1 * SIZE(B), %xmm1 mulpd %xmm9, %xmm1 subpd %xmm1, %xmm11 movlpd 2 * SIZE(B), %xmm2 movhpd 2 * SIZE(B), %xmm2 mulpd %xmm8, %xmm2 subpd %xmm2, %xmm12 movlpd 2 * SIZE(B), %xmm2 movhpd 2 * SIZE(B), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm13 movlpd 3 * SIZE(B), %xmm3 movhpd 3 * SIZE(B), %xmm3 mulpd %xmm8, %xmm3 subpd %xmm3, %xmm14 movlpd 3 * SIZE(B), %xmm3 movhpd 3 * SIZE(B), %xmm3 mulpd %xmm9, %xmm3 subpd %xmm3, %xmm15 movlpd 5 * SIZE(B), %xmm0 movhpd 5 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 movlpd 6 * SIZE(B), %xmm1 movhpd 6 * SIZE(B), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm12 movlpd 6 * SIZE(B), %xmm1 movhpd 6 * SIZE(B), %xmm1 mulpd %xmm11, %xmm1 subpd %xmm1, %xmm13 movlpd 7 * SIZE(B), %xmm2 movhpd 7 * SIZE(B), %xmm2 mulpd %xmm10, %xmm2 subpd %xmm2, %xmm14 movlpd 7 * SIZE(B), %xmm2 movhpd 7 * SIZE(B), %xmm2 mulpd %xmm11, %xmm2 subpd %xmm2, %xmm15 movlpd 10 * SIZE(B), %xmm0 movhpd 10 * SIZE(B), %xmm0 mulpd %xmm0, %xmm12 mulpd %xmm0, %xmm13 movlpd 11 * SIZE(B), %xmm1 movhpd 11 * SIZE(B), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm14 movlpd 11 * SIZE(B), %xmm1 movhpd 11 * SIZE(B), %xmm1 mulpd %xmm13, %xmm1 subpd %xmm1, %xmm15 movlpd 15 * SIZE(B), %xmm0 movhpd 15 * SIZE(B), %xmm0 mulpd %xmm0, %xmm14 mulpd %xmm0, %xmm15 #endif #ifdef RT movlpd 15 * SIZE(B), %xmm0 movhpd 15 * SIZE(B), %xmm0 mulpd %xmm0, %xmm14 mulpd %xmm0, %xmm15 movlpd 14 * SIZE(B), %xmm1 movhpd 14 * SIZE(B), %xmm1 mulpd %xmm14, %xmm1 subpd %xmm1, %xmm12 movlpd 14 * SIZE(B), %xmm1 movhpd 14 * SIZE(B), %xmm1 mulpd %xmm15, %xmm1 subpd %xmm1, %xmm13 movlpd 13 * SIZE(B), %xmm2 movhpd 13 * SIZE(B), %xmm2 mulpd %xmm14, %xmm2 subpd %xmm2, %xmm10 movlpd 13 * SIZE(B), %xmm2 movhpd 13 * SIZE(B), %xmm2 mulpd %xmm15, %xmm2 subpd %xmm2, %xmm11 movlpd 12 * SIZE(B), %xmm3 movhpd 12 * SIZE(B), %xmm3 mulpd %xmm14, %xmm3 subpd %xmm3, %xmm8 movlpd 12 * SIZE(B), %xmm3 movhpd 12 * SIZE(B), 
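/* LT solve of the 4x4 block: each diagonal entry of packed A is multiplied in directly (the packing step is assumed to store reciprocals of the diagonal), and the freshly solved row is then eliminated from the rows that follow. */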
%xmm3 mulpd %xmm15, %xmm3 subpd %xmm3, %xmm9 movlpd 10 * SIZE(B), %xmm0 movhpd 10 * SIZE(B), %xmm0 mulpd %xmm0, %xmm12 mulpd %xmm0, %xmm13 movlpd 9 * SIZE(B), %xmm1 movhpd 9 * SIZE(B), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm10 movlpd 9 * SIZE(B), %xmm1 movhpd 9 * SIZE(B), %xmm1 mulpd %xmm13, %xmm1 subpd %xmm1, %xmm11 movlpd 8 * SIZE(B), %xmm2 movhpd 8 * SIZE(B), %xmm2 mulpd %xmm12, %xmm2 subpd %xmm2, %xmm8 movlpd 8 * SIZE(B), %xmm2 movhpd 8 * SIZE(B), %xmm2 mulpd %xmm13, %xmm2 subpd %xmm2, %xmm9 movlpd 5 * SIZE(B), %xmm0 movhpd 5 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 movlpd 4 * SIZE(B), %xmm1 movhpd 4 * SIZE(B), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movlpd 4 * SIZE(B), %xmm1 movhpd 4 * SIZE(B), %xmm1 mulpd %xmm11, %xmm1 subpd %xmm1, %xmm9 movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movsd %xmm13, 3 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) movhpd %xmm9, 2 * SIZE(CO2) movhpd %xmm13, 3 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO1, LDC, 2) movsd %xmm7, 1 * SIZE(CO1, LDC, 2) movsd %xmm11, 2 * SIZE(CO1, LDC, 2) movsd %xmm15, 3 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) movhpd %xmm11, 2 * SIZE(CO2, LDC, 2) movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd %xmm11, 2 * SIZE(CO2) movhpd %xmm11, 3 * SIZE(CO2) movsd %xmm12, 0 * SIZE(CO1, LDC, 2) movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) movsd %xmm13, 2 * SIZE(CO1, LDC, 2) movhpd %xmm13, 3 * SIZE(CO1, LDC, 2) movsd %xmm14, 0 * SIZE(CO2, LDC, 2) movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) movsd %xmm15, 2 * SIZE(CO2, LDC, 2) movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movapd %xmm5, 4 * SIZE(B) movapd %xmm7, 6 * SIZE(B) movapd %xmm9, 8 * SIZE(B) movapd %xmm11, 10 * SIZE(B) movapd %xmm13, 12 * SIZE(B) movapd %xmm15, 14 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) movlpd %xmm3, 4 * SIZE(BO) movlpd %xmm3, 5 * SIZE(BO) movhpd %xmm3, 6 * SIZE(BO) movhpd %xmm3, 7 * SIZE(BO) movlpd %xmm5, 8 * SIZE(BO) movlpd %xmm5, 9 * SIZE(BO) movhpd %xmm5, 10 * SIZE(BO) movhpd %xmm5, 11 * SIZE(BO) movlpd %xmm7, 12 * SIZE(BO) movlpd %xmm7, 13 * SIZE(BO) movhpd %xmm7, 14 * SIZE(BO) movhpd %xmm7, 15 * SIZE(BO) movlpd %xmm9, 16 * SIZE(BO) movlpd %xmm9, 17 * SIZE(BO) movhpd %xmm9, 18 * SIZE(BO) movhpd %xmm9, 19 * SIZE(BO) movlpd %xmm11, 20 * SIZE(BO) movlpd %xmm11, 21 * SIZE(BO) movhpd %xmm11, 22 * SIZE(BO) movhpd %xmm11, 23 * SIZE(BO) movlpd %xmm13, 24 * SIZE(BO) movlpd %xmm13, 25 * SIZE(BO) movhpd %xmm13, 26 * SIZE(BO) movhpd %xmm13, 27 * SIZE(BO) movlpd %xmm15, 28 * SIZE(BO) movlpd %xmm15, 29 * SIZE(BO) movhpd %xmm15, 30 * SIZE(BO) movhpd %xmm15, 31 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm9, 2 * SIZE(AO) movapd %xmm10, 4 * SIZE(AO) movapd %xmm11, 6 * SIZE(AO) movapd %xmm12, 8 * SIZE(AO) movapd %xmm13, 10 * SIZE(AO) movapd %xmm14, 12 * SIZE(AO) movapd %xmm15, 14 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef 
LT addq $16 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L11 ALIGN_4 .L20: testq $3, M je .L39 testq $2, M je .L30 ALIGN_4 .L21: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movapd 16 * SIZE(BO), %xmm13 movapd 24 * SIZE(BO), %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm9, %xmm2 movapd 32 * SIZE(BO), %xmm9 addpd %xmm8, %xmm3 movapd 2 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movapd 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm1 movapd 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 mulpd 14 * SIZE(BO), %xmm8 addpd %xmm11, %xmm2 movapd 40 * SIZE(BO), %xmm11 addpd %xmm8, %xmm3 movapd 4 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 addpd %xmm13, %xmm0 movapd 18 * SIZE(BO), %xmm13 mulpd %xmm8, %xmm13 addpd %xmm13, %xmm1 movapd 20 * SIZE(BO), %xmm13 mulpd %xmm8, %xmm13 mulpd 22 * SIZE(BO), %xmm8 addpd %xmm13, %xmm2 movapd 48 * SIZE(BO), %xmm13 addpd %xmm8, %xmm3 movapd 6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm15 addpd %xmm15, %xmm0 movapd 26 * SIZE(BO), %xmm15 mulpd %xmm8, %xmm15 addpd %xmm15, %xmm1 movapd 28 * SIZE(BO), %xmm15 mulpd %xmm8, %xmm15 mulpd 30 * SIZE(BO), %xmm8 addpd %xmm15, %xmm2 movapd 56 * SIZE(BO), %xmm15 addpd %xmm8, %xmm3 movapd 16 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movapd 34 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm1 movapd 36 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 mulpd 38 * SIZE(BO), %xmm10 addpd %xmm9, %xmm2 movapd 64 * SIZE(BO), %xmm9 addpd %xmm10, %xmm3 movapd 10 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movapd 42 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movapd 44 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 mulpd 46 * SIZE(BO), %xmm10 addpd %xmm11, %xmm2 movapd 72 * SIZE(BO), %xmm11 addpd %xmm10, %xmm3 movapd 12 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm13 addpd %xmm13, %xmm0 movapd 50 * SIZE(BO), %xmm13 mulpd %xmm10, %xmm13 addpd %xmm13, %xmm1 movapd 52 * SIZE(BO), %xmm13 mulpd %xmm10, %xmm13 mulpd 54 * SIZE(BO), %xmm10 addpd %xmm13, %xmm2 movapd 80 * SIZE(BO), %xmm13 addpd %xmm10, %xmm3 movapd 14 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm15 addpd %xmm15, %xmm0 movapd 58 * SIZE(BO), %xmm15 mulpd %xmm10, %xmm15 addpd %xmm15, %xmm1 movapd 60 * SIZE(BO), %xmm15 mulpd %xmm10, %xmm15 mulpd 62 * SIZE(BO), %xmm10 addpd %xmm15, %xmm2 movapd 88 * SIZE(BO), %xmm15 addpd %xmm10, %xmm3 movapd 24 * SIZE(AO), %xmm10 addq $16 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L29 ALIGN_4 .L26: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), 
%xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm9, %xmm2 movapd 8 * SIZE(BO), %xmm9 addpd %xmm8, %xmm3 movapd 2 * SIZE(AO), %xmm8 addq $2 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L26 ALIGN_4 .L29: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm2, %xmm10 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm10 movapd 0 * SIZE(B), %xmm1 movapd 2 * SIZE(B), %xmm3 movapd 4 * SIZE(B), %xmm5 movapd 6 * SIZE(B), %xmm7 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 subpd %xmm8, %xmm5 subpd %xmm10, %xmm7 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm10 movapd 4 * SIZE(AO), %xmm12 movapd 6 * SIZE(AO), %xmm14 subpd %xmm0, %xmm8 subpd %xmm1, %xmm10 subpd %xmm2, %xmm12 subpd %xmm3, %xmm14 #endif #ifdef LN movlpd 3 * SIZE(AO), %xmm0 movhpd 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 movlpd 2 * SIZE(AO), %xmm2 movhpd 2 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movlpd 2 * SIZE(AO), %xmm2 movhpd 2 * SIZE(AO), %xmm2 mulpd %xmm7, %xmm2 subpd %xmm2, %xmm3 movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 #endif #ifdef LT movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 movlpd 1 * SIZE(AO), %xmm2 movhpd 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movlpd 1 * SIZE(AO), %xmm2 movhpd 1 * SIZE(AO), %xmm2 mulpd %xmm3, %xmm2 subpd %xmm2, %xmm7 movlpd 3 * SIZE(AO), %xmm0 movhpd 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 movlpd 1 * SIZE(B), %xmm1 movhpd 1 * SIZE(B), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movlpd 2 * SIZE(B), %xmm2 movhpd 2 * SIZE(B), %xmm2 mulpd %xmm8, %xmm2 subpd %xmm2, %xmm12 movlpd 3 * SIZE(B), %xmm3 movhpd 3 * SIZE(B), %xmm3 mulpd %xmm8, %xmm3 subpd %xmm3, %xmm14 movlpd 5 * SIZE(B), %xmm0 movhpd 5 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 movlpd 6 * SIZE(B), %xmm1 movhpd 6 * SIZE(B), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm12 movlpd 7 * SIZE(B), %xmm2 movhpd 7 * SIZE(B), %xmm2 mulpd %xmm10, %xmm2 subpd %xmm2, %xmm14 movlpd 10 * SIZE(B), %xmm0 movhpd 10 * SIZE(B), %xmm0 mulpd %xmm0, %xmm12 movlpd 11 * SIZE(B), %xmm1 movhpd 11 * SIZE(B), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm14 movlpd 15 * SIZE(B), %xmm0 movhpd 15 * SIZE(B), %xmm0 mulpd %xmm0, %xmm14 #endif #ifdef RT movlpd 15 * SIZE(B), %xmm0 movhpd 15 * SIZE(B), %xmm0 mulpd %xmm0, %xmm14 movlpd 14 * SIZE(B), %xmm1 movhpd 14 * SIZE(B), %xmm1 mulpd %xmm14, %xmm1 subpd %xmm1, %xmm12 movlpd 13 * SIZE(B), %xmm2 movhpd 13 * SIZE(B), %xmm2 mulpd %xmm14, %xmm2 subpd %xmm2, %xmm10 movlpd 12 * SIZE(B), %xmm3 movhpd 12 * SIZE(B), %xmm3 mulpd %xmm14, %xmm3 subpd %xmm3, %xmm8 movlpd 10 * SIZE(B), %xmm0 movhpd 10 * SIZE(B), %xmm0 mulpd %xmm0, %xmm12 movlpd 9 * SIZE(B), %xmm1 movhpd 9 * SIZE(B), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm10 movlpd 8 * SIZE(B), %xmm2 movhpd 8 * SIZE(B), %xmm2 mulpd %xmm12, %xmm2 subpd %xmm2, %xmm8 movlpd 5 * SIZE(B), %xmm0 movhpd 5 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 movlpd 4 * SIZE(B), %xmm1 movhpd 4 * SIZE(B), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), 
%xmm0 mulpd %xmm0, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO1, LDC, 2) movsd %xmm7, 1 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd %xmm12, 0 * SIZE(CO1, LDC, 2) movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) movsd %xmm14, 0 * SIZE(CO2, LDC, 2) movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movapd %xmm5, 4 * SIZE(B) movapd %xmm7, 6 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) movlpd %xmm3, 4 * SIZE(BO) movlpd %xmm3, 5 * SIZE(BO) movhpd %xmm3, 6 * SIZE(BO) movhpd %xmm3, 7 * SIZE(BO) movlpd %xmm5, 8 * SIZE(BO) movlpd %xmm5, 9 * SIZE(BO) movhpd %xmm5, 10 * SIZE(BO) movhpd %xmm5, 11 * SIZE(BO) movlpd %xmm7, 12 * SIZE(BO) movlpd %xmm7, 13 * SIZE(BO) movhpd %xmm7, 14 * SIZE(BO) movhpd %xmm7, 15 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm10, 2 * SIZE(AO) movapd %xmm12, 4 * SIZE(AO) movapd %xmm14, 6 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: testq $1, M je .L39 ALIGN_4 .L31: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movsd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movsd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movsd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movsd 16 * SIZE(BO), %xmm13 movsd 24 * SIZE(BO), %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L35 ALIGN_4 .L32: mulsd %xmm8, %xmm9 addsd %xmm9, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd 2 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm9 addsd %xmm9, %xmm1 movsd 4 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm9 mulsd 6 * SIZE(BO), %xmm8 addsd %xmm9, %xmm2 movsd 32 * SIZE(BO), %xmm9 addsd %xmm8, %xmm3 movsd 1 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm11 addsd %xmm11, %xmm0 movsd 10 * SIZE(BO), %xmm11 mulsd %xmm8, %xmm11 addsd %xmm11, %xmm1 movsd 12 * SIZE(BO), %xmm11 mulsd %xmm8, %xmm11 mulsd 14 * SIZE(BO), %xmm8 addsd %xmm11, %xmm2 movsd 40 * SIZE(BO), %xmm11 addsd %xmm8, %xmm3 movsd 2 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm13 addsd %xmm13, %xmm0 movsd 18 * SIZE(BO), %xmm13 mulsd %xmm8, %xmm13 addsd %xmm13, %xmm1 movsd 20 * SIZE(BO), %xmm13 mulsd %xmm8, %xmm13 mulsd 22 * SIZE(BO), %xmm8 addsd %xmm13, %xmm2 movsd 48 * SIZE(BO), %xmm13 addsd %xmm8, %xmm3 movsd 3 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm15 addsd %xmm15, %xmm0 movsd 26 * SIZE(BO), %xmm15 mulsd %xmm8, %xmm15 addsd %xmm15, %xmm1 movsd 28 * SIZE(BO), %xmm15 mulsd %xmm8, %xmm15 mulsd 30 * SIZE(BO), %xmm8 addsd %xmm15, %xmm2 movsd 56 * SIZE(BO), %xmm15 addsd %xmm8, %xmm3 movsd 4 * SIZE(AO), %xmm8 mulsd 
%xmm8, %xmm9 addsd %xmm9, %xmm0 movsd 34 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm9 addsd %xmm9, %xmm1 movsd 36 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm9 mulsd 38 * SIZE(BO), %xmm8 addsd %xmm9, %xmm2 movsd 64 * SIZE(BO), %xmm9 addsd %xmm8, %xmm3 movsd 5 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm11 addsd %xmm11, %xmm0 movsd 42 * SIZE(BO), %xmm11 mulsd %xmm8, %xmm11 addsd %xmm11, %xmm1 movsd 44 * SIZE(BO), %xmm11 mulsd %xmm8, %xmm11 mulsd 46 * SIZE(BO), %xmm8 addsd %xmm11, %xmm2 movsd 72 * SIZE(BO), %xmm11 addsd %xmm8, %xmm3 movsd 6 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm13 addsd %xmm13, %xmm0 movsd 50 * SIZE(BO), %xmm13 mulsd %xmm8, %xmm13 addsd %xmm13, %xmm1 movsd 52 * SIZE(BO), %xmm13 mulsd %xmm8, %xmm13 mulsd 54 * SIZE(BO), %xmm8 addsd %xmm13, %xmm2 movsd 80 * SIZE(BO), %xmm13 addsd %xmm8, %xmm3 movsd 7 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm15 addsd %xmm15, %xmm0 movsd 58 * SIZE(BO), %xmm15 mulsd %xmm8, %xmm15 addsd %xmm15, %xmm1 movsd 60 * SIZE(BO), %xmm15 mulsd %xmm8, %xmm15 mulsd 62 * SIZE(BO), %xmm8 addsd %xmm15, %xmm2 movsd 88 * SIZE(BO), %xmm15 addsd %xmm8, %xmm3 movsd 8 * SIZE(AO), %xmm8 addq $ 8 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulsd %xmm8, %xmm9 addsd %xmm9, %xmm0 movsd 2 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm9 addsd %xmm9, %xmm1 movsd 4 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm9 mulsd 6 * SIZE(BO), %xmm8 addsd %xmm9, %xmm2 movsd 8 * SIZE(BO), %xmm9 addsd %xmm8, %xmm3 movsd 1 * SIZE(AO), %xmm8 addq $1 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm4 movsd 1 * SIZE(B), %xmm5 movsd 2 * SIZE(B), %xmm6 movsd 3 * SIZE(B), %xmm7 #else movsd 0 * SIZE(AO), %xmm4 movsd 1 * SIZE(AO), %xmm5 movsd 2 * SIZE(AO), %xmm6 movsd 3 * SIZE(AO), %xmm7 #endif subsd %xmm0, %xmm4 subsd %xmm1, %xmm5 subsd %xmm2, %xmm6 subsd %xmm3, %xmm7 #ifdef LN movsd 0 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 mulsd %xmm0, %xmm6 mulsd %xmm0, %xmm7 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 mulsd %xmm0, %xmm6 mulsd %xmm0, %xmm7 #endif #ifdef RN mulsd 0 * SIZE(B), %xmm4 movlpd 1 * SIZE(B), %xmm1 mulsd %xmm4, %xmm1 subsd %xmm1, %xmm5 movlpd 2 * SIZE(B), %xmm2 mulsd %xmm4, %xmm2 subsd %xmm2, %xmm6 movlpd 3 * SIZE(B), %xmm3 mulsd %xmm4, %xmm3 subsd %xmm3, %xmm7 mulsd 5 * SIZE(B), %xmm5 movlpd 6 * SIZE(B), %xmm1 mulsd %xmm5, %xmm1 subsd %xmm1, %xmm6 movlpd 7 * SIZE(B), %xmm2 mulsd %xmm5, %xmm2 subsd %xmm2, %xmm7 mulsd 10 * SIZE(B), %xmm6 movlpd 11 * SIZE(B), %xmm1 mulsd %xmm6, %xmm1 subsd %xmm1, %xmm7 mulsd 15 * SIZE(B), %xmm7 #endif #ifdef RT mulsd 15 * SIZE(B), %xmm7 movlpd 14 * SIZE(B), %xmm1 mulsd %xmm7, %xmm1 subsd %xmm1, %xmm6 movlpd 13 * SIZE(B), %xmm2 mulsd %xmm7, %xmm2 subsd %xmm2, %xmm5 movlpd 12 * SIZE(B), %xmm3 mulsd %xmm7, %xmm3 subsd %xmm3, %xmm4 mulsd 10 * SIZE(B), %xmm6 movlpd 9 * SIZE(B), %xmm1 mulsd %xmm6, %xmm1 subsd %xmm1, %xmm5 movlpd 8 * SIZE(B), %xmm2 mulsd %xmm6, %xmm2 subsd %xmm2, %xmm4 mulsd 5 * SIZE(B), %xmm5 movlpd 4 * SIZE(B), %xmm1 mulsd %xmm5, %xmm1 subsd %xmm1, %xmm4 mulsd 0 * SIZE(B), %xmm4 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm4, 0 * 
SIZE(CO1) movsd %xmm5, 0 * SIZE(CO2) movsd %xmm6, 0 * SIZE(CO1, LDC, 2) movsd %xmm7, 0 * SIZE(CO2, LDC, 2) #if defined(LN) || defined(LT) movsd %xmm4, 0 * SIZE(B) movsd %xmm5, 1 * SIZE(B) movsd %xmm6, 2 * SIZE(B) movsd %xmm7, 3 * SIZE(B) movsd %xmm4, 0 * SIZE(BO) movsd %xmm4, 1 * SIZE(BO) movsd %xmm5, 2 * SIZE(BO) movsd %xmm5, 3 * SIZE(BO) movsd %xmm6, 4 * SIZE(BO) movsd %xmm6, 5 * SIZE(BO) movsd %xmm7, 6 * SIZE(BO) movsd %xmm7, 7 * SIZE(BO) #else movsd %xmm4, 0 * SIZE(AO) movsd %xmm5, 1 * SIZE(AO) movsd %xmm6, 2 * SIZE(AO) movsd %xmm7, 3 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 4), B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif decq J # j -- jg .L01 ALIGN_4 .L40: testq $3, N je .L999 testq $2, N je .L80 ALIGN_4 .L41: /* Copying to Sub Buffer */ #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG leaq (, %rax, SIZE), %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L43 ALIGN_4 .L42: PREFETCH 56 * SIZE(B) movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 addq $ 8 * SIZE, B addq $16 * SIZE, BO movsd %xmm0, -16 * SIZE(BO) movsd %xmm0, -15 * SIZE(BO) movsd %xmm1, -14 * SIZE(BO) movsd %xmm1, -13 * SIZE(BO) movsd %xmm2, -12 * SIZE(BO) movsd %xmm2, -11 * SIZE(BO) movsd %xmm3, -10 * SIZE(BO) movsd %xmm3, -9 * SIZE(BO) movsd %xmm4, -8 * SIZE(BO) movsd %xmm4, -7 * SIZE(BO) movsd %xmm5, -6 * SIZE(BO) movsd %xmm5, -5 * SIZE(BO) movsd %xmm6, -4 * SIZE(BO) movsd %xmm6, -3 * SIZE(BO) movsd %xmm7, -2 * SIZE(BO) movsd %xmm7, -1 * SIZE(BO) decq %rax jne .L42 ALIGN_4 .L43: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L50 ALIGN_4 .L44: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd %xmm0, 0 * SIZE(BO) movsd %xmm0, 1 * SIZE(BO) movsd %xmm1, 2 * SIZE(BO) movsd %xmm1, 3 * SIZE(BO) addq $2 * SIZE, B addq $4 * SIZE, BO decq %rax jne .L44 ALIGN_4 .L50: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 2), C #endif movq M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor 
%xmm4, %xmm4 movapd 8 * SIZE(BO), %xmm11 pxor %xmm5, %xmm5 movapd 16 * SIZE(AO), %xmm12 movapd 16 * SIZE(BO), %xmm13 movapd 24 * SIZE(AO), %xmm14 movapd 24 * SIZE(BO), %xmm15 PREFETCHW 4 * SIZE(CO1) PREFETCHW 4 * SIZE(CO2) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L55 ALIGN_4 .L52: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd 2 * SIZE(BO), %xmm8 addpd %xmm9, %xmm0 movapd 0 * SIZE(BO), %xmm9 addpd %xmm8, %xmm1 movapd 2 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm9, %xmm4 movapd 4 * SIZE(BO), %xmm9 addpd %xmm8, %xmm5 movapd 4 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm9, %xmm0 movapd 4 * SIZE(BO), %xmm9 addpd %xmm8, %xmm1 movapd 6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm9, %xmm4 movapd 32 * SIZE(BO), %xmm9 addpd %xmm8, %xmm5 movapd 32 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm10, %xmm11 mulpd 10 * SIZE(BO), %xmm10 addpd %xmm11, %xmm0 movapd 8 * SIZE(BO), %xmm11 addpd %xmm10, %xmm1 movapd 10 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm11 mulpd 10 * SIZE(BO), %xmm10 addpd %xmm11, %xmm4 movapd 12 * SIZE(BO), %xmm11 addpd %xmm10, %xmm5 movapd 12 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm11 mulpd 14 * SIZE(BO), %xmm10 addpd %xmm11, %xmm0 movapd 12 * SIZE(BO), %xmm11 addpd %xmm10, %xmm1 movapd 14 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm11 mulpd 14 * SIZE(BO), %xmm10 addpd %xmm11, %xmm4 movapd 40 * SIZE(BO), %xmm11 addpd %xmm10, %xmm5 movapd 40 * SIZE(AO), %xmm10 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) mulpd %xmm12, %xmm13 mulpd 18 * SIZE(BO), %xmm12 addpd %xmm13, %xmm0 movapd 16 * SIZE(BO), %xmm13 addpd %xmm12, %xmm1 movapd 18 * SIZE(AO), %xmm12 mulpd %xmm12, %xmm13 mulpd 18 * SIZE(BO), %xmm12 addpd %xmm13, %xmm4 movapd 20 * SIZE(BO), %xmm13 addpd %xmm12, %xmm5 movapd 20 * SIZE(AO), %xmm12 mulpd %xmm12, %xmm13 mulpd 22 * SIZE(BO), %xmm12 addpd %xmm13, %xmm0 movapd 20 * SIZE(BO), %xmm13 addpd %xmm12, %xmm1 movapd 22 * SIZE(AO), %xmm12 mulpd %xmm12, %xmm13 mulpd 22 * SIZE(BO), %xmm12 addpd %xmm13, %xmm4 movapd 48 * SIZE(BO), %xmm13 addpd %xmm12, %xmm5 movapd 48 * SIZE(AO), %xmm12 PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) mulpd %xmm14, %xmm15 mulpd 26 * SIZE(BO), %xmm14 addpd %xmm15, %xmm0 movapd 24 * SIZE(BO), %xmm15 addpd %xmm14, %xmm1 movapd 26 * SIZE(AO), %xmm14 mulpd %xmm14, %xmm15 mulpd 26 * SIZE(BO), %xmm14 addpd %xmm15, %xmm4 movapd 28 * SIZE(BO), %xmm15 addpd %xmm14, %xmm5 movapd 28 * SIZE(AO), %xmm14 mulpd %xmm14, %xmm15 mulpd 30 * SIZE(BO), %xmm14 addpd %xmm15, %xmm0 movapd 28 * SIZE(BO), %xmm15 addpd %xmm14, %xmm1 movapd 30 * SIZE(AO), %xmm14 mulpd %xmm14, %xmm15 mulpd 30 * SIZE(BO), %xmm14 addpd %xmm15, %xmm4 movapd 56 * SIZE(BO), %xmm15 addpd %xmm14, %xmm5 movapd 56 * SIZE(AO), %xmm14 addq $32 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L59 ALIGN_4 .L56: movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 2 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm5 movapd 4 * SIZE(AO), %xmm8 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L56 ALIGN_4 .L59: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, 
%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm4, %xmm12 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm12 movapd 0 * SIZE(B), %xmm1 movapd 2 * SIZE(B), %xmm5 movapd 4 * SIZE(B), %xmm9 movapd 6 * SIZE(B), %xmm13 subpd %xmm0, %xmm1 subpd %xmm8, %xmm5 subpd %xmm4, %xmm9 subpd %xmm12, %xmm13 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm9 movapd 4 * SIZE(AO), %xmm10 movapd 6 * SIZE(AO), %xmm11 subpd %xmm0, %xmm8 subpd %xmm4, %xmm9 subpd %xmm1, %xmm10 subpd %xmm5, %xmm11 #endif #ifdef LN movlpd 15 * SIZE(AO), %xmm0 movhpd 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 movlpd 14 * SIZE(AO), %xmm2 movhpd 14 * SIZE(AO), %xmm2 mulpd %xmm13, %xmm2 subpd %xmm2, %xmm9 movlpd 13 * SIZE(AO), %xmm4 movhpd 13 * SIZE(AO), %xmm4 mulpd %xmm13, %xmm4 subpd %xmm4, %xmm5 movlpd 12 * SIZE(AO), %xmm6 movhpd 12 * SIZE(AO), %xmm6 mulpd %xmm13, %xmm6 subpd %xmm6, %xmm1 movlpd 10 * SIZE(AO), %xmm0 movhpd 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 movlpd 9 * SIZE(AO), %xmm2 movhpd 9 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm5 movlpd 8 * SIZE(AO), %xmm4 movhpd 8 * SIZE(AO), %xmm4 mulpd %xmm9, %xmm4 subpd %xmm4, %xmm1 movlpd 5 * SIZE(AO), %xmm0 movhpd 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 movlpd 4 * SIZE(AO), %xmm2 movhpd 4 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 #endif #ifdef LT movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 movlpd 1 * SIZE(AO), %xmm2 movhpd 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movlpd 2 * SIZE(AO), %xmm4 movhpd 2 * SIZE(AO), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm9 movlpd 3 * SIZE(AO), %xmm6 movhpd 3 * SIZE(AO), %xmm6 mulpd %xmm1, %xmm6 subpd %xmm6, %xmm13 movlpd 5 * SIZE(AO), %xmm0 movhpd 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 movlpd 6 * SIZE(AO), %xmm2 movhpd 6 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm9 movlpd 7 * SIZE(AO), %xmm4 movhpd 7 * SIZE(AO), %xmm4 mulpd %xmm5, %xmm4 subpd %xmm4, %xmm13 movlpd 10 * SIZE(AO), %xmm0 movhpd 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 movlpd 11 * SIZE(AO), %xmm2 movhpd 11 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm13 movlpd 15 * SIZE(AO), %xmm0 movhpd 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 movlpd 1 * SIZE(B), %xmm1 movhpd 1 * SIZE(B), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movlpd 1 * SIZE(B), %xmm1 movhpd 1 * SIZE(B), %xmm1 mulpd %xmm9, %xmm1 subpd %xmm1, %xmm11 movlpd 3 * SIZE(B), %xmm0 movhpd 3 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 #endif #ifdef RT movlpd 3 * SIZE(B), %xmm0 movhpd 3 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 movlpd 2 * SIZE(B), %xmm1 movhpd 2 * SIZE(B), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movlpd 2 * SIZE(B), %xmm1 movhpd 2 * SIZE(B), %xmm1 mulpd %xmm11, %xmm1 subpd %xmm1, %xmm9 movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movsd %xmm13, 3 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) movhpd %xmm9, 2 * SIZE(CO2) movhpd %xmm13, 3 * SIZE(CO2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd 
%xmm9, 3 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd %xmm11, 2 * SIZE(CO2) movhpd %xmm11, 3 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movapd %xmm5, 2 * SIZE(B) movapd %xmm9, 4 * SIZE(B) movapd %xmm13, 6 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) movlpd %xmm5, 4 * SIZE(BO) movlpd %xmm5, 5 * SIZE(BO) movhpd %xmm5, 6 * SIZE(BO) movhpd %xmm5, 7 * SIZE(BO) movlpd %xmm9, 8 * SIZE(BO) movlpd %xmm9, 9 * SIZE(BO) movhpd %xmm9, 10 * SIZE(BO) movhpd %xmm9, 11 * SIZE(BO) movlpd %xmm13, 12 * SIZE(BO) movlpd %xmm13, 13 * SIZE(BO) movhpd %xmm13, 14 * SIZE(BO) movhpd %xmm13, 15 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm9, 2 * SIZE(AO) movapd %xmm10, 4 * SIZE(AO) movapd %xmm11, 6 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L51 ALIGN_4 .L60: testq $2, M je .L70 ALIGN_4 .L61: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movapd 16 * SIZE(BO), %xmm13 movapd 24 * SIZE(BO), %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd 2 * SIZE(BO), %xmm8 addpd %xmm9, %xmm0 movapd 4 * SIZE(BO), %xmm9 addpd %xmm8, %xmm1 movapd 2 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm9, %xmm2 movapd 32 * SIZE(BO), %xmm9 addpd %xmm8, %xmm3 movapd 4 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm11 mulpd 10 * SIZE(BO), %xmm8 addpd %xmm11, %xmm0 movapd 12 * SIZE(BO), %xmm11 addpd %xmm8, %xmm1 movapd 6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm11 mulpd 14 * SIZE(BO), %xmm8 addpd %xmm11, %xmm2 movapd 40 * SIZE(BO), %xmm11 addpd %xmm8, %xmm3 movapd 16 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm10, %xmm13 mulpd 18 * SIZE(BO), %xmm10 addpd %xmm13, %xmm0 movapd 20 * SIZE(BO), %xmm13 addpd %xmm10, %xmm1 movapd 10 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm13 mulpd 22 * SIZE(BO), %xmm10 addpd %xmm13, %xmm2 movapd 48 * SIZE(BO), %xmm13 addpd %xmm10, %xmm3 movapd 12 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm15 mulpd 26 * SIZE(BO), %xmm10 addpd %xmm15, %xmm0 movapd 28 * SIZE(BO), %xmm15 addpd %xmm10, %xmm1 movapd 14 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm15 mulpd 30 * SIZE(BO), %xmm10 addpd %xmm15, %xmm2 movapd 56 * SIZE(BO), %xmm15 addpd %xmm10, %xmm3 movapd 24 * SIZE(AO), %xmm10 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L69 ALIGN_4 .L66: mulpd %xmm8, %xmm9 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm9, %xmm0 movapd 4 * SIZE(BO), %xmm9 addpd %xmm8, %xmm1 movapd 2 * SIZE(AO), %xmm8 addq $2 * SIZE, AO # aoffset 
+= 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L69: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd 0 * SIZE(B), %xmm1 movapd 2 * SIZE(B), %xmm5 subpd %xmm0, %xmm1 subpd %xmm8, %xmm5 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm10 subpd %xmm0, %xmm8 subpd %xmm1, %xmm10 #endif #ifdef LN movlpd 3 * SIZE(AO), %xmm0 movhpd 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 movlpd 2 * SIZE(AO), %xmm2 movhpd 2 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 #endif #ifdef LT movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 movlpd 1 * SIZE(AO), %xmm2 movhpd 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movlpd 3 * SIZE(AO), %xmm0 movhpd 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 movlpd 1 * SIZE(B), %xmm1 movhpd 1 * SIZE(B), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movlpd 3 * SIZE(B), %xmm0 movhpd 3 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 #endif #ifdef RT movlpd 3 * SIZE(B), %xmm0 movhpd 3 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 movlpd 2 * SIZE(B), %xmm1 movhpd 2 * SIZE(B), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movapd %xmm5, 2 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) movlpd %xmm5, 4 * SIZE(BO) movlpd %xmm5, 5 * SIZE(BO) movhpd %xmm5, 6 * SIZE(BO) movhpd %xmm5, 7 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm10, 2 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L70: testq $1, M je .L79 ALIGN_4 .L71: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movsd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movsd 4 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movsd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movsd 16 * SIZE(BO), %xmm13 movsd 24 * SIZE(BO), %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulsd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulsd 2 * SIZE(BO), %xmm8 addsd %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 
addsd %xmm8, %xmm1 movsd 1 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm9 mulsd 6 * SIZE(BO), %xmm8 addsd %xmm9, %xmm2 movsd 32 * SIZE(BO), %xmm9 addsd %xmm8, %xmm3 movsd 2 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm11 mulsd 10 * SIZE(BO), %xmm8 addsd %xmm11, %xmm0 movsd 12 * SIZE(BO), %xmm11 addsd %xmm8, %xmm1 movsd 3 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm11 mulsd 14 * SIZE(BO), %xmm8 addsd %xmm11, %xmm2 movsd 40 * SIZE(BO), %xmm11 addsd %xmm8, %xmm3 movsd 8 * SIZE(AO), %xmm8 mulsd %xmm10, %xmm13 mulsd 18 * SIZE(BO), %xmm10 addsd %xmm13, %xmm0 movsd 20 * SIZE(BO), %xmm13 addsd %xmm10, %xmm1 movsd 5 * SIZE(AO), %xmm10 mulsd %xmm10, %xmm13 mulsd 22 * SIZE(BO), %xmm10 addsd %xmm13, %xmm2 movsd 48 * SIZE(BO), %xmm13 addsd %xmm10, %xmm3 movsd 6 * SIZE(AO), %xmm10 mulsd %xmm10, %xmm15 mulsd 26 * SIZE(BO), %xmm10 addsd %xmm15, %xmm0 movsd 28 * SIZE(BO), %xmm15 addsd %xmm10, %xmm1 movsd 7 * SIZE(AO), %xmm10 mulsd %xmm10, %xmm15 mulsd 30 * SIZE(BO), %xmm10 addsd %xmm15, %xmm2 movsd 56 * SIZE(BO), %xmm15 addsd %xmm10, %xmm3 movsd 12 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulsd %xmm8, %xmm9 mulsd 2 * SIZE(BO), %xmm8 addsd %xmm9, %xmm0 addsd %xmm8, %xmm1 movsd 1 * SIZE(AO), %xmm8 movsd 4 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L76 ALIGN_4 .L78: addsd %xmm2, %xmm0 addsd %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm4 movsd 1 * SIZE(B), %xmm5 #else movsd 0 * SIZE(AO), %xmm4 movsd 1 * SIZE(AO), %xmm5 #endif subsd %xmm0, %xmm4 subsd %xmm1, %xmm5 #ifdef LN movsd 0 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 #endif #ifdef RN mulsd 0 * SIZE(B), %xmm4 movsd 1 * SIZE(B), %xmm1 mulsd %xmm4, %xmm1 subsd %xmm1, %xmm5 mulsd 3 * SIZE(B), %xmm5 #endif #ifdef RT mulsd 3 * SIZE(B), %xmm5 movlpd 2 * SIZE(B), %xmm1 mulsd %xmm5, %xmm1 subsd %xmm1, %xmm4 mulsd 0 * SIZE(B), %xmm4 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm4, 0 * SIZE(CO1) movsd %xmm5, 0 * SIZE(CO2) #if defined(LN) || defined(LT) movsd %xmm4, 0 * SIZE(B) movsd %xmm5, 1 * SIZE(B) movsd %xmm4, 0 * SIZE(BO) movsd %xmm4, 1 * SIZE(BO) movsd %xmm5, 2 * SIZE(BO) movsd %xmm5, 3 * SIZE(BO) #else movsd %xmm4, 0 * SIZE(AO) movsd %xmm5, 1 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L79: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 2), B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L80: testq $1, N je .L999 ALIGN_4 .L81: /* Copying to Sub Buffer */ #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax subq 
%rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG leaq (, %rax, SIZE), %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax jle .L83 ALIGN_4 .L82: PREFETCH 56 * SIZE(B) movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 addq $ 8 * SIZE, B addq $16 * SIZE, BO movsd %xmm0, -16 * SIZE(BO) movsd %xmm0, -15 * SIZE(BO) movsd %xmm1, -14 * SIZE(BO) movsd %xmm1, -13 * SIZE(BO) movsd %xmm2, -12 * SIZE(BO) movsd %xmm2, -11 * SIZE(BO) movsd %xmm3, -10 * SIZE(BO) movsd %xmm3, -9 * SIZE(BO) movsd %xmm4, -8 * SIZE(BO) movsd %xmm4, -7 * SIZE(BO) movsd %xmm5, -6 * SIZE(BO) movsd %xmm5, -5 * SIZE(BO) movsd %xmm6, -4 * SIZE(BO) movsd %xmm6, -3 * SIZE(BO) movsd %xmm7, -2 * SIZE(BO) movsd %xmm7, -1 * SIZE(BO) decq %rax jne .L82 ALIGN_4 .L83: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax BRANCH jle .L90 ALIGN_4 .L84: movsd 0 * SIZE(B), %xmm0 movsd %xmm0, 0 * SIZE(BO) movsd %xmm0, 1 * SIZE(BO) addq $1 * SIZE, B addq $2 * SIZE, BO decq %rax jne .L84 ALIGN_4 .L90: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT subq LDC, C #endif movq C, CO1 # coffset1 = c #ifndef RT addq LDC, C #endif movq M, I sarq $2, I # i = (m >> 2) jle .L100 ALIGN_4 .L91: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movapd 16 * SIZE(AO), %xmm12 movapd 24 * SIZE(AO), %xmm14 PREFETCHW 4 * SIZE(CO1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L95 ALIGN_4 .L92: mulpd %xmm9, %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd 2 * SIZE(AO), %xmm9 addpd %xmm8, %xmm0 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm8 mulpd 6 * SIZE(AO), %xmm9 addpd %xmm8, %xmm2 movapd 32 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) addpd %xmm9, %xmm3 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm10 mulpd 10 * SIZE(AO), %xmm9 addpd %xmm10, %xmm0 movapd 12 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movapd 6 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm10 mulpd 14 * SIZE(AO), %xmm9 addpd %xmm10, %xmm2 movapd 40 * SIZE(AO), %xmm10 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addpd %xmm9, %xmm3 movapd 16 * SIZE(BO), %xmm9 mulpd %xmm11, %xmm12 mulpd 18 * SIZE(AO), %xmm11 addpd %xmm12, %xmm0 movapd 20 * SIZE(AO), %xmm12 addpd %xmm11, %xmm1 movapd 10 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm12 mulpd 22 * SIZE(AO), %xmm11 addpd %xmm12, %xmm2 movapd 48 * SIZE(AO), %xmm12 PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) addpd %xmm11, %xmm3 movapd 12 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm14 mulpd 26 * SIZE(AO), %xmm11 addpd %xmm14, %xmm0 movapd 28 * SIZE(AO), %xmm14 addpd %xmm11, %xmm1 movapd 14 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm14 mulpd 30 * SIZE(AO), %xmm11 addpd %xmm14, %xmm2 movapd 56 * SIZE(AO), %xmm14 addpd %xmm11, %xmm3 movapd 24 * SIZE(BO), %xmm11 addq $32 
* SIZE, AO addq $16 * SIZE, BO decq %rax jne .L92 ALIGN_4 .L95: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L99 ALIGN_4 .L96: mulpd %xmm9, %xmm8 mulpd 2 * SIZE(AO), %xmm9 addpd %xmm8, %xmm0 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movapd 2 * SIZE(BO), %xmm9 addq $4 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L96 ALIGN_4 .L99: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm2 movapd 2 * SIZE(B), %xmm3 subpd %xmm0, %xmm2 subpd %xmm1, %xmm3 #else movapd 0 * SIZE(AO), %xmm2 movapd 2 * SIZE(AO), %xmm3 subpd %xmm0, %xmm2 subpd %xmm1, %xmm3 #endif #ifdef LN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd 15 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm1 movsd 14 * SIZE(AO), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm3 movsd 13 * SIZE(AO), %xmm6 mulsd %xmm1, %xmm6 subsd %xmm6, %xmm0 movsd 12 * SIZE(AO), %xmm7 mulsd %xmm1, %xmm7 subsd %xmm7, %xmm2 movsd 10 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm3 movsd 9 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm0 movsd 8 * SIZE(AO), %xmm6 mulsd %xmm3, %xmm6 subsd %xmm6, %xmm2 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 movsd 4 * SIZE(AO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef LT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 movsd 1 * SIZE(AO), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 2 * SIZE(AO), %xmm6 mulsd %xmm2, %xmm6 subsd %xmm6, %xmm3 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm2, %xmm7 subsd %xmm7, %xmm1 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm3 movsd 7 * SIZE(AO), %xmm6 mulsd %xmm0, %xmm6 subsd %xmm6, %xmm1 movsd 10 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm3 movsd 11 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm1 movsd 15 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm1 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 #endif #ifdef RT movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) movsd %xmm3, 2 * SIZE(CO1) movhpd %xmm3, 3 * SIZE(CO1) #else movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) movsd %xmm3, 2 * SIZE(CO1) movhpd %xmm3, 3 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movlpd %xmm2, 0 * SIZE(BO) movlpd %xmm2, 1 * SIZE(BO) movhpd %xmm2, 2 * SIZE(BO) movhpd %xmm2, 3 * SIZE(BO) movlpd %xmm3, 4 * SIZE(BO) movlpd %xmm3, 5 * SIZE(BO) movhpd %xmm3, 6 * SIZE(BO) movhpd %xmm3, 7 * SIZE(BO) #else movapd %xmm2, 0 * SIZE(AO) movapd %xmm3, 2 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq 
$2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L91 ALIGN_4 .L100: testq $2, M je .L110 ALIGN_4 .L101: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L105 ALIGN_4 .L102: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd 2 * SIZE(AO), %xmm8 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm9, %xmm0 movapd 16 * SIZE(BO), %xmm9 addpd %xmm8, %xmm1 movapd 4 * SIZE(AO), %xmm8 mulpd 4 * SIZE(BO), %xmm8 addpd %xmm8, %xmm2 movapd 6 * SIZE(AO), %xmm8 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm8, %xmm3 movapd 16 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm10, %xmm11 movapd 10 * SIZE(AO), %xmm10 mulpd 10 * SIZE(BO), %xmm10 addpd %xmm11, %xmm0 movapd 24 * SIZE(BO), %xmm11 addpd %xmm10, %xmm1 movapd 12 * SIZE(AO), %xmm10 mulpd 12 * SIZE(BO), %xmm10 addpd %xmm10, %xmm2 movapd 14 * SIZE(AO), %xmm10 mulpd 14 * SIZE(BO), %xmm10 addpd %xmm10, %xmm3 movapd 24 * SIZE(AO), %xmm10 addq $16 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L102 ALIGN_4 .L105: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L109 ALIGN_4 .L106: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(AO), %xmm8 movapd 2 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L106 ALIGN_4 .L109: addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm2, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm2 subpd %xmm0, %xmm2 #else movapd 0 * SIZE(AO), %xmm2 subpd %xmm0, %xmm2 #endif #ifdef LN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movsd 3 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 movsd 2 * SIZE(AO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm0, %xmm2 #endif #ifdef LT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 movsd 1 * SIZE(AO), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 3 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 unpcklpd %xmm0, %xmm2 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef RT movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) #else movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movlpd %xmm2, 0 * SIZE(BO) movlpd %xmm2, 1 * SIZE(BO) movhpd %xmm2, 2 * SIZE(BO) movhpd %xmm2, 3 * SIZE(BO) #else movapd %xmm2, 0 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq 
BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L110: testq $1, M je .L119 ALIGN_4 .L111: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movsd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movsd 4 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movsd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L115 ALIGN_4 .L112: mulsd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd 1 * SIZE(AO), %xmm8 addsd %xmm9, %xmm0 movsd 16 * SIZE(BO), %xmm9 mulsd 2 * SIZE(BO), %xmm8 addsd %xmm8, %xmm1 movsd 2 * SIZE(AO), %xmm8 mulsd 4 * SIZE(BO), %xmm8 addsd %xmm8, %xmm2 movsd 3 * SIZE(AO), %xmm8 mulsd 6 * SIZE(BO), %xmm8 addsd %xmm8, %xmm3 movsd 8 * SIZE(AO), %xmm8 mulsd %xmm10, %xmm11 movsd 5 * SIZE(AO), %xmm10 addsd %xmm11, %xmm0 movsd 24 * SIZE(BO), %xmm11 mulsd 10 * SIZE(BO), %xmm10 addsd %xmm10, %xmm1 movsd 6 * SIZE(AO), %xmm10 mulsd 12 * SIZE(BO), %xmm10 addsd %xmm10, %xmm2 movsd 7 * SIZE(AO), %xmm10 mulsd 14 * SIZE(BO), %xmm10 addsd %xmm10, %xmm3 movsd 12 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L112 ALIGN_4 .L115: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulsd %xmm8, %xmm9 movsd 1 * SIZE(AO), %xmm8 addsd %xmm9, %xmm0 movsd 2 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L116 ALIGN_4 .L118: addsd %xmm2, %xmm0 addsd %xmm3, %xmm1 addsd %xmm1, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm2 subsd %xmm0, %xmm2 #else movsd 0 * SIZE(AO), %xmm2 subsd %xmm0, %xmm2 #endif #ifdef LN movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 #endif #ifdef RN movsd 0 * SIZE(B), %xmm0 mulsd %xmm0, %xmm2 #endif #ifdef RT movsd 0 * SIZE(B), %xmm0 mulsd %xmm0, %xmm2 #endif #ifdef LN subq $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) #else movsd %xmm2, 0 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(B) movlpd %xmm2, 0 * SIZE(BO) movlpd %xmm2, 1 * SIZE(BO) #else movsd %xmm2, 0 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $1 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L119: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 1), B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq %rbx, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 
40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_LT_4x4_sse3.S000066400000000000000000002242531313527062700220760ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %r13 #define BO %r14 #define CO1 %r15 #define CO2 %rbx #define KK %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCH prefetcht1 #define PREFETCHSIZE (16 * 12 + 3) #define PREFETCH_R (4 * 4 + 0) #define KERNEL1(address) \ mulpd %xmm8, %xmm9 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ addpd %xmm9, %xmm0;\ movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm1;\ movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm2;\ movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ addpd %xmm9, %xmm3;\ movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL2(address) \ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm4;\ movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm5;\ movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm6;\ movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ addpd %xmm9, %xmm7;\ movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL3(address) \ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm0;\ movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm1;\ movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm2;\ movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ addpd %xmm9, %xmm3;\ movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL4(address) \ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm4;\ movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm5;\ movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm6;\ movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ addpd %xmm9, %xmm7;\ movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL5(address) \ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm0;\ movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm1;\ movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm2;\ movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ addpd %xmm11, %xmm3;\ movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL6(address) \ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm4;\ movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm5;\ movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, 
%xmm11;\ addpd %xmm11, %xmm6;\ movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ addpd %xmm11, %xmm7;\ movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL7(address) \ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm0;\ movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm1;\ movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm2;\ movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ addpd %xmm11, %xmm3;\ movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL8(address) \ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm4;\ movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm5;\ movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm6;\ movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ addpd %xmm11, %xmm7;\ movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL9(address) \ mulpd %xmm12, %xmm13;\ PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ addpd %xmm13, %xmm0;\ movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm1;\ movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm2;\ movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ addpd %xmm13, %xmm3;\ movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL10(address) \ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm4;\ movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm5;\ movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm6;\ movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ addpd %xmm13, %xmm7;\ movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL11(address) \ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm0;\ movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm1;\ movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm2;\ movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ addpd %xmm13, %xmm3;\ movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL12(address) \ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm4;\ movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm5;\ movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm6;\ movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ addpd %xmm13, %xmm7;\ movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL13(address) \ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm0;\ movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm1;\ movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm2;\ movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 26 * SIZE + 
(address) * 2 * SIZE(AO), %xmm14;\ addpd %xmm15, %xmm3;\ movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL14(address) \ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm4;\ movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm5;\ movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm6;\ movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ addpd %xmm15, %xmm7;\ movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL15(address) \ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm0;\ movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm1;\ movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm2;\ movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ addpd %xmm15, %xmm3;\ movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL16(address) \ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm4;\ movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm5;\ movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm6;\ movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ addpd %xmm15, %xmm7;\ movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif movq OLD_LDC, LDC movq OLD_OFFSET, KK movq KK, OFFSET leaq (, LDC, SIZE), LDC #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $2, J # j = (n >> 2) jle .L40 ALIGN_4 .L10: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 4), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 2, %rax leaq (B, %rax), BB #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I # i = (m >> 2) jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #else movq B, BO #endif prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movapd 16 * SIZE(AO), %xmm12 movddup 16 * SIZE(BO), %xmm13 movapd 24 * SIZE(AO), %xmm14 movddup 24 * SIZE(BO), 
%xmm15 prefetchnta 4 * SIZE(CO1) pxor %xmm4, %xmm4 prefetchnta 4 * SIZE(CO2) pxor %xmm5, %xmm5 prefetchnta 4 * SIZE(CO1, LDC, 2) pxor %xmm6, %xmm6 prefetchnta 4 * SIZE(CO2, LDC, 2) pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif #if 1 andq $-8, %rax salq $4, %rax je .L15 .L1X: KERNEL1 (16 * 0) KERNEL2 (16 * 0) KERNEL3 (16 * 0) KERNEL4 (16 * 0) KERNEL5 (16 * 0) KERNEL6 (16 * 0) KERNEL7 (16 * 0) KERNEL8 (16 * 0) KERNEL9 (16 * 0) KERNEL10(16 * 0) KERNEL11(16 * 0) KERNEL12(16 * 0) KERNEL13(16 * 0) KERNEL14(16 * 0) KERNEL15(16 * 0) KERNEL16(16 * 0) cmpq $128 * 1, %rax NOBRANCH jle .L12 KERNEL1 (16 * 1) KERNEL2 (16 * 1) KERNEL3 (16 * 1) KERNEL4 (16 * 1) KERNEL5 (16 * 1) KERNEL6 (16 * 1) KERNEL7 (16 * 1) KERNEL8 (16 * 1) KERNEL9 (16 * 1) KERNEL10(16 * 1) KERNEL11(16 * 1) KERNEL12(16 * 1) KERNEL13(16 * 1) KERNEL14(16 * 1) KERNEL15(16 * 1) KERNEL16(16 * 1) cmpq $128 * 2, %rax NOBRANCH jle .L12 KERNEL1 (16 * 2) KERNEL2 (16 * 2) KERNEL3 (16 * 2) KERNEL4 (16 * 2) KERNEL5 (16 * 2) KERNEL6 (16 * 2) KERNEL7 (16 * 2) KERNEL8 (16 * 2) KERNEL9 (16 * 2) KERNEL10(16 * 2) KERNEL11(16 * 2) KERNEL12(16 * 2) KERNEL13(16 * 2) KERNEL14(16 * 2) KERNEL15(16 * 2) KERNEL16(16 * 2) cmpq $128 * 3, %rax NOBRANCH jle .L12 KERNEL1 (16 * 3) KERNEL2 (16 * 3) KERNEL3 (16 * 3) KERNEL4 (16 * 3) KERNEL5 (16 * 3) KERNEL6 (16 * 3) KERNEL7 (16 * 3) KERNEL8 (16 * 3) KERNEL9 (16 * 3) KERNEL10(16 * 3) KERNEL11(16 * 3) KERNEL12(16 * 3) KERNEL13(16 * 3) KERNEL14(16 * 3) KERNEL15(16 * 3) KERNEL16(16 * 3) cmpq $128 * 4, %rax NOBRANCH jle .L12 KERNEL1 (16 * 4) KERNEL2 (16 * 4) KERNEL3 (16 * 4) KERNEL4 (16 * 4) KERNEL5 (16 * 4) KERNEL6 (16 * 4) KERNEL7 (16 * 4) KERNEL8 (16 * 4) KERNEL9 (16 * 4) KERNEL10(16 * 4) KERNEL11(16 * 4) KERNEL12(16 * 4) KERNEL13(16 * 4) KERNEL14(16 * 4) KERNEL15(16 * 4) KERNEL16(16 * 4) cmpq $128 * 5, %rax NOBRANCH jle .L12 KERNEL1 (16 * 5) KERNEL2 (16 * 5) KERNEL3 (16 * 5) KERNEL4 (16 * 5) KERNEL5 (16 * 5) KERNEL6 (16 * 5) KERNEL7 (16 * 5) KERNEL8 (16 * 5) KERNEL9 (16 * 5) KERNEL10(16 * 5) KERNEL11(16 * 5) KERNEL12(16 * 5) KERNEL13(16 * 5) KERNEL14(16 * 5) KERNEL15(16 * 5) KERNEL16(16 * 5) cmpq $128 * 6, %rax NOBRANCH jle .L12 KERNEL1 (16 * 6) KERNEL2 (16 * 6) KERNEL3 (16 * 6) KERNEL4 (16 * 6) KERNEL5 (16 * 6) KERNEL6 (16 * 6) KERNEL7 (16 * 6) KERNEL8 (16 * 6) KERNEL9 (16 * 6) KERNEL10(16 * 6) KERNEL11(16 * 6) KERNEL12(16 * 6) KERNEL13(16 * 6) KERNEL14(16 * 6) KERNEL15(16 * 6) KERNEL16(16 * 6) cmpq $128 * 7, %rax NOBRANCH jle .L12 KERNEL1 (16 * 7) KERNEL2 (16 * 7) KERNEL3 (16 * 7) KERNEL4 (16 * 7) KERNEL5 (16 * 7) KERNEL6 (16 * 7) KERNEL7 (16 * 7) KERNEL8 (16 * 7) KERNEL9 (16 * 7) KERNEL10(16 * 7) KERNEL11(16 * 7) KERNEL12(16 * 7) KERNEL13(16 * 7) KERNEL14(16 * 7) KERNEL15(16 * 7) KERNEL16(16 * 7) addq $32 * 8 * SIZE, AO addq $32 * 8 * SIZE, BO subq $128 * 8, %rax jg .L1X .L12: leaq (AO, %rax, 2), AO # * 16 leaq (BO, %rax, 2), BO # * 64 #else sarq $3, %rax je .L15 ALIGN_4 .L12: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm5 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm6 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm7 
movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm5 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm6 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 32 * SIZE(AO), %xmm8 addpd %xmm9, %xmm7 movddup 32 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 10 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 8 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm4 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm5 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm6 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 12 * SIZE(AO), %xmm10 addpd %xmm11, %xmm7 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm4 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm5 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm6 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 40 * SIZE(AO), %xmm10 addpd %xmm11, %xmm7 movddup 40 * SIZE(BO), %xmm11 mulpd %xmm12, %xmm13 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addpd %xmm13, %xmm0 movddup 17 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm1 movddup 18 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm2 movddup 19 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 movapd 18 * SIZE(AO), %xmm12 addpd %xmm13, %xmm3 movddup 16 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm4 movddup 17 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm5 movddup 18 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm6 movddup 19 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 movapd 20 * SIZE(AO), %xmm12 addpd %xmm13, %xmm7 movddup 20 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm0 movddup 21 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm1 movddup 22 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm2 movddup 23 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 movapd 22 * SIZE(AO), %xmm12 addpd %xmm13, %xmm3 movddup 20 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm4 movddup 21 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm5 movddup 22 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm6 movddup 23 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 movapd 48 * SIZE(AO), %xmm12 addpd %xmm13, %xmm7 movddup 48 * SIZE(BO), %xmm13 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm0 movddup 25 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm1 movddup 26 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm2 movddup 27 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 movapd 26 * SIZE(AO), %xmm14 addpd %xmm15, %xmm3 movddup 24 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm4 movddup 25 * SIZE(BO), %xmm15 mulpd %xmm14, 
%xmm15 addpd %xmm15, %xmm5 movddup 26 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm6 movddup 27 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 movapd 28 * SIZE(AO), %xmm14 addpd %xmm15, %xmm7 movddup 28 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm0 movddup 29 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm1 movddup 30 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm2 movddup 31 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 movapd 30 * SIZE(AO), %xmm14 addpd %xmm15, %xmm3 movddup 28 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm4 movddup 29 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm5 movddup 30 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm6 movddup 31 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 movapd 56 * SIZE(AO), %xmm14 addpd %xmm15, %xmm7 movddup 56 * SIZE(BO), %xmm15 addq $32 * SIZE, BO addq $32 * SIZE, AO decq %rax BRANCH jne .L12 #endif ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L19 ALIGN_4 .L16: mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm10 addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 0 * SIZE(BO), %xmm11 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm4 movddup 1 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm5 movddup 2 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm6 movddup 3 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm7 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L16 ALIGN_4 .L19: #if defined(LN) || defined(RT) movq KK, %rax subq $4, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm2, %xmm10 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm10 movapd %xmm4, %xmm12 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm12 movapd %xmm6, %xmm14 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm14 movapd 0 * SIZE(BO), %xmm1 movapd 2 * SIZE(BO), %xmm3 movapd 4 * SIZE(BO), %xmm5 movapd 6 * SIZE(BO), %xmm7 movapd 8 * SIZE(BO), %xmm9 movapd 10 * SIZE(BO), %xmm11 movapd 12 * SIZE(BO), %xmm13 movapd 14 * SIZE(BO), %xmm15 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 subpd %xmm8, %xmm5 subpd %xmm10, %xmm7 subpd %xmm4, %xmm9 subpd %xmm6, %xmm11 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm9 movapd 4 * SIZE(AO), %xmm10 movapd 6 * SIZE(AO), %xmm11 movapd 8 * SIZE(AO), %xmm12 movapd 10 * SIZE(AO), %xmm13 movapd 12 * SIZE(AO), %xmm14 movapd 14 * SIZE(AO), %xmm15 subpd %xmm0, %xmm8 subpd %xmm4, %xmm9 subpd %xmm1, %xmm10 subpd %xmm5, %xmm11 subpd %xmm2, %xmm12 subpd %xmm6, %xmm13 subpd %xmm3, %xmm14 subpd %xmm7, %xmm15 #endif #ifdef LN movddup 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 mulpd %xmm0, %xmm15 movddup 14 * SIZE(AO), %xmm2 mulpd %xmm13, %xmm2 subpd %xmm2, %xmm9 movddup 14 * SIZE(AO), %xmm2 mulpd %xmm15, %xmm2 subpd %xmm2, %xmm11 movddup 13 * SIZE(AO), %xmm4 mulpd %xmm13, %xmm4 subpd %xmm4, %xmm5 movddup 13 * SIZE(AO), %xmm4 mulpd %xmm15, %xmm4 subpd %xmm4, %xmm7 movddup 12 * SIZE(AO), %xmm6 mulpd %xmm13, %xmm6 subpd %xmm6, %xmm1 movddup 12 * SIZE(AO), %xmm6 mulpd %xmm15, %xmm6 subpd %xmm6, %xmm3 movddup 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 mulpd %xmm0, 
%xmm11 movddup 9 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm5 movddup 9 * SIZE(AO), %xmm2 mulpd %xmm11, %xmm2 subpd %xmm2, %xmm7 movddup 8 * SIZE(AO), %xmm4 mulpd %xmm9, %xmm4 subpd %xmm4, %xmm1 movddup 8 * SIZE(AO), %xmm4 mulpd %xmm11, %xmm4 subpd %xmm4, %xmm3 movddup 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 movddup 4 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movddup 4 * SIZE(AO), %xmm2 mulpd %xmm7, %xmm2 subpd %xmm2, %xmm3 movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 movddup 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movddup 1 * SIZE(AO), %xmm2 mulpd %xmm3, %xmm2 subpd %xmm2, %xmm7 movddup 2 * SIZE(AO), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm9 movddup 2 * SIZE(AO), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm11 movddup 3 * SIZE(AO), %xmm6 mulpd %xmm1, %xmm6 subpd %xmm6, %xmm13 movddup 3 * SIZE(AO), %xmm6 mulpd %xmm3, %xmm6 subpd %xmm6, %xmm15 movddup 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 movddup 6 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm9 movddup 6 * SIZE(AO), %xmm2 mulpd %xmm7, %xmm2 subpd %xmm2, %xmm11 movddup 7 * SIZE(AO), %xmm4 mulpd %xmm5, %xmm4 subpd %xmm4, %xmm13 movddup 7 * SIZE(AO), %xmm4 mulpd %xmm7, %xmm4 subpd %xmm4, %xmm15 movddup 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 mulpd %xmm0, %xmm11 movddup 11 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm13 movddup 11 * SIZE(AO), %xmm2 mulpd %xmm11, %xmm2 subpd %xmm2, %xmm15 movddup 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 mulpd %xmm0, %xmm15 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 movddup 1 * SIZE(BO), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movddup 1 * SIZE(BO), %xmm1 mulpd %xmm9, %xmm1 subpd %xmm1, %xmm11 movddup 2 * SIZE(BO), %xmm2 mulpd %xmm8, %xmm2 subpd %xmm2, %xmm12 movddup 2 * SIZE(BO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm13 movddup 3 * SIZE(BO), %xmm3 mulpd %xmm8, %xmm3 subpd %xmm3, %xmm14 movddup 3 * SIZE(BO), %xmm3 mulpd %xmm9, %xmm3 subpd %xmm3, %xmm15 movddup 5 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 movddup 6 * SIZE(BO), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm12 movddup 6 * SIZE(BO), %xmm1 mulpd %xmm11, %xmm1 subpd %xmm1, %xmm13 movddup 7 * SIZE(BO), %xmm2 mulpd %xmm10, %xmm2 subpd %xmm2, %xmm14 movddup 7 * SIZE(BO), %xmm2 mulpd %xmm11, %xmm2 subpd %xmm2, %xmm15 movddup 10 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm12 mulpd %xmm0, %xmm13 movddup 11 * SIZE(BO), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm14 movddup 11 * SIZE(BO), %xmm1 mulpd %xmm13, %xmm1 subpd %xmm1, %xmm15 movddup 15 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm14 mulpd %xmm0, %xmm15 #endif #ifdef RT movddup 15 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm14 mulpd %xmm0, %xmm15 movddup 14 * SIZE(BO), %xmm1 mulpd %xmm14, %xmm1 subpd %xmm1, %xmm12 movddup 14 * SIZE(BO), %xmm1 mulpd %xmm15, %xmm1 subpd %xmm1, %xmm13 movddup 13 * SIZE(BO), %xmm2 mulpd %xmm14, %xmm2 subpd %xmm2, %xmm10 movddup 13 * SIZE(BO), %xmm2 mulpd %xmm15, %xmm2 subpd %xmm2, %xmm11 movddup 12 * SIZE(BO), %xmm3 mulpd %xmm14, %xmm3 subpd %xmm3, %xmm8 movddup 12 * SIZE(BO), %xmm3 mulpd %xmm15, %xmm3 subpd %xmm3, %xmm9 movddup 10 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm12 mulpd %xmm0, %xmm13 movddup 9 * SIZE(BO), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm10 movddup 9 * SIZE(BO), %xmm1 mulpd %xmm13, %xmm1 subpd %xmm1, %xmm11 movddup 8 * SIZE(BO), %xmm2 mulpd %xmm12, %xmm2 subpd %xmm2, %xmm8 movddup 8 * SIZE(BO), %xmm2 mulpd %xmm13, %xmm2 subpd 
%xmm2, %xmm9 movddup 5 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 movddup 4 * SIZE(BO), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movddup 4 * SIZE(BO), %xmm1 mulpd %xmm11, %xmm1 subpd %xmm1, %xmm9 movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movsd %xmm13, 3 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) movhpd %xmm9, 2 * SIZE(CO2) movhpd %xmm13, 3 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO1, LDC, 2) movsd %xmm7, 1 * SIZE(CO1, LDC, 2) movsd %xmm11, 2 * SIZE(CO1, LDC, 2) movsd %xmm15, 3 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) movhpd %xmm11, 2 * SIZE(CO2, LDC, 2) movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd %xmm11, 2 * SIZE(CO2) movhpd %xmm11, 3 * SIZE(CO2) movsd %xmm12, 0 * SIZE(CO1, LDC, 2) movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) movsd %xmm13, 2 * SIZE(CO1, LDC, 2) movhpd %xmm13, 3 * SIZE(CO1, LDC, 2) movsd %xmm14, 0 * SIZE(CO2, LDC, 2) movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) movsd %xmm15, 2 * SIZE(CO2, LDC, 2) movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(BO) movapd %xmm3, 2 * SIZE(BO) movapd %xmm5, 4 * SIZE(BO) movapd %xmm7, 6 * SIZE(BO) movapd %xmm9, 8 * SIZE(BO) movapd %xmm11, 10 * SIZE(BO) movapd %xmm13, 12 * SIZE(BO) movapd %xmm15, 14 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm9, 2 * SIZE(AO) movapd %xmm10, 4 * SIZE(AO) movapd %xmm11, 6 * SIZE(AO) movapd %xmm12, 8 * SIZE(AO) movapd %xmm13, 10 * SIZE(AO) movapd %xmm14, 12 * SIZE(AO) movapd %xmm15, 14 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L11 ALIGN_4 .L20: testq $2, M BRANCH je .L30 ALIGN_4 .L21: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 
addpd %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 6 * SIZE(AO), %xmm8 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 16 * SIZE(AO), %xmm8 addpd %xmm11, %xmm3 movddup 24 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movddup 17 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm1 movddup 18 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm2 movddup 19 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 10 * SIZE(AO), %xmm10 addpd %xmm9, %xmm3 movddup 20 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movddup 21 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm1 movddup 22 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm2 movddup 23 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 12 * SIZE(AO), %xmm10 addpd %xmm9, %xmm3 movddup 32 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 25 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movddup 26 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 27 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 28 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 29 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movddup 30 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 31 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 24 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 40 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L29 ALIGN_4 .L26: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L26 ALIGN_4 .L29: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm2, %xmm10 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm10 movapd 0 * SIZE(BO), %xmm1 movapd 2 * SIZE(BO), %xmm3 movapd 4 * SIZE(BO), %xmm5 movapd 6 * SIZE(BO), %xmm7 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 subpd %xmm8, %xmm5 subpd %xmm10, %xmm7 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm10 movapd 4 * SIZE(AO), %xmm12 movapd 6 * SIZE(AO), %xmm14 subpd %xmm0, %xmm8 subpd %xmm1, %xmm10 subpd %xmm2, %xmm12 subpd %xmm3, %xmm14 #endif #ifdef LN movddup 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 movddup 2 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movddup 2 * SIZE(AO), %xmm2 mulpd %xmm7, %xmm2 subpd %xmm2, %xmm3 movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 movddup 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movddup 1 * SIZE(AO), %xmm2 mulpd 
%xmm3, %xmm2 subpd %xmm2, %xmm7 movddup 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 movddup 1 * SIZE(BO), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movddup 2 * SIZE(BO), %xmm2 mulpd %xmm8, %xmm2 subpd %xmm2, %xmm12 movddup 3 * SIZE(BO), %xmm3 mulpd %xmm8, %xmm3 subpd %xmm3, %xmm14 movddup 5 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 movddup 6 * SIZE(BO), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm12 movddup 7 * SIZE(BO), %xmm2 mulpd %xmm10, %xmm2 subpd %xmm2, %xmm14 movddup 10 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm12 movddup 11 * SIZE(BO), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm14 movddup 15 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm14 #endif #ifdef RT movddup 15 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm14 movddup 14 * SIZE(BO), %xmm1 mulpd %xmm14, %xmm1 subpd %xmm1, %xmm12 movddup 13 * SIZE(BO), %xmm2 mulpd %xmm14, %xmm2 subpd %xmm2, %xmm10 movddup 12 * SIZE(BO), %xmm3 mulpd %xmm14, %xmm3 subpd %xmm3, %xmm8 movddup 10 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm12 movddup 9 * SIZE(BO), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm10 movddup 8 * SIZE(BO), %xmm2 mulpd %xmm12, %xmm2 subpd %xmm2, %xmm8 movddup 5 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 movddup 4 * SIZE(BO), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO1, LDC, 2) movsd %xmm7, 1 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd %xmm12, 0 * SIZE(CO1, LDC, 2) movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) movsd %xmm14, 0 * SIZE(CO2, LDC, 2) movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(BO) movapd %xmm3, 2 * SIZE(BO) movapd %xmm5, 4 * SIZE(BO) movapd %xmm7, 6 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm10, 2 * SIZE(AO) movapd %xmm12, 4 * SIZE(AO) movapd %xmm14, 6 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: testq $1, M je .L39 ALIGN_4 .L31: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movddup 4 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L35 ALIGN_4 .L32: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 1 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movapd 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movapd 10 * SIZE(BO), %xmm11 mulpd 
%xmm8, %xmm11 movddup 3 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movapd 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movapd 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movddup 8 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movapd 24 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movapd 18 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movddup 5 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movapd 20 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movapd 22 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movddup 6 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movapd 32 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movapd 26 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movddup 7 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movapd 28 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movapd 30 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movddup 12 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movapd 40 * SIZE(BO), %xmm11 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 1 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm2 movapd 2 * SIZE(BO), %xmm3 subpd %xmm0, %xmm2 subpd %xmm1, %xmm3 #else movapd 0 * SIZE(AO), %xmm2 movapd 2 * SIZE(AO), %xmm3 subpd %xmm0, %xmm2 subpd %xmm1, %xmm3 #endif #ifdef LN movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 #endif #ifdef RN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd 0 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm2 movsd 1 * SIZE(BO), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 2 * SIZE(BO), %xmm6 mulsd %xmm2, %xmm6 subsd %xmm6, %xmm3 movsd 3 * SIZE(BO), %xmm7 mulsd %xmm2, %xmm7 subsd %xmm7, %xmm1 movsd 5 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm0 movsd 6 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm3 movsd 7 * SIZE(BO), %xmm6 mulsd %xmm0, %xmm6 subsd %xmm6, %xmm1 movsd 10 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm3 movsd 11 * SIZE(BO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm1 movsd 15 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm1 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef RT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd 15 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm1 movsd 14 * SIZE(BO), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm3 movsd 13 * SIZE(BO), %xmm6 mulsd %xmm1, %xmm6 subsd %xmm6, %xmm0 movsd 12 * SIZE(BO), %xmm7 mulsd %xmm1, %xmm7 subsd %xmm7, %xmm2 movsd 10 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm3 movsd 9 * SIZE(BO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm0 movsd 8 * SIZE(BO), %xmm6 mulsd %xmm3, %xmm6 subsd %xmm6, %xmm2 movsd 5 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm0 movsd 4 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 0 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO2) movsd 
%xmm3, 0 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) #else movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(BO) movapd %xmm3, 2 * SIZE(BO) #else movapd %xmm2, 0 * SIZE(AO) movapd %xmm3, 2 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif decq J # j -- jg .L10 ALIGN_4 .L40: testq $2, N je .L80 ALIGN_4 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 1, %rax leaq (B, %rax), BB #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #else movq B, BO #endif prefetcht0 0 * SIZE(BB) subq $-4 * SIZE, BB movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm4, %xmm4 movddup 8 * SIZE(BO), %xmm11 pxor %xmm5, %xmm5 #ifdef HAVE_3DNOW prefetchw 4 * SIZE(CO1) prefetchw 4 * SIZE(CO2) #else prefetchnta 4 * SIZE(CO1) prefetchnta 4 * SIZE(CO2) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L55 ALIGN_4 .L52: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm5 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 16 * SIZE(AO), %xmm8 addpd %xmm9, %xmm5 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 10 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm4 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 12 * SIZE(AO), %xmm10 addpd %xmm9, %xmm5 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 14 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm4 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 40 * SIZE(AO), %xmm10 addpd %xmm9, %xmm5 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addpd %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 
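/* .L52: k-loop (unrolled 8x) for the 4-row tile of the 2-column panel; per k, two broadcast B values are multiplied into two A pairs, accumulating the 4x2 block in xmm0, xmm1, xmm4 and xmm5. */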
mulpd %xmm8, %xmm11 movapd 18 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 8 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm4 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 20 * SIZE(AO), %xmm8 addpd %xmm11, %xmm5 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 22 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm4 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 24 * SIZE(AO), %xmm8 addpd %xmm11, %xmm5 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 26 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm4 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 28 * SIZE(AO), %xmm8 addpd %xmm11, %xmm5 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 30 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm4 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 32 * SIZE(AO), %xmm8 addpd %xmm11, %xmm5 movddup 24 * SIZE(BO), %xmm11 addq $32 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L59 ALIGN_4 .L56: mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm10 addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 0 * SIZE(BO), %xmm11 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 movapd 4 * SIZE(AO), %xmm8 addpd %xmm11, %xmm4 movddup 1 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm5 addq $4 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L56 ALIGN_4 .L59: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm4, %xmm12 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm12 movapd 0 * SIZE(BO), %xmm1 movapd 2 * SIZE(BO), %xmm5 movapd 4 * SIZE(BO), %xmm9 movapd 6 * SIZE(BO), %xmm13 subpd %xmm0, %xmm1 subpd %xmm8, %xmm5 subpd %xmm4, %xmm9 subpd %xmm12, %xmm13 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm9 movapd 4 * SIZE(AO), %xmm10 movapd 6 * SIZE(AO), %xmm11 subpd %xmm0, %xmm8 subpd %xmm4, %xmm9 subpd %xmm1, %xmm10 subpd %xmm5, %xmm11 #endif #ifdef LN movddup 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 movddup 14 * SIZE(AO), %xmm2 mulpd %xmm13, %xmm2 subpd %xmm2, %xmm9 movddup 13 * SIZE(AO), %xmm4 mulpd %xmm13, %xmm4 subpd %xmm4, %xmm5 movddup 12 * SIZE(AO), %xmm6 mulpd %xmm13, %xmm6 subpd %xmm6, %xmm1 movddup 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 movddup 9 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm5 movddup 8 * SIZE(AO), %xmm4 mulpd %xmm9, %xmm4 subpd %xmm4, %xmm1 movddup 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 movddup 4 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 movddup 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movddup 2 * SIZE(AO), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm9 movddup 3 * SIZE(AO), %xmm6 mulpd %xmm1, %xmm6 subpd %xmm6, %xmm13 movddup 5 * SIZE(AO), %xmm0 mulpd 
%xmm0, %xmm5 movddup 6 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm9 movddup 7 * SIZE(AO), %xmm4 mulpd %xmm5, %xmm4 subpd %xmm4, %xmm13 movddup 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 movddup 11 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm13 movddup 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 movddup 1 * SIZE(BO), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movddup 1 * SIZE(BO), %xmm1 mulpd %xmm9, %xmm1 subpd %xmm1, %xmm11 movddup 3 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 #endif #ifdef RT movddup 3 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 movddup 2 * SIZE(BO), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movddup 2 * SIZE(BO), %xmm1 mulpd %xmm11, %xmm1 subpd %xmm1, %xmm9 movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movsd %xmm13, 3 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) movhpd %xmm9, 2 * SIZE(CO2) movhpd %xmm13, 3 * SIZE(CO2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd %xmm11, 2 * SIZE(CO2) movhpd %xmm11, 3 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(BO) movapd %xmm5, 2 * SIZE(BO) movapd %xmm9, 4 * SIZE(BO) movapd %xmm13, 6 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm9, 2 * SIZE(AO) movapd %xmm10, 4 * SIZE(AO) movapd %xmm11, 6 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L51 ALIGN_4 .L60: testq $2, M je .L70 ALIGN_4 .L61: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 16 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 10 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd 
%xmm10, %xmm11 movapd 12 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 24 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 24 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L69 ALIGN_4 .L66: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L69: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd 0 * SIZE(BO), %xmm1 movapd 2 * SIZE(BO), %xmm5 subpd %xmm0, %xmm1 subpd %xmm8, %xmm5 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm10 subpd %xmm0, %xmm8 subpd %xmm1, %xmm10 #endif #ifdef LN movddup 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 movddup 2 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 movddup 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movddup 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 movddup 1 * SIZE(BO), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movddup 3 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 #endif #ifdef RT movddup 3 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 movddup 2 * SIZE(BO), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(BO) movapd %xmm5, 2 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm10, 2 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L70: testq $1, M je .L79 ALIGN_4 .L71: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movddup 4 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulpd %xmm8, %xmm9 PREFETCH 
(PREFETCHSIZE + 0) * SIZE(AO) movddup 1 * SIZE(AO), %xmm8 addpd %xmm9, %xmm0 mulpd 2 * SIZE(BO), %xmm8 movapd 16 * SIZE(BO), %xmm9 addpd %xmm8, %xmm1 movddup 2 * SIZE(AO), %xmm8 mulpd 4 * SIZE(BO), %xmm8 addpd %xmm8, %xmm2 movddup 3 * SIZE(AO), %xmm8 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm8, %xmm3 movddup 8 * SIZE(AO), %xmm8 mulpd %xmm10, %xmm11 movddup 5 * SIZE(AO), %xmm10 addpd %xmm11, %xmm0 mulpd 10 * SIZE(BO), %xmm10 movapd 24 * SIZE(BO), %xmm11 addpd %xmm10, %xmm1 movddup 6 * SIZE(AO), %xmm10 mulpd 12 * SIZE(BO), %xmm10 addpd %xmm10, %xmm2 movddup 7 * SIZE(AO), %xmm10 mulpd 14 * SIZE(BO), %xmm10 addpd %xmm10, %xmm3 movddup 12 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulpd %xmm8, %xmm9 movddup 1 * SIZE(AO), %xmm8 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L76 ALIGN_4 .L78: addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm2, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm2 subpd %xmm0, %xmm2 #else movapd 0 * SIZE(AO), %xmm2 subpd %xmm0, %xmm2 #endif #ifdef LN movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef RN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movsd 0 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm2 movsd 1 * SIZE(BO), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 3 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm0 unpcklpd %xmm0, %xmm2 #endif #ifdef RT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movsd 3 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm0 movsd 2 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 0 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm0, %xmm2 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO2) #else movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(BO) #else movapd %xmm2, 0 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L79: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L80: testq $1, N je .L999 ALIGN_4 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I # i = (m >> 2) jle .L100 ALIGN_4 .L91: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * 
SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 4 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #ifdef HAVE_3DNOW prefetchw 4 * SIZE(CO1) #else prefetchnta 4 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L95 ALIGN_4 .L92: mulpd %xmm9, %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd 2 * SIZE(AO), %xmm9 addpd %xmm8, %xmm0 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm8 mulpd 6 * SIZE(AO), %xmm9 addpd %xmm8, %xmm2 movapd 16 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm10 mulpd 10 * SIZE(AO), %xmm9 addpd %xmm10, %xmm0 movapd 12 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm10 mulpd 14 * SIZE(AO), %xmm9 addpd %xmm10, %xmm2 movapd 24 * SIZE(AO), %xmm10 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addpd %xmm9, %xmm3 movddup 8 * SIZE(BO), %xmm9 mulpd %xmm11, %xmm8 mulpd 18 * SIZE(AO), %xmm11 addpd %xmm8, %xmm0 movapd 20 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 5 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm8 mulpd 22 * SIZE(AO), %xmm11 addpd %xmm8, %xmm2 movapd 32 * SIZE(AO), %xmm8 addpd %xmm11, %xmm3 movddup 6 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm10 mulpd 26 * SIZE(AO), %xmm11 addpd %xmm10, %xmm0 movapd 28 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movddup 7 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm10 mulpd 30 * SIZE(AO), %xmm11 addpd %xmm10, %xmm2 movapd 40 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 addq $32 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L92 ALIGN_4 .L95: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L99 ALIGN_4 .L96: mulpd %xmm9, %xmm8 mulpd 2 * SIZE(AO), %xmm9 addpd %xmm8, %xmm0 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 1 * SIZE(BO), %xmm9 addq $4 * SIZE, AO # aoffset += 4 addq $1 * SIZE, BO # boffset1 += 8 decq %rax jg .L96 ALIGN_4 .L99: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm2 movapd 2 * SIZE(BO), %xmm3 subpd %xmm0, %xmm2 subpd %xmm1, %xmm3 #else movapd 0 * SIZE(AO), %xmm2 movapd 2 * SIZE(AO), %xmm3 subpd %xmm0, %xmm2 subpd %xmm1, %xmm3 #endif #ifdef LN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd 15 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm1 movsd 14 * SIZE(AO), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm3 movsd 13 * SIZE(AO), %xmm6 mulsd %xmm1, %xmm6 subsd %xmm6, %xmm0 movsd 12 * SIZE(AO), %xmm7 mulsd %xmm1, %xmm7 subsd %xmm7, %xmm2 movsd 10 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm3 movsd 9 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm0 movsd 8 * SIZE(AO), %xmm6 mulsd %xmm3, %xmm6 subsd %xmm6, %xmm2 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 movsd 4 * SIZE(AO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef LT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 movsd 1 * SIZE(AO), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 2 * SIZE(AO), %xmm6 mulsd %xmm2, %xmm6 subsd %xmm6, %xmm3 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm2, %xmm7 subsd %xmm7, %xmm1 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 
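/* LT solve of the 4x1 tile: scalar forward substitution through the packed 4x4 triangle of A -- each unknown is multiplied by the stored diagonal entry (offsets 0, 5, 10, 15; presumably packed pre-inverted, since mulsd rather than divsd is used) and then eliminated from the remaining entries. */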
movsd 6 * SIZE(AO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm3 movsd 7 * SIZE(AO), %xmm6 mulsd %xmm0, %xmm6 subsd %xmm6, %xmm1 movsd 10 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm3 movsd 11 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm1 movsd 15 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm1 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 #endif #ifdef RT movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) movsd %xmm3, 2 * SIZE(CO1) movhpd %xmm3, 3 * SIZE(CO1) #else movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) movsd %xmm3, 2 * SIZE(CO1) movhpd %xmm3, 3 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(BO) movapd %xmm3, 2 * SIZE(BO) #else movapd %xmm2, 0 * SIZE(AO) movapd %xmm3, 2 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L91 ALIGN_4 .L100: testq $2, M je .L110 ALIGN_4 .L101: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 4 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L105 ALIGN_4 .L102: mulpd %xmm9, %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movddup 1 * SIZE(BO), %xmm9 addpd %xmm8, %xmm0 mulpd 2 * SIZE(AO), %xmm9 movapd 16 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd 4 * SIZE(AO), %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd 6 * SIZE(AO), %xmm9 addpd %xmm9, %xmm3 movddup 8 * SIZE(BO), %xmm9 mulpd %xmm11, %xmm10 movddup 5 * SIZE(BO), %xmm11 addpd %xmm10, %xmm0 mulpd 10 * SIZE(AO), %xmm11 movapd 24 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movddup 6 * SIZE(BO), %xmm11 mulpd 12 * SIZE(AO), %xmm11 addpd %xmm11, %xmm2 movddup 7 * SIZE(BO), %xmm11 mulpd 14 * SIZE(AO), %xmm11 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $ 8 * SIZE, BO decq %rax jne .L102 ALIGN_4 .L105: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L109 ALIGN_4 .L106: mulpd %xmm9, %xmm8 movddup 1 * SIZE(BO), %xmm9 addpd %xmm8, %xmm0 movapd 2 * SIZE(AO), %xmm8 addq $2 * SIZE, AO # aoffset += 4 addq $1 * SIZE, BO # boffset1 += 8 decq %rax jg .L106 ALIGN_4 .L109: addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm2, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm2 subpd %xmm0, %xmm2 #else movapd 0 * SIZE(AO), %xmm2 subpd %xmm0, %xmm2 #endif #ifdef LN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movsd 3 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 movsd 2 * SIZE(AO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 
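/* LN 2x1 case: back substitution on two scalars; the pair is repacked with unpcklpd below before being stored to C and written back to the packed buffer. */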
unpcklpd %xmm0, %xmm2 #endif #ifdef LT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 movsd 1 * SIZE(AO), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 3 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 unpcklpd %xmm0, %xmm2 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef RT movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) #else movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(BO) #else movapd %xmm2, 0 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L110: testq $1, M je .L119 ALIGN_4 .L111: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm9 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm8 pxor %xmm1, %xmm1 movapd 4 * SIZE(AO), %xmm11 pxor %xmm2, %xmm2 movapd 4 * SIZE(BO), %xmm10 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L115 ALIGN_4 .L112: mulpd %xmm9, %xmm8 movapd 2 * SIZE(AO), %xmm9 addpd %xmm8, %xmm0 mulpd 2 * SIZE(BO), %xmm9 movapd 8 * SIZE(BO), %xmm8 addpd %xmm9, %xmm1 movapd 8 * SIZE(AO), %xmm9 mulpd %xmm11, %xmm10 movapd 6 * SIZE(AO), %xmm11 addpd %xmm10, %xmm0 mulpd 6 * SIZE(BO), %xmm11 movapd 12 * SIZE(BO), %xmm10 addpd %xmm11, %xmm1 movapd 12 * SIZE(AO), %xmm11 addq $8 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L112 ALIGN_4 .L115: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulsd 0 * SIZE(BO), %xmm9 addsd %xmm9, %xmm0 movsd 1 * SIZE(AO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $1 * SIZE, BO # boffset1 += 8 decq %rax jg .L116 ALIGN_4 .L118: addpd %xmm1, %xmm0 haddpd %xmm0, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm2 subsd %xmm0, %xmm2 #else movsd 0 * SIZE(AO), %xmm2 subsd %xmm0, %xmm2 #endif #ifdef LN movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 #endif #ifdef RN movsd 0 * SIZE(BO), %xmm0 mulsd %xmm0, %xmm2 #endif #ifdef RT movsd 0 * SIZE(BO), %xmm0 mulsd %xmm0, %xmm2 #endif #ifdef LN subq $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) #else movsd %xmm2, 0 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(BO) #else movsd %xmm2, 0 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L119: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) 
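/* End of the single-column panel: for LT/RN the new B is simply the final BO (the k x 1 panel has been consumed); the offset counter KK is then adjusted for RN/RT before falling through to the epilogue at .L999. */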
movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_2 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_LT_4x8_nehalem.S000066400000000000000000002537451313527062700226460ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define KK %rdx #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCHSIZE (16 * 1 + 4) #define PREFETCH prefetcht0 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif subq $-32 * SIZE, A subq $-32 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K movq OLD_LDC, LDC movq OLD_OFFSET, KK leaq (, LDC, SIZE), LDC movq KK, OFFSET negq KK #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $3, J NOBRANCH jle .L40 ALIGN_4 .L10: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $3 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 8), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 4), CO2 #ifndef RT leaq (C, LDC, 8), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 3, %rax leaq (B, %rax), BB movq M, I sarq $2, I NOBRANCH jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 8), BO #else movq B, BO #endif prefetchnta -32 * SIZE(BB) subq $-16 * SIZE, BB xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 leaq (LDC, LDC, 2), %rax xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht2 4 * SIZE(CO1, LDC, 1) xorps %xmm10, %xmm10 prefetcht2 4 * SIZE(CO1, LDC, 2) xorps %xmm11, %xmm11 prefetcht2 4 * SIZE(CO1, %rax, 1) xorps %xmm12, %xmm12 prefetcht2 4 * SIZE(CO2) xorps %xmm13, %xmm13 prefetcht2 4 * SIZE(CO2, LDC, 1) xorps %xmm14, %xmm14 prefetcht2 4 * SIZE(CO2, LDC, 2) xorps %xmm15, %xmm15 prefetcht2 4 * SIZE(CO2, %rax, 1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm12 movaps -32 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 
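/* .L12: main k-loop (unrolled 4x) of the 4x8 single-precision tile. Each 4-wide A vector is multiplied against B vectors that are successively rotated with pshufd $0x39; the skewed partial sums kept in xmm8-xmm15 are untangled with shufps after the loop (.L18). */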
mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm0, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm12 movaps -24 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm0, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 addps %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 addps %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm0, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 addps %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm0 addps %xmm1, %xmm12 movaps -8 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm0, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 addps %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 subq $-32 * SIZE, BO pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addps %xmm1, %xmm12 movaps -32 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm0, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $8, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 8), BO #endif addps %xmm1, %xmm12 addps %xmm2, %xmm13 addps %xmm3, %xmm14 addps %xmm4, %xmm15 #if defined(LN) || defined(LT) movaps %xmm8, %xmm4 shufps $0x88, %xmm9, %xmm8 movaps %xmm10, %xmm5 shufps $0x88, %xmm11, %xmm10 shufps $0xdd, %xmm11, %xmm4 shufps $0xdd, %xmm9, %xmm5 movaps %xmm8, %xmm6 shufps $0x88, %xmm10, %xmm8 shufps $0xdd, %xmm6, %xmm10 movaps %xmm4, %xmm9 
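/* LN/LT path: rearrange the skewed accumulators to match the layout of the packed B panel; the resulting 4x8 product is subtracted from the right-hand-side values loaded from BO (subps below). */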
movaps %xmm5, %xmm11 shufps $0x22, %xmm5, %xmm9 shufps $0x77, %xmm4, %xmm11 movaps %xmm12, %xmm4 shufps $0x88, %xmm13, %xmm12 movaps %xmm14, %xmm5 shufps $0x88, %xmm15, %xmm14 shufps $0xdd, %xmm15, %xmm4 shufps $0xdd, %xmm13, %xmm5 movaps %xmm12, %xmm6 shufps $0x88, %xmm14, %xmm12 shufps $0xdd, %xmm6, %xmm14 movaps %xmm4, %xmm13 movaps %xmm5, %xmm15 shufps $0x22, %xmm5, %xmm13 shufps $0x77, %xmm4, %xmm15 movaps -32 * SIZE(BO), %xmm0 movaps -28 * SIZE(BO), %xmm4 movaps -24 * SIZE(BO), %xmm1 movaps -20 * SIZE(BO), %xmm5 movaps -16 * SIZE(BO), %xmm2 movaps -12 * SIZE(BO), %xmm6 movaps -8 * SIZE(BO), %xmm3 movaps -4 * SIZE(BO), %xmm7 #else movaps %xmm9, %xmm4 shufps $0xd8, %xmm8, %xmm9 shufps $0xd8, %xmm11, %xmm8 shufps $0xd8, %xmm10, %xmm11 shufps $0xd8, %xmm4, %xmm10 movaps %xmm8, %xmm4 shufps $0xd8, %xmm10, %xmm8 shufps $0xd8, %xmm4, %xmm10 movaps %xmm9, %xmm5 shufps $0xd8, %xmm11, %xmm9 shufps $0xd8, %xmm5, %xmm11 movaps %xmm13, %xmm4 shufps $0xd8, %xmm12, %xmm13 shufps $0xd8, %xmm15, %xmm12 shufps $0xd8, %xmm14, %xmm15 shufps $0xd8, %xmm4, %xmm14 movaps %xmm12, %xmm4 shufps $0xd8, %xmm14, %xmm12 shufps $0xd8, %xmm4, %xmm14 movaps %xmm13, %xmm5 shufps $0xd8, %xmm15, %xmm13 shufps $0xd8, %xmm5, %xmm15 movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps -24 * SIZE(AO), %xmm2 movaps -20 * SIZE(AO), %xmm3 movaps -16 * SIZE(AO), %xmm4 movaps -12 * SIZE(AO), %xmm5 movaps -8 * SIZE(AO), %xmm6 movaps -4 * SIZE(AO), %xmm7 #endif subps %xmm8, %xmm0 subps %xmm9, %xmm1 subps %xmm10, %xmm2 subps %xmm11, %xmm3 subps %xmm12, %xmm4 subps %xmm13, %xmm5 subps %xmm14, %xmm6 subps %xmm15, %xmm7 #ifdef LN movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 mulps %xmm15, %xmm7 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm1 pshufd $0x55, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm4 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 mulps %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm1 pshufd $0x55, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm4 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 mulps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm4 movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 mulps %xmm15, %xmm4 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 mulps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0x55, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm3 pshufd $0xff, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm7 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 mulps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm2 pshufd $0xaa, %xmm8, %xmm15 mulps 
%xmm5, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm3 pshufd $0xff, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm7 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 mulps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 pshufd $0xff, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm7 movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 mulps %xmm15, %xmm7 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm3 movaps -28 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm7 movaps -24 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm3 movaps -20 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm7 movaps -16 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 movaps -12 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm7 movaps -8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 movaps -4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm7 movaps 4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm7 movaps 12 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm7 movaps 20 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm7 movaps 28 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm7 #endif #ifdef RT movaps 28 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm7 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm4 movaps 24 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, 
%xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm0 movaps 20 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm4 movaps 16 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm0 movaps 12 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm4 movaps 8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm0 movaps 4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm4 movaps 0 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm0 movaps -8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm0 movaps -16 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 movaps -24 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm0, -32 * SIZE(BO) movaps %xmm4, -28 * SIZE(BO) movaps %xmm1, -24 * SIZE(BO) movaps %xmm5, -20 * SIZE(BO) movaps %xmm2, -16 * SIZE(BO) movaps %xmm6, -12 * SIZE(BO) movaps %xmm3, -8 * SIZE(BO) movaps %xmm7, -4 * SIZE(BO) movaps %xmm0, %xmm8 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm8, %xmm1 movaps %xmm2, %xmm9 shufps $0x88, %xmm3, %xmm2 shufps $0xdd, %xmm9, %xmm3 movaps %xmm0, %xmm8 shufps $0x88, %xmm2, %xmm0 movaps %xmm1, %xmm9 shufps $0x22, %xmm3, %xmm1 shufps $0xdd, %xmm2, %xmm8 movaps %xmm8, %xmm2 shufps $0x77, %xmm3, %xmm9 movaps %xmm9, %xmm3 movaps %xmm4, %xmm8 shufps $0x88, %xmm5, %xmm4 shufps $0xdd, %xmm8, %xmm5 movaps %xmm6, %xmm9 shufps $0x88, %xmm7, %xmm6 shufps $0xdd, %xmm9, %xmm7 movaps %xmm4, %xmm8 shufps $0x88, %xmm6, %xmm4 movaps %xmm5, %xmm9 shufps $0x22, %xmm7, %xmm5 shufps $0xdd, %xmm6, %xmm8 movaps %xmm8, %xmm6 shufps $0x77, %xmm7, %xmm9 movaps %xmm9, %xmm7 #else movaps %xmm0, -32 * SIZE(AO) movaps %xmm1, -28 * SIZE(AO) movaps %xmm2, -24 * SIZE(AO) movaps %xmm3, -20 * SIZE(AO) movaps %xmm4, -16 * SIZE(AO) movaps %xmm5, -12 * SIZE(AO) movaps %xmm6, -8 * SIZE(AO) movaps %xmm7, -4 * SIZE(AO) #endif leaq (LDC, LDC, 2), %rax movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) 
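/* Store the solved 4x8 tile to C: each xmm register holds one four-element column, written with a movsd/movhps pair to the eight columns CO1, CO1+LDC, ..., CO2+3*LDC. */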
movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 2 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movhps %xmm2, 2 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO1, %rax, 1) movhps %xmm3, 2 * SIZE(CO1, %rax, 1) movsd %xmm4, 0 * SIZE(CO2) movhps %xmm4, 2 * SIZE(CO2) movsd %xmm5, 0 * SIZE(CO2, LDC, 1) movhps %xmm5, 2 * SIZE(CO2, LDC, 1) movsd %xmm6, 0 * SIZE(CO2, LDC, 2) movhps %xmm6, 2 * SIZE(CO2, LDC, 2) movsd %xmm7, 0 * SIZE(CO2, %rax, 1) movhps %xmm7, 2 * SIZE(CO2, %rax, 1) #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L11 ALIGN_4 .L20: testq $2, M BRANCH jle .L30 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 8), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movaps -32 * SIZE(BO), %xmm5 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_3 .L22: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -28 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps -24 * SIZE(BO), %xmm5 movddup -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -20 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps -16 * SIZE(BO), %xmm5 movddup -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -12 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps -8 * SIZE(BO), %xmm5 movddup -26 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -4 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps 0 * SIZE(BO), %xmm5 movddup -24 * SIZE(AO), %xmm0 subq $-32 * SIZE, BO subq $ -8 * SIZE, AO subq $1, %rax BRANCH jg .L22 ALIGN_3 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -28 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps -24 * SIZE(BO), %xmm5 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_3 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef 
LN subq $2, %rax #else subq $8, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 8), BO #endif addps %xmm1, %xmm8 addps %xmm2, %xmm9 addps %xmm3, %xmm10 addps %xmm4, %xmm11 #if defined(LN) || defined(LT) movaps %xmm8, %xmm4 shufps $0x88, %xmm9, %xmm8 shufps $0xdd, %xmm9, %xmm4 movaps %xmm10, %xmm5 shufps $0x88, %xmm11, %xmm10 shufps $0xdd, %xmm11, %xmm5 movaps -32 * SIZE(BO), %xmm0 movaps -28 * SIZE(BO), %xmm2 movaps -24 * SIZE(BO), %xmm1 movaps -20 * SIZE(BO), %xmm3 subps %xmm8, %xmm0 subps %xmm4, %xmm1 subps %xmm10, %xmm2 subps %xmm5, %xmm3 #else movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm2 movaps -24 * SIZE(AO), %xmm4 movaps -20 * SIZE(AO), %xmm6 subps %xmm8, %xmm0 subps %xmm9, %xmm2 subps %xmm10, %xmm4 subps %xmm11, %xmm6 movhlps %xmm0, %xmm1 movhlps %xmm2, %xmm3 movhlps %xmm4, %xmm5 movhlps %xmm6, %xmm7 #endif #ifdef LN movaps -32 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 mulps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 mulps %xmm15, %xmm2 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 mulps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 mulps %xmm15, %xmm3 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm3 movaps -28 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm7 movaps -24 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm3 movaps -20 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm7 movaps -16 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 movaps -12 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm7 movaps -8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 movaps -4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm7 movaps 4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps 
%xmm4, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm7 movaps 12 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm7 movaps 20 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm7 movaps 28 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm7 #endif #ifdef RT movaps 28 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm7 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm4 movaps 24 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm0 movaps 20 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm4 movaps 16 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm0 movaps 12 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm4 movaps 8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm0 movaps 4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm4 movaps 0 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm0 movaps -8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm0 movaps -16 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 movaps -24 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif leaq (LDC, LDC, 2), %rax #if defined(LN) || defined(LT) movaps %xmm0, -32 * SIZE(BO) movaps %xmm2, -28 * SIZE(BO) movaps %xmm1, -24 * SIZE(BO) movaps %xmm3, -20 * SIZE(BO) movaps %xmm0, %xmm4 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm4 movaps 
%xmm2, %xmm5 unpcklps %xmm3, %xmm2 unpckhps %xmm3, %xmm5 movsd %xmm0, (CO1) movhps %xmm0, (CO1, LDC, 1) movsd %xmm4, (CO1, LDC, 2) movhps %xmm4, (CO1, %rax, 1) movsd %xmm2, (CO2) movhps %xmm2, (CO2, LDC, 1) movsd %xmm5, (CO2, LDC, 2) movhps %xmm5, (CO2, %rax, 1) #else movlhps %xmm1, %xmm0 movlhps %xmm3, %xmm2 movlhps %xmm5, %xmm4 movlhps %xmm7, %xmm6 movaps %xmm0, -32 * SIZE(AO) movaps %xmm2, -28 * SIZE(AO) movaps %xmm4, -24 * SIZE(AO) movaps %xmm6, -20 * SIZE(AO) movsd %xmm0, (CO1) movsd %xmm1, (CO1, LDC, 1) movsd %xmm2, (CO1, LDC, 2) movsd %xmm3, (CO1, %rax, 1) movsd %xmm4, (CO2) movsd %xmm5, (CO2, LDC, 1) movsd %xmm6, (CO2, LDC, 2) movsd %xmm7, (CO2, %rax, 1) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: testq $1, M BRANCH jle .L39 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 8), BO #else movq B, BO #endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 xorps %xmm8, %xmm8 xorps %xmm12, %xmm12 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_3 .L32: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -28 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 pshufd $0x55, %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movaps -24 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -20 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -16 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -12 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 pshufd $0x55, %xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movaps -8 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -4 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 subq $-32 * SIZE, BO subq $ -4 * SIZE, AO subq $1, %rax BRANCH jg .L32 ALIGN_3 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -28 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 addq $1 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_3 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $8, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 8), BO #endif addps %xmm2, %xmm8 addps %xmm3, %xmm12 #if defined(LN) || defined(LT) movaps -32 * SIZE(BO), %xmm0 movaps -28 * SIZE(BO), %xmm4 subps %xmm8, %xmm0 subps %xmm12, %xmm4 #else movsd -32 * SIZE(AO), %xmm0 movhps -30 * SIZE(AO), %xmm0 movsd -28 * SIZE(AO), %xmm4 movhps -26 * SIZE(AO), %xmm4 subps %xmm8, %xmm0 subps %xmm12, %xmm4 pshufd $0xff, %xmm0, %xmm3 pshufd $0xaa, %xmm0, %xmm2 pshufd $0x55, %xmm0, %xmm1 pshufd $0x00, %xmm0, %xmm0 pshufd $0xff, %xmm4, %xmm7 pshufd $0xaa, %xmm4, %xmm6 pshufd $0x55, %xmm4, %xmm5 pshufd $0x00, %xmm4, %xmm4 #endif #if defined(LN) || defined(LT) movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 
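/* M = 1 with the LN/LT layouts: the left-side solve reduces to a single scale
   step.  The lone diagonal element of A, broadcast into xmm15, multiplies both
   4-wide halves of the 8-column row (xmm0 above, xmm4 below); the use of mulps
   rather than a divide suggests the packed diagonal is stored in reciprocal
   form. */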
mulps %xmm15, %xmm4 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm3 movaps -28 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm7 movaps -24 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm3 movaps -20 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm7 movaps -16 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm3 movaps -12 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm7 movaps -8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm3 movaps -4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm7 movaps 4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulss %xmm4, %xmm15 subss %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm4, %xmm15 subss %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulss %xmm4, %xmm15 subss %xmm15, %xmm7 movaps 12 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm5, %xmm15 subss %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulss %xmm5, %xmm15 subss %xmm15, %xmm7 movaps 20 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulss %xmm6, %xmm15 subss %xmm15, %xmm7 movaps 28 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm7 #endif #ifdef RT movaps 28 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm7 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm7, %xmm15 subss %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulss %xmm7, %xmm15 subss %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulss %xmm7, %xmm15 subss %xmm15, %xmm4 movaps 24 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm7, %xmm15 subss %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm7, %xmm15 subss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm7, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm7, %xmm15 subss %xmm15, %xmm0 movaps 20 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulss %xmm6, %xmm15 subss %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulss %xmm6, %xmm15 subss %xmm15, %xmm4 movaps 16 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm6, %xmm15 
subss %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm6, %xmm15 subss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm6, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm6, %xmm15 subss %xmm15, %xmm0 movaps 12 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulss %xmm5, %xmm15 subss %xmm15, %xmm4 movaps 8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm5, %xmm15 subss %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm5, %xmm15 subss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm5, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm5, %xmm15 subss %xmm15, %xmm0 movaps 4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm4 movaps 0 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm4, %xmm15 subss %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm4, %xmm15 subss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm4, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm4, %xmm15 subss %xmm15, %xmm0 movaps -8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm0 movaps -16 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm0 movaps -24 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm0 movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif leaq (LDC, LDC, 2), %rax #if defined(LN) || defined(LT) movaps %xmm0, -32 * SIZE(BO) movaps %xmm4, -28 * SIZE(BO) pshufd $0xff, %xmm0, %xmm3 pshufd $0xaa, %xmm0, %xmm2 pshufd $0x55, %xmm0, %xmm1 pshufd $0x00, %xmm0, %xmm0 pshufd $0xff, %xmm4, %xmm7 pshufd $0xaa, %xmm4, %xmm6 pshufd $0x55, %xmm4, %xmm5 pshufd $0x00, %xmm4, %xmm4 #else unpcklps %xmm1, %xmm0 unpcklps %xmm3, %xmm2 unpcklps %xmm5, %xmm4 unpcklps %xmm7, %xmm6 movlps %xmm0, -32 * SIZE(AO) movlps %xmm2, -30 * SIZE(AO) movlps %xmm4, -28 * SIZE(AO) movlps %xmm6, -26 * SIZE(AO) #endif movss %xmm0, (CO1) movss %xmm1, (CO1, LDC, 1) movss %xmm2, (CO1, LDC, 2) movss %xmm3, (CO1, %rax, 1) movss %xmm4, (CO2) movss %xmm5, (CO2, LDC, 1) movss %xmm6, (CO2, LDC, 2) movss %xmm7, (CO2, %rax, 1) #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 8), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $8, KK #endif #ifdef RT subq $8, KK #endif subq $1, J BRANCH jg .L10 ALIGN_4 .L40: testq $4, N jle .L70 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 2), CO2 #ifndef RT leaq (C, LDC, 4), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I NOBRANCH jle .L50 ALIGN_4 .L41: #ifdef LN 
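/* .L41: 4-row sub-blocks of the 4-column panel.  For the LN layout the block
   below rewinds AORIG by 4 * K elements before the packed A pointer (AO) is
   rebuilt from it. */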
movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht2 4 * SIZE(CO1, LDC, 1) xorps %xmm10, %xmm10 prefetcht2 4 * SIZE(CO2) xorps %xmm11, %xmm11 prefetcht2 4 * SIZE(CO2, LDC, 1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L45 ALIGN_3 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L42 ALIGN_3 .L45: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: addps %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L46 ALIGN_3 .L48: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif addps %xmm1, %xmm8 addps %xmm2, %xmm9 addps %xmm3, %xmm10 addps %xmm4, %xmm11 #if defined(LN) || defined(LT) movaps %xmm8, %xmm4 shufps $0x88, %xmm9, %xmm8 movaps %xmm10, %xmm5 shufps $0x88, %xmm11, %xmm10 shufps $0xdd, %xmm11, %xmm4 shufps $0xdd, %xmm9, %xmm5 movaps %xmm8, %xmm6 shufps $0x88, %xmm10, %xmm8 shufps $0xdd, %xmm6, %xmm10 movaps %xmm4, %xmm9 movaps %xmm5, %xmm11 shufps $0x22, %xmm5, %xmm9 shufps $0x77, %xmm4, %xmm11 movaps -32 * SIZE(BO), %xmm0 movaps -28 * SIZE(BO), %xmm1 movaps -24 * SIZE(BO), %xmm2 movaps -20 * SIZE(BO), %xmm3 #else movaps %xmm9, %xmm4 shufps $0xd8, %xmm8, %xmm9 shufps $0xd8, %xmm11, %xmm8 shufps $0xd8, %xmm10, %xmm11 shufps $0xd8, %xmm4, %xmm10 movaps %xmm8, %xmm4 shufps $0xd8, %xmm10, %xmm8 shufps $0xd8, %xmm4, %xmm10 movaps %xmm9, %xmm5 shufps $0xd8, %xmm11, %xmm9 shufps $0xd8, %xmm5, %xmm11 movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps 
-24 * SIZE(AO), %xmm2 movaps -20 * SIZE(AO), %xmm3 #endif subps %xmm8, %xmm0 subps %xmm9, %xmm1 subps %xmm10, %xmm2 subps %xmm11, %xmm3 #ifdef LN movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm0 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm3 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm3 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm3 movaps -28 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm3 movaps -24 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 movaps -20 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 #endif #ifdef RT movaps -20 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm0 movaps -24 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 movaps -28 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm0, -32 * SIZE(BO) movaps %xmm1, -28 * SIZE(BO) movaps %xmm2, -24 * SIZE(BO) movaps %xmm3, -20 * SIZE(BO) movaps %xmm0, %xmm8 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm8, %xmm1 movaps %xmm2, %xmm9 shufps $0x88, %xmm3, %xmm2 shufps $0xdd, %xmm9, %xmm3 movaps %xmm0, %xmm8 shufps $0x88, %xmm2, %xmm0 movaps %xmm1, %xmm9 shufps $0x22, %xmm3, %xmm1 shufps $0xdd, %xmm2, %xmm8 movaps %xmm8, 
%xmm2 shufps $0x77, %xmm3, %xmm9 movaps %xmm9, %xmm3 #else movaps %xmm0, -32 * SIZE(AO) movaps %xmm1, -28 * SIZE(AO) movaps %xmm2, -24 * SIZE(AO) movaps %xmm3, -20 * SIZE(AO) #endif leaq (LDC, LDC, 2), %rax movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 2 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO2) movhps %xmm2, 2 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO2, LDC, 1) movhps %xmm3, 2 * SIZE(CO2, LDC, 1) #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L41 ALIGN_4 .L50: testq $2, M BRANCH jle .L60 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movaps -32 * SIZE(BO), %xmm5 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_3 .L52: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -24 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -26 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -16 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -24 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $ -8 * SIZE, AO subq $1, %rax BRANCH jg .L52 ALIGN_3 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_3 .L56: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_3 .L58: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif addps %xmm1, %xmm8 addps %xmm2, %xmm9 #if defined(LN) || defined(LT) movaps %xmm8, %xmm4 shufps $0x88, %xmm9, %xmm8 shufps $0xdd, %xmm9, %xmm4 movaps -32 * SIZE(BO), %xmm0 movaps -28 * SIZE(BO), %xmm1 subps %xmm8, %xmm0 subps %xmm4, %xmm1 #else movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm2 subps %xmm8, %xmm0 subps %xmm9, %xmm2 movhlps %xmm0, %xmm1 movhlps %xmm2, %xmm3 #endif #ifdef LN movaps -32 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xff, 
%xmm8, %xmm15 mulps %xmm15, %xmm1 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm3 movaps -28 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm3 movaps -24 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 movaps -20 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 #endif #ifdef RT movaps -20 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm0 movaps -24 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 movaps -28 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif leaq (LDC, LDC, 2), %rax #if defined(LN) || defined(LT) movaps %xmm0, -32 * SIZE(BO) movaps %xmm1, -28 * SIZE(BO) movaps %xmm0, %xmm4 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm4 movsd %xmm0, (CO1) movhps %xmm0, (CO1, LDC, 1) movsd %xmm4, (CO2) movhps %xmm4, (CO2, LDC, 1) #else movlhps %xmm1, %xmm0 movlhps %xmm3, %xmm2 movaps %xmm0, -32 * SIZE(AO) movaps %xmm2, -28 * SIZE(AO) movsd %xmm0, (CO1) movsd %xmm1, (CO1, LDC, 1) movsd %xmm2, (CO2) movsd %xmm3, (CO2, LDC, 1) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L60: testq $1, M BRANCH jle .L69 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #else movq B, BO #endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_3 .L62: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x55, %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm2, %xmm9 movaps -28 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -24 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x55, %xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm2, %xmm9 movaps -20 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 subq $-16 * SIZE, BO subq $ -4 * SIZE, AO subq $1, %rax BRANCH jg .L62 addps %xmm9, %xmm8 ALIGN_3 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_3 .L66: pshufd $0x00, %xmm0, %xmm1 
movss -31 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_3 .L68: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #endif addps %xmm2, %xmm8 #if defined(LN) || defined(LT) movaps -32 * SIZE(BO), %xmm0 subps %xmm8, %xmm0 #else movsd -32 * SIZE(AO), %xmm0 movhps -30 * SIZE(AO), %xmm0 subps %xmm8, %xmm0 pshufd $0xff, %xmm0, %xmm3 pshufd $0xaa, %xmm0, %xmm2 pshufd $0x55, %xmm0, %xmm1 pshufd $0x00, %xmm0, %xmm0 #endif #if defined(LN) || defined(LT) movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm3 movaps -28 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm3 movaps -24 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm3 movaps -20 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm3 #endif #ifdef RT movaps -20 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm0 movaps -24 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm0 movaps -28 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm0 movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm0, -32 * SIZE(BO) pshufd $0xff, %xmm0, %xmm3 pshufd $0xaa, %xmm0, %xmm2 pshufd $0x55, %xmm0, %xmm1 pshufd $0x00, %xmm0, %xmm0 #else unpcklps %xmm1, %xmm0 unpcklps %xmm3, %xmm2 movlps %xmm0, -32 * SIZE(AO) movlps %xmm2, -30 * SIZE(AO) #endif movss %xmm0, (CO1) movss %xmm1, (CO1, LDC, 1) movss %xmm2, (CO2) movss %xmm3, (CO2, LDC, 1) #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L69: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif ALIGN_4 .L70: testq $2, N jle .L100 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq 
%rax, KK #endif movq M, I sarq $2, I NOBRANCH jle .L80 ALIGN_4 .L71: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd -32 * SIZE(BO), %xmm3 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht2 4 * SIZE(CO2) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L75 ALIGN_3 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -30 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -28 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -24 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -26 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -20 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -24 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax BRANCH jg .L72 ALIGN_3 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -30 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L76 ALIGN_3 .L78: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif addps %xmm1, %xmm8 addps %xmm2, %xmm9 #if defined(LN) || defined(LT) movaps %xmm8, %xmm4 unpcklps %xmm9, %xmm8 unpckhps %xmm9, %xmm4 movaps -32 * SIZE(BO), %xmm0 movaps -28 * SIZE(BO), %xmm2 subps %xmm8, %xmm0 subps %xmm4, %xmm2 movhlps %xmm0, %xmm1 movhlps %xmm2, %xmm3 #else movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm2 subps %xmm8, %xmm0 subps %xmm9, %xmm2 #endif #ifdef LN movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm0 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm3 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps 
%xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm3 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm2 #endif #ifdef RT movaps -32 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movlps %xmm0, -32 * SIZE(BO) movlps %xmm1, -30 * SIZE(BO) movlps %xmm2, -28 * SIZE(BO) movlps %xmm3, -26 * SIZE(BO) unpcklps %xmm1, %xmm0 unpcklps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(CO1) movlps %xmm2, 2 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO2) movhps %xmm2, 2 * SIZE(CO2) #else movaps %xmm0, -32 * SIZE(AO) movaps %xmm2, -28 * SIZE(AO) movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO2) movhps %xmm2, 2 * SIZE(CO2) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L71 ALIGN_4 .L80: testq $2, M BRANCH jle .L90 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd -32 * SIZE(BO), %xmm5 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L85 ALIGN_3 .L82: addps %xmm1, %xmm8 movsd -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movsd -30 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movsd -28 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -26 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movsd -26 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -24 * SIZE(AO), %xmm0 subq $-8 * SIZE, BO subq $-8 * SIZE, AO subq $1, %rax BRANCH jg .L82 ALIGN_3 .L85: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L88 ALIGN_3 .L86: addps %xmm1, %xmm8 movsd -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L86 ALIGN_3 .L88: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif addps %xmm1, %xmm8 #if defined(LN) || defined(LT) pshufd $0xd8, %xmm8, %xmm8 movaps -32 * SIZE(BO), %xmm0 #else movaps -32 * SIZE(AO), %xmm0 #endif subps %xmm8, %xmm0 movhlps %xmm0, %xmm1 #ifdef LN movaps -32 * SIZE(AO), %xmm8 pshufd $0xff, 
%xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 #endif #ifdef RT movaps -32 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movlps %xmm0, -32 * SIZE(BO) movlps %xmm1, -30 * SIZE(BO) unpcklps %xmm1, %xmm0 movlps %xmm0, (CO1) movhps %xmm0, (CO2) #else movlps %xmm0, -32 * SIZE(AO) movlps %xmm1, -30 * SIZE(AO) movsd %xmm0, (CO1) movsd %xmm1, (CO2) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L90: testq $1, M BRANCH jle .L99 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L95 ALIGN_3 .L92: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movsd -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x55, %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm2, %xmm9 movsd -30 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movsd -28 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x55, %xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm2, %xmm9 movsd -26 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L92 addps %xmm9, %xmm8 ALIGN_3 .L95: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L98 ALIGN_3 .L96: pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movsd -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L96 ALIGN_3 .L98: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif addps %xmm2, %xmm8 #if defined(LN) || defined(LT) movsd -32 * SIZE(BO), %xmm0 subps %xmm8, %xmm0 #else movsd -32 * SIZE(AO), %xmm0 subps %xmm8, %xmm0 #endif pshufd $0x55, %xmm0, %xmm1 pshufd $0x00, %xmm0, %xmm0 #if defined(LN) || defined(LT) movss -32 * SIZE(AO), %xmm8 mulss %xmm8, %xmm0 mulss %xmm8, %xmm1 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm1 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm1 #endif #ifdef RT movaps -32 * SIZE(BO), %xmm8 pshufd $0xff, 
%xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movss %xmm0, -32 * SIZE(BO) movss %xmm1, -31 * SIZE(BO) #else movss %xmm0, -32 * SIZE(AO) movss %xmm1, -31 * SIZE(AO) #endif movss %xmm0, (CO1) movss %xmm1, (CO2) #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L99: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L100: testq $1, N jle .L999 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I NOBRANCH jle .L110 ALIGN_4 .L101: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 movsd -32 * SIZE(BO), %xmm3 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L105 ALIGN_3 .L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -31 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -30 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -24 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -29 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -20 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -28 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $ -4 * SIZE, BO subq $1, %rax BRANCH jg .L102 ALIGN_3 .L105: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L108 ALIGN_3 .L106: addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -31 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L106 ALIGN_3 .L108: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #endif addps %xmm1, %xmm8 #if defined(LN) || defined(LT) movsd -32 * SIZE(BO), %xmm0 movhps -30 * SIZE(BO), %xmm0 subps %xmm8, %xmm0 pshufd $0xff, %xmm0, %xmm3 pshufd $0xaa, %xmm0, %xmm2 pshufd $0x55, %xmm0, %xmm1 #else movaps -32 * SIZE(AO), %xmm0 subps %xmm8, %xmm0 #endif #ifdef LN movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm0 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 
mulss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm0 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm0 movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm3 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm3 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm3 movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm3 #endif #if defined(RN) || defined(RT) movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm1, %xmm0 unpcklps %xmm3, %xmm2 movlps %xmm0, -32 * SIZE(BO) movlps %xmm2, -30 * SIZE(BO) movlps %xmm0, 0 * SIZE(CO1) movlps %xmm2, 2 * SIZE(CO1) #else movaps %xmm0, -32 * SIZE(AO) movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L101 ALIGN_4 .L110: testq $2, M BRANCH jle .L120 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L115 ALIGN_3 .L112: addps %xmm1, %xmm8 movss -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movss -31 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movss -30 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -26 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movss -29 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -24 * SIZE(AO), %xmm0 subq $-4 * SIZE, BO subq $-8 * SIZE, AO subq $1, %rax BRANCH jg .L112 ALIGN_3 .L115: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L118 ALIGN_3 .L116: addps %xmm1, %xmm8 movss -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L116 ALIGN_3 .L118: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif addps %xmm1, %xmm8 #if defined(LN) || defined(LT) movsd -32 * SIZE(BO), %xmm0 subps %xmm8, %xmm0 pshufd $0x55, 
%xmm0, %xmm1 #else movsd -32 * SIZE(AO), %xmm0 subps %xmm8, %xmm0 #endif #ifdef LN movaps -32 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm1 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm1 #endif #if defined(RN) || defined(RT) movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm1, %xmm0 movlps %xmm0, -32 * SIZE(BO) movlps %xmm0, 0 * SIZE(CO1) #else movlps %xmm0, -32 * SIZE(AO) movlps %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L120: testq $1, M BRANCH jle .L129 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif xorps %xmm2, %xmm2 movss -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L125 ALIGN_3 .L122: addss %xmm2, %xmm8 movss -32 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -31 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 movss -31 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -30 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 movss -30 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -29 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 movss -29 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -28 * SIZE(AO), %xmm0 subq $-4 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L122 ALIGN_3 .L125: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L128 ALIGN_3 .L126: addss %xmm2, %xmm8 movss -32 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -31 * SIZE(AO), %xmm0 addq $1 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L126 ALIGN_3 .L128: #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif addss %xmm2, %xmm8 #if defined(LN) || defined(LT) movss -32 * SIZE(BO), %xmm0 subss %xmm8, %xmm0 #else movss -32 * SIZE(AO), %xmm0 subss %xmm8, %xmm0 #endif #if defined(LN) || defined(LT) movss -32 * SIZE(AO), %xmm8 #endif #if defined(RN) || defined(RT) movaps -32 * SIZE(BO), %xmm8 #endif mulss %xmm8, %xmm0 #ifdef LN subq $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm0, -32 * SIZE(BO) #else movss %xmm0, -32 * SIZE(AO) #endif movss %xmm0, (CO1) #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L129: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 
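/* .L999 epilogue: reload the callee-saved registers spilled in the prologue
   (plus rdi/rsi and xmm6-xmm15 under WINDOWS_ABI), release the stack frame
   and return. */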
movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_LT_8x4_sse.S000066400000000000000000003414451313527062700220220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %r13 #define BO %r14 #define CO1 %r15 #define CO2 %rbp #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define ALPHA 0(%rsp) #define OFFSET 16(%rsp) #define KK 24(%rsp) #define KKK 32(%rsp) #define AORIG 40(%rsp) #define BORIG 48(%rsp) #define BUFFER 128(%rsp) #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define movsd movlps #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifndef PREFETCH #define PREFETCH prefetcht0 #endif #ifndef PREFETCHW #define PREFETCHW prefetcht0 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp EMMS movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 movaps %xmm3, %xmm0 #else movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 #endif movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movsd %xmm4, OFFSET movsd %xmm4, KK leaq (, LDC, SIZE), LDC #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $2, J # j = (n >> 2) jle .L50 .L01: /* Copying to Sub Buffer */ #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $2 + BASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L03 ALIGN_4 .L02: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 movaps 8 * SIZE(B), %xmm11 movaps 12 * SIZE(B), %xmm15 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) pshufd $0x00, %xmm11, %xmm8 pshufd $0x55, %xmm11, %xmm9 pshufd $0xaa, %xmm11, %xmm10 pshufd $0xff, %xmm11, %xmm11 pshufd $0x00, %xmm15, %xmm12 
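/* Packing loop (.L02), continued: each group of four values read from B
   is broadcast across a full SSE register with pshufd and stored into the
   BUFFER area (BO), so that the compute loops below can fetch B with
   aligned 4-wide movaps loads. */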
pshufd $0x55, %xmm15, %xmm13 pshufd $0xaa, %xmm15, %xmm14 pshufd $0xff, %xmm15, %xmm15 movaps %xmm8, 32 * SIZE(BO) movaps %xmm9, 36 * SIZE(BO) movaps %xmm10, 40 * SIZE(BO) movaps %xmm11, 44 * SIZE(BO) movaps %xmm12, 48 * SIZE(BO) movaps %xmm13, 52 * SIZE(BO) movaps %xmm14, 56 * SIZE(BO) movaps %xmm15, 60 * SIZE(BO) addq $16 * SIZE, B addq $64 * SIZE, BO decq %rax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movaps 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) addq $ 4 * SIZE, B addq $16 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L10: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 4), C #endif movq M, I sarq $3, I # i = (m >> 3) jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $3 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(BO), %xmm9 movaps 4 * SIZE(BO), %xmm11 movaps 8 * SIZE(BO), %xmm13 movaps 16 * SIZE(BO), %xmm15 movaps 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movaps 4 * SIZE(AO), %xmm10 pxor %xmm1, %xmm1 movaps 8 * SIZE(AO), %xmm12 pxor %xmm2, %xmm2 movaps 12 * SIZE(AO), %xmm14 pxor %xmm3, %xmm3 PREFETCHW 7 * SIZE(CO1) pxor %xmm4, %xmm4 PREFETCHW 7 * SIZE(CO2) pxor %xmm5, %xmm5 PREFETCHW 7 * SIZE(CO1, LDC, 2) pxor %xmm6, %xmm6 PREFETCHW 7 * SIZE(CO2, LDC, 2) pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L15 ALIGN_4 .L12: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 4 * SIZE(BO), %xmm11 mulps %xmm8, %xmm13 mulps 12 * SIZE(BO), %xmm8 addps %xmm13, %xmm2 movaps 8 * SIZE(BO), %xmm13 addps %xmm8, %xmm3 movaps 16 * SIZE(AO), %xmm8 mulps %xmm10, %xmm9 addps %xmm9, %xmm4 movaps 32 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 addps %xmm11, %xmm5 movaps 20 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 mulps 12 * SIZE(BO), %xmm10 addps %xmm13, %xmm6 movaps 24 * SIZE(BO), %xmm13 addps %xmm10, %xmm7 movaps 20 * SIZE(AO), %xmm10 mulps %xmm12, %xmm15 addps %xmm15, %xmm0 movaps 16 * SIZE(BO), %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm1 movaps 20 * SIZE(BO), %xmm11 mulps %xmm12, %xmm13 mulps 28 * SIZE(BO), %xmm12 addps %xmm13, %xmm2 movaps 24 * SIZE(BO), %xmm13 addps %xmm12, %xmm3 movaps 24 * SIZE(AO), %xmm12 mulps %xmm14, %xmm15 addps %xmm15, %xmm4 movaps 48 * SIZE(BO), %xmm15 mulps %xmm14, %xmm11 addps %xmm11, %xmm5 movaps 36 * SIZE(BO), %xmm11 mulps %xmm14, %xmm13 mulps 28 * SIZE(BO), %xmm14 addps %xmm13, %xmm6 movaps 40 * SIZE(BO), %xmm13 addps %xmm14, %xmm7 movaps 28 * SIZE(AO), %xmm14 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 32 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 36 * SIZE(BO), %xmm11 mulps %xmm8, %xmm13 mulps 44 * SIZE(BO), %xmm8 addps %xmm13, %xmm2 movaps 40 * SIZE(BO), %xmm13 addps %xmm8, %xmm3 movaps 32 * SIZE(AO), %xmm8 mulps %xmm10, %xmm9 addps %xmm9, %xmm4 movaps 64 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 addps %xmm11, %xmm5 
movaps 52 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 mulps 44 * SIZE(BO), %xmm10 addps %xmm13, %xmm6 movaps 56 * SIZE(BO), %xmm13 addps %xmm10, %xmm7 movaps 36 * SIZE(AO), %xmm10 mulps %xmm12, %xmm15 addps %xmm15, %xmm0 movaps 48 * SIZE(BO), %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm1 movaps 52 * SIZE(BO), %xmm11 mulps %xmm12, %xmm13 mulps 60 * SIZE(BO), %xmm12 addps %xmm13, %xmm2 movaps 56 * SIZE(BO), %xmm13 addps %xmm12, %xmm3 movaps 40 * SIZE(AO), %xmm12 mulps %xmm14, %xmm15 addps %xmm15, %xmm4 movaps 80 * SIZE(BO), %xmm15 mulps %xmm14, %xmm11 addps %xmm11, %xmm5 movaps 68 * SIZE(BO), %xmm11 mulps %xmm14, %xmm13 mulps 60 * SIZE(BO), %xmm14 addps %xmm13, %xmm6 movaps 72 * SIZE(BO), %xmm13 addps %xmm14, %xmm7 movaps 44 * SIZE(AO), %xmm14 addq $32 * SIZE, AO addq $64 * SIZE, BO decq %rax jg .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 8 * SIZE(AO), %xmm8 mulps %xmm10, %xmm9 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm5 movaps 8 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 mulps 12 * SIZE(BO), %xmm10 addps %xmm9, %xmm6 movaps 16 * SIZE(BO), %xmm9 addps %xmm10, %xmm7 movaps 12 * SIZE(AO), %xmm10 addq $8 * SIZE, AO addq $16 * SIZE, BO decq %rax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $8, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $2 + BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm8 unpcklps %xmm2, %xmm0 unpckhps %xmm2, %xmm8 movaps %xmm1, %xmm14 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm14 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movaps %xmm8, %xmm3 unpcklps %xmm14, %xmm8 unpckhps %xmm14, %xmm3 movaps %xmm4, %xmm9 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm9 movaps %xmm5, %xmm14 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm14 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm9, %xmm7 unpcklps %xmm14, %xmm9 unpckhps %xmm14, %xmm7 movaps 0 * SIZE(B), %xmm1 movaps 4 * SIZE(B), %xmm5 movaps 8 * SIZE(B), %xmm10 movaps 12 * SIZE(B), %xmm11 movaps 16 * SIZE(B), %xmm12 movaps 20 * SIZE(B), %xmm13 movaps 24 * SIZE(B), %xmm14 movaps 28 * SIZE(B), %xmm15 subps %xmm0, %xmm1 subps %xmm2, %xmm5 subps %xmm8, %xmm10 subps %xmm3, %xmm11 subps %xmm4, %xmm12 subps %xmm6, %xmm13 subps %xmm9, %xmm14 subps %xmm7, %xmm15 #else movaps 0 * SIZE(AO), %xmm8 movaps 4 * SIZE(AO), %xmm9 movaps 8 * SIZE(AO), %xmm10 movaps 12 * SIZE(AO), %xmm11 movaps 16 * SIZE(AO), %xmm12 movaps 20 * SIZE(AO), %xmm13 movaps 24 * SIZE(AO), %xmm14 movaps 28 * SIZE(AO), %xmm15 subps %xmm0, %xmm8 subps %xmm4, %xmm9 subps %xmm1, %xmm10 subps %xmm5, %xmm11 subps %xmm2, %xmm12 subps %xmm6, %xmm13 subps %xmm3, %xmm14 subps %xmm7, %xmm15 #endif #ifdef LN movaps 60 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm15 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm14 pshufd $0x55, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm12 movaps 56 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 
mulps %xmm15, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm1 movaps 52 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm14 pshufd $0x55, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm12 movaps 48 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm1 movaps 44 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm12 movaps 40 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm1 movaps 36 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm12 movaps 32 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm1 movaps 24 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm1 movaps 16 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm1 movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm11 movaps 4 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm15 movaps 8 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm15 movaps 16 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm11 movaps 20 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm13 
pshufd $0xaa, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm15 movaps 24 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 movaps 28 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm15 movaps 36 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm12 pshufd $0x55, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm15 movaps 44 * SIZE(AO), %xmm7 pshufd $0x55, %xmm7, %xmm8 mulps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm15 movaps 52 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm14 pshufd $0xff, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm15 movaps 60 * SIZE(AO), %xmm7 pshufd $0xff, %xmm7, %xmm8 mulps %xmm8, %xmm15 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 mulps %xmm2, %xmm9 pshufd $0x55, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm10 pshufd $0x55, %xmm0, %xmm2 mulps %xmm9, %xmm2 subps %xmm2, %xmm11 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm12 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm9, %xmm2 subps %xmm2, %xmm13 pshufd $0xff, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm14 pshufd $0xff, %xmm0, %xmm2 mulps %xmm9, %xmm2 subps %xmm2, %xmm15 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulps %xmm2, %xmm10 mulps %xmm2, %xmm11 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm12 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm11, %xmm2 subps %xmm2, %xmm13 pshufd $0xff, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm14 pshufd $0xff, %xmm0, %xmm2 mulps %xmm11, %xmm2 subps %xmm2, %xmm15 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm2, %xmm12 mulps %xmm2, %xmm13 pshufd $0xff, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm14 pshufd $0xff, %xmm0, %xmm2 mulps %xmm13, %xmm2 subps %xmm2, %xmm15 movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm14 mulps %xmm2, %xmm15 #endif #ifdef RT movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm14 mulps %xmm2, %xmm15 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm12 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm15, %xmm2 subps %xmm2, %xmm13 pshufd $0x55, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm10 pshufd $0x55, %xmm0, %xmm2 mulps %xmm15, %xmm2 subps %xmm2, %xmm11 pshufd $0x00, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm8 pshufd $0x00, %xmm0, %xmm2 mulps %xmm15, %xmm2 subps %xmm2, %xmm9 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm2, %xmm12 mulps %xmm2, %xmm13 pshufd $0x55, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm10 pshufd $0x55, %xmm0, %xmm2 mulps %xmm13, %xmm2 subps %xmm2, %xmm11 pshufd $0x00, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm8 pshufd $0x00, %xmm0, %xmm2 mulps %xmm13, %xmm2 subps %xmm2, %xmm9 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulps %xmm2, %xmm10 mulps %xmm2, %xmm11 pshufd $0x00, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm8 pshufd $0x00, %xmm0, %xmm2 mulps %xmm11, %xmm2 subps %xmm2, %xmm9 movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, 
%xmm8 mulps %xmm2, %xmm9 #endif #ifdef LN subq $8 * SIZE, CO1 subq $8 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm1, 0 * SIZE(B) movaps %xmm5, 4 * SIZE(B) movaps %xmm10, 8 * SIZE(B) movaps %xmm11, 12 * SIZE(B) movaps %xmm12, 16 * SIZE(B) movaps %xmm13, 20 * SIZE(B) movaps %xmm14, 24 * SIZE(B) movaps %xmm15, 28 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 pshufd $0xaa, %xmm1, %xmm4 pshufd $0xff, %xmm1, %xmm6 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) movaps %xmm4, 8 * SIZE(BO) movaps %xmm6, 12 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 pshufd $0x55, %xmm5, %xmm3 pshufd $0xaa, %xmm5, %xmm4 pshufd $0xff, %xmm5, %xmm6 movaps %xmm2, 16 * SIZE(BO) movaps %xmm3, 20 * SIZE(BO) movaps %xmm4, 24 * SIZE(BO) movaps %xmm6, 28 * SIZE(BO) pshufd $0x00, %xmm10, %xmm2 pshufd $0x55, %xmm10, %xmm3 pshufd $0xaa, %xmm10, %xmm4 pshufd $0xff, %xmm10, %xmm6 movaps %xmm2, 32 * SIZE(BO) movaps %xmm3, 36 * SIZE(BO) movaps %xmm4, 40 * SIZE(BO) movaps %xmm6, 44 * SIZE(BO) pshufd $0x00, %xmm11, %xmm2 pshufd $0x55, %xmm11, %xmm3 pshufd $0xaa, %xmm11, %xmm4 pshufd $0xff, %xmm11, %xmm6 movaps %xmm2, 48 * SIZE(BO) movaps %xmm3, 52 * SIZE(BO) movaps %xmm4, 56 * SIZE(BO) movaps %xmm6, 60 * SIZE(BO) pshufd $0x00, %xmm12, %xmm2 pshufd $0x55, %xmm12, %xmm3 pshufd $0xaa, %xmm12, %xmm4 pshufd $0xff, %xmm12, %xmm6 movaps %xmm2, 64 * SIZE(BO) movaps %xmm3, 68 * SIZE(BO) movaps %xmm4, 72 * SIZE(BO) movaps %xmm6, 76 * SIZE(BO) pshufd $0x00, %xmm13, %xmm2 pshufd $0x55, %xmm13, %xmm3 pshufd $0xaa, %xmm13, %xmm4 pshufd $0xff, %xmm13, %xmm6 movaps %xmm2, 80 * SIZE(BO) movaps %xmm3, 84 * SIZE(BO) movaps %xmm4, 88 * SIZE(BO) movaps %xmm6, 92 * SIZE(BO) pshufd $0x00, %xmm14, %xmm2 pshufd $0x55, %xmm14, %xmm3 pshufd $0xaa, %xmm14, %xmm4 pshufd $0xff, %xmm14, %xmm6 movaps %xmm2, 96 * SIZE(BO) movaps %xmm3, 100 * SIZE(BO) movaps %xmm4, 104 * SIZE(BO) movaps %xmm6, 108 * SIZE(BO) pshufd $0x00, %xmm15, %xmm2 pshufd $0x55, %xmm15, %xmm3 pshufd $0xaa, %xmm15, %xmm4 pshufd $0xff, %xmm15, %xmm6 movaps %xmm2, 112 * SIZE(BO) movaps %xmm3, 116 * SIZE(BO) movaps %xmm4, 120 * SIZE(BO) movaps %xmm6, 124 * SIZE(BO) #else movaps %xmm8, 0 * SIZE(AO) movaps %xmm9, 4 * SIZE(AO) movaps %xmm10, 8 * SIZE(AO) movaps %xmm11, 12 * SIZE(AO) movaps %xmm12, 16 * SIZE(AO) movaps %xmm13, 20 * SIZE(AO) movaps %xmm14, 24 * SIZE(AO) movaps %xmm15, 28 * SIZE(AO) #endif #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm10, %xmm1 unpckhps %xmm10, %xmm0 movaps %xmm5, %xmm7 unpcklps %xmm11, %xmm5 unpckhps %xmm11, %xmm7 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 movaps %xmm0, %xmm11 unpcklps %xmm7, %xmm0 unpckhps %xmm7, %xmm11 movaps %xmm12, %xmm2 unpcklps %xmm14, %xmm12 unpckhps %xmm14, %xmm2 movaps %xmm13, %xmm7 unpcklps %xmm15, %xmm13 unpckhps %xmm15, %xmm7 movaps %xmm12, %xmm14 unpcklps %xmm13, %xmm12 unpckhps %xmm13, %xmm14 movaps %xmm2, %xmm15 unpcklps %xmm7, %xmm2 unpckhps %xmm7, %xmm15 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) movlps %xmm14, 4 * SIZE(CO2) movhps %xmm14, 6 * SIZE(CO2) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movhps %xmm0, 2 * SIZE(CO1, LDC, 2) movlps %xmm2, 4 * SIZE(CO1, LDC, 2) movhps %xmm2, 6 * SIZE(CO1, LDC, 2) movlps %xmm11, 0 * SIZE(CO2, LDC, 2) movhps %xmm11, 2 * SIZE(CO2, LDC, 2) movlps %xmm15, 4 * SIZE(CO2, LDC, 2) movhps %xmm15, 6 * SIZE(CO2, LDC, 2) #else movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm9, 4 * SIZE(CO1) movhps 
%xmm9, 6 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) movlps %xmm11, 4 * SIZE(CO2) movhps %xmm11, 6 * SIZE(CO2) movlps %xmm12, 0 * SIZE(CO1, LDC, 2) movhps %xmm12, 2 * SIZE(CO1, LDC, 2) movlps %xmm13, 4 * SIZE(CO1, LDC, 2) movhps %xmm13, 6 * SIZE(CO1, LDC, 2) movlps %xmm14, 0 * SIZE(CO2, LDC, 2) movhps %xmm14, 2 * SIZE(CO2, LDC, 2) movlps %xmm15, 4 * SIZE(CO2, LDC, 2) movhps %xmm15, 6 * SIZE(CO2, LDC, 2) #endif #ifndef LN addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 8), AO #ifdef LT addq $32 * SIZE, B #endif #endif #ifdef LN subq $8, KK movq BORIG, B #endif #ifdef LT addq $8, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $3 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L11 ALIGN_4 .L20: testq $4, M je .L30 #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps 8 * SIZE(AO), %xmm8 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 mulps 44 * SIZE(BO), %xmm8 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm8, %xmm3 movaps 12 * SIZE(AO), %xmm8 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 mulps 60 * SIZE(BO), %xmm8 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm8, %xmm3 movaps 32 * SIZE(AO), %xmm8 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movaps 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movaps 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 mulps 76 * SIZE(BO), %xmm10 addps %xmm9, %xmm2 movaps 128 * SIZE(BO), %xmm9 addps %xmm10, %xmm3 movaps 20 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movaps 84 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movaps 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 mulps 92 * SIZE(BO), %xmm10 addps %xmm11, %xmm2 movaps 144 * SIZE(BO), %xmm11 addps %xmm10, %xmm3 movaps 24 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movaps 104 * SIZE(BO), %xmm13 mulps 
%xmm10, %xmm13 mulps 108 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 160 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps 28 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movaps 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 mulps 124 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 176 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 48 * SIZE(AO), %xmm10 addq $ 32 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 16 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 4 * SIZE(AO), %xmm8 addq $ 4 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $2 + BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm8 unpcklps %xmm2, %xmm0 unpckhps %xmm2, %xmm8 movaps %xmm1, %xmm14 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm14 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movaps %xmm8, %xmm3 unpcklps %xmm14, %xmm8 unpckhps %xmm14, %xmm3 movaps 0 * SIZE(B), %xmm1 movaps 4 * SIZE(B), %xmm5 movaps 8 * SIZE(B), %xmm10 movaps 12 * SIZE(B), %xmm11 subps %xmm0, %xmm1 subps %xmm2, %xmm5 subps %xmm8, %xmm10 subps %xmm3, %xmm11 #else movaps 0 * SIZE(AO), %xmm8 movaps 4 * SIZE(AO), %xmm10 movaps 8 * SIZE(AO), %xmm12 movaps 12 * SIZE(AO), %xmm14 subps %xmm0, %xmm8 subps %xmm1, %xmm10 subps %xmm2, %xmm12 subps %xmm3, %xmm14 #endif #ifdef LN movaps 12 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm1 movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm11 movaps 4 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm11 movaps 8 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 pshufd $0x55, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm8, %xmm2 
subps %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm14 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulps %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm14 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm14 movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm14 #endif #ifdef RT movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm14 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm12 pshufd $0x55, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm8 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm2, %xmm12 pshufd $0x55, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm8 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulps %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm8 movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm1, 0 * SIZE(B) movaps %xmm5, 4 * SIZE(B) movaps %xmm10, 8 * SIZE(B) movaps %xmm11, 12 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 pshufd $0xaa, %xmm1, %xmm4 pshufd $0xff, %xmm1, %xmm6 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) movaps %xmm4, 8 * SIZE(BO) movaps %xmm6, 12 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 pshufd $0x55, %xmm5, %xmm3 pshufd $0xaa, %xmm5, %xmm4 pshufd $0xff, %xmm5, %xmm6 movaps %xmm2, 16 * SIZE(BO) movaps %xmm3, 20 * SIZE(BO) movaps %xmm4, 24 * SIZE(BO) movaps %xmm6, 28 * SIZE(BO) pshufd $0x00, %xmm10, %xmm2 pshufd $0x55, %xmm10, %xmm3 pshufd $0xaa, %xmm10, %xmm4 pshufd $0xff, %xmm10, %xmm6 movaps %xmm2, 32 * SIZE(BO) movaps %xmm3, 36 * SIZE(BO) movaps %xmm4, 40 * SIZE(BO) movaps %xmm6, 44 * SIZE(BO) pshufd $0x00, %xmm11, %xmm2 pshufd $0x55, %xmm11, %xmm3 pshufd $0xaa, %xmm11, %xmm4 pshufd $0xff, %xmm11, %xmm6 movaps %xmm2, 48 * SIZE(BO) movaps %xmm3, 52 * SIZE(BO) movaps %xmm4, 56 * SIZE(BO) movaps %xmm6, 60 * SIZE(BO) #else movaps %xmm8, 0 * SIZE(AO) movaps %xmm10, 4 * SIZE(AO) movaps %xmm12, 8 * SIZE(AO) movaps %xmm14, 12 * SIZE(AO) #endif #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm10, %xmm1 unpckhps %xmm10, %xmm0 movaps %xmm5, %xmm7 unpcklps %xmm11, %xmm5 unpckhps %xmm11, %xmm7 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 movaps %xmm0, %xmm11 unpcklps %xmm7, %xmm0 unpckhps %xmm7, %xmm11 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movhps %xmm0, 2 * SIZE(CO1, LDC, 2) movlps %xmm11, 0 * SIZE(CO2, LDC, 2) movhps %xmm11, 2 * SIZE(CO2, LDC, 2) #else movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) movlps %xmm12, 0 * SIZE(CO1, LDC, 2) movhps %xmm12, 2 * SIZE(CO1, LDC, 2) movlps %xmm14, 0 * SIZE(CO2, LDC, 2) movhps %xmm14, 2 * SIZE(CO2, LDC, 2) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $16 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, 
%rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: testq $2, M je .L40 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 8 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L35 ALIGN_4 .L32: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movaps 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd 4 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movaps 80 * SIZE(BO), %xmm11 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm2 movaps 44 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 movsd 6 * SIZE(AO), %xmm8 addps %xmm13, %xmm3 movaps 96 * SIZE(BO), %xmm13 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm2 movaps 60 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 movsd 16 * SIZE(AO), %xmm8 addps %xmm15, %xmm3 movaps 112 * SIZE(BO), %xmm15 mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movaps 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movaps 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm2 movaps 76 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movsd 10 * SIZE(AO), %xmm10 addps %xmm9, %xmm3 movaps 128 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movaps 84 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movaps 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm2 movaps 92 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd 12 * SIZE(AO), %xmm10 addps %xmm11, %xmm3 movaps 144 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movaps 104 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movaps 108 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd 14 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movaps 160 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movaps 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movaps 124 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd 24 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movaps 176 * SIZE(BO), %xmm15 addq $ 16 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 
4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 16 * SIZE(BO), %xmm9 addq $ 2 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $1 + BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) unpcklps %xmm2, %xmm0 unpcklps %xmm3, %xmm1 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movapd 0 * SIZE(B), %xmm1 movapd 4 * SIZE(B), %xmm5 subps %xmm0, %xmm1 subps %xmm2, %xmm5 #else #ifdef movsd xorps %xmm8, %xmm8 #endif movsd 0 * SIZE(AO), %xmm8 #ifdef movsd xorps %xmm10, %xmm10 #endif movsd 2 * SIZE(AO), %xmm10 #ifdef movsd xorps %xmm12, %xmm12 #endif movsd 4 * SIZE(AO), %xmm12 #ifdef movsd xorps %xmm14, %xmm14 #endif movsd 6 * SIZE(AO), %xmm14 subps %xmm0, %xmm8 subps %xmm1, %xmm10 subps %xmm2, %xmm12 subps %xmm3, %xmm14 #endif #ifdef LN movaps 0 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm1 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm5 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm5 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 pshufd $0x55, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm14 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulps %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm14 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm14 movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm14 #endif #ifdef RT movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm14 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm12 pshufd $0x55, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm8 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm2, %xmm12 pshufd $0x55, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm8 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulps %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm8 movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm1, 0 * SIZE(B) movaps %xmm5, 4 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 pshufd $0xaa, %xmm1, %xmm4 pshufd $0xff, %xmm1, %xmm6 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) movaps %xmm4, 8 * SIZE(BO) movaps %xmm6, 12 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 pshufd $0x55, %xmm5, %xmm3 pshufd $0xaa, %xmm5, %xmm4 pshufd $0xff, %xmm5, %xmm6 movaps %xmm2, 16 * SIZE(BO) movaps %xmm3, 20 * SIZE(BO) movaps %xmm4, 24 * SIZE(BO) movaps 
%xmm6, 28 * SIZE(BO) #else movlps %xmm8, 0 * SIZE(AO) movlps %xmm10, 2 * SIZE(AO) movlps %xmm12, 4 * SIZE(AO) movlps %xmm14, 6 * SIZE(AO) #endif #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm10, %xmm1 unpckhps %xmm10, %xmm0 movaps %xmm5, %xmm7 unpcklps %xmm11, %xmm5 unpckhps %xmm11, %xmm7 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 movaps %xmm0, %xmm11 unpcklps %xmm7, %xmm0 unpckhps %xmm7, %xmm11 movlps %xmm1, 0 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO2) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movlps %xmm11, 0 * SIZE(CO2, LDC, 2) #else movlps %xmm8, 0 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO2) movlps %xmm12, 0 * SIZE(CO1, LDC, 2) movlps %xmm14, 0 * SIZE(CO2, LDC, 2) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L40: testq $1, M je .L49 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (AO, %rax, SIZE), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 movss 32 * SIZE(BO), %xmm13 movss 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L45 ALIGN_4 .L42: mulss %xmm8, %xmm9 addss %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 4 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 addss %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 addss %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addss %xmm9, %xmm3 movss 64 * SIZE(BO), %xmm9 mulss %xmm8, %xmm11 addss %xmm11, %xmm0 movss 20 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 addss %xmm11, %xmm1 movss 24 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 addss %xmm11, %xmm2 movss 28 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 movss 2 * SIZE(AO), %xmm8 addss %xmm11, %xmm3 movss 80 * SIZE(BO), %xmm11 mulss %xmm8, %xmm13 addss %xmm13, %xmm0 movss 36 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 addss %xmm13, %xmm1 movss 40 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 addss %xmm13, %xmm2 movss 44 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 movss 3 * SIZE(AO), %xmm8 addss %xmm13, %xmm3 movss 96 * SIZE(BO), %xmm13 mulss %xmm8, %xmm15 addss %xmm15, %xmm0 movss 52 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 addss %xmm15, %xmm1 movss 56 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 addss %xmm15, %xmm2 movss 60 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 movss 8 * SIZE(AO), %xmm8 addss %xmm15, %xmm3 movss 112 * SIZE(BO), %xmm15 mulss %xmm10, %xmm9 addss %xmm9, %xmm0 movss 68 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 addss %xmm9, %xmm1 movss 72 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 addss %xmm9, %xmm2 movss 76 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 movss 5 * SIZE(AO), %xmm10 addss %xmm9, %xmm3 movss 128 * SIZE(BO), %xmm9 mulss %xmm10, %xmm11 addss %xmm11, %xmm0 movss 84 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 addss %xmm11, %xmm1 movss 88 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 addss %xmm11, %xmm2 movss 92 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 movss 6 * 
SIZE(AO), %xmm10 addss %xmm11, %xmm3 movss 144 * SIZE(BO), %xmm11 mulss %xmm10, %xmm13 addss %xmm13, %xmm0 movss 100 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 addss %xmm13, %xmm1 movss 104 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 addss %xmm13, %xmm2 movss 108 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 movss 7 * SIZE(AO), %xmm10 addss %xmm13, %xmm3 movss 160 * SIZE(BO), %xmm13 mulss %xmm10, %xmm15 addss %xmm15, %xmm0 movss 116 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 addss %xmm15, %xmm1 movss 120 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 addss %xmm15, %xmm2 movss 124 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 movss 12 * SIZE(AO), %xmm10 addss %xmm15, %xmm3 movss 176 * SIZE(BO), %xmm15 addq $ 8 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L48 ALIGN_4 .L46: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movss 16 * SIZE(BO), %xmm9 addq $ 1 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L46 ALIGN_4 .L48: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) unpcklps %xmm2, %xmm0 unpcklps %xmm3, %xmm1 unpcklps %xmm1, %xmm0 movapd 0 * SIZE(B), %xmm1 subps %xmm0, %xmm1 #else movss 0 * SIZE(AO), %xmm8 movss 1 * SIZE(AO), %xmm10 movss 2 * SIZE(AO), %xmm12 movss 3 * SIZE(AO), %xmm14 subss %xmm0, %xmm8 subss %xmm1, %xmm10 subss %xmm2, %xmm12 subss %xmm3, %xmm14 #endif #if defined(LN) || defined(LT) movss 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulss %xmm2, %xmm8 pshufd $0x55, %xmm0, %xmm2 mulss %xmm8, %xmm2 subss %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulss %xmm8, %xmm2 subss %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulss %xmm8, %xmm2 subss %xmm2, %xmm14 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulss %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulss %xmm10, %xmm2 subss %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulss %xmm10, %xmm2 subss %xmm2, %xmm14 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulss %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulss %xmm12, %xmm2 subss %xmm2, %xmm14 movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulss %xmm2, %xmm14 #endif #ifdef RT movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulss %xmm2, %xmm14 pshufd $0xaa, %xmm0, %xmm2 mulss %xmm14, %xmm2 subss %xmm2, %xmm12 pshufd $0x55, %xmm0, %xmm2 mulss %xmm14, %xmm2 subss %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulss %xmm14, %xmm2 subss %xmm2, %xmm8 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulss %xmm2, %xmm12 pshufd $0x55, %xmm0, %xmm2 mulss %xmm12, %xmm2 subss %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulss %xmm12, %xmm2 subss %xmm2, %xmm8 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulss %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulss %xmm10, %xmm2 subss %xmm2, %xmm8 movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulss %xmm2, %xmm8 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm1, 0 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 
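/* LN/LT write-back for the M&1 remainder of the 4-column block: the
   solved values in %xmm1 are stored back into the packed B panel and
   re-broadcast into BUFFER (BO) before the results are written to C. */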
pshufd $0x55, %xmm1, %xmm3 pshufd $0xaa, %xmm1, %xmm4 pshufd $0xff, %xmm1, %xmm6 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) movaps %xmm4, 8 * SIZE(BO) movaps %xmm6, 12 * SIZE(BO) #else movss %xmm8, 0 * SIZE(AO) movss %xmm10, 1 * SIZE(AO) movss %xmm12, 2 * SIZE(AO) movss %xmm14, 3 * SIZE(AO) #endif #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm10, %xmm1 unpckhps %xmm10, %xmm0 movaps %xmm5, %xmm7 unpcklps %xmm11, %xmm5 unpckhps %xmm11, %xmm7 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 movaps %xmm0, %xmm11 unpcklps %xmm7, %xmm0 unpckhps %xmm7, %xmm11 movss %xmm1, 0 * SIZE(CO1) movss %xmm10, 0 * SIZE(CO2) movss %xmm0, 0 * SIZE(CO1, LDC, 2) movss %xmm11, 0 * SIZE(CO2, LDC, 2) #else movss %xmm8, 0 * SIZE(CO1) movss %xmm10, 0 * SIZE(CO2) movss %xmm12, 0 * SIZE(CO1, LDC, 2) movss %xmm14, 0 * SIZE(CO2, LDC, 2) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L49: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 4), B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif decq J # j -- jg .L01 .L50: testq $2, N je .L100 #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $1 + BASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L53 ALIGN_4 .L52: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L53: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L60 ALIGN_4 .L54: movsd 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) addq $2 * SIZE, B addq $8 * SIZE, BO decq %rax jne .L54 ALIGN_4 .L60: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c #ifndef RT leaq (C, LDC, 2), C #endif movq M, I sarq $3, I # i = (m >> 3) jle .L70 ALIGN_4 .L61: #ifdef LN movq K, %rax salq $3 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 32 * SIZE(AO), %xmm12 movaps 48 * SIZE(AO), %xmm14 movaps 0 * 
SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 PREFETCHW 4 * SIZE(CO1) pxor %xmm4, %xmm4 PREFETCHW 4 * SIZE(CO2) pxor %xmm5, %xmm5 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulps %xmm8, %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 8 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 12 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 64 * SIZE(AO), %xmm8 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 16 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps 20 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps 24 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps 28 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 80 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps 80 * SIZE(AO), %xmm10 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 32 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 36 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm5 movaps 40 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 44 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 addps %xmm12, %xmm5 movaps 96 * SIZE(BO), %xmm13 movaps 96 * SIZE(AO), %xmm12 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 48 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 52 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 56 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 60 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 movaps 112 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 112 * SIZE(AO), %xmm14 addq $64 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 8 * SIZE(AO), %xmm8 addq $8 * SIZE, AO # aoffset 
+= 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L68: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $8, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $1 + BASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm8 unpcklps %xmm2, %xmm0 unpckhps %xmm2, %xmm8 movaps %xmm1, %xmm14 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm14 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movaps %xmm8, %xmm3 unpcklps %xmm14, %xmm8 unpckhps %xmm14, %xmm3 movaps %xmm4, %xmm9 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm9 movaps %xmm5, %xmm14 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm14 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm9, %xmm7 unpcklps %xmm14, %xmm9 unpckhps %xmm14, %xmm7 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(B), %xmm1 #ifdef movsd xorps %xmm5, %xmm5 #endif movsd 2 * SIZE(B), %xmm5 #ifdef movsd xorps %xmm10, %xmm10 #endif movsd 4 * SIZE(B), %xmm10 #ifdef movsd xorps %xmm11, %xmm11 #endif movsd 6 * SIZE(B), %xmm11 #ifdef movsd xorps %xmm12, %xmm12 #endif movsd 8 * SIZE(B), %xmm12 #ifdef movsd xorps %xmm13, %xmm13 #endif movsd 10 * SIZE(B), %xmm13 #ifdef movsd xorps %xmm14, %xmm14 #endif movsd 12 * SIZE(B), %xmm14 #ifdef movsd xorps %xmm15, %xmm15 #endif movsd 14 * SIZE(B), %xmm15 subps %xmm0, %xmm1 subps %xmm2, %xmm5 subps %xmm8, %xmm10 subps %xmm3, %xmm11 subps %xmm4, %xmm12 subps %xmm6, %xmm13 subps %xmm9, %xmm14 subps %xmm7, %xmm15 #else movaps 0 * SIZE(AO), %xmm8 movaps 4 * SIZE(AO), %xmm9 movaps 8 * SIZE(AO), %xmm10 movaps 12 * SIZE(AO), %xmm11 subps %xmm0, %xmm8 subps %xmm4, %xmm9 subps %xmm1, %xmm10 subps %xmm5, %xmm11 #endif #ifdef LN movaps 60 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm15 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm14 pshufd $0x55, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm12 movaps 56 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm1 movaps 52 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm14 pshufd $0x55, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm12 movaps 48 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm1 movaps 44 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm12 movaps 40 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm1 movaps 36 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm12 movaps 32 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm12, 
%xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm1 movaps 24 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm1 movaps 16 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm1 movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm11 movaps 4 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm15 movaps 8 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm15 movaps 16 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm11 movaps 20 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm15 movaps 24 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 movaps 28 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm15 movaps 36 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm12 pshufd $0x55, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm15 movaps 44 * SIZE(AO), %xmm7 pshufd $0x55, %xmm7, %xmm8 mulps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm15 movaps 52 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm14 pshufd $0xff, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm15 movaps 60 * SIZE(AO), %xmm7 pshufd $0xff, %xmm7, %xmm8 mulps %xmm8, %xmm15 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 mulps %xmm2, %xmm9 pshufd $0x55, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm10 pshufd 
$0x55, %xmm0, %xmm2 mulps %xmm9, %xmm2 subps %xmm2, %xmm11 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm10 mulps %xmm2, %xmm11 #endif #ifdef RT movaps 0 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm10 mulps %xmm2, %xmm11 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm8 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm11, %xmm2 subps %xmm2, %xmm9 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 mulps %xmm2, %xmm9 #endif #ifdef LN subq $8 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm1, 0 * SIZE(B) movlps %xmm5, 2 * SIZE(B) movlps %xmm10, 4 * SIZE(B) movlps %xmm11, 6 * SIZE(B) movlps %xmm12, 8 * SIZE(B) movlps %xmm13, 10 * SIZE(B) movlps %xmm14, 12 * SIZE(B) movlps %xmm15, 14 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 pshufd $0x55, %xmm5, %xmm3 movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) pshufd $0x00, %xmm10, %xmm2 pshufd $0x55, %xmm10, %xmm3 movaps %xmm2, 16 * SIZE(BO) movaps %xmm3, 20 * SIZE(BO) pshufd $0x00, %xmm11, %xmm2 pshufd $0x55, %xmm11, %xmm3 movaps %xmm2, 24 * SIZE(BO) movaps %xmm3, 28 * SIZE(BO) pshufd $0x00, %xmm12, %xmm2 pshufd $0x55, %xmm12, %xmm3 movaps %xmm2, 32 * SIZE(BO) movaps %xmm3, 36 * SIZE(BO) pshufd $0x00, %xmm13, %xmm2 pshufd $0x55, %xmm13, %xmm3 movaps %xmm2, 40 * SIZE(BO) movaps %xmm3, 44 * SIZE(BO) pshufd $0x00, %xmm14, %xmm2 pshufd $0x55, %xmm14, %xmm3 movaps %xmm2, 48 * SIZE(BO) movaps %xmm3, 52 * SIZE(BO) pshufd $0x00, %xmm15, %xmm2 pshufd $0x55, %xmm15, %xmm3 movaps %xmm2, 56 * SIZE(BO) movaps %xmm3, 60 * SIZE(BO) #else movaps %xmm8, 0 * SIZE(AO) movaps %xmm9, 4 * SIZE(AO) movaps %xmm10, 8 * SIZE(AO) movaps %xmm11, 12 * SIZE(AO) #endif #if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 unpcklps %xmm14, %xmm12 unpcklps %xmm15, %xmm13 movaps %xmm12, %xmm14 unpcklps %xmm13, %xmm12 unpckhps %xmm13, %xmm14 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO1, LDC, 1) movhps %xmm10, 2 * SIZE(CO1, LDC, 1) movlps %xmm14, 4 * SIZE(CO1, LDC, 1) movhps %xmm14, 6 * SIZE(CO1, LDC, 1) #else movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm9, 4 * SIZE(CO1) movhps %xmm9, 6 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO1, LDC, 1) movhps %xmm10, 2 * SIZE(CO1, LDC, 1) movlps %xmm11, 4 * SIZE(CO1, LDC, 1) movhps %xmm11, 6 * SIZE(CO1, LDC, 1) #endif #ifndef LN addq $8 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 8), AO #ifdef LT addq $16 * SIZE, B #endif #endif #ifdef LN subq $8, KK movq BORIG, B #endif #ifdef LT addq $8, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $3 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L61 ALIGN_4 .L70: testq $4, M je .L80 #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax 
subq KK, %rax #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulps %xmm8, %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 8 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 20 * SIZE(BO), %xmm8 addps %xmm11, %xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm8, %xmm1 movaps 12 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps 32 * SIZE(AO), %xmm8 mulps %xmm10, %xmm13 mulps 36 * SIZE(BO), %xmm10 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm10, %xmm1 movaps 20 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 mulps 44 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps 24 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 52 * SIZE(BO), %xmm10 addps %xmm15, %xmm0 movaps 56 * SIZE(BO), %xmm15 addps %xmm10, %xmm1 movaps 28 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 60 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 48 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L76 ALIGN_4 .L78: addps %xmm2, %xmm0 addps %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $1 + BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm8 unpcklps %xmm2, %xmm0 unpckhps %xmm2, %xmm8 movaps %xmm1, %xmm14 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm14 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movaps %xmm8, %xmm3 unpcklps %xmm14, %xmm8 unpckhps %xmm14, %xmm3 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(B), %xmm1 #ifdef movsd xorps %xmm5, %xmm5 #endif movsd 2 * SIZE(B), %xmm5 #ifdef movsd xorps %xmm10, %xmm10 #endif movsd 4 * SIZE(B), %xmm10 #ifdef movsd xorps %xmm11, %xmm11 #endif movsd 6 * SIZE(B), %xmm11 subps %xmm0, %xmm1 subps %xmm2, %xmm5 subps %xmm8, %xmm10 subps %xmm3, %xmm11 #else movaps 0 * SIZE(AO), %xmm8 movaps 4 * SIZE(AO), %xmm10 subps %xmm0, %xmm8 subps %xmm1, %xmm10 #endif #ifdef LN movaps 12 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm1 movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps 
%xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm11 movaps 4 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm11 movaps 8 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 pshufd $0x55, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm10 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm10 #endif #ifdef RT movaps 0 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm8 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm1, 0 * SIZE(B) movlps %xmm5, 2 * SIZE(B) movlps %xmm10, 4 * SIZE(B) movlps %xmm11, 6 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 pshufd $0x55, %xmm5, %xmm3 movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) pshufd $0x00, %xmm10, %xmm2 pshufd $0x55, %xmm10, %xmm3 movaps %xmm2, 16 * SIZE(BO) movaps %xmm3, 20 * SIZE(BO) pshufd $0x00, %xmm11, %xmm2 pshufd $0x55, %xmm11, %xmm3 movaps %xmm2, 24 * SIZE(BO) movaps %xmm3, 28 * SIZE(BO) #else movaps %xmm8, 0 * SIZE(AO) movaps %xmm10, 4 * SIZE(AO) #endif #if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO1, LDC, 1) movhps %xmm10, 2 * SIZE(CO1, LDC, 1) #else movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO1, LDC, 1) movhps %xmm10, 2 * SIZE(CO1, LDC, 1) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $ 8 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L80: testq $2, M je .L90 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 8 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L85 ALIGN_4 .L82: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 4 * SIZE(AO), %xmm8 
addps %xmm9, %xmm3 movaps 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd 6 * SIZE(AO), %xmm8 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movaps 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd 16 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movaps 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd 10 * SIZE(AO), %xmm10 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movaps 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd 12 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movaps 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd 14 * SIZE(AO), %xmm10 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movaps 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd 24 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movaps 112 * SIZE(BO), %xmm15 addq $16 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L82 ALIGN_4 .L85: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L88 ALIGN_4 .L86: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L86 ALIGN_4 .L88: addps %xmm2, %xmm0 addps %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $1 + BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) unpcklps %xmm2, %xmm0 unpcklps %xmm3, %xmm1 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(B), %xmm1 #ifdef movsd xorps %xmm5, %xmm5 #endif movsd 2 * SIZE(B), %xmm5 subps %xmm0, %xmm1 subps %xmm2, %xmm5 #else #ifdef movsd xorps %xmm8, %xmm8 #endif movsd 0 * SIZE(AO), %xmm8 #ifdef movsd xorps %xmm10, %xmm10 #endif movsd 2 * SIZE(AO), %xmm10 subps %xmm0, %xmm8 subps %xmm1, %xmm10 #endif #ifdef LN movaps 0 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm1 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm5 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm5 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 pshufd $0x55, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm10 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm10 #endif #ifdef RT movaps 0 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm8 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm1, 0 * SIZE(B) movlps %xmm5, 2 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 pshufd $0x55, %xmm5, %xmm3 movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) #else movlps %xmm8, 0 * SIZE(AO) movlps %xmm10, 2 * SIZE(AO) #endif #if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 
unpcklps %xmm11, %xmm5 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 movlps %xmm1, 0 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO1, LDC, 1) #else movlps %xmm8, 0 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO1, LDC, 1) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $ 4 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L90: testq $1, M je .L99 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (AO, %rax, SIZE), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 movss 32 * SIZE(BO), %xmm13 movss 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L95 ALIGN_4 .L92: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movss 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movss 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movss 3 * SIZE(AO), %xmm8 addps %xmm11, %xmm1 movss 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movss 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movss 8 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movss 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movss 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movss 5 * SIZE(AO), %xmm10 addps %xmm13, %xmm1 movss 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movss 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movss 6 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movss 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movss 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movss 7 * SIZE(AO), %xmm10 addps %xmm15, %xmm1 movss 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movss 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movss 12 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movss 112 * SIZE(BO), %xmm15 addq $ 8 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L92 ALIGN_4 .L95: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L96 ALIGN_4 .L98: addss %xmm2, %xmm0 addss %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) unpcklps %xmm1, %xmm0 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(B), %xmm1 subps %xmm0, %xmm1 #else movss 0 * SIZE(AO), %xmm8 
movss 1 * SIZE(AO), %xmm10 subss %xmm0, %xmm8 subss %xmm1, %xmm10 #endif #if defined(LN) || defined(LT) movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulss %xmm2, %xmm8 pshufd $0x55, %xmm0, %xmm2 mulss %xmm8, %xmm2 subss %xmm2, %xmm10 pshufd $0xff, %xmm0, %xmm2 mulss %xmm2, %xmm10 #endif #ifdef RT movaps 0 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulss %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulss %xmm10, %xmm2 subss %xmm2, %xmm8 pshufd $0x00, %xmm0, %xmm2 mulss %xmm2, %xmm8 #endif #ifdef LN subq $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm1, 0 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) #else movss %xmm8, 0 * SIZE(AO) movss %xmm10, 1 * SIZE(AO) #endif #if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 movss %xmm1, 0 * SIZE(CO1) movss %xmm10, 0 * SIZE(CO1, LDC, 1) #else movss %xmm8, 0 * SIZE(CO1) movss %xmm10, 0 * SIZE(CO1, LDC, 1) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (AO, %rax, SIZE), AO #ifdef LT addq $ 2 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L99: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 2), B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L100: testq $1, N je .L999 #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $BASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax jle .L103 ALIGN_4 .L102: movsd 0 * SIZE(B), %xmm3 movhps 2 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm7 movhps 6 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L102 ALIGN_4 .L103: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax BRANCH jle .L110 ALIGN_4 .L104: movss 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 movaps %xmm0, 0 * SIZE(BO) addq $ 1 * SIZE, B addq $ 4 * SIZE, BO decq %rax jne .L104 ALIGN_4 .L110: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT subq LDC, C #endif movq C, CO1 # coffset1 = c #ifndef RT addq LDC, C #endif movq M, I sarq $3, I # i = (m >> 3) jle .L120 ALIGN_4 .L111: #ifdef LN movq K, %rax salq $3 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax 
salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 32 * SIZE(AO), %xmm12 movaps 48 * SIZE(AO), %xmm14 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 PREFETCHW 4 * SIZE(CO1) pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L115 ALIGN_4 .L112: mulps %xmm9, %xmm8 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 mulps %xmm9, %xmm8 mulps 12 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps 64 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm9, %xmm10 mulps 20 * SIZE(AO), %xmm9 addps %xmm10, %xmm0 movaps 24 * SIZE(AO), %xmm10 addps %xmm9, %xmm4 movaps 12 * SIZE(BO), %xmm9 mulps %xmm9, %xmm10 mulps 28 * SIZE(AO), %xmm9 addps %xmm10, %xmm0 movaps 80 * SIZE(AO), %xmm10 addps %xmm9, %xmm4 movaps 32 * SIZE(BO), %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm11, %xmm12 mulps 36 * SIZE(AO), %xmm11 addps %xmm12, %xmm0 movaps 40 * SIZE(AO), %xmm12 addps %xmm11, %xmm4 movaps 20 * SIZE(BO), %xmm11 mulps %xmm11, %xmm12 mulps 44 * SIZE(AO), %xmm11 addps %xmm12, %xmm0 movaps 96 * SIZE(AO), %xmm12 addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm11, %xmm14 mulps 52 * SIZE(AO), %xmm11 addps %xmm14, %xmm0 movaps 56 * SIZE(AO), %xmm14 addps %xmm11, %xmm4 movaps 28 * SIZE(BO), %xmm11 mulps %xmm11, %xmm14 mulps 60 * SIZE(AO), %xmm11 addps %xmm14, %xmm0 movaps 112 * SIZE(AO), %xmm14 addps %xmm11, %xmm4 movaps 48 * SIZE(BO), %xmm11 addq $64 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L112 ALIGN_4 .L115: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulps %xmm9, %xmm8 mulps 4 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 addq $8 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L116 ALIGN_4 .L118: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $8, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $BASE_SHIFT, %rax leaq (AO, %rax, 8), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm8 unpcklps %xmm2, %xmm0 unpckhps %xmm2, %xmm8 movaps %xmm1, %xmm14 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm14 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movaps %xmm8, %xmm3 unpcklps %xmm14, %xmm8 unpckhps %xmm14, %xmm3 movaps %xmm4, %xmm9 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm9 movaps %xmm5, %xmm14 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm14 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm9, %xmm7 unpcklps %xmm14, %xmm9 unpckhps %xmm14, %xmm7 movss 0 * SIZE(B), %xmm1 movss 1 * SIZE(B), %xmm5 movss 2 * SIZE(B), %xmm10 movss 3 * SIZE(B), %xmm11 movss 4 * SIZE(B), %xmm12 movss 5 * SIZE(B), %xmm13 movss 6 * SIZE(B), %xmm14 movss 7 * SIZE(B), %xmm15 subss %xmm0, %xmm1 subss %xmm2, %xmm5 subss %xmm8, %xmm10 subss %xmm3, 
%xmm11 subss %xmm4, %xmm12 subss %xmm6, %xmm13 subss %xmm9, %xmm14 subss %xmm7, %xmm15 #else movaps 0 * SIZE(AO), %xmm8 movaps 4 * SIZE(AO), %xmm9 subps %xmm0, %xmm8 subps %xmm4, %xmm9 #endif #ifdef LN movaps 60 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm15 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm15, %xmm8 subss %xmm8, %xmm14 pshufd $0x55, %xmm6, %xmm8 mulss %xmm15, %xmm8 subss %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulss %xmm15, %xmm8 subss %xmm8, %xmm12 movaps 56 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm15, %xmm8 subss %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm15, %xmm8 subss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm15, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm15, %xmm8 subss %xmm8, %xmm1 movaps 52 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm8, %xmm14 pshufd $0x55, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm12 movaps 48 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm1 movaps 44 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulss %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm12 movaps 40 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm1 movaps 36 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm12 movaps 32 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm1 movaps 24 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm1 movaps 16 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm1 movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm11 movaps 4 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm15 movaps 8 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulss %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm11 
movaps 12 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm15 movaps 16 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm11 movaps 20 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm15 movaps 24 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm11 movaps 28 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm15 movaps 36 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm12 pshufd $0x55, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm13 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm15 movaps 44 * SIZE(AO), %xmm7 pshufd $0x55, %xmm7, %xmm8 mulss %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm15 movaps 52 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm8, %xmm14 pshufd $0xff, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm15 movaps 60 * SIZE(AO), %xmm7 pshufd $0xff, %xmm7, %xmm8 mulss %xmm8, %xmm15 #endif #if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 mulps %xmm2, %xmm9 #endif #ifdef LN subq $8 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) movss %xmm5, 1 * SIZE(B) movss %xmm10, 2 * SIZE(B) movss %xmm11, 3 * SIZE(B) movss %xmm12, 4 * SIZE(B) movss %xmm13, 5 * SIZE(B) movss %xmm14, 6 * SIZE(B) movss %xmm15, 7 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 movaps %xmm2, 0 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 movaps %xmm2, 4 * SIZE(BO) pshufd $0x00, %xmm10, %xmm2 movaps %xmm2, 8 * SIZE(BO) pshufd $0x00, %xmm11, %xmm2 movaps %xmm2, 12 * SIZE(BO) pshufd $0x00, %xmm12, %xmm2 movaps %xmm2, 16 * SIZE(BO) pshufd $0x00, %xmm13, %xmm2 movaps %xmm2, 20 * SIZE(BO) pshufd $0x00, %xmm14, %xmm2 movaps %xmm2, 24 * SIZE(BO) pshufd $0x00, %xmm15, %xmm2 movaps %xmm2, 28 * SIZE(BO) #else movaps %xmm8, 0 * SIZE(AO) movaps %xmm9, 4 * SIZE(AO) #endif #if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 unpcklps %xmm5, %xmm1 unpcklps %xmm14, %xmm12 unpcklps %xmm15, %xmm13 unpcklps %xmm13, %xmm12 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) #else movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm9, 4 * SIZE(CO1) movhps %xmm9, 6 * SIZE(CO1) #endif #ifndef LN addq $8 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 8), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $8, KK movq BORIG, B #endif #ifdef LT addq $8, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $3 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L111 ALIGN_4 .L120: testq $4, M je .L130 #ifdef LN movq K, 
%rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L125 ALIGN_4 .L122: mulps %xmm8, %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(AO), %xmm8 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 32 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm8 mulps 8 * SIZE(BO), %xmm8 addps %xmm8, %xmm2 movaps 12 * SIZE(AO), %xmm8 mulps 12 * SIZE(BO), %xmm8 addps %xmm8, %xmm3 movaps 32 * SIZE(AO), %xmm8 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm11 movaps 20 * SIZE(AO), %xmm10 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 48 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps 24 * SIZE(AO), %xmm10 mulps 24 * SIZE(BO), %xmm10 addps %xmm10, %xmm2 movaps 28 * SIZE(AO), %xmm10 mulps 28 * SIZE(BO), %xmm10 addps %xmm10, %xmm3 movaps 48 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L122 ALIGN_4 .L125: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L128 ALIGN_4 .L126: mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L126 ALIGN_4 .L128: addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm2, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $BASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm8 unpcklps %xmm2, %xmm0 unpckhps %xmm2, %xmm8 movaps %xmm1, %xmm14 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm14 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movaps %xmm8, %xmm3 unpcklps %xmm14, %xmm8 unpckhps %xmm14, %xmm3 movss 0 * SIZE(B), %xmm1 movss 1 * SIZE(B), %xmm5 movss 2 * SIZE(B), %xmm10 movss 3 * SIZE(B), %xmm11 subss %xmm0, %xmm1 subss %xmm2, %xmm5 subss %xmm8, %xmm10 subss %xmm3, %xmm11 #else movaps 0 * SIZE(AO), %xmm8 subps %xmm0, %xmm8 #endif #ifdef LN movaps 12 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm1 movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulss 
%xmm1, %xmm8 subss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm11 movaps 4 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulss %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm11 movaps 8 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm11 #endif #if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) movss %xmm5, 1 * SIZE(B) movss %xmm10, 2 * SIZE(B) movss %xmm11, 3 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 movaps %xmm2, 0 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 movaps %xmm2, 4 * SIZE(BO) pshufd $0x00, %xmm10, %xmm2 movaps %xmm2, 8 * SIZE(BO) pshufd $0x00, %xmm11, %xmm2 movaps %xmm2, 12 * SIZE(BO) #else movaps %xmm8, 0 * SIZE(AO) #endif #if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 unpcklps %xmm5, %xmm1 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) #else movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L130: testq $2, M je .L140 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 8 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L135 ALIGN_4 .L132: mulps %xmm8, %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 6 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 16 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 32 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 movsd 10 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd 12 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd 14 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movaps 28 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd 24 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movaps 48 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L132 ALIGN_4 .L135: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L138 ALIGN_4 .L136: mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L136 ALIGN_4 .L138: addps 
%xmm1, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) unpcklps %xmm2, %xmm0 unpcklps %xmm3, %xmm1 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movss 0 * SIZE(B), %xmm1 movss 1 * SIZE(B), %xmm5 subss %xmm0, %xmm1 subss %xmm2, %xmm5 #else #ifdef movsd xorps %xmm8, %xmm8 #endif movsd 0 * SIZE(AO), %xmm8 subps %xmm0, %xmm8 #endif #ifdef LN movaps 0 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm1 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm5 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm5 #endif #if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) movss %xmm5, 1 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 movaps %xmm2, 0 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 movaps %xmm2, 4 * SIZE(BO) #else movlps %xmm8, 0 * SIZE(AO) #endif #if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 unpcklps %xmm5, %xmm1 movlps %xmm1, 0 * SIZE(CO1) #else movlps %xmm8, 0 * SIZE(CO1) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L140: testq $1, M je .L149 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (AO, %rax, SIZE), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L145 ALIGN_4 .L142: mulss %xmm8, %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 1 * SIZE(AO), %xmm8 mulss 4 * SIZE(BO), %xmm8 addss %xmm9, %xmm0 movss 32 * SIZE(BO), %xmm9 addss %xmm8, %xmm1 movss 2 * SIZE(AO), %xmm8 mulss 8 * SIZE(BO), %xmm8 addss %xmm8, %xmm2 movss 3 * SIZE(AO), %xmm8 mulss 12 * SIZE(BO), %xmm8 addss %xmm8, %xmm3 movss 8 * SIZE(AO), %xmm8 mulss %xmm10, %xmm11 movss 5 * SIZE(AO), %xmm10 mulss 20 * SIZE(BO), %xmm10 addss %xmm11, %xmm0 movss 48 * SIZE(BO), %xmm11 addss %xmm10, %xmm1 movss 6 * SIZE(AO), %xmm10 mulss 24 * SIZE(BO), %xmm10 addss %xmm10, %xmm2 movss 7 * SIZE(AO), %xmm10 mulss 28 * SIZE(BO), %xmm10 addss %xmm10, %xmm3 movss 12 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L142 ALIGN_4 .L145: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L148 ALIGN_4 .L146: mulss %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addss %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 addq $1 * SIZE, AO addq $4 * SIZE, BO decq 
%rax jg .L146 ALIGN_4 .L148: addss %xmm1, %xmm0 addss %xmm3, %xmm2 addss %xmm2, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movss 0 * SIZE(B), %xmm1 subss %xmm0, %xmm1 #else movss 0 * SIZE(AO), %xmm8 subps %xmm0, %xmm8 #endif #if defined(LN) || defined(LT) mulss 0 * SIZE(AO), %xmm1 #endif #if defined(RN) || defined(RT) mulss 0 * SIZE(B), %xmm8 #endif #ifdef LN subq $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 movaps %xmm2, 0 * SIZE(BO) #else movss %xmm8, 0 * SIZE(AO) #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(CO1) #else movss %xmm8, 0 * SIZE(CO1) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $1 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L149: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 1), B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq %rbx, %rsp EMMS movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_RT_2x8_nehalem.S000066400000000000000000001552201313527062700226370ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define KK %rdx #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 512 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCHSIZE (8 * 1 - 4) #define PREFETCH prefetcht0 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movaps %xmm3, %xmm0 #endif subq $-16 * SIZE, A subq $-16 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K movq OLD_LDC, LDC movq OLD_OFFSET, KK leaq (, LDC, SIZE), LDC movq KK, OFFSET negq KK #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif testq $1, N jle .L30 ALIGN_4 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $1, I NOBRANCH jle .L80 ALIGN_4 .L71: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm8, %xmm8 prefetcht0 2 * SIZE(CO1) xorps %xmm9, %xmm9 xorps %xmm10, 
%xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L75 ALIGN_3 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 movddup -16 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movddup -15 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -10 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movddup -13 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L72 addpd %xmm9, %xmm8 ALIGN_3 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: addpd %xmm1, %xmm8 movddup -16 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L76 ALIGN_4 .L78: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif addpd %xmm1, %xmm8 #if defined(LN) || defined(LT) movaps -16 * SIZE(BO), %xmm0 #else movaps -16 * SIZE(AO), %xmm0 #endif subpd %xmm8, %xmm0 #if defined(LN) || defined(LT) pshufd $0xe, %xmm0, %xmm1 #endif #ifdef LN movsd -13 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm1 movsd -14 * SIZE(AO), %xmm12 mulsd %xmm1, %xmm12 subsd %xmm12, %xmm0 movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 #endif #ifdef LT movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 movsd -15 * SIZE(AO), %xmm12 mulsd %xmm0, %xmm12 subsd %xmm12, %xmm1 movsd -13 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm1 #endif #if defined(LN) || defined(LT) unpcklpd %xmm1, %xmm0 #endif #if defined(RN) || defined(RT) movddup -16 * SIZE(BO), %xmm10 mulpd %xmm10, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movaps %xmm0, -16 * SIZE(BO) #else movaps %xmm0, -16 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L71 ALIGN_4 .L80: testq $1, M BRANCH jle .L89 ALIGN_4 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movsd -16 * SIZE(AO), %xmm0 movhps -15 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movsd -16 * SIZE(BO), %xmm1 movhps -15 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L85 ALIGN_3 .L82: mulpd %xmm0, %xmm1 movsd -14 * SIZE(AO), %xmm0 movhps -13 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movsd -14 * SIZE(BO), %xmm1 movhps -13 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movsd -12 * SIZE(AO), %xmm0 movhps -11 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movsd -12 * SIZE(BO), %xmm1 movhps -11 * SIZE(BO), %xmm1 subq $-4 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L82 addpd %xmm9, %xmm8 ALIGN_3 .L85: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L88 ALIGN_3 .L86: mulsd %xmm0, %xmm1 
movsd -15 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd -15 * SIZE(BO), %xmm1 addq $1 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L86 ALIGN_4 .L88: #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif haddpd %xmm8, %xmm8 #if defined(LN) || defined(LT) movsd -16 * SIZE(BO), %xmm0 #else movsd -16 * SIZE(AO), %xmm0 #endif subsd %xmm8, %xmm0 #if defined(LN) || defined(LT) movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 #endif #if defined(RN) || defined(RT) movsd -16 * SIZE(BO), %xmm10 mulsd %xmm10, %xmm0 #endif #ifdef LN subq $1 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) #if defined(LN) || defined(LT) movsd %xmm0, -16 * SIZE(BO) #else movsd %xmm0, -16 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L89: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L30: testq $2, N jle .L50 ALIGN_4 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $1, I NOBRANCH jle .L60 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm8, %xmm8 prefetcht0 2 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 2 * SIZE(CO2) xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_3 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm10 movaps -14 * SIZE(BO), %xmm1 addpd %xmm2, %xmm11 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AO), %xmm0 addpd %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 addpd %xmm2, %xmm11 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L52 addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 ALIGN_3 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_3 .L56: addpd %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_4 .L58: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, 
%rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif addpd %xmm1, %xmm8 addpd %xmm2, %xmm9 #if defined(LN) || defined(LT) movaps %xmm8, %xmm0 shufpd $0, %xmm9, %xmm8 shufpd $3, %xmm0, %xmm9 movaps -16 * SIZE(BO), %xmm0 movaps -14 * SIZE(BO), %xmm1 #else movaps %xmm8, %xmm0 shufpd $2, %xmm9, %xmm8 shufpd $2, %xmm0, %xmm9 movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 #endif subpd %xmm8, %xmm0 subpd %xmm9, %xmm1 #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 movddup -14 * SIZE(AO), %xmm12 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm0 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(AO), %xmm12 mulpd %xmm0, %xmm12 subpd %xmm12, %xmm1 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm10 mulpd %xmm10, %xmm0 movddup -15 * SIZE(BO), %xmm11 mulpd %xmm0, %xmm11 subpd %xmm11, %xmm1 movddup -13 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm1 #endif #ifdef RT movddup -13 * SIZE(BO), %xmm14 mulpd %xmm14, %xmm1 movddup -14 * SIZE(BO), %xmm15 mulpd %xmm1, %xmm15 subpd %xmm15, %xmm0 movddup -16 * SIZE(BO), %xmm15 mulpd %xmm15, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 1 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO2) movhps %xmm1, 1 * SIZE(CO2) #else movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movhps %xmm1, 1 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movaps %xmm0, -16 * SIZE(BO) movaps %xmm1, -14 * SIZE(BO) #else movaps %xmm0, -16 * SIZE(AO) movaps %xmm1, -14 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L51 ALIGN_4 .L60: testq $1, M BRANCH jle .L69 ALIGN_4 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -16 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_3 .L62: mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -13 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -8 * SIZE(BO), %xmm1 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L62 ALIGN_3 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_3 .L66: mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_4 .L68: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif addpd %xmm9, 
%xmm8 #if defined(LN) || defined(LT) movaps -16 * SIZE(BO), %xmm0 #else movaps -16 * SIZE(AO), %xmm0 #endif subpd %xmm8, %xmm0 #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 #endif #if defined(RN) || defined(RT) pshufd $0xe, %xmm0, %xmm1 #endif #ifdef RN movsd -16 * SIZE(BO), %xmm10 mulsd %xmm10, %xmm0 movsd -15 * SIZE(BO), %xmm11 mulsd %xmm0, %xmm11 subsd %xmm11, %xmm1 movsd -13 * SIZE(BO), %xmm11 mulsd %xmm11, %xmm1 #endif #ifdef RT movsd -13 * SIZE(BO), %xmm14 mulsd %xmm14, %xmm1 movsd -14 * SIZE(BO), %xmm15 mulsd %xmm1, %xmm15 subsd %xmm15, %xmm0 movsd -16 * SIZE(BO), %xmm15 mulsd %xmm15, %xmm0 #endif #if defined(RN) || defined(RT) unpcklpd %xmm1, %xmm0 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO2) #if defined(LN) || defined(LT) movaps %xmm0, -16 * SIZE(BO) #else movaps %xmm0, -16 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L69: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L50: testq $4, N jle .L70 ALIGN_4 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 2), CO2 #ifndef RT leaq (C, LDC, 4), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $1, I NOBRANCH jle .L40 ALIGN_4 .L31: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht0 2 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 2 * SIZE(CO1, LDC, 1) xorps %xmm10, %xmm10 prefetcht0 2 * SIZE(CO2) xorps %xmm11, %xmm11 prefetcht0 2 * SIZE(CO2, LDC, 1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_3 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -8 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -6 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -10 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 
mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -2 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L32 ALIGN_3 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: addpd %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif addpd %xmm1, %xmm8 addpd %xmm2, %xmm9 addpd %xmm3, %xmm10 addpd %xmm4, %xmm11 #if defined(LN) || defined(LT) movaps %xmm8, %xmm0 shufpd $0, %xmm9, %xmm8 shufpd $3, %xmm0, %xmm9 movaps %xmm10, %xmm0 shufpd $0, %xmm11, %xmm10 shufpd $3, %xmm0, %xmm11 movaps -16 * SIZE(BO), %xmm0 movaps -14 * SIZE(BO), %xmm2 movaps -12 * SIZE(BO), %xmm1 movaps -10 * SIZE(BO), %xmm3 #else movaps %xmm8, %xmm0 shufpd $2, %xmm9, %xmm8 shufpd $2, %xmm0, %xmm9 movaps %xmm10, %xmm0 shufpd $2, %xmm11, %xmm10 shufpd $2, %xmm0, %xmm11 movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 movaps -12 * SIZE(AO), %xmm2 movaps -10 * SIZE(AO), %xmm3 #endif subpd %xmm8, %xmm0 subpd %xmm9, %xmm1 subpd %xmm10, %xmm2 subpd %xmm11, %xmm3 #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 movddup -14 * SIZE(AO), %xmm12 movaps %xmm12, %xmm13 mulpd %xmm1, %xmm12 mulpd %xmm3, %xmm13 subpd %xmm12, %xmm0 subpd %xmm13, %xmm2 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm2 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm2 movddup -15 * SIZE(AO), %xmm12 movaps %xmm12, %xmm13 mulpd %xmm0, %xmm12 mulpd %xmm2, %xmm13 subpd %xmm12, %xmm1 subpd %xmm13, %xmm3 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm1 movddup -14 * SIZE(BO), %xmm10 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm2 movddup -13 * SIZE(BO), %xmm11 mulpd %xmm0, %xmm11 subpd %xmm11, %xmm3 movddup -11 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm1 movddup -10 * SIZE(BO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm2 movddup -9 * SIZE(BO), %xmm11 mulpd %xmm1, %xmm11 subpd %xmm11, %xmm3 movddup -6 * SIZE(BO), %xmm10 mulpd %xmm10, %xmm2 movddup -5 * SIZE(BO), %xmm11 mulpd %xmm2, %xmm11 subpd %xmm11, %xmm3 movddup -1 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm3 #endif #ifdef RT movddup -1 * SIZE(BO), %xmm12 mulpd %xmm12, %xmm3 movddup -2 * SIZE(BO), %xmm13 mulpd %xmm3, %xmm13 subpd %xmm13, %xmm2 movddup -3 * SIZE(BO), %xmm14 mulpd %xmm3, %xmm14 subpd %xmm14, %xmm1 movddup -4 * SIZE(BO), %xmm15 mulpd %xmm3, %xmm15 subpd %xmm15, %xmm0 movddup -6 * SIZE(BO), %xmm13 mulpd %xmm13, %xmm2 movddup -7 * SIZE(BO), %xmm14 mulpd %xmm2, %xmm14 subpd %xmm14, %xmm1 movddup -8 * SIZE(BO), %xmm15 mulpd %xmm2, %xmm15 subpd %xmm15, %xmm0 movddup -11 * SIZE(BO), %xmm14 mulpd %xmm14, %xmm1 movddup -12 * SIZE(BO), %xmm15 mulpd %xmm1, %xmm15 subpd %xmm15, %xmm0 movddup -16 * SIZE(BO), %xmm15 mulpd %xmm15, %xmm0 #endif 
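/* Descriptive note (added): the triangular solve for this 2x4 tile is
   finished here -- the packed diagonal entries appear to be stored
   pre-inverted, so only mulpd/subpd are needed.  The block below first
   steps CO1/CO2 back in the LN case, stores the solved tile to C
   (columns CO1, CO1+LDC, CO2, CO2+LDC), and copies it back into the
   packed buffer: BO for LN/LT, AO for RN/RT. */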
#ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif leaq (LDC, LDC, 2), %rax #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 1 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 1 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO2) movsd %xmm3, 1 * SIZE(CO2) movhps %xmm2, 0 * SIZE(CO2, LDC, 1) movhps %xmm3, 1 * SIZE(CO2, LDC, 1) #else movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 1 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO2) movhps %xmm2, 1 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO2, LDC, 1) movhps %xmm3, 1 * SIZE(CO2, LDC, 1) #endif #if defined(LN) || defined(LT) movaps %xmm0, -16 * SIZE(BO) movaps %xmm2, -14 * SIZE(BO) movaps %xmm1, -12 * SIZE(BO) movaps %xmm3, -10 * SIZE(BO) #else movaps %xmm0, -16 * SIZE(AO) movaps %xmm1, -14 * SIZE(AO) movaps %xmm2, -12 * SIZE(AO) movaps %xmm3, -10 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L31 ALIGN_4 .L40: testq $1, M BRANCH jle .L49 ALIGN_4 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -16 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L45 ALIGN_3 .L42: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps -8 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -6 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -13 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -4 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -2 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps 0 * SIZE(BO), %xmm1 subq $ -4 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L42 ALIGN_3 .L45: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L46 ALIGN_4 .L48: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #endif addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(LT) movaps -16 * SIZE(BO), %xmm0 movaps -14 * SIZE(BO), %xmm1 #else movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 #endif subpd %xmm8, %xmm0 subpd %xmm9, %xmm1 #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 #endif #if defined(RN) || defined(RT) pshufd $0xe, %xmm1, %xmm3 movaps %xmm1, %xmm2 pshufd $0xe, %xmm0, 
%xmm1 #endif #ifdef RN movsd -16 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 movsd -15 * SIZE(BO), %xmm9 mulsd %xmm0, %xmm9 subsd %xmm9, %xmm1 movsd -14 * SIZE(BO), %xmm10 mulsd %xmm0, %xmm10 subsd %xmm10, %xmm2 movsd -13 * SIZE(BO), %xmm11 mulsd %xmm0, %xmm11 subsd %xmm11, %xmm3 movsd -11 * SIZE(BO), %xmm9 mulsd %xmm9, %xmm1 movsd -10 * SIZE(BO), %xmm10 mulsd %xmm1, %xmm10 subsd %xmm10, %xmm2 movsd -9 * SIZE(BO), %xmm11 mulsd %xmm1, %xmm11 subsd %xmm11, %xmm3 movsd -6 * SIZE(BO), %xmm10 mulsd %xmm10, %xmm2 movsd -5 * SIZE(BO), %xmm11 mulsd %xmm2, %xmm11 subsd %xmm11, %xmm3 movsd -1 * SIZE(BO), %xmm11 mulsd %xmm11, %xmm3 #endif #ifdef RT movsd -1 * SIZE(BO), %xmm12 mulsd %xmm12, %xmm3 movsd -2 * SIZE(BO), %xmm13 mulsd %xmm3, %xmm13 subsd %xmm13, %xmm2 movsd -3 * SIZE(BO), %xmm14 mulsd %xmm3, %xmm14 subsd %xmm14, %xmm1 movsd -4 * SIZE(BO), %xmm15 mulsd %xmm3, %xmm15 subsd %xmm15, %xmm0 movsd -6 * SIZE(BO), %xmm13 mulsd %xmm13, %xmm2 movsd -7 * SIZE(BO), %xmm14 mulsd %xmm2, %xmm14 subsd %xmm14, %xmm1 movsd -8 * SIZE(BO), %xmm15 mulsd %xmm2, %xmm15 subsd %xmm15, %xmm0 movsd -11 * SIZE(BO), %xmm14 mulsd %xmm14, %xmm1 movsd -12 * SIZE(BO), %xmm15 mulsd %xmm1, %xmm15 subsd %xmm15, %xmm0 movsd -16 * SIZE(BO), %xmm15 mulsd %xmm15, %xmm0 #endif #if defined(RN) || defined(RT) unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm1 unpcklpd %xmm3, %xmm1 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO1, LDC, 1) movsd %xmm1, 0 * SIZE(CO2) movhps %xmm1, 0 * SIZE(CO2, LDC, 1) #if defined(LN) || defined(LT) movaps %xmm0, -16 * SIZE(BO) movaps %xmm1, -14 * SIZE(BO) #else movaps %xmm0, -16 * SIZE(AO) movaps %xmm1, -14 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L49: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif ALIGN_4 .L70: movq N, J sarq $3, J NOBRANCH jle .L999 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $3 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 8), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 4), CO2 #ifndef RT leaq (C, LDC, 8), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 3, %rax leaq (B, %rax), BB movq M, I sarq $1, I NOBRANCH jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 8), BO #else movq B, BO #endif prefetcht0 -16 * SIZE(BB) subq $-8 * SIZE, BB xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 leaq (LDC, LDC, 2), %rax xorps %xmm8, %xmm8 prefetcht0 1 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 2 * SIZE(CO1, LDC, 1) xorps %xmm10, %xmm10 prefetcht0 1 * SIZE(CO1, LDC, 2) xorps %xmm11, %xmm11 prefetcht0 2 * SIZE(CO1, %rax, 1) xorps %xmm12, %xmm12 prefetcht0 1 * SIZE(CO2) xorps %xmm13, %xmm13 prefetcht0 2 * SIZE(CO2, LDC, 1) xorps %xmm14, %xmm14 prefetcht0 1 * SIZE(CO2, LDC, 2) xorps %xmm15, %xmm15 prefetcht0 2 * SIZE(CO2, %rax, 1) 
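/* Descriptive note (added): inner-loop trip count for the 2x8
   micro-kernel -- KK iterations for LT/RN, K-KK otherwise.  sarq $2
   drives the 4x-unrolled main loop at .L12; the leftover (k & 3)
   iterations are handled by the .L16 remainder loop. */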
#if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm6 addpd %xmm2, %xmm13 pshufd $0x4e, %xmm6, %xmm2 mulpd %xmm0, %xmm6 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm14 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm6, %xmm8 movaps -12 * SIZE(BO), %xmm6 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm6, %xmm2 mulpd %xmm0, %xmm6 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm6, %xmm12 movaps -8 * SIZE(BO), %xmm1 addpd %xmm2, %xmm13 movaps -14 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 mulpd %xmm5, %xmm2 addpd %xmm3, %xmm14 movaps -6 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm5, %xmm3 mulpd %xmm5, %xmm4 addpd %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 mulpd %xmm5, %xmm2 addpd %xmm3, %xmm10 movaps -2 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm5, %xmm3 mulpd %xmm5, %xmm4 addpd %xmm1, %xmm12 movaps 0 * SIZE(BO), %xmm6 addpd %xmm2, %xmm13 pshufd $0x4e, %xmm6, %xmm2 mulpd %xmm0, %xmm6 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm14 movaps 2 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm6, %xmm8 movaps 4 * SIZE(BO), %xmm6 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm6, %xmm2 mulpd %xmm0, %xmm6 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps 6 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm6, %xmm12 movaps 8 * SIZE(BO), %xmm1 addpd %xmm2, %xmm13 movaps -10 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 mulpd %xmm5, %xmm2 addpd %xmm3, %xmm14 movaps 10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm5, %xmm3 mulpd %xmm5, %xmm4 addpd %xmm1, %xmm8 movaps 12 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 mulpd %xmm5, %xmm2 addpd %xmm3, %xmm10 movaps 14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm5, %xmm3 mulpd %xmm5, %xmm4 addq $32 * SIZE, BO subq $-8 * SIZE, AO decq %rax BRANCH jg .L12 ALIGN_3 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addpd %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm13 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm14 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $8, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 8), BO #endif addpd %xmm1, %xmm12 addpd %xmm2, %xmm13 addpd %xmm3, %xmm14 addpd %xmm4, %xmm15 #if defined(LN) 
|| defined(LT) movaps %xmm8, %xmm0 shufpd $0, %xmm9, %xmm8 shufpd $3, %xmm0, %xmm9 movaps %xmm10, %xmm0 shufpd $0, %xmm11, %xmm10 shufpd $3, %xmm0, %xmm11 movaps %xmm12, %xmm0 shufpd $0, %xmm13, %xmm12 shufpd $3, %xmm0, %xmm13 movaps %xmm14, %xmm0 shufpd $0, %xmm15, %xmm14 shufpd $3, %xmm0, %xmm15 movaps -16 * SIZE(BO), %xmm0 movaps -14 * SIZE(BO), %xmm2 movaps -12 * SIZE(BO), %xmm4 movaps -10 * SIZE(BO), %xmm6 movaps -8 * SIZE(BO), %xmm1 movaps -6 * SIZE(BO), %xmm3 movaps -4 * SIZE(BO), %xmm5 movaps -2 * SIZE(BO), %xmm7 #else movaps %xmm8, %xmm0 shufpd $2, %xmm9, %xmm8 shufpd $2, %xmm0, %xmm9 movaps %xmm10, %xmm0 shufpd $2, %xmm11, %xmm10 shufpd $2, %xmm0, %xmm11 movaps %xmm12, %xmm0 shufpd $2, %xmm13, %xmm12 shufpd $2, %xmm0, %xmm13 movaps %xmm14, %xmm0 shufpd $2, %xmm15, %xmm14 shufpd $2, %xmm0, %xmm15 movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 movaps -12 * SIZE(AO), %xmm2 movaps -10 * SIZE(AO), %xmm3 movaps -8 * SIZE(AO), %xmm4 movaps -6 * SIZE(AO), %xmm5 movaps -4 * SIZE(AO), %xmm6 movaps -2 * SIZE(AO), %xmm7 #endif subpd %xmm8, %xmm0 subpd %xmm9, %xmm1 subpd %xmm10, %xmm2 subpd %xmm11, %xmm3 subpd %xmm12, %xmm4 subpd %xmm13, %xmm5 subpd %xmm14, %xmm6 subpd %xmm15, %xmm7 #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 movddup -14 * SIZE(AO), %xmm12 movaps %xmm12, %xmm13 movaps %xmm12, %xmm14 movaps %xmm12, %xmm15 mulpd %xmm1, %xmm12 mulpd %xmm3, %xmm13 mulpd %xmm5, %xmm14 mulpd %xmm7, %xmm15 subpd %xmm12, %xmm0 subpd %xmm13, %xmm2 subpd %xmm14, %xmm4 subpd %xmm15, %xmm6 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm6 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm6 movddup -15 * SIZE(AO), %xmm12 movaps %xmm12, %xmm13 movaps %xmm12, %xmm14 movaps %xmm12, %xmm15 mulpd %xmm0, %xmm12 mulpd %xmm2, %xmm13 mulpd %xmm4, %xmm14 mulpd %xmm6, %xmm15 subpd %xmm12, %xmm1 subpd %xmm13, %xmm3 subpd %xmm14, %xmm5 subpd %xmm15, %xmm7 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm1 movddup -14 * SIZE(BO), %xmm10 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm2 movddup -13 * SIZE(BO), %xmm11 mulpd %xmm0, %xmm11 subpd %xmm11, %xmm3 movddup -12 * SIZE(BO), %xmm12 mulpd %xmm0, %xmm12 subpd %xmm12, %xmm4 movddup -11 * SIZE(BO), %xmm13 mulpd %xmm0, %xmm13 subpd %xmm13, %xmm5 movddup -10 * SIZE(BO), %xmm14 mulpd %xmm0, %xmm14 subpd %xmm14, %xmm6 movddup -9 * SIZE(BO), %xmm15 mulpd %xmm0, %xmm15 subpd %xmm15, %xmm7 movddup -7 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm1 movddup -6 * SIZE(BO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm2 movddup -5 * SIZE(BO), %xmm11 mulpd %xmm1, %xmm11 subpd %xmm11, %xmm3 movddup -4 * SIZE(BO), %xmm12 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm4 movddup -3 * SIZE(BO), %xmm13 mulpd %xmm1, %xmm13 subpd %xmm13, %xmm5 movddup -2 * SIZE(BO), %xmm14 mulpd %xmm1, %xmm14 subpd %xmm14, %xmm6 movddup -1 * SIZE(BO), %xmm15 mulpd %xmm1, %xmm15 subpd %xmm15, %xmm7 movddup 2 * SIZE(BO), %xmm10 mulpd %xmm10, %xmm2 movddup 3 * SIZE(BO), %xmm11 mulpd %xmm2, %xmm11 subpd %xmm11, %xmm3 movddup 4 * SIZE(BO), %xmm12 mulpd %xmm2, %xmm12 subpd %xmm12, %xmm4 movddup 5 * SIZE(BO), %xmm13 mulpd %xmm2, %xmm13 subpd %xmm13, %xmm5 movddup 6 * SIZE(BO), %xmm14 mulpd %xmm2, %xmm14 subpd %xmm14, %xmm6 movddup 7 * SIZE(BO), 
%xmm15 mulpd %xmm2, %xmm15 subpd %xmm15, %xmm7 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm12 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm4 movddup 13 * SIZE(BO), %xmm13 mulpd %xmm3, %xmm13 subpd %xmm13, %xmm5 movddup 14 * SIZE(BO), %xmm14 mulpd %xmm3, %xmm14 subpd %xmm14, %xmm6 movddup 15 * SIZE(BO), %xmm15 mulpd %xmm3, %xmm15 subpd %xmm15, %xmm7 movddup 20 * SIZE(BO), %xmm12 mulpd %xmm12, %xmm4 movddup 21 * SIZE(BO), %xmm13 mulpd %xmm4, %xmm13 subpd %xmm13, %xmm5 movddup 22 * SIZE(BO), %xmm14 mulpd %xmm4, %xmm14 subpd %xmm14, %xmm6 movddup 23 * SIZE(BO), %xmm15 mulpd %xmm4, %xmm15 subpd %xmm15, %xmm7 movddup 29 * SIZE(BO), %xmm13 mulpd %xmm13, %xmm5 movddup 30 * SIZE(BO), %xmm14 mulpd %xmm5, %xmm14 subpd %xmm14, %xmm6 movddup 31 * SIZE(BO), %xmm15 mulpd %xmm5, %xmm15 subpd %xmm15, %xmm7 movddup 38 * SIZE(BO), %xmm14 mulpd %xmm14, %xmm6 movddup 39 * SIZE(BO), %xmm15 mulpd %xmm6, %xmm15 subpd %xmm15, %xmm7 movddup 47 * SIZE(BO), %xmm15 mulpd %xmm15, %xmm7 #endif #ifdef RT movddup 47 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm7 movddup 46 * SIZE(BO), %xmm9 mulpd %xmm7, %xmm9 subpd %xmm9, %xmm6 movddup 45 * SIZE(BO), %xmm10 mulpd %xmm7, %xmm10 subpd %xmm10, %xmm5 movddup 44 * SIZE(BO), %xmm11 mulpd %xmm7, %xmm11 subpd %xmm11, %xmm4 movddup 43 * SIZE(BO), %xmm12 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm3 movddup 42 * SIZE(BO), %xmm13 mulpd %xmm7, %xmm13 subpd %xmm13, %xmm2 movddup 41 * SIZE(BO), %xmm14 mulpd %xmm7, %xmm14 subpd %xmm14, %xmm1 movddup 40 * SIZE(BO), %xmm15 mulpd %xmm7, %xmm15 subpd %xmm15, %xmm0 movddup 38 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm6 movddup 37 * SIZE(BO), %xmm10 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm5 movddup 36 * SIZE(BO), %xmm11 mulpd %xmm6, %xmm11 subpd %xmm11, %xmm4 movddup 35 * SIZE(BO), %xmm12 mulpd %xmm6, %xmm12 subpd %xmm12, %xmm3 movddup 34 * SIZE(BO), %xmm13 mulpd %xmm6, %xmm13 subpd %xmm13, %xmm2 movddup 33 * SIZE(BO), %xmm14 mulpd %xmm6, %xmm14 subpd %xmm14, %xmm1 movddup 32 * SIZE(BO), %xmm15 mulpd %xmm6, %xmm15 subpd %xmm15, %xmm0 movddup 29 * SIZE(BO), %xmm10 mulpd %xmm10, %xmm5 movddup 28 * SIZE(BO), %xmm11 mulpd %xmm5, %xmm11 subpd %xmm11, %xmm4 movddup 27 * SIZE(BO), %xmm12 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm3 movddup 26 * SIZE(BO), %xmm13 mulpd %xmm5, %xmm13 subpd %xmm13, %xmm2 movddup 25 * SIZE(BO), %xmm14 mulpd %xmm5, %xmm14 subpd %xmm14, %xmm1 movddup 24 * SIZE(BO), %xmm15 mulpd %xmm5, %xmm15 subpd %xmm15, %xmm0 movddup 20 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm4 movddup 19 * SIZE(BO), %xmm12 mulpd %xmm4, %xmm12 subpd %xmm12, %xmm3 movddup 18 * SIZE(BO), %xmm13 mulpd %xmm4, %xmm13 subpd %xmm13, %xmm2 movddup 17 * SIZE(BO), %xmm14 mulpd %xmm4, %xmm14 subpd %xmm14, %xmm1 movddup 16 * SIZE(BO), %xmm15 mulpd %xmm4, %xmm15 subpd %xmm15, %xmm0 movddup 11 * SIZE(BO), %xmm12 mulpd %xmm12, %xmm3 movddup 10 * SIZE(BO), %xmm13 mulpd %xmm3, %xmm13 subpd %xmm13, %xmm2 movddup 9 * SIZE(BO), %xmm14 mulpd %xmm3, %xmm14 subpd %xmm14, %xmm1 movddup 8 * SIZE(BO), %xmm15 mulpd %xmm3, %xmm15 subpd %xmm15, %xmm0 movddup 2 * SIZE(BO), %xmm13 mulpd %xmm13, %xmm2 movddup 1 * SIZE(BO), %xmm14 mulpd %xmm2, %xmm14 subpd %xmm14, %xmm1 movddup 0 * SIZE(BO), %xmm15 mulpd %xmm2, %xmm15 subpd %xmm15, %xmm0 movddup -7 * SIZE(BO), %xmm14 mulpd %xmm14, %xmm1 movddup -8 * SIZE(BO), %xmm15 mulpd %xmm1, %xmm15 subpd %xmm15, %xmm0 movddup -16 * SIZE(BO), %xmm15 mulpd %xmm15, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif leaq (LDC, LDC, 2), %rax #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 1 * SIZE(CO1) movhps %xmm0, 0 * 
SIZE(CO1, LDC, 1) movhps %xmm1, 1 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movsd %xmm3, 1 * SIZE(CO1, LDC, 2) movhps %xmm2, 0 * SIZE(CO1, %rax, 1) movhps %xmm3, 1 * SIZE(CO1, %rax, 1) movsd %xmm4, 0 * SIZE(CO2) movsd %xmm5, 1 * SIZE(CO2) movhps %xmm4, 0 * SIZE(CO2, LDC, 1) movhps %xmm5, 1 * SIZE(CO2, LDC, 1) movsd %xmm6, 0 * SIZE(CO2, LDC, 2) movsd %xmm7, 1 * SIZE(CO2, LDC, 2) movhps %xmm6, 0 * SIZE(CO2, %rax, 1) movhps %xmm7, 1 * SIZE(CO2, %rax, 1) #else movups %xmm0, 0 * SIZE(CO1) movups %xmm1, 0 * SIZE(CO1, LDC, 1) movups %xmm2, 0 * SIZE(CO1, LDC, 2) movups %xmm3, 0 * SIZE(CO1, %rax, 1) movups %xmm4, 0 * SIZE(CO2) movups %xmm5, 0 * SIZE(CO2, LDC, 1) movups %xmm6, 0 * SIZE(CO2, LDC, 2) movups %xmm7, 0 * SIZE(CO2, %rax, 1) #endif #if defined(LN) || defined(LT) movaps %xmm0, -16 * SIZE(BO) movaps %xmm2, -14 * SIZE(BO) movaps %xmm4, -12 * SIZE(BO) movaps %xmm6, -10 * SIZE(BO) movaps %xmm1, -8 * SIZE(BO) movaps %xmm3, -6 * SIZE(BO) movaps %xmm5, -4 * SIZE(BO) movaps %xmm7, -2 * SIZE(BO) #else movaps %xmm0, -16 * SIZE(AO) movaps %xmm1, -14 * SIZE(AO) movaps %xmm2, -12 * SIZE(AO) movaps %xmm3, -10 * SIZE(AO) movaps %xmm4, -8 * SIZE(AO) movaps %xmm5 , -6 * SIZE(AO) movaps %xmm6, -4 * SIZE(AO) movaps %xmm7, -2 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L11 ALIGN_4 .L20: testq $1, M BRANCH jle .L29 ALIGN_4 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 8), BO #else movq B, BO #endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -16 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_3 .L22: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps -8 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -6 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps -4 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -2 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps 0 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps 2 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps 4 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps 6 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -13 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps 8 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps 10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps 12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps 14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps 16 * SIZE(BO), %xmm1 subq $ -4 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax BRANCH jg .L22 ALIGN_3 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_3 
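/* Descriptive note (added): .L26 is the scalar remainder loop
   (k & 3 iterations) of the 1x8 tile -- one broadcast A element
   (movddup) times four packed B pairs per pass, accumulating into
   xmm8-xmm11 while AO advances by 1 and BO by 8 elements. */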
.L26: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps -8 * SIZE(BO), %xmm1 addq $1 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $8, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 8), BO #endif #if defined(LN) || defined(LT) movaps -16 * SIZE(BO), %xmm0 movaps -14 * SIZE(BO), %xmm1 movaps -12 * SIZE(BO), %xmm2 movaps -10 * SIZE(BO), %xmm3 #else movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 movaps -12 * SIZE(AO), %xmm2 movaps -10 * SIZE(AO), %xmm3 #endif subpd %xmm8, %xmm0 subpd %xmm9, %xmm1 subpd %xmm10, %xmm2 subpd %xmm11, %xmm3 #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 #endif #if defined(RN) || defined(RT) pshufd $0xe, %xmm3, %xmm7 movaps %xmm3, %xmm6 pshufd $0xe, %xmm2, %xmm5 movaps %xmm2, %xmm4 pshufd $0xe, %xmm1, %xmm3 movaps %xmm1, %xmm2 pshufd $0xe, %xmm0, %xmm1 #endif #ifdef RN movsd -16 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 movsd -15 * SIZE(BO), %xmm9 mulsd %xmm0, %xmm9 subsd %xmm9, %xmm1 movsd -14 * SIZE(BO), %xmm10 mulsd %xmm0, %xmm10 subsd %xmm10, %xmm2 movsd -13 * SIZE(BO), %xmm11 mulsd %xmm0, %xmm11 subsd %xmm11, %xmm3 movsd -12 * SIZE(BO), %xmm12 mulsd %xmm0, %xmm12 subsd %xmm12, %xmm4 movsd -11 * SIZE(BO), %xmm13 mulsd %xmm0, %xmm13 subsd %xmm13, %xmm5 movsd -10 * SIZE(BO), %xmm14 mulsd %xmm0, %xmm14 subsd %xmm14, %xmm6 movsd -9 * SIZE(BO), %xmm15 mulsd %xmm0, %xmm15 subsd %xmm15, %xmm7 movsd -7 * SIZE(BO), %xmm9 mulsd %xmm9, %xmm1 movsd -6 * SIZE(BO), %xmm10 mulsd %xmm1, %xmm10 subsd %xmm10, %xmm2 movsd -5 * SIZE(BO), %xmm11 mulsd %xmm1, %xmm11 subsd %xmm11, %xmm3 movsd -4 * SIZE(BO), %xmm12 mulsd %xmm1, %xmm12 subsd %xmm12, %xmm4 movsd -3 * SIZE(BO), %xmm13 mulsd %xmm1, %xmm13 subsd %xmm13, %xmm5 movsd -2 * SIZE(BO), %xmm14 mulsd %xmm1, %xmm14 subsd %xmm14, %xmm6 movsd -1 * SIZE(BO), %xmm15 mulsd %xmm1, %xmm15 subsd %xmm15, %xmm7 movsd 2 * SIZE(BO), %xmm10 mulsd %xmm10, %xmm2 movsd 3 * SIZE(BO), %xmm11 mulsd %xmm2, %xmm11 subsd %xmm11, %xmm3 movsd 4 * SIZE(BO), %xmm12 mulsd %xmm2, %xmm12 subsd %xmm12, %xmm4 movsd 5 * SIZE(BO), %xmm13 mulsd %xmm2, %xmm13 subsd %xmm13, %xmm5 movsd 6 * SIZE(BO), %xmm14 mulsd %xmm2, %xmm14 subsd %xmm14, %xmm6 movsd 7 * SIZE(BO), %xmm15 mulsd %xmm2, %xmm15 subsd %xmm15, %xmm7 movsd 11 * SIZE(BO), %xmm11 mulsd %xmm11, %xmm3 movsd 12 * SIZE(BO), %xmm12 mulsd %xmm3, %xmm12 subsd %xmm12, %xmm4 movsd 13 * SIZE(BO), %xmm13 mulsd %xmm3, %xmm13 subsd %xmm13, %xmm5 movsd 14 * SIZE(BO), %xmm14 mulsd %xmm3, %xmm14 subsd %xmm14, %xmm6 movsd 15 * SIZE(BO), %xmm15 mulsd %xmm3, %xmm15 subsd %xmm15, %xmm7 movsd 20 * SIZE(BO), %xmm12 mulsd %xmm12, %xmm4 movsd 21 * SIZE(BO), %xmm13 mulsd %xmm4, %xmm13 subsd %xmm13, %xmm5 movsd 22 * SIZE(BO), %xmm14 mulsd %xmm4, %xmm14 subsd %xmm14, %xmm6 movsd 23 * SIZE(BO), %xmm15 mulsd %xmm4, %xmm15 subsd %xmm15, %xmm7 movsd 29 * SIZE(BO), %xmm13 mulsd %xmm13, %xmm5 movsd 30 * SIZE(BO), %xmm14 mulsd %xmm5, %xmm14 subsd %xmm14, %xmm6 movsd 31 * SIZE(BO), %xmm15 mulsd %xmm5, %xmm15 subsd %xmm15, %xmm7 movsd 38 * SIZE(BO), %xmm14 mulsd %xmm14, %xmm6 movsd 39 * SIZE(BO), %xmm15 mulsd %xmm6, %xmm15 subsd %xmm15, %xmm7 movsd 47 * SIZE(BO), %xmm15 mulsd 
%xmm15, %xmm7 #endif #ifdef RT movsd 47 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm7 movsd 46 * SIZE(BO), %xmm9 mulsd %xmm7, %xmm9 subsd %xmm9, %xmm6 movsd 45 * SIZE(BO), %xmm10 mulsd %xmm7, %xmm10 subsd %xmm10, %xmm5 movsd 44 * SIZE(BO), %xmm11 mulsd %xmm7, %xmm11 subsd %xmm11, %xmm4 movsd 43 * SIZE(BO), %xmm12 mulsd %xmm7, %xmm12 subsd %xmm12, %xmm3 movsd 42 * SIZE(BO), %xmm13 mulsd %xmm7, %xmm13 subsd %xmm13, %xmm2 movsd 41 * SIZE(BO), %xmm14 mulsd %xmm7, %xmm14 subsd %xmm14, %xmm1 movsd 40 * SIZE(BO), %xmm15 mulsd %xmm7, %xmm15 subsd %xmm15, %xmm0 movsd 38 * SIZE(BO), %xmm9 mulsd %xmm9, %xmm6 movsd 37 * SIZE(BO), %xmm10 mulsd %xmm6, %xmm10 subsd %xmm10, %xmm5 movsd 36 * SIZE(BO), %xmm11 mulsd %xmm6, %xmm11 subsd %xmm11, %xmm4 movsd 35 * SIZE(BO), %xmm12 mulsd %xmm6, %xmm12 subsd %xmm12, %xmm3 movsd 34 * SIZE(BO), %xmm13 mulsd %xmm6, %xmm13 subsd %xmm13, %xmm2 movsd 33 * SIZE(BO), %xmm14 mulsd %xmm6, %xmm14 subsd %xmm14, %xmm1 movsd 32 * SIZE(BO), %xmm15 mulsd %xmm6, %xmm15 subsd %xmm15, %xmm0 movsd 29 * SIZE(BO), %xmm10 mulsd %xmm10, %xmm5 movsd 28 * SIZE(BO), %xmm11 mulsd %xmm5, %xmm11 subsd %xmm11, %xmm4 movsd 27 * SIZE(BO), %xmm12 mulsd %xmm5, %xmm12 subsd %xmm12, %xmm3 movsd 26 * SIZE(BO), %xmm13 mulsd %xmm5, %xmm13 subsd %xmm13, %xmm2 movsd 25 * SIZE(BO), %xmm14 mulsd %xmm5, %xmm14 subsd %xmm14, %xmm1 movsd 24 * SIZE(BO), %xmm15 mulsd %xmm5, %xmm15 subsd %xmm15, %xmm0 movsd 20 * SIZE(BO), %xmm11 mulsd %xmm11, %xmm4 movsd 19 * SIZE(BO), %xmm12 mulsd %xmm4, %xmm12 subsd %xmm12, %xmm3 movsd 18 * SIZE(BO), %xmm13 mulsd %xmm4, %xmm13 subsd %xmm13, %xmm2 movsd 17 * SIZE(BO), %xmm14 mulsd %xmm4, %xmm14 subsd %xmm14, %xmm1 movsd 16 * SIZE(BO), %xmm15 mulsd %xmm4, %xmm15 subsd %xmm15, %xmm0 movsd 11 * SIZE(BO), %xmm12 mulsd %xmm12, %xmm3 movsd 10 * SIZE(BO), %xmm13 mulsd %xmm3, %xmm13 subsd %xmm13, %xmm2 movsd 9 * SIZE(BO), %xmm14 mulsd %xmm3, %xmm14 subsd %xmm14, %xmm1 movsd 8 * SIZE(BO), %xmm15 mulsd %xmm3, %xmm15 subsd %xmm15, %xmm0 movsd 2 * SIZE(BO), %xmm13 mulsd %xmm13, %xmm2 movsd 1 * SIZE(BO), %xmm14 mulsd %xmm2, %xmm14 subsd %xmm14, %xmm1 movsd 0 * SIZE(BO), %xmm15 mulsd %xmm2, %xmm15 subsd %xmm15, %xmm0 movsd -7 * SIZE(BO), %xmm14 mulsd %xmm14, %xmm1 movsd -8 * SIZE(BO), %xmm15 mulsd %xmm1, %xmm15 subsd %xmm15, %xmm0 movsd -16 * SIZE(BO), %xmm15 mulsd %xmm15, %xmm0 #endif #if defined(RN) || defined(RT) unpcklpd %xmm1, %xmm0 movaps %xmm2, %xmm1 unpcklpd %xmm3, %xmm1 movaps %xmm4, %xmm2 unpcklpd %xmm5, %xmm2 movaps %xmm6, %xmm3 unpcklpd %xmm7, %xmm3 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif leaq (LDC, LDC, 2), %rax movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO1, LDC, 1) movsd %xmm1, 0 * SIZE(CO1, LDC, 2) movhps %xmm1, 0 * SIZE(CO1, %rax, 1) movsd %xmm2, 0 * SIZE(CO2) movhps %xmm2, 0 * SIZE(CO2, LDC, 1) movsd %xmm3, 0 * SIZE(CO2, LDC, 2) movhps %xmm3, 0 * SIZE(CO2, %rax, 1) #if defined(LN) || defined(LT) movaps %xmm0, -16 * SIZE(BO) movaps %xmm1, -14 * SIZE(BO) movaps %xmm2, -12 * SIZE(BO) movaps %xmm3, -10 * SIZE(BO) #else movaps %xmm0, -16 * SIZE(AO) movaps %xmm1, -14 * SIZE(AO) movaps %xmm2, -12 * SIZE(AO) movaps %xmm3, -10 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L29: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 8), B #endif #if defined(LT) || 
defined(RN) movq BO, B #endif #ifdef RN addq $8, KK #endif #ifdef RT subq $8, KK #endif subq $1, J BRANCH jg .L01 ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_RT_4x2_atom.S000066400000000000000000001103441313527062700221600ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %r13 #define BO %r14 #define CO1 %r15 #define CO2 %rbx #define KK %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 8 + 3) PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif movq OLD_LDC, LDC movq OLD_OFFSET, KK movq KK, OFFSET leaq (, LDC, SIZE), LDC #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif testq $1, N je .L40 ALIGN_4 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I jle .L50 ALIGN_4 .L41: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm9, %xmm9 movsd 1 * SIZE(AO), %xmm1 xorps %xmm11, %xmm11 movsd 2 * SIZE(AO), %xmm2 xorps %xmm13, %xmm13 movsd 3 * SIZE(AO), %xmm3 xorps %xmm15, %xmm15 movsd 0 * SIZE(BO), %xmm4 xorps %xmm8, %xmm8 movsd 1 * SIZE(BO), %xmm5 xorps %xmm10, %xmm10 prefetcht0 3 * SIZE(CO1) xorps %xmm12, %xmm12 xorps %xmm14, %xmm14 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L45 ALIGN_4 .L42: addsd %xmm9, %xmm8 movsd 4 * SIZE(AO), %xmm9 mulsd %xmm4, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm11, %xmm10 movsd 5 * SIZE(AO), %xmm11 mulsd %xmm4, %xmm1 addsd %xmm13, %xmm12 movsd 6 * SIZE(AO), %xmm13 mulsd %xmm4, %xmm2 addsd %xmm15, %xmm14 movsd 7 * SIZE(AO), %xmm15 mulsd %xmm4, %xmm3 movsd 2 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 8 * SIZE(AO), %xmm0 mulsd %xmm5, %xmm9 addsd %xmm1, %xmm10 movsd 9 * SIZE(AO), %xmm1 mulsd %xmm5, %xmm11 addsd %xmm2, %xmm12 movsd 10 * SIZE(AO), %xmm2 mulsd %xmm5, %xmm13 addsd %xmm3, %xmm14 movsd 11 * SIZE(AO), %xmm3 mulsd %xmm5, %xmm15 movsd 3 * SIZE(BO), %xmm5 addsd %xmm9, %xmm8 movsd 12 * SIZE(AO), %xmm9 mulsd 
%xmm4, %xmm0 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) addsd %xmm11, %xmm10 movsd 13 * SIZE(AO), %xmm11 mulsd %xmm4, %xmm1 addsd %xmm13, %xmm12 movsd 14 * SIZE(AO), %xmm13 mulsd %xmm4, %xmm2 addsd %xmm15, %xmm14 movsd 15 * SIZE(AO), %xmm15 mulsd %xmm4, %xmm3 movsd 4 * SIZE(BO), %xmm4 subq $-16 * SIZE, AO addsd %xmm0, %xmm8 movsd 0 * SIZE(AO), %xmm0 mulsd %xmm5, %xmm9 addsd %xmm1, %xmm10 movsd 1 * SIZE(AO), %xmm1 mulsd %xmm5, %xmm11 addq $ 4 * SIZE, BO addsd %xmm2, %xmm12 movsd 2 * SIZE(AO), %xmm2 mulsd %xmm5, %xmm13 decq %rax addsd %xmm3, %xmm14 movsd 3 * SIZE(AO), %xmm3 mulsd %xmm5, %xmm15 movsd 1 * SIZE(BO), %xmm5 jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif addsd %xmm9, %xmm8 addsd %xmm11, %xmm10 addsd %xmm13, %xmm12 addsd %xmm15, %xmm14 andq $3, %rax BRANCH BRANCH je .L49 ALIGN_4 .L46: mulsd %xmm4, %xmm0 mulsd %xmm4, %xmm1 mulsd %xmm4, %xmm2 mulsd %xmm4, %xmm3 movsd 1 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 addsd %xmm1, %xmm10 movsd 5 * SIZE(AO), %xmm1 addsd %xmm2, %xmm12 movsd 6 * SIZE(AO), %xmm2 addsd %xmm3, %xmm14 movsd 7 * SIZE(AO), %xmm3 addq $4 * SIZE, AO addq $1 * SIZE, BO decq %rax BRANCH jg .L46 ALIGN_4 .L49: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 movsd 1 * SIZE(BO), %xmm2 movsd 2 * SIZE(BO), %xmm4 movsd 3 * SIZE(BO), %xmm6 subsd %xmm8, %xmm0 subsd %xmm10, %xmm2 subsd %xmm12, %xmm4 subsd %xmm14, %xmm6 #else movsd 0 * SIZE(AO), %xmm0 movsd 1 * SIZE(AO), %xmm2 movsd 2 * SIZE(AO), %xmm4 movsd 3 * SIZE(AO), %xmm6 subsd %xmm8, %xmm0 subsd %xmm10, %xmm2 subsd %xmm12, %xmm4 subsd %xmm14, %xmm6 #endif #ifdef LN movsd 15 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm6 movsd 14 * SIZE(AO), %xmm9 mulsd %xmm6, %xmm9 movsd 13 * SIZE(AO), %xmm11 subsd %xmm9, %xmm4 movsd 12 * SIZE(AO), %xmm13 mulsd %xmm6, %xmm11 movsd 10 * SIZE(AO), %xmm8 subsd %xmm11, %xmm2 movsd 9 * SIZE(AO), %xmm9 mulsd %xmm6, %xmm13 movsd 8 * SIZE(AO), %xmm11 subsd %xmm13, %xmm0 mulsd %xmm8, %xmm4 movsd 5 * SIZE(AO), %xmm8 mulsd %xmm4, %xmm9 subsd %xmm9, %xmm2 movsd 4 * SIZE(AO), %xmm9 mulsd %xmm4, %xmm11 subsd %xmm11, %xmm0 movsd 0 * SIZE(AO), %xmm11 mulsd %xmm8, %xmm2 mulsd %xmm2, %xmm9 subsd %xmm9, %xmm0 mulsd %xmm11, %xmm0 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 movsd 1 * SIZE(AO), %xmm9 mulsd %xmm0, %xmm9 movsd 2 * SIZE(AO), %xmm11 subsd %xmm9, %xmm2 movsd 3 * SIZE(AO), %xmm13 mulsd %xmm0, %xmm11 movsd 5 * SIZE(AO), %xmm8 subsd %xmm11, %xmm4 movsd 6 * SIZE(AO), %xmm9 mulsd %xmm0, %xmm13 movsd 7 * SIZE(AO), %xmm11 subsd %xmm13, %xmm6 mulsd %xmm8, %xmm2 movsd 10 * SIZE(AO), %xmm8 mulsd %xmm2, %xmm9 subsd %xmm9, %xmm4 movsd 11 * SIZE(AO), %xmm9 mulsd %xmm2, %xmm11 subsd %xmm11, %xmm6 mulsd %xmm8, %xmm4 movsd 15 * SIZE(AO), %xmm8 mulsd %xmm4, %xmm9 subsd %xmm9, %xmm6 mulsd %xmm8, %xmm6 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 mulsd %xmm8, %xmm2 mulsd %xmm8, %xmm4 mulsd %xmm8, %xmm6 #endif #ifdef LN subq $4 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm2, 1 * SIZE(CO1) movsd %xmm4, 2 * SIZE(CO1) movsd %xmm6, 3 * SIZE(CO1) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BO) movsd %xmm2, 1 * SIZE(BO) movsd %xmm4, 2 * SIZE(BO) movsd %xmm6, 3 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) movsd %xmm2, 1 * SIZE(AO) movsd %xmm4, 2 * SIZE(AO) movsd %xmm6, 3 * SIZE(AO) #endif 
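/* Descriptive note (added): tail of the 4x1 block -- advance CO1
   (unless LN, where it was already moved back before the stores),
   rebuild AO/BO from the K-KK remainder for LT/RN, adjust KK by the
   4 entries just solved for LN/LT, step AORIG for RT, then loop to
   the next group of 4 at .L41. */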
#ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L41 ALIGN_4 .L50: testq $2, M je .L60 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd 1 * SIZE(AO), %xmm1 xorps %xmm3, %xmm3 movsd 0 * SIZE(BO), %xmm4 xorps %xmm8, %xmm8 movsd 1 * SIZE(BO), %xmm5 xorps %xmm10, %xmm10 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L55 ALIGN_4 .L52: addsd %xmm2, %xmm8 movsd 2 * SIZE(AO), %xmm2 mulsd %xmm4, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm3, %xmm10 movsd 3 * SIZE(AO), %xmm3 mulsd %xmm4, %xmm1 movsd 2 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm5, %xmm2 addq $8 * SIZE, AO addsd %xmm1, %xmm10 movsd -3 * SIZE(AO), %xmm1 mulsd %xmm5, %xmm3 movsd 3 * SIZE(BO), %xmm5 addsd %xmm2, %xmm8 movsd -2 * SIZE(AO), %xmm2 mulsd %xmm4, %xmm0 addq $4 * SIZE, BO addsd %xmm3, %xmm10 movsd -1 * SIZE(AO), %xmm3 mulsd %xmm4, %xmm1 movsd 0 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 0 * SIZE(AO), %xmm0 mulsd %xmm5, %xmm2 decq %rax addsd %xmm1, %xmm10 movsd 1 * SIZE(AO), %xmm1 mulsd %xmm5, %xmm3 movsd 1 * SIZE(BO), %xmm5 jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif addsd %xmm2, %xmm8 addsd %xmm3, %xmm10 andq $3, %rax BRANCH je .L59 ALIGN_4 .L56: mulsd %xmm4, %xmm0 mulsd %xmm4, %xmm1 movsd 1 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 2 * SIZE(AO), %xmm0 addsd %xmm1, %xmm10 movsd 3 * SIZE(AO), %xmm1 addq $2 * SIZE, AO addq $1 * SIZE, BO decq %rax BRANCH jg .L56 ALIGN_4 .L59: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 movsd 1 * SIZE(BO), %xmm2 subsd %xmm8, %xmm0 subsd %xmm10, %xmm2 #else movsd 0 * SIZE(AO), %xmm0 movsd 1 * SIZE(AO), %xmm2 subsd %xmm8, %xmm0 subsd %xmm10, %xmm2 #endif #ifdef LN movsd 3 * SIZE(AO), %xmm8 movsd 2 * SIZE(AO), %xmm9 movsd 0 * SIZE(AO), %xmm11 mulsd %xmm8, %xmm2 mulsd %xmm2, %xmm9 subsd %xmm9, %xmm0 mulsd %xmm11,%xmm0 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm8 movsd 1 * SIZE(AO), %xmm9 movsd 3 * SIZE(AO), %xmm11 mulsd %xmm8, %xmm0 mulsd %xmm0, %xmm9 subsd %xmm9, %xmm2 mulsd %xmm11,%xmm2 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 mulsd %xmm8, %xmm2 #endif #ifdef LN subq $2 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm2, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BO) movsd %xmm2, 1 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) movsd %xmm2, 1 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L60: testq $1, M je .L69 #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if 
defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm5, %xmm5 movsd 1 * SIZE(AO), %xmm2 xorps %xmm7, %xmm7 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 movsd 1 * SIZE(BO), %xmm3 xorps %xmm9, %xmm9 movsd 2 * SIZE(AO), %xmm4 movsd 3 * SIZE(AO), %xmm6 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L65 ALIGN_4 .L62: addsd %xmm5, %xmm8 movsd 2 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm1 movsd 4 * SIZE(AO), %xmm0 addsd %xmm7, %xmm9 movsd 3 * SIZE(BO), %xmm7 mulsd %xmm2, %xmm3 movsd 5 * SIZE(AO), %xmm2 addsd %xmm1, %xmm8 movsd 4 * SIZE(BO), %xmm1 mulsd %xmm4, %xmm5 movsd 6 * SIZE(AO), %xmm4 addsd %xmm3, %xmm9 movsd 5 * SIZE(BO), %xmm3 mulsd %xmm6, %xmm7 movsd 7 * SIZE(AO), %xmm6 addq $4 * SIZE, AO addq $4 * SIZE, BO decq %rax jne .L62 addsd %xmm5, %xmm8 addsd %xmm7, %xmm9 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH je .L68 ALIGN_4 .L66: movsd 0 * SIZE(AO), %xmm0 movsd 0 * SIZE(BO), %xmm1 mulsd %xmm0, %xmm1 addsd %xmm1, %xmm8 addq $1 * SIZE, AO addq $1 * SIZE, BO decq %rax BRANCH jg .L66 ALIGN_4 .L68: addsd %xmm9, %xmm8 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 subsd %xmm8, %xmm0 #else movsd 0 * SIZE(AO), %xmm0 subsd %xmm8, %xmm0 #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 #endif #ifdef LN subq $1 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L69: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_2 .L40: movq N, J sarq $1, J jle .L999 ALIGN_4 .L10: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 1, %rax leaq (B, %rax), BB #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #else movq B, BO #endif prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd 1 * SIZE(AO), %xmm4 xorps %xmm5, %xmm5 movsd 2 * SIZE(AO), %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movsd 1 * SIZE(BO), %xmm3 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 prefetcht0 3 * SIZE(CO1) xorps %xmm12, %xmm12 xorps 
%xmm13, %xmm13 prefetcht0 3 * SIZE(CO2) xorps %xmm14, %xmm14 xorps %xmm15, %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L15 ALIGN_4 .L12: addsd %xmm2, %xmm13 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm15 PREFETCH (PREFETCHSIZE + 0) * SIZE(BO) movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 2 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addsd %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 7 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 8 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm4, %xmm10 movsd 9 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 4 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 10 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 5 * SIZE(BO), %xmm3 addsd %xmm2, %xmm13 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 11 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 12 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm4, %xmm10 movsd 13 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 6 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 14 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 7 * SIZE(BO), %xmm3 addsd %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 15 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 subq $-16 * SIZE, AO addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 0 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addq $ 8 * SIZE, BO addsd %xmm4, %xmm10 movsd 1 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 decq %rax addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 0 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 2 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 1 * SIZE(BO), %xmm3 jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH je .L19 ALIGN_4 .L16: addsd %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 2 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addq $4 * SIZE, AO addq $2 * SIZE, BO decq %rax BRANCH jg .L16 ALIGN_4 .L19: addsd %xmm2, %xmm13 addsd %xmm7, %xmm14 addsd %xmm6, %xmm15 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO 
leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 movsd 1 * SIZE(BO), %xmm1 movsd 2 * SIZE(BO), %xmm2 movsd 3 * SIZE(BO), %xmm3 movsd 4 * SIZE(BO), %xmm4 movsd 5 * SIZE(BO), %xmm5 movsd 6 * SIZE(BO), %xmm6 movsd 7 * SIZE(BO), %xmm7 subsd %xmm8, %xmm0 subsd %xmm9, %xmm1 subsd %xmm10, %xmm2 subsd %xmm11, %xmm3 subsd %xmm12, %xmm4 subsd %xmm13, %xmm5 subsd %xmm14, %xmm6 subsd %xmm15, %xmm7 #else movsd 0 * SIZE(AO), %xmm0 movsd 1 * SIZE(AO), %xmm2 movsd 2 * SIZE(AO), %xmm4 movsd 3 * SIZE(AO), %xmm6 movsd 4 * SIZE(AO), %xmm1 movsd 5 * SIZE(AO), %xmm3 movsd 6 * SIZE(AO), %xmm5 movsd 7 * SIZE(AO), %xmm7 subsd %xmm8, %xmm0 subsd %xmm10, %xmm2 subsd %xmm12, %xmm4 subsd %xmm14, %xmm6 subsd %xmm9, %xmm1 subsd %xmm11, %xmm3 subsd %xmm13, %xmm5 subsd %xmm15, %xmm7 #endif #ifdef LN movsd 15 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm6 movsd 14 * SIZE(AO), %xmm9 mulsd %xmm8, %xmm7 movsd 13 * SIZE(AO), %xmm11 movaps %xmm9, %xmm10 movsd 12 * SIZE(AO), %xmm13 mulsd %xmm6, %xmm9 movsd 10 * SIZE(AO), %xmm8 mulsd %xmm7, %xmm10 subsd %xmm9, %xmm4 movsd 9 * SIZE(AO), %xmm9 subsd %xmm10, %xmm5 movaps %xmm11, %xmm12 mulsd %xmm6, %xmm11 mulsd %xmm7, %xmm12 subsd %xmm11, %xmm2 movsd 8 * SIZE(AO), %xmm11 subsd %xmm12, %xmm3 movaps %xmm13, %xmm14 mulsd %xmm6, %xmm13 mulsd %xmm7, %xmm14 subsd %xmm13, %xmm0 subsd %xmm14, %xmm1 mulsd %xmm8, %xmm4 mulsd %xmm8, %xmm5 movsd 5 * SIZE(AO), %xmm8 movaps %xmm9, %xmm10 mulsd %xmm4, %xmm9 mulsd %xmm5, %xmm10 subsd %xmm9, %xmm2 movsd 4 * SIZE(AO), %xmm9 subsd %xmm10, %xmm3 movaps %xmm11, %xmm12 mulsd %xmm4, %xmm11 mulsd %xmm5, %xmm12 subsd %xmm11, %xmm0 movsd 0 * SIZE(AO), %xmm11 subsd %xmm12, %xmm1 mulsd %xmm8, %xmm2 mulsd %xmm8, %xmm3 movaps %xmm9, %xmm10 mulsd %xmm2, %xmm9 mulsd %xmm3, %xmm10 subsd %xmm9, %xmm0 subsd %xmm10, %xmm1 mulsd %xmm11, %xmm0 mulsd %xmm11, %xmm1 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 movsd 1 * SIZE(AO), %xmm9 mulsd %xmm8, %xmm1 movsd 2 * SIZE(AO), %xmm11 movaps %xmm9, %xmm10 movsd 3 * SIZE(AO), %xmm13 mulsd %xmm0, %xmm9 movsd 5 * SIZE(AO), %xmm8 mulsd %xmm1, %xmm10 subsd %xmm9, %xmm2 movsd 6 * SIZE(AO), %xmm9 subsd %xmm10, %xmm3 movaps %xmm11, %xmm12 mulsd %xmm0, %xmm11 mulsd %xmm1, %xmm12 subsd %xmm11, %xmm4 movsd 7 * SIZE(AO), %xmm11 subsd %xmm12, %xmm5 movaps %xmm13, %xmm14 mulsd %xmm0, %xmm13 mulsd %xmm1, %xmm14 subsd %xmm13, %xmm6 subsd %xmm14, %xmm7 mulsd %xmm8, %xmm2 mulsd %xmm8, %xmm3 movsd 10 * SIZE(AO), %xmm8 movaps %xmm9, %xmm10 mulsd %xmm2, %xmm9 mulsd %xmm3, %xmm10 subsd %xmm9, %xmm4 movsd 11 * SIZE(AO), %xmm9 subsd %xmm10, %xmm5 movaps %xmm11, %xmm12 mulsd %xmm2, %xmm11 mulsd %xmm3, %xmm12 subsd %xmm11, %xmm6 subsd %xmm12, %xmm7 mulsd %xmm8, %xmm4 mulsd %xmm8, %xmm5 movsd 15 * SIZE(AO), %xmm8 movaps %xmm9, %xmm10 mulsd %xmm4, %xmm9 mulsd %xmm5, %xmm10 subsd %xmm9, %xmm6 subsd %xmm10, %xmm7 mulsd %xmm8, %xmm6 mulsd %xmm8, %xmm7 #endif #ifdef RN movsd 0 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 movsd 1 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm2 movsd 3 * SIZE(BO), %xmm13 mulsd %xmm8, %xmm4 mulsd %xmm8, %xmm6 movaps %xmm9, %xmm10 movaps %xmm9, %xmm11 movaps %xmm9, %xmm12 mulsd %xmm0, %xmm9 mulsd %xmm2, %xmm10 mulsd %xmm4, %xmm11 mulsd %xmm6, %xmm12 subsd %xmm9, %xmm1 subsd %xmm10, %xmm3 subsd %xmm11, %xmm5 subsd %xmm12, %xmm7 mulsd %xmm13, %xmm1 mulsd %xmm13, %xmm3 mulsd %xmm13, %xmm5 mulsd %xmm13, %xmm7 #endif #ifdef RT movsd 3 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm1 movsd 2 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm3 movsd 0 * SIZE(BO), %xmm13 mulsd %xmm8, %xmm5 mulsd %xmm8, %xmm7 movaps %xmm9, %xmm10 movaps 
%xmm9, %xmm11 movaps %xmm9, %xmm12 mulsd %xmm1, %xmm9 mulsd %xmm3, %xmm10 mulsd %xmm5, %xmm11 mulsd %xmm7, %xmm12 subsd %xmm9, %xmm0 subsd %xmm10, %xmm2 subsd %xmm11, %xmm4 subsd %xmm12, %xmm6 mulsd %xmm13, %xmm0 mulsd %xmm13, %xmm2 mulsd %xmm13, %xmm4 mulsd %xmm13, %xmm6 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm2, 1 * SIZE(CO1) movsd %xmm4, 2 * SIZE(CO1) movsd %xmm6, 3 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movsd %xmm3, 1 * SIZE(CO2) movsd %xmm5, 2 * SIZE(CO2) movsd %xmm7, 3 * SIZE(CO2) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BO) movsd %xmm1, 1 * SIZE(BO) movsd %xmm2, 2 * SIZE(BO) movsd %xmm3, 3 * SIZE(BO) movsd %xmm4, 4 * SIZE(BO) movsd %xmm5, 5 * SIZE(BO) movsd %xmm6, 6 * SIZE(BO) movsd %xmm7, 7 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) movsd %xmm2, 1 * SIZE(AO) movsd %xmm4, 2 * SIZE(AO) movsd %xmm6, 3 * SIZE(AO) movsd %xmm1, 4 * SIZE(AO) movsd %xmm3, 5 * SIZE(AO) movsd %xmm5, 6 * SIZE(AO) movsd %xmm7, 7 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L11 ALIGN_4 .L20: testq $2, M BRANCH je .L30 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd 1 * SIZE(AO), %xmm4 xorps %xmm5, %xmm5 movsd 2 * SIZE(AO), %xmm5 xorps %xmm6, %xmm6 movsd 3 * SIZE(AO), %xmm7 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movsd 1 * SIZE(BO), %xmm3 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 2 * SIZE(BO), %xmm1 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm2 addsd %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 4 * SIZE(BO), %xmm1 addsd %xmm5, %xmm8 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm2 addsd %xmm7, %xmm10 movsd 7 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm6 movsd 5 * SIZE(BO), %xmm3 addsd %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 6 * SIZE(BO), %xmm1 addsd %xmm0, %xmm8 movsd 8 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm2 addsd %xmm4, %xmm10 movsd 9 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm6 movsd 7 * SIZE(BO), %xmm3 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 8 * SIZE(BO), %xmm1 addsd %xmm5, %xmm8 movsd 10 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm2 addsd %xmm7, %xmm10 movsd 11 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm6 movsd 9 * SIZE(BO), %xmm3 addq $8 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH je .L29 ALIGN_4 .L26: addsd %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm6, %xmm11 
movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 2 * SIZE(BO), %xmm1 mulsd %xmm3, %xmm2 addsd %xmm0, %xmm8 movsd 2 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addsd %xmm4, %xmm10 movsd 3 * SIZE(AO), %xmm4 addq $2 * SIZE, AO addq $2 * SIZE, BO decq %rax BRANCH jg .L26 ALIGN_4 .L29: addsd %xmm2, %xmm9 addsd %xmm6, %xmm11 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 movsd 1 * SIZE(BO), %xmm1 movsd 2 * SIZE(BO), %xmm2 movsd 3 * SIZE(BO), %xmm3 subsd %xmm8, %xmm0 subsd %xmm9, %xmm1 subsd %xmm10, %xmm2 subsd %xmm11, %xmm3 #else movsd 0 * SIZE(AO), %xmm0 movsd 1 * SIZE(AO), %xmm2 movsd 2 * SIZE(AO), %xmm1 movsd 3 * SIZE(AO), %xmm3 subsd %xmm8, %xmm0 subsd %xmm10, %xmm2 subsd %xmm9, %xmm1 subsd %xmm11, %xmm3 #endif #ifdef LN movsd 3 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm2 movsd 2 * SIZE(AO), %xmm9 mulsd %xmm8, %xmm3 movsd 0 * SIZE(AO), %xmm13 movaps %xmm9, %xmm10 mulsd %xmm2, %xmm9 mulsd %xmm3, %xmm10 subsd %xmm9, %xmm0 subsd %xmm10, %xmm1 mulsd %xmm13, %xmm0 mulsd %xmm13, %xmm1 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 movsd 1 * SIZE(AO), %xmm9 mulsd %xmm8, %xmm1 movsd 3 * SIZE(AO), %xmm13 movaps %xmm9, %xmm10 mulsd %xmm0, %xmm9 mulsd %xmm1, %xmm10 subsd %xmm9, %xmm2 subsd %xmm10, %xmm3 mulsd %xmm13, %xmm2 mulsd %xmm13, %xmm3 #endif #ifdef RN movsd 0 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 movsd 1 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm2 movsd 3 * SIZE(BO), %xmm13 movaps %xmm9, %xmm10 mulsd %xmm0, %xmm9 mulsd %xmm2, %xmm10 subsd %xmm9, %xmm1 subsd %xmm10, %xmm3 mulsd %xmm13, %xmm1 mulsd %xmm13, %xmm3 #endif #ifdef RT movsd 3 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm1 movsd 2 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm3 movsd 0 * SIZE(BO), %xmm13 movaps %xmm9, %xmm10 mulsd %xmm1, %xmm9 mulsd %xmm3, %xmm10 subsd %xmm9, %xmm0 subsd %xmm10, %xmm2 mulsd %xmm13, %xmm0 mulsd %xmm13, %xmm2 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm2, 1 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) movsd %xmm3, 1 * SIZE(CO2) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BO) movsd %xmm1, 1 * SIZE(BO) movsd %xmm2, 2 * SIZE(BO) movsd %xmm3, 3 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) movsd %xmm2, 1 * SIZE(AO) movsd %xmm1, 2 * SIZE(AO) movsd %xmm3, 3 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: testq $1, M je .L39 #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm7, %xmm7 movsd 1 * SIZE(AO), %xmm2 xorps %xmm5, %xmm5 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movsd 1 * SIZE(BO), %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L35 ALIGN_4 .L32: addsd %xmm5, %xmm8 movsd 2 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm7, %xmm9 movsd 3 * SIZE(BO), %xmm7 mulsd %xmm0, %xmm3 movsd 2 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd 4 * 
SIZE(BO), %xmm1 mulsd %xmm2, %xmm5 addsd %xmm3, %xmm9 movsd 5 * SIZE(BO), %xmm3 mulsd %xmm2, %xmm7 movsd 3 * SIZE(AO), %xmm2 addsd %xmm5, %xmm8 movsd 6 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm1 addsd %xmm7, %xmm9 movsd 7 * SIZE(BO), %xmm7 mulsd %xmm0, %xmm3 movsd 4 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd 8 * SIZE(BO), %xmm1 mulsd %xmm2, %xmm5 addsd %xmm3, %xmm9 movsd 9 * SIZE(BO), %xmm3 mulsd %xmm2, %xmm7 movsd 5 * SIZE(AO), %xmm2 addq $4 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif addsd %xmm5, %xmm8 addsd %xmm7, %xmm9 andq $3, %rax BRANCH BRANCH je .L38 ALIGN_4 .L36: mulsd %xmm0, %xmm1 addq $2 * SIZE, BO mulsd %xmm0, %xmm3 movsd 1 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd 0 * SIZE(BO), %xmm1 addsd %xmm3, %xmm9 movsd 1 * SIZE(BO), %xmm3 addq $1 * SIZE, AO decq %rax BRANCH jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 movsd 1 * SIZE(BO), %xmm1 subsd %xmm8, %xmm0 subsd %xmm9, %xmm1 #else movsd 0 * SIZE(AO), %xmm0 movsd 1 * SIZE(AO), %xmm1 subsd %xmm8, %xmm0 subsd %xmm9, %xmm1 #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm0 mulsd %xmm8, %xmm1 #endif #ifdef RN movsd 0 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm0 movsd 1 * SIZE(BO), %xmm9 mulsd %xmm0, %xmm9 movsd 3 * SIZE(BO), %xmm13 subsd %xmm9, %xmm1 mulsd %xmm13, %xmm1 #endif #ifdef RT movsd 3 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm1 movsd 2 * SIZE(BO), %xmm9 mulsd %xmm1, %xmm9 movsd 0 * SIZE(BO), %xmm13 subsd %xmm9, %xmm0 mulsd %xmm13, %xmm0 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO2) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BO) movsd %xmm1, 1 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) movsd %xmm1, 1 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif decq J # j -- jg .L10 ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_RT_4x4_barcelona.S000066400000000000000000001777101313527062700231620ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %r12 #define BB %rbp #define J %rbx #ifndef WINDOWS_ABI #define STACKSIZE 96 #define OFFSET 48(%rsp) #define AORIG 56(%rsp) #define KK 64(%rsp) #define KKK 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define AORIG 232(%rsp) #define KK 240(%rsp) #define KKK 248(%rsp) #endif #define PREFETCH prefetch #define PREFETCHSIZE (8 * 7 + 0) #define movlpd movsd #define movapd movups #define movupd movups #define KERNEL1(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm8 ;\ movapd %xmm2, %xmm0 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %rax, 4) ;\ addpd %xmm1, %xmm12 ;\ movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm10 ;\ movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ addpd %xmm1, %xmm14 ;\ movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 #define KERNEL2(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm8 ;\ movapd %xmm2, %xmm0 ;\ addpd %xmm1, %xmm12 ;\ movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -9 * SIZE(BO, %rax, 4), 
%xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm10 ;\ addpd %xmm1, %xmm14 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ /**/ movddup (BO, %rax, 4), %xmm1 ;\ movapd %xmm4, %xmm2 #define KERNEL3(xx) \ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm8 ;\ movapd %xmm2, %xmm4 ;\ addpd %xmm5, %xmm12 ;\ movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm10 ;\ movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ addpd %xmm5, %xmm14 ;\ movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL4(xx) \ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm8 ;\ movapd %xmm2, %xmm4 ;\ addpd %xmm5, %xmm12 ;\ movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ /***/ movapd (AO, %rax, 4), %xmm6 ;\ addpd %xmm4, %xmm10 ;\ addpd %xmm5, %xmm14 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ movddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ movapd %xmm6, %xmm2 #define KERNEL5(xx) \ mulpd %xmm1, %xmm6 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm6, %xmm8 ;\ movapd %xmm2, %xmm6 ;\ addpd %xmm1, %xmm12 ;\ movddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ /**/ movapd 8 * SIZE(AO, %rax, 4), %xmm7 ;\ movapd %xmm6, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm6 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm6, %xmm10 ;\ movapd 4 * SIZE(AO, %rax, 4), %xmm6 ;\ addpd %xmm1, %xmm14 ;\ movddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm6, %xmm2 #define KERNEL6(xx) \ mulpd %xmm1, %xmm6 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm6, %xmm8 ;\ movapd %xmm2, %xmm6 ;\ addpd %xmm1, %xmm12 ;\ movddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm6, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm6 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm6, %xmm10 ;\ /***/ movapd 16 * SIZE(AO, %rax, 4), %xmm0 ;\ addpd %xmm1, %xmm14 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ movddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ movapd %xmm7, %xmm2 #define KERNEL7(xx) \ mulpd %xmm5, %xmm7 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm7, %xmm8 ;\ movapd %xmm2, %xmm7 ;\ addpd %xmm5, %xmm12 ;\ movddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm7, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup 
11 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm7 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm7, %xmm10 ;\ movapd 12 * SIZE(AO, %rax, 4), %xmm7 ;\ addpd %xmm5, %xmm14 ;\ movddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm7, %xmm2 #define KERNEL8(xx) \ mulpd %xmm5, %xmm7 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm7, %xmm8 ;\ movapd %xmm2, %xmm7 ;\ addpd %xmm5, %xmm12 ;\ movddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm7, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm7 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm7, %xmm10 ;\ addpd %xmm5, %xmm14 ;\ /**/ movapd 24 * SIZE(AO, %rax, 4), %xmm4 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm3, %xmm15 ;\ movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ movapd %xmm0, %xmm2 ;\ addq $8 * SIZE, %rax #define KERNEL_SUB1(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm8 ;\ movapd %xmm2, %xmm0 ;\ addpd %xmm1, %xmm12 ;\ movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm10 ;\ movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ addpd %xmm1, %xmm14 ;\ movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 #define KERNEL_SUB2(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm8 ;\ movapd %xmm2, %xmm0 ;\ addpd %xmm1, %xmm12 ;\ movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm10 ;\ movapd (AO, %rax, 4), %xmm0 ;\ addpd %xmm1, %xmm14 ;\ movddup (BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL_SUB3(xx) \ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm8 ;\ movapd %xmm2, %xmm4 ;\ addpd %xmm5, %xmm12 ;\ movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm10 ;\ movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ addpd %xmm5, %xmm14 ;\ movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL_SUB4(xx) \ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm8 ;\ movapd %xmm2, %xmm4 ;\ addpd %xmm5, %xmm12 ;\ movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ 
addpd %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm10 ;\ addpd %xmm5, %xmm14 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm12 #else movq STACKSIZE + 8(%rsp), LDC movsd STACKSIZE + 16(%rsp), %xmm12 #endif movq OLD_M, M movq OLD_N, N subq $-16 * SIZE, A subq $-16 * SIZE, B movsd %xmm12, OFFSET movsd %xmm12, KK leaq (, LDC, SIZE), LDC #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif testq $1, N je .L40 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 # coffset1 = c #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I # i = (m >> 2) jle .L100 ALIGN_4 .L91: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (BO, %rax, SIZE), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd -8 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movddup -16 * SIZE(BO), %xmm1 pxor %xmm10, %xmm10 movddup -15 * SIZE(BO), %xmm5 pxor %xmm11, %xmm11 movddup -14 * SIZE(BO), %xmm3 #ifndef LN prefetchw 3 * SIZE(CO1) #else prefetchw -8 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO negq %rax NOBRANCH je .L96 ALIGN_4 .L92: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm9 movddup -12 * SIZE(BO, %rax, 1), %xmm1 mulpd %xmm5, %xmm0 mulpd -10 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm0, %xmm10 movapd (AO, %rax, 4), %xmm0 addpd %xmm5, %xmm11 movddup -13 * SIZE(BO, %rax, 1), %xmm5 mulpd %xmm3, %xmm2 mulpd -6 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm2, %xmm8 movapd -4 * SIZE(AO, %rax, 4), %xmm2 addpd %xmm3, %xmm9 movddup -10 * SIZE(BO, %rax, 1), %xmm3 mulpd %xmm5, %xmm2 mulpd -2 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm10 movapd 8 * SIZE(AO, %rax, 4), %xmm2 addpd %xmm5, %xmm11 movddup -11 * SIZE(BO, %rax, 1), %xmm5 addq $4 * SIZE, %rax BRANCH jl .L92 ALIGN_4 .L96: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L99 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 
1), BO negq %rax ALIGN_4 .L97: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm9 movddup -15 * SIZE(BO, %rax, 1), %xmm1 addq $SIZE, %rax jl .L97 ALIGN_4 .L99: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm10 movapd -14 * SIZE(BO), %xmm11 subpd %xmm8, %xmm10 subpd %xmm9, %xmm11 #else movapd -16 * SIZE(AO), %xmm10 movapd -14 * SIZE(AO), %xmm11 subpd %xmm8, %xmm10 subpd %xmm9, %xmm11 #endif #ifdef LN movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movapd %xmm11, %xmm9 unpckhpd %xmm9, %xmm9 movsd -1 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm9 movsd -2 * SIZE(AO), %xmm13 mulsd %xmm9, %xmm13 subsd %xmm13, %xmm11 movsd -3 * SIZE(AO), %xmm14 mulsd %xmm9, %xmm14 subsd %xmm14, %xmm8 movsd -4 * SIZE(AO), %xmm15 mulsd %xmm9, %xmm15 subsd %xmm15, %xmm10 movsd -6 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm11 movsd -7 * SIZE(AO), %xmm13 mulsd %xmm11, %xmm13 subsd %xmm13, %xmm8 movsd -8 * SIZE(AO), %xmm14 mulsd %xmm11, %xmm14 subsd %xmm14, %xmm10 movsd -11 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -12 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm10 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 unpcklpd %xmm8, %xmm10 unpcklpd %xmm9, %xmm11 #endif #ifdef LT movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movapd %xmm11, %xmm9 unpckhpd %xmm9, %xmm9 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 movsd -15 * SIZE(AO), %xmm13 mulsd %xmm10, %xmm13 subsd %xmm13, %xmm8 movsd -14 * SIZE(AO), %xmm14 mulsd %xmm10, %xmm14 subsd %xmm14, %xmm11 movsd -13 * SIZE(AO), %xmm15 mulsd %xmm10, %xmm15 subsd %xmm15, %xmm9 movsd -11 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -10 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm11 movsd -9 * SIZE(AO), %xmm14 mulsd %xmm8, %xmm14 subsd %xmm14, %xmm9 movsd -6 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm11 movsd -5 * SIZE(AO), %xmm13 mulsd %xmm11, %xmm13 subsd %xmm13, %xmm9 movsd -1 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm9 unpcklpd %xmm8, %xmm10 unpcklpd %xmm9, %xmm11 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 mulpd %xmm8, %xmm11 #endif #ifdef RT movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 mulpd %xmm8, %xmm11 #endif #ifdef LN subq $4 * SIZE, CO1 #endif movlpd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) movlpd %xmm11, 2 * SIZE(CO1) movhpd %xmm11, 3 * SIZE(CO1) #if defined(LN) || defined(LT) movaps %xmm10, -16 * SIZE(BO) movaps %xmm11, -14 * SIZE(BO) #else movaps %xmm10, -16 * SIZE(AO) movaps %xmm11, -14 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO addq %rax, BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L91 ALIGN_4 .L100: testq $2, M je .L110 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (BO, %rax, SIZE), BO #endif movddup -16 * SIZE(BO), %xmm0 pxor %xmm8, %xmm8 movddup -15 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 movddup -14 * SIZE(BO), %xmm2 pxor %xmm10, %xmm10 movddup -13 * SIZE(BO), %xmm3 pxor %xmm11, 
%xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO negq %rax NOBRANCH je .L106 ALIGN_4 .L102: mulpd -16 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm0, %xmm8 movddup -12 * SIZE(BO, %rax, 1), %xmm0 mulpd -14 * SIZE(AO, %rax, 2), %xmm1 addpd %xmm1, %xmm9 movddup -11 * SIZE(BO, %rax, 1), %xmm1 mulpd -12 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm2, %xmm10 movddup -10 * SIZE(BO, %rax, 1), %xmm2 mulpd -10 * SIZE(AO, %rax, 2), %xmm3 addpd %xmm3, %xmm11 movddup -9 * SIZE(BO, %rax, 1), %xmm3 addq $4 * SIZE, %rax BRANCH jl .L102 ALIGN_4 .L106: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L109 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L107: movddup -16 * SIZE(BO, %rax, 1), %xmm0 mulpd -16 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm0, %xmm8 addq $SIZE, %rax jl .L107 ALIGN_4 .L109: addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm10, %xmm8 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm10 subpd %xmm8, %xmm10 #else movapd -16 * SIZE(AO), %xmm10 subpd %xmm8, %xmm10 #endif #ifdef LN movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movsd -13 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -14 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm10 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 unpcklpd %xmm8, %xmm10 #endif #ifdef LT movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 movsd -15 * SIZE(AO), %xmm13 mulsd %xmm10, %xmm13 subsd %xmm13, %xmm8 movsd -13 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 unpcklpd %xmm8, %xmm10 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 #endif #ifdef RT movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlpd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) #else movlpd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movaps %xmm10, -16 * SIZE(BO) #else movaps %xmm10, -16 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO addq %rax, BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L110: testq $1, M je .L119 #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (BO, %rax, SIZE), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd -14 * SIZE(AO), %xmm1 pxor %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO negq %rax NOBRANCH je .L116 ALIGN_4 .L112: mulpd -16 * SIZE(BO, %rax, 1), %xmm0 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 1), %xmm0 mulpd -14 * SIZE(BO, %rax, 1), %xmm1 addpd %xmm1, %xmm9 movapd -10 * SIZE(AO, %rax, 1), %xmm1 addq $4 * SIZE, %rax BRANCH jl .L112 ALIGN_4 .L116: #if defined(LT) || defined(RN) movq KK, %rax #else 
movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L118 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L117: mulsd -16 * SIZE(BO, %rax, 1), %xmm0 addsd %xmm0, %xmm8 movsd -15 * SIZE(AO, %rax, 1), %xmm0 addq $SIZE, %rax jl .L117 ALIGN_4 .L118: addpd %xmm9, %xmm8 haddpd %xmm8, %xmm8 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(BO), %xmm10 subsd %xmm8, %xmm10 #else movsd -16 * SIZE(AO), %xmm10 subsd %xmm8, %xmm10 #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 #endif #if defined(RN) || defined(RT) movsd -16 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm10 #endif #ifdef LN subq $1 * SIZE, CO1 #endif movsd %xmm10, 0 * SIZE(CO1) #if defined(LN) || defined(LT) movlpd %xmm10, -16 * SIZE(BO) #else movlpd %xmm10, -16 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax addq %rax, AO addq %rax, BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L119: #ifdef LN leaq (B, K, SIZE), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L40: testq $2, N je .L80 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 2), BO #endif movddup -16 * SIZE(BO), %xmm1 movddup -15 * SIZE(BO), %xmm5 pxor %xmm8, %xmm8 movddup -12 * SIZE(BO), %xmm3 pxor %xmm9, %xmm9 movapd -16 * SIZE(AO), %xmm0 pxor %xmm12, %xmm12 movapd -8 * SIZE(AO), %xmm4 pxor %xmm13, %xmm13 #ifndef LN prefetchw 3 * SIZE(CO1) movapd %xmm0, %xmm2 prefetchw 5 * SIZE(CO2) #else prefetchw -4 * SIZE(CO1) movapd %xmm0, %xmm2 prefetchw -4 * SIZE(CO2) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO negq %rax NOBRANCH je .L56 ALIGN_4 .L52: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm12 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm5, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -13 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm0, %xmm2 mulpd %xmm1, %xmm0 mulpd -10 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd (AO, %rax, 4), %xmm0 addpd %xmm1, %xmm12 movddup -8 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm5, %xmm2 mulpd -10 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -11 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm4, %xmm2 mulpd %xmm3, %xmm4 mulpd -6 * SIZE(AO, %rax, 4), %xmm3 
addpd %xmm4, %xmm8 movapd -4 * SIZE(AO, %rax, 4), %xmm4 addpd %xmm3, %xmm12 movddup -10 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm5, %xmm2 mulpd -6 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -9 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm4, %xmm2 mulpd %xmm3, %xmm4 mulpd -2 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm4, %xmm8 movapd 8 * SIZE(AO, %rax, 4), %xmm4 addpd %xmm3, %xmm12 movddup -4 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm5, %xmm2 mulpd -2 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -7 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm0, %xmm2 addq $4 * SIZE, %rax BRANCH jl .L52 ALIGN_4 .L56: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L59 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L57: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm12 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm5, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -13 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm0, %xmm2 addq $SIZE, %rax jl .L57 ALIGN_4 .L59: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm12, %xmm4 unpcklpd %xmm13, %xmm12 unpckhpd %xmm13, %xmm4 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm13 movapd -12 * SIZE(BO), %xmm1 movapd -10 * SIZE(BO), %xmm5 subpd %xmm8, %xmm9 subpd %xmm0, %xmm13 subpd %xmm12, %xmm1 subpd %xmm4, %xmm5 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 subpd %xmm8, %xmm0 subpd %xmm12, %xmm1 subpd %xmm9, %xmm2 subpd %xmm13, %xmm3 #endif #ifdef LN movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 movddup -2 * SIZE(AO), %xmm10 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm1 movddup -3 * SIZE(AO), %xmm12 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm13 movddup -4 * SIZE(AO), %xmm14 mulpd %xmm5, %xmm14 subpd %xmm14, %xmm9 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 movddup -7 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm13 movddup -8 * SIZE(AO), %xmm12 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm9 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -12 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -14 * SIZE(AO), %xmm12 mulpd %xmm9, %xmm12 subpd %xmm12, %xmm1 movddup -13 * SIZE(AO), %xmm14 mulpd %xmm9, %xmm14 subpd %xmm14, %xmm5 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -10 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm1 movddup -9 * SIZE(AO), %xmm12 mulpd %xmm13, %xmm12 subpd %xmm12, %xmm5 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 movddup -5 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm5 movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm1, %xmm9 subpd %xmm9, %xmm3 movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 #endif #ifdef RT movddup 
-13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -14 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -14 * SIZE(BO), %xmm9 mulpd %xmm3, %xmm9 subpd %xmm9, %xmm1 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movlpd %xmm9, 0 * SIZE(CO1) movlpd %xmm13, 1 * SIZE(CO1) movlpd %xmm1, 2 * SIZE(CO1) movlpd %xmm5, 3 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movhpd %xmm1, 2 * SIZE(CO2) movhpd %xmm5, 3 * SIZE(CO2) #else movlpd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movlpd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movlpd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movlpd %xmm3, 2 * SIZE(CO2) movhpd %xmm3, 3 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movaps %xmm9, -16 * SIZE(BO) movaps %xmm13,-14 * SIZE(BO) movaps %xmm1, -12 * SIZE(BO) movaps %xmm5, -10 * SIZE(BO) #else movaps %xmm0, -16 * SIZE(AO) movaps %xmm1, -14 * SIZE(AO) movaps %xmm2, -12 * SIZE(AO) movaps %xmm3, -10 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L51 ALIGN_4 .L60: testq $2, M je .L70 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 2), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd -12 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movddup -16 * SIZE(BO), %xmm1 pxor %xmm10, %xmm10 movddup -15 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO negq %rax NOBRANCH je .L66 ALIGN_4 .L62: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm0, %xmm3 movapd -14 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm3, %xmm9 movddup -13 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movddup -12 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm0, %xmm3 movapd -8 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm3, %xmm11 movddup -11 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm8 movddup -10 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm2, %xmm3 movapd -10 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm3, %xmm9 movddup -9 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm10 movddup -8 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm2, %xmm3 movapd -4 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm3, %xmm11 movddup -7 * SIZE(BO, %rax, 2), %xmm3 addq $4 * SIZE, %rax BRANCH jl .L62 ALIGN_4 .L66: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L69 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L67: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm0, %xmm3 movapd -14 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm3, %xmm9 movddup -13 * SIZE(BO, %rax, 2), %xmm3 addq $SIZE, %rax jl .L67 ALIGN_4 .L69: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq 
$2, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm13 subpd %xmm8, %xmm9 subpd %xmm0, %xmm13 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm2 subpd %xmm8, %xmm0 subpd %xmm9, %xmm2 #endif #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -14 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 #endif #ifdef RT movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 movddup -14 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movlpd %xmm9, 0 * SIZE(CO1) movlpd %xmm13, 1 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) #else movlpd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movlpd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movaps %xmm9, -16 * SIZE(BO) movaps %xmm13, -14 * SIZE(BO) #else movaps %xmm0, -16 * SIZE(AO) movaps %xmm2, -14 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L70: testq $1, M je .L79 ALIGN_4 .L71: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 1), BO #endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movddup -15 * SIZE(AO), %xmm1 pxor %xmm9, %xmm9 movddup -14 * SIZE(AO), %xmm2 pxor %xmm10, %xmm10 movddup -13 * SIZE(AO), %xmm3 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO negq %rax NOBRANCH je .L76 ALIGN_4 .L72: mulpd -16 * SIZE(BO, %rax, 2), %xmm0 addpd %xmm0, %xmm8 movddup -12 * SIZE(AO, %rax, 1), %xmm0 mulpd -14 * SIZE(BO, %rax, 2), %xmm1 addpd %xmm1, %xmm9 movddup -11 * SIZE(AO, %rax, 1), %xmm1 mulpd -12 * SIZE(BO, %rax, 2), %xmm2 addpd %xmm2, %xmm10 movddup -10 * SIZE(AO, %rax, 1), %xmm2 mulpd -10 * SIZE(BO, %rax, 2), %xmm3 addpd %xmm3, %xmm11 movddup -9 * SIZE(AO, %rax, 1), %xmm3 addq $4 * SIZE, %rax BRANCH jl .L72 ALIGN_4 .L76: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L78 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L77: mulpd -16 * SIZE(BO, %rax, 2), %xmm0 addpd %xmm0, %xmm8 movddup -15 * SIZE(AO, %rax, 1), %xmm0 addq $SIZE, %rax jl .L77 ALIGN_4 .L78: addpd %xmm9, %xmm8 addpd 
%xmm11, %xmm10 addpd %xmm10, %xmm8 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm2 #else movapd -16 * SIZE(AO), %xmm2 #endif subpd %xmm8, %xmm2 #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef RN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 mulsd -16 * SIZE(BO), %xmm2 movsd -15 * SIZE(BO), %xmm4 mulsd %xmm2, %xmm4 subsd %xmm4, %xmm0 mulsd -13 * SIZE(BO), %xmm0 unpcklpd %xmm0, %xmm2 #endif #ifdef RT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 mulsd -13 * SIZE(BO), %xmm0 movlpd -14 * SIZE(BO), %xmm4 mulsd %xmm0, %xmm4 subsd %xmm4, %xmm2 mulsd -16 * SIZE(BO), %xmm2 unpcklpd %xmm0, %xmm2 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movlpd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO2) #if defined(LN) || defined(LT) movaps %xmm2, -16 * SIZE(BO) #else movaps %xmm2, -16 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L79: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L80: movq N, J sarq $2, J # j = (n >> 2) jle .L999 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 4), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 2, %rax movq B, BB subq %rax, BB #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I # i = (m >> 2) jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 4), BO #endif movapd -16 * SIZE(AO), %xmm0 movddup -16 * SIZE(BO), %xmm1 pxor %xmm8, %xmm8 movddup -15 * SIZE(BO), %xmm3 pxor %xmm9, %xmm9 movapd -8 * SIZE(AO), %xmm4 pxor %xmm10, %xmm10 movddup -8 * SIZE(BO), %xmm5 pxor %xmm11, %xmm11 #ifndef LN prefetchw 3 * SIZE(CO1) pxor %xmm12, %xmm12 prefetchw 5 * SIZE(CO2) pxor %xmm13, %xmm13 prefetchw 3 * SIZE(CO1, LDC, 2) pxor %xmm14, %xmm14 prefetchw 5 * SIZE(CO2, LDC, 2) pxor %xmm15, %xmm15 movapd %xmm0, %xmm2 #else prefetchw -8 * SIZE(CO1) pxor %xmm12, %xmm12 prefetchw -8 * SIZE(CO2) pxor %xmm13, %xmm13 prefetchw -8 * SIZE(CO1, LDC, 2) pxor %xmm14, %xmm14 prefetchw -8 * SIZE(CO2, LDC, 2) pxor %xmm15, %xmm15 movapd %xmm0, %xmm2 #endif prefetch -16 * SIZE(BB) prefetch -8 * SIZE(BB) subq $-16 * SIZE, BB #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-8, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO negq %rax NOBRANCH je .L15 ALIGN_4 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) BRANCH jl .L12 
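/* Descriptive note (added annotation, not in the original source): the loop
   above is the GEMM part of the 4x4-by-4-column TRSM tile. KERNEL1..KERNEL8
   unroll the k loop eight deep, accumulating the multiply-adds in
   %xmm8-%xmm15. .L15 below peels a four-iteration chunk with
   KERNEL_SUB1..KERNEL_SUB4, .L17 drains the remaining (k & 3) iterations one
   at a time, and .L19 then applies the triangular solve (the LN/LT/RN/RT
   variants selected by the preprocessor) to the accumulated tile before
   writing it back to C and to the packed A/B buffer. */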
ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif testq $4, %rax je .L16 xorq %rax, %rax ALIGN_4 KERNEL_SUB1(16 * 0) KERNEL_SUB2(16 * 0) KERNEL_SUB3(16 * 0) KERNEL_SUB4(16 * 0) subq $-16 * SIZE, BO subq $-16 * SIZE, AO ALIGN_4 .L16: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L19 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO negq %rax ALIGN_4 .L17: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd %xmm2, %xmm0 addpd %xmm1, %xmm12 movddup -14 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm3, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm2, %xmm9 movapd %xmm0, %xmm2 addpd %xmm3, %xmm13 movddup -13 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm10 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm14 movddup -12 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm3, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm2, %xmm11 addpd %xmm3, %xmm15 movddup -11 * SIZE(BO, %rax, 4), %xmm3 movapd %xmm0, %xmm2 addq $SIZE, %rax jl .L17 ALIGN_4 .L19: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm10, %xmm2 unpcklpd %xmm11, %xmm10 unpckhpd %xmm11, %xmm2 movapd %xmm12, %xmm4 unpcklpd %xmm13, %xmm12 unpckhpd %xmm13, %xmm4 movapd %xmm14, %xmm6 unpcklpd %xmm15, %xmm14 unpckhpd %xmm15, %xmm6 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 movapd -12 * SIZE(BO), %xmm13 movapd -10 * SIZE(BO), %xmm15 movapd -8 * SIZE(BO), %xmm1 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm5 movapd -2 * SIZE(BO), %xmm7 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm0, %xmm13 subpd %xmm2, %xmm15 subpd %xmm12, %xmm1 subpd %xmm14, %xmm3 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 movapd -8 * SIZE(AO), %xmm4 movapd -6 * SIZE(AO), %xmm5 movapd -4 * SIZE(AO), %xmm6 movapd -2 * SIZE(AO), %xmm7 subpd %xmm8, %xmm0 subpd %xmm12, %xmm1 subpd %xmm9, %xmm2 subpd %xmm13, %xmm3 subpd %xmm10, %xmm4 subpd %xmm14, %xmm5 subpd %xmm11, %xmm6 subpd %xmm15, %xmm7 #endif #ifdef LN movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 movddup -2 * SIZE(AO), %xmm10 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm1 movddup -2 * SIZE(AO), %xmm10 mulpd %xmm7, %xmm10 subpd %xmm10, %xmm3 movddup -3 * SIZE(AO), %xmm12 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm13 movddup -3 * SIZE(AO), %xmm12 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm15 movddup -4 * SIZE(AO), %xmm14 mulpd %xmm5, %xmm14 subpd %xmm14, %xmm9 movddup -4 * SIZE(AO), %xmm14 mulpd %xmm7, %xmm14 subpd %xmm14, %xmm11 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 movddup -7 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm13 movddup -7 * SIZE(AO), %xmm10 mulpd %xmm3, %xmm10 subpd %xmm10, %xmm15 movddup -8 * SIZE(AO), %xmm12 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm9 movddup -8 * SIZE(AO), %xmm12 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm11 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -12 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -12 * SIZE(AO), %xmm10 mulpd %xmm15, %xmm10 subpd %xmm10, %xmm11 movddup -16 * SIZE(AO), %xmm8 mulpd 
%xmm8, %xmm9 mulpd %xmm8, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm11, %xmm10 subpd %xmm10, %xmm15 movddup -14 * SIZE(AO), %xmm12 mulpd %xmm9, %xmm12 subpd %xmm12, %xmm1 movddup -14 * SIZE(AO), %xmm12 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm3 movddup -13 * SIZE(AO), %xmm14 mulpd %xmm9, %xmm14 subpd %xmm14, %xmm5 movddup -13 * SIZE(AO), %xmm14 mulpd %xmm11, %xmm14 subpd %xmm14, %xmm7 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -10 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm1 movddup -10 * SIZE(AO), %xmm10 mulpd %xmm15, %xmm10 subpd %xmm10, %xmm3 movddup -9 * SIZE(AO), %xmm12 mulpd %xmm13, %xmm12 subpd %xmm12, %xmm5 movddup -9 * SIZE(AO), %xmm12 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm7 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 movddup -5 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm5 movddup -5 * SIZE(AO), %xmm10 mulpd %xmm3, %xmm10 subpd %xmm10, %xmm7 movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm1, %xmm9 subpd %xmm9, %xmm3 movddup -14 * SIZE(BO), %xmm10 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm4 movddup -14 * SIZE(BO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm5 movddup -13 * SIZE(BO), %xmm11 mulpd %xmm0, %xmm11 subpd %xmm11, %xmm6 movddup -13 * SIZE(BO), %xmm11 mulpd %xmm1, %xmm11 subpd %xmm11, %xmm7 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -10 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm4 movddup -10 * SIZE(BO), %xmm9 mulpd %xmm3, %xmm9 subpd %xmm9, %xmm5 movddup -9 * SIZE(BO), %xmm10 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm6 movddup -9 * SIZE(BO), %xmm10 mulpd %xmm3, %xmm10 subpd %xmm10, %xmm7 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm5 movddup -5 * SIZE(BO), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm6 movddup -5 * SIZE(BO), %xmm9 mulpd %xmm5, %xmm9 subpd %xmm9, %xmm7 movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 mulpd %xmm8, %xmm7 #endif #ifdef RT movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 mulpd %xmm8, %xmm7 movddup -2 * SIZE(BO), %xmm9 mulpd %xmm6, %xmm9 subpd %xmm9, %xmm4 movddup -2 * SIZE(BO), %xmm9 mulpd %xmm7, %xmm9 subpd %xmm9, %xmm5 movddup -3 * SIZE(BO), %xmm10 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm2 movddup -3 * SIZE(BO), %xmm10 mulpd %xmm7, %xmm10 subpd %xmm10, %xmm3 movddup -4 * SIZE(BO), %xmm11 mulpd %xmm6, %xmm11 subpd %xmm11, %xmm0 movddup -4 * SIZE(BO), %xmm11 mulpd %xmm7, %xmm11 subpd %xmm11, %xmm1 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm5 movddup -7 * SIZE(BO), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm2 movddup -7 * SIZE(BO), %xmm9 mulpd %xmm5, %xmm9 subpd %xmm9, %xmm3 movddup -8 * SIZE(BO), %xmm10 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm0 movddup -8 * SIZE(BO), %xmm10 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm1 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -12 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -12 * SIZE(BO), %xmm9 mulpd %xmm3, %xmm9 subpd %xmm9, %xmm1 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movlpd %xmm9, 0 * SIZE(CO1) movlpd %xmm13, 1 * 
SIZE(CO1) movlpd %xmm1, 2 * SIZE(CO1) movlpd %xmm5, 3 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movhpd %xmm1, 2 * SIZE(CO2) movhpd %xmm5, 3 * SIZE(CO2) movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) movlpd %xmm3, 2 * SIZE(CO1, LDC, 2) movlpd %xmm7, 3 * SIZE(CO1, LDC, 2) movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) #else movlpd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movlpd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movlpd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movlpd %xmm3, 2 * SIZE(CO2) movhpd %xmm3, 3 * SIZE(CO2) movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) movlpd %xmm5, 2 * SIZE(CO1, LDC, 2) movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) movlpd %xmm7, 2 * SIZE(CO2, LDC, 2) movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movaps %xmm9, -16 * SIZE(BO) movaps %xmm11, -14 * SIZE(BO) movaps %xmm13, -12 * SIZE(BO) movaps %xmm15, -10 * SIZE(BO) movaps %xmm1, -8 * SIZE(BO) movaps %xmm3, -6 * SIZE(BO) movaps %xmm5, -4 * SIZE(BO) movaps %xmm7, -2 * SIZE(BO) #else movaps %xmm0, -16 * SIZE(AO) movaps %xmm1, -14 * SIZE(AO) movaps %xmm2, -12 * SIZE(AO) movaps %xmm3, -10 * SIZE(AO) movaps %xmm4, -8 * SIZE(AO) movaps %xmm5, -6 * SIZE(AO) movaps %xmm6, -4 * SIZE(AO) movaps %xmm7, -2 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L11 ALIGN_4 .L20: testq $3, M je .L39 testq $2, M je .L30 ALIGN_4 .L21: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 4), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd -12 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movddup -16 * SIZE(BO), %xmm1 pxor %xmm10, %xmm10 movddup -15 * SIZE(BO), %xmm5 pxor %xmm11, %xmm11 movddup -8 * SIZE(BO), %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO negq %rax NOBRANCH je .L26 ALIGN_4 .L22: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 addpd %xmm5, %xmm9 movddup -13 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movddup -12 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 movapd -14 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm5, %xmm11 movddup -11 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -10 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 addpd %xmm5, %xmm9 movddup -9 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movddup (BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 movapd -8 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm5, %xmm11 movddup -7 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm8 movddup -6 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 addpd %xmm5, %xmm9 movddup -5 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm10 movddup -4 * SIZE(BO, %rax, 
4), %xmm3 mulpd %xmm2, %xmm5 movapd -10 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm5, %xmm11 movddup -3 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm8 movddup -2 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 addpd %xmm5, %xmm9 movddup -1 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm10 movddup 8 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 movapd -4 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm5, %xmm11 movddup 1 * SIZE(BO, %rax, 4), %xmm5 addq $4 * SIZE, %rax BRANCH jl .L22 ALIGN_4 .L26: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L29 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO negq %rax ALIGN_4 .L27: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 addpd %xmm5, %xmm9 movddup -13 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movddup -12 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 movapd -14 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm5, %xmm11 movddup -11 * SIZE(BO, %rax, 4), %xmm5 addq $SIZE, %rax jl .L27 ALIGN_4 .L29: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm10, %xmm2 unpcklpd %xmm11, %xmm10 unpckhpd %xmm11, %xmm2 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 movapd -12 * SIZE(BO), %xmm13 movapd -10 * SIZE(BO), %xmm15 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm0, %xmm13 subpd %xmm2, %xmm15 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm2 movapd -12 * SIZE(AO), %xmm4 movapd -10 * SIZE(AO), %xmm6 subpd %xmm8, %xmm0 subpd %xmm9, %xmm2 subpd %xmm10, %xmm4 subpd %xmm11, %xmm6 #endif #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -14 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -14 * SIZE(AO), %xmm10 mulpd %xmm15, %xmm10 subpd %xmm10, %xmm11 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm11, %xmm10 subpd %xmm10, %xmm15 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -14 * SIZE(BO), %xmm10 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm4 movddup -13 * SIZE(BO), %xmm11 mulpd %xmm0, %xmm11 subpd %xmm11, %xmm6 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 movddup -10 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm4 movddup -9 * SIZE(BO), %xmm10 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm6 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 movddup -5 * SIZE(BO), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm6 movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 #endif #ifdef RT movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 movddup -2 * SIZE(BO), %xmm9 mulpd %xmm6, %xmm9 subpd %xmm9, %xmm4 movddup -3 * SIZE(BO), %xmm10 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm2 movddup -4 * SIZE(BO), %xmm11 mulpd %xmm6, %xmm11 subpd %xmm11, %xmm0 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 movddup -7 * SIZE(BO), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm2 movddup -8 * SIZE(BO), %xmm10 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm0 
movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 movddup -12 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movlpd %xmm9, 0 * SIZE(CO1) movlpd %xmm13, 1 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) #else movlpd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movlpd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movaps %xmm9, -16 * SIZE(BO) movaps %xmm11, -14 * SIZE(BO) movaps %xmm13, -12 * SIZE(BO) movaps %xmm15, -10 * SIZE(BO) #else movaps %xmm0, -16 * SIZE(AO) movaps %xmm2, -14 * SIZE(AO) movaps %xmm4, -12 * SIZE(AO) movaps %xmm6, -10 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: testq $1, M je .L39 #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif movq B, BO #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 4), BO #endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movddup -14 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movddup -15 * SIZE(AO), %xmm4 pxor %xmm10, %xmm10 movapd -16 * SIZE(BO), %xmm1 pxor %xmm11, %xmm11 movapd -8 * SIZE(BO), %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO negq %rax NOBRANCH je .L36 ALIGN_4 .L32: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BO, %rax, 4), %xmm0 addpd %xmm1, %xmm8 movapd -12 * SIZE(BO, %rax, 4), %xmm1 addpd %xmm0, %xmm9 movddup -12 * SIZE(AO, %rax, 1), %xmm0 mulpd %xmm4, %xmm1 mulpd -10 * SIZE(BO, %rax, 4), %xmm4 addpd %xmm1, %xmm10 movapd (BO, %rax, 4), %xmm1 addpd %xmm4, %xmm11 movddup -11 * SIZE(AO, %rax, 1), %xmm4 mulpd %xmm2, %xmm3 mulpd -6 * SIZE(BO, %rax, 4), %xmm2 addpd %xmm3, %xmm8 movapd -4 * SIZE(BO, %rax, 4), %xmm3 addpd %xmm2, %xmm9 movddup -13 * SIZE(AO, %rax, 1), %xmm2 mulpd %xmm2, %xmm3 mulpd -2 * SIZE(BO, %rax, 4), %xmm2 addpd %xmm3, %xmm10 movapd 8 * SIZE(BO, %rax, 4), %xmm3 addpd %xmm2, %xmm11 movddup -10 * SIZE(AO, %rax, 1), %xmm2 addq $4 * SIZE, %rax BRANCH jl .L32 ALIGN_4 .L36: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) je .L38 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO negq %rax ALIGN_4 .L37: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BO, %rax, 4), %xmm0 addpd %xmm1, %xmm8 movapd -12 * SIZE(BO, %rax, 4), %xmm1 addpd %xmm0, %xmm9 movddup -15 * SIZE(AO, %rax, 1), %xmm0 addq $SIZE, %rax jl .L37 ALIGN_4 .L38: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #endif #if 
defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 subpd %xmm8, %xmm2 subpd %xmm9, %xmm3 #else movapd -16 * SIZE(AO), %xmm2 movapd -14 * SIZE(AO), %xmm3 subpd %xmm8, %xmm2 subpd %xmm9, %xmm3 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 #endif #ifdef RN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd -16 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm2 movsd -15 * SIZE(BO), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd -14 * SIZE(BO), %xmm6 mulsd %xmm2, %xmm6 subsd %xmm6, %xmm3 movsd -13 * SIZE(BO), %xmm7 mulsd %xmm2, %xmm7 subsd %xmm7, %xmm1 movsd -11 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm0 movsd -10 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm3 movsd -9 * SIZE(BO), %xmm6 mulsd %xmm0, %xmm6 subsd %xmm6, %xmm1 movsd -6 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm3 movsd -5 * SIZE(BO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm1 movsd -1 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm1 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef RT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd -1 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm1 movsd -2 * SIZE(BO), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm3 movsd -3 * SIZE(BO), %xmm6 mulsd %xmm1, %xmm6 subsd %xmm6, %xmm0 movsd -4 * SIZE(BO), %xmm7 mulsd %xmm1, %xmm7 subsd %xmm7, %xmm2 movsd -6 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm3 movsd -7 * SIZE(BO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm0 movsd -8 * SIZE(BO), %xmm6 mulsd %xmm3, %xmm6 subsd %xmm6, %xmm2 movsd -11 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm0 movsd -12 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd -16 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movlpd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO2) movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) #else movlpd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO2) movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movaps %xmm2, -16 * SIZE(BO) movaps %xmm3, -14 * SIZE(BO) #else movaps %xmm2, -16 * SIZE(AO) movaps %xmm3, -14 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif decq J # j -- jg .L01 ALIGN_4 .L999: movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_RT_4x4_core2.S000066400000000000000000002023011313527062700222270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University 
of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define J 0(%rsp) #define OFFSET 8(%rsp) #define KK 16(%rsp) #define KKK 24(%rsp) #define AORIG 32(%rsp) #define BORIG 40(%rsp) #define BUFFER 128(%rsp) #define PREFETCH_R (8 * 4 + 0) #define PREFETCH_W (PREFETCH_R) #define PREFETCHSIZE (8 * 17 + 2) #define PREFETCH prefetcht0 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif movq OLD_LDC, LDC movq OLD_OFFSET, %rax movq %rsp, %r15 # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq %rax, KK movq %rax, OFFSET movq OLD_M, M movq OLD_N, N subq $-16 * SIZE, A subq $-16 * SIZE, B leaq (, LDC, SIZE), LDC #ifdef LN leaq (, M, SIZE), %rax addq 
%rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif testq $1, N je .L40 ALIGN_4 .L81: /* Copying to Sub Buffer */ #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG leaq (, %rax, SIZE), %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax jle .L83 ALIGN_4 .L82: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -12 * SIZE(B), %xmm4 movddup -11 * SIZE(B), %xmm5 movddup -10 * SIZE(B), %xmm6 movddup -9 * SIZE(B), %xmm7 movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) movapd %xmm2, 4 * SIZE(BO) movapd %xmm3, 6 * SIZE(BO) movapd %xmm4, 8 * SIZE(BO) movapd %xmm5, 10 * SIZE(BO) movapd %xmm6, 12 * SIZE(BO) movapd %xmm7, 14 * SIZE(BO) addq $ 8 * SIZE, B subq $-16 * SIZE, BO subq $1, %rax jne .L82 ALIGN_4 .L83: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax BRANCH jle .L90 ALIGN_4 .L84: movddup -16 * SIZE(B), %xmm0 movapd %xmm0, 0 * SIZE(BO) addq $1 * SIZE, B addq $2 * SIZE, BO subq $1, %rax jne .L84 ALIGN_4 .L90: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT subq LDC, C #endif movq C, CO1 # coffset1 = c #ifndef RT addq LDC, C #endif movq M, I sarq $2, I # i = (m >> 2) jle .L100 ALIGN_4 .L91: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 #ifdef LN prefetcht2 -3 * SIZE(CO1) #else prefetcht2 3 * SIZE(CO1) #endif pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L95 ALIGN_4 .L92: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 movapd -12 * SIZE(AO), %xmm0 movapd -10 * SIZE(AO), %xmm1 movapd -14 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm10 addpd %xmm3, %xmm11 movapd -8 * SIZE(AO), %xmm0 movapd -6 * SIZE(AO), %xmm1 movapd -12 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 movapd -4 * SIZE(AO), %xmm0 movapd -2 * SIZE(AO), %xmm1 movapd -10 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm10 addpd %xmm3, %xmm11 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax jne .L92 ALIGN_4 .L95: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L99 ALIGN_4 .L96: movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addq $4 * SIZE, AO addq $2 * 
SIZE, BO subq $1, %rax jg .L96 ALIGN_4 .L99: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm10 movapd -14 * SIZE(B), %xmm11 subpd %xmm8, %xmm10 subpd %xmm9, %xmm11 #else movapd -16 * SIZE(AO), %xmm10 movapd -14 * SIZE(AO), %xmm11 subpd %xmm8, %xmm10 subpd %xmm9, %xmm11 #endif #ifdef LN movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movapd %xmm11, %xmm9 unpckhpd %xmm9, %xmm9 movsd -1 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm9 movsd -2 * SIZE(AO), %xmm13 mulsd %xmm9, %xmm13 subsd %xmm13, %xmm11 movsd -3 * SIZE(AO), %xmm14 mulsd %xmm9, %xmm14 subsd %xmm14, %xmm8 movsd -4 * SIZE(AO), %xmm15 mulsd %xmm9, %xmm15 subsd %xmm15, %xmm10 movsd -6 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm11 movsd -7 * SIZE(AO), %xmm13 mulsd %xmm11, %xmm13 subsd %xmm13, %xmm8 movsd -8 * SIZE(AO), %xmm14 mulsd %xmm11, %xmm14 subsd %xmm14, %xmm10 movsd -11 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -12 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm10 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 unpcklpd %xmm8, %xmm10 unpcklpd %xmm9, %xmm11 #endif #ifdef LT movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movapd %xmm11, %xmm9 unpckhpd %xmm9, %xmm9 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 movsd -15 * SIZE(AO), %xmm13 mulsd %xmm10, %xmm13 subsd %xmm13, %xmm8 movsd -14 * SIZE(AO), %xmm14 mulsd %xmm10, %xmm14 subsd %xmm14, %xmm11 movsd -13 * SIZE(AO), %xmm15 mulsd %xmm10, %xmm15 subsd %xmm15, %xmm9 movsd -11 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -10 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm11 movsd -9 * SIZE(AO), %xmm14 mulsd %xmm8, %xmm14 subsd %xmm14, %xmm9 movsd -6 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm11 movsd -5 * SIZE(AO), %xmm13 mulsd %xmm11, %xmm13 subsd %xmm13, %xmm9 movsd -1 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm9 unpcklpd %xmm8, %xmm10 unpcklpd %xmm9, %xmm11 #endif #ifdef RN movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm10 mulpd %xmm8, %xmm11 #endif #ifdef RT movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm10 mulpd %xmm8, %xmm11 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) movsd %xmm11, 2 * SIZE(CO1) movhpd %xmm11, 3 * SIZE(CO1) #else movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) movsd %xmm11, 2 * SIZE(CO1) movhpd %xmm11, 3 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm10, -16 * SIZE(B) movapd %xmm11, -14 * SIZE(B) movddup %xmm10, %xmm8 SHUFPD_3 %xmm10, %xmm10 movddup %xmm11, %xmm9 SHUFPD_3 %xmm11, %xmm11 movapd %xmm8, -16 * SIZE(BO) movapd %xmm10, -14 * SIZE(BO) movapd %xmm9, -12 * SIZE(BO) movapd %xmm11, -10 * SIZE(BO) #else movapd %xmm10, -16 * SIZE(AO) movapd %xmm11, -14 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L91 ALIGN_4 .L100: testq $2, M je .L110 ALIGN_4 .L101: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), 
AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L105 ALIGN_4 .L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 movapd -12 * SIZE(AO), %xmm0 movapd -10 * SIZE(AO), %xmm1 movapd -12 * SIZE(BO), %xmm2 movapd -10 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm2, %xmm10 addpd %xmm3, %xmm11 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax jne .L102 ALIGN_4 .L105: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L109 ALIGN_4 .L106: movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm2, %xmm8 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax jg .L106 ALIGN_4 .L109: addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm10, %xmm8 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm10 subpd %xmm8, %xmm10 #else movapd -16 * SIZE(AO), %xmm10 subpd %xmm8, %xmm10 #endif #ifdef LN movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movsd -13 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -14 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm10 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 unpcklpd %xmm8, %xmm10 #endif #ifdef LT movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 movsd -15 * SIZE(AO), %xmm13 mulsd %xmm10, %xmm13 subsd %xmm13, %xmm8 movsd -13 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 unpcklpd %xmm8, %xmm10 #endif #ifdef RN movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm10 #endif #ifdef RT movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm10 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) #else movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm10, -16 * SIZE(B) movddup %xmm10, %xmm8 SHUFPD_3 %xmm10, %xmm10 movapd %xmm8, -16 * SIZE(BO) movapd %xmm10, -14 * SIZE(BO) #else movapd %xmm10, -16 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L110: testq $1, M je .L119 ALIGN_4 .L111: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L115 
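/* .L112 below: scalar multiply-accumulate loop over k (four elements per iteration) for the single-row, single-column tail of the N & 1 block; partial sums are kept in xmm8-xmm11 and combined at .L118 */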
ALIGN_4 .L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -16 * SIZE(AO), %xmm0 movsd -15 * SIZE(AO), %xmm1 movsd -16 * SIZE(BO), %xmm2 movsd -14 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm1, %xmm3 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 movsd -14 * SIZE(AO), %xmm0 movsd -13 * SIZE(AO), %xmm1 movsd -12 * SIZE(BO), %xmm2 movsd -10 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm1, %xmm3 addsd %xmm2, %xmm10 addsd %xmm3, %xmm11 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax jne .L112 ALIGN_4 .L115: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: movsd -16 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax jg .L116 ALIGN_4 .L118: addsd %xmm10, %xmm8 addsd %xmm11, %xmm9 addsd %xmm9, %xmm8 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(B), %xmm10 subsd %xmm8, %xmm10 #else movsd -16 * SIZE(AO), %xmm10 subsd %xmm8, %xmm10 #endif #ifdef LN movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 #endif #ifdef LT movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 #endif #ifdef RN movsd -16 * SIZE(B), %xmm8 mulsd %xmm8, %xmm10 #endif #ifdef RT movsd -16 * SIZE(B), %xmm8 mulsd %xmm8, %xmm10 #endif #ifdef LN subq $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm10, 0 * SIZE(CO1) #else movsd %xmm10, 0 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movsd %xmm10, -16 * SIZE(B) movlpd %xmm10, -16 * SIZE(BO) movlpd %xmm10, -15 * SIZE(BO) #else movsd %xmm10, -16 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $1 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L119: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 1), B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L40: testq $2, N je .L80 ALIGN_4 .L41: /* Copying to Sub Buffer */ #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG leaq (, %rax, SIZE), %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L43 ALIGN_4 .L42: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -12 * SIZE(B), %xmm4 movddup -11 * SIZE(B), %xmm5 movddup -10 * SIZE(B), %xmm6 movddup -9 * SIZE(B), %xmm7 movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) movapd %xmm2, 4 * SIZE(BO) movapd %xmm3, 6 * SIZE(BO) movapd %xmm4, 8 * SIZE(BO) movapd %xmm5, 10 * SIZE(BO) movapd %xmm6, 12 * SIZE(BO) movapd %xmm7, 14 * SIZE(BO) addq $8 * SIZE, B addq $16 * SIZE, BO subq $1, %rax jne .L42 ALIGN_4 .L43: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq 
KK, %rax #endif andq $3, %rax BRANCH jle .L50 ALIGN_4 .L44: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) addq $2 * SIZE, B addq $4 * SIZE, BO decq %rax jne .L44 ALIGN_4 .L50: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 2), C #endif movq M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 #ifdef LN prefetcht2 -3 * SIZE(CO1) pxor %xmm12, %xmm12 prefetcht2 -3 * SIZE(CO2) pxor %xmm13, %xmm13 #else prefetcht2 3 * SIZE(CO1) pxor %xmm12, %xmm12 prefetcht2 3 * SIZE(CO2) pxor %xmm13, %xmm13 #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L55 ALIGN_4 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -14 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 movapd -12 * SIZE(AO), %xmm0 movapd -10 * SIZE(AO), %xmm1 movapd -12 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -10 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 movapd -8 * SIZE(AO), %xmm0 movapd -6 * SIZE(AO), %xmm1 movapd -8 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -6 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 movapd -4 * SIZE(AO), %xmm0 movapd -2 * SIZE(AO), %xmm1 movapd -4 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -2 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 subq $-16 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L59 ALIGN_4 .L56: movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -14 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L56 ALIGN_4 .L59: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm12, %xmm4 unpcklpd %xmm13, %xmm12 unpckhpd %xmm13, %xmm4 movapd -16 * SIZE(B), %xmm9 movapd -14 * SIZE(B), 
%xmm13 movapd -12 * SIZE(B), %xmm1 movapd -10 * SIZE(B), %xmm5 subpd %xmm8, %xmm9 subpd %xmm0, %xmm13 subpd %xmm12, %xmm1 subpd %xmm4, %xmm5 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 subpd %xmm8, %xmm0 subpd %xmm12, %xmm1 subpd %xmm9, %xmm2 subpd %xmm13, %xmm3 #endif #ifdef LN movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 movddup -2 * SIZE(AO), %xmm10 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm1 movddup -3 * SIZE(AO), %xmm12 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm13 movddup -4 * SIZE(AO), %xmm14 mulpd %xmm5, %xmm14 subpd %xmm14, %xmm9 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 movddup -7 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm13 movddup -8 * SIZE(AO), %xmm12 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm9 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -12 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -14 * SIZE(AO), %xmm12 mulpd %xmm9, %xmm12 subpd %xmm12, %xmm1 movddup -13 * SIZE(AO), %xmm14 mulpd %xmm9, %xmm14 subpd %xmm14, %xmm5 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -10 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm1 movddup -9 * SIZE(AO), %xmm12 mulpd %xmm13, %xmm12 subpd %xmm12, %xmm5 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 movddup -5 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm5 movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 #endif #ifdef RN movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 movddup -15 * SIZE(B), %xmm9 movapd %xmm9, %xmm10 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm3 movddup -13 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 #endif #ifdef RT movddup -13 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -14 * SIZE(B), %xmm9 movapd %xmm9, %xmm10 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 mulpd %xmm3, %xmm10 subpd %xmm10, %xmm1 movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movsd %xmm5, 3 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movhpd %xmm1, 2 * SIZE(CO2) movhpd %xmm5, 3 * SIZE(CO2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movsd %xmm3, 2 * SIZE(CO2) movhpd %xmm3, 3 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movapd %xmm13, -14 * SIZE(B) movapd %xmm1, -12 * SIZE(B) movapd %xmm5, -10 * SIZE(B) movddup %xmm9, %xmm8 SHUFPD_3 %xmm9, %xmm9 movddup %xmm13, %xmm12 SHUFPD_3 %xmm13, %xmm13 movddup %xmm1, %xmm0 SHUFPD_3 %xmm1, %xmm1 movddup %xmm5, %xmm4 SHUFPD_3 %xmm5, %xmm5 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm12, -12 * SIZE(BO) movapd %xmm13, -10 * SIZE(BO) movapd %xmm0, -8 * SIZE(BO) movapd %xmm1, -6 * SIZE(BO) movapd %xmm4, -4 * SIZE(BO) movapd %xmm5, -2 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm1, -14 * SIZE(AO) movapd %xmm2, -12 * SIZE(AO) movapd %xmm3, -10 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, 
%rax, 4), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L51 ALIGN_4 .L60: testq $2, M je .L70 ALIGN_4 .L61: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L65 ALIGN_4 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 movapd -12 * SIZE(AO), %xmm0 movapd -10 * SIZE(AO), %xmm1 movapd -8 * SIZE(BO), %xmm2 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm4 movapd -2 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 subq $ -8 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jne .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L69 ALIGN_4 .L66: movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L66 ALIGN_4 .L69: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd -16 * SIZE(B), %xmm9 movapd -14 * SIZE(B), %xmm13 subpd %xmm8, %xmm9 subpd %xmm0, %xmm13 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm2 subpd %xmm8, %xmm0 subpd %xmm9, %xmm2 #endif #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -14 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 #endif #ifdef RN movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(B), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -13 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 #endif #ifdef RT movddup -13 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 movddup -14 * SIZE(B), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm2, 0 
* SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movapd %xmm13, -14 * SIZE(B) movddup %xmm9, %xmm8 SHUFPD_3 %xmm9, %xmm9 movddup %xmm13, %xmm12 SHUFPD_3 %xmm13, %xmm13 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm12, -12 * SIZE(BO) movapd %xmm13, -10 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm2, -14 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L70: testq $1, M je .L79 ALIGN_4 .L71: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -16 * SIZE(AO), %xmm0 movsd -15 * SIZE(AO), %xmm1 movsd -16 * SIZE(BO), %xmm2 movsd -14 * SIZE(BO), %xmm3 movsd -12 * SIZE(BO), %xmm4 movsd -10 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm1, %xmm4 mulsd %xmm1, %xmm5 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 movsd -14 * SIZE(AO), %xmm0 movsd -13 * SIZE(AO), %xmm1 movsd -8 * SIZE(BO), %xmm2 movsd -6 * SIZE(BO), %xmm3 movsd -4 * SIZE(BO), %xmm4 movsd -2 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm1, %xmm4 mulsd %xmm1, %xmm5 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 subq $ -4 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: movsd -16 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 movsd -14 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L76 ALIGN_4 .L78: addsd %xmm10, %xmm8 addsd %xmm11, %xmm9 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(B), %xmm12 movsd -15 * SIZE(B), %xmm13 #else movsd -16 * SIZE(AO), %xmm12 movsd -15 * SIZE(AO), %xmm13 #endif subsd %xmm8, %xmm12 subsd %xmm9, %xmm13 #ifdef LN movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm12 mulsd %xmm8, %xmm13 #endif #ifdef LT movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm12 mulsd %xmm8, %xmm13 #endif #ifdef RN mulsd -16 * SIZE(B), %xmm12 movsd -15 * SIZE(B), %xmm9 mulsd %xmm12, %xmm9 subsd %xmm9, %xmm13 mulsd -13 * SIZE(B), %xmm13 #endif #ifdef RT mulsd -13 * SIZE(B), %xmm13 movlpd -14 * SIZE(B), %xmm9 mulsd %xmm13, %xmm9 subsd %xmm9, %xmm12 mulsd -16 * SIZE(B), %xmm12 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm12, 0 * SIZE(CO1) movsd %xmm13, 0 * SIZE(CO2) #if defined(LN) || 
defined(LT) movsd %xmm12, -16 * SIZE(B) movsd %xmm13, -15 * SIZE(B) movsd %xmm12, -16 * SIZE(BO) movsd %xmm12, -15 * SIZE(BO) movsd %xmm13, -14 * SIZE(BO) movsd %xmm13, -13 * SIZE(BO) #else movsd %xmm12, -16 * SIZE(AO) movsd %xmm13, -15 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L79: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 2), B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L80: movq N, J sarq $2, J # j = (n >> 2) jle .L999 .L01: /* Copying to Sub Buffer */ #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq 16 * SIZE + BUFFER, BO #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG leaq (, %rax, SIZE), %rax leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L03 ALIGN_4 .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) movapd -16 * SIZE(B), %xmm0 movapd -14 * SIZE(B), %xmm1 movapd -12 * SIZE(B), %xmm2 movapd -10 * SIZE(B), %xmm3 movapd -8 * SIZE(B), %xmm4 movapd -6 * SIZE(B), %xmm5 movapd -4 * SIZE(B), %xmm6 movapd -2 * SIZE(B), %xmm7 prefetcht0 (PREFETCH_R + 8) * SIZE(B) movddup %xmm0, %xmm8 unpckhpd %xmm0, %xmm0 movddup %xmm1, %xmm9 unpckhpd %xmm1, %xmm1 movddup %xmm2, %xmm10 unpckhpd %xmm2, %xmm2 movddup %xmm3, %xmm11 unpckhpd %xmm3, %xmm3 movddup %xmm4, %xmm12 unpckhpd %xmm4, %xmm4 movddup %xmm5, %xmm13 unpckhpd %xmm5, %xmm5 movddup %xmm6, %xmm14 unpckhpd %xmm6, %xmm6 movddup %xmm7, %xmm15 unpckhpd %xmm7, %xmm7 prefetcht0 (PREFETCH_W + 0) * SIZE(BO) movapd %xmm8, -16 * SIZE(BO) movapd %xmm0, -14 * SIZE(BO) movapd %xmm9, -12 * SIZE(BO) movapd %xmm1, -10 * SIZE(BO) prefetcht0 (PREFETCH_W + 8) * SIZE(BO) movapd %xmm10, -8 * SIZE(BO) movapd %xmm2, -6 * SIZE(BO) movapd %xmm11, -4 * SIZE(BO) movapd %xmm3, -2 * SIZE(BO) prefetcht0 (PREFETCH_W + 16) * SIZE(BO) movapd %xmm12, 0 * SIZE(BO) movapd %xmm4, 2 * SIZE(BO) movapd %xmm13, 4 * SIZE(BO) movapd %xmm5, 6 * SIZE(BO) prefetcht0 (PREFETCH_W + 24) * SIZE(BO) movapd %xmm14, 8 * SIZE(BO) movapd %xmm6, 10 * SIZE(BO) movapd %xmm15, 12 * SIZE(BO) movapd %xmm7, 14 * SIZE(BO) subq $-16 * SIZE, B subq $-32 * SIZE, BO subq $1, %rax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movapd -16 * SIZE(B), %xmm0 movapd -14 * SIZE(B), %xmm1 movddup %xmm0, %xmm8 unpckhpd %xmm0, %xmm0 movddup %xmm1, %xmm9 unpckhpd %xmm1, %xmm1 movapd %xmm8, -16 * SIZE(BO) movapd %xmm0, -14 * SIZE(BO) movapd %xmm9, -12 * SIZE(BO) movapd %xmm1, -10 * SIZE(BO) addq $4 * SIZE, B addq $8 * SIZE, BO subq $1, %rax jne .L04 ALIGN_4 .L10: leaq (PREFETCH_R + 0) * SIZE(B), BB #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 4), C #endif movq M, I sarq $2, I # i = (m >> 2) jle .L20 ALIGN_4 .L11: 
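/* .L11 below: one 4x4 block of the main (N >> 2) loop: accumulate the 4x4 product of packed A and the duplicated B buffer in xmm8-xmm15, then apply the LN/LT/RN/RT triangular back-substitution and store the result to C and back to the packed buffers */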
#ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 prefetcht2 0 * SIZE(BB) #ifdef LN prefetcht2 -3 * SIZE(CO1) pxor %xmm12, %xmm12 prefetcht2 -3 * SIZE(CO2) pxor %xmm13, %xmm13 prefetcht2 -3 * SIZE(CO1, LDC, 2) pxor %xmm14, %xmm14 prefetcht2 -3 * SIZE(CO2, LDC, 2) pxor %xmm15, %xmm15 #else prefetcht2 3 * SIZE(CO1) pxor %xmm12, %xmm12 prefetcht2 3 * SIZE(CO2) pxor %xmm13, %xmm13 prefetcht2 3 * SIZE(CO1, LDC, 2) pxor %xmm14, %xmm14 prefetcht2 3 * SIZE(CO2, LDC, 2) pxor %xmm15, %xmm15 #endif pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 subq $-8 * SIZE, BB #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm2, %xmm10 movapd -16 * SIZE(AO), %xmm0 addpd %xmm3, %xmm14 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -14 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 movapd -14 * SIZE(BO), %xmm4 addpd %xmm5, %xmm15 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 movapd -12 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 movapd -10 * SIZE(BO), %xmm4 addpd %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 movapd -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm10 addpd %xmm3, %xmm14 movapd -8 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -10 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 addpd %xmm5, %xmm15 movapd -6 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 movapd -4 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 movapd -2 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) movapd -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm10 addpd %xmm3, %xmm14 movapd 0 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -6 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 addpd %xmm5, %xmm15 movapd 2 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 movapd 4 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 movapd 6 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 movapd -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm10 addpd %xmm3, %xmm14 movapd 8 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -2 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 addpd %xmm5, %xmm15 movapd 10 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 addq $32 * SIZE, BO mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 movapd -20 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 subq $-16 * SIZE, AO mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 movapd -18 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 subq $1, %rax mulpd %xmm1, %xmm5 BRANCH jg .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # 
if (k & 1) BRANCH je .L19 ALIGN_4 .L16: movapd -16 * SIZE(AO), %xmm0 addpd %xmm2, %xmm10 movapd -16 * SIZE(BO), %xmm2 addpd %xmm3, %xmm14 movapd %xmm2, %xmm3 movapd -14 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 movapd -14 * SIZE(BO), %xmm4 addpd %xmm5, %xmm15 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm8 movapd -12 * SIZE(BO), %xmm2 addpd %xmm3, %xmm12 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 movapd -10 * SIZE(BO), %xmm4 addpd %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 subq $1, %rax BRANCH jg .L16 ALIGN_4 .L19: addpd %xmm2, %xmm10 addpd %xmm3, %xmm14 addpd %xmm4, %xmm11 addpd %xmm5, %xmm15 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm10, %xmm2 unpcklpd %xmm11, %xmm10 unpckhpd %xmm11, %xmm2 movapd %xmm12, %xmm4 unpcklpd %xmm13, %xmm12 unpckhpd %xmm13, %xmm4 movapd %xmm14, %xmm6 unpcklpd %xmm15, %xmm14 unpckhpd %xmm15, %xmm6 movapd -16 * SIZE(B), %xmm9 movapd -14 * SIZE(B), %xmm11 movapd -12 * SIZE(B), %xmm13 movapd -10 * SIZE(B), %xmm15 movapd -8 * SIZE(B), %xmm1 movapd -6 * SIZE(B), %xmm3 movapd -4 * SIZE(B), %xmm5 movapd -2 * SIZE(B), %xmm7 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm0, %xmm13 subpd %xmm2, %xmm15 subpd %xmm12, %xmm1 subpd %xmm14, %xmm3 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 movapd -8 * SIZE(AO), %xmm4 movapd -6 * SIZE(AO), %xmm5 movapd -4 * SIZE(AO), %xmm6 movapd -2 * SIZE(AO), %xmm7 subpd %xmm8, %xmm0 subpd %xmm12, %xmm1 subpd %xmm9, %xmm2 subpd %xmm13, %xmm3 subpd %xmm10, %xmm4 subpd %xmm14, %xmm5 subpd %xmm11, %xmm6 subpd %xmm15, %xmm7 #endif #ifdef LN movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 movddup -2 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm1 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm3 movddup -3 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm15 movddup -4 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm11 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 movddup -7 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm15 movddup -8 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm11 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -12 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm11 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 movddup -15 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm15 movddup -14 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm1 mulpd %xmm11, %xmm12 subpd 
%xmm12, %xmm3 movddup -13 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm5 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm7 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -10 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm1 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm3 movddup -9 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm5 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm7 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 movddup -5 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm5 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm7 movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 #endif #ifdef RN movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 movddup -15 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm2 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm3 movddup -14 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm4 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm5 movddup -13 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm6 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm7 movddup -11 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -10 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm4 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm5 movddup -9 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm6 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm7 movddup -6 * SIZE(B), %xmm8 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm5 movddup -5 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm6 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm7 movddup -1 * SIZE(B), %xmm8 mulpd %xmm8, %xmm6 mulpd %xmm8, %xmm7 #endif #ifdef RT movddup -1 * SIZE(B), %xmm8 mulpd %xmm8, %xmm6 mulpd %xmm8, %xmm7 movddup -2 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm4 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm5 movddup -3 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm2 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm3 movddup -4 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm0 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm1 movddup -6 * SIZE(B), %xmm8 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm5 movddup -7 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm2 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm3 movddup -8 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm0 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm1 movddup -11 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -12 * SIZE(B), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm0 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm1 movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movsd %xmm5, 3 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movhpd %xmm1, 2 * SIZE(CO2) movhpd %xmm5, 3 * SIZE(CO2) movsd %xmm11, 0 * SIZE(CO1, LDC, 2) movsd %xmm15, 1 * SIZE(CO1, LDC, 2) movsd %xmm3, 2 * SIZE(CO1, LDC, 2) movsd %xmm7, 3 * SIZE(CO1, LDC, 2) movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * 
SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movsd %xmm3, 2 * SIZE(CO2) movhpd %xmm3, 3 * SIZE(CO2) movsd %xmm4, 0 * SIZE(CO1, LDC, 2) movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) movsd %xmm5, 2 * SIZE(CO1, LDC, 2) movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) movsd %xmm6, 0 * SIZE(CO2, LDC, 2) movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) movsd %xmm7, 2 * SIZE(CO2, LDC, 2) movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movapd %xmm11, -14 * SIZE(B) movapd %xmm13, -12 * SIZE(B) movapd %xmm15, -10 * SIZE(B) movapd %xmm1, -8 * SIZE(B) movapd %xmm3, -6 * SIZE(B) movapd %xmm5, -4 * SIZE(B) movapd %xmm7, -2 * SIZE(B) movddup %xmm9, %xmm8 SHUFPD_3 %xmm9, %xmm9 movddup %xmm11, %xmm10 SHUFPD_3 %xmm11, %xmm11 movddup %xmm13, %xmm12 SHUFPD_3 %xmm13, %xmm13 movddup %xmm15, %xmm14 SHUFPD_3 %xmm15, %xmm15 movddup %xmm1, %xmm0 SHUFPD_3 %xmm1, %xmm1 movddup %xmm3, %xmm2 SHUFPD_3 %xmm3, %xmm3 movddup %xmm5, %xmm4 SHUFPD_3 %xmm5, %xmm5 movddup %xmm7, %xmm6 SHUFPD_3 %xmm7, %xmm7 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm10, -12 * SIZE(BO) movapd %xmm11, -10 * SIZE(BO) movapd %xmm12, -8 * SIZE(BO) movapd %xmm13, -6 * SIZE(BO) movapd %xmm14, -4 * SIZE(BO) movapd %xmm15, -2 * SIZE(BO) movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) movapd %xmm2, 4 * SIZE(BO) movapd %xmm3, 6 * SIZE(BO) movapd %xmm4, 8 * SIZE(BO) movapd %xmm5, 10 * SIZE(BO) movapd %xmm6, 12 * SIZE(BO) movapd %xmm7, 14 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm1, -14 * SIZE(AO) movapd %xmm2, -12 * SIZE(AO) movapd %xmm3, -10 * SIZE(AO) movapd %xmm4, -8 * SIZE(AO) movapd %xmm5, -6 * SIZE(AO) movapd %xmm6, -4 * SIZE(AO) movapd %xmm7, -2 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $16 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L11 ALIGN_4 .L20: testq $3, M je .L39 testq $2, M je .L30 ALIGN_4 .L21: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 movapd -14 * SIZE(AO), %xmm0 movapd -8 * SIZE(BO), %xmm2 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm4 movapd -2 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 movapd -12 * SIZE(AO), %xmm0 movapd 0 * SIZE(BO), %xmm2 movapd 2 * SIZE(BO), %xmm3 movapd 4 * SIZE(BO), %xmm4 movapd 6 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 
mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 movapd -10 * SIZE(AO), %xmm0 movapd 8 * SIZE(BO), %xmm2 movapd 10 * SIZE(BO), %xmm3 movapd 12 * SIZE(BO), %xmm4 movapd 14 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 subq $ -8 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L29 ALIGN_4 .L26: movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax jne .L26 ALIGN_4 .L29: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm10, %xmm2 unpcklpd %xmm11, %xmm10 unpckhpd %xmm11, %xmm2 movapd -16 * SIZE(B), %xmm9 movapd -14 * SIZE(B), %xmm11 movapd -12 * SIZE(B), %xmm13 movapd -10 * SIZE(B), %xmm15 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm0, %xmm13 subpd %xmm2, %xmm15 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm2 movapd -12 * SIZE(AO), %xmm4 movapd -10 * SIZE(AO), %xmm6 subpd %xmm8, %xmm0 subpd %xmm9, %xmm2 subpd %xmm10, %xmm4 subpd %xmm11, %xmm6 #endif #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -14 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm11 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 movddup -15 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm15 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 #endif #ifdef RN movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(B), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -14 * SIZE(B), %xmm10 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm4 movddup -13 * SIZE(B), %xmm11 mulpd %xmm0, %xmm11 subpd %xmm11, %xmm6 movddup -11 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 movddup -10 * SIZE(B), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm4 movddup -9 * SIZE(B), %xmm10 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm6 movddup -6 * SIZE(B), %xmm8 mulpd %xmm8, %xmm4 movddup -5 * SIZE(B), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm6 movddup -1 * SIZE(B), %xmm8 mulpd %xmm8, %xmm6 #endif #ifdef RT movddup -1 * SIZE(B), %xmm8 mulpd %xmm8, %xmm6 movddup -2 * SIZE(B), %xmm9 mulpd %xmm6, %xmm9 subpd %xmm9, %xmm4 movddup -3 * SIZE(B), %xmm10 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm2 movddup -4 * SIZE(B), %xmm11 mulpd %xmm6, %xmm11 subpd %xmm11, %xmm0 movddup -6 * SIZE(B), %xmm8 mulpd %xmm8, %xmm4 movddup -7 * SIZE(B), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm2 movddup -8 * SIZE(B), %xmm10 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm0 movddup -11 * SIZE(B), %xmm8 mulpd %xmm8, %xmm2 movddup -12 * SIZE(B), %xmm9 mulpd %xmm2, 
%xmm9 subpd %xmm9, %xmm0 movddup -16 * SIZE(B), %xmm8 mulpd %xmm8, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movsd %xmm11, 0 * SIZE(CO1, LDC, 2) movsd %xmm15, 1 * SIZE(CO1, LDC, 2) movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movsd %xmm4, 0 * SIZE(CO1, LDC, 2) movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) movsd %xmm6, 0 * SIZE(CO2, LDC, 2) movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movapd %xmm11, -14 * SIZE(B) movapd %xmm13, -12 * SIZE(B) movapd %xmm15, -10 * SIZE(B) movddup %xmm9, %xmm8 SHUFPD_3 %xmm9, %xmm9 movddup %xmm11, %xmm10 SHUFPD_3 %xmm11, %xmm11 movddup %xmm13, %xmm12 SHUFPD_3 %xmm13, %xmm13 movddup %xmm15, %xmm14 SHUFPD_3 %xmm15, %xmm15 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm10, -12 * SIZE(BO) movapd %xmm11, -10 * SIZE(BO) movapd %xmm12, -8 * SIZE(BO) movapd %xmm13, -6 * SIZE(BO) movapd %xmm14, -4 * SIZE(BO) movapd %xmm15, -2 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm2, -14 * SIZE(AO) movapd %xmm4, -12 * SIZE(AO) movapd %xmm6, -10 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: testq $1, M je .L39 ALIGN_4 .L31: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -16 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 movsd -14 * SIZE(BO), %xmm3 movsd -12 * SIZE(BO), %xmm4 movsd -10 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 movsd -15 * SIZE(AO), %xmm0 movsd -8 * SIZE(BO), %xmm2 movsd -6 * SIZE(BO), %xmm3 movsd -4 * SIZE(BO), %xmm4 movsd -2 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 movsd -14 * SIZE(AO), %xmm0 movsd 0 * SIZE(BO), %xmm2 movsd 2 * SIZE(BO), %xmm3 movsd 4 * SIZE(BO), %xmm4 movsd 6 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 movsd -13 * SIZE(AO), %xmm0 movsd 8 * SIZE(BO), %xmm2 movsd 10 * SIZE(BO), %xmm3 movsd 12 * SIZE(BO), %xmm4 movsd 14 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 subq $ -4 * SIZE, AO subq $-32 * SIZE, BO 
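/* bottom of the unrolled-by-4 k-loop (.L32) for the single-row (M & 1)
   micro-tile; leftover k iterations are handled by the .L36 loop below */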
subq $1, %rax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: movsd -16 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 movsd -14 * SIZE(BO), %xmm3 movsd -12 * SIZE(BO), %xmm4 movsd -10 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm3 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 addq $1 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movsd -16 * SIZE(B), %xmm12 movsd -15 * SIZE(B), %xmm13 movsd -14 * SIZE(B), %xmm14 movsd -13 * SIZE(B), %xmm15 #else movsd -16 * SIZE(AO), %xmm12 movsd -15 * SIZE(AO), %xmm13 movsd -14 * SIZE(AO), %xmm14 movsd -13 * SIZE(AO), %xmm15 #endif subsd %xmm8, %xmm12 subsd %xmm9, %xmm13 subsd %xmm10, %xmm14 subsd %xmm11, %xmm15 #ifdef LN movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm12 mulsd %xmm8, %xmm13 mulsd %xmm8, %xmm14 mulsd %xmm8, %xmm15 #endif #ifdef LT movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm12 mulsd %xmm8, %xmm13 mulsd %xmm8, %xmm14 mulsd %xmm8, %xmm15 #endif #ifdef RN mulsd -16 * SIZE(B), %xmm12 movlpd -15 * SIZE(B), %xmm9 mulsd %xmm12, %xmm9 subsd %xmm9, %xmm13 movlpd -14 * SIZE(B), %xmm10 mulsd %xmm12, %xmm10 subsd %xmm10, %xmm14 movlpd -13 * SIZE(B), %xmm11 mulsd %xmm12, %xmm11 subsd %xmm11, %xmm15 mulsd -11 * SIZE(B), %xmm13 movlpd -10 * SIZE(B), %xmm9 mulsd %xmm13, %xmm9 subsd %xmm9, %xmm14 movlpd -9 * SIZE(B), %xmm10 mulsd %xmm13, %xmm10 subsd %xmm10, %xmm15 mulsd -6 * SIZE(B), %xmm14 movlpd -5 * SIZE(B), %xmm9 mulsd %xmm14, %xmm9 subsd %xmm9, %xmm15 mulsd -1 * SIZE(B), %xmm15 #endif #ifdef RT mulsd -1 * SIZE(B), %xmm15 movlpd -2 * SIZE(B), %xmm9 mulsd %xmm15, %xmm9 subsd %xmm9, %xmm14 movlpd -3 * SIZE(B), %xmm10 mulsd %xmm15, %xmm10 subsd %xmm10, %xmm13 movlpd -4 * SIZE(B), %xmm11 mulsd %xmm15, %xmm11 subsd %xmm11, %xmm12 mulsd -6 * SIZE(B), %xmm14 movlpd -7 * SIZE(B), %xmm9 mulsd %xmm14, %xmm9 subsd %xmm9, %xmm13 movlpd -8 * SIZE(B), %xmm10 mulsd %xmm14, %xmm10 subsd %xmm10, %xmm12 mulsd -11 * SIZE(B), %xmm13 movlpd -12 * SIZE(B), %xmm9 mulsd %xmm13, %xmm9 subsd %xmm9, %xmm12 mulsd -16 * SIZE(B), %xmm12 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm12, 0 * SIZE(CO1) movsd %xmm13, 0 * SIZE(CO2) movsd %xmm14, 0 * SIZE(CO1, LDC, 2) movsd %xmm15, 0 * SIZE(CO2, LDC, 2) #if defined(LN) || defined(LT) movsd %xmm12, -16 * SIZE(B) movsd %xmm13, -15 * SIZE(B) movsd %xmm14, -14 * SIZE(B) movsd %xmm15, -13 * SIZE(B) movsd %xmm12, -16 * SIZE(BO) movsd %xmm12, -15 * SIZE(BO) movsd %xmm13, -14 * SIZE(BO) movsd %xmm13, -13 * SIZE(BO) movsd %xmm14, -12 * SIZE(BO) movsd %xmm14, -11 * SIZE(BO) movsd %xmm15, -10 * SIZE(BO) movsd %xmm15, -9 * SIZE(BO) #else movsd %xmm12, -16 * SIZE(AO) movsd %xmm13, -15 * SIZE(AO) movsd %xmm14, -14 * SIZE(AO) movsd %xmm15, -13 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L39: 
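/* .L39: end of one 4-wide column panel.  Advance B past the packed panel
   (4*K elements for LN, 4*(K-KK) for LT/RN), adjust KK by the panel width
   (RN: +4, RT: -4), and decrement J to start the next panel at .L01.
   .L999 restores the callee-saved registers (plus rdi/rsi and xmm6-xmm15
   under WINDOWS_ABI) and returns. */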
#ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 4), B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif decq J # j -- jg .L01 ALIGN_4 .L999: movq %r15, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_RT_4x4_penryn.S000066400000000000000000001663331313527062700225460ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define KK %rdx #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCH_R (8 * 4 + 0) #define PREFETCHSIZE (8 * 21 + 6) #define PREFETCH prefetcht0 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif movq OLD_M, M movq OLD_N, N movq OLD_K, K movq OLD_LDC, LDC movq OLD_OFFSET, KK subq $-16 * SIZE, A subq $-16 * SIZE, B leaq (, LDC, SIZE), LDC movq KK, OFFSET negq KK #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif testq $1, N BRANCH jle .L40 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I # i = (m >> 2) NOBRANCH jle .L100 ALIGN_4 .L91: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 movsd -16 * SIZE(BO), %xmm2 #ifdef LN prefetcht0 -4 * SIZE(CO1) #else prefetcht0 3 * SIZE(CO1) #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L95 ALIGN_4 .L92: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -15 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps -10 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -14 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps -6 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -13 * SIZE(BO), 
%xmm2 mulpd %xmm0, %xmm3 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps -2 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -12 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps 2 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 subq $-16 * SIZE, AO subq $ -4 * SIZE, BO subq $1, %rax BRANCH jg .L92 ALIGN_4 .L95: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -15 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps -10 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 addq $4 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L96 ALIGN_4 .L98: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm10 movapd -14 * SIZE(BO), %xmm11 subpd %xmm8, %xmm10 subpd %xmm12, %xmm11 #else movapd -16 * SIZE(AO), %xmm10 movapd -14 * SIZE(AO), %xmm11 subpd %xmm8, %xmm10 subpd %xmm12, %xmm11 #endif #ifdef LN movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movapd %xmm11, %xmm9 unpckhpd %xmm9, %xmm9 movsd -1 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm9 movsd -2 * SIZE(AO), %xmm13 mulsd %xmm9, %xmm13 subsd %xmm13, %xmm11 movsd -3 * SIZE(AO), %xmm14 mulsd %xmm9, %xmm14 subsd %xmm14, %xmm8 movsd -4 * SIZE(AO), %xmm15 mulsd %xmm9, %xmm15 subsd %xmm15, %xmm10 movsd -6 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm11 movsd -7 * SIZE(AO), %xmm13 mulsd %xmm11, %xmm13 subsd %xmm13, %xmm8 movsd -8 * SIZE(AO), %xmm14 mulsd %xmm11, %xmm14 subsd %xmm14, %xmm10 movsd -11 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -12 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm10 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 unpcklpd %xmm8, %xmm10 unpcklpd %xmm9, %xmm11 #endif #ifdef LT movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movapd %xmm11, %xmm9 unpckhpd %xmm9, %xmm9 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 movsd -15 * SIZE(AO), %xmm13 mulsd %xmm10, %xmm13 subsd %xmm13, %xmm8 movsd -14 * SIZE(AO), %xmm14 mulsd %xmm10, %xmm14 subsd %xmm14, %xmm11 movsd -13 * SIZE(AO), %xmm15 mulsd %xmm10, %xmm15 subsd %xmm15, %xmm9 movsd -11 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -10 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm11 movsd -9 * SIZE(AO), %xmm14 mulsd %xmm8, %xmm14 subsd %xmm14, %xmm9 movsd -6 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm11 movsd -5 * SIZE(AO), %xmm13 mulsd %xmm11, %xmm13 subsd %xmm13, %xmm9 movsd -1 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm9 unpcklpd %xmm8, %xmm10 unpcklpd %xmm9, %xmm11 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 mulpd %xmm8, %xmm11 #endif #ifdef RT movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 mulpd %xmm8, %xmm11 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) movsd %xmm11, 2 * SIZE(CO1) movhpd %xmm11, 3 * SIZE(CO1) #else movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) movsd %xmm11, 2 * SIZE(CO1) movhpd %xmm11, 3 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm10, -16 * SIZE(BO) movapd %xmm11, -14 * SIZE(BO) #else movapd %xmm10, -16 * SIZE(AO) movapd %xmm11, -14 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq 
K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L91 ALIGN_4 .L100: testq $2, M BRANCH jle .L110 ALIGN_4 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movsd -16 * SIZE(BO), %xmm2 pxor %xmm9, %xmm9 movhps -15 * SIZE(BO), %xmm2 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L105 ALIGN_4 .L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 movsd -15 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -14 * SIZE(AO), %xmm0 addpd %xmm3, %xmm8 pshufd $0x44, %xmm2, %xmm3 movsd -14 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -12 * SIZE(AO), %xmm0 addpd %xmm3, %xmm9 pshufd $0x44, %xmm2, %xmm3 movsd -13 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -10 * SIZE(AO), %xmm0 addpd %xmm3, %xmm8 pshufd $0x44, %xmm2, %xmm3 movsd -12 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -8 * SIZE(AO), %xmm0 addpd %xmm3, %xmm9 subq $-8 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L102 ALIGN_4 .L105: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L108 ALIGN_4 .L106: pshufd $0x44, %xmm2, %xmm3 movsd -15 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -14 * SIZE(AO), %xmm0 addpd %xmm3, %xmm8 addq $2 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L106 ALIGN_4 .L108: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif addpd %xmm9, %xmm8 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm10 subpd %xmm8, %xmm10 #else movapd -16 * SIZE(AO), %xmm10 subpd %xmm8, %xmm10 #endif #ifdef LN movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movsd -13 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 movsd -14 * SIZE(AO), %xmm13 mulsd %xmm8, %xmm13 subsd %xmm13, %xmm10 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 unpcklpd %xmm8, %xmm10 #endif #ifdef LT movapd %xmm10, %xmm8 unpckhpd %xmm8, %xmm8 movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 movsd -15 * SIZE(AO), %xmm13 mulsd %xmm10, %xmm13 subsd %xmm13, %xmm8 movsd -13 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm8 unpcklpd %xmm8, %xmm10 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 #endif #ifdef RT movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm10 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) #else movsd %xmm10, 0 * SIZE(CO1) movhpd %xmm10, 1 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm10, -16 * SIZE(BO) #else movapd %xmm10, -16 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L110: testq $1, M BRANCH jle .L119 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, 
%rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movsd -16 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L115 ALIGN_4 .L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -15 * SIZE(AO), %xmm0 movsd -15 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -14 * SIZE(AO), %xmm0 movsd -14 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -13 * SIZE(AO), %xmm0 movsd -13 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -12 * SIZE(AO), %xmm0 movsd -12 * SIZE(BO), %xmm2 subq $-4 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L112 ALIGN_4 .L115: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -15 * SIZE(AO), %xmm0 movsd -15 * SIZE(BO), %xmm2 addq $1 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L116 ALIGN_4 .L118: #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif addpd %xmm9, %xmm8 #if defined(LN) || defined(LT) movsd -16 * SIZE(BO), %xmm10 subsd %xmm8, %xmm10 #else movsd -16 * SIZE(AO), %xmm10 subsd %xmm8, %xmm10 #endif #ifdef LN movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 #endif #ifdef LT movsd -16 * SIZE(AO), %xmm12 mulsd %xmm12, %xmm10 #endif #ifdef RN movsd -16 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm10 #endif #ifdef RT movsd -16 * SIZE(BO), %xmm8 mulsd %xmm8, %xmm10 #endif #ifdef LN subq $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm10, 0 * SIZE(CO1) #else movsd %xmm10, 0 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movsd %xmm10, -16 * SIZE(BO) #else movsd %xmm10, -16 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L119: #ifdef LN leaq (B, K, SIZE), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L40: testq $2, N BRANCH jle .L80 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 1, %rax movq B, BB subq %rax, BB #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I # i = (m >> 2) NOBRANCH jle .L60 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #else movq B, BO #endif prefetcht2 -16 * SIZE(BB) subq $-4 * SIZE, BB movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 movaps -16 * SIZE(BO), %xmm2 #ifdef LN prefetcht0 -4 * SIZE(CO1) pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 prefetcht0 -4 * SIZE(CO2) pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 #else prefetcht0 3 * SIZE(CO1) pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 prefetcht0 3 * SIZE(CO2) pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 #endif 
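/* 4x2 micro-tile of the 2-wide column panel: the k-loop below (.L52,
   unrolled by 4, remainder at .L56) accumulates the update into
   xmm8/xmm9/xmm12/xmm13 before the LN/LT/RN/RT triangular solve. */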
#if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_4 .L52: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -14 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -6 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -12 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -2 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -10 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 2 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -8 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax BRANCH jg .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -14 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_4 .L58: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif movapd %xmm8, %xmm0 movsd %xmm9, %xmm8 movsd %xmm0, %xmm9 movapd %xmm12, %xmm0 movsd %xmm13, %xmm12 movsd %xmm0, %xmm13 #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm12, %xmm4 unpcklpd %xmm13, %xmm12 unpckhpd %xmm13, %xmm4 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm13 movapd -12 * SIZE(BO), %xmm1 movapd -10 * SIZE(BO), %xmm5 subpd %xmm8, %xmm9 subpd %xmm0, %xmm13 subpd %xmm12, %xmm1 subpd %xmm4, %xmm5 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 subpd %xmm8, %xmm0 subpd %xmm12, %xmm1 subpd %xmm9, %xmm2 subpd %xmm13, %xmm3 #endif #ifdef LN movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 movddup -2 * SIZE(AO), %xmm10 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm1 movddup -3 * SIZE(AO), %xmm12 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm13 movddup -4 * SIZE(AO), %xmm14 mulpd %xmm5, %xmm14 subpd %xmm14, %xmm9 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 movddup -7 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm13 movddup -8 * SIZE(AO), %xmm12 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm9 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -12 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 
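/* end of the LN solve for this 4x2 tile (diagonal walked from -1*SIZE
   back to -16*SIZE); the LT, RN and RT variants follow */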
#endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -14 * SIZE(AO), %xmm12 mulpd %xmm9, %xmm12 subpd %xmm12, %xmm1 movddup -13 * SIZE(AO), %xmm14 mulpd %xmm9, %xmm14 subpd %xmm14, %xmm5 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -10 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm1 movddup -9 * SIZE(AO), %xmm12 mulpd %xmm13, %xmm12 subpd %xmm12, %xmm5 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 movddup -5 * SIZE(AO), %xmm10 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm5 movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 movddup -15 * SIZE(BO), %xmm9 movapd %xmm9, %xmm10 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm3 movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 #endif #ifdef RT movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -14 * SIZE(BO), %xmm9 movapd %xmm9, %xmm10 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 mulpd %xmm3, %xmm10 subpd %xmm10, %xmm1 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movsd %xmm5, 3 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movhpd %xmm1, 2 * SIZE(CO2) movhpd %xmm5, 3 * SIZE(CO2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movsd %xmm3, 2 * SIZE(CO2) movhpd %xmm3, 3 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm13, -14 * SIZE(BO) movapd %xmm1, -12 * SIZE(BO) movapd %xmm5, -10 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm1, -14 * SIZE(AO) movapd %xmm2, -12 * SIZE(AO) movapd %xmm3, -10 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L51 ALIGN_4 .L60: testq $2, M BRANCH jle .L70 ALIGN_4 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 movaps -16 * SIZE(BO), %xmm2 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_4 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 addpd %xmm7, %xmm8 movaps -14 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 addpd %xmm2, %xmm11 addpd %xmm7, %xmm10 movaps -12 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -10 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 addpd %xmm7, %xmm8 movaps -10 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -8 * SIZE(AO), %xmm0 addpd %xmm2, 
%xmm11 addpd %xmm7, %xmm10 movaps -8 * SIZE(BO), %xmm2 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 addpd %xmm7, %xmm8 movaps -14 * SIZE(BO), %xmm2 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_4 .L68: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 movapd %xmm8, %xmm0 movsd %xmm9, %xmm8 movsd %xmm0, %xmm9 #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm13 subpd %xmm8, %xmm9 subpd %xmm0, %xmm13 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm2 subpd %xmm8, %xmm0 subpd %xmm9, %xmm2 #endif #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 movddup -14 * SIZE(AO), %xmm10 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 movddup -15 * SIZE(AO), %xmm10 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 subpd %xmm9, %xmm2 movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 #endif #ifdef RT movddup -13 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 movddup -14 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm13, -14 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm2, -14 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L70: testq $1, M BRANCH jle .L79 ALIGN_4 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movsd -16 * SIZE(AO), %xmm0 movaps -16 * SIZE(BO), %xmm2 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -15 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -14 * SIZE(BO), %xmm2 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 movaps -12 * SIZE(BO), %xmm2 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -13 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -10 * 
SIZE(BO), %xmm2 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -12 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 movaps -8 * SIZE(BO), %xmm2 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -15 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -14 * SIZE(BO), %xmm2 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L76 ALIGN_4 .L78: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif addpd %xmm9, %xmm8 movhlps %xmm8, %xmm9 #if defined(LN) || defined(LT) movsd -16 * SIZE(BO), %xmm12 movsd -15 * SIZE(BO), %xmm13 #else movsd -16 * SIZE(AO), %xmm12 movsd -15 * SIZE(AO), %xmm13 #endif subsd %xmm8, %xmm12 subsd %xmm9, %xmm13 #ifdef LN movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm12 mulsd %xmm8, %xmm13 #endif #ifdef LT movsd -16 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm12 mulsd %xmm8, %xmm13 #endif #ifdef RN mulsd -16 * SIZE(BO), %xmm12 movsd -15 * SIZE(BO), %xmm9 mulsd %xmm12, %xmm9 subsd %xmm9, %xmm13 mulsd -13 * SIZE(BO), %xmm13 #endif #ifdef RT mulsd -13 * SIZE(BO), %xmm13 movlpd -14 * SIZE(BO), %xmm9 mulsd %xmm13, %xmm9 subsd %xmm9, %xmm12 mulsd -16 * SIZE(BO), %xmm12 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm12, 0 * SIZE(CO1) movsd %xmm13, 0 * SIZE(CO2) #if defined(LN) || defined(LT) movsd %xmm12, -16 * SIZE(BO) movsd %xmm13, -15 * SIZE(BO) #else movsd %xmm12, -16 * SIZE(AO) movsd %xmm13, -15 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L79: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L80: movq N, J sarq $2, J NOBRANCH jle .L999 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 4), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 2, %rax movq B, BB subq %rax, BB #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I # i = (m >> 2) NOBRANCH jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #else movq B, BO #endif prefetcht2 -16 * SIZE(BB) subq $-8 * SIZE, BB movaps -16 * SIZE(AO), %xmm0 pxor %xmm3, %xmm3 movaps -14 * SIZE(AO), %xmm1 pxor %xmm4, %xmm4 movaps -16 * SIZE(BO), %xmm2 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 #ifdef LN prefetcht0 -4 * SIZE(CO1) movapd %xmm4, %xmm8 movapd %xmm4, %xmm9 prefetcht0 -4 * SIZE(CO2) movapd %xmm4, %xmm10 movapd %xmm4, %xmm11 prefetcht0 -4 * SIZE(CO1, LDC, 2) movapd %xmm4, %xmm12 movapd %xmm4, %xmm13 prefetcht0 -4 * SIZE(CO2, LDC, 2) movapd %xmm4, %xmm14 movapd %xmm4, %xmm15 #else prefetcht0 3 * SIZE(CO1) movapd 
%xmm4, %xmm8 movapd %xmm4, %xmm9 prefetcht0 3 * SIZE(CO2) movapd %xmm4, %xmm10 movapd %xmm4, %xmm11 prefetcht0 3 * SIZE(CO1, LDC, 2) movapd %xmm4, %xmm12 movapd %xmm4, %xmm13 prefetcht0 3 * SIZE(CO2, LDC, 2) movapd %xmm4, %xmm14 movapd %xmm4, %xmm15 #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax NOBRANCH jle .L15 ALIGN_3 .L12: addpd %xmm3, %xmm11 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps -12 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps -10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps -8 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -6 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps -6 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps -4 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -2 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps -2 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps 0 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 2 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps 2 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps 4 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 6 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps 6 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 
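/* second half of the 8-way unrolled k-loop body (.L12) for the 4x4 tile;
   AO and BO are advanced by 32*SIZE at the bottom of the loop */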
movaps 8 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 10 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps 10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps 12 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 14 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps 14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps 16 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 subq $-32 * SIZE, AO addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -16 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -14 * SIZE(AO), %xmm1 subq $-32 * SIZE, BO subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addpd %xmm3, %xmm11 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps -12 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: #if defined(LN) || defined(RT) movq KK, %rax subq $4, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif addpd %xmm3, %xmm11 addpd %xmm4, %xmm15 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movapd %xmm8, %xmm0 movsd %xmm9, %xmm8 movsd %xmm0, %xmm9 movapd %xmm10, %xmm0 movsd %xmm11, %xmm10 movsd %xmm0, %xmm11 movapd %xmm12, %xmm0 movsd %xmm13, %xmm12 movsd %xmm0, %xmm13 movapd %xmm14, %xmm0 movsd %xmm15, %xmm14 movsd %xmm0, %xmm15 #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm10, %xmm2 unpcklpd %xmm11, %xmm10 unpckhpd %xmm11, %xmm2 movapd %xmm12, %xmm4 unpcklpd %xmm13, %xmm12 unpckhpd %xmm13, %xmm4 movapd %xmm14, %xmm6 unpcklpd %xmm15, %xmm14 unpckhpd %xmm15, %xmm6 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 movapd -12 * SIZE(BO), %xmm13 movapd -10 * SIZE(BO), %xmm15 movapd -8 * SIZE(BO), %xmm1 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm5 movapd -2 * SIZE(BO), %xmm7 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm0, %xmm13 subpd %xmm2, %xmm15 subpd %xmm12, %xmm1 subpd %xmm14, %xmm3 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -12 * 
SIZE(AO), %xmm2 movapd -10 * SIZE(AO), %xmm3 movapd -8 * SIZE(AO), %xmm4 movapd -6 * SIZE(AO), %xmm5 movapd -4 * SIZE(AO), %xmm6 movapd -2 * SIZE(AO), %xmm7 subpd %xmm8, %xmm0 subpd %xmm12, %xmm1 subpd %xmm9, %xmm2 subpd %xmm13, %xmm3 subpd %xmm10, %xmm4 subpd %xmm14, %xmm5 subpd %xmm11, %xmm6 subpd %xmm15, %xmm7 #endif #ifdef LN movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 movddup -2 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm1 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm3 movddup -3 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm15 movddup -4 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm5, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm11 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 movddup -7 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm15 movddup -8 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm11 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -12 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm11 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 movddup -15 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm15 movddup -14 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm1 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm3 movddup -13 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm5 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm7 movddup -11 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -10 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm1 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm3 movddup -9 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm5 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm7 movddup -6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm1 mulpd %xmm8, %xmm3 movddup -5 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm1, %xmm10 subpd %xmm10, %xmm5 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm7 movddup -1 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm5 mulpd %xmm8, %xmm7 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 movddup -15 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm2 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm3 movddup -14 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm4 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm5 movddup -13 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm6 mulpd %xmm1, %xmm12 subpd %xmm12, %xmm7 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -10 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm4 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm5 movddup -9 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm6 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm7 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm5 movddup -5 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm6 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm7 movddup -1 * SIZE(BO), %xmm8 mulpd 
%xmm8, %xmm6 mulpd %xmm8, %xmm7 #endif #ifdef RT movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 mulpd %xmm8, %xmm7 movddup -2 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm4 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm5 movddup -3 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm2 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm3 movddup -4 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm0 mulpd %xmm7, %xmm12 subpd %xmm12, %xmm1 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 mulpd %xmm8, %xmm5 movddup -7 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm2 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm3 movddup -8 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm0 mulpd %xmm5, %xmm12 subpd %xmm12, %xmm1 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 mulpd %xmm8, %xmm3 movddup -12 * SIZE(BO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm0 mulpd %xmm3, %xmm12 subpd %xmm12, %xmm1 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 mulpd %xmm8, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movsd %xmm5, 3 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movhpd %xmm1, 2 * SIZE(CO2) movhpd %xmm5, 3 * SIZE(CO2) movsd %xmm11, 0 * SIZE(CO1, LDC, 2) movsd %xmm15, 1 * SIZE(CO1, LDC, 2) movsd %xmm3, 2 * SIZE(CO1, LDC, 2) movsd %xmm7, 3 * SIZE(CO1, LDC, 2) movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movsd %xmm3, 2 * SIZE(CO2) movhpd %xmm3, 3 * SIZE(CO2) movsd %xmm4, 0 * SIZE(CO1, LDC, 2) movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) movsd %xmm5, 2 * SIZE(CO1, LDC, 2) movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) movsd %xmm6, 0 * SIZE(CO2, LDC, 2) movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) movsd %xmm7, 2 * SIZE(CO2, LDC, 2) movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm11, -14 * SIZE(BO) movapd %xmm13, -12 * SIZE(BO) movapd %xmm15, -10 * SIZE(BO) movapd %xmm1, -8 * SIZE(BO) movapd %xmm3, -6 * SIZE(BO) movapd %xmm5, -4 * SIZE(BO) movapd %xmm7, -2 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm1, -14 * SIZE(AO) movapd %xmm2, -12 * SIZE(AO) movapd %xmm3, -10 * SIZE(AO) movapd %xmm4, -8 * SIZE(AO) movapd %xmm5, -6 * SIZE(AO) movapd %xmm6, -4 * SIZE(AO) movapd %xmm7, -2 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- BRANCH jg .L11 ALIGN_4 .L20: testq $2, M BRANCH jle .L30 ALIGN_4 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm3, %xmm3 movaps -16 * SIZE(BO), %xmm2 pxor %xmm5, %xmm5 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if 
defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_4 .L22: addpd %xmm3, %xmm11 movaps -14 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 addpd %xmm2, %xmm9 movaps -12 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -14 * SIZE(AO), %xmm0 addpd %xmm3, %xmm11 movaps -10 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 addpd %xmm2, %xmm9 movaps -8 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 addpd %xmm3, %xmm11 movaps -6 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 addpd %xmm2, %xmm9 movaps -4 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -10 * SIZE(AO), %xmm0 addpd %xmm3, %xmm11 movaps -2 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 subq $ -8 * SIZE, AO addpd %xmm2, %xmm9 movaps 0 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: addpd %xmm3, %xmm11 movaps -14 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 addpd %xmm2, %xmm9 movaps -12 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif addpd %xmm3, %xmm11 addpd %xmm5, %xmm10 movapd %xmm8, %xmm0 movsd %xmm9, %xmm8 movsd %xmm0, %xmm9 movapd %xmm10, %xmm0 movsd %xmm11, %xmm10 movsd %xmm0, %xmm11 #if defined(LN) || defined(LT) movapd %xmm8, %xmm0 unpcklpd %xmm9, %xmm8 unpckhpd %xmm9, %xmm0 movapd %xmm10, %xmm2 unpcklpd %xmm11, %xmm10 unpckhpd %xmm11, %xmm2 movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 movapd -12 * SIZE(BO), %xmm13 movapd -10 * SIZE(BO), %xmm15 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm0, %xmm13 subpd %xmm2, %xmm15 #else movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm2 movapd -12 * SIZE(AO), %xmm4 movapd -10 * SIZE(AO), %xmm6 subpd %xmm8, %xmm0 subpd %xmm9, %xmm2 subpd %xmm10, %xmm4 subpd %xmm11, %xmm6 #endif #ifdef LN movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 movddup -14 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm13, %xmm10 subpd %xmm10, %xmm9 mulpd %xmm15, %xmm12 subpd %xmm12, %xmm11 movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd %xmm8, %xmm11 movddup -15 * SIZE(AO), %xmm10 movapd %xmm10, %xmm12 mulpd %xmm9, %xmm10 subpd %xmm10, %xmm13 mulpd %xmm11, %xmm12 subpd %xmm12, %xmm15 movddup -13 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 mulpd %xmm8, %xmm15 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 movddup -15 * SIZE(BO), %xmm9 mulpd %xmm0, %xmm9 
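# RN solve for this 2-row tile: the column pair in %xmm0, just scaled by the diagonal
# splat-loaded from -16 * SIZE(BO), is now subtracted out of the remaining columns
# (%xmm2, %xmm4, %xmm6) using the off-diagonal entries at -15..-13 * SIZE(BO); the next
# diagonal sits at -11 * SIZE(BO).  The packed triangle appears to hold reciprocal
# diagonals (no divide instructions anywhere in the kernel), so plain mulpd suffices.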
subpd %xmm9, %xmm2 movddup -14 * SIZE(BO), %xmm10 mulpd %xmm0, %xmm10 subpd %xmm10, %xmm4 movddup -13 * SIZE(BO), %xmm11 mulpd %xmm0, %xmm11 subpd %xmm11, %xmm6 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 movddup -10 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm4 movddup -9 * SIZE(BO), %xmm10 mulpd %xmm2, %xmm10 subpd %xmm10, %xmm6 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 movddup -5 * SIZE(BO), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm6 movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 #endif #ifdef RT movddup -1 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm6 movddup -2 * SIZE(BO), %xmm9 mulpd %xmm6, %xmm9 subpd %xmm9, %xmm4 movddup -3 * SIZE(BO), %xmm10 mulpd %xmm6, %xmm10 subpd %xmm10, %xmm2 movddup -4 * SIZE(BO), %xmm11 mulpd %xmm6, %xmm11 subpd %xmm11, %xmm0 movddup -6 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm4 movddup -7 * SIZE(BO), %xmm9 mulpd %xmm4, %xmm9 subpd %xmm9, %xmm2 movddup -8 * SIZE(BO), %xmm10 mulpd %xmm4, %xmm10 subpd %xmm10, %xmm0 movddup -11 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm2 movddup -12 * SIZE(BO), %xmm9 mulpd %xmm2, %xmm9 subpd %xmm9, %xmm0 movddup -16 * SIZE(BO), %xmm8 mulpd %xmm8, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm13, 1 * SIZE(CO1) movhpd %xmm9, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movsd %xmm11, 0 * SIZE(CO1, LDC, 2) movsd %xmm15, 1 * SIZE(CO1, LDC, 2) movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) #else movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movsd %xmm4, 0 * SIZE(CO1, LDC, 2) movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) movsd %xmm6, 0 * SIZE(CO2, LDC, 2) movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm11, -14 * SIZE(BO) movapd %xmm13, -12 * SIZE(BO) movapd %xmm15, -10 * SIZE(BO) #else movapd %xmm0, -16 * SIZE(AO) movapd %xmm2, -14 * SIZE(AO) movapd %xmm4, -12 * SIZE(AO) movapd %xmm6, -10 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: testq $1, M BRANCH jle .L39 ALIGN_4 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movsd -16 * SIZE(AO), %xmm0 movaps -16 * SIZE(BO), %xmm2 movaps -14 * SIZE(BO), %xmm3 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -15 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 addpd %xmm3, %xmm9 movaps -10 * SIZE(BO), %xmm3 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm10 movaps -8 * SIZE(BO), %xmm2 addpd %xmm3, %xmm11 movaps -6 * SIZE(BO), %xmm3 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -13 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -4 * SIZE(BO), %xmm2 addpd %xmm3, %xmm9 movaps -2 * SIZE(BO), %xmm3 shufps $0x44, %xmm0, 
%xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -12 * SIZE(AO), %xmm0 addpd %xmm2, %xmm10 movaps 0 * SIZE(BO), %xmm2 addpd %xmm3, %xmm11 movaps 2 * SIZE(BO), %xmm3 subq $ -4 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -15 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 addpd %xmm3, %xmm9 movaps -10 * SIZE(BO), %xmm3 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #endif addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(LT) movaps -16 * SIZE(BO), %xmm12 movaps -14 * SIZE(BO), %xmm13 #else movaps -16 * SIZE(AO), %xmm12 movaps -14 * SIZE(AO), %xmm13 #endif subpd %xmm8, %xmm12 subpd %xmm9, %xmm13 #if defined(RN) || defined(RT) movhlps %xmm13, %xmm15 movsd %xmm13, %xmm14 movhlps %xmm12, %xmm13 movsd %xmm12, %xmm12 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm12 mulpd %xmm8, %xmm13 #endif #ifdef RN mulsd -16 * SIZE(BO), %xmm12 movlpd -15 * SIZE(BO), %xmm9 mulsd %xmm12, %xmm9 subsd %xmm9, %xmm13 movlpd -14 * SIZE(BO), %xmm10 mulsd %xmm12, %xmm10 subsd %xmm10, %xmm14 movlpd -13 * SIZE(BO), %xmm11 mulsd %xmm12, %xmm11 subsd %xmm11, %xmm15 mulsd -11 * SIZE(BO), %xmm13 movlpd -10 * SIZE(BO), %xmm9 mulsd %xmm13, %xmm9 subsd %xmm9, %xmm14 movlpd -9 * SIZE(BO), %xmm10 mulsd %xmm13, %xmm10 subsd %xmm10, %xmm15 mulsd -6 * SIZE(BO), %xmm14 movlpd -5 * SIZE(BO), %xmm9 mulsd %xmm14, %xmm9 subsd %xmm9, %xmm15 mulsd -1 * SIZE(BO), %xmm15 #endif #ifdef RT mulsd -1 * SIZE(BO), %xmm15 movlpd -2 * SIZE(BO), %xmm9 mulsd %xmm15, %xmm9 subsd %xmm9, %xmm14 movlpd -3 * SIZE(BO), %xmm10 mulsd %xmm15, %xmm10 subsd %xmm10, %xmm13 movlpd -4 * SIZE(BO), %xmm11 mulsd %xmm15, %xmm11 subsd %xmm11, %xmm12 mulsd -6 * SIZE(BO), %xmm14 movlpd -7 * SIZE(BO), %xmm9 mulsd %xmm14, %xmm9 subsd %xmm9, %xmm13 movlpd -8 * SIZE(BO), %xmm10 mulsd %xmm14, %xmm10 subsd %xmm10, %xmm12 mulsd -11 * SIZE(BO), %xmm13 movlpd -12 * SIZE(BO), %xmm9 mulsd %xmm13, %xmm9 subsd %xmm9, %xmm12 mulsd -16 * SIZE(BO), %xmm12 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm12, 0 * SIZE(CO1) movhps %xmm12, 0 * SIZE(CO2) movsd %xmm13, 0 * SIZE(CO1, LDC, 2) movhps %xmm13, 0 * SIZE(CO2, LDC, 2) movaps %xmm12, -16 * SIZE(BO) movaps %xmm13, -14 * SIZE(BO) #else movsd %xmm12, 0 * SIZE(CO1) movsd %xmm13, 0 * SIZE(CO2) movsd %xmm14, 0 * SIZE(CO1, LDC, 2) movsd %xmm15, 0 * SIZE(CO2, LDC, 2) movsd %xmm12, -16 * SIZE(AO) movsd %xmm13, -15 * SIZE(AO) movsd %xmm14, -14 * SIZE(AO) movsd %xmm15, -13 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif subq $1, J BRANCH jg .L01 ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 
16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_RT_4x4_sse2.S000066400000000000000000002325541313527062700221060ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %r13 #define BO %r14 #define CO1 %r15 #define CO2 %rbp #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define ALPHA 0(%rsp) #define OFFSET 16(%rsp) #define KK 24(%rsp) #define KKK 32(%rsp) #define AORIG 40(%rsp) #define BORIG 48(%rsp) #define BUFFER 128(%rsp) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta #ifndef ALLOC_HUGETLB #define PREFETCHSIZE (8 * 4 + 4) #else #define PREFETCHSIZE (8 * 2 + 4) #endif #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHNTA prefetchnta #define PREFETCHSIZE (8 * 4 + 4) #endif #ifdef OPTERON #define movsd movlpd #endif #define KERNEL1(xx) \ mulpd %xmm8, %xmm9 ;\ addpd %xmm9, %xmm0 ;\ movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm8, %xmm11 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ addpd %xmm11, %xmm1 ;\ movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm8, %xmm13 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ addpd %xmm13, %xmm2 ;\ movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm8, %xmm3 ;\ movapd 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 #define KERNEL2(xx) \ mulpd %xmm10, %xmm9 ;\ addpd %xmm9, %xmm4 ;\ movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm10, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm10, %xmm13 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ addpd %xmm13, %xmm6 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm10, %xmm7 ;\ movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 #define KERNEL3(xx) \ mulpd %xmm12, %xmm15 ;\ addpd %xmm15, %xmm0 ;\ movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm12, %xmm11 ;\ addpd %xmm11, %xmm1 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm12, %xmm13 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ addpd %xmm13, %xmm2 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm12, %xmm3 ;\ movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 #define KERNEL4(xx) \ mulpd %xmm14, %xmm15 ;\ addpd %xmm15, %xmm4 ;\ movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm14, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm14, %xmm13 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ addpd %xmm13, %xmm6 ;\ movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm14, %xmm7 ;\ movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 #define KERNEL5(xx) \ mulpd %xmm8, %xmm9 ;\ addpd %xmm9, %xmm0 ;\ movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm8, %xmm11 ;\ PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ addpd %xmm11, %xmm1 ;\ movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm8, %xmm13 ;\ mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ addpd %xmm13, %xmm2 ;\ movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm8, %xmm3 ;\ movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 #define KERNEL6(xx) \ mulpd %xmm10, %xmm9 ;\ addpd 
%xmm9, %xmm4 ;\ movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm10, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm10, %xmm13 ;\ mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ addpd %xmm13, %xmm6 ;\ movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm10, %xmm7 ;\ movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 #define KERNEL7(xx) \ mulpd %xmm12, %xmm15 ;\ addpd %xmm15, %xmm0 ;\ movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm12, %xmm11 ;\ addpd %xmm11, %xmm1 ;\ movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm12, %xmm13 ;\ mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ addpd %xmm13, %xmm2 ;\ movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm12, %xmm3 ;\ movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 #define KERNEL8(xx) \ mulpd %xmm14, %xmm15 ;\ addpd %xmm15, %xmm4 ;\ movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm14, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm14, %xmm13 ;\ mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ addpd %xmm13, %xmm6 ;\ movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm14, %xmm7 ;\ movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 movaps %xmm3, %xmm0 #else movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 #endif movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movsd %xmm4, OFFSET movsd %xmm4, KK leaq (, LDC, SIZE), LDC #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif testq $1, N je .L40 ALIGN_4 .L81: /* Copying to Sub Buffer */ #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG leaq (, %rax, SIZE), %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax jle .L83 ALIGN_4 .L82: PREFETCH 56 * SIZE(B) movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 addq $ 8 * SIZE, B addq $16 * SIZE, BO movsd %xmm0, -16 * SIZE(BO) movsd %xmm0, -15 * SIZE(BO) movsd %xmm1, -14 * SIZE(BO) movsd %xmm1, -13 * SIZE(BO) movsd %xmm2, -12 * SIZE(BO) movsd %xmm2, -11 * SIZE(BO) movsd %xmm3, -10 * SIZE(BO) movsd %xmm3, -9 * SIZE(BO) movsd %xmm4, -8 * SIZE(BO) movsd %xmm4, -7 * SIZE(BO) movsd %xmm5, -6 * SIZE(BO) movsd %xmm5, -5 * SIZE(BO) movsd %xmm6, -4 * SIZE(BO) movsd %xmm6, -3 * SIZE(BO) movsd %xmm7, -2 * SIZE(BO) movsd %xmm7, 
-1 * SIZE(BO) decq %rax jne .L82 ALIGN_4 .L83: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax BRANCH jle .L90 ALIGN_4 .L84: movsd 0 * SIZE(B), %xmm0 movsd %xmm0, 0 * SIZE(BO) movsd %xmm0, 1 * SIZE(BO) addq $1 * SIZE, B addq $2 * SIZE, BO decq %rax jne .L84 ALIGN_4 .L90: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT subq LDC, C #endif movq C, CO1 # coffset1 = c #ifndef RT addq LDC, C #endif movq M, I sarq $2, I # i = (m >> 2) jle .L100 ALIGN_4 .L91: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movapd 16 * SIZE(AO), %xmm12 movapd 24 * SIZE(AO), %xmm14 PREFETCHW 4 * SIZE(CO1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L95 ALIGN_4 .L92: mulpd %xmm9, %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd 2 * SIZE(AO), %xmm9 addpd %xmm8, %xmm0 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm8 mulpd 6 * SIZE(AO), %xmm9 addpd %xmm8, %xmm2 movapd 32 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) addpd %xmm9, %xmm3 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm10 mulpd 10 * SIZE(AO), %xmm9 addpd %xmm10, %xmm0 movapd 12 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movapd 6 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm10 mulpd 14 * SIZE(AO), %xmm9 addpd %xmm10, %xmm2 movapd 40 * SIZE(AO), %xmm10 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addpd %xmm9, %xmm3 movapd 16 * SIZE(BO), %xmm9 mulpd %xmm11, %xmm12 mulpd 18 * SIZE(AO), %xmm11 addpd %xmm12, %xmm0 movapd 20 * SIZE(AO), %xmm12 addpd %xmm11, %xmm1 movapd 10 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm12 mulpd 22 * SIZE(AO), %xmm11 addpd %xmm12, %xmm2 movapd 48 * SIZE(AO), %xmm12 PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) addpd %xmm11, %xmm3 movapd 12 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm14 mulpd 26 * SIZE(AO), %xmm11 addpd %xmm14, %xmm0 movapd 28 * SIZE(AO), %xmm14 addpd %xmm11, %xmm1 movapd 14 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm14 mulpd 30 * SIZE(AO), %xmm11 addpd %xmm14, %xmm2 movapd 56 * SIZE(AO), %xmm14 addpd %xmm11, %xmm3 movapd 24 * SIZE(BO), %xmm11 addq $32 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L92 ALIGN_4 .L95: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L99 ALIGN_4 .L96: mulpd %xmm9, %xmm8 mulpd 2 * SIZE(AO), %xmm9 addpd %xmm8, %xmm0 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movapd 2 * SIZE(BO), %xmm9 addq $4 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L96 ALIGN_4 .L99: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm2 movapd 2 * SIZE(B), %xmm3 subpd %xmm0, %xmm2 subpd %xmm1, %xmm3 #else movapd 0 * SIZE(AO), %xmm2 movapd 2 * SIZE(AO), %xmm3 subpd %xmm0, %xmm2 subpd %xmm1, %xmm3 #endif #ifdef LN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd 
%xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd 15 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm1 movsd 14 * SIZE(AO), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm3 movsd 13 * SIZE(AO), %xmm6 mulsd %xmm1, %xmm6 subsd %xmm6, %xmm0 movsd 12 * SIZE(AO), %xmm7 mulsd %xmm1, %xmm7 subsd %xmm7, %xmm2 movsd 10 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm3 movsd 9 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm0 movsd 8 * SIZE(AO), %xmm6 mulsd %xmm3, %xmm6 subsd %xmm6, %xmm2 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 movsd 4 * SIZE(AO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef LT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 movsd 1 * SIZE(AO), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 2 * SIZE(AO), %xmm6 mulsd %xmm2, %xmm6 subsd %xmm6, %xmm3 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm2, %xmm7 subsd %xmm7, %xmm1 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm3 movsd 7 * SIZE(AO), %xmm6 mulsd %xmm0, %xmm6 subsd %xmm6, %xmm1 movsd 10 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm3 movsd 11 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm1 movsd 15 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm1 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 #endif #ifdef RT movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) movsd %xmm3, 2 * SIZE(CO1) movhpd %xmm3, 3 * SIZE(CO1) #else movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) movsd %xmm3, 2 * SIZE(CO1) movhpd %xmm3, 3 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movlpd %xmm2, 0 * SIZE(BO) movlpd %xmm2, 1 * SIZE(BO) movhpd %xmm2, 2 * SIZE(BO) movhpd %xmm2, 3 * SIZE(BO) movlpd %xmm3, 4 * SIZE(BO) movlpd %xmm3, 5 * SIZE(BO) movhpd %xmm3, 6 * SIZE(BO) movhpd %xmm3, 7 * SIZE(BO) #else movapd %xmm2, 0 * SIZE(AO) movapd %xmm3, 2 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L91 ALIGN_4 .L100: testq $2, M je .L110 ALIGN_4 .L101: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L105 ALIGN_4 .L102: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd 2 * SIZE(AO), %xmm8 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm9, %xmm0 movapd 16 * SIZE(BO), %xmm9 addpd %xmm8, %xmm1 movapd 4 * SIZE(AO), %xmm8 mulpd 4 * SIZE(BO), %xmm8 addpd %xmm8, %xmm2 movapd 6 * SIZE(AO), 
%xmm8 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm8, %xmm3 movapd 16 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm10, %xmm11 movapd 10 * SIZE(AO), %xmm10 mulpd 10 * SIZE(BO), %xmm10 addpd %xmm11, %xmm0 movapd 24 * SIZE(BO), %xmm11 addpd %xmm10, %xmm1 movapd 12 * SIZE(AO), %xmm10 mulpd 12 * SIZE(BO), %xmm10 addpd %xmm10, %xmm2 movapd 14 * SIZE(AO), %xmm10 mulpd 14 * SIZE(BO), %xmm10 addpd %xmm10, %xmm3 movapd 24 * SIZE(AO), %xmm10 addq $16 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L102 ALIGN_4 .L105: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L109 ALIGN_4 .L106: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(AO), %xmm8 movapd 2 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L106 ALIGN_4 .L109: addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm2, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm2 subpd %xmm0, %xmm2 #else movapd 0 * SIZE(AO), %xmm2 subpd %xmm0, %xmm2 #endif #ifdef LN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movsd 3 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 movsd 2 * SIZE(AO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm0, %xmm2 #endif #ifdef LT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 movsd 1 * SIZE(AO), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 3 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 unpcklpd %xmm0, %xmm2 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef RT movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) #else movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(B) movlpd %xmm2, 0 * SIZE(BO) movlpd %xmm2, 1 * SIZE(BO) movhpd %xmm2, 2 * SIZE(BO) movhpd %xmm2, 3 * SIZE(BO) #else movapd %xmm2, 0 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L110: testq $1, M je .L119 ALIGN_4 .L111: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movsd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movsd 4 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movsd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L115 ALIGN_4 .L112: mulsd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd 1 * SIZE(AO), %xmm8 addsd %xmm9, %xmm0 movsd 16 * SIZE(BO), %xmm9 mulsd 2 * SIZE(BO), %xmm8 addsd %xmm8, %xmm1 movsd 2 * SIZE(AO), %xmm8 mulsd 4 * 
SIZE(BO), %xmm8 addsd %xmm8, %xmm2 movsd 3 * SIZE(AO), %xmm8 mulsd 6 * SIZE(BO), %xmm8 addsd %xmm8, %xmm3 movsd 8 * SIZE(AO), %xmm8 mulsd %xmm10, %xmm11 movsd 5 * SIZE(AO), %xmm10 addsd %xmm11, %xmm0 movsd 24 * SIZE(BO), %xmm11 mulsd 10 * SIZE(BO), %xmm10 addsd %xmm10, %xmm1 movsd 6 * SIZE(AO), %xmm10 mulsd 12 * SIZE(BO), %xmm10 addsd %xmm10, %xmm2 movsd 7 * SIZE(AO), %xmm10 mulsd 14 * SIZE(BO), %xmm10 addsd %xmm10, %xmm3 movsd 12 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L112 ALIGN_4 .L115: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulsd %xmm8, %xmm9 movsd 1 * SIZE(AO), %xmm8 addsd %xmm9, %xmm0 movsd 2 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L116 ALIGN_4 .L118: addsd %xmm2, %xmm0 addsd %xmm3, %xmm1 addsd %xmm1, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm2 subsd %xmm0, %xmm2 #else movsd 0 * SIZE(AO), %xmm2 subsd %xmm0, %xmm2 #endif #ifdef LN movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 #endif #ifdef RN movsd 0 * SIZE(B), %xmm0 mulsd %xmm0, %xmm2 #endif #ifdef RT movsd 0 * SIZE(B), %xmm0 mulsd %xmm0, %xmm2 #endif #ifdef LN subq $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) #else movsd %xmm2, 0 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(B) movlpd %xmm2, 0 * SIZE(BO) movlpd %xmm2, 1 * SIZE(BO) #else movsd %xmm2, 0 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $1 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L119: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 1), B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L40: testq $2, N je .L80 ALIGN_4 .L41: /* Copying to Sub Buffer */ #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG leaq (, %rax, SIZE), %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L43 ALIGN_4 .L42: PREFETCH 56 * SIZE(B) movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 addq $ 8 * SIZE, B addq $16 * SIZE, BO movsd %xmm0, -16 * SIZE(BO) movsd %xmm0, -15 * SIZE(BO) movsd %xmm1, -14 * SIZE(BO) movsd %xmm1, -13 * SIZE(BO) movsd %xmm2, -12 * SIZE(BO) movsd %xmm2, -11 * SIZE(BO) movsd %xmm3, -10 * SIZE(BO) movsd %xmm3, -9 * SIZE(BO) movsd %xmm4, -8 * SIZE(BO) movsd %xmm4, -7 * SIZE(BO) movsd %xmm5, -6 * SIZE(BO) movsd %xmm5, -5 * SIZE(BO) movsd %xmm6, 
-4 * SIZE(BO) movsd %xmm6, -3 * SIZE(BO) movsd %xmm7, -2 * SIZE(BO) movsd %xmm7, -1 * SIZE(BO) decq %rax jne .L42 ALIGN_4 .L43: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L50 ALIGN_4 .L44: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd %xmm0, 0 * SIZE(BO) movsd %xmm0, 1 * SIZE(BO) movsd %xmm1, 2 * SIZE(BO) movsd %xmm1, 3 * SIZE(BO) addq $2 * SIZE, B addq $4 * SIZE, BO decq %rax jne .L44 ALIGN_4 .L50: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 2), C #endif movq M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm4, %xmm4 movapd 8 * SIZE(BO), %xmm11 pxor %xmm5, %xmm5 movapd 16 * SIZE(AO), %xmm12 movapd 16 * SIZE(BO), %xmm13 movapd 24 * SIZE(AO), %xmm14 movapd 24 * SIZE(BO), %xmm15 PREFETCHW 4 * SIZE(CO1) PREFETCHW 4 * SIZE(CO2) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L55 ALIGN_4 .L52: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd 2 * SIZE(BO), %xmm8 addpd %xmm9, %xmm0 movapd 0 * SIZE(BO), %xmm9 addpd %xmm8, %xmm1 movapd 2 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm9, %xmm4 movapd 4 * SIZE(BO), %xmm9 addpd %xmm8, %xmm5 movapd 4 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm9, %xmm0 movapd 4 * SIZE(BO), %xmm9 addpd %xmm8, %xmm1 movapd 6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm9, %xmm4 movapd 32 * SIZE(BO), %xmm9 addpd %xmm8, %xmm5 movapd 32 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm10, %xmm11 mulpd 10 * SIZE(BO), %xmm10 addpd %xmm11, %xmm0 movapd 8 * SIZE(BO), %xmm11 addpd %xmm10, %xmm1 movapd 10 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm11 mulpd 10 * SIZE(BO), %xmm10 addpd %xmm11, %xmm4 movapd 12 * SIZE(BO), %xmm11 addpd %xmm10, %xmm5 movapd 12 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm11 mulpd 14 * SIZE(BO), %xmm10 addpd %xmm11, %xmm0 movapd 12 * SIZE(BO), %xmm11 addpd %xmm10, %xmm1 movapd 14 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm11 mulpd 14 * SIZE(BO), %xmm10 addpd %xmm11, %xmm4 movapd 40 * SIZE(BO), %xmm11 addpd %xmm10, %xmm5 movapd 40 * SIZE(AO), %xmm10 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) mulpd %xmm12, %xmm13 mulpd 18 * SIZE(BO), %xmm12 addpd %xmm13, %xmm0 movapd 16 * SIZE(BO), %xmm13 addpd %xmm12, %xmm1 movapd 18 * SIZE(AO), %xmm12 mulpd %xmm12, %xmm13 mulpd 18 * SIZE(BO), %xmm12 addpd %xmm13, %xmm4 movapd 20 * SIZE(BO), %xmm13 addpd %xmm12, %xmm5 movapd 20 * SIZE(AO), %xmm12 mulpd %xmm12, %xmm13 mulpd 22 * SIZE(BO), %xmm12 addpd %xmm13, %xmm0 movapd 20 * SIZE(BO), %xmm13 addpd %xmm12, %xmm1 movapd 22 * SIZE(AO), %xmm12 mulpd %xmm12, %xmm13 mulpd 22 * SIZE(BO), %xmm12 addpd %xmm13, %xmm4 movapd 48 * SIZE(BO), %xmm13 addpd %xmm12, %xmm5 movapd 48 * SIZE(AO), %xmm12 PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) mulpd %xmm14, %xmm15 mulpd 26 * SIZE(BO), %xmm14 addpd %xmm15, %xmm0 movapd 24 * SIZE(BO), %xmm15 addpd %xmm14, %xmm1 movapd 26 
* SIZE(AO), %xmm14 mulpd %xmm14, %xmm15 mulpd 26 * SIZE(BO), %xmm14 addpd %xmm15, %xmm4 movapd 28 * SIZE(BO), %xmm15 addpd %xmm14, %xmm5 movapd 28 * SIZE(AO), %xmm14 mulpd %xmm14, %xmm15 mulpd 30 * SIZE(BO), %xmm14 addpd %xmm15, %xmm0 movapd 28 * SIZE(BO), %xmm15 addpd %xmm14, %xmm1 movapd 30 * SIZE(AO), %xmm14 mulpd %xmm14, %xmm15 mulpd 30 * SIZE(BO), %xmm14 addpd %xmm15, %xmm4 movapd 56 * SIZE(BO), %xmm15 addpd %xmm14, %xmm5 movapd 56 * SIZE(AO), %xmm14 addq $32 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L59 ALIGN_4 .L56: movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 2 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm5 movapd 4 * SIZE(AO), %xmm8 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L56 ALIGN_4 .L59: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm4, %xmm12 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm12 movapd 0 * SIZE(B), %xmm1 movapd 2 * SIZE(B), %xmm5 movapd 4 * SIZE(B), %xmm9 movapd 6 * SIZE(B), %xmm13 subpd %xmm0, %xmm1 subpd %xmm8, %xmm5 subpd %xmm4, %xmm9 subpd %xmm12, %xmm13 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm9 movapd 4 * SIZE(AO), %xmm10 movapd 6 * SIZE(AO), %xmm11 subpd %xmm0, %xmm8 subpd %xmm4, %xmm9 subpd %xmm1, %xmm10 subpd %xmm5, %xmm11 #endif #ifdef LN movlpd 15 * SIZE(AO), %xmm0 movhpd 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 movlpd 14 * SIZE(AO), %xmm2 movhpd 14 * SIZE(AO), %xmm2 mulpd %xmm13, %xmm2 subpd %xmm2, %xmm9 movlpd 13 * SIZE(AO), %xmm4 movhpd 13 * SIZE(AO), %xmm4 mulpd %xmm13, %xmm4 subpd %xmm4, %xmm5 movlpd 12 * SIZE(AO), %xmm6 movhpd 12 * SIZE(AO), %xmm6 mulpd %xmm13, %xmm6 subpd %xmm6, %xmm1 movlpd 10 * SIZE(AO), %xmm0 movhpd 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 movlpd 9 * SIZE(AO), %xmm2 movhpd 9 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm5 movlpd 8 * SIZE(AO), %xmm4 movhpd 8 * SIZE(AO), %xmm4 mulpd %xmm9, %xmm4 subpd %xmm4, %xmm1 movlpd 5 * SIZE(AO), %xmm0 movhpd 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 movlpd 4 * SIZE(AO), %xmm2 movhpd 4 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 #endif #ifdef LT movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 movlpd 1 * SIZE(AO), %xmm2 movhpd 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movlpd 2 * SIZE(AO), %xmm4 movhpd 2 * SIZE(AO), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm9 movlpd 3 * SIZE(AO), %xmm6 movhpd 3 * SIZE(AO), %xmm6 mulpd %xmm1, %xmm6 subpd %xmm6, %xmm13 movlpd 5 * SIZE(AO), %xmm0 movhpd 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 movlpd 6 * SIZE(AO), %xmm2 movhpd 6 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm9 movlpd 7 * SIZE(AO), %xmm4 movhpd 7 * SIZE(AO), %xmm4 mulpd %xmm5, %xmm4 subpd %xmm4, %xmm13 movlpd 10 * SIZE(AO), %xmm0 movhpd 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 movlpd 11 * SIZE(AO), %xmm2 movhpd 11 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm13 movlpd 15 * SIZE(AO), %xmm0 movhpd 15 * SIZE(AO), %xmm0 
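# End of the LT solve for this 4x2 tile: the last unknown pair in %xmm13 (row 3 of the
# tile, both columns) is about to be scaled by the diagonal entry splat-loaded from
# 15 * SIZE(AO) just above; as elsewhere, that diagonal appears to be stored inverted.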
mulpd %xmm0, %xmm13 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 movlpd 1 * SIZE(B), %xmm1 movhpd 1 * SIZE(B), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movlpd 1 * SIZE(B), %xmm1 movhpd 1 * SIZE(B), %xmm1 mulpd %xmm9, %xmm1 subpd %xmm1, %xmm11 movlpd 3 * SIZE(B), %xmm0 movhpd 3 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 #endif #ifdef RT movlpd 3 * SIZE(B), %xmm0 movhpd 3 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 movlpd 2 * SIZE(B), %xmm1 movhpd 2 * SIZE(B), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movlpd 2 * SIZE(B), %xmm1 movhpd 2 * SIZE(B), %xmm1 mulpd %xmm11, %xmm1 subpd %xmm1, %xmm9 movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movsd %xmm13, 3 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) movhpd %xmm9, 2 * SIZE(CO2) movhpd %xmm13, 3 * SIZE(CO2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd %xmm11, 2 * SIZE(CO2) movhpd %xmm11, 3 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movapd %xmm5, 2 * SIZE(B) movapd %xmm9, 4 * SIZE(B) movapd %xmm13, 6 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) movlpd %xmm5, 4 * SIZE(BO) movlpd %xmm5, 5 * SIZE(BO) movhpd %xmm5, 6 * SIZE(BO) movhpd %xmm5, 7 * SIZE(BO) movlpd %xmm9, 8 * SIZE(BO) movlpd %xmm9, 9 * SIZE(BO) movhpd %xmm9, 10 * SIZE(BO) movhpd %xmm9, 11 * SIZE(BO) movlpd %xmm13, 12 * SIZE(BO) movlpd %xmm13, 13 * SIZE(BO) movhpd %xmm13, 14 * SIZE(BO) movhpd %xmm13, 15 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm9, 2 * SIZE(AO) movapd %xmm10, 4 * SIZE(AO) movapd %xmm11, 6 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L51 ALIGN_4 .L60: testq $2, M je .L70 ALIGN_4 .L61: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movapd 16 * SIZE(BO), %xmm13 movapd 24 * SIZE(BO), %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd 2 * SIZE(BO), %xmm8 addpd %xmm9, %xmm0 movapd 4 * SIZE(BO), %xmm9 addpd %xmm8, %xmm1 movapd 2 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm9 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm9, %xmm2 movapd 32 * SIZE(BO), %xmm9 addpd %xmm8, %xmm3 movapd 4 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm11 mulpd 10 * SIZE(BO), %xmm8 addpd %xmm11, %xmm0 movapd 12 * SIZE(BO), %xmm11 
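# Still inside the .L62 update loop for the 2x2 tile (unrolled eight k-steps deep):
# loads of the next A/B operands are interleaved with the multiply/accumulate of the
# current ones, presumably to hide multiply latency on these SSE2 targets.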
addpd %xmm8, %xmm1 movapd 6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm11 mulpd 14 * SIZE(BO), %xmm8 addpd %xmm11, %xmm2 movapd 40 * SIZE(BO), %xmm11 addpd %xmm8, %xmm3 movapd 16 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm10, %xmm13 mulpd 18 * SIZE(BO), %xmm10 addpd %xmm13, %xmm0 movapd 20 * SIZE(BO), %xmm13 addpd %xmm10, %xmm1 movapd 10 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm13 mulpd 22 * SIZE(BO), %xmm10 addpd %xmm13, %xmm2 movapd 48 * SIZE(BO), %xmm13 addpd %xmm10, %xmm3 movapd 12 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm15 mulpd 26 * SIZE(BO), %xmm10 addpd %xmm15, %xmm0 movapd 28 * SIZE(BO), %xmm15 addpd %xmm10, %xmm1 movapd 14 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm15 mulpd 30 * SIZE(BO), %xmm10 addpd %xmm15, %xmm2 movapd 56 * SIZE(BO), %xmm15 addpd %xmm10, %xmm3 movapd 24 * SIZE(AO), %xmm10 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L69 ALIGN_4 .L66: mulpd %xmm8, %xmm9 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm9, %xmm0 movapd 4 * SIZE(BO), %xmm9 addpd %xmm8, %xmm1 movapd 2 * SIZE(AO), %xmm8 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L69: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd 0 * SIZE(B), %xmm1 movapd 2 * SIZE(B), %xmm5 subpd %xmm0, %xmm1 subpd %xmm8, %xmm5 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm10 subpd %xmm0, %xmm8 subpd %xmm1, %xmm10 #endif #ifdef LN movlpd 3 * SIZE(AO), %xmm0 movhpd 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 movlpd 2 * SIZE(AO), %xmm2 movhpd 2 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 #endif #ifdef LT movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 movlpd 1 * SIZE(AO), %xmm2 movhpd 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movlpd 3 * SIZE(AO), %xmm0 movhpd 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 movlpd 1 * SIZE(B), %xmm1 movhpd 1 * SIZE(B), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movlpd 3 * SIZE(B), %xmm0 movhpd 3 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 #endif #ifdef RT movlpd 3 * SIZE(B), %xmm0 movhpd 3 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 movlpd 2 * SIZE(B), %xmm1 movhpd 2 * SIZE(B), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movapd %xmm5, 2 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) movlpd %xmm5, 4 * SIZE(BO) movlpd %xmm5, 5 * SIZE(BO) movhpd %xmm5, 6 * SIZE(BO) movhpd %xmm5, 7 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm10, 2 * SIZE(AO) #endif #ifndef LN 
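# Everywhere except the LN path (which already stepped CO1/CO2 back before the stores
# above), advance both C column pointers past the 2-wide tile that was just written.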
addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L70: testq $1, M je .L79 ALIGN_4 .L71: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movsd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movsd 4 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movsd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movsd 16 * SIZE(BO), %xmm13 movsd 24 * SIZE(BO), %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulsd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulsd 2 * SIZE(BO), %xmm8 addsd %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 addsd %xmm8, %xmm1 movsd 1 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm9 mulsd 6 * SIZE(BO), %xmm8 addsd %xmm9, %xmm2 movsd 32 * SIZE(BO), %xmm9 addsd %xmm8, %xmm3 movsd 2 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm11 mulsd 10 * SIZE(BO), %xmm8 addsd %xmm11, %xmm0 movsd 12 * SIZE(BO), %xmm11 addsd %xmm8, %xmm1 movsd 3 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm11 mulsd 14 * SIZE(BO), %xmm8 addsd %xmm11, %xmm2 movsd 40 * SIZE(BO), %xmm11 addsd %xmm8, %xmm3 movsd 8 * SIZE(AO), %xmm8 mulsd %xmm10, %xmm13 mulsd 18 * SIZE(BO), %xmm10 addsd %xmm13, %xmm0 movsd 20 * SIZE(BO), %xmm13 addsd %xmm10, %xmm1 movsd 5 * SIZE(AO), %xmm10 mulsd %xmm10, %xmm13 mulsd 22 * SIZE(BO), %xmm10 addsd %xmm13, %xmm2 movsd 48 * SIZE(BO), %xmm13 addsd %xmm10, %xmm3 movsd 6 * SIZE(AO), %xmm10 mulsd %xmm10, %xmm15 mulsd 26 * SIZE(BO), %xmm10 addsd %xmm15, %xmm0 movsd 28 * SIZE(BO), %xmm15 addsd %xmm10, %xmm1 movsd 7 * SIZE(AO), %xmm10 mulsd %xmm10, %xmm15 mulsd 30 * SIZE(BO), %xmm10 addsd %xmm15, %xmm2 movsd 56 * SIZE(BO), %xmm15 addsd %xmm10, %xmm3 movsd 12 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulsd %xmm8, %xmm9 mulsd 2 * SIZE(BO), %xmm8 addsd %xmm9, %xmm0 addsd %xmm8, %xmm1 movsd 1 * SIZE(AO), %xmm8 movsd 4 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L76 ALIGN_4 .L78: addsd %xmm2, %xmm0 addsd %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm4 movsd 1 * SIZE(B), %xmm5 #else movsd 0 * SIZE(AO), %xmm4 movsd 1 * SIZE(AO), %xmm5 #endif subsd %xmm0, %xmm4 subsd %xmm1, %xmm5 #ifdef LN movsd 0 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 #endif #ifdef RN mulsd 0 * SIZE(B), %xmm4 movsd 1 * SIZE(B), %xmm1 mulsd %xmm4, %xmm1 subsd %xmm1, %xmm5 mulsd 3 * SIZE(B), %xmm5 #endif #ifdef RT mulsd 3 * SIZE(B), %xmm5 movlpd 2 * SIZE(B), %xmm1 mulsd %xmm5, %xmm1 subsd %xmm1, %xmm4 mulsd 0 * 
SIZE(B), %xmm4 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm4, 0 * SIZE(CO1) movsd %xmm5, 0 * SIZE(CO2) #if defined(LN) || defined(LT) movsd %xmm4, 0 * SIZE(B) movsd %xmm5, 1 * SIZE(B) movsd %xmm4, 0 * SIZE(BO) movsd %xmm4, 1 * SIZE(BO) movsd %xmm5, 2 * SIZE(BO) movsd %xmm5, 3 * SIZE(BO) #else movsd %xmm4, 0 * SIZE(AO) movsd %xmm5, 1 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L79: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 2), B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L80: movq N, J sarq $2, J # j = (n >> 2) jle .L999 .L01: /* Copying to Sub Buffer */ #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG leaq (, %rax, SIZE), %rax leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L03 addq %rax, %rax ALIGN_4 .L02: PREFETCHNTA 40 * SIZE(B) movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm4 movsd 5 * SIZE(B), %xmm5 movsd 6 * SIZE(B), %xmm6 movsd 7 * SIZE(B), %xmm7 addq $16 * SIZE, BO addq $ 8 * SIZE, B movsd %xmm0, -16 * SIZE(BO) movsd %xmm0, -15 * SIZE(BO) movsd %xmm1, -14 * SIZE(BO) movsd %xmm1, -13 * SIZE(BO) movsd %xmm2, -12 * SIZE(BO) movsd %xmm2, -11 * SIZE(BO) movsd %xmm3, -10 * SIZE(BO) movsd %xmm3, -9 * SIZE(BO) movsd %xmm4, -8 * SIZE(BO) movsd %xmm4, -7 * SIZE(BO) movsd %xmm5, -6 * SIZE(BO) movsd %xmm5, -5 * SIZE(BO) movsd %xmm6, -4 * SIZE(BO) movsd %xmm6, -3 * SIZE(BO) movsd %xmm7, -2 * SIZE(BO) movsd %xmm7, -1 * SIZE(BO) decq %rax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 movsd 2 * SIZE(B), %xmm2 movsd 3 * SIZE(B), %xmm3 movsd %xmm0, 0 * SIZE(BO) movsd %xmm0, 1 * SIZE(BO) movsd %xmm1, 2 * SIZE(BO) movsd %xmm1, 3 * SIZE(BO) movsd %xmm2, 4 * SIZE(BO) movsd %xmm2, 5 * SIZE(BO) movsd %xmm3, 6 * SIZE(BO) movsd %xmm3, 7 * SIZE(BO) addq $4 * SIZE, B addq $8 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L10: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 4), C #endif movq M, I sarq $2, I # i = (m >> 2) jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movapd 0 * SIZE(BO), %xmm9 movapd 2 * SIZE(BO), %xmm11 movapd 4 * SIZE(BO), %xmm13 movapd 8 * SIZE(BO), %xmm15 movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 
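# Prologue of the 4x4 tile (.L11): preload the first A panel (xmm8/10/12/14) and the
# duplicated B panel (xmm9/11/13/15), clear the eight accumulators xmm0-xmm7 (each ends
# up holding two rows of one of the four C columns), and prefetch the four destination
# C columns for writing.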
movapd 2 * SIZE(AO), %xmm10 pxor %xmm1, %xmm1 movapd 4 * SIZE(AO), %xmm12 pxor %xmm2, %xmm2 movapd 6 * SIZE(AO), %xmm14 pxor %xmm3, %xmm3 PREFETCHW 4 * SIZE(CO1) pxor %xmm4, %xmm4 PREFETCHW 4 * SIZE(CO2) pxor %xmm5, %xmm5 PREFETCHW 4 * SIZE(CO1, LDC, 2) pxor %xmm6, %xmm6 PREFETCHW 4 * SIZE(CO2, LDC, 2) pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-8, %rax salq $4, %rax je .L15 .L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) cmpq $64 * 2, %rax jle .L12 KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) KERNEL1(16 * 3) KERNEL2(16 * 3) KERNEL3(16 * 3) KERNEL4(16 * 3) KERNEL5(16 * 3) KERNEL6(16 * 3) KERNEL7(16 * 3) KERNEL8(16 * 3) cmpq $64 * 4, %rax jle .L12 KERNEL1(16 * 4) KERNEL2(16 * 4) KERNEL3(16 * 4) KERNEL4(16 * 4) KERNEL5(16 * 4) KERNEL6(16 * 4) KERNEL7(16 * 4) KERNEL8(16 * 4) KERNEL1(16 * 5) KERNEL2(16 * 5) KERNEL3(16 * 5) KERNEL4(16 * 5) KERNEL5(16 * 5) KERNEL6(16 * 5) KERNEL7(16 * 5) KERNEL8(16 * 5) cmpq $64 * 6, %rax jle .L12 KERNEL1(16 * 6) KERNEL2(16 * 6) KERNEL3(16 * 6) KERNEL4(16 * 6) KERNEL5(16 * 6) KERNEL6(16 * 6) KERNEL7(16 * 6) KERNEL8(16 * 6) KERNEL1(16 * 7) KERNEL2(16 * 7) KERNEL3(16 * 7) KERNEL4(16 * 7) KERNEL5(16 * 7) KERNEL6(16 * 7) KERNEL7(16 * 7) KERNEL8(16 * 7) addq $16 * 8 * SIZE, AO addq $32 * 8 * SIZE, BO subq $64 * 8, %rax jg .L1X .L12: leaq (AO, %rax, 2), AO # * 16 leaq (BO, %rax, 4), BO # * 64 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L19 ALIGN_4 .L16: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm9, %xmm2 movapd 0 * SIZE(BO), %xmm9 addpd %xmm8, %xmm3 movapd 4 * SIZE(AO), %xmm8 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm4 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm5 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 mulpd 6 * SIZE(BO), %xmm10 addpd %xmm9, %xmm6 movapd 8 * SIZE(BO), %xmm9 addpd %xmm10, %xmm7 movapd 6 * SIZE(AO), %xmm10 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L16 ALIGN_4 .L19: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm2, %xmm10 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm10 movapd %xmm4, %xmm12 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm12 movapd %xmm6, %xmm14 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm14 movapd 0 * SIZE(B), %xmm1 movapd 2 * SIZE(B), %xmm3 movapd 4 * SIZE(B), %xmm5 movapd 6 * SIZE(B), %xmm7 movapd 8 * SIZE(B), %xmm9 movapd 10 * SIZE(B), %xmm11 movapd 12 * SIZE(B), %xmm13 movapd 14 * SIZE(B), %xmm15 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 subpd %xmm8, %xmm5 subpd %xmm10, %xmm7 subpd %xmm4, %xmm9 subpd %xmm6, %xmm11 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm9 movapd 4 * SIZE(AO), %xmm10 movapd 6 * SIZE(AO), %xmm11 movapd 8 * SIZE(AO), %xmm12 movapd 10 
* SIZE(AO), %xmm13 movapd 12 * SIZE(AO), %xmm14 movapd 14 * SIZE(AO), %xmm15 subpd %xmm0, %xmm8 subpd %xmm4, %xmm9 subpd %xmm1, %xmm10 subpd %xmm5, %xmm11 subpd %xmm2, %xmm12 subpd %xmm6, %xmm13 subpd %xmm3, %xmm14 subpd %xmm7, %xmm15 #endif #ifdef LN movlpd 15 * SIZE(AO), %xmm0 movhpd 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 mulpd %xmm0, %xmm15 movlpd 14 * SIZE(AO), %xmm2 movhpd 14 * SIZE(AO), %xmm2 mulpd %xmm13, %xmm2 subpd %xmm2, %xmm9 movlpd 14 * SIZE(AO), %xmm2 movhpd 14 * SIZE(AO), %xmm2 mulpd %xmm15, %xmm2 subpd %xmm2, %xmm11 movlpd 13 * SIZE(AO), %xmm4 movhpd 13 * SIZE(AO), %xmm4 mulpd %xmm13, %xmm4 subpd %xmm4, %xmm5 movlpd 13 * SIZE(AO), %xmm4 movhpd 13 * SIZE(AO), %xmm4 mulpd %xmm15, %xmm4 subpd %xmm4, %xmm7 movlpd 12 * SIZE(AO), %xmm6 movhpd 12 * SIZE(AO), %xmm6 mulpd %xmm13, %xmm6 subpd %xmm6, %xmm1 movlpd 12 * SIZE(AO), %xmm6 movhpd 12 * SIZE(AO), %xmm6 mulpd %xmm15, %xmm6 subpd %xmm6, %xmm3 movlpd 10 * SIZE(AO), %xmm0 movhpd 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 mulpd %xmm0, %xmm11 movlpd 9 * SIZE(AO), %xmm2 movhpd 9 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm5 movlpd 9 * SIZE(AO), %xmm2 movhpd 9 * SIZE(AO), %xmm2 mulpd %xmm11, %xmm2 subpd %xmm2, %xmm7 movlpd 8 * SIZE(AO), %xmm4 movhpd 8 * SIZE(AO), %xmm4 mulpd %xmm9, %xmm4 subpd %xmm4, %xmm1 movlpd 8 * SIZE(AO), %xmm4 movhpd 8 * SIZE(AO), %xmm4 mulpd %xmm11, %xmm4 subpd %xmm4, %xmm3 movlpd 5 * SIZE(AO), %xmm0 movhpd 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 movlpd 4 * SIZE(AO), %xmm2 movhpd 4 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movlpd 4 * SIZE(AO), %xmm2 movhpd 4 * SIZE(AO), %xmm2 mulpd %xmm7, %xmm2 subpd %xmm2, %xmm3 movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 #endif #ifdef LT movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 movlpd 1 * SIZE(AO), %xmm2 movhpd 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movlpd 1 * SIZE(AO), %xmm2 movhpd 1 * SIZE(AO), %xmm2 mulpd %xmm3, %xmm2 subpd %xmm2, %xmm7 movlpd 2 * SIZE(AO), %xmm4 movhpd 2 * SIZE(AO), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm9 movlpd 2 * SIZE(AO), %xmm4 movhpd 2 * SIZE(AO), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm11 movlpd 3 * SIZE(AO), %xmm6 movhpd 3 * SIZE(AO), %xmm6 mulpd %xmm1, %xmm6 subpd %xmm6, %xmm13 movlpd 3 * SIZE(AO), %xmm6 movhpd 3 * SIZE(AO), %xmm6 mulpd %xmm3, %xmm6 subpd %xmm6, %xmm15 movlpd 5 * SIZE(AO), %xmm0 movhpd 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 movlpd 6 * SIZE(AO), %xmm2 movhpd 6 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm9 movlpd 6 * SIZE(AO), %xmm2 movhpd 6 * SIZE(AO), %xmm2 mulpd %xmm7, %xmm2 subpd %xmm2, %xmm11 movlpd 7 * SIZE(AO), %xmm4 movhpd 7 * SIZE(AO), %xmm4 mulpd %xmm5, %xmm4 subpd %xmm4, %xmm13 movlpd 7 * SIZE(AO), %xmm4 movhpd 7 * SIZE(AO), %xmm4 mulpd %xmm7, %xmm4 subpd %xmm4, %xmm15 movlpd 10 * SIZE(AO), %xmm0 movhpd 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 mulpd %xmm0, %xmm11 movlpd 11 * SIZE(AO), %xmm2 movhpd 11 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm13 movlpd 11 * SIZE(AO), %xmm2 movhpd 11 * SIZE(AO), %xmm2 mulpd %xmm11, %xmm2 subpd %xmm2, %xmm15 movlpd 15 * SIZE(AO), %xmm0 movhpd 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 mulpd %xmm0, %xmm15 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 movlpd 1 * SIZE(B), %xmm1 movhpd 1 * SIZE(B), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movlpd 1 * SIZE(B), %xmm1 movhpd 1 * SIZE(B), %xmm1 mulpd %xmm9, %xmm1 subpd %xmm1, %xmm11 
movlpd 2 * SIZE(B), %xmm2 movhpd 2 * SIZE(B), %xmm2 mulpd %xmm8, %xmm2 subpd %xmm2, %xmm12 movlpd 2 * SIZE(B), %xmm2 movhpd 2 * SIZE(B), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm13 movlpd 3 * SIZE(B), %xmm3 movhpd 3 * SIZE(B), %xmm3 mulpd %xmm8, %xmm3 subpd %xmm3, %xmm14 movlpd 3 * SIZE(B), %xmm3 movhpd 3 * SIZE(B), %xmm3 mulpd %xmm9, %xmm3 subpd %xmm3, %xmm15 movlpd 5 * SIZE(B), %xmm0 movhpd 5 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 movlpd 6 * SIZE(B), %xmm1 movhpd 6 * SIZE(B), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm12 movlpd 6 * SIZE(B), %xmm1 movhpd 6 * SIZE(B), %xmm1 mulpd %xmm11, %xmm1 subpd %xmm1, %xmm13 movlpd 7 * SIZE(B), %xmm2 movhpd 7 * SIZE(B), %xmm2 mulpd %xmm10, %xmm2 subpd %xmm2, %xmm14 movlpd 7 * SIZE(B), %xmm2 movhpd 7 * SIZE(B), %xmm2 mulpd %xmm11, %xmm2 subpd %xmm2, %xmm15 movlpd 10 * SIZE(B), %xmm0 movhpd 10 * SIZE(B), %xmm0 mulpd %xmm0, %xmm12 mulpd %xmm0, %xmm13 movlpd 11 * SIZE(B), %xmm1 movhpd 11 * SIZE(B), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm14 movlpd 11 * SIZE(B), %xmm1 movhpd 11 * SIZE(B), %xmm1 mulpd %xmm13, %xmm1 subpd %xmm1, %xmm15 movlpd 15 * SIZE(B), %xmm0 movhpd 15 * SIZE(B), %xmm0 mulpd %xmm0, %xmm14 mulpd %xmm0, %xmm15 #endif #ifdef RT movlpd 15 * SIZE(B), %xmm0 movhpd 15 * SIZE(B), %xmm0 mulpd %xmm0, %xmm14 mulpd %xmm0, %xmm15 movlpd 14 * SIZE(B), %xmm1 movhpd 14 * SIZE(B), %xmm1 mulpd %xmm14, %xmm1 subpd %xmm1, %xmm12 movlpd 14 * SIZE(B), %xmm1 movhpd 14 * SIZE(B), %xmm1 mulpd %xmm15, %xmm1 subpd %xmm1, %xmm13 movlpd 13 * SIZE(B), %xmm2 movhpd 13 * SIZE(B), %xmm2 mulpd %xmm14, %xmm2 subpd %xmm2, %xmm10 movlpd 13 * SIZE(B), %xmm2 movhpd 13 * SIZE(B), %xmm2 mulpd %xmm15, %xmm2 subpd %xmm2, %xmm11 movlpd 12 * SIZE(B), %xmm3 movhpd 12 * SIZE(B), %xmm3 mulpd %xmm14, %xmm3 subpd %xmm3, %xmm8 movlpd 12 * SIZE(B), %xmm3 movhpd 12 * SIZE(B), %xmm3 mulpd %xmm15, %xmm3 subpd %xmm3, %xmm9 movlpd 10 * SIZE(B), %xmm0 movhpd 10 * SIZE(B), %xmm0 mulpd %xmm0, %xmm12 mulpd %xmm0, %xmm13 movlpd 9 * SIZE(B), %xmm1 movhpd 9 * SIZE(B), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm10 movlpd 9 * SIZE(B), %xmm1 movhpd 9 * SIZE(B), %xmm1 mulpd %xmm13, %xmm1 subpd %xmm1, %xmm11 movlpd 8 * SIZE(B), %xmm2 movhpd 8 * SIZE(B), %xmm2 mulpd %xmm12, %xmm2 subpd %xmm2, %xmm8 movlpd 8 * SIZE(B), %xmm2 movhpd 8 * SIZE(B), %xmm2 mulpd %xmm13, %xmm2 subpd %xmm2, %xmm9 movlpd 5 * SIZE(B), %xmm0 movhpd 5 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 movlpd 4 * SIZE(B), %xmm1 movhpd 4 * SIZE(B), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movlpd 4 * SIZE(B), %xmm1 movhpd 4 * SIZE(B), %xmm1 mulpd %xmm11, %xmm1 subpd %xmm1, %xmm9 movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movsd %xmm13, 3 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) movhpd %xmm9, 2 * SIZE(CO2) movhpd %xmm13, 3 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO1, LDC, 2) movsd %xmm7, 1 * SIZE(CO1, LDC, 2) movsd %xmm11, 2 * SIZE(CO1, LDC, 2) movsd %xmm15, 3 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) movhpd %xmm11, 2 * SIZE(CO2, LDC, 2) movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd %xmm11, 2 * SIZE(CO2) movhpd %xmm11, 3 * SIZE(CO2) movsd %xmm12, 0 * 
SIZE(CO1, LDC, 2) movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) movsd %xmm13, 2 * SIZE(CO1, LDC, 2) movhpd %xmm13, 3 * SIZE(CO1, LDC, 2) movsd %xmm14, 0 * SIZE(CO2, LDC, 2) movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) movsd %xmm15, 2 * SIZE(CO2, LDC, 2) movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movapd %xmm5, 4 * SIZE(B) movapd %xmm7, 6 * SIZE(B) movapd %xmm9, 8 * SIZE(B) movapd %xmm11, 10 * SIZE(B) movapd %xmm13, 12 * SIZE(B) movapd %xmm15, 14 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) movlpd %xmm3, 4 * SIZE(BO) movlpd %xmm3, 5 * SIZE(BO) movhpd %xmm3, 6 * SIZE(BO) movhpd %xmm3, 7 * SIZE(BO) movlpd %xmm5, 8 * SIZE(BO) movlpd %xmm5, 9 * SIZE(BO) movhpd %xmm5, 10 * SIZE(BO) movhpd %xmm5, 11 * SIZE(BO) movlpd %xmm7, 12 * SIZE(BO) movlpd %xmm7, 13 * SIZE(BO) movhpd %xmm7, 14 * SIZE(BO) movhpd %xmm7, 15 * SIZE(BO) movlpd %xmm9, 16 * SIZE(BO) movlpd %xmm9, 17 * SIZE(BO) movhpd %xmm9, 18 * SIZE(BO) movhpd %xmm9, 19 * SIZE(BO) movlpd %xmm11, 20 * SIZE(BO) movlpd %xmm11, 21 * SIZE(BO) movhpd %xmm11, 22 * SIZE(BO) movhpd %xmm11, 23 * SIZE(BO) movlpd %xmm13, 24 * SIZE(BO) movlpd %xmm13, 25 * SIZE(BO) movhpd %xmm13, 26 * SIZE(BO) movhpd %xmm13, 27 * SIZE(BO) movlpd %xmm15, 28 * SIZE(BO) movlpd %xmm15, 29 * SIZE(BO) movhpd %xmm15, 30 * SIZE(BO) movhpd %xmm15, 31 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm9, 2 * SIZE(AO) movapd %xmm10, 4 * SIZE(AO) movapd %xmm11, 6 * SIZE(AO) movapd %xmm12, 8 * SIZE(AO) movapd %xmm13, 10 * SIZE(AO) movapd %xmm14, 12 * SIZE(AO) movapd %xmm15, 14 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $16 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L11 ALIGN_4 .L20: testq $3, M je .L39 testq $2, M je .L30 ALIGN_4 .L21: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movapd 16 * SIZE(BO), %xmm13 movapd 24 * SIZE(BO), %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm9, %xmm2 movapd 32 * SIZE(BO), %xmm9 addpd %xmm8, %xmm3 movapd 2 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movapd 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm1 movapd 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 mulpd 14 * SIZE(BO), %xmm8 addpd %xmm11, %xmm2 movapd 40 * SIZE(BO), %xmm11 addpd %xmm8, %xmm3 movapd 4 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm13 addpd %xmm13, %xmm0 movapd 18 * SIZE(BO), %xmm13 mulpd %xmm8, %xmm13 addpd %xmm13, %xmm1 movapd 20 * SIZE(BO), %xmm13 mulpd %xmm8, %xmm13 mulpd 22 * SIZE(BO), 
%xmm8 addpd %xmm13, %xmm2 movapd 48 * SIZE(BO), %xmm13 addpd %xmm8, %xmm3 movapd 6 * SIZE(AO), %xmm8 mulpd %xmm8, %xmm15 addpd %xmm15, %xmm0 movapd 26 * SIZE(BO), %xmm15 mulpd %xmm8, %xmm15 addpd %xmm15, %xmm1 movapd 28 * SIZE(BO), %xmm15 mulpd %xmm8, %xmm15 mulpd 30 * SIZE(BO), %xmm8 addpd %xmm15, %xmm2 movapd 56 * SIZE(BO), %xmm15 addpd %xmm8, %xmm3 movapd 16 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movapd 34 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm1 movapd 36 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 mulpd 38 * SIZE(BO), %xmm10 addpd %xmm9, %xmm2 movapd 64 * SIZE(BO), %xmm9 addpd %xmm10, %xmm3 movapd 10 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movapd 42 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movapd 44 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 mulpd 46 * SIZE(BO), %xmm10 addpd %xmm11, %xmm2 movapd 72 * SIZE(BO), %xmm11 addpd %xmm10, %xmm3 movapd 12 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm13 addpd %xmm13, %xmm0 movapd 50 * SIZE(BO), %xmm13 mulpd %xmm10, %xmm13 addpd %xmm13, %xmm1 movapd 52 * SIZE(BO), %xmm13 mulpd %xmm10, %xmm13 mulpd 54 * SIZE(BO), %xmm10 addpd %xmm13, %xmm2 movapd 80 * SIZE(BO), %xmm13 addpd %xmm10, %xmm3 movapd 14 * SIZE(AO), %xmm10 mulpd %xmm10, %xmm15 addpd %xmm15, %xmm0 movapd 58 * SIZE(BO), %xmm15 mulpd %xmm10, %xmm15 addpd %xmm15, %xmm1 movapd 60 * SIZE(BO), %xmm15 mulpd %xmm10, %xmm15 mulpd 62 * SIZE(BO), %xmm10 addpd %xmm15, %xmm2 movapd 88 * SIZE(BO), %xmm15 addpd %xmm10, %xmm3 movapd 24 * SIZE(AO), %xmm10 addq $16 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L29 ALIGN_4 .L26: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm9, %xmm2 movapd 8 * SIZE(BO), %xmm9 addpd %xmm8, %xmm3 movapd 2 * SIZE(AO), %xmm8 addq $2 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L26 ALIGN_4 .L29: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm2, %xmm10 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm10 movapd 0 * SIZE(B), %xmm1 movapd 2 * SIZE(B), %xmm3 movapd 4 * SIZE(B), %xmm5 movapd 6 * SIZE(B), %xmm7 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 subpd %xmm8, %xmm5 subpd %xmm10, %xmm7 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm10 movapd 4 * SIZE(AO), %xmm12 movapd 6 * SIZE(AO), %xmm14 subpd %xmm0, %xmm8 subpd %xmm1, %xmm10 subpd %xmm2, %xmm12 subpd %xmm3, %xmm14 #endif #ifdef LN movlpd 3 * SIZE(AO), %xmm0 movhpd 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 movlpd 2 * SIZE(AO), %xmm2 movhpd 2 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movlpd 2 * SIZE(AO), %xmm2 movhpd 2 * SIZE(AO), %xmm2 mulpd %xmm7, %xmm2 subpd %xmm2, %xmm3 movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 #endif #ifdef LT movlpd 0 * SIZE(AO), %xmm0 movhpd 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 movlpd 1 * SIZE(AO), %xmm2 movhpd 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movlpd 1 * SIZE(AO), %xmm2 movhpd 1 * SIZE(AO), 
%xmm2 mulpd %xmm3, %xmm2 subpd %xmm2, %xmm7 movlpd 3 * SIZE(AO), %xmm0 movhpd 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 movlpd 1 * SIZE(B), %xmm1 movhpd 1 * SIZE(B), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movlpd 2 * SIZE(B), %xmm2 movhpd 2 * SIZE(B), %xmm2 mulpd %xmm8, %xmm2 subpd %xmm2, %xmm12 movlpd 3 * SIZE(B), %xmm3 movhpd 3 * SIZE(B), %xmm3 mulpd %xmm8, %xmm3 subpd %xmm3, %xmm14 movlpd 5 * SIZE(B), %xmm0 movhpd 5 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 movlpd 6 * SIZE(B), %xmm1 movhpd 6 * SIZE(B), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm12 movlpd 7 * SIZE(B), %xmm2 movhpd 7 * SIZE(B), %xmm2 mulpd %xmm10, %xmm2 subpd %xmm2, %xmm14 movlpd 10 * SIZE(B), %xmm0 movhpd 10 * SIZE(B), %xmm0 mulpd %xmm0, %xmm12 movlpd 11 * SIZE(B), %xmm1 movhpd 11 * SIZE(B), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm14 movlpd 15 * SIZE(B), %xmm0 movhpd 15 * SIZE(B), %xmm0 mulpd %xmm0, %xmm14 #endif #ifdef RT movlpd 15 * SIZE(B), %xmm0 movhpd 15 * SIZE(B), %xmm0 mulpd %xmm0, %xmm14 movlpd 14 * SIZE(B), %xmm1 movhpd 14 * SIZE(B), %xmm1 mulpd %xmm14, %xmm1 subpd %xmm1, %xmm12 movlpd 13 * SIZE(B), %xmm2 movhpd 13 * SIZE(B), %xmm2 mulpd %xmm14, %xmm2 subpd %xmm2, %xmm10 movlpd 12 * SIZE(B), %xmm3 movhpd 12 * SIZE(B), %xmm3 mulpd %xmm14, %xmm3 subpd %xmm3, %xmm8 movlpd 10 * SIZE(B), %xmm0 movhpd 10 * SIZE(B), %xmm0 mulpd %xmm0, %xmm12 movlpd 9 * SIZE(B), %xmm1 movhpd 9 * SIZE(B), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm10 movlpd 8 * SIZE(B), %xmm2 movhpd 8 * SIZE(B), %xmm2 mulpd %xmm12, %xmm2 subpd %xmm2, %xmm8 movlpd 5 * SIZE(B), %xmm0 movhpd 5 * SIZE(B), %xmm0 mulpd %xmm0, %xmm10 movlpd 4 * SIZE(B), %xmm1 movhpd 4 * SIZE(B), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movlpd 0 * SIZE(B), %xmm0 movhpd 0 * SIZE(B), %xmm0 mulpd %xmm0, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO1, LDC, 2) movsd %xmm7, 1 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd %xmm12, 0 * SIZE(CO1, LDC, 2) movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) movsd %xmm14, 0 * SIZE(CO2, LDC, 2) movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movapd %xmm5, 4 * SIZE(B) movapd %xmm7, 6 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) movlpd %xmm3, 4 * SIZE(BO) movlpd %xmm3, 5 * SIZE(BO) movhpd %xmm3, 6 * SIZE(BO) movhpd %xmm3, 7 * SIZE(BO) movlpd %xmm5, 8 * SIZE(BO) movlpd %xmm5, 9 * SIZE(BO) movhpd %xmm5, 10 * SIZE(BO) movhpd %xmm5, 11 * SIZE(BO) movlpd %xmm7, 12 * SIZE(BO) movlpd %xmm7, 13 * SIZE(BO) movhpd %xmm7, 14 * SIZE(BO) movhpd %xmm7, 15 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm10, 2 * SIZE(AO) movapd %xmm12, 4 * SIZE(AO) movapd %xmm14, 6 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: 
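/* .L30: remainder path of this 4-column block.  It tests for a single  */
/* leftover row (M & 1); if none remains, control skips ahead to the    */
/* .L39 wrap-up, otherwise the .L31 code below solves that one row with */
/* scalar movsd/mulsd arithmetic instead of the packed multi-row path.  */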
testq $1, M je .L39 ALIGN_4 .L31: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movsd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movsd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movsd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movsd 16 * SIZE(BO), %xmm13 movsd 24 * SIZE(BO), %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L35 ALIGN_4 .L32: mulsd %xmm8, %xmm9 addsd %xmm9, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd 2 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm9 addsd %xmm9, %xmm1 movsd 4 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm9 mulsd 6 * SIZE(BO), %xmm8 addsd %xmm9, %xmm2 movsd 32 * SIZE(BO), %xmm9 addsd %xmm8, %xmm3 movsd 1 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm11 addsd %xmm11, %xmm0 movsd 10 * SIZE(BO), %xmm11 mulsd %xmm8, %xmm11 addsd %xmm11, %xmm1 movsd 12 * SIZE(BO), %xmm11 mulsd %xmm8, %xmm11 mulsd 14 * SIZE(BO), %xmm8 addsd %xmm11, %xmm2 movsd 40 * SIZE(BO), %xmm11 addsd %xmm8, %xmm3 movsd 2 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm13 addsd %xmm13, %xmm0 movsd 18 * SIZE(BO), %xmm13 mulsd %xmm8, %xmm13 addsd %xmm13, %xmm1 movsd 20 * SIZE(BO), %xmm13 mulsd %xmm8, %xmm13 mulsd 22 * SIZE(BO), %xmm8 addsd %xmm13, %xmm2 movsd 48 * SIZE(BO), %xmm13 addsd %xmm8, %xmm3 movsd 3 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm15 addsd %xmm15, %xmm0 movsd 26 * SIZE(BO), %xmm15 mulsd %xmm8, %xmm15 addsd %xmm15, %xmm1 movsd 28 * SIZE(BO), %xmm15 mulsd %xmm8, %xmm15 mulsd 30 * SIZE(BO), %xmm8 addsd %xmm15, %xmm2 movsd 56 * SIZE(BO), %xmm15 addsd %xmm8, %xmm3 movsd 4 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm9 addsd %xmm9, %xmm0 movsd 34 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm9 addsd %xmm9, %xmm1 movsd 36 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm9 mulsd 38 * SIZE(BO), %xmm8 addsd %xmm9, %xmm2 movsd 64 * SIZE(BO), %xmm9 addsd %xmm8, %xmm3 movsd 5 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm11 addsd %xmm11, %xmm0 movsd 42 * SIZE(BO), %xmm11 mulsd %xmm8, %xmm11 addsd %xmm11, %xmm1 movsd 44 * SIZE(BO), %xmm11 mulsd %xmm8, %xmm11 mulsd 46 * SIZE(BO), %xmm8 addsd %xmm11, %xmm2 movsd 72 * SIZE(BO), %xmm11 addsd %xmm8, %xmm3 movsd 6 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm13 addsd %xmm13, %xmm0 movsd 50 * SIZE(BO), %xmm13 mulsd %xmm8, %xmm13 addsd %xmm13, %xmm1 movsd 52 * SIZE(BO), %xmm13 mulsd %xmm8, %xmm13 mulsd 54 * SIZE(BO), %xmm8 addsd %xmm13, %xmm2 movsd 80 * SIZE(BO), %xmm13 addsd %xmm8, %xmm3 movsd 7 * SIZE(AO), %xmm8 mulsd %xmm8, %xmm15 addsd %xmm15, %xmm0 movsd 58 * SIZE(BO), %xmm15 mulsd %xmm8, %xmm15 addsd %xmm15, %xmm1 movsd 60 * SIZE(BO), %xmm15 mulsd %xmm8, %xmm15 mulsd 62 * SIZE(BO), %xmm8 addsd %xmm15, %xmm2 movsd 88 * SIZE(BO), %xmm15 addsd %xmm8, %xmm3 movsd 8 * SIZE(AO), %xmm8 addq $ 8 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulsd %xmm8, %xmm9 addsd %xmm9, %xmm0 movsd 2 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm9 addsd %xmm9, %xmm1 movsd 4 * SIZE(BO), %xmm9 mulsd %xmm8, %xmm9 mulsd 6 * SIZE(BO), %xmm8 addsd %xmm9, %xmm2 movsd 8 * SIZE(BO), %xmm9 addsd %xmm8, %xmm3 movsd 1 * SIZE(AO), %xmm8 addq $1 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef 
LN subq $1, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(B), %xmm4 movsd 1 * SIZE(B), %xmm5 movsd 2 * SIZE(B), %xmm6 movsd 3 * SIZE(B), %xmm7 #else movsd 0 * SIZE(AO), %xmm4 movsd 1 * SIZE(AO), %xmm5 movsd 2 * SIZE(AO), %xmm6 movsd 3 * SIZE(AO), %xmm7 #endif subsd %xmm0, %xmm4 subsd %xmm1, %xmm5 subsd %xmm2, %xmm6 subsd %xmm3, %xmm7 #ifdef LN movsd 0 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 mulsd %xmm0, %xmm6 mulsd %xmm0, %xmm7 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm4 mulsd %xmm0, %xmm5 mulsd %xmm0, %xmm6 mulsd %xmm0, %xmm7 #endif #ifdef RN mulsd 0 * SIZE(B), %xmm4 movlpd 1 * SIZE(B), %xmm1 mulsd %xmm4, %xmm1 subsd %xmm1, %xmm5 movlpd 2 * SIZE(B), %xmm2 mulsd %xmm4, %xmm2 subsd %xmm2, %xmm6 movlpd 3 * SIZE(B), %xmm3 mulsd %xmm4, %xmm3 subsd %xmm3, %xmm7 mulsd 5 * SIZE(B), %xmm5 movlpd 6 * SIZE(B), %xmm1 mulsd %xmm5, %xmm1 subsd %xmm1, %xmm6 movlpd 7 * SIZE(B), %xmm2 mulsd %xmm5, %xmm2 subsd %xmm2, %xmm7 mulsd 10 * SIZE(B), %xmm6 movlpd 11 * SIZE(B), %xmm1 mulsd %xmm6, %xmm1 subsd %xmm1, %xmm7 mulsd 15 * SIZE(B), %xmm7 #endif #ifdef RT mulsd 15 * SIZE(B), %xmm7 movlpd 14 * SIZE(B), %xmm1 mulsd %xmm7, %xmm1 subsd %xmm1, %xmm6 movlpd 13 * SIZE(B), %xmm2 mulsd %xmm7, %xmm2 subsd %xmm2, %xmm5 movlpd 12 * SIZE(B), %xmm3 mulsd %xmm7, %xmm3 subsd %xmm3, %xmm4 mulsd 10 * SIZE(B), %xmm6 movlpd 9 * SIZE(B), %xmm1 mulsd %xmm6, %xmm1 subsd %xmm1, %xmm5 movlpd 8 * SIZE(B), %xmm2 mulsd %xmm6, %xmm2 subsd %xmm2, %xmm4 mulsd 5 * SIZE(B), %xmm5 movlpd 4 * SIZE(B), %xmm1 mulsd %xmm5, %xmm1 subsd %xmm1, %xmm4 mulsd 0 * SIZE(B), %xmm4 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif movsd %xmm4, 0 * SIZE(CO1) movsd %xmm5, 0 * SIZE(CO2) movsd %xmm6, 0 * SIZE(CO1, LDC, 2) movsd %xmm7, 0 * SIZE(CO2, LDC, 2) #if defined(LN) || defined(LT) movsd %xmm4, 0 * SIZE(B) movsd %xmm5, 1 * SIZE(B) movsd %xmm6, 2 * SIZE(B) movsd %xmm7, 3 * SIZE(B) movsd %xmm4, 0 * SIZE(BO) movsd %xmm4, 1 * SIZE(BO) movsd %xmm5, 2 * SIZE(BO) movsd %xmm5, 3 * SIZE(BO) movsd %xmm6, 4 * SIZE(BO) movsd %xmm6, 5 * SIZE(BO) movsd %xmm7, 6 * SIZE(BO) movsd %xmm7, 7 * SIZE(BO) #else movsd %xmm4, 0 * SIZE(AO) movsd %xmm5, 1 * SIZE(AO) movsd %xmm6, 2 * SIZE(AO) movsd %xmm7, 3 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 4), B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif decq J # j -- jg .L01 ALIGN_4 .L999: movq %rbx, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE 
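/*
 * End of this TRSM micro-kernel.  The LN/LT/RN/RT conditionals above select
 * which side (left or right) and which orientation of the triangular factor
 * the substitution walks, and the 4/2/1-row paths cover the M remainder.
 * As a rough illustration only (plain C; the names c and b are invented here
 * and are not part of this source), the RN-style solve applied to one 4x4
 * block of C against a packed 4x4 factor b[16] is sketched below.  The
 * packed copy routines are expected to store the diagonal entries already
 * inverted, which is why the assembly multiplies by the diagonal
 * (mulpd/mulsd) rather than dividing by it.
 *
 *   for (int i = 0; i < 4; i++) {            // forward substitution
 *     for (int j = 0; j < 4; j++)            // scale row i by 1/diag
 *       c[i][j] *= b[i * 4 + i];
 *     for (int k = i + 1; k < 4; k++)        // eliminate rows below i
 *       for (int j = 0; j < 4; j++)
 *         c[k][j] -= b[i * 4 + k] * c[i][j];
 *   }
 */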
OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_RT_4x4_sse3.S000066400000000000000000002236331313527062700221050ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %r13 #define BO %r14 #define CO1 %r15 #define CO2 %rbx #define KK %rbp #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define KKK 56(%rsp) #define AORIG 64(%rsp) #else #define STACKSIZE 272 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define KKK 232(%rsp) #define AORIG 240(%rsp) #endif #define PREFETCH prefetcht1 #define PREFETCHSIZE (16 * 12 + 3) #define PREFETCH_R (4 * 4 + 0) #define KERNEL1(address) \ mulpd %xmm8, %xmm9 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ addpd %xmm9, %xmm0;\ movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm1;\ movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm2;\ movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ addpd %xmm9, %xmm3;\ movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL2(address) \ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm4;\ movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm5;\ movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm6;\ movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ addpd %xmm9, %xmm7;\ movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL3(address) \ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm0;\ movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm1;\ movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm2;\ movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ addpd %xmm9, %xmm3;\ movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL4(address) \ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm4;\ movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm5;\ movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm6;\ movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ addpd %xmm9, %xmm7;\ movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL5(address) \ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm0;\ movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm1;\ movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm2;\ movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ addpd %xmm11, %xmm3;\ movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL6(address) \ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm4;\ movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm5;\ movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm6;\ movddup 11 * 
SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ addpd %xmm11, %xmm7;\ movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL7(address) \ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm0;\ movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm1;\ movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm2;\ movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ addpd %xmm11, %xmm3;\ movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL8(address) \ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm4;\ movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm5;\ movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm6;\ movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ addpd %xmm11, %xmm7;\ movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL9(address) \ mulpd %xmm12, %xmm13;\ PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ addpd %xmm13, %xmm0;\ movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm1;\ movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm2;\ movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ addpd %xmm13, %xmm3;\ movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL10(address) \ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm4;\ movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm5;\ movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm6;\ movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ addpd %xmm13, %xmm7;\ movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL11(address) \ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm0;\ movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm1;\ movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm2;\ movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ addpd %xmm13, %xmm3;\ movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL12(address) \ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm4;\ movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm5;\ movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm6;\ movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ addpd %xmm13, %xmm7;\ movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL13(address) \ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm0;\ movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm1;\ movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm2;\ movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ addpd %xmm15, 
%xmm3;\ movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL14(address) \ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm4;\ movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm5;\ movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm6;\ movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ addpd %xmm15, %xmm7;\ movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL15(address) \ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm0;\ movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm1;\ movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm2;\ movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ addpd %xmm15, %xmm3;\ movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL16(address) \ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm4;\ movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm5;\ movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm6;\ movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ addpd %xmm15, %xmm7;\ movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif movq OLD_LDC, LDC movq OLD_OFFSET, KK movq KK, OFFSET leaq (, LDC, SIZE), LDC #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif testq $1, N je .L80 ALIGN_4 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I # i = (m >> 2) jle .L100 ALIGN_4 .L91: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 4 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #ifdef HAVE_3DNOW prefetchw 4 * SIZE(CO1) #else prefetchnta 4 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L95 ALIGN_4 .L92: mulpd %xmm9, %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd 2 * SIZE(AO), %xmm9 addpd %xmm8, %xmm0 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, 
%xmm1 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm8 mulpd 6 * SIZE(AO), %xmm9 addpd %xmm8, %xmm2 movapd 16 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm10 mulpd 10 * SIZE(AO), %xmm9 addpd %xmm10, %xmm0 movapd 12 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm10 mulpd 14 * SIZE(AO), %xmm9 addpd %xmm10, %xmm2 movapd 24 * SIZE(AO), %xmm10 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addpd %xmm9, %xmm3 movddup 8 * SIZE(BO), %xmm9 mulpd %xmm11, %xmm8 mulpd 18 * SIZE(AO), %xmm11 addpd %xmm8, %xmm0 movapd 20 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 5 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm8 mulpd 22 * SIZE(AO), %xmm11 addpd %xmm8, %xmm2 movapd 32 * SIZE(AO), %xmm8 addpd %xmm11, %xmm3 movddup 6 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm10 mulpd 26 * SIZE(AO), %xmm11 addpd %xmm10, %xmm0 movapd 28 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movddup 7 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm10 mulpd 30 * SIZE(AO), %xmm11 addpd %xmm10, %xmm2 movapd 40 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 addq $32 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L92 ALIGN_4 .L95: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L99 ALIGN_4 .L96: mulpd %xmm9, %xmm8 mulpd 2 * SIZE(AO), %xmm9 addpd %xmm8, %xmm0 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 1 * SIZE(BO), %xmm9 addq $4 * SIZE, AO # aoffset += 4 addq $1 * SIZE, BO # boffset1 += 8 decq %rax jg .L96 ALIGN_4 .L99: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm2 movapd 2 * SIZE(BO), %xmm3 subpd %xmm0, %xmm2 subpd %xmm1, %xmm3 #else movapd 0 * SIZE(AO), %xmm2 movapd 2 * SIZE(AO), %xmm3 subpd %xmm0, %xmm2 subpd %xmm1, %xmm3 #endif #ifdef LN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd 15 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm1 movsd 14 * SIZE(AO), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm3 movsd 13 * SIZE(AO), %xmm6 mulsd %xmm1, %xmm6 subsd %xmm6, %xmm0 movsd 12 * SIZE(AO), %xmm7 mulsd %xmm1, %xmm7 subsd %xmm7, %xmm2 movsd 10 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm3 movsd 9 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm0 movsd 8 * SIZE(AO), %xmm6 mulsd %xmm3, %xmm6 subsd %xmm6, %xmm2 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 movsd 4 * SIZE(AO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef LT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 movsd 1 * SIZE(AO), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 2 * SIZE(AO), %xmm6 mulsd %xmm2, %xmm6 subsd %xmm6, %xmm3 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm2, %xmm7 subsd %xmm7, %xmm1 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm3 movsd 7 * SIZE(AO), %xmm6 mulsd %xmm0, %xmm6 subsd %xmm6, %xmm1 movsd 10 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm3 movsd 11 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm1 movsd 15 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm1 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 #endif #ifdef RT movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm2 
mulpd %xmm0, %xmm3 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) movsd %xmm3, 2 * SIZE(CO1) movhpd %xmm3, 3 * SIZE(CO1) #else movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) movsd %xmm3, 2 * SIZE(CO1) movhpd %xmm3, 3 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(BO) movapd %xmm3, 2 * SIZE(BO) #else movapd %xmm2, 0 * SIZE(AO) movapd %xmm3, 2 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L91 ALIGN_4 .L100: testq $2, M je .L110 ALIGN_4 .L101: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 4 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L105 ALIGN_4 .L102: mulpd %xmm9, %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movddup 1 * SIZE(BO), %xmm9 addpd %xmm8, %xmm0 mulpd 2 * SIZE(AO), %xmm9 movapd 16 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd 4 * SIZE(AO), %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd 6 * SIZE(AO), %xmm9 addpd %xmm9, %xmm3 movddup 8 * SIZE(BO), %xmm9 mulpd %xmm11, %xmm10 movddup 5 * SIZE(BO), %xmm11 addpd %xmm10, %xmm0 mulpd 10 * SIZE(AO), %xmm11 movapd 24 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movddup 6 * SIZE(BO), %xmm11 mulpd 12 * SIZE(AO), %xmm11 addpd %xmm11, %xmm2 movddup 7 * SIZE(BO), %xmm11 mulpd 14 * SIZE(AO), %xmm11 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $ 8 * SIZE, BO decq %rax jne .L102 ALIGN_4 .L105: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L109 ALIGN_4 .L106: mulpd %xmm9, %xmm8 movddup 1 * SIZE(BO), %xmm9 addpd %xmm8, %xmm0 movapd 2 * SIZE(AO), %xmm8 addq $2 * SIZE, AO # aoffset += 4 addq $1 * SIZE, BO # boffset1 += 8 decq %rax jg .L106 ALIGN_4 .L109: addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm2, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm2 subpd %xmm0, %xmm2 #else movapd 0 * SIZE(AO), %xmm2 subpd %xmm0, %xmm2 #endif #ifdef LN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movsd 3 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 movsd 2 * SIZE(AO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm0, %xmm2 #endif #ifdef LT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 movsd 1 * SIZE(AO), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 3 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm0 unpcklpd %xmm0, %xmm2 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef RT movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 
* SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) #else movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 1 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(BO) #else movapd %xmm2, 0 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L110: testq $1, M je .L119 ALIGN_4 .L111: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movsd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movsd 4 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movsd 4 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L115 ALIGN_4 .L112: mulpd %xmm9, %xmm8 movapd 2 * SIZE(AO), %xmm9 addpd %xmm8, %xmm0 mulpd 2 * SIZE(BO), %xmm9 movapd 8 * SIZE(BO), %xmm8 addpd %xmm9, %xmm1 movapd 8 * SIZE(AO), %xmm9 mulpd %xmm11, %xmm10 movapd 6 * SIZE(AO), %xmm11 addpd %xmm10, %xmm0 mulpd 6 * SIZE(BO), %xmm11 movapd 12 * SIZE(BO), %xmm10 addpd %xmm11, %xmm1 movapd 12 * SIZE(AO), %xmm11 addq $8 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L112 ALIGN_4 .L115: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulsd 0 * SIZE(BO), %xmm9 addsd %xmm9, %xmm0 movsd 1 * SIZE(AO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $1 * SIZE, BO # boffset1 += 8 decq %rax jg .L116 ALIGN_4 .L118: addpd %xmm1, %xmm0 haddpd %xmm0, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm2 subsd %xmm0, %xmm2 #else movsd 0 * SIZE(AO), %xmm2 subsd %xmm0, %xmm2 #endif #ifdef LN movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm4 mulsd %xmm4, %xmm2 #endif #ifdef RN movsd 0 * SIZE(BO), %xmm0 mulsd %xmm0, %xmm2 #endif #ifdef RT movsd 0 * SIZE(BO), %xmm0 mulsd %xmm0, %xmm2 #endif #ifdef LN subq $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) #else movsd %xmm2, 0 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(BO) #else movsd %xmm2, 0 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L119: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_2 .L80: testq $2, N je .L40 ALIGN_4 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq 
M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm4, %xmm4 movddup 8 * SIZE(BO), %xmm11 pxor %xmm5, %xmm5 #ifdef HAVE_3DNOW prefetchw 4 * SIZE(CO1) prefetchw 4 * SIZE(CO2) #else prefetchnta 4 * SIZE(CO1) prefetchnta 4 * SIZE(CO2) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L55 ALIGN_4 .L52: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm5 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 16 * SIZE(AO), %xmm8 addpd %xmm9, %xmm5 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 10 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm4 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 12 * SIZE(AO), %xmm10 addpd %xmm9, %xmm5 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 14 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm4 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 40 * SIZE(AO), %xmm10 addpd %xmm9, %xmm5 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addpd %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 18 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 8 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm4 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 20 * SIZE(AO), %xmm8 addpd %xmm11, %xmm5 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 22 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm4 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 24 * SIZE(AO), %xmm8 addpd %xmm11, %xmm5 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 26 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm4 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 28 * SIZE(AO), %xmm8 addpd %xmm11, %xmm5 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 30 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm4 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 32 * SIZE(AO), %xmm8 addpd %xmm11, %xmm5 movddup 24 * SIZE(BO), %xmm11 addq $32 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else 
movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L59 ALIGN_4 .L56: mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm10 addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 0 * SIZE(BO), %xmm11 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 movapd 4 * SIZE(AO), %xmm8 addpd %xmm11, %xmm4 movddup 1 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm5 addq $4 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L56 ALIGN_4 .L59: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm4, %xmm12 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm12 movapd 0 * SIZE(BO), %xmm1 movapd 2 * SIZE(BO), %xmm5 movapd 4 * SIZE(BO), %xmm9 movapd 6 * SIZE(BO), %xmm13 subpd %xmm0, %xmm1 subpd %xmm8, %xmm5 subpd %xmm4, %xmm9 subpd %xmm12, %xmm13 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm9 movapd 4 * SIZE(AO), %xmm10 movapd 6 * SIZE(AO), %xmm11 subpd %xmm0, %xmm8 subpd %xmm4, %xmm9 subpd %xmm1, %xmm10 subpd %xmm5, %xmm11 #endif #ifdef LN movddup 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 movddup 14 * SIZE(AO), %xmm2 mulpd %xmm13, %xmm2 subpd %xmm2, %xmm9 movddup 13 * SIZE(AO), %xmm4 mulpd %xmm13, %xmm4 subpd %xmm4, %xmm5 movddup 12 * SIZE(AO), %xmm6 mulpd %xmm13, %xmm6 subpd %xmm6, %xmm1 movddup 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 movddup 9 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm5 movddup 8 * SIZE(AO), %xmm4 mulpd %xmm9, %xmm4 subpd %xmm4, %xmm1 movddup 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 movddup 4 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 movddup 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movddup 2 * SIZE(AO), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm9 movddup 3 * SIZE(AO), %xmm6 mulpd %xmm1, %xmm6 subpd %xmm6, %xmm13 movddup 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 movddup 6 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm9 movddup 7 * SIZE(AO), %xmm4 mulpd %xmm5, %xmm4 subpd %xmm4, %xmm13 movddup 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 movddup 11 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm13 movddup 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 movddup 1 * SIZE(BO), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movddup 1 * SIZE(BO), %xmm1 mulpd %xmm9, %xmm1 subpd %xmm1, %xmm11 movddup 3 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 #endif #ifdef RT movddup 3 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 movddup 2 * SIZE(BO), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movddup 2 * SIZE(BO), %xmm1 mulpd %xmm11, %xmm1 subpd %xmm1, %xmm9 movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movsd %xmm13, 3 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) movhpd %xmm9, 2 * SIZE(CO2) movhpd %xmm13, 3 * SIZE(CO2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd %xmm11, 2 * SIZE(CO2) 
movhpd %xmm11, 3 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(BO) movapd %xmm5, 2 * SIZE(BO) movapd %xmm9, 4 * SIZE(BO) movapd %xmm13, 6 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm9, 2 * SIZE(AO) movapd %xmm10, 4 * SIZE(AO) movapd %xmm11, 6 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L51 ALIGN_4 .L60: testq $2, M je .L70 ALIGN_4 .L61: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 16 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 10 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 12 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 24 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 24 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L69 ALIGN_4 .L66: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L69: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd 0 * SIZE(BO), %xmm1 movapd 2 * SIZE(BO), %xmm5 subpd %xmm0, %xmm1 subpd %xmm8, %xmm5 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm10 subpd %xmm0, 
%xmm8 subpd %xmm1, %xmm10 #endif #ifdef LN movddup 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 movddup 2 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 movddup 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movddup 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 movddup 1 * SIZE(BO), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movddup 3 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 #endif #ifdef RT movddup 3 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 movddup 2 * SIZE(BO), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(BO) movapd %xmm5, 2 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm10, 2 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L70: testq $1, M je .L79 ALIGN_4 .L71: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movddup 4 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movddup 1 * SIZE(AO), %xmm8 addpd %xmm9, %xmm0 mulpd 2 * SIZE(BO), %xmm8 movapd 16 * SIZE(BO), %xmm9 addpd %xmm8, %xmm1 movddup 2 * SIZE(AO), %xmm8 mulpd 4 * SIZE(BO), %xmm8 addpd %xmm8, %xmm2 movddup 3 * SIZE(AO), %xmm8 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm8, %xmm3 movddup 8 * SIZE(AO), %xmm8 mulpd %xmm10, %xmm11 movddup 5 * SIZE(AO), %xmm10 addpd %xmm11, %xmm0 mulpd 10 * SIZE(BO), %xmm10 movapd 24 * SIZE(BO), %xmm11 addpd %xmm10, %xmm1 movddup 6 * SIZE(AO), %xmm10 mulpd 12 * SIZE(BO), %xmm10 addpd %xmm10, %xmm2 movddup 7 * SIZE(AO), %xmm10 mulpd 14 * SIZE(BO), %xmm10 addpd %xmm10, %xmm3 movddup 12 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulpd %xmm8, %xmm9 movddup 1 * SIZE(AO), %xmm8 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L76 ALIGN_4 .L78: addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm2, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm2 subpd %xmm0, %xmm2 #else 
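# RN/RT path: the 1x2 result tile lives in the packed A buffer;
# load it and subtract the accumulated products before the triangular solve.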
movapd 0 * SIZE(AO), %xmm2 subpd %xmm0, %xmm2 #endif #ifdef LN movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm2 #endif #ifdef RN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movsd 0 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm2 movsd 1 * SIZE(BO), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 3 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm0 unpcklpd %xmm0, %xmm2 #endif #ifdef RT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movsd 3 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm0 movsd 2 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 0 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm0, %xmm2 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO2) #else movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(BO) #else movapd %xmm2, 0 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L79: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L40: movq N, J sarq $2, J # j = (n >> 2) jle .L999 ALIGN_4 .L10: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 4), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I # i = (m >> 2) jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movapd 16 * SIZE(AO), %xmm12 movddup 16 * SIZE(BO), %xmm13 movapd 24 * SIZE(AO), %xmm14 movddup 24 * SIZE(BO), %xmm15 prefetchnta 4 * SIZE(CO1) pxor %xmm4, %xmm4 prefetchnta 4 * SIZE(CO2) pxor %xmm5, %xmm5 prefetchnta 4 * SIZE(CO1, LDC, 2) pxor %xmm6, %xmm6 prefetchnta 4 * SIZE(CO2, LDC, 2) pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif #if 1 andq $-8, %rax salq $4, %rax je .L15 .L1X: KERNEL1 (16 * 0) KERNEL2 (16 * 0) KERNEL3 (16 * 0) KERNEL4 (16 * 0) KERNEL5 (16 * 0) KERNEL6 (16 * 0) KERNEL7 (16 * 0) KERNEL8 (16 * 0) KERNEL9 (16 * 0) KERNEL10(16 * 0) KERNEL11(16 * 0) KERNEL12(16 * 0) KERNEL13(16 * 0) KERNEL14(16 * 0) KERNEL15(16 * 0) KERNEL16(16 * 0) cmpq $128 * 1, %rax NOBRANCH jle .L12 KERNEL1 (16 * 1) KERNEL2 (16 * 1) KERNEL3 (16 * 1) KERNEL4 (16 * 1) KERNEL5 (16 * 1) KERNEL6 (16 * 1) KERNEL7 (16 * 1) KERNEL8 (16 * 1) KERNEL9 (16 * 1) KERNEL10(16 * 1) KERNEL11(16 * 1) KERNEL12(16 * 1) KERNEL13(16 * 1) KERNEL14(16 * 1) KERNEL15(16 * 1) KERNEL16(16 * 1) cmpq $128 * 2, %rax NOBRANCH jle .L12 KERNEL1 (16 * 2) KERNEL2 (16 * 2) KERNEL3 (16 * 2) KERNEL4 (16 * 2) KERNEL5 (16 * 2) KERNEL6 (16 * 2) KERNEL7 (16 * 2) 
KERNEL8 (16 * 2) KERNEL9 (16 * 2) KERNEL10(16 * 2) KERNEL11(16 * 2) KERNEL12(16 * 2) KERNEL13(16 * 2) KERNEL14(16 * 2) KERNEL15(16 * 2) KERNEL16(16 * 2) cmpq $128 * 3, %rax NOBRANCH jle .L12 KERNEL1 (16 * 3) KERNEL2 (16 * 3) KERNEL3 (16 * 3) KERNEL4 (16 * 3) KERNEL5 (16 * 3) KERNEL6 (16 * 3) KERNEL7 (16 * 3) KERNEL8 (16 * 3) KERNEL9 (16 * 3) KERNEL10(16 * 3) KERNEL11(16 * 3) KERNEL12(16 * 3) KERNEL13(16 * 3) KERNEL14(16 * 3) KERNEL15(16 * 3) KERNEL16(16 * 3) cmpq $128 * 4, %rax NOBRANCH jle .L12 KERNEL1 (16 * 4) KERNEL2 (16 * 4) KERNEL3 (16 * 4) KERNEL4 (16 * 4) KERNEL5 (16 * 4) KERNEL6 (16 * 4) KERNEL7 (16 * 4) KERNEL8 (16 * 4) KERNEL9 (16 * 4) KERNEL10(16 * 4) KERNEL11(16 * 4) KERNEL12(16 * 4) KERNEL13(16 * 4) KERNEL14(16 * 4) KERNEL15(16 * 4) KERNEL16(16 * 4) cmpq $128 * 5, %rax NOBRANCH jle .L12 KERNEL1 (16 * 5) KERNEL2 (16 * 5) KERNEL3 (16 * 5) KERNEL4 (16 * 5) KERNEL5 (16 * 5) KERNEL6 (16 * 5) KERNEL7 (16 * 5) KERNEL8 (16 * 5) KERNEL9 (16 * 5) KERNEL10(16 * 5) KERNEL11(16 * 5) KERNEL12(16 * 5) KERNEL13(16 * 5) KERNEL14(16 * 5) KERNEL15(16 * 5) KERNEL16(16 * 5) cmpq $128 * 6, %rax NOBRANCH jle .L12 KERNEL1 (16 * 6) KERNEL2 (16 * 6) KERNEL3 (16 * 6) KERNEL4 (16 * 6) KERNEL5 (16 * 6) KERNEL6 (16 * 6) KERNEL7 (16 * 6) KERNEL8 (16 * 6) KERNEL9 (16 * 6) KERNEL10(16 * 6) KERNEL11(16 * 6) KERNEL12(16 * 6) KERNEL13(16 * 6) KERNEL14(16 * 6) KERNEL15(16 * 6) KERNEL16(16 * 6) cmpq $128 * 7, %rax NOBRANCH jle .L12 KERNEL1 (16 * 7) KERNEL2 (16 * 7) KERNEL3 (16 * 7) KERNEL4 (16 * 7) KERNEL5 (16 * 7) KERNEL6 (16 * 7) KERNEL7 (16 * 7) KERNEL8 (16 * 7) KERNEL9 (16 * 7) KERNEL10(16 * 7) KERNEL11(16 * 7) KERNEL12(16 * 7) KERNEL13(16 * 7) KERNEL14(16 * 7) KERNEL15(16 * 7) KERNEL16(16 * 7) addq $32 * 8 * SIZE, AO addq $32 * 8 * SIZE, BO subq $128 * 8, %rax jg .L1X .L12: leaq (AO, %rax, 2), AO # * 16 leaq (BO, %rax, 2), BO # * 64 #else sarq $3, %rax je .L15 ALIGN_4 .L12: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm5 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm6 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm7 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm5 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm6 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 32 * SIZE(AO), %xmm8 addpd %xmm9, %xmm7 movddup 32 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 10 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 8 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm4 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm5 movddup 10 * SIZE(BO), %xmm11 mulpd 
%xmm10, %xmm11 addpd %xmm11, %xmm6 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 12 * SIZE(AO), %xmm10 addpd %xmm11, %xmm7 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm4 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm5 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm6 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 40 * SIZE(AO), %xmm10 addpd %xmm11, %xmm7 movddup 40 * SIZE(BO), %xmm11 mulpd %xmm12, %xmm13 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addpd %xmm13, %xmm0 movddup 17 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm1 movddup 18 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm2 movddup 19 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 movapd 18 * SIZE(AO), %xmm12 addpd %xmm13, %xmm3 movddup 16 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm4 movddup 17 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm5 movddup 18 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm6 movddup 19 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 movapd 20 * SIZE(AO), %xmm12 addpd %xmm13, %xmm7 movddup 20 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm0 movddup 21 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm1 movddup 22 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm2 movddup 23 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 movapd 22 * SIZE(AO), %xmm12 addpd %xmm13, %xmm3 movddup 20 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm4 movddup 21 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm5 movddup 22 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm6 movddup 23 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 movapd 48 * SIZE(AO), %xmm12 addpd %xmm13, %xmm7 movddup 48 * SIZE(BO), %xmm13 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm0 movddup 25 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm1 movddup 26 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm2 movddup 27 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 movapd 26 * SIZE(AO), %xmm14 addpd %xmm15, %xmm3 movddup 24 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm4 movddup 25 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm5 movddup 26 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm6 movddup 27 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 movapd 28 * SIZE(AO), %xmm14 addpd %xmm15, %xmm7 movddup 28 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm0 movddup 29 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm1 movddup 30 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm2 movddup 31 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 movapd 30 * SIZE(AO), %xmm14 addpd %xmm15, %xmm3 movddup 28 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm4 movddup 29 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm5 movddup 30 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm6 movddup 31 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 movapd 56 * SIZE(AO), %xmm14 addpd %xmm15, %xmm7 movddup 56 * SIZE(BO), %xmm15 addq $32 * SIZE, BO addq $32 * SIZE, AO decq %rax BRANCH jne .L12 #endif ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L19 
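# .L16: remainder loop for the 4x4 tile, handling the leftover (k & 7)
# iterations one rank-1 update at a time before the triangular solve at .L19.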
ALIGN_4 .L16: mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm10 addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 0 * SIZE(BO), %xmm11 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm4 movddup 1 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm5 movddup 2 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm6 movddup 3 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm7 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L16 ALIGN_4 .L19: #if defined(LN) || defined(RT) movq KK, %rax subq $4, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm2, %xmm10 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm10 movapd %xmm4, %xmm12 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm12 movapd %xmm6, %xmm14 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm14 movapd 0 * SIZE(BO), %xmm1 movapd 2 * SIZE(BO), %xmm3 movapd 4 * SIZE(BO), %xmm5 movapd 6 * SIZE(BO), %xmm7 movapd 8 * SIZE(BO), %xmm9 movapd 10 * SIZE(BO), %xmm11 movapd 12 * SIZE(BO), %xmm13 movapd 14 * SIZE(BO), %xmm15 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 subpd %xmm8, %xmm5 subpd %xmm10, %xmm7 subpd %xmm4, %xmm9 subpd %xmm6, %xmm11 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm9 movapd 4 * SIZE(AO), %xmm10 movapd 6 * SIZE(AO), %xmm11 movapd 8 * SIZE(AO), %xmm12 movapd 10 * SIZE(AO), %xmm13 movapd 12 * SIZE(AO), %xmm14 movapd 14 * SIZE(AO), %xmm15 subpd %xmm0, %xmm8 subpd %xmm4, %xmm9 subpd %xmm1, %xmm10 subpd %xmm5, %xmm11 subpd %xmm2, %xmm12 subpd %xmm6, %xmm13 subpd %xmm3, %xmm14 subpd %xmm7, %xmm15 #endif #ifdef LN movddup 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 mulpd %xmm0, %xmm15 movddup 14 * SIZE(AO), %xmm2 mulpd %xmm13, %xmm2 subpd %xmm2, %xmm9 movddup 14 * SIZE(AO), %xmm2 mulpd %xmm15, %xmm2 subpd %xmm2, %xmm11 movddup 13 * SIZE(AO), %xmm4 mulpd %xmm13, %xmm4 subpd %xmm4, %xmm5 movddup 13 * SIZE(AO), %xmm4 mulpd %xmm15, %xmm4 subpd %xmm4, %xmm7 movddup 12 * SIZE(AO), %xmm6 mulpd %xmm13, %xmm6 subpd %xmm6, %xmm1 movddup 12 * SIZE(AO), %xmm6 mulpd %xmm15, %xmm6 subpd %xmm6, %xmm3 movddup 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 mulpd %xmm0, %xmm11 movddup 9 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm5 movddup 9 * SIZE(AO), %xmm2 mulpd %xmm11, %xmm2 subpd %xmm2, %xmm7 movddup 8 * SIZE(AO), %xmm4 mulpd %xmm9, %xmm4 subpd %xmm4, %xmm1 movddup 8 * SIZE(AO), %xmm4 mulpd %xmm11, %xmm4 subpd %xmm4, %xmm3 movddup 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 movddup 4 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movddup 4 * SIZE(AO), %xmm2 mulpd %xmm7, %xmm2 subpd %xmm2, %xmm3 movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 movddup 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movddup 1 * SIZE(AO), %xmm2 mulpd %xmm3, %xmm2 subpd %xmm2, %xmm7 movddup 2 * SIZE(AO), %xmm4 mulpd %xmm1, %xmm4 subpd %xmm4, %xmm9 movddup 2 * SIZE(AO), %xmm4 mulpd %xmm3, %xmm4 subpd %xmm4, %xmm11 movddup 3 * SIZE(AO), %xmm6 mulpd %xmm1, %xmm6 subpd %xmm6, %xmm13 movddup 3 * SIZE(AO), %xmm6 mulpd %xmm3, %xmm6 subpd %xmm6, %xmm15 movddup 5 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 
mulpd %xmm0, %xmm7 movddup 6 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm9 movddup 6 * SIZE(AO), %xmm2 mulpd %xmm7, %xmm2 subpd %xmm2, %xmm11 movddup 7 * SIZE(AO), %xmm4 mulpd %xmm5, %xmm4 subpd %xmm4, %xmm13 movddup 7 * SIZE(AO), %xmm4 mulpd %xmm7, %xmm4 subpd %xmm4, %xmm15 movddup 10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm9 mulpd %xmm0, %xmm11 movddup 11 * SIZE(AO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm13 movddup 11 * SIZE(AO), %xmm2 mulpd %xmm11, %xmm2 subpd %xmm2, %xmm15 movddup 15 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm13 mulpd %xmm0, %xmm15 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 movddup 1 * SIZE(BO), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movddup 1 * SIZE(BO), %xmm1 mulpd %xmm9, %xmm1 subpd %xmm1, %xmm11 movddup 2 * SIZE(BO), %xmm2 mulpd %xmm8, %xmm2 subpd %xmm2, %xmm12 movddup 2 * SIZE(BO), %xmm2 mulpd %xmm9, %xmm2 subpd %xmm2, %xmm13 movddup 3 * SIZE(BO), %xmm3 mulpd %xmm8, %xmm3 subpd %xmm3, %xmm14 movddup 3 * SIZE(BO), %xmm3 mulpd %xmm9, %xmm3 subpd %xmm3, %xmm15 movddup 5 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 movddup 6 * SIZE(BO), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm12 movddup 6 * SIZE(BO), %xmm1 mulpd %xmm11, %xmm1 subpd %xmm1, %xmm13 movddup 7 * SIZE(BO), %xmm2 mulpd %xmm10, %xmm2 subpd %xmm2, %xmm14 movddup 7 * SIZE(BO), %xmm2 mulpd %xmm11, %xmm2 subpd %xmm2, %xmm15 movddup 10 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm12 mulpd %xmm0, %xmm13 movddup 11 * SIZE(BO), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm14 movddup 11 * SIZE(BO), %xmm1 mulpd %xmm13, %xmm1 subpd %xmm1, %xmm15 movddup 15 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm14 mulpd %xmm0, %xmm15 #endif #ifdef RT movddup 15 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm14 mulpd %xmm0, %xmm15 movddup 14 * SIZE(BO), %xmm1 mulpd %xmm14, %xmm1 subpd %xmm1, %xmm12 movddup 14 * SIZE(BO), %xmm1 mulpd %xmm15, %xmm1 subpd %xmm1, %xmm13 movddup 13 * SIZE(BO), %xmm2 mulpd %xmm14, %xmm2 subpd %xmm2, %xmm10 movddup 13 * SIZE(BO), %xmm2 mulpd %xmm15, %xmm2 subpd %xmm2, %xmm11 movddup 12 * SIZE(BO), %xmm3 mulpd %xmm14, %xmm3 subpd %xmm3, %xmm8 movddup 12 * SIZE(BO), %xmm3 mulpd %xmm15, %xmm3 subpd %xmm3, %xmm9 movddup 10 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm12 mulpd %xmm0, %xmm13 movddup 9 * SIZE(BO), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm10 movddup 9 * SIZE(BO), %xmm1 mulpd %xmm13, %xmm1 subpd %xmm1, %xmm11 movddup 8 * SIZE(BO), %xmm2 mulpd %xmm12, %xmm2 subpd %xmm2, %xmm8 movddup 8 * SIZE(BO), %xmm2 mulpd %xmm13, %xmm2 subpd %xmm2, %xmm9 movddup 5 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 movddup 4 * SIZE(BO), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movddup 4 * SIZE(BO), %xmm1 mulpd %xmm11, %xmm1 subpd %xmm1, %xmm9 movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movsd %xmm13, 3 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) movhpd %xmm9, 2 * SIZE(CO2) movhpd %xmm13, 3 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO1, LDC, 2) movsd %xmm7, 1 * SIZE(CO1, LDC, 2) movsd %xmm11, 2 * SIZE(CO1, LDC, 2) movsd %xmm15, 3 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) movhpd %xmm11, 2 * SIZE(CO2, LDC, 2) movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd 
%xmm11, 2 * SIZE(CO2) movhpd %xmm11, 3 * SIZE(CO2) movsd %xmm12, 0 * SIZE(CO1, LDC, 2) movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) movsd %xmm13, 2 * SIZE(CO1, LDC, 2) movhpd %xmm13, 3 * SIZE(CO1, LDC, 2) movsd %xmm14, 0 * SIZE(CO2, LDC, 2) movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) movsd %xmm15, 2 * SIZE(CO2, LDC, 2) movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(BO) movapd %xmm3, 2 * SIZE(BO) movapd %xmm5, 4 * SIZE(BO) movapd %xmm7, 6 * SIZE(BO) movapd %xmm9, 8 * SIZE(BO) movapd %xmm11, 10 * SIZE(BO) movapd %xmm13, 12 * SIZE(BO) movapd %xmm15, 14 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm9, 2 * SIZE(AO) movapd %xmm10, 4 * SIZE(AO) movapd %xmm11, 6 * SIZE(AO) movapd %xmm12, 8 * SIZE(AO) movapd %xmm13, 10 * SIZE(AO) movapd %xmm14, 12 * SIZE(AO) movapd %xmm15, 14 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L11 ALIGN_4 .L20: testq $2, M BRANCH je .L30 ALIGN_4 .L21: #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 6 * SIZE(AO), %xmm8 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 16 * SIZE(AO), %xmm8 addpd %xmm11, %xmm3 movddup 24 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movddup 17 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm1 movddup 18 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm2 movddup 19 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 10 * SIZE(AO), %xmm10 addpd %xmm9, %xmm3 movddup 20 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movddup 21 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm1 movddup 22 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm2 movddup 23 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 12 * 
SIZE(AO), %xmm10 addpd %xmm9, %xmm3 movddup 32 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 25 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movddup 26 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 27 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 28 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 29 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movddup 30 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 31 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 24 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 40 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L29 ALIGN_4 .L26: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L26 ALIGN_4 .L29: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm2, %xmm10 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm10 movapd 0 * SIZE(BO), %xmm1 movapd 2 * SIZE(BO), %xmm3 movapd 4 * SIZE(BO), %xmm5 movapd 6 * SIZE(BO), %xmm7 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 subpd %xmm8, %xmm5 subpd %xmm10, %xmm7 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm10 movapd 4 * SIZE(AO), %xmm12 movapd 6 * SIZE(AO), %xmm14 subpd %xmm0, %xmm8 subpd %xmm1, %xmm10 subpd %xmm2, %xmm12 subpd %xmm3, %xmm14 #endif #ifdef LN movddup 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 movddup 2 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 subpd %xmm2, %xmm1 movddup 2 * SIZE(AO), %xmm2 mulpd %xmm7, %xmm2 subpd %xmm2, %xmm3 movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm3 movddup 1 * SIZE(AO), %xmm2 mulpd %xmm1, %xmm2 subpd %xmm2, %xmm5 movddup 1 * SIZE(AO), %xmm2 mulpd %xmm3, %xmm2 subpd %xmm2, %xmm7 movddup 3 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 mulpd %xmm0, %xmm7 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 movddup 1 * SIZE(BO), %xmm1 mulpd %xmm8, %xmm1 subpd %xmm1, %xmm10 movddup 2 * SIZE(BO), %xmm2 mulpd %xmm8, %xmm2 subpd %xmm2, %xmm12 movddup 3 * SIZE(BO), %xmm3 mulpd %xmm8, %xmm3 subpd %xmm3, %xmm14 movddup 5 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 movddup 6 * SIZE(BO), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm12 movddup 7 * SIZE(BO), %xmm2 mulpd %xmm10, %xmm2 subpd %xmm2, %xmm14 movddup 10 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm12 movddup 11 * SIZE(BO), %xmm1 mulpd %xmm12, %xmm1 subpd %xmm1, %xmm14 movddup 15 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm14 #endif #ifdef RT movddup 15 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm14 movddup 14 * SIZE(BO), %xmm1 mulpd %xmm14, %xmm1 subpd %xmm1, %xmm12 movddup 13 * SIZE(BO), %xmm2 mulpd %xmm14, %xmm2 subpd %xmm2, %xmm10 movddup 12 * SIZE(BO), %xmm3 mulpd %xmm14, %xmm3 subpd %xmm3, %xmm8 movddup 10 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm12 movddup 9 * SIZE(BO), %xmm1 
mulpd %xmm12, %xmm1 subpd %xmm1, %xmm10 movddup 8 * SIZE(BO), %xmm2 mulpd %xmm12, %xmm2 subpd %xmm2, %xmm8 movddup 5 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm10 movddup 4 * SIZE(BO), %xmm1 mulpd %xmm10, %xmm1 subpd %xmm1, %xmm8 movddup 0 * SIZE(BO), %xmm0 mulpd %xmm0, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm1, 0 * SIZE(CO1) movsd %xmm5, 1 * SIZE(CO1) movhpd %xmm1, 0 * SIZE(CO2) movhpd %xmm5, 1 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO1, LDC, 2) movsd %xmm7, 1 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd %xmm12, 0 * SIZE(CO1, LDC, 2) movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) movsd %xmm14, 0 * SIZE(CO2, LDC, 2) movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(BO) movapd %xmm3, 2 * SIZE(BO) movapd %xmm5, 4 * SIZE(BO) movapd %xmm7, 6 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm10, 2 * SIZE(AO) movapd %xmm12, 4 * SIZE(AO) movapd %xmm14, 6 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: testq $1, M je .L39 ALIGN_4 .L31: #ifdef LN movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movddup 4 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L35 ALIGN_4 .L32: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 1 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movapd 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movapd 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movddup 3 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movapd 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movapd 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movddup 8 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movapd 24 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movapd 18 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movddup 5 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movapd 20 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movapd 22 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movddup 6 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movapd 32 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movapd 26 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movddup 7 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movapd 28 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movapd 30 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movddup 12 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movapd 40 * SIZE(BO), %xmm11 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq 
KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 1 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm2 movapd 2 * SIZE(BO), %xmm3 subpd %xmm0, %xmm2 subpd %xmm1, %xmm3 #else movapd 0 * SIZE(AO), %xmm2 movapd 2 * SIZE(AO), %xmm3 subpd %xmm0, %xmm2 subpd %xmm1, %xmm3 #endif #ifdef LN movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 #endif #ifdef RN movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd 0 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm2 movsd 1 * SIZE(BO), %xmm5 mulsd %xmm2, %xmm5 subsd %xmm5, %xmm0 movsd 2 * SIZE(BO), %xmm6 mulsd %xmm2, %xmm6 subsd %xmm6, %xmm3 movsd 3 * SIZE(BO), %xmm7 mulsd %xmm2, %xmm7 subsd %xmm7, %xmm1 movsd 5 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm0 movsd 6 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm3 movsd 7 * SIZE(BO), %xmm6 mulsd %xmm0, %xmm6 subsd %xmm6, %xmm1 movsd 10 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm3 movsd 11 * SIZE(BO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm1 movsd 15 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm1 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef RT movapd %xmm2, %xmm0 unpckhpd %xmm0, %xmm0 movapd %xmm3, %xmm1 unpckhpd %xmm1, %xmm1 movsd 15 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm1 movsd 14 * SIZE(BO), %xmm5 mulsd %xmm1, %xmm5 subsd %xmm5, %xmm3 movsd 13 * SIZE(BO), %xmm6 mulsd %xmm1, %xmm6 subsd %xmm6, %xmm0 movsd 12 * SIZE(BO), %xmm7 mulsd %xmm1, %xmm7 subsd %xmm7, %xmm2 movsd 10 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm3 movsd 9 * SIZE(BO), %xmm5 mulsd %xmm3, %xmm5 subsd %xmm5, %xmm0 movsd 8 * SIZE(BO), %xmm6 mulsd %xmm3, %xmm6 subsd %xmm6, %xmm2 movsd 5 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm0 movsd 4 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 subsd %xmm5, %xmm2 movsd 0 * SIZE(BO), %xmm4 mulsd %xmm4, %xmm2 unpcklpd %xmm0, %xmm2 unpcklpd %xmm1, %xmm3 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) #else movsd %xmm2, 0 * SIZE(CO1) movhpd %xmm2, 0 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO1, LDC, 2) movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) #endif #if defined(LN) || defined(LT) movapd %xmm2, 0 * SIZE(BO) movapd %xmm3, 2 * SIZE(BO) #else movapd %xmm2, 0 * SIZE(AO) movapd %xmm3, 2 * SIZE(AO) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif decq J # j -- jg .L10 ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 
56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_RT_4x8_nehalem.S000066400000000000000000002537451313527062700226540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define KK %rdx #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCHSIZE (16 * 1 + 4) #define PREFETCH prefetcht0 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif subq $-32 * SIZE, A subq $-32 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K movq OLD_LDC, LDC movq OLD_OFFSET, KK leaq (, LDC, SIZE), LDC movq KK, OFFSET negq KK #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif testq $1, N jle .L40 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I NOBRANCH jle .L110 ALIGN_4 .L101: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 movsd -32 * SIZE(BO), %xmm3 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L105 ALIGN_3 .L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -31 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -30 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -24 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -29 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -20 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -28 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $ -4 * SIZE, BO subq $1, %rax BRANCH jg .L102 ALIGN_3 .L105: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L108 ALIGN_3 
.L106: addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -31 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L106 ALIGN_3 .L108: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #endif addps %xmm1, %xmm8 #if defined(LN) || defined(LT) movsd -32 * SIZE(BO), %xmm0 movhps -30 * SIZE(BO), %xmm0 subps %xmm8, %xmm0 pshufd $0xff, %xmm0, %xmm3 pshufd $0xaa, %xmm0, %xmm2 pshufd $0x55, %xmm0, %xmm1 #else movaps -32 * SIZE(AO), %xmm0 subps %xmm8, %xmm0 #endif #ifdef LN movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm0 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm0 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm0 movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm3 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm3 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm3 movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm3 #endif #if defined(RN) || defined(RT) movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm1, %xmm0 unpcklps %xmm3, %xmm2 movlps %xmm0, -32 * SIZE(BO) movlps %xmm2, -30 * SIZE(BO) movlps %xmm0, 0 * SIZE(CO1) movlps %xmm2, 2 * SIZE(CO1) #else movaps %xmm0, -32 * SIZE(AO) movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L101 ALIGN_4 .L110: testq $2, M BRANCH jle .L120 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L115 ALIGN_3 .L112: addps %xmm1, %xmm8 movss -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movss -31 * SIZE(BO), %xmm1 unpcklps 
%xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movss -30 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -26 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movss -29 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -24 * SIZE(AO), %xmm0 subq $-4 * SIZE, BO subq $-8 * SIZE, AO subq $1, %rax BRANCH jg .L112 ALIGN_3 .L115: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L118 ALIGN_3 .L116: addps %xmm1, %xmm8 movss -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L116 ALIGN_3 .L118: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif addps %xmm1, %xmm8 #if defined(LN) || defined(LT) movsd -32 * SIZE(BO), %xmm0 subps %xmm8, %xmm0 pshufd $0x55, %xmm0, %xmm1 #else movsd -32 * SIZE(AO), %xmm0 subps %xmm8, %xmm0 #endif #ifdef LN movaps -32 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm1 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm1 #endif #if defined(RN) || defined(RT) movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) unpcklps %xmm1, %xmm0 movlps %xmm0, -32 * SIZE(BO) movlps %xmm0, 0 * SIZE(CO1) #else movlps %xmm0, -32 * SIZE(AO) movlps %xmm0, 0 * SIZE(CO1) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L120: testq $1, M BRANCH jle .L129 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif xorps %xmm2, %xmm2 movss -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L125 ALIGN_3 .L122: addss %xmm2, %xmm8 movss -32 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -31 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 movss -31 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -30 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 movss -30 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -29 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 movss -29 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -28 * SIZE(AO), %xmm0 subq $-4 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L122 ALIGN_3 .L125: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L128 ALIGN_3 .L126: addss %xmm2, %xmm8 movss -32 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -31 * SIZE(AO), %xmm0 addq $1 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L126 ALIGN_3 .L128: #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif 
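# 1x1 tail of the single-column strip: fold in the last partial product,
# subtract it from the stored value, scale by the diagonal element of the
# triangular panel, then write the result to C and back to the packed buffer.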
addss %xmm2, %xmm8 #if defined(LN) || defined(LT) movss -32 * SIZE(BO), %xmm0 subss %xmm8, %xmm0 #else movss -32 * SIZE(AO), %xmm0 subss %xmm8, %xmm0 #endif #if defined(LN) || defined(LT) movss -32 * SIZE(AO), %xmm8 #endif #if defined(RN) || defined(RT) movaps -32 * SIZE(BO), %xmm8 #endif mulss %xmm8, %xmm0 #ifdef LN subq $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm0, -32 * SIZE(BO) #else movss %xmm0, -32 * SIZE(AO) #endif movss %xmm0, (CO1) #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L129: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L40: testq $2, N jle .L70 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I NOBRANCH jle .L80 ALIGN_4 .L71: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd -32 * SIZE(BO), %xmm3 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht2 4 * SIZE(CO2) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L75 ALIGN_3 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -30 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -28 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -24 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -26 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -20 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -24 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax BRANCH jg .L72 ALIGN_3 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -30 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L76 ALIGN_3 .L78: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif addps %xmm1, %xmm8 addps %xmm2, %xmm9 #if defined(LN) || defined(LT) movaps %xmm8, %xmm4 unpcklps %xmm9, %xmm8 unpckhps %xmm9, %xmm4 movaps -32 * 
SIZE(BO), %xmm0 movaps -28 * SIZE(BO), %xmm2 subps %xmm8, %xmm0 subps %xmm4, %xmm2 movhlps %xmm0, %xmm1 movhlps %xmm2, %xmm3 #else movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm2 subps %xmm8, %xmm0 subps %xmm9, %xmm2 #endif #ifdef LN movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm0 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm3 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm3 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm2 #endif #ifdef RT movaps -32 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movlps %xmm0, -32 * SIZE(BO) movlps %xmm1, -30 * SIZE(BO) movlps %xmm2, -28 * SIZE(BO) movlps %xmm3, -26 * SIZE(BO) unpcklps %xmm1, %xmm0 unpcklps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(CO1) movlps %xmm2, 2 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO2) movhps %xmm2, 2 * SIZE(CO2) #else movaps %xmm0, -32 * SIZE(AO) movaps %xmm2, -28 * SIZE(AO) movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO2) movhps %xmm2, 2 * SIZE(CO2) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L71 ALIGN_4 .L80: testq $2, M BRANCH jle .L90 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd -32 * SIZE(BO), %xmm5 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L85 ALIGN_3 .L82: addps 
%xmm1, %xmm8 movsd -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movsd -30 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movsd -28 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -26 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movsd -26 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -24 * SIZE(AO), %xmm0 subq $-8 * SIZE, BO subq $-8 * SIZE, AO subq $1, %rax BRANCH jg .L82 ALIGN_3 .L85: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L88 ALIGN_3 .L86: addps %xmm1, %xmm8 movsd -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L86 ALIGN_3 .L88: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif addps %xmm1, %xmm8 #if defined(LN) || defined(LT) pshufd $0xd8, %xmm8, %xmm8 movaps -32 * SIZE(BO), %xmm0 #else movaps -32 * SIZE(AO), %xmm0 #endif subps %xmm8, %xmm0 movhlps %xmm0, %xmm1 #ifdef LN movaps -32 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 #endif #ifdef RT movaps -32 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movlps %xmm0, -32 * SIZE(BO) movlps %xmm1, -30 * SIZE(BO) unpcklps %xmm1, %xmm0 movlps %xmm0, (CO1) movhps %xmm0, (CO2) #else movlps %xmm0, -32 * SIZE(AO) movlps %xmm1, -30 * SIZE(AO) movsd %xmm0, (CO1) movsd %xmm1, (CO2) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L90: testq $1, M BRANCH jle .L99 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L95 ALIGN_3 .L92: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movsd -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x55, %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm2, %xmm9 movsd -30 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movsd -28 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x55, 
%xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm2, %xmm9 movsd -26 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L92 addps %xmm9, %xmm8 ALIGN_3 .L95: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L98 ALIGN_3 .L96: pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movsd -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L96 ALIGN_3 .L98: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif addps %xmm2, %xmm8 #if defined(LN) || defined(LT) movsd -32 * SIZE(BO), %xmm0 subps %xmm8, %xmm0 #else movsd -32 * SIZE(AO), %xmm0 subps %xmm8, %xmm0 #endif pshufd $0x55, %xmm0, %xmm1 pshufd $0x00, %xmm0, %xmm0 #if defined(LN) || defined(LT) movss -32 * SIZE(AO), %xmm8 mulss %xmm8, %xmm0 mulss %xmm8, %xmm1 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm1 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm1 #endif #ifdef RT movaps -32 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movss %xmm0, -32 * SIZE(BO) movss %xmm1, -31 * SIZE(BO) #else movss %xmm0, -32 * SIZE(AO) movss %xmm1, -31 * SIZE(AO) #endif movss %xmm0, (CO1) movss %xmm1, (CO2) #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L99: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L70: testq $4, N jle .L100 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 2), CO2 #ifndef RT leaq (C, LDC, 4), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $2, I NOBRANCH jle .L50 ALIGN_4 .L41: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht2 4 * SIZE(CO1, LDC, 1) xorps %xmm10, %xmm10 prefetcht2 4 * SIZE(CO2) xorps %xmm11, %xmm11 prefetcht2 4 * SIZE(CO2, LDC, 1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L45 ALIGN_3 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, 
%xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L42 ALIGN_3 .L45: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: addps %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L46 ALIGN_3 .L48: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif addps %xmm1, %xmm8 addps %xmm2, %xmm9 addps %xmm3, %xmm10 addps %xmm4, %xmm11 #if defined(LN) || defined(LT) movaps %xmm8, %xmm4 shufps $0x88, %xmm9, %xmm8 movaps %xmm10, %xmm5 shufps $0x88, %xmm11, %xmm10 shufps $0xdd, %xmm11, %xmm4 shufps $0xdd, %xmm9, %xmm5 movaps %xmm8, %xmm6 shufps $0x88, %xmm10, %xmm8 shufps $0xdd, %xmm6, %xmm10 movaps %xmm4, %xmm9 movaps %xmm5, %xmm11 shufps $0x22, %xmm5, %xmm9 shufps $0x77, %xmm4, %xmm11 movaps -32 * SIZE(BO), %xmm0 movaps -28 * SIZE(BO), %xmm1 movaps -24 * SIZE(BO), %xmm2 movaps -20 * SIZE(BO), %xmm3 #else movaps %xmm9, %xmm4 shufps $0xd8, %xmm8, %xmm9 shufps $0xd8, %xmm11, %xmm8 shufps $0xd8, %xmm10, %xmm11 shufps $0xd8, %xmm4, %xmm10 movaps %xmm8, %xmm4 shufps $0xd8, %xmm10, %xmm8 shufps $0xd8, %xmm4, %xmm10 movaps %xmm9, %xmm5 shufps $0xd8, %xmm11, %xmm9 shufps $0xd8, %xmm5, %xmm11 movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps -24 * SIZE(AO), %xmm2 movaps -20 * SIZE(AO), %xmm3 #endif subps %xmm8, %xmm0 subps %xmm9, %xmm1 subps %xmm10, %xmm2 subps %xmm11, %xmm3 #ifdef LN movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm0 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 
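/*
 * Note: the LN block above appears to be the backward-substitution step for
 * the packed 4x4 triangular panel of A, applied to xmm0-xmm3, which seem to
 * each hold one row of the 4x4 C tile (four right-hand sides in parallel).
 * Each pshufd broadcasts one packed entry into xmm15; the diagonal lanes are
 * multiplied directly (assuming the pack routine stores reciprocals of the
 * diagonal, so no division is needed here) and the off-diagonal products are
 * subtracted from the rows still to be solved.  Illustrative sketch only;
 * the names below are not from this file:
 *
 *     for (i = 3; i >= 0; i--) {
 *         x[i] *= inv_diag[i];
 *         for (j = 0; j < i; j++)
 *             x[j] -= a_packed[i][j] * x[i];
 *     }
 */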
#endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm3 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm3 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm3 movaps -28 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm3 movaps -24 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 movaps -20 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 #endif #ifdef RT movaps -20 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm0 movaps -24 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 movaps -28 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm0, -32 * SIZE(BO) movaps %xmm1, -28 * SIZE(BO) movaps %xmm2, -24 * SIZE(BO) movaps %xmm3, -20 * SIZE(BO) movaps %xmm0, %xmm8 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm8, %xmm1 movaps %xmm2, %xmm9 shufps $0x88, %xmm3, %xmm2 shufps $0xdd, %xmm9, %xmm3 movaps %xmm0, %xmm8 shufps $0x88, %xmm2, %xmm0 movaps %xmm1, %xmm9 shufps $0x22, %xmm3, %xmm1 shufps $0xdd, %xmm2, %xmm8 movaps %xmm8, %xmm2 shufps $0x77, %xmm3, %xmm9 movaps %xmm9, %xmm3 #else movaps %xmm0, -32 * SIZE(AO) movaps %xmm1, -28 * SIZE(AO) movaps %xmm2, -24 * SIZE(AO) movaps %xmm3, -20 * SIZE(AO) #endif leaq (LDC, LDC, 2), %rax movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 2 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO2) movhps %xmm2, 2 * SIZE(CO2) movsd %xmm3, 0 * SIZE(CO2, LDC, 1) movhps %xmm3, 2 * SIZE(CO2, LDC, 1) #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L41 ALIGN_4 .L50: testq $2, M BRANCH jle .L60 #ifdef LN 
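/*
 * Note: under LN the kernel appears to walk the M panels from the end of A,
 * so the instructions below rewind AORIG by 2*K elements (K iterations of a
 * 2-row panel, scaled to bytes by the 1 + BASE_SHIFT shift) before this
 * panel is solved.
 */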
movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movaps -32 * SIZE(BO), %xmm5 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_3 .L52: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -24 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -26 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -16 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -24 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $ -8 * SIZE, AO subq $1, %rax BRANCH jg .L52 ALIGN_3 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_3 .L56: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_3 .L58: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif addps %xmm1, %xmm8 addps %xmm2, %xmm9 #if defined(LN) || defined(LT) movaps %xmm8, %xmm4 shufps $0x88, %xmm9, %xmm8 shufps $0xdd, %xmm9, %xmm4 movaps -32 * SIZE(BO), %xmm0 movaps -28 * SIZE(BO), %xmm1 subps %xmm8, %xmm0 subps %xmm4, %xmm1 #else movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm2 subps %xmm8, %xmm0 subps %xmm9, %xmm2 movhlps %xmm0, %xmm1 movhlps %xmm2, %xmm3 #endif #ifdef LN movaps -32 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm3 movaps -28 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm3 movaps -24 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 movaps -20 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 #endif #ifdef RT movaps -20 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, 
%xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm0 movaps -24 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 movaps -28 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif leaq (LDC, LDC, 2), %rax #if defined(LN) || defined(LT) movaps %xmm0, -32 * SIZE(BO) movaps %xmm1, -28 * SIZE(BO) movaps %xmm0, %xmm4 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm4 movsd %xmm0, (CO1) movhps %xmm0, (CO1, LDC, 1) movsd %xmm4, (CO2) movhps %xmm4, (CO2, LDC, 1) #else movlhps %xmm1, %xmm0 movlhps %xmm3, %xmm2 movaps %xmm0, -32 * SIZE(AO) movaps %xmm2, -28 * SIZE(AO) movsd %xmm0, (CO1) movsd %xmm1, (CO1, LDC, 1) movsd %xmm2, (CO2) movsd %xmm3, (CO2, LDC, 1) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L60: testq $1, M BRANCH jle .L69 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #else movq B, BO #endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_3 .L62: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x55, %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm2, %xmm9 movaps -28 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -24 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x55, %xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm2, %xmm9 movaps -20 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 subq $-16 * SIZE, BO subq $ -4 * SIZE, AO subq $1, %rax BRANCH jg .L62 addps %xmm9, %xmm8 ALIGN_3 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_3 .L66: pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_3 .L68: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #endif addps %xmm2, %xmm8 #if defined(LN) || defined(LT) movaps -32 * SIZE(BO), %xmm0 subps %xmm8, %xmm0 #else movsd -32 * SIZE(AO), %xmm0 movhps -30 * SIZE(AO), %xmm0 subps %xmm8, %xmm0 pshufd $0xff, %xmm0, %xmm3 pshufd $0xaa, %xmm0, %xmm2 pshufd $0x55, %xmm0, %xmm1 pshufd $0x00, %xmm0, %xmm0 #endif #if defined(LN) || defined(LT) movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, 
%xmm0 pshufd $0x55, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm3 movaps -28 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm3 movaps -24 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm3 movaps -20 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm3 #endif #ifdef RT movaps -20 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm0 movaps -24 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm0 movaps -28 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm0 movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm0, -32 * SIZE(BO) pshufd $0xff, %xmm0, %xmm3 pshufd $0xaa, %xmm0, %xmm2 pshufd $0x55, %xmm0, %xmm1 pshufd $0x00, %xmm0, %xmm0 #else unpcklps %xmm1, %xmm0 unpcklps %xmm3, %xmm2 movlps %xmm0, -32 * SIZE(AO) movlps %xmm2, -30 * SIZE(AO) #endif movss %xmm0, (CO1) movss %xmm1, (CO1, LDC, 1) movss %xmm2, (CO2) movss %xmm3, (CO2, LDC, 1) #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L69: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif ALIGN_4 .L100: movq N, J sarq $3, J NOBRANCH jle .L999 ALIGN_4 .L10: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $3 + BASE_SHIFT, %rax subq %rax, B leaq (, LDC, 8), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 4), CO2 #ifndef RT leaq (C, LDC, 8), C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 3, %rax leaq (B, %rax), BB movq M, I sarq $2, I NOBRANCH jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 8), BO #else movq B, BO #endif prefetchnta -32 * SIZE(BB) subq $-16 * SIZE, BB xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 leaq (LDC, LDC, 2), %rax xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht2 4 * SIZE(CO1, LDC, 1) xorps %xmm10, %xmm10 prefetcht2 4 * SIZE(CO1, LDC, 2) xorps %xmm11, %xmm11 prefetcht2 4 * SIZE(CO1, %rax, 1) xorps %xmm12, %xmm12 prefetcht2 4 * SIZE(CO2) xorps %xmm13, %xmm13 prefetcht2 4 * 
SIZE(CO2, LDC, 1) xorps %xmm14, %xmm14 prefetcht2 4 * SIZE(CO2, LDC, 2) xorps %xmm15, %xmm15 prefetcht2 4 * SIZE(CO2, %rax, 1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm12 movaps -32 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm0, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm12 movaps -24 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm0, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 addps %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 addps %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm0, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 addps %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm0 addps %xmm1, %xmm12 movaps -8 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm0, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 addps %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 subq $-32 * SIZE, BO pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addps %xmm1, %xmm12 movaps -32 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm0, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $8, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), 
AO leaq (B, %rax, 8), BO #endif addps %xmm1, %xmm12 addps %xmm2, %xmm13 addps %xmm3, %xmm14 addps %xmm4, %xmm15 #if defined(LN) || defined(LT) movaps %xmm8, %xmm4 shufps $0x88, %xmm9, %xmm8 movaps %xmm10, %xmm5 shufps $0x88, %xmm11, %xmm10 shufps $0xdd, %xmm11, %xmm4 shufps $0xdd, %xmm9, %xmm5 movaps %xmm8, %xmm6 shufps $0x88, %xmm10, %xmm8 shufps $0xdd, %xmm6, %xmm10 movaps %xmm4, %xmm9 movaps %xmm5, %xmm11 shufps $0x22, %xmm5, %xmm9 shufps $0x77, %xmm4, %xmm11 movaps %xmm12, %xmm4 shufps $0x88, %xmm13, %xmm12 movaps %xmm14, %xmm5 shufps $0x88, %xmm15, %xmm14 shufps $0xdd, %xmm15, %xmm4 shufps $0xdd, %xmm13, %xmm5 movaps %xmm12, %xmm6 shufps $0x88, %xmm14, %xmm12 shufps $0xdd, %xmm6, %xmm14 movaps %xmm4, %xmm13 movaps %xmm5, %xmm15 shufps $0x22, %xmm5, %xmm13 shufps $0x77, %xmm4, %xmm15 movaps -32 * SIZE(BO), %xmm0 movaps -28 * SIZE(BO), %xmm4 movaps -24 * SIZE(BO), %xmm1 movaps -20 * SIZE(BO), %xmm5 movaps -16 * SIZE(BO), %xmm2 movaps -12 * SIZE(BO), %xmm6 movaps -8 * SIZE(BO), %xmm3 movaps -4 * SIZE(BO), %xmm7 #else movaps %xmm9, %xmm4 shufps $0xd8, %xmm8, %xmm9 shufps $0xd8, %xmm11, %xmm8 shufps $0xd8, %xmm10, %xmm11 shufps $0xd8, %xmm4, %xmm10 movaps %xmm8, %xmm4 shufps $0xd8, %xmm10, %xmm8 shufps $0xd8, %xmm4, %xmm10 movaps %xmm9, %xmm5 shufps $0xd8, %xmm11, %xmm9 shufps $0xd8, %xmm5, %xmm11 movaps %xmm13, %xmm4 shufps $0xd8, %xmm12, %xmm13 shufps $0xd8, %xmm15, %xmm12 shufps $0xd8, %xmm14, %xmm15 shufps $0xd8, %xmm4, %xmm14 movaps %xmm12, %xmm4 shufps $0xd8, %xmm14, %xmm12 shufps $0xd8, %xmm4, %xmm14 movaps %xmm13, %xmm5 shufps $0xd8, %xmm15, %xmm13 shufps $0xd8, %xmm5, %xmm15 movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps -24 * SIZE(AO), %xmm2 movaps -20 * SIZE(AO), %xmm3 movaps -16 * SIZE(AO), %xmm4 movaps -12 * SIZE(AO), %xmm5 movaps -8 * SIZE(AO), %xmm6 movaps -4 * SIZE(AO), %xmm7 #endif subps %xmm8, %xmm0 subps %xmm9, %xmm1 subps %xmm10, %xmm2 subps %xmm11, %xmm3 subps %xmm12, %xmm4 subps %xmm13, %xmm5 subps %xmm14, %xmm6 subps %xmm15, %xmm7 #ifdef LN movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 mulps %xmm15, %xmm7 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm1 pshufd $0x55, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm4 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 mulps %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm1 pshufd $0x55, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm4 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 mulps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 pshufd $0x00, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm4 movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 mulps %xmm15, %xmm4 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 mulps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0x55, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xaa, 
%xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm3 pshufd $0xff, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm7 movaps -28 * SIZE(AO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 mulps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm2 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm3 pshufd $0xff, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm7 movaps -24 * SIZE(AO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 mulps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 pshufd $0xff, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm7 movaps -20 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 mulps %xmm15, %xmm7 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm3 movaps -28 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm7 movaps -24 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm3 movaps -20 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm7 movaps -16 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 movaps -12 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm7 movaps -8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 movaps -4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm7 movaps 4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm7 movaps 12 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm7 movaps 20 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm7 movaps 28 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm7 #endif #ifdef RT 
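/*
 * Note: the RT path below appears to be backward substitution against the
 * packed 8x8 triangular block of B.  It starts at the last packed row
 * (28 * SIZE(BO)), scales the last column accumulator xmm7 by its diagonal
 * entry (assumed to be stored pre-inverted by the pack routine), subtracts
 * its contribution from xmm0-xmm6, then repeats for each earlier packed row
 * down to -32 * SIZE(BO).  Illustrative sketch only; the names below are not
 * from this file:
 *
 *     for (j = 7; j >= 0; j--) {
 *         c[j] *= inv_diag[j];
 *         for (i = 0; i < j; i++)
 *             c[i] -= b_packed[j][i] * c[j];
 *     }
 */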
movaps 28 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm7 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm4 movaps 24 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm0 movaps 20 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm4 movaps 16 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm0 movaps 12 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm4 movaps 8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm0 movaps 4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm4 movaps 0 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm0 movaps -8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm0 movaps -16 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 movaps -24 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm0, -32 * SIZE(BO) movaps %xmm4, -28 * SIZE(BO) movaps %xmm1, -24 * SIZE(BO) movaps %xmm5, -20 * SIZE(BO) movaps %xmm2, -16 * SIZE(BO) movaps %xmm6, -12 * SIZE(BO) movaps %xmm3, -8 * SIZE(BO) movaps %xmm7, -4 * SIZE(BO) movaps %xmm0, %xmm8 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm8, %xmm1 movaps %xmm2, %xmm9 shufps $0x88, %xmm3, %xmm2 shufps $0xdd, %xmm9, %xmm3 movaps %xmm0, %xmm8 shufps $0x88, %xmm2, %xmm0 movaps %xmm1, %xmm9 shufps $0x22, %xmm3, %xmm1 shufps $0xdd, %xmm2, %xmm8 movaps %xmm8, %xmm2 shufps $0x77, %xmm3, %xmm9 movaps %xmm9, %xmm3 movaps %xmm4, %xmm8 shufps $0x88, %xmm5, %xmm4 shufps $0xdd, %xmm8, %xmm5 movaps %xmm6, %xmm9 shufps $0x88, %xmm7, %xmm6 shufps $0xdd, %xmm9, %xmm7 movaps %xmm4, %xmm8 shufps $0x88, %xmm6, %xmm4 movaps %xmm5, %xmm9 shufps $0x22, %xmm7, %xmm5 shufps $0xdd, %xmm6, %xmm8 
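/*
 * Note: in this LN/LT store path the packed results were already written
 * back to BO just above; the shufps/movaps chain around this point appears
 * to re-interleave the solved values into column order so that the
 * movsd/movhps stores through CO1/CO2 below can write the 4x8 tile of C
 * directly.
 */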
movaps %xmm8, %xmm6 shufps $0x77, %xmm7, %xmm9 movaps %xmm9, %xmm7 #else movaps %xmm0, -32 * SIZE(AO) movaps %xmm1, -28 * SIZE(AO) movaps %xmm2, -24 * SIZE(AO) movaps %xmm3, -20 * SIZE(AO) movaps %xmm4, -16 * SIZE(AO) movaps %xmm5, -12 * SIZE(AO) movaps %xmm6, -8 * SIZE(AO) movaps %xmm7, -4 * SIZE(AO) #endif leaq (LDC, LDC, 2), %rax movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movsd %xmm1, 0 * SIZE(CO1, LDC, 1) movhps %xmm1, 2 * SIZE(CO1, LDC, 1) movsd %xmm2, 0 * SIZE(CO1, LDC, 2) movhps %xmm2, 2 * SIZE(CO1, LDC, 2) movsd %xmm3, 0 * SIZE(CO1, %rax, 1) movhps %xmm3, 2 * SIZE(CO1, %rax, 1) movsd %xmm4, 0 * SIZE(CO2) movhps %xmm4, 2 * SIZE(CO2) movsd %xmm5, 0 * SIZE(CO2, LDC, 1) movhps %xmm5, 2 * SIZE(CO2, LDC, 1) movsd %xmm6, 0 * SIZE(CO2, LDC, 2) movhps %xmm6, 2 * SIZE(CO2, LDC, 2) movsd %xmm7, 0 * SIZE(CO2, %rax, 1) movhps %xmm7, 2 * SIZE(CO2, %rax, 1) #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif #ifdef LN subq $4, KK #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L11 ALIGN_4 .L20: testq $2, M BRANCH jle .L30 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 8), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movaps -32 * SIZE(BO), %xmm5 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_3 .L22: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -28 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps -24 * SIZE(BO), %xmm5 movddup -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -20 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps -16 * SIZE(BO), %xmm5 movddup -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -12 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps -8 * SIZE(BO), %xmm5 movddup -26 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -4 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps 0 * SIZE(BO), %xmm5 movddup -24 * SIZE(AO), %xmm0 subq $-32 * SIZE, BO subq $ -8 * SIZE, AO subq $1, %rax BRANCH jg .L22 ALIGN_3 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd 
$0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -28 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps -24 * SIZE(BO), %xmm5 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_3 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $8, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 8), BO #endif addps %xmm1, %xmm8 addps %xmm2, %xmm9 addps %xmm3, %xmm10 addps %xmm4, %xmm11 #if defined(LN) || defined(LT) movaps %xmm8, %xmm4 shufps $0x88, %xmm9, %xmm8 shufps $0xdd, %xmm9, %xmm4 movaps %xmm10, %xmm5 shufps $0x88, %xmm11, %xmm10 shufps $0xdd, %xmm11, %xmm5 movaps -32 * SIZE(BO), %xmm0 movaps -28 * SIZE(BO), %xmm2 movaps -24 * SIZE(BO), %xmm1 movaps -20 * SIZE(BO), %xmm3 subps %xmm8, %xmm0 subps %xmm4, %xmm1 subps %xmm10, %xmm2 subps %xmm5, %xmm3 #else movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm2 movaps -24 * SIZE(AO), %xmm4 movaps -20 * SIZE(AO), %xmm6 subps %xmm8, %xmm0 subps %xmm9, %xmm2 subps %xmm10, %xmm4 subps %xmm11, %xmm6 movhlps %xmm0, %xmm1 movhlps %xmm2, %xmm3 movhlps %xmm4, %xmm5 movhlps %xmm6, %xmm7 #endif #ifdef LN movaps -32 * SIZE(AO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 mulps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm0 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 mulps %xmm15, %xmm2 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 mulps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm1 mulps %xmm15, %xmm3 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm3 movaps -28 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm0, %xmm15 subps %xmm15, %xmm7 movaps -24 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm3 movaps -20 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps %xmm15, %xmm7 movaps -16 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm3 movaps -12 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm7 movaps -8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 movaps -4 * 
SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm7 movaps 4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm7 movaps 12 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm7 movaps 20 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm7 movaps 28 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm7 #endif #ifdef RT movaps 28 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm7 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm4 movaps 24 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm7, %xmm15 subps %xmm15, %xmm0 movaps 20 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm4 movaps 16 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm6, %xmm15 subps %xmm15, %xmm0 movaps 12 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm4 movaps 8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm5, %xmm15 subps %xmm15, %xmm0 movaps 4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm4 movaps 0 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm4, %xmm15 subps %xmm15, %xmm0 movaps -8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulps %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm3, %xmm15 subps %xmm15, %xmm0 movaps -16 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulps %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm2, %xmm15 subps %xmm15, %xmm0 movaps -24 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulps %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulps %xmm1, %xmm15 subps 
%xmm15, %xmm0 movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif leaq (LDC, LDC, 2), %rax #if defined(LN) || defined(LT) movaps %xmm0, -32 * SIZE(BO) movaps %xmm2, -28 * SIZE(BO) movaps %xmm1, -24 * SIZE(BO) movaps %xmm3, -20 * SIZE(BO) movaps %xmm0, %xmm4 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm4 movaps %xmm2, %xmm5 unpcklps %xmm3, %xmm2 unpckhps %xmm3, %xmm5 movsd %xmm0, (CO1) movhps %xmm0, (CO1, LDC, 1) movsd %xmm4, (CO1, LDC, 2) movhps %xmm4, (CO1, %rax, 1) movsd %xmm2, (CO2) movhps %xmm2, (CO2, LDC, 1) movsd %xmm5, (CO2, LDC, 2) movhps %xmm5, (CO2, %rax, 1) #else movlhps %xmm1, %xmm0 movlhps %xmm3, %xmm2 movlhps %xmm5, %xmm4 movlhps %xmm7, %xmm6 movaps %xmm0, -32 * SIZE(AO) movaps %xmm2, -28 * SIZE(AO) movaps %xmm4, -24 * SIZE(AO) movaps %xmm6, -20 * SIZE(AO) movsd %xmm0, (CO1) movsd %xmm1, (CO1, LDC, 1) movsd %xmm2, (CO1, LDC, 2) movsd %xmm3, (CO1, %rax, 1) movsd %xmm4, (CO2) movsd %xmm5, (CO2, LDC, 1) movsd %xmm6, (CO2, LDC, 2) movsd %xmm7, (CO2, %rax, 1) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: testq $1, M BRANCH jle .L39 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 8), BO #else movq B, BO #endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 xorps %xmm8, %xmm8 xorps %xmm12, %xmm12 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_3 .L32: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -28 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 pshufd $0x55, %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movaps -24 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -20 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -16 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -12 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 pshufd $0x55, %xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movaps -8 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -4 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 subq $-32 * SIZE, BO subq $ -4 * SIZE, AO subq $1, %rax BRANCH jg .L32 ALIGN_3 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -28 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 addq $1 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_3 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $8, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 8), BO #endif addps %xmm2, %xmm8 addps %xmm3, %xmm12 #if defined(LN) || defined(LT) movaps -32 * SIZE(BO), %xmm0 movaps -28 * SIZE(BO), %xmm4 subps %xmm8, %xmm0 subps %xmm12, %xmm4 #else movsd -32 * SIZE(AO), %xmm0 movhps -30 * SIZE(AO), %xmm0 movsd -28 * SIZE(AO), %xmm4 
movhps -26 * SIZE(AO), %xmm4 subps %xmm8, %xmm0 subps %xmm12, %xmm4 pshufd $0xff, %xmm0, %xmm3 pshufd $0xaa, %xmm0, %xmm2 pshufd $0x55, %xmm0, %xmm1 pshufd $0x00, %xmm0, %xmm0 pshufd $0xff, %xmm4, %xmm7 pshufd $0xaa, %xmm4, %xmm6 pshufd $0x55, %xmm4, %xmm5 pshufd $0x00, %xmm4, %xmm4 #endif #if defined(LN) || defined(LT) movaps -32 * SIZE(AO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulps %xmm15, %xmm0 mulps %xmm15, %xmm4 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 pshufd $0x55, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm3 movaps -28 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulss %xmm0, %xmm15 subss %xmm15, %xmm7 movaps -24 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm3 movaps -20 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm7 movaps -16 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm2 pshufd $0xff, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm3 movaps -12 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm7 movaps -8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm3 movaps -4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm7 movaps 4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm4 pshufd $0x55, %xmm8, %xmm15 mulss %xmm4, %xmm15 subss %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm4, %xmm15 subss %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulss %xmm4, %xmm15 subss %xmm15, %xmm7 movaps 12 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm5 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm5, %xmm15 subss %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulss %xmm5, %xmm15 subss %xmm15, %xmm7 movaps 20 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm6 pshufd $0xff, %xmm8, %xmm15 mulss %xmm6, %xmm15 subss %xmm15, %xmm7 movaps 28 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm7 #endif #ifdef RT movaps 28 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm7 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm7, %xmm15 subss %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulss %xmm7, %xmm15 subss %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulss %xmm7, %xmm15 subss %xmm15, %xmm4 movaps 24 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm7, %xmm15 subss %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm7, %xmm15 subss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 
mulss %xmm7, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm7, %xmm15 subss %xmm15, %xmm0 movaps 20 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm6 pshufd $0x55, %xmm8, %xmm15 mulss %xmm6, %xmm15 subss %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulss %xmm6, %xmm15 subss %xmm15, %xmm4 movaps 16 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm6, %xmm15 subss %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm6, %xmm15 subss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm6, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm6, %xmm15 subss %xmm15, %xmm0 movaps 12 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm5 pshufd $0x00, %xmm8, %xmm15 mulss %xmm5, %xmm15 subss %xmm15, %xmm4 movaps 8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm5, %xmm15 subss %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm5, %xmm15 subss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm5, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm5, %xmm15 subss %xmm15, %xmm0 movaps 4 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm4 movaps 0 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm4, %xmm15 subss %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm4, %xmm15 subss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm4, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm4, %xmm15 subss %xmm15, %xmm0 movaps -8 * SIZE(BO), %xmm8 pshufd $0xff, %xmm8, %xmm15 mulss %xmm15, %xmm3 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm3, %xmm15 subss %xmm15, %xmm0 movaps -16 * SIZE(BO), %xmm8 pshufd $0xaa, %xmm8, %xmm15 mulss %xmm15, %xmm2 pshufd $0x55, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm2, %xmm15 subss %xmm15, %xmm0 movaps -24 * SIZE(BO), %xmm8 pshufd $0x55, %xmm8, %xmm15 mulss %xmm15, %xmm1 pshufd $0x00, %xmm8, %xmm15 mulss %xmm1, %xmm15 subss %xmm15, %xmm0 movaps -32 * SIZE(BO), %xmm8 pshufd $0x00, %xmm8, %xmm15 mulss %xmm15, %xmm0 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif leaq (LDC, LDC, 2), %rax #if defined(LN) || defined(LT) movaps %xmm0, -32 * SIZE(BO) movaps %xmm4, -28 * SIZE(BO) pshufd $0xff, %xmm0, %xmm3 pshufd $0xaa, %xmm0, %xmm2 pshufd $0x55, %xmm0, %xmm1 pshufd $0x00, %xmm0, %xmm0 pshufd $0xff, %xmm4, %xmm7 pshufd $0xaa, %xmm4, %xmm6 pshufd $0x55, %xmm4, %xmm5 pshufd $0x00, %xmm4, %xmm4 #else unpcklps %xmm1, %xmm0 unpcklps %xmm3, %xmm2 unpcklps %xmm5, %xmm4 unpcklps %xmm7, %xmm6 movlps %xmm0, -32 * SIZE(AO) movlps %xmm2, -30 * SIZE(AO) movlps %xmm4, -28 * SIZE(AO) movlps %xmm6, -26 * SIZE(AO) #endif movss %xmm0, (CO1) movss %xmm1, (CO1, LDC, 1) movss %xmm2, (CO1, LDC, 2) movss %xmm3, (CO1, %rax, 1) movss %xmm4, (CO2) movss %xmm5, (CO2, LDC, 1) movss %xmm6, (CO2, LDC, 2) movss %xmm7, (CO2, %rax, 1) #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 8), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $8, KK #endif #ifdef RT subq $8, KK #endif subq $1, J BRANCH jg .L10 ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 
16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/trsm_kernel_RT_8x4_sse.S000066400000000000000000003422631313527062700220270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %r13 #define BO %r14 #define CO1 %r15 #define CO2 %rbp #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp) #define OLD_C 56 + STACKSIZE(%rsp) #define OLD_LDC 64 + STACKSIZE(%rsp) #define OLD_OFFSET 72 + STACKSIZE(%rsp) #endif #define ALPHA 0(%rsp) #define OFFSET 16(%rsp) #define KK 24(%rsp) #define KKK 32(%rsp) #define AORIG 40(%rsp) #define BORIG 48(%rsp) #define BUFFER 128(%rsp) #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define movsd movlps #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #ifndef PREFETCH #define PREFETCH prefetcht0 #endif #ifndef PREFETCHW #define PREFETCHW prefetcht0 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 movaps %xmm3, %xmm0 #else movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 #endif movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movsd %xmm4, OFFSET movsd %xmm4, KK leaq (, LDC, SIZE), LDC #ifdef LN leaq (, M, SIZE), %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT leaq (, N, SIZE), %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif testq $1, N je .L50 #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $BASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax jle .L103 ALIGN_4 .L102: movsd 0 * SIZE(B), %xmm3 movhps 2 * SIZE(B), %xmm3 movsd 4 * SIZE(B), %xmm7 movhps 6 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L102 ALIGN_4 .L103: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax BRANCH jle .L110 ALIGN_4 .L104: movss 0 * 
SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 movaps %xmm0, 0 * SIZE(BO) addq $ 1 * SIZE, B addq $ 4 * SIZE, BO decq %rax jne .L104 ALIGN_4 .L110: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT subq LDC, C #endif movq C, CO1 # coffset1 = c #ifndef RT addq LDC, C #endif movq M, I sarq $3, I # i = (m >> 3) jle .L120 ALIGN_4 .L111: #ifdef LN movq K, %rax salq $3 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 32 * SIZE(AO), %xmm12 movaps 48 * SIZE(AO), %xmm14 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 PREFETCHW 4 * SIZE(CO1) pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L115 ALIGN_4 .L112: mulps %xmm9, %xmm8 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 mulps %xmm9, %xmm8 mulps 12 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps 64 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm9, %xmm10 mulps 20 * SIZE(AO), %xmm9 addps %xmm10, %xmm0 movaps 24 * SIZE(AO), %xmm10 addps %xmm9, %xmm4 movaps 12 * SIZE(BO), %xmm9 mulps %xmm9, %xmm10 mulps 28 * SIZE(AO), %xmm9 addps %xmm10, %xmm0 movaps 80 * SIZE(AO), %xmm10 addps %xmm9, %xmm4 movaps 32 * SIZE(BO), %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm11, %xmm12 mulps 36 * SIZE(AO), %xmm11 addps %xmm12, %xmm0 movaps 40 * SIZE(AO), %xmm12 addps %xmm11, %xmm4 movaps 20 * SIZE(BO), %xmm11 mulps %xmm11, %xmm12 mulps 44 * SIZE(AO), %xmm11 addps %xmm12, %xmm0 movaps 96 * SIZE(AO), %xmm12 addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm11, %xmm14 mulps 52 * SIZE(AO), %xmm11 addps %xmm14, %xmm0 movaps 56 * SIZE(AO), %xmm14 addps %xmm11, %xmm4 movaps 28 * SIZE(BO), %xmm11 mulps %xmm11, %xmm14 mulps 60 * SIZE(AO), %xmm11 addps %xmm14, %xmm0 movaps 112 * SIZE(AO), %xmm14 addps %xmm11, %xmm4 movaps 48 * SIZE(BO), %xmm11 addq $64 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L112 ALIGN_4 .L115: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulps %xmm9, %xmm8 mulps 4 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 addq $8 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L116 ALIGN_4 .L118: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $8, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $BASE_SHIFT, %rax leaq (AO, %rax, 8), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm8 unpcklps %xmm2, %xmm0 unpckhps %xmm2, %xmm8 movaps %xmm1, %xmm14 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm14 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, 
%xmm2 movaps %xmm8, %xmm3 unpcklps %xmm14, %xmm8 unpckhps %xmm14, %xmm3 movaps %xmm4, %xmm9 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm9 movaps %xmm5, %xmm14 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm14 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm9, %xmm7 unpcklps %xmm14, %xmm9 unpckhps %xmm14, %xmm7 movss 0 * SIZE(B), %xmm1 movss 1 * SIZE(B), %xmm5 movss 2 * SIZE(B), %xmm10 movss 3 * SIZE(B), %xmm11 movss 4 * SIZE(B), %xmm12 movss 5 * SIZE(B), %xmm13 movss 6 * SIZE(B), %xmm14 movss 7 * SIZE(B), %xmm15 subss %xmm0, %xmm1 subss %xmm2, %xmm5 subss %xmm8, %xmm10 subss %xmm3, %xmm11 subss %xmm4, %xmm12 subss %xmm6, %xmm13 subss %xmm9, %xmm14 subss %xmm7, %xmm15 #else movaps 0 * SIZE(AO), %xmm8 movaps 4 * SIZE(AO), %xmm9 subps %xmm0, %xmm8 subps %xmm4, %xmm9 #endif #ifdef LN movaps 60 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm15 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm15, %xmm8 subss %xmm8, %xmm14 pshufd $0x55, %xmm6, %xmm8 mulss %xmm15, %xmm8 subss %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulss %xmm15, %xmm8 subss %xmm8, %xmm12 movaps 56 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm15, %xmm8 subss %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm15, %xmm8 subss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm15, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm15, %xmm8 subss %xmm8, %xmm1 movaps 52 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm8, %xmm14 pshufd $0x55, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm12 movaps 48 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm1 movaps 44 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulss %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm12 movaps 40 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm1 movaps 36 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm12 movaps 32 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm1 movaps 24 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm1 movaps 16 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm1 movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm5 pshufd $0xaa, 
%xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm11 movaps 4 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm15 movaps 8 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulss %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm15 movaps 16 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm11 movaps 20 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm15 movaps 24 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm11 movaps 28 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm15 movaps 36 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm12 pshufd $0x55, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm13 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm6, %xmm8 mulss %xmm12, %xmm8 subss %xmm8, %xmm15 movaps 44 * SIZE(AO), %xmm7 pshufd $0x55, %xmm7, %xmm8 mulss %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulss %xmm13, %xmm8 subss %xmm8, %xmm15 movaps 52 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm8, %xmm14 pshufd $0xff, %xmm6, %xmm8 mulss %xmm14, %xmm8 subss %xmm8, %xmm15 movaps 60 * SIZE(AO), %xmm7 pshufd $0xff, %xmm7, %xmm8 mulss %xmm8, %xmm15 #endif #if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 mulps %xmm2, %xmm9 #endif #ifdef LN subq $8 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) movss %xmm5, 1 * SIZE(B) movss %xmm10, 2 * SIZE(B) movss %xmm11, 3 * SIZE(B) movss %xmm12, 4 * SIZE(B) movss %xmm13, 5 * SIZE(B) movss %xmm14, 6 * SIZE(B) movss %xmm15, 7 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 movaps %xmm2, 0 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 movaps %xmm2, 4 * SIZE(BO) pshufd $0x00, %xmm10, %xmm2 movaps %xmm2, 8 * SIZE(BO) pshufd $0x00, %xmm11, %xmm2 movaps %xmm2, 12 * SIZE(BO) pshufd $0x00, %xmm12, %xmm2 movaps %xmm2, 16 * SIZE(BO) pshufd $0x00, %xmm13, %xmm2 movaps %xmm2, 20 * SIZE(BO) pshufd $0x00, %xmm14, %xmm2 movaps %xmm2, 24 * SIZE(BO) pshufd $0x00, %xmm15, %xmm2 movaps %xmm2, 28 * SIZE(BO) #else movaps %xmm8, 0 * SIZE(AO) movaps %xmm9, 4 * SIZE(AO) #endif #if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 unpcklps %xmm5, %xmm1 unpcklps %xmm14, %xmm12 unpcklps %xmm15, %xmm13 unpcklps %xmm13, %xmm12 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * 
SIZE(CO1) movlps %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) #else movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm9, 4 * SIZE(CO1) movhps %xmm9, 6 * SIZE(CO1) #endif #ifndef LN addq $8 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 8), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $8, KK movq BORIG, B #endif #ifdef LT addq $8, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $3 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L111 ALIGN_4 .L120: testq $4, M je .L130 #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L125 ALIGN_4 .L122: mulps %xmm8, %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(AO), %xmm8 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 32 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm8 mulps 8 * SIZE(BO), %xmm8 addps %xmm8, %xmm2 movaps 12 * SIZE(AO), %xmm8 mulps 12 * SIZE(BO), %xmm8 addps %xmm8, %xmm3 movaps 32 * SIZE(AO), %xmm8 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm11 movaps 20 * SIZE(AO), %xmm10 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 48 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps 24 * SIZE(AO), %xmm10 mulps 24 * SIZE(BO), %xmm10 addps %xmm10, %xmm2 movaps 28 * SIZE(AO), %xmm10 mulps 28 * SIZE(BO), %xmm10 addps %xmm10, %xmm3 movaps 48 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L122 ALIGN_4 .L125: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L128 ALIGN_4 .L126: mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L126 ALIGN_4 .L128: addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm2, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $BASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm8 unpcklps %xmm2, %xmm0 unpckhps %xmm2, %xmm8 movaps %xmm1, %xmm14 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm14 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movaps %xmm8, %xmm3 unpcklps %xmm14, %xmm8 unpckhps %xmm14, %xmm3 movss 0 * SIZE(B), %xmm1 movss 1 * SIZE(B), %xmm5 movss 2 * SIZE(B), %xmm10 movss 3 * SIZE(B), %xmm11 subss %xmm0, %xmm1 subss %xmm2, %xmm5 subss %xmm8, %xmm10 subss %xmm3, %xmm11 #else movaps 0 * SIZE(AO), %xmm8 subps %xmm0, %xmm8 #endif #ifdef LN movaps 12 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm11, %xmm8 subss %xmm8, 
%xmm1 movaps 8 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulss %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm1 movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm11 movaps 4 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulss %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm11 movaps 8 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulss %xmm10, %xmm8 subss %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm11 #endif #if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) movss %xmm5, 1 * SIZE(B) movss %xmm10, 2 * SIZE(B) movss %xmm11, 3 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 movaps %xmm2, 0 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 movaps %xmm2, 4 * SIZE(BO) pshufd $0x00, %xmm10, %xmm2 movaps %xmm2, 8 * SIZE(BO) pshufd $0x00, %xmm11, %xmm2 movaps %xmm2, 12 * SIZE(BO) #else movaps %xmm8, 0 * SIZE(AO) #endif #if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 unpcklps %xmm5, %xmm1 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) #else movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L130: testq $2, M je .L140 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movsd 0 * SIZE(AO), %xmm8 movhps 2 * SIZE(AO), %xmm8 movsd 8 * SIZE(AO), %xmm10 movhps 10 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L135 ALIGN_4 .L132: mulps %xmm8, %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 6 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 16 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 32 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 movsd 10 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd 12 * SIZE(AO), %xmm10 
addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd 14 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movaps 28 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd 24 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movaps 48 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L132 ALIGN_4 .L135: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L138 ALIGN_4 .L136: mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L136 ALIGN_4 .L138: addps %xmm1, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) unpcklps %xmm2, %xmm0 unpcklps %xmm3, %xmm1 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movss 0 * SIZE(B), %xmm1 movss 1 * SIZE(B), %xmm5 subss %xmm0, %xmm1 subss %xmm2, %xmm5 #else #ifdef movsd xorps %xmm8, %xmm8 #endif movsd 0 * SIZE(AO), %xmm8 subps %xmm0, %xmm8 #endif #ifdef LN movaps 0 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulss %xmm5, %xmm8 subss %xmm8, %xmm1 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulss %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulss %xmm1, %xmm8 subss %xmm8, %xmm5 pshufd $0xff, %xmm6, %xmm8 mulss %xmm8, %xmm5 #endif #if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) movss %xmm5, 1 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 movaps %xmm2, 0 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 movaps %xmm2, 4 * SIZE(BO) #else movlps %xmm8, 0 * SIZE(AO) #endif #if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 unpcklps %xmm5, %xmm1 movlps %xmm1, 0 * SIZE(CO1) #else movlps %xmm8, 0 * SIZE(CO1) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L140: testq $1, M je .L149 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (AO, %rax, SIZE), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L145 ALIGN_4 .L142: mulss %xmm8, %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 1 * SIZE(AO), %xmm8 mulss 4 * SIZE(BO), %xmm8 addss %xmm9, %xmm0 movss 32 * SIZE(BO), %xmm9 addss %xmm8, %xmm1 movss 2 * SIZE(AO), %xmm8 mulss 8 * SIZE(BO), %xmm8 addss %xmm8, %xmm2 movss 3 * SIZE(AO), %xmm8 mulss 12 * SIZE(BO), %xmm8 addss %xmm8, %xmm3 movss 8 * SIZE(AO), %xmm8 mulss 
%xmm10, %xmm11 movss 5 * SIZE(AO), %xmm10 mulss 20 * SIZE(BO), %xmm10 addss %xmm11, %xmm0 movss 48 * SIZE(BO), %xmm11 addss %xmm10, %xmm1 movss 6 * SIZE(AO), %xmm10 mulss 24 * SIZE(BO), %xmm10 addss %xmm10, %xmm2 movss 7 * SIZE(AO), %xmm10 mulss 28 * SIZE(BO), %xmm10 addss %xmm10, %xmm3 movss 12 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L142 ALIGN_4 .L145: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L148 ALIGN_4 .L146: mulss %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addss %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 addq $1 * SIZE, AO addq $4 * SIZE, BO decq %rax jg .L146 ALIGN_4 .L148: addss %xmm1, %xmm0 addss %xmm3, %xmm2 addss %xmm2, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movss 0 * SIZE(B), %xmm1 subss %xmm0, %xmm1 #else movss 0 * SIZE(AO), %xmm8 subps %xmm0, %xmm8 #endif #if defined(LN) || defined(LT) mulss 0 * SIZE(AO), %xmm1 #endif #if defined(RN) || defined(RT) mulss 0 * SIZE(B), %xmm8 #endif #ifdef LN subq $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 movaps %xmm2, 0 * SIZE(BO) #else movss %xmm8, 0 * SIZE(AO) #endif #if defined(LN) || defined(LT) movss %xmm1, 0 * SIZE(CO1) #else movss %xmm8, 0 * SIZE(CO1) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $1 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L149: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 1), B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L50: testq $2, N je .L100 #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $1 + BASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L53 ALIGN_4 .L52: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L53: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L60 ALIGN_4 .L54: movsd 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) addq $2 * SIZE, B addq $8 * SIZE, BO decq %rax jne .L54 ALIGN_4 .L60: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq 
(, LDC, 2), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c #ifndef RT leaq (C, LDC, 2), C #endif movq M, I sarq $3, I # i = (m >> 3) jle .L70 ALIGN_4 .L61: #ifdef LN movq K, %rax salq $3 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 32 * SIZE(AO), %xmm12 movaps 48 * SIZE(AO), %xmm14 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 PREFETCHW 4 * SIZE(CO1) pxor %xmm4, %xmm4 PREFETCHW 4 * SIZE(CO2) pxor %xmm5, %xmm5 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulps %xmm8, %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 8 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 12 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 64 * SIZE(AO), %xmm8 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 16 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps 20 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps 24 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps 28 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 80 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps 80 * SIZE(AO), %xmm10 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 32 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 36 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm5 movaps 40 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 44 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 addps %xmm12, %xmm5 movaps 96 * SIZE(BO), %xmm13 movaps 96 * SIZE(AO), %xmm12 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 48 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 52 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 56 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 60 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 
movaps 112 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 112 * SIZE(AO), %xmm14 addq $64 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 8 * SIZE(AO), %xmm8 addq $8 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L68: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $8, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $1 + BASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm8 unpcklps %xmm2, %xmm0 unpckhps %xmm2, %xmm8 movaps %xmm1, %xmm14 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm14 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movaps %xmm8, %xmm3 unpcklps %xmm14, %xmm8 unpckhps %xmm14, %xmm3 movaps %xmm4, %xmm9 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm9 movaps %xmm5, %xmm14 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm14 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm9, %xmm7 unpcklps %xmm14, %xmm9 unpckhps %xmm14, %xmm7 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(B), %xmm1 #ifdef movsd xorps %xmm5, %xmm5 #endif movsd 2 * SIZE(B), %xmm5 #ifdef movsd xorps %xmm10, %xmm10 #endif movsd 4 * SIZE(B), %xmm10 #ifdef movsd xorps %xmm11, %xmm11 #endif movsd 6 * SIZE(B), %xmm11 #ifdef movsd xorps %xmm12, %xmm12 #endif movsd 8 * SIZE(B), %xmm12 #ifdef movsd xorps %xmm13, %xmm13 #endif movsd 10 * SIZE(B), %xmm13 #ifdef movsd xorps %xmm14, %xmm14 #endif movsd 12 * SIZE(B), %xmm14 #ifdef movsd xorps %xmm15, %xmm15 #endif movsd 14 * SIZE(B), %xmm15 subps %xmm0, %xmm1 subps %xmm2, %xmm5 subps %xmm8, %xmm10 subps %xmm3, %xmm11 subps %xmm4, %xmm12 subps %xmm6, %xmm13 subps %xmm9, %xmm14 subps %xmm7, %xmm15 #else movaps 0 * SIZE(AO), %xmm8 movaps 4 * SIZE(AO), %xmm9 movaps 8 * SIZE(AO), %xmm10 movaps 12 * SIZE(AO), %xmm11 subps %xmm0, %xmm8 subps %xmm4, %xmm9 subps %xmm1, %xmm10 subps %xmm5, %xmm11 #endif #ifdef LN movaps 60 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm15 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm14 pshufd $0x55, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm12 movaps 56 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm1 movaps 52 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm14 pshufd $0x55, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm12 movaps 48 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm1 movaps 44 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulps %xmm13, 
%xmm8 subps %xmm8, %xmm12 movaps 40 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm1 movaps 36 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm12 movaps 32 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm1 movaps 24 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm1 movaps 16 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm1 movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm11 movaps 4 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm15 movaps 8 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm15 movaps 16 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm11 movaps 20 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm15 movaps 24 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 movaps 28 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm15 movaps 36 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm12 pshufd $0x55, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm15 movaps 44 * 
SIZE(AO), %xmm7 pshufd $0x55, %xmm7, %xmm8 mulps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm15 movaps 52 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm14 pshufd $0xff, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm15 movaps 60 * SIZE(AO), %xmm7 pshufd $0xff, %xmm7, %xmm8 mulps %xmm8, %xmm15 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 mulps %xmm2, %xmm9 pshufd $0x55, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm10 pshufd $0x55, %xmm0, %xmm2 mulps %xmm9, %xmm2 subps %xmm2, %xmm11 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm10 mulps %xmm2, %xmm11 #endif #ifdef RT movaps 0 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm10 mulps %xmm2, %xmm11 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm8 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm11, %xmm2 subps %xmm2, %xmm9 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 mulps %xmm2, %xmm9 #endif #ifdef LN subq $8 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm1, 0 * SIZE(B) movlps %xmm5, 2 * SIZE(B) movlps %xmm10, 4 * SIZE(B) movlps %xmm11, 6 * SIZE(B) movlps %xmm12, 8 * SIZE(B) movlps %xmm13, 10 * SIZE(B) movlps %xmm14, 12 * SIZE(B) movlps %xmm15, 14 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 pshufd $0x55, %xmm5, %xmm3 movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) pshufd $0x00, %xmm10, %xmm2 pshufd $0x55, %xmm10, %xmm3 movaps %xmm2, 16 * SIZE(BO) movaps %xmm3, 20 * SIZE(BO) pshufd $0x00, %xmm11, %xmm2 pshufd $0x55, %xmm11, %xmm3 movaps %xmm2, 24 * SIZE(BO) movaps %xmm3, 28 * SIZE(BO) pshufd $0x00, %xmm12, %xmm2 pshufd $0x55, %xmm12, %xmm3 movaps %xmm2, 32 * SIZE(BO) movaps %xmm3, 36 * SIZE(BO) pshufd $0x00, %xmm13, %xmm2 pshufd $0x55, %xmm13, %xmm3 movaps %xmm2, 40 * SIZE(BO) movaps %xmm3, 44 * SIZE(BO) pshufd $0x00, %xmm14, %xmm2 pshufd $0x55, %xmm14, %xmm3 movaps %xmm2, 48 * SIZE(BO) movaps %xmm3, 52 * SIZE(BO) pshufd $0x00, %xmm15, %xmm2 pshufd $0x55, %xmm15, %xmm3 movaps %xmm2, 56 * SIZE(BO) movaps %xmm3, 60 * SIZE(BO) #else movaps %xmm8, 0 * SIZE(AO) movaps %xmm9, 4 * SIZE(AO) movaps %xmm10, 8 * SIZE(AO) movaps %xmm11, 12 * SIZE(AO) #endif #if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 unpcklps %xmm14, %xmm12 unpcklps %xmm15, %xmm13 movaps %xmm12, %xmm14 unpcklps %xmm13, %xmm12 unpckhps %xmm13, %xmm14 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO1, LDC, 1) movhps %xmm10, 2 * SIZE(CO1, LDC, 1) movlps %xmm14, 4 * SIZE(CO1, LDC, 1) movhps %xmm14, 6 * SIZE(CO1, LDC, 1) #else movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm9, 4 * SIZE(CO1) movhps %xmm9, 6 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO1, LDC, 1) movhps %xmm10, 2 * SIZE(CO1, LDC, 1) movlps %xmm11, 4 * SIZE(CO1, LDC, 1) movhps %xmm11, 6 * SIZE(CO1, LDC, 1) #endif #ifndef LN addq $8 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 8), AO #ifdef LT addq $16 * SIZE, B #endif #endif #ifdef LN subq $8, KK movq BORIG, B #endif #ifdef LT addq $8, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $3 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L61 ALIGN_4 .L70: testq $4, M je .L80 #ifdef LN movq K, %rax 
salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulps %xmm8, %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 8 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 20 * SIZE(BO), %xmm8 addps %xmm11, %xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm8, %xmm1 movaps 12 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps 32 * SIZE(AO), %xmm8 mulps %xmm10, %xmm13 mulps 36 * SIZE(BO), %xmm10 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm10, %xmm1 movaps 20 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 mulps 44 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps 24 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 52 * SIZE(BO), %xmm10 addps %xmm15, %xmm0 movaps 56 * SIZE(BO), %xmm15 addps %xmm10, %xmm1 movaps 28 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 60 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 48 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L76 ALIGN_4 .L78: addps %xmm2, %xmm0 addps %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $1 + BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm8 unpcklps %xmm2, %xmm0 unpckhps %xmm2, %xmm8 movaps %xmm1, %xmm14 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm14 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movaps %xmm8, %xmm3 unpcklps %xmm14, %xmm8 unpckhps %xmm14, %xmm3 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(B), %xmm1 #ifdef movsd xorps %xmm5, %xmm5 #endif movsd 2 * SIZE(B), %xmm5 #ifdef movsd xorps %xmm10, %xmm10 #endif movsd 4 * SIZE(B), %xmm10 #ifdef movsd xorps %xmm11, %xmm11 #endif movsd 6 * SIZE(B), %xmm11 subps %xmm0, %xmm1 subps %xmm2, %xmm5 subps %xmm8, %xmm10 subps %xmm3, %xmm11 #else movaps 0 * SIZE(AO), %xmm8 movaps 4 * SIZE(AO), %xmm10 subps %xmm0, %xmm8 subps %xmm1, %xmm10 #endif #ifdef LN movaps 12 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm11, 
%xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm1 movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm11 movaps 4 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm11 movaps 8 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 pshufd $0x55, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm10 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm10 #endif #ifdef RT movaps 0 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm8 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm1, 0 * SIZE(B) movlps %xmm5, 2 * SIZE(B) movlps %xmm10, 4 * SIZE(B) movlps %xmm11, 6 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 pshufd $0x55, %xmm5, %xmm3 movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) pshufd $0x00, %xmm10, %xmm2 pshufd $0x55, %xmm10, %xmm3 movaps %xmm2, 16 * SIZE(BO) movaps %xmm3, 20 * SIZE(BO) pshufd $0x00, %xmm11, %xmm2 pshufd $0x55, %xmm11, %xmm3 movaps %xmm2, 24 * SIZE(BO) movaps %xmm3, 28 * SIZE(BO) #else movaps %xmm8, 0 * SIZE(AO) movaps %xmm10, 4 * SIZE(AO) #endif #if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO1, LDC, 1) movhps %xmm10, 2 * SIZE(CO1, LDC, 1) #else movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO1, LDC, 1) movhps %xmm10, 2 * SIZE(CO1, LDC, 1) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $ 8 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L80: testq $2, M je .L90 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif #ifdef movsd xorps %xmm8, %xmm8 #endif movsd 0 * SIZE(AO), %xmm8 #ifdef movsd xorps %xmm10, %xmm10 #endif movsd 8 * SIZE(AO), %xmm10 #ifdef 
movsd xorps %xmm9, %xmm9 #endif movsd 0 * SIZE(BO), %xmm9 #ifdef movsd xorps %xmm11, %xmm11 #endif movsd 16 * SIZE(BO), %xmm11 #ifdef movsd xorps %xmm13, %xmm13 #endif movsd 32 * SIZE(BO), %xmm13 #ifdef movsd xorps %xmm15, %xmm15 #endif movsd 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L85 ALIGN_4 .L82: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd 6 * SIZE(AO), %xmm8 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movaps 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd 16 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movaps 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd 10 * SIZE(AO), %xmm10 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movaps 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd 12 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movaps 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd 14 * SIZE(AO), %xmm10 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movaps 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd 24 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movaps 112 * SIZE(BO), %xmm15 addq $16 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L82 ALIGN_4 .L85: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L88 ALIGN_4 .L86: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L86 ALIGN_4 .L88: addps %xmm2, %xmm0 addps %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $1 + BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) unpcklps %xmm2, %xmm0 unpcklps %xmm3, %xmm1 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(B), %xmm1 #ifdef movsd xorps %xmm5, %xmm5 #endif movsd 2 * SIZE(B), %xmm5 subps %xmm0, %xmm1 subps %xmm2, %xmm5 #else #ifdef movsd xorps %xmm8, %xmm8 #endif movsd 0 * SIZE(AO), %xmm8 #ifdef movsd xorps %xmm10, %xmm10 #endif movsd 2 * SIZE(AO), %xmm10 subps %xmm0, %xmm8 subps %xmm1, %xmm10 #endif #ifdef LN movaps 0 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm1 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm5 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm5 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd 
$0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 pshufd $0x55, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm10 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm10 #endif #ifdef RT movaps 0 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm8 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm1, 0 * SIZE(B) movlps %xmm5, 2 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 pshufd $0x55, %xmm5, %xmm3 movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) #else movlps %xmm8, 0 * SIZE(AO) movlps %xmm10, 2 * SIZE(AO) #endif #if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 movlps %xmm1, 0 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO1, LDC, 1) #else movlps %xmm8, 0 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO1, LDC, 1) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $ 4 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L90: testq $1, M je .L99 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (AO, %rax, SIZE), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 movss 32 * SIZE(BO), %xmm13 movss 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L95 ALIGN_4 .L92: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movss 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movss 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movss 3 * SIZE(AO), %xmm8 addps %xmm11, %xmm1 movss 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movss 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movss 8 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movss 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movss 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movss 5 * SIZE(AO), %xmm10 addps %xmm13, %xmm1 movss 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movss 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movss 6 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movss 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movss 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movss 7 * SIZE(AO), %xmm10 addps %xmm15, %xmm1 movss 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movss 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movss 12 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movss 112 * SIZE(BO), %xmm15 addq $ 8 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L92 ALIGN_4 .L95: #if defined(LT) || 
defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L96 ALIGN_4 .L98: addss %xmm2, %xmm0 addss %xmm3, %xmm1 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) unpcklps %xmm1, %xmm0 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(B), %xmm1 subps %xmm0, %xmm1 #else movss 0 * SIZE(AO), %xmm8 movss 1 * SIZE(AO), %xmm10 subss %xmm0, %xmm8 subss %xmm1, %xmm10 #endif #if defined(LN) || defined(LT) movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulss %xmm2, %xmm8 pshufd $0x55, %xmm0, %xmm2 mulss %xmm8, %xmm2 subss %xmm2, %xmm10 pshufd $0xff, %xmm0, %xmm2 mulss %xmm2, %xmm10 #endif #ifdef RT movaps 0 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulss %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulss %xmm10, %xmm2 subss %xmm2, %xmm8 pshufd $0x00, %xmm0, %xmm2 mulss %xmm2, %xmm8 #endif #ifdef LN subq $1 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm1, 0 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) #else movss %xmm8, 0 * SIZE(AO) movss %xmm10, 1 * SIZE(AO) #endif #if defined(LN) || defined(LT) unpcklps %xmm10, %xmm1 unpcklps %xmm11, %xmm5 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 movss %xmm1, 0 * SIZE(CO1) movss %xmm10, 0 * SIZE(CO1, LDC, 1) #else movss %xmm8, 0 * SIZE(CO1) movss %xmm10, 0 * SIZE(CO1, LDC, 1) #endif #ifndef LN addq $1 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (AO, %rax, SIZE), AO #ifdef LT addq $ 2 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L99: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 2), B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L100: movq N, J sarq $2, J # j = (n >> 2) jle .L999 .L01: /* Copying to Sub Buffer */ #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $2 + BASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L03 ALIGN_4 .L02: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 movaps 8 * SIZE(B), %xmm11 movaps 12 * SIZE(B), %xmm15 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * 
SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) pshufd $0x00, %xmm11, %xmm8 pshufd $0x55, %xmm11, %xmm9 pshufd $0xaa, %xmm11, %xmm10 pshufd $0xff, %xmm11, %xmm11 pshufd $0x00, %xmm15, %xmm12 pshufd $0x55, %xmm15, %xmm13 pshufd $0xaa, %xmm15, %xmm14 pshufd $0xff, %xmm15, %xmm15 movaps %xmm8, 32 * SIZE(BO) movaps %xmm9, 36 * SIZE(BO) movaps %xmm10, 40 * SIZE(BO) movaps %xmm11, 44 * SIZE(BO) movaps %xmm12, 48 * SIZE(BO) movaps %xmm13, 52 * SIZE(BO) movaps %xmm14, 56 * SIZE(BO) movaps %xmm15, 60 * SIZE(BO) addq $16 * SIZE, B addq $64 * SIZE, BO decq %rax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movaps 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) addq $ 4 * SIZE, B addq $16 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L10: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 4), C #endif movq M, I sarq $3, I # i = (m >> 3) jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $3 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(BO), %xmm9 movaps 4 * SIZE(BO), %xmm11 movaps 8 * SIZE(BO), %xmm13 movaps 16 * SIZE(BO), %xmm15 movaps 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movaps 4 * SIZE(AO), %xmm10 pxor %xmm1, %xmm1 movaps 8 * SIZE(AO), %xmm12 pxor %xmm2, %xmm2 movaps 12 * SIZE(AO), %xmm14 pxor %xmm3, %xmm3 PREFETCHW 7 * SIZE(CO1) pxor %xmm4, %xmm4 PREFETCHW 7 * SIZE(CO2) pxor %xmm5, %xmm5 PREFETCHW 7 * SIZE(CO1, LDC, 2) pxor %xmm6, %xmm6 PREFETCHW 7 * SIZE(CO2, LDC, 2) pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L15 ALIGN_4 .L12: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 4 * SIZE(BO), %xmm11 mulps %xmm8, %xmm13 mulps 12 * SIZE(BO), %xmm8 addps %xmm13, %xmm2 movaps 8 * SIZE(BO), %xmm13 addps %xmm8, %xmm3 movaps 16 * SIZE(AO), %xmm8 mulps %xmm10, %xmm9 addps %xmm9, %xmm4 movaps 32 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 addps %xmm11, %xmm5 movaps 20 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 mulps 12 * SIZE(BO), %xmm10 addps %xmm13, %xmm6 movaps 24 * SIZE(BO), %xmm13 addps %xmm10, %xmm7 movaps 20 * SIZE(AO), %xmm10 mulps %xmm12, %xmm15 addps %xmm15, %xmm0 movaps 16 * SIZE(BO), %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm1 movaps 20 * SIZE(BO), %xmm11 mulps %xmm12, %xmm13 mulps 28 * SIZE(BO), %xmm12 addps %xmm13, %xmm2 movaps 24 * SIZE(BO), %xmm13 addps %xmm12, %xmm3 movaps 24 * SIZE(AO), %xmm12 mulps %xmm14, %xmm15 addps %xmm15, %xmm4 movaps 48 * SIZE(BO), %xmm15 mulps %xmm14, %xmm11 addps %xmm11, %xmm5 movaps 36 * SIZE(BO), %xmm11 mulps %xmm14, %xmm13 mulps 28 * SIZE(BO), %xmm14 addps %xmm13, %xmm6 movaps 40 * SIZE(BO), %xmm13 addps %xmm14, %xmm7 movaps 28 * SIZE(AO), %xmm14 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 32 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 36 * SIZE(BO), %xmm11 mulps %xmm8, %xmm13 mulps 44 * SIZE(BO), %xmm8 
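/* .L12: main inner loop of the 8x4 block, unrolled four deep over K. Each K step multiplies
   two 4-float strips of A (eight rows) by B values that the .L02/.L04 copy loops above have
   already splatted four-wide into BUFFER, accumulating into %xmm0-%xmm7. */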
addps %xmm13, %xmm2 movaps 40 * SIZE(BO), %xmm13 addps %xmm8, %xmm3 movaps 32 * SIZE(AO), %xmm8 mulps %xmm10, %xmm9 addps %xmm9, %xmm4 movaps 64 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 addps %xmm11, %xmm5 movaps 52 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 mulps 44 * SIZE(BO), %xmm10 addps %xmm13, %xmm6 movaps 56 * SIZE(BO), %xmm13 addps %xmm10, %xmm7 movaps 36 * SIZE(AO), %xmm10 mulps %xmm12, %xmm15 addps %xmm15, %xmm0 movaps 48 * SIZE(BO), %xmm15 mulps %xmm12, %xmm11 addps %xmm11, %xmm1 movaps 52 * SIZE(BO), %xmm11 mulps %xmm12, %xmm13 mulps 60 * SIZE(BO), %xmm12 addps %xmm13, %xmm2 movaps 56 * SIZE(BO), %xmm13 addps %xmm12, %xmm3 movaps 40 * SIZE(AO), %xmm12 mulps %xmm14, %xmm15 addps %xmm15, %xmm4 movaps 80 * SIZE(BO), %xmm15 mulps %xmm14, %xmm11 addps %xmm11, %xmm5 movaps 68 * SIZE(BO), %xmm11 mulps %xmm14, %xmm13 mulps 60 * SIZE(BO), %xmm14 addps %xmm13, %xmm6 movaps 72 * SIZE(BO), %xmm13 addps %xmm14, %xmm7 movaps 44 * SIZE(AO), %xmm14 addq $32 * SIZE, AO addq $64 * SIZE, BO decq %rax jg .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 8 * SIZE(AO), %xmm8 mulps %xmm10, %xmm9 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm5 movaps 8 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 mulps 12 * SIZE(BO), %xmm10 addps %xmm9, %xmm6 movaps 16 * SIZE(BO), %xmm9 addps %xmm10, %xmm7 movaps 12 * SIZE(AO), %xmm10 addq $8 * SIZE, AO addq $16 * SIZE, BO decq %rax jg .L16 ALIGN_4 .L18: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $8, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $2 + BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm8 unpcklps %xmm2, %xmm0 unpckhps %xmm2, %xmm8 movaps %xmm1, %xmm14 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm14 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movaps %xmm8, %xmm3 unpcklps %xmm14, %xmm8 unpckhps %xmm14, %xmm3 movaps %xmm4, %xmm9 unpcklps %xmm6, %xmm4 unpckhps %xmm6, %xmm9 movaps %xmm5, %xmm14 unpcklps %xmm7, %xmm5 unpckhps %xmm7, %xmm14 movaps %xmm4, %xmm6 unpcklps %xmm5, %xmm4 unpckhps %xmm5, %xmm6 movaps %xmm9, %xmm7 unpcklps %xmm14, %xmm9 unpckhps %xmm14, %xmm7 movaps 0 * SIZE(B), %xmm1 movaps 4 * SIZE(B), %xmm5 movaps 8 * SIZE(B), %xmm10 movaps 12 * SIZE(B), %xmm11 movaps 16 * SIZE(B), %xmm12 movaps 20 * SIZE(B), %xmm13 movaps 24 * SIZE(B), %xmm14 movaps 28 * SIZE(B), %xmm15 subps %xmm0, %xmm1 subps %xmm2, %xmm5 subps %xmm8, %xmm10 subps %xmm3, %xmm11 subps %xmm4, %xmm12 subps %xmm6, %xmm13 subps %xmm9, %xmm14 subps %xmm7, %xmm15 #else movaps 0 * SIZE(AO), %xmm8 movaps 4 * SIZE(AO), %xmm9 movaps 8 * SIZE(AO), %xmm10 movaps 12 * SIZE(AO), %xmm11 movaps 16 * SIZE(AO), %xmm12 movaps 20 * SIZE(AO), %xmm13 movaps 24 * SIZE(AO), %xmm14 movaps 28 * SIZE(AO), %xmm15 subps %xmm0, %xmm8 subps %xmm4, %xmm9 subps %xmm1, %xmm10 subps %xmm5, %xmm11 subps %xmm2, %xmm12 subps %xmm6, %xmm13 subps %xmm3, %xmm14 subps %xmm7, %xmm15 #endif #ifdef LN movaps 60 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm15 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm14 pshufd $0x55, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps 
%xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm12 movaps 56 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm15, %xmm8 subps %xmm8, %xmm1 movaps 52 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm14 pshufd $0x55, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm12 movaps 48 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm1 movaps 44 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm13 pshufd $0x00, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm12 movaps 40 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm1 movaps 36 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm12 movaps 32 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm1 movaps 24 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm1 movaps 16 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm1 movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm11 movaps 4 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm15 movaps 8 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm15 movaps 16 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 
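/* LT branch of the block solve: forward substitution of the 8x4 block against the packed
   triangular panel of A (the LN/LT/RN/RT structure here is that of a single-precision TRSM
   kernel). The diagonal factors appear to be stored pre-inverted by the packing stage, so
   each elimination step is a pshufd broadcast plus mulps/subps rather than a division. */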
mulps %xmm10, %xmm8 subps %xmm8, %xmm11 movaps 20 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm15 movaps 24 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 movaps 28 * SIZE(AO), %xmm7 pshufd $0x00, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm12 pshufd $0x55, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm15 movaps 36 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm12 pshufd $0x55, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm13 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm6, %xmm8 mulps %xmm12, %xmm8 subps %xmm8, %xmm15 movaps 44 * SIZE(AO), %xmm7 pshufd $0x55, %xmm7, %xmm8 mulps %xmm8, %xmm13 pshufd $0xaa, %xmm7, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm14 pshufd $0xff, %xmm7, %xmm8 mulps %xmm13, %xmm8 subps %xmm8, %xmm15 movaps 52 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm14 pshufd $0xff, %xmm6, %xmm8 mulps %xmm14, %xmm8 subps %xmm8, %xmm15 movaps 60 * SIZE(AO), %xmm7 pshufd $0xff, %xmm7, %xmm8 mulps %xmm8, %xmm15 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 mulps %xmm2, %xmm9 pshufd $0x55, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm10 pshufd $0x55, %xmm0, %xmm2 mulps %xmm9, %xmm2 subps %xmm2, %xmm11 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm12 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm9, %xmm2 subps %xmm2, %xmm13 pshufd $0xff, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm14 pshufd $0xff, %xmm0, %xmm2 mulps %xmm9, %xmm2 subps %xmm2, %xmm15 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulps %xmm2, %xmm10 mulps %xmm2, %xmm11 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm12 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm11, %xmm2 subps %xmm2, %xmm13 pshufd $0xff, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm14 pshufd $0xff, %xmm0, %xmm2 mulps %xmm11, %xmm2 subps %xmm2, %xmm15 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm2, %xmm12 mulps %xmm2, %xmm13 pshufd $0xff, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm14 pshufd $0xff, %xmm0, %xmm2 mulps %xmm13, %xmm2 subps %xmm2, %xmm15 movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm14 mulps %xmm2, %xmm15 #endif #ifdef RT movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm14 mulps %xmm2, %xmm15 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm12 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm15, %xmm2 subps %xmm2, %xmm13 pshufd $0x55, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm10 pshufd $0x55, %xmm0, %xmm2 mulps %xmm15, %xmm2 subps %xmm2, %xmm11 pshufd $0x00, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm8 pshufd $0x00, %xmm0, %xmm2 mulps %xmm15, %xmm2 subps %xmm2, %xmm9 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm2, %xmm12 mulps %xmm2, %xmm13 pshufd $0x55, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm10 pshufd $0x55, %xmm0, %xmm2 mulps %xmm13, %xmm2 subps %xmm2, %xmm11 pshufd $0x00, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm8 pshufd $0x00, %xmm0, %xmm2 mulps %xmm13, %xmm2 subps %xmm2, %xmm9 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulps %xmm2, %xmm10 mulps %xmm2, 
%xmm11 pshufd $0x00, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm8 pshufd $0x00, %xmm0, %xmm2 mulps %xmm11, %xmm2 subps %xmm2, %xmm9 movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 mulps %xmm2, %xmm9 #endif #ifdef LN subq $8 * SIZE, CO1 subq $8 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm1, 0 * SIZE(B) movaps %xmm5, 4 * SIZE(B) movaps %xmm10, 8 * SIZE(B) movaps %xmm11, 12 * SIZE(B) movaps %xmm12, 16 * SIZE(B) movaps %xmm13, 20 * SIZE(B) movaps %xmm14, 24 * SIZE(B) movaps %xmm15, 28 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 pshufd $0xaa, %xmm1, %xmm4 pshufd $0xff, %xmm1, %xmm6 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) movaps %xmm4, 8 * SIZE(BO) movaps %xmm6, 12 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 pshufd $0x55, %xmm5, %xmm3 pshufd $0xaa, %xmm5, %xmm4 pshufd $0xff, %xmm5, %xmm6 movaps %xmm2, 16 * SIZE(BO) movaps %xmm3, 20 * SIZE(BO) movaps %xmm4, 24 * SIZE(BO) movaps %xmm6, 28 * SIZE(BO) pshufd $0x00, %xmm10, %xmm2 pshufd $0x55, %xmm10, %xmm3 pshufd $0xaa, %xmm10, %xmm4 pshufd $0xff, %xmm10, %xmm6 movaps %xmm2, 32 * SIZE(BO) movaps %xmm3, 36 * SIZE(BO) movaps %xmm4, 40 * SIZE(BO) movaps %xmm6, 44 * SIZE(BO) pshufd $0x00, %xmm11, %xmm2 pshufd $0x55, %xmm11, %xmm3 pshufd $0xaa, %xmm11, %xmm4 pshufd $0xff, %xmm11, %xmm6 movaps %xmm2, 48 * SIZE(BO) movaps %xmm3, 52 * SIZE(BO) movaps %xmm4, 56 * SIZE(BO) movaps %xmm6, 60 * SIZE(BO) pshufd $0x00, %xmm12, %xmm2 pshufd $0x55, %xmm12, %xmm3 pshufd $0xaa, %xmm12, %xmm4 pshufd $0xff, %xmm12, %xmm6 movaps %xmm2, 64 * SIZE(BO) movaps %xmm3, 68 * SIZE(BO) movaps %xmm4, 72 * SIZE(BO) movaps %xmm6, 76 * SIZE(BO) pshufd $0x00, %xmm13, %xmm2 pshufd $0x55, %xmm13, %xmm3 pshufd $0xaa, %xmm13, %xmm4 pshufd $0xff, %xmm13, %xmm6 movaps %xmm2, 80 * SIZE(BO) movaps %xmm3, 84 * SIZE(BO) movaps %xmm4, 88 * SIZE(BO) movaps %xmm6, 92 * SIZE(BO) pshufd $0x00, %xmm14, %xmm2 pshufd $0x55, %xmm14, %xmm3 pshufd $0xaa, %xmm14, %xmm4 pshufd $0xff, %xmm14, %xmm6 movaps %xmm2, 96 * SIZE(BO) movaps %xmm3, 100 * SIZE(BO) movaps %xmm4, 104 * SIZE(BO) movaps %xmm6, 108 * SIZE(BO) pshufd $0x00, %xmm15, %xmm2 pshufd $0x55, %xmm15, %xmm3 pshufd $0xaa, %xmm15, %xmm4 pshufd $0xff, %xmm15, %xmm6 movaps %xmm2, 112 * SIZE(BO) movaps %xmm3, 116 * SIZE(BO) movaps %xmm4, 120 * SIZE(BO) movaps %xmm6, 124 * SIZE(BO) #else movaps %xmm8, 0 * SIZE(AO) movaps %xmm9, 4 * SIZE(AO) movaps %xmm10, 8 * SIZE(AO) movaps %xmm11, 12 * SIZE(AO) movaps %xmm12, 16 * SIZE(AO) movaps %xmm13, 20 * SIZE(AO) movaps %xmm14, 24 * SIZE(AO) movaps %xmm15, 28 * SIZE(AO) #endif #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm10, %xmm1 unpckhps %xmm10, %xmm0 movaps %xmm5, %xmm7 unpcklps %xmm11, %xmm5 unpckhps %xmm11, %xmm7 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 movaps %xmm0, %xmm11 unpcklps %xmm7, %xmm0 unpckhps %xmm7, %xmm11 movaps %xmm12, %xmm2 unpcklps %xmm14, %xmm12 unpckhps %xmm14, %xmm2 movaps %xmm13, %xmm7 unpcklps %xmm15, %xmm13 unpckhps %xmm15, %xmm7 movaps %xmm12, %xmm14 unpcklps %xmm13, %xmm12 unpckhps %xmm13, %xmm14 movaps %xmm2, %xmm15 unpcklps %xmm7, %xmm2 unpckhps %xmm7, %xmm15 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) movlps %xmm14, 4 * SIZE(CO2) movhps %xmm14, 6 * SIZE(CO2) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movhps %xmm0, 2 * SIZE(CO1, LDC, 2) movlps %xmm2, 4 * SIZE(CO1, LDC, 2) movhps %xmm2, 6 * SIZE(CO1, LDC, 2) movlps %xmm11, 0 * SIZE(CO2, LDC, 2) 
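/* Write-back of the solved 8x4 block: results are stored both to the packed buffers
   (B and the splatted BO copy, or AO in the RN/RT case) so later blocks see the solved
   values, and to C itself two floats at a time via movlps/movhps across CO1, CO2 and the
   two LDC*2 offsets. */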
movhps %xmm11, 2 * SIZE(CO2, LDC, 2) movlps %xmm15, 4 * SIZE(CO2, LDC, 2) movhps %xmm15, 6 * SIZE(CO2, LDC, 2) #else movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm9, 4 * SIZE(CO1) movhps %xmm9, 6 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) movlps %xmm11, 4 * SIZE(CO2) movhps %xmm11, 6 * SIZE(CO2) movlps %xmm12, 0 * SIZE(CO1, LDC, 2) movhps %xmm12, 2 * SIZE(CO1, LDC, 2) movlps %xmm13, 4 * SIZE(CO1, LDC, 2) movhps %xmm13, 6 * SIZE(CO1, LDC, 2) movlps %xmm14, 0 * SIZE(CO2, LDC, 2) movhps %xmm14, 2 * SIZE(CO2, LDC, 2) movlps %xmm15, 4 * SIZE(CO2, LDC, 2) movhps %xmm15, 6 * SIZE(CO2, LDC, 2) #endif #ifndef LN addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 8), AO #ifdef LT addq $32 * SIZE, B #endif #endif #ifdef LN subq $8, KK movq BORIG, B #endif #ifdef LT addq $8, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $3 + BASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L11 ALIGN_4 .L20: testq $4, M je .L30 #ifdef LN movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps 8 * SIZE(AO), %xmm8 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 mulps 44 * SIZE(BO), %xmm8 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm8, %xmm3 movaps 12 * SIZE(AO), %xmm8 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 mulps 60 * SIZE(BO), %xmm8 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm8, %xmm3 movaps 32 * SIZE(AO), %xmm8 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movaps 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movaps 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 mulps 76 * SIZE(BO), %xmm10 addps %xmm9, %xmm2 movaps 128 * SIZE(BO), %xmm9 addps %xmm10, %xmm3 movaps 20 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movaps 84 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movaps 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 mulps 92 * SIZE(BO), %xmm10 addps %xmm11, %xmm2 movaps 144 * SIZE(BO), 
%xmm11 addps %xmm10, %xmm3 movaps 24 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movaps 104 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 mulps 108 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 160 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps 28 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movaps 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 mulps 124 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 176 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 48 * SIZE(AO), %xmm10 addq $ 32 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 16 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 4 * SIZE(AO), %xmm8 addq $ 4 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $2 + BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm8 unpcklps %xmm2, %xmm0 unpckhps %xmm2, %xmm8 movaps %xmm1, %xmm14 unpcklps %xmm3, %xmm1 unpckhps %xmm3, %xmm14 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movaps %xmm8, %xmm3 unpcklps %xmm14, %xmm8 unpckhps %xmm14, %xmm3 movaps 0 * SIZE(B), %xmm1 movaps 4 * SIZE(B), %xmm5 movaps 8 * SIZE(B), %xmm10 movaps 12 * SIZE(B), %xmm11 subps %xmm0, %xmm1 subps %xmm2, %xmm5 subps %xmm8, %xmm10 subps %xmm3, %xmm11 #else movaps 0 * SIZE(AO), %xmm8 movaps 4 * SIZE(AO), %xmm10 movaps 8 * SIZE(AO), %xmm12 movaps 12 * SIZE(AO), %xmm14 subps %xmm0, %xmm8 subps %xmm1, %xmm10 subps %xmm2, %xmm12 subps %xmm3, %xmm14 #endif #ifdef LN movaps 12 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm11 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm11, %xmm8 subps %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0x55, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0x00, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm1 movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm11 movaps 4 * SIZE(AO), %xmm6 pshufd $0x55, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm11 movaps 8 * SIZE(AO), %xmm6 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm8, %xmm10 pshufd $0xff, %xmm6, %xmm8 mulps %xmm10, %xmm8 subps %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, 
%xmm11 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 pshufd $0x55, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm14 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulps %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm14 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm14 movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm14 #endif #ifdef RT movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm14 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm12 pshufd $0x55, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm8 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm2, %xmm12 pshufd $0x55, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm8 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulps %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm8 movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm1, 0 * SIZE(B) movaps %xmm5, 4 * SIZE(B) movaps %xmm10, 8 * SIZE(B) movaps %xmm11, 12 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 pshufd $0xaa, %xmm1, %xmm4 pshufd $0xff, %xmm1, %xmm6 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) movaps %xmm4, 8 * SIZE(BO) movaps %xmm6, 12 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 pshufd $0x55, %xmm5, %xmm3 pshufd $0xaa, %xmm5, %xmm4 pshufd $0xff, %xmm5, %xmm6 movaps %xmm2, 16 * SIZE(BO) movaps %xmm3, 20 * SIZE(BO) movaps %xmm4, 24 * SIZE(BO) movaps %xmm6, 28 * SIZE(BO) pshufd $0x00, %xmm10, %xmm2 pshufd $0x55, %xmm10, %xmm3 pshufd $0xaa, %xmm10, %xmm4 pshufd $0xff, %xmm10, %xmm6 movaps %xmm2, 32 * SIZE(BO) movaps %xmm3, 36 * SIZE(BO) movaps %xmm4, 40 * SIZE(BO) movaps %xmm6, 44 * SIZE(BO) pshufd $0x00, %xmm11, %xmm2 pshufd $0x55, %xmm11, %xmm3 pshufd $0xaa, %xmm11, %xmm4 pshufd $0xff, %xmm11, %xmm6 movaps %xmm2, 48 * SIZE(BO) movaps %xmm3, 52 * SIZE(BO) movaps %xmm4, 56 * SIZE(BO) movaps %xmm6, 60 * SIZE(BO) #else movaps %xmm8, 0 * SIZE(AO) movaps %xmm10, 4 * SIZE(AO) movaps %xmm12, 8 * SIZE(AO) movaps %xmm14, 12 * SIZE(AO) #endif #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm10, %xmm1 unpckhps %xmm10, %xmm0 movaps %xmm5, %xmm7 unpcklps %xmm11, %xmm5 unpckhps %xmm11, %xmm7 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 movaps %xmm0, %xmm11 unpcklps %xmm7, %xmm0 unpckhps %xmm7, %xmm11 movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movhps %xmm0, 2 * SIZE(CO1, LDC, 2) movlps %xmm11, 0 * SIZE(CO2, LDC, 2) movhps %xmm11, 2 * SIZE(CO2, LDC, 2) #else movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) movlps %xmm12, 0 * SIZE(CO1, LDC, 2) movhps %xmm12, 2 * SIZE(CO1, LDC, 2) movlps %xmm14, 0 * SIZE(CO2, LDC, 2) movhps %xmm14, 2 * SIZE(CO2, LDC, 2) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) 
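/* End-of-block bookkeeping for the 4-row tail: advance AO (and B in the LT case) past the
   panel just processed and adjust KK by the block size (4 here) as required by the
   LN/LT/RT variants. */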
movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO #ifdef LT addq $16 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: testq $2, M je .L40 #ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif #ifdef movsd xorps %xmm8, %xmm8 #endif movaps 0 * SIZE(AO), %xmm8 #ifdef movsd xorps %xmm10, %xmm10 #endif movaps 8 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L35 ALIGN_4 .L32: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movaps 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd 4 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movaps 80 * SIZE(BO), %xmm11 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm2 movaps 44 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 movsd 6 * SIZE(AO), %xmm8 addps %xmm13, %xmm3 movaps 96 * SIZE(BO), %xmm13 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm2 movaps 60 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 movsd 16 * SIZE(AO), %xmm8 addps %xmm15, %xmm3 movaps 112 * SIZE(BO), %xmm15 mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movaps 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movaps 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm2 movaps 76 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movsd 10 * SIZE(AO), %xmm10 addps %xmm9, %xmm3 movaps 128 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movaps 84 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movaps 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm2 movaps 92 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd 12 * SIZE(AO), %xmm10 addps %xmm11, %xmm3 movaps 144 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movaps 104 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movaps 108 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd 14 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movaps 160 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movaps 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movaps 124 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd 24 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 
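/* .L32: K loop for the 2-row tail, unrolled eight deep. A is read two floats per step
   (movsd; the "#ifdef movsd" guards zero the destination first when the wide load is
   assembled as a 64-bit move on some targets), against the four splatted B columns,
   accumulating into %xmm0-%xmm3. */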
movaps 176 * SIZE(BO), %xmm15 addq $ 16 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 16 * SIZE(BO), %xmm9 addq $ 2 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L36 ALIGN_4 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $1 + BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) unpcklps %xmm2, %xmm0 unpcklps %xmm3, %xmm1 movaps %xmm0, %xmm2 unpcklps %xmm1, %xmm0 unpckhps %xmm1, %xmm2 movapd 0 * SIZE(B), %xmm1 movapd 4 * SIZE(B), %xmm5 subps %xmm0, %xmm1 subps %xmm2, %xmm5 #else #ifdef movsd xorps %xmm8, %xmm8 #endif movsd 0 * SIZE(AO), %xmm8 #ifdef movsd xorps %xmm10, %xmm10 #endif movsd 2 * SIZE(AO), %xmm10 #ifdef movsd xorps %xmm12, %xmm12 #endif movsd 4 * SIZE(AO), %xmm12 #ifdef movsd xorps %xmm14, %xmm14 #endif movsd 6 * SIZE(AO), %xmm14 subps %xmm0, %xmm8 subps %xmm1, %xmm10 subps %xmm2, %xmm12 subps %xmm3, %xmm14 #endif #ifdef LN movaps 0 * SIZE(AO), %xmm6 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm5 pshufd $0xaa, %xmm6, %xmm8 mulps %xmm5, %xmm8 subps %xmm8, %xmm1 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 pshufd $0x55, %xmm6, %xmm8 mulps %xmm1, %xmm8 subps %xmm8, %xmm5 pshufd $0xff, %xmm6, %xmm8 mulps %xmm8, %xmm5 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 pshufd $0x55, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulps %xmm8, %xmm2 subps %xmm2, %xmm14 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulps %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm14 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm14 movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm14 #endif #ifdef RT movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulps %xmm2, %xmm14 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm12 pshufd $0x55, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulps %xmm14, %xmm2 subps %xmm2, %xmm8 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulps %xmm2, %xmm12 pshufd $0x55, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulps %xmm12, %xmm2 subps %xmm2, %xmm8 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulps %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulps %xmm10, %xmm2 subps %xmm2, %xmm8 movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulps %xmm2, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm1, 0 * SIZE(B) movaps %xmm5, 4 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 pshufd $0xaa, %xmm1, %xmm4 pshufd $0xff, %xmm1, %xmm6 movaps %xmm2, 0 * 
SIZE(BO) movaps %xmm3, 4 * SIZE(BO) movaps %xmm4, 8 * SIZE(BO) movaps %xmm6, 12 * SIZE(BO) pshufd $0x00, %xmm5, %xmm2 pshufd $0x55, %xmm5, %xmm3 pshufd $0xaa, %xmm5, %xmm4 pshufd $0xff, %xmm5, %xmm6 movaps %xmm2, 16 * SIZE(BO) movaps %xmm3, 20 * SIZE(BO) movaps %xmm4, 24 * SIZE(BO) movaps %xmm6, 28 * SIZE(BO) #else movlps %xmm8, 0 * SIZE(AO) movlps %xmm10, 2 * SIZE(AO) movlps %xmm12, 4 * SIZE(AO) movlps %xmm14, 6 * SIZE(AO) #endif #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm10, %xmm1 unpckhps %xmm10, %xmm0 movaps %xmm5, %xmm7 unpcklps %xmm11, %xmm5 unpckhps %xmm11, %xmm7 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 movaps %xmm0, %xmm11 unpcklps %xmm7, %xmm0 unpckhps %xmm7, %xmm11 movlps %xmm1, 0 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO2) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movlps %xmm11, 0 * SIZE(CO2, LDC, 2) #else movlps %xmm8, 0 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO2) movlps %xmm12, 0 * SIZE(CO1, LDC, 2) movlps %xmm14, 0 * SIZE(CO2, LDC, 2) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L40: testq $1, M je .L49 #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO leaq (AO, %rax, SIZE), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 movss 32 * SIZE(BO), %xmm13 movss 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L45 ALIGN_4 .L42: mulss %xmm8, %xmm9 addss %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 4 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 addss %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 addss %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addss %xmm9, %xmm3 movss 64 * SIZE(BO), %xmm9 mulss %xmm8, %xmm11 addss %xmm11, %xmm0 movss 20 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 addss %xmm11, %xmm1 movss 24 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 addss %xmm11, %xmm2 movss 28 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 movss 2 * SIZE(AO), %xmm8 addss %xmm11, %xmm3 movss 80 * SIZE(BO), %xmm11 mulss %xmm8, %xmm13 addss %xmm13, %xmm0 movss 36 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 addss %xmm13, %xmm1 movss 40 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 addss %xmm13, %xmm2 movss 44 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 movss 3 * SIZE(AO), %xmm8 addss %xmm13, %xmm3 movss 96 * SIZE(BO), %xmm13 mulss %xmm8, %xmm15 addss %xmm15, %xmm0 movss 52 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 addss %xmm15, %xmm1 movss 56 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 addss %xmm15, %xmm2 movss 60 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 movss 8 * SIZE(AO), %xmm8 addss %xmm15, %xmm3 movss 112 * SIZE(BO), %xmm15 mulss %xmm10, %xmm9 addss %xmm9, %xmm0 movss 68 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 addss %xmm9, %xmm1 movss 72 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 addss %xmm9, %xmm2 movss 76 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 movss 5 * SIZE(AO), 
%xmm10 addss %xmm9, %xmm3 movss 128 * SIZE(BO), %xmm9 mulss %xmm10, %xmm11 addss %xmm11, %xmm0 movss 84 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 addss %xmm11, %xmm1 movss 88 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 addss %xmm11, %xmm2 movss 92 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 movss 6 * SIZE(AO), %xmm10 addss %xmm11, %xmm3 movss 144 * SIZE(BO), %xmm11 mulss %xmm10, %xmm13 addss %xmm13, %xmm0 movss 100 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 addss %xmm13, %xmm1 movss 104 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 addss %xmm13, %xmm2 movss 108 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 movss 7 * SIZE(AO), %xmm10 addss %xmm13, %xmm3 movss 160 * SIZE(BO), %xmm13 mulss %xmm10, %xmm15 addss %xmm15, %xmm0 movss 116 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 addss %xmm15, %xmm1 movss 120 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 addss %xmm15, %xmm2 movss 124 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 movss 12 * SIZE(AO), %xmm10 addss %xmm15, %xmm3 movss 176 * SIZE(BO), %xmm15 addq $ 8 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L42 ALIGN_4 .L45: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L48 ALIGN_4 .L46: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movss 16 * SIZE(BO), %xmm9 addq $ 1 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L46 ALIGN_4 .L48: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) unpcklps %xmm2, %xmm0 unpcklps %xmm3, %xmm1 unpcklps %xmm1, %xmm0 movapd 0 * SIZE(B), %xmm1 subps %xmm0, %xmm1 #else movss 0 * SIZE(AO), %xmm8 movss 1 * SIZE(AO), %xmm10 movss 2 * SIZE(AO), %xmm12 movss 3 * SIZE(AO), %xmm14 subss %xmm0, %xmm8 subss %xmm1, %xmm10 subss %xmm2, %xmm12 subss %xmm3, %xmm14 #endif #if defined(LN) || defined(LT) movss 0 * SIZE(AO), %xmm6 pshufd $0x00, %xmm6, %xmm8 mulps %xmm8, %xmm1 #endif #ifdef RN movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulss %xmm2, %xmm8 pshufd $0x55, %xmm0, %xmm2 mulss %xmm8, %xmm2 subss %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulss %xmm8, %xmm2 subss %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulss %xmm8, %xmm2 subss %xmm2, %xmm14 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulss %xmm2, %xmm10 pshufd $0xaa, %xmm0, %xmm2 mulss %xmm10, %xmm2 subss %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulss %xmm10, %xmm2 subss %xmm2, %xmm14 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulss %xmm2, %xmm12 pshufd $0xff, %xmm0, %xmm2 mulss %xmm12, %xmm2 subss %xmm2, %xmm14 movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulss %xmm2, %xmm14 #endif #ifdef RT movaps 12 * SIZE(B), %xmm0 pshufd $0xff, %xmm0, %xmm2 mulss %xmm2, %xmm14 pshufd $0xaa, %xmm0, %xmm2 mulss %xmm14, %xmm2 subss %xmm2, %xmm12 pshufd $0x55, %xmm0, %xmm2 mulss %xmm14, %xmm2 subss %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulss %xmm14, %xmm2 subss %xmm2, %xmm8 movaps 8 * SIZE(B), %xmm0 pshufd $0xaa, %xmm0, %xmm2 mulss %xmm2, %xmm12 pshufd $0x55, %xmm0, %xmm2 mulss %xmm12, %xmm2 subss %xmm2, %xmm10 pshufd $0x00, %xmm0, %xmm2 mulss %xmm12, %xmm2 subss %xmm2, %xmm8 movaps 4 * SIZE(B), %xmm0 pshufd $0x55, %xmm0, %xmm2 mulss %xmm2, 
%xmm10 pshufd $0x00, %xmm0, %xmm2 mulss %xmm10, %xmm2 subss %xmm2, %xmm8 movaps 0 * SIZE(B), %xmm0 pshufd $0x00, %xmm0, %xmm2 mulss %xmm2, %xmm8 #endif #ifdef LN subq $1 * SIZE, CO1 subq $1 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm1, 0 * SIZE(B) pshufd $0x00, %xmm1, %xmm2 pshufd $0x55, %xmm1, %xmm3 pshufd $0xaa, %xmm1, %xmm4 pshufd $0xff, %xmm1, %xmm6 movaps %xmm2, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) movaps %xmm4, 8 * SIZE(BO) movaps %xmm6, 12 * SIZE(BO) #else movss %xmm8, 0 * SIZE(AO) movss %xmm10, 1 * SIZE(AO) movss %xmm12, 2 * SIZE(AO) movss %xmm14, 3 * SIZE(AO) #endif #if defined(LN) || defined(LT) movaps %xmm1, %xmm0 unpcklps %xmm10, %xmm1 unpckhps %xmm10, %xmm0 movaps %xmm5, %xmm7 unpcklps %xmm11, %xmm5 unpckhps %xmm11, %xmm7 movaps %xmm1, %xmm10 unpcklps %xmm5, %xmm1 unpckhps %xmm5, %xmm10 movaps %xmm0, %xmm11 unpcklps %xmm7, %xmm0 unpckhps %xmm7, %xmm11 movss %xmm1, 0 * SIZE(CO1) movss %xmm10, 0 * SIZE(CO2) movss %xmm0, 0 * SIZE(CO1, LDC, 2) movss %xmm11, 0 * SIZE(CO2, LDC, 2) #else movss %xmm8, 0 * SIZE(CO1) movss %xmm10, 0 * SIZE(CO2) movss %xmm12, 0 * SIZE(CO1, LDC, 2) movss %xmm14, 0 * SIZE(CO2, LDC, 2) #endif #ifndef LN addq $1 * SIZE, CO1 addq $1 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 1), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $BASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L49: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 4), B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif decq J # j -- jg .L01 .L999: movq %rbx, %rsp EMMS movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/xdot.S000066400000000000000000000133571313527062700164750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define STACK 12 #define ARGS 0 #define RESULT 4 + STACK + ARGS(%esp) #define STACK_N 8 + STACK + ARGS(%esp) #define STACK_X 12 + STACK + ARGS(%esp) #define STACK_INCX 16 + STACK + ARGS(%esp) #define STACK_Y 20 + STACK + ARGS(%esp) #define STACK_INCY 24 + STACK + ARGS(%esp) #include "l1param.h" PROLOGUE pushl %edi pushl %esi pushl %ebx PROFCODE #define N %ebx #define X %esi #define INCX %ecx #define Y %edi #define INCY %edx movl STACK_N, N movl STACK_X, X movl STACK_INCX, INCX movl STACK_Y, Y movl STACK_INCY, INCY testl N, N jle .L88 sall $ZBASE_SHIFT, INCX sall $ZBASE_SHIFT, INCY fldz fldz fldz fldz cmpl $2 * SIZE, INCX jne .L14 cmpl $2 * SIZE, INCY jne .L14 movl N, %eax sarl $1, %eax jle .L15 ALIGN_3 .L16: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(2) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(4) FLD 2 * SIZE(X) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif FLD 2 * SIZE(Y) fmul %st(1) faddp %st, %st(2) FLD 3 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(2) FLD 3 * SIZE(X) FLD 2 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FLD 3 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(4) addl $4 * SIZE, X addl $4 * SIZE, Y decl %eax jg .L16 ALIGN_3 .L15: movl N, %eax andl $1, %eax jle .L27 ALIGN_3 .L22: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(2) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(4) jmp .L27 ALIGN_3 .L14: movl N, %eax sarl $1, %eax jle .L30 ALIGN_3 .L31: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(2) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(4) addl INCX, X FLD 0 * SIZE(X) addl INCY, Y FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(2) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(4) addl INCX, X addl INCY, Y decl %eax jg .L31 ALIGN_3 .L30: movl N, %eax andl $1, %eax jle .L27 ALIGN_3 .L37: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(2) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(4) ALIGN_3 .L27: movl RESULT, %eax #ifndef CONJ fsubp %st, 
%st(3) faddp %st, %st(1) #else faddp %st, %st(3) fsubp %st, %st(1) #endif FST 1 * SIZE(%eax) FST 0 * SIZE(%eax) popl %ebx popl %esi popl %edi ret ALIGN_3 .L88: movl RESULT, %eax fldz fldz FST 1 * SIZE(%eax) FST 0 * SIZE(%eax) popl %ebx popl %esi popl %edi ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/xgemm3m_kernel_2x2.S000066400000000000000000000350251313527062700211230ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 #define N ARG2 #define K ARG3 #define A ARG4 #define B ARG5 #define C ARG6 #define LDC %r10 #define I %r12 #define J %r13 #define AO %r14 #define BO %r15 #define CO %rbp #define KK %r11 #define KKK 48(%rsp) #define STACKSIZE 64 #define ALPHA_R 8 + STACKSIZE(%rsp) #define ALPHA_I 24 + STACKSIZE(%rsp) #define OFFSET 48 + STACKSIZE(%rsp) #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #else #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #define PREFETCHSIZE (5 + 4 * 10) #if defined(OS_LINUX) && defined(CORE_BARCELONA) .align 32768 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) movq 40 + STACKSIZE(%rsp), LDC #if defined(TRMMKERNEL) && !defined(LEFT) movq OFFSET, %rax negq %rax movq %rax, KK #endif addq $8 * SIZE, A addq $8 * SIZE, B salq $ZBASE_SHIFT, LDC movq N, %rax sarq $1, %rax movq %rax, J je .L30 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO movq C, CO leaq (, LDC, 2), %rax addq %rax, C movq M, I sarq $1, I je .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif fldz fldz fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) prefetchw 2 * SIZE(CO, LDC, 1) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) prefetchnta 2 * SIZE(CO, LDC, 1) #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(3) faddp %st, %st(3) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -5 * SIZE(BO) fmul %st, %st(2) FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(3) faddp %st, %st(3) PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) FLD -4 * SIZE(AO) FLD -4 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -3 * SIZE(BO) fmul %st, %st(2) FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(3) faddp %st, %st(3) FLD -2 * SIZE(AO) FLD -2 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -1 * SIZE(BO) fmul %st, %st(2) FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(3) faddp %st, %st(3) addq $8 * SIZE,AO addq $8 * SIZE,BO decq %rax jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif and $3, %rax je .L18 ALIGN_4 .L16: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st faddp %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) faddp %st, %st(6) faddp %st, %st(3) faddp %st, %st(3) addq $2 * SIZE,AO addq $2 * SIZE,BO decq %rax jne .L16 ALIGN_4 .L18: #ifndef TRMMKERNEL FLD ALPHA_I FLD ALPHA_R fld %st(2) fmul %st(1), %st FLD 0 * SIZE(CO) faddp %st, 
%st(1) FST 0 * SIZE(CO) fld %st(3) fmul %st(1), %st FLD 2 * SIZE(CO) faddp %st, %st(1) FST 2 * SIZE(CO) fld %st(4) fmul %st(1), %st FLD 0 * SIZE(CO, LDC) faddp %st, %st(1) FST 0 * SIZE(CO, LDC) fmul %st(5), %st FLD 2 * SIZE(CO, LDC) faddp %st, %st(1) FST 2 * SIZE(CO, LDC) fmul %st, %st(1) fmul %st, %st(2) fmul %st, %st(3) fmulp %st, %st(4) FLD 1 * SIZE(CO) faddp %st, %st(1) FST 1 * SIZE(CO) FLD 3 * SIZE(CO) faddp %st, %st(1) FST 3 * SIZE(CO) FLD 1 * SIZE(CO, LDC) faddp %st, %st(1) FST 1 * SIZE(CO, LDC) FLD 3 * SIZE(CO, LDC) faddp %st, %st(1) FST 3 * SIZE(CO, LDC) #else FST 0 * SIZE(CO) FST 1 * SIZE(CO) FST 0 * SIZE(CO, LDC) FST 1 * SIZE(CO, LDC) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO decq I jne .L11 ALIGN_4 .L20: movq M, %rax andq $1, %rax je .L29 ALIGN_4 .L21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq ( B, %rax, 2), BO #endif fldz fldz #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -7 * SIZE(AO) FLD -6 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -5 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -6 * SIZE(AO) FLD -4 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -3 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) FLD -5 * SIZE(AO) FLD -2 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -1 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) addq $4 * SIZE,AO addq $8 * SIZE,BO decq %rax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif and $3, %rax je .L28 ALIGN_4 .L26: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(2) addq $1 * SIZE,AO addq $2 * SIZE,BO decq %rax jne .L26 ALIGN_4 .L28: #ifndef TRMMKERNEL FLD ALPHA_I FLD ALPHA_R fld %st(2) fmul %st(1), %st FLD 0 * SIZE(CO) faddp %st, %st(1) FST 0 * SIZE(CO) fmul %st(3), %st FLD 0 * SIZE(CO, LDC) faddp %st, %st(1) FST 0 * SIZE(CO, LDC) fmul %st, %st(1) fmulp %st, %st(2) FLD 1 * SIZE(CO) faddp %st, %st(1) FST 1 * SIZE(CO) FLD 1 * SIZE(CO, LDC) faddp %st, %st(1) FST 1 * SIZE(CO, LDC) #else FST 0 * SIZE(CO) FST 0 * SIZE(CO, LDC) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO ALIGN_4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif movq BO, B decq J jne .L01 ALIGN_4 .L30: movq N, %rax testq $1, %rax je .L999 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO movq C, CO addq LDC, C movq M, I sarq $1, I je .L40 ALIGN_4 .L31: #if !defined(TRMMKERNEL) || 
\ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq ( B, %rax, 1), BO #endif fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(BO) FLD -8 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -7 * SIZE(BO) FLD -6 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -5 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -6 * SIZE(BO) FLD -4 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -3 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) FLD -5 * SIZE(BO) FLD -2 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -1 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) addq $8 * SIZE,AO addq $4 * SIZE,BO decq %rax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif and $3, %rax je .L38 ALIGN_4 .L36: FLD -8 * SIZE(BO) FLD -8 * SIZE(AO) fmul %st(1), %st faddp %st, %st(2) FLD -7 * SIZE(AO) fmulp %st, %st(1) faddp %st, %st(2) addq $2 * SIZE,AO addq $1 * SIZE,BO decq %rax jne .L36 ALIGN_4 .L38: #ifndef TRMMKERNEL FLD ALPHA_I FLD ALPHA_R fld %st(2) fmul %st(1), %st FLD 0 * SIZE(CO) faddp %st, %st(1) FST 0 * SIZE(CO) fmul %st(3), %st FLD 2 * SIZE(CO) faddp %st, %st(1) FST 2 * SIZE(CO) fmul %st, %st(1) fmulp %st, %st(2) FLD 1 * SIZE(CO) faddp %st, %st(1) FST 1 * SIZE(CO) FLD 3 * SIZE(CO) faddp %st, %st(1) FST 3 * SIZE(CO) #else FST 0 * SIZE(CO) FST 1 * SIZE(CO) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO decq I jne .L31 ALIGN_4 .L40: movq M, %rax andq $1, %rax je .L49 ALIGN_4 .L41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq ( B, %rax, 1), BO #endif fldz #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L45 ALIGN_4 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -7 * SIZE(AO) FLD -7 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) FLD -5 * SIZE(AO) FLD -5 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) addq $4 * SIZE,AO addq $4 * SIZE,BO decq %rax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif and $3, %rax je .L48 ALIGN_4 .L46: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fmulp %st, %st(1) faddp %st, %st(1) addq $1 * SIZE,AO addq $1 * SIZE,BO decq %rax jne .L46 ALIGN_4 .L48: #ifndef TRMMKERNEL FLD ALPHA_I FLD ALPHA_R fmul %st(2), %st FLD 0 * SIZE(CO) 
faddp %st, %st(1) FST 0 * SIZE(CO) fmulp %st, %st(1) FLD 1 * SIZE(CO) faddp %st, %st(1) FST 1 * SIZE(CO) #else FST 0 * SIZE(CO) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addq $1, KK #endif movq BO, B ALIGN_4 .L999: EMMS movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/xgemm_kernel_1x1.S000066400000000000000000000170321313527062700206570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
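This file, xgemm_kernel_1x1.S, is an x87 GEMM microkernel for the extended-precision complex ("x") type, unrolled 1x1: four x87 accumulators hold the partial products of a single C element, the ADD1..ADD4 macros pick the sign pattern for the NN/NT/.../CC conjugation variants, and the TRMMKERNEL/LEFT/TRANSA paths reuse the same loops for triangular multiply. A minimal C model of the per-element update, assuming the usual packed-buffer GEMM semantics (the helper name is illustrative, not part of the source):

#include <complex.h>

/* Illustrative scalar model (not part of the build) of the 1x1 micro-update:
 * one C element accumulates a complex dot product of a packed A row and a
 * packed B column, then is scaled by alpha and added into memory. */
static void xgemm_1x1_model(long k,
                            const long double complex *a, /* packed, unit stride */
                            const long double complex *b, /* packed, unit stride */
                            long double complex alpha,
                            long double complex *c)
{
    long double complex acc = 0;
    for (long p = 0; p < k; p++)
        acc += a[p] * b[p];   /* ADD1..ADD4 flip signs here for the conjugated variants */
    *c += alpha * acc;
}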
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 #define N ARG2 #define K ARG3 #define A ARG4 #define B ARG5 #define C ARG6 #define LDC %r10 #define I %r12 #define J %r13 #define AO %r14 #define BO %r15 #define CO %rbp #define STACKSIZE 64 #define ALPHA_R 8 + STACKSIZE(%rsp) #define ALPHA_I 24 + STACKSIZE(%rsp) #define OFFSET 48 + STACKSIZE(%rsp) #define KK %r11 #define KKK 48(%rsp) #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #else #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #define PREFETCHSIZE (5 + 4 * 10) #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADD1 faddp #define ADD2 fsubrp #define ADD3 faddp #define ADD4 faddp #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADD1 faddp #define ADD2 faddp #define ADD3 fsubrp #define ADD4 faddp #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define ADD1 faddp #define ADD2 faddp #define ADD3 faddp #define ADD4 fsubrp #else #define ADD1 faddp #define ADD2 fsubrp #define ADD3 fsubrp #define ADD4 fsubrp #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) movq 40 + STACKSIZE(%rsp), LDC #if defined(TRMMKERNEL) && !defined(LEFT) movq OFFSET, %rax negq %rax movq %rax, KK #endif addq $8 * SIZE, A addq $8 * SIZE, B salq $ZBASE_SHIFT, LDC cmpq $0, M jle .L999 movq N, %rax movq %rax, J testq %rax, %rax jle .L999 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO movq C, CO addq LDC, C movq M, I ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif fldz fldz fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st ADD1 %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) ADD2 %st, %st(6) ADD3 %st, %st(3) ADD4 %st, %st(3) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fld %st(1) fmul %st(1), %st ADD1 %st, %st(3) FLD -5 * SIZE(BO) fmul %st, %st(2) FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) ADD2 %st, %st(6) ADD3 %st, %st(3) ADD4 %st, %st(3) PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) FLD -4 * SIZE(AO) FLD -4 * SIZE(BO) fld %st(1) fmul %st(1), %st ADD1 %st, %st(3) FLD -3 * SIZE(BO) fmul %st, %st(2) FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) ADD2 %st, %st(6) ADD3 %st, %st(3) ADD4 %st, %st(3) FLD -2 * SIZE(AO) FLD -2 * SIZE(BO) fld %st(1) fmul %st(1), %st ADD1 %st, %st(3) FLD -1 * SIZE(BO) fmul %st, %st(2) FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) ADD2 %st, %st(6) ADD3 %st, %st(3) ADD4 %st, %st(3) addq $8 * SIZE,AO addq $8 * SIZE,BO decq %rax jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif and $3, %rax je .L18 ALIGN_4 .L16: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) 
fmul %st(1), %st ADD1 %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) ADD2 %st, %st(6) ADD3 %st, %st(3) ADD4 %st, %st(3) addq $2 * SIZE,AO addq $2 * SIZE,BO decq %rax jne .L16 ALIGN_4 .L18: faddp %st, %st(3) faddp %st, %st(1) #ifndef TRMMKERNEL FLD ALPHA_R fld %st fmul %st(2), %st fxch %st(1) fmul %st(3), %st FLD ALPHA_I fmul %st, %st(3) fmulp %st, %st(4) fsubp %st, %st(2) faddp %st, %st(2) FLD 0 * SIZE(CO) faddp %st, %st(1) FST 0 * SIZE(CO) FLD 1 * SIZE(CO) faddp %st, %st(1) FST 1 * SIZE(CO) #else FST 1 * SIZE(CO) FST 0 * SIZE(CO) #endif #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $2 * SIZE, CO decq I jne .L11 #if defined(TRMMKERNEL) && !defined(LEFT) addq $1, KK #endif movq BO, B decq J jne .L01 ALIGN_4 .L999: EMMS movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/xgemv_n.S000066400000000000000000000165131313527062700171570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
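This file, xgemv_n.S, is the non-transposed complex GEMV kernel for the same extended-precision type: it walks A in column panels of P (32) columns, copies a strided x into the BUFFER scratch area so the inner loop sees unit stride, and accumulates y += alpha*A*x, with CONJ/XCONJ selecting the conjugation signs. A rough C model, assuming column-major A with leading dimension lda and ignoring the panel/buffer machinery (illustrative only):

#include <complex.h>

static void xgemv_n_model(long m, long n, long double complex alpha,
                          const long double complex *a, long lda,
                          const long double complex *x,
                          long double complex *y)
{
    for (long i = 0; i < m; i++) {
        long double complex t = 0;
        for (long j = 0; j < n; j++)
            t += a[i + j * lda] * x[j];  /* CONJ/XCONJ builds conjugate one factor here */
        y[i] += alpha * t;
    }
}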
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "l2param.h" #define P 32 #define STACKSIZE 80 #define ALPHA_R 8 + STACKSIZE(%rsp) #define ALPHA_I 24 + STACKSIZE(%rsp) #define OLD_INCX 40 + STACKSIZE(%rsp) #define OLD_Y 48 + STACKSIZE(%rsp) #define OLD_INCY 56 + STACKSIZE(%rsp) #define BUFFER 64 + STACKSIZE(%rsp) #define PLDA_M 56 (%rsp) #define IS 64 (%rsp) #define M %rdi #define N %rsi #define A %rcx #define LDA %r8 #define X %r9 #define INCX %rdx #define Y %rbp #define INCY %r10 #define TEMP %rax #define I %rax #define J %r11 #define A1 %r12 #define X1 %r13 #define Y1 %r14 #define XP %r15 #define MIN_N %rbx PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) movq OLD_INCX, INCX movq OLD_Y, Y movq OLD_INCY, INCY FLD ALPHA_I FLD ALPHA_R salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY movq $0, IS test M, M jle .L79 test N, N jle .L79 movq LDA, %rax imulq $P, %rax # P * lda subq M ,%rax # P * lda - m salq $ZBASE_SHIFT, %rax movq %rax, PLDA_M salq $ZBASE_SHIFT, LDA ALIGN_2 .L32: movq $P, %rax movq N, MIN_N subq IS, MIN_N cmpq %rax, MIN_N cmovg %rax, MIN_N movq IS, XP salq $ZBASE_SHIFT, XP leaq (X,XP, 1), XP cmpq $2 * SIZE, INCX je .L34 movq BUFFER, XP movq XP, X1 movq MIN_N, I sarq $1, I jle .L35 ALIGN_2 .L36: FLD 0 * SIZE(X) FLD 1 * SIZE(X) addq INCX,X # x += incx FLD 0 * SIZE(X) FLD 1 * SIZE(X) addq INCX,X # x += incx FST 3 * SIZE(X1) FST 2 * SIZE(X1) FST 1 * SIZE(X1) FST 0 * SIZE(X1) addq $4 * SIZE, X1 # xp += 4 decq I jg .L36 ALIGN_3 .L35: movq MIN_N, I andq $1, I jle .L34 FLD 0 * SIZE(X) FLD 1 * SIZE(X) addq INCX,X # x += incx FST 1 * SIZE(X1) FST 0 * SIZE(X1) ALIGN_3 /* Main Routine */ .L34: movq Y, Y1 # c_offset movq M, J # j = m ALIGN_3 .L61: movq A, A1 # a_offset = a addq $2 * SIZE, A # a++ fldz fldz fldz fldz movq XP, X1 FLD (X1) # bt1 = *(b_offset + 0) movq MIN_N, I sarq $1, I jle .L64 ALIGN_3 .L65: FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) #ifndef CONJ faddp %st, %st(2) # ct2 += bt1 #else fsubrp %st, %st(2) # ct2 -= bt1 #endif FLD 1 * SIZE(X1) # bt1 = *(b_offset + 1) FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) faddp %st, %st(4) # ct4 += bt1 FLD 2 * SIZE(X1) # bt1 = *(b_offset + 2) addq $2 * SIZE, X1 # b_offset += 2 addq LDA, A1 # a_offset += lda FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) #ifndef CONJ faddp %st, %st(2) # ct2 += bt1 #else fsubrp %st, %st(2) # ct2 -= bt1 #endif FLD 1 * SIZE(X1) # bt1 = *(b_offset + 1) FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) faddp %st, %st(4) # ct4 += bt1 FLD 2 * SIZE(X1) # bt1 = *(b_offset + 2) addq $2 * SIZE, X1 # b_offset += 2 addq LDA, A1 # a_offset += lda decq I jg .L65 .L64: movq MIN_N, I andq $1, I jle .L70 ALIGN_2 .L71: FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) #ifndef CONJ faddp %st, %st(2) # ct2 += bt1 #else fsubrp %st, %st(2) # ct2 -= bt1 #endif FLD 1 * SIZE(X1) # bt1 = *(b_offset + 1) FLD 0 * 
SIZE(A1) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) faddp %st, %st(4) # ct4 += bt1 fldz ALIGN_2 .L70: ffreep %st(0) #ifndef XCONJ #ifndef CONJ fsubp %st, %st(3) faddp %st, %st(1) #else faddp %st, %st(3) faddp %st, %st(1) #endif #else #ifndef CONJ faddp %st, %st(3) fsubp %st, %st(1) #else fsubp %st, %st(3) fsubp %st, %st(1) #endif #endif fld %st(0) # ct4 = ct2 fmul %st(4) fld %st(2) fmul %st(4) fsubp %st, %st(1) FLD 0 * SIZE(Y1) faddp %st, %st(1) FST 0 * SIZE(Y1) fmul %st(2) fxch %st(1) fmul %st(3) faddp %st, %st(1) FLD 1 * SIZE(Y1) faddp %st, %st(1) FST 1 * SIZE(Y1) addq INCY, Y1 decq J jg .L61 .L60: addq PLDA_M, A addq $P, IS cmpq N, IS jl .L32 .L79: ffreep %st ffreep %st movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/xgemv_t.S000066400000000000000000000164601313527062700171660ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
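This file, xgemv_t.S, is the transposed counterpart: for each column j it forms the dot product of that column with x, so y[j] += alpha * sum_i A[i + j*lda] * x[i], processing x in panels of P (4096) rows. Like the non-transposed kernel it first packs a strided x into contiguous scratch storage; a small sketch of that packing step, which is what the .L36/.L35 copy loops do (illustrative helper, not in the source):

#include <complex.h>

/* Copy a vector with arbitrary complex stride into a unit-stride buffer
 * so the main reduction loop can use contiguous loads. */
static void pack_x(long n, const long double complex *x, long incx,
                   long double complex *buffer)
{
    for (long i = 0; i < n; i++)
        buffer[i] = x[i * incx];
}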
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "l2param.h" #define STACKSIZE 80 #define P 4096 #define ALPHA_R 8 + STACKSIZE(%rsp) #define ALPHA_I 24 + STACKSIZE(%rsp) #define OLD_INCX 40 + STACKSIZE(%rsp) #define OLD_Y 48 + STACKSIZE(%rsp) #define OLD_INCY 56 + STACKSIZE(%rsp) #define BUFFER 64 + STACKSIZE(%rsp) #define NLDA 56 (%rsp) #define IS 64 (%rsp) #define M %rdi #define N %rsi #define A %rcx #define LDA %r8 #define X %r9 #define INCX %rdx #define Y %rbp #define INCY %r10 #define TEMP %rax #define I %rax #define J %r11 #define A1 %r12 #define XP %r15 #define X1 %r13 #define Y1 %r14 #define MIN_M %rbx PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) movq OLD_INCX, INCX movq OLD_Y, Y movq OLD_INCY, INCY FLD ALPHA_I FLD ALPHA_R salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY movq $0, IS test M, M jle .L79 # goto END test N, N jle .L79 # goto END movq N, %rax imulq LDA, %rax movq $P, NLDA subq %rax, NLDA salq $ZBASE_SHIFT, NLDA salq $ZBASE_SHIFT, LDA ALIGN_2 .L32: movq $P, %rax movq M, MIN_M subq IS , MIN_M cmpq %rax, MIN_M cmovg %rax, MIN_M movq IS, X1 salq $ZBASE_SHIFT, X1 leaq (X,X1, 1), X1 movq X1, XP cmpq $2 * SIZE, INCX je .L34 movq BUFFER, X1 movq X1, XP movq MIN_M, I sarq $1, I jle .L35 ALIGN_3 .L36: FLD 0 * SIZE(X) FLD 1 * SIZE(X) addq INCX,X # x += incx FLD 0 * SIZE(X) FLD 1 * SIZE(X) addq INCX,X # x += incx FST 3 * SIZE(X1) FST 2 * SIZE(X1) FST 1 * SIZE(X1) FST 0 * SIZE(X1) addq $4 * SIZE, X1 # xp += 4 decq I jg .L36 ALIGN_3 .L35: movq MIN_M, I andq $1,I jle .L34 FLD 0 * SIZE(X) FLD 1 * SIZE(X) addq INCX,X # x += incx FST 1 * SIZE(X1) FST 0 * SIZE(X1) ALIGN_3 /* Main Routine */ .L34: movq Y, Y1 # coffset = y movq N, J ALIGN_2 .L61: movq A, A1 # a_offset = a fldz # ct1 = ZERO fldz # ct1 = ZERO addq LDA, A fldz # ct1 = ZERO fldz # ct1 = ZERO movq XP, X1 FLD (X1) # bt1 = *(b_offset + 0) movq MIN_M, I sarq $1, I jle .L64 ALIGN_3 .L65: FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) #ifndef CONJ faddp %st, %st(2) # ct2 += bt1 #else fsubrp %st, %st(2) # ct2 -= bt1 #endif FLD 1 * SIZE(X1) # bt1 = *(b_offset + 1) FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) faddp %st, %st(4) # ct4 += bt1 FLD 2 * SIZE(X1) # bt1 = *(b_offset + 1) FLD 2 * SIZE(A1) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 3 * SIZE(A1) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) #ifndef CONJ faddp %st, %st(2) # ct2 += bt1 #else fsubrp %st, %st(2) # ct2 -= bt1 #endif FLD 3 * SIZE(X1) # bt1 = *(b_offset + 1) FLD 2 * SIZE(A1) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(4) # ct3 += at1 FLD 3 * SIZE(A1) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) faddp %st, %st(4) # ct4 += bt1 FLD 4 * SIZE(X1) # bt1 = *(b_offset + 1) addq $4 * SIZE, X1 addq $4 * SIZE, A1 decq I jg .L65 ALIGN_3 .L64: movq MIN_M, I andq $1, I jle .L70 ALIGN_3 .L71: FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) fmul %st(1) # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) #ifndef CONJ faddp %st, %st(2) # ct2 += bt1 #else fsubrp %st, %st(2) # ct2 -= bt1 #endif FLD 1 * SIZE(X1) # bt1 = *(b_offset + 1) FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) fmul %st(1) # at1 
*= bt1 faddp %st, %st(4) # ct3 += at1 FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) fmulp %st, %st(1) faddp %st, %st(4) # ct4 += bt1 fldz ALIGN_3 .L70: ffreep %st(0) #ifndef XCONJ #ifndef CONJ fsubp %st, %st(3) faddp %st, %st(1) #else faddp %st, %st(3) faddp %st, %st(1) #endif #else #ifndef CONJ faddp %st, %st(3) fsubp %st, %st(1) #else fsubp %st, %st(3) fsubp %st, %st(1) #endif #endif fld %st(0) # ct4 = ct2 fmul %st(4) fld %st(2) fmul %st(4) fsubp %st, %st(1) FLD 0 * SIZE(Y1) faddp %st, %st(1) FST 0 * SIZE(Y1) fmul %st(2) fxch %st(1) fmul %st(3) faddp %st, %st(1) FLD 1 * SIZE(Y1) faddp %st, %st(1) FST 1 * SIZE(Y1) addq INCY, Y1 decq J jg .L61 ALIGN_3 .L60: addq NLDA, A addq $P, IS cmpq M, IS jl .L32 ALIGN_3 .L79: ffreep %st ffreep %st movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/xtrsm_kernel_LT_1x1.S000066400000000000000000000211051313527062700213120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
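This file, xtrsm_kernel_LT_1x1.S, shares its dot-product loops (.L12/.L16) with the GEMM kernel above but finishes each element with a solve step instead of an alpha update: the accumulated product is subtracted from the packed right-hand side and the result is multiplied by the packed diagonal entry, which the packing stage appears to store pre-inverted so no division is needed here; CONJ flips the sign pattern. A tiny C model of that final step (names are illustrative, and the pre-inverted diagonal is an assumption inferred from the fmul-only code path):

#include <complex.h>

/* b_new = (b_old - dot) * inv_diag, assuming the diagonal is stored inverted. */
static long double complex trsm_solve_step(long double complex b_old,
                                           long double complex dot,
                                           long double complex inv_diag)
{
    return (b_old - dot) * inv_diag;
}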
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 #define N ARG2 #define K ARG3 #define A ARG4 #define B ARG5 #define C ARG6 #define LDC %r10 #define I %r12 #define J %r13 #define AO %r14 #define BO %r15 #define CO %rbp #define OFFSET 48 + STACKSIZE(%rsp) #define STACKSIZE 64 #define KK %r11 #define AORIG 48(%rsp) #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #else #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #endif #define PREFETCHSIZE (5 + 4 * 10) #ifndef CONJ #define ADD1 faddp #define ADD2 fsubrp #define ADD3 faddp #define ADD4 faddp #elif defined(LN) || defined(LT) #define ADD1 faddp #define ADD2 faddp #define ADD3 fsubrp #define ADD4 faddp #else #define ADD1 faddp #define ADD2 faddp #define ADD3 faddp #define ADD4 fsubrp #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) movq 40 + STACKSIZE(%rsp), LDC salq $ZBASE_SHIFT, LDC addq $8 * SIZE, A addq $8 * SIZE, B #ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN movq OFFSET, %rax negq %rax movq %rax, KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif cmpq $0, M jle .L999 movq N, %rax movq %rax, J testq %rax, %rax jle .L999 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B #endif #ifdef RT subq LDC, C #endif movq C, CO #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif #ifdef LT movq OFFSET, %rax movq %rax, KK #endif movq M, I ALIGN_4 .L11: #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif fldz fldz fldz fldz #if defined(HAVE_3DNOW) prefetchw 2 * SIZE(CO) #elif defined(HAVE_SSE) prefetchnta 2 * SIZE(CO) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st ADD1 %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) ADD2 %st, %st(6) ADD3 %st, %st(3) ADD4 %st, %st(3) FLD -6 * SIZE(AO) FLD -6 * SIZE(BO) fld %st(1) fmul %st(1), %st ADD1 %st, %st(3) FLD -5 * SIZE(BO) fmul %st, %st(2) FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) ADD2 %st, %st(6) ADD3 %st, %st(3) ADD4 %st, %st(3) PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) FLD -4 * SIZE(AO) FLD -4 * SIZE(BO) fld %st(1) fmul %st(1), %st ADD1 %st, %st(3) FLD -3 * SIZE(BO) fmul %st, %st(2) FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) ADD2 %st, %st(6) ADD3 %st, %st(3) ADD4 %st, %st(3) FLD -2 * SIZE(AO) FLD -2 * SIZE(BO) fld %st(1) fmul %st(1), %st ADD1 %st, %st(3) FLD -1 * SIZE(BO) fmul %st, %st(2) FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) ADD2 %st, %st(6) ADD3 %st, %st(3) ADD4 %st, %st(3) addq $8 * SIZE,AO addq $8 * SIZE,BO decq %rax jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif and $3, %rax je .L18 ALIGN_4 .L16: FLD -8 * SIZE(AO) FLD -8 * SIZE(BO) fld %st(1) fmul %st(1), %st 
ADD1 %st, %st(3) FLD -7 * SIZE(BO) fmul %st, %st(2) FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) ADD2 %st, %st(6) ADD3 %st, %st(3) ADD4 %st, %st(3) addq $2 * SIZE,AO addq $2 * SIZE,BO decq %rax jne .L16 ALIGN_4 .L18: faddp %st, %st(3) faddp %st, %st(1) fxch %st(1) #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(BO) fsubp %st, %st(1) FLD -7 * SIZE(BO) fsubp %st, %st(2) #else FLD -8 * SIZE(AO) fsubp %st, %st(1) FLD -7 * SIZE(AO) fsubp %st, %st(2) #endif #if defined(LN) || defined(LT) FLD -8 * SIZE(AO) fmul %st(1), %st FLD -8 * SIZE(AO) fmul %st(3), %st FLD -7 * SIZE(AO) fmulp %st, %st(3) FLD -7 * SIZE(AO) fmulp %st, %st(4) #endif #if defined(RN) || defined(RT) FLD -8 * SIZE(BO) fmul %st(1), %st FLD -8 * SIZE(BO) fmul %st(3), %st FLD -7 * SIZE(BO) fmulp %st, %st(3) FLD -7 * SIZE(BO) fmulp %st, %st(4) #endif #ifndef CONJ faddp %st, %st(2) fsubp %st, %st(2) #else fsubp %st, %st(2) faddp %st, %st(2) #endif #if defined(LN) || defined(LT) fld %st FST -7 * SIZE(BO) fxch %st(1) fld %st FST -8 * SIZE(BO) #else fld %st FST -7 * SIZE(AO) fxch %st(1) fld %st FST -8 * SIZE(AO) #endif #ifdef LN subq $2 * SIZE, CO #endif FST 0 * SIZE(CO) FST 1 * SIZE(CO) #ifndef LN addq $2 * SIZE, CO #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I jne .L11 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif decq J jne .L01 ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zamax.S000066400000000000000000000113561313527062700166340ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
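This file, zamax.S, reduces a double-complex vector to the largest value of |Re(x_i)| + |Im(x_i)|, the BLAS convention for a complex "absolute value"; building with USE_MIN turns the FMOV macro from fcmovbe into fcmovnbe and yields the minimum instead. A scalar C model of the reduction, assuming the value (not the index) is returned:

#include <math.h>

/* incx2 is the stride counted in doubles (2 for a unit complex stride). */
static double zamax_model(long n, const double *x, long incx2)
{
    double m = fabs(x[0]) + fabs(x[1]);
    for (long i = 1; i < n; i++) {
        double v = fabs(x[i * incx2]) + fabs(x[i * incx2 + 1]);
        if (v > m)          /* USE_MIN builds compare the other way */
            m = v;
    }
    return m;
}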
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 #define X ARG2 #define INCX ARG3 #define I %rax #ifndef USE_MIN #define FMOV fcmovbe #else #define FMOV fcmovnbe #endif #include "l1param.h" PROLOGUE PROFCODE salq $ZBASE_SHIFT, INCX fldz testq M, M jle .L999 testq INCX, INCX jle .L999 ffreep %st FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs faddp %st, %st(1) addq INCX, X decq M jle .L999 cmpq $2 * SIZE, INCX jne .L40 movq M, I sarq $2, I jle .L20 ALIGN_4 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st FLD 2 * SIZE(X) fabs FLD 3 * SIZE(X) fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st FLD 4 * SIZE(X) fabs FLD 5 * SIZE(X) fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st FLD 6 * SIZE(X) fabs FLD 7 * SIZE(X) fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st addq $8 * SIZE, X decq I jg .L10 ALIGN_4 .L20: movq M, I andq $3, I jle .L999 ALIGN_4 .L21: FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st addq $2 * SIZE, X decq I jg .L21 jmp .L999 ALIGN_4 .L40: movq M, I sarq $2, I jle .L60 ALIGN_4 .L50: FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs addq INCX, X faddp %st, %st(1) fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs addq INCX, X faddp %st, %st(1) fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs addq INCX, X faddp %st, %st(1) fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs addq INCX, X faddp %st, %st(1) fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st decq I jg .L50 ALIGN_4 .L60: movq M, I andq $3, I jle .L999 ALIGN_4 .L61: FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) fxch %st(1) ffreep %st addq INCX, X decq I jg .L61 ALIGN_4 .L999: ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zamax_atom.S000066400000000000000000000157251313527062700176600ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax #ifdef USE_MIN #define maxsd minsd #endif #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS xorps %xmm0, %xmm0 salq $ZBASE_SHIFT, INCX testq M, M jle .L999 testq INCX, INCX jle .L999 pcmpeqb %xmm15, %xmm15 psrlq $1, %xmm15 movsd 0 * SIZE(X), %xmm0 movsd 1 * SIZE(X), %xmm4 addq INCX, X andps %xmm15, %xmm0 andps %xmm15, %xmm4 addsd %xmm4, %xmm0 decq M jle .L999 movaps %xmm0, %xmm1 cmpq $2 * SIZE, INCX jne .L20 movq M, I sarq $2, I jle .L15 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 movsd 2 * SIZE(X), %xmm6 movsd 3 * SIZE(X), %xmm7 movsd 4 * SIZE(X), %xmm8 andps %xmm15, %xmm4 movsd 5 * SIZE(X), %xmm9 andps %xmm15, %xmm5 movsd 6 * SIZE(X), %xmm10 addsd %xmm4, %xmm5 movsd 7 * SIZE(X), %xmm11 decq I jle .L13 ALIGN_4 .L12: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif andps %xmm15, %xmm6 movsd 8 * SIZE(X), %xmm4 andps %xmm15, %xmm7 addsd %xmm6, %xmm7 movsd 10 * SIZE(X), %xmm6 maxsd %xmm5, %xmm0 movsd 9 * SIZE(X), %xmm5 andps %xmm15, %xmm8 maxsd %xmm7, %xmm1 movsd 11 * SIZE(X), %xmm7 andps %xmm15, %xmm9 addsd %xmm8, %xmm9 movsd 12 * SIZE(X), %xmm8 andps %xmm15, %xmm10 andps %xmm15, %xmm11 addsd %xmm10, %xmm11 movsd 14 * SIZE(X), %xmm10 maxsd %xmm9, %xmm0 movsd 13 * SIZE(X), %xmm9 andps %xmm15, %xmm4 maxsd %xmm11, %xmm1 movsd 15 * SIZE(X), %xmm11 andps %xmm15, %xmm5 addsd %xmm4, %xmm5 addq $8 * SIZE, X decq I jg .L12 ALIGN_4 .L13: andps %xmm15, %xmm6 andps %xmm15, %xmm7 addsd %xmm6, %xmm7 maxsd %xmm5, %xmm0 andps %xmm15, %xmm8 maxsd %xmm7, %xmm1 andps %xmm15, %xmm9 addsd %xmm8, %xmm9 andps %xmm15, %xmm10 andps %xmm15, %xmm11 addsd %xmm10, %xmm11 maxsd %xmm9, %xmm0 maxsd %xmm11, %xmm1 addq $8 * SIZE, X ALIGN_4 .L15: testq $2, M jle .L17 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 movsd 2 * SIZE(X), %xmm6 movsd 3 * SIZE(X), %xmm7 addq $4 * SIZE, X andps %xmm15, %xmm4 andps %xmm15, %xmm5 addsd %xmm4, %xmm5 andps %xmm15, %xmm6 andps %xmm15, %xmm7 addsd %xmm6, %xmm7 maxsd %xmm5, %xmm0 maxsd %xmm7, %xmm1 ALIGN_3 .L17: testq $1, M jle .L998 movsd 0 * SIZE(X), 
%xmm4 movsd 1 * SIZE(X), %xmm5 andps %xmm15, %xmm4 andps %xmm15, %xmm5 addsd %xmm4, %xmm5 maxsd %xmm5, %xmm0 jmp .L998 ALIGN_3 .L20: movq M, I sarq $2, I jle .L25 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 addq INCX, X movsd 0 * SIZE(X), %xmm6 movsd 1 * SIZE(X), %xmm7 addq INCX, X movsd 0 * SIZE(X), %xmm8 andps %xmm15, %xmm4 movsd 1 * SIZE(X), %xmm9 addq INCX, X andps %xmm15, %xmm5 movsd 0 * SIZE(X), %xmm10 addsd %xmm4, %xmm5 movsd 1 * SIZE(X), %xmm11 addq INCX, X decq I jle .L23 ALIGN_4 .L22: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif andps %xmm15, %xmm6 movsd 0 * SIZE(X), %xmm4 andps %xmm15, %xmm7 addsd %xmm6, %xmm7 maxsd %xmm5, %xmm0 movsd 1 * SIZE(X), %xmm5 andps %xmm15, %xmm8 addq INCX, X maxsd %xmm7, %xmm1 movsd 0 * SIZE(X), %xmm6 andps %xmm15, %xmm9 movsd 1 * SIZE(X), %xmm7 addsd %xmm8, %xmm9 addq INCX, X andps %xmm15, %xmm10 movsd 0 * SIZE(X), %xmm8 andps %xmm15, %xmm11 addsd %xmm10, %xmm11 maxsd %xmm9, %xmm0 movsd 1 * SIZE(X), %xmm9 addq INCX, X andps %xmm15, %xmm4 movsd 0 * SIZE(X), %xmm10 maxsd %xmm11, %xmm1 movsd 1 * SIZE(X), %xmm11 andps %xmm15, %xmm5 addq INCX, X addsd %xmm4, %xmm5 decq I jg .L22 ALIGN_4 .L23: andps %xmm15, %xmm6 andps %xmm15, %xmm7 addsd %xmm6, %xmm7 maxsd %xmm5, %xmm0 andps %xmm15, %xmm8 maxsd %xmm7, %xmm1 andps %xmm15, %xmm9 addsd %xmm8, %xmm9 andps %xmm15, %xmm10 andps %xmm15, %xmm11 addsd %xmm10, %xmm11 maxsd %xmm9, %xmm0 maxsd %xmm11, %xmm1 ALIGN_4 .L25: testq $2, M jle .L27 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 addq INCX, X movsd 0 * SIZE(X), %xmm6 movsd 1 * SIZE(X), %xmm7 addq INCX, X andps %xmm15, %xmm4 andps %xmm15, %xmm5 addsd %xmm4, %xmm5 andps %xmm15, %xmm6 andps %xmm15, %xmm7 addsd %xmm6, %xmm7 maxsd %xmm5, %xmm0 maxsd %xmm7, %xmm1 ALIGN_3 .L27: testq $1, M jle .L998 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 andps %xmm15, %xmm4 andps %xmm15, %xmm5 addsd %xmm4, %xmm5 maxsd %xmm5, %xmm0 ALIGN_3 .L998: maxsd %xmm1, %xmm0 ALIGN_4 .L999: RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zamax_sse.S000066400000000000000000000151611313527062700175040ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
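This file, zamax_sse.S, is the packed single-precision version of the same |re| + |im| reduction: shufps with masks 0x88/0xdd separates real and imaginary lanes, and the absolute value is taken by AND-ing with 0x7FFFFFFF, a mask built on the fly from pcmpeqb plus psrld $1 rather than loaded from memory. The scalar equivalent of that sign-clearing trick (illustrative only):

#include <stdint.h>

/* fabsf without a libm call: clear the IEEE-754 sign bit. */
static float abs_via_mask(float x)
{
    union { float f; uint32_t u; } v;
    v.f = x;
    v.u &= 0x7FFFFFFFu;
    return v.f;
}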
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax #ifdef USE_MIN #define maxps minps #define maxss minss #endif #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS pxor %xmm0, %xmm0 salq $ZBASE_SHIFT, INCX testq M, M jle .L999 pcmpeqb %xmm15, %xmm15 psrld $1, %xmm15 movss 0 * SIZE(X), %xmm0 movss 1 * SIZE(X), %xmm1 addq INCX, X decq M andps %xmm15, %xmm0 andps %xmm15, %xmm1 addps %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 movaps %xmm0, %xmm1 cmpq $2 * SIZE, INCX jne .L40 .L30: movq M, I sarq $3, I jle .L35 ALIGN_4 .L31: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(X), %xmm4 movhps 2 * SIZE(X), %xmm4 movsd 4 * SIZE(X), %xmm5 movhps 6 * SIZE(X), %xmm5 movaps %xmm4, %xmm6 shufps $0x88, %xmm5, %xmm4 shufps $0xdd, %xmm5, %xmm6 andps %xmm15, %xmm4 andps %xmm15, %xmm6 addps %xmm6, %xmm4 maxps %xmm4, %xmm0 movsd 8 * SIZE(X), %xmm7 movhps 10 * SIZE(X), %xmm7 movsd 12 * SIZE(X), %xmm8 movhps 14 * SIZE(X), %xmm8 movaps %xmm7, %xmm9 shufps $0x88, %xmm8, %xmm7 shufps $0xdd, %xmm8, %xmm9 andps %xmm15, %xmm7 andps %xmm15, %xmm9 addps %xmm9, %xmm7 maxps %xmm7, %xmm0 addq $16 * SIZE, X decq I jg .L31 ALIGN_4 .L35: andq $7, M jle .L998 testq $4, M je .L36 movsd 0 * SIZE(X), %xmm4 movhps 2 * SIZE(X), %xmm4 movsd 4 * SIZE(X), %xmm5 movhps 6 * SIZE(X), %xmm5 movaps %xmm4, %xmm6 shufps $0x88, %xmm5, %xmm4 shufps $0xdd, %xmm5, %xmm6 andps %xmm15, %xmm4 andps %xmm15, %xmm6 addps %xmm6, %xmm4 maxps %xmm4, %xmm0 addq $8 * SIZE, X ALIGN_3 .L36: testq $2, M je .L37 movss 0 * SIZE(X), %xmm4 movss 1 * SIZE(X), %xmm5 movss 2 * SIZE(X), %xmm6 movss 3 * SIZE(X), %xmm7 andps %xmm15, %xmm4 andps %xmm15, %xmm5 andps %xmm15, %xmm6 andps %xmm15, %xmm7 addps %xmm5, %xmm4 addps %xmm7, %xmm6 maxss %xmm4, %xmm0 maxss %xmm6, %xmm1 addq $4 * SIZE, X ALIGN_3 .L37: testq $1, M je .L998 movss 0 * SIZE(X), %xmm4 movss 1 * SIZE(X), %xmm5 andps %xmm15, %xmm4 andps %xmm15, %xmm5 addps %xmm5, %xmm4 maxss %xmm4, %xmm0 jmp .L998 ALIGN_4 .L40: movq M, I sarq $3, I jle .L45 ALIGN_4 .L41: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(X), %xmm4 addq INCX, X movhps 0 * SIZE(X), %xmm4 addq INCX, X movsd 0 * SIZE(X), %xmm5 addq INCX, X movhps 0 * SIZE(X), %xmm5 addq INCX, X movaps %xmm4, %xmm6 shufps $0x88, %xmm5, %xmm4 shufps $0xdd, %xmm5, %xmm6 andps %xmm15, %xmm4 andps %xmm15, %xmm6 addps %xmm6, %xmm4 maxps %xmm4, %xmm0 movsd 0 * SIZE(X), %xmm7 addq INCX, X movhps 0 * SIZE(X), %xmm7 addq INCX, X movsd 0 * SIZE(X), %xmm8 addq INCX, X movhps 0 * SIZE(X), %xmm8 addq INCX, X movaps %xmm7, %xmm9 shufps $0x88, 
%xmm8, %xmm7 shufps $0xdd, %xmm8, %xmm9 andps %xmm15, %xmm7 andps %xmm15, %xmm9 addps %xmm9, %xmm7 maxps %xmm7, %xmm0 decq I jg .L41 ALIGN_4 .L45: andq $7, M jle .L998 testq $4, M je .L46 movsd 0 * SIZE(X), %xmm4 addq INCX, X movhps 0 * SIZE(X), %xmm4 addq INCX, X movsd 0 * SIZE(X), %xmm5 addq INCX, X movhps 0 * SIZE(X), %xmm5 addq INCX, X movaps %xmm4, %xmm6 shufps $0x88, %xmm5, %xmm4 shufps $0xdd, %xmm5, %xmm6 andps %xmm15, %xmm4 andps %xmm15, %xmm6 addps %xmm6, %xmm4 maxps %xmm4, %xmm0 ALIGN_3 .L46: testq $2, M je .L47 movss 0 * SIZE(X), %xmm4 movss 1 * SIZE(X), %xmm5 addq INCX, X movss 0 * SIZE(X), %xmm6 movss 1 * SIZE(X), %xmm7 addq INCX, X andps %xmm15, %xmm4 andps %xmm15, %xmm5 andps %xmm15, %xmm6 andps %xmm15, %xmm7 addps %xmm5, %xmm4 addps %xmm7, %xmm6 maxss %xmm4, %xmm0 maxss %xmm6, %xmm1 ALIGN_3 .L47: testq $1, M je .L998 movss 0 * SIZE(X), %xmm4 movss 1 * SIZE(X), %xmm5 andps %xmm15, %xmm4 andps %xmm15, %xmm5 addps %xmm5, %xmm4 maxss %xmm4, %xmm0 jmp .L998 ALIGN_4 .L998: maxps %xmm1, %xmm0 movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 maxps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 maxss %xmm1, %xmm0 ALIGN_4 .L999: RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zamax_sse2.S000066400000000000000000000165321313527062700175710ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
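This file, zamax_sse2.S, performs the double-precision reduction two complex elements at a time with maxpd, using the analogous 0x7FFFFFFFFFFFFFFF sign mask (pcmpeqb plus psrlq $1) and folding its partial maxima with unpckhpd/maxsd at .L998. A compact intrinsics sketch of the same idea, assuming unit stride and an even element count (a model, not the actual kernel):

#include <emmintrin.h>

static double zamax_sse2_model(long n, const double *x)
{
    __m128i ones = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
    __m128d mask = _mm_castsi128_pd(_mm_srli_epi64(ones, 1)); /* clears sign bits */
    __m128d vmax = _mm_setzero_pd();                          /* values are >= 0 */
    for (long i = 0; i < n; i += 2) {
        __m128d c0 = _mm_and_pd(_mm_loadu_pd(x + 2 * i),     mask); /* |re0|,|im0| */
        __m128d c1 = _mm_and_pd(_mm_loadu_pd(x + 2 * i + 2), mask); /* |re1|,|im1| */
        __m128d re = _mm_unpacklo_pd(c0, c1);  /* the kernel gathers lanes with movsd/movhpd */
        __m128d im = _mm_unpackhi_pd(c0, c1);
        vmax = _mm_max_pd(vmax, _mm_add_pd(re, im));
    }
    double out[2];
    _mm_storeu_pd(out, vmax);
    return out[0] > out[1] ? out[0] : out[1];
}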
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax #ifdef USE_MIN #define maxpd minpd #define maxsd minsd #endif #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS pxor %xmm0, %xmm0 testq M, M jle .L999 testq INCX, INCX jle .L999 salq $ZBASE_SHIFT, INCX pcmpeqb %xmm15, %xmm15 psrlq $1, %xmm15 movsd 0 * SIZE(X), %xmm0 movsd 1 * SIZE(X), %xmm1 addq INCX, X decq M andpd %xmm15, %xmm0 andpd %xmm15, %xmm1 addpd %xmm1, %xmm0 unpcklpd %xmm0, %xmm0 movapd %xmm0, %xmm1 movapd %xmm0, %xmm2 movapd %xmm0, %xmm3 cmpq $2 * SIZE, INCX jne .L40 .L30: movq M, I sarq $3, I jle .L35 ALIGN_4 .L31: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 movhpd 2 * SIZE(X), %xmm4 movhpd 3 * SIZE(X), %xmm5 movsd 4 * SIZE(X), %xmm6 movsd 5 * SIZE(X), %xmm7 movhpd 6 * SIZE(X), %xmm6 movhpd 7 * SIZE(X), %xmm7 andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 addpd %xmm5, %xmm4 maxpd %xmm4, %xmm0 andpd %xmm15, %xmm6 andpd %xmm15, %xmm7 addpd %xmm7, %xmm6 maxpd %xmm6, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movsd 8 * SIZE(X), %xmm4 movsd 9 * SIZE(X), %xmm5 movhpd 10 * SIZE(X), %xmm4 movhpd 11 * SIZE(X), %xmm5 movsd 12 * SIZE(X), %xmm6 movsd 13 * SIZE(X), %xmm7 movhpd 14 * SIZE(X), %xmm6 movhpd 15 * SIZE(X), %xmm7 andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 addpd %xmm5, %xmm4 maxpd %xmm4, %xmm2 andpd %xmm15, %xmm6 andpd %xmm15, %xmm7 addpd %xmm7, %xmm6 maxpd %xmm6, %xmm3 addq $16 * SIZE, X decq I jg .L31 ALIGN_4 .L35: andq $7, M jle .L998 testq $4, M je .L36 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 movhpd 2 * SIZE(X), %xmm4 movhpd 3 * SIZE(X), %xmm5 movsd 4 * SIZE(X), %xmm6 movsd 5 * SIZE(X), %xmm7 movhpd 6 * SIZE(X), %xmm6 movhpd 7 * SIZE(X), %xmm7 andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 andpd %xmm15, %xmm6 andpd %xmm15, %xmm7 addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 maxpd %xmm4, %xmm0 maxpd %xmm6, %xmm1 addq $8 * SIZE, X ALIGN_3 .L36: testq $2, M je .L37 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 movhpd 2 * SIZE(X), %xmm4 movhpd 3 * SIZE(X), %xmm5 addq $4 * SIZE, X andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 addpd %xmm5, %xmm4 maxpd %xmm4, %xmm0 ALIGN_3 .L37: testq $1, M je .L998 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 addpd %xmm5, %xmm4 maxsd %xmm4, %xmm2 jmp .L998 ALIGN_4 .L40: movq M, I sarq $3, I jle .L45 ALIGN_4 .L41: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 addq INCX, X movhpd 0 * SIZE(X), %xmm4 movhpd 1 * SIZE(X), %xmm5 addq INCX, X movsd 0 * SIZE(X), %xmm6 movsd 1 * SIZE(X), %xmm7 addq INCX, X movhpd 0 * SIZE(X), %xmm6 movhpd 1 * SIZE(X), %xmm7 addq INCX, X andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 addpd %xmm5, %xmm4 maxpd %xmm4, %xmm0 andpd %xmm15, %xmm6 andpd %xmm15, %xmm7 addpd %xmm7, %xmm6 maxpd %xmm6, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 addq INCX, X movhpd 0 * SIZE(X), %xmm4 movhpd 1 * SIZE(X), %xmm5 addq INCX, X movsd 0 * SIZE(X), %xmm6 movsd 1 * SIZE(X), %xmm7 addq INCX, X movhpd 0 * SIZE(X), %xmm6 movhpd 1 * SIZE(X), %xmm7 addq INCX, X andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 addpd %xmm5, %xmm4 maxpd %xmm4, %xmm2 andpd %xmm15, %xmm6 andpd %xmm15, %xmm7 addpd %xmm7, %xmm6 maxpd %xmm6, %xmm3 decq I jg 
.L41 ALIGN_4 .L45: andq $7, M jle .L998 testq $4, M je .L46 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 addq INCX, X movhpd 0 * SIZE(X), %xmm4 movhpd 1 * SIZE(X), %xmm5 addq INCX, X movsd 0 * SIZE(X), %xmm6 movsd 1 * SIZE(X), %xmm7 addq INCX, X movhpd 0 * SIZE(X), %xmm6 movhpd 1 * SIZE(X), %xmm7 addq INCX, X andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 andpd %xmm15, %xmm6 andpd %xmm15, %xmm7 addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 maxpd %xmm4, %xmm0 maxpd %xmm6, %xmm1 ALIGN_3 .L46: testq $2, M je .L47 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 addq INCX, X movhpd 0 * SIZE(X), %xmm4 movhpd 1 * SIZE(X), %xmm5 addq INCX, X andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 addpd %xmm5, %xmm4 maxpd %xmm4, %xmm2 ALIGN_3 .L47: testq $1, M je .L998 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 andpd %xmm15, %xmm4 andpd %xmm15, %xmm5 addpd %xmm5, %xmm4 maxsd %xmm4, %xmm3 jmp .L998 ALIGN_4 .L998: maxpd %xmm1, %xmm0 maxpd %xmm3, %xmm2 maxpd %xmm2, %xmm0 movapd %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 maxsd %xmm1, %xmm0 ALIGN_4 .L999: RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zasum.S000066400000000000000000000104411313527062700166450ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
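This file, zasum.S, accumulates the sum of |Re(x_i)| + |Im(x_i)| over the vector, keeping four partial sums on the x87 stack (the four fldz at entry) and folding them at .L998. A scalar model, with the stride counted in doubles:

#include <math.h>

static double zasum_model(long n, const double *x, long incx2)
{
    double s = 0.0;
    for (long i = 0; i < n; i++)
        s += fabs(x[i * incx2]) + fabs(x[i * incx2 + 1]);
    return s;
}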
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 #define X ARG2 #define INCX ARG3 #define I %rax #include "l1param.h" PROLOGUE PROFCODE fldz testq M, M jle .L999 testq INCX, INCX jle .L999 salq $ZBASE_SHIFT, INCX fldz fldz fldz cmpq $SIZE * 2, INCX jne .L40 movq M, I sarq $2, I jle .L20 ALIGN_4 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs FLD 2 * SIZE(X) fabs FLD 3 * SIZE(X) fabs faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) FLD 4 * SIZE(X) fabs FLD 5 * SIZE(X) fabs FLD 6 * SIZE(X) fabs FLD 7 * SIZE(X) fabs addq $8 * SIZE, X faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) decq I jg .L10 ALIGN_4 .L20: andq $3, M jle .L998 ALIGN_4 .L21: FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) fabs faddp %st,%st(3) faddp %st,%st(1) addq $2 * SIZE, X decq M jg .L21 jmp .L998 ALIGN_4 .L40: movq M, I sarq $2, I jle .L60 ALIGN_4 .L50: FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) addq INCX, X fabs FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) addq INCX, X fabs faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) addq INCX, X fabs FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) addq INCX, X fabs faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) decq I jg .L50 ALIGN_4 .L60: andq $3, M jle .L998 ALIGN_4 .L61: FLD 0 * SIZE(X) fabs FLD 1 * SIZE(X) addq INCX, X fabs faddp %st,%st(3) faddp %st,%st(1) decq M jg .L61 ALIGN_4 .L998: faddp %st,%st(2) faddp %st,%st(1) faddp %st,%st(1) ALIGN_4 .L999: ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zasum_atom.S000066400000000000000000000207751313527062700177000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
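This file, zasum_atom.S, is the Atom-tuned SSE2 version of the same sum: it doubles the element count up front (addq M, M) so the data can be treated as 2n scalars, peels one value if X is not 16-byte aligned, then runs an unrolled movaps loop with several partial accumulators that are combined at .L998. A sketch of that shape, assuming unit stride and simplified alignment handling (illustrative only):

#include <stdint.h>
#include <math.h>

static double zasum_atom_model(long n, const double *x)
{
    long n2 = 2 * n;                       /* treat complex data as 2n doubles */
    double s0 = 0.0, s1 = 0.0;
    long i = 0;
    if (n2 > 0 && ((uintptr_t)x & 15u) != 0) {  /* peel to reach 16-byte alignment */
        s0 = fabs(x[0]);
        i = 1;
    }
    for (; i + 1 < n2; i += 2) {           /* one aligned 16-byte chunk per step */
        s0 += fabs(x[i]);
        s1 += fabs(x[i + 1]);
    }
    if (i < n2)
        s0 += fabs(x[i]);
    return s0 + s1;
}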
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS xorps %xmm0, %xmm0 testq M, M jle .L999 testq INCX, INCX jle .L999 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 pcmpeqb %xmm15, %xmm15 psrlq $1, %xmm15 salq $ZBASE_SHIFT, INCX xorps %xmm13, %xmm13 cmpq $2 * SIZE, INCX jne .L20 addq M, M testq $SIZE, X je .L05 movsd (X), %xmm0 addq $SIZE, X andps %xmm15, %xmm0 decq M ALIGN_3 .L05: subq $-16 * SIZE, X movq M, I sarq $4, I jle .L12 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -12 * SIZE(X), %xmm6 movaps -10 * SIZE(X), %xmm7 movaps -8 * SIZE(X), %xmm8 movaps -6 * SIZE(X), %xmm9 movaps -4 * SIZE(X), %xmm10 movaps -2 * SIZE(X), %xmm11 decq I jle .L11 ALIGN_4 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif andps %xmm15, %xmm4 addsd %xmm13, %xmm3 pshufd $0x4e, %xmm4, %xmm12 addsd %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 andps %xmm15, %xmm5 addsd %xmm12, %xmm1 pshufd $0x4e, %xmm5, %xmm13 addsd %xmm5, %xmm2 movaps 2 * SIZE(X), %xmm5 andps %xmm15, %xmm6 addsd %xmm13, %xmm3 pshufd $0x4e, %xmm6, %xmm12 addsd %xmm6, %xmm0 movaps 4 * SIZE(X), %xmm6 andps %xmm15, %xmm7 addsd %xmm12, %xmm1 pshufd $0x4e, %xmm7, %xmm13 addsd %xmm7, %xmm2 movaps 6 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif andps %xmm15, %xmm8 addsd %xmm13, %xmm3 pshufd $0x4e, %xmm8, %xmm12 addsd %xmm8, %xmm0 movaps 8 * SIZE(X), %xmm8 andps %xmm15, %xmm9 addsd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm13 addsd %xmm9, %xmm2 movaps 10 * SIZE(X), %xmm9 andps %xmm15, %xmm10 addsd %xmm13, %xmm3 pshufd $0x4e, %xmm10, %xmm12 addsd %xmm10, %xmm0 movaps 12 * SIZE(X), %xmm10 andps %xmm15, %xmm11 addsd %xmm12, %xmm1 pshufd $0x4e, %xmm11, %xmm13 addsd %xmm11, %xmm2 movaps 14 * SIZE(X), %xmm11 subq $-16 * SIZE, X decq I jg .L10 ALIGN_4 .L11: andps %xmm15, %xmm4 addsd %xmm13, %xmm3 pshufd $0x4e, %xmm4, %xmm12 addsd %xmm4, %xmm0 andps %xmm15, %xmm5 addsd %xmm12, %xmm1 pshufd $0x4e, %xmm5, %xmm13 addsd %xmm5, %xmm2 andps %xmm15, %xmm6 addsd %xmm13, %xmm3 pshufd $0x4e, %xmm6, %xmm12 addsd %xmm6, %xmm0 andps %xmm15, %xmm7 addsd %xmm12, %xmm1 pshufd $0x4e, %xmm7, %xmm13 addsd %xmm7, %xmm2 andps %xmm15, %xmm8 addsd %xmm13, %xmm3 pshufd $0x4e, %xmm8, %xmm12 addsd %xmm8, %xmm0 andps %xmm15, %xmm9 addsd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm13 addsd %xmm9, %xmm2 andps %xmm15, %xmm10 addsd %xmm13, %xmm3 pshufd $0x4e, %xmm10, %xmm12 addsd %xmm10, %xmm0 andps %xmm15, %xmm11 addsd %xmm12, %xmm1 pshufd $0x4e, %xmm11, %xmm13 addsd %xmm11, %xmm2 addsd %xmm13, %xmm3 subq $-16 * SIZE, X ALIGN_3 .L12: andq $15, M jle .L998 testq $8, M je .L13 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -12 * SIZE(X), %xmm6 movaps -10 * SIZE(X), %xmm7 addq $8 * SIZE, X andps %xmm15, %xmm4 pshufd $0x4e, %xmm4, %xmm12 addsd %xmm4, %xmm0 andps %xmm15, %xmm5 addsd %xmm12, %xmm1 pshufd $0x4e, %xmm5, %xmm13 addsd %xmm5, %xmm2 addsd %xmm13, %xmm3 andps %xmm15, %xmm6 pshufd $0x4e, %xmm6, %xmm12 addsd %xmm6, %xmm0 andps %xmm15, %xmm7 addsd %xmm12, %xmm1 pshufd $0x4e, %xmm7, %xmm13 addsd %xmm7, %xmm2 addsd 
%xmm13, %xmm3 ALIGN_3 .L13: testq $4, M je .L14 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 addq $4 * SIZE, X andps %xmm15, %xmm4 pshufd $0x4e, %xmm4, %xmm12 addsd %xmm4, %xmm0 andps %xmm15, %xmm5 addsd %xmm12, %xmm1 pshufd $0x4e, %xmm5, %xmm13 addsd %xmm5, %xmm2 addsd %xmm13, %xmm3 ALIGN_3 .L14: testq $2, M je .L15 movaps -16 * SIZE(X), %xmm4 addq $2 * SIZE, X andps %xmm15, %xmm4 pshufd $0x4e, %xmm4, %xmm5 addsd %xmm4, %xmm2 addsd %xmm5, %xmm3 ALIGN_3 .L15: testq $1, M je .L998 movsd -16 * SIZE(X), %xmm4 andps %xmm15, %xmm4 addsd %xmm4, %xmm0 jmp .L998 ALIGN_3 .L20: movq M, I sarq $2, I jle .L25 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 addq INCX, X movsd 0 * SIZE(X), %xmm6 movsd 1 * SIZE(X), %xmm7 addq INCX, X movsd 0 * SIZE(X), %xmm8 movsd 1 * SIZE(X), %xmm9 addq INCX, X movsd 0 * SIZE(X), %xmm10 movsd 1 * SIZE(X), %xmm11 decq I jle .L23 ALIGN_4 .L22: andps %xmm15, %xmm4 addq INCX, X addsd %xmm4, %xmm0 movsd 0 * SIZE(X), %xmm4 andps %xmm15, %xmm5 addsd %xmm5, %xmm1 movsd 1 * SIZE(X), %xmm5 andps %xmm15, %xmm6 addq INCX, X addsd %xmm6, %xmm2 movsd 0 * SIZE(X), %xmm6 andps %xmm15, %xmm7 addsd %xmm7, %xmm3 movsd 1 * SIZE(X), %xmm7 andps %xmm15, %xmm8 addq INCX, X addsd %xmm8, %xmm0 movsd 0 * SIZE(X), %xmm8 andps %xmm15, %xmm9 addsd %xmm9, %xmm1 movsd 1 * SIZE(X), %xmm9 andps %xmm15, %xmm10 addq INCX, X addsd %xmm10, %xmm2 movsd 0 * SIZE(X), %xmm10 andps %xmm15, %xmm11 addsd %xmm11, %xmm3 movsd 1 * SIZE(X), %xmm11 decq I jg .L22 ALIGN_4 .L23: andps %xmm15, %xmm4 addq INCX, X addsd %xmm4, %xmm0 andps %xmm15, %xmm5 addsd %xmm5, %xmm1 andps %xmm15, %xmm6 addsd %xmm6, %xmm2 andps %xmm15, %xmm7 addsd %xmm7, %xmm3 andps %xmm15, %xmm8 addsd %xmm8, %xmm0 andps %xmm15, %xmm9 addsd %xmm9, %xmm1 andps %xmm15, %xmm10 addsd %xmm10, %xmm2 andps %xmm15, %xmm11 addsd %xmm11, %xmm3 ALIGN_3 .L25: testq $2, M je .L26 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 addq INCX, X movsd 0 * SIZE(X), %xmm6 andps %xmm15, %xmm4 addsd %xmm4, %xmm0 movsd 1 * SIZE(X), %xmm7 andps %xmm15, %xmm5 addsd %xmm5, %xmm1 addq INCX, X andps %xmm15, %xmm6 addsd %xmm6, %xmm2 andps %xmm15, %xmm7 addsd %xmm7, %xmm3 ALIGN_3 .L26: testq $1, M je .L998 movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 addq INCX, X andps %xmm15, %xmm4 andps %xmm15, %xmm5 addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 ALIGN_3 .L998: addsd %xmm1, %xmm0 addsd %xmm3, %xmm2 addsd %xmm2, %xmm0 ALIGN_4 .L999: RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zasum_sse.S000066400000000000000000000150611313527062700175220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS pxor %xmm0, %xmm0 testq M, M jle .L999 testq INCX, INCX jle .L999 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 pcmpeqb %xmm15, %xmm15 psrld $1, %xmm15 salq $ZBASE_SHIFT, INCX cmpq $2 * SIZE, INCX jne .L100 subq $-32 * SIZE, X addq M, M cmpq $3, M jle .L18 testq $4, X je .L05 movss -32 * SIZE(X), %xmm0 andps %xmm15, %xmm0 addq $SIZE, X decq M jle .L998 ALIGN_3 .L05: testq $8, X je .L10 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(X), %xmm1 andps %xmm15, %xmm1 addq $2 * SIZE, X subq $2, M jle .L998 ALIGN_3 .L10: movq M, I sarq $5, I jle .L14 movaps -32 * SIZE(X), %xmm4 movaps -28 * SIZE(X), %xmm5 movaps -24 * SIZE(X), %xmm6 movaps -20 * SIZE(X), %xmm7 movaps -16 * SIZE(X), %xmm8 movaps -12 * SIZE(X), %xmm9 movaps -8 * SIZE(X), %xmm10 movaps -4 * SIZE(X), %xmm11 decq I jle .L12 ALIGN_3 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif andps %xmm15, %xmm4 addps %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 andps %xmm15, %xmm5 addps %xmm5, %xmm1 movaps 4 * SIZE(X), %xmm5 andps %xmm15, %xmm6 addps %xmm6, %xmm2 movaps 8 * SIZE(X), %xmm6 andps %xmm15, %xmm7 addps %xmm7, %xmm3 movaps 12 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif andps %xmm15, %xmm8 addps %xmm8, %xmm0 movaps 16 * SIZE(X), %xmm8 andps %xmm15, %xmm9 addps %xmm9, %xmm1 movaps 20 * SIZE(X), %xmm9 andps %xmm15, %xmm10 addps %xmm10, %xmm2 movaps 24 * SIZE(X), %xmm10 andps %xmm15, %xmm11 addps %xmm11, %xmm3 movaps 28 * SIZE(X), %xmm11 subq $-32 * SIZE, X decq I jg .L11 ALIGN_3 .L12: andps %xmm15, %xmm4 addps %xmm4, %xmm0 andps %xmm15, %xmm5 addps %xmm5, %xmm1 andps %xmm15, %xmm6 addps %xmm6, %xmm2 andps %xmm15, %xmm7 addps %xmm7, %xmm3 andps %xmm15, %xmm8 addps %xmm8, %xmm0 andps %xmm15, %xmm9 addps %xmm9, %xmm1 andps %xmm15, %xmm10 addps %xmm10, %xmm2 andps %xmm15, %xmm11 addps %xmm11, %xmm3 addq $32 * SIZE, X ALIGN_3 .L14: testq $31, M jle .L998 .L15: testq $16, M je .L16 movaps -32 * SIZE(X), %xmm4 andps %xmm15, %xmm4 addps %xmm4, %xmm0 movaps -28 * SIZE(X), %xmm5 andps %xmm15, %xmm5 addps %xmm5, %xmm1 movaps -24 * SIZE(X), %xmm4 andps %xmm15, %xmm4 addps %xmm4, %xmm0 movaps -20 * SIZE(X), %xmm5 andps %xmm15, %xmm5 addps %xmm5, %xmm1 addq $16 * SIZE, X ALIGN_3 .L16: testq $8, M je .L17 movaps -32 * SIZE(X), %xmm4 andps %xmm15, %xmm4 addps %xmm4, %xmm0 movaps -28 * SIZE(X), %xmm5 andps %xmm15, %xmm5 addps %xmm5, %xmm1 addq $8 * SIZE, X ALIGN_3 .L17: testq $4, 
M je .L18 movaps -32 * SIZE(X), %xmm6 andps %xmm15, %xmm6 addps %xmm6, %xmm2 addq $4 * SIZE, X ALIGN_3 .L18: testq $2, M je .L19 #ifdef movsd xorps %xmm7, %xmm7 #endif movsd -32 * SIZE(X), %xmm7 andps %xmm15, %xmm7 addps %xmm7, %xmm3 addq $2 * SIZE, X ALIGN_3 .L19: testq $1, M je .L998 movss -32 * SIZE(X), %xmm6 andps %xmm15, %xmm6 addps %xmm6, %xmm2 jmp .L998 ALIGN_4 .L100: movq M, I sarq $2, I jle .L105 ALIGN_4 .L101: movsd (X), %xmm4 addq INCX, X movhps (X), %xmm4 addq INCX, X andps %xmm15, %xmm4 addps %xmm4, %xmm0 movsd (X), %xmm5 addq INCX, X movhps (X), %xmm5 addq INCX, X andps %xmm15, %xmm5 addps %xmm5, %xmm1 decq I jg .L101 ALIGN_4 .L105: #ifdef movsd xorps %xmm4, %xmm4 #endif andq $3, M jle .L998 ALIGN_4 .L106: movsd (X), %xmm4 andps %xmm15, %xmm4 addps %xmm4, %xmm0 addq INCX, X decq M jg .L106 ALIGN_4 .L998: addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm2, %xmm0 #ifndef HAVE_SSE3 movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 addss %xmm1, %xmm0 #else haddps %xmm0, %xmm0 haddps %xmm0, %xmm0 #endif ALIGN_4 .L999: RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zasum_sse2.S000066400000000000000000000150601313527062700176030ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS xorps %xmm0, %xmm0 testq M, M jle .L999 testq INCX, INCX jle .L999 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 pcmpeqb %xmm15, %xmm15 psrlq $1, %xmm15 salq $ZBASE_SHIFT, INCX cmpq $2 * SIZE, INCX jne .L40 subq $-16 * SIZE, X addq M, M testq $SIZE, X je .L05 #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -16 * SIZE(X), %xmm0 addq $SIZE, X andps %xmm15, %xmm0 subq $1, M jle .L999 ALIGN_3 .L05: movq M, I sarq $4, I jle .L20 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -12 * SIZE(X), %xmm6 movaps -10 * SIZE(X), %xmm7 movaps -8 * SIZE(X), %xmm8 movaps -6 * SIZE(X), %xmm9 movaps -4 * SIZE(X), %xmm10 movaps -2 * SIZE(X), %xmm11 decq I jle .L11 ALIGN_4 .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif andps %xmm15, %xmm4 addpd %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 andps %xmm15, %xmm5 addpd %xmm5, %xmm1 movaps 2 * SIZE(X), %xmm5 andps %xmm15, %xmm6 addpd %xmm6, %xmm2 movaps 4 * SIZE(X), %xmm6 andps %xmm15, %xmm7 addpd %xmm7, %xmm3 movaps 6 * SIZE(X), %xmm7 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif andps %xmm15, %xmm8 addpd %xmm8, %xmm0 movaps 8 * SIZE(X), %xmm8 andps %xmm15, %xmm9 addpd %xmm9, %xmm1 movaps 10 * SIZE(X), %xmm9 andps %xmm15, %xmm10 addpd %xmm10, %xmm2 movaps 12 * SIZE(X), %xmm10 andps %xmm15, %xmm11 addpd %xmm11, %xmm3 movaps 14 * SIZE(X), %xmm11 subq $-16 * SIZE, X decq I jg .L10 ALIGN_4 .L11: andps %xmm15, %xmm4 andps %xmm15, %xmm5 andps %xmm15, %xmm6 andps %xmm15, %xmm7 addpd %xmm4, %xmm0 addpd %xmm5, %xmm1 addpd %xmm6, %xmm2 addpd %xmm7, %xmm3 andps %xmm15, %xmm8 andps %xmm15, %xmm9 andps %xmm15, %xmm10 andps %xmm15, %xmm11 addpd %xmm8, %xmm0 addpd %xmm9, %xmm1 addpd %xmm10, %xmm2 addpd %xmm11, %xmm3 subq $-16 * SIZE, X ALIGN_3 .L20: andq $15, M jle .L998 testq $8, M je .L21 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -12 * SIZE(X), %xmm6 movaps -10 * SIZE(X), %xmm7 andps %xmm15, %xmm4 andps %xmm15, %xmm5 andps %xmm15, %xmm6 andps %xmm15, %xmm7 addpd %xmm4, %xmm0 addpd %xmm5, %xmm1 addpd %xmm6, %xmm2 addpd %xmm7, %xmm3 addq $8 * SIZE, X ALIGN_3 .L21: testq $4, M je .L22 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 andps %xmm15, %xmm4 andps %xmm15, %xmm5 addpd %xmm4, %xmm0 addpd %xmm5, %xmm1 addq $4 * SIZE, X ALIGN_3 .L22: testq $2, M je .L23 movaps -16 * SIZE(X), %xmm6 andps %xmm15, %xmm6 addpd %xmm6, %xmm3 addq $2 * SIZE, X .L23: testq $1, M je .L998 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -16 * SIZE(X), %xmm4 andps %xmm15, %xmm4 addsd %xmm4, %xmm0 jmp .L998 ALIGN_3 .L40: movq M, I sarq $2, I jle .L60 ALIGN_4 .L50: #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) prefetcht0 PREFETCHSIZE * SIZE(X) #endif #ifdef PENTIUM4 prefetchnta PREFETCHSIZE * SIZE(X) #endif movsd 0 * SIZE(X), %xmm4 movhpd 1 * SIZE(X), %xmm4 addq INCX, X andpd %xmm15, %xmm4 addpd %xmm4, %xmm0 movsd 0 * SIZE(X), %xmm5 movhpd 1 * SIZE(X), %xmm5 addq INCX, X andpd %xmm15, %xmm5 addpd %xmm5, %xmm1 movsd 0 * SIZE(X), %xmm6 movhpd 1 * SIZE(X), %xmm6 addq INCX, X andpd %xmm15, %xmm6 addpd %xmm6, %xmm2 movsd 0 * SIZE(X), %xmm7 movhpd 1 * SIZE(X), %xmm7 addq INCX, X andpd %xmm15, %xmm7 addpd %xmm7, %xmm3 decq I jg .L50 ALIGN_4 .L60: andq $3, M jle .L998 ALIGN_4 .L61: movsd 0 * SIZE(X), %xmm4 movhpd 1 * SIZE(X), 
%xmm4 andpd %xmm15, %xmm4 addpd %xmm4, %xmm0 addq INCX, X decq M jg .L61 ALIGN_4 .L998: addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm2, %xmm0 #ifndef HAVE_SSE3 movhlps %xmm0, %xmm1 addsd %xmm1, %xmm0 #else haddpd %xmm0, %xmm0 #endif ALIGN_4 .L999: RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zaxpy.S000066400000000000000000000153031313527062700166630ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG4 /* rsi */ #define INCX ARG5 /* rdx */ #define Y ARG6 /* rcx */ #define INCY ARG2 /* r8 */ #ifndef CONJ #define ADD1 fsubrp #define ADD2 faddp #else #define ADD1 faddp #define ADD2 fsubrp #endif #define ALPHA_R 8(%rsp) #define ALPHA_I 24(%rsp) #include "l1param.h" PROLOGUE PROFCODE FLD ALPHA_I FLD ALPHA_R movq 40(%rsp), INCY salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY testq M, M jle .L40 cmpq $2 * SIZE, INCX jne .L14 cmpq $2 * SIZE, INCY jne .L14 movq M, %rax sarq $2, %rax jle .L15 ALIGN_3 .L16: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) fmul %st(1), %st FLD 1 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FLD 0 * SIZE(Y) faddp %st, %st(1) FST 0 * SIZE(Y) FLD 0 * SIZE(X) fmul %st(2), %st FLD 1 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FLD 1 * SIZE(Y) faddp %st, %st(1) FST 1 * SIZE(Y) FLD 2 * SIZE(X) fmul %st(1), %st FLD 3 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FLD 2 * SIZE(Y) faddp %st, %st(1) FST 2 * SIZE(Y) FLD 2 * SIZE(X) fmul %st(2), %st FLD 3 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FLD 3 * SIZE(Y) faddp %st, %st(1) FST 3 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif FLD 4 * SIZE(X) fmul %st(1), %st FLD 5 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FLD 4 * SIZE(Y) faddp %st, %st(1) FST 4 * SIZE(Y) FLD 4 * SIZE(X) fmul %st(2), %st FLD 5 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FLD 5 * SIZE(Y) faddp %st, %st(1) FST 5 * SIZE(Y) FLD 6 * SIZE(X) fmul %st(1), %st FLD 7 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FLD 6 * SIZE(Y) faddp %st, %st(1) FST 6 * SIZE(Y) FLD 6 * SIZE(X) fmul %st(2), %st FLD 7 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FLD 7 * SIZE(Y) faddp %st, %st(1) FST 7 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y decq %rax jg .L16 ALIGN_3 .L15: movq M, %rax andq $3, %rax jle .L40 ALIGN_3 .L22: FLD 0 * SIZE(X) fmul %st(1), %st FLD 1 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FLD 0 * SIZE(Y) faddp %st, %st(1) FST 0 * SIZE(Y) FLD 0 * SIZE(X) fmul %st(2), %st FLD 1 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FLD 1 * SIZE(Y) faddp %st, %st(1) FST 1 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y decq %rax jg .L22 jmp .L40 ALIGN_3 .L14: movq M, %rax sarq $2, %rax jle .L28 ALIGN_3 .L29: FLD 0 * SIZE(X) fmul %st(1), %st FLD 1 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FLD 0 * SIZE(Y) faddp %st, %st(1) FST 0 * SIZE(Y) FLD 0 * SIZE(X) fmul %st(2), %st FLD 1 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FLD 1 * SIZE(Y) faddp %st, %st(1) FST 1 * SIZE(Y) addq INCX, X addq INCY, Y FLD 0 * SIZE(X) fmul %st(1), %st FLD 1 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FLD 0 * SIZE(Y) faddp %st, %st(1) FST 0 * SIZE(Y) FLD 0 * SIZE(X) fmul %st(2), %st FLD 1 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FLD 1 * SIZE(Y) faddp %st, %st(1) FST 1 * SIZE(Y) addq INCX, X addq INCY, Y FLD 0 * SIZE(X) fmul %st(1), %st FLD 1 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FLD 0 * SIZE(Y) faddp %st, %st(1) FST 0 * SIZE(Y) FLD 0 * SIZE(X) fmul %st(2), %st FLD 1 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FLD 1 * SIZE(Y) faddp %st, %st(1) FST 1 * SIZE(Y) addq INCX, X addq INCY, Y FLD 0 * SIZE(X) fmul %st(1), %st FLD 1 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FLD 0 * SIZE(Y) faddp %st, %st(1) FST 0 * SIZE(Y) FLD 0 * SIZE(X) fmul %st(2), %st FLD 1 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FLD 1 * SIZE(Y) faddp %st, %st(1) FST 1 * SIZE(Y) addq INCX, X addq INCY, Y decq %rax jg .L29 
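/* .L28/.L35: remainder of the strided path.  The unrolled loop above
   (.L29) applied the y += alpha * x update to four complex elements per
   pass; the loop below handles the remaining M mod 4 elements one
   complex pair (real part at 0 * SIZE, imaginary part at 1 * SIZE) at a
   time, advancing X and Y by their strides. */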
ALIGN_3 .L28: movq M, %rax andq $3, %rax jle .L40 ALIGN_3 .L35: FLD 0 * SIZE(X) fmul %st(1), %st FLD 1 * SIZE(X) fmul %st(3), %st ADD1 %st, %st(1) FLD 0 * SIZE(Y) faddp %st, %st(1) FST 0 * SIZE(Y) FLD 0 * SIZE(X) fmul %st(2), %st FLD 1 * SIZE(X) fmul %st(2), %st ADD2 %st, %st(1) FLD 1 * SIZE(Y) faddp %st, %st(1) FST 1 * SIZE(Y) addq INCX, X addq INCY, Y decq %rax jg .L35 ALIGN_3 .L40: ffreep %st(0) ffreep %st(0) ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zaxpy.c000066400000000000000000000076351313527062700167140ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" #if defined(BULLDOZER) #include "zaxpy_microk_bulldozer-2.c" #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zaxpy_microk_steamroller-2.c" #elif defined(HASWELL) || defined(ZEN) #include "zaxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zaxpy_microk_sandy-2.c" #endif #ifndef HAVE_KERNEL_4 static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; BLASLONG register ix = 0; FLOAT da_r = alpha[0]; FLOAT da_i = alpha[1]; while(i < n) { #if !defined(CONJ) y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ; y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ; #else y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ; y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ; #endif ix+=4 ; i+=2 ; } } #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; BLASLONG ix=0,iy=0; FLOAT da[2]; if ( n <= 0 ) return(0); if ( (inc_x == 1) && (inc_y == 1) ) { BLASLONG n1 = n & -16; if ( n1 ) { da[0] = da_r; da[1] = da_i; zaxpy_kernel_4(n1, x, y , da ); ix = 2 * n1; } i = n1; while(i < n) { #if !defined(CONJ) y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; #else y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; #endif i++ ; ix += 2; } return(0); } inc_x *=2; inc_y *=2; while(i < n) { #if !defined(CONJ) y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; #else y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; #endif ix += inc_x ; iy += inc_y ; i++ ; } return(0); } OpenBLAS-0.2.20/kernel/x86_64/zaxpy_atom.S000066400000000000000000000323751313527062700177130ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 #define X ARG4 #define INCX ARG5 #define Y ARG6 #define INCY ARG2 #else #define M ARG1 #define X ARG2 #define INCX ARG3 #define Y ARG4 #define INCY %r10 #endif #define YY %r11 #define ALPHA_R %xmm14 #define ALPHA_I %xmm15 #include "l1param.h" PROLOGUE PROFCODE #ifndef WINDOWS_ABI #ifndef XDOUBLE movq 8(%rsp), INCY #else movq 40(%rsp), INCY #endif #else movaps %xmm3, %xmm0 movsd 40(%rsp), %xmm1 movq 48(%rsp), X movq 56(%rsp), INCX movq 64(%rsp), Y movq 72(%rsp), INCY #endif SAVEREGISTERS #ifndef CONJ #define ADD1 subsd #define ADD2 addsd #else #define ADD1 addsd #define ADD2 subsd #endif salq $ZBASE_SHIFT, INCX movaps %xmm0, ALPHA_R salq $ZBASE_SHIFT, INCY movaps %xmm1, ALPHA_I testq M, M jle .L999 cmpq $2 * SIZE, INCX jne .L20 cmpq $2 * SIZE, INCY jne .L20 movq M, %rax sarq $2, %rax jle .L15 movsd 0 * SIZE(X), %xmm0 movsd 1 * SIZE(X), %xmm1 movsd 0 * SIZE(Y), %xmm8 movsd 1 * SIZE(Y), %xmm9 movsd 2 * SIZE(X), %xmm4 movsd 3 * SIZE(X), %xmm5 movsd 2 * SIZE(Y), %xmm10 movsd 3 * SIZE(Y), %xmm11 movaps %xmm0, %xmm2 mulsd ALPHA_R, %xmm0 movaps %xmm1, %xmm3 mulsd ALPHA_R, %xmm1 mulsd ALPHA_I, %xmm3 mulsd ALPHA_I, %xmm2 movaps %xmm4, %xmm6 mulsd ALPHA_R, %xmm4 addsd %xmm0, %xmm8 movsd 4 * SIZE(X), %xmm0 movaps %xmm5, %xmm7 mulsd ALPHA_R, %xmm5 ADD2 %xmm1, %xmm9 movsd 5 * SIZE(X), %xmm1 decq %rax jle .L12 ALIGN_3 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif mulsd ALPHA_I, %xmm7 movsd 4 * SIZE(Y), %xmm12 ADD1 %xmm3, %xmm8 mulsd ALPHA_I, %xmm6 movsd 5 * SIZE(Y), %xmm13 addsd %xmm2, %xmm9 addsd %xmm4, %xmm10 movsd 6 * SIZE(X), %xmm4 movaps %xmm0, %xmm2 mulsd ALPHA_R, %xmm0 ADD2 %xmm5, %xmm11 movsd 7 * SIZE(X), %xmm5 movaps %xmm1, %xmm3 mulsd ALPHA_R, %xmm1 ADD1 %xmm7, %xmm10 movsd %xmm8, 0 * SIZE(Y) mulsd ALPHA_I, %xmm3 addsd %xmm6, %xmm11 movsd %xmm9, 1 * SIZE(Y) mulsd ALPHA_I, %xmm2 movaps %xmm4, %xmm6 movsd %xmm10, 2 * SIZE(Y) mulsd ALPHA_R, %xmm4 movsd 6 * SIZE(Y), %xmm10 addsd %xmm0, %xmm12 movsd 8 * SIZE(X), %xmm0 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps %xmm5, %xmm7 movsd %xmm11, 3 * SIZE(Y) mulsd ALPHA_R, %xmm5 movsd 7 * SIZE(Y), %xmm11 ADD2 %xmm1, %xmm13 movsd 9 * SIZE(X), %xmm1 mulsd ALPHA_I, %xmm7 movsd 8 * SIZE(Y), %xmm8 ADD1 %xmm3, %xmm12 mulsd ALPHA_I, %xmm6 movsd 9 * SIZE(Y), %xmm9 addsd %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd ALPHA_R, %xmm0 addsd %xmm4, %xmm10 movsd 10 * SIZE(X), %xmm4 movaps %xmm1, %xmm3 mulsd ALPHA_R, %xmm1 ADD2 %xmm5, %xmm11 movsd 11 * SIZE(X), %xmm5 mulsd ALPHA_I, %xmm3 movsd %xmm12, 4 * SIZE(Y) ADD1 %xmm7, %xmm10 mulsd ALPHA_I, %xmm2 
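/* Each complex element costs four scalar multiplies: t0 = ALPHA_R*x_r,
   t1 = ALPHA_R*x_i, t2 = ALPHA_I*x_r, t3 = ALPHA_I*x_i, accumulated as
   y_r += t0, ADD1 t3 and y_i += t2, ADD2 t1.  With CONJ undefined,
   ADD1 is subsd and ADD2 is addsd, giving y_r += ar*xr - ai*xi and
   y_i += ai*xr + ar*xi; defining CONJ swaps ADD1/ADD2 for the
   conjugated variant.  The loop is software-pipelined, so loads for the
   next group of elements are interleaved with the arithmetic of the
   current one (t0..t3 are just shorthand for the rotating xmm
   temporaries). */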
movsd %xmm13, 5 * SIZE(Y) addsd %xmm6, %xmm11 movaps %xmm4, %xmm6 movsd %xmm10, 6 * SIZE(Y) mulsd ALPHA_R, %xmm4 addsd %xmm0, %xmm8 movsd 10 * SIZE(Y), %xmm10 movsd 12 * SIZE(X), %xmm0 movaps %xmm5, %xmm7 movsd %xmm11, 7 * SIZE(Y) mulsd ALPHA_R, %xmm5 movsd 11 * SIZE(Y), %xmm11 ADD2 %xmm1, %xmm9 movsd 13 * SIZE(X), %xmm1 addq $8 * SIZE, X addq $8 * SIZE, Y decq %rax jg .L11 ALIGN_3 .L12: mulsd ALPHA_I, %xmm7 movsd 4 * SIZE(Y), %xmm12 ADD1 %xmm3, %xmm8 mulsd ALPHA_I, %xmm6 movsd 5 * SIZE(Y), %xmm13 addsd %xmm2, %xmm9 addsd %xmm4, %xmm10 movsd 6 * SIZE(X), %xmm4 movaps %xmm0, %xmm2 mulsd ALPHA_R, %xmm0 ADD2 %xmm5, %xmm11 movsd 7 * SIZE(X), %xmm5 movaps %xmm1, %xmm3 mulsd ALPHA_R, %xmm1 ADD1 %xmm7, %xmm10 movsd %xmm8, 0 * SIZE(Y) mulsd ALPHA_I, %xmm3 addsd %xmm6, %xmm11 movsd %xmm9, 1 * SIZE(Y) mulsd ALPHA_I, %xmm2 movaps %xmm4, %xmm6 movsd %xmm10, 2 * SIZE(Y) mulsd ALPHA_R, %xmm4 movsd 6 * SIZE(Y), %xmm10 addsd %xmm0, %xmm12 movaps %xmm5, %xmm7 movsd %xmm11, 3 * SIZE(Y) mulsd ALPHA_R, %xmm5 ADD2 %xmm1, %xmm13 movsd 7 * SIZE(Y), %xmm11 mulsd ALPHA_I, %xmm7 ADD1 %xmm3, %xmm12 mulsd ALPHA_I, %xmm6 addsd %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd ALPHA_R, %xmm0 addsd %xmm4, %xmm10 movaps %xmm1, %xmm3 mulsd ALPHA_R, %xmm1 ADD2 %xmm5, %xmm11 mulsd ALPHA_I, %xmm3 ADD1 %xmm7, %xmm10 addsd %xmm6, %xmm11 mulsd ALPHA_I, %xmm2 movsd %xmm12, 4 * SIZE(Y) movsd %xmm13, 5 * SIZE(Y) movsd %xmm10, 6 * SIZE(Y) movsd %xmm11, 7 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L15: movq M, %rax andq $2, %rax jle .L17 movsd 0 * SIZE(X), %xmm0 movsd 1 * SIZE(X), %xmm1 movsd 2 * SIZE(X), %xmm4 movsd 3 * SIZE(X), %xmm5 movaps %xmm0, %xmm2 movsd 0 * SIZE(Y), %xmm8 mulsd ALPHA_R, %xmm0 movaps %xmm1, %xmm3 movsd 1 * SIZE(Y), %xmm9 mulsd ALPHA_R, %xmm1 movsd 2 * SIZE(Y), %xmm10 mulsd ALPHA_I, %xmm3 movsd 3 * SIZE(Y), %xmm11 mulsd ALPHA_I, %xmm2 movaps %xmm4, %xmm6 mulsd ALPHA_R, %xmm4 addsd %xmm0, %xmm8 movaps %xmm5, %xmm7 mulsd ALPHA_R, %xmm5 ADD2 %xmm1, %xmm9 mulsd ALPHA_I, %xmm7 ADD1 %xmm3, %xmm8 mulsd ALPHA_I, %xmm6 addsd %xmm2, %xmm9 addsd %xmm4, %xmm10 movsd %xmm8, 0 * SIZE(Y) ADD2 %xmm5, %xmm11 movsd %xmm9, 1 * SIZE(Y) ADD1 %xmm7, %xmm10 addsd %xmm6, %xmm11 movsd %xmm10, 2 * SIZE(Y) movsd %xmm11, 3 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L17: movq M, %rax andq $1, %rax jle .L999 movsd 0 * SIZE(X), %xmm0 movsd 1 * SIZE(X), %xmm1 movsd 0 * SIZE(Y), %xmm8 movsd 1 * SIZE(Y), %xmm9 movaps %xmm0, %xmm2 mulsd ALPHA_R, %xmm0 movaps %xmm1, %xmm3 mulsd ALPHA_R, %xmm1 mulsd ALPHA_I, %xmm3 mulsd ALPHA_I, %xmm2 addsd %xmm0, %xmm8 ADD2 %xmm1, %xmm9 ADD1 %xmm3, %xmm8 addsd %xmm2, %xmm9 movsd %xmm8, 0 * SIZE(Y) movsd %xmm9, 1 * SIZE(Y) jmp .L999 ALIGN_3 .L20: movq Y, YY movq M, %rax sarq $2, %rax jle .L25 movsd 0 * SIZE(X), %xmm0 movsd 1 * SIZE(X), %xmm1 addq INCX, X movsd 0 * SIZE(Y), %xmm8 movsd 1 * SIZE(Y), %xmm9 addq INCY, Y movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 addq INCX, X movsd 0 * SIZE(Y), %xmm10 movsd 1 * SIZE(Y), %xmm11 addq INCY, Y movaps %xmm0, %xmm2 mulsd ALPHA_R, %xmm0 movaps %xmm1, %xmm3 mulsd ALPHA_R, %xmm1 mulsd ALPHA_I, %xmm3 mulsd ALPHA_I, %xmm2 movaps %xmm4, %xmm6 mulsd ALPHA_R, %xmm4 addsd %xmm0, %xmm8 movsd 0 * SIZE(X), %xmm0 movaps %xmm5, %xmm7 mulsd ALPHA_R, %xmm5 ADD2 %xmm1, %xmm9 movsd 1 * SIZE(X), %xmm1 addq INCX, X decq %rax jle .L22 ALIGN_3 .L21: mulsd ALPHA_I, %xmm7 movsd 0 * SIZE(Y), %xmm12 ADD1 %xmm3, %xmm8 mulsd ALPHA_I, %xmm6 movsd 1 * SIZE(Y), %xmm13 addsd %xmm2, %xmm9 addq INCY, Y addsd %xmm4, %xmm10 movsd 0 * SIZE(X), %xmm4 movaps %xmm0, %xmm2 mulsd ALPHA_R, %xmm0 
ADD2 %xmm5, %xmm11 movsd 1 * SIZE(X), %xmm5 movaps %xmm1, %xmm3 addq INCX, X mulsd ALPHA_R, %xmm1 ADD1 %xmm7, %xmm10 movsd %xmm8, 0 * SIZE(YY) mulsd ALPHA_I, %xmm3 addsd %xmm6, %xmm11 movsd %xmm9, 1 * SIZE(YY) mulsd ALPHA_I, %xmm2 addq INCY, YY movaps %xmm4, %xmm6 movsd %xmm10, 0 * SIZE(YY) mulsd ALPHA_R, %xmm4 movsd 0 * SIZE(Y), %xmm10 addsd %xmm0, %xmm12 movsd 0 * SIZE(X), %xmm0 movaps %xmm5, %xmm7 movsd %xmm11, 1 * SIZE(YY) addq INCY, YY mulsd ALPHA_R, %xmm5 movsd 1 * SIZE(Y), %xmm11 addq INCY, Y ADD2 %xmm1, %xmm13 movsd 1 * SIZE(X), %xmm1 addq INCX, X mulsd ALPHA_I, %xmm7 movsd 0 * SIZE(Y), %xmm8 ADD1 %xmm3, %xmm12 mulsd ALPHA_I, %xmm6 movsd 1 * SIZE(Y), %xmm9 addsd %xmm2, %xmm13 addq INCY, Y movaps %xmm0, %xmm2 mulsd ALPHA_R, %xmm0 addsd %xmm4, %xmm10 movsd 0 * SIZE(X), %xmm4 movaps %xmm1, %xmm3 mulsd ALPHA_R, %xmm1 ADD2 %xmm5, %xmm11 movsd 1 * SIZE(X), %xmm5 addq INCX, X mulsd ALPHA_I, %xmm3 movsd %xmm12, 0 * SIZE(YY) ADD1 %xmm7, %xmm10 mulsd ALPHA_I, %xmm2 movsd %xmm13, 1 * SIZE(YY) addsd %xmm6, %xmm11 addq INCY, YY movaps %xmm4, %xmm6 movsd %xmm10, 0 * SIZE(YY) mulsd ALPHA_R, %xmm4 addsd %xmm0, %xmm8 movsd 0 * SIZE(Y), %xmm10 movsd 0 * SIZE(X), %xmm0 movaps %xmm5, %xmm7 movsd %xmm11, 1 * SIZE(YY) addq INCY, YY mulsd ALPHA_R, %xmm5 movsd 1 * SIZE(Y), %xmm11 addq INCY, Y ADD2 %xmm1, %xmm9 movsd 1 * SIZE(X), %xmm1 addq INCX, X decq %rax jg .L21 ALIGN_3 .L22: mulsd ALPHA_I, %xmm7 movsd 0 * SIZE(Y), %xmm12 ADD1 %xmm3, %xmm8 mulsd ALPHA_I, %xmm6 movsd 1 * SIZE(Y), %xmm13 addsd %xmm2, %xmm9 addq INCY, Y addsd %xmm4, %xmm10 movsd 0 * SIZE(X), %xmm4 movaps %xmm0, %xmm2 mulsd ALPHA_R, %xmm0 ADD2 %xmm5, %xmm11 movsd 1 * SIZE(X), %xmm5 movaps %xmm1, %xmm3 addq INCX, X mulsd ALPHA_R, %xmm1 ADD1 %xmm7, %xmm10 movsd %xmm8, 0 * SIZE(YY) mulsd ALPHA_I, %xmm3 addsd %xmm6, %xmm11 movsd %xmm9, 1 * SIZE(YY) mulsd ALPHA_I, %xmm2 addq INCY, YY movaps %xmm4, %xmm6 movsd %xmm10, 0 * SIZE(YY) mulsd ALPHA_R, %xmm4 movsd 0 * SIZE(Y), %xmm10 addsd %xmm0, %xmm12 movaps %xmm5, %xmm7 movsd %xmm11, 1 * SIZE(YY) mulsd ALPHA_R, %xmm5 addq INCY, YY ADD2 %xmm1, %xmm13 movsd 1 * SIZE(Y), %xmm11 mulsd ALPHA_I, %xmm7 addq INCY, Y ADD1 %xmm3, %xmm12 mulsd ALPHA_I, %xmm6 addsd %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd ALPHA_R, %xmm0 addsd %xmm4, %xmm10 movaps %xmm1, %xmm3 mulsd ALPHA_R, %xmm1 ADD2 %xmm5, %xmm11 mulsd ALPHA_I, %xmm3 ADD1 %xmm7, %xmm10 addsd %xmm6, %xmm11 mulsd ALPHA_I, %xmm2 movsd %xmm12, 0 * SIZE(YY) movsd %xmm13, 1 * SIZE(YY) addq INCY, YY movsd %xmm10, 0 * SIZE(YY) movsd %xmm11, 1 * SIZE(YY) addq INCY, YY ALIGN_3 .L25: movq M, %rax andq $2, %rax jle .L27 movsd 0 * SIZE(X), %xmm0 movsd 1 * SIZE(X), %xmm1 addq INCX, X movsd 0 * SIZE(X), %xmm4 movsd 1 * SIZE(X), %xmm5 addq INCX, X movaps %xmm0, %xmm2 movsd 0 * SIZE(Y), %xmm8 mulsd ALPHA_R, %xmm0 movaps %xmm1, %xmm3 movsd 1 * SIZE(Y), %xmm9 addq INCY, Y mulsd ALPHA_R, %xmm1 movsd 0 * SIZE(Y), %xmm10 mulsd ALPHA_I, %xmm3 movsd 1 * SIZE(Y), %xmm11 mulsd ALPHA_I, %xmm2 addq INCY, Y movaps %xmm4, %xmm6 mulsd ALPHA_R, %xmm4 addsd %xmm0, %xmm8 movaps %xmm5, %xmm7 mulsd ALPHA_R, %xmm5 ADD2 %xmm1, %xmm9 mulsd ALPHA_I, %xmm7 ADD1 %xmm3, %xmm8 mulsd ALPHA_I, %xmm6 addsd %xmm2, %xmm9 addsd %xmm4, %xmm10 movsd %xmm8, 0 * SIZE(YY) ADD2 %xmm5, %xmm11 movsd %xmm9, 1 * SIZE(YY) ADD1 %xmm7, %xmm10 addq INCY, YY addsd %xmm6, %xmm11 movsd %xmm10, 0 * SIZE(YY) movsd %xmm11, 1 * SIZE(YY) addq INCY, YY ALIGN_3 .L27: movq M, %rax andq $1, %rax jle .L999 movsd 0 * SIZE(X), %xmm0 movsd 1 * SIZE(X), %xmm1 movsd 0 * SIZE(Y), %xmm8 movsd 1 * SIZE(Y), %xmm9 movaps %xmm0, %xmm2 mulsd 
ALPHA_R, %xmm0 movaps %xmm1, %xmm3 mulsd ALPHA_R, %xmm1 mulsd ALPHA_I, %xmm3 mulsd ALPHA_I, %xmm2 addsd %xmm0, %xmm8 ADD2 %xmm1, %xmm9 ADD1 %xmm3, %xmm8 addsd %xmm2, %xmm9 movsd %xmm8, 0 * SIZE(YY) movsd %xmm9, 1 * SIZE(YY) ALIGN_3 .L999: xorq %rax, %rax RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zaxpy_microk_bulldozer-2.c000066400000000000000000000170531313527062700224740ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_4 1 static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { #if !defined(CONJ) FLOAT mvec[2] = { -1.0, 1.0 }; #else FLOAT mvec[2] = { 1.0, -1.0 }; #endif BLASLONG register i = 0; if ( n < 384 ) { __asm__ __volatile__ ( "vzeroupper \n\t" "vmovddup (%4), %%xmm0 \n\t" // real part of alpha "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha #if !defined(CONJ) "vmulpd (%5), %%xmm1 , %%xmm1 \n\t" #else "vmulpd (%5), %%xmm0 , %%xmm0 \n\t" #endif ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex values from x ".align 2 \n\t" "vmovups 16(%2,%0,8), %%xmm7 \n\t" // 1 complex values from x "vmovups 32(%2,%0,8), %%xmm9 \n\t" // 1 complex values from x "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 complex values from x "vmovups 64(%2,%0,8), %%xmm12 \n\t" // 1 complex values from x "vmovups 80(%2,%0,8), %%xmm13 \n\t" // 1 complex values from x "vmovups 96(%2,%0,8), %%xmm14 \n\t" // 1 complex values from x "vmovups 112(%2,%0,8), %%xmm15 \n\t" // 1 complex values from x "vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part "vpermilpd $0x1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part "vpermilpd $0x1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part "vpermilpd $0x1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part "vfmaddpd (%3,%0,8), %%xmm0 , %%xmm5, %%xmm5 \n\t" ".align 2 \n\t" "vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm7, %%xmm7 \n\t" "vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm9, %%xmm9 \n\t" "vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm11,%%xmm11 \n\t" "vfmaddpd %%xmm5 , %%xmm1 , %%xmm4 , %%xmm5 \n\t" "vfmaddpd %%xmm7 , %%xmm1 , %%xmm6 , %%xmm7 \n\t" "vfmaddpd %%xmm9 , %%xmm1 , %%xmm8 , %%xmm9 \n\t" "vfmaddpd %%xmm11, %%xmm1 , %%xmm10, %%xmm11 \n\t" "vpermilpd $0x1 , %%xmm12, %%xmm4 \n\t" // exchange real and imag part "vpermilpd $0x1 , %%xmm13, %%xmm6 \n\t" // exchange real and imag part "vpermilpd $0x1 , %%xmm14, %%xmm8 \n\t" // exchange real and imag part "vpermilpd $0x1 , %%xmm15, %%xmm10 \n\t" // exchange real and imag part "vfmaddpd 64(%3,%0,8), %%xmm0 , %%xmm12, %%xmm12 \n\t" "vfmaddpd 80(%3,%0,8), %%xmm0 , %%xmm13, %%xmm13 \n\t" "vfmaddpd 96(%3,%0,8), %%xmm0 , %%xmm14, %%xmm14 \n\t" "vfmaddpd 112(%3,%0,8), %%xmm0 , %%xmm15, %%xmm15 \n\t" "vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" "vfmaddpd %%xmm13, %%xmm1 , %%xmm6 , %%xmm13 \n\t" "vfmaddpd %%xmm14, %%xmm1 , %%xmm8 , %%xmm14 \n\t" "vfmaddpd %%xmm15, %%xmm1 , %%xmm10, %%xmm15 \n\t" "vmovups %%xmm5 , (%3,%0,8) \n\t" ".align 2 \n\t" "vmovups %%xmm7 , 16(%3,%0,8) \n\t" "vmovups %%xmm9 , 32(%3,%0,8) \n\t" "vmovups %%xmm11, 48(%3,%0,8) \n\t" "vmovups %%xmm12, 64(%3,%0,8) \n\t" "vmovups %%xmm13, 80(%3,%0,8) \n\t" "vmovups %%xmm14, 96(%3,%0,8) \n\t" "vmovups %%xmm15,112(%3,%0,8) \n\t" "addq $16, %0 \n\t" "subq $8 , %1 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 "r" (mvec) // 5 : "cc", "%xmm0", "%xmm1", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); return; } __asm__ __volatile__ ( "vzeroupper \n\t" "vmovddup (%4), %%xmm0 \n\t" // real part of alpha "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha #if !defined(CONJ) "vmulpd (%5), %%xmm1 , %%xmm1 \n\t" #else "vmulpd (%5), %%xmm0 , %%xmm0 \n\t" #endif ".align 16 \n\t" "1: \n\t" "prefetcht0 
512(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex values from x ".align 2 \n\t" "vmovups 16(%2,%0,8), %%xmm7 \n\t" // 1 complex values from x "vmovups 32(%2,%0,8), %%xmm9 \n\t" // 1 complex values from x "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 complex values from x "vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part "vpermilpd $0x1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part "vpermilpd $0x1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part "vpermilpd $0x1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part "prefetcht0 512(%3,%0,8) \n\t" "vfmaddpd (%3,%0,8), %%xmm0 , %%xmm5, %%xmm5 \n\t" ".align 2 \n\t" "vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm7, %%xmm7 \n\t" "vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm9, %%xmm9 \n\t" "vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm11,%%xmm11 \n\t" "vfmaddpd %%xmm5 , %%xmm1 , %%xmm4 , %%xmm5 \n\t" "vfmaddpd %%xmm7 , %%xmm1 , %%xmm6 , %%xmm7 \n\t" "vfmaddpd %%xmm9 , %%xmm1 , %%xmm8 , %%xmm9 \n\t" "vfmaddpd %%xmm11, %%xmm1 , %%xmm10, %%xmm11 \n\t" "vmovups %%xmm5 , (%3,%0,8) \n\t" ".align 2 \n\t" "vmovups %%xmm7 , 16(%3,%0,8) \n\t" "vmovups %%xmm9 , 32(%3,%0,8) \n\t" "vmovups %%xmm11, 48(%3,%0,8) \n\t" "addq $8 , %0 \n\t" "subq $4, %1 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 "r" (mvec) // 5 : "cc", "%xmm0", "%xmm1", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/zaxpy_microk_haswell-2.c000066400000000000000000000124771313527062700221360ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_4 1 static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { #if !defined(CONJ) FLOAT mvec[4] = { -1.0, 1.0, -1.0, 1.0 }; #else FLOAT mvec[4] = { 1.0, -1.0, 1.0, -1.0 }; #endif BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vbroadcastsd (%4), %%ymm0 \n\t" // real part of alpha "vbroadcastsd 8(%4), %%ymm1 \n\t" // imag part of alpha #if !defined(CONJ) "vmulpd (%5), %%ymm1 , %%ymm1 \n\t" #else "vmulpd (%5), %%ymm0 , %%ymm0 \n\t" #endif ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,8), %%ymm5 \n\t" // 2 complex values from x ".align 2 \n\t" "vmovups 32(%2,%0,8), %%ymm7 \n\t" // 2 complex values from x "vmovups 64(%2,%0,8), %%ymm9 \n\t" // 2 complex values from x "vmovups 96(%2,%0,8), %%ymm11 \n\t" // 2 complex values from x "vmovups 128(%2,%0,8), %%ymm12 \n\t" // 2 complex values from x "vmovups 160(%2,%0,8), %%ymm13 \n\t" // 2 complex values from x "vmovups 192(%2,%0,8), %%ymm14 \n\t" // 2 complex values from x "vmovups 224(%2,%0,8), %%ymm15 \n\t" // 2 complex values from x "vpermilpd $0x5 , %%ymm5 , %%ymm4 \n\t" // exchange real and imag part "vpermilpd $0x5 , %%ymm7 , %%ymm6 \n\t" // exchange real and imag part "vpermilpd $0x5 , %%ymm9 , %%ymm8 \n\t" // exchange real and imag part "vpermilpd $0x5 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part "vfmadd213pd (%3,%0,8), %%ymm0 , %%ymm5 \n\t" ".align 2 \n\t" "vfmadd213pd 32(%3,%0,8), %%ymm0 , %%ymm7 \n\t" "vfmadd213pd 64(%3,%0,8), %%ymm0 , %%ymm9 \n\t" "vfmadd213pd 96(%3,%0,8), %%ymm0 , %%ymm11 \n\t" "vfmadd231pd %%ymm1 , %%ymm4 , %%ymm5 \n\t" "vfmadd231pd %%ymm1 , %%ymm6 , %%ymm7 \n\t" "vfmadd231pd %%ymm1 , %%ymm8 , %%ymm9 \n\t" "vfmadd231pd %%ymm1 , %%ymm10, %%ymm11 \n\t" "vpermilpd $0x5 , %%ymm12, %%ymm4 \n\t" // exchange real and imag part "vpermilpd $0x5 , %%ymm13, %%ymm6 \n\t" // exchange real and imag part "vpermilpd $0x5 , %%ymm14, %%ymm8 \n\t" // exchange real and imag part "vpermilpd $0x5 , %%ymm15, %%ymm10 \n\t" // exchange real and imag part "vfmadd213pd 128(%3,%0,8), %%ymm0 , %%ymm12 \n\t" "vfmadd213pd 160(%3,%0,8), %%ymm0 , %%ymm13 \n\t" "vfmadd213pd 192(%3,%0,8), %%ymm0 , %%ymm14 \n\t" "vfmadd213pd 224(%3,%0,8), %%ymm0 , %%ymm15 \n\t" "vfmadd231pd %%ymm1 , %%ymm4 , %%ymm12 \n\t" "vfmadd231pd %%ymm1 , %%ymm6 , %%ymm13 \n\t" "vfmadd231pd %%ymm1 , %%ymm8 , %%ymm14 \n\t" "vfmadd231pd %%ymm1 , %%ymm10, %%ymm15 \n\t" "vmovups %%ymm5 , (%3,%0,8) \n\t" ".align 2 \n\t" "vmovups %%ymm7 , 32(%3,%0,8) \n\t" "vmovups %%ymm9 , 64(%3,%0,8) \n\t" "vmovups %%ymm11, 96(%3,%0,8) \n\t" "vmovups %%ymm12,128(%3,%0,8) \n\t" "vmovups %%ymm13,160(%3,%0,8) \n\t" "vmovups %%ymm14,192(%3,%0,8) \n\t" "vmovups %%ymm15,224(%3,%0,8) \n\t" "addq $32, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 "r" (mvec) // 5 : "cc", "%xmm0", "%xmm1", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/zaxpy_microk_sandy-2.c000066400000000000000000000163301313527062700216050ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_4 1 static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { #if !defined(CONJ) FLOAT mvec[4] = { -1.0, 1.0, -1.0, 1.0 }; #else FLOAT mvec[4] = { 1.0, -1.0, 1.0, -1.0 }; #endif BLASLONG register i = 0; if ( n < 1280 ) { __asm__ __volatile__ ( "vzeroupper \n\t" "vbroadcastsd (%4), %%ymm0 \n\t" // real part of alpha "vbroadcastsd 8(%4), %%ymm1 \n\t" // imag part of alpha #if !defined(CONJ) "vmulpd (%5), %%ymm1 , %%ymm1 \n\t" #else "vmulpd (%5), %%ymm0 , %%ymm0 \n\t" #endif ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,8), %%ymm5 \n\t" // 4 complex values from x ".align 2 \n\t" "vmovups 32(%2,%0,8), %%ymm7 \n\t" // 4 complex values from x "vmovups 64(%2,%0,8), %%ymm9 \n\t" // 4 complex values from x "vmovups 96(%2,%0,8), %%ymm11 \n\t" // 4 complex values from x "vpermilpd $0x5 , %%ymm5 , %%ymm4 \n\t" // exchange real and imag part "vpermilpd $0x5 , %%ymm7 , %%ymm6 \n\t" // exchange real and imag part "vpermilpd $0x5 , %%ymm9 , %%ymm8 \n\t" // exchange real and imag part "vpermilpd $0x5 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part "vmulpd %%ymm5 , %%ymm0 , %%ymm5 \n\t" "vmulpd %%ymm7 , %%ymm0 , %%ymm7 \n\t" "vmulpd %%ymm9 , %%ymm0 , %%ymm9 \n\t" "vmulpd %%ymm11, %%ymm0 , %%ymm11 \n\t" "vaddpd (%3,%0,8), %%ymm5 , %%ymm5 \n\t" "vaddpd 32(%3,%0,8), %%ymm7 , %%ymm7 \n\t" "vaddpd 64(%3,%0,8), %%ymm9 , %%ymm9 \n\t" "vaddpd 96(%3,%0,8), %%ymm11, %%ymm11 \n\t" "vmulpd %%ymm4 , %%ymm1 , %%ymm4 \n\t" "vmulpd %%ymm6 , %%ymm1 , %%ymm6 \n\t" "vmulpd %%ymm8 , %%ymm1 , %%ymm8 \n\t" "vmulpd %%ymm10, %%ymm1 , %%ymm10 \n\t" "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" "vaddpd %%ymm6 , %%ymm7 , %%ymm7 \n\t" "vaddpd %%ymm8 , %%ymm9 , %%ymm9 \n\t" "vaddpd %%ymm10, %%ymm11, %%ymm11 \n\t" "vmovups %%ymm5 , (%3,%0,8) \n\t" ".align 2 \n\t" "vmovups %%ymm7 , 32(%3,%0,8) \n\t" "vmovups %%ymm9 , 64(%3,%0,8) \n\t" "vmovups %%ymm11, 96(%3,%0,8) \n\t" "addq $16, %0 \n\t" "subq $8 , %1 \n\t" "jnz 1b \n\t" 
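/* Sandy Bridge AVX has no FMA, so the complex update is issued as
   separate vmulpd (by the real part of alpha in ymm0 and by the
   sign-adjusted imaginary part in ymm1) and vaddpd instructions.
   Each 256-bit vmovups above loads two double-precision complex
   values (four doubles), so one pass through this loop updates eight
   complex elements of y. */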
"vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 "r" (mvec) // 5 : "cc", "%xmm0", "%xmm1", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "memory" ); return; } __asm__ __volatile__ ( "vzeroupper \n\t" "vbroadcastsd (%4), %%ymm0 \n\t" // real part of alpha "vbroadcastsd 8(%4), %%ymm1 \n\t" // imag part of alpha #if !defined(CONJ) "vmulpd (%5), %%ymm1 , %%ymm1 \n\t" #else "vmulpd (%5), %%ymm0 , %%ymm0 \n\t" #endif ".align 16 \n\t" "1: \n\t" "prefetcht0 512(%2,%0,8) \n\t" "prefetcht0 576(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%ymm5 \n\t" // 4 complex values from x ".align 2 \n\t" "vmovups 32(%2,%0,8), %%ymm7 \n\t" // 4 complex values from x "vmovups 64(%2,%0,8), %%ymm9 \n\t" // 4 complex values from x "vmovups 96(%2,%0,8), %%ymm11 \n\t" // 4 complex values from x "vpermilpd $0x5 , %%ymm5 , %%ymm4 \n\t" // exchange real and imag part "vpermilpd $0x5 , %%ymm7 , %%ymm6 \n\t" // exchange real and imag part "vpermilpd $0x5 , %%ymm9 , %%ymm8 \n\t" // exchange real and imag part "vpermilpd $0x5 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part "vmulpd %%ymm5 , %%ymm0 , %%ymm5 \n\t" "vmulpd %%ymm7 , %%ymm0 , %%ymm7 \n\t" "vmulpd %%ymm9 , %%ymm0 , %%ymm9 \n\t" "vmulpd %%ymm11, %%ymm0 , %%ymm11 \n\t" "prefetcht0 512(%3,%0,8) \n\t" "prefetcht0 576(%3,%0,8) \n\t" "vaddpd (%3,%0,8), %%ymm5 , %%ymm5 \n\t" "vaddpd 32(%3,%0,8), %%ymm7 , %%ymm7 \n\t" "vaddpd 64(%3,%0,8), %%ymm9 , %%ymm9 \n\t" "vaddpd 96(%3,%0,8), %%ymm11, %%ymm11 \n\t" "vmulpd %%ymm4 , %%ymm1 , %%ymm4 \n\t" "vmulpd %%ymm6 , %%ymm1 , %%ymm6 \n\t" "vmulpd %%ymm8 , %%ymm1 , %%ymm8 \n\t" "vmulpd %%ymm10, %%ymm1 , %%ymm10 \n\t" "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" "vaddpd %%ymm6 , %%ymm7 , %%ymm7 \n\t" "vaddpd %%ymm8 , %%ymm9 , %%ymm9 \n\t" "vaddpd %%ymm10, %%ymm11, %%ymm11 \n\t" "vmovups %%ymm5 , (%3,%0,8) \n\t" ".align 2 \n\t" "vmovups %%ymm7 , 32(%3,%0,8) \n\t" "vmovups %%ymm9 , 64(%3,%0,8) \n\t" "vmovups %%ymm11, 96(%3,%0,8) \n\t" "addq $16, %0 \n\t" "subq $8 , %1 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 "r" (mvec) // 5 : "cc", "%xmm0", "%xmm1", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/zaxpy_microk_steamroller-2.c000066400000000000000000000167141313527062700230260ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_4 1 static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { #if !defined(CONJ) FLOAT mvec[2] = { -1.0, 1.0 }; #else FLOAT mvec[2] = { 1.0, -1.0 }; #endif BLASLONG register i = 0; if ( n < 640 ) { __asm__ __volatile__ ( "vzeroupper \n\t" "vmovddup (%4), %%xmm0 \n\t" // real part of alpha "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha #if !defined(CONJ) "vmulpd (%5), %%xmm1 , %%xmm1 \n\t" #else "vmulpd (%5), %%xmm0 , %%xmm0 \n\t" #endif ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,8), %%xmm5 \n\t" // 2 complex values from x ".align 2 \n\t" "vmovups 16(%2,%0,8), %%xmm7 \n\t" // 2 complex values from x "vmovups 32(%2,%0,8), %%xmm9 \n\t" // 2 complex values from x "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 2 complex values from x "vmovups 64(%2,%0,8), %%xmm12 \n\t" // 2 complex values from x "vmovups 80(%2,%0,8), %%xmm13 \n\t" // 2 complex values from x "vmovups 96(%2,%0,8), %%xmm14 \n\t" // 2 complex values from x "vmovups 112(%2,%0,8), %%xmm15 \n\t" // 2 complex values from x "vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part "vpermilpd $0x1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part "vpermilpd $0x1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part "vpermilpd $0x1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part "vfmadd213pd (%3,%0,8), %%xmm0 , %%xmm5 \n\t" ".align 2 \n\t" "vfmadd213pd 16(%3,%0,8), %%xmm0 , %%xmm7 \n\t" "vfmadd213pd 32(%3,%0,8), %%xmm0 , %%xmm9 \n\t" "vfmadd213pd 48(%3,%0,8), %%xmm0 , %%xmm11 \n\t" "vfmadd231pd %%xmm1 , %%xmm4 , %%xmm5 \n\t" "vfmadd231pd %%xmm1 , %%xmm6 , %%xmm7 \n\t" "vfmadd231pd %%xmm1 , %%xmm8 , %%xmm9 \n\t" "vfmadd231pd %%xmm1 , %%xmm10, %%xmm11 \n\t" "vpermilpd $0x1 , %%xmm12, %%xmm4 \n\t" // exchange real and imag part "vpermilpd $0x1 , %%xmm13, %%xmm6 \n\t" // exchange real and imag part "vpermilpd $0x1 , %%xmm14, %%xmm8 \n\t" // exchange real and imag part "vpermilpd $0x1 , %%xmm15, %%xmm10 \n\t" // exchange real and imag part "vfmadd213pd 64(%3,%0,8), %%xmm0 , %%xmm12 \n\t" "vfmadd213pd 80(%3,%0,8), %%xmm0 , %%xmm13 \n\t" "vfmadd213pd 96(%3,%0,8), %%xmm0 , %%xmm14 \n\t" "vfmadd213pd 112(%3,%0,8), %%xmm0 , %%xmm15 \n\t" "vfmadd231pd %%xmm1 , %%xmm4 , %%xmm12 \n\t" "vfmadd231pd %%xmm1 , %%xmm6 , %%xmm13 \n\t" "vfmadd231pd %%xmm1 , %%xmm8 , %%xmm14 \n\t" "vfmadd231pd %%xmm1 , %%xmm10, %%xmm15 \n\t" "vmovups %%xmm5 , (%3,%0,8) \n\t" ".align 2 \n\t" "vmovups %%xmm7 , 16(%3,%0,8) \n\t" "vmovups %%xmm9 , 32(%3,%0,8) \n\t" "vmovups %%xmm11, 48(%3,%0,8) \n\t" "vmovups %%xmm12, 64(%3,%0,8) \n\t" "vmovups %%xmm13, 80(%3,%0,8) \n\t" "vmovups %%xmm14, 96(%3,%0,8) \n\t" "vmovups %%xmm15,112(%3,%0,8) \n\t" "addq $16, %0 \n\t" "subq $8 , %1 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 "r" (mvec) // 5 : "cc", "%xmm0", "%xmm1", 
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); return; } __asm__ __volatile__ ( "vzeroupper \n\t" "vmovddup (%4), %%xmm0 \n\t" // real part of alpha "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha #if !defined(CONJ) "vmulpd (%5), %%xmm1 , %%xmm1 \n\t" #else "vmulpd (%5), %%xmm0 , %%xmm0 \n\t" #endif ".align 16 \n\t" "1: \n\t" "prefetcht0 512(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%xmm5 \n\t" // 2 complex values from x ".align 2 \n\t" "vmovups 16(%2,%0,8), %%xmm7 \n\t" // 2 complex values from x "vmovups 32(%2,%0,8), %%xmm9 \n\t" // 2 complex values from x "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 2 complex values from x "vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part "vpermilpd $0x1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part "vpermilpd $0x1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part "vpermilpd $0x1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part "prefetcht0 512(%3,%0,8) \n\t" "vfmadd213pd (%3,%0,8), %%xmm0 , %%xmm5 \n\t" ".align 2 \n\t" "vfmadd213pd 16(%3,%0,8), %%xmm0 , %%xmm7 \n\t" "vfmadd213pd 32(%3,%0,8), %%xmm0 , %%xmm9 \n\t" "vfmadd213pd 48(%3,%0,8), %%xmm0 , %%xmm11 \n\t" "vfmadd231pd %%xmm1 , %%xmm4 , %%xmm5 \n\t" "vfmadd231pd %%xmm1 , %%xmm6 , %%xmm7 \n\t" "vfmadd231pd %%xmm1 , %%xmm8 , %%xmm9 \n\t" "vfmadd231pd %%xmm1 , %%xmm10, %%xmm11 \n\t" "vmovups %%xmm5 , (%3,%0,8) \n\t" ".align 2 \n\t" "vmovups %%xmm7 , 16(%3,%0,8) \n\t" "vmovups %%xmm9 , 32(%3,%0,8) \n\t" "vmovups %%xmm11, 48(%3,%0,8) \n\t" "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 "r" (mvec) // 5 : "cc", "%xmm0", "%xmm1", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/zaxpy_sse.S000066400000000000000000001725311313527062700175440ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 #define X ARG4 #define INCX ARG5 #define Y ARG6 #define INCY ARG2 #else #define M ARG1 #define X ARG2 #define INCX ARG3 #define Y ARG4 #define INCY %r10 #endif #define YY %r11 #define ALPHA_R %xmm14 #define ALPHA_I %xmm15 #include "l1param.h" PROLOGUE PROFCODE #ifndef WINDOWS_ABI movq 8(%rsp), INCY #else movaps %xmm3, %xmm0 movss 40(%rsp), %xmm1 movq 48(%rsp), X movq 56(%rsp), INCX movq 64(%rsp), Y movq 72(%rsp), INCY #endif SAVEREGISTERS salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY testq M, M jle .L999 cmpq $2 * SIZE, INCX jne .L100 cmpq $2 * SIZE, INCY jne .L100 pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 pshufd $0, %xmm0, ALPHA_R pshufd $0, %xmm1, ALPHA_I #ifndef CONJ shufps $0xb1, %xmm7, %xmm7 xorpd %xmm7, ALPHA_I #else xorpd %xmm7, ALPHA_R #endif subq $-32 * SIZE, X subq $-32 * SIZE, Y testq $2 * SIZE, Y je .L10 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm1 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm0 addps %xmm1, %xmm0 movlps %xmm0, -32 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y decq M jle .L999 ALIGN_2 .L10: testq $SIZE, Y jne .L50 testq $3 * SIZE, X jne .L20 movq M, %rax sarq $4, %rax jle .L15 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 decq %rax jle .L12 ALIGN_3 .L11: movaps -16 * SIZE(X), %xmm4 movaps -12 * SIZE(X), %xmm5 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps -32 * SIZE(Y), %xmm0 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(Y) pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps -28 * SIZE(Y), %xmm1 addps %xmm8, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -8 * SIZE(X), %xmm6 movaps -4 * SIZE(X), %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps -24 * SIZE(Y), %xmm2 addps %xmm8, %xmm2 movaps %xmm2, -24 * SIZE(Y) pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps -20 * SIZE(Y), %xmm3 addps %xmm8, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps 0 * SIZE(X), %xmm0 movaps 4 * SIZE(X), %xmm1 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif pshufd $0xb1, %xmm4, %xmm8 mulps ALPHA_R, %xmm4 mulps ALPHA_I, %xmm8 addps -16 * SIZE(Y), %xmm4 addps %xmm8, %xmm4 movaps %xmm4, -16 * SIZE(Y) pshufd $0xb1, %xmm5, %xmm8 mulps ALPHA_R, %xmm5 mulps ALPHA_I, %xmm8 addps -12 * SIZE(Y), %xmm5 addps %xmm8, %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 8 * SIZE(X), %xmm2 movaps 12 * SIZE(X), %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0xb1, %xmm6, %xmm8 mulps ALPHA_R, %xmm6 mulps ALPHA_I, %xmm8 addps -8 * SIZE(Y), %xmm6 addps %xmm8, %xmm6 movaps %xmm6, -8 * SIZE(Y) pshufd $0xb1, %xmm7, %xmm8 mulps ALPHA_R, %xmm7 mulps ALPHA_I, %xmm8 addps -4 * SIZE(Y), %xmm7 addps %xmm8, %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L11 ALIGN_3 .L12: movaps -16 * SIZE(X), %xmm4 movaps -12 * SIZE(X), %xmm5 pshufd $0xb1, %xmm0, 
%xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps -32 * SIZE(Y), %xmm0 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(Y) pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps -28 * SIZE(Y), %xmm1 addps %xmm8, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -8 * SIZE(X), %xmm6 movaps -4 * SIZE(X), %xmm7 pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps -24 * SIZE(Y), %xmm2 addps %xmm8, %xmm2 movaps %xmm2, -24 * SIZE(Y) pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps -20 * SIZE(Y), %xmm3 addps %xmm8, %xmm3 movaps %xmm3, -20 * SIZE(Y) pshufd $0xb1, %xmm4, %xmm8 mulps ALPHA_R, %xmm4 mulps ALPHA_I, %xmm8 addps -16 * SIZE(Y), %xmm4 addps %xmm8, %xmm4 movaps %xmm4, -16 * SIZE(Y) pshufd $0xb1, %xmm5, %xmm8 mulps ALPHA_R, %xmm5 mulps ALPHA_I, %xmm8 addps -12 * SIZE(Y), %xmm5 addps %xmm8, %xmm5 movaps %xmm5, -12 * SIZE(Y) pshufd $0xb1, %xmm6, %xmm8 mulps ALPHA_R, %xmm6 mulps ALPHA_I, %xmm8 addps -8 * SIZE(Y), %xmm6 addps %xmm8, %xmm6 movaps %xmm6, -8 * SIZE(Y) pshufd $0xb1, %xmm7, %xmm8 mulps ALPHA_R, %xmm7 mulps ALPHA_I, %xmm8 addps -4 * SIZE(Y), %xmm7 addps %xmm8, %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L15: testq $8, M jle .L16 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps -32 * SIZE(Y), %xmm0 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(Y) pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps -28 * SIZE(Y), %xmm1 addps %xmm8, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps -24 * SIZE(Y), %xmm2 addps %xmm8, %xmm2 movaps %xmm2, -24 * SIZE(Y) pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps -20 * SIZE(Y), %xmm3 addps %xmm8, %xmm3 movaps %xmm3, -20 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_2 .L16: testq $4, M jle .L17 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps -32 * SIZE(Y), %xmm0 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(Y) pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps -28 * SIZE(Y), %xmm1 addps %xmm8, %xmm1 movaps %xmm1, -28 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_2 .L17: testq $2, M jle .L18 movaps -32 * SIZE(X), %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps -32 * SIZE(Y), %xmm0 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_2 .L18: testq $1, M jle .L999 movsd -32 * SIZE(X), %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 movsd -32 * SIZE(Y), %xmm1 addps %xmm1, %xmm0 addps %xmm8, %xmm0 movlps %xmm0, -32 * SIZE(Y) jmp .L999 ALIGN_3 .L20: #ifdef ALIGNED_ACCESS testq $2 * SIZE, X jne .L30 subq $1 * SIZE, X movaps -32 * SIZE(X), %xmm0 movq M, %rax sarq $4, %rax jle .L25 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 movaps -16 * SIZE(X), %xmm4 decq %rax jle .L22 ALIGN_3 .L21: movaps -12 * SIZE(X), %xmm5 movaps -8 * SIZE(X), %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps -32 * SIZE(Y), %xmm0 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps 
ALPHA_I, %xmm8 addps -28 * SIZE(Y), %xmm1 addps %xmm8, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -4 * SIZE(X), %xmm7 movaps 0 * SIZE(X), %xmm0 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps -24 * SIZE(Y), %xmm2 addps %xmm8, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 SHUFPS_39 %xmm3, %xmm3 pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps -20 * SIZE(Y), %xmm3 addps %xmm8, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps 4 * SIZE(X), %xmm1 movaps 8 * SIZE(X), %xmm2 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm5, %xmm4 SHUFPS_39 %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm8 mulps ALPHA_R, %xmm4 mulps ALPHA_I, %xmm8 addps -16 * SIZE(Y), %xmm4 addps %xmm8, %xmm4 movaps %xmm4, -16 * SIZE(Y) movss %xmm6, %xmm5 SHUFPS_39 %xmm5, %xmm5 pshufd $0xb1, %xmm5, %xmm8 mulps ALPHA_R, %xmm5 mulps ALPHA_I, %xmm8 addps -12 * SIZE(Y), %xmm5 addps %xmm8, %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 12 * SIZE(X), %xmm3 movaps 16 * SIZE(X), %xmm4 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm7, %xmm6 SHUFPS_39 %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm8 mulps ALPHA_R, %xmm6 mulps ALPHA_I, %xmm8 addps -8 * SIZE(Y), %xmm6 addps %xmm8, %xmm6 movaps %xmm6, -8 * SIZE(Y) movss %xmm0, %xmm7 SHUFPS_39 %xmm7, %xmm7 pshufd $0xb1, %xmm7, %xmm8 mulps ALPHA_R, %xmm7 mulps ALPHA_I, %xmm8 addps -4 * SIZE(Y), %xmm7 addps %xmm8, %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L22: movaps -12 * SIZE(X), %xmm5 movaps -8 * SIZE(X), %xmm6 movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps -32 * SIZE(Y), %xmm0 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps -28 * SIZE(Y), %xmm1 addps %xmm8, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -4 * SIZE(X), %xmm7 movaps 0 * SIZE(X), %xmm0 movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps -24 * SIZE(Y), %xmm2 addps %xmm8, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 SHUFPS_39 %xmm3, %xmm3 pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps -20 * SIZE(Y), %xmm3 addps %xmm8, %xmm3 movaps %xmm3, -20 * SIZE(Y) movss %xmm5, %xmm4 SHUFPS_39 %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm8 mulps ALPHA_R, %xmm4 mulps ALPHA_I, %xmm8 addps -16 * SIZE(Y), %xmm4 addps %xmm8, %xmm4 movaps %xmm4, -16 * SIZE(Y) movss %xmm6, %xmm5 SHUFPS_39 %xmm5, %xmm5 pshufd $0xb1, %xmm5, %xmm8 mulps ALPHA_R, %xmm5 mulps ALPHA_I, %xmm8 addps -12 * SIZE(Y), %xmm5 addps %xmm8, %xmm5 movaps %xmm5, -12 * SIZE(Y) movss %xmm7, %xmm6 SHUFPS_39 %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm8 mulps ALPHA_R, %xmm6 mulps ALPHA_I, %xmm8 addps -8 * SIZE(Y), %xmm6 addps %xmm8, %xmm6 movaps %xmm6, -8 * SIZE(Y) movss %xmm0, %xmm7 SHUFPS_39 %xmm7, %xmm7 pshufd $0xb1, %xmm7, %xmm8 mulps ALPHA_R, %xmm7 mulps ALPHA_I, %xmm8 addps -4 * SIZE(Y), %xmm7 addps %xmm8, %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L25: testq $8, M jle .L26 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps -32 * SIZE(Y), %xmm0 addps %xmm8, 
%xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps -28 * SIZE(Y), %xmm1 addps %xmm8, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -20 * SIZE(X), %xmm3 movaps -16 * SIZE(X), %xmm0 movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps -24 * SIZE(Y), %xmm2 addps %xmm8, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm0, %xmm3 SHUFPS_39 %xmm3, %xmm3 pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps -20 * SIZE(Y), %xmm3 addps %xmm8, %xmm3 movaps %xmm3, -20 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_2 .L26: testq $4, M jle .L27 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps -32 * SIZE(Y), %xmm0 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps -28 * SIZE(Y), %xmm1 addps %xmm8, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_2 .L27: testq $2, M jle .L28 movaps -28 * SIZE(X), %xmm1 movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps -32 * SIZE(Y), %xmm0 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, %xmm0 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_2 .L28: testq $1, M jle .L999 pshufd $0x06, %xmm0, %xmm8 pshufd $0x09, %xmm0, %xmm0 mulps ALPHA_I, %xmm8 mulps ALPHA_R, %xmm0 addps -32 * SIZE(Y), %xmm8 addps %xmm8, %xmm0 movlps %xmm0, -32 * SIZE(Y) jmp .L999 ALIGN_3 .L30: testq $1 * SIZE, X jne .L40 #endif movq M, %rax sarq $4, %rax jle .L35 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 movsd -28 * SIZE(X), %xmm1 movhps -26 * SIZE(X), %xmm1 movsd -24 * SIZE(X), %xmm2 movhps -22 * SIZE(X), %xmm2 movsd -20 * SIZE(X), %xmm3 movhps -18 * SIZE(X), %xmm3 decq %rax jle .L32 ALIGN_3 .L31: movsd -16 * SIZE(X), %xmm4 movhps -14 * SIZE(X), %xmm4 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps -32 * SIZE(Y), %xmm0 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(Y) movsd -12 * SIZE(X), %xmm5 movhps -10 * SIZE(X), %xmm5 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps -28 * SIZE(Y), %xmm1 addps %xmm8, %xmm1 movaps %xmm1, -28 * SIZE(Y) movsd -8 * SIZE(X), %xmm6 movhps -6 * SIZE(X), %xmm6 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps -24 * SIZE(Y), %xmm2 addps %xmm8, %xmm2 movaps %xmm2, -24 * SIZE(Y) movsd -4 * SIZE(X), %xmm7 movhps -2 * SIZE(X), %xmm7 pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps -20 * SIZE(Y), %xmm3 addps %xmm8, %xmm3 movaps %xmm3, -20 * SIZE(Y) movsd 0 * SIZE(X), %xmm0 movhps 2 * SIZE(X), %xmm0 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif pshufd $0xb1, %xmm4, %xmm8 mulps ALPHA_R, %xmm4 mulps ALPHA_I, %xmm8 addps -16 * SIZE(Y), %xmm4 addps %xmm8, %xmm4 movaps %xmm4, -16 * SIZE(Y) movsd 4 * SIZE(X), %xmm1 movhps 6 * SIZE(X), %xmm1 pshufd $0xb1, %xmm5, %xmm8 mulps ALPHA_R, %xmm5 mulps ALPHA_I, %xmm8 addps -12 * SIZE(Y), %xmm5 addps %xmm8, %xmm5 movaps %xmm5, -12 * SIZE(Y) movsd 8 * SIZE(X), %xmm2 movhps 10 * SIZE(X), %xmm2 #if defined(PREFETCH) && 
!defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0xb1, %xmm6, %xmm8 mulps ALPHA_R, %xmm6 mulps ALPHA_I, %xmm8 addps -8 * SIZE(Y), %xmm6 addps %xmm8, %xmm6 movaps %xmm6, -8 * SIZE(Y) movsd 12 * SIZE(X), %xmm3 movhps 14 * SIZE(X), %xmm3 pshufd $0xb1, %xmm7, %xmm8 mulps ALPHA_R, %xmm7 mulps ALPHA_I, %xmm8 addps -4 * SIZE(Y), %xmm7 addps %xmm8, %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L31 ALIGN_3 .L32: movsd -16 * SIZE(X), %xmm4 movhps -14 * SIZE(X), %xmm4 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps -32 * SIZE(Y), %xmm0 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(Y) movsd -12 * SIZE(X), %xmm5 movhps -10 * SIZE(X), %xmm5 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps -28 * SIZE(Y), %xmm1 addps %xmm8, %xmm1 movaps %xmm1, -28 * SIZE(Y) movsd -8 * SIZE(X), %xmm6 movhps -6 * SIZE(X), %xmm6 pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps -24 * SIZE(Y), %xmm2 addps %xmm8, %xmm2 movaps %xmm2, -24 * SIZE(Y) movsd -4 * SIZE(X), %xmm7 movhps -2 * SIZE(X), %xmm7 pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps -20 * SIZE(Y), %xmm3 addps %xmm8, %xmm3 movaps %xmm3, -20 * SIZE(Y) pshufd $0xb1, %xmm4, %xmm8 mulps ALPHA_R, %xmm4 mulps ALPHA_I, %xmm8 addps -16 * SIZE(Y), %xmm4 addps %xmm8, %xmm4 movaps %xmm4, -16 * SIZE(Y) pshufd $0xb1, %xmm5, %xmm8 mulps ALPHA_R, %xmm5 mulps ALPHA_I, %xmm8 addps -12 * SIZE(Y), %xmm5 addps %xmm8, %xmm5 movaps %xmm5, -12 * SIZE(Y) pshufd $0xb1, %xmm6, %xmm8 mulps ALPHA_R, %xmm6 mulps ALPHA_I, %xmm8 addps -8 * SIZE(Y), %xmm6 addps %xmm8, %xmm6 movaps %xmm6, -8 * SIZE(Y) pshufd $0xb1, %xmm7, %xmm8 mulps ALPHA_R, %xmm7 mulps ALPHA_I, %xmm8 addps -4 * SIZE(Y), %xmm7 addps %xmm8, %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L35: testq $8, M jle .L36 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 movsd -28 * SIZE(X), %xmm1 movhps -26 * SIZE(X), %xmm1 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps -32 * SIZE(Y), %xmm0 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(Y) pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps -28 * SIZE(Y), %xmm1 addps %xmm8, %xmm1 movaps %xmm1, -28 * SIZE(Y) movsd -24 * SIZE(X), %xmm2 movhps -22 * SIZE(X), %xmm2 movsd -20 * SIZE(X), %xmm3 movhps -18 * SIZE(X), %xmm3 pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps -24 * SIZE(Y), %xmm2 addps %xmm8, %xmm2 movaps %xmm2, -24 * SIZE(Y) pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps -20 * SIZE(Y), %xmm3 addps %xmm8, %xmm3 movaps %xmm3, -20 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_2 .L36: testq $4, M jle .L37 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 movsd -28 * SIZE(X), %xmm1 movhps -26 * SIZE(X), %xmm1 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps -32 * SIZE(Y), %xmm0 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(Y) pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps -28 * SIZE(Y), %xmm1 addps %xmm8, %xmm1 movaps %xmm1, -28 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_2 .L37: testq $2, M jle .L38 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps -32 * SIZE(Y), %xmm0 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_2 .L38: testq $1, M jle .L999 movsd -32 * SIZE(X), %xmm0 pshufd $0xb1, 
%xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 movsd -32 * SIZE(Y), %xmm1 addps %xmm1, %xmm0 addps %xmm8, %xmm0 movlps %xmm0, -32 * SIZE(Y) jmp .L999 ALIGN_3 #ifdef ALIGNED_ACCESS .L40: subq $3 * SIZE, X movaps -32 * SIZE(X), %xmm0 movq M, %rax sarq $4, %rax jle .L45 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 movaps -16 * SIZE(X), %xmm4 decq %rax jle .L42 ALIGN_3 .L41: movaps -12 * SIZE(X), %xmm5 movaps -8 * SIZE(X), %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps -32 * SIZE(Y), %xmm0 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps -28 * SIZE(Y), %xmm1 addps %xmm8, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -4 * SIZE(X), %xmm7 movaps 0 * SIZE(X), %xmm0 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps -24 * SIZE(Y), %xmm2 addps %xmm8, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps -20 * SIZE(Y), %xmm3 addps %xmm8, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps 4 * SIZE(X), %xmm1 movaps 8 * SIZE(X), %xmm2 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 pshufd $0xb1, %xmm4, %xmm8 mulps ALPHA_R, %xmm4 mulps ALPHA_I, %xmm8 addps -16 * SIZE(Y), %xmm4 addps %xmm8, %xmm4 movaps %xmm4, -16 * SIZE(Y) movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 pshufd $0xb1, %xmm5, %xmm8 mulps ALPHA_R, %xmm5 mulps ALPHA_I, %xmm8 addps -12 * SIZE(Y), %xmm5 addps %xmm8, %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 12 * SIZE(X), %xmm3 movaps 16 * SIZE(X), %xmm4 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 pshufd $0xb1, %xmm6, %xmm8 mulps ALPHA_R, %xmm6 mulps ALPHA_I, %xmm8 addps -8 * SIZE(Y), %xmm6 addps %xmm8, %xmm6 movaps %xmm6, -8 * SIZE(Y) movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 pshufd $0xb1, %xmm7, %xmm8 mulps ALPHA_R, %xmm7 mulps ALPHA_I, %xmm8 addps -4 * SIZE(Y), %xmm7 addps %xmm8, %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L41 ALIGN_3 .L42: movaps -12 * SIZE(X), %xmm5 movaps -8 * SIZE(X), %xmm6 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps -32 * SIZE(Y), %xmm0 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps -28 * SIZE(Y), %xmm1 addps %xmm8, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -4 * SIZE(X), %xmm7 movaps 0 * SIZE(X), %xmm0 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps -24 * SIZE(Y), %xmm2 addps %xmm8, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps -20 * SIZE(Y), %xmm3 addps %xmm8, %xmm3 movaps %xmm3, -20 * SIZE(Y) movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 pshufd $0xb1, %xmm4, %xmm8 mulps ALPHA_R, %xmm4 mulps ALPHA_I, %xmm8 addps -16 * SIZE(Y), %xmm4 addps 
%xmm8, %xmm4 movaps %xmm4, -16 * SIZE(Y) movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 pshufd $0xb1, %xmm5, %xmm8 mulps ALPHA_R, %xmm5 mulps ALPHA_I, %xmm8 addps -12 * SIZE(Y), %xmm5 addps %xmm8, %xmm5 movaps %xmm5, -12 * SIZE(Y) movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 pshufd $0xb1, %xmm6, %xmm8 mulps ALPHA_R, %xmm6 mulps ALPHA_I, %xmm8 addps -8 * SIZE(Y), %xmm6 addps %xmm8, %xmm6 movaps %xmm6, -8 * SIZE(Y) movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 pshufd $0xb1, %xmm7, %xmm8 mulps ALPHA_R, %xmm7 mulps ALPHA_I, %xmm8 addps -4 * SIZE(Y), %xmm7 addps %xmm8, %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L45: testq $8, M jle .L46 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps -32 * SIZE(Y), %xmm0 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps -28 * SIZE(Y), %xmm1 addps %xmm8, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -20 * SIZE(X), %xmm3 movaps -16 * SIZE(X), %xmm0 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps -24 * SIZE(Y), %xmm2 addps %xmm8, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps -20 * SIZE(Y), %xmm3 addps %xmm8, %xmm3 movaps %xmm3, -20 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_2 .L46: testq $4, M jle .L47 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps -32 * SIZE(Y), %xmm0 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps -28 * SIZE(Y), %xmm1 addps %xmm8, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_2 .L47: testq $2, M jle .L48 movaps -28 * SIZE(X), %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps -32 * SIZE(Y), %xmm0 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, %xmm0 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_2 .L48: testq $1, M jle .L999 movaps -28 * SIZE(X), %xmm1 movsd -32 * SIZE(Y), %xmm2 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm0 addps %xmm2, %xmm0 movlps %xmm0, -32 * SIZE(Y) jmp .L999 ALIGN_3 #endif .L50: xorps %xmm0, %xmm0 subq $1 * SIZE, Y testq $3 * SIZE, X jne .L60 movq M, %rax sarq $4, %rax jle .L55 movaps -32 * SIZE(X), %xmm1 movaps -28 * SIZE(X), %xmm2 movaps -24 * SIZE(X), %xmm3 movaps -20 * SIZE(X), %xmm4 decq %rax jle .L52 ALIGN_3 .L51: movaps -16 * SIZE(X), %xmm5 movaps -12 * SIZE(X), %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -8 * SIZE(X), %xmm7 movaps -4 * SIZE(X), %xmm0 #ifdef PREFETCH PREFETCH 
(PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) pshufd $0xb1, %xmm4, %xmm8 mulps ALPHA_R, %xmm4 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm4 movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps 0 * SIZE(X), %xmm1 movaps 4 * SIZE(X), %xmm2 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif pshufd $0xb1, %xmm5, %xmm8 mulps ALPHA_R, %xmm5 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm5 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 addps -16 * SIZE(Y), %xmm4 movaps %xmm4, -16 * SIZE(Y) pshufd $0xb1, %xmm6, %xmm8 mulps ALPHA_R, %xmm6 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm6 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 addps -12 * SIZE(Y), %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 8 * SIZE(X), %xmm3 movaps 12 * SIZE(X), %xmm4 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0xb1, %xmm7, %xmm8 mulps ALPHA_R, %xmm7 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm7 movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 addps -8 * SIZE(Y), %xmm6 movaps %xmm6, -8 * SIZE(Y) pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm0 movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 addps -4 * SIZE(Y), %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L51 ALIGN_3 .L52: movaps -16 * SIZE(X), %xmm5 movaps -12 * SIZE(X), %xmm6 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -8 * SIZE(X), %xmm7 movaps -4 * SIZE(X), %xmm0 pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) pshufd $0xb1, %xmm4, %xmm8 mulps ALPHA_R, %xmm4 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm4 movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) pshufd $0xb1, %xmm5, %xmm8 mulps ALPHA_R, %xmm5 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm5 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 addps -16 * SIZE(Y), %xmm4 movaps %xmm4, -16 * SIZE(Y) pshufd $0xb1, %xmm6, %xmm8 mulps ALPHA_R, %xmm6 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm6 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 addps -12 * SIZE(Y), %xmm5 movaps %xmm5, -12 * SIZE(Y) pshufd $0xb1, %xmm7, %xmm8 mulps ALPHA_R, %xmm7 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm7 movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 addps -8 * SIZE(Y), %xmm6 movaps %xmm6, -8 * SIZE(Y) pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm0 movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 addps -4 * SIZE(Y), %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L55: testq $8, M jle .L56 movaps -32 * SIZE(X), %xmm1 movaps -28 * SIZE(X), %xmm2 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps 
%xmm8, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -24 * SIZE(X), %xmm3 movaps -20 * SIZE(X), %xmm0 pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_2 .L56: testq $4, M jle .L57 movaps -32 * SIZE(X), %xmm1 movaps -28 * SIZE(X), %xmm2 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_2 .L57: testq $2, M jle .L58 movaps -32 * SIZE(X), %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, %xmm0 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_2 .L58: testq $1, M jle .L59 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(X), %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) jmp .L999 ALIGN_3 .L59: shufps $0x93, %xmm0, %xmm0 addss -32 * SIZE(Y), %xmm0 movss %xmm0, -32 * SIZE(Y) jmp .L999 ALIGN_3 .L60: #ifdef ALIGNED_ACCESS testq $2 * SIZE, X jne .L70 subq $1 * SIZE, X movaps -32 * SIZE(X), %xmm1 movq M, %rax sarq $4, %rax jle .L65 movaps -28 * SIZE(X), %xmm2 movaps -24 * SIZE(X), %xmm3 decq %rax jle .L62 ALIGN_3 .L61: movaps -20 * SIZE(X), %xmm4 movaps -16 * SIZE(X), %xmm5 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -12 * SIZE(X), %xmm6 movaps -8 * SIZE(X), %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm4, %xmm3 SHUFPS_39 %xmm3, %xmm3 pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm5, %xmm4 SHUFPS_39 %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm8 mulps ALPHA_R, %xmm4 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm4 movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -4 * SIZE(X), %xmm0 movaps 0 * SIZE(X), %xmm1 movss %xmm6, %xmm5 SHUFPS_39 %xmm5, %xmm5 pshufd $0xb1, %xmm5, %xmm8 mulps ALPHA_R, %xmm5 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm5 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 
addps -16 * SIZE(Y), %xmm4 movaps %xmm4, -16 * SIZE(Y) movss %xmm7, %xmm6 SHUFPS_39 %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm8 mulps ALPHA_R, %xmm6 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm6 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 addps -12 * SIZE(Y), %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 4 * SIZE(X), %xmm2 movaps 8 * SIZE(X), %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm0, %xmm7 SHUFPS_39 %xmm7, %xmm7 pshufd $0xb1, %xmm7, %xmm8 mulps ALPHA_R, %xmm7 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm7 movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 addps -8 * SIZE(Y), %xmm6 movaps %xmm6, -8 * SIZE(Y) movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm0 movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 addps -4 * SIZE(Y), %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L61 ALIGN_3 .L62: movaps -20 * SIZE(X), %xmm4 movaps -16 * SIZE(X), %xmm5 movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -12 * SIZE(X), %xmm6 movaps -8 * SIZE(X), %xmm7 movss %xmm4, %xmm3 SHUFPS_39 %xmm3, %xmm3 pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm5, %xmm4 SHUFPS_39 %xmm4, %xmm4 pshufd $0xb1, %xmm4, %xmm8 mulps ALPHA_R, %xmm4 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm4 movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movss %xmm6, %xmm5 SHUFPS_39 %xmm5, %xmm5 pshufd $0xb1, %xmm5, %xmm8 mulps ALPHA_R, %xmm5 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm5 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 addps -16 * SIZE(Y), %xmm4 movaps %xmm4, -16 * SIZE(Y) movaps -4 * SIZE(X), %xmm0 movaps 0 * SIZE(X), %xmm1 movss %xmm7, %xmm6 SHUFPS_39 %xmm6, %xmm6 pshufd $0xb1, %xmm6, %xmm8 mulps ALPHA_R, %xmm6 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm6 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 addps -12 * SIZE(Y), %xmm5 movaps %xmm5, -12 * SIZE(Y) movss %xmm0, %xmm7 SHUFPS_39 %xmm7, %xmm7 pshufd $0xb1, %xmm7, %xmm8 mulps ALPHA_R, %xmm7 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm7 movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 addps -8 * SIZE(Y), %xmm6 movaps %xmm6, -8 * SIZE(Y) movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm0 movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 addps -4 * SIZE(Y), %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L65: testq $8, M jle .L66 movaps -28 * SIZE(X), %xmm2 movaps -24 * SIZE(X), %xmm3 movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 
* SIZE(Y) movaps -20 * SIZE(X), %xmm0 movaps -16 * SIZE(X), %xmm1 movss %xmm0, %xmm3 SHUFPS_39 %xmm3, %xmm3 pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm1, %xmm0 SHUFPS_39 %xmm0, %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_2 .L66: testq $4, M jle .L67 movaps -28 * SIZE(X), %xmm2 movaps -24 * SIZE(X), %xmm3 movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm2 SHUFPS_39 %xmm2, %xmm2 pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_2 .L67: testq $2, M jle .L68 movaps -28 * SIZE(X), %xmm2 movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, %xmm0 movaps %xmm2, %xmm1 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_2 .L68: testq $1, M jle .L69 movaps -28 * SIZE(X), %xmm2 movss %xmm2, %xmm1 SHUFPS_39 %xmm1, %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movlps %xmm0, -32 * SIZE(Y) movhlps %xmm0, %xmm0 movss %xmm0, -30 * SIZE(Y) jmp .L999 .L69: shufps $0x93, %xmm0, %xmm0 addss -32 * SIZE(Y), %xmm0 movss %xmm0, -32 * SIZE(Y) jmp .L999 ALIGN_3 .L70: testq $1 * SIZE, X jne .L80 #endif movq M, %rax sarq $4, %rax jle .L75 movsd -32 * SIZE(X), %xmm1 movhps -30 * SIZE(X), %xmm1 movsd -28 * SIZE(X), %xmm2 movhps -26 * SIZE(X), %xmm2 movsd -24 * SIZE(X), %xmm3 movhps -22 * SIZE(X), %xmm3 movsd -20 * SIZE(X), %xmm4 movhps -18 * SIZE(X), %xmm4 decq %rax jle .L72 ALIGN_3 .L71: movsd -16 * SIZE(X), %xmm5 movhps -14 * SIZE(X), %xmm5 movsd -12 * SIZE(X), %xmm6 movhps -10 * SIZE(X), %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movsd -8 * SIZE(X), %xmm7 movhps -6 * SIZE(X), %xmm7 movsd -4 * SIZE(X), %xmm0 movhps -2 * SIZE(X), %xmm0 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) pshufd $0xb1, %xmm4, %xmm8 mulps ALPHA_R, %xmm4 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm4 movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movsd 0 * SIZE(X), %xmm1 movhps 2 * SIZE(X), %xmm1 movsd 4 * SIZE(X), %xmm2 movhps 6 * SIZE(X), 
%xmm2 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif pshufd $0xb1, %xmm5, %xmm8 mulps ALPHA_R, %xmm5 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm5 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 addps -16 * SIZE(Y), %xmm4 movaps %xmm4, -16 * SIZE(Y) pshufd $0xb1, %xmm6, %xmm8 mulps ALPHA_R, %xmm6 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm6 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 addps -12 * SIZE(Y), %xmm5 movaps %xmm5, -12 * SIZE(Y) movsd 8 * SIZE(X), %xmm3 movhps 10 * SIZE(X), %xmm3 movsd 12 * SIZE(X), %xmm4 movhps 14 * SIZE(X), %xmm4 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0xb1, %xmm7, %xmm8 mulps ALPHA_R, %xmm7 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm7 movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 addps -8 * SIZE(Y), %xmm6 movaps %xmm6, -8 * SIZE(Y) pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm0 movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 addps -4 * SIZE(Y), %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L71 ALIGN_3 .L72: movsd -16 * SIZE(X), %xmm5 movhps -14 * SIZE(X), %xmm5 movsd -12 * SIZE(X), %xmm6 movhps -10 * SIZE(X), %xmm6 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movsd -8 * SIZE(X), %xmm7 movhps -6 * SIZE(X), %xmm7 movsd -4 * SIZE(X), %xmm0 movhps -2 * SIZE(X), %xmm0 pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) pshufd $0xb1, %xmm4, %xmm8 mulps ALPHA_R, %xmm4 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm4 movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) pshufd $0xb1, %xmm5, %xmm8 mulps ALPHA_R, %xmm5 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm5 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 addps -16 * SIZE(Y), %xmm4 movaps %xmm4, -16 * SIZE(Y) pshufd $0xb1, %xmm6, %xmm8 mulps ALPHA_R, %xmm6 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm6 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 addps -12 * SIZE(Y), %xmm5 movaps %xmm5, -12 * SIZE(Y) pshufd $0xb1, %xmm7, %xmm8 mulps ALPHA_R, %xmm7 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm7 movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 addps -8 * SIZE(Y), %xmm6 movaps %xmm6, -8 * SIZE(Y) pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm0 movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 addps -4 * SIZE(Y), %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L75: testq $8, M jle .L76 movsd -32 * SIZE(X), %xmm1 movhps -30 * SIZE(X), %xmm1 movsd -28 * SIZE(X), %xmm2 movhps -26 * SIZE(X), %xmm2 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movsd -24 * SIZE(X), %xmm3 movhps -22 * SIZE(X), %xmm3 movsd -20 * SIZE(X), %xmm0 movhps -18 * SIZE(X), %xmm0 pshufd $0xb1, %xmm3, %xmm8 
mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_2 .L76: testq $4, M jle .L77 movsd -32 * SIZE(X), %xmm1 movhps -30 * SIZE(X), %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movsd -28 * SIZE(X), %xmm2 movhps -26 * SIZE(X), %xmm2 pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_2 .L77: testq $2, M jle .L78 movsd -32 * SIZE(X), %xmm1 movhps -30 * SIZE(X), %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, %xmm0 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_2 .L78: testq $1, M jle .L79 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(X), %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) jmp .L999 ALIGN_3 .L79: shufps $0x93, %xmm0, %xmm0 addss -32 * SIZE(Y), %xmm0 movss %xmm0, -32 * SIZE(Y) jmp .L999 ALIGN_3 #ifdef ALIGNED_ACCESS .L80: subq $3 * SIZE, X movaps -32 * SIZE(X), %xmm1 movq M, %rax sarq $4, %rax jle .L85 movaps -28 * SIZE(X), %xmm2 movaps -24 * SIZE(X), %xmm3 decq %rax jle .L82 ALIGN_3 .L81: movaps -20 * SIZE(X), %xmm4 movaps -16 * SIZE(X), %xmm5 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -12 * SIZE(X), %xmm6 movaps -8 * SIZE(X), %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 pshufd $0xb1, %xmm4, %xmm8 mulps ALPHA_R, %xmm4 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm4 movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -4 * SIZE(X), %xmm0 movaps 0 * SIZE(X), %xmm1 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 pshufd $0xb1, %xmm5, %xmm8 mulps ALPHA_R, %xmm5 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm5 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 addps -16 * SIZE(Y), %xmm4 movaps %xmm4, -16 * SIZE(Y) movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 pshufd $0xb1, %xmm6, 
%xmm8 mulps ALPHA_R, %xmm6 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm6 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 addps -12 * SIZE(Y), %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 4 * SIZE(X), %xmm2 movaps 8 * SIZE(X), %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 pshufd $0xb1, %xmm7, %xmm8 mulps ALPHA_R, %xmm7 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm7 movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 addps -8 * SIZE(Y), %xmm6 movaps %xmm6, -8 * SIZE(Y) movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm0 movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 addps -4 * SIZE(Y), %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L81 ALIGN_3 .L82: movaps -20 * SIZE(X), %xmm4 movaps -16 * SIZE(X), %xmm5 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -12 * SIZE(X), %xmm6 movaps -8 * SIZE(X), %xmm7 movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 pshufd $0xb1, %xmm4, %xmm8 mulps ALPHA_R, %xmm4 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm4 movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 pshufd $0xb1, %xmm5, %xmm8 mulps ALPHA_R, %xmm5 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm5 movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 addps -16 * SIZE(Y), %xmm4 movaps %xmm4, -16 * SIZE(Y) movaps -4 * SIZE(X), %xmm0 movaps 0 * SIZE(X), %xmm1 movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 pshufd $0xb1, %xmm6, %xmm8 mulps ALPHA_R, %xmm6 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm6 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 addps -12 * SIZE(Y), %xmm5 movaps %xmm5, -12 * SIZE(Y) movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 pshufd $0xb1, %xmm7, %xmm8 mulps ALPHA_R, %xmm7 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm7 movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 addps -8 * SIZE(Y), %xmm6 movaps %xmm6, -8 * SIZE(Y) movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm0 movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 addps -4 * SIZE(Y), %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L85: testq $8, M jle .L86 movaps -28 * SIZE(X), %xmm2 movaps -24 * SIZE(X), %xmm3 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps -20 * SIZE(X), %xmm0 movaps -16 * SIZE(X), %xmm1 
movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 pshufd $0xb1, %xmm3, %xmm8 mulps ALPHA_R, %xmm3 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm3 movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 addps -24 * SIZE(Y), %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps ALPHA_R, %xmm0 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm0 movss %xmm0, %xmm3 shufps $0x93, %xmm0, %xmm3 addps -20 * SIZE(Y), %xmm3 movaps %xmm3, -20 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_2 .L86: testq $4, M jle .L87 movaps -28 * SIZE(X), %xmm2 movaps -24 * SIZE(X), %xmm3 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 pshufd $0xb1, %xmm2, %xmm8 mulps ALPHA_R, %xmm2 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 addps -28 * SIZE(Y), %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_2 .L87: testq $2, M jle .L88 movaps -28 * SIZE(X), %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, %xmm0 movaps %xmm2, %xmm1 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_2 .L88: testq $1, M jle .L89 movaps -28 * SIZE(X), %xmm2 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps ALPHA_R, %xmm1 mulps ALPHA_I, %xmm8 addps %xmm8, %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 addps -32 * SIZE(Y), %xmm0 movlps %xmm0, -32 * SIZE(Y) movhlps %xmm0, %xmm0 movss %xmm0, -30 * SIZE(Y) jmp .L999 .L89: shufps $0x93, %xmm0, %xmm0 addss -32 * SIZE(Y), %xmm0 movss %xmm0, -32 * SIZE(Y) jmp .L999 ALIGN_3 #endif .L100: #ifndef CONJ pshufd $0, %xmm0, %xmm14 pshufd $0, %xmm1, %xmm15 pxor %xmm13, %xmm13 subps %xmm15, %xmm13 unpcklps %xmm14, %xmm13 unpcklps %xmm15, %xmm14 movaps %xmm13, %xmm15 #else pshufd $0, %xmm0, %xmm14 pshufd $0, %xmm1, %xmm15 pxor %xmm13, %xmm13 subps %xmm14, %xmm13 unpcklps %xmm15, %xmm14 unpcklps %xmm13, %xmm15 #endif //If incx==0 || incy==0, avoid unloop and jump to end. 
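// .L100 below is the general strided path: complex elements are gathered two at a
// time with movsd/movhps, multiplied by the interleaved alpha factors prepared in
// %xmm14/%xmm15 above, added to y, and written back through the shadow pointer YY.
// .L200 is the element-at-a-time fallback taken when INCX or INCY is zero.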
cmpq $0, INCX je .L200 cmpq $0, INCY je .L200 movq Y, YY movq M, %rax sarq $3, %rax jle .L105 ALIGN_3 .L102: movsd (X), %xmm0 addq INCX, X movhps (X), %xmm0 addq INCX, X movsd (X), %xmm2 addq INCX, X movhps (X), %xmm2 addq INCX, X movsd (X), %xmm4 addq INCX, X movhps (X), %xmm4 addq INCX, X movsd (X), %xmm6 addq INCX, X movhps (X), %xmm6 addq INCX, X #ifdef HAVE_SSE3 movshdup %xmm0, %xmm1 movsldup %xmm0, %xmm0 movshdup %xmm2, %xmm3 movsldup %xmm2, %xmm2 movshdup %xmm4, %xmm5 movsldup %xmm4, %xmm4 movshdup %xmm6, %xmm7 movsldup %xmm6, %xmm6 #else pshufd $0xf5, %xmm0, %xmm1 shufps $0xa0, %xmm0, %xmm0 pshufd $0xf5, %xmm2, %xmm3 shufps $0xa0, %xmm2, %xmm2 pshufd $0xf5, %xmm4, %xmm5 shufps $0xa0, %xmm4, %xmm4 pshufd $0xf5, %xmm6, %xmm7 shufps $0xa0, %xmm6, %xmm6 #endif mulps %xmm14, %xmm0 mulps %xmm15, %xmm1 mulps %xmm14, %xmm2 mulps %xmm15, %xmm3 mulps %xmm14, %xmm4 mulps %xmm15, %xmm5 mulps %xmm14, %xmm6 mulps %xmm15, %xmm7 movsd (Y), %xmm8 addq INCY, Y movhps (Y), %xmm8 addq INCY, Y movsd (Y), %xmm9 addq INCY, Y movhps (Y), %xmm9 addq INCY, Y movsd (Y), %xmm10 addq INCY, Y movhps (Y), %xmm10 addq INCY, Y movsd (Y), %xmm11 addq INCY, Y movhps (Y), %xmm11 addq INCY, Y addps %xmm0, %xmm8 addps %xmm1, %xmm8 addps %xmm2, %xmm9 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm10 addps %xmm6, %xmm11 addps %xmm7, %xmm11 movsd %xmm8, (YY) addq INCY, YY movhps %xmm8, (YY) addq INCY, YY movsd %xmm9, (YY) addq INCY, YY movhps %xmm9, (YY) addq INCY, YY movsd %xmm10, (YY) addq INCY, YY movhps %xmm10, (YY) addq INCY, YY movsd %xmm11, (YY) addq INCY, YY movhps %xmm11, (YY) addq INCY, YY decq %rax jg .L102 ALIGN_3 .L105: testq $4, M jle .L106 movsd (X), %xmm0 addq INCX, X movhps (X), %xmm0 addq INCX, X movsd (X), %xmm2 addq INCX, X movhps (X), %xmm2 addq INCX, X #ifdef HAVE_SSE3 movshdup %xmm0, %xmm1 movsldup %xmm0, %xmm0 movshdup %xmm2, %xmm3 movsldup %xmm2, %xmm2 #else pshufd $0xf5, %xmm0, %xmm1 shufps $0xa0, %xmm0, %xmm0 pshufd $0xf5, %xmm2, %xmm3 shufps $0xa0, %xmm2, %xmm2 #endif mulps %xmm14, %xmm0 mulps %xmm15, %xmm1 mulps %xmm14, %xmm2 mulps %xmm15, %xmm3 movsd (Y), %xmm8 addq INCY, Y movhps (Y), %xmm8 addq INCY, Y movsd (Y), %xmm9 addq INCY, Y movhps (Y), %xmm9 addq INCY, Y addps %xmm0, %xmm8 addps %xmm1, %xmm8 addps %xmm2, %xmm9 addps %xmm3, %xmm9 movsd %xmm8, (YY) addq INCY, YY movhps %xmm8, (YY) addq INCY, YY movsd %xmm9, (YY) addq INCY, YY movhps %xmm9, (YY) addq INCY, YY ALIGN_3 .L106: testq $2, M jle .L107 movsd (X), %xmm0 addq INCX, X movhps (X), %xmm0 addq INCX, X #ifdef HAVE_SSE3 movshdup %xmm0, %xmm1 movsldup %xmm0, %xmm0 #else pshufd $0xf5, %xmm0, %xmm1 shufps $0xa0, %xmm0, %xmm0 #endif mulps %xmm14, %xmm0 mulps %xmm15, %xmm1 movsd (Y), %xmm8 addq INCY, Y movhps (Y), %xmm8 addq INCY, Y addps %xmm0, %xmm8 addps %xmm1, %xmm8 movsd %xmm8, (YY) addq INCY, YY movhps %xmm8, (YY) addq INCY, YY ALIGN_3 .L107: testq $1, M jle .L999 movsd (X), %xmm0 #ifdef HAVE_SSE3 movshdup %xmm0, %xmm1 movsldup %xmm0, %xmm0 #else pshufd $0xf5, %xmm0, %xmm1 shufps $0xa0, %xmm0, %xmm0 #endif mulps %xmm14, %xmm0 mulps %xmm15, %xmm1 movsd (Y), %xmm8 addps %xmm0, %xmm8 addps %xmm1, %xmm8 movsd %xmm8, (Y) jmp .L999 ALIGN_3 .L200: movq M, %rax cmpq $0, %rax jle .L999 ALIGN_3 .L201: movsd (X), %xmm0 addq INCX, X #ifdef HAVE_SSE3 movshdup %xmm0, %xmm1 movsldup %xmm0, %xmm0 #else pshufd $0xf5, %xmm0, %xmm1 shufps $0xa0, %xmm0, %xmm0 #endif mulps %xmm14, %xmm0 mulps %xmm15, %xmm1 movsd (Y), %xmm8 addps %xmm0, %xmm8 addps %xmm1, %xmm8 movsd %xmm8, (Y) addq INCY, Y decq %rax jg .L201 ALIGN_3 .L999: xorq %rax, %rax 
RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zaxpy_sse2.S000066400000000000000000001071671313527062700176310ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 #define X ARG4 #define INCX ARG5 #define Y ARG6 #define INCY ARG2 #else #define M ARG1 #define X ARG2 #define INCX ARG3 #define Y ARG4 #define INCY %r10 #endif #define YY %r11 #define ALPHA_R %xmm14 #define ALPHA_I %xmm15 #define USE_PSHUFD #if defined(HAVE_SSE3) && !defined(CORE_OPTERON) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else #define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c #define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c #endif #include "l1param.h" PROLOGUE PROFCODE #ifndef WINDOWS_ABI movq 8(%rsp), INCY #else movaps %xmm3, %xmm0 movsd 40(%rsp), %xmm1 movq 48(%rsp), X movq 56(%rsp), INCX movq 64(%rsp), Y movq 72(%rsp), INCY #endif SAVEREGISTERS salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY testq M, M jle .L999 cmpq $2 * SIZE, INCX jne .L50 cmpq $2 * SIZE, INCY jne .L50 subq $-16 * SIZE, X subq $-16 * SIZE, Y pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifdef HAVE_SSE3 movddup %xmm0, ALPHA_R movddup %xmm1, ALPHA_I #else pshufd $0x44, %xmm0, ALPHA_R pshufd $0x44, %xmm1, ALPHA_I #endif #ifndef CONJ shufps $0x0c, %xmm7, %xmm7 xorpd %xmm7, ALPHA_I #else shufps $0xc0, %xmm7, %xmm7 xorpd %xmm7, ALPHA_R #endif testq $SIZE, Y jne .L30 testq $SIZE, X jne .L20 movq M, %rax sarq $3, %rax jle .L15 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 decq %rax jle .L12 ALIGN_3 .L11: movaps -8 * SIZE(X), %xmm4 movaps -6 * SIZE(X), %xmm5 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) pshufd $0x4e, %xmm0, %xmm8 #else movsd -15 * SIZE(X), %xmm8 movhps -16 * SIZE(X), %xmm8 #endif mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm8 addpd -16 * SIZE(Y), %xmm0 addpd %xmm8, %xmm0 movaps %xmm0, -16 * SIZE(Y) #ifdef USE_PSHUFD pshufd $0x4e, %xmm1, %xmm8 #else movsd -13 * SIZE(X), %xmm8 movhps -14 * SIZE(X), %xmm8 #endif mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd -14 * SIZE(Y), %xmm1 addpd %xmm8, %xmm1 movaps %xmm1, -14 * SIZE(Y) movaps -4 * SIZE(X), %xmm6 movaps -2 * SIZE(X), %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) pshufd $0x4e, %xmm2, %xmm8 #else movsd -11 * SIZE(X), %xmm8 movhps -12 * SIZE(X), %xmm8 #endif mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm8 addpd -12 * SIZE(Y), %xmm2 addpd %xmm8, %xmm2 movaps %xmm2, -12 * SIZE(Y) #ifdef USE_PSHUFD pshufd $0x4e, %xmm3, %xmm8 #else movsd -9 * SIZE(X), %xmm8 movhps -10 * SIZE(X), %xmm8 #endif mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm8 addpd -10 * SIZE(Y), %xmm3 addpd %xmm8, %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps 0 * SIZE(X), %xmm0 movaps 2 * SIZE(X), %xmm1 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) pshufd $0x4e, %xmm4, %xmm8 #else movsd -7 * SIZE(X), %xmm8 movhps -8 * SIZE(X), %xmm8 #endif mulpd ALPHA_R, %xmm4 mulpd ALPHA_I, %xmm8 addpd -8 * SIZE(Y), %xmm4 addpd %xmm8, %xmm4 movaps %xmm4, -8 * SIZE(Y) #ifdef USE_PSHUFD pshufd $0x4e, %xmm5, %xmm8 #else movsd -5 * SIZE(X), %xmm8 movhps -6 * SIZE(X), %xmm8 #endif mulpd ALPHA_R, %xmm5 mulpd ALPHA_I, %xmm8 addpd -6 * SIZE(Y), %xmm5 addpd %xmm8, %xmm5 movaps %xmm5, -6 * SIZE(Y) movaps 4 * SIZE(X), %xmm2 movaps 6 * SIZE(X), %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - 
PREOFFSET(X) #endif #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) pshufd $0x4e, %xmm6, %xmm8 #else movsd -3 * SIZE(X), %xmm8 movhps -4 * SIZE(X), %xmm8 #endif mulpd ALPHA_R, %xmm6 mulpd ALPHA_I, %xmm8 addpd -4 * SIZE(Y), %xmm6 addpd %xmm8, %xmm6 movaps %xmm6, -4 * SIZE(Y) #ifdef USE_PSHUFD pshufd $0x4e, %xmm7, %xmm8 #else movsd -1 * SIZE(X), %xmm8 movhps -2 * SIZE(X), %xmm8 #endif mulpd ALPHA_R, %xmm7 mulpd ALPHA_I, %xmm8 addpd -2 * SIZE(Y), %xmm7 addpd %xmm8, %xmm7 movaps %xmm7, -2 * SIZE(Y) subq $-16 * SIZE, X subq $-16 * SIZE, Y decq %rax jg .L11 ALIGN_3 .L12: movaps -8 * SIZE(X), %xmm4 movaps -6 * SIZE(X), %xmm5 pshufd $0x4e, %xmm0, %xmm8 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm8 addpd -16 * SIZE(Y), %xmm0 addpd %xmm8, %xmm0 movaps %xmm0, -16 * SIZE(Y) pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd -14 * SIZE(Y), %xmm1 addpd %xmm8, %xmm1 movaps %xmm1, -14 * SIZE(Y) movaps -4 * SIZE(X), %xmm6 movaps -2 * SIZE(X), %xmm7 pshufd $0x4e, %xmm2, %xmm8 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm8 addpd -12 * SIZE(Y), %xmm2 addpd %xmm8, %xmm2 movaps %xmm2, -12 * SIZE(Y) pshufd $0x4e, %xmm3, %xmm8 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm8 addpd -10 * SIZE(Y), %xmm3 addpd %xmm8, %xmm3 movaps %xmm3, -10 * SIZE(Y) pshufd $0x4e, %xmm4, %xmm8 mulpd ALPHA_R, %xmm4 mulpd ALPHA_I, %xmm8 addpd -8 * SIZE(Y), %xmm4 addpd %xmm8, %xmm4 movaps %xmm4, -8 * SIZE(Y) pshufd $0x4e, %xmm5, %xmm8 mulpd ALPHA_R, %xmm5 mulpd ALPHA_I, %xmm8 addpd -6 * SIZE(Y), %xmm5 addpd %xmm8, %xmm5 movaps %xmm5, -6 * SIZE(Y) pshufd $0x4e, %xmm6, %xmm8 mulpd ALPHA_R, %xmm6 mulpd ALPHA_I, %xmm8 addpd -4 * SIZE(Y), %xmm6 addpd %xmm8, %xmm6 movaps %xmm6, -4 * SIZE(Y) pshufd $0x4e, %xmm7, %xmm8 mulpd ALPHA_R, %xmm7 mulpd ALPHA_I, %xmm8 addpd -2 * SIZE(Y), %xmm7 addpd %xmm8, %xmm7 movaps %xmm7, -2 * SIZE(Y) subq $-16 * SIZE, X subq $-16 * SIZE, Y ALIGN_3 .L15: movq M, %rax andq $4, %rax jle .L16 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 pshufd $0x4e, %xmm0, %xmm8 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm8 addpd -16 * SIZE(Y), %xmm0 addpd %xmm8, %xmm0 movaps %xmm0, -16 * SIZE(Y) pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd -14 * SIZE(Y), %xmm1 addpd %xmm8, %xmm1 movaps %xmm1, -14 * SIZE(Y) pshufd $0x4e, %xmm2, %xmm8 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm8 addpd -12 * SIZE(Y), %xmm2 addpd %xmm8, %xmm2 movaps %xmm2, -12 * SIZE(Y) pshufd $0x4e, %xmm3, %xmm8 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm8 addpd -10 * SIZE(Y), %xmm3 addpd %xmm8, %xmm3 movaps %xmm3, -10 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L16: movq M, %rax andq $2, %rax jle .L17 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 pshufd $0x4e, %xmm0, %xmm8 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm8 addpd -16 * SIZE(Y), %xmm0 addpd %xmm8, %xmm0 movaps %xmm0, -16 * SIZE(Y) pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd -14 * SIZE(Y), %xmm1 addpd %xmm8, %xmm1 movaps %xmm1, -14 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L17: movq M, %rax andq $1, %rax jle .L999 movaps -16 * SIZE(X), %xmm0 pshufd $0x4e, %xmm0, %xmm8 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm8 addpd -16 * SIZE(Y), %xmm0 addpd %xmm8, %xmm0 movaps %xmm0, -16 * SIZE(Y) jmp .L999 ALIGN_3 .L20: movq M, %rax sarq $3, %rax jle .L25 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 movsd -14 * SIZE(X), %xmm1 movhps -13 * SIZE(X), %xmm1 movsd -12 * SIZE(X), %xmm2 movhps -11 * SIZE(X), %xmm2 movsd -10 * SIZE(X), %xmm3 movhps -9 * SIZE(X), 
%xmm3 decq %rax jle .L22 ALIGN_3 .L21: movsd -8 * SIZE(X), %xmm4 movhps -7 * SIZE(X), %xmm4 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm0, %xmm8 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm8 addpd -16 * SIZE(Y), %xmm0 addpd %xmm8, %xmm0 movaps %xmm0, -16 * SIZE(Y) movsd -6 * SIZE(X), %xmm5 movhps -5 * SIZE(X), %xmm5 pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd -14 * SIZE(Y), %xmm1 addpd %xmm8, %xmm1 movaps %xmm1, -14 * SIZE(Y) movsd -4 * SIZE(X), %xmm6 movhps -3 * SIZE(X), %xmm6 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0x4e, %xmm2, %xmm8 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm8 addpd -12 * SIZE(Y), %xmm2 addpd %xmm8, %xmm2 movaps %xmm2, -12 * SIZE(Y) movsd -2 * SIZE(X), %xmm7 movhps -1 * SIZE(X), %xmm7 pshufd $0x4e, %xmm3, %xmm8 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm8 addpd -10 * SIZE(Y), %xmm3 addpd %xmm8, %xmm3 movaps %xmm3, -10 * SIZE(Y) movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm4, %xmm8 mulpd ALPHA_R, %xmm4 mulpd ALPHA_I, %xmm8 addpd -8 * SIZE(Y), %xmm4 addpd %xmm8, %xmm4 movaps %xmm4, -8 * SIZE(Y) movsd 2 * SIZE(X), %xmm1 movhps 3 * SIZE(X), %xmm1 pshufd $0x4e, %xmm5, %xmm8 mulpd ALPHA_R, %xmm5 mulpd ALPHA_I, %xmm8 addpd -6 * SIZE(Y), %xmm5 addpd %xmm8, %xmm5 movaps %xmm5, -6 * SIZE(Y) movsd 4 * SIZE(X), %xmm2 movhps 5 * SIZE(X), %xmm2 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0x4e, %xmm6, %xmm8 mulpd ALPHA_R, %xmm6 mulpd ALPHA_I, %xmm8 addpd -4 * SIZE(Y), %xmm6 addpd %xmm8, %xmm6 movaps %xmm6, -4 * SIZE(Y) movsd 6 * SIZE(X), %xmm3 movhps 7 * SIZE(X), %xmm3 pshufd $0x4e, %xmm7, %xmm8 mulpd ALPHA_R, %xmm7 mulpd ALPHA_I, %xmm8 addpd -2 * SIZE(Y), %xmm7 addpd %xmm8, %xmm7 movaps %xmm7, -2 * SIZE(Y) subq $-16 * SIZE, X subq $-16 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L22: movsd -8 * SIZE(X), %xmm4 movhps -7 * SIZE(X), %xmm4 pshufd $0x4e, %xmm0, %xmm8 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm8 addpd -16 * SIZE(Y), %xmm0 addpd %xmm8, %xmm0 movaps %xmm0, -16 * SIZE(Y) movsd -6 * SIZE(X), %xmm5 movhps -5 * SIZE(X), %xmm5 pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd -14 * SIZE(Y), %xmm1 addpd %xmm8, %xmm1 movaps %xmm1, -14 * SIZE(Y) movsd -4 * SIZE(X), %xmm6 movhps -3 * SIZE(X), %xmm6 pshufd $0x4e, %xmm2, %xmm8 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm8 addpd -12 * SIZE(Y), %xmm2 addpd %xmm8, %xmm2 movaps %xmm2, -12 * SIZE(Y) movsd -2 * SIZE(X), %xmm7 movhps -1 * SIZE(X), %xmm7 pshufd $0x4e, %xmm3, %xmm8 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm8 addpd -10 * SIZE(Y), %xmm3 addpd %xmm8, %xmm3 movaps %xmm3, -10 * SIZE(Y) pshufd $0x4e, %xmm4, %xmm8 mulpd ALPHA_R, %xmm4 mulpd ALPHA_I, %xmm8 addpd -8 * SIZE(Y), %xmm4 addpd %xmm8, %xmm4 movaps %xmm4, -8 * SIZE(Y) pshufd $0x4e, %xmm5, %xmm8 mulpd ALPHA_R, %xmm5 mulpd ALPHA_I, %xmm8 addpd -6 * SIZE(Y), %xmm5 addpd %xmm8, %xmm5 movaps %xmm5, -6 * SIZE(Y) pshufd $0x4e, %xmm6, %xmm8 mulpd ALPHA_R, %xmm6 mulpd ALPHA_I, %xmm8 addpd -4 * SIZE(Y), %xmm6 addpd %xmm8, %xmm6 movaps %xmm6, -4 * SIZE(Y) pshufd $0x4e, %xmm7, %xmm8 mulpd ALPHA_R, %xmm7 mulpd ALPHA_I, %xmm8 addpd -2 * SIZE(Y), %xmm7 addpd %xmm8, %xmm7 movaps %xmm7, -2 * SIZE(Y) subq $-16 * SIZE, X subq $-16 * SIZE, Y ALIGN_3 .L25: movq M, %rax andq $4, %rax jle .L26 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 movsd -14 * SIZE(X), %xmm1 movhps -13 * SIZE(X), %xmm1 pshufd $0x4e, 
%xmm0, %xmm8 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm8 addpd -16 * SIZE(Y), %xmm0 addpd %xmm8, %xmm0 movaps %xmm0, -16 * SIZE(Y) pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd -14 * SIZE(Y), %xmm1 addpd %xmm8, %xmm1 movaps %xmm1, -14 * SIZE(Y) movsd -12 * SIZE(X), %xmm2 movhps -11 * SIZE(X), %xmm2 movsd -10 * SIZE(X), %xmm3 movhps -9 * SIZE(X), %xmm3 pshufd $0x4e, %xmm2, %xmm8 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm8 addpd -12 * SIZE(Y), %xmm2 addpd %xmm8, %xmm2 movaps %xmm2, -12 * SIZE(Y) pshufd $0x4e, %xmm3, %xmm8 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm8 addpd -10 * SIZE(Y), %xmm3 addpd %xmm8, %xmm3 movaps %xmm3, -10 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L26: movq M, %rax andq $2, %rax jle .L27 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 pshufd $0x4e, %xmm0, %xmm8 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm8 addpd -16 * SIZE(Y), %xmm0 addpd %xmm8, %xmm0 movaps %xmm0, -16 * SIZE(Y) movsd -14 * SIZE(X), %xmm1 movhps -13 * SIZE(X), %xmm1 pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd -14 * SIZE(Y), %xmm1 addpd %xmm8, %xmm1 movaps %xmm1, -14 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L27: movq M, %rax andq $1, %rax jle .L999 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 pshufd $0x4e, %xmm0, %xmm8 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm8 addpd -16 * SIZE(Y), %xmm0 addpd %xmm8, %xmm0 movaps %xmm0, -16 * SIZE(Y) jmp .L999 ALIGN_3 .L30: testq $SIZE, X jne .L40 movaps -16 * SIZE(X), %xmm1 pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm1 xorps %xmm0, %xmm0 SHUFPD_1 %xmm1, %xmm0 xorps %xmm4, %xmm4 movhps -16 * SIZE(Y), %xmm4 addpd %xmm0, %xmm4 movhps %xmm4, -16 * SIZE(Y) movaps %xmm1, %xmm0 addq $2 * SIZE, X addq $1 * SIZE, Y decq M jle .L39 movq M, %rax sarq $3, %rax jle .L35 movaps -16 * SIZE(X), %xmm1 movaps -14 * SIZE(X), %xmm2 movaps -12 * SIZE(X), %xmm3 decq %rax jle .L32 ALIGN_3 .L31: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -10 * SIZE(X), %xmm0 pshufd $0x4e, %xmm2, %xmm8 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movaps -8 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0x4e, %xmm3, %xmm8 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm3 SHUFPD_1 %xmm3, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) movaps -6 * SIZE(X), %xmm2 pshufd $0x4e, %xmm0, %xmm8 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm0 SHUFPD_1 %xmm0, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps -4 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -8 * SIZE(Y), %xmm0 movaps %xmm0, -8 * SIZE(Y) movaps -2 * SIZE(X), %xmm0 pshufd $0x4e, %xmm2, %xmm8 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -6 * SIZE(Y), %xmm1 movaps %xmm1, -6 * SIZE(Y) movaps 0 * SIZE(X), %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0x4e, %xmm3, %xmm8 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm3 SHUFPD_1 %xmm3, %xmm2 addpd -4 * 
SIZE(Y), %xmm2 movaps %xmm2, -4 * SIZE(Y) movaps 2 * SIZE(X), %xmm2 pshufd $0x4e, %xmm0, %xmm8 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm0 SHUFPD_1 %xmm0, %xmm3 addpd -2 * SIZE(Y), %xmm3 movaps %xmm3, -2 * SIZE(Y) movaps 4 * SIZE(X), %xmm3 subq $-16 * SIZE, X subq $-16 * SIZE, Y decq %rax jg .L31 ALIGN_3 .L32: pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps -10 * SIZE(X), %xmm0 pshufd $0x4e, %xmm2, %xmm8 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movaps -8 * SIZE(X), %xmm1 pshufd $0x4e, %xmm3, %xmm8 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm3 SHUFPD_1 %xmm3, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) movaps -6 * SIZE(X), %xmm2 pshufd $0x4e, %xmm0, %xmm8 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm0 SHUFPD_1 %xmm0, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps -4 * SIZE(X), %xmm3 pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -8 * SIZE(Y), %xmm0 movaps %xmm0, -8 * SIZE(Y) movaps -2 * SIZE(X), %xmm0 pshufd $0x4e, %xmm2, %xmm8 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -6 * SIZE(Y), %xmm1 movaps %xmm1, -6 * SIZE(Y) pshufd $0x4e, %xmm3, %xmm8 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm3 SHUFPD_1 %xmm3, %xmm2 addpd -4 * SIZE(Y), %xmm2 movaps %xmm2, -4 * SIZE(Y) pshufd $0x4e, %xmm0, %xmm8 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm0 SHUFPD_1 %xmm0, %xmm3 addpd -2 * SIZE(Y), %xmm3 movaps %xmm3, -2 * SIZE(Y) subq $-16 * SIZE, X subq $-16 * SIZE, Y ALIGN_3 .L35: movq M, %rax andq $4, %rax jle .L36 movaps -16 * SIZE(X), %xmm1 movaps -14 * SIZE(X), %xmm2 movaps -12 * SIZE(X), %xmm3 movaps -10 * SIZE(X), %xmm4 pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) pshufd $0x4e, %xmm2, %xmm8 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) pshufd $0x4e, %xmm3, %xmm8 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm3 SHUFPD_1 %xmm3, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) pshufd $0x4e, %xmm4, %xmm8 mulpd ALPHA_R, %xmm4 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm4 SHUFPD_1 %xmm4, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps %xmm4, %xmm0 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L36: movq M, %rax andq $2, %rax jle .L37 movaps -16 * SIZE(X), %xmm1 movaps -14 * SIZE(X), %xmm2 pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) pshufd $0x4e, %xmm2, %xmm8 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movaps %xmm2, %xmm0 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L37: movq M, %rax andq $1, %rax jle .L39 movaps -16 * SIZE(X), %xmm1 pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, %xmm0 addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L39: SHUFPD_1 %xmm0, %xmm0 addsd -16 * SIZE(Y), %xmm0 
movlps %xmm0, -16 * SIZE(Y) jmp .L999 ALIGN_3 .L40: movsd -16 * SIZE(X), %xmm1 movhps -15 * SIZE(X), %xmm1 pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm1 xorps %xmm0, %xmm0 SHUFPD_1 %xmm1, %xmm0 xorps %xmm4, %xmm4 movhps -16 * SIZE(Y), %xmm4 addpd %xmm0, %xmm4 movhps %xmm4, -16 * SIZE(Y) movaps %xmm1, %xmm0 addq $2 * SIZE, X addq $1 * SIZE, Y decq M jle .L49 movq M, %rax sarq $3, %rax jle .L45 movsd -16 * SIZE(X), %xmm1 movhps -15 * SIZE(X), %xmm1 movsd -14 * SIZE(X), %xmm2 movhps -13 * SIZE(X), %xmm2 movsd -12 * SIZE(X), %xmm3 movhps -11 * SIZE(X), %xmm3 decq %rax jle .L42 ALIGN_3 .L41: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movsd -10 * SIZE(X), %xmm0 movhps -9 * SIZE(X), %xmm0 pshufd $0x4e, %xmm2, %xmm8 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movsd -8 * SIZE(X), %xmm1 movhps -7 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0x4e, %xmm3, %xmm8 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm3 SHUFPD_1 %xmm3, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) movsd -6 * SIZE(X), %xmm2 movhps -5 * SIZE(X), %xmm2 pshufd $0x4e, %xmm0, %xmm8 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm0 SHUFPD_1 %xmm0, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) movsd -4 * SIZE(X), %xmm3 movhps -3 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -8 * SIZE(Y), %xmm0 movaps %xmm0, -8 * SIZE(Y) movsd -2 * SIZE(X), %xmm0 movhps -1 * SIZE(X), %xmm0 pshufd $0x4e, %xmm2, %xmm8 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -6 * SIZE(Y), %xmm1 movaps %xmm1, -6 * SIZE(Y) movsd 0 * SIZE(X), %xmm1 movhps 1 * SIZE(X), %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0x4e, %xmm3, %xmm8 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm3 SHUFPD_1 %xmm3, %xmm2 addpd -4 * SIZE(Y), %xmm2 movaps %xmm2, -4 * SIZE(Y) movsd 2 * SIZE(X), %xmm2 movhps 3 * SIZE(X), %xmm2 pshufd $0x4e, %xmm0, %xmm8 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm0 SHUFPD_1 %xmm0, %xmm3 addpd -2 * SIZE(Y), %xmm3 movaps %xmm3, -2 * SIZE(Y) movsd 4 * SIZE(X), %xmm3 movhps 5 * SIZE(X), %xmm3 subq $-16 * SIZE, X subq $-16 * SIZE, Y decq %rax jg .L41 ALIGN_3 .L42: pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movsd -10 * SIZE(X), %xmm0 movhps -9 * SIZE(X), %xmm0 pshufd $0x4e, %xmm2, %xmm8 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movsd -8 * SIZE(X), %xmm1 movhps -7 * SIZE(X), %xmm1 pshufd $0x4e, %xmm3, %xmm8 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm3 SHUFPD_1 %xmm3, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) movsd -6 * SIZE(X), %xmm2 movhps -5 * SIZE(X), %xmm2 pshufd $0x4e, %xmm0, %xmm8 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm0 SHUFPD_1 %xmm0, %xmm3 addpd -10 * SIZE(Y), 
%xmm3 movaps %xmm3, -10 * SIZE(Y) movsd -4 * SIZE(X), %xmm3 movhps -3 * SIZE(X), %xmm3 pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -8 * SIZE(Y), %xmm0 movaps %xmm0, -8 * SIZE(Y) movsd -2 * SIZE(X), %xmm0 movhps -1 * SIZE(X), %xmm0 pshufd $0x4e, %xmm2, %xmm8 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -6 * SIZE(Y), %xmm1 movaps %xmm1, -6 * SIZE(Y) pshufd $0x4e, %xmm3, %xmm8 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm3 SHUFPD_1 %xmm3, %xmm2 addpd -4 * SIZE(Y), %xmm2 movaps %xmm2, -4 * SIZE(Y) pshufd $0x4e, %xmm0, %xmm8 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm0 SHUFPD_1 %xmm0, %xmm3 addpd -2 * SIZE(Y), %xmm3 movaps %xmm3, -2 * SIZE(Y) subq $-16 * SIZE, X subq $-16 * SIZE, Y ALIGN_3 .L45: movq M, %rax andq $4, %rax jle .L46 movsd -16 * SIZE(X), %xmm1 movhps -15 * SIZE(X), %xmm1 movsd -14 * SIZE(X), %xmm2 movhps -13 * SIZE(X), %xmm2 movsd -12 * SIZE(X), %xmm3 movhps -11 * SIZE(X), %xmm3 movsd -10 * SIZE(X), %xmm4 movhps -9 * SIZE(X), %xmm4 pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) pshufd $0x4e, %xmm2, %xmm8 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) pshufd $0x4e, %xmm3, %xmm8 mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm3 SHUFPD_1 %xmm3, %xmm2 addpd -12 * SIZE(Y), %xmm2 movaps %xmm2, -12 * SIZE(Y) pshufd $0x4e, %xmm4, %xmm8 mulpd ALPHA_R, %xmm4 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm4 SHUFPD_1 %xmm4, %xmm3 addpd -10 * SIZE(Y), %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps %xmm4, %xmm0 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L46: movq M, %rax andq $2, %rax jle .L47 movsd -16 * SIZE(X), %xmm1 movhps -15 * SIZE(X), %xmm1 movsd -14 * SIZE(X), %xmm2 movhps -13 * SIZE(X), %xmm2 pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) pshufd $0x4e, %xmm2, %xmm8 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm2 SHUFPD_1 %xmm2, %xmm1 addpd -14 * SIZE(Y), %xmm1 movaps %xmm1, -14 * SIZE(Y) movaps %xmm2, %xmm0 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L47: movq M, %rax andq $1, %rax jle .L49 movsd -16 * SIZE(X), %xmm1 movhps -15 * SIZE(X), %xmm1 pshufd $0x4e, %xmm1, %xmm8 mulpd ALPHA_R, %xmm1 mulpd ALPHA_I, %xmm8 addpd %xmm8, %xmm1 SHUFPD_1 %xmm1, %xmm0 addpd -16 * SIZE(Y), %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, %xmm0 addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L49: SHUFPD_1 %xmm0, %xmm0 addsd -16 * SIZE(Y), %xmm0 movlps %xmm0, -16 * SIZE(Y) jmp .L999 ALIGN_3 .L50: #ifndef CONJ movaps %xmm0, %xmm14 # a 0 pxor %xmm15, %xmm15 # 0 0 subsd %xmm1, %xmm15 # -b 0 unpcklpd %xmm14, %xmm15 # -b a unpcklpd %xmm1, %xmm14 # a b #else movaps %xmm0, %xmm14 # a 0 movaps %xmm1, %xmm15 # b 0 pxor %xmm13, %xmm13 # 0 0 subsd %xmm0, %xmm13 # -a 0 unpcklpd %xmm13, %xmm15 # b -a unpcklpd %xmm1, %xmm14 # a b #endif movq Y, YY movq M, %rax //If incx==0 || incy==0, avoid unloop and jump to end. 
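/* .L51 .. .L58 below: general-stride path of this double-precision complex AXPY.
   The block above leaves %xmm14 = { a, b } and %xmm15 = { -b, a } (CONJ: { a, b } and
   { b, -a }, which conjugates X), with a = alpha_r and b = alpha_i in the low/high lanes
   as noted in its comments.  MOVDDUP broadcasts xr and xi of each X element into their
   own registers, so the mulpd/mulpd/addpd/addpd sequence accumulates, per element
   (illustrative sketch only, not generated code):

       yr += a * xr - b * xi;
       yi += b * xr + a * xi;

   Results are written back through the separate store pointer YY; a zero INCX or INCY
   bypasses the unrolled loop and jumps straight to the scalar code at .L58. */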
cmpq $0, INCX je .L58 cmpq $0, INCY je .L58 sarq $3, %rax jle .L55 MOVDDUP( 0 * SIZE, X, %xmm0) MOVDDUP( 1 * SIZE, X, %xmm1) addq INCX, X MOVDDUP( 0 * SIZE, X, %xmm2) MOVDDUP( 1 * SIZE, X, %xmm3) addq INCX, X MOVDDUP( 0 * SIZE, X, %xmm4) MOVDDUP( 1 * SIZE, X, %xmm5) addq INCX, X MOVDDUP( 0 * SIZE, X, %xmm6) MOVDDUP( 1 * SIZE, X, %xmm7) addq INCX, X movsd 0 * SIZE(Y), %xmm8 movhpd 1 * SIZE(Y), %xmm8 addq INCY, Y movsd 0 * SIZE(Y), %xmm9 movhpd 1 * SIZE(Y), %xmm9 addq INCY, Y movsd 0 * SIZE(Y), %xmm10 movhpd 1 * SIZE(Y), %xmm10 addq INCY, Y movsd 0 * SIZE(Y), %xmm11 movhpd 1 * SIZE(Y), %xmm11 addq INCY, Y mulpd %xmm14, %xmm0 mulpd %xmm14, %xmm2 mulpd %xmm14, %xmm4 mulpd %xmm14, %xmm6 decq %rax jle .L52 ALIGN_3 .L51: addpd %xmm0, %xmm8 mulpd %xmm15, %xmm1 addpd %xmm2, %xmm9 mulpd %xmm15, %xmm3 addpd %xmm4, %xmm10 mulpd %xmm15, %xmm5 addpd %xmm6, %xmm11 mulpd %xmm15, %xmm7 addpd %xmm1, %xmm8 addpd %xmm3, %xmm9 addpd %xmm5, %xmm10 addpd %xmm7, %xmm11 MOVDDUP( 0 * SIZE, X, %xmm0) MOVDDUP( 1 * SIZE, X, %xmm1) addq INCX, X MOVDDUP( 0 * SIZE, X, %xmm2) MOVDDUP( 1 * SIZE, X, %xmm3) addq INCX, X MOVDDUP( 0 * SIZE, X, %xmm4) MOVDDUP( 1 * SIZE, X, %xmm5) addq INCX, X MOVDDUP( 0 * SIZE, X, %xmm6) MOVDDUP( 1 * SIZE, X, %xmm7) addq INCX, X mulpd %xmm14, %xmm0 mulpd %xmm14, %xmm2 mulpd %xmm14, %xmm4 mulpd %xmm14, %xmm6 movlpd %xmm8, 0 * SIZE(YY) movhpd %xmm8, 1 * SIZE(YY) addq INCY, YY movlpd %xmm9, 0 * SIZE(YY) movhpd %xmm9, 1 * SIZE(YY) addq INCY, YY movlpd %xmm10, 0 * SIZE(YY) movhpd %xmm10, 1 * SIZE(YY) addq INCY, YY movlpd %xmm11, 0 * SIZE(YY) movhpd %xmm11, 1 * SIZE(YY) addq INCY, YY movsd 0 * SIZE(Y), %xmm8 movhpd 1 * SIZE(Y), %xmm8 addq INCY, Y movsd 0 * SIZE(Y), %xmm9 movhpd 1 * SIZE(Y), %xmm9 addq INCY, Y movsd 0 * SIZE(Y), %xmm10 movhpd 1 * SIZE(Y), %xmm10 addq INCY, Y movsd 0 * SIZE(Y), %xmm11 movhpd 1 * SIZE(Y), %xmm11 addq INCY, Y addpd %xmm0, %xmm8 mulpd %xmm15, %xmm1 addpd %xmm2, %xmm9 mulpd %xmm15, %xmm3 addpd %xmm4, %xmm10 mulpd %xmm15, %xmm5 addpd %xmm6, %xmm11 mulpd %xmm15, %xmm7 addpd %xmm1, %xmm8 addpd %xmm3, %xmm9 addpd %xmm5, %xmm10 addpd %xmm7, %xmm11 MOVDDUP( 0 * SIZE, X, %xmm0) MOVDDUP( 1 * SIZE, X, %xmm1) addq INCX, X MOVDDUP( 0 * SIZE, X, %xmm2) MOVDDUP( 1 * SIZE, X, %xmm3) addq INCX, X MOVDDUP( 0 * SIZE, X, %xmm4) MOVDDUP( 1 * SIZE, X, %xmm5) addq INCX, X MOVDDUP( 0 * SIZE, X, %xmm6) MOVDDUP( 1 * SIZE, X, %xmm7) addq INCX, X mulpd %xmm14, %xmm0 mulpd %xmm14, %xmm2 mulpd %xmm14, %xmm4 mulpd %xmm14, %xmm6 movlpd %xmm8, 0 * SIZE(YY) movhpd %xmm8, 1 * SIZE(YY) addq INCY, YY movlpd %xmm9, 0 * SIZE(YY) movhpd %xmm9, 1 * SIZE(YY) addq INCY, YY movlpd %xmm10, 0 * SIZE(YY) movhpd %xmm10, 1 * SIZE(YY) addq INCY, YY movlpd %xmm11, 0 * SIZE(YY) movhpd %xmm11, 1 * SIZE(YY) addq INCY, YY movsd 0 * SIZE(Y), %xmm8 movhpd 1 * SIZE(Y), %xmm8 addq INCY, Y movsd 0 * SIZE(Y), %xmm9 movhpd 1 * SIZE(Y), %xmm9 addq INCY, Y movsd 0 * SIZE(Y), %xmm10 movhpd 1 * SIZE(Y), %xmm10 addq INCY, Y movsd 0 * SIZE(Y), %xmm11 movhpd 1 * SIZE(Y), %xmm11 addq INCY, Y decq %rax jg .L51 ALIGN_3 .L52: addpd %xmm0, %xmm8 mulpd %xmm15, %xmm1 addpd %xmm2, %xmm9 mulpd %xmm15, %xmm3 addpd %xmm4, %xmm10 mulpd %xmm15, %xmm5 addpd %xmm6, %xmm11 mulpd %xmm15, %xmm7 addpd %xmm1, %xmm8 addpd %xmm3, %xmm9 addpd %xmm5, %xmm10 addpd %xmm7, %xmm11 MOVDDUP( 0 * SIZE, X, %xmm0) MOVDDUP( 1 * SIZE, X, %xmm1) addq INCX, X MOVDDUP( 0 * SIZE, X, %xmm2) MOVDDUP( 1 * SIZE, X, %xmm3) addq INCX, X MOVDDUP( 0 * SIZE, X, %xmm4) MOVDDUP( 1 * SIZE, X, %xmm5) addq INCX, X MOVDDUP( 0 * SIZE, X, %xmm6) MOVDDUP( 1 * SIZE, X, %xmm7) addq INCX, X mulpd 
%xmm14, %xmm0 mulpd %xmm14, %xmm2 mulpd %xmm14, %xmm4 mulpd %xmm14, %xmm6 movlpd %xmm8, 0 * SIZE(YY) movhpd %xmm8, 1 * SIZE(YY) addq INCY, YY movlpd %xmm9, 0 * SIZE(YY) movhpd %xmm9, 1 * SIZE(YY) addq INCY, YY movlpd %xmm10, 0 * SIZE(YY) movhpd %xmm10, 1 * SIZE(YY) addq INCY, YY movlpd %xmm11, 0 * SIZE(YY) movhpd %xmm11, 1 * SIZE(YY) addq INCY, YY movsd 0 * SIZE(Y), %xmm8 movhpd 1 * SIZE(Y), %xmm8 addq INCY, Y movsd 0 * SIZE(Y), %xmm9 movhpd 1 * SIZE(Y), %xmm9 addq INCY, Y movsd 0 * SIZE(Y), %xmm10 movhpd 1 * SIZE(Y), %xmm10 addq INCY, Y movsd 0 * SIZE(Y), %xmm11 movhpd 1 * SIZE(Y), %xmm11 addq INCY, Y addpd %xmm0, %xmm8 mulpd %xmm15, %xmm1 addpd %xmm2, %xmm9 mulpd %xmm15, %xmm3 addpd %xmm4, %xmm10 mulpd %xmm15, %xmm5 addpd %xmm6, %xmm11 mulpd %xmm15, %xmm7 addpd %xmm1, %xmm8 addpd %xmm3, %xmm9 addpd %xmm5, %xmm10 addpd %xmm7, %xmm11 movlpd %xmm8, 0 * SIZE(YY) movhpd %xmm8, 1 * SIZE(YY) addq INCY, YY movlpd %xmm9, 0 * SIZE(YY) movhpd %xmm9, 1 * SIZE(YY) addq INCY, YY movlpd %xmm10, 0 * SIZE(YY) movhpd %xmm10, 1 * SIZE(YY) addq INCY, YY movlpd %xmm11, 0 * SIZE(YY) movhpd %xmm11, 1 * SIZE(YY) addq INCY, YY ALIGN_3 .L55: movq M, %rax andq $4, %rax jle .L56 MOVDDUP( 0 * SIZE, X, %xmm0) MOVDDUP( 1 * SIZE, X, %xmm1) addq INCX, X MOVDDUP( 0 * SIZE, X, %xmm2) MOVDDUP( 1 * SIZE, X, %xmm3) addq INCX, X MOVDDUP( 0 * SIZE, X, %xmm4) MOVDDUP( 1 * SIZE, X, %xmm5) addq INCX, X MOVDDUP( 0 * SIZE, X, %xmm6) MOVDDUP( 1 * SIZE, X, %xmm7) addq INCX, X movsd 0 * SIZE(Y), %xmm8 movhpd 1 * SIZE(Y), %xmm8 addq INCY, Y movsd 0 * SIZE(Y), %xmm9 movhpd 1 * SIZE(Y), %xmm9 addq INCY, Y movsd 0 * SIZE(Y), %xmm10 movhpd 1 * SIZE(Y), %xmm10 addq INCY, Y movsd 0 * SIZE(Y), %xmm11 movhpd 1 * SIZE(Y), %xmm11 addq INCY, Y mulpd %xmm14, %xmm0 mulpd %xmm14, %xmm2 mulpd %xmm14, %xmm4 mulpd %xmm14, %xmm6 addpd %xmm0, %xmm8 mulpd %xmm15, %xmm1 addpd %xmm2, %xmm9 mulpd %xmm15, %xmm3 addpd %xmm4, %xmm10 mulpd %xmm15, %xmm5 addpd %xmm6, %xmm11 mulpd %xmm15, %xmm7 addpd %xmm1, %xmm8 addpd %xmm3, %xmm9 addpd %xmm5, %xmm10 addpd %xmm7, %xmm11 movlpd %xmm8, 0 * SIZE(YY) movhpd %xmm8, 1 * SIZE(YY) addq INCY, YY movlpd %xmm9, 0 * SIZE(YY) movhpd %xmm9, 1 * SIZE(YY) addq INCY, YY movlpd %xmm10, 0 * SIZE(YY) movhpd %xmm10, 1 * SIZE(YY) addq INCY, YY movlpd %xmm11, 0 * SIZE(YY) movhpd %xmm11, 1 * SIZE(YY) addq INCY, YY ALIGN_3 .L56: movq M, %rax andq $2, %rax jle .L57 MOVDDUP( 0 * SIZE, X, %xmm0) MOVDDUP( 1 * SIZE, X, %xmm1) addq INCX, X MOVDDUP( 0 * SIZE, X, %xmm2) MOVDDUP( 1 * SIZE, X, %xmm3) addq INCX, X movsd 0 * SIZE(Y), %xmm8 movhpd 1 * SIZE(Y), %xmm8 addq INCY, Y movsd 0 * SIZE(Y), %xmm9 movhpd 1 * SIZE(Y), %xmm9 addq INCY, Y mulpd %xmm14, %xmm0 mulpd %xmm14, %xmm2 mulpd %xmm15, %xmm1 mulpd %xmm15, %xmm3 addpd %xmm0, %xmm8 addpd %xmm2, %xmm9 addpd %xmm1, %xmm8 addpd %xmm3, %xmm9 movlpd %xmm8, 0 * SIZE(YY) movhpd %xmm8, 1 * SIZE(YY) addq INCY, YY movlpd %xmm9, 0 * SIZE(YY) movhpd %xmm9, 1 * SIZE(YY) addq INCY, YY ALIGN_3 .L57: movq M, %rax andq $1, %rax jle .L999 .L58: MOVDDUP( 0 * SIZE, X, %xmm0) MOVDDUP( 1 * SIZE, X, %xmm1) movsd 0 * SIZE(Y), %xmm8 movhpd 1 * SIZE(Y), %xmm8 mulpd %xmm14, %xmm0 mulpd %xmm15, %xmm1 addpd %xmm0, %xmm8 addpd %xmm1, %xmm8 movlpd %xmm8, 0 * SIZE(YY) movhpd %xmm8, 1 * SIZE(YY) decq %rax jg .L58 ALIGN_3 .L999: xorq %rax, %rax RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zcopy.S000066400000000000000000000174551313527062700166660ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. 
*/ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define Y ARG4 /* rcx */ #ifndef WINDOWS_ABI #define INCY ARG5 /* r8 */ #define FLAG ARG6 #else #define INCY %r10 #define FLAG %r11 #endif #include "l1param.h" PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), INCY #endif EMMS salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY testq N, N # if m == 0 goto End jle .L999 cmpq $2 * SIZE, INCX # if incx != 1 jne .L100 cmpq $2 * SIZE, INCY # if incy != 1 jne .L100 movq N, %rax # i = m sarq $2, %rax jle .L20 ALIGN_2 .L11: #ifdef XDOUBLE #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movq 0(X), %mm0 movq %mm0, 0(Y) movq 8(X), %mm1 movq %mm1, 8(Y) movq 16(X), %mm2 movq %mm2, 16(Y) movq 24(X), %mm3 movq %mm3, 24(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movq 32(X), %mm4 movq %mm4, 32(Y) movq 40(X), %mm5 movq %mm5, 40(Y) movq 48(X), %mm6 movq %mm6, 48(Y) movq 56(X), %mm7 movq %mm7, 56(Y) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movq 64(X), %mm0 movq %mm0, 64(Y) movq 72(X), %mm1 movq %mm1, 72(Y) movq 80(X), %mm2 movq %mm2, 80(Y) movq 88(X), %mm3 movq %mm3, 88(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movq 96(X), %mm4 movq %mm4, 96(Y) movq 104(X), %mm5 movq %mm5, 104(Y) movq 112(X), %mm6 movq %mm6, 112(Y) movq 120(X), %mm7 movq %mm7, 120(Y) #elif defined(DOUBLE) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movq 0 * SIZE(X), %mm0 movq 1 * SIZE(X), %mm1 movq %mm0, 0 * SIZE(Y) movq %mm1, 1 * SIZE(Y) movq 2 * SIZE(X), %mm2 movq 3 * SIZE(X), %mm3 movq %mm2, 2 * SIZE(Y) movq %mm3, 3 * SIZE(Y) movq 4 * SIZE(X), %mm4 movq 5 * SIZE(X), %mm5 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - 
PREOFFSET(Y) #endif movq %mm4, 4 * SIZE(Y) movq %mm5, 5 * SIZE(Y) movq 6 * SIZE(X), %mm6 movq 7 * SIZE(X), %mm7 movq %mm6, 6 * SIZE(Y) movq %mm7, 7 * SIZE(Y) #else movq 0 * SIZE(X), %mm0 movq 2 * SIZE(X), %mm2 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movq %mm0, 0 * SIZE(Y) movq %mm2, 2 * SIZE(Y) movq 4 * SIZE(X), %mm4 movq 6 * SIZE(X), %mm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movq %mm4, 4 * SIZE(Y) movq %mm6, 6 * SIZE(Y) #endif addq $8 * SIZE, X addq $8 * SIZE, Y decq %rax jg .L11 ALIGN_2 .L20: movq N, %rax # i = m andq $3, %rax jle .L99 ALIGN_2 .L21: #ifdef XDOUBLE movq 0(X), %mm0 movq %mm0, 0(Y) movq 8(X), %mm1 movq %mm1, 8(Y) movq 16(X), %mm2 movq %mm2, 16(Y) movq 24(X), %mm3 movq %mm3, 24(Y) #elif defined(DOUBLE) movq 0 * SIZE(X), %mm0 movq %mm0, 0 * SIZE(Y) movq 1 * SIZE(X), %mm1 movq %mm1, 1 * SIZE(Y) #else movq 0 * SIZE(X), %mm0 movq %mm0, 0 * SIZE(Y) #endif addq $2 * SIZE, X addq $2 * SIZE, Y decq %rax jg .L21 .L99: xorq %rax,%rax EMMS ret ALIGN_3 .L100: movq N, %rax sarq $2, %rax jle .L120 ALIGN_2 .L111: #ifdef XDOUBLE movq 0(X), %mm0 movq %mm0, 0(Y) movq 8(X), %mm1 movq %mm1, 8(Y) movq 16(X), %mm2 movq %mm2, 16(Y) movq 24(X), %mm3 movq %mm3, 24(Y) addq INCX, X addq INCY, Y movq 0(X), %mm0 movq %mm0, 0(Y) movq 8(X), %mm1 movq %mm1, 8(Y) movq 16(X), %mm2 movq %mm2, 16(Y) movq 24(X), %mm3 movq %mm3, 24(Y) addq INCX, X addq INCY, Y movq 0(X), %mm0 movq %mm0, 0(Y) movq 8(X), %mm1 movq %mm1, 8(Y) movq 16(X), %mm2 movq %mm2, 16(Y) movq 24(X), %mm3 movq %mm3, 24(Y) addq INCX, X addq INCY, Y movq 0(X), %mm0 movq %mm0, 0(Y) movq 8(X), %mm1 movq %mm1, 8(Y) movq 16(X), %mm2 movq %mm2, 16(Y) movq 24(X), %mm3 movq %mm3, 24(Y) addq INCX, X addq INCY, Y #elif defined(DOUBLE) movq 0 * SIZE(X), %mm0 movq %mm0, 0 * SIZE(Y) movq 1 * SIZE(X), %mm1 movq %mm1, 1 * SIZE(Y) addq INCX, X addq INCY, Y movq 0 * SIZE(X), %mm2 movq %mm2, 0 * SIZE(Y) movq 1 * SIZE(X), %mm3 movq %mm3, 1 * SIZE(Y) addq INCX, X addq INCY, Y movq 0 * SIZE(X), %mm4 movq %mm4, 0 * SIZE(Y) movq 1 * SIZE(X), %mm5 movq %mm5, 1 * SIZE(Y) addq INCX, X addq INCY, Y movq 0 * SIZE(X), %mm6 movq %mm6, 0 * SIZE(Y) movq 1 * SIZE(X), %mm7 movq %mm7, 1 * SIZE(Y) addq INCX, X addq INCY, Y #else movq 0 * SIZE(X), %mm0 movq %mm0, 0 * SIZE(Y) addq INCX, X addq INCY, Y movq 0 * SIZE(X), %mm2 movq %mm2, 0 * SIZE(Y) addq INCX, X addq INCY, Y movq 0 * SIZE(X), %mm4 movq %mm4, 0 * SIZE(Y) addq INCX, X addq INCY, Y movq 0 * SIZE(X), %mm6 movq %mm6, 0 * SIZE(Y) addq INCX, X addq INCY, Y #endif decq %rax jg .L111 .L120: movq N, %rax andq $3, %rax jle .L999 ALIGN_2 .L121: #ifdef XDOUBLE movq 0(X), %mm0 movq %mm0, 0(Y) movq 8(X), %mm1 movq %mm1, 8(Y) movq 16(X), %mm2 movq %mm2, 16(Y) movq 24(X), %mm3 movq %mm3, 24(Y) addq INCX, X addq INCY, Y #elif defined(DOUBLE) movq 0 * SIZE(X), %mm0 movq %mm0, 0 * SIZE(Y) movq 1 * SIZE(X), %mm1 movq %mm1, 1 * SIZE(Y) addq INCX, X addq INCY, Y #else movq 0 * SIZE(X), %mm0 movq %mm0, 0 * SIZE(Y) addq INCX, X addq INCY, Y #endif decq %rax jg .L121 .L999: xorq %rax,%rax EMMS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zcopy_sse.S000066400000000000000000000431241313527062700175300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define Y ARG4 /* rcx */ #ifndef WINDOWS_ABI #define INCY ARG5 /* r8 */ #else #define INCY %r10 #endif #include "l1param.h" #ifdef OPTERON #define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addps OFFSET(ADDR), REG #else #define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG #endif PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), INCY #endif SAVEREGISTERS salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY cmpq $2 * SIZE, INCX jne .L100 cmpq $2 * SIZE, INCY jne .L100 cmpq $3, M jle .L106 subq $-32 * SIZE, X subq $-32 * SIZE, Y addq M, M testq $SIZE, Y je .L05 movss -32 * SIZE(X), %xmm0 movss %xmm0, -32 * SIZE(Y) addq $1 * SIZE, X addq $1 * SIZE, Y decq M ALIGN_4 .L05: testq $2 * SIZE, Y je .L10 movsd -32 * SIZE(X), %xmm0 movlps %xmm0, -32 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y subq $2, M jle .L19 ALIGN_4 .L10: testq $3 * SIZE, X jne .L20 movq M, %rax sarq $5, %rax jle .L13 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 movaps -16 * SIZE(X), %xmm4 movaps -12 * SIZE(X), %xmm5 movaps -8 * SIZE(X), %xmm6 movaps -4 * SIZE(X), %xmm7 decq %rax jle .L12 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps %xmm0, -32 * SIZE(Y) LOAD( 0 * SIZE, X, %xmm0) movaps %xmm1, -28 * SIZE(Y) LOAD( 4 * SIZE, X, %xmm1) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm2, -24 * SIZE(Y) LOAD( 8 * SIZE, X, %xmm2) movaps %xmm3, -20 * SIZE(Y) LOAD(12 * SIZE, X, %xmm3) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps %xmm4,-16 * SIZE(Y) LOAD(16 * SIZE, X, %xmm4) movaps %xmm5,-12 * SIZE(Y) LOAD(20 * SIZE, X, %xmm5) #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm6, -8 * SIZE(Y) LOAD(24 * SIZE, X, %xmm6) movaps %xmm7, -4 * SIZE(Y) 
LOAD(28 * SIZE, X, %xmm7) subq $-32 * SIZE, Y subq $-32 * SIZE, X decq %rax jg .L11 ALIGN_3 .L12: movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, -24 * SIZE(Y) movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, -16 * SIZE(Y) movaps %xmm5, -12 * SIZE(Y) movaps %xmm6, -8 * SIZE(Y) movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, Y subq $-32 * SIZE, X ALIGN_3 .L13: testq $16, M jle .L14 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, -24 * SIZE(Y) movaps %xmm3, -20 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L14: testq $8, M jle .L15 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L15: testq $4, M jle .L16 movaps -32 * SIZE(X), %xmm0 movaps %xmm0, -32 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L16: testq $2, M jle .L17 movsd -32 * SIZE(X), %xmm0 movlps %xmm0, -32 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L17: testq $1, M jle .L19 movss -32 * SIZE(X), %xmm0 movss %xmm0, -32 * SIZE(Y) ALIGN_3 .L19: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 .L20: testq $SIZE, X jne .L30 movhps -32 * SIZE(X), %xmm0 movq M, %rax sarq $5, %rax jle .L23 movaps -30 * SIZE(X), %xmm1 movaps -26 * SIZE(X), %xmm2 movaps -22 * SIZE(X), %xmm3 movaps -18 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -10 * SIZE(X), %xmm6 movaps -6 * SIZE(X), %xmm7 decq %rax jle .L22 ALIGN_4 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif shufps $0x4e, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -2 * SIZE(X), %xmm0 shufps $0x4e, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps 2 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif shufps $0x4e, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps 6 * SIZE(X), %xmm2 shufps $0x4e, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps 10 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif shufps $0x4e, %xmm5, %xmm4 movaps %xmm4, -16 * SIZE(Y) movaps 14 * SIZE(X), %xmm4 shufps $0x4e, %xmm6, %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 18 * SIZE(X), %xmm5 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif shufps $0x4e, %xmm7, %xmm6 movaps %xmm6, -8 * SIZE(Y) movaps 22 * SIZE(X), %xmm6 shufps $0x4e, %xmm0, %xmm7 movaps %xmm7, -4 * SIZE(Y) movaps 26 * SIZE(X), %xmm7 subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L22: shufps $0x4e, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -2 * SIZE(X), %xmm0 shufps $0x4e, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) shufps $0x4e, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) shufps $0x4e, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) shufps $0x4e, %xmm5, %xmm4 movaps %xmm4, -16 * SIZE(Y) shufps $0x4e, %xmm6, %xmm5 movaps %xmm5, -12 * SIZE(Y) shufps $0x4e, %xmm7, %xmm6 movaps %xmm6, -8 * SIZE(Y) shufps $0x4e, %xmm0, %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L23: testq $16, M jle .L24 ALIGN_3 movaps -30 * SIZE(X), %xmm1 movaps -26 * SIZE(X), %xmm2 movaps -22 * SIZE(X), %xmm3 movaps -18 * SIZE(X), %xmm4 shufps $0x4e, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) shufps $0x4e, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) shufps $0x4e, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) shufps $0x4e, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, %xmm0 addq $16 * SIZE, X addq $16 * SIZE, Y 
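/* .L24 .. .L27 below: drain the remaining 8/4/2/1 words (M was doubled on entry, so
   these counts are SIZE-sized scalars, not complex elements).  The 8- and 4-word
   blocks keep the shufps $0x4e realignment against the previously loaded vector;
   the final 2- and 1-word copies fall back to plain movsd/movss. */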
ALIGN_3 .L24: testq $8, M jle .L25 ALIGN_3 movaps -30 * SIZE(X), %xmm1 movaps -26 * SIZE(X), %xmm2 shufps $0x4e, %xmm1, %xmm0 shufps $0x4e, %xmm2, %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L25: testq $4, M jle .L26 ALIGN_3 movaps -30 * SIZE(X), %xmm1 shufps $0x4e, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L26: testq $2, M jle .L27 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd %xmm0, -32 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L27: testq $1, M jle .L29 ALIGN_3 movss -32 * SIZE(X), %xmm0 movss %xmm0, -32 * SIZE(Y) addq $SIZE, Y ALIGN_3 .L29: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 .L30: testq $2 * SIZE, X jne .L40 movaps -33 * SIZE(X), %xmm0 movq M, %rax sarq $5, %rax jle .L33 movaps -29 * SIZE(X), %xmm1 movaps -25 * SIZE(X), %xmm2 movaps -21 * SIZE(X), %xmm3 movaps -17 * SIZE(X), %xmm4 movaps -13 * SIZE(X), %xmm5 movaps -9 * SIZE(X), %xmm6 movaps -5 * SIZE(X), %xmm7 decq %rax jle .L32 ALIGN_4 .L31: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm1, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -1 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps 3 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm3, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps 7 * SIZE(X), %xmm2 movss %xmm4, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps 11 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 movaps %xmm4, -16 * SIZE(Y) movaps 15 * SIZE(X), %xmm4 movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 19 * SIZE(X), %xmm5 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm7, %xmm6 shufps $0x39, %xmm6, %xmm6 movaps %xmm6, -8 * SIZE(Y) movaps 23 * SIZE(X), %xmm6 movss %xmm0, %xmm7 shufps $0x39, %xmm7, %xmm7 movaps %xmm7, -4 * SIZE(Y) movaps 27 * SIZE(X), %xmm7 subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L31 ALIGN_3 .L32: movss %xmm1, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -1 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -28 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -20 * SIZE(Y) movss %xmm5, %xmm4 shufps $0x39, %xmm4, %xmm4 movaps %xmm4, -16 * SIZE(Y) movss %xmm6, %xmm5 shufps $0x39, %xmm5, %xmm5 movaps %xmm5, -12 * SIZE(Y) movss %xmm7, %xmm6 shufps $0x39, %xmm6, %xmm6 movaps %xmm6, -8 * SIZE(Y) movss %xmm0, %xmm7 shufps $0x39, %xmm7, %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L33: testq $16, M jle .L34 ALIGN_3 movaps -29 * SIZE(X), %xmm1 movaps -25 * SIZE(X), %xmm2 movaps -21 * SIZE(X), %xmm3 movaps -17 * SIZE(X), %xmm4 movss %xmm1, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -28 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, %xmm0 addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L34: testq $8, M jle .L35 ALIGN_3 movaps -29 * SIZE(X), %xmm1 movaps -25 * SIZE(X), %xmm2 movss %xmm1, %xmm0 shufps $0x39, 
%xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps %xmm2, %xmm0 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L35: testq $4, M jle .L36 ALIGN_3 movaps -29 * SIZE(X), %xmm1 movss %xmm1, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L36: testq $2, M jle .L37 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd %xmm0, -32 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L37: testq $1, M jle .L39 ALIGN_3 movss -32 * SIZE(X), %xmm0 movss %xmm0, -32 * SIZE(Y) addq $SIZE, Y ALIGN_3 .L39: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 .L40: movaps -35 * SIZE(X), %xmm0 movq M, %rax sarq $5, %rax jle .L43 movaps -31 * SIZE(X), %xmm1 movaps -27 * SIZE(X), %xmm2 movaps -23 * SIZE(X), %xmm3 movaps -19 * SIZE(X), %xmm4 movaps -15 * SIZE(X), %xmm5 movaps -11 * SIZE(X), %xmm6 movaps -7 * SIZE(X), %xmm7 decq %rax jle .L42 ALIGN_4 .L41: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -3 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) movaps 1 * SIZE(X), %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) movaps 5 * SIZE(X), %xmm2 movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps 9 * SIZE(X), %xmm3 #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 movaps %xmm4, -16 * SIZE(Y) movaps 13 * SIZE(X), %xmm4 movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 movaps %xmm5, -12 * SIZE(Y) movaps 17 * SIZE(X), %xmm5 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 movaps %xmm6, -8 * SIZE(Y) movaps 21 * SIZE(X), %xmm6 movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 movaps %xmm7, -4 * SIZE(Y) movaps 25 * SIZE(X), %xmm7 subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L41 ALIGN_3 .L42: movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movaps -3 * SIZE(X), %xmm0 movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) movss %xmm5, %xmm4 shufps $0x93, %xmm5, %xmm4 movaps %xmm4, -16 * SIZE(Y) movss %xmm6, %xmm5 shufps $0x93, %xmm6, %xmm5 movaps %xmm5, -12 * SIZE(Y) movss %xmm7, %xmm6 shufps $0x93, %xmm7, %xmm6 movaps %xmm6, -8 * SIZE(Y) movss %xmm0, %xmm7 shufps $0x93, %xmm0, %xmm7 movaps %xmm7, -4 * SIZE(Y) subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L43: testq $16, M jle .L44 ALIGN_3 movaps -31 * SIZE(X), %xmm1 movaps -27 * SIZE(X), %xmm2 movaps -23 * SIZE(X), %xmm3 movaps -19 * SIZE(X), %xmm4 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 movaps %xmm1, -28 * SIZE(Y) movss %xmm3, %xmm2 shufps $0x93, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(Y) movss %xmm4, %xmm3 shufps $0x93, %xmm4, %xmm3 movaps %xmm3, -20 * SIZE(Y) movaps %xmm4, %xmm0 addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L44: testq $8, M jle .L45 ALIGN_3 movaps -31 * SIZE(X), %xmm1 movaps -27 * SIZE(X), %xmm2 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm2, %xmm1 shufps $0x93, %xmm2, %xmm1 movaps %xmm1, 
-28 * SIZE(Y) movaps %xmm2, %xmm0 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L45: testq $4, M jle .L46 ALIGN_3 movaps -31 * SIZE(X), %xmm1 movss %xmm1, %xmm0 shufps $0x93, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L46: testq $2, M jle .L47 ALIGN_3 movsd -32 * SIZE(X), %xmm0 movsd %xmm0, -32 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L47: testq $1, M jle .L49 ALIGN_3 movss -32 * SIZE(X), %xmm0 movss %xmm0, -32 * SIZE(Y) addq $SIZE, Y ALIGN_3 .L49: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_4 .L100: movq M, %rax sarq $3, %rax jle .L105 ALIGN_3 .L102: movsd (X), %xmm0 addq INCX, X movhps (X), %xmm0 addq INCX, X movsd (X), %xmm1 addq INCX, X movhps (X), %xmm1 addq INCX, X movsd (X), %xmm2 addq INCX, X movhps (X), %xmm2 addq INCX, X movsd (X), %xmm3 addq INCX, X movhps (X), %xmm3 addq INCX, X movsd %xmm0, (Y) addq INCY, Y movhps %xmm0, (Y) addq INCY, Y movsd %xmm1, (Y) addq INCY, Y movhps %xmm1, (Y) addq INCY, Y movsd %xmm2, (Y) addq INCY, Y movhps %xmm2, (Y) addq INCY, Y movsd %xmm3, (Y) addq INCY, Y movhps %xmm3, (Y) addq INCY, Y decq %rax jg .L102 ALIGN_3 .L105: testq $4, M jle .L106 movsd (X), %xmm0 addq INCX, X movhps (X), %xmm0 addq INCX, X movsd (X), %xmm1 addq INCX, X movhps (X), %xmm1 addq INCX, X movsd %xmm0, (Y) addq INCY, Y movhps %xmm0, (Y) addq INCY, Y movsd %xmm1, (Y) addq INCY, Y movhps %xmm1, (Y) addq INCY, Y ALIGN_3 .L106: testq $2, M jle .L107 movsd (X), %xmm0 addq INCX, X movhps (X), %xmm0 addq INCX, X movsd %xmm0, (Y) addq INCY, Y movhps %xmm0, (Y) addq INCY, Y ALIGN_3 .L107: testq $1, M jle .L999 movsd (X), %xmm0 movsd %xmm0, (Y) ALIGN_3 .L999: xorq %rax, %rax RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zcopy_sse2.S000066400000000000000000000306711313527062700176150ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define Y ARG4 /* rcx */ #ifndef WINDOWS_ABI #define INCY ARG5 /* r8 */ #else #define INCY %r10 #endif #include "l1param.h" #ifdef OPTERON #define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG #else #define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG #endif PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), INCY #endif SAVEREGISTERS salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY cmpq $2 * SIZE, INCX jne .L50 cmpq $2 * SIZE, INCY jne .L50 addq M, M #ifdef ALIGNED_ACCESS testq $SIZE, Y #else testq $SIZE, X #endif je .L10 movsd (X), %xmm0 movsd %xmm0, (Y) addq $1 * SIZE, X addq $1 * SIZE, Y decq M jle .L19 ALIGN_4 .L10: subq $-16 * SIZE, X subq $-16 * SIZE, Y #ifdef ALIGNED_ACCESS testq $SIZE, X #else testq $SIZE, Y #endif jne .L20 movq M, %rax sarq $4, %rax jle .L13 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 movaps -8 * SIZE(X), %xmm4 movaps -6 * SIZE(X), %xmm5 movaps -4 * SIZE(X), %xmm6 movaps -2 * SIZE(X), %xmm7 decq %rax jle .L12 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps %xmm0, -16 * SIZE(Y) LOAD( 0 * SIZE, X, %xmm0) movaps %xmm1, -14 * SIZE(Y) LOAD( 2 * SIZE, X, %xmm1) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm2, -12 * SIZE(Y) LOAD( 4 * SIZE, X, %xmm2) movaps %xmm3, -10 * SIZE(Y) LOAD( 6 * SIZE, X, %xmm3) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps %xmm4, -8 * SIZE(Y) LOAD( 8 * SIZE, X, %xmm4) movaps %xmm5, -6 * SIZE(Y) LOAD(10 * SIZE, X, %xmm5) #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm6, -4 * SIZE(Y) LOAD(12 * SIZE, X, %xmm6) movaps %xmm7, -2 * SIZE(Y) LOAD(14 * SIZE, X, %xmm7) subq $-16 * SIZE, Y subq $-16 * SIZE, X decq %rax jg .L11 ALIGN_3 .L12: movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) movaps %xmm2, -12 * SIZE(Y) movaps %xmm3, -10 * SIZE(Y) movaps %xmm4, -8 * SIZE(Y) movaps %xmm5, -6 * SIZE(Y) movaps %xmm6, -4 * SIZE(Y) movaps %xmm7, -2 * SIZE(Y) subq $-16 * SIZE, Y subq $-16 * SIZE, X ALIGN_3 .L13: testq $8, M jle .L14 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) movaps %xmm2, -12 * SIZE(Y) movaps %xmm3, -10 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L14: testq $4, M jle .L15 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L15: testq $2, M jle .L16 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movaps %xmm0, -16 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L16: testq $1, M jle .L19 ALIGN_3 movsd -16 * SIZE(X), %xmm0 movsd %xmm0, -16 * SIZE(Y) ALIGN_3 .L19: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 .L20: #ifdef ALIGNED_ACCESS movhps -16 * SIZE(X), %xmm0 movq M, %rax sarq $4, %rax jle .L23 movaps -15 * SIZE(X), %xmm1 movaps -13 * SIZE(X), %xmm2 movaps -11 * SIZE(X), %xmm3 movaps 
-9 * SIZE(X), %xmm4 movaps -7 * SIZE(X), %xmm5 movaps -5 * SIZE(X), %xmm6 movaps -3 * SIZE(X), %xmm7 decq %rax jle .L22 ALIGN_4 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif SHUFPD_1 %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(Y) LOAD(-1 * SIZE, X, %xmm0) SHUFPD_1 %xmm2, %xmm1 movaps %xmm1, -14 * SIZE(Y) LOAD( 1 * SIZE, X, %xmm1) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif SHUFPD_1 %xmm3, %xmm2 movaps %xmm2, -12 * SIZE(Y) LOAD( 3 * SIZE, X, %xmm2) SHUFPD_1 %xmm4, %xmm3 movaps %xmm3, -10 * SIZE(Y) LOAD( 5 * SIZE, X, %xmm3) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif SHUFPD_1 %xmm5, %xmm4 movaps %xmm4, -8 * SIZE(Y) LOAD( 7 * SIZE, X, %xmm4) SHUFPD_1 %xmm6, %xmm5 movaps %xmm5, -6 * SIZE(Y) LOAD( 9 * SIZE, X, %xmm5) #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif SHUFPD_1 %xmm7, %xmm6 movaps %xmm6, -4 * SIZE(Y) LOAD(11 * SIZE, X, %xmm6) SHUFPD_1 %xmm0, %xmm7 movaps %xmm7, -2 * SIZE(Y) LOAD(13 * SIZE, X, %xmm7) subq $-16 * SIZE, X subq $-16 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L22: SHUFPD_1 %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(Y) LOAD(-1 * SIZE, X, %xmm0) SHUFPD_1 %xmm2, %xmm1 movaps %xmm1, -14 * SIZE(Y) SHUFPD_1 %xmm3, %xmm2 movaps %xmm2, -12 * SIZE(Y) SHUFPD_1 %xmm4, %xmm3 movaps %xmm3, -10 * SIZE(Y) SHUFPD_1 %xmm5, %xmm4 movaps %xmm4, -8 * SIZE(Y) SHUFPD_1 %xmm6, %xmm5 movaps %xmm5, -6 * SIZE(Y) SHUFPD_1 %xmm7, %xmm6 movaps %xmm6, -4 * SIZE(Y) SHUFPD_1 %xmm0, %xmm7 movaps %xmm7, -2 * SIZE(Y) subq $-16 * SIZE, X subq $-16 * SIZE, Y ALIGN_3 .L23: testq $8, M jle .L24 ALIGN_3 movaps -15 * SIZE(X), %xmm1 movaps -13 * SIZE(X), %xmm2 movaps -11 * SIZE(X), %xmm3 movaps -9 * SIZE(X), %xmm8 SHUFPD_1 %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm2, %xmm1 movaps %xmm1, -14 * SIZE(Y) SHUFPD_1 %xmm3, %xmm2 movaps %xmm2, -12 * SIZE(Y) SHUFPD_1 %xmm8, %xmm3 movaps %xmm3, -10 * SIZE(Y) movaps %xmm8, %xmm0 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L24: testq $4, M jle .L25 ALIGN_3 movaps -15 * SIZE(X), %xmm1 movaps -13 * SIZE(X), %xmm2 SHUFPD_1 %xmm1, %xmm0 SHUFPD_1 %xmm2, %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -14 * SIZE(Y) movaps %xmm2, %xmm0 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L25: testq $2, M jle .L26 ALIGN_3 movaps -15 * SIZE(X), %xmm1 SHUFPD_1 %xmm1, %xmm0 movaps %xmm0, -16 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L26: testq $1, M jle .L29 ALIGN_3 movsd -16 * SIZE(X), %xmm0 movsd %xmm0, -16 * SIZE(Y) ALIGN_3 .L29: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 #else movq M, %rax sarq $4, %rax jle .L23 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 movaps -8 * SIZE(X), %xmm4 movaps -6 * SIZE(X), %xmm5 movaps -4 * SIZE(X), %xmm6 movaps -2 * SIZE(X), %xmm7 decq %rax jle .L22 ALIGN_3 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) LOAD( 0 * SIZE, X, %xmm0) movlps %xmm1, -14 * SIZE(Y) movhps %xmm1, -13 * SIZE(Y) LOAD( 2 * SIZE, X, %xmm1) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movlps %xmm2, -12 * SIZE(Y) movhps %xmm2, -11 * SIZE(Y) LOAD( 4 * SIZE, X, %xmm2) movlps %xmm3, -10 * SIZE(Y) movhps %xmm3, -9 * SIZE(Y) LOAD( 6 * SIZE, X, %xmm3) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movlps %xmm4, -8 * SIZE(Y) movhps %xmm4, -7 * SIZE(Y) LOAD( 8 * SIZE, X, %xmm4) movlps %xmm5, -6 * SIZE(Y) movhps %xmm5, -5 * 
SIZE(Y) LOAD(10 * SIZE, X, %xmm5) #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movlps %xmm6, -4 * SIZE(Y) movhps %xmm6, -3 * SIZE(Y) LOAD(12 * SIZE, X, %xmm6) movlps %xmm7, -2 * SIZE(Y) movhps %xmm7, -1 * SIZE(Y) LOAD(14 * SIZE, X, %xmm7) subq $-16 * SIZE, Y subq $-16 * SIZE, X decq %rax jg .L21 ALIGN_3 .L22: movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) movlps %xmm1, -14 * SIZE(Y) movhps %xmm1, -13 * SIZE(Y) movlps %xmm2, -12 * SIZE(Y) movhps %xmm2, -11 * SIZE(Y) movlps %xmm3, -10 * SIZE(Y) movhps %xmm3, -9 * SIZE(Y) movlps %xmm4, -8 * SIZE(Y) movhps %xmm4, -7 * SIZE(Y) movlps %xmm5, -6 * SIZE(Y) movhps %xmm5, -5 * SIZE(Y) movlps %xmm6, -4 * SIZE(Y) movhps %xmm6, -3 * SIZE(Y) movlps %xmm7, -2 * SIZE(Y) movhps %xmm7, -1 * SIZE(Y) subq $-16 * SIZE, Y subq $-16 * SIZE, X ALIGN_3 .L23: testq $8, M jle .L24 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) movaps -14 * SIZE(X), %xmm1 movlps %xmm1, -14 * SIZE(Y) movhps %xmm1, -13 * SIZE(Y) movaps -12 * SIZE(X), %xmm2 movlps %xmm2, -12 * SIZE(Y) movhps %xmm2, -11 * SIZE(Y) movaps -10 * SIZE(X), %xmm3 movlps %xmm3, -10 * SIZE(Y) movhps %xmm3, -9 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L24: testq $4, M jle .L25 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) movaps -14 * SIZE(X), %xmm1 movlps %xmm1, -14 * SIZE(Y) movhps %xmm1, -13 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L25: testq $2, M jle .L26 ALIGN_3 movaps -16 * SIZE(X), %xmm0 movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L26: testq $1, M jle .L29 ALIGN_3 movsd -16 * SIZE(X), %xmm0 movsd %xmm0, -16 * SIZE(Y) ALIGN_3 .L29: xorq %rax,%rax RESTOREREGISTERS ret ALIGN_3 #endif .L50: movq M, %rax sarq $2, %rax jle .L55 ALIGN_3 .L51: movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 addq INCX, X movsd 0 * SIZE(X), %xmm1 movhps 1 * SIZE(X), %xmm1 addq INCX, X movsd 0 * SIZE(X), %xmm2 movhps 1 * SIZE(X), %xmm2 addq INCX, X movsd 0 * SIZE(X), %xmm3 movhps 1 * SIZE(X), %xmm3 addq INCX, X movlps %xmm0, 0 * SIZE(Y) movhps %xmm0, 1 * SIZE(Y) addq INCY, Y movlps %xmm1, 0 * SIZE(Y) movhps %xmm1, 1 * SIZE(Y) addq INCY, Y movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 1 * SIZE(Y) addq INCY, Y movlps %xmm3, 0 * SIZE(Y) movhps %xmm3, 1 * SIZE(Y) addq INCY, Y decq %rax jg .L51 ALIGN_3 .L55: movq M, %rax andq $3, %rax jle .L57 ALIGN_3 .L56: movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 addq INCX, X movlps %xmm0, 0 * SIZE(Y) movhps %xmm0, 1 * SIZE(Y) addq INCY, Y decq %rax jg .L56 ALIGN_3 .L57: xorq %rax, %rax RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zdot.S000066400000000000000000000124531313527062700164730ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define Y ARG4 /* rcx */ #ifndef WINDOWS_ABI #define INCY ARG5 /* r8 */ #else #define INCY %r10 #endif #include "l1param.h" PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), INCY #endif testq N, N jle .L88 salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY fldz fldz fldz fldz cmpq $2 * SIZE, INCX jne .L14 cmpq $2 * SIZE, INCY jne .L14 movq N, %rax sarq $1, %rax jle .L15 ALIGN_3 .L16: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(2) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(4) FLD 2 * SIZE(X) FLD 2 * SIZE(Y) fmul %st(1) faddp %st, %st(2) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif FLD 3 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(2) FLD 3 * SIZE(X) FLD 2 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FLD 3 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(4) addq $4 * SIZE, X addq $4 * SIZE, Y decq %rax jg .L16 ALIGN_3 .L15: movq N, %rax andq $1, %rax jle .L27 ALIGN_3 .L22: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(2) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(4) jmp .L27 ALIGN_3 .L14: movq N, %rax sarq $1, %rax jle .L30 ALIGN_3 .L31: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(2) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(4) addq INCX, X FLD 0 * SIZE(X) addq INCY, Y FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(2) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(4) addq INCX, X addq INCY, Y decq %rax jg .L31 ALIGN_3 .L30: movq N, %rax andq $1, %rax jle .L27 ALIGN_3 .L37: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(2) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(2) FLD 1 * SIZE(X) FLD 0 * SIZE(Y) fmul %st(1) faddp %st, %st(4) FLD 1 * SIZE(Y) fmulp %st, %st(1) faddp %st, %st(4) ALIGN_3 .L27: #ifndef CONJ fsubp %st, 
%st(3) faddp %st, %st(1) fxch %st(1) #else faddp %st, %st(3) fsubp %st, %st(1) fxch %st(1) #endif ret ALIGN_3 .L88: fldz fldz ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zdot.c000066400000000000000000000103201313527062700165020ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" #if defined(BULLDOZER) #include "zdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "zdot_microk_steamroller-2.c" #elif defined(HASWELL) || defined(ZEN) #include "zdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zdot_microk_sandy-2.c" #endif #ifndef HAVE_KERNEL_8 static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline)); static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { BLASLONG register i = 0; FLOAT dot[4] = { 0.0, 0.0, 0.0, 0.0 }; BLASLONG j=0; while( i < n ) { dot[0] += x[j] * y[j] ; dot[1] += x[j+1] * y[j+1] ; dot[2] += x[j] * y[j+1] ; dot[3] += x[j+1] * y[j] ; dot[0] += x[j+2] * y[j+2] ; dot[1] += x[j+3] * y[j+3] ; dot[2] += x[j+2] * y[j+3] ; dot[3] += x[j+3] * y[j+2] ; dot[0] += x[j+4] * y[j+4] ; dot[1] += x[j+5] * y[j+5] ; dot[2] += x[j+4] * y[j+5] ; dot[3] += x[j+5] * y[j+4] ; dot[0] += x[j+6] * y[j+6] ; dot[1] += x[j+7] * y[j+7] ; dot[2] += x[j+6] * y[j+7] ; dot[3] += x[j+7] * y[j+6] ; j+=8; i+=4; } d[0] = dot[0]; d[1] = dot[1]; d[2] = dot[2]; d[3] = dot[3]; } #endif FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i; BLASLONG ix,iy; FLOAT _Complex result; FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; if ( n <= 0 ) { // CREAL(result) = 0.0 ; // CIMAG(result) = 0.0 ; result=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); return(result); } if ( (inc_x == 1) && (inc_y == 1) ) { BLASLONG n1 = n & -8; if ( n1 ) zdot_kernel_8(n1, x, y , dot ); i = n1; BLASLONG j = i * 2; while( i < n ) { dot[0] += x[j] * y[j] ; dot[1] += x[j+1] * y[j+1] ; dot[2] += x[j] * y[j+1] ; dot[3] += x[j+1] * y[j] ; j+=2; i++ ; } } else { i=0; ix=0; iy=0; inc_x <<= 1; inc_y <<= 1; while(i < n) { dot[0] += x[ix] * y[iy] ; dot[1] += x[ix+1] * y[iy+1] ; dot[2] += x[ix] * y[iy+1] ; dot[3] += x[ix+1] * y[iy] ; ix += inc_x ; iy += inc_y ; i++ ; } } #if !defined(CONJ) result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); // CREAL(result) = dot[0] - dot[1]; // CIMAG(result) = dot[2] + dot[3]; #else result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); // CREAL(result) = dot[0] + dot[1]; // CIMAG(result) = dot[2] - dot[3]; #endif return(result); } OpenBLAS-0.2.20/kernel/x86_64/zdot_atom.S000066400000000000000000000236721313527062700175200ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define Y ARG4 /* rcx */ #ifndef WINDOWS_ABI #define INCY ARG5 /* r8 */ #else #define INCY %r10 #endif #include "l1param.h" PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), INCY #endif SAVEREGISTERS salq $ZBASE_SHIFT, INCX pxor %xmm0, %xmm0 salq $ZBASE_SHIFT, INCY pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 cmpq $0, N pxor %xmm3, %xmm3 jle .L999 cmpq $2 * SIZE, INCX jne .L20 cmpq $2 * SIZE, INCY jne .L20 movq N, %rax sarq $2, %rax jle .L15 movsd 0 * SIZE(X), %xmm4 movsd 0 * SIZE(Y), %xmm6 movsd 1 * SIZE(X), %xmm5 movsd 1 * SIZE(Y), %xmm7 movaps %xmm4, %xmm8 mulsd %xmm6, %xmm4 movsd 2 * SIZE(X), %xmm10 mulsd %xmm7, %xmm8 movsd 2 * SIZE(Y), %xmm11 movaps %xmm5, %xmm9 mulsd %xmm7, %xmm5 movsd 3 * SIZE(X), %xmm12 mulsd %xmm6, %xmm9 movsd 3 * SIZE(Y), %xmm13 decq %rax jle .L12 ALIGN_3 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif addsd %xmm4, %xmm0 movaps %xmm10, %xmm14 mulsd %xmm11, %xmm10 movsd 4 * SIZE(X), %xmm4 addsd %xmm8, %xmm1 mulsd %xmm13, %xmm14 movsd 4 * SIZE(Y), %xmm6 addsd %xmm5, %xmm2 movaps %xmm12, %xmm15 mulsd %xmm13, %xmm12 movsd 5 * SIZE(X), %xmm5 addsd %xmm9, %xmm3 mulsd %xmm11, %xmm15 movsd 5 * SIZE(Y), %xmm7 addsd %xmm10, %xmm0 movaps %xmm4, %xmm8 mulsd %xmm6, %xmm4 movsd 6 * SIZE(X), %xmm10 addsd %xmm14, %xmm1 mulsd %xmm7, %xmm8 movsd 6 * SIZE(Y), %xmm11 addsd %xmm12, %xmm2 movaps %xmm5, %xmm9 mulsd %xmm7, %xmm5 movsd 7 * SIZE(X), %xmm12 addsd %xmm15, %xmm3 mulsd %xmm6, %xmm9 movsd 7 * SIZE(Y), %xmm13 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif addsd %xmm4, %xmm0 movaps %xmm10, %xmm14 mulsd %xmm11, %xmm10 movsd 8 * SIZE(X), %xmm4 addsd %xmm8, %xmm1 mulsd %xmm13, %xmm14 movsd 8 * SIZE(Y), %xmm6 addsd %xmm5, %xmm2 movaps %xmm12, %xmm15 mulsd %xmm13, %xmm12 movsd 9 * SIZE(X), %xmm5 addsd %xmm9, %xmm3 mulsd %xmm11, %xmm15 movsd 9 * SIZE(Y), %xmm7 addsd %xmm10, %xmm0 movaps %xmm4, %xmm8 mulsd %xmm6, %xmm4 movsd 10 * SIZE(X), %xmm10 addsd %xmm14, %xmm1 mulsd %xmm7, %xmm8 movsd 10 * SIZE(Y), %xmm11 addsd %xmm12, %xmm2 movaps %xmm5, %xmm9 mulsd %xmm7, %xmm5 movsd 11 * SIZE(X), %xmm12 addsd %xmm15, %xmm3 mulsd %xmm6, %xmm9 movsd 11 * SIZE(Y), %xmm13 addq $8 * SIZE, X addq $8 * SIZE, Y decq %rax jg .L11 ALIGN_3 .L12: addsd %xmm4, %xmm0 movaps %xmm10, %xmm14 mulsd %xmm11, %xmm10 movsd 4 * SIZE(X), %xmm4 addsd %xmm8, %xmm1 mulsd %xmm13, %xmm14 movsd 4 * SIZE(Y), %xmm6 addsd %xmm5, %xmm2 movaps %xmm12, %xmm15 mulsd %xmm13, %xmm12 movsd 5 * SIZE(X), %xmm5 addsd %xmm9, %xmm3 mulsd %xmm11, %xmm15 movsd 5 * SIZE(Y), %xmm7 addsd %xmm10, 
%xmm0 movaps %xmm4, %xmm8 mulsd %xmm6, %xmm4 movsd 6 * SIZE(X), %xmm10 addsd %xmm14, %xmm1 mulsd %xmm7, %xmm8 movsd 6 * SIZE(Y), %xmm11 addsd %xmm12, %xmm2 movaps %xmm5, %xmm9 mulsd %xmm7, %xmm5 movsd 7 * SIZE(X), %xmm12 addsd %xmm15, %xmm3 mulsd %xmm6, %xmm9 movsd 7 * SIZE(Y), %xmm13 addsd %xmm4, %xmm0 movaps %xmm10, %xmm14 mulsd %xmm11, %xmm10 addsd %xmm8, %xmm1 mulsd %xmm13, %xmm14 addsd %xmm5, %xmm2 movaps %xmm12, %xmm15 mulsd %xmm13, %xmm12 addsd %xmm9, %xmm3 mulsd %xmm11, %xmm15 addsd %xmm10, %xmm0 addsd %xmm14, %xmm1 addsd %xmm12, %xmm2 addsd %xmm15, %xmm3 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L15: movq N, %rax andq $2, %rax jle .L17 movsd 0 * SIZE(X), %xmm4 movsd 0 * SIZE(Y), %xmm6 movsd 1 * SIZE(X), %xmm5 movsd 1 * SIZE(Y), %xmm7 movaps %xmm4, %xmm8 mulsd %xmm6, %xmm4 movsd 2 * SIZE(X), %xmm10 mulsd %xmm7, %xmm8 movsd 2 * SIZE(Y), %xmm11 movaps %xmm5, %xmm9 mulsd %xmm7, %xmm5 movsd 3 * SIZE(X), %xmm12 mulsd %xmm6, %xmm9 movsd 3 * SIZE(Y), %xmm13 addsd %xmm4, %xmm0 movaps %xmm10, %xmm14 mulsd %xmm11, %xmm10 addsd %xmm8, %xmm1 mulsd %xmm13, %xmm14 addsd %xmm5, %xmm2 movaps %xmm12, %xmm15 mulsd %xmm13, %xmm12 addsd %xmm9, %xmm3 mulsd %xmm11, %xmm15 addsd %xmm10, %xmm0 addsd %xmm14, %xmm1 addsd %xmm12, %xmm2 addsd %xmm15, %xmm3 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L17: movq N, %rax andq $1, %rax jle .L999 movsd 0 * SIZE(X), %xmm4 movsd 0 * SIZE(Y), %xmm6 movsd 1 * SIZE(X), %xmm5 movsd 1 * SIZE(Y), %xmm7 movaps %xmm4, %xmm8 mulsd %xmm6, %xmm4 mulsd %xmm7, %xmm8 movaps %xmm5, %xmm9 mulsd %xmm7, %xmm5 mulsd %xmm6, %xmm9 addsd %xmm4, %xmm0 addsd %xmm8, %xmm1 addsd %xmm5, %xmm2 addsd %xmm9, %xmm3 jmp .L999 ALIGN_3 .L20: movq N, %rax sarq $2, %rax jle .L25 ALIGN_3 .L23: movsd 0 * SIZE(X), %xmm4 movsd 0 * SIZE(Y), %xmm6 movsd 1 * SIZE(X), %xmm5 movsd 1 * SIZE(Y), %xmm7 movaps %xmm4, %xmm8 mulsd %xmm6, %xmm4 mulsd %xmm7, %xmm8 movaps %xmm5, %xmm9 mulsd %xmm7, %xmm5 addq INCX, X mulsd %xmm6, %xmm9 addq INCY, Y addsd %xmm4, %xmm0 movsd 0 * SIZE(X), %xmm4 addsd %xmm8, %xmm1 movsd 0 * SIZE(Y), %xmm6 addsd %xmm5, %xmm2 movsd 1 * SIZE(X), %xmm5 addsd %xmm9, %xmm3 movsd 1 * SIZE(Y), %xmm7 movaps %xmm4, %xmm8 mulsd %xmm6, %xmm4 mulsd %xmm7, %xmm8 movaps %xmm5, %xmm9 mulsd %xmm7, %xmm5 addq INCX, X mulsd %xmm6, %xmm9 addq INCY, Y addsd %xmm4, %xmm0 movsd 0 * SIZE(X), %xmm4 addsd %xmm8, %xmm1 movsd 0 * SIZE(Y), %xmm6 addsd %xmm5, %xmm2 movsd 1 * SIZE(X), %xmm5 addsd %xmm9, %xmm3 movsd 1 * SIZE(Y), %xmm7 movaps %xmm4, %xmm8 mulsd %xmm6, %xmm4 mulsd %xmm7, %xmm8 movaps %xmm5, %xmm9 mulsd %xmm7, %xmm5 addq INCX, X mulsd %xmm6, %xmm9 addq INCY, Y addsd %xmm4, %xmm0 movsd 0 * SIZE(X), %xmm4 addsd %xmm8, %xmm1 movsd 0 * SIZE(Y), %xmm6 addsd %xmm5, %xmm2 movsd 1 * SIZE(X), %xmm5 addsd %xmm9, %xmm3 movsd 1 * SIZE(Y), %xmm7 movaps %xmm4, %xmm8 mulsd %xmm6, %xmm4 mulsd %xmm7, %xmm8 movaps %xmm5, %xmm9 mulsd %xmm7, %xmm5 addq INCX, X mulsd %xmm6, %xmm9 addq INCY, Y addsd %xmm4, %xmm0 addsd %xmm8, %xmm1 addsd %xmm5, %xmm2 addsd %xmm9, %xmm3 decq %rax jg .L23 ALIGN_3 .L25: testq $3, N je .L999 movq N, %rax andq $2, %rax jle .L27 movsd 0 * SIZE(X), %xmm4 movsd 0 * SIZE(Y), %xmm6 movsd 1 * SIZE(X), %xmm5 movsd 1 * SIZE(Y), %xmm7 movaps %xmm4, %xmm8 mulsd %xmm6, %xmm4 mulsd %xmm7, %xmm8 movaps %xmm5, %xmm9 mulsd %xmm7, %xmm5 addq INCX, X mulsd %xmm6, %xmm9 addq INCY, Y addsd %xmm4, %xmm0 movsd 0 * SIZE(X), %xmm4 addsd %xmm8, %xmm1 movsd 0 * SIZE(Y), %xmm6 addsd %xmm5, %xmm2 movsd 1 * SIZE(X), %xmm5 addsd %xmm9, %xmm3 movsd 1 * SIZE(Y), %xmm7 movaps %xmm4, %xmm8 mulsd %xmm6, %xmm4 mulsd %xmm7, %xmm8 
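/* Accumulators used throughout this kernel:                                */
/*   %xmm0 += x_r*y_r, %xmm1 += x_r*y_i, %xmm2 += x_i*y_i, %xmm3 += x_i*y_r */
/* .L999 combines them: without CONJ, real = %xmm0 - %xmm2 and              */
/* imag = %xmm1 + %xmm3; with CONJ (conjugated x), real = %xmm0 + %xmm2     */
/* and imag = %xmm1 - %xmm3.                                                */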
movaps %xmm5, %xmm9 mulsd %xmm7, %xmm5 addq INCX, X mulsd %xmm6, %xmm9 addq INCY, Y addsd %xmm4, %xmm0 addsd %xmm8, %xmm1 addsd %xmm5, %xmm2 addsd %xmm9, %xmm3 ALIGN_3 .L27: movq N, %rax andq $1, %rax jle .L999 movsd 0 * SIZE(X), %xmm4 movsd 0 * SIZE(Y), %xmm6 movsd 1 * SIZE(X), %xmm5 movsd 1 * SIZE(Y), %xmm7 movaps %xmm4, %xmm8 mulsd %xmm6, %xmm4 mulsd %xmm7, %xmm8 movaps %xmm5, %xmm9 mulsd %xmm7, %xmm5 mulsd %xmm6, %xmm9 addsd %xmm4, %xmm0 addsd %xmm8, %xmm1 addsd %xmm5, %xmm2 addsd %xmm9, %xmm3 ALIGN_3 .L999: #ifndef CONJ subsd %xmm2, %xmm0 addsd %xmm3, %xmm1 #else addsd %xmm2, %xmm0 subsd %xmm3, %xmm1 #endif RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zdot_microk_bulldozer-2.c000066400000000000000000000172441313527062700223030ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_8 1 static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) { BLASLONG register i = 0; if ( n < 768 ) { __asm__ __volatile__ ( "vzeroupper \n\t" "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" "vxorpd %%xmm1, %%xmm1, %%xmm1 \n\t" "vxorpd %%xmm2, %%xmm2, %%xmm2 \n\t" "vxorpd %%xmm3, %%xmm3, %%xmm3 \n\t" "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,8), %%xmm8 \n\t" // 1 * x "vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x "vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y "vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y "vmovups 32(%2,%0,8), %%xmm10 \n\t" // 1 * x "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 * x "vmovups 32(%3,%0,8), %%xmm14 \n\t" // 1 * y "vmovups 48(%3,%0,8), %%xmm15 \n\t" // 1 * y "vfmaddpd %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i "vfmaddpd %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" "vfmaddpd %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i "vfmaddpd %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" "vfmaddpd %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r "addq $8 , %0 \n\t" "vfmaddpd %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r "vfmaddpd %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r "subq $4 , %1 \n\t" "vfmaddpd %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r "jnz 1b \n\t" "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" "vaddpd %%xmm0, %%xmm2, %%xmm0 \n\t" "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" "vmovups %%xmm0, (%4) \n\t" "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); return; } __asm__ __volatile__ ( "vzeroupper \n\t" "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" "vxorpd %%xmm1, %%xmm1, %%xmm1 \n\t" "vxorpd %%xmm2, %%xmm2, %%xmm2 \n\t" "vxorpd %%xmm3, %%xmm3, %%xmm3 \n\t" "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" "1: \n\t" "prefetcht0 384(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%xmm8 \n\t" // 1 * x "vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x "prefetcht0 384(%3,%0,8) \n\t" "vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y "vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y "vmovups 32(%2,%0,8), %%xmm10 \n\t" // 1 * x "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 * x "vmovups 32(%3,%0,8), %%xmm14 \n\t" // 1 * y "vmovups 48(%3,%0,8), %%xmm15 \n\t" // 1 * y "vfmaddpd %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i "vfmaddpd %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" "vfmaddpd %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i "vfmaddpd %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i 
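	/* %xmm12/%xmm13 were lane-swapped by vpermilpd above; %xmm14/%xmm15 are
	   swapped next, so the FMA4 instructions that follow accumulate the cross
	   terms x_r*y_i and x_i*y_r into %xmm4..%xmm7, while %xmm0..%xmm3 keep the
	   x_r*y_r / x_i*y_i products. Both groups are stored to dot[0..3] below. */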
"vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" "vfmaddpd %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r "addq $8 , %0 \n\t" "vfmaddpd %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r "vfmaddpd %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r "subq $4 , %1 \n\t" "vfmaddpd %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r "jnz 1b \n\t" "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" "vaddpd %%xmm0, %%xmm2, %%xmm0 \n\t" "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" "vmovups %%xmm0, (%4) \n\t" "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/zdot_microk_haswell-2.c000066400000000000000000000177331313527062700217430ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_8 1 static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) { BLASLONG register i = 0; if ( n <=1280 ) { __asm__ __volatile__ ( "vzeroupper \n\t" "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" "vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t" "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x "vmovups 32(%2,%0,8), %%ymm9 \n\t" // 2 * x "vmovups (%3,%0,8), %%ymm12 \n\t" // 2 * y "vmovups 32(%3,%0,8), %%ymm13 \n\t" // 2 * y "vmovups 64(%2,%0,8), %%ymm10 \n\t" // 2 * x "vmovups 96(%2,%0,8), %%ymm11 \n\t" // 2 * x "vmovups 64(%3,%0,8), %%ymm14 \n\t" // 2 * y "vmovups 96(%3,%0,8), %%ymm15 \n\t" // 2 * y "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t" "vfmadd231pd %%ymm9 , %%ymm13, %%ymm5 \n\t" // x_r * y_i, x_i * y_r "vfmadd231pd %%ymm10, %%ymm14, %%ymm6 \n\t" // x_r * y_i, x_i * y_r "subq $8 , %1 \n\t" "vfmadd231pd %%ymm11, %%ymm15, %%ymm7 \n\t" // x_r * y_i, x_i * y_r "jnz 1b \n\t" "vaddpd %%ymm0, %%ymm1, %%ymm0 \n\t" "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" "vaddpd %%ymm0, %%ymm2, %%ymm0 \n\t" "vaddpd %%ymm4, %%ymm5, %%ymm4 \n\t" "vaddpd %%ymm6, %%ymm7, %%ymm6 \n\t" "vaddpd %%ymm4, %%ymm6, %%ymm4 \n\t" "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" "vmovups %%xmm0, (%4) \n\t" "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); return; } __asm__ __volatile__ ( "vzeroupper \n\t" "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" "vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t" "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" ".align 16 \n\t" "1: \n\t" "prefetcht0 512(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x "vmovups 32(%2,%0,8), %%ymm9 \n\t" // 2 * x "prefetcht0 512(%3,%0,8) \n\t" "vmovups (%3,%0,8), %%ymm12 \n\t" // 2 * y "vmovups 32(%3,%0,8), %%ymm13 \n\t" // 2 * y "prefetcht0 576(%2,%0,8) \n\t" "vmovups 64(%2,%0,8), %%ymm10 \n\t" // 2 * x "vmovups 96(%2,%0,8), %%ymm11 \n\t" // 2 * x "prefetcht0 576(%3,%0,8) \n\t" "vmovups 64(%3,%0,8), %%ymm14 \n\t" // 2 * y "vmovups 96(%3,%0,8), %%ymm15 \n\t" // 2 * y "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" "vpermpd $0xb1 , %%ymm13, 
%%ymm13 \n\t" "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t" "vfmadd231pd %%ymm9 , %%ymm13, %%ymm5 \n\t" // x_r * y_i, x_i * y_r "vfmadd231pd %%ymm10, %%ymm14, %%ymm6 \n\t" // x_r * y_i, x_i * y_r "subq $8 , %1 \n\t" "vfmadd231pd %%ymm11, %%ymm15, %%ymm7 \n\t" // x_r * y_i, x_i * y_r "jnz 1b \n\t" "vaddpd %%ymm0, %%ymm1, %%ymm0 \n\t" "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" "vaddpd %%ymm0, %%ymm2, %%ymm0 \n\t" "vaddpd %%ymm4, %%ymm5, %%ymm4 \n\t" "vaddpd %%ymm6, %%ymm7, %%ymm6 \n\t" "vaddpd %%ymm4, %%ymm6, %%ymm4 \n\t" "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" "vmovups %%xmm0, (%4) \n\t" "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/zdot_microk_sandy-2.c000066400000000000000000000201631313527062700214110ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_8 1 static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) { BLASLONG register i = 0; if ( n < 1280 ) { __asm__ __volatile__ ( "vzeroupper \n\t" "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" "vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t" "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" ".align 16 \n\t" "1: \n\t" "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x "vmovups 32(%2,%0,8), %%ymm9 \n\t" // 2 * x "vmovups (%3,%0,8), %%ymm12 \n\t" // 2 * y "vmovups 32(%3,%0,8), %%ymm13 \n\t" // 2 * y "vmovups 64(%3,%0,8), %%ymm14 \n\t" // 2 * y "vmovups 96(%3,%0,8), %%ymm15 \n\t" // 2 * y "vmulpd %%ymm8 , %%ymm12, %%ymm10 \n\t" "vmulpd %%ymm9 , %%ymm13, %%ymm11 \n\t" "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" "vaddpd %%ymm0 , %%ymm10, %%ymm0 \n\t" "vaddpd %%ymm1 , %%ymm11, %%ymm1 \n\t" "vmulpd %%ymm8 , %%ymm12, %%ymm10 \n\t" "vmulpd %%ymm9 , %%ymm13, %%ymm11 \n\t" "vmovups 64(%2,%0,8), %%ymm8 \n\t" // 2 * x "vmovups 96(%2,%0,8), %%ymm9 \n\t" // 2 * x "vaddpd %%ymm4 , %%ymm10, %%ymm4 \n\t" "vaddpd %%ymm5 , %%ymm11, %%ymm5 \n\t" "vmulpd %%ymm8 , %%ymm14, %%ymm10 \n\t" "vmulpd %%ymm9 , %%ymm15, %%ymm11 \n\t" "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" "vaddpd %%ymm2 , %%ymm10, %%ymm2 \n\t" "vaddpd %%ymm3 , %%ymm11, %%ymm3 \n\t" "vmulpd %%ymm8 , %%ymm14, %%ymm10 \n\t" "addq $16 , %0 \n\t" "vmulpd %%ymm9 , %%ymm15, %%ymm11 \n\t" "vaddpd %%ymm6 , %%ymm10, %%ymm6 \n\t" "subq $8 , %1 \n\t" "vaddpd %%ymm7 , %%ymm11, %%ymm7 \n\t" "jnz 1b \n\t" "vaddpd %%ymm0, %%ymm1, %%ymm0 \n\t" "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" "vaddpd %%ymm0, %%ymm2, %%ymm0 \n\t" "vaddpd %%ymm4, %%ymm5, %%ymm4 \n\t" "vaddpd %%ymm6, %%ymm7, %%ymm6 \n\t" "vaddpd %%ymm4, %%ymm6, %%ymm4 \n\t" "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" "vmovups %%xmm0, (%4) \n\t" "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); return; } __asm__ __volatile__ ( "vzeroupper \n\t" "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" "vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t" "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" ".align 16 \n\t" "1: \n\t" "prefetcht0 512(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 * x "vmovups 32(%2,%0,8), %%ymm9 \n\t" // 2 * x "prefetcht0 512(%3,%0,8) \n\t" "vmovups (%3,%0,8), %%ymm12 \n\t" // 2 * y "vmovups 32(%3,%0,8), %%ymm13 \n\t" // 2 * y "vmovups 64(%3,%0,8), %%ymm14 \n\t" // 2 * y "vmovups 96(%3,%0,8), %%ymm15 \n\t" // 2 * y "prefetcht0 576(%3,%0,8) \n\t" "vmulpd %%ymm8 , %%ymm12, %%ymm10 \n\t" "vmulpd %%ymm9 , %%ymm13, %%ymm11 \n\t" "prefetcht0 576(%2,%0,8) \n\t" "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" "vaddpd %%ymm0 , %%ymm10, %%ymm0 \n\t" 
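	/* %ymm0..%ymm3 accumulate the straight products (x_r*y_r, x_i*y_i); after
	   the vpermilpd lane swaps of y, %ymm4..%ymm7 accumulate the cross terms
	   (x_r*y_i, x_i*y_r). Both groups are reduced and stored to dot[0..3]
	   after the loop. */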
"vaddpd %%ymm1 , %%ymm11, %%ymm1 \n\t" "vmulpd %%ymm8 , %%ymm12, %%ymm10 \n\t" "vmulpd %%ymm9 , %%ymm13, %%ymm11 \n\t" "vmovups 64(%2,%0,8), %%ymm8 \n\t" // 2 * x "vmovups 96(%2,%0,8), %%ymm9 \n\t" // 2 * x "vaddpd %%ymm4 , %%ymm10, %%ymm4 \n\t" "vaddpd %%ymm5 , %%ymm11, %%ymm5 \n\t" "vmulpd %%ymm8 , %%ymm14, %%ymm10 \n\t" "vmulpd %%ymm9 , %%ymm15, %%ymm11 \n\t" "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" "vaddpd %%ymm2 , %%ymm10, %%ymm2 \n\t" "vaddpd %%ymm3 , %%ymm11, %%ymm3 \n\t" "vmulpd %%ymm8 , %%ymm14, %%ymm10 \n\t" "addq $16 , %0 \n\t" "vmulpd %%ymm9 , %%ymm15, %%ymm11 \n\t" "vaddpd %%ymm6 , %%ymm10, %%ymm6 \n\t" "subq $8 , %1 \n\t" "vaddpd %%ymm7 , %%ymm11, %%ymm7 \n\t" "jnz 1b \n\t" "vaddpd %%ymm0, %%ymm1, %%ymm0 \n\t" "vaddpd %%ymm2, %%ymm3, %%ymm2 \n\t" "vaddpd %%ymm0, %%ymm2, %%ymm0 \n\t" "vaddpd %%ymm4, %%ymm5, %%ymm4 \n\t" "vaddpd %%ymm6, %%ymm7, %%ymm6 \n\t" "vaddpd %%ymm4, %%ymm6, %%ymm4 \n\t" "vextractf128 $1 , %%ymm0 , %%xmm1 \n\t" "vextractf128 $1 , %%ymm4 , %%xmm5 \n\t" "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" "vmovups %%xmm0, (%4) \n\t" "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/zdot_microk_steamroller-2.c000066400000000000000000000172541313527062700226330ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_8 1 static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) { BLASLONG register i = 0; if ( n < 640 ) { __asm__ __volatile__ ( "vzeroupper \n\t" "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" "vxorpd %%xmm1, %%xmm1, %%xmm1 \n\t" "vxorpd %%xmm2, %%xmm2, %%xmm2 \n\t" "vxorpd %%xmm3, %%xmm3, %%xmm3 \n\t" "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" "1: \n\t" //"prefetcht0 512(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%xmm8 \n\t" // 1 * x "vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x // "prefetcht0 512(%3,%0,8) \n\t" "vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y "vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y "vmovups 32(%2,%0,8), %%xmm10 \n\t" // 1 * x "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 * x "vmovups 32(%3,%0,8), %%xmm14 \n\t" // 1 * y "vmovups 48(%3,%0,8), %%xmm15 \n\t" // 1 * y "vfmadd231pd %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" "vfmadd231pd %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" "vfmadd231pd %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r "addq $8 , %0 \n\t" "vfmadd231pd %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r "vfmadd231pd %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r "subq $4 , %1 \n\t" "vfmadd231pd %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r "jnz 1b \n\t" "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" "vaddpd %%xmm0, %%xmm2, %%xmm0 \n\t" "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" "vmovups %%xmm0, (%4) \n\t" "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); return; } __asm__ __volatile__ ( "vzeroupper \n\t" "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" "vxorpd %%xmm1, %%xmm1, %%xmm1 \n\t" "vxorpd %%xmm2, %%xmm2, %%xmm2 \n\t" "vxorpd %%xmm3, %%xmm3, %%xmm3 \n\t" "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" "1: \n\t" "prefetcht0 512(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%xmm8 \n\t" // 1 * x "vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x "prefetcht0 512(%3,%0,8) \n\t" "vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y "vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y "vmovups 32(%2,%0,8), %%xmm10 \n\t" // 1 * x "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 * x "vmovups 32(%3,%0,8), %%xmm14 \n\t" // 1 * y "vmovups 48(%3,%0,8), %%xmm15 \n\t" // 1 * y "vfmadd231pd %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" "vfmadd231pd %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i 
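	/* Same scheme as the short-vector branch above: %xmm12/%xmm13 are already
	   lane-swapped, %xmm14/%xmm15 are swapped next, and the FMA3 instructions
	   that follow accumulate the cross terms x_r*y_i and x_i*y_r into
	   %xmm4..%xmm7. */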
"vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" "vfmadd231pd %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r "addq $8 , %0 \n\t" "vfmadd231pd %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r "vfmadd231pd %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r "subq $4 , %1 \n\t" "vfmadd231pd %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r "jnz 1b \n\t" "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" "vaddpd %%xmm0, %%xmm2, %%xmm0 \n\t" "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" "vmovups %%xmm0, (%4) \n\t" "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/zdot_sse.S000066400000000000000000002033401313527062700173420ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define Y ARG4 /* rcx */ #ifndef WINDOWS_ABI #define INCY ARG5 /* r8 */ #else #define INCY %r10 #endif #include "l1param.h" PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), INCY #endif SAVEREGISTERS salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 testq N, N jle .L999 cmpq $2 * SIZE, INCX jne .L200 cmpq $2 * SIZE, INCY jne .L200 subq $-32 * SIZE, X subq $-32 * SIZE, Y testq $SIZE, X jne .L50 .L0x: testq $2 * SIZE, X je .L10 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 movsd -32 * SIZE(Y), %xmm0 pshufd $0xb1, %xmm0, %xmm1 mulps %xmm4, %xmm0 mulps %xmm4, %xmm1 addq $2 * SIZE, X addq $2 * SIZE, Y decq N ALIGN_3 .L10: testq $3 * SIZE, Y jne .L20 movq N, %rax sarq $4, %rax jle .L15 movaps -32 * SIZE(X), %xmm4 movaps -28 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm8 movaps -28 * SIZE(Y), %xmm9 movaps -24 * SIZE(X), %xmm6 movaps -20 * SIZE(X), %xmm7 movaps -24 * SIZE(Y), %xmm10 movaps -20 * SIZE(Y), %xmm11 decq %rax jle .L12 ALIGN_3 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movaps -16 * SIZE(Y), %xmm8 mulps %xmm4, %xmm12 movaps -16 * SIZE(X), %xmm4 addps %xmm12, %xmm1 pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movaps -12 * SIZE(Y), %xmm9 mulps %xmm5, %xmm12 movaps -12 * SIZE(X), %xmm5 addps %xmm12, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movaps -8 * SIZE(Y), %xmm10 mulps %xmm6, %xmm12 movaps -8 * SIZE(X), %xmm6 addps %xmm12, %xmm1 pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movaps -4 * SIZE(Y), %xmm11 mulps %xmm7, %xmm12 movaps -4 * SIZE(X), %xmm7 addps %xmm12, %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movaps 0 * SIZE(Y), %xmm8 mulps %xmm4, %xmm12 movaps 0 * SIZE(X), %xmm4 addps %xmm12, %xmm1 pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movaps 4 * SIZE(Y), %xmm9 mulps %xmm5, %xmm12 movaps 4 * SIZE(X), %xmm5 addps %xmm12, %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movaps 8 * SIZE(Y), %xmm10 mulps %xmm6, %xmm12 movaps 8 * SIZE(X), %xmm6 addps %xmm12, %xmm1 pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movaps 12 * SIZE(Y), %xmm11 mulps %xmm7, %xmm12 movaps 12 * SIZE(X), %xmm7 addps %xmm12, %xmm3 subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L11 ALIGN_3 .L12: pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movaps -16 * SIZE(Y), %xmm8 mulps %xmm4, %xmm12 movaps -16 * SIZE(X), %xmm4 addps %xmm12, %xmm1 pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movaps -12 * SIZE(Y), %xmm9 mulps %xmm5, %xmm12 movaps -12 * SIZE(X), %xmm5 addps %xmm12, %xmm3 pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movaps -8 * SIZE(Y), %xmm10 mulps %xmm6, %xmm12 movaps -8 * SIZE(X), %xmm6 addps %xmm12, %xmm1 pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movaps -4 * SIZE(Y), %xmm11 mulps %xmm7, %xmm12 movaps -4 * SIZE(X), %xmm7 
addps %xmm12, %xmm3 pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 addps %xmm12, %xmm3 pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 mulps %xmm6, %xmm12 addps %xmm12, %xmm1 pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 mulps %xmm7, %xmm12 addps %xmm12, %xmm3 subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L15: testq $8, N jle .L16 movaps -32 * SIZE(X), %xmm4 movaps -32 * SIZE(Y), %xmm8 pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 movaps -28 * SIZE(X), %xmm5 movaps -28 * SIZE(Y), %xmm9 pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 addps %xmm12, %xmm3 movaps -24 * SIZE(X), %xmm6 movaps -24 * SIZE(Y), %xmm10 pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 mulps %xmm6, %xmm12 addps %xmm12, %xmm1 movaps -20 * SIZE(X), %xmm7 movaps -20 * SIZE(Y), %xmm11 pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 mulps %xmm7, %xmm12 addps %xmm12, %xmm3 addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L16: testq $4, N jle .L17 movaps -32 * SIZE(X), %xmm4 movaps -32 * SIZE(Y), %xmm8 movaps -28 * SIZE(X), %xmm5 movaps -28 * SIZE(Y), %xmm9 pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 addps %xmm12, %xmm3 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L17: testq $2, N jle .L18 movaps -32 * SIZE(X), %xmm4 movaps -32 * SIZE(Y), %xmm8 pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L18: testq $1, N jle .L98 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(Y), %xmm8 pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 jmp .L98 ALIGN_3 .L20: #ifdef ALIGNED_ACCESS testq $2 * SIZE, Y jne .L30 movaps -33 * SIZE(Y), %xmm8 addq $3 * SIZE, Y shufps $0xb1, %xmm1, %xmm1 movq N, %rax sarq $4, %rax jle .L25 movaps -32 * SIZE(X), %xmm4 movaps -32 * SIZE(Y), %xmm9 movaps -28 * SIZE(X), %xmm5 movaps -28 * SIZE(Y), %xmm10 movaps -24 * SIZE(X), %xmm6 movaps -24 * SIZE(Y), %xmm11 movaps -20 * SIZE(X), %xmm7 decq %rax jle .L22 ALIGN_3 .L21: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(X), %xmm4 mulps %xmm8, %xmm12 movaps -20 * SIZE(Y), %xmm8 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(X), %xmm5 mulps %xmm9, %xmm12 movaps -16 * SIZE(Y), %xmm9 addps %xmm12, %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm11, %xmm10 pshufd $0xb1, %xmm6, %xmm12 shufps $0x39, %xmm10, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movaps -8 * SIZE(X), %xmm6 mulps %xmm10, %xmm12 movaps -12 * SIZE(Y), %xmm10 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0xb1, %xmm7, %xmm12 shufps $0x39, %xmm11, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 movaps -4 * SIZE(X), %xmm7 mulps %xmm11, %xmm12 movaps -8 * SIZE(Y), %xmm11 addps %xmm12, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) 
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 mulps %xmm8, %xmm12 movaps -4 * SIZE(Y), %xmm8 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 movaps 4 * SIZE(X), %xmm5 mulps %xmm9, %xmm12 movaps 0 * SIZE(Y), %xmm9 addps %xmm12, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm11, %xmm10 pshufd $0xb1, %xmm6, %xmm12 shufps $0x39, %xmm10, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movaps 8 * SIZE(X), %xmm6 mulps %xmm10, %xmm12 movaps 4 * SIZE(Y), %xmm10 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0xb1, %xmm7, %xmm12 shufps $0x39, %xmm11, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 movaps 12 * SIZE(X), %xmm7 mulps %xmm11, %xmm12 movaps 8 * SIZE(Y), %xmm11 addps %xmm12, %xmm1 subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L22: movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(X), %xmm4 mulps %xmm8, %xmm12 movaps -20 * SIZE(Y), %xmm8 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(X), %xmm5 mulps %xmm9, %xmm12 movaps -16 * SIZE(Y), %xmm9 addps %xmm12, %xmm1 movss %xmm11, %xmm10 pshufd $0xb1, %xmm6, %xmm12 shufps $0x39, %xmm10, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movaps -8 * SIZE(X), %xmm6 mulps %xmm10, %xmm12 movaps -12 * SIZE(Y), %xmm10 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0xb1, %xmm7, %xmm12 shufps $0x39, %xmm11, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 movaps -4 * SIZE(X), %xmm7 mulps %xmm11, %xmm12 movaps -8 * SIZE(Y), %xmm11 addps %xmm12, %xmm1 movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 movaps -4 * SIZE(Y), %xmm8 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 mulps %xmm9, %xmm12 addps %xmm12, %xmm1 movss %xmm11, %xmm10 pshufd $0xb1, %xmm6, %xmm12 shufps $0x39, %xmm10, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 mulps %xmm10, %xmm12 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0xb1, %xmm7, %xmm12 shufps $0x39, %xmm11, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 mulps %xmm11, %xmm12 addps %xmm12, %xmm1 subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L25: testq $8, N jle .L26 movaps -32 * SIZE(X), %xmm4 movaps -32 * SIZE(Y), %xmm9 movaps -28 * SIZE(X), %xmm5 movaps -28 * SIZE(Y), %xmm10 movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 movaps -24 * SIZE(X), %xmm6 movaps -24 * SIZE(Y), %xmm11 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 mulps %xmm9, %xmm12 addps %xmm12, %xmm1 movaps -20 * SIZE(X), %xmm7 movaps -20 * SIZE(Y), %xmm8 movss %xmm11, %xmm10 pshufd $0xb1, %xmm6, %xmm12 shufps $0x39, %xmm10, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 mulps %xmm10, %xmm12 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0xb1, %xmm7, %xmm12 shufps $0x39, %xmm11, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 mulps %xmm11, %xmm12 addps %xmm12, %xmm1 addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L26: testq $4, N jle .L27 movaps -32 * 
SIZE(X), %xmm4 movaps -32 * SIZE(Y), %xmm9 movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 movaps -28 * SIZE(X), %xmm5 movaps -28 * SIZE(Y), %xmm10 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 mulps %xmm9, %xmm12 addps %xmm12, %xmm1 movaps %xmm10, %xmm8 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L27: testq $2, N jle .L28 movaps -32 * SIZE(X), %xmm4 movaps -32 * SIZE(Y), %xmm9 movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 movaps %xmm9, %xmm8 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L28: testq $1, N jle .L29 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 pshufd $0xb1, %xmm4, %xmm12 shufps $0x59, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 ALIGN_3 .L29: shufps $0xb1, %xmm1, %xmm1 shufps $0xb1, %xmm3, %xmm3 jmp .L98 ALIGN_3 .L30: testq $SIZE, Y jne .L40 #endif movq N, %rax sarq $4, %rax jle .L35 movaps -32 * SIZE(X), %xmm4 movsd -32 * SIZE(Y), %xmm8 movhps -30 * SIZE(Y), %xmm8 movaps -28 * SIZE(X), %xmm5 movsd -28 * SIZE(Y), %xmm9 movhps -26 * SIZE(Y), %xmm9 movaps -24 * SIZE(X), %xmm6 movsd -24 * SIZE(Y), %xmm10 movhps -22 * SIZE(Y), %xmm10 movaps -20 * SIZE(X), %xmm7 movsd -20 * SIZE(Y), %xmm11 movhps -18 * SIZE(Y), %xmm11 decq %rax jle .L32 ALIGN_3 .L31: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movsd -16 * SIZE(Y), %xmm8 movhps -14 * SIZE(Y), %xmm8 mulps %xmm4, %xmm12 movaps -16 * SIZE(X), %xmm4 addps %xmm12, %xmm1 pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movsd -12 * SIZE(Y), %xmm9 movhps -10 * SIZE(Y), %xmm9 mulps %xmm5, %xmm12 movaps -12 * SIZE(X), %xmm5 addps %xmm12, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movsd -8 * SIZE(Y), %xmm10 movhps -6 * SIZE(Y), %xmm10 mulps %xmm6, %xmm12 movaps -8 * SIZE(X), %xmm6 addps %xmm12, %xmm1 pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movsd -4 * SIZE(Y), %xmm11 movhps -2 * SIZE(Y), %xmm11 mulps %xmm7, %xmm12 movaps -4 * SIZE(X), %xmm7 addps %xmm12, %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movsd 0 * SIZE(Y), %xmm8 movhps 2 * SIZE(Y), %xmm8 mulps %xmm4, %xmm12 movaps 0 * SIZE(X), %xmm4 addps %xmm12, %xmm1 pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movsd 4 * SIZE(Y), %xmm9 movhps 6 * SIZE(Y), %xmm9 mulps %xmm5, %xmm12 movaps 4 * SIZE(X), %xmm5 addps %xmm12, %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movsd 8 * SIZE(Y), %xmm10 movhps 10 * SIZE(Y), %xmm10 mulps %xmm6, %xmm12 movaps 8 * SIZE(X), %xmm6 addps %xmm12, %xmm1 pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movsd 12 * SIZE(Y), %xmm11 movhps 14 * SIZE(Y), %xmm11 mulps %xmm7, %xmm12 movaps 12 * SIZE(X), %xmm7 addps %xmm12, %xmm3 subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L31 ALIGN_3 .L32: pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movsd -16 * SIZE(Y), %xmm8 movhps -14 * SIZE(Y), %xmm8 mulps 
%xmm4, %xmm12 movaps -16 * SIZE(X), %xmm4 addps %xmm12, %xmm1 pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movsd -12 * SIZE(Y), %xmm9 movhps -10 * SIZE(Y), %xmm9 mulps %xmm5, %xmm12 movaps -12 * SIZE(X), %xmm5 addps %xmm12, %xmm3 pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movsd -8 * SIZE(Y), %xmm10 movhps -6 * SIZE(Y), %xmm10 mulps %xmm6, %xmm12 movaps -8 * SIZE(X), %xmm6 addps %xmm12, %xmm1 pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movsd -4 * SIZE(Y), %xmm11 movhps -2 * SIZE(Y), %xmm11 mulps %xmm7, %xmm12 movaps -4 * SIZE(X), %xmm7 addps %xmm12, %xmm3 pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 addps %xmm12, %xmm3 pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 mulps %xmm6, %xmm12 addps %xmm12, %xmm1 pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 mulps %xmm7, %xmm12 addps %xmm12, %xmm3 subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L35: testq $8, N jle .L36 movaps -32 * SIZE(X), %xmm4 movsd -32 * SIZE(Y), %xmm8 movhps -30 * SIZE(Y), %xmm8 pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 movaps -28 * SIZE(X), %xmm5 movsd -28 * SIZE(Y), %xmm9 movhps -26 * SIZE(Y), %xmm9 pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 addps %xmm12, %xmm3 movaps -24 * SIZE(X), %xmm6 movsd -24 * SIZE(Y), %xmm10 movhps -22 * SIZE(Y), %xmm10 pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 mulps %xmm6, %xmm12 addps %xmm12, %xmm1 movaps -20 * SIZE(X), %xmm7 movsd -20 * SIZE(Y), %xmm11 movhps -18 * SIZE(Y), %xmm11 pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 mulps %xmm7, %xmm12 addps %xmm12, %xmm3 addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L36: testq $4, N jle .L37 movaps -32 * SIZE(X), %xmm4 movsd -32 * SIZE(Y), %xmm8 movhps -30 * SIZE(Y), %xmm8 pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 movaps -28 * SIZE(X), %xmm5 movsd -28 * SIZE(Y), %xmm9 movhps -26 * SIZE(Y), %xmm9 pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 addps %xmm12, %xmm3 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L37: testq $2, N jle .L38 movaps -32 * SIZE(X), %xmm4 movsd -32 * SIZE(Y), %xmm8 movhps -30 * SIZE(Y), %xmm8 pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L38: testq $1, N jle .L98 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(Y), %xmm8 pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 jmp .L98 ALIGN_3 #ifdef ALIGNED_ACCESS .L40: movaps -35 * SIZE(Y), %xmm8 addq $1 * SIZE, Y shufps $0xb1, %xmm1, %xmm1 movq N, %rax sarq $4, %rax jle .L45 movaps -32 * SIZE(X), %xmm4 movaps -32 * SIZE(Y), %xmm9 movaps -28 * SIZE(X), %xmm5 movaps -28 * SIZE(Y), %xmm10 movaps -24 * SIZE(X), %xmm6 movaps -24 * SIZE(Y), %xmm11 movaps -20 * SIZE(X), %xmm7 decq %rax jle .L42 ALIGN_3 .L41: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(X), %xmm4 mulps %xmm8, %xmm12 movaps -20 * 
SIZE(Y), %xmm8 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(X), %xmm5 mulps %xmm9, %xmm12 movaps -16 * SIZE(Y), %xmm9 addps %xmm12, %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm11, %xmm10 pshufd $0xb1, %xmm6, %xmm12 shufps $0x93, %xmm11, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movaps -8 * SIZE(X), %xmm6 mulps %xmm10, %xmm12 movaps -12 * SIZE(Y), %xmm10 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0xb1, %xmm7, %xmm12 shufps $0x93, %xmm8, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 movaps -4 * SIZE(X), %xmm7 mulps %xmm11, %xmm12 movaps -8 * SIZE(Y), %xmm11 addps %xmm12, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movaps 0 * SIZE(X), %xmm4 mulps %xmm8, %xmm12 movaps -4 * SIZE(Y), %xmm8 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 movaps 4 * SIZE(X), %xmm5 mulps %xmm9, %xmm12 movaps 0 * SIZE(Y), %xmm9 addps %xmm12, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm11, %xmm10 pshufd $0xb1, %xmm6, %xmm12 shufps $0x93, %xmm11, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movaps 8 * SIZE(X), %xmm6 mulps %xmm10, %xmm12 movaps 4 * SIZE(Y), %xmm10 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0xb1, %xmm7, %xmm12 shufps $0x93, %xmm8, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 movaps 12 * SIZE(X), %xmm7 mulps %xmm11, %xmm12 movaps 8 * SIZE(Y), %xmm11 addps %xmm12, %xmm1 subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L41 ALIGN_3 .L42: movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(X), %xmm4 mulps %xmm8, %xmm12 movaps -20 * SIZE(Y), %xmm8 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(X), %xmm5 mulps %xmm9, %xmm12 movaps -16 * SIZE(Y), %xmm9 addps %xmm12, %xmm1 movss %xmm11, %xmm10 pshufd $0xb1, %xmm6, %xmm12 shufps $0x93, %xmm11, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movaps -8 * SIZE(X), %xmm6 mulps %xmm10, %xmm12 movaps -12 * SIZE(Y), %xmm10 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0xb1, %xmm7, %xmm12 shufps $0x93, %xmm8, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 movaps -4 * SIZE(X), %xmm7 mulps %xmm11, %xmm12 movaps -8 * SIZE(Y), %xmm11 addps %xmm12, %xmm1 movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 movaps -4 * SIZE(Y), %xmm8 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 mulps %xmm9, %xmm12 addps %xmm12, %xmm1 movss %xmm11, %xmm10 pshufd $0xb1, %xmm6, %xmm12 shufps $0x93, %xmm11, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 mulps %xmm10, %xmm12 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0xb1, %xmm7, %xmm12 shufps $0x93, %xmm8, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 mulps %xmm11, %xmm12 addps %xmm12, %xmm1 subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L45: testq $8, N jle .L46 movaps -32 * SIZE(X), %xmm4 movaps -32 * SIZE(Y), %xmm9 movaps -28 * SIZE(X), %xmm5 movaps -28 * SIZE(Y), %xmm10 movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 
shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 movaps -24 * SIZE(X), %xmm6 movaps -24 * SIZE(Y), %xmm11 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 mulps %xmm9, %xmm12 addps %xmm12, %xmm1 movaps -20 * SIZE(X), %xmm7 movaps -20 * SIZE(Y), %xmm8 movss %xmm11, %xmm10 pshufd $0xb1, %xmm6, %xmm12 shufps $0x93, %xmm11, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 mulps %xmm10, %xmm12 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0xb1, %xmm7, %xmm12 shufps $0x93, %xmm8, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 mulps %xmm11, %xmm12 addps %xmm12, %xmm1 addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L46: testq $4, N jle .L47 movaps -32 * SIZE(X), %xmm4 movaps -32 * SIZE(Y), %xmm9 movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 movaps -28 * SIZE(X), %xmm5 movaps -28 * SIZE(Y), %xmm10 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 mulps %xmm9, %xmm12 addps %xmm12, %xmm1 movaps %xmm10, %xmm8 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L47: testq $2, N jle .L48 movaps -32 * SIZE(X), %xmm4 movaps -32 * SIZE(Y), %xmm9 movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 movaps %xmm9, %xmm8 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L48: testq $1, N jle .L49 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 movss -32 * SIZE(Y), %xmm9 movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x03, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 ALIGN_3 .L49: shufps $0xb1, %xmm1, %xmm1 shufps $0xb1, %xmm3, %xmm3 jmp .L98 ALIGN_3 #endif .L50: testq $SIZE, Y jne .L70 #ifdef ALIGNED_ACCESS testq $2 * SIZE, Y je .L50x #ifdef movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(X), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(Y), %xmm4 pshufd $0xb1, %xmm0, %xmm1 mulps %xmm4, %xmm0 mulps %xmm4, %xmm1 addq $2 * SIZE, X addq $2 * SIZE, Y decq N ALIGN_3 .L50x: testq $2 * SIZE, X jne .L60 movaps -33 * SIZE(X), %xmm8 addq $3 * SIZE, X shufps $0xb1, %xmm1, %xmm1 movq N, %rax sarq $4, %rax jle .L55 movaps -32 * SIZE(Y), %xmm4 movaps -32 * SIZE(X), %xmm9 movaps -28 * SIZE(Y), %xmm5 movaps -28 * SIZE(X), %xmm10 movaps -24 * SIZE(Y), %xmm6 movaps -24 * SIZE(X), %xmm11 movaps -20 * SIZE(Y), %xmm7 decq %rax jle .L52 ALIGN_3 .L51: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(Y), %xmm4 mulps %xmm8, %xmm12 movaps -20 * SIZE(X), %xmm8 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(Y), %xmm5 mulps %xmm9, %xmm12 movaps -16 * SIZE(X), %xmm9 addps %xmm12, %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm11, %xmm10 pshufd $0xb1, %xmm6, %xmm12 shufps $0x39, %xmm10, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movaps -8 * SIZE(Y), %xmm6 mulps %xmm10, %xmm12 movaps -12 * SIZE(X), %xmm10 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0xb1, %xmm7, %xmm12 shufps $0x39, %xmm11, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 movaps -4 * SIZE(Y), %xmm7 mulps %xmm11, %xmm12 movaps -8 * 
SIZE(X), %xmm11 addps %xmm12, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movaps 0 * SIZE(Y), %xmm4 mulps %xmm8, %xmm12 movaps -4 * SIZE(X), %xmm8 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 movaps 4 * SIZE(Y), %xmm5 mulps %xmm9, %xmm12 movaps 0 * SIZE(X), %xmm9 addps %xmm12, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm11, %xmm10 pshufd $0xb1, %xmm6, %xmm12 shufps $0x39, %xmm10, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movaps 8 * SIZE(Y), %xmm6 mulps %xmm10, %xmm12 movaps 4 * SIZE(X), %xmm10 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0xb1, %xmm7, %xmm12 shufps $0x39, %xmm11, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 movaps 12 * SIZE(Y), %xmm7 mulps %xmm11, %xmm12 movaps 8 * SIZE(X), %xmm11 addps %xmm12, %xmm1 subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L51 ALIGN_3 .L52: movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(Y), %xmm4 mulps %xmm8, %xmm12 movaps -20 * SIZE(X), %xmm8 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(Y), %xmm5 mulps %xmm9, %xmm12 movaps -16 * SIZE(X), %xmm9 addps %xmm12, %xmm1 movss %xmm11, %xmm10 pshufd $0xb1, %xmm6, %xmm12 shufps $0x39, %xmm10, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movaps -8 * SIZE(Y), %xmm6 mulps %xmm10, %xmm12 movaps -12 * SIZE(X), %xmm10 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0xb1, %xmm7, %xmm12 shufps $0x39, %xmm11, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 movaps -4 * SIZE(Y), %xmm7 mulps %xmm11, %xmm12 movaps -8 * SIZE(X), %xmm11 addps %xmm12, %xmm1 movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 movaps -4 * SIZE(X), %xmm8 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 mulps %xmm9, %xmm12 addps %xmm12, %xmm1 movss %xmm11, %xmm10 pshufd $0xb1, %xmm6, %xmm12 shufps $0x39, %xmm10, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 mulps %xmm10, %xmm12 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0xb1, %xmm7, %xmm12 shufps $0x39, %xmm11, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 mulps %xmm11, %xmm12 addps %xmm12, %xmm1 subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L55: testq $8, N jle .L56 movaps -32 * SIZE(Y), %xmm4 movaps -32 * SIZE(X), %xmm9 movaps -28 * SIZE(Y), %xmm5 movaps -28 * SIZE(X), %xmm10 movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 movaps -24 * SIZE(Y), %xmm6 movaps -24 * SIZE(X), %xmm11 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 mulps %xmm9, %xmm12 addps %xmm12, %xmm1 movaps -20 * SIZE(Y), %xmm7 movaps -20 * SIZE(X), %xmm8 movss %xmm11, %xmm10 pshufd $0xb1, %xmm6, %xmm12 shufps $0x39, %xmm10, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 mulps %xmm10, %xmm12 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0xb1, %xmm7, %xmm12 shufps $0x39, %xmm11, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 mulps %xmm11, %xmm12 addps %xmm12, %xmm1 addq $16 * 
SIZE, X addq $16 * SIZE, Y ALIGN_3 .L56: testq $4, N jle .L57 movaps -32 * SIZE(Y), %xmm4 movaps -32 * SIZE(X), %xmm9 movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 movaps -28 * SIZE(Y), %xmm5 movaps -28 * SIZE(X), %xmm10 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 mulps %xmm9, %xmm12 addps %xmm12, %xmm1 movaps %xmm10, %xmm8 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L57: testq $2, N jle .L58 movaps -32 * SIZE(Y), %xmm4 movaps -32 * SIZE(X), %xmm9 movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 movaps %xmm9, %xmm8 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L58: testq $1, N jle .L98 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(Y), %xmm4 pshufd $0xb1, %xmm4, %xmm12 shufps $0xa9, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 jmp .L98 ALIGN_3 .L60: movaps -35 * SIZE(X), %xmm8 addq $1 * SIZE, X shufps $0xb1, %xmm1, %xmm1 movq N, %rax sarq $4, %rax jle .L65 movaps -32 * SIZE(Y), %xmm4 movaps -32 * SIZE(X), %xmm9 movaps -28 * SIZE(Y), %xmm5 movaps -28 * SIZE(X), %xmm10 movaps -24 * SIZE(Y), %xmm6 movaps -24 * SIZE(X), %xmm11 movaps -20 * SIZE(Y), %xmm7 decq %rax jle .L62 ALIGN_3 .L61: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(Y), %xmm4 mulps %xmm8, %xmm12 movaps -20 * SIZE(X), %xmm8 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(Y), %xmm5 mulps %xmm9, %xmm12 movaps -16 * SIZE(X), %xmm9 addps %xmm12, %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm11, %xmm10 pshufd $0xb1, %xmm6, %xmm12 shufps $0x93, %xmm11, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movaps -8 * SIZE(Y), %xmm6 mulps %xmm10, %xmm12 movaps -12 * SIZE(X), %xmm10 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0xb1, %xmm7, %xmm12 shufps $0x93, %xmm8, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 movaps -4 * SIZE(Y), %xmm7 mulps %xmm11, %xmm12 movaps -8 * SIZE(X), %xmm11 addps %xmm12, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movaps 0 * SIZE(Y), %xmm4 mulps %xmm8, %xmm12 movaps -4 * SIZE(X), %xmm8 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 movaps 4 * SIZE(Y), %xmm5 mulps %xmm9, %xmm12 movaps 0 * SIZE(X), %xmm9 addps %xmm12, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm11, %xmm10 pshufd $0xb1, %xmm6, %xmm12 shufps $0x93, %xmm11, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movaps 8 * SIZE(Y), %xmm6 mulps %xmm10, %xmm12 movaps 4 * SIZE(X), %xmm10 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0xb1, %xmm7, %xmm12 shufps $0x93, %xmm8, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 movaps 12 * SIZE(Y), %xmm7 mulps %xmm11, %xmm12 movaps 8 * SIZE(X), %xmm11 addps %xmm12, %xmm1 subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L61 ALIGN_3 .L62: movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 
shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(Y), %xmm4 mulps %xmm8, %xmm12 movaps -20 * SIZE(X), %xmm8 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(Y), %xmm5 mulps %xmm9, %xmm12 movaps -16 * SIZE(X), %xmm9 addps %xmm12, %xmm1 movss %xmm11, %xmm10 pshufd $0xb1, %xmm6, %xmm12 shufps $0x93, %xmm11, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movaps -8 * SIZE(Y), %xmm6 mulps %xmm10, %xmm12 movaps -12 * SIZE(X), %xmm10 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0xb1, %xmm7, %xmm12 shufps $0x93, %xmm8, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 movaps -4 * SIZE(Y), %xmm7 mulps %xmm11, %xmm12 movaps -8 * SIZE(X), %xmm11 addps %xmm12, %xmm1 movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 movaps -4 * SIZE(X), %xmm8 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 mulps %xmm9, %xmm12 addps %xmm12, %xmm1 movss %xmm11, %xmm10 pshufd $0xb1, %xmm6, %xmm12 shufps $0x93, %xmm11, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 mulps %xmm10, %xmm12 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0xb1, %xmm7, %xmm12 shufps $0x93, %xmm8, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 mulps %xmm11, %xmm12 addps %xmm12, %xmm1 subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L65: testq $8, N jle .L66 movaps -32 * SIZE(Y), %xmm4 movaps -32 * SIZE(X), %xmm9 movaps -28 * SIZE(Y), %xmm5 movaps -28 * SIZE(X), %xmm10 movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 movaps -24 * SIZE(Y), %xmm6 movaps -24 * SIZE(X), %xmm11 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 mulps %xmm9, %xmm12 addps %xmm12, %xmm1 movaps -20 * SIZE(Y), %xmm7 movaps -20 * SIZE(X), %xmm8 movss %xmm11, %xmm10 pshufd $0xb1, %xmm6, %xmm12 shufps $0x93, %xmm11, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 mulps %xmm10, %xmm12 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0xb1, %xmm7, %xmm12 shufps $0x93, %xmm8, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 mulps %xmm11, %xmm12 addps %xmm12, %xmm1 addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L66: testq $4, N jle .L67 movaps -32 * SIZE(Y), %xmm4 movaps -32 * SIZE(X), %xmm9 movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 movaps -28 * SIZE(Y), %xmm5 movaps -28 * SIZE(X), %xmm10 movss %xmm10, %xmm9 pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 mulps %xmm9, %xmm12 addps %xmm12, %xmm1 movaps %xmm10, %xmm8 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L67: testq $2, N jle .L68 movaps -32 * SIZE(Y), %xmm4 movaps -32 * SIZE(X), %xmm9 movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 movaps %xmm9, %xmm8 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L68: testq $1, N jle .L98 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(Y), %xmm4 movss -32 * SIZE(X), %xmm9 movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 shufps $0x03, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 jmp .L98 ALIGN_3 #else testq $2 * SIZE, Y je .L50x #ifdef 
movsd xorps %xmm0, %xmm0 #endif movsd -32 * SIZE(Y), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 pshufd $0xb1, %xmm0, %xmm1 mulps %xmm4, %xmm0 mulps %xmm4, %xmm1 addq $2 * SIZE, X addq $2 * SIZE, Y decq N ALIGN_3 .L50x: movq N, %rax sarq $4, %rax jle .L55 movaps -32 * SIZE(Y), %xmm4 movlps -32 * SIZE(X), %xmm8 movhps -30 * SIZE(X), %xmm8 movaps -28 * SIZE(Y), %xmm5 movlps -28 * SIZE(X), %xmm9 movhps -26 * SIZE(X), %xmm9 movaps -24 * SIZE(Y), %xmm6 movlps -24 * SIZE(X), %xmm10 movhps -22 * SIZE(X), %xmm10 movaps -20 * SIZE(Y), %xmm7 movlps -20 * SIZE(X), %xmm11 movhps -18 * SIZE(X), %xmm11 decq %rax jle .L52 ALIGN_3 .L51: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif pshufd $0xb1, %xmm4, %xmm12 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(Y), %xmm4 mulps %xmm8, %xmm12 movlps -16 * SIZE(X), %xmm8 movhps -14 * SIZE(X), %xmm8 addps %xmm12, %xmm1 pshufd $0xb1, %xmm5, %xmm12 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(Y), %xmm5 mulps %xmm9, %xmm12 movlps -12 * SIZE(X), %xmm9 movhps -10 * SIZE(X), %xmm9 addps %xmm12, %xmm1 pshufd $0xb1, %xmm6, %xmm12 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movaps -8 * SIZE(Y), %xmm6 mulps %xmm10, %xmm12 movlps -8 * SIZE(X), %xmm10 movhps -6 * SIZE(X), %xmm10 addps %xmm12, %xmm1 pshufd $0xb1, %xmm7, %xmm12 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 movaps -4 * SIZE(Y), %xmm7 mulps %xmm11, %xmm12 movlps -4 * SIZE(X), %xmm11 movhps -2 * SIZE(X), %xmm11 addps %xmm12, %xmm1 pshufd $0xb1, %xmm4, %xmm12 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movaps 0 * SIZE(Y), %xmm4 mulps %xmm8, %xmm12 movlps 0 * SIZE(X), %xmm8 movhps 2 * SIZE(X), %xmm8 addps %xmm12, %xmm1 pshufd $0xb1, %xmm5, %xmm12 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 movaps 4 * SIZE(Y), %xmm5 mulps %xmm9, %xmm12 movlps 4 * SIZE(X), %xmm9 movhps 6 * SIZE(X), %xmm9 addps %xmm12, %xmm1 pshufd $0xb1, %xmm6, %xmm12 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movaps 8 * SIZE(Y), %xmm6 mulps %xmm10, %xmm12 movlps 8 * SIZE(X), %xmm10 movhps 10 * SIZE(X), %xmm10 addps %xmm12, %xmm1 pshufd $0xb1, %xmm7, %xmm12 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 movaps 12 * SIZE(Y), %xmm7 mulps %xmm11, %xmm12 movlps 12 * SIZE(X), %xmm11 movhps 14 * SIZE(X), %xmm11 addps %xmm12, %xmm1 subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L51 ALIGN_3 .L52: pshufd $0xb1, %xmm4, %xmm12 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(Y), %xmm4 mulps %xmm8, %xmm12 movlps -16 * SIZE(X), %xmm8 movhps -14 * SIZE(X), %xmm8 addps %xmm12, %xmm1 pshufd $0xb1, %xmm5, %xmm12 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(Y), %xmm5 mulps %xmm9, %xmm12 movlps -12 * SIZE(X), %xmm9 movhps -10 * SIZE(X), %xmm9 addps %xmm12, %xmm1 pshufd $0xb1, %xmm6, %xmm12 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movaps -8 * SIZE(Y), %xmm6 mulps %xmm10, %xmm12 movlps -8 * SIZE(X), %xmm10 movhps -6 * SIZE(X), %xmm10 addps %xmm12, %xmm1 pshufd $0xb1, %xmm7, %xmm12 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 movaps -4 * SIZE(Y), %xmm7 mulps %xmm11, %xmm12 movlps -4 * SIZE(X), %xmm11 movhps -2 * SIZE(X), %xmm11 addps %xmm12, %xmm1 pshufd $0xb1, %xmm4, %xmm12 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 pshufd $0xb1, %xmm5, %xmm12 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 mulps %xmm9, %xmm12 addps %xmm12, %xmm1 pshufd $0xb1, 
%xmm6, %xmm12 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 mulps %xmm10, %xmm12 addps %xmm12, %xmm1 pshufd $0xb1, %xmm7, %xmm12 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 mulps %xmm11, %xmm12 addps %xmm12, %xmm1 subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L55: testq $8, N jle .L56 movaps -32 * SIZE(Y), %xmm4 movlps -32 * SIZE(X), %xmm8 movhps -30 * SIZE(X), %xmm8 movaps -28 * SIZE(Y), %xmm5 movlps -28 * SIZE(X), %xmm9 movhps -26 * SIZE(X), %xmm9 pshufd $0xb1, %xmm4, %xmm12 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 movaps -24 * SIZE(Y), %xmm6 movlps -24 * SIZE(X), %xmm10 movhps -22 * SIZE(X), %xmm10 pshufd $0xb1, %xmm5, %xmm12 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 mulps %xmm9, %xmm12 addps %xmm12, %xmm1 movaps -20 * SIZE(Y), %xmm7 movlps -20 * SIZE(X), %xmm11 movhps -18 * SIZE(X), %xmm11 pshufd $0xb1, %xmm6, %xmm12 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 mulps %xmm10, %xmm12 addps %xmm12, %xmm1 pshufd $0xb1, %xmm7, %xmm12 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 mulps %xmm11, %xmm12 addps %xmm12, %xmm1 addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L56: testq $4, N jle .L57 movaps -32 * SIZE(Y), %xmm4 movlps -32 * SIZE(X), %xmm8 movhps -30 * SIZE(X), %xmm8 pshufd $0xb1, %xmm4, %xmm12 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 movaps -28 * SIZE(Y), %xmm5 movlps -28 * SIZE(X), %xmm9 movhps -26 * SIZE(X), %xmm9 pshufd $0xb1, %xmm5, %xmm12 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 mulps %xmm9, %xmm12 addps %xmm12, %xmm1 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L57: testq $2, N jle .L58 movaps -32 * SIZE(Y), %xmm4 movlps -32 * SIZE(X), %xmm8 movhps -30 * SIZE(X), %xmm8 pshufd $0xb1, %xmm4, %xmm12 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 movaps %xmm9, %xmm8 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L58: testq $1, N jle .L98 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(Y), %xmm4 #ifdef movsd xorps %xmm8, %xmm8 #endif movsd -32 * SIZE(X), %xmm8 pshufd $0xb1, %xmm4, %xmm12 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 jmp .L98 ALIGN_3 #endif .L70: testq $2 * SIZE, Y je .L70x #ifdef movsd xorps %xmm4, %xmm4 #endif movsd -32 * SIZE(X), %xmm4 addq $2 * SIZE, X #ifdef movsd xorps %xmm1, %xmm1 #endif movsd -32 * SIZE(Y), %xmm1 addq $2 * SIZE, Y pshufd $0xb1, %xmm1, %xmm0 shufps $0xb1, %xmm4, %xmm4 mulps %xmm4, %xmm0 mulps %xmm4, %xmm1 decq N ALIGN_3 .L70x: testq $2 * SIZE, X jne .L80 movaps -33 * SIZE(X), %xmm4 addq $3 * SIZE, X movaps -33 * SIZE(Y), %xmm8 addq $3 * SIZE, Y movq N, %rax sarq $4, %rax jle .L75 movaps -32 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm9 movaps -28 * SIZE(X), %xmm6 movaps -28 * SIZE(Y), %xmm10 movaps -24 * SIZE(X), %xmm7 movaps -24 * SIZE(Y), %xmm11 decq %rax jle .L72 ALIGN_3 .L71: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm9, %xmm8 pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movaps -20 * SIZE(Y), %xmm8 mulps %xmm4, %xmm12 movaps -20 * SIZE(X), %xmm4 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movaps -16 * SIZE(Y), %xmm9 mulps %xmm5, %xmm12 movaps -16 * SIZE(X), %xmm5 addps %xmm12, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm11, %xmm10 pshufd $0x1b, %xmm10, %xmm12 movss %xmm7, %xmm6 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movaps -12 * SIZE(Y), %xmm10 mulps %xmm6, %xmm12 movaps -12 * SIZE(X), %xmm6 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd 
$0x1b, %xmm11, %xmm12 movss %xmm4, %xmm7 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movaps -8 * SIZE(Y), %xmm11 mulps %xmm7, %xmm12 movaps -8 * SIZE(X), %xmm7 addps %xmm12, %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm9, %xmm8 pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movaps -4 * SIZE(Y), %xmm8 mulps %xmm4, %xmm12 movaps -4 * SIZE(X), %xmm4 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movaps 0 * SIZE(Y), %xmm9 mulps %xmm5, %xmm12 movaps 0 * SIZE(X), %xmm5 addps %xmm12, %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm11, %xmm10 pshufd $0x1b, %xmm10, %xmm12 movss %xmm7, %xmm6 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movaps 4 * SIZE(Y), %xmm10 mulps %xmm6, %xmm12 movaps 4 * SIZE(X), %xmm6 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0x1b, %xmm11, %xmm12 movss %xmm4, %xmm7 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movaps 8 * SIZE(Y), %xmm11 mulps %xmm7, %xmm12 movaps 8 * SIZE(X), %xmm7 addps %xmm12, %xmm3 subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L71 ALIGN_3 .L72: movss %xmm9, %xmm8 pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movaps -20 * SIZE(Y), %xmm8 mulps %xmm4, %xmm12 movaps -20 * SIZE(X), %xmm4 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movaps -16 * SIZE(Y), %xmm9 mulps %xmm5, %xmm12 movaps -16 * SIZE(X), %xmm5 addps %xmm12, %xmm3 movss %xmm11, %xmm10 pshufd $0x1b, %xmm10, %xmm12 movss %xmm7, %xmm6 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movaps -12 * SIZE(Y), %xmm10 mulps %xmm6, %xmm12 movaps -12 * SIZE(X), %xmm6 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0x1b, %xmm11, %xmm12 movss %xmm4, %xmm7 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movaps -8 * SIZE(Y), %xmm11 mulps %xmm7, %xmm12 movaps -8 * SIZE(X), %xmm7 addps %xmm12, %xmm3 movss %xmm9, %xmm8 pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movaps -4 * SIZE(Y), %xmm8 mulps %xmm4, %xmm12 movaps -4 * SIZE(X), %xmm4 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 addps %xmm12, %xmm3 movss %xmm11, %xmm10 pshufd $0x1b, %xmm10, %xmm12 movss %xmm7, %xmm6 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 mulps %xmm6, %xmm12 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0x1b, %xmm11, %xmm12 movss %xmm4, %xmm7 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 mulps %xmm7, %xmm12 addps %xmm12, %xmm3 subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L75: testq $8, N jle .L76 movaps -32 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm9 movss %xmm9, %xmm8 pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 movaps -28 * SIZE(X), %xmm6 movaps -28 * SIZE(Y), %xmm10 movss %xmm10, %xmm9 pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 addps %xmm12, %xmm3 movaps -24 * SIZE(X), %xmm7 movaps -24 * SIZE(Y), %xmm11 movss %xmm11, %xmm10 pshufd $0x1b, %xmm10, %xmm12 movss %xmm7, %xmm6 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 mulps %xmm6, %xmm12 addps %xmm12, %xmm1 movaps -20 * SIZE(X), %xmm4 movaps -20 * SIZE(Y), %xmm8 movss %xmm8, %xmm11 pshufd $0x1b, %xmm11, %xmm12 movss %xmm4, %xmm7 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 mulps %xmm7, %xmm12 
addps %xmm12, %xmm3 addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L76: testq $4, N jle .L77 movaps -32 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm9 movaps -28 * SIZE(X), %xmm6 movaps -28 * SIZE(Y), %xmm10 movss %xmm9, %xmm8 pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 addps %xmm12, %xmm3 movaps %xmm6, %xmm4 movaps %xmm10, %xmm8 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L77: testq $2, N jle .L78 movaps -32 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm9 movss %xmm9, %xmm8 pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 movaps %xmm5, %xmm4 movaps %xmm9, %xmm8 ALIGN_3 .L78: testq $1, N jle .L79 xorps %xmm5, %xmm5 movss %xmm5, %xmm4 movss %xmm5, %xmm8 shufps $0x24, %xmm4, %xmm4 pshufd $0x18, %xmm8, %xmm12 shufps $0x24, %xmm8, %xmm8 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 ALIGN_3 .L79: shufps $0x39, %xmm0, %xmm0 shufps $0x39, %xmm1, %xmm1 shufps $0x39, %xmm2, %xmm2 shufps $0x39, %xmm3, %xmm3 jmp .L98 ALIGN_3 .L80: movsd -33 * SIZE(X), %xmm4 movhps -31 * SIZE(X), %xmm4 addq $3 * SIZE, X movaps -33 * SIZE(Y), %xmm8 addq $3 * SIZE, Y movq N, %rax sarq $4, %rax jle .L85 movsd -32 * SIZE(X), %xmm5 movhps -30 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm9 movsd -28 * SIZE(X), %xmm6 movhps -26 * SIZE(X), %xmm6 movaps -28 * SIZE(Y), %xmm10 movsd -24 * SIZE(X), %xmm7 movhps -22 * SIZE(X), %xmm7 movaps -24 * SIZE(Y), %xmm11 decq %rax jle .L82 ALIGN_3 .L81: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm9, %xmm8 pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movaps -20 * SIZE(Y), %xmm8 mulps %xmm4, %xmm12 movsd -20 * SIZE(X), %xmm4 movhps -18 * SIZE(X), %xmm4 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movaps -16 * SIZE(Y), %xmm9 mulps %xmm5, %xmm12 movsd -16 * SIZE(X), %xmm5 movhps -14 * SIZE(X), %xmm5 addps %xmm12, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movss %xmm11, %xmm10 pshufd $0x1b, %xmm10, %xmm12 movss %xmm7, %xmm6 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movaps -12 * SIZE(Y), %xmm10 mulps %xmm6, %xmm12 movsd -12 * SIZE(X), %xmm6 movhps -10 * SIZE(X), %xmm6 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0x1b, %xmm11, %xmm12 movss %xmm4, %xmm7 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movaps -8 * SIZE(Y), %xmm11 mulps %xmm7, %xmm12 movsd -8 * SIZE(X), %xmm7 movhps -6 * SIZE(X), %xmm7 addps %xmm12, %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm9, %xmm8 pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movaps -4 * SIZE(Y), %xmm8 mulps %xmm4, %xmm12 movsd -4 * SIZE(X), %xmm4 movhps -2 * SIZE(X), %xmm4 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movaps 0 * SIZE(Y), %xmm9 mulps %xmm5, %xmm12 movsd 0 * SIZE(X), %xmm5 movhps 2 * SIZE(X), %xmm5 addps %xmm12, %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movss %xmm11, %xmm10 pshufd $0x1b, %xmm10, %xmm12 movss %xmm7, %xmm6 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movaps 4 * SIZE(Y), %xmm10 mulps %xmm6, %xmm12 movsd 4 * SIZE(X), %xmm6 
movhps 6 * SIZE(X), %xmm6 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0x1b, %xmm11, %xmm12 movss %xmm4, %xmm7 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movaps 8 * SIZE(Y), %xmm11 mulps %xmm7, %xmm12 movsd 8 * SIZE(X), %xmm7 movhps 10 * SIZE(X), %xmm7 addps %xmm12, %xmm3 subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L81 ALIGN_3 .L82: movss %xmm9, %xmm8 pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movaps -20 * SIZE(Y), %xmm8 mulps %xmm4, %xmm12 movsd -20 * SIZE(X), %xmm4 movhps -18 * SIZE(X), %xmm4 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movaps -16 * SIZE(Y), %xmm9 mulps %xmm5, %xmm12 movsd -16 * SIZE(X), %xmm5 movhps -14 * SIZE(X), %xmm5 addps %xmm12, %xmm3 movss %xmm11, %xmm10 pshufd $0x1b, %xmm10, %xmm12 movss %xmm7, %xmm6 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movaps -12 * SIZE(Y), %xmm10 mulps %xmm6, %xmm12 movsd -12 * SIZE(X), %xmm6 movhps -10 * SIZE(X), %xmm6 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0x1b, %xmm11, %xmm12 movss %xmm4, %xmm7 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movaps -8 * SIZE(Y), %xmm11 mulps %xmm7, %xmm12 movsd -8 * SIZE(X), %xmm7 movhps -6 * SIZE(X), %xmm7 addps %xmm12, %xmm3 movss %xmm9, %xmm8 pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movaps -4 * SIZE(Y), %xmm8 mulps %xmm4, %xmm12 movsd -4 * SIZE(X), %xmm4 movhps -2 * SIZE(X), %xmm4 addps %xmm12, %xmm1 movss %xmm10, %xmm9 pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 addps %xmm12, %xmm3 movss %xmm11, %xmm10 pshufd $0x1b, %xmm10, %xmm12 movss %xmm7, %xmm6 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 mulps %xmm6, %xmm12 addps %xmm12, %xmm1 movss %xmm8, %xmm11 pshufd $0x1b, %xmm11, %xmm12 movss %xmm4, %xmm7 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 mulps %xmm7, %xmm12 addps %xmm12, %xmm3 subq $-32 * SIZE, X subq $-32 * SIZE, Y ALIGN_3 .L85: testq $8, N jle .L86 movsd -32 * SIZE(X), %xmm5 movhps -30 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm9 movss %xmm9, %xmm8 pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 movsd -28 * SIZE(X), %xmm6 movhps -26 * SIZE(X), %xmm6 movaps -28 * SIZE(Y), %xmm10 movss %xmm10, %xmm9 pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 addps %xmm12, %xmm3 movsd -24 * SIZE(X), %xmm7 movhps -22 * SIZE(X), %xmm7 movaps -24 * SIZE(Y), %xmm11 movss %xmm11, %xmm10 pshufd $0x1b, %xmm10, %xmm12 movss %xmm7, %xmm6 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 mulps %xmm6, %xmm12 addps %xmm12, %xmm1 movsd -20 * SIZE(X), %xmm4 movhps -18 * SIZE(X), %xmm4 movaps -20 * SIZE(Y), %xmm8 movss %xmm8, %xmm11 pshufd $0x1b, %xmm11, %xmm12 movss %xmm4, %xmm7 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 mulps %xmm7, %xmm12 addps %xmm12, %xmm3 addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L86: testq $4, N jle .L87 movsd -32 * SIZE(X), %xmm5 movhps -30 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm9 movss %xmm9, %xmm8 pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 movsd -28 * SIZE(X), %xmm6 movhps -26 * SIZE(X), %xmm6 movaps -28 * SIZE(Y), %xmm10 movss %xmm10, %xmm9 pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 addps %xmm12, %xmm3 movaps %xmm6, %xmm4 movaps %xmm10, %xmm8 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L87: testq $2, N 
jle .L88 movsd -32 * SIZE(X), %xmm5 movhps -30 * SIZE(X), %xmm5 movaps -32 * SIZE(Y), %xmm9 movss %xmm9, %xmm8 pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 movaps %xmm5, %xmm4 movaps %xmm9, %xmm8 ALIGN_3 .L88: testq $1, N jle .L89 xorps %xmm5, %xmm5 movss %xmm5, %xmm4 movss %xmm5, %xmm8 shufps $0x24, %xmm4, %xmm4 pshufd $0x18, %xmm8, %xmm12 shufps $0x24, %xmm8, %xmm8 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 ALIGN_3 .L89: shufps $0x39, %xmm0, %xmm0 shufps $0x39, %xmm1, %xmm1 shufps $0x39, %xmm2, %xmm2 shufps $0x39, %xmm3, %xmm3 jmp .L98 ALIGN_3 .L200: movq N, %rax sarq $4, %rax jle .L205 movsd (X), %xmm4 addq INCX, X movhps (X), %xmm4 addq INCX, X movsd (Y), %xmm8 addq INCY, Y movhps (Y), %xmm8 addq INCY, Y movsd (X), %xmm5 addq INCX, X movhps (X), %xmm5 addq INCX, X movsd (Y), %xmm9 addq INCY, Y movhps (Y), %xmm9 addq INCY, Y movsd (X), %xmm6 addq INCX, X movhps (X), %xmm6 addq INCX, X movsd (Y), %xmm10 addq INCY, Y movhps (Y), %xmm10 addq INCY, Y movsd (X), %xmm7 addq INCX, X movhps (X), %xmm7 addq INCX, X movsd (Y), %xmm11 addq INCY, Y movhps (Y), %xmm11 addq INCY, Y decq %rax jle .L204 ALIGN_3 .L203: pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movsd (Y), %xmm8 addq INCY, Y movhps (Y), %xmm8 addq INCY, Y mulps %xmm4, %xmm12 movsd (X), %xmm4 addq INCX, X movhps (X), %xmm4 addq INCX, X addps %xmm12, %xmm1 pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movsd (Y), %xmm9 addq INCY, Y movhps (Y), %xmm9 addq INCY, Y mulps %xmm5, %xmm12 movsd (X), %xmm5 addq INCX, X movhps (X), %xmm5 addq INCX, X addps %xmm12, %xmm3 pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movsd (Y), %xmm10 addq INCY, Y movhps (Y), %xmm10 addq INCY, Y mulps %xmm6, %xmm12 movsd (X), %xmm6 addq INCX, X movhps (X), %xmm6 addq INCX, X addps %xmm12, %xmm1 pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movsd (Y), %xmm11 addq INCY, Y movhps (Y), %xmm11 addq INCY, Y mulps %xmm7, %xmm12 movsd (X), %xmm7 addq INCX, X movhps (X), %xmm7 addq INCX, X addps %xmm12, %xmm3 pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movsd (Y), %xmm8 addq INCY, Y movhps (Y), %xmm8 addq INCY, Y mulps %xmm4, %xmm12 movsd (X), %xmm4 addq INCX, X movhps (X), %xmm4 addq INCX, X addps %xmm12, %xmm1 pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movsd (Y), %xmm9 addq INCY, Y movhps (Y), %xmm9 addq INCY, Y mulps %xmm5, %xmm12 movsd (X), %xmm5 addq INCX, X movhps (X), %xmm5 addq INCX, X addps %xmm12, %xmm3 pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movsd (Y), %xmm10 addq INCY, Y movhps (Y), %xmm10 addq INCY, Y mulps %xmm6, %xmm12 movsd (X), %xmm6 addq INCX, X movhps (X), %xmm6 addq INCX, X addps %xmm12, %xmm1 pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movsd (Y), %xmm11 addq INCY, Y movhps (Y), %xmm11 addq INCY, Y mulps %xmm7, %xmm12 movsd (X), %xmm7 addq INCX, X movhps (X), %xmm7 addq INCX, X addps %xmm12, %xmm3 decq %rax jg .L203 ALIGN_3 .L204: pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movsd (Y), %xmm8 addq INCY, Y movhps (Y), %xmm8 addq INCY, Y mulps %xmm4, %xmm12 movsd (X), %xmm4 addq INCX, X movhps (X), %xmm4 addq INCX, X addps %xmm12, %xmm1 pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movsd (Y), %xmm9 addq INCY, Y movhps (Y), %xmm9 addq INCY, Y mulps %xmm5, %xmm12 movsd (X), %xmm5 addq INCX, X movhps (X), %xmm5 addq INCX, X 
addps %xmm12, %xmm3 pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movsd (Y), %xmm10 addq INCY, Y movhps (Y), %xmm10 addq INCY, Y mulps %xmm6, %xmm12 movsd (X), %xmm6 addq INCX, X movhps (X), %xmm6 addq INCX, X addps %xmm12, %xmm1 pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movsd (Y), %xmm11 addq INCY, Y movhps (Y), %xmm11 addq INCY, Y mulps %xmm7, %xmm12 movsd (X), %xmm7 addq INCX, X movhps (X), %xmm7 addq INCX, X addps %xmm12, %xmm3 pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 addps %xmm12, %xmm3 pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 mulps %xmm6, %xmm12 addps %xmm12, %xmm1 pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 mulps %xmm7, %xmm12 addps %xmm12, %xmm3 ALIGN_3 .L205: testq $8, N jle .L206 movsd (X), %xmm4 addq INCX, X movhps (X), %xmm4 addq INCX, X movsd (Y), %xmm8 addq INCY, Y movhps (Y), %xmm8 addq INCY, Y pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 movsd (X), %xmm5 addq INCX, X movhps (X), %xmm5 addq INCX, X movsd (Y), %xmm9 addq INCY, Y movhps (Y), %xmm9 addq INCY, Y pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 addps %xmm12, %xmm3 movsd (X), %xmm6 addq INCX, X movhps (X), %xmm6 addq INCX, X movsd (Y), %xmm10 addq INCY, Y movhps (Y), %xmm10 addq INCY, Y pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 mulps %xmm6, %xmm12 addps %xmm12, %xmm1 movsd (X), %xmm7 addq INCX, X movhps (X), %xmm7 addq INCX, X movsd (Y), %xmm11 addq INCY, Y movhps (Y), %xmm11 addq INCY, Y pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 mulps %xmm7, %xmm12 addps %xmm12, %xmm3 ALIGN_3 .L206: testq $4, N jle .L207 movsd (X), %xmm4 addq INCX, X movhps (X), %xmm4 addq INCX, X movsd (Y), %xmm8 addq INCY, Y movhps (Y), %xmm8 addq INCY, Y pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 movsd (X), %xmm5 addq INCX, X movhps (X), %xmm5 addq INCX, X movsd (Y), %xmm9 addq INCY, Y movhps (Y), %xmm9 addq INCY, Y pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 addps %xmm12, %xmm3 ALIGN_3 .L207: testq $2, N jle .L208 movsd (X), %xmm4 addq INCX, X movhps (X), %xmm4 addq INCX, X movsd (Y), %xmm8 addq INCY, Y movhps (Y), %xmm8 addq INCY, Y pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 ALIGN_3 .L208: testq $1, N jle .L98 #ifdef movsd xorps %xmm4, %xmm4 #endif movsd (X), %xmm4 #ifdef movsd xorps %xmm8, %xmm8 #endif movsd (Y), %xmm8 pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 ALIGN_3 .L98: addps %xmm2, %xmm0 addps %xmm3, %xmm1 movhlps %xmm0, %xmm2 movhlps %xmm1, %xmm3 addps %xmm2, %xmm0 addps %xmm3, %xmm1 pshufd $1, %xmm0, %xmm2 pshufd $1, %xmm1, %xmm3 ALIGN_3 .L999: #ifndef CONJ subss %xmm2, %xmm0 addss %xmm3, %xmm1 #else addss %xmm2, %xmm0 subss %xmm3, %xmm1 #endif unpcklps %xmm1, %xmm0 #ifdef WINDOWS_ABI movq %xmm0, %rax #endif RESTOREREGISTERS ret ALIGN_3 EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zdot_sse2.S000066400000000000000000000760021313527062700174270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. 
*/ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define N ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define Y ARG4 /* rcx */ #define INCY ARG5 /* r8 */ #else #define RESULT_ADDRESS ARG1 /*rcx*/ #define N ARG2 /* rdx */ #define X ARG3 /* r8 */ #define INCX ARG4 /* r9*/ #define Y %r10 #define INCY %r11 #endif #include "l1param.h" #undef movsd #ifndef OPTERON #define MOVLPS movsd #else #define MOVLPS movlps #endif PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), Y movq 48(%rsp), INCY #endif SAVEREGISTERS salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 cmpq $0, N jle .L999 cmpq $2 * SIZE, INCX jne .L50 cmpq $2 * SIZE, INCY jne .L50 subq $-16 * SIZE, X subq $-16 * SIZE, Y testq $SIZE, Y jne .L30 testq $SIZE, X jne .L20 movq N, %rax sarq $3, %rax jle .L15 movaps -16 * SIZE(X), %xmm4 movaps -14 * SIZE(X), %xmm5 movaps -16 * SIZE(Y), %xmm8 movaps -14 * SIZE(Y), %xmm9 movaps -12 * SIZE(X), %xmm6 movaps -10 * SIZE(X), %xmm7 movaps -12 * SIZE(Y), %xmm10 movaps -10 * SIZE(Y), %xmm11 decq %rax jle .L12 ALIGN_3 .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps -8 * SIZE(Y), %xmm8 mulpd %xmm4, %xmm12 movaps -8 * SIZE(X), %xmm4 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 movaps -6 * SIZE(Y), %xmm9 mulpd %xmm5, %xmm12 movaps -6 * SIZE(X), %xmm5 addpd %xmm12, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 movaps -4 * SIZE(Y), %xmm10 mulpd %xmm6, %xmm12 movaps -4 * SIZE(X), %xmm6 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm11, %xmm12 mulpd 
%xmm7, %xmm11 addpd %xmm11, %xmm2 movaps -2 * SIZE(Y), %xmm11 mulpd %xmm7, %xmm12 movaps -2 * SIZE(X), %xmm7 addpd %xmm12, %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps 0 * SIZE(Y), %xmm8 mulpd %xmm4, %xmm12 movaps 0 * SIZE(X), %xmm4 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 movaps 2 * SIZE(Y), %xmm9 mulpd %xmm5, %xmm12 movaps 2 * SIZE(X), %xmm5 addpd %xmm12, %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 movaps 4 * SIZE(Y), %xmm10 mulpd %xmm6, %xmm12 movaps 4 * SIZE(X), %xmm6 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 movaps 6 * SIZE(Y), %xmm11 mulpd %xmm7, %xmm12 movaps 6 * SIZE(X), %xmm7 addpd %xmm12, %xmm3 subq $-16 * SIZE, X subq $-16 * SIZE, Y decq %rax jg .L11 ALIGN_3 .L12: pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps -8 * SIZE(Y), %xmm8 mulpd %xmm4, %xmm12 movaps -8 * SIZE(X), %xmm4 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 movaps -6 * SIZE(Y), %xmm9 mulpd %xmm5, %xmm12 movaps -6 * SIZE(X), %xmm5 addpd %xmm12, %xmm3 pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 movaps -4 * SIZE(Y), %xmm10 mulpd %xmm6, %xmm12 movaps -4 * SIZE(X), %xmm6 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 movaps -2 * SIZE(Y), %xmm11 mulpd %xmm7, %xmm12 movaps -2 * SIZE(X), %xmm7 addpd %xmm12, %xmm3 pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm3 pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 mulpd %xmm6, %xmm12 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 mulpd %xmm7, %xmm12 addpd %xmm12, %xmm3 subq $-16 * SIZE, X subq $-16 * SIZE, Y ALIGN_3 .L15: testq $4, N jle .L16 movaps -16 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm8 movaps -14 * SIZE(X), %xmm5 movaps -14 * SIZE(Y), %xmm9 pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm3 movaps -12 * SIZE(X), %xmm6 movaps -12 * SIZE(Y), %xmm10 movaps -10 * SIZE(X), %xmm7 movaps -10 * SIZE(Y), %xmm11 pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 mulpd %xmm6, %xmm12 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 mulpd %xmm7, %xmm12 addpd %xmm12, %xmm3 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L16: testq $2, N jle .L17 movaps -16 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm8 movaps -14 * SIZE(X), %xmm5 movaps -14 * SIZE(Y), %xmm9 pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm3 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L17: testq $1, N jle .L98 movaps -16 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm8 pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 jmp .L98 ALIGN_3 .L20: movq N, %rax sarq $3, %rax jle .L25 MOVLPS -16 * SIZE(X), %xmm4 
movhps -15 * SIZE(X), %xmm4 MOVLPS -14 * SIZE(X), %xmm5 movhps -13 * SIZE(X), %xmm5 movaps -16 * SIZE(Y), %xmm8 movaps -14 * SIZE(Y), %xmm9 MOVLPS -12 * SIZE(X), %xmm6 movhps -11 * SIZE(X), %xmm6 MOVLPS -10 * SIZE(X), %xmm7 movhps -9 * SIZE(X), %xmm7 movaps -12 * SIZE(Y), %xmm10 movaps -10 * SIZE(Y), %xmm11 decq %rax jle .L22 ALIGN_3 .L21: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps -8 * SIZE(Y), %xmm8 mulpd %xmm4, %xmm12 MOVLPS -8 * SIZE(X), %xmm4 movhps -7 * SIZE(X), %xmm4 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 movaps -6 * SIZE(Y), %xmm9 mulpd %xmm5, %xmm12 MOVLPS -6 * SIZE(X), %xmm5 movhps -5 * SIZE(X), %xmm5 addpd %xmm12, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 movaps -4 * SIZE(Y), %xmm10 mulpd %xmm6, %xmm12 MOVLPS -4 * SIZE(X), %xmm6 movhps -3 * SIZE(X), %xmm6 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 movaps -2 * SIZE(Y), %xmm11 mulpd %xmm7, %xmm12 MOVLPS -2 * SIZE(X), %xmm7 movhps -1 * SIZE(X), %xmm7 addpd %xmm12, %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps 0 * SIZE(Y), %xmm8 mulpd %xmm4, %xmm12 MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 movaps 2 * SIZE(Y), %xmm9 mulpd %xmm5, %xmm12 MOVLPS 2 * SIZE(X), %xmm5 movhps 3 * SIZE(X), %xmm5 addpd %xmm12, %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 movaps 4 * SIZE(Y), %xmm10 mulpd %xmm6, %xmm12 MOVLPS 4 * SIZE(X), %xmm6 movhps 5 * SIZE(X), %xmm6 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 movaps 6 * SIZE(Y), %xmm11 mulpd %xmm7, %xmm12 MOVLPS 6 * SIZE(X), %xmm7 movhps 7 * SIZE(X), %xmm7 addpd %xmm12, %xmm3 subq $-16 * SIZE, X subq $-16 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L22: pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps -8 * SIZE(Y), %xmm8 mulpd %xmm4, %xmm12 MOVLPS -8 * SIZE(X), %xmm4 movhps -7 * SIZE(X), %xmm4 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 movaps -6 * SIZE(Y), %xmm9 mulpd %xmm5, %xmm12 MOVLPS -6 * SIZE(X), %xmm5 movhps -5 * SIZE(X), %xmm5 addpd %xmm12, %xmm3 pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 movaps -4 * SIZE(Y), %xmm10 mulpd %xmm6, %xmm12 MOVLPS -4 * SIZE(X), %xmm6 movhps -3 * SIZE(X), %xmm6 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 movaps -2 * SIZE(Y), %xmm11 mulpd %xmm7, %xmm12 MOVLPS -2 * SIZE(X), %xmm7 movhps -1 * SIZE(X), %xmm7 addpd %xmm12, %xmm3 pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm3 pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 mulpd %xmm6, %xmm12 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 mulpd %xmm7, %xmm12 addpd %xmm12, %xmm3 subq $-16 * SIZE, X subq $-16 * SIZE, Y ALIGN_3 .L25: testq $4, N jle .L26 MOVLPS -16 * SIZE(X), %xmm4 movhps -15 * SIZE(X), %xmm4 
movaps -16 * SIZE(Y), %xmm8 pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 MOVLPS -14 * SIZE(X), %xmm5 movhps -13 * SIZE(X), %xmm5 movaps -14 * SIZE(Y), %xmm9 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm3 MOVLPS -12 * SIZE(X), %xmm6 movhps -11 * SIZE(X), %xmm6 movaps -12 * SIZE(Y), %xmm10 pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 mulpd %xmm6, %xmm12 addpd %xmm12, %xmm1 MOVLPS -10 * SIZE(X), %xmm7 movhps -9 * SIZE(X), %xmm7 movaps -10 * SIZE(Y), %xmm11 pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 mulpd %xmm7, %xmm12 addpd %xmm12, %xmm3 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L26: testq $2, N jle .L27 MOVLPS -16 * SIZE(X), %xmm4 movhps -15 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm8 pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 MOVLPS -14 * SIZE(X), %xmm5 movhps -13 * SIZE(X), %xmm5 movaps -14 * SIZE(Y), %xmm9 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm3 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L27: testq $1, N jle .L98 MOVLPS -16 * SIZE(X), %xmm4 movhps -15 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm8 pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 jmp .L98 ALIGN_3 .L30: testq $SIZE, X jne .L40 movq N, %rax sarq $3, %rax jle .L35 MOVLPS -16 * SIZE(Y), %xmm4 movhps -15 * SIZE(Y), %xmm4 MOVLPS -14 * SIZE(Y), %xmm5 movhps -13 * SIZE(Y), %xmm5 movaps -16 * SIZE(X), %xmm8 movaps -14 * SIZE(X), %xmm9 MOVLPS -12 * SIZE(Y), %xmm6 movhps -11 * SIZE(Y), %xmm6 MOVLPS -10 * SIZE(Y), %xmm7 movhps -9 * SIZE(Y), %xmm7 movaps -12 * SIZE(X), %xmm10 movaps -10 * SIZE(X), %xmm11 decq %rax jle .L32 ALIGN_3 .L31: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps -8 * SIZE(X), %xmm8 mulpd %xmm4, %xmm12 MOVLPS -8 * SIZE(Y), %xmm4 movhps -7 * SIZE(Y), %xmm4 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 movaps -6 * SIZE(X), %xmm9 mulpd %xmm5, %xmm12 MOVLPS -6 * SIZE(Y), %xmm5 movhps -5 * SIZE(Y), %xmm5 addpd %xmm12, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 movaps -4 * SIZE(X), %xmm10 mulpd %xmm6, %xmm12 MOVLPS -4 * SIZE(Y), %xmm6 movhps -3 * SIZE(Y), %xmm6 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 movaps -2 * SIZE(X), %xmm11 mulpd %xmm7, %xmm12 MOVLPS -2 * SIZE(Y), %xmm7 movhps -1 * SIZE(Y), %xmm7 addpd %xmm12, %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps 0 * SIZE(X), %xmm8 mulpd %xmm4, %xmm12 MOVLPS 0 * SIZE(Y), %xmm4 movhps 1 * SIZE(Y), %xmm4 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 movaps 2 * SIZE(X), %xmm9 mulpd %xmm5, %xmm12 MOVLPS 2 * SIZE(Y), %xmm5 movhps 3 * SIZE(Y), %xmm5 addpd %xmm12, %xmm3 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 movaps 4 * SIZE(X), %xmm10 mulpd %xmm6, %xmm12 MOVLPS 4 * SIZE(Y), %xmm6 movhps 5 * SIZE(Y), %xmm6 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, 
%xmm11 addpd %xmm11, %xmm2 movaps 6 * SIZE(X), %xmm11 mulpd %xmm7, %xmm12 MOVLPS 6 * SIZE(Y), %xmm7 movhps 7 * SIZE(Y), %xmm7 addpd %xmm12, %xmm3 subq $-16 * SIZE, X subq $-16 * SIZE, Y decq %rax jg .L31 ALIGN_3 .L32: pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps -8 * SIZE(X), %xmm8 mulpd %xmm4, %xmm12 MOVLPS -8 * SIZE(Y), %xmm4 movhps -7 * SIZE(Y), %xmm4 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 movaps -6 * SIZE(X), %xmm9 mulpd %xmm5, %xmm12 MOVLPS -6 * SIZE(Y), %xmm5 movhps -5 * SIZE(Y), %xmm5 addpd %xmm12, %xmm3 pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 movaps -4 * SIZE(X), %xmm10 mulpd %xmm6, %xmm12 MOVLPS -4 * SIZE(Y), %xmm6 movhps -3 * SIZE(Y), %xmm6 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 movaps -2 * SIZE(X), %xmm11 mulpd %xmm7, %xmm12 MOVLPS -2 * SIZE(Y), %xmm7 movhps -1 * SIZE(Y), %xmm7 addpd %xmm12, %xmm3 pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm3 pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 mulpd %xmm6, %xmm12 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 mulpd %xmm7, %xmm12 addpd %xmm12, %xmm3 subq $-16 * SIZE, X subq $-16 * SIZE, Y ALIGN_3 .L35: testq $4, N jle .L36 MOVLPS -16 * SIZE(Y), %xmm4 movhps -15 * SIZE(Y), %xmm4 movaps -16 * SIZE(X), %xmm8 pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 MOVLPS -14 * SIZE(Y), %xmm5 movhps -13 * SIZE(Y), %xmm5 movaps -14 * SIZE(X), %xmm9 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm3 MOVLPS -12 * SIZE(Y), %xmm6 movhps -11 * SIZE(Y), %xmm6 movaps -12 * SIZE(X), %xmm10 pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 mulpd %xmm6, %xmm12 addpd %xmm12, %xmm1 MOVLPS -10 * SIZE(Y), %xmm7 movhps -9 * SIZE(Y), %xmm7 movaps -10 * SIZE(X), %xmm11 pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 mulpd %xmm7, %xmm12 addpd %xmm12, %xmm3 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L36: testq $2, N jle .L37 MOVLPS -16 * SIZE(Y), %xmm4 movhps -15 * SIZE(Y), %xmm4 movaps -16 * SIZE(X), %xmm8 pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 MOVLPS -14 * SIZE(Y), %xmm5 movhps -13 * SIZE(Y), %xmm5 movaps -14 * SIZE(X), %xmm9 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm3 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L37: SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm3, %xmm3 testq $1, N jle .L98 MOVLPS -16 * SIZE(Y), %xmm4 movhps -15 * SIZE(Y), %xmm4 movaps -16 * SIZE(X), %xmm8 pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 SHUFPD_1 %xmm12, %xmm12 addpd %xmm12, %xmm1 jmp .L98 ALIGN_3 .L40: movhps -16 * SIZE(X), %xmm4 addq $SIZE, X movhps -16 * SIZE(Y), %xmm8 addq $SIZE, Y movq N, %rax sarq $3, %rax jle .L45 movaps -16 * SIZE(X), %xmm5 movaps -16 * SIZE(Y), %xmm9 movaps -14 * SIZE(X), %xmm6 movaps -14 * SIZE(Y), %xmm10 movaps -12 * SIZE(X), %xmm7 movaps -12 * SIZE(Y), %xmm11 decq %rax jle .L42 ALIGN_3 .L41: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movsd %xmm9, %xmm8 pshufd $0x4e, %xmm8, %xmm12 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps -10 * 
SIZE(Y), %xmm8 mulpd %xmm4, %xmm12 movaps -10 * SIZE(X), %xmm4 addpd %xmm12, %xmm1 movsd %xmm10, %xmm9 pshufd $0x4e, %xmm9, %xmm12 movsd %xmm6, %xmm5 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm0 movaps -8 * SIZE(Y), %xmm9 mulpd %xmm5, %xmm12 movaps -8 * SIZE(X), %xmm5 addpd %xmm12, %xmm1 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd %xmm11, %xmm10 pshufd $0x4e, %xmm10, %xmm12 movsd %xmm7, %xmm6 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 movaps -6 * SIZE(Y), %xmm10 mulpd %xmm6, %xmm12 movaps -6 * SIZE(X), %xmm6 addpd %xmm12, %xmm1 movsd %xmm8, %xmm11 pshufd $0x4e, %xmm11, %xmm12 movsd %xmm4, %xmm7 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm0 movaps -4 * SIZE(Y), %xmm11 mulpd %xmm7, %xmm12 movaps -4 * SIZE(X), %xmm7 addpd %xmm12, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movsd %xmm9, %xmm8 pshufd $0x4e, %xmm8, %xmm12 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps -2 * SIZE(Y), %xmm8 mulpd %xmm4, %xmm12 movaps -2 * SIZE(X), %xmm4 addpd %xmm12, %xmm1 movsd %xmm10, %xmm9 pshufd $0x4e, %xmm9, %xmm12 movsd %xmm6, %xmm5 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm0 movaps 0 * SIZE(Y), %xmm9 mulpd %xmm5, %xmm12 movaps 0 * SIZE(X), %xmm5 addpd %xmm12, %xmm1 #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movsd %xmm11, %xmm10 pshufd $0x4e, %xmm10, %xmm12 movsd %xmm7, %xmm6 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 movaps 2 * SIZE(Y), %xmm10 mulpd %xmm6, %xmm12 movaps 2 * SIZE(X), %xmm6 addpd %xmm12, %xmm1 movsd %xmm8, %xmm11 pshufd $0x4e, %xmm11, %xmm12 movsd %xmm4, %xmm7 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm0 movaps 4 * SIZE(Y), %xmm11 mulpd %xmm7, %xmm12 movaps 4 * SIZE(X), %xmm7 addpd %xmm12, %xmm1 subq $-16 * SIZE, X subq $-16 * SIZE, Y decq %rax jg .L41 ALIGN_3 .L42: movsd %xmm9, %xmm8 pshufd $0x4e, %xmm8, %xmm12 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps -10 * SIZE(Y), %xmm8 mulpd %xmm4, %xmm12 movaps -10 * SIZE(X), %xmm4 addpd %xmm12, %xmm1 movsd %xmm10, %xmm9 pshufd $0x4e, %xmm9, %xmm12 movsd %xmm6, %xmm5 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm0 movaps -8 * SIZE(Y), %xmm9 mulpd %xmm5, %xmm12 movaps -8 * SIZE(X), %xmm5 addpd %xmm12, %xmm1 movsd %xmm11, %xmm10 pshufd $0x4e, %xmm10, %xmm12 movsd %xmm7, %xmm6 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 movaps -6 * SIZE(Y), %xmm10 mulpd %xmm6, %xmm12 movaps -6 * SIZE(X), %xmm6 addpd %xmm12, %xmm1 movsd %xmm8, %xmm11 pshufd $0x4e, %xmm11, %xmm12 movsd %xmm4, %xmm7 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm0 movaps -4 * SIZE(Y), %xmm11 mulpd %xmm7, %xmm12 movaps -4 * SIZE(X), %xmm7 addpd %xmm12, %xmm1 movsd %xmm9, %xmm8 pshufd $0x4e, %xmm8, %xmm12 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps -2 * SIZE(Y), %xmm8 mulpd %xmm4, %xmm12 movaps -2 * SIZE(X), %xmm4 addpd %xmm12, %xmm1 movsd %xmm10, %xmm9 pshufd $0x4e, %xmm9, %xmm12 movsd %xmm6, %xmm5 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm0 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm1 movsd %xmm11, %xmm10 pshufd $0x4e, %xmm10, %xmm12 movsd %xmm7, %xmm6 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 mulpd %xmm6, %xmm12 addpd %xmm12, %xmm1 movsd %xmm8, %xmm11 pshufd $0x4e, %xmm11, %xmm12 movsd %xmm4, %xmm7 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm0 mulpd %xmm7, %xmm12 addpd %xmm12, %xmm1 subq $-16 * SIZE, X subq $-16 * SIZE, Y ALIGN_3 .L45: testq $4, N jle .L46 movaps -16 * SIZE(X), %xmm5 movaps -16 * SIZE(Y), %xmm9 movaps -14 * SIZE(X), %xmm6 movaps -14 * SIZE(Y), %xmm10 movsd %xmm9, %xmm8 pshufd $0x4e, %xmm8, %xmm12 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 
mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 movaps -12 * SIZE(X), %xmm7 movaps -12 * SIZE(Y), %xmm11 movsd %xmm10, %xmm9 pshufd $0x4e, %xmm9, %xmm12 movsd %xmm6, %xmm5 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm0 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm1 movaps -10 * SIZE(X), %xmm4 movaps -10 * SIZE(Y), %xmm8 movsd %xmm11, %xmm10 pshufd $0x4e, %xmm10, %xmm12 movsd %xmm7, %xmm6 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 mulpd %xmm6, %xmm12 addpd %xmm12, %xmm1 movsd %xmm8, %xmm11 pshufd $0x4e, %xmm11, %xmm12 movsd %xmm4, %xmm7 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm0 mulpd %xmm7, %xmm12 addpd %xmm12, %xmm1 addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L46: testq $2, N jle .L47 movaps -16 * SIZE(X), %xmm5 movaps -16 * SIZE(Y), %xmm9 movsd %xmm9, %xmm8 pshufd $0x4e, %xmm8, %xmm12 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 movaps -14 * SIZE(X), %xmm6 movaps -14 * SIZE(Y), %xmm10 movsd %xmm10, %xmm9 pshufd $0x4e, %xmm9, %xmm12 movsd %xmm6, %xmm5 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm0 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm1 movaps %xmm6, %xmm4 movaps %xmm10, %xmm8 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L47: testq $1, N jle .L48 movlps -16 * SIZE(X), %xmm4 movlps -16 * SIZE(Y), %xmm8 pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 ALIGN_3 .L48: SHUFPD_1 %xmm0, %xmm0 SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm2, %xmm2 SHUFPD_1 %xmm3, %xmm3 jmp .L98 ALIGN_3 .L50: movq N, %rax sarq $3, %rax jle .L55 MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addq INCX, X MOVLPS 0 * SIZE(Y), %xmm8 movhps 1 * SIZE(Y), %xmm8 addq INCY, Y MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addq INCX, X MOVLPS 0 * SIZE(Y), %xmm9 movhps 1 * SIZE(Y), %xmm9 addq INCY, Y MOVLPS 0 * SIZE(X), %xmm6 movhps 1 * SIZE(X), %xmm6 addq INCX, X MOVLPS 0 * SIZE(Y), %xmm10 movhps 1 * SIZE(Y), %xmm10 addq INCY, Y MOVLPS 0 * SIZE(X), %xmm7 movhps 1 * SIZE(X), %xmm7 addq INCX, X MOVLPS 0 * SIZE(Y), %xmm11 movhps 1 * SIZE(Y), %xmm11 addq INCY, Y decq %rax jle .L54 ALIGN_3 .L53: pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 MOVLPS 0 * SIZE(Y), %xmm8 movhps 1 * SIZE(Y), %xmm8 addq INCY, Y mulpd %xmm4, %xmm12 MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addq INCX, X addpd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 MOVLPS 0 * SIZE(Y), %xmm9 movhps 1 * SIZE(Y), %xmm9 addq INCY, Y mulpd %xmm5, %xmm12 MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addq INCX, X addpd %xmm12, %xmm3 pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 MOVLPS 0 * SIZE(Y), %xmm10 movhps 1 * SIZE(Y), %xmm10 addq INCY, Y mulpd %xmm6, %xmm12 MOVLPS 0 * SIZE(X), %xmm6 movhps 1 * SIZE(X), %xmm6 addq INCX, X addpd %xmm12, %xmm1 pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 MOVLPS 0 * SIZE(Y), %xmm11 movhps 1 * SIZE(Y), %xmm11 addq INCY, Y mulpd %xmm7, %xmm12 MOVLPS 0 * SIZE(X), %xmm7 movhps 1 * SIZE(X), %xmm7 addq INCX, X addpd %xmm12, %xmm3 pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 MOVLPS 0 * SIZE(Y), %xmm8 movhps 1 * SIZE(Y), %xmm8 addq INCY, Y mulpd %xmm4, %xmm12 MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addq INCX, X addpd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 MOVLPS 0 * SIZE(Y), %xmm9 movhps 1 * SIZE(Y), %xmm9 addq INCY, Y mulpd %xmm5, %xmm12 MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addq INCX, X addpd %xmm12, %xmm3 pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 
MOVLPS 0 * SIZE(Y), %xmm10 movhps 1 * SIZE(Y), %xmm10 addq INCY, Y mulpd %xmm6, %xmm12 MOVLPS 0 * SIZE(X), %xmm6 movhps 1 * SIZE(X), %xmm6 addq INCX, X addpd %xmm12, %xmm1 pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 MOVLPS 0 * SIZE(Y), %xmm11 movhps 1 * SIZE(Y), %xmm11 addq INCY, Y mulpd %xmm7, %xmm12 MOVLPS 0 * SIZE(X), %xmm7 movhps 1 * SIZE(X), %xmm7 addq INCX, X addpd %xmm12, %xmm3 decq %rax jg .L53 ALIGN_3 .L54: pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 MOVLPS 0 * SIZE(Y), %xmm8 movhps 1 * SIZE(Y), %xmm8 addq INCY, Y mulpd %xmm4, %xmm12 MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addq INCX, X addpd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 MOVLPS 0 * SIZE(Y), %xmm9 movhps 1 * SIZE(Y), %xmm9 addq INCY, Y mulpd %xmm5, %xmm12 MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addq INCX, X addpd %xmm12, %xmm3 pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 MOVLPS 0 * SIZE(Y), %xmm10 movhps 1 * SIZE(Y), %xmm10 addq INCY, Y mulpd %xmm6, %xmm12 MOVLPS 0 * SIZE(X), %xmm6 movhps 1 * SIZE(X), %xmm6 addq INCX, X addpd %xmm12, %xmm1 pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 MOVLPS 0 * SIZE(Y), %xmm11 movhps 1 * SIZE(Y), %xmm11 addq INCY, Y mulpd %xmm7, %xmm12 MOVLPS 0 * SIZE(X), %xmm7 movhps 1 * SIZE(X), %xmm7 addq INCX, X addpd %xmm12, %xmm3 pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm3 pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 mulpd %xmm6, %xmm12 addpd %xmm12, %xmm1 pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 mulpd %xmm7, %xmm12 addpd %xmm12, %xmm3 ALIGN_3 .L55: testq $4, N jle .L56 MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addq INCX, X MOVLPS 0 * SIZE(Y), %xmm8 movhps 1 * SIZE(Y), %xmm8 addq INCY, Y pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addq INCX, X MOVLPS 0 * SIZE(Y), %xmm9 movhps 1 * SIZE(Y), %xmm9 addq INCY, Y pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm3 MOVLPS 0 * SIZE(X), %xmm6 movhps 1 * SIZE(X), %xmm6 addq INCX, X MOVLPS 0 * SIZE(Y), %xmm10 movhps 1 * SIZE(Y), %xmm10 addq INCY, Y pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 mulpd %xmm6, %xmm12 addpd %xmm12, %xmm1 MOVLPS 0 * SIZE(X), %xmm7 movhps 1 * SIZE(X), %xmm7 addq INCX, X MOVLPS 0 * SIZE(Y), %xmm11 movhps 1 * SIZE(Y), %xmm11 addq INCY, Y pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 mulpd %xmm7, %xmm12 addpd %xmm12, %xmm3 ALIGN_3 .L56: testq $2, N jle .L57 MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addq INCX, X MOVLPS 0 * SIZE(Y), %xmm8 movhps 1 * SIZE(Y), %xmm8 addq INCY, Y pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 MOVLPS 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addq INCX, X MOVLPS 0 * SIZE(Y), %xmm9 movhps 1 * SIZE(Y), %xmm9 addq INCY, Y pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm3 ALIGN_3 .L57: testq $1, N jle .L98 MOVLPS 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 MOVLPS 0 * SIZE(Y), %xmm8 movhps 1 * SIZE(Y), %xmm8 pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 
addpd %xmm12, %xmm1 ALIGN_3 .L98: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 pshufd $0x4e, %xmm0, %xmm2 pshufd $0x4e, %xmm1, %xmm3 .L999: #ifndef CONJ subsd %xmm2, %xmm0 addsd %xmm3, %xmm1 #else addsd %xmm2, %xmm0 subsd %xmm3, %xmm1 #endif #ifdef WINDOWS_ABI movq RESULT_ADDRESS, %rax movsd %xmm0, (%rax) movsd %xmm1, 8(%rax) #endif RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm3m_kernel_2x8_nehalem.S000066400000000000000000001071371313527062700226300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define BB %r12 #define BX %rdx #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define ALPHA_R 48(%rsp) #define ALPHA_I 56(%rsp) #define J 64(%rsp) #define OFFSET 72(%rsp) #define KK 80(%rsp) #define KKK 88(%rsp) #else #define STACKSIZE 512 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define ALPHA_R 224(%rsp) #define ALPHA_I 232(%rsp) #define J 240(%rsp) #define OFFSET 248(%rsp) #define KK 256(%rsp) #define KKK 264(%rsp) #endif #define PREFETCHSIZE (8 * 1 - 4) #define PREFETCH prefetcht0 #if defined(OS_LINUX) && defined(CORE_BARCELONA) .align 32768 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif movaps %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif #endif movlps %xmm0, ALPHA_R movlps %xmm1, ALPHA_I subq $-16 * SIZE, A subq $-16 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K salq $ZBASE_SHIFT, LDC #ifdef TRMMKERNEL movq %r11, OFFSET #ifndef LEFT negq %r11 #endif movq %r11, KK #endif movq N, J sarq $3, J NOBRANCH jle .L30 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC, 4), CO2 movq A, AO movq K, %rax salq $BASE_SHIFT + 3, %rax leaq (B, %rax), BB movq M, I sarq $1, I NOBRANCH jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif prefetcht0 -16 * SIZE(BB) subq $-8 * SIZE, BB xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 leaq (LDC, LDC, 2), %rax xorps %xmm8, %xmm8 prefetcht0 3 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 7 * SIZE(CO1, LDC, 1) xorps %xmm10, %xmm10 prefetcht0 3 * SIZE(CO1, LDC, 2) xorps %xmm11, %xmm11 prefetcht0 7 * SIZE(CO1, %rax, 1) xorps %xmm12, %xmm12 prefetcht0 3 * SIZE(CO2) xorps %xmm13, %xmm13 prefetcht0 7 * SIZE(CO2, LDC, 1) xorps %xmm14, %xmm14 prefetcht0 3 * SIZE(CO2, LDC, 2) xorps %xmm15, %xmm15 prefetcht0 7 * SIZE(CO2, %rax, 1) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $8, %rax #endif movq %rax, KKK #endif sarq 
$2, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm6 addpd %xmm2, %xmm13 pshufd $0x4e, %xmm6, %xmm2 mulpd %xmm0, %xmm6 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm14 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm6, %xmm8 movaps -12 * SIZE(BO), %xmm6 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm6, %xmm2 mulpd %xmm0, %xmm6 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm6, %xmm12 movaps -8 * SIZE(BO), %xmm1 addpd %xmm2, %xmm13 movaps -14 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 mulpd %xmm5, %xmm2 addpd %xmm3, %xmm14 movaps -6 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm5, %xmm3 mulpd %xmm5, %xmm4 addpd %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 mulpd %xmm5, %xmm2 addpd %xmm3, %xmm10 movaps -2 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm5, %xmm3 mulpd %xmm5, %xmm4 addpd %xmm1, %xmm12 movaps 0 * SIZE(BO), %xmm6 addpd %xmm2, %xmm13 pshufd $0x4e, %xmm6, %xmm2 mulpd %xmm0, %xmm6 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm14 movaps 2 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm6, %xmm8 movaps 4 * SIZE(BO), %xmm6 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm6, %xmm2 mulpd %xmm0, %xmm6 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps 6 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm6, %xmm12 movaps 8 * SIZE(BO), %xmm1 addpd %xmm2, %xmm13 movaps -10 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 mulpd %xmm5, %xmm2 addpd %xmm3, %xmm14 movaps 10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm5, %xmm3 mulpd %xmm5, %xmm4 addpd %xmm1, %xmm8 movaps 12 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 mulpd %xmm5, %xmm2 addpd %xmm3, %xmm10 movaps 14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm5, %xmm3 mulpd %xmm5, %xmm4 addq $32 * SIZE, BO subq $-8 * SIZE, AO decq %rax BRANCH jg .L12 ALIGN_3 .L15: movups ALPHA_R, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addpd %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm13 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm14 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 addpd %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_4 .L18: addpd %xmm1, %xmm12 addpd %xmm2, %xmm13 addpd %xmm3, %xmm14 addpd %xmm4, %xmm15 movaps %xmm8, %xmm0 shufpd $2, %xmm9, %xmm8 shufpd $2, %xmm0, %xmm9 movaps %xmm10, %xmm0 shufpd $2, %xmm11, %xmm10 shufpd $2, %xmm0, %xmm11 movaps %xmm12, %xmm0 shufpd $2, %xmm13, %xmm12 shufpd $2, %xmm0, %xmm13 movaps %xmm14, %xmm0 shufpd $2, %xmm15, %xmm14 shufpd $2, %xmm0, %xmm15 leaq (LDC, LDC, 
2), %rax movsd 0 * SIZE(CO1), %xmm0 movhps 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhps 3 * SIZE(CO1), %xmm1 movsd 0 * SIZE(CO1, LDC), %xmm2 movhps 1 * SIZE(CO1, LDC), %xmm2 movsd 2 * SIZE(CO1, LDC), %xmm3 movhps 3 * SIZE(CO1, LDC), %xmm3 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm8 addpd %xmm8, %xmm1 movddup %xmm9, %xmm5 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 unpckhpd %xmm9, %xmm9 mulpd %xmm7, %xmm9 addpd %xmm9, %xmm3 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movlps %xmm1, 2 * SIZE(CO1) movhps %xmm1, 3 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC) movhps %xmm2, 1 * SIZE(CO1, LDC) movlps %xmm3, 2 * SIZE(CO1, LDC) movhps %xmm3, 3 * SIZE(CO1, LDC) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhps 1 * SIZE(CO1, LDC, 2), %xmm0 movsd 2 * SIZE(CO1, LDC, 2), %xmm1 movhps 3 * SIZE(CO1, LDC, 2), %xmm1 movsd 0 * SIZE(CO1, %rax), %xmm2 movhps 1 * SIZE(CO1, %rax), %xmm2 movsd 2 * SIZE(CO1, %rax), %xmm3 movhps 3 * SIZE(CO1, %rax), %xmm3 movddup %xmm10, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm10, %xmm10 mulpd %xmm7, %xmm10 addpd %xmm10, %xmm1 movddup %xmm11, %xmm5 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 unpckhpd %xmm11, %xmm11 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm3 movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movhps %xmm0, 1 * SIZE(CO1, LDC, 2) movlps %xmm1, 2 * SIZE(CO1, LDC, 2) movhps %xmm1, 3 * SIZE(CO1, LDC, 2) movlps %xmm2, 0 * SIZE(CO1, %rax) movhps %xmm2, 1 * SIZE(CO1, %rax) movlps %xmm3, 2 * SIZE(CO1, %rax) movhps %xmm3, 3 * SIZE(CO1, %rax) movsd 0 * SIZE(CO2), %xmm0 movhps 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 movhps 3 * SIZE(CO2), %xmm1 movsd 0 * SIZE(CO2, LDC), %xmm2 movhps 1 * SIZE(CO2, LDC), %xmm2 movsd 2 * SIZE(CO2, LDC), %xmm3 movhps 3 * SIZE(CO2, LDC), %xmm3 movddup %xmm12, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm12, %xmm12 mulpd %xmm7, %xmm12 addpd %xmm12, %xmm1 movddup %xmm13, %xmm5 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 unpckhpd %xmm13, %xmm13 mulpd %xmm7, %xmm13 addpd %xmm13, %xmm3 movlps %xmm0, 0 * SIZE(CO2) movhps %xmm0, 1 * SIZE(CO2) movlps %xmm1, 2 * SIZE(CO2) movhps %xmm1, 3 * SIZE(CO2) movlps %xmm2, 0 * SIZE(CO2, LDC) movhps %xmm2, 1 * SIZE(CO2, LDC) movlps %xmm3, 2 * SIZE(CO2, LDC) movhps %xmm3, 3 * SIZE(CO2, LDC) movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhps 1 * SIZE(CO2, LDC, 2), %xmm0 movsd 2 * SIZE(CO2, LDC, 2), %xmm1 movhps 3 * SIZE(CO2, LDC, 2), %xmm1 movsd 0 * SIZE(CO2, %rax), %xmm2 movhps 1 * SIZE(CO2, %rax), %xmm2 movsd 2 * SIZE(CO2, %rax), %xmm3 movhps 3 * SIZE(CO2, %rax), %xmm3 movddup %xmm14, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm14, %xmm14 mulpd %xmm7, %xmm14 addpd %xmm14, %xmm1 movddup %xmm15, %xmm5 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 unpckhpd %xmm15, %xmm15 mulpd %xmm7, %xmm15 addpd %xmm15, %xmm3 movlps %xmm0, 0 * SIZE(CO2, LDC, 2) movhps %xmm0, 1 * SIZE(CO2, LDC, 2) movlps %xmm1, 2 * SIZE(CO2, LDC, 2) movhps %xmm1, 3 * SIZE(CO2, LDC, 2) movlps %xmm2, 0 * SIZE(CO2, %rax) movhps %xmm2, 1 * SIZE(CO2, %rax) movlps %xmm3, 2 * SIZE(CO2, %rax) movhps %xmm3, 3 * SIZE(CO2, %rax) addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 decq I BRANCH jg .L11 ALIGN_4 .L20: testq $1, M BRANCH jle .L29 ALIGN_4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -16 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 
xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $8, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_3 .L22: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps -8 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -6 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps -4 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -2 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps 0 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps 2 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps 4 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps 6 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -13 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps 8 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps 10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps 12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps 14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps 16 * SIZE(BO), %xmm1 subq $ -4 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax BRANCH jg .L22 ALIGN_3 .L25: movups ALPHA_R, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps -8 * SIZE(BO), %xmm1 addq $1 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_4 .L28: leaq (LDC, LDC, 2), %rax movsd 0 * SIZE(CO1), %xmm0 movhps 1 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO1, LDC), %xmm1 movhps 1 * SIZE(CO1, LDC), %xmm1 movsd 0 * SIZE(CO1, LDC, 2), %xmm2 movhps 1 * SIZE(CO1, LDC, 2), %xmm2 movsd 0 * SIZE(CO1, %rax), %xmm3 movhps 1 * SIZE(CO1, %rax), %xmm3 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm8 addpd %xmm8, %xmm1 movddup %xmm9, %xmm5 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 unpckhpd %xmm9, %xmm9 mulpd %xmm7, %xmm9 addpd %xmm9, %xmm3 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC) movhps %xmm1, 1 * SIZE(CO1, LDC) movlps %xmm2, 0 * SIZE(CO1, LDC, 2) movhps %xmm2, 1 * SIZE(CO1, LDC, 2) movlps %xmm3, 0 * SIZE(CO1, %rax) movhps %xmm3, 1 * SIZE(CO1, %rax) movsd 0 * SIZE(CO2), %xmm0 movhps 1 * SIZE(CO2), %xmm0 movsd 0 * SIZE(CO2, LDC), %xmm1 movhps 1 * SIZE(CO2, LDC), %xmm1 movsd 0 * SIZE(CO2, LDC, 2), %xmm2 movhps 1 * SIZE(CO2, LDC, 2), %xmm2 movsd 0 * SIZE(CO2, %rax), %xmm3 movhps 1 * SIZE(CO2, %rax), %xmm3 movddup %xmm10, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm10, %xmm10 mulpd %xmm7, %xmm10 addpd %xmm10, %xmm1 movddup %xmm11, %xmm5 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 unpckhpd %xmm11, %xmm11 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm3 movlps %xmm0, 0 * SIZE(CO2) movhps %xmm0, 1 * SIZE(CO2) movlps %xmm1, 0 * SIZE(CO2, LDC) movhps %xmm1, 1 * SIZE(CO2, LDC) movlps %xmm2, 0 * 
SIZE(CO2, LDC, 2) movhps %xmm2, 1 * SIZE(CO2, LDC, 2) movlps %xmm3, 0 * SIZE(CO2, %rax) movhps %xmm3, 1 * SIZE(CO2, %rax) ALIGN_4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addq $8, KK #endif movq BO, B leaq (C, LDC, 8), C subq $1, J BRANCH jg .L01 ALIGN_4 .L30: testq $4, N jle .L50 ALIGN_4 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC, 2), CO2 movq A, AO movq M, I sarq $1, I NOBRANCH jle .L40 ALIGN_4 .L31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht0 3 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 7 * SIZE(CO1, LDC, 1) xorps %xmm10, %xmm10 prefetcht0 3 * SIZE(CO2) xorps %xmm11, %xmm11 prefetcht0 7 * SIZE(CO2, LDC, 1) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_3 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -8 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -6 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -10 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -2 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L32 ALIGN_3 .L35: movups ALPHA_R, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: addpd %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm10 movaps -14 * SIZE(BO), %xmm3 addpd %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_4 .L38: addpd %xmm1, %xmm8 addpd %xmm2, %xmm9 addpd %xmm3, %xmm10 addpd %xmm4, %xmm11 movaps %xmm8, %xmm0 shufpd $2, %xmm9, %xmm8 shufpd $2, %xmm0, %xmm9 movaps %xmm10, %xmm0 shufpd $2, %xmm11, %xmm10 shufpd $2, %xmm0, %xmm11 movsd 0 * SIZE(CO1), %xmm0 movhps 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhps 3 * SIZE(CO1), %xmm1 movsd 0 * SIZE(CO1, LDC), %xmm2 movhps 1 * SIZE(CO1, LDC), %xmm2 
movsd 2 * SIZE(CO1, LDC), %xmm3 movhps 3 * SIZE(CO1, LDC), %xmm3 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm8 addpd %xmm8, %xmm1 movddup %xmm9, %xmm5 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 unpckhpd %xmm9, %xmm9 mulpd %xmm7, %xmm9 addpd %xmm9, %xmm3 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movlps %xmm1, 2 * SIZE(CO1) movhps %xmm1, 3 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC) movhps %xmm2, 1 * SIZE(CO1, LDC) movlps %xmm3, 2 * SIZE(CO1, LDC) movhps %xmm3, 3 * SIZE(CO1, LDC) movsd 0 * SIZE(CO2), %xmm0 movhps 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 movhps 3 * SIZE(CO2), %xmm1 movsd 0 * SIZE(CO2, LDC), %xmm2 movhps 1 * SIZE(CO2, LDC), %xmm2 movsd 2 * SIZE(CO2, LDC), %xmm3 movhps 3 * SIZE(CO2, LDC), %xmm3 movddup %xmm10, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm10, %xmm10 mulpd %xmm7, %xmm10 addpd %xmm10, %xmm1 movddup %xmm11, %xmm5 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 unpckhpd %xmm11, %xmm11 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm3 movlps %xmm0, 0 * SIZE(CO2) movhps %xmm0, 1 * SIZE(CO2) movlps %xmm1, 2 * SIZE(CO2) movhps %xmm1, 3 * SIZE(CO2) movlps %xmm2, 0 * SIZE(CO2, LDC) movhps %xmm2, 1 * SIZE(CO2, LDC) movlps %xmm3, 2 * SIZE(CO2, LDC) movhps %xmm3, 3 * SIZE(CO2, LDC) addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 decq I BRANCH jg .L31 ALIGN_4 .L40: testq $1, M BRANCH jle .L49 ALIGN_4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -16 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L45 ALIGN_3 .L42: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps -8 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -6 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -13 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -4 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movaps -2 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm11 movaps 0 * SIZE(BO), %xmm1 subq $ -4 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L42 ALIGN_3 .L45: movups ALPHA_R, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L46 ALIGN_4 .L48: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 movsd 0 * SIZE(CO1), %xmm0 movhps 1 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO1, LDC), %xmm1 movhps 1 * SIZE(CO1, LDC), %xmm1 movsd 0 * SIZE(CO2), %xmm2 movhps 1 * SIZE(CO2), %xmm2 movsd 0 * SIZE(CO2, LDC), %xmm3 movhps 1 * SIZE(CO2, LDC), %xmm3 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, 
%xmm0 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm8 addpd %xmm8, %xmm1 movddup %xmm9, %xmm5 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 unpckhpd %xmm9, %xmm9 mulpd %xmm7, %xmm9 addpd %xmm9, %xmm3 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC) movhps %xmm1, 1 * SIZE(CO1, LDC) movlps %xmm2, 0 * SIZE(CO2) movhps %xmm2, 1 * SIZE(CO2) movlps %xmm3, 0 * SIZE(CO2, LDC) movhps %xmm3, 1 * SIZE(CO2, LDC) ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK #endif movq BO, B leaq (C, LDC, 4), C ALIGN_4 .L50: testq $2, N jle .L70 ALIGN_4 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC), CO2 movq A, AO movq M, I sarq $1, I NOBRANCH jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm8, %xmm8 prefetcht0 3 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 7 * SIZE(CO2) xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_3 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm10 movaps -14 * SIZE(BO), %xmm1 addpd %xmm2, %xmm11 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AO), %xmm0 addpd %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 addpd %xmm2, %xmm11 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L52 addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 ALIGN_3 .L55: movups ALPHA_R, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_3 .L56: addpd %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_4 .L58: addpd %xmm1, %xmm8 addpd %xmm2, %xmm9 movaps %xmm8, %xmm0 shufpd $2, %xmm9, %xmm8 shufpd $2, %xmm0, %xmm9 movsd 0 * SIZE(CO1), %xmm0 movhps 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhps 3 * SIZE(CO1), %xmm1 movsd 0 * SIZE(CO2), %xmm2 movhps 1 * SIZE(CO2), %xmm2 movsd 2 * SIZE(CO2), %xmm3 movhps 3 * SIZE(CO2), %xmm3 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm8 addpd %xmm8, %xmm1 movddup %xmm9, %xmm5 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 unpckhpd %xmm9, %xmm9 mulpd %xmm7, %xmm9 addpd %xmm9, %xmm3 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movlps %xmm1, 2 * SIZE(CO1) movhps %xmm1, 3 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO2) movhps %xmm2, 1 * SIZE(CO2) movlps %xmm3, 2 * SIZE(CO2) movhps %xmm3, 3 * SIZE(CO2) addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 decq I BRANCH jg 
.L51 ALIGN_4 .L60: testq $1, M BRANCH jle .L69 ALIGN_4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -16 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_3 .L62: mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -13 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -10 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movddup -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movaps -8 * SIZE(BO), %xmm1 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L62 ALIGN_3 .L65: movups ALPHA_R, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_3 .L66: mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_4 .L68: addpd %xmm9, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhps 1 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm1 movhps 1 * SIZE(CO2), %xmm1 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm8 addpd %xmm8, %xmm1 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO2) movhps %xmm1, 1 * SIZE(CO2) ALIGN_4 .L69: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif movq BO, B leaq (C, LDC, 2), C ALIGN_4 .L70: testq $1, N jle .L999 ALIGN_4 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 movq A, AO movq M, I sarq $1, I NOBRANCH jle .L80 ALIGN_4 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm8, %xmm8 prefetcht0 3 * SIZE(CO1) xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L75 ALIGN_3 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 movddup -16 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -14 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movddup -15 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -10 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movddup -13 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L72 addpd %xmm9, %xmm8 ALIGN_3 .L75: movups ALPHA_R, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) 
BRANCH je .L78 ALIGN_3 .L76: addpd %xmm1, %xmm8 movddup -16 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L76 ALIGN_4 .L78: addpd %xmm1, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhps 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhps 3 * SIZE(CO1), %xmm1 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm8 addpd %xmm8, %xmm1 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movlps %xmm1, 2 * SIZE(CO1) movhps %xmm1, 3 * SIZE(CO1) addq $4 * SIZE, CO1 decq I BRANCH jg .L71 ALIGN_4 .L80: testq $1, M BRANCH jle .L999 ALIGN_4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifndef TRMMKERNEL movaps -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -16 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 #else movsd -16 * SIZE(AO), %xmm0 movhpd -15 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movsd -16 * SIZE(BO), %xmm1 movhpd -15 * SIZE(BO), %xmm1 xorps %xmm9, %xmm9 #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L85 ALIGN_3 .L82: mulpd %xmm0, %xmm1 #ifndef TRMMKERNEL movapd -14 * SIZE(AO), %xmm0 #else movsd -14 * SIZE(AO), %xmm0 movhpd -13 * SIZE(AO), %xmm0 #endif addpd %xmm1, %xmm8 #ifndef TRMMKERNEL movapd -14 * SIZE(BO), %xmm1 #else movsd -14 * SIZE(BO), %xmm1 movhpd -13 * SIZE(BO), %xmm1 #endif mulpd %xmm0, %xmm1 #ifndef TRMMKERNEL movapd -12 * SIZE(AO), %xmm0 #else movsd -12 * SIZE(AO), %xmm0 movhpd -11 * SIZE(AO), %xmm0 #endif addpd %xmm1, %xmm9 #ifndef TRMMKERNEL movapd -12 * SIZE(BO), %xmm1 #else movsd -12 * SIZE(BO), %xmm1 movhpd -11 * SIZE(BO), %xmm1 #endif subq $-4 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L82 addpd %xmm9, %xmm8 ALIGN_3 .L85: movups ALPHA_R, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L88 ALIGN_3 .L86: mulsd %xmm0, %xmm1 movsd -15 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd -15 * SIZE(BO), %xmm1 addq $1 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L86 ALIGN_4 .L88: haddpd %xmm8, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhps 1 * SIZE(CO1), %xmm0 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm3m_kernel_4x2_atom.S000066400000000000000000000542301313527062700221460ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %r13 #define BO %r14 #define CO1 %r15 #define CO2 %rbx #define BB %rbp #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define ALPHA_R 48(%rsp) #define ALPHA_I 56(%rsp) #define OFFSET 64(%rsp) #define KKK 72(%rsp) #define KK 80(%rsp) #else #define STACKSIZE 512 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define ALPHA_R 224(%rsp) #define ALPHA_I 232(%rsp) #define OFFSET 240(%rsp) #define KK 248(%rsp) #define KKK 256(%rsp) #endif #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 8 + 3) #if defined(OS_LINUX) && defined(CORE_BARCELONA) .align 32768 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC movaps %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #else movq OLD_LDC, LDC #endif movsd %xmm0, ALPHA_R movsd %xmm1, ALPHA_I salq $ZBASE_SHIFT, LDC movq N, J sarq $1, J jle .L40 ALIGN_4 .L10: movq C, CO1 leaq (C, LDC, 1), CO2 leaq (C, LDC, 2), C movq A, AO movq 
K, %rax salq $BASE_SHIFT + 1, %rax leaq (B, %rax), BB movq M, I sarq $2, I jle .L20 ALIGN_4 .L11: movq B, BO prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd 1 * SIZE(AO), %xmm4 xorps %xmm5, %xmm5 movsd 2 * SIZE(AO), %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movsd 1 * SIZE(BO), %xmm3 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 prefetcht0 3 * SIZE(CO1) xorps %xmm12, %xmm12 xorps %xmm13, %xmm13 prefetcht0 3 * SIZE(CO2) xorps %xmm14, %xmm14 xorps %xmm15, %xmm15 movq K, %rax sarq $2, %rax je .L15 ALIGN_4 .L12: addsd %xmm2, %xmm13 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm15 PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO) movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 2 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addsd %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 7 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 8 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm4, %xmm10 movsd 9 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 4 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 10 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 5 * SIZE(BO), %xmm3 addsd %xmm2, %xmm13 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 11 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 12 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm4, %xmm10 movsd 13 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 6 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 14 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 7 * SIZE(BO), %xmm3 addsd %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 15 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 subq $-16 * SIZE, AO addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 0 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addq $ 8 * SIZE, BO addsd %xmm4, %xmm10 movsd 1 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 decq %rax addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 0 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 2 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 1 * SIZE(BO), %xmm3 jne .L12 ALIGN_4 .L15: movq K, %rax andq $3, %rax BRANCH BRANCH je .L19 ALIGN_4 .L16: addsd %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 2 * SIZE(BO), %xmm1 addsd %xmm5, %xmm12 movsd 
6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addq $4 * SIZE, AO addq $2 * SIZE, BO decq %rax BRANCH jg .L16 ALIGN_4 .L19: movsd ALPHA_R, %xmm4 addsd %xmm2, %xmm13 movsd ALPHA_I, %xmm5 addsd %xmm7, %xmm14 addsd %xmm6, %xmm15 movaps %xmm8, %xmm0 movaps %xmm10, %xmm1 movaps %xmm12, %xmm2 movaps %xmm14, %xmm3 mulsd %xmm4, %xmm8 mulsd %xmm5, %xmm0 mulsd %xmm4, %xmm10 mulsd %xmm5, %xmm1 mulsd %xmm4, %xmm12 mulsd %xmm5, %xmm2 mulsd %xmm4, %xmm14 mulsd %xmm5, %xmm3 addsd 0 * SIZE(CO1), %xmm8 addsd 1 * SIZE(CO1), %xmm0 addsd 2 * SIZE(CO1), %xmm10 addsd 3 * SIZE(CO1), %xmm1 addsd 4 * SIZE(CO1), %xmm12 addsd 5 * SIZE(CO1), %xmm2 addsd 6 * SIZE(CO1), %xmm14 addsd 7 * SIZE(CO1), %xmm3 movsd %xmm8, 0 * SIZE(CO1) movsd %xmm0, 1 * SIZE(CO1) movsd %xmm10, 2 * SIZE(CO1) movsd %xmm1, 3 * SIZE(CO1) movsd %xmm12, 4 * SIZE(CO1) movsd %xmm2, 5 * SIZE(CO1) movsd %xmm14, 6 * SIZE(CO1) movsd %xmm3, 7 * SIZE(CO1) movaps %xmm9, %xmm0 movaps %xmm11, %xmm1 movaps %xmm13, %xmm2 movaps %xmm15, %xmm3 mulsd %xmm4, %xmm9 mulsd %xmm5, %xmm0 mulsd %xmm4, %xmm11 mulsd %xmm5, %xmm1 mulsd %xmm4, %xmm13 mulsd %xmm5, %xmm2 mulsd %xmm4, %xmm15 mulsd %xmm5, %xmm3 addsd 0 * SIZE(CO2), %xmm9 addsd 1 * SIZE(CO2), %xmm0 addsd 2 * SIZE(CO2), %xmm11 addsd 3 * SIZE(CO2), %xmm1 addsd 4 * SIZE(CO2), %xmm13 addsd 5 * SIZE(CO2), %xmm2 addsd 6 * SIZE(CO2), %xmm15 addsd 7 * SIZE(CO2), %xmm3 movsd %xmm9, 0 * SIZE(CO2) movsd %xmm0, 1 * SIZE(CO2) movsd %xmm11, 2 * SIZE(CO2) movsd %xmm1, 3 * SIZE(CO2) movsd %xmm13, 4 * SIZE(CO2) movsd %xmm2, 5 * SIZE(CO2) movsd %xmm15, 6 * SIZE(CO2) movsd %xmm3, 7 * SIZE(CO2) addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 decq I # i -- jg .L11 ALIGN_4 .L20: testq $2, M jle .L30 movq B, BO movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd 1 * SIZE(AO), %xmm4 xorps %xmm5, %xmm5 movsd 2 * SIZE(AO), %xmm5 xorps %xmm6, %xmm6 movsd 3 * SIZE(AO), %xmm7 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movsd 1 * SIZE(BO), %xmm3 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 movq K, %rax sarq $2, %rax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 2 * SIZE(BO), %xmm1 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm2 addsd %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 4 * SIZE(BO), %xmm1 addsd %xmm5, %xmm8 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm2 addsd %xmm7, %xmm10 movsd 7 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm6 movsd 5 * SIZE(BO), %xmm3 addsd %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 6 * SIZE(BO), %xmm1 addsd %xmm0, %xmm8 movsd 8 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm2 addsd %xmm4, %xmm10 movsd 9 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm6 movsd 7 * SIZE(BO), %xmm3 addsd %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addsd %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 8 * SIZE(BO), %xmm1 addsd %xmm5, %xmm8 movsd 10 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm2 addsd %xmm7, %xmm10 movsd 11 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm6 movsd 9 * SIZE(BO), %xmm3 addq $8 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: movq K, %rax movsd ALPHA_R, %xmm5 movsd ALPHA_I, %xmm7 andq $3, %rax BRANCH BRANCH je .L29 ALIGN_4 .L26: addsd %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 addsd %xmm6, %xmm11 movaps %xmm4, %xmm6 
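# Annotation: .L26 is the K mod 4 tail for this 2x2 tile of the atom kernel; plain scalar
# SSE2 multiply/adds, with the four running sums kept in %xmm8-%xmm11 until the alpha
# scaling at .L29.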
mulsd %xmm1, %xmm4 movsd 2 * SIZE(BO), %xmm1 mulsd %xmm3, %xmm2 addsd %xmm0, %xmm8 movsd 2 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addsd %xmm4, %xmm10 movsd 3 * SIZE(AO), %xmm4 addq $2 * SIZE, AO addq $2 * SIZE, BO decq %rax BRANCH jg .L26 ALIGN_4 .L29: addsd %xmm2, %xmm9 addsd %xmm6, %xmm11 movaps %xmm8, %xmm12 movaps %xmm10, %xmm13 movaps %xmm9, %xmm14 movaps %xmm11, %xmm15 mulsd %xmm5, %xmm8 mulsd %xmm7, %xmm12 mulsd %xmm5, %xmm10 mulsd %xmm7, %xmm13 mulsd %xmm5, %xmm9 mulsd %xmm7, %xmm14 mulsd %xmm5, %xmm11 mulsd %xmm7, %xmm15 addsd 0 * SIZE(CO1), %xmm8 addsd 1 * SIZE(CO1), %xmm12 addsd 2 * SIZE(CO1), %xmm10 addsd 3 * SIZE(CO1), %xmm13 addsd 0 * SIZE(CO2), %xmm9 addsd 1 * SIZE(CO2), %xmm14 addsd 2 * SIZE(CO2), %xmm11 addsd 3 * SIZE(CO2), %xmm15 movsd %xmm8, 0 * SIZE(CO1) movsd %xmm12, 1 * SIZE(CO1) movsd %xmm10, 2 * SIZE(CO1) movsd %xmm13, 3 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movsd %xmm14, 1 * SIZE(CO2) movsd %xmm11, 2 * SIZE(CO2) movsd %xmm15, 3 * SIZE(CO2) addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 ALIGN_4 .L30: testq $1, M je .L39 ALIGN_4 movq B, BO movsd 0 * SIZE(AO), %xmm0 xorps %xmm7, %xmm7 movsd 1 * SIZE(AO), %xmm2 xorps %xmm5, %xmm5 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movsd 1 * SIZE(BO), %xmm3 movq K, %rax sarq $2, %rax je .L35 ALIGN_4 .L32: addsd %xmm5, %xmm8 movsd 2 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm7, %xmm9 movsd 3 * SIZE(BO), %xmm7 mulsd %xmm0, %xmm3 movsd 2 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd 4 * SIZE(BO), %xmm1 mulsd %xmm2, %xmm5 addsd %xmm3, %xmm9 movsd 5 * SIZE(BO), %xmm3 mulsd %xmm2, %xmm7 movsd 3 * SIZE(AO), %xmm2 addsd %xmm5, %xmm8 movsd 6 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm1 addsd %xmm7, %xmm9 movsd 7 * SIZE(BO), %xmm7 mulsd %xmm0, %xmm3 movsd 4 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd 8 * SIZE(BO), %xmm1 mulsd %xmm2, %xmm5 addsd %xmm3, %xmm9 movsd 9 * SIZE(BO), %xmm3 mulsd %xmm2, %xmm7 movsd 5 * SIZE(AO), %xmm2 addq $4 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: movq K, %rax addsd %xmm5, %xmm8 addsd %xmm7, %xmm9 movsd ALPHA_R, %xmm6 movsd ALPHA_I, %xmm7 andq $3, %rax BRANCH BRANCH je .L38 ALIGN_4 .L36: mulsd %xmm0, %xmm1 addq $2 * SIZE, BO mulsd %xmm0, %xmm3 movsd 1 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd 0 * SIZE(BO), %xmm1 addsd %xmm3, %xmm9 movsd 1 * SIZE(BO), %xmm3 addq $1 * SIZE, AO decq %rax BRANCH jg .L36 ALIGN_4 .L38: movaps %xmm8, %xmm10 movaps %xmm9, %xmm11 mulsd %xmm6, %xmm8 mulsd %xmm7, %xmm10 mulsd %xmm6, %xmm9 mulsd %xmm7, %xmm11 addsd 0 * SIZE(CO1), %xmm8 addsd 1 * SIZE(CO1), %xmm10 addsd 0 * SIZE(CO2), %xmm9 addsd 1 * SIZE(CO2), %xmm11 movsd %xmm8, 0 * SIZE(CO1) movsd %xmm10, 1 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movsd %xmm11, 1 * SIZE(CO2) ALIGN_4 .L39: movq BO, B decq J # j -- jg .L10 ALIGN_4 .L40: testq $1, N je .L999 movq C, CO1 addq LDC, C movq A, AO movq M, I sarq $2, I jle .L50 ALIGN_4 .L41: movq B, BO movsd 0 * SIZE(AO), %xmm0 xorps %xmm9, %xmm9 movsd 1 * SIZE(AO), %xmm1 xorps %xmm11, %xmm11 movsd 2 * SIZE(AO), %xmm2 xorps %xmm13, %xmm13 movsd 3 * SIZE(AO), %xmm3 xorps %xmm15, %xmm15 movsd 0 * SIZE(BO), %xmm4 xorps %xmm8, %xmm8 movsd 1 * SIZE(BO), %xmm5 xorps %xmm10, %xmm10 prefetcht0 7 * SIZE(CO1) xorps %xmm12, %xmm12 xorps %xmm14, %xmm14 movq K, %rax sarq $2, %rax je .L45 ALIGN_4 .L42: addsd %xmm9, %xmm8 movsd 4 * SIZE(AO), %xmm9 mulsd %xmm4, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm11, %xmm10 movsd 5 * SIZE(AO), %xmm11 mulsd %xmm4, %xmm1 addsd %xmm13, %xmm12 movsd 6 * SIZE(AO), %xmm13 
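# Annotation: software-pipelined 4x1 inner loop; the next A values are preloaded into
# %xmm9/%xmm11/%xmm13/%xmm15 while the previously formed products are folded into the
# accumulators %xmm8/%xmm10/%xmm12/%xmm14.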
mulsd %xmm4, %xmm2 addsd %xmm15, %xmm14 movsd 7 * SIZE(AO), %xmm15 mulsd %xmm4, %xmm3 movsd 2 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 8 * SIZE(AO), %xmm0 mulsd %xmm5, %xmm9 addsd %xmm1, %xmm10 movsd 9 * SIZE(AO), %xmm1 mulsd %xmm5, %xmm11 addsd %xmm2, %xmm12 movsd 10 * SIZE(AO), %xmm2 mulsd %xmm5, %xmm13 addsd %xmm3, %xmm14 movsd 11 * SIZE(AO), %xmm3 mulsd %xmm5, %xmm15 movsd 3 * SIZE(BO), %xmm5 addsd %xmm9, %xmm8 movsd 12 * SIZE(AO), %xmm9 mulsd %xmm4, %xmm0 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) addsd %xmm11, %xmm10 movsd 13 * SIZE(AO), %xmm11 mulsd %xmm4, %xmm1 addsd %xmm13, %xmm12 movsd 14 * SIZE(AO), %xmm13 mulsd %xmm4, %xmm2 addsd %xmm15, %xmm14 movsd 15 * SIZE(AO), %xmm15 mulsd %xmm4, %xmm3 movsd 4 * SIZE(BO), %xmm4 subq $-16 * SIZE, AO addsd %xmm0, %xmm8 movsd 0 * SIZE(AO), %xmm0 mulsd %xmm5, %xmm9 addsd %xmm1, %xmm10 movsd 1 * SIZE(AO), %xmm1 mulsd %xmm5, %xmm11 addq $ 4 * SIZE, BO addsd %xmm2, %xmm12 movsd 2 * SIZE(AO), %xmm2 mulsd %xmm5, %xmm13 decq %rax addsd %xmm3, %xmm14 movsd 3 * SIZE(AO), %xmm3 mulsd %xmm5, %xmm15 movsd 1 * SIZE(BO), %xmm5 jne .L42 ALIGN_4 .L45: movq K, %rax movsd ALPHA_R, %xmm6 movsd ALPHA_I, %xmm7 addsd %xmm9, %xmm8 addsd %xmm11, %xmm10 addsd %xmm13, %xmm12 addsd %xmm15, %xmm14 andq $3, %rax BRANCH BRANCH je .L49 ALIGN_4 .L46: mulsd %xmm4, %xmm0 mulsd %xmm4, %xmm1 mulsd %xmm4, %xmm2 mulsd %xmm4, %xmm3 movsd 1 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 addsd %xmm1, %xmm10 movsd 5 * SIZE(AO), %xmm1 addsd %xmm2, %xmm12 movsd 6 * SIZE(AO), %xmm2 addsd %xmm3, %xmm14 movsd 7 * SIZE(AO), %xmm3 addq $4 * SIZE, AO addq $1 * SIZE, BO decq %rax BRANCH jg .L46 ALIGN_4 .L49: movaps %xmm8, %xmm9 movaps %xmm10, %xmm11 movaps %xmm12, %xmm13 movaps %xmm14, %xmm15 mulsd %xmm6, %xmm8 mulsd %xmm7, %xmm9 mulsd %xmm6, %xmm10 mulsd %xmm7, %xmm11 mulsd %xmm6, %xmm12 mulsd %xmm7, %xmm13 mulsd %xmm6, %xmm14 mulsd %xmm7, %xmm15 addsd 0 * SIZE(CO1), %xmm8 addsd 1 * SIZE(CO1), %xmm9 addsd 2 * SIZE(CO1), %xmm10 addsd 3 * SIZE(CO1), %xmm11 addsd 4 * SIZE(CO1), %xmm12 addsd 5 * SIZE(CO1), %xmm13 addsd 6 * SIZE(CO1), %xmm14 addsd 7 * SIZE(CO1), %xmm15 movsd %xmm8, 0 * SIZE(CO1) movsd %xmm9, 1 * SIZE(CO1) movsd %xmm10, 2 * SIZE(CO1) movsd %xmm11, 3 * SIZE(CO1) movsd %xmm12, 4 * SIZE(CO1) movsd %xmm13, 5 * SIZE(CO1) movsd %xmm14, 6 * SIZE(CO1) movsd %xmm15, 7 * SIZE(CO1) addq $8 * SIZE, CO1 decq I # i -- jg .L41 ALIGN_4 .L50: testq $2, M jle .L60 movq B, BO movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd 1 * SIZE(AO), %xmm1 xorps %xmm3, %xmm3 movsd 0 * SIZE(BO), %xmm4 xorps %xmm8, %xmm8 movsd 1 * SIZE(BO), %xmm5 xorps %xmm10, %xmm10 movq K, %rax sarq $2, %rax je .L55 ALIGN_4 .L52: addsd %xmm2, %xmm8 movsd 2 * SIZE(AO), %xmm2 mulsd %xmm4, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm3, %xmm10 movsd 3 * SIZE(AO), %xmm3 mulsd %xmm4, %xmm1 movsd 2 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm5, %xmm2 addq $8 * SIZE, AO addsd %xmm1, %xmm10 movsd -3 * SIZE(AO), %xmm1 mulsd %xmm5, %xmm3 movsd 3 * SIZE(BO), %xmm5 addsd %xmm2, %xmm8 movsd -2 * SIZE(AO), %xmm2 mulsd %xmm4, %xmm0 addq $4 * SIZE, BO addsd %xmm3, %xmm10 movsd -1 * SIZE(AO), %xmm3 mulsd %xmm4, %xmm1 movsd 0 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 0 * SIZE(AO), %xmm0 mulsd %xmm5, %xmm2 decq %rax addsd %xmm1, %xmm10 movsd 1 * SIZE(AO), %xmm1 mulsd %xmm5, %xmm3 movsd 1 * SIZE(BO), %xmm5 jne .L52 ALIGN_4 .L55: movq K, %rax movsd ALPHA_R, %xmm6 movsd ALPHA_I, %xmm7 addsd %xmm2, %xmm8 addsd %xmm3, %xmm10 andq $3, %rax BRANCH BRANCH je .L59 ALIGN_4 .L56: mulsd %xmm4, %xmm0 
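# Annotation: .L56 handles the K mod 4 remainder of the 2x1 tile, accumulating the two
# dot products in %xmm8 and %xmm10 before the ALPHA_R/ALPHA_I scaling at .L59.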
mulsd %xmm4, %xmm1 movsd 1 * SIZE(BO), %xmm4 addsd %xmm0, %xmm8 movsd 2 * SIZE(AO), %xmm0 addsd %xmm1, %xmm10 movsd 3 * SIZE(AO), %xmm1 addq $2 * SIZE, AO addq $1 * SIZE, BO decq %rax BRANCH jg .L56 ALIGN_4 .L59: movaps %xmm8, %xmm9 movaps %xmm10, %xmm11 mulsd %xmm6, %xmm8 mulsd %xmm7, %xmm9 mulsd %xmm6, %xmm10 mulsd %xmm7, %xmm11 addsd 0 * SIZE(CO1), %xmm8 addsd 1 * SIZE(CO1), %xmm9 addsd 2 * SIZE(CO1), %xmm10 addsd 3 * SIZE(CO1), %xmm11 movsd %xmm8, 0 * SIZE(CO1) movsd %xmm9, 1 * SIZE(CO1) movsd %xmm10, 2 * SIZE(CO1) movsd %xmm11, 3 * SIZE(CO1) addq $4 * SIZE, CO1 ALIGN_4 .L60: testq $1, M je .L999 ALIGN_4 movq B, BO movsd 0 * SIZE(AO), %xmm0 xorps %xmm5, %xmm5 movsd 1 * SIZE(AO), %xmm2 xorps %xmm7, %xmm7 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 movsd 1 * SIZE(BO), %xmm3 xorps %xmm9, %xmm9 movsd 2 * SIZE(AO), %xmm4 movsd 3 * SIZE(AO), %xmm6 movq K, %rax sarq $2, %rax je .L65 ALIGN_4 .L62: addsd %xmm5, %xmm8 movsd 2 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm1 movsd 4 * SIZE(AO), %xmm0 addsd %xmm7, %xmm9 movsd 3 * SIZE(BO), %xmm7 mulsd %xmm2, %xmm3 movsd 5 * SIZE(AO), %xmm2 addsd %xmm1, %xmm8 movsd 4 * SIZE(BO), %xmm1 mulsd %xmm4, %xmm5 movsd 6 * SIZE(AO), %xmm4 addsd %xmm3, %xmm9 movsd 5 * SIZE(BO), %xmm3 mulsd %xmm6, %xmm7 movsd 7 * SIZE(AO), %xmm6 addq $4 * SIZE, AO addq $4 * SIZE, BO decq %rax jne .L62 addsd %xmm5, %xmm8 addsd %xmm7, %xmm9 ALIGN_4 .L65: movq K, %rax movsd ALPHA_R, %xmm6 movsd ALPHA_I, %xmm7 andq $3, %rax BRANCH BRANCH je .L68 ALIGN_4 .L66: movsd 0 * SIZE(AO), %xmm0 movsd 0 * SIZE(BO), %xmm1 mulsd %xmm0, %xmm1 addsd %xmm1, %xmm8 addq $1 * SIZE, AO addq $1 * SIZE, BO decq %rax BRANCH jg .L66 ALIGN_4 .L68: addsd %xmm9, %xmm8 movaps %xmm8, %xmm9 mulsd %xmm6, %xmm8 mulsd %xmm7, %xmm9 addsd 0 * SIZE(CO1), %xmm8 addsd 1 * SIZE(CO1), %xmm9 movsd %xmm8, 0 * SIZE(CO1) movsd %xmm9, 1 * SIZE(CO1) ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm3m_kernel_4x4_barcelona.S000066400000000000000000001431101313527062700231320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define BUFFERED #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %r12 #define BB %rbp #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define ALPHA 0(%rsp) #define J 16(%rsp) #define OFFSET 24(%rsp) #define KK 32(%rsp) #define KKK 40(%rsp) #define BUFFER 512(%rsp) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 21 + 0) #define RPREFETCHSIZE (8 * 14 + 0) #define WPREFETCHSIZE (8 * 6 + 0) #define movlpd movsd #define movapd movups #define movupd movups #define KERNEL1(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm8 ;\ movapd %xmm2, %xmm0 ;\ addpd %xmm1, %xmm12 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %rax, 4) ;\ movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm10 ;\ movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ addpd %xmm1, %xmm14 ;\ movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 #define KERNEL2(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm8 ;\ movapd %xmm2, %xmm0 ;\ addpd %xmm1, %xmm12 ;\ movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm10 ;\ addpd %xmm1, %xmm14 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL3(xx) \ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm8 ;\ movddup (BO, %rax, 4), %xmm1 ;\ movapd %xmm2, %xmm4 ;\ addpd %xmm5, %xmm12 ;\ movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ 
mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm10 ;\ movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ addpd %xmm5, %xmm14 ;\ movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL4(xx) \ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ movapd (AO, %rax, 4), %xmm6 ;\ addpd %xmm4, %xmm8 ;\ movapd %xmm2, %xmm4 ;\ addpd %xmm5, %xmm12 ;\ movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm10 ;\ addpd %xmm5, %xmm14 ;\ movddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm6, %xmm2 #define KERNEL5(xx) \ mulpd %xmm1, %xmm6 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm6, %xmm8 ;\ movapd %xmm2, %xmm6 ;\ addpd %xmm1, %xmm12 ;\ movddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd 8 * SIZE(AO, %rax, 4), %xmm7 ;\ movapd %xmm6, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm6 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm6, %xmm10 ;\ movapd 4 * SIZE(AO, %rax, 4), %xmm6 ;\ addpd %xmm1, %xmm14 ;\ movddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm6, %xmm2 #define KERNEL6(xx) \ mulpd %xmm1, %xmm6 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm6, %xmm8 ;\ movapd %xmm2, %xmm6 ;\ addpd %xmm1, %xmm12 ;\ movddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm6, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm6 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm6, %xmm10 ;\ movapd 16 * SIZE(AO, %rax, 4), %xmm0 ;\ addpd %xmm1, %xmm14 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm7, %xmm2 #define KERNEL7(xx) \ mulpd %xmm5, %xmm7 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm7, %xmm8 ;\ movddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ movapd %xmm2, %xmm7 ;\ addpd %xmm5, %xmm12 ;\ movddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm7, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm7 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm7, %xmm10 ;\ movapd 12 * SIZE(AO, %rax, 4), %xmm7 ;\ addpd %xmm5, %xmm14 ;\ movddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm7, %xmm2 #define KERNEL8(xx) \ mulpd %xmm5, %xmm7 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm7, %xmm8 ;\ movapd %xmm2, %xmm7 ;\ addpd %xmm5, 
%xmm12 ;\ movddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm7, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm7 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm7, %xmm10 ;\ movapd 24 * SIZE(AO, %rax, 4), %xmm4 ;\ addpd %xmm5, %xmm14 ;\ movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 ;\ addq $8 * SIZE, %rax #define KERNEL_SUB1(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm8 ;\ movapd %xmm2, %xmm0 ;\ addpd %xmm1, %xmm12 ;\ movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm10 ;\ movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ addpd %xmm1, %xmm14 ;\ movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 #define KERNEL_SUB2(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm8 ;\ movapd %xmm2, %xmm0 ;\ addpd %xmm1, %xmm12 ;\ movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ addpd %xmm0, %xmm10 ;\ movapd (AO, %rax, 4), %xmm0 ;\ addpd %xmm1, %xmm14 ;\ movddup (BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL_SUB3(xx) \ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm8 ;\ movapd %xmm2, %xmm4 ;\ addpd %xmm5, %xmm12 ;\ movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm10 ;\ movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ addpd %xmm5, %xmm14 ;\ movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL_SUB4(xx) \ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm8 ;\ movapd %xmm2, %xmm4 ;\ addpd %xmm5, %xmm12 ;\ movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ addpd %xmm3, %xmm13 ;\ movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ addpd %xmm4, %xmm10 ;\ addpd %xmm5, %xmm14 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ addpd %xmm2, %xmm11 ;\ addpd %xmm3, %xmm15 ;\ movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 #ifndef __APPLE__ .align 512 #endif #if defined(OS_LINUX) && defined(CORE_BARCELONA) .align 32768 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq 
%rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif movaps %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif #endif movq %rsp, %rbx # save old stack subq $1024 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq OLD_M, M movq OLD_N, N subq $-16 * SIZE, A #ifndef BUFFERED subq $-16 * SIZE, B #endif movsd %xmm0, 0 + ALPHA movsd %xmm1, 8 + ALPHA salq $ZBASE_SHIFT, LDC #ifdef TRMMKERNEL movsd %xmm12, OFFSET movsd %xmm12, KK #ifndef LEFT negq KK #endif #endif movq N, J sarq $2, J # j = (n >> 2) jle .L40 ALIGN_4 .L01: #ifdef BUFFERED leaq 16 * SIZE + BUFFER, BO #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif #ifdef BUFFERED movq K, %rax sarq $2, %rax jle .L03 ALIGN_3 .L02: prefetch (RPREFETCHSIZE + 0) * SIZE(B) movaps (B), %xmm0 movaps 2 * SIZE(B), %xmm1 movaps %xmm0, -16 * SIZE(BO) movaps %xmm1, -14 * SIZE(BO) prefetch (RPREFETCHSIZE + 8) * SIZE(B) movaps 4 * SIZE(B), %xmm2 movaps 6 * SIZE(B), %xmm3 movaps %xmm2, -12 * SIZE(BO) movaps %xmm3, -10 * SIZE(BO) prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) movaps 8 * SIZE(B), %xmm4 movaps 10 * SIZE(B), %xmm5 movaps %xmm4, -8 * SIZE(BO) movaps %xmm5, -6 * SIZE(BO) prefetchw (WPREFETCHSIZE + 8) * SIZE(BO) movaps 12 * SIZE(B), %xmm6 movaps 14 * SIZE(B), %xmm7 movaps %xmm6, -4 * SIZE(BO) movaps %xmm7, -2 * SIZE(BO) subq $-16 * SIZE, BO subq $-16 * SIZE, B subq $1, %rax jne .L02 ALIGN_3 .L03: movq K, %rax andq $3, %rax BRANCH jle .L10 ALIGN_3 .L04: movaps (B), %xmm0 movaps %xmm0, -16 * SIZE(BO) movaps 2 * SIZE(B), %xmm1 movaps %xmm1, -14 * SIZE(BO) addq $4 * SIZE, B addq $4 * SIZE, BO subq $1, %rax jne .L04 ALIGN_4 .L10: #endif movq A, AO # aoffset = a movq B, BB movq M, I sarq $2, I # i = (m >> 2) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #ifdef BUFFERED leaq 16 * SIZE + BUFFER, BO #else movq B, BO #endif #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif prefetch (RPREFETCHSIZE + 0) * SIZE(BB) prefetch (RPREFETCHSIZE + 8) * SIZE(BB) prefetch (RPREFETCHSIZE + 16) * SIZE(BB) subq $-16 * SIZE, BB movapd -16 * SIZE(AO), %xmm0 movddup -16 * SIZE(BO), %xmm1 pxor %xmm8, %xmm8 movddup -15 * SIZE(BO), %xmm3 pxor %xmm9, %xmm9 movapd -8 * SIZE(AO), %xmm4 pxor %xmm10, %xmm10 movddup -8 * SIZE(BO), %xmm5 pxor %xmm11, %xmm11 prefetchw 7 * SIZE(CO1) pxor %xmm12, %xmm12 prefetchw 7 * SIZE(CO2) pxor %xmm13, %xmm13 prefetchw 7 * SIZE(CO1, LDC, 2) pxor %xmm14, %xmm14 prefetchw 7 * SIZE(CO2, LDC, 2) pxor %xmm15, %xmm15 movapd %xmm0, %xmm2 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq 
%rax, KKK #endif andq $-8, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO negq %rax NOBRANCH je .L15 ALIGN_4 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) BRANCH jl .L12 ALIGN_4 .L15: movapd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif testq $4, %rax je .L16 xorq %rax, %rax ALIGN_4 KERNEL_SUB1(16 * 0) KERNEL_SUB2(16 * 0) KERNEL_SUB3(16 * 0) KERNEL_SUB4(16 * 0) subq $-16 * SIZE, BO subq $-16 * SIZE, AO ALIGN_4 .L16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L19 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO negq %rax ALIGN_4 .L17: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd %xmm2, %xmm0 addpd %xmm1, %xmm12 movddup -14 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm3, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm2, %xmm9 movapd %xmm0, %xmm2 addpd %xmm3, %xmm13 movddup -13 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm10 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm14 movddup -12 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm3, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm2, %xmm11 addpd %xmm3, %xmm15 movddup -11 * SIZE(BO, %rax, 4), %xmm3 movapd %xmm0, %xmm2 addq $SIZE, %rax jl .L17 ALIGN_4 .L19: movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 movsd 4 * SIZE(CO1), %xmm2 movhpd 5 * SIZE(CO1), %xmm2 movsd 6 * SIZE(CO1), %xmm3 movhpd 7 * SIZE(CO1), %xmm3 movddup %xmm8, %xmm4 unpckhpd %xmm8, %xmm8 movddup %xmm12, %xmm5 unpckhpd %xmm12, %xmm12 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm5 mulpd %xmm7, %xmm12 addpd %xmm4, %xmm0 addpd %xmm8, %xmm1 addpd %xmm5, %xmm2 addpd %xmm12, %xmm3 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 4 * SIZE(CO1) movhpd %xmm2, 5 * SIZE(CO1) movsd %xmm3, 6 * SIZE(CO1) movhpd %xmm3, 7 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 movhpd 3 * SIZE(CO2), %xmm1 movsd 4 * SIZE(CO2), %xmm2 movhpd 5 * SIZE(CO2), %xmm2 movsd 6 * SIZE(CO2), %xmm3 movhpd 7 * SIZE(CO2), %xmm3 movddup %xmm9, %xmm4 unpckhpd %xmm9, %xmm9 movddup %xmm13, %xmm5 unpckhpd %xmm13, %xmm13 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm9 mulpd %xmm7, %xmm5 mulpd %xmm7, %xmm13 addpd %xmm4, %xmm0 addpd %xmm9, %xmm1 
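# Annotation: GEMM3M update. ALPHA on the stack holds the pair {alpha_r, alpha_i} in %xmm7,
# so broadcasting each real accumulator (movddup / unpckhpd) and multiplying by %xmm7 yields
# the real and imaginary contributions that are added to the interleaved complex entries of C.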
addpd %xmm5, %xmm2 addpd %xmm13, %xmm3 movsd %xmm0, 0 * SIZE(CO2) movhpd %xmm0, 1 * SIZE(CO2) movsd %xmm1, 2 * SIZE(CO2) movhpd %xmm1, 3 * SIZE(CO2) movsd %xmm2, 4 * SIZE(CO2) movhpd %xmm2, 5 * SIZE(CO2) movsd %xmm3, 6 * SIZE(CO2) movhpd %xmm3, 7 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 movsd 2 * SIZE(CO1, LDC, 2), %xmm1 movhpd 3 * SIZE(CO1, LDC, 2), %xmm1 movsd 4 * SIZE(CO1, LDC, 2), %xmm2 movhpd 5 * SIZE(CO1, LDC, 2), %xmm2 movsd 6 * SIZE(CO1, LDC, 2), %xmm3 movhpd 7 * SIZE(CO1, LDC, 2), %xmm3 movddup %xmm10, %xmm4 unpckhpd %xmm10, %xmm10 movddup %xmm14, %xmm5 unpckhpd %xmm14, %xmm14 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm10 mulpd %xmm7, %xmm5 mulpd %xmm7, %xmm14 addpd %xmm4, %xmm0 addpd %xmm10, %xmm1 addpd %xmm5, %xmm2 addpd %xmm14, %xmm3 movsd %xmm0, 0 * SIZE(CO1, LDC, 2) movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) movsd %xmm1, 2 * SIZE(CO1, LDC, 2) movhpd %xmm1, 3 * SIZE(CO1, LDC, 2) movsd %xmm2, 4 * SIZE(CO1, LDC, 2) movhpd %xmm2, 5 * SIZE(CO1, LDC, 2) movsd %xmm3, 6 * SIZE(CO1, LDC, 2) movhpd %xmm3, 7 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 movsd 2 * SIZE(CO2, LDC, 2), %xmm1 movhpd 3 * SIZE(CO2, LDC, 2), %xmm1 movsd 4 * SIZE(CO2, LDC, 2), %xmm2 movhpd 5 * SIZE(CO2, LDC, 2), %xmm2 movsd 6 * SIZE(CO2, LDC, 2), %xmm3 movhpd 7 * SIZE(CO2, LDC, 2), %xmm3 movddup %xmm11, %xmm4 unpckhpd %xmm11, %xmm11 movddup %xmm15, %xmm5 unpckhpd %xmm15, %xmm15 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm11 mulpd %xmm7, %xmm5 mulpd %xmm7, %xmm15 addpd %xmm4, %xmm0 addpd %xmm11, %xmm1 addpd %xmm5, %xmm2 addpd %xmm15, %xmm3 movsd %xmm0, 0 * SIZE(CO2, LDC, 2) movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) movsd %xmm1, 2 * SIZE(CO2, LDC, 2) movhpd %xmm1, 3 * SIZE(CO2, LDC, 2) movsd %xmm2, 4 * SIZE(CO2, LDC, 2) movhpd %xmm2, 5 * SIZE(CO2, LDC, 2) movsd %xmm3, 6 * SIZE(CO2, LDC, 2) movhpd %xmm3, 7 * SIZE(CO2, LDC, 2) addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- BRANCH jg .L11 ALIGN_4 .L20: testq $3, M je .L39 testq $2, M je .L30 ALIGN_4 .L21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #ifdef BUFFERED leaq 16 * SIZE + BUFFER, BO #else movq B, BO #endif #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd -12 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movddup -16 * SIZE(BO), %xmm1 pxor %xmm10, %xmm10 movddup -15 * SIZE(BO), %xmm5 pxor %xmm11, %xmm11 movddup -8 * SIZE(BO), %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO negq %rax NOBRANCH je .L26 ALIGN_4 .L22: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 addpd %xmm5, %xmm9 movddup -13 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movddup -12 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 movapd -14 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm5, %xmm11 movddup -11 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -10 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 addpd %xmm5, %xmm9 movddup -9 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movddup 
(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 movapd -8 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm5, %xmm11 movddup -7 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm8 movddup -6 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 addpd %xmm5, %xmm9 movddup -5 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm10 movddup -4 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 movapd -10 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm5, %xmm11 movddup -3 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm8 movddup -2 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 addpd %xmm5, %xmm9 movddup -1 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm10 movddup 8 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 movapd -4 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm5, %xmm11 movddup 1 * SIZE(BO, %rax, 4), %xmm5 addq $4 * SIZE, %rax BRANCH jl .L22 ALIGN_4 .L26: movapd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L29 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO negq %rax ALIGN_4 .L27: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 addpd %xmm5, %xmm9 movddup -13 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movddup -12 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 movapd -14 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm5, %xmm11 movddup -11 * SIZE(BO, %rax, 4), %xmm5 addq $SIZE, %rax jl .L27 ALIGN_4 .L29: movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 movddup %xmm8, %xmm4 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm8 addpd %xmm4, %xmm0 addpd %xmm8, %xmm1 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 movhpd 3 * SIZE(CO2), %xmm1 movddup %xmm9, %xmm4 unpckhpd %xmm9, %xmm9 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm9 addpd %xmm4, %xmm0 addpd %xmm9, %xmm1 movsd %xmm0, 0 * SIZE(CO2) movhpd %xmm0, 1 * SIZE(CO2) movsd %xmm1, 2 * SIZE(CO2) movhpd %xmm1, 3 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 movsd 2 * SIZE(CO1, LDC, 2), %xmm1 movhpd 3 * SIZE(CO1, LDC, 2), %xmm1 movddup %xmm10, %xmm4 unpckhpd %xmm10, %xmm10 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm10 addpd %xmm4, %xmm0 addpd %xmm10, %xmm1 movsd %xmm0, 0 * SIZE(CO1, LDC, 2) movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) movsd %xmm1, 2 * SIZE(CO1, LDC, 2) movhpd %xmm1, 3 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 movsd 2 * SIZE(CO2, LDC, 2), %xmm1 movhpd 3 * SIZE(CO2, LDC, 2), %xmm1 movddup %xmm11, %xmm4 unpckhpd %xmm11, %xmm11 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm11 addpd %xmm4, %xmm0 addpd %xmm11, %xmm1 movsd %xmm0, 0 * SIZE(CO2, LDC, 2) movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) movsd %xmm1, 2 * SIZE(CO2, LDC, 2) movhpd %xmm1, 3 * SIZE(CO2, LDC, 2) addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 ALIGN_4 .L30: testq $1, M je .L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #ifdef BUFFERED leaq 16 * SIZE + BUFFER, BO #else movq B, BO #endif #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movddup -14 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movddup -15 * SIZE(AO), %xmm4 pxor %xmm10, %xmm10 movapd -16 * SIZE(BO), 
%xmm1 pxor %xmm11, %xmm11 movapd -8 * SIZE(BO), %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO negq %rax NOBRANCH je .L36 ALIGN_4 .L32: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BO, %rax, 4), %xmm0 addpd %xmm1, %xmm8 movapd -12 * SIZE(BO, %rax, 4), %xmm1 addpd %xmm0, %xmm9 movddup -12 * SIZE(AO, %rax, 1), %xmm0 mulpd %xmm4, %xmm1 mulpd -10 * SIZE(BO, %rax, 4), %xmm4 addpd %xmm1, %xmm10 movapd (BO, %rax, 4), %xmm1 addpd %xmm4, %xmm11 movddup -11 * SIZE(AO, %rax, 1), %xmm4 mulpd %xmm2, %xmm3 mulpd -6 * SIZE(BO, %rax, 4), %xmm2 addpd %xmm3, %xmm8 movapd -4 * SIZE(BO, %rax, 4), %xmm3 addpd %xmm2, %xmm9 movddup -13 * SIZE(AO, %rax, 1), %xmm2 mulpd %xmm2, %xmm3 mulpd -2 * SIZE(BO, %rax, 4), %xmm2 addpd %xmm3, %xmm10 movapd 8 * SIZE(BO, %rax, 4), %xmm3 addpd %xmm2, %xmm11 movddup -10 * SIZE(AO, %rax, 1), %xmm2 addq $4 * SIZE, %rax BRANCH jl .L32 ALIGN_4 .L36: movapd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L38 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO negq %rax ALIGN_4 .L37: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BO, %rax, 4), %xmm0 addpd %xmm1, %xmm8 movapd -12 * SIZE(BO, %rax, 4), %xmm1 addpd %xmm0, %xmm9 movddup -15 * SIZE(AO, %rax, 1), %xmm0 addq $SIZE, %rax jl .L37 ALIGN_4 .L38: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm8 addpd %xmm8, %xmm0 movsd %xmm0, 0 * SIZE(CO2) movhpd %xmm0, 1 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 movddup %xmm9, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movsd %xmm0, 0 * SIZE(CO1, LDC, 2) movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 unpckhpd %xmm9, %xmm9 mulpd %xmm7, %xmm9 addpd %xmm9, %xmm0 movsd %xmm0, 0 * SIZE(CO2, LDC, 2) movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif #ifndef BUFFERED movq BO, B #endif leaq (C, LDC, 4), C # c += 4 * ldc decq J # j -- jg .L01 ALIGN_4 .L40: testq $3, N je .L999 testq $2, N je .L80 ALIGN_4 .L41: #ifdef BUFFERED leaq 16 * SIZE + BUFFER, BO #endif #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif #ifdef BUFFERED movq K, %rax sarq $2, %rax jle .L43 ALIGN_4 .L42: prefetchnta (RPREFETCHSIZE + 0) * SIZE(B) movaps (B), %xmm0 movaps %xmm0, -16 * SIZE(BO) movaps 2 * SIZE(B), %xmm1 movaps %xmm1, -14 * SIZE(BO) prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) movaps 4 * SIZE(B), %xmm2 movaps %xmm2, -12 * SIZE(BO) movaps 6 * SIZE(B), %xmm3 movaps %xmm3, -10 * SIZE(BO) subq $-8 * SIZE, BO subq $-8 * SIZE, B subq $1, %rax jne .L42 ALIGN_4 .L43: movq K, %rax andq $3, %rax BRANCH jle .L50 ALIGN_4 .L44: movaps (B), %xmm0 movaps %xmm0, -16 * SIZE(BO) addq $2 * SIZE, B addq $2 * SIZE, BO subq $1, %rax jne .L44 ALIGN_4 .L50: #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a movq M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && 
defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #ifdef BUFFERED leaq 16 * SIZE + BUFFER, BO #else movq B, BO #endif #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif movddup -16 * SIZE(BO), %xmm1 movddup -15 * SIZE(BO), %xmm5 pxor %xmm8, %xmm8 movddup -12 * SIZE(BO), %xmm3 pxor %xmm9, %xmm9 movapd -16 * SIZE(AO), %xmm0 pxor %xmm12, %xmm12 movapd -8 * SIZE(AO), %xmm4 pxor %xmm13, %xmm13 prefetchw 7 * SIZE(CO1) movapd %xmm0, %xmm2 prefetchw 7 * SIZE(CO2) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO negq %rax NOBRANCH je .L56 ALIGN_4 .L52: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm12 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm5, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -13 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm0, %xmm2 mulpd %xmm1, %xmm0 mulpd -10 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd (AO, %rax, 4), %xmm0 addpd %xmm1, %xmm12 movddup -8 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm5, %xmm2 mulpd -10 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -11 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm4, %xmm2 mulpd %xmm3, %xmm4 mulpd -6 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm4, %xmm8 movapd -4 * SIZE(AO, %rax, 4), %xmm4 addpd %xmm3, %xmm12 movddup -10 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm5, %xmm2 mulpd -6 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -9 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm4, %xmm2 mulpd %xmm3, %xmm4 mulpd -2 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm4, %xmm8 movapd 8 * SIZE(AO, %rax, 4), %xmm4 addpd %xmm3, %xmm12 movddup -4 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm5, %xmm2 mulpd -2 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -7 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm0, %xmm2 addq $4 * SIZE, %rax BRANCH jl .L52 ALIGN_4 .L56: movapd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L59 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L57: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm12 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm5, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 addpd %xmm5, %xmm13 movddup -13 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm0, %xmm2 addq $SIZE, %rax jl .L57 ALIGN_4 .L59: movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 movsd 4 * SIZE(CO1), %xmm2 movhpd 5 * SIZE(CO1), %xmm2 movsd 6 * SIZE(CO1), %xmm3 movhpd 7 * SIZE(CO1), %xmm3 movddup %xmm8, %xmm4 unpckhpd %xmm8, %xmm8 movddup %xmm12, %xmm5 unpckhpd %xmm12, %xmm12 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm5 mulpd %xmm7, %xmm12 addpd %xmm4, %xmm0 addpd %xmm8, %xmm1 addpd %xmm5, %xmm2 addpd %xmm12, %xmm3 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 4 * SIZE(CO1) movhpd %xmm2, 5 * SIZE(CO1) movsd %xmm3, 6 * SIZE(CO1) movhpd %xmm3, 7 * SIZE(CO1) 
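# Annotation: the first column (CO1) of this 4x2 block has been written back; the same
# load / alpha-scale / store sequence now runs for the CO2 column.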
movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 movhpd 3 * SIZE(CO2), %xmm1 movsd 4 * SIZE(CO2), %xmm2 movhpd 5 * SIZE(CO2), %xmm2 movsd 6 * SIZE(CO2), %xmm3 movhpd 7 * SIZE(CO2), %xmm3 movddup %xmm9, %xmm4 unpckhpd %xmm9, %xmm9 movddup %xmm13, %xmm5 unpckhpd %xmm13, %xmm13 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm9 mulpd %xmm7, %xmm5 mulpd %xmm7, %xmm13 addpd %xmm4, %xmm0 addpd %xmm9, %xmm1 addpd %xmm5, %xmm2 addpd %xmm13, %xmm3 movsd %xmm0, 0 * SIZE(CO2) movhpd %xmm0, 1 * SIZE(CO2) movsd %xmm1, 2 * SIZE(CO2) movhpd %xmm1, 3 * SIZE(CO2) movsd %xmm2, 4 * SIZE(CO2) movhpd %xmm2, 5 * SIZE(CO2) movsd %xmm3, 6 * SIZE(CO2) movhpd %xmm3, 7 * SIZE(CO2) addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L51 ALIGN_4 .L60: testq $2, M je .L70 ALIGN_4 .L61: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #ifdef BUFFERED leaq 16 * SIZE + BUFFER, BO #else movq B, BO #endif #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd -12 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movddup -16 * SIZE(BO), %xmm1 pxor %xmm10, %xmm10 movddup -15 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO negq %rax NOBRANCH je .L66 ALIGN_4 .L62: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm0, %xmm3 movapd -14 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm3, %xmm9 movddup -13 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm10 movddup -12 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm0, %xmm3 movapd -8 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm3, %xmm11 movddup -11 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm8 movddup -10 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm2, %xmm3 movapd -10 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm3, %xmm9 movddup -9 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm10 movddup -8 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm2, %xmm3 movapd -4 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm3, %xmm11 movddup -7 * SIZE(BO, %rax, 2), %xmm3 addq $4 * SIZE, %rax BRANCH jl .L62 ALIGN_4 .L66: movapd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L69 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L67: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm0, %xmm3 movapd -14 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm3, %xmm9 movddup -13 * SIZE(BO, %rax, 2), %xmm3 addq $SIZE, %rax jl .L67 ALIGN_4 .L69: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 movddup %xmm8, %xmm4 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm8 addpd %xmm4, %xmm0 addpd %xmm8, %xmm1 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 movhpd 3 * SIZE(CO2), %xmm1 movddup %xmm9, %xmm4 unpckhpd %xmm9, %xmm9 mulpd %xmm7, %xmm4 
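# Annotation: same scheme for the 2x2 remainder tile; %xmm9 carries the CO2 results,
# broadcast and scaled by {alpha_r, alpha_i} in %xmm7 before being added to C.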
mulpd %xmm7, %xmm9 addpd %xmm4, %xmm0 addpd %xmm9, %xmm1 movsd %xmm0, 0 * SIZE(CO2) movhpd %xmm0, 1 * SIZE(CO2) movsd %xmm1, 2 * SIZE(CO2) movhpd %xmm1, 3 * SIZE(CO2) addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L70: testq $1, M je .L79 ALIGN_4 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #ifdef BUFFERED leaq 16 * SIZE + BUFFER, BO #else movq B, BO #endif #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movddup -15 * SIZE(AO), %xmm1 pxor %xmm9, %xmm9 movddup -14 * SIZE(AO), %xmm2 pxor %xmm10, %xmm10 movddup -13 * SIZE(AO), %xmm3 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO negq %rax NOBRANCH je .L76 ALIGN_4 .L72: mulpd -16 * SIZE(BO, %rax, 2), %xmm0 addpd %xmm0, %xmm8 movddup -12 * SIZE(AO, %rax, 1), %xmm0 mulpd -14 * SIZE(BO, %rax, 2), %xmm1 addpd %xmm1, %xmm9 movddup -11 * SIZE(AO, %rax, 1), %xmm1 mulpd -12 * SIZE(BO, %rax, 2), %xmm2 addpd %xmm2, %xmm10 movddup -10 * SIZE(AO, %rax, 1), %xmm2 mulpd -10 * SIZE(BO, %rax, 2), %xmm3 addpd %xmm3, %xmm11 movddup -9 * SIZE(AO, %rax, 1), %xmm3 addq $4 * SIZE, %rax BRANCH jl .L72 ALIGN_4 .L76: movapd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L78 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L77: mulpd -16 * SIZE(BO, %rax, 2), %xmm0 addpd %xmm0, %xmm8 movddup -15 * SIZE(AO, %rax, 1), %xmm0 addq $SIZE, %rax jl .L77 ALIGN_4 .L78: addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm10, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm8 addpd %xmm8, %xmm0 movsd %xmm0, 0 * SIZE(CO2) movhpd %xmm0, 1 * SIZE(CO2) ALIGN_4 .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif #ifndef BUFFERED movq BO, B #endif leaq (C, LDC, 2), C ALIGN_4 .L80: testq $1, N je .L999 ALIGN_4 .L81: #ifdef BUFFERED leaq 16 * SIZE + BUFFER, BO #endif #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif #ifdef BUFFERED movq K, %rax sarq $3, %rax jle .L83 ALIGN_4 .L82: prefetchnta (RPREFETCHSIZE + 0) * SIZE(B) movaps (B), %xmm0 movaps %xmm0, -16 * SIZE(BO) movaps 2 * SIZE(B), %xmm1 movaps %xmm1, -14 * SIZE(BO) prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) movaps 4 * SIZE(B), %xmm2 movaps %xmm2, -12 * SIZE(BO) movaps 6 * SIZE(B), %xmm3 movaps %xmm3, -10 * SIZE(BO) subq $-8 * SIZE, BO subq $-8 * SIZE, B subq $1, %rax jne .L82 ALIGN_4 .L83: movq K, %rax andq $7, %rax BRANCH jle .L90 ALIGN_4 .L84: movsd (B), %xmm0 movlpd %xmm0, -16 * SIZE(BO) addq $1 * SIZE, B addq $1 * SIZE, BO decq %rax jne .L84 ALIGN_4 .L90: #endif movq C, CO1 # coffset1 = c movq A, AO # aoffset = a movq M, I sarq $2, I # i = (m >> 2) jle .L100 ALIGN_4 .L91: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) 
&& !defined(TRANSA)) #ifdef BUFFERED leaq 16 * SIZE + BUFFER, BO #else movq B, BO #endif #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif movapd -8 * SIZE(AO), %xmm2 pxor %xmm8, %xmm8 movapd -16 * SIZE(AO), %xmm0 pxor %xmm9, %xmm9 movddup -16 * SIZE(BO), %xmm1 pxor %xmm12, %xmm12 movddup -14 * SIZE(BO), %xmm3 pxor %xmm13, %xmm13 movddup -15 * SIZE(BO), %xmm5 prefetchw 3 * SIZE(CO1) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO negq %rax NOBRANCH je .L96 ALIGN_4 .L92: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm12 movddup -12 * SIZE(BO, %rax, 1), %xmm1 mulpd %xmm5, %xmm0 mulpd -10 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm0, %xmm9 movapd (AO, %rax, 4), %xmm0 addpd %xmm5, %xmm13 movddup -13 * SIZE(BO, %rax, 1), %xmm5 mulpd %xmm3, %xmm2 mulpd -6 * SIZE(AO, %rax, 4), %xmm3 addpd %xmm2, %xmm8 movapd -4 * SIZE(AO, %rax, 4), %xmm2 addpd %xmm3, %xmm12 movddup -10 * SIZE(BO, %rax, 1), %xmm3 mulpd %xmm5, %xmm2 mulpd -2 * SIZE(AO, %rax, 4), %xmm5 addpd %xmm2, %xmm9 movapd 8 * SIZE(AO, %rax, 4), %xmm2 addpd %xmm5, %xmm13 movddup -11 * SIZE(BO, %rax, 1), %xmm5 addq $4 * SIZE, %rax BRANCH jl .L92 ALIGN_4 .L96: movapd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L99 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L97: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 addpd %xmm1, %xmm12 movddup -15 * SIZE(BO, %rax, 1), %xmm1 addq $SIZE, %rax jl .L97 ALIGN_4 .L99: addpd %xmm9, %xmm8 addpd %xmm13, %xmm12 movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 movsd 4 * SIZE(CO1), %xmm2 movhpd 5 * SIZE(CO1), %xmm2 movsd 6 * SIZE(CO1), %xmm3 movhpd 7 * SIZE(CO1), %xmm3 movddup %xmm8, %xmm4 unpckhpd %xmm8, %xmm8 movddup %xmm12, %xmm5 unpckhpd %xmm12, %xmm12 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm5 mulpd %xmm7, %xmm12 addpd %xmm4, %xmm0 addpd %xmm8, %xmm1 addpd %xmm5, %xmm2 addpd %xmm12, %xmm3 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 4 * SIZE(CO1) movhpd %xmm2, 5 * SIZE(CO1) movsd %xmm3, 6 * SIZE(CO1) movhpd %xmm3, 7 * SIZE(CO1) addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L91 ALIGN_4 .L100: testq $2, M je .L110 ALIGN_4 .L101: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #ifdef BUFFERED leaq 16 * SIZE + BUFFER, BO #else movq B, BO #endif #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif movddup -16 * SIZE(BO), %xmm0 pxor %xmm8, %xmm8 movddup -15 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 movddup -14 * SIZE(BO), %xmm2 pxor %xmm10, %xmm10 movddup -13 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq 
$2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO negq %rax NOBRANCH je .L106 ALIGN_4 .L102: mulpd -16 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm0, %xmm8 movddup -12 * SIZE(BO, %rax, 1), %xmm0 mulpd -14 * SIZE(AO, %rax, 2), %xmm1 addpd %xmm1, %xmm9 movddup -11 * SIZE(BO, %rax, 1), %xmm1 mulpd -12 * SIZE(AO, %rax, 2), %xmm2 addpd %xmm2, %xmm10 movddup -10 * SIZE(BO, %rax, 1), %xmm2 mulpd -10 * SIZE(AO, %rax, 2), %xmm3 addpd %xmm3, %xmm11 movddup -9 * SIZE(BO, %rax, 1), %xmm3 addq $4 * SIZE, %rax BRANCH jl .L102 ALIGN_4 .L106: movapd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L109 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L107: movddup -16 * SIZE(BO, %rax, 1), %xmm0 mulpd -16 * SIZE(AO, %rax, 2), %xmm0 addpd %xmm0, %xmm8 addq $SIZE, %rax jl .L107 ALIGN_4 .L109: addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm10, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 movddup %xmm8, %xmm4 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm8 addpd %xmm4, %xmm0 addpd %xmm8, %xmm1 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) addq $4 * SIZE, CO1 ALIGN_4 .L110: testq $1, M je .L999 ALIGN_4 .L111: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) #ifdef BUFFERED leaq 16 * SIZE + BUFFER, BO #else movq B, BO #endif #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd -14 * SIZE(AO), %xmm1 pxor %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO negq %rax NOBRANCH je .L116 ALIGN_4 .L112: mulpd -16 * SIZE(BO, %rax, 1), %xmm0 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 1), %xmm0 mulpd -14 * SIZE(BO, %rax, 1), %xmm1 addpd %xmm1, %xmm9 movapd -10 * SIZE(AO, %rax, 1), %xmm1 addq $4 * SIZE, %rax BRANCH jl .L112 ALIGN_4 .L116: movapd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L118 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO negq %rax ALIGN_4 .L117: mulsd -16 * SIZE(BO, %rax, 1), %xmm0 addsd %xmm0, %xmm8 movsd -15 * SIZE(AO, %rax, 1), %xmm0 addq $SIZE, %rax jl .L117 ALIGN_4 .L118: addpd %xmm9, %xmm8 haddpd %xmm8, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) ALIGN_3 .L999: movq %rbx, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE 
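The two files that follow, zgemm3m_kernel_4x4_core2.S and zgemm3m_kernel_4x4_penryn.S, are micro-kernels used by OpenBLAS's GEMM3M path, which assembles a complex matrix product from three real matrix products instead of the textbook four. As orientation only, the plain-C sketch below shows the 3M identity on split real/imaginary planes; the function and variable names (rgemm, zgemm3m_sketch, t1/t2/t3) are made up for illustration, and the sketch deliberately omits the packing, blocking, complex-alpha scaling and interleaved-C stores that the assembly kernels in this directory actually perform.

    #include <stddef.h>
    #include <stdlib.h>

    /* Naive real GEMM, row-major: T = A*B.  Illustration only. */
    static void rgemm(size_t m, size_t n, size_t k,
                      const double *a, const double *b, double *t)
    {
        for (size_t i = 0; i < m; i++)
            for (size_t j = 0; j < n; j++) {
                double s = 0.0;
                for (size_t p = 0; p < k; p++)
                    s += a[i * k + p] * b[p * n + j];
                t[i * n + j] = s;
            }
    }

    /* 3M complex multiply-accumulate, C += A*B, with A, B and C given as
     * separate real/imaginary planes.  Three real products (T1, T2, T3)
     * replace the four of the naive complex formula. */
    static void zgemm3m_sketch(size_t m, size_t n, size_t k,
                               const double *ar, const double *ai,
                               const double *br, const double *bi,
                               double *cr, double *ci)
    {
        double *t1 = malloc(m * n * sizeof(double));
        double *t2 = malloc(m * n * sizeof(double));
        double *t3 = malloc(m * n * sizeof(double));
        double *as = malloc(m * k * sizeof(double));
        double *bs = malloc(k * n * sizeof(double));

        for (size_t i = 0; i < m * k; i++) as[i] = ar[i] + ai[i];
        for (size_t i = 0; i < k * n; i++) bs[i] = br[i] + bi[i];

        rgemm(m, n, k, ar, br, t1);            /* T1 = Ar*Br           */
        rgemm(m, n, k, ai, bi, t2);            /* T2 = Ai*Bi           */
        rgemm(m, n, k, as, bs, t3);            /* T3 = (Ar+Ai)*(Br+Bi) */

        for (size_t i = 0; i < m * n; i++) {
            cr[i] += t1[i] - t2[i];            /* Re(C) += T1 - T2      */
            ci[i] += t3[i] - t1[i] - t2[i];    /* Im(C) += T3 - T1 - T2 */
        }

        free(t1); free(t2); free(t3); free(as); free(bs);
    }

In the assembly kernels the real inner product is what the main loops compute; the write-back code then scales the accumulated block by a complex alpha (visible as the ALPHA / ALPHA_R / ALPHA_I loads before the movsd/movhpd stores) and adds it into the interleaved complex C, so the level-3 driver can sum the three real passes directly into the output.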
OpenBLAS-0.2.20/kernel/x86_64/zgemm3m_kernel_4x4_core2.S000066400000000000000000001265071313527062700222310ustar00rootroot00000000000000/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define ALPHA 0(%rsp) #define J 16(%rsp) #define OFFSET 24(%rsp) #define KK 32(%rsp) #define KKK 40(%rsp) #define BUFFER 128(%rsp) #define PREFETCH_R (8 * 4 + 0) #define PREFETCH_W (PREFETCH_R * 2) #define PREFETCHSIZE (8 * 13 + 5) #define PREFETCH prefetcht0 #if defined(OS_LINUX) && defined(CORE_BARCELONA) .align 32768 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif movaps %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif #endif movq %rsp, %r15 # save old stack subq $256 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movsd %xmm0, 0 + ALPHA movsd %xmm1, 8 + ALPHA subq $-16 * SIZE, A subq $-16 * SIZE, B movq OLD_M, M movq OLD_N, N salq $ZBASE_SHIFT, LDC #ifdef TRMMKERNEL movsd %xmm12, OFFSET movsd %xmm12, KK #ifndef LEFT negq KK #endif #endif movq N, J sarq $2, J NOBRANCH jle .L40 ALIGN_4 .L01: /* Copying to Sub Buffer */ leaq 16 * SIZE + BUFFER, BO #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq K, %rax sarq $2, %rax NOBRANCH jle .L05 ALIGN_4 .L02: movapd -16 * SIZE(B), %xmm0 prefetchnta (PREFETCH_R + 0) * SIZE(B) movapd -14 * SIZE(B), %xmm1 movapd -12 * SIZE(B), %xmm2 movapd -10 * SIZE(B), %xmm3 movapd -8 * SIZE(B), %xmm4 movapd -6 * SIZE(B), %xmm5 movapd -4 * SIZE(B), %xmm6 movapd -2 * SIZE(B), %xmm7 movddup %xmm0, %xmm8 unpckhpd %xmm0, %xmm0 prefetchnta (PREFETCH_R + 8) * SIZE(B) movddup %xmm1, %xmm9 unpckhpd %xmm1, %xmm1 movddup %xmm2, %xmm10 unpckhpd %xmm2, %xmm2 movddup %xmm3, %xmm11 unpckhpd %xmm3, %xmm3 prefetcht0 (PREFETCH_W + 0) * SIZE(BO) movddup %xmm4, %xmm12 unpckhpd %xmm4, %xmm4 movddup %xmm5, %xmm13 unpckhpd %xmm5, %xmm5 movddup %xmm6, %xmm14 unpckhpd %xmm6, %xmm6 movddup %xmm7, %xmm15 unpckhpd %xmm7, %xmm7 prefetcht0 (PREFETCH_W + 8) * SIZE(BO) movapd %xmm8, -16 * SIZE(BO) movapd %xmm0, -14 * SIZE(BO) movapd %xmm9, -12 * SIZE(BO) movapd %xmm1, -10 * SIZE(BO) movapd %xmm10, -8 * SIZE(BO) movapd %xmm2, -6 * SIZE(BO) movapd %xmm11, -4 * SIZE(BO) movapd %xmm3, -2 * SIZE(BO) prefetcht0 (PREFETCH_W + 16) * SIZE(BO) movapd %xmm12, 0 * SIZE(BO) movapd %xmm4, 2 * SIZE(BO) movapd %xmm13, 4 * SIZE(BO) movapd %xmm5, 6 * SIZE(BO) prefetcht0 (PREFETCH_W + 24) * SIZE(BO) movapd %xmm14, 8 
* SIZE(BO) movapd %xmm6, 10 * SIZE(BO) movapd %xmm15, 12 * SIZE(BO) movapd %xmm7, 14 * SIZE(BO) subq $-16 * SIZE, B subq $-32 * SIZE, BO decq %rax BRANCH jne .L02 ALIGN_4 .L05: movq K, %rax andq $3, %rax BRANCH BRANCH jle .L10 ALIGN_4 .L06: movapd -16 * SIZE(B), %xmm0 movapd -14 * SIZE(B), %xmm1 movddup %xmm0, %xmm8 unpckhpd %xmm0, %xmm0 movddup %xmm1, %xmm9 unpckhpd %xmm1, %xmm1 movapd %xmm8, -16 * SIZE(BO) movapd %xmm0, -14 * SIZE(BO) movapd %xmm9, -12 * SIZE(BO) movapd %xmm1, -10 * SIZE(BO) addq $4 * SIZE, B addq $8 * SIZE, BO decq %rax BRANCH jne .L06 ALIGN_4 .L10: leaq (PREFETCH_R + 0) * SIZE(B), BB movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a movq M, I sarq $2, I # i = (m >> 2) NOBRANCH jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 20 * SIZE + BUFFER, BO #else leaq 20 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 movaps -20 * SIZE(BO), %xmm6 movaps -18 * SIZE(BO), %xmm7 prefetcht2 0 * SIZE(BB) pxor %xmm2, %xmm2 prefetcht0 7 * SIZE(CO1) pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 prefetcht0 7 * SIZE(CO2) pxor %xmm5, %xmm5 movapd %xmm2, %xmm8 movapd %xmm2, %xmm9 movapd %xmm2, %xmm10 prefetcht0 7 * SIZE(CO1, LDC, 2) movapd %xmm2, %xmm11 movapd %xmm2, %xmm12 movapd %xmm2, %xmm13 prefetcht0 7 * SIZE(CO2, LDC, 2) movapd %xmm2, %xmm14 movapd %xmm2, %xmm15 subq $-16 * SIZE, BB #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_4 .L12: PADDING; addpd %xmm2, %xmm10 movaps -16 * SIZE(BO), %xmm2 PADDING; addpd %xmm3, %xmm14 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps %xmm6, %xmm3 mulpd %xmm0, %xmm6 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 movaps -14 * SIZE(BO), %xmm4 addpd %xmm5, %xmm15 movaps %xmm7, %xmm5 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm5 addpd %xmm6, %xmm8 movaps -12 * SIZE(BO), %xmm6 addpd %xmm3, %xmm12 movaps %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm7, %xmm9 movaps -10 * SIZE(BO), %xmm7 addpd %xmm5, %xmm13 movaps %xmm4, %xmm5 mulpd %xmm0, %xmm4 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm10 movaps -8 * SIZE(BO), %xmm2 addpd %xmm3, %xmm14 movaps -10 * SIZE(AO), %xmm1 movaps %xmm6, %xmm3 mulpd %xmm0, %xmm6 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 movaps -6 * SIZE(BO), %xmm4 addpd %xmm5, %xmm15 movaps %xmm7, %xmm5 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm5 addpd %xmm6, %xmm8 movaps -4 * SIZE(BO), %xmm6 addpd %xmm3, %xmm12 movaps %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm7, %xmm9 movaps -2 * SIZE(BO), %xmm7 addpd %xmm5, %xmm13 movaps %xmm4, %xmm5 mulpd %xmm0, %xmm4 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm10 movaps 0 * SIZE(BO), %xmm2 addpd %xmm3, %xmm14 movaps -6 * SIZE(AO), %xmm1 movaps %xmm6, %xmm3 mulpd %xmm0, %xmm6 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 movaps 2 * SIZE(BO), %xmm4 addpd %xmm5, %xmm15 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) movaps %xmm7, %xmm5 mulpd %xmm1, %xmm5 mulpd %xmm0, %xmm7 addpd %xmm6, %xmm8 movaps 4 * SIZE(BO), %xmm6 addpd %xmm3, %xmm12 movaps %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm7, %xmm9 movaps 6 * SIZE(BO), %xmm7 addpd %xmm5, %xmm13 
movaps %xmm4, %xmm5 mulpd %xmm0, %xmm4 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 addpd %xmm2, %xmm10 movaps 8 * SIZE(BO), %xmm2 addpd %xmm3, %xmm14 movaps -2 * SIZE(AO), %xmm1 movaps %xmm6, %xmm3 mulpd %xmm0, %xmm6 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 movaps 10 * SIZE(BO), %xmm4 addpd %xmm5, %xmm15 movaps %xmm7, %xmm5 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm5 addpd %xmm6, %xmm8 movaps 12 * SIZE(BO), %xmm6 addpd %xmm3, %xmm12 movaps %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 subq $-16 * SIZE, AO addpd %xmm7, %xmm9 movaps 14 * SIZE(BO), %xmm7 addpd %xmm5, %xmm13 subq $-32 * SIZE, BO movaps %xmm4, %xmm5 mulpd %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 movaps -14 * SIZE(AO), %xmm1 subq $1, %rax BRANCH jg .L12 ALIGN_4 .L15: prefetcht2 -8 * SIZE(BB) #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: addpd %xmm2, %xmm10 movaps -16 * SIZE(BO), %xmm2 addpd %xmm3, %xmm14 movaps %xmm6, %xmm3 mulpd %xmm0, %xmm6 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm11 movaps -14 * SIZE(BO), %xmm4 addpd %xmm5, %xmm15 movaps %xmm7, %xmm5 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm5 addpd %xmm6, %xmm8 movaps -12 * SIZE(BO), %xmm6 addpd %xmm3, %xmm12 addq $4 * SIZE, AO movaps %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm7, %xmm9 movaps -10 * SIZE(BO), %xmm7 addpd %xmm5, %xmm13 addq $8 * SIZE, BO movaps %xmm4, %xmm5 mulpd %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 movaps -14 * SIZE(AO), %xmm1 subq $1, %rax BRANCH jg .L16 ALIGN_4 .L18: movapd ALPHA, %xmm7 addpd %xmm2, %xmm10 addpd %xmm3, %xmm14 addpd %xmm4, %xmm11 addpd %xmm5, %xmm15 movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 movsd 4 * SIZE(CO1), %xmm2 movhpd 5 * SIZE(CO1), %xmm2 movsd 6 * SIZE(CO1), %xmm3 movhpd 7 * SIZE(CO1), %xmm3 movddup %xmm8, %xmm4 unpckhpd %xmm8, %xmm8 movddup %xmm12, %xmm5 unpckhpd %xmm12, %xmm12 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm5 mulpd %xmm7, %xmm12 addpd %xmm4, %xmm0 addpd %xmm8, %xmm1 addpd %xmm5, %xmm2 addpd %xmm12, %xmm3 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 4 * SIZE(CO1) movhpd %xmm2, 5 * SIZE(CO1) movsd %xmm3, 6 * SIZE(CO1) movhpd %xmm3, 7 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 movhpd 3 * SIZE(CO2), %xmm1 movsd 4 * SIZE(CO2), %xmm2 movhpd 5 * SIZE(CO2), %xmm2 movsd 6 * SIZE(CO2), %xmm3 movhpd 7 * SIZE(CO2), %xmm3 movddup %xmm9, %xmm4 unpckhpd %xmm9, %xmm9 movddup %xmm13, %xmm5 unpckhpd %xmm13, %xmm13 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm9 mulpd %xmm7, %xmm5 mulpd %xmm7, %xmm13 addpd %xmm4, %xmm0 addpd %xmm9, %xmm1 addpd %xmm5, %xmm2 addpd %xmm13, %xmm3 movsd %xmm0, 0 * SIZE(CO2) movhpd %xmm0, 1 * SIZE(CO2) movsd %xmm1, 2 * SIZE(CO2) movhpd %xmm1, 3 * SIZE(CO2) movsd %xmm2, 4 * SIZE(CO2) movhpd %xmm2, 5 * SIZE(CO2) movsd %xmm3, 6 * SIZE(CO2) movhpd %xmm3, 7 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 movsd 2 * SIZE(CO1, LDC, 2), %xmm1 movhpd 3 * SIZE(CO1, LDC, 2), %xmm1 movsd 4 * SIZE(CO1, LDC, 2), %xmm2 movhpd 5 * SIZE(CO1, LDC, 2), %xmm2 movsd 6 * SIZE(CO1, LDC, 2), %xmm3 movhpd 7 * SIZE(CO1, LDC, 2), %xmm3 movddup %xmm10, %xmm4 unpckhpd %xmm10, %xmm10 movddup %xmm14, %xmm5 unpckhpd %xmm14, %xmm14 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm10 mulpd %xmm7, %xmm5 mulpd %xmm7, %xmm14 addpd %xmm4, %xmm0 addpd %xmm10, %xmm1 addpd %xmm5, %xmm2 addpd %xmm14, %xmm3 movsd %xmm0, 
0 * SIZE(CO1, LDC, 2) movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) movsd %xmm1, 2 * SIZE(CO1, LDC, 2) movhpd %xmm1, 3 * SIZE(CO1, LDC, 2) movsd %xmm2, 4 * SIZE(CO1, LDC, 2) movhpd %xmm2, 5 * SIZE(CO1, LDC, 2) movsd %xmm3, 6 * SIZE(CO1, LDC, 2) movhpd %xmm3, 7 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 movsd 2 * SIZE(CO2, LDC, 2), %xmm1 movhpd 3 * SIZE(CO2, LDC, 2), %xmm1 movsd 4 * SIZE(CO2, LDC, 2), %xmm2 movhpd 5 * SIZE(CO2, LDC, 2), %xmm2 movsd 6 * SIZE(CO2, LDC, 2), %xmm3 movhpd 7 * SIZE(CO2, LDC, 2), %xmm3 movddup %xmm11, %xmm4 unpckhpd %xmm11, %xmm11 movddup %xmm15, %xmm5 unpckhpd %xmm15, %xmm15 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm11 mulpd %xmm7, %xmm5 mulpd %xmm7, %xmm15 addpd %xmm4, %xmm0 addpd %xmm11, %xmm1 addpd %xmm5, %xmm2 addpd %xmm15, %xmm3 movsd %xmm0, 0 * SIZE(CO2, LDC, 2) movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) movsd %xmm1, 2 * SIZE(CO2, LDC, 2) movhpd %xmm1, 3 * SIZE(CO2, LDC, 2) movsd %xmm2, 4 * SIZE(CO2, LDC, 2) movhpd %xmm2, 5 * SIZE(CO2, LDC, 2) movsd %xmm3, 6 * SIZE(CO2, LDC, 2) movhpd %xmm3, 7 * SIZE(CO2, LDC, 2) addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- BRANCH jg .L11 ALIGN_4 .L20: testq $2, M BRANCH jle .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif pxor %xmm8, %xmm8 movapd -16 * SIZE(AO), %xmm0 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 movapd %xmm8, %xmm2 movapd %xmm9, %xmm3 movapd %xmm10, %xmm4 movapd %xmm11, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_4 .L21: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm2, %xmm8 movapd -16 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm9 movapd -14 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm4, %xmm10 movapd -12 * SIZE(BO), %xmm4 mulpd %xmm0, %xmm4 addpd %xmm5, %xmm11 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm5 movapd -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movapd -8 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm9 movapd -6 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm4, %xmm10 movapd -4 * SIZE(BO), %xmm4 mulpd %xmm0, %xmm4 addpd %xmm5, %xmm11 movapd -2 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm5 movapd -12 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movapd 0 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm9 movapd 2 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm4, %xmm10 movapd 4 * SIZE(BO), %xmm4 mulpd %xmm0, %xmm4 addpd %xmm5, %xmm11 movapd 6 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm5 movapd -10 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movapd 8 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm9 movapd 10 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm4, %xmm10 movapd 12 * SIZE(BO), %xmm4 mulpd %xmm0, %xmm4 addpd %xmm5, %xmm11 movapd 14 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm5 movapd -8 * SIZE(AO), %xmm0 subq $ -8 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax BRANCH jg .L21 ALIGN_4 .L25: movapd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: addpd %xmm2, %xmm8 movapd -16 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm9 movapd -14 
* SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm4, %xmm10 movapd -12 * SIZE(BO), %xmm4 mulpd %xmm0, %xmm4 addpd %xmm5, %xmm11 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm5 movapd -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_4 .L28: addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 movddup %xmm8, %xmm4 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm8 addpd %xmm4, %xmm0 addpd %xmm8, %xmm1 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 movhpd 3 * SIZE(CO2), %xmm1 movddup %xmm9, %xmm4 unpckhpd %xmm9, %xmm9 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm9 addpd %xmm4, %xmm0 addpd %xmm9, %xmm1 movsd %xmm0, 0 * SIZE(CO2) movhpd %xmm0, 1 * SIZE(CO2) movsd %xmm1, 2 * SIZE(CO2) movhpd %xmm1, 3 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 movsd 2 * SIZE(CO1, LDC, 2), %xmm1 movhpd 3 * SIZE(CO1, LDC, 2), %xmm1 movddup %xmm10, %xmm4 unpckhpd %xmm10, %xmm10 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm10 addpd %xmm4, %xmm0 addpd %xmm10, %xmm1 movsd %xmm0, 0 * SIZE(CO1, LDC, 2) movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) movsd %xmm1, 2 * SIZE(CO1, LDC, 2) movhpd %xmm1, 3 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 movsd 2 * SIZE(CO2, LDC, 2), %xmm1 movhpd 3 * SIZE(CO2, LDC, 2), %xmm1 movddup %xmm11, %xmm4 unpckhpd %xmm11, %xmm11 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm11 addpd %xmm4, %xmm0 addpd %xmm11, %xmm1 movsd %xmm0, 0 * SIZE(CO2, LDC, 2) movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) movsd %xmm1, 2 * SIZE(CO2, LDC, 2) movhpd %xmm1, 3 * SIZE(CO2, LDC, 2) addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L30: testq $1, M BRANCH jle .L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif pxor %xmm8, %xmm8 movsd -16 * SIZE(AO), %xmm0 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 movapd %xmm8, %xmm2 movapd %xmm9, %xmm3 movapd %xmm10, %xmm4 movapd %xmm11, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_4 .L31: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm2, %xmm8 movsd -16 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm3, %xmm9 movsd -14 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm4, %xmm10 movsd -12 * SIZE(BO), %xmm4 mulsd %xmm0, %xmm4 addsd %xmm5, %xmm11 movsd -10 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 movsd -15 * SIZE(AO), %xmm0 addsd %xmm2, %xmm8 movsd -8 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm3, %xmm9 movsd -6 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm4, %xmm10 movsd -4 * SIZE(BO), %xmm4 mulsd %xmm0, %xmm4 addsd %xmm5, %xmm11 movsd -2 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 movsd -14 * SIZE(AO), %xmm0 addsd %xmm2, %xmm8 movsd 0 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm3, %xmm9 movsd 2 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm4, %xmm10 movsd 4 * SIZE(BO), %xmm4 
mulsd %xmm0, %xmm4 addsd %xmm5, %xmm11 movsd 6 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 movsd -13 * SIZE(AO), %xmm0 addsd %xmm2, %xmm8 movsd 8 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm3, %xmm9 movsd 10 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm4, %xmm10 movsd 12 * SIZE(BO), %xmm4 mulsd %xmm0, %xmm4 addsd %xmm5, %xmm11 movsd 14 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 movsd -12 * SIZE(AO), %xmm0 subq $ -4 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax BRANCH jg .L31 ALIGN_4 .L35: movapd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: addsd %xmm2, %xmm8 movsd -16 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm3, %xmm9 movsd -14 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm4, %xmm10 movsd -12 * SIZE(BO), %xmm4 mulsd %xmm0, %xmm4 addsd %xmm5, %xmm11 movsd -10 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 movsd -15 * SIZE(AO), %xmm0 addq $1 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_4 .L38: addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 movddup %xmm9, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movsd %xmm0, 0 * SIZE(CO2) movhpd %xmm0, 1 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 movddup %xmm10, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movsd %xmm0, 0 * SIZE(CO1, LDC, 2) movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 movddup %xmm11, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movsd %xmm0, 0 * SIZE(CO2, LDC, 2) movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leaq (C, LDC, 4), C subq $1, J BRANCH jg .L01 ALIGN_4 .L40: testq $2, N BRANCH jle .L80 ALIGN_4 .L41: /* Copying to Sub Buffer */ leaq BUFFER, BO #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq K, %rax sarq $3, %rax jle .L43 addq %rax, %rax ALIGN_4 .L42: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 movddup -14 * SIZE(B), %xmm10 movddup -13 * SIZE(B), %xmm11 movddup -12 * SIZE(B), %xmm12 movddup -11 * SIZE(B), %xmm13 movddup -10 * SIZE(B), %xmm14 movddup -9 * SIZE(B), %xmm15 movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) movapd %xmm10, 4 * SIZE(BO) movapd %xmm11, 6 * SIZE(BO) movapd %xmm12, 8 * SIZE(BO) movapd %xmm13, 10 * SIZE(BO) movapd %xmm14, 12 * SIZE(BO) movapd %xmm15, 14 * SIZE(BO) addq $8 * SIZE, B addq $16 * SIZE, BO subq $1, %rax jne .L42 ALIGN_4 .L43: movq K, %rax andq $7, %rax BRANCH jle .L45 ALIGN_4 .L44: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) addq $2 * SIZE, B addq $4 * SIZE, BO subq $1, %rax jne .L44 ALIGN_4 .L45: movq C, CO1 leaq (C, LDC, 1), CO2 movq A, AO # aoffset = a movq M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_4 .L50: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 prefetcht0 3 * SIZE(CO1) pxor %xmm12, %xmm12 prefetcht0 3 * SIZE(CO2) pxor %xmm13, %xmm13 movapd -16 * SIZE(AO), %xmm0 movapd 
-14 * SIZE(AO), %xmm1 movapd %xmm8, %xmm2 movapd %xmm8, %xmm3 movapd %xmm8, %xmm4 movapd %xmm8, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L55 ALIGN_4 .L51: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm2, %xmm8 movapd -16 * SIZE(BO), %xmm2 addpd %xmm3, %xmm12 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 movapd -14 * SIZE(BO), %xmm4 addpd %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 movapd -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 movapd -10 * SIZE(AO), %xmm1 addpd %xmm2, %xmm8 movapd -12 * SIZE(BO), %xmm2 addpd %xmm3, %xmm12 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 movapd -10 * SIZE(BO), %xmm4 addpd %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 movapd -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 movapd -6 * SIZE(AO), %xmm1 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) addpd %xmm2, %xmm8 movapd -8 * SIZE(BO), %xmm2 addpd %xmm3, %xmm12 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 movapd -6 * SIZE(BO), %xmm4 addpd %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 movapd -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 movapd -2 * SIZE(AO), %xmm1 addpd %xmm2, %xmm8 movapd -4 * SIZE(BO), %xmm2 addpd %xmm3, %xmm12 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 movapd -2 * SIZE(BO), %xmm4 addpd %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 movapd 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 movapd 2 * SIZE(AO), %xmm1 subq $-16 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jg .L51 ALIGN_4 .L55: movapd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L58 ALIGN_4 .L56: addpd %xmm2, %xmm8 movapd -16 * SIZE(BO), %xmm2 addpd %xmm3, %xmm12 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 addpd %xmm4, %xmm9 movapd -14 * SIZE(BO), %xmm4 addpd %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 movapd -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 movapd -10 * SIZE(AO), %xmm1 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L56 ALIGN_4 .L58: addpd %xmm2, %xmm8 addpd %xmm3, %xmm12 addpd %xmm4, %xmm9 addpd %xmm5, %xmm13 movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 movsd 4 * SIZE(CO1), %xmm2 movhpd 5 * SIZE(CO1), %xmm2 movsd 6 * SIZE(CO1), %xmm3 movhpd 7 * SIZE(CO1), %xmm3 movddup %xmm8, %xmm4 unpckhpd %xmm8, %xmm8 movddup %xmm12, %xmm5 unpckhpd %xmm12, %xmm12 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm5 mulpd %xmm7, %xmm12 addpd %xmm4, %xmm0 addpd %xmm8, %xmm1 addpd %xmm5, %xmm2 addpd %xmm12, %xmm3 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 4 * SIZE(CO1) movhpd %xmm2, 5 * SIZE(CO1) movsd %xmm3, 6 * SIZE(CO1) movhpd %xmm3, 7 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 movhpd 3 * SIZE(CO2), %xmm1 movsd 4 * SIZE(CO2), %xmm2 movhpd 5 * SIZE(CO2), %xmm2 movsd 6 * SIZE(CO2), %xmm3 movhpd 7 * SIZE(CO2), %xmm3 movddup %xmm9, %xmm4 unpckhpd %xmm9, %xmm9 movddup %xmm13, %xmm5 unpckhpd %xmm13, %xmm13 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm9 mulpd %xmm7, %xmm5 mulpd %xmm7, %xmm13 addpd %xmm4, %xmm0 addpd %xmm9, %xmm1 addpd %xmm5, %xmm2 addpd %xmm13, %xmm3 movsd %xmm0, 0 * SIZE(CO2) movhpd %xmm0, 1 * 
SIZE(CO2) movsd %xmm1, 2 * SIZE(CO2) movhpd %xmm1, 3 * SIZE(CO2) movsd %xmm2, 4 * SIZE(CO2) movhpd %xmm2, 5 * SIZE(CO2) movsd %xmm3, 6 * SIZE(CO2) movhpd %xmm3, 7 * SIZE(CO2) addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 subq $1, I jg .L50 ALIGN_4 .L60: testq $2, M jle .L70 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif pxor %xmm8, %xmm8 movapd -16 * SIZE(AO), %xmm0 pxor %xmm9, %xmm9 movapd -14 * SIZE(AO), %xmm1 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 movapd %xmm8, %xmm2 movapd %xmm8, %xmm3 movapd %xmm8, %xmm4 movapd %xmm8, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L65 ALIGN_4 .L61: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm2, %xmm8 movapd -16 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm9 movapd -14 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 movapd -12 * SIZE(AO), %xmm0 addpd %xmm4, %xmm10 movapd -12 * SIZE(BO), %xmm4 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm11 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm1, %xmm5 movapd -10 * SIZE(AO), %xmm1 addpd %xmm2, %xmm8 movapd -8 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm9 movapd -6 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 movapd -8 * SIZE(AO), %xmm0 addpd %xmm4, %xmm10 movapd -4 * SIZE(BO), %xmm4 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm11 movapd -2 * SIZE(BO), %xmm5 mulpd %xmm1, %xmm5 movapd -6 * SIZE(AO), %xmm1 subq $ -8 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jg .L61 ALIGN_4 .L65: movapd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L68 ALIGN_4 .L66: addpd %xmm2, %xmm8 movapd -16 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm2 addpd %xmm3, %xmm9 movapd -14 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 movapd -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L66 ALIGN_4 .L68: addpd %xmm2, %xmm8 addpd %xmm3, %xmm9 addpd %xmm4, %xmm10 addpd %xmm5, %xmm11 addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 movddup %xmm8, %xmm4 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm8 addpd %xmm4, %xmm0 addpd %xmm8, %xmm1 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 movhpd 3 * SIZE(CO2), %xmm1 movddup %xmm9, %xmm4 unpckhpd %xmm9, %xmm9 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm9 addpd %xmm4, %xmm0 addpd %xmm9, %xmm1 movsd %xmm0, 0 * SIZE(CO2) movhpd %xmm0, 1 * SIZE(CO2) movsd %xmm1, 2 * SIZE(CO2) movhpd %xmm1, 3 * SIZE(CO2) addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L70: testq $1, M jle .L79 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movsd -16 * SIZE(AO), %xmm0 movsd -15 * SIZE(AO), %xmm1 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 
movapd %xmm8, %xmm2 movapd %xmm8, %xmm3 movapd %xmm8, %xmm4 movapd %xmm8, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L75 ALIGN_4 .L71: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm2, %xmm8 movsd -16 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm3, %xmm9 movsd -14 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 movsd -14 * SIZE(AO), %xmm0 addsd %xmm4, %xmm10 movsd -12 * SIZE(BO), %xmm4 mulsd %xmm1, %xmm4 addsd %xmm5, %xmm11 movsd -10 * SIZE(BO), %xmm5 mulsd %xmm1, %xmm5 movsd -13 * SIZE(AO), %xmm1 addsd %xmm2, %xmm8 movsd -8 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm3, %xmm9 movsd -6 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 movsd -12 * SIZE(AO), %xmm0 addsd %xmm4, %xmm10 movsd -4 * SIZE(BO), %xmm4 mulsd %xmm1, %xmm4 addsd %xmm5, %xmm11 movsd -2 * SIZE(BO), %xmm5 mulsd %xmm1, %xmm5 movsd -11 * SIZE(AO), %xmm1 subq $ -4 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jg .L71 ALIGN_4 .L75: movapd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L78 ALIGN_4 .L76: addsd %xmm2, %xmm8 movsd -16 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm3, %xmm9 movsd -14 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 movsd -15 * SIZE(AO), %xmm0 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L76 ALIGN_4 .L78: addsd %xmm2, %xmm8 addsd %xmm3, %xmm9 addsd %xmm4, %xmm10 addsd %xmm5, %xmm11 addsd %xmm10, %xmm8 addsd %xmm11, %xmm9 movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 movddup %xmm9, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movsd %xmm0, 0 * SIZE(CO2) movhpd %xmm0, 1 * SIZE(CO2) ALIGN_4 .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leaq (C, LDC, 2), C ALIGN_4 .L80: testq $1, N BRANCH jle .L999 ALIGN_4 .L81: /* Copying to Sub Buffer */ leaq BUFFER, BO #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq K, %rax sarq $4, %rax jle .L83 addq %rax, %rax ALIGN_4 .L82: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 movddup -14 * SIZE(B), %xmm10 movddup -13 * SIZE(B), %xmm11 movddup -12 * SIZE(B), %xmm12 movddup -11 * SIZE(B), %xmm13 movddup -10 * SIZE(B), %xmm14 movddup -9 * SIZE(B), %xmm15 movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) movapd %xmm10, 4 * SIZE(BO) movapd %xmm11, 6 * SIZE(BO) movapd %xmm12, 8 * SIZE(BO) movapd %xmm13, 10 * SIZE(BO) movapd %xmm14, 12 * SIZE(BO) movapd %xmm15, 14 * SIZE(BO) addq $ 8 * SIZE, B subq $-16 * SIZE, BO subq $1, %rax jne .L82 ALIGN_4 .L83: movq K, %rax andq $15, %rax BRANCH jle .L85 ALIGN_4 .L84: movddup -16 * SIZE(B), %xmm8 movapd %xmm8, 0 * SIZE(BO) addq $1 * SIZE, B addq $2 * SIZE, BO subq $1, %rax jne .L84 ALIGN_4 .L85: movq C, CO1 movq A, AO movq M, I sarq $2, I jle .L100 ALIGN_4 .L90: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 movapd -16 * SIZE(BO), %xmm4 pxor %xmm9, %xmm9 movapd -14 * SIZE(BO), %xmm5 pxor %xmm12, %xmm12 movapd -12 * SIZE(BO), %xmm6 pxor %xmm13, %xmm13 
movapd -10 * SIZE(BO), %xmm7 movapd %xmm8, %xmm0 prefetcht0 3 * SIZE(CO1) movapd %xmm8, %xmm1 movapd %xmm8, %xmm2 movapd %xmm8, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L95 ALIGN_4 .L91: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm0, %xmm8 movapd -16 * SIZE(AO), %xmm0 mulpd %xmm4, %xmm0 addpd %xmm1, %xmm12 movapd -14 * SIZE(AO), %xmm1 mulpd %xmm4, %xmm1 movapd -8 * SIZE(BO), %xmm4 addpd %xmm2, %xmm9 movapd -12 * SIZE(AO), %xmm2 mulpd %xmm5, %xmm2 addpd %xmm3, %xmm13 movapd -10 * SIZE(AO), %xmm3 mulpd %xmm5, %xmm3 movapd -6 * SIZE(BO), %xmm5 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) addpd %xmm0, %xmm8 movapd -8 * SIZE(AO), %xmm0 mulpd %xmm6, %xmm0 addpd %xmm1, %xmm12 movapd -6 * SIZE(AO), %xmm1 mulpd %xmm6, %xmm1 movapd -4 * SIZE(BO), %xmm6 addpd %xmm2, %xmm9 movapd -4 * SIZE(AO), %xmm2 mulpd %xmm7, %xmm2 addpd %xmm3, %xmm13 movapd -2 * SIZE(AO), %xmm3 mulpd %xmm7, %xmm3 movapd -2 * SIZE(BO), %xmm7 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax jg .L91 ALIGN_4 .L95: movapd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L98 ALIGN_4 .L96: addpd %xmm0, %xmm8 movapd -16 * SIZE(AO), %xmm0 mulpd %xmm4, %xmm0 addpd %xmm1, %xmm12 movapd -14 * SIZE(AO), %xmm1 mulpd %xmm4, %xmm1 movapd -14 * SIZE(BO), %xmm4 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax jg .L96 ALIGN_4 .L98: addpd %xmm0, %xmm8 addpd %xmm1, %xmm12 addpd %xmm2, %xmm9 addpd %xmm3, %xmm13 addpd %xmm9, %xmm8 addpd %xmm13, %xmm12 movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 movsd 4 * SIZE(CO1), %xmm2 movhpd 5 * SIZE(CO1), %xmm2 movsd 6 * SIZE(CO1), %xmm3 movhpd 7 * SIZE(CO1), %xmm3 movddup %xmm8, %xmm4 unpckhpd %xmm8, %xmm8 movddup %xmm12, %xmm5 unpckhpd %xmm12, %xmm12 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm5 mulpd %xmm7, %xmm12 addpd %xmm4, %xmm0 addpd %xmm8, %xmm1 addpd %xmm5, %xmm2 addpd %xmm12, %xmm3 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 4 * SIZE(CO1) movhpd %xmm2, 5 * SIZE(CO1) movsd %xmm3, 6 * SIZE(CO1) movhpd %xmm3, 7 * SIZE(CO1) addq $8 * SIZE, CO1 # coffset += 4 subq $1, I jg .L90 ALIGN_4 .L100: testq $2, M jle .L110 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 movapd -16 * SIZE(BO), %xmm4 pxor %xmm9, %xmm9 movapd -14 * SIZE(BO), %xmm5 pxor %xmm10, %xmm10 movapd -12 * SIZE(BO), %xmm6 pxor %xmm11, %xmm11 movapd -10 * SIZE(BO), %xmm7 movapd %xmm8, %xmm0 movapd %xmm8, %xmm1 movapd %xmm8, %xmm2 movapd %xmm8, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L105 ALIGN_4 .L101: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm0, %xmm8 movapd -16 * SIZE(AO), %xmm0 mulpd %xmm4, %xmm0 movapd -8 * SIZE(BO), %xmm4 addpd %xmm1, %xmm9 movapd -14 * SIZE(AO), %xmm1 mulpd %xmm5, %xmm1 
movapd -6 * SIZE(BO), %xmm5 addpd %xmm2, %xmm10 movapd -12 * SIZE(AO), %xmm2 mulpd %xmm6, %xmm2 movapd -4 * SIZE(BO), %xmm6 addpd %xmm3, %xmm11 movapd -10 * SIZE(AO), %xmm3 mulpd %xmm7, %xmm3 movapd -2 * SIZE(BO), %xmm7 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax jg .L101 ALIGN_4 .L105: movapd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L108 ALIGN_4 .L106: addpd %xmm0, %xmm8 movapd -16 * SIZE(AO), %xmm0 mulpd %xmm4, %xmm0 movapd -14 * SIZE(BO), %xmm4 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax jg .L106 ALIGN_4 .L108: addpd %xmm0, %xmm8 addpd %xmm1, %xmm9 addpd %xmm2, %xmm10 addpd %xmm3, %xmm11 addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 addpd %xmm9, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 movddup %xmm8, %xmm4 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm8 addpd %xmm4, %xmm0 addpd %xmm8, %xmm1 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) addq $4 * SIZE, CO1 ALIGN_4 .L110: testq $1, M jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 movsd -16 * SIZE(BO), %xmm4 pxor %xmm9, %xmm9 movsd -14 * SIZE(BO), %xmm5 pxor %xmm10, %xmm10 movsd -12 * SIZE(BO), %xmm6 pxor %xmm11, %xmm11 movsd -10 * SIZE(BO), %xmm7 movapd %xmm8, %xmm0 movapd %xmm8, %xmm1 movapd %xmm8, %xmm2 movapd %xmm8, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L115 ALIGN_4 .L111: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm0, %xmm8 movsd -16 * SIZE(AO), %xmm0 mulpd %xmm4, %xmm0 movsd -8 * SIZE(BO), %xmm4 addpd %xmm1, %xmm9 movsd -15 * SIZE(AO), %xmm1 mulpd %xmm5, %xmm1 movsd -6 * SIZE(BO), %xmm5 addpd %xmm2, %xmm10 movsd -14 * SIZE(AO), %xmm2 mulpd %xmm6, %xmm2 movsd -4 * SIZE(BO), %xmm6 addpd %xmm3, %xmm11 movsd -13 * SIZE(AO), %xmm3 mulpd %xmm7, %xmm3 movsd -2 * SIZE(BO), %xmm7 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax jg .L111 ALIGN_4 .L115: movapd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L118 ALIGN_4 .L116: addsd %xmm0, %xmm8 movsd -16 * SIZE(AO), %xmm0 mulsd %xmm4, %xmm0 movsd -14 * SIZE(BO), %xmm4 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax jg .L116 ALIGN_4 .L118: addsd %xmm0, %xmm8 addsd %xmm1, %xmm9 addsd %xmm2, %xmm10 addsd %xmm3, %xmm11 addsd %xmm10, %xmm8 addsd %xmm11, %xmm9 addsd %xmm9, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) ALIGN_4 .L999: movq %r15, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif 
	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE
OpenBLAS-0.2.20/kernel/x86_64/zgemm3m_kernel_4x4_penryn.S000066400000000000000000001153671313527062700225300ustar00rootroot00000000000000/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define BB %r12 #define PREA %rdx #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define ALPHA_R 48(%rsp) #define ALPHA_I 56(%rsp) #define J 64(%rsp) #define OFFSET 72(%rsp) #define KK 80(%rsp) #define KKK 88(%rsp) #else #define STACKSIZE 512 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define ALPHA_R 224(%rsp) #define ALPHA_I 232(%rsp) #define J 240(%rsp) #define OFFSET 248(%rsp) #define KK 256(%rsp) #define KKK 264(%rsp) #endif #ifdef NANO #define PREFETCHSIZE (8 * 2 + 4) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 #endif #ifndef PREFETCH #define PREFETCH prefetcht0 #endif #ifndef PREFETCHW #define PREFETCHW prefetcht2 #endif #ifndef PREFETCHB #define PREFETCHB prefetcht0 #endif #ifndef PREFETCHSIZE #define PREFETCHSIZE (8 * 17 + 4) #endif #if defined(OS_LINUX) && defined(CORE_BARCELONA) .align 32768 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif movaps %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif #endif movlps %xmm0, ALPHA_R movlps %xmm1, ALPHA_I subq $-16 * SIZE, A subq $-17 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K salq $ZBASE_SHIFT, LDC movq N, J sarq $2, J NOBRANCH jle .L40 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC, 1), CO2 movq A, AO movq K, %rax salq $BASE_SHIFT + 2, %rax leaq (B, %rax), BB movq M, I sarq $2, I # i = (m >> 2) NOBRANCH jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif movaps -16 * SIZE(AO), %xmm0 xorpd %xmm3, %xmm3 movaps -14 * SIZE(AO), %xmm1 xorpd %xmm4, %xmm4 movaps -17 * SIZE(BO), %xmm2 PREFETCHB -16 * SIZE(BB) xorpd %xmm5, %xmm5 xorpd %xmm6, %xmm6 PREFETCHW 3 * SIZE(CO1) movaps %xmm4, %xmm8 movaps %xmm4, %xmm9 PREFETCHW 7 * SIZE(CO2) movaps %xmm4, %xmm10 movaps %xmm4, %xmm11 PREFETCHW 3 * SIZE(CO1, LDC, 2) movaps %xmm4, %xmm12 movaps %xmm4, %xmm13 PREFETCHW 7 * SIZE(CO2, LDC, 2) movapd %xmm4, %xmm14 movapd %xmm4, %xmm15 subq $-12 * SIZE, BB #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK 
#else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm3, %xmm11 movaps -15 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps -13 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps -11 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps -9 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -6 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 movaps -7 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movapd %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movapd %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 PADDING PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) addpd %xmm2, %xmm9 movaps -5 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -2 * SIZE(AO), %xmm1 addpd %xmm3, %xmm11 subq $-16 * SIZE, AO movaps -3 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps -1 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 subq $-16 * SIZE, BO mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -16 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -14 * SIZE(AO), %xmm1 subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: PREFETCHB -8 * SIZE(BB) #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addpd %xmm3, %xmm11 movaps -15 * SIZE(BO), %xmm3 addpd %xmm4, %xmm15 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 addpd %xmm2, %xmm9 movaps -13 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_4 .L18: movups ALPHA_R, %xmm7 addpd %xmm3, %xmm11 addpd %xmm4, %xmm15 addpd %xmm5, %xmm10 addpd %xmm6, %xmm14 movaps %xmm8, %xmm0 movsd %xmm9, %xmm8 movsd %xmm0, %xmm9 movaps %xmm10, %xmm0 movsd %xmm11, %xmm10 movsd %xmm0, %xmm11 movaps 
%xmm12, %xmm0 movsd %xmm13, %xmm12 movsd %xmm0, %xmm13 movaps %xmm14, %xmm0 movsd %xmm15, %xmm14 movsd %xmm0, %xmm15 movsd 0 * SIZE(CO1), %xmm0 movhps 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhps 3 * SIZE(CO1), %xmm1 movsd 4 * SIZE(CO1), %xmm2 movhps 5 * SIZE(CO1), %xmm2 movsd 6 * SIZE(CO1), %xmm3 movhps 7 * SIZE(CO1), %xmm3 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm8 addpd %xmm8, %xmm1 movddup %xmm12, %xmm5 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 unpckhpd %xmm12, %xmm12 mulpd %xmm7, %xmm12 addpd %xmm12, %xmm3 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movlps %xmm1, 2 * SIZE(CO1) movhps %xmm1, 3 * SIZE(CO1) movlps %xmm2, 4 * SIZE(CO1) movhps %xmm2, 5 * SIZE(CO1) movlps %xmm3, 6 * SIZE(CO1) movhps %xmm3, 7 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhps 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 movhps 3 * SIZE(CO2), %xmm1 movsd 4 * SIZE(CO2), %xmm2 movhps 5 * SIZE(CO2), %xmm2 movsd 6 * SIZE(CO2), %xmm3 movhps 7 * SIZE(CO2), %xmm3 movddup %xmm9, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm9, %xmm9 mulpd %xmm7, %xmm9 addpd %xmm9, %xmm1 movddup %xmm13, %xmm5 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 unpckhpd %xmm13, %xmm13 mulpd %xmm7, %xmm13 addpd %xmm13, %xmm3 movlps %xmm0, 0 * SIZE(CO2) movhps %xmm0, 1 * SIZE(CO2) movlps %xmm1, 2 * SIZE(CO2) movhps %xmm1, 3 * SIZE(CO2) movlps %xmm2, 4 * SIZE(CO2) movhps %xmm2, 5 * SIZE(CO2) movlps %xmm3, 6 * SIZE(CO2) movhps %xmm3, 7 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhps 1 * SIZE(CO1, LDC, 2), %xmm0 movsd 2 * SIZE(CO1, LDC, 2), %xmm1 movhps 3 * SIZE(CO1, LDC, 2), %xmm1 movsd 4 * SIZE(CO1, LDC, 2), %xmm2 movhps 5 * SIZE(CO1, LDC, 2), %xmm2 movsd 6 * SIZE(CO1, LDC, 2), %xmm3 movhps 7 * SIZE(CO1, LDC, 2), %xmm3 movddup %xmm10, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm10, %xmm10 mulpd %xmm7, %xmm10 addpd %xmm10, %xmm1 movddup %xmm14, %xmm5 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 unpckhpd %xmm14, %xmm14 mulpd %xmm7, %xmm14 addpd %xmm14, %xmm3 movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movhps %xmm0, 1 * SIZE(CO1, LDC, 2) movlps %xmm1, 2 * SIZE(CO1, LDC, 2) movhps %xmm1, 3 * SIZE(CO1, LDC, 2) movlps %xmm2, 4 * SIZE(CO1, LDC, 2) movhps %xmm2, 5 * SIZE(CO1, LDC, 2) movlps %xmm3, 6 * SIZE(CO1, LDC, 2) movhps %xmm3, 7 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhps 1 * SIZE(CO2, LDC, 2), %xmm0 movsd 2 * SIZE(CO2, LDC, 2), %xmm1 movhps 3 * SIZE(CO2, LDC, 2), %xmm1 movsd 4 * SIZE(CO2, LDC, 2), %xmm2 movhps 5 * SIZE(CO2, LDC, 2), %xmm2 movsd 6 * SIZE(CO2, LDC, 2), %xmm3 movhps 7 * SIZE(CO2, LDC, 2), %xmm3 movddup %xmm11, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm11, %xmm11 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm1 movddup %xmm15, %xmm5 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 unpckhpd %xmm15, %xmm15 mulpd %xmm7, %xmm15 addpd %xmm15, %xmm3 movlps %xmm0, 0 * SIZE(CO2, LDC, 2) movhps %xmm0, 1 * SIZE(CO2, LDC, 2) movlps %xmm1, 2 * SIZE(CO2, LDC, 2) movhps %xmm1, 3 * SIZE(CO2, LDC, 2) movlps %xmm2, 4 * SIZE(CO2, LDC, 2) movhps %xmm2, 5 * SIZE(CO2, LDC, 2) movlps %xmm3, 6 * SIZE(CO2, LDC, 2) movhps %xmm3, 7 * SIZE(CO2, LDC, 2) addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- BRANCH jg .L11 ALIGN_4 .L20: testq $2, M BRANCH jle .L30 ALIGN_4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif 
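/* M % 4 == 2 tail of this four-column panel: the same accumulate-then-scale
   scheme as the 4x4 tile above, but only xmm8-xmm11 are live, holding a 2x4
   block of the real product (two doubles per register, one register per
   column of C). */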
movaps -16 * SIZE(AO), %xmm0 movaps -17 * SIZE(BO), %xmm2 movaps -15 * SIZE(BO), %xmm3 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 movaps %xmm3, %xmm8 movaps %xmm3, %xmm9 movaps %xmm3, %xmm10 movaps %xmm3, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_4 .L22: addpd %xmm3, %xmm11 movaps -15 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 addpd %xmm2, %xmm9 movaps -13 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -14 * SIZE(AO), %xmm0 addpd %xmm3, %xmm11 movaps -11 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 addpd %xmm2, %xmm9 movaps -9 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 addpd %xmm3, %xmm11 movaps -7 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 addpd %xmm2, %xmm9 movaps -5 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -10 * SIZE(AO), %xmm0 addpd %xmm3, %xmm11 movaps -3 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 subq $ -8 * SIZE, AO addpd %xmm2, %xmm9 movaps -1 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: addpd %xmm3, %xmm11 movaps -15 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 addpd %xmm5, %xmm10 mulpd %xmm0, %xmm7 addpd %xmm2, %xmm9 movaps -13 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 addpd %xmm7, %xmm8 mulpd %xmm0, %xmm5 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_4 .L28: movups ALPHA_R, %xmm7 addpd %xmm3, %xmm11 addpd %xmm5, %xmm10 movaps %xmm8, %xmm0 movsd %xmm9, %xmm8 movsd %xmm0, %xmm9 movaps %xmm10, %xmm0 movsd %xmm11, %xmm10 movsd %xmm0, %xmm11 movsd 0 * SIZE(CO1), %xmm0 movhps 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhps 3 * SIZE(CO1), %xmm1 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm8 addpd %xmm8, %xmm1 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movlps %xmm1, 2 * SIZE(CO1) movhps %xmm1, 3 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhps 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 movhps 3 * SIZE(CO2), %xmm1 movddup %xmm9, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm9, %xmm9 mulpd %xmm7, %xmm9 addpd %xmm9, %xmm1 movlps %xmm0, 0 * SIZE(CO2) movhps %xmm0, 1 * SIZE(CO2) movlps %xmm1, 2 * SIZE(CO2) movhps %xmm1, 3 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhps 1 * SIZE(CO1, LDC, 2), %xmm0 movsd 2 * SIZE(CO1, LDC, 2), %xmm1 movhps 3 * SIZE(CO1, LDC, 2), %xmm1 movddup %xmm10, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm10, %xmm10 mulpd %xmm7, %xmm10 addpd %xmm10, %xmm1 movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movhps %xmm0, 1 * SIZE(CO1, LDC, 2) movlps %xmm1, 2 * 
SIZE(CO1, LDC, 2) movhps %xmm1, 3 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhps 1 * SIZE(CO2, LDC, 2), %xmm0 movsd 2 * SIZE(CO2, LDC, 2), %xmm1 movhps 3 * SIZE(CO2, LDC, 2), %xmm1 movddup %xmm11, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm11, %xmm11 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm1 movlps %xmm0, 0 * SIZE(CO2, LDC, 2) movhps %xmm0, 1 * SIZE(CO2, LDC, 2) movlps %xmm1, 2 * SIZE(CO2, LDC, 2) movhps %xmm1, 3 * SIZE(CO2, LDC, 2) addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L30: testq $1, M BRANCH jle .L39 ALIGN_4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax addq %rax, AO leaq (BO, %rax, 4), BO #endif movsd -16 * SIZE(AO), %xmm0 movaps -17 * SIZE(BO), %xmm2 movaps -15 * SIZE(BO), %xmm3 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -15 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -13 * SIZE(BO), %xmm2 addpd %xmm3, %xmm9 movaps -11 * SIZE(BO), %xmm3 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm10 movaps -9 * SIZE(BO), %xmm2 addpd %xmm3, %xmm11 movaps -7 * SIZE(BO), %xmm3 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -13 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -5 * SIZE(BO), %xmm2 addpd %xmm3, %xmm9 movaps -3 * SIZE(BO), %xmm3 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -12 * SIZE(AO), %xmm0 addpd %xmm2, %xmm10 movaps -1 * SIZE(BO), %xmm2 addpd %xmm3, %xmm11 movaps 1 * SIZE(BO), %xmm3 subq $ -4 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 movsd -15 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -13 * SIZE(BO), %xmm2 addpd %xmm3, %xmm9 movaps -11 * SIZE(BO), %xmm3 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_4 .L38: movups ALPHA_R, %xmm7 addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 movsd 0 * SIZE(CO1), %xmm0 movhps 1 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm1 movhps 1 * SIZE(CO2), %xmm1 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm8 addpd %xmm8, %xmm1 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO2) movhps %xmm1, 1 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhps 1 * SIZE(CO1, LDC, 2), %xmm0 movsd 0 * SIZE(CO2, LDC, 2), %xmm1 movhps 1 * SIZE(CO2, LDC, 2), %xmm1 movddup %xmm9, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm9, %xmm9 mulpd %xmm7, %xmm9 addpd %xmm9, %xmm1 movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movhps %xmm0, 1 * SIZE(CO1, LDC, 2) movlps %xmm1, 0 * SIZE(CO2, LDC, 2) movhps %xmm1, 1 * SIZE(CO2, LDC, 2) ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK #endif movq BO, B leaq (C, LDC, 4), C subq $1, J BRANCH jg .L01 ALIGN_4 .L40: testq $2, N BRANCH jle 
.L80 movq C, CO1 leaq (C, LDC, 1), CO2 movq A, AO #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq K, %rax salq $BASE_SHIFT + 1, %rax leaq (B, %rax), BB movq M, I sarq $2, I # i = (m >> 2) NOBRANCH jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif PREFETCHB -16 * SIZE(BB) subq $-4 * SIZE, BB movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 movaps -17 * SIZE(BO), %xmm2 PREFETCHW 3 * SIZE(CO1) xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 PREFETCHW 3 * SIZE(CO2) xorps %xmm12, %xmm12 xorps %xmm13, %xmm13 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_4 .L52: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -15 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -6 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -13 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -2 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -11 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 2 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -9 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax BRANCH jg .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addpd %xmm2, %xmm9 movaps -15 * SIZE(BO), %xmm2 addpd %xmm4, %xmm13 addpd %xmm7, %xmm8 addpd %xmm6, %xmm12 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_4 .L58: movups ALPHA_R, %xmm7 movaps %xmm8, %xmm0 movsd %xmm9, %xmm8 movsd %xmm0, %xmm9 movaps %xmm12, %xmm0 movsd %xmm13, %xmm12 movsd %xmm0, %xmm13 movsd 0 * SIZE(CO1), %xmm0 movhps 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhps 3 * SIZE(CO1), %xmm1 movsd 4 * SIZE(CO1), %xmm2 movhps 5 * SIZE(CO1), %xmm2 movsd 6 * SIZE(CO1), %xmm3 movhps 7 * SIZE(CO1), %xmm3 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm8 addpd %xmm8, %xmm1 movddup %xmm12, %xmm5 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 unpckhpd %xmm12, %xmm12 mulpd 
%xmm7, %xmm12 addpd %xmm12, %xmm3 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movlps %xmm1, 2 * SIZE(CO1) movhps %xmm1, 3 * SIZE(CO1) movlps %xmm2, 4 * SIZE(CO1) movhps %xmm2, 5 * SIZE(CO1) movlps %xmm3, 6 * SIZE(CO1) movhps %xmm3, 7 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhps 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 movhps 3 * SIZE(CO2), %xmm1 movsd 4 * SIZE(CO2), %xmm2 movhps 5 * SIZE(CO2), %xmm2 movsd 6 * SIZE(CO2), %xmm3 movhps 7 * SIZE(CO2), %xmm3 movddup %xmm9, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm9, %xmm9 mulpd %xmm7, %xmm9 addpd %xmm9, %xmm1 movddup %xmm13, %xmm5 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 unpckhpd %xmm13, %xmm13 mulpd %xmm7, %xmm13 addpd %xmm13, %xmm3 movlps %xmm0, 0 * SIZE(CO2) movhps %xmm0, 1 * SIZE(CO2) movlps %xmm1, 2 * SIZE(CO2) movhps %xmm1, 3 * SIZE(CO2) movlps %xmm2, 4 * SIZE(CO2) movhps %xmm2, 5 * SIZE(CO2) movlps %xmm3, 6 * SIZE(CO2) movhps %xmm3, 7 * SIZE(CO2) addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 decq I BRANCH jg .L51 ALIGN_4 .L60: testq $2, M BRANCH jle .L70 ALIGN_4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif movaps -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movaps -17 * SIZE(BO), %xmm2 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_4 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 addpd %xmm7, %xmm8 movaps -15 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 addpd %xmm2, %xmm11 addpd %xmm7, %xmm10 movaps -13 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -10 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 addpd %xmm7, %xmm8 movaps -11 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -8 * SIZE(AO), %xmm0 addpd %xmm2, %xmm11 addpd %xmm7, %xmm10 movaps -9 * SIZE(BO), %xmm2 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 addpd %xmm7, %xmm8 movaps -15 * SIZE(BO), %xmm2 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_4 .L68: movups ALPHA_R, %xmm7 addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 movaps %xmm8, %xmm0 movsd %xmm9, %xmm8 movsd %xmm0, %xmm9 movsd 0 * SIZE(CO1), %xmm0 movhps 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhps 3 * SIZE(CO1), %xmm1 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm8 addpd %xmm8, %xmm1 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movlps %xmm1, 2 * SIZE(CO1) movhps %xmm1, 3 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhps 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 movhps 3 * SIZE(CO2), %xmm1 movddup %xmm9, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm9, %xmm9 mulpd 
%xmm7, %xmm9 addpd %xmm9, %xmm1 movlps %xmm0, 0 * SIZE(CO2) movhps %xmm0, 1 * SIZE(CO2) movlps %xmm1, 2 * SIZE(CO2) movhps %xmm1, 3 * SIZE(CO2) addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 ALIGN_4 .L70: testq $1, M BRANCH jle .L79 ALIGN_4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax addq %rax, AO leaq (BO, %rax, 2), BO #endif movsd -16 * SIZE(AO), %xmm0 movaps -17 * SIZE(BO), %xmm2 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -15 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -15 * SIZE(BO), %xmm2 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -14 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 movaps -13 * SIZE(BO), %xmm2 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -13 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -11 * SIZE(BO), %xmm2 shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -12 * SIZE(AO), %xmm0 addpd %xmm2, %xmm9 movaps -9 * SIZE(BO), %xmm2 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: shufps $0x44, %xmm0, %xmm0 mulpd %xmm0, %xmm2 movsd -15 * SIZE(AO), %xmm0 addpd %xmm2, %xmm8 movaps -15 * SIZE(BO), %xmm2 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L76 ALIGN_4 .L78: movups ALPHA_R, %xmm7 addpd %xmm9, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhps 1 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm1 movhps 1 * SIZE(CO2), %xmm1 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm8 addpd %xmm8, %xmm1 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO2) movhps %xmm1, 1 * SIZE(CO2) ALIGN_4 .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif leaq (C, LDC, 2), C movq BO, B ALIGN_4 .L80: testq $1, N BRANCH jle .L999 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 movq A, AO movq M, I sarq $2, I # i = (m >> 2) NOBRANCH jle .L100 ALIGN_4 .L91: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO addq %rax, BO #endif movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 movsd -17 * SIZE(BO), %xmm2 PREFETCHW 3 * SIZE(CO1) xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm12, %xmm12 xorps %xmm13, %xmm13 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L95 ALIGN_4 .L92: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -16 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps -10 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 pshufd $0x44, %xmm2, %xmm3 
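/* pshufd $0x44 duplicates the low double of xmm2 (the single B value of this
   one-column panel) into both lanes, so each mulpd below covers two packed
   A elements at once. */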
pshufd $0x44, %xmm2, %xmm4 movsd -15 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps -6 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -14 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps -2 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -13 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps 2 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 subq $-16 * SIZE, AO subq $ -4 * SIZE, BO subq $1, %rax BRANCH jg .L92 ALIGN_4 .L95: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: pshufd $0x44, %xmm2, %xmm3 pshufd $0x44, %xmm2, %xmm4 movsd -16 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm4 movaps -10 * SIZE(AO), %xmm1 addpd %xmm3, %xmm8 addpd %xmm4, %xmm12 addq $4 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L96 ALIGN_4 .L98: movups ALPHA_R, %xmm7 movsd 0 * SIZE(CO1), %xmm0 movhps 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhps 3 * SIZE(CO1), %xmm1 movsd 4 * SIZE(CO1), %xmm2 movhps 5 * SIZE(CO1), %xmm2 movsd 6 * SIZE(CO1), %xmm3 movhps 7 * SIZE(CO1), %xmm3 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm8 addpd %xmm8, %xmm1 movddup %xmm12, %xmm5 mulpd %xmm7, %xmm5 addpd %xmm5, %xmm2 unpckhpd %xmm12, %xmm12 mulpd %xmm7, %xmm12 addpd %xmm12, %xmm3 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movlps %xmm1, 2 * SIZE(CO1) movhps %xmm1, 3 * SIZE(CO1) movlps %xmm2, 4 * SIZE(CO1) movhps %xmm2, 5 * SIZE(CO1) movlps %xmm3, 6 * SIZE(CO1) movhps %xmm3, 7 * SIZE(CO1) addq $8 * SIZE, CO1 decq I BRANCH jg .L91 ALIGN_4 .L100: testq $2, M BRANCH jle .L110 ALIGN_4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO addq %rax, BO #endif movaps -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -17 * SIZE(BO), %xmm2 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L105 ALIGN_4 .L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 movsd -16 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -14 * SIZE(AO), %xmm0 addpd %xmm3, %xmm8 pshufd $0x44, %xmm2, %xmm3 movsd -15 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -12 * SIZE(AO), %xmm0 addpd %xmm3, %xmm9 pshufd $0x44, %xmm2, %xmm3 movsd -14 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -10 * SIZE(AO), %xmm0 addpd %xmm3, %xmm8 pshufd $0x44, %xmm2, %xmm3 movsd -13 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -8 * SIZE(AO), %xmm0 addpd %xmm3, %xmm9 subq $-8 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L102 ALIGN_4 .L105: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L108 ALIGN_4 .L106: pshufd $0x44, %xmm2, %xmm3 movsd -16 * SIZE(BO), %xmm2 mulpd %xmm0, %xmm3 movaps -14 * SIZE(AO), %xmm0 addpd %xmm3, %xmm8 addq $2 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L106 ALIGN_4 .L108: 
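/* .L108: write-back of the two-row tail of the last single column.  As in
   every store block of this file, the accumulated product ab is purely real;
   the complex scalar (alpha_r, alpha_i) of the 3M reconstruction is applied
   only here.  Per complex element of C the update is, roughly (a sketch, with
   c pointing at the real part):
       c[0] += alpha_r * ab;     real part
       c[1] += alpha_i * ab;     imaginary part
   movddup copies ab into both lanes so a single mulpd by
   xmm7 = [alpha_r, alpha_i] yields both contributions at once. */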
movups ALPHA_R, %xmm7 addpd %xmm9, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhps 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhps 3 * SIZE(CO1), %xmm1 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm8 addpd %xmm8, %xmm1 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) movlps %xmm1, 2 * SIZE(CO1) movhps %xmm1, 3 * SIZE(CO1) addq $4 * SIZE, CO1 ALIGN_4 .L110: testq $1, M BRANCH jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax addq %rax, AO addq %rax, BO #endif movsd -16 * SIZE(AO), %xmm0 movsd -17 * SIZE(BO), %xmm2 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L115 ALIGN_4 .L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -15 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -14 * SIZE(AO), %xmm0 movsd -15 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -13 * SIZE(AO), %xmm0 movsd -14 * SIZE(BO), %xmm2 mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -12 * SIZE(AO), %xmm0 movsd -13 * SIZE(BO), %xmm2 subq $-4 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L112 ALIGN_4 .L115: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm8 movsd -15 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 addq $1 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L116 ALIGN_4 .L118: movups ALPHA_R, %xmm7 addpd %xmm9, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhps 1 * SIZE(CO1), %xmm0 movddup %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm3m_kernel_4x4_sse2.S000066400000000000000000001625771313527062700221020ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define ALPHA 0(%rsp) #define J 16(%rsp) #define OFFSET 24(%rsp) #define KK 32(%rsp) #define KKK 40(%rsp) #define BUFFER 128(%rsp) #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (8 * 5 + 4) #define movsd movlps #define movapd movaps #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 5 + 4) #define movapd movaps #endif #ifndef GENERIC #define KERNEL1(xx) \ mulpd %xmm0, %xmm1 ;\ addpd %xmm1, %xmm8 ;\ movapd -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulpd %xmm0, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd -14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm0, %xmm5 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ addpd %xmm5, %xmm10 ;\ movapd -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addpd %xmm0, %xmm11 ;\ movapd -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 #define KERNEL2(xx) \ mulpd %xmm2, %xmm1 ;\ addpd %xmm1, %xmm12 ;\ movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulpd %xmm2, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm2, %xmm5 ;\ mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ addpd %xmm5, %xmm14 ;\ movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addpd %xmm2, %xmm15 ;\ movapd -6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 #define KERNEL3(xx) \ mulpd %xmm4, %xmm7 ;\ addpd %xmm7, %xmm8 ;\ movapd -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulpd %xmm4, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm4, %xmm5 ;\ mulpd -2 * SIZE + 2 * (xx) * 
SIZE(BO, %rax, 8), %xmm4 ;\ addpd %xmm5, %xmm10 ;\ movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addpd %xmm4, %xmm11 ;\ movapd -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 #define KERNEL4(xx) \ mulpd %xmm6, %xmm7 ;\ addpd %xmm7, %xmm12 ;\ movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulpd %xmm6, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm6, %xmm5 ;\ mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ addpd %xmm5, %xmm14 ;\ movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ addpd %xmm6, %xmm15 ;\ movapd -2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 #define KERNEL5(xx) \ mulpd %xmm0, %xmm1 ;\ addpd %xmm1, %xmm8 ;\ movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulpd %xmm0, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm0, %xmm5 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ addpd %xmm5, %xmm10 ;\ movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addpd %xmm0, %xmm11 ;\ movapd 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 #define KERNEL6(xx) \ mulpd %xmm2, %xmm1 ;\ addpd %xmm1, %xmm12 ;\ movapd 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulpd %xmm2, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm2, %xmm5 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ addpd %xmm5, %xmm14 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addpd %xmm2, %xmm15 ;\ movapd 2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 #define KERNEL7(xx) \ mulpd %xmm4, %xmm7 ;\ addpd %xmm7, %xmm8 ;\ movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulpd %xmm4, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm4, %xmm5 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ addpd %xmm5, %xmm10 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addpd %xmm4, %xmm11 ;\ movapd 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 #define KERNEL8(xx) \ mulpd %xmm6, %xmm7 ;\ addpd %xmm7, %xmm12 ;\ movapd 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulpd %xmm6, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd 18 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm6, %xmm5 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ addpd %xmm5, %xmm14 ;\ movapd 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addpd %xmm6, %xmm15 ;\ movapd 6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 #else #define KERNEL1(xx) \ mulpd %xmm0, %xmm1 ;\ addpd %xmm1, %xmm8 ;\ movapd -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulpd %xmm0, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd -14 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm0, %xmm5 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ addpd %xmm5, %xmm10 ;\ movapd -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm0, %xmm11 ;\ movapd -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 #define KERNEL2(xx) \ mulpd %xmm2, %xmm1 ;\ addpd %xmm1, %xmm12 ;\ movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulpd %xmm2, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm2, %xmm5 ;\ mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ addpd %xmm5, %xmm14 ;\ movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm2, %xmm15 ;\ movapd -6 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 #define KERNEL3(xx) \ mulpd %xmm4, %xmm7 ;\ addpd %xmm7, %xmm8 ;\ movapd -8 
* SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulpd %xmm4, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm4, %xmm5 ;\ mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ addpd %xmm5, %xmm10 ;\ movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm4, %xmm11 ;\ movapd -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 #define KERNEL4(xx) \ mulpd %xmm6, %xmm7 ;\ addpd %xmm7, %xmm12 ;\ movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulpd %xmm6, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm6, %xmm5 ;\ mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ addpd %xmm5, %xmm14 ;\ movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ addpd %xmm6, %xmm15 ;\ movapd -2 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 #define KERNEL5(xx) \ mulpd %xmm0, %xmm1 ;\ addpd %xmm1, %xmm8 ;\ movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulpd %xmm0, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm0, %xmm5 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ addpd %xmm5, %xmm10 ;\ movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm0, %xmm11 ;\ movapd 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 #define KERNEL6(xx) \ mulpd %xmm2, %xmm1 ;\ addpd %xmm1, %xmm12 ;\ movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulpd %xmm2, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm2, %xmm5 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ addpd %xmm5, %xmm14 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm2, %xmm15 ;\ movapd 2 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 #define KERNEL7(xx) \ mulpd %xmm4, %xmm7 ;\ addpd %xmm7, %xmm8 ;\ movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulpd %xmm4, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm4, %xmm5 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ addpd %xmm5, %xmm10 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm4, %xmm11 ;\ movapd 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 #define KERNEL8(xx) \ mulpd %xmm6, %xmm7 ;\ addpd %xmm7, %xmm12 ;\ movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulpd %xmm6, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm6, %xmm5 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ addpd %xmm5, %xmm14 ;\ movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm6, %xmm15 ;\ movapd 6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 #endif #if defined(OS_LINUX) && defined(CORE_BARCELONA) .align 32768 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif movaps %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif #endif EMMS movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq OLD_M, M movq OLD_N, N subq $-16 * SIZE, A movsd %xmm0, 0 + ALPHA movsd %xmm1, 8 + ALPHA salq $ZBASE_SHIFT, LDC #ifdef 
TRMMKERNEL movsd %xmm12, OFFSET movsd %xmm12, KK #ifndef LEFT negq KK #endif #endif movq N, J sarq $2, J # j = (n >> 2) jle .L40 ALIGN_3 .L01: /* Copying to Sub Buffer */ leaq 16 * SIZE + BUFFER, BO movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq K, %rax sarq $2, %rax jle .L03 ALIGN_3 #define RPREFETCHSIZE (8 * 7 + 4) #define WPREFETCHSIZE (8 * 8 + 4) .L02: PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq %mm0, -16 * SIZE(BO) movq %mm0, -15 * SIZE(BO) movq %mm1, -14 * SIZE(BO) movq %mm1, -13 * SIZE(BO) movq 2 * SIZE(B), %mm2 movq 3 * SIZE(B), %mm3 movq %mm2, -12 * SIZE(BO) movq %mm2, -11 * SIZE(BO) movq %mm3, -10 * SIZE(BO) movq %mm3, -9 * SIZE(BO) PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO) movq 4 * SIZE(B), %mm4 movq 5 * SIZE(B), %mm5 movq %mm4, -8 * SIZE(BO) movq %mm4, -7 * SIZE(BO) movq %mm5, -6 * SIZE(BO) movq %mm5, -5 * SIZE(BO) PREFETCHW (WPREFETCHSIZE + 8) * SIZE(BO) movq 6 * SIZE(B), %mm6 movq 7 * SIZE(B), %mm7 movq %mm6, -4 * SIZE(BO) movq %mm6, -3 * SIZE(BO) movq %mm7, -2 * SIZE(BO) movq %mm7, -1 * SIZE(BO) PREFETCH (RPREFETCHSIZE + 8) * SIZE(B) movq 8 * SIZE(B), %mm0 movq 9 * SIZE(B), %mm1 movq %mm0, 0 * SIZE(BO) movq %mm0, 1 * SIZE(BO) movq %mm1, 2 * SIZE(BO) movq %mm1, 3 * SIZE(BO) movq 10 * SIZE(B), %mm2 movq 11 * SIZE(B), %mm3 movq %mm2, 4 * SIZE(BO) movq %mm2, 5 * SIZE(BO) movq %mm3, 6 * SIZE(BO) movq %mm3, 7 * SIZE(BO) PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) movq 12 * SIZE(B), %mm4 movq 13 * SIZE(B), %mm5 movq %mm4, 8 * SIZE(BO) movq %mm4, 9 * SIZE(BO) movq %mm5, 10 * SIZE(BO) movq %mm5, 11 * SIZE(BO) PREFETCHW (WPREFETCHSIZE + 24) * SIZE(BO) movq 14 * SIZE(B), %mm6 movq 15 * SIZE(B), %mm7 movq %mm6, 12 * SIZE(BO) movq %mm6, 13 * SIZE(BO) movq %mm7, 14 * SIZE(BO) movq %mm7, 15 * SIZE(BO) addq $ 32 * SIZE, BO subq $-16 * SIZE, B subq $1, %rax jne .L02 ALIGN_3 .L03: movq K, %rax andq $3, %rax BRANCH jle .L10 ALIGN_3 .L04: movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq 2 * SIZE(B), %mm2 movq 3 * SIZE(B), %mm3 movq %mm0, -16 * SIZE(BO) movq %mm0, -15 * SIZE(BO) movq %mm1, -14 * SIZE(BO) movq %mm1, -13 * SIZE(BO) movq %mm2, -12 * SIZE(BO) movq %mm2, -11 * SIZE(BO) movq %mm3, -10 * SIZE(BO) movq %mm3, -9 * SIZE(BO) addq $4 * SIZE, B addq $8 * SIZE, BO subq $1, %rax jne .L04 ALIGN_3 .L10: movq A, AO # aoffset = a leaq (RPREFETCHSIZE + 0) * SIZE(B), BB movq M, I sarq $2, I # i = (m >> 2) jle .L20 ALIGN_3 .L11: PREFETCH 0 * SIZE(BB) PREFETCH 8 * SIZE(BB) subq $-16 * SIZE, BB #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm1 pxor %xmm8, %xmm8 movapd -14 * SIZE(AO), %xmm2 movapd -14 * SIZE(BO), %xmm3 pxor %xmm9, %xmm9 movapd -12 * SIZE(AO), %xmm4 movapd -12 * SIZE(BO), %xmm5 pxor %xmm10, %xmm10 movapd -10 * SIZE(AO), %xmm6 movapd -8 * SIZE(BO), %xmm7 pxor %xmm11, %xmm11 PREFETCHW 7 * SIZE(CO1) pxor %xmm12, %xmm12 PREFETCHW 7 * SIZE(CO2) pxor %xmm13, %xmm13 PREFETCHW 7 * SIZE(CO1, LDC, 2) pxor %xmm14, %xmm14 PREFETCHW 7 * SIZE(CO2, LDC, 2) pxor %xmm15, %xmm15 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, 
%rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif #ifndef GENERIC andq $-8, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO negq %rax NOBRANCH je .L15 ALIGN_3 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax BRANCH jl .L12 ALIGN_3 .L15: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif testq $4, %rax je .L16 xorq %rax, %rax ALIGN_3 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addq $32 * SIZE, BO addq $16 * SIZE, AO ALIGN_3 #else sarq $2, %rax NOBRANCH jle .L16 ALIGN_3 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addq $ 32 * SIZE, BO subq $-16 * SIZE, AO decq %rax BRANCH jg .L12 #endif .L16: movapd ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L19 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO negq %rax ALIGN_3 .L17: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movapd -14 * SIZE(BO, %rax, 8), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movapd -12 * SIZE(BO, %rax, 8), %xmm1 mulpd %xmm0, %xmm1 mulpd -10 * SIZE(BO, %rax, 8), %xmm0 
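/* Addressing note: the copy loop at .L02/.L04 stored every B value twice in
   BUFFER (presumably a scalar broadcast done up front, since this pre-SSE3
   path has no movddup), so BO advances eight doubles per k iteration here
   while AO advances four. */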
addpd %xmm1, %xmm10 movapd -16 * SIZE(BO, %rax, 8), %xmm1 addpd %xmm0, %xmm11 movapd -12 * SIZE(AO, %rax, 4), %xmm0 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm12 movapd -14 * SIZE(BO, %rax, 8), %xmm1 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm13 movapd -12 * SIZE(BO, %rax, 8), %xmm1 mulpd %xmm2, %xmm1 mulpd -10 * SIZE(BO, %rax, 8), %xmm2 addpd %xmm1, %xmm14 movapd -8 * SIZE(BO, %rax, 8), %xmm1 addpd %xmm2, %xmm15 movapd -10 * SIZE(AO, %rax, 4), %xmm2 addq $SIZE, %rax jl .L17 ALIGN_3 .L19: movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 movsd 4 * SIZE(CO1), %xmm2 movhpd 5 * SIZE(CO1), %xmm2 movsd 6 * SIZE(CO1), %xmm3 movhpd 7 * SIZE(CO1), %xmm3 pshufd $0x44, %xmm8, %xmm4 unpckhpd %xmm8, %xmm8 pshufd $0x44, %xmm12, %xmm5 unpckhpd %xmm12, %xmm12 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm5 mulpd %xmm7, %xmm12 addpd %xmm4, %xmm0 addpd %xmm8, %xmm1 addpd %xmm5, %xmm2 addpd %xmm12, %xmm3 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 4 * SIZE(CO1) movhpd %xmm2, 5 * SIZE(CO1) movsd %xmm3, 6 * SIZE(CO1) movhpd %xmm3, 7 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 movhpd 3 * SIZE(CO2), %xmm1 movsd 4 * SIZE(CO2), %xmm2 movhpd 5 * SIZE(CO2), %xmm2 movsd 6 * SIZE(CO2), %xmm3 movhpd 7 * SIZE(CO2), %xmm3 pshufd $0x44, %xmm9, %xmm4 unpckhpd %xmm9, %xmm9 pshufd $0x44, %xmm13, %xmm5 unpckhpd %xmm13, %xmm13 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm9 mulpd %xmm7, %xmm5 mulpd %xmm7, %xmm13 addpd %xmm4, %xmm0 addpd %xmm9, %xmm1 addpd %xmm5, %xmm2 addpd %xmm13, %xmm3 movsd %xmm0, 0 * SIZE(CO2) movhpd %xmm0, 1 * SIZE(CO2) movsd %xmm1, 2 * SIZE(CO2) movhpd %xmm1, 3 * SIZE(CO2) movsd %xmm2, 4 * SIZE(CO2) movhpd %xmm2, 5 * SIZE(CO2) movsd %xmm3, 6 * SIZE(CO2) movhpd %xmm3, 7 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 movsd 2 * SIZE(CO1, LDC, 2), %xmm1 movhpd 3 * SIZE(CO1, LDC, 2), %xmm1 movsd 4 * SIZE(CO1, LDC, 2), %xmm2 movhpd 5 * SIZE(CO1, LDC, 2), %xmm2 movsd 6 * SIZE(CO1, LDC, 2), %xmm3 movhpd 7 * SIZE(CO1, LDC, 2), %xmm3 pshufd $0x44, %xmm10, %xmm4 unpckhpd %xmm10, %xmm10 pshufd $0x44, %xmm14, %xmm5 unpckhpd %xmm14, %xmm14 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm10 mulpd %xmm7, %xmm5 mulpd %xmm7, %xmm14 addpd %xmm4, %xmm0 addpd %xmm10, %xmm1 addpd %xmm5, %xmm2 addpd %xmm14, %xmm3 movsd %xmm0, 0 * SIZE(CO1, LDC, 2) movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) movsd %xmm1, 2 * SIZE(CO1, LDC, 2) movhpd %xmm1, 3 * SIZE(CO1, LDC, 2) movsd %xmm2, 4 * SIZE(CO1, LDC, 2) movhpd %xmm2, 5 * SIZE(CO1, LDC, 2) movsd %xmm3, 6 * SIZE(CO1, LDC, 2) movhpd %xmm3, 7 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 movsd 2 * SIZE(CO2, LDC, 2), %xmm1 movhpd 3 * SIZE(CO2, LDC, 2), %xmm1 movsd 4 * SIZE(CO2, LDC, 2), %xmm2 movhpd 5 * SIZE(CO2, LDC, 2), %xmm2 movsd 6 * SIZE(CO2, LDC, 2), %xmm3 movhpd 7 * SIZE(CO2, LDC, 2), %xmm3 pshufd $0x44, %xmm11, %xmm4 unpckhpd %xmm11, %xmm11 pshufd $0x44, %xmm15, %xmm5 unpckhpd %xmm15, %xmm15 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm11 mulpd %xmm7, %xmm5 mulpd %xmm7, %xmm15 addpd %xmm4, %xmm0 addpd %xmm11, %xmm1 addpd %xmm5, %xmm2 addpd %xmm15, %xmm3 movsd %xmm0, 0 * SIZE(CO2, LDC, 2) movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) movsd %xmm1, 2 * SIZE(CO2, LDC, 2) movhpd %xmm1, 3 * SIZE(CO2, LDC, 2) movsd %xmm2, 4 * SIZE(CO2, LDC, 2) movhpd %xmm2, 5 * SIZE(CO2, LDC, 2) movsd %xmm3, 6 * SIZE(CO2, LDC, 2) movhpd %xmm3, 7 * SIZE(CO2, LDC, 2) addq $8 * SIZE, CO1 # 
coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- BRANCH jg .L11 ALIGN_3 .L20: testq $3, M je .L39 testq $2, M je .L30 ALIGN_3 .L21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd 0 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 movapd -8 * SIZE(AO), %xmm2 pxor %xmm10, %xmm10 movapd 8 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 movapd 16 * SIZE(BO), %xmm5 movapd 24 * SIZE(BO), %xmm7 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L25 ALIGN_3 .L22: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd 2 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movapd 4 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 mulpd 6 * SIZE(BO), %xmm0 addpd %xmm1, %xmm10 movapd 32 * SIZE(BO), %xmm1 addpd %xmm0, %xmm11 movapd -14 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm8 movapd 10 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm9 movapd 12 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BO), %xmm0 addpd %xmm3, %xmm10 movapd 40 * SIZE(BO), %xmm3 addpd %xmm0, %xmm11 movapd -12 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm5 addpd %xmm5, %xmm8 movapd 18 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm5 addpd %xmm5, %xmm9 movapd 20 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm5 mulpd 22 * SIZE(BO), %xmm0 addpd %xmm5, %xmm10 movapd 48 * SIZE(BO), %xmm5 addpd %xmm0, %xmm11 movapd -10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm7 addpd %xmm7, %xmm8 movapd 26 * SIZE(BO), %xmm7 mulpd %xmm0, %xmm7 addpd %xmm7, %xmm9 movapd 28 * SIZE(BO), %xmm7 mulpd %xmm0, %xmm7 mulpd 30 * SIZE(BO), %xmm0 addpd %xmm7, %xmm10 movapd 56 * SIZE(BO), %xmm7 addpd %xmm0, %xmm11 movapd 0 * SIZE(AO), %xmm0 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm2, %xmm1 addpd %xmm1, %xmm8 movapd 34 * SIZE(BO), %xmm1 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm9 movapd 36 * SIZE(BO), %xmm1 mulpd %xmm2, %xmm1 mulpd 38 * SIZE(BO), %xmm2 addpd %xmm1, %xmm10 movapd 64 * SIZE(BO), %xmm1 addpd %xmm2, %xmm11 movapd -6 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm8 movapd 42 * SIZE(BO), %xmm3 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm9 movapd 44 * SIZE(BO), %xmm3 mulpd %xmm2, %xmm3 mulpd 46 * SIZE(BO), %xmm2 addpd %xmm3, %xmm10 movapd 72 * SIZE(BO), %xmm3 addpd %xmm2, %xmm11 movapd -4 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm5 addpd %xmm5, %xmm8 movapd 50 * SIZE(BO), %xmm5 mulpd %xmm2, %xmm5 addpd %xmm5, %xmm9 movapd 52 * SIZE(BO), %xmm5 mulpd %xmm2, %xmm5 mulpd 54 * SIZE(BO), %xmm2 addpd %xmm5, %xmm10 movapd 80 * SIZE(BO), %xmm5 addpd %xmm2, %xmm11 movapd -2 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm7 addpd %xmm7, %xmm8 movapd 58 * SIZE(BO), %xmm7 mulpd %xmm2, %xmm7 addpd %xmm7, %xmm9 movapd 60 * SIZE(BO), %xmm7 mulpd %xmm2, %xmm7 mulpd 62 * SIZE(BO), %xmm2 addpd %xmm7, %xmm10 movapd 88 * SIZE(BO), %xmm7 addpd %xmm2, %xmm11 movapd 8 * SIZE(AO), %xmm2 addq $16 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L22 ALIGN_3 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movapd ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L29 ALIGN_3 .L26: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movapd 2 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movapd 4 * 
SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 mulpd 6 * SIZE(BO), %xmm0 addpd %xmm1, %xmm10 movapd 8 * SIZE(BO), %xmm1 addpd %xmm0, %xmm11 movapd -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L26 ALIGN_3 .L29: movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 pshufd $0x44, %xmm8, %xmm4 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm8 addpd %xmm4, %xmm0 addpd %xmm8, %xmm1 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 movhpd 3 * SIZE(CO2), %xmm1 pshufd $0x44, %xmm9, %xmm4 unpckhpd %xmm9, %xmm9 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm9 addpd %xmm4, %xmm0 addpd %xmm9, %xmm1 movsd %xmm0, 0 * SIZE(CO2) movhpd %xmm0, 1 * SIZE(CO2) movsd %xmm1, 2 * SIZE(CO2) movhpd %xmm1, 3 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 movsd 2 * SIZE(CO1, LDC, 2), %xmm1 movhpd 3 * SIZE(CO1, LDC, 2), %xmm1 pshufd $0x44, %xmm10, %xmm4 unpckhpd %xmm10, %xmm10 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm10 addpd %xmm4, %xmm0 addpd %xmm10, %xmm1 movsd %xmm0, 0 * SIZE(CO1, LDC, 2) movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) movsd %xmm1, 2 * SIZE(CO1, LDC, 2) movhpd %xmm1, 3 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 movsd 2 * SIZE(CO2, LDC, 2), %xmm1 movhpd 3 * SIZE(CO2, LDC, 2), %xmm1 pshufd $0x44, %xmm11, %xmm4 unpckhpd %xmm11, %xmm11 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm11 addpd %xmm4, %xmm0 addpd %xmm11, %xmm1 movsd %xmm0, 0 * SIZE(CO2, LDC, 2) movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) movsd %xmm1, 2 * SIZE(CO2, LDC, 2) movhpd %xmm1, 3 * SIZE(CO2, LDC, 2) addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 ALIGN_3 .L30: testq $1, M je .L39 ALIGN_3 .L31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif movsd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movsd 0 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 movsd -8 * SIZE(AO), %xmm2 pxor %xmm10, %xmm10 movsd 8 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 movsd 16 * SIZE(BO), %xmm5 movsd 24 * SIZE(BO), %xmm7 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L35 ALIGN_3 .L32: mulsd %xmm0, %xmm1 addsd %xmm1, %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd 2 * SIZE(BO), %xmm1 mulsd %xmm0, %xmm1 addsd %xmm1, %xmm9 movsd 4 * SIZE(BO), %xmm1 mulsd %xmm0, %xmm1 mulsd 6 * SIZE(BO), %xmm0 addsd %xmm1, %xmm10 movsd 32 * SIZE(BO), %xmm1 addsd %xmm0, %xmm11 movsd -15 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm8 movsd 10 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm9 movsd 12 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 mulsd 14 * SIZE(BO), %xmm0 addsd %xmm3, %xmm10 movsd 40 * SIZE(BO), %xmm3 addsd %xmm0, %xmm11 movsd -14 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm5 addsd %xmm5, %xmm8 movsd 18 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 addsd %xmm5, %xmm9 movsd 20 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 mulsd 22 * SIZE(BO), %xmm0 addsd %xmm5, %xmm10 movsd 48 * SIZE(BO), %xmm5 addsd %xmm0, %xmm11 movsd -13 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm7 addsd %xmm7, 
%xmm8 movsd 26 * SIZE(BO), %xmm7 mulsd %xmm0, %xmm7 addsd %xmm7, %xmm9 movsd 28 * SIZE(BO), %xmm7 mulsd %xmm0, %xmm7 mulsd 30 * SIZE(BO), %xmm0 addsd %xmm7, %xmm10 movsd 56 * SIZE(BO), %xmm7 addsd %xmm0, %xmm11 movsd -12 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm1 addsd %xmm1, %xmm8 movsd 34 * SIZE(BO), %xmm1 mulsd %xmm0, %xmm1 addsd %xmm1, %xmm9 movsd 36 * SIZE(BO), %xmm1 mulsd %xmm0, %xmm1 mulsd 38 * SIZE(BO), %xmm0 addsd %xmm1, %xmm10 movsd 64 * SIZE(BO), %xmm1 addsd %xmm0, %xmm11 movsd -11 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm8 movsd 42 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 addsd %xmm3, %xmm9 movsd 44 * SIZE(BO), %xmm3 mulsd %xmm0, %xmm3 mulsd 46 * SIZE(BO), %xmm0 addsd %xmm3, %xmm10 movsd 72 * SIZE(BO), %xmm3 addsd %xmm0, %xmm11 movsd -10 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm5 addsd %xmm5, %xmm8 movsd 50 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 addsd %xmm5, %xmm9 movsd 52 * SIZE(BO), %xmm5 mulsd %xmm0, %xmm5 mulsd 54 * SIZE(BO), %xmm0 addsd %xmm5, %xmm10 movsd 80 * SIZE(BO), %xmm5 addsd %xmm0, %xmm11 movsd -9 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm7 addsd %xmm7, %xmm8 movsd 58 * SIZE(BO), %xmm7 mulsd %xmm0, %xmm7 addsd %xmm7, %xmm9 movsd 60 * SIZE(BO), %xmm7 mulsd %xmm0, %xmm7 mulsd 62 * SIZE(BO), %xmm0 addsd %xmm7, %xmm10 movsd 88 * SIZE(BO), %xmm7 addsd %xmm0, %xmm11 movsd -8 * SIZE(AO), %xmm0 addq $ 8 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L32 ALIGN_3 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movapd ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: mulsd %xmm0, %xmm1 addsd %xmm1, %xmm8 movsd 2 * SIZE(BO), %xmm1 mulsd %xmm0, %xmm1 addsd %xmm1, %xmm9 movsd 4 * SIZE(BO), %xmm1 mulsd %xmm0, %xmm1 mulsd 6 * SIZE(BO), %xmm0 addsd %xmm1, %xmm10 movsd 8 * SIZE(BO), %xmm1 addsd %xmm0, %xmm11 movsd -15 * SIZE(AO), %xmm0 addq $1 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L36 ALIGN_3 .L38: movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 pshufd $0x44, %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 pshufd $0x44, %xmm9, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movsd %xmm0, 0 * SIZE(CO2) movhpd %xmm0, 1 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 pshufd $0x44, %xmm10, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movsd %xmm0, 0 * SIZE(CO1, LDC, 2) movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 pshufd $0x44, %xmm11, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movsd %xmm0, 0 * SIZE(CO2, LDC, 2) movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) ALIGN_3 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leaq (C, LDC, 4), C # c += 4 * ldc decq J # j -- jg .L01 ALIGN_3 .L40: testq $3, N je .L999 testq $2, N je .L80 ALIGN_4 .L41: /* Copying to Sub Buffer */ leaq BUFFER, BO #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq K, %rax sarq $2, %rax jle .L43 ALIGN_3 .L42: PREFETCH 56 * SIZE(B) movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq 2 * SIZE(B), %mm2 movq 3 * SIZE(B), %mm3 movq 4 * SIZE(B), %mm4 movq 5 * SIZE(B), %mm5 movq 6 * SIZE(B), %mm6 movq 7 * SIZE(B), %mm7 addq $ 8 * SIZE, B addq $16 * SIZE, BO movq %mm0, -16 * SIZE(BO) movq %mm0, -15 * SIZE(BO) movq %mm1, -14 * SIZE(BO) movq %mm1, -13 * SIZE(BO) movq %mm2, -12 * SIZE(BO) movq %mm2, -11 * SIZE(BO) movq %mm3, -10 * SIZE(BO) movq %mm3, -9 * SIZE(BO) movq %mm4, -8 * SIZE(BO) movq %mm4, -7 * SIZE(BO) movq %mm5, -6 
* SIZE(BO) movq %mm5, -5 * SIZE(BO) movq %mm6, -4 * SIZE(BO) movq %mm6, -3 * SIZE(BO) movq %mm7, -2 * SIZE(BO) movq %mm7, -1 * SIZE(BO) decq %rax jne .L42 ALIGN_3 .L43: movq K, %rax andq $3, %rax BRANCH jle .L50 ALIGN_3 .L44: movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq %mm0, 0 * SIZE(BO) movq %mm0, 1 * SIZE(BO) movq %mm1, 2 * SIZE(BO) movq %mm1, 3 * SIZE(BO) addq $2 * SIZE, B addq $4 * SIZE, BO decq %rax jne .L44 ALIGN_3 .L50: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a movq M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_3 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd 0 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 movapd -8 * SIZE(AO), %xmm2 pxor %xmm12, %xmm12 movapd 8 * SIZE(BO), %xmm3 pxor %xmm13, %xmm13 movapd 0 * SIZE(AO), %xmm4 movapd 16 * SIZE(BO), %xmm5 movapd 8 * SIZE(AO), %xmm6 movapd 24 * SIZE(BO), %xmm7 PREFETCHW 7 * SIZE(CO1) PREFETCHW 7 * SIZE(CO2) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L55 ALIGN_3 .L52: mulpd %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd 2 * SIZE(BO), %xmm0 addpd %xmm1, %xmm8 movapd 0 * SIZE(BO), %xmm1 addpd %xmm0, %xmm9 movapd -14 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd 2 * SIZE(BO), %xmm0 addpd %xmm1, %xmm12 movapd 4 * SIZE(BO), %xmm1 addpd %xmm0, %xmm13 movapd -12 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd 6 * SIZE(BO), %xmm0 addpd %xmm1, %xmm8 movapd 4 * SIZE(BO), %xmm1 addpd %xmm0, %xmm9 movapd -10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd 6 * SIZE(BO), %xmm0 addpd %xmm1, %xmm12 movapd 32 * SIZE(BO), %xmm1 addpd %xmm0, %xmm13 movapd 16 * SIZE(AO), %xmm0 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm2, %xmm3 mulpd 10 * SIZE(BO), %xmm2 addpd %xmm3, %xmm8 movapd 8 * SIZE(BO), %xmm3 addpd %xmm2, %xmm9 movapd -6 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm3 mulpd 10 * SIZE(BO), %xmm2 addpd %xmm3, %xmm12 movapd 12 * SIZE(BO), %xmm3 addpd %xmm2, %xmm13 movapd -4 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm3 mulpd 14 * SIZE(BO), %xmm2 addpd %xmm3, %xmm8 movapd 12 * SIZE(BO), %xmm3 addpd %xmm2, %xmm9 movapd -2 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm3 mulpd 14 * SIZE(BO), %xmm2 addpd %xmm3, %xmm12 movapd 40 * SIZE(BO), %xmm3 addpd %xmm2, %xmm13 movapd 24 * SIZE(AO), %xmm2 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) mulpd %xmm4, %xmm5 mulpd 18 * SIZE(BO), %xmm4 addpd %xmm5, %xmm8 movapd 16 * SIZE(BO), %xmm5 addpd %xmm4, %xmm9 movapd 2 * SIZE(AO), %xmm4 mulpd %xmm4, %xmm5 mulpd 18 * SIZE(BO), %xmm4 addpd %xmm5, %xmm12 movapd 20 * SIZE(BO), %xmm5 addpd %xmm4, %xmm13 movapd 4 * SIZE(AO), %xmm4 mulpd %xmm4, %xmm5 mulpd 22 * SIZE(BO), %xmm4 addpd %xmm5, %xmm8 movapd 20 * SIZE(BO), %xmm5 addpd %xmm4, %xmm9 movapd 6 * SIZE(AO), %xmm4 mulpd %xmm4, %xmm5 mulpd 22 * SIZE(BO), %xmm4 addpd %xmm5, %xmm12 movapd 48 * SIZE(BO), %xmm5 addpd %xmm4, %xmm13 movapd 32 * SIZE(AO), %xmm4 PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) mulpd %xmm6, %xmm7 mulpd 26 * SIZE(BO), %xmm6 addpd %xmm7, %xmm8 movapd 24 * SIZE(BO), %xmm7 addpd %xmm6, %xmm9 movapd 10 * SIZE(AO), %xmm6 mulpd %xmm6, %xmm7 mulpd 26 * SIZE(BO), %xmm6 addpd %xmm7, %xmm12 
movapd 28 * SIZE(BO), %xmm7 addpd %xmm6, %xmm13 movapd 12 * SIZE(AO), %xmm6 mulpd %xmm6, %xmm7 mulpd 30 * SIZE(BO), %xmm6 addpd %xmm7, %xmm8 movapd 28 * SIZE(BO), %xmm7 addpd %xmm6, %xmm9 movapd 14 * SIZE(AO), %xmm6 mulpd %xmm6, %xmm7 mulpd 30 * SIZE(BO), %xmm6 addpd %xmm7, %xmm12 movapd 56 * SIZE(BO), %xmm7 addpd %xmm6, %xmm13 movapd 40 * SIZE(AO), %xmm6 addq $32 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L52 ALIGN_3 .L55: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movapd ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L59 ALIGN_3 .L56: movapd 0 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 mulpd 2 * SIZE(BO), %xmm0 addpd %xmm0, %xmm9 movapd -14 * SIZE(AO), %xmm0 movapd 0 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm12 mulpd 2 * SIZE(BO), %xmm0 addpd %xmm0, %xmm13 movapd -12 * SIZE(AO), %xmm0 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L56 ALIGN_3 .L59: movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 movsd 4 * SIZE(CO1), %xmm2 movhpd 5 * SIZE(CO1), %xmm2 movsd 6 * SIZE(CO1), %xmm3 movhpd 7 * SIZE(CO1), %xmm3 pshufd $0x44, %xmm8, %xmm4 unpckhpd %xmm8, %xmm8 pshufd $0x44, %xmm12, %xmm5 unpckhpd %xmm12, %xmm12 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm5 mulpd %xmm7, %xmm12 addpd %xmm4, %xmm0 addpd %xmm8, %xmm1 addpd %xmm5, %xmm2 addpd %xmm12, %xmm3 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 4 * SIZE(CO1) movhpd %xmm2, 5 * SIZE(CO1) movsd %xmm3, 6 * SIZE(CO1) movhpd %xmm3, 7 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 movhpd 3 * SIZE(CO2), %xmm1 movsd 4 * SIZE(CO2), %xmm2 movhpd 5 * SIZE(CO2), %xmm2 movsd 6 * SIZE(CO2), %xmm3 movhpd 7 * SIZE(CO2), %xmm3 pshufd $0x44, %xmm9, %xmm4 unpckhpd %xmm9, %xmm9 pshufd $0x44, %xmm13, %xmm5 unpckhpd %xmm13, %xmm13 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm9 mulpd %xmm7, %xmm5 mulpd %xmm7, %xmm13 addpd %xmm4, %xmm0 addpd %xmm9, %xmm1 addpd %xmm5, %xmm2 addpd %xmm13, %xmm3 movsd %xmm0, 0 * SIZE(CO2) movhpd %xmm0, 1 * SIZE(CO2) movsd %xmm1, 2 * SIZE(CO2) movhpd %xmm1, 3 * SIZE(CO2) movsd %xmm2, 4 * SIZE(CO2) movhpd %xmm2, 5 * SIZE(CO2) movsd %xmm3, 6 * SIZE(CO2) movhpd %xmm3, 7 * SIZE(CO2) addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L51 ALIGN_3 .L60: testq $2, M je .L70 ALIGN_3 .L61: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd 0 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 movapd -8 * SIZE(AO), %xmm2 pxor %xmm10, %xmm10 movapd 8 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 movapd 16 * SIZE(BO), %xmm5 movapd 24 * SIZE(BO), %xmm7 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L65 ALIGN_3 .L62: mulpd %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd 2 * SIZE(BO), %xmm0 addpd %xmm1, %xmm8 movapd 4 * SIZE(BO), %xmm1 addpd %xmm0, %xmm9 movapd -14 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd 6 * SIZE(BO), %xmm0 addpd %xmm1, %xmm10 movapd 32 * 
SIZE(BO), %xmm1 addpd %xmm0, %xmm11 movapd -12 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm3 mulpd 10 * SIZE(BO), %xmm0 addpd %xmm3, %xmm8 movapd 12 * SIZE(BO), %xmm3 addpd %xmm0, %xmm9 movapd -10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BO), %xmm0 addpd %xmm3, %xmm10 movapd 40 * SIZE(BO), %xmm3 addpd %xmm0, %xmm11 movapd 0 * SIZE(AO), %xmm0 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm2, %xmm5 mulpd 18 * SIZE(BO), %xmm2 addpd %xmm5, %xmm8 movapd 20 * SIZE(BO), %xmm5 addpd %xmm2, %xmm9 movapd -6 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm5 mulpd 22 * SIZE(BO), %xmm2 addpd %xmm5, %xmm10 movapd 48 * SIZE(BO), %xmm5 addpd %xmm2, %xmm11 movapd -4 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm7 mulpd 26 * SIZE(BO), %xmm2 addpd %xmm7, %xmm8 movapd 28 * SIZE(BO), %xmm7 addpd %xmm2, %xmm9 movapd -2 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm7 mulpd 30 * SIZE(BO), %xmm2 addpd %xmm7, %xmm10 movapd 56 * SIZE(BO), %xmm7 addpd %xmm2, %xmm11 movapd 8 * SIZE(AO), %xmm2 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L62 ALIGN_3 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movapd ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L69 ALIGN_3 .L66: mulpd %xmm0, %xmm1 mulpd 2 * SIZE(BO), %xmm0 addpd %xmm1, %xmm8 movapd 4 * SIZE(BO), %xmm1 addpd %xmm0, %xmm9 movapd -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_3 .L69: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 pshufd $0x44, %xmm8, %xmm4 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm8 addpd %xmm4, %xmm0 addpd %xmm8, %xmm1 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 movhpd 3 * SIZE(CO2), %xmm1 pshufd $0x44, %xmm9, %xmm4 unpckhpd %xmm9, %xmm9 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm9 addpd %xmm4, %xmm0 addpd %xmm9, %xmm1 movsd %xmm0, 0 * SIZE(CO2) movhpd %xmm0, 1 * SIZE(CO2) movsd %xmm1, 2 * SIZE(CO2) movhpd %xmm1, 3 * SIZE(CO2) addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_3 .L70: testq $1, M je .L79 ALIGN_3 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movsd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movsd 0 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 movsd -12 * SIZE(AO), %xmm2 pxor %xmm10, %xmm10 movsd 8 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 movsd 16 * SIZE(BO), %xmm5 movsd 24 * SIZE(BO), %xmm7 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L75 ALIGN_3 .L72: mulsd %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulsd 2 * SIZE(BO), %xmm0 addsd %xmm1, %xmm8 movsd 4 * SIZE(BO), %xmm1 addsd %xmm0, %xmm9 movsd -15 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm1 mulsd 6 * SIZE(BO), %xmm0 addsd %xmm1, %xmm10 movsd 32 * SIZE(BO), %xmm1 addsd %xmm0, %xmm11 movsd -14 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm3 mulsd 10 * SIZE(BO), %xmm0 addsd %xmm3, %xmm8 movsd 12 * SIZE(BO), %xmm3 addsd %xmm0, %xmm9 movsd -13 * SIZE(AO), %xmm0 mulsd %xmm0, %xmm3 mulsd 14 * SIZE(BO), %xmm0 
addsd %xmm3, %xmm10 movsd 40 * SIZE(BO), %xmm3 addsd %xmm0, %xmm11 movsd -8 * SIZE(AO), %xmm0 mulsd %xmm2, %xmm5 mulsd 18 * SIZE(BO), %xmm2 addsd %xmm5, %xmm8 movsd 20 * SIZE(BO), %xmm5 addsd %xmm2, %xmm9 movsd -11 * SIZE(AO), %xmm2 mulsd %xmm2, %xmm5 mulsd 22 * SIZE(BO), %xmm2 addsd %xmm5, %xmm10 movsd 48 * SIZE(BO), %xmm5 addsd %xmm2, %xmm11 movsd -10 * SIZE(AO), %xmm2 mulsd %xmm2, %xmm7 mulsd 26 * SIZE(BO), %xmm2 addsd %xmm7, %xmm8 movsd 28 * SIZE(BO), %xmm7 addsd %xmm2, %xmm9 movsd -9 * SIZE(AO), %xmm2 mulsd %xmm2, %xmm7 mulsd 30 * SIZE(BO), %xmm2 addsd %xmm7, %xmm10 movsd 56 * SIZE(BO), %xmm7 addsd %xmm2, %xmm11 movsd -4 * SIZE(AO), %xmm2 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L72 ALIGN_3 .L75: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movapd ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: mulsd %xmm0, %xmm1 mulsd 2 * SIZE(BO), %xmm0 addsd %xmm1, %xmm8 addsd %xmm0, %xmm9 movsd -15 * SIZE(AO), %xmm0 movsd 4 * SIZE(BO), %xmm1 addq $1 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L76 ALIGN_3 .L78: addsd %xmm10, %xmm8 addsd %xmm11, %xmm9 movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 pshufd $0x44, %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 pshufd $0x44, %xmm9, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movsd %xmm0, 0 * SIZE(CO2) movhpd %xmm0, 1 * SIZE(CO2) ALIGN_3 .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leaq (C, LDC, 2), C ALIGN_3 .L80: testq $1, N je .L999 ALIGN_4 .L81: /* Copying to Sub Buffer */ leaq BUFFER, BO #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq K, %rax sarq $3, %rax jle .L83 ALIGN_3 .L82: PREFETCH 56 * SIZE(B) movq 0 * SIZE(B), %mm0 movq 1 * SIZE(B), %mm1 movq 2 * SIZE(B), %mm2 movq 3 * SIZE(B), %mm3 movq 4 * SIZE(B), %mm4 movq 5 * SIZE(B), %mm5 movq 6 * SIZE(B), %mm6 movq 7 * SIZE(B), %mm7 addq $ 8 * SIZE, B addq $16 * SIZE, BO movq %mm0, -16 * SIZE(BO) movq %mm0, -15 * SIZE(BO) movq %mm1, -14 * SIZE(BO) movq %mm1, -13 * SIZE(BO) movq %mm2, -12 * SIZE(BO) movq %mm2, -11 * SIZE(BO) movq %mm3, -10 * SIZE(BO) movq %mm3, -9 * SIZE(BO) movq %mm4, -8 * SIZE(BO) movq %mm4, -7 * SIZE(BO) movq %mm5, -6 * SIZE(BO) movq %mm5, -5 * SIZE(BO) movq %mm6, -4 * SIZE(BO) movq %mm6, -3 * SIZE(BO) movq %mm7, -2 * SIZE(BO) movq %mm7, -1 * SIZE(BO) decq %rax jne .L82 ALIGN_3 .L83: movq K, %rax andq $7, %rax BRANCH jle .L90 ALIGN_3 .L84: movq 0 * SIZE(B), %mm0 movq %mm0, 0 * SIZE(BO) movq %mm0, 1 * SIZE(BO) addq $1 * SIZE, B addq $2 * SIZE, BO decq %rax jne .L84 ALIGN_3 .L90: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a movq M, I sarq $2, I # i = (m >> 2) jle .L100 ALIGN_3 .L91: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd 0 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 movapd -8 * SIZE(AO), %xmm2 pxor %xmm10, %xmm10 movapd 8 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 movapd 0 * SIZE(AO), %xmm4 movapd 8 * SIZE(AO), %xmm6 PREFETCHW 7 * SIZE(CO1) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax 
#else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L95 ALIGN_3 .L92: mulpd %xmm1, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd -14 * SIZE(AO), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movapd 2 * SIZE(BO), %xmm1 mulpd %xmm1, %xmm0 mulpd -10 * SIZE(AO), %xmm1 addpd %xmm0, %xmm10 movapd 16 * SIZE(AO), %xmm0 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) addpd %xmm1, %xmm11 movapd 4 * SIZE(BO), %xmm1 mulpd %xmm1, %xmm2 mulpd -6 * SIZE(AO), %xmm1 addpd %xmm2, %xmm8 movapd -4 * SIZE(AO), %xmm2 addpd %xmm1, %xmm9 movapd 6 * SIZE(BO), %xmm1 mulpd %xmm1, %xmm2 mulpd -2 * SIZE(AO), %xmm1 addpd %xmm2, %xmm10 movapd 24 * SIZE(AO), %xmm2 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addpd %xmm1, %xmm11 movapd 16 * SIZE(BO), %xmm1 mulpd %xmm3, %xmm4 mulpd 2 * SIZE(AO), %xmm3 addpd %xmm4, %xmm8 movapd 4 * SIZE(AO), %xmm4 addpd %xmm3, %xmm9 movapd 10 * SIZE(BO), %xmm3 mulpd %xmm3, %xmm4 mulpd 6 * SIZE(AO), %xmm3 addpd %xmm4, %xmm10 movapd 32 * SIZE(AO), %xmm4 PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) addpd %xmm3, %xmm11 movapd 12 * SIZE(BO), %xmm3 mulpd %xmm3, %xmm6 mulpd 10 * SIZE(AO), %xmm3 addpd %xmm6, %xmm8 movapd 12 * SIZE(AO), %xmm6 addpd %xmm3, %xmm9 movapd 14 * SIZE(BO), %xmm3 mulpd %xmm3, %xmm6 mulpd 14 * SIZE(AO), %xmm3 addpd %xmm6, %xmm10 movapd 40 * SIZE(AO), %xmm6 addpd %xmm3, %xmm11 movapd 24 * SIZE(BO), %xmm3 addq $32 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L92 ALIGN_3 .L95: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movapd ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L99 ALIGN_3 .L96: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO), %xmm1 addpd %xmm0, %xmm8 movapd -12 * SIZE(AO), %xmm0 addpd %xmm1, %xmm9 movapd 2 * SIZE(BO), %xmm1 addq $4 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L96 ALIGN_3 .L99: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 movsd 4 * SIZE(CO1), %xmm2 movhpd 5 * SIZE(CO1), %xmm2 movsd 6 * SIZE(CO1), %xmm3 movhpd 7 * SIZE(CO1), %xmm3 pshufd $0x44, %xmm8, %xmm4 unpckhpd %xmm8, %xmm8 pshufd $0x44, %xmm9, %xmm5 unpckhpd %xmm9, %xmm9 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm8 mulpd %xmm7, %xmm5 mulpd %xmm7, %xmm9 addpd %xmm4, %xmm0 addpd %xmm8, %xmm1 addpd %xmm5, %xmm2 addpd %xmm9, %xmm3 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) movsd %xmm2, 4 * SIZE(CO1) movhpd %xmm2, 5 * SIZE(CO1) movsd %xmm3, 6 * SIZE(CO1) movhpd %xmm3, 7 * SIZE(CO1) addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L91 ALIGN_3 .L100: testq $2, M je .L110 ALIGN_3 .L101: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd 0 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 movapd -8 * SIZE(AO), %xmm2 pxor %xmm10, %xmm10 movapd 8 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L105 ALIGN_3 .L102: mulpd %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -14 * SIZE(AO), %xmm0 mulpd 2 * SIZE(BO), %xmm0 addpd %xmm1, %xmm8 movapd 
16 * SIZE(BO), %xmm1 addpd %xmm0, %xmm9 movapd -12 * SIZE(AO), %xmm0 mulpd 4 * SIZE(BO), %xmm0 addpd %xmm0, %xmm10 movapd -10 * SIZE(AO), %xmm0 mulpd 6 * SIZE(BO), %xmm0 addpd %xmm0, %xmm11 movapd 0 * SIZE(AO), %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm2, %xmm3 movapd -6 * SIZE(AO), %xmm2 mulpd 10 * SIZE(BO), %xmm2 addpd %xmm3, %xmm8 movapd 24 * SIZE(BO), %xmm3 addpd %xmm2, %xmm9 movapd -4 * SIZE(AO), %xmm2 mulpd 12 * SIZE(BO), %xmm2 addpd %xmm2, %xmm10 movapd -2 * SIZE(AO), %xmm2 mulpd 14 * SIZE(BO), %xmm2 addpd %xmm2, %xmm11 movapd 8 * SIZE(AO), %xmm2 addq $16 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L102 ALIGN_3 .L105: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movapd ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L109 ALIGN_3 .L106: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movapd -14 * SIZE(AO), %xmm0 movapd 2 * SIZE(BO), %xmm1 addq $2 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L106 ALIGN_3 .L109: addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm10, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 pshufd $0x44, %xmm8, %xmm4 unpckhpd %xmm8, %xmm8 mulpd %xmm7, %xmm4 mulpd %xmm7, %xmm8 addpd %xmm4, %xmm0 addpd %xmm8, %xmm1 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm1, 2 * SIZE(CO1) movhpd %xmm1, 3 * SIZE(CO1) addq $4 * SIZE, CO1 ALIGN_3 .L110: testq $1, M je .L999 ALIGN_3 .L111: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif movsd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movsd 0 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 movsd -12 * SIZE(AO), %xmm2 pxor %xmm10, %xmm10 movsd 8 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L115 ALIGN_3 .L112: mulsd %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -15 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd 16 * SIZE(BO), %xmm1 mulsd 2 * SIZE(BO), %xmm0 addsd %xmm0, %xmm9 movsd -14 * SIZE(AO), %xmm0 mulsd 4 * SIZE(BO), %xmm0 addsd %xmm0, %xmm10 movsd -13 * SIZE(AO), %xmm0 mulsd 6 * SIZE(BO), %xmm0 addsd %xmm0, %xmm11 movsd -8 * SIZE(AO), %xmm0 mulsd %xmm2, %xmm3 movsd -11 * SIZE(AO), %xmm2 addsd %xmm3, %xmm8 movsd 24 * SIZE(BO), %xmm3 mulsd 10 * SIZE(BO), %xmm2 addsd %xmm2, %xmm9 movsd -10 * SIZE(AO), %xmm2 mulsd 12 * SIZE(BO), %xmm2 addsd %xmm2, %xmm10 movsd -9 * SIZE(AO), %xmm2 mulsd 14 * SIZE(BO), %xmm2 addsd %xmm2, %xmm11 movsd -4 * SIZE(AO), %xmm2 addq $ 8 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L112 ALIGN_3 .L115: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movapd ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L118 ALIGN_3 .L116: mulsd %xmm0, %xmm1 movsd -15 * SIZE(AO), %xmm0 addsd %xmm1, %xmm8 movsd 2 * SIZE(BO), %xmm1 addq $1 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L116 ALIGN_3 .L118: addsd %xmm10, %xmm8 addsd %xmm11, %xmm9 addsd %xmm9, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 pshufd $0x44, %xmm8, %xmm4 mulpd %xmm7, %xmm4 addpd %xmm4, %xmm0 movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) ALIGN_3 .L999: movq %rbx, %rsp 
EMMS movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm3m_kernel_4x4_sse3.S000066400000000000000000001603061313527062700220670ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %r13 #define BO %r14 #define CO1 %r15 #define CO2 %rbx #define BB %rbp #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define ALPHA_R 48(%rsp) #define ALPHA_I 56(%rsp) #define OFFSET 64(%rsp) #define KKK 72(%rsp) #define KK 80(%rsp) #else #define STACKSIZE 512 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define ALPHA_R 224(%rsp) #define ALPHA_I 232(%rsp) #define OFFSET 240(%rsp) #define KK 248(%rsp) #define KKK 256(%rsp) #endif #define PREFETCH prefetcht2 #define PREFETCHSIZE (16 * 12 + 3) #define KERNEL1(address) \ mulpd %xmm8, %xmm9 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ addpd %xmm9, %xmm0;\ movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm1;\ movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm2;\ movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ addpd %xmm9, %xmm3;\ movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL2(address) \ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm4;\ movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm5;\ movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm6;\ movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ addpd %xmm9, %xmm7;\ movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL3(address) \ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm0;\ movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm1;\ movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm2;\ movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ addpd %xmm9, %xmm3;\ movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL4(address) \ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm4;\ movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm5;\ movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ addpd %xmm9, %xmm6;\ movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ addpd %xmm9, %xmm7;\ movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL5(address) \ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm0;\ movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm1;\ movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm2;\ movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ addpd %xmm11, %xmm3;\ movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL6(address) \ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm4;\ movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm5;\ movddup 
10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm6;\ movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ addpd %xmm11, %xmm7;\ movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL7(address) \ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm0;\ movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm1;\ movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm2;\ movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ addpd %xmm11, %xmm3;\ movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL8(address) \ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm4;\ movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm5;\ movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ addpd %xmm11, %xmm6;\ movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ addpd %xmm11, %xmm7;\ movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL9(address) \ mulpd %xmm12, %xmm13;\ PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ addpd %xmm13, %xmm0;\ movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm1;\ movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm2;\ movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ addpd %xmm13, %xmm3;\ movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL10(address) \ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm4;\ movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm5;\ movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm6;\ movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ addpd %xmm13, %xmm7;\ movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL11(address) \ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm0;\ movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm1;\ movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm2;\ movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ addpd %xmm13, %xmm3;\ movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL12(address) \ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm4;\ movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm5;\ movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ addpd %xmm13, %xmm6;\ movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ addpd %xmm13, %xmm7;\ movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL13(address) \ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm0;\ movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm1;\ movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm2;\ movddup 27 * SIZE + (address) * 2 * 
SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ addpd %xmm15, %xmm3;\ movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL14(address) \ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm4;\ movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm5;\ movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm6;\ movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ addpd %xmm15, %xmm7;\ movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL15(address) \ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm0;\ movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm1;\ movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm2;\ movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ addpd %xmm15, %xmm3;\ movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL16(address) \ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm4;\ movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm5;\ movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ addpd %xmm15, %xmm6;\ movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ addpd %xmm15, %xmm7;\ movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #if defined(OS_LINUX) && defined(CORE_BARCELONA) .align 32768 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC movaps %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #else movq OLD_LDC, LDC #endif movsd %xmm0, ALPHA_R movsd %xmm1, ALPHA_I salq $ZBASE_SHIFT, LDC movq N, J sarq $2, J # j = (n >> 2) jle .L40 ALIGN_4 .L10: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a leaq (, K, 4), BB leaq (B, BB, SIZE), BB movq M, I sarq $2, I # i = (m >> 2) jle .L20 ALIGN_4 .L11: prefetcht0 0 * SIZE(BB) prefetcht0 8 * SIZE(BB) subq $-8 * SIZE, BB #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movapd 16 * SIZE(AO), %xmm12 pxor %xmm4, %xmm4 movddup 16 * SIZE(BO), %xmm13 pxor %xmm5, %xmm5 movapd 24 * SIZE(AO), %xmm14 pxor %xmm6, %xmm6 movddup 24 * SIZE(BO), %xmm15 pxor %xmm7, %xmm7 prefetchnta 7 * SIZE(CO1) prefetchnta 7 * SIZE(CO2) prefetchnta 7 * SIZE(CO1, LDC, 2) prefetchnta 7 * SIZE(CO2, LDC, 2) #ifndef 
TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif #if 1 andq $-8, %rax salq $4, %rax NOBRANCH je .L15 .L1X: KERNEL1 (16 * 0) KERNEL2 (16 * 0) KERNEL3 (16 * 0) KERNEL4 (16 * 0) KERNEL5 (16 * 0) KERNEL6 (16 * 0) KERNEL7 (16 * 0) KERNEL8 (16 * 0) KERNEL9 (16 * 0) KERNEL10(16 * 0) KERNEL11(16 * 0) KERNEL12(16 * 0) KERNEL13(16 * 0) KERNEL14(16 * 0) KERNEL15(16 * 0) KERNEL16(16 * 0) cmpq $128 * 1, %rax NOBRANCH jle .L12 KERNEL1 (16 * 1) KERNEL2 (16 * 1) KERNEL3 (16 * 1) KERNEL4 (16 * 1) KERNEL5 (16 * 1) KERNEL6 (16 * 1) KERNEL7 (16 * 1) KERNEL8 (16 * 1) KERNEL9 (16 * 1) KERNEL10(16 * 1) KERNEL11(16 * 1) KERNEL12(16 * 1) KERNEL13(16 * 1) KERNEL14(16 * 1) KERNEL15(16 * 1) KERNEL16(16 * 1) cmpq $128 * 2, %rax NOBRANCH jle .L12 KERNEL1 (16 * 2) KERNEL2 (16 * 2) KERNEL3 (16 * 2) KERNEL4 (16 * 2) KERNEL5 (16 * 2) KERNEL6 (16 * 2) KERNEL7 (16 * 2) KERNEL8 (16 * 2) KERNEL9 (16 * 2) KERNEL10(16 * 2) KERNEL11(16 * 2) KERNEL12(16 * 2) KERNEL13(16 * 2) KERNEL14(16 * 2) KERNEL15(16 * 2) KERNEL16(16 * 2) cmpq $128 * 3, %rax NOBRANCH jle .L12 KERNEL1 (16 * 3) KERNEL2 (16 * 3) KERNEL3 (16 * 3) KERNEL4 (16 * 3) KERNEL5 (16 * 3) KERNEL6 (16 * 3) KERNEL7 (16 * 3) KERNEL8 (16 * 3) KERNEL9 (16 * 3) KERNEL10(16 * 3) KERNEL11(16 * 3) KERNEL12(16 * 3) KERNEL13(16 * 3) KERNEL14(16 * 3) KERNEL15(16 * 3) KERNEL16(16 * 3) cmpq $128 * 4, %rax NOBRANCH jle .L12 KERNEL1 (16 * 4) KERNEL2 (16 * 4) KERNEL3 (16 * 4) KERNEL4 (16 * 4) KERNEL5 (16 * 4) KERNEL6 (16 * 4) KERNEL7 (16 * 4) KERNEL8 (16 * 4) KERNEL9 (16 * 4) KERNEL10(16 * 4) KERNEL11(16 * 4) KERNEL12(16 * 4) KERNEL13(16 * 4) KERNEL14(16 * 4) KERNEL15(16 * 4) KERNEL16(16 * 4) cmpq $128 * 5, %rax NOBRANCH jle .L12 KERNEL1 (16 * 5) KERNEL2 (16 * 5) KERNEL3 (16 * 5) KERNEL4 (16 * 5) KERNEL5 (16 * 5) KERNEL6 (16 * 5) KERNEL7 (16 * 5) KERNEL8 (16 * 5) KERNEL9 (16 * 5) KERNEL10(16 * 5) KERNEL11(16 * 5) KERNEL12(16 * 5) KERNEL13(16 * 5) KERNEL14(16 * 5) KERNEL15(16 * 5) KERNEL16(16 * 5) cmpq $128 * 6, %rax NOBRANCH jle .L12 KERNEL1 (16 * 6) KERNEL2 (16 * 6) KERNEL3 (16 * 6) KERNEL4 (16 * 6) KERNEL5 (16 * 6) KERNEL6 (16 * 6) KERNEL7 (16 * 6) KERNEL8 (16 * 6) KERNEL9 (16 * 6) KERNEL10(16 * 6) KERNEL11(16 * 6) KERNEL12(16 * 6) KERNEL13(16 * 6) KERNEL14(16 * 6) KERNEL15(16 * 6) KERNEL16(16 * 6) cmpq $128 * 7, %rax NOBRANCH jle .L12 KERNEL1 (16 * 7) KERNEL2 (16 * 7) KERNEL3 (16 * 7) KERNEL4 (16 * 7) KERNEL5 (16 * 7) KERNEL6 (16 * 7) KERNEL7 (16 * 7) KERNEL8 (16 * 7) KERNEL9 (16 * 7) KERNEL10(16 * 7) KERNEL11(16 * 7) KERNEL12(16 * 7) KERNEL13(16 * 7) KERNEL14(16 * 7) KERNEL15(16 * 7) KERNEL16(16 * 7) addq $32 * 8 * SIZE, AO addq $32 * 8 * SIZE, BO subq $128 * 8, %rax BRANCH jg .L1X .L12: leaq (AO, %rax, 2), AO # * 16 leaq (BO, %rax, 2), BO # * 64 #else sarq $3, %rax je .L15 ALIGN_4 .L12: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm5 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm6 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm7 movddup 4 * SIZE(BO), %xmm9 
mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm5 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm6 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 32 * SIZE(AO), %xmm8 addpd %xmm9, %xmm7 movddup 32 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 10 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 8 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm4 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm5 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm6 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 12 * SIZE(AO), %xmm10 addpd %xmm11, %xmm7 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm4 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm5 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm6 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 40 * SIZE(AO), %xmm10 addpd %xmm11, %xmm7 movddup 40 * SIZE(BO), %xmm11 mulpd %xmm12, %xmm13 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addpd %xmm13, %xmm0 movddup 17 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm1 movddup 18 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm2 movddup 19 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 movapd 18 * SIZE(AO), %xmm12 addpd %xmm13, %xmm3 movddup 16 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm4 movddup 17 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm5 movddup 18 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm6 movddup 19 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 movapd 20 * SIZE(AO), %xmm12 addpd %xmm13, %xmm7 movddup 20 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm0 movddup 21 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm1 movddup 22 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm2 movddup 23 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 movapd 22 * SIZE(AO), %xmm12 addpd %xmm13, %xmm3 movddup 20 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm4 movddup 21 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm5 movddup 22 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 addpd %xmm13, %xmm6 movddup 23 * SIZE(BO), %xmm13 mulpd %xmm12, %xmm13 movapd 48 * SIZE(AO), %xmm12 addpd %xmm13, %xmm7 movddup 48 * SIZE(BO), %xmm13 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm0 movddup 25 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm1 movddup 26 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm2 movddup 27 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 movapd 26 * SIZE(AO), %xmm14 addpd %xmm15, %xmm3 movddup 24 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm4 movddup 25 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm5 
movddup 26 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm6 movddup 27 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 movapd 28 * SIZE(AO), %xmm14 addpd %xmm15, %xmm7 movddup 28 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm0 movddup 29 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm1 movddup 30 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm2 movddup 31 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 movapd 30 * SIZE(AO), %xmm14 addpd %xmm15, %xmm3 movddup 28 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm4 movddup 29 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm5 movddup 30 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 addpd %xmm15, %xmm6 movddup 31 * SIZE(BO), %xmm15 mulpd %xmm14, %xmm15 movapd 56 * SIZE(AO), %xmm14 addpd %xmm15, %xmm7 movddup 56 * SIZE(BO), %xmm15 addq $32 * SIZE, BO addq $32 * SIZE, AO decq %rax BRANCH jne .L12 #endif ALIGN_4 .L15: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movsd ALPHA_R, %xmm15 movhpd ALPHA_I, %xmm15 andq $7, %rax # if (k & 1) BRANCH BRANCH je .L19 ALIGN_4 .L16: mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm10 addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 0 * SIZE(BO), %xmm11 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm4 movddup 1 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm5 movddup 2 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm6 movddup 3 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm7 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax BRANCH jg .L16 ALIGN_4 .L19: movsd 0 * SIZE(CO1), %xmm8 movhpd 1 * SIZE(CO1), %xmm8 movsd 2 * SIZE(CO1), %xmm9 movhpd 3 * SIZE(CO1), %xmm9 movsd 4 * SIZE(CO1), %xmm10 movhpd 5 * SIZE(CO1), %xmm10 movsd 6 * SIZE(CO1), %xmm11 movhpd 7 * SIZE(CO1), %xmm11 movddup %xmm0, %xmm12 unpckhpd %xmm0, %xmm0 movddup %xmm4, %xmm13 unpckhpd %xmm4, %xmm4 mulpd %xmm15, %xmm12 mulpd %xmm15, %xmm0 mulpd %xmm15, %xmm13 mulpd %xmm15, %xmm4 addpd %xmm12, %xmm8 addpd %xmm0, %xmm9 addpd %xmm13, %xmm10 addpd %xmm4, %xmm11 movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) movsd %xmm10, 4 * SIZE(CO1) movhpd %xmm10, 5 * SIZE(CO1) movsd %xmm11, 6 * SIZE(CO1) movhpd %xmm11, 7 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm8 movhpd 1 * SIZE(CO2), %xmm8 movsd 2 * SIZE(CO2), %xmm9 movhpd 3 * SIZE(CO2), %xmm9 movsd 4 * SIZE(CO2), %xmm10 movhpd 5 * SIZE(CO2), %xmm10 movsd 6 * SIZE(CO2), %xmm11 movhpd 7 * SIZE(CO2), %xmm11 movddup %xmm1, %xmm12 unpckhpd %xmm1, %xmm1 movddup %xmm5, %xmm13 unpckhpd %xmm5, %xmm5 mulpd %xmm15, %xmm12 mulpd %xmm15, %xmm1 mulpd %xmm15, %xmm13 mulpd %xmm15, %xmm5 addpd %xmm12, %xmm8 addpd %xmm1, %xmm9 addpd %xmm13, %xmm10 addpd %xmm5, %xmm11 movsd %xmm8, 0 * SIZE(CO2) movhpd %xmm8, 1 * SIZE(CO2) movsd %xmm9, 2 * SIZE(CO2) movhpd %xmm9, 3 * SIZE(CO2) movsd %xmm10, 4 * SIZE(CO2) movhpd %xmm10, 5 * SIZE(CO2) movsd %xmm11, 6 * SIZE(CO2) movhpd %xmm11, 7 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm8 movhpd 1 * SIZE(CO1, LDC, 2), %xmm8 movsd 2 * SIZE(CO1, LDC, 2), %xmm9 movhpd 3 * SIZE(CO1, LDC, 2), %xmm9 movsd 4 * SIZE(CO1, LDC, 2), %xmm10 movhpd 5 * SIZE(CO1, LDC, 2), %xmm10 movsd 6 * SIZE(CO1, LDC, 2), %xmm11 movhpd 7 * SIZE(CO1, LDC, 2), %xmm11 movddup %xmm2, %xmm12 unpckhpd %xmm2, %xmm2 movddup %xmm6, %xmm13 unpckhpd %xmm6, 
%xmm6 mulpd %xmm15, %xmm12 mulpd %xmm15, %xmm2 mulpd %xmm15, %xmm13 mulpd %xmm15, %xmm6 addpd %xmm12, %xmm8 addpd %xmm2, %xmm9 addpd %xmm13, %xmm10 addpd %xmm6, %xmm11 movsd %xmm8, 0 * SIZE(CO1, LDC, 2) movhpd %xmm8, 1 * SIZE(CO1, LDC, 2) movsd %xmm9, 2 * SIZE(CO1, LDC, 2) movhpd %xmm9, 3 * SIZE(CO1, LDC, 2) movsd %xmm10, 4 * SIZE(CO1, LDC, 2) movhpd %xmm10, 5 * SIZE(CO1, LDC, 2) movsd %xmm11, 6 * SIZE(CO1, LDC, 2) movhpd %xmm11, 7 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm8 movhpd 1 * SIZE(CO2, LDC, 2), %xmm8 movsd 2 * SIZE(CO2, LDC, 2), %xmm9 movhpd 3 * SIZE(CO2, LDC, 2), %xmm9 movsd 4 * SIZE(CO2, LDC, 2), %xmm10 movhpd 5 * SIZE(CO2, LDC, 2), %xmm10 movsd 6 * SIZE(CO2, LDC, 2), %xmm11 movhpd 7 * SIZE(CO2, LDC, 2), %xmm11 movddup %xmm3, %xmm12 unpckhpd %xmm3, %xmm3 movddup %xmm7, %xmm13 unpckhpd %xmm7, %xmm7 mulpd %xmm15, %xmm12 mulpd %xmm15, %xmm3 mulpd %xmm15, %xmm13 mulpd %xmm15, %xmm7 addpd %xmm12, %xmm8 addpd %xmm3, %xmm9 addpd %xmm13, %xmm10 addpd %xmm7, %xmm11 movsd %xmm8, 0 * SIZE(CO2, LDC, 2) movhpd %xmm8, 1 * SIZE(CO2, LDC, 2) movsd %xmm9, 2 * SIZE(CO2, LDC, 2) movhpd %xmm9, 3 * SIZE(CO2, LDC, 2) movsd %xmm10, 4 * SIZE(CO2, LDC, 2) movhpd %xmm10, 5 * SIZE(CO2, LDC, 2) movsd %xmm11, 6 * SIZE(CO2, LDC, 2) movhpd %xmm11, 7 * SIZE(CO2, LDC, 2) addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 jmp .L20 ALIGN_4 .L20: testq $2, M BRANCH je .L30 ALIGN_4 .L21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 6 * SIZE(AO), %xmm8 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 16 * SIZE(AO), %xmm8 addpd %xmm11, %xmm3 movddup 24 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movddup 17 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm1 movddup 18 * SIZE(BO), %xmm9 mulpd 
%xmm10, %xmm9 addpd %xmm9, %xmm2 movddup 19 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 10 * SIZE(AO), %xmm10 addpd %xmm9, %xmm3 movddup 20 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movddup 21 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm1 movddup 22 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm2 movddup 23 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 12 * SIZE(AO), %xmm10 addpd %xmm9, %xmm3 movddup 32 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 25 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movddup 26 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 27 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 28 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 29 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm1 movddup 30 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 31 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 24 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 40 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movsd ALPHA_R, %xmm15 movhpd ALPHA_I, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L29 ALIGN_4 .L26: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L26 ALIGN_4 .L29: movsd 0 * SIZE(CO1), %xmm8 movhpd 1 * SIZE(CO1), %xmm8 movsd 2 * SIZE(CO1), %xmm9 movhpd 3 * SIZE(CO1), %xmm9 movddup %xmm0, %xmm12 unpckhpd %xmm0, %xmm0 mulpd %xmm15, %xmm12 mulpd %xmm15, %xmm0 addpd %xmm12, %xmm8 addpd %xmm0, %xmm9 movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm8 movhpd 1 * SIZE(CO2), %xmm8 movsd 2 * SIZE(CO2), %xmm9 movhpd 3 * SIZE(CO2), %xmm9 movddup %xmm1, %xmm12 unpckhpd %xmm1, %xmm1 mulpd %xmm15, %xmm12 mulpd %xmm15, %xmm1 addpd %xmm12, %xmm8 addpd %xmm1, %xmm9 movsd %xmm8, 0 * SIZE(CO2) movhpd %xmm8, 1 * SIZE(CO2) movsd %xmm9, 2 * SIZE(CO2) movhpd %xmm9, 3 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm8 movhpd 1 * SIZE(CO1, LDC, 2), %xmm8 movsd 2 * SIZE(CO1, LDC, 2), %xmm9 movhpd 3 * SIZE(CO1, LDC, 2), %xmm9 movddup %xmm2, %xmm12 unpckhpd %xmm2, %xmm2 mulpd %xmm15, %xmm12 mulpd %xmm15, %xmm2 addpd %xmm12, %xmm8 addpd %xmm2, %xmm9 movsd %xmm8, 0 * SIZE(CO1, LDC, 2) movhpd %xmm8, 1 * SIZE(CO1, LDC, 2) movsd %xmm9, 2 * SIZE(CO1, LDC, 2) movhpd %xmm9, 3 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm8 movhpd 1 * SIZE(CO2, LDC, 2), %xmm8 movsd 2 * SIZE(CO2, LDC, 2), %xmm9 movhpd 3 * SIZE(CO2, LDC, 2), %xmm9 movddup %xmm3, %xmm12 unpckhpd %xmm3, %xmm3 mulpd %xmm15, %xmm12 mulpd %xmm15, %xmm3 addpd %xmm12, %xmm8 addpd %xmm3, %xmm9 movsd %xmm8, 0 * SIZE(CO2, LDC, 2) movhpd %xmm8, 1 * SIZE(CO2, LDC, 2) movsd %xmm9, 2 * SIZE(CO2, LDC, 2) movhpd %xmm9, 3 * SIZE(CO2, LDC, 2) addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L30: testq $1, M je .L39 ALIGN_4 .L31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, 
%rax, 1), AO leaq (B, %rax, 4), BO #endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movddup 4 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L35 ALIGN_4 .L32: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 1 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movapd 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movapd 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movddup 3 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movapd 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movapd 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movddup 8 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movapd 24 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movapd 18 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movddup 5 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movapd 20 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movapd 22 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movddup 6 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movapd 32 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movapd 26 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movddup 7 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movapd 28 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movapd 30 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movddup 12 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movapd 40 * SIZE(BO), %xmm11 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movsd ALPHA_R, %xmm15 movhpd ALPHA_I, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 1 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L36 ALIGN_4 .L38: movsd 0 * SIZE(CO1), %xmm8 movhpd 1 * SIZE(CO1), %xmm8 movddup %xmm0, %xmm12 mulpd %xmm15, %xmm12 addpd %xmm12, %xmm8 movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm8 movhpd 1 * SIZE(CO2), %xmm8 unpckhpd %xmm0, %xmm0 mulpd %xmm15, %xmm0 addpd %xmm0, %xmm8 movsd %xmm8, 0 * SIZE(CO2) movhpd %xmm8, 1 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm8 movhpd 1 * SIZE(CO1, LDC, 2), %xmm8 movddup %xmm1, %xmm12 mulpd %xmm15, %xmm12 addpd %xmm12, %xmm8 movsd %xmm8, 0 * SIZE(CO1, LDC, 2) movhpd %xmm8, 1 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm8 movhpd 1 * SIZE(CO2, LDC, 2), %xmm8 unpckhpd %xmm1, %xmm1 mulpd %xmm15, %xmm1 addpd %xmm1, %xmm8 movsd %xmm8, 0 * SIZE(CO2, LDC, 2) movhpd %xmm8, 1 * SIZE(CO2, LDC, 2) ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leaq (C, LDC, 4), C # c += 4 * ldc movq BO, B decq J # j -- jg .L10 ALIGN_4 .L40: testq $2, N je .L80 ALIGN_4 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a movq M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) 
|| \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm4, %xmm4 movddup 8 * SIZE(BO), %xmm11 pxor %xmm5, %xmm5 #ifdef HAVE_3DNOW prefetchw 4 * SIZE(CO1) prefetchw 4 * SIZE(CO2) #else prefetchnta 4 * SIZE(CO1) prefetchnta 4 * SIZE(CO2) #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L55 ALIGN_4 .L52: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm5 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 16 * SIZE(AO), %xmm8 addpd %xmm9, %xmm5 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 10 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm4 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 12 * SIZE(AO), %xmm10 addpd %xmm9, %xmm5 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm0 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 14 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm4 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 40 * SIZE(AO), %xmm10 addpd %xmm9, %xmm5 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addpd %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 18 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 8 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm4 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 20 * SIZE(AO), %xmm8 addpd %xmm11, %xmm5 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 22 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm4 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 24 * SIZE(AO), %xmm8 addpd %xmm11, %xmm5 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 26 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm4 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 28 * SIZE(AO), %xmm8 addpd %xmm11, %xmm5 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm0 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 30 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 addpd %xmm11, %xmm4 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 32 * SIZE(AO), %xmm8 addpd %xmm11, %xmm5 movddup 24 * SIZE(BO), %xmm11 addq 
$32 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movsd ALPHA_R, %xmm15 movhpd ALPHA_I, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L59 ALIGN_4 .L56: mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm10 addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 0 * SIZE(BO), %xmm11 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 movapd 4 * SIZE(AO), %xmm8 addpd %xmm11, %xmm4 movddup 1 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm5 addq $4 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L56 ALIGN_4 .L59: movsd 0 * SIZE(CO1), %xmm8 movhpd 1 * SIZE(CO1), %xmm8 movsd 2 * SIZE(CO1), %xmm9 movhpd 3 * SIZE(CO1), %xmm9 movsd 4 * SIZE(CO1), %xmm10 movhpd 5 * SIZE(CO1), %xmm10 movsd 6 * SIZE(CO1), %xmm11 movhpd 7 * SIZE(CO1), %xmm11 movddup %xmm0, %xmm12 unpckhpd %xmm0, %xmm0 movddup %xmm4, %xmm13 unpckhpd %xmm4, %xmm4 mulpd %xmm15, %xmm12 mulpd %xmm15, %xmm0 mulpd %xmm15, %xmm13 mulpd %xmm15, %xmm4 addpd %xmm12, %xmm8 addpd %xmm0, %xmm9 addpd %xmm13, %xmm10 addpd %xmm4, %xmm11 movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) movsd %xmm10, 4 * SIZE(CO1) movhpd %xmm10, 5 * SIZE(CO1) movsd %xmm11, 6 * SIZE(CO1) movhpd %xmm11, 7 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm8 movhpd 1 * SIZE(CO2), %xmm8 movsd 2 * SIZE(CO2), %xmm9 movhpd 3 * SIZE(CO2), %xmm9 movsd 4 * SIZE(CO2), %xmm10 movhpd 5 * SIZE(CO2), %xmm10 movsd 6 * SIZE(CO2), %xmm11 movhpd 7 * SIZE(CO2), %xmm11 movddup %xmm1, %xmm12 unpckhpd %xmm1, %xmm1 movddup %xmm5, %xmm13 unpckhpd %xmm5, %xmm5 mulpd %xmm15, %xmm12 mulpd %xmm15, %xmm1 mulpd %xmm15, %xmm13 mulpd %xmm15, %xmm5 addpd %xmm12, %xmm8 addpd %xmm1, %xmm9 addpd %xmm13, %xmm10 addpd %xmm5, %xmm11 movsd %xmm8, 0 * SIZE(CO2) movhpd %xmm8, 1 * SIZE(CO2) movsd %xmm9, 2 * SIZE(CO2) movhpd %xmm9, 3 * SIZE(CO2) movsd %xmm10, 4 * SIZE(CO2) movhpd %xmm10, 5 * SIZE(CO2) movsd %xmm11, 6 * SIZE(CO2) movhpd %xmm11, 7 * SIZE(CO2) addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L51 ALIGN_4 .L60: testq $2, M je .L70 ALIGN_4 .L61: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 16 * SIZE(AO), %xmm8 
addpd %xmm9, %xmm3 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 10 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 12 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 addpd %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 24 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 24 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movsd ALPHA_R, %xmm15 movhpd ALPHA_I, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L69 ALIGN_4 .L66: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L69: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 movsd 0 * SIZE(CO1), %xmm8 movhpd 1 * SIZE(CO1), %xmm8 movsd 2 * SIZE(CO1), %xmm9 movhpd 3 * SIZE(CO1), %xmm9 movddup %xmm0, %xmm12 unpckhpd %xmm0, %xmm0 mulpd %xmm15, %xmm12 mulpd %xmm15, %xmm0 addpd %xmm12, %xmm8 addpd %xmm0, %xmm9 movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm8 movhpd 1 * SIZE(CO2), %xmm8 movsd 2 * SIZE(CO2), %xmm9 movhpd 3 * SIZE(CO2), %xmm9 movddup %xmm1, %xmm12 unpckhpd %xmm1, %xmm1 mulpd %xmm15, %xmm12 mulpd %xmm15, %xmm1 addpd %xmm12, %xmm8 addpd %xmm1, %xmm9 movsd %xmm8, 0 * SIZE(CO2) movhpd %xmm8, 1 * SIZE(CO2) movsd %xmm9, 2 * SIZE(CO2) movhpd %xmm9, 3 * SIZE(CO2) addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 ALIGN_4 .L70: testq $1, M je .L79 ALIGN_4 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movddup 4 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movapd 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movddup 1 * SIZE(AO), %xmm8 addpd %xmm9, %xmm0 mulpd 2 * SIZE(BO), %xmm8 movapd 16 * SIZE(BO), %xmm9 addpd %xmm8, %xmm1 movddup 2 * SIZE(AO), %xmm8 mulpd 4 * SIZE(BO), %xmm8 addpd %xmm8, %xmm2 movddup 3 * SIZE(AO), %xmm8 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm8, %xmm3 movddup 8 * SIZE(AO), %xmm8 mulpd %xmm10, %xmm11 movddup 5 * SIZE(AO), %xmm10 addpd %xmm11, %xmm0 mulpd 10 * SIZE(BO), %xmm10 movapd 24 * SIZE(BO), %xmm11 addpd %xmm10, %xmm1 movddup 6 * SIZE(AO), %xmm10 mulpd 12 * SIZE(BO), %xmm10 addpd %xmm10, %xmm2 movddup 7 * SIZE(AO), %xmm10 mulpd 14 * SIZE(BO), %xmm10 addpd %xmm10, %xmm3 movddup 12 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movq K, 
%rax #else movq KKK, %rax #endif movsd ALPHA_R, %xmm15 movhpd ALPHA_I, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulpd %xmm8, %xmm9 movddup 1 * SIZE(AO), %xmm8 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L76 ALIGN_4 .L78: addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm2, %xmm0 movsd 0 * SIZE(CO1), %xmm8 movhpd 1 * SIZE(CO1), %xmm8 movddup %xmm0, %xmm12 mulpd %xmm15, %xmm12 addpd %xmm12, %xmm8 movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm8 movhpd 1 * SIZE(CO2), %xmm8 unpckhpd %xmm0, %xmm0 mulpd %xmm15, %xmm0 addpd %xmm0, %xmm8 movsd %xmm8, 0 * SIZE(CO2) movhpd %xmm8, 1 * SIZE(CO2) ALIGN_4 .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leaq (C, LDC, 2), C movq BO, B ALIGN_4 .L80: testq $1, N je .L999 ALIGN_4 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 movq A, AO movq M, I sarq $2, I # i = (m >> 2) jle .L100 ALIGN_4 .L91: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 4 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #ifdef HAVE_3DNOW prefetchw 4 * SIZE(CO1) #else prefetchnta 4 * SIZE(CO1) #endif #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L95 ALIGN_4 .L92: mulpd %xmm9, %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd 2 * SIZE(AO), %xmm9 addpd %xmm8, %xmm0 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm8 mulpd 6 * SIZE(AO), %xmm9 addpd %xmm8, %xmm2 movapd 16 * SIZE(AO), %xmm8 addpd %xmm9, %xmm3 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm10 mulpd 10 * SIZE(AO), %xmm9 addpd %xmm10, %xmm0 movapd 12 * SIZE(AO), %xmm10 addpd %xmm9, %xmm1 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm9, %xmm10 mulpd 14 * SIZE(AO), %xmm9 addpd %xmm10, %xmm2 movapd 24 * SIZE(AO), %xmm10 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addpd %xmm9, %xmm3 movddup 8 * SIZE(BO), %xmm9 mulpd %xmm11, %xmm8 mulpd 18 * SIZE(AO), %xmm11 addpd %xmm8, %xmm0 movapd 20 * SIZE(AO), %xmm8 addpd %xmm11, %xmm1 movddup 5 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm8 mulpd 22 * SIZE(AO), %xmm11 addpd %xmm8, %xmm2 movapd 32 * SIZE(AO), %xmm8 addpd %xmm11, %xmm3 movddup 6 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm10 mulpd 26 * SIZE(AO), %xmm11 addpd %xmm10, %xmm0 movapd 28 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movddup 7 * SIZE(BO), %xmm11 mulpd %xmm11, %xmm10 mulpd 30 * SIZE(AO), %xmm11 addpd %xmm10, %xmm2 movapd 40 * SIZE(AO), %xmm10 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 addq $32 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L92 ALIGN_4 .L95: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movsd ALPHA_R, %xmm15 movhpd ALPHA_I, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L99 ALIGN_4 .L96: mulpd %xmm9, %xmm8 mulpd 2 * SIZE(AO), %xmm9 addpd %xmm8, %xmm0 movapd 4 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 1 * SIZE(BO), %xmm9 addq $4 * SIZE, AO # aoffset += 4 addq $1 * SIZE, BO # boffset1 += 8 decq %rax jg .L96 ALIGN_4 .L99: 
addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 movsd 0 * SIZE(CO1), %xmm8 movhpd 1 * SIZE(CO1), %xmm8 movsd 2 * SIZE(CO1), %xmm9 movhpd 3 * SIZE(CO1), %xmm9 movsd 4 * SIZE(CO1), %xmm10 movhpd 5 * SIZE(CO1), %xmm10 movsd 6 * SIZE(CO1), %xmm11 movhpd 7 * SIZE(CO1), %xmm11 movddup %xmm0, %xmm12 unpckhpd %xmm0, %xmm0 movddup %xmm1, %xmm13 unpckhpd %xmm1, %xmm1 mulpd %xmm15, %xmm12 mulpd %xmm15, %xmm0 mulpd %xmm15, %xmm13 mulpd %xmm15, %xmm1 addpd %xmm12, %xmm8 addpd %xmm0, %xmm9 addpd %xmm13, %xmm10 addpd %xmm1, %xmm11 movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) movsd %xmm10, 4 * SIZE(CO1) movhpd %xmm10, 5 * SIZE(CO1) movsd %xmm11, 6 * SIZE(CO1) movhpd %xmm11, 7 * SIZE(CO1) addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L91 ALIGN_4 .L100: testq $2, M je .L110 ALIGN_4 .L101: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 4 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L105 ALIGN_4 .L102: mulpd %xmm9, %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movddup 1 * SIZE(BO), %xmm9 addpd %xmm8, %xmm0 mulpd 2 * SIZE(AO), %xmm9 movapd 16 * SIZE(AO), %xmm8 addpd %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd 4 * SIZE(AO), %xmm9 addpd %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd 6 * SIZE(AO), %xmm9 addpd %xmm9, %xmm3 movddup 8 * SIZE(BO), %xmm9 mulpd %xmm11, %xmm10 movddup 5 * SIZE(BO), %xmm11 addpd %xmm10, %xmm0 mulpd 10 * SIZE(AO), %xmm11 movapd 24 * SIZE(AO), %xmm10 addpd %xmm11, %xmm1 movddup 6 * SIZE(BO), %xmm11 mulpd 12 * SIZE(AO), %xmm11 addpd %xmm11, %xmm2 movddup 7 * SIZE(BO), %xmm11 mulpd 14 * SIZE(AO), %xmm11 addpd %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $ 8 * SIZE, BO decq %rax jne .L102 ALIGN_4 .L105: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movsd ALPHA_R, %xmm15 movhpd ALPHA_I, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L109 ALIGN_4 .L106: mulpd %xmm9, %xmm8 movddup 1 * SIZE(BO), %xmm9 addpd %xmm8, %xmm0 movapd 2 * SIZE(AO), %xmm8 addq $2 * SIZE, AO # aoffset += 4 addq $1 * SIZE, BO # boffset1 += 8 decq %rax jg .L106 ALIGN_4 .L109: addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm2, %xmm0 movsd 0 * SIZE(CO1), %xmm8 movhpd 1 * SIZE(CO1), %xmm8 movsd 2 * SIZE(CO1), %xmm9 movhpd 3 * SIZE(CO1), %xmm9 movddup %xmm0, %xmm12 unpckhpd %xmm0, %xmm0 mulpd %xmm15, %xmm12 mulpd %xmm15, %xmm0 addpd %xmm12, %xmm8 addpd %xmm0, %xmm9 movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) addq $4 * SIZE, CO1 ALIGN_4 .L110: testq $1, M je .L999 ALIGN_4 .L111: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movsd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movsd 4 * SIZE(AO), %xmm10 pxor 
%xmm2, %xmm2 movsd 4 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movapd 0 * SIZE(AO), %xmm9 movapd 0 * SIZE(BO), %xmm8 movapd 4 * SIZE(AO), %xmm11 movapd 4 * SIZE(BO), %xmm10 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L115 ALIGN_4 .L112: mulpd %xmm9, %xmm8 movapd 2 * SIZE(AO), %xmm9 addpd %xmm8, %xmm0 mulpd 2 * SIZE(BO), %xmm9 movapd 8 * SIZE(BO), %xmm8 addpd %xmm9, %xmm1 movapd 8 * SIZE(AO), %xmm9 mulpd %xmm11, %xmm10 movapd 6 * SIZE(AO), %xmm11 addpd %xmm10, %xmm0 mulpd 6 * SIZE(BO), %xmm11 movapd 12 * SIZE(BO), %xmm10 addpd %xmm11, %xmm1 movapd 12 * SIZE(AO), %xmm11 addq $8 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L112 ALIGN_4 .L115: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movsd ALPHA_R, %xmm15 movhpd ALPHA_I, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulsd 0 * SIZE(BO), %xmm9 addsd %xmm9, %xmm0 movsd 1 * SIZE(AO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $1 * SIZE, BO # boffset1 += 8 decq %rax jg .L116 ALIGN_4 .L118: addpd %xmm1, %xmm0 haddpd %xmm0, %xmm0 movsd 0 * SIZE(CO1), %xmm8 movhpd 1 * SIZE(CO1), %xmm8 movddup %xmm0, %xmm12 mulpd %xmm15, %xmm12 addpd %xmm12, %xmm8 movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm3m_kernel_4x8_nehalem.S000066400000000000000000001311711313527062700226250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define BB %r12 #define PREA %rdx #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define ALPHA_R 48(%rsp) #define ALPHA_I 56(%rsp) #define J 64(%rsp) #define OFFSET 72(%rsp) #define KK 80(%rsp) #define KKK 88(%rsp) #else #define STACKSIZE 512 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define ALPHA_R 224(%rsp) #define ALPHA_I 232(%rsp) #define J 240(%rsp) #define OFFSET 248(%rsp) #define KK 256(%rsp) #define KKK 264(%rsp) #endif #define PREFETCHSIZE (16 * 1 - 8) #define PREFETCH prefetcht0 #if defined(OS_LINUX) && defined(CORE_BARCELONA) .align 32768 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif movaps %xmm3, %xmm0 movss OLD_ALPHA_I, %xmm1 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif #endif unpcklps %xmm1, %xmm0 movlps %xmm0, ALPHA_R movlps %xmm0, ALPHA_I subq $-32 * SIZE, A subq $-32 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K salq $ZBASE_SHIFT, LDC movq N, J sarq $3, J NOBRANCH jle .L40 ALIGN_4 .L10: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC, 4), CO2 movq A, AO movq K, %rax salq $BASE_SHIFT + 3, %rax leaq (B, %rax), BB movq M, I sarq $2, I NOBRANCH jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif prefetcht0 -32 * SIZE(BB) subq $-16 * SIZE, BB xorps %xmm1, %xmm1 
movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 leaq (LDC, LDC, 2), %rax xorps %xmm8, %xmm8 prefetcht0 3 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 7 * SIZE(CO1, LDC, 1) xorps %xmm10, %xmm10 prefetcht0 3 * SIZE(CO1, LDC, 2) xorps %xmm11, %xmm11 prefetcht0 7 * SIZE(CO1, %rax, 1) xorps %xmm12, %xmm12 prefetcht0 3 * SIZE(CO2) xorps %xmm13, %xmm13 prefetcht0 7 * SIZE(CO2, LDC, 1) xorps %xmm14, %xmm14 prefetcht0 3 * SIZE(CO2, LDC, 2) xorps %xmm15, %xmm15 prefetcht0 7 * SIZE(CO2, %rax, 1) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $8, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm12 movaps -32 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm0, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 movaps -28 * SIZE(AO), %xmm7 mulps %xmm0, %xmm4 addps %xmm1, %xmm12 movaps -24 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm7, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm7, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm7, %xmm5 mulps %xmm7, %xmm6 addps %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm7, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm7, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 movaps -24 * SIZE(AO), %xmm0 mulps %xmm7, %xmm3 mulps %xmm7, %xmm4 addps %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm0, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 addps %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm7 addps %xmm1, %xmm12 movaps -8 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm7, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm7, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm7, %xmm5 mulps %xmm7, %xmm6 addps %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 subq $-32 * SIZE, BO pshufd $0x39, %xmm1, %xmm2 mulps %xmm7, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm7, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm7, %xmm3 movaps -16 * SIZE(AO), %xmm0 mulps %xmm7, %xmm4 subq $-16 * SIZE, AO decq %rax BRANCH jg .L12 ALIGN_3 .L15: movups ALPHA_R, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addps %xmm1, %xmm12 movaps -32 * SIZE(BO), %xmm1 addps %xmm2, %xmm13 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm5 mulps %xmm0, %xmm2 addps %xmm3, %xmm14 addps %xmm4, %xmm15 pshufd $0x39, %xmm5, %xmm6 mulps %xmm0, 
%xmm5 mulps %xmm0, %xmm6 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm5, %xmm10 addps %xmm6, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: addps %xmm1, %xmm12 addps %xmm2, %xmm13 addps %xmm3, %xmm14 addps %xmm4, %xmm15 movaps %xmm9, %xmm4 shufps $0xd8, %xmm8, %xmm9 shufps $0xd8, %xmm11, %xmm8 shufps $0xd8, %xmm10, %xmm11 shufps $0xd8, %xmm4, %xmm10 movaps %xmm8, %xmm4 shufps $0xd8, %xmm10, %xmm8 shufps $0xd8, %xmm4, %xmm10 movaps %xmm9, %xmm5 shufps $0xd8, %xmm11, %xmm9 shufps $0xd8, %xmm5, %xmm11 movaps %xmm13, %xmm4 shufps $0xd8, %xmm12, %xmm13 shufps $0xd8, %xmm15, %xmm12 shufps $0xd8, %xmm14, %xmm15 shufps $0xd8, %xmm4, %xmm14 movaps %xmm12, %xmm4 shufps $0xd8, %xmm14, %xmm12 shufps $0xd8, %xmm4, %xmm14 movaps %xmm13, %xmm5 shufps $0xd8, %xmm15, %xmm13 shufps $0xd8, %xmm5, %xmm15 leaq (LDC, LDC, 2), %rax movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 movsd 0 * SIZE(CO1, LDC), %xmm2 movhps 2 * SIZE(CO1, LDC), %xmm2 movsd 4 * SIZE(CO1, LDC), %xmm3 movhps 6 * SIZE(CO1, LDC), %xmm3 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 pshufd $0x50, %xmm9, %xmm5 pshufd $0xfa, %xmm9, %xmm9 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 mulps %xmm7, %xmm5 mulps %xmm7, %xmm9 addps %xmm4, %xmm0 addps %xmm8, %xmm1 addps %xmm5, %xmm2 addps %xmm9, %xmm3 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 4 * SIZE(CO1) movhps %xmm1, 6 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC) movhps %xmm2, 2 * SIZE(CO1, LDC) movlps %xmm3, 4 * SIZE(CO1, LDC) movhps %xmm3, 6 * SIZE(CO1, LDC) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhps 2 * SIZE(CO1, LDC, 2), %xmm0 movsd 4 * SIZE(CO1, LDC, 2), %xmm1 movhps 6 * SIZE(CO1, LDC, 2), %xmm1 movsd 0 * SIZE(CO1, %rax), %xmm2 movhps 2 * SIZE(CO1, %rax), %xmm2 movsd 4 * SIZE(CO1, %rax), %xmm3 movhps 6 * SIZE(CO1, %rax), %xmm3 pshufd $0x50, %xmm10, %xmm4 pshufd $0xfa, %xmm10, %xmm10 pshufd $0x50, %xmm11, %xmm5 pshufd $0xfa, %xmm11, %xmm11 mulps %xmm7, %xmm4 mulps %xmm7, %xmm10 mulps %xmm7, %xmm5 mulps %xmm7, %xmm11 addps %xmm4, %xmm0 addps %xmm10, %xmm1 addps %xmm5, %xmm2 addps %xmm11, %xmm3 movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movhps %xmm0, 2 * SIZE(CO1, LDC, 2) movlps %xmm1, 4 * SIZE(CO1, LDC, 2) movhps %xmm1, 6 * SIZE(CO1, LDC, 2) movlps %xmm2, 0 * SIZE(CO1, %rax) movhps %xmm2, 2 * SIZE(CO1, %rax) movlps %xmm3, 4 * SIZE(CO1, %rax) movhps %xmm3, 6 * SIZE(CO1, %rax) movsd 0 * SIZE(CO2), %xmm0 movhps 2 * SIZE(CO2), %xmm0 movsd 4 * SIZE(CO2), %xmm1 movhps 6 * SIZE(CO2), %xmm1 movsd 0 * SIZE(CO2, LDC), %xmm2 movhps 2 * SIZE(CO2, LDC), %xmm2 movsd 4 * SIZE(CO2, LDC), %xmm3 movhps 6 * SIZE(CO2, LDC), %xmm3 pshufd $0x50, %xmm12, %xmm4 pshufd $0xfa, %xmm12, %xmm12 pshufd $0x50, %xmm13, %xmm5 pshufd $0xfa, %xmm13, %xmm13 mulps %xmm7, %xmm4 mulps %xmm7, %xmm12 mulps %xmm7, %xmm5 mulps %xmm7, %xmm13 addps %xmm4, %xmm0 addps %xmm12, %xmm1 addps %xmm5, %xmm2 addps %xmm13, %xmm3 movlps %xmm0, 0 * SIZE(CO2) movhps %xmm0, 2 * SIZE(CO2) movlps %xmm1, 4 * SIZE(CO2) movhps %xmm1, 6 * SIZE(CO2) movlps %xmm2, 0 * SIZE(CO2, LDC) movhps %xmm2, 2 * SIZE(CO2, LDC) movlps %xmm3, 4 * SIZE(CO2, LDC) movhps %xmm3, 6 * SIZE(CO2, LDC) movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhps 2 * SIZE(CO2, LDC, 2), %xmm0 movsd 4 * SIZE(CO2, LDC, 2), %xmm1 movhps 6 * SIZE(CO2, LDC, 2), %xmm1 movsd 0 
* SIZE(CO2, %rax), %xmm2 movhps 2 * SIZE(CO2, %rax), %xmm2 movsd 4 * SIZE(CO2, %rax), %xmm3 movhps 6 * SIZE(CO2, %rax), %xmm3 pshufd $0x50, %xmm14, %xmm4 pshufd $0xfa, %xmm14, %xmm14 pshufd $0x50, %xmm15, %xmm5 pshufd $0xfa, %xmm15, %xmm15 mulps %xmm7, %xmm4 mulps %xmm7, %xmm14 mulps %xmm7, %xmm5 mulps %xmm7, %xmm15 addps %xmm4, %xmm0 addps %xmm14, %xmm1 addps %xmm5, %xmm2 addps %xmm15, %xmm3 movlps %xmm0, 0 * SIZE(CO2, LDC, 2) movhps %xmm0, 2 * SIZE(CO2, LDC, 2) movlps %xmm1, 4 * SIZE(CO2, LDC, 2) movhps %xmm1, 6 * SIZE(CO2, LDC, 2) movlps %xmm2, 0 * SIZE(CO2, %rax) movhps %xmm2, 2 * SIZE(CO2, %rax) movlps %xmm3, 4 * SIZE(CO2, %rax) movhps %xmm3, 6 * SIZE(CO2, %rax) addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 decq I BRANCH jg .L11 ALIGN_4 .L20: testq $2, M BRANCH jle .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movaps -32 * SIZE(BO), %xmm5 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $8, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_3 .L22: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -28 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps -24 * SIZE(BO), %xmm5 movddup -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -20 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps -16 * SIZE(BO), %xmm5 movddup -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -12 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps -8 * SIZE(BO), %xmm5 movddup -26 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -4 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps 0 * SIZE(BO), %xmm5 movddup -24 * SIZE(AO), %xmm0 subq $-32 * SIZE, BO subq $ -8 * SIZE, AO subq $1, %rax BRANCH jg .L22 ALIGN_3 .L25: movups ALPHA_R, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 mulps %xmm0, %xmm2 movaps -28 * SIZE(BO), %xmm5 addps %xmm3, %xmm10 pshufd $0x50, %xmm5, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm5, %xmm4 mulps %xmm0, %xmm4 movaps -24 * SIZE(BO), %xmm5 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO 
addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_3 .L28: addps %xmm1, %xmm8 addps %xmm2, %xmm9 addps %xmm3, %xmm10 addps %xmm4, %xmm11 leaq (LDC, LDC, 2), %rax movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO1, LDC), %xmm1 movhps 2 * SIZE(CO1, LDC), %xmm1 movsd 0 * SIZE(CO1, LDC, 2), %xmm2 movhps 2 * SIZE(CO1, LDC, 2), %xmm2 movsd 0 * SIZE(CO1, %rax), %xmm3 movhps 2 * SIZE(CO1, %rax), %xmm3 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 pshufd $0x50, %xmm9, %xmm5 pshufd $0xfa, %xmm9, %xmm9 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 mulps %xmm7, %xmm5 mulps %xmm7, %xmm9 addps %xmm4, %xmm0 addps %xmm8, %xmm1 addps %xmm5, %xmm2 addps %xmm9, %xmm3 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC) movhps %xmm1, 2 * SIZE(CO1, LDC) movlps %xmm2, 0 * SIZE(CO1, LDC, 2) movhps %xmm2, 2 * SIZE(CO1, LDC, 2) movlps %xmm3, 0 * SIZE(CO1, %rax) movhps %xmm3, 2 * SIZE(CO1, %rax) movsd 0 * SIZE(CO2), %xmm0 movhps 2 * SIZE(CO2), %xmm0 movsd 0 * SIZE(CO2, LDC), %xmm1 movhps 2 * SIZE(CO2, LDC), %xmm1 movsd 0 * SIZE(CO2, LDC, 2), %xmm2 movhps 2 * SIZE(CO2, LDC, 2), %xmm2 movsd 0 * SIZE(CO2, %rax), %xmm3 movhps 2 * SIZE(CO2, %rax), %xmm3 pshufd $0x50, %xmm10, %xmm4 pshufd $0xfa, %xmm10, %xmm10 pshufd $0x50, %xmm11, %xmm5 pshufd $0xfa, %xmm11, %xmm11 mulps %xmm7, %xmm4 mulps %xmm7, %xmm10 mulps %xmm7, %xmm5 mulps %xmm7, %xmm11 addps %xmm4, %xmm0 addps %xmm10, %xmm1 addps %xmm5, %xmm2 addps %xmm11, %xmm3 movlps %xmm0, 0 * SIZE(CO2) movhps %xmm0, 2 * SIZE(CO2) movlps %xmm1, 0 * SIZE(CO2, LDC) movhps %xmm1, 2 * SIZE(CO2, LDC) movlps %xmm2, 0 * SIZE(CO2, LDC, 2) movhps %xmm2, 2 * SIZE(CO2, LDC, 2) movlps %xmm3, 0 * SIZE(CO2, %rax) movhps %xmm3, 2 * SIZE(CO2, %rax) addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 ALIGN_4 .L30: testq $1, M BRANCH jle .L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 xorps %xmm8, %xmm8 xorps %xmm12, %xmm12 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $8, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_3 .L32: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -28 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 pshufd $0x55, %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movaps -24 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -20 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -16 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -12 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 pshufd $0x55, %xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movaps -8 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, %xmm12 movaps -4 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 subq $-32 * SIZE, BO subq $ -4 * SIZE, AO subq $1, %rax BRANCH jg .L32 ALIGN_3 .L35: movups ALPHA_R, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addps %xmm3, 
%xmm12 movaps -28 * SIZE(BO), %xmm3 mulps %xmm1, %xmm3 addq $1 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_3 .L38: addps %xmm2, %xmm8 addps %xmm3, %xmm12 leaq (LDC, LDC, 2), %rax movsd (CO1), %xmm0 movhps (CO1, LDC), %xmm0 movsd (CO1, LDC, 2), %xmm1 movhps (CO1, %rax), %xmm1 movsd (CO2), %xmm2 movhps (CO2, LDC), %xmm2 movsd (CO2, LDC, 2), %xmm3 movhps (CO2, %rax), %xmm3 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 pshufd $0x50, %xmm12, %xmm5 pshufd $0xfa, %xmm12, %xmm12 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 mulps %xmm7, %xmm5 mulps %xmm7, %xmm12 addps %xmm4, %xmm0 addps %xmm8, %xmm1 addps %xmm5, %xmm2 addps %xmm12, %xmm3 movlps %xmm0, (CO1) movhps %xmm0, (CO1, LDC) movlps %xmm1, (CO1, LDC, 2) movhps %xmm1, (CO1, %rax) movlps %xmm2, (CO2) movhps %xmm2, (CO2, LDC) movlps %xmm3, (CO2, LDC, 2) movhps %xmm3, (CO2, %rax) ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addq $8, KK #endif movq BO, B leaq (C, LDC, 8), C subq $1, J BRANCH jg .L10 ALIGN_4 .L40: testq $4, N jle .L70 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC, 2), CO2 movq A, AO movq M, I sarq $2, I NOBRANCH jle .L50 ALIGN_4 .L41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht2 4 * SIZE(CO1, LDC, 1) xorps %xmm10, %xmm10 prefetcht2 4 * SIZE(CO2) xorps %xmm11, %xmm11 prefetcht2 4 * SIZE(CO2, LDC, 1) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L45 ALIGN_3 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L42 ALIGN_3 .L45: movups ALPHA_R, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: addps %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 
addps %xmm2, %xmm9 pshufd $0x39, %xmm1, %xmm2 mulps %xmm0, %xmm1 addps %xmm3, %xmm10 pshufd $0x39, %xmm2, %xmm3 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L46 ALIGN_3 .L48: addps %xmm1, %xmm8 addps %xmm2, %xmm9 addps %xmm3, %xmm10 addps %xmm4, %xmm11 movaps %xmm9, %xmm4 shufps $0xd8, %xmm8, %xmm9 shufps $0xd8, %xmm11, %xmm8 shufps $0xd8, %xmm10, %xmm11 shufps $0xd8, %xmm4, %xmm10 movaps %xmm8, %xmm4 shufps $0xd8, %xmm10, %xmm8 shufps $0xd8, %xmm4, %xmm10 movaps %xmm9, %xmm5 shufps $0xd8, %xmm11, %xmm9 shufps $0xd8, %xmm5, %xmm11 movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 movsd 0 * SIZE(CO1, LDC), %xmm2 movhps 2 * SIZE(CO1, LDC), %xmm2 movsd 4 * SIZE(CO1, LDC), %xmm3 movhps 6 * SIZE(CO1, LDC), %xmm3 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 pshufd $0x50, %xmm9, %xmm5 pshufd $0xfa, %xmm9, %xmm9 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 mulps %xmm7, %xmm5 mulps %xmm7, %xmm9 addps %xmm4, %xmm0 addps %xmm8, %xmm1 addps %xmm5, %xmm2 addps %xmm9, %xmm3 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 4 * SIZE(CO1) movhps %xmm1, 6 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO1, LDC) movhps %xmm2, 2 * SIZE(CO1, LDC) movlps %xmm3, 4 * SIZE(CO1, LDC) movhps %xmm3, 6 * SIZE(CO1, LDC) movsd 0 * SIZE(CO2), %xmm0 movhps 2 * SIZE(CO2), %xmm0 movsd 4 * SIZE(CO2), %xmm1 movhps 6 * SIZE(CO2), %xmm1 movsd 0 * SIZE(CO2, LDC), %xmm2 movhps 2 * SIZE(CO2, LDC), %xmm2 movsd 4 * SIZE(CO2, LDC), %xmm3 movhps 6 * SIZE(CO2, LDC), %xmm3 pshufd $0x50, %xmm10, %xmm4 pshufd $0xfa, %xmm10, %xmm10 pshufd $0x50, %xmm11, %xmm5 pshufd $0xfa, %xmm11, %xmm11 mulps %xmm7, %xmm4 mulps %xmm7, %xmm10 mulps %xmm7, %xmm5 mulps %xmm7, %xmm11 addps %xmm4, %xmm0 addps %xmm10, %xmm1 addps %xmm5, %xmm2 addps %xmm11, %xmm3 movlps %xmm0, 0 * SIZE(CO2) movhps %xmm0, 2 * SIZE(CO2) movlps %xmm1, 4 * SIZE(CO2) movhps %xmm1, 6 * SIZE(CO2) movlps %xmm2, 0 * SIZE(CO2, LDC) movhps %xmm2, 2 * SIZE(CO2, LDC) movlps %xmm3, 4 * SIZE(CO2, LDC) movhps %xmm3, 6 * SIZE(CO2, LDC) addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 decq I BRANCH jg .L41 ALIGN_4 .L50: testq $2, M BRANCH jle .L60 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movaps -32 * SIZE(BO), %xmm5 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_3 .L52: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -24 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -26 * 
SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -16 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -24 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $ -8 * SIZE, AO subq $1, %rax BRANCH jg .L52 ALIGN_3 .L55: movups ALPHA_R, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_3 .L56: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0xfa, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_3 .L58: addps %xmm1, %xmm8 addps %xmm2, %xmm9 movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO1, LDC), %xmm1 movhps 2 * SIZE(CO1, LDC), %xmm1 movsd 0 * SIZE(CO2), %xmm2 movhps 2 * SIZE(CO2), %xmm2 movsd 0 * SIZE(CO2, LDC), %xmm3 movhps 2 * SIZE(CO2, LDC), %xmm3 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 pshufd $0x50, %xmm9, %xmm5 pshufd $0xfa, %xmm9, %xmm9 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 mulps %xmm7, %xmm5 mulps %xmm7, %xmm9 addps %xmm4, %xmm0 addps %xmm8, %xmm1 addps %xmm5, %xmm2 addps %xmm9, %xmm3 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO1, LDC) movhps %xmm1, 2 * SIZE(CO1, LDC) movlps %xmm2, 0 * SIZE(CO2) movhps %xmm2, 2 * SIZE(CO2) movlps %xmm3, 0 * SIZE(CO2, LDC) movhps %xmm3, 2 * SIZE(CO2, LDC) addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 ALIGN_4 .L60: testq $1, M BRANCH jle .L69 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_3 .L62: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x55, %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm2, %xmm9 movaps -28 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -24 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x55, %xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm2, %xmm9 movaps -20 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 subq $-16 * SIZE, BO subq $ -4 * SIZE, AO subq $1, %rax BRANCH jg .L62 addps %xmm9, %xmm8 ALIGN_3 .L65: movups ALPHA_R, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_3 .L66: pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_3 .L68: addps %xmm2, %xmm8 movsd (CO1), %xmm0 movhps (CO1, LDC), %xmm0 movsd (CO2), %xmm1 movhps (CO2, LDC), %xmm1 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 addps %xmm4, %xmm0 addps %xmm8, %xmm1 movlps %xmm0, (CO1) movhps %xmm0, (CO1, LDC) movlps %xmm1, (CO2) movhps %xmm1, (CO2, LDC) ALIGN_4 .L69: #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK #endif movq BO, B leaq (C, LDC, 4), C ALIGN_4 .L70: testq $2, N 
jle .L100 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC), CO2 movq A, AO movq M, I sarq $2, I NOBRANCH jle .L80 ALIGN_4 .L71: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd -32 * SIZE(BO), %xmm3 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht2 4 * SIZE(CO2) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L75 ALIGN_3 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -30 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -28 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -24 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -26 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -20 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -24 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax BRANCH jg .L72 ALIGN_3 .L75: movups ALPHA_R, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L78 ALIGN_3 .L76: addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 mulps %xmm0, %xmm1 addps %xmm2, %xmm9 pshufd $0x55, %xmm3, %xmm2 movsd -30 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L76 ALIGN_3 .L78: addps %xmm1, %xmm8 addps %xmm2, %xmm9 movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 movsd 0 * SIZE(CO2), %xmm2 movhps 2 * SIZE(CO2), %xmm2 movsd 4 * SIZE(CO2), %xmm3 movhps 6 * SIZE(CO2), %xmm3 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 pshufd $0x50, %xmm9, %xmm5 pshufd $0xfa, %xmm9, %xmm9 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 mulps %xmm7, %xmm5 mulps %xmm7, %xmm9 addps %xmm4, %xmm0 addps %xmm8, %xmm1 addps %xmm5, %xmm2 addps %xmm9, %xmm3 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 4 * SIZE(CO1) movhps %xmm1, 6 * SIZE(CO1) movlps %xmm2, 0 * SIZE(CO2) movhps %xmm2, 2 * SIZE(CO2) movlps %xmm3, 4 * SIZE(CO2) movhps %xmm3, 6 * SIZE(CO2) addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 decq I BRANCH jg .L71 ALIGN_4 .L80: testq $2, M BRANCH jle .L90 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd -32 * SIZE(BO), %xmm5 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L85 ALIGN_3 .L82: addps %xmm1, %xmm8 movsd -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movsd -30 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movsd -28 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -26 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movsd -26 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -24 * SIZE(AO), %xmm0 subq $-8 * SIZE, BO subq $-8 * SIZE, AO subq $1, %rax BRANCH jg .L82 ALIGN_3 .L85: movups ALPHA_R, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L88 ALIGN_3 .L86: addps %xmm1, %xmm8 movsd -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L86 ALIGN_3 .L88: addps %xmm1, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm1 movhps 2 * SIZE(CO2), %xmm1 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 addps %xmm4, %xmm0 addps %xmm8, %xmm1 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO2) movhps %xmm1, 2 * SIZE(CO2) addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 ALIGN_4 .L90: testq $1, M BRANCH jle .L99 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L95 ALIGN_3 .L92: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movsd -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x55, %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm2, %xmm9 movsd -30 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movsd -28 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 pshufd $0x55, %xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm2, %xmm9 movsd -26 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L92 addps %xmm9, %xmm8 ALIGN_3 .L95: movups ALPHA_R, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L98 ALIGN_3 .L96: pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 addps %xmm2, %xmm8 movsd -32 * SIZE(BO), %xmm2 mulps %xmm1, %xmm2 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L96 ALIGN_3 .L98: addps %xmm2, %xmm8 movsd (CO1), %xmm0 movhps (CO2), %xmm0 pshufd $0x50, %xmm8, %xmm4 mulps %xmm7, %xmm4 addps %xmm4, %xmm0 movlps %xmm0, (CO1) movhps %xmm0, (CO2) ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif movq BO, B leaq (C, LDC, 2), C ALIGN_4 .L100: testq $1, N jle .L999 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 movq A, AO movq M, I sarq $2, I NOBRANCH jle .L110 ALIGN_4 .L101: 
#if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 movsd -32 * SIZE(BO), %xmm3 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L105 ALIGN_3 .L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -31 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -30 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -24 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -29 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -20 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -28 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $ -4 * SIZE, BO subq $1, %rax BRANCH jg .L102 ALIGN_3 .L105: movups ALPHA_R, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L108 ALIGN_3 .L106: addps %xmm1, %xmm8 pshufd $0x00, %xmm3, %xmm1 movss -31 * SIZE(BO), %xmm3 mulps %xmm0, %xmm1 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L106 ALIGN_3 .L108: addps %xmm1, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 addps %xmm4, %xmm0 addps %xmm8, %xmm1 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movlps %xmm1, 4 * SIZE(CO1) movhps %xmm1, 6 * SIZE(CO1) addq $8 * SIZE, CO1 decq I BRANCH jg .L101 ALIGN_4 .L110: testq $2, M BRANCH jle .L120 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L115 ALIGN_3 .L112: addps %xmm1, %xmm8 movss -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movss -31 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movss -30 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -26 * SIZE(AO), %xmm0 addps %xmm1, %xmm8 movss -29 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -24 * SIZE(AO), %xmm0 subq $-4 * SIZE, BO subq $-8 * SIZE, AO subq $1, %rax BRANCH jg .L112 ALIGN_3 .L115: movups ALPHA_R, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L118 ALIGN_3 .L116: addps %xmm1, %xmm8 movss -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 mulps %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 
addq $2 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L116 ALIGN_3 .L118: addps %xmm1, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 pshufd $0x50, %xmm8, %xmm4 mulps %xmm7, %xmm4 addps %xmm4, %xmm0 movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) addq $4 * SIZE, CO1 ALIGN_4 .L120: testq $1, M BRANCH jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif xorps %xmm2, %xmm2 movss -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L125 ALIGN_3 .L122: addss %xmm2, %xmm8 movss -32 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -31 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 movss -31 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -30 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 movss -30 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -29 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 movss -29 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -28 * SIZE(AO), %xmm0 subq $-4 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L122 ALIGN_3 .L125: movups ALPHA_R, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L128 ALIGN_3 .L126: addss %xmm2, %xmm8 movss -32 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -31 * SIZE(AO), %xmm0 addq $1 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L126 ALIGN_3 .L128: addps %xmm2, %xmm8 movsd (CO1), %xmm0 pshufd $0x50, %xmm8, %xmm4 mulps %xmm7, %xmm4 addps %xmm4, %xmm0 movlps %xmm0, (CO1) ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm3m_kernel_8x4_barcelona.S000066400000000000000000002034211313527062700231400ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %r12 #define BB %rbp #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define ALPHA 0(%rsp) #define J 16(%rsp) #define OFFSET 24(%rsp) #define KK 32(%rsp) #define KKK 40(%rsp) #define BUFFER 128(%rsp) #define PREFETCH prefetch #define PREFETCHSIZE (16 * 17 + 0) #define RPREFETCHSIZE (16 * 9 + 0) #define WPREFETCHSIZE (16 * 9 + 0) #define KERNEL1(xx) \ mulps %xmm1, %xmm0 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm8 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %rax, 4) ;\ movaps %xmm2, %xmm0 ;\ addps %xmm1, %xmm12 ;\ movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm0, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm1, %xmm0 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm10 ;\ movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\ addps %xmm1, %xmm14 ;\ movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm0, %xmm2 #define KERNEL2(xx) \ mulps %xmm1, %xmm0 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm8 ;\ movaps %xmm2, %xmm0 ;\ addps %xmm1, %xmm12 ;\ movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm0, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm1, %xmm0 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm10 ;\ addps %xmm1, %xmm14 ;\ mulps %xmm3, %xmm2 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm4, %xmm2 #define KERNEL3(xx) \ mulps %xmm5, %xmm4 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm8 ;\ movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\ movaps %xmm2, %xmm4 ;\ addps %xmm5, %xmm12 ;\ movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ 
movaps %xmm4, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm5, %xmm4 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm10 ;\ movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\ addps %xmm5, %xmm14 ;\ movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 20 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm4, %xmm2 #define KERNEL4(xx) \ mulps %xmm5, %xmm4 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ movaps (AO, %rax, 4), %xmm6 ;\ addps %xmm4, %xmm8 ;\ movaps %xmm2, %xmm4 ;\ addps %xmm5, %xmm12 ;\ movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm4, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm5, %xmm4 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm10 ;\ addps %xmm5, %xmm14 ;\ movaps 64 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm6, %xmm2 #define KERNEL5(xx) \ mulps %xmm1, %xmm6 ;\ mulps 4 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm6, %xmm8 ;\ movaps %xmm2, %xmm6 ;\ addps %xmm1, %xmm12 ;\ movaps 40 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps 4 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps 16 * SIZE(AO, %rax, 4), %xmm7 ;\ movaps %xmm6, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 44 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm1, %xmm6 ;\ mulps 4 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm6, %xmm10 ;\ movaps 8 * SIZE(AO, %rax, 4), %xmm6 ;\ addps %xmm1, %xmm14 ;\ movaps 48 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps 4 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 52 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm6, %xmm2 #define KERNEL6(xx) \ mulps %xmm1, %xmm6 ;\ mulps 12 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm6, %xmm8 ;\ movaps %xmm2, %xmm6 ;\ addps %xmm1, %xmm12 ;\ movaps 56 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps 12 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm6, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 60 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm1, %xmm6 ;\ mulps 12 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm6, %xmm10 ;\ movaps 32 * SIZE(AO, %rax, 4), %xmm0 ;\ addps %xmm1, %xmm14 ;\ mulps %xmm3, %xmm2 ;\ mulps 12 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 68 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm7, %xmm2 #define KERNEL7(xx) \ mulps %xmm5, %xmm7 ;\ mulps 20 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm7, %xmm8 ;\ movaps 96 * SIZE(BO, %rax, 8), %xmm1 ;\ movaps %xmm2, %xmm7 ;\ addps %xmm5, %xmm12 ;\ movaps 72 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps 20 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm7, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 76 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm5, %xmm7 ;\ mulps 20 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm7, %xmm10 ;\ movaps 24 * SIZE(AO, %rax, 4), %xmm7 ;\ addps %xmm5, %xmm14 ;\ movaps 80 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps 20 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 84 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm7, %xmm2 #define KERNEL8(xx) \ mulps %xmm5, %xmm7 ;\ mulps 28 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm7, %xmm8 ;\ movaps %xmm2, %xmm7 ;\ addps %xmm5, %xmm12 ;\ movaps 88 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps 28 * 
SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm7, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 92 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm5, %xmm7 ;\ mulps 28 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm7, %xmm10 ;\ movaps 48 * SIZE(AO, %rax, 4), %xmm4 ;\ addps %xmm5, %xmm14 ;\ movaps 128 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps 28 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 100 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm0, %xmm2 ;\ addq $16 * SIZE, %rax #define KERNEL_SUB1(xx) \ mulps %xmm1, %xmm0 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm8 ;\ movaps %xmm2, %xmm0 ;\ addps %xmm1, %xmm12 ;\ movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm0, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm1, %xmm0 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm10 ;\ movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\ addps %xmm1, %xmm14 ;\ movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm0, %xmm2 #define KERNEL_SUB2(xx) \ mulps %xmm1, %xmm0 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm8 ;\ movaps %xmm2, %xmm0 ;\ addps %xmm1, %xmm12 ;\ movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm0, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm1, %xmm0 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm10 ;\ movaps (AO, %rax, 4), %xmm0 ;\ addps %xmm1, %xmm14 ;\ movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm4, %xmm2 #define KERNEL_SUB3(xx) \ mulps %xmm5, %xmm4 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm8 ;\ movaps %xmm2, %xmm4 ;\ addps %xmm5, %xmm12 ;\ movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm4, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm5, %xmm4 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm10 ;\ movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\ addps %xmm5, %xmm14 ;\ movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 20 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm4, %xmm2 #define KERNEL_SUB4(xx) \ mulps %xmm5, %xmm4 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm8 ;\ movaps %xmm2, %xmm4 ;\ addps %xmm5, %xmm12 ;\ movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm4, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm5, %xmm4 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm10 ;\ addps %xmm5, %xmm14 ;\ mulps %xmm3, %xmm2 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm0, %xmm2 #if defined(OS_LINUX) && defined(CORE_BARCELONA) .align 32768 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef 
WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif movaps %xmm3, %xmm0 movss OLD_ALPHA_I, %xmm1 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif #endif movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-1024, %rsp # align stack STACK_TOUCHING movq OLD_M, M movq OLD_N, N movss %xmm0, 0 + ALPHA movss %xmm1, 4 + ALPHA movss %xmm0, 8 + ALPHA movss %xmm1, 12 + ALPHA #ifdef TRMMKERNEL movsd %xmm4, OFFSET movsd %xmm4, KK #ifndef LEFT negq KK #endif #endif subq $-32 * SIZE, A salq $ZBASE_SHIFT, LDC movq N, J sarq $2, J # j = (n >> 2) jle .L50 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $2, %rax jle .L03 ALIGN_4 .L02: prefetch (RPREFETCHSIZE + 0) * SIZE(B) movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 movaps 8 * SIZE(B), %xmm11 movaps 12 * SIZE(B), %xmm15 prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) prefetchw (WPREFETCHSIZE + 32) * SIZE(BO) pshufd $0x00, %xmm11, %xmm0 pshufd $0x55, %xmm11, %xmm1 pshufd $0xaa, %xmm11, %xmm2 pshufd $0xff, %xmm11, %xmm3 prefetchw (WPREFETCHSIZE + 48) * SIZE(BO) pshufd $0x00, %xmm15, %xmm4 pshufd $0x55, %xmm15, %xmm5 pshufd $0xaa, %xmm15, %xmm6 pshufd $0xff, %xmm15, %xmm7 movaps %xmm0, 32 * SIZE(BO) movaps %xmm1, 36 * SIZE(BO) movaps %xmm2, 40 * SIZE(BO) movaps %xmm3, 44 * SIZE(BO) movaps %xmm4, 48 * SIZE(BO) movaps %xmm5, 52 * SIZE(BO) movaps %xmm6, 56 * SIZE(BO) movaps %xmm7, 60 * SIZE(BO) addq $16 * SIZE, B addq $64 * SIZE, BO decq %rax jne .L02 ALIGN_4 .L03: movq K, %rax andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movaps 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) addq $ 4 * SIZE, B addq $16 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L10: movq C, CO1 leaq (C, LDC, 1), CO2 movq A, AO leaq (RPREFETCHSIZE + 0) * SIZE(B), BB movq M, I sarq $3, I # i = (m >> 3) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif prefetch 0 * SIZE(BB) prefetch 16 * SIZE(BB) subq $-32 * SIZE, BB movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm1 pxor %xmm8, %xmm8 movaps -28 * SIZE(BO), %xmm3 pxor %xmm9, %xmm9 movaps -16 * SIZE(AO), %xmm4 pxor %xmm10, %xmm10 movaps 0 * SIZE(BO), %xmm5 pxor %xmm11, %xmm11 
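/* xmm8-xmm15 hold the 8x4 block's accumulators; the prefetchw below touch the four output columns of C before the main loop */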
prefetchw 7 * SIZE(CO1) pxor %xmm12, %xmm12 prefetchw 7 * SIZE(CO2) pxor %xmm13, %xmm13 prefetchw 7 * SIZE(CO1, LDC, 2) pxor %xmm14, %xmm14 prefetchw 7 * SIZE(CO2, LDC, 2) pxor %xmm15, %xmm15 movaps %xmm0, %xmm2 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $4, %rax #endif movq %rax, KKK #endif andq $-8, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO negq %rax NOBRANCH je .L15 ALIGN_3 .L12: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) BRANCH jl .L12 ALIGN_4 .L15: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif testq $4, %rax je .L16 xorq %rax, %rax ALIGN_3 KERNEL_SUB1(32 * 0) KERNEL_SUB2(32 * 0) KERNEL_SUB3(32 * 0) KERNEL_SUB4(32 * 0) addq $32 * SIZE, AO addq $64 * SIZE, BO ALIGN_3 .L16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L18 leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO negq %rax ALIGN_4 .L17: mulps %xmm1, %xmm0 mulps -28 * SIZE(AO, %rax, 4), %xmm1 addps %xmm0, %xmm8 movaps %xmm2, %xmm0 addps %xmm1, %xmm12 movaps -24 * SIZE(BO, %rax, 8), %xmm1 mulps %xmm3, %xmm2 mulps -28 * SIZE(AO, %rax, 4), %xmm3 addps %xmm2, %xmm9 movaps %xmm0, %xmm2 addps %xmm3, %xmm13 movaps -20 * SIZE(BO, %rax, 8), %xmm3 mulps %xmm1, %xmm0 mulps -28 * SIZE(AO, %rax, 4), %xmm1 addps %xmm0, %xmm10 movaps -24 * SIZE(AO, %rax, 4), %xmm0 addps %xmm1, %xmm14 movaps -16 * SIZE(BO, %rax, 8), %xmm1 mulps %xmm3, %xmm2 mulps -28 * SIZE(AO, %rax, 4), %xmm3 addps %xmm2, %xmm11 addps %xmm3, %xmm15 movaps -12 * SIZE(BO, %rax, 8), %xmm3 movaps %xmm0, %xmm2 addq $SIZE * 2, %rax jl .L17 ALIGN_4 .L18: movups 0 * SIZE(CO1), %xmm0 movups 4 * SIZE(CO1), %xmm1 movups 8 * SIZE(CO1), %xmm2 movups 12 * SIZE(CO1), %xmm3 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 pshufd $0x50, %xmm12, %xmm5 pshufd $0xfa, %xmm12, %xmm12 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 mulps %xmm7, %xmm5 mulps %xmm7, %xmm12 addps %xmm0, %xmm4 addps %xmm1, %xmm8 addps %xmm2, %xmm5 addps %xmm3, %xmm12 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movlps %xmm8, 4 * SIZE(CO1) movhps %xmm8, 6 * SIZE(CO1) movlps %xmm5, 8 * SIZE(CO1) movhps %xmm5, 10 * SIZE(CO1) movlps %xmm12, 12 * SIZE(CO1) movhps %xmm12, 14 * SIZE(CO1) movups 0 * SIZE(CO2), %xmm0 movups 4 * SIZE(CO2), %xmm1 movups 8 
* SIZE(CO2), %xmm2 movups 12 * SIZE(CO2), %xmm3 pshufd $0x50, %xmm9, %xmm4 pshufd $0xfa, %xmm9, %xmm9 pshufd $0x50, %xmm13, %xmm5 pshufd $0xfa, %xmm13, %xmm13 mulps %xmm7, %xmm4 mulps %xmm7, %xmm9 mulps %xmm7, %xmm5 mulps %xmm7, %xmm13 addps %xmm0, %xmm4 addps %xmm1, %xmm9 addps %xmm2, %xmm5 addps %xmm3, %xmm13 movlps %xmm4, 0 * SIZE(CO2) movhps %xmm4, 2 * SIZE(CO2) movlps %xmm9, 4 * SIZE(CO2) movhps %xmm9, 6 * SIZE(CO2) movlps %xmm5, 8 * SIZE(CO2) movhps %xmm5, 10 * SIZE(CO2) movlps %xmm13, 12 * SIZE(CO2) movhps %xmm13, 14 * SIZE(CO2) movups 0 * SIZE(CO1, LDC, 2), %xmm0 movups 4 * SIZE(CO1, LDC, 2), %xmm1 movups 8 * SIZE(CO1, LDC, 2), %xmm2 movups 12 * SIZE(CO1, LDC, 2), %xmm3 pshufd $0x50, %xmm10, %xmm4 pshufd $0xfa, %xmm10, %xmm10 pshufd $0x50, %xmm14, %xmm5 pshufd $0xfa, %xmm14, %xmm14 mulps %xmm7, %xmm4 mulps %xmm7, %xmm10 mulps %xmm7, %xmm5 mulps %xmm7, %xmm14 addps %xmm0, %xmm4 addps %xmm1, %xmm10 addps %xmm2, %xmm5 addps %xmm3, %xmm14 movlps %xmm4, 0 * SIZE(CO1, LDC, 2) movhps %xmm4, 2 * SIZE(CO1, LDC, 2) movlps %xmm10, 4 * SIZE(CO1, LDC, 2) movhps %xmm10, 6 * SIZE(CO1, LDC, 2) movlps %xmm5, 8 * SIZE(CO1, LDC, 2) movhps %xmm5, 10 * SIZE(CO1, LDC, 2) movlps %xmm14, 12 * SIZE(CO1, LDC, 2) movhps %xmm14, 14 * SIZE(CO1, LDC, 2) movups 0 * SIZE(CO2, LDC, 2), %xmm0 movups 4 * SIZE(CO2, LDC, 2), %xmm1 movups 8 * SIZE(CO2, LDC, 2), %xmm2 movups 12 * SIZE(CO2, LDC, 2), %xmm3 pshufd $0x50, %xmm11, %xmm4 pshufd $0xfa, %xmm11, %xmm11 pshufd $0x50, %xmm15, %xmm5 pshufd $0xfa, %xmm15, %xmm15 mulps %xmm7, %xmm4 mulps %xmm7, %xmm11 mulps %xmm7, %xmm5 mulps %xmm7, %xmm15 addps %xmm0, %xmm4 addps %xmm1, %xmm11 addps %xmm2, %xmm5 addps %xmm3, %xmm15 movlps %xmm4, 0 * SIZE(CO2, LDC, 2) movhps %xmm4, 2 * SIZE(CO2, LDC, 2) movlps %xmm11, 4 * SIZE(CO2, LDC, 2) movhps %xmm11, 6 * SIZE(CO2, LDC, 2) movlps %xmm5, 8 * SIZE(CO2, LDC, 2) movhps %xmm5, 10 * SIZE(CO2, LDC, 2) movlps %xmm15, 12 * SIZE(CO2, LDC, 2) movhps %xmm15, 14 * SIZE(CO2, LDC, 2) addq $16 * SIZE, CO1 # coffset += 4 addq $16 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 ALIGN_4 .L20: testq $4, M je .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps -28 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps -24 * SIZE(AO), %xmm8 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movaps 36 * 
SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 mulps 44 * SIZE(BO), %xmm8 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm8, %xmm3 movaps -20 * SIZE(AO), %xmm8 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 mulps 60 * SIZE(BO), %xmm8 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movaps 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movaps 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 mulps 76 * SIZE(BO), %xmm10 addps %xmm9, %xmm2 movaps 128 * SIZE(BO), %xmm9 addps %xmm10, %xmm3 movaps -12 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movaps 84 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movaps 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 mulps 92 * SIZE(BO), %xmm10 addps %xmm11, %xmm2 movaps 144 * SIZE(BO), %xmm11 addps %xmm10, %xmm3 movaps -8 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movaps 104 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 mulps 108 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 160 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps -4 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movaps 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 mulps 124 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 176 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 16 * SIZE(AO), %xmm10 addq $ 32 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 16 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps -28 * SIZE(AO), %xmm8 addq $ 4 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L26 ALIGN_4 .L28: movups 0 * SIZE(CO1), %xmm8 movups 4 * SIZE(CO1), %xmm9 pshufd $0x50, %xmm0, %xmm4 pshufd $0xfa, %xmm0, %xmm0 mulps %xmm7, %xmm4 mulps %xmm7, %xmm0 addps %xmm8, %xmm4 addps %xmm9, %xmm0 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movlps %xmm0, 4 * SIZE(CO1) movhps %xmm0, 6 * SIZE(CO1) movups 0 * SIZE(CO2), %xmm8 movups 4 * SIZE(CO2), %xmm9 pshufd $0x50, %xmm1, %xmm4 pshufd $0xfa, %xmm1, %xmm1 mulps %xmm7, %xmm4 mulps %xmm7, %xmm1 addps %xmm8, %xmm4 addps %xmm9, %xmm1 movlps %xmm4, 0 * SIZE(CO2) movhps %xmm4, 2 * SIZE(CO2) movlps %xmm1, 4 * SIZE(CO2) movhps %xmm1, 6 * SIZE(CO2) movups 0 * SIZE(CO1, LDC, 2), %xmm8 movups 4 * SIZE(CO1, LDC, 2), %xmm9 pshufd $0x50, %xmm2, %xmm4 pshufd $0xfa, %xmm2, %xmm2 mulps %xmm7, %xmm4 mulps %xmm7, %xmm2 addps %xmm8, %xmm4 addps %xmm9, %xmm2 movlps %xmm4, 0 * SIZE(CO1, LDC, 2) movhps %xmm4, 2 * SIZE(CO1, LDC, 2) movlps %xmm2, 4 * SIZE(CO1, LDC, 2) movhps %xmm2, 6 * SIZE(CO1, LDC, 2) movups 0 * SIZE(CO2, LDC, 2), %xmm8 movups 4 * SIZE(CO2, LDC, 2), %xmm9 pshufd $0x50, %xmm3, %xmm4 pshufd $0xfa, %xmm3, %xmm3 mulps %xmm7, %xmm4 mulps %xmm7, %xmm3 addps %xmm8, %xmm4 addps %xmm9, %xmm3 movlps %xmm4, 0 * SIZE(CO2, LDC, 2) movhps %xmm4, 2 * SIZE(CO2, LDC, 2) movlps %xmm3, 4 * SIZE(CO2, LDC, 2) movhps %xmm3, 6 
* SIZE(CO2, LDC, 2) addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 ALIGN_4 .L30: testq $2, M je .L40 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L35 ALIGN_4 .L32: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movsd 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsd 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movsd 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movsd 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movsd 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd -28 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movsd 80 * SIZE(BO), %xmm11 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movsd 36 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movsd 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm2 movsd 44 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 movsd -26 * SIZE(AO), %xmm8 addps %xmm13, %xmm3 movsd 96 * SIZE(BO), %xmm13 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movsd 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movsd 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm2 movsd 60 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 movsd -16 * SIZE(AO), %xmm8 addps %xmm15, %xmm3 movsd 112 * SIZE(BO), %xmm15 mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movsd 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movsd 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm2 movsd 76 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movsd -22 * SIZE(AO), %xmm10 addps %xmm9, %xmm3 movsd 128 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movsd 84 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movsd 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm2 movsd 92 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd -20 * SIZE(AO), %xmm10 addps %xmm11, %xmm3 movsd 144 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movsd 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movsd 104 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movsd 108 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd -18 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movsd 160 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movsd 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movsd 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movsd 124 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd -8 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movsd 176 * SIZE(BO), %xmm15 addq $ 16 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 andq $7, %rax # if 
(k & 1) BRANCH je .L38 ALIGN_4 .L36: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movsd 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsd 16 * SIZE(BO), %xmm9 addq $ 2 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L36 ALIGN_4 .L38: movups 0 * SIZE(CO1), %xmm8 pshufd $0x50, %xmm0, %xmm4 mulps %xmm7, %xmm4 addps %xmm8, %xmm4 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movups 0 * SIZE(CO2), %xmm8 pshufd $0x50, %xmm1, %xmm4 mulps %xmm7, %xmm4 addps %xmm8, %xmm4 movlps %xmm4, 0 * SIZE(CO2) movhps %xmm4, 2 * SIZE(CO2) movups 0 * SIZE(CO1, LDC, 2), %xmm8 pshufd $0x50, %xmm2, %xmm4 mulps %xmm7, %xmm4 addps %xmm8, %xmm4 movlps %xmm4, 0 * SIZE(CO1, LDC, 2) movhps %xmm4, 2 * SIZE(CO1, LDC, 2) movups 0 * SIZE(CO2, LDC, 2), %xmm8 pshufd $0x50, %xmm3, %xmm4 mulps %xmm7, %xmm4 addps %xmm8, %xmm4 movlps %xmm4, 0 * SIZE(CO2, LDC, 2) movhps %xmm4, 2 * SIZE(CO2, LDC, 2) addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L40: testq $1, M je .L49 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO leaq (BO, %rax, 8), BO #endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 movss 32 * SIZE(BO), %xmm13 movss 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L45 ALIGN_4 .L42: mulss %xmm8, %xmm9 addss %xmm9, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movss 4 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 addss %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 addss %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addss %xmm9, %xmm3 movss 64 * SIZE(BO), %xmm9 mulss %xmm8, %xmm11 addss %xmm11, %xmm0 movss 20 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 addss %xmm11, %xmm1 movss 24 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 addss %xmm11, %xmm2 movss 28 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 movss -30 * SIZE(AO), %xmm8 addss %xmm11, %xmm3 movss 80 * SIZE(BO), %xmm11 mulss %xmm8, %xmm13 addss %xmm13, %xmm0 movss 36 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 addss %xmm13, %xmm1 movss 40 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 addss %xmm13, %xmm2 movss 44 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 movss -29 * SIZE(AO), %xmm8 addss %xmm13, %xmm3 movss 96 * SIZE(BO), %xmm13 mulss %xmm8, %xmm15 addss %xmm15, %xmm0 movss 52 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 addss %xmm15, %xmm1 movss 56 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 addss %xmm15, %xmm2 movss 60 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 movss -24 * SIZE(AO), %xmm8 addss %xmm15, %xmm3 movss 112 * SIZE(BO), %xmm15 mulss %xmm10, %xmm9 addss %xmm9, %xmm0 movss 68 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 addss %xmm9, %xmm1 movss 72 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 addss %xmm9, %xmm2 movss 76 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 movss -27 * SIZE(AO), %xmm10 addss %xmm9, %xmm3 movss 128 * SIZE(BO), %xmm9 mulss %xmm10, %xmm11 addss 
%xmm11, %xmm0 movss 84 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 addss %xmm11, %xmm1 movss 88 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 addss %xmm11, %xmm2 movss 92 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 movss -26 * SIZE(AO), %xmm10 addss %xmm11, %xmm3 movss 144 * SIZE(BO), %xmm11 mulss %xmm10, %xmm13 addss %xmm13, %xmm0 movss 100 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 addss %xmm13, %xmm1 movss 104 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 addss %xmm13, %xmm2 movss 108 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 movss -25 * SIZE(AO), %xmm10 addss %xmm13, %xmm3 movss 160 * SIZE(BO), %xmm13 mulss %xmm10, %xmm15 addss %xmm15, %xmm0 movss 116 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 addss %xmm15, %xmm1 movss 120 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 addss %xmm15, %xmm2 movss 124 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 movss -20 * SIZE(AO), %xmm10 addss %xmm15, %xmm3 movss 176 * SIZE(BO), %xmm15 addq $ 8 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L48 ALIGN_4 .L46: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movss 16 * SIZE(BO), %xmm9 addq $ 1 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L46 ALIGN_4 .L48: movsd 0 * SIZE(CO1), %xmm8 pshufd $0x50, %xmm0, %xmm4 mulps %xmm7, %xmm4 addps %xmm8, %xmm4 movlps %xmm4, 0 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm8 pshufd $0x50, %xmm1, %xmm4 mulps %xmm7, %xmm4 addps %xmm8, %xmm4 movlps %xmm4, 0 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm8 pshufd $0x50, %xmm2, %xmm4 mulps %xmm7, %xmm4 addps %xmm8, %xmm4 movlps %xmm4, 0 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm8 pshufd $0x50, %xmm3, %xmm4 mulps %xmm7, %xmm4 addps %xmm8, %xmm4 movlps %xmm4, 0 * SIZE(CO2, LDC, 2) ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leaq (C, LDC, 4), C # c += 4 * ldc decq J # j -- jg .L01 .L50: testq $2, N je .L100 .L51: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $2, %rax jle .L53 ALIGN_4 .L52: prefetch (RPREFETCHSIZE + 0) * SIZE(B) movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L53: movq K, %rax andq $3, %rax BRANCH jle .L60 ALIGN_4 .L54: movsd 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) addq $ 2 * SIZE, B addq $ 8 * SIZE, BO decq %rax jne .L54 ALIGN_4 .L60: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a movq M, I sarq $3, I # i = (m >> 3) jle .L70 ALIGN_4 .L61: 
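/* .L61: 8x2 micro-kernel for the two-column case: xmm0/xmm4 accumulate the CO1 column and xmm1/xmm5 the CO2 column; .L68 expands each real result into the interleaved complex layout of C and scales it by alpha */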
#if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(AO), %xmm12 movaps 16 * SIZE(AO), %xmm14 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 prefetchw 15 * SIZE(CO1) pxor %xmm4, %xmm4 prefetchw 15 * SIZE(CO2) pxor %xmm5, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulps %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -28 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps -24 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -20 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 32 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) mulps %xmm10, %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 16 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps -12 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps -8 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps -4 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 80 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps 48 * SIZE(AO), %xmm10 PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 32 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 4 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm5 movaps 8 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 12 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 movaps 96 * SIZE(BO), %xmm13 addps %xmm12, %xmm5 movaps 64 * SIZE(AO), %xmm12 PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 48 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 20 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 24 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 28 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 movaps 112 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 80 * SIZE(AO), %xmm14 addq $64 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 
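/* xmm7 = complex alpha duplicated as { real, imag, real, imag }; the k & 7 leftovers run through .L66 before the results are merged into C in .L68 */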
andq $7, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -28 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps -24 * SIZE(AO), %xmm8 addq $8 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L68: movups 0 * SIZE(CO1), %xmm8 movups 4 * SIZE(CO1), %xmm9 movups 8 * SIZE(CO1), %xmm10 movups 12 * SIZE(CO1), %xmm11 pshufd $0x50, %xmm0, %xmm2 pshufd $0xfa, %xmm0, %xmm0 pshufd $0x50, %xmm4, %xmm3 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm7, %xmm2 mulps %xmm7, %xmm0 mulps %xmm7, %xmm3 mulps %xmm7, %xmm4 addps %xmm8, %xmm2 addps %xmm9, %xmm0 addps %xmm10, %xmm3 addps %xmm11, %xmm4 movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) movlps %xmm0, 4 * SIZE(CO1) movhps %xmm0, 6 * SIZE(CO1) movlps %xmm3, 8 * SIZE(CO1) movhps %xmm3, 10 * SIZE(CO1) movlps %xmm4, 12 * SIZE(CO1) movhps %xmm4, 14 * SIZE(CO1) movups 0 * SIZE(CO2), %xmm8 movups 4 * SIZE(CO2), %xmm9 movups 8 * SIZE(CO2), %xmm10 movups 12 * SIZE(CO2), %xmm11 pshufd $0x50, %xmm1, %xmm2 pshufd $0xfa, %xmm1, %xmm1 pshufd $0x50, %xmm5, %xmm3 pshufd $0xfa, %xmm5, %xmm5 mulps %xmm7, %xmm2 mulps %xmm7, %xmm1 mulps %xmm7, %xmm3 mulps %xmm7, %xmm5 addps %xmm8, %xmm2 addps %xmm9, %xmm1 addps %xmm10, %xmm3 addps %xmm11, %xmm5 movlps %xmm2, 0 * SIZE(CO2) movhps %xmm2, 2 * SIZE(CO2) movlps %xmm1, 4 * SIZE(CO2) movhps %xmm1, 6 * SIZE(CO2) movlps %xmm3, 8 * SIZE(CO2) movhps %xmm3, 10 * SIZE(CO2) movlps %xmm5, 12 * SIZE(CO2) movhps %xmm5, 14 * SIZE(CO2) addq $16 * SIZE, CO1 # coffset += 4 addq $16 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L61 ALIGN_4 .L70: testq $4, M je .L80 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulps %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -28 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps -24 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 20 * SIZE(BO), %xmm8 addps %xmm11, %xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm8, %xmm1 movaps -20 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 mulps %xmm10, %xmm13 mulps 36 * SIZE(BO), %xmm10 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm10, %xmm1 movaps -12 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 mulps 44 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps -8 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 52 * SIZE(BO), %xmm10 addps %xmm15, %xmm0 
movaps 56 * SIZE(BO), %xmm15 addps %xmm10, %xmm1 movaps -4 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 60 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 16 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -28 * SIZE(AO), %xmm8 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L76 ALIGN_4 .L78: addps %xmm2, %xmm0 addps %xmm3, %xmm1 movups 0 * SIZE(CO1), %xmm8 movups 4 * SIZE(CO1), %xmm9 pshufd $0x50, %xmm0, %xmm2 pshufd $0xfa, %xmm0, %xmm0 mulps %xmm7, %xmm2 mulps %xmm7, %xmm0 addps %xmm8, %xmm2 addps %xmm9, %xmm0 movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) movlps %xmm0, 4 * SIZE(CO1) movhps %xmm0, 6 * SIZE(CO1) movups 0 * SIZE(CO2), %xmm8 movups 4 * SIZE(CO2), %xmm9 pshufd $0x50, %xmm1, %xmm2 pshufd $0xfa, %xmm1, %xmm1 mulps %xmm7, %xmm2 mulps %xmm7, %xmm1 addps %xmm8, %xmm2 addps %xmm9, %xmm1 movlps %xmm2, 0 * SIZE(CO2) movhps %xmm2, 2 * SIZE(CO2) movlps %xmm1, 4 * SIZE(CO2) movhps %xmm1, 6 * SIZE(CO2) addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 ALIGN_4 .L80: testq $2, M je .L90 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L85 ALIGN_4 .L82: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movsd 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -28 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsd 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movsd 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd -26 * SIZE(AO), %xmm8 addps %xmm11, %xmm1 movsd 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movsd 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd -16 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movsd 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movsd 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd -22 * SIZE(AO), %xmm10 addps %xmm13, %xmm1 movsd 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movsd 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd -20 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movsd 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movsd 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd -18 * SIZE(AO), %xmm10 addps %xmm15, %xmm1 movsd 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movsd 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd -8 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movsd 112 
* SIZE(BO), %xmm15 addq $16 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L82 ALIGN_4 .L85: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L88 ALIGN_4 .L86: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L86 ALIGN_4 .L88: addps %xmm2, %xmm0 addps %xmm3, %xmm1 movups 0 * SIZE(CO1), %xmm8 pshufd $0x50, %xmm0, %xmm2 mulps %xmm7, %xmm2 addps %xmm8, %xmm2 movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) movups 0 * SIZE(CO2), %xmm8 pshufd $0x50, %xmm1, %xmm2 mulps %xmm7, %xmm2 addps %xmm8, %xmm2 movlps %xmm2, 0 * SIZE(CO2) movhps %xmm2, 2 * SIZE(CO2) addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L90: testq $1, M je .L99 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 movss 32 * SIZE(BO), %xmm13 movss 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L95 ALIGN_4 .L92: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movss 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movss 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movss -29 * SIZE(AO), %xmm8 addps %xmm11, %xmm1 movss 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movss 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movss -24 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movss 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movss 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movss -27 * SIZE(AO), %xmm10 addps %xmm13, %xmm1 movss 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movss 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movss -26 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movss 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movss 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movss -25 * SIZE(AO), %xmm10 addps %xmm15, %xmm1 movss 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movss 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movss -20 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movss 112 * SIZE(BO), %xmm15 addq $ 8 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L92 ALIGN_4 .L95: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L96 ALIGN_4 .L98: addss 
%xmm2, %xmm0 addss %xmm3, %xmm1 movsd 0 * SIZE(CO1), %xmm8 pshufd $0x50, %xmm0, %xmm2 mulps %xmm7, %xmm2 addps %xmm8, %xmm2 movlps %xmm2, 0 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm8 pshufd $0x50, %xmm1, %xmm2 mulps %xmm7, %xmm2 addps %xmm8, %xmm2 movlps %xmm2, 0 * SIZE(CO2) ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leaq (C, LDC, 2), C # c += 4 * ldc ALIGN_4 .L100: testq $1, N je .L999 .L101: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $3, %rax jle .L103 ALIGN_4 .L102: prefetch (RPREFETCHSIZE + 0) * SIZE(B) movups 0 * SIZE(B), %xmm3 movups 4 * SIZE(B), %xmm7 prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L102 ALIGN_4 .L103: movq K, %rax andq $7, %rax BRANCH jle .L110 ALIGN_4 .L104: movss 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 movaps %xmm0, 0 * SIZE(BO) addq $ 1 * SIZE, B addq $ 4 * SIZE, BO decq %rax jne .L104 ALIGN_4 .L110: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a movq M, I sarq $3, I # i = (m >> 3) jle .L120 ALIGN_4 .L111: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(AO), %xmm12 movaps 16 * SIZE(AO), %xmm14 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 prefetchw 15 * SIZE(CO1) pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L115 ALIGN_4 .L112: mulps %xmm9, %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulps -28 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps -24 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 mulps %xmm9, %xmm8 mulps -20 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps 32 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) mulps %xmm9, %xmm10 mulps -12 * SIZE(AO), %xmm9 addps %xmm10, %xmm0 movaps -8 * SIZE(AO), %xmm10 addps %xmm9, %xmm4 movaps 12 * SIZE(BO), %xmm9 mulps %xmm9, %xmm10 mulps -4 * SIZE(AO), %xmm9 addps %xmm10, %xmm0 movaps 48 * SIZE(AO), %xmm10 addps %xmm9, %xmm4 movaps 32 * SIZE(BO), %xmm9 PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) mulps %xmm11, %xmm12 mulps 4 * SIZE(AO), %xmm11 addps %xmm12, %xmm0 movaps 8 * SIZE(AO), %xmm12 addps %xmm11, %xmm4 movaps 20 * SIZE(BO), %xmm11 mulps %xmm11, %xmm12 mulps 12 * SIZE(AO), %xmm11 addps %xmm12, %xmm0 movaps 64 * SIZE(AO), %xmm12 addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) mulps %xmm11, %xmm14 mulps 
20 * SIZE(AO), %xmm11 addps %xmm14, %xmm0 movaps 24 * SIZE(AO), %xmm14 addps %xmm11, %xmm4 movaps 28 * SIZE(BO), %xmm11 mulps %xmm11, %xmm14 mulps 28 * SIZE(AO), %xmm11 addps %xmm14, %xmm0 movaps 80 * SIZE(AO), %xmm14 addps %xmm11, %xmm4 movaps 48 * SIZE(BO), %xmm11 addq $64 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L112 ALIGN_4 .L115: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulps %xmm9, %xmm8 mulps -28 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps -24 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 addq $8 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L116 ALIGN_4 .L118: movups 0 * SIZE(CO1), %xmm8 movups 4 * SIZE(CO1), %xmm9 movups 8 * SIZE(CO1), %xmm10 movups 12 * SIZE(CO1), %xmm11 pshufd $0x50, %xmm0, %xmm2 pshufd $0xfa, %xmm0, %xmm0 pshufd $0x50, %xmm4, %xmm3 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm7, %xmm2 mulps %xmm7, %xmm0 mulps %xmm7, %xmm3 mulps %xmm7, %xmm4 addps %xmm8, %xmm2 addps %xmm9, %xmm0 addps %xmm10, %xmm3 addps %xmm11, %xmm4 movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) movlps %xmm0, 4 * SIZE(CO1) movhps %xmm0, 6 * SIZE(CO1) movlps %xmm3, 8 * SIZE(CO1) movhps %xmm3, 10 * SIZE(CO1) movlps %xmm4, 12 * SIZE(CO1) movhps %xmm4, 14 * SIZE(CO1) addq $16 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L111 ALIGN_4 .L120: testq $4, M je .L130 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L125 ALIGN_4 .L122: mulps %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -28 * SIZE(AO), %xmm8 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 32 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -24 * SIZE(AO), %xmm8 mulps 8 * SIZE(BO), %xmm8 addps %xmm8, %xmm2 movaps -20 * SIZE(AO), %xmm8 mulps 12 * SIZE(BO), %xmm8 addps %xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) mulps %xmm10, %xmm11 movaps -12 * SIZE(AO), %xmm10 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 48 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps -8 * SIZE(AO), %xmm10 mulps 24 * SIZE(BO), %xmm10 addps %xmm10, %xmm2 movaps -4 * SIZE(AO), %xmm10 mulps 28 * SIZE(BO), %xmm10 addps %xmm10, %xmm3 movaps 16 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L122 ALIGN_4 .L125: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L128 ALIGN_4 .L126: mulps %xmm8, %xmm9 movaps -28 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L126 ALIGN_4 .L128: addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm2, %xmm0 movups 0 * SIZE(CO1), %xmm8 movups 4 * SIZE(CO1), %xmm9 pshufd $0x50, %xmm0, %xmm2 pshufd $0xfa, %xmm0, %xmm0 mulps %xmm7, %xmm2 mulps %xmm7, %xmm0 addps %xmm8, %xmm2 addps %xmm9, 
%xmm0 movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) movlps %xmm0, 4 * SIZE(CO1) movhps %xmm0, 6 * SIZE(CO1) addq $8 * SIZE, CO1 # coffset += 4 ALIGN_4 .L130: testq $2, M je .L140 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L135 ALIGN_4 .L132: mulps %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -28 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -26 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -16 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 32 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 movsd -22 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movsd 20 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd -20 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 24 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd -18 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movsd 28 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd -8 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 48 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L132 ALIGN_4 .L135: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L138 ALIGN_4 .L136: mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L136 ALIGN_4 .L138: addps %xmm1, %xmm0 movups 0 * SIZE(CO1), %xmm8 pshufd $0x50, %xmm0, %xmm2 mulps %xmm7, %xmm2 addps %xmm8, %xmm2 movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L140: testq $1, M je .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L145 ALIGN_4 .L142: mulss %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movss -31 * SIZE(AO), %xmm8 mulss 4 * SIZE(BO), %xmm8 addss %xmm9, %xmm0 movss 32 * SIZE(BO), %xmm9 addss %xmm8, %xmm1 movss -30 * SIZE(AO), %xmm8 mulss 8 * SIZE(BO), %xmm8 addss %xmm8, %xmm2 movss -29 * SIZE(AO), %xmm8 mulss 12 * SIZE(BO), %xmm8 addss %xmm8, %xmm3 movss -24 * SIZE(AO), %xmm8 mulss %xmm10, %xmm11 movss -27 * SIZE(AO), %xmm10 mulss 20 
* SIZE(BO), %xmm10 addss %xmm11, %xmm0 movss 48 * SIZE(BO), %xmm11 addss %xmm10, %xmm1 movss -26 * SIZE(AO), %xmm10 mulss 24 * SIZE(BO), %xmm10 addss %xmm10, %xmm2 movss -25 * SIZE(AO), %xmm10 mulss 28 * SIZE(BO), %xmm10 addss %xmm10, %xmm3 movss -20 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L142 ALIGN_4 .L145: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L148 ALIGN_4 .L146: mulss %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addss %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 addq $1 * SIZE, AO addq $4 * SIZE, BO decq %rax jg .L146 ALIGN_4 .L148: addss %xmm1, %xmm0 addss %xmm3, %xmm2 addss %xmm2, %xmm0 movsd 0 * SIZE(CO1), %xmm8 pshufd $0x50, %xmm0, %xmm2 mulps %xmm7, %xmm2 addps %xmm8, %xmm2 movlps %xmm2, 0 * SIZE(CO1) ALIGN_4 .L999: movq %rbx, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm3m_kernel_8x4_core2.S000066400000000000000000001424101313527062700222240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define ALPHA 0(%rsp) #define J 16(%rsp) #define OFFSET 24(%rsp) #define KK 32(%rsp) #define KKK 40(%rsp) #define BUFFER 128(%rsp) #define PREFETCH_R (16 * 16 + 0) #define PREFETCH_W (PREFETCH_R * 2) #define PREFETCHSIZE (16 * 21 + 8) #define PREFETCH prefetcht0 #if defined(OS_LINUX) && defined(CORE_BARCELONA) .align 32768 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif movaps %xmm3, %xmm0 movss OLD_ALPHA_I, %xmm1 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif #endif movq %rsp, %r15 # save old stack subq $256 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movss %xmm0, 0 + ALPHA movss %xmm1, 4 + ALPHA movss %xmm0, 8 + ALPHA movss %xmm1, 12 + ALPHA subq $-32 * SIZE, A subq $-32 * SIZE, B #ifdef TRMMKERNEL movsd %xmm12, OFFSET movsd %xmm12, KK #ifndef LEFT negq KK #endif #endif movq OLD_M, M movq OLD_N, N salq $ZBASE_SHIFT, LDC movq N, J sarq $2, J jle .L50 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq 32 * SIZE + BUFFER, BO movq K, %rax sarq $2, %rax jle .L05 ALIGN_4 .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) movaps -32 * SIZE(B), %xmm3 movaps -28 * SIZE(B), %xmm7 movaps -24 * SIZE(B), %xmm11 movaps -20 * SIZE(B), %xmm15 prefetcht0 (PREFETCH_W + 0) * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 prefetcht0 (PREFETCH_W + 16) * SIZE(BO) pshufd $0x00, %xmm11, %xmm8 pshufd $0x55, %xmm11, %xmm9 pshufd $0xaa, %xmm11, %xmm10 pshufd $0xff, %xmm11, %xmm11 pshufd $0x00, %xmm15, %xmm12 pshufd $0x55, %xmm15, %xmm13 pshufd $0xaa, %xmm15, %xmm14 pshufd $0xff, %xmm15, %xmm15 prefetcht0 (PREFETCH_W + 32) * SIZE(BO) movaps %xmm0, -32 * SIZE(BO) movaps %xmm1, -28 * SIZE(BO) movaps %xmm2, -24 * SIZE(BO) movaps %xmm3, -20 * SIZE(BO) movaps %xmm4, -16 * SIZE(BO) movaps %xmm5, -12 * SIZE(BO) movaps %xmm6, -8 * SIZE(BO) movaps %xmm7, -4 * SIZE(BO) prefetcht0 (PREFETCH_W + 48) * SIZE(BO) movaps %xmm8, 0 * SIZE(BO) movaps %xmm9, 4 * SIZE(BO) movaps %xmm10, 8 * SIZE(BO) movaps %xmm11, 12 * SIZE(BO) movaps %xmm12, 16 * SIZE(BO) 
movaps %xmm13, 20 * SIZE(BO) movaps %xmm14, 24 * SIZE(BO) movaps %xmm15, 28 * SIZE(BO) subq $-16 * SIZE, B subq $-64 * SIZE, BO subq $1, %rax jne .L02 ALIGN_4 .L05: movq K, %rax andq $3, %rax BRANCH jle .L10 ALIGN_4 .L06: movaps -32 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, -32 * SIZE(BO) movaps %xmm1, -28 * SIZE(BO) movaps %xmm2, -24 * SIZE(BO) movaps %xmm3, -20 * SIZE(BO) addq $ 4 * SIZE, B addq $16 * SIZE, BO subq $1, %rax jne .L06 ALIGN_4 .L10: movq B, BB movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a movq M, I sarq $3, I jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 40 * SIZE + BUFFER, BO #else leaq 40 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif pxor %xmm8, %xmm8 movaps -32 * SIZE(AO), %xmm0 pxor %xmm9, %xmm9 movaps -28 * SIZE(AO), %xmm1 pxor %xmm10, %xmm10 movaps -40 * SIZE(BO), %xmm6 pxor %xmm11, %xmm11 movaps -36 * SIZE(BO), %xmm7 prefetcht0 (PREFETCH_R + 0) * SIZE(BB) prefetcht0 15 * SIZE(CO1) pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 prefetcht0 15 * SIZE(CO2) pxor %xmm14, %xmm14 pxor %xmm15, %xmm15 prefetcht0 15 * SIZE(CO1, LDC, 2) pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 prefetcht0 15 * SIZE(CO2, LDC, 2) pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 subq $-8 * SIZE, BB #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L15 ALIGN_4 .L12: addps %xmm2, %xmm10 movaps -32 * SIZE(BO), %xmm2 addps %xmm3, %xmm14 PADDING; movaps %xmm6, %xmm3 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 movaps -28 * SIZE(BO), %xmm4 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm6, %xmm8 movaps -24 * SIZE(BO), %xmm6 addps %xmm3, %xmm12 movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 movaps -20 * SIZE(BO), %xmm7 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -20 * SIZE(AO), %xmm1 addps %xmm2, %xmm10 movaps -16 * SIZE(BO), %xmm2 addps %xmm3, %xmm14 movaps %xmm6, %xmm3 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 movaps -12 * SIZE(BO), %xmm4 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm6, %xmm8 movaps -8 * SIZE(BO), %xmm6 addps %xmm3, %xmm12 movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 movaps -4 * SIZE(BO), %xmm7 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -12 * SIZE(AO), %xmm1 addps %xmm2, %xmm10 movaps 0 * SIZE(BO), %xmm2 addps %xmm3, %xmm14 PADDING; movaps %xmm6, %xmm3 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 movaps 4 * SIZE(BO), %xmm4 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm6, %xmm8 movaps 8 * SIZE(BO), %xmm6 addps %xmm3, %xmm12 movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 movaps 12 * SIZE(BO), %xmm7 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -8 * SIZE(AO), %xmm0 mulps 
%xmm1, %xmm5 movaps -4 * SIZE(AO), %xmm1 addps %xmm2, %xmm10 movaps 16 * SIZE(BO), %xmm2 addps %xmm3, %xmm14 movaps %xmm6, %xmm3 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 movaps 20 * SIZE(BO), %xmm4 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm6, %xmm8 movaps 24 * SIZE(BO), %xmm6 addps %xmm3, %xmm12 movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 subq $-32 * SIZE, AO addps %xmm7, %xmm9 movaps 28 * SIZE(BO), %xmm7 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -32 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -28 * SIZE(AO), %xmm1 subq $-64 * SIZE, BO subq $1, %rax BRANCH jg .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: addps %xmm2, %xmm10 movaps -32 * SIZE(BO), %xmm2 addps %xmm3, %xmm14 movaps %xmm6, %xmm3 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 movaps -28 * SIZE(BO), %xmm4 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm6, %xmm8 movaps -24 * SIZE(BO), %xmm6 addps %xmm3, %xmm12 addq $8 * SIZE, AO movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 movaps -20 * SIZE(BO), %xmm7 addps %xmm5, %xmm13 addq $16 * SIZE, BO movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -32 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -28 * SIZE(AO), %xmm1 subq $1, %rax BRANCH jg .L16 ALIGN_4 .L18: movaps ALPHA, %xmm7 addps %xmm2, %xmm10 addps %xmm3, %xmm14 addps %xmm4, %xmm11 addps %xmm5, %xmm15 movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 movsd 8 * SIZE(CO1), %xmm2 movhps 10 * SIZE(CO1), %xmm2 movsd 12 * SIZE(CO1), %xmm3 movhps 14 * SIZE(CO1), %xmm3 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 pshufd $0x50, %xmm12, %xmm5 pshufd $0xfa, %xmm12, %xmm12 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 mulps %xmm7, %xmm5 mulps %xmm7, %xmm12 addps %xmm0, %xmm4 addps %xmm1, %xmm8 addps %xmm2, %xmm5 addps %xmm3, %xmm12 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movlps %xmm8, 4 * SIZE(CO1) movhps %xmm8, 6 * SIZE(CO1) movlps %xmm5, 8 * SIZE(CO1) movhps %xmm5, 10 * SIZE(CO1) movlps %xmm12, 12 * SIZE(CO1) movhps %xmm12, 14 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhps 2 * SIZE(CO2), %xmm0 movsd 4 * SIZE(CO2), %xmm1 movhps 6 * SIZE(CO2), %xmm1 movsd 8 * SIZE(CO2), %xmm2 movhps 10 * SIZE(CO2), %xmm2 movsd 12 * SIZE(CO2), %xmm3 movhps 14 * SIZE(CO2), %xmm3 pshufd $0x50, %xmm9, %xmm4 pshufd $0xfa, %xmm9, %xmm9 pshufd $0x50, %xmm13, %xmm5 pshufd $0xfa, %xmm13, %xmm13 mulps %xmm7, %xmm4 mulps %xmm7, %xmm9 mulps %xmm7, %xmm5 mulps %xmm7, %xmm13 addps %xmm0, %xmm4 addps %xmm1, %xmm9 addps %xmm2, %xmm5 addps %xmm3, %xmm13 movlps %xmm4, 0 * SIZE(CO2) movhps %xmm4, 2 * SIZE(CO2) movlps %xmm9, 4 * SIZE(CO2) movhps %xmm9, 6 * SIZE(CO2) movlps %xmm5, 8 * SIZE(CO2) movhps %xmm5, 10 * SIZE(CO2) movlps %xmm13, 12 * SIZE(CO2) movhps %xmm13, 14 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhps 2 * SIZE(CO1, LDC, 2), %xmm0 movsd 4 * SIZE(CO1, LDC, 2), %xmm1 movhps 6 * SIZE(CO1, LDC, 2), %xmm1 movsd 8 * SIZE(CO1, LDC, 2), %xmm2 movhps 10 * SIZE(CO1, LDC, 2), %xmm2 movsd 12 * SIZE(CO1, LDC, 2), %xmm3 movhps 14 * SIZE(CO1, LDC, 2), %xmm3 pshufd $0x50, %xmm10, %xmm4 pshufd $0xfa, %xmm10, %xmm10 pshufd $0x50, %xmm14, %xmm5 pshufd $0xfa, %xmm14, %xmm14 mulps %xmm7, %xmm4 mulps %xmm7, %xmm10 mulps %xmm7, %xmm5 mulps %xmm7, %xmm14 addps %xmm0, %xmm4 addps %xmm1, %xmm10 addps %xmm2, %xmm5 addps %xmm3, %xmm14 movlps %xmm4, 0 
* SIZE(CO1, LDC, 2) movhps %xmm4, 2 * SIZE(CO1, LDC, 2) movlps %xmm10, 4 * SIZE(CO1, LDC, 2) movhps %xmm10, 6 * SIZE(CO1, LDC, 2) movlps %xmm5, 8 * SIZE(CO1, LDC, 2) movhps %xmm5, 10 * SIZE(CO1, LDC, 2) movlps %xmm14, 12 * SIZE(CO1, LDC, 2) movhps %xmm14, 14 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhps 2 * SIZE(CO2, LDC, 2), %xmm0 movsd 4 * SIZE(CO2, LDC, 2), %xmm1 movhps 6 * SIZE(CO2, LDC, 2), %xmm1 movsd 8 * SIZE(CO2, LDC, 2), %xmm2 movhps 10 * SIZE(CO2, LDC, 2), %xmm2 movsd 12 * SIZE(CO2, LDC, 2), %xmm3 movhps 14 * SIZE(CO2, LDC, 2), %xmm3 pshufd $0x50, %xmm11, %xmm4 pshufd $0xfa, %xmm11, %xmm11 pshufd $0x50, %xmm15, %xmm5 pshufd $0xfa, %xmm15, %xmm15 mulps %xmm7, %xmm4 mulps %xmm7, %xmm11 mulps %xmm7, %xmm5 mulps %xmm7, %xmm15 addps %xmm0, %xmm4 addps %xmm1, %xmm11 addps %xmm2, %xmm5 addps %xmm3, %xmm15 movlps %xmm4, 0 * SIZE(CO2, LDC, 2) movhps %xmm4, 2 * SIZE(CO2, LDC, 2) movlps %xmm11, 4 * SIZE(CO2, LDC, 2) movhps %xmm11, 6 * SIZE(CO2, LDC, 2) movlps %xmm5, 8 * SIZE(CO2, LDC, 2) movhps %xmm5, 10 * SIZE(CO2, LDC, 2) movlps %xmm15, 12 * SIZE(CO2, LDC, 2) movhps %xmm15, 14 * SIZE(CO2, LDC, 2) addq $16 * SIZE, CO1 addq $16 * SIZE, CO2 subq $1, I jg .L11 ALIGN_4 .L20: testq $4, M jle .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L25 ALIGN_4 .L21: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm2 movaps -28 * SIZE(BO), %xmm3 movaps -24 * SIZE(BO), %xmm4 movaps -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 movaps -28 * SIZE(AO), %xmm0 movaps -16 * SIZE(BO), %xmm2 movaps -12 * SIZE(BO), %xmm3 movaps -8 * SIZE(BO), %xmm4 movaps -4 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 movaps -24 * SIZE(AO), %xmm0 movaps 0 * SIZE(BO), %xmm2 movaps 4 * SIZE(BO), %xmm3 movaps 8 * SIZE(BO), %xmm4 movaps 12 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 movaps -20 * SIZE(AO), %xmm0 movaps 16 * SIZE(BO), %xmm2 movaps 20 * SIZE(BO), %xmm3 movaps 24 * SIZE(BO), %xmm4 movaps 28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 subq $-16 * SIZE, AO subq $-64 * SIZE, BO subq $1, %rax jg .L21 ALIGN_4 .L25: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L28 ALIGN_4 .L26: movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm2 movaps -28 * SIZE(BO), %xmm3 movaps -24 * SIZE(BO), %xmm4 movaps -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps 
%xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 addq $ 4 * SIZE, AO addq $16 * SIZE, BO subq $1, %rax jg .L26 ALIGN_4 .L28: movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 addps %xmm0, %xmm4 addps %xmm1, %xmm8 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movlps %xmm8, 4 * SIZE(CO1) movhps %xmm8, 6 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhps 2 * SIZE(CO2), %xmm0 movsd 4 * SIZE(CO2), %xmm1 movhps 6 * SIZE(CO2), %xmm1 pshufd $0x50, %xmm9, %xmm4 pshufd $0xfa, %xmm9, %xmm9 mulps %xmm7, %xmm4 mulps %xmm7, %xmm9 addps %xmm0, %xmm4 addps %xmm1, %xmm9 movlps %xmm4, 0 * SIZE(CO2) movhps %xmm4, 2 * SIZE(CO2) movlps %xmm9, 4 * SIZE(CO2) movhps %xmm9, 6 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhps 2 * SIZE(CO1, LDC, 2), %xmm0 movsd 4 * SIZE(CO1, LDC, 2), %xmm1 movhps 6 * SIZE(CO1, LDC, 2), %xmm1 pshufd $0x50, %xmm10, %xmm4 pshufd $0xfa, %xmm10, %xmm10 mulps %xmm7, %xmm4 mulps %xmm7, %xmm10 addps %xmm0, %xmm4 addps %xmm1, %xmm10 movlps %xmm4, 0 * SIZE(CO1, LDC, 2) movhps %xmm4, 2 * SIZE(CO1, LDC, 2) movlps %xmm10, 4 * SIZE(CO1, LDC, 2) movhps %xmm10, 6 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhps 2 * SIZE(CO2, LDC, 2), %xmm0 movsd 4 * SIZE(CO2, LDC, 2), %xmm1 movhps 6 * SIZE(CO2, LDC, 2), %xmm1 pshufd $0x50, %xmm11, %xmm4 pshufd $0xfa, %xmm11, %xmm11 mulps %xmm7, %xmm4 mulps %xmm7, %xmm11 addps %xmm0, %xmm4 addps %xmm1, %xmm11 movlps %xmm4, 0 * SIZE(CO2, LDC, 2) movhps %xmm4, 2 * SIZE(CO2, LDC, 2) movlps %xmm11, 4 * SIZE(CO2, LDC, 2) movhps %xmm11, 6 * SIZE(CO2, LDC, 2) addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 ALIGN_4 .L30: testq $2, M jle .L40 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 pxor %xmm14, %xmm14 pxor %xmm15, %xmm15 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L35 ALIGN_4 .L31: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -32 * SIZE(AO), %xmm0 movsd -32 * SIZE(BO), %xmm2 movsd -28 * SIZE(BO), %xmm3 movsd -24 * SIZE(BO), %xmm4 movsd -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 movsd -30 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 movsd -12 * SIZE(BO), %xmm3 movsd -8 * SIZE(BO), %xmm4 movsd -4 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 movsd -28 * SIZE(AO), %xmm0 movsd 0 * SIZE(BO), %xmm2 movsd 4 * SIZE(BO), %xmm3 movsd 8 * SIZE(BO), %xmm4 movsd 12 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 movsd -26 * SIZE(AO), %xmm0 movsd 16 * SIZE(BO), %xmm2 movsd 20 * SIZE(BO), %xmm3 movsd 24 * SIZE(BO), %xmm4 movsd 28 * 
SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 subq $ -8 * SIZE, AO subq $-64 * SIZE, BO subq $1, %rax jg .L31 ALIGN_4 .L35: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L38 ALIGN_4 .L36: movsd -32 * SIZE(AO), %xmm0 movsd -32 * SIZE(BO), %xmm2 movsd -28 * SIZE(BO), %xmm3 movsd -24 * SIZE(BO), %xmm4 movsd -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 addq $ 2 * SIZE, AO addq $16 * SIZE, BO subq $1, %rax jg .L36 ALIGN_4 .L38: movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 pshufd $0x50, %xmm8, %xmm4 mulps %xmm7, %xmm4 addps %xmm0, %xmm4 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhps 2 * SIZE(CO2), %xmm0 pshufd $0x50, %xmm9, %xmm4 mulps %xmm7, %xmm4 addps %xmm0, %xmm4 movlps %xmm4, 0 * SIZE(CO2) movhps %xmm4, 2 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhps 2 * SIZE(CO1, LDC, 2), %xmm0 pshufd $0x50, %xmm10, %xmm4 mulps %xmm7, %xmm4 addps %xmm0, %xmm4 movlps %xmm4, 0 * SIZE(CO1, LDC, 2) movhps %xmm4, 2 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhps 2 * SIZE(CO2, LDC, 2), %xmm0 pshufd $0x50, %xmm11, %xmm4 mulps %xmm7, %xmm4 addps %xmm0, %xmm4 movlps %xmm4, 0 * SIZE(CO2, LDC, 2) movhps %xmm4, 2 * SIZE(CO2, LDC, 2) addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 ALIGN_4 .L40: testq $1, M jle .L49 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO leaq (BO, %rax, 8), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 pxor %xmm14, %xmm14 pxor %xmm15, %xmm15 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L45 ALIGN_4 .L41: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movss -32 * SIZE(AO), %xmm0 movss -32 * SIZE(BO), %xmm2 movss -28 * SIZE(BO), %xmm3 movss -24 * SIZE(BO), %xmm4 movss -20 * SIZE(BO), %xmm5 mulss %xmm0, %xmm2 mulss %xmm0, %xmm3 mulss %xmm0, %xmm4 mulss %xmm0, %xmm5 addss %xmm2, %xmm8 addss %xmm3, %xmm9 addss %xmm4, %xmm10 addss %xmm5, %xmm11 movss -31 * SIZE(AO), %xmm0 movss -16 * SIZE(BO), %xmm2 movss -12 * SIZE(BO), %xmm3 movss -8 * SIZE(BO), %xmm4 movss -4 * SIZE(BO), %xmm5 mulss %xmm0, %xmm2 mulss %xmm0, %xmm3 mulss %xmm0, %xmm4 mulss %xmm0, %xmm5 addss %xmm2, %xmm8 addss %xmm3, %xmm9 addss %xmm4, %xmm10 addss %xmm5, %xmm11 movss -30 * SIZE(AO), %xmm0 movss 0 * SIZE(BO), %xmm2 movss 4 * SIZE(BO), %xmm3 movss 8 * SIZE(BO), %xmm4 movss 12 * SIZE(BO), %xmm5 mulss %xmm0, %xmm2 mulss %xmm0, %xmm3 mulss %xmm0, %xmm4 mulss %xmm0, %xmm5 addss %xmm2, %xmm8 addss %xmm3, %xmm9 addss %xmm4, %xmm10 addss %xmm5, %xmm11 movss -29 * SIZE(AO), %xmm0 movss 16 * SIZE(BO), %xmm2 movss 20 * SIZE(BO), %xmm3 movss 24 * SIZE(BO), %xmm4 movss 28 * SIZE(BO), %xmm5 mulss %xmm0, %xmm2 mulss %xmm0, %xmm3 mulss %xmm0, %xmm4 mulss %xmm0, %xmm5 addss %xmm2, %xmm8 addss %xmm3, %xmm9 addss %xmm4, %xmm10 addss 
%xmm5, %xmm11 subq $ -4 * SIZE, AO subq $-64 * SIZE, BO subq $1, %rax jg .L41 ALIGN_4 .L45: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L48 ALIGN_4 .L46: movss -32 * SIZE(AO), %xmm0 movss -32 * SIZE(BO), %xmm2 movss -28 * SIZE(BO), %xmm3 movss -24 * SIZE(BO), %xmm4 movss -20 * SIZE(BO), %xmm5 mulss %xmm0, %xmm2 mulss %xmm0, %xmm3 mulss %xmm0, %xmm4 mulss %xmm0, %xmm5 addss %xmm2, %xmm8 addss %xmm3, %xmm9 addss %xmm4, %xmm10 addss %xmm5, %xmm11 addq $ 1 * SIZE, AO addq $16 * SIZE, BO subq $1, %rax jg .L46 ALIGN_4 .L48: movsd 0 * SIZE(CO1), %xmm0 pshufd $0x50, %xmm8, %xmm4 mulps %xmm7, %xmm4 addps %xmm0, %xmm4 movlps %xmm4, 0 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 pshufd $0x50, %xmm9, %xmm4 mulps %xmm7, %xmm4 addps %xmm0, %xmm4 movlps %xmm4, 0 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 pshufd $0x50, %xmm10, %xmm4 mulps %xmm7, %xmm4 addps %xmm0, %xmm4 movlps %xmm4, 0 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm0 pshufd $0x50, %xmm11, %xmm4 mulps %xmm7, %xmm4 addps %xmm0, %xmm4 movlps %xmm4, 0 * SIZE(CO2, LDC, 2) ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leaq (C, LDC, 4), C subq $1, J jg .L01 ALIGN_4 .L50: testq $2, N jle .L100 ALIGN_4 .L51: /* Copying to Sub Buffer */ leaq BUFFER, BO #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq K, %rax sarq $3, %rax jle .L53 addq %rax, %rax ALIGN_4 .L52: movaps -32 * SIZE(B), %xmm3 movaps -28 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 prefetcht0 (PREFETCH_W + 0) * SIZE(BO) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 prefetcht0 (PREFETCH_W + 16) * SIZE(BO) movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO subq $1, %rax jne .L52 ALIGN_4 .L53: movq K, %rax andq $7, %rax BRANCH jle .L55 ALIGN_4 .L54: movss -32 * SIZE(B), %xmm8 movss -31 * SIZE(B), %xmm9 shufps $0, %xmm8, %xmm8 shufps $0, %xmm9, %xmm9 movaps %xmm8, 0 * SIZE(BO) movaps %xmm9, 4 * SIZE(BO) addq $2 * SIZE, B addq $8 * SIZE, BO subq $1, %rax jne .L54 ALIGN_4 .L55: movq C, CO1 leaq (C, LDC, 1), CO2 movq A, AO # aoffset = a movq M, I sarq $3, I jle .L70 ALIGN_4 .L60: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 prefetcht0 15 * SIZE(CO1) pxor %xmm12, %xmm12 prefetcht0 15 * SIZE(CO2) pxor %xmm13, %xmm13 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L65 ALIGN_4 .L61: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps -32 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 movaps -28 * SIZE(BO), %xmm4 movaps %xmm4, %xmm5 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 mulps %xmm0, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm12 addps %xmm4, %xmm9 addps 
%xmm5, %xmm13 movaps -24 * SIZE(AO), %xmm0 movaps -20 * SIZE(AO), %xmm1 movaps -24 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 movaps -20 * SIZE(BO), %xmm4 movaps %xmm4, %xmm5 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 mulps %xmm0, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm12 addps %xmm4, %xmm9 addps %xmm5, %xmm13 movaps -16 * SIZE(AO), %xmm0 movaps -12 * SIZE(AO), %xmm1 movaps -16 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 movaps -12 * SIZE(BO), %xmm4 movaps %xmm4, %xmm5 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 mulps %xmm0, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm12 addps %xmm4, %xmm9 addps %xmm5, %xmm13 movaps -8 * SIZE(AO), %xmm0 movaps -4 * SIZE(AO), %xmm1 movaps -8 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 movaps -4 * SIZE(BO), %xmm4 movaps %xmm4, %xmm5 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 mulps %xmm0, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm12 addps %xmm4, %xmm9 addps %xmm5, %xmm13 subq $-32 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax jg .L61 ALIGN_4 .L65: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L68 ALIGN_4 .L66: movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps -32 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 movaps -28 * SIZE(BO), %xmm4 movaps %xmm4, %xmm5 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 mulps %xmm0, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm12 addps %xmm4, %xmm9 addps %xmm5, %xmm13 addq $8 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax jg .L66 ALIGN_4 .L68: movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 movsd 8 * SIZE(CO1), %xmm2 movhps 10 * SIZE(CO1), %xmm2 movsd 12 * SIZE(CO1), %xmm3 movhps 14 * SIZE(CO1), %xmm3 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 pshufd $0x50, %xmm12, %xmm5 pshufd $0xfa, %xmm12, %xmm12 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 mulps %xmm7, %xmm5 mulps %xmm7, %xmm12 addps %xmm0, %xmm4 addps %xmm1, %xmm8 addps %xmm2, %xmm5 addps %xmm3, %xmm12 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movlps %xmm8, 4 * SIZE(CO1) movhps %xmm8, 6 * SIZE(CO1) movlps %xmm5, 8 * SIZE(CO1) movhps %xmm5, 10 * SIZE(CO1) movlps %xmm12, 12 * SIZE(CO1) movhps %xmm12, 14 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhps 2 * SIZE(CO2), %xmm0 movsd 4 * SIZE(CO2), %xmm1 movhps 6 * SIZE(CO2), %xmm1 movsd 8 * SIZE(CO2), %xmm2 movhps 10 * SIZE(CO2), %xmm2 movsd 12 * SIZE(CO2), %xmm3 movhps 14 * SIZE(CO2), %xmm3 pshufd $0x50, %xmm9, %xmm4 pshufd $0xfa, %xmm9, %xmm9 pshufd $0x50, %xmm13, %xmm5 pshufd $0xfa, %xmm13, %xmm13 mulps %xmm7, %xmm4 mulps %xmm7, %xmm9 mulps %xmm7, %xmm5 mulps %xmm7, %xmm13 addps %xmm0, %xmm4 addps %xmm1, %xmm9 addps %xmm2, %xmm5 addps %xmm3, %xmm13 movlps %xmm4, 0 * SIZE(CO2) movhps %xmm4, 2 * SIZE(CO2) movlps %xmm9, 4 * SIZE(CO2) movhps %xmm9, 6 * SIZE(CO2) movlps %xmm5, 8 * SIZE(CO2) movhps %xmm5, 10 * SIZE(CO2) movlps %xmm13, 12 * SIZE(CO2) movhps %xmm13, 14 * SIZE(CO2) addq $16 * SIZE, CO1 addq $16 * SIZE, CO2 subq $1, I jg .L60 ALIGN_4 .L70: testq $4, M jle .L80 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 
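/* Editor's annotation (not in the upstream source): with TRMMKERNEL defined, the
   number of k iterations that actually contribute to this 4x2 tile is K - KK in
   this branch; the #else branch below instead uses KK plus the tile dimension
   (4 when LEFT is defined, otherwise 2). Either way the count is cached in KKK
   and reloaded for the remainder loop at .L75. */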
movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L75 ALIGN_4 .L71: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps -32 * SIZE(BO), %xmm2 movaps -28 * SIZE(BO), %xmm3 movaps -24 * SIZE(BO), %xmm4 movaps -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 movaps -24 * SIZE(AO), %xmm0 movaps -20 * SIZE(AO), %xmm1 movaps -16 * SIZE(BO), %xmm2 movaps -12 * SIZE(BO), %xmm3 movaps -8 * SIZE(BO), %xmm4 movaps -4 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 subq $-16 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax jg .L71 ALIGN_4 .L75: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L78 ALIGN_4 .L76: movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm2 movaps -28 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addq $4 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax jg .L76 ALIGN_4 .L78: addps %xmm10, %xmm8 addps %xmm11, %xmm9 movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 addps %xmm0, %xmm4 addps %xmm1, %xmm8 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movlps %xmm8, 4 * SIZE(CO1) movhps %xmm8, 6 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhps 2 * SIZE(CO2), %xmm0 movsd 4 * SIZE(CO2), %xmm1 movhps 6 * SIZE(CO2), %xmm1 pshufd $0x50, %xmm9, %xmm4 pshufd $0xfa, %xmm9, %xmm9 mulps %xmm7, %xmm4 mulps %xmm7, %xmm9 addps %xmm0, %xmm4 addps %xmm1, %xmm9 movlps %xmm4, 0 * SIZE(CO2) movhps %xmm4, 2 * SIZE(CO2) movlps %xmm9, 4 * SIZE(CO2) movhps %xmm9, 6 * SIZE(CO2) addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 ALIGN_4 .L80: testq $2, M jle .L90 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L85 ALIGN_4 .L81: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -32 * SIZE(AO), %xmm0 movsd -30 * SIZE(AO), %xmm1 movsd -32 * SIZE(BO), %xmm2 movsd -28 * SIZE(BO), %xmm3 movsd -24 * SIZE(BO), %xmm4 movsd -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 movsd -28 * SIZE(AO), %xmm0 movsd -26 * SIZE(AO), %xmm1 movsd -16 * SIZE(BO), %xmm2 movsd -12 * SIZE(BO), %xmm3 movsd -8 * SIZE(BO), %xmm4 movsd -4 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addps %xmm4, %xmm10 addps %xmm5, %xmm11 subq $ -8 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax jg .L81 ALIGN_4 
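/* Editor's annotation (not in the upstream source): .L85/.L86 drain the K mod 4
   leftover iterations of this 2x2 tile, accumulating into xmm8 (column CO1) and
   xmm9 (column CO2). The .L88 write-back then folds the partial sums pairwise
   (xmm10 into xmm8, xmm11 into xmm9), duplicates each real dot product across the
   interleaved real/imaginary lanes with pshufd $0x50, scales by the packed
   (alpha_r, alpha_i) pair kept in ALPHA, and accumulates into C. */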
.L85: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L88 ALIGN_4 .L86: movsd -32 * SIZE(AO), %xmm0 movsd -32 * SIZE(BO), %xmm2 movsd -28 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 addps %xmm2, %xmm8 addps %xmm3, %xmm9 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax jg .L86 ALIGN_4 .L88: addps %xmm10, %xmm8 addps %xmm11, %xmm9 movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 pshufd $0x50, %xmm8, %xmm4 mulps %xmm7, %xmm4 addps %xmm0, %xmm4 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhps 2 * SIZE(CO2), %xmm0 pshufd $0x50, %xmm9, %xmm4 mulps %xmm7, %xmm4 addps %xmm0, %xmm4 movlps %xmm4, 0 * SIZE(CO2) movhps %xmm4, 2 * SIZE(CO2) addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 ALIGN_4 .L90: testq $1, M jle .L99 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L95 ALIGN_4 .L91: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movss -32 * SIZE(AO), %xmm0 movss -31 * SIZE(AO), %xmm1 movss -32 * SIZE(BO), %xmm2 movss -28 * SIZE(BO), %xmm3 movss -24 * SIZE(BO), %xmm4 movss -20 * SIZE(BO), %xmm5 mulss %xmm0, %xmm2 mulss %xmm0, %xmm3 mulss %xmm1, %xmm4 mulss %xmm1, %xmm5 addss %xmm2, %xmm8 addss %xmm3, %xmm9 addss %xmm4, %xmm10 addss %xmm5, %xmm11 movss -30 * SIZE(AO), %xmm0 movss -29 * SIZE(AO), %xmm1 movss -16 * SIZE(BO), %xmm2 movss -12 * SIZE(BO), %xmm3 movss -8 * SIZE(BO), %xmm4 movss -4 * SIZE(BO), %xmm5 mulss %xmm0, %xmm2 mulss %xmm0, %xmm3 mulss %xmm1, %xmm4 mulss %xmm1, %xmm5 addss %xmm2, %xmm8 addss %xmm3, %xmm9 addss %xmm4, %xmm10 addss %xmm5, %xmm11 subq $ -4 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax jg .L91 ALIGN_4 .L95: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L98 ALIGN_4 .L96: movss -32 * SIZE(AO), %xmm0 movss -32 * SIZE(BO), %xmm2 movss -28 * SIZE(BO), %xmm3 mulss %xmm0, %xmm2 mulss %xmm0, %xmm3 addss %xmm2, %xmm8 addss %xmm3, %xmm9 addq $1 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax jg .L96 ALIGN_4 .L98: addss %xmm10, %xmm8 addss %xmm11, %xmm9 movsd 0 * SIZE(CO1), %xmm0 pshufd $0x50, %xmm8, %xmm4 mulps %xmm7, %xmm4 addps %xmm0, %xmm4 movlps %xmm4, 0 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 pshufd $0x50, %xmm9, %xmm4 mulps %xmm7, %xmm4 addps %xmm0, %xmm4 movlps %xmm4, 0 * SIZE(CO2) ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leaq (C, LDC, 2), C ALIGN_4 .L100: testq $1, N jle .L999 ALIGN_4 .L101: /* Copying to Sub Buffer */ leaq BUFFER, BO #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq K, %rax sarq $4, %rax jle .L103 addq %rax, %rax ALIGN_4 .L102: movss -32 * SIZE(B), %xmm0 movss -31 * SIZE(B), %xmm1 movss -30 * SIZE(B), %xmm2 movss -29 * SIZE(B), %xmm3 movss -28 * SIZE(B), %xmm4 movss -27 * SIZE(B), %xmm5 movss -26 * SIZE(B), %xmm6 movss -25 * SIZE(B), %xmm7 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, 
%xmm3 shufps $0, %xmm4, %xmm4 shufps $0, %xmm5, %xmm5 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B subq $-32 * SIZE, BO subq $1, %rax jne .L102 ALIGN_4 .L103: movq K, %rax andq $15, %rax BRANCH jle .L105 ALIGN_4 .L104: movss -32 * SIZE(B), %xmm8 shufps $0, %xmm8, %xmm8 movaps %xmm8, 0 * SIZE(BO) addq $1 * SIZE, B addq $4 * SIZE, BO subq $1, %rax jne .L104 ALIGN_4 .L105: movq C, CO1 movq A, AO movq M, I sarq $3, I jle .L120 ALIGN_4 .L110: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 prefetcht0 15 * SIZE(CO1) pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L115 ALIGN_4 .L111: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps -32 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm2, %xmm8 addps %xmm3, %xmm12 movaps -24 * SIZE(AO), %xmm0 movaps -20 * SIZE(AO), %xmm1 movaps -28 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm2, %xmm9 addps %xmm3, %xmm13 movaps -16 * SIZE(AO), %xmm0 movaps -12 * SIZE(AO), %xmm1 movaps -24 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm2, %xmm8 addps %xmm3, %xmm12 movaps -8 * SIZE(AO), %xmm0 movaps -4 * SIZE(AO), %xmm1 movaps -20 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm2, %xmm9 addps %xmm3, %xmm13 subq $-32 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jg .L111 ALIGN_4 .L115: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L118 ALIGN_4 .L116: movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps -32 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm2, %xmm8 addps %xmm3, %xmm12 addq $8 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L116 ALIGN_4 .L118: addps %xmm9, %xmm8 addps %xmm13, %xmm12 movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 movsd 8 * SIZE(CO1), %xmm2 movhps 10 * SIZE(CO1), %xmm2 movsd 12 * SIZE(CO1), %xmm3 movhps 14 * SIZE(CO1), %xmm3 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 pshufd $0x50, %xmm12, %xmm5 pshufd $0xfa, %xmm12, %xmm12 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 mulps %xmm7, %xmm5 mulps %xmm7, %xmm12 addps %xmm0, %xmm4 addps %xmm1, %xmm8 addps %xmm2, %xmm5 addps %xmm3, %xmm12 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movlps %xmm8, 4 * SIZE(CO1) movhps %xmm8, 6 * SIZE(CO1) movlps %xmm5, 8 * SIZE(CO1) movhps %xmm5, 10 * SIZE(CO1) movlps %xmm12, 12 * SIZE(CO1) movhps %xmm12, 14 * SIZE(CO1) addq $16 * SIZE, CO1 subq $1, I jg .L110 ALIGN_4 .L120: testq $4, M jle .L130 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L125 ALIGN_4 .L121: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps -32 * SIZE(BO), %xmm2 movaps -28 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm2, %xmm8 addps %xmm3, %xmm9 movaps -24 * SIZE(AO), %xmm0 movaps -20 * SIZE(AO), %xmm1 movaps -24 * SIZE(BO), %xmm2 movaps -20 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm2, %xmm10 addps %xmm3, %xmm11 subq $-16 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jg .L121 ALIGN_4 .L125: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L128 ALIGN_4 .L126: movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm8 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L126 ALIGN_4 .L128: addps %xmm10, %xmm8 addps %xmm11, %xmm9 addps %xmm9, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 addps %xmm0, %xmm4 addps %xmm1, %xmm8 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movlps %xmm8, 4 * SIZE(CO1) movhps %xmm8, 6 * SIZE(CO1) addq $8 * SIZE, CO1 ALIGN_4 .L130: testq $2, M jle .L140 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L135 ALIGN_4 .L131: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -32 * SIZE(AO), %xmm0 movsd -30 * SIZE(AO), %xmm1 movsd -32 * SIZE(BO), %xmm2 movsd -28 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm2, %xmm8 addps %xmm3, %xmm9 movsd -28 * SIZE(AO), %xmm0 movsd -26 * SIZE(AO), %xmm1 movsd -24 * SIZE(BO), %xmm2 movsd -20 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm2, %xmm10 addps %xmm3, %xmm11 subq $ -8 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jg .L131 ALIGN_4 .L135: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L138 ALIGN_4 .L136: movsd -32 * SIZE(AO), %xmm0 movsd -32 * SIZE(BO), %xmm2 mulps %xmm0, %xmm2 addps %xmm2, %xmm8 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L136 ALIGN_4 .L138: addps %xmm10, %xmm8 addps %xmm11, %xmm9 addps %xmm9, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 pshufd $0x50, %xmm8, %xmm4 mulps %xmm7, %xmm4 addps %xmm0, %xmm4 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * 
SIZE(CO1) addq $4 * SIZE, CO1 ALIGN_4 .L140: testq $1, M jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L145 ALIGN_4 .L141: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movss -32 * SIZE(AO), %xmm0 movss -31 * SIZE(AO), %xmm1 movss -32 * SIZE(BO), %xmm2 movss -28 * SIZE(BO), %xmm3 mulss %xmm0, %xmm2 mulss %xmm1, %xmm3 addss %xmm2, %xmm8 addss %xmm3, %xmm9 movss -30 * SIZE(AO), %xmm0 movss -29 * SIZE(AO), %xmm1 movss -24 * SIZE(BO), %xmm2 movss -20 * SIZE(BO), %xmm3 mulss %xmm0, %xmm2 mulss %xmm1, %xmm3 addss %xmm2, %xmm10 addss %xmm3, %xmm11 subq $ -4 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jg .L141 ALIGN_4 .L145: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L148 ALIGN_4 .L146: movss -32 * SIZE(AO), %xmm0 movss -32 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 addss %xmm2, %xmm8 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L146 ALIGN_4 .L148: addss %xmm10, %xmm8 addss %xmm11, %xmm9 addss %xmm9, %xmm8 movsd 0 * SIZE(CO1), %xmm0 pshufd $0x50, %xmm8, %xmm4 mulps %xmm7, %xmm4 addps %xmm0, %xmm4 movlps %xmm4, 0 * SIZE(CO1) ALIGN_4 .L999: movq %r15, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm3m_kernel_8x4_penryn.S000066400000000000000000001367101313527062700225330ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define BB %r12 #define PREA %rdx #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define ALPHA_R 48(%rsp) #define ALPHA_I 56(%rsp) #define J 64(%rsp) #define OFFSET 72(%rsp) #define KK 80(%rsp) #define KKK 88(%rsp) #else #define STACKSIZE 512 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define ALPHA_R 224(%rsp) #define ALPHA_I 232(%rsp) #define J 240(%rsp) #define OFFSET 248(%rsp) #define KK 256(%rsp) #define KKK 264(%rsp) #endif #define PREFETCHSIZE (8 * 17 + 4) #define PREFETCH prefetcht0 #if defined(OS_LINUX) && defined(CORE_BARCELONA) .align 32768 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif movaps %xmm3, %xmm0 movss OLD_ALPHA_I, %xmm1 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif #endif unpcklps %xmm1, %xmm0 movlps %xmm0, ALPHA_R movlps %xmm0, ALPHA_I subq $-32 * SIZE, A subq $-32 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K salq $ZBASE_SHIFT, LDC movq N, J sarq $2, J NOBRANCH jle .L50 ALIGN_4 .L10: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC, 1), CO2 movq A, AO movq K, %rax salq $BASE_SHIFT + 2, %rax leaq (B, %rax), BB movq M, I sarq $3, I NOBRANCH jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm0 xorpd %xmm3, %xmm3 movaps -28 * 
SIZE(AO), %xmm1 xorpd %xmm4, %xmm4 movaps -32 * SIZE(BO), %xmm2 xorpd %xmm5, %xmm5 prefetcht0 -32 * SIZE(BB) xorpd %xmm6, %xmm6 prefetcht2 7 * SIZE(CO1) movapd %xmm4, %xmm8 movapd %xmm4, %xmm9 prefetcht2 7 * SIZE(CO2) movapd %xmm4, %xmm10 movapd %xmm4, %xmm11 prefetcht2 7 * SIZE(CO1, LDC, 2) movapd %xmm4, %xmm12 movaps %xmm4, %xmm13 prefetcht2 7 * SIZE(CO2, LDC, 2) movaps %xmm4, %xmm14 movaps %xmm4, %xmm15 subq $-24 * SIZE, BB leaq (PREFETCHSIZE + 0) * SIZE(AO), PREA #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH -32 * SIZE(PREA) addps %xmm6, %xmm10 addps %xmm3, %xmm14 movaps %xmm2, %xmm3 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 movaps -28 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -20 * SIZE(AO), %xmm1 addps %xmm6, %xmm10 addps %xmm3, %xmm14 movaps %xmm2, %xmm3 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 movaps -24 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -12 * SIZE(AO), %xmm1 addps %xmm6, %xmm10 addps %xmm3, %xmm14 movaps %xmm2, %xmm3 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 addps %xmm5, %xmm15 PREFETCH -16 * SIZE(PREA) movaps %xmm7, %xmm5 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 movaps -20 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -8 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -4 * SIZE(AO), %xmm1 addps %xmm6, %xmm10 addps %xmm3, %xmm14 movaps %xmm2, %xmm3 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 movaps -16 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps 0 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps 4 * SIZE(AO), %xmm1 addps %xmm6, %xmm10 addps %xmm3, %xmm14 PREFETCH 0 * SIZE(PREA) movaps %xmm2, %xmm3 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 
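/* Editor's annotation (not in the upstream source): in this Penryn variant the
   four B values of a k step stay packed in one xmm register and are lane-rotated
   with pshufd $0x39, so each accumulator collects a rotated product pattern; the
   shufps $0xd8 sequence in the .L18 write-back untangles those patterns back into
   per-column results before the alpha scaling and the update of C. */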
movaps 8 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps 12 * SIZE(AO), %xmm1 addps %xmm6, %xmm10 addps %xmm3, %xmm14 movaps %xmm2, %xmm3 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 movaps -8 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps 16 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps 20 * SIZE(AO), %xmm1 addps %xmm6, %xmm10 addps %xmm3, %xmm14 PREFETCH 16 * SIZE(PREA) movaps %xmm2, %xmm3 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 movaps -4 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps 24 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps 28 * SIZE(AO), %xmm1 addps %xmm6, %xmm10 addps %xmm3, %xmm14 movaps %xmm2, %xmm3 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 subq $-64 * SIZE, AO pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 movaps 0 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 movaps %xmm6, %xmm3 subq $-32 * SIZE, BO pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -32 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -28 * SIZE(AO), %xmm1 subq $-64 * SIZE, PREA subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: prefetcht0 -16 * SIZE(BB) #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: addps %xmm6, %xmm10 addps %xmm3, %xmm14 movaps %xmm2, %xmm3 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 addps %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 movaps -28 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 addps %xmm7, %xmm9 addps %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -20 * SIZE(AO), %xmm1 addq $8 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: movups ALPHA_R, %xmm7 addps %xmm6, %xmm10 addps %xmm3, %xmm14 addps %xmm4, %xmm11 addps %xmm5, %xmm15 movaps %xmm9, %xmm4 shufps $0xd8, %xmm8, %xmm9 shufps $0xd8, %xmm11, %xmm8 shufps $0xd8, %xmm10, %xmm11 shufps $0xd8, %xmm4, %xmm10 movaps %xmm8, %xmm4 shufps $0xd8, %xmm10, %xmm8 shufps $0xd8, %xmm4, %xmm10 movaps %xmm9, %xmm5 shufps $0xd8, %xmm11, %xmm9 shufps $0xd8, %xmm5, %xmm11 movaps %xmm13, %xmm4 shufps $0xd8, %xmm12, %xmm13 shufps $0xd8, %xmm15, %xmm12 shufps $0xd8, %xmm14, %xmm15 shufps $0xd8, %xmm4, %xmm14 movaps %xmm12, %xmm4 shufps $0xd8, %xmm14, %xmm12 shufps $0xd8, %xmm4, %xmm14 movaps %xmm13, %xmm5 shufps $0xd8, %xmm15, %xmm13 shufps $0xd8, %xmm5, %xmm15 movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 movsd 8 * SIZE(CO1), %xmm2 movhps 10 * SIZE(CO1), %xmm2 movsd 12 * SIZE(CO1), %xmm3 movhps 14 * SIZE(CO1), %xmm3 
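/* Descriptive note (added): store-back of the 8x4 block. %xmm7 was loaded at .L18 with the packed (alpha_r, alpha_i, alpha_r, alpha_i) value kept at ALPHA_R. Each real accumulator element v is duplicated into (v, v) by pshufd $0x50/$0xfa, so the mulps below yields (v*alpha_r, v*alpha_i), i.e. v*alpha as a complex number, which is then added to the interleaved complex C tile at CO1, CO2, CO1 + 2*LDC and CO2 + 2*LDC. */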
pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 pshufd $0x50, %xmm12, %xmm5 pshufd $0xfa, %xmm12, %xmm12 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 mulps %xmm7, %xmm5 mulps %xmm7, %xmm12 addps %xmm0, %xmm4 addps %xmm1, %xmm8 addps %xmm2, %xmm5 addps %xmm3, %xmm12 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movlps %xmm8, 4 * SIZE(CO1) movhps %xmm8, 6 * SIZE(CO1) movlps %xmm5, 8 * SIZE(CO1) movhps %xmm5, 10 * SIZE(CO1) movlps %xmm12, 12 * SIZE(CO1) movhps %xmm12, 14 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhps 2 * SIZE(CO2), %xmm0 movsd 4 * SIZE(CO2), %xmm1 movhps 6 * SIZE(CO2), %xmm1 movsd 8 * SIZE(CO2), %xmm2 movhps 10 * SIZE(CO2), %xmm2 movsd 12 * SIZE(CO2), %xmm3 movhps 14 * SIZE(CO2), %xmm3 pshufd $0x50, %xmm9, %xmm4 pshufd $0xfa, %xmm9, %xmm9 pshufd $0x50, %xmm13, %xmm5 pshufd $0xfa, %xmm13, %xmm13 mulps %xmm7, %xmm4 mulps %xmm7, %xmm9 mulps %xmm7, %xmm5 mulps %xmm7, %xmm13 addps %xmm0, %xmm4 addps %xmm1, %xmm9 addps %xmm2, %xmm5 addps %xmm3, %xmm13 movlps %xmm4, 0 * SIZE(CO2) movhps %xmm4, 2 * SIZE(CO2) movlps %xmm9, 4 * SIZE(CO2) movhps %xmm9, 6 * SIZE(CO2) movlps %xmm5, 8 * SIZE(CO2) movhps %xmm5, 10 * SIZE(CO2) movlps %xmm13, 12 * SIZE(CO2) movhps %xmm13, 14 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhps 2 * SIZE(CO1, LDC, 2), %xmm0 movsd 4 * SIZE(CO1, LDC, 2), %xmm1 movhps 6 * SIZE(CO1, LDC, 2), %xmm1 movsd 8 * SIZE(CO1, LDC, 2), %xmm2 movhps 10 * SIZE(CO1, LDC, 2), %xmm2 movsd 12 * SIZE(CO1, LDC, 2), %xmm3 movhps 14 * SIZE(CO1, LDC, 2), %xmm3 pshufd $0x50, %xmm10, %xmm4 pshufd $0xfa, %xmm10, %xmm10 pshufd $0x50, %xmm14, %xmm5 pshufd $0xfa, %xmm14, %xmm14 mulps %xmm7, %xmm4 mulps %xmm7, %xmm10 mulps %xmm7, %xmm5 mulps %xmm7, %xmm14 addps %xmm0, %xmm4 addps %xmm1, %xmm10 addps %xmm2, %xmm5 addps %xmm3, %xmm14 movlps %xmm4, 0 * SIZE(CO1, LDC, 2) movhps %xmm4, 2 * SIZE(CO1, LDC, 2) movlps %xmm10, 4 * SIZE(CO1, LDC, 2) movhps %xmm10, 6 * SIZE(CO1, LDC, 2) movlps %xmm5, 8 * SIZE(CO1, LDC, 2) movhps %xmm5, 10 * SIZE(CO1, LDC, 2) movlps %xmm14, 12 * SIZE(CO1, LDC, 2) movhps %xmm14, 14 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhps 2 * SIZE(CO2, LDC, 2), %xmm0 movsd 4 * SIZE(CO2, LDC, 2), %xmm1 movhps 6 * SIZE(CO2, LDC, 2), %xmm1 movsd 8 * SIZE(CO2, LDC, 2), %xmm2 movhps 10 * SIZE(CO2, LDC, 2), %xmm2 movsd 12 * SIZE(CO2, LDC, 2), %xmm3 movhps 14 * SIZE(CO2, LDC, 2), %xmm3 pshufd $0x50, %xmm11, %xmm4 pshufd $0xfa, %xmm11, %xmm11 pshufd $0x50, %xmm15, %xmm5 pshufd $0xfa, %xmm15, %xmm15 mulps %xmm7, %xmm4 mulps %xmm7, %xmm11 mulps %xmm7, %xmm5 mulps %xmm7, %xmm15 addps %xmm0, %xmm4 addps %xmm1, %xmm11 addps %xmm2, %xmm5 addps %xmm3, %xmm15 movlps %xmm4, 0 * SIZE(CO2, LDC, 2) movhps %xmm4, 2 * SIZE(CO2, LDC, 2) movlps %xmm11, 4 * SIZE(CO2, LDC, 2) movhps %xmm11, 6 * SIZE(CO2, LDC, 2) movlps %xmm5, 8 * SIZE(CO2, LDC, 2) movhps %xmm5, 10 * SIZE(CO2, LDC, 2) movlps %xmm15, 12 * SIZE(CO2, LDC, 2) movhps %xmm15, 14 * SIZE(CO2, LDC, 2) addq $16 * SIZE, CO1 addq $16 * SIZE, CO2 decq I BRANCH jg .L11 ALIGN_4 .L20: testq $4, M BRANCH jle .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 movaps -32 * SIZE(BO), %xmm2 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 movaps %xmm4, %xmm8 movaps %xmm4, %xmm9 movaps %xmm4, %xmm10 movaps %xmm4, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_3 .L22: addps %xmm6, %xmm10 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 addps %xmm2, %xmm8 movaps -28 * SIZE(BO), %xmm2 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 addps %xmm7, %xmm9 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addps %xmm6, %xmm10 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 addps %xmm2, %xmm8 movaps -24 * SIZE(BO), %xmm2 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 addps %xmm7, %xmm9 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 addps %xmm6, %xmm10 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 addps %xmm2, %xmm8 movaps -20 * SIZE(BO), %xmm2 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 addps %xmm7, %xmm9 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm0 addps %xmm6, %xmm10 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 addps %xmm2, %xmm8 movaps -16 * SIZE(BO), %xmm2 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 subq $-16 * SIZE, AO addps %xmm7, %xmm9 mulps %xmm0, %xmm4 movaps -32 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L22 ALIGN_3 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH je .L28 ALIGN_3 .L26: addps %xmm6, %xmm10 pshufd $0x39, %xmm2, %xmm7 mulps %xmm0, %xmm2 addps %xmm4, %xmm11 pshufd $0x39, %xmm7, %xmm6 mulps %xmm0, %xmm7 addps %xmm2, %xmm8 movaps -28 * SIZE(BO), %xmm2 pshufd $0x39, %xmm6, %xmm4 mulps %xmm0, %xmm6 addps %xmm7, %xmm9 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_3 .L28: movups ALPHA_R, %xmm7 addps %xmm6, %xmm10 addps %xmm4, %xmm11 movaps %xmm9, %xmm4 shufps $0xd8, %xmm8, %xmm9 shufps $0xd8, %xmm11, %xmm8 shufps $0xd8, %xmm10, %xmm11 shufps $0xd8, %xmm4, %xmm10 movaps %xmm8, %xmm4 shufps $0xd8, %xmm10, %xmm8 shufps $0xd8, %xmm4, %xmm10 movaps %xmm9, %xmm5 shufps $0xd8, %xmm11, %xmm9 shufps $0xd8, %xmm5, %xmm11 movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 addps %xmm0, %xmm4 addps %xmm1, %xmm8 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movlps %xmm8, 4 * SIZE(CO1) movhps %xmm8, 6 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhps 2 * SIZE(CO2), %xmm0 movsd 4 * SIZE(CO2), %xmm1 movhps 6 * SIZE(CO2), %xmm1 pshufd $0x50, %xmm9, %xmm4 pshufd $0xfa, %xmm9, %xmm9 mulps %xmm7, %xmm4 mulps %xmm7, %xmm9 addps %xmm0, %xmm4 addps %xmm1, %xmm9 movlps %xmm4, 0 * SIZE(CO2) movhps %xmm4, 2 * SIZE(CO2) movlps %xmm9, 4 * SIZE(CO2) movhps %xmm9, 6 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhps 2 * SIZE(CO1, LDC, 2), %xmm0 movsd 4 * SIZE(CO1, LDC, 2), %xmm1 movhps 6 * SIZE(CO1, LDC, 2), %xmm1 pshufd $0x50, %xmm10, %xmm4 pshufd $0xfa, %xmm10, %xmm10 mulps %xmm7, %xmm4 mulps %xmm7, %xmm10 addps %xmm0, %xmm4 addps %xmm1, %xmm10 movlps %xmm4, 0 * SIZE(CO1, LDC, 2) movhps %xmm4, 2 * SIZE(CO1, LDC, 2) movlps %xmm10, 4 * SIZE(CO1, LDC, 2) movhps %xmm10, 6 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhps 2 * 
SIZE(CO2, LDC, 2), %xmm0 movsd 4 * SIZE(CO2, LDC, 2), %xmm1 movhps 6 * SIZE(CO2, LDC, 2), %xmm1 pshufd $0x50, %xmm11, %xmm4 pshufd $0xfa, %xmm11, %xmm11 mulps %xmm7, %xmm4 mulps %xmm7, %xmm11 addps %xmm0, %xmm4 addps %xmm1, %xmm11 movlps %xmm4, 0 * SIZE(CO2, LDC, 2) movhps %xmm4, 2 * SIZE(CO2, LDC, 2) movlps %xmm11, 4 * SIZE(CO2, LDC, 2) movhps %xmm11, 6 * SIZE(CO2, LDC, 2) addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 ALIGN_4 .L30: testq $2, M BRANCH jle .L40 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 movaps -32 * SIZE(BO), %xmm2 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 movaps %xmm4, %xmm8 movaps %xmm4, %xmm9 movaps %xmm4, %xmm10 movaps %xmm4, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_3 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm0, %xmm1 addps %xmm3, %xmm8 pshufd $0x50, %xmm2, %xmm3 mulps %xmm1, %xmm3 addps %xmm4, %xmm9 pshufd $0xfa, %xmm2, %xmm4 movaps -28 * SIZE(BO), %xmm2 mulps %xmm1, %xmm4 pshufd $0xee, %xmm0, %xmm1 movaps -28 * SIZE(AO), %xmm0 addps %xmm3, %xmm10 pshufd $0x50, %xmm2, %xmm3 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm2, %xmm4 movaps -24 * SIZE(BO), %xmm2 mulps %xmm1, %xmm4 pshufd $0x44, %xmm0, %xmm1 addps %xmm3, %xmm8 pshufd $0x50, %xmm2, %xmm3 mulps %xmm1, %xmm3 addps %xmm4, %xmm9 pshufd $0xfa, %xmm2, %xmm4 movaps -20 * SIZE(BO), %xmm2 mulps %xmm1, %xmm4 pshufd $0xee, %xmm0, %xmm1 movaps -24 * SIZE(AO), %xmm0 addps %xmm3, %xmm10 pshufd $0x50, %xmm2, %xmm3 mulps %xmm1, %xmm3 addps %xmm4, %xmm11 pshufd $0xfa, %xmm2, %xmm4 movaps -16 * SIZE(BO), %xmm2 mulps %xmm1, %xmm4 subq $-8 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L32 ALIGN_3 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH je .L38 ALIGN_3 .L36: pshufd $0x44, %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 pshufd $0x50, %xmm2, %xmm3 mulps %xmm1, %xmm3 addps %xmm4, %xmm9 pshufd $0xfa, %xmm2, %xmm4 movaps -28 * SIZE(BO), %xmm2 mulps %xmm1, %xmm4 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_3 .L38: movups ALPHA_R, %xmm7 addps %xmm10, %xmm8 addps %xmm11, %xmm9 addps %xmm3, %xmm8 addps %xmm4, %xmm9 movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm1 movhps 2 * SIZE(CO2), %xmm1 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 addps %xmm0, %xmm4 addps %xmm1, %xmm8 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movlps %xmm8, 0 * SIZE(CO2) movhps %xmm8, 2 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhps 2 * SIZE(CO1, LDC, 2), %xmm0 movsd 0 * SIZE(CO2, LDC, 2), %xmm1 movhps 2 * SIZE(CO2, LDC, 2), %xmm1 pshufd $0x50, %xmm9, %xmm4 pshufd $0xfa, %xmm9, %xmm9 mulps %xmm7, %xmm4 mulps %xmm7, %xmm9 addps %xmm0, %xmm4 addps %xmm1, %xmm9 movlps %xmm4, 0 * SIZE(CO1, LDC, 2) movhps %xmm4, 2 * SIZE(CO1, LDC, 2) movlps %xmm9, 0 * SIZE(CO2, LDC, 2) movhps %xmm9, 2 * SIZE(CO2, LDC, 2) addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 ALIGN_4 .L40: testq $1, M BRANCH jle .L49 #if !defined(TRMMKERNEL) || \ 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movsd -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -32 * SIZE(BO), %xmm2 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L45 ALIGN_3 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 mulps %xmm1, %xmm2 addps %xmm2, %xmm8 movaps -28 * SIZE(BO), %xmm2 pshufd $0x00, %xmm0, %xmm1 movss -30 * SIZE(AO), %xmm0 mulps %xmm1, %xmm2 addps %xmm2, %xmm9 movaps -24 * SIZE(BO), %xmm2 pshufd $0x00, %xmm0, %xmm1 movss -29 * SIZE(AO), %xmm0 mulps %xmm1, %xmm2 addps %xmm2, %xmm8 movaps -20 * SIZE(BO), %xmm2 pshufd $0x00, %xmm0, %xmm1 movss -28 * SIZE(AO), %xmm0 mulps %xmm1, %xmm2 addps %xmm2, %xmm9 movaps -16 * SIZE(BO), %xmm2 subq $ -4 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L42 ALIGN_3 .L45: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH je .L48 ALIGN_3 .L46: pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 mulps %xmm1, %xmm2 addps %xmm2, %xmm8 movaps -28 * SIZE(BO), %xmm2 addq $1 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L46 ALIGN_3 .L48: movups ALPHA_R, %xmm7 addps %xmm9, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhps 0 * SIZE(CO2), %xmm0 movsd 0 * SIZE(CO1, LDC, 2), %xmm1 movhps 0 * SIZE(CO2, LDC, 2), %xmm1 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 addps %xmm0, %xmm4 addps %xmm1, %xmm8 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 0 * SIZE(CO2) movlps %xmm8, 0 * SIZE(CO1, LDC, 2) movhps %xmm8, 0 * SIZE(CO2, LDC, 2) ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK #endif movq BO, B leaq (C, LDC, 4), C subq $1, J BRANCH jg .L10 ALIGN_4 .L50: testq $2, N jle .L90 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC, 1), CO2 movq A, AO movq K, %rax salq $BASE_SHIFT + 1, %rax leaq (B, %rax), BB movq M, I sarq $3, I NOBRANCH jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 2), BO #endif prefetcht2 -32 * SIZE(BB) subq $-8 * SIZE, BB movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 movaps -28 * SIZE(AO), %xmm1 xorps %xmm4, %xmm4 movaps -32 * SIZE(BO), %xmm2 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 prefetcht0 7 * SIZE(CO1) movaps %xmm4, %xmm8 movaps %xmm4, %xmm9 prefetcht0 7 * SIZE(CO2) movaps %xmm4, %xmm10 movaps %xmm4, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_3 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0x55, %xmm2, %xmm4 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 addps %xmm5, %xmm10 pshufd $0x00, %xmm2, %xmm5 mulps %xmm1, 
%xmm5 addps %xmm6, %xmm11 pshufd $0x55, %xmm2, %xmm6 mulps %xmm1, %xmm6 movaps -20 * SIZE(AO), %xmm1 addps %xmm3, %xmm8 pshufd $0xaa, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0xff, %xmm2, %xmm4 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addps %xmm5, %xmm10 pshufd $0xaa, %xmm2, %xmm5 mulps %xmm1, %xmm5 addps %xmm6, %xmm11 pshufd $0xff, %xmm2, %xmm6 movaps -28 * SIZE(BO), %xmm2 mulps %xmm1, %xmm6 movaps -12 * SIZE(AO), %xmm1 addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0x55, %xmm2, %xmm4 mulps %xmm0, %xmm4 movaps -8 * SIZE(AO), %xmm0 addps %xmm5, %xmm10 pshufd $0x00, %xmm2, %xmm5 mulps %xmm1, %xmm5 addps %xmm6, %xmm11 pshufd $0x55, %xmm2, %xmm6 mulps %xmm1, %xmm6 movaps -4 * SIZE(AO), %xmm1 addps %xmm3, %xmm8 pshufd $0xaa, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0xff, %xmm2, %xmm4 mulps %xmm0, %xmm4 movaps 0 * SIZE(AO), %xmm0 addps %xmm5, %xmm10 pshufd $0xaa, %xmm2, %xmm5 mulps %xmm1, %xmm5 addps %xmm6, %xmm11 pshufd $0xff, %xmm2, %xmm6 movaps -24 * SIZE(BO), %xmm2 mulps %xmm1, %xmm6 movaps 4 * SIZE(AO), %xmm1 subq $-32 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax BRANCH jg .L52 ALIGN_3 .L55: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_3 .L56: addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0x55, %xmm2, %xmm4 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 addps %xmm5, %xmm10 pshufd $0x00, %xmm2, %xmm5 mulps %xmm1, %xmm5 addps %xmm6, %xmm11 pshufd $0x55, %xmm2, %xmm6 movsd -30 * SIZE(BO), %xmm2 mulps %xmm1, %xmm6 movaps -20 * SIZE(AO), %xmm1 addq $8 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_3 .L58: movups ALPHA_R, %xmm7 addps %xmm3, %xmm8 addps %xmm4, %xmm9 addps %xmm5, %xmm10 addps %xmm6, %xmm11 movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 movsd 8 * SIZE(CO1), %xmm2 movhps 10 * SIZE(CO1), %xmm2 movsd 12 * SIZE(CO1), %xmm3 movhps 14 * SIZE(CO1), %xmm3 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 pshufd $0x50, %xmm10, %xmm5 pshufd $0xfa, %xmm10, %xmm10 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 mulps %xmm7, %xmm5 mulps %xmm7, %xmm10 addps %xmm0, %xmm4 addps %xmm1, %xmm8 addps %xmm2, %xmm5 addps %xmm3, %xmm10 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movlps %xmm8, 4 * SIZE(CO1) movhps %xmm8, 6 * SIZE(CO1) movlps %xmm5, 8 * SIZE(CO1) movhps %xmm5, 10 * SIZE(CO1) movlps %xmm10, 12 * SIZE(CO1) movhps %xmm10, 14 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhps 2 * SIZE(CO2), %xmm0 movsd 4 * SIZE(CO2), %xmm1 movhps 6 * SIZE(CO2), %xmm1 movsd 8 * SIZE(CO2), %xmm2 movhps 10 * SIZE(CO2), %xmm2 movsd 12 * SIZE(CO2), %xmm3 movhps 14 * SIZE(CO2), %xmm3 pshufd $0x50, %xmm9, %xmm4 pshufd $0xfa, %xmm9, %xmm9 pshufd $0x50, %xmm11, %xmm5 pshufd $0xfa, %xmm11, %xmm11 mulps %xmm7, %xmm4 mulps %xmm7, %xmm9 mulps %xmm7, %xmm5 mulps %xmm7, %xmm11 addps %xmm0, %xmm4 addps %xmm1, %xmm9 addps %xmm2, %xmm5 addps %xmm3, %xmm11 movlps %xmm4, 0 * SIZE(CO2) movhps %xmm4, 2 * SIZE(CO2) movlps %xmm9, 4 * SIZE(CO2) movhps %xmm9, 6 * SIZE(CO2) movlps %xmm5, 8 * SIZE(CO2) movhps %xmm5, 10 * SIZE(CO2) movlps %xmm11, 12 * SIZE(CO2) movhps %xmm11, 14 * SIZE(CO2) addq $16 * SIZE, CO1 addq $16 * SIZE, CO2 decq I BRANCH jg .L51 ALIGN_4 .L60: testq $4, M BRANCH jle .L70 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) 
&& !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 movaps -32 * SIZE(BO), %xmm2 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_3 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0x55, %xmm2, %xmm4 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addps %xmm3, %xmm10 pshufd $0xaa, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xff, %xmm2, %xmm4 movaps -28 * SIZE(BO), %xmm2 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0x55, %xmm2, %xmm4 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm0 addps %xmm3, %xmm10 pshufd $0xaa, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm11 pshufd $0xff, %xmm2, %xmm4 movaps -24 * SIZE(BO), %xmm2 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax BRANCH jg .L62 ALIGN_3 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH je .L68 ALIGN_3 .L66: addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0x55, %xmm2, %xmm4 movsd -30 * SIZE(BO), %xmm2 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_3 .L68: movups ALPHA_R, %xmm7 addps %xmm10, %xmm8 addps %xmm11, %xmm9 addps %xmm3, %xmm8 addps %xmm4, %xmm9 movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 addps %xmm0, %xmm4 addps %xmm1, %xmm8 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movlps %xmm8, 4 * SIZE(CO1) movhps %xmm8, 6 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhps 2 * SIZE(CO2), %xmm0 movsd 4 * SIZE(CO2), %xmm1 movhps 6 * SIZE(CO2), %xmm1 pshufd $0x50, %xmm9, %xmm4 pshufd $0xfa, %xmm9, %xmm9 mulps %xmm7, %xmm4 mulps %xmm7, %xmm9 addps %xmm0, %xmm4 addps %xmm1, %xmm9 movlps %xmm4, 0 * SIZE(CO2) movhps %xmm4, 2 * SIZE(CO2) movlps %xmm9, 4 * SIZE(CO2) movhps %xmm9, 6 * SIZE(CO2) addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 ALIGN_4 .L70: testq $2, M BRANCH jle .L80 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 movaps -32 * SIZE(BO), %xmm2 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L75 ALIGN_3 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm3, %xmm8 pshufd $0x44, %xmm0, %xmm1 pshufd $0x50, %xmm2, %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm9 pshufd $0xee, %xmm0, %xmm1 
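/* Descriptive note (added): this is the two-row remainder (.L70-.L78) of the two-column panel. pshufd $0x44/$0xee split the two A values of a k step into (a0, a1, a0, a1) and pshufd $0x50/$0xfa split the packed B pair into (b0, b0, b1, b1), so a single mulps forms the whole 2x2 outer product for that step; the remaining single-row and single-column cases below follow the same multiply, alpha-expand and add-to-C pattern with fewer accumulators. */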
movaps -28 * SIZE(AO), %xmm0 pshufd $0xfa, %xmm2, %xmm3 movaps -28 * SIZE(BO), %xmm2 mulps %xmm1, %xmm3 addps %xmm3, %xmm8 pshufd $0x44, %xmm0, %xmm1 pshufd $0x50, %xmm2, %xmm3 mulps %xmm1, %xmm3 addps %xmm3, %xmm9 pshufd $0xee, %xmm0, %xmm1 movaps -24 * SIZE(AO), %xmm0 pshufd $0xfa, %xmm2, %xmm3 movaps -24 * SIZE(BO), %xmm2 mulps %xmm1, %xmm3 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L72 ALIGN_3 .L75: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH je .L78 ALIGN_3 .L76: addps %xmm3, %xmm8 pshufd $0x44, %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 pshufd $0x50, %xmm2, %xmm3 movsd -30 * SIZE(BO), %xmm2 mulps %xmm1, %xmm3 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L76 ALIGN_3 .L78: movups ALPHA_R, %xmm7 addps %xmm9, %xmm8 addps %xmm3, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm1 movhps 2 * SIZE(CO2), %xmm1 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 addps %xmm0, %xmm4 addps %xmm1, %xmm8 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movlps %xmm8, 0 * SIZE(CO2) movhps %xmm8, 2 * SIZE(CO2) addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 ALIGN_4 .L80: testq $1, M BRANCH jle .L89 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif movsd -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movsd -32 * SIZE(BO), %xmm2 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L85 ALIGN_3 .L82: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 mulps %xmm1, %xmm2 addps %xmm2, %xmm8 movsd -30 * SIZE(BO), %xmm2 pshufd $0x00, %xmm0, %xmm1 movss -30 * SIZE(AO), %xmm0 mulps %xmm1, %xmm2 addps %xmm2, %xmm9 movsd -28 * SIZE(BO), %xmm2 pshufd $0x00, %xmm0, %xmm1 movss -29 * SIZE(AO), %xmm0 mulps %xmm1, %xmm2 addps %xmm2, %xmm8 movsd -26 * SIZE(BO), %xmm2 pshufd $0x00, %xmm0, %xmm1 movss -28 * SIZE(AO), %xmm0 mulps %xmm1, %xmm2 addps %xmm2, %xmm9 movsd -24 * SIZE(BO), %xmm2 subq $-4 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L82 ALIGN_3 .L85: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH je .L88 ALIGN_3 .L86: pshufd $0x00, %xmm0, %xmm1 movss -31 * SIZE(AO), %xmm0 mulps %xmm1, %xmm2 addps %xmm2, %xmm8 movsd -30 * SIZE(BO), %xmm2 addq $1 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L86 ALIGN_3 .L88: movups ALPHA_R, %xmm7 addps %xmm9, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhps 0 * SIZE(CO2), %xmm0 pshufd $0x50, %xmm8, %xmm4 mulps %xmm7, %xmm4 addps %xmm0, %xmm4 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 0 * SIZE(CO2) ALIGN_4 .L89: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif movq BO, B leaq (C, LDC, 2), C ALIGN_4 .L90: testq $1, N jle .L999 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 movq A, AO movq M, I sarq $3, I NOBRANCH jle .L100 ALIGN_4 .L91: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, 
SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 1), BO #endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movaps -28 * SIZE(AO), %xmm1 xorps %xmm9, %xmm9 movsd -32 * SIZE(BO), %xmm2 xorps %xmm10, %xmm10 prefetcht0 7 * SIZE(CO1) xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L95 ALIGN_3 .L92: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x00, %xmm2, %xmm3 mulps %xmm3, %xmm0 addps %xmm0, %xmm8 movaps -24 * SIZE(AO), %xmm0 mulps %xmm3, %xmm1 addps %xmm1, %xmm9 movaps -20 * SIZE(AO), %xmm1 pshufd $0x55, %xmm2, %xmm3 movsd -30 * SIZE(BO), %xmm2 mulps %xmm3, %xmm0 addps %xmm0, %xmm10 movaps -16 * SIZE(AO), %xmm0 mulps %xmm3, %xmm1 addps %xmm1, %xmm11 movaps -12 * SIZE(AO), %xmm1 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) pshufd $0x00, %xmm2, %xmm3 mulps %xmm3, %xmm0 addps %xmm0, %xmm8 movaps -8 * SIZE(AO), %xmm0 mulps %xmm3, %xmm1 addps %xmm1, %xmm9 movaps -4 * SIZE(AO), %xmm1 pshufd $0x55, %xmm2, %xmm3 movsd -28 * SIZE(BO), %xmm2 mulps %xmm3, %xmm0 addps %xmm0, %xmm10 movaps 0 * SIZE(AO), %xmm0 mulps %xmm3, %xmm1 addps %xmm1, %xmm11 movaps 4 * SIZE(AO), %xmm1 subq $-32 * SIZE, AO subq $ -4 * SIZE, BO subq $1, %rax BRANCH jg .L92 ALIGN_3 .L95: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L98 ALIGN_3 .L96: pshufd $0x00, %xmm2, %xmm3 movss -31 * SIZE(BO), %xmm2 mulps %xmm3, %xmm0 addps %xmm0, %xmm8 movaps -24 * SIZE(AO), %xmm0 mulps %xmm3, %xmm1 addps %xmm1, %xmm9 movaps -20 * SIZE(AO), %xmm1 addq $8 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L96 ALIGN_3 .L98: movups ALPHA_R, %xmm7 addps %xmm10, %xmm8 addps %xmm11, %xmm9 movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 movsd 8 * SIZE(CO1), %xmm2 movhps 10 * SIZE(CO1), %xmm2 movsd 12 * SIZE(CO1), %xmm3 movhps 14 * SIZE(CO1), %xmm3 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 pshufd $0x50, %xmm9, %xmm5 pshufd $0xfa, %xmm9, %xmm9 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 mulps %xmm7, %xmm5 mulps %xmm7, %xmm9 addps %xmm0, %xmm4 addps %xmm1, %xmm8 addps %xmm2, %xmm5 addps %xmm3, %xmm9 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movlps %xmm8, 4 * SIZE(CO1) movhps %xmm8, 6 * SIZE(CO1) movlps %xmm5, 8 * SIZE(CO1) movhps %xmm5, 10 * SIZE(CO1) movlps %xmm9, 12 * SIZE(CO1) movhps %xmm9, 14 * SIZE(CO1) addq $16 * SIZE, CO1 decq I BRANCH jg .L91 ALIGN_4 .L100: testq $4, M BRANCH jle .L110 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movsd -32 * SIZE(BO), %xmm2 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L105 ALIGN_3 .L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 movaps -28 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 pshufd $0x55, %xmm2, %xmm3 movsd -30 * SIZE(BO), %xmm2 mulps %xmm0, %xmm3 movaps -24 * 
SIZE(AO), %xmm0 addps %xmm3, %xmm9 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 movaps -20 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 pshufd $0x55, %xmm2, %xmm3 movsd -28 * SIZE(BO), %xmm2 mulps %xmm0, %xmm3 movaps -16 * SIZE(AO), %xmm0 addps %xmm3, %xmm9 subq $-16 * SIZE, AO subq $ -4 * SIZE, BO subq $1, %rax BRANCH jg .L102 ALIGN_3 .L105: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH je .L108 ALIGN_3 .L106: pshufd $0x00, %xmm2, %xmm3 movss -31 * SIZE(BO), %xmm2 mulps %xmm0, %xmm3 movaps -28 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 addq $4 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L106 ALIGN_3 .L108: movups ALPHA_R, %xmm7 addps %xmm9, %xmm8 movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 addps %xmm0, %xmm4 addps %xmm1, %xmm8 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movlps %xmm8, 4 * SIZE(CO1) movhps %xmm8, 6 * SIZE(CO1) addq $8 * SIZE, CO1 ALIGN_4 .L110: testq $2, M BRANCH jle .L120 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 movsd -32 * SIZE(BO), %xmm2 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L115 ALIGN_3 .L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 movsd -30 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 pshufd $0x55, %xmm2, %xmm3 movsd -30 * SIZE(BO), %xmm2 mulps %xmm0, %xmm3 movsd -28 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 movsd -26 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 pshufd $0x55, %xmm2, %xmm3 movsd -28 * SIZE(BO), %xmm2 mulps %xmm0, %xmm3 movsd -24 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 subq $-8 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L112 ALIGN_3 .L115: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH je .L118 ALIGN_3 .L116: pshufd $0x00, %xmm2, %xmm3 movss -31 * SIZE(BO), %xmm2 mulps %xmm0, %xmm3 movsd -30 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 addq $2 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L116 ALIGN_3 .L118: movups ALPHA_R, %xmm7 movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 pshufd $0x50, %xmm8, %xmm4 mulps %xmm7, %xmm4 addps %xmm0, %xmm4 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) addq $4 * SIZE, CO1 ALIGN_4 .L120: testq $1, M BRANCH jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif movss -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 movss -32 * SIZE(BO), %xmm2 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH 
jle .L125 ALIGN_3 .L122: mulss %xmm0, %xmm2 movss -31 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 movss -31 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -30 * SIZE(AO), %xmm0 addss %xmm2, %xmm9 movss -30 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -29 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 movss -29 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 movss -28 * SIZE(AO), %xmm0 addss %xmm2, %xmm9 movss -28 * SIZE(BO), %xmm2 subq $-4 * SIZE, AO subq $-4 * SIZE, BO subq $1, %rax BRANCH jg .L122 ALIGN_3 .L125: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH je .L128 ALIGN_3 .L126: mulss %xmm0, %xmm2 movss -31 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 movss -31 * SIZE(BO), %xmm2 addq $1 * SIZE, AO addq $1 * SIZE, BO subq $1, %rax BRANCH jg .L126 ALIGN_3 .L128: movups ALPHA_R, %xmm7 addss %xmm9, %xmm8 movsd 0 * SIZE(CO1), %xmm0 pshufd $0x50, %xmm8, %xmm4 mulps %xmm7, %xmm4 addps %xmm0, %xmm4 movlps %xmm4, 0 * SIZE(CO1) ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm3m_kernel_8x4_sse.S000066400000000000000000002173211313527062700220100ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define ALPHA 0(%rsp) #define J 16(%rsp) #define OFFSET 24(%rsp) #define KK 32(%rsp) #define KKK 40(%rsp) #define BUFFER 128(%rsp) #ifdef OPTERON #define movsd movlps #endif #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta #define PREFETCHSIZE (16 * 5 + 8) #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHNTA prefetchnta #define PREFETCHSIZE (16 * 5 + 8) #endif #define RPREFETCHSIZE (8 * 7 + 4) #define WPREFETCHSIZE (8 * 8 + 4) #ifndef GENERIC #define KERNEL1(xx) \ mulps %xmm0, %xmm1 ;\ addps %xmm1, %xmm8 ;\ movaps -32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm0, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps -28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm0, %xmm5 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ addps %xmm5, %xmm10 ;\ movaps -24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addps %xmm0, %xmm11 ;\ movaps -16 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 #define KERNEL2(xx) \ mulps %xmm2, %xmm1 ;\ addps %xmm1, %xmm12 ;\ movaps 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm2, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm2, %xmm5 ;\ mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ addps %xmm5, %xmm14 ;\ movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addps %xmm2, %xmm15 ;\ movaps -12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 #define KERNEL3(xx) \ mulps %xmm4, %xmm7 ;\ addps %xmm7, %xmm8 ;\ movaps -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulps %xmm4, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm4, %xmm5 ;\ mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ addps %xmm5, %xmm10 ;\ movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addps %xmm4, %xmm11 ;\ movaps -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 #define KERNEL4(xx) \ mulps %xmm6, %xmm7 ;\ addps %xmm7, %xmm12 ;\ movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulps %xmm6, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm6, %xmm5 ;\ mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ addps %xmm5, %xmm14 ;\ movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ addps %xmm6, %xmm15 ;\ movaps -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 #define KERNEL5(xx) \ mulps %xmm0, %xmm1 ;\ addps %xmm1, %xmm8 ;\ movaps 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm0, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ 
mulps %xmm0, %xmm5 ;\ mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ addps %xmm5, %xmm10 ;\ movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addps %xmm0, %xmm11 ;\ movaps 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 #define KERNEL6(xx) \ mulps %xmm2, %xmm1 ;\ addps %xmm1, %xmm12 ;\ movaps 32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm2, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm2, %xmm5 ;\ mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ addps %xmm5, %xmm14 ;\ movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addps %xmm2, %xmm15 ;\ movaps 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 #define KERNEL7(xx) \ mulps %xmm4, %xmm7 ;\ addps %xmm7, %xmm8 ;\ movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulps %xmm4, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm4, %xmm5 ;\ mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ addps %xmm5, %xmm10 ;\ movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addps %xmm4, %xmm11 ;\ movaps 8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 #define KERNEL8(xx) \ mulps %xmm6, %xmm7 ;\ addps %xmm7, %xmm12 ;\ movaps 48 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulps %xmm6, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps 36 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm6, %xmm5 ;\ mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ addps %xmm5, %xmm14 ;\ movaps 40 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addps %xmm6, %xmm15 ;\ movaps 12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 #else #define KERNEL1(xx) \ mulps %xmm0, %xmm1 ;\ addps %xmm1, %xmm8 ;\ movaps -32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulps %xmm0, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps -28 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm0, %xmm5 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ addps %xmm5, %xmm10 ;\ movaps -24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm0, %xmm11 ;\ movaps -16 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 #define KERNEL2(xx) \ mulps %xmm2, %xmm1 ;\ addps %xmm1, %xmm12 ;\ movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulps %xmm2, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm2, %xmm5 ;\ mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ addps %xmm5, %xmm14 ;\ movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm2, %xmm15 ;\ movaps -12 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 ;\ #define KERNEL3(xx) \ mulps %xmm4, %xmm7 ;\ addps %xmm7, %xmm8 ;\ movaps -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulps %xmm4, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm4, %xmm5 ;\ mulps -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ addps %xmm5, %xmm10 ;\ movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm4, %xmm11 ;\ movaps -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 #define KERNEL4(xx) \ mulps %xmm6, %xmm7 ;\ addps %xmm7, %xmm12 ;\ movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulps %xmm6, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm6, %xmm5 ;\ mulps -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ addps %xmm5, %xmm14 ;\ movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm6, %xmm15 ;\ movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 #define KERNEL5(xx) \ mulps %xmm0, %xmm1 ;\ PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO) ;\ addps %xmm1, %xmm8 ;\ movaps 0 * SIZE + 2 * (xx) * SIZE(BO), 
%xmm1 ;\ mulps %xmm0, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm0, %xmm5 ;\ mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ addps %xmm5, %xmm10 ;\ movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm0, %xmm11 ;\ movaps 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 #define KERNEL6(xx) \ mulps %xmm2, %xmm1 ;\ addps %xmm1, %xmm12 ;\ movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulps %xmm2, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm2, %xmm5 ;\ mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ addps %xmm5, %xmm14 ;\ movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm2, %xmm15 ;\ movaps 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 #define KERNEL7(xx) \ mulps %xmm4, %xmm7 ;\ addps %xmm7, %xmm8 ;\ movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulps %xmm4, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm4, %xmm5 ;\ mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ addps %xmm5, %xmm10 ;\ movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm4, %xmm11 ;\ movaps 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 #define KERNEL8(xx) \ mulps %xmm6, %xmm7 ;\ addps %xmm7, %xmm12 ;\ movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulps %xmm6, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm6, %xmm5 ;\ mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ addps %xmm5, %xmm14 ;\ movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm6, %xmm15 ;\ movaps 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 #endif #if defined(OS_LINUX) && defined(CORE_BARCELONA) .align 32768 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm4 #endif movaps %xmm3, %xmm0 movss OLD_ALPHA_I, %xmm1 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm4 #endif #endif EMMS movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-1024, %rsp # align stack STACK_TOUCHING movq OLD_M, M movq OLD_N, N movss %xmm0, 0 + ALPHA movss %xmm1, 4 + ALPHA movss %xmm0, 8 + ALPHA movss %xmm1, 12 + ALPHA #ifdef TRMMKERNEL movsd %xmm4, OFFSET movsd %xmm4, KK #ifndef LEFT negq KK #endif #endif subq $-32 * SIZE, A salq $ZBASE_SHIFT, LDC movq N, J sarq $2, J # j = (n >> 2) jle .L50 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $2, %rax jle .L03 addq %rax, %rax ALIGN_4 .L02: PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) movd 0 * SIZE(B), %mm0 movd 1 * SIZE(B), %mm1 movd 2 * SIZE(B), %mm2 movd 3 * SIZE(B), %mm3 movd 4 * SIZE(B), %mm4 movd 5 * SIZE(B), %mm5 movd 6 * SIZE(B), %mm6 movd 7 * SIZE(B), %mm7 PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO) punpckldq %mm0, %mm0 punpckldq %mm1, %mm1 punpckldq %mm2, %mm2 punpckldq %mm3, %mm3 punpckldq %mm4, %mm4 punpckldq %mm5, %mm5 punpckldq %mm6, %mm6 punpckldq %mm7, %mm7 movq %mm0, 0 * SIZE(BO) movq %mm0, 2 * SIZE(BO) movq %mm1, 4 * SIZE(BO) movq %mm1, 6 * SIZE(BO) movq %mm2, 8 * SIZE(BO) 
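/* Descriptive note (added): this copy loop (.L02, with the .L04 tail below) packs the current 4-column panel of B into the aligned stack BUFFER. Each scalar is loaded with movd, duplicated with punpckldq and stored twice, leaving four consecutive copies per value, so the compute loops can use aligned movaps loads of B instead of per-iteration shuffles; the PREFETCH/PREFETCHW hints cover the read of B and the write of BO during the copy. */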
movq %mm2, 10 * SIZE(BO) movq %mm3, 12 * SIZE(BO) movq %mm3, 14 * SIZE(BO) PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) movq %mm4, 16 * SIZE(BO) movq %mm4, 18 * SIZE(BO) movq %mm5, 20 * SIZE(BO) movq %mm5, 22 * SIZE(BO) movq %mm6, 24 * SIZE(BO) movq %mm6, 26 * SIZE(BO) movq %mm7, 28 * SIZE(BO) movq %mm7, 30 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L02 ALIGN_4 .L03: movq K, %rax andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movd 0 * SIZE(B), %mm0 movd 1 * SIZE(B), %mm1 movd 2 * SIZE(B), %mm2 movd 3 * SIZE(B), %mm3 punpckldq %mm0, %mm0 punpckldq %mm1, %mm1 punpckldq %mm2, %mm2 punpckldq %mm3, %mm3 movq %mm0, 0 * SIZE(BO) movq %mm0, 2 * SIZE(BO) movq %mm1, 4 * SIZE(BO) movq %mm1, 6 * SIZE(BO) movq %mm2, 8 * SIZE(BO) movq %mm2, 10 * SIZE(BO) movq %mm3, 12 * SIZE(BO) movq %mm3, 14 * SIZE(BO) addq $ 4 * SIZE, B addq $16 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L10: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a leaq (RPREFETCHSIZE + 0) * SIZE(B), BB movq M, I sarq $3, I # i = (m >> 3) jle .L20 ALIGN_4 .L11: PREFETCH 0 * SIZE(BB) subq $-16 * SIZE, BB #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm1 pxor %xmm8, %xmm8 movaps -28 * SIZE(AO), %xmm2 movaps -28 * SIZE(BO), %xmm3 pxor %xmm9, %xmm9 movaps -24 * SIZE(AO), %xmm4 movaps -24 * SIZE(BO), %xmm5 pxor %xmm10, %xmm10 movaps -20 * SIZE(AO), %xmm6 movaps -16 * SIZE(BO), %xmm7 pxor %xmm11, %xmm11 PREFETCHW 15 * SIZE(CO1) pxor %xmm12, %xmm12 PREFETCHW 15 * SIZE(CO2) pxor %xmm13, %xmm13 PREFETCHW 15 * SIZE(CO1, LDC, 2) pxor %xmm14, %xmm14 PREFETCHW 15 * SIZE(CO2, LDC, 2) pxor %xmm15, %xmm15 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $4, %rax #endif movq %rax, KKK #endif #ifndef GENERIC andq $-8, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO negq %rax NOBRANCH je .L15 ALIGN_3 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) 
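/* Descriptive note (added): KERNEL1-KERNEL8 are the macro-unrolled body of the dot-product loop for the 8x4 block. Each invocation issues a group of mulps/addps against the broadcast B buffer, interleaved with software prefetches of A, and the long chain here is unrolled further with periodic NOBRANCH'd exits to .L15 so that loop overhead is amortised over many k iterations. */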
KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax BRANCH jl .L12 ALIGN_3 .L15: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif testq $4, %rax je .L16 xorq %rax, %rax ALIGN_3 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addq $64 * SIZE, BO addq $32 * SIZE, AO ALIGN_3 #else sarq $2, %rax NOBRANCH jle .L16 ALIGN_3 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addq $ 64 * SIZE, BO subq $-32 * SIZE, AO decq %rax BRANCH jg .L12 #endif .L16: movaps ALPHA, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L18 leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO negq %rax ALIGN_4 .L17: mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -28 * SIZE(BO, %rax, 8), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm9 movaps -24 * SIZE(BO, %rax, 8), %xmm1 mulps %xmm0, %xmm1 mulps -20 * SIZE(BO, %rax, 8), %xmm0 addps %xmm1, %xmm10 movaps -32 * SIZE(BO, %rax, 8), %xmm1 addps %xmm0, %xmm11 movaps -24 * SIZE(AO, %rax, 4), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm12 movaps -28 * SIZE(BO, %rax, 8), %xmm1 mulps %xmm2, %xmm1 addps %xmm1, %xmm13 movaps -24 * SIZE(BO, %rax, 8), %xmm1 mulps %xmm2, %xmm1 mulps -20 * SIZE(BO, %rax, 8), %xmm2 addps %xmm1, %xmm14 movaps -16 * SIZE(BO, %rax, 8), %xmm1 addps %xmm2, %xmm15 movaps -20 * SIZE(AO, %rax, 4), %xmm2 addq $SIZE * 2, %rax jl .L17 ALIGN_4 .L18: movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 movsd 8 * SIZE(CO1), %xmm2 movhps 10 * SIZE(CO1), %xmm2 movsd 12 * SIZE(CO1), %xmm3 movhps 14 * SIZE(CO1), %xmm3 pshufd $0x50, %xmm8, %xmm4 pshufd $0xfa, %xmm8, %xmm8 pshufd $0x50, %xmm12, %xmm5 pshufd $0xfa, %xmm12, %xmm12 mulps %xmm7, %xmm4 mulps %xmm7, %xmm8 mulps %xmm7, %xmm5 mulps %xmm7, %xmm12 addps %xmm0, %xmm4 addps %xmm1, %xmm8 addps %xmm2, %xmm5 addps %xmm3, %xmm12 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movlps %xmm8, 4 * SIZE(CO1) movhps %xmm8, 6 * SIZE(CO1) movlps %xmm5, 8 * SIZE(CO1) movhps %xmm5, 10 * SIZE(CO1) movlps %xmm12, 12 * SIZE(CO1) movhps %xmm12, 14 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm0 movhps 2 * SIZE(CO2), %xmm0 movsd 4 * SIZE(CO2), %xmm1 movhps 6 * SIZE(CO2), %xmm1 movsd 8 * SIZE(CO2), %xmm2 movhps 10 * SIZE(CO2), %xmm2 movsd 12 * SIZE(CO2), %xmm3 movhps 14 * SIZE(CO2), %xmm3 pshufd $0x50, %xmm9, %xmm4 
pshufd $0xfa, %xmm9, %xmm9 pshufd $0x50, %xmm13, %xmm5 pshufd $0xfa, %xmm13, %xmm13 mulps %xmm7, %xmm4 mulps %xmm7, %xmm9 mulps %xmm7, %xmm5 mulps %xmm7, %xmm13 addps %xmm0, %xmm4 addps %xmm1, %xmm9 addps %xmm2, %xmm5 addps %xmm3, %xmm13 movlps %xmm4, 0 * SIZE(CO2) movhps %xmm4, 2 * SIZE(CO2) movlps %xmm9, 4 * SIZE(CO2) movhps %xmm9, 6 * SIZE(CO2) movlps %xmm5, 8 * SIZE(CO2) movhps %xmm5, 10 * SIZE(CO2) movlps %xmm13, 12 * SIZE(CO2) movhps %xmm13, 14 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhps 2 * SIZE(CO1, LDC, 2), %xmm0 movsd 4 * SIZE(CO1, LDC, 2), %xmm1 movhps 6 * SIZE(CO1, LDC, 2), %xmm1 movsd 8 * SIZE(CO1, LDC, 2), %xmm2 movhps 10 * SIZE(CO1, LDC, 2), %xmm2 movsd 12 * SIZE(CO1, LDC, 2), %xmm3 movhps 14 * SIZE(CO1, LDC, 2), %xmm3 pshufd $0x50, %xmm10, %xmm4 pshufd $0xfa, %xmm10, %xmm10 pshufd $0x50, %xmm14, %xmm5 pshufd $0xfa, %xmm14, %xmm14 mulps %xmm7, %xmm4 mulps %xmm7, %xmm10 mulps %xmm7, %xmm5 mulps %xmm7, %xmm14 addps %xmm0, %xmm4 addps %xmm1, %xmm10 addps %xmm2, %xmm5 addps %xmm3, %xmm14 movlps %xmm4, 0 * SIZE(CO1, LDC, 2) movhps %xmm4, 2 * SIZE(CO1, LDC, 2) movlps %xmm10, 4 * SIZE(CO1, LDC, 2) movhps %xmm10, 6 * SIZE(CO1, LDC, 2) movlps %xmm5, 8 * SIZE(CO1, LDC, 2) movhps %xmm5, 10 * SIZE(CO1, LDC, 2) movlps %xmm14, 12 * SIZE(CO1, LDC, 2) movhps %xmm14, 14 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhps 2 * SIZE(CO2, LDC, 2), %xmm0 movsd 4 * SIZE(CO2, LDC, 2), %xmm1 movhps 6 * SIZE(CO2, LDC, 2), %xmm1 movsd 8 * SIZE(CO2, LDC, 2), %xmm2 movhps 10 * SIZE(CO2, LDC, 2), %xmm2 movsd 12 * SIZE(CO2, LDC, 2), %xmm3 movhps 14 * SIZE(CO2, LDC, 2), %xmm3 pshufd $0x50, %xmm11, %xmm4 pshufd $0xfa, %xmm11, %xmm11 pshufd $0x50, %xmm15, %xmm5 pshufd $0xfa, %xmm15, %xmm15 mulps %xmm7, %xmm4 mulps %xmm7, %xmm11 mulps %xmm7, %xmm5 mulps %xmm7, %xmm15 addps %xmm0, %xmm4 addps %xmm1, %xmm11 addps %xmm2, %xmm5 addps %xmm3, %xmm15 movlps %xmm4, 0 * SIZE(CO2, LDC, 2) movhps %xmm4, 2 * SIZE(CO2, LDC, 2) movlps %xmm11, 4 * SIZE(CO2, LDC, 2) movhps %xmm11, 6 * SIZE(CO2, LDC, 2) movlps %xmm5, 8 * SIZE(CO2, LDC, 2) movhps %xmm5, 10 * SIZE(CO2, LDC, 2) movlps %xmm15, 12 * SIZE(CO2, LDC, 2) movhps %xmm15, 14 * SIZE(CO2, LDC, 2) addq $16 * SIZE, CO1 # coffset += 4 addq $16 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 ALIGN_4 .L20: testq $4, M je .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps -28 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), 
%xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps -24 * SIZE(AO), %xmm8 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 mulps 44 * SIZE(BO), %xmm8 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm8, %xmm3 movaps -20 * SIZE(AO), %xmm8 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 mulps 60 * SIZE(BO), %xmm8 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movaps 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movaps 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 mulps 76 * SIZE(BO), %xmm10 addps %xmm9, %xmm2 movaps 128 * SIZE(BO), %xmm9 addps %xmm10, %xmm3 movaps -12 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movaps 84 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movaps 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 mulps 92 * SIZE(BO), %xmm10 addps %xmm11, %xmm2 movaps 144 * SIZE(BO), %xmm11 addps %xmm10, %xmm3 movaps -8 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movaps 104 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 mulps 108 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 160 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps -4 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movaps 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 mulps 124 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 176 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 16 * SIZE(AO), %xmm10 addq $ 32 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 16 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps -28 * SIZE(AO), %xmm8 addq $ 4 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L26 ALIGN_4 .L28: movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 4 * SIZE(CO1), %xmm9 movhps 6 * SIZE(CO1), %xmm9 pshufd $0x50, %xmm0, %xmm4 pshufd $0xfa, %xmm0, %xmm0 mulps %xmm7, %xmm4 mulps %xmm7, %xmm0 addps %xmm8, %xmm4 addps %xmm9, %xmm0 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movlps %xmm0, 4 * SIZE(CO1) movhps %xmm0, 6 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm8 movhps 2 * SIZE(CO2), %xmm8 movsd 4 * SIZE(CO2), %xmm9 movhps 6 * SIZE(CO2), %xmm9 pshufd $0x50, %xmm1, %xmm4 pshufd $0xfa, %xmm1, %xmm1 mulps %xmm7, %xmm4 mulps %xmm7, %xmm1 addps %xmm8, %xmm4 addps %xmm9, %xmm1 movlps %xmm4, 0 * SIZE(CO2) movhps %xmm4, 2 * SIZE(CO2) movlps %xmm1, 4 * SIZE(CO2) movhps %xmm1, 6 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm8 movhps 2 * SIZE(CO1, LDC, 2), %xmm8 movsd 4 * SIZE(CO1, LDC, 2), %xmm9 movhps 6 * SIZE(CO1, LDC, 2), %xmm9 pshufd $0x50, %xmm2, %xmm4 pshufd $0xfa, %xmm2, %xmm2 mulps %xmm7, %xmm4 mulps 
%xmm7, %xmm2 addps %xmm8, %xmm4 addps %xmm9, %xmm2 movlps %xmm4, 0 * SIZE(CO1, LDC, 2) movhps %xmm4, 2 * SIZE(CO1, LDC, 2) movlps %xmm2, 4 * SIZE(CO1, LDC, 2) movhps %xmm2, 6 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm8 movhps 2 * SIZE(CO2, LDC, 2), %xmm8 movsd 4 * SIZE(CO2, LDC, 2), %xmm9 movhps 6 * SIZE(CO2, LDC, 2), %xmm9 pshufd $0x50, %xmm3, %xmm4 pshufd $0xfa, %xmm3, %xmm3 mulps %xmm7, %xmm4 mulps %xmm7, %xmm3 addps %xmm8, %xmm4 addps %xmm9, %xmm3 movlps %xmm4, 0 * SIZE(CO2, LDC, 2) movhps %xmm4, 2 * SIZE(CO2, LDC, 2) movlps %xmm3, 4 * SIZE(CO2, LDC, 2) movhps %xmm3, 6 * SIZE(CO2, LDC, 2) addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 ALIGN_4 .L30: testq $2, M je .L40 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L35 ALIGN_4 .L32: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movaps 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd -28 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movaps 80 * SIZE(BO), %xmm11 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm2 movaps 44 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 movsd -26 * SIZE(AO), %xmm8 addps %xmm13, %xmm3 movaps 96 * SIZE(BO), %xmm13 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm2 movaps 60 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 movsd -16 * SIZE(AO), %xmm8 addps %xmm15, %xmm3 movaps 112 * SIZE(BO), %xmm15 mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movaps 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movaps 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm2 movaps 76 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movsd -22 * SIZE(AO), %xmm10 addps %xmm9, %xmm3 movaps 128 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movaps 84 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movaps 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm2 movaps 92 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd -20 * SIZE(AO), %xmm10 addps %xmm11, %xmm3 movaps 144 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, 
%xmm1 movaps 104 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movaps 108 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd -18 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movaps 160 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movaps 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movaps 124 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd -8 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movaps 176 * SIZE(BO), %xmm15 addq $ 16 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 16 * SIZE(BO), %xmm9 addq $ 2 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L36 ALIGN_4 .L38: movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 pshufd $0x50, %xmm0, %xmm4 mulps %xmm7, %xmm4 addps %xmm8, %xmm4 movlps %xmm4, 0 * SIZE(CO1) movhps %xmm4, 2 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm8 movhps 2 * SIZE(CO2), %xmm8 pshufd $0x50, %xmm1, %xmm4 mulps %xmm7, %xmm4 addps %xmm8, %xmm4 movlps %xmm4, 0 * SIZE(CO2) movhps %xmm4, 2 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm8 movhps 2 * SIZE(CO1, LDC, 2), %xmm8 pshufd $0x50, %xmm2, %xmm4 mulps %xmm7, %xmm4 addps %xmm8, %xmm4 movlps %xmm4, 0 * SIZE(CO1, LDC, 2) movhps %xmm4, 2 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm8 movhps 2 * SIZE(CO2, LDC, 2), %xmm8 pshufd $0x50, %xmm3, %xmm4 mulps %xmm7, %xmm4 addps %xmm8, %xmm4 movlps %xmm4, 0 * SIZE(CO2, LDC, 2) movhps %xmm4, 2 * SIZE(CO2, LDC, 2) addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L40: testq $1, M je .L49 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO leaq (BO, %rax, 8), BO #endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 movss 32 * SIZE(BO), %xmm13 movss 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L45 ALIGN_4 .L42: mulss %xmm8, %xmm9 addss %xmm9, %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 4 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 addss %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 addss %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulss %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addss %xmm9, %xmm3 movss 64 * SIZE(BO), %xmm9 mulss %xmm8, %xmm11 addss %xmm11, %xmm0 movss 20 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 addss %xmm11, %xmm1 movss 24 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 addss %xmm11, %xmm2 movss 28 * SIZE(BO), %xmm11 mulss %xmm8, %xmm11 movss -30 * SIZE(AO), %xmm8 addss %xmm11, %xmm3 movss 80 * SIZE(BO), %xmm11 mulss %xmm8, %xmm13 addss %xmm13, 
%xmm0 movss 36 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 addss %xmm13, %xmm1 movss 40 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 addss %xmm13, %xmm2 movss 44 * SIZE(BO), %xmm13 mulss %xmm8, %xmm13 movss -29 * SIZE(AO), %xmm8 addss %xmm13, %xmm3 movss 96 * SIZE(BO), %xmm13 mulss %xmm8, %xmm15 addss %xmm15, %xmm0 movss 52 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 addss %xmm15, %xmm1 movss 56 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 addss %xmm15, %xmm2 movss 60 * SIZE(BO), %xmm15 mulss %xmm8, %xmm15 movss -24 * SIZE(AO), %xmm8 addss %xmm15, %xmm3 movss 112 * SIZE(BO), %xmm15 mulss %xmm10, %xmm9 addss %xmm9, %xmm0 movss 68 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 addss %xmm9, %xmm1 movss 72 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 addss %xmm9, %xmm2 movss 76 * SIZE(BO), %xmm9 mulss %xmm10, %xmm9 movss -27 * SIZE(AO), %xmm10 addss %xmm9, %xmm3 movss 128 * SIZE(BO), %xmm9 mulss %xmm10, %xmm11 addss %xmm11, %xmm0 movss 84 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 addss %xmm11, %xmm1 movss 88 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 addss %xmm11, %xmm2 movss 92 * SIZE(BO), %xmm11 mulss %xmm10, %xmm11 movss -26 * SIZE(AO), %xmm10 addss %xmm11, %xmm3 movss 144 * SIZE(BO), %xmm11 mulss %xmm10, %xmm13 addss %xmm13, %xmm0 movss 100 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 addss %xmm13, %xmm1 movss 104 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 addss %xmm13, %xmm2 movss 108 * SIZE(BO), %xmm13 mulss %xmm10, %xmm13 movss -25 * SIZE(AO), %xmm10 addss %xmm13, %xmm3 movss 160 * SIZE(BO), %xmm13 mulss %xmm10, %xmm15 addss %xmm15, %xmm0 movss 116 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 addss %xmm15, %xmm1 movss 120 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 addss %xmm15, %xmm2 movss 124 * SIZE(BO), %xmm15 mulss %xmm10, %xmm15 movss -20 * SIZE(AO), %xmm10 addss %xmm15, %xmm3 movss 176 * SIZE(BO), %xmm15 addq $ 8 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L48 ALIGN_4 .L46: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movss 16 * SIZE(BO), %xmm9 addq $ 1 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L46 ALIGN_4 .L48: movsd 0 * SIZE(CO1), %xmm8 pshufd $0x50, %xmm0, %xmm4 mulps %xmm7, %xmm4 addps %xmm8, %xmm4 movlps %xmm4, 0 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm8 pshufd $0x50, %xmm1, %xmm4 mulps %xmm7, %xmm4 addps %xmm8, %xmm4 movlps %xmm4, 0 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm8 pshufd $0x50, %xmm2, %xmm4 mulps %xmm7, %xmm4 addps %xmm8, %xmm4 movlps %xmm4, 0 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm8 pshufd $0x50, %xmm3, %xmm4 mulps %xmm7, %xmm4 addps %xmm8, %xmm4 movlps %xmm4, 0 * SIZE(CO2, LDC, 2) ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leaq (C, LDC, 4), C # c += 4 * ldc decq J # j -- jg .L01 .L50: testq $2, N je .L100 .L51: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $2, %rax jle .L53 ALIGN_4 .L52: #if defined(PENTIUM4) || defined(GENERIC) movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 movss 2 * SIZE(B), %xmm2 movss 3 * SIZE(B), %xmm3 movss 4 * SIZE(B), %xmm4 movss 5 * SIZE(B), %xmm5 movss 6 * SIZE(B), %xmm6 movss 7 * SIZE(B), %xmm7 PREFETCHNTA 32 * SIZE(B) shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 
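/*
 * .L52 packs the current two-column slice of B into the aligned BUFFER:
 * each scalar of B is splatted across a full 4-wide vector (movss plus
 * shufps $0 on the PENTIUM4/GENERIC path, movd plus punpckldq stored
 * twice on the OPTERON/BARCELONA/SHANGHAI path), presumably so the
 * compute loops below can apply one B value to four A values with a
 * single aligned mulps.
 */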
shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 shufps $0, %xmm4, %xmm4 shufps $0, %xmm5, %xmm5 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO #endif #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCHNTA 32 * SIZE(B) movd 0 * SIZE(B), %mm0 movd 1 * SIZE(B), %mm1 movd 2 * SIZE(B), %mm2 movd 3 * SIZE(B), %mm3 movd 4 * SIZE(B), %mm4 movd 5 * SIZE(B), %mm5 movd 6 * SIZE(B), %mm6 movd 7 * SIZE(B), %mm7 punpckldq %mm0, %mm0 punpckldq %mm1, %mm1 punpckldq %mm2, %mm2 punpckldq %mm3, %mm3 punpckldq %mm4, %mm4 punpckldq %mm5, %mm5 punpckldq %mm6, %mm6 punpckldq %mm7, %mm7 movq %mm0, 0 * SIZE(BO) movq %mm0, 2 * SIZE(BO) movq %mm1, 4 * SIZE(BO) movq %mm1, 6 * SIZE(BO) movq %mm2, 8 * SIZE(BO) movq %mm2, 10 * SIZE(BO) movq %mm3, 12 * SIZE(BO) movq %mm3, 14 * SIZE(BO) movq %mm4, 16 * SIZE(BO) movq %mm4, 18 * SIZE(BO) movq %mm5, 20 * SIZE(BO) movq %mm5, 22 * SIZE(BO) movq %mm6, 24 * SIZE(BO) movq %mm6, 26 * SIZE(BO) movq %mm7, 28 * SIZE(BO) movq %mm7, 30 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO #endif decq %rax jne .L52 ALIGN_4 .L53: movq K, %rax andq $3, %rax BRANCH jle .L60 ALIGN_4 .L54: #if defined(PENTIUM4) || defined(GENERIC) movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) #endif #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) movd 0 * SIZE(B), %mm0 movd 1 * SIZE(B), %mm1 punpckldq %mm0, %mm0 punpckldq %mm1, %mm1 movq %mm0, 0 * SIZE(BO) movq %mm0, 2 * SIZE(BO) movq %mm1, 4 * SIZE(BO) movq %mm1, 6 * SIZE(BO) #endif addq $ 2 * SIZE, B addq $ 8 * SIZE, BO decq %rax jne .L54 ALIGN_4 .L60: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a movq M, I sarq $3, I # i = (m >> 3) jle .L70 ALIGN_4 .L61: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(AO), %xmm12 movaps 16 * SIZE(AO), %xmm14 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 PREFETCHW 15 * SIZE(CO1) pxor %xmm4, %xmm4 PREFETCHW 15 * SIZE(CO2) pxor %xmm5, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulps %xmm8, %xmm9 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -28 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps -24 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -20 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), 
%xmm8 addps %xmm9, %xmm4 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 32 * SIZE(AO), %xmm8 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 16 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps -12 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps -8 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps -4 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 80 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps 48 * SIZE(AO), %xmm10 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 32 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 4 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm5 movaps 8 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 12 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 movaps 96 * SIZE(BO), %xmm13 addps %xmm12, %xmm5 movaps 64 * SIZE(AO), %xmm12 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 48 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 20 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 24 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 28 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 movaps 112 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 80 * SIZE(AO), %xmm14 addq $64 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -28 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps -24 * SIZE(AO), %xmm8 addq $8 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L68: movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 4 * SIZE(CO1), %xmm9 movhps 6 * SIZE(CO1), %xmm9 movsd 8 * SIZE(CO1), %xmm10 movhps 10 * SIZE(CO1), %xmm10 movsd 12 * SIZE(CO1), %xmm11 movhps 14 * SIZE(CO1), %xmm11 pshufd $0x50, %xmm0, %xmm2 pshufd $0xfa, %xmm0, %xmm0 pshufd $0x50, %xmm4, %xmm3 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm7, %xmm2 mulps %xmm7, %xmm0 mulps %xmm7, %xmm3 mulps %xmm7, %xmm4 addps %xmm8, %xmm2 addps %xmm9, %xmm0 addps %xmm10, %xmm3 addps %xmm11, %xmm4 movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) movlps %xmm0, 4 * SIZE(CO1) movhps %xmm0, 6 * SIZE(CO1) movlps %xmm3, 8 * SIZE(CO1) movhps %xmm3, 10 * SIZE(CO1) movlps %xmm4, 12 * SIZE(CO1) movhps %xmm4, 14 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm8 movhps 2 * SIZE(CO2), 
%xmm8 movsd 4 * SIZE(CO2), %xmm9 movhps 6 * SIZE(CO2), %xmm9 movsd 8 * SIZE(CO2), %xmm10 movhps 10 * SIZE(CO2), %xmm10 movsd 12 * SIZE(CO2), %xmm11 movhps 14 * SIZE(CO2), %xmm11 pshufd $0x50, %xmm1, %xmm2 pshufd $0xfa, %xmm1, %xmm1 pshufd $0x50, %xmm5, %xmm3 pshufd $0xfa, %xmm5, %xmm5 mulps %xmm7, %xmm2 mulps %xmm7, %xmm1 mulps %xmm7, %xmm3 mulps %xmm7, %xmm5 addps %xmm8, %xmm2 addps %xmm9, %xmm1 addps %xmm10, %xmm3 addps %xmm11, %xmm5 movlps %xmm2, 0 * SIZE(CO2) movhps %xmm2, 2 * SIZE(CO2) movlps %xmm1, 4 * SIZE(CO2) movhps %xmm1, 6 * SIZE(CO2) movlps %xmm3, 8 * SIZE(CO2) movhps %xmm3, 10 * SIZE(CO2) movlps %xmm5, 12 * SIZE(CO2) movhps %xmm5, 14 * SIZE(CO2) addq $16 * SIZE, CO1 # coffset += 4 addq $16 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L61 ALIGN_4 .L70: testq $4, M je .L80 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulps %xmm8, %xmm9 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -28 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps -24 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 20 * SIZE(BO), %xmm8 addps %xmm11, %xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm8, %xmm1 movaps -20 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 mulps %xmm10, %xmm13 mulps 36 * SIZE(BO), %xmm10 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm10, %xmm1 movaps -12 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 mulps 44 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps -8 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 52 * SIZE(BO), %xmm10 addps %xmm15, %xmm0 movaps 56 * SIZE(BO), %xmm15 addps %xmm10, %xmm1 movaps -4 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 60 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 16 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -28 * SIZE(AO), %xmm8 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L76 ALIGN_4 .L78: addps %xmm2, %xmm0 addps %xmm3, %xmm1 movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 4 * SIZE(CO1), %xmm9 movhps 6 * SIZE(CO1), %xmm9 pshufd $0x50, %xmm0, %xmm2 pshufd $0xfa, %xmm0, %xmm0 mulps %xmm7, %xmm2 mulps 
%xmm7, %xmm0 addps %xmm8, %xmm2 addps %xmm9, %xmm0 movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) movlps %xmm0, 4 * SIZE(CO1) movhps %xmm0, 6 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm8 movhps 2 * SIZE(CO2), %xmm8 movsd 4 * SIZE(CO2), %xmm9 movhps 6 * SIZE(CO2), %xmm9 pshufd $0x50, %xmm1, %xmm2 pshufd $0xfa, %xmm1, %xmm1 mulps %xmm7, %xmm2 mulps %xmm7, %xmm1 addps %xmm8, %xmm2 addps %xmm9, %xmm1 movlps %xmm2, 0 * SIZE(CO2) movhps %xmm2, 2 * SIZE(CO2) movlps %xmm1, 4 * SIZE(CO2) movhps %xmm1, 6 * SIZE(CO2) addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 ALIGN_4 .L80: testq $2, M je .L90 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L85 ALIGN_4 .L82: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -28 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd -26 * SIZE(AO), %xmm8 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movaps 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd -16 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movaps 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd -22 * SIZE(AO), %xmm10 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movaps 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd -20 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movaps 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd -18 * SIZE(AO), %xmm10 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movaps 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd -8 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movaps 112 * SIZE(BO), %xmm15 addq $16 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L82 ALIGN_4 .L85: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L88 ALIGN_4 .L86: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L86 ALIGN_4 .L88: addps %xmm2, %xmm0 addps %xmm3, %xmm1 movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 pshufd $0x50, %xmm0, %xmm2 mulps %xmm7, %xmm2 addps %xmm8, %xmm2 movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) movsd 
0 * SIZE(CO2), %xmm8 movhps 2 * SIZE(CO2), %xmm8 pshufd $0x50, %xmm1, %xmm2 mulps %xmm7, %xmm2 addps %xmm8, %xmm2 movlps %xmm2, 0 * SIZE(CO2) movhps %xmm2, 2 * SIZE(CO2) addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L90: testq $1, M je .L99 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 movss 32 * SIZE(BO), %xmm13 movss 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L95 ALIGN_4 .L92: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movss 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movss 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movss 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movss -29 * SIZE(AO), %xmm8 addps %xmm11, %xmm1 movss 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movss 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movss -24 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movss 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movss 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movss -27 * SIZE(AO), %xmm10 addps %xmm13, %xmm1 movss 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movss 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movss -26 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movss 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movss 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movss -25 * SIZE(AO), %xmm10 addps %xmm15, %xmm1 movss 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movss 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movss -20 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movss 112 * SIZE(BO), %xmm15 addq $ 8 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L92 ALIGN_4 .L95: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movss 8 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L96 ALIGN_4 .L98: addss %xmm2, %xmm0 addss %xmm3, %xmm1 movsd 0 * SIZE(CO1), %xmm8 pshufd $0x50, %xmm0, %xmm2 mulps %xmm7, %xmm2 addps %xmm8, %xmm2 movlps %xmm2, 0 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm8 pshufd $0x50, %xmm1, %xmm2 mulps %xmm7, %xmm2 addps %xmm8, %xmm2 movlps %xmm2, 0 * SIZE(CO2) ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leaq (C, LDC, 2), C # c += 4 * ldc ALIGN_4 .L100: testq $1, N je .L999 .L101: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $3, %rax jle 
.L103 ALIGN_4 .L102: #if defined(PENTIUM4) || defined(GENERIC) movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 movss 2 * SIZE(B), %xmm2 movss 3 * SIZE(B), %xmm3 movss 4 * SIZE(B), %xmm4 movss 5 * SIZE(B), %xmm5 movss 6 * SIZE(B), %xmm6 movss 7 * SIZE(B), %xmm7 PREFETCHNTA 32 * SIZE(B) shufps $0, %xmm0, %xmm0 shufps $0, %xmm1, %xmm1 shufps $0, %xmm2, %xmm2 shufps $0, %xmm3, %xmm3 shufps $0, %xmm4, %xmm4 shufps $0, %xmm5, %xmm5 shufps $0, %xmm6, %xmm6 shufps $0, %xmm7, %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO #endif #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCHNTA 32 * SIZE(B) movd 0 * SIZE(B), %mm0 movd 1 * SIZE(B), %mm1 movd 2 * SIZE(B), %mm2 movd 3 * SIZE(B), %mm3 movd 4 * SIZE(B), %mm4 movd 5 * SIZE(B), %mm5 movd 6 * SIZE(B), %mm6 movd 7 * SIZE(B), %mm7 punpckldq %mm0, %mm0 punpckldq %mm1, %mm1 punpckldq %mm2, %mm2 punpckldq %mm3, %mm3 punpckldq %mm4, %mm4 punpckldq %mm5, %mm5 punpckldq %mm6, %mm6 punpckldq %mm7, %mm7 movq %mm0, 0 * SIZE(BO) movq %mm0, 2 * SIZE(BO) movq %mm1, 4 * SIZE(BO) movq %mm1, 6 * SIZE(BO) movq %mm2, 8 * SIZE(BO) movq %mm2, 10 * SIZE(BO) movq %mm3, 12 * SIZE(BO) movq %mm3, 14 * SIZE(BO) movq %mm4, 16 * SIZE(BO) movq %mm4, 18 * SIZE(BO) movq %mm5, 20 * SIZE(BO) movq %mm5, 22 * SIZE(BO) movq %mm6, 24 * SIZE(BO) movq %mm6, 26 * SIZE(BO) movq %mm7, 28 * SIZE(BO) movq %mm7, 30 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO #endif decq %rax jne .L102 ALIGN_4 .L103: movq K, %rax andq $7, %rax BRANCH jle .L110 ALIGN_4 .L104: #if defined(PENTIUM4) || defined(GENERIC) movss 0 * SIZE(B), %xmm0 shufps $0, %xmm0, %xmm0 movaps %xmm0, 0 * SIZE(BO) #endif #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) movd 0 * SIZE(B), %mm0 punpckldq %mm0, %mm0 movq %mm0, 0 * SIZE(BO) movq %mm0, 2 * SIZE(BO) #endif addq $ 1 * SIZE, B addq $ 4 * SIZE, BO decq %rax jne .L104 ALIGN_4 .L110: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a movq M, I sarq $3, I # i = (m >> 3) jle .L120 ALIGN_4 .L111: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(AO), %xmm12 movaps 16 * SIZE(AO), %xmm14 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 PREFETCHW 15 * SIZE(CO1) pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L115 ALIGN_4 .L112: mulps %xmm9, %xmm8 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps -28 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps -24 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 mulps %xmm9, %xmm8 mulps -20 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps 32 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 
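/*
 * The software prefetch of the A panel just below is only emitted for
 * OPTERON/BARCELONA/SHANGHAI builds; the PENTIUM4/GENERIC configuration
 * omits it, presumably leaving that work to the hardware prefetcher.
 */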
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm9, %xmm10 mulps -12 * SIZE(AO), %xmm9 addps %xmm10, %xmm0 movaps -8 * SIZE(AO), %xmm10 addps %xmm9, %xmm4 movaps 12 * SIZE(BO), %xmm9 mulps %xmm9, %xmm10 mulps -4 * SIZE(AO), %xmm9 addps %xmm10, %xmm0 movaps 48 * SIZE(AO), %xmm10 addps %xmm9, %xmm4 movaps 32 * SIZE(BO), %xmm9 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm11, %xmm12 mulps 4 * SIZE(AO), %xmm11 addps %xmm12, %xmm0 movaps 8 * SIZE(AO), %xmm12 addps %xmm11, %xmm4 movaps 20 * SIZE(BO), %xmm11 mulps %xmm11, %xmm12 mulps 12 * SIZE(AO), %xmm11 addps %xmm12, %xmm0 movaps 64 * SIZE(AO), %xmm12 addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm11, %xmm14 mulps 20 * SIZE(AO), %xmm11 addps %xmm14, %xmm0 movaps 24 * SIZE(AO), %xmm14 addps %xmm11, %xmm4 movaps 28 * SIZE(BO), %xmm11 mulps %xmm11, %xmm14 mulps 28 * SIZE(AO), %xmm11 addps %xmm14, %xmm0 movaps 80 * SIZE(AO), %xmm14 addps %xmm11, %xmm4 movaps 48 * SIZE(BO), %xmm11 addq $64 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L112 ALIGN_4 .L115: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulps %xmm9, %xmm8 mulps -28 * SIZE(AO), %xmm9 addps %xmm8, %xmm0 movaps -24 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 addq $8 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L116 ALIGN_4 .L118: movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 4 * SIZE(CO1), %xmm9 movhps 6 * SIZE(CO1), %xmm9 movsd 8 * SIZE(CO1), %xmm10 movhps 10 * SIZE(CO1), %xmm10 movsd 12 * SIZE(CO1), %xmm11 movhps 14 * SIZE(CO1), %xmm11 pshufd $0x50, %xmm0, %xmm2 pshufd $0xfa, %xmm0, %xmm0 pshufd $0x50, %xmm4, %xmm3 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm7, %xmm2 mulps %xmm7, %xmm0 mulps %xmm7, %xmm3 mulps %xmm7, %xmm4 addps %xmm8, %xmm2 addps %xmm9, %xmm0 addps %xmm10, %xmm3 addps %xmm11, %xmm4 movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) movlps %xmm0, 4 * SIZE(CO1) movhps %xmm0, 6 * SIZE(CO1) movlps %xmm3, 8 * SIZE(CO1) movhps %xmm3, 10 * SIZE(CO1) movlps %xmm4, 12 * SIZE(CO1) movhps %xmm4, 14 * SIZE(CO1) addq $16 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L111 ALIGN_4 .L120: testq $4, M je .L130 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L125 ALIGN_4 .L122: mulps %xmm8, %xmm9 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps -28 * SIZE(AO), %xmm8 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 32 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps -24 * SIZE(AO), %xmm8 mulps 8 * SIZE(BO), %xmm8 addps %xmm8, %xmm2 movaps -20 * SIZE(AO), %xmm8 mulps 12 * SIZE(BO), %xmm8 addps 
%xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm11 movaps -12 * SIZE(AO), %xmm10 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 48 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps -8 * SIZE(AO), %xmm10 mulps 24 * SIZE(BO), %xmm10 addps %xmm10, %xmm2 movaps -4 * SIZE(AO), %xmm10 mulps 28 * SIZE(BO), %xmm10 addps %xmm10, %xmm3 movaps 16 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L122 ALIGN_4 .L125: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L128 ALIGN_4 .L126: mulps %xmm8, %xmm9 movaps -28 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L126 ALIGN_4 .L128: addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm2, %xmm0 movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 4 * SIZE(CO1), %xmm9 movhps 6 * SIZE(CO1), %xmm9 pshufd $0x50, %xmm0, %xmm2 pshufd $0xfa, %xmm0, %xmm0 mulps %xmm7, %xmm2 mulps %xmm7, %xmm0 addps %xmm8, %xmm2 addps %xmm9, %xmm0 movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) movlps %xmm0, 4 * SIZE(CO1) movhps %xmm0, 6 * SIZE(CO1) addq $8 * SIZE, CO1 # coffset += 4 ALIGN_4 .L130: testq $2, M je .L140 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L135 ALIGN_4 .L132: mulps %xmm8, %xmm9 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -28 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -26 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd -16 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 32 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 movsd -22 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd -20 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd -18 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movaps 28 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd -8 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movaps 48 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L132 ALIGN_4 .L135: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L138 ALIGN_4 .L136: mulps %xmm8, %xmm9 movsd -30 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L136 ALIGN_4 .L138: addps %xmm1, %xmm0 movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 pshufd $0x50, %xmm0, %xmm2 mulps %xmm7, %xmm2 addps %xmm8, %xmm2 
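/*
 * Store pattern used for every C update in this kernel: pshufd $0x50
 * (and $0xfa for the upper half) duplicates each real accumulator
 * element into an adjacent pair; mulps with ALPHA in %xmm7, which
 * appears to hold (alpha_r, alpha_i, alpha_r, alpha_i) as in the
 * zgemm3m kernel that follows, yields (acc*alpha_r, acc*alpha_i); and
 * addps folds that into the interleaved complex C. In effect this is
 * the gemm3m-style update C += alpha * acc with a purely real
 * accumulator.
 */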
movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 2 * SIZE(CO1) addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L140: testq $1, M je .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L145 ALIGN_4 .L142: mulss %xmm8, %xmm9 #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss -31 * SIZE(AO), %xmm8 mulss 4 * SIZE(BO), %xmm8 addss %xmm9, %xmm0 movss 32 * SIZE(BO), %xmm9 addss %xmm8, %xmm1 movss -30 * SIZE(AO), %xmm8 mulss 8 * SIZE(BO), %xmm8 addss %xmm8, %xmm2 movss -29 * SIZE(AO), %xmm8 mulss 12 * SIZE(BO), %xmm8 addss %xmm8, %xmm3 movss -24 * SIZE(AO), %xmm8 mulss %xmm10, %xmm11 movss -27 * SIZE(AO), %xmm10 mulss 20 * SIZE(BO), %xmm10 addss %xmm11, %xmm0 movss 48 * SIZE(BO), %xmm11 addss %xmm10, %xmm1 movss -26 * SIZE(AO), %xmm10 mulss 24 * SIZE(BO), %xmm10 addss %xmm10, %xmm2 movss -25 * SIZE(AO), %xmm10 mulss 28 * SIZE(BO), %xmm10 addss %xmm10, %xmm3 movss -20 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L142 ALIGN_4 .L145: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L148 ALIGN_4 .L146: mulss %xmm8, %xmm9 movss -31 * SIZE(AO), %xmm8 addss %xmm9, %xmm0 movss 4 * SIZE(BO), %xmm9 addq $1 * SIZE, AO addq $4 * SIZE, BO decq %rax jg .L146 ALIGN_4 .L148: addss %xmm1, %xmm0 addss %xmm3, %xmm2 addss %xmm2, %xmm0 movsd 0 * SIZE(CO1), %xmm8 pshufd $0x50, %xmm0, %xmm2 mulps %xmm7, %xmm2 addps %xmm8, %xmm2 movlps %xmm2, 0 * SIZE(CO1) ALIGN_4 .L999: movq %rbx, %rsp EMMS movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm3m_kernel_8x4_sse3.S000066400000000000000000002012131313527062700220640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %r12 #define BO %r13 #define CO1 %r14 #define CO2 %r15 #define BB %rbp #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define ALPHA 0(%rsp) #define J 16(%rsp) #define OFFSET 24(%rsp) #define KK 32(%rsp) #define KKK 40(%rsp) #define BUFFER 128(%rsp) #define PREFETCH prefetcht0 #define PREFETCHSIZE 320 #define KERNEL1(address) \ mulps %xmm8, %xmm9; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AO); \ addps %xmm9, %xmm0; \ movshdup 0 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm1; \ movsldup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm2; \ movshdup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ movaps 4 * SIZE + (address) * SIZE(AO), %xmm8; \ addps %xmm9, %xmm3; \ movsldup 0 * SIZE + (address) * SIZE(BO), %xmm9 #define KERNEL2(address) \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm4; \ movshdup 0 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm5; \ movsldup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm6; \ movshdup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ movaps 8 * SIZE + (address) * SIZE(AO), %xmm8; \ addps %xmm9, %xmm7; \ movsldup 8 * SIZE + (address) * SIZE(BO), %xmm9 #define KERNEL3(address) \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm0; \ movshdup 8 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm1; \ movsldup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm2; \ movshdup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ movaps 12 * SIZE + (address) * SIZE(AO), %xmm8; \ addps %xmm9, %xmm3; \ movsldup 8 * SIZE + (address) * SIZE(BO), %xmm9 #define KERNEL4(address) \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm4; \ movshdup 8 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ 
addps %xmm9, %xmm5; \ movsldup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm6; \ movshdup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ movaps 64 * SIZE + (address) * SIZE(AO), %xmm8; \ addps %xmm9, %xmm7; \ movsldup 64 * SIZE + (address) * SIZE(BO), %xmm9 #define KERNEL5(address) \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm0; \ movshdup 16 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm1; \ movsldup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm2; \ movshdup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ movaps 20 * SIZE + (address) * SIZE(AO), %xmm10; \ addps %xmm11, %xmm3; \ movsldup 16 * SIZE + (address) * SIZE(BO), %xmm11 #define KERNEL6(address) \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm4; \ movshdup 16 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm5; \ movsldup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm6; \ movshdup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ movaps 24 * SIZE + (address) * SIZE(AO), %xmm10; \ addps %xmm11, %xmm7; \ movsldup 24 * SIZE + (address) * SIZE(BO), %xmm11 #define KERNEL7(address) \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm0; \ movshdup 24 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm1; \ movsldup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm2; \ movshdup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ movaps 28 * SIZE + (address) * SIZE(AO), %xmm10; \ addps %xmm11, %xmm3; \ movsldup 24 * SIZE + (address) * SIZE(BO), %xmm11 #define KERNEL8(address) \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm4; \ movshdup 24 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm5; \ movsldup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm6; \ movshdup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ movaps 80 * SIZE + (address) * SIZE(AO), %xmm10; \ addps %xmm11, %xmm7; \ movsldup 80 * SIZE + (address) * SIZE(BO), %xmm11 #define KERNEL9(address) \ mulps %xmm12, %xmm13; \ PREFETCH (PREFETCHSIZE + 32) * SIZE + (address) * SIZE(AO); \ addps %xmm13, %xmm0; \ movshdup 32 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm1; \ movsldup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm2; \ movshdup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ movaps 36 * SIZE + (address) * SIZE(AO), %xmm12; \ addps %xmm13, %xmm3; \ movsldup 32 * SIZE + (address) * SIZE(BO), %xmm13 #define KERNEL10(address) \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm4; \ movshdup 32 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm5; \ movsldup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm6; \ movshdup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ movaps 40 * SIZE + (address) * SIZE(AO), %xmm12; \ addps %xmm13, %xmm7; \ movsldup 40 * SIZE + (address) * SIZE(BO), %xmm13 #define KERNEL11(address) \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm0; \ movshdup 40 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm1; \ movsldup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm2; \ movshdup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ 
mulps %xmm12, %xmm13; \ movaps 44 * SIZE + (address) * SIZE(AO), %xmm12; \ addps %xmm13, %xmm3; \ movsldup 40 * SIZE + (address) * SIZE(BO), %xmm13 #define KERNEL12(address) \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm4; \ movshdup 40 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm5; \ movsldup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm6; \ movshdup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ movaps 96 * SIZE + (address) * SIZE(AO), %xmm12; \ addps %xmm13, %xmm7; \ movsldup 96 * SIZE + (address) * SIZE(BO), %xmm13 #define KERNEL13(address) \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm0; \ movshdup 48 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm1; \ movsldup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm2; \ movshdup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ movaps 52 * SIZE + (address) * SIZE(AO), %xmm14; \ addps %xmm15, %xmm3; \ movsldup 48 * SIZE + (address) * SIZE(BO), %xmm15 #define KERNEL14(address) \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm4; \ movshdup 48 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm5; \ movsldup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm6; \ movshdup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ movaps 56 * SIZE + (address) * SIZE(AO), %xmm14; \ addps %xmm15, %xmm7; \ movsldup 56 * SIZE + (address) * SIZE(BO), %xmm15 #define KERNEL15(address) \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm0; \ movshdup 56 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm1; \ movsldup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm2; \ movshdup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ movaps 60 * SIZE + (address) * SIZE(AO), %xmm14; \ addps %xmm15, %xmm3; \ movsldup 56 * SIZE + (address) * SIZE(BO), %xmm15 #define KERNEL16(address) \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm4; \ movshdup 56 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm5; \ movsldup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm6; \ movshdup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ movaps 112 * SIZE + (address) * SIZE(AO), %xmm14; \ addps %xmm15, %xmm7; \ movsldup 112 * SIZE + (address) * SIZE(BO), %xmm15 #if defined(OS_LINUX) && defined(CORE_BARCELONA) .align 32768 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm4 #endif movaps %xmm3, %xmm0 movss OLD_ALPHA_I, %xmm1 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm4 #endif #endif movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-1024, %rsp # align stack STACK_TOUCHING movss %xmm0, 0 + ALPHA movss %xmm1, 4 + ALPHA movss %xmm0, 8 + ALPHA movss %xmm1, 12 + ALPHA #ifdef 
TRMMKERNEL movsd %xmm4, OFFSET movsd %xmm4, KK #ifndef LEFT negq KK #endif #endif salq $ZBASE_SHIFT, LDC movq N, J sarq $2, J # j = (n >> 2) jle .L50 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $2, %rax jle .L03 ALIGN_4 .L02: movddup 0 * SIZE(B), %xmm0 movddup 2 * SIZE(B), %xmm1 movddup 4 * SIZE(B), %xmm2 movddup 6 * SIZE(B), %xmm3 movddup 8 * SIZE(B), %xmm4 movddup 10 * SIZE(B), %xmm5 movddup 12 * SIZE(B), %xmm6 movddup 14 * SIZE(B), %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) prefetcht1 128 * SIZE(BO) prefetcht0 112 * SIZE(B) addq $16 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L02 ALIGN_4 .L03: movq K, %rax andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movddup 0 * SIZE(B), %xmm0 movddup 2 * SIZE(B), %xmm1 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) addq $4 * SIZE, B addq $8 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L10: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a leaq 112 * SIZE(B), BB movq M, I sarq $3, I # i = (m >> 3) jle .L20 ALIGN_4 .L11: prefetcht0 0 * SIZE(BB) prefetcht0 8 * SIZE(BB) subq $-16 * SIZE, BB #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 32 * SIZE(AO), %xmm12 movaps 48 * SIZE(AO), %xmm14 movsldup 0 * SIZE(BO), %xmm9 movsldup 16 * SIZE(BO), %xmm11 movsldup 32 * SIZE(BO), %xmm13 movsldup 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 prefetchnta 15 * SIZE(CO1) pxor %xmm4, %xmm4 prefetchnta 15 * SIZE(CO2) pxor %xmm5, %xmm5 prefetchnta 15 * SIZE(CO1, LDC, 2) pxor %xmm6, %xmm6 prefetchnta 15 * SIZE(CO2, LDC, 2) pxor %xmm7, %xmm7 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $4, %rax #endif movq %rax, KKK #endif #if 1 andq $-8, %rax salq $4, %rax je .L15 .L1X: KERNEL1 (64 * 0) KERNEL2 (64 * 0) KERNEL3 (64 * 0) KERNEL4 (64 * 0) KERNEL5 (64 * 0) KERNEL6 (64 * 0) KERNEL7 (64 * 0) KERNEL8 (64 * 0) KERNEL9 (64 * 0) KERNEL10(64 * 0) KERNEL11(64 * 0) KERNEL12(64 * 0) KERNEL13(64 * 0) KERNEL14(64 * 0) KERNEL15(64 * 0) KERNEL16(64 * 0) cmpq $128 * 1, %rax NOBRANCH jle .L12 KERNEL1 (64 * 1) KERNEL2 (64 * 1) KERNEL3 (64 * 1) KERNEL4 (64 * 1) KERNEL5 (64 * 1) KERNEL6 (64 * 1) KERNEL7 (64 * 1) KERNEL8 (64 * 1) KERNEL9 (64 * 1) KERNEL10(64 * 1) KERNEL11(64 * 1) KERNEL12(64 * 1) KERNEL13(64 * 1) KERNEL14(64 * 1) KERNEL15(64 * 1) KERNEL16(64 * 1) cmpq $128 * 2, %rax NOBRANCH jle .L12 KERNEL1 (64 * 2) KERNEL2 (64 * 2) KERNEL3 (64 * 2) KERNEL4 (64 * 2) KERNEL5 (64 * 2) KERNEL6 (64 * 2) KERNEL7 (64 * 2) KERNEL8 (64 * 2) KERNEL9 (64 * 2) KERNEL10(64 * 2) KERNEL11(64 * 2) KERNEL12(64 * 2) KERNEL13(64 * 2) KERNEL14(64 * 2) KERNEL15(64 * 2) KERNEL16(64 * 2) cmpq $128 * 3, %rax NOBRANCH jle .L12 KERNEL1 (64 * 3) KERNEL2 (64 * 3) KERNEL3 (64 * 3) KERNEL4 (64 * 3) KERNEL5 (64 * 3) KERNEL6 (64 * 3) KERNEL7 (64 * 3) KERNEL8 (64 * 3) KERNEL9 (64 * 3) KERNEL10(64 * 
3) KERNEL11(64 * 3) KERNEL12(64 * 3) KERNEL13(64 * 3) KERNEL14(64 * 3) KERNEL15(64 * 3) KERNEL16(64 * 3) cmpq $128 * 4, %rax NOBRANCH jle .L12 KERNEL1 (64 * 4) KERNEL2 (64 * 4) KERNEL3 (64 * 4) KERNEL4 (64 * 4) KERNEL5 (64 * 4) KERNEL6 (64 * 4) KERNEL7 (64 * 4) KERNEL8 (64 * 4) KERNEL9 (64 * 4) KERNEL10(64 * 4) KERNEL11(64 * 4) KERNEL12(64 * 4) KERNEL13(64 * 4) KERNEL14(64 * 4) KERNEL15(64 * 4) KERNEL16(64 * 4) cmpq $128 * 5, %rax NOBRANCH jle .L12 KERNEL1 (64 * 5) KERNEL2 (64 * 5) KERNEL3 (64 * 5) KERNEL4 (64 * 5) KERNEL5 (64 * 5) KERNEL6 (64 * 5) KERNEL7 (64 * 5) KERNEL8 (64 * 5) KERNEL9 (64 * 5) KERNEL10(64 * 5) KERNEL11(64 * 5) KERNEL12(64 * 5) KERNEL13(64 * 5) KERNEL14(64 * 5) KERNEL15(64 * 5) KERNEL16(64 * 5) cmpq $128 * 6, %rax NOBRANCH jle .L12 KERNEL1 (64 * 6) KERNEL2 (64 * 6) KERNEL3 (64 * 6) KERNEL4 (64 * 6) KERNEL5 (64 * 6) KERNEL6 (64 * 6) KERNEL7 (64 * 6) KERNEL8 (64 * 6) KERNEL9 (64 * 6) KERNEL10(64 * 6) KERNEL11(64 * 6) KERNEL12(64 * 6) KERNEL13(64 * 6) KERNEL14(64 * 6) KERNEL15(64 * 6) KERNEL16(64 * 6) cmpq $128 * 7, %rax NOBRANCH jle .L12 KERNEL1 (64 * 7) KERNEL2 (64 * 7) KERNEL3 (64 * 7) KERNEL4 (64 * 7) KERNEL5 (64 * 7) KERNEL6 (64 * 7) KERNEL7 (64 * 7) KERNEL8 (64 * 7) KERNEL9 (64 * 7) KERNEL10(64 * 7) KERNEL11(64 * 7) KERNEL12(64 * 7) KERNEL13(64 * 7) KERNEL14(64 * 7) KERNEL15(64 * 7) KERNEL16(64 * 7) addq $64 * 8 * SIZE, AO addq $64 * 8 * SIZE, BO subq $128 * 8, %rax jg .L1X .L12: leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #else sarq $3, %rax je .L15 ALIGN_4 .L12: KERNEL1 (64 * 0) KERNEL2 (64 * 0) KERNEL3 (64 * 0) KERNEL4 (64 * 0) KERNEL5 (64 * 0) KERNEL6 (64 * 0) KERNEL7 (64 * 0) KERNEL8 (64 * 0) KERNEL9 (64 * 0) KERNEL10(64 * 0) KERNEL11(64 * 0) KERNEL12(64 * 0) KERNEL13(64 * 0) KERNEL14(64 * 0) KERNEL15(64 * 0) KERNEL16(64 * 0) addq $64 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L12 #endif ALIGN_4 .L15: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsldup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm4 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm5 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm6 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm7 movsldup 8 * SIZE(BO), %xmm9 addq $8 * SIZE, AO addq $8 * SIZE, BO decq %rax jg .L16 ALIGN_4 .L18: movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 4 * SIZE(CO1), %xmm9 movhps 6 * SIZE(CO1), %xmm9 movsd 8 * SIZE(CO1), %xmm10 movhps 10 * SIZE(CO1), %xmm10 movsd 12 * SIZE(CO1), %xmm11 movhps 14 * SIZE(CO1), %xmm11 pshufd $0x50, %xmm0, %xmm12 pshufd $0xfa, %xmm0, %xmm0 pshufd $0x50, %xmm4, %xmm13 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm15, %xmm12 mulps %xmm15, %xmm0 mulps %xmm15, %xmm13 mulps %xmm15, %xmm4 addps %xmm8, %xmm12 addps %xmm9, %xmm0 addps %xmm10, %xmm13 addps %xmm11, %xmm4 movlps %xmm12, 0 * SIZE(CO1) movhps %xmm12, 2 * SIZE(CO1) movlps %xmm0, 4 * SIZE(CO1) movhps %xmm0, 6 * SIZE(CO1) movlps %xmm13, 8 * SIZE(CO1) movhps %xmm13, 10 * SIZE(CO1) movlps %xmm4, 12 * SIZE(CO1) movhps %xmm4, 14 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm8 movhps 2 * SIZE(CO2), %xmm8 movsd 4 * SIZE(CO2), %xmm9 movhps 6 * SIZE(CO2), %xmm9 movsd 8 * SIZE(CO2), %xmm10 movhps 10 * SIZE(CO2), 
%xmm10 movsd 12 * SIZE(CO2), %xmm11 movhps 14 * SIZE(CO2), %xmm11 pshufd $0x50, %xmm1, %xmm12 pshufd $0xfa, %xmm1, %xmm1 pshufd $0x50, %xmm5, %xmm13 pshufd $0xfa, %xmm5, %xmm5 mulps %xmm15, %xmm12 mulps %xmm15, %xmm1 mulps %xmm15, %xmm13 mulps %xmm15, %xmm5 addps %xmm8, %xmm12 addps %xmm9, %xmm1 addps %xmm10, %xmm13 addps %xmm11, %xmm5 movlps %xmm12, 0 * SIZE(CO2) movhps %xmm12, 2 * SIZE(CO2) movlps %xmm1, 4 * SIZE(CO2) movhps %xmm1, 6 * SIZE(CO2) movlps %xmm13, 8 * SIZE(CO2) movhps %xmm13, 10 * SIZE(CO2) movlps %xmm5, 12 * SIZE(CO2) movhps %xmm5, 14 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm8 movhps 2 * SIZE(CO1, LDC, 2), %xmm8 movsd 4 * SIZE(CO1, LDC, 2), %xmm9 movhps 6 * SIZE(CO1, LDC, 2), %xmm9 movsd 8 * SIZE(CO1, LDC, 2), %xmm10 movhps 10 * SIZE(CO1, LDC, 2), %xmm10 movsd 12 * SIZE(CO1, LDC, 2), %xmm11 movhps 14 * SIZE(CO1, LDC, 2), %xmm11 pshufd $0x50, %xmm2, %xmm12 pshufd $0xfa, %xmm2, %xmm2 pshufd $0x50, %xmm6, %xmm13 pshufd $0xfa, %xmm6, %xmm6 mulps %xmm15, %xmm12 mulps %xmm15, %xmm2 mulps %xmm15, %xmm13 mulps %xmm15, %xmm6 addps %xmm8, %xmm12 addps %xmm9, %xmm2 addps %xmm10, %xmm13 addps %xmm11, %xmm6 movlps %xmm12, 0 * SIZE(CO1, LDC, 2) movhps %xmm12, 2 * SIZE(CO1, LDC, 2) movlps %xmm2, 4 * SIZE(CO1, LDC, 2) movhps %xmm2, 6 * SIZE(CO1, LDC, 2) movlps %xmm13, 8 * SIZE(CO1, LDC, 2) movhps %xmm13, 10 * SIZE(CO1, LDC, 2) movlps %xmm6, 12 * SIZE(CO1, LDC, 2) movhps %xmm6, 14 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm8 movhps 2 * SIZE(CO2, LDC, 2), %xmm8 movsd 4 * SIZE(CO2, LDC, 2), %xmm9 movhps 6 * SIZE(CO2, LDC, 2), %xmm9 movsd 8 * SIZE(CO2, LDC, 2), %xmm10 movhps 10 * SIZE(CO2, LDC, 2), %xmm10 movsd 12 * SIZE(CO2, LDC, 2), %xmm11 movhps 14 * SIZE(CO2, LDC, 2), %xmm11 pshufd $0x50, %xmm3, %xmm12 pshufd $0xfa, %xmm3, %xmm3 pshufd $0x50, %xmm7, %xmm13 pshufd $0xfa, %xmm7, %xmm7 mulps %xmm15, %xmm12 mulps %xmm15, %xmm3 mulps %xmm15, %xmm13 mulps %xmm15, %xmm7 addps %xmm8, %xmm12 addps %xmm9, %xmm3 addps %xmm10, %xmm13 addps %xmm11, %xmm7 movlps %xmm12, 0 * SIZE(CO2, LDC, 2) movhps %xmm12, 2 * SIZE(CO2, LDC, 2) movlps %xmm3, 4 * SIZE(CO2, LDC, 2) movhps %xmm3, 6 * SIZE(CO2, LDC, 2) movlps %xmm13, 8 * SIZE(CO2, LDC, 2) movhps %xmm13, 10 * SIZE(CO2, LDC, 2) movlps %xmm7, 12 * SIZE(CO2, LDC, 2) movhps %xmm7, 14 * SIZE(CO2, LDC, 2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $16 * SIZE, CO1 # coffset += 4 addq $16 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 ALIGN_4 .L20: testq $4, M je .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movsldup 0 * SIZE(BO), %xmm9 movsldup 16 * SIZE(BO), %xmm11 movsldup 32 * SIZE(BO), %xmm13 movsldup 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulps %xmm8, %xmm9 PREFETCH 
(PREFETCHSIZE + 0) * SIZE(AO) addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsldup 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movsldup 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movshdup 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsldup 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movshdup 16 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movsldup 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movshdup 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movsldup 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movshdup 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movsldup 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movshdup 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movaps 32 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movsldup 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movshdup 32 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movsldup 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movshdup 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movaps 20 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movsldup 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movshdup 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movsldup 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movshdup 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movaps 24 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movsldup 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movshdup 48 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movsldup 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movshdup 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movaps 28 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movsldup 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movshdup 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movsldup 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movshdup 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movaps 48 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movsldup 112 * SIZE(BO), %xmm15 addq $32 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsldup 8 * SIZE(BO), %xmm9 addq $4 * SIZE, AO addq $8 * SIZE, BO decq %rax jg .L26 ALIGN_4 .L28: movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 4 * SIZE(CO1), %xmm9 movhps 6 * SIZE(CO1), %xmm9 pshufd $0x50, %xmm0, %xmm12 pshufd $0xfa, %xmm0, %xmm0 mulps %xmm15, %xmm12 mulps %xmm15, %xmm0 addps %xmm8, %xmm12 addps %xmm9, %xmm0 movlps %xmm12, 0 * SIZE(CO1) movhps %xmm12, 2 * SIZE(CO1) movlps %xmm0, 4 * SIZE(CO1) movhps %xmm0, 6 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm8 movhps 2 * SIZE(CO2), %xmm8 movsd 4 * SIZE(CO2), %xmm9 movhps 6 * SIZE(CO2), %xmm9 pshufd $0x50, 
%xmm1, %xmm12 pshufd $0xfa, %xmm1, %xmm1 mulps %xmm15, %xmm12 mulps %xmm15, %xmm1 addps %xmm8, %xmm12 addps %xmm9, %xmm1 movlps %xmm12, 0 * SIZE(CO2) movhps %xmm12, 2 * SIZE(CO2) movlps %xmm1, 4 * SIZE(CO2) movhps %xmm1, 6 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm8 movhps 2 * SIZE(CO1, LDC, 2), %xmm8 movsd 4 * SIZE(CO1, LDC, 2), %xmm9 movhps 6 * SIZE(CO1, LDC, 2), %xmm9 pshufd $0x50, %xmm2, %xmm12 pshufd $0xfa, %xmm2, %xmm2 mulps %xmm15, %xmm12 mulps %xmm15, %xmm2 addps %xmm8, %xmm12 addps %xmm9, %xmm2 movlps %xmm12, 0 * SIZE(CO1, LDC, 2) movhps %xmm12, 2 * SIZE(CO1, LDC, 2) movlps %xmm2, 4 * SIZE(CO1, LDC, 2) movhps %xmm2, 6 * SIZE(CO1, LDC, 2) movsd 0 * SIZE(CO2, LDC, 2), %xmm8 movhps 2 * SIZE(CO2, LDC, 2), %xmm8 movsd 4 * SIZE(CO2, LDC, 2), %xmm9 movhps 6 * SIZE(CO2, LDC, 2), %xmm9 pshufd $0x50, %xmm3, %xmm12 pshufd $0xfa, %xmm3, %xmm3 mulps %xmm15, %xmm12 mulps %xmm15, %xmm3 addps %xmm8, %xmm12 addps %xmm9, %xmm3 movlps %xmm12, 0 * SIZE(CO2, LDC, 2) movhps %xmm12, 2 * SIZE(CO2, LDC, 2) movlps %xmm3, 4 * SIZE(CO2, LDC, 2) movhps %xmm3, 6 * SIZE(CO2, LDC, 2) addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 ALIGN_4 .L30: testq $2, M je .L40 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movddup 0 * SIZE(AO), %xmm8 movddup 8 * SIZE(AO), %xmm10 movsd 0 * SIZE(BO), %xmm9 movsd 32 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L35 ALIGN_4 .L32: shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movsd 12 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsd 16 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movsd 20 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 6 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 24 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movsd 28 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 16 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movsd 64 * SIZE(BO), %xmm9 addps %xmm11, %xmm0 movsd 36 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 10 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 40 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm2 movsd 44 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 12 * SIZE(AO), %xmm10 addps %xmm11, %xmm3 movsd 48 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movsd 52 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 14 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 56 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 addps 
%xmm11, %xmm2 movsd 60 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 24 * SIZE(AO), %xmm10 addps %xmm11, %xmm3 movsd 96 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 addq $2 * SIZE, AO addq $8 * SIZE, BO decq %rax jg .L36 ALIGN_4 .L38: addps %xmm2, %xmm0 addps %xmm3, %xmm1 movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 0 * SIZE(CO2), %xmm9 movhps 2 * SIZE(CO2), %xmm9 pshufd $0x50, %xmm0, %xmm12 pshufd $0xfa, %xmm0, %xmm0 mulps %xmm15, %xmm12 mulps %xmm15, %xmm0 addps %xmm8, %xmm12 addps %xmm9, %xmm0 movlps %xmm12, 0 * SIZE(CO1) movhps %xmm12, 2 * SIZE(CO1) movlps %xmm0, 0 * SIZE(CO2) movhps %xmm0, 2 * SIZE(CO2) movsd 0 * SIZE(CO1, LDC, 2), %xmm8 movhps 2 * SIZE(CO1, LDC, 2), %xmm8 movsd 0 * SIZE(CO2, LDC, 2), %xmm9 movhps 2 * SIZE(CO2, LDC, 2), %xmm9 pshufd $0x50, %xmm1, %xmm12 pshufd $0xfa, %xmm1, %xmm1 mulps %xmm15, %xmm12 mulps %xmm15, %xmm1 addps %xmm8, %xmm12 addps %xmm9, %xmm1 movlps %xmm12, 0 * SIZE(CO1, LDC, 2) movhps %xmm12, 2 * SIZE(CO1, LDC, 2) movlps %xmm1, 0 * SIZE(CO2, LDC, 2) movhps %xmm1, 2 * SIZE(CO2, LDC, 2) addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L40: testq $1, M je .L49 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 movsd 0 * SIZE(BO), %xmm9 movsd 32 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L45 ALIGN_4 .L42: shufps $0, %xmm8, %xmm8 movhps 4 * SIZE(BO), %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulps %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 8 * SIZE(BO), %xmm9 shufps $0, %xmm8, %xmm8 movhps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 16 * SIZE(BO), %xmm9 shufps $0, %xmm8, %xmm8 movhps 20 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss 3 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 24 * SIZE(BO), %xmm9 shufps $0, %xmm8, %xmm8 movhps 28 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 64 * SIZE(BO), %xmm9 shufps $0, %xmm10, %xmm10 movhps 36 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movss 5 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movsd 40 * SIZE(BO), %xmm11 shufps $0, %xmm10, %xmm10 movhps 44 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movss 6 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 48 * SIZE(BO), %xmm11 shufps $0, %xmm10, %xmm10 movhps 52 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movss 7 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movsd 56 * SIZE(BO), %xmm11 shufps $0, %xmm10, %xmm10 movhps 60 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movss 12 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 96 * SIZE(BO), %xmm11 addq $ 8 * SIZE, AO addq $64 * SIZE, BO decq 
%rax jne .L42 ALIGN_4 .L45: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L48 ALIGN_4 .L46: shufps $0, %xmm8, %xmm8 movhps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 8 * SIZE(BO), %xmm9 addq $1 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L46 ALIGN_4 .L48: addps %xmm1, %xmm0 movsd 0 * SIZE(CO1), %xmm8 movhps 0 * SIZE(CO2), %xmm8 movsd 0 * SIZE(CO1, LDC, 2), %xmm9 movhps 0 * SIZE(CO2, LDC, 2), %xmm9 pshufd $0x50, %xmm0, %xmm12 pshufd $0xfa, %xmm0, %xmm0 mulps %xmm15, %xmm12 mulps %xmm15, %xmm0 addps %xmm8, %xmm12 addps %xmm9, %xmm0 movlps %xmm12, 0 * SIZE(CO1) movhps %xmm12, 0 * SIZE(CO2) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movhps %xmm0, 0 * SIZE(CO2, LDC, 2) ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK #endif leaq (C, LDC, 4), C # c += 4 * ldc decq J # j -- jg .L01 .L50: testq $2, N je .L100 .L51: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $3, %rax jle .L53 ALIGN_4 .L52: movddup 0 * SIZE(B), %xmm0 movddup 2 * SIZE(B), %xmm1 movddup 4 * SIZE(B), %xmm2 movddup 6 * SIZE(B), %xmm3 movddup 8 * SIZE(B), %xmm4 movddup 10 * SIZE(B), %xmm5 movddup 12 * SIZE(B), %xmm6 movddup 14 * SIZE(B), %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) prefetcht1 128 * SIZE(BO) prefetcht0 112 * SIZE(B) addq $16 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L53: movq K, %rax andq $7, %rax BRANCH jle .L60 ALIGN_4 .L54: movddup 0 * SIZE(B), %xmm0 movaps %xmm0, 0 * SIZE(BO) addq $ 2 * SIZE, B addq $ 4 * SIZE, BO decq %rax jne .L54 ALIGN_4 .L60: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a movq M, I sarq $3, I # i = (m >> 3) jle .L70 ALIGN_4 .L61: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 32 * SIZE(AO), %xmm12 movaps 48 * SIZE(AO), %xmm14 movsldup 0 * SIZE(BO), %xmm9 movsldup 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 prefetcht2 4 * SIZE(CO1) pxor %xmm4, %xmm4 prefetcht2 4 * SIZE(CO2) pxor %xmm5, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulps %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsldup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm4 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm5 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 12 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm4 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 64 * SIZE(AO), 
%xmm8 addps %xmm9, %xmm5 movsldup 8 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movshdup 8 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movaps 20 * SIZE(AO), %xmm10 addps %xmm9, %xmm1 movsldup 8 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm4 movshdup 8 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movaps 24 * SIZE(AO), %xmm10 addps %xmm9, %xmm5 movsldup 12 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movshdup 12 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movaps 28 * SIZE(AO), %xmm10 addps %xmm9, %xmm1 movsldup 12 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm4 movshdup 12 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movaps 80 * SIZE(AO), %xmm10 addps %xmm9, %xmm5 movsldup 32 * SIZE(BO), %xmm9 mulps %xmm12, %xmm11 PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) addps %xmm11, %xmm0 movshdup 16 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 movaps 36 * SIZE(AO), %xmm12 addps %xmm11, %xmm1 movsldup 16 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 addps %xmm11, %xmm4 movshdup 16 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 movaps 40 * SIZE(AO), %xmm12 addps %xmm11, %xmm5 movsldup 20 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 addps %xmm11, %xmm0 movshdup 20 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 movaps 44 * SIZE(AO), %xmm12 addps %xmm11, %xmm1 movsldup 20 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 addps %xmm11, %xmm4 movshdup 20 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 movaps 96 * SIZE(AO), %xmm12 addps %xmm11, %xmm5 movsldup 24 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 addps %xmm11, %xmm0 movshdup 24 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 movaps 52 * SIZE(AO), %xmm14 addps %xmm11, %xmm1 movsldup 24 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 addps %xmm11, %xmm4 movshdup 24 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 movaps 56 * SIZE(AO), %xmm14 addps %xmm11, %xmm5 movsldup 28 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 addps %xmm11, %xmm0 movshdup 28 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 movaps 60 * SIZE(AO), %xmm14 addps %xmm11, %xmm1 movsldup 28 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 addps %xmm11, %xmm4 movshdup 28 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 movaps 112 * SIZE(AO), %xmm14 addps %xmm11, %xmm5 movsldup 48 * SIZE(BO), %xmm11 addq $64 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsldup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm4 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm5 movsldup 4 * SIZE(BO), %xmm9 addq $8 * SIZE, AO addq $4 * SIZE, BO decq %rax jg .L66 ALIGN_4 .L68: movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 4 * SIZE(CO1), %xmm9 movhps 6 * SIZE(CO1), %xmm9 movsd 8 * SIZE(CO1), %xmm10 movhps 10 * SIZE(CO1), %xmm10 movsd 12 * SIZE(CO1), %xmm11 movhps 14 * SIZE(CO1), %xmm11 pshufd $0x50, %xmm0, %xmm12 pshufd $0xfa, %xmm0, %xmm0 pshufd $0x50, %xmm4, %xmm13 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm15, %xmm12 mulps %xmm15, %xmm0 mulps %xmm15, %xmm13 mulps %xmm15, %xmm4 addps %xmm8, %xmm12 addps %xmm9, %xmm0 addps %xmm10, %xmm13 addps %xmm11, %xmm4 movlps %xmm12, 0 * SIZE(CO1) movhps %xmm12, 2 * SIZE(CO1) movlps %xmm0, 4 * SIZE(CO1) movhps %xmm0, 6 * SIZE(CO1) movlps %xmm13, 8 * SIZE(CO1) movhps %xmm13, 10 * SIZE(CO1) movlps %xmm4, 12 * SIZE(CO1) movhps %xmm4, 14 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm8 movhps 2 * SIZE(CO2), %xmm8 movsd 4 * SIZE(CO2), 
%xmm9 movhps 6 * SIZE(CO2), %xmm9 movsd 8 * SIZE(CO2), %xmm10 movhps 10 * SIZE(CO2), %xmm10 movsd 12 * SIZE(CO2), %xmm11 movhps 14 * SIZE(CO2), %xmm11 pshufd $0x50, %xmm1, %xmm12 pshufd $0xfa, %xmm1, %xmm1 pshufd $0x50, %xmm5, %xmm13 pshufd $0xfa, %xmm5, %xmm5 mulps %xmm15, %xmm12 mulps %xmm15, %xmm1 mulps %xmm15, %xmm13 mulps %xmm15, %xmm5 addps %xmm8, %xmm12 addps %xmm9, %xmm1 addps %xmm10, %xmm13 addps %xmm11, %xmm5 movlps %xmm12, 0 * SIZE(CO2) movhps %xmm12, 2 * SIZE(CO2) movlps %xmm1, 4 * SIZE(CO2) movhps %xmm1, 6 * SIZE(CO2) movlps %xmm13, 8 * SIZE(CO2) movhps %xmm13, 10 * SIZE(CO2) movlps %xmm5, 12 * SIZE(CO2) movhps %xmm5, 14 * SIZE(CO2) addq $16 * SIZE, CO1 # coffset += 4 addq $16 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L61 ALIGN_4 .L70: testq $4, M je .L80 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif movaps 0 * SIZE(AO), %xmm8 movsldup 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(AO), %xmm10 movsldup 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulps %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsldup 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 12 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsldup 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movshdup 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 32 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movsldup 32 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movshdup 16 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movaps 20 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsldup 20 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm2 movshdup 20 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movaps 24 * SIZE(AO), %xmm10 addps %xmm11, %xmm3 movsldup 24 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movshdup 24 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movaps 28 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsldup 28 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm2 movshdup 28 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movaps 48 * SIZE(AO), %xmm10 addps %xmm11, %xmm3 movsldup 48 * SIZE(BO), %xmm11 addq $32 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsldup 4 * SIZE(BO), %xmm9 addq $4 * SIZE, AO addq $4 * SIZE, BO decq %rax jg .L76 ALIGN_4 .L78: addps %xmm2, %xmm0 addps %xmm3, %xmm1 movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 4 * SIZE(CO1), %xmm9 movhps 6 * SIZE(CO1), %xmm9 pshufd $0x50, %xmm0, %xmm12 pshufd $0xfa, %xmm0, 
%xmm0 mulps %xmm15, %xmm12 mulps %xmm15, %xmm0 addps %xmm8, %xmm12 addps %xmm9, %xmm0 movlps %xmm12, 0 * SIZE(CO1) movhps %xmm12, 2 * SIZE(CO1) movlps %xmm0, 4 * SIZE(CO1) movhps %xmm0, 6 * SIZE(CO1) movsd 0 * SIZE(CO2), %xmm8 movhps 2 * SIZE(CO2), %xmm8 movsd 4 * SIZE(CO2), %xmm9 movhps 6 * SIZE(CO2), %xmm9 pshufd $0x50, %xmm1, %xmm12 pshufd $0xfa, %xmm1, %xmm1 mulps %xmm15, %xmm12 mulps %xmm15, %xmm1 addps %xmm8, %xmm12 addps %xmm9, %xmm1 movlps %xmm12, 0 * SIZE(CO2) movhps %xmm12, 2 * SIZE(CO2) movlps %xmm1, 4 * SIZE(CO2) movhps %xmm1, 6 * SIZE(CO2) addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 ALIGN_4 .L80: testq $2, M je .L90 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif movddup 0 * SIZE(AO), %xmm8 movddup 8 * SIZE(AO), %xmm10 movsd 0 * SIZE(BO), %xmm9 movsd 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L85 ALIGN_4 .L82: shufps $0x50, %xmm9, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulps %xmm8, %xmm9 movddup 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 6 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 12 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 16 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 32 * SIZE(BO), %xmm9 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 10 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movsd 20 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 12 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 24 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 14 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movsd 28 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 24 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 48 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L82 ALIGN_4 .L85: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L88 ALIGN_4 .L86: shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 addq $2 * SIZE, AO addq $4 * SIZE, BO decq %rax jg .L86 ALIGN_4 .L88: addps %xmm1, %xmm0 movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 0 * SIZE(CO2), %xmm9 movhps 2 * SIZE(CO2), %xmm9 pshufd $0x50, %xmm0, %xmm12 pshufd $0xfa, %xmm0, %xmm0 mulps %xmm15, %xmm12 mulps %xmm15, %xmm0 addps %xmm8, %xmm12 addps %xmm9, %xmm0 movlps %xmm12, 0 * SIZE(CO1) movhps %xmm12, 2 * SIZE(CO1) movlps %xmm0, 0 * SIZE(CO2) movhps %xmm0, 2 * SIZE(CO2) addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L90: testq $1, M je .L99 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 
4), BO #endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 movsd 0 * SIZE(BO), %xmm9 movsd 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L95 ALIGN_4 .L92: shufps $0, %xmm8, %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulps %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 shufps $0, %xmm8, %xmm8 mulps %xmm8, %xmm9 movss 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 shufps $0, %xmm8, %xmm8 mulps %xmm8, %xmm9 movss 3 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 12 * SIZE(BO), %xmm9 shufps $0, %xmm8, %xmm8 mulps %xmm8, %xmm9 movss 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 32 * SIZE(BO), %xmm9 shufps $0, %xmm10, %xmm10 mulps %xmm10, %xmm11 movss 5 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movsd 20 * SIZE(BO), %xmm11 shufps $0, %xmm10, %xmm10 mulps %xmm10, %xmm11 movss 6 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 24 * SIZE(BO), %xmm11 shufps $0, %xmm10, %xmm10 mulps %xmm10, %xmm11 movss 7 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movsd 28 * SIZE(BO), %xmm11 shufps $0, %xmm10, %xmm10 mulps %xmm10, %xmm11 movss 12 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 48 * SIZE(BO), %xmm11 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L92 ALIGN_4 .L95: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L98 ALIGN_4 .L96: shufps $0, %xmm8, %xmm8 mulps %xmm8, %xmm9 movss 1 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 addq $1 * SIZE, AO addq $4 * SIZE, BO decq %rax jg .L96 ALIGN_4 .L98: addps %xmm1, %xmm0 movsd 0 * SIZE(CO1), %xmm8 movhps 0 * SIZE(CO2), %xmm8 pshufd $0x50, %xmm0, %xmm12 mulps %xmm15, %xmm12 addps %xmm8, %xmm12 movlps %xmm12, 0 * SIZE(CO1) movhps %xmm12, 0 * SIZE(CO2) ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leaq (C, LDC, 2), C # c += 4 * ldc ALIGN_4 .L100: testq $1, N je .L999 .L101: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $3, %rax jle .L103 ALIGN_4 .L102: movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 movss 2 * SIZE(B), %xmm2 movss 3 * SIZE(B), %xmm3 movss 4 * SIZE(B), %xmm4 movss 5 * SIZE(B), %xmm5 movss 6 * SIZE(B), %xmm6 movss 7 * SIZE(B), %xmm7 movss %xmm0, 0 * SIZE(BO) movss %xmm0, 1 * SIZE(BO) movss %xmm1, 2 * SIZE(BO) movss %xmm1, 3 * SIZE(BO) movss %xmm2, 4 * SIZE(BO) movss %xmm2, 5 * SIZE(BO) movss %xmm3, 6 * SIZE(BO) movss %xmm3, 7 * SIZE(BO) movss %xmm4, 8 * SIZE(BO) movss %xmm4, 9 * SIZE(BO) movss %xmm5, 10 * SIZE(BO) movss %xmm5, 11 * SIZE(BO) movss %xmm6, 12 * SIZE(BO) movss %xmm6, 13 * SIZE(BO) movss %xmm7, 14 * SIZE(BO) movss %xmm7, 15 * SIZE(BO) addq $ 8 * SIZE, B addq $16 * SIZE, BO decq %rax jne .L102 ALIGN_4 .L103: movq K, %rax andq $7, %rax BRANCH jle .L110 ALIGN_4 .L104: movss 0 * SIZE(B), %xmm0 movss %xmm0, 0 * SIZE(BO) movss %xmm0, 1 * SIZE(BO) addq $ 1 * SIZE, B addq $ 2 * SIZE, BO decq %rax jne .L104 ALIGN_4 .L110: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a movq M, I sarq $3, I # i = (m >> 3) jle .L120 ALIGN_4 .L111: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq 
BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 32 * SIZE(AO), %xmm12 movaps 48 * SIZE(AO), %xmm14 movddup 0 * SIZE(BO), %xmm9 movddup 8 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 prefetchnta 8 * SIZE(CO1) pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L115 ALIGN_4 .L112: mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm9, %xmm0 movddup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movddup 2 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 12 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 64 * SIZE(AO), %xmm8 addps %xmm9, %xmm5 movddup 4 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movaps 20 * SIZE(AO), %xmm10 addps %xmm9, %xmm0 movddup 4 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movaps 24 * SIZE(AO), %xmm10 addps %xmm9, %xmm4 movddup 6 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movaps 28 * SIZE(AO), %xmm10 addps %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movaps 80 * SIZE(AO), %xmm10 addps %xmm9, %xmm5 PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) movddup 8 * SIZE(BO), %xmm9 mulps %xmm12, %xmm9 movaps 36 * SIZE(AO), %xmm12 addps %xmm9, %xmm0 movddup 16 * SIZE(BO), %xmm9 mulps %xmm12, %xmm11 movaps 40 * SIZE(AO), %xmm12 addps %xmm11, %xmm4 movddup 10 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 movaps 44 * SIZE(AO), %xmm12 addps %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 movaps 96 * SIZE(AO), %xmm12 addps %xmm11, %xmm5 movddup 12 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 movaps 52 * SIZE(AO), %xmm14 addps %xmm11, %xmm0 movddup 12 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 movaps 56 * SIZE(AO), %xmm14 addps %xmm11, %xmm4 movddup 14 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 movaps 60 * SIZE(AO), %xmm14 addps %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 movaps 112 * SIZE(AO), %xmm14 addps %xmm11, %xmm5 movddup 24 * SIZE(BO), %xmm11 addq $64 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L112 ALIGN_4 .L115: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L118 ALIGN_4 .L116: mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movddup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm4 movddup 2 * SIZE(BO), %xmm9 addq $8 * SIZE, AO addq $2 * SIZE, BO decq %rax jg .L116 ALIGN_4 .L118: addps %xmm1, %xmm0 addps %xmm5, %xmm4 movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 4 * SIZE(CO1), %xmm9 movhps 6 * SIZE(CO1), %xmm9 movsd 8 * SIZE(CO1), %xmm10 movhps 10 * SIZE(CO1), %xmm10 movsd 12 * SIZE(CO1), %xmm11 movhps 14 * SIZE(CO1), %xmm11 pshufd $0x50, %xmm0, %xmm12 pshufd $0xfa, %xmm0, %xmm0 pshufd $0x50, %xmm4, %xmm13 pshufd $0xfa, %xmm4, %xmm4 mulps %xmm15, %xmm12 mulps %xmm15, %xmm0 mulps %xmm15, %xmm13 mulps %xmm15, %xmm4 addps %xmm8, %xmm12 addps %xmm9, %xmm0 addps %xmm10, %xmm13 addps %xmm11, %xmm4 movlps %xmm12, 0 * SIZE(CO1) movhps %xmm12, 2 * SIZE(CO1) movlps %xmm0, 4 * SIZE(CO1) movhps %xmm0, 6 * SIZE(CO1) movlps %xmm13, 8 * SIZE(CO1) movhps %xmm13, 10 * SIZE(CO1) movlps %xmm4, 12 * SIZE(CO1) movhps %xmm4, 14 * SIZE(CO1) addq $16 * SIZE, CO1 
# coffset += 4 decq I # i -- jg .L111 ALIGN_4 .L120: testq $4, M je .L130 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 0 * SIZE(AO), %xmm8 movddup 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(AO), %xmm10 movddup 8 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L125 ALIGN_4 .L122: mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm9, %xmm0 movddup 2 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movddup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 12 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movddup 6 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 32 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movddup 16 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 movaps 20 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movddup 10 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movaps 24 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movddup 12 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movaps 28 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movddup 14 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movaps 48 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movddup 24 * SIZE(BO), %xmm11 addq $32 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L122 ALIGN_4 .L125: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L128 ALIGN_4 .L126: mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movddup 2 * SIZE(BO), %xmm9 addq $4 * SIZE, AO addq $2 * SIZE, BO decq %rax jg .L126 ALIGN_4 .L128: addps %xmm1, %xmm0 movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 4 * SIZE(CO1), %xmm9 movhps 6 * SIZE(CO1), %xmm9 pshufd $0x50, %xmm0, %xmm12 pshufd $0xfa, %xmm0, %xmm0 mulps %xmm15, %xmm12 mulps %xmm15, %xmm0 addps %xmm8, %xmm12 addps %xmm9, %xmm0 movlps %xmm12, 0 * SIZE(CO1) movhps %xmm12, 2 * SIZE(CO1) movlps %xmm0, 4 * SIZE(CO1) movhps %xmm0, 6 * SIZE(CO1) addq $8 * SIZE, CO1 # coffset += 4 ALIGN_4 .L130: testq $2, M je .L140 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(AO), %xmm10 movaps 16 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $4, %rax je .L135 ALIGN_4 .L132: mulps %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 mulps 4 * SIZE(BO), %xmm8 addps %xmm8, %xmm1 movaps 8 * SIZE(AO), %xmm8 mulps 8 * SIZE(BO), %xmm8 addps %xmm8, %xmm2 movaps 12 * SIZE(AO), %xmm8 mulps 
12 * SIZE(BO), %xmm8 addps %xmm8, %xmm3 movaps 32 * SIZE(AO), %xmm8 movaps 32 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 movaps 20 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movaps 48 * SIZE(BO), %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm10, %xmm1 movaps 24 * SIZE(AO), %xmm10 mulps 24 * SIZE(BO), %xmm10 addps %xmm10, %xmm2 movaps 28 * SIZE(AO), %xmm10 mulps 28 * SIZE(BO), %xmm10 addps %xmm10, %xmm3 movaps 48 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L132 ALIGN_4 .L135: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $15, %rax # if (k & 1) BRANCH je .L138 ALIGN_4 .L136: movsd 0 * SIZE(AO), %xmm8 movsd 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 addq $2 * SIZE, AO addq $2 * SIZE, BO decq %rax jg .L136 ALIGN_4 .L138: addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm2, %xmm0 movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 pshufd $0x50, %xmm0, %xmm12 mulps %xmm15, %xmm12 addps %xmm8, %xmm12 movlps %xmm12, 0 * SIZE(CO1) movhps %xmm12, 2 * SIZE(CO1) addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L140: testq $1, M je .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 movss 0 * SIZE(BO), %xmm9 movss 8 * SIZE(BO), %xmm11 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L145 ALIGN_4 .L142: mulss %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movss 1 * SIZE(AO), %xmm8 mulss 2 * SIZE(BO), %xmm8 addss %xmm9, %xmm0 movss 16 * SIZE(BO), %xmm9 addss %xmm8, %xmm1 movss 2 * SIZE(AO), %xmm8 mulss 4 * SIZE(BO), %xmm8 addss %xmm8, %xmm2 movss 3 * SIZE(AO), %xmm8 mulss 6 * SIZE(BO), %xmm8 addss %xmm8, %xmm3 movss 8 * SIZE(AO), %xmm8 mulss %xmm10, %xmm11 movss 5 * SIZE(AO), %xmm10 mulss 10 * SIZE(BO), %xmm10 addss %xmm11, %xmm0 movss 24 * SIZE(BO), %xmm11 addss %xmm10, %xmm1 movss 6 * SIZE(AO), %xmm10 mulss 12 * SIZE(BO), %xmm10 addss %xmm10, %xmm2 movss 7 * SIZE(AO), %xmm10 mulss 14 * SIZE(BO), %xmm10 addss %xmm10, %xmm3 movss 12 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L142 ALIGN_4 .L145: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L148 ALIGN_4 .L146: movss 0 * SIZE(AO), %xmm8 movss 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 addq $1 * SIZE, AO addq $2 * SIZE, BO decq %rax jg .L146 ALIGN_4 .L148: addss %xmm1, %xmm0 addss %xmm3, %xmm2 addss %xmm2, %xmm0 movsd 0 * SIZE(CO1), %xmm8 pshufd $0x50, %xmm0, %xmm12 mulps %xmm15, %xmm12 addps %xmm8, %xmm12 movlps %xmm12, 0 * SIZE(CO1) ALIGN_4 .L999: movq %rbx, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 
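The store blocks above (.L18, .L28, .L38, ...) all follow the same pattern: pshufd $0x50/$0xfa spreads each real accumulator value across a re/im pair, mulps applies the packed ALPHA, and addps folds the result into the interleaved complex columns of C. A minimal C sketch of that update, assuming a 3m-style kernel whose per-element products are real and whose alpha is complex (function and argument names are illustrative, not taken from this file):

static void update_c_3m(float *c, const float *acc, int n,
                        float alpha_r, float alpha_i) {
    /* acc[j] is a real dot product; c holds interleaved (re, im) pairs */
    for (int j = 0; j < n; j++) {
        c[2 * j + 0] += alpha_r * acc[j];
        c[2 * j + 1] += alpha_i * acc[j];
    }
}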
movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm_beta.S000066400000000000000000000135331313527062700176250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
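The zgemm_beta kernel whose code follows has two paths: when beta is exactly zero it simply clears C (.L53), otherwise it multiplies every element of C by the complex beta (.L71), where the MULSD/SUBSD/ADDPD group is the usual expansion of (beta_r + i*beta_i) * (c_r + i*c_i). A scalar reference of the same behaviour, with assumed names and a simplified argument list:

void zgemm_beta_ref(long m, long n, double beta_r, double beta_i,
                    double *c, long ldc) {
    if (beta_r == 0.0 && beta_i == 0.0) {
        /* zero-fill path (.L53) */
        for (long j = 0; j < n; j++)
            for (long i = 0; i < 2 * m; i++)
                c[2 * j * ldc + i] = 0.0;
    } else {
        /* complex scaling path (.L71) */
        for (long j = 0; j < n; j++)
            for (long i = 0; i < m; i++) {
                double *p = c + 2 * (j * ldc + i);
                double re = p[0], im = p[1];
                p[0] = beta_r * re - beta_i * im;
                p[1] = beta_r * im + beta_i * re;
            }
    }
}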
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 #define N ARG2 #define C ARG3 #define LDC ARG4 #define C1 ARG5 #define STACK_C 16(%rsp) #define STACK_LDC 24(%rsp) #else #define STACKSIZE 256 #define M ARG1 #define N ARG2 #define C ARG3 #define LDC ARG4 #define C1 %r10 #define STACK_ALPHA_I 40 + STACKSIZE(%rsp) #define STACK_C 80 + STACKSIZE(%rsp) #define STACK_LDC 88 + STACKSIZE(%rsp) #endif #define I %rax PROLOGUE PROFCODE #ifdef WINDOWS_ABI subq $STACKSIZE, %rsp movups %xmm6, 0(%rsp) movups %xmm7, 16(%rsp) movups %xmm8, 32(%rsp) movups %xmm9, 48(%rsp) movups %xmm10, 64(%rsp) movups %xmm11, 80(%rsp) movups %xmm12, 96(%rsp) movups %xmm13, 112(%rsp) movups %xmm14, 128(%rsp) movups %xmm15, 144(%rsp) movaps %xmm3, %xmm0 movsd STACK_ALPHA_I, %xmm1 #endif pxor %xmm15, %xmm15 movq STACK_C, C movq STACK_LDC, LDC testq M, M jle .L999 testq N, N jle .L999 salq $ZBASE_SHIFT, LDC #ifdef DOUBLE ucomisd %xmm15, %xmm0 jne .L71 ucomisd %xmm15, %xmm1 jne .L71 #else ucomiss %xmm15, %xmm0 jne .L71 ucomiss %xmm15, %xmm1 jne .L71 #endif ALIGN_2 .L53: movq C, C1 # c_offset1 = c_offset addq LDC, C # c_offset += ldc movq M, I sarq $2, I jle .L56 ALIGN_2 .L57: #ifdef OPTERON prefetchw 64 * SIZE(C1) #endif MOVSD %xmm0, 0 * SIZE(C1) # c_offset1 MOVSD %xmm0, 1 * SIZE(C1) MOVSD %xmm0, 2 * SIZE(C1) MOVSD %xmm0, 3 * SIZE(C1) MOVSD %xmm0, 4 * SIZE(C1) MOVSD %xmm0, 5 * SIZE(C1) MOVSD %xmm0, 6 * SIZE(C1) MOVSD %xmm0, 7 * SIZE(C1) addq $8 * SIZE, C1 # c_offset1 += 8 decq I # i-- jg .L57 ALIGN_2 .L56: movq M, I andq $3, I jle .L62 ALIGN_2 .L63: MOVSD %xmm0, 0 * SIZE(C1) MOVSD %xmm0, 1 * SIZE(C1) addq $2 * SIZE,C1 decq I jg .L63 ALIGN_2 .L62: decq N # j -- jg .L53 jmp .L999 ALIGN_3 .L71: movq C, C1 addq LDC, C # c_offset += ldc movq M, I sarq $1, I jle .L84 ALIGN_3 .L85: #ifdef OPTERON prefetchw 16 * SIZE(C1) #endif MOVSD 0 * SIZE(C1), %xmm2 MOVSD 1 * SIZE(C1), %xmm3 MOVSD 0 * SIZE(C1), %xmm4 MOVSD 1 * SIZE(C1), %xmm5 MOVSD 2 * SIZE(C1), %xmm6 MOVSD 3 * SIZE(C1), %xmm7 MOVSD 2 * SIZE(C1), %xmm8 MOVSD 3 * SIZE(C1), %xmm9 MULSD %xmm0, %xmm2 MULSD %xmm1, %xmm3 MULSD %xmm1, %xmm4 MULSD %xmm0, %xmm5 MULSD %xmm0, %xmm6 MULSD %xmm1, %xmm7 MULSD %xmm1, %xmm8 MULSD %xmm0, %xmm9 SUBSD %xmm3, %xmm2 ADDPD %xmm5, %xmm4 SUBSD %xmm7, %xmm6 ADDPD %xmm9, %xmm8 MOVSD %xmm2, 0 * SIZE(C1) MOVSD %xmm4, 1 * SIZE(C1) MOVSD %xmm6, 2 * SIZE(C1) MOVSD %xmm8, 3 * SIZE(C1) addq $4 * SIZE, C1 decq I jg .L85 ALIGN_3 .L84: testq $1, M jle .L74 ALIGN_3 .L75: prefetchnta 80 * SIZE(C1) MOVSD 0 * SIZE(C1), %xmm2 MULSD %xmm0, %xmm2 MOVSD 1 * SIZE(C1), %xmm3 MULSD %xmm1, %xmm3 MOVSD 0 * SIZE(C1), %xmm4 MULSD %xmm1, %xmm4 MOVSD 1 * SIZE(C1), %xmm5 MULSD %xmm0, %xmm5 SUBSD %xmm3, %xmm2 ADDPD %xmm5, %xmm4 MOVSD %xmm2, 0 * SIZE(C1) MOVSD %xmm4, 1 * SIZE(C1) ALIGN_2 .L74: decq N jg .L71 ALIGN_2 .L999: #ifdef WINDOWS_ABI movups 0(%rsp), %xmm6 movups 16(%rsp), %xmm7 movups 32(%rsp), %xmm8 movups 48(%rsp), %xmm9 movups 64(%rsp), %xmm10 movups 80(%rsp), %xmm11 movups 96(%rsp), %xmm12 movups 112(%rsp), %xmm13 movups 128(%rsp), %xmm14 movups 144(%rsp), %xmm15 addq $STACKSIZE, %rsp #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm_kernel_1x4_nehalem.S000066400000000000000000000514641313527062700223640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
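The zgemm_kernel_1x4_nehalem.S file that starts here updates C += alpha*A*B for double-complex data, combining one element of packed A with four packed B values per k step; the NN/NR/RN/... build variants only change which operands are conjugated (via the ADD1/ADD2 selection and the sign-mask xorps before the reduction). A conceptual reference for one 1x4 tile, ignoring the conjugation variants (names and packing layout are assumptions for illustration):

#include <complex.h>

static void zkernel_1x4_ref(long k, double complex alpha,
                            const double complex *a,  /* k elements           */
                            const double complex *b,  /* packed, 4 per k step */
                            double complex *c, long ldc) {
    double complex acc[4] = {0.0, 0.0, 0.0, 0.0};
    for (long p = 0; p < k; p++)
        for (int j = 0; j < 4; j++)
            acc[j] += a[p] * b[4 * p + j];
    for (int j = 0; j < 4; j++)
        c[j * ldc] += alpha * acc[j];
}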
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %rbp #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rdx #define BB %r12 #define PREA %r10 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define ALPHA_R 48(%rsp) #define ALPHA_I 56(%rsp) #define J 64(%rsp) #define OFFSET 72(%rsp) #define KK 80(%rsp) #define KKK 88(%rsp) #else #define STACKSIZE 512 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define ALPHA_R 224(%rsp) #define ALPHA_I 232(%rsp) #define J 240(%rsp) #define OFFSET 248(%rsp) #define KK 256(%rsp) #define KKK 264(%rsp) #endif #define PREFETCHSIZE 4 #define PREFETCH prefetcht0 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADD1 addpd #define ADD2 addpd #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define ADD1 addpd #define ADD2 addpd #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADD1 addpd #define ADD2 addpd #else #define ADD1 addpd #define ADD2 subpd #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups 
%xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif movaps %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif #endif movlps %xmm0, ALPHA_R movlps %xmm1, ALPHA_I subq $-16 * SIZE, A subq $-16 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K salq $ZBASE_SHIFT, LDC #ifdef TRMMKERNEL movq %r11, OFFSET #ifndef LEFT negq %r11 #endif movq %r11, KK #endif testq M, M jle .L999 movq N, J sarq $2, J NOBRANCH jle .L20 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC, 2), CO2 movq A, AO movq K, %rax salq $ZBASE_SHIFT + 2, %rax leaq (B, %rax), BB movq M, I ALIGN_4 .L11: prefetcht2 -16 * SIZE(BB) subq $-8 * SIZE, BB #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif PADDING xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht0 1 * SIZE(CO1) xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 prefetcht0 3 * SIZE(CO1, LDC) xorps %xmm11, %xmm11 movaps -16 * SIZE(AO), %xmm0 xorps %xmm12, %xmm12 xorps %xmm13, %xmm13 prefetcht0 1 * SIZE(CO2) xorps %xmm14, %xmm14 xorps %xmm15, %xmm15 prefetcht0 3 * SIZE(CO2, LDC) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm14 movaps -14 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 ADD1 %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 movaps -14 * SIZE(AO), %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 ADD1 %xmm1, %xmm12 movaps -8 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 mulpd %xmm5, %xmm2 ADD1 %xmm3, %xmm14 movaps -6 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm5, %xmm3 mulpd %xmm5, %xmm4 ADD1 %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 mulpd %xmm5, %xmm2 ADD1 %xmm3, %xmm10 movaps -2 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm5, %xmm3 mulpd %xmm5, %xmm4 ADD1 %xmm1, %xmm12 movaps 0 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm14 movaps 2 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 ADD1 %xmm1, %xmm8 movaps 4 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps 6 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 movaps -10 * SIZE(AO), %xmm5 mulpd %xmm0, %xmm4 ADD1 %xmm1, %xmm12 movaps 8 * SIZE(BO), 
%xmm1 ADD2 %xmm2, %xmm13 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 mulpd %xmm5, %xmm2 ADD1 %xmm3, %xmm14 movaps 10 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm5, %xmm3 PADDING; mulpd %xmm5, %xmm4 ADD1 %xmm1, %xmm8 movaps 12 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm5, %xmm1 PADDING; mulpd %xmm5, %xmm2 ADD1 %xmm3, %xmm10 movaps 14 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm5, %xmm3 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm5, %xmm4 subq $-32 * SIZE, BO subq $-8 * SIZE, AO subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: movddup ALPHA_R, %xmm6 movddup ALPHA_I, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: ADD1 %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm14 movaps -14 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 ADD1 %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: ADD1 %xmm1, %xmm12 ADD2 %xmm2, %xmm13 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 xorps %xmm0, %xmm12 xorps %xmm0, %xmm14 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0x04, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 xorps %xmm0, %xmm13 xorps %xmm0, %xmm15 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 xorps %xmm0, %xmm13 xorps %xmm0, %xmm15 #endif haddpd %xmm9, %xmm8 haddpd %xmm11, %xmm10 haddpd %xmm13, %xmm12 haddpd %xmm15, %xmm14 pshufd $0x4e, %xmm8, %xmm9 pshufd $0x4e, %xmm10, %xmm11 pshufd $0x4e, %xmm12, %xmm13 pshufd $0x4e, %xmm14, %xmm15 mulpd %xmm6, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm6, %xmm10 mulpd %xmm7, %xmm11 mulpd %xmm6, %xmm12 mulpd %xmm7, %xmm13 mulpd %xmm6, %xmm14 mulpd %xmm7, %xmm15 addsubpd %xmm9, %xmm8 addsubpd %xmm11, %xmm10 addsubpd %xmm13, %xmm12 addsubpd %xmm15, %xmm14 testq $15, CO1 NOBRANCH jne .L18x #ifndef TRMMKERNEL movaps (CO1), %xmm0 movaps (CO1, LDC), %xmm1 movaps (CO2), %xmm2 movaps (CO2, LDC), %xmm3 addpd %xmm0, %xmm8 addpd %xmm1, %xmm10 addpd %xmm2, %xmm12 addpd %xmm3, %xmm14 #endif movaps %xmm8, (CO1) movaps %xmm10, (CO1, LDC) movaps %xmm12, (CO2) movaps %xmm14, (CO2, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 decq I BRANCH jg .L11 #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK #endif leaq (C, LDC, 4), C movq BO, B subq $1, J BRANCH jg .L01 jmp .L20 ALIGN_4 .L18x: #ifndef TRMMKERNEL movups (CO1), %xmm0 movups (CO1, LDC), %xmm1 movups (CO2), %xmm2 movups (CO2, LDC), %xmm3 addpd %xmm0, %xmm8 addpd %xmm1, %xmm10 addpd %xmm2, %xmm12 addpd %xmm3, %xmm14 
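/* Descriptive note (added): .L18x is the variant of the store path for a C pointer that
   is not 16-byte aligned; the haddpd/pshufd/addsubpd sequence above has already formed
   alpha_r*re - alpha_i*im (low half) and alpha_r*im + alpha_i*re (high half), and in the
   non-TRMM build the scaled product is accumulated onto the existing C block with movups
   loads/stores instead of movaps. */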
#endif movups %xmm8, (CO1) movups %xmm10, (CO1, LDC) movups %xmm12, (CO2) movups %xmm14, (CO2, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 decq I BRANCH jg .L11 #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK #endif leaq (C, LDC, 4), C movq BO, B subq $1, J BRANCH jg .L01 ALIGN_4 .L20: testq $2, N BRANCH jle .L30 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC, 1), CO2 movq A, AO movq M, I ALIGN_4 .L21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht0 1 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 2 * SIZE(CO2) xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_3 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -14 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -12 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -8 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -6 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -10 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -2 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L22 ALIGN_3 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: ADD1 %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -14 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_3 .L28: ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 ADD1 %xmm3, %xmm10 ADD2 %xmm4, %xmm11 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 movddup ALPHA_R, %xmm2 movddup ALPHA_I, %xmm3 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) 
|| \ defined(RR) || defined(RC) || defined(CR) || defined(CC) shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0x04, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 #endif haddpd %xmm9, %xmm8 haddpd %xmm11, %xmm10 pshufd $0x4e, %xmm8, %xmm9 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm9 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm11 addsubpd %xmm9, %xmm8 addsubpd %xmm11, %xmm10 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm1 movhpd 1 * SIZE(CO2), %xmm1 addpd %xmm0, %xmm8 addpd %xmm1, %xmm10 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 decq I BRANCH jg .L21 #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif leaq (C, LDC, 2), C movq BO, B ALIGN_4 .L30: testq $1, N BRANCH jle .L999 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 movq A, AO movq M, I ALIGN_4 .L31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm8, %xmm8 prefetcht0 2 * SIZE(CO1) xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_3 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm10 movaps -14 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm11 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm11 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L32 addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 ALIGN_3 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH je .L38 ALIGN_3 .L36: ADD1 %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_3 .L38: ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 movddup ALPHA_R, %xmm2 movddup ALPHA_I, %xmm3 #if defined(NN) || defined(NT) 
|| defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm8 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0x04, %xmm0, %xmm0 xorps %xmm0, %xmm9 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm9 #endif haddpd %xmm9, %xmm8 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm9 addsubpd %xmm9, %xmm8 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 addpd %xmm0, %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $2 * SIZE, CO1 decq I BRANCH jg .L31 ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm_kernel_2x1_atom.S000066400000000000000000000362031313527062700217030ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %r13 #define BO %r14 #define CO1 %r15 #define BB %rbp #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define ALPHA_R 48(%rsp) #define ALPHA_I 56(%rsp) #define OFFSET 64(%rsp) #define KKK 72(%rsp) #define KK 80(%rsp) #else #define STACKSIZE 512 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define ALPHA_R 224(%rsp) #define ALPHA_I 232(%rsp) #define OFFSET 240(%rsp) #define KKK 248(%rsp) #define KK 256(%rsp) #endif #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 8 + 3) #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADDSD1 addsd #define ADDSD2 addsd #define ADDSD3 addsd #define ADDSD4 subsd #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define ADDSD1 addsd #define ADDSD2 subsd #define ADDSD3 addsd #define ADDSD4 addsd #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADDSD1 addsd #define ADDSD2 addsd #define ADDSD3 subsd #define ADDSD4 addsd #else #define ADDSD1 addsd #define ADDSD2 subsd #define ADDSD3 subsd #define ADDSD4 subsd #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm4 #endif movaps %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm4 #endif #endif movsd %xmm0, ALPHA_R movsd %xmm1, ALPHA_I #ifdef TRMMKERNEL movsd %xmm4, OFFSET movsd %xmm4, KK #ifndef LEFT negq KK #endif #endif salq $ZBASE_SHIFT, LDC movq N, J testq N, N jle .L999 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 addq LDC, C movq A, AO movq K, %rax salq $ZBASE_SHIFT, %rax leaq (B, %rax), BB movq M, I sarq $1, I jle .L20 ALIGN_4 .L10: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd 1 * SIZE(AO), %xmm4 xorps %xmm5, %xmm5 movsd 2 * SIZE(AO), %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movsd 1 * SIZE(BO), %xmm3 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 prefetcht0 3 * SIZE(CO1) xorps %xmm12, %xmm12 xorps %xmm13, %xmm13 xorps %xmm14, %xmm14 xorps %xmm15, %xmm15 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef 
LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L15 ALIGN_4 .L12: ADDSD2 %xmm2, %xmm13 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD3 %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm15 PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO) movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 ADDSD1 %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 ADDSD2 %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 ADDSD3 %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 2 * SIZE(BO), %xmm1 ADDSD1 %xmm5, %xmm12 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 ADDSD2 %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD3 %xmm7, %xmm14 movsd 7 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 ADDSD1 %xmm0, %xmm8 movsd 8 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 ADDSD2 %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 ADDSD3 %xmm4, %xmm10 movsd 9 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 4 * SIZE(BO), %xmm1 ADDSD1 %xmm5, %xmm12 movsd 10 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 5 * SIZE(BO), %xmm3 ADDSD2 %xmm2, %xmm13 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD3 %xmm7, %xmm14 movsd 11 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 ADDSD1 %xmm0, %xmm8 movsd 12 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 ADDSD2 %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 ADDSD3 %xmm4, %xmm10 movsd 13 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 6 * SIZE(BO), %xmm1 ADDSD1 %xmm5, %xmm12 movsd 14 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 7 * SIZE(BO), %xmm3 ADDSD2 %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD3 %xmm7, %xmm14 movsd 15 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 subq $-16 * SIZE, AO ADDSD4 %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 ADDSD1 %xmm0, %xmm8 movsd 0 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 ADDSD2 %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addq $ 8 * SIZE, BO ADDSD3 %xmm4, %xmm10 movsd 1 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 decq %rax ADDSD4 %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 0 * SIZE(BO), %xmm1 ADDSD1 %xmm5, %xmm12 movsd 2 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 1 * SIZE(BO), %xmm3 jne .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH BRANCH je .L18 ALIGN_4 .L16: ADDSD2 %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD3 %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 ADDSD1 %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 ADDSD2 %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 ADDSD3 %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 2 * SIZE(BO), %xmm1 ADDSD1 %xmm5, %xmm12 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addq $4 * SIZE, AO addq $2 * SIZE, BO decq %rax BRANCH jg .L16 ALIGN_4 .L18: movsd ALPHA_R, %xmm0 movsd ALPHA_I, %xmm1 ADDSD2 %xmm2, %xmm13 ADDSD3 %xmm7, %xmm14 ADDSD4 %xmm6, %xmm15 addsd %xmm11, %xmm8 addsd %xmm9, %xmm10 addsd %xmm15, %xmm12 addsd %xmm13, %xmm14 movaps %xmm8, %xmm9 movaps %xmm10, %xmm11 movaps %xmm12, %xmm13 movaps %xmm14, %xmm15 mulsd %xmm0, 
%xmm8 mulsd %xmm1, %xmm9 mulsd %xmm1, %xmm10 mulsd %xmm0, %xmm11 subsd %xmm10, %xmm8 addsd %xmm11, %xmm9 mulsd %xmm0, %xmm12 mulsd %xmm1, %xmm13 mulsd %xmm1, %xmm14 mulsd %xmm0, %xmm15 subsd %xmm14, %xmm12 addsd %xmm15, %xmm13 #if !defined(TRMMKERNEL) && !defined(BETAZERO) addsd 0 * SIZE(CO1), %xmm8 addsd 1 * SIZE(CO1), %xmm9 addsd 2 * SIZE(CO1), %xmm12 addsd 3 * SIZE(CO1), %xmm13 #endif movsd %xmm8, 0 * SIZE(CO1) movsd %xmm9, 1 * SIZE(CO1) movsd %xmm12, 2 * SIZE(CO1) movsd %xmm13, 3 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 decq I jg .L10 ALIGN_4 .L20: testq $1, M jle .L99 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd 1 * SIZE(AO), %xmm4 xorps %xmm5, %xmm5 movsd 2 * SIZE(AO), %xmm5 xorps %xmm6, %xmm6 movsd 3 * SIZE(AO), %xmm7 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movsd 1 * SIZE(BO), %xmm3 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADDSD2 %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD4 %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 2 * SIZE(BO), %xmm1 ADDSD1 %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm2 ADDSD3 %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 ADDSD2 %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 ADDSD4 %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 4 * SIZE(BO), %xmm1 ADDSD1 %xmm5, %xmm8 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm2 ADDSD3 %xmm7, %xmm10 movsd 7 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm6 movsd 5 * SIZE(BO), %xmm3 ADDSD2 %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD4 %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 6 * SIZE(BO), %xmm1 ADDSD1 %xmm0, %xmm8 movsd 8 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm2 ADDSD3 %xmm4, %xmm10 movsd 9 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm6 movsd 7 * SIZE(BO), %xmm3 ADDSD2 %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 ADDSD4 %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 8 * SIZE(BO), %xmm1 ADDSD1 %xmm5, %xmm8 movsd 10 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm2 ADDSD3 %xmm7, %xmm10 movsd 11 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm6 movsd 9 * SIZE(BO), %xmm3 addq $8 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH BRANCH je .L29 ALIGN_4 .L26: ADDSD2 %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD4 %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 2 * SIZE(BO), %xmm1 mulsd %xmm3, %xmm2 ADDSD1 %xmm0, %xmm8 movsd 2 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 ADDSD3 %xmm4, %xmm10 movsd 3 * SIZE(AO), %xmm4 addq $2 * SIZE, AO addq $2 * SIZE, BO decq %rax BRANCH jg .L26 ALIGN_4 .L29: movsd ALPHA_R, %xmm0 movsd ALPHA_I, 
%xmm1 ADDSD2 %xmm2, %xmm9 ADDSD4 %xmm6, %xmm11 addsd %xmm11, %xmm8 addsd %xmm9, %xmm10 movaps %xmm8, %xmm9 movaps %xmm10, %xmm11 mulsd %xmm0, %xmm8 mulsd %xmm1, %xmm9 mulsd %xmm1, %xmm10 mulsd %xmm0, %xmm11 subsd %xmm10, %xmm8 addsd %xmm11, %xmm9 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addsd 0 * SIZE(CO1), %xmm8 addsd 1 * SIZE(CO1), %xmm9 #endif movsd %xmm8, 0 * SIZE(CO1) movsd %xmm9, 1 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addq $1, KK #endif movq BO, B decq J # j -- jg .L01 ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm_kernel_2x2_barcelona.S000066400000000000000000000767251313527062700227070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbp #define CO2 %rbx #define BB %r12 #define J %r15 #ifndef WINDOWS_ABI #define STACKSIZE 96 #define ALPHA_R 48(%rsp) #define ALPHA_I 56(%rsp) #define OFFSET 64(%rsp) #define KK 72(%rsp) #define KKK 80(%rsp) #else #define STACKSIZE 320 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define ALPHA_R 224(%rsp) #define ALPHA_I 232(%rsp) #define OFFSET 240(%rsp) #define KK 248(%rsp) #define KKK 256(%rsp) #endif #define movlpd movsd #define movapd movups #define movupd movups #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADD1 addpd #define ADD2 addpd #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADD1 subpd #define ADD2 addpd #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define ADD1 addpd #define ADD2 subpd #else #define ADD1 subpd #define ADD2 subpd #endif #define KERNEL1(xx) \ mulpd %xmm1, %xmm0 ;\ ADD1 %xmm0, %xmm8 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ movapd %xmm2, %xmm0 ;\ ADD1 %xmm1, %xmm12 ;\ movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ ADD2 %xmm3, %xmm13 ;\ movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ ADD1 %xmm0, %xmm10 ;\ movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ ADD1 %xmm1, %xmm14 ;\ movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm11 ;\ ADD2 %xmm3, %xmm15 ;\ movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 #define KERNEL2(xx) \ mulpd %xmm1, %xmm0 ;\ ADD1 %xmm0, %xmm8 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ movapd %xmm2, %xmm0 ;\ ADD1 %xmm1, %xmm12 ;\ /*A*/ movapd (AO, %rax, 4), %xmm6 ;\ movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ ADD2 %xmm3, %xmm13 ;\ movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ ADD1 %xmm0, %xmm10 ;\ ADD1 %xmm1, %xmm14 ;\ /**/ movddup (BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm11 ;\ ADD2 %xmm3, %xmm15 ;\ movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL3(xx) \ mulpd %xmm5, %xmm4 ;\ ADD1 %xmm4, %xmm8 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ movapd %xmm2, %xmm4 ;\ ADD1 %xmm5, %xmm12 ;\ movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ ADD2 %xmm3, %xmm13 ;\ movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ ADD1 %xmm4, %xmm10 ;\ movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ ADD1 %xmm5, %xmm14 ;\ movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm11 ;\ ADD2 %xmm3, %xmm15 ;\ movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL4(xx) \ mulpd %xmm5, %xmm4 ;\ ADD1 %xmm4, %xmm8 ;\ mulpd -2 * SIZE(AO, %rax, 4), 
%xmm5 ;\ movapd %xmm2, %xmm4 ;\ ADD1 %xmm5, %xmm12 ;\ /*A*/ movapd 8 * SIZE(AO, %rax, 4), %xmm7 ;\ movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ ADD2 %xmm3, %xmm13 ;\ movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ ADD1 %xmm4, %xmm10 ;\ ADD1 %xmm5, %xmm14 ;\ /**/ movddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm11 ;\ ADD2 %xmm3, %xmm15 ;\ movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm6, %xmm2 #define KERNEL5(xx) \ mulpd %xmm1, %xmm6 ;\ ADD1 %xmm6, %xmm8 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ movapd %xmm2, %xmm6 ;\ ADD1 %xmm1, %xmm12 ;\ movddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm9 ;\ movapd %xmm6, %xmm2 ;\ ADD2 %xmm3, %xmm13 ;\ movddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm6 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ ADD1 %xmm6, %xmm10 ;\ movapd 4 * SIZE(AO, %rax, 4), %xmm6 ;\ ADD1 %xmm1, %xmm14 ;\ movddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm11 ;\ ADD2 %xmm3, %xmm15 ;\ movddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm6, %xmm2 #define KERNEL6(xx) \ mulpd %xmm1, %xmm6 ;\ ADD1 %xmm6, %xmm8 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ movapd %xmm2, %xmm6 ;\ ADD1 %xmm1, %xmm12 ;\ /*A*/ movapd 16 * SIZE(AO, %rax, 4), %xmm0 ;\ movddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm9 ;\ movapd %xmm6, %xmm2 ;\ ADD2 %xmm3, %xmm13 ;\ movddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm6 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ ADD1 %xmm6, %xmm10 ;\ ADD1 %xmm1, %xmm14 ;\ /**/ movddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm11 ;\ ADD2 %xmm3, %xmm15 ;\ movddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm7, %xmm2 #define KERNEL7(xx) \ mulpd %xmm5, %xmm7 ;\ ADD1 %xmm7, %xmm8 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ movapd %xmm2, %xmm7 ;\ ADD1 %xmm5, %xmm12 ;\ movddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm9 ;\ movapd %xmm7, %xmm2 ;\ ADD2 %xmm3, %xmm13 ;\ movddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm7 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ ADD1 %xmm7, %xmm10 ;\ movapd 12 * SIZE(AO, %rax, 4), %xmm7 ;\ ADD1 %xmm5, %xmm14 ;\ movddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm11 ;\ ADD2 %xmm3, %xmm15 ;\ movddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm7, %xmm2 #define KERNEL8(xx) \ mulpd %xmm5, %xmm7 ;\ ADD1 %xmm7, %xmm8 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ movapd %xmm2, %xmm7 ;\ ADD1 %xmm5, %xmm12 ;\ /*A*/ movapd 24 * SIZE(AO, %rax, 4), %xmm4 ;\ movddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm9 ;\ movapd %xmm7, %xmm2 ;\ ADD2 %xmm3, %xmm13 ;\ movddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm7 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ ADD1 %xmm7, %xmm10 ;\ ADD1 %xmm5, %xmm14 ;\ /**/ movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm11 ;\ ADD2 %xmm3, %xmm15 ;\ movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 ;\ addq $8 * SIZE, %rax ;\ #define KERNEL_SUB1(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -14 
* SIZE(AO, %rax, 4), %xmm1 ;\ ADD1 %xmm0, %xmm8 ;\ movapd %xmm2, %xmm0 ;\ ADD1 %xmm1, %xmm12 ;\ movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ ADD2 %xmm3, %xmm13 ;\ movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ ADD1 %xmm0, %xmm10 ;\ movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ ADD1 %xmm1, %xmm14 ;\ movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm11 ;\ ADD2 %xmm3, %xmm15 ;\ movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 #define KERNEL_SUB2(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ ADD1 %xmm0, %xmm8 ;\ movapd %xmm2, %xmm0 ;\ ADD1 %xmm1, %xmm12 ;\ movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm9 ;\ movapd %xmm0, %xmm2 ;\ ADD2 %xmm3, %xmm13 ;\ movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm1, %xmm0 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ ADD1 %xmm0, %xmm10 ;\ movapd (AO, %rax, 4), %xmm0 ;\ ADD1 %xmm1, %xmm14 ;\ movddup (BO, %rax, 4), %xmm1 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm11 ;\ ADD2 %xmm3, %xmm15 ;\ movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL_SUB3(xx) \ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ ADD1 %xmm4, %xmm8 ;\ movapd %xmm2, %xmm4 ;\ ADD1 %xmm5, %xmm12 ;\ movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ ADD2 %xmm3, %xmm13 ;\ movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ ADD1 %xmm4, %xmm10 ;\ movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ ADD1 %xmm5, %xmm14 ;\ movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm11 ;\ ADD2 %xmm3, %xmm15 ;\ movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm4, %xmm2 #define KERNEL_SUB4(xx) \ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ ADD1 %xmm4, %xmm8 ;\ movapd %xmm2, %xmm4 ;\ ADD1 %xmm5, %xmm12 ;\ movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm9 ;\ movapd %xmm4, %xmm2 ;\ ADD2 %xmm3, %xmm13 ;\ movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ mulpd %xmm5, %xmm4 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ ADD1 %xmm4, %xmm10 ;\ ADD1 %xmm5, %xmm14 ;\ mulpd %xmm3, %xmm2 ;\ mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ ADD2 %xmm2, %xmm11 ;\ ADD2 %xmm3, %xmm15 ;\ movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 #if defined(OS_LINUX) && defined(CORE_BARCELONA) && !defined(TRMMKERNEL) .align 32768 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif movaps %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL movsd STACKSIZE + 16(%rsp), %xmm12 
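/* Descriptive note (added): on the non-Windows (System V) path m, n, k, a, b and c arrive
   in registers and alpha in xmm0/xmm1, so only ldc and, for the TRMM kernel, the offset
   argument are fetched from the caller's stack just above the register save area. */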
#endif #endif movq OLD_M, M movq OLD_N, N movlpd %xmm0, ALPHA_R movlpd %xmm1, ALPHA_I #ifdef TRMMKERNEL movlpd %xmm12, OFFSET movlpd %xmm12, KK #ifndef LEFT negq KK #endif #endif subq $-16 * SIZE, A subq $-16 * SIZE, B salq $ZBASE_SHIFT, LDC movq N, J sarq $1, J jle .L100 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a movq K, %rax salq $ZBASE_SHIFT + 1, %rax leaq (B, %rax), BB movq M, I sarq $1, I # i = (m >> 2) jle .L30 ALIGN_4 .L10: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif movapd -16 * SIZE(AO), %xmm0 movddup -16 * SIZE(BO), %xmm1 pxor %xmm8, %xmm8 movddup -15 * SIZE(BO), %xmm3 pxor %xmm9, %xmm9 movapd -8 * SIZE(AO), %xmm4 pxor %xmm10, %xmm10 movddup -8 * SIZE(BO), %xmm5 pxor %xmm11, %xmm11 prefetchw 3 * SIZE(CO1) pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 prefetchw 7 * SIZE(CO2) pxor %xmm14, %xmm14 pxor %xmm15, %xmm15 movapd %xmm0, %xmm2 prefetch -16 * SIZE(BB) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif andq $-8, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO negq %rax NOBRANCH je .L15 ALIGN_4 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) jl .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif testq $4, %rax je .L16 xorq %rax, %rax ALIGN_4 KERNEL_SUB1(16 * 0) KERNEL_SUB2(16 * 0) KERNEL_SUB3(16 * 0) KERNEL_SUB4(16 * 0) subq $-16 * SIZE, BO subq $-16 * SIZE, AO ALIGN_4 .L16: movddup ALPHA_R, %xmm6 movddup ALPHA_I, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax je .L19 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO negq %rax ALIGN_4 .L17: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ADD1 %xmm0, %xmm8 movapd %xmm2, %xmm0 ADD1 %xmm1, %xmm12 movddup -14 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm3, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ADD2 %xmm2, %xmm9 movapd %xmm0, %xmm2 ADD2 %xmm3, %xmm13 movddup -13 * SIZE(BO, %rax, 
4), %xmm3 mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ADD1 %xmm0, %xmm10 movapd -12 * SIZE(AO, %rax, 4), %xmm0 ADD1 %xmm1, %xmm14 movddup -12 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm3, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ADD2 %xmm2, %xmm11 ADD2 %xmm3, %xmm15 movddup -11 * SIZE(BO, %rax, 4), %xmm3 movapd %xmm0, %xmm2 addq $SIZE, %rax jl .L17 ALIGN_4 .L19: prefetch -8 * SIZE(BB) subq $-16 * SIZE, BB #ifndef TRMMKERNEL movupd (CO1), %xmm0 movupd 2 * SIZE(CO1), %xmm2 movupd (CO2), %xmm1 movupd 2 * SIZE(CO2), %xmm3 #endif SHUFPD_1 %xmm9, %xmm9 SHUFPD_1 %xmm11, %xmm11 SHUFPD_1 %xmm13, %xmm13 SHUFPD_1 %xmm15, %xmm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) addsubpd %xmm9, %xmm8 addsubpd %xmm11, %xmm10 addsubpd %xmm13, %xmm12 addsubpd %xmm15, %xmm14 pshufd $0x4e, %xmm8, %xmm9 pshufd $0x4e, %xmm10, %xmm11 pshufd $0x4e, %xmm12, %xmm13 pshufd $0x4e, %xmm14, %xmm15 #else addsubpd %xmm8, %xmm9 addsubpd %xmm10, %xmm11 addsubpd %xmm12, %xmm13 addsubpd %xmm14, %xmm15 movapd %xmm9, %xmm8 pshufd $0x4e, %xmm9, %xmm9 movapd %xmm11, %xmm10 pshufd $0x4e, %xmm11, %xmm11 movapd %xmm13, %xmm12 pshufd $0x4e, %xmm13, %xmm13 movapd %xmm15, %xmm14 pshufd $0x4e, %xmm15, %xmm15 #endif mulpd %xmm6, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm6, %xmm10 mulpd %xmm7, %xmm11 mulpd %xmm6, %xmm12 mulpd %xmm7, %xmm13 mulpd %xmm6, %xmm14 mulpd %xmm7, %xmm15 addsubpd %xmm9, %xmm8 addsubpd %xmm11, %xmm10 addsubpd %xmm13, %xmm12 addsubpd %xmm15, %xmm14 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm2, %xmm12 addpd %xmm1, %xmm10 addpd %xmm3, %xmm14 #endif movlpd %xmm8, (CO1) movhpd %xmm8, 1 * SIZE(CO1) movlpd %xmm12, 2 * SIZE(CO1) movhpd %xmm12, 3 * SIZE(CO1) movlpd %xmm10, (CO2) movhpd %xmm10, 1 * SIZE(CO2) movlpd %xmm14, 2 * SIZE(CO2) movhpd %xmm14, 3 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L10 ALIGN_4 .L30: testq $1, M jle .L99 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd -12 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movddup -16 * SIZE(BO), %xmm1 pxor %xmm10, %xmm10 movddup -15 * SIZE(BO), %xmm5 pxor %xmm11, %xmm11 movddup -8 * SIZE(BO), %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO negq %rax NOBRANCH je .L46 ALIGN_4 .L42: mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 ADD2 %xmm5, %xmm9 movddup -13 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm10 movddup -12 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 movapd -14 * SIZE(AO, %rax, 2), %xmm0 ADD2 %xmm5, %xmm11 movddup -11 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm8 movddup -10 * SIZE(BO, %rax, 4), 
%xmm1 mulpd %xmm0, %xmm5 ADD2 %xmm5, %xmm9 movddup -9 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm10 movddup (BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 movapd -8 * SIZE(AO, %rax, 2), %xmm0 ADD2 %xmm5, %xmm11 movddup -7 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 ADD1 %xmm3, %xmm8 movddup -6 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 ADD2 %xmm5, %xmm9 movddup -5 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 ADD1 %xmm3, %xmm10 movddup -4 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 movapd -10 * SIZE(AO, %rax, 2), %xmm2 ADD2 %xmm5, %xmm11 movddup -3 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 ADD1 %xmm3, %xmm8 movddup -2 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 ADD2 %xmm5, %xmm9 movddup -1 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm2, %xmm3 ADD1 %xmm3, %xmm10 movddup 8 * SIZE(BO, %rax, 4), %xmm3 mulpd %xmm2, %xmm5 movapd -4 * SIZE(AO, %rax, 2), %xmm2 ADD2 %xmm5, %xmm11 movddup 1 * SIZE(BO, %rax, 4), %xmm5 addq $4 * SIZE, %rax BRANCH jl .L42 ALIGN_4 .L46: movddup ALPHA_R, %xmm6 movddup ALPHA_I, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L49 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO negq %rax ALIGN_4 .L47: mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 ADD2 %xmm5, %xmm9 movddup -13 * SIZE(BO, %rax, 4), %xmm5 mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm10 movddup -12 * SIZE(BO, %rax, 4), %xmm1 mulpd %xmm0, %xmm5 movapd -14 * SIZE(AO, %rax, 2), %xmm0 ADD2 %xmm5, %xmm11 movddup -11 * SIZE(BO, %rax, 4), %xmm5 addq $SIZE, %rax jl .L47 ALIGN_4 .L49: #ifndef TRMMKERNEL movupd (CO1), %xmm0 movupd (CO2), %xmm1 #endif SHUFPD_1 %xmm9, %xmm9 SHUFPD_1 %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) addsubpd %xmm9, %xmm8 addsubpd %xmm11, %xmm10 pshufd $0x4e, %xmm8, %xmm9 pshufd $0x4e, %xmm10, %xmm11 #else addsubpd %xmm8, %xmm9 addsubpd %xmm10, %xmm11 movapd %xmm9, %xmm8 pshufd $0x4e, %xmm9, %xmm9 movapd %xmm11, %xmm10 pshufd $0x4e, %xmm11, %xmm11 #endif mulpd %xmm6, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm6, %xmm10 mulpd %xmm7, %xmm11 addsubpd %xmm9, %xmm8 addsubpd %xmm11, %xmm10 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm1, %xmm10 #endif movlpd %xmm8, (CO1) movhpd %xmm8, 1 * SIZE(CO1) movlpd %xmm10, (CO2) movhpd %xmm10, 1 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif movq BO, B leaq (C, LDC, 2), C # c += 2 * ldc decq J # j -- jg .L01 .L100: testq $1, N jle .L999 .L101: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 # coffset1 = c movq A, AO # aoffset = a movq M, I sarq $1, I # i = (m >> 2) jle .L130 ALIGN_4 .L110: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif movddup -16 * SIZE(BO), %xmm1 movddup -15 * SIZE(BO), %xmm5 pxor %xmm8, %xmm8 movddup -12 * SIZE(BO), %xmm3 pxor %xmm9, %xmm9 movapd -16 * SIZE(AO), %xmm0 pxor %xmm12, %xmm12 movapd -8 * SIZE(AO), %xmm4 pxor %xmm13, %xmm13 
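/* Descriptive note (added): 2x1 micro-tile for the single remaining column.  xmm8/xmm12
   accumulate A times the duplicated real part of B, xmm9/xmm13 A times the duplicated
   imaginary part; .L119 recombines the pairs and applies alpha before storing C. */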
prefetchw 3 * SIZE(CO1) movapd %xmm0, %xmm2 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO negq %rax NOBRANCH je .L116 ALIGN_4 .L112: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ADD1 %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 ADD1 %xmm1, %xmm12 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm5, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm5 ADD2 %xmm2, %xmm9 ADD2 %xmm5, %xmm13 movddup -13 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm0, %xmm2 mulpd %xmm1, %xmm0 mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ADD1 %xmm0, %xmm8 movapd (AO, %rax, 4), %xmm0 ADD1 %xmm1, %xmm12 movddup -8 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm5, %xmm2 mulpd -10 * SIZE(AO, %rax, 4), %xmm5 ADD2 %xmm2, %xmm9 ADD2 %xmm5, %xmm13 movddup -11 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm4, %xmm2 mulpd %xmm3, %xmm4 mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ADD1 %xmm4, %xmm8 movapd -4 * SIZE(AO, %rax, 4), %xmm4 ADD1 %xmm3, %xmm12 movddup -10 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm5, %xmm2 mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ADD2 %xmm2, %xmm9 ADD2 %xmm5, %xmm13 movddup -9 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm4, %xmm2 mulpd %xmm3, %xmm4 mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ADD1 %xmm4, %xmm8 movapd 8 * SIZE(AO, %rax, 4), %xmm4 ADD1 %xmm3, %xmm12 movddup -4 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm5, %xmm2 mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ADD2 %xmm2, %xmm9 ADD2 %xmm5, %xmm13 movddup -7 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm0, %xmm2 addq $4 * SIZE, %rax BRANCH jl .L112 ALIGN_4 .L116: movddup ALPHA_R, %xmm6 movddup ALPHA_I, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L119 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L117: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ADD1 %xmm0, %xmm8 movapd -12 * SIZE(AO, %rax, 4), %xmm0 ADD1 %xmm1, %xmm12 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm5, %xmm2 mulpd -14 * SIZE(AO, %rax, 4), %xmm5 ADD2 %xmm2, %xmm9 ADD2 %xmm5, %xmm13 movddup -13 * SIZE(BO, %rax, 2), %xmm5 movapd %xmm0, %xmm2 addq $SIZE, %rax jl .L117 ALIGN_4 .L119: #ifndef TRMMKERNEL movupd (CO1), %xmm0 movupd 2 * SIZE(CO1), %xmm2 #endif SHUFPD_1 %xmm9, %xmm9 SHUFPD_1 %xmm13, %xmm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) addsubpd %xmm9, %xmm8 addsubpd %xmm13, %xmm12 pshufd $0x4e, %xmm8, %xmm9 pshufd $0x4e, %xmm12, %xmm13 #else addsubpd %xmm8, %xmm9 addsubpd %xmm12, %xmm13 movapd %xmm9, %xmm8 pshufd $0x4e, %xmm9, %xmm9 movapd %xmm13, %xmm12 pshufd $0x4e, %xmm13, %xmm13 #endif mulpd %xmm6, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm6, %xmm12 mulpd %xmm7, %xmm13 addsubpd %xmm9, %xmm8 addsubpd %xmm13, %xmm12 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm2, %xmm12 #endif movlpd %xmm8, (CO1) movhpd %xmm8, 1 * SIZE(CO1) movlpd %xmm12, 2 * SIZE(CO1) movhpd %xmm12, 3 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L110 ALIGN_4 .L130: testq $1, M jle .L999 #if !defined(TRMMKERNEL) 
|| \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd -12 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movddup -16 * SIZE(BO), %xmm1 pxor %xmm10, %xmm10 movddup -15 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif andq $-4, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO negq %rax NOBRANCH je .L146 ALIGN_4 .L142: mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm0, %xmm3 movapd -14 * SIZE(AO, %rax, 2), %xmm0 ADD2 %xmm3, %xmm9 movddup -13 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm10 movddup -12 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm0, %xmm3 movapd -8 * SIZE(AO, %rax, 2), %xmm0 ADD2 %xmm3, %xmm11 movddup -11 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm2, %xmm1 ADD1 %xmm1, %xmm8 movddup -10 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm2, %xmm3 movapd -10 * SIZE(AO, %rax, 2), %xmm2 ADD2 %xmm3, %xmm9 movddup -9 * SIZE(BO, %rax, 2), %xmm3 mulpd %xmm2, %xmm1 ADD1 %xmm1, %xmm10 movddup -8 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm2, %xmm3 movapd -4 * SIZE(AO, %rax, 2), %xmm2 ADD2 %xmm3, %xmm11 movddup -7 * SIZE(BO, %rax, 2), %xmm3 addq $4 * SIZE, %rax BRANCH jl .L142 ALIGN_4 .L146: movddup ALPHA_R, %xmm6 movddup ALPHA_I, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L148 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO negq %rax ALIGN_4 .L147: mulpd %xmm0, %xmm1 ADD1 %xmm1, %xmm8 movddup -14 * SIZE(BO, %rax, 2), %xmm1 mulpd %xmm0, %xmm3 movapd -14 * SIZE(AO, %rax, 2), %xmm0 ADD2 %xmm3, %xmm9 movddup -13 * SIZE(BO, %rax, 2), %xmm3 addq $SIZE, %rax jl .L147 ALIGN_4 .L148: #ifndef TRMMKERNEL movupd (CO1), %xmm0 #endif addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 SHUFPD_1 %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) addsubpd %xmm9, %xmm8 pshufd $0x4e, %xmm8, %xmm9 #else addsubpd %xmm8, %xmm9 movapd %xmm9, %xmm8 pshufd $0x4e, %xmm9, %xmm9 #endif mulpd %xmm6, %xmm8 mulpd %xmm7, %xmm9 addsubpd %xmm9, %xmm8 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 #endif movlpd %xmm8, (CO1) movhpd %xmm8, 1 * SIZE(CO1) ALIGN_4 .L999: movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S000066400000000000000000001153101313527062700227430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define J %r14 #define OLD_K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define K %r12 #define BI %rbp #define SP %rbx #define BO1 %rdi #define BO2 %r15 #ifndef WINDOWS_ABI #define STACKSIZE 96 #else #define STACKSIZE 320 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define L_BUFFER_SIZE 8192 #define Ndiv6 24(%rsp) #define Nmod6 32(%rsp) #define N 40(%rsp) #define ALPHA_R 48(%rsp) #define ALPHA_I 56(%rsp) #define OFFSET 64(%rsp) #define KK 72(%rsp) #define KKK 80(%rsp) #define BUFFER1 128(%rsp) #if defined(OS_WINDOWS) #if L_BUFFER_SIZE > 16384 #define STACK_TOUCH \ movl $0, 4096 * 4(%rsp);\ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 12288 #define STACK_TOUCH \ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 8192 #define STACK_TOUCH \ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 4096 #define STACK_TOUCH \ movl $0, 4096 * 1(%rsp); #else #define STACK_TOUCH #endif #else #define STACK_TOUCH #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define VFMADD_R vfmaddpd #define VFMADD_I vfmaddpd #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define VFMADD_R vfnmaddpd #define VFMADD_I vfmaddpd #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define VFMADD_R vfmaddpd #define VFMADD_I vfnmaddpd #else #define VFMADD_R vfnmaddpd #define VFMADD_I vfnmaddpd 
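/* Descriptive note (added): this kernel folds the conjugation handling into the FMA opcode
   itself.  vfmaddpd accumulates the product with a plus sign and vfnmaddpd with a minus
   sign, so the four cases above reproduce the ADD1/ADD2 add/sub patterns used by the
   non-FMA kernels for the conjugation variants selected by the N/T/R/C suffix pair. */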
#endif #define A_PR1 384 #define B_PR1 192 #define KERNEL2x2_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ #define KERNEL2x2_2(xx) \ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ #define KERNEL2x2_3(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ #define KERNEL2x2_4(xx) \ vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ addq $16, BI ;\ addq $16, %rax ;\ #define KERNEL2x2_SUB(xx) \ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ addq $4, BI ;\ addq $4, %rax ;\ /************************************************************************************************/ #define KERNEL1x2_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vmovddup -5 
* SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ #define KERNEL1x2_2(xx) \ vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ #define KERNEL1x2_3(xx) \ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ #define KERNEL1x2_4(xx) \ vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ addq $16, BI ;\ addq $8 , %rax ;\ #define KERNEL1x2_SUB(xx) \ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ addq $4, BI ;\ addq $2, %rax ;\ /************************************************************************************************/ #define KERNEL2x1_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ #define KERNEL2x1_2(xx) \ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ #define KERNEL2x1_3(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ #define KERNEL2x1_4(xx) \ vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ addq $8, BI ;\ addq $16, %rax ;\ #define KERNEL2x1_SUB(xx) \ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ 
vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ addq $2, BI ;\ addq $4, %rax ;\ /************************************************************************************************/ #define KERNEL1x1_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ #define KERNEL1x1_2(xx) \ vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ #define KERNEL1x1_3(xx) \ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ #define KERNEL1x1_4(xx) \ vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ addq $8, BI ;\ addq $8, %rax ;\ #define KERNEL1x1_SUB(xx) \ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ addq $2, BI ;\ addq $2, %rax ;\ /************************************************************************************************/ PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) vmovups %xmm6, 64(%rsp) vmovups %xmm7, 80(%rsp) vmovups %xmm8, 96(%rsp) vmovups %xmm9, 112(%rsp) vmovups %xmm10, 128(%rsp) vmovups %xmm11, 144(%rsp) vmovups %xmm12, 160(%rsp) vmovups %xmm13, 176(%rsp) vmovups %xmm14, 192(%rsp) vmovups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL vmovsd OLD_OFFSET, %xmm12 #endif vmovaps %xmm3, %xmm0 vmovsd OLD_ALPHA_I, %xmm1 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL vmovsd STACKSIZE + 16(%rsp), %xmm12 #endif #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $0, OLD_M je .L999 cmpq $0, OLD_N je .L999 cmpq $0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovsd %xmm0, ALPHA_R vmovsd %xmm1, ALPHA_I salq $ZBASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $2, %rdi divq %rdi // N / 2 movq %rax, Ndiv6 // N / 2 movq %rdx, Nmod6 // N % 2 #ifdef TRMMKERNEL vmovsd %xmm12, OFFSET vmovsd %xmm12, KK #ifndef LEFT negq KK #endif #endif .L2_0: movq Ndiv6, J cmpq $0, J je .L1_0 ALIGN_4 .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L2_02b: vmovups (BO1), %xmm0 vmovups 2 * SIZE(BO1), %xmm1 vmovups %xmm0, (BO) vmovups %xmm1, 2 * SIZE(BO) addq $4*SIZE,BO1 addq $4*SIZE,BO decq %rax jnz .L2_02b .L2_02c: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $8 * SIZE, AO movq M, I sarq $1, I // i = (m >> 1) je .L2_40 ALIGN_4 .L2_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L2_16 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_12: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_16 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_16 jmp .L2_12 ALIGN_4 .L2_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_19 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_17: KERNEL2x2_SUB(xxx) jl .L2_17 ALIGN_4 .L2_19: vmovddup ALPHA_R, %xmm0 vmovddup ALPHA_I, %xmm1 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 vshufpd $0x01, %xmm11, %xmm11, %xmm11 vshufpd $0x01, %xmm13, %xmm13, %xmm13 vshufpd $0x01, %xmm15, %xmm15, %xmm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm11,%xmm10, %xmm10 vaddsubpd %xmm13,%xmm12, %xmm12 vaddsubpd %xmm15,%xmm14, %xmm14 vshufpd $0x01, %xmm8 , %xmm8, %xmm9 vshufpd $0x01, %xmm10, %xmm10, %xmm11 vshufpd $0x01, %xmm12, %xmm12, %xmm13 vshufpd $0x01, %xmm14, %xmm14, %xmm15 #else vaddsubpd %xmm8, %xmm9 ,%xmm9 vaddsubpd %xmm10, %xmm11,%xmm11 vaddsubpd %xmm12, %xmm13,%xmm13 vaddsubpd %xmm14, %xmm15,%xmm15 vmovapd %xmm9, %xmm8 vmovapd %xmm11, %xmm10 vmovapd %xmm13, %xmm12 vmovapd %xmm15, %xmm14 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 vshufpd $0x01, %xmm11, %xmm11, %xmm11 vshufpd $0x01, %xmm13, %xmm13, %xmm13 vshufpd $0x01, %xmm15, %xmm15, %xmm15 #endif // multiply with ALPHA_R vmulpd %xmm8 , %xmm0, %xmm8 vmulpd %xmm10, %xmm0, %xmm10 vmulpd %xmm12, %xmm0, %xmm12 vmulpd %xmm14, %xmm0, %xmm14 // multiply with ALPHA_I vmulpd %xmm9 , %xmm1, %xmm9 vmulpd %xmm11, %xmm1, %xmm11 vmulpd %xmm13, %xmm1, %xmm13 vmulpd %xmm15, %xmm1, %xmm15 vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm11,%xmm10, %xmm10 vaddsubpd %xmm13,%xmm12, %xmm12 vaddsubpd %xmm15,%xmm14, %xmm14 #ifndef TRMMKERNEL vaddpd (CO1), %xmm8 , %xmm8 vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 vaddpd (CO1, LDC), %xmm10, %xmm10 vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 #endif vmovups %xmm8 , (CO1) vmovups %xmm12 , 2 * 
SIZE(CO1) vmovups %xmm10 , (CO1, LDC) vmovups %xmm14 , 2 * SIZE(CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_40: testq $1, M jz .L2_60 // to next 2 lines of N ALIGN_4 .L2_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L2_46 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_42: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 jmp .L2_42 ALIGN_4 .L2_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_49 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_47: KERNEL1x2_SUB(xxx) jl .L2_47 ALIGN_4 .L2_49: vmovddup ALPHA_R, %xmm0 vmovddup ALPHA_I, %xmm1 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 vshufpd $0x01, %xmm11, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm11,%xmm10, %xmm10 vshufpd $0x01, %xmm8 , %xmm8, %xmm9 vshufpd $0x01, %xmm10, %xmm10, %xmm11 #else vaddsubpd %xmm8, %xmm9, %xmm9 vaddsubpd %xmm10,%xmm11, %xmm11 vmovapd %xmm9, %xmm8 vmovapd %xmm11, %xmm10 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 vshufpd $0x01, %xmm11, %xmm11, %xmm11 #endif // multiply with ALPHA_R vmulpd %xmm8 , %xmm0, %xmm8 vmulpd %xmm10, %xmm0, %xmm10 // multiply with ALPHA_I vmulpd %xmm9 , %xmm1, %xmm9 vmulpd %xmm11, %xmm1, %xmm11 
vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm11,%xmm10, %xmm10 #ifndef TRMMKERNEL vaddpd (CO1), %xmm8 , %xmm8 vaddpd (CO1, LDC), %xmm10, %xmm10 #endif vmovups %xmm8 , (CO1) vmovups %xmm10 , (CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L2_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif decq J // j -- jg .L2_01 // next 2 lines of N .L1_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovups (BO1), %xmm0 vmovups %xmm0, (BO) addq $2*SIZE,BO1 addq $2*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $8 * SIZE, AO movq M, I sarq $1, I // i = (m >> 1) je .L1_40 ALIGN_4 .L1_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L1_16 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_12: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_16 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_16 jmp .L1_12 ALIGN_4 .L1_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_19 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_17: KERNEL2x1_SUB(xxx) jl .L1_17 ALIGN_4 .L1_19: vmovddup ALPHA_R, %xmm0 vmovddup ALPHA_I, %xmm1 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 vshufpd $0x01, %xmm13, %xmm13, %xmm13 #if defined(NN) || 
defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm13,%xmm12 , %xmm12 vshufpd $0x01, %xmm8 , %xmm8, %xmm9 vshufpd $0x01, %xmm12, %xmm12, %xmm13 #else vaddsubpd %xmm8, %xmm9 , %xmm9 vaddsubpd %xmm12,%xmm13, %xmm13 vmovapd %xmm9, %xmm8 vmovapd %xmm13, %xmm12 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 vshufpd $0x01, %xmm13, %xmm13, %xmm13 #endif // multiply with ALPHA_R vmulpd %xmm8 , %xmm0, %xmm8 vmulpd %xmm12, %xmm0, %xmm12 // multiply with ALPHA_I vmulpd %xmm9 , %xmm1, %xmm9 vmulpd %xmm13, %xmm1, %xmm13 vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm13, %xmm12, %xmm12 #ifndef TRMMKERNEL vaddpd (CO1), %xmm8 , %xmm8 vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 #endif vmovups %xmm8 , (CO1) vmovups %xmm12 , 2 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L1_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_40: testq $1, M jz .L999 ALIGN_4 .L1_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L1_46 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_42: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 jmp .L1_42 ALIGN_4 .L1_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_49 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_47: KERNEL1x1_SUB(xxx) jl .L1_47 ALIGN_4 .L1_49: vmovddup ALPHA_R, %xmm0 vmovddup ALPHA_I, %xmm1 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) 
|| defined(NC) || defined(TR) || defined(TC) vaddsubpd %xmm9, %xmm8, %xmm8 vshufpd $0x01, %xmm8 , %xmm8, %xmm9 #else vaddsubpd %xmm8, %xmm9, %xmm9 vmovapd %xmm9, %xmm8 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 #endif // multiply with ALPHA_R vmulpd %xmm8 , %xmm0, %xmm8 // multiply with ALPHA_I vmulpd %xmm9 , %xmm1, %xmm9 vaddsubpd %xmm9 ,%xmm8, %xmm8 #ifndef TRMMKERNEL vaddpd (CO1), %xmm8 , %xmm8 #endif vmovups %xmm8 , (CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L999: vzeroupper movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi vmovups 64(%rsp), %xmm6 vmovups 80(%rsp), %xmm7 vmovups 96(%rsp), %xmm8 vmovups 112(%rsp), %xmm9 vmovups 128(%rsp), %xmm10 vmovups 144(%rsp), %xmm11 vmovups 160(%rsp), %xmm12 vmovups 176(%rsp), %xmm13 vmovups 192(%rsp), %xmm14 vmovups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm_kernel_2x2_core2.S000066400000000000000000000642661313527062700217700ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define ALPHA_R 0(%rsp) #define ALPHA_I 16(%rsp) #define J 32(%rsp) #define OFFSET 40(%rsp) #define KK 48(%rsp) #define KKK 56(%rsp) #define BUFFER 128(%rsp) #define PREFETCH_R (8 * 4 + 0) #define PREFETCH_W (PREFETCH_R * 2) #define PREFETCHSIZE (8 * 13 + 5) #define PREFETCH prefetcht0 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADD1 addpd #define ADD2 addpd #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define ADD1 addpd #define ADD2 subpd #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADD1 subpd #define ADD2 addpd #else #define ADD1 subpd #define ADD2 subpd #endif #define ADDSUB subpd PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif movaps %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif #endif movq %rsp, %r15 # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movddup %xmm0, %xmm0 movddup %xmm1, %xmm1 movapd %xmm0, ALPHA_R movapd %xmm1, ALPHA_I subq $-16 * SIZE, A subq $-16 * SIZE, B movq OLD_M, M movq OLD_N, N #ifdef TRMMKERNEL movsd %xmm12, OFFSET movsd %xmm12, KK #ifndef LEFT negq KK #endif #endif salq $ZBASE_SHIFT, LDC movq N, J sarq $1, J # j = (n >> 2) NOBRANCH jle .L100 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq 16 * SIZE + BUFFER, BO movapd -16 * SIZE(B), %xmm0 movapd -8 * SIZE(B), %xmm4 movq K, %rax sarq $2, %rax jle .L03 ALIGN_3 .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) prefetcht0 (PREFETCH_R + 8) * SIZE(B) movapd -14 * SIZE(B), %xmm1 movapd -12 * SIZE(B), %xmm2 movapd -10 * SIZE(B), %xmm3 movapd -6 * SIZE(B), %xmm5 movapd -4 * SIZE(B), %xmm6 movapd -2 * SIZE(B), %xmm7 movddup %xmm0, %xmm8 movapd %xmm8, -16 * SIZE(BO) unpckhpd %xmm0, %xmm0 movapd %xmm0, -14 * SIZE(BO) movapd 0 * SIZE(B), %xmm0 prefetcht0 (PREFETCH_W + 0) * SIZE(BO) movddup %xmm1, %xmm9 movapd %xmm9, -12 * SIZE(BO) unpckhpd %xmm1, %xmm1 movapd %xmm1, -10 * SIZE(BO) movddup %xmm2, %xmm10 movapd %xmm10, -8 * SIZE(BO) prefetcht0 (PREFETCH_W + 8) * SIZE(BO) unpckhpd %xmm2, %xmm2 movapd %xmm2, -6 * SIZE(BO) movddup %xmm3, %xmm11 movapd %xmm11, -4 * SIZE(BO) unpckhpd %xmm3, %xmm3 movapd 
%xmm3, -2 * SIZE(BO) prefetcht0 (PREFETCH_W + 16) * SIZE(BO) movddup %xmm4, %xmm12 movapd %xmm12, 0 * SIZE(BO) unpckhpd %xmm4, %xmm4 movapd %xmm4, 2 * SIZE(BO) movapd 8 * SIZE(B), %xmm4 movddup %xmm5, %xmm13 movapd %xmm13, 4 * SIZE(BO) unpckhpd %xmm5, %xmm5 movapd %xmm5, 6 * SIZE(BO) prefetcht0 (PREFETCH_W + 24) * SIZE(BO) movddup %xmm6, %xmm14 movapd %xmm14, 8 * SIZE(BO) unpckhpd %xmm6, %xmm6 movapd %xmm6, 10 * SIZE(BO) movddup %xmm7, %xmm15 movapd %xmm15, 12 * SIZE(BO) unpckhpd %xmm7, %xmm7 movapd %xmm7, 14 * SIZE(BO) subq $-32 * SIZE, BO subq $-16 * SIZE, B decq %rax jne .L02 ALIGN_3 .L03: movq K, %rax andq $3, %rax BRANCH jle .L05 ALIGN_3 .L04: movapd -14 * SIZE(B), %xmm1 movddup %xmm0, %xmm8 unpckhpd %xmm0, %xmm0 movddup %xmm1, %xmm9 unpckhpd %xmm1, %xmm1 movapd %xmm8, -16 * SIZE(BO) movapd %xmm0, -14 * SIZE(BO) movapd -12 * SIZE(B), %xmm0 movapd %xmm9, -12 * SIZE(BO) movapd %xmm1, -10 * SIZE(BO) addq $ 4 * SIZE, B addq $ 8 * SIZE, BO decq %rax jne .L04 ALIGN_3 .L05: leaq (PREFETCH_R + 0) * SIZE(B), BB movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a movq M, I sarq $1, I # i = (m >> 2) jle .L30 ALIGN_4 .L10: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 19 * SIZE + BUFFER, BO #else leaq 19 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 movaps -19 * SIZE(BO), %xmm6 movaps -17 * SIZE(BO), %xmm7 prefetcht2 0 * SIZE(BB) pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 prefetcht2 8 * SIZE(BB) pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 pxor %xmm12, %xmm12 prefetcht0 3 * SIZE(CO1) pxor %xmm13, %xmm13 pxor %xmm14, %xmm14 pxor %xmm15, %xmm15 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 prefetcht0 3 * SIZE(CO2) pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 subq $-16 * SIZE, BB #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_4 .L12: PADDING; ADD1 %xmm2, %xmm10 movaps -15 * SIZE(BO), %xmm2 PADDING; ADD1 %xmm3, %xmm14 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps %xmm6, %xmm3 mulpd %xmm0, %xmm6 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm11 movaps -13 * SIZE(BO), %xmm4 ADD2 %xmm5, %xmm15 movaps %xmm7, %xmm5 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm5 ADD1 %xmm6, %xmm8 movaps -11 * SIZE(BO), %xmm6 ADD1 %xmm3, %xmm12 movaps %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm7, %xmm9 movaps -9 * SIZE(BO), %xmm7 ADD2 %xmm5, %xmm13 movaps %xmm4, %xmm5 mulpd %xmm0, %xmm4 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 movaps -10 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm10 movaps -7 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm14 movaps %xmm6, %xmm3 mulpd %xmm0, %xmm6 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm11 movaps -5 * SIZE(BO), %xmm4 ADD2 %xmm5, %xmm15 movaps %xmm7, %xmm5 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm5 ADD1 %xmm6, %xmm8 movaps -3 * SIZE(BO), %xmm6 ADD1 %xmm3, %xmm12 movaps %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm7, %xmm9 movaps -1 * SIZE(BO), %xmm7 ADD2 %xmm5, %xmm13 movaps %xmm4, %xmm5 mulpd %xmm0, %xmm4 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 movaps -6 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm10 movaps 1 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm14 movaps %xmm6, %xmm3 mulpd %xmm0, %xmm6 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm11 movaps 
3 * SIZE(BO), %xmm4 ADD2 %xmm5, %xmm15 PADDING movaps %xmm7, %xmm5 mulpd %xmm1, %xmm5 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm0, %xmm7 ADD1 %xmm6, %xmm8 movaps 5 * SIZE(BO), %xmm6 ADD1 %xmm3, %xmm12 movaps %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm7, %xmm9 movaps 7 * SIZE(BO), %xmm7 ADD2 %xmm5, %xmm13 movaps %xmm4, %xmm5 mulpd %xmm0, %xmm4 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 movaps -2 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm10 movaps 9 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm14 movaps %xmm6, %xmm3 mulpd %xmm0, %xmm6 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm11 subq $-16 * SIZE, AO movaps 11 * SIZE(BO), %xmm4 ADD2 %xmm5, %xmm15 movaps %xmm7, %xmm5 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm5 ADD1 %xmm6, %xmm8 movaps 13 * SIZE(BO), %xmm6 ADD1 %xmm3, %xmm12 movaps %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm7, %xmm9 movaps 15 * SIZE(BO), %xmm7 ADD2 %xmm5, %xmm13 subq $-32 * SIZE, BO movaps %xmm4, %xmm5 mulpd %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 movaps -14 * SIZE(AO), %xmm1 subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: prefetcht2 -8 * SIZE(BB) #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH BRANCH je .L19 ALIGN_4 .L16: ADD1 %xmm2, %xmm10 movaps -15 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm14 movaps %xmm6, %xmm3 mulpd %xmm0, %xmm6 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm11 movaps -13 * SIZE(BO), %xmm4 ADD2 %xmm5, %xmm15 movaps %xmm7, %xmm5 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm5 ADD1 %xmm6, %xmm8 movaps -11 * SIZE(BO), %xmm6 ADD1 %xmm3, %xmm12 addq $4 * SIZE, AO movaps %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm7, %xmm9 movaps -9 * SIZE(BO), %xmm7 ADD2 %xmm5, %xmm13 addq $8 * SIZE, BO movaps %xmm4, %xmm5 mulpd %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm5 movaps -14 * SIZE(AO), %xmm1 subq $1, %rax BRANCH jg .L16 ALIGN_3 .L19: movapd ALPHA_R, %xmm6 ADD1 %xmm2, %xmm10 ADD1 %xmm3, %xmm14 movapd ALPHA_I, %xmm7 ADD2 %xmm4, %xmm11 ADD2 %xmm5, %xmm15 SHUFPD_1 %xmm9, %xmm9 SHUFPD_1 %xmm11, %xmm11 SHUFPD_1 %xmm13, %xmm13 SHUFPD_1 %xmm15, %xmm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) addsubpd %xmm9, %xmm8 addsubpd %xmm11, %xmm10 addsubpd %xmm13, %xmm12 addsubpd %xmm15, %xmm14 movapd %xmm8, %xmm9 movapd %xmm10, %xmm11 movapd %xmm12, %xmm13 movapd %xmm14, %xmm15 #else addsubpd %xmm8, %xmm9 addsubpd %xmm10, %xmm11 addsubpd %xmm12, %xmm13 addsubpd %xmm14, %xmm15 movapd %xmm9, %xmm8 movapd %xmm11, %xmm10 movapd %xmm13, %xmm12 movapd %xmm15, %xmm14 #endif #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm2 movhpd 3 * SIZE(CO1), %xmm2 movsd 0 * SIZE(CO2), %xmm1 movhpd 1 * SIZE(CO2), %xmm1 movsd 2 * SIZE(CO2), %xmm3 movhpd 3 * SIZE(CO2), %xmm3 #endif SHUFPD_1 %xmm9, %xmm9 SHUFPD_1 %xmm11, %xmm11 SHUFPD_1 %xmm13, %xmm13 SHUFPD_1 %xmm15, %xmm15 mulpd %xmm6, %xmm8 mulpd %xmm6, %xmm10 mulpd %xmm6, %xmm12 mulpd %xmm6, %xmm14 mulpd %xmm7, %xmm9 mulpd %xmm7, %xmm11 mulpd %xmm7, %xmm13 mulpd %xmm7, %xmm15 addsubpd %xmm9, %xmm8 addsubpd %xmm11, %xmm10 addsubpd %xmm13, %xmm12 addsubpd %xmm15, %xmm14 #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm0, %xmm8 addpd %xmm1, %xmm10 addpd %xmm2, %xmm12 addpd %xmm3, %xmm14 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm12, 2 * SIZE(CO1) movhpd %xmm12, 3 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd %xmm14, 2 * SIZE(CO2) movhpd %xmm14, 3 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L10 ALIGN_4 .L30: testq $1, M jle .L99 .L40: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L42 .L41: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 movapd -14 * SIZE(AO), %xmm0 movapd -8 * SIZE(BO), %xmm2 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm4 movapd -2 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 movapd -12 * SIZE(AO), %xmm0 movapd 0 * SIZE(BO), %xmm2 movapd 2 * SIZE(BO), %xmm3 movapd 4 * SIZE(BO), %xmm4 movapd 6 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 movapd -10 * SIZE(AO), %xmm0 movapd 8 * SIZE(BO), %xmm2 movapd 10 * SIZE(BO), %xmm3 movapd 12 * SIZE(BO), %xmm4 movapd 14 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 subq $ -8 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax jne .L41 .L42: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH jle .L44 .L43: movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax jg .L43 ALIGN_4 .L44: movapd ALPHA_R, %xmm6 movapd ALPHA_I, %xmm7 SHUFPD_1 %xmm9, %xmm9 SHUFPD_1 %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) addsubpd %xmm9, %xmm8 addsubpd %xmm11, %xmm10 movapd %xmm8, %xmm9 movapd %xmm10, %xmm11 #else addsubpd %xmm8, %xmm9 addsubpd %xmm10, %xmm11 movapd 
%xmm9, %xmm8 movapd %xmm11, %xmm10 #endif #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm1 movhpd 1 * SIZE(CO2), %xmm1 #endif SHUFPD_1 %xmm9, %xmm9 SHUFPD_1 %xmm11, %xmm11 mulpd %xmm6, %xmm8 mulpd %xmm6, %xmm10 mulpd %xmm7, %xmm9 mulpd %xmm7, %xmm11 addsubpd %xmm9, %xmm8 addsubpd %xmm11, %xmm10 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm0, %xmm8 addpd %xmm1, %xmm10 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leaq (C, LDC, 2), C # c += 2 * ldc decq J # j -- jg .L01 .L100: testq $1, N jle .L999 .L101: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $2, %rax jle .L103 ALIGN_4 .L102: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 movddup -14 * SIZE(B), %xmm10 movddup -13 * SIZE(B), %xmm11 movddup -12 * SIZE(B), %xmm12 movddup -11 * SIZE(B), %xmm13 movddup -10 * SIZE(B), %xmm14 movddup -9 * SIZE(B), %xmm15 movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) movapd %xmm10, 4 * SIZE(BO) movapd %xmm11, 6 * SIZE(BO) movapd %xmm12, 8 * SIZE(BO) movapd %xmm13, 10 * SIZE(BO) movapd %xmm14, 12 * SIZE(BO) movapd %xmm15, 14 * SIZE(BO) addq $ 8 * SIZE, B subq $-16 * SIZE, BO decq %rax jne .L102 ALIGN_4 .L103: movq K, %rax andq $3, %rax BRANCH jle .L105 ALIGN_4 .L104: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) addq $4 * SIZE, BO addq $2 * SIZE, B decq %rax jne .L104 ALIGN_4 .L105: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a movq M, I sarq $1, I # i = (m >> 2) jle .L130 ALIGN_4 .L110: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 prefetcht0 3 * SIZE(CO1) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L112 .L111: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -14 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 movapd -12 * SIZE(AO), %xmm0 movapd -10 * SIZE(AO), %xmm1 movapd -12 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -10 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 movapd -8 * SIZE(AO), %xmm0 movapd -6 * SIZE(AO), %xmm1 movapd -8 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 
movapd -6 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 movapd -4 * SIZE(AO), %xmm0 movapd -2 * SIZE(AO), %xmm1 movapd -4 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -2 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 subq $-16 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jne .L111 ALIGN_4 .L112: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH jle .L114 .L113: movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -14 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L113 ALIGN_4 .L114: movapd ALPHA_R, %xmm6 movapd ALPHA_I, %xmm7 SHUFPD_1 %xmm9, %xmm9 SHUFPD_1 %xmm13, %xmm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) addsubpd %xmm9, %xmm8 addsubpd %xmm13, %xmm12 movapd %xmm8, %xmm9 movapd %xmm12, %xmm13 #else addsubpd %xmm8, %xmm9 addsubpd %xmm12, %xmm13 movapd %xmm9, %xmm8 movapd %xmm13, %xmm12 #endif #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm2 movhpd 3 * SIZE(CO1), %xmm2 #endif SHUFPD_1 %xmm9, %xmm9 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm6, %xmm8 mulpd %xmm6, %xmm12 mulpd %xmm7, %xmm9 mulpd %xmm7, %xmm13 addsubpd %xmm9, %xmm8 addsubpd %xmm13, %xmm12 #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm0, %xmm8 addpd %xmm2, %xmm12 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm12, 2 * SIZE(CO1) movhpd %xmm12, 3 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L110 ALIGN_4 .L130: testq $1, M jle .L999 ALIGN_4 .L140: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L142 .L141: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 movapd -12 * SIZE(AO), %xmm0 movapd -10 * SIZE(AO), %xmm1 movapd -8 * SIZE(BO), %xmm2 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm4 movapd -2 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 subq $ -8 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jne .L141 .L142: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH jle .L144 .L143: movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L143 ALIGN_4 .L144: movapd ALPHA_R, %xmm6 movapd ALPHA_I, %xmm7 addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 SHUFPD_1 %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) addsubpd %xmm9, %xmm8 movapd %xmm8, %xmm9 #else addsubpd %xmm8, %xmm9 movapd %xmm9, %xmm8 #endif #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 #endif SHUFPD_1 %xmm9, %xmm9 mulpd %xmm6, %xmm8 mulpd %xmm7, %xmm9 addsubpd %xmm9, %xmm8 #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm0, %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) ALIGN_4 .L999: movq %r15, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm_kernel_2x2_penryn.S000066400000000000000000000614561313527062700222670ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define BB %r12 #define PREA %rdx #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define ALPHA_R 48(%rsp) #define ALPHA_I 56(%rsp) #define J 64(%rsp) #define OFFSET 72(%rsp) #define KK 80(%rsp) #define KKK 88(%rsp) #else #define STACKSIZE 512 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define ALPHA_R 224(%rsp) #define ALPHA_I 232(%rsp) #define J 240(%rsp) #define OFFSET 248(%rsp) #define KK 256(%rsp) #define KKK 264(%rsp) #endif #ifdef NANO #define PREFETCHSIZE (8 * 2 + 4) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 #endif #ifdef DUNNINGTON #define PREFETCHSIZE (8 * 81 + 4) #endif #ifndef PREFETCH #define PREFETCH prefetcht0 #endif #ifndef PREFETCHW #define PREFETCHW prefetcht2 #endif #ifndef PREFETCHB #define PREFETCHB prefetcht0 #endif #ifndef PREFETCHSIZE #define PREFETCHSIZE (8 * 17 + 4) #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADD1 addpd #define ADD2 addpd #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define ADD1 addpd #define ADD2 addpd #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADD1 addpd #define ADD2 addpd #else #define ADD1 addpd #define ADD2 subpd #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif movaps %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif #endif movlps %xmm0, ALPHA_R movlps %xmm1, ALPHA_I subq $-16 * SIZE, A subq $-17 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K salq $ZBASE_SHIFT, LDC #ifdef TRMMKERNEL movq %r11, OFFSET #ifndef LEFT negq %r11 #endif movq %r11, KK #endif movq N, J sarq $1, J NOBRANCH jle .L40 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC, 1), CO2 movq A, AO movq K, %rax salq $ZBASE_SHIFT + 1, %rax leaq (B, %rax), BB movq M, I sarq $1, I NOBRANCH jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif movaps -16 * SIZE(AO), %xmm0 xorpd %xmm3, %xmm3 movaps -14 * SIZE(AO), %xmm1 xorpd %xmm4, %xmm4 movaps -17 * SIZE(BO), %xmm2 PREFETCHB -16 * SIZE(BB) xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 movaps %xmm4, %xmm8 movaps %xmm4, %xmm9 
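/* Prefetch the C output tile (the CO1 and CO2 rows written back after the  */
/* K loop), interleaved with clearing the xmm8-xmm15 accumulators that hold */
/* the 2x2 complex micro-tile.                                              */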
PREFETCHW 3 * SIZE(CO1) movaps %xmm4, %xmm10 movaps %xmm4, %xmm11 movaps %xmm4, %xmm12 movaps %xmm4, %xmm13 PREFETCHW 3 * SIZE(CO2) movaps %xmm4, %xmm14 movaps %xmm4, %xmm15 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm3, %xmm12 movaps -15 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps -13 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps -11 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps -9 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -6 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps -7 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 PADDING PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) ADD1 %xmm2, %xmm8 movaps -5 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -2 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 subq $-16 * SIZE, AO movaps -3 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps -1 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 subq $-16 * SIZE, BO mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -16 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -14 * SIZE(AO), %xmm1 subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: PREFETCHB -8 * SIZE(BB) #ifdef DUNNINGTON PREFETCHB 0 * SIZE(BB) PREFETCHB 8 * SIZE(BB) #endif #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: ADD1 %xmm3, %xmm12 movaps -15 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps -13 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 
mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: #ifndef DUNNINGTON subq $-16 * SIZE, BB #else subq $-32 * SIZE, BB #endif ADD1 %xmm3, %xmm12 pcmpeqb %xmm0, %xmm0 ADD1 %xmm4, %xmm14 psllq $63, %xmm0 ADD2 %xmm5, %xmm13 movddup ALPHA_R, %xmm2 ADD2 %xmm6, %xmm15 movddup ALPHA_I, %xmm3 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 xorps %xmm0, %xmm12 xorps %xmm0, %xmm14 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0x04, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 xorps %xmm0, %xmm13 xorps %xmm0, %xmm15 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 xorps %xmm0, %xmm13 xorps %xmm0, %xmm15 #endif haddpd %xmm9, %xmm8 haddpd %xmm11, %xmm10 haddpd %xmm13, %xmm12 haddpd %xmm15, %xmm14 pshufd $0x4e, %xmm8, %xmm9 pshufd $0x4e, %xmm10, %xmm11 pshufd $0x4e, %xmm12, %xmm13 pshufd $0x4e, %xmm14, %xmm15 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm9 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm11 mulpd %xmm2, %xmm12 mulpd %xmm3, %xmm13 mulpd %xmm2, %xmm14 mulpd %xmm3, %xmm15 addsubpd %xmm9, %xmm8 addsubpd %xmm11, %xmm10 addsubpd %xmm13, %xmm12 addsubpd %xmm15, %xmm14 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 movsd 0 * SIZE(CO2), %xmm2 movhpd 1 * SIZE(CO2), %xmm2 movsd 2 * SIZE(CO2), %xmm3 movhpd 3 * SIZE(CO2), %xmm3 addpd %xmm0, %xmm8 addpd %xmm1, %xmm10 addpd %xmm2, %xmm12 addpd %xmm3, %xmm14 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm10, 2 * SIZE(CO1) movhpd %xmm10, 3 * SIZE(CO1) movsd %xmm12, 0 * SIZE(CO2) movhpd %xmm12, 1 * SIZE(CO2) movsd %xmm14, 2 * SIZE(CO2) movhpd %xmm14, 3 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- BRANCH jg .L11 ALIGN_4 .L20: testq $1, M BRANCH jle .L39 ALIGN_4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif movaps -16 * SIZE(AO), %xmm0 movaps -17 * SIZE(BO), %xmm2 movaps -15 * SIZE(BO), %xmm3 xorps %xmm3, %xmm3 xorps %xmm5, %xmm5 movaps %xmm3, %xmm8 movaps %xmm3, %xmm9 movaps %xmm3, %xmm12 movaps %xmm3, %xmm13 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_4 .L22: ADD1 %xmm3, %xmm12 movaps -15 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm0, %xmm2 ADD2 %xmm5, %xmm13 mulpd %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -13 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 ADD2 %xmm7, %xmm9 mulpd %xmm0, %xmm5 movaps -14 * SIZE(AO), %xmm0 ADD1 %xmm3, %xmm12 movaps -11 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, 
%xmm7 mulpd %xmm0, %xmm2 ADD2 %xmm5, %xmm13 mulpd %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -9 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 ADD2 %xmm7, %xmm9 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 ADD1 %xmm3, %xmm12 movaps -7 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 ADD2 %xmm5, %xmm13 mulpd %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -5 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 ADD2 %xmm7, %xmm9 mulpd %xmm0, %xmm5 movaps -10 * SIZE(AO), %xmm0 ADD1 %xmm3, %xmm12 movaps -3 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 ADD2 %xmm5, %xmm13 mulpd %xmm0, %xmm7 subq $ -8 * SIZE, AO ADD1 %xmm2, %xmm8 movaps -1 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 ADD2 %xmm7, %xmm9 mulpd %xmm0, %xmm5 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: ADD1 %xmm3, %xmm12 movaps -15 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 ADD2 %xmm5, %xmm13 mulpd %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -13 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 ADD2 %xmm7, %xmm9 mulpd %xmm0, %xmm5 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_4 .L28: ADD1 %xmm3, %xmm12 pcmpeqb %xmm0, %xmm0 ADD2 %xmm5, %xmm13 psllq $63, %xmm0 movddup ALPHA_R, %xmm2 movddup ALPHA_I, %xmm3 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm8 xorps %xmm0, %xmm12 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0x04, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm13 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm13 #endif haddpd %xmm9, %xmm8 haddpd %xmm13, %xmm12 pshufd $0x4e, %xmm8, %xmm9 pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm9 mulpd %xmm2, %xmm12 mulpd %xmm3, %xmm13 addsubpd %xmm9, %xmm8 addsubpd %xmm13, %xmm12 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm2 movhpd 1 * SIZE(CO2), %xmm2 addpd %xmm0, %xmm8 addpd %xmm2, %xmm12 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm12, 0 * SIZE(CO2) movhpd %xmm12, 1 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif leaq (C, LDC, 2), C movq BO, B subq $1, J BRANCH jg .L01 ALIGN_4 .L40: testq $1, N BRANCH jle .L999 movq C, CO1 leaq (C, LDC, 1), CO2 movq A, AO #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq M, I sarq $1, I # i = (m >> 2) NOBRANCH jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 movaps -17 * SIZE(BO), %xmm2 
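/*
 * For the TRMM variant, the #else branch above pre-advances AO and BO past the
 * first KK iterations of the inner product that this micro-tile skips.
 * ZBASE_SHIFT converts a count of double-complex elements into bytes (16 bytes
 * each), and the scale factors 2 and 1 reflect the two packed rows of A versus
 * the single column of B handled in this n-remainder path.  A rough C
 * equivalent, in units of doubles (illustrative only; trmm_advance is not an
 * OpenBLAS function):
 *
 *     static void trmm_advance(double **ao, double **bo, long kk)
 *     {
 *         *ao += 4 * kk;   // 2 rows   * 2 doubles per complex element
 *         *bo += 2 * kk;   // 1 column * 2 doubles per complex element
 *     }
 */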
PREFETCHW 3 * SIZE(CO1) xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm12, %xmm12 xorps %xmm13, %xmm13 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_4 .L52: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm8 movaps -15 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm12 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm13 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -6 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm8 movaps -13 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm12 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm13 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -2 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm8 movaps -11 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm12 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm13 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 2 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm8 movaps -9 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm12 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm13 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax BRANCH jg .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm8 movaps -15 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm12 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm13 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_4 .L58: pcmpeqb %xmm0, %xmm0 movddup ALPHA_R, %xmm2 psllq $63, %xmm0 movddup ALPHA_I, %xmm3 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm8 xorps %xmm0, %xmm12 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0x04, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm13 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm13 #endif haddpd %xmm9, %xmm8 haddpd %xmm13, %xmm12 pshufd $0x4e, %xmm8, %xmm9 pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm9 mulpd %xmm2, %xmm12 mulpd %xmm3, %xmm13 addsubpd %xmm9, %xmm8 addsubpd %xmm13, %xmm12 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 movhpd 3 * SIZE(CO1), %xmm1 addpd %xmm0, %xmm8 addpd %xmm1, %xmm12 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm12, 2 * SIZE(CO1) movhpd %xmm12, 3 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #if 
defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 decq I BRANCH jg .L51 ALIGN_4 .L60: testq $1, M BRANCH jle .L79 ALIGN_4 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif movaps -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movaps -17 * SIZE(BO), %xmm2 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_4 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -14 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm8 ADD2 %xmm7, %xmm9 movaps -15 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm10 ADD2 %xmm7, %xmm11 movaps -13 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -10 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm8 ADD2 %xmm7, %xmm9 movaps -11 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -8 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm10 ADD2 %xmm7, %xmm11 movaps -9 * SIZE(BO), %xmm2 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -14 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm8 ADD2 %xmm7, %xmm9 movaps -15 * SIZE(BO), %xmm2 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_4 .L68: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 pcmpeqb %xmm0, %xmm0 movddup ALPHA_R, %xmm2 psllq $63, %xmm0 movddup ALPHA_I, %xmm3 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm8 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0x04, %xmm0, %xmm0 xorps %xmm0, %xmm9 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm9 #endif haddpd %xmm9, %xmm8 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm9 addsubpd %xmm9, %xmm8 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 addpd %xmm0, %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 ALIGN_4 .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addq $1, KK #endif addq LDC, C movq BO, B ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), 
%xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm_kernel_2x2_piledriver.S000066400000000000000000001163721313527062700231170ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /********************************************************************* * * 2014/06/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * * 2013/10/30 Saar * * Parameter: * UNROLL_M 2 * UNROLL_N 2 * ZGEMM_P 384 * ZGEMM_Q 168 * A_PR1 512 * B_PR1 256 * * Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): * * 3456x3456 82.4 GFLOPS with 8 threads on 4 modules (ACML: 76.3 ) (BULLDOZER: 81.0 ) * 3456x3456 79.9 GFLOPS with 4 threads on 4 modules (ACML: 69.9 ) (BULLDOZER: 74.6 ) * 3456x3456 40.4 GFLOPS with 2 threads on 2 modules (ACML: 35.8 ) (BULLDOZER: 37.9 ) * 3456x3456 20.3 GFLOPS with 1 threads on 1 modules (ACML: 18.1 ) (BULLDOZER: 19.2 ) * * Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): * * 6912x6912 227.5 GFLOPS with 32 threads on 16 modules (ACML: 166.3 ) (BULLDOZER: 228.5 ) * 6912x6912 211.6 GFLOPS with 16 threads on 16 modules (ACML: 169.5 ) (BULLDOZER: 204.3 ) * 6912x6912 123.5 GFLOPS with 8 threads on 8 modules (ACML: 92.7 ) (BULLDOZER: 117.0 ) * 3456x3456 64.1 GFLOPS with 4 threads on 4 modules (ACML: 49.1 ) (BULLDOZER: 61.7 ) * 3456x3456 33.4 GFLOPS with 2 threads on 2 modules (ACML: 28.1 ) (BULLDOZER: 30.9 ) * 3456x3456 17.0 GFLOPS with 1 threads on 1 modules (ACML: 15.2 ) (BULLDOZER: 15.7 ) * *********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define J %r14 #define OLD_K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define K %r12 #define BI %rbp #define SP %rbx #define BO1 %rdi #define BO2 %r15 #ifndef WINDOWS_ABI #define STACKSIZE 96 #else #define STACKSIZE 320 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define L_BUFFER_SIZE 256*8*4 #define Ndiv6 24(%rsp) #define Nmod6 32(%rsp) #define N 40(%rsp) #define ALPHA_R 48(%rsp) #define ALPHA_I 56(%rsp) #define OFFSET 64(%rsp) #define KK 72(%rsp) #define KKK 80(%rsp) #define BUFFER1 128(%rsp) #if defined(OS_WINDOWS) #if L_BUFFER_SIZE > 16384 #define STACK_TOUCH \ movl $0, 4096 * 4(%rsp);\ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 12288 #define STACK_TOUCH \ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 8192 #define STACK_TOUCH \ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 4096 #define STACK_TOUCH \ movl $0, 4096 * 1(%rsp); #else #define STACK_TOUCH #endif #else #define STACK_TOUCH #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define VFMADD_R vfmaddpd #define VFMADD_I vfmaddpd #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define VFMADD_R vfnmaddpd #define VFMADD_I vfmaddpd #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define VFMADD_R vfmaddpd #define VFMADD_I vfnmaddpd #else #define VFMADD_R vfnmaddpd #define VFMADD_I vfnmaddpd #endif #define A_PR1 512 #define B_PR1 256 #define KERNEL2x2_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ 
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ #define KERNEL2x2_2(xx) \ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ #define KERNEL2x2_3(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ #define KERNEL2x2_4(xx) \ vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ addq $16, BI ;\ addq $16, %rax ;\ #define KERNEL2x2_SUB(xx) \ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ addq $4, BI ;\ addq $4, %rax ;\ /************************************************************************************************/ #define KERNEL1x2_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ #define KERNEL1x2_2(xx) \ vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vmovddup -2 * 
SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ #define KERNEL1x2_3(xx) \ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ #define KERNEL1x2_4(xx) \ vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ addq $16, BI ;\ addq $8 , %rax ;\ #define KERNEL1x2_SUB(xx) \ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ addq $4, BI ;\ addq $2, %rax ;\ /************************************************************************************************/ #define KERNEL2x1_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ #define KERNEL2x1_2(xx) \ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ #define KERNEL2x1_3(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ #define KERNEL2x1_4(xx) \ vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ addq $8, BI ;\ addq $16, %rax ;\ #define KERNEL2x1_SUB(xx) \ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ addq $2, BI ;\ addq $4, %rax ;\ /************************************************************************************************/ #define KERNEL1x1_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ 
vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ #define KERNEL1x1_2(xx) \ vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ #define KERNEL1x1_3(xx) \ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ #define KERNEL1x1_4(xx) \ vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ addq $8, BI ;\ addq $8, %rax ;\ #define KERNEL1x1_SUB(xx) \ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ addq $2, BI ;\ addq $2, %rax ;\ /************************************************************************************************/ PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) vmovups %xmm6, 64(%rsp) vmovups %xmm7, 80(%rsp) vmovups %xmm8, 96(%rsp) vmovups %xmm9, 112(%rsp) vmovups %xmm10, 128(%rsp) vmovups %xmm11, 144(%rsp) vmovups %xmm12, 160(%rsp) vmovups %xmm13, 176(%rsp) vmovups %xmm14, 192(%rsp) vmovups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL vmovsd OLD_OFFSET, %xmm12 #endif vmovaps %xmm3, %xmm0 vmovsd OLD_ALPHA_I, %xmm1 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL vmovsd STACKSIZE + 16(%rsp), %xmm12 #endif #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $0, OLD_M je .L999 cmpq $0, OLD_N je .L999 cmpq $0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovsd %xmm0, ALPHA_R vmovsd %xmm1, ALPHA_I salq $ZBASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $2, %rdi divq %rdi // N / 2 movq %rax, Ndiv6 // N / 2 movq %rdx, Nmod6 // N % 2 #ifdef TRMMKERNEL vmovsd %xmm12, OFFSET vmovsd %xmm12, KK #ifndef LEFT negq KK #endif #endif .L2_0: movq Ndiv6, J cmpq $0, J je .L1_0 ALIGN_4 .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L2_02b: vmovups (BO1), %xmm0 vmovups 2 * SIZE(BO1), %xmm1 vmovups %xmm0, (BO) vmovups %xmm1, 2 * SIZE(BO) addq $4*SIZE,BO1 addq $4*SIZE,BO decq %rax jnz .L2_02b .L2_02c: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $8 * SIZE, AO movq M, I sarq $1, I // i = (m >> 1) je .L2_40 ALIGN_4 .L2_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of 
values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L2_16 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_12: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_16 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_16 jmp .L2_12 ALIGN_4 .L2_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_19 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_17: KERNEL2x2_SUB(xxx) jl .L2_17 ALIGN_4 .L2_19: vmovddup ALPHA_R, %xmm0 vmovddup ALPHA_I, %xmm1 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 vshufpd $0x01, %xmm11, %xmm11, %xmm11 vshufpd $0x01, %xmm13, %xmm13, %xmm13 vshufpd $0x01, %xmm15, %xmm15, %xmm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm11,%xmm10, %xmm10 vaddsubpd %xmm13,%xmm12, %xmm12 vaddsubpd %xmm15,%xmm14, %xmm14 vshufpd $0x01, %xmm8 , %xmm8, %xmm9 vshufpd $0x01, %xmm10, %xmm10, %xmm11 vshufpd $0x01, %xmm12, %xmm12, %xmm13 vshufpd $0x01, %xmm14, %xmm14, %xmm15 #else vaddsubpd %xmm8, %xmm9 ,%xmm9 vaddsubpd %xmm10, %xmm11,%xmm11 vaddsubpd %xmm12, %xmm13,%xmm13 vaddsubpd %xmm14, %xmm15,%xmm15 vmovapd %xmm9, %xmm8 vmovapd %xmm11, %xmm10 vmovapd %xmm13, %xmm12 vmovapd %xmm15, %xmm14 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 vshufpd $0x01, %xmm11, %xmm11, %xmm11 vshufpd $0x01, %xmm13, %xmm13, %xmm13 vshufpd $0x01, %xmm15, %xmm15, %xmm15 #endif // multiply with ALPHA_R vmulpd %xmm8 , %xmm0, %xmm8 vmulpd %xmm10, %xmm0, %xmm10 vmulpd %xmm12, %xmm0, %xmm12 vmulpd %xmm14, %xmm0, %xmm14 // multiply with ALPHA_I vmulpd %xmm9 , %xmm1, %xmm9 vmulpd %xmm11, %xmm1, %xmm11 vmulpd %xmm13, %xmm1, %xmm13 vmulpd %xmm15, %xmm1, %xmm15 vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm11,%xmm10, %xmm10 vaddsubpd %xmm13,%xmm12, %xmm12 vaddsubpd %xmm15,%xmm14, %xmm14 #ifndef TRMMKERNEL vaddpd (CO1), %xmm8 , %xmm8 vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 vaddpd (CO1, LDC), %xmm10, %xmm10 vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 #endif vmovups %xmm8 , (CO1) vmovups %xmm12 , 2 * SIZE(CO1) vmovups %xmm10 , (CO1, LDC) vmovups %xmm14 , 2 * SIZE(CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 
; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_40: testq $1, M jz .L2_60 // to next 2 lines of N ALIGN_4 .L2_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L2_46 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_42: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 jmp .L2_42 ALIGN_4 .L2_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_49 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_47: KERNEL1x2_SUB(xxx) jl .L2_47 ALIGN_4 .L2_49: vmovddup ALPHA_R, %xmm0 vmovddup ALPHA_I, %xmm1 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 vshufpd $0x01, %xmm11, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm11,%xmm10, %xmm10 vshufpd $0x01, %xmm8 , %xmm8, %xmm9 vshufpd $0x01, %xmm10, %xmm10, %xmm11 #else vaddsubpd %xmm8, %xmm9, %xmm9 vaddsubpd %xmm10,%xmm11, %xmm11 vmovapd %xmm9, %xmm8 vmovapd %xmm11, %xmm10 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 vshufpd $0x01, %xmm11, %xmm11, %xmm11 #endif // multiply with ALPHA_R vmulpd %xmm8 , %xmm0, %xmm8 vmulpd %xmm10, %xmm0, %xmm10 // multiply with ALPHA_I vmulpd %xmm9 , %xmm1, %xmm9 vmulpd %xmm11, %xmm1, %xmm11 vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm11,%xmm10, %xmm10 #ifndef TRMMKERNEL vaddpd (CO1), %xmm8 , %xmm8 vaddpd (CO1, LDC), %xmm10, %xmm10 #endif vmovups %xmm8 , (CO1) vmovups %xmm10 , (CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI 
// Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L2_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif decq J // j -- jg .L2_01 // next 2 lines of N .L1_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovups (BO1), %xmm0 vmovups %xmm0, (BO) addq $2*SIZE,BO1 addq $2*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $8 * SIZE, AO movq M, I sarq $1, I // i = (m >> 1) je .L1_40 ALIGN_4 .L1_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L1_16 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_12: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_16 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_16 jmp .L1_12 ALIGN_4 .L1_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_19 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_17: KERNEL2x1_SUB(xxx) jl .L1_17 ALIGN_4 .L1_19: vmovddup ALPHA_R, %xmm0 vmovddup ALPHA_I, %xmm1 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 vshufpd $0x01, %xmm13, %xmm13, %xmm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm13,%xmm12 , %xmm12 vshufpd $0x01, %xmm8 , %xmm8, %xmm9 vshufpd $0x01, %xmm12, %xmm12, %xmm13 #else vaddsubpd %xmm8, %xmm9 , %xmm9 vaddsubpd %xmm12,%xmm13, %xmm13 vmovapd %xmm9, %xmm8 vmovapd %xmm13, %xmm12 // swap high and low 
64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 vshufpd $0x01, %xmm13, %xmm13, %xmm13 #endif // multiply with ALPHA_R vmulpd %xmm8 , %xmm0, %xmm8 vmulpd %xmm12, %xmm0, %xmm12 // multiply with ALPHA_I vmulpd %xmm9 , %xmm1, %xmm9 vmulpd %xmm13, %xmm1, %xmm13 vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm13, %xmm12, %xmm12 #ifndef TRMMKERNEL vaddpd (CO1), %xmm8 , %xmm8 vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 #endif vmovups %xmm8 , (CO1) vmovups %xmm12 , 2 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L1_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_40: testq $1, M jz .L999 ALIGN_4 .L1_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L1_46 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_42: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 jmp .L1_42 ALIGN_4 .L1_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_49 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_47: KERNEL1x1_SUB(xxx) jl .L1_47 ALIGN_4 .L1_49: vmovddup ALPHA_R, %xmm0 vmovddup ALPHA_I, %xmm1 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubpd %xmm9, %xmm8, %xmm8 vshufpd $0x01, %xmm8 , %xmm8, %xmm9 #else vaddsubpd %xmm8, %xmm9, %xmm9 vmovapd %xmm9, %xmm8 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 #endif // multiply with ALPHA_R vmulpd %xmm8 , %xmm0, %xmm8 // multiply with ALPHA_I vmulpd %xmm9 , %xmm1, %xmm9 vaddsubpd %xmm9 ,%xmm8, %xmm8 
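/*
 * The vshufpd/vmulpd/vaddsubpd sequence above multiplies the accumulated
 * result by the complex scalar alpha before the C update that follows: with
 * the accumulator holding (re, im) and the swapped copy holding (im, re), the
 * final vaddsubpd yields (re*alpha_r - im*alpha_i, im*alpha_r + re*alpha_i).
 * A minimal C sketch of the same arithmetic (zscale and zdouble_t are
 * illustrative names only):
 *
 *     typedef struct { double re, im; } zdouble_t;
 *
 *     static zdouble_t zscale(zdouble_t acc, double alpha_r, double alpha_i)
 *     {
 *         zdouble_t out;
 *         out.re = acc.re * alpha_r - acc.im * alpha_i;
 *         out.im = acc.im * alpha_r + acc.re * alpha_i;
 *         return out;
 *     }
 *
 * In the non-TRMM path the scaled value is then added to the existing C entry
 * (the vaddpd against (CO1) below); the TRMM path stores it directly.
 */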
#ifndef TRMMKERNEL vaddpd (CO1), %xmm8 , %xmm8 #endif vmovups %xmm8 , (CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L999: vzeroupper movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi vmovups 64(%rsp), %xmm6 vmovups 80(%rsp), %xmm7 vmovups 96(%rsp), %xmm8 vmovups 112(%rsp), %xmm9 vmovups 128(%rsp), %xmm10 vmovups 144(%rsp), %xmm11 vmovups 160(%rsp), %xmm12 vmovups 176(%rsp), %xmm13 vmovups 192(%rsp), %xmm14 vmovups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm_kernel_2x2_sse2.S000066400000000000000000001162721313527062700216250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define POSINV 0(%rsp) #define ALPHA_R 16(%rsp) #define ALPHA_I 32(%rsp) #define J 48(%rsp) #define OFFSET 56(%rsp) #define KK 64(%rsp) #define KKK 72(%rsp) #define BUFFER 256(%rsp) #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (8 * 9 + 4) #define RPREFETCHSIZE (8 * 7 + 4) #define WPREFETCHSIZE (8 * 8 + 4) #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 5 + 4) #define RPREFETCHSIZE (8 * 7 + 4) #define WPREFETCHSIZE (8 * 8 + 4) #endif #ifndef GENERIC #define KERNEL1(xx) \ mulpd %xmm0, %xmm1 ;\ addpd %xmm1, %xmm8 ;\ movapd -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulpd %xmm0, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd -14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm0, %xmm5 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ addpd %xmm5, %xmm10 ;\ movapd -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addpd %xmm0, %xmm11 ;\ movapd -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 #define KERNEL2(xx) \ mulpd %xmm2, %xmm1 ;\ addpd %xmm1, %xmm12 ;\ movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulpd %xmm2, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm2, %xmm5 ;\ mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ addpd %xmm5, %xmm14 ;\ movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addpd %xmm2, %xmm15 ;\ movapd -6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 #define KERNEL3(xx) \ mulpd %xmm4, %xmm7 ;\ addpd %xmm7, %xmm8 ;\ movapd -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulpd %xmm4, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm4, %xmm5 ;\ mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ addpd %xmm5, %xmm10 ;\ movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addpd %xmm4, %xmm11 ;\ movapd -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 #define KERNEL4(xx) \ mulpd %xmm6, %xmm7 ;\ addpd %xmm7, %xmm12 ;\ movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulpd %xmm6, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm6, %xmm5 ;\ mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ addpd %xmm5, %xmm14 ;\ movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ addpd %xmm6, %xmm15 ;\ movapd -2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 #define KERNEL5(xx) \ mulpd %xmm0, %xmm1 ;\ addpd %xmm1, %xmm8 ;\ movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulpd %xmm0, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm0, %xmm5 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ addpd %xmm5, %xmm10 ;\ movapd 4 * SIZE + 2 * 
(xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addpd %xmm0, %xmm11 ;\ movapd 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 #define KERNEL6(xx) \ mulpd %xmm2, %xmm1 ;\ addpd %xmm1, %xmm12 ;\ movapd 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulpd %xmm2, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm2, %xmm5 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ addpd %xmm5, %xmm14 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addpd %xmm2, %xmm15 ;\ movapd 2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 #define KERNEL7(xx) \ mulpd %xmm4, %xmm7 ;\ addpd %xmm7, %xmm8 ;\ movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulpd %xmm4, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm4, %xmm5 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ addpd %xmm5, %xmm10 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addpd %xmm4, %xmm11 ;\ movapd 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 #define KERNEL8(xx) \ mulpd %xmm6, %xmm7 ;\ addpd %xmm7, %xmm12 ;\ movapd 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulpd %xmm6, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd 18 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulpd %xmm6, %xmm5 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ addpd %xmm5, %xmm14 ;\ movapd 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addpd %xmm6, %xmm15 ;\ movapd 6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 #else #define KERNEL1(xx) \ mulpd %xmm0, %xmm1 ;\ addpd %xmm1, %xmm8 ;\ movapd -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulpd %xmm0, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd -14 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm0, %xmm5 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ addpd %xmm5, %xmm10 ;\ movapd -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm0, %xmm11 ;\ movapd -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 #define KERNEL2(xx) \ mulpd %xmm2, %xmm1 ;\ addpd %xmm1, %xmm12 ;\ movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulpd %xmm2, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm2, %xmm5 ;\ mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ addpd %xmm5, %xmm14 ;\ movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm2, %xmm15 ;\ movapd -6 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 #define KERNEL3(xx) \ mulpd %xmm4, %xmm7 ;\ addpd %xmm7, %xmm8 ;\ movapd -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulpd %xmm4, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm4, %xmm5 ;\ mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ addpd %xmm5, %xmm10 ;\ movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm4, %xmm11 ;\ movapd -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 #define KERNEL4(xx) \ mulpd %xmm6, %xmm7 ;\ addpd %xmm7, %xmm12 ;\ movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulpd %xmm6, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm6, %xmm5 ;\ mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ addpd %xmm5, %xmm14 ;\ movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ addpd %xmm6, %xmm15 ;\ movapd -2 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 #define KERNEL5(xx) \ mulpd %xmm0, %xmm1 ;\ addpd %xmm1, %xmm8 ;\ movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulpd %xmm0, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm0, %xmm5 ;\ mulpd 6 * SIZE + 
2 * (xx) * SIZE(BO), %xmm0 ;\ addpd %xmm5, %xmm10 ;\ movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm0, %xmm11 ;\ movapd 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 #define KERNEL6(xx) \ mulpd %xmm2, %xmm1 ;\ addpd %xmm1, %xmm12 ;\ movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulpd %xmm2, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm2, %xmm5 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ addpd %xmm5, %xmm14 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm2, %xmm15 ;\ movapd 2 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 #define KERNEL7(xx) \ mulpd %xmm4, %xmm7 ;\ addpd %xmm7, %xmm8 ;\ movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulpd %xmm4, %xmm3 ;\ addpd %xmm3, %xmm9 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm4, %xmm5 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ addpd %xmm5, %xmm10 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm4, %xmm11 ;\ movapd 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 #define KERNEL8(xx) \ mulpd %xmm6, %xmm7 ;\ addpd %xmm7, %xmm12 ;\ movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulpd %xmm6, %xmm3 ;\ addpd %xmm3, %xmm13 ;\ movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulpd %xmm6, %xmm5 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ addpd %xmm5, %xmm14 ;\ movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm6, %xmm15 ;\ movapd 6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif movaps %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #else movq 72(%rsp), LDC #ifdef TRMMKERNEL movsd 80(%rsp), %xmm12 #endif #endif EMMS movq %rsp, %rbx # save old stack subq $256 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq OLD_M, M movq OLD_N, N pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 # Generate mask pxor %xmm10, %xmm10 movlpd %xmm0, 0 + ALPHA_R movlpd %xmm0, 8 + ALPHA_R movlpd %xmm1, 8 + ALPHA_I xorpd %xmm7, %xmm1 movlpd %xmm1, 0 + ALPHA_I movlpd %xmm10, 0 + POSINV movlpd %xmm7, 8 + POSINV #ifdef TRMMKERNEL movlpd %xmm12, OFFSET movlpd %xmm12, KK #ifndef LEFT negq KK #endif #endif subq $-16 * SIZE, A salq $ZBASE_SHIFT, LDC movq N, J sarq $1, J # j = (n >> 2) jle .L100 ALIGN_4 .L01: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif leaq 16 * SIZE + BUFFER, BO movq K, %rax sarq $2, %rax jle .L03 ALIGN_4 .L02: PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) movq 0 * SIZE(B), %mm0 movq %mm0, -16 * SIZE(BO) movq %mm0, -15 * SIZE(BO) movq 1 * SIZE(B), %mm1 movq %mm1, -14 * SIZE(BO) movq %mm1, -13 * SIZE(BO) movq 2 * SIZE(B), %mm2 movq %mm2, -12 * SIZE(BO) movq %mm2, -11 * SIZE(BO) movq 3 * SIZE(B), %mm3 movq %mm3, -10 * SIZE(BO) movq %mm3, -9 * SIZE(BO) PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO) movq 4 * SIZE(B), %mm4 movq %mm4, -8 * SIZE(BO) movq %mm4, -7 * SIZE(BO) movq 5 * SIZE(B), %mm5 movq %mm5, -6 * SIZE(BO) movq %mm5, -5 * SIZE(BO) PREFETCHW (WPREFETCHSIZE + 8) * 
SIZE(BO) movq 6 * SIZE(B), %mm6 movq %mm6, -4 * SIZE(BO) movq %mm6, -3 * SIZE(BO) movq 7 * SIZE(B), %mm7 movq %mm7, -2 * SIZE(BO) movq %mm7, -1 * SIZE(BO) PREFETCH (RPREFETCHSIZE + 8) * SIZE(B) movq 8 * SIZE(B), %mm0 movq %mm0, 0 * SIZE(BO) movq %mm0, 1 * SIZE(BO) movq 9 * SIZE(B), %mm1 movq %mm1, 2 * SIZE(BO) movq %mm1, 3 * SIZE(BO) movq 10 * SIZE(B), %mm2 movq %mm2, 4 * SIZE(BO) movq %mm2, 5 * SIZE(BO) movq 11 * SIZE(B), %mm3 movq %mm3, 6 * SIZE(BO) movq %mm3, 7 * SIZE(BO) PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) movq 12 * SIZE(B), %mm4 movq %mm4, 8 * SIZE(BO) movq %mm4, 9 * SIZE(BO) movq 13 * SIZE(B), %mm5 movq %mm5, 10 * SIZE(BO) movq %mm5, 11 * SIZE(BO) PREFETCHW (WPREFETCHSIZE + 24) * SIZE(BO) movq 14 * SIZE(B), %mm6 movq %mm6, 12 * SIZE(BO) movq %mm6, 13 * SIZE(BO) movq 15 * SIZE(B), %mm7 movq %mm7, 14 * SIZE(BO) movq %mm7, 15 * SIZE(BO) addq $ 32 * SIZE, BO subq $-16 * SIZE, B decq %rax jne .L02 ALIGN_4 .L03: movq K, %rax andq $3, %rax BRANCH jle .L05 ALIGN_4 .L04: movq 0 * SIZE(B), %mm0 movq %mm0, -16 * SIZE(BO) movq %mm0, -15 * SIZE(BO) movq 1 * SIZE(B), %mm1 movq %mm1, -14 * SIZE(BO) movq %mm1, -13 * SIZE(BO) movq 2 * SIZE(B), %mm2 movq %mm2, -12 * SIZE(BO) movq %mm2, -11 * SIZE(BO) movq 3 * SIZE(B), %mm3 movq %mm3, -10 * SIZE(BO) movq %mm3, -9 * SIZE(BO) addq $ 4 * SIZE, B addq $ 8 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L05: movq A, AO # aoffset = a leaq (RPREFETCHSIZE + 0) * SIZE(B), BB movq M, I sarq $1, I # i = (m >> 2) jle .L30 ALIGN_4 .L10: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm1 pxor %xmm8, %xmm8 PREFETCH 0 * SIZE(BB) movapd -14 * SIZE(AO), %xmm2 movapd -14 * SIZE(BO), %xmm3 pxor %xmm9, %xmm9 movapd -12 * SIZE(AO), %xmm4 movapd -12 * SIZE(BO), %xmm5 pxor %xmm10, %xmm10 movapd -10 * SIZE(AO), %xmm6 movapd -8 * SIZE(BO), %xmm7 pxor %xmm11, %xmm11 pxor %xmm12, %xmm12 PREFETCHW 3 * SIZE(CO1) pxor %xmm13, %xmm13 PREFETCHW 3 * SIZE(CO2) pxor %xmm14, %xmm14 pxor %xmm15, %xmm15 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif #ifndef GENERIC andq $-8, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO negq %rax NOBRANCH je .L15 ALIGN_3 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 
0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) addq $8 * SIZE, %rax BRANCH jl .L12 ALIGN_3 .L15: PREFETCH 8 * SIZE(BB) subq $-16 * SIZE, BB #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif testq $4, %rax je .L16 xorq %rax, %rax ALIGN_3 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addq $32 * SIZE, BO addq $16 * SIZE, AO ALIGN_3 #else sarq $2, %rax NOBRANCH jle .L16 ALIGN_3 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addq $ 32 * SIZE, BO subq $-16 * SIZE, AO decq %rax BRANCH jg .L12 #endif .L16: movapd POSINV, %xmm5 movapd ALPHA_R, %xmm6 movapd ALPHA_I, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) je .L19 leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO negq %rax ALIGN_3 .L17: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movapd -14 * SIZE(BO, %rax, 8), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movapd -12 * SIZE(BO, %rax, 8), %xmm1 mulpd %xmm0, %xmm1 mulpd -10 * SIZE(BO, %rax, 8), %xmm0 addpd %xmm1, %xmm10 movapd -16 * SIZE(BO, %rax, 8), %xmm1 addpd %xmm0, %xmm11 movapd -12 * SIZE(AO, %rax, 4), %xmm0 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm12 movapd -14 * SIZE(BO, %rax, 8), %xmm1 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm13 movapd -12 * SIZE(BO, %rax, 8), %xmm1 mulpd %xmm2, %xmm1 mulpd -10 * SIZE(BO, %rax, 8), %xmm2 addpd %xmm1, %xmm14 movapd -8 * SIZE(BO, %rax, 8), %xmm1 addpd %xmm2, %xmm15 movapd -10 * SIZE(AO, %rax, 4), %xmm2 addq $SIZE, %rax jl .L17 ALIGN_3 .L19: #ifndef TRMMKERNEL movlpd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movlpd 2 * SIZE(CO1), %xmm2 movhpd 3 * SIZE(CO1), %xmm2 movlpd 0 * SIZE(CO2), %xmm1 movhpd 1 * SIZE(CO2), %xmm1 movlpd 2 * SIZE(CO2), %xmm3 movhpd 3 * SIZE(CO2), %xmm3 #endif SHUFPD_1 %xmm9, %xmm9 SHUFPD_1 %xmm11, %xmm11 SHUFPD_1 %xmm13, %xmm13 SHUFPD_1 %xmm15, %xmm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm5, %xmm9 xorpd %xmm5, %xmm11 xorpd %xmm5, %xmm13 xorpd %xmm5, %xmm15 #else xorpd %xmm5, %xmm8 
xorpd %xmm5, %xmm10 xorpd %xmm5, %xmm12 xorpd %xmm5, %xmm14 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm9, %xmm8 subpd %xmm11, %xmm10 subpd %xmm13, %xmm12 subpd %xmm15, %xmm14 #else addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm13, %xmm12 addpd %xmm15, %xmm14 #endif pshufd $0x4e, %xmm8, %xmm9 pshufd $0x4e, %xmm10, %xmm11 pshufd $0x4e, %xmm12, %xmm13 pshufd $0x4e, %xmm14, %xmm15 mulpd %xmm6, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm6, %xmm10 mulpd %xmm7, %xmm11 mulpd %xmm6, %xmm12 mulpd %xmm7, %xmm13 mulpd %xmm6, %xmm14 mulpd %xmm7, %xmm15 addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm13, %xmm12 addpd %xmm15, %xmm14 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm2, %xmm12 addpd %xmm1, %xmm10 addpd %xmm3, %xmm14 #endif movlpd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movlpd %xmm12, 2 * SIZE(CO1) movhpd %xmm12, 3 * SIZE(CO1) movlpd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movlpd %xmm14, 2 * SIZE(CO2) movhpd %xmm14, 3 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L10 ALIGN_4 .L30: testq $1, M jle .L99 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd -8 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movapd -16 * SIZE(BO), %xmm1 pxor %xmm10, %xmm10 movapd -8 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L44 ALIGN_4 .L41: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movapd -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm9 movapd -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 mulpd -10 * SIZE(BO), %xmm0 addpd %xmm1, %xmm10 movapd 0 * SIZE(BO), %xmm1 addpd %xmm0, %xmm11 movapd -14 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm8 movapd -6 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm9 movapd -4 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 mulpd -2 * SIZE(BO), %xmm0 addpd %xmm3, %xmm10 movapd 8 * SIZE(BO), %xmm3 addpd %xmm0, %xmm11 movapd -12 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movapd 2 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) addpd %xmm1, %xmm9 movapd 4 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 mulpd 6 * SIZE(BO), %xmm0 addpd %xmm1, %xmm10 movapd 16 * SIZE(BO), %xmm1 addpd %xmm0, %xmm11 movapd -10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm8 movapd 10 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm9 movapd 12 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BO), %xmm0 addpd %xmm3, %xmm10 movapd 24 * SIZE(BO), %xmm3 addpd %xmm0, %xmm11 movapd 0 * SIZE(AO), %xmm0 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm8 movapd 18 * SIZE(BO), %xmm1 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm9 movapd 20 * 
SIZE(BO), %xmm1 mulpd %xmm2, %xmm1 mulpd 22 * SIZE(BO), %xmm2 addpd %xmm1, %xmm10 movapd 32 * SIZE(BO), %xmm1 addpd %xmm2, %xmm11 movapd -6 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm8 movapd 26 * SIZE(BO), %xmm3 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm9 movapd 28 * SIZE(BO), %xmm3 mulpd %xmm2, %xmm3 mulpd 30 * SIZE(BO), %xmm2 addpd %xmm3, %xmm10 movapd 40 * SIZE(BO), %xmm3 addpd %xmm2, %xmm11 movapd -4 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm8 movapd 34 * SIZE(BO), %xmm1 mulpd %xmm2, %xmm1 addpd %xmm1, %xmm9 movapd 36 * SIZE(BO), %xmm1 mulpd %xmm2, %xmm1 mulpd 38 * SIZE(BO), %xmm2 addpd %xmm1, %xmm10 movapd 48 * SIZE(BO), %xmm1 addpd %xmm2, %xmm11 movapd -2 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm8 movapd 42 * SIZE(BO), %xmm3 mulpd %xmm2, %xmm3 addpd %xmm3, %xmm9 movapd 44 * SIZE(BO), %xmm3 mulpd %xmm2, %xmm3 mulpd 46 * SIZE(BO), %xmm2 addpd %xmm3, %xmm10 movapd 56 * SIZE(BO), %xmm3 addpd %xmm2, %xmm11 movapd 8 * SIZE(AO), %xmm2 subq $-16 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L41 ALIGN_4 .L44: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $4, %rax BRANCH jle .L45 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movapd -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movapd -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 mulpd -10 * SIZE(BO), %xmm0 addpd %xmm1, %xmm10 movapd 0 * SIZE(BO), %xmm1 addpd %xmm0, %xmm11 movapd -14 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm8 movapd -6 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm9 movapd -4 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 mulpd -2 * SIZE(BO), %xmm0 addpd %xmm3, %xmm10 movapd 8 * SIZE(BO), %xmm3 addpd %xmm0, %xmm11 movapd -12 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movapd 2 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movapd 4 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 mulpd 6 * SIZE(BO), %xmm0 addpd %xmm1, %xmm10 movapd 16 * SIZE(BO), %xmm1 addpd %xmm0, %xmm11 movapd -10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm8 movapd 10 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 addpd %xmm3, %xmm9 movapd 12 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm3 mulpd 14 * SIZE(BO), %xmm0 addpd %xmm3, %xmm10 movapd 24 * SIZE(BO), %xmm3 addpd %xmm0, %xmm11 movapd -8 * SIZE(AO), %xmm0 addq $ 8 * SIZE, AO addq $32 * SIZE, BO ALIGN_4 .L45: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movapd POSINV, %xmm5 movapd ALPHA_R, %xmm6 movapd ALPHA_I, %xmm7 andq $3, %rax # if (k & 1) BRANCH jle .L47 ALIGN_4 .L46: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movapd -14 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 addpd %xmm1, %xmm9 movapd -12 * SIZE(BO), %xmm1 mulpd %xmm0, %xmm1 mulpd -10 * SIZE(BO), %xmm0 addpd %xmm1, %xmm10 movapd -8 * SIZE(BO), %xmm1 addpd %xmm0, %xmm11 movapd -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $8 * SIZE, BO decq %rax jg .L46 ALIGN_4 .L47: #ifndef TRMMKERNEL movlpd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movlpd 0 * SIZE(CO2), %xmm1 movhpd 1 * SIZE(CO2), %xmm1 #endif SHUFPD_1 %xmm9, %xmm9 SHUFPD_1 %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm5, %xmm9 xorpd %xmm5, %xmm11 #else xorpd %xmm5, %xmm8 xorpd %xmm5, %xmm10 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm9, %xmm8 subpd %xmm11, %xmm10 #else addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 #endif pshufd $0x4e, %xmm8, %xmm9 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm6, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm6, %xmm10 
mulpd %xmm7, %xmm11 addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm1, %xmm10 #endif movlpd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movlpd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leaq (C, LDC, 2), C # c += 2 * ldc decq J # j -- jg .L01 .L100: testq $1, N jle .L999 .L101: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $2, %rax jle .L103 ALIGN_4 .L102: movlpd 0 * SIZE(B), %xmm8 movlpd 1 * SIZE(B), %xmm9 movlpd 2 * SIZE(B), %xmm10 movlpd 3 * SIZE(B), %xmm11 movlpd 4 * SIZE(B), %xmm12 movlpd 5 * SIZE(B), %xmm13 movlpd 6 * SIZE(B), %xmm14 movlpd 7 * SIZE(B), %xmm15 movlpd %xmm8, 0 * SIZE(BO) movlpd %xmm8, 1 * SIZE(BO) movlpd %xmm9, 2 * SIZE(BO) movlpd %xmm9, 3 * SIZE(BO) movlpd %xmm10, 4 * SIZE(BO) movlpd %xmm10, 5 * SIZE(BO) movlpd %xmm11, 6 * SIZE(BO) movlpd %xmm11, 7 * SIZE(BO) movlpd %xmm12, 8 * SIZE(BO) movlpd %xmm12, 9 * SIZE(BO) movlpd %xmm13, 10 * SIZE(BO) movlpd %xmm13, 11 * SIZE(BO) movlpd %xmm14, 12 * SIZE(BO) movlpd %xmm14, 13 * SIZE(BO) movlpd %xmm15, 14 * SIZE(BO) movlpd %xmm15, 15 * SIZE(BO) subq $-16 * SIZE, BO addq $ 8 * SIZE, B decq %rax jne .L102 ALIGN_4 .L103: movq K, %rax andq $3, %rax BRANCH jle .L105 ALIGN_4 .L104: movlpd 0 * SIZE(B), %xmm8 movlpd 1 * SIZE(B), %xmm9 movlpd %xmm8, 0 * SIZE(BO) movlpd %xmm8, 1 * SIZE(BO) movlpd %xmm9, 2 * SIZE(BO) movlpd %xmm9, 3 * SIZE(BO) addq $4 * SIZE, BO addq $2 * SIZE, B decq %rax jne .L104 ALIGN_4 .L105: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a movq M, I sarq $1, I # i = (m >> 2) jle .L130 ALIGN_4 .L110: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movapd -16 * SIZE(BO), %xmm1 pxor %xmm9, %xmm9 movapd -8 * SIZE(AO), %xmm2 pxor %xmm12, %xmm12 movapd -8 * SIZE(BO), %xmm3 pxor %xmm13, %xmm13 PREFETCHW 3 * SIZE(CO1) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L112 .L111: mulpd %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd -14 * SIZE(BO), %xmm0 addpd %xmm1, %xmm8 movapd -16 * SIZE(BO), %xmm1 addpd %xmm0, %xmm9 movapd -14 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BO), %xmm0 addpd %xmm1, %xmm12 movapd -12 * SIZE(BO), %xmm1 addpd %xmm0, %xmm13 movapd -12 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd -10 * SIZE(BO), %xmm0 addpd %xmm1, %xmm8 movapd -12 * SIZE(BO), %xmm1 addpd %xmm0, %xmm9 movapd -10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd -10 * SIZE(BO), %xmm0 addpd %xmm1, %xmm12 movapd 0 * SIZE(BO), %xmm1 addpd %xmm0, %xmm13 movapd 0 * SIZE(AO), %xmm0 mulpd %xmm2, %xmm3 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd -6 * SIZE(BO), %xmm2 addpd %xmm3, %xmm8 movapd -8 * 
SIZE(BO), %xmm3 addpd %xmm2, %xmm9 movapd -6 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm3 mulpd -6 * SIZE(BO), %xmm2 addpd %xmm3, %xmm12 movapd -4 * SIZE(BO), %xmm3 addpd %xmm2, %xmm13 movapd -4 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm3 mulpd -2 * SIZE(BO), %xmm2 addpd %xmm3, %xmm8 movapd -4 * SIZE(BO), %xmm3 addpd %xmm2, %xmm9 movapd -2 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm3 mulpd -2 * SIZE(BO), %xmm2 addpd %xmm3, %xmm12 movapd 8 * SIZE(BO), %xmm3 addpd %xmm2, %xmm13 movapd 8 * SIZE(AO), %xmm2 subq $-16 * SIZE, AO subq $-16 * SIZE, BO decq %rax jne .L111 ALIGN_4 .L112: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movapd POSINV, %xmm5 movapd ALPHA_R, %xmm6 movapd ALPHA_I, %xmm7 andq $3, %rax # if (k & 1) BRANCH jle .L114 .L113: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BO), %xmm0 addpd %xmm1, %xmm8 movapd -16 * SIZE(BO), %xmm1 addpd %xmm0, %xmm9 movapd -14 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BO), %xmm0 addpd %xmm1, %xmm12 movapd -12 * SIZE(BO), %xmm1 addpd %xmm0, %xmm13 movapd -12 * SIZE(AO), %xmm0 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L113 ALIGN_4 .L114: #ifndef TRMMKERNEL movlpd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movlpd 2 * SIZE(CO1), %xmm2 movhpd 3 * SIZE(CO1), %xmm2 #endif SHUFPD_1 %xmm9, %xmm9 SHUFPD_1 %xmm13, %xmm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm5, %xmm9 xorpd %xmm5, %xmm13 #else xorpd %xmm5, %xmm8 xorpd %xmm5, %xmm12 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm9, %xmm8 subpd %xmm13, %xmm12 #else addpd %xmm9, %xmm8 addpd %xmm13, %xmm12 #endif pshufd $0x4e, %xmm8, %xmm9 pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm6, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm6, %xmm12 mulpd %xmm7, %xmm13 addpd %xmm9, %xmm8 addpd %xmm13, %xmm12 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 addpd %xmm2, %xmm12 #endif movlpd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movlpd %xmm12, 2 * SIZE(CO1) movhpd %xmm12, 3 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L110 ALIGN_4 .L130: testq $1, M jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 16 * SIZE + BUFFER, BO #else leaq 16 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm1 movapd -8 * SIZE(AO), %xmm2 movapd -8 * SIZE(BO), %xmm3 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L144 ALIGN_4 .L141: mulpd %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd -14 * SIZE(BO), %xmm0 addpd %xmm1, %xmm8 movapd -12 * SIZE(BO), %xmm1 addpd %xmm0, %xmm9 movapd -14 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd -10 * SIZE(BO), %xmm0 addpd %xmm1, %xmm10 movapd 0 * SIZE(BO), %xmm1 addpd 
%xmm0, %xmm11 movapd -12 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm3 mulpd -6 * SIZE(BO), %xmm0 addpd %xmm3, %xmm8 movapd -4 * SIZE(BO), %xmm3 addpd %xmm0, %xmm9 movapd -10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm3 mulpd -2 * SIZE(BO), %xmm0 addpd %xmm3, %xmm10 movapd 8 * SIZE(BO), %xmm3 addpd %xmm0, %xmm11 movapd 0 * SIZE(AO), %xmm0 mulpd %xmm2, %xmm1 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd 2 * SIZE(BO), %xmm2 addpd %xmm1, %xmm8 movapd 4 * SIZE(BO), %xmm1 addpd %xmm2, %xmm9 movapd -6 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm1 mulpd 6 * SIZE(BO), %xmm2 addpd %xmm1, %xmm10 movapd 16 * SIZE(BO), %xmm1 addpd %xmm2, %xmm11 movapd -4 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm3 mulpd 10 * SIZE(BO), %xmm2 addpd %xmm3, %xmm8 movapd 12 * SIZE(BO), %xmm3 addpd %xmm2, %xmm9 movapd -2 * SIZE(AO), %xmm2 mulpd %xmm2, %xmm3 mulpd 14 * SIZE(BO), %xmm2 addpd %xmm3, %xmm10 movapd 24 * SIZE(BO), %xmm3 addpd %xmm2, %xmm11 movapd 8 * SIZE(AO), %xmm2 subq $-16 * SIZE, AO subq $-32 * SIZE, BO decq %rax jne .L141 ALIGN_4 .L144: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $4, %rax # if (k & 1) BRANCH jle .L145 mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BO), %xmm0 addpd %xmm1, %xmm8 movapd -12 * SIZE(BO), %xmm1 addpd %xmm0, %xmm9 movapd -14 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm1 mulpd -10 * SIZE(BO), %xmm0 addpd %xmm1, %xmm10 movapd 0 * SIZE(BO), %xmm1 addpd %xmm0, %xmm11 movapd -12 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm3 mulpd -6 * SIZE(BO), %xmm0 addpd %xmm3, %xmm8 movapd -4 * SIZE(BO), %xmm3 addpd %xmm0, %xmm9 movapd -10 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm3 mulpd -2 * SIZE(BO), %xmm0 addpd %xmm3, %xmm10 addpd %xmm0, %xmm11 movapd -8 * SIZE(AO), %xmm0 addq $8 * SIZE, AO subq $-16 * SIZE, BO ALIGN_4 .L145: movapd POSINV, %xmm5 movapd ALPHA_R, %xmm6 movapd ALPHA_I, %xmm7 #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH jle .L148 ALIGN_4 .L146: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BO), %xmm0 addpd %xmm1, %xmm8 movapd -12 * SIZE(BO), %xmm1 addpd %xmm0, %xmm9 movapd -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L146 ALIGN_4 .L148: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #ifndef TRMMKERNEL movlpd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 #endif SHUFPD_1 %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm5, %xmm9 #else xorpd %xmm5, %xmm8 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm9, %xmm8 #else addpd %xmm9, %xmm8 #endif pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm6, %xmm8 mulpd %xmm7, %xmm9 addpd %xmm9, %xmm8 #ifndef TRMMKERNEL addpd %xmm0, %xmm8 #endif movlpd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) ALIGN_4 .L999: movq %rbx, %rsp EMMS movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm_kernel_2x2_sse3.S000066400000000000000000001053431313527062700216230ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of 
Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %r13 #define BO %r14 #define CO1 %r15 #define CO2 %rbx #define BB %rbp #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define ALPHA_R 48(%rsp) #define ALPHA_I 56(%rsp) #define OFFSET 64(%rsp) #define KKK 72(%rsp) #define KK 80(%rsp) #else #define STACKSIZE 512 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define ALPHA_R 224(%rsp) #define ALPHA_I 232(%rsp) #define OFFSET 240(%rsp) #define KKK 248(%rsp) #define KK 256(%rsp) #endif #define PREFETCH prefetcht1 #define PREFETCHSIZE (16 * 12 + 3) #define PREFETCH_R (4 * 4 + 0) #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADD1 addpd #define ADD2 addpd #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define ADD1 addpd #define ADD2 subpd #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADD1 subpd #define ADD2 addpd #else #define ADD1 subpd #define ADD2 subpd #endif #define ADDSUB subpd #define KERNEL1(address) \ mulpd %xmm8, %xmm9;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ ADD1 %xmm9, %xmm0;\ movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD2 %xmm9, %xmm1;\ movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm2;\ movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 2 * SIZE + (address) * 2 * 
SIZE(AO), %xmm8;\ ADD2 %xmm9, %xmm3;\ movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL2(address) \ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm4;\ movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD2 %xmm9, %xmm5;\ movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm6;\ movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ ADD2 %xmm9, %xmm7;\ movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL3(address) \ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm0;\ movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD2 %xmm9, %xmm1;\ movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm2;\ movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ ADD2 %xmm9, %xmm3;\ movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL4(address) \ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm4;\ movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD2 %xmm9, %xmm5;\ movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm6;\ movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ ADD2 %xmm9, %xmm7;\ movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL5(address) \ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm0;\ movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD2 %xmm11, %xmm1;\ movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm2;\ movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ ADD2 %xmm11, %xmm3;\ movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL6(address) \ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm4;\ movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD2 %xmm11, %xmm5;\ movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm6;\ movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ ADD2 %xmm11, %xmm7;\ movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL7(address) \ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm0;\ movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD2 %xmm11, %xmm1;\ movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm2;\ movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ ADD2 %xmm11, %xmm3;\ movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL8(address) \ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm4;\ movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD2 %xmm11, %xmm5;\ movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm6;\ movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ ADD2 %xmm11, %xmm7;\ movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL9(address) \ mulpd %xmm12, %xmm13;\ PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ ADD1 %xmm13, %xmm0;\ movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, 
%xmm13;\ ADD2 %xmm13, %xmm1;\ movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm2;\ movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ ADD2 %xmm13, %xmm3;\ movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL10(address) \ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm4;\ movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD2 %xmm13, %xmm5;\ movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm6;\ movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ ADD2 %xmm13, %xmm7;\ movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL11(address) \ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm0;\ movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD2 %xmm13, %xmm1;\ movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm2;\ movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ ADD2 %xmm13, %xmm3;\ movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL12(address) \ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm4;\ movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD2 %xmm13, %xmm5;\ movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm6;\ movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ ADD2 %xmm13, %xmm7;\ movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL13(address) \ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm0;\ movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD2 %xmm15, %xmm1;\ movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm2;\ movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ ADD2 %xmm15, %xmm3;\ movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL14(address) \ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm4;\ movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD2 %xmm15, %xmm5;\ movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm6;\ movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ ADD2 %xmm15, %xmm7;\ movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL15(address) \ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm0;\ movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD2 %xmm15, %xmm1;\ movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm2;\ movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ ADD2 %xmm15, %xmm3;\ movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL16(address) \ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm4;\ movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD2 %xmm15, %xmm5;\ movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm6;\ movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 56 * 
SIZE + (address) * 2 * SIZE(AO), %xmm14;\ ADD2 %xmm15, %xmm7;\ movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm4 #endif movaps %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm4 #endif #endif movsd %xmm0, ALPHA_R movsd %xmm1, ALPHA_I #ifdef TRMMKERNEL movsd %xmm4, OFFSET movsd %xmm4, KK #ifndef LEFT negq KK #endif #endif salq $ZBASE_SHIFT, LDC movq N, J sarq $1, J # j = (n >> 2) jle .L100 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a movq K, %rax salq $ZBASE_SHIFT + 1, %rax leaq (B, %rax), BB movq M, I sarq $1, I # i = (m >> 2) jle .L30 ALIGN_4 .L10: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movapd 16 * SIZE(AO), %xmm12 pxor %xmm4, %xmm4 movddup 16 * SIZE(BO), %xmm13 pxor %xmm5, %xmm5 movapd 24 * SIZE(AO), %xmm14 pxor %xmm6, %xmm6 movddup 24 * SIZE(BO), %xmm15 pxor %xmm7, %xmm7 prefetchnta 3 * SIZE(CO1) prefetchnta 3 * SIZE(CO2) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif andq $-8, %rax salq $4, %rax je .L12 .L1X: KERNEL1 (16 * 0) KERNEL2 (16 * 0) KERNEL3 (16 * 0) KERNEL4 (16 * 0) KERNEL5 (16 * 0) KERNEL6 (16 * 0) KERNEL7 (16 * 0) KERNEL8 (16 * 0) KERNEL9 (16 * 0) KERNEL10(16 * 0) KERNEL11(16 * 0) KERNEL12(16 * 0) KERNEL13(16 * 0) KERNEL14(16 * 0) KERNEL15(16 * 0) KERNEL16(16 * 0) cmpq $128 * 1, %rax NOBRANCH jle .L11 KERNEL1 (16 * 1) KERNEL2 (16 * 1) KERNEL3 (16 * 1) KERNEL4 (16 * 1) KERNEL5 (16 * 1) KERNEL6 (16 * 1) KERNEL7 (16 * 1) KERNEL8 (16 * 1) KERNEL9 (16 * 1) KERNEL10(16 * 1) KERNEL11(16 * 1) KERNEL12(16 * 1) KERNEL13(16 * 1) KERNEL14(16 * 1) KERNEL15(16 * 1) KERNEL16(16 * 1) cmpq $128 * 2, %rax NOBRANCH jle .L11 KERNEL1 (16 * 2) KERNEL2 (16 * 2) KERNEL3 (16 * 2) KERNEL4 (16 * 2) KERNEL5 (16 * 2) KERNEL6 (16 * 2) KERNEL7 (16 * 2) KERNEL8 (16 * 2) KERNEL9 (16 * 2) KERNEL10(16 * 2) KERNEL11(16 * 2) KERNEL12(16 * 2) KERNEL13(16 * 2) KERNEL14(16 * 2) KERNEL15(16 * 2) KERNEL16(16 * 2) cmpq $128 * 3, %rax NOBRANCH jle .L11 KERNEL1 (16 * 3) KERNEL2 (16 * 3) KERNEL3 (16 * 3) KERNEL4 (16 * 3) KERNEL5 (16 * 3) KERNEL6 (16 * 3) KERNEL7 (16 * 3) KERNEL8 (16 * 3) KERNEL9 (16 * 3) KERNEL10(16 * 3) KERNEL11(16 * 3) KERNEL12(16 * 3) KERNEL13(16 * 3) KERNEL14(16 * 3) 
KERNEL15(16 * 3) KERNEL16(16 * 3) cmpq $128 * 4, %rax NOBRANCH jle .L11 KERNEL1 (16 * 4) KERNEL2 (16 * 4) KERNEL3 (16 * 4) KERNEL4 (16 * 4) KERNEL5 (16 * 4) KERNEL6 (16 * 4) KERNEL7 (16 * 4) KERNEL8 (16 * 4) KERNEL9 (16 * 4) KERNEL10(16 * 4) KERNEL11(16 * 4) KERNEL12(16 * 4) KERNEL13(16 * 4) KERNEL14(16 * 4) KERNEL15(16 * 4) KERNEL16(16 * 4) cmpq $128 * 5, %rax NOBRANCH jle .L11 KERNEL1 (16 * 5) KERNEL2 (16 * 5) KERNEL3 (16 * 5) KERNEL4 (16 * 5) KERNEL5 (16 * 5) KERNEL6 (16 * 5) KERNEL7 (16 * 5) KERNEL8 (16 * 5) KERNEL9 (16 * 5) KERNEL10(16 * 5) KERNEL11(16 * 5) KERNEL12(16 * 5) KERNEL13(16 * 5) KERNEL14(16 * 5) KERNEL15(16 * 5) KERNEL16(16 * 5) cmpq $128 * 6, %rax NOBRANCH jle .L11 KERNEL1 (16 * 6) KERNEL2 (16 * 6) KERNEL3 (16 * 6) KERNEL4 (16 * 6) KERNEL5 (16 * 6) KERNEL6 (16 * 6) KERNEL7 (16 * 6) KERNEL8 (16 * 6) KERNEL9 (16 * 6) KERNEL10(16 * 6) KERNEL11(16 * 6) KERNEL12(16 * 6) KERNEL13(16 * 6) KERNEL14(16 * 6) KERNEL15(16 * 6) KERNEL16(16 * 6) cmpq $128 * 7, %rax NOBRANCH jle .L11 KERNEL1 (16 * 7) KERNEL2 (16 * 7) KERNEL3 (16 * 7) KERNEL4 (16 * 7) KERNEL5 (16 * 7) KERNEL6 (16 * 7) KERNEL7 (16 * 7) KERNEL8 (16 * 7) KERNEL9 (16 * 7) KERNEL10(16 * 7) KERNEL11(16 * 7) KERNEL12(16 * 7) KERNEL13(16 * 7) KERNEL14(16 * 7) KERNEL15(16 * 7) KERNEL16(16 * 7) addq $32 * 8 * SIZE, AO addq $32 * 8 * SIZE, BO subq $128 * 8, %rax jg .L1X .L11: leaq (AO, %rax, 2), AO # * 16 leaq (BO, %rax, 2), BO # * 64 ALIGN_4 .L12: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movddup ALPHA_R, %xmm14 movddup ALPHA_I, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L14 ALIGN_4 .L13: mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm10 ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 0 * SIZE(BO), %xmm11 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm4 movddup 1 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD2 %xmm11, %xmm5 movddup 2 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm6 movddup 3 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD2 %xmm11, %xmm7 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L13 ALIGN_4 .L14: SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm3, %xmm3 SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) addsubpd %xmm1, %xmm0 addsubpd %xmm3, %xmm2 addsubpd %xmm5, %xmm4 addsubpd %xmm7, %xmm6 movapd %xmm0, %xmm1 movapd %xmm2, %xmm3 movapd %xmm4, %xmm5 movapd %xmm6, %xmm7 #else addsubpd %xmm0, %xmm1 addsubpd %xmm2, %xmm3 addsubpd %xmm4, %xmm5 addsubpd %xmm6, %xmm7 movapd %xmm1, %xmm0 movapd %xmm3, %xmm2 movapd %xmm5, %xmm4 movapd %xmm7, %xmm6 #endif #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhpd 1 * SIZE(CO1), %xmm8 movsd 2 * SIZE(CO1), %xmm10 movhpd 3 * SIZE(CO1), %xmm10 movsd 0 * SIZE(CO2), %xmm9 movhpd 1 * SIZE(CO2), %xmm9 movsd 2 * SIZE(CO2), %xmm11 movhpd 3 * SIZE(CO2), %xmm11 #endif SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm3, %xmm3 SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 mulpd %xmm14, %xmm0 mulpd %xmm14, %xmm2 mulpd %xmm14, %xmm4 mulpd %xmm14, %xmm6 mulpd %xmm15, %xmm1 mulpd %xmm15, %xmm3 mulpd %xmm15, %xmm5 mulpd %xmm15, %xmm7 addsubpd %xmm1, %xmm0 addsubpd %xmm3, %xmm2 addsubpd %xmm5, %xmm4 addsubpd %xmm7, %xmm6 #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm8, %xmm0 addpd %xmm9, %xmm2 addpd %xmm10, %xmm4 addpd %xmm11, %xmm6 #endif movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm4, 2 * SIZE(CO1) movhpd %xmm4, 3 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) movsd %xmm6, 2 * SIZE(CO2) movhpd %xmm6, 3 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L10 ALIGN_4 .L30: testq $1, M jle .L99 .L40: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L42 .L41: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD2 %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD2 %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 6 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD2 %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 16 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm3 movddup 24 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm0 movddup 17 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD2 %xmm9, %xmm1 movddup 18 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm2 movddup 19 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 10 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm3 movddup 20 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm0 movddup 21 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD2 %xmm9, %xmm1 movddup 22 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm2 movddup 23 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 12 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm3 movddup 32 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm0 movddup 25 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD2 %xmm11, %xmm1 movddup 26 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm2 movddup 27 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * 
SIZE(AO), %xmm10 ADD2 %xmm11, %xmm3 movddup 28 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm0 movddup 29 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD2 %xmm11, %xmm1 movddup 30 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm2 movddup 31 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 24 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm3 movddup 40 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L41 .L42: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movddup ALPHA_R, %xmm14 movddup ALPHA_I, %xmm15 andq $7, %rax # if (k & 1) BRANCH jle .L44 .L43: mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L43 ALIGN_4 .L44: SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) addsubpd %xmm1, %xmm0 addsubpd %xmm3, %xmm2 movapd %xmm0, %xmm1 movapd %xmm2, %xmm3 #else addsubpd %xmm0, %xmm1 addsubpd %xmm2, %xmm3 movapd %xmm1, %xmm0 movapd %xmm3, %xmm2 #endif #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhpd 1 * SIZE(CO1), %xmm8 movsd 0 * SIZE(CO2), %xmm9 movhpd 1 * SIZE(CO2), %xmm9 #endif SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm3, %xmm3 mulpd %xmm14, %xmm0 mulpd %xmm14, %xmm2 mulpd %xmm15, %xmm1 mulpd %xmm15, %xmm3 addsubpd %xmm1, %xmm0 addsubpd %xmm3, %xmm2 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm8, %xmm0 addpd %xmm9, %xmm2 #endif movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhpd %xmm2, 1 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif leaq (C, LDC, 2), C # c += 2 * ldc movq BO, B decq J # j -- jg .L01 .L100: testq $1, N jle .L999 .L101: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 # coffset1 = c movq A, AO # aoffset = a movq M, I sarq $1, I # i = (m >> 2) jle .L130 ALIGN_4 .L110: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm4, %xmm4 movddup 8 * SIZE(BO), %xmm11 pxor %xmm5, %xmm5 prefetchnta 4 * SIZE(CO1) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L112 .L111: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm1 movddup 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm4 movddup 1 * SIZE(BO), %xmm9 mulpd 
%xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm5 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm0 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm4 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 16 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm5 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 10 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm1 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm4 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 12 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm5 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm0 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 14 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm4 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 40 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm5 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) ADD1 %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 18 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm1 movddup 8 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm4 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 20 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm5 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm0 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 22 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm4 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 24 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm5 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 26 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm1 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm4 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 28 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm5 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm0 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 30 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm4 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 32 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm5 movddup 24 * SIZE(BO), %xmm11 addq $32 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L111 ALIGN_4 .L112: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movddup ALPHA_R, %xmm14 movddup ALPHA_I, %xmm15 andq $7, %rax # if (k & 1) BRANCH jle .L114 .L113: mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm10 ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 0 * SIZE(BO), %xmm11 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 movapd 4 * SIZE(AO), %xmm8 ADD1 %xmm11, %xmm4 movddup 1 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD2 %xmm11, %xmm5 addq $4 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L113 ALIGN_4 .L114: SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm5, %xmm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) addsubpd %xmm1, %xmm0 addsubpd %xmm5, %xmm4 movapd %xmm0, %xmm1 movapd %xmm4, %xmm5 #else addsubpd %xmm0, %xmm1 addsubpd %xmm4, %xmm5 movapd %xmm1, %xmm0 movapd %xmm5, %xmm4 #endif #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhpd 1 * SIZE(CO1), %xmm8 movsd 2 * SIZE(CO1), %xmm10 movhpd 3 * SIZE(CO1), %xmm10 #endif SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm5, %xmm5 mulpd %xmm14, %xmm0 mulpd %xmm14, %xmm4 mulpd %xmm15, %xmm1 mulpd %xmm15, %xmm5 addsubpd %xmm1, %xmm0 addsubpd %xmm5, %xmm4 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm8, %xmm0 addpd %xmm10, %xmm4 #endif movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) movsd %xmm4, 2 * SIZE(CO1) movhpd %xmm4, 3 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L110 ALIGN_4 .L130: testq $1, M jle .L999 ALIGN_4 .L140: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq KK, %rax leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L142 .L141: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 16 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 10 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 12 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 24 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm3 movddup 24 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L141 .L142: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movddup ALPHA_R, %xmm14 movddup ALPHA_I, %xmm15 andq $7, %rax # if (k & 1) BRANCH jle .L144 .L143: mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L143 ALIGN_4 .L144: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 SHUFPD_1 %xmm1, %xmm1 #if 
defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) addsubpd %xmm1, %xmm0 movapd %xmm0, %xmm1 #else addsubpd %xmm0, %xmm1 movapd %xmm1, %xmm0 #endif #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhpd 1 * SIZE(CO1), %xmm8 #endif SHUFPD_1 %xmm1, %xmm1 mulpd %xmm14, %xmm0 mulpd %xmm15, %xmm1 addsubpd %xmm1, %xmm0 #if! defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm8, %xmm0 #endif movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm_kernel_2x4_nehalem.S000066400000000000000000000746661313527062700223760ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %rbp #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rdx #define BB %r12 #define PREA %r10 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define ALPHA_R 48(%rsp) #define ALPHA_I 56(%rsp) #define J 64(%rsp) #define OFFSET 72(%rsp) #define KK 80(%rsp) #define KKK 88(%rsp) #else #define STACKSIZE 512 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define ALPHA_R 224(%rsp) #define ALPHA_I 232(%rsp) #define J 240(%rsp) #define OFFSET 248(%rsp) #define KK 256(%rsp) #define KKK 264(%rsp) #endif #define PREFETCHSIZE 8 #define PREFETCH prefetcht0 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADD1 addps #define ADD2 addps #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define ADD1 addps #define ADD2 addps #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADD1 addps #define ADD2 addps #else #define ADD1 addps #define ADD2 subps #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif movaps %xmm3, %xmm0 movss OLD_ALPHA_I, %xmm1 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif #endif unpcklps %xmm0, %xmm0 unpcklps %xmm1, %xmm1 movlps %xmm0, ALPHA_R movlps %xmm1, ALPHA_I subq $-32 * SIZE, A subq $-32 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K salq $ZBASE_SHIFT, LDC #ifdef TRMMKERNEL movq %r11, OFFSET #ifndef LEFT negq %r11 #endif movq %r11, KK #endif movq N, J sarq $2, J NOBRANCH jle .L30 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC, 2), CO2 movq A, AO movq K, %rax salq $ZBASE_SHIFT + 2, %rax leaq (B, %rax), BB movq M, I sarq $1, I NOBRANCH jle .L20 ALIGN_4 .L11: prefetcht2 -32 * SIZE(BB) subq $-16 * SIZE, BB #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht0 1 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 3 * SIZE(CO1, LDC, 1) xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 movaps -32 * SIZE(AO), %xmm0 xorps %xmm12, %xmm12 prefetcht0 1 * SIZE(CO2) xorps %xmm13, %xmm13 prefetcht0 3 * SIZE(CO2, LDC, 1) xorps %xmm14, %xmm14 xorps %xmm15, %xmm15 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm12 movaps -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pshufd $0xb1, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 ADD1 %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD1 %xmm5, %xmm10 ADD2 %xmm6, %xmm11 pshufd $0xb1, %xmm3, %xmm4 movaps -28 * SIZE(AO), %xmm7 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 ADD1 %xmm1, %xmm12 movaps -24 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm7, %xmm1 pshufd $0x1b, %xmm2, %xmm5 mulps %xmm7, %xmm2 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pshufd $0xb1, %xmm5, %xmm6 mulps %xmm7, %xmm5 mulps %xmm7, %xmm6 ADD1 %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm7, %xmm1 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm7, %xmm2 ADD1 %xmm5, %xmm10 ADD2 %xmm6, %xmm11 pshufd $0xb1, %xmm3, %xmm4 movaps -24 * SIZE(AO), %xmm0 mulps %xmm7, %xmm3 mulps %xmm7, %xmm4 ADD1 %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pshufd $0xb1, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 ADD1 %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD1 %xmm5, %xmm10 ADD2 %xmm6, %xmm11 pshufd $0xb1, %xmm3, %xmm4 movaps -20 * SIZE(AO), %xmm7 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 ADD1 %xmm1, %xmm12 movaps -8 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm7, %xmm1 pshufd $0x1b, %xmm2, %xmm5 mulps %xmm7, %xmm2 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pshufd $0xb1, %xmm5, %xmm6 mulps %xmm7, %xmm5 mulps %xmm7, %xmm6 ADD1 %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 subq $-32 * SIZE, BO pshufd $0xb1, %xmm1, %xmm2 mulps %xmm7, %xmm1 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm7, %xmm2 ADD1 %xmm5, %xmm10 ADD2 %xmm6, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm7, %xmm3 movaps -16 * SIZE(AO), %xmm0 mulps %xmm7, %xmm4 subq $-16 * SIZE, AO subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: ADD1 %xmm1, %xmm12 movaps -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pshufd $0xb1, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 ADD1 %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD1 %xmm5, %xmm10 ADD2 %xmm6, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: ADD1 %xmm1, %xmm12 ADD2 %xmm2, %xmm13 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 movddup ALPHA_R, %xmm2 movddup ALPHA_I, %xmm3 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || 
defined(CC) pxor %xmm0, %xmm8 pxor %xmm0, %xmm10 pxor %xmm0, %xmm12 pxor %xmm0, %xmm14 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) pshufd $0xb1, %xmm0, %xmm0 pxor %xmm0, %xmm9 pxor %xmm0, %xmm11 pxor %xmm0, %xmm13 pxor %xmm0, %xmm15 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) pxor %xmm0, %xmm9 pxor %xmm0, %xmm11 pxor %xmm0, %xmm13 pxor %xmm0, %xmm15 #endif haddps %xmm9, %xmm8 haddps %xmm11, %xmm10 haddps %xmm13, %xmm12 haddps %xmm15, %xmm14 shufps $0xd8, %xmm8, %xmm8 shufps $0xd8, %xmm10, %xmm10 shufps $0xd8, %xmm12, %xmm12 shufps $0xd8, %xmm14, %xmm14 movaps %xmm8, %xmm9 shufps $0xe4, %xmm10, %xmm8 shufps $0xe4, %xmm9, %xmm10 movaps %xmm12, %xmm13 shufps $0xe4, %xmm14, %xmm12 shufps $0xe4, %xmm13, %xmm14 pshufd $0xb1, %xmm8, %xmm9 pshufd $0xb1, %xmm10, %xmm11 pshufd $0xb1, %xmm12, %xmm13 pshufd $0xb1, %xmm14, %xmm15 mulps %xmm2, %xmm8 mulps %xmm3, %xmm9 mulps %xmm2, %xmm12 mulps %xmm3, %xmm13 mulps %xmm2, %xmm10 mulps %xmm3, %xmm11 mulps %xmm2, %xmm14 mulps %xmm3, %xmm15 addsubps %xmm9, %xmm8 addsubps %xmm11, %xmm10 addsubps %xmm13, %xmm12 addsubps %xmm15, %xmm14 #ifndef TRMMKERNEL movups 0 * SIZE(CO1), %xmm0 movups 0 * SIZE(CO1, LDC), %xmm1 movups 0 * SIZE(CO2), %xmm2 movups 0 * SIZE(CO2, LDC), %xmm3 addps %xmm0, %xmm8 addps %xmm1, %xmm10 addps %xmm2, %xmm12 addps %xmm3, %xmm14 #endif movups %xmm8, 0 * SIZE(CO1) movups %xmm10, 0 * SIZE(CO1, LDC) movups %xmm12, 0 * SIZE(CO2) movups %xmm14, 0 * SIZE(CO2, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 decq I # i -- BRANCH jg .L11 ALIGN_4 .L20: testq $1, M BRANCH jle .L29 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movaps -32 * SIZE(BO), %xmm5 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $4, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_3 .L22: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm10 pshufd $0xa0, %xmm5, %xmm3 mulps %xmm0, %xmm3 ADD2 %xmm4, %xmm11 pshufd $0xf5, %xmm5, %xmm4 movaps -24 * SIZE(BO), %xmm5 mulps %xmm0, %xmm4 movddup -30 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm10 pshufd $0xa0, %xmm5, %xmm3 mulps %xmm0, %xmm3 ADD2 %xmm4, %xmm11 pshufd $0xf5, %xmm5, %xmm4 movaps -16 * SIZE(BO), %xmm5 mulps %xmm0, %xmm4 movddup -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -12 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm10 pshufd $0xa0, %xmm5, %xmm3 
mulps %xmm0, %xmm3 ADD2 %xmm4, %xmm11 pshufd $0xf5, %xmm5, %xmm4 movaps -8 * SIZE(BO), %xmm5 mulps %xmm0, %xmm4 movddup -26 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -4 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm10 pshufd $0xa0, %xmm5, %xmm3 mulps %xmm0, %xmm3 ADD2 %xmm4, %xmm11 pshufd $0xf5, %xmm5, %xmm4 movaps 0 * SIZE(BO), %xmm5 mulps %xmm0, %xmm4 movddup -24 * SIZE(AO), %xmm0 subq $-32 * SIZE, BO subq $ -8 * SIZE, AO subq $1, %rax BRANCH jg .L22 ALIGN_3 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm10 pshufd $0xa0, %xmm5, %xmm3 mulps %xmm0, %xmm3 ADD2 %xmm4, %xmm11 pshufd $0xf5, %xmm5, %xmm4 movaps -24 * SIZE(BO), %xmm5 mulps %xmm0, %xmm4 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_3 .L28: ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 ADD1 %xmm3, %xmm10 ADD2 %xmm4, %xmm11 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 movddup ALPHA_R, %xmm2 movddup ALPHA_I, %xmm3 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) pxor %xmm0, %xmm9 pxor %xmm0, %xmm11 shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 pxor %xmm0, %xmm9 pxor %xmm0, %xmm11 #else pxor %xmm0, %xmm8 pxor %xmm0, %xmm10 shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 #endif addps %xmm9, %xmm8 addps %xmm11, %xmm10 pshufd $0xb1, %xmm8, %xmm9 pshufd $0xb1, %xmm10, %xmm11 mulps %xmm2, %xmm8 mulps %xmm3, %xmm9 mulps %xmm2, %xmm10 mulps %xmm3, %xmm11 addsubps %xmm9, %xmm8 addsubps %xmm11, %xmm10 #ifndef TRMMKERNEL movsd (CO1), %xmm0 movhps (CO1, LDC), %xmm0 movsd (CO2), %xmm1 movhps (CO2, LDC), %xmm1 addps %xmm0, %xmm8 addps %xmm1, %xmm10 #endif movsd %xmm8, (CO1) movhps %xmm8, (CO1, LDC) movsd %xmm10, (CO2) movhps %xmm10, (CO2, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK #endif leaq (C, LDC, 4), C movq BO, B subq $1, J BRANCH jg .L01 ALIGN_4 .L30: testq $2, N BRANCH jle .L50 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC), CO2 movq A, AO movq M, I sarq $1, I NOBRANCH jle .L40 ALIGN_4 .L31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht2 4 * SIZE(CO2) xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq 
$2, %rax NOBRANCH jle .L35 ALIGN_3 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm10 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm10 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm10 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm10 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $-16 * SIZE, AO subq $1, %rax BRANCH jg .L32 ALIGN_3 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: ADD1 %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm10 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_3 .L38: ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 ADD1 %xmm3, %xmm10 ADD2 %xmm4, %xmm11 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 movddup ALPHA_R, %xmm2 movddup ALPHA_I, %xmm3 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) pxor %xmm0, %xmm8 pxor %xmm0, %xmm10 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) pshufd $0xb1, %xmm0, %xmm0 pxor %xmm0, %xmm9 pxor %xmm0, %xmm11 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) pxor %xmm0, %xmm9 pxor %xmm0, %xmm11 #endif haddps %xmm9, %xmm8 haddps %xmm11, %xmm10 shufps $0xd8, %xmm8, %xmm8 shufps $0xd8, %xmm10, %xmm10 movaps %xmm8, %xmm9 shufps $0xe4, %xmm10, %xmm8 shufps $0xe4, %xmm9, %xmm10 pshufd $0xb1, %xmm8, %xmm9 pshufd $0xb1, %xmm10, %xmm11 mulps %xmm2, %xmm8 mulps %xmm3, %xmm9 mulps %xmm2, %xmm10 mulps %xmm3, %xmm11 addsubps %xmm9, %xmm8 addsubps %xmm11, %xmm10 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm1 movhps 2 * SIZE(CO2), %xmm1 addps %xmm0, %xmm8 addps %xmm1, %xmm10 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 decq I # i -- BRANCH jg .L31 ALIGN_4 .L40: testq $1, M BRANCH jle .L49 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movaps -32 * SIZE(BO), %xmm5 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L45 ALIGN_3 .L42: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -24 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -26 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -16 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -24 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $ -8 * SIZE, AO subq $1, %rax BRANCH jg .L42 ALIGN_3 .L45: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L46 ALIGN_3 .L48: ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 movddup ALPHA_R, %xmm2 movddup ALPHA_I, %xmm3 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) pxor %xmm0, %xmm9 shufps $0xb1, %xmm9, %xmm9 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm9, %xmm9 pxor %xmm0, %xmm9 #else pxor %xmm0, %xmm8 shufps $0xb1, %xmm9, %xmm9 #endif addps %xmm9, %xmm8 pshufd $0xb1, %xmm8, %xmm9 mulps %xmm2, %xmm8 mulps %xmm3, %xmm9 addsubps %xmm9, %xmm8 #ifndef TRMMKERNEL movsd (CO1), %xmm0 movhps (CO2), %xmm0 addps %xmm0, %xmm8 #endif movsd %xmm8, (CO1) movhps %xmm8, (CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif leaq (C, LDC, 2), C movq BO, B ALIGN_4 .L50: testq $1, N BRANCH jle .L999 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 movq A, AO movq M, I sarq $1, I NOBRANCH jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps 
%xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_3 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 movddup -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movddup -30 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 mulps %xmm0, %xmm2 movaps -24 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movddup -28 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 mulps %xmm0, %xmm2 movaps -20 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movddup -26 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 mulps %xmm0, %xmm2 movaps -16 * SIZE(AO), %xmm0 subq $ -8 * SIZE, BO subq $-16 * SIZE, AO subq $1, %rax BRANCH jg .L52 ALIGN_3 .L55: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_3 .L56: ADD1 %xmm1, %xmm8 movddup -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_3 .L58: ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 movddup ALPHA_R, %xmm2 movddup ALPHA_I, %xmm3 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) pxor %xmm0, %xmm8 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) pshufd $0xb1, %xmm0, %xmm0 pxor %xmm0, %xmm9 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) pxor %xmm0, %xmm9 #endif haddps %xmm9, %xmm8 shufps $0xd8, %xmm8, %xmm8 pshufd $0xb1, %xmm8, %xmm9 mulps %xmm2, %xmm8 mulps %xmm3, %xmm9 addsubps %xmm9, %xmm8 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 addps %xmm0, %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 decq I # i -- BRANCH jg .L51 ALIGN_4 .L60: testq $1, M BRANCH jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd -32 * SIZE(BO), %xmm5 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_3 .L62: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movsd -30 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, 
%xmm2 movsd -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movsd -26 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -26 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movsd -24 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -24 * SIZE(AO), %xmm0 subq $-8 * SIZE, BO subq $-8 * SIZE, AO subq $1, %rax BRANCH jg .L62 ALIGN_3 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_3 .L66: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movsd -30 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_3 .L68: ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 movddup ALPHA_R, %xmm2 movddup ALPHA_I, %xmm3 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) pxor %xmm0, %xmm9 shufps $0xb1, %xmm9, %xmm9 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm9, %xmm9 pxor %xmm0, %xmm9 #else pxor %xmm0, %xmm8 shufps $0xb1, %xmm9, %xmm9 #endif addps %xmm9, %xmm8 pshufd $0xb1, %xmm8, %xmm9 mulps %xmm2, %xmm8 mulps %xmm3, %xmm9 addsubps %xmm9, %xmm8 #ifndef TRMMKERNEL movsd (CO1), %xmm0 addps %xmm0, %xmm8 #endif movsd %xmm8, (CO1) ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm_kernel_4x2_barcelona.S000066400000000000000000001351001313527062700226700ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define POSINV 0(%rsp) #define ALPHA_R 16(%rsp) #define ALPHA_I 32(%rsp) #define J 48(%rsp) #define OFFSET 56(%rsp) #define KK 64(%rsp) #define KKK 72(%rsp) #define BUFFER 128(%rsp) #ifdef OPTERON #define movsd movlps #endif #define PREFETCH prefetch #define PREFETCHSIZE (16 * 17 + 0) #define RPREFETCHSIZE (16 * 9 + 0) #define WPREFETCHSIZE (16 * 9 + 0) #define KERNEL1(xx) \ mulps %xmm1, %xmm0 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm8 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %rax, 4) ;\ movaps %xmm2, %xmm0 ;\ addps %xmm1, %xmm12 ;\ movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm0, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm1, %xmm0 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm10 ;\ movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\ addps %xmm1, %xmm14 ;\ movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm0, %xmm2 #define KERNEL2(xx) \ mulps %xmm1, %xmm0 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm8 ;\ movaps %xmm2, %xmm0 ;\ addps %xmm1, %xmm12 ;\ movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm0, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm1, %xmm0 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm10 ;\ addps %xmm1, %xmm14 ;\ mulps %xmm3, %xmm2 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm4, %xmm2 #define KERNEL3(xx) \ mulps %xmm5, %xmm4 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm8 ;\ movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\ movaps %xmm2, %xmm4 ;\ addps %xmm5, %xmm12 ;\ movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ 
addps %xmm2, %xmm9 ;\ movaps %xmm4, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm5, %xmm4 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm10 ;\ movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\ addps %xmm5, %xmm14 ;\ movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 20 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm4, %xmm2 #define KERNEL4(xx) \ mulps %xmm5, %xmm4 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ movaps (AO, %rax, 4), %xmm6 ;\ addps %xmm4, %xmm8 ;\ movaps %xmm2, %xmm4 ;\ addps %xmm5, %xmm12 ;\ movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm4, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm5, %xmm4 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm10 ;\ addps %xmm5, %xmm14 ;\ movaps 64 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm6, %xmm2 #define KERNEL5(xx) \ mulps %xmm1, %xmm6 ;\ mulps 4 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm6, %xmm8 ;\ movaps %xmm2, %xmm6 ;\ addps %xmm1, %xmm12 ;\ movaps 40 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps 4 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps 16 * SIZE(AO, %rax, 4), %xmm7 ;\ movaps %xmm6, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 44 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm1, %xmm6 ;\ mulps 4 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm6, %xmm10 ;\ movaps 8 * SIZE(AO, %rax, 4), %xmm6 ;\ addps %xmm1, %xmm14 ;\ movaps 48 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps 4 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 52 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm6, %xmm2 #define KERNEL6(xx) \ mulps %xmm1, %xmm6 ;\ mulps 12 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm6, %xmm8 ;\ movaps %xmm2, %xmm6 ;\ addps %xmm1, %xmm12 ;\ movaps 56 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps 12 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm6, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 60 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm1, %xmm6 ;\ mulps 12 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm6, %xmm10 ;\ movaps 32 * SIZE(AO, %rax, 4), %xmm0 ;\ addps %xmm1, %xmm14 ;\ mulps %xmm3, %xmm2 ;\ mulps 12 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 68 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm7, %xmm2 #define KERNEL7(xx) \ mulps %xmm5, %xmm7 ;\ mulps 20 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm7, %xmm8 ;\ movaps 96 * SIZE(BO, %rax, 8), %xmm1 ;\ movaps %xmm2, %xmm7 ;\ addps %xmm5, %xmm12 ;\ movaps 72 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps 20 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm7, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 76 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm5, %xmm7 ;\ mulps 20 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm7, %xmm10 ;\ movaps 24 * SIZE(AO, %rax, 4), %xmm7 ;\ addps %xmm5, %xmm14 ;\ movaps 80 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps 20 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 84 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm7, %xmm2 #define KERNEL8(xx) \ mulps %xmm5, %xmm7 ;\ mulps 28 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm7, %xmm8 ;\ movaps %xmm2, %xmm7 ;\ addps %xmm5, %xmm12 ;\ movaps 88 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, 
%xmm2 ;\ mulps 28 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm7, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 92 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm5, %xmm7 ;\ mulps 28 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm7, %xmm10 ;\ movaps 48 * SIZE(AO, %rax, 4), %xmm4 ;\ addps %xmm5, %xmm14 ;\ movaps 128 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps 28 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 100 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm0, %xmm2 ;\ addq $16 * SIZE, %rax #define KERNEL_SUB1(xx) \ mulps %xmm1, %xmm0 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm8 ;\ movaps %xmm2, %xmm0 ;\ addps %xmm1, %xmm12 ;\ movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm0, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm1, %xmm0 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm10 ;\ movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\ addps %xmm1, %xmm14 ;\ movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm0, %xmm2 #define KERNEL_SUB2(xx) \ mulps %xmm1, %xmm0 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm8 ;\ movaps %xmm2, %xmm0 ;\ addps %xmm1, %xmm12 ;\ movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm0, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm1, %xmm0 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ addps %xmm0, %xmm10 ;\ movaps (AO, %rax, 4), %xmm0 ;\ addps %xmm1, %xmm14 ;\ movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm3, %xmm2 ;\ mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm4, %xmm2 #define KERNEL_SUB3(xx) \ mulps %xmm5, %xmm4 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm8 ;\ movaps %xmm2, %xmm4 ;\ addps %xmm5, %xmm12 ;\ movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm4, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm5, %xmm4 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm10 ;\ movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\ addps %xmm5, %xmm14 ;\ movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 20 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm4, %xmm2 #define KERNEL_SUB4(xx) \ mulps %xmm5, %xmm4 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm8 ;\ movaps %xmm2, %xmm4 ;\ addps %xmm5, %xmm12 ;\ movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\ mulps %xmm3, %xmm2 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm9 ;\ movaps %xmm4, %xmm2 ;\ addps %xmm3, %xmm13 ;\ movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm5, %xmm4 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ addps %xmm4, %xmm10 ;\ addps %xmm5, %xmm14 ;\ mulps %xmm3, %xmm2 ;\ mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ addps %xmm2, %xmm11 ;\ addps %xmm3, %xmm15 ;\ movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm0, %xmm2 #if defined(OS_LINUX) && defined(CORE_BARCELONA) && !defined(TRMMKERNEL) .align 32768 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq 
%r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif movaps %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #else movq 72(%rsp), LDC #ifdef TRMMKERNEL movsd 80(%rsp), %xmm12 #endif #endif movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq OLD_M, M movq OLD_N, N pxor %xmm7, %xmm7 cmpeqps %xmm7, %xmm7 pslld $31, %xmm7 # Generate mask pxor %xmm10, %xmm10 shufps $0, %xmm0, %xmm0 movaps %xmm0, 0 + ALPHA_R movss %xmm1, 4 + ALPHA_I movss %xmm1, 12 + ALPHA_I xorps %xmm7, %xmm1 movss %xmm1, 0 + ALPHA_I movss %xmm1, 8 + ALPHA_I #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) movss %xmm7, 0 + POSINV movss %xmm10, 4 + POSINV movss %xmm7, 8 + POSINV movss %xmm10,12 + POSINV #else movss %xmm10, 0 + POSINV movss %xmm7, 4 + POSINV movss %xmm10, 8 + POSINV movss %xmm7, 12 + POSINV #endif addq $32 * SIZE, A #ifdef TRMMKERNEL movsd %xmm12, OFFSET movsd %xmm12, KK #ifndef LEFT negq KK #endif #endif salq $ZBASE_SHIFT, LDC movq N, J sarq $1, J # j = (n >> 2) jle .L40 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movaps POSINV, %xmm15 movq K, %rax sarq $2, %rax jle .L03 addq %rax, %rax ALIGN_4 .L02: prefetch (RPREFETCHSIZE + 0) * SIZE(B) movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 xorps %xmm15, %xmm5 xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm2 xorps %xmm15, %xmm4 xorps %xmm15, %xmm6 #endif movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L02 ALIGN_4 .L03: movq K, %rax andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movaps 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm2 #endif movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) addq $ 4 * SIZE, B addq $16 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L10: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a leaq (RPREFETCHSIZE + 0) * SIZE(B), BB movq M, I sarq $2, I # i = (m >> 2) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif prefetch -20 * SIZE(BB) prefetch 28 * SIZE(BB) subq $-32 * SIZE, BB movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm1 pxor %xmm8, %xmm8 movaps -28 * SIZE(BO), %xmm3 pxor %xmm9, %xmm9 movaps -16 * SIZE(AO), %xmm4 pxor %xmm10, %xmm10 movaps 0 * SIZE(BO), %xmm5 pxor %xmm11, %xmm11 prefetchw 7 * SIZE(CO1) pxor %xmm12, %xmm12 prefetchw 7 * SIZE(CO2) pxor %xmm13, %xmm13 pxor %xmm14, %xmm14 pxor %xmm15, %xmm15 movaps %xmm0, %xmm2 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif andq $-8, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO negq %rax NOBRANCH je .L15 ALIGN_3 .L12: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) NOBRANCH je .L15 KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) BRANCH jl .L12 ALIGN_4 .L15: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif testq $4, %rax je .L16 xorq %rax, %rax ALIGN_3 KERNEL_SUB1(32 * 0) KERNEL_SUB2(32 * 0) KERNEL_SUB3(32 * 0) KERNEL_SUB4(32 * 0) addq $64 * SIZE, BO addq $32 * SIZE, AO ALIGN_3 .L16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA_R, %xmm6 movaps ALPHA_I, %xmm7 andq $3, %rax # if (k & 1) BRANCH je .L18 leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO negq %rax ALIGN_4 .L17: mulps %xmm1, %xmm0 mulps -28 * SIZE(AO, %rax, 4), %xmm1 addps %xmm0, %xmm8 movaps %xmm2, %xmm0 addps %xmm1, %xmm12 movaps -24 * SIZE(BO, %rax, 8), %xmm1 mulps %xmm3, %xmm2 mulps -28 * SIZE(AO, %rax, 4), %xmm3 addps %xmm2, %xmm9 movaps %xmm0, %xmm2 addps %xmm3, %xmm13 movaps -20 * SIZE(BO, %rax, 8), %xmm3 mulps %xmm1, %xmm0 mulps -28 * SIZE(AO, %rax, 4), %xmm1 addps %xmm0, %xmm10 movaps -24 * SIZE(AO, %rax, 4), %xmm0 addps %xmm1, %xmm14 movaps -16 * SIZE(BO, %rax, 8), %xmm1 mulps %xmm3, %xmm2 mulps -28 * SIZE(AO, %rax, 4), %xmm3 addps %xmm2, %xmm11 addps %xmm3, %xmm15 movaps -12 * SIZE(BO, %rax, 8), %xmm3 movaps %xmm0, %xmm2 addq $SIZE * 2, %rax jl .L17 ALIGN_4 .L18: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm2 movhps 6 * SIZE(CO1), %xmm2 movsd 0 * 
SIZE(CO2), %xmm1 movhps 2 * SIZE(CO2), %xmm1 movsd 4 * SIZE(CO2), %xmm3 movhps 6 * SIZE(CO2), %xmm3 #endif shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 shufps $0xb1, %xmm13, %xmm13 shufps $0xb1, %xmm15, %xmm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm9, %xmm8 subps %xmm11, %xmm10 subps %xmm13, %xmm12 subps %xmm15, %xmm14 #else addps %xmm9, %xmm8 addps %xmm11, %xmm10 addps %xmm13, %xmm12 addps %xmm15, %xmm14 #endif movaps %xmm8, %xmm9 movaps %xmm10, %xmm11 movaps %xmm12, %xmm13 movaps %xmm14, %xmm15 shufps $0xb1, %xmm8, %xmm8 shufps $0xb1, %xmm10, %xmm10 shufps $0xb1, %xmm12, %xmm12 shufps $0xb1, %xmm14, %xmm14 mulps %xmm6, %xmm9 mulps %xmm7, %xmm8 mulps %xmm6, %xmm11 mulps %xmm7, %xmm10 mulps %xmm6, %xmm13 mulps %xmm7, %xmm12 mulps %xmm6, %xmm15 mulps %xmm7, %xmm14 addps %xmm9, %xmm8 addps %xmm11, %xmm10 addps %xmm13, %xmm12 addps %xmm15, %xmm14 #ifndef TRMMKERNEL addps %xmm0, %xmm8 addps %xmm1, %xmm10 addps %xmm2, %xmm12 addps %xmm3, %xmm14 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movsd %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) movsd %xmm14, 4 * SIZE(CO2) movhps %xmm14, 6 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 ALIGN_4 .L20: testq $2, M je .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif movaps -32 * SIZE(AO), %xmm0 movaps -16 * SIZE(AO), %xmm2 movaps 0 * SIZE(AO), %xmm4 movaps 16 * SIZE(AO), %xmm6 movaps -32 * SIZE(BO), %xmm1 movaps -16 * SIZE(BO), %xmm3 movaps 0 * SIZE(BO), %xmm5 movaps 16 * SIZE(BO), %xmm7 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulps %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm9 movaps -24 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 mulps -20 * SIZE(BO), %xmm0 addps %xmm1, %xmm10 movaps 32 * SIZE(BO), %xmm1 addps %xmm0, %xmm11 movaps -28 * SIZE(AO), %xmm0 mulps %xmm0, %xmm3 addps %xmm3, %xmm8 movaps -12 * SIZE(BO), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm9 movaps -8 * SIZE(BO), %xmm3 mulps %xmm0, %xmm3 mulps -4 * SIZE(BO), %xmm0 addps %xmm3, %xmm10 movaps 48 * SIZE(BO), %xmm3 addps %xmm0, %xmm11 movaps -24 * SIZE(AO), %xmm0 mulps %xmm0, %xmm5 addps %xmm5, %xmm8 movaps 4 * SIZE(BO), %xmm5 mulps %xmm0, %xmm5 addps %xmm5, %xmm9 movaps 8 * SIZE(BO), %xmm5 mulps %xmm0, %xmm5 mulps 12 * SIZE(BO), %xmm0 addps %xmm5, %xmm10 movaps 64 * SIZE(BO), %xmm5 addps %xmm0, %xmm11 movaps -20 * SIZE(AO), %xmm0 mulps %xmm0, %xmm7 addps %xmm7, %xmm8 movaps 20 * SIZE(BO), %xmm7 mulps %xmm0, %xmm7 
addps %xmm7, %xmm9 movaps 24 * SIZE(BO), %xmm7 mulps %xmm0, %xmm7 mulps 28 * SIZE(BO), %xmm0 addps %xmm7, %xmm10 movaps 80 * SIZE(BO), %xmm7 addps %xmm0, %xmm11 movaps 0 * SIZE(AO), %xmm0 mulps %xmm2, %xmm1 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addps %xmm1, %xmm8 movaps 36 * SIZE(BO), %xmm1 mulps %xmm2, %xmm1 addps %xmm1, %xmm9 movaps 40 * SIZE(BO), %xmm1 mulps %xmm2, %xmm1 mulps 44 * SIZE(BO), %xmm2 addps %xmm1, %xmm10 movaps 96 * SIZE(BO), %xmm1 addps %xmm2, %xmm11 movaps -12 * SIZE(AO), %xmm2 mulps %xmm2, %xmm3 addps %xmm3, %xmm8 movaps 52 * SIZE(BO), %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm9 movaps 56 * SIZE(BO), %xmm3 mulps %xmm2, %xmm3 mulps 60 * SIZE(BO), %xmm2 addps %xmm3, %xmm10 movaps 112 * SIZE(BO), %xmm3 addps %xmm2, %xmm11 movaps -8 * SIZE(AO), %xmm2 mulps %xmm2, %xmm5 addps %xmm5, %xmm8 movaps 68 * SIZE(BO), %xmm5 mulps %xmm2, %xmm5 addps %xmm5, %xmm9 movaps 72 * SIZE(BO), %xmm5 mulps %xmm2, %xmm5 mulps 76 * SIZE(BO), %xmm2 addps %xmm5, %xmm10 movaps 128 * SIZE(BO), %xmm5 addps %xmm2, %xmm11 movaps -4 * SIZE(AO), %xmm2 mulps %xmm2, %xmm7 addps %xmm7, %xmm8 movaps 84 * SIZE(BO), %xmm7 mulps %xmm2, %xmm7 addps %xmm7, %xmm9 movaps 88 * SIZE(BO), %xmm7 mulps %xmm2, %xmm7 mulps 92 * SIZE(BO), %xmm2 addps %xmm7, %xmm10 movaps 144 * SIZE(BO), %xmm7 addps %xmm2, %xmm11 movaps 16 * SIZE(AO), %xmm2 subq $ -32 * SIZE, AO subq $-128 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA_R, %xmm6 movaps ALPHA_I, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm9 movaps -24 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 mulps -20 * SIZE(BO), %xmm0 addps %xmm1, %xmm10 movaps -16 * SIZE(BO), %xmm1 addps %xmm0, %xmm11 movaps -28 * SIZE(AO), %xmm0 subq $- 4 * SIZE, AO subq $-16 * SIZE, BO decq %rax jg .L26 ALIGN_4 .L28: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm1 movhps 2 * SIZE(CO2), %xmm1 #endif shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm9, %xmm8 subps %xmm11, %xmm10 #else addps %xmm9, %xmm8 addps %xmm11, %xmm10 #endif movaps %xmm8, %xmm9 movaps %xmm10, %xmm11 shufps $0xb1, %xmm8, %xmm8 shufps $0xb1, %xmm10, %xmm10 mulps %xmm6, %xmm9 mulps %xmm7, %xmm8 mulps %xmm6, %xmm11 mulps %xmm7, %xmm10 addps %xmm9, %xmm8 addps %xmm11, %xmm10 #ifndef TRMMKERNEL addps %xmm0, %xmm8 addps %xmm1, %xmm10 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L30: testq $1, M je .L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif movaps -32 * SIZE(AO), %xmm0 movaps -24 * SIZE(AO), %xmm2 movaps -32 * SIZE(BO), %xmm1 movaps -16 * SIZE(BO), %xmm3 movaps 0 * SIZE(BO), 
%xmm5 movaps 16 * SIZE(BO), %xmm7 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L35 ALIGN_4 .L32: mulps %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 movsd -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm9 movsd -24 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm10 movsd -20 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm11 movsd 32 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 addps %xmm3, %xmm8 movsd -12 * SIZE(BO), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm9 movsd -8 * SIZE(BO), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm10 movsd -4 * SIZE(BO), %xmm3 mulps %xmm0, %xmm3 movsd -28 * SIZE(AO), %xmm0 addps %xmm3, %xmm11 movsd 48 * SIZE(BO), %xmm3 mulps %xmm0, %xmm5 addps %xmm5, %xmm8 movsd 4 * SIZE(BO), %xmm5 mulps %xmm0, %xmm5 addps %xmm5, %xmm9 movsd 8 * SIZE(BO), %xmm5 mulps %xmm0, %xmm5 addps %xmm5, %xmm10 movsd 12 * SIZE(BO), %xmm5 mulps %xmm0, %xmm5 movsd -26 * SIZE(AO), %xmm0 addps %xmm5, %xmm11 movsd 64 * SIZE(BO), %xmm5 mulps %xmm0, %xmm7 addps %xmm7, %xmm8 movsd 20 * SIZE(BO), %xmm7 mulps %xmm0, %xmm7 addps %xmm7, %xmm9 movsd 24 * SIZE(BO), %xmm7 mulps %xmm0, %xmm7 addps %xmm7, %xmm10 movsd 28 * SIZE(BO), %xmm7 mulps %xmm0, %xmm7 movsd -16 * SIZE(AO), %xmm0 addps %xmm7, %xmm11 movsd 80 * SIZE(BO), %xmm7 mulps %xmm2, %xmm1 addps %xmm1, %xmm8 movsd 36 * SIZE(BO), %xmm1 mulps %xmm2, %xmm1 addps %xmm1, %xmm9 movsd 40 * SIZE(BO), %xmm1 mulps %xmm2, %xmm1 addps %xmm1, %xmm10 movsd 44 * SIZE(BO), %xmm1 mulps %xmm2, %xmm1 movsd -22 * SIZE(AO), %xmm2 addps %xmm1, %xmm11 movsd 96 * SIZE(BO), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm8 movsd 52 * SIZE(BO), %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm9 movsd 56 * SIZE(BO), %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm10 movsd 60 * SIZE(BO), %xmm3 mulps %xmm2, %xmm3 movsd -20 * SIZE(AO), %xmm2 addps %xmm3, %xmm11 movsd 112 * SIZE(BO), %xmm3 mulps %xmm2, %xmm5 addps %xmm5, %xmm8 movsd 68 * SIZE(BO), %xmm5 mulps %xmm2, %xmm5 addps %xmm5, %xmm9 movsd 72 * SIZE(BO), %xmm5 mulps %xmm2, %xmm5 addps %xmm5, %xmm10 movsd 76 * SIZE(BO), %xmm5 mulps %xmm2, %xmm5 movsd -18 * SIZE(AO), %xmm2 addps %xmm5, %xmm11 movsd 128 * SIZE(BO), %xmm5 mulps %xmm2, %xmm7 addps %xmm7, %xmm8 movsd 84 * SIZE(BO), %xmm7 mulps %xmm2, %xmm7 addps %xmm7, %xmm9 movsd 88 * SIZE(BO), %xmm7 mulps %xmm2, %xmm7 addps %xmm7, %xmm10 movsd 92 * SIZE(BO), %xmm7 mulps %xmm2, %xmm7 movsd -8 * SIZE(AO), %xmm2 addps %xmm7, %xmm11 movsd 144 * SIZE(BO), %xmm7 subq $ -16 * SIZE, AO subq $-128 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA_R, %xmm6 movaps ALPHA_I, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movsd -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm9 movsd -24 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm10 movsd -20 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm11 movsd -16 * SIZE(BO), %xmm1 subq $ -2 * SIZE, AO subq $-16 * SIZE, BO decq %rax jg .L36 ALIGN_4 .L38: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm1 #endif shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || 
defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm9, %xmm8 subps %xmm11, %xmm10 #else addps %xmm9, %xmm8 addps %xmm11, %xmm10 #endif movaps %xmm8, %xmm9 movaps %xmm10, %xmm11 shufps $0xb1, %xmm8, %xmm8 shufps $0xb1, %xmm10, %xmm10 mulps %xmm6, %xmm9 mulps %xmm7, %xmm8 mulps %xmm6, %xmm11 mulps %xmm7, %xmm10 addps %xmm9, %xmm8 addps %xmm11, %xmm10 #ifndef TRMMKERNEL addps %xmm0, %xmm8 addps %xmm1, %xmm10 #endif movsd %xmm8, 0 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leaq (C, LDC, 2), C # c += 2 * ldc decq J # j -- jg .L01 ALIGN_4 .L40: testq $1, N je .L999 ALIGN_4 .L41: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movaps POSINV, %xmm15 movq K, %rax sarq $2, %rax jle .L43 ALIGN_4 .L42: prefetch (RPREFETCHSIZE + 0) * SIZE(B) movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 xorps %xmm15, %xmm5 xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm2 xorps %xmm15, %xmm4 xorps %xmm15, %xmm6 #endif movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L42 ALIGN_4 .L43: movq K, %rax andq $3, %rax BRANCH jle .L50 ALIGN_4 .L44: movsd 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) addq $2 * SIZE, B addq $8 * SIZE, BO decq %rax jne .L44 ALIGN_4 .L50: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a movq M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movaps -16 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movaps 0 * SIZE(AO), %xmm4 pxor %xmm10, %xmm10 movaps 16 * SIZE(AO), %xmm6 pxor %xmm11, %xmm11 movaps -32 * SIZE(BO), %xmm1 pxor %xmm12, %xmm12 movaps -16 * SIZE(BO), %xmm3 pxor %xmm13, %xmm13 movaps 0 * SIZE(BO), %xmm5 pxor %xmm14, %xmm14 movaps 16 * SIZE(BO), %xmm7 pxor %xmm15, %xmm15 prefetchw 7 * SIZE(CO1) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq 
%rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L55 ALIGN_4 .L52: mulps %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulps -28 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 addps %xmm0, %xmm9 movaps -28 * SIZE(AO), %xmm0 mulps %xmm0, %xmm1 mulps -28 * SIZE(BO), %xmm0 addps %xmm1, %xmm12 movaps -24 * SIZE(BO), %xmm1 addps %xmm0, %xmm13 movaps -24 * SIZE(AO), %xmm0 mulps %xmm0, %xmm1 mulps -20 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 addps %xmm0, %xmm9 movaps -20 * SIZE(AO), %xmm0 mulps %xmm0, %xmm1 mulps -20 * SIZE(BO), %xmm0 addps %xmm1, %xmm12 movaps 32 * SIZE(BO), %xmm1 addps %xmm0, %xmm13 movaps 32 * SIZE(AO), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) mulps %xmm2, %xmm3 mulps -12 * SIZE(BO), %xmm2 addps %xmm3, %xmm8 movaps -16 * SIZE(BO), %xmm3 addps %xmm2, %xmm9 movaps -12 * SIZE(AO), %xmm2 mulps %xmm2, %xmm3 mulps -12 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 movaps -8 * SIZE(BO), %xmm3 addps %xmm2, %xmm13 movaps -8 * SIZE(AO), %xmm2 mulps %xmm2, %xmm3 mulps -4 * SIZE(BO), %xmm2 addps %xmm3, %xmm8 movaps -8 * SIZE(BO), %xmm3 addps %xmm2, %xmm9 movaps -4 * SIZE(AO), %xmm2 mulps %xmm2, %xmm3 mulps -4 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 movaps 48 * SIZE(BO), %xmm3 addps %xmm2, %xmm13 movaps 48 * SIZE(AO), %xmm2 PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) mulps %xmm4, %xmm5 mulps 4 * SIZE(BO), %xmm4 addps %xmm5, %xmm8 movaps 0 * SIZE(BO), %xmm5 addps %xmm4, %xmm9 movaps 4 * SIZE(AO), %xmm4 mulps %xmm4, %xmm5 mulps 4 * SIZE(BO), %xmm4 addps %xmm5, %xmm12 movaps 8 * SIZE(BO), %xmm5 addps %xmm4, %xmm13 movaps 8 * SIZE(AO), %xmm4 mulps %xmm4, %xmm5 mulps 12 * SIZE(BO), %xmm4 addps %xmm5, %xmm8 movaps 8 * SIZE(BO), %xmm5 addps %xmm4, %xmm9 movaps 12 * SIZE(AO), %xmm4 mulps %xmm4, %xmm5 mulps 12 * SIZE(BO), %xmm4 addps %xmm5, %xmm12 movaps 64 * SIZE(BO), %xmm5 addps %xmm4, %xmm13 movaps 64 * SIZE(AO), %xmm4 PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) mulps %xmm6, %xmm7 mulps 20 * SIZE(BO), %xmm6 addps %xmm7, %xmm8 movaps 16 * SIZE(BO), %xmm7 addps %xmm6, %xmm9 movaps 20 * SIZE(AO), %xmm6 mulps %xmm6, %xmm7 mulps 20 * SIZE(BO), %xmm6 addps %xmm7, %xmm12 movaps 24 * SIZE(BO), %xmm7 addps %xmm6, %xmm13 movaps 24 * SIZE(AO), %xmm6 mulps %xmm6, %xmm7 mulps 28 * SIZE(BO), %xmm6 addps %xmm7, %xmm8 movaps 24 * SIZE(BO), %xmm7 addps %xmm6, %xmm9 movaps 28 * SIZE(AO), %xmm6 mulps %xmm6, %xmm7 mulps 28 * SIZE(BO), %xmm6 addps %xmm7, %xmm12 movaps 80 * SIZE(BO), %xmm7 addps %xmm6, %xmm13 movaps 80 * SIZE(AO), %xmm6 subq $-64 * SIZE, AO subq $-64 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA_R, %xmm6 movaps ALPHA_I, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: mulps %xmm0, %xmm1 mulps -28 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 addps %xmm0, %xmm9 movaps -28 * SIZE(AO), %xmm0 mulps %xmm0, %xmm1 mulps -28 * SIZE(BO), %xmm0 addps %xmm1, %xmm12 movaps -24 * SIZE(BO), %xmm1 addps %xmm0, %xmm13 movaps -24 * SIZE(AO), %xmm0 addq $ 8 * SIZE, AO addq $ 8 * SIZE, BO decq %rax jg .L56 ALIGN_4 .L58: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm2 movhps 6 * SIZE(CO1), %xmm2 #endif shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm13, %xmm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm9, %xmm8 subps %xmm13, %xmm12 #else addps %xmm9, %xmm8 addps %xmm13, 
%xmm12 #endif movaps %xmm8, %xmm9 movaps %xmm12, %xmm13 shufps $0xb1, %xmm8, %xmm8 shufps $0xb1, %xmm12, %xmm12 mulps %xmm6, %xmm9 mulps %xmm7, %xmm8 mulps %xmm6, %xmm13 mulps %xmm7, %xmm12 addps %xmm9, %xmm8 addps %xmm13, %xmm12 #ifndef TRMMKERNEL addps %xmm0, %xmm8 addps %xmm2, %xmm12 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movsd %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L51 ALIGN_4 .L60: testq $2, M je .L70 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movaps -16 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movaps -32 * SIZE(BO), %xmm1 pxor %xmm10, %xmm10 movaps -16 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 movaps 0 * SIZE(BO), %xmm5 movaps 16 * SIZE(BO), %xmm7 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulps %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulps -28 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 addps %xmm0, %xmm9 movaps -28 * SIZE(AO), %xmm0 mulps %xmm0, %xmm1 mulps -20 * SIZE(BO), %xmm0 addps %xmm1, %xmm10 movaps 32 * SIZE(BO), %xmm1 addps %xmm0, %xmm11 movaps -24 * SIZE(AO), %xmm0 mulps %xmm0, %xmm3 mulps -12 * SIZE(BO), %xmm0 addps %xmm3, %xmm8 movaps -8 * SIZE(BO), %xmm3 addps %xmm0, %xmm9 movaps -20 * SIZE(AO), %xmm0 mulps %xmm0, %xmm3 mulps -4 * SIZE(BO), %xmm0 addps %xmm3, %xmm10 movaps 48 * SIZE(BO), %xmm3 addps %xmm0, %xmm11 movaps 0 * SIZE(AO), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) mulps %xmm2, %xmm5 mulps 4 * SIZE(BO), %xmm2 addps %xmm5, %xmm8 movaps 8 * SIZE(BO), %xmm5 addps %xmm2, %xmm9 movaps -12 * SIZE(AO), %xmm2 mulps %xmm2, %xmm5 mulps 12 * SIZE(BO), %xmm2 addps %xmm5, %xmm10 movaps 64 * SIZE(BO), %xmm5 addps %xmm2, %xmm11 movaps -8 * SIZE(AO), %xmm2 mulps %xmm2, %xmm7 mulps 20 * SIZE(BO), %xmm2 addps %xmm7, %xmm8 movaps 24 * SIZE(BO), %xmm7 addps %xmm2, %xmm9 movaps -4 * SIZE(AO), %xmm2 mulps %xmm2, %xmm7 mulps 28 * SIZE(BO), %xmm2 addps %xmm7, %xmm10 movaps 80 * SIZE(BO), %xmm7 addps %xmm2, %xmm11 movaps 16 * SIZE(AO), %xmm2 subq $-32 * SIZE, AO subq $-64 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA_R, %xmm6 movaps ALPHA_I, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm0, %xmm1 mulps -28 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 addps %xmm0, %xmm9 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L68: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 #endif addps %xmm10, %xmm8 addps %xmm11, %xmm9 shufps $0xb1, %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || 
defined(RC) || defined(CR) || defined(CC) subps %xmm9, %xmm8 #else addps %xmm9, %xmm8 #endif movaps %xmm8, %xmm9 shufps $0xb1, %xmm8, %xmm8 mulps %xmm6, %xmm9 mulps %xmm7, %xmm8 addps %xmm9, %xmm8 #ifndef TRMMKERNEL addps %xmm0, %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L70: testq $1, M je .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movaps -24 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movaps -32 * SIZE(BO), %xmm1 pxor %xmm10, %xmm10 movaps -16 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 movaps 0 * SIZE(BO), %xmm5 movaps 16 * SIZE(BO), %xmm7 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulps %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 movsd -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm9 movsd -24 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm10 movsd -20 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm11 movsd 32 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 addps %xmm3, %xmm8 movsd -12 * SIZE(BO), %xmm3 mulps %xmm0, %xmm3 movsd -26 * SIZE(AO), %xmm0 addps %xmm3, %xmm9 movsd -8 * SIZE(BO), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm10 movsd -4 * SIZE(BO), %xmm3 mulps %xmm0, %xmm3 movsd -16 * SIZE(AO), %xmm0 addps %xmm3, %xmm11 movsd 48 * SIZE(BO), %xmm3 mulps %xmm2, %xmm5 addps %xmm5, %xmm8 movsd 4 * SIZE(BO), %xmm5 mulps %xmm2, %xmm5 movsd -22 * SIZE(AO), %xmm2 addps %xmm5, %xmm9 movsd 8 * SIZE(BO), %xmm5 mulps %xmm2, %xmm5 addps %xmm5, %xmm10 movsd 12 * SIZE(BO), %xmm5 mulps %xmm2, %xmm5 movsd -20 * SIZE(AO), %xmm2 addps %xmm5, %xmm11 movsd 64 * SIZE(BO), %xmm5 mulps %xmm2, %xmm7 addps %xmm7, %xmm8 movsd 20 * SIZE(BO), %xmm7 mulps %xmm2, %xmm7 movsd -18 * SIZE(AO), %xmm2 addps %xmm7, %xmm9 movsd 24 * SIZE(BO), %xmm7 mulps %xmm2, %xmm7 addps %xmm7, %xmm10 movsd 28 * SIZE(BO), %xmm7 mulps %xmm2, %xmm7 movsd -8 * SIZE(AO), %xmm2 addps %xmm7, %xmm11 movsd 80 * SIZE(BO), %xmm7 subq $-16 * SIZE, AO subq $-64 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA_R, %xmm6 movaps ALPHA_I, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movsd -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm9 movsd -24 * SIZE(BO), %xmm1 addq $2 * SIZE, AO addq $8 * SIZE, BO decq %rax jg .L76 ALIGN_4 .L78: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 #endif addps %xmm10, %xmm8 addps %xmm11, %xmm9 shufps $0xb1, %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm9, %xmm8 #else addps %xmm9, 
%xmm8 #endif movaps %xmm8, %xmm9 shufps $0xb1, %xmm8, %xmm8 mulps %xmm6, %xmm9 mulps %xmm7, %xmm8 addps %xmm9, %xmm8 #ifndef TRMMKERNEL addps %xmm0, %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) ALIGN_4 .L999: movq %rbx, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm_kernel_4x2_core2.S000066400000000000000000001026721313527062700217640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define ALPHA_R 0(%rsp) #define ALPHA_I 16(%rsp) #define J 32(%rsp) #define OFFSET 40(%rsp) #define KK 48(%rsp) #define KKK 56(%rsp) #define BUFFER 128(%rsp) #define PREFETCH_R (16 * 4 + 0) #define PREFETCH_W (PREFETCH_R * 4) #define PREFETCHSIZE (16 * 13 + 10) #define PREFETCH prefetcht0 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADDSUB addps #else #define ADDSUB subps #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif movaps %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif #endif movq %rsp, %r15 # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING pxor %xmm7, %xmm7 cmpeqps %xmm7, %xmm7 pslld $31, %xmm7 # Generate mask shufps $0, %xmm0, %xmm0 movaps %xmm0, 0 + ALPHA_R movss %xmm1, 4 + ALPHA_I movss %xmm1, 12 + ALPHA_I xorps %xmm7, %xmm1 movss %xmm1, 0 + ALPHA_I movss %xmm1, 8 + ALPHA_I subq $-32 * SIZE, A subq $-32 * SIZE, B #ifdef TRMMKERNEL movsd %xmm12, OFFSET movsd %xmm12, KK #ifndef LEFT negq KK #endif #endif movq OLD_M, M movq OLD_N, N salq $ZBASE_SHIFT, LDC movq N, J sarq $1, J # j = (n >> 2) jle .L40 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq 32 * SIZE + BUFFER, BO movaps -32 * SIZE(B), %xmm3 movq K, %rax sarq $2, %rax jle .L03 ALIGN_4 .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) movaps -28 * SIZE(B), %xmm7 movaps -24 * SIZE(B), %xmm11 movaps -20 * SIZE(B), %xmm15 prefetcht0 (PREFETCH_W + 0) * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 movaps %xmm0, -32 * SIZE(BO) pshufd $0x55, %xmm3, %xmm1 movaps %xmm1, -28 * SIZE(BO) pshufd $0xaa, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(BO) pshufd $0xff, %xmm3, %xmm3 movaps %xmm3, -20 * SIZE(BO) movaps -16 * SIZE(B), %xmm3 prefetcht0 (PREFETCH_W + 16) * SIZE(BO) pshufd $0x00, %xmm7, %xmm4 movaps %xmm4, -16 * SIZE(BO) pshufd $0x55, %xmm7, %xmm5 movaps %xmm5, -12 * SIZE(BO) pshufd $0xaa, %xmm7, %xmm6 movaps %xmm6, -8 * SIZE(BO) pshufd $0xff, %xmm7, %xmm7 movaps %xmm7, -4 * SIZE(BO) prefetcht0 (PREFETCH_W + 32) * SIZE(BO) pshufd $0x00, %xmm11, %xmm8 movaps %xmm8, 0 * SIZE(BO) pshufd $0x55, %xmm11, %xmm9 
movaps %xmm9, 4 * SIZE(BO) pshufd $0xaa, %xmm11, %xmm10 movaps %xmm10, 8 * SIZE(BO) pshufd $0xff, %xmm11, %xmm11 movaps %xmm11, 12 * SIZE(BO) prefetcht0 (PREFETCH_W + 48) * SIZE(BO) pshufd $0x00, %xmm15, %xmm12 movaps %xmm12, 16 * SIZE(BO) pshufd $0x55, %xmm15, %xmm13 movaps %xmm13, 20 * SIZE(BO) pshufd $0xaa, %xmm15, %xmm14 movaps %xmm14, 24 * SIZE(BO) pshufd $0xff, %xmm15, %xmm15 movaps %xmm15, 28 * SIZE(BO) subq $-16 * SIZE, B subq $-64 * SIZE, BO subq $1, %rax jne .L02 ALIGN_4 .L03: movq K, %rax andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: pshufd $0x00, %xmm3, %xmm0 movaps %xmm0, -32 * SIZE(BO) pshufd $0x55, %xmm3, %xmm1 movaps %xmm1, -28 * SIZE(BO) pshufd $0xaa, %xmm3, %xmm2 movaps %xmm2, -24 * SIZE(BO) pshufd $0xff, %xmm3, %xmm3 movaps %xmm3, -20 * SIZE(BO) movaps -28 * SIZE(B), %xmm3 addq $ 4 * SIZE, B addq $16 * SIZE, BO subq $1, %rax jne .L04 ALIGN_4 .L10: leaq (PREFETCH_R + 0) * SIZE(B), BB movq C, CO1 leaq (C, LDC, 1), CO2 movq A, AO movq M, I sarq $2, I # i = (m >> 2) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 40 * SIZE + BUFFER, BO #else leaq 40 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps -40 * SIZE(BO), %xmm6 movaps -36 * SIZE(BO), %xmm7 prefetcht2 -32 * SIZE(BB) pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 pxor %xmm12, %xmm12 prefetcht0 7 * SIZE(CO1) pxor %xmm13, %xmm13 pxor %xmm14, %xmm14 pxor %xmm15, %xmm15 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 prefetcht0 7 * SIZE(CO2) pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 subq $-32 * SIZE, BB #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax jle .L15 ALIGN_4 .L12: addps %xmm2, %xmm10 movaps -32 * SIZE(BO), %xmm2 addps %xmm3, %xmm14 PADDING; movaps %xmm6, %xmm3 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 ADDSUB %xmm4, %xmm11 movaps -28 * SIZE(BO), %xmm4 ADDSUB %xmm5, %xmm15 movaps %xmm7, %xmm5 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm6, %xmm8 movaps -24 * SIZE(BO), %xmm6 addps %xmm3, %xmm12 movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 ADDSUB %xmm7, %xmm9 movaps -20 * SIZE(BO), %xmm7 ADDSUB %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -20 * SIZE(AO), %xmm1 addps %xmm2, %xmm10 movaps -16 * SIZE(BO), %xmm2 addps %xmm3, %xmm14 movaps %xmm6, %xmm3 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 ADDSUB %xmm4, %xmm11 movaps -12 * SIZE(BO), %xmm4 ADDSUB %xmm5, %xmm15 movaps %xmm7, %xmm5 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm6, %xmm8 movaps -8 * SIZE(BO), %xmm6 addps %xmm3, %xmm12 movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 ADDSUB %xmm7, %xmm9 movaps -4 * SIZE(BO), %xmm7 ADDSUB %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -12 * SIZE(AO), %xmm1 addps %xmm2, %xmm10 movaps 0 * SIZE(BO), %xmm2 addps %xmm3, %xmm14 PADDING; movaps %xmm6, %xmm3 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 ADDSUB %xmm4, %xmm11 movaps 4 * SIZE(BO), %xmm4 ADDSUB %xmm5, %xmm15 movaps %xmm7, %xmm5 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm6, %xmm8 
movaps 8 * SIZE(BO), %xmm6 addps %xmm3, %xmm12 movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 ADDSUB %xmm7, %xmm9 movaps 12 * SIZE(BO), %xmm7 ADDSUB %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -8 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -4 * SIZE(AO), %xmm1 addps %xmm2, %xmm10 movaps 16 * SIZE(BO), %xmm2 addps %xmm3, %xmm14 movaps %xmm6, %xmm3 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 ADDSUB %xmm4, %xmm11 movaps 20 * SIZE(BO), %xmm4 ADDSUB %xmm5, %xmm15 movaps %xmm7, %xmm5 subq $-32 * SIZE, AO mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm6, %xmm8 movaps 24 * SIZE(BO), %xmm6 addps %xmm3, %xmm12 movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 ADDSUB %xmm7, %xmm9 movaps 28 * SIZE(BO), %xmm7 ADDSUB %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -32 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -28 * SIZE(AO), %xmm1 subq $-64 * SIZE, BO subq $1, %rax BRANCH jg .L12 ALIGN_4 .L15: prefetcht2 -16 * SIZE(BB) #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: addps %xmm2, %xmm10 movaps -32 * SIZE(BO), %xmm2 addps %xmm3, %xmm14 movaps %xmm6, %xmm3 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 ADDSUB %xmm4, %xmm11 movaps -28 * SIZE(BO), %xmm4 ADDSUB %xmm5, %xmm15 movaps %xmm7, %xmm5 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 addps %xmm6, %xmm8 movaps -24 * SIZE(BO), %xmm6 addps %xmm3, %xmm12 addq $8 * SIZE, AO movaps %xmm2, %xmm3 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 ADDSUB %xmm7, %xmm9 movaps -20 * SIZE(BO), %xmm7 ADDSUB %xmm5, %xmm13 addq $16 * SIZE, BO movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -32 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -28 * SIZE(AO), %xmm1 subq $1, %rax jg .L16 ALIGN_4 .L18: movaps ALPHA_R, %xmm6 movaps ALPHA_I, %xmm7 addps %xmm2, %xmm10 addps %xmm3, %xmm14 ADDSUB %xmm4, %xmm11 ADDSUB %xmm5, %xmm15 #if !defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm2 movhps 6 * SIZE(CO1), %xmm2 movsd 0 * SIZE(CO2), %xmm1 movhps 2 * SIZE(CO2), %xmm1 movsd 4 * SIZE(CO2), %xmm3 movhps 6 * SIZE(CO2), %xmm3 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 shufps $0xb1, %xmm13, %xmm13 shufps $0xb1, %xmm15, %xmm15 addsubps %xmm9, %xmm8 addsubps %xmm11, %xmm10 addsubps %xmm13, %xmm12 addsubps %xmm15, %xmm14 movaps %xmm8, %xmm9 movaps %xmm10, %xmm11 movaps %xmm12, %xmm13 movaps %xmm14, %xmm15 shufps $0xb1, %xmm8, %xmm8 shufps $0xb1, %xmm10, %xmm10 shufps $0xb1, %xmm12, %xmm12 shufps $0xb1, %xmm14, %xmm14 #else shufps $0xb1, %xmm8, %xmm8 shufps $0xb1, %xmm10, %xmm10 shufps $0xb1, %xmm12, %xmm12 shufps $0xb1, %xmm14, %xmm14 addsubps %xmm8, %xmm9 addsubps %xmm10, %xmm11 addsubps %xmm12, %xmm13 addsubps %xmm14, %xmm15 movaps %xmm9, %xmm8 movaps %xmm11, %xmm10 movaps %xmm13, %xmm12 movaps %xmm15, %xmm14 shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 shufps $0xb1, %xmm13, %xmm13 shufps $0xb1, %xmm15, %xmm15 #endif mulps %xmm6, %xmm9 mulps %xmm7, %xmm8 mulps %xmm6, %xmm11 mulps %xmm7, %xmm10 mulps %xmm6, %xmm13 mulps %xmm7, %xmm12 mulps %xmm6, %xmm15 mulps %xmm7, %xmm14 addps %xmm9, %xmm8 addps %xmm11, %xmm10 addps %xmm13, %xmm12 addps %xmm15, %xmm14 #if !defined(TRMMKERNEL) && !defined(BETAZERO) addps %xmm0, %xmm8 addps %xmm1, %xmm10 addps %xmm2, %xmm12 addps %xmm3, %xmm14 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movsd %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * 
SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) movsd %xmm14, 4 * SIZE(CO2) movhps %xmm14, 6 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 ALIGN_4 .L20: testq $2, M je .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm2 movaps -28 * SIZE(BO), %xmm3 movaps -24 * SIZE(BO), %xmm4 movaps -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 ADDSUB %xmm3, %xmm9 addps %xmm4, %xmm10 ADDSUB %xmm5, %xmm11 movaps -28 * SIZE(AO), %xmm0 movaps -16 * SIZE(BO), %xmm2 movaps -12 * SIZE(BO), %xmm3 movaps -8 * SIZE(BO), %xmm4 movaps -4 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 ADDSUB %xmm3, %xmm9 addps %xmm4, %xmm10 ADDSUB %xmm5, %xmm11 movaps -24 * SIZE(AO), %xmm0 movaps 0 * SIZE(BO), %xmm2 movaps 4 * SIZE(BO), %xmm3 movaps 8 * SIZE(BO), %xmm4 movaps 12 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 ADDSUB %xmm3, %xmm9 addps %xmm4, %xmm10 ADDSUB %xmm5, %xmm11 movaps -20 * SIZE(AO), %xmm0 movaps 16 * SIZE(BO), %xmm2 movaps 20 * SIZE(BO), %xmm3 movaps 24 * SIZE(BO), %xmm4 movaps 28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 ADDSUB %xmm3, %xmm9 addps %xmm4, %xmm10 ADDSUB %xmm5, %xmm11 subq $-16 * SIZE, AO subq $-64 * SIZE, BO subq $1, %rax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm2 movaps -28 * SIZE(BO), %xmm3 movaps -24 * SIZE(BO), %xmm4 movaps -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 ADDSUB %xmm3, %xmm9 addps %xmm4, %xmm10 ADDSUB %xmm5, %xmm11 addq $ 4 * SIZE, AO addq $16 * SIZE, BO subq $1, %rax jg .L26 ALIGN_4 .L28: movaps ALPHA_R, %xmm6 movaps ALPHA_I, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 addsubps %xmm9, %xmm8 addsubps %xmm11, %xmm10 movaps %xmm8, %xmm9 movaps %xmm10, %xmm11 shufps $0xb1, %xmm8, %xmm8 shufps $0xb1, %xmm10, %xmm10 #else shufps $0xb1, %xmm8, %xmm8 shufps $0xb1, %xmm10, %xmm10 addsubps %xmm8, %xmm9 addsubps %xmm10, %xmm11 movaps %xmm9, %xmm8 movaps %xmm11, %xmm10 shufps $0xb1, %xmm9, %xmm9 
shufps $0xb1, %xmm11, %xmm11 #endif mulps %xmm6, %xmm9 mulps %xmm7, %xmm8 mulps %xmm6, %xmm11 mulps %xmm7, %xmm10 addps %xmm9, %xmm8 addps %xmm11, %xmm10 #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm2 movhps 2 * SIZE(CO2), %xmm2 addps %xmm0, %xmm8 addps %xmm2, %xmm10 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L30: testq $1, M je .L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L35 ALIGN_4 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -32 * SIZE(AO), %xmm0 movsd -32 * SIZE(BO), %xmm2 movsd -28 * SIZE(BO), %xmm3 movsd -24 * SIZE(BO), %xmm4 movsd -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 ADDSUB %xmm3, %xmm9 addps %xmm4, %xmm10 ADDSUB %xmm5, %xmm11 movsd -30 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 movsd -12 * SIZE(BO), %xmm3 movsd -8 * SIZE(BO), %xmm4 movsd -4 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 ADDSUB %xmm3, %xmm9 addps %xmm4, %xmm10 ADDSUB %xmm5, %xmm11 movsd -28 * SIZE(AO), %xmm0 movsd 0 * SIZE(BO), %xmm2 movsd 4 * SIZE(BO), %xmm3 movsd 8 * SIZE(BO), %xmm4 movsd 12 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 ADDSUB %xmm3, %xmm9 addps %xmm4, %xmm10 ADDSUB %xmm5, %xmm11 movsd -26 * SIZE(AO), %xmm0 movsd 16 * SIZE(BO), %xmm2 movsd 20 * SIZE(BO), %xmm3 movsd 24 * SIZE(BO), %xmm4 movsd 28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 ADDSUB %xmm3, %xmm9 addps %xmm4, %xmm10 ADDSUB %xmm5, %xmm11 subq $ -8 * SIZE, AO subq $-64 * SIZE, BO subq $1, %rax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: movsd -32 * SIZE(AO), %xmm0 movsd -32 * SIZE(BO), %xmm2 movsd -28 * SIZE(BO), %xmm3 movsd -24 * SIZE(BO), %xmm4 movsd -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 mulps %xmm0, %xmm5 addps %xmm2, %xmm8 ADDSUB %xmm3, %xmm9 addps %xmm4, %xmm10 ADDSUB %xmm5, %xmm11 addq $ 2 * SIZE, AO addq $16 * SIZE, BO subq $1, %rax jg .L36 ALIGN_4 .L38: movaps ALPHA_R, %xmm6 movaps ALPHA_I, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 addsubps %xmm9, %xmm8 
addsubps %xmm11, %xmm10 movaps %xmm8, %xmm9 movaps %xmm10, %xmm11 shufps $0xb1, %xmm8, %xmm8 shufps $0xb1, %xmm10, %xmm10 #else shufps $0xb1, %xmm8, %xmm8 shufps $0xb1, %xmm10, %xmm10 addsubps %xmm8, %xmm9 addsubps %xmm10, %xmm11 movaps %xmm9, %xmm8 movaps %xmm11, %xmm10 shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 #endif mulps %xmm6, %xmm9 mulps %xmm7, %xmm8 mulps %xmm6, %xmm11 mulps %xmm7, %xmm10 addps %xmm9, %xmm8 addps %xmm11, %xmm10 #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm2 addps %xmm0, %xmm8 addps %xmm2, %xmm10 #endif movsd %xmm8, 0 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leaq (C, LDC, 2), C # c += 2 * ldc decq J # j -- jg .L01 ALIGN_4 .L40: testq $1, N je .L999 ALIGN_4 .L41: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $2, %rax jle .L43 ALIGN_4 .L42: movss -32 * SIZE(B), %xmm8 movss -31 * SIZE(B), %xmm9 movss -30 * SIZE(B), %xmm10 movss -29 * SIZE(B), %xmm11 movss -28 * SIZE(B), %xmm12 movss -27 * SIZE(B), %xmm13 movss -26 * SIZE(B), %xmm14 movss -25 * SIZE(B), %xmm15 shufps $0, %xmm8, %xmm8 shufps $0, %xmm9, %xmm9 shufps $0, %xmm10, %xmm10 shufps $0, %xmm11, %xmm11 shufps $0, %xmm12, %xmm12 shufps $0, %xmm13, %xmm13 shufps $0, %xmm14, %xmm14 shufps $0, %xmm15, %xmm15 movaps %xmm8, 0 * SIZE(BO) movaps %xmm9, 4 * SIZE(BO) movaps %xmm10, 8 * SIZE(BO) movaps %xmm11, 12 * SIZE(BO) movaps %xmm12, 16 * SIZE(BO) movaps %xmm13, 20 * SIZE(BO) movaps %xmm14, 24 * SIZE(BO) movaps %xmm15, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO subq $1, %rax jne .L42 ALIGN_4 .L43: movq K, %rax andq $3, %rax BRANCH jle .L50 ALIGN_4 .L44: movss -32 * SIZE(B), %xmm8 movss -31 * SIZE(B), %xmm9 shufps $0, %xmm8, %xmm8 shufps $0, %xmm9, %xmm9 movaps %xmm8, 0 * SIZE(BO) movaps %xmm9, 4 * SIZE(BO) addq $2 * SIZE, B addq $8 * SIZE, BO subq $1, %rax jne .L44 ALIGN_4 .L50: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a movq M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif pxor %xmm8, %xmm8 prefetcht0 3 * SIZE(CO1) pxor %xmm9, %xmm9 pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L55 ALIGN_4 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps -32 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 movaps -28 * SIZE(BO), %xmm4 movaps %xmm4, %xmm5 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 mulps %xmm0, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm12 ADDSUB %xmm4, %xmm9 ADDSUB %xmm5, %xmm13 movaps -24 * SIZE(AO), %xmm0 movaps -20 * SIZE(AO), %xmm1 movaps -24 * SIZE(BO), 
%xmm2 movaps %xmm2, %xmm3 movaps -20 * SIZE(BO), %xmm4 movaps %xmm4, %xmm5 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 mulps %xmm0, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm12 ADDSUB %xmm4, %xmm9 ADDSUB %xmm5, %xmm13 movaps -16 * SIZE(AO), %xmm0 movaps -12 * SIZE(AO), %xmm1 movaps -16 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 movaps -12 * SIZE(BO), %xmm4 movaps %xmm4, %xmm5 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 mulps %xmm0, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm12 ADDSUB %xmm4, %xmm9 ADDSUB %xmm5, %xmm13 movaps -8 * SIZE(AO), %xmm0 movaps -4 * SIZE(AO), %xmm1 movaps -8 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 movaps -4 * SIZE(BO), %xmm4 movaps %xmm4, %xmm5 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 mulps %xmm0, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm12 ADDSUB %xmm4, %xmm9 ADDSUB %xmm5, %xmm13 subq $-32 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps -32 * SIZE(BO), %xmm2 movaps %xmm2, %xmm3 movaps -28 * SIZE(BO), %xmm4 movaps %xmm4, %xmm5 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 mulps %xmm0, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 addps %xmm3, %xmm12 ADDSUB %xmm4, %xmm9 ADDSUB %xmm5, %xmm13 addq $8 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax jg .L56 ALIGN_4 .L58: movaps ALPHA_R, %xmm6 movaps ALPHA_I, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm13, %xmm13 addsubps %xmm9, %xmm8 addsubps %xmm13, %xmm12 movaps %xmm8, %xmm9 movaps %xmm12, %xmm13 shufps $0xb1, %xmm8, %xmm8 shufps $0xb1, %xmm12, %xmm12 #else shufps $0xb1, %xmm8, %xmm8 shufps $0xb1, %xmm12, %xmm12 addsubps %xmm8, %xmm9 addsubps %xmm12, %xmm13 movaps %xmm9, %xmm8 movaps %xmm13, %xmm12 shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm13, %xmm13 #endif mulps %xmm6, %xmm9 mulps %xmm7, %xmm8 mulps %xmm6, %xmm13 mulps %xmm7, %xmm12 addps %xmm9, %xmm8 addps %xmm13, %xmm12 #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 addps %xmm0, %xmm8 addps %xmm1, %xmm12 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movsd %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L51 ALIGN_4 .L60: testq $2, M je .L70 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L65 ALIGN_4 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 movaps -32 * SIZE(BO), %xmm2 movaps -28 * SIZE(BO), %xmm3 movaps -24 * SIZE(BO), %xmm4 movaps -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 ADDSUB %xmm3, %xmm9 addps %xmm4, %xmm10 ADDSUB %xmm5, %xmm11 movaps -24 * SIZE(AO), %xmm0 movaps -20 * SIZE(AO), %xmm1 movaps -16 * SIZE(BO), %xmm2 movaps -12 * SIZE(BO), %xmm3 movaps -8 * SIZE(BO), %xmm4 movaps -4 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 ADDSUB %xmm3, %xmm9 addps %xmm4, %xmm10 ADDSUB %xmm5, %xmm11 subq $-16 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm2 movaps -28 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 addps %xmm2, %xmm8 ADDSUB %xmm3, %xmm9 addq $4 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax jg .L66 ALIGN_4 .L68: movaps ALPHA_R, %xmm6 movaps ALPHA_I, %xmm7 addps %xmm10, %xmm8 addps %xmm11, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm9, %xmm9 addsubps %xmm9, %xmm8 movaps %xmm8, %xmm9 shufps $0xb1, %xmm8, %xmm8 #else shufps $0xb1, %xmm8, %xmm8 addsubps %xmm8, %xmm9 movaps %xmm9, %xmm8 shufps $0xb1, %xmm9, %xmm9 #endif mulps %xmm6, %xmm9 mulps %xmm7, %xmm8 addps %xmm9, %xmm8 #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 addps %xmm0, %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L70: testq $1, M je .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax je .L75 ALIGN_4 .L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -32 * SIZE(AO), %xmm0 movsd -30 * SIZE(AO), %xmm1 movsd -32 * SIZE(BO), %xmm2 movsd -28 * SIZE(BO), %xmm3 movsd -24 * SIZE(BO), %xmm4 movsd -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 ADDSUB %xmm3, %xmm9 addps %xmm4, %xmm10 ADDSUB %xmm5, %xmm11 movsd -28 * SIZE(AO), %xmm0 movsd -26 * SIZE(AO), %xmm1 movsd -16 * SIZE(BO), %xmm2 movsd -12 * SIZE(BO), %xmm3 movsd -8 * SIZE(BO), %xmm4 movsd -4 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm4 mulps %xmm1, %xmm5 addps %xmm2, %xmm8 ADDSUB %xmm3, %xmm9 addps %xmm4, %xmm10 ADDSUB %xmm5, %xmm11 subq $ -8 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax BRANCH je .L78 ALIGN_4 .L76: movsd -32 * SIZE(AO), %xmm0 movsd -32 * SIZE(BO), %xmm2 movsd -28 * SIZE(BO), %xmm3 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 addps %xmm2, %xmm8 ADDSUB %xmm3, %xmm9 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax jg .L76 ALIGN_4 .L78: movaps ALPHA_R, %xmm6 movaps ALPHA_I, %xmm7 addps %xmm10, %xmm8 addps %xmm11, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm9, %xmm9 addsubps %xmm9, %xmm8 movaps %xmm8, %xmm9 shufps $0xb1, %xmm8, %xmm8 #else shufps $0xb1, %xmm8, %xmm8 addsubps %xmm8, %xmm9 movaps %xmm9, %xmm8 shufps $0xb1, %xmm9, %xmm9 #endif mulps %xmm6, %xmm9 mulps %xmm7, %xmm8 addps %xmm9, %xmm8 #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm0 addps %xmm0, %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) ALIGN_4 .L999: movq %r15, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm_kernel_4x2_haswell.S000066400000000000000000002570251313527062700224140ustar00rootroot00000000000000/********************************************************************************* Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ /******************************************************************************** * 2014/07/28 Saar * BLASTEST : OK * CTEST : OK * TEST : OK * * 2013/10/28 Saar * Parameter: * ZGEMM_DEFAULT_UNROLL_N 2 * ZGEMM_DEFAULT_UNROLL_M 4 * ZGEMM_DEFAULT_P 256 * ZGEMM_DEFAULT_Q 128 * A_PR1 512 * B_PR1 512 * * 2014/07/28 Saar * Performance at 4608x4608x4608: * 1 thread: 53 GFLOPS (SANDYBRIDGE: 29) (MKL: 53) * 2 threads: 101 GFLOPS (SANDYBRIDGE: 59) (MKL: 100) * 3 threads: 146 GFLOPS (SANDYBRIDGE: 86) (MKL: 138) * 4 threads: 184 GFLOPS (SANDYBRIDGE: 108) (MKL: 172) * ********************************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define J %r14 #define OLD_K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define K %r12 #define BI %rbp #define SP %rbx #define BO1 %rdi #define BO2 %r15 #ifndef WINDOWS_ABI #define STACKSIZE 96 #else #define STACKSIZE 320 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define L_BUFFER_SIZE 8192 #define Ndiv6 24(%rsp) #define Nmod6 32(%rsp) #define N 40(%rsp) #define ALPHA_R 48(%rsp) #define ALPHA_I 56(%rsp) #define OFFSET 64(%rsp) #define KK 72(%rsp) #define KKK 80(%rsp) #define BUFFER1 128(%rsp) #if defined(OS_WINDOWS) #if L_BUFFER_SIZE > 16384 #define STACK_TOUCH \ movl $ 0, 4096 * 4(%rsp);\ movl $ 0, 4096 * 3(%rsp);\ movl $ 0, 4096 * 2(%rsp);\ movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 12288 #define STACK_TOUCH \ movl $ 0, 4096 * 3(%rsp);\ movl $ 0, 4096 * 2(%rsp);\ movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 8192 #define STACK_TOUCH \ movl $ 0, 4096 * 2(%rsp);\ movl $ 0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 4096 #define STACK_TOUCH \ movl $ 0, 4096 * 1(%rsp); #else #define STACK_TOUCH #endif #else #define STACK_TOUCH #endif #if defined(BULLDOZER) #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 #define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 #define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 #define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 #else #define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 #define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 #endif #else #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 #define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 #define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 #define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 #else #define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 #define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 #endif #endif #define A_PR1 512 #define B_PR1 512 /***************************************************************************************************/ 
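/* Editorial note, not part of the original OpenBLAS source: a sketch of the complex-multiply
 * scheme the macros below share, written for the plain NN case.  A is stored as interleaved
 * (re, im) pairs, while the real and imaginary parts of each B element are broadcast
 * separately.  VFMADDPD_R accumulates b_re * (a_re, a_im) and VFMADDPD_I accumulates
 * b_im * (a_re, a_im) into separate registers.  The SAVE* macros then swap the re/im halves
 * of the _I accumulator with vshufpd and merge the two with vaddsubpd, which yields
 *
 *     ( a_re*b_re - a_im*b_im ,  a_re*b_im + a_im*b_re )
 *
 * i.e. the complex product.  The conjugated variants (NR/RN/.../CC) flip the signs through
 * the VFMADDPD_R / VFMADDPD_I definitions above together with the variant-specific branches
 * inside the SAVE* macros.  The same shuffle/addsub pattern is reused once more to apply the
 * complex alpha before the tile is accumulated into C. */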
.macro KERNEL4x3_SUB vmovups (AO), %ymm0 vmovups 4 * SIZE(AO), %ymm1 prefetcht0 A_PR1(AO) vbroadcastsd (BO), %ymm2 vbroadcastsd 1 * SIZE(BO), %ymm3 VFMADDPD_R( %ymm8 ,%ymm2,%ymm0 ) VFMADDPD_R( %ymm12,%ymm2,%ymm1 ) VFMADDPD_I( %ymm9 ,%ymm3,%ymm0 ) VFMADDPD_I( %ymm13,%ymm3,%ymm1 ) vbroadcastsd 2 * SIZE(BO), %ymm2 vbroadcastsd 3 * SIZE(BO), %ymm3 VFMADDPD_R( %ymm10,%ymm2,%ymm0 ) VFMADDPD_R( %ymm14,%ymm2,%ymm1 ) VFMADDPD_I( %ymm11,%ymm3,%ymm0 ) VFMADDPD_I( %ymm15,%ymm3,%ymm1 ) vbroadcastsd 4 * SIZE(BO), %ymm2 vbroadcastsd 5 * SIZE(BO), %ymm3 VFMADDPD_R( %ymm4 ,%ymm2,%ymm0 ) VFMADDPD_R( %ymm6 ,%ymm2,%ymm1 ) VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 ) VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 ) addq $ 6*SIZE, BO addq $ 8*SIZE, AO decq %rax .endm .macro SAVE4x3 vbroadcastsd ALPHA_R, %ymm0 vbroadcastsd ALPHA_I, %ymm1 // swap high and low 8 bytes vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubpd %ymm9, %ymm8 , %ymm8 vaddsubpd %ymm11,%ymm10, %ymm10 vaddsubpd %ymm13,%ymm12, %ymm12 vaddsubpd %ymm15,%ymm14, %ymm14 vaddsubpd %ymm5 ,%ymm4 , %ymm4 vaddsubpd %ymm7 ,%ymm6 , %ymm6 vshufpd $ 0x05, %ymm8 , %ymm8 , %ymm9 vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 vshufpd $ 0x05, %ymm4 , %ymm4 , %ymm5 vshufpd $ 0x05, %ymm6 , %ymm6 , %ymm7 #else vaddsubpd %ymm8, %ymm9 ,%ymm9 vaddsubpd %ymm10, %ymm11,%ymm11 vaddsubpd %ymm12, %ymm13,%ymm13 vaddsubpd %ymm14, %ymm15,%ymm15 vaddsubpd %ymm4 , %ymm5 ,%ymm5 vaddsubpd %ymm6 , %ymm7 ,%ymm7 vmovapd %ymm9, %ymm8 vmovapd %ymm11, %ymm10 vmovapd %ymm13, %ymm12 vmovapd %ymm15, %ymm14 vmovapd %ymm5 , %ymm4 vmovapd %ymm7 , %ymm6 // swap high and low 8 bytes vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 #endif // multiply with ALPHA_R vmulpd %ymm8 , %ymm0, %ymm8 vmulpd %ymm10, %ymm0, %ymm10 vmulpd %ymm12, %ymm0, %ymm12 vmulpd %ymm14, %ymm0, %ymm14 vmulpd %ymm4 , %ymm0, %ymm4 vmulpd %ymm6 , %ymm0, %ymm6 // multiply with ALPHA_I vmulpd %ymm9 , %ymm1, %ymm9 vmulpd %ymm11, %ymm1, %ymm11 vmulpd %ymm13, %ymm1, %ymm13 vmulpd %ymm15, %ymm1, %ymm15 vmulpd %ymm5 , %ymm1, %ymm5 vmulpd %ymm7 , %ymm1, %ymm7 vaddsubpd %ymm9, %ymm8 , %ymm8 vaddsubpd %ymm11,%ymm10, %ymm10 vaddsubpd %ymm13,%ymm12, %ymm12 vaddsubpd %ymm15,%ymm14, %ymm14 vaddsubpd %ymm5 ,%ymm4 , %ymm4 vaddsubpd %ymm7 ,%ymm6 , %ymm6 #ifndef TRMMKERNEL vaddpd (CO1), %ymm8 , %ymm8 vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 vaddpd (CO1, LDC), %ymm10, %ymm10 vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 vaddpd (CO1, LDC,2), %ymm4 , %ymm4 vaddpd 4 * SIZE(CO1, LDC,2), %ymm6 , %ymm6 #endif vmovups %ymm8 , (CO1) vmovups %ymm12 , 4 * SIZE(CO1) vmovups %ymm10 , (CO1, LDC) vmovups %ymm14 , 4 * SIZE(CO1, LDC) vmovups %ymm4 , (CO1, LDC, 2) vmovups %ymm6 , 4 * SIZE(CO1, LDC, 2) prefetcht0 64(CO1) prefetcht0 64(CO1, LDC) .endm /***************************************************************************************************/ .macro KERNEL2x3_SUB vmovups (AO), %xmm0 vmovups 2 * SIZE(AO), %xmm1 vmovddup (BO), %xmm2 vmovddup 1 * SIZE(BO), %xmm3 VFMADDPD_R( %xmm8 ,%xmm2,%xmm0 ) VFMADDPD_R( %xmm12,%xmm2,%xmm1 ) VFMADDPD_I( 
%xmm9 ,%xmm3,%xmm0 ) VFMADDPD_I( %xmm13,%xmm3,%xmm1 ) vmovddup 2 * SIZE(BO), %xmm2 vmovddup 3 * SIZE(BO), %xmm3 VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) VFMADDPD_R( %xmm14,%xmm2,%xmm1 ) VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) VFMADDPD_I( %xmm15,%xmm3,%xmm1 ) vmovddup 4 * SIZE(BO), %xmm2 vmovddup 5 * SIZE(BO), %xmm3 VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) VFMADDPD_R( %xmm6 ,%xmm2,%xmm1 ) VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 ) addq $ 6*SIZE, BO addq $ 4*SIZE, AO decq %rax .endm .macro SAVE2x3 vmovddup ALPHA_R, %xmm0 vmovddup ALPHA_I, %xmm1 // swap high and low 64 bytes vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 vshufpd $ 0x01, %xmm5 , %xmm5 , %xmm5 vshufpd $ 0x01, %xmm7 , %xmm7 , %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm11,%xmm10, %xmm10 vaddsubpd %xmm13,%xmm12, %xmm12 vaddsubpd %xmm15,%xmm14, %xmm14 vaddsubpd %xmm5, %xmm4 , %xmm4 vaddsubpd %xmm7, %xmm6 , %xmm6 vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 vshufpd $ 0x01, %xmm6 , %xmm6, %xmm7 #else vaddsubpd %xmm8, %xmm9 ,%xmm9 vaddsubpd %xmm10, %xmm11,%xmm11 vaddsubpd %xmm12, %xmm13,%xmm13 vaddsubpd %xmm14, %xmm15,%xmm15 vaddsubpd %xmm4, %xmm5 ,%xmm5 vaddsubpd %xmm6, %xmm7 ,%xmm7 vmovapd %xmm9, %xmm8 vmovapd %xmm11, %xmm10 vmovapd %xmm13, %xmm12 vmovapd %xmm15, %xmm14 vmovapd %xmm5, %xmm4 vmovapd %xmm7, %xmm6 // swap high and low 64 bytes vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 vshufpd $ 0x01, %xmm7 , %xmm7, %xmm7 #endif // multiply with ALPHA_R vmulpd %xmm8 , %xmm0, %xmm8 vmulpd %xmm10, %xmm0, %xmm10 vmulpd %xmm12, %xmm0, %xmm12 vmulpd %xmm14, %xmm0, %xmm14 vmulpd %xmm4 , %xmm0, %xmm4 vmulpd %xmm6 , %xmm0, %xmm6 // multiply with ALPHA_I vmulpd %xmm9 , %xmm1, %xmm9 vmulpd %xmm11, %xmm1, %xmm11 vmulpd %xmm13, %xmm1, %xmm13 vmulpd %xmm15, %xmm1, %xmm15 vmulpd %xmm5 , %xmm1, %xmm5 vmulpd %xmm7 , %xmm1, %xmm7 vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm11,%xmm10, %xmm10 vaddsubpd %xmm13,%xmm12, %xmm12 vaddsubpd %xmm15,%xmm14, %xmm14 vaddsubpd %xmm5, %xmm4 , %xmm4 vaddsubpd %xmm7, %xmm6 , %xmm6 #ifndef TRMMKERNEL vaddpd (CO1), %xmm8 , %xmm8 vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 vaddpd (CO1, LDC), %xmm10, %xmm10 vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 vaddpd (CO1, LDC,2), %xmm4 , %xmm4 vaddpd 2 * SIZE(CO1, LDC,2), %xmm6 , %xmm6 #endif vmovups %xmm8 , (CO1) vmovups %xmm12 , 2 * SIZE(CO1) vmovups %xmm10 , (CO1, LDC) vmovups %xmm14 , 2 * SIZE(CO1, LDC) vmovups %xmm4 , (CO1, LDC,2) vmovups %xmm6 , 2 * SIZE(CO1, LDC,2) .endm /************************************************************************************************/ .macro KERNEL1x3_SUB vmovups (AO), %xmm0 vmovddup (BO), %xmm2 vmovddup 1 * SIZE(BO), %xmm3 VFMADDPD_R( %xmm8,%xmm2,%xmm0 ) VFMADDPD_I( %xmm9,%xmm3,%xmm0 ) vmovddup 2 * SIZE(BO), %xmm2 vmovddup 3 * SIZE(BO), %xmm3 VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) vmovddup 4 * SIZE(BO), %xmm2 vmovddup 5 * SIZE(BO), %xmm3 VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) addq $ 6*SIZE, BO addq $ 2*SIZE, AO decq %rax .endm .macro SAVE1x3 vmovddup ALPHA_R, %xmm0 vmovddup 
ALPHA_I, %xmm1 // swap high and low 64 bytes vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm11,%xmm10, %xmm10 vaddsubpd %xmm5, %xmm4 , %xmm4 vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 #else vaddsubpd %xmm8, %xmm9, %xmm9 vaddsubpd %xmm10,%xmm11, %xmm11 vaddsubpd %xmm4, %xmm5, %xmm5 vmovapd %xmm9, %xmm8 vmovapd %xmm11, %xmm10 vmovapd %xmm5, %xmm4 // swap high and low 64 bytes vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 #endif // multiply with ALPHA_R vmulpd %xmm8 , %xmm0, %xmm8 vmulpd %xmm10, %xmm0, %xmm10 vmulpd %xmm4 , %xmm0, %xmm4 // multiply with ALPHA_I vmulpd %xmm9 , %xmm1, %xmm9 vmulpd %xmm11, %xmm1, %xmm11 vmulpd %xmm5 , %xmm1, %xmm5 vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm11,%xmm10, %xmm10 vaddsubpd %xmm5, %xmm4 , %xmm4 #ifndef TRMMKERNEL vaddpd (CO1) , %xmm8 , %xmm8 vaddpd (CO1, LDC) , %xmm10, %xmm10 vaddpd (CO1, LDC,2) , %xmm4 , %xmm4 #endif vmovups %xmm8 , (CO1) vmovups %xmm10 , (CO1, LDC) vmovups %xmm4 , (CO1, LDC,2) .endm /***************************************************************************************************/ .macro KERNEL4x2_SUB vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4 vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5 VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6 VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm7 VFMADDPD_R( %ymm10,%ymm6,%ymm0 ) VFMADDPD_R( %ymm14,%ymm6,%ymm1 ) VFMADDPD_I( %ymm11,%ymm7,%ymm0 ) VFMADDPD_I( %ymm15,%ymm7,%ymm1 ) addq $ 4, BI addq $ 8, %rax .endm .macro SAVE4x2 vbroadcastsd ALPHA_R, %ymm0 vbroadcastsd ALPHA_I, %ymm1 // swap high and low 8 bytes vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubpd %ymm9, %ymm8 , %ymm8 vaddsubpd %ymm11,%ymm10, %ymm10 vaddsubpd %ymm13,%ymm12, %ymm12 vaddsubpd %ymm15,%ymm14, %ymm14 vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9 vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 #else vaddsubpd %ymm8, %ymm9 ,%ymm9 vaddsubpd %ymm10, %ymm11,%ymm11 vaddsubpd %ymm12, %ymm13,%ymm13 vaddsubpd %ymm14, %ymm15,%ymm15 vmovapd %ymm9, %ymm8 vmovapd %ymm11, %ymm10 vmovapd %ymm13, %ymm12 vmovapd %ymm15, %ymm14 // swap high and low 8 bytes vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 #endif // multiply with ALPHA_R vmulpd %ymm8 , %ymm0, %ymm8 vmulpd %ymm10, %ymm0, %ymm10 vmulpd %ymm12, %ymm0, %ymm12 vmulpd %ymm14, %ymm0, %ymm14 // multiply with ALPHA_I vmulpd %ymm9 , %ymm1, %ymm9 vmulpd %ymm11, %ymm1, %ymm11 vmulpd %ymm13, %ymm1, %ymm13 vmulpd %ymm15, %ymm1, %ymm15 vaddsubpd %ymm9, %ymm8 , %ymm8 vaddsubpd %ymm11,%ymm10, %ymm10 vaddsubpd %ymm13,%ymm12, %ymm12 vaddsubpd %ymm15,%ymm14, %ymm14 #ifndef TRMMKERNEL vaddpd (CO1), %ymm8 , %ymm8 vaddpd 4 * SIZE(CO1), 
%ymm12, %ymm12 vaddpd (CO1, LDC), %ymm10, %ymm10 vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 #endif vmovups %ymm8 , (CO1) vmovups %ymm12 , 4 * SIZE(CO1) vmovups %ymm10 , (CO1, LDC) vmovups %ymm14 , 4 * SIZE(CO1, LDC) prefetcht0 64(CO1) prefetcht0 64(CO1, LDC) .endm /***************************************************************************************************/ .macro KERNEL2x2_SUB vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) VFMADDPD_R( %xmm14,%xmm6,%xmm1 ) vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) VFMADDPD_I( %xmm15,%xmm7,%xmm1 ) addq $ 4, BI addq $ 4, %rax .endm .macro SAVE2x2 vmovddup ALPHA_R, %xmm0 vmovddup ALPHA_I, %xmm1 // swap high and low 64 bytes vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm11,%xmm10, %xmm10 vaddsubpd %xmm13,%xmm12, %xmm12 vaddsubpd %xmm15,%xmm14, %xmm14 vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 #else vaddsubpd %xmm8, %xmm9 ,%xmm9 vaddsubpd %xmm10, %xmm11,%xmm11 vaddsubpd %xmm12, %xmm13,%xmm13 vaddsubpd %xmm14, %xmm15,%xmm15 vmovapd %xmm9, %xmm8 vmovapd %xmm11, %xmm10 vmovapd %xmm13, %xmm12 vmovapd %xmm15, %xmm14 // swap high and low 64 bytes vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 #endif // multiply with ALPHA_R vmulpd %xmm8 , %xmm0, %xmm8 vmulpd %xmm10, %xmm0, %xmm10 vmulpd %xmm12, %xmm0, %xmm12 vmulpd %xmm14, %xmm0, %xmm14 // multiply with ALPHA_I vmulpd %xmm9 , %xmm1, %xmm9 vmulpd %xmm11, %xmm1, %xmm11 vmulpd %xmm13, %xmm1, %xmm13 vmulpd %xmm15, %xmm1, %xmm15 vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm11,%xmm10, %xmm10 vaddsubpd %xmm13,%xmm12, %xmm12 vaddsubpd %xmm15,%xmm14, %xmm14 #ifndef TRMMKERNEL vaddpd (CO1), %xmm8 , %xmm8 vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 vaddpd (CO1, LDC), %xmm10, %xmm10 vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 #endif vmovups %xmm8 , (CO1) vmovups %xmm12 , 2 * SIZE(CO1) vmovups %xmm10 , (CO1, LDC) vmovups %xmm14 , 2 * SIZE(CO1, LDC) .endm /************************************************************************************************/ /************************************************************************************************/ .macro KERNEL1x2_SUB vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) addq $ 4, BI addq $ 2, %rax .endm .macro SAVE1x2 vmovddup ALPHA_R, %xmm0 vmovddup ALPHA_I, %xmm1 // swap high and low 64 bytes vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || 
defined(TC) vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm11,%xmm10, %xmm10 vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 #else vaddsubpd %xmm8, %xmm9, %xmm9 vaddsubpd %xmm10,%xmm11, %xmm11 vmovapd %xmm9, %xmm8 vmovapd %xmm11, %xmm10 // swap high and low 64 bytes vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 #endif // multiply with ALPHA_R vmulpd %xmm8 , %xmm0, %xmm8 vmulpd %xmm10, %xmm0, %xmm10 // multiply with ALPHA_I vmulpd %xmm9 , %xmm1, %xmm9 vmulpd %xmm11, %xmm1, %xmm11 vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm11,%xmm10, %xmm10 #ifndef TRMMKERNEL vaddpd (CO1), %xmm8 , %xmm8 vaddpd (CO1, LDC), %xmm10, %xmm10 #endif vmovups %xmm8 , (CO1) vmovups %xmm10 , (CO1, LDC) .endm /************************************************************************************************/ .macro KERNEL4x1_SUB vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4 vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5 VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) addq $ 2, BI addq $ 8, %rax .endm .macro SAVE4x1 vbroadcastsd ALPHA_R, %ymm0 vbroadcastsd ALPHA_I, %ymm1 // swap high and low 8 bytes vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubpd %ymm9, %ymm8 , %ymm8 vaddsubpd %ymm13,%ymm12 , %ymm12 vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9 vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 #else vaddsubpd %ymm8, %ymm9 , %ymm9 vaddsubpd %ymm12,%ymm13, %ymm13 vmovapd %ymm9, %ymm8 vmovapd %ymm13, %ymm12 // swap high and low 8 bytes vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 #endif // multiply with ALPHA_R vmulpd %ymm8 , %ymm0, %ymm8 vmulpd %ymm12, %ymm0, %ymm12 // multiply with ALPHA_I vmulpd %ymm9 , %ymm1, %ymm9 vmulpd %ymm13, %ymm1, %ymm13 vaddsubpd %ymm9, %ymm8 , %ymm8 vaddsubpd %ymm13, %ymm12, %ymm12 #ifndef TRMMKERNEL vaddpd (CO1), %ymm8 , %ymm8 vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 #endif vmovups %ymm8 , (CO1) vmovups %ymm12 ,4 * SIZE(CO1) .endm /************************************************************************************************/ .macro KERNEL2x1_SUB vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) addq $ 2, BI addq $ 4, %rax .endm .macro SAVE2x1 vmovddup ALPHA_R, %xmm0 vmovddup ALPHA_I, %xmm1 // swap high and low 64 bytes vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm13,%xmm12 , %xmm12 vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 #else vaddsubpd %xmm8, %xmm9 , %xmm9 vaddsubpd %xmm12,%xmm13, %xmm13 vmovapd %xmm9, %xmm8 vmovapd %xmm13, %xmm12 // swap high and low 64 bytes vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 #endif // multiply with ALPHA_R vmulpd %xmm8 , %xmm0, %xmm8 vmulpd %xmm12, %xmm0, %xmm12 // multiply with ALPHA_I vmulpd %xmm9 , %xmm1, %xmm9 vmulpd %xmm13, %xmm1, %xmm13 vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd 
%xmm13, %xmm12, %xmm12 #ifndef TRMMKERNEL vaddpd (CO1), %xmm8 , %xmm8 vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 #endif vmovups %xmm8 , (CO1) vmovups %xmm12 , 2 * SIZE(CO1) .endm /************************************************************************************************/ .macro KERNEL1x1_SUB vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) addq $ 2, BI addq $ 2, %rax .endm .macro SAVE1x1 vmovddup ALPHA_R, %xmm0 vmovddup ALPHA_I, %xmm1 // swap high and low 64 bytes vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubpd %xmm9, %xmm8, %xmm8 vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 #else vaddsubpd %xmm8, %xmm9, %xmm9 vmovapd %xmm9, %xmm8 // swap high and low 64 bytes vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 #endif // multiply with ALPHA_R vmulpd %xmm8 , %xmm0, %xmm8 // multiply with ALPHA_I vmulpd %xmm9 , %xmm1, %xmm9 vaddsubpd %xmm9 ,%xmm8, %xmm8 #ifndef TRMMKERNEL vaddpd (CO1), %xmm8 , %xmm8 #endif vmovups %xmm8 , (CO1) .endm /************************************************************************************************/ #if !defined(TRMMKERNEL) PROLOGUE PROFCODE subq $ STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) vmovups %xmm6, 64(%rsp) vmovups %xmm7, 80(%rsp) vmovups %xmm8, 96(%rsp) vmovups %xmm9, 112(%rsp) vmovups %xmm10, 128(%rsp) vmovups %xmm11, 144(%rsp) vmovups %xmm12, 160(%rsp) vmovups %xmm13, 176(%rsp) vmovups %xmm14, 192(%rsp) vmovups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif vmovaps %xmm3, %xmm0 vmovsd OLD_ALPHA_I, %xmm1 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL movsd STACKSIZE + 16(%rsp), %xmm12 #endif #endif movq %rsp, SP # save old stack subq $ 128 + L_BUFFER_SIZE, %rsp andq $ -4096, %rsp # align stack STACK_TOUCH cmpq $ 0, OLD_M je .L999 cmpq $ 0, OLD_N je .L999 cmpq $ 0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovsd %xmm0, ALPHA_R vmovsd %xmm1, ALPHA_I salq $ ZBASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $ 6, %rdi divq %rdi // N / 6 movq %rax, Ndiv6 // N / 6 movq %rdx, Nmod6 // N % 6 /************************************************************************************************/ .L6_00_0: movq Ndiv6, J cmpq $ 0, J je .L2_00_0 ALIGN_4 .L6_00_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax salq $2, %rax // 2 * COMPSIZE leaq (B, %rax,8), BO2 movq BO2, B // next offset of B movq K, %rax ALIGN_4 .L6_00_02b: vmovups (BO1), %xmm0 vmovups 2 * SIZE(BO1), %xmm1 vmovups (BO2), %xmm2 vmovups %xmm0, (BO) vmovups %xmm1, 2 * SIZE(BO) vmovups %xmm2, 4 * SIZE(BO) addq $ 4*SIZE,BO1 addq $ 4*SIZE,BO2 addq $ 6*SIZE,BO decq %rax jnz .L6_00_02b .L6_00_02c: .L6_00_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc leaq (C, LDC, 1), C // c += 1 * ldc movq A, AO // aoffset = a movq M, I sarq $ 2, I // i = (m >> 2) je .L6_2_10 ALIGN_4 /******************************************************************************************************************/ .L6_4_11: leaq BUFFER1, BO // first buffer to BO vzeroall movq K, %rax andq $ -8, %rax // K = K - ( K % 8 ) je .L6_4_16 ALIGN_4 .L6_4_12: 
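/* The block below is the unrolled main K loop for this micro-tile: K has just
   been rounded down to a multiple of 8 (andq $ -8, %rax), the body issues
   KERNEL4x3_SUB in groups of eight with a counter check after each group, and
   the K % 8 remainder is handled by the short tail loop at .L6_4_17.  A
   minimal C sketch of that control flow follows, assuming kernel_sub() stands
   in for one KERNEL4x3_SUB expansion (which also performs the decq %rax); the
   names are illustrative only and not part of OpenBLAS. */
#if 0
static void kernel_sub(long *acc) { ++*acc; }    /* placeholder for KERNEL4x3_SUB */

static long k_loop_sketch(long k)
{
    long acc = 0;
    long rax = k & ~7L;                  /* andq $ -8, %rax : unrolled part   */
    while (rax > 0)                      /* .L6_4_12 ... jmp .L6_4_12         */
        for (int i = 0; i < 8 && rax > 0; i++, rax--)
            kernel_sub(&acc);            /* KERNEL4x3_SUB (includes the decq) */
    for (long tail = k & 7L; tail > 0; tail--)
        kernel_sub(&acc);                /* .L6_4_17 tail: K % 8 iterations   */
    return acc;                          /* acc == k in total                 */
}
#endif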
KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB je .L6_4_16 KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB je .L6_4_16 jmp .L6_4_12 ALIGN_4 .L6_4_16: movq K, %rax andq $ 7, %rax # if (k & 1) je .L6_4_19 ALIGN_4 .L6_4_17: KERNEL4x3_SUB jnz .L6_4_17 ALIGN_4 .L6_4_19: SAVE4x3 addq $ 8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L6_4_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ /******************************************************************************************************************/ .L6_2_10: testq $ 2, M jz .L6_2_40 // to next 2 lines of N .L6_2_11: leaq BUFFER1, BO // first buffer to BO vzeroall movq K, %rax andq $ -8, %rax // K = K - ( K % 8 ) je .L6_2_16 ALIGN_4 .L6_2_12: KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB je .L6_2_16 KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB je .L6_2_16 jmp .L6_2_12 ALIGN_4 .L6_2_16: movq K, %rax andq $ 7, %rax # if (k & 1) je .L6_2_19 ALIGN_4 .L6_2_17: KERNEL2x3_SUB jnz .L6_2_17 ALIGN_4 .L6_2_19: SAVE2x3 addq $ 4 * SIZE, CO1 # coffset += 4 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L6_2_40: testq $ 1, M jz .L6_2_60 // to next 2 lines of N ALIGN_4 .L6_2_41: leaq BUFFER1, BO // first buffer to BO vzeroall movq K, %rax andq $ -8, %rax // K = K - ( K % 8 ) je .L6_2_46 ALIGN_4 .L6_2_42: KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB je .L6_2_46 KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB je .L6_2_46 jmp .L6_2_42 ALIGN_4 .L6_2_46: movq K, %rax andq $ 7, %rax # if (k & 1) je .L6_2_49 ALIGN_4 .L6_2_47: KERNEL1x3_SUB jnz .L6_2_47 ALIGN_4 .L6_2_49: SAVE1x3 addq $ 2 * SIZE, CO1 # coffset += 2 decq I # i -- jg .L6_2_41 ALIGN_4 .L6_2_60: /************************************************************************************************/ /************************************************************************************************/ .L7_00_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax salq $2, %rax // 2 * COMPSIZE leaq (B, %rax,8), BO2 movq K, %rax ALIGN_4 .L7_00_02b: vmovups 2 * SIZE(BO1), %xmm0 vmovups (BO2), %xmm1 vmovups 2 * SIZE(BO2), %xmm2 vmovups %xmm0, (BO) vmovups %xmm1, 2 * SIZE(BO) vmovups %xmm2, 4 * SIZE(BO) addq $ 4*SIZE,BO1 addq $ 4*SIZE,BO2 addq $ 6*SIZE,BO decq %rax jnz .L7_00_02b .L7_00_02c: movq BO2, B // next offset of B .L7_00_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc leaq (C, LDC, 1), C // c += 1 * ldc movq A, AO // aoffset = a movq M, I sarq $ 2, I // i = (m >> 2) je .L7_2_10 ALIGN_4 /******************************************************************************************************************/ .L7_4_11: leaq BUFFER1, BO // first buffer to BO vzeroall movq K, %rax andq $ -8, %rax // K = K - ( K % 8 ) je .L7_4_16 ALIGN_4 .L7_4_12: KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB je .L7_4_16 KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB KERNEL4x3_SUB 
KERNEL4x3_SUB KERNEL4x3_SUB je .L7_4_16 jmp .L7_4_12 ALIGN_4 .L7_4_16: movq K, %rax andq $ 7, %rax # if (k & 1) je .L7_4_19 ALIGN_4 .L7_4_17: KERNEL4x3_SUB jnz .L7_4_17 ALIGN_4 .L7_4_19: SAVE4x3 addq $ 8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L7_4_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ /******************************************************************************************************************/ .L7_2_10: testq $ 2, M jz .L7_2_40 // to next 2 lines of N .L7_2_11: leaq BUFFER1, BO // first buffer to BO vzeroall movq K, %rax andq $ -8, %rax // K = K - ( K % 8 ) je .L7_2_16 ALIGN_4 .L7_2_12: KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB je .L7_2_16 KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB KERNEL2x3_SUB je .L7_2_16 jmp .L7_2_12 ALIGN_4 .L7_2_16: movq K, %rax andq $ 7, %rax # if (k & 1) je .L7_2_19 ALIGN_4 .L7_2_17: KERNEL2x3_SUB jnz .L7_2_17 ALIGN_4 .L7_2_19: SAVE2x3 addq $ 4 * SIZE, CO1 # coffset += 4 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L7_2_40: testq $ 1, M jz .L7_2_60 // to next 2 lines of N ALIGN_4 .L7_2_41: leaq BUFFER1, BO // first buffer to BO vzeroall movq K, %rax andq $ -8, %rax // K = K - ( K % 8 ) je .L7_2_46 ALIGN_4 .L7_2_42: KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB je .L7_2_46 KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB KERNEL1x3_SUB je .L7_2_46 jmp .L7_2_42 ALIGN_4 .L7_2_46: movq K, %rax andq $ 7, %rax # if (k & 1) je .L7_2_49 ALIGN_4 .L7_2_47: KERNEL1x3_SUB jnz .L7_2_47 ALIGN_4 .L7_2_49: SAVE1x3 addq $ 2 * SIZE, CO1 # coffset += 2 decq I # i -- jg .L7_2_41 ALIGN_4 .L7_2_60: decq J // j -- jg .L6_00_01 // next 6 lines of N /************************************************************************************************/ /************************************************************************************************/ .L2_00_0: movq Nmod6, J sarq $1, J // j = j / 2 cmpq $ 0, J je .L1_2_0 ALIGN_4 .L2_00_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L2_00_02b: vmovups (BO1), %xmm0 vmovups 2 * SIZE(BO1), %xmm1 vmovups %xmm0, (BO) vmovups %xmm1, 2 * SIZE(BO) addq $ 4*SIZE,BO1 addq $ 4*SIZE,BO decq %rax jnz .L2_00_02b .L2_00_02c: movq BO1, B // next offset of B .L2_00_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $ 8 * SIZE, AO movq M, I sarq $ 2, I // i = (m >> 2) je .L2_2_10 ALIGN_4 /******************************************************************************************************************/ .L2_4_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall 
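/* Immediately below, the inner-product length for this 4x2 tile is chosen: a
   plain GEMM build uses all of K, while a TRMM build trims it to the part of
   K that overlaps the triangular operand, tracked by the running offset KK
   and stored in KKK.  A hedged C sketch of that selection follows;
   effective_k() is a hypothetical helper written only to illustrate the
   #if/#elif branches, with mr = 4 and nr = 2 for this tile size. */
#if 0
static long effective_k(long k, long kk, int trmm, int left, int transa,
                        long mr, long nr)
{
    if (!trmm)
        return k;                        /* GEMM: movq K, %rax               */
    if ((left && !transa) || (!left && transa))
        return k - kk;                   /* subq KK, %rax                    */
    return kk + (left ? mr : nr);        /* addq $ 4 (LEFT) or $ 2 to KK     */
}
#endif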
#ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 4, %rax // number of values in AO #else addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L2_4_16 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_12: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI ,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI ,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI ,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI ,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB je .L2_4_16 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI ,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI ,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI ,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI ,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB je .L2_4_16 jmp .L2_4_12 ALIGN_4 .L2_4_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L2_4_19 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_17: KERNEL4x2_SUB jl .L2_4_17 ALIGN_4 .L2_4_19: SAVE4x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 4, KK #endif addq $ 8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L2_4_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ /******************************************************************************************************************/ .L2_2_10: testq $ 2, M jz .L2_2_40 // to next 2 lines of N .L2_2_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 2, %rax // number of values in AO #else addq $ 2, %rax // number of values in BO 
#endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L2_2_16 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_2_12: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB je .L2_2_16 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB je .L2_2_16 jmp .L2_2_12 ALIGN_4 .L2_2_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L2_2_19 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_2_17: KERNEL2x2_SUB jl .L2_2_17 ALIGN_4 .L2_2_19: SAVE2x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 2, KK #endif addq $ 4 * SIZE, CO1 # coffset += 4 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_2_40: testq $ 1, M jz .L2_2_60 // to next 2 lines of N ALIGN_4 .L2_2_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 1, %rax // number of values in AO #else addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L2_2_46 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_2_42: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB je .L2_2_46 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) 
KERNEL1x2_SUB KERNEL1x2_SUB prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB je .L2_2_46 jmp .L2_2_42 ALIGN_4 .L2_2_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L2_2_49 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_2_47: KERNEL1x2_SUB jl .L2_2_47 ALIGN_4 .L2_2_49: SAVE1x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 1, KK #endif addq $ 2 * SIZE, CO1 # coffset += 2 decq I # i -- jg .L2_2_41 ALIGN_4 .L2_2_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $ 2, KK #endif decq J // j -- jg .L2_00_01 // next 2 lines of N .L1_2_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $ 1, J // j % 2 je .L999 ALIGN_4 .L1_00_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_00_02b: vmovups (BO1), %xmm0 vmovups %xmm0, (BO) addq $ 2*SIZE,BO1 addq $ 2*SIZE,BO decq %rax jnz .L1_00_02b .L1_00_02c: movq BO1, B // next offset of B .L1_00_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $ 8 * SIZE, AO movq M, I sarq $ 2, I // i = (m >> 2) je .L1_2_10 ALIGN_4 /*******************************************************************************************************/ .L1_4_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 4, %rax // number of values in AO #else addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L1_4_16 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_12: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB je .L1_4_16 KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB je .L1_4_16 jmp .L1_4_12 ALIGN_4 .L1_4_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, 
%rax # if (k & 1) je .L1_4_19 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_17: KERNEL4x1_SUB jl .L1_4_17 ALIGN_4 .L1_4_19: SAVE4x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 4, KK #endif addq $ 8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L1_4_11 ALIGN_4 /*******************************************************************************************************/ .L1_2_10: testq $ 2, M jz .L1_2_40 .L1_2_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 2, %rax // number of values in AO #else addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L1_2_16 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_2_12: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB je .L1_2_16 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB je .L1_2_16 jmp .L1_2_12 ALIGN_4 .L1_2_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L1_2_19 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_2_17: KERNEL2x1_SUB jl .L1_2_17 ALIGN_4 .L1_2_19: SAVE2x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 2, KK #endif addq $ 4 * SIZE, CO1 # coffset += 4 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_2_40: testq $ 1, M jz 
.L999 ALIGN_4 .L1_2_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 1, %rax // number of values in AO #else addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L1_2_46 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_2_42: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB je .L1_2_46 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB je .L1_2_46 jmp .L1_2_42 ALIGN_4 .L1_2_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L1_2_49 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_2_47: KERNEL1x1_SUB jl .L1_2_47 ALIGN_4 .L1_2_49: SAVE1x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 1, KK #endif addq $ 2 * SIZE, CO1 # coffset += 2 decq I # i -- jg .L1_2_41 ALIGN_4 .L999: vzeroupper movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi vmovups 64(%rsp), %xmm6 vmovups 80(%rsp), %xmm7 vmovups 96(%rsp), %xmm8 vmovups 112(%rsp), %xmm9 vmovups 128(%rsp), %xmm10 vmovups 144(%rsp), %xmm11 vmovups 160(%rsp), %xmm12 vmovups 176(%rsp), %xmm13 vmovups 192(%rsp), %xmm14 vmovups 208(%rsp), %xmm15 #endif addq $ STACKSIZE, %rsp ret EPILOGUE #else /************************************************************************************************ TRMM Kernel ************************************************************************************************/ PROLOGUE PROFCODE subq $ STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) vmovups %xmm6, 64(%rsp) vmovups %xmm7, 80(%rsp) vmovups %xmm8, 96(%rsp) vmovups %xmm9, 112(%rsp) vmovups %xmm10, 128(%rsp) vmovups %xmm11, 144(%rsp) vmovups 
%xmm12, 160(%rsp) vmovups %xmm13, 176(%rsp) vmovups %xmm14, 192(%rsp) vmovups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif vmovaps %xmm3, %xmm0 vmovsd OLD_ALPHA_I, %xmm1 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL movsd STACKSIZE + 16(%rsp), %xmm12 #endif #endif movq %rsp, SP # save old stack subq $ 128 + L_BUFFER_SIZE, %rsp andq $ -4096, %rsp # align stack STACK_TOUCH cmpq $ 0, OLD_M je .L999 cmpq $ 0, OLD_N je .L999 cmpq $ 0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovsd %xmm0, ALPHA_R vmovsd %xmm1, ALPHA_I salq $ ZBASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $ 2, %rdi divq %rdi // N / 2 movq %rax, Ndiv6 // N / 2 movq %rdx, Nmod6 // N % 2 #ifdef TRMMKERNEL vmovsd %xmm12, OFFSET vmovsd %xmm12, KK #ifndef LEFT negq KK #endif #endif .L2_00_0: movq Ndiv6, J cmpq $ 0, J je .L1_2_0 ALIGN_4 .L2_00_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L2_00_02b: vmovups (BO1), %xmm0 vmovups 2 * SIZE(BO1), %xmm1 vmovups %xmm0, (BO) vmovups %xmm1, 2 * SIZE(BO) addq $ 4*SIZE,BO1 addq $ 4*SIZE,BO decq %rax jnz .L2_00_02b .L2_00_02c: movq BO1, B // next offset of B .L2_00_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $ 8 * SIZE, AO movq M, I sarq $ 2, I // i = (m >> 2) je .L2_2_10 ALIGN_4 /******************************************************************************************************************/ .L2_4_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 4, %rax // number of values in AO #else addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L2_4_16 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_12: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI ,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI ,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI ,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI ,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB je .L2_4_16 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI ,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI ,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI ,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) 
KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI ,SIZE) KERNEL4x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL4x2_SUB je .L2_4_16 jmp .L2_4_12 ALIGN_4 .L2_4_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L2_4_19 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_4_17: KERNEL4x2_SUB jl .L2_4_17 ALIGN_4 .L2_4_19: SAVE4x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 4, KK #endif addq $ 8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L2_4_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ /******************************************************************************************************************/ .L2_2_10: testq $ 2, M jz .L2_2_40 // to next 2 lines of N .L2_2_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 2, %rax // number of values in AO #else addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L2_2_16 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_2_12: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB je .L2_2_16 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_SUB KERNEL2x2_SUB je .L2_2_16 jmp .L2_2_12 ALIGN_4 .L2_2_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L2_2_19 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_2_17: KERNEL2x2_SUB jl .L2_2_17 ALIGN_4 
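/* .L2_2_19 below expands SAVE2x2: the kernel has accumulated the products
   against the real parts of B in one set of registers and the products
   against the imaginary parts in another; the save step merges them with
   vshufpd/vaddsubpd (the sign pattern depends on the NN/NT/.../CC conjugation
   macros), scales by alpha = ALPHA_R + i*ALPHA_I with the same shuffle and
   add/sub trick, and stores to C, adding the existing C values only when
   TRMMKERNEL is not defined.  A per-element C sketch of the non-conjugated
   case follows; zsave_one() is an illustrative name, not an OpenBLAS
   routine. */
#if 0
static void zsave_one(double c[2], double alpha_r, double alpha_i,
                      double acc_r, double acc_i, int trmm)
{
    /* alpha * acc = (ar*tr - ai*ti) + i*(ar*ti + ai*tr) */
    double rr = alpha_r * acc_r - alpha_i * acc_i;
    double ri = alpha_r * acc_i + alpha_i * acc_r;
    if (!trmm) {                 /* #ifndef TRMMKERNEL: C += alpha * acc */
        c[0] += rr;
        c[1] += ri;
    } else {                     /* TRMM build overwrites C              */
        c[0]  = rr;
        c[1]  = ri;
    }
}
#endif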
.L2_2_19: SAVE2x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 2, KK #endif addq $ 4 * SIZE, CO1 # coffset += 4 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_2_40: testq $ 1, M jz .L2_2_60 // to next 2 lines of N ALIGN_4 .L2_2_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 1, %rax // number of values in AO #else addq $ 2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L2_2_46 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_2_42: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB je .L2_2_46 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_SUB KERNEL1x2_SUB je .L2_2_46 jmp .L2_2_42 ALIGN_4 .L2_2_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L2_2_49 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_2_47: KERNEL1x2_SUB jl .L2_2_47 ALIGN_4 .L2_2_49: SAVE1x2 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 1, KK #endif addq $ 2 * SIZE, CO1 # coffset += 2 decq I # i -- jg .L2_2_41 ALIGN_4 .L2_2_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $ 2, KK #endif decq J // j -- jg .L2_00_01 // next 2 lines of N .L1_2_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 
*************************************************************************************************/ movq Nmod6, J andq $ 1, J // j % 2 je .L999 ALIGN_4 .L1_00_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_00_02b: vmovups (BO1), %xmm0 vmovups %xmm0, (BO) addq $ 2*SIZE,BO1 addq $ 2*SIZE,BO decq %rax jnz .L1_00_02b .L1_00_02c: movq BO1, B // next offset of B .L1_00_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $ 8 * SIZE, AO movq M, I sarq $ 2, I // i = (m >> 2) je .L1_2_10 ALIGN_4 /*******************************************************************************************************/ .L1_4_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 4, %rax // number of values in AO #else addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L1_4_16 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_12: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB je .L1_4_16 KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB je .L1_4_16 jmp .L1_4_12 ALIGN_4 .L1_4_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L1_4_19 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_4_17: KERNEL4x1_SUB jl .L1_4_17 ALIGN_4 .L1_4_19: SAVE4x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 4, KK #endif addq $ 8 * SIZE, CO1 # coffset += 8 decq I # i -- jg .L1_4_11 ALIGN_4 /*******************************************************************************************************/ .L1_2_10: testq $ 2, M jz .L1_2_40 .L1_2_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 2, %rax // rax = rax * 4 ; number of 
values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 2, %rax // number of values in AO #else addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L1_2_16 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_2_12: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB je .L1_2_16 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) KERNEL2x1_SUB KERNEL2x1_SUB je .L1_2_16 jmp .L1_2_12 ALIGN_4 .L1_2_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L1_2_19 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_2_17: KERNEL2x1_SUB jl .L1_2_17 ALIGN_4 .L1_2_19: SAVE2x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 2, KK #endif addq $ 4 * SIZE, CO1 # coffset += 4 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_2_40: testq $ 1, M jz .L999 ALIGN_4 .L1_2_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $ 4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $ 1, %rax // number of values in AO #else addq $ 1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $ -8, %rax // K = K - ( K % 8 ) je .L1_2_46 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_2_42: prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB je .L1_2_46 prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 
B_PR1(BO,BI,SIZE) KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB prefetcht0 A_PR1(AO,%rax,SIZE) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB je .L1_2_46 jmp .L1_2_42 ALIGN_4 .L1_2_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $ 7, %rax # if (k & 1) je .L1_2_49 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_2_47: KERNEL1x1_SUB jl .L1_2_47 ALIGN_4 .L1_2_49: SAVE1x1 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $ 1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $ 1, KK #endif addq $ 2 * SIZE, CO1 # coffset += 2 decq I # i -- jg .L1_2_41 ALIGN_4 .L999: vzeroupper movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi vmovups 64(%rsp), %xmm6 vmovups 80(%rsp), %xmm7 vmovups 96(%rsp), %xmm8 vmovups 112(%rsp), %xmm9 vmovups 128(%rsp), %xmm10 vmovups 144(%rsp), %xmm11 vmovups 160(%rsp), %xmm12 vmovups 176(%rsp), %xmm13 vmovups 192(%rsp), %xmm14 vmovups 208(%rsp), %xmm15 #endif addq $ STACKSIZE, %rsp ret EPILOGUE #endif OpenBLAS-0.2.20/kernel/x86_64/zgemm_kernel_4x2_penryn.S000066400000000000000000001036111313527062700222570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define BB %r12 #define PREA %rdx #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define ALPHA_R 48(%rsp) #define ALPHA_I 56(%rsp) #define J 64(%rsp) #define OFFSET 72(%rsp) #define KK 80(%rsp) #define KKK 88(%rsp) #else #define STACKSIZE 512 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define ALPHA_R 224(%rsp) #define ALPHA_I 232(%rsp) #define J 240(%rsp) #define OFFSET 248(%rsp) #define KK 256(%rsp) #define KKK 264(%rsp) #endif #define PREFETCHSIZE (8 * 17 + 4) #define PREFETCH prefetcht0 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADD1 addps #define ADD2 addps #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define ADD1 addps #define ADD2 addps #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADD1 addps #define ADD2 addps #else #define ADD1 addps #define ADD2 subps #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif movaps %xmm3, %xmm0 movss OLD_ALPHA_I, %xmm1 #else movq OLD_LDC, LDC #ifdef TRMMKERNEL movq OLD_OFFSET, %r11 #endif #endif unpcklps %xmm0, %xmm0 unpcklps %xmm1, %xmm1 movlps %xmm0, ALPHA_R movlps %xmm1, ALPHA_I subq $-32 * SIZE, A subq $-32 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K salq $ZBASE_SHIFT, LDC #ifdef TRMMKERNEL movq %r11, OFFSET #ifndef LEFT negq %r11 #endif movq %r11, KK #endif movq N, J sarq $1, J NOBRANCH jle .L40 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 leaq (C, LDC, 1), CO2 movq A, AO movq K, %rax salq $ZBASE_SHIFT + 1, %rax leaq (B, %rax), BB movq M, I sarq $2, I NOBRANCH jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm0 pxor %xmm3, %xmm3 movaps -28 * SIZE(AO), %xmm1 pxor %xmm4, %xmm4 movaps -32 * SIZE(BO), %xmm2 pxor %xmm5, %xmm5 prefetcht0 -32 * SIZE(BB) pxor %xmm6, %xmm6 prefetcht2 7 * SIZE(CO1) movapd %xmm4, %xmm8 movapd %xmm4, %xmm9 movapd %xmm4, %xmm10 movapd %xmm4, %xmm11 prefetcht2 7 * SIZE(CO2) movapd %xmm4, %xmm12 movapd %xmm4, %xmm13 movapd %xmm4, %xmm14 movapd %xmm4, %xmm15 subq $-24 * SIZE, BB leaq (PREFETCHSIZE + 0) * SIZE(AO), PREA #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) 
&& !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH -32 * SIZE(PREA) ADD1 %xmm6, %xmm10 ADD1 %xmm3, %xmm14 movaps %xmm2, %xmm3 pshufd $0xb1, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 ADD2 %xmm4, %xmm11 ADD2 %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x1b, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 ADD1 %xmm2, %xmm8 movaps -28 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0xb1, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 ADD2 %xmm7, %xmm9 ADD2 %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -20 * SIZE(AO), %xmm1 ADD1 %xmm6, %xmm10 ADD1 %xmm3, %xmm14 movaps %xmm2, %xmm3 pshufd $0xb1, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 ADD2 %xmm4, %xmm11 ADD2 %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x1b, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 ADD1 %xmm2, %xmm8 movaps -24 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0xb1, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 ADD2 %xmm7, %xmm9 ADD2 %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -12 * SIZE(AO), %xmm1 ADD1 %xmm6, %xmm10 ADD1 %xmm3, %xmm14 PREFETCH -16 * SIZE(PREA) movaps %xmm2, %xmm3 pshufd $0xb1, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 ADD2 %xmm4, %xmm11 ADD2 %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x1b, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 ADD1 %xmm2, %xmm8 movaps -20 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0xb1, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 ADD2 %xmm7, %xmm9 ADD2 %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -8 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -4 * SIZE(AO), %xmm1 ADD1 %xmm6, %xmm10 ADD1 %xmm3, %xmm14 movaps %xmm2, %xmm3 pshufd $0xb1, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 ADD2 %xmm4, %xmm11 ADD2 %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x1b, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 ADD1 %xmm2, %xmm8 movaps -16 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0xb1, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 ADD2 %xmm7, %xmm9 ADD2 %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps 0 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps 4 * SIZE(AO), %xmm1 ADD1 %xmm6, %xmm10 ADD1 %xmm3, %xmm14 PREFETCH 0 * SIZE(PREA) movaps %xmm2, %xmm3 pshufd $0xb1, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 ADD2 %xmm4, %xmm11 ADD2 %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x1b, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 ADD1 %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0xb1, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 ADD2 %xmm7, %xmm9 ADD2 %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps 8 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps 12 * SIZE(AO), %xmm1 ADD1 %xmm6, %xmm10 ADD1 %xmm3, %xmm14 movaps %xmm2, %xmm3 pshufd $0xb1, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 ADD2 %xmm4, %xmm11 ADD2 %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x1b, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 ADD1 %xmm2, %xmm8 movaps -8 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0xb1, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 ADD2 %xmm7, %xmm9 ADD2 %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps 16 * SIZE(AO), 
%xmm0 mulps %xmm1, %xmm5 movaps 20 * SIZE(AO), %xmm1 ADD1 %xmm6, %xmm10 ADD1 %xmm3, %xmm14 PREFETCH 16 * SIZE(PREA) movaps %xmm2, %xmm3 pshufd $0xb1, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 ADD2 %xmm4, %xmm11 ADD2 %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x1b, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 ADD1 %xmm2, %xmm8 movaps -4 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0xb1, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 ADD2 %xmm7, %xmm9 ADD2 %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps 24 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps 28 * SIZE(AO), %xmm1 ADD1 %xmm6, %xmm10 ADD1 %xmm3, %xmm14 movaps %xmm2, %xmm3 pshufd $0xb1, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 ADD2 %xmm4, %xmm11 ADD2 %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x1b, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 ADD1 %xmm2, %xmm8 subq $-64 * SIZE, AO movaps 0 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm12 movaps %xmm6, %xmm3 subq $-32 * SIZE, BO pshufd $0xb1, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 ADD2 %xmm7, %xmm9 ADD2 %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -32 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -28 * SIZE(AO), %xmm1 subq $-64 * SIZE, PREA subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: prefetcht0 -16 * SIZE(BB) #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: ADD1 %xmm6, %xmm10 ADD1 %xmm3, %xmm14 movaps %xmm2, %xmm3 pshufd $0xb1, %xmm2, %xmm7 mulps %xmm0, %xmm2 mulps %xmm1, %xmm3 ADD2 %xmm4, %xmm11 ADD2 %xmm5, %xmm15 movaps %xmm7, %xmm5 pshufd $0x1b, %xmm7, %xmm6 mulps %xmm0, %xmm7 mulps %xmm1, %xmm5 ADD1 %xmm2, %xmm8 movaps -28 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm12 movaps %xmm6, %xmm3 pshufd $0xb1, %xmm6, %xmm4 mulps %xmm0, %xmm6 mulps %xmm1, %xmm3 ADD2 %xmm7, %xmm9 ADD2 %xmm5, %xmm13 movaps %xmm4, %xmm5 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 mulps %xmm1, %xmm5 movaps -20 * SIZE(AO), %xmm1 addq $8 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: ADD1 %xmm6, %xmm10 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm11 ADD2 %xmm5, %xmm15 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 movddup ALPHA_R, %xmm2 movddup ALPHA_I, %xmm3 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) pxor %xmm0, %xmm8 pxor %xmm0, %xmm10 pxor %xmm0, %xmm12 pxor %xmm0, %xmm14 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) pshufd $0xb1, %xmm0, %xmm0 pxor %xmm0, %xmm9 pxor %xmm0, %xmm11 pxor %xmm0, %xmm13 pxor %xmm0, %xmm15 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) pxor %xmm0, %xmm9 pxor %xmm0, %xmm11 pxor %xmm0, %xmm13 pxor %xmm0, %xmm15 #endif haddps %xmm9, %xmm8 haddps %xmm11, %xmm10 haddps %xmm13, %xmm12 haddps %xmm15, %xmm14 shufps $0xd8, %xmm8, %xmm8 shufps $0xd8, %xmm10, %xmm10 shufps $0xd8, %xmm12, %xmm12 shufps $0xd8, %xmm14, %xmm14 movaps %xmm8, %xmm9 shufps $0xe4, %xmm10, %xmm8 shufps $0xe4, %xmm9, %xmm10 movaps %xmm12, %xmm13 shufps $0xe4, %xmm14, %xmm12 shufps $0xe4, %xmm13, %xmm14 pshufd $0xb1, %xmm8, %xmm9 pshufd $0xb1, %xmm10, %xmm11 pshufd $0xb1, %xmm12, %xmm13 pshufd $0xb1, %xmm14, %xmm15 mulps %xmm2, %xmm8 mulps %xmm3, %xmm9 mulps %xmm2, %xmm12 mulps %xmm3, %xmm13 mulps %xmm2, %xmm10 mulps %xmm3, %xmm11 mulps %xmm2, %xmm14 mulps %xmm3, %xmm15 addsubps %xmm9, %xmm8 addsubps %xmm11, %xmm10 addsubps %xmm13, %xmm12 addsubps %xmm15, %xmm14 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 
* SIZE(CO1), %xmm1 movsd 0 * SIZE(CO2), %xmm2 movhps 2 * SIZE(CO2), %xmm2 movsd 4 * SIZE(CO2), %xmm3 movhps 6 * SIZE(CO2), %xmm3 addps %xmm0, %xmm8 addps %xmm1, %xmm12 addps %xmm2, %xmm10 addps %xmm3, %xmm14 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movsd %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) movsd %xmm14, 4 * SIZE(CO2) movhps %xmm14, 6 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- BRANCH jg .L11 ALIGN_4 .L20: testq $2, M BRANCH jle .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif movaps -32 * SIZE(AO), %xmm0 pxor %xmm4, %xmm4 pxor %xmm6, %xmm6 movaps -32 * SIZE(BO), %xmm2 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_3 .L22: ADD1 %xmm6, %xmm10 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0xb1, %xmm2, %xmm7 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0x1b, %xmm7, %xmm6 mulps %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -28 * SIZE(BO), %xmm2 pshufd $0xb1, %xmm6, %xmm4 mulps %xmm0, %xmm6 ADD2 %xmm7, %xmm9 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 ADD1 %xmm6, %xmm10 pshufd $0xb1, %xmm2, %xmm7 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0x1b, %xmm7, %xmm6 mulps %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -24 * SIZE(BO), %xmm2 pshufd $0xb1, %xmm6, %xmm4 mulps %xmm0, %xmm6 ADD2 %xmm7, %xmm9 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 ADD1 %xmm6, %xmm10 pshufd $0xb1, %xmm2, %xmm7 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0x1b, %xmm7, %xmm6 mulps %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -20 * SIZE(BO), %xmm2 pshufd $0xb1, %xmm6, %xmm4 mulps %xmm0, %xmm6 ADD2 %xmm7, %xmm9 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm0 ADD1 %xmm6, %xmm10 pshufd $0xb1, %xmm2, %xmm7 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0x1b, %xmm7, %xmm6 mulps %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -16 * SIZE(BO), %xmm2 pshufd $0xb1, %xmm6, %xmm4 mulps %xmm0, %xmm6 subq $-16 * SIZE, AO ADD2 %xmm7, %xmm9 mulps %xmm0, %xmm4 movaps -32 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L22 ALIGN_3 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: ADD1 %xmm6, %xmm10 pshufd $0xb1, %xmm2, %xmm7 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0x1b, %xmm7, %xmm6 mulps %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -28 * SIZE(BO), %xmm2 pshufd $0xb1, %xmm6, %xmm4 mulps %xmm0, %xmm6 ADD2 %xmm7, %xmm9 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_3 .L28: ADD1 %xmm6, %xmm10 ADD2 %xmm4, %xmm11 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 movddup ALPHA_R, %xmm2 movddup ALPHA_I, %xmm3 #if defined(NN) || defined(NT) || defined(TN) || 
defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) pxor %xmm0, %xmm8 pxor %xmm0, %xmm10 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) pshufd $0xb1, %xmm0, %xmm0 pxor %xmm0, %xmm9 pxor %xmm0, %xmm11 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) pxor %xmm0, %xmm9 pxor %xmm0, %xmm11 #endif haddps %xmm9, %xmm8 haddps %xmm11, %xmm10 shufps $0xd8, %xmm8, %xmm8 shufps $0xd8, %xmm10, %xmm10 movaps %xmm8, %xmm9 shufps $0xe4, %xmm10, %xmm8 shufps $0xe4, %xmm9, %xmm10 pshufd $0xb1, %xmm8, %xmm9 pshufd $0xb1, %xmm10, %xmm11 mulps %xmm2, %xmm8 mulps %xmm3, %xmm9 mulps %xmm2, %xmm10 mulps %xmm3, %xmm11 addsubps %xmm9, %xmm8 addsubps %xmm11, %xmm10 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm2 movhps 2 * SIZE(CO2), %xmm2 addps %xmm0, %xmm8 addps %xmm2, %xmm10 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 ALIGN_4 .L30: testq $1, M BRANCH jle .L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif movsd -32 * SIZE(AO), %xmm0 pxor %xmm4, %xmm4 pxor %xmm6, %xmm6 movaps -32 * SIZE(BO), %xmm2 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_3 .L32: ADD1 %xmm6, %xmm10 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0xb1, %xmm2, %xmm7 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0x1b, %xmm7, %xmm6 mulps %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -28 * SIZE(BO), %xmm2 pshufd $0xb1, %xmm6, %xmm4 mulps %xmm0, %xmm6 ADD2 %xmm7, %xmm9 mulps %xmm0, %xmm4 movsd -30 * SIZE(AO), %xmm0 ADD1 %xmm6, %xmm10 pshufd $0xb1, %xmm2, %xmm7 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0x1b, %xmm7, %xmm6 mulps %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -24 * SIZE(BO), %xmm2 pshufd $0xb1, %xmm6, %xmm4 mulps %xmm0, %xmm6 ADD2 %xmm7, %xmm9 mulps %xmm0, %xmm4 movsd -28 * SIZE(AO), %xmm0 ADD1 %xmm6, %xmm10 pshufd $0xb1, %xmm2, %xmm7 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0x1b, %xmm7, %xmm6 mulps %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -20 * SIZE(BO), %xmm2 pshufd $0xb1, %xmm6, %xmm4 mulps %xmm0, %xmm6 ADD2 %xmm7, %xmm9 mulps %xmm0, %xmm4 movsd -26 * SIZE(AO), %xmm0 ADD1 %xmm6, %xmm10 pshufd $0xb1, %xmm2, %xmm7 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0x1b, %xmm7, %xmm6 mulps %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -16 * SIZE(BO), %xmm2 pshufd $0xb1, %xmm6, %xmm4 mulps %xmm0, %xmm6 subq $-8 * SIZE, AO ADD2 %xmm7, %xmm9 mulps %xmm0, %xmm4 movsd -32 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L32 ALIGN_3 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: ADD1 %xmm6, %xmm10 pshufd $0xb1, %xmm2, %xmm7 mulps %xmm0, %xmm2 
ADD2 %xmm4, %xmm11 pshufd $0x1b, %xmm7, %xmm6 mulps %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -28 * SIZE(BO), %xmm2 pshufd $0xb1, %xmm6, %xmm4 mulps %xmm0, %xmm6 ADD2 %xmm7, %xmm9 mulps %xmm0, %xmm4 movsd -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_3 .L38: ADD1 %xmm6, %xmm10 ADD2 %xmm4, %xmm11 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 movddup ALPHA_R, %xmm2 movddup ALPHA_I, %xmm3 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) pxor %xmm0, %xmm8 pxor %xmm0, %xmm10 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) pshufd $0xb1, %xmm0, %xmm0 pxor %xmm0, %xmm9 pxor %xmm0, %xmm11 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) pxor %xmm0, %xmm9 pxor %xmm0, %xmm11 #endif haddps %xmm9, %xmm8 haddps %xmm11, %xmm10 shufps $0xd8, %xmm8, %xmm8 shufps $0xd8, %xmm10, %xmm10 movaps %xmm8, %xmm9 shufps $0xe4, %xmm10, %xmm8 shufps $0xe4, %xmm9, %xmm10 pshufd $0xb1, %xmm8, %xmm9 pshufd $0xb1, %xmm10, %xmm11 mulps %xmm2, %xmm8 mulps %xmm3, %xmm9 mulps %xmm2, %xmm10 mulps %xmm3, %xmm11 addsubps %xmm9, %xmm8 addsubps %xmm11, %xmm10 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm2 addps %xmm0, %xmm8 addps %xmm2, %xmm10 #endif movsd %xmm8, 0 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif leaq (C, LDC, 2), C movq BO, B subq $1, J BRANCH jg .L01 ALIGN_4 .L40: testq $1, N BRANCH jle .L999 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq C, CO1 movq A, AO movq M, I sarq $2, I NOBRANCH jle .L50 ALIGN_4 .L41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif movaps -32 * SIZE(AO), %xmm0 pxor %xmm3, %xmm3 movaps -28 * SIZE(AO), %xmm1 pxor %xmm4, %xmm4 movaps -32 * SIZE(BO), %xmm2 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 prefetcht0 7 * SIZE(CO1) pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L45 ALIGN_3 .L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm12 pshufd $0x00, %xmm2, %xmm4 mulps %xmm1, %xmm4 addps %xmm5, %xmm9 pshufd $0x55, %xmm2, %xmm5 mulps %xmm0, %xmm5 movaps -24 * SIZE(AO), %xmm0 addps %xmm6, %xmm13 pshufd $0x55, %xmm2, %xmm6 mulps %xmm1, %xmm6 movaps -20 * SIZE(AO), %xmm1 addps %xmm3, %xmm8 pshufd $0xaa, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm12 pshufd $0xaa, %xmm2, %xmm4 mulps %xmm1, %xmm4 addps %xmm5, %xmm9 pshufd $0xff, %xmm2, %xmm5 mulps %xmm0, %xmm5 movaps -16 * SIZE(AO), %xmm0 addps %xmm6, %xmm13 pshufd $0xff, %xmm2, %xmm6 movaps -28 * SIZE(BO), %xmm2 mulps %xmm1, %xmm6 movaps -12 * SIZE(AO), %xmm1 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addps 
%xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm12 pshufd $0x00, %xmm2, %xmm4 mulps %xmm1, %xmm4 addps %xmm5, %xmm9 pshufd $0x55, %xmm2, %xmm5 mulps %xmm0, %xmm5 movaps -8 * SIZE(AO), %xmm0 addps %xmm6, %xmm13 pshufd $0x55, %xmm2, %xmm6 mulps %xmm1, %xmm6 movaps -4 * SIZE(AO), %xmm1 addps %xmm3, %xmm8 pshufd $0xaa, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm12 pshufd $0xaa, %xmm2, %xmm4 mulps %xmm1, %xmm4 addps %xmm5, %xmm9 pshufd $0xff, %xmm2, %xmm5 mulps %xmm0, %xmm5 movaps 0 * SIZE(AO), %xmm0 addps %xmm6, %xmm13 pshufd $0xff, %xmm2, %xmm6 movaps -24 * SIZE(BO), %xmm2 mulps %xmm1, %xmm6 movaps 4 * SIZE(AO), %xmm1 subq $-32 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax BRANCH jg .L42 ALIGN_3 .L45: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm12 pshufd $0x00, %xmm2, %xmm4 mulps %xmm1, %xmm4 addps %xmm5, %xmm9 pshufd $0x55, %xmm2, %xmm5 mulps %xmm0, %xmm5 movaps -24 * SIZE(AO), %xmm0 addps %xmm6, %xmm13 pshufd $0x55, %xmm2, %xmm6 movsd -30 * SIZE(BO), %xmm2 mulps %xmm1, %xmm6 movaps -20 * SIZE(AO), %xmm1 addq $8 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L46 ALIGN_3 .L48: addps %xmm3, %xmm8 addps %xmm4, %xmm12 addps %xmm5, %xmm9 addps %xmm6, %xmm13 pshufd $0xb1, %xmm9, %xmm9 movddup ALPHA_R, %xmm2 pshufd $0xb1, %xmm13, %xmm13 movddup ALPHA_I, %xmm3 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) pxor %xmm0, %xmm9 pxor %xmm0, %xmm13 subps %xmm9, %xmm8 subps %xmm13, %xmm12 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) pxor %xmm0, %xmm9 pxor %xmm0, %xmm13 addps %xmm9, %xmm8 addps %xmm13, %xmm12 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) pxor %xmm0, %xmm8 pxor %xmm0, %xmm12 addps %xmm9, %xmm8 addps %xmm13, %xmm12 #else pxor %xmm0, %xmm8 pxor %xmm0, %xmm12 subps %xmm9, %xmm8 subps %xmm13, %xmm12 #endif pshufd $0xb1, %xmm8, %xmm9 pshufd $0xb1, %xmm12, %xmm13 mulps %xmm2, %xmm8 mulps %xmm3, %xmm9 mulps %xmm2, %xmm12 mulps %xmm3, %xmm13 addsubps %xmm9, %xmm8 addsubps %xmm13, %xmm12 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm1 movhps 6 * SIZE(CO1), %xmm1 addps %xmm0, %xmm8 addps %xmm1, %xmm12 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movsd %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- BRANCH jg .L41 ALIGN_4 .L50: testq $2, M BRANCH jle .L60 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif movaps -32 * SIZE(AO), %xmm0 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 movaps -32 * SIZE(BO), %xmm2 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH 
jle .L55 ALIGN_3 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0x55, %xmm2, %xmm4 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 pshufd $0xaa, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0xff, %xmm2, %xmm4 movaps -28 * SIZE(BO), %xmm2 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0x55, %xmm2, %xmm4 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 pshufd $0xaa, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0xff, %xmm2, %xmm4 movaps -24 * SIZE(BO), %xmm2 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax BRANCH jg .L52 ALIGN_3 .L55: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_3 .L56: addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0x55, %xmm2, %xmm4 movsd -30 * SIZE(BO), %xmm2 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_3 .L58: addps %xmm3, %xmm8 movddup ALPHA_R, %xmm2 addps %xmm4, %xmm9 movddup ALPHA_I, %xmm3 pshufd $0xb1, %xmm9, %xmm9 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) pxor %xmm0, %xmm9 subps %xmm9, %xmm8 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) pxor %xmm0, %xmm9 addps %xmm9, %xmm8 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) pxor %xmm0, %xmm8 addps %xmm9, %xmm8 #else pxor %xmm0, %xmm8 subps %xmm9, %xmm8 #endif pshufd $0xb1, %xmm8, %xmm9 mulps %xmm2, %xmm8 mulps %xmm3, %xmm9 addsubps %xmm9, %xmm8 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 addps %xmm0, %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 ALIGN_4 .L60: testq $1, M BRANCH jle .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq B, BO #else movq B, BO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif movsd -32 * SIZE(AO), %xmm0 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 movaps -32 * SIZE(BO), %xmm2 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_3 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0x55, %xmm2, %xmm4 mulps %xmm0, %xmm4 movsd -30 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 pshufd $0xaa, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0xff, %xmm2, %xmm4 movaps -28 * SIZE(BO), %xmm2 mulps %xmm0, %xmm4 movsd -28 * SIZE(AO), %xmm0 addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0x55, %xmm2, %xmm4 mulps %xmm0, %xmm4 movsd -26 * SIZE(AO), 
%xmm0 addps %xmm3, %xmm8 pshufd $0xaa, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0xff, %xmm2, %xmm4 movaps -24 * SIZE(BO), %xmm2 mulps %xmm0, %xmm4 movsd -24 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L62 ALIGN_3 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_3 .L66: addps %xmm3, %xmm8 pshufd $0x00, %xmm2, %xmm3 mulps %xmm0, %xmm3 addps %xmm4, %xmm9 pshufd $0x55, %xmm2, %xmm4 movsd -30 * SIZE(BO), %xmm2 mulps %xmm0, %xmm4 movsd -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_3 .L68: addps %xmm3, %xmm8 movddup ALPHA_R, %xmm2 addps %xmm4, %xmm9 movddup ALPHA_I, %xmm3 pshufd $0xb1, %xmm9, %xmm9 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) pxor %xmm0, %xmm9 subps %xmm9, %xmm8 #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) pxor %xmm0, %xmm9 addps %xmm9, %xmm8 #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) pxor %xmm0, %xmm8 addps %xmm9, %xmm8 #else pxor %xmm0, %xmm8 subps %xmm9, %xmm8 #endif pshufd $0xb1, %xmm8, %xmm9 mulps %xmm2, %xmm8 mulps %xmm3, %xmm9 addsubps %xmm9, %xmm8 #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 addps %xmm0, %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm_kernel_4x2_sse.S000066400000000000000000001374651313527062700215540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define POSINV 0(%rsp) #define ALPHA_R 16(%rsp) #define ALPHA_I 32(%rsp) #define J 48(%rsp) #define OFFSET 56(%rsp) #define KK 64(%rsp) #define KKK 72(%rsp) #define BUFFER 256(%rsp) #ifdef OPTERON #define movsd movlps #endif #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 5 + 8) #endif #if defined(PENTIUM4) || defined(GENERIC) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE 160 #endif #define RPREFETCHSIZE (8 * 7 + 4) #define WPREFETCHSIZE (8 * 8 + 4) #ifndef GENERIC #define KERNEL1(xx) \ mulps %xmm0, %xmm1 ;\ addps %xmm1, %xmm8 ;\ movaps -32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm0, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps -28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm0, %xmm5 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ addps %xmm5, %xmm10 ;\ movaps -24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addps %xmm0, %xmm11 ;\ movaps -16 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 #define KERNEL2(xx) \ mulps %xmm2, %xmm1 ;\ addps %xmm1, %xmm12 ;\ movaps 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm2, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm2, %xmm5 ;\ mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ addps %xmm5, %xmm14 ;\ movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addps %xmm2, %xmm15 ;\ movaps -12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 #define KERNEL3(xx) \ mulps %xmm4, %xmm7 ;\ addps %xmm7, %xmm8 ;\ movaps -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulps %xmm4, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm4, %xmm5 ;\ mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ addps %xmm5, %xmm10 ;\ movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addps %xmm4, %xmm11 ;\ movaps -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 #define KERNEL4(xx) \ mulps %xmm6, %xmm7 ;\ addps %xmm7, %xmm12 ;\ movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulps %xmm6, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm6, %xmm5 ;\ mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ addps %xmm5, %xmm14 ;\ movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ addps %xmm6, %xmm15 ;\ movaps -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 #define KERNEL5(xx) \ mulps %xmm0, %xmm1 
;\ addps %xmm1, %xmm8 ;\ movaps 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm0, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm0, %xmm5 ;\ mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ addps %xmm5, %xmm10 ;\ movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addps %xmm0, %xmm11 ;\ movaps 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 #define KERNEL6(xx) \ mulps %xmm2, %xmm1 ;\ addps %xmm1, %xmm12 ;\ movaps 32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ mulps %xmm2, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm2, %xmm5 ;\ mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ addps %xmm5, %xmm14 ;\ movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addps %xmm2, %xmm15 ;\ movaps 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 #define KERNEL7(xx) \ mulps %xmm4, %xmm7 ;\ addps %xmm7, %xmm8 ;\ movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulps %xmm4, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm4, %xmm5 ;\ mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ addps %xmm5, %xmm10 ;\ movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addps %xmm4, %xmm11 ;\ movaps 8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 #define KERNEL8(xx) \ mulps %xmm6, %xmm7 ;\ addps %xmm7, %xmm12 ;\ movaps 48 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ mulps %xmm6, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps 36 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ mulps %xmm6, %xmm5 ;\ mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ addps %xmm5, %xmm14 ;\ movaps 40 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ addps %xmm6, %xmm15 ;\ movaps 12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 #else #define KERNEL1(xx) \ mulps %xmm0, %xmm1 ;\ addps %xmm1, %xmm8 ;\ movaps -32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulps %xmm0, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps -28 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm0, %xmm5 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ addps %xmm5, %xmm10 ;\ movaps -24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm0, %xmm11 ;\ movaps -16 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 #define KERNEL2(xx) \ mulps %xmm2, %xmm1 ;\ addps %xmm1, %xmm12 ;\ movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulps %xmm2, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm2, %xmm5 ;\ mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ addps %xmm5, %xmm14 ;\ movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm2, %xmm15 ;\ movaps -12 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 ;\ #define KERNEL3(xx) \ mulps %xmm4, %xmm7 ;\ addps %xmm7, %xmm8 ;\ movaps -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulps %xmm4, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm4, %xmm5 ;\ mulps -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ addps %xmm5, %xmm10 ;\ movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm4, %xmm11 ;\ movaps -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 #define KERNEL4(xx) \ mulps %xmm6, %xmm7 ;\ addps %xmm7, %xmm12 ;\ movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulps %xmm6, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm6, %xmm5 ;\ mulps -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ addps %xmm5, %xmm14 ;\ movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm6, %xmm15 ;\ movaps -4 * SIZE + 1 * (xx) * 
SIZE(AO), %xmm6 #define KERNEL5(xx) \ mulps %xmm0, %xmm1 ;\ PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO) ;\ addps %xmm1, %xmm8 ;\ movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulps %xmm0, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm0, %xmm5 ;\ mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ addps %xmm5, %xmm10 ;\ movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm0, %xmm11 ;\ movaps 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 #define KERNEL6(xx) \ mulps %xmm2, %xmm1 ;\ addps %xmm1, %xmm12 ;\ movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ mulps %xmm2, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm2, %xmm5 ;\ mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ addps %xmm5, %xmm14 ;\ movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm2, %xmm15 ;\ movaps 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 #define KERNEL7(xx) \ mulps %xmm4, %xmm7 ;\ addps %xmm7, %xmm8 ;\ movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulps %xmm4, %xmm3 ;\ addps %xmm3, %xmm9 ;\ movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm4, %xmm5 ;\ mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ addps %xmm5, %xmm10 ;\ movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm4, %xmm11 ;\ movaps 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 #define KERNEL8(xx) \ mulps %xmm6, %xmm7 ;\ addps %xmm7, %xmm12 ;\ movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ mulps %xmm6, %xmm3 ;\ addps %xmm3, %xmm13 ;\ movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ mulps %xmm6, %xmm5 ;\ mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ addps %xmm5, %xmm14 ;\ movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm6, %xmm15 ;\ movaps 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif movaps %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #else movq 72(%rsp), LDC #ifdef TRMMKERNEL movsd 80(%rsp), %xmm12 #endif #endif movq %rsp, %rbx # save old stack subq $256 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq OLD_M, M movq OLD_N, N pxor %xmm7, %xmm7 cmpeqps %xmm7, %xmm7 pslld $31, %xmm7 # Generate mask pxor %xmm10, %xmm10 shufps $0, %xmm0, %xmm0 movaps %xmm0, 0 + ALPHA_R movss %xmm1, 4 + ALPHA_I movss %xmm1, 12 + ALPHA_I xorps %xmm7, %xmm1 movss %xmm1, 0 + ALPHA_I movss %xmm1, 8 + ALPHA_I #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) movss %xmm7, 0 + POSINV movss %xmm10, 4 + POSINV movss %xmm7, 8 + POSINV movss %xmm10,12 + POSINV #else movss %xmm10, 0 + POSINV movss %xmm7, 4 + POSINV movss %xmm10, 8 + POSINV movss %xmm7, 12 + POSINV #endif addq $32 * SIZE, A #ifdef TRMMKERNEL movsd %xmm12, OFFSET movsd %xmm12, KK #ifndef LEFT negq KK #endif #endif salq $ZBASE_SHIFT, LDC movq N, J sarq $1, J # j = (n >> 2) jle .L40 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO 
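/* Pack loop (.L02/.L04) below: copy the current two-column panel of B into BUFFER, broadcasting each scalar across a 4-wide vector (shufps $0) and pre-applying the POSINV sign mask selected by the conjugation macros, so the compute loops can use unconditional mulps/addps instead of per-iteration sign handling. */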
movaps POSINV, %xmm7 movq K, %rax sarq $2, %rax jle .L03 addq %rax, %rax ALIGN_4 .L02: PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) movss 0 * SIZE(B), %xmm8 movss 1 * SIZE(B), %xmm9 movss 2 * SIZE(B), %xmm10 movss 3 * SIZE(B), %xmm11 movss 4 * SIZE(B), %xmm12 movss 5 * SIZE(B), %xmm13 movss 6 * SIZE(B), %xmm14 movss 7 * SIZE(B), %xmm15 PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO) shufps $0, %xmm8, %xmm8 shufps $0, %xmm9, %xmm9 shufps $0, %xmm10, %xmm10 shufps $0, %xmm11, %xmm11 shufps $0, %xmm12, %xmm12 shufps $0, %xmm13, %xmm13 shufps $0, %xmm14, %xmm14 shufps $0, %xmm15, %xmm15 PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm9 xorps %xmm7, %xmm11 xorps %xmm7, %xmm13 xorps %xmm7, %xmm15 #else xorps %xmm7, %xmm8 xorps %xmm7, %xmm10 xorps %xmm7, %xmm12 xorps %xmm7, %xmm14 #endif movaps %xmm8, 0 * SIZE(BO) movaps %xmm9, 4 * SIZE(BO) movaps %xmm10, 8 * SIZE(BO) movaps %xmm11, 12 * SIZE(BO) movaps %xmm12, 16 * SIZE(BO) movaps %xmm13, 20 * SIZE(BO) movaps %xmm14, 24 * SIZE(BO) movaps %xmm15, 28 * SIZE(BO) addq $32 * SIZE, BO addq $ 8 * SIZE, B decq %rax jne .L02 ALIGN_4 .L03: movq K, %rax andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movss 0 * SIZE(B), %xmm8 movss 1 * SIZE(B), %xmm9 movss 2 * SIZE(B), %xmm10 movss 3 * SIZE(B), %xmm11 shufps $0, %xmm8, %xmm8 shufps $0, %xmm9, %xmm9 shufps $0, %xmm10, %xmm10 shufps $0, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm9 xorps %xmm7, %xmm11 #else xorps %xmm7, %xmm8 xorps %xmm7, %xmm10 #endif movaps %xmm8, 0 * SIZE(BO) movaps %xmm9, 4 * SIZE(BO) movaps %xmm10, 8 * SIZE(BO) movaps %xmm11, 12 * SIZE(BO) addq $ 4 * SIZE, B addq $16 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L10: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a leaq (RPREFETCHSIZE + 0) * SIZE(B), BB movq M, I sarq $2, I # i = (m >> 2) jle .L20 ALIGN_4 .L11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm1 pxor %xmm8, %xmm8 movaps -28 * SIZE(AO), %xmm2 movaps -28 * SIZE(BO), %xmm3 pxor %xmm9, %xmm9 movaps -24 * SIZE(AO), %xmm4 movaps -24 * SIZE(BO), %xmm5 pxor %xmm10, %xmm10 movaps -20 * SIZE(AO), %xmm6 movaps -16 * SIZE(BO), %xmm7 pxor %xmm11, %xmm11 PREFETCHW 7 * SIZE(CO1) pxor %xmm12, %xmm12 PREFETCHW 7 * SIZE(CO2) pxor %xmm13, %xmm13 PREFETCH -32 * SIZE(BB) pxor %xmm14, %xmm14 PREFETCH -16 * SIZE(BB) pxor %xmm15, %xmm15 subq $-16 * SIZE, BB #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif #ifndef GENERIC andq $-8, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO negq %rax NOBRANCH je .L15 ALIGN_3 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax NOBRANCH je 
.L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax NOBRANCH je .L15 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) addq $16 * SIZE, %rax BRANCH jl .L12 ALIGN_3 .L15: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif testq $4, %rax je .L16 xorq %rax, %rax ALIGN_3 KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addq $64 * SIZE, BO addq $32 * SIZE, AO ALIGN_3 #else sarq $2, %rax NOBRANCH jle .L16 ALIGN_3 .L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) addq $ 64 * SIZE, BO subq $-32 * SIZE, AO decq %rax BRANCH jg .L12 #endif .L16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA_R, %xmm6 movaps ALPHA_I, %xmm7 andq $3, %rax # if (k & 1) BRANCH je .L18 leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO negq %rax ALIGN_4 .L17: mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -28 * SIZE(BO, %rax, 8), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm9 movaps -24 * SIZE(BO, %rax, 8), %xmm1 mulps %xmm0, %xmm1 mulps -20 * SIZE(BO, %rax, 8), %xmm0 addps %xmm1, %xmm10 movaps -32 * SIZE(BO, %rax, 8), %xmm1 addps %xmm0, %xmm11 movaps -24 * SIZE(AO, %rax, 4), %xmm0 mulps %xmm2, %xmm1 addps %xmm1, %xmm12 movaps -28 * SIZE(BO, %rax, 8), %xmm1 mulps %xmm2, %xmm1 addps %xmm1, %xmm13 movaps -24 * SIZE(BO, %rax, 8), %xmm1 mulps %xmm2, %xmm1 mulps -20 * SIZE(BO, %rax, 8), %xmm2 addps %xmm1, %xmm14 movaps -16 * SIZE(BO, %rax, 8), %xmm1 addps %xmm2, %xmm15 movaps -20 * SIZE(AO, %rax, 4), %xmm2 addq $SIZE * 2, %rax jl .L17 ALIGN_4 
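/* .L18: write-back for the 4x2 complex tile. Combine the real/imaginary partial sums (add or subtract as selected by the NN/.../CC conjugation macros), scale by ALPHA_R/ALPHA_I using the shufps $0xb1 swap-and-multiply pattern, add the existing C values unless building the TRMM kernel, store to CO1/CO2, then apply the TRMM AO/BO/KK fixups and advance to the next tile. */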
.L18: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm2 movhps 6 * SIZE(CO1), %xmm2 movsd 0 * SIZE(CO2), %xmm1 movhps 2 * SIZE(CO2), %xmm1 movsd 4 * SIZE(CO2), %xmm3 movhps 6 * SIZE(CO2), %xmm3 #endif shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 shufps $0xb1, %xmm13, %xmm13 shufps $0xb1, %xmm15, %xmm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm9, %xmm8 subps %xmm11, %xmm10 subps %xmm13, %xmm12 subps %xmm15, %xmm14 #else addps %xmm9, %xmm8 addps %xmm11, %xmm10 addps %xmm13, %xmm12 addps %xmm15, %xmm14 #endif movaps %xmm8, %xmm9 movaps %xmm10, %xmm11 movaps %xmm12, %xmm13 movaps %xmm14, %xmm15 shufps $0xb1, %xmm8, %xmm8 shufps $0xb1, %xmm10, %xmm10 shufps $0xb1, %xmm12, %xmm12 shufps $0xb1, %xmm14, %xmm14 mulps %xmm6, %xmm9 mulps %xmm7, %xmm8 mulps %xmm6, %xmm11 mulps %xmm7, %xmm10 mulps %xmm6, %xmm13 mulps %xmm7, %xmm12 mulps %xmm6, %xmm15 mulps %xmm7, %xmm14 addps %xmm9, %xmm8 addps %xmm11, %xmm10 addps %xmm13, %xmm12 addps %xmm15, %xmm14 #ifndef TRMMKERNEL addps %xmm0, %xmm8 addps %xmm1, %xmm10 addps %xmm2, %xmm12 addps %xmm3, %xmm14 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movsd %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) movsd %xmm14, 4 * SIZE(CO2) movhps %xmm14, 6 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 ALIGN_4 .L20: testq $2, M je .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif movaps -32 * SIZE(AO), %xmm0 movaps -16 * SIZE(AO), %xmm2 movaps 0 * SIZE(AO), %xmm4 movaps 16 * SIZE(AO), %xmm6 movaps -32 * SIZE(BO), %xmm1 movaps -16 * SIZE(BO), %xmm3 movaps 0 * SIZE(BO), %xmm5 movaps 16 * SIZE(BO), %xmm7 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulps %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm9 movaps -24 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 mulps -20 * SIZE(BO), %xmm0 addps %xmm1, %xmm10 movaps 32 * SIZE(BO), %xmm1 addps %xmm0, %xmm11 movaps -28 * SIZE(AO), %xmm0 mulps %xmm0, %xmm3 addps %xmm3, %xmm8 movaps -12 * SIZE(BO), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm9 movaps -8 * SIZE(BO), %xmm3 mulps %xmm0, %xmm3 mulps -4 * SIZE(BO), %xmm0 addps %xmm3, %xmm10 movaps 48 * SIZE(BO), %xmm3 addps %xmm0, %xmm11 movaps -24 * SIZE(AO), %xmm0 mulps %xmm0, %xmm5 addps %xmm5, %xmm8 movaps 4 * SIZE(BO), %xmm5 mulps %xmm0, %xmm5 addps %xmm5, %xmm9 movaps 8 * SIZE(BO), %xmm5 mulps %xmm0, %xmm5 mulps 12 * SIZE(BO), %xmm0 addps %xmm5, %xmm10 movaps 64 * 
SIZE(BO), %xmm5 addps %xmm0, %xmm11 movaps -20 * SIZE(AO), %xmm0 mulps %xmm0, %xmm7 addps %xmm7, %xmm8 movaps 20 * SIZE(BO), %xmm7 mulps %xmm0, %xmm7 addps %xmm7, %xmm9 movaps 24 * SIZE(BO), %xmm7 mulps %xmm0, %xmm7 mulps 28 * SIZE(BO), %xmm0 addps %xmm7, %xmm10 movaps 80 * SIZE(BO), %xmm7 addps %xmm0, %xmm11 movaps 0 * SIZE(AO), %xmm0 mulps %xmm2, %xmm1 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) addps %xmm1, %xmm8 movaps 36 * SIZE(BO), %xmm1 mulps %xmm2, %xmm1 addps %xmm1, %xmm9 movaps 40 * SIZE(BO), %xmm1 mulps %xmm2, %xmm1 mulps 44 * SIZE(BO), %xmm2 addps %xmm1, %xmm10 movaps 96 * SIZE(BO), %xmm1 addps %xmm2, %xmm11 movaps -12 * SIZE(AO), %xmm2 mulps %xmm2, %xmm3 addps %xmm3, %xmm8 movaps 52 * SIZE(BO), %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm9 movaps 56 * SIZE(BO), %xmm3 mulps %xmm2, %xmm3 mulps 60 * SIZE(BO), %xmm2 addps %xmm3, %xmm10 movaps 112 * SIZE(BO), %xmm3 addps %xmm2, %xmm11 movaps -8 * SIZE(AO), %xmm2 mulps %xmm2, %xmm5 addps %xmm5, %xmm8 movaps 68 * SIZE(BO), %xmm5 mulps %xmm2, %xmm5 addps %xmm5, %xmm9 movaps 72 * SIZE(BO), %xmm5 mulps %xmm2, %xmm5 mulps 76 * SIZE(BO), %xmm2 addps %xmm5, %xmm10 movaps 128 * SIZE(BO), %xmm5 addps %xmm2, %xmm11 movaps -4 * SIZE(AO), %xmm2 mulps %xmm2, %xmm7 addps %xmm7, %xmm8 movaps 84 * SIZE(BO), %xmm7 mulps %xmm2, %xmm7 addps %xmm7, %xmm9 movaps 88 * SIZE(BO), %xmm7 mulps %xmm2, %xmm7 mulps 92 * SIZE(BO), %xmm2 addps %xmm7, %xmm10 movaps 144 * SIZE(BO), %xmm7 addps %xmm2, %xmm11 movaps 16 * SIZE(AO), %xmm2 subq $ -32 * SIZE, AO subq $-128 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA_R, %xmm6 movaps ALPHA_I, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm9 movaps -24 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 mulps -20 * SIZE(BO), %xmm0 addps %xmm1, %xmm10 movaps -16 * SIZE(BO), %xmm1 addps %xmm0, %xmm11 movaps -28 * SIZE(AO), %xmm0 subq $- 4 * SIZE, AO subq $-16 * SIZE, BO decq %rax jg .L26 ALIGN_4 .L28: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 0 * SIZE(CO2), %xmm1 movhps 2 * SIZE(CO2), %xmm1 #endif shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm9, %xmm8 subps %xmm11, %xmm10 #else addps %xmm9, %xmm8 addps %xmm11, %xmm10 #endif movaps %xmm8, %xmm9 movaps %xmm10, %xmm11 shufps $0xb1, %xmm8, %xmm8 shufps $0xb1, %xmm10, %xmm10 mulps %xmm6, %xmm9 mulps %xmm7, %xmm8 mulps %xmm6, %xmm11 mulps %xmm7, %xmm10 addps %xmm9, %xmm8 addps %xmm11, %xmm10 #ifndef TRMMKERNEL addps %xmm0, %xmm8 addps %xmm1, %xmm10 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhps %xmm10, 2 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L30: testq $1, M je .L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 
8), BO #endif movaps -32 * SIZE(AO), %xmm0 movaps -24 * SIZE(AO), %xmm2 movaps -32 * SIZE(BO), %xmm1 movaps -16 * SIZE(BO), %xmm3 movaps 0 * SIZE(BO), %xmm5 movaps 16 * SIZE(BO), %xmm7 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L35 ALIGN_4 .L32: mulps %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm9 movaps -24 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm10 movaps -20 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm11 movaps 32 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 addps %xmm3, %xmm8 movaps -12 * SIZE(BO), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm9 movaps -8 * SIZE(BO), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm10 movaps -4 * SIZE(BO), %xmm3 mulps %xmm0, %xmm3 movsd -28 * SIZE(AO), %xmm0 addps %xmm3, %xmm11 movaps 48 * SIZE(BO), %xmm3 mulps %xmm0, %xmm5 addps %xmm5, %xmm8 movaps 4 * SIZE(BO), %xmm5 mulps %xmm0, %xmm5 addps %xmm5, %xmm9 movaps 8 * SIZE(BO), %xmm5 mulps %xmm0, %xmm5 addps %xmm5, %xmm10 movaps 12 * SIZE(BO), %xmm5 mulps %xmm0, %xmm5 movsd -26 * SIZE(AO), %xmm0 addps %xmm5, %xmm11 movaps 64 * SIZE(BO), %xmm5 mulps %xmm0, %xmm7 addps %xmm7, %xmm8 movaps 20 * SIZE(BO), %xmm7 mulps %xmm0, %xmm7 addps %xmm7, %xmm9 movaps 24 * SIZE(BO), %xmm7 mulps %xmm0, %xmm7 addps %xmm7, %xmm10 movaps 28 * SIZE(BO), %xmm7 mulps %xmm0, %xmm7 movsd -16 * SIZE(AO), %xmm0 addps %xmm7, %xmm11 movaps 80 * SIZE(BO), %xmm7 mulps %xmm2, %xmm1 addps %xmm1, %xmm8 movaps 36 * SIZE(BO), %xmm1 mulps %xmm2, %xmm1 addps %xmm1, %xmm9 movaps 40 * SIZE(BO), %xmm1 mulps %xmm2, %xmm1 addps %xmm1, %xmm10 movaps 44 * SIZE(BO), %xmm1 mulps %xmm2, %xmm1 movsd -22 * SIZE(AO), %xmm2 addps %xmm1, %xmm11 movaps 96 * SIZE(BO), %xmm1 mulps %xmm2, %xmm3 addps %xmm3, %xmm8 movaps 52 * SIZE(BO), %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm9 movaps 56 * SIZE(BO), %xmm3 mulps %xmm2, %xmm3 addps %xmm3, %xmm10 movaps 60 * SIZE(BO), %xmm3 mulps %xmm2, %xmm3 movsd -20 * SIZE(AO), %xmm2 addps %xmm3, %xmm11 movaps 112 * SIZE(BO), %xmm3 mulps %xmm2, %xmm5 addps %xmm5, %xmm8 movaps 68 * SIZE(BO), %xmm5 mulps %xmm2, %xmm5 addps %xmm5, %xmm9 movaps 72 * SIZE(BO), %xmm5 mulps %xmm2, %xmm5 addps %xmm5, %xmm10 movaps 76 * SIZE(BO), %xmm5 mulps %xmm2, %xmm5 movsd -18 * SIZE(AO), %xmm2 addps %xmm5, %xmm11 movaps 128 * SIZE(BO), %xmm5 mulps %xmm2, %xmm7 addps %xmm7, %xmm8 movaps 84 * SIZE(BO), %xmm7 mulps %xmm2, %xmm7 addps %xmm7, %xmm9 movaps 88 * SIZE(BO), %xmm7 mulps %xmm2, %xmm7 addps %xmm7, %xmm10 movaps 92 * SIZE(BO), %xmm7 mulps %xmm2, %xmm7 movsd -8 * SIZE(AO), %xmm2 addps %xmm7, %xmm11 movaps 144 * SIZE(BO), %xmm7 subq $ -16 * SIZE, AO subq $-128 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA_R, %xmm6 movaps ALPHA_I, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm9 movaps -24 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm10 movaps -20 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm11 movaps -16 * SIZE(BO), %xmm1 subq $ -2 * SIZE, AO subq $-16 * SIZE, BO decq %rax jg .L36 ALIGN_4 .L38: 
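/* .L38: write-back for the M-remainder (single row) of the two-column loop. Same conjugation-sign and alpha handling as .L18, but only one complex element is stored to CO1 and one to CO2 (movlps), followed by the usual TRMM pointer/KK adjustments. */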
#ifndef TRMMKERNEL #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(CO1), %xmm0 #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(CO2), %xmm1 #endif shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm9, %xmm8 subps %xmm11, %xmm10 #else addps %xmm9, %xmm8 addps %xmm11, %xmm10 #endif movaps %xmm8, %xmm9 movaps %xmm10, %xmm11 shufps $0xb1, %xmm8, %xmm8 shufps $0xb1, %xmm10, %xmm10 mulps %xmm6, %xmm9 mulps %xmm7, %xmm8 mulps %xmm6, %xmm11 mulps %xmm7, %xmm10 addps %xmm9, %xmm8 addps %xmm11, %xmm10 #ifndef TRMMKERNEL addps %xmm0, %xmm8 addps %xmm1, %xmm10 #endif movlps %xmm8, 0 * SIZE(CO1) movlps %xmm10, 0 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leaq (C, LDC, 2), C # c += 2 * ldc decq J # j -- jg .L01 ALIGN_4 .L40: testq $1, N je .L999 ALIGN_4 .L41: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movaps POSINV, %xmm7 movq K, %rax sarq $2, %rax jle .L43 ALIGN_4 .L42: movss 0 * SIZE(B), %xmm8 movss 1 * SIZE(B), %xmm9 movss 2 * SIZE(B), %xmm10 movss 3 * SIZE(B), %xmm11 movss 4 * SIZE(B), %xmm12 movss 5 * SIZE(B), %xmm13 movss 6 * SIZE(B), %xmm14 movss 7 * SIZE(B), %xmm15 shufps $0, %xmm8, %xmm8 shufps $0, %xmm9, %xmm9 shufps $0, %xmm10, %xmm10 shufps $0, %xmm11, %xmm11 shufps $0, %xmm12, %xmm12 shufps $0, %xmm13, %xmm13 shufps $0, %xmm14, %xmm14 shufps $0, %xmm15, %xmm15 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm9 xorps %xmm7, %xmm11 xorps %xmm7, %xmm13 xorps %xmm7, %xmm15 #else xorps %xmm7, %xmm8 xorps %xmm7, %xmm10 xorps %xmm7, %xmm12 xorps %xmm7, %xmm14 #endif #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) prefetchnta 56 * SIZE(B) #endif movaps %xmm8, 0 * SIZE(BO) movaps %xmm9, 4 * SIZE(BO) movaps %xmm10, 8 * SIZE(BO) movaps %xmm11, 12 * SIZE(BO) movaps %xmm12, 16 * SIZE(BO) movaps %xmm13, 20 * SIZE(BO) movaps %xmm14, 24 * SIZE(BO) movaps %xmm15, 28 * SIZE(BO) #if defined(PENTIUM4) || defined(GENERIC) PREFETCHW 128 * SIZE(BO) PREFETCH 112 * SIZE(B) #endif addq $32 * SIZE, BO addq $ 8 * SIZE, B decq %rax jne .L42 ALIGN_4 .L43: movq K, %rax andq $3, %rax BRANCH jle .L50 ALIGN_4 .L44: movss 0 * SIZE(B), %xmm8 movss 1 * SIZE(B), %xmm9 shufps $0, %xmm8, %xmm8 shufps $0, %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm9 #else xorps %xmm7, %xmm8 #endif movaps %xmm8, 0 * SIZE(BO) movaps %xmm9, 4 * SIZE(BO) addq $2 * SIZE, B addq $8 * SIZE, BO decq %rax jne .L44 ALIGN_4 .L50: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a movq M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movaps 
-16 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movaps 0 * SIZE(AO), %xmm4 pxor %xmm10, %xmm10 movaps 16 * SIZE(AO), %xmm6 pxor %xmm11, %xmm11 movaps -32 * SIZE(BO), %xmm1 pxor %xmm12, %xmm12 movaps -16 * SIZE(BO), %xmm3 pxor %xmm13, %xmm13 movaps 0 * SIZE(BO), %xmm5 pxor %xmm14, %xmm14 movaps 16 * SIZE(BO), %xmm7 pxor %xmm15, %xmm15 PREFETCHW 7 * SIZE(CO1) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L55 ALIGN_4 .L52: mulps %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulps -28 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 addps %xmm0, %xmm9 movaps -28 * SIZE(AO), %xmm0 mulps %xmm0, %xmm1 mulps -28 * SIZE(BO), %xmm0 addps %xmm1, %xmm12 movaps -24 * SIZE(BO), %xmm1 addps %xmm0, %xmm13 movaps -24 * SIZE(AO), %xmm0 mulps %xmm0, %xmm1 mulps -20 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 addps %xmm0, %xmm9 movaps -20 * SIZE(AO), %xmm0 mulps %xmm0, %xmm1 mulps -20 * SIZE(BO), %xmm0 addps %xmm1, %xmm12 movaps 32 * SIZE(BO), %xmm1 addps %xmm0, %xmm13 movaps 32 * SIZE(AO), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) mulps %xmm2, %xmm3 mulps -12 * SIZE(BO), %xmm2 addps %xmm3, %xmm8 movaps -16 * SIZE(BO), %xmm3 addps %xmm2, %xmm9 movaps -12 * SIZE(AO), %xmm2 mulps %xmm2, %xmm3 mulps -12 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 movaps -8 * SIZE(BO), %xmm3 addps %xmm2, %xmm13 movaps -8 * SIZE(AO), %xmm2 mulps %xmm2, %xmm3 mulps -4 * SIZE(BO), %xmm2 addps %xmm3, %xmm8 movaps -8 * SIZE(BO), %xmm3 addps %xmm2, %xmm9 movaps -4 * SIZE(AO), %xmm2 mulps %xmm2, %xmm3 mulps -4 * SIZE(BO), %xmm2 addps %xmm3, %xmm12 movaps 48 * SIZE(BO), %xmm3 addps %xmm2, %xmm13 movaps 48 * SIZE(AO), %xmm2 PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) mulps %xmm4, %xmm5 mulps 4 * SIZE(BO), %xmm4 addps %xmm5, %xmm8 movaps 0 * SIZE(BO), %xmm5 addps %xmm4, %xmm9 movaps 4 * SIZE(AO), %xmm4 mulps %xmm4, %xmm5 mulps 4 * SIZE(BO), %xmm4 addps %xmm5, %xmm12 movaps 8 * SIZE(BO), %xmm5 addps %xmm4, %xmm13 movaps 8 * SIZE(AO), %xmm4 mulps %xmm4, %xmm5 mulps 12 * SIZE(BO), %xmm4 addps %xmm5, %xmm8 movaps 8 * SIZE(BO), %xmm5 addps %xmm4, %xmm9 movaps 12 * SIZE(AO), %xmm4 mulps %xmm4, %xmm5 mulps 12 * SIZE(BO), %xmm4 addps %xmm5, %xmm12 movaps 64 * SIZE(BO), %xmm5 addps %xmm4, %xmm13 movaps 64 * SIZE(AO), %xmm4 PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) mulps %xmm6, %xmm7 mulps 20 * SIZE(BO), %xmm6 addps %xmm7, %xmm8 movaps 16 * SIZE(BO), %xmm7 addps %xmm6, %xmm9 movaps 20 * SIZE(AO), %xmm6 mulps %xmm6, %xmm7 mulps 20 * SIZE(BO), %xmm6 addps %xmm7, %xmm12 movaps 24 * SIZE(BO), %xmm7 addps %xmm6, %xmm13 movaps 24 * SIZE(AO), %xmm6 mulps %xmm6, %xmm7 mulps 28 * SIZE(BO), %xmm6 addps %xmm7, %xmm8 movaps 24 * SIZE(BO), %xmm7 addps %xmm6, %xmm9 movaps 28 * SIZE(AO), %xmm6 mulps %xmm6, %xmm7 mulps 28 * SIZE(BO), %xmm6 addps %xmm7, %xmm12 movaps 80 * SIZE(BO), %xmm7 addps %xmm6, %xmm13 movaps 80 * SIZE(AO), %xmm6 subq $-64 * SIZE, AO subq $-64 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA_R, %xmm6 movaps ALPHA_I, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: mulps %xmm0, %xmm1 mulps -28 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 addps %xmm0, %xmm9 movaps -28 * SIZE(AO), %xmm0 mulps %xmm0, %xmm1 mulps -28 * SIZE(BO), %xmm0 addps %xmm1, %xmm12 movaps -24 * SIZE(BO), %xmm1 addps %xmm0, %xmm13 
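/* annotation (added): end of one .L56 remainder step; the next instructions reload A,
   advance AO and BO by 8*SIZE each, and loop back to .L56 until the k remainder is done. */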
movaps -24 * SIZE(AO), %xmm0 addq $ 8 * SIZE, AO addq $ 8 * SIZE, BO decq %rax jg .L56 ALIGN_4 .L58: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 movsd 4 * SIZE(CO1), %xmm2 movhps 6 * SIZE(CO1), %xmm2 #endif shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm13, %xmm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm9, %xmm8 subps %xmm13, %xmm12 #else addps %xmm9, %xmm8 addps %xmm13, %xmm12 #endif movaps %xmm8, %xmm9 movaps %xmm12, %xmm13 shufps $0xb1, %xmm8, %xmm8 shufps $0xb1, %xmm12, %xmm12 mulps %xmm6, %xmm9 mulps %xmm7, %xmm8 mulps %xmm6, %xmm13 mulps %xmm7, %xmm12 addps %xmm9, %xmm8 addps %xmm13, %xmm12 #ifndef TRMMKERNEL addps %xmm0, %xmm8 addps %xmm2, %xmm12 #endif movlps %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) movlps %xmm12, 4 * SIZE(CO1) movhps %xmm12, 6 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L51 ALIGN_4 .L60: testq $2, M je .L70 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movaps -16 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movaps -32 * SIZE(BO), %xmm1 pxor %xmm10, %xmm10 movaps -16 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 movaps 0 * SIZE(BO), %xmm5 movaps 16 * SIZE(BO), %xmm7 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulps %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulps -28 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 addps %xmm0, %xmm9 movaps -28 * SIZE(AO), %xmm0 mulps %xmm0, %xmm1 mulps -20 * SIZE(BO), %xmm0 addps %xmm1, %xmm10 movaps 32 * SIZE(BO), %xmm1 addps %xmm0, %xmm11 movaps -24 * SIZE(AO), %xmm0 mulps %xmm0, %xmm3 mulps -12 * SIZE(BO), %xmm0 addps %xmm3, %xmm8 movaps -8 * SIZE(BO), %xmm3 addps %xmm0, %xmm9 movaps -20 * SIZE(AO), %xmm0 mulps %xmm0, %xmm3 mulps -4 * SIZE(BO), %xmm0 addps %xmm3, %xmm10 movaps 48 * SIZE(BO), %xmm3 addps %xmm0, %xmm11 movaps 0 * SIZE(AO), %xmm0 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) mulps %xmm2, %xmm5 mulps 4 * SIZE(BO), %xmm2 addps %xmm5, %xmm8 movaps 8 * SIZE(BO), %xmm5 addps %xmm2, %xmm9 movaps -12 * SIZE(AO), %xmm2 mulps %xmm2, %xmm5 mulps 12 * SIZE(BO), %xmm2 addps %xmm5, %xmm10 movaps 64 * SIZE(BO), %xmm5 addps %xmm2, %xmm11 movaps -8 * SIZE(AO), %xmm2 mulps %xmm2, %xmm7 mulps 20 * SIZE(BO), %xmm2 addps %xmm7, %xmm8 movaps 24 * SIZE(BO), %xmm7 addps %xmm2, %xmm9 movaps -4 * SIZE(AO), %xmm2 mulps %xmm2, %xmm7 mulps 28 * SIZE(BO), %xmm2 addps %xmm7, %xmm10 movaps 80 * SIZE(BO), %xmm7 addps %xmm2, %xmm11 movaps 16 * SIZE(AO), %xmm2 subq $-32 * SIZE, AO subq $-64 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA_R, %xmm6 movaps ALPHA_I, %xmm7 andq $7, %rax # if (k & 1) BRANCH je 
.L68 ALIGN_4 .L66: mulps %xmm0, %xmm1 mulps -28 * SIZE(BO), %xmm0 addps %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 addps %xmm0, %xmm9 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L68: #ifndef TRMMKERNEL movsd 0 * SIZE(CO1), %xmm0 movhps 2 * SIZE(CO1), %xmm0 #endif addps %xmm10, %xmm8 addps %xmm11, %xmm9 shufps $0xb1, %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm9, %xmm8 #else addps %xmm9, %xmm8 #endif movaps %xmm8, %xmm9 shufps $0xb1, %xmm8, %xmm8 mulps %xmm6, %xmm9 mulps %xmm7, %xmm8 addps %xmm9, %xmm8 #ifndef TRMMKERNEL addps %xmm0, %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) movhps %xmm8, 2 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L70: testq $1, M je .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq 32 * SIZE + BUFFER, BO #else leaq 32 * SIZE + BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movaps -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 movaps -24 * SIZE(AO), %xmm2 pxor %xmm9, %xmm9 movaps -32 * SIZE(BO), %xmm1 pxor %xmm10, %xmm10 movaps -16 * SIZE(BO), %xmm3 pxor %xmm11, %xmm11 movaps 0 * SIZE(BO), %xmm5 movaps 16 * SIZE(BO), %xmm7 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulps %xmm0, %xmm1 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm9 movaps -24 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 addps %xmm1, %xmm10 movaps -20 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 movsd -28 * SIZE(AO), %xmm0 addps %xmm1, %xmm11 movaps 32 * SIZE(BO), %xmm1 mulps %xmm0, %xmm3 addps %xmm3, %xmm8 movaps -12 * SIZE(BO), %xmm3 mulps %xmm0, %xmm3 movsd -26 * SIZE(AO), %xmm0 addps %xmm3, %xmm9 movaps -8 * SIZE(BO), %xmm3 mulps %xmm0, %xmm3 addps %xmm3, %xmm10 movaps -4 * SIZE(BO), %xmm3 mulps %xmm0, %xmm3 movsd -16 * SIZE(AO), %xmm0 addps %xmm3, %xmm11 movaps 48 * SIZE(BO), %xmm3 mulps %xmm2, %xmm5 addps %xmm5, %xmm8 movaps 4 * SIZE(BO), %xmm5 mulps %xmm2, %xmm5 movsd -22 * SIZE(AO), %xmm2 addps %xmm5, %xmm9 movaps 8 * SIZE(BO), %xmm5 mulps %xmm2, %xmm5 addps %xmm5, %xmm10 movaps 12 * SIZE(BO), %xmm5 mulps %xmm2, %xmm5 movsd -20 * SIZE(AO), %xmm2 addps %xmm5, %xmm11 movaps 64 * SIZE(BO), %xmm5 mulps %xmm2, %xmm7 addps %xmm7, %xmm8 movaps 20 * SIZE(BO), %xmm7 mulps %xmm2, %xmm7 movsd -18 * SIZE(AO), %xmm2 addps %xmm7, %xmm9 movaps 24 * SIZE(BO), %xmm7 mulps %xmm2, %xmm7 addps %xmm7, %xmm10 movaps 28 * SIZE(BO), %xmm7 mulps %xmm2, %xmm7 movsd -8 * SIZE(AO), %xmm2 addps %xmm7, %xmm11 movaps 80 * SIZE(BO), %xmm7 subq $-16 * SIZE, AO subq $-64 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA_R, %xmm6 movaps ALPHA_I, %xmm7 andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: 
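/* annotation (added): .L76 is the k-remainder loop for the final 1 x 1 block:
   each iteration multiplies one complex A element (loaded with movsd) against the
   packed B column, advancing AO by 2*SIZE and BO by 8*SIZE. */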
mulps %xmm0, %xmm1 addps %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 mulps %xmm0, %xmm1 movsd -30 * SIZE(AO), %xmm0 addps %xmm1, %xmm9 movaps -24 * SIZE(BO), %xmm1 addq $2 * SIZE, AO addq $8 * SIZE, BO decq %rax jg .L76 ALIGN_4 .L78: #ifndef TRMMKERNEL #ifdef movsd xorps %xmm0, %xmm0 #endif movsd 0 * SIZE(CO1), %xmm0 #endif addps %xmm10, %xmm8 addps %xmm11, %xmm9 shufps $0xb1, %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subps %xmm9, %xmm8 #else addps %xmm9, %xmm8 #endif movaps %xmm8, %xmm9 shufps $0xb1, %xmm8, %xmm8 mulps %xmm6, %xmm9 mulps %xmm7, %xmm8 addps %xmm9, %xmm8 #ifndef TRMMKERNEL addps %xmm0, %xmm8 #endif movlps %xmm8, 0 * SIZE(CO1) ALIGN_4 .L999: movq %rbx, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm_kernel_4x2_sse3.S000066400000000000000000001340651313527062700216300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %r12 #define BO %r13 #define CO1 %r14 #define CO2 %r15 #define BB %rbp #ifndef WINDOWS_ABI #define STACKSIZE 64 #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define ALPHA_R 0(%rsp) #define ALPHA_I 16(%rsp) #define J 32(%rsp) #define OFFSET 40(%rsp) #define KK 48(%rsp) #define KKK 56(%rsp) #define BUFFER 128(%rsp) #define PREFETCH prefetcht0 #define PREFETCHSIZE 320 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADDSUB addps #else #define ADDSUB subps #endif #define KERNEL1(address) \ mulps %xmm8, %xmm9; \ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO); \ addps %xmm9, %xmm0; \ movshdup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ ADDSUB %xmm9, %xmm1; \ movsldup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm2; \ movshdup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ movaps 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \ ADDSUB %xmm9, %xmm3; \ movsldup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL2(address) \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm4; \ movshdup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ ADDSUB %xmm9, %xmm5; \ movsldup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm6; \ movshdup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ movaps 8 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \ ADDSUB %xmm9, %xmm7; \ movsldup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL3(address) \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm0; \ movshdup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ ADDSUB %xmm9, %xmm1; \ movsldup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm2; \ movshdup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ movaps 12 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \ ADDSUB %xmm9, %xmm3; \ movsldup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL4(address) \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm4; \ movshdup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ ADDSUB %xmm9, %xmm5; \ movsldup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ addps %xmm9, %xmm6; \ movshdup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ mulps %xmm8, %xmm9; \ movaps 64 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \ ADDSUB %xmm9, %xmm7; \ movsldup 64 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL5(address) \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm0; \ movshdup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ ADDSUB %xmm11, %xmm1; \ movsldup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm2; \ movshdup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ movaps 20 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \ ADDSUB %xmm11, %xmm3; \ movsldup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL6(address) \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm4; \ movshdup 
16 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ ADDSUB %xmm11, %xmm5; \ movsldup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm6; \ movshdup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ movaps 24 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \ ADDSUB %xmm11, %xmm7; \ movsldup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL7(address) \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm0; \ movshdup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ ADDSUB %xmm11, %xmm1; \ movsldup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm2; \ movshdup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ movaps 28 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \ ADDSUB %xmm11, %xmm3; \ movsldup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL8(address) \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm4; \ movshdup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ ADDSUB %xmm11, %xmm5; \ movsldup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ addps %xmm11, %xmm6; \ movshdup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ mulps %xmm10, %xmm11; \ movaps 80 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \ ADDSUB %xmm11, %xmm7; \ movsldup 80 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL9(address) \ mulps %xmm12, %xmm13; \ PREFETCH (PREFETCHSIZE + 32) * SIZE + (address) * 2 * SIZE(AO); \ addps %xmm13, %xmm0; \ movshdup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ ADDSUB %xmm13, %xmm1; \ movsldup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm2; \ movshdup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ movaps 36 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \ ADDSUB %xmm13, %xmm3; \ movsldup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL10(address) \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm4; \ movshdup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ ADDSUB %xmm13, %xmm5; \ movsldup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm6; \ movshdup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ movaps 40 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \ ADDSUB %xmm13, %xmm7; \ movsldup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL11(address) \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm0; \ movshdup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ ADDSUB %xmm13, %xmm1; \ movsldup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm2; \ movshdup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ movaps 44 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \ ADDSUB %xmm13, %xmm3; \ movsldup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL12(address) \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm4; \ movshdup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ ADDSUB %xmm13, %xmm5; \ movsldup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ addps %xmm13, %xmm6; \ movshdup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ mulps %xmm12, %xmm13; \ movaps 96 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \ ADDSUB %xmm13, %xmm7; \ movsldup 96 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL13(address) \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm0; \ movshdup 48 * 
SIZE + (address) * 2 * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ ADDSUB %xmm15, %xmm1; \ movsldup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm2; \ movshdup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ movaps 52 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \ ADDSUB %xmm15, %xmm3; \ movsldup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL14(address) \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm4; \ movshdup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ ADDSUB %xmm15, %xmm5; \ movsldup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm6; \ movshdup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ movaps 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \ ADDSUB %xmm15, %xmm7; \ movsldup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL15(address) \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm0; \ movshdup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ ADDSUB %xmm15, %xmm1; \ movsldup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm2; \ movshdup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ movaps 60 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \ ADDSUB %xmm15, %xmm3; \ movsldup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL16(address) \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm4; \ movshdup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ ADDSUB %xmm15, %xmm5; \ movsldup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ addps %xmm15, %xmm6; \ movshdup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ mulps %xmm14, %xmm15; \ movaps 112 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \ ADDSUB %xmm15, %xmm7; \ movsldup 112 * SIZE + (address) * 2 * SIZE(BO), %xmm15 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm4 #endif movaps %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #else movq 72(%rsp), LDC #ifdef TRMMKERNEL movsd 80(%rsp), %xmm4 #endif #endif movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING pxor %xmm15, %xmm15 cmpeqps %xmm15, %xmm15 pslld $31, %xmm15 # Generate mask pxor %xmm2, %xmm2 shufps $0, %xmm0, %xmm0 movaps %xmm0, 0 + ALPHA_R movss %xmm1, 4 + ALPHA_I movss %xmm1, 12 + ALPHA_I xorps %xmm15, %xmm1 movss %xmm1, 0 + ALPHA_I movss %xmm1, 8 + ALPHA_I #ifdef TRMMKERNEL movsd %xmm4, OFFSET movsd %xmm4, KK #ifndef LEFT negq KK #endif #endif salq $ZBASE_SHIFT, LDC movq N, J sarq $1, J # j = (n >> 2) jle .L40 ALIGN_4 .L01: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $2, %rax jle .L03 ALIGN_4 .L02: movddup 0 * SIZE(B), %xmm0 movddup 2 * SIZE(B), %xmm1 movddup 4 * SIZE(B), %xmm2 movddup 6 * SIZE(B), %xmm3 movddup 8 * SIZE(B), %xmm4 movddup 10 * SIZE(B), %xmm5 movddup 12 * SIZE(B), %xmm6 
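/* annotation (added): continuation of the .L02 packing loop: eight complex B values are
   duplicated with movddup and stored to the aligned BUFFER, with prefetches ahead of both
   streams; each iteration consumes 16*SIZE from B and writes 32*SIZE to BO. */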
movddup 14 * SIZE(B), %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) prefetcht1 128 * SIZE(BO) prefetcht0 112 * SIZE(B) addq $16 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L02 ALIGN_4 .L03: movq K, %rax andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movddup 0 * SIZE(B), %xmm0 movddup 2 * SIZE(B), %xmm1 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) addq $4 * SIZE, B addq $8 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L10: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc movq A, AO # aoffset = a leaq 112 * SIZE(B), BB movq M, I sarq $2, I # i = (m >> 2) jle .L20 ALIGN_4 .L11: prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movaps 16 * SIZE(AO), %xmm10 pxor %xmm1, %xmm1 movaps 32 * SIZE(AO), %xmm12 pxor %xmm2, %xmm2 movaps 48 * SIZE(AO), %xmm14 pxor %xmm3, %xmm3 movsldup 0 * SIZE(BO), %xmm9 pxor %xmm4, %xmm4 movsldup 16 * SIZE(BO), %xmm11 pxor %xmm5, %xmm5 movsldup 32 * SIZE(BO), %xmm13 pxor %xmm6, %xmm6 movsldup 48 * SIZE(BO), %xmm15 pxor %xmm7, %xmm7 prefetchnta 8 * SIZE(CO1) prefetchnta 8 * SIZE(CO2) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $2, %rax #endif movq %rax, KKK #endif #if 1 andq $-8, %rax salq $4, %rax je .L15 .L1X: KERNEL1 (32 * 0) KERNEL2 (32 * 0) KERNEL3 (32 * 0) KERNEL4 (32 * 0) KERNEL5 (32 * 0) KERNEL6 (32 * 0) KERNEL7 (32 * 0) KERNEL8 (32 * 0) KERNEL9 (32 * 0) KERNEL10(32 * 0) KERNEL11(32 * 0) KERNEL12(32 * 0) KERNEL13(32 * 0) KERNEL14(32 * 0) KERNEL15(32 * 0) KERNEL16(32 * 0) cmpq $128 * 1, %rax jle .L12 KERNEL1 (32 * 1) KERNEL2 (32 * 1) KERNEL3 (32 * 1) KERNEL4 (32 * 1) KERNEL5 (32 * 1) KERNEL6 (32 * 1) KERNEL7 (32 * 1) KERNEL8 (32 * 1) KERNEL9 (32 * 1) KERNEL10(32 * 1) KERNEL11(32 * 1) KERNEL12(32 * 1) KERNEL13(32 * 1) KERNEL14(32 * 1) KERNEL15(32 * 1) KERNEL16(32 * 1) cmpq $128 * 2, %rax jle .L12 KERNEL1 (32 * 2) KERNEL2 (32 * 2) KERNEL3 (32 * 2) KERNEL4 (32 * 2) KERNEL5 (32 * 2) KERNEL6 (32 * 2) KERNEL7 (32 * 2) KERNEL8 (32 * 2) KERNEL9 (32 * 2) KERNEL10(32 * 2) KERNEL11(32 * 2) KERNEL12(32 * 2) KERNEL13(32 * 2) KERNEL14(32 * 2) KERNEL15(32 * 2) KERNEL16(32 * 2) cmpq $128 * 3, %rax jle .L12 KERNEL1 (32 * 3) KERNEL2 (32 * 3) KERNEL3 (32 * 3) KERNEL4 (32 * 3) KERNEL5 (32 * 3) KERNEL6 (32 * 3) KERNEL7 (32 * 3) KERNEL8 (32 * 3) KERNEL9 (32 * 3) KERNEL10(32 * 3) KERNEL11(32 * 3) KERNEL12(32 * 3) KERNEL13(32 * 3) KERNEL14(32 * 3) KERNEL15(32 * 3) KERNEL16(32 * 3) cmpq $128 * 4, %rax jle .L12 KERNEL1 (32 * 4) KERNEL2 (32 * 4) KERNEL3 (32 * 4) KERNEL4 (32 * 4) KERNEL5 (32 * 4) KERNEL6 (32 * 4) KERNEL7 (32 * 4) KERNEL8 (32 * 4) KERNEL9 (32 * 4) KERNEL10(32 * 4) KERNEL11(32 * 4) KERNEL12(32 * 4) KERNEL13(32 * 4) KERNEL14(32 * 4) KERNEL15(32 * 4) KERNEL16(32 * 4) cmpq $128 * 5, %rax jle .L12 KERNEL1 (32 * 5) KERNEL2 (32 * 5) KERNEL3 (32 * 5) KERNEL4 (32 * 5) KERNEL5 (32 * 5) KERNEL6 (32 * 5) KERNEL7 (32 * 5) KERNEL8 (32 * 5) KERNEL9 (32 * 5) KERNEL10(32 * 5) KERNEL11(32 * 5) KERNEL12(32 * 5) 
KERNEL13(32 * 5) KERNEL14(32 * 5) KERNEL15(32 * 5) KERNEL16(32 * 5) cmpq $128 * 6, %rax jle .L12 KERNEL1 (32 * 6) KERNEL2 (32 * 6) KERNEL3 (32 * 6) KERNEL4 (32 * 6) KERNEL5 (32 * 6) KERNEL6 (32 * 6) KERNEL7 (32 * 6) KERNEL8 (32 * 6) KERNEL9 (32 * 6) KERNEL10(32 * 6) KERNEL11(32 * 6) KERNEL12(32 * 6) KERNEL13(32 * 6) KERNEL14(32 * 6) KERNEL15(32 * 6) KERNEL16(32 * 6) cmpq $128 * 7, %rax jle .L12 KERNEL1 (32 * 7) KERNEL2 (32 * 7) KERNEL3 (32 * 7) KERNEL4 (32 * 7) KERNEL5 (32 * 7) KERNEL6 (32 * 7) KERNEL7 (32 * 7) KERNEL8 (32 * 7) KERNEL9 (32 * 7) KERNEL10(32 * 7) KERNEL11(32 * 7) KERNEL12(32 * 7) KERNEL13(32 * 7) KERNEL14(32 * 7) KERNEL15(32 * 7) KERNEL16(32 * 7) addq $64 * 8 * SIZE, AO addq $64 * 8 * SIZE, BO subq $128 * 8, %rax jg .L1X .L12: leaq (AO, %rax, 2), AO # * 16 leaq (BO, %rax, 2), BO # * 64 #else sarq $3, %rax je .L15 ALIGN_4 .L12: KERNEL1 (32 * 0) KERNEL2 (32 * 0) KERNEL3 (32 * 0) KERNEL4 (32 * 0) KERNEL5 (32 * 0) KERNEL6 (32 * 0) KERNEL7 (32 * 0) KERNEL8 (32 * 0) KERNEL9 (32 * 0) KERNEL10(32 * 0) KERNEL11(32 * 0) KERNEL12(32 * 0) KERNEL13(32 * 0) KERNEL14(32 * 0) KERNEL15(32 * 0) KERNEL16(32 * 0) addq $64 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L12 #endif ALIGN_4 .L15: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA_R, %xmm14 movaps ALPHA_I, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 ADDSUB %xmm9, %xmm1 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 ADDSUB %xmm9, %xmm3 movsldup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm4 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 ADDSUB %xmm9, %xmm5 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm6 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 ADDSUB %xmm9, %xmm7 movsldup 8 * SIZE(BO), %xmm9 addq $8 * SIZE, AO addq $8 * SIZE, BO decq %rax jg .L16 ALIGN_4 .L18: #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm1, %xmm1 shufps $0xb1, %xmm3, %xmm3 shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 addsubps %xmm1, %xmm0 addsubps %xmm3, %xmm2 addsubps %xmm5, %xmm4 addsubps %xmm7, %xmm6 movaps %xmm0, %xmm1 movaps %xmm2, %xmm3 movaps %xmm4, %xmm5 movaps %xmm6, %xmm7 shufps $0xb1, %xmm0, %xmm0 shufps $0xb1, %xmm2, %xmm2 shufps $0xb1, %xmm4, %xmm4 shufps $0xb1, %xmm6, %xmm6 #else shufps $0xb1, %xmm0, %xmm0 shufps $0xb1, %xmm2, %xmm2 shufps $0xb1, %xmm4, %xmm4 shufps $0xb1, %xmm6, %xmm6 addsubps %xmm0, %xmm1 addsubps %xmm2, %xmm3 addsubps %xmm4, %xmm5 addsubps %xmm6, %xmm7 movaps %xmm1, %xmm0 movaps %xmm3, %xmm2 movaps %xmm5, %xmm4 movaps %xmm7, %xmm6 shufps $0xb1, %xmm1, %xmm1 shufps $0xb1, %xmm3, %xmm3 shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 #endif mulps %xmm14, %xmm1 mulps %xmm15, %xmm0 mulps %xmm14, %xmm3 mulps %xmm15, %xmm2 mulps %xmm14, %xmm5 mulps %xmm15, %xmm4 mulps %xmm14, %xmm7 mulps %xmm15, %xmm6 addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm5, %xmm4 addps %xmm7, %xmm6 #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) shufps $0xe4, %xmm8, %xmm8 shufps $0xe4, %xmm9, %xmm9 shufps $0xe4, %xmm10, %xmm10 shufps $0xe4, %xmm11, %xmm11 movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 4 * SIZE(CO1), %xmm10 movhps 6 * SIZE(CO1), %xmm10 movsd 0 * SIZE(CO2), %xmm9 movhps 2 * SIZE(CO2), %xmm9 movsd 4 * SIZE(CO2), %xmm11 movhps 6 * SIZE(CO2), %xmm11 addps %xmm8, %xmm0 addps %xmm9, %xmm2 addps %xmm10, %xmm4 addps %xmm11, %xmm6 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movsd %xmm4, 4 * SIZE(CO1) movhps %xmm4, 6 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhps %xmm2, 2 * SIZE(CO2) movsd %xmm6, 4 * SIZE(CO2) movhps %xmm6, 6 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 ALIGN_4 .L20: testq $2, M je .L30 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movaps 16 * SIZE(AO), %xmm10 pxor %xmm1, %xmm1 movsldup 0 * SIZE(BO), %xmm9 pxor %xmm2, %xmm2 movsldup 16 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movsldup 32 * SIZE(BO), %xmm13 movsldup 48 * SIZE(BO), %xmm15 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulps %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 ADDSUB %xmm9, %xmm1 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 ADDSUB %xmm9, %xmm3 movsldup 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 ADDSUB %xmm9, %xmm1 movsldup 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movshdup 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 ADDSUB %xmm9, %xmm3 movsldup 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movshdup 16 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 ADDSUB %xmm11, %xmm1 movsldup 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movshdup 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movaps 12 * SIZE(AO), %xmm8 ADDSUB %xmm11, %xmm3 movsldup 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movshdup 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 ADDSUB %xmm11, %xmm1 movsldup 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movshdup 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movaps 32 * SIZE(AO), %xmm8 ADDSUB %xmm11, %xmm3 movsldup 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movshdup 32 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 ADDSUB %xmm13, %xmm1 movsldup 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movshdup 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movaps 20 * SIZE(AO), %xmm10 ADDSUB %xmm13, %xmm3 movsldup 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movshdup 40 * 
SIZE(BO), %xmm13 mulps %xmm10, %xmm13 ADDSUB %xmm13, %xmm1 movsldup 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movshdup 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movaps 24 * SIZE(AO), %xmm10 ADDSUB %xmm13, %xmm3 movsldup 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movshdup 48 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 ADDSUB %xmm15, %xmm1 movsldup 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movshdup 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movaps 28 * SIZE(AO), %xmm10 ADDSUB %xmm15, %xmm3 movsldup 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movshdup 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 ADDSUB %xmm15, %xmm1 movsldup 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movshdup 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movaps 48 * SIZE(AO), %xmm10 ADDSUB %xmm15, %xmm3 movsldup 112 * SIZE(BO), %xmm15 addq $32 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA_R, %xmm14 movaps ALPHA_I, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 ADDSUB %xmm9, %xmm1 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 ADDSUB %xmm9, %xmm3 movsldup 8 * SIZE(BO), %xmm9 addq $ 4 * SIZE, AO addq $ 8 * SIZE, BO decq %rax jg .L26 ALIGN_4 .L28: #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm1, %xmm1 shufps $0xb1, %xmm3, %xmm3 addsubps %xmm1, %xmm0 addsubps %xmm3, %xmm2 movaps %xmm0, %xmm1 movaps %xmm2, %xmm3 shufps $0xb1, %xmm0, %xmm0 shufps $0xb1, %xmm2, %xmm2 #else shufps $0xb1, %xmm0, %xmm0 shufps $0xb1, %xmm2, %xmm2 addsubps %xmm0, %xmm1 addsubps %xmm2, %xmm3 movaps %xmm1, %xmm0 movaps %xmm3, %xmm2 shufps $0xb1, %xmm1, %xmm1 shufps $0xb1, %xmm3, %xmm3 #endif mulps %xmm14, %xmm1 mulps %xmm15, %xmm0 mulps %xmm14, %xmm3 mulps %xmm15, %xmm2 addps %xmm1, %xmm0 addps %xmm3, %xmm2 #if! 
defined(TRMMKERNEL) && !defined(BETAZERO) shufps $0xe4, %xmm8, %xmm8 shufps $0xe4, %xmm10, %xmm10 movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 0 * SIZE(CO2), %xmm10 movhps 2 * SIZE(CO2), %xmm10 addps %xmm8, %xmm0 addps %xmm10, %xmm2 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movsd %xmm2, 0 * SIZE(CO2) movhps %xmm2, 2 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 .L30: testq $1, M je .L39 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 8 * SIZE(AO), %xmm10 pxor %xmm1, %xmm1 movsd 0 * SIZE(BO), %xmm9 pxor %xmm2, %xmm2 movsd 16 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movsd 32 * SIZE(BO), %xmm13 movsd 48 * SIZE(BO), %xmm15 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $2, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L35 ALIGN_4 .L32: shufps $0x50, %xmm9, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movsd 12 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 64 * SIZE(BO), %xmm9 shufps $0x50, %xmm11, %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movsd 20 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm8, %xmm11 movddup 6 * SIZE(AO), %xmm8 addps %xmm11, %xmm1 movsd 24 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movsd 28 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm8, %xmm11 movddup 16 * SIZE(AO), %xmm8 addps %xmm11, %xmm1 movsd 80 * SIZE(BO), %xmm11 shufps $0x50, %xmm13, %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movsd 36 * SIZE(BO), %xmm13 shufps $0x50, %xmm13, %xmm13 mulps %xmm10, %xmm13 movddup 10 * SIZE(AO), %xmm10 addps %xmm13, %xmm1 movsd 40 * SIZE(BO), %xmm13 shufps $0x50, %xmm13, %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movsd 44 * SIZE(BO), %xmm13 shufps $0x50, %xmm13, %xmm13 mulps %xmm10, %xmm13 movddup 12 * SIZE(AO), %xmm10 addps %xmm13, %xmm1 movsd 96 * SIZE(BO), %xmm13 shufps $0x50, %xmm15, %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movsd 52 * SIZE(BO), %xmm15 shufps $0x50, %xmm15, %xmm15 mulps %xmm10, %xmm15 movddup 14 * SIZE(AO), %xmm10 addps %xmm15, %xmm1 movsd 56 * SIZE(BO), %xmm15 shufps $0x50, %xmm15, %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movsd 60 * SIZE(BO), %xmm15 shufps $0x50, %xmm15, %xmm15 mulps %xmm10, %xmm15 movddup 24 * SIZE(AO), %xmm10 addps %xmm15, %xmm1 movsd 112 * SIZE(BO), %xmm15 addq $16 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA_R, %xmm14 movaps ALPHA_I, 
%xmm15 andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 addq $2 * SIZE, AO addq $8 * SIZE, BO decq %rax jg .L36 ALIGN_4 .L38: movaps %xmm0, %xmm6 movlhps %xmm1, %xmm0 movhlps %xmm6, %xmm1 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) cmpeqps %xmm7, %xmm7 pslld $31, %xmm7 xorps %xmm7, %xmm1 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm1, %xmm1 addsubps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $0xb1, %xmm0, %xmm0 #else shufps $0xb1, %xmm0, %xmm0 addsubps %xmm0, %xmm1 movaps %xmm1, %xmm0 shufps $0xb1, %xmm1, %xmm1 #endif mulps %xmm14, %xmm1 mulps %xmm15, %xmm0 addps %xmm1, %xmm0 #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhps 0 * SIZE(CO2), %xmm8 addps %xmm8, %xmm0 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 0 * SIZE(CO2) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK #endif leaq (C, LDC, 2), C # c += 2 * ldc decq J # j -- jg .L01 ALIGN_4 .L40: testq $1, N je .L999 ALIGN_4 .L41: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO movq K, %rax sarq $3, %rax jle .L43 ALIGN_4 .L42: movddup 0 * SIZE(B), %xmm0 movddup 2 * SIZE(B), %xmm1 movddup 4 * SIZE(B), %xmm2 movddup 6 * SIZE(B), %xmm3 movddup 8 * SIZE(B), %xmm4 movddup 10 * SIZE(B), %xmm5 movddup 12 * SIZE(B), %xmm6 movddup 14 * SIZE(B), %xmm7 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) prefetcht1 128 * SIZE(BO) prefetcht0 112 * SIZE(B) addq $16 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L42 ALIGN_4 .L43: movq K, %rax andq $7, %rax BRANCH jle .L50 ALIGN_4 .L44: movddup 0 * SIZE(B), %xmm0 movaps %xmm0, 0 * SIZE(BO) addq $2 * SIZE, B addq $4 * SIZE, BO decq %rax jne .L44 ALIGN_4 .L50: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a movq M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_4 .L51: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif movaps 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movaps 16 * SIZE(AO), %xmm10 pxor %xmm1, %xmm1 movaps 32 * SIZE(AO), %xmm12 pxor %xmm4, %xmm4 movaps 48 * SIZE(AO), %xmm14 pxor %xmm5, %xmm5 movsldup 0 * SIZE(BO), %xmm9 movsldup 16 * SIZE(BO), %xmm11 prefetchnta 4 * SIZE(CO1) #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L55 ALIGN_4 .L52: mulps %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * 
SIZE(AO) addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 ADDSUB %xmm9, %xmm1 movsldup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm4 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 ADDSUB %xmm9, %xmm5 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 12 * SIZE(AO), %xmm8 ADDSUB %xmm9, %xmm1 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm4 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 64 * SIZE(AO), %xmm8 ADDSUB %xmm9, %xmm5 movsldup 8 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movshdup 8 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movaps 20 * SIZE(AO), %xmm10 ADDSUB %xmm9, %xmm1 movsldup 8 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm4 movshdup 8 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movaps 24 * SIZE(AO), %xmm10 ADDSUB %xmm9, %xmm5 movsldup 12 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movshdup 12 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movaps 28 * SIZE(AO), %xmm10 ADDSUB %xmm9, %xmm1 movsldup 12 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm4 movshdup 12 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movaps 80 * SIZE(AO), %xmm10 ADDSUB %xmm9, %xmm5 movsldup 32 * SIZE(BO), %xmm9 mulps %xmm12, %xmm11 PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) addps %xmm11, %xmm0 movshdup 16 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 movaps 36 * SIZE(AO), %xmm12 ADDSUB %xmm11, %xmm1 movsldup 16 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 addps %xmm11, %xmm4 movshdup 16 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 movaps 40 * SIZE(AO), %xmm12 ADDSUB %xmm11, %xmm5 movsldup 20 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 addps %xmm11, %xmm0 movshdup 20 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 movaps 44 * SIZE(AO), %xmm12 ADDSUB %xmm11, %xmm1 movsldup 20 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 addps %xmm11, %xmm4 movshdup 20 * SIZE(BO), %xmm11 mulps %xmm12, %xmm11 movaps 96 * SIZE(AO), %xmm12 ADDSUB %xmm11, %xmm5 movsldup 24 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 addps %xmm11, %xmm0 movshdup 24 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 movaps 52 * SIZE(AO), %xmm14 ADDSUB %xmm11, %xmm1 movsldup 24 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 addps %xmm11, %xmm4 movshdup 24 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 movaps 56 * SIZE(AO), %xmm14 ADDSUB %xmm11, %xmm5 movsldup 28 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 addps %xmm11, %xmm0 movshdup 28 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 movaps 60 * SIZE(AO), %xmm14 ADDSUB %xmm11, %xmm1 movsldup 28 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 addps %xmm11, %xmm4 movshdup 28 * SIZE(BO), %xmm11 mulps %xmm14, %xmm11 movaps 112 * SIZE(AO), %xmm14 ADDSUB %xmm11, %xmm5 movsldup 48 * SIZE(BO), %xmm11 addq $64 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L55: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA_R, %xmm14 movaps ALPHA_I, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 ADDSUB %xmm9, %xmm1 movsldup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm4 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 ADDSUB %xmm9, %xmm5 movsldup 4 * SIZE(BO), %xmm9 addq $ 8 * SIZE, AO addq $ 4 * SIZE, BO decq %rax jg .L56 ALIGN_4 .L58: #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm1, %xmm1 shufps $0xb1, %xmm5, %xmm5 
addsubps %xmm1, %xmm0 addsubps %xmm5, %xmm4 movaps %xmm0, %xmm1 movaps %xmm4, %xmm5 shufps $0xb1, %xmm0, %xmm0 shufps $0xb1, %xmm4, %xmm4 #else shufps $0xb1, %xmm0, %xmm0 shufps $0xb1, %xmm4, %xmm4 addsubps %xmm0, %xmm1 addsubps %xmm4, %xmm5 movaps %xmm1, %xmm0 movaps %xmm5, %xmm4 shufps $0xb1, %xmm1, %xmm1 shufps $0xb1, %xmm5, %xmm5 #endif mulps %xmm14, %xmm1 mulps %xmm15, %xmm0 mulps %xmm14, %xmm5 mulps %xmm15, %xmm4 addps %xmm1, %xmm0 addps %xmm5, %xmm4 #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 movsd 4 * SIZE(CO1), %xmm9 movhps 6 * SIZE(CO1), %xmm9 addps %xmm8, %xmm0 addps %xmm9, %xmm4 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) movsd %xmm4, 4 * SIZE(CO1) movhps %xmm4, 6 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L51 ALIGN_4 .L60: testq $2, M je .L70 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif movaps 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movsldup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movaps 16 * SIZE(AO), %xmm10 movsldup 16 * SIZE(BO), %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulps %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm9, %xmm0 movshdup 0 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 ADDSUB %xmm9, %xmm1 movsldup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 8 * SIZE(AO), %xmm8 ADDSUB %xmm9, %xmm1 movsldup 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 12 * SIZE(AO), %xmm8 ADDSUB %xmm9, %xmm1 movsldup 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 32 * SIZE(AO), %xmm8 ADDSUB %xmm9, %xmm1 movsldup 32 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movshdup 16 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movaps 20 * SIZE(AO), %xmm10 ADDSUB %xmm11, %xmm1 movsldup 20 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movshdup 20 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movaps 24 * SIZE(AO), %xmm10 ADDSUB %xmm11, %xmm1 movsldup 24 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movshdup 24 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movaps 28 * SIZE(AO), %xmm10 ADDSUB %xmm11, %xmm1 movsldup 28 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movshdup 28 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movaps 48 * SIZE(AO), %xmm10 ADDSUB %xmm11, %xmm1 movsldup 48 * SIZE(BO), %xmm11 addq $32 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA_R, %xmm14 movaps ALPHA_I, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movshdup 0 * 
SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movaps 4 * SIZE(AO), %xmm8 ADDSUB %xmm9, %xmm1 movsldup 4 * SIZE(BO), %xmm9 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L68: #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm1, %xmm1 addsubps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $0xb1, %xmm0, %xmm0 #else shufps $0xb1, %xmm0, %xmm0 addsubps %xmm0, %xmm1 movaps %xmm1, %xmm0 shufps $0xb1, %xmm1, %xmm1 #endif mulps %xmm14, %xmm1 mulps %xmm15, %xmm0 addps %xmm1, %xmm0 #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 movhps 2 * SIZE(CO1), %xmm8 addps %xmm8, %xmm0 #endif movsd %xmm0, 0 * SIZE(CO1) movhps %xmm0, 2 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax leaq (,%rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L70: testq $1, M je .L999 #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER, BO #else leaq BUFFER, BO movq KK, %rax leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movsd 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movddup 8 * SIZE(AO), %xmm10 movsd 16 * SIZE(BO), %xmm11 #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax #else addq $1, %rax #endif movq %rax, KKK #endif sarq $3, %rax je .L75 ALIGN_4 .L72: shufps $0x50, %xmm9, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulps %xmm8, %xmm9 movddup 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 8 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 6 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 12 * SIZE(BO), %xmm9 shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 16 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movsd 32 * SIZE(BO), %xmm9 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 10 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movsd 20 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 12 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 24 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 14 * SIZE(AO), %xmm10 addps %xmm11, %xmm0 movsd 28 * SIZE(BO), %xmm11 shufps $0x50, %xmm11, %xmm11 mulps %xmm10, %xmm11 movddup 24 * SIZE(AO), %xmm10 addps %xmm11, %xmm1 movsd 48 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif movaps ALPHA_R, %xmm14 movaps ALPHA_I, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: shufps $0x50, %xmm9, %xmm9 mulps %xmm8, %xmm9 movddup 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm0 movsd 4 * SIZE(BO), %xmm9 addq $2 * SIZE, AO addq $4 * SIZE, BO decq %rax jg .L76 ALIGN_4 .L78: addps %xmm1, %xmm0 movhlps %xmm0, %xmm1 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) cmpeqps %xmm7, %xmm7 pslld $31, %xmm7 xorps %xmm7, %xmm1 #endif #if defined(NN) 
|| defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm1, %xmm1 addsubps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $0xb1, %xmm0, %xmm0 #else shufps $0xb1, %xmm0, %xmm0 addsubps %xmm0, %xmm1 movaps %xmm1, %xmm0 shufps $0xb1, %xmm1, %xmm1 #endif mulps %xmm14, %xmm1 mulps %xmm15, %xmm0 addps %xmm1, %xmm0 #if! defined(TRMMKERNEL) && !defined(BETAZERO) movsd 0 * SIZE(CO1), %xmm8 addps %xmm8, %xmm0 #endif movsd %xmm0, 0 * SIZE(CO1) ALIGN_4 .L999: movq %rbx, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm_kernel_4x4_sandy.S000066400000000000000000002361201313527062700220660ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ #define ASSEMBLER #include "common.h" #define old_bm %rdi #define old_bn %rsi #define old_bk %rdx #define bm %r13 #define bn %r14 #define bk %r15 #define ALPHA %xmm0 #define ba %rcx #define bb %r8 #define C %r9 #define ldc %r10 #define i %r11 #define k %rax #define ptrba %rdi #define ptrbb %rsi #define C0 %rbx #define C1 %rbp #define prebb %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define old_ldc 8+STACKSIZE(%rsp) #define old_offset 16+STACKSIZE(%rsp) #define MEMALPHA_R 48(%rsp) #define MEMALPHA_I 56(%rsp) #define j 64(%rsp) #define OFFSET 72(%rsp) #define kk 80(%rsp) #define kkk 88(%rsp) #else #define STACKSIZE 512 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define old_ldc 72 + STACKSIZE(%rsp) #define old_offset 80 + STACKSIZE(%rsp) #define MEMALPHA_R 224(%rsp) #define MEMALPHA_I 232(%rsp) #define j 240(%rsp) #define OFFSET 248(%rsp) #define kk 256(%rsp) #define kkk 264(%rsp) #endif #define PREFETCH0 prefetcht0 #define PREFETCH1 prefetcht0 #define PREFETCH2 prefetcht0 #define PRESIZE 64 #define xvec0 %xmm0 #define xvec1 %xmm1 #define xvec2 %xmm2 #define xvec3 %xmm3 #define xvec4 %xmm4 #define xvec5 %xmm5 #define xvec6 %xmm6 #define xvec7 %xmm7 #define xvec8 %xmm8 #define xvec9 %xmm9 #define xvec10 %xmm10 #define xvec11 %xmm11 #define xvec12 %xmm12 #define xvec13 %xmm13 #define xvec14 %xmm14 #define xvec15 %xmm15 #define yvec0 %ymm0 #define yvec1 %ymm1 #define yvec2 %ymm2 #define yvec3 %ymm3 #define yvec4 %ymm4 #define yvec5 %ymm5 #define yvec6 %ymm6 #define yvec7 %ymm7 #define yvec8 %ymm8 #define yvec9 %ymm9 #define yvec10 %ymm10 #define yvec11 %ymm11 #define yvec12 %ymm12 #define yvec13 %ymm13 #define yvec14 %ymm14 #define yvec15 %ymm15 #define LEAQ leaq #define ADDQ addq #define MULQ imulq #define SARQ sarq #define SALQ salq #define ANDQ andq #define SUBQ subq #define DECQ decq #define JG jg #define JLE jle #define TEST testq #define OR orq #define JNE jne #define JMP jmp #define NOP #define XOR xorpd #undef MOVQ #define MOVQ movq #define XOR_DY vxorpd #define XOR_DX vxorpd #define LD_DY vmovapd #define LD_DX vmovapd #define LDL_DY vmovlpd #define LDL_DX vmovlpd #define LDH_DY vmovhpd #define LDH_DX vmovhpd #define ST_DY vmovapd #define ST_DX vmovapd #define STL_DY vmovlpd #define STL_DX vmovlpd #define STH_DY vmovhpd #define STH_DX vmovhpd #define EDUP_DY vmovddup #define ADD_DY vaddpd #define ADD_DX vaddpd #define SUB_DY vsubpd #define SUB_DX vsubpd #define ADDSUB_DY vaddsubpd #define ADDSUB_DX vaddsubpd #define MUL_DY vmulpd #define MUL_DX vmulpd #define SHUF_DY vperm2f128 #define SHUF_DX vpshufd #define VPERMILP_DY vpermilpd #define BROAD_DY vbroadcastsd #define BROAD_DX vmovddup #define MOV_DY vmovapd #define MOV_DX vmovapd #define REVS_DY vshufpd #define REVS_DX vmovsd #define EXTRA_DY vextractf128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADD1_DX ADD_DX #define ADD1_DY ADD_DY #define ADD2_DY ADDSUB_DY #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define ADD1_DX SUB_DX #define ADD1_DY SUB_DY #define ADD2_DY ADDSUB_DY #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADD1_DX SUB_DX #define ADD1_DY SUB_DY #define ADD2_DY ADDSUB_DY #else #define ADD1_DX ADD_DX #define ADD1_DY ADD_DY #define ADD2_DY ADDSUB_DY #endif PROLOGUE subq $STACKSIZE, %rsp; movq %rbx, 0(%rsp); movq %rbp, 8(%rsp); movq %r12, 16(%rsp); movq %r13, 24(%rsp); 
movq %r14, 32(%rsp); movq %r15, 40(%rsp); #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, old_bm movq ARG2, old_bn movq ARG3, old_bk movq OLD_A, ba movq OLD_B, bb movq OLD_C, C movq old_ldc, ldc #ifdef TRMMKERNEL movq old_offset, %r11 #endif movaps %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #else movq old_ldc, ldc #ifdef TRMMKERNEL movq old_offset, %r11; #endif #endif vzeroupper vmovlps %xmm0, MEMALPHA_R vmovlps %xmm1, MEMALPHA_I movq old_bm, bm movq old_bn, bn movq old_bk, bk salq $ZBASE_SHIFT, ldc #ifdef TRMMKERNEL movq %r11, OFFSET #ifndef LEFT negq %r11; #endif movq %r11, kk; #endif MOVQ bn,j; SARQ $2,j; # Rn = 4 JLE .L0_loopE; ALIGN_5; .L0_bodyB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; MOVQ %rax, kk; #endif MOVQ C,C0; LEAQ (C,ldc,2),C1; MOVQ bk, k; SALQ $6, k; LEAQ (bb, k, 1), prebb; # Rn=4 SIZE=8 COMPLEX=2 MOVQ ba,ptrba; MOVQ bm,i; SARQ $2,i; # Rm = 4 JLE .L1_loopE; ALIGN_5; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif PREFETCH0 0*SIZE(prebb); PREFETCH0 8*SIZE(prebb); PREFETCH0 16*SIZE(prebb) ADDQ $24*SIZE, prebb; # Initial Results Register XOR_DY yvec15, yvec15, yvec15; XOR_DY yvec14, yvec14, yvec14; EDUP_DY 0*SIZE(ptrbb), yvec2; # Br1, Br1, Br2, Br2 XOR_DY yvec13, yvec13, yvec13; XOR_DY yvec12, yvec12, yvec12; EDUP_DY 4*SIZE(ptrbb), yvec3; # Br3, Br3, Br4, Br4 PREFETCH2 3*SIZE(C0); PREFETCH2 3*SIZE(C1); XOR_DY yvec11, yvec11, yvec11; XOR_DY yvec10, yvec10, yvec10; LD_DY 0*SIZE(ptrba), yvec0; # Ar1, Ai1, Ar2, Ai2 PREFETCH2 7*SIZE(C0, ldc, 1); PREFETCH2 7*SIZE(C1, ldc, 1); XOR_DY yvec9, yvec9, yvec9; XOR_DY yvec8, yvec8, yvec8; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; #else ADDQ $4, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2,k; # Unroll 4 times JLE .L2_loopE; ALIGN_5; .L2_bodyB:; #### Computing kernel #### #### Unroll time 1 #### LD_DY 4*SIZE(ptrba), yvec1; MUL_DY yvec0, yvec2, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; # Br2, Br2, Br1, Br1 MUL_DY yvec0, yvec3, yvec7; SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3 ADD1_DY yvec6, yvec15, yvec15; ADD1_DY yvec7, yvec11, yvec11; PREFETCH0 PRESIZE*SIZE(ptrba); MUL_DY yvec1, yvec2, yvec6; EDUP_DY 1*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2 MUL_DY yvec1, yvec3, yvec7; EDUP_DY 5*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4 ADD1_DY yvec6, yvec14, yvec14; ADD1_DY yvec7, yvec10, yvec10; MUL_DY yvec0, yvec4, yvec6; MUL_DY yvec0, yvec5, yvec7; VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2 ADD1_DY yvec6, yvec13, yvec13; ADD1_DY yvec7, yvec9, yvec9; MUL_DY yvec1, yvec4, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1 MUL_DY yvec1, yvec5, yvec7; SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3 ADD1_DY yvec6, yvec12, yvec12; ADD1_DY yvec7, yvec8, yvec8; VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4 MUL_DY yvec0, yvec2, yvec6; MUL_DY yvec0, yvec3, yvec7; ADD2_DY yvec6, yvec15, yvec15; ADD2_DY yvec7, yvec11, yvec11; MUL_DY yvec1, 
yvec2, yvec6; EDUP_DY 8*SIZE(ptrbb), yvec2; MUL_DY yvec1, yvec3, yvec7; EDUP_DY 12*SIZE(ptrbb), yvec3; ADD2_DY yvec6, yvec14, yvec14; ADD2_DY yvec7, yvec10, yvec10; MUL_DY yvec0, yvec4, yvec6; MUL_DY yvec0, yvec5, yvec7; LD_DY 8*SIZE(ptrba), yvec0; ADD2_DY yvec6, yvec13, yvec13; ADD2_DY yvec7, yvec9, yvec9; MUL_DY yvec1, yvec4, yvec6; MUL_DY yvec1, yvec5, yvec7; ADD2_DY yvec6, yvec12, yvec12; ADD2_DY yvec7, yvec8, yvec8; #### Unroll time 2 #### LD_DY 12*SIZE(ptrba), yvec1; MUL_DY yvec0, yvec2, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3 ADD1_DY yvec6, yvec15, yvec15; ADD1_DY yvec7, yvec11, yvec11; PREFETCH0 (PRESIZE+8)*SIZE(ptrba); MUL_DY yvec1, yvec2, yvec6; EDUP_DY 9*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2 MUL_DY yvec1, yvec3, yvec7; EDUP_DY 13*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4 ADD1_DY yvec6, yvec14, yvec14; ADD1_DY yvec7, yvec10, yvec10; MUL_DY yvec0, yvec4, yvec6; MUL_DY yvec0, yvec5, yvec7; VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2 ADD1_DY yvec6, yvec13, yvec13; ADD1_DY yvec7, yvec9, yvec9; MUL_DY yvec1, yvec4, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1 MUL_DY yvec1, yvec5, yvec7; SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3 ADD1_DY yvec6, yvec12, yvec12; ADD1_DY yvec7, yvec8, yvec8; VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4 MUL_DY yvec0, yvec2, yvec6; MUL_DY yvec0, yvec3, yvec7; ADD2_DY yvec6, yvec15, yvec15; ADD2_DY yvec7, yvec11, yvec11; MUL_DY yvec1, yvec2, yvec6; EDUP_DY 16*SIZE(ptrbb), yvec2; MUL_DY yvec1, yvec3, yvec7; EDUP_DY 20*SIZE(ptrbb), yvec3; ADD2_DY yvec6, yvec14, yvec14; ADD2_DY yvec7, yvec10, yvec10; MUL_DY yvec0, yvec4, yvec6; MUL_DY yvec0, yvec5, yvec7; LD_DY 16*SIZE(ptrba), yvec0; ADD2_DY yvec6, yvec13, yvec13; ADD2_DY yvec7, yvec9, yvec9; MUL_DY yvec1, yvec4, yvec6; MUL_DY yvec1, yvec5, yvec7; ADD2_DY yvec6, yvec12, yvec12; ADD2_DY yvec7, yvec8, yvec8; #### Unroll time 3 #### LD_DY 20*SIZE(ptrba), yvec1; MUL_DY yvec0, yvec2, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3 ADD1_DY yvec6, yvec15, yvec15; ADD1_DY yvec7, yvec11, yvec11; PREFETCH0 (PRESIZE+16)*SIZE(ptrba); MUL_DY yvec1, yvec2, yvec6; EDUP_DY 17*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2 MUL_DY yvec1, yvec3, yvec7; EDUP_DY 21*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4 ADD1_DY yvec6, yvec14, yvec14; ADD1_DY yvec7, yvec10, yvec10; MUL_DY yvec0, yvec4, yvec6; MUL_DY yvec0, yvec5, yvec7; VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2 ADD1_DY yvec6, yvec13, yvec13; ADD1_DY yvec7, yvec9, yvec9; MUL_DY yvec1, yvec4, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1 MUL_DY yvec1, yvec5, yvec7; SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3 ADD1_DY yvec6, yvec12, yvec12; ADD1_DY yvec7, yvec8, yvec8; VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4 MUL_DY yvec0, yvec2, yvec6; MUL_DY yvec0, yvec3, yvec7; ADD2_DY yvec6, yvec15, yvec15; ADD2_DY yvec7, yvec11, yvec11; MUL_DY yvec1, yvec2, yvec6; EDUP_DY 24*SIZE(ptrbb), yvec2; MUL_DY yvec1, yvec3, yvec7; EDUP_DY 28*SIZE(ptrbb), yvec3; ADD2_DY yvec6, yvec14, yvec14; ADD2_DY yvec7, yvec10, yvec10; MUL_DY yvec0, yvec4, yvec6; MUL_DY yvec0, yvec5, yvec7; LD_DY 24*SIZE(ptrba), yvec0; ADD2_DY yvec6, yvec13, yvec13; ADD2_DY yvec7, yvec9, yvec9; MUL_DY yvec1, yvec4, yvec6; MUL_DY yvec1, yvec5, yvec7; ADD2_DY yvec6, yvec12, yvec12; ADD2_DY yvec7, yvec8, yvec8; #### Unroll time 4 #### LD_DY 28*SIZE(ptrba), yvec1; MUL_DY 
yvec0, yvec2, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3 ADDQ $32*SIZE, ptrba; ADD1_DY yvec6, yvec15, yvec15; ADD1_DY yvec7, yvec11, yvec11; PREFETCH0 (PRESIZE+24)*SIZE(ptrba); MUL_DY yvec1, yvec2, yvec6; EDUP_DY 25*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2 MUL_DY yvec1, yvec3, yvec7; EDUP_DY 29*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4 ADD1_DY yvec6, yvec14, yvec14; ADD1_DY yvec7, yvec10, yvec10; MUL_DY yvec0, yvec4, yvec6; MUL_DY yvec0, yvec5, yvec7; VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2 ADDQ $32*SIZE, ptrbb; ADD1_DY yvec6, yvec13, yvec13; ADD1_DY yvec7, yvec9, yvec9; MUL_DY yvec1, yvec4, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1 MUL_DY yvec1, yvec5, yvec7; SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3 ADD1_DY yvec6, yvec12, yvec12; ADD1_DY yvec7, yvec8, yvec8; VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4 MUL_DY yvec0, yvec2, yvec6; MUL_DY yvec0, yvec3, yvec7; ADD2_DY yvec6, yvec15, yvec15; ADD2_DY yvec7, yvec11, yvec11; MUL_DY yvec1, yvec2, yvec6; EDUP_DY 0*SIZE(ptrbb), yvec2; MUL_DY yvec1, yvec3, yvec7; EDUP_DY 4*SIZE(ptrbb), yvec3; ADD2_DY yvec6, yvec14, yvec14; ADD2_DY yvec7, yvec10, yvec10; MUL_DY yvec0, yvec4, yvec6; MUL_DY yvec0, yvec5, yvec7; LD_DY 0*SIZE(ptrba), yvec0; ADD2_DY yvec6, yvec13, yvec13; ADD2_DY yvec7, yvec9, yvec9; MUL_DY yvec1, yvec4, yvec6; MUL_DY yvec1, yvec5, yvec7; ADD2_DY yvec6, yvec12, yvec12; ADD2_DY yvec7, yvec8, yvec8; DECQ k; JG .L2_bodyB; ALIGN_5 .L2_loopE:; #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L3_loopE; ALIGN_5 .L3_bodyB: #### Unroll time 1 #### LD_DY 4*SIZE(ptrba), yvec1; MUL_DY yvec0, yvec2, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; # Br2, Br2, Br1, Br1 MUL_DY yvec0, yvec3, yvec7; SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3 ADD1_DY yvec6, yvec15, yvec15; ADD1_DY yvec7, yvec11, yvec11; PREFETCH0 PRESIZE*SIZE(ptrba); MUL_DY yvec1, yvec2, yvec6; EDUP_DY 1*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2 MUL_DY yvec1, yvec3, yvec7; EDUP_DY 5*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4 ADD1_DY yvec6, yvec14, yvec14; ADD1_DY yvec7, yvec10, yvec10; MUL_DY yvec0, yvec4, yvec6; MUL_DY yvec0, yvec5, yvec7; VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2 ADD1_DY yvec6, yvec13, yvec13; ADD1_DY yvec7, yvec9, yvec9; MUL_DY yvec1, yvec4, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1 MUL_DY yvec1, yvec5, yvec7; SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3 ADD1_DY yvec6, yvec12, yvec12; ADD1_DY yvec7, yvec8, yvec8; VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4 MUL_DY yvec0, yvec2, yvec6; MUL_DY yvec0, yvec3, yvec7; ADD2_DY yvec6, yvec15, yvec15; ADD2_DY yvec7, yvec11, yvec11; MUL_DY yvec1, yvec2, yvec6; EDUP_DY 8*SIZE(ptrbb), yvec2; MUL_DY yvec1, yvec3, yvec7; EDUP_DY 12*SIZE(ptrbb), yvec3; ADD2_DY yvec6, yvec14, yvec14; ADD2_DY yvec7, yvec10, yvec10; MUL_DY yvec0, yvec4, yvec6; MUL_DY yvec0, yvec5, yvec7; LD_DY 8*SIZE(ptrba), yvec0; ADD2_DY yvec6, yvec13, yvec13; ADD2_DY yvec7, yvec9, yvec9; MUL_DY yvec1, yvec4, yvec6; MUL_DY yvec1, yvec5, yvec7; ADD2_DY yvec6, yvec12, yvec12; ADD2_DY yvec7, yvec8, yvec8; #### Unroll time 2 #### LD_DY 12*SIZE(ptrba), yvec1; MUL_DY yvec0, yvec2, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3 ADDQ $16*SIZE, ptrba ADD1_DY yvec6, yvec15, yvec15; ADD1_DY yvec7, yvec11, yvec11; PREFETCH0 (PRESIZE+8)*SIZE(ptrba); MUL_DY yvec1, yvec2, yvec6; EDUP_DY 
9*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2 MUL_DY yvec1, yvec3, yvec7; EDUP_DY 13*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4 ADD1_DY yvec6, yvec14, yvec14; ADD1_DY yvec7, yvec10, yvec10; MUL_DY yvec0, yvec4, yvec6; MUL_DY yvec0, yvec5, yvec7; VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2 ADDQ $16*SIZE, ptrbb ADD1_DY yvec6, yvec13, yvec13; ADD1_DY yvec7, yvec9, yvec9; MUL_DY yvec1, yvec4, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1 MUL_DY yvec1, yvec5, yvec7; SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3 ADD1_DY yvec6, yvec12, yvec12; ADD1_DY yvec7, yvec8, yvec8; VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4 MUL_DY yvec0, yvec2, yvec6; MUL_DY yvec0, yvec3, yvec7; ADD2_DY yvec6, yvec15, yvec15; ADD2_DY yvec7, yvec11, yvec11; MUL_DY yvec1, yvec2, yvec6; EDUP_DY 0*SIZE(ptrbb), yvec2; MUL_DY yvec1, yvec3, yvec7; EDUP_DY 4*SIZE(ptrbb), yvec3; ADD2_DY yvec6, yvec14, yvec14; ADD2_DY yvec7, yvec10, yvec10; MUL_DY yvec0, yvec4, yvec6; MUL_DY yvec0, yvec5, yvec7; LD_DY 0*SIZE(ptrba), yvec0; ADD2_DY yvec6, yvec13, yvec13; ADD2_DY yvec7, yvec9, yvec9; MUL_DY yvec1, yvec4, yvec6; MUL_DY yvec1, yvec5, yvec7; ADD2_DY yvec6, yvec12, yvec12; ADD2_DY yvec7, yvec8, yvec8; .L3_loopE:; #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L4_loopE; ALIGN_5 .L4_loopB:; #### Unroll time 1 #### PREFETCH0 PRESIZE*SIZE(ptrba); LD_DY 4*SIZE(ptrba), yvec1; MUL_DY yvec0, yvec2, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3 ADDQ $8*SIZE, ptrba; ADD1_DY yvec6, yvec15, yvec15; ADD1_DY yvec7, yvec11, yvec11; MUL_DY yvec1, yvec2, yvec6; EDUP_DY 1*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2 MUL_DY yvec1, yvec3, yvec7; EDUP_DY 5*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4 ADD1_DY yvec6, yvec14, yvec14; ADD1_DY yvec7, yvec10, yvec10; MUL_DY yvec0, yvec4, yvec6; MUL_DY yvec0, yvec5, yvec7; VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2 ADDQ $8*SIZE, ptrbb; ADD1_DY yvec6, yvec13, yvec13; ADD1_DY yvec7, yvec9, yvec9; MUL_DY yvec1, yvec4, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1 MUL_DY yvec1, yvec5, yvec7; SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3 ADD1_DY yvec6, yvec12, yvec12; ADD1_DY yvec7, yvec8, yvec8; VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4 MUL_DY yvec0, yvec2, yvec6; MUL_DY yvec0, yvec3, yvec7; ADD2_DY yvec6, yvec15, yvec15; ADD2_DY yvec7, yvec11, yvec11; MUL_DY yvec1, yvec2, yvec6; MUL_DY yvec1, yvec3, yvec7; ADD2_DY yvec6, yvec14, yvec14; ADD2_DY yvec7, yvec10, yvec10; MUL_DY yvec0, yvec4, yvec6; MUL_DY yvec0, yvec5, yvec7; ADD2_DY yvec6, yvec13, yvec13; ADD2_DY yvec7, yvec9, yvec9; MUL_DY yvec1, yvec4, yvec6; MUL_DY yvec1, yvec5, yvec7; ADD2_DY yvec6, yvec12, yvec12; ADD2_DY yvec7, yvec8, yvec8; .L4_loopE:; #### Handle #### XOR_DY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ADDSUB_DY yvec15, yvec7, yvec15; ADDSUB_DY yvec14, yvec7, yvec14; ADDSUB_DY yvec13, yvec7, yvec13; ADDSUB_DY yvec12, yvec7, yvec12; ADDSUB_DY yvec11, yvec7, yvec11; ADDSUB_DY yvec10, yvec7, yvec10; ADDSUB_DY yvec9, yvec7, yvec9; ADDSUB_DY yvec8, yvec7, yvec8; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_DY yvec15, yvec7, yvec15; SUB_DY yvec14, yvec7, yvec14; SUB_DY yvec13, yvec7, yvec13; SUB_DY yvec12, yvec7, yvec12; SUB_DY yvec11, yvec7, yvec11; SUB_DY yvec10, yvec7, yvec10; SUB_DY yvec9, yvec7, yvec9; SUB_DY yvec8, yvec7, yvec8; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_DY 
$0x05, yvec15, yvec15; VPERMILP_DY $0x05, yvec14, yvec14; VPERMILP_DY $0x05, yvec13, yvec13; VPERMILP_DY $0x05, yvec12, yvec12; VPERMILP_DY $0x05, yvec11, yvec11; VPERMILP_DY $0x05, yvec10, yvec10; VPERMILP_DY $0x05, yvec9, yvec9; VPERMILP_DY $0x05, yvec8, yvec8; ADDSUB_DY yvec15, yvec7, yvec15; ADDSUB_DY yvec14, yvec7, yvec14; ADDSUB_DY yvec13, yvec7, yvec13; ADDSUB_DY yvec12, yvec7, yvec12; ADDSUB_DY yvec11, yvec7, yvec11; ADDSUB_DY yvec10, yvec7, yvec10; ADDSUB_DY yvec9, yvec7, yvec9; ADDSUB_DY yvec8, yvec7, yvec8; VPERMILP_DY $0x05, yvec15, yvec15; VPERMILP_DY $0x05, yvec14, yvec14; VPERMILP_DY $0x05, yvec13, yvec13; VPERMILP_DY $0x05, yvec12, yvec12; VPERMILP_DY $0x05, yvec11, yvec11; VPERMILP_DY $0x05, yvec10, yvec10; VPERMILP_DY $0x05, yvec9, yvec9; VPERMILP_DY $0x05, yvec8, yvec8; #endif #### Load Alpha #### BROAD_DY MEMALPHA_R,yvec7; BROAD_DY MEMALPHA_I,yvec6; #### Multiply Alpha #### VPERMILP_DY $0x05, yvec15, yvec5; MUL_DY yvec7, yvec15, yvec15; MUL_DY yvec6, yvec5, yvec5; ADDSUB_DY yvec5, yvec15, yvec15; VPERMILP_DY $0x05, yvec14, yvec4; MUL_DY yvec7, yvec14, yvec14; MUL_DY yvec6, yvec4, yvec4; ADDSUB_DY yvec4, yvec14, yvec14; VPERMILP_DY $0x05, yvec13, yvec3; MUL_DY yvec7, yvec13, yvec13; MUL_DY yvec6, yvec3, yvec3; ADDSUB_DY yvec3, yvec13, yvec13; VPERMILP_DY $0x05,yvec12, yvec2; MUL_DY yvec7, yvec12, yvec12; MUL_DY yvec6, yvec2, yvec2; ADDSUB_DY yvec2, yvec12, yvec12; VPERMILP_DY $0x05, yvec11, yvec1; MUL_DY yvec7, yvec11, yvec11; MUL_DY yvec6, yvec1, yvec1; ADDSUB_DY yvec1, yvec11, yvec11; VPERMILP_DY $0x05,yvec10, yvec0; MUL_DY yvec7, yvec10, yvec10; MUL_DY yvec6, yvec0, yvec0; ADDSUB_DY yvec0, yvec10, yvec10; VPERMILP_DY $0x05, yvec9, yvec5; MUL_DY yvec7, yvec9, yvec9; MUL_DY yvec6, yvec5, yvec5; ADDSUB_DY yvec5, yvec9, yvec9; VPERMILP_DY $0x05, yvec8, yvec4; MUL_DY yvec7, yvec8, yvec8; MUL_DY yvec6, yvec4, yvec4; ADDSUB_DY yvec4, yvec8, yvec8; #### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L4_loopEx; ALIGN_5 #### Store Back #### EXTRA_DY $1,yvec15,xvec7; EXTRA_DY $1,yvec14,xvec6; EXTRA_DY $1,yvec13,xvec5; EXTRA_DY $1,yvec12,xvec4; EXTRA_DY $1,yvec11,xvec3; EXTRA_DY $1,yvec10,xvec2; EXTRA_DY $1,yvec9,xvec1; EXTRA_DY $1,yvec8,xvec0; #ifndef TRMMKERNEL ADD_DY 0*SIZE(C0),xvec15, xvec15; ADD_DY 2*SIZE(C0,ldc,1), xvec7, xvec7; ADD_DY 4*SIZE(C0),xvec14, xvec14; ADD_DY 6*SIZE(C0,ldc,1),xvec6, xvec6; ADD_DY 0*SIZE(C0,ldc,1),xvec13, xvec13; ADD_DY 2*SIZE(C0),xvec5, xvec5; ADD_DY 4*SIZE(C0,ldc,1),xvec12, xvec12; ADD_DY 6*SIZE(C0),xvec4, xvec4; ADD_DY 0*SIZE(C1),xvec11, xvec11; ADD_DY 2*SIZE(C1,ldc,1),xvec3, xvec3; ADD_DY 4*SIZE(C1),xvec10, xvec10; ADD_DY 6*SIZE(C1,ldc,1),xvec2, xvec2; ADD_DY 0*SIZE(C1,ldc,1),xvec9, xvec9; ADD_DY 2*SIZE(C1),xvec1, xvec1; ADD_DY 4*SIZE(C1,ldc,1),xvec8, xvec8; ADD_DY 6*SIZE(C1),xvec0, xvec0; #endif ST_DY xvec15,0*SIZE(C0); ST_DY xvec7,2*SIZE(C0,ldc,1); ST_DY xvec14,4*SIZE(C0); ST_DY xvec6,6*SIZE(C0,ldc,1); ST_DY xvec13,0*SIZE(C0,ldc,1); ST_DY xvec5,2*SIZE(C0); ST_DY xvec12,4*SIZE(C0,ldc,1); ST_DY xvec4,6*SIZE(C0); ST_DY xvec11,0*SIZE(C1); ST_DY xvec3,2*SIZE(C1,ldc,1); ST_DY xvec10,4*SIZE(C1); ST_DY xvec2,6*SIZE(C1,ldc,1); ST_DY xvec9,0*SIZE(C1,ldc,1); ST_DY xvec1,2*SIZE(C1); ST_DY xvec8,4*SIZE(C1,ldc,1); ST_DY xvec0,6*SIZE(C1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, 
kk; #endif ADDQ $8*SIZE,C0; ADDQ $8*SIZE,C1; .L1_bodyE:; DECQ i; JG .L1_bodyB; JMP .L1_loopE; ALIGN_5 .L4_loopEx: EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; #ifndef TRMMKERNEL LDL_DY 0*SIZE(C0), xvec0, xvec0; LDH_DY 1*SIZE(C0), xvec0, xvec0; LDL_DY 2*SIZE(C0, ldc, 1), xvec1, xvec1; LDH_DY 3*SIZE(C0, ldc, 1), xvec1, xvec1; LDL_DY 4*SIZE(C0), xvec2, xvec2; LDH_DY 5*SIZE(C0), xvec2, xvec2; LDL_DY 6*SIZE(C0, ldc, 1), xvec3, xvec3; LDH_DY 7*SIZE(C0, ldc, 1), xvec3, xvec3; ADD_DY xvec0, xvec15, xvec15; ADD_DY xvec1, xvec7, xvec7; ADD_DY xvec2, xvec14, xvec14; ADD_DY xvec3, xvec6, xvec6; #endif STL_DY xvec15, 0*SIZE(C0); STH_DY xvec15, 1*SIZE(C0); STL_DY xvec7, 2*SIZE(C0, ldc, 1); STH_DY xvec7, 3*SIZE(C0, ldc, 1); STL_DY xvec14, 4*SIZE(C0); STH_DY xvec14, 5*SIZE(C0); STL_DY xvec6, 6*SIZE(C0, ldc, 1); STH_DY xvec6, 7*SIZE(C0, ldc, 1); EXTRA_DY $1, yvec13, xvec5; EXTRA_DY $1, yvec12, xvec4; #ifndef TRMMKERNEL LDL_DY 0*SIZE(C0, ldc, 1), xvec3, xvec3; LDH_DY 1*SIZE(C0, ldc, 1), xvec3, xvec3; LDL_DY 2*SIZE(C0), xvec2, xvec2; LDH_DY 3*SIZE(C0), xvec2, xvec2; LDL_DY 4*SIZE(C0, ldc, 1), xvec1, xvec1; LDH_DY 5*SIZE(C0, ldc, 1), xvec1, xvec1; LDL_DY 6*SIZE(C0), xvec0, xvec0; LDH_DY 7*SIZE(C0), xvec0, xvec0; ADD_DY xvec3, xvec13, xvec13; ADD_DY xvec2, xvec5, xvec5; ADD_DY xvec1, xvec12, xvec12; ADD_DY xvec0, xvec4, xvec4; #endif STL_DY xvec13, 0*SIZE(C0, ldc, 1); STH_DY xvec13, 1*SIZE(C0, ldc, 1); STL_DY xvec5, 2*SIZE(C0); STH_DY xvec5, 3*SIZE(C0); STL_DY xvec12, 4*SIZE(C0, ldc, 1); STH_DY xvec12, 5*SIZE(C0, ldc, 1); STL_DY xvec4, 6*SIZE(C0); STH_DY xvec4, 7*SIZE(C0); EXTRA_DY $1, yvec11, xvec3; EXTRA_DY $1, yvec10, xvec2; #ifndef TRMMKERNEL LDL_DY 0*SIZE(C1), xvec7, xvec7; LDH_DY 1*SIZE(C1), xvec7, xvec7; LDL_DY 2*SIZE(C1, ldc, 1), xvec6, xvec6; LDH_DY 3*SIZE(C1, ldc, 1), xvec6, xvec6; LDL_DY 4*SIZE(C1), xvec5, xvec5; LDH_DY 5*SIZE(C1), xvec5, xvec5; LDL_DY 6*SIZE(C1, ldc, 1), xvec4, xvec4; LDH_DY 7*SIZE(C1, ldc, 1), xvec4, xvec4; ADD_DY xvec7, xvec11, xvec11; ADD_DY xvec6, xvec3, xvec3; ADD_DY xvec5, xvec10, xvec10; ADD_DY xvec4, xvec2, xvec2; #endif STL_DY xvec11, 0*SIZE(C1); STH_DY xvec11, 1*SIZE(C1); STL_DY xvec3, 2*SIZE(C1, ldc, 1); STH_DY xvec3, 3*SIZE(C1, ldc, 1); STL_DY xvec10, 4*SIZE(C1); STH_DY xvec10, 5*SIZE(C1); STL_DY xvec2, 6*SIZE(C1, ldc, 1); STH_DY xvec2, 7*SIZE(C1, ldc, 1); EXTRA_DY $1, yvec9, xvec1; EXTRA_DY $1, yvec8, xvec0; #ifndef TRMMKERNEL LDL_DY 0*SIZE(C1, ldc, 1), xvec5, xvec5; LDH_DY 1*SIZE(C1, ldc, 1), xvec5, xvec5; LDL_DY 2*SIZE(C1), xvec4, xvec4; LDH_DY 3*SIZE(C1), xvec4, xvec4; LDL_DY 4*SIZE(C1, ldc, 1), xvec3, xvec3; LDH_DY 5*SIZE(C1, ldc, 1), xvec3, xvec3; LDL_DY 6*SIZE(C1), xvec2, xvec2; LDH_DY 7*SIZE(C1), xvec2, xvec2; ADD_DY xvec5, xvec9, xvec9; ADD_DY xvec4, xvec1, xvec1; ADD_DY xvec3, xvec8, xvec8; ADD_DY xvec2, xvec0, xvec0; #endif STL_DY xvec9, 0*SIZE(C1, ldc, 1); STH_DY xvec9, 1*SIZE(C1, ldc, 1); STL_DY xvec1, 2*SIZE(C1); STH_DY xvec1, 3*SIZE(C1); STL_DY xvec8, 4*SIZE(C1, ldc, 1); STH_DY xvec8, 5*SIZE(C1, ldc, 1); STL_DY xvec0, 6*SIZE(C1); STH_DY xvec0, 7*SIZE(C1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk; #endif ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L1_bodyB; ALIGN_5; .L1_loopE:; TEST $2, bm; JLE .L5_loopE; ALIGN_5 .L5_bodyB: #if 
!defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif XOR_DY yvec15, yvec15, yvec15; XOR_DY yvec14, yvec14, yvec14; XOR_DY yvec13, yvec13, yvec13; XOR_DY yvec12, yvec12, yvec12; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; #else ADDQ $4, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L7_loopE; ALIGN_5 .L7_bodyB: #### Compute kernel #### #### Unroll times 1 #### LD_DY 0*SIZE(ptrba), yvec0; EDUP_DY 0*SIZE(ptrbb), yvec2; EDUP_DY 4*SIZE(ptrbb), yvec3; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; ADD1_DY yvec7, yvec14, yvec14; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec0, yvec4, yvec6; ADD1_DY yvec6, yvec13, yvec13; EDUP_DY 1*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec5, yvec7; ADD1_DY yvec7 ,yvec12, yvec12; EDUP_DY 5*SIZE(ptrbb), yvec3 VPERMILP_DY $0x05, yvec0, yvec0; MUL_DY yvec0, yvec2, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec0, yvec4, yvec6; ADD2_DY yvec6, yvec13, yvec13; MUL_DY yvec0, yvec5, yvec7; ADD2_DY yvec7, yvec12, yvec12; #### Unroll time 2 #### LD_DY 4*SIZE(ptrba), yvec0; EDUP_DY 8*SIZE(ptrbb), yvec2; EDUP_DY 12*SIZE(ptrbb), yvec3; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; ADD1_DY yvec7, yvec14, yvec14; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec0, yvec4, yvec6; ADD1_DY yvec6, yvec13, yvec13; EDUP_DY 9*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec5, yvec7; ADD1_DY yvec7 ,yvec12, yvec12; EDUP_DY 13*SIZE(ptrbb), yvec3 VPERMILP_DY $0x05, yvec0, yvec0; MUL_DY yvec0, yvec2, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec0, yvec4, yvec6; ADD2_DY yvec6, yvec13, yvec13; MUL_DY yvec0, yvec5, yvec7; ADD2_DY yvec7, yvec12, yvec12; #### Unroll time 3 #### LD_DY 8*SIZE(ptrba), yvec0; EDUP_DY 16*SIZE(ptrbb), yvec2; EDUP_DY 20*SIZE(ptrbb), yvec3; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; ADD1_DY yvec7, yvec14, yvec14; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec0, yvec4, yvec6; ADD1_DY yvec6, yvec13, yvec13; EDUP_DY 17*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec5, yvec7; ADD1_DY yvec7 ,yvec12, yvec12; EDUP_DY 21*SIZE(ptrbb), yvec3 VPERMILP_DY $0x05, yvec0, yvec0; MUL_DY yvec0, yvec2, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec0, yvec4, yvec6; ADD2_DY yvec6, yvec13, yvec13; MUL_DY yvec0, yvec5, yvec7; ADD2_DY yvec7, yvec12, yvec12; #### Unroll time 4 #### LD_DY 12*SIZE(ptrba), yvec0; EDUP_DY 24*SIZE(ptrbb), yvec2; EDUP_DY 28*SIZE(ptrbb), yvec3; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; ADD1_DY yvec7, yvec14, yvec14; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec0, yvec4, yvec6; ADD1_DY yvec6, yvec13, yvec13; 
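# odd B offsets (25*SIZE, 29*SIZE) duplicate the imaginary parts (Bi) via EDUP_DY (vmovddup); they pair with the lane-swapped A (VPERMILP_DY $0x05) and accumulate through ADD2_DY (vaddsubpd)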
EDUP_DY 25*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec5, yvec7; ADD1_DY yvec7 ,yvec12, yvec12; EDUP_DY 29*SIZE(ptrbb), yvec3 VPERMILP_DY $0x05, yvec0, yvec0; MUL_DY yvec0, yvec2, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec0, yvec4, yvec6; ADD2_DY yvec6, yvec13, yvec13; ADDQ $16*SIZE, ptrba; MUL_DY yvec0, yvec5, yvec7; ADD2_DY yvec7, yvec12, yvec12; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L7_bodyB; ALIGN_5 .L7_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L8_loopE; ALIGN_5 .L8_bodyB: #### Unroll times 1 #### LD_DY 0*SIZE(ptrba), yvec0; EDUP_DY 0*SIZE(ptrbb), yvec2; EDUP_DY 4*SIZE(ptrbb), yvec3; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; ADD1_DY yvec7, yvec14, yvec14; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec0, yvec4, yvec6; ADD1_DY yvec6, yvec13, yvec13; EDUP_DY 1*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec5, yvec7; ADD1_DY yvec7 ,yvec12, yvec12; EDUP_DY 5*SIZE(ptrbb), yvec3 VPERMILP_DY $0x05, yvec0, yvec0; MUL_DY yvec0, yvec2, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec0, yvec4, yvec6; ADD2_DY yvec6, yvec13, yvec13; MUL_DY yvec0, yvec5, yvec7; ADD2_DY yvec7, yvec12, yvec12; #### Unroll time 2 #### LD_DY 4*SIZE(ptrba), yvec0; EDUP_DY 8*SIZE(ptrbb), yvec2; EDUP_DY 12*SIZE(ptrbb), yvec3; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; ADD1_DY yvec7, yvec14, yvec14; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec0, yvec4, yvec6; ADD1_DY yvec6, yvec13, yvec13; EDUP_DY 9*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec5, yvec7; ADD1_DY yvec7 ,yvec12, yvec12; EDUP_DY 13*SIZE(ptrbb), yvec3 VPERMILP_DY $0x05, yvec0, yvec0; MUL_DY yvec0, yvec2, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec0, yvec4, yvec6; ADD2_DY yvec6, yvec13, yvec13; ADDQ $8*SIZE, ptrba; MUL_DY yvec0, yvec5, yvec7; ADD2_DY yvec7, yvec12, yvec12; ADDQ $16*SIZE, ptrbb; .L8_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L9_loopE; ALIGN_5 .L9_bodyB: #### Unroll times 1 #### LD_DY 0*SIZE(ptrba), yvec0; EDUP_DY 0*SIZE(ptrbb), yvec2; EDUP_DY 4*SIZE(ptrbb), yvec3; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; ADD1_DY yvec7, yvec14, yvec14; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec0, yvec4, yvec6; ADD1_DY yvec6, yvec13, yvec13; EDUP_DY 1*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec5, yvec7; ADD1_DY yvec7 ,yvec12, yvec12; EDUP_DY 5*SIZE(ptrbb), yvec3 VPERMILP_DY $0x05, yvec0, yvec0; MUL_DY yvec0, yvec2, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec0, yvec4, yvec6; ADD2_DY yvec6, yvec13, yvec13; MUL_DY yvec0, yvec5, yvec7; ADD2_DY yvec7, yvec12, yvec12; ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L9_loopE: #### Handle #### XOR_DY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ADDSUB_DY yvec15, yvec7, yvec15; ADDSUB_DY yvec14, yvec7, yvec14; ADDSUB_DY yvec13, yvec7, yvec13; ADDSUB_DY yvec12, yvec7, yvec12; #elif 
defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_DY yvec15, yvec7, yvec15; SUB_DY yvec14, yvec7, yvec14; SUB_DY yvec13, yvec7, yvec13; SUB_DY yvec12, yvec7, yvec12; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_DY $0x05, yvec15, yvec15; VPERMILP_DY $0x05, yvec14, yvec14; VPERMILP_DY $0x05, yvec13, yvec13; VPERMILP_DY $0x05, yvec12, yvec12; ADDSUB_DY yvec15, yvec7, yvec15; ADDSUB_DY yvec14, yvec7, yvec14; ADDSUB_DY yvec13, yvec7, yvec13; ADDSUB_DY yvec12, yvec7, yvec12; VPERMILP_DY $0x05, yvec15, yvec15; VPERMILP_DY $0x05, yvec14, yvec14; VPERMILP_DY $0x05, yvec13, yvec13; VPERMILP_DY $0x05, yvec12, yvec12; #endif #### Load Alpha #### BROAD_DY MEMALPHA_R, yvec7; BROAD_DY MEMALPHA_I, yvec6; #### Multiply Alpha #### VPERMILP_DY $0x05, yvec15, yvec5; MUL_DY yvec7, yvec15, yvec15; MUL_DY yvec6, yvec5, yvec5; ADD2_DY yvec5, yvec15, yvec15; VPERMILP_DY $0x05, yvec14, yvec4; MUL_DY yvec7, yvec14, yvec14; MUL_DY yvec6, yvec4, yvec4; ADD2_DY yvec4, yvec14, yvec14; VPERMILP_DY $0x05, yvec13, yvec3; MUL_DY yvec7, yvec13, yvec13; MUL_DY yvec6, yvec3, yvec3; ADD2_DY yvec3, yvec13, yvec13; VPERMILP_DY $0x05,yvec12, yvec2; MUL_DY yvec7, yvec12, yvec12; MUL_DY yvec6, yvec2, yvec2; ADD2_DY yvec2, yvec12, yvec12; #### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L9_loopEx; ALIGN_5 #### Writing back #### EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; EXTRA_DY $1, yvec13, xvec5; EXTRA_DY $1, yvec12, xvec4; #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec15, xvec15; ADD_DX 2*SIZE(C0, ldc, 1), xvec7, xvec7; ADD_DX 0*SIZE(C0, ldc, 1), xvec13, xvec13; ADD_DX 2*SIZE(C0), xvec5, xvec5; ADD_DX 0*SIZE(C1), xvec14, xvec14; ADD_DX 2*SIZE(C1, ldc, 1), xvec6, xvec6; ADD_DX 0*SIZE(C1, ldc, 1), xvec12, xvec12; ADD_DX 2*SIZE(C1), xvec4, xvec4; #endif ST_DX xvec15, 0*SIZE(C0); ST_DX xvec7, 2*SIZE(C0, ldc, 1); ST_DX xvec13, 0*SIZE(C0, ldc, 1); ST_DX xvec5, 2*SIZE(C0); ST_DX xvec14, 0*SIZE(C1); ST_DX xvec6, 2*SIZE(C1, ldc, 1); ST_DX xvec12, 0*SIZE(C1, ldc, 1); ST_DX xvec4, 2*SIZE(C1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk; #endif ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; JMP .L5_loopE; ALIGN_5 .L9_loopEx: EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; EXTRA_DY $1, yvec13, xvec5; EXTRA_DY $1, yvec12, xvec4; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0, xvec0; LDH_DX 1*SIZE(C0), xvec0, xvec0; LDL_DX 2*SIZE(C0, ldc, 1), xvec1, xvec1; LDH_DX 3*SIZE(C0, ldc, 1), xvec1, xvec1; LDL_DX 0*SIZE(C0, ldc, 1), xvec2, xvec2; LDH_DX 1*SIZE(C0, ldc, 1), xvec2, xvec2; LDL_DX 2*SIZE(C0), xvec3, xvec3; LDH_DX 3*SIZE(C0), xvec3, xvec3; ADD_DX xvec0, xvec15, xvec15; ADD_DX xvec1, xvec7, xvec7; ADD_DX xvec2, xvec13, xvec13; ADD_DX xvec3, xvec5, xvec5; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); STL_DX xvec7, 2*SIZE(C0, ldc, 1); STH_DX xvec7, 3*SIZE(C0, ldc, 1); STL_DX xvec13, 0*SIZE(C0, ldc, 1); STH_DX xvec13, 1*SIZE(C0, ldc, 1); STL_DX xvec5, 2*SIZE(C0); STH_DX xvec5, 3*SIZE(C0); #ifndef TRMMKERNEL LDL_DX 0*SIZE(C1), xvec0, xvec0; LDH_DX 1*SIZE(C1), xvec0, xvec0; LDL_DX 2*SIZE(C1, ldc, 1), xvec1, xvec1; LDH_DX 3*SIZE(C1, ldc, 1), xvec1, xvec1; LDL_DX 0*SIZE(C1, ldc, 1), xvec2, xvec2; LDH_DX 1*SIZE(C1, ldc, 1), xvec2, xvec2; LDL_DX 2*SIZE(C1), xvec3, xvec3; LDH_DX 3*SIZE(C1), xvec3, xvec3; ADD_DX xvec0, 
xvec14, xvec14; ADD_DX xvec1, xvec6, xvec6; ADD_DX xvec2, xvec12, xvec12; ADD_DX xvec3, xvec4, xvec4; #endif STL_DX xvec14, 0*SIZE(C1); STH_DX xvec14, 1*SIZE(C1); STL_DX xvec6, 2*SIZE(C1, ldc, 1); STH_DX xvec6, 3*SIZE(C1, ldc, 1); STL_DX xvec12, 0*SIZE(C1, ldc, 1); STH_DX xvec12, 1*SIZE(C1, ldc, 1); STL_DX xvec4, 2*SIZE(C1); STH_DX xvec4, 3*SIZE(C1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk; #endif ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; .L5_loopE: TEST $1, bm; JLE .L6_loopE; ALIGN_5 .L6_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif XOR_DY yvec15, yvec15, yvec15; XOR_DY yvec14, yvec14, yvec14; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; #else ADDQ $4, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L10_loopE; ALIGN_5 .L10_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; EDUP_DY 4*SIZE(ptrbb), yvec3; SHUF_DY $0x20, yvec0, yvec0, yvec1; MUL_DY yvec1, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; MUL_DY yvec1, yvec3, yvec7; ADD1_DY yvec7, yvec14, yvec14; VPERMILP_DY $0x05, yvec1, yvec4; EDUP_DY 1*SIZE(ptrbb), yvec2; EDUP_DY 5*SIZE(ptrbb), yvec3; MUL_DY yvec4, yvec2, yvec6; ADD2_DY yvec6, yvec15, yvec15; MUL_DY yvec4, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; SHUF_DY $0x31, yvec0, yvec0, yvec1; EDUP_DY 8*SIZE(ptrbb), yvec2; EDUP_DY 12*SIZE(ptrbb), yvec3; MUL_DY yvec1, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; MUL_DY yvec1, yvec3, yvec7; ADD1_DY yvec7, yvec14, yvec14; VPERMILP_DY $0x05, yvec1, yvec4; EDUP_DY 9*SIZE(ptrbb), yvec2; EDUP_DY 13*SIZE(ptrbb), yvec3; MUL_DY yvec4, yvec2, yvec6; ADD2_DY yvec6, yvec15, yvec15; MUL_DY yvec4, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; LD_DY 4*SIZE(ptrba), yvec0; EDUP_DY 16*SIZE(ptrbb), yvec2; EDUP_DY 20*SIZE(ptrbb), yvec3; SHUF_DY $0x20, yvec0, yvec0, yvec1; MUL_DY yvec1, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; MUL_DY yvec1, yvec3, yvec7; ADD1_DY yvec7, yvec14, yvec14; VPERMILP_DY $0x05, yvec1, yvec4; EDUP_DY 17*SIZE(ptrbb), yvec2; EDUP_DY 21*SIZE(ptrbb), yvec3; MUL_DY yvec4, yvec2, yvec6; ADD2_DY yvec6, yvec15, yvec15; MUL_DY yvec4, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; SHUF_DY $0x31, yvec0, yvec0, yvec1; EDUP_DY 24*SIZE(ptrbb), yvec2; EDUP_DY 28*SIZE(ptrbb), yvec3; MUL_DY yvec1, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; MUL_DY yvec1, yvec3, yvec7; ADD1_DY yvec7, yvec14, yvec14; VPERMILP_DY $0x05, yvec1, yvec4; EDUP_DY 25*SIZE(ptrbb), yvec2; EDUP_DY 29*SIZE(ptrbb), yvec3; MUL_DY yvec4, yvec2, yvec6; ADD2_DY yvec6, yvec15, yvec15; MUL_DY yvec4, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14 ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L10_bodyB; ALIGN_5 .L10_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L11_loopE; ALIGN_5 .L11_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; EDUP_DY 4*SIZE(ptrbb), yvec3; SHUF_DY $0x20, yvec0, yvec0, yvec1; MUL_DY yvec1, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; 
MUL_DY yvec1, yvec3, yvec7; ADD1_DY yvec7, yvec14, yvec14; VPERMILP_DY $0x05, yvec1, yvec4; EDUP_DY 1*SIZE(ptrbb), yvec2; EDUP_DY 5*SIZE(ptrbb), yvec3; MUL_DY yvec4, yvec2, yvec6; ADD2_DY yvec6, yvec15, yvec15; MUL_DY yvec4, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; SHUF_DY $0x31, yvec0, yvec0, yvec1; EDUP_DY 8*SIZE(ptrbb), yvec2; EDUP_DY 12*SIZE(ptrbb), yvec3; MUL_DY yvec1, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; MUL_DY yvec1, yvec3, yvec7; ADD1_DY yvec7, yvec14, yvec14; VPERMILP_DY $0x05, yvec1, yvec4; EDUP_DY 9*SIZE(ptrbb), yvec2; EDUP_DY 13*SIZE(ptrbb), yvec3; MUL_DY yvec4, yvec2, yvec6; ADD2_DY yvec6, yvec15, yvec15; MUL_DY yvec4, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; .L11_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L12_loopE; ALIGN_5 .L12_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; EDUP_DY 4*SIZE(ptrbb), yvec3; SHUF_DY $0x20, yvec0, yvec0, yvec1; MUL_DY yvec1, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; MUL_DY yvec1, yvec3, yvec7; ADD1_DY yvec7, yvec14, yvec14; VPERMILP_DY $0x05, yvec1, yvec4; EDUP_DY 1*SIZE(ptrbb), yvec2; EDUP_DY 5*SIZE(ptrbb), yvec3; MUL_DY yvec4, yvec2, yvec6; ADD2_DY yvec6, yvec15, yvec15; MUL_DY yvec4, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; ADDQ $2*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L12_loopE: #### Handle #### XOR_DY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ADDSUB_DY yvec15, yvec7, yvec15; ADDSUB_DY yvec14, yvec7, yvec14; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_DY yvec15, yvec7, yvec15; SUB_DY yvec14, yvec7, yvec14; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_DY $0x05, yvec15, yvec15; VPERMILP_DY $0x05, yvec14, yvec14; ADDSUB_DY yvec15, yvec7, yvec15; ADDSUB_DY yvec14, yvec7, yvec14; VPERMILP_DY $0x05, yvec15, yvec15; VPERMILP_DY $0x05, yvec14, yvec14; #endif #### Multiply Alpha #### BROAD_DY MEMALPHA_R, yvec7; BROAD_DY MEMALPHA_I, yvec6; VPERMILP_DY $0x05, yvec15, yvec5; MUL_DY yvec7, yvec15, yvec15; MUL_DY yvec6, yvec5, yvec5; ADD2_DY yvec5, yvec15, yvec15; VPERMILP_DY $0x05, yvec14, yvec4; MUL_DY yvec7, yvec14, yvec14; MUL_DY yvec6, yvec4, yvec4; ADD2_DY yvec4, yvec14, yvec14; #### Writing Back #### EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0, xvec0; LDH_DX 1*SIZE(C0), xvec0, xvec0; LDL_DX 0*SIZE(C0, ldc, 1), xvec1, xvec1; LDH_DX 1*SIZE(C0, ldc, 1), xvec1, xvec1; LDL_DX 0*SIZE(C1), xvec2, xvec2; LDH_DX 1*SIZE(C1), xvec2, xvec2; LDL_DX 0*SIZE(C1, ldc, 1), xvec3, xvec3; LDH_DX 1*SIZE(C1, ldc, 1), xvec3, xvec3; ADD_DX xvec0, xvec15, xvec15; ADD_DX xvec1, xvec7, xvec7; ADD_DX xvec2, xvec14, xvec14; ADD_DX xvec3, xvec6, xvec6; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); STL_DX xvec7, 0*SIZE(C0, ldc, 1); STH_DX xvec7, 1*SIZE(C0, ldc, 1); STL_DX xvec14, 0*SIZE(C1); STH_DX xvec14, 1*SIZE(C1); STL_DX xvec6, 0*SIZE(C1, ldc, 1); STH_DX xvec6, 1*SIZE(C1, ldc, 1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $1, kk; #endif ADDQ $2*SIZE, C0; ADDQ $2*SIZE, C1; .L6_loopE: #if defined(TRMMKERNEL) && !defined(LEFT) ADDQ $4, kk; #endif MOVQ bk,k; SALQ $6,k; ADDQ k,bb; LEAQ (C,ldc,4),C; .L0_bodyE:; DECQ j; JG .L0_bodyB; ALIGN_5; .L0_loopE:; TEST $2, 
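# SHUF_DY $0x20 copied the low 128-bit lane (A1r, A1i) into both lanes of yvec1; the even-offset (real) B duplicates feed ADD1_DY, the odd-offset (imaginary) duplicates feed ADD2_DY below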
bn; JLE .L20_loopE; ALIGN_5 .L20_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; MOVQ %rax, kk; #endif MOVQ C, C0; LEAQ (C, ldc, 1), C1; MOVQ ba, ptrba; MOVQ bm, i; SARQ $2, i; JLE .L21_loopE; ALIGN_5 .L21_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif XOR_DY yvec15, yvec15, yvec15; XOR_DY yvec14, yvec14, yvec14; XOR_DY yvec13, yvec13, yvec13; XOR_DY yvec12, yvec12, yvec12; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; #else ADDQ $2, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L211_loopE; ALIGN_5 .L211_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; LD_DY 0*SIZE(ptrba), yvec0; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; LD_DY 4*SIZE(ptrba), yvec1; MUL_DY yvec1, yvec2, yvec7; ADD1_DY yvec7, yvec14, yvec14; EDUP_DY 1*SIZE(ptrbb), yvec3; MUL_DY yvec0, yvec4, yvec6; ADD1_DY yvec6, yvec13, yvec13; VPERMILP_DY $0x05, yvec0, yvec0; MUL_DY yvec1, yvec4, yvec7; ADD1_DY yvec7, yvec12, yvec12; VPERMILP_DY $0x05, yvec1, yvec1; MUL_DY yvec0, yvec3, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec1, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; MUL_DY yvec0, yvec5, yvec6; ADD2_DY yvec6, yvec13, yvec13; MUL_DY yvec1, yvec5, yvec7; ADD2_DY yvec7, yvec12, yvec12; #### Unroll time 2 #### EDUP_DY 4*SIZE(ptrbb), yvec2; LD_DY 8*SIZE(ptrba), yvec0; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; LD_DY 12*SIZE(ptrba), yvec1; MUL_DY yvec1, yvec2, yvec7; ADD1_DY yvec7, yvec14, yvec14; EDUP_DY 5*SIZE(ptrbb), yvec3; MUL_DY yvec0, yvec4, yvec6; ADD1_DY yvec6, yvec13, yvec13; VPERMILP_DY $0x05, yvec0, yvec0; MUL_DY yvec1, yvec4, yvec7; ADD1_DY yvec7, yvec12, yvec12; VPERMILP_DY $0x05, yvec1, yvec1; MUL_DY yvec0, yvec3, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec1, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; MUL_DY yvec0, yvec5, yvec6; ADD2_DY yvec6, yvec13, yvec13; MUL_DY yvec1, yvec5, yvec7; ADD2_DY yvec7, yvec12, yvec12; #### Unroll time 3 #### EDUP_DY 8*SIZE(ptrbb), yvec2; LD_DY 16*SIZE(ptrba), yvec0; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; LD_DY 20*SIZE(ptrba), yvec1; MUL_DY yvec1, yvec2, yvec7; ADD1_DY yvec7, yvec14, yvec14; EDUP_DY 9*SIZE(ptrbb), yvec3; MUL_DY yvec0, yvec4, yvec6; ADD1_DY yvec6, yvec13, yvec13; VPERMILP_DY $0x05, yvec0, yvec0; MUL_DY yvec1, yvec4, yvec7; ADD1_DY yvec7, yvec12, yvec12; VPERMILP_DY $0x05, yvec1, yvec1; MUL_DY yvec0, yvec3, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec1, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; MUL_DY yvec0, yvec5, yvec6; ADD2_DY yvec6, yvec13, yvec13; MUL_DY yvec1, yvec5, yvec7; ADD2_DY yvec7, yvec12, yvec12; #### Unroll time 4 #### EDUP_DY 12*SIZE(ptrbb), yvec2; LD_DY 24*SIZE(ptrba), yvec0; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; LD_DY 28*SIZE(ptrba), yvec1; MUL_DY yvec1, yvec2, yvec7; ADD1_DY yvec7, yvec14, yvec14; EDUP_DY 13*SIZE(ptrbb), yvec3; MUL_DY yvec0, yvec4, yvec6; ADD1_DY yvec6, yvec13, yvec13; 
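# second half of the unroll: the A registers are re/im swapped in-lane (VPERMILP_DY $0x05) and the odd-offset (imaginary) B duplicates are folded in with ADD2_DY (vaddsubpd)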
VPERMILP_DY $0x05, yvec0, yvec0; MUL_DY yvec1, yvec4, yvec7; ADD1_DY yvec7, yvec12, yvec12; VPERMILP_DY $0x05, yvec1, yvec1; MUL_DY yvec0, yvec3, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec1, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; ADDQ $16*SIZE, ptrbb; MUL_DY yvec0, yvec5, yvec6; ADD2_DY yvec6, yvec13, yvec13; MUL_DY yvec1, yvec5, yvec7; ADD2_DY yvec7, yvec12, yvec12; ADDQ $32*SIZE, ptrba; DECQ k; JG .L211_bodyB; ALIGN_5 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L212_loopE; ALIGN_5 .L212_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; LD_DY 0*SIZE(ptrba), yvec0; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; LD_DY 4*SIZE(ptrba), yvec1; MUL_DY yvec1, yvec2, yvec7; ADD1_DY yvec7, yvec14, yvec14; EDUP_DY 1*SIZE(ptrbb), yvec3; MUL_DY yvec0, yvec4, yvec6; ADD1_DY yvec6, yvec13, yvec13; VPERMILP_DY $0x05, yvec0, yvec0; MUL_DY yvec1, yvec4, yvec7; ADD1_DY yvec7, yvec12, yvec12; VPERMILP_DY $0x05, yvec1, yvec1; MUL_DY yvec0, yvec3, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec1, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; MUL_DY yvec0, yvec5, yvec6; ADD2_DY yvec6, yvec13, yvec13; MUL_DY yvec1, yvec5, yvec7; ADD2_DY yvec7, yvec12, yvec12; #### Unroll time 2 #### EDUP_DY 4*SIZE(ptrbb), yvec2; LD_DY 8*SIZE(ptrba), yvec0; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; LD_DY 12*SIZE(ptrba), yvec1; MUL_DY yvec1, yvec2, yvec7; ADD1_DY yvec7, yvec14, yvec14; EDUP_DY 5*SIZE(ptrbb), yvec3; MUL_DY yvec0, yvec4, yvec6; ADD1_DY yvec6, yvec13, yvec13; VPERMILP_DY $0x05, yvec0, yvec0; MUL_DY yvec1, yvec4, yvec7; ADD1_DY yvec7, yvec12, yvec12; VPERMILP_DY $0x05, yvec1, yvec1; MUL_DY yvec0, yvec3, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec1, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; MUL_DY yvec0, yvec5, yvec6; ADD2_DY yvec6, yvec13, yvec13; MUL_DY yvec1, yvec5, yvec7; ADD2_DY yvec7, yvec12, yvec12; ADDQ $8*SIZE, ptrbb; ADDQ $16*SIZE, ptrba; .L212_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L213_loopE; ALIGN_5 .L213_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; LD_DY 0*SIZE(ptrba), yvec0; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; LD_DY 4*SIZE(ptrba), yvec1; MUL_DY yvec1, yvec2, yvec7; ADD1_DY yvec7, yvec14, yvec14; EDUP_DY 1*SIZE(ptrbb), yvec3; MUL_DY yvec0, yvec4, yvec6; ADD1_DY yvec6, yvec13, yvec13; VPERMILP_DY $0x05, yvec0, yvec0; MUL_DY yvec1, yvec4, yvec7; ADD1_DY yvec7, yvec12, yvec12; VPERMILP_DY $0x05, yvec1, yvec1; MUL_DY yvec0, yvec3, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec1, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; MUL_DY yvec0, yvec5, yvec6; ADD2_DY yvec6, yvec13, yvec13; MUL_DY yvec1, yvec5, yvec7; ADD2_DY yvec7, yvec12, yvec12; ADDQ $4*SIZE, ptrbb; ADDQ $8*SIZE, ptrba; .L213_loopE: #### Handle #### XOR_DY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ADDSUB_DY yvec15, yvec7, yvec15; ADDSUB_DY yvec14, yvec7, yvec14; ADDSUB_DY yvec13, yvec7, yvec13; ADDSUB_DY yvec12, yvec7, yvec12; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_DY yvec15, yvec7, yvec15; SUB_DY yvec14, yvec7, yvec14; SUB_DY yvec13, yvec7, yvec13; SUB_DY yvec12, yvec7, yvec12; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_DY $0x05, yvec15, 
yvec15; VPERMILP_DY $0x05, yvec14, yvec14; VPERMILP_DY $0x05, yvec13, yvec13; VPERMILP_DY $0x05, yvec12, yvec12; ADDSUB_DY yvec15, yvec7, yvec15; ADDSUB_DY yvec14, yvec7, yvec14; ADDSUB_DY yvec13, yvec7, yvec13; ADDSUB_DY yvec12, yvec7, yvec12; VPERMILP_DY $0x05, yvec15, yvec15; VPERMILP_DY $0x05, yvec14, yvec14; VPERMILP_DY $0x05, yvec13, yvec13; VPERMILP_DY $0x05, yvec12, yvec12; #endif #### Load Alpha #### BROAD_DY MEMALPHA_R,yvec7; BROAD_DY MEMALPHA_I,yvec6; #### Multiply Alpha #### VPERMILP_DY $0x05, yvec15, yvec5; MUL_DY yvec7, yvec15, yvec15; MUL_DY yvec6, yvec5, yvec5; ADD2_DY yvec5, yvec15, yvec15; VPERMILP_DY $0x05, yvec14, yvec4; MUL_DY yvec7, yvec14, yvec14; MUL_DY yvec6, yvec4, yvec4; ADD2_DY yvec4, yvec14, yvec14; VPERMILP_DY $0x05, yvec13, yvec3; MUL_DY yvec7, yvec13, yvec13; MUL_DY yvec6, yvec3, yvec3; ADD2_DY yvec3, yvec13, yvec13; VPERMILP_DY $0x05,yvec12, yvec2; MUL_DY yvec7, yvec12, yvec12; MUL_DY yvec6, yvec2, yvec2; ADD2_DY yvec2, yvec12, yvec12; EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; EXTRA_DY $1, yvec13, xvec5; EXTRA_DY $1, yvec12, xvec4; #### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L213_loopEx; ALIGN_5 #### Writing back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec15, xvec15; ADD_DX 2*SIZE(C1), xvec7, xvec7; ADD_DX 4*SIZE(C0), xvec14, xvec14; ADD_DX 6*SIZE(C1), xvec6, xvec6; ADD_DX 0*SIZE(C1), xvec13, xvec13; ADD_DX 2*SIZE(C0), xvec5, xvec5; ADD_DX 4*SIZE(C1), xvec12, xvec12; ADD_DX 6*SIZE(C0), xvec4, xvec4; #endif ST_DX xvec15,0*SIZE(C0); ST_DX xvec7,2*SIZE(C1); ST_DX xvec14,4*SIZE(C0); ST_DX xvec6,6*SIZE(C1); ST_DX xvec13,0*SIZE(C1); ST_DX xvec5,2*SIZE(C0); ST_DX xvec12,4*SIZE(C1); ST_DX xvec4,6*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk; #endif ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L21_bodyB; JMP .L21_loopE; ALIGN_5 .L213_loopEx: #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0, xvec0; LDH_DX 1*SIZE(C0), xvec0, xvec0; LDL_DX 2*SIZE(C1), xvec1, xvec1; LDH_DX 3*SIZE(C1), xvec1, xvec1; LDL_DX 4*SIZE(C0), xvec2, xvec2; LDH_DX 5*SIZE(C0), xvec2, xvec2; LDL_DX 6*SIZE(C1), xvec3, xvec3; LDH_DX 7*SIZE(C1), xvec3, xvec3; ADD_DX xvec0, xvec15, xvec15; ADD_DX xvec1, xvec7, xvec7; ADD_DX xvec2, xvec14, xvec14; ADD_DX xvec3, xvec6, xvec6; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); STL_DX xvec7, 2*SIZE(C1); STH_DX xvec7, 3*SIZE(C1); STL_DX xvec14, 4*SIZE(C0); STH_DX xvec14, 5*SIZE(C0); STL_DX xvec6, 6*SIZE(C1); STH_DX xvec6, 7*SIZE(C1); #ifndef TRMMKERNEL LDL_DX 0*SIZE(C1), xvec3, xvec3; LDH_DX 1*SIZE(C1), xvec3, xvec3; LDL_DX 2*SIZE(C0), xvec2, xvec2; LDH_DX 3*SIZE(C0), xvec2, xvec2; LDL_DX 4*SIZE(C1), xvec1, xvec1; LDH_DX 5*SIZE(C1), xvec1, xvec1; LDL_DX 6*SIZE(C0), xvec0, xvec0; LDH_DX 7*SIZE(C0), xvec0, xvec0; ADD_DX xvec3, xvec13, xvec13; ADD_DX xvec2, xvec5, xvec5; ADD_DX xvec1, xvec12, xvec12; ADD_DX xvec0, xvec4, xvec4; #endif STL_DX xvec13, 0*SIZE(C1); STH_DX xvec13, 1*SIZE(C1); STL_DX xvec5, 2*SIZE(C0); STH_DX xvec5, 3*SIZE(C0); STL_DX xvec12, 4*SIZE(C1); STH_DX xvec12, 5*SIZE(C1); STL_DX xvec4, 6*SIZE(C0); STH_DX xvec4, 7*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), 
ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk; #endif ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L21_bodyB; ALIGN_5 .L21_loopE: TEST $2, bm; JLE .L22_loopE; ALIGN_5 .L22_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif XOR_DY yvec15, yvec15, yvec15; XOR_DY yvec14, yvec14, yvec13; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; #else ADDQ $2, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L221_loopE; ALIGN_5 .L221_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; LD_DY 0*SIZE(ptrba), yvec0; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; EDUP_DY 1*SIZE(ptrbb), yvec3; MUL_DY yvec0, yvec4, yvec6; ADD1_DY yvec6, yvec13, yvec13; VPERMILP_DY $0x05, yvec0, yvec0; MUL_DY yvec0, yvec3, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec0, yvec5, yvec6; ADD2_DY yvec6, yvec13, yvec13; #### Unroll time 2 #### EDUP_DY 4*SIZE(ptrbb), yvec2; LD_DY 4*SIZE(ptrba), yvec0; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; EDUP_DY 5*SIZE(ptrbb), yvec3; MUL_DY yvec0, yvec4, yvec6; ADD1_DY yvec6, yvec13, yvec13; VPERMILP_DY $0x05, yvec0, yvec0; MUL_DY yvec0, yvec3, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec0, yvec5, yvec6; ADD2_DY yvec6, yvec13, yvec13; #### Unroll time 3 #### EDUP_DY 8*SIZE(ptrbb), yvec2; LD_DY 8*SIZE(ptrba), yvec0; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; EDUP_DY 9*SIZE(ptrbb), yvec3; MUL_DY yvec0, yvec4, yvec6; ADD1_DY yvec6, yvec13, yvec13; VPERMILP_DY $0x05, yvec0, yvec0; MUL_DY yvec0, yvec3, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec0, yvec5, yvec6; ADD2_DY yvec6, yvec13, yvec13; #### Unroll time 4 #### EDUP_DY 12*SIZE(ptrbb), yvec2; LD_DY 12*SIZE(ptrba), yvec0; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec2, yvec2, yvec4; EDUP_DY 13*SIZE(ptrbb), yvec3; MUL_DY yvec0, yvec4, yvec6; ADD1_DY yvec6, yvec13, yvec13; VPERMILP_DY $0x05, yvec0, yvec0; MUL_DY yvec0, yvec3, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec3, yvec3, yvec5; ADDQ $16*SIZE, ptrbb; MUL_DY yvec0, yvec5, yvec6; ADD2_DY yvec6, yvec13, yvec13; ADDQ $16*SIZE, ptrba; DECQ k; JG .L221_bodyB; ALIGN_5 .L221_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L222_loopE; ALIGN_5 .L222_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; LD_DY 0*SIZE(ptrba), yvec0; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; EDUP_DY 1*SIZE(ptrbb), yvec3; MUL_DY yvec0, yvec4, yvec6; ADD1_DY yvec6, yvec13, yvec13; VPERMILP_DY $0x05, yvec0, yvec0; MUL_DY yvec0, yvec3, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec0, yvec5, yvec6; ADD2_DY yvec6, yvec13, yvec13; #### Unroll time 2 #### EDUP_DY 4*SIZE(ptrbb), yvec2; LD_DY 4*SIZE(ptrba), yvec0; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; EDUP_DY 5*SIZE(ptrbb), yvec3; MUL_DY yvec0, 
yvec4, yvec6; ADD1_DY yvec6, yvec13, yvec13; VPERMILP_DY $0x05, yvec0, yvec0; MUL_DY yvec0, yvec3, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec0, yvec5, yvec6; ADD2_DY yvec6, yvec13, yvec13; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L222_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L223_loopE; ALIGN_5 .L223_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; LD_DY 0*SIZE(ptrba), yvec0; SHUF_DY $0x03, yvec2, yvec2, yvec4; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; EDUP_DY 1*SIZE(ptrbb), yvec3; MUL_DY yvec0, yvec4, yvec6; ADD1_DY yvec6, yvec13, yvec13; VPERMILP_DY $0x05, yvec0, yvec0; MUL_DY yvec0, yvec3, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x03, yvec3, yvec3, yvec5; MUL_DY yvec0, yvec5, yvec6; ADD2_DY yvec6, yvec13, yvec13; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L223_loopE: #### Handle #### XOR_DY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ADDSUB_DY yvec15, yvec7, yvec15; ADDSUB_DY yvec13, yvec7, yvec13; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_DY yvec15, yvec7, yvec15; SUB_DY yvec13, yvec7, yvec13; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_DY $0x05, yvec15, yvec15; VPERMILP_DY $0x05, yvec13, yvec13; ADDSUB_DY yvec15, yvec7, yvec15; ADDSUB_DY yvec13, yvec7, yvec13; VPERMILP_DY $0x05, yvec15, yvec15; VPERMILP_DY $0x05, yvec13, yvec13; #endif #### Load Alpha #### BROAD_DY MEMALPHA_R,yvec7; BROAD_DY MEMALPHA_I,yvec6; #### Multiply Alpha #### VPERMILP_DY $0x05, yvec15, yvec5; MUL_DY yvec7, yvec15, yvec15; MUL_DY yvec6, yvec5, yvec5; ADD2_DY yvec5, yvec15, yvec15; VPERMILP_DY $0x05, yvec13, yvec3; MUL_DY yvec7, yvec13, yvec13; MUL_DY yvec6, yvec3, yvec3; ADD2_DY yvec3, yvec13, yvec13; EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec13, xvec5; #### Write back #### #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0, xvec0; LDH_DX 1*SIZE(C0), xvec0, xvec0; LDL_DX 2*SIZE(C1), xvec1, xvec1; LDH_DX 3*SIZE(C1), xvec1, xvec1; LDL_DX 0*SIZE(C1), xvec2, xvec2; LDH_DX 1*SIZE(C1), xvec2, xvec2; LDL_DX 2*SIZE(C0), xvec3, xvec3; LDH_DX 3*SIZE(C0), xvec3, xvec3; ADD_DX xvec0, xvec15, xvec15; ADD_DX xvec1, xvec7, xvec7; ADD_DX xvec2, xvec13, xvec13; ADD_DX xvec3, xvec5, xvec5; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); STL_DX xvec7, 2*SIZE(C1); STH_DX xvec7, 3*SIZE(C1); STL_DX xvec13, 0*SIZE(C1); STH_DX xvec13, 1*SIZE(C1); STL_DX xvec5, 2*SIZE(C0); STH_DX xvec5, 3*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk; #endif ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; .L22_loopE: TEST $1, bm; JLE .L23_loopE; ALIGN_5 .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif XOR_DY yvec15, yvec15, yvec15; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; #else ADDQ $2, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L231_loopE; ALIGN_5 .L231_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r 
A2i EDUP_DY 0*SIZE(ptrbb), yvec2; SHUF_DY $0x20, yvec0, yvec0, yvec1; MUL_DY yvec1, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; VPERMILP_DY $0x05, yvec1, yvec4; EDUP_DY 1*SIZE(ptrbb), yvec2; MUL_DY yvec4, yvec2, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x31, yvec0, yvec0, yvec1; EDUP_DY 4*SIZE(ptrbb), yvec2; MUL_DY yvec1, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; VPERMILP_DY $0x05, yvec1, yvec4; EDUP_DY 5*SIZE(ptrbb), yvec2; MUL_DY yvec4, yvec2, yvec6; ADD2_DY yvec6, yvec15, yvec15; LD_DY 4*SIZE(ptrba), yvec0; EDUP_DY 8*SIZE(ptrbb), yvec2; SHUF_DY $0x20, yvec0, yvec0, yvec1; MUL_DY yvec1, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; VPERMILP_DY $0x05, yvec1, yvec4; EDUP_DY 9*SIZE(ptrbb), yvec2; MUL_DY yvec4, yvec2, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x31, yvec0, yvec0, yvec1; EDUP_DY 12*SIZE(ptrbb), yvec2; MUL_DY yvec1, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; VPERMILP_DY $0x05, yvec1, yvec4; EDUP_DY 13*SIZE(ptrbb), yvec2; MUL_DY yvec4, yvec2, yvec6; ADD2_DY yvec6, yvec15, yvec15; ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L231_bodyB; ALIGN_5 .L231_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L232_loopE; ALIGN_5 .L232_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; SHUF_DY $0x20, yvec0, yvec0, yvec1; MUL_DY yvec1, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; VPERMILP_DY $0x05, yvec1, yvec4; EDUP_DY 1*SIZE(ptrbb), yvec2; MUL_DY yvec4, yvec2, yvec6; ADD2_DY yvec6, yvec15, yvec15; SHUF_DY $0x31, yvec0, yvec0, yvec1; EDUP_DY 4*SIZE(ptrbb), yvec2; MUL_DY yvec1, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; VPERMILP_DY $0x05, yvec1, yvec4; EDUP_DY 5*SIZE(ptrbb), yvec2; MUL_DY yvec4, yvec2, yvec6; ADD2_DY yvec6, yvec15, yvec15; ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L232_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L233_loopE; ALIGN_5 .L233_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; SHUF_DY $0x20, yvec0, yvec0, yvec1; MUL_DY yvec1, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; VPERMILP_DY $0x05, yvec1, yvec4; EDUP_DY 1*SIZE(ptrbb), yvec2; MUL_DY yvec4, yvec2, yvec6; ADD2_DY yvec6, yvec15, yvec15; ADDQ $2*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L233_loopE: #### Handle #### XOR_DY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ADDSUB_DY yvec15, yvec7, yvec15; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_DY yvec15, yvec7, yvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_DY $0x05, yvec15, yvec15; ADDSUB_DY yvec15, yvec7, yvec15; VPERMILP_DY $0x05, yvec15, yvec15; #endif #### Multiply Alpha #### BROAD_DY MEMALPHA_R, yvec7; BROAD_DY MEMALPHA_I, yvec6; #### Writing Back #### VPERMILP_DY $0x05, yvec15, yvec5; MUL_DY yvec7, yvec15, yvec15; MUL_DY yvec6, yvec5, yvec5; ADD2_DY yvec5, yvec15, yvec15; EXTRA_DY $1, yvec15, xvec7; #### Writing Back #### #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0, xvec0; LDH_DX 1*SIZE(C0), xvec0, xvec0; LDL_DX 0*SIZE(C1), xvec1, xvec1; LDH_DX 1*SIZE(C1), xvec1, xvec1; ADD_DX xvec0, xvec15, xvec15; ADD_DX xvec1, xvec7, xvec7; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); STL_DX xvec7, 0*SIZE(C1); STH_DX xvec7, 1*SIZE(C1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $1, kk; #endif 
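/* Hedged note on the pointer bookkeeping below: this is the tail of the 1-row x 2-column
   block (.L23). The two increments that follow both target C0; presumably the second one
   was meant to advance C1. The slip appears harmless here, because neither C0 nor C1 is
   consumed again before being recomputed from C for the next column panel. */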
ADDQ $2*SIZE, C0; ADDQ $2*SIZE, C0; .L23_loopE: #if defined(TRMMKERNEL) && !defined(LEFT) ADDQ $2, kk; #endif MOVQ bk, k; SALQ $5, k; ADDQ k, bb; LEAQ (C, ldc, 2), C; .L20_loopE: TEST $1, bn; JLE .L30_loopE; ALIGN_5 .L30_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; MOVQ %rax, kk; #endif MOVQ ba, ptrba; MOVQ C, C0; MOVQ bm, i; SARQ $2, i; JLE .L31_loopE; ALIGN_5 .L31_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; ADDQ %rax, ptrbb; #endif XOR_DY yvec15, yvec15, yvec15; XOR_DY yvec14, yvec14, yvec14; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; #else ADDQ $1, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L311_loopE; ALIGN_5 .L311_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; LD_DY 4*SIZE(ptrba), yvec1; MUL_DY yvec1, yvec2, yvec7; ADD1_DY yvec7, yvec14, yvec14; VPERMILP_DY $0x05, yvec0, yvec4; BROAD_DY 1*SIZE(ptrbb), yvec3; MUL_DY yvec4, yvec3, yvec6; ADD2_DY yvec6, yvec15, yvec15; VPERMILP_DY $0x05, yvec1, yvec5; MUL_DY yvec5, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; LD_DY 8*SIZE(ptrba), yvec0; BROAD_DY 2*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; LD_DY 12*SIZE(ptrba), yvec1; MUL_DY yvec1, yvec2, yvec7; ADD1_DY yvec7, yvec14, yvec14; VPERMILP_DY $0x05, yvec0, yvec4; BROAD_DY 3*SIZE(ptrbb), yvec3; MUL_DY yvec4, yvec3, yvec6; ADD2_DY yvec6, yvec15, yvec15; VPERMILP_DY $0x05, yvec1, yvec5; MUL_DY yvec5, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; LD_DY 16*SIZE(ptrba), yvec0; BROAD_DY 4*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; LD_DY 20*SIZE(ptrba), yvec1; MUL_DY yvec1, yvec2, yvec7; ADD1_DY yvec7, yvec14, yvec14; VPERMILP_DY $0x05, yvec0, yvec4; BROAD_DY 5*SIZE(ptrbb), yvec3; MUL_DY yvec4, yvec3, yvec6; ADD2_DY yvec6, yvec15, yvec15; VPERMILP_DY $0x05, yvec1, yvec5; MUL_DY yvec5, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; LD_DY 24*SIZE(ptrba), yvec0; BROAD_DY 6*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; LD_DY 28*SIZE(ptrba), yvec1; MUL_DY yvec1, yvec2, yvec7; ADD1_DY yvec7, yvec14, yvec14; VPERMILP_DY $0x05, yvec0, yvec4; BROAD_DY 7*SIZE(ptrbb), yvec3; MUL_DY yvec4, yvec3, yvec6; ADD2_DY yvec6, yvec15, yvec15; VPERMILP_DY $0x05, yvec1, yvec5; MUL_DY yvec5, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L311_bodyB; ALIGN_5 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L312_loopE; ALIGN_5 .L312_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; LD_DY 4*SIZE(ptrba), yvec1; MUL_DY yvec1, yvec2, yvec7; ADD1_DY yvec7, yvec14, yvec14; VPERMILP_DY $0x05, yvec0, yvec4; BROAD_DY 1*SIZE(ptrbb), yvec3; MUL_DY yvec4, yvec3, yvec6; ADD2_DY yvec6, yvec15, yvec15; VPERMILP_DY $0x05, yvec1, yvec5; MUL_DY yvec5, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; LD_DY 8*SIZE(ptrba), yvec0; BROAD_DY 2*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; LD_DY 12*SIZE(ptrba), yvec1; MUL_DY yvec1, yvec2, yvec7; ADD1_DY yvec7, yvec14, yvec14; VPERMILP_DY $0x05, yvec0, 
yvec4; BROAD_DY 3*SIZE(ptrbb), yvec3; MUL_DY yvec4, yvec3, yvec6; ADD2_DY yvec6, yvec15, yvec15; VPERMILP_DY $0x05, yvec1, yvec5; MUL_DY yvec5, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L312_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L313_loopE; ALIGN_5 .L313_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; LD_DY 4*SIZE(ptrba), yvec1; MUL_DY yvec1, yvec2, yvec7; ADD1_DY yvec7, yvec14, yvec14; VPERMILP_DY $0x05, yvec0, yvec4; BROAD_DY 1*SIZE(ptrbb), yvec3; MUL_DY yvec4, yvec3, yvec6; ADD2_DY yvec6, yvec15, yvec15; VPERMILP_DY $0x05, yvec1, yvec5; MUL_DY yvec5, yvec3, yvec7; ADD2_DY yvec7, yvec14, yvec14; ADDQ $8*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L313_loopE: #### Handle #### XOR_DY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ADDSUB_DY yvec15, yvec7, yvec15; ADDSUB_DY yvec14, yvec7, yvec14; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_DY yvec15, yvec7, yvec15; SUB_DY yvec14, yvec7, yvec14; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_DY $0x05, yvec15, yvec15; VPERMILP_DY $0x05, yvec14, yvec14; ADDSUB_DY yvec15, yvec7, yvec15; ADDSUB_DY yvec14, yvec7, yvec14; VPERMILP_DY $0x05, yvec15, yvec15; VPERMILP_DY $0x05, yvec14, yvec14; #endif #### Load Alpha #### BROAD_DY MEMALPHA_R,yvec7; BROAD_DY MEMALPHA_I,yvec6; #### Multiply Alpha #### VPERMILP_DY $0x05, yvec15, yvec5; MUL_DY yvec7, yvec15, yvec15; MUL_DY yvec6, yvec5, yvec5; ADD2_DY yvec5, yvec15, yvec15; VPERMILP_DY $0x05, yvec14, yvec4; MUL_DY yvec7, yvec14, yvec14; MUL_DY yvec6, yvec4, yvec4; ADD2_DY yvec4, yvec14, yvec14; EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; #### Writing Back #### #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0, xvec0; LDH_DX 1*SIZE(C0), xvec0, xvec0; LDL_DX 2*SIZE(C0), xvec1, xvec1; LDH_DX 3*SIZE(C0), xvec1, xvec1; LDL_DX 4*SIZE(C0), xvec2, xvec2; LDH_DX 5*SIZE(C0), xvec2, xvec2; LDL_DX 6*SIZE(C0), xvec3, xvec3; LDH_DX 7*SIZE(C0), xvec3, xvec3; ADD_DX xvec0, xvec15, xvec15; ADD_DX xvec1, xvec7, xvec7; ADD_DX xvec2, xvec14, xvec14; ADD_DX xvec3, xvec6, xvec6; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); STL_DX xvec7, 2*SIZE(C0); STH_DX xvec7, 3*SIZE(C0); STL_DX xvec14, 4*SIZE(C0); STH_DX xvec14, 5*SIZE(C0); STL_DX xvec6, 6*SIZE(C0); STH_DX xvec6, 7*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; ADDQ %rax, ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk; #endif ADDQ $8*SIZE, C0; DECQ i; JG .L31_bodyB; ALIGN_5 .L31_loopE: TEST $2, bm; JLE .L32_loopE; ALIGN_5 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; ADDQ %rax, ptrbb; #endif XOR_DY yvec15, yvec15, yvec15; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; #else ADDQ $1, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L321_loopE; ALIGN_5 .L321_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; 
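/* Descriptive sketch of the pattern used in this inner loop (inferred from the macro
   names, which are defined earlier in this file): the packed A vector is multiplied by a
   broadcast of the real part of the B element and accumulated through ADD1_DY, then
   VPERMILP_DY $0x05 swaps the real/imaginary halves of A so the broadcast imaginary part
   of B can be accumulated through ADD2_DY. Roughly, per step:
       acc1 += a      * b_re;
       acc2 += swap(a) * b_im;
   Keeping the two partial sums separate is what lets the later "Handle" block apply the
   sign pattern each conjugation variant (NN/NR/RN/RR, ...) requires. */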
VPERMILP_DY $0x05, yvec0, yvec1; BROAD_DY 1*SIZE(ptrbb), yvec3; MUL_DY yvec1, yvec3, yvec7; ADD2_DY yvec7, yvec15, yvec15; LD_DY 4*SIZE(ptrba), yvec0; BROAD_DY 2*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; VPERMILP_DY $0x05, yvec0, yvec1; BROAD_DY 3*SIZE(ptrbb), yvec3; MUL_DY yvec1, yvec3, yvec7; ADD2_DY yvec7, yvec15, yvec15; LD_DY 8*SIZE(ptrba), yvec0; BROAD_DY 4*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; VPERMILP_DY $0x05, yvec0, yvec1; BROAD_DY 5*SIZE(ptrbb), yvec3; MUL_DY yvec1, yvec3, yvec7; ADD2_DY yvec7, yvec15, yvec15; LD_DY 12*SIZE(ptrba), yvec0; BROAD_DY 6*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; VPERMILP_DY $0x05, yvec0, yvec1; BROAD_DY 7*SIZE(ptrbb), yvec3; MUL_DY yvec1, yvec3, yvec7; ADD2_DY yvec7, yvec15, yvec15; ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L321_bodyB; ALIGN_5 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L322_loopE; ALIGN_5 .L322_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; VPERMILP_DY $0x05, yvec0, yvec1; BROAD_DY 1*SIZE(ptrbb), yvec3; MUL_DY yvec1, yvec3, yvec7; ADD2_DY yvec7, yvec15, yvec15; LD_DY 4*SIZE(ptrba), yvec0; BROAD_DY 2*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; VPERMILP_DY $0x05, yvec0, yvec1; BROAD_DY 3*SIZE(ptrbb), yvec3; MUL_DY yvec1, yvec3, yvec7; ADD2_DY yvec7, yvec15, yvec15; ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L322_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L323_loopE; ALIGN_5 .L323_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; MUL_DY yvec0, yvec2, yvec6; ADD1_DY yvec6, yvec15, yvec15; VPERMILP_DY $0x05, yvec0, yvec1; BROAD_DY 1*SIZE(ptrbb), yvec3; MUL_DY yvec1, yvec3, yvec7; ADD2_DY yvec7, yvec15, yvec15; ADDQ $4*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L323_loopE: #### Handle #### XOR_DY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ADDSUB_DY yvec15, yvec7, yvec15; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_DY yvec15, yvec7, yvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_DY $0x05, yvec15, yvec15; ADDSUB_DY yvec15, yvec7, yvec15; VPERMILP_DY $0x05, yvec15, yvec15; #endif #### Load Alpha #### BROAD_DY MEMALPHA_R,yvec7; BROAD_DY MEMALPHA_I,yvec6; #### Multiply Alpha #### VPERMILP_DY $0x05, yvec15, yvec5; MUL_DY yvec7, yvec15, yvec15; MUL_DY yvec6, yvec5, yvec5; ADD2_DY yvec5, yvec15, yvec15; EXTRA_DY $1, yvec15, xvec7; #### Writing Back #### #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0, xvec0; LDH_DX 1*SIZE(C0), xvec0, xvec0; LDL_DX 2*SIZE(C0), xvec1, xvec1; LDH_DX 3*SIZE(C0), xvec1, xvec1; ADD_DX xvec0, xvec15, xvec15; ADD_DX xvec1, xvec7, xvec7; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); STL_DX xvec7, 2*SIZE(C0); STH_DX xvec7, 3*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; ADDQ %rax, ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk; #endif ADDQ $4*SIZE, C0; .L32_loopE: TEST $1, bm; JLE .L33_loopE; ALIGN_5 .L33_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; 
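/* In the TRMM path, %rax at this point holds kk scaled to a byte offset (one complex
   element is 1 << ZBASE_SHIFT bytes); the two additions below skip the first kk complex
   elements of both the packed A panel and the packed B panel before the 1x1 micro-kernel
   starts accumulating. */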
ADDQ %rax, ptrba; ADDQ %rax, ptrbb; #endif XOR_DY yvec15, yvec15, yvec15; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; #else ADDQ $1, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L331_loopE; ALIGN_5 .L331_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; MUL_DX xvec0, xvec2, xvec2; ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 1*SIZE(ptrbb), xvec3; MUL_DX xvec1, xvec3, xvec3; ADDSUB_DX xvec3, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec0; BROAD_DX 2*SIZE(ptrbb), xvec2; MUL_DX xvec0, xvec2, xvec2; ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 3*SIZE(ptrbb), xvec3; MUL_DX xvec1, xvec3, xvec3; ADDSUB_DX xvec3, xvec15, xvec15; LD_DX 4*SIZE(ptrba), xvec0; BROAD_DX 4*SIZE(ptrbb), xvec2; MUL_DX xvec0, xvec2, xvec2; ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 5*SIZE(ptrbb), xvec3; MUL_DX xvec1, xvec3, xvec3; ADDSUB_DX xvec3, xvec15, xvec15; LD_DX 6*SIZE(ptrba), xvec0; BROAD_DX 6*SIZE(ptrbb), xvec2; MUL_DX xvec0, xvec2, xvec2; ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 7*SIZE(ptrbb), xvec3; MUL_DX xvec1, xvec3, xvec3; ADDSUB_DX xvec3, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L331_bodyB; ALIGN_5 .L331_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L332_loopE; ALIGN_5 .L332_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; MUL_DX xvec0, xvec2, xvec2; ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 1*SIZE(ptrbb), xvec3; MUL_DX xvec1, xvec3, xvec3; ADDSUB_DX xvec3, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec0; BROAD_DX 2*SIZE(ptrbb), xvec2; MUL_DX xvec0, xvec2, xvec2; ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 3*SIZE(ptrbb), xvec3; MUL_DX xvec1, xvec3, xvec3; ADDSUB_DX xvec3, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L332_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L333_loopE; ALIGN_5 .L333_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; MUL_DX xvec0, xvec2, xvec2; ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 1*SIZE(ptrbb), xvec3; MUL_DX xvec1, xvec3, xvec3; ADDSUB_DX xvec3, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L333_loopE: #### Handle #### XOR_DY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ADDSUB_DX xvec15, xvec7, xvec7; MOV_DX xvec7, xvec15; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_DX xvec15, xvec7, xvec7; MOV_DX xvec7, xvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_DX $0x4e, xvec15, xvec15; ADDSUB_DX xvec15, xvec7, xvec7; MOV_DX xvec7, xvec15; SHUF_DX $0x4e, xvec15, xvec15; #endif #### Load Alpha #### BROAD_DX MEMALPHA_R,xvec7; BROAD_DX MEMALPHA_I,xvec6; #### Multiply Alpha #### SHUF_DX $0x4e, xvec15, xvec5; MUL_DX xvec7, xvec15, xvec15; MUL_DX xvec6, xvec5, xvec5; ADDSUB_DX xvec5, xvec15, xvec15; #### Writing back #### #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0, xvec0; LDH_DX 1*SIZE(C0), xvec0, xvec0; ADD_DX xvec0, xvec15, xvec15; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; ADDQ %rax, ptrbb; #endif #if 
defined(TRMMKERNEL) && defined(LEFT) ADDQ $1, kk; #endif ADDQ $2*SIZE, C0; .L33_loopE: #if defined(TRMMKERNEL) && !defined(LEFT) ADDQ $1, kk; #endif MOVQ bk, k; SALQ $4*SIZE, k; ADDQ k, bb; LEAQ (C, ldc, 1), C; .L30_loopE: movq 0(%rsp), %rbx; movq 8(%rsp), %rbp; movq 16(%rsp), %r12; movq 24(%rsp), %r13; movq 32(%rsp), %r14; movq 40(%rsp), %r15; vzeroupper #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp; ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm_ncopy_1.S000066400000000000000000000121231313527062700202540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ #define N ARG2 /* rsi */ #define A ARG3 /* rdx */ #define LDA ARG4 /* rcx */ #define B ARG5 /* r8 */ #define I %r9 #define J %r10 #define AO1 %r11 #define AO2 %r12 #else #define STACKSIZE 256 #define M ARG1 /* rcx */ #define N ARG2 /* rdx */ #define A ARG3 /* r8 */ #define LDA ARG4 /* r9 */ #define OLD_B 40 + 24 + STACKSIZE(%rsp) #define B %r10 #define I %r11 #define J %r12 #define AO1 %r13 #define AO2 %r14 #endif #define RPREFETCHSIZE 32 #define WPREFETCHSIZE 48 PROLOGUE PROFCODE #ifdef WINDOWS_ABI pushq %r14 pushq %r13 #endif pushq %r12 #ifdef WINDOWS_ABI subq $STACKSIZE, %rsp movups %xmm6, 0(%rsp) movups %xmm7, 16(%rsp) movups %xmm8, 32(%rsp) movups %xmm9, 48(%rsp) movups %xmm10, 64(%rsp) movups %xmm11, 80(%rsp) movups %xmm12, 96(%rsp) movups %xmm13, 112(%rsp) movups %xmm14, 128(%rsp) movups %xmm15, 144(%rsp) movq OLD_B, B #endif salq $ZBASE_SHIFT, LDA testq N, N movq N, J jle .L999 ALIGN_4 .L12: movq A, AO1 addq LDA, A movq M, I sarq $2, I jle .L14 ALIGN_4 .L13: #ifndef DOUBLE movsd 0 * SIZE(AO1), %xmm0 movhps 2 * SIZE(AO1), %xmm0 movsd 4 * SIZE(AO1), %xmm1 movhps 6 * SIZE(AO1), %xmm1 movaps %xmm0, 0 * SIZE(B) movaps %xmm1, 4 * SIZE(B) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movsd 2 * SIZE(AO1), %xmm1 movhpd 3 * SIZE(AO1), %xmm1 movsd 4 * SIZE(AO1), %xmm2 movhpd 5 * SIZE(AO1), %xmm2 movsd 6 * SIZE(AO1), %xmm3 movhpd 7 * SIZE(AO1), %xmm3 prefetcht2 RPREFETCHSIZE * SIZE(AO1) movapd %xmm0, 0 * SIZE(B) movapd %xmm1, 2 * SIZE(B) movapd %xmm2, 4 * SIZE(B) movapd %xmm3, 6 * SIZE(B) prefetcht2 WPREFETCHSIZE * SIZE(B) #endif addq $8 * SIZE, AO1 addq $8 * SIZE, B decq I jg .L13 ALIGN_4 .L14: movq M, I andq $3, I jle .L16 ALIGN_4 .L15: #ifndef DOUBLE movsd 0 * SIZE(AO1), %xmm0 movsd %xmm0, 0 * SIZE(B) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movapd %xmm0, 0 * SIZE(B) #endif addq $2 * SIZE, AO1 addq $2 * SIZE, B decq I jg .L15 ALIGN_4 .L16: decq J jg .L12 ALIGN_4 .L999: #ifdef WINDOWS_ABI movups 0(%rsp), %xmm6 movups 16(%rsp), %xmm7 movups 32(%rsp), %xmm8 movups 48(%rsp), %xmm9 movups 64(%rsp), %xmm10 movups 80(%rsp), %xmm11 movups 96(%rsp), %xmm12 movups 112(%rsp), %xmm13 movups 128(%rsp), %xmm14 movups 144(%rsp), %xmm15 addq $STACKSIZE, %rsp #endif popq %r12 #ifdef WINDOWS_ABI popq %r13 popq %r14 #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm_ncopy_2.S000066400000000000000000000202231313527062700202550ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ #define N ARG2 /* rsi */ #define A ARG3 /* rdx */ #define LDA ARG4 /* rcx */ #define B ARG5 /* r8 */ #define I %r9 #define J %r10 #define AO1 %r11 #define AO2 %r12 #else #define STACKSIZE 256 #define M ARG1 /* rcx */ #define N ARG2 /* rdx */ #define A ARG3 /* r8 */ #define LDA ARG4 /* r9 */ #define OLD_B 40 + 24 + STACKSIZE(%rsp) #define B %r10 #define I %r11 #define J %r12 #define AO1 %r13 #define AO2 %r14 #endif #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) #define RPREFETCHSIZE 16 #define WPREFETCHSIZE 48 #endif #if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) #define RPREFETCHSIZE 32 #define WPREFETCHSIZE 80 #endif #ifdef OPTERON #define RPREFETCHSIZE 32 #define WPREFETCHSIZE 48 #define movsd movlpd #endif #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCHSIZE 32 #define WPREFETCHSIZE 48 #endif PROLOGUE PROFCODE #ifdef WINDOWS_ABI pushq %r14 pushq %r13 #endif pushq %r12 #ifdef WINDOWS_ABI subq $STACKSIZE, %rsp movups %xmm6, 0(%rsp) movups %xmm7, 16(%rsp) movups %xmm8, 32(%rsp) movups %xmm9, 48(%rsp) movups %xmm10, 64(%rsp) movups %xmm11, 80(%rsp) movups %xmm12, 96(%rsp) movups %xmm13, 112(%rsp) movups %xmm14, 128(%rsp) movups %xmm15, 144(%rsp) movq OLD_B, B #endif salq $ZBASE_SHIFT, LDA movq N, J sarq $1, J jle .L20 ALIGN_4 .L12: movq A, AO1 leaq (A, LDA), AO2 leaq (A, LDA, 2), A movq M, I sarq $2, I jle .L14 ALIGN_4 .L13: #ifdef HAVE_3DNOW prefetchw (WPREFETCHSIZE + 0) * SIZE(B) prefetchw (WPREFETCHSIZE + 8) * SIZE(B) #endif #ifndef DOUBLE movlps 0 * SIZE(AO1), %xmm0 movhps 0 * SIZE(AO2), %xmm0 movlps 2 * SIZE(AO1), %xmm1 movhps 2 * SIZE(AO2), %xmm1 movlps 4 * SIZE(AO1), %xmm2 movhps 4 * SIZE(AO2), %xmm2 movlps 6 * SIZE(AO1), %xmm3 movhps 6 * SIZE(AO2), %xmm3 #if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) prefetcht0 RPREFETCHSIZE * SIZE(AO1) prefetcht0 RPREFETCHSIZE * SIZE(AO2) prefetcht0 WPREFETCHSIZE * SIZE(B) #endif movaps %xmm0, 0 * SIZE(B) movaps %xmm1, 4 * SIZE(B) movaps %xmm2, 8 * SIZE(B) movaps %xmm3, 12 * SIZE(B) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO2), %xmm1 movhpd 1 * SIZE(AO2), %xmm1 #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) prefetcht2 RPREFETCHSIZE * SIZE(AO1) #endif movsd 2 * SIZE(AO1), %xmm2 
movhpd 3 * SIZE(AO1), %xmm2 movsd 2 * SIZE(AO2), %xmm3 movhpd 3 * SIZE(AO2), %xmm3 movsd 4 * SIZE(AO1), %xmm4 movhpd 5 * SIZE(AO1), %xmm4 movsd 4 * SIZE(AO2), %xmm5 movhpd 5 * SIZE(AO2), %xmm5 #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) prefetcht2 RPREFETCHSIZE * SIZE(AO2) #endif movsd 6 * SIZE(AO1), %xmm6 movhpd 7 * SIZE(AO1), %xmm6 movsd 6 * SIZE(AO2), %xmm7 movhpd 7 * SIZE(AO2), %xmm7 #if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) prefetcht0 RPREFETCHSIZE * SIZE(AO1) prefetcht0 RPREFETCHSIZE * SIZE(AO2) prefetcht0 WPREFETCHSIZE * SIZE(B) #endif movapd %xmm0, 0 * SIZE(B) movapd %xmm1, 2 * SIZE(B) movapd %xmm2, 4 * SIZE(B) movapd %xmm3, 6 * SIZE(B) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) prefetcht2 WPREFETCHSIZE * SIZE(B) #endif movapd %xmm4, 8 * SIZE(B) movapd %xmm5, 10 * SIZE(B) movapd %xmm6, 12 * SIZE(B) movapd %xmm7, 14 * SIZE(B) #endif addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 subq $-16 * SIZE, B decq I jg .L13 ALIGN_4 .L14: movq M, I andq $3, I jle .L16 ALIGN_4 .L15: #ifndef DOUBLE movlps 0 * SIZE(AO1), %xmm0 movhps 0 * SIZE(AO2), %xmm0 movaps %xmm0, 0 * SIZE(B) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO2), %xmm1 movhpd 1 * SIZE(AO2), %xmm1 movapd %xmm0, 0 * SIZE(B) movapd %xmm1, 2 * SIZE(B) #endif addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 addq $4 * SIZE, B decq I jg .L15 ALIGN_4 .L16: decq J jg .L12 ALIGN_4 .L20: testq $1, N jle .L999 movq A, AO1 movq M, I sarq $2, I jle .L24 ALIGN_4 .L23: #ifdef HAVE_3DNOW prefetchw (WPREFETCHSIZE + 0) * SIZE(B) prefetchw (WPREFETCHSIZE + 8) * SIZE(B) #endif #ifndef DOUBLE movlps 0 * SIZE(AO1), %xmm0 movhps 2 * SIZE(AO1), %xmm0 movlps 4 * SIZE(AO1), %xmm1 movhps 6 * SIZE(AO1), %xmm1 movaps %xmm0, 0 * SIZE(B) movaps %xmm1, 4 * SIZE(B) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movsd 2 * SIZE(AO1), %xmm1 movhpd 3 * SIZE(AO1), %xmm1 movsd 4 * SIZE(AO1), %xmm2 movhpd 5 * SIZE(AO1), %xmm2 movsd 6 * SIZE(AO1), %xmm3 movhpd 7 * SIZE(AO1), %xmm3 movapd %xmm0, 0 * SIZE(B) movapd %xmm1, 2 * SIZE(B) movapd %xmm2, 4 * SIZE(B) movapd %xmm3, 6 * SIZE(B) #endif #if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) prefetcht0 RPREFETCHSIZE * SIZE(AO1) prefetcht0 RPREFETCHSIZE * SIZE(AO2) prefetcht0 WPREFETCHSIZE * SIZE(B) #endif addq $8 * SIZE, AO1 addq $8 * SIZE, B decq I jg .L23 ALIGN_4 .L24: movq M, I andq $3, I jle .L999 ALIGN_4 .L25: #ifndef DOUBLE movlps 0 * SIZE(AO1), %xmm0 movlps %xmm0, 0 * SIZE(B) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movapd %xmm0, 0 * SIZE(B) #endif addq $2 * SIZE, AO1 addq $2 * SIZE, B decq I jg .L25 ALIGN_4 .L999: #ifdef WINDOWS_ABI movups 0(%rsp), %xmm6 movups 16(%rsp), %xmm7 movups 32(%rsp), %xmm8 movups 48(%rsp), %xmm9 movups 64(%rsp), %xmm10 movups 80(%rsp), %xmm11 movups 96(%rsp), %xmm12 movups 112(%rsp), %xmm13 movups 128(%rsp), %xmm14 movups 144(%rsp), %xmm15 addq $STACKSIZE, %rsp #endif popq %r12 #ifdef WINDOWS_ABI popq %r13 popq %r14 #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm_tcopy_1.S000066400000000000000000000115271313527062700202710ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ #define N ARG2 /* rsi */ #define A ARG3 /* rdx */ #define LDA ARG4 /* rcx */ #define B ARG5 /* r8 */ #define I %r9 #define J %r10 #define AO1 %r11 #define AO2 %r12 #else #define STACKSIZE 256 #define M ARG1 /* rcx */ #define N ARG2 /* rdx */ #define A ARG3 /* r8 */ #define LDA ARG4 /* r9 */ #define OLD_B 40 + 24 + STACKSIZE(%rsp) #define B %r10 #define I %r11 #define J %r12 #define AO1 %r13 #define AO2 %r14 #endif #define RPREFETCHSIZE 4 #define WPREFETCHSIZE 4 PROLOGUE PROFCODE #ifdef WINDOWS_ABI pushq %r14 pushq %r13 #endif pushq %r12 #ifdef WINDOWS_ABI subq $STACKSIZE, %rsp movups %xmm6, 0(%rsp) movups %xmm7, 16(%rsp) movups %xmm8, 32(%rsp) movups %xmm9, 48(%rsp) movups %xmm10, 64(%rsp) movups %xmm11, 80(%rsp) movups %xmm12, 96(%rsp) movups %xmm13, 112(%rsp) movups %xmm14, 128(%rsp) movups %xmm15, 144(%rsp) movq OLD_B, B #endif salq $ZBASE_SHIFT, LDA testq N, N movq N, J jle .L999 ALIGN_4 .L12: movq A, AO1 addq $2 * SIZE, A movq M, I sarq $1, I jle .L14 ALIGN_4 .L13: #ifndef DOUBLE movsd 0 * SIZE(AO1), %xmm0 movhps 0 * SIZE(AO1, LDA, 1), %xmm0 movaps %xmm0, 0 * SIZE(B) #else prefetcht0 RPREFETCHSIZE * SIZE(AO1) movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 prefetcht0 RPREFETCHSIZE * SIZE(AO1, LDA) movsd 0 * SIZE(AO1, LDA), %xmm1 movhpd 1 * SIZE(AO1, LDA), %xmm1 movapd %xmm0, 0 * SIZE(B) movapd %xmm1, 2 * SIZE(B) prefetcht0 WPREFETCHSIZE * SIZE(B) #endif leaq (AO1, LDA, 2), AO1 addq $4 * SIZE, B decq I jg .L13 ALIGN_4 .L14: testq $1, M jle .L16 #ifndef DOUBLE movsd 0 * SIZE(AO1), %xmm0 movsd %xmm0, 0 * SIZE(B) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movapd %xmm0, 0 * SIZE(B) #endif addq $2 * SIZE, B ALIGN_4 .L16: decq J jg .L12 ALIGN_4 .L999: #ifdef WINDOWS_ABI movups 0(%rsp), %xmm6 movups 16(%rsp), %xmm7 movups 32(%rsp), %xmm8 movups 48(%rsp), %xmm9 movups 64(%rsp), %xmm10 movups 80(%rsp), %xmm11 movups 96(%rsp), %xmm12 movups 112(%rsp), %xmm13 
movups 128(%rsp), %xmm14 movups 144(%rsp), %xmm15 addq $STACKSIZE, %rsp #endif popq %r12 #ifdef WINDOWS_ABI popq %r13 popq %r14 #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemm_tcopy_2.S000066400000000000000000000224661313527062700202760ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define RPREFETCHSIZE 32 #define WPREFETCHSIZE 16 #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ #define N ARG2 /* rsi */ #define A ARG3 /* rdx */ #define LDA ARG4 /* rcx */ #define B ARG5 /* r8 */ #define I %r9 #define J %r10 #define AO1 %r11 #define AO2 %r12 #define BO1 %r13 #define M8 %r14 #define BO %rax #else #define STACKSIZE 256 #define M ARG1 /* rcx */ #define N ARG2 /* rdx */ #define A ARG3 /* r8 */ #define LDA ARG4 /* r9 */ #define OLD_B 40 + 48 + STACKSIZE(%rsp) #define B %r10 #define I %r11 #define J %r12 #define AO1 %r13 #define AO2 %r14 #define BO1 %rdi #define M8 %rsi #define BO %rax #endif PROLOGUE PROFCODE #ifdef WINDOWS_ABI pushq %rdi pushq %rsi #endif pushq %r14 pushq %r13 pushq %r12 pushq %r11 #ifdef WINDOWS_ABI subq $STACKSIZE, %rsp movups %xmm6, 0(%rsp) movups %xmm7, 16(%rsp) movups %xmm8, 32(%rsp) movups %xmm9, 48(%rsp) movups %xmm10, 64(%rsp) movups %xmm11, 80(%rsp) movups %xmm12, 96(%rsp) movups %xmm13, 112(%rsp) movups %xmm14, 128(%rsp) movups %xmm15, 144(%rsp) movq OLD_B, B #endif movq N, %rax andq $-2, %rax imulq M, %rax salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), BO1 salq $ZBASE_SHIFT, LDA leaq (, M, SIZE), M8 movq M, J sarq $1, J jle .L20 ALIGN_4 .L11: movq A, AO1 leaq (A, LDA ), AO2 leaq (A, LDA, 2), A movq B, BO addq $8 * SIZE, B movq N, I sarq $2, I jle .L13 ALIGN_4 .L12: #ifndef DOUBLE movlps 0 * SIZE(AO1), %xmm0 movhps 2 * SIZE(AO1), %xmm0 movlps 4 * SIZE(AO1), %xmm1 movhps 6 * SIZE(AO1), %xmm1 movlps 0 * SIZE(AO2), %xmm2 movhps 2 * SIZE(AO2), %xmm2 movlps 4 * SIZE(AO2), %xmm3 movhps 6 * SIZE(AO2), %xmm3 #if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) prefetcht0 RPREFETCHSIZE * SIZE(AO1) prefetcht0 RPREFETCHSIZE * SIZE(AO2) prefetcht0 WPREFETCHSIZE * SIZE(BO) #endif #ifdef HAVE_3DNOW prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) #endif movaps %xmm0, 0 * SIZE(BO) movaps %xmm2, 4 * SIZE(BO) leaq (BO, M8, 4), BO movaps %xmm1, 0 * SIZE(BO) movaps %xmm3, 4 * SIZE(BO) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movsd 2 * SIZE(AO1), %xmm1 movhpd 3 * SIZE(AO1), %xmm1 #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) prefetcht2 RPREFETCHSIZE * SIZE(AO1) #endif movsd 4 * SIZE(AO1), %xmm2 movhpd 5 * SIZE(AO1), %xmm2 movsd 6 * SIZE(AO1), %xmm3 movhpd 7 * SIZE(AO1), %xmm3 movsd 0 * SIZE(AO2), %xmm4 movhpd 1 * SIZE(AO2), %xmm4 movsd 2 * SIZE(AO2), %xmm5 movhpd 3 * SIZE(AO2), %xmm5 #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) prefetcht2 RPREFETCHSIZE * SIZE(AO2) #endif movsd 4 * SIZE(AO2), %xmm6 movhpd 5 * SIZE(AO2), %xmm6 movsd 6 * SIZE(AO2), %xmm7 movhpd 7 * SIZE(AO2), %xmm7 #if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) prefetcht0 RPREFETCHSIZE * SIZE(AO1) prefetcht0 RPREFETCHSIZE * SIZE(AO2) prefetcht0 WPREFETCHSIZE * SIZE(BO) #endif #ifdef HAVE_3DNOW prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) prefetchw (WPREFETCHSIZE + 8) * SIZE(BO) #endif movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) movapd %xmm4, 4 * SIZE(BO) movapd %xmm5, 6 * SIZE(BO) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) prefetcht2 WPREFETCHSIZE * SIZE(BO) #endif leaq (BO, M8, 4), BO movapd %xmm2, 0 * SIZE(BO) movapd %xmm3, 2 * SIZE(BO) movapd %xmm6, 4 * SIZE(BO) movapd %xmm7, 6 * SIZE(BO) #endif addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 leaq (BO, M8, 4), BO decq I jg .L12 ALIGN_4 .L13: testq $2, N jle .L14 #ifndef DOUBLE movlps 0 * SIZE(AO1), %xmm0 movhps 2 * SIZE(AO1), %xmm0 movlps 0 * SIZE(AO2), %xmm1 movhps 2 * 
SIZE(AO2), %xmm1 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movsd 2 * SIZE(AO1), %xmm1 movhpd 3 * SIZE(AO1), %xmm1 movsd 0 * SIZE(AO2), %xmm2 movhpd 1 * SIZE(AO2), %xmm2 movsd 2 * SIZE(AO2), %xmm3 movhpd 3 * SIZE(AO2), %xmm3 movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) movapd %xmm2, 4 * SIZE(BO) movapd %xmm3, 6 * SIZE(BO) #endif addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 leaq (BO, M8, 4), BO ALIGN_4 .L14: testq $1, N jle .L19 #ifndef DOUBLE movlps 0 * SIZE(AO1), %xmm0 movhps 0 * SIZE(AO2), %xmm0 movaps %xmm0, 0 * SIZE(BO1) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movsd 0 * SIZE(AO2), %xmm1 movhpd 1 * SIZE(AO2), %xmm1 movapd %xmm0, 0 * SIZE(BO1) movapd %xmm1, 2 * SIZE(BO1) #endif addq $4 * SIZE, BO1 ALIGN_4 .L19: decq J jg .L11 ALIGN_4 .L20: testq $1, M jle .L999 ALIGN_4 .L21: movq A, AO1 movq B, BO movq N, I sarq $2, I jle .L23 ALIGN_4 .L22: #ifndef DOUBLE movlps 0 * SIZE(AO1), %xmm0 movhps 2 * SIZE(AO1), %xmm0 movlps 4 * SIZE(AO1), %xmm1 movhps 6 * SIZE(AO1), %xmm1 #if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) prefetcht0 RPREFETCHSIZE * SIZE(AO1) prefetcht0 WPREFETCHSIZE * SIZE(BO) #endif #ifdef HAVE_3DNOW prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) #endif movaps %xmm0, 0 * SIZE(BO) leaq (BO, M8, 4), BO movaps %xmm1, 0 * SIZE(BO) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movsd 2 * SIZE(AO1), %xmm1 movhpd 3 * SIZE(AO1), %xmm1 movsd 4 * SIZE(AO1), %xmm2 movhpd 5 * SIZE(AO1), %xmm2 movsd 6 * SIZE(AO1), %xmm3 movhpd 7 * SIZE(AO1), %xmm3 #if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) prefetcht0 RPREFETCHSIZE * SIZE(AO1) prefetcht0 WPREFETCHSIZE * SIZE(BO) #endif #ifdef HAVE_3DNOW prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) #endif movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) leaq (BO, M8, 4), BO movapd %xmm2, 0 * SIZE(BO) movapd %xmm3, 2 * SIZE(BO) #endif addq $8 * SIZE, AO1 leaq (BO, M8, 4), BO decq I jg .L22 ALIGN_4 .L23: testq $2, N jle .L24 #ifndef DOUBLE movlps 0 * SIZE(AO1), %xmm0 movhps 2 * SIZE(AO1), %xmm0 movaps %xmm0, 0 * SIZE(BO) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movsd 2 * SIZE(AO1), %xmm1 movhpd 3 * SIZE(AO1), %xmm1 movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) #endif addq $4 * SIZE, AO1 leaq (BO, M8, 4), BO ALIGN_4 .L24: testq $1, N jle .L999 #ifndef DOUBLE movlps 0 * SIZE(AO1), %xmm0 movlps %xmm0, 0 * SIZE(BO1) #else movsd 0 * SIZE(AO1), %xmm0 movhpd 1 * SIZE(AO1), %xmm0 movapd %xmm0, 0 * SIZE(BO1) #endif ALIGN_4 .L999: #ifdef WINDOWS_ABI movups 0(%rsp), %xmm6 movups 16(%rsp), %xmm7 movups 32(%rsp), %xmm8 movups 48(%rsp), %xmm9 movups 64(%rsp), %xmm10 movups 80(%rsp), %xmm11 movups 96(%rsp), %xmm12 movups 112(%rsp), %xmm13 movups 128(%rsp), %xmm14 movups 144(%rsp), %xmm15 addq $STACKSIZE, %rsp #endif popq %r11 popq %r12 popq %r13 popq %r14 #ifdef WINDOWS_ABI popq %rsi popq %rdi #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemv_n.S000066400000000000000000001513241313527062700171610ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "l2param.h" #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_INCX 8 + STACKSIZE(%rsp) #define OLD_Y 16 + STACKSIZE(%rsp) #define OLD_INCY 24 + STACKSIZE(%rsp) #define OLD_BUFFER 32 + STACKSIZE(%rsp) #define ALPHA_R 48 (%rsp) #define ALPHA_I 56 (%rsp) #define MMM 64(%rsp) #define NN 72(%rsp) #define AA 80(%rsp) #define XX 88(%rsp) #define LDAX 96(%rsp) #define ALPHAR 104(%rsp) #define ALPHAI 112(%rsp) #define M %rdi #define N %rsi #define A %rcx #define LDA %r8 #define X %r9 #define INCX %rdx #define Y %rbp #define INCY %r10 #else #define STACKSIZE 304 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_LDA 56 + STACKSIZE(%rsp) #define OLD_X 64 + STACKSIZE(%rsp) #define OLD_INCX 72 + STACKSIZE(%rsp) #define OLD_Y 80 + STACKSIZE(%rsp) #define OLD_INCY 88 + STACKSIZE(%rsp) #define OLD_BUFFER 96 + STACKSIZE(%rsp) #define ALPHA_R 224 (%rsp) #define ALPHA_I 232 (%rsp) #define MMM 240(%rsp) #define NN 248(%rsp) #define AA 256(%rsp) #define XX 264(%rsp) #define LDAX 272(%rsp) #define ALPHAR 280(%rsp) #define ALPHAI 288(%rsp) #define M %rcx #define N %rdx #define A %r8 #define LDA %r9 #define X %rdi #define INCX %rsi #define Y %rbp #define INCY %r10 #endif #define I %rax #define A1 %r12 #define A2 %r13 #define Y1 %r14 #define BUFFER %r15 #define J %r11 #undef SUBPD #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) #define SUBPD subpd #else #define SUBPD addpd #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq OLD_A, A movq OLD_LDA, LDA movq OLD_X, X movapd %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #endif movq A, AA movq N, NN movq M, MMM movq LDA, LDAX movq X, XX movq OLD_Y, Y movsd %xmm0,ALPHAR 
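/* Hedged note: the stores around this point stash the original arguments (A, N, M, LDA,
   X and both alpha components) in stack slots so that the .L0t loop below can walk M in
   chunks of 1 << 18 rows. MMM tracks how many rows remain, the saved values are reloaded
   for each chunk, and the final pass handles the leftover partial chunk. */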
movsd %xmm1,ALPHAI .L0t: xorq I,I addq $1,I salq $18,I subq I,MMM movq I,M movsd ALPHAR,%xmm0 movsd ALPHAI,%xmm1 jge .L00t movq MMM,M addq I,M jle .L999x .L00t: movq AA, A movq NN, N movq LDAX, LDA movq XX, X movq OLD_INCX, INCX # movq OLD_Y, Y movq OLD_INCY, INCY movq OLD_BUFFER, BUFFER salq $ZBASE_SHIFT, LDA salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY movlpd %xmm0, ALPHA_R movlpd %xmm1, ALPHA_I subq $-16 * SIZE, A testq M, M jle .L999 testq N, N jle .L999 ALIGN_3 movq BUFFER, Y1 pxor %xmm4, %xmm4 movq M, %rax addq $8, %rax sarq $3, %rax ALIGN_3 .L01: movapd %xmm4, 0 * SIZE(Y1) movapd %xmm4, 2 * SIZE(Y1) movapd %xmm4, 4 * SIZE(Y1) movapd %xmm4, 6 * SIZE(Y1) movapd %xmm4, 8 * SIZE(Y1) movapd %xmm4, 10 * SIZE(Y1) movapd %xmm4, 12 * SIZE(Y1) movapd %xmm4, 14 * SIZE(Y1) subq $-16 * SIZE, Y1 decq %rax jg .L01 ALIGN_3 .L10: #ifdef ALIGNED_ACCESS testq $SIZE, A jne .L100 #endif #if GEMV_UNROLL >= 4 cmpq $4, N jl .L20 ALIGN_3 .L11: subq $4, N leaq 16 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 2), A2 leaq (A, LDA, 4), A movsd 0 * SIZE(X), %xmm8 movhpd 1 * SIZE(X), %xmm8 addq INCX, X movsd 0 * SIZE(X), %xmm10 movhpd 1 * SIZE(X), %xmm10 addq INCX, X movsd 0 * SIZE(X), %xmm12 movhpd 1 * SIZE(X), %xmm12 addq INCX, X movsd 0 * SIZE(X), %xmm14 movhpd 1 * SIZE(X), %xmm14 addq INCX, X pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 shufps $0xc0, %xmm5, %xmm5 pshufd $0x4e, %xmm8, %xmm9 pshufd $0x4e, %xmm10, %xmm11 pshufd $0x4e, %xmm12, %xmm13 pshufd $0x4e, %xmm14, %xmm15 #ifdef HAVE_SSE3 movddup ALPHA_R, %xmm6 movddup ALPHA_I, %xmm7 #else movsd ALPHA_R, %xmm6 unpcklpd %xmm6, %xmm6 movsd ALPHA_I, %xmm7 unpcklpd %xmm7, %xmm7 #endif xorpd %xmm5, %xmm9 xorpd %xmm5, %xmm11 xorpd %xmm5, %xmm13 xorpd %xmm5, %xmm15 mulpd %xmm6, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm6, %xmm10 mulpd %xmm7, %xmm11 mulpd %xmm6, %xmm12 mulpd %xmm7, %xmm13 mulpd %xmm6, %xmm14 mulpd %xmm7, %xmm15 #ifndef XCONJ subpd %xmm9, %xmm8 subpd %xmm11, %xmm10 subpd %xmm13, %xmm12 subpd %xmm15, %xmm14 #else addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm13, %xmm12 addpd %xmm15, %xmm14 #endif pshufd $0xee, %xmm8, %xmm9 pshufd $0x44, %xmm8, %xmm8 pshufd $0xee, %xmm10, %xmm11 pshufd $0x44, %xmm10, %xmm10 pshufd $0xee, %xmm12, %xmm13 pshufd $0x44, %xmm12, %xmm12 pshufd $0xee, %xmm14, %xmm15 pshufd $0x44, %xmm14, %xmm14 #ifndef CONJ xorpd %xmm5, %xmm9 xorpd %xmm5, %xmm11 xorpd %xmm5, %xmm13 xorpd %xmm5, %xmm15 #else xorpd %xmm5, %xmm8 xorpd %xmm5, %xmm10 xorpd %xmm5, %xmm12 xorpd %xmm5, %xmm14 #endif MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) ALIGN_3 movq M, I sarq $2, I jle .L15 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A1(-14 * SIZE, A1, %xmm6) decq I jle .L14 ALIGN_3 .L13: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) #endif pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-12 * SIZE, A1, %xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm8, %xmm6 addpd %xmm6, %xmm1 MOVUPS_A1(-10 * SIZE, A1, %xmm6) mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm9, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm8, %xmm4 addpd %xmm4, %xmm2 MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm8, %xmm6 addpd %xmm6, %xmm3 MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm6) mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm2 mulpd %xmm9, %xmm7 SUBPD %xmm7, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) #endif pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm10, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A2(-12 * SIZE, A1, LDA, 1, 
%xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm10, %xmm6 addpd %xmm6, %xmm1 MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm6) mulpd %xmm11, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm11, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm10, %xmm4 addpd %xmm4, %xmm2 MOVUPS_A1(-16 * SIZE, A2, %xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm10, %xmm6 addpd %xmm6, %xmm3 MOVUPS_A1(-14 * SIZE, A2, %xmm6) mulpd %xmm11, %xmm5 SUBPD %xmm5, %xmm2 mulpd %xmm11, %xmm7 SUBPD %xmm7, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) #endif pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-12 * SIZE, A2, %xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm1 MOVUPS_A1(-10 * SIZE, A2, %xmm6) mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm2 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm3 MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm6) mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm2 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) #endif pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm14, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm1 MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm6) mulpd %xmm15, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm14, %xmm4 addpd %xmm4, %xmm2 MOVUPS_A1( -8 * SIZE, A1, %xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm3 MOVUPS_A1( -6 * SIZE, A1, %xmm6) mulpd %xmm15, %xmm5 SUBPD %xmm5, %xmm2 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L13 ALIGN_3 .L14: pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-12 * SIZE, A1, %xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm8, %xmm6 addpd %xmm6, %xmm1 MOVUPS_A1(-10 * SIZE, A1, %xmm6) mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm9, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm8, %xmm4 addpd %xmm4, %xmm2 MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm8, %xmm6 addpd %xmm6, %xmm3 MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm6) mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm2 mulpd %xmm9, %xmm7 SUBPD %xmm7, %xmm3 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm10, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm10, %xmm6 addpd %xmm6, %xmm1 MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm6) mulpd %xmm11, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm11, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm10, %xmm4 addpd %xmm4, %xmm2 MOVUPS_A1(-16 * SIZE, A2, %xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm10, %xmm6 addpd %xmm6, %xmm3 MOVUPS_A1(-14 * SIZE, A2, %xmm6) mulpd %xmm11, %xmm5 SUBPD %xmm5, %xmm2 mulpd %xmm11, %xmm7 SUBPD %xmm7, %xmm3 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-12 * SIZE, A2, %xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm1 MOVUPS_A1(-10 * SIZE, A2, %xmm6) mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 
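/* Descriptive, hedged note on the complex accumulation used throughout this loop: each
   column's scaled x element is kept as two duplicated doubles, one register holding the
   real part twice and one the imaginary part; pshufd $0x4e swaps the real/imaginary
   halves of the loaded A element, so the real-dup product is accumulated with addpd and
   the imag-dup product with SUBPD, which expands to subpd or addpd depending on the
   CONJ/XCONJ combination selected at the top of this file. */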
mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm2 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm3 MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm6) mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm2 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm3 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm14, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm1 MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm6) mulpd %xmm15, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm14, %xmm4 addpd %xmm4, %xmm2 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm3 mulpd %xmm15, %xmm5 SUBPD %xmm5, %xmm2 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm3 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 ALIGN_3 .L15: testq $2, M je .L17 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A1(-14 * SIZE, A1, %xmm6) pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm8, %xmm6 addpd %xmm6, %xmm1 MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm6) mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm9, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm10, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-16 * SIZE, A2, %xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm10, %xmm6 addpd %xmm6, %xmm1 MOVUPS_A1(-14 * SIZE, A2, %xmm6) mulpd %xmm11, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm11, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm1 MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm6) mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm14, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm15, %xmm5 SUBPD %xmm5, %xmm0 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm1 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm1 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) movapd %xmm2, %xmm0 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L17: testq $1, M je .L19 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm6) pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-16 * SIZE, A2, %xmm4) mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm0 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm10, %xmm6 addpd %xmm6, %xmm0 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm6) mulpd %xmm11, %xmm7 SUBPD %xmm7, %xmm0 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm0 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) ALIGN_3 .L19: cmpq $4, N jge .L11 ALIGN_3 .L20: #endif #if GEMV_UNROLL >= 2 cmpq $2, N jl .L30 #if GEMV_UNROLL == 2 ALIGN_3 .L21: #endif subq $2, N leaq 16 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 1), A2 leaq (A, LDA, 2), A movsd 0 * SIZE(X), %xmm12 movhpd 1 * SIZE(X), %xmm12 addq INCX, X movsd 0 * SIZE(X), %xmm14 movhpd 1 * SIZE(X), %xmm14 addq INCX, X pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 shufps $0xc0, %xmm11, %xmm11 pshufd $0x4e, 
%xmm12, %xmm13 pshufd $0x4e, %xmm14, %xmm15 #ifdef HAVE_SSE3 movddup ALPHA_R, %xmm8 movddup ALPHA_I, %xmm9 #else movsd ALPHA_R, %xmm8 unpcklpd %xmm8, %xmm8 movsd ALPHA_I, %xmm9 unpcklpd %xmm9, %xmm9 #endif xorpd %xmm11, %xmm13 xorpd %xmm11, %xmm15 mulpd %xmm8, %xmm12 mulpd %xmm9, %xmm13 mulpd %xmm8, %xmm14 mulpd %xmm9, %xmm15 #ifndef XCONJ subpd %xmm13, %xmm12 subpd %xmm15, %xmm14 #else addpd %xmm13, %xmm12 addpd %xmm15, %xmm14 #endif pshufd $0xee, %xmm12, %xmm13 pshufd $0x44, %xmm12, %xmm12 pshufd $0xee, %xmm14, %xmm15 pshufd $0x44, %xmm14, %xmm14 #ifndef CONJ xorpd %xmm11, %xmm13 xorpd %xmm11, %xmm15 #else xorpd %xmm11, %xmm12 xorpd %xmm11, %xmm14 #endif MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) ALIGN_3 movq M, I sarq $2, I jle .L25 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A1(-14 * SIZE, A1, %xmm6) MOVUPS_A1(-12 * SIZE, A1, %xmm8) MOVUPS_A1(-10 * SIZE, A1, %xmm10) decq I jle .L24 ALIGN_3 .L23: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-16 * SIZE, A2, %xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm1 MOVUPS_A1(-14 * SIZE, A2, %xmm6) pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm2 MOVUPS_A1(-12 * SIZE, A2, %xmm8) pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm3 MOVUPS_A1(-10 * SIZE, A2, %xmm10) mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm1 mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm2 mulpd %xmm13, %xmm11 SUBPD %xmm11, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm14, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1( -8 * SIZE, A1, %xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm1 MOVUPS_A1( -6 * SIZE, A1, %xmm6) pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm14, %xmm8 addpd %xmm8, %xmm2 MOVUPS_A1( -4 * SIZE, A1, %xmm8) pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm14, %xmm10 addpd %xmm10, %xmm3 MOVUPS_A1( -2 * SIZE, A1, %xmm10) mulpd %xmm15, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm1 mulpd %xmm15, %xmm9 SUBPD %xmm9, %xmm2 mulpd %xmm15, %xmm11 SUBPD %xmm11, %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L23 ALIGN_3 .L24: pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1(-16 * SIZE, A2, %xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm1 MOVUPS_A1(-14 * SIZE, A2, %xmm6) pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm2 MOVUPS_A1(-12 * SIZE, A2, %xmm8) pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm3 MOVUPS_A1(-10 * SIZE, A2, %xmm10) mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm1 mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm2 mulpd %xmm13, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm14, %xmm4 addpd %xmm4, %xmm0 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm1 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm14, %xmm8 addpd %xmm8, %xmm2 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm14, %xmm10 addpd %xmm10, %xmm3 mulpd %xmm15, 
%xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm1 mulpd %xmm15, %xmm9 SUBPD %xmm9, %xmm2 mulpd %xmm15, %xmm11 SUBPD %xmm11, %xmm3 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 ALIGN_3 .L25: testq $2, M je .L27 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A1(-14 * SIZE, A1, %xmm6) MOVUPS_A1(-16 * SIZE, A2, %xmm8) MOVUPS_A1(-14 * SIZE, A2, %xmm10) pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm1 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm14, %xmm8 addpd %xmm8, %xmm0 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm14, %xmm10 addpd %xmm10, %xmm1 mulpd %xmm15, %xmm9 SUBPD %xmm9, %xmm0 mulpd %xmm15, %xmm11 SUBPD %xmm11, %xmm1 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) movapd %xmm2, %xmm0 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L27: testq $1, M #if GEMV_UNROLL == 2 je .L29 #else je .L30 #endif MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A1(-16 * SIZE, A2, %xmm6) pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm0 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) #if GEMV_UNROLL == 2 ALIGN_3 .L29: cmpq $2, N jge .L21 #endif ALIGN_3 .L30: #endif cmpq $1, N jl .L980 #if GEMV_UNROLL == 1 .L31: decq N #endif leaq 16 * SIZE(BUFFER), Y1 movq A, A1 #if GEMV_UNROLL == 1 addq LDA, A #endif movsd 0 * SIZE(X), %xmm12 movhpd 1 * SIZE(X), %xmm12 addq INCX, X pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 shufps $0xc0, %xmm11, %xmm11 pshufd $0x4e, %xmm12, %xmm13 #ifdef HAVE_SSE3 movddup ALPHA_R, %xmm8 movddup ALPHA_I, %xmm9 #else movsd ALPHA_R, %xmm8 unpcklpd %xmm8, %xmm8 movsd ALPHA_I, %xmm9 unpcklpd %xmm9, %xmm9 #endif xorpd %xmm11, %xmm13 mulpd %xmm8, %xmm12 mulpd %xmm9, %xmm13 #ifndef XCONJ subpd %xmm13, %xmm12 #else addpd %xmm13, %xmm12 #endif pshufd $0xee, %xmm12, %xmm13 pshufd $0x44, %xmm12, %xmm12 #ifndef CONJ xorpd %xmm11, %xmm13 #else xorpd %xmm11, %xmm12 #endif MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) movq M, I sarq $2, I jle .L35 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A1(-14 * SIZE, A1, %xmm6) MOVUPS_A1(-12 * SIZE, A1, %xmm8) MOVUPS_A1(-10 * SIZE, A1, %xmm10) decq I jle .L34 ALIGN_3 .L33: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 MOVUPS_A1( -8 * SIZE, A1, %xmm4) pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm1 MOVUPS_A1( -6 * SIZE, A1, %xmm6) pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm2 MOVUPS_A1( -4 * SIZE, A1, %xmm8) pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm3 MOVUPS_A1( -2 * SIZE, A1, %xmm10) mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm1 mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm2 mulpd %xmm13, %xmm11 SUBPD %xmm11, %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) 
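/* write-back of the updated y block and preload of the next block of y (software pipelining); A1 and Y1 advance below */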
MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L33 ALIGN_3 .L34: pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm1 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm2 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm3 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm1 mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm2 mulpd %xmm13, %xmm11 SUBPD %xmm11, %xmm3 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, Y1 ALIGN_3 .L35: testq $2, M je .L37 MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A1(-14 * SIZE, A1, %xmm6) pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm1 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm1 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) movapd %xmm2, %xmm0 addq $4 * SIZE, A1 addq $4 * SIZE, Y1 ALIGN_3 .L37: testq $1, M #if GEMV_UNROLL == 1 je .L39 #else je .L980 #endif MOVUPS_A1(-16 * SIZE, A1, %xmm4) pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) #if GEMV_UNROLL == 1 ALIGN_3 .L39: cmpq $1, N jge .L31 #endif #ifdef ALIGNED_ACCESS jmp .L980 ALIGN_3 .L100: #if GEMV_UNROLL >= 4 cmpq $4, N jl .L110 ALIGN_3 .L101: subq $4, N leaq 16 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 2), A2 leaq (A, LDA, 4), A movsd 0 * SIZE(X), %xmm8 movhpd 1 * SIZE(X), %xmm8 addq INCX, X movsd 0 * SIZE(X), %xmm10 movhpd 1 * SIZE(X), %xmm10 addq INCX, X movsd 0 * SIZE(X), %xmm12 movhpd 1 * SIZE(X), %xmm12 addq INCX, X movsd 0 * SIZE(X), %xmm14 movhpd 1 * SIZE(X), %xmm14 addq INCX, X pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 shufps $0xc0, %xmm5, %xmm5 pshufd $0x4e, %xmm8, %xmm9 pshufd $0x4e, %xmm10, %xmm11 pshufd $0x4e, %xmm12, %xmm13 pshufd $0x4e, %xmm14, %xmm15 #ifdef HAVE_SSE3 movddup ALPHA_R, %xmm6 movddup ALPHA_I, %xmm7 #else movsd ALPHA_R, %xmm6 unpcklpd %xmm6, %xmm6 movsd ALPHA_I, %xmm7 unpcklpd %xmm7, %xmm7 #endif xorpd %xmm5, %xmm9 xorpd %xmm5, %xmm11 xorpd %xmm5, %xmm13 xorpd %xmm5, %xmm15 mulpd %xmm6, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm6, %xmm10 mulpd %xmm7, %xmm11 mulpd %xmm6, %xmm12 mulpd %xmm7, %xmm13 mulpd %xmm6, %xmm14 mulpd %xmm7, %xmm15 #ifndef XCONJ subpd %xmm9, %xmm8 subpd %xmm11, %xmm10 subpd %xmm13, %xmm12 subpd %xmm15, %xmm14 #else addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm13, %xmm12 addpd %xmm15, %xmm14 #endif pshufd $0xee, %xmm8, %xmm9 pshufd $0x44, %xmm8, %xmm8 pshufd $0xee, %xmm10, %xmm11 pshufd $0x44, %xmm10, %xmm10 pshufd $0xee, %xmm12, %xmm13 pshufd $0x44, %xmm12, %xmm12 pshufd $0xee, %xmm14, %xmm15 pshufd $0x44, %xmm14, %xmm14 #ifndef CONJ xorpd %xmm5, %xmm9 xorpd %xmm5, %xmm11 xorpd %xmm5, %xmm13 xorpd %xmm5, %xmm15 #else xorpd %xmm5, %xmm8 xorpd %xmm5, %xmm10 xorpd %xmm5, %xmm12 xorpd %xmm5, %xmm14 #endif MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) ALIGN_3 movq M, I sarq $2, I jle .L105 movsd 
-16 * SIZE(A1), %xmm4 movhpd -15 * SIZE(A1), %xmm4 movsd -14 * SIZE(A1), %xmm6 movhpd -13 * SIZE(A1), %xmm6 decq I jle .L104 ALIGN_3 .L103: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) #endif pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 movsd -12 * SIZE(A1), %xmm4 movhpd -11 * SIZE(A1), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm8, %xmm6 addpd %xmm6, %xmm1 movsd -10 * SIZE(A1), %xmm6 movhpd -9 * SIZE(A1), %xmm6 mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm9, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm8, %xmm4 addpd %xmm4, %xmm2 movsd -16 * SIZE(A1, LDA), %xmm4 movhpd -15 * SIZE(A1, LDA), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm8, %xmm6 addpd %xmm6, %xmm3 movsd -14 * SIZE(A1, LDA), %xmm6 movhpd -13 * SIZE(A1, LDA), %xmm6 mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm2 mulpd %xmm9, %xmm7 SUBPD %xmm7, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) #endif pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm10, %xmm4 addpd %xmm4, %xmm0 movsd -12 * SIZE(A1, LDA), %xmm4 movhpd -11 * SIZE(A1, LDA), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm10, %xmm6 addpd %xmm6, %xmm1 movsd -10 * SIZE(A1, LDA), %xmm6 movhpd -9 * SIZE(A1, LDA), %xmm6 mulpd %xmm11, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm11, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm10, %xmm4 addpd %xmm4, %xmm2 movsd -16 * SIZE(A2), %xmm4 movhpd -15 * SIZE(A2), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm10, %xmm6 addpd %xmm6, %xmm3 movsd -14 * SIZE(A2), %xmm6 movhpd -13 * SIZE(A2), %xmm6 mulpd %xmm11, %xmm5 SUBPD %xmm5, %xmm2 mulpd %xmm11, %xmm7 SUBPD %xmm7, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) #endif pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 movsd -12 * SIZE(A2), %xmm4 movhpd -11 * SIZE(A2), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm1 movsd -10 * SIZE(A2), %xmm6 movhpd -9 * SIZE(A2), %xmm6 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm2 movsd -16 * SIZE(A2, LDA), %xmm4 movhpd -15 * SIZE(A2, LDA), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm3 movsd -14 * SIZE(A2, LDA), %xmm6 movhpd -13 * SIZE(A2, LDA), %xmm6 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm2 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) #endif pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm14, %xmm4 addpd %xmm4, %xmm0 movsd -12 * SIZE(A2, LDA), %xmm4 movhpd -11 * SIZE(A2, LDA), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm1 movsd -10 * SIZE(A2, LDA), %xmm6 movhpd -9 * SIZE(A2, LDA), %xmm6 mulpd %xmm15, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm14, %xmm4 addpd %xmm4, %xmm2 movsd -8 * SIZE(A1), %xmm4 movhpd -7 * SIZE(A1), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm3 movsd -6 * SIZE(A1), %xmm6 movhpd -5 * SIZE(A1), %xmm6 mulpd %xmm15, %xmm5 SUBPD %xmm5, %xmm2 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L103 ALIGN_3 .L104: pshufd $0x4e, %xmm4, %xmm5 
mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 movsd -12 * SIZE(A1), %xmm4 movhpd -11 * SIZE(A1), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm8, %xmm6 addpd %xmm6, %xmm1 movsd -10 * SIZE(A1), %xmm6 movhpd -9 * SIZE(A1), %xmm6 mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm9, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm8, %xmm4 addpd %xmm4, %xmm2 movsd -16 * SIZE(A1, LDA), %xmm4 movhpd -15 * SIZE(A1, LDA), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm8, %xmm6 addpd %xmm6, %xmm3 movsd -14 * SIZE(A1, LDA), %xmm6 movhpd -13 * SIZE(A1, LDA), %xmm6 mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm2 mulpd %xmm9, %xmm7 SUBPD %xmm7, %xmm3 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm10, %xmm4 addpd %xmm4, %xmm0 movsd -12 * SIZE(A1, LDA), %xmm4 movhpd -11 * SIZE(A1, LDA), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm10, %xmm6 addpd %xmm6, %xmm1 movsd -10 * SIZE(A1, LDA), %xmm6 movhpd -9 * SIZE(A1, LDA), %xmm6 mulpd %xmm11, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm11, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm10, %xmm4 addpd %xmm4, %xmm2 movsd -16 * SIZE(A2), %xmm4 movhpd -15 * SIZE(A2), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm10, %xmm6 addpd %xmm6, %xmm3 movsd -14 * SIZE(A2), %xmm6 movhpd -13 * SIZE(A2), %xmm6 mulpd %xmm11, %xmm5 SUBPD %xmm5, %xmm2 mulpd %xmm11, %xmm7 SUBPD %xmm7, %xmm3 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 movsd -12 * SIZE(A2), %xmm4 movhpd -11 * SIZE(A2), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm1 movsd -10 * SIZE(A2), %xmm6 movhpd -9 * SIZE(A2), %xmm6 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm2 movsd -16 * SIZE(A2, LDA), %xmm4 movhpd -15 * SIZE(A2, LDA), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm3 movsd -14 * SIZE(A2, LDA), %xmm6 movhpd -13 * SIZE(A2, LDA), %xmm6 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm2 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm3 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm14, %xmm4 addpd %xmm4, %xmm0 movsd -12 * SIZE(A2, LDA), %xmm4 movhpd -11 * SIZE(A2, LDA), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm1 movsd -10 * SIZE(A2, LDA), %xmm6 movhpd -9 * SIZE(A2, LDA), %xmm6 mulpd %xmm15, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm14, %xmm4 addpd %xmm4, %xmm2 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm3 mulpd %xmm15, %xmm5 SUBPD %xmm5, %xmm2 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm3 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 ALIGN_3 .L105: testq $2, M je .L107 movsd -16 * SIZE(A1), %xmm4 movhpd -15 * SIZE(A1), %xmm4 movsd -14 * SIZE(A1), %xmm6 movhpd -13 * SIZE(A1), %xmm6 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 movsd -16 * SIZE(A1, LDA), %xmm4 movhpd -15 * SIZE(A1, LDA), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm8, %xmm6 addpd %xmm6, %xmm1 movsd -14 * SIZE(A1, LDA), %xmm6 movhpd -13 * SIZE(A1, LDA), %xmm6 mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm9, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm10, %xmm4 addpd %xmm4, %xmm0 movsd -16 * SIZE(A2), %xmm4 movhpd -15 * SIZE(A2), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm10, %xmm6 addpd %xmm6, %xmm1 movsd -14 * SIZE(A2), 
%xmm6 movhpd -13 * SIZE(A2), %xmm6 mulpd %xmm11, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm11, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 movsd -16 * SIZE(A2, LDA), %xmm4 movhpd -15 * SIZE(A2, LDA), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm1 movsd -14 * SIZE(A2, LDA), %xmm6 movhpd -13 * SIZE(A2, LDA), %xmm6 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm14, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm15, %xmm5 SUBPD %xmm5, %xmm0 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm1 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm1 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) movapd %xmm2, %xmm0 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L107: testq $1, M je .L109 movsd -16 * SIZE(A1), %xmm4 movhpd -15 * SIZE(A1), %xmm4 movsd -16 * SIZE(A1, LDA), %xmm6 movhpd -15 * SIZE(A1, LDA), %xmm6 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 movsd -16 * SIZE(A2), %xmm4 movhpd -15 * SIZE(A2), %xmm4 mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm0 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm10, %xmm6 addpd %xmm6, %xmm0 movsd -16 * SIZE(A2, LDA), %xmm6 movhpd -15 * SIZE(A2, LDA), %xmm6 mulpd %xmm11, %xmm7 SUBPD %xmm7, %xmm0 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm0 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) ALIGN_3 .L109: cmpq $4, N jge .L101 ALIGN_3 .L110: #endif #if GEMV_UNROLL >= 2 cmpq $2, N jl .L120 #if GEMV_UNROLL == 2 ALIGN_3 .L111: #endif subq $2, N leaq 16 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 1), A2 leaq (A, LDA, 2), A movsd 0 * SIZE(X), %xmm12 movhpd 1 * SIZE(X), %xmm12 addq INCX, X movsd 0 * SIZE(X), %xmm14 movhpd 1 * SIZE(X), %xmm14 addq INCX, X pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 shufps $0xc0, %xmm11, %xmm11 pshufd $0x4e, %xmm12, %xmm13 pshufd $0x4e, %xmm14, %xmm15 #ifdef HAVE_SSE3 movddup ALPHA_R, %xmm8 movddup ALPHA_I, %xmm9 #else movsd ALPHA_R, %xmm8 unpcklpd %xmm8, %xmm8 movsd ALPHA_I, %xmm9 unpcklpd %xmm9, %xmm9 #endif xorpd %xmm11, %xmm13 xorpd %xmm11, %xmm15 mulpd %xmm8, %xmm12 mulpd %xmm9, %xmm13 mulpd %xmm8, %xmm14 mulpd %xmm9, %xmm15 #ifndef XCONJ subpd %xmm13, %xmm12 subpd %xmm15, %xmm14 #else addpd %xmm13, %xmm12 addpd %xmm15, %xmm14 #endif pshufd $0xee, %xmm12, %xmm13 pshufd $0x44, %xmm12, %xmm12 pshufd $0xee, %xmm14, %xmm15 pshufd $0x44, %xmm14, %xmm14 #ifndef CONJ xorpd %xmm11, %xmm13 xorpd %xmm11, %xmm15 #else xorpd %xmm11, %xmm12 xorpd %xmm11, %xmm14 #endif MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) ALIGN_3 movq M, I sarq $2, I jle .L115 movsd -16 * SIZE(A1), %xmm4 movhpd -15 * SIZE(A1), %xmm4 movsd -14 * SIZE(A1), %xmm6 movhpd -13 * SIZE(A1), %xmm6 movsd -12 * SIZE(A1), %xmm8 movhpd -11 * SIZE(A1), %xmm8 movsd -10 * SIZE(A1), %xmm10 movhpd -9 * SIZE(A1), %xmm10 decq I jle .L114 ALIGN_3 .L113: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 movsd -16 * SIZE(A2), %xmm4 movhpd -15 * SIZE(A2), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm1 movsd -14 * SIZE(A2), %xmm6 movhpd -13 * SIZE(A2), %xmm6 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm2 movsd -12 * SIZE(A2), %xmm8 movhpd -11 * SIZE(A2), %xmm8 pshufd $0x4e, 
%xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm3 movsd -10 * SIZE(A2), %xmm10 movhpd -9 * SIZE(A2), %xmm10 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm1 mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm2 mulpd %xmm13, %xmm11 SUBPD %xmm11, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm14, %xmm4 addpd %xmm4, %xmm0 movsd -8 * SIZE(A1), %xmm4 movhpd -7 * SIZE(A1), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm1 movsd -6 * SIZE(A1), %xmm6 movhpd -5 * SIZE(A1), %xmm6 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm14, %xmm8 addpd %xmm8, %xmm2 movsd -4 * SIZE(A1), %xmm8 movhpd -3 * SIZE(A1), %xmm8 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm14, %xmm10 addpd %xmm10, %xmm3 movsd -2 * SIZE(A1), %xmm10 movhpd -1 * SIZE(A1), %xmm10 mulpd %xmm15, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm1 mulpd %xmm15, %xmm9 SUBPD %xmm9, %xmm2 mulpd %xmm15, %xmm11 SUBPD %xmm11, %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L113 ALIGN_3 .L114: pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 movsd -16 * SIZE(A2), %xmm4 movhpd -15 * SIZE(A2), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm1 movsd -14 * SIZE(A2), %xmm6 movhpd -13 * SIZE(A2), %xmm6 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm2 movsd -12 * SIZE(A2), %xmm8 movhpd -11 * SIZE(A2), %xmm8 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm3 movsd -10 * SIZE(A2), %xmm10 movhpd -9 * SIZE(A2), %xmm10 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm1 mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm2 mulpd %xmm13, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm14, %xmm4 addpd %xmm4, %xmm0 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm1 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm14, %xmm8 addpd %xmm8, %xmm2 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm14, %xmm10 addpd %xmm10, %xmm3 mulpd %xmm15, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm1 mulpd %xmm15, %xmm9 SUBPD %xmm9, %xmm2 mulpd %xmm15, %xmm11 SUBPD %xmm11, %xmm3 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 ALIGN_3 .L115: testq $2, M je .L117 movsd -16 * SIZE(A1), %xmm4 movhpd -15 * SIZE(A1), %xmm4 movsd -14 * SIZE(A1), %xmm6 movhpd -13 * SIZE(A1), %xmm6 movsd -16 * SIZE(A2), %xmm8 movhpd -15 * SIZE(A2), %xmm8 movsd -14 * SIZE(A2), %xmm10 movhpd -13 * SIZE(A2), %xmm10 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm1 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm1 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm14, %xmm8 addpd %xmm8, %xmm0 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm14, %xmm10 addpd %xmm10, %xmm1 mulpd %xmm15, %xmm9 SUBPD %xmm9, %xmm0 mulpd %xmm15, %xmm11 SUBPD %xmm11, %xmm1 
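/* M&2 tail: contributions from both columns are now accumulated in xmm0/xmm1; the following stores write the two updated complex y elements back to the buffer and advance A1, A2 and Y1 */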
MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) movapd %xmm2, %xmm0 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L117: testq $1, M #if GEMV_UNROLL == 2 je .L119 #else je .L120 #endif movsd -16 * SIZE(A1), %xmm4 movhpd -15 * SIZE(A1), %xmm4 movsd -16 * SIZE(A2), %xmm6 movhpd -15 * SIZE(A2), %xmm6 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm0 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) #if GEMV_UNROLL == 2 ALIGN_3 .L119: cmpq $2, N jge .L111 #endif ALIGN_3 .L120: #endif cmpq $1, N jl .L980 #if GEMV_UNROLL == 1 .L121: decq N #endif leaq 16 * SIZE(BUFFER), Y1 movq A, A1 #if GEMV_UNROLL == 1 addq LDA, A #endif movsd 0 * SIZE(X), %xmm12 movhpd 1 * SIZE(X), %xmm12 addq INCX, X pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 shufps $0xc0, %xmm11, %xmm11 pshufd $0x4e, %xmm12, %xmm13 #ifdef HAVE_SSE3 movddup ALPHA_R, %xmm8 movddup ALPHA_I, %xmm9 #else movsd ALPHA_R, %xmm8 unpcklpd %xmm8, %xmm8 movsd ALPHA_I, %xmm9 unpcklpd %xmm9, %xmm9 #endif xorpd %xmm11, %xmm13 mulpd %xmm8, %xmm12 mulpd %xmm9, %xmm13 #ifndef XCONJ subpd %xmm13, %xmm12 #else addpd %xmm13, %xmm12 #endif pshufd $0xee, %xmm12, %xmm13 pshufd $0x44, %xmm12, %xmm12 #ifndef CONJ xorpd %xmm11, %xmm13 #else xorpd %xmm11, %xmm12 #endif MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) movq M, I sarq $2, I jle .L125 movsd -16 * SIZE(A1), %xmm4 movhpd -15 * SIZE(A1), %xmm4 movsd -14 * SIZE(A1), %xmm6 movhpd -13 * SIZE(A1), %xmm6 movsd -12 * SIZE(A1), %xmm8 movhpd -11 * SIZE(A1), %xmm8 movsd -10 * SIZE(A1), %xmm10 movhpd -9 * SIZE(A1), %xmm10 decq I jle .L124 ALIGN_3 .L123: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 movsd -8 * SIZE(A1), %xmm4 movhpd -7 * SIZE(A1), %xmm4 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm1 movsd -6 * SIZE(A1), %xmm6 movhpd -5 * SIZE(A1), %xmm6 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm2 movsd -4 * SIZE(A1), %xmm8 movhpd -3 * SIZE(A1), %xmm8 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm3 movsd -2 * SIZE(A1), %xmm10 movhpd -1 * SIZE(A1), %xmm10 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm1 mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm2 mulpd %xmm13, %xmm11 SUBPD %xmm11, %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L123 ALIGN_3 .L124: pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm1 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm2 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm3 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm1 mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm2 mulpd %xmm13, %xmm11 SUBPD %xmm11, %xmm3 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) MOVUPS_YL1( -8 * SIZE, 
Y1, %xmm0) MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, Y1 ALIGN_3 .L125: testq $2, M je .L127 movsd -16 * SIZE(A1), %xmm4 movhpd -15 * SIZE(A1), %xmm4 movsd -14 * SIZE(A1), %xmm6 movhpd -13 * SIZE(A1), %xmm6 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm1 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm1 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) movapd %xmm2, %xmm0 addq $4 * SIZE, A1 addq $4 * SIZE, Y1 ALIGN_3 .L127: testq $1, M #if GEMV_UNROLL == 1 je .L129 #else je .L980 #endif movsd -16 * SIZE(A1), %xmm4 movhpd -15 * SIZE(A1), %xmm4 pshufd $0x4e, %xmm4, %xmm5 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) #if GEMV_UNROLL == 1 ALIGN_3 .L129: cmpq $1, N jge .L121 #endif #endif ALIGN_3 .L980: testq $SIZE, Y jne .L990 movq Y, Y1 movq M, %rax sarq $3, %rax jle .L184 ALIGN_3 .L182: movapd (Y), %xmm0 addq INCY, Y movapd (Y), %xmm1 addq INCY, Y movapd (Y), %xmm2 addq INCY, Y movapd (Y), %xmm3 addq INCY, Y movapd (Y), %xmm4 addq INCY, Y movapd (Y), %xmm5 addq INCY, Y movapd (Y), %xmm6 addq INCY, Y movapd (Y), %xmm7 addq INCY, Y addpd 0 * SIZE(BUFFER), %xmm0 addpd 2 * SIZE(BUFFER), %xmm1 addpd 4 * SIZE(BUFFER), %xmm2 addpd 6 * SIZE(BUFFER), %xmm3 addpd 8 * SIZE(BUFFER), %xmm4 addpd 10 * SIZE(BUFFER), %xmm5 addpd 12 * SIZE(BUFFER), %xmm6 addpd 14 * SIZE(BUFFER), %xmm7 movapd %xmm0, (Y1) addq INCY, Y1 movapd %xmm1, (Y1) addq INCY, Y1 movapd %xmm2, (Y1) addq INCY, Y1 movapd %xmm3, (Y1) addq INCY, Y1 movapd %xmm4, (Y1) addq INCY, Y1 movapd %xmm5, (Y1) addq INCY, Y1 movapd %xmm6, (Y1) addq INCY, Y1 movapd %xmm7, (Y1) addq INCY, Y1 subq $-16 * SIZE, BUFFER decq %rax jg .L182 ALIGN_3 .L184: testq $7, M jle .L999 testq $4, M jle .L185 movapd (Y), %xmm0 addq INCY, Y movapd (Y), %xmm1 addq INCY, Y movapd (Y), %xmm2 addq INCY, Y movapd (Y), %xmm3 addq INCY, Y addpd 0 * SIZE(BUFFER), %xmm0 addpd 2 * SIZE(BUFFER), %xmm1 addpd 4 * SIZE(BUFFER), %xmm2 addpd 6 * SIZE(BUFFER), %xmm3 movapd %xmm0, (Y1) addq INCY, Y1 movapd %xmm1, (Y1) addq INCY, Y1 movapd %xmm2, (Y1) addq INCY, Y1 movapd %xmm3, (Y1) addq INCY, Y1 addq $8 * SIZE, BUFFER ALIGN_3 .L185: testq $2, M jle .L186 movapd (Y), %xmm0 addq INCY, Y movapd (Y), %xmm1 addq INCY, Y addpd 0 * SIZE(BUFFER), %xmm0 addpd 2 * SIZE(BUFFER), %xmm1 movapd %xmm0, (Y1) addq INCY, Y1 movapd %xmm1, (Y1) addq INCY, Y1 addq $4 * SIZE, BUFFER ALIGN_3 .L186: testq $1, M jle .L999 movapd (Y), %xmm0 addpd (BUFFER), %xmm0 movapd %xmm0, (Y1) jmp .L999 ALIGN_3 .L990: movq Y, Y1 movq M, %rax sarq $3, %rax jle .L994 ALIGN_3 .L992: movsd 0 * SIZE(Y), %xmm0 movhpd 1 * SIZE(Y), %xmm0 addq INCY, Y movsd 0 * SIZE(Y), %xmm1 movhpd 1 * SIZE(Y), %xmm1 addq INCY, Y movsd 0 * SIZE(Y), %xmm2 movhpd 1 * SIZE(Y), %xmm2 addq INCY, Y movsd 0 * SIZE(Y), %xmm3 movhpd 1 * SIZE(Y), %xmm3 addq INCY, Y movsd 0 * SIZE(Y), %xmm4 movhpd 1 * SIZE(Y), %xmm4 addq INCY, Y movsd 0 * SIZE(Y), %xmm5 movhpd 1 * SIZE(Y), %xmm5 addq INCY, Y movsd 0 * SIZE(Y), %xmm6 movhpd 1 * SIZE(Y), %xmm6 addq INCY, Y movsd 0 * SIZE(Y), %xmm7 movhpd 1 * SIZE(Y), %xmm7 addq INCY, Y addpd 0 * SIZE(BUFFER), %xmm0 addpd 2 * SIZE(BUFFER), %xmm1 addpd 4 * SIZE(BUFFER), %xmm2 addpd 6 * SIZE(BUFFER), %xmm3 addpd 8 * SIZE(BUFFER), %xmm4 addpd 10 * SIZE(BUFFER), %xmm5 addpd 12 * SIZE(BUFFER), %xmm6 addpd 14 * SIZE(BUFFER), %xmm7 movlpd %xmm0, 0 * 
SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm1, 0 * SIZE(Y1) movhpd %xmm1, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm2, 0 * SIZE(Y1) movhpd %xmm2, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm3, 0 * SIZE(Y1) movhpd %xmm3, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm4, 0 * SIZE(Y1) movhpd %xmm4, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm5, 0 * SIZE(Y1) movhpd %xmm5, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm6, 0 * SIZE(Y1) movhpd %xmm6, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm7, 0 * SIZE(Y1) movhpd %xmm7, 1 * SIZE(Y1) addq INCY, Y1 subq $-16 * SIZE, BUFFER decq %rax jg .L992 ALIGN_3 .L994: testq $7, M jle .L999 testq $4, M jle .L995 movsd 0 * SIZE(Y), %xmm0 movhpd 1 * SIZE(Y), %xmm0 addq INCY, Y movsd 0 * SIZE(Y), %xmm1 movhpd 1 * SIZE(Y), %xmm1 addq INCY, Y movsd 0 * SIZE(Y), %xmm2 movhpd 1 * SIZE(Y), %xmm2 addq INCY, Y movsd 0 * SIZE(Y), %xmm3 movhpd 1 * SIZE(Y), %xmm3 addq INCY, Y addpd 0 * SIZE(BUFFER), %xmm0 addpd 2 * SIZE(BUFFER), %xmm1 addpd 4 * SIZE(BUFFER), %xmm2 addpd 6 * SIZE(BUFFER), %xmm3 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm1, 0 * SIZE(Y1) movhpd %xmm1, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm2, 0 * SIZE(Y1) movhpd %xmm2, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm3, 0 * SIZE(Y1) movhpd %xmm3, 1 * SIZE(Y1) addq INCY, Y1 addq $8 * SIZE, BUFFER ALIGN_3 .L995: testq $2, M jle .L996 movsd 0 * SIZE(Y), %xmm0 movhpd 1 * SIZE(Y), %xmm0 addq INCY, Y movsd 0 * SIZE(Y), %xmm1 movhpd 1 * SIZE(Y), %xmm1 addq INCY, Y addpd 0 * SIZE(BUFFER), %xmm0 addpd 2 * SIZE(BUFFER), %xmm1 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm1, 0 * SIZE(Y1) movhpd %xmm1, 1 * SIZE(Y1) addq INCY, Y1 addq $4 * SIZE, BUFFER ALIGN_3 .L996: testq $1, M jle .L999 movsd 0 * SIZE(Y), %xmm0 movhpd 1 * SIZE(Y), %xmm0 addpd 0 * SIZE(BUFFER), %xmm0 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) ALIGN_3 .L999: movq M, I salq $ZBASE_SHIFT,I addq I,AA jmp .L0t .L999x: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemv_n_4.c000066400000000000000000000370451313527062700174270ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include <stdlib.h> #include <stdio.h> #include "common.h" #if defined(HASWELL) || defined(ZEN) #include "zgemv_n_microk_haswell-4.c" #elif defined(SANDYBRIDGE) #include "zgemv_n_microk_sandy-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_n_microk_bulldozer-4.c" #endif #define NBMAX 1024 #ifndef HAVE_KERNEL_4x4 static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG i; FLOAT *a0,*a1,*a2,*a3; a0 = ap[0]; a1 = ap[1]; a2 = ap[2]; a3 = ap[3]; for ( i=0; i< 2*n; i+=2 ) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) y[i] += a0[i]*x[0] - a0[i+1] * x[1]; y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; y[i] += a1[i]*x[2] - a1[i+1] * x[3]; y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; y[i] += a2[i]*x[4] - a2[i+1] * x[5]; y[i+1] += a2[i]*x[5] + a2[i+1] * x[4]; y[i] += a3[i]*x[6] - a3[i+1] * x[7]; y[i+1] += a3[i]*x[7] + a3[i+1] * x[6]; #else y[i] += a0[i]*x[0] + a0[i+1] * x[1]; y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; y[i] += a1[i]*x[2] + a1[i+1] * x[3]; y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; y[i] += a2[i]*x[4] + a2[i+1] * x[5]; y[i+1] += a2[i]*x[5] - a2[i+1] * x[4]; y[i] += a3[i]*x[6] + a3[i+1] * x[7]; y[i+1] += a3[i]*x[7] - a3[i+1] * x[6]; #endif } } #endif #ifndef HAVE_KERNEL_4x2 static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG i; FLOAT *a0,*a1; a0 = ap[0]; a1 = ap[1]; for ( i=0; i< 2*n; i+=2 ) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) y[i] += a0[i]*x[0] - a0[i+1] * x[1]; y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; y[i] += a1[i]*x[2] - a1[i+1] * x[3]; y[i+1] += a1[i]*x[3] + a1[i+1] * x[2]; #else y[i] += a0[i]*x[0] + a0[i+1] * x[1]; y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; y[i] += a1[i]*x[2] + a1[i+1] * x[3]; y[i+1] += a1[i]*x[3] - a1[i+1] * x[2]; #endif } } #endif #ifndef HAVE_KERNEL_4x1 static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { BLASLONG i; FLOAT *a0; a0 = ap; for ( i=0; i< 2*n; i+=2 ) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) y[i] += a0[i]*x[0] - a0[i+1] * x[1]; y[i+1] += a0[i]*x[1] + a0[i+1] * x[0]; #else y[i] += a0[i]*x[0] + a0[i+1] * x[1]; y[i+1] += a0[i]*x[1] - a0[i+1] * x[0]; #endif } } #endif #ifndef HAVE_KERNEL_ADDY static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline)); static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) { BLASLONG i; if ( inc_dest != 2 ) { FLOAT temp_r; FLOAT temp_i; for ( i=0; i= 4 cmpq $4, N jl .L20 ALIGN_3 .L11: subq $4, N leaq 16 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 2), A2 leaq (A, LDA,
4), A movddup 0 * SIZE(X), %xmm8 movddup 1 * SIZE(X), %xmm9 addq INCX, X movddup 0 * SIZE(X), %xmm10 movddup 1 * SIZE(X), %xmm11 addq INCX, X movddup 0 * SIZE(X), %xmm12 movddup 1 * SIZE(X), %xmm13 addq INCX, X movddup 0 * SIZE(X), %xmm14 movddup 1 * SIZE(X), %xmm15 addq INCX, X pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 shufps $0x40, %xmm5, %xmm5 movsd ALPHA_R, %xmm6 movhps ALPHA_I, %xmm6 pshufd $0x4e, %xmm6, %xmm7 #ifndef XCONJ xorps %xmm5, %xmm7 #else xorps %xmm5, %xmm6 #endif mulpd %xmm6, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm6, %xmm10 mulpd %xmm7, %xmm11 mulpd %xmm6, %xmm12 mulpd %xmm7, %xmm13 mulpd %xmm6, %xmm14 mulpd %xmm7, %xmm15 #ifndef XCONJ subpd %xmm9, %xmm8 subpd %xmm11, %xmm10 subpd %xmm13, %xmm12 subpd %xmm15, %xmm14 #else addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm13, %xmm12 addpd %xmm15, %xmm14 #endif pshufd $0x4e, %xmm8, %xmm9 pshufd $0x4e, %xmm10, %xmm11 pshufd $0x4e, %xmm12, %xmm13 pshufd $0x4e, %xmm14, %xmm15 #ifndef XCONJ xorps %xmm5, %xmm9 xorps %xmm5, %xmm11 xorps %xmm5, %xmm13 xorps %xmm5, %xmm15 #else xorps %xmm5, %xmm8 xorps %xmm5, %xmm10 xorps %xmm5, %xmm12 xorps %xmm5, %xmm14 #endif MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) ALIGN_3 movq M, I sarq $2, I jle .L15 movddup -16 * SIZE(A1), %xmm4 movddup -14 * SIZE(A1), %xmm5 movddup -12 * SIZE(A1), %xmm6 movddup -10 * SIZE(A1), %xmm7 decq I jle .L14 ALIGN_3 .L13: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) #endif mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 movddup -15 * SIZE(A1), %xmm4 mulpd %xmm8, %xmm5 addpd %xmm5, %xmm1 movddup -13 * SIZE(A1), %xmm5 mulpd %xmm8, %xmm6 addpd %xmm6, %xmm2 movddup -11 * SIZE(A1), %xmm6 mulpd %xmm8, %xmm7 addpd %xmm7, %xmm3 movddup -9 * SIZE(A1), %xmm7 mulpd %xmm9, %xmm4 SUBPD %xmm4, %xmm0 movddup -16 * SIZE(A1, LDA), %xmm4 mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm1 movddup -14 * SIZE(A1, LDA), %xmm5 mulpd %xmm9, %xmm6 SUBPD %xmm6, %xmm2 movddup -12 * SIZE(A1, LDA), %xmm6 mulpd %xmm9, %xmm7 SUBPD %xmm7, %xmm3 movddup -10 * SIZE(A1, LDA), %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) #endif mulpd %xmm10, %xmm4 addpd %xmm4, %xmm0 movddup -15 * SIZE(A1, LDA), %xmm4 mulpd %xmm10, %xmm5 addpd %xmm5, %xmm1 movddup -13 * SIZE(A1, LDA), %xmm5 mulpd %xmm10, %xmm6 addpd %xmm6, %xmm2 movddup -11 * SIZE(A1, LDA), %xmm6 mulpd %xmm10, %xmm7 addpd %xmm7, %xmm3 movddup -9 * SIZE(A1, LDA), %xmm7 mulpd %xmm11, %xmm4 SUBPD %xmm4, %xmm0 movddup -16 * SIZE(A2), %xmm4 mulpd %xmm11, %xmm5 SUBPD %xmm5, %xmm1 movddup -14 * SIZE(A2), %xmm5 mulpd %xmm11, %xmm6 SUBPD %xmm6, %xmm2 movddup -12 * SIZE(A2), %xmm6 mulpd %xmm11, %xmm7 SUBPD %xmm7, %xmm3 movddup -10 * SIZE(A2), %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) #endif mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 movddup -15 * SIZE(A2), %xmm4 mulpd %xmm12, %xmm5 addpd %xmm5, %xmm1 movddup -13 * SIZE(A2), %xmm5 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm2 movddup -11 * SIZE(A2), %xmm6 mulpd %xmm12, %xmm7 addpd %xmm7, %xmm3 movddup -9 * SIZE(A2), %xmm7 mulpd %xmm13, %xmm4 SUBPD %xmm4, %xmm0 movddup -16 * SIZE(A2, LDA), %xmm4 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm1 movddup -14 * SIZE(A2, LDA), %xmm5 mulpd %xmm13, %xmm6 SUBPD %xmm6, %xmm2 movddup -12 * SIZE(A2, LDA), %xmm6 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm3 movddup -10 * SIZE(A2, LDA), %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) #endif mulpd %xmm14, %xmm4 addpd %xmm4, %xmm0 movddup -15 * SIZE(A2, LDA), %xmm4 mulpd %xmm14, %xmm5 addpd %xmm5, 
%xmm1 movddup -13 * SIZE(A2, LDA), %xmm5 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm2 movddup -11 * SIZE(A2, LDA), %xmm6 mulpd %xmm14, %xmm7 addpd %xmm7, %xmm3 movddup -9 * SIZE(A2, LDA), %xmm7 mulpd %xmm15, %xmm4 SUBPD %xmm4, %xmm0 movddup -8 * SIZE(A1), %xmm4 mulpd %xmm15, %xmm5 SUBPD %xmm5, %xmm1 movddup -6 * SIZE(A1), %xmm5 mulpd %xmm15, %xmm6 SUBPD %xmm6, %xmm2 movddup -4 * SIZE(A1), %xmm6 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm3 movddup -2 * SIZE(A1), %xmm7 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L13 ALIGN_3 .L14: mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 movddup -15 * SIZE(A1), %xmm4 mulpd %xmm8, %xmm5 addpd %xmm5, %xmm1 movddup -13 * SIZE(A1), %xmm5 mulpd %xmm8, %xmm6 addpd %xmm6, %xmm2 movddup -11 * SIZE(A1), %xmm6 mulpd %xmm8, %xmm7 addpd %xmm7, %xmm3 movddup -9 * SIZE(A1), %xmm7 mulpd %xmm9, %xmm4 SUBPD %xmm4, %xmm0 movddup -16 * SIZE(A1, LDA), %xmm4 mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm1 movddup -14 * SIZE(A1, LDA), %xmm5 mulpd %xmm9, %xmm6 SUBPD %xmm6, %xmm2 movddup -12 * SIZE(A1, LDA), %xmm6 mulpd %xmm9, %xmm7 SUBPD %xmm7, %xmm3 movddup -10 * SIZE(A1, LDA), %xmm7 mulpd %xmm10, %xmm4 addpd %xmm4, %xmm0 movddup -15 * SIZE(A1, LDA), %xmm4 mulpd %xmm10, %xmm5 addpd %xmm5, %xmm1 movddup -13 * SIZE(A1, LDA), %xmm5 mulpd %xmm10, %xmm6 addpd %xmm6, %xmm2 movddup -11 * SIZE(A1, LDA), %xmm6 mulpd %xmm10, %xmm7 addpd %xmm7, %xmm3 movddup -9 * SIZE(A1, LDA), %xmm7 mulpd %xmm11, %xmm4 SUBPD %xmm4, %xmm0 movddup -16 * SIZE(A2), %xmm4 mulpd %xmm11, %xmm5 SUBPD %xmm5, %xmm1 movddup -14 * SIZE(A2), %xmm5 mulpd %xmm11, %xmm6 SUBPD %xmm6, %xmm2 movddup -12 * SIZE(A2), %xmm6 mulpd %xmm11, %xmm7 SUBPD %xmm7, %xmm3 movddup -10 * SIZE(A2), %xmm7 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 movddup -15 * SIZE(A2), %xmm4 mulpd %xmm12, %xmm5 addpd %xmm5, %xmm1 movddup -13 * SIZE(A2), %xmm5 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm2 movddup -11 * SIZE(A2), %xmm6 mulpd %xmm12, %xmm7 addpd %xmm7, %xmm3 movddup -9 * SIZE(A2), %xmm7 mulpd %xmm13, %xmm4 SUBPD %xmm4, %xmm0 movddup -16 * SIZE(A2, LDA), %xmm4 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm1 movddup -14 * SIZE(A2, LDA), %xmm5 mulpd %xmm13, %xmm6 SUBPD %xmm6, %xmm2 movddup -12 * SIZE(A2, LDA), %xmm6 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm3 movddup -10 * SIZE(A2, LDA), %xmm7 mulpd %xmm14, %xmm4 addpd %xmm4, %xmm0 movddup -15 * SIZE(A2, LDA), %xmm4 mulpd %xmm14, %xmm5 addpd %xmm5, %xmm1 movddup -13 * SIZE(A2, LDA), %xmm5 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm2 movddup -11 * SIZE(A2, LDA), %xmm6 mulpd %xmm14, %xmm7 addpd %xmm7, %xmm3 movddup -9 * SIZE(A2, LDA), %xmm7 mulpd %xmm15, %xmm4 SUBPD %xmm4, %xmm0 mulpd %xmm15, %xmm5 SUBPD %xmm5, %xmm1 mulpd %xmm15, %xmm6 SUBPD %xmm6, %xmm2 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm3 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 ALIGN_3 .L15: testq $2, M je .L17 movddup -16 * SIZE(A1), %xmm4 movddup -15 * SIZE(A1), %xmm5 movddup -14 * SIZE(A1), %xmm6 movddup -13 * SIZE(A1), %xmm7 mulpd 
%xmm8, %xmm4 addpd %xmm4, %xmm0 movddup -16 * SIZE(A1, LDA, 1), %xmm4 mulpd %xmm8, %xmm6 addpd %xmm6, %xmm1 movddup -14 * SIZE(A1, LDA, 1), %xmm6 mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm0 movddup -15 * SIZE(A1, LDA, 1), %xmm5 mulpd %xmm9, %xmm7 SUBPD %xmm7, %xmm1 movddup -13 * SIZE(A1, LDA, 1), %xmm7 mulpd %xmm10, %xmm4 addpd %xmm4, %xmm0 movddup -16 * SIZE(A2), %xmm4 mulpd %xmm10, %xmm6 addpd %xmm6, %xmm1 movddup -14 * SIZE(A2), %xmm6 mulpd %xmm11, %xmm5 SUBPD %xmm5, %xmm0 movddup -15 * SIZE(A2), %xmm5 mulpd %xmm11, %xmm7 SUBPD %xmm7, %xmm1 movddup -13 * SIZE(A2), %xmm7 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 movddup -16 * SIZE(A2, LDA, 1), %xmm4 mulpd %xmm12, %xmm6 addpd %xmm6, %xmm1 movddup -14 * SIZE(A2, LDA, 1), %xmm6 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 movddup -15 * SIZE(A2, LDA, 1), %xmm5 mulpd %xmm13, %xmm7 SUBPD %xmm7, %xmm1 movddup -13 * SIZE(A2, LDA, 1), %xmm7 mulpd %xmm14, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm1 mulpd %xmm15, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm1 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) movaps %xmm2, %xmm0 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L17: testq $1, M je .L19 movddup -16 * SIZE(A1), %xmm4 movddup -15 * SIZE(A1), %xmm5 movddup -16 * SIZE(A1, LDA, 1), %xmm6 movddup -15 * SIZE(A1, LDA, 1), %xmm7 mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 movddup -16 * SIZE(A2), %xmm4 mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm0 movddup -15 * SIZE(A2), %xmm5 mulpd %xmm10, %xmm6 addpd %xmm6, %xmm0 movddup -16 * SIZE(A2, LDA, 1), %xmm6 mulpd %xmm11, %xmm7 SUBPD %xmm7, %xmm0 movddup -15 * SIZE(A2, LDA, 1), %xmm7 mulpd %xmm12, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm13, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm14, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm15, %xmm7 SUBPD %xmm7, %xmm0 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) ALIGN_3 .L19: cmpq $4, N jge .L11 ALIGN_3 .L20: #endif #if GEMV_UNROLL >= 2 cmpq $2, N jl .L30 #if GEMV_UNROLL == 2 ALIGN_3 .L21: #endif subq $2, N leaq 16 * SIZE(BUFFER), Y1 movq A, A1 leaq (A, LDA, 1), A2 leaq (A, LDA, 2), A movddup 0 * SIZE(X), %xmm8 movddup 1 * SIZE(X), %xmm9 addq INCX, X movddup 0 * SIZE(X), %xmm10 movddup 1 * SIZE(X), %xmm11 addq INCX, X pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 shufps $0x40, %xmm5, %xmm5 movsd ALPHA_R, %xmm6 movhps ALPHA_I, %xmm6 pshufd $0x4e, %xmm6, %xmm7 #ifndef XCONJ xorps %xmm5, %xmm7 #else xorps %xmm5, %xmm6 #endif mulpd %xmm6, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm6, %xmm10 mulpd %xmm7, %xmm11 #ifndef XCONJ subpd %xmm9, %xmm8 subpd %xmm11, %xmm10 #else addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 #endif pshufd $0x4e, %xmm8, %xmm9 pshufd $0x4e, %xmm10, %xmm11 #ifndef XCONJ xorps %xmm5, %xmm9 xorps %xmm5, %xmm11 #else xorps %xmm5, %xmm8 xorps %xmm5, %xmm10 #endif MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) movq M, I sarq $2, I jle .L25 movddup -16 * SIZE(A1), %xmm4 movddup -14 * SIZE(A1), %xmm5 movddup -12 * SIZE(A1), %xmm6 movddup -10 * SIZE(A1), %xmm7 decq I jle .L24 ALIGN_3 .L23: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 movddup -15 * SIZE(A1), %xmm4 mulpd %xmm8, %xmm5 addpd %xmm5, %xmm1 movddup -13 * SIZE(A1), %xmm5 mulpd %xmm8, %xmm6 addpd %xmm6, %xmm2 movddup -11 * SIZE(A1), %xmm6 mulpd %xmm8, %xmm7 addpd %xmm7, %xmm3 movddup -9 * SIZE(A1), %xmm7 mulpd %xmm9, %xmm4 SUBPD %xmm4, %xmm0 movddup -16 * SIZE(A2), %xmm4 mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm1 movddup -14 * SIZE(A2), %xmm5 mulpd %xmm9, 
%xmm6 SUBPD %xmm6, %xmm2 movddup -12 * SIZE(A2), %xmm6 mulpd %xmm9, %xmm7 SUBPD %xmm7, %xmm3 movddup -10 * SIZE(A2), %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif mulpd %xmm10, %xmm4 addpd %xmm4, %xmm0 movddup -15 * SIZE(A2), %xmm4 mulpd %xmm10, %xmm5 addpd %xmm5, %xmm1 movddup -13 * SIZE(A2), %xmm5 mulpd %xmm10, %xmm6 addpd %xmm6, %xmm2 movddup -11 * SIZE(A2), %xmm6 mulpd %xmm10, %xmm7 addpd %xmm7, %xmm3 movddup -9 * SIZE(A2), %xmm7 mulpd %xmm11, %xmm4 SUBPD %xmm4, %xmm0 movddup -8 * SIZE(A1), %xmm4 mulpd %xmm11, %xmm5 SUBPD %xmm5, %xmm1 movddup -6 * SIZE(A1), %xmm5 mulpd %xmm11, %xmm6 SUBPD %xmm6, %xmm2 movddup -4 * SIZE(A1), %xmm6 mulpd %xmm11, %xmm7 SUBPD %xmm7, %xmm3 movddup -2 * SIZE(A1), %xmm7 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L23 ALIGN_3 .L24: mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 movddup -15 * SIZE(A1), %xmm4 mulpd %xmm8, %xmm5 addpd %xmm5, %xmm1 movddup -13 * SIZE(A1), %xmm5 mulpd %xmm8, %xmm6 addpd %xmm6, %xmm2 movddup -11 * SIZE(A1), %xmm6 mulpd %xmm8, %xmm7 addpd %xmm7, %xmm3 movddup -9 * SIZE(A1), %xmm7 mulpd %xmm9, %xmm4 SUBPD %xmm4, %xmm0 movddup -16 * SIZE(A2), %xmm4 mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm1 movddup -14 * SIZE(A2), %xmm5 mulpd %xmm9, %xmm6 SUBPD %xmm6, %xmm2 movddup -12 * SIZE(A2), %xmm6 mulpd %xmm9, %xmm7 SUBPD %xmm7, %xmm3 movddup -10 * SIZE(A2), %xmm7 mulpd %xmm10, %xmm4 addpd %xmm4, %xmm0 movddup -15 * SIZE(A2), %xmm4 mulpd %xmm10, %xmm5 addpd %xmm5, %xmm1 movddup -13 * SIZE(A2), %xmm5 mulpd %xmm10, %xmm6 addpd %xmm6, %xmm2 movddup -11 * SIZE(A2), %xmm6 mulpd %xmm10, %xmm7 addpd %xmm7, %xmm3 movddup -9 * SIZE(A2), %xmm7 mulpd %xmm11, %xmm4 SUBPD %xmm4, %xmm0 mulpd %xmm11, %xmm5 SUBPD %xmm5, %xmm1 mulpd %xmm11, %xmm6 SUBPD %xmm6, %xmm2 mulpd %xmm11, %xmm7 SUBPD %xmm7, %xmm3 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, Y1 ALIGN_3 .L25: testq $2, M je .L27 movddup -16 * SIZE(A1), %xmm4 movddup -15 * SIZE(A1), %xmm5 movddup -14 * SIZE(A1), %xmm6 movddup -13 * SIZE(A1), %xmm7 mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 movddup -16 * SIZE(A2), %xmm4 mulpd %xmm8, %xmm6 addpd %xmm6, %xmm1 movddup -14 * SIZE(A2), %xmm6 mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm0 movddup -15 * SIZE(A2), %xmm5 mulpd %xmm9, %xmm7 SUBPD %xmm7, %xmm1 movddup -13 * SIZE(A2), %xmm7 mulpd %xmm10, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm10, %xmm6 addpd %xmm6, %xmm1 mulpd %xmm11, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm11, %xmm7 SUBPD %xmm7, %xmm1 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) movaps %xmm2, %xmm0 addq $4 * SIZE, A1 addq $4 * SIZE, A2 addq $4 * SIZE, Y1 ALIGN_3 .L27: testq $1, M #if GEMV_UNROLL == 2 je .L29 #else je .L30 #endif movddup -16 * SIZE(A1), %xmm4 movddup -15 * SIZE(A1), %xmm5 movddup -16 * SIZE(A2), %xmm6 movddup -15 * SIZE(A2), %xmm7 mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm10, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm11, %xmm7 
SUBPD %xmm7, %xmm0 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) #if GEMV_UNROLL == 2 ALIGN_3 .L29: cmpq $2, N jge .L21 #endif ALIGN_3 .L30: #endif cmpq $1, N jl .L980 #if GEMV_UNROLL == 1 .L31: decq N #endif leaq 16 * SIZE(BUFFER), Y1 movq A, A1 #if GEMV_UNROLL == 1 addq LDA, A #endif movddup 0 * SIZE(X), %xmm8 movddup 1 * SIZE(X), %xmm9 addq INCX, X pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 shufps $0x40, %xmm5, %xmm5 movsd ALPHA_R, %xmm6 movhps ALPHA_I, %xmm6 pshufd $0x4e, %xmm6, %xmm7 #ifndef XCONJ xorps %xmm5, %xmm7 #else xorps %xmm5, %xmm6 #endif mulpd %xmm6, %xmm8 mulpd %xmm7, %xmm9 #ifndef XCONJ subpd %xmm9, %xmm8 #else addpd %xmm9, %xmm8 #endif pshufd $0x4e, %xmm8, %xmm9 #ifndef XCONJ xorps %xmm5, %xmm9 #else xorps %xmm5, %xmm8 #endif MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) movq M, I sarq $2, I jle .L35 movddup -16 * SIZE(A1), %xmm4 movddup -14 * SIZE(A1), %xmm5 movddup -12 * SIZE(A1), %xmm6 movddup -10 * SIZE(A1), %xmm7 decq I jle .L34 ALIGN_3 .L33: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 movddup -15 * SIZE(A1), %xmm4 mulpd %xmm8, %xmm5 addpd %xmm5, %xmm1 movddup -13 * SIZE(A1), %xmm5 mulpd %xmm8, %xmm6 addpd %xmm6, %xmm2 movddup -11 * SIZE(A1), %xmm6 mulpd %xmm8, %xmm7 addpd %xmm7, %xmm3 movddup -9 * SIZE(A1), %xmm7 mulpd %xmm9, %xmm4 SUBPD %xmm4, %xmm0 movddup -8 * SIZE(A1), %xmm4 mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm1 movddup -6 * SIZE(A1), %xmm5 mulpd %xmm9, %xmm6 SUBPD %xmm6, %xmm2 movddup -4 * SIZE(A1), %xmm6 mulpd %xmm9, %xmm7 SUBPD %xmm7, %xmm3 movddup -2 * SIZE(A1), %xmm7 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) #endif MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, Y1 subq $1, I BRANCH jg .L33 ALIGN_3 .L34: mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 movddup -15 * SIZE(A1), %xmm4 mulpd %xmm8, %xmm5 addpd %xmm5, %xmm1 movddup -13 * SIZE(A1), %xmm5 mulpd %xmm8, %xmm6 addpd %xmm6, %xmm2 movddup -11 * SIZE(A1), %xmm6 mulpd %xmm8, %xmm7 addpd %xmm7, %xmm3 movddup -9 * SIZE(A1), %xmm7 mulpd %xmm9, %xmm4 SUBPD %xmm4, %xmm0 mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm1 mulpd %xmm9, %xmm6 SUBPD %xmm6, %xmm2 mulpd %xmm9, %xmm7 SUBPD %xmm7, %xmm3 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) subq $-8 * SIZE, A1 subq $-8 * SIZE, Y1 ALIGN_3 .L35: testq $2, M je .L37 movddup -16 * SIZE(A1), %xmm4 movddup -15 * SIZE(A1), %xmm5 movddup -14 * SIZE(A1), %xmm6 movddup -13 * SIZE(A1), %xmm7 mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm8, %xmm6 addpd %xmm6, %xmm1 mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm0 mulpd %xmm9, %xmm7 SUBPD %xmm7, %xmm1 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) movaps %xmm2, %xmm0 addq $4 * SIZE, A1 addq $4 * SIZE, Y1 ALIGN_3 .L37: testq $1, M #if GEMV_UNROLL == 1 je .L39 #else je .L980 #endif movddup -16 * SIZE(A1), %xmm4 movddup -15 * SIZE(A1), %xmm5 mulpd %xmm8, %xmm4 addpd %xmm4, %xmm0 mulpd %xmm9, %xmm5 SUBPD %xmm5, %xmm0 MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) #if GEMV_UNROLL == 1 ALIGN_3 .L39: cmpq $1, N 
jge .L31 #endif .L980: testq $SIZE, Y jne .L990 movq Y, Y1 movq M, %rax sarq $3, %rax jle .L184 ALIGN_3 .L182: movaps (Y), %xmm0 addq INCY, Y movaps (Y), %xmm1 addq INCY, Y movaps (Y), %xmm2 addq INCY, Y movaps (Y), %xmm3 addq INCY, Y movaps (Y), %xmm4 addq INCY, Y movaps (Y), %xmm5 addq INCY, Y movaps (Y), %xmm6 addq INCY, Y movaps (Y), %xmm7 addq INCY, Y addpd 0 * SIZE(BUFFER), %xmm0 addpd 2 * SIZE(BUFFER), %xmm1 addpd 4 * SIZE(BUFFER), %xmm2 addpd 6 * SIZE(BUFFER), %xmm3 addpd 8 * SIZE(BUFFER), %xmm4 addpd 10 * SIZE(BUFFER), %xmm5 addpd 12 * SIZE(BUFFER), %xmm6 addpd 14 * SIZE(BUFFER), %xmm7 movaps %xmm0, (Y1) addq INCY, Y1 movaps %xmm1, (Y1) addq INCY, Y1 movaps %xmm2, (Y1) addq INCY, Y1 movaps %xmm3, (Y1) addq INCY, Y1 movaps %xmm4, (Y1) addq INCY, Y1 movaps %xmm5, (Y1) addq INCY, Y1 movaps %xmm6, (Y1) addq INCY, Y1 movaps %xmm7, (Y1) addq INCY, Y1 subq $-16 * SIZE, BUFFER decq %rax jg .L182 ALIGN_3 .L184: testq $7, M jle .L999 testq $4, M jle .L185 movaps (Y), %xmm0 addq INCY, Y movaps (Y), %xmm1 addq INCY, Y movaps (Y), %xmm2 addq INCY, Y movaps (Y), %xmm3 addq INCY, Y addpd 0 * SIZE(BUFFER), %xmm0 addpd 2 * SIZE(BUFFER), %xmm1 addpd 4 * SIZE(BUFFER), %xmm2 addpd 6 * SIZE(BUFFER), %xmm3 movaps %xmm0, (Y1) addq INCY, Y1 movaps %xmm1, (Y1) addq INCY, Y1 movaps %xmm2, (Y1) addq INCY, Y1 movaps %xmm3, (Y1) addq INCY, Y1 addq $8 * SIZE, BUFFER ALIGN_3 .L185: testq $2, M jle .L186 movaps (Y), %xmm0 addq INCY, Y movaps (Y), %xmm1 addq INCY, Y addpd 0 * SIZE(BUFFER), %xmm0 addpd 2 * SIZE(BUFFER), %xmm1 movaps %xmm0, (Y1) addq INCY, Y1 movaps %xmm1, (Y1) addq INCY, Y1 addq $4 * SIZE, BUFFER ALIGN_3 .L186: testq $1, M jle .L999 movaps (Y), %xmm0 addpd (BUFFER), %xmm0 movaps %xmm0, (Y1) jmp .L999 ALIGN_3 .L990: movq Y, Y1 movq M, %rax sarq $3, %rax jle .L994 ALIGN_3 .L992: movsd 0 * SIZE(Y), %xmm0 movhpd 1 * SIZE(Y), %xmm0 addq INCY, Y movsd 0 * SIZE(Y), %xmm1 movhpd 1 * SIZE(Y), %xmm1 addq INCY, Y movsd 0 * SIZE(Y), %xmm2 movhpd 1 * SIZE(Y), %xmm2 addq INCY, Y movsd 0 * SIZE(Y), %xmm3 movhpd 1 * SIZE(Y), %xmm3 addq INCY, Y movsd 0 * SIZE(Y), %xmm4 movhpd 1 * SIZE(Y), %xmm4 addq INCY, Y movsd 0 * SIZE(Y), %xmm5 movhpd 1 * SIZE(Y), %xmm5 addq INCY, Y movsd 0 * SIZE(Y), %xmm6 movhpd 1 * SIZE(Y), %xmm6 addq INCY, Y movsd 0 * SIZE(Y), %xmm7 movhpd 1 * SIZE(Y), %xmm7 addq INCY, Y addpd 0 * SIZE(BUFFER), %xmm0 addpd 2 * SIZE(BUFFER), %xmm1 addpd 4 * SIZE(BUFFER), %xmm2 addpd 6 * SIZE(BUFFER), %xmm3 addpd 8 * SIZE(BUFFER), %xmm4 addpd 10 * SIZE(BUFFER), %xmm5 addpd 12 * SIZE(BUFFER), %xmm6 addpd 14 * SIZE(BUFFER), %xmm7 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm1, 0 * SIZE(Y1) movhpd %xmm1, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm2, 0 * SIZE(Y1) movhpd %xmm2, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm3, 0 * SIZE(Y1) movhpd %xmm3, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm4, 0 * SIZE(Y1) movhpd %xmm4, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm5, 0 * SIZE(Y1) movhpd %xmm5, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm6, 0 * SIZE(Y1) movhpd %xmm6, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm7, 0 * SIZE(Y1) movhpd %xmm7, 1 * SIZE(Y1) addq INCY, Y1 subq $-16 * SIZE, BUFFER decq %rax jg .L992 ALIGN_3 .L994: testq $7, M jle .L999 testq $4, M jle .L995 movsd 0 * SIZE(Y), %xmm0 movhpd 1 * SIZE(Y), %xmm0 addq INCY, Y movsd 0 * SIZE(Y), %xmm1 movhpd 1 * SIZE(Y), %xmm1 addq INCY, Y movsd 0 * SIZE(Y), %xmm2 movhpd 1 * SIZE(Y), %xmm2 addq INCY, Y movsd 0 * SIZE(Y), %xmm3 movhpd 1 * SIZE(Y), %xmm3 addq INCY, Y addpd 0 * SIZE(BUFFER), %xmm0 addpd 2 * SIZE(BUFFER), %xmm1 addpd 4 * SIZE(BUFFER), %xmm2 addpd 
6 * SIZE(BUFFER), %xmm3 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm1, 0 * SIZE(Y1) movhpd %xmm1, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm2, 0 * SIZE(Y1) movhpd %xmm2, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm3, 0 * SIZE(Y1) movhpd %xmm3, 1 * SIZE(Y1) addq INCY, Y1 addq $8 * SIZE, BUFFER ALIGN_3 .L995: testq $2, M jle .L996 movsd 0 * SIZE(Y), %xmm0 movhpd 1 * SIZE(Y), %xmm0 addq INCY, Y movsd 0 * SIZE(Y), %xmm1 movhpd 1 * SIZE(Y), %xmm1 addq INCY, Y addpd 0 * SIZE(BUFFER), %xmm0 addpd 2 * SIZE(BUFFER), %xmm1 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm1, 0 * SIZE(Y1) movhpd %xmm1, 1 * SIZE(Y1) addq INCY, Y1 addq $4 * SIZE, BUFFER ALIGN_3 .L996: testq $1, M jle .L999 movsd 0 * SIZE(Y), %xmm0 movhpd 1 * SIZE(Y), %xmm0 addpd 0 * SIZE(BUFFER), %xmm0 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) ALIGN_3 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemv_n_microk_bulldozer-4.c000066400000000000000000000507671313527062700230010ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_4x4 1 static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; if ( n > 384 ) { __asm__ __volatile__ ( "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0 "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1 "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1 "vbroadcastsd 32(%2), %%ymm4 \n\t" // real part x2 "vbroadcastsd 40(%2), %%ymm5 \n\t" // imag part x2 "vbroadcastsd 48(%2), %%ymm6 \n\t" // real part x3 "vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3 ".align 16 \n\t" "1: \n\t" "prefetcht0 512(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1 "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1 "prefetcht0 512(%5,%0,8) \n\t" "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i "vfmaddpd %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r "vfmaddpd %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i "prefetcht0 512(%6,%0,8) \n\t" "vfmaddpd %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r "vfmaddpd %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a2 "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a2 "vfmaddpd %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r "vfmaddpd %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i "vmovups (%7,%0,8), %%ymm10 \n\t" // 2 complex values form a3 "vmovups 32(%7,%0,8), %%ymm11 \n\t" // 2 complex values form a3 "vfmaddpd %%ymm14, %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r "vfmaddpd %%ymm15, %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i "prefetcht0 512(%7,%0,8) \n\t" "vfmaddpd %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r "vfmaddpd %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i "vfmaddpd %%ymm14, %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r "vfmaddpd %%ymm15, %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i "vmovups (%3,%0,8), %%ymm10 \n\t" "vmovups 32(%3,%0,8), %%ymm11 \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" #else "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" 
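/* A note on the reduction step for the conjugated variants (a reading of the code, not part of the original comments): ymm12/ymm14 hold the a*x_r partial products and ymm13/ymm15 the a*x_i partial products, each stored as (real, imag) pairs. vpermilpd $0x5 swaps the two doubles inside every 128-bit lane, and vaddsubpd subtracts in the even slots while adding in the odd ones, so together with the final swap below this branch effectively yields Re = a_r*x_r + a_i*x_i and Im = a_r*x_i - a_i*x_r per complex element. */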
"vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" #endif "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y "vmovups %%ymm13, 32(%3,%0,8) \n\t" "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" "jnz 1b \n\t" "2: \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } else { __asm__ __volatile__ ( "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0 "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1 "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1 "vbroadcastsd 32(%2), %%ymm4 \n\t" // real part x2 "vbroadcastsd 40(%2), %%ymm5 \n\t" // imag part x2 "vbroadcastsd 48(%2), %%ymm6 \n\t" // real part x3 "vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3 ".align 16 \n\t" "1: \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1 "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1 "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i "vfmaddpd %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r "vfmaddpd %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i "vfmaddpd %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r "vfmaddpd %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a2 "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a2 "vfmaddpd %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r "vfmaddpd %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i "vmovups (%7,%0,8), %%ymm10 \n\t" // 2 complex values form a3 "vmovups 32(%7,%0,8), %%ymm11 \n\t" // 2 complex values form a3 "vfmaddpd %%ymm14, %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r "vfmaddpd %%ymm15, %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i "vfmaddpd %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r "vfmaddpd %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i "vfmaddpd %%ymm14, %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r "vfmaddpd %%ymm15, %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i "vmovups (%3,%0,8), %%ymm10 \n\t" "vmovups 32(%3,%0,8), %%ymm11 \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" "vpermilpd $0x5 , %%ymm15, 
%%ymm15 \n\t" "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" #else "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" #endif "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y "vmovups %%ymm13, 32(%3,%0,8) \n\t" "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" "jnz 1b \n\t" "2: \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } } #define HAVE_KERNEL_4x2 1 static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0 "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1 "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1 // ".align 16 \n\t" "1: \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1 "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1 "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i "vfmaddpd %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r "vfmaddpd %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i "vfmaddpd %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r "vfmaddpd %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i "vmovups (%3,%0,8), %%ymm10 \n\t" "vmovups 32(%3,%0,8), %%ymm11 \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" #else "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" #endif "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y "vmovups %%ymm13, 32(%3,%0,8) \n\t" "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]) // 5 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", 
"memory" ); } #define HAVE_KERNEL_4x1 1 static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0 "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 // ".align 16 \n\t" "1: \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i "vmovups (%3,%0,8), %%ymm10 \n\t" "vmovups 32(%3,%0,8), %%ymm11 \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" #else "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" #endif "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y "vmovups %%ymm13, 32(%3,%0,8) \n\t" "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap) // 4 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #define HAVE_KERNEL_ADDY 1 static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline)); static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) { BLASLONG i; if ( inc_dest != 2 ) { FLOAT temp_r; FLOAT temp_i; for ( i=0; i= 4 cmpq $4, N jl .L20 ALIGN_3 .L11: subq $4, N leaq 16 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA, 2), A2 leaq (A1, LDA, 4), A MOVUPS_XL1(-16 * SIZE, X1, %xmm12) xorpd %xmm0, %xmm0 xorpd %xmm1, %xmm1 xorpd %xmm2, %xmm2 xorpd %xmm3, %xmm3 MOVUPS_XL1(-14 * SIZE, X1, %xmm13) xorpd %xmm4, %xmm4 xorpd %xmm5, %xmm5 xorpd %xmm6, %xmm6 xorpd %xmm7, %xmm7 #ifdef PREFETCHW PREFETCHW 3 * SIZE(Y1) #endif movq M, I sarq $2, I jle .L15 MOVUPS_A1(-16 * SIZE, A1, %xmm8) MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm10) decq I jle .L14 ALIGN_3 .L13: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) #endif pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-16 * SIZE, A2, %xmm8) mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10) mulpd %xmm12, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 MOVUPS_A1(-14 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm10) mulpd %xmm12, %xmm11 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) SUBPD %xmm11, %xmm7 
#ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) #endif pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-14 * SIZE, A2, %xmm8) mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm10) mulpd %xmm13, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm4 MOVUPS_A1(-12 * SIZE, A1, %xmm8) mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm6 MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm10) mulpd %xmm13, %xmm11 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) SUBPD %xmm11, %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) #endif pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-12 * SIZE, A2, %xmm8) mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm10) mulpd %xmm12, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 MOVUPS_A1(-10 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm10) mulpd %xmm12, %xmm11 MOVUPS_XL1( -8 * SIZE, X1, %xmm12) SUBPD %xmm11, %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) #endif pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-10 * SIZE, A2, %xmm8) mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm10) mulpd %xmm13, %xmm11 SUBPD %xmm11, %xmm3 #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1) #endif pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm4 MOVUPS_A1( -8 * SIZE, A1, %xmm8) mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm6 MOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm10) mulpd %xmm13, %xmm11 MOVUPS_XL1( -6 * SIZE, X1, %xmm13) SUBPD %xmm11, %xmm7 subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, X1 subq $1, I BRANCH jg .L13 ALIGN_3 .L14: pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-16 * SIZE, A2, %xmm8) mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10) mulpd %xmm12, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 MOVUPS_A1(-14 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm10) mulpd %xmm12, %xmm11 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) SUBPD %xmm11, %xmm7 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-14 * SIZE, A2, %xmm8) mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm10) mulpd %xmm13, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm4 MOVUPS_A1(-12 * SIZE, A1, %xmm8) mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm6 MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm10) mulpd %xmm13, %xmm11 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) SUBPD %xmm11, %xmm7 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-12 * 
SIZE, A2, %xmm8) mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm10) mulpd %xmm12, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 MOVUPS_A1(-10 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm10) mulpd %xmm12, %xmm11 MOVUPS_XL1( -8 * SIZE, X1, %xmm12) SUBPD %xmm11, %xmm7 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-10 * SIZE, A2, %xmm8) mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm10) mulpd %xmm13, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm4 mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm6 mulpd %xmm13, %xmm11 MOVUPS_XL1( -6 * SIZE, X1, %xmm13) SUBPD %xmm11, %xmm7 subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, X1 ALIGN_3 .L15: testq $2, M je .L17 MOVUPS_A1(-16 * SIZE, A1, %xmm8) MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm10) pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-16 * SIZE, A2, %xmm8) mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10) mulpd %xmm12, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 MOVUPS_A1(-14 * SIZE, A1, %xmm8) mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm10) mulpd %xmm12, %xmm11 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) SUBPD %xmm11, %xmm7 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-14 * SIZE, A2, %xmm8) mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm10) mulpd %xmm13, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm4 mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm6 mulpd %xmm13, %xmm11 SUBPD %xmm11, %xmm7 addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L17: testq $1, M je .L19 MOVUPS_A1(-16 * SIZE, A1, %xmm8) MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm10) pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-16 * SIZE, A2, %xmm8) mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10) mulpd %xmm12, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 mulpd %xmm12, %xmm11 SUBPD %xmm11, %xmm7 ALIGN_3 .L19: pcmpeqb %xmm13, %xmm13 psllq $63, %xmm13 shufps $0xc0, %xmm13, %xmm13 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorpd %xmm13, %xmm0 xorpd %xmm13, %xmm2 xorpd %xmm13, %xmm4 xorpd %xmm13, %xmm6 #else xorpd %xmm13, %xmm1 xorpd %xmm13, %xmm3 xorpd %xmm13, %xmm5 xorpd %xmm13, %xmm7 #endif #ifdef HAVE_SSE3 haddpd %xmm1, %xmm0 haddpd %xmm3, %xmm2 haddpd %xmm5, %xmm4 haddpd %xmm7, %xmm6 #else movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm2, %xmm9 unpcklpd 
%xmm3, %xmm2 unpckhpd %xmm3, %xmm9 movapd %xmm4, %xmm10 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm10 movapd %xmm6, %xmm11 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm11 addpd %xmm8, %xmm0 addpd %xmm9, %xmm2 addpd %xmm10, %xmm4 addpd %xmm11, %xmm6 #endif pshufd $0x4e, %xmm0, %xmm1 pshufd $0x4e, %xmm2, %xmm3 pshufd $0x4e, %xmm4, %xmm5 pshufd $0x4e, %xmm6, %xmm7 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm1 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm3 mulpd ALPHA_R, %xmm4 mulpd ALPHA_I, %xmm5 mulpd ALPHA_R, %xmm6 mulpd ALPHA_I, %xmm7 xorpd %xmm13, %xmm1 xorpd %xmm13, %xmm3 xorpd %xmm13, %xmm5 xorpd %xmm13, %xmm7 subpd %xmm1, %xmm0 subpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movsd 0 * SIZE(Y), %xmm1 movhpd 1 * SIZE(Y), %xmm1 addq INCY, Y movsd 0 * SIZE(Y), %xmm3 movhpd 1 * SIZE(Y), %xmm3 addq INCY, Y movsd 0 * SIZE(Y), %xmm5 movhpd 1 * SIZE(Y), %xmm5 addq INCY, Y movsd 0 * SIZE(Y), %xmm7 movhpd 1 * SIZE(Y), %xmm7 addq INCY, Y addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm2, 0 * SIZE(Y1) movhpd %xmm2, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm4, 0 * SIZE(Y1) movhpd %xmm4, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm6, 0 * SIZE(Y1) movhpd %xmm6, 1 * SIZE(Y1) addq INCY, Y1 cmpq $4, N jge .L11 ALIGN_3 .L20: #endif #if GEMV_UNROLL >= 2 cmpq $2, N jl .L30 #if GEMV_UNROLL == 2 ALIGN_3 .L21: #endif subq $2, N leaq 16 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA), A2 leaq (A1, LDA, 2), A xorpd %xmm0, %xmm0 xorpd %xmm1, %xmm1 xorpd %xmm2, %xmm2 xorpd %xmm3, %xmm3 MOVUPS_XL1(-16 * SIZE, X1, %xmm4) MOVUPS_XL1(-14 * SIZE, X1, %xmm5) #ifdef PREFETCHW PREFETCHW 3 * SIZE(Y1) #endif movq M, I sarq $2, I jle .L25 MOVUPS_A1(-16 * SIZE, A1, %xmm8) MOVUPS_A1(-16 * SIZE, A2, %xmm10) MOVUPS_A1(-14 * SIZE, A1, %xmm12) MOVUPS_A1(-14 * SIZE, A2, %xmm6) decq I jle .L24 ALIGN_3 .L23: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-12 * SIZE, A1, %xmm8) mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm4, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A1(-12 * SIZE, A2, %xmm10) mulpd %xmm4, %xmm11 SUBPD %xmm11, %xmm3 MOVUPS_XL1(-12 * SIZE, X1, %xmm4) pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm0 MOVUPS_A1(-10 * SIZE, A1, %xmm12) mulpd %xmm5, %xmm13 SUBPD %xmm13, %xmm1 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm5, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A1(-10 * SIZE, A2, %xmm6) mulpd %xmm5, %xmm7 SUBPD %xmm7, %xmm3 MOVUPS_XL1(-10 * SIZE, X1, %xmm5) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1( -8 * SIZE, A1, %xmm8) mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm4, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A1( -8 * SIZE, A2, %xmm10) mulpd %xmm4, %xmm11 SUBPD %xmm11, %xmm3 MOVUPS_XL1( -8 * SIZE, X1, %xmm4) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) #endif pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm0 MOVUPS_A1( -6 * SIZE, A1, %xmm12) mulpd %xmm5, %xmm13 SUBPD %xmm13, %xmm1 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm5, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A1( -6 * SIZE, A2, %xmm6) mulpd %xmm5, %xmm7 SUBPD %xmm7, %xmm3 MOVUPS_XL1( -6 * SIZE, X1, %xmm5) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, X1 subq $1, I BRANCH jg .L23 ALIGN_3 .L24: pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-12 * SIZE, A1, %xmm8) 
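/* .L24 looks like the peeled last pass of the two-column loop at .L23: it consumes the values the loop body already has in flight but drops the prefetch hints and the trailing loads that would feed a next iteration, after which .L25/.L27 handle the leftover rows (M mod 4). The column tiering seen above applies here as well: four columns at a time in the .L1x block, two in .L2x, one in .L3x. */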
mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm4, %xmm10 addpd %xmm10, %xmm2 MOVUPS_A1(-12 * SIZE, A2, %xmm10) mulpd %xmm4, %xmm11 SUBPD %xmm11, %xmm3 MOVUPS_XL1(-12 * SIZE, X1, %xmm4) pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm0 MOVUPS_A1(-10 * SIZE, A1, %xmm12) mulpd %xmm5, %xmm13 SUBPD %xmm13, %xmm1 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm5, %xmm6 addpd %xmm6, %xmm2 MOVUPS_A1(-10 * SIZE, A2, %xmm6) mulpd %xmm5, %xmm7 SUBPD %xmm7, %xmm3 MOVUPS_XL1(-10 * SIZE, X1, %xmm5) pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm4, %xmm10 addpd %xmm10, %xmm2 mulpd %xmm4, %xmm11 SUBPD %xmm11, %xmm3 MOVUPS_XL1( -8 * SIZE, X1, %xmm4) pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm0 mulpd %xmm5, %xmm13 SUBPD %xmm13, %xmm1 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm5, %xmm6 addpd %xmm6, %xmm2 mulpd %xmm5, %xmm7 SUBPD %xmm7, %xmm3 MOVUPS_XL1( -6 * SIZE, X1, %xmm5) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, X1 ALIGN_3 .L25: testq $2, M je .L27 MOVUPS_A1(-16 * SIZE, A1, %xmm8) MOVUPS_A1(-16 * SIZE, A2, %xmm10) MOVUPS_A1(-14 * SIZE, A1, %xmm12) MOVUPS_A1(-14 * SIZE, A2, %xmm6) pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm4, %xmm10 addpd %xmm10, %xmm2 mulpd %xmm4, %xmm11 SUBPD %xmm11, %xmm3 MOVUPS_XL1(-12 * SIZE, X1, %xmm4) pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm0 mulpd %xmm5, %xmm13 SUBPD %xmm13, %xmm1 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm5, %xmm6 addpd %xmm6, %xmm2 mulpd %xmm5, %xmm7 SUBPD %xmm7, %xmm3 addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L27: testq $1, M je .L29 MOVUPS_A1(-16 * SIZE, A1, %xmm8) MOVUPS_A1(-16 * SIZE, A2, %xmm10) pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm4, %xmm10 addpd %xmm10, %xmm2 mulpd %xmm4, %xmm11 SUBPD %xmm11, %xmm3 ALIGN_3 .L29: pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 shufps $0xc0, %xmm11, %xmm11 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorpd %xmm11, %xmm0 xorpd %xmm11, %xmm2 #else xorpd %xmm11, %xmm1 xorpd %xmm11, %xmm3 #endif #ifdef HAVE_SSE3 haddpd %xmm1, %xmm0 haddpd %xmm3, %xmm2 #else movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm2, %xmm9 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm9 addpd %xmm8, %xmm0 addpd %xmm9, %xmm2 #endif pshufd $0x4e, %xmm0, %xmm1 pshufd $0x4e, %xmm2, %xmm3 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm1 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm3 xorpd %xmm11, %xmm1 xorpd %xmm11, %xmm3 subpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movsd 0 * SIZE(Y), %xmm4 movhpd 1 * SIZE(Y), %xmm4 addq INCY, Y movsd 0 * SIZE(Y), %xmm5 movhpd 1 * SIZE(Y), %xmm5 addq INCY, Y addpd %xmm4, %xmm0 addpd %xmm5, %xmm2 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm2, 0 * SIZE(Y1) movhpd %xmm2, 1 * SIZE(Y1) addq INCY, Y1 #if GEMV_UNROLL == 2 cmpq $2, N jge .L21 #endif ALIGN_3 .L30: #endif cmpq $1, N jl .L999 #if GEMV_UNROLL == 1 .L31: decq N #endif leaq 16 * SIZE(BUFFER), X1 movq A, A1 #if GEMV_UNROLL == 1 addq LDA, A #endif xorpd %xmm0, %xmm0 xorpd %xmm1, %xmm1 MOVUPS_XL1(-16 * SIZE, X1, %xmm4) MOVUPS_XL1(-14 * SIZE, X1, %xmm5) movq M, I sarq $2, I jle .L35 MOVUPS_A1(-16 * SIZE, A1, %xmm8) MOVUPS_A1(-14 * SIZE, A1, %xmm12) decq I jle .L34 ALIGN_3 .L33: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 
128 + PREOFFSET(A1) #endif pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-12 * SIZE, A1, %xmm8) mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 MOVUPS_XL1(-12 * SIZE, X1, %xmm4) pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm0 MOVUPS_A1(-10 * SIZE, A1, %xmm12) mulpd %xmm5, %xmm13 SUBPD %xmm13, %xmm1 MOVUPS_XL1(-10 * SIZE, X1, %xmm5) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) #endif pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1( -8 * SIZE, A1, %xmm8) mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 MOVUPS_XL1( -8 * SIZE, X1, %xmm4) pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm0 MOVUPS_A1( -6 * SIZE, A1, %xmm12) mulpd %xmm5, %xmm13 SUBPD %xmm13, %xmm1 MOVUPS_XL1(-6 * SIZE, X1, %xmm5) subq $-8 * SIZE, A1 subq $-8 * SIZE, X1 subq $1, I BRANCH jg .L33 ALIGN_3 .L34: pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 MOVUPS_A1(-12 * SIZE, A1, %xmm8) mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 MOVUPS_XL1(-12 * SIZE, X1, %xmm4) pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm0 MOVUPS_A1(-10 * SIZE, A1, %xmm12) mulpd %xmm5, %xmm13 SUBPD %xmm13, %xmm1 MOVUPS_XL1(-10 * SIZE, X1, %xmm5) pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 MOVUPS_XL1( -8 * SIZE, X1, %xmm4) pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm0 mulpd %xmm5, %xmm13 SUBPD %xmm13, %xmm1 MOVUPS_XL1(-6 * SIZE, X1, %xmm5) subq $-8 * SIZE, A1 subq $-8 * SIZE, X1 ALIGN_3 .L35: testq $2, M je .L37 MOVUPS_A1(-16 * SIZE, A1, %xmm8) MOVUPS_A1(-14 * SIZE, A1, %xmm12) pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 MOVUPS_XL1(-12 * SIZE, X1, %xmm4) pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm0 mulpd %xmm5, %xmm13 SUBPD %xmm13, %xmm1 addq $4 * SIZE, A1 ALIGN_3 .L37: testq $1, M je .L39 MOVUPS_A1(-16 * SIZE, A1, %xmm8) pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 ALIGN_3 .L39: pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 shufps $0xc0, %xmm11, %xmm11 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorpd %xmm11, %xmm0 #else xorpd %xmm11, %xmm1 #endif #ifdef HAVE_SSE3 haddpd %xmm1, %xmm0 #else movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 addpd %xmm8, %xmm0 #endif pshufd $0x4e, %xmm0, %xmm1 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm1 xorpd %xmm11, %xmm1 subpd %xmm1, %xmm0 movsd 0 * SIZE(Y), %xmm4 movhpd 1 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) #if GEMV_UNROLL == 1 addq INCY, Y addq INCY, Y1 cmpq $1, N jge .L31 #endif #ifdef ALIGNED_ACCESS jmp .L999 ALIGN_3 .L100: #if GEMV_UNROLL >= 4 cmpq $4, N jl .L110 ALIGN_3 .L101: subq $4, N leaq 16 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA, 2), A2 leaq (A1, LDA, 4), A MOVUPS_XL1(-16 * SIZE, X1, %xmm12) xorpd %xmm0, %xmm0 xorpd %xmm1, %xmm1 xorpd %xmm2, %xmm2 xorpd %xmm3, %xmm3 MOVUPS_XL1(-14 * SIZE, X1, %xmm13) xorpd %xmm4, %xmm4 xorpd %xmm5, %xmm5 xorpd %xmm6, %xmm6 xorpd %xmm7, %xmm7 #ifdef PREFETCHW PREFETCHW 3 * SIZE(Y1) #endif movq M, I sarq $2, I jle .L105 movsd -16 * SIZE(A1), %xmm8 movhpd -15 * SIZE(A1), %xmm8 movsd -16 * SIZE(A1, LDA), %xmm10 movhpd -15 * SIZE(A1, LDA), %xmm10 decq I jle .L104 ALIGN_3 .L103: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) #endif pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 movsd -16 * SIZE(A2), %xmm8 movhpd 
-15 * SIZE(A2), %xmm8 mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 movsd -16 * SIZE(A2, LDA), %xmm10 movhpd -15 * SIZE(A2, LDA), %xmm10 mulpd %xmm12, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 movsd -14 * SIZE(A1), %xmm8 movhpd -13 * SIZE(A1), %xmm8 mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 movsd -14 * SIZE(A1, LDA), %xmm10 movhpd -13 * SIZE(A1, LDA), %xmm10 mulpd %xmm12, %xmm11 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) SUBPD %xmm11, %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) #endif pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 movsd -14 * SIZE(A2), %xmm8 movhpd -13 * SIZE(A2), %xmm8 mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 movsd -14 * SIZE(A2, LDA), %xmm10 movhpd -13 * SIZE(A2, LDA), %xmm10 mulpd %xmm13, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm4 movsd -12 * SIZE(A1), %xmm8 movhpd -11 * SIZE(A1), %xmm8 mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm6 movsd -12 * SIZE(A1, LDA), %xmm10 movhpd -11 * SIZE(A1, LDA), %xmm10 mulpd %xmm13, %xmm11 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) SUBPD %xmm11, %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) #endif pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 movsd -12 * SIZE(A2), %xmm8 movhpd -11 * SIZE(A2), %xmm8 mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 movsd -12 * SIZE(A2, LDA), %xmm10 movhpd -11 * SIZE(A2, LDA), %xmm10 mulpd %xmm12, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 movsd -10 * SIZE(A1), %xmm8 movhpd -9 * SIZE(A1), %xmm8 mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 movsd -10 * SIZE(A1, LDA), %xmm10 movhpd -9 * SIZE(A1, LDA), %xmm10 mulpd %xmm12, %xmm11 MOVUPS_XL1( -8 * SIZE, X1, %xmm12) SUBPD %xmm11, %xmm7 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) #endif pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 movsd -10 * SIZE(A2), %xmm8 movhpd -9 * SIZE(A2), %xmm8 mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 movsd -10 * SIZE(A2, LDA), %xmm10 movhpd -9 * SIZE(A2, LDA), %xmm10 mulpd %xmm13, %xmm11 SUBPD %xmm11, %xmm3 #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1) #endif pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm4 movsd -8 * SIZE(A1), %xmm8 movhpd -7 * SIZE(A1), %xmm8 mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm6 movsd -8 * SIZE(A1, LDA), %xmm10 movhpd -7 * SIZE(A1, LDA), %xmm10 mulpd %xmm13, %xmm11 MOVUPS_XL1( -6 * SIZE, X1, %xmm13) SUBPD %xmm11, %xmm7 subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, X1 subq $1, I BRANCH jg .L103 ALIGN_3 .L104: pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 movsd -16 * SIZE(A2), %xmm8 movhpd -15 * SIZE(A2), %xmm8 mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 movsd -16 * SIZE(A2, LDA), %xmm10 movhpd -15 * SIZE(A2, LDA), %xmm10 mulpd %xmm12, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd 
%xmm8, %xmm4 movsd -14 * SIZE(A1), %xmm8 movhpd -13 * SIZE(A1), %xmm8 mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 movsd -14 * SIZE(A1, LDA), %xmm10 movhpd -13 * SIZE(A1, LDA), %xmm10 mulpd %xmm12, %xmm11 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) SUBPD %xmm11, %xmm7 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 movsd -14 * SIZE(A2), %xmm8 movhpd -13 * SIZE(A2), %xmm8 mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 movsd -14 * SIZE(A2, LDA), %xmm10 movhpd -13 * SIZE(A2, LDA), %xmm10 mulpd %xmm13, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm4 movsd -12 * SIZE(A1), %xmm8 movhpd -11 * SIZE(A1), %xmm8 mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm6 movsd -12 * SIZE(A1, LDA), %xmm10 movhpd -11 * SIZE(A1, LDA), %xmm10 mulpd %xmm13, %xmm11 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) SUBPD %xmm11, %xmm7 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 movsd -12 * SIZE(A2), %xmm8 movhpd -11 * SIZE(A2), %xmm8 mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 movsd -12 * SIZE(A2, LDA), %xmm10 movhpd -11 * SIZE(A2, LDA), %xmm10 mulpd %xmm12, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 movsd -10 * SIZE(A1), %xmm8 movhpd -9 * SIZE(A1), %xmm8 mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 movsd -10 * SIZE(A1, LDA), %xmm10 movhpd -9 * SIZE(A1, LDA), %xmm10 mulpd %xmm12, %xmm11 MOVUPS_XL1( -8 * SIZE, X1, %xmm12) SUBPD %xmm11, %xmm7 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 movsd -10 * SIZE(A2), %xmm8 movhpd -9 * SIZE(A2), %xmm8 mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 movsd -10 * SIZE(A2, LDA), %xmm10 movhpd -9 * SIZE(A2, LDA), %xmm10 mulpd %xmm13, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm4 mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm6 mulpd %xmm13, %xmm11 MOVUPS_XL1( -6 * SIZE, X1, %xmm13) SUBPD %xmm11, %xmm7 subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, X1 ALIGN_3 .L105: testq $2, M je .L107 movsd -16 * SIZE(A1), %xmm8 movhpd -15 * SIZE(A1), %xmm8 movsd -16 * SIZE(A1, LDA), %xmm10 movhpd -15 * SIZE(A1, LDA), %xmm10 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 movsd -16 * SIZE(A2), %xmm8 movhpd -15 * SIZE(A2), %xmm8 mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 movsd -16 * SIZE(A2, LDA), %xmm10 movhpd -15 * SIZE(A2, LDA), %xmm10 mulpd %xmm12, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 movsd -14 * SIZE(A1), %xmm8 movhpd -13 * SIZE(A1), %xmm8 mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 movsd -14 * SIZE(A1, LDA), %xmm10 movhpd -13 * SIZE(A1, LDA), %xmm10 mulpd %xmm12, %xmm11 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) SUBPD %xmm11, %xmm7 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 movsd -14 * SIZE(A2), %xmm8 movhpd -13 * SIZE(A2), %xmm8 mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 movsd -14 * SIZE(A2, LDA), 
%xmm10 movhpd -13 * SIZE(A2, LDA), %xmm10 mulpd %xmm13, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm4 mulpd %xmm13, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm6 mulpd %xmm13, %xmm11 SUBPD %xmm11, %xmm7 addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L107: testq $1, M je .L109 movsd -16 * SIZE(A1), %xmm8 movhpd -15 * SIZE(A1), %xmm8 movsd -16 * SIZE(A1, LDA), %xmm10 movhpd -15 * SIZE(A1, LDA), %xmm10 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 movsd -16 * SIZE(A2), %xmm8 movhpd -15 * SIZE(A2), %xmm8 mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 movsd -16 * SIZE(A2, LDA), %xmm10 movhpd -15 * SIZE(A2, LDA), %xmm10 mulpd %xmm12, %xmm11 SUBPD %xmm11, %xmm3 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 mulpd %xmm12, %xmm9 SUBPD %xmm9, %xmm5 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 mulpd %xmm12, %xmm11 SUBPD %xmm11, %xmm7 ALIGN_3 .L109: pcmpeqb %xmm13, %xmm13 psllq $63, %xmm13 shufps $0xc0, %xmm13, %xmm13 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorpd %xmm13, %xmm0 xorpd %xmm13, %xmm2 xorpd %xmm13, %xmm4 xorpd %xmm13, %xmm6 #else xorpd %xmm13, %xmm1 xorpd %xmm13, %xmm3 xorpd %xmm13, %xmm5 xorpd %xmm13, %xmm7 #endif #ifdef HAVE_SSE3 haddpd %xmm1, %xmm0 haddpd %xmm3, %xmm2 haddpd %xmm5, %xmm4 haddpd %xmm7, %xmm6 #else movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm2, %xmm9 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm9 movapd %xmm4, %xmm10 unpcklpd %xmm5, %xmm4 unpckhpd %xmm5, %xmm10 movapd %xmm6, %xmm11 unpcklpd %xmm7, %xmm6 unpckhpd %xmm7, %xmm11 addpd %xmm8, %xmm0 addpd %xmm9, %xmm2 addpd %xmm10, %xmm4 addpd %xmm11, %xmm6 #endif pshufd $0x4e, %xmm0, %xmm1 pshufd $0x4e, %xmm2, %xmm3 pshufd $0x4e, %xmm4, %xmm5 pshufd $0x4e, %xmm6, %xmm7 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm1 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm3 mulpd ALPHA_R, %xmm4 mulpd ALPHA_I, %xmm5 mulpd ALPHA_R, %xmm6 mulpd ALPHA_I, %xmm7 xorpd %xmm13, %xmm1 xorpd %xmm13, %xmm3 xorpd %xmm13, %xmm5 xorpd %xmm13, %xmm7 subpd %xmm1, %xmm0 subpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movsd 0 * SIZE(Y), %xmm1 movhpd 1 * SIZE(Y), %xmm1 addq INCY, Y movsd 0 * SIZE(Y), %xmm3 movhpd 1 * SIZE(Y), %xmm3 addq INCY, Y movsd 0 * SIZE(Y), %xmm5 movhpd 1 * SIZE(Y), %xmm5 addq INCY, Y movsd 0 * SIZE(Y), %xmm7 movhpd 1 * SIZE(Y), %xmm7 addq INCY, Y addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm2, 0 * SIZE(Y1) movhpd %xmm2, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm4, 0 * SIZE(Y1) movhpd %xmm4, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm6, 0 * SIZE(Y1) movhpd %xmm6, 1 * SIZE(Y1) addq INCY, Y1 cmpq $4, N jge .L101 ALIGN_3 .L110: #endif #if GEMV_UNROLL >= 2 cmpq $2, N jl .L120 #if GEMV_UNROLL == 2 ALIGN_3 .L111: #endif subq $2, N leaq 16 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA), A2 leaq (A1, LDA, 2), A xorpd %xmm0, %xmm0 xorpd %xmm1, %xmm1 xorpd %xmm2, %xmm2 xorpd %xmm3, %xmm3 MOVUPS_XL1(-16 * SIZE, X1, %xmm4) MOVUPS_XL1(-14 * SIZE, X1, %xmm5) #ifdef PREFETCHW PREFETCHW 3 * SIZE(Y1) #endif movq M, I sarq $2, I jle .L115 movsd -16 * SIZE(A1), %xmm8 movhpd -15 * SIZE(A1), %xmm8 movsd -16 * SIZE(A2), %xmm10 movhpd -15 * SIZE(A2), %xmm10 movsd -14 * SIZE(A1), %xmm12 movhpd -13 * SIZE(A1), %xmm12 movsd -14 * SIZE(A2), %xmm6 movhpd -13 * SIZE(A2), %xmm6 decq I 
jle .L114 ALIGN_3 .L113: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movsd -12 * SIZE(A1), %xmm8 movhpd -11 * SIZE(A1), %xmm8 mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm4, %xmm10 addpd %xmm10, %xmm2 movsd -12 * SIZE(A2), %xmm10 movhpd -11 * SIZE(A2), %xmm10 mulpd %xmm4, %xmm11 SUBPD %xmm11, %xmm3 MOVUPS_XL1(-12 * SIZE, X1, %xmm4) pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm0 movsd -10 * SIZE(A1), %xmm12 movhpd -9 * SIZE(A1), %xmm12 mulpd %xmm5, %xmm13 SUBPD %xmm13, %xmm1 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm5, %xmm6 addpd %xmm6, %xmm2 movsd -10 * SIZE(A2), %xmm6 movhpd -9 * SIZE(A2), %xmm6 mulpd %xmm5, %xmm7 SUBPD %xmm7, %xmm3 MOVUPS_XL1(-10 * SIZE, X1, %xmm5) #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movsd -8 * SIZE(A1), %xmm8 movhpd -7 * SIZE(A1), %xmm8 mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm4, %xmm10 addpd %xmm10, %xmm2 movsd -8 * SIZE(A2), %xmm10 movhpd -7 * SIZE(A2), %xmm10 mulpd %xmm4, %xmm11 SUBPD %xmm11, %xmm3 MOVUPS_XL1( -8 * SIZE, X1, %xmm4) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) #endif pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm0 movsd -6 * SIZE(A1), %xmm12 movhpd -5 * SIZE(A1), %xmm12 mulpd %xmm5, %xmm13 SUBPD %xmm13, %xmm1 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm5, %xmm6 addpd %xmm6, %xmm2 movsd -6 * SIZE(A2), %xmm6 movhpd -5 * SIZE(A2), %xmm6 mulpd %xmm5, %xmm7 SUBPD %xmm7, %xmm3 MOVUPS_XL1( -6 * SIZE, X1, %xmm5) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, X1 subq $1, I BRANCH jg .L113 ALIGN_3 .L114: pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movsd -12 * SIZE(A1), %xmm8 movhpd -11 * SIZE(A1), %xmm8 mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm4, %xmm10 addpd %xmm10, %xmm2 movsd -12 * SIZE(A2), %xmm10 movhpd -11 * SIZE(A2), %xmm10 mulpd %xmm4, %xmm11 SUBPD %xmm11, %xmm3 MOVUPS_XL1(-12 * SIZE, X1, %xmm4) pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm0 movsd -10 * SIZE(A1), %xmm12 movhpd -9 * SIZE(A1), %xmm12 mulpd %xmm5, %xmm13 SUBPD %xmm13, %xmm1 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm5, %xmm6 addpd %xmm6, %xmm2 movsd -10 * SIZE(A2), %xmm6 movhpd -9 * SIZE(A2), %xmm6 mulpd %xmm5, %xmm7 SUBPD %xmm7, %xmm3 MOVUPS_XL1(-10 * SIZE, X1, %xmm5) pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm4, %xmm10 addpd %xmm10, %xmm2 mulpd %xmm4, %xmm11 SUBPD %xmm11, %xmm3 MOVUPS_XL1( -8 * SIZE, X1, %xmm4) pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm0 mulpd %xmm5, %xmm13 SUBPD %xmm13, %xmm1 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm5, %xmm6 addpd %xmm6, %xmm2 mulpd %xmm5, %xmm7 SUBPD %xmm7, %xmm3 MOVUPS_XL1( -6 * SIZE, X1, %xmm5) subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, X1 ALIGN_3 .L115: testq $2, M je .L117 movsd -16 * SIZE(A1), %xmm8 movhpd -15 * SIZE(A1), %xmm8 movsd -16 * SIZE(A2), %xmm10 movhpd -15 * SIZE(A2), %xmm10 movsd -14 * SIZE(A1), %xmm12 movhpd -13 * SIZE(A1), %xmm12 movsd -14 * SIZE(A2), %xmm6 movhpd -13 * SIZE(A2), %xmm6 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm4, %xmm10 addpd %xmm10, %xmm2 mulpd %xmm4, %xmm11 SUBPD %xmm11, %xmm3 MOVUPS_XL1(-12 
* SIZE, X1, %xmm4) pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm0 mulpd %xmm5, %xmm13 SUBPD %xmm13, %xmm1 pshufd $0x4e, %xmm6, %xmm7 mulpd %xmm5, %xmm6 addpd %xmm6, %xmm2 mulpd %xmm5, %xmm7 SUBPD %xmm7, %xmm3 addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L117: testq $1, M je .L119 movsd -16 * SIZE(A1), %xmm8 movhpd -15 * SIZE(A1), %xmm8 movsd -16 * SIZE(A2), %xmm10 movhpd -15 * SIZE(A2), %xmm10 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 pshufd $0x4e, %xmm10, %xmm11 mulpd %xmm4, %xmm10 addpd %xmm10, %xmm2 mulpd %xmm4, %xmm11 SUBPD %xmm11, %xmm3 ALIGN_3 .L119: pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 shufps $0xc0, %xmm11, %xmm11 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorpd %xmm11, %xmm0 xorpd %xmm11, %xmm2 #else xorpd %xmm11, %xmm1 xorpd %xmm11, %xmm3 #endif #ifdef HAVE_SSE3 haddpd %xmm1, %xmm0 haddpd %xmm3, %xmm2 #else movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 movapd %xmm2, %xmm9 unpcklpd %xmm3, %xmm2 unpckhpd %xmm3, %xmm9 addpd %xmm8, %xmm0 addpd %xmm9, %xmm2 #endif pshufd $0x4e, %xmm0, %xmm1 pshufd $0x4e, %xmm2, %xmm3 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm1 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm3 xorpd %xmm11, %xmm1 xorpd %xmm11, %xmm3 subpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movsd 0 * SIZE(Y), %xmm4 movhpd 1 * SIZE(Y), %xmm4 addq INCY, Y movsd 0 * SIZE(Y), %xmm5 movhpd 1 * SIZE(Y), %xmm5 addq INCY, Y addpd %xmm4, %xmm0 addpd %xmm5, %xmm2 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm2, 0 * SIZE(Y1) movhpd %xmm2, 1 * SIZE(Y1) addq INCY, Y1 #if GEMV_UNROLL == 2 cmpq $2, N jge .L111 #endif ALIGN_3 .L120: #endif cmpq $1, N jl .L999 #if GEMV_UNROLL == 1 .L121: decq N #endif leaq 16 * SIZE(BUFFER), X1 movq A, A1 #if GEMV_UNROLL == 1 addq LDA, A #endif xorpd %xmm0, %xmm0 xorpd %xmm1, %xmm1 MOVUPS_XL1(-16 * SIZE, X1, %xmm4) MOVUPS_XL1(-14 * SIZE, X1, %xmm5) movq M, I sarq $2, I jle .L125 movsd -16 * SIZE(A1), %xmm8 movhpd -15 * SIZE(A1), %xmm8 movsd -14 * SIZE(A1), %xmm12 movhpd -13 * SIZE(A1), %xmm12 decq I jle .L124 ALIGN_3 .L123: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movsd -12 * SIZE(A1), %xmm8 movhpd -11 * SIZE(A1), %xmm8 mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 MOVUPS_XL1(-12 * SIZE, X1, %xmm4) pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm0 movsd -10 * SIZE(A1), %xmm12 movhpd -9 * SIZE(A1), %xmm12 mulpd %xmm5, %xmm13 SUBPD %xmm13, %xmm1 MOVUPS_XL1(-10 * SIZE, X1, %xmm5) #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) #endif pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movsd -8 * SIZE(A1), %xmm8 movhpd -7 * SIZE(A1), %xmm8 mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 MOVUPS_XL1( -8 * SIZE, X1, %xmm4) pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm0 movsd -6 * SIZE(A1), %xmm12 movhpd -5 * SIZE(A1), %xmm12 mulpd %xmm5, %xmm13 SUBPD %xmm13, %xmm1 MOVUPS_XL1(-6 * SIZE, X1, %xmm5) subq $-8 * SIZE, A1 subq $-8 * SIZE, X1 subq $1, I BRANCH jg .L123 ALIGN_3 .L124: pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movsd -12 * SIZE(A1), %xmm8 movhpd -11 * SIZE(A1), %xmm8 mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 MOVUPS_XL1(-12 * SIZE, X1, %xmm4) pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm0 movsd -10 * SIZE(A1), %xmm12 movhpd -9 * SIZE(A1), %xmm12 mulpd %xmm5, %xmm13 SUBPD %xmm13, %xmm1 MOVUPS_XL1(-10 * SIZE, X1, %xmm5) pshufd 
$0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 MOVUPS_XL1( -8 * SIZE, X1, %xmm4) pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm0 mulpd %xmm5, %xmm13 SUBPD %xmm13, %xmm1 MOVUPS_XL1(-6 * SIZE, X1, %xmm5) subq $-8 * SIZE, A1 subq $-8 * SIZE, X1 ALIGN_3 .L125: testq $2, M je .L127 movsd -16 * SIZE(A1), %xmm8 movhpd -15 * SIZE(A1), %xmm8 movsd -14 * SIZE(A1), %xmm12 movhpd -13 * SIZE(A1), %xmm12 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 MOVUPS_XL1(-12 * SIZE, X1, %xmm4) pshufd $0x4e, %xmm12, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm0 mulpd %xmm5, %xmm13 SUBPD %xmm13, %xmm1 addq $4 * SIZE, A1 ALIGN_3 .L127: testq $1, M je .L129 movsd -16 * SIZE(A1), %xmm8 movhpd -15 * SIZE(A1), %xmm8 pshufd $0x4e, %xmm8, %xmm9 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm9 SUBPD %xmm9, %xmm1 ALIGN_3 .L129: pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 shufps $0xc0, %xmm11, %xmm11 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorpd %xmm11, %xmm0 #else xorpd %xmm11, %xmm1 #endif #ifdef HAVE_SSE3 haddpd %xmm1, %xmm0 #else movapd %xmm0, %xmm8 unpcklpd %xmm1, %xmm0 unpckhpd %xmm1, %xmm8 addpd %xmm8, %xmm0 #endif pshufd $0x4e, %xmm0, %xmm1 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm1 xorpd %xmm11, %xmm1 subpd %xmm1, %xmm0 movsd 0 * SIZE(Y), %xmm4 movhpd 1 * SIZE(Y), %xmm4 addpd %xmm4, %xmm0 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) #if GEMV_UNROLL == 1 addq INCY, Y addq INCY, Y1 cmpq $1, N jge .L121 #endif #endif ALIGN_3 .L999: movq M, I salq $ZBASE_SHIFT,I addq I,AA jmp .L0t .L999x: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemv_t_4.c000066400000000000000000000372321313527062700174330ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_t_microk_bulldozer-4.c" #elif defined(HASWELL) || defined(ZEN) #include "zgemv_t_microk_haswell-4.c" #endif #define NBMAX 1024 #ifndef HAVE_KERNEL_4x4 static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG i; FLOAT *a0,*a1,*a2,*a3; a0 = ap[0]; a1 = ap[1]; a2 = ap[2]; a3 = ap[3]; FLOAT alpha_r = alpha[0]; FLOAT alpha_i = alpha[1]; FLOAT temp_r0 = 0.0; FLOAT temp_r1 = 0.0; FLOAT temp_r2 = 0.0; FLOAT temp_r3 = 0.0; FLOAT temp_i0 = 0.0; FLOAT temp_i1 = 0.0; FLOAT temp_i2 = 0.0; FLOAT temp_i3 = 0.0; for ( i=0; i< 2*n; i+=2 ) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1]; temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i]; temp_r2 += a2[i]*x[i] - a2[i+1]*x[i+1]; temp_i2 += a2[i]*x[i+1] + a2[i+1]*x[i]; temp_r3 += a3[i]*x[i] - a3[i+1]*x[i+1]; temp_i3 += a3[i]*x[i+1] + a3[i+1]*x[i]; #else temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1]; temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i]; temp_r2 += a2[i]*x[i] + a2[i+1]*x[i+1]; temp_i2 += a2[i]*x[i+1] - a2[i+1]*x[i]; temp_r3 += a3[i]*x[i] + a3[i+1]*x[i+1]; temp_i3 += a3[i]*x[i+1] - a3[i+1]*x[i]; #endif } #if !defined(XCONJ) y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; #else y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; #endif } #endif #ifndef HAVE_KERNEL_4x2 static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG i; FLOAT *a0,*a1; a0 = ap[0]; a1 = ap[1]; FLOAT alpha_r = alpha[0]; FLOAT alpha_i = alpha[1]; FLOAT temp_r0 = 0.0; FLOAT temp_r1 = 0.0; FLOAT temp_i0 = 0.0; FLOAT temp_i1 = 0.0; for ( i=0; i< 2*n; i+=2 ) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1]; temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i]; #else temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1]; temp_i1 += a1[i]*x[i+1] - 
a1[i+1]*x[i]; #endif } #if !defined(XCONJ) y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; #else y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; #endif } #endif #ifndef HAVE_KERNEL_4x1 static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG i; FLOAT *a0; a0 = ap; FLOAT alpha_r = alpha[0]; FLOAT alpha_i = alpha[1]; FLOAT temp_r0 = 0.0; FLOAT temp_i0 = 0.0; for ( i=0; i< 2*n; i+=2 ) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1]; temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i]; #else temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1]; temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i]; #endif } #if !defined(XCONJ) y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; #else y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; #endif } #endif static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { BLASLONG i; for ( i=0; i> 2 ; n2 = n & 3 ; m3 = m & 3 ; m1 = m - m3; m2 = (m & (NBMAX-1)) - m3 ; alpha[0] = alpha_r; alpha[1] = alpha_i; BLASLONG NB = NBMAX; while ( NB == NBMAX ) { m1 -= NB; if ( m1 < 0) { if ( m2 == 0 ) break; NB = m2; } y_ptr = y; a_ptr = a; x_ptr = x; ap[0] = a_ptr; ap[1] = a_ptr + lda; ap[2] = ap[1] + lda; ap[3] = ap[2] + lda; if ( inc_x != 2 ) copy_x(NB,x_ptr,xbuffer,inc_x); else xbuffer = x_ptr; if ( inc_y == 2 ) { for( i = 0; i < n1 ; i++) { zgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); ap[0] += lda4; ap[1] += lda4; ap[2] += lda4; ap[3] += lda4; a_ptr += lda4; y_ptr += 8; } if ( n2 & 2 ) { zgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); a_ptr += lda * 2; y_ptr += 4; } if ( n2 & 1 ) { zgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); a_ptr += lda; y_ptr += 2; } } else { for( i = 0; i < n1 ; i++) { memset(ybuffer,0,64); zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); ap[0] += lda4; ap[1] += lda4; ap[2] += lda4; ap[3] += lda4; a_ptr += lda4; y_ptr[0] += ybuffer[0]; y_ptr[1] += ybuffer[1]; y_ptr += inc_y; y_ptr[0] += ybuffer[2]; y_ptr[1] += ybuffer[3]; y_ptr += inc_y; y_ptr[0] += ybuffer[4]; y_ptr[1] += ybuffer[5]; y_ptr += inc_y; y_ptr[0] += ybuffer[6]; y_ptr[1] += ybuffer[7]; y_ptr += inc_y; } for( i = 0; i < n2 ; i++) { memset(ybuffer,0,64); zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); a_ptr += lda; y_ptr[0] += ybuffer[0]; y_ptr[1] += ybuffer[1]; y_ptr += inc_y; } } a += 2 * NB; x += NB * inc_x; } if ( m3 == 0 ) return(0); x_ptr = x; j=0; a_ptr = a; y_ptr = y; if ( m3 == 3 ) { FLOAT temp_r ; FLOAT temp_i ; FLOAT x0 = x_ptr[0]; FLOAT x1 = x_ptr[1]; x_ptr += inc_x; FLOAT x2 = x_ptr[0]; FLOAT x3 = x_ptr[1]; x_ptr += inc_x; FLOAT x4 = x_ptr[0]; FLOAT x5 = x_ptr[1]; while ( j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; #else temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; temp_i += a_ptr[4] * 
x5 - a_ptr[5] * x4; #endif #if !defined(XCONJ) y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif a_ptr += lda; y_ptr += inc_y; j++; } return(0); } if ( m3 == 2 ) { FLOAT temp_r ; FLOAT temp_i ; FLOAT temp_r1 ; FLOAT temp_i1 ; FLOAT x0 = x_ptr[0]; FLOAT x1 = x_ptr[1]; x_ptr += inc_x; FLOAT x2 = x_ptr[0]; FLOAT x3 = x_ptr[1]; FLOAT ar = alpha[0]; FLOAT ai = alpha[1]; while ( j < ( n & -2 )) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; a_ptr += lda; temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; #else temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; a_ptr += lda; temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; #endif #if !defined(XCONJ) y_ptr[0] += ar * temp_r - ai * temp_i; y_ptr[1] += ar * temp_i + ai * temp_r; y_ptr += inc_y; y_ptr[0] += ar * temp_r1 - ai * temp_i1; y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else y_ptr[0] += ar * temp_r + ai * temp_i; y_ptr[1] -= ar * temp_i - ai * temp_r; y_ptr += inc_y; y_ptr[0] += ar * temp_r1 + ai * temp_i1; y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif a_ptr += lda; y_ptr += inc_y; j+=2; } while ( j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; #else temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; #endif #if !defined(XCONJ) y_ptr[0] += ar * temp_r - ai * temp_i; y_ptr[1] += ar * temp_i + ai * temp_r; #else y_ptr[0] += ar * temp_r + ai * temp_i; y_ptr[1] -= ar * temp_i - ai * temp_r; #endif a_ptr += lda; y_ptr += inc_y; j++; } return(0); } if ( m3 == 1 ) { FLOAT temp_r ; FLOAT temp_i ; FLOAT temp_r1 ; FLOAT temp_i1 ; FLOAT x0 = x_ptr[0]; FLOAT x1 = x_ptr[1]; FLOAT ar = alpha[0]; FLOAT ai = alpha[1]; while ( j < ( n & -2 )) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; a_ptr += lda; temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; #else temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; a_ptr += lda; temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; #endif #if !defined(XCONJ) y_ptr[0] += ar * temp_r - ai * temp_i; y_ptr[1] += ar * temp_i + ai * temp_r; y_ptr += inc_y; y_ptr[0] += ar * temp_r1 - ai * temp_i1; y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else y_ptr[0] += ar * temp_r + ai * temp_i; y_ptr[1] -= ar * temp_i - ai * temp_r; y_ptr += inc_y; y_ptr[0] += ar * temp_r1 + ai * temp_i1; y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif a_ptr += lda; y_ptr += inc_y; j+=2; } while ( j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( 
defined(CONJ) && defined(XCONJ) ) temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; #else temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; #endif #if !defined(XCONJ) y_ptr[0] += ar * temp_r - ai * temp_i; y_ptr[1] += ar * temp_i + ai * temp_r; #else y_ptr[0] += ar * temp_r + ai * temp_i; y_ptr[1] -= ar * temp_i - ai * temp_r; #endif a_ptr += lda; y_ptr += inc_y; j++; } return(0); } return(0); } OpenBLAS-0.2.20/kernel/x86_64/zgemv_t_atom.S000066400000000000000000000455131313527062700202110ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "l2param.h" #ifdef ATOM #define PREFETCH prefetchnta #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 6) #endif #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_INCX 8 + STACKSIZE(%rsp) #define OLD_Y 16 + STACKSIZE(%rsp) #define OLD_INCY 24 + STACKSIZE(%rsp) #define OLD_BUFFER 32 + STACKSIZE(%rsp) #define M %rdi #define N %rsi #define A %rcx #define LDA %r8 #define X %r9 #define INCX %rdx #define Y %rbp #define INCY %r10 #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_LDA 56 + STACKSIZE(%rsp) #define OLD_X 64 + STACKSIZE(%rsp) #define OLD_INCX 72 + STACKSIZE(%rsp) #define OLD_Y 80 + STACKSIZE(%rsp) #define OLD_INCY 88 + STACKSIZE(%rsp) #define OLD_BUFFER 96 + STACKSIZE(%rsp) #define M %rcx #define N %rdx #define A %r8 #define LDA %r9 #define X %rdi #define INCX %rsi #define Y %rbp #define INCY %r10 #endif #define I %rax #define J %rbx #define A1 %r11 #define A2 %r12 #define X1 %r13 #define Y1 %r14 #define BUFFER %r15 #define ALPHA_R %xmm14 #define ALPHA_I %xmm15 #if !defined(CONJ) && !defined(XCONJ) #define ADD1 addsd #define ADD2 addsd #define ADD3 subsd #define ADD4 addsd #endif #if defined(CONJ) && !defined(XCONJ) #define ADD1 addsd #define ADD2 addsd #define ADD3 addsd #define ADD4 subsd #endif #if !defined(CONJ) && defined(XCONJ) #define ADD1 addsd #define ADD2 subsd #define ADD3 addsd #define ADD4 addsd #endif #if defined(CONJ) && defined(XCONJ) #define ADD1 addsd #define ADD2 subsd #define ADD3 subsd #define ADD4 subsd #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq OLD_A, A movq OLD_LDA, LDA movq OLD_X, X movaps %xmm3, %xmm0 movss OLD_ALPHA_I, %xmm1 #endif movq OLD_INCX, INCX movq OLD_Y, Y movq OLD_INCY, INCY movq OLD_BUFFER, BUFFER salq $ZBASE_SHIFT, LDA salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY movaps %xmm0, ALPHA_R movaps %xmm1, ALPHA_I subq $-16 * SIZE, A testq M, M jle .L999 testq N, N jle .L999 ALIGN_3 movq BUFFER, X1 movq Y, Y1 movq M, I sarq $2, I jle .L05 ALIGN_4 .L02: movsd 0 * SIZE(X), %xmm0 movhpd 1 * SIZE(X), %xmm0 addq INCX, X movsd 0 * SIZE(X), %xmm1 movhpd 1 * SIZE(X), %xmm1 addq INCX, X movsd 0 * SIZE(X), %xmm2 movhpd 1 * SIZE(X), %xmm2 addq INCX, X movsd 0 * SIZE(X), %xmm3 movhpd 1 * SIZE(X), %xmm3 addq INCX, X movapd %xmm0, 0 * SIZE(X1) movapd %xmm1, 2 * SIZE(X1) movapd %xmm2, 4 * SIZE(X1) movapd %xmm3, 6 * SIZE(X1) addq $8 * SIZE, X1 decq I jg .L02 ALIGN_4 .L05: movq M, I andq $3, I jle .L10 ALIGN_2 .L06: movsd 0 * SIZE(X), %xmm0 movhpd 1 * SIZE(X), %xmm0 addq INCX, X movapd %xmm0, 0 * SIZE(X1) addq $2 * SIZE, X1 decq I jg .L06 ALIGN_4 .L10: movq N, J sarq $1, J jle .L20 ALIGN_3 .L11: leaq 16 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA), A2 leaq (A1, LDA, 2), A xorpd %xmm0, %xmm0 xorpd %xmm1, %xmm1 xorpd %xmm2, %xmm2 xorpd %xmm3, %xmm3 movsd -16 * SIZE(X1), %xmm4 movsd -15 * SIZE(X1), %xmm5 movsd -14 * SIZE(X1), %xmm6 movsd -13 * SIZE(X1), %xmm7 #ifdef PREFETCHW PREFETCHW 3 * SIZE(Y1) #endif movq M, I sarq $2, I jle .L15 movsd -16 * SIZE(A1), 
%xmm8 movsd -15 * SIZE(A1), %xmm9 movsd -16 * SIZE(A2), %xmm10 movsd -15 * SIZE(A2), %xmm11 movapd %xmm8, %xmm12 mulsd %xmm4, %xmm8 mulsd %xmm5, %xmm12 decq I jle .L14 ALIGN_3 .L13: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) #endif movapd %xmm9, %xmm13 mulsd %xmm5, %xmm9 ADD1 %xmm8, %xmm0 movsd -14 * SIZE(A1), %xmm8 mulsd %xmm4, %xmm13 ADD2 %xmm12, %xmm1 movapd %xmm10, %xmm12 mulsd %xmm4, %xmm10 ADD3 %xmm9, %xmm0 movsd -13 * SIZE(A1), %xmm9 mulsd %xmm5, %xmm12 ADD4 %xmm13, %xmm1 movapd %xmm11, %xmm13 mulsd %xmm5, %xmm11 movsd -11 * SIZE(X1), %xmm5 ADD1 %xmm10, %xmm2 movsd -14 * SIZE(A2), %xmm10 mulsd %xmm4, %xmm13 movsd -12 * SIZE(X1), %xmm4 ADD2 %xmm12, %xmm3 movapd %xmm8, %xmm12 mulsd %xmm6, %xmm8 ADD3 %xmm11, %xmm2 movsd -13 * SIZE(A2), %xmm11 mulsd %xmm7, %xmm12 ADD4 %xmm13, %xmm3 movapd %xmm9, %xmm13 mulsd %xmm7, %xmm9 ADD1 %xmm8, %xmm0 movsd -12 * SIZE(A1), %xmm8 mulsd %xmm6, %xmm13 ADD2 %xmm12, %xmm1 movapd %xmm10, %xmm12 mulsd %xmm6, %xmm10 ADD3 %xmm9, %xmm0 movsd -11 * SIZE(A1), %xmm9 mulsd %xmm7, %xmm12 ADD4 %xmm13, %xmm1 movapd %xmm11, %xmm13 mulsd %xmm7, %xmm11 movsd -9 * SIZE(X1), %xmm7 ADD1 %xmm10, %xmm2 movsd -12 * SIZE(A2), %xmm10 mulsd %xmm6, %xmm13 movsd -10 * SIZE(X1), %xmm6 ADD2 %xmm12, %xmm3 movapd %xmm8, %xmm12 mulsd %xmm4, %xmm8 ADD3 %xmm11, %xmm2 movsd -11 * SIZE(A2), %xmm11 mulsd %xmm5, %xmm12 ADD4 %xmm13, %xmm3 #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) * SIZE(A2) #endif movapd %xmm9, %xmm13 mulsd %xmm5, %xmm9 ADD1 %xmm8, %xmm0 movsd -10 * SIZE(A1), %xmm8 mulsd %xmm4, %xmm13 ADD2 %xmm12, %xmm1 movapd %xmm10, %xmm12 mulsd %xmm4, %xmm10 ADD3 %xmm9, %xmm0 movsd -9 * SIZE(A1), %xmm9 mulsd %xmm5, %xmm12 ADD4 %xmm13, %xmm1 movapd %xmm11, %xmm13 mulsd %xmm5, %xmm11 movsd -7 * SIZE(X1), %xmm5 ADD1 %xmm10, %xmm2 movsd -10 * SIZE(A2), %xmm10 mulsd %xmm4, %xmm13 movsd -8 * SIZE(X1), %xmm4 ADD2 %xmm12, %xmm3 movapd %xmm8, %xmm12 mulsd %xmm6, %xmm8 ADD3 %xmm11, %xmm2 movsd -9 * SIZE(A2), %xmm11 mulsd %xmm7, %xmm12 ADD4 %xmm13, %xmm3 movapd %xmm9, %xmm13 mulsd %xmm7, %xmm9 ADD1 %xmm8, %xmm0 movsd -8 * SIZE(A1), %xmm8 mulsd %xmm6, %xmm13 ADD2 %xmm12, %xmm1 movapd %xmm10, %xmm12 mulsd %xmm6, %xmm10 ADD3 %xmm9, %xmm0 movsd -7 * SIZE(A1), %xmm9 mulsd %xmm7, %xmm12 ADD4 %xmm13, %xmm1 movapd %xmm11, %xmm13 mulsd %xmm7, %xmm11 movsd -5 * SIZE(X1), %xmm7 ADD1 %xmm10, %xmm2 movsd -8 * SIZE(A2), %xmm10 mulsd %xmm6, %xmm13 movsd -6 * SIZE(X1), %xmm6 ADD2 %xmm12, %xmm3 movapd %xmm8, %xmm12 subq $-8 * SIZE, A1 mulsd %xmm4, %xmm8 subq $-8 * SIZE, X1 ADD3 %xmm11, %xmm2 movsd -7 * SIZE(A2), %xmm11 mulsd %xmm5, %xmm12 subq $-8 * SIZE, A2 ADD4 %xmm13, %xmm3 subq $1, I BRANCH jg .L13 ALIGN_3 .L14: movapd %xmm9, %xmm13 mulsd %xmm5, %xmm9 ADD1 %xmm8, %xmm0 movsd -14 * SIZE(A1), %xmm8 mulsd %xmm4, %xmm13 ADD2 %xmm12, %xmm1 movapd %xmm10, %xmm12 mulsd %xmm4, %xmm10 ADD3 %xmm9, %xmm0 movsd -13 * SIZE(A1), %xmm9 mulsd %xmm5, %xmm12 ADD4 %xmm13, %xmm1 movapd %xmm11, %xmm13 mulsd %xmm5, %xmm11 movsd -11 * SIZE(X1), %xmm5 ADD1 %xmm10, %xmm2 movsd -14 * SIZE(A2), %xmm10 mulsd %xmm4, %xmm13 movsd -12 * SIZE(X1), %xmm4 ADD2 %xmm12, %xmm3 movapd %xmm8, %xmm12 mulsd %xmm6, %xmm8 ADD3 %xmm11, %xmm2 movsd -13 * SIZE(A2), %xmm11 mulsd %xmm7, %xmm12 ADD4 %xmm13, %xmm3 movapd %xmm9, %xmm13 mulsd %xmm7, %xmm9 ADD1 %xmm8, %xmm0 movsd -12 * SIZE(A1), %xmm8 mulsd %xmm6, %xmm13 ADD2 %xmm12, %xmm1 movapd %xmm10, %xmm12 mulsd %xmm6, %xmm10 ADD3 %xmm9, %xmm0 movsd -11 * SIZE(A1), %xmm9 mulsd %xmm7, %xmm12 ADD4 %xmm13, %xmm1 movapd %xmm11, %xmm13 mulsd %xmm7, %xmm11 movsd -9 * SIZE(X1), %xmm7 ADD1 %xmm10, %xmm2 
movsd -12 * SIZE(A2), %xmm10 mulsd %xmm6, %xmm13 movsd -10 * SIZE(X1), %xmm6 ADD2 %xmm12, %xmm3 movapd %xmm8, %xmm12 mulsd %xmm4, %xmm8 ADD3 %xmm11, %xmm2 movsd -11 * SIZE(A2), %xmm11 mulsd %xmm5, %xmm12 ADD4 %xmm13, %xmm3 movapd %xmm9, %xmm13 mulsd %xmm5, %xmm9 ADD1 %xmm8, %xmm0 movsd -10 * SIZE(A1), %xmm8 mulsd %xmm4, %xmm13 ADD2 %xmm12, %xmm1 movapd %xmm10, %xmm12 mulsd %xmm4, %xmm10 ADD3 %xmm9, %xmm0 movsd -9 * SIZE(A1), %xmm9 mulsd %xmm5, %xmm12 ADD4 %xmm13, %xmm1 movapd %xmm11, %xmm13 mulsd %xmm5, %xmm11 movsd -7 * SIZE(X1), %xmm5 ADD1 %xmm10, %xmm2 movsd -10 * SIZE(A2), %xmm10 mulsd %xmm4, %xmm13 movsd -8 * SIZE(X1), %xmm4 ADD2 %xmm12, %xmm3 movapd %xmm8, %xmm12 mulsd %xmm6, %xmm8 ADD3 %xmm11, %xmm2 movsd -9 * SIZE(A2), %xmm11 mulsd %xmm7, %xmm12 ADD4 %xmm13, %xmm3 movapd %xmm9, %xmm13 mulsd %xmm7, %xmm9 ADD1 %xmm8, %xmm0 mulsd %xmm6, %xmm13 ADD2 %xmm12, %xmm1 movapd %xmm10, %xmm12 mulsd %xmm6, %xmm10 ADD3 %xmm9, %xmm0 mulsd %xmm7, %xmm12 ADD4 %xmm13, %xmm1 movapd %xmm11, %xmm13 mulsd %xmm7, %xmm11 movsd -5 * SIZE(X1), %xmm7 ADD1 %xmm10, %xmm2 mulsd %xmm6, %xmm13 movsd -6 * SIZE(X1), %xmm6 ADD2 %xmm12, %xmm3 ADD3 %xmm11, %xmm2 ADD4 %xmm13, %xmm3 subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, X1 ALIGN_3 .L15: testq $2, M je .L17 movsd -16 * SIZE(A1), %xmm8 movsd -15 * SIZE(A1), %xmm9 movsd -16 * SIZE(A2), %xmm10 movsd -15 * SIZE(A2), %xmm11 movapd %xmm8, %xmm12 mulsd %xmm4, %xmm8 mulsd %xmm5, %xmm12 movapd %xmm9, %xmm13 mulsd %xmm5, %xmm9 ADD1 %xmm8, %xmm0 movsd -14 * SIZE(A1), %xmm8 mulsd %xmm4, %xmm13 ADD2 %xmm12, %xmm1 movapd %xmm10, %xmm12 mulsd %xmm4, %xmm10 ADD3 %xmm9, %xmm0 movsd -13 * SIZE(A1), %xmm9 mulsd %xmm5, %xmm12 ADD4 %xmm13, %xmm1 movapd %xmm11, %xmm13 mulsd %xmm5, %xmm11 movsd -11 * SIZE(X1), %xmm5 ADD1 %xmm10, %xmm2 movsd -14 * SIZE(A2), %xmm10 mulsd %xmm4, %xmm13 movsd -12 * SIZE(X1), %xmm4 ADD2 %xmm12, %xmm3 movapd %xmm8, %xmm12 mulsd %xmm6, %xmm8 ADD3 %xmm11, %xmm2 movsd -13 * SIZE(A2), %xmm11 mulsd %xmm7, %xmm12 ADD4 %xmm13, %xmm3 movapd %xmm9, %xmm13 mulsd %xmm7, %xmm9 ADD1 %xmm8, %xmm0 mulsd %xmm6, %xmm13 ADD2 %xmm12, %xmm1 movapd %xmm10, %xmm12 mulsd %xmm6, %xmm10 ADD3 %xmm9, %xmm0 mulsd %xmm7, %xmm12 ADD4 %xmm13, %xmm1 movapd %xmm11, %xmm13 mulsd %xmm7, %xmm11 ADD1 %xmm10, %xmm2 mulsd %xmm6, %xmm13 ADD2 %xmm12, %xmm3 ADD3 %xmm11, %xmm2 ADD4 %xmm13, %xmm3 addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L17: testq $1, M je .L19 movsd -16 * SIZE(A1), %xmm8 movsd -15 * SIZE(A1), %xmm9 movsd -16 * SIZE(A2), %xmm10 movsd -15 * SIZE(A2), %xmm11 movapd %xmm8, %xmm12 mulsd %xmm4, %xmm8 mulsd %xmm5, %xmm12 movapd %xmm9, %xmm13 mulsd %xmm5, %xmm9 ADD1 %xmm8, %xmm0 mulsd %xmm4, %xmm13 ADD2 %xmm12, %xmm1 movapd %xmm10, %xmm12 mulsd %xmm4, %xmm10 ADD3 %xmm9, %xmm0 mulsd %xmm5, %xmm12 ADD4 %xmm13, %xmm1 movapd %xmm11, %xmm13 mulsd %xmm5, %xmm11 ADD1 %xmm10, %xmm2 mulsd %xmm4, %xmm13 ADD2 %xmm12, %xmm3 ADD3 %xmm11, %xmm2 ADD4 %xmm13, %xmm3 ALIGN_3 .L19: movsd 0 * SIZE(Y), %xmm4 movapd %xmm0, %xmm10 mulsd ALPHA_R, %xmm0 movsd 1 * SIZE(Y), %xmm5 movapd %xmm1, %xmm11 mulsd ALPHA_R, %xmm1 addq INCY, Y movsd 0 * SIZE(Y), %xmm6 movapd %xmm2, %xmm12 mulsd ALPHA_R, %xmm2 movsd 1 * SIZE(Y), %xmm7 movapd %xmm3, %xmm13 mulsd ALPHA_R, %xmm3 addq INCY, Y mulsd ALPHA_I, %xmm10 mulsd ALPHA_I, %xmm11 mulsd ALPHA_I, %xmm12 mulsd ALPHA_I, %xmm13 addsd %xmm10, %xmm1 subsd %xmm11, %xmm0 addsd %xmm12, %xmm3 subsd %xmm13, %xmm2 addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 addsd %xmm6, %xmm2 addsd %xmm7, %xmm3 movlpd %xmm0, 0 * SIZE(Y1) movlpd %xmm1, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm2, 0 * 
SIZE(Y1) movlpd %xmm3, 1 * SIZE(Y1) addq INCY, Y1 decq J jg .L11 ALIGN_3 .L20: testq $1, N jle .L999 leaq 16 * SIZE(BUFFER), X1 movq A, A1 xorpd %xmm0, %xmm0 xorpd %xmm1, %xmm1 movsd -16 * SIZE(X1), %xmm4 movsd -15 * SIZE(X1), %xmm5 movsd -14 * SIZE(X1), %xmm6 movsd -13 * SIZE(X1), %xmm7 movq M, I sarq $2, I jle .L25 movsd -16 * SIZE(A1), %xmm8 movsd -15 * SIZE(A1), %xmm9 movapd %xmm8, %xmm12 mulsd %xmm4, %xmm8 mulsd %xmm5, %xmm12 decq I jle .L24 ALIGN_3 .L23: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) #endif movapd %xmm9, %xmm13 mulsd %xmm5, %xmm9 movsd -11 * SIZE(X1), %xmm5 ADD1 %xmm8, %xmm0 movsd -14 * SIZE(A1), %xmm8 mulsd %xmm4, %xmm13 movsd -12 * SIZE(X1), %xmm4 ADD2 %xmm12, %xmm1 movapd %xmm8, %xmm12 mulsd %xmm6, %xmm8 ADD3 %xmm9, %xmm0 movsd -13 * SIZE(A1), %xmm9 mulsd %xmm7, %xmm12 ADD4 %xmm13, %xmm1 movapd %xmm9, %xmm13 mulsd %xmm7, %xmm9 movsd -9 * SIZE(X1), %xmm7 ADD1 %xmm8, %xmm0 movsd -12 * SIZE(A1), %xmm8 mulsd %xmm6, %xmm13 movsd -10 * SIZE(X1), %xmm6 ADD2 %xmm12, %xmm1 movapd %xmm8, %xmm12 mulsd %xmm4, %xmm8 ADD3 %xmm9, %xmm0 movsd -11 * SIZE(A1), %xmm9 mulsd %xmm5, %xmm12 ADD4 %xmm13, %xmm1 movapd %xmm9, %xmm13 mulsd %xmm5, %xmm9 movsd -7 * SIZE(X1), %xmm5 ADD1 %xmm8, %xmm0 movsd -10 * SIZE(A1), %xmm8 mulsd %xmm4, %xmm13 movsd -8 * SIZE(X1), %xmm4 ADD2 %xmm12, %xmm1 movapd %xmm8, %xmm12 mulsd %xmm6, %xmm8 ADD3 %xmm9, %xmm0 movsd -9 * SIZE(A1), %xmm9 mulsd %xmm7, %xmm12 ADD4 %xmm13, %xmm1 movapd %xmm9, %xmm13 mulsd %xmm7, %xmm9 movsd -5 * SIZE(X1), %xmm7 ADD1 %xmm8, %xmm0 movsd -8 * SIZE(A1), %xmm8 mulsd %xmm6, %xmm13 movsd -6 * SIZE(X1), %xmm6 ADD2 %xmm12, %xmm1 movapd %xmm8, %xmm12 mulsd %xmm4, %xmm8 ADD3 %xmm9, %xmm0 mulsd %xmm5, %xmm12 movsd -7 * SIZE(A1), %xmm9 ADD4 %xmm13, %xmm1 subq $-8 * SIZE, A1 subq $-8 * SIZE, X1 subq $-8 * SIZE, A2 subq $1, I BRANCH jg .L23 ALIGN_3 .L24: movapd %xmm9, %xmm13 mulsd %xmm5, %xmm9 movsd -11 * SIZE(X1), %xmm5 ADD1 %xmm8, %xmm0 movsd -14 * SIZE(A1), %xmm8 mulsd %xmm4, %xmm13 movsd -12 * SIZE(X1), %xmm4 ADD2 %xmm12, %xmm1 movapd %xmm8, %xmm12 mulsd %xmm6, %xmm8 ADD3 %xmm9, %xmm0 movsd -13 * SIZE(A1), %xmm9 mulsd %xmm7, %xmm12 ADD4 %xmm13, %xmm1 movapd %xmm9, %xmm13 mulsd %xmm7, %xmm9 movsd -9 * SIZE(X1), %xmm7 ADD1 %xmm8, %xmm0 movsd -12 * SIZE(A1), %xmm8 mulsd %xmm6, %xmm13 movsd -10 * SIZE(X1), %xmm6 ADD2 %xmm12, %xmm1 movapd %xmm8, %xmm12 mulsd %xmm4, %xmm8 ADD3 %xmm9, %xmm0 movsd -11 * SIZE(A1), %xmm9 mulsd %xmm5, %xmm12 ADD4 %xmm13, %xmm1 movapd %xmm9, %xmm13 mulsd %xmm5, %xmm9 movsd -7 * SIZE(X1), %xmm5 ADD1 %xmm8, %xmm0 movsd -10 * SIZE(A1), %xmm8 mulsd %xmm4, %xmm13 movsd -8 * SIZE(X1), %xmm4 ADD2 %xmm12, %xmm1 movapd %xmm8, %xmm12 mulsd %xmm6, %xmm8 ADD3 %xmm9, %xmm0 movsd -9 * SIZE(A1), %xmm9 mulsd %xmm7, %xmm12 ADD4 %xmm13, %xmm1 movapd %xmm9, %xmm13 mulsd %xmm7, %xmm9 movsd -5 * SIZE(X1), %xmm7 ADD1 %xmm8, %xmm0 mulsd %xmm6, %xmm13 movsd -6 * SIZE(X1), %xmm6 ADD2 %xmm12, %xmm1 ADD3 %xmm9, %xmm0 ADD4 %xmm13, %xmm1 subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, X1 ALIGN_3 .L25: testq $2, M je .L27 movsd -16 * SIZE(A1), %xmm8 movsd -15 * SIZE(A1), %xmm9 movapd %xmm8, %xmm12 mulsd %xmm4, %xmm8 mulsd %xmm5, %xmm12 movapd %xmm9, %xmm13 mulsd %xmm5, %xmm9 movsd -11 * SIZE(X1), %xmm5 ADD1 %xmm8, %xmm0 movsd -14 * SIZE(A1), %xmm8 mulsd %xmm4, %xmm13 movsd -12 * SIZE(X1), %xmm4 ADD2 %xmm12, %xmm1 movapd %xmm8, %xmm12 mulsd %xmm6, %xmm8 ADD3 %xmm9, %xmm0 movsd -13 * SIZE(A1), %xmm9 mulsd %xmm7, %xmm12 ADD4 %xmm13, %xmm1 movapd %xmm9, %xmm13 mulsd %xmm7, %xmm9 ADD1 %xmm8, %xmm0 mulsd %xmm6, %xmm13 ADD2 %xmm12, 
%xmm1 ADD3 %xmm9, %xmm0 ADD4 %xmm13, %xmm1 addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L27: testq $1, M je .L29 movsd -16 * SIZE(A1), %xmm8 movsd -15 * SIZE(A1), %xmm9 movapd %xmm8, %xmm12 mulsd %xmm4, %xmm8 mulsd %xmm5, %xmm12 movapd %xmm9, %xmm13 mulsd %xmm5, %xmm9 ADD1 %xmm8, %xmm0 mulsd %xmm4, %xmm13 ADD2 %xmm12, %xmm1 ADD3 %xmm9, %xmm0 ADD4 %xmm13, %xmm1 ALIGN_3 .L29: movsd 0 * SIZE(Y), %xmm4 movapd %xmm0, %xmm10 mulsd ALPHA_R, %xmm0 movsd 1 * SIZE(Y), %xmm5 movapd %xmm1, %xmm11 mulsd ALPHA_R, %xmm1 mulsd ALPHA_I, %xmm10 mulsd ALPHA_I, %xmm11 addsd %xmm10, %xmm1 subsd %xmm11, %xmm0 addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 movlpd %xmm0, 0 * SIZE(Y1) movlpd %xmm1, 1 * SIZE(Y1) ALIGN_3 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zgemv_t_dup.S000066400000000000000000000611071313527062700200360ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #include "l2param.h" #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_INCX 8 + STACKSIZE(%rsp) #define OLD_Y 16 + STACKSIZE(%rsp) #define OLD_INCY 24 + STACKSIZE(%rsp) #define OLD_BUFFER 32 + STACKSIZE(%rsp) #define M %rdi #define N %rsi #define A %rcx #define LDA %r8 #define X %r9 #define INCX %rdx #define Y %rbp #define INCY %r10 #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_LDA 56 + STACKSIZE(%rsp) #define OLD_X 64 + STACKSIZE(%rsp) #define OLD_INCX 72 + STACKSIZE(%rsp) #define OLD_Y 80 + STACKSIZE(%rsp) #define OLD_INCY 88 + STACKSIZE(%rsp) #define OLD_BUFFER 96 + STACKSIZE(%rsp) #define M %rcx #define N %rdx #define A %r8 #define LDA %r9 #define X %rdi #define INCX %rsi #define Y %rbp #define INCY %r10 #endif #define I %rax #define J %rbx #define A1 %r11 #define A2 %r12 #define X1 %r13 #define Y1 %r14 #define BUFFER %r15 #define ALPHA_R %xmm14 #define ALPHA_I %xmm15 #undef SUBPD #ifndef CONJ #define SUBPD addpd #else #define SUBPD subpd #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq OLD_A, A movq OLD_LDA, LDA movq OLD_X, X movaps %xmm3, %xmm0 movss OLD_ALPHA_I, %xmm1 #endif movq OLD_INCX, INCX movq OLD_Y, Y movq OLD_INCY, INCY movq OLD_BUFFER, BUFFER salq $ZBASE_SHIFT, LDA salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 shufps $0x04, %xmm5, %xmm5 unpcklpd %xmm1, %xmm0 movaps %xmm0, ALPHA_R pshufd $0x4e, %xmm0, ALPHA_I xorps %xmm5, ALPHA_I subq $-16 * SIZE, A testq M, M jle .L999 testq N, N jle .L999 ALIGN_3 movq BUFFER, X1 movq Y, Y1 movq M, I sarq $2, I jle .L05 ALIGN_4 .L02: movsd 0 * SIZE(X), %xmm0 movhpd 1 * SIZE(X), %xmm0 addq INCX, X movsd 0 * SIZE(X), %xmm1 movhpd 1 * SIZE(X), %xmm1 addq INCX, X movsd 0 * SIZE(X), %xmm2 movhpd 1 * SIZE(X), %xmm2 addq INCX, X movsd 0 * SIZE(X), %xmm3 movhpd 1 * SIZE(X), %xmm3 addq INCX, X movapd %xmm0, 0 * SIZE(X1) movapd %xmm1, 2 * SIZE(X1) movapd %xmm2, 4 * SIZE(X1) movapd %xmm3, 6 * SIZE(X1) addq $8 * SIZE, X1 decq I jg .L02 ALIGN_4 .L05: movq M, I andq $3, I jle .L10 ALIGN_2 .L06: movsd 0 * SIZE(X), %xmm0 movhpd 1 * SIZE(X), %xmm0 addq INCX, X movapd %xmm0, 0 * SIZE(X1) addq $2 * SIZE, X1 decq I jg .L06 ALIGN_4 .L10: #if GEMV_UNROLL >= 4 cmpq $4, N jl .L20 ALIGN_3 .L11: subq $4, N leaq 16 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA, 2), A2 leaq (A1, LDA, 4), A MOVUPS_XL1(-16 * SIZE, X1, %xmm12) xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 MOVUPS_XL1(-14 * SIZE, X1, %xmm13) xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 #ifdef PREFETCHW PREFETCHW 3 * SIZE(Y1) #endif movq M, I sarq $2, I jle .L15 movddup -16 * SIZE(A1), %xmm8 movddup -15 * SIZE(A1), %xmm9 movddup -16 * SIZE(A1, LDA), %xmm10 movddup -15 * SIZE(A1, LDA), %xmm11 decq I jle .L14 ALIGN_3 .L13: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) #endif mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 movddup -16 * SIZE(A2), %xmm8 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 movddup -15 * 
SIZE(A2), %xmm9 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 movddup -16 * SIZE(A2, LDA), %xmm10 mulpd %xmm12, %xmm11 addpd %xmm11, %xmm3 movddup -15 * SIZE(A2, LDA), %xmm11 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 movddup -14 * SIZE(A1), %xmm8 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm5 movddup -13 * SIZE(A1), %xmm9 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 movddup -14 * SIZE(A1, LDA), %xmm10 mulpd %xmm12, %xmm11 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) addpd %xmm11, %xmm7 movddup -13 * SIZE(A1, LDA), %xmm11 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) #endif mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 movddup -14 * SIZE(A2), %xmm8 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm1 movddup -13 * SIZE(A2), %xmm9 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 movddup -14 * SIZE(A2, LDA), %xmm10 mulpd %xmm13, %xmm11 addpd %xmm11, %xmm3 movddup -13 * SIZE(A2, LDA), %xmm11 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm4 movddup -12 * SIZE(A1), %xmm8 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm5 movddup -11 * SIZE(A1), %xmm9 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm6 movddup -12 * SIZE(A1, LDA), %xmm10 mulpd %xmm13, %xmm11 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) addpd %xmm11, %xmm7 movddup -11 * SIZE(A1, LDA), %xmm11 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) #endif mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 movddup -12 * SIZE(A2), %xmm8 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 movddup -11 * SIZE(A2), %xmm9 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 movddup -12 * SIZE(A2, LDA), %xmm10 mulpd %xmm12, %xmm11 addpd %xmm11, %xmm3 movddup -11 * SIZE(A2, LDA), %xmm11 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 movddup -10 * SIZE(A1), %xmm8 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm5 movddup -9 * SIZE(A1), %xmm9 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 movddup -10 * SIZE(A1, LDA), %xmm10 mulpd %xmm12, %xmm11 MOVUPS_XL1( -8 * SIZE, X1, %xmm12) addpd %xmm11, %xmm7 movddup -9 * SIZE(A1, LDA), %xmm11 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) #endif mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 movddup -10 * SIZE(A2), %xmm8 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm1 movddup -9 * SIZE(A2), %xmm9 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 movddup -10 * SIZE(A2, LDA), %xmm10 mulpd %xmm13, %xmm11 addpd %xmm11, %xmm3 movddup -9 * SIZE(A2, LDA), %xmm11 #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1) #endif mulpd %xmm13, %xmm8 addpd %xmm8, %xmm4 movddup -8 * SIZE(A1), %xmm8 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm5 movddup -7 * SIZE(A1), %xmm9 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm6 movddup -8 * SIZE(A1, LDA), %xmm10 mulpd %xmm13, %xmm11 MOVUPS_XL1( -6 * SIZE, X1, %xmm13) addpd %xmm11, %xmm7 movddup -7 * SIZE(A1, LDA), %xmm11 subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, X1 subq $1, I BRANCH jg .L13 ALIGN_3 .L14: mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 movddup -16 * SIZE(A2), %xmm8 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 movddup -15 * SIZE(A2), %xmm9 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 movddup -16 * SIZE(A2, LDA), %xmm10 mulpd %xmm12, %xmm11 addpd %xmm11, %xmm3 movddup -15 * SIZE(A2, LDA), %xmm11 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 movddup -14 * SIZE(A1), %xmm8 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm5 movddup -13 * SIZE(A1), %xmm9 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 movddup -14 * SIZE(A1, LDA), %xmm10 mulpd %xmm12, %xmm11 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) addpd %xmm11, %xmm7 movddup -13 * SIZE(A1, LDA), %xmm11 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 movddup -14 * SIZE(A2), %xmm8 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm1 movddup -13 * SIZE(A2), %xmm9 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 movddup -14 * SIZE(A2, 
LDA), %xmm10 mulpd %xmm13, %xmm11 addpd %xmm11, %xmm3 movddup -13 * SIZE(A2, LDA), %xmm11 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm4 movddup -12 * SIZE(A1), %xmm8 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm5 movddup -11 * SIZE(A1), %xmm9 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm6 movddup -12 * SIZE(A1, LDA), %xmm10 mulpd %xmm13, %xmm11 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) addpd %xmm11, %xmm7 movddup -11 * SIZE(A1, LDA), %xmm11 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 movddup -12 * SIZE(A2), %xmm8 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 movddup -11 * SIZE(A2), %xmm9 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 movddup -12 * SIZE(A2, LDA), %xmm10 mulpd %xmm12, %xmm11 addpd %xmm11, %xmm3 movddup -11 * SIZE(A2, LDA), %xmm11 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 movddup -10 * SIZE(A1), %xmm8 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm5 movddup -9 * SIZE(A1), %xmm9 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 movddup -10 * SIZE(A1, LDA), %xmm10 mulpd %xmm12, %xmm11 MOVUPS_XL1( -8 * SIZE, X1, %xmm12) addpd %xmm11, %xmm7 movddup -9 * SIZE(A1, LDA), %xmm11 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 movddup -10 * SIZE(A2), %xmm8 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm1 movddup -9 * SIZE(A2), %xmm9 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 movddup -10 * SIZE(A2, LDA), %xmm10 mulpd %xmm13, %xmm11 addpd %xmm11, %xmm3 movddup -9 * SIZE(A2, LDA), %xmm11 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm4 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm5 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm6 mulpd %xmm13, %xmm11 MOVUPS_XL1( -6 * SIZE, X1, %xmm13) addpd %xmm11, %xmm7 subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, X1 ALIGN_3 .L15: testq $2, M je .L17 movddup -16 * SIZE(A1), %xmm8 movddup -15 * SIZE(A1), %xmm9 movddup -16 * SIZE(A1, LDA), %xmm10 movddup -15 * SIZE(A1, LDA), %xmm11 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 movddup -16 * SIZE(A2), %xmm8 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 movddup -15 * SIZE(A2), %xmm9 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 movddup -16 * SIZE(A2, LDA), %xmm10 mulpd %xmm12, %xmm11 addpd %xmm11, %xmm3 movddup -15 * SIZE(A2, LDA), %xmm11 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 movddup -14 * SIZE(A1), %xmm8 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm5 movddup -13 * SIZE(A1), %xmm9 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 movddup -14 * SIZE(A1, LDA), %xmm10 mulpd %xmm12, %xmm11 addpd %xmm11, %xmm7 movddup -13 * SIZE(A1, LDA), %xmm11 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 movddup -14 * SIZE(A2), %xmm8 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm1 movddup -13 * SIZE(A2), %xmm9 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 movddup -14 * SIZE(A2, LDA), %xmm10 mulpd %xmm13, %xmm11 addpd %xmm11, %xmm3 movddup -13 * SIZE(A2, LDA), %xmm11 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm4 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm5 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm6 mulpd %xmm13, %xmm11 addpd %xmm11, %xmm7 addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L17: testq $1, M je .L19 movddup -16 * SIZE(A1), %xmm8 movddup -15 * SIZE(A1), %xmm9 movddup -16 * SIZE(A1, LDA), %xmm10 movddup -15 * SIZE(A1, LDA), %xmm11 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 movddup -16 * SIZE(A2), %xmm8 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 movddup -15 * SIZE(A2), %xmm9 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 movddup -16 * SIZE(A2, LDA), %xmm10 mulpd %xmm12, %xmm11 addpd %xmm11, %xmm3 movddup -15 * SIZE(A2, LDA), %xmm11 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm4 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm5 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm6 mulpd %xmm12, %xmm11 addpd %xmm11, %xmm7 ALIGN_3 .L19: pcmpeqb %xmm13, %xmm13 psllq $63, %xmm13 shufps $0x40, %xmm13, 
%xmm13 #ifndef XCONJ xorps %xmm13, %xmm1 xorps %xmm13, %xmm3 xorps %xmm13, %xmm5 xorps %xmm13, %xmm7 #else xorps %xmm13, %xmm0 xorps %xmm13, %xmm2 xorps %xmm13, %xmm4 xorps %xmm13, %xmm6 #endif pshufd $0x4e, %xmm1, %xmm1 pshufd $0x4e, %xmm3, %xmm3 pshufd $0x4e, %xmm5, %xmm5 pshufd $0x4e, %xmm7, %xmm7 #ifndef CONJ addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #else subpd %xmm1, %xmm0 subpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 #endif pshufd $0xee, %xmm0, %xmm1 movddup %xmm0, %xmm0 pshufd $0xee, %xmm2, %xmm3 movddup %xmm2, %xmm2 pshufd $0xee, %xmm4, %xmm5 movddup %xmm4, %xmm4 pshufd $0xee, %xmm6, %xmm7 movddup %xmm6, %xmm6 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm1 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm3 mulpd ALPHA_R, %xmm4 mulpd ALPHA_I, %xmm5 mulpd ALPHA_R, %xmm6 mulpd ALPHA_I, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 movsd 0 * SIZE(Y), %xmm1 movhpd 1 * SIZE(Y), %xmm1 addq INCY, Y movsd 0 * SIZE(Y), %xmm3 movhpd 1 * SIZE(Y), %xmm3 addq INCY, Y movsd 0 * SIZE(Y), %xmm5 movhpd 1 * SIZE(Y), %xmm5 addq INCY, Y movsd 0 * SIZE(Y), %xmm7 movhpd 1 * SIZE(Y), %xmm7 addq INCY, Y addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm2, 0 * SIZE(Y1) movhpd %xmm2, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm4, 0 * SIZE(Y1) movhpd %xmm4, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm6, 0 * SIZE(Y1) movhpd %xmm6, 1 * SIZE(Y1) addq INCY, Y1 cmpq $4, N jge .L11 ALIGN_3 .L20: #endif #if GEMV_UNROLL >= 2 cmpq $2, N jl .L30 #if GEMV_UNROLL == 2 ALIGN_3 .L21: #endif subq $2, N leaq 16 * SIZE(BUFFER), X1 movq A, A1 leaq (A1, LDA), A2 leaq (A1, LDA, 2), A MOVUPS_XL1(-16 * SIZE, X1, %xmm12) xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 MOVUPS_XL1(-14 * SIZE, X1, %xmm13) xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 #ifdef PREFETCHW PREFETCHW 3 * SIZE(Y1) #endif movq M, I sarq $2, I jle .L25 movddup -16 * SIZE(A1), %xmm8 movddup -15 * SIZE(A1), %xmm9 movddup -16 * SIZE(A1, LDA), %xmm10 movddup -15 * SIZE(A1, LDA), %xmm11 decq I jle .L24 ALIGN_3 .L23: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) #endif mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 movddup -14 * SIZE(A1), %xmm8 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 movddup -13 * SIZE(A1), %xmm9 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 movddup -14 * SIZE(A1, LDA), %xmm10 mulpd %xmm12, %xmm11 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) addpd %xmm11, %xmm3 movddup -13 * SIZE(A1, LDA), %xmm11 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 movddup -12 * SIZE(A1), %xmm8 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm1 movddup -11 * SIZE(A1), %xmm9 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 movddup -12 * SIZE(A1, LDA), %xmm10 mulpd %xmm13, %xmm11 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) addpd %xmm11, %xmm3 movddup -11 * SIZE(A1, LDA), %xmm11 #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) #endif mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 movddup -10 * SIZE(A1), %xmm8 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 movddup -9 * SIZE(A1), %xmm9 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 movddup -10 * SIZE(A1, LDA), %xmm10 mulpd %xmm12, %xmm11 MOVUPS_XL1( -8 * SIZE, X1, %xmm12) addpd %xmm11, %xmm3 movddup -9 * SIZE(A1, LDA), %xmm11 #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) #endif mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 movddup -8 * SIZE(A1), %xmm8 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm1 movddup -7 * SIZE(A1), %xmm9 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 movddup -8 * SIZE(A1, LDA), %xmm10 mulpd %xmm13, %xmm11 
MOVUPS_XL1( -6 * SIZE, X1, %xmm13) addpd %xmm11, %xmm3 movddup -7 * SIZE(A1, LDA), %xmm11 subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, X1 subq $1, I BRANCH jg .L23 ALIGN_3 .L24: mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 movddup -14 * SIZE(A1), %xmm8 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 movddup -13 * SIZE(A1), %xmm9 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 movddup -14 * SIZE(A1, LDA), %xmm10 mulpd %xmm12, %xmm11 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) addpd %xmm11, %xmm3 movddup -13 * SIZE(A1, LDA), %xmm11 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 movddup -12 * SIZE(A1), %xmm8 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm1 movddup -11 * SIZE(A1), %xmm9 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 movddup -12 * SIZE(A1, LDA), %xmm10 mulpd %xmm13, %xmm11 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) addpd %xmm11, %xmm3 movddup -11 * SIZE(A1, LDA), %xmm11 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 movddup -10 * SIZE(A1), %xmm8 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 movddup -9 * SIZE(A1), %xmm9 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 movddup -10 * SIZE(A1, LDA), %xmm10 mulpd %xmm12, %xmm11 MOVUPS_XL1( -8 * SIZE, X1, %xmm12) addpd %xmm11, %xmm3 movddup -9 * SIZE(A1, LDA), %xmm11 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm1 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 mulpd %xmm13, %xmm11 MOVUPS_XL1( -6 * SIZE, X1, %xmm13) addpd %xmm11, %xmm3 subq $-8 * SIZE, A1 subq $-8 * SIZE, A2 subq $-8 * SIZE, X1 ALIGN_3 .L25: testq $2, M je .L27 movddup -16 * SIZE(A1), %xmm8 movddup -15 * SIZE(A1), %xmm9 movddup -16 * SIZE(A1, LDA), %xmm10 movddup -15 * SIZE(A1, LDA), %xmm11 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 movddup -14 * SIZE(A1), %xmm8 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 movddup -13 * SIZE(A1), %xmm9 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 movddup -14 * SIZE(A1, LDA), %xmm10 mulpd %xmm12, %xmm11 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) addpd %xmm11, %xmm3 movddup -13 * SIZE(A1, LDA), %xmm11 mulpd %xmm13, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm13, %xmm9 addpd %xmm9, %xmm1 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 mulpd %xmm13, %xmm11 addpd %xmm11, %xmm3 addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L27: testq $1, M je .L29 movddup -16 * SIZE(A1), %xmm8 movddup -15 * SIZE(A1), %xmm9 movddup -16 * SIZE(A1, LDA), %xmm10 movddup -15 * SIZE(A1, LDA), %xmm11 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 mulpd %xmm12, %xmm10 addpd %xmm10, %xmm2 mulpd %xmm12, %xmm11 addpd %xmm11, %xmm3 ALIGN_3 .L29: pcmpeqb %xmm13, %xmm13 psllq $63, %xmm13 shufps $0x40, %xmm13, %xmm13 #ifndef XCONJ xorps %xmm13, %xmm1 xorps %xmm13, %xmm3 #else xorps %xmm13, %xmm0 xorps %xmm13, %xmm2 #endif pshufd $0x4e, %xmm1, %xmm1 pshufd $0x4e, %xmm3, %xmm3 #ifndef CONJ addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 #else subpd %xmm1, %xmm0 subpd %xmm3, %xmm2 #endif pshufd $0xee, %xmm0, %xmm1 movddup %xmm0, %xmm0 pshufd $0xee, %xmm2, %xmm3 movddup %xmm2, %xmm2 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm1 mulpd ALPHA_R, %xmm2 mulpd ALPHA_I, %xmm3 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 movsd 0 * SIZE(Y), %xmm1 movhpd 1 * SIZE(Y), %xmm1 addq INCY, Y movsd 0 * SIZE(Y), %xmm3 movhpd 1 * SIZE(Y), %xmm3 addq INCY, Y addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) addq INCY, Y1 movlpd %xmm2, 0 * SIZE(Y1) movhpd %xmm2, 1 * SIZE(Y1) addq INCY, Y1 #if GEMV_UNROLL == 2 cmpq $2, N jge .L21 #endif ALIGN_3 .L30: #endif cmpq $1, N jl .L999 #if GEMV_UNROLL == 1 .L31: decq N #endif leaq 16 * SIZE(BUFFER), X1 movq A, A1 #if GEMV_UNROLL == 1 addq LDA, A #endif MOVUPS_XL1(-16 * SIZE, X1, %xmm12) 
xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 MOVUPS_XL1(-14 * SIZE, X1, %xmm13) xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 movq M, I sarq $2, I jle .L35 movddup -16 * SIZE(A1), %xmm8 movddup -15 * SIZE(A1), %xmm9 movddup -14 * SIZE(A1), %xmm10 movddup -13 * SIZE(A1), %xmm11 decq I jle .L34 ALIGN_3 .L33: #ifdef PREFETCH PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) #endif mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 movddup -12 * SIZE(A1), %xmm8 mulpd %xmm12, %xmm9 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) addpd %xmm9, %xmm1 movddup -11 * SIZE(A1), %xmm9 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 movddup -10 * SIZE(A1), %xmm10 mulpd %xmm13, %xmm11 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) addpd %xmm11, %xmm3 movddup -9 * SIZE(A1), %xmm11 #ifdef PREFETCHW PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) #endif mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 movddup -8 * SIZE(A1), %xmm8 mulpd %xmm12, %xmm9 MOVUPS_XL1( -8 * SIZE, X1, %xmm12) addpd %xmm9, %xmm1 movddup -7 * SIZE(A1), %xmm9 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 movddup -6 * SIZE(A1), %xmm10 mulpd %xmm13, %xmm11 MOVUPS_XL1( -6 * SIZE, X1, %xmm13) addpd %xmm11, %xmm3 movddup -5 * SIZE(A1), %xmm11 subq $-8 * SIZE, A1 subq $-8 * SIZE, X1 subq $1, I BRANCH jg .L33 ALIGN_3 .L34: mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 movddup -12 * SIZE(A1), %xmm8 mulpd %xmm12, %xmm9 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) addpd %xmm9, %xmm1 movddup -11 * SIZE(A1), %xmm9 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 movddup -10 * SIZE(A1), %xmm10 mulpd %xmm13, %xmm11 MOVUPS_XL1(-10 * SIZE, X1, %xmm13) addpd %xmm11, %xmm3 movddup -9 * SIZE(A1), %xmm11 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm12, %xmm9 MOVUPS_XL1( -8 * SIZE, X1, %xmm12) addpd %xmm9, %xmm1 mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 mulpd %xmm13, %xmm11 MOVUPS_XL1( -6 * SIZE, X1, %xmm13) addpd %xmm11, %xmm3 subq $-8 * SIZE, A1 subq $-8 * SIZE, X1 ALIGN_3 .L35: testq $2, M je .L37 movddup -16 * SIZE(A1), %xmm8 movddup -15 * SIZE(A1), %xmm9 movddup -14 * SIZE(A1), %xmm10 movddup -13 * SIZE(A1), %xmm11 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 MOVUPS_XL1(-12 * SIZE, X1, %xmm12) mulpd %xmm13, %xmm10 addpd %xmm10, %xmm2 mulpd %xmm13, %xmm11 addpd %xmm11, %xmm3 addq $4 * SIZE, A1 ALIGN_3 .L37: testq $1, M je .L39 movddup -16 * SIZE(A1), %xmm8 movddup -15 * SIZE(A1), %xmm9 mulpd %xmm12, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm12, %xmm9 addpd %xmm9, %xmm1 ALIGN_3 .L39: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 pcmpeqb %xmm13, %xmm13 psllq $63, %xmm13 shufps $0x40, %xmm13, %xmm13 #ifndef XCONJ xorps %xmm13, %xmm1 #else xorps %xmm13, %xmm0 #endif pshufd $0x4e, %xmm1, %xmm1 #ifndef CONJ addpd %xmm1, %xmm0 #else subpd %xmm1, %xmm0 #endif pshufd $0xee, %xmm0, %xmm1 movddup %xmm0, %xmm0 mulpd ALPHA_R, %xmm0 mulpd ALPHA_I, %xmm1 addpd %xmm1, %xmm0 movsd 0 * SIZE(Y), %xmm1 movhpd 1 * SIZE(Y), %xmm1 addpd %xmm1, %xmm0 movlpd %xmm0, 0 * SIZE(Y1) movhpd %xmm0, 1 * SIZE(Y1) #if GEMV_UNROLL == 1 addq INCY, Y addq INCY, Y1 cmpq $1, N jge .L31 #endif ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE 
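The hand-written SSE2 kernels above (zgemv_t_atom.S, zgemv_t_dup.S) and the FMA4/AVX2 inline-assembly microkernels that follow all compute the same per-column quantity as the generic C fallback in zgemv_t_4.c: a conjugation-aware complex dot product of one matrix column with x, followed by the alpha update of y. As a plain-C reference for readers tracing the CONJ/XCONJ branches, here is a minimal standalone sketch of that computation. It is not part of the OpenBLAS sources: the function name zgemv_t_column_ref and the conj_a/conj_x flags are hypothetical stand-ins for the CONJ and XCONJ preprocessor switches, it conjugates a and x up front instead of folding the x-conjugation into the final alpha update the way the kernels above arrange their #if branches, and it assumes double precision (the "z" case).

/* Hypothetical illustration only -- not OpenBLAS code.
 * Computes y += alpha * sum_i opA(a_i) * opX(x_i) for one column,
 * where opA/opX conjugate their argument when conj_a/conj_x is set
 * (the role played by the CONJ / XCONJ macros in the kernels above). */
#include <stdio.h>

typedef double FLOAT;    /* assumes the double-precision complex ("z") case */
typedef long   BLASLONG;

static void zgemv_t_column_ref(BLASLONG n, const FLOAT *a, const FLOAT *x,
                               FLOAT *y, FLOAT alpha_r, FLOAT alpha_i,
                               int conj_a, int conj_x)
{
	FLOAT sa = conj_a ? -1.0 : 1.0;      /* sign applied to Im(a) */
	FLOAT sx = conj_x ? -1.0 : 1.0;      /* sign applied to Im(x) */
	FLOAT temp_r = 0.0, temp_i = 0.0;
	BLASLONG i;

	for ( i = 0; i < 2*n; i += 2 )       /* interleaved re/im storage, as above */
	{
		FLOAT ar = a[i], ai = sa * a[i+1];
		FLOAT xr = x[i], xi = sx * x[i+1];
		temp_r += ar * xr - ai * xi;     /* Re( opA(a_i) * opX(x_i) ) */
		temp_i += ar * xi + ai * xr;     /* Im( opA(a_i) * opX(x_i) ) */
	}

	y[0] += alpha_r * temp_r - alpha_i * temp_i;    /* y += alpha * temp */
	y[1] += alpha_r * temp_i + alpha_i * temp_r;
}

int main(void)
{
	/* two complex elements: a = [1+2i, 3+4i], x = [5+6i, 7+8i], alpha = 1 */
	FLOAT a[4] = { 1, 2, 3, 4 }, x[4] = { 5, 6, 7, 8 }, y[2] = { 0, 0 };
	zgemv_t_column_ref(2, a, x, y, 1.0, 0.0, 0, 0);
	printf("y = %g %+gi\n", y[0], y[1]);            /* expected: -18 +68i */
	return 0;
}

For the four conjugation modes this yields alpha*(a.x), alpha*(conj(a).x), alpha*(a.conj(x)) and alpha*(conj(a).conj(x)), which, if the CONJ/XCONJ branches are read as above, is the same result the unrolled 4x4/4x2/4x1 kernels accumulate in their xmm register pairs before folding in alpha.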
OpenBLAS-0.2.20/kernel/x86_64/zgemv_t_microk_bulldozer-4.c000066400000000000000000000515701313527062700230000ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary froms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary from must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_4x4 1 static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" // temp "vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp "vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" // temp "vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" // temp "vxorpd %%xmm12, %%xmm12, %%xmm12 \n\t" // temp "vxorpd %%xmm13, %%xmm13, %%xmm13 \n\t" "vxorpd %%xmm14, %%xmm14, %%xmm14 \n\t" "vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t" ".align 16 \n\t" "1: \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "prefetcht0 192(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 "prefetcht0 192(%5,%0,8) \n\t" "vmovups (%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 "prefetcht0 192(%6,%0,8) \n\t" "vmovups (%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 "prefetcht0 192(%7,%0,8) \n\t" "vmovups (%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 "vmovddup 16(%2,%0,8), %%xmm0 \n\t" // real value from x0 
"vmovddup 24(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "vmovups 16(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 "vmovups 16(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 "vmovups 16(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 "vmovups 16(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "vmovups 32(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 "vmovups 32(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 "vmovups 32(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 "vmovups 32(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 "vmovddup 48(%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 56(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "vmovups 48(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 "vmovups 48(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 "vmovups 48(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2 "vmovups 48(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3 "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0 "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0 "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" "jnz 1b \n\t" "vmovddup (%8) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t" "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" "vaddsubpd %%xmm13, %%xmm12, %%xmm12 \n\t" "vaddsubpd %%xmm15, %%xmm14, %%xmm14 \n\t" #else "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" "vaddsubpd %%xmm12, %%xmm13, 
%%xmm12 \n\t" "vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t" "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" #endif "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r "vmulpd %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulpd %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r "vmulpd %%xmm12, %%xmm1 , %%xmm13 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulpd %%xmm12, %%xmm0 , %%xmm12 \n\t" // t_r * alpha_r , t_i * alpha_r "vmulpd %%xmm14, %%xmm1 , %%xmm15 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulpd %%xmm14, %%xmm0 , %%xmm14 \n\t" // t_r * alpha_r , t_i * alpha_r #if !defined(XCONJ) "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t" "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" "vaddsubpd %%xmm13, %%xmm12, %%xmm12 \n\t" "vaddsubpd %%xmm15, %%xmm14, %%xmm14 \n\t" #else "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" "vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t" "vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t" "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" #endif "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t" "vaddpd 16(%3) , %%xmm10, %%xmm10 \n\t" "vaddpd 32(%3) , %%xmm12, %%xmm12 \n\t" "vaddpd 48(%3) , %%xmm14, %%xmm14 \n\t" "vmovups %%xmm8 , (%3) \n\t" "vmovups %%xmm10, 16(%3) \n\t" "vmovups %%xmm12, 32(%3) \n\t" "vmovups %%xmm14, 48(%3) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #define HAVE_KERNEL_4x2 1 static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" // temp "vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp "vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" // temp "vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" // temp ".align 16 \n\t" "1: \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "prefetcht0 192(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 "prefetcht0 192(%5,%0,8) \n\t" "vmovups (%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 "vmovddup 16(%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 24(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "vmovups 16(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 
"vmovups 16(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "vmovups 32(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 "vmovups 32(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 "vmovddup 48(%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 56(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "vmovups 48(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 "vmovups 48(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1 "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0 "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" "jnz 1b \n\t" "vmovddup (%6) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t" "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" #else "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" #endif "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r "vmulpd %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulpd %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r #if !defined(XCONJ) "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t" "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" #else "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" #endif "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t" "vaddpd 16(%3) , %%xmm10, %%xmm10 \n\t" "vmovups %%xmm8 , (%3) \n\t" "vmovups %%xmm10, 16(%3) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (alpha) // 6 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #define HAVE_KERNEL_4x1 1 static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" // temp 
"vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp ".align 16 \n\t" "1: \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "prefetcht0 192(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 "vmovups 16(%4,%0,8), %%xmm5 \n\t" // 1 complex values from a0 "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x0 "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x0 "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "vfmaddpd %%xmm8 , %%xmm5 , %%xmm2, %%xmm8 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm9 , %%xmm5 , %%xmm3, %%xmm9 \n\t" // ar0*xl0,al0*xl0 "vmovups 32(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0 "vmovups 48(%4,%0,8), %%xmm5 \n\t" // 1 complex values from a0 "vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value from x0 "vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x0 "addq $8 , %0 \n\t" "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0 "subq $4 , %1 \n\t" "vfmaddpd %%xmm8 , %%xmm5 , %%xmm2, %%xmm8 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm9 , %%xmm5 , %%xmm3, %%xmm9 \n\t" // ar0*xl0,al0*xl0 "jnz 1b \n\t" "vmovddup (%5) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" #else "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" #endif "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r #if !defined(XCONJ) "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" #else "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" #endif "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t" "vmovups %%xmm8 , (%3) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap), // 4 "r" (alpha) // 5 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/zgemv_t_microk_haswell-4.c000066400000000000000000000500501313527062700224250ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary froms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary from must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_4x4 1 static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" // temp "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" // temp "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" // ".align 16 \n\t" "1: \n\t" "prefetcht0 192(%2,%0,8) \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "prefetcht0 192(%4,%0,8) \n\t" "vmovups (%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 "prefetcht0 192(%5,%0,8) \n\t" "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1 "prefetcht0 192(%6,%0,8) \n\t" "vmovups (%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1 "prefetcht0 192(%7,%0,8) \n\t" "vmovups (%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231pd %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231pd %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vmovups 32(%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 "vmovups 32(%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value from x1 "vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x1 "vmovups 32(%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2 "vmovups 32(%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3 "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag 
values from x0 and x1 "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231pd %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231pd %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" "jnz 1b \n\t" "vmovddup (%8) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" "vpermilpd $0x5 , %%ymm11, %%ymm11 \n\t" "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" "vaddsubpd %%ymm9 , %%ymm8, %%ymm8 \n\t" "vaddsubpd %%ymm11, %%ymm10, %%ymm10 \n\t" "vaddsubpd %%ymm13, %%ymm12, %%ymm12 \n\t" "vaddsubpd %%ymm15, %%ymm14, %%ymm14 \n\t" #else "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" "vaddsubpd %%ymm8 , %%ymm9 , %%ymm8 \n\t" "vaddsubpd %%ymm10, %%ymm11, %%ymm10 \n\t" "vaddsubpd %%ymm12, %%ymm13, %%ymm12 \n\t" "vaddsubpd %%ymm14, %%ymm15, %%ymm14 \n\t" "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" #endif "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" "vextractf128 $1, %%ymm10, %%xmm11 \n\t" "vextractf128 $1, %%ymm12, %%xmm13 \n\t" "vextractf128 $1, %%ymm14, %%xmm15 \n\t" "vaddpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vaddpd %%xmm10, %%xmm11, %%xmm10 \n\t" "vaddpd %%xmm12, %%xmm13, %%xmm12 \n\t" "vaddpd %%xmm14, %%xmm15, %%xmm14 \n\t" "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r "vmulpd %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulpd %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r "vmulpd %%xmm12, %%xmm1 , %%xmm13 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulpd %%xmm12, %%xmm0 , %%xmm12 \n\t" // t_r * alpha_r , t_i * alpha_r "vmulpd %%xmm14, %%xmm1 , %%xmm15 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulpd %%xmm14, %%xmm0 , %%xmm14 \n\t" // t_r * alpha_r , t_i * alpha_r #if !defined(XCONJ) "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t" "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" "vaddsubpd %%xmm13, %%xmm12, %%xmm12 \n\t" "vaddsubpd %%xmm15, %%xmm14, %%xmm14 \n\t" #else "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" "vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t" "vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t" "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" #endif "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t" "vaddpd 16(%3) , 
%%xmm10, %%xmm10 \n\t" "vaddpd 32(%3) , %%xmm12, %%xmm12 \n\t" "vaddpd 48(%3) , %%xmm14, %%xmm14 \n\t" "vmovups %%xmm8 , (%3) \n\t" "vmovups %%xmm10, 16(%3) \n\t" "vmovups %%xmm12, 32(%3) \n\t" "vmovups %%xmm14, 48(%3) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (ap[2]), // 6 "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #define HAVE_KERNEL_4x2 1 static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" // temp "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp // ".align 16 \n\t" "1: \n\t" "prefetcht0 192(%2,%0,8) \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "prefetcht0 192(%4,%0,8) \n\t" "vmovups (%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 "prefetcht0 192(%5,%0,8) \n\t" "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1 "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1 "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vmovups 32(%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 "vmovups 32(%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1 "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value from x1 "vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x1 "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" "jnz 1b \n\t" "vmovddup (%6) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" "vpermilpd $0x5 , %%ymm11, %%ymm11 \n\t" "vaddsubpd %%ymm9 , %%ymm8, %%ymm8 \n\t" "vaddsubpd %%ymm11, %%ymm10, %%ymm10 \n\t" #else "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" "vaddsubpd %%ymm8 , %%ymm9 , %%ymm8 \n\t" "vaddsubpd %%ymm10, %%ymm11, %%ymm10 \n\t" "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t" #endif "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" "vextractf128 $1, %%ymm10, 
%%xmm11 \n\t" "vaddpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vaddpd %%xmm10, %%xmm11, %%xmm10 \n\t" "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r "vmulpd %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i "vmulpd %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r #if !defined(XCONJ) "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t" "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t" #else "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" #endif "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t" "vaddpd 16(%3) , %%xmm10, %%xmm10 \n\t" "vmovups %%xmm8 , (%3) \n\t" "vmovups %%xmm10, 16(%3) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 "r" (ap[1]), // 5 "r" (alpha) // 6 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #define HAVE_KERNEL_4x1 1 static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; __asm__ __volatile__ ( "vzeroupper \n\t" "vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp // ".align 16 \n\t" "1: \n\t" "prefetcht0 192(%2,%0,8) \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "prefetcht0 192(%4,%0,8) \n\t" "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1 "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1 "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "vmovups 32(%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0 "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value from x1 "vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x1 "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1 "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1 "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1 "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1 "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" "jnz 1b \n\t" "vmovddup (%5) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" "vaddsubpd %%ymm9 , %%ymm8, %%ymm8 \n\t" #else "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" "vaddsubpd %%ymm8 , %%ymm9 , %%ymm8 \n\t" "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" #endif "vextractf128 $1, %%ymm8 , %%xmm9 \n\t" "vaddpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * 
alpha_i "vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r #if !defined(XCONJ) "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t" "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t" #else "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" #endif "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t" "vmovups %%xmm8 , (%3) \n\t" "vzeroupper \n\t" : : "r" (i), // 0 "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap), // 4 "r" (alpha) // 5 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/znrm2.S000066400000000000000000000112401313527062700165540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax #include "l1param.h" PROLOGUE PROFCODE fldz testq M, M jle .L999 testq INCX, INCX jle .L999 salq $ZBASE_SHIFT, INCX fldz fldz fldz cmpq $SIZE * 2, INCX jne .L40 movq M, I sarq $2, I jle .L20 ALIGN_4 .L10: #if defined(PREFETCH) PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) fmul %st(0), %st FLD 1 * SIZE(X) fmul %st(0), %st FLD 2 * SIZE(X) fmul %st(0), %st FLD 3 * SIZE(X) fmul %st(0), %st faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) FLD 4 * SIZE(X) fmul %st(0), %st FLD 5 * SIZE(X) fmul %st(0), %st FLD 6 * SIZE(X) fmul %st(0), %st FLD 7 * SIZE(X) fmul %st(0), %st addq $8 * SIZE, X faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) decq I jg .L10 ALIGN_4 .L20: andq $3, M jle .L998 ALIGN_4 .L21: FLD 0 * SIZE(X) fmul %st(0), %st FLD 1 * SIZE(X) fmul %st(0), %st faddp %st,%st(3) faddp %st,%st(1) addq $2 * SIZE, X decq M jg .L21 jmp .L998 ALIGN_4 .L40: movq M, I sarq $2, I jle .L60 ALIGN_4 .L50: FLD 0 * SIZE(X) fmul %st(0), %st FLD 1 * SIZE(X) addq INCX, X fmul %st(0), %st FLD 0 * SIZE(X) fmul %st(0), %st FLD 1 * SIZE(X) addq INCX, X fmul %st(0), %st faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) FLD 0 * SIZE(X) fmul %st(0), %st FLD 1 * SIZE(X) addq INCX, X fmul %st(0), %st FLD 0 * SIZE(X) fmul %st(0), %st FLD 1 * SIZE(X) addq INCX, X fmul %st(0), %st faddp %st, %st(7) faddp %st, %st(5) faddp %st, %st(3) faddp %st, %st(1) decq I jg .L50 ALIGN_4 .L60: andq $3, M jle .L998 ALIGN_4 .L61: FLD 0 * SIZE(X) fmul %st(0), %st FLD 1 * SIZE(X) addq INCX, X fmul %st(0), %st faddp %st,%st(3) faddp %st,%st(1) decq M jg .L61 ALIGN_4 .L998: faddp %st,%st(2) faddp %st,%st(1) faddp %st,%st(1) ALIGN_4 .L999: fsqrt #ifndef XDOUBLE subq $2 * SIZE, %rsp FST (%rsp) MOVSD (%rsp), %xmm0 add $2 * SIZE, %rsp #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/znrm2_sse.S000066400000000000000000000175021313527062700174350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax #define FLAG %r10 #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS pxor %xmm0, %xmm0 testq M, M jle .L999 pxor %xmm1, %xmm1 testq INCX, INCX jle .L999 xorq FLAG, FLAG pxor %xmm2, %xmm2 leaq (, INCX, 2 * SIZE), INCX pxor %xmm3, %xmm3 cmpq $2 * SIZE, INCX jne .L40 testq $SIZE, X je .L05 movss (X), %xmm4 cvtss2sd %xmm4, %xmm6 mulsd %xmm6, %xmm6 addsd %xmm6, %xmm3 addq $SIZE, X movq $1, FLAG decq M jle .L19 ALIGN_3 .L05: movq M, I sarq $3, I jle .L14 movsd 0 * SIZE(X), %xmm4 movsd 2 * SIZE(X), %xmm5 movsd 4 * SIZE(X), %xmm6 movsd 6 * SIZE(X), %xmm7 movsd 8 * SIZE(X), %xmm8 movsd 10 * SIZE(X), %xmm9 movsd 12 * SIZE(X), %xmm10 movsd 14 * SIZE(X), %xmm11 addq $16 * SIZE, X decq I jle .L12 ALIGN_3 .L10: #if defined(PREFETCH) PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif cvtps2pd %xmm4, %xmm12 cvtps2pd %xmm5, %xmm13 cvtps2pd %xmm6, %xmm14 cvtps2pd %xmm7, %xmm15 movsd 0 * SIZE(X), %xmm4 movsd 2 * SIZE(X), %xmm5 movsd 4 * SIZE(X), %xmm6 movsd 6 * SIZE(X), %xmm7 mulpd %xmm12, %xmm12 mulpd %xmm13, %xmm13 mulpd %xmm14, %xmm14 mulpd %xmm15, %xmm15 addpd %xmm12, %xmm0 addpd %xmm13, %xmm1 addpd %xmm14, %xmm2 addpd %xmm15, %xmm3 cvtps2pd %xmm8, %xmm12 cvtps2pd %xmm9, %xmm13 cvtps2pd %xmm10, %xmm14 cvtps2pd %xmm11, %xmm15 movsd 8 * SIZE(X), %xmm8 movsd 10 * SIZE(X), %xmm9 movsd 12 * SIZE(X), %xmm10 movsd 14 * SIZE(X), %xmm11 mulpd %xmm12, %xmm12 mulpd %xmm13, %xmm13 mulpd %xmm14, %xmm14 mulpd %xmm15, %xmm15 addpd %xmm12, %xmm0 addpd %xmm13, %xmm1 addpd %xmm14, %xmm2 addpd %xmm15, %xmm3 subq $-16 * SIZE, X decq I jg .L10 ALIGN_3 .L12: cvtps2pd %xmm4, %xmm12 cvtps2pd %xmm5, %xmm13 cvtps2pd %xmm6, %xmm14 cvtps2pd %xmm7, %xmm15 mulpd %xmm12, %xmm12 mulpd %xmm13, %xmm13 mulpd %xmm14, %xmm14 mulpd %xmm15, %xmm15 addpd %xmm12, %xmm0 addpd %xmm13, %xmm1 addpd %xmm14, %xmm2 addpd %xmm15, %xmm3 cvtps2pd %xmm8, %xmm12 cvtps2pd %xmm9, %xmm13 cvtps2pd %xmm10, %xmm14 cvtps2pd %xmm11, %xmm15 mulpd %xmm12, %xmm12 mulpd %xmm13, %xmm13 mulpd %xmm14, %xmm14 mulpd %xmm15, %xmm15 addpd %xmm12, %xmm0 addpd %xmm13, %xmm1 addpd %xmm14, %xmm2 addpd %xmm15, %xmm3 ALIGN_3 .L14: testq $4, M je .L15 movsd 0 * SIZE(X), %xmm4 movsd 2 * SIZE(X), %xmm5 movsd 4 * SIZE(X), %xmm6 movsd 6 * SIZE(X), %xmm7 cvtps2pd %xmm4, %xmm8 cvtps2pd %xmm5, %xmm9 cvtps2pd %xmm6, %xmm10 cvtps2pd %xmm7, %xmm11 mulpd %xmm8, %xmm8 mulpd %xmm9, %xmm9 mulpd %xmm10, %xmm10 mulpd %xmm11, %xmm11 addpd %xmm8, %xmm0 addpd %xmm9, %xmm1 addpd %xmm10, %xmm2 addpd %xmm11, %xmm3 addq $8 * SIZE, X ALIGN_3 .L15: testq $2, M je .L16 movsd 0 * 
SIZE(X), %xmm4 movsd 2 * SIZE(X), %xmm5 cvtps2pd %xmm4, %xmm8 cvtps2pd %xmm5, %xmm9 mulpd %xmm8, %xmm8 mulpd %xmm9, %xmm9 addpd %xmm8, %xmm0 addpd %xmm9, %xmm1 addq $4 * SIZE, X ALIGN_3 .L16: testq $1, M je .L19 movsd (X), %xmm4 cvtps2pd %xmm4, %xmm6 mulpd %xmm6, %xmm6 addpd %xmm6, %xmm2 addq $2 * SIZE, X ALIGN_3 .L19: testq FLAG, FLAG je .L998 movss (X), %xmm4 cvtss2sd %xmm4, %xmm6 mulsd %xmm6, %xmm6 addsd %xmm6, %xmm3 jmp .L998 ALIGN_4 .L40: movq M, I sarq $3, I jle .L44 ALIGN_4 .L41: movsd (X), %xmm4 addq INCX, X movsd (X), %xmm5 addq INCX, X movsd (X), %xmm6 addq INCX, X movsd (X), %xmm7 addq INCX, X movsd (X), %xmm8 addq INCX, X movsd (X), %xmm9 addq INCX, X movsd (X), %xmm10 addq INCX, X movsd (X), %xmm11 addq INCX, X cvtps2pd %xmm4, %xmm4 cvtps2pd %xmm5, %xmm5 cvtps2pd %xmm6, %xmm6 cvtps2pd %xmm7, %xmm7 cvtps2pd %xmm8, %xmm8 cvtps2pd %xmm9, %xmm9 cvtps2pd %xmm10, %xmm10 cvtps2pd %xmm11, %xmm11 mulpd %xmm4, %xmm4 mulpd %xmm5, %xmm5 mulpd %xmm6, %xmm6 mulpd %xmm7, %xmm7 addpd %xmm4, %xmm0 addpd %xmm5, %xmm1 addpd %xmm6, %xmm2 addpd %xmm7, %xmm3 mulpd %xmm8, %xmm8 mulpd %xmm9, %xmm9 mulpd %xmm10, %xmm10 mulpd %xmm11, %xmm11 addpd %xmm8, %xmm0 addpd %xmm9, %xmm1 addpd %xmm10, %xmm2 addpd %xmm11, %xmm3 decq I jg .L41 ALIGN_3 .L44: testq $4, M je .L45 movsd (X), %xmm4 addq INCX, X movsd (X), %xmm5 addq INCX, X movsd (X), %xmm6 addq INCX, X movsd (X), %xmm7 addq INCX, X cvtps2pd %xmm4, %xmm8 cvtps2pd %xmm5, %xmm9 cvtps2pd %xmm6, %xmm10 cvtps2pd %xmm7, %xmm11 mulpd %xmm8, %xmm8 mulpd %xmm9, %xmm9 mulpd %xmm10, %xmm10 mulpd %xmm11, %xmm11 addpd %xmm8, %xmm0 addpd %xmm9, %xmm1 addpd %xmm10, %xmm2 addpd %xmm11, %xmm3 ALIGN_3 .L45: testq $2, M je .L46 movsd (X), %xmm4 addq INCX, X movsd (X), %xmm5 addq INCX, X cvtps2pd %xmm4, %xmm6 cvtps2pd %xmm5, %xmm7 mulpd %xmm6, %xmm6 mulpd %xmm7, %xmm7 addpd %xmm6, %xmm0 addpd %xmm7, %xmm1 ALIGN_3 .L46: testq $1, M je .L998 movsd (X), %xmm4 cvtps2pd %xmm4, %xmm6 mulpd %xmm6, %xmm6 addpd %xmm6, %xmm3 ALIGN_4 .L998: addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm2, %xmm0 #ifndef HAVE_SSE3 movapd %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 addsd %xmm1, %xmm0 #else haddpd %xmm0, %xmm0 #endif ALIGN_4 .L999: sqrtsd %xmm0, %xmm0 cvtsd2ss %xmm0, %xmm0 RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zrot.S000066400000000000000000000144251313527062700165120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N ARG1 #define X ARG2 #define INCX ARG3 #define Y ARG4 #ifndef WINDOWS_ABI #define INCY ARG5 /* r8 */ #else #define INCY %r10 #endif #define I %rax #include "l1param.h" PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 48(%rsp), INCY FLD 72(%rsp) FLD 56(%rsp) #else FLD 24(%rsp) FLD 8(%rsp) #endif salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY testq N, N jle .L999 cmpq $2 * SIZE, INCX jne .L50 cmpq $2 * SIZE, INCY jne .L50 movq N, I sarq $1, I jle .L15 ALIGN_4 .L10: #if defined(PREFETCHW) PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) FLD 1 * SIZE(X) FLD 1 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 1 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 1 * SIZE(Y) #if defined(PREFETCHW) PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif FLD 2 * SIZE(X) FLD 2 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 2 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 2 * SIZE(Y) FLD 3 * SIZE(X) FLD 3 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 3 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 3 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y decq I jg .L10 ALIGN_4 .L15: movq N, I andq $1, I jle .L999 ALIGN_4 .L16: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) FLD 1 * SIZE(X) FLD 1 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 1 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 1 * SIZE(Y) jmp .L999 ALIGN_4 .L50: movq N, I sarq $1, I jle .L55 ALIGN_4 .L51: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) FLD 1 * SIZE(X) FLD 1 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 1 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 1 * SIZE(Y) addq INCX, X addq INCY, Y FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul 
%st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) FLD 1 * SIZE(X) FLD 1 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 1 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 1 * SIZE(Y) addq INCX, X addq INCY, Y decq I jg .L51 ALIGN_4 .L55: movq N, I andq $1, I jle .L999 ALIGN_4 .L56: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 0 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 0 * SIZE(Y) FLD 1 * SIZE(X) FLD 1 * SIZE(Y) fld %st(1) fmul %st(3), %st fld %st(1) fmul %st(5), %st faddp %st, %st(1) FST 1 * SIZE(X) fmul %st(2), %st fxch %st(1) fmul %st(3), %st fsubrp %st, %st(1) FST 1 * SIZE(Y) ALIGN_4 .L999: ffreep %st ffreep %st ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zrot_sse.S000066400000000000000000000730101313527062700173570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define Y ARG4 /* rcx */ #ifndef WINDOWS_ABI #define INCY ARG5 /* r8 */ #else #define INCY %r10 #endif #define C %xmm14 #define S %xmm15 #include "l1param.h" PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), INCY movss 48(%rsp), %xmm0 movss 56(%rsp), %xmm1 #endif SAVEREGISTERS salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY pshufd $0x0, %xmm0, C pshufd $0x0, %xmm1, S cmpq $0, N jle .L999 cmpq $2 * SIZE, INCX jne .L50 cmpq $2 * SIZE, INCY jne .L50 testq $2 * SIZE, X je .L10 movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y decq N jle .L999 .L10: testq $1 * SIZE, X jne .L30 testq $3 * SIZE, Y jne .L20 movq N, %rax sarq $4, %rax jle .L14 movaps 0 * SIZE(Y), %xmm1 movaps 4 * SIZE(Y), %xmm3 movaps 8 * SIZE(Y), %xmm9 movaps 12 * SIZE(Y), %xmm11 movaps 0 * SIZE(X), %xmm0 movaps 4 * SIZE(X), %xmm2 movaps 8 * SIZE(X), %xmm8 movaps 12 * SIZE(X), %xmm10 decq %rax jle .L12 ALIGN_3 .L11: #if defined(PREFETCHW) PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm1, %xmm4 mulps S, %xmm1 movaps %xmm3, %xmm6 mulps S, %xmm3 movaps %xmm0, %xmm5 mulps C, %xmm0 movaps %xmm2, %xmm7 mulps C, %xmm2 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 movaps 16 * SIZE(Y), %xmm1 addps %xmm3, %xmm2 movaps 20 * SIZE(Y), %xmm3 subps %xmm5, %xmm4 subps %xmm7, %xmm6 #if defined(PREFETCHW) PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps %xmm0, 0 * SIZE(X) movaps 16 * SIZE(X), %xmm0 movaps %xmm2, 4 * SIZE(X) movaps 20 * SIZE(X), %xmm2 movaps %xmm4, 0 * SIZE(Y) movaps %xmm6, 4 * SIZE(Y) movaps %xmm9, %xmm4 mulps S, %xmm9 movaps %xmm8, %xmm5 mulps C, %xmm8 movaps %xmm11, %xmm6 mulps S, %xmm11 movaps %xmm10, %xmm7 mulps C, %xmm10 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm9, %xmm8 movaps 24 * SIZE(Y), %xmm9 addps %xmm11, %xmm10 movaps 28 * SIZE(Y), %xmm11 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm8, 8 * SIZE(X) movaps 24 * SIZE(X), %xmm8 movaps %xmm10,12 * SIZE(X) movaps 28 * SIZE(X), %xmm10 movaps %xmm4, 8 * SIZE(Y) movaps %xmm6, 12 * SIZE(Y) #if defined(PREFETCHW) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm1, %xmm4 mulps S, %xmm1 movaps %xmm3, %xmm6 mulps S, %xmm3 movaps %xmm0, %xmm5 mulps C, %xmm0 movaps %xmm2, %xmm7 mulps C, %xmm2 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 movaps 32 * SIZE(Y), %xmm1 addps %xmm3, %xmm2 movaps 36 * SIZE(Y), %xmm3 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 16 * SIZE(X) movaps 32 * SIZE(X), %xmm0 movaps %xmm2, 20 * SIZE(X) movaps 36 * SIZE(X), %xmm2 movaps %xmm4, 16 * SIZE(Y) movaps %xmm6, 20 * SIZE(Y) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps %xmm9, %xmm4 mulps S, %xmm9 movaps %xmm8, %xmm5 mulps C, %xmm8 movaps %xmm11, %xmm6 mulps S, %xmm11 movaps %xmm10, %xmm7 mulps C, %xmm10 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm9, %xmm8 movaps 40 * SIZE(Y), %xmm9 addps %xmm11, %xmm10 movaps 44 * SIZE(Y), %xmm11 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm8, 24 * SIZE(X) movaps 40 * SIZE(X), %xmm8 movaps %xmm10, 28 * SIZE(X) movaps 44 * SIZE(X), %xmm10 
movaps %xmm4, 24 * SIZE(Y) movaps %xmm6, 28 * SIZE(Y) addq $32 * SIZE, X addq $32 * SIZE, Y decq %rax jg .L11 ALIGN_3 .L12: movaps %xmm1, %xmm4 mulps S, %xmm1 movaps %xmm3, %xmm6 mulps S, %xmm3 movaps %xmm0, %xmm5 mulps C, %xmm0 movaps %xmm2, %xmm7 mulps C, %xmm2 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 movaps 16 * SIZE(Y), %xmm1 addps %xmm3, %xmm2 movaps 20 * SIZE(Y), %xmm3 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 0 * SIZE(X) movaps 16 * SIZE(X), %xmm0 movaps %xmm2, 4 * SIZE(X) movaps 20 * SIZE(X), %xmm2 movaps %xmm4, 0 * SIZE(Y) movaps %xmm6, 4 * SIZE(Y) movaps %xmm9, %xmm4 mulps S, %xmm9 movaps %xmm8, %xmm5 mulps C, %xmm8 movaps %xmm11, %xmm6 mulps S, %xmm11 movaps %xmm10, %xmm7 mulps C, %xmm10 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm9, %xmm8 movaps 24 * SIZE(Y), %xmm9 addps %xmm11, %xmm10 movaps 28 * SIZE(Y), %xmm11 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm8, 8 * SIZE(X) movaps 24 * SIZE(X), %xmm8 movaps %xmm10,12 * SIZE(X) movaps 28 * SIZE(X), %xmm10 movaps %xmm4, 8 * SIZE(Y) movaps %xmm6, 12 * SIZE(Y) movaps %xmm1, %xmm4 mulps S, %xmm1 movaps %xmm3, %xmm6 mulps S, %xmm3 movaps %xmm0, %xmm5 mulps C, %xmm0 movaps %xmm2, %xmm7 mulps C, %xmm2 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 16 * SIZE(X) movaps %xmm2, 20 * SIZE(X) movaps %xmm4, 16 * SIZE(Y) movaps %xmm6, 20 * SIZE(Y) movaps %xmm9, %xmm4 mulps S, %xmm9 movaps %xmm8, %xmm5 mulps C, %xmm8 movaps %xmm11, %xmm6 mulps S, %xmm11 movaps %xmm10, %xmm7 mulps C, %xmm10 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm9, %xmm8 addps %xmm11, %xmm10 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm8, 24 * SIZE(X) movaps %xmm10, 28 * SIZE(X) movaps %xmm4, 24 * SIZE(Y) movaps %xmm6, 28 * SIZE(Y) addq $32 * SIZE, X addq $32 * SIZE, Y ALIGN_3 .L14: testq $15, N jle .L999 testq $8, N jle .L15 movaps 0 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps 4 * SIZE(Y), %xmm3 movaps 4 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 0 * SIZE(X) movaps %xmm2, 4 * SIZE(X) movaps %xmm4, 0 * SIZE(Y) movaps %xmm6, 4 * SIZE(Y) movaps 8 * SIZE(Y), %xmm1 movaps 8 * SIZE(X), %xmm0 movaps 12 * SIZE(Y), %xmm3 movaps 12 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 8 * SIZE(X) movaps %xmm2, 12 * SIZE(X) movaps %xmm4, 8 * SIZE(Y) movaps %xmm6, 12 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L15: testq $4, N jle .L16 movaps 0 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps 4 * SIZE(Y), %xmm3 movaps 4 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 0 * SIZE(X) movaps %xmm2, 4 * SIZE(X) movaps %xmm4, 0 * SIZE(Y) movaps %xmm6, 4 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 
.L16: testq $2, N jle .L17 movaps 0 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movaps %xmm2, 0 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L17: testq $1, N jle .L999 movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) jmp .L999 ALIGN_3 .L20: movq N, %rax sarq $4, %rax jle .L24 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movsd 4 * SIZE(Y), %xmm3 movhps 6 * SIZE(Y), %xmm3 movsd 8 * SIZE(Y), %xmm9 movhps 10 * SIZE(Y), %xmm9 movsd 12 * SIZE(Y), %xmm11 movhps 14 * SIZE(Y), %xmm11 movaps 0 * SIZE(X), %xmm0 movaps 4 * SIZE(X), %xmm2 movaps 8 * SIZE(X), %xmm8 movaps 12 * SIZE(X), %xmm10 decq %rax jle .L22 ALIGN_3 .L21: #if defined(PREFETCHW) PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm1, %xmm4 mulps S, %xmm1 movaps %xmm3, %xmm6 mulps S, %xmm3 movaps %xmm0, %xmm5 mulps C, %xmm0 movaps %xmm2, %xmm7 mulps C, %xmm2 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 movsd 16 * SIZE(Y), %xmm1 movhps 18 * SIZE(Y), %xmm1 addps %xmm3, %xmm2 movsd 20 * SIZE(Y), %xmm3 movhps 22 * SIZE(Y), %xmm3 subps %xmm5, %xmm4 subps %xmm7, %xmm6 #if defined(PREFETCHW) PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps %xmm0, 0 * SIZE(X) movaps 16 * SIZE(X), %xmm0 movaps %xmm2, 4 * SIZE(X) movaps 20 * SIZE(X), %xmm2 movlps %xmm4, 0 * SIZE(Y) movhps %xmm4, 2 * SIZE(Y) movlps %xmm6, 4 * SIZE(Y) movhps %xmm6, 6 * SIZE(Y) movaps %xmm9, %xmm4 mulps S, %xmm9 movaps %xmm8, %xmm5 mulps C, %xmm8 movaps %xmm11, %xmm6 mulps S, %xmm11 movaps %xmm10, %xmm7 mulps C, %xmm10 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm9, %xmm8 movsd 24 * SIZE(Y), %xmm9 movhps 26 * SIZE(Y), %xmm9 addps %xmm11, %xmm10 movsd 28 * SIZE(Y), %xmm11 movhps 30 * SIZE(Y), %xmm11 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm8, 8 * SIZE(X) movaps 24 * SIZE(X), %xmm8 movaps %xmm10,12 * SIZE(X) movaps 28 * SIZE(X), %xmm10 movlps %xmm4, 8 * SIZE(Y) movhps %xmm4, 10 * SIZE(Y) movlps %xmm6, 12 * SIZE(Y) movhps %xmm6, 14 * SIZE(Y) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm1, %xmm4 mulps S, %xmm1 movaps %xmm3, %xmm6 mulps S, %xmm3 movaps %xmm0, %xmm5 mulps C, %xmm0 movaps %xmm2, %xmm7 mulps C, %xmm2 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 movsd 32 * SIZE(Y), %xmm1 movhps 34 * SIZE(Y), %xmm1 addps %xmm3, %xmm2 movsd 36 * SIZE(Y), %xmm3 movhps 38 * SIZE(Y), %xmm3 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 16 * SIZE(X) movaps 32 * SIZE(X), %xmm0 movaps %xmm2, 20 * SIZE(X) movaps 36 * SIZE(X), %xmm2 movlps %xmm4, 16 * SIZE(Y) movhps %xmm4, 18 * SIZE(Y) movlps %xmm6, 20 * SIZE(Y) movhps %xmm6, 22 * SIZE(Y) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps %xmm9, %xmm4 mulps S, %xmm9 movaps %xmm8, %xmm5 mulps C, %xmm8 movaps %xmm11, %xmm6 mulps S, %xmm11 movaps %xmm10, %xmm7 mulps C, %xmm10 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm9, %xmm8 movsd 40 * SIZE(Y), %xmm9 movhps 42 * SIZE(Y), %xmm9 addps %xmm11, %xmm10 movsd 44 * SIZE(Y), %xmm11 movhps 46 * SIZE(Y), %xmm11 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm8, 24 * SIZE(X) movaps 40 * 
SIZE(X), %xmm8 movaps %xmm10, 28 * SIZE(X) movaps 44 * SIZE(X), %xmm10 movlps %xmm4, 24 * SIZE(Y) movhps %xmm4, 26 * SIZE(Y) movlps %xmm6, 28 * SIZE(Y) movhps %xmm6, 30 * SIZE(Y) addq $32 * SIZE, X addq $32 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L22: movaps %xmm1, %xmm4 mulps S, %xmm1 movaps %xmm3, %xmm6 mulps S, %xmm3 movaps %xmm0, %xmm5 mulps C, %xmm0 movaps %xmm2, %xmm7 mulps C, %xmm2 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 movsd 16 * SIZE(Y), %xmm1 movhps 18 * SIZE(Y), %xmm1 addps %xmm3, %xmm2 movsd 20 * SIZE(Y), %xmm3 movhps 22 * SIZE(Y), %xmm3 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 0 * SIZE(X) movaps 16 * SIZE(X), %xmm0 movaps %xmm2, 4 * SIZE(X) movaps 20 * SIZE(X), %xmm2 movsd %xmm4, 0 * SIZE(Y) movhps %xmm4, 2 * SIZE(Y) movsd %xmm6, 4 * SIZE(Y) movhps %xmm6, 6 * SIZE(Y) movaps %xmm9, %xmm4 mulps S, %xmm9 movaps %xmm8, %xmm5 mulps C, %xmm8 movaps %xmm11, %xmm6 mulps S, %xmm11 movaps %xmm10, %xmm7 mulps C, %xmm10 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm9, %xmm8 movsd 24 * SIZE(Y), %xmm9 movhps 26 * SIZE(Y), %xmm9 addps %xmm11, %xmm10 movsd 28 * SIZE(Y), %xmm11 movhps 30 * SIZE(Y), %xmm11 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm8, 8 * SIZE(X) movaps 24 * SIZE(X), %xmm8 movaps %xmm10,12 * SIZE(X) movaps 28 * SIZE(X), %xmm10 movlps %xmm4, 8 * SIZE(Y) movhps %xmm4, 10 * SIZE(Y) movlps %xmm6, 12 * SIZE(Y) movhps %xmm6, 14 * SIZE(Y) movaps %xmm1, %xmm4 mulps S, %xmm1 movaps %xmm3, %xmm6 mulps S, %xmm3 movaps %xmm0, %xmm5 mulps C, %xmm0 movaps %xmm2, %xmm7 mulps C, %xmm2 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 16 * SIZE(X) movaps %xmm2, 20 * SIZE(X) movlps %xmm4, 16 * SIZE(Y) movhps %xmm4, 18 * SIZE(Y) movlps %xmm6, 20 * SIZE(Y) movhps %xmm6, 22 * SIZE(Y) movaps %xmm9, %xmm4 mulps S, %xmm9 movaps %xmm8, %xmm5 mulps C, %xmm8 movaps %xmm11, %xmm6 mulps S, %xmm11 movaps %xmm10, %xmm7 mulps C, %xmm10 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm9, %xmm8 addps %xmm11, %xmm10 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm8, 24 * SIZE(X) movaps %xmm10, 28 * SIZE(X) movlps %xmm4, 24 * SIZE(Y) movhps %xmm4, 26 * SIZE(Y) movlps %xmm6, 28 * SIZE(Y) movhps %xmm6, 30 * SIZE(Y) addq $32 * SIZE, X addq $32 * SIZE, Y ALIGN_3 .L24: testq $15, N jle .L999 testq $8, N jle .L25 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movsd 4 * SIZE(Y), %xmm3 movhps 6 * SIZE(Y), %xmm3 movaps 4 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 0 * SIZE(X) movaps %xmm2, 4 * SIZE(X) movlps %xmm4, 0 * SIZE(Y) movhps %xmm4, 2 * SIZE(Y) movlps %xmm6, 4 * SIZE(Y) movhps %xmm6, 6 * SIZE(Y) movsd 8 * SIZE(Y), %xmm1 movhps 10 * SIZE(Y), %xmm1 movaps 8 * SIZE(X), %xmm0 movsd 12 * SIZE(Y), %xmm3 movhps 14 * SIZE(Y), %xmm3 movaps 12 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 8 * SIZE(X) movaps %xmm2, 12 * SIZE(X) movlps %xmm4, 8 * SIZE(Y) movhps %xmm4, 10 * SIZE(Y) 
movlps %xmm6, 12 * SIZE(Y) movhps %xmm6, 14 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L25: testq $4, N jle .L26 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movsd 4 * SIZE(Y), %xmm3 movhps 6 * SIZE(Y), %xmm3 movaps 4 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movaps %xmm0, 0 * SIZE(X) movaps %xmm2, 4 * SIZE(X) movlps %xmm4, 0 * SIZE(Y) movhps %xmm4, 2 * SIZE(Y) movlps %xmm6, 4 * SIZE(Y) movhps %xmm6, 6 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L26: testq $2, N jle .L27 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movaps 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movaps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 2 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L27: testq $1, N jle .L999 movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) jmp .L999 ALIGN_3 .L30: movq N, %rax sarq $4, %rax jle .L34 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movsd 4 * SIZE(Y), %xmm3 movhps 6 * SIZE(Y), %xmm3 movsd 8 * SIZE(Y), %xmm9 movhps 10 * SIZE(Y), %xmm9 movsd 12 * SIZE(Y), %xmm11 movhps 14 * SIZE(Y), %xmm11 movsd 0 * SIZE(X), %xmm0 movhps 2 * SIZE(X), %xmm0 movsd 4 * SIZE(X), %xmm2 movhps 6 * SIZE(X), %xmm2 movsd 8 * SIZE(X), %xmm8 movhps 10 * SIZE(X), %xmm8 movsd 12 * SIZE(X), %xmm10 movhps 14 * SIZE(X), %xmm10 decq %rax jle .L32 ALIGN_3 .L31: #if defined(PREFETCHW) PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm1, %xmm4 mulps S, %xmm1 movaps %xmm3, %xmm6 mulps S, %xmm3 movaps %xmm0, %xmm5 mulps C, %xmm0 movaps %xmm2, %xmm7 mulps C, %xmm2 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 movsd 16 * SIZE(Y), %xmm1 movhps 18 * SIZE(Y), %xmm1 addps %xmm3, %xmm2 movsd 20 * SIZE(Y), %xmm3 movhps 22 * SIZE(Y), %xmm3 subps %xmm5, %xmm4 subps %xmm7, %xmm6 #if defined(PREFETCHW) PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movlps %xmm0, 0 * SIZE(X) movhps %xmm0, 2 * SIZE(X) movsd 16 * SIZE(X), %xmm0 movhps 18 * SIZE(X), %xmm0 movlps %xmm2, 4 * SIZE(X) movhps %xmm2, 6 * SIZE(X) movsd 20 * SIZE(X), %xmm2 movhps 22 * SIZE(X), %xmm2 movlps %xmm4, 0 * SIZE(Y) movhps %xmm4, 2 * SIZE(Y) movlps %xmm6, 4 * SIZE(Y) movhps %xmm6, 6 * SIZE(Y) movaps %xmm9, %xmm4 mulps S, %xmm9 movaps %xmm8, %xmm5 mulps C, %xmm8 movaps %xmm11, %xmm6 mulps S, %xmm11 movaps %xmm10, %xmm7 mulps C, %xmm10 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm9, %xmm8 movsd 24 * SIZE(Y), %xmm9 movhps 26 * SIZE(Y), %xmm9 addps %xmm11, %xmm10 movsd 28 * SIZE(Y), %xmm11 movhps 30 * SIZE(Y), %xmm11 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movlps %xmm8, 8 * SIZE(X) movhps %xmm8, 10 * SIZE(X) movsd 24 * SIZE(X), %xmm8 movhps 26 * SIZE(X), %xmm8 movlps %xmm10, 12 * SIZE(X) movhps %xmm10, 14 * SIZE(X) movsd 28 * SIZE(X), %xmm10 movhps 30 * SIZE(X), %xmm10 movlps %xmm4, 8 * SIZE(Y) movhps %xmm4, 10 * SIZE(Y) movlps %xmm6, 12 * SIZE(Y) movhps %xmm6, 14 * SIZE(Y) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif 
movaps %xmm1, %xmm4 mulps S, %xmm1 movaps %xmm3, %xmm6 mulps S, %xmm3 movaps %xmm0, %xmm5 mulps C, %xmm0 movaps %xmm2, %xmm7 mulps C, %xmm2 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 movsd 32 * SIZE(Y), %xmm1 movhps 34 * SIZE(Y), %xmm1 addps %xmm3, %xmm2 movsd 36 * SIZE(Y), %xmm3 movhps 38 * SIZE(Y), %xmm3 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movlps %xmm0, 16 * SIZE(X) movhps %xmm0, 18 * SIZE(X) movsd 32 * SIZE(X), %xmm0 movhps 34 * SIZE(X), %xmm0 movlps %xmm2, 20 * SIZE(X) movhps %xmm2, 22 * SIZE(X) movsd 36 * SIZE(X), %xmm2 movhps 38 * SIZE(X), %xmm2 movlps %xmm4, 16 * SIZE(Y) movhps %xmm4, 18 * SIZE(Y) movlps %xmm6, 20 * SIZE(Y) movhps %xmm6, 22 * SIZE(Y) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps %xmm9, %xmm4 mulps S, %xmm9 movaps %xmm8, %xmm5 mulps C, %xmm8 movaps %xmm11, %xmm6 mulps S, %xmm11 movaps %xmm10, %xmm7 mulps C, %xmm10 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm9, %xmm8 movsd 40 * SIZE(Y), %xmm9 movhps 42 * SIZE(Y), %xmm9 addps %xmm11, %xmm10 movsd 44 * SIZE(Y), %xmm11 movhps 46 * SIZE(Y), %xmm11 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movlps %xmm8, 24 * SIZE(X) movhps %xmm8, 26 * SIZE(X) movsd 40 * SIZE(X), %xmm8 movhps 42 * SIZE(X), %xmm8 movlps %xmm10, 28 * SIZE(X) movhps %xmm10, 30 * SIZE(X) movsd 44 * SIZE(X), %xmm10 movhps 46 * SIZE(X), %xmm10 movlps %xmm4, 24 * SIZE(Y) movhps %xmm4, 26 * SIZE(Y) movlps %xmm6, 28 * SIZE(Y) movhps %xmm6, 30 * SIZE(Y) addq $32 * SIZE, X addq $32 * SIZE, Y decq %rax jg .L31 ALIGN_3 .L32: movaps %xmm1, %xmm4 mulps S, %xmm1 movaps %xmm3, %xmm6 mulps S, %xmm3 movaps %xmm0, %xmm5 mulps C, %xmm0 movaps %xmm2, %xmm7 mulps C, %xmm2 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 movsd 16 * SIZE(Y), %xmm1 movhps 18 * SIZE(Y), %xmm1 addps %xmm3, %xmm2 movsd 20 * SIZE(Y), %xmm3 movhps 22 * SIZE(Y), %xmm3 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movlps %xmm0, 0 * SIZE(X) movhps %xmm0, 2 * SIZE(X) movsd 16 * SIZE(X), %xmm0 movhps 18 * SIZE(X), %xmm0 movlps %xmm2, 4 * SIZE(X) movhps %xmm2, 6 * SIZE(X) movsd 20 * SIZE(X), %xmm2 movhps 22 * SIZE(X), %xmm2 movsd %xmm4, 0 * SIZE(Y) movhps %xmm4, 2 * SIZE(Y) movsd %xmm6, 4 * SIZE(Y) movhps %xmm6, 6 * SIZE(Y) movaps %xmm9, %xmm4 mulps S, %xmm9 movaps %xmm8, %xmm5 mulps C, %xmm8 movaps %xmm11, %xmm6 mulps S, %xmm11 movaps %xmm10, %xmm7 mulps C, %xmm10 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm9, %xmm8 movsd 24 * SIZE(Y), %xmm9 movhps 26 * SIZE(Y), %xmm9 addps %xmm11, %xmm10 movsd 28 * SIZE(Y), %xmm11 movhps 30 * SIZE(Y), %xmm11 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movlps %xmm8, 8 * SIZE(X) movhps %xmm8, 10 * SIZE(X) movsd 24 * SIZE(X), %xmm8 movhps 26 * SIZE(X), %xmm8 movlps %xmm10, 12 * SIZE(X) movhps %xmm10, 14 * SIZE(X) movsd 28 * SIZE(X), %xmm10 movhps 30 * SIZE(X), %xmm10 movlps %xmm4, 8 * SIZE(Y) movhps %xmm4, 10 * SIZE(Y) movlps %xmm6, 12 * SIZE(Y) movhps %xmm6, 14 * SIZE(Y) movaps %xmm1, %xmm4 mulps S, %xmm1 movaps %xmm3, %xmm6 mulps S, %xmm3 movaps %xmm0, %xmm5 mulps C, %xmm0 movaps %xmm2, %xmm7 mulps C, %xmm2 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movlps %xmm0, 16 * SIZE(X) movhps %xmm0, 18 * SIZE(X) movlps %xmm2, 20 * SIZE(X) movhps %xmm2, 22 * SIZE(X) movlps %xmm4, 16 * SIZE(Y) movhps %xmm4, 18 * SIZE(Y) movlps %xmm6, 20 * SIZE(Y) movhps %xmm6, 22 * SIZE(Y) movaps %xmm9, %xmm4 mulps S, %xmm9 movaps %xmm8, %xmm5 mulps C, 
%xmm8 movaps %xmm11, %xmm6 mulps S, %xmm11 movaps %xmm10, %xmm7 mulps C, %xmm10 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm9, %xmm8 addps %xmm11, %xmm10 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movlps %xmm8, 24 * SIZE(X) movhps %xmm8, 26 * SIZE(X) movlps %xmm10, 28 * SIZE(X) movhps %xmm10, 30 * SIZE(X) movlps %xmm4, 24 * SIZE(Y) movhps %xmm4, 26 * SIZE(Y) movlps %xmm6, 28 * SIZE(Y) movhps %xmm6, 30 * SIZE(Y) addq $32 * SIZE, X addq $32 * SIZE, Y ALIGN_3 .L34: testq $15, N jle .L999 testq $8, N jle .L35 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movhps 2 * SIZE(X), %xmm0 movsd 4 * SIZE(Y), %xmm3 movhps 6 * SIZE(Y), %xmm3 movsd 4 * SIZE(X), %xmm2 movhps 6 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movlps %xmm0, 0 * SIZE(X) movhps %xmm0, 2 * SIZE(X) movlps %xmm2, 4 * SIZE(X) movhps %xmm2, 6 * SIZE(X) movlps %xmm4, 0 * SIZE(Y) movhps %xmm4, 2 * SIZE(Y) movlps %xmm6, 4 * SIZE(Y) movhps %xmm6, 6 * SIZE(Y) movsd 8 * SIZE(Y), %xmm1 movhps 10 * SIZE(Y), %xmm1 movsd 8 * SIZE(X), %xmm0 movhps 10 * SIZE(X), %xmm0 movsd 12 * SIZE(Y), %xmm3 movhps 14 * SIZE(Y), %xmm3 movsd 12 * SIZE(X), %xmm2 movhps 14 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movlps %xmm0, 8 * SIZE(X) movhps %xmm0, 10 * SIZE(X) movlps %xmm2, 12 * SIZE(X) movhps %xmm2, 14 * SIZE(X) movlps %xmm4, 8 * SIZE(Y) movhps %xmm4, 10 * SIZE(Y) movlps %xmm6, 12 * SIZE(Y) movhps %xmm6, 14 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L35: testq $4, N jle .L36 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movhps 2 * SIZE(X), %xmm0 movsd 4 * SIZE(Y), %xmm3 movhps 6 * SIZE(Y), %xmm3 movsd 4 * SIZE(X), %xmm2 movhps 6 * SIZE(X), %xmm2 movaps %xmm1, %xmm4 movaps %xmm0, %xmm5 movaps %xmm3, %xmm6 movaps %xmm2, %xmm7 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 mulps C, %xmm4 mulps S, %xmm5 mulps C, %xmm6 mulps S, %xmm7 addps %xmm1, %xmm0 addps %xmm3, %xmm2 subps %xmm5, %xmm4 subps %xmm7, %xmm6 movlps %xmm0, 0 * SIZE(X) movhps %xmm0, 2 * SIZE(X) movlps %xmm2, 4 * SIZE(X) movhps %xmm2, 6 * SIZE(X) movlps %xmm4, 0 * SIZE(Y) movhps %xmm4, 2 * SIZE(Y) movlps %xmm6, 4 * SIZE(Y) movhps %xmm6, 6 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L36: testq $2, N jle .L37 movsd 0 * SIZE(Y), %xmm1 movhps 2 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movhps 2 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movhps %xmm0, 2 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 2 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L37: testq $1, N jle .L999 movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) jmp .L999 ALIGN_3 ALIGN_3 .L50: movq N, %rax cmpq $0, INCX je .L56 cmpq $0, INCY je .L56 sarq $2, %rax jle .L55 ALIGN_3 .L53: movsd (Y), %xmm1 
movhps (Y, INCY), %xmm1 movsd (X), %xmm0 movhps (X, INCX), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, (X) movhps %xmm0, (X, INCX) movlps %xmm2, (Y) movhps %xmm2, (Y, INCY) leaq (X, INCX, 2), X leaq (Y, INCY, 2), Y movsd (Y), %xmm1 movhps (Y, INCY), %xmm1 movsd (X), %xmm0 movhps (X, INCX), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, (X) movhps %xmm0, (X, INCX) movlps %xmm2, (Y) movhps %xmm2, (Y, INCY) leaq (X, INCX, 2), X leaq (Y, INCY, 2), Y decq %rax jg .L53 ALIGN_3 .L55: movq N, %rax andq $3, %rax jle .L999 ALIGN_3 .L56: movsd (Y), %xmm1 movsd (X), %xmm0 movaps %xmm1, %xmm2 movaps %xmm0, %xmm3 mulps C, %xmm0 mulps S, %xmm1 mulps C, %xmm2 mulps S, %xmm3 addps %xmm1, %xmm0 subps %xmm3, %xmm2 movlps %xmm0, (X) movlps %xmm2, (Y) addq INCX, X addq INCY, Y decq %rax jg .L56 ALIGN_3 .L999: RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zrot_sse2.S000066400000000000000000000734731313527062700174560ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define Y ARG4 /* rcx */ #ifndef WINDOWS_ABI #define INCY ARG5 /* r8 */ #else #define INCY %r10 #endif #define C %xmm14 #define S %xmm15 #include "l1param.h" PROLOGUE PROFCODE #ifdef WINDOWS_ABI movq 40(%rsp), INCY movsd 48(%rsp), %xmm0 movsd 56(%rsp), %xmm1 #endif SAVEREGISTERS salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY pshufd $0x44, %xmm0, C pshufd $0x44, %xmm1, S cmpq $0, N jle .L999 cmpq $2 * SIZE, INCX jne .L50 cmpq $2 * SIZE, INCY jne .L50 .L10: testq $SIZE, X jne .L30 testq $SIZE, Y jne .L20 movq N, %rax sarq $3, %rax jle .L14 movapd 0 * SIZE(Y), %xmm1 movapd 2 * SIZE(Y), %xmm3 movapd 4 * SIZE(Y), %xmm9 movapd 6 * SIZE(Y), %xmm11 movapd 0 * SIZE(X), %xmm0 movapd 2 * SIZE(X), %xmm2 movapd 4 * SIZE(X), %xmm8 movapd 6 * SIZE(X), %xmm10 decq %rax jle .L12 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movapd %xmm1, %xmm4 mulpd S, %xmm1 movapd %xmm3, %xmm6 mulpd S, %xmm3 movapd %xmm0, %xmm5 mulpd C, %xmm0 movapd %xmm2, %xmm7 mulpd C, %xmm2 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 movapd 8 * SIZE(Y), %xmm1 addpd %xmm3, %xmm2 movapd 10 * SIZE(Y), %xmm3 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movapd %xmm0, 0 * SIZE(X) movapd 8 * SIZE(X), %xmm0 movapd %xmm2, 2 * SIZE(X) movapd 10 * SIZE(X), %xmm2 movapd %xmm4, 0 * SIZE(Y) movapd %xmm6, 2 * SIZE(Y) movapd %xmm9, %xmm4 mulpd S, %xmm9 movapd %xmm8, %xmm5 mulpd C, %xmm8 movapd %xmm11, %xmm6 mulpd S, %xmm11 movapd %xmm10, %xmm7 mulpd C, %xmm10 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm9, %xmm8 movapd 12 * SIZE(Y), %xmm9 addpd %xmm11, %xmm10 movapd 14 * SIZE(Y), %xmm11 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm8, 4 * SIZE(X) movapd 12 * SIZE(X), %xmm8 movapd %xmm10,6 * SIZE(X) movapd 14 * SIZE(X), %xmm10 movapd %xmm4, 4 * SIZE(Y) movapd %xmm6, 6 * SIZE(Y) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movapd %xmm1, %xmm4 mulpd S, %xmm1 movapd %xmm3, %xmm6 mulpd S, %xmm3 movapd %xmm0, %xmm5 mulpd C, %xmm0 movapd %xmm2, %xmm7 mulpd C, %xmm2 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 movapd 16 * SIZE(Y), %xmm1 addpd %xmm3, %xmm2 movapd 18 * SIZE(Y), %xmm3 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm0, 8 * SIZE(X) movapd 16 * SIZE(X), %xmm0 movapd %xmm2, 10 * SIZE(X) movapd 18 * SIZE(X), %xmm2 movapd %xmm4, 8 * SIZE(Y) movapd %xmm6, 10 * SIZE(Y) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movapd %xmm9, %xmm4 mulpd S, %xmm9 movapd %xmm8, %xmm5 mulpd C, %xmm8 movapd %xmm11, %xmm6 mulpd S, %xmm11 movapd %xmm10, %xmm7 mulpd C, %xmm10 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm9, %xmm8 movapd 20 * SIZE(Y), %xmm9 addpd %xmm11, %xmm10 movapd 22 * SIZE(Y), %xmm11 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm8, 12 * SIZE(X) movapd 20 * SIZE(X), %xmm8 movapd %xmm10, 14 * SIZE(X) movapd 22 * SIZE(X), %xmm10 movapd %xmm4, 12 * SIZE(Y) movapd %xmm6, 14 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y decq %rax jg .L11 ALIGN_3 .L12: movapd %xmm1, %xmm4 mulpd S, %xmm1 movapd %xmm3, %xmm6 mulpd S, %xmm3 movapd %xmm0, %xmm5 mulpd C, %xmm0 movapd %xmm2, %xmm7 mulpd C, %xmm2 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 
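/* Loop drain (.L12): combine the partial products of the last preloaded
   block -- x' = c*x + s*y is written back to X, y' = c*y - s*x back to Y,
   with no further prefetching. */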
addpd %xmm1, %xmm0 movapd 8 * SIZE(Y), %xmm1 addpd %xmm3, %xmm2 movapd 10 * SIZE(Y), %xmm3 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm0, 0 * SIZE(X) movapd 8 * SIZE(X), %xmm0 movapd %xmm2, 2 * SIZE(X) movapd 10 * SIZE(X), %xmm2 movapd %xmm4, 0 * SIZE(Y) movapd %xmm6, 2 * SIZE(Y) movapd %xmm9, %xmm4 mulpd S, %xmm9 movapd %xmm8, %xmm5 mulpd C, %xmm8 movapd %xmm11, %xmm6 mulpd S, %xmm11 movapd %xmm10, %xmm7 mulpd C, %xmm10 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm9, %xmm8 movapd 12 * SIZE(Y), %xmm9 addpd %xmm11, %xmm10 movapd 14 * SIZE(Y), %xmm11 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm8, 4 * SIZE(X) movapd 12 * SIZE(X), %xmm8 movapd %xmm10,6 * SIZE(X) movapd 14 * SIZE(X), %xmm10 movapd %xmm4, 4 * SIZE(Y) movapd %xmm6, 6 * SIZE(Y) movapd %xmm1, %xmm4 mulpd S, %xmm1 movapd %xmm3, %xmm6 mulpd S, %xmm3 movapd %xmm0, %xmm5 mulpd C, %xmm0 movapd %xmm2, %xmm7 mulpd C, %xmm2 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm0, 8 * SIZE(X) movapd %xmm2, 10 * SIZE(X) movapd %xmm4, 8 * SIZE(Y) movapd %xmm6, 10 * SIZE(Y) movapd %xmm9, %xmm4 mulpd S, %xmm9 movapd %xmm8, %xmm5 mulpd C, %xmm8 movapd %xmm11, %xmm6 mulpd S, %xmm11 movapd %xmm10, %xmm7 mulpd C, %xmm10 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm8, 12 * SIZE(X) movapd %xmm10, 14 * SIZE(X) movapd %xmm4, 12 * SIZE(Y) movapd %xmm6, 14 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L14: testq $7, N jle .L999 testq $4, N jle .L15 movapd 0 * SIZE(Y), %xmm1 movapd 0 * SIZE(X), %xmm0 movapd 2 * SIZE(Y), %xmm3 movapd 2 * SIZE(X), %xmm2 movapd %xmm1, %xmm4 movapd %xmm0, %xmm5 movapd %xmm3, %xmm6 movapd %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm0, 0 * SIZE(X) movapd %xmm2, 2 * SIZE(X) movapd %xmm4, 0 * SIZE(Y) movapd %xmm6, 2 * SIZE(Y) movapd 4 * SIZE(Y), %xmm1 movapd 4 * SIZE(X), %xmm0 movapd 6 * SIZE(Y), %xmm3 movapd 6 * SIZE(X), %xmm2 movapd %xmm1, %xmm4 movapd %xmm0, %xmm5 movapd %xmm3, %xmm6 movapd %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm0, 4 * SIZE(X) movapd %xmm2, 6 * SIZE(X) movapd %xmm4, 4 * SIZE(Y) movapd %xmm6, 6 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L15: testq $2, N jle .L16 movapd 0 * SIZE(Y), %xmm1 movapd 0 * SIZE(X), %xmm0 movapd 2 * SIZE(Y), %xmm3 movapd 2 * SIZE(X), %xmm2 movapd %xmm1, %xmm4 movapd %xmm0, %xmm5 movapd %xmm3, %xmm6 movapd %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm0, 0 * SIZE(X) movapd %xmm2, 2 * SIZE(X) movapd %xmm4, 0 * SIZE(Y) movapd %xmm6, 2 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L16: testq $1, N jle .L999 movapd 0 * SIZE(Y), %xmm1 movapd 0 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movapd %xmm2, 0 * SIZE(Y) jmp .L999 ALIGN_3 .L20: movapd -1 * SIZE(Y), %xmm1 movq N, %rax 
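/* .L20 path: X is 16-byte aligned but Y is not. Y is read with aligned
   loads at the neighbouring offsets and realigned via SHUFPD_1; rotated Y
   values are stored back as movlps/movhps halves, X with aligned stores. */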
sarq $3, %rax jle .L24 ALIGN_3 .L21: #if defined(PREFETCHW) PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movapd 1 * SIZE(Y), %xmm3 movapd 3 * SIZE(Y), %xmm8 movapd 0 * SIZE(X), %xmm0 movapd 2 * SIZE(X), %xmm2 SHUFPD_1 %xmm3, %xmm1 SHUFPD_1 %xmm8, %xmm3 movapd %xmm1, %xmm4 movapd %xmm0, %xmm5 movapd %xmm3, %xmm6 movapd %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm0, 0 * SIZE(X) movapd %xmm2, 2 * SIZE(X) movlps %xmm4, 0 * SIZE(Y) movhps %xmm4, 1 * SIZE(Y) movlps %xmm6, 2 * SIZE(Y) movhps %xmm6, 3 * SIZE(Y) #if defined(PREFETCHW) PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movapd 5 * SIZE(Y), %xmm9 movapd 7 * SIZE(Y), %xmm1 movapd 4 * SIZE(X), %xmm0 movapd 6 * SIZE(X), %xmm2 SHUFPD_1 %xmm9, %xmm8 SHUFPD_1 %xmm1, %xmm9 movapd %xmm8, %xmm4 movapd %xmm0, %xmm5 movapd %xmm9, %xmm6 movapd %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm8 mulpd C, %xmm2 mulpd S, %xmm9 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm8, %xmm0 addpd %xmm9, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm0, 4 * SIZE(X) movapd %xmm2, 6 * SIZE(X) movlps %xmm4, 4 * SIZE(Y) movhps %xmm4, 5 * SIZE(Y) movlps %xmm6, 6 * SIZE(Y) movhps %xmm6, 7 * SIZE(Y) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movapd 9 * SIZE(Y), %xmm3 movapd 11 * SIZE(Y), %xmm8 movapd 8 * SIZE(X), %xmm0 movapd 10 * SIZE(X), %xmm2 SHUFPD_1 %xmm3, %xmm1 SHUFPD_1 %xmm8, %xmm3 movapd %xmm1, %xmm4 movapd %xmm0, %xmm5 movapd %xmm3, %xmm6 movapd %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm0, 8 * SIZE(X) movapd %xmm2, 10 * SIZE(X) movlps %xmm4, 8 * SIZE(Y) movhps %xmm4, 9 * SIZE(Y) movlps %xmm6, 10 * SIZE(Y) movhps %xmm6, 11 * SIZE(Y) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movapd 13 * SIZE(Y), %xmm9 movapd 15 * SIZE(Y), %xmm1 movapd 12 * SIZE(X), %xmm0 movapd 14 * SIZE(X), %xmm2 SHUFPD_1 %xmm9, %xmm8 SHUFPD_1 %xmm1, %xmm9 movapd %xmm8, %xmm4 movapd %xmm0, %xmm5 movapd %xmm9, %xmm6 movapd %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm8 mulpd C, %xmm2 mulpd S, %xmm9 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm8, %xmm0 addpd %xmm9, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm0, 12 * SIZE(X) movapd %xmm2, 14 * SIZE(X) movlps %xmm4, 12 * SIZE(Y) movhps %xmm4, 13 * SIZE(Y) movlps %xmm6, 14 * SIZE(Y) movhps %xmm6, 15 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L24: testq $7, N jle .L999 testq $4, N jle .L25 movapd 1 * SIZE(Y), %xmm3 movapd 3 * SIZE(Y), %xmm8 movapd 0 * SIZE(X), %xmm0 movapd 2 * SIZE(X), %xmm2 SHUFPD_1 %xmm3, %xmm1 SHUFPD_1 %xmm8, %xmm3 movapd %xmm1, %xmm4 movapd %xmm0, %xmm5 movapd %xmm3, %xmm6 movapd %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm0, 0 * SIZE(X) movapd %xmm2, 2 * SIZE(X) movlps %xmm4, 0 * SIZE(Y) movhps %xmm4, 1 * SIZE(Y) movlps %xmm6, 2 * SIZE(Y) movhps %xmm6, 3 * SIZE(Y) movapd 5 * SIZE(Y), %xmm9 movapd 7 * SIZE(Y), %xmm1 movapd 4 * SIZE(X), %xmm0 movapd 6 * SIZE(X), %xmm2 SHUFPD_1 %xmm9, %xmm8 SHUFPD_1 %xmm1, %xmm9 
movapd %xmm8, %xmm4 movapd %xmm0, %xmm5 movapd %xmm9, %xmm6 movapd %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm8 mulpd C, %xmm2 mulpd S, %xmm9 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm8, %xmm0 addpd %xmm9, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm0, 4 * SIZE(X) movapd %xmm2, 6 * SIZE(X) movlps %xmm4, 4 * SIZE(Y) movhps %xmm4, 5 * SIZE(Y) movlps %xmm6, 6 * SIZE(Y) movhps %xmm6, 7 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L25: testq $2, N jle .L26 movapd 1 * SIZE(Y), %xmm3 movapd 3 * SIZE(Y), %xmm8 movapd 0 * SIZE(X), %xmm0 movapd 2 * SIZE(X), %xmm2 SHUFPD_1 %xmm3, %xmm1 SHUFPD_1 %xmm8, %xmm3 movapd %xmm1, %xmm4 movapd %xmm0, %xmm5 movapd %xmm3, %xmm6 movapd %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm0, 0 * SIZE(X) movapd %xmm2, 2 * SIZE(X) movlps %xmm4, 0 * SIZE(Y) movhps %xmm4, 1 * SIZE(Y) movlps %xmm6, 2 * SIZE(Y) movhps %xmm6, 3 * SIZE(Y) movapd %xmm8, %xmm1 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L26: testq $1, N jle .L999 movapd 1 * SIZE(Y), %xmm4 movapd 0 * SIZE(X), %xmm0 SHUFPD_1 %xmm4, %xmm1 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 1 * SIZE(Y) jmp .L999 ALIGN_3 .L30: testq $SIZE, Y jne .L40 movapd -1 * SIZE(X), %xmm0 movq N, %rax sarq $3, %rax jle .L34 ALIGN_3 .L31: #if defined(PREFETCHW) PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movapd 1 * SIZE(X), %xmm2 movapd 3 * SIZE(X), %xmm8 movapd 0 * SIZE(Y), %xmm1 movapd 2 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 SHUFPD_1 %xmm8, %xmm2 movapd %xmm1, %xmm4 movapd %xmm0, %xmm5 movapd %xmm3, %xmm6 movapd %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movlps %xmm0, 0 * SIZE(X) movhps %xmm0, 1 * SIZE(X) movlps %xmm2, 2 * SIZE(X) movhps %xmm2, 3 * SIZE(X) movapd %xmm4, 0 * SIZE(Y) movapd %xmm6, 2 * SIZE(Y) #if defined(PREFETCHW) PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movapd 5 * SIZE(X), %xmm2 movapd 7 * SIZE(X), %xmm0 movapd 4 * SIZE(Y), %xmm1 movapd 6 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm8 SHUFPD_1 %xmm0, %xmm2 movapd %xmm1, %xmm4 movapd %xmm8, %xmm5 movapd %xmm3, %xmm6 movapd %xmm2, %xmm7 mulpd C, %xmm8 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm8 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movlps %xmm8, 4 * SIZE(X) movhps %xmm8, 5 * SIZE(X) movlps %xmm2, 6 * SIZE(X) movhps %xmm2, 7 * SIZE(X) movapd %xmm4, 4 * SIZE(Y) movapd %xmm6, 6 * SIZE(Y) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movapd 9 * SIZE(X), %xmm2 movapd 11 * SIZE(X), %xmm8 movapd 8 * SIZE(Y), %xmm1 movapd 10 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 SHUFPD_1 %xmm8, %xmm2 movapd %xmm1, %xmm4 movapd %xmm0, %xmm5 movapd %xmm3, %xmm6 movapd %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movlps %xmm0, 8 * SIZE(X) movhps %xmm0, 9 * SIZE(X) movlps %xmm2, 10 * SIZE(X) movhps %xmm2, 11 * SIZE(X) movapd %xmm4, 8 * SIZE(Y) 
movapd %xmm6, 10 * SIZE(Y) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movapd 13 * SIZE(X), %xmm2 movapd 15 * SIZE(X), %xmm0 movapd 12 * SIZE(Y), %xmm1 movapd 14 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm8 SHUFPD_1 %xmm0, %xmm2 movapd %xmm1, %xmm4 movapd %xmm8, %xmm5 movapd %xmm3, %xmm6 movapd %xmm2, %xmm7 mulpd C, %xmm8 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm8 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movlps %xmm8, 12 * SIZE(X) movhps %xmm8, 13 * SIZE(X) movlps %xmm2, 14 * SIZE(X) movhps %xmm2, 15 * SIZE(X) movapd %xmm4, 12 * SIZE(Y) movapd %xmm6, 14 * SIZE(Y) addq $16 * SIZE, Y addq $16 * SIZE, X decq %rax jg .L31 ALIGN_3 .L34: testq $7, N jle .L999 testq $4, N jle .L35 movapd 1 * SIZE(X), %xmm2 movapd 3 * SIZE(X), %xmm8 movapd 0 * SIZE(Y), %xmm1 movapd 2 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 SHUFPD_1 %xmm8, %xmm2 movapd %xmm1, %xmm4 movapd %xmm0, %xmm5 movapd %xmm3, %xmm6 movapd %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movlps %xmm0, 0 * SIZE(X) movhps %xmm0, 1 * SIZE(X) movlps %xmm2, 2 * SIZE(X) movhps %xmm2, 3 * SIZE(X) movapd %xmm4, 0 * SIZE(Y) movapd %xmm6, 2 * SIZE(Y) movapd 5 * SIZE(X), %xmm2 movapd 7 * SIZE(X), %xmm0 movapd 4 * SIZE(Y), %xmm1 movapd 6 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm8 SHUFPD_1 %xmm0, %xmm2 movapd %xmm1, %xmm4 movapd %xmm8, %xmm5 movapd %xmm3, %xmm6 movapd %xmm2, %xmm7 mulpd C, %xmm8 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm8 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movlps %xmm8, 4 * SIZE(X) movhps %xmm8, 5 * SIZE(X) movlps %xmm2, 6 * SIZE(X) movhps %xmm2, 7 * SIZE(X) movapd %xmm4, 4 * SIZE(Y) movapd %xmm6, 6 * SIZE(Y) addq $8 * SIZE, Y addq $8 * SIZE, X ALIGN_3 .L35: testq $2, N jle .L36 movapd 1 * SIZE(X), %xmm2 movapd 3 * SIZE(X), %xmm8 movapd 0 * SIZE(Y), %xmm1 movapd 2 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 SHUFPD_1 %xmm8, %xmm2 movapd %xmm1, %xmm4 movapd %xmm0, %xmm5 movapd %xmm3, %xmm6 movapd %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movlps %xmm0, 0 * SIZE(X) movhps %xmm0, 1 * SIZE(X) movlps %xmm2, 2 * SIZE(X) movhps %xmm2, 3 * SIZE(X) movapd %xmm4, 0 * SIZE(Y) movapd %xmm6, 2 * SIZE(Y) movapd %xmm8, %xmm0 addq $4 * SIZE, Y addq $4 * SIZE, X ALIGN_3 .L36: testq $1, N jle .L999 movapd 1 * SIZE(X), %xmm4 movapd 0 * SIZE(Y), %xmm1 SHUFPD_1 %xmm4, %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movhps %xmm0, 1 * SIZE(X) movapd %xmm2, 0 * SIZE(Y) jmp .L999 ALIGN_3 .L40: movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulsd C, %xmm0 mulsd S, %xmm1 mulsd C, %xmm2 mulsd S, %xmm3 addsd %xmm1, %xmm0 subsd %xmm3, %xmm2 movsd %xmm0, 0 * SIZE(X) movsd %xmm2, 0 * SIZE(Y) addq $1 * SIZE, Y addq $1 * SIZE, X decq N jle .L47 movq N, %rax sarq $3, %rax jle .L44 movapd 0 * SIZE(Y), %xmm1 movapd 2 * SIZE(Y), %xmm3 movapd 4 * SIZE(Y), %xmm9 movapd 6 * SIZE(Y), %xmm11 movapd 0 * SIZE(X), %xmm0 movapd 2 * SIZE(X), %xmm2 movapd 4 * SIZE(X), %xmm8 
movapd 6 * SIZE(X), %xmm10 decq %rax jle .L42 ALIGN_3 .L41: #if defined(PREFETCHW) PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movapd %xmm1, %xmm4 mulpd S, %xmm1 movapd %xmm3, %xmm6 mulpd S, %xmm3 movapd %xmm0, %xmm5 mulpd C, %xmm0 movapd %xmm2, %xmm7 mulpd C, %xmm2 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 movapd 8 * SIZE(Y), %xmm1 addpd %xmm3, %xmm2 movapd 10 * SIZE(Y), %xmm3 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 #if defined(PREFETCHW) PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movapd %xmm0, 0 * SIZE(X) movapd 8 * SIZE(X), %xmm0 movapd %xmm2, 2 * SIZE(X) movapd 10 * SIZE(X), %xmm2 movapd %xmm4, 0 * SIZE(Y) movapd %xmm6, 2 * SIZE(Y) movapd %xmm9, %xmm4 mulpd S, %xmm9 movapd %xmm8, %xmm5 mulpd C, %xmm8 movapd %xmm11, %xmm6 mulpd S, %xmm11 movapd %xmm10, %xmm7 mulpd C, %xmm10 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm9, %xmm8 movapd 12 * SIZE(Y), %xmm9 addpd %xmm11, %xmm10 movapd 14 * SIZE(Y), %xmm11 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm8, 4 * SIZE(X) movapd 12 * SIZE(X), %xmm8 movapd %xmm10,6 * SIZE(X) movapd 14 * SIZE(X), %xmm10 movapd %xmm4, 4 * SIZE(Y) movapd %xmm6, 6 * SIZE(Y) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movapd %xmm1, %xmm4 mulpd S, %xmm1 movapd %xmm3, %xmm6 mulpd S, %xmm3 movapd %xmm0, %xmm5 mulpd C, %xmm0 movapd %xmm2, %xmm7 mulpd C, %xmm2 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 movapd 16 * SIZE(Y), %xmm1 addpd %xmm3, %xmm2 movapd 18 * SIZE(Y), %xmm3 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm0, 8 * SIZE(X) movapd 16 * SIZE(X), %xmm0 movapd %xmm2, 10 * SIZE(X) movapd 18 * SIZE(X), %xmm2 movapd %xmm4, 8 * SIZE(Y) movapd %xmm6, 10 * SIZE(Y) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movapd %xmm9, %xmm4 mulpd S, %xmm9 movapd %xmm8, %xmm5 mulpd C, %xmm8 movapd %xmm11, %xmm6 mulpd S, %xmm11 movapd %xmm10, %xmm7 mulpd C, %xmm10 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm9, %xmm8 movapd 20 * SIZE(Y), %xmm9 addpd %xmm11, %xmm10 movapd 22 * SIZE(Y), %xmm11 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm8, 12 * SIZE(X) movapd 20 * SIZE(X), %xmm8 movapd %xmm10, 14 * SIZE(X) movapd 22 * SIZE(X), %xmm10 movapd %xmm4, 12 * SIZE(Y) movapd %xmm6, 14 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y decq %rax jg .L41 ALIGN_3 .L42: movapd %xmm1, %xmm4 mulpd S, %xmm1 movapd %xmm3, %xmm6 mulpd S, %xmm3 movapd %xmm0, %xmm5 mulpd C, %xmm0 movapd %xmm2, %xmm7 mulpd C, %xmm2 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 movapd 8 * SIZE(Y), %xmm1 addpd %xmm3, %xmm2 movapd 10 * SIZE(Y), %xmm3 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm0, 0 * SIZE(X) movapd 8 * SIZE(X), %xmm0 movapd %xmm2, 2 * SIZE(X) movapd 10 * SIZE(X), %xmm2 movapd %xmm4, 0 * SIZE(Y) movapd %xmm6, 2 * SIZE(Y) movapd %xmm9, %xmm4 mulpd S, %xmm9 movapd %xmm8, %xmm5 mulpd C, %xmm8 movapd %xmm11, %xmm6 mulpd S, %xmm11 movapd %xmm10, %xmm7 mulpd C, %xmm10 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm9, %xmm8 movapd 12 * SIZE(Y), %xmm9 addpd %xmm11, %xmm10 movapd 14 * SIZE(Y), %xmm11 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm8, 4 * SIZE(X) movapd 12 * SIZE(X), %xmm8 movapd %xmm10,6 * SIZE(X) movapd 14 * SIZE(X), %xmm10 movapd %xmm4, 4 * SIZE(Y) movapd %xmm6, 6 * SIZE(Y) movapd %xmm1, %xmm4 mulpd S, %xmm1 movapd %xmm3, %xmm6 mulpd S, %xmm3 movapd %xmm0, %xmm5 mulpd C, %xmm0 movapd %xmm2, %xmm7 
mulpd C, %xmm2 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm0, 8 * SIZE(X) movapd %xmm2, 10 * SIZE(X) movapd %xmm4, 8 * SIZE(Y) movapd %xmm6, 10 * SIZE(Y) movapd %xmm9, %xmm4 mulpd S, %xmm9 movapd %xmm8, %xmm5 mulpd C, %xmm8 movapd %xmm11, %xmm6 mulpd S, %xmm11 movapd %xmm10, %xmm7 mulpd C, %xmm10 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm8, 12 * SIZE(X) movapd %xmm10, 14 * SIZE(X) movapd %xmm4, 12 * SIZE(Y) movapd %xmm6, 14 * SIZE(Y) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L44: testq $4, N jle .L45 movapd 0 * SIZE(Y), %xmm1 movapd 0 * SIZE(X), %xmm0 movapd 2 * SIZE(Y), %xmm3 movapd 2 * SIZE(X), %xmm2 movapd %xmm1, %xmm4 movapd %xmm0, %xmm5 movapd %xmm3, %xmm6 movapd %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm0, 0 * SIZE(X) movapd %xmm2, 2 * SIZE(X) movapd %xmm4, 0 * SIZE(Y) movapd %xmm6, 2 * SIZE(Y) movapd 4 * SIZE(Y), %xmm1 movapd 4 * SIZE(X), %xmm0 movapd 6 * SIZE(Y), %xmm3 movapd 6 * SIZE(X), %xmm2 movapd %xmm1, %xmm4 movapd %xmm0, %xmm5 movapd %xmm3, %xmm6 movapd %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm0, 4 * SIZE(X) movapd %xmm2, 6 * SIZE(X) movapd %xmm4, 4 * SIZE(Y) movapd %xmm6, 6 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L45: testq $2, N jle .L46 movapd 0 * SIZE(Y), %xmm1 movapd 0 * SIZE(X), %xmm0 movapd 2 * SIZE(Y), %xmm3 movapd 2 * SIZE(X), %xmm2 movapd %xmm1, %xmm4 movapd %xmm0, %xmm5 movapd %xmm3, %xmm6 movapd %xmm2, %xmm7 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 mulpd C, %xmm4 mulpd S, %xmm5 mulpd C, %xmm6 mulpd S, %xmm7 addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 movapd %xmm0, 0 * SIZE(X) movapd %xmm2, 2 * SIZE(X) movapd %xmm4, 0 * SIZE(Y) movapd %xmm6, 2 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L46: testq $1, N jle .L47 movapd 0 * SIZE(Y), %xmm1 movapd 0 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movapd %xmm0, 0 * SIZE(X) movapd %xmm2, 0 * SIZE(Y) addq $2 * SIZE, Y addq $2 * SIZE, X ALIGN_3 .L47: movsd 0 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulsd C, %xmm0 mulsd S, %xmm1 mulsd C, %xmm2 mulsd S, %xmm3 addsd %xmm1, %xmm0 subsd %xmm3, %xmm2 movsd %xmm0, 0 * SIZE(X) movsd %xmm2, 0 * SIZE(Y) jmp .L999 ALIGN_3 .L50: movq N, %rax sarq $2, %rax jle .L55 ALIGN_3 .L53: movsd 0 * SIZE(Y), %xmm1 movhps 1 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movhps %xmm0, 1 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 1 * SIZE(Y) addq INCX, X addq INCY, Y movsd 0 * SIZE(Y), %xmm1 movhps 1 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) 
movhps %xmm0, 1 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 1 * SIZE(Y) addq INCX, X addq INCY, Y movsd 0 * SIZE(Y), %xmm1 movhps 1 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movhps %xmm0, 1 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 1 * SIZE(Y) addq INCX, X addq INCY, Y movsd 0 * SIZE(Y), %xmm1 movhps 1 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movhps %xmm0, 1 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 1 * SIZE(Y) addq INCX, X addq INCY, Y decq %rax jg .L53 ALIGN_3 .L55: movq N, %rax andq $3, %rax jle .L999 ALIGN_3 .L56: movsd 0 * SIZE(Y), %xmm1 movhps 1 * SIZE(Y), %xmm1 movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 movapd %xmm1, %xmm2 movapd %xmm0, %xmm3 mulpd C, %xmm0 mulpd S, %xmm1 mulpd C, %xmm2 mulpd S, %xmm3 addpd %xmm1, %xmm0 subpd %xmm3, %xmm2 movlps %xmm0, 0 * SIZE(X) movhps %xmm0, 1 * SIZE(X) movlps %xmm2, 0 * SIZE(Y) movhps %xmm2, 1 * SIZE(Y) addq INCX, X addq INCY, Y decq %rax jg .L56 ALIGN_3 .L999: RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zscal.S000066400000000000000000000112351313527062700166240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define N ARG1 #define X ARG4 #define INCX ARG5 #define I %rax #include "l1param.h" PROLOGUE PROFCODE salq $ZBASE_SHIFT, INCX FLD 8(%rsp) FLD 24(%rsp) testq N, N jle .L999 fld %st(1) fabs fld %st(1) fabs faddp %st, %st(1) fldz fcomip %st(1), %st ffreep %st jne .L30 EMMS pxor %mm0, %mm0 cmpq $2 * SIZE, INCX jne .L20 movq N, I sarq $2, I jle .L15 ALIGN_4 .L12: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movq %mm0, 0(X) movq %mm0, 8(X) movq %mm0, 16(X) movq %mm0, 24(X) movq %mm0, 32(X) movq %mm0, 40(X) movq %mm0, 48(X) movq %mm0, 56(X) movq %mm0, 64(X) movq %mm0, 72(X) movq %mm0, 80(X) movq %mm0, 88(X) movq %mm0, 96(X) movq %mm0, 104(X) movq %mm0, 112(X) movq %mm0, 120(X) addq $8 * SIZE, X decq I jg .L12 ALIGN_3 .L15: movq N, I andq $3, I jle .L18 ALIGN_2 .L16: movq %mm0, 0(X) movq %mm0, 8(X) movq %mm0, 16(X) movq %mm0, 24(X) addq $2 * SIZE, X decq I jg .L16 .L18: EMMS ret ALIGN_2 .L20: movq N, I sarq $2, I jle .L25 ALIGN_3 .L22: movq %mm0, 0(X) movq %mm0, 8(X) movq %mm0, 16(X) movq %mm0, 24(X) addq INCX, X movq %mm0, 0(X) movq %mm0, 8(X) movq %mm0, 16(X) movq %mm0, 24(X) addq INCX, X movq %mm0, 0(X) movq %mm0, 8(X) movq %mm0, 16(X) movq %mm0, 24(X) addq INCX, X movq %mm0, 0(X) movq %mm0, 8(X) movq %mm0, 16(X) movq %mm0, 24(X) addq INCX, X decq I jg .L22 ALIGN_3 .L25: movq N, I andq $3, I jle .L28 ALIGN_3 .L26: movq %mm0, 0(X) movq %mm0, 8(X) movq %mm0, 16(X) movq %mm0, 24(X) addq INCX, X decq I jg .L26 .L28: EMMS ret ALIGN_3 .L30: movq N, I ALIGN_2 .L32: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) fmul %st(1),%st FLD 1 * SIZE(X) fmul %st(3),%st faddp %st,%st(1) FLD 0 * SIZE(X) fmul %st(3),%st FLD 1 * SIZE(X) fmul %st(3),%st fsubrp %st,%st(1) FST 0 * SIZE(X) FST 1 * SIZE(X) addq INCX, X decq I jg .L32 ALIGN_2 .L999: ffreep %st ffreep %st ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zscal.c000066400000000000000000000174231313527062700166510ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013 - 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #if defined(HASWELL) || defined(ZEN) #include "zscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "zscal_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(EXCAVATOR) #include "zscal_microk_steamroller-2.c" #endif #if !defined(HAVE_KERNEL_8) static void zscal_kernel_8( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha , FLOAT *x ) __attribute__ ((noinline)); static void zscal_kernel_8( BLASLONG n, FLOAT *alpha , FLOAT *x ) { BLASLONG i; FLOAT da_r = alpha[0]; FLOAT da_i = alpha[1]; FLOAT t0,t1,t2,t3; for( i=0; i 0 ) { alpha[0] = da_r; alpha[1] = da_i; zscal_kernel_inc_8(n1, alpha, x, inc_x); j = n1 ; i = n1 * inc_x; } while(j < n) { temp0 = da_r * x[i] - da_i * x[i+1]; x[i+1] = da_r * x[i+1] + da_i * x[i]; x[i] = temp0; i += inc_x ; j++; } } } return(0); } BLASLONG n1 = n & -8; if ( n1 > 0 ) { alpha[0] = da_r; alpha[1] = da_i; if ( da_r == 0.0 ) if ( da_i == 0 ) zscal_kernel_8_zero(n1 , alpha , x); else zscal_kernel_8_zero_r(n1 , alpha , x); else if ( da_i == 0 ) zscal_kernel_8_zero_i(n1 , alpha , x); else zscal_kernel_8(n1 , alpha , x); i = n1 << 1; j = n1; } if ( da_r == 0.0 ) { if ( da_i == 0.0 ) { while(j < n) { x[i]=0.0; x[i+1]=0.0; i += 2 ; j++; } } else { while(j < n) { temp0 = -da_i * x[i+1]; x[i+1] = da_i * x[i]; x[i] = temp0; i += 2 ; j++; } } } else { if ( da_i == 0.0 ) { while(j < n) { temp0 = da_r * x[i]; x[i+1] = da_r * x[i+1]; x[i] = temp0; i += 2 ; j++; } } else { while(j < n) { temp0 = da_r * x[i] - da_i * x[i+1]; x[i+1] = da_r * x[i+1] + da_i * x[i]; x[i] = temp0; i += 2 ; j++; } } } return(0); } OpenBLAS-0.2.20/kernel/x86_64/zscal_atom.S000066400000000000000000000200371313527062700176440ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 #define X ARG4 #define INCX ARG5 #else #define M ARG1 #define X ARG2 #define INCX ARG3 #endif #define XX %r10 #define I %rax #include "l1param.h" PROLOGUE PROFCODE #ifdef WINDOWS_ABI movaps %xmm3, %xmm0 movsd 40(%rsp), %xmm1 movq 48(%rsp), X movq 56(%rsp), INCX #endif SAVEREGISTERS salq $ZBASE_SHIFT, INCX testq M, M jle .L999 pxor %xmm15, %xmm15 comisd %xmm0, %xmm15 jne .L30 # Alpha_r != ZERO comisd %xmm1, %xmm15 jne .L30 # Alpha_i != ZERO /* Alpha == ZERO */ cmpq $2 * SIZE, INCX jne .L20 movq M, I sarq $2, I jle .L12 ALIGN_4 .L11: movsd %xmm1, 0 * SIZE(X) movsd %xmm1, 1 * SIZE(X) movsd %xmm1, 2 * SIZE(X) movsd %xmm1, 3 * SIZE(X) movsd %xmm1, 4 * SIZE(X) movsd %xmm1, 5 * SIZE(X) movsd %xmm1, 6 * SIZE(X) movsd %xmm1, 7 * SIZE(X) addq $8 * SIZE, X decq I jg .L11 ALIGN_4 .L12: testq $2, M je .L14 movsd %xmm1, 0 * SIZE(X) movsd %xmm1, 1 * SIZE(X) movsd %xmm1, 2 * SIZE(X) movsd %xmm1, 3 * SIZE(X) addq $4 * SIZE, X ALIGN_3 .L14: testq $1, M je .L999 movsd %xmm1, 0 * SIZE(X) movsd %xmm1, 1 * SIZE(X) addq $2 * SIZE, X jmp .L999 ALIGN_4 .L20: movq M, I # rcx = n sarq $2, I jle .L22 ALIGN_4 .L21: movsd %xmm1, 0 * SIZE(X) movsd %xmm1, 1 * SIZE(X) addq INCX, X movsd %xmm1, 0 * SIZE(X) movsd %xmm1, 1 * SIZE(X) addq INCX, X movsd %xmm1, 0 * SIZE(X) movsd %xmm1, 1 * SIZE(X) addq INCX, X movsd %xmm1, 0 * SIZE(X) movsd %xmm1, 1 * SIZE(X) addq INCX, X decq I jg .L21 ALIGN_4 .L22: testq $2, M je .L23 movsd %xmm1, 0 * SIZE(X) movsd %xmm1, 1 * SIZE(X) addq INCX, X movsd %xmm1, 0 * SIZE(X) movsd %xmm1, 1 * SIZE(X) addq INCX, X ALIGN_3 .L23: testq $1, M je .L999 movsd %xmm1, 0 * SIZE(X) movsd %xmm1, 1 * SIZE(X) jmp .L999 ALIGN_4 /* Alpha != ZERO */ .L30: movq X, XX movq M, I sarq $2, I jle .L35 movsd 0 * SIZE(X), %xmm2 movsd 1 * SIZE(X), %xmm3 addq INCX, X movsd 0 * SIZE(X), %xmm6 movsd 1 * SIZE(X), %xmm7 addq INCX, X movaps %xmm2, %xmm4 movsd 0 * SIZE(X), %xmm8 mulsd %xmm0, %xmm2 movaps %xmm3, %xmm5 movsd 1 * SIZE(X), %xmm9 mulsd %xmm1, %xmm5 addq INCX, X mulsd %xmm0, %xmm3 mulsd %xmm1, %xmm4 subsd %xmm5, %xmm2 movsd 0 * SIZE(X), %xmm10 addsd %xmm4, %xmm3 movsd 1 * SIZE(X), %xmm11 movaps %xmm6, %xmm4 mulsd %xmm0, %xmm6 addq INCX, X movaps %xmm7, %xmm5 mulsd %xmm1, %xmm5 mulsd %xmm0, %xmm7 mulsd %xmm1, %xmm4 decq I jle .L32 ALIGN_4 .L31: #ifdef PREFETCHW PREFETCHW 
(PREFETCHSIZE + 0) - PREOFFSET(X) #endif subsd %xmm5, %xmm6 movsd %xmm2, 0 * SIZE(XX) addsd %xmm4, %xmm7 movsd %xmm3, 1 * SIZE(XX) movaps %xmm8, %xmm4 movsd 0 * SIZE(X), %xmm2 mulsd %xmm0, %xmm8 addq INCX, XX movaps %xmm9, %xmm5 movsd 1 * SIZE(X), %xmm3 mulsd %xmm1, %xmm5 addq INCX, X mulsd %xmm0, %xmm9 mulsd %xmm1, %xmm4 subsd %xmm5, %xmm8 movsd %xmm6, 0 * SIZE(XX) addsd %xmm4, %xmm9 movsd %xmm7, 1 * SIZE(XX) movaps %xmm10, %xmm4 movsd 0 * SIZE(X), %xmm6 mulsd %xmm0, %xmm10 addq INCX, XX movaps %xmm11, %xmm5 movsd 1 * SIZE(X), %xmm7 mulsd %xmm1, %xmm5 addq INCX, X mulsd %xmm0, %xmm11 mulsd %xmm1, %xmm4 subsd %xmm5, %xmm10 movsd %xmm8, 0 * SIZE(XX) addsd %xmm4, %xmm11 movsd %xmm9, 1 * SIZE(XX) movaps %xmm2, %xmm4 movsd 0 * SIZE(X), %xmm8 mulsd %xmm0, %xmm2 addq INCX, XX movaps %xmm3, %xmm5 movsd 1 * SIZE(X), %xmm9 mulsd %xmm1, %xmm5 addq INCX, X mulsd %xmm0, %xmm3 mulsd %xmm1, %xmm4 subsd %xmm5, %xmm2 movsd %xmm10, 0 * SIZE(XX) addsd %xmm4, %xmm3 movsd %xmm11, 1 * SIZE(XX) movaps %xmm6, %xmm4 movsd 0 * SIZE(X), %xmm10 mulsd %xmm0, %xmm6 addq INCX, XX movaps %xmm7, %xmm5 movsd 1 * SIZE(X), %xmm11 mulsd %xmm1, %xmm5 addq INCX, X mulsd %xmm0, %xmm7 mulsd %xmm1, %xmm4 decq I jg .L31 ALIGN_4 .L32: subsd %xmm5, %xmm6 movsd %xmm2, 0 * SIZE(XX) addsd %xmm4, %xmm7 movsd %xmm3, 1 * SIZE(XX) movaps %xmm8, %xmm4 mulsd %xmm0, %xmm8 addq INCX, XX movaps %xmm9, %xmm5 mulsd %xmm1, %xmm5 mulsd %xmm0, %xmm9 mulsd %xmm1, %xmm4 subsd %xmm5, %xmm8 movsd %xmm6, 0 * SIZE(XX) addsd %xmm4, %xmm9 movsd %xmm7, 1 * SIZE(XX) movaps %xmm10, %xmm4 mulsd %xmm0, %xmm10 addq INCX, XX movaps %xmm11, %xmm5 mulsd %xmm1, %xmm5 mulsd %xmm0, %xmm11 mulsd %xmm1, %xmm4 subsd %xmm5, %xmm10 movsd %xmm8, 0 * SIZE(XX) addsd %xmm4, %xmm11 movsd %xmm9, 1 * SIZE(XX) addq INCX, XX movsd %xmm10, 0 * SIZE(XX) movsd %xmm11, 1 * SIZE(XX) addq INCX, XX ALIGN_3 .L35: testq $2, M je .L37 movsd 0 * SIZE(X), %xmm2 movsd 1 * SIZE(X), %xmm3 addq INCX, X movaps %xmm2, %xmm4 movsd 0 * SIZE(X), %xmm6 mulsd %xmm0, %xmm2 movaps %xmm3, %xmm5 movsd 1 * SIZE(X), %xmm7 mulsd %xmm1, %xmm5 addq INCX, X mulsd %xmm0, %xmm3 mulsd %xmm1, %xmm4 subsd %xmm5, %xmm2 addsd %xmm4, %xmm3 movaps %xmm6, %xmm4 mulsd %xmm0, %xmm6 movaps %xmm7, %xmm5 mulsd %xmm1, %xmm5 mulsd %xmm0, %xmm7 mulsd %xmm1, %xmm4 subsd %xmm5, %xmm6 movsd %xmm2, 0 * SIZE(XX) addsd %xmm4, %xmm7 movsd %xmm3, 1 * SIZE(XX) addq INCX, XX movsd %xmm6, 0 * SIZE(XX) movsd %xmm7, 1 * SIZE(XX) addq INCX, XX ALIGN_3 .L37: testq $1, M je .L999 movsd 0 * SIZE(X), %xmm2 movsd 1 * SIZE(X), %xmm3 movaps %xmm2, %xmm4 mulsd %xmm0, %xmm2 movaps %xmm3, %xmm5 mulsd %xmm1, %xmm5 mulsd %xmm0, %xmm3 mulsd %xmm1, %xmm4 subsd %xmm5, %xmm2 addsd %xmm4, %xmm3 movsd %xmm2, 0 * SIZE(XX) movsd %xmm3, 1 * SIZE(XX) ALIGN_3 .L999: xorq %rax, %rax RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zscal_microk_bulldozer-2.c000066400000000000000000000250111313527062700224260ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014-2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #define HAVE_KERNEL_8 1 static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vmovddup (%2), %%xmm0 \n\t" // da_r "vmovddup 8(%2), %%xmm1 \n\t" // da_i "addq $128, %1 \n\t" "vmovups -128(%1), %%xmm4 \n\t" "vmovups -112(%1), %%xmm5 \n\t" "vmovups -96(%1), %%xmm6 \n\t" "vmovups -80(%1), %%xmm7 \n\t" "vpermilpd $0x01 , %%xmm4, %%xmm12 \n\t" "vpermilpd $0x01 , %%xmm5, %%xmm13 \n\t" "vpermilpd $0x01 , %%xmm6, %%xmm14 \n\t" "vpermilpd $0x01 , %%xmm7, %%xmm15 \n\t" "subq $4 , %0 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" "prefetcht0 192(%1) \n\t" // ".align 2 \n\t" "vmulpd %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 "vmovups -64(%1), %%xmm4 \n\t" "vmulpd %%xmm0, %%xmm5 , %%xmm9 \n\t" "vmovups -48(%1), %%xmm5 \n\t" "vmulpd %%xmm0, %%xmm6 , %%xmm10 \n\t" "vmovups -32(%1), %%xmm6 \n\t" "vmulpd %%xmm0, %%xmm7 , %%xmm11 \n\t" "vmovups -16(%1), %%xmm7 \n\t" "vmulpd %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 "vaddsubpd %%xmm12 , %%xmm8 , %%xmm8 \n\t" "vmulpd %%xmm1, %%xmm13, %%xmm13 \n\t" "vaddsubpd %%xmm13 , %%xmm9 , %%xmm9 \n\t" "vmulpd %%xmm1, %%xmm14, %%xmm14 \n\t" "vaddsubpd %%xmm14 , %%xmm10, %%xmm10 \n\t" "vmulpd %%xmm1, %%xmm15, %%xmm15 \n\t" "vaddsubpd %%xmm15 , %%xmm11, %%xmm11 \n\t" "vmovups %%xmm8 , -128(%1) \n\t" "vmovups %%xmm9 , -112(%1) \n\t" "vpermilpd $0x01 , %%xmm4, %%xmm12 \n\t" "vpermilpd $0x01 , %%xmm5, %%xmm13 \n\t" "vmovups %%xmm10, -96(%1) \n\t" "vmovups %%xmm11, -80(%1) \n\t" "vpermilpd $0x01 , %%xmm6, %%xmm14 \n\t" "vpermilpd $0x01 , %%xmm7, %%xmm15 \n\t" "addq $64 ,%1 \n\t" "subq $4 , %0 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulpd %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 "vmulpd %%xmm0, %%xmm5 , %%xmm9 \n\t" "vmulpd %%xmm0, %%xmm6 , %%xmm10 \n\t" "vmulpd %%xmm0, %%xmm7 , %%xmm11 \n\t" "vmulpd %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 "vaddsubpd %%xmm12 , %%xmm8 , %%xmm8 \n\t" "vmulpd %%xmm1, %%xmm13, %%xmm13 \n\t" "vaddsubpd %%xmm13 , %%xmm9 , %%xmm9 \n\t" "vmulpd %%xmm1, %%xmm14, %%xmm14 \n\t" "vaddsubpd %%xmm14 , %%xmm10, %%xmm10 \n\t" "vmulpd %%xmm1, %%xmm15, %%xmm15 \n\t" "vaddsubpd %%xmm15 , %%xmm11, %%xmm11 \n\t" "vmovups %%xmm8 , -128(%1) \n\t" "vmovups %%xmm9 , -112(%1) \n\t" "vmovups %%xmm10, -96(%1) \n\t" "vmovups %%xmm11, -80(%1) \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", 
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" "vmovddup 8(%2), %%xmm1 \n\t" // da_i "addq $128, %1 \n\t" "vmovups -128(%1), %%xmm4 \n\t" "vmovups -112(%1), %%xmm5 \n\t" "vmovups -96(%1), %%xmm6 \n\t" "vmovups -80(%1), %%xmm7 \n\t" "vpermilpd $0x01 , %%xmm4, %%xmm12 \n\t" "vpermilpd $0x01 , %%xmm5, %%xmm13 \n\t" "vpermilpd $0x01 , %%xmm6, %%xmm14 \n\t" "vpermilpd $0x01 , %%xmm7, %%xmm15 \n\t" "subq $4 , %0 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" //"prefetcht0 128(%1) \n\t" // ".align 2 \n\t" "vmovups -64(%1), %%xmm4 \n\t" "vmovups -48(%1), %%xmm5 \n\t" "vmovups -32(%1), %%xmm6 \n\t" "vmovups -16(%1), %%xmm7 \n\t" "vmulpd %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 "vaddsubpd %%xmm12 , %%xmm0 , %%xmm8 \n\t" "vmulpd %%xmm1, %%xmm13, %%xmm13 \n\t" "vaddsubpd %%xmm13 , %%xmm0 , %%xmm9 \n\t" "vmulpd %%xmm1, %%xmm14, %%xmm14 \n\t" "vaddsubpd %%xmm14 , %%xmm0 , %%xmm10 \n\t" "vmulpd %%xmm1, %%xmm15, %%xmm15 \n\t" "vaddsubpd %%xmm15 , %%xmm0 , %%xmm11 \n\t" "vmovups %%xmm8 , -128(%1) \n\t" "vpermilpd $0x01 , %%xmm4, %%xmm12 \n\t" "vmovups %%xmm9 , -112(%1) \n\t" "vpermilpd $0x01 , %%xmm5, %%xmm13 \n\t" "vmovups %%xmm10, -96(%1) \n\t" "vpermilpd $0x01 , %%xmm6, %%xmm14 \n\t" "vmovups %%xmm11, -80(%1) \n\t" "vpermilpd $0x01 , %%xmm7, %%xmm15 \n\t" "addq $64 ,%1 \n\t" "subq $4 , %0 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulpd %%xmm1, %%xmm12, %%xmm12 \n\t" // da_i*x1 , da_i *x0 "vaddsubpd %%xmm12 , %%xmm0 , %%xmm8 \n\t" "vmulpd %%xmm1, %%xmm13, %%xmm13 \n\t" "vaddsubpd %%xmm13 , %%xmm0 , %%xmm9 \n\t" "vmulpd %%xmm1, %%xmm14, %%xmm14 \n\t" "vaddsubpd %%xmm14 , %%xmm0 , %%xmm10 \n\t" "vmulpd %%xmm1, %%xmm15, %%xmm15 \n\t" "vaddsubpd %%xmm15 , %%xmm0 , %%xmm11 \n\t" "vmovups %%xmm8 , -128(%1) \n\t" "vmovups %%xmm9 , -112(%1) \n\t" "vmovups %%xmm10, -96(%1) \n\t" "vmovups %%xmm11, -80(%1) \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vmovddup (%2), %%xmm0 \n\t" // da_r "addq $128, %1 \n\t" "vmovups -128(%1), %%xmm4 \n\t" "vmovups -112(%1), %%xmm5 \n\t" "vmovups -96(%1), %%xmm6 \n\t" "vmovups -80(%1), %%xmm7 \n\t" "subq $4 , %0 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" //"prefetcht0 128(%1) \n\t" // ".align 2 \n\t" "vmulpd %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 "vmovups -64(%1), %%xmm4 \n\t" "vmulpd %%xmm0, %%xmm5 , %%xmm9 \n\t" "vmovups -48(%1), %%xmm5 \n\t" "vmulpd %%xmm0, %%xmm6 , %%xmm10 \n\t" "vmovups -32(%1), %%xmm6 \n\t" "vmulpd %%xmm0, %%xmm7 , %%xmm11 \n\t" "vmovups -16(%1), %%xmm7 \n\t" "vmovups %%xmm8 , -128(%1) \n\t" "vmovups %%xmm9 , -112(%1) \n\t" "vmovups %%xmm10, -96(%1) \n\t" "vmovups %%xmm11, -80(%1) \n\t" "addq $64 ,%1 \n\t" "subq $4 , %0 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulpd %%xmm0, %%xmm4 , %%xmm8 \n\t" // da_r*x0 , da_r *x1 "vmulpd %%xmm0, %%xmm5 , %%xmm9 \n\t" "vmulpd %%xmm0, %%xmm6 , %%xmm10 \n\t" "vmulpd %%xmm0, %%xmm7 , %%xmm11 \n\t" "vmovups %%xmm8 , -128(%1) \n\t" "vmovups %%xmm9 
, -112(%1) \n\t" "vmovups %%xmm10, -96(%1) \n\t" "vmovups %%xmm11, -80(%1) \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" "addq $128, %1 \n\t" ".align 16 \n\t" "1: \n\t" //"prefetcht0 128(%1) \n\t" // ".align 2 \n\t" "vmovups %%xmm0 , -128(%1) \n\t" "vmovups %%xmm0 , -112(%1) \n\t" "vmovups %%xmm0 , -96(%1) \n\t" "vmovups %%xmm0 , -80(%1) \n\t" "addq $64 ,%1 \n\t" "subq $4 , %0 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/zscal_microk_haswell-2.c000066400000000000000000000250131313527062700220650ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014-2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
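For reference, every kernel in this file performs the complex scaling
x[i] := alpha * x[i] on a unit-stride vector of interleaved (re, im)
double pairs; the vmulpd / vpermilpd / vaddsubpd sequence is a 4-wide
AVX form of the scalar loop sketched below.  The sketch is illustrative
only and not part of the build: the function name is made up, FLOAT and
BLASLONG are assumed to come from the translation unit that includes
this micro-kernel, and the assembly additionally requires n to be a
multiple of 8 (the caller handles the tail).

static void zscal_scalar_sketch(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
    FLOAT da_r = alpha[0];                     // real part of alpha
    FLOAT da_i = alpha[1];                     // imaginary part of alpha
    BLASLONG i;
    for (i = 0; i < n; i++) {
        FLOAT xr = x[2 * i];                   // vmovups loads four such pairs at once
        FLOAT xi = x[2 * i + 1];
        x[2 * i]     = da_r * xr - da_i * xi;  // even lane of vaddsubpd (subtract)
        x[2 * i + 1] = da_r * xi + da_i * xr;  // odd lane of vaddsubpd (add)
    }
}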
*****************************************************************************/ #define HAVE_KERNEL_8 1 static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vbroadcastsd (%2), %%ymm0 \n\t" // da_r "vbroadcastsd 8(%2), %%ymm1 \n\t" // da_i "addq $128, %1 \n\t" "vmovups -128(%1), %%ymm4 \n\t" "vmovups -96(%1), %%ymm5 \n\t" "vmovups -64(%1), %%ymm6 \n\t" "vmovups -32(%1), %%ymm7 \n\t" "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" "subq $8 , %0 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" //"prefetcht0 128(%1) \n\t" // ".align 2 \n\t" "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 "vmovups 0(%1), %%ymm4 \n\t" "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" "vmovups 32(%1), %%ymm5 \n\t" "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" "vmovups 64(%1), %%ymm6 \n\t" "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" "vmovups 96(%1), %%ymm7 \n\t" "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 "vaddsubpd %%ymm12 , %%ymm8 , %%ymm8 \n\t" "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" "vaddsubpd %%ymm13 , %%ymm9 , %%ymm9 \n\t" "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" "vaddsubpd %%ymm14 , %%ymm10, %%ymm10 \n\t" "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" "vaddsubpd %%ymm15 , %%ymm11, %%ymm11 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" "vmovups %%ymm11, -32(%1) \n\t" "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" "addq $128 ,%1 \n\t" "subq $8 , %0 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 "vaddsubpd %%ymm12 , %%ymm8 , %%ymm8 \n\t" "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" "vaddsubpd %%ymm13 , %%ymm9 , %%ymm9 \n\t" "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" "vaddsubpd %%ymm14 , %%ymm10, %%ymm10 \n\t" "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" "vaddsubpd %%ymm15 , %%ymm11, %%ymm11 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vmovups %%ymm11, -32(%1) \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" "vbroadcastsd 8(%2), %%ymm1 \n\t" // da_i "addq $128, %1 \n\t" "vmovups -128(%1), %%ymm4 \n\t" "vmovups -96(%1), %%ymm5 \n\t" "vmovups -64(%1), %%ymm6 \n\t" "vmovups -32(%1), %%ymm7 \n\t" "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" "subq $8 , %0 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" //"prefetcht0 128(%1) \n\t" // ".align 2 \n\t" "vmovups 0(%1), %%ymm4 \n\t" "vmovups 32(%1), %%ymm5 \n\t" "vmovups 64(%1), %%ymm6 \n\t" "vmovups 96(%1), %%ymm7 \n\t" "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // 
da_i*x1 , da_i *x0 "vaddsubpd %%ymm12 , %%ymm0 , %%ymm8 \n\t" "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" "vaddsubpd %%ymm13 , %%ymm0 , %%ymm9 \n\t" "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" "vaddsubpd %%ymm14 , %%ymm0 , %%ymm10 \n\t" "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" "vaddsubpd %%ymm15 , %%ymm0 , %%ymm11 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" "vmovups %%ymm11, -32(%1) \n\t" "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" "addq $128 ,%1 \n\t" "subq $8 , %0 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 "vaddsubpd %%ymm12 , %%ymm0 , %%ymm8 \n\t" "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" "vaddsubpd %%ymm13 , %%ymm0 , %%ymm9 \n\t" "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" "vaddsubpd %%ymm14 , %%ymm0 , %%ymm10 \n\t" "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" "vaddsubpd %%ymm15 , %%ymm0 , %%ymm11 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vmovups %%ymm11, -32(%1) \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vbroadcastsd (%2), %%ymm0 \n\t" // da_r "addq $128, %1 \n\t" "vmovups -128(%1), %%ymm4 \n\t" "vmovups -96(%1), %%ymm5 \n\t" "vmovups -64(%1), %%ymm6 \n\t" "vmovups -32(%1), %%ymm7 \n\t" "subq $8 , %0 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" //"prefetcht0 128(%1) \n\t" // ".align 2 \n\t" "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 "vmovups 0(%1), %%ymm4 \n\t" "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" "vmovups 32(%1), %%ymm5 \n\t" "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" "vmovups 64(%1), %%ymm6 \n\t" "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" "vmovups 96(%1), %%ymm7 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vmovups %%ymm11, -32(%1) \n\t" "addq $128 ,%1 \n\t" "subq $8 , %0 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vmovups %%ymm11, -32(%1) \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" "addq $128, %1 \n\t" ".align 16 \n\t" "1: \n\t" //"prefetcht0 128(%1) \n\t" // ".align 2 \n\t" "vmovups %%ymm0 , -128(%1) \n\t" "vmovups %%ymm0 , -96(%1) \n\t" "vmovups %%ymm0 , -64(%1) \n\t" "vmovups %%ymm0 , -32(%1) \n\t" "addq $128 ,%1 \n\t" "subq $8 , %0 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", 
"%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/zscal_microk_steamroller-2.c000066400000000000000000000250571313527062700227670ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2014-2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define HAVE_KERNEL_8 1 static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vbroadcastsd (%2), %%ymm0 \n\t" // da_r "vbroadcastsd 8(%2), %%ymm1 \n\t" // da_i "addq $128, %1 \n\t" "vmovups -128(%1), %%ymm4 \n\t" "vmovups -96(%1), %%ymm5 \n\t" "vmovups -64(%1), %%ymm6 \n\t" "vmovups -32(%1), %%ymm7 \n\t" "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" "subq $8 , %0 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" "prefetcht0 320(%1) \n\t" "prefetcht0 384(%1) \n\t" // ".align 2 \n\t" "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 "vmovups 0(%1), %%ymm4 \n\t" "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" "vmovups 32(%1), %%ymm5 \n\t" "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" "vmovups 64(%1), %%ymm6 \n\t" "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" "vmovups 96(%1), %%ymm7 \n\t" "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 "vaddsubpd %%ymm12 , %%ymm8 , %%ymm8 \n\t" "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" "vaddsubpd %%ymm13 , %%ymm9 , %%ymm9 \n\t" "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" "vaddsubpd %%ymm14 , %%ymm10, %%ymm10 \n\t" "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" "vaddsubpd %%ymm15 , %%ymm11, %%ymm11 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" "vmovups %%ymm11, -32(%1) \n\t" "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" "addq $128 ,%1 \n\t" "subq $8 , %0 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 "vaddsubpd %%ymm12 , %%ymm8 , %%ymm8 \n\t" "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" "vaddsubpd %%ymm13 , %%ymm9 , %%ymm9 \n\t" "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" "vaddsubpd %%ymm14 , %%ymm10, %%ymm10 \n\t" "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" "vaddsubpd %%ymm15 , %%ymm11, %%ymm11 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vmovups %%ymm11, -32(%1) \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" "vbroadcastsd 8(%2), %%ymm1 \n\t" // da_i "addq $128, %1 \n\t" "vmovups -128(%1), %%ymm4 \n\t" "vmovups -96(%1), %%ymm5 \n\t" "vmovups -64(%1), %%ymm6 \n\t" "vmovups -32(%1), %%ymm7 \n\t" "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" "subq $8 , %0 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" //"prefetcht0 128(%1) \n\t" // ".align 2 \n\t" "vmovups 0(%1), %%ymm4 \n\t" "vmovups 32(%1), %%ymm5 \n\t" "vmovups 64(%1), %%ymm6 \n\t" "vmovups 96(%1), %%ymm7 \n\t" "vmulpd %%ymm1, 
%%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 "vaddsubpd %%ymm12 , %%ymm0 , %%ymm8 \n\t" "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" "vaddsubpd %%ymm13 , %%ymm0 , %%ymm9 \n\t" "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" "vaddsubpd %%ymm14 , %%ymm0 , %%ymm10 \n\t" "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" "vaddsubpd %%ymm15 , %%ymm0 , %%ymm11 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vpermilpd $0x05 , %%ymm4, %%ymm12 \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vpermilpd $0x05 , %%ymm5, %%ymm13 \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vpermilpd $0x05 , %%ymm6, %%ymm14 \n\t" "vmovups %%ymm11, -32(%1) \n\t" "vpermilpd $0x05 , %%ymm7, %%ymm15 \n\t" "addq $128 ,%1 \n\t" "subq $8 , %0 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" // da_i*x1 , da_i *x0 "vaddsubpd %%ymm12 , %%ymm0 , %%ymm8 \n\t" "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" "vaddsubpd %%ymm13 , %%ymm0 , %%ymm9 \n\t" "vmulpd %%ymm1, %%ymm14, %%ymm14 \n\t" "vaddsubpd %%ymm14 , %%ymm0 , %%ymm10 \n\t" "vmulpd %%ymm1, %%ymm15, %%ymm15 \n\t" "vaddsubpd %%ymm15 , %%ymm0 , %%ymm11 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vmovups %%ymm11, -32(%1) \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vbroadcastsd (%2), %%ymm0 \n\t" // da_r "addq $128, %1 \n\t" "vmovups -128(%1), %%ymm4 \n\t" "vmovups -96(%1), %%ymm5 \n\t" "vmovups -64(%1), %%ymm6 \n\t" "vmovups -32(%1), %%ymm7 \n\t" "subq $8 , %0 \n\t" "jz 2f \n\t" ".align 16 \n\t" "1: \n\t" //"prefetcht0 128(%1) \n\t" // ".align 2 \n\t" "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 "vmovups 0(%1), %%ymm4 \n\t" "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" "vmovups 32(%1), %%ymm5 \n\t" "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" "vmovups 64(%1), %%ymm6 \n\t" "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" "vmovups 96(%1), %%ymm7 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vmovups %%ymm11, -32(%1) \n\t" "addq $128 ,%1 \n\t" "subq $8 , %0 \n\t" "jnz 1b \n\t" "2: \n\t" "vmulpd %%ymm0, %%ymm4 , %%ymm8 \n\t" // da_r*x0 , da_r *x1 "vmulpd %%ymm0, %%ymm5 , %%ymm9 \n\t" "vmulpd %%ymm0, %%ymm6 , %%ymm10 \n\t" "vmulpd %%ymm0, %%ymm7 , %%ymm11 \n\t" "vmovups %%ymm8 , -128(%1) \n\t" "vmovups %%ymm9 , -96(%1) \n\t" "vmovups %%ymm10, -64(%1) \n\t" "vmovups %%ymm11, -32(%1) \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__ __volatile__ ( "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" "addq $128, %1 \n\t" ".align 16 \n\t" "1: \n\t" //"prefetcht0 128(%1) \n\t" // ".align 2 \n\t" "vmovups %%ymm0 , -128(%1) \n\t" "vmovups %%ymm0 , -96(%1) \n\t" "vmovups %%ymm0 , -64(%1) \n\t" "vmovups %%ymm0 , -32(%1) \n\t" "addq $128 ,%1 \n\t" "subq $8 , %0 \n\t" "jnz 1b \n\t" "vzeroupper \n\t" : : "r" (n), // 0 "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", 
"%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } OpenBLAS-0.2.20/kernel/x86_64/zscal_sse.S000066400000000000000000000611101313527062700174730ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 #define X ARG4 #define INCX ARG5 #else #define M ARG1 #define X ARG2 #define INCX ARG3 #endif #define XX %r10 #define FLAG %r11 #define I %rax #include "l1param.h" PROLOGUE PROFCODE #ifdef WINDOWS_ABI movaps %xmm3, %xmm0 movsd 40(%rsp), %xmm1 movq 48(%rsp), X movq 56(%rsp), INCX #endif SAVEREGISTERS salq $ZBASE_SHIFT, INCX xor FLAG, FLAG testq M, M jle .L999 pxor %xmm15, %xmm15 comiss %xmm0, %xmm15 jne .L100 # Alpha_r != ZERO comiss %xmm1, %xmm15 jne .L100 # Alpha_i != ZERO /* Alpha == ZERO */ cmpq $2 * SIZE, INCX jne .L50 /* INCX == 1 */ cmpq $3, M jle .L13 testq $4, X je .L05 movss %xmm15, 0 * SIZE(X) addq $SIZE, X movq $1, FLAG decq M ALIGN_3 .L05: testq $8, X je .L06 movlps %xmm15, 0 * SIZE(X) addq $2 * SIZE, X subq $1, M ALIGN_3 .L06: movq M, I # rcx = n sarq $3, I jle .L12 ALIGN_4 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm15, 0 * SIZE(X) movaps %xmm15, 4 * SIZE(X) movaps %xmm15, 8 * SIZE(X) movaps %xmm15, 12 * SIZE(X) addq $16 * SIZE, X decq I jg .L11 ALIGN_4 .L12: testq $7, M je .L19 testq $4, M je .L13 movaps %xmm15, 0 * SIZE(X) movaps %xmm15, 4 * SIZE(X) addq $8 * SIZE, X ALIGN_3 .L13: testq $2, M je .L14 movlps %xmm15, 0 * SIZE(X) movhps %xmm15, 2 * SIZE(X) addq $4 * SIZE, X ALIGN_3 .L14: testq $1, M je .L19 movlps %xmm15, 0 * SIZE(X) addq $2 * SIZE, X ALIGN_3 .L19: testq $1, FLAG je .L999 movss %xmm15, 0 * SIZE(X) jmp .L999 ALIGN_4 /* incx != 1 */ .L50: movq M, I # rcx = n sarq $2, I jle .L52 ALIGN_4 .L51: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movsd %xmm15, 0 * SIZE(X) addq INCX, X movsd %xmm15, 0 * SIZE(X) addq INCX, X movsd %xmm15, 0 * SIZE(X) addq INCX, X movsd %xmm15, 0 * SIZE(X) addq INCX, X decq I jg .L51 ALIGN_4 .L52: testq $2, M je .L53 movsd %xmm15, 0 * SIZE(X) addq INCX, X movsd %xmm15, 0 * SIZE(X) addq INCX, X ALIGN_3 .L53: testq $1, M je .L999 movsd %xmm15, 0 * SIZE(X) jmp .L999 ALIGN_4 /* Alpha != ZERO */ .L100: testq $SIZE, X jne .L130 cmpq $2 * SIZE, INCX jne .L120 pshufd $0, %xmm0, %xmm14 pshufd $0, %xmm1, %xmm1 subps %xmm1, %xmm15 unpcklps %xmm1, %xmm15 subq $-32 * SIZE, X testq $2 * SIZE, X je .L105 movsd -32 * SIZE(X), %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movlps %xmm0, -32 * SIZE(X) addq $2 * SIZE, X decq M jle .L999 ALIGN_3 .L105: movq M, I sarq $4, I jle .L115 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 movaps -16 * SIZE(X), %xmm4 movaps -12 * SIZE(X), %xmm5 movaps -8 * SIZE(X), %xmm6 movaps -4 * SIZE(X), %xmm7 decq I jle .L112 ALIGN_4 .L111: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0xb1, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(X) movaps 0 * SIZE(X), %xmm0 pshufd $0xb1, %xmm1, %xmm8 mulps %xmm14, %xmm1 mulps %xmm15, %xmm8 addps %xmm8, %xmm1 movaps %xmm1, -28 * SIZE(X) movaps 4 * SIZE(X), %xmm1 pshufd $0xb1, %xmm2, %xmm8 mulps %xmm14, %xmm2 mulps %xmm15, %xmm8 addps %xmm8, %xmm2 movaps %xmm2, -24 * SIZE(X) movaps 8 * SIZE(X), %xmm2 pshufd $0xb1, %xmm3, %xmm8 mulps %xmm14, %xmm3 mulps %xmm15, %xmm8 addps %xmm8, %xmm3 movaps %xmm3, -20 * SIZE(X) movaps 12 * SIZE(X), %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0xb1, %xmm4, %xmm8 mulps %xmm14, %xmm4 mulps %xmm15, %xmm8 addps %xmm8, %xmm4 movaps 
%xmm4, -16 * SIZE(X) movaps 16 * SIZE(X), %xmm4 pshufd $0xb1, %xmm5, %xmm8 mulps %xmm14, %xmm5 mulps %xmm15, %xmm8 addps %xmm8, %xmm5 movaps %xmm5, -12 * SIZE(X) movaps 20 * SIZE(X), %xmm5 pshufd $0xb1, %xmm6, %xmm8 mulps %xmm14, %xmm6 mulps %xmm15, %xmm8 addps %xmm8, %xmm6 movaps %xmm6, -8 * SIZE(X) movaps 24 * SIZE(X), %xmm6 pshufd $0xb1, %xmm7, %xmm8 mulps %xmm14, %xmm7 mulps %xmm15, %xmm8 addps %xmm8, %xmm7 movaps %xmm7, -4 * SIZE(X) movaps 28 * SIZE(X), %xmm7 subq $-32 * SIZE, X decq I jg .L111 ALIGN_4 .L112: pshufd $0xb1, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(X) pshufd $0xb1, %xmm1, %xmm8 mulps %xmm14, %xmm1 mulps %xmm15, %xmm8 addps %xmm8, %xmm1 movaps %xmm1, -28 * SIZE(X) pshufd $0xb1, %xmm2, %xmm8 mulps %xmm14, %xmm2 mulps %xmm15, %xmm8 addps %xmm8, %xmm2 movaps %xmm2, -24 * SIZE(X) pshufd $0xb1, %xmm3, %xmm8 mulps %xmm14, %xmm3 mulps %xmm15, %xmm8 addps %xmm8, %xmm3 movaps %xmm3, -20 * SIZE(X) pshufd $0xb1, %xmm4, %xmm8 mulps %xmm14, %xmm4 mulps %xmm15, %xmm8 addps %xmm8, %xmm4 movaps %xmm4, -16 * SIZE(X) pshufd $0xb1, %xmm5, %xmm8 mulps %xmm14, %xmm5 mulps %xmm15, %xmm8 addps %xmm8, %xmm5 movaps %xmm5, -12 * SIZE(X) pshufd $0xb1, %xmm6, %xmm8 mulps %xmm14, %xmm6 mulps %xmm15, %xmm8 addps %xmm8, %xmm6 movaps %xmm6, -8 * SIZE(X) pshufd $0xb1, %xmm7, %xmm8 mulps %xmm14, %xmm7 mulps %xmm15, %xmm8 addps %xmm8, %xmm7 movaps %xmm7, -4 * SIZE(X) subq $-32 * SIZE, X ALIGN_4 .L115: testq $8, M je .L116 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 pshufd $0xb1, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(X) pshufd $0xb1, %xmm1, %xmm8 mulps %xmm14, %xmm1 mulps %xmm15, %xmm8 addps %xmm8, %xmm1 movaps %xmm1, -28 * SIZE(X) movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 pshufd $0xb1, %xmm2, %xmm8 mulps %xmm14, %xmm2 mulps %xmm15, %xmm8 addps %xmm8, %xmm2 movaps %xmm2, -24 * SIZE(X) pshufd $0xb1, %xmm3, %xmm8 mulps %xmm14, %xmm3 mulps %xmm15, %xmm8 addps %xmm8, %xmm3 movaps %xmm3, -20 * SIZE(X) addq $16 * SIZE, X ALIGN_3 .L116: testq $4, M je .L117 movaps -32 * SIZE(X), %xmm0 movaps -28 * SIZE(X), %xmm1 pshufd $0xb1, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(X) pshufd $0xb1, %xmm1, %xmm8 mulps %xmm14, %xmm1 mulps %xmm15, %xmm8 addps %xmm8, %xmm1 movaps %xmm1, -28 * SIZE(X) addq $8 * SIZE, X ALIGN_3 .L117: testq $2, M je .L118 movaps -32 * SIZE(X), %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movaps %xmm0, -32 * SIZE(X) addq $4 * SIZE, X ALIGN_3 .L118: testq $1, M je .L999 movsd -32 * SIZE(X), %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movlps %xmm0, -32 * SIZE(X) jmp .L999 ALIGN_3 .L120: pshufd $0, %xmm0, %xmm14 pshufd $0, %xmm1, %xmm1 subps %xmm1, %xmm15 unpcklps %xmm1, %xmm15 movq X, XX movq M, I sarq $3, I jle .L125 movsd (X), %xmm0 addq INCX, X movhps (X), %xmm0 addq INCX, X movsd (X), %xmm1 addq INCX, X movhps (X), %xmm1 addq INCX, X movsd (X), %xmm2 addq INCX, X movhps (X), %xmm2 addq INCX, X movsd (X), %xmm3 addq INCX, X movhps (X), %xmm3 addq INCX, X decq I jle .L122 ALIGN_4 .L121: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0xb1, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movlps %xmm0, (XX) addq INCX, XX movhps %xmm0, (XX) addq INCX, XX movsd (X), %xmm0 addq INCX, X movhps (X), %xmm0 addq INCX, X pshufd $0xb1, %xmm1, %xmm8 mulps %xmm14, %xmm1 mulps 
%xmm15, %xmm8 addps %xmm8, %xmm1 movlps %xmm1, (XX) addq INCX, XX movhps %xmm1, (XX) addq INCX, XX movsd (X), %xmm1 addq INCX, X movhps (X), %xmm1 addq INCX, X pshufd $0xb1, %xmm2, %xmm8 mulps %xmm14, %xmm2 mulps %xmm15, %xmm8 addps %xmm8, %xmm2 movlps %xmm2, (XX) addq INCX, XX movhps %xmm2, (XX) addq INCX, XX movsd (X), %xmm2 addq INCX, X movhps (X), %xmm2 addq INCX, X pshufd $0xb1, %xmm3, %xmm8 mulps %xmm14, %xmm3 mulps %xmm15, %xmm8 addps %xmm8, %xmm3 movlps %xmm3, (XX) addq INCX, XX movhps %xmm3, (XX) addq INCX, XX movsd (X), %xmm3 addq INCX, X movhps (X), %xmm3 addq INCX, X decq I jg .L121 ALIGN_4 .L122: pshufd $0xb1, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movlps %xmm0, (XX) addq INCX, XX movhps %xmm0, (XX) addq INCX, XX pshufd $0xb1, %xmm1, %xmm8 mulps %xmm14, %xmm1 mulps %xmm15, %xmm8 addps %xmm8, %xmm1 movlps %xmm1, (XX) addq INCX, XX movhps %xmm1, (XX) addq INCX, XX pshufd $0xb1, %xmm2, %xmm8 mulps %xmm14, %xmm2 mulps %xmm15, %xmm8 addps %xmm8, %xmm2 movlps %xmm2, (XX) addq INCX, XX movhps %xmm2, (XX) addq INCX, XX pshufd $0xb1, %xmm3, %xmm8 mulps %xmm14, %xmm3 mulps %xmm15, %xmm8 addps %xmm8, %xmm3 movlps %xmm3, (XX) addq INCX, XX movhps %xmm3, (XX) addq INCX, XX ALIGN_4 .L125: testq $4, M je .L127 movsd (X), %xmm0 addq INCX, X movhps (X), %xmm0 addq INCX, X pshufd $0xb1, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movlps %xmm0, (XX) addq INCX, XX movhps %xmm0, (XX) addq INCX, XX movsd (X), %xmm1 addq INCX, X movhps (X), %xmm1 addq INCX, X pshufd $0xb1, %xmm1, %xmm8 mulps %xmm14, %xmm1 mulps %xmm15, %xmm8 addps %xmm8, %xmm1 movlps %xmm1, (XX) addq INCX, XX movhps %xmm1, (XX) addq INCX, XX ALIGN_3 .L127: testq $2, M je .L128 movsd (X), %xmm0 addq INCX, X movhps (X), %xmm0 addq INCX, X pshufd $0xb1, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movlps %xmm0, (XX) addq INCX, XX movhps %xmm0, (XX) addq INCX, XX ALIGN_3 .L128: testq $1, M je .L999 movsd (X), %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movlps %xmm0, (XX) jmp .L999 ALIGN_3 .L130: cmpq $2 * SIZE, INCX jne .L120 #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) pshufd $0, %xmm0, %xmm14 pshufd $0, %xmm1, %xmm1 subps %xmm1, %xmm15 unpcklps %xmm1, %xmm15 subq $-31 * SIZE, X testq $2 * SIZE, X je .L130x movsd -31 * SIZE(X), %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movlps %xmm0, -31 * SIZE(X) addq $2 * SIZE, X decq M jle .L999 ALIGN_3 .L130x: shufps $0xb1, %xmm15, %xmm15 movaps -32 * SIZE(X), %xmm0 movaps %xmm0, %xmm9 movq M, I sarq $4, I jle .L135 movaps -28 * SIZE(X), %xmm1 movaps -24 * SIZE(X), %xmm2 movaps -20 * SIZE(X), %xmm3 movaps -16 * SIZE(X), %xmm4 movaps -12 * SIZE(X), %xmm5 movaps -8 * SIZE(X), %xmm6 movaps -4 * SIZE(X), %xmm7 decq I jle .L132 ALIGN_4 .L131: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movss %xmm1, %xmm0 pshufd $0x1b, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movaps %xmm0, %xmm10 movss %xmm9, %xmm0 movaps %xmm0, -32 * SIZE(X) movaps 0 * SIZE(X), %xmm0 movss %xmm2, %xmm1 pshufd $0x1b, %xmm1, %xmm8 mulps %xmm14, %xmm1 mulps %xmm15, %xmm8 addps %xmm8, %xmm1 movaps %xmm1, %xmm9 movss %xmm10, %xmm1 movaps %xmm1, -28 * SIZE(X) movaps 4 * SIZE(X), %xmm1 movss %xmm3, %xmm2 pshufd $0x1b, %xmm2, %xmm8 mulps %xmm14, %xmm2 mulps %xmm15, %xmm8 addps %xmm8, %xmm2 movaps %xmm2, %xmm10 movss %xmm9, %xmm2 movaps %xmm2, -24 * SIZE(X) movaps 8 * SIZE(X), 
%xmm2 movss %xmm4, %xmm3 pshufd $0x1b, %xmm3, %xmm8 mulps %xmm14, %xmm3 mulps %xmm15, %xmm8 addps %xmm8, %xmm3 movaps %xmm3, %xmm9 movss %xmm10, %xmm3 movaps %xmm3, -20 * SIZE(X) movaps 12 * SIZE(X), %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movss %xmm5, %xmm4 pshufd $0x1b, %xmm4, %xmm8 mulps %xmm14, %xmm4 mulps %xmm15, %xmm8 addps %xmm8, %xmm4 movaps %xmm4, %xmm10 movss %xmm9, %xmm4 movaps %xmm4, -16 * SIZE(X) movaps 16 * SIZE(X), %xmm4 movss %xmm6, %xmm5 pshufd $0x1b, %xmm5, %xmm8 mulps %xmm14, %xmm5 mulps %xmm15, %xmm8 addps %xmm8, %xmm5 movaps %xmm5, %xmm9 movss %xmm10, %xmm5 movaps %xmm5, -12 * SIZE(X) movaps 20 * SIZE(X), %xmm5 movss %xmm7, %xmm6 pshufd $0x1b, %xmm6, %xmm8 mulps %xmm14, %xmm6 mulps %xmm15, %xmm8 addps %xmm8, %xmm6 movaps %xmm6, %xmm10 movss %xmm9, %xmm6 movaps %xmm6, -8 * SIZE(X) movaps 24 * SIZE(X), %xmm6 movss %xmm0, %xmm7 pshufd $0x1b, %xmm7, %xmm8 mulps %xmm14, %xmm7 mulps %xmm15, %xmm8 addps %xmm8, %xmm7 movaps %xmm7, %xmm9 movss %xmm10, %xmm7 movaps %xmm7, -4 * SIZE(X) movaps 28 * SIZE(X), %xmm7 subq $-32 * SIZE, X decq I jg .L131 ALIGN_4 .L132: movss %xmm1, %xmm0 pshufd $0x1b, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movaps %xmm0, %xmm10 movss %xmm9, %xmm0 movaps %xmm0, -32 * SIZE(X) movaps 0 * SIZE(X), %xmm0 movss %xmm2, %xmm1 pshufd $0x1b, %xmm1, %xmm8 mulps %xmm14, %xmm1 mulps %xmm15, %xmm8 addps %xmm8, %xmm1 movaps %xmm1, %xmm9 movss %xmm10, %xmm1 movaps %xmm1, -28 * SIZE(X) movss %xmm3, %xmm2 pshufd $0x1b, %xmm2, %xmm8 mulps %xmm14, %xmm2 mulps %xmm15, %xmm8 addps %xmm8, %xmm2 movaps %xmm2, %xmm10 movss %xmm9, %xmm2 movaps %xmm2, -24 * SIZE(X) movss %xmm4, %xmm3 pshufd $0x1b, %xmm3, %xmm8 mulps %xmm14, %xmm3 mulps %xmm15, %xmm8 addps %xmm8, %xmm3 movaps %xmm3, %xmm9 movss %xmm10, %xmm3 movaps %xmm3, -20 * SIZE(X) movss %xmm5, %xmm4 pshufd $0x1b, %xmm4, %xmm8 mulps %xmm14, %xmm4 mulps %xmm15, %xmm8 addps %xmm8, %xmm4 movaps %xmm4, %xmm10 movss %xmm9, %xmm4 movaps %xmm4, -16 * SIZE(X) movss %xmm6, %xmm5 pshufd $0x1b, %xmm5, %xmm8 mulps %xmm14, %xmm5 mulps %xmm15, %xmm8 addps %xmm8, %xmm5 movaps %xmm5, %xmm9 movss %xmm10, %xmm5 movaps %xmm5, -12 * SIZE(X) movss %xmm7, %xmm6 pshufd $0x1b, %xmm6, %xmm8 mulps %xmm14, %xmm6 mulps %xmm15, %xmm8 addps %xmm8, %xmm6 movaps %xmm6, %xmm10 movss %xmm9, %xmm6 movaps %xmm6, -8 * SIZE(X) movss %xmm0, %xmm7 pshufd $0x1b, %xmm7, %xmm8 mulps %xmm14, %xmm7 mulps %xmm15, %xmm8 addps %xmm8, %xmm7 movaps %xmm7, %xmm9 movss %xmm10, %xmm7 movaps %xmm7, -4 * SIZE(X) subq $-32 * SIZE, X ALIGN_4 .L135: testq $8, M je .L136 movaps -28 * SIZE(X), %xmm1 movss %xmm1, %xmm0 pshufd $0x1b, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movaps %xmm0, %xmm10 movss %xmm9, %xmm0 movaps %xmm0, -32 * SIZE(X) movaps -24 * SIZE(X), %xmm2 movss %xmm2, %xmm1 pshufd $0x1b, %xmm1, %xmm8 mulps %xmm14, %xmm1 mulps %xmm15, %xmm8 addps %xmm8, %xmm1 movaps %xmm1, %xmm9 movss %xmm10, %xmm1 movaps %xmm1, -28 * SIZE(X) movaps -20 * SIZE(X), %xmm3 movss %xmm3, %xmm2 pshufd $0x1b, %xmm2, %xmm8 mulps %xmm14, %xmm2 mulps %xmm15, %xmm8 addps %xmm8, %xmm2 movaps %xmm2, %xmm10 movss %xmm9, %xmm2 movaps %xmm2, -24 * SIZE(X) movaps -16 * SIZE(X), %xmm0 movss %xmm0, %xmm3 pshufd $0x1b, %xmm3, %xmm8 mulps %xmm14, %xmm3 mulps %xmm15, %xmm8 addps %xmm8, %xmm3 movaps %xmm3, %xmm9 movss %xmm10, %xmm3 movaps %xmm3, -20 * SIZE(X) addq $16 * SIZE, X ALIGN_3 .L136: testq $4, M je .L137 movaps -28 * SIZE(X), %xmm1 movss %xmm1, %xmm0 pshufd $0x1b, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, 
%xmm8 addps %xmm8, %xmm0 movaps %xmm0, %xmm10 movss %xmm9, %xmm0 movaps %xmm0, -32 * SIZE(X) movaps -24 * SIZE(X), %xmm2 movss %xmm2, %xmm1 pshufd $0x1b, %xmm1, %xmm8 mulps %xmm14, %xmm1 mulps %xmm15, %xmm8 addps %xmm8, %xmm1 movaps %xmm1, %xmm9 movss %xmm10, %xmm1 movaps %xmm1, -28 * SIZE(X) movaps %xmm2, %xmm0 addq $8 * SIZE, X ALIGN_3 .L137: testq $2, M je .L138 movaps -28 * SIZE(X), %xmm1 movss %xmm1, %xmm0 pshufd $0x1b, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movaps %xmm0, %xmm10 movss %xmm9, %xmm0 movaps %xmm0, -32 * SIZE(X) movaps %xmm10, %xmm9 movaps %xmm1, %xmm0 addq $4 * SIZE, X ALIGN_3 .L138: movss %xmm9, -32 * SIZE(X) testq $1, M je .L999 pshufd $0x1b, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 pshufd $0x39, %xmm0, %xmm0 movlps %xmm0, -31 * SIZE(X) jmp .L999 ALIGN_3 #else pshufd $0, %xmm0, %xmm14 pshufd $0, %xmm1, %xmm1 subps %xmm1, %xmm15 unpcklps %xmm1, %xmm15 subq $-32 * SIZE, X testq $2 * SIZE, X je .L130x movsd -32 * SIZE(X), %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movlps %xmm0, -32 * SIZE(X) addq $2 * SIZE, X decq M jle .L999 ALIGN_3 .L130x: movq M, I sarq $4, I jle .L135 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 movsd -28 * SIZE(X), %xmm1 movhps -26 * SIZE(X), %xmm1 movsd -24 * SIZE(X), %xmm2 movhps -22 * SIZE(X), %xmm2 movsd -20 * SIZE(X), %xmm3 movhps -18 * SIZE(X), %xmm3 movsd -16 * SIZE(X), %xmm4 movhps -14 * SIZE(X), %xmm4 movsd -12 * SIZE(X), %xmm5 movhps -10 * SIZE(X), %xmm5 movsd -8 * SIZE(X), %xmm6 movhps -6 * SIZE(X), %xmm6 movsd -4 * SIZE(X), %xmm7 movhps -2 * SIZE(X), %xmm7 decq I jle .L132 ALIGN_4 .L131: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0xb1, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movlps %xmm0, -32 * SIZE(X) movhps %xmm0, -30 * SIZE(X) movsd 0 * SIZE(X), %xmm0 movhps 2 * SIZE(X), %xmm0 pshufd $0xb1, %xmm1, %xmm8 mulps %xmm14, %xmm1 mulps %xmm15, %xmm8 addps %xmm8, %xmm1 movlps %xmm1, -28 * SIZE(X) movhps %xmm1, -26 * SIZE(X) movsd 4 * SIZE(X), %xmm1 movhps 6 * SIZE(X), %xmm1 pshufd $0xb1, %xmm2, %xmm8 mulps %xmm14, %xmm2 mulps %xmm15, %xmm8 addps %xmm8, %xmm2 movlps %xmm2, -24 * SIZE(X) movhps %xmm2, -22 * SIZE(X) movsd 8 * SIZE(X), %xmm2 movhps 10 * SIZE(X), %xmm2 pshufd $0xb1, %xmm3, %xmm8 mulps %xmm14, %xmm3 mulps %xmm15, %xmm8 addps %xmm8, %xmm3 movlps %xmm3, -20 * SIZE(X) movhps %xmm3, -18 * SIZE(X) movsd 12 * SIZE(X), %xmm3 movhps 14 * SIZE(X), %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0xb1, %xmm4, %xmm8 mulps %xmm14, %xmm4 mulps %xmm15, %xmm8 addps %xmm8, %xmm4 movlps %xmm4, -16 * SIZE(X) movhps %xmm4, -14 * SIZE(X) movsd 16 * SIZE(X), %xmm4 movhps 18 * SIZE(X), %xmm4 pshufd $0xb1, %xmm5, %xmm8 mulps %xmm14, %xmm5 mulps %xmm15, %xmm8 addps %xmm8, %xmm5 movlps %xmm5, -12 * SIZE(X) movhps %xmm5, -10 * SIZE(X) movsd 20 * SIZE(X), %xmm5 movhps 22 * SIZE(X), %xmm5 pshufd $0xb1, %xmm6, %xmm8 mulps %xmm14, %xmm6 mulps %xmm15, %xmm8 addps %xmm8, %xmm6 movlps %xmm6, -8 * SIZE(X) movhps %xmm6, -6 * SIZE(X) movsd 24 * SIZE(X), %xmm6 movhps 26 * SIZE(X), %xmm6 pshufd $0xb1, %xmm7, %xmm8 mulps %xmm14, %xmm7 mulps %xmm15, %xmm8 addps %xmm8, %xmm7 movlps %xmm7, -4 * SIZE(X) movhps %xmm7, -2 * SIZE(X) movsd 28 * SIZE(X), %xmm7 movhps 30 * SIZE(X), %xmm7 subq $-32 * SIZE, X decq I jg .L131 ALIGN_4 .L132: pshufd $0xb1, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movlps %xmm0, -32 * SIZE(X) movhps %xmm0, 
-30 * SIZE(X) pshufd $0xb1, %xmm1, %xmm8 mulps %xmm14, %xmm1 mulps %xmm15, %xmm8 addps %xmm8, %xmm1 movlps %xmm1, -28 * SIZE(X) movhps %xmm1, -26 * SIZE(X) pshufd $0xb1, %xmm2, %xmm8 mulps %xmm14, %xmm2 mulps %xmm15, %xmm8 addps %xmm8, %xmm2 movlps %xmm2, -24 * SIZE(X) movhps %xmm2, -22 * SIZE(X) pshufd $0xb1, %xmm3, %xmm8 mulps %xmm14, %xmm3 mulps %xmm15, %xmm8 addps %xmm8, %xmm3 movlps %xmm3, -20 * SIZE(X) movhps %xmm3, -18 * SIZE(X) pshufd $0xb1, %xmm4, %xmm8 mulps %xmm14, %xmm4 mulps %xmm15, %xmm8 addps %xmm8, %xmm4 movlps %xmm4, -16 * SIZE(X) movhps %xmm4, -14 * SIZE(X) pshufd $0xb1, %xmm5, %xmm8 mulps %xmm14, %xmm5 mulps %xmm15, %xmm8 addps %xmm8, %xmm5 movlps %xmm5, -12 * SIZE(X) movhps %xmm5, -10 * SIZE(X) pshufd $0xb1, %xmm6, %xmm8 mulps %xmm14, %xmm6 mulps %xmm15, %xmm8 addps %xmm8, %xmm6 movlps %xmm6, -8 * SIZE(X) movhps %xmm6, -6 * SIZE(X) pshufd $0xb1, %xmm7, %xmm8 mulps %xmm14, %xmm7 mulps %xmm15, %xmm8 addps %xmm8, %xmm7 movlps %xmm7, -4 * SIZE(X) movhps %xmm7, -2 * SIZE(X) subq $-32 * SIZE, X ALIGN_4 .L135: testq $8, M je .L136 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movlps %xmm0, -32 * SIZE(X) movhps %xmm0, -30 * SIZE(X) movsd -28 * SIZE(X), %xmm1 movhps -26 * SIZE(X), %xmm1 pshufd $0xb1, %xmm1, %xmm8 mulps %xmm14, %xmm1 mulps %xmm15, %xmm8 addps %xmm8, %xmm1 movlps %xmm1, -28 * SIZE(X) movhps %xmm1, -26 * SIZE(X) movsd -24 * SIZE(X), %xmm2 movhps -22 * SIZE(X), %xmm2 pshufd $0xb1, %xmm2, %xmm8 mulps %xmm14, %xmm2 mulps %xmm15, %xmm8 addps %xmm8, %xmm2 movlps %xmm2, -24 * SIZE(X) movhps %xmm2, -22 * SIZE(X) movsd -20 * SIZE(X), %xmm3 movhps -18 * SIZE(X), %xmm3 pshufd $0xb1, %xmm3, %xmm8 mulps %xmm14, %xmm3 mulps %xmm15, %xmm8 addps %xmm8, %xmm3 movlps %xmm3, -20 * SIZE(X) movhps %xmm3, -18 * SIZE(X) addq $16 * SIZE, X ALIGN_3 .L136: testq $4, M je .L137 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 movsd -28 * SIZE(X), %xmm1 movhps -26 * SIZE(X), %xmm1 pshufd $0xb1, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movlps %xmm0, -32 * SIZE(X) movhps %xmm0, -30 * SIZE(X) pshufd $0xb1, %xmm1, %xmm8 mulps %xmm14, %xmm1 mulps %xmm15, %xmm8 addps %xmm8, %xmm1 movlps %xmm1, -28 * SIZE(X) movhps %xmm1, -26 * SIZE(X) addq $8 * SIZE, X ALIGN_3 .L137: testq $2, M je .L138 movsd -32 * SIZE(X), %xmm0 movhps -30 * SIZE(X), %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movlps %xmm0, -32 * SIZE(X) movhps %xmm0, -30 * SIZE(X) addq $4 * SIZE, X ALIGN_3 .L138: testq $1, M je .L999 movsd -32 * SIZE(X), %xmm0 pshufd $0xb1, %xmm0, %xmm8 mulps %xmm14, %xmm0 mulps %xmm15, %xmm8 addps %xmm8, %xmm0 movlps %xmm0, -32 * SIZE(X) ALIGN_3 #endif .L999: xorq %rax, %rax RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zscal_sse2.S000066400000000000000000000773311313527062700175710ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 #define X ARG4 #define INCX ARG5 #else #define M ARG1 #define X ARG2 #define INCX ARG3 #endif #define XX %r10 #define FLAG %r11 #define I %rax #include "l1param.h" #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO) || defined(SANDYBRIDGE) #define USE_PSHUFD #else #define USE_PSHUFD_HALF #endif PROLOGUE PROFCODE #ifdef WINDOWS_ABI movaps %xmm3, %xmm0 movsd 40(%rsp), %xmm1 movq 48(%rsp), X movq 56(%rsp), INCX #endif SAVEREGISTERS salq $ZBASE_SHIFT, INCX xor FLAG, FLAG testq M, M jle .L999 pxor %xmm15, %xmm15 comisd %xmm0, %xmm15 jne .L100 comisd %xmm1, %xmm15 jne .L100 /* Alpha == ZERO */ cmpq $2 * SIZE, INCX jne .L20 /* INCX == 1 */ testq $SIZE, X je .L05 movsd %xmm15, 0 * SIZE(X) addq $SIZE, X movq $1, FLAG decq M jle .L19 ALIGN_3 .L05: movq M, I # rcx = n sarq $3, I jle .L12 ALIGN_4 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm15, 0 * SIZE(X) movaps %xmm15, 2 * SIZE(X) movaps %xmm15, 4 * SIZE(X) movaps %xmm15, 6 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm15, 8 * SIZE(X) movaps %xmm15, 10 * SIZE(X) movaps %xmm15, 12 * SIZE(X) movaps %xmm15, 14 * SIZE(X) addq $16 * SIZE, X decq I jg .L11 ALIGN_4 .L12: testq $4, M je .L13 movaps %xmm15, 0 * SIZE(X) movaps %xmm15, 2 * SIZE(X) movaps %xmm15, 4 * SIZE(X) movaps %xmm15, 6 * SIZE(X) addq $8 * SIZE, X ALIGN_3 .L13: testq $2, M je .L14 movaps %xmm15, 0 * SIZE(X) movaps %xmm15, 2 * SIZE(X) addq $4 * SIZE, X ALIGN_3 .L14: testq $1, M je .L19 movaps %xmm15, 0 * SIZE(X) addq $2 * SIZE, X ALIGN_3 .L19: testq $1, FLAG je .L999 movsd %xmm15, 0 * SIZE(X) jmp .L999 ALIGN_4 /* incx != 1 */ .L20: testq $SIZE, X jne .L30 /* Aligned Mode */ movq M, I # rcx = n sarq $2, I jle .L22 ALIGN_4 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm15, (X) addq INCX, X movaps %xmm15, (X) addq INCX, X movaps %xmm15, (X) addq INCX, X movaps %xmm15, (X) addq INCX, X decq I jg .L21 ALIGN_4 .L22: testq $3, M je .L999 testq $2, M je .L23 movaps %xmm15, (X) 
addq INCX, X movaps %xmm15, (X) addq INCX, X ALIGN_3 .L23: testq $1, M je .L999 movaps %xmm15, (X) jmp .L999 ALIGN_4 /* Unaligned Mode */ .L30: movq M, I # rcx = n sarq $2, I jle .L32 ALIGN_4 .L31: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movlps %xmm15, 0 * SIZE(X) movlps %xmm15, 1 * SIZE(X) addq INCX, X movlps %xmm15, 0 * SIZE(X) movlps %xmm15, 1 * SIZE(X) addq INCX, X movlps %xmm15, 0 * SIZE(X) movlps %xmm15, 1 * SIZE(X) addq INCX, X movlps %xmm15, 0 * SIZE(X) movlps %xmm15, 1 * SIZE(X) addq INCX, X decq I jg .L31 ALIGN_4 .L32: testq $3, M je .L999 testq $2, M je .L33 movlps %xmm15, 0 * SIZE(X) movlps %xmm15, 1 * SIZE(X) addq INCX, X movlps %xmm15, 0 * SIZE(X) movlps %xmm15, 1 * SIZE(X) addq INCX, X ALIGN_3 .L33: testq $1, M je .L999 movlps %xmm15, 0 * SIZE(X) movlps %xmm15, 1 * SIZE(X) jmp .L999 ALIGN_4 /* Alpha != ZERO */ .L100: testq $SIZE, X jne .L200 #ifdef HAVE_SSE3 movddup %xmm0, %xmm14 #else pshufd $0x44, %xmm0, %xmm14 #endif pxor %xmm15, %xmm15 subsd %xmm1, %xmm15 movlhps %xmm1, %xmm15 cmpq $2 * SIZE, INCX jne .L120 subq $-16 * SIZE, X movq M, I sarq $3, I jle .L115 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 movaps -8 * SIZE(X), %xmm4 movaps -6 * SIZE(X), %xmm5 movaps -4 * SIZE(X), %xmm6 movaps -2 * SIZE(X), %xmm7 decq I jle .L112 ALIGN_4 .L111: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) pshufd $0x4e, %xmm0, %xmm8 #else movsd -15 * SIZE(X), %xmm8 movhps -16 * SIZE(X), %xmm8 #endif mulpd %xmm14, %xmm0 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm0 movaps %xmm0, -16 * SIZE(X) movaps 0 * SIZE(X), %xmm0 #ifdef USE_PSHUFD pshufd $0x4e, %xmm1, %xmm8 #else movsd -13 * SIZE(X), %xmm8 movhps -14 * SIZE(X), %xmm8 #endif mulpd %xmm14, %xmm1 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm1 movaps %xmm1, -14 * SIZE(X) movaps 2 * SIZE(X), %xmm1 #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) pshufd $0x4e, %xmm2, %xmm8 #else movsd -11 * SIZE(X), %xmm8 movhps -12 * SIZE(X), %xmm8 #endif mulpd %xmm14, %xmm2 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm2 movaps %xmm2, -12 * SIZE(X) movaps 4 * SIZE(X), %xmm2 #ifdef USE_PSHUFD pshufd $0x4e, %xmm3, %xmm8 #else movsd -9 * SIZE(X), %xmm8 movhps -10 * SIZE(X), %xmm8 #endif mulpd %xmm14, %xmm3 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm3 movaps %xmm3, -10 * SIZE(X) movaps 6 * SIZE(X), %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) pshufd $0x4e, %xmm4, %xmm8 #else movsd -7 * SIZE(X), %xmm8 movhps -8 * SIZE(X), %xmm8 #endif mulpd %xmm14, %xmm4 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm4 movaps %xmm4, -8 * SIZE(X) movaps 8 * SIZE(X), %xmm4 #ifdef USE_PSHUFD pshufd $0x4e, %xmm5, %xmm8 #else movsd -5 * SIZE(X), %xmm8 movhps -6 * SIZE(X), %xmm8 #endif mulpd %xmm14, %xmm5 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm5 movaps %xmm5, -6 * SIZE(X) movaps 10 * SIZE(X), %xmm5 #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) pshufd $0x4e, %xmm6, %xmm8 #else movsd -3 * SIZE(X), %xmm8 movhps -4 * SIZE(X), %xmm8 #endif mulpd %xmm14, %xmm6 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm6 movaps %xmm6, -4 * SIZE(X) movaps 12 * SIZE(X), %xmm6 #ifdef USE_PSHUFD pshufd $0x4e, %xmm7, %xmm8 #else movsd -1 * SIZE(X), %xmm8 movhps -2 * SIZE(X), %xmm8 #endif mulpd %xmm14, %xmm7 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm7 movaps %xmm7, -2 * SIZE(X) movaps 14 * SIZE(X), %xmm7 subq $-16 * SIZE, X decq I jg .L111 ALIGN_4 .L112: pshufd $0x4e, %xmm0, %xmm8 mulpd %xmm14, %xmm0 mulpd 
%xmm15, %xmm8 addpd %xmm8, %xmm0 movaps %xmm0, -16 * SIZE(X) pshufd $0x4e, %xmm1, %xmm8 mulpd %xmm14, %xmm1 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm1 movaps %xmm1, -14 * SIZE(X) pshufd $0x4e, %xmm2, %xmm8 mulpd %xmm14, %xmm2 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm2 movaps %xmm2, -12 * SIZE(X) pshufd $0x4e, %xmm3, %xmm8 mulpd %xmm14, %xmm3 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm3 movaps %xmm3, -10 * SIZE(X) pshufd $0x4e, %xmm4, %xmm8 mulpd %xmm14, %xmm4 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm4 movaps %xmm4, -8 * SIZE(X) pshufd $0x4e, %xmm5, %xmm8 mulpd %xmm14, %xmm5 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm5 movaps %xmm5, -6 * SIZE(X) pshufd $0x4e, %xmm6, %xmm8 mulpd %xmm14, %xmm6 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm6 movaps %xmm6, -4 * SIZE(X) pshufd $0x4e, %xmm7, %xmm8 mulpd %xmm14, %xmm7 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm7 movaps %xmm7, -2 * SIZE(X) subq $-16 * SIZE, X ALIGN_3 .L115: testq $7, M je .L999 testq $4, M je .L116 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 pshufd $0x4e, %xmm0, %xmm8 mulpd %xmm14, %xmm0 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm0 movaps %xmm0, -16 * SIZE(X) pshufd $0x4e, %xmm1, %xmm8 mulpd %xmm14, %xmm1 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm1 movaps %xmm1, -14 * SIZE(X) movaps -12 * SIZE(X), %xmm2 movaps -10 * SIZE(X), %xmm3 pshufd $0x4e, %xmm2, %xmm8 mulpd %xmm14, %xmm2 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm2 movaps %xmm2, -12 * SIZE(X) pshufd $0x4e, %xmm3, %xmm8 mulpd %xmm14, %xmm3 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm3 movaps %xmm3, -10 * SIZE(X) addq $8 * SIZE, X ALIGN_3 .L116: testq $2, M je .L117 movaps -16 * SIZE(X), %xmm0 movaps -14 * SIZE(X), %xmm1 pshufd $0x4e, %xmm0, %xmm8 mulpd %xmm14, %xmm0 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm0 movaps %xmm0, -16 * SIZE(X) pshufd $0x4e, %xmm1, %xmm8 mulpd %xmm14, %xmm1 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm1 movaps %xmm1, -14 * SIZE(X) addq $4 * SIZE, X ALIGN_3 .L117: testq $1, M je .L999 movaps -16 * SIZE(X), %xmm0 pshufd $0x4e, %xmm0, %xmm8 mulpd %xmm14, %xmm0 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm0 movaps %xmm0, -16 * SIZE(X) jmp .L999 ALIGN_3 .L120: movq X, XX movq M, I sarq $3, I jle .L125 movaps (X), %xmm0 addq INCX, X movaps (X), %xmm1 addq INCX, X movaps (X), %xmm2 addq INCX, X movaps (X), %xmm3 addq INCX, X movaps (X), %xmm4 addq INCX, X movaps (X), %xmm5 addq INCX, X movaps (X), %xmm6 addq INCX, X movaps (X), %xmm7 addq INCX, X decq I jle .L122 ALIGN_4 .L121: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0x4e, %xmm0, %xmm8 mulpd %xmm14, %xmm0 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm0 movaps %xmm0, (XX) addq INCX, XX movaps (X), %xmm0 addq INCX, X pshufd $0x4e, %xmm1, %xmm8 mulpd %xmm14, %xmm1 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm1 movaps %xmm1, (XX) addq INCX, XX movaps (X), %xmm1 addq INCX, X pshufd $0x4e, %xmm2, %xmm8 mulpd %xmm14, %xmm2 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm2 movaps %xmm2, (XX) addq INCX, XX movaps (X), %xmm2 addq INCX, X pshufd $0x4e, %xmm3, %xmm8 mulpd %xmm14, %xmm3 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm3 movaps %xmm3, (XX) addq INCX, XX movaps (X), %xmm3 addq INCX, X #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0x4e, %xmm4, %xmm8 mulpd %xmm14, %xmm4 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm4 movaps %xmm4, (XX) addq INCX, XX movaps (X), %xmm4 addq INCX, X pshufd $0x4e, %xmm5, %xmm8 mulpd %xmm14, %xmm5 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm5 movaps %xmm5, (XX) addq INCX, XX movaps (X), %xmm5 addq INCX, X pshufd $0x4e, %xmm6, %xmm8 mulpd %xmm14, %xmm6 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm6 movaps %xmm6, (XX) addq INCX, XX movaps (X), %xmm6 
addq INCX, X pshufd $0x4e, %xmm7, %xmm8 mulpd %xmm14, %xmm7 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm7 movaps %xmm7, (XX) addq INCX, XX movaps (X), %xmm7 addq INCX, X decq I jg .L121 ALIGN_4 .L122: pshufd $0x4e, %xmm0, %xmm8 mulpd %xmm14, %xmm0 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm0 movaps %xmm0, (XX) addq INCX, XX pshufd $0x4e, %xmm1, %xmm8 mulpd %xmm14, %xmm1 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm1 movaps %xmm1, (XX) addq INCX, XX pshufd $0x4e, %xmm2, %xmm8 mulpd %xmm14, %xmm2 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm2 movaps %xmm2, (XX) addq INCX, XX pshufd $0x4e, %xmm3, %xmm8 mulpd %xmm14, %xmm3 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm3 movaps %xmm3, (XX) addq INCX, XX pshufd $0x4e, %xmm4, %xmm8 mulpd %xmm14, %xmm4 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm4 movaps %xmm4, (XX) addq INCX, XX pshufd $0x4e, %xmm5, %xmm8 mulpd %xmm14, %xmm5 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm5 movaps %xmm5, (XX) addq INCX, XX pshufd $0x4e, %xmm6, %xmm8 mulpd %xmm14, %xmm6 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm6 movaps %xmm6, (XX) addq INCX, XX pshufd $0x4e, %xmm7, %xmm8 mulpd %xmm14, %xmm7 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm7 movaps %xmm7, (XX) addq INCX, XX ALIGN_3 .L125: testq $7, M je .L999 testq $4, M je .L126 movaps (X), %xmm0 addq INCX, X movaps (X), %xmm1 addq INCX, X pshufd $0x4e, %xmm0, %xmm8 mulpd %xmm14, %xmm0 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm0 movaps %xmm0, (XX) addq INCX, XX pshufd $0x4e, %xmm1, %xmm8 mulpd %xmm14, %xmm1 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm1 movaps %xmm1, (XX) addq INCX, XX movaps (X), %xmm2 addq INCX, X movaps (X), %xmm3 addq INCX, X pshufd $0x4e, %xmm2, %xmm8 mulpd %xmm14, %xmm2 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm2 movaps %xmm2, (XX) addq INCX, XX pshufd $0x4e, %xmm3, %xmm8 mulpd %xmm14, %xmm3 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm3 movaps %xmm3, (XX) addq INCX, XX ALIGN_3 .L126: testq $2, M je .L127 movaps (X), %xmm0 addq INCX, X movaps (X), %xmm1 addq INCX, X pshufd $0x4e, %xmm0, %xmm8 mulpd %xmm14, %xmm0 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm0 movaps %xmm0, (XX) addq INCX, XX pshufd $0x4e, %xmm1, %xmm8 mulpd %xmm14, %xmm1 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm1 movaps %xmm1, (XX) addq INCX, XX ALIGN_3 .L127: testq $1, M je .L999 movaps (X), %xmm0 pshufd $0x4e, %xmm0, %xmm8 mulpd %xmm14, %xmm0 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm0 movaps %xmm0, (XX) jmp .L999 ALIGN_3 .L200: cmpq $2 * SIZE, INCX jne .L220 #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) movddup %xmm0, %xmm14 pxor %xmm15, %xmm15 subsd %xmm1, %xmm15 movlhps %xmm1, %xmm15 shufpd $1, %xmm15, %xmm15 movhps 0 * SIZE(X), %xmm0 movaps 1 * SIZE(X), %xmm1 subq $-16 * SIZE, X unpckhpd %xmm0, %xmm0 mulsd %xmm14, %xmm0 movaps %xmm1, %xmm8 mulsd %xmm15, %xmm8 subsd %xmm8, %xmm0 movlps %xmm0, -16 * SIZE(X) decq M movq M, I sarq $3, I jle .L205 movaps -13 * SIZE(X), %xmm2 movaps -11 * SIZE(X), %xmm3 movaps -9 * SIZE(X), %xmm4 decq I jle .L202 ALIGN_4 .L201: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps %xmm1, %xmm8 SHUFPD_1 %xmm2, %xmm0 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm0 addpd %xmm8, %xmm0 movaps %xmm0, -15 * SIZE(X) movaps -7 * SIZE(X), %xmm5 movaps %xmm2, %xmm8 SHUFPD_1 %xmm3, %xmm1 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm1 addpd %xmm8, %xmm1 movaps %xmm1, -13 * SIZE(X) movaps -5 * SIZE(X), %xmm6 movaps %xmm3, %xmm8 SHUFPD_1 %xmm4, %xmm2 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm2 addpd %xmm8, %xmm2 movaps %xmm2, -11 * SIZE(X) movaps -3 * SIZE(X), %xmm7 movaps %xmm4, %xmm8 SHUFPD_1 %xmm5, %xmm3 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm3 addpd %xmm8, %xmm3 movaps %xmm3, -9 * 
SIZE(X) movaps -1 * SIZE(X), %xmm0 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps %xmm5, %xmm8 SHUFPD_1 %xmm6, %xmm4 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm4 addpd %xmm8, %xmm4 movaps %xmm4, -7 * SIZE(X) movaps 1 * SIZE(X), %xmm1 movaps %xmm6, %xmm8 SHUFPD_1 %xmm7, %xmm5 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm5 addpd %xmm8, %xmm5 movaps %xmm5, -5 * SIZE(X) movaps 3 * SIZE(X), %xmm2 movaps %xmm7, %xmm8 SHUFPD_1 %xmm0, %xmm6 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm6 addpd %xmm8, %xmm6 movaps %xmm6, -3 * SIZE(X) movaps 5 * SIZE(X), %xmm3 movaps %xmm0, %xmm8 SHUFPD_1 %xmm1, %xmm7 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm7 addpd %xmm8, %xmm7 movaps %xmm7, -1 * SIZE(X) movaps 7 * SIZE(X), %xmm4 subq $-16 * SIZE, X decq I jg .L201 ALIGN_4 .L202: movaps %xmm1, %xmm8 SHUFPD_1 %xmm2, %xmm0 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm0 addpd %xmm8, %xmm0 movaps %xmm0, -15 * SIZE(X) movaps -7 * SIZE(X), %xmm5 movaps %xmm2, %xmm8 SHUFPD_1 %xmm3, %xmm1 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm1 addpd %xmm8, %xmm1 movaps %xmm1, -13 * SIZE(X) movaps -5 * SIZE(X), %xmm6 movaps %xmm3, %xmm8 SHUFPD_1 %xmm4, %xmm2 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm2 addpd %xmm8, %xmm2 movaps %xmm2, -11 * SIZE(X) movaps -3 * SIZE(X), %xmm7 movaps %xmm4, %xmm8 SHUFPD_1 %xmm5, %xmm3 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm3 addpd %xmm8, %xmm3 movaps %xmm3, -9 * SIZE(X) movaps -1 * SIZE(X), %xmm0 movaps %xmm5, %xmm8 SHUFPD_1 %xmm6, %xmm4 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm4 addpd %xmm8, %xmm4 movaps %xmm4, -7 * SIZE(X) movaps 1 * SIZE(X), %xmm1 movaps %xmm6, %xmm8 SHUFPD_1 %xmm7, %xmm5 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm5 addpd %xmm8, %xmm5 movaps %xmm5, -5 * SIZE(X) movaps %xmm7, %xmm8 SHUFPD_1 %xmm0, %xmm6 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm6 addpd %xmm8, %xmm6 movaps %xmm6, -3 * SIZE(X) movaps %xmm0, %xmm8 SHUFPD_1 %xmm1, %xmm7 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm7 addpd %xmm8, %xmm7 movaps %xmm7, -1 * SIZE(X) subq $-16 * SIZE, X ALIGN_3 .L205: testq $4, M je .L206 movaps -13 * SIZE(X), %xmm2 movaps %xmm1, %xmm8 SHUFPD_1 %xmm2, %xmm0 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm0 addpd %xmm8, %xmm0 movaps %xmm0, -15 * SIZE(X) movaps -11 * SIZE(X), %xmm3 movaps %xmm2, %xmm8 SHUFPD_1 %xmm3, %xmm1 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm1 addpd %xmm8, %xmm1 movaps %xmm1, -13 * SIZE(X) movaps -9 * SIZE(X), %xmm0 movaps %xmm3, %xmm8 SHUFPD_1 %xmm0, %xmm2 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm2 addpd %xmm8, %xmm2 movaps %xmm2, -11 * SIZE(X) movaps -7 * SIZE(X), %xmm1 movaps %xmm0, %xmm8 SHUFPD_1 %xmm1, %xmm3 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm3 addpd %xmm8, %xmm3 movaps %xmm3, -9 * SIZE(X) addq $8 * SIZE, X ALIGN_3 .L206: testq $2, M je .L207 movaps -13 * SIZE(X), %xmm2 movaps %xmm1, %xmm8 SHUFPD_1 %xmm2, %xmm0 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm0 addpd %xmm8, %xmm0 movaps %xmm0, -15 * SIZE(X) movaps -11 * SIZE(X), %xmm3 movaps %xmm2, %xmm8 SHUFPD_1 %xmm3, %xmm1 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm1 addpd %xmm8, %xmm1 movaps %xmm1, -13 * SIZE(X) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addq $4 * SIZE, X ALIGN_3 .L207: testq $1, M je .L208 movaps -13 * SIZE(X), %xmm2 movaps %xmm1, %xmm8 SHUFPD_1 %xmm2, %xmm0 mulpd %xmm14, %xmm8 mulpd %xmm15, %xmm0 addpd %xmm8, %xmm0 movaps %xmm0, -15 * SIZE(X) movaps %xmm1, %xmm0 movaps %xmm2, %xmm1 addq $2 * SIZE, X ALIGN_3 .L208: unpckhpd %xmm0, %xmm0 mulsd %xmm14, %xmm1 mulsd %xmm15, %xmm0 addsd %xmm1, %xmm0 movlps %xmm0, -15 * SIZE(X) jmp .L999 ALIGN_3 #else movddup %xmm0, %xmm14 pxor %xmm15, %xmm15 subsd %xmm1, %xmm15 movlhps %xmm1, %xmm15 subq $-16 * SIZE, X movq M, I 
sarq $3, I jle .L205 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 movsd -14 * SIZE(X), %xmm1 movhps -13 * SIZE(X), %xmm1 movsd -12 * SIZE(X), %xmm2 movhps -11 * SIZE(X), %xmm2 movsd -10 * SIZE(X), %xmm3 movhps -9 * SIZE(X), %xmm3 movsd -8 * SIZE(X), %xmm4 movhps -7 * SIZE(X), %xmm4 movsd -6 * SIZE(X), %xmm5 movhps -5 * SIZE(X), %xmm5 movsd -4 * SIZE(X), %xmm6 movhps -3 * SIZE(X), %xmm6 movsd -2 * SIZE(X), %xmm7 movhps -1 * SIZE(X), %xmm7 decq I jle .L202 ALIGN_4 .L201: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) pshufd $0x4e, %xmm0, %xmm8 #else movsd -15 * SIZE(X), %xmm8 movhps -16 * SIZE(X), %xmm8 #endif mulpd %xmm14, %xmm0 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm0 movlps %xmm0, -16 * SIZE(X) movhps %xmm0, -15 * SIZE(X) movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 #ifdef USE_PSHUFD pshufd $0x4e, %xmm1, %xmm8 #else movsd -13 * SIZE(X), %xmm8 movhps -14 * SIZE(X), %xmm8 #endif mulpd %xmm14, %xmm1 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm1 movlps %xmm1, -14 * SIZE(X) movhps %xmm1, -13 * SIZE(X) movsd 2 * SIZE(X), %xmm1 movhps 3 * SIZE(X), %xmm1 #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) pshufd $0x4e, %xmm2, %xmm8 #else movsd -11 * SIZE(X), %xmm8 movhps -12 * SIZE(X), %xmm8 #endif mulpd %xmm14, %xmm2 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm2 movlps %xmm2, -12 * SIZE(X) movhps %xmm2, -11 * SIZE(X) movsd 4 * SIZE(X), %xmm2 movhps 5 * SIZE(X), %xmm2 #ifdef USE_PSHUFD pshufd $0x4e, %xmm3, %xmm8 #else movsd -9 * SIZE(X), %xmm8 movhps -10 * SIZE(X), %xmm8 #endif mulpd %xmm14, %xmm3 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm3 movlps %xmm3, -10 * SIZE(X) movhps %xmm3, -9 * SIZE(X) movsd 6 * SIZE(X), %xmm3 movhps 7 * SIZE(X), %xmm3 #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) pshufd $0x4e, %xmm4, %xmm8 #else movsd -7 * SIZE(X), %xmm8 movhps -8 * SIZE(X), %xmm8 #endif mulpd %xmm14, %xmm4 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm4 movlps %xmm4, -8 * SIZE(X) movhps %xmm4, -7 * SIZE(X) movsd 8 * SIZE(X), %xmm4 movhps 9 * SIZE(X), %xmm4 #ifdef USE_PSHUFD pshufd $0x4e, %xmm5, %xmm8 #else movsd -5 * SIZE(X), %xmm8 movhps -6 * SIZE(X), %xmm8 #endif mulpd %xmm14, %xmm5 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm5 movlps %xmm5, -6 * SIZE(X) movhps %xmm5, -5 * SIZE(X) movsd 10 * SIZE(X), %xmm5 movhps 11 * SIZE(X), %xmm5 #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) pshufd $0x4e, %xmm6, %xmm8 #else movsd -3 * SIZE(X), %xmm8 movhps -4 * SIZE(X), %xmm8 #endif mulpd %xmm14, %xmm6 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm6 movlps %xmm6, -4 * SIZE(X) movhps %xmm6, -3 * SIZE(X) movsd 12 * SIZE(X), %xmm6 movhps 13 * SIZE(X), %xmm6 #ifdef USE_PSHUFD pshufd $0x4e, %xmm7, %xmm8 #else movsd -1 * SIZE(X), %xmm8 movhps -2 * SIZE(X), %xmm8 #endif mulpd %xmm14, %xmm7 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm7 movlps %xmm7, -2 * SIZE(X) movhps %xmm7, -1 * SIZE(X) movsd 14 * SIZE(X), %xmm7 movhps 15 * SIZE(X), %xmm7 subq $-16 * SIZE, X decq I jg .L201 ALIGN_4 .L202: pshufd $0x4e, %xmm0, %xmm8 mulpd %xmm14, %xmm0 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm0 movlps %xmm0, -16 * SIZE(X) movhps %xmm0, -15 * SIZE(X) pshufd $0x4e, %xmm1, %xmm8 mulpd %xmm14, %xmm1 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm1 movlps %xmm1, -14 * SIZE(X) movhps %xmm1, -13 * SIZE(X) pshufd $0x4e, %xmm2, %xmm8 mulpd %xmm14, %xmm2 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm2 movlps %xmm2, -12 * SIZE(X) movhps %xmm2, -11 * SIZE(X) pshufd $0x4e, %xmm3, %xmm8 mulpd %xmm14, %xmm3 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm3 movlps %xmm3, 
-10 * SIZE(X) movhps %xmm3, -9 * SIZE(X) pshufd $0x4e, %xmm4, %xmm8 mulpd %xmm14, %xmm4 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm4 movlps %xmm4, -8 * SIZE(X) movhps %xmm4, -7 * SIZE(X) pshufd $0x4e, %xmm5, %xmm8 mulpd %xmm14, %xmm5 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm5 movlps %xmm5, -6 * SIZE(X) movhps %xmm5, -5 * SIZE(X) pshufd $0x4e, %xmm6, %xmm8 mulpd %xmm14, %xmm6 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm6 movlps %xmm6, -4 * SIZE(X) movhps %xmm6, -3 * SIZE(X) pshufd $0x4e, %xmm7, %xmm8 mulpd %xmm14, %xmm7 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm7 movlps %xmm7, -2 * SIZE(X) movhps %xmm7, -1 * SIZE(X) subq $-16 * SIZE, X ALIGN_3 .L205: testq $7, M je .L999 testq $4, M je .L206 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 movsd -14 * SIZE(X), %xmm1 movhps -13 * SIZE(X), %xmm1 pshufd $0x4e, %xmm0, %xmm8 mulpd %xmm14, %xmm0 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm0 movlps %xmm0, -16 * SIZE(X) movhps %xmm0, -15 * SIZE(X) pshufd $0x4e, %xmm1, %xmm8 mulpd %xmm14, %xmm1 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm1 movlps %xmm1, -14 * SIZE(X) movhps %xmm1, -13 * SIZE(X) movsd -12 * SIZE(X), %xmm2 movhps -11 * SIZE(X), %xmm2 movsd -10 * SIZE(X), %xmm3 movhps -9 * SIZE(X), %xmm3 pshufd $0x4e, %xmm2, %xmm8 mulpd %xmm14, %xmm2 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm2 movlps %xmm2, -12 * SIZE(X) movhps %xmm2, -11 * SIZE(X) pshufd $0x4e, %xmm3, %xmm8 mulpd %xmm14, %xmm3 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm3 movlps %xmm3, -10 * SIZE(X) movhps %xmm3, -9 * SIZE(X) addq $8 * SIZE, X ALIGN_3 .L206: testq $2, M je .L207 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 pshufd $0x4e, %xmm0, %xmm8 mulpd %xmm14, %xmm0 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm0 movlps %xmm0, -16 * SIZE(X) movhps %xmm0, -15 * SIZE(X) movsd -14 * SIZE(X), %xmm1 movhps -13 * SIZE(X), %xmm1 pshufd $0x4e, %xmm1, %xmm8 mulpd %xmm14, %xmm1 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm1 movlps %xmm1, -14 * SIZE(X) movhps %xmm1, -13 * SIZE(X) addq $4 * SIZE, X ALIGN_3 .L207: testq $1, M je .L999 movsd -16 * SIZE(X), %xmm0 movhps -15 * SIZE(X), %xmm0 pshufd $0x4e, %xmm0, %xmm8 mulpd %xmm14, %xmm0 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm0 movlps %xmm0, -16 * SIZE(X) movhps %xmm0, -15 * SIZE(X) jmp .L999 ALIGN_3 #endif .L220: movddup %xmm0, %xmm14 pxor %xmm15, %xmm15 subsd %xmm1, %xmm15 movlhps %xmm1, %xmm15 movq X, XX movq M, I sarq $3, I jle .L225 movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 addq INCX, X movsd 0 * SIZE(X), %xmm1 movhps 1 * SIZE(X), %xmm1 addq INCX, X movsd 0 * SIZE(X), %xmm2 movhps 1 * SIZE(X), %xmm2 addq INCX, X movsd 0 * SIZE(X), %xmm3 movhps 1 * SIZE(X), %xmm3 addq INCX, X movsd 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addq INCX, X movsd 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addq INCX, X movsd 0 * SIZE(X), %xmm6 movhps 1 * SIZE(X), %xmm6 addq INCX, X movsd 0 * SIZE(X), %xmm7 movhps 1 * SIZE(X), %xmm7 addq INCX, X decq I jle .L222 ALIGN_4 .L221: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif pshufd $0x4e, %xmm0, %xmm8 mulpd %xmm14, %xmm0 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm0 movlps %xmm0, 0 * SIZE(XX) movhps %xmm0, 1 * SIZE(XX) addq INCX, XX movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 addq INCX, X pshufd $0x4e, %xmm1, %xmm8 mulpd %xmm14, %xmm1 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm1 movlps %xmm1, 0 * SIZE(XX) movhps %xmm1, 1 * SIZE(XX) addq INCX, XX movsd 0 * SIZE(X), %xmm1 movhps 1 * SIZE(X), %xmm1 addq INCX, X pshufd $0x4e, %xmm2, %xmm8 mulpd %xmm14, %xmm2 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm2 movlps %xmm2, 0 * SIZE(XX) movhps %xmm2, 1 * SIZE(XX) addq INCX, XX movsd 0 * SIZE(X), %xmm2 movhps 
1 * SIZE(X), %xmm2 addq INCX, X pshufd $0x4e, %xmm3, %xmm8 mulpd %xmm14, %xmm3 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm3 movlps %xmm3, 0 * SIZE(XX) movhps %xmm3, 1 * SIZE(XX) addq INCX, XX movsd 0 * SIZE(X), %xmm3 movhps 1 * SIZE(X), %xmm3 addq INCX, X #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif pshufd $0x4e, %xmm4, %xmm8 mulpd %xmm14, %xmm4 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm4 movlps %xmm4, 0 * SIZE(XX) movhps %xmm4, 1 * SIZE(XX) addq INCX, XX movsd 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 addq INCX, X pshufd $0x4e, %xmm5, %xmm8 mulpd %xmm14, %xmm5 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm5 movlps %xmm5, 0 * SIZE(XX) movhps %xmm5, 1 * SIZE(XX) addq INCX, XX movsd 0 * SIZE(X), %xmm5 movhps 1 * SIZE(X), %xmm5 addq INCX, X pshufd $0x4e, %xmm6, %xmm8 mulpd %xmm14, %xmm6 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm6 movlps %xmm6, 0 * SIZE(XX) movhps %xmm6, 1 * SIZE(XX) addq INCX, XX movsd 0 * SIZE(X), %xmm6 movhps 1 * SIZE(X), %xmm6 addq INCX, X pshufd $0x4e, %xmm7, %xmm8 mulpd %xmm14, %xmm7 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm7 movlps %xmm7, 0 * SIZE(XX) movhps %xmm7, 1 * SIZE(XX) addq INCX, XX movsd 0 * SIZE(X), %xmm7 movhps 1 * SIZE(X), %xmm7 addq INCX, X decq I jg .L221 ALIGN_4 .L222: pshufd $0x4e, %xmm0, %xmm8 mulpd %xmm14, %xmm0 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm0 movlps %xmm0, 0 * SIZE(XX) movhps %xmm0, 1 * SIZE(XX) addq INCX, XX pshufd $0x4e, %xmm1, %xmm8 mulpd %xmm14, %xmm1 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm1 movlps %xmm1, 0 * SIZE(XX) movhps %xmm1, 1 * SIZE(XX) addq INCX, XX pshufd $0x4e, %xmm2, %xmm8 mulpd %xmm14, %xmm2 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm2 movlps %xmm2, 0 * SIZE(XX) movhps %xmm2, 1 * SIZE(XX) addq INCX, XX pshufd $0x4e, %xmm3, %xmm8 mulpd %xmm14, %xmm3 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm3 movlps %xmm3, 0 * SIZE(XX) movhps %xmm3, 1 * SIZE(XX) addq INCX, XX pshufd $0x4e, %xmm4, %xmm8 mulpd %xmm14, %xmm4 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm4 movlps %xmm4, 0 * SIZE(XX) movhps %xmm4, 1 * SIZE(XX) addq INCX, XX pshufd $0x4e, %xmm5, %xmm8 mulpd %xmm14, %xmm5 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm5 movlps %xmm5, 0 * SIZE(XX) movhps %xmm5, 1 * SIZE(XX) addq INCX, XX pshufd $0x4e, %xmm6, %xmm8 mulpd %xmm14, %xmm6 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm6 movlps %xmm6, 0 * SIZE(XX) movhps %xmm6, 1 * SIZE(XX) addq INCX, XX pshufd $0x4e, %xmm7, %xmm8 mulpd %xmm14, %xmm7 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm7 movlps %xmm7, 0 * SIZE(XX) movhps %xmm7, 1 * SIZE(XX) addq INCX, XX ALIGN_3 .L225: testq $7, M je .L999 testq $4, M je .L226 movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 addq INCX, X pshufd $0x4e, %xmm0, %xmm8 mulpd %xmm14, %xmm0 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm0 movlps %xmm0, 0 * SIZE(XX) movhps %xmm0, 1 * SIZE(XX) addq INCX, XX movsd 0 * SIZE(X), %xmm1 movhps 1 * SIZE(X), %xmm1 addq INCX, X pshufd $0x4e, %xmm1, %xmm8 mulpd %xmm14, %xmm1 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm1 movlps %xmm1, 0 * SIZE(XX) movhps %xmm1, 1 * SIZE(XX) addq INCX, XX movsd 0 * SIZE(X), %xmm2 movhps 1 * SIZE(X), %xmm2 addq INCX, X pshufd $0x4e, %xmm2, %xmm8 mulpd %xmm14, %xmm2 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm2 movlps %xmm2, 0 * SIZE(XX) movhps %xmm2, 1 * SIZE(XX) addq INCX, XX movsd 0 * SIZE(X), %xmm3 movhps 1 * SIZE(X), %xmm3 addq INCX, X pshufd $0x4e, %xmm3, %xmm8 mulpd %xmm14, %xmm3 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm3 movlps %xmm3, 0 * SIZE(XX) movhps %xmm3, 1 * SIZE(XX) addq INCX, XX ALIGN_3 .L226: testq $2, M je .L227 movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 addq INCX, X pshufd $0x4e, %xmm0, %xmm8 mulpd %xmm14, %xmm0 mulpd %xmm15, %xmm8 
addpd %xmm8, %xmm0 movlps %xmm0, 0 * SIZE(XX) movhps %xmm0, 1 * SIZE(XX) addq INCX, XX movsd 0 * SIZE(X), %xmm1 movhps 1 * SIZE(X), %xmm1 addq INCX, X pshufd $0x4e, %xmm1, %xmm8 mulpd %xmm14, %xmm1 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm1 movlps %xmm1, 0 * SIZE(XX) movhps %xmm1, 1 * SIZE(XX) addq INCX, XX ALIGN_3 .L227: testq $1, M je .L999 movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 pshufd $0x4e, %xmm0, %xmm8 mulpd %xmm14, %xmm0 mulpd %xmm15, %xmm8 addpd %xmm8, %xmm0 movlps %xmm0, 0 * SIZE(XX) movhps %xmm0, 1 * SIZE(XX) ALIGN_3 .L999: xorq %rax, %rax RESTOREREGISTERS ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zswap.S000066400000000000000000000223361313527062700166600ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define N ARG1 /* rdi */ #define X ARG4 #define INCX ARG5 #define Y ARG6 #define INCY ARG2 #else #define N ARG1 #define X ARG2 #define INCX ARG3 #define Y ARG4 #define INCY %rbx #endif #define XX %r10 #define YY %r11 #include "l1param.h" PROLOGUE PROFCODE #ifndef WINDOWS_ABI #ifndef XDOUBLE movq 8(%rsp), INCY #else movq 40(%rsp), INCY #endif #else pushq %rbx movq 56(%rsp), X movq 64(%rsp), INCX movq 72(%rsp), Y movq 80(%rsp), INCY #endif EMMS salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY cmpq $2 * SIZE, INCX jne .L14 cmpq $2 * SIZE, INCY jne .L14 movq N, %rax sarq $2, %rax jle .L15 ALIGN_3 .L16: #ifdef XDOUBLE movq 0(X), %mm0 movq 8(X), %mm1 movq 16(X), %mm2 movq 24(X), %mm3 movq 0(Y), %mm4 movq 8(Y), %mm5 movq 16(Y), %mm6 movq 24(Y), %mm7 movq %mm4, 0(X) movq %mm5, 8(X) movq %mm6, 16(X) movq %mm7, 24(X) movq %mm0, 0(Y) movq %mm1, 8(Y) movq %mm2, 16(Y) movq %mm3, 24(Y) movq 32(X), %mm0 movq 40(X), %mm1 movq 48(X), %mm2 movq 56(X), %mm3 movq 32(Y), %mm4 movq 40(Y), %mm5 movq 48(Y), %mm6 movq 56(Y), %mm7 movq %mm4, 32(X) movq %mm5, 40(X) movq %mm6, 48(X) movq %mm7, 56(X) movq %mm0, 32(Y) movq %mm1, 40(Y) movq %mm2, 48(Y) movq %mm3, 56(Y) movq 64(X), %mm0 movq 72(X), %mm1 movq 80(X), %mm2 movq 88(X), %mm3 movq 64(Y), %mm4 movq 72(Y), %mm5 movq 80(Y), %mm6 movq 88(Y), %mm7 movq %mm4, 64(X) movq %mm5, 72(X) movq %mm6, 80(X) movq %mm7, 88(X) movq %mm0, 64(Y) movq %mm1, 72(Y) movq %mm2, 80(Y) movq %mm3, 88(Y) movq 96(X), %mm0 movq 104(X), %mm1 movq 112(X), %mm2 movq 120(X), %mm3 movq 96(Y), %mm4 movq 104(Y), %mm5 movq 112(Y), %mm6 movq 120(Y), %mm7 movq %mm4, 96(X) movq %mm5, 104(X) movq %mm6, 112(X) movq %mm7, 120(X) movq %mm0, 96(Y) movq %mm1, 104(Y) movq %mm2, 112(Y) movq %mm3, 120(Y) #elif defined(DOUBLE) prefetchw PREFETCHSIZE * SIZE(X) MOVQ 0 * SIZE(X), %mm0 MOVQ 1 * SIZE(X), %mm1 MOVQ 2 * SIZE(X), %mm2 MOVQ 3 * SIZE(X), %mm3 prefetchw PREFETCHSIZE * SIZE(Y) MOVQ 0 * SIZE(Y), %mm4 MOVQ 1 * SIZE(Y), %mm5 MOVQ 2 * SIZE(Y), %mm6 MOVQ 3 * SIZE(Y), %mm7 MOVQ %mm4, 0 * SIZE(X) MOVQ %mm5, 1 * SIZE(X) MOVQ %mm6, 2 * SIZE(X) MOVQ %mm7, 3 * SIZE(X) MOVQ %mm0, 0 * SIZE(Y) MOVQ %mm1, 1 * SIZE(Y) MOVQ %mm2, 2 * SIZE(Y) MOVQ %mm3, 3 * SIZE(Y) MOVQ 4 * SIZE(X), %mm0 MOVQ 5 * SIZE(X), %mm1 MOVQ 6 * SIZE(X), %mm2 MOVQ 7 * SIZE(X), %mm3 MOVQ 4 * SIZE(Y), %mm4 MOVQ 5 * SIZE(Y), %mm5 MOVQ 6 * SIZE(Y), %mm6 MOVQ 7 * SIZE(Y), %mm7 MOVQ %mm4, 4 * SIZE(X) MOVQ %mm5, 5 * SIZE(X) MOVQ %mm6, 6 * SIZE(X) MOVQ %mm7, 7 * SIZE(X) MOVQ %mm0, 4 * SIZE(Y) MOVQ %mm1, 5 * SIZE(Y) MOVQ %mm2, 6 * SIZE(Y) MOVQ %mm3, 7 * SIZE(Y) #else #ifdef OPTERON prefetchw PREFETCHSIZE * SIZE(X) #endif movq 0 * SIZE(X), %mm0 movq 2 * SIZE(X), %mm1 movq 4 * SIZE(X), %mm2 movq 6 * SIZE(X), %mm3 movq 0 * SIZE(Y), %mm4 movq 2 * SIZE(Y), %mm5 movq 4 * SIZE(Y), %mm6 movq 6 * SIZE(Y), %mm7 #ifdef OPTERON prefetchw PREFETCHSIZE * SIZE(Y) #endif movq %mm4, 0 * SIZE(X) movq %mm5, 2 * SIZE(X) movq %mm6, 4 * SIZE(X) movq %mm7, 6 * SIZE(X) movq %mm0, 0 * SIZE(Y) movq %mm1, 2 * SIZE(Y) movq %mm2, 4 * SIZE(Y) movq %mm3, 6 * SIZE(Y) #endif addq $8 * SIZE, X addq $8 * SIZE, Y decq %rax jg .L16 ALIGN_3 .L15: movq N, %rax andq $3, %rax jle .L27 ALIGN_3 .L22: #ifdef XDOUBLE movq 0(X), %mm0 movq 8(X), %mm1 movq 16(X), %mm2 movq 24(X), %mm3 movq 0(Y), %mm4 movq 8(Y), %mm5 movq 16(Y), %mm6 movq 24(Y), %mm7 movq %mm4, 0(X) movq %mm5, 8(X) movq %mm6, 16(X) movq %mm7, 24(X) movq %mm0, 0(Y) movq %mm1, 8(Y) movq %mm2, 16(Y) movq %mm3, 
24(Y) #elif defined(DOUBLE) movq 0 * SIZE(X), %mm0 movq 1 * SIZE(X), %mm1 movq 0 * SIZE(Y), %mm4 movq 1 * SIZE(Y), %mm5 movq %mm4, 0 * SIZE(X) movq %mm5, 1 * SIZE(X) movq %mm0, 0 * SIZE(Y) movq %mm1, 1 * SIZE(Y) #else movq 0 * SIZE(X), %mm0 movq 0 * SIZE(Y), %mm4 movq %mm4, 0 * SIZE(X) movq %mm0, 0 * SIZE(Y) #endif addq $2 * SIZE, X addq $2 * SIZE, Y decq %rax jg .L22 jmp .L27 ALIGN_3 /* INCX != 1 or INCY != 1 */ .L14: movq N, %rax movq X, XX movq Y, YY sarq $1, %rax jle .L28 ALIGN_2 .L29: #ifdef XDOUBLE movq 0(X), %mm0 movq 8(X), %mm1 movq 16(X), %mm2 movq 24(X), %mm3 addq INCX, X movq 0(Y), %mm4 movq 8(Y), %mm5 movq 16(Y), %mm6 movq 24(Y), %mm7 addq INCY, Y movq %mm4, 0(XX) movq %mm5, 8(XX) movq %mm6, 16(XX) movq %mm7, 24(XX) addq INCX, XX movq %mm0, 0(YY) movq %mm1, 8(YY) movq %mm2, 16(YY) movq %mm3, 24(YY) addq INCY, YY movq 0(X), %mm0 movq 8(X), %mm1 movq 16(X), %mm2 movq 24(X), %mm3 addq INCX, X movq 0(Y), %mm4 movq 8(Y), %mm5 movq 16(Y), %mm6 movq 24(Y), %mm7 addq INCY, Y movq %mm4, 0(XX) movq %mm5, 8(XX) movq %mm6, 16(XX) movq %mm7, 24(XX) addq INCX, XX movq %mm0, 0(YY) movq %mm1, 8(YY) movq %mm2, 16(YY) movq %mm3, 24(YY) addq INCY, YY #elif defined(DOUBLE) movq 0 * SIZE(X), %mm0 movq 1 * SIZE(X), %mm1 addq INCX, X movq 0 * SIZE(X), %mm2 movq 1 * SIZE(X), %mm3 addq INCX, X movq 0 * SIZE(Y), %mm4 movq 1 * SIZE(Y), %mm5 addq INCY, Y movq 0 * SIZE(Y), %mm6 movq 1 * SIZE(Y), %mm7 addq INCY, Y movq %mm4, 0 * SIZE(XX) movq %mm5, 1 * SIZE(XX) addq INCX, XX movq %mm6, 0 * SIZE(XX) movq %mm7, 1 * SIZE(XX) addq INCX, XX movq %mm0, 0 * SIZE(YY) movq %mm1, 1 * SIZE(YY) addq INCY, YY movq %mm2, 0 * SIZE(YY) movq %mm3, 1 * SIZE(YY) addq INCY, YY #else movq 0 * SIZE(X), %mm0 addq INCX, X movq 0 * SIZE(X), %mm2 addq INCX, X movq 0 * SIZE(Y), %mm4 addq INCY, Y movq 0 * SIZE(Y), %mm6 addq INCY, Y movq %mm4, 0 * SIZE(XX) addq INCX, XX movq %mm6, 0 * SIZE(XX) addq INCX, XX movq %mm0, 0 * SIZE(YY) addq INCY, YY movq %mm2, 0 * SIZE(YY) addq INCY, YY #endif decq %rax jg .L29 ALIGN_3 .L28: movq N, %rax andq $1, %rax jle .L27 ALIGN_3 .L35: #ifdef XDOUBLE movq 0(X), %mm0 movq 8(X), %mm1 movq 16(X), %mm2 movq 24(X), %mm3 movq 0(Y), %mm4 movq 8(Y), %mm5 movq 16(Y), %mm6 movq 24(Y), %mm7 movq %mm4, 0(X) movq %mm5, 8(X) movq %mm6, 16(X) movq %mm7, 24(X) movq %mm0, 0(Y) movq %mm1, 8(Y) movq %mm2, 16(Y) movq %mm3, 24(Y) #elif defined(DOUBLE) movq 0 * SIZE(X), %mm0 movq 1 * SIZE(X), %mm1 movq 0 * SIZE(Y), %mm4 movq 1 * SIZE(Y), %mm5 movq %mm4, 0 * SIZE(X) movq %mm5, 1 * SIZE(X) movq %mm0, 0 * SIZE(Y) movq %mm1, 1 * SIZE(Y) #else movq 0 * SIZE(X), %mm0 movq 0 * SIZE(Y), %mm4 movq %mm4, 0 * SIZE(X) movq %mm0, 0 * SIZE(Y) #endif addq INCX, X addq INCY, Y decq %rax jg .L35 ALIGN_3 .L27: EMMS xorq %rax,%rax #ifdef WINDOWS_ABI popq %rbx #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zswap_sse.S000066400000000000000000000502331313527062700175270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ #define X ARG4 #define INCX ARG5 #define Y ARG6 #define INCY ARG2 #else #define M ARG1 #define X ARG2 #define INCX ARG3 #define Y ARG4 #define INCY %rbx #endif #include "l1param.h" PROLOGUE PROFCODE #ifndef WINDOWS_ABI movq 8(%rsp), INCY #else pushq %rbx movq 56(%rsp), X movq 64(%rsp), INCX movq 72(%rsp), Y movq 80(%rsp), INCY #endif SAVEREGISTERS salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY testq M, M jle .L19 cmpq $2 * SIZE, INCX jne .L50 cmpq $2 * SIZE, INCY jne .L50 addq M, M subq $-32 * SIZE, X subq $-32 * SIZE, Y cmpq $3, M jle .L16 testq $SIZE, Y je .L05 movss -32 * SIZE(X), %xmm0 movss -32 * SIZE(Y), %xmm1 movss %xmm1, -32 * SIZE(X) movss %xmm0, -32 * SIZE(Y) addq $1 * SIZE, X addq $1 * SIZE, Y decq M ALIGN_3 .L05: testq $2 * SIZE, Y je .L10 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm1 movlps %xmm1, -32 * SIZE(X) movlps %xmm0, -32 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y subq $2, M jle .L19 ALIGN_3 .L10: cmpq $3, M jle .L16 testq $2 * SIZE, X jne .L30 testq $1 * SIZE, X jne .L20 movq M, %rax sarq $5, %rax jle .L13 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -32 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -32 * SIZE(X) movaps -28 * SIZE(X), %xmm0 movaps -28 * SIZE(Y), %xmm1 movaps %xmm0, -28 * SIZE(Y) movaps %xmm1, -28 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -24 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movaps %xmm0, -24 * SIZE(Y) movaps %xmm1, -24 * SIZE(X) movaps -20 * SIZE(X), %xmm0 movaps -20 * SIZE(Y), %xmm1 movaps %xmm0, -20 * SIZE(Y) movaps %xmm1, -20 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -12 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 movaps %xmm0, -12 * SIZE(Y) movaps %xmm1, -12 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -8 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), 
%xmm1 movaps %xmm0, -8 * SIZE(Y) movaps %xmm1, -8 * SIZE(X) movaps -4 * SIZE(X), %xmm0 movaps -4 * SIZE(Y), %xmm1 movaps %xmm0, -4 * SIZE(Y) movaps %xmm1, -4 * SIZE(X) subq $-32 * SIZE, Y subq $-32 * SIZE, X decq %rax jg .L11 ALIGN_3 .L13: testq $16, M jle .L14 movaps -32 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -32 * SIZE(X) movaps -28 * SIZE(X), %xmm0 movaps -28 * SIZE(Y), %xmm1 movaps %xmm0, -28 * SIZE(Y) movaps %xmm1, -28 * SIZE(X) movaps -24 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movaps %xmm0, -24 * SIZE(Y) movaps %xmm1, -24 * SIZE(X) movaps -20 * SIZE(X), %xmm0 movaps -20 * SIZE(Y), %xmm1 movaps %xmm0, -20 * SIZE(Y) movaps %xmm1, -20 * SIZE(X) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L14: testq $8, M jle .L15 movaps -32 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -32 * SIZE(X) movaps -28 * SIZE(X), %xmm0 movaps -28 * SIZE(Y), %xmm1 movaps %xmm0, -28 * SIZE(Y) movaps %xmm1, -28 * SIZE(X) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L15: testq $4, M jle .L16 movaps -32 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movaps %xmm0, -32 * SIZE(Y) movaps %xmm1, -32 * SIZE(X) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L16: testq $2, M jle .L17 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm1 movlps %xmm1, -32 * SIZE(X) addq $2 * SIZE, X movlps %xmm0, -32 * SIZE(Y) addq $2 * SIZE, Y ALIGN_3 .L17: testq $1, M jle .L19 movss -32 * SIZE(X), %xmm0 movss -32 * SIZE(Y), %xmm1 movss %xmm1, -32 * SIZE(X) movss %xmm0, -32 * SIZE(Y) ALIGN_3 .L19: xorq %rax,%rax RESTOREREGISTERS #ifdef WINDOWS_ABI popq %rbx #endif ret ALIGN_3 .L20: movaps -33 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movss %xmm1, -32 * SIZE(X) pshufd $0x39, %xmm1, %xmm3 movlps %xmm3, -31 * SIZE(X) subq $3, M movq M, %rax sarq $5, %rax jle .L23 ALIGN_4 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -29 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -29 * SIZE(X) movaps -25 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -25 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -21 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -24 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -21 * SIZE(X) movaps -17 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -20 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -17 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -13 * SIZE(X), %xmm2 movaps -12 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -16 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -13 * SIZE(X) movaps -9 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -12 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -9 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -5 * SIZE(X), %xmm2 movaps -4 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -8 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -5 * SIZE(X) movaps -1 
* SIZE(X), %xmm0 movaps 0 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -4 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -1 * SIZE(X) subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L23: testq $16, M jle .L24 movaps -29 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -29 * SIZE(X) movaps -25 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -25 * SIZE(X) movaps -21 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -24 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -21 * SIZE(X) movaps -17 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -20 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -17 * SIZE(X) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L24: testq $8, M jle .L25 movaps -29 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -29 * SIZE(X) movaps -25 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x39, %xmm2, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x93, %xmm1, %xmm3 movaps %xmm3, -25 * SIZE(X) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L25: testq $4, M jle .L26 movaps -29 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x39, %xmm0, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x93, %xmm3, %xmm1 movaps %xmm1, -29 * SIZE(X) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L26: pshufd $0x39, %xmm0, %xmm2 pshufd $0xff, %xmm0, %xmm0 movlps %xmm2, -32 * SIZE(Y) movss %xmm0, -30 * SIZE(Y) testq $2, M jle .L27 movsd -29 * SIZE(X), %xmm0 movsd -29 * SIZE(Y), %xmm1 movlps %xmm0, -29 * SIZE(Y) movlps %xmm1, -29 * SIZE(X) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L27: testq $1, M jle .L29 movss -29 * SIZE(X), %xmm0 movss -29 * SIZE(Y), %xmm1 movss %xmm0, -29 * SIZE(Y) movss %xmm1, -29 * SIZE(X) ALIGN_3 .L29: xorq %rax,%rax RESTOREREGISTERS #ifdef WINDOWS_ABI popq %rbx #endif ret ALIGN_3 .L30: testq $1 * SIZE, X jne .L40 movhps -32 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movlps %xmm1, -32 * SIZE(X) subq $2, M movq M, %rax sarq $5, %rax jle .L33 ALIGN_4 .L31: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -30 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -30 * SIZE(X) movaps -26 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -26 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -22 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -24 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -22 * SIZE(X) movaps -18 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -20 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -18 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -14 * SIZE(X), %xmm2 movaps -12 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps 
%xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -14 * SIZE(X) movaps -10 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -12 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -10 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -6 * SIZE(X), %xmm2 movaps -4 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -8 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -6 * SIZE(X) movaps -2 * SIZE(X), %xmm0 movaps 0 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -4 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -2 * SIZE(X) subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L31 ALIGN_3 .L33: testq $16, M jle .L34 movaps -30 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -30 * SIZE(X) movaps -26 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -26 * SIZE(X) movaps -22 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -24 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -22 * SIZE(X) movaps -18 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -20 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -18 * SIZE(X) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L34: testq $8, M jle .L35 movaps -30 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -30 * SIZE(X) movaps -26 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -26 * SIZE(X) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L35: testq $4, M jle .L36 movaps -30 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -30 * SIZE(X) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L36: movhps %xmm0, -32 * SIZE(Y) testq $2, M jle .L37 movsd -30 * SIZE(X), %xmm0 movsd -30 * SIZE(Y), %xmm1 movlps %xmm0, -30 * SIZE(Y) movlps %xmm1, -30 * SIZE(X) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L37: testq $1, M jle .L39 movss -30 * SIZE(X), %xmm0 movss -30 * SIZE(Y), %xmm1 movss %xmm0, -30 * SIZE(Y) movss %xmm1, -30 * SIZE(X) ALIGN_3 .L39: xorq %rax,%rax RESTOREREGISTERS #ifdef WINDOWS_ABI popq %rbx #endif ret ALIGN_3 .L40: movaps -35 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 movss %xmm1, -32 * SIZE(X) subq $3, M movq M, %rax sarq $5, %rax jle .L43 ALIGN_4 .L41: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -31 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -31 * SIZE(X) movaps -27 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -27 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -23 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -24 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -23 * SIZE(X) movaps -19 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -20 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -19 * SIZE(X) 
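/* Second half of the 32-float unrolled block.  On this path X is offset
   from 16-byte alignment by three floats while Y is aligned, so each
   aligned movaps load is recombined with its neighbour via movss + shufps
   before being stored back.  PREFETCHW, PREFETCHSIZE and PREOFFSET are
   presumably supplied per core by the included l1param.h; the prefetch
   hints are emitted only when PREFETCHW is defined. */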
#ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -15 * SIZE(X), %xmm2 movaps -12 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -15 * SIZE(X) movaps -11 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -12 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -11 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -7 * SIZE(X), %xmm2 movaps -4 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -8 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -7 * SIZE(X) movaps -3 * SIZE(X), %xmm0 movaps 0 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -4 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -3 * SIZE(X) subq $-32 * SIZE, X subq $-32 * SIZE, Y decq %rax jg .L41 ALIGN_3 .L43: testq $16, M jle .L44 movaps -31 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -31 * SIZE(X) movaps -27 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -27 * SIZE(X) movaps -23 * SIZE(X), %xmm2 movaps -20 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -24 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -23 * SIZE(X) movaps -19 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -20 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -19 * SIZE(X) addq $16 * SIZE, X addq $16 * SIZE, Y ALIGN_3 .L44: testq $8, M jle .L45 movaps -31 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -31 * SIZE(X) movaps -27 * SIZE(X), %xmm0 movaps -24 * SIZE(Y), %xmm1 movss %xmm0, %xmm2 shufps $0x93, %xmm0, %xmm2 movaps %xmm2, -28 * SIZE(Y) movss %xmm1, %xmm3 shufps $0x39, %xmm3, %xmm3 movaps %xmm3, -27 * SIZE(X) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L45: testq $4, M jle .L46 movaps -31 * SIZE(X), %xmm2 movaps -28 * SIZE(Y), %xmm3 movss %xmm2, %xmm0 shufps $0x93, %xmm2, %xmm0 movaps %xmm0, -32 * SIZE(Y) movss %xmm3, %xmm1 shufps $0x39, %xmm1, %xmm1 movaps %xmm1, -31 * SIZE(X) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L46: movsd -31 * SIZE(X), %xmm2 pshufd $0x39, %xmm1, %xmm1 movlps %xmm1, -31 * SIZE(X) pshufd $0xff, %xmm0, %xmm0 movss %xmm0, -32 * SIZE(Y) movlps %xmm2, -31 * SIZE(Y) addq $3 * SIZE, X addq $3 * SIZE, Y testq $2, M jle .L47 movsd -32 * SIZE(X), %xmm0 movsd -32 * SIZE(Y), %xmm1 movlps %xmm0, -32 * SIZE(Y) movlps %xmm1, -32 * SIZE(X) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L47: testq $1, M jle .L49 movss -32 * SIZE(X), %xmm0 movss -32 * SIZE(Y), %xmm1 movss %xmm0, -32 * SIZE(Y) movss %xmm1, -32 * SIZE(X) ALIGN_3 .L49: xorq %rax,%rax RESTOREREGISTERS #ifdef WINDOWS_ABI popq %rbx #endif ret ALIGN_3 .L50: movq M, %rax sarq $2, %rax jle .L55 ALIGN_3 .L51: movsd (X), %xmm0 movsd (Y), %xmm1 movlps %xmm1, (X) addq INCX, X movlps %xmm0, (Y) addq INCY, Y movsd (X), %xmm0 movsd (Y), %xmm1 movlps %xmm1, (X) addq INCX, X movlps %xmm0, (Y) addq INCY, Y 
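/* (.L51, continued) generic strided path: INCX and INCY have already been
   scaled to byte strides, each movsd/movlps pair moves one real/imag pair
   as a single 8-byte transfer, and the loop swaps four complex elements of
   X and Y per pass; .L56 below mops up the M mod 4 leftovers. */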
movsd (X), %xmm0 movsd (Y), %xmm1 movlps %xmm1, (X) addq INCX, X movlps %xmm0, (Y) addq INCY, Y movsd (X), %xmm0 movsd (Y), %xmm1 movlps %xmm1, (X) addq INCX, X movlps %xmm0, (Y) addq INCY, Y decq %rax jg .L51 ALIGN_3 .L55: movq M, %rax andq $3, %rax jle .L57 ALIGN_3 .L56: movsd (X), %xmm0 movsd (Y), %xmm1 movlps %xmm1, (X) addq INCX, X movlps %xmm0, (Y) addq INCY, Y decq %rax jg .L56 ALIGN_3 .L57: xorq %rax, %rax RESTOREREGISTERS #ifdef WINDOWS_ABI popq %rbx #endif ret ALIGN_3 EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zswap_sse2.S000066400000000000000000000433131313527062700176120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ #define X ARG4 #define INCX ARG5 #define Y ARG6 #define INCY ARG2 #else #define M ARG1 #define X ARG2 #define INCX ARG3 #define Y ARG4 #define INCY %rbx #endif #include "l1param.h" PROLOGUE PROFCODE #ifndef WINDOWS_ABI movq 8(%rsp), INCY #else pushq %rbx movq 56(%rsp), X movq 64(%rsp), INCX movq 72(%rsp), Y movq 80(%rsp), INCY #endif SAVEREGISTERS salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY testq M, M jle .L19 cmpq $2 * SIZE, INCX jne .L50 cmpq $2 * SIZE, INCY jne .L50 subq $-16 * SIZE, X subq $-16 * SIZE, Y testq $SIZE, Y jne .L30 testq $SIZE, X jne .L20 movq M, %rax sarq $3, %rax jle .L13 ALIGN_3 .L11: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -14 * SIZE(X), %xmm0 movaps -14 * SIZE(Y), %xmm1 movaps %xmm0, -14 * SIZE(Y) movaps %xmm1, -14 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -12 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 movaps %xmm0, -12 * SIZE(Y) movaps %xmm1, -12 * SIZE(X) movaps -10 * SIZE(X), %xmm0 movaps -10 * SIZE(Y), %xmm1 movaps %xmm0, -10 * SIZE(Y) movaps %xmm1, -10 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -8 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 movaps %xmm0, -8 * SIZE(Y) movaps %xmm1, -8 * SIZE(X) movaps -6 * SIZE(X), %xmm0 movaps -6 * SIZE(Y), %xmm1 movaps %xmm0, -6 * SIZE(Y) movaps %xmm1, -6 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -4 * SIZE(X), %xmm0 movaps -4 * SIZE(Y), %xmm1 movaps %xmm0, -4 * SIZE(Y) movaps %xmm1, -4 * SIZE(X) movaps -2 * SIZE(X), %xmm0 movaps -2 * SIZE(Y), %xmm1 movaps %xmm0, -2 * SIZE(Y) movaps %xmm1, -2 * SIZE(X) subq $-16 * SIZE, Y subq $-16 * SIZE, X decq %rax jg .L11 ALIGN_3 .L13: testq $4, M jle .L14 movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -14 * SIZE(X), %xmm0 movaps -14 * SIZE(Y), %xmm1 movaps %xmm0, -14 * SIZE(Y) movaps %xmm1, -14 * SIZE(X) movaps -12 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 movaps %xmm0, -12 * SIZE(Y) movaps %xmm1, -12 * SIZE(X) movaps -10 * SIZE(X), %xmm0 movaps -10 * SIZE(Y), %xmm1 movaps %xmm0, -10 * SIZE(Y) movaps %xmm1, -10 * SIZE(X) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L14: testq $2, M jle .L15 movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -14 * SIZE(X), %xmm0 movaps -14 * SIZE(Y), %xmm1 movaps %xmm0, -14 * SIZE(Y) movaps %xmm1, -14 * SIZE(X) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L15: testq $1, M jle .L19 movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L19: xorq %rax,%rax RESTOREREGISTERS #ifdef WINDOWS_ABI popq %rbx #endif ret ALIGN_3 .L20: movhps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movlps %xmm1, -16 * SIZE(X) decq M jle .L29 movq M, %rax sarq $3, %rax jle .L23 ALIGN_4 .L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -15 * SIZE(X), %xmm2 movaps -14 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(X) movaps -13 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -14 * 
SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -13 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -11 * SIZE(X), %xmm2 movaps -10 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -12 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -11 * SIZE(X) movaps -9 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -10 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -9 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -7 * SIZE(X), %xmm2 movaps -6 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -8 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -7 * SIZE(X) movaps -5 * SIZE(X), %xmm0 movaps -4 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -6 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -5 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -3 * SIZE(X), %xmm2 movaps -2 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -4 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -3 * SIZE(X) movaps -1 * SIZE(X), %xmm0 movaps 0 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -2 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -1 * SIZE(X) subq $-16 * SIZE, X subq $-16 * SIZE, Y decq %rax jg .L21 ALIGN_3 .L23: testq $4, M jle .L24 movaps -15 * SIZE(X), %xmm2 movaps -14 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(X) movaps -13 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -13 * SIZE(X) movaps -11 * SIZE(X), %xmm2 movaps -10 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -12 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -11 * SIZE(X) movaps -9 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -10 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -9 * SIZE(X) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L24: testq $2, M jle .L25 movaps -15 * SIZE(X), %xmm2 movaps -14 * SIZE(Y), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(X) movaps -13 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(Y) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -13 * SIZE(X) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L25: testq $1, M jle .L29 movaps -15 * SIZE(X), %xmm2 movaps -14 * SIZE(Y), %xmm3 SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(X) SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L29: movaps -15 * SIZE(X), %xmm2 movhps %xmm1, -15 * SIZE(X) SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(Y) xorq %rax,%rax RESTOREREGISTERS #ifdef WINDOWS_ABI popq %rbx #endif ret ALIGN_3 .L30: testq $SIZE, X jne .L40 movhps -16 * SIZE(Y), %xmm0 movaps -16 * SIZE(X), %xmm1 movlps %xmm1, -16 * SIZE(Y) decq M jle .L39 movq M, %rax sarq $3, %rax jle .L33 ALIGN_4 .L31: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -15 * SIZE(Y), %xmm2 movaps -14 * SIZE(X), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(X) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(Y) movaps -13 * SIZE(Y), %xmm0 movaps -12 * SIZE(X), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(X) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -13 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -11 * SIZE(Y), %xmm2 movaps -10 * SIZE(X), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -12 * SIZE(X) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -11 
* SIZE(Y) movaps -9 * SIZE(Y), %xmm0 movaps -8 * SIZE(X), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -10 * SIZE(X) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -9 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -7 * SIZE(Y), %xmm2 movaps -6 * SIZE(X), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -8 * SIZE(X) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -7 * SIZE(Y) movaps -5 * SIZE(Y), %xmm0 movaps -4 * SIZE(X), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -6 * SIZE(X) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -5 * SIZE(Y) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -3 * SIZE(Y), %xmm2 movaps -2 * SIZE(X), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -4 * SIZE(X) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -3 * SIZE(Y) movaps -1 * SIZE(Y), %xmm0 movaps 0 * SIZE(X), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -2 * SIZE(X) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -1 * SIZE(Y) subq $-16 * SIZE, X subq $-16 * SIZE, Y decq %rax jg .L31 ALIGN_3 .L33: testq $4, M jle .L34 movaps -15 * SIZE(Y), %xmm2 movaps -14 * SIZE(X), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(X) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(Y) movaps -13 * SIZE(Y), %xmm0 movaps -12 * SIZE(X), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(X) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -13 * SIZE(Y) movaps -11 * SIZE(Y), %xmm2 movaps -10 * SIZE(X), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -12 * SIZE(X) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -11 * SIZE(Y) movaps -9 * SIZE(Y), %xmm0 movaps -8 * SIZE(X), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -10 * SIZE(X) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -9 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L34: testq $2, M jle .L35 movaps -15 * SIZE(Y), %xmm2 movaps -14 * SIZE(X), %xmm3 SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(X) SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(Y) movaps -13 * SIZE(Y), %xmm0 movaps -12 * SIZE(X), %xmm1 SHUFPD_1 %xmm0, %xmm2 movaps %xmm2, -14 * SIZE(X) SHUFPD_1 %xmm1, %xmm3 movaps %xmm3, -13 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L35: testq $1, M jle .L39 movaps -15 * SIZE(Y), %xmm2 movaps -14 * SIZE(X), %xmm3 SHUFPD_1 %xmm3, %xmm1 movaps %xmm1, -15 * SIZE(Y) SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(X) movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L39: movaps -15 * SIZE(Y), %xmm2 movhps %xmm1, -15 * SIZE(Y) SHUFPD_1 %xmm2, %xmm0 movaps %xmm0, -16 * SIZE(X) xorq %rax,%rax RESTOREREGISTERS #ifdef WINDOWS_ABI popq %rbx #endif ret ALIGN_3 .L40: movsd -16 * SIZE(X), %xmm0 movsd -16 * SIZE(Y), %xmm1 movlps %xmm0, -16 * SIZE(Y) movlps %xmm1, -16 * SIZE(X) addq $SIZE, X addq $SIZE, Y decq M jle .L49 movq M, %rax sarq $3, %rax jle .L43 ALIGN_3 .L41: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -14 * SIZE(X), %xmm0 movaps -14 * SIZE(Y), %xmm1 movaps %xmm0, -14 * SIZE(Y) movaps %xmm1, -14 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif movaps -12 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 movaps %xmm0, -12 * SIZE(Y) movaps %xmm1, -12 * SIZE(X) movaps -10 * SIZE(X), %xmm0 movaps -10 * SIZE(Y), %xmm1 movaps %xmm0, -10 * SIZE(Y) movaps %xmm1, -10 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) #endif movaps -8 * SIZE(X), %xmm0 movaps -8 * SIZE(Y), %xmm1 movaps %xmm0, -8 * SIZE(Y) movaps %xmm1, -8 * SIZE(X) movaps -6 * SIZE(X), %xmm0 movaps -6 * SIZE(Y), %xmm1 movaps %xmm0, -6 * 
SIZE(Y) movaps %xmm1, -6 * SIZE(X) #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif movaps -4 * SIZE(X), %xmm0 movaps -4 * SIZE(Y), %xmm1 movaps %xmm0, -4 * SIZE(Y) movaps %xmm1, -4 * SIZE(X) movaps -2 * SIZE(X), %xmm0 movaps -2 * SIZE(Y), %xmm1 movaps %xmm0, -2 * SIZE(Y) movaps %xmm1, -2 * SIZE(X) subq $-16 * SIZE, Y subq $-16 * SIZE, X decq %rax jg .L41 ALIGN_3 .L43: testq $4, M jle .L44 movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -14 * SIZE(X), %xmm0 movaps -14 * SIZE(Y), %xmm1 movaps %xmm0, -14 * SIZE(Y) movaps %xmm1, -14 * SIZE(X) movaps -12 * SIZE(X), %xmm0 movaps -12 * SIZE(Y), %xmm1 movaps %xmm0, -12 * SIZE(Y) movaps %xmm1, -12 * SIZE(X) movaps -10 * SIZE(X), %xmm0 movaps -10 * SIZE(Y), %xmm1 movaps %xmm0, -10 * SIZE(Y) movaps %xmm1, -10 * SIZE(X) addq $8 * SIZE, X addq $8 * SIZE, Y ALIGN_3 .L44: testq $2, M jle .L45 movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) movaps -14 * SIZE(X), %xmm0 movaps -14 * SIZE(Y), %xmm1 movaps %xmm0, -14 * SIZE(Y) movaps %xmm1, -14 * SIZE(X) addq $4 * SIZE, X addq $4 * SIZE, Y ALIGN_3 .L45: testq $1, M jle .L49 movaps -16 * SIZE(X), %xmm0 movaps -16 * SIZE(Y), %xmm1 movaps %xmm0, -16 * SIZE(Y) movaps %xmm1, -16 * SIZE(X) addq $2 * SIZE, X addq $2 * SIZE, Y ALIGN_3 .L49: movsd -16 * SIZE(X), %xmm0 movsd -16 * SIZE(Y), %xmm1 movlps %xmm0, -16 * SIZE(Y) movlps %xmm1, -16 * SIZE(X) xorq %rax,%rax RESTOREREGISTERS #ifdef WINDOWS_ABI popq %rbx #endif ret ALIGN_3 .L50: testq $SIZE, X jne .L60 testq $SIZE, Y jne .L60 movq M, %rax sarq $2, %rax jle .L55 ALIGN_3 .L51: movaps (X), %xmm0 movaps (Y), %xmm1 movaps %xmm1, (X) addq INCX, X movaps %xmm0, (Y) addq INCY, Y movaps (X), %xmm0 movaps (Y), %xmm1 movaps %xmm1, (X) addq INCX, X movaps %xmm0, (Y) addq INCY, Y movaps (X), %xmm0 movaps (Y), %xmm1 movaps %xmm1, (X) addq INCX, X movaps %xmm0, (Y) addq INCY, Y movaps (X), %xmm0 movaps (Y), %xmm1 movaps %xmm1, (X) addq INCX, X movaps %xmm0, (Y) addq INCY, Y decq %rax jg .L51 ALIGN_3 .L55: movq M, %rax andq $3, %rax jle .L57 ALIGN_3 .L56: movaps (X), %xmm0 movaps (Y), %xmm1 movaps %xmm1, (X) addq INCX, X movaps %xmm0, (Y) addq INCY, Y decq %rax jg .L56 ALIGN_3 .L57: xorq %rax, %rax RESTOREREGISTERS #ifdef WINDOWS_ABI popq %rbx #endif ret ALIGN_3 .L60: movq M, %rax sarq $2, %rax jle .L65 ALIGN_3 .L61: movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 movsd 0 * SIZE(Y), %xmm1 movhps 1 * SIZE(Y), %xmm1 movlps %xmm1, 0 * SIZE(X) movhps %xmm1, 1 * SIZE(X) addq INCX, X movlps %xmm0, 0 * SIZE(Y) movhps %xmm0, 1 * SIZE(Y) addq INCY, Y movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 movsd 0 * SIZE(Y), %xmm1 movhps 1 * SIZE(Y), %xmm1 movlps %xmm1, 0 * SIZE(X) movhps %xmm1, 1 * SIZE(X) addq INCX, X movlps %xmm0, 0 * SIZE(Y) movhps %xmm0, 1 * SIZE(Y) addq INCY, Y movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 movsd 0 * SIZE(Y), %xmm1 movhps 1 * SIZE(Y), %xmm1 movlps %xmm1, 0 * SIZE(X) movhps %xmm1, 1 * SIZE(X) addq INCX, X movlps %xmm0, 0 * SIZE(Y) movhps %xmm0, 1 * SIZE(Y) addq INCY, Y movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 movsd 0 * SIZE(Y), %xmm1 movhps 1 * SIZE(Y), %xmm1 movlps %xmm1, 0 * SIZE(X) movhps %xmm1, 1 * SIZE(X) addq INCX, X movlps %xmm0, 0 * SIZE(Y) movhps %xmm0, 1 * SIZE(Y) addq INCY, Y decq %rax jg .L61 ALIGN_3 .L65: movq M, %rax andq $3, %rax jle .L67 ALIGN_3 .L66: movsd 0 * SIZE(X), %xmm0 movhps 1 * SIZE(X), %xmm0 movsd 0 * SIZE(Y), %xmm1 movhps 1 * SIZE(Y), %xmm1 movlps %xmm1, 0 * 
SIZE(X) movhps %xmm1, 1 * SIZE(X) addq INCX, X movlps %xmm0, 0 * SIZE(Y) movhps %xmm0, 1 * SIZE(Y) addq INCY, Y decq %rax jg .L66 ALIGN_3 .L67: xorq %rax, %rax RESTOREREGISTERS #ifdef WINDOWS_ABI popq %rbx #endif ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zsymv_L_sse.S000066400000000000000000000401261313527062700200260ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef ATOM #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) #endif #ifdef CORE2 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) #endif #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) #endif #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) #endif #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 28) #endif #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 12) #define movsd movlpd #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) #endif #ifndef WINDOWS_ABI #define STACKSIZE 80 #define OLD_Y 8 + STACKSIZE(%rsp) #define OLD_INCY 16 + STACKSIZE(%rsp) #define OLD_BUFFER 24 + STACKSIZE(%rsp) #define M ARG1 #define N ARG2 #define A ARG3 #define LDA ARG4 #define X ARG5 #define INCX ARG6 #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_LDA 48 + STACKSIZE(%rsp) #define OLD_X 56 + STACKSIZE(%rsp) #define OLD_INCX 64 + STACKSIZE(%rsp) #define OLD_Y 72 + STACKSIZE(%rsp) #define OLD_INCY 80 + STACKSIZE(%rsp) #define OLD_BUFFER 88 + STACKSIZE(%rsp) #define M ARG1 #define N ARG2 #define A ARG4 #define LDA ARG3 #define X %rdi #define INCX %rsi #endif #define Y %r10 #define INCY %r11 #define BUFFER %r12 #define TEMP %rax #define I %rax #define A1 %rbx #define A2 %rbp #define XX %r13 #define YY %r14 #define IS %r15 #define NEW_X BUFFER #define NEW_Y X #define ALPHA_R %xmm0 #define ALPHA_I %xmm1 #define xtemp1 %xmm0 #define xtemp2 %xmm1 #define xtemp3 %xmm2 #define xtemp4 %xmm3 #define atemp1 %xmm4 #define atemp2 %xmm5 #define atemp3 %xmm6 #define atemp4 %xmm7 #define xsum1 %xmm8 #define xsum2 %xmm9 #define yy1 %xmm10 #define yy2 %xmm11 #define a1 %xmm12 #define a2 %xmm13 #define a3 %xmm14 #define xt1 %xmm15 #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else #define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c #define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) movq OLD_Y, Y movq OLD_INCY, INCY movq OLD_BUFFER, BUFFER salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY salq $ZBASE_SHIFT, LDA testq M, M jle .L999 pcmpeqb %xmm2, %xmm2 xorpd %xmm3, %xmm3 psllq $63, %xmm2 unpcklpd %xmm3, %xmm2 unpcklpd ALPHA_I, ALPHA_R unpcklpd ALPHA_R, ALPHA_I xorpd %xmm2, ALPHA_I movq BUFFER, XX movq M, %rax sarq $2, %rax jle .L02 ALIGN_3 .L01: MOVDDUP(0 * SIZE, X, %xmm3) MOVDDUP(1 * SIZE, X, %xmm4) addq INCX, X MOVDDUP(0 * SIZE, X, %xmm5) MOVDDUP(1 * SIZE, X, %xmm6) addq INCX, X mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm4 mulpd ALPHA_R, %xmm5 mulpd ALPHA_I, %xmm6 addpd %xmm4, %xmm3 addpd %xmm6, %xmm5 movapd %xmm3, 0 * SIZE(XX) SHUFPD_1 %xmm3, %xmm3 pxor %xmm2, %xmm3 movapd %xmm3, 2 * SIZE(XX) movapd %xmm5, 4 * 
SIZE(XX) SHUFPD_1 %xmm5, %xmm5 pxor %xmm2, %xmm5 movapd %xmm5, 6 * SIZE(XX) MOVDDUP(0 * SIZE, X, %xmm3) MOVDDUP(1 * SIZE, X, %xmm4) addq INCX, X MOVDDUP(0 * SIZE, X, %xmm5) MOVDDUP(1 * SIZE, X, %xmm6) addq INCX, X mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm4 mulpd ALPHA_R, %xmm5 mulpd ALPHA_I, %xmm6 addpd %xmm4, %xmm3 addpd %xmm6, %xmm5 movapd %xmm3, 8 * SIZE(XX) SHUFPD_1 %xmm3, %xmm3 pxor %xmm2, %xmm3 movapd %xmm3, 10 * SIZE(XX) movapd %xmm5, 12 * SIZE(XX) SHUFPD_1 %xmm5, %xmm5 pxor %xmm2, %xmm5 movapd %xmm5, 14 * SIZE(XX) subq $-16 * SIZE, XX decq %rax jg .L01 ALIGN_3 .L02: movq M, %rax andq $3, %rax jle .L05 ALIGN_3 .L03: MOVDDUP(0 * SIZE, X, %xmm3) MOVDDUP(1 * SIZE, X, %xmm4) addq INCX, X mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm4 addpd %xmm4, %xmm3 movapd %xmm3, 0 * SIZE(XX) SHUFPD_1 %xmm3, %xmm3 pxor %xmm2, %xmm3 movapd %xmm3, 2 * SIZE(XX) addq $4 * SIZE, XX decq %rax jg .L03 ALIGN_3 .L05: /* now we don't need original X */ movq Y, NEW_Y addq $512, XX andq $-512, XX cmpq $2 * SIZE, INCY je .L10 movq Y, YY movq XX, NEW_Y movq M, %rax sarq $2, %rax jle .L07 ALIGN_3 .L06: movsd 0 * SIZE(YY), %xmm0 movhpd 1 * SIZE(YY), %xmm0 addq INCY, YY movsd 0 * SIZE(YY), %xmm1 movhpd 1 * SIZE(YY), %xmm1 addq INCY, YY movsd 0 * SIZE(YY), %xmm2 movhpd 1 * SIZE(YY), %xmm2 addq INCY, YY movsd 0 * SIZE(YY), %xmm3 movhpd 1 * SIZE(YY), %xmm3 addq INCY, YY movapd %xmm0, 0 * SIZE(XX) movapd %xmm1, 2 * SIZE(XX) movapd %xmm2, 4 * SIZE(XX) movapd %xmm3, 6 * SIZE(XX) addq $8 * SIZE, XX decq %rax jg .L06 ALIGN_3 .L07: movq M, %rax andq $3, %rax jle .L10 ALIGN_3 .L08: movsd 0 * SIZE(YY), %xmm0 movhpd 1 * SIZE(YY), %xmm0 addq INCY, YY movapd %xmm0, 0 * SIZE(XX) addq $2 * SIZE, XX decq %rax jg .L08 ALIGN_3 .L10: xorq IS, IS # is = 0 cmpq $2, N jl .L20 ALIGN_3 .L11: movq A, A1 leaq (A, LDA, 1), A2 leaq 4 * SIZE(A, LDA, 2), A leaq (, IS, SIZE), I leaq 0 * SIZE(NEW_X, I, 4), XX leaq 4 * SIZE(NEW_Y, I, 2), YY movapd 0 * SIZE(XX), atemp1 movapd 2 * SIZE(XX), atemp2 movapd 4 * SIZE(XX), atemp3 movapd 6 * SIZE(XX), atemp4 MOVDDUP(0 * SIZE, A1, xsum1) MOVDDUP(2 * SIZE, A1, xsum2) mulpd atemp1, xsum1 mulpd atemp1, xsum2 MOVDDUP(1 * SIZE, A1, a1) MOVDDUP(3 * SIZE, A1, a2) mulpd atemp2, a1 mulpd atemp2, a2 addpd a1, xsum1 addpd a2, xsum2 MOVDDUP(2 * SIZE, A1, a1) MOVDDUP(2 * SIZE, A2, a2) mulpd atemp3, a1 mulpd atemp3, a2 addpd a1, xsum1 addpd a2, xsum2 MOVDDUP(3 * SIZE, A1, a1) MOVDDUP(3 * SIZE, A2, a2) mulpd atemp4, a1 mulpd atemp4, a2 addpd a1, xsum1 addpd a2, xsum2 MOVDDUP(4 * SIZE, A1, a1) MOVDDUP(6 * SIZE, A2, a2) movsd 0 * SIZE(YY), yy1 movhpd 1 * SIZE(YY), yy1 movsd 2 * SIZE(YY), yy2 movhpd 3 * SIZE(YY), yy2 movapd 8 * SIZE(XX), xtemp1 movapd 10 * SIZE(XX), xtemp2 movapd 12 * SIZE(XX), xtemp3 movapd 14 * SIZE(XX), xtemp4 addq $8 * SIZE, XX addq $4 * SIZE, A1 addq $4 * SIZE, A2 movq M, I subq IS, I subq $2, I sarq $2, I jle .L15 ALIGN_3 .L12: movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy1 MOVDDUP(1 * SIZE, A1, a1) PREFETCH PREFETCHSIZE(A1) movapd xtemp3, xt1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum2 addpd a2, yy2 MOVDDUP(3 * SIZE, A2, a2) movapd xtemp2, xt1 mulpd a1, xt1 mulpd atemp2, a1 addpd xt1, xsum1 addpd a1, yy1 MOVDDUP(2 * SIZE, A1, a1) movapd xtemp4, xt1 mulpd a2, xt1 mulpd atemp4, a2 addpd xt1, xsum2 addpd a2, yy2 MOVDDUP(0 * SIZE, A2, a2) PREFETCH PREFETCHSIZE(XX) movapd xtemp3, xt1 movapd 12 * SIZE(XX), xtemp3 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy2 MOVDDUP(3 * SIZE, A1, a1) movapd xtemp1, xt1 movapd 8 * SIZE(XX), xtemp1 mulpd a2, xt1 mulpd atemp3, a2 
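/* annotation (editorial reading of the loop, not an original comment): in this
   unrolled body each element of A brought in with MOVDDUP is consumed twice -
   once against the buffered, alpha-scaled x held in xtemp1..xtemp4 (through xt1)
   and accumulated into xsum1/xsum2 for the dot-product half, and once against the
   column factors in atemp1..atemp4, whose products are folded into yy1/yy2 for the
   y-update half of the symmetric access. */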
addpd xt1, xsum2 addpd a2, yy1 MOVDDUP(1 * SIZE, A2, a2) movapd xtemp4, xt1 movapd 14 * SIZE(XX), xtemp4 mulpd a1, xt1 mulpd atemp2, a1 addpd xt1, xsum1 addpd a1, yy2 MOVDDUP(4 * SIZE, A1, a1) movlpd yy2, 2 * SIZE(YY) movhpd yy2, 3 * SIZE(YY) movsd 6 * SIZE(YY), yy2 movhpd 7 * SIZE(YY), yy2 movapd xtemp2, xt1 movapd 10 * SIZE(XX), xtemp2 mulpd a2, xt1 mulpd atemp4, a2 addpd xt1, xsum2 addpd a2, yy1 MOVDDUP(6 * SIZE, A2, a2) PREFETCH PREFETCHSIZE(A2) movlpd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhpd 5 * SIZE(YY), yy1 movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy1 MOVDDUP(5 * SIZE, A1, a1) movapd xtemp3, xt1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum2 addpd a2, yy2 MOVDDUP(7 * SIZE, A2, a2) movapd xtemp2, xt1 mulpd a1, xt1 mulpd atemp2, a1 addpd xt1, xsum1 addpd a1, yy1 MOVDDUP(6 * SIZE, A1, a1) PREFETCHW PREFETCHSIZE(YY) movapd xtemp4, xt1 mulpd a2, xt1 mulpd atemp4, a2 addpd xt1, xsum2 addpd a2, yy2 MOVDDUP(4 * SIZE, A2, a2) movapd xtemp3, xt1 movapd 20 * SIZE(XX), xtemp3 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy2 MOVDDUP(7 * SIZE, A1, a1) movapd xtemp1, xt1 movapd 16 * SIZE(XX), xtemp1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum2 addpd a2, yy1 MOVDDUP(5 * SIZE, A2, a2) movapd xtemp4, xt1 movapd 22 * SIZE(XX), xtemp4 mulpd a1, xt1 mulpd atemp2, a1 addpd xt1, xsum1 addpd a1, yy2 MOVDDUP( 8 * SIZE, A1, a1) movlpd yy2, 6 * SIZE(YY) movhpd yy2, 7 * SIZE(YY) movsd 10 * SIZE(YY), yy2 movhpd 11 * SIZE(YY), yy2 movapd xtemp2, xt1 movapd 18 * SIZE(XX), xtemp2 mulpd a2, xt1 mulpd atemp4, a2 addpd xt1, xsum2 addpd a2, yy1 MOVDDUP(10 * SIZE, A2, a2) movlpd yy1, 4 * SIZE(YY) movhpd yy1, 5 * SIZE(YY) movsd 8 * SIZE(YY), yy1 movhpd 9 * SIZE(YY), yy1 subq $-16 * SIZE, XX addq $ 8 * SIZE, YY addq $ 8 * SIZE, A1 addq $ 8 * SIZE, A2 decq I jg .L12 ALIGN_3 .L15: movq M, I subq IS, I subq $2, I testq $2, I jle .L16 movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy1 MOVDDUP(1 * SIZE, A1, a1) movapd xtemp3, xt1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum2 addpd a2, yy2 MOVDDUP(3 * SIZE, A2, a2) movapd xtemp2, xt1 mulpd a1, xt1 mulpd atemp2, a1 addpd xt1, xsum1 addpd a1, yy1 MOVDDUP(2 * SIZE, A1, a1) movapd xtemp4, xt1 mulpd a2, xt1 mulpd atemp4, a2 addpd xt1, xsum2 addpd a2, yy2 MOVDDUP(0 * SIZE, A2, a2) movapd xtemp3, xt1 movapd 12 * SIZE(XX), xtemp3 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy2 MOVDDUP(3 * SIZE, A1, a1) movapd xtemp1, xt1 movapd 8 * SIZE(XX), xtemp1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum2 addpd a2, yy1 MOVDDUP(1 * SIZE, A2, a2) movapd xtemp4, xt1 movapd 14 * SIZE(XX), xtemp4 mulpd a1, xt1 mulpd atemp2, a1 addpd xt1, xsum1 addpd a1, yy2 MOVDDUP(4 * SIZE, A1, a1) movlpd yy2, 2 * SIZE(YY) movhpd yy2, 3 * SIZE(YY) movsd 6 * SIZE(YY), yy2 movhpd 7 * SIZE(YY), yy2 movapd xtemp2, xt1 movapd 10 * SIZE(XX), xtemp2 mulpd a2, xt1 mulpd atemp4, a2 addpd xt1, xsum2 addpd a2, yy1 movlpd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhpd 5 * SIZE(YY), yy1 addq $4 * SIZE, YY addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L16: testq $1, M jle .L18 MOVDDUP(1 * SIZE, A1, a2) movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy1 MOVDDUP(0 * SIZE, A2, a1) movapd xtemp2, xt1 mulpd a2, xt1 mulpd atemp2, a2 addpd xt1, xsum1 addpd a2, yy1 MOVDDUP(1 * SIZE, A2, a2) movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp3, a1 addpd xt1, xsum2 addpd a1, yy1 movapd xtemp2, xt1 mulpd a2, xt1 mulpd atemp4, a2 addpd xt1, xsum2 addpd a2, yy1 
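/* annotation (editorial): the .L16 block above handles the single leftover row
   when the remaining row count is odd; the accumulated yy1 is written back to the
   working y buffer immediately below. */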
movlpd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) ALIGN_3 .L18: leaq (, IS, SIZE), I movsd 0 * SIZE(NEW_Y, I, 2), yy1 movhpd 1 * SIZE(NEW_Y, I, 2), yy1 movsd 2 * SIZE(NEW_Y, I, 2), yy2 movhpd 3 * SIZE(NEW_Y, I, 2), yy2 addpd xsum1, yy1 addpd xsum2, yy2 movlpd yy1, 0 * SIZE(NEW_Y, I, 2) movhpd yy1, 1 * SIZE(NEW_Y, I, 2) movlpd yy2, 2 * SIZE(NEW_Y, I, 2) movhpd yy2, 3 * SIZE(NEW_Y, I, 2) addq $2, IS movq IS, I addq $2, I cmpq M, I jle .L11 ALIGN_3 .L20: HALT testq $1, N jle .L990 leaq (, IS, SIZE), I movapd 0 * SIZE(NEW_X, I, 4), atemp1 movapd 2 * SIZE(NEW_X, I, 4), atemp2 movsd 0 * SIZE(NEW_Y, I, 2), yy1 movhpd 1 * SIZE(NEW_Y, I, 2), yy1 MOVDDUP(0 * SIZE, A, a1) MOVDDUP(1 * SIZE, A, a2) mulpd atemp1, a1 mulpd atemp2, a2 addpd a1, yy1 addpd a2, yy1 movlpd yy1, 0 * SIZE(NEW_Y, I, 2) movhpd yy1, 1 * SIZE(NEW_Y, I, 2) ALIGN_3 .L990: cmpq $2 * SIZE, INCY je .L999 movq M, %rax sarq $2, %rax jle .L997 ALIGN_3 .L996: movapd 0 * SIZE(NEW_Y), %xmm0 movapd 2 * SIZE(NEW_Y), %xmm1 movapd 4 * SIZE(NEW_Y), %xmm2 movapd 6 * SIZE(NEW_Y), %xmm3 movsd %xmm0, 0 * SIZE(Y) movhpd %xmm0, 1 * SIZE(Y) addq INCY, Y movsd %xmm1, 0 * SIZE(Y) movhpd %xmm1, 1 * SIZE(Y) addq INCY, Y movsd %xmm2, 0 * SIZE(Y) movhpd %xmm2, 1 * SIZE(Y) addq INCY, Y movsd %xmm3, 0 * SIZE(Y) movhpd %xmm3, 1 * SIZE(Y) addq INCY, Y addq $8 * SIZE, NEW_Y decq %rax jg .L996 ALIGN_3 .L997: movq M, %rax andq $3, %rax jle .L999 ALIGN_3 .L998: movapd 0 * SIZE(NEW_Y), %xmm0 movsd %xmm0, 0 * SIZE(Y) movhpd %xmm0, 1 * SIZE(Y) addq INCY, Y addq $2 * SIZE, NEW_Y decq %rax jg .L998 ALIGN_3 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zsymv_L_sse2.S000066400000000000000000000425231313527062700201130ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
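/* note (editorial): xmm6 through xmm15 are non-volatile (callee-saved) under the
   Windows x64 calling convention, so they are spilled to the stack here and
   restored again just before the epilogue's ret. */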
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef ATOM #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) #endif #ifdef CORE2 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) #endif #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) #endif #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) #endif #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 28) #endif #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 12) #define movsd movlpd #endif #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 24) #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) #endif #ifndef WINDOWS_ABI #define STACKSIZE 80 #define OLD_Y 8 + STACKSIZE(%rsp) #define OLD_INCY 16 + STACKSIZE(%rsp) #define OLD_BUFFER 24 + STACKSIZE(%rsp) #define M ARG1 #define N ARG2 #define A ARG3 #define LDA ARG4 #define X ARG5 #define INCX ARG6 #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_LDA 48 + STACKSIZE(%rsp) #define OLD_X 56 + STACKSIZE(%rsp) #define OLD_INCX 64 + STACKSIZE(%rsp) #define OLD_Y 72 + STACKSIZE(%rsp) #define OLD_INCY 80 + STACKSIZE(%rsp) #define OLD_BUFFER 88 + STACKSIZE(%rsp) #define M ARG1 #define N ARG2 #define A ARG4 #define LDA ARG3 #define X %rdi #define INCX %rsi #endif #define Y %r10 #define INCY %r11 #define BUFFER %r12 #define TEMP %rax #define I %rax #define A1 %rbx #define A2 %rbp #define XX %r13 #define YY %r14 #define IS %r15 #define NEW_X BUFFER #define NEW_Y X #define ALPHA_R %xmm0 #define ALPHA_I %xmm1 #define xtemp1 %xmm0 #define xtemp2 %xmm1 #define xtemp3 %xmm2 #define xtemp4 %xmm3 #define atemp1 %xmm4 #define atemp2 %xmm5 #define atemp3 %xmm6 #define atemp4 %xmm7 #define xsum1 %xmm8 #define xsum2 %xmm9 #define yy1 %xmm10 #define yy2 %xmm11 #define a1 %xmm12 #define a2 %xmm13 #define a3 %xmm14 #define xt1 %xmm15 #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else #define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c #define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c #endif #ifndef HEMV #define ADD addpd #else #define ADD subpd #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) 
movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq OLD_A, A movq OLD_LDA, LDA movq OLD_X, X movq OLD_INCX, INCX movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 #endif movq OLD_Y, Y movq OLD_INCY, INCY movq OLD_BUFFER, BUFFER salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY salq $ZBASE_SHIFT, LDA testq M, M jle .L999 pcmpeqb %xmm2, %xmm2 xorpd %xmm3, %xmm3 psllq $63, %xmm2 unpcklpd %xmm3, %xmm2 unpcklpd ALPHA_I, ALPHA_R unpcklpd ALPHA_R, ALPHA_I xorpd %xmm2, ALPHA_I movq BUFFER, XX movq M, %rax sarq $2, %rax jle .L02 ALIGN_3 .L01: MOVDDUP(0 * SIZE, X, %xmm3) MOVDDUP(1 * SIZE, X, %xmm4) addq INCX, X MOVDDUP(0 * SIZE, X, %xmm5) MOVDDUP(1 * SIZE, X, %xmm6) addq INCX, X mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm4 mulpd ALPHA_R, %xmm5 mulpd ALPHA_I, %xmm6 addpd %xmm4, %xmm3 addpd %xmm6, %xmm5 movapd %xmm3, 0 * SIZE(XX) SHUFPD_1 %xmm3, %xmm3 pxor %xmm2, %xmm3 movapd %xmm3, 2 * SIZE(XX) movapd %xmm5, 4 * SIZE(XX) SHUFPD_1 %xmm5, %xmm5 pxor %xmm2, %xmm5 movapd %xmm5, 6 * SIZE(XX) MOVDDUP(0 * SIZE, X, %xmm3) MOVDDUP(1 * SIZE, X, %xmm4) addq INCX, X MOVDDUP(0 * SIZE, X, %xmm5) MOVDDUP(1 * SIZE, X, %xmm6) addq INCX, X mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm4 mulpd ALPHA_R, %xmm5 mulpd ALPHA_I, %xmm6 addpd %xmm4, %xmm3 addpd %xmm6, %xmm5 movapd %xmm3, 8 * SIZE(XX) SHUFPD_1 %xmm3, %xmm3 pxor %xmm2, %xmm3 movapd %xmm3, 10 * SIZE(XX) movapd %xmm5, 12 * SIZE(XX) SHUFPD_1 %xmm5, %xmm5 pxor %xmm2, %xmm5 movapd %xmm5, 14 * SIZE(XX) subq $-16 * SIZE, XX decq %rax jg .L01 ALIGN_3 .L02: movq M, %rax andq $3, %rax jle .L05 ALIGN_3 .L03: MOVDDUP(0 * SIZE, X, %xmm3) MOVDDUP(1 * SIZE, X, %xmm4) addq INCX, X mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm4 addpd %xmm4, %xmm3 movapd %xmm3, 0 * SIZE(XX) SHUFPD_1 %xmm3, %xmm3 pxor %xmm2, %xmm3 movapd %xmm3, 2 * SIZE(XX) addq $4 * SIZE, XX decq %rax jg .L03 ALIGN_3 .L05: /* now we don't need original X */ movq Y, NEW_Y addq $512, XX andq $-512, XX cmpq $2 * SIZE, INCY je .L10 movq Y, YY movq XX, NEW_Y movq M, %rax sarq $2, %rax jle .L07 ALIGN_3 .L06: movsd 0 * SIZE(YY), %xmm0 movhpd 1 * SIZE(YY), %xmm0 addq INCY, YY movsd 0 * SIZE(YY), %xmm1 movhpd 1 * SIZE(YY), %xmm1 addq INCY, YY movsd 0 * SIZE(YY), %xmm2 movhpd 1 * SIZE(YY), %xmm2 addq INCY, YY movsd 0 * SIZE(YY), %xmm3 movhpd 1 * SIZE(YY), %xmm3 addq INCY, YY movapd %xmm0, 0 * SIZE(XX) movapd %xmm1, 2 * SIZE(XX) movapd %xmm2, 4 * SIZE(XX) movapd %xmm3, 6 * SIZE(XX) addq $8 * SIZE, XX decq %rax jg .L06 ALIGN_3 .L07: movq M, %rax andq $3, %rax jle .L10 ALIGN_3 .L08: movsd 0 * SIZE(YY), %xmm0 movhpd 1 * SIZE(YY), %xmm0 addq INCY, YY movapd %xmm0, 0 * SIZE(XX) addq $2 * SIZE, XX decq %rax jg .L08 ALIGN_3 .L10: xorq IS, IS # is = 0 cmpq $2, N jl .L20 ALIGN_3 .L11: movq A, A1 leaq (A, LDA, 1), A2 leaq 4 * SIZE(A, LDA, 2), A leaq (, IS, SIZE), I leaq 0 * SIZE(NEW_X, I, 4), XX leaq 4 * SIZE(NEW_Y, I, 2), YY movapd 0 * SIZE(XX), atemp1 movapd 2 * SIZE(XX), atemp2 movapd 4 * SIZE(XX), atemp3 movapd 6 * SIZE(XX), atemp4 MOVDDUP(0 * SIZE, A1, xsum1) MOVDDUP(2 * SIZE, A1, xsum2) mulpd atemp1, xsum1 mulpd atemp1, xsum2 #ifndef HEMV MOVDDUP(1 * SIZE, A1, a1) MOVDDUP(3 * SIZE, A1, a2) mulpd atemp2, a1 mulpd atemp2, a2 addpd a1, xsum1 addpd a2, xsum2 #else MOVDDUP(3 * SIZE, A1, a2) mulpd atemp2, a2 addpd a2, xsum2 #endif MOVDDUP(2 * SIZE, A1, a1) MOVDDUP(2 * SIZE, A2, a2) mulpd atemp3, a1 mulpd atemp3, a2 addpd a1, xsum1 addpd a2, xsum2 #ifndef HEMV MOVDDUP(3 * SIZE, A1, a1) MOVDDUP(3 * SIZE, A2, a2) mulpd atemp4, a1 mulpd atemp4, a2 addpd a1, xsum1 addpd a2, xsum2 #else MOVDDUP(3 * SIZE, A1, a1) mulpd atemp4, a1 subpd a1, xsum1 #endif MOVDDUP(4 
* SIZE, A1, a1) MOVDDUP(6 * SIZE, A2, a2) movsd 0 * SIZE(YY), yy1 movhpd 1 * SIZE(YY), yy1 movsd 2 * SIZE(YY), yy2 movhpd 3 * SIZE(YY), yy2 movapd 8 * SIZE(XX), xtemp1 movapd 10 * SIZE(XX), xtemp2 movapd 12 * SIZE(XX), xtemp3 movapd 14 * SIZE(XX), xtemp4 addq $8 * SIZE, XX addq $4 * SIZE, A1 addq $4 * SIZE, A2 movq M, I subq IS, I subq $2, I sarq $2, I jle .L15 ALIGN_3 .L12: movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy1 MOVDDUP(1 * SIZE, A1, a1) PREFETCH PREFETCHSIZE(A1) movapd xtemp3, xt1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum2 addpd a2, yy2 MOVDDUP(3 * SIZE, A2, a2) movapd xtemp2, xt1 mulpd a1, xt1 mulpd atemp2, a1 ADD xt1, xsum1 addpd a1, yy1 MOVDDUP(2 * SIZE, A1, a1) movapd xtemp4, xt1 mulpd a2, xt1 mulpd atemp4, a2 ADD xt1, xsum2 addpd a2, yy2 MOVDDUP(0 * SIZE, A2, a2) PREFETCH PREFETCHSIZE(XX) movapd xtemp3, xt1 movapd 12 * SIZE(XX), xtemp3 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy2 MOVDDUP(3 * SIZE, A1, a1) movapd xtemp1, xt1 movapd 8 * SIZE(XX), xtemp1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum2 addpd a2, yy1 MOVDDUP(1 * SIZE, A2, a2) movapd xtemp4, xt1 movapd 14 * SIZE(XX), xtemp4 mulpd a1, xt1 mulpd atemp2, a1 ADD xt1, xsum1 addpd a1, yy2 MOVDDUP(4 * SIZE, A1, a1) movlpd yy2, 2 * SIZE(YY) movhpd yy2, 3 * SIZE(YY) movsd 6 * SIZE(YY), yy2 movhpd 7 * SIZE(YY), yy2 movapd xtemp2, xt1 movapd 10 * SIZE(XX), xtemp2 mulpd a2, xt1 mulpd atemp4, a2 ADD xt1, xsum2 addpd a2, yy1 MOVDDUP(6 * SIZE, A2, a2) PREFETCH PREFETCHSIZE(A2) movlpd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhpd 5 * SIZE(YY), yy1 movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy1 MOVDDUP(5 * SIZE, A1, a1) movapd xtemp3, xt1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum2 addpd a2, yy2 MOVDDUP(7 * SIZE, A2, a2) movapd xtemp2, xt1 mulpd a1, xt1 mulpd atemp2, a1 ADD xt1, xsum1 addpd a1, yy1 MOVDDUP(6 * SIZE, A1, a1) PREFETCHW PREFETCHSIZE(YY) movapd xtemp4, xt1 mulpd a2, xt1 mulpd atemp4, a2 ADD xt1, xsum2 addpd a2, yy2 MOVDDUP(4 * SIZE, A2, a2) movapd xtemp3, xt1 movapd 20 * SIZE(XX), xtemp3 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy2 MOVDDUP(7 * SIZE, A1, a1) movapd xtemp1, xt1 movapd 16 * SIZE(XX), xtemp1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum2 addpd a2, yy1 MOVDDUP(5 * SIZE, A2, a2) movapd xtemp4, xt1 movapd 22 * SIZE(XX), xtemp4 mulpd a1, xt1 mulpd atemp2, a1 ADD xt1, xsum1 addpd a1, yy2 MOVDDUP( 8 * SIZE, A1, a1) movlpd yy2, 6 * SIZE(YY) movhpd yy2, 7 * SIZE(YY) movsd 10 * SIZE(YY), yy2 movhpd 11 * SIZE(YY), yy2 movapd xtemp2, xt1 movapd 18 * SIZE(XX), xtemp2 mulpd a2, xt1 mulpd atemp4, a2 ADD xt1, xsum2 addpd a2, yy1 MOVDDUP(10 * SIZE, A2, a2) movlpd yy1, 4 * SIZE(YY) movhpd yy1, 5 * SIZE(YY) movsd 8 * SIZE(YY), yy1 movhpd 9 * SIZE(YY), yy1 subq $-16 * SIZE, XX addq $ 8 * SIZE, YY addq $ 8 * SIZE, A1 addq $ 8 * SIZE, A2 decq I jg .L12 ALIGN_3 .L15: movq M, I subq IS, I subq $2, I testq $2, I jle .L16 movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy1 MOVDDUP(1 * SIZE, A1, a1) movapd xtemp3, xt1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum2 addpd a2, yy2 MOVDDUP(3 * SIZE, A2, a2) movapd xtemp2, xt1 mulpd a1, xt1 mulpd atemp2, a1 ADD xt1, xsum1 addpd a1, yy1 MOVDDUP(2 * SIZE, A1, a1) movapd xtemp4, xt1 mulpd a2, xt1 mulpd atemp4, a2 ADD xt1, xsum2 addpd a2, yy2 MOVDDUP(0 * SIZE, A2, a2) movapd xtemp3, xt1 movapd 12 * SIZE(XX), xtemp3 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy2 MOVDDUP(3 * SIZE, A1, a1) movapd xtemp1, xt1 
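/* annotation (editorial): xtemp1 has just been copied into xt1, so the next
   instruction is free to refill xtemp1 from the x buffer while the copy in xt1
   is still being consumed. */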
movapd 8 * SIZE(XX), xtemp1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum2 addpd a2, yy1 MOVDDUP(1 * SIZE, A2, a2) movapd xtemp4, xt1 movapd 14 * SIZE(XX), xtemp4 mulpd a1, xt1 mulpd atemp2, a1 ADD xt1, xsum1 addpd a1, yy2 MOVDDUP(4 * SIZE, A1, a1) movlpd yy2, 2 * SIZE(YY) movhpd yy2, 3 * SIZE(YY) movsd 6 * SIZE(YY), yy2 movhpd 7 * SIZE(YY), yy2 movapd xtemp2, xt1 movapd 10 * SIZE(XX), xtemp2 mulpd a2, xt1 mulpd atemp4, a2 ADD xt1, xsum2 addpd a2, yy1 movlpd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhpd 5 * SIZE(YY), yy1 addq $4 * SIZE, YY addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L16: testq $1, M jle .L18 MOVDDUP(1 * SIZE, A1, a2) movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy1 MOVDDUP(0 * SIZE, A2, a1) movapd xtemp2, xt1 mulpd a2, xt1 mulpd atemp2, a2 ADD xt1, xsum1 addpd a2, yy1 MOVDDUP(1 * SIZE, A2, a2) movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp3, a1 addpd xt1, xsum2 addpd a1, yy1 movapd xtemp2, xt1 mulpd a2, xt1 mulpd atemp4, a2 ADD xt1, xsum2 addpd a2, yy1 movlpd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) ALIGN_3 .L18: leaq (, IS, SIZE), I movsd 0 * SIZE(NEW_Y, I, 2), yy1 movhpd 1 * SIZE(NEW_Y, I, 2), yy1 movsd 2 * SIZE(NEW_Y, I, 2), yy2 movhpd 3 * SIZE(NEW_Y, I, 2), yy2 addpd xsum1, yy1 addpd xsum2, yy2 movlpd yy1, 0 * SIZE(NEW_Y, I, 2) movhpd yy1, 1 * SIZE(NEW_Y, I, 2) movlpd yy2, 2 * SIZE(NEW_Y, I, 2) movhpd yy2, 3 * SIZE(NEW_Y, I, 2) addq $2, IS movq IS, I addq $2, I cmpq N, I jle .L11 ALIGN_3 .L20: testq $1, N jle .L990 leaq (, IS, SIZE), I movapd 0 * SIZE(NEW_X, I, 4), atemp1 movapd 2 * SIZE(NEW_X, I, 4), atemp2 movsd 0 * SIZE(NEW_Y, I, 2), yy1 movhpd 1 * SIZE(NEW_Y, I, 2), yy1 #ifndef HEMV MOVDDUP(0 * SIZE, A, a1) MOVDDUP(1 * SIZE, A, a2) mulpd atemp1, a1 mulpd atemp2, a2 addpd a1, yy1 addpd a2, yy1 #else MOVDDUP(0 * SIZE, A, a1) mulpd atemp1, a1 addpd a1, yy1 #endif movlpd yy1, 0 * SIZE(NEW_Y, I, 2) movhpd yy1, 1 * SIZE(NEW_Y, I, 2) ALIGN_3 .L990: cmpq $2 * SIZE, INCY je .L999 movq M, %rax sarq $2, %rax jle .L997 ALIGN_3 .L996: movapd 0 * SIZE(NEW_Y), %xmm0 movapd 2 * SIZE(NEW_Y), %xmm1 movapd 4 * SIZE(NEW_Y), %xmm2 movapd 6 * SIZE(NEW_Y), %xmm3 movsd %xmm0, 0 * SIZE(Y) movhpd %xmm0, 1 * SIZE(Y) addq INCY, Y movsd %xmm1, 0 * SIZE(Y) movhpd %xmm1, 1 * SIZE(Y) addq INCY, Y movsd %xmm2, 0 * SIZE(Y) movhpd %xmm2, 1 * SIZE(Y) addq INCY, Y movsd %xmm3, 0 * SIZE(Y) movhpd %xmm3, 1 * SIZE(Y) addq INCY, Y addq $8 * SIZE, NEW_Y decq %rax jg .L996 ALIGN_3 .L997: movq M, %rax andq $3, %rax jle .L999 ALIGN_3 .L998: movapd 0 * SIZE(NEW_Y), %xmm0 movsd %xmm0, 0 * SIZE(Y) movhpd %xmm0, 1 * SIZE(Y) addq INCY, Y addq $2 * SIZE, NEW_Y decq %rax jg .L998 ALIGN_3 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zsymv_U_sse.S000066400000000000000000000267361313527062700200520ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef ATOM #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) #endif #ifdef CORE2 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) #endif #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) #endif #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) #endif #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 28) #endif #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 12) #define movsd movlpd #endif #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 14) #endif #ifndef WINDOWS_ABI #define STACKSIZE 80 #define OLD_Y 8 + STACKSIZE(%rsp) #define OLD_INCY 16 + STACKSIZE(%rsp) #define OLD_BUFFER 24 + STACKSIZE(%rsp) #define M ARG1 #define N ARG2 #define A ARG3 #define LDA ARG4 #define X ARG5 #define INCX ARG6 #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_LDA 48 + STACKSIZE(%rsp) #define OLD_X 56 + STACKSIZE(%rsp) #define OLD_INCX 64 + STACKSIZE(%rsp) #define OLD_Y 72 + STACKSIZE(%rsp) #define OLD_INCY 80 + STACKSIZE(%rsp) #define OLD_BUFFER 88 + STACKSIZE(%rsp) #define M ARG1 #define N ARG2 #define A ARG4 #define LDA ARG3 
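/* note (editorial): under the Windows x64 ABI only the first four integer
   arguments arrive in registers (rcx, rdx, r8, r9), so X and INCX are mapped to
   scratch registers here and loaded from their stack homes (OLD_X, OLD_INCX) in
   the prologue. */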
#define X %rdi #define INCX %rsi #endif #define Y %r10 #define INCY %r11 #define BUFFER %r12 #define TEMP %rax #define I %rax #define A1 %rbx #define A2 %rbp #define XX %r13 #define YY %r14 #define IS %r15 #define NEW_X BUFFER #define NEW_Y X #define ALPHA_R %xmm0 #define ALPHA_I %xmm1 #define xsum1 %xmm0 #define xsum2 %xmm1 #define xsum3 %xmm2 #define xsum4 %xmm3 #define atemp1 %xmm4 #define atemp2 %xmm5 #define atemp3 %xmm6 #define atemp4 %xmm7 #define xtemp1 %xmm8 #define xtemp2 %xmm9 #define a1 %xmm10 #define a2 %xmm11 #define a3 %xmm12 #define yy1 %xmm13 #define xt1 %xmm14 #define xt2 %xmm15 #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else #define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c #define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq OLD_A, A movq OLD_LDA, LDA movq OLD_X, X movq OLD_INCX, INCX movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 #endif movq OLD_Y, Y movq OLD_INCY, INCY movq OLD_BUFFER, BUFFER salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY salq $ZBASE_SHIFT, LDA testq M, M jle .L999 negq IS addq M, IS movq IS, TEMP imulq LDA, TEMP addq TEMP, A pcmpeqb %xmm3, %xmm3 xorpd %xmm2, %xmm2 pslld $31, %xmm3 unpckhps %xmm3, %xmm2 shufps $0, ALPHA_R, ALPHA_R shufps $0, ALPHA_I, ALPHA_I movaps ALPHA_I, %xmm3 unpcklps ALPHA_R, ALPHA_I unpcklps %xmm3, ALPHA_R pxor %xmm2, ALPHA_R movq BUFFER, XX movq M, %rax sarq $2, %rax jle .L02 ALIGN_3 .L01: movsd 0 * SIZE(X), %xmm4 addq INCX, X movhps 0 * SIZE(X), %xmm4 addq INCX, X movsd 0 * SIZE(X), %xmm6 addq INCX, X movhps 0 * SIZE(X), %xmm6 addq INCX, X movsldup %xmm4, %xmm3 movshdup %xmm4, %xmm4 movsldup %xmm6, %xmm5 movshdup %xmm6, %xmm6 mulps ALPHA_I, %xmm3 mulps ALPHA_R, %xmm4 mulps ALPHA_I, %xmm5 mulps ALPHA_R, %xmm6 addps %xmm4, %xmm3 addps %xmm6, %xmm5 movaps %xmm3, 4 * SIZE(XX) movaps %xmm5, 12 * SIZE(XX) shufps $0xb1, %xmm3, %xmm3 shufps $0xb1, %xmm5, %xmm5 pxor %xmm2, %xmm3 pxor %xmm2, %xmm5 movaps %xmm3, 0 * SIZE(XX) movaps %xmm5, 8 * SIZE(XX) subq $-16 * SIZE, XX decq %rax jg .L01 ALIGN_3 .L02: testq $2, M jle .L03 movsd 0 * SIZE(X), %xmm4 addq INCX, X movhps 0 * SIZE(X), %xmm4 addq INCX, X movsldup %xmm4, %xmm3 movshdup %xmm4, %xmm4 mulps ALPHA_I, %xmm3 mulps ALPHA_R, %xmm4 addps %xmm4, %xmm3 movaps %xmm3, 4 * SIZE(XX) shufps $0xb1, %xmm3, %xmm3 pxor %xmm2, %xmm3 movaps %xmm3, 0 * SIZE(XX) subq $-8 * SIZE, XX ALIGN_3 .L03: testq $1, M jle .L05 movsd 0 * SIZE(X), %xmm4 addq INCX, X movsldup %xmm4, %xmm3 movshdup %xmm4, %xmm4 mulps ALPHA_I, %xmm3 mulps ALPHA_R, %xmm4 addps %xmm4, %xmm3 movlps %xmm3, 2 * SIZE(XX) shufps $0xb1, %xmm3, %xmm3 pxor %xmm2, %xmm3 movlps %xmm3, 0 * SIZE(XX) subq $-4 * SIZE, XX ALIGN_3 .L05: /* now we don't need original X */ movq Y, NEW_Y addq $512, XX andq $-512, XX cmpq $2 * SIZE, INCY je .L10 movq Y, YY movq XX, NEW_Y movq M, %rax sarq $2, %rax jle .L07 ALIGN_3 .L06: movsd 0 * SIZE(YY), %xmm0 addq INCY, YY movhps 0 * SIZE(YY), %xmm0 addq INCY, YY movsd 0 * SIZE(YY), 
%xmm1 addq INCY, YY movhps 0 * SIZE(YY), %xmm1 addq INCY, YY movaps %xmm0, 0 * SIZE(XX) movaps %xmm1, 8 * SIZE(XX) addq $8 * SIZE, XX decq %rax jg .L06 ALIGN_3 .L07: movq M, %rax andq $3, %rax jle .L10 ALIGN_3 .L08: movsd 0 * SIZE(YY), %xmm0 addq INCY, YY movlps %xmm0, 0 * SIZE(XX) addq $2 * SIZE, XX decq %rax jg .L08 ALIGN_3 .L10: movq IS, I addq $2, I cmpq M, I jg .L20 ALIGN_3 .L11: movq A, A1 leaq (A, LDA, 1), A2 leaq (A, LDA, 2), A leaq (, IS, 4), I movsd 0 * SIZE(NEW_X, I, SIZE), atemp2 movhps 4 * SIZE(NEW_X, I, SIZE), atemp2 movsd 2 * SIZE(NEW_X, I, SIZE), atemp4 movhps 6 * SIZE(NEW_X, I, SIZE), atemp4 pshufd $0xcc, atemp2, atemp1 pshufd $0x99, atemp2, atemp2 pshufd $0xcc, atemp4, atemp3 pshufd $0x99, atemp4, atemp4 pxor xsum1, xsum1 pxor xsum2, xsum2 pxor xsum3, xsum3 pxor xsum4, xsum4 movq NEW_X, XX movq NEW_Y, YY movq IS, I sarq $2, I jle .L15 ALIGN_3 .L12: HALT subq $-16 * SIZE, XX addq $ 8 * SIZE, YY addq $ 8 * SIZE, A1 addq $ 8 * SIZE, A2 decq I jg .L12 ALIGN_3 .L15: testq $2, IS jle .L18 movsd 0 * SIZE(YY), yy1 movhps 2 * SIZE(YY), yy1 movaps 0 * SIZE(XX), xtemp1 movaps 4 * SIZE(XX), xtemp2 movsd 0 * SIZE(A1), a1 movhps 2 * SIZE(A1), a1 movaps xtemp1, xt1 movaps xtemp2, xt2 mulps a1, xt1 mulps a1, xt2 addps xt1, xsum1 addps xt2, xsum2 pshufd $0xb1, a1, xt2 mulps atemp1, a1 mulps atemp2, xt2 addps a1, yy1 addps xt2, yy1 movsd 0 * SIZE(A2), a1 movhps 2 * SIZE(A2), a1 movaps xtemp1, xt1 movaps xtemp2, xt2 mulps a1, xt1 mulps a1, xt2 addps xt1, xsum3 addps xt2, xsum4 pshufd $0xb1, a1, xt2 mulps atemp1, a1 mulps atemp2, xt2 addps a1, yy1 addps xt2, yy1 movlps yy1, 0 * SIZE(YY) movhps yy1, 2 * SIZE(YY) addq $8 * SIZE, XX addq $4 * SIZE, YY addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L18: leaq (, IS, 4), I movaps 0 * SIZE(NEW_X, I, SIZE), atemp1 movaps 4 * SIZE(NEW_X, I, SIZE), atemp2 movlps 0 * SIZE(YY), yy1 movhps 2 * SIZE(YY), yy1 movsd 0 * SIZE(A1), a1 movhps 0 * SIZE(A2), a1 movaps a1, a2 mulps atemp1, a1 mulps atemp2, a2 addps a1, xsum1 addps a2, xsum2 movsd 0 * SIZE(A2), a1 movhps 2 * SIZE(A2), a1 movaps a1, a2 mulps atemp1, a1 mulps atemp2, a2 addps a1, xsum3 addps a2, xsum4 haddps xsum2, xsum1 haddps xsum4, xsum3 haddps xsum3, xsum1 addps xsum1, yy1 movlps yy1, 0 * SIZE(YY) movhps yy1, 2 * SIZE(YY) addq $2, IS movq IS, I addq $2, I cmpq M, I jle .L11 ALIGN_3 .L20: testq $1, M jle .L990 .L990: cmpq $2 * SIZE, INCY je .L999 movq M, %rax sarq $2, %rax jle .L997 ALIGN_3 .L996: movaps 0 * SIZE(NEW_Y), %xmm0 movaps 4 * SIZE(NEW_Y), %xmm1 movlps %xmm0, 0 * SIZE(Y) addq INCY, Y movhps %xmm0, 0 * SIZE(Y) addq INCY, Y movlps %xmm1, 0 * SIZE(Y) addq INCY, Y movhps %xmm1, 0 * SIZE(Y) addq INCY, Y addq $8 * SIZE, NEW_Y decq %rax jg .L996 ALIGN_3 .L997: movq M, %rax andq $3, %rax jle .L999 ALIGN_3 .L998: movlps 0 * SIZE(NEW_Y), %xmm0 addq $2 * SIZE, NEW_Y movlps %xmm0, 0 * SIZE(Y) addq INCY, Y decq %rax jg .L998 ALIGN_3 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/zsymv_U_sse2.S000066400000000000000000000434541313527062700201300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #ifdef ATOM #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) #endif #ifdef CORE2 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) #endif #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) #endif #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) #endif #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 28) #endif #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 12) #define movsd movlpd #endif #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 24) #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 28) #endif #ifndef WINDOWS_ABI #define STACKSIZE 80 #define OLD_Y 8 + STACKSIZE(%rsp) #define OLD_INCY 16 + STACKSIZE(%rsp) #define OLD_BUFFER 24 + STACKSIZE(%rsp) #define M ARG1 #define IS ARG2 #define A ARG3 #define LDA ARG4 #define X ARG5 #define INCX ARG6 #else #define STACKSIZE 256 #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_LDA 48 + STACKSIZE(%rsp) #define OLD_X 56 + STACKSIZE(%rsp) #define OLD_INCX 64 + STACKSIZE(%rsp) #define OLD_Y 72 + STACKSIZE(%rsp) #define OLD_INCY 80 + STACKSIZE(%rsp) #define OLD_BUFFER 88 + STACKSIZE(%rsp) #define M ARG1 #define IS ARG2 #define A ARG4 #define LDA ARG3 #define X %rdi #define INCX %rsi #endif #define Y %r10 #define INCY %r11 #define BUFFER %r12 #define TEMP %rax #define I %rax #define A1 %rbx #define A2 %rbp #define XX %r13 #define 
YY %r14 #define NEW_X BUFFER #define NEW_Y X #define ALPHA_R %xmm0 #define ALPHA_I %xmm1 #define xtemp1 %xmm0 #define xtemp2 %xmm1 #define xtemp3 %xmm2 #define xtemp4 %xmm3 #define atemp1 %xmm4 #define atemp2 %xmm5 #define atemp3 %xmm6 #define atemp4 %xmm7 #define xsum1 %xmm8 #define xsum2 %xmm9 #define yy1 %xmm10 #define yy2 %xmm11 #define a1 %xmm12 #define a2 %xmm13 #define a3 %xmm14 #define xt1 %xmm15 #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else #define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c #define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c #endif #ifndef HEMV #define ADD addpd #else #define ADD subpd #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq OLD_A, A movq OLD_LDA, LDA movq OLD_X, X movq OLD_INCX, INCX movaps %xmm2, %xmm0 movaps %xmm3, %xmm1 #endif movq OLD_Y, Y movq OLD_INCY, INCY movq OLD_BUFFER, BUFFER salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY salq $ZBASE_SHIFT, LDA testq M, M jle .L999 negq IS addq M, IS movq IS, TEMP imulq LDA, TEMP addq TEMP, A pcmpeqb %xmm2, %xmm2 xorpd %xmm3, %xmm3 psllq $63, %xmm2 unpcklpd %xmm3, %xmm2 unpcklpd ALPHA_I, ALPHA_R unpcklpd ALPHA_R, ALPHA_I xorpd %xmm2, ALPHA_I movq BUFFER, XX movq M, %rax sarq $2, %rax jle .L02 ALIGN_3 .L01: MOVDDUP(0 * SIZE, X, %xmm3) MOVDDUP(1 * SIZE, X, %xmm4) addq INCX, X MOVDDUP(0 * SIZE, X, %xmm5) MOVDDUP(1 * SIZE, X, %xmm6) addq INCX, X mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm4 mulpd ALPHA_R, %xmm5 mulpd ALPHA_I, %xmm6 addpd %xmm4, %xmm3 addpd %xmm6, %xmm5 movapd %xmm3, 0 * SIZE(XX) SHUFPD_1 %xmm3, %xmm3 pxor %xmm2, %xmm3 movapd %xmm3, 2 * SIZE(XX) movapd %xmm5, 4 * SIZE(XX) SHUFPD_1 %xmm5, %xmm5 pxor %xmm2, %xmm5 movapd %xmm5, 6 * SIZE(XX) MOVDDUP(0 * SIZE, X, %xmm3) MOVDDUP(1 * SIZE, X, %xmm4) addq INCX, X MOVDDUP(0 * SIZE, X, %xmm5) MOVDDUP(1 * SIZE, X, %xmm6) addq INCX, X mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm4 mulpd ALPHA_R, %xmm5 mulpd ALPHA_I, %xmm6 addpd %xmm4, %xmm3 addpd %xmm6, %xmm5 movapd %xmm3, 8 * SIZE(XX) SHUFPD_1 %xmm3, %xmm3 pxor %xmm2, %xmm3 movapd %xmm3, 10 * SIZE(XX) movapd %xmm5, 12 * SIZE(XX) SHUFPD_1 %xmm5, %xmm5 pxor %xmm2, %xmm5 movapd %xmm5, 14 * SIZE(XX) subq $-16 * SIZE, XX decq %rax jg .L01 ALIGN_3 .L02: movq M, %rax andq $3, %rax jle .L05 ALIGN_3 .L03: MOVDDUP(0 * SIZE, X, %xmm3) MOVDDUP(1 * SIZE, X, %xmm4) addq INCX, X mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm4 addpd %xmm4, %xmm3 movapd %xmm3, 0 * SIZE(XX) SHUFPD_1 %xmm3, %xmm3 pxor %xmm2, %xmm3 movapd %xmm3, 2 * SIZE(XX) addq $4 * SIZE, XX decq %rax jg .L03 ALIGN_3 .L05: /* now we don't need original X */ movq Y, NEW_Y addq $512, XX andq $-512, XX cmpq $2 * SIZE, INCY je .L10 movq Y, YY movq XX, NEW_Y movq M, %rax sarq $2, %rax jle .L07 ALIGN_3 .L06: movsd 0 * SIZE(YY), %xmm0 movhpd 1 * SIZE(YY), %xmm0 addq INCY, YY movsd 0 * SIZE(YY), %xmm1 movhpd 1 * SIZE(YY), %xmm1 addq INCY, YY movsd 0 * SIZE(YY), %xmm2 movhpd 1 * SIZE(YY), %xmm2 addq INCY, YY movsd 0 * SIZE(YY), %xmm3 movhpd 1 * SIZE(YY), %xmm3 addq INCY, YY movapd %xmm0, 0 * 
SIZE(XX) movapd %xmm1, 2 * SIZE(XX) movapd %xmm2, 4 * SIZE(XX) movapd %xmm3, 6 * SIZE(XX) addq $8 * SIZE, XX decq %rax jg .L06 ALIGN_3 .L07: movq M, %rax andq $3, %rax jle .L10 ALIGN_3 .L08: movsd 0 * SIZE(YY), %xmm0 movhpd 1 * SIZE(YY), %xmm0 addq INCY, YY movapd %xmm0, 0 * SIZE(XX) addq $2 * SIZE, XX decq %rax jg .L08 ALIGN_3 .L10: movq IS, I addq $2, I cmpq M, I jg .L20 ALIGN_3 .L11: movq A, A1 leaq (A, LDA, 1), A2 leaq (A, LDA, 2), A leaq (, IS, 4), I movapd 0 * SIZE(NEW_X, I, SIZE), atemp1 movapd 2 * SIZE(NEW_X, I, SIZE), atemp2 movapd 4 * SIZE(NEW_X, I, SIZE), atemp3 movapd 6 * SIZE(NEW_X, I, SIZE), atemp4 pxor xsum1, xsum1 pxor xsum2, xsum2 movsd 0 * SIZE(NEW_Y), yy1 movhpd 1 * SIZE(NEW_Y), yy1 movsd 2 * SIZE(NEW_Y), yy2 movhpd 3 * SIZE(NEW_Y), yy2 movapd 0 * SIZE(NEW_X), xtemp1 movapd 2 * SIZE(NEW_X), xtemp2 movapd 4 * SIZE(NEW_X), xtemp3 movapd 6 * SIZE(NEW_X), xtemp4 MOVDDUP(0 * SIZE, A1, a1) MOVDDUP(2 * SIZE, A2, a2) MOVDDUP(1 * SIZE, A1, a3) movq NEW_X, XX movq NEW_Y, YY movq IS, I sarq $2, I jle .L15 ALIGN_3 .L12: movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy1 MOVDDUP(3 * SIZE, A2, a1) PREFETCH PREFETCHSIZE(A1) movapd xtemp3, xt1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum2 addpd a2, yy2 MOVDDUP(2 * SIZE, A1, a2) movapd xtemp2, xt1 mulpd a3, xt1 mulpd atemp2, a3 ADD xt1, xsum1 addpd a3, yy1 MOVDDUP(0 * SIZE, A2, a3) movapd xtemp4, xt1 mulpd a1, xt1 mulpd atemp4, a1 ADD xt1, xsum2 addpd a1, yy2 MOVDDUP(3 * SIZE, A1, a1) PREFETCH PREFETCHSIZE(XX) movapd xtemp3, xt1 movapd 12 * SIZE(XX), xtemp3 mulpd a2, xt1 mulpd atemp1, a2 addpd xt1, xsum1 addpd a2, yy2 MOVDDUP(1 * SIZE, A2, a2) movapd xtemp1, xt1 movapd 8 * SIZE(XX), xtemp1 mulpd a3, xt1 mulpd atemp3, a3 addpd xt1, xsum2 addpd a3, yy1 MOVDDUP(4 * SIZE, A1, a3) movapd xtemp4, xt1 movapd 14 * SIZE(XX), xtemp4 mulpd a1, xt1 mulpd atemp2, a1 ADD xt1, xsum1 addpd a1, yy2 MOVDDUP(6 * SIZE, A2, a1) movlpd yy2, 2 * SIZE(YY) movhpd yy2, 3 * SIZE(YY) movsd 6 * SIZE(YY), yy2 movhpd 7 * SIZE(YY), yy2 movapd xtemp2, xt1 movapd 10 * SIZE(XX), xtemp2 mulpd a2, xt1 mulpd atemp4, a2 ADD xt1, xsum2 addpd a2, yy1 MOVDDUP(5 * SIZE, A1, a2) PREFETCH PREFETCHSIZE(A2) movlpd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhpd 5 * SIZE(YY), yy1 movapd xtemp1, xt1 mulpd a3, xt1 mulpd atemp1, a3 addpd xt1, xsum1 addpd a3, yy1 MOVDDUP(7 * SIZE, A2, a3) movapd xtemp3, xt1 mulpd a1, xt1 mulpd atemp3, a1 addpd xt1, xsum2 addpd a1, yy2 MOVDDUP(6 * SIZE, A1, a1) movapd xtemp2, xt1 mulpd a2, xt1 mulpd atemp2, a2 ADD xt1, xsum1 addpd a2, yy1 MOVDDUP(4 * SIZE, A2, a2) PREFETCHW PREFETCHSIZE(YY) movapd xtemp4, xt1 mulpd a3, xt1 mulpd atemp4, a3 ADD xt1, xsum2 addpd a3, yy2 MOVDDUP(7 * SIZE, A1, a3) movapd xtemp3, xt1 movapd 20 * SIZE(XX), xtemp3 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy2 MOVDDUP(5 * SIZE, A2, a1) movapd xtemp1, xt1 movapd 16 * SIZE(XX), xtemp1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum2 addpd a2, yy1 MOVDDUP(10 * SIZE, A2, a2) movapd xtemp4, xt1 movapd 22 * SIZE(XX), xtemp4 mulpd a3, xt1 mulpd atemp2, a3 ADD xt1, xsum1 addpd a3, yy2 MOVDDUP( 9 * SIZE, A1, a3) movlpd yy2, 6 * SIZE(YY) movhpd yy2, 7 * SIZE(YY) movsd 10 * SIZE(YY), yy2 movhpd 11 * SIZE(YY), yy2 movapd xtemp2, xt1 movapd 18 * SIZE(XX), xtemp2 mulpd a1, xt1 mulpd atemp4, a1 ADD xt1, xsum2 addpd a1, yy1 MOVDDUP( 8 * SIZE, A1, a1) movlpd yy1, 4 * SIZE(YY) movhpd yy1, 5 * SIZE(YY) movsd 8 * SIZE(YY), yy1 movhpd 9 * SIZE(YY), yy1 subq $-16 * SIZE, XX addq $ 8 * SIZE, YY addq $ 8 * SIZE, A1 addq $ 8 * SIZE, 
A2 decq I jg .L12 ALIGN_3 .L15: testq $2, IS jle .L18 movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy1 MOVDDUP(1 * SIZE, A1, a1) movapd xtemp3, xt1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum2 addpd a2, yy2 MOVDDUP(3 * SIZE, A2, a2) movapd xtemp2, xt1 mulpd a1, xt1 mulpd atemp2, a1 ADD xt1, xsum1 addpd a1, yy1 MOVDDUP(2 * SIZE, A1, a1) movapd xtemp4, xt1 mulpd a2, xt1 mulpd atemp4, a2 ADD xt1, xsum2 addpd a2, yy2 MOVDDUP(0 * SIZE, A2, a2) movapd xtemp3, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy2 MOVDDUP(3 * SIZE, A1, a1) movapd xtemp1, xt1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum2 addpd a2, yy1 MOVDDUP(1 * SIZE, A2, a2) movapd xtemp4, xt1 mulpd a1, xt1 mulpd atemp2, a1 ADD xt1, xsum1 addpd a1, yy2 movlpd yy2, 2 * SIZE(YY) movhpd yy2, 3 * SIZE(YY) movsd 6 * SIZE(YY), yy2 movhpd 7 * SIZE(YY), yy2 movapd xtemp2, xt1 mulpd a2, xt1 mulpd atemp4, a2 ADD xt1, xsum2 addpd a2, yy1 movlpd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhpd 5 * SIZE(YY), yy1 addq $4 * SIZE, YY addq $4 * SIZE, A1 addq $4 * SIZE, A2 ALIGN_3 .L18: MOVDDUP(0 * SIZE, A1, a1) MOVDDUP(0 * SIZE, A2, a2) mulpd atemp1, a1 mulpd atemp1, a2 addpd a1, xsum1 addpd a2, xsum2 #ifndef HEMV MOVDDUP(1 * SIZE, A1, a1) MOVDDUP(1 * SIZE, A2, a2) mulpd atemp2, a1 mulpd atemp2, a2 addpd a1, xsum1 addpd a2, xsum2 #else MOVDDUP(1 * SIZE, A2, a2) mulpd atemp2, a2 subpd a2, xsum2 #endif MOVDDUP(0 * SIZE, A2, a1) MOVDDUP(2 * SIZE, A2, a2) mulpd atemp3, a1 mulpd atemp3, a2 addpd a1, xsum1 addpd a2, xsum2 #ifndef HEMV MOVDDUP(1 * SIZE, A2, a1) MOVDDUP(3 * SIZE, A2, a2) mulpd atemp4, a1 mulpd atemp4, a2 addpd a1, xsum1 addpd a2, xsum2 #else MOVDDUP(1 * SIZE, A2, a1) mulpd atemp4, a1 addpd a1, xsum1 #endif addpd xsum1, yy1 addpd xsum2, yy2 movlpd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) movlpd yy2, 2 * SIZE(YY) movhpd yy2, 3 * SIZE(YY) addq $2, IS movq IS, I addq $2, I cmpq M, I jle .L11 ALIGN_3 .L20: testq $1, M jle .L990 movq A, A1 leaq (, IS, 4), I movapd 0 * SIZE(NEW_X, I, SIZE), atemp1 movapd 2 * SIZE(NEW_X, I, SIZE), atemp2 pxor xsum1, xsum1 pxor xsum2, xsum2 MOVDDUP(0 * SIZE, A1, a1) MOVDDUP(1 * SIZE, A1, a2) movapd 0 * SIZE(NEW_X), xtemp1 movapd 2 * SIZE(NEW_X), xtemp2 movapd 4 * SIZE(NEW_X), xtemp3 movapd 6 * SIZE(NEW_X), xtemp4 movsd 0 * SIZE(NEW_Y), yy1 movhpd 1 * SIZE(NEW_Y), yy1 movsd 2 * SIZE(NEW_Y), yy2 movhpd 3 * SIZE(NEW_Y), yy2 movq NEW_X, XX movq NEW_Y, YY movq IS, I sarq $1, I jle .L28 ALIGN_3 .L22: movapd xtemp1, xt1 movapd 8 * SIZE(XX), xtemp1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy1 MOVDDUP(2 * SIZE, A1, a1) movapd xtemp2, xt1 movapd 10 * SIZE(XX), xtemp2 mulpd a2, xt1 mulpd atemp2, a2 ADD xt1, xsum2 addpd a2, yy1 MOVDDUP(3 * SIZE, A1, a2) movlpd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhpd 5 * SIZE(YY), yy1 movapd xtemp3, xt1 movapd 12 * SIZE(XX), xtemp3 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy2 MOVDDUP(4 * SIZE, A1, a1) movapd xtemp4, xt1 movapd 14 * SIZE(XX), xtemp4 mulpd a2, xt1 mulpd atemp2, a2 ADD xt1, xsum2 addpd a2, yy2 MOVDDUP(5 * SIZE, A1, a2) movlpd yy2, 2 * SIZE(YY) movhpd yy2, 3 * SIZE(YY) movsd 6 * SIZE(YY), yy2 movhpd 7 * SIZE(YY), yy2 addq $8 * SIZE, XX addq $4 * SIZE, YY addq $4 * SIZE, A1 decq I jg .L22 ALIGN_3 .L28: MOVDDUP(0 * SIZE, A1, a1) #ifndef HEMV MOVDDUP(1 * SIZE, A1, a2) mulpd atemp1, a1 mulpd atemp2, a2 addpd a1, xsum1 addpd a2, xsum2 #else mulpd atemp1, a1 addpd a1, xsum1 #endif addpd xsum2, xsum1 addpd xsum1, yy1 movlpd yy1, 0 * SIZE(YY) 
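/* annotation (editorial): the low and high 64-bit halves of yy1 - the real and
   imaginary parts of the updated y element - are stored separately with movlpd
   and movhpd. */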
movhpd yy1, 1 * SIZE(YY) ALIGN_3 .L990: cmpq $2 * SIZE, INCY je .L999 movq M, %rax sarq $2, %rax jle .L997 ALIGN_3 .L996: movapd 0 * SIZE(NEW_Y), %xmm0 movapd 2 * SIZE(NEW_Y), %xmm1 movapd 4 * SIZE(NEW_Y), %xmm2 movapd 6 * SIZE(NEW_Y), %xmm3 movsd %xmm0, 0 * SIZE(Y) movhpd %xmm0, 1 * SIZE(Y) addq INCY, Y movsd %xmm1, 0 * SIZE(Y) movhpd %xmm1, 1 * SIZE(Y) addq INCY, Y movsd %xmm2, 0 * SIZE(Y) movhpd %xmm2, 1 * SIZE(Y) addq INCY, Y movsd %xmm3, 0 * SIZE(Y) movhpd %xmm3, 1 * SIZE(Y) addq INCY, Y addq $8 * SIZE, NEW_Y decq %rax jg .L996 ALIGN_3 .L997: movq M, %rax andq $3, %rax jle .L999 ALIGN_3 .L998: movapd 0 * SIZE(NEW_Y), %xmm0 movsd %xmm0, 0 * SIZE(Y) movhpd %xmm0, 1 * SIZE(Y) addq INCY, Y addq $2 * SIZE, NEW_Y decq %rax jg .L998 ALIGN_3 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_LN_2x1_atom.S000066400000000000000000000436621313527062700223430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %r13 #define BO %r14 #define CO1 %r15 #define BB %rbx #define KK %rbp #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define KKK 56(%rsp) #define AORIG 64(%rsp) #else #define STACKSIZE 256 #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define KKK 232(%rsp) #define AORIG 240(%rsp) #endif #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 8 + 3) #ifndef CONJ #define ADDSD1 addsd #define ADDSD2 addsd #define ADDSD3 addsd #define ADDSD4 subsd #elif defined(LN) || defined(LT) #define ADDSD1 addsd #define ADDSD2 addsd #define ADDSD3 subsd #define ADDSD4 addsd #else #define ADDSD1 addsd #define ADDSD2 subsd #define ADDSD3 addsd #define ADDSD4 addsd #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #endif movq OLD_LDC, LDC movq OLD_OFFSET, KK movq KK, OFFSET salq $ZBASE_SHIFT, LDC #ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, KK subq OFFSET, KK #endif movq N, J testq N, N jle .L999 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif #ifdef LT movq OFFSET, KK #endif movq K, %rax salq $ZBASE_SHIFT, %rax leaq (B, %rax), BB testq $1, M jle .L20 #ifdef LN movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd 1 * SIZE(AO), %xmm4 xorps %xmm5, %xmm5 movsd 2 * SIZE(AO), %xmm5 xorps %xmm6, %xmm6 movsd 3 * SIZE(AO), %xmm7 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movsd 1 * SIZE(BO), %xmm3 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADDSD2 %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD4 %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 2 * SIZE(BO), %xmm1 ADDSD1 %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm2 ADDSD3 %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 ADDSD2 %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 ADDSD4 %xmm6, 
%xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 4 * SIZE(BO), %xmm1 ADDSD1 %xmm5, %xmm8 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm2 ADDSD3 %xmm7, %xmm10 movsd 7 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm6 movsd 5 * SIZE(BO), %xmm3 ADDSD2 %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD4 %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 6 * SIZE(BO), %xmm1 ADDSD1 %xmm0, %xmm8 movsd 8 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm2 ADDSD3 %xmm4, %xmm10 movsd 9 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm6 movsd 7 * SIZE(BO), %xmm3 ADDSD2 %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 ADDSD4 %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 8 * SIZE(BO), %xmm1 ADDSD1 %xmm5, %xmm8 movsd 10 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm2 ADDSD3 %xmm7, %xmm10 movsd 11 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm6 movsd 9 * SIZE(BO), %xmm3 addq $8 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH BRANCH je .L29 ALIGN_4 .L26: ADDSD2 %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD4 %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 2 * SIZE(BO), %xmm1 mulsd %xmm3, %xmm2 ADDSD1 %xmm0, %xmm8 movsd 2 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 ADDSD3 %xmm4, %xmm10 movsd 3 * SIZE(AO), %xmm4 addq $2 * SIZE, AO addq $2 * SIZE, BO decq %rax BRANCH jg .L26 ALIGN_4 .L29: ADDSD2 %xmm2, %xmm9 ADDSD4 %xmm6, %xmm11 addsd %xmm11, %xmm8 addsd %xmm9, %xmm10 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 movsd 1 * SIZE(BO), %xmm1 #else movsd 0 * SIZE(AO), %xmm0 movsd 1 * SIZE(AO), %xmm1 #endif subsd %xmm8, %xmm0 subsd %xmm10, %xmm1 #if defined(LN) || defined(LT) movsd 0 * SIZE(AO), %xmm6 movaps %xmm0, %xmm5 movsd 1 * SIZE(AO), %xmm7 movaps %xmm1, %xmm4 mulsd %xmm6, %xmm0 mulsd %xmm6, %xmm1 mulsd %xmm7, %xmm5 mulsd %xmm7, %xmm4 ADDSD4 %xmm4, %xmm0 ADDSD3 %xmm5, %xmm1 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(BO), %xmm8 movaps %xmm0, %xmm5 movsd 1 * SIZE(BO), %xmm9 movaps %xmm1, %xmm4 mulsd %xmm8, %xmm0 mulsd %xmm8, %xmm1 mulsd %xmm9, %xmm5 mulsd %xmm9, %xmm4 ADDSD4 %xmm4, %xmm0 ADDSD2 %xmm5, %xmm1 #endif #ifdef LN subq $2 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BO) movsd %xmm1, 1 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) movsd %xmm1, 1 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L20: movq M, I sarq $1, I jle .L99 ALIGN_4 .L10: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #else movq B, BO #endif prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd 1 * SIZE(AO), %xmm4 xorps %xmm5, %xmm5 movsd 2 * SIZE(AO), %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movsd 1 * SIZE(BO), %xmm3 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 prefetcht0 3 * 
SIZE(CO1) xorps %xmm12, %xmm12 xorps %xmm13, %xmm13 xorps %xmm14, %xmm14 xorps %xmm15, %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L15 ALIGN_4 .L12: ADDSD2 %xmm2, %xmm13 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD3 %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm15 PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO) movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 ADDSD1 %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 ADDSD2 %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 ADDSD3 %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 2 * SIZE(BO), %xmm1 ADDSD1 %xmm5, %xmm12 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 ADDSD2 %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD3 %xmm7, %xmm14 movsd 7 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 ADDSD1 %xmm0, %xmm8 movsd 8 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 ADDSD2 %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 ADDSD3 %xmm4, %xmm10 movsd 9 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 4 * SIZE(BO), %xmm1 ADDSD1 %xmm5, %xmm12 movsd 10 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 5 * SIZE(BO), %xmm3 ADDSD2 %xmm2, %xmm13 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD3 %xmm7, %xmm14 movsd 11 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 ADDSD1 %xmm0, %xmm8 movsd 12 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 ADDSD2 %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 ADDSD3 %xmm4, %xmm10 movsd 13 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 6 * SIZE(BO), %xmm1 ADDSD1 %xmm5, %xmm12 movsd 14 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 7 * SIZE(BO), %xmm3 ADDSD2 %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD3 %xmm7, %xmm14 movsd 15 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 subq $-16 * SIZE, AO ADDSD4 %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 ADDSD1 %xmm0, %xmm8 movsd 0 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 ADDSD2 %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addq $ 8 * SIZE, BO ADDSD3 %xmm4, %xmm10 movsd 1 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 decq %rax ADDSD4 %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 0 * SIZE(BO), %xmm1 ADDSD1 %xmm5, %xmm12 movsd 2 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 1 * SIZE(BO), %xmm3 jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH BRANCH je .L18 ALIGN_4 .L16: ADDSD2 %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD3 %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 ADDSD1 %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 ADDSD2 %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 ADDSD3 %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 2 * SIZE(BO), %xmm1 ADDSD1 %xmm5, %xmm12 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addq $4 * SIZE, AO addq $2 * SIZE, BO decq %rax BRANCH jg .L16 ALIGN_4 .L18: ADDSD2 %xmm2, %xmm13 ADDSD3 %xmm7, %xmm14 ADDSD4 %xmm6, %xmm15 addsd %xmm11, %xmm8 addsd %xmm9, %xmm10 addsd %xmm15, %xmm12 addsd %xmm13, %xmm14 #if 
defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 movsd 1 * SIZE(BO), %xmm1 movsd 2 * SIZE(BO), %xmm2 movsd 3 * SIZE(BO), %xmm3 #else movsd 0 * SIZE(AO), %xmm0 movsd 1 * SIZE(AO), %xmm1 movsd 2 * SIZE(AO), %xmm2 movsd 3 * SIZE(AO), %xmm3 #endif subsd %xmm8, %xmm0 subsd %xmm10, %xmm1 subsd %xmm12, %xmm2 subsd %xmm14, %xmm3 #ifdef LN movsd 6 * SIZE(AO), %xmm6 movsd 7 * SIZE(AO), %xmm7 movaps %xmm2, %xmm5 movaps %xmm3, %xmm4 mulsd %xmm6, %xmm2 mulsd %xmm6, %xmm3 movsd 4 * SIZE(AO), %xmm6 mulsd %xmm7, %xmm5 mulsd %xmm7, %xmm4 movsd 5 * SIZE(AO), %xmm7 ADDSD4 %xmm4, %xmm2 ADDSD3 %xmm5, %xmm3 movaps %xmm2, %xmm4 movaps %xmm3, %xmm5 mulsd %xmm6, %xmm4 mulsd %xmm7, %xmm5 mulsd %xmm3, %xmm6 mulsd %xmm2, %xmm7 subsd %xmm4, %xmm0 subsd %xmm6, %xmm1 movsd 0 * SIZE(AO), %xmm6 ADDSD3 %xmm5, %xmm0 ADDSD4 %xmm7, %xmm1 movsd 1 * SIZE(AO), %xmm7 movaps %xmm0, %xmm5 movaps %xmm1, %xmm4 mulsd %xmm6, %xmm0 mulsd %xmm6, %xmm1 mulsd %xmm7, %xmm5 mulsd %xmm7, %xmm4 ADDSD4 %xmm4, %xmm0 ADDSD3 %xmm5, %xmm1 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm6 movsd 1 * SIZE(AO), %xmm7 movaps %xmm0, %xmm5 movaps %xmm1, %xmm4 mulsd %xmm6, %xmm0 mulsd %xmm6, %xmm1 movsd 2 * SIZE(AO), %xmm6 mulsd %xmm7, %xmm5 mulsd %xmm7, %xmm4 movsd 3 * SIZE(AO), %xmm7 ADDSD4 %xmm4, %xmm0 ADDSD3 %xmm5, %xmm1 movaps %xmm0, %xmm4 movaps %xmm1, %xmm5 mulsd %xmm6, %xmm4 mulsd %xmm7, %xmm5 mulsd %xmm1, %xmm6 mulsd %xmm0, %xmm7 subsd %xmm4, %xmm2 subsd %xmm6, %xmm3 movsd 6 * SIZE(AO), %xmm6 ADDSD3 %xmm5, %xmm2 ADDSD4 %xmm7, %xmm3 movsd 7 * SIZE(AO), %xmm7 movaps %xmm2, %xmm5 movaps %xmm3, %xmm4 mulsd %xmm6, %xmm2 mulsd %xmm6, %xmm3 mulsd %xmm7, %xmm5 mulsd %xmm7, %xmm4 ADDSD4 %xmm4, %xmm2 ADDSD3 %xmm5, %xmm3 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(BO), %xmm8 movaps %xmm0, %xmm5 movsd 1 * SIZE(BO), %xmm9 movaps %xmm1, %xmm4 movaps %xmm2, %xmm7 movaps %xmm3, %xmm6 mulsd %xmm8, %xmm0 mulsd %xmm8, %xmm1 mulsd %xmm9, %xmm5 mulsd %xmm9, %xmm4 ADDSD4 %xmm4, %xmm0 mulsd %xmm8, %xmm2 ADDSD2 %xmm5, %xmm1 mulsd %xmm8, %xmm3 mulsd %xmm9, %xmm7 mulsd %xmm9, %xmm6 ADDSD4 %xmm6, %xmm2 ADDSD2 %xmm7, %xmm3 #endif #ifdef LN subq $4 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 1 * SIZE(CO1) movsd %xmm2, 2 * SIZE(CO1) movsd %xmm3, 3 * SIZE(CO1) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BO) movsd %xmm1, 1 * SIZE(BO) movsd %xmm2, 2 * SIZE(BO) movsd %xmm3, 3 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) movsd %xmm1, 1 * SIZE(AO) movsd %xmm2, 2 * SIZE(AO) movsd %xmm3, 3 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L10 ALIGN_4 .L99: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif decq J # j -- jg .L01 ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 
160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_LN_2x2_core2.S000066400000000000000000001126721313527062700224140ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define POSINV 0(%rsp) #define J 16(%rsp) #define OFFSET 24(%rsp) #define KK 32(%rsp) #define KKK 40(%rsp) #define AORIG 48(%rsp) #define BORIG 56(%rsp) #define BUFFER 128(%rsp) #define PREFETCH_R (8 * 4 + 0) #define PREFETCH_W (PREFETCH_R) #define PREFETCHSIZE (8 * 17 + 2) #define PREFETCH prefetcht0 #ifndef CONJ #define NN #else #if defined(LN) || defined(LT) #define CN #else #define NC #endif #endif #define ADD1 addpd #define ADD2 addpd PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif movq OLD_LDC, LDC movq OLD_OFFSET, %rax movq %rsp, %r15 # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq %rax, KK movq %rax, OFFSET movq OLD_M, M movq OLD_N, N subq $-16 * SIZE, A subq $-16 * SIZE, B pcmpeqb %xmm15, %xmm15 psllq $63, %xmm15 # Generate mask pxor %xmm2, %xmm2 movlpd %xmm2, 0 + POSINV movlpd %xmm15, 8 + POSINV salq $ZBASE_SHIFT, LDC #ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $1, J # j = (n >> 2) jle .L100 ALIGN_4 .L01: #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq 16 * SIZE + BUFFER, BO #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L03 addq %rax, %rax ALIGN_4 .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 movddup -14 * SIZE(B), %xmm10 movddup -13 * SIZE(B), %xmm11 movddup -12 * SIZE(B), %xmm12 movddup -11 * SIZE(B), %xmm13 movddup -10 * SIZE(B), %xmm14 movddup -9 * SIZE(B), %xmm15 prefetcht0 (PREFETCH_W + 0) * SIZE(BO) movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm10, -12 * SIZE(BO) movapd %xmm11, -10 * SIZE(BO) prefetcht0 (PREFETCH_W + 8) * SIZE(BO) movapd %xmm12, -8 * SIZE(BO) movapd %xmm13, -6 * SIZE(BO) movapd %xmm14, -4 * SIZE(BO) movapd %xmm15, -2 * SIZE(BO) addq $ 8 * SIZE, B subq 
$-16 * SIZE, BO decq %rax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L05 ALIGN_4 .L04: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 movddup -14 * SIZE(B), %xmm10 movddup -13 * SIZE(B), %xmm11 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm10, -12 * SIZE(BO) movapd %xmm11, -10 * SIZE(BO) addq $ 4 * SIZE, B addq $ 8 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L05: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif testq $1, M jle .L30 #ifdef LN movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax addq %rax, AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L42 .L41: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 movapd -14 * SIZE(AO), %xmm0 movapd -8 * SIZE(BO), %xmm2 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm4 movapd -2 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 movapd -12 * SIZE(AO), %xmm0 movapd 0 * SIZE(BO), %xmm2 movapd 2 * SIZE(BO), %xmm3 movapd 4 * SIZE(BO), %xmm4 movapd 6 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 movapd -10 * SIZE(AO), %xmm0 movapd 8 * SIZE(BO), %xmm2 movapd 10 * SIZE(BO), %xmm3 movapd 12 * SIZE(BO), %xmm4 movapd 14 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 subq $ -8 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax jne .L41 .L42: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movapd POSINV, %xmm7 andq $3, %rax # if (k & 1) BRANCH jle .L44 .L43: movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax jg .L43 ALIGN_4 .L44: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif SHUFPD_1 %xmm9, %xmm9 SHUFPD_1 %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm7, %xmm9 xorpd %xmm7, %xmm11 #else xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 #endif #if defined(NN) || defined(NT) || defined(TN) 
|| defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm9, %xmm8 subpd %xmm11, %xmm10 #else addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm9 movapd -14 * SIZE(B), %xmm11 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm11 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 #endif #ifndef CONJ SHUFPD_1 %xmm7, %xmm7 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 #endif #ifdef RN movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -10 * SIZE(B), %xmm4 movddup -9 * SIZE(B), %xmm5 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 movapd %xmm9, %xmm8 pshufd $0x4e, %xmm9, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm11 subpd %xmm12, %xmm11 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm4, %xmm11 mulpd %xmm5, %xmm10 addpd %xmm10, %xmm11 #endif #ifdef RT movddup -10 * SIZE(B), %xmm0 movddup -9 * SIZE(B), %xmm1 movddup -12 * SIZE(B), %xmm2 movddup -11 * SIZE(B), %xmm3 movddup -16 * SIZE(B), %xmm4 movddup -15 * SIZE(B), %xmm5 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm10, %xmm11 movapd %xmm11, %xmm8 pshufd $0x4e, %xmm11, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm9 subpd %xmm12, %xmm9 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO2) movhpd %xmm11, 1 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movapd %xmm11, -14 * SIZE(B) movddup %xmm9, %xmm8 unpckhpd %xmm9, %xmm9 movddup %xmm11, %xmm10 unpckhpd %xmm11, %xmm11 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm10, -12 * SIZE(BO) movapd %xmm11, -10 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) movapd %xmm11, -14 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: movq M, I sarq $1, I # i = (m >> 2) jle .L99 ALIGN_4 .L10: leaq (PREFETCH_R + 0) * SIZE(B), BB #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif prefetcht2 0 * SIZE(BB) #ifdef LN pxor %xmm8, %xmm8 prefetcht1 -3 * SIZE(CO1) pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 prefetcht1 -3 * SIZE(CO2) pxor %xmm11, %xmm11 #else pxor %xmm8, %xmm8 prefetcht1 3 * SIZE(CO1) pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 prefetcht1 3 * SIZE(CO2) pxor %xmm11, %xmm11 #endif pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 pxor %xmm14, 
%xmm14 pxor %xmm15, %xmm15 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 subq $-8 * SIZE, BB #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm10 movapd -16 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm14 movapd %xmm2, %xmm3 movapd -14 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm11 movapd -14 * SIZE(BO), %xmm4 ADD2 %xmm5, %xmm15 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 movapd -12 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm12 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm9 movapd -10 * SIZE(BO), %xmm4 ADD2 %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 movapd -12 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm10 movapd -8 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm14 movapd %xmm2, %xmm3 movapd -10 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm11 ADD2 %xmm5, %xmm15 movapd -6 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 movapd -4 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 movapd -2 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 movapd -8 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm10 movapd 0 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm14 movapd %xmm2, %xmm3 movapd -6 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm11 movapd 2 * SIZE(BO), %xmm4 ADD2 %xmm5, %xmm15 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 movapd 4 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm12 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm9 movapd 6 * SIZE(BO), %xmm4 ADD2 %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 movapd -4 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm10 ADD1 %xmm3, %xmm14 movapd 8 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 movapd -2 * SIZE(AO), %xmm1 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm11 movapd 10 * SIZE(BO), %xmm4 ADD2 %xmm5, %xmm15 subq $-32 * SIZE, BO movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 movapd -20 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 subq $-16 * SIZE, AO mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 movapd -18 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 subq $1, %rax BRANCH BRANCH jg .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movapd POSINV, %xmm7 andq $3, %rax BRANCH BRANCH je .L19 ALIGN_4 .L16: ADD1 %xmm2, %xmm10 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm11 ADD2 %xmm5, %xmm15 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -14 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 movapd -16 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm2 movapd -14 * SIZE(AO), %xmm1 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 movapd -12 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -10 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addq $4 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_4 .L19: ADD1 %xmm2, %xmm10 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm11 ADD2 %xmm5, %xmm15 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq 
$2, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif SHUFPD_1 %xmm9, %xmm9 SHUFPD_1 %xmm11, %xmm11 SHUFPD_1 %xmm13, %xmm13 SHUFPD_1 %xmm15, %xmm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm7, %xmm9 xorpd %xmm7, %xmm11 xorpd %xmm7, %xmm13 xorpd %xmm7, %xmm15 #else xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm9, %xmm8 subpd %xmm11, %xmm10 subpd %xmm13, %xmm12 subpd %xmm15, %xmm14 #else addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm13, %xmm12 addpd %xmm15, %xmm14 #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm9 movapd -14 * SIZE(B), %xmm11 movapd -12 * SIZE(B), %xmm13 movapd -10 * SIZE(B), %xmm15 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm13 movapd -12 * SIZE(AO), %xmm11 movapd -10 * SIZE(AO), %xmm15 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 #endif #ifndef CONJ SHUFPD_1 %xmm7, %xmm7 #endif #ifdef LN movddup -10 * SIZE(AO), %xmm0 movddup -9 * SIZE(AO), %xmm1 movddup -12 * SIZE(AO), %xmm2 movddup -11 * SIZE(AO), %xmm3 movddup -16 * SIZE(AO), %xmm4 movddup -15 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm13, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 mulpd %xmm0, %xmm15 mulpd %xmm1, %xmm14 addpd %xmm12, %xmm13 addpd %xmm14, %xmm15 movapd %xmm13, %xmm8 movapd %xmm15, %xmm10 pshufd $0x4e, %xmm13, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm12, %xmm9 subpd %xmm14, %xmm11 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 mulpd %xmm4, %xmm11 mulpd %xmm5, %xmm10 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 movddup -14 * SIZE(AO), %xmm2 movddup -13 * SIZE(AO), %xmm3 movddup -10 * SIZE(AO), %xmm4 movddup -9 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 movapd %xmm9, %xmm8 movapd %xmm11, %xmm10 pshufd $0x4e, %xmm9, %xmm12 pshufd $0x4e, %xmm11, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm13 subpd %xmm10, %xmm15 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 pshufd $0x4e, %xmm13, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm4, %xmm13 mulpd %xmm5, %xmm12 mulpd %xmm4, %xmm15 mulpd %xmm5, %xmm14 addpd %xmm12, %xmm13 addpd %xmm14, %xmm15 #endif #ifdef RN movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -10 * SIZE(B), %xmm4 movddup -9 * SIZE(B), %xmm5 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, 
%xmm13 mulpd %xmm1, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 movapd %xmm9, %xmm8 movapd %xmm13, %xmm10 pshufd $0x4e, %xmm9, %xmm12 pshufd $0x4e, %xmm13, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm11 subpd %xmm10, %xmm15 subpd %xmm12, %xmm11 subpd %xmm14, %xmm15 pshufd $0x4e, %xmm11, %xmm10 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm10 xorpd %xmm7, %xmm14 mulpd %xmm4, %xmm11 mulpd %xmm5, %xmm10 mulpd %xmm4, %xmm15 mulpd %xmm5, %xmm14 addpd %xmm10, %xmm11 addpd %xmm14, %xmm15 #endif #ifdef RT movddup -10 * SIZE(B), %xmm0 movddup -9 * SIZE(B), %xmm1 movddup -12 * SIZE(B), %xmm2 movddup -11 * SIZE(B), %xmm3 movddup -16 * SIZE(B), %xmm4 movddup -15 * SIZE(B), %xmm5 pshufd $0x4e, %xmm11, %xmm10 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm10 xorpd %xmm7, %xmm14 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 mulpd %xmm0, %xmm15 mulpd %xmm1, %xmm14 addpd %xmm10, %xmm11 addpd %xmm14, %xmm15 movapd %xmm11, %xmm8 movapd %xmm15, %xmm10 pshufd $0x4e, %xmm11, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm9 subpd %xmm10, %xmm13 subpd %xmm12, %xmm9 subpd %xmm14, %xmm13 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 mulpd %xmm4, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm13, 2 * SIZE(CO1) movhpd %xmm13, 3 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO2) movhpd %xmm11, 1 * SIZE(CO2) movsd %xmm15, 2 * SIZE(CO2) movhpd %xmm15, 3 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movapd %xmm11, -14 * SIZE(B) movapd %xmm13, -12 * SIZE(B) movapd %xmm15, -10 * SIZE(B) movddup %xmm9, %xmm8 unpckhpd %xmm9, %xmm9 movddup %xmm11, %xmm10 unpckhpd %xmm11, %xmm11 movddup %xmm13, %xmm12 unpckhpd %xmm13, %xmm13 movddup %xmm15, %xmm14 unpckhpd %xmm15, %xmm15 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm10, -12 * SIZE(BO) movapd %xmm11, -10 * SIZE(BO) movapd %xmm12, -8 * SIZE(BO) movapd %xmm13, -6 * SIZE(BO) movapd %xmm14, -4 * SIZE(BO) movapd %xmm15, -2 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) movapd %xmm13, -14 * SIZE(AO) movapd %xmm11, -12 * SIZE(AO) movapd %xmm15, -10 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L10 ALIGN_4 .L99: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 2 * COMPSIZE), B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif decq J # j -- jg .L01 .L100: testq $1, N jle .L999 .L101: #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) 
|| defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L103 ALIGN_4 .L102: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 movddup -14 * SIZE(B), %xmm10 movddup -13 * SIZE(B), %xmm11 movddup -12 * SIZE(B), %xmm12 movddup -11 * SIZE(B), %xmm13 movddup -10 * SIZE(B), %xmm14 movddup -9 * SIZE(B), %xmm15 movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) movapd %xmm10, 4 * SIZE(BO) movapd %xmm11, 6 * SIZE(BO) movapd %xmm12, 8 * SIZE(BO) movapd %xmm13, 10 * SIZE(BO) movapd %xmm14, 12 * SIZE(BO) movapd %xmm15, 14 * SIZE(BO) addq $ 8 * SIZE, B subq $-16 * SIZE, BO decq %rax jne .L102 ALIGN_4 .L103: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L105 ALIGN_4 .L104: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) addq $4 * SIZE, BO addq $2 * SIZE, B decq %rax jne .L104 ALIGN_4 .L105: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif testq $1, M jle .L130 ALIGN_4 .L140: #ifdef LN movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L142 .L141: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 movapd -12 * SIZE(AO), %xmm0 movapd -10 * SIZE(AO), %xmm1 movapd -8 * SIZE(BO), %xmm2 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm4 movapd -2 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 subq $ -8 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jne .L141 .L142: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movapd POSINV, %xmm7 andq $3, %rax # if (k & 1) BRANCH jle .L144 .L143: movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L143 ALIGN_4 .L144: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif SHUFPD_1 %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm7, %xmm9 #else xorpd %xmm7, %xmm8 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm9, %xmm8 #else addpd %xmm9, %xmm8 #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm9 subpd %xmm8, %xmm9 #else movapd -16 
* SIZE(AO), %xmm9 subpd %xmm8, %xmm9 #endif #ifndef CONJ SHUFPD_1 %xmm7, %xmm7 #endif #ifdef LN movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef RN movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef RT movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movddup %xmm9, %xmm8 unpckhpd %xmm9, %xmm9 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L130: movq M, I sarq $1, I # i = (m >> 2) jle .L199 ALIGN_4 .L110: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 prefetcht0 -3 * SIZE(CO1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L112 .L111: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -14 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 movapd -12 * SIZE(AO), %xmm0 movapd -10 * SIZE(AO), %xmm1 movapd -12 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -10 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 movapd -8 * SIZE(AO), %xmm0 movapd -6 * SIZE(AO), %xmm1 movapd -8 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -6 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 movapd -4 * SIZE(AO), %xmm0 movapd -2 * SIZE(AO), %xmm1 movapd -4 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -2 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 subq $-16 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jne .L111 ALIGN_4 .L112: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movapd POSINV, %xmm7 andq $3, %rax # if (k & 1) BRANCH jle .L114 .L113: movapd 
-16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -14 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L113 ALIGN_4 .L114: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif SHUFPD_1 %xmm9, %xmm9 SHUFPD_1 %xmm13, %xmm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm7, %xmm9 xorpd %xmm7, %xmm13 #else xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm9, %xmm8 subpd %xmm13, %xmm12 #else addpd %xmm9, %xmm8 addpd %xmm13, %xmm12 #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm9 movapd -14 * SIZE(B), %xmm13 subpd %xmm8, %xmm9 subpd %xmm12, %xmm13 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm13 subpd %xmm8, %xmm9 subpd %xmm12, %xmm13 #endif #ifndef CONJ SHUFPD_1 %xmm7, %xmm7 #endif #ifdef LN movddup -10 * SIZE(AO), %xmm0 movddup -9 * SIZE(AO), %xmm1 movddup -12 * SIZE(AO), %xmm2 movddup -11 * SIZE(AO), %xmm3 movddup -16 * SIZE(AO), %xmm4 movddup -15 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm12, %xmm13 movapd %xmm13, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm9 subpd %xmm12, %xmm9 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 movddup -14 * SIZE(AO), %xmm2 movddup -13 * SIZE(AO), %xmm3 movddup -10 * SIZE(AO), %xmm4 movddup -9 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 movapd %xmm9, %xmm8 pshufd $0x4e, %xmm9, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm13 subpd %xmm12, %xmm13 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm4, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm13 #endif #ifdef RN movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 #endif #ifdef RT movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 #endif #ifdef LN subq $4 * SIZE, CO1 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm13, 2 * SIZE(CO1) movhpd %xmm13, 3 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movapd %xmm13, -14 * SIZE(B) movddup %xmm9, %xmm8 unpckhpd %xmm9, %xmm9 movddup %xmm13, %xmm12 unpckhpd %xmm13, %xmm13 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm12, -12 * SIZE(BO) movapd %xmm13, -10 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) 
movapd %xmm13, -14 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L110 ALIGN_4 .L199: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 1 * COMPSIZE), B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq %r15, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_LN_2x2_penryn.S000066400000000000000000001047571313527062700227220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define KK %rdx #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCH_R (8 * 4 + 0) #define PREFETCHSIZE (8 * 21 + 6) #define PREFETCH prefetcht0 #define ADD1 addpd #define ADD2 addpd PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif movq OLD_M, M movq OLD_N, N movq OLD_K, K movq OLD_LDC, LDC movq OLD_OFFSET, KK subq $-16 * SIZE, A subq $-16 * SIZE, B salq $ZBASE_SHIFT, LDC movq KK, OFFSET negq KK #ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RT movq N, KK subq OFFSET, KK #endif movq N, J sarq $1, J NOBRANCH jle .L40 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif movq K, %rax salq $ZBASE_SHIFT + 1, %rax leaq (B, %rax), BB #ifdef LT movq OFFSET, KK #endif testq $1, M BRANCH jle .L20 ALIGN_4 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movaps -16 * SIZE(AO), %xmm0 movaps -16 * SIZE(BO), %xmm2 movaps -14 * SIZE(BO), %xmm3 pxor %xmm3, %xmm3 pxor %xmm5, %xmm5 movapd %xmm3, %xmm8 movapd %xmm3, %xmm9 movapd %xmm3, %xmm12 movapd %xmm3, %xmm13 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_4 .L22: ADD1 %xmm3, %xmm12 movaps -14 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm0, %xmm2 ADD2 %xmm5, %xmm13 mulpd %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 ADD2 %xmm7, %xmm9 mulpd %xmm0, %xmm5 movaps -14 * SIZE(AO), %xmm0 ADD1 %xmm3, %xmm12 movaps -10 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 ADD2 %xmm5, %xmm13 mulpd %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -8 * 
SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 ADD2 %xmm7, %xmm9 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 ADD1 %xmm3, %xmm12 movaps -6 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 ADD2 %xmm5, %xmm13 mulpd %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -4 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 ADD2 %xmm7, %xmm9 mulpd %xmm0, %xmm5 movaps -10 * SIZE(AO), %xmm0 ADD1 %xmm3, %xmm12 movaps -2 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 ADD2 %xmm5, %xmm13 mulpd %xmm0, %xmm7 subq $ -8 * SIZE, AO ADD1 %xmm2, %xmm8 movaps 0 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 ADD2 %xmm7, %xmm9 mulpd %xmm0, %xmm5 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: ADD1 %xmm3, %xmm12 movaps -14 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 ADD2 %xmm5, %xmm13 mulpd %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 ADD2 %xmm7, %xmm9 mulpd %xmm0, %xmm5 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif ADD1 %xmm3, %xmm12 pcmpeqb %xmm7, %xmm7 ADD2 %xmm5, %xmm13 psllq $63, %xmm7 #ifndef CONJ pshufd $0x40, %xmm7, %xmm0 shufps $0x04, %xmm7, %xmm7 pxor %xmm0, %xmm8 pxor %xmm0, %xmm12 #else #if defined(LN) || defined(LT) pshufd $0x40, %xmm7, %xmm0 #else pshufd $0x04, %xmm7, %xmm0 #endif shufps $0x40, %xmm7, %xmm7 pxor %xmm0, %xmm9 pxor %xmm0, %xmm13 #endif haddpd %xmm9, %xmm8 haddpd %xmm13, %xmm12 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 subpd %xmm8, %xmm9 subpd %xmm12, %xmm11 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm11 subpd %xmm8, %xmm9 subpd %xmm12, %xmm11 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 movddup -14 * SIZE(BO), %xmm2 movddup -13 * SIZE(BO), %xmm3 movddup -10 * SIZE(BO), %xmm4 movddup -9 * SIZE(BO), %xmm5 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 movapd %xmm9, %xmm8 pshufd $0x4e, %xmm9, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm11 subpd %xmm12, %xmm11 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm4, %xmm11 mulpd %xmm5, %xmm10 addpd %xmm10, %xmm11 #endif #ifdef RT movddup -10 * SIZE(BO), %xmm0 movddup -9 * SIZE(BO), %xmm1 movddup -12 * SIZE(BO), %xmm2 movddup -11 * SIZE(BO), %xmm3 movddup -16 * SIZE(BO), %xmm4 movddup -15 * SIZE(BO), %xmm5 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm10, %xmm11 movapd %xmm11, %xmm8 pshufd $0x4e, %xmm11, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm9 subpd %xmm12, %xmm9 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 addpd %xmm8, %xmm9 
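/* NOTE (annotation, RT path of the 1x2 remainder block): the backward     */
/* solve has just completed here; the solved complex elements are held in  */
/* %xmm9 (column written to CO1) and %xmm11 (column written to CO2), and   */
/* are stored to C and back to the packed panel immediately below.         */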
#endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO2) movhpd %xmm11, 1 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm11, -14 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) movapd %xmm11, -14 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L20: movq M, I sarq $1, I NOBRANCH jle .L39 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #else movq B, BO #endif prefetcht2 -16 * SIZE(BB) subq $-8 * SIZE, BB movaps -16 * SIZE(AO), %xmm0 pxor %xmm3, %xmm3 movaps -14 * SIZE(AO), %xmm1 pxor %xmm4, %xmm4 movaps -16 * SIZE(BO), %xmm2 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 #ifdef LN prefetcht0 -4 * SIZE(CO1) movapd %xmm4, %xmm8 movapd %xmm4, %xmm9 prefetcht0 -4 * SIZE(CO2) #else prefetcht0 3 * SIZE(CO1) movapd %xmm4, %xmm8 movapd %xmm4, %xmm9 prefetcht0 3 * SIZE(CO2) #endif movapd %xmm4, %xmm10 movapd %xmm4, %xmm11 movapd %xmm4, %xmm12 movapd %xmm4, %xmm13 movapd %xmm4, %xmm14 movapd %xmm4, %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax NOBRANCH jle .L15 ALIGN_3 .L12: ADD1 %xmm3, %xmm12 movaps -14 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps -10 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps -8 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -6 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps -6 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps -4 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -2 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps -2 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps 0 * 
SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 2 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps 2 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps 4 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 6 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps 6 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps 8 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 10 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps 10 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps 12 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 14 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps 14 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps 16 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 subq $-32 * SIZE, AO ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -16 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -14 * SIZE(AO), %xmm1 subq $-32 * SIZE, BO subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: ADD1 %xmm3, %xmm12 movaps -14 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: #if defined(LN) || defined(RT) movq KK, %rax subq $2, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif ADD1 %xmm3, %xmm12 pcmpeqb %xmm7, %xmm7 ADD1 %xmm4, %xmm14 psllq $63, %xmm7 ADD2 %xmm5, 
%xmm13 ADD2 %xmm6, %xmm15 #ifndef CONJ pshufd $0x40, %xmm7, %xmm0 shufps $0x04, %xmm7, %xmm7 pxor %xmm0, %xmm8 pxor %xmm0, %xmm10 pxor %xmm0, %xmm12 pxor %xmm0, %xmm14 #else #if defined(LN) || defined(LT) pshufd $0x40, %xmm7, %xmm0 #else pshufd $0x04, %xmm7, %xmm0 #endif shufps $0x40, %xmm7, %xmm7 pxor %xmm0, %xmm9 pxor %xmm0, %xmm11 pxor %xmm0, %xmm13 pxor %xmm0, %xmm15 #endif haddpd %xmm9, %xmm8 haddpd %xmm11, %xmm10 haddpd %xmm13, %xmm12 haddpd %xmm15, %xmm14 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 movapd -12 * SIZE(BO), %xmm13 movapd -10 * SIZE(BO), %xmm15 subpd %xmm8, %xmm9 subpd %xmm12, %xmm11 subpd %xmm10, %xmm13 subpd %xmm14, %xmm15 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm13 movapd -12 * SIZE(AO), %xmm11 movapd -10 * SIZE(AO), %xmm15 subpd %xmm8, %xmm9 subpd %xmm12, %xmm11 subpd %xmm10, %xmm13 subpd %xmm14, %xmm15 #endif #ifdef LN movddup -10 * SIZE(AO), %xmm0 movddup -9 * SIZE(AO), %xmm1 movddup -12 * SIZE(AO), %xmm2 movddup -11 * SIZE(AO), %xmm3 movddup -16 * SIZE(AO), %xmm4 movddup -15 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm13, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 mulpd %xmm0, %xmm15 mulpd %xmm1, %xmm14 addpd %xmm12, %xmm13 addpd %xmm14, %xmm15 movapd %xmm13, %xmm8 movapd %xmm15, %xmm10 pshufd $0x4e, %xmm13, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm12, %xmm9 subpd %xmm14, %xmm11 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 mulpd %xmm4, %xmm11 mulpd %xmm5, %xmm10 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 movddup -14 * SIZE(AO), %xmm2 movddup -13 * SIZE(AO), %xmm3 movddup -10 * SIZE(AO), %xmm4 movddup -9 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 movapd %xmm9, %xmm8 movapd %xmm11, %xmm10 pshufd $0x4e, %xmm9, %xmm12 pshufd $0x4e, %xmm11, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm13 subpd %xmm10, %xmm15 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 pshufd $0x4e, %xmm13, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm4, %xmm13 mulpd %xmm5, %xmm12 mulpd %xmm4, %xmm15 mulpd %xmm5, %xmm14 addpd %xmm12, %xmm13 addpd %xmm14, %xmm15 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 movddup -14 * SIZE(BO), %xmm2 movddup -13 * SIZE(BO), %xmm3 movddup -10 * SIZE(BO), %xmm4 movddup -9 * SIZE(BO), %xmm5 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 movapd %xmm9, %xmm8 movapd %xmm13, %xmm10 pshufd $0x4e, %xmm9, %xmm12 pshufd $0x4e, %xmm13, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm11 subpd %xmm10, %xmm15 subpd %xmm12, %xmm11 subpd %xmm14, %xmm15 pshufd $0x4e, %xmm11, %xmm10 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm10 
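/* Note on the complex-multiply idiom used throughout this solve phase:        */
/* pshufd $0x4e swaps the (real, imag) halves of the right-hand side, xorpd    */
/* with the sign mask in %xmm7 (built above via pcmpeqb/psllq $63) negates     */
/* one component, and the movddup'ed diagonal parts are applied with two       */
/* mulpd plus an add, giving roughly (xr*dr - xi*di, xr*di + xi*dr); the       */
/* CONJ and LN/LT/RN/RT cases select which component receives the sign flip.   */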
xorpd %xmm7, %xmm14 mulpd %xmm4, %xmm11 mulpd %xmm5, %xmm10 mulpd %xmm4, %xmm15 mulpd %xmm5, %xmm14 addpd %xmm10, %xmm11 addpd %xmm14, %xmm15 #endif #ifdef RT movddup -10 * SIZE(BO), %xmm0 movddup -9 * SIZE(BO), %xmm1 movddup -12 * SIZE(BO), %xmm2 movddup -11 * SIZE(BO), %xmm3 movddup -16 * SIZE(BO), %xmm4 movddup -15 * SIZE(BO), %xmm5 pshufd $0x4e, %xmm11, %xmm10 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm10 xorpd %xmm7, %xmm14 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 mulpd %xmm0, %xmm15 mulpd %xmm1, %xmm14 addpd %xmm10, %xmm11 addpd %xmm14, %xmm15 movapd %xmm11, %xmm8 movapd %xmm15, %xmm10 pshufd $0x4e, %xmm11, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm9 subpd %xmm10, %xmm13 subpd %xmm12, %xmm9 subpd %xmm14, %xmm13 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 mulpd %xmm4, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm13, 2 * SIZE(CO1) movhpd %xmm13, 3 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO2) movhpd %xmm11, 1 * SIZE(CO2) movsd %xmm15, 2 * SIZE(CO2) movhpd %xmm15, 3 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm11, -14 * SIZE(BO) movapd %xmm13, -12 * SIZE(BO) movapd %xmm15, -10 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) movapd %xmm13, -14 * SIZE(AO) movapd %xmm11, -12 * SIZE(AO) movapd %xmm15, -10 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- BRANCH jg .L11 ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif subq $1, J BRANCH jg .L01 ALIGN_4 .L40: testq $1, N BRANCH jle .L999 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif movq K, %rax salq $ZBASE_SHIFT, %rax leaq (B, %rax), BB #ifdef LT movq OFFSET, KK #endif testq $1, M BRANCH jle .L60 ALIGN_4 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 movaps -16 * SIZE(BO), %xmm2 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_4 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -14 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm8 ADD2 %xmm7, %xmm9 movaps -14 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm10 ADD2 %xmm7, %xmm11 movaps -12 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps 
-10 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm8 ADD2 %xmm7, %xmm9 movaps -10 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -8 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm10 ADD2 %xmm7, %xmm11 movaps -8 * SIZE(BO), %xmm2 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -14 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm8 ADD2 %xmm7, %xmm9 movaps -14 * SIZE(BO), %xmm2 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_4 .L68: #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ pshufd $0x40, %xmm7, %xmm0 shufps $0x04, %xmm7, %xmm7 pxor %xmm0, %xmm8 #else #if defined(LN) || defined(LT) pshufd $0x40, %xmm7, %xmm0 #else pshufd $0x04, %xmm7, %xmm0 #endif shufps $0x40, %xmm7, %xmm7 pxor %xmm0, %xmm9 #endif haddpd %xmm9, %xmm8 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm9 subpd %xmm8, %xmm9 #else movapd -16 * SIZE(AO), %xmm9 subpd %xmm8, %xmm9 #endif #ifdef LN movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef RT movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L60: movq M, I sarq $1, I # i = (m >> 2) NOBRANCH jle .L79 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #else movq B, BO #endif prefetcht2 -16 * SIZE(BB) subq $-4 * SIZE, BB movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 movaps -16 * SIZE(BO), %xmm2 prefetcht0 3 * SIZE(CO1) pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_4 .L52: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm8 movaps -14 * SIZE(BO), %xmm2 ADD1 %xmm4, 
%xmm12 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm13 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -6 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm12 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm13 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -2 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm8 movaps -10 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm12 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm13 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 2 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm8 movaps -8 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm12 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm13 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax BRANCH jg .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm8 movaps -14 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm12 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm13 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_4 .L58: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ pshufd $0x40, %xmm7, %xmm0 shufps $0x04, %xmm7, %xmm7 pxor %xmm0, %xmm8 pxor %xmm0, %xmm12 #else #if defined(LN) || defined(LT) pshufd $0x40, %xmm7, %xmm0 #else pshufd $0x04, %xmm7, %xmm0 #endif shufps $0x40, %xmm7, %xmm7 pxor %xmm0, %xmm9 pxor %xmm0, %xmm13 #endif haddpd %xmm9, %xmm8 haddpd %xmm13, %xmm12 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm13 subpd %xmm8, %xmm9 subpd %xmm12, %xmm13 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm13 subpd %xmm8, %xmm9 subpd %xmm12, %xmm13 #endif #ifdef LN movddup -10 * SIZE(AO), %xmm0 movddup -9 * SIZE(AO), %xmm1 movddup -12 * SIZE(AO), %xmm2 movddup -11 * SIZE(AO), %xmm3 movddup -16 * SIZE(AO), %xmm4 movddup -15 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm12, %xmm13 movapd %xmm13, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm9 subpd %xmm12, %xmm9 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 movddup -14 * SIZE(AO), %xmm2 movddup -13 * SIZE(AO), %xmm3 movddup -10 * SIZE(AO), %xmm4 movddup -9 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 movapd %xmm9, %xmm8 pshufd $0x4e, %xmm9, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm13 subpd %xmm12, %xmm13 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm4, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm13 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm0 movddup -15 * 
SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 #endif #ifdef RT movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 #endif #ifdef LN subq $4 * SIZE, CO1 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm13, 2 * SIZE(CO1) movhpd %xmm13, 3 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm13, -14 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) movapd %xmm13, -14 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L51 ALIGN_4 .L79: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S000066400000000000000000001235021313527062700222500ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %rbp #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define POSINV 0(%rsp) #define ALPHA_R 16(%rsp) #define ALPHA_I 32(%rsp) #define OFFSET 40(%rsp) #define KK 48(%rsp) #define KKK 56(%rsp) #define AORIG 64(%rsp) #define BORIG 72(%rsp) #define BUFFER 128(%rsp) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta #define PREFETCHSIZE (8 * 6 + 4) #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHNTA prefetchnta #define PREFETCHSIZE (8 * 6 + 4) #endif #define KERNEL1(xx) \ mulpd %xmm8, %xmm9 ;\ addpd %xmm9, %xmm0 ;\ movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm8, %xmm11 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ addpd %xmm11, %xmm1 ;\ movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm8, %xmm13 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ addpd %xmm13, %xmm2 ;\ movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm8, %xmm3 ;\ movapd 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 #define KERNEL2(xx) \ mulpd %xmm10, %xmm9 ;\ addpd %xmm9, %xmm4 ;\ movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm10, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm10, %xmm13 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ addpd %xmm13, %xmm6 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm10, %xmm7 ;\ movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 #define KERNEL3(xx) \ mulpd %xmm12, %xmm15 ;\ addpd %xmm15, %xmm0 ;\ movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm12, %xmm11 ;\ addpd %xmm11, %xmm1 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm12, %xmm13 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ addpd %xmm13, %xmm2 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm12, %xmm3 ;\ movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 #define KERNEL4(xx) \ mulpd %xmm14, %xmm15 ;\ addpd %xmm15, %xmm4 ;\ movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm14, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm14, %xmm13 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ addpd %xmm13, %xmm6 ;\ movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm14, %xmm7 ;\ movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 #define KERNEL5(xx) \ mulpd %xmm8, %xmm9 ;\ addpd %xmm9, %xmm0 ;\ movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm8, %xmm11 ;\ PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ addpd %xmm11, %xmm1 
;\ movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm8, %xmm13 ;\ mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ addpd %xmm13, %xmm2 ;\ movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm8, %xmm3 ;\ movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 #define KERNEL6(xx) \ mulpd %xmm10, %xmm9 ;\ addpd %xmm9, %xmm4 ;\ movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm10, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm10, %xmm13 ;\ mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ addpd %xmm13, %xmm6 ;\ movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm10, %xmm7 ;\ movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 #define KERNEL7(xx) \ mulpd %xmm12, %xmm15 ;\ addpd %xmm15, %xmm0 ;\ movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm12, %xmm11 ;\ addpd %xmm11, %xmm1 ;\ movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm12, %xmm13 ;\ mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ addpd %xmm13, %xmm2 ;\ movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm12, %xmm3 ;\ movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 #define KERNEL8(xx) \ mulpd %xmm14, %xmm15 ;\ addpd %xmm15, %xmm4 ;\ movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm14, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm14, %xmm13 ;\ mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ addpd %xmm13, %xmm6 ;\ movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm14, %xmm7 ;\ movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 #ifndef CONJ #define NN #else #if defined(LN) || defined(LT) #define CN #else #define NC #endif #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 movaps %xmm3, %xmm0 #else movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 #endif movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq OLD_M, M movq OLD_N, N pcmpeqb %xmm15, %xmm15 psllq $63, %xmm15 # Generate mask pxor %xmm2, %xmm2 movlpd %xmm2, 0 + POSINV movlpd %xmm15, 8 + POSINV movlpd %xmm4, OFFSET movlpd %xmm4, KK salq $ZBASE_SHIFT, LDC #ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $1, J # j = (n >> 2) jle .L100 ALIGN_4 .L01: #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L03 addq %rax, %rax ALIGN_4 .L02: PREFETCHNTA 56 * SIZE(B) 
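/* .L02 expands four complex elements of B per iteration into BUFFER: each     */
/* scalar (real or imaginary part) is stored twice, so the kernel can later    */
/* mulpd against packed A with a broadcast value and no extra shuffles.        */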
movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd 2 * SIZE(B), %xmm2 movlpd 3 * SIZE(B), %xmm3 movlpd 4 * SIZE(B), %xmm4 movlpd 5 * SIZE(B), %xmm5 movlpd 6 * SIZE(B), %xmm6 movlpd 7 * SIZE(B), %xmm7 movlpd %xmm0, 0 * SIZE(BO) movlpd %xmm0, 1 * SIZE(BO) movlpd %xmm1, 2 * SIZE(BO) movlpd %xmm1, 3 * SIZE(BO) movlpd %xmm2, 4 * SIZE(BO) movlpd %xmm2, 5 * SIZE(BO) movlpd %xmm3, 6 * SIZE(BO) movlpd %xmm3, 7 * SIZE(BO) movlpd %xmm4, 8 * SIZE(BO) movlpd %xmm4, 9 * SIZE(BO) movlpd %xmm5, 10 * SIZE(BO) movlpd %xmm5, 11 * SIZE(BO) movlpd %xmm6, 12 * SIZE(BO) movlpd %xmm6, 13 * SIZE(BO) movlpd %xmm7, 14 * SIZE(BO) movlpd %xmm7, 15 * SIZE(BO) subq $-16 * SIZE, BO addq $ 8 * SIZE, B decq %rax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L05 ALIGN_4 .L04: movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd 2 * SIZE(B), %xmm2 movlpd 3 * SIZE(B), %xmm3 movlpd %xmm0, 0 * SIZE(BO) movlpd %xmm0, 1 * SIZE(BO) movlpd %xmm1, 2 * SIZE(BO) movlpd %xmm1, 3 * SIZE(BO) movlpd %xmm2, 4 * SIZE(BO) movlpd %xmm2, 5 * SIZE(BO) movlpd %xmm3, 6 * SIZE(BO) movlpd %xmm3, 7 * SIZE(BO) addq $ 4 * SIZE, B addq $ 8 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L05: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif testq $1, M jle .L30 #ifdef LN movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax addq %rax, AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L42 .L41: movapd 0 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movapd 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm3 movapd 2 * SIZE(AO), %xmm8 movapd 8 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 10 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 12 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movapd 14 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm3 movapd 4 * SIZE(AO), %xmm8 movapd 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 18 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 20 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movapd 22 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm3 movapd 6 * SIZE(AO), %xmm8 movapd 24 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 26 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 28 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movapd 30 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm3 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L41 .L42: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movapd POSINV, %xmm15 andq $3, %rax # if (k & 1) BRANCH jle .L44 .L43: movapd 0 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movapd 6 * 
SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm3 addq $2 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L43 ALIGN_4 .L44: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm15, %xmm1 xorpd %xmm15, %xmm3 #else xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm2 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm1, %xmm0 subpd %xmm3, %xmm2 #else addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm1 movapd 2 * SIZE(B), %xmm3 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 #else movapd 0 * SIZE(AO), %xmm1 movapd 2 * SIZE(AO), %xmm3 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 #endif #ifndef CONJ SHUFPD_1 %xmm15, %xmm15 #endif #if defined(LN) || defined(LT) movlpd 0 * SIZE(AO), %xmm8 movhpd 0 * SIZE(AO), %xmm8 movlpd 1 * SIZE(AO), %xmm9 movhpd 1 * SIZE(AO), %xmm9 pshufd $0x4e, %xmm1, %xmm0 pshufd $0x4e, %xmm3, %xmm2 xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm2 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 mulpd %xmm8, %xmm3 mulpd %xmm9, %xmm2 addpd %xmm0, %xmm1 addpd %xmm2, %xmm3 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm8 movhpd 0 * SIZE(B), %xmm8 movlpd 1 * SIZE(B), %xmm9 movhpd 1 * SIZE(B), %xmm9 movlpd 2 * SIZE(B), %xmm10 movhpd 2 * SIZE(B), %xmm10 movlpd 3 * SIZE(B), %xmm11 movhpd 3 * SIZE(B), %xmm11 movlpd 6 * SIZE(B), %xmm12 movhpd 6 * SIZE(B), %xmm12 movlpd 7 * SIZE(B), %xmm13 movhpd 7 * SIZE(B), %xmm13 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 addpd %xmm0, %xmm1 movapd %xmm1, %xmm0 pshufd $0x4e, %xmm1, %xmm4 xorpd %xmm15, %xmm4 mulpd %xmm10, %xmm0 mulpd %xmm11, %xmm4 subpd %xmm0, %xmm3 subpd %xmm4, %xmm3 pshufd $0x4e, %xmm3, %xmm2 xorpd %xmm15, %xmm2 mulpd %xmm12, %xmm3 mulpd %xmm13, %xmm2 addpd %xmm2, %xmm3 #endif #ifdef RT movlpd 6 * SIZE(B), %xmm8 movhpd 6 * SIZE(B), %xmm8 movlpd 7 * SIZE(B), %xmm9 movhpd 7 * SIZE(B), %xmm9 movlpd 4 * SIZE(B), %xmm10 movhpd 4 * SIZE(B), %xmm10 movlpd 5 * SIZE(B), %xmm11 movhpd 5 * SIZE(B), %xmm11 movlpd 0 * SIZE(B), %xmm12 movhpd 0 * SIZE(B), %xmm12 movlpd 1 * SIZE(B), %xmm13 movhpd 1 * SIZE(B), %xmm13 pshufd $0x4e, %xmm3, %xmm2 xorpd %xmm15, %xmm2 mulpd %xmm8, %xmm3 mulpd %xmm9, %xmm2 addpd %xmm2, %xmm3 movapd %xmm3, %xmm0 pshufd $0x4e, %xmm3, %xmm4 xorpd %xmm15, %xmm4 mulpd %xmm10, %xmm0 mulpd %xmm11, %xmm4 subpd %xmm0, %xmm1 subpd %xmm4, %xmm1 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm12, %xmm1 mulpd %xmm13, %xmm0 addpd %xmm0, %xmm1 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif movsd %xmm1, 0 * SIZE(CO1) movhpd %xmm1, 1 * SIZE(CO1) movsd %xmm3, 0 * SIZE(CO2) movhpd %xmm3, 1 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) movlpd %xmm3, 4 * SIZE(BO) movlpd %xmm3, 5 * SIZE(BO) movhpd %xmm3, 6 * SIZE(BO) movhpd %xmm3, 7 * SIZE(BO) #else movapd %xmm1, 0 * SIZE(AO) movapd %xmm3, 2 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq 
(AO, %rax, 1), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: movq M, I sarq $1, I # i = (m >> 2) jle .L99 ALIGN_4 .L10: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 2 * SIZE(AO), %xmm10 pxor %xmm1, %xmm1 movapd 4 * SIZE(AO), %xmm12 pxor %xmm2, %xmm2 movapd 6 * SIZE(AO), %xmm14 pxor %xmm3, %xmm3 movapd 0 * SIZE(BO), %xmm9 pxor %xmm4, %xmm4 movapd 2 * SIZE(BO), %xmm11 pxor %xmm5, %xmm5 movapd 4 * SIZE(BO), %xmm13 movapd 8 * SIZE(BO), %xmm15 #ifdef LN PREFETCHW -4 * SIZE(CO1) pxor %xmm6, %xmm6 PREFETCHW -4 * SIZE(CO2) pxor %xmm7, %xmm7 #else PREFETCHW 4 * SIZE(CO1) pxor %xmm6, %xmm6 PREFETCHW 4 * SIZE(CO2) pxor %xmm7, %xmm7 #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-8, %rax salq $4, %rax je .L15 .L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) cmpq $64 * 2, %rax jle .L12 KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) KERNEL1(16 * 3) KERNEL2(16 * 3) KERNEL3(16 * 3) KERNEL4(16 * 3) KERNEL5(16 * 3) KERNEL6(16 * 3) KERNEL7(16 * 3) KERNEL8(16 * 3) cmpq $64 * 4, %rax jle .L12 KERNEL1(16 * 4) KERNEL2(16 * 4) KERNEL3(16 * 4) KERNEL4(16 * 4) KERNEL5(16 * 4) KERNEL6(16 * 4) KERNEL7(16 * 4) KERNEL8(16 * 4) KERNEL1(16 * 5) KERNEL2(16 * 5) KERNEL3(16 * 5) KERNEL4(16 * 5) KERNEL5(16 * 5) KERNEL6(16 * 5) KERNEL7(16 * 5) KERNEL8(16 * 5) cmpq $64 * 6, %rax jle .L12 KERNEL1(16 * 6) KERNEL2(16 * 6) KERNEL3(16 * 6) KERNEL4(16 * 6) KERNEL5(16 * 6) KERNEL6(16 * 6) KERNEL7(16 * 6) KERNEL8(16 * 6) KERNEL1(16 * 7) KERNEL2(16 * 7) KERNEL3(16 * 7) KERNEL4(16 * 7) KERNEL5(16 * 7) KERNEL6(16 * 7) KERNEL7(16 * 7) KERNEL8(16 * 7) addq $16 * 8 * SIZE, AO addq $32 * 8 * SIZE, BO subq $64 * 8, %rax jg .L1X .L12: leaq (AO, %rax, 2), AO # * 16 leaq (BO, %rax, 4), BO # * 64 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movapd POSINV, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L19 ALIGN_4 .L16: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm9, %xmm2 movapd 0 * SIZE(BO), %xmm9 addpd %xmm8, %xmm3 movapd 4 * SIZE(AO), %xmm8 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm4 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm5 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 mulpd 6 * SIZE(BO), %xmm10 addpd %xmm9, %xmm6 movapd 8 * SIZE(BO), %xmm9 addpd %xmm10, %xmm7 movapd 6 * SIZE(AO), %xmm10 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L16 ALIGN_4 .L19: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO 
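	/* For LN/RT the A, B and BUFFER pointers are repositioned here to the  */
	/* block just accumulated, so the substitution below can read the       */
	/* triangular factors and store the solved 2x2 block in place.          */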
#endif SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm3, %xmm3 SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm15, %xmm1 xorpd %xmm15, %xmm3 xorpd %xmm15, %xmm5 xorpd %xmm15, %xmm7 #else xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm2 xorpd %xmm15, %xmm4 xorpd %xmm15, %xmm6 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm1, %xmm0 subpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 #else addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm1 movapd 2 * SIZE(B), %xmm3 movapd 4 * SIZE(B), %xmm5 movapd 6 * SIZE(B), %xmm7 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd 0 * SIZE(AO), %xmm1 movapd 2 * SIZE(AO), %xmm5 movapd 4 * SIZE(AO), %xmm3 movapd 6 * SIZE(AO), %xmm7 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #endif #ifndef CONJ SHUFPD_1 %xmm15, %xmm15 #endif #ifdef LN movlpd 6 * SIZE(AO), %xmm8 movhpd 6 * SIZE(AO), %xmm8 movlpd 7 * SIZE(AO), %xmm9 movhpd 7 * SIZE(AO), %xmm9 movlpd 4 * SIZE(AO), %xmm10 movhpd 4 * SIZE(AO), %xmm10 movlpd 5 * SIZE(AO), %xmm11 movhpd 5 * SIZE(AO), %xmm11 movlpd 0 * SIZE(AO), %xmm12 movhpd 0 * SIZE(AO), %xmm12 movlpd 1 * SIZE(AO), %xmm13 movhpd 1 * SIZE(AO), %xmm13 pshufd $0x4e, %xmm5, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm15, %xmm4 xorpd %xmm15, %xmm6 mulpd %xmm8, %xmm5 mulpd %xmm9, %xmm4 mulpd %xmm8, %xmm7 mulpd %xmm9, %xmm6 addpd %xmm4, %xmm5 addpd %xmm6, %xmm7 movapd %xmm5, %xmm0 movapd %xmm7, %xmm2 pshufd $0x4e, %xmm5, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm15, %xmm4 xorpd %xmm15, %xmm6 mulpd %xmm10, %xmm0 mulpd %xmm10, %xmm2 mulpd %xmm11, %xmm4 mulpd %xmm11, %xmm6 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 subpd %xmm4, %xmm1 subpd %xmm6, %xmm3 pshufd $0x4e, %xmm1, %xmm0 pshufd $0x4e, %xmm3, %xmm2 xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm2 mulpd %xmm12, %xmm1 mulpd %xmm13, %xmm0 mulpd %xmm12, %xmm3 mulpd %xmm13, %xmm2 addpd %xmm0, %xmm1 addpd %xmm2, %xmm3 #endif #ifdef LT movlpd 0 * SIZE(AO), %xmm8 movhpd 0 * SIZE(AO), %xmm8 movlpd 1 * SIZE(AO), %xmm9 movhpd 1 * SIZE(AO), %xmm9 movlpd 2 * SIZE(AO), %xmm10 movhpd 2 * SIZE(AO), %xmm10 movlpd 3 * SIZE(AO), %xmm11 movhpd 3 * SIZE(AO), %xmm11 movlpd 6 * SIZE(AO), %xmm12 movhpd 6 * SIZE(AO), %xmm12 movlpd 7 * SIZE(AO), %xmm13 movhpd 7 * SIZE(AO), %xmm13 pshufd $0x4e, %xmm1, %xmm0 pshufd $0x4e, %xmm3, %xmm2 xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm2 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 mulpd %xmm8, %xmm3 mulpd %xmm9, %xmm2 addpd %xmm0, %xmm1 addpd %xmm2, %xmm3 movapd %xmm1, %xmm0 movapd %xmm3, %xmm2 pshufd $0x4e, %xmm1, %xmm4 pshufd $0x4e, %xmm3, %xmm6 xorpd %xmm15, %xmm4 xorpd %xmm15, %xmm6 mulpd %xmm10, %xmm0 mulpd %xmm10, %xmm2 mulpd %xmm11, %xmm4 mulpd %xmm11, %xmm6 subpd %xmm0, %xmm5 subpd %xmm2, %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 pshufd $0x4e, %xmm5, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm15, %xmm4 xorpd %xmm15, %xmm6 mulpd %xmm12, %xmm5 mulpd %xmm13, %xmm4 mulpd %xmm12, %xmm7 mulpd %xmm13, %xmm6 addpd %xmm4, %xmm5 addpd %xmm6, %xmm7 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm8 movhpd 0 * SIZE(B), %xmm8 movlpd 1 * SIZE(B), %xmm9 movhpd 1 * SIZE(B), %xmm9 movlpd 2 * SIZE(B), %xmm10 movhpd 2 * SIZE(B), %xmm10 movlpd 3 * SIZE(B), %xmm11 movhpd 3 * SIZE(B), %xmm11 movlpd 6 * SIZE(B), %xmm12 movhpd 6 * SIZE(B), %xmm12 movlpd 7 * SIZE(B), %xmm13 
movhpd 7 * SIZE(B), %xmm13 pshufd $0x4e, %xmm1, %xmm0 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm4 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 mulpd %xmm8, %xmm5 mulpd %xmm9, %xmm4 addpd %xmm0, %xmm1 addpd %xmm4, %xmm5 movapd %xmm1, %xmm0 movapd %xmm5, %xmm2 pshufd $0x4e, %xmm1, %xmm4 pshufd $0x4e, %xmm5, %xmm6 xorpd %xmm15, %xmm4 xorpd %xmm15, %xmm6 mulpd %xmm10, %xmm0 mulpd %xmm10, %xmm2 mulpd %xmm11, %xmm4 mulpd %xmm11, %xmm6 subpd %xmm0, %xmm3 subpd %xmm2, %xmm7 subpd %xmm4, %xmm3 subpd %xmm6, %xmm7 pshufd $0x4e, %xmm3, %xmm2 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm15, %xmm2 xorpd %xmm15, %xmm6 mulpd %xmm12, %xmm3 mulpd %xmm13, %xmm2 mulpd %xmm12, %xmm7 mulpd %xmm13, %xmm6 addpd %xmm2, %xmm3 addpd %xmm6, %xmm7 #endif #ifdef RT movlpd 6 * SIZE(B), %xmm8 movhpd 6 * SIZE(B), %xmm8 movlpd 7 * SIZE(B), %xmm9 movhpd 7 * SIZE(B), %xmm9 movlpd 4 * SIZE(B), %xmm10 movhpd 4 * SIZE(B), %xmm10 movlpd 5 * SIZE(B), %xmm11 movhpd 5 * SIZE(B), %xmm11 movlpd 0 * SIZE(B), %xmm12 movhpd 0 * SIZE(B), %xmm12 movlpd 1 * SIZE(B), %xmm13 movhpd 1 * SIZE(B), %xmm13 pshufd $0x4e, %xmm3, %xmm2 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm15, %xmm2 xorpd %xmm15, %xmm6 mulpd %xmm8, %xmm3 mulpd %xmm9, %xmm2 mulpd %xmm8, %xmm7 mulpd %xmm9, %xmm6 addpd %xmm2, %xmm3 addpd %xmm6, %xmm7 movapd %xmm3, %xmm0 movapd %xmm7, %xmm2 pshufd $0x4e, %xmm3, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm15, %xmm4 xorpd %xmm15, %xmm6 mulpd %xmm10, %xmm0 mulpd %xmm10, %xmm2 mulpd %xmm11, %xmm4 mulpd %xmm11, %xmm6 subpd %xmm0, %xmm1 subpd %xmm2, %xmm5 subpd %xmm4, %xmm1 subpd %xmm6, %xmm5 pshufd $0x4e, %xmm1, %xmm0 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm4 mulpd %xmm12, %xmm1 mulpd %xmm13, %xmm0 mulpd %xmm12, %xmm5 mulpd %xmm13, %xmm4 addpd %xmm0, %xmm1 addpd %xmm4, %xmm5 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif movsd %xmm1, 0 * SIZE(CO1) movhpd %xmm1, 1 * SIZE(CO1) movsd %xmm5, 2 * SIZE(CO1) movhpd %xmm5, 3 * SIZE(CO1) movsd %xmm3, 0 * SIZE(CO2) movhpd %xmm3, 1 * SIZE(CO2) movsd %xmm7, 2 * SIZE(CO2) movhpd %xmm7, 3 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movapd %xmm5, 4 * SIZE(B) movapd %xmm7, 6 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) movlpd %xmm3, 4 * SIZE(BO) movlpd %xmm3, 5 * SIZE(BO) movhpd %xmm3, 6 * SIZE(BO) movhpd %xmm3, 7 * SIZE(BO) movlpd %xmm5, 8 * SIZE(BO) movlpd %xmm5, 9 * SIZE(BO) movhpd %xmm5, 10 * SIZE(BO) movhpd %xmm5, 11 * SIZE(BO) movlpd %xmm7, 12 * SIZE(BO) movlpd %xmm7, 13 * SIZE(BO) movhpd %xmm7, 14 * SIZE(BO) movhpd %xmm7, 15 * SIZE(BO) #else movapd %xmm1, 0 * SIZE(AO) movapd %xmm5, 2 * SIZE(AO) movapd %xmm3, 4 * SIZE(AO) movapd %xmm7, 6 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L10 ALIGN_4 .L99: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 2 * COMPSIZE), B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif decq J # j -- jg .L01 .L100: testq $1, N jle .L999 .L101: #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ 
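/* Same expansion as .L02 above, but for the single remaining column of B when */
/* N is odd: each real and imaginary scalar is duplicated into BUFFER.         */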
leaq BUFFER, BO #ifdef RT movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L103 ALIGN_4 .L102: movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd 2 * SIZE(B), %xmm2 movlpd 3 * SIZE(B), %xmm3 movlpd 4 * SIZE(B), %xmm4 movlpd 5 * SIZE(B), %xmm5 movlpd 6 * SIZE(B), %xmm6 movlpd 7 * SIZE(B), %xmm7 movlpd %xmm0, 0 * SIZE(BO) movlpd %xmm0, 1 * SIZE(BO) movlpd %xmm1, 2 * SIZE(BO) movlpd %xmm1, 3 * SIZE(BO) movlpd %xmm2, 4 * SIZE(BO) movlpd %xmm2, 5 * SIZE(BO) movlpd %xmm3, 6 * SIZE(BO) movlpd %xmm3, 7 * SIZE(BO) movlpd %xmm4, 8 * SIZE(BO) movlpd %xmm4, 9 * SIZE(BO) movlpd %xmm5, 10 * SIZE(BO) movlpd %xmm5, 11 * SIZE(BO) movlpd %xmm6, 12 * SIZE(BO) movlpd %xmm6, 13 * SIZE(BO) movlpd %xmm7, 14 * SIZE(BO) movlpd %xmm7, 15 * SIZE(BO) subq $-16 * SIZE, BO addq $ 8 * SIZE, B decq %rax jne .L102 ALIGN_4 .L103: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L105 ALIGN_4 .L104: movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd %xmm0, 0 * SIZE(BO) movlpd %xmm0, 1 * SIZE(BO) movlpd %xmm1, 2 * SIZE(BO) movlpd %xmm1, 3 * SIZE(BO) addq $4 * SIZE, BO addq $2 * SIZE, B decq %rax jne .L104 ALIGN_4 .L105: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif testq $1, M jle .L130 ALIGN_4 .L140: #ifdef LN movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L142 .L141: movapd 0 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 2 * SIZE(AO), %xmm8 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm8, %xmm3 movapd 4 * SIZE(AO), %xmm8 movapd 8 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 10 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 6 * SIZE(AO), %xmm8 movapd 12 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 mulpd 14 * SIZE(BO), %xmm8 addpd %xmm8, %xmm3 addq $8 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L141 .L142: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 movapd POSINV, %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH jle .L144 .L143: movapd 0 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L143 ALIGN_4 .L144: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif SHUFPD_1 %xmm1, %xmm1 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || 
defined(NC) || defined(TR) || defined(TC) xorpd %xmm15, %xmm1 #else xorpd %xmm15, %xmm0 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm1, %xmm0 #else addpd %xmm1, %xmm0 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm1 subpd %xmm0, %xmm1 #else movapd 0 * SIZE(AO), %xmm1 subpd %xmm0, %xmm1 #endif #ifndef CONJ SHUFPD_1 %xmm15, %xmm15 #endif #ifdef LN movlpd 0 * SIZE(AO), %xmm8 movhpd 0 * SIZE(AO), %xmm8 movlpd 1 * SIZE(AO), %xmm9 movhpd 1 * SIZE(AO), %xmm9 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 addpd %xmm0, %xmm1 #endif #ifdef LT movlpd 0 * SIZE(AO), %xmm8 movhpd 0 * SIZE(AO), %xmm8 movlpd 1 * SIZE(AO), %xmm9 movhpd 1 * SIZE(AO), %xmm9 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 addpd %xmm0, %xmm1 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm8 movhpd 0 * SIZE(B), %xmm8 movlpd 1 * SIZE(B), %xmm9 movhpd 1 * SIZE(B), %xmm9 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 addpd %xmm0, %xmm1 #endif #ifdef RT movlpd 0 * SIZE(B), %xmm8 movhpd 0 * SIZE(B), %xmm8 movlpd 1 * SIZE(B), %xmm9 movhpd 1 * SIZE(B), %xmm9 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 addpd %xmm0, %xmm1 #endif #ifdef LN subq $2 * SIZE, CO1 #endif movsd %xmm1, 0 * SIZE(CO1) movhpd %xmm1, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) #else movapd %xmm1, 0 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L130: movq M, I sarq $1, I # i = (m >> 2) jle .L199 ALIGN_4 .L110: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 #ifdef LN PREFETCHW -4 * SIZE(CO1) #else PREFETCHW 4 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L112 .L111: movapd 0 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 2 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm5 movapd 4 * SIZE(AO), %xmm8 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 6 * SIZE(AO), %xmm8 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm8, %xmm5 movapd 8 * SIZE(AO), %xmm8 movapd 8 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 10 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 10 * SIZE(AO), %xmm8 movapd 8 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 mulpd 10 * SIZE(BO), %xmm8 addpd %xmm8, %xmm5 movapd 12 * SIZE(AO), %xmm8 movapd 12 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, 
%xmm0 mulpd 14 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 14 * SIZE(AO), %xmm8 movapd 12 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 mulpd 14 * SIZE(BO), %xmm8 addpd %xmm8, %xmm5 addq $16 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L111 ALIGN_4 .L112: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movapd POSINV, %xmm15 andq $3, %rax # if (k & 1) BRANCH jle .L114 .L113: movapd 0 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 2 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm5 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L113 ALIGN_4 .L114: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm5, %xmm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm15, %xmm1 xorpd %xmm15, %xmm5 #else xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm4 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm1, %xmm0 subpd %xmm5, %xmm4 #else addpd %xmm1, %xmm0 addpd %xmm5, %xmm4 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm1 movapd 2 * SIZE(B), %xmm5 subpd %xmm0, %xmm1 subpd %xmm4, %xmm5 #else movapd 0 * SIZE(AO), %xmm1 movapd 2 * SIZE(AO), %xmm5 subpd %xmm0, %xmm1 subpd %xmm4, %xmm5 #endif #ifndef CONJ SHUFPD_1 %xmm15, %xmm15 #endif #ifdef LN movlpd 6 * SIZE(AO), %xmm8 movhpd 6 * SIZE(AO), %xmm8 movlpd 7 * SIZE(AO), %xmm9 movhpd 7 * SIZE(AO), %xmm9 movlpd 4 * SIZE(AO), %xmm10 movhpd 4 * SIZE(AO), %xmm10 movlpd 5 * SIZE(AO), %xmm11 movhpd 5 * SIZE(AO), %xmm11 movlpd 0 * SIZE(AO), %xmm12 movhpd 0 * SIZE(AO), %xmm12 movlpd 1 * SIZE(AO), %xmm13 movhpd 1 * SIZE(AO), %xmm13 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm15, %xmm4 mulpd %xmm8, %xmm5 mulpd %xmm9, %xmm4 addpd %xmm4, %xmm5 movapd %xmm5, %xmm0 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm15, %xmm4 mulpd %xmm10, %xmm0 mulpd %xmm11, %xmm4 subpd %xmm0, %xmm1 subpd %xmm4, %xmm1 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm12, %xmm1 mulpd %xmm13, %xmm0 addpd %xmm0, %xmm1 #endif #ifdef LT movlpd 0 * SIZE(AO), %xmm8 movhpd 0 * SIZE(AO), %xmm8 movlpd 1 * SIZE(AO), %xmm9 movhpd 1 * SIZE(AO), %xmm9 movlpd 2 * SIZE(AO), %xmm10 movhpd 2 * SIZE(AO), %xmm10 movlpd 3 * SIZE(AO), %xmm11 movhpd 3 * SIZE(AO), %xmm11 movlpd 6 * SIZE(AO), %xmm12 movhpd 6 * SIZE(AO), %xmm12 movlpd 7 * SIZE(AO), %xmm13 movhpd 7 * SIZE(AO), %xmm13 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 addpd %xmm0, %xmm1 movapd %xmm1, %xmm0 pshufd $0x4e, %xmm1, %xmm4 xorpd %xmm15, %xmm4 mulpd %xmm10, %xmm0 mulpd %xmm11, %xmm4 subpd %xmm0, %xmm5 subpd %xmm4, %xmm5 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm15, %xmm4 mulpd %xmm12, %xmm5 mulpd %xmm13, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm8 movhpd 0 * SIZE(B), %xmm8 movlpd 1 * SIZE(B), %xmm9 movhpd 1 * SIZE(B), %xmm9 pshufd $0x4e, %xmm1, %xmm0 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm4 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 mulpd %xmm8, %xmm5 mulpd %xmm9, %xmm4 addpd %xmm0, %xmm1 addpd %xmm4, %xmm5 #endif #ifdef RT movlpd 0 * SIZE(B), 
%xmm8 movhpd 0 * SIZE(B), %xmm8 movlpd 1 * SIZE(B), %xmm9 movhpd 1 * SIZE(B), %xmm9 pshufd $0x4e, %xmm1, %xmm0 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm4 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 mulpd %xmm8, %xmm5 mulpd %xmm9, %xmm4 addpd %xmm0, %xmm1 addpd %xmm4, %xmm5 #endif #ifdef LN subq $4 * SIZE, CO1 #endif movsd %xmm1, 0 * SIZE(CO1) movhpd %xmm1, 1 * SIZE(CO1) movsd %xmm5, 2 * SIZE(CO1) movhpd %xmm5, 3 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movapd %xmm5, 2 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) movlpd %xmm5, 4 * SIZE(BO) movlpd %xmm5, 5 * SIZE(BO) movhpd %xmm5, 6 * SIZE(BO) movhpd %xmm5, 7 * SIZE(BO) #else movapd %xmm1, 0 * SIZE(AO) movapd %xmm5, 2 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L110 ALIGN_4 .L199: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 1 * COMPSIZE), B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L999: movq %rbx, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_LN_2x2_sse3.S000066400000000000000000001301601313527062700222470ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %r13 #define BO %r14 #define CO1 %r15 #define CO2 %rbx #define KK %rbp #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define KKK 56(%rsp) #define AORIG 64(%rsp) #else #define STACKSIZE 256 #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define KKK 232(%rsp) #define AORIG 240(%rsp) #endif #define PREFETCH prefetcht1 #define PREFETCHSIZE (16 * 12 + 3) #define PREFETCH_R (4 * 4 + 0) #ifndef CONJ #define ADD1 addpd #define ADD2 addpd #else #define ADD1 subpd #define ADD2 addpd #endif #define KERNEL1(address) \ mulpd %xmm8, %xmm9;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ ADD1 %xmm9, %xmm0;\ movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD2 %xmm9, %xmm1;\ movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm2;\ movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ ADD2 %xmm9, %xmm3;\ movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL2(address) \ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm4;\ movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD2 %xmm9, %xmm5;\ movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm6;\ movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ ADD2 %xmm9, %xmm7;\ movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL3(address) \ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm0;\ movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD2 %xmm9, %xmm1;\ movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm2;\ movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ ADD2 %xmm9, %xmm3;\ movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL4(address) \ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm4;\ movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD2 %xmm9, %xmm5;\ movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm6;\ movddup 7 * SIZE + (address) * 2 * 
SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ ADD2 %xmm9, %xmm7;\ movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL5(address) \ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm0;\ movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD2 %xmm11, %xmm1;\ movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm2;\ movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ ADD2 %xmm11, %xmm3;\ movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL6(address) \ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm4;\ movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD2 %xmm11, %xmm5;\ movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm6;\ movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ ADD2 %xmm11, %xmm7;\ movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL7(address) \ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm0;\ movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD2 %xmm11, %xmm1;\ movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm2;\ movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ ADD2 %xmm11, %xmm3;\ movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL8(address) \ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm4;\ movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD2 %xmm11, %xmm5;\ movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm6;\ movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ ADD2 %xmm11, %xmm7;\ movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL9(address) \ mulpd %xmm12, %xmm13;\ PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ ADD1 %xmm13, %xmm0;\ movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD2 %xmm13, %xmm1;\ movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm2;\ movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ ADD2 %xmm13, %xmm3;\ movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL10(address) \ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm4;\ movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD2 %xmm13, %xmm5;\ movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm6;\ movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ ADD2 %xmm13, %xmm7;\ movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL11(address) \ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm0;\ movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD2 %xmm13, %xmm1;\ movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm2;\ movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ ADD2 %xmm13, %xmm3;\ movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 
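/* Descriptive note on the KERNEL1..KERNEL16 macros: each pair (KERNELn, KERNELn+1)
   handles one k-iteration of the 2x2 double-complex micro-tile, so the sixteen
   macros together unroll eight iterations.  %xmm8/%xmm10/%xmm12/%xmm14 carry one
   packed (real, imag) element of A, movddup broadcasts the individual real and
   imaginary scalars of B, and the products are accumulated into %xmm0..%xmm7
   through ADD1/ADD2, which are defined above as addpd/addpd (or subpd/addpd when
   CONJ is set) so that the same macro bodies serve both the plain and the
   conjugated product.  The real and imaginary halves are recombined after the
   loop with the SHUFPD_1/addsubpd sequence. */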
#define KERNEL12(address) \ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm4;\ movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD2 %xmm13, %xmm5;\ movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm6;\ movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ ADD2 %xmm13, %xmm7;\ movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL13(address) \ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm0;\ movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD2 %xmm15, %xmm1;\ movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm2;\ movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ ADD2 %xmm15, %xmm3;\ movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL14(address) \ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm4;\ movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD2 %xmm15, %xmm5;\ movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm6;\ movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ ADD2 %xmm15, %xmm7;\ movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL15(address) \ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm0;\ movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD2 %xmm15, %xmm1;\ movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm2;\ movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ ADD2 %xmm15, %xmm3;\ movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL16(address) \ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm4;\ movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD2 %xmm15, %xmm5;\ movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm6;\ movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ ADD2 %xmm15, %xmm7;\ movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #endif movq OLD_LDC, LDC movq OLD_OFFSET, KK movq KK, OFFSET salq $ZBASE_SHIFT, LDC #ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, KK subq OFFSET, KK #endif movq N, J sarq $1, J # j = (n >> 2) jle .L100 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, 
CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif #ifdef LT movq OFFSET, KK #endif testq $1, M jle .L30 #ifdef LN movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L42 .L41: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD2 %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD2 %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 6 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD2 %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 16 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm3 movddup 24 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm0 movddup 17 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD2 %xmm9, %xmm1 movddup 18 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm2 movddup 19 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 10 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm3 movddup 20 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm0 movddup 21 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD2 %xmm9, %xmm1 movddup 22 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm2 movddup 23 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 12 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm3 movddup 32 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm0 movddup 25 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD2 %xmm11, %xmm1 movddup 26 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm2 movddup 27 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm3 movddup 28 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm0 movddup 29 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD2 %xmm11, %xmm1 movddup 30 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm2 movddup 31 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 24 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm3 movddup 40 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L41 .L42: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH jle .L44 .L43: mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 
mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L43 ALIGN_4 .L44: SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm3, %xmm3 #ifndef CONJ addsubpd %xmm1, %xmm0 addsubpd %xmm3, %xmm2 #else addsubpd %xmm0, %xmm1 addsubpd %xmm2, %xmm3 #endif #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm8 movapd 2 * SIZE(BO), %xmm9 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm9 #endif #if (defined(LN) || defined(LT)) && !defined(CONJ) subpd %xmm0, %xmm8 subpd %xmm2, %xmm9 #elif (defined(LN) || defined(LT)) && defined(CONJ) subpd %xmm1, %xmm8 subpd %xmm3, %xmm9 #elif (defined(RN) || defined(RT)) && !defined(CONJ) subpd %xmm0, %xmm8 subpd %xmm2, %xmm9 #else addsubpd %xmm1, %xmm8 addsubpd %xmm3, %xmm9 #endif #ifdef CONJ pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #endif #ifdef LN movddup 0 * SIZE(AO), %xmm4 movddup 1 * SIZE(AO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm5 #endif movapd %xmm8, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm4, %xmm8 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm12 mulpd %xmm5, %xmm13 addsubpd %xmm12, %xmm8 addsubpd %xmm13, %xmm9 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 movddup 1 * SIZE(AO), %xmm1 #ifdef CONJ xorpd %xmm7, %xmm1 #endif movapd %xmm8, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm12 mulpd %xmm1, %xmm13 addsubpd %xmm12, %xmm8 addsubpd %xmm13, %xmm9 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 movddup 1 * SIZE(BO), %xmm1 movddup 2 * SIZE(BO), %xmm2 movddup 3 * SIZE(BO), %xmm3 movddup 6 * SIZE(BO), %xmm4 movddup 7 * SIZE(BO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 xorpd %xmm7, %xmm5 #endif movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm0, %xmm8 mulpd %xmm1, %xmm12 addsubpd %xmm12, %xmm8 movapd %xmm8, %xmm12 movapd %xmm8, %xmm13 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm2, %xmm12 mulpd %xmm3, %xmm13 addsubpd %xmm13, %xmm12 subpd %xmm12, %xmm9 movapd %xmm9, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm12 addsubpd %xmm12, %xmm9 #endif #ifdef RT movddup 6 * SIZE(BO), %xmm0 movddup 7 * SIZE(BO), %xmm1 movddup 4 * SIZE(BO), %xmm2 movddup 5 * SIZE(BO), %xmm3 movddup 0 * SIZE(BO), %xmm4 movddup 1 * SIZE(BO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 xorpd %xmm7, %xmm5 #endif movapd %xmm9, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm12 addsubpd %xmm12, %xmm9 movapd %xmm9, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm2, %xmm12 mulpd %xmm3, %xmm13 addsubpd %xmm13, %xmm12 subpd %xmm12, %xmm8 movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm4, %xmm8 mulpd %xmm5, %xmm12 addsubpd %xmm12, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhpd %xmm9, 1 * SIZE(CO2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhpd %xmm9, 1 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm9, 2 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax 
subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: movq M, I sarq $1, I # i = (m >> 2) jle .L99 ALIGN_4 .L10: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movapd 16 * SIZE(AO), %xmm12 movddup 16 * SIZE(BO), %xmm13 movapd 24 * SIZE(AO), %xmm14 movddup 24 * SIZE(BO), %xmm15 #ifdef LN prefetchnta -4 * SIZE(CO1) prefetchnta -4 * SIZE(CO2) #else prefetchnta 4 * SIZE(CO1) prefetchnta 4 * SIZE(CO2) #endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-8, %rax salq $4, %rax je .L12 .L1X: KERNEL1 (16 * 0) KERNEL2 (16 * 0) KERNEL3 (16 * 0) KERNEL4 (16 * 0) KERNEL5 (16 * 0) KERNEL6 (16 * 0) KERNEL7 (16 * 0) KERNEL8 (16 * 0) KERNEL9 (16 * 0) KERNEL10(16 * 0) KERNEL11(16 * 0) KERNEL12(16 * 0) KERNEL13(16 * 0) KERNEL14(16 * 0) KERNEL15(16 * 0) KERNEL16(16 * 0) cmpq $128 * 1, %rax NOBRANCH jle .L11 KERNEL1 (16 * 1) KERNEL2 (16 * 1) KERNEL3 (16 * 1) KERNEL4 (16 * 1) KERNEL5 (16 * 1) KERNEL6 (16 * 1) KERNEL7 (16 * 1) KERNEL8 (16 * 1) KERNEL9 (16 * 1) KERNEL10(16 * 1) KERNEL11(16 * 1) KERNEL12(16 * 1) KERNEL13(16 * 1) KERNEL14(16 * 1) KERNEL15(16 * 1) KERNEL16(16 * 1) cmpq $128 * 2, %rax NOBRANCH jle .L11 KERNEL1 (16 * 2) KERNEL2 (16 * 2) KERNEL3 (16 * 2) KERNEL4 (16 * 2) KERNEL5 (16 * 2) KERNEL6 (16 * 2) KERNEL7 (16 * 2) KERNEL8 (16 * 2) KERNEL9 (16 * 2) KERNEL10(16 * 2) KERNEL11(16 * 2) KERNEL12(16 * 2) KERNEL13(16 * 2) KERNEL14(16 * 2) KERNEL15(16 * 2) KERNEL16(16 * 2) cmpq $128 * 3, %rax NOBRANCH jle .L11 KERNEL1 (16 * 3) KERNEL2 (16 * 3) KERNEL3 (16 * 3) KERNEL4 (16 * 3) KERNEL5 (16 * 3) KERNEL6 (16 * 3) KERNEL7 (16 * 3) KERNEL8 (16 * 3) KERNEL9 (16 * 3) KERNEL10(16 * 3) KERNEL11(16 * 3) KERNEL12(16 * 3) KERNEL13(16 * 3) KERNEL14(16 * 3) KERNEL15(16 * 3) KERNEL16(16 * 3) cmpq $128 * 4, %rax NOBRANCH jle .L11 KERNEL1 (16 * 4) KERNEL2 (16 * 4) KERNEL3 (16 * 4) KERNEL4 (16 * 4) KERNEL5 (16 * 4) KERNEL6 (16 * 4) KERNEL7 (16 * 4) KERNEL8 (16 * 4) KERNEL9 (16 * 4) KERNEL10(16 * 4) KERNEL11(16 * 4) KERNEL12(16 * 4) KERNEL13(16 * 4) KERNEL14(16 * 4) KERNEL15(16 * 4) KERNEL16(16 * 4) cmpq $128 * 5, %rax NOBRANCH jle .L11 KERNEL1 (16 * 5) KERNEL2 (16 * 5) KERNEL3 (16 * 5) KERNEL4 (16 * 5) KERNEL5 (16 * 5) KERNEL6 (16 * 5) KERNEL7 (16 * 5) KERNEL8 (16 * 5) KERNEL9 (16 * 5) KERNEL10(16 * 5) KERNEL11(16 * 5) KERNEL12(16 * 5) KERNEL13(16 * 5) KERNEL14(16 * 5) KERNEL15(16 * 5) KERNEL16(16 * 5) cmpq $128 * 6, %rax NOBRANCH jle .L11 KERNEL1 (16 * 6) KERNEL2 (16 * 6) KERNEL3 (16 * 6) KERNEL4 (16 * 6) KERNEL5 (16 * 6) KERNEL6 (16 * 6) KERNEL7 (16 * 6) KERNEL8 (16 * 6) KERNEL9 (16 * 6) KERNEL10(16 * 6) KERNEL11(16 * 6) KERNEL12(16 * 6) KERNEL13(16 * 6) KERNEL14(16 * 6) KERNEL15(16 * 6) KERNEL16(16 * 6) cmpq $128 * 7, %rax NOBRANCH jle .L11 KERNEL1 (16 * 7) KERNEL2 (16 * 7) KERNEL3 (16 * 7) KERNEL4 (16 * 7) KERNEL5 (16 * 7) KERNEL6 (16 * 7) KERNEL7 (16 * 7) KERNEL8 (16 * 7) KERNEL9 (16 * 7) KERNEL10(16 * 7) KERNEL11(16 * 7) KERNEL12(16 * 7) KERNEL13(16 * 7) 
KERNEL14(16 * 7) KERNEL15(16 * 7) KERNEL16(16 * 7) addq $32 * 8 * SIZE, AO addq $32 * 8 * SIZE, BO subq $128 * 8, %rax jg .L1X .L11: leaq (AO, %rax, 2), AO # * 16 leaq (BO, %rax, 2), BO # * 64 ALIGN_4 .L12: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L14 ALIGN_4 .L13: mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm10 ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 0 * SIZE(BO), %xmm11 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm4 movddup 1 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD2 %xmm11, %xmm5 movddup 2 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm6 movddup 3 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD2 %xmm11, %xmm7 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L13 ALIGN_4 .L14: SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm3, %xmm3 SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 #ifndef CONJ addsubpd %xmm1, %xmm0 addsubpd %xmm3, %xmm2 addsubpd %xmm5, %xmm4 addsubpd %xmm7, %xmm6 #else addsubpd %xmm0, %xmm1 addsubpd %xmm2, %xmm3 addsubpd %xmm4, %xmm5 addsubpd %xmm6, %xmm7 #endif #if defined(LN) || defined(RT) movq KK, %rax subq $2, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm8 movapd 2 * SIZE(BO), %xmm9 movapd 4 * SIZE(BO), %xmm10 movapd 6 * SIZE(BO), %xmm11 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm9 movapd 4 * SIZE(AO), %xmm10 movapd 6 * SIZE(AO), %xmm11 #endif #if (defined(LN) || defined(LT)) && !defined(CONJ) subpd %xmm0, %xmm8 subpd %xmm2, %xmm9 subpd %xmm4, %xmm10 subpd %xmm6, %xmm11 #elif (defined(LN) || defined(LT)) && defined(CONJ) subpd %xmm1, %xmm8 subpd %xmm3, %xmm9 subpd %xmm5, %xmm10 subpd %xmm7, %xmm11 #elif (defined(RN) || defined(RT)) && !defined(CONJ) subpd %xmm0, %xmm8 subpd %xmm4, %xmm9 subpd %xmm2, %xmm10 subpd %xmm6, %xmm11 #else addsubpd %xmm1, %xmm8 addsubpd %xmm5, %xmm9 addsubpd %xmm3, %xmm10 addsubpd %xmm7, %xmm11 #endif #ifdef CONJ pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #endif #if defined(LN) || defined(RT) #ifdef LN movddup 6 * SIZE(AO), %xmm0 movddup 7 * SIZE(AO), %xmm1 movddup 4 * SIZE(AO), %xmm2 movddup 5 * SIZE(AO), %xmm3 movddup 0 * SIZE(AO), %xmm4 movddup 1 * SIZE(AO), %xmm5 #else movddup 6 * SIZE(BO), %xmm0 movddup 7 * SIZE(BO), %xmm1 movddup 4 * SIZE(BO), %xmm2 movddup 5 * SIZE(BO), %xmm3 movddup 0 * SIZE(BO), %xmm4 movddup 1 * SIZE(BO), %xmm5 #endif #ifdef CONJ xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 xorpd %xmm7, %xmm5 #endif movapd %xmm10, %xmm12 movapd %xmm11, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm12 mulpd %xmm1, %xmm13 addsubpd %xmm12, %xmm10 addsubpd %xmm13, %xmm11 movapd %xmm10, %xmm12 movapd %xmm10, %xmm13 movapd %xmm11, %xmm14 movapd %xmm11, %xmm15 SHUFPD_1 %xmm13, %xmm13 SHUFPD_1 %xmm15, %xmm15 mulpd %xmm2, %xmm12 mulpd %xmm2, %xmm14 mulpd %xmm3, %xmm13 mulpd %xmm3, %xmm15 addsubpd %xmm13, %xmm12 addsubpd %xmm15, %xmm14 subpd %xmm12, %xmm8 subpd %xmm14, %xmm9 movapd %xmm8, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm4, %xmm8 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm12 mulpd %xmm5, %xmm13 addsubpd %xmm12, %xmm8 addsubpd %xmm13, %xmm9 #endif #if defined(LT) || defined(RN) #ifdef LT 
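/* LT/RN triangular solve for the 2x2 tile: the movddup loads below pick up the
   real and imaginary parts of the two diagonal entries and the off-diagonal entry
   of the packed triangular factor (from AO in this LT branch, from BO in the RN
   branch that follows).  When CONJ is defined, the imaginary parts are sign-flipped
   through the %xmm7 mask built above, and each complex multiply is carried out with
   the SHUFPD_1 / mulpd / addsubpd pattern.  The diagonal entries are expected to
   arrive pre-inverted from the TRSM packing stage, which is why the solve is
   expressed with multiplies only. */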
movddup 0 * SIZE(AO), %xmm0 movddup 1 * SIZE(AO), %xmm1 movddup 2 * SIZE(AO), %xmm2 movddup 3 * SIZE(AO), %xmm3 movddup 6 * SIZE(AO), %xmm4 movddup 7 * SIZE(AO), %xmm5 #else movddup 0 * SIZE(BO), %xmm0 movddup 1 * SIZE(BO), %xmm1 movddup 2 * SIZE(BO), %xmm2 movddup 3 * SIZE(BO), %xmm3 movddup 6 * SIZE(BO), %xmm4 movddup 7 * SIZE(BO), %xmm5 #endif #ifdef CONJ xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 xorpd %xmm7, %xmm5 #endif movapd %xmm8, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm12 mulpd %xmm1, %xmm13 addsubpd %xmm12, %xmm8 addsubpd %xmm13, %xmm9 movapd %xmm8, %xmm12 movapd %xmm8, %xmm13 movapd %xmm9, %xmm14 movapd %xmm9, %xmm15 SHUFPD_1 %xmm13, %xmm13 SHUFPD_1 %xmm15, %xmm15 mulpd %xmm2, %xmm12 mulpd %xmm2, %xmm14 mulpd %xmm3, %xmm13 mulpd %xmm3, %xmm15 addsubpd %xmm13, %xmm12 addsubpd %xmm15, %xmm14 subpd %xmm12, %xmm10 subpd %xmm14, %xmm11 movapd %xmm10, %xmm12 movapd %xmm11, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm4, %xmm10 mulpd %xmm4, %xmm11 mulpd %xmm5, %xmm12 mulpd %xmm5, %xmm13 addsubpd %xmm12, %xmm10 addsubpd %xmm13, %xmm11 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm10, 2 * SIZE(CO1) movhpd %xmm10, 3 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhpd %xmm9, 1 * SIZE(CO2) movsd %xmm11, 2 * SIZE(CO2) movhpd %xmm11, 3 * SIZE(CO2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd %xmm11, 2 * SIZE(CO2) movhpd %xmm11, 3 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) movapd %xmm10, 4 * SIZE(BO) movapd %xmm11, 6 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm9, 2 * SIZE(AO) movapd %xmm10, 4 * SIZE(AO) movapd %xmm11, 6 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L10 ALIGN_4 .L99: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif decq J # j -- jg .L01 .L100: testq $1, N jle .L999 .L101: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 # coffset1 = c #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif #ifdef LT movq OFFSET, KK #endif testq $1, M jle .L130 ALIGN_4 #ifdef LN movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L142 .L141: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * 
SIZE(AO), %xmm8 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 16 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 10 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 12 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 24 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm3 movddup 24 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L141 .L142: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH jle .L144 .L143: mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L143 ALIGN_4 .L144: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 SHUFPD_1 %xmm1, %xmm1 #ifndef CONJ addsubpd %xmm1, %xmm0 #else addsubpd %xmm0, %xmm1 #endif #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm8 #else movapd 0 * SIZE(AO), %xmm8 #endif #if (defined(LN) || defined(LT)) && !defined(CONJ) subpd %xmm0, %xmm8 #elif (defined(LN) || defined(LT)) && defined(CONJ) subpd %xmm1, %xmm8 #elif (defined(RN) || defined(RT)) && !defined(CONJ) subpd %xmm0, %xmm8 #else addsubpd %xmm1, %xmm8 #endif #ifdef CONJ pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #endif #ifdef LN movddup 0 * SIZE(AO), %xmm4 movddup 1 * SIZE(AO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm5 #endif movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm4, %xmm8 mulpd %xmm5, %xmm12 addsubpd %xmm12, %xmm8 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 movddup 1 * SIZE(AO), %xmm1 #ifdef CONJ xorpd %xmm7, %xmm1 #endif movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm0, %xmm8 mulpd %xmm1, %xmm12 addsubpd %xmm12, %xmm8 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 movddup 1 * SIZE(BO), %xmm1 #ifdef CONJ xorpd %xmm7, %xmm1 #endif movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm0, %xmm8 mulpd %xmm1, %xmm12 addsubpd %xmm12, %xmm8 #endif #ifdef RT movddup 0 * SIZE(BO), %xmm4 movddup 1 * SIZE(BO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm5 #endif movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm4, %xmm8 mulpd %xmm5, %xmm12 addsubpd %xmm12, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm8, 0 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 
#endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L130: movq M, I sarq $1, I # i = (m >> 2) jle .L149 ALIGN_4 .L110: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm4, %xmm4 movddup 8 * SIZE(BO), %xmm11 pxor %xmm5, %xmm5 #ifdef LN prefetchnta -4 * SIZE(CO1) #else prefetchnta 4 * SIZE(CO1) #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L112 .L111: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm1 movddup 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm4 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm5 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm0 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm4 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 16 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm5 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 10 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm1 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm4 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 12 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm5 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm0 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 14 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm4 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 40 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm5 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) ADD1 %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 18 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm1 movddup 8 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm4 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 20 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm5 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm0 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 22 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm4 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 24 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm5 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 26 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm1 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm4 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 28 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm5 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm0 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 30 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm4 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, 
%xmm11 movapd 32 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm5 movddup 24 * SIZE(BO), %xmm11 addq $32 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L111 ALIGN_4 .L112: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH jle .L114 .L113: mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm10 ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 0 * SIZE(BO), %xmm11 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 movapd 4 * SIZE(AO), %xmm8 ADD1 %xmm11, %xmm4 movddup 1 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD2 %xmm11, %xmm5 addq $4 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L113 ALIGN_4 .L114: SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm5, %xmm5 #ifndef CONJ addsubpd %xmm1, %xmm0 addsubpd %xmm5, %xmm4 #else addsubpd %xmm0, %xmm1 addsubpd %xmm4, %xmm5 #endif #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm8 movapd 2 * SIZE(BO), %xmm9 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm9 #endif #if (defined(LN) || defined(LT)) && !defined(CONJ) subpd %xmm0, %xmm8 subpd %xmm4, %xmm9 #elif (defined(LN) || defined(LT)) && defined(CONJ) subpd %xmm1, %xmm8 subpd %xmm5, %xmm9 #elif (defined(RN) || defined(RT)) && !defined(CONJ) subpd %xmm0, %xmm8 subpd %xmm4, %xmm9 #else addsubpd %xmm1, %xmm8 addsubpd %xmm5, %xmm9 #endif #ifdef CONJ pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #endif #ifdef LN movddup 6 * SIZE(AO), %xmm0 movddup 7 * SIZE(AO), %xmm1 movddup 4 * SIZE(AO), %xmm2 movddup 5 * SIZE(AO), %xmm3 movddup 0 * SIZE(AO), %xmm4 movddup 1 * SIZE(AO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 xorpd %xmm7, %xmm5 #endif movapd %xmm9, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm12 addsubpd %xmm12, %xmm9 movapd %xmm9, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm2, %xmm12 mulpd %xmm3, %xmm13 addsubpd %xmm13, %xmm12 subpd %xmm12, %xmm8 movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm4, %xmm8 mulpd %xmm5, %xmm12 addsubpd %xmm12, %xmm8 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 movddup 1 * SIZE(AO), %xmm1 movddup 2 * SIZE(AO), %xmm2 movddup 3 * SIZE(AO), %xmm3 movddup 6 * SIZE(AO), %xmm4 movddup 7 * SIZE(AO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 xorpd %xmm7, %xmm5 #endif movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm0, %xmm8 mulpd %xmm1, %xmm12 addsubpd %xmm12, %xmm8 movapd %xmm8, %xmm12 movapd %xmm8, %xmm13 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm2, %xmm12 mulpd %xmm3, %xmm13 addsubpd %xmm13, %xmm12 subpd %xmm12, %xmm9 movapd %xmm9, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm12 addsubpd %xmm12, %xmm9 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 movddup 1 * SIZE(BO), %xmm1 #ifdef CONJ xorpd %xmm7, %xmm1 #endif movapd %xmm8, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm12 mulpd %xmm1, %xmm13 addsubpd %xmm12, %xmm8 addsubpd %xmm13, %xmm9 #endif #ifdef RT movddup 0 * SIZE(BO), %xmm4 movddup 1 * SIZE(BO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm5 #endif movapd %xmm8, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm4, %xmm8 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm12 mulpd %xmm5, %xmm13 addsubpd %xmm12, %xmm8 addsubpd %xmm13, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if 
defined(LN) || defined(LT) movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm9, 2 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L110 ALIGN_4 .L149: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_3 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_LN_2x4_nehalem.S000066400000000000000000001433541313527062700230160ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define KK %rdx #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCHSIZE (16 * 1 + 4) #define PREFETCH prefetcht0 #define ADD1 addps #define ADD2 addps PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif subq $-32 * SIZE, A subq $-32 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K movq OLD_LDC, LDC movq OLD_OFFSET, KK salq $ZBASE_SHIFT, LDC movq KK, OFFSET negq KK #ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RT movq N, KK subq OFFSET, KK #endif movq N, J sarq $2, J NOBRANCH jle .L30 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $2 + ZBASE_SHIFT, %rax subq %rax, B leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 2), CO2 #ifndef RT leaq (C, LDC, 4), C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif movq K, %rax salq $ZBASE_SHIFT + 2, %rax leaq (B, %rax), BB #ifdef LT movq OFFSET, KK #endif testq $1, M BRANCH jle .L20 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movaps -32 * SIZE(BO), %xmm5 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_3 .L22: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm10 pshufd $0xa0, %xmm5, %xmm3 mulps %xmm0, %xmm3 ADD2 %xmm4, %xmm11 pshufd $0xf5, %xmm5, %xmm4 movaps -24 * SIZE(BO), %xmm5 mulps %xmm0, %xmm4 movddup -30 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -20 * SIZE(BO), %xmm5 mulps 
%xmm0, %xmm2 ADD1 %xmm3, %xmm10 pshufd $0xa0, %xmm5, %xmm3 mulps %xmm0, %xmm3 ADD2 %xmm4, %xmm11 pshufd $0xf5, %xmm5, %xmm4 movaps -16 * SIZE(BO), %xmm5 mulps %xmm0, %xmm4 movddup -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -12 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm10 pshufd $0xa0, %xmm5, %xmm3 mulps %xmm0, %xmm3 ADD2 %xmm4, %xmm11 pshufd $0xf5, %xmm5, %xmm4 movaps -8 * SIZE(BO), %xmm5 mulps %xmm0, %xmm4 movddup -26 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -4 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm10 pshufd $0xa0, %xmm5, %xmm3 mulps %xmm0, %xmm3 ADD2 %xmm4, %xmm11 pshufd $0xf5, %xmm5, %xmm4 movaps 0 * SIZE(BO), %xmm5 mulps %xmm0, %xmm4 movddup -24 * SIZE(AO), %xmm0 subq $-32 * SIZE, BO subq $ -8 * SIZE, AO subq $1, %rax BRANCH jg .L22 ALIGN_3 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm10 pshufd $0xa0, %xmm5, %xmm3 mulps %xmm0, %xmm3 ADD2 %xmm4, %xmm11 pshufd $0xf5, %xmm5, %xmm4 movaps -24 * SIZE(BO), %xmm5 mulps %xmm0, %xmm4 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_3 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #endif ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 ADD1 %xmm3, %xmm10 ADD2 %xmm4, %xmm11 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 #else xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 #endif #else #ifndef CONJ xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 #else shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 #endif #endif addps %xmm9, %xmm8 addps %xmm11, %xmm10 #if defined(LN) || defined(LT) movaps -32 * SIZE(BO), %xmm9 movaps -28 * SIZE(BO), %xmm11 subps %xmm8, %xmm9 subps %xmm10, %xmm11 #else movaps -32 * SIZE(AO), %xmm9 movaps -28 * SIZE(AO), %xmm13 subps %xmm8, %xmm9 subps %xmm10, %xmm13 movhlps %xmm9, %xmm11 movhlps %xmm13, %xmm15 #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0xb1, %xmm7, %xmm7 #endif #if defined(LN) || defined(LT) movsd -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 pshufd $0xb1, %xmm11, %xmm12 xorps %xmm7, %xmm10 xorps %xmm7, %xmm12 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm12 addps %xmm10, %xmm9 addps %xmm12, %xmm11 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 movaps %xmm9, %xmm3 pshufd $0xb1, %xmm9, %xmm2 xorps %xmm7, %xmm2 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 movaps -28 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps 
%xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm13 subps %xmm1, %xmm13 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm15 subps %xmm1, %xmm15 movaps -24 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 movaps -20 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm13 subps %xmm1, %xmm13 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm15 subps %xmm1, %xmm15 movaps -12 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm13, %xmm12 xorps %xmm7, %xmm12 mulps %xmm0, %xmm13 mulps %xmm1, %xmm12 addps %xmm12, %xmm13 movaps %xmm13, %xmm3 pshufd $0xb1, %xmm13, %xmm2 xorps %xmm7, %xmm2 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm15 subps %xmm1, %xmm15 movaps -4 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm15, %xmm14 xorps %xmm7, %xmm14 mulps %xmm0, %xmm15 mulps %xmm1, %xmm14 addps %xmm14, %xmm15 #endif #ifdef RT movaps -4 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm15, %xmm14 xorps %xmm7, %xmm14 mulps %xmm0, %xmm15 mulps %xmm1, %xmm14 addps %xmm14, %xmm15 movaps %xmm15, %xmm3 pshufd $0xb1, %xmm15, %xmm2 xorps %xmm7, %xmm2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm13 subps %xmm1, %xmm13 movaps -8 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm13, %xmm12 xorps %xmm7, %xmm12 mulps %xmm0, %xmm13 mulps %xmm1, %xmm12 addps %xmm12, %xmm13 movaps %xmm13, %xmm3 pshufd $0xb1, %xmm13, %xmm2 xorps %xmm7, %xmm2 movaps -16 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -24 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm9, -32 * SIZE(BO) movaps %xmm11, -28 * SIZE(BO) movsd %xmm9, (CO1) movhps %xmm9, (CO1, LDC) movsd %xmm11, (CO2) movhps %xmm11, (CO2, LDC) #else movlhps %xmm11, %xmm9 movlhps %xmm15, %xmm13 movaps %xmm9, -32 * SIZE(AO) movaps %xmm13, -28 * SIZE(AO) movlps %xmm9, (CO1) movlps 
%xmm11, (CO1, LDC) movlps %xmm13, (CO2) movlps %xmm15, (CO2, LDC) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L20: movq M, I sarq $1, I NOBRANCH jle .L29 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #else movq B, BO #endif prefetchnta -32 * SIZE(BB) subq $-16 * SIZE, BB xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht2 4 * SIZE(CO1, LDC, 1) xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 xorps %xmm12, %xmm12 prefetcht2 4 * SIZE(CO2) xorps %xmm13, %xmm13 prefetcht2 4 * SIZE(CO2, LDC, 1) xorps %xmm14, %xmm14 xorps %xmm15, %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm12 movaps -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pshufd $0xb1, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 ADD1 %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD1 %xmm5, %xmm10 ADD2 %xmm6, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm12 movaps -24 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pshufd $0xb1, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 ADD1 %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD1 %xmm5, %xmm10 ADD2 %xmm6, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pshufd $0xb1, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 ADD1 %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD1 %xmm5, %xmm10 ADD2 %xmm6, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm12 movaps -8 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pshufd $0xb1, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 ADD1 %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 subq $-32 * SIZE, BO pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD1 %xmm5, %xmm10 ADD2 %xmm6, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $1, %rax 
BRANCH jg .L12 ALIGN_3 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: ADD1 %xmm1, %xmm12 movaps -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pshufd $0xb1, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 ADD1 %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD1 %xmm5, %xmm10 ADD2 %xmm6, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif ADD1 %xmm1, %xmm12 ADD2 %xmm2, %xmm13 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 xorps %xmm0, %xmm12 xorps %xmm0, %xmm14 #else xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 xorps %xmm0, %xmm13 xorps %xmm0, %xmm15 #endif #else #ifndef CONJ xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 xorps %xmm0, %xmm12 xorps %xmm0, %xmm14 #else shufps $0xb1, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 xorps %xmm0, %xmm13 xorps %xmm0, %xmm15 #endif #endif haddps %xmm9, %xmm8 haddps %xmm11, %xmm10 haddps %xmm13, %xmm12 haddps %xmm15, %xmm14 shufps $0xd8, %xmm8, %xmm8 shufps $0xd8, %xmm10, %xmm10 shufps $0xd8, %xmm12, %xmm12 shufps $0xd8, %xmm14, %xmm14 movaps %xmm8, %xmm9 shufps $0xe4, %xmm10, %xmm8 shufps $0xe4, %xmm9, %xmm10 movaps %xmm12, %xmm13 shufps $0xe4, %xmm14, %xmm12 shufps $0xe4, %xmm13, %xmm14 #if defined(LN) || defined(LT) movaps %xmm8, %xmm9 movlhps %xmm10, %xmm8 movhlps %xmm9, %xmm10 movaps %xmm12, %xmm11 movlhps %xmm14, %xmm12 movhlps %xmm11, %xmm14 movaps -32 * SIZE(BO), %xmm9 movaps -28 * SIZE(BO), %xmm13 movaps -24 * SIZE(BO), %xmm11 movaps -20 * SIZE(BO), %xmm15 subps %xmm8, %xmm9 subps %xmm10, %xmm11 subps %xmm12, %xmm13 subps %xmm14, %xmm15 #else movaps -32 * SIZE(AO), %xmm9 movaps -28 * SIZE(AO), %xmm11 movaps -24 * SIZE(AO), %xmm13 movaps -20 * SIZE(AO), %xmm15 subps %xmm8, %xmm9 subps %xmm10, %xmm11 subps %xmm12, %xmm13 subps %xmm14, %xmm15 #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0xb1, %xmm7, %xmm7 #endif #ifdef LN movaps -28 * SIZE(AO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 pshufd $0xb1, %xmm15, %xmm14 xorps %xmm7, %xmm10 xorps %xmm7, %xmm14 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 mulps %xmm0, %xmm15 mulps %xmm1, %xmm14 addps %xmm10, %xmm11 addps %xmm14, %xmm15 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 movaps %xmm15, %xmm5 pshufd $0xb1, %xmm15, %xmm4 xorps %xmm7, %xmm2 xorps %xmm7, %xmm4 mulps %xmm0, %xmm3 mulps %xmm1, %xmm2 mulps %xmm0, %xmm5 mulps %xmm1, %xmm4 subps %xmm3, %xmm9 subps %xmm2, %xmm9 subps %xmm5, %xmm13 subps %xmm4, %xmm13 movaps -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 pshufd $0xb1, %xmm13, %xmm14 xorps %xmm7, %xmm10 xorps %xmm7, %xmm14 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 mulps %xmm0, %xmm13 mulps %xmm1, %xmm14 addps %xmm10, %xmm9 addps %xmm14, %xmm13 #endif #ifdef LT movaps -32 * 
SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 pshufd $0xb1, %xmm13, %xmm14 xorps %xmm7, %xmm10 xorps %xmm7, %xmm14 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 mulps %xmm0, %xmm13 mulps %xmm1, %xmm14 addps %xmm10, %xmm9 addps %xmm14, %xmm13 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 movaps %xmm9, %xmm3 pshufd $0xb1, %xmm9, %xmm2 movaps %xmm13, %xmm5 pshufd $0xb1, %xmm13, %xmm4 xorps %xmm7, %xmm2 xorps %xmm7, %xmm4 mulps %xmm0, %xmm3 mulps %xmm1, %xmm2 mulps %xmm0, %xmm5 mulps %xmm1, %xmm4 subps %xmm3, %xmm11 subps %xmm2, %xmm11 subps %xmm5, %xmm15 subps %xmm4, %xmm15 movaps -28 * SIZE(AO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 pshufd $0xb1, %xmm15, %xmm14 xorps %xmm7, %xmm10 xorps %xmm7, %xmm14 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 mulps %xmm0, %xmm15 mulps %xmm1, %xmm14 addps %xmm10, %xmm11 addps %xmm14, %xmm15 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 movaps %xmm9, %xmm3 pshufd $0xb1, %xmm9, %xmm2 xorps %xmm7, %xmm2 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 movaps -28 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm13 subps %xmm1, %xmm13 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm15 subps %xmm1, %xmm15 movaps -24 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 movaps -20 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm13 subps %xmm1, %xmm13 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm15 subps %xmm1, %xmm15 movaps -12 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm13, %xmm12 xorps %xmm7, %xmm12 mulps %xmm0, %xmm13 mulps %xmm1, %xmm12 addps %xmm12, %xmm13 movaps %xmm13, %xmm3 pshufd $0xb1, %xmm13, %xmm2 xorps %xmm7, %xmm2 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm15 subps %xmm1, %xmm15 movaps -4 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm15, %xmm14 xorps %xmm7, %xmm14 mulps %xmm0, %xmm15 mulps %xmm1, %xmm14 addps %xmm14, %xmm15 #endif #ifdef RT movaps -4 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm15, %xmm14 xorps %xmm7, %xmm14 mulps %xmm0, %xmm15 mulps %xmm1, %xmm14 addps %xmm14, %xmm15 movaps %xmm15, %xmm3 pshufd $0xb1, %xmm15, %xmm2 xorps %xmm7, %xmm2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm13 subps %xmm1, %xmm13 movaps -8 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 
pshufd $0xb1, %xmm13, %xmm12 xorps %xmm7, %xmm12 mulps %xmm0, %xmm13 mulps %xmm1, %xmm12 addps %xmm12, %xmm13 movaps %xmm13, %xmm3 pshufd $0xb1, %xmm13, %xmm2 xorps %xmm7, %xmm2 movaps -16 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -24 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm9, -32 * SIZE(BO) movaps %xmm13, -28 * SIZE(BO) movaps %xmm11, -24 * SIZE(BO) movaps %xmm15, -20 * SIZE(BO) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm11, 2 * SIZE(CO1) movhps %xmm9, 0 * SIZE(CO1, LDC) movhps %xmm11, 2 * SIZE(CO1, LDC) movsd %xmm13, 0 * SIZE(CO2) movsd %xmm15, 2 * SIZE(CO2) movhps %xmm13, 0 * SIZE(CO2, LDC) movhps %xmm15, 2 * SIZE(CO2, LDC) #else movaps %xmm9, -32 * SIZE(AO) movaps %xmm11, -28 * SIZE(AO) movaps %xmm13, -24 * SIZE(AO) movaps %xmm15, -20 * SIZE(AO) movsd %xmm9, 0 * SIZE(CO1) movhps %xmm9, 2 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO1, LDC) movhps %xmm11, 2 * SIZE(CO1, LDC) movsd %xmm13, 0 * SIZE(CO2) movhps %xmm13, 2 * SIZE(CO2) movsd %xmm15, 0 * SIZE(CO2, LDC) movhps %xmm15, 2 * SIZE(CO2, LDC) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- BRANCH jg .L11 ALIGN_4 .L29: #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif subq $1, J BRANCH jg .L01 ALIGN_4 .L30: testq $2, N BRANCH jle .L50 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif #ifdef LT movq OFFSET, KK #endif testq $1, M BRANCH jle .L40 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movaps -32 * SIZE(BO), %xmm5 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L45 ALIGN_3 .L42: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -28 * 
SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -24 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -26 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -16 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -24 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $ -8 * SIZE, AO subq $1, %rax BRANCH jg .L42 ALIGN_3 .L45: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L46 ALIGN_3 .L48: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm9 shufps $0xb1, %xmm9, %xmm9 #else xorps %xmm0, %xmm8 shufps $0xb1, %xmm9, %xmm9 #endif #else #ifndef CONJ xorps %xmm0, %xmm9 shufps $0xb1, %xmm9, %xmm9 #else shufps $0xb1, %xmm9, %xmm9 xorps %xmm0, %xmm9 #endif #endif addps %xmm9, %xmm8 #if defined(LN) || defined(LT) movaps -32 * SIZE(BO), %xmm9 subps %xmm8, %xmm9 #else movaps -32 * SIZE(AO), %xmm9 subps %xmm8, %xmm9 movhlps %xmm9, %xmm11 #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0xb1, %xmm7, %xmm7 #endif #if defined(LN) || defined(LT) movsd -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 addps %xmm10, %xmm9 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 movaps %xmm9, %xmm3 pshufd $0xb1, %xmm9, %xmm2 xorps %xmm7, %xmm2 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 movaps -28 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 #endif #ifdef RT movaps -28 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm9, -32 * SIZE(BO) movlps %xmm9, (CO1) movhps %xmm9, (CO2) #else movlps %xmm9, -32 * SIZE(AO) movlps %xmm11, -30 * 
SIZE(AO) movlps %xmm9, (CO1) movlps %xmm11, (CO2) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L40: movq M, I sarq $1, I NOBRANCH jle .L49 ALIGN_4 .L31: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht2 4 * SIZE(CO2) xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_3 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm10 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm10 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm10 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm10 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $-16 * SIZE, AO subq $1, %rax BRANCH jg .L32 ALIGN_3 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: ADD1 %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm10 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_3 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 ADD1 %xmm3, %xmm10 ADD2 %xmm4, %xmm11 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 #else xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 #endif #else #ifndef CONJ xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 #else shufps $0xb1, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 #endif #endif haddps %xmm9, %xmm8 haddps %xmm11, %xmm10 shufps $0xd8, %xmm8, %xmm8 shufps $0xd8, %xmm10, %xmm10 movaps %xmm8, %xmm9 shufps $0xe4, %xmm10, %xmm8 
shufps $0xe4, %xmm9, %xmm10 #if defined(LN) || defined(LT) movaps %xmm8, %xmm9 movlhps %xmm10, %xmm8 movhlps %xmm9, %xmm10 movaps -32 * SIZE(BO), %xmm9 movaps -28 * SIZE(BO), %xmm11 subps %xmm8, %xmm9 subps %xmm10, %xmm11 #else movaps -32 * SIZE(AO), %xmm9 movaps -28 * SIZE(AO), %xmm11 subps %xmm8, %xmm9 subps %xmm10, %xmm11 #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0xb1, %xmm7, %xmm7 #endif #ifdef LN movaps -28 * SIZE(AO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm2 subps %xmm3, %xmm9 subps %xmm2, %xmm9 movaps -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 addps %xmm10, %xmm9 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 addps %xmm10, %xmm9 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 movaps %xmm9, %xmm3 pshufd $0xb1, %xmm9, %xmm2 xorps %xmm7, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm2 subps %xmm3, %xmm11 subps %xmm2, %xmm11 movaps -28 * SIZE(AO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 movaps %xmm9, %xmm3 pshufd $0xb1, %xmm9, %xmm2 xorps %xmm7, %xmm2 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 movaps -28 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 #endif #ifdef RT movaps -28 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm9, -32 * SIZE(BO) movaps %xmm11, -28 * SIZE(BO) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm11, 2 * SIZE(CO1) movhps %xmm9, 0 * SIZE(CO2) movhps %xmm11, 2 * SIZE(CO2) #else movaps %xmm9, -32 * SIZE(AO) movaps %xmm11, -28 * SIZE(AO) movsd %xmm9, 0 * SIZE(CO1) movhps %xmm9, 2 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO2) movhps %xmm11, 2 * SIZE(CO2) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq 
%rax, AORIG #endif decq I # i -- BRANCH jg .L31 ALIGN_4 .L49: #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L50: testq $1, N BRANCH jle .L999 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif #ifdef LT movq OFFSET, KK #endif testq $1, M BRANCH jle .L60 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd -32 * SIZE(BO), %xmm5 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_3 .L62: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movsd -30 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movsd -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movsd -26 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -26 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movsd -24 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -24 * SIZE(AO), %xmm0 subq $-8 * SIZE, BO subq $-8 * SIZE, AO subq $1, %rax BRANCH jg .L62 ALIGN_3 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_3 .L66: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movsd -30 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_3 .L68: #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm9 shufps $0xb1, %xmm9, %xmm9 #else xorps %xmm0, %xmm8 shufps $0xb1, %xmm9, %xmm9 #endif #else #ifndef CONJ xorps %xmm0, %xmm9 shufps $0xb1, %xmm9, %xmm9 #else shufps $0xb1, %xmm9, %xmm9 xorps %xmm0, %xmm9 #endif #endif addps %xmm9, %xmm8 #if defined(LN) || defined(LT) movsd -32 * SIZE(BO), %xmm9 #else movsd -32 * SIZE(AO), %xmm9 #endif subps %xmm8, %xmm9 pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0xb1, %xmm7, %xmm7 #endif #if defined(LN) || defined(LT) movsd -32 * SIZE(AO), %xmm5 #endif #if defined(RN) || defined(RT) movsd -32 * SIZE(BO), %xmm5 #endif pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 addps %xmm10, %xmm9 #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm9, -32 * SIZE(BO) #else movlps %xmm9, -32 * SIZE(AO) #endif movlps %xmm9, (CO1) #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) 
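/* LT/RN: the loops above ran only KK of the K iterations, so advance AO and
   BO over the remaining (K - KK) complex elements to reach the next block. */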
movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L60: movq M, I sarq $1, I NOBRANCH jle .L69 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_3 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 movddup -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movddup -30 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 mulps %xmm0, %xmm2 movaps -24 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movddup -28 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 mulps %xmm0, %xmm2 movaps -20 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movddup -26 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 mulps %xmm0, %xmm2 movaps -16 * SIZE(AO), %xmm0 subq $ -8 * SIZE, BO subq $-16 * SIZE, AO subq $1, %rax BRANCH jg .L52 ALIGN_3 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_3 .L56: ADD1 %xmm1, %xmm8 movddup -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_3 .L58: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm8 #else xorps %xmm0, %xmm9 #endif #else #ifndef CONJ xorps %xmm0, %xmm8 #else shufps $0xb1, %xmm0, %xmm0 xorps %xmm0, %xmm9 #endif #endif haddps %xmm9, %xmm8 shufps $0xd8, %xmm8, %xmm8 #if defined(LN) || defined(LT) movaps -32 * SIZE(BO), %xmm9 subps %xmm8, %xmm9 movhlps %xmm9, %xmm11 #else movaps -32 * SIZE(AO), %xmm9 subps %xmm8, %xmm9 #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0xb1, %xmm7, %xmm7 #endif #ifdef LN movaps -28 * SIZE(AO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm2 subps %xmm3, %xmm9 subps %xmm2, %xmm9 movaps -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 addps %xmm10, %xmm9 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 addps 
%xmm10, %xmm9 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 movaps %xmm9, %xmm3 pshufd $0xb1, %xmm9, %xmm2 xorps %xmm7, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm2 subps %xmm3, %xmm11 subps %xmm2, %xmm11 movaps -28 * SIZE(AO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 #endif #if defined(RN) || defined(RT) movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm9, -32 * SIZE(BO) movlps %xmm11, -30 * SIZE(BO) movlps %xmm9, 0 * SIZE(CO1) movlps %xmm11, 2 * SIZE(CO1) #else movaps %xmm9, -32 * SIZE(AO) movlps %xmm9, 0 * SIZE(CO1) movhps %xmm9, 2 * SIZE(CO1) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- BRANCH jg .L51 ALIGN_4 .L69: #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S000066400000000000000000002140051313527062700221670ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %rbp #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define POSINV 0(%rsp) #define OFFSET 16(%rsp) #define KK 24(%rsp) #define KKK 32(%rsp) #define AORIG 40(%rsp) #define BORIG 48(%rsp) #define BUFFER 128(%rsp) #ifdef OPTERON #define movsd movlps #endif #if defined(PENTIUM4) || defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(ATOM) || defined(NANO) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHNTA prefetchnta #define PREFETCHSIZE (8 * 6 + 4) #endif #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta #define PREFETCHSIZE (8 * 6 + 4) #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHNTA prefetchnta #define PREFETCHSIZE (8 * 6 + 4) #endif #define KERNEL1(xx) \ mulps %xmm8, %xmm9 ;\ addps %xmm9, %xmm0 ;\ movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulps %xmm8, %xmm11 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ addps %xmm11, %xmm1 ;\ movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm8, %xmm13 ;\ mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ addps %xmm13, %xmm2 ;\ movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm8, %xmm3 ;\ movaps 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 #define KERNEL2(xx) \ mulps %xmm10, %xmm9 ;\ addps %xmm9, %xmm4 ;\ movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulps %xmm10, %xmm11 ;\ addps %xmm11, %xmm5 ;\ movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm10, %xmm13 ;\ mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ addps %xmm13, %xmm6 ;\ movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm10, %xmm7 ;\ movaps 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 #define KERNEL3(xx) \ mulps %xmm12, %xmm15 ;\ addps %xmm15, %xmm0 ;\ movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulps %xmm12, %xmm11 ;\ addps %xmm11, %xmm1 ;\ movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm12, %xmm13 ;\ mulps 28 * SIZE + 
2 * (xx) * SIZE(BO), %xmm12 ;\ addps %xmm13, %xmm2 ;\ movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm12, %xmm3 ;\ movaps 24 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 #define KERNEL4(xx) \ mulps %xmm14, %xmm15 ;\ addps %xmm15, %xmm4 ;\ movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulps %xmm14, %xmm11 ;\ addps %xmm11, %xmm5 ;\ movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm14, %xmm13 ;\ mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ addps %xmm13, %xmm6 ;\ movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm14, %xmm7 ;\ movaps 28 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 #define KERNEL5(xx) \ mulps %xmm8, %xmm9 ;\ addps %xmm9, %xmm0 ;\ movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulps %xmm8, %xmm11 ;\ PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ addps %xmm11, %xmm1 ;\ movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm8, %xmm13 ;\ mulps 44 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ addps %xmm13, %xmm2 ;\ movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm8, %xmm3 ;\ movaps 32 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 #define KERNEL6(xx) \ mulps %xmm10, %xmm9 ;\ addps %xmm9, %xmm4 ;\ movaps 64 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulps %xmm10, %xmm11 ;\ addps %xmm11, %xmm5 ;\ movaps 52 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm10, %xmm13 ;\ mulps 44 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ addps %xmm13, %xmm6 ;\ movaps 56 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm10, %xmm7 ;\ movaps 36 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 #define KERNEL7(xx) \ mulps %xmm12, %xmm15 ;\ addps %xmm15, %xmm0 ;\ movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulps %xmm12, %xmm11 ;\ addps %xmm11, %xmm1 ;\ movaps 52 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm12, %xmm13 ;\ mulps 60 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ addps %xmm13, %xmm2 ;\ movaps 56 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm12, %xmm3 ;\ movaps 40 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 #define KERNEL8(xx) \ mulps %xmm14, %xmm15 ;\ addps %xmm15, %xmm4 ;\ movaps 80 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulps %xmm14, %xmm11 ;\ addps %xmm11, %xmm5 ;\ movaps 68 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm14, %xmm13 ;\ mulps 60 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ addps %xmm13, %xmm6 ;\ movaps 72 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm14, %xmm7 ;\ movaps 44 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 #else movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 #endif movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq OLD_M, M movq OLD_N, N pxor %xmm15, %xmm15 cmpeqps %xmm15, %xmm15 pslld $31, %xmm15 # Generate mask pxor %xmm2, %xmm2 #ifndef CONJ movss %xmm15, 0 + POSINV movss %xmm2, 4 + POSINV movss %xmm15, 8 + POSINV movss %xmm2, 12 + POSINV #else movss %xmm2, 0 + POSINV movss %xmm15, 4 + POSINV movss %xmm2, 8 + POSINV movss %xmm15, 12 + POSINV #endif movlpd %xmm4, OFFSET movlpd %xmm4, KK salq $ZBASE_SHIFT, LDC 
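/* POSINV is the per-lane sign mask used by the complex multiplies below, and
   LDC has just been scaled from complex elements to bytes.  For LN the solve
   starts at the bottom of the panel, so A is moved forward by M*K elements and
   C by M elements before the kernel walks backwards; RT likewise advances B
   and C to their ends and seeds KK with N - OFFSET. */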
#ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $1, J # j = (n >> 2) jle .L40 ALIGN_4 .L01: #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B leaq (BO, %rax, 8), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L03 ALIGN_4 .L02: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) movaps 8 * SIZE(B), %xmm3 movaps 12 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 32 * SIZE(BO) movaps %xmm1, 36 * SIZE(BO) movaps %xmm2, 40 * SIZE(BO) movaps %xmm3, 44 * SIZE(BO) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm4, 48 * SIZE(BO) movaps %xmm5, 52 * SIZE(BO) movaps %xmm6, 56 * SIZE(BO) movaps %xmm7, 60 * SIZE(BO) addq $16 * SIZE, B addq $64 * SIZE, BO decq %rax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movaps 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) addq $ 4 * SIZE, B addq $16 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L10: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif testq $1, M je .L20 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movlps 0 * SIZE(AO), %xmm8 movhps 2 * SIZE(AO), %xmm8 movlps 8 * SIZE(AO), %xmm10 movhps 10 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L35 ALIGN_4 .L32: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), 
%xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movaps 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd 4 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movaps 80 * SIZE(BO), %xmm11 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm2 movaps 44 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 movsd 6 * SIZE(AO), %xmm8 addps %xmm13, %xmm3 movaps 96 * SIZE(BO), %xmm13 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm2 movaps 60 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 movsd 16 * SIZE(AO), %xmm8 addps %xmm15, %xmm3 movaps 112 * SIZE(BO), %xmm15 mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movaps 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movaps 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm2 movaps 76 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movsd 10 * SIZE(AO), %xmm10 addps %xmm9, %xmm3 movaps 128 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movaps 84 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movaps 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm2 movaps 92 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd 12 * SIZE(AO), %xmm10 addps %xmm11, %xmm3 movaps 144 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movaps 104 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movaps 108 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd 14 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movaps 160 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movaps 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movaps 124 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd 24 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movaps 176 * SIZE(BO), %xmm15 addq $16 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movaps POSINV, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 16 * SIZE(BO), %xmm9 addq $ 2 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L36 ALIGN_4 .L38: shufps $0xb1, %xmm1, %xmm1 shufps $0xb1, %xmm3, %xmm3 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm2 #endif #else xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #endif addps %xmm1, %xmm0 addps %xmm3, %xmm2 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) unpcklpd %xmm2, %xmm0 movaps 0 * SIZE(B), %xmm2 subps %xmm0, %xmm2 #else #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(AO), %xmm1 
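/* The "#ifdef movsd" guards here are only active when the OPTERON substitution
   near the top of this file has redefined movsd as movlps; movlps leaves the
   upper 64 bits of the register unchanged (movsd would zero them), so each
   guarded load is preceded by an explicit xorps to clear the destination. */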
#ifdef movsd xorps %xmm5, %xmm5 #endif movsd 2 * SIZE(AO), %xmm5 subps %xmm0, %xmm1 subps %xmm2, %xmm5 #endif #ifdef LN movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 #endif #ifdef RN movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm0 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 addps %xmm0, %xmm1 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm0 pshufd $0xf5, %xmm1, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 subps %xmm0, %xmm5 subps %xmm2, %xmm5 movaps 4 * SIZE(B), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm4 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm15, %xmm5 #else xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm4 mulps %xmm10, %xmm5 addps %xmm4, %xmm5 #endif #ifdef RT movaps 4 * SIZE(B), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm0 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm15, %xmm5 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm5 addps %xmm0, %xmm5 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm0 pshufd $0xf5, %xmm5, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 subps %xmm0, %xmm1 subps %xmm2, %xmm1 movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm4 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm4 mulps %xmm10, %xmm1 addps %xmm4, %xmm1 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm2, 0 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm5 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm4, 8 * SIZE(BO) movaps %xmm5, 12 * SIZE(BO) movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO2) #else movlps %xmm1, 0 * SIZE(AO) movlps %xmm5, 2 * SIZE(AO) movlps %xmm1, 0 * SIZE(CO1) movlps %xmm5, 0 * SIZE(CO2) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L20: testq $2, M je .L30 #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 32 * SIZE(AO), %xmm12 movaps 48 * 
SIZE(AO), %xmm14 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps 8 * SIZE(AO), %xmm8 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 mulps 44 * SIZE(BO), %xmm8 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm8, %xmm3 movaps 12 * SIZE(AO), %xmm8 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 mulps 60 * SIZE(BO), %xmm8 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm8, %xmm3 movaps 32 * SIZE(AO), %xmm8 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movaps 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movaps 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 mulps 76 * SIZE(BO), %xmm10 addps %xmm9, %xmm2 movaps 128 * SIZE(BO), %xmm9 addps %xmm10, %xmm3 movaps 20 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movaps 84 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movaps 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 mulps 92 * SIZE(BO), %xmm10 addps %xmm11, %xmm2 movaps 144 * SIZE(BO), %xmm11 addps %xmm10, %xmm3 movaps 24 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movaps 104 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 mulps 108 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 160 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps 28 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movaps 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 mulps 124 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 176 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 48 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movaps POSINV, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 16 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 4 * SIZE(AO), %xmm8 addq $ 4 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L26 ALIGN_4 .L28: shufps $0xb1, %xmm1, %xmm1 shufps $0xb1, %xmm3, %xmm3 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 xorps 
%xmm15, %xmm2 #endif #else xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #endif addps %xmm1, %xmm0 addps %xmm3, %xmm2 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm1 unpcklpd %xmm2, %xmm0 unpckhpd %xmm2, %xmm1 movaps 0 * SIZE(B), %xmm2 movaps 4 * SIZE(B), %xmm3 subps %xmm0, %xmm2 subps %xmm1, %xmm3 #else movaps 0 * SIZE(AO), %xmm1 movaps 4 * SIZE(AO), %xmm5 subps %xmm0, %xmm1 subps %xmm2, %xmm5 #endif #ifdef LN movaps 4 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 movaps 4 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 #endif #ifdef RN movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm0 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 addps %xmm0, %xmm1 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm0 pshufd $0xf5, %xmm1, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 subps %xmm0, %xmm5 subps %xmm2, %xmm5 movaps 4 * SIZE(B), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm4 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm15, %xmm5 #else xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm4 mulps %xmm10, %xmm5 addps %xmm4, %xmm5 #endif #ifdef RT movaps 4 * SIZE(B), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm0 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm15, %xmm5 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm5 addps %xmm0, %xmm5 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm0 pshufd $0xf5, %xmm5, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 subps %xmm0, %xmm1 subps %xmm2, %xmm1 movaps 0 * SIZE(B), %xmm8 
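/* Complex multiply idiom used by the solve steps: pshufd $0x44/$0x11 produce
   the packed entry and its real/imaginary-swapped copy, pshufd $0xa0/$0xf5
   splat the real and imaginary parts of the operand, and the xorps with the
   POSINV mask (in %xmm15) supplies the sign flip, so the mulps/addps pair
   yields (ar*br - ai*bi, ar*bi + ai*br) per complex element; under CONJ the
   xorps target switches so the product is conjugated instead. */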
pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm4 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm4 mulps %xmm10, %xmm1 addps %xmm4, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm2, 0 * SIZE(B) movaps %xmm3, 4 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm5 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm4, 8 * SIZE(BO) movaps %xmm5, 12 * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm4 pshufd $0xff, %xmm3, %xmm5 movaps %xmm0, 16 * SIZE(BO) movaps %xmm1, 20 * SIZE(BO) movaps %xmm4, 24 * SIZE(BO) movaps %xmm5, 28 * SIZE(BO) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO2) movhps %xmm3, 2 * SIZE(CO2) #else movaps %xmm1, 0 * SIZE(AO) movaps %xmm5, 4 * SIZE(AO) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm5, 0 * SIZE(CO2) movhps %xmm5, 2 * SIZE(CO2) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: movq M, I sarq $2, I # i = (m >> 2) jle .L39 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(BO), %xmm9 movaps 4 * SIZE(BO), %xmm11 movaps 8 * SIZE(BO), %xmm13 movaps 16 * SIZE(BO), %xmm15 movaps 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movaps 4 * SIZE(AO), %xmm10 pxor %xmm1, %xmm1 movaps 8 * SIZE(AO), %xmm12 pxor %xmm2, %xmm2 movaps 12 * SIZE(AO), %xmm14 pxor %xmm3, %xmm3 PREFETCHW -8 * SIZE(CO1) pxor %xmm4, %xmm4 PREFETCHW -8 * SIZE(CO2) pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-8, %rax salq $4, %rax je .L15 .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) addq $32 * 2 * SIZE, AO addq $64 * 2 * SIZE, BO subq $64 * 2, %rax jg .L1X .L12: leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movaps POSINV, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 8 * SIZE(AO), %xmm8 mulps %xmm10, %xmm9 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm5 movaps 8 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 mulps 12 * SIZE(BO), %xmm10 addps %xmm9, %xmm6 movaps 16 * SIZE(BO), %xmm9 addps %xmm10, %xmm7 movaps 12 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO # aoffset += 4 addq $16 * 
SIZE, BO # boffset1 += 8 decq %rax jg .L16 ALIGN_4 .L18: shufps $0xb1, %xmm1, %xmm1 shufps $0xb1, %xmm3, %xmm3 shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 xorps %xmm15, %xmm5 xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm2 xorps %xmm15, %xmm4 xorps %xmm15, %xmm6 #endif #else xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 xorps %xmm15, %xmm5 xorps %xmm15, %xmm7 #endif addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm5, %xmm4 addps %xmm7, %xmm6 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 2), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm1 unpcklpd %xmm2, %xmm0 unpckhpd %xmm2, %xmm1 movaps %xmm4, %xmm5 unpcklpd %xmm6, %xmm4 unpckhpd %xmm6, %xmm5 movaps 0 * SIZE(B), %xmm2 movaps 4 * SIZE(B), %xmm3 movaps 8 * SIZE(B), %xmm6 movaps 12 * SIZE(B), %xmm7 subps %xmm0, %xmm2 subps %xmm1, %xmm3 subps %xmm4, %xmm6 subps %xmm5, %xmm7 #else movaps 0 * SIZE(AO), %xmm1 movaps 4 * SIZE(AO), %xmm3 movaps 8 * SIZE(AO), %xmm5 movaps 12 * SIZE(AO), %xmm7 subps %xmm0, %xmm1 subps %xmm4, %xmm3 subps %xmm2, %xmm5 subps %xmm6, %xmm7 #endif #ifdef LN movaps 28 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm7 addps %xmm0, %xmm7 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm6 subps %xmm1, %xmm6 movaps 24 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 20 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm6 #ifndef CONJ xorps %xmm15, %xmm6 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm6 addps %xmm0, %xmm6 movaps 16 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 8 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef 
CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 movaps 4 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm6 subps %xmm1, %xmm6 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm7 subps %xmm1, %xmm7 movaps 8 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 movaps 12 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm6 subps %xmm1, %xmm6 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm7 subps %xmm1, %xmm7 movaps 20 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm6 #ifndef CONJ xorps %xmm15, %xmm6 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm6 addps %xmm0, %xmm6 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm7 subps %xmm1, %xmm7 movaps 28 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm7 addps %xmm0, %xmm7 #endif #ifdef RN movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm0 pshufd $0xf5, %xmm1, %xmm1 pshufd $0xa0, %xmm3, %xmm2 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm2 #endif mulps %xmm9, %xmm0 mulps %xmm9, %xmm2 mulps %xmm10, %xmm1 mulps %xmm10, %xmm3 addps %xmm0, %xmm1 addps %xmm2, %xmm3 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm0 pshufd $0xf5, %xmm1, 
%xmm2 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm6 #ifndef CONJ xorps %xmm15, %xmm2 xorps %xmm15, %xmm6 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm0 mulps %xmm9, %xmm4 mulps %xmm10, %xmm2 mulps %xmm10, %xmm6 subps %xmm0, %xmm5 subps %xmm4, %xmm7 subps %xmm2, %xmm5 subps %xmm6, %xmm7 movaps 4 * SIZE(B), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm4 pshufd $0xf5, %xmm5, %xmm5 pshufd $0xa0, %xmm7, %xmm6 pshufd $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps %xmm15, %xmm5 xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm4 xorps %xmm15, %xmm6 #endif mulps %xmm9, %xmm4 mulps %xmm9, %xmm6 mulps %xmm10, %xmm5 mulps %xmm10, %xmm7 addps %xmm4, %xmm5 addps %xmm6, %xmm7 #endif #ifdef RT movaps 4 * SIZE(B), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm0 pshufd $0xf5, %xmm5, %xmm5 pshufd $0xa0, %xmm7, %xmm2 pshufd $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps %xmm15, %xmm5 xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm2 #endif mulps %xmm9, %xmm0 mulps %xmm9, %xmm2 mulps %xmm10, %xmm5 mulps %xmm10, %xmm7 addps %xmm0, %xmm5 addps %xmm2, %xmm7 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm0 pshufd $0xf5, %xmm5, %xmm2 pshufd $0xa0, %xmm7, %xmm4 pshufd $0xf5, %xmm7, %xmm6 #ifndef CONJ xorps %xmm15, %xmm2 xorps %xmm15, %xmm6 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm0 mulps %xmm9, %xmm4 mulps %xmm10, %xmm2 mulps %xmm10, %xmm6 subps %xmm0, %xmm1 subps %xmm4, %xmm3 subps %xmm2, %xmm1 subps %xmm6, %xmm3 movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm4 pshufd $0xf5, %xmm1, %xmm1 pshufd $0xa0, %xmm3, %xmm6 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm4 xorps %xmm15, %xmm6 #endif mulps %xmm9, %xmm4 mulps %xmm9, %xmm6 mulps %xmm10, %xmm1 mulps %xmm10, %xmm3 addps %xmm4, %xmm1 addps %xmm6, %xmm3 #endif #ifdef LN subq $8 * SIZE, CO1 subq $8 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm2, 0 * SIZE(B) movaps %xmm3, 4 * SIZE(B) movaps %xmm6, 8 * SIZE(B) movaps %xmm7, 12 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm5 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm4, 8 * SIZE(BO) movaps %xmm5, 12 * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm4 pshufd $0xff, %xmm3, %xmm5 movaps %xmm0, 16 * SIZE(BO) movaps %xmm1, 20 * SIZE(BO) movaps %xmm4, 24 * SIZE(BO) movaps %xmm5, 28 * SIZE(BO) pshufd $0x00, %xmm6, %xmm0 pshufd $0x55, %xmm6, %xmm1 pshufd $0xaa, %xmm6, %xmm4 pshufd $0xff, %xmm6, %xmm5 movaps %xmm0, 32 * SIZE(BO) movaps %xmm1, 36 * SIZE(BO) movaps %xmm4, 40 * SIZE(BO) movaps %xmm5, 44 * SIZE(BO) pshufd $0x00, %xmm7, %xmm0 pshufd $0x55, %xmm7, %xmm1 pshufd $0xaa, %xmm7, %xmm4 pshufd $0xff, %xmm7, %xmm5 movaps %xmm0, 48 * SIZE(BO) movaps %xmm1, 52 * SIZE(BO) movaps %xmm4, 56 * SIZE(BO) movaps %xmm5, 60 * SIZE(BO) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) movlps %xmm6, 4 * SIZE(CO1) movlps %xmm7, 6 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO2) movhps %xmm3, 2 * SIZE(CO2) movhps %xmm6, 4 * SIZE(CO2) movhps %xmm7, 6 * SIZE(CO2) #else movaps %xmm1, 0 * SIZE(AO) movaps %xmm3, 4 * SIZE(AO) movaps %xmm5, 8 * SIZE(AO) movaps %xmm7, 12 * SIZE(AO) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm3, 4 * SIZE(CO1) movhps %xmm3, 6 * SIZE(CO1) movlps %xmm5, 0 * SIZE(CO2) 
movhps %xmm5, 2 * SIZE(CO2) movlps %xmm7, 4 * SIZE(CO2) movhps %xmm7, 6 * SIZE(CO2) #endif #ifndef LN addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO #ifdef LT addq $16 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L11 ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2 * COMPSIZE), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 2 * COMPSIZE), B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif decq J # j -- jg .L01 ALIGN_4 .L40: testq $1, N je .L999 ALIGN_4 #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L43 ALIGN_4 .L42: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L42 ALIGN_4 .L43: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L50 ALIGN_4 .L44: movsd 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) addq $2 * SIZE, B addq $8 * SIZE, BO decq %rax jne .L44 ALIGN_4 .L50: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT subq LDC, C #endif movq C, CO1 # coffset1 = c #ifndef RT addq LDC, C #endif testq $1, M je .L60 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movsd 0 * SIZE(AO), %xmm8 movhps 2 * SIZE(AO), %xmm8 movsd 8 * SIZE(AO), %xmm10 movhps 10 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd 6 
* SIZE(AO), %xmm8 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movaps 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd 16 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movaps 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd 10 * SIZE(AO), %xmm10 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movaps 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd 12 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movaps 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd 14 * SIZE(AO), %xmm10 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movaps 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd 24 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movaps 112 * SIZE(BO), %xmm15 addq $16 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movaps POSINV, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L76 ALIGN_4 .L78: addps %xmm2, %xmm0 addps %xmm3, %xmm1 shufps $0xb1, %xmm1, %xmm1 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif #else xorps %xmm15, %xmm1 #endif addps %xmm1, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 0 * SIZE(B), %xmm2 subps %xmm0, %xmm2 #else #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(AO), %xmm1 subps %xmm0, %xmm1 #endif #if defined(LN) || defined(LT) movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 #endif #if defined(RN) || defined(RT) movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm4 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm4 mulps %xmm10, %xmm1 addps %xmm4, %xmm1 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movlps %xmm2, 0 * SIZE(CO1) #else movlps %xmm1, 0 * SIZE(AO) movlps %xmm1, 0 * SIZE(CO1) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L60: testq $2, M je .L70 #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || 
defined(RT) movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulps %xmm8, %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 8 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 20 * SIZE(BO), %xmm8 addps %xmm11, %xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm8, %xmm1 movaps 12 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps 32 * SIZE(AO), %xmm8 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm13 mulps 36 * SIZE(BO), %xmm10 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm10, %xmm1 movaps 20 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 mulps 44 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps 24 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 52 * SIZE(BO), %xmm10 addps %xmm15, %xmm0 movaps 56 * SIZE(BO), %xmm15 addps %xmm10, %xmm1 movaps 28 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 60 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 48 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movaps POSINV, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L68: addps %xmm2, %xmm0 addps %xmm3, %xmm1 shufps $0xb1, %xmm1, %xmm1 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif #else xorps %xmm15, %xmm1 #endif addps %xmm1, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm1 unpcklpd %xmm2, %xmm0 unpckhpd %xmm2, %xmm1 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 0 * SIZE(B), %xmm2 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 2 * SIZE(B), %xmm3 subps %xmm0, %xmm2 subps %xmm1, %xmm3 #else movaps 0 * SIZE(AO), %xmm1 subps %xmm0, %xmm1 #endif #ifdef LN movaps 4 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 
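/* Note on the complex arithmetic idiom used throughout these solve blocks:
 * pshufd $0xa0 / $0xf5 duplicate the real and imaginary parts of the
 * right-hand-side values into both lanes, while $0x44/$0x11 (or $0xee/$0xbb
 * for the high element) duplicate the real and imaginary parts of one packed
 * A entry.  xmm15 holds POSINV, so the xorps flips the sign of alternating
 * lanes and the mulps/addps pair yields the packed complex product
 *
 *     cr = ar * br - ai * bi;
 *     ci = ar * bi + ai * br;
 *
 * (under CONJ the mask is applied to the other operand, giving the conjugated
 * product).  For the diagonal step below this acts as the division
 * x = b / a(i,i): the packed diagonal entry is expected to hold the already
 * inverted value, which is why only a multiply appears here -- the generic C
 * solve() in the neighbouring ztrsm_kernel_LN_bulldozer.c likewise multiplies
 * by the packed diagonal.  The subps pairs above use the same product to
 * update the remaining right-hand sides (b -= a * x).
 */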
movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 movaps 4 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 #endif #if defined(RN) || defined(RT) movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm4 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm4 mulps %xmm10, %xmm1 addps %xmm4, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) movlps %xmm3, 2 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 8 * SIZE(BO) movaps %xmm1, 12 * SIZE(BO) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) #else movaps %xmm1, 0 * SIZE(AO) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L70: movq M, I sarq $2, I # i = (m >> 2) jle .L79 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $2 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 32 * SIZE(AO), %xmm12 movaps 48 * SIZE(AO), %xmm14 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 PREFETCHW -8 * SIZE(CO1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L55 ALIGN_4 .L52: mulps %xmm8, %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 8 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, 
%xmm1 movaps 12 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 64 * SIZE(AO), %xmm8 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 16 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps 20 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps 24 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps 28 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 80 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps 80 * SIZE(AO), %xmm10 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 32 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 36 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm5 movaps 40 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 44 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 movaps 96 * SIZE(BO), %xmm13 addps %xmm12, %xmm5 movaps 96 * SIZE(AO), %xmm12 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 48 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 52 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 56 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 60 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 movaps 112 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 112 * SIZE(AO), %xmm14 addq $64 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movaps POSINV, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 8 * SIZE(AO), %xmm8 addq $ 8 * SIZE, AO # aoffset += 4 addq $ 8 * SIZE, BO # boffset1 += 8 decq %rax jg .L56 ALIGN_4 .L58: shufps $0xb1, %xmm1, %xmm1 shufps $0xb1, %xmm5, %xmm5 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm15, %xmm1 xorps %xmm15, %xmm5 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm4 #endif #else xorps %xmm15, %xmm1 xorps %xmm15, %xmm5 #endif addps %xmm1, %xmm0 addps %xmm5, %xmm4 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm1 unpcklpd %xmm2, %xmm0 unpckhpd %xmm2, %xmm1 movaps %xmm4, %xmm5 unpcklpd %xmm6, %xmm4 unpckhpd %xmm6, %xmm5 #ifdef movsd xorps %xmm2, %xmm2 #endif 
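/* The xorps guarded by "#ifdef movsd" above clears the destination before the
 * movsd loads that follow, presumably because movsd may be remapped to an
 * instruction that does not zero the upper lanes.  At this point the
 * accumulator registers hold the products of this block with the already
 * solved unknowns (transposed by the unpcklpd/unpckhpd pairs in the LN/LT
 * case); the loads below fetch the packed right-hand side -- from B for
 * LN/LT, from AO for RN/RT -- and the subps pairs form b minus that
 * accumulated sum before the triangular solve proper.  The solved values are
 * later written back both to the packed copy (so the remaining GEMM updates
 * see them) and to the C matrix.
 */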
movsd 0 * SIZE(B), %xmm2 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 2 * SIZE(B), %xmm3 #ifdef movsd xorps %xmm6, %xmm6 #endif movsd 4 * SIZE(B), %xmm6 #ifdef movsd xorps %xmm7, %xmm7 #endif movsd 6 * SIZE(B), %xmm7 subps %xmm0, %xmm2 subps %xmm1, %xmm3 subps %xmm4, %xmm6 subps %xmm5, %xmm7 #else movaps 0 * SIZE(AO), %xmm1 movaps 4 * SIZE(AO), %xmm3 subps %xmm0, %xmm1 subps %xmm4, %xmm3 #endif #ifdef LN movaps 28 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm7 addps %xmm0, %xmm7 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm6 subps %xmm1, %xmm6 movaps 24 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 20 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm6 #ifndef CONJ xorps %xmm15, %xmm6 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm6 addps %xmm0, %xmm6 movaps 16 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 8 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 movaps 
4 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm6 subps %xmm1, %xmm6 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm7 subps %xmm1, %xmm7 movaps 8 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 movaps 12 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm6 subps %xmm1, %xmm6 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm7 subps %xmm1, %xmm7 movaps 20 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm6 #ifndef CONJ xorps %xmm15, %xmm6 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm6 addps %xmm0, %xmm6 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm7 subps %xmm1, %xmm7 movaps 28 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm7 addps %xmm0, %xmm7 #endif #if defined(RN) || defined(RT) movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm4 pshufd $0xf5, %xmm1, %xmm1 pshufd $0xa0, %xmm3, %xmm6 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm4 xorps %xmm15, %xmm6 #endif mulps %xmm9, %xmm4 mulps %xmm9, %xmm6 mulps %xmm10, %xmm1 mulps %xmm10, %xmm3 addps %xmm4, %xmm1 addps %xmm6, %xmm3 #endif #ifdef LN subq $8 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) movlps %xmm3, 2 * SIZE(B) movlps %xmm6, 4 * SIZE(B) movlps %xmm7, 6 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 8 * SIZE(BO) movaps %xmm1, 12 * SIZE(BO) pshufd $0x00, %xmm6, %xmm0 pshufd $0x55, %xmm6, %xmm1 movaps %xmm0, 16 * SIZE(BO) movaps %xmm1, 20 * SIZE(BO) pshufd $0x00, %xmm7, %xmm0 pshufd $0x55, %xmm7, %xmm1 movaps %xmm0, 24 * SIZE(BO) movaps %xmm1, 28 * SIZE(BO) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) movlps %xmm6, 4 * SIZE(CO1) movlps %xmm7, 6 * SIZE(CO1) #else movaps %xmm1, 0 * SIZE(AO) movaps %xmm3, 4 * SIZE(AO) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm3, 4 * SIZE(CO1) movhps %xmm3, 6 * SIZE(CO1) #endif #ifndef LN addq $8 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq 
$ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L51 ALIGN_4 .L79: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, COMPSIZE), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, COMPSIZE), B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq %rbx, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_LN_bulldozer.c000066400000000000000000000334641313527062700227320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include "common.h" static FLOAT dm1 = -1.; #ifdef CONJ #define GEMM_KERNEL GEMM_KERNEL_L #else #define GEMM_KERNEL GEMM_KERNEL_N #endif #if GEMM_DEFAULT_UNROLL_M == 1 #define GEMM_UNROLL_M_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_M == 2 #define GEMM_UNROLL_M_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_M == 4 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 6 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_M == 16 #define GEMM_UNROLL_M_SHIFT 4 #endif #if GEMM_DEFAULT_UNROLL_N == 1 #define GEMM_UNROLL_N_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_N == 2 #define GEMM_UNROLL_N_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_N == 4 #define GEMM_UNROLL_N_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_N == 8 #define GEMM_UNROLL_N_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_N == 16 #define GEMM_UNROLL_N_SHIFT 4 #endif #ifndef CONJ static void ztrsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); static void ztrsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) { FLOAT *c1 = c + ldc*2 ; BLASLONG n1 = n * 4; BLASLONG i=0; __asm__ __volatile__ ( " vzeroupper \n\t" " prefetcht0 (%4) \n\t" " prefetcht0 (%5) \n\t" " vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" " vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" " vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" " vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" " vxorpd %%xmm12, %%xmm12, %%xmm12 \n\t" " vxorpd %%xmm13, %%xmm13, %%xmm13 \n\t" " vxorpd %%xmm14, %%xmm14, %%xmm14 \n\t" " vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t" " cmpq $0, %0 \n\t" " je 3f \n\t" " .align 16 \n\t" "1: \n\t" " prefetcht0 256(%3,%1,8) \n\t" " prefetcht0 256(%2,%1,8) \n\t" " vmovddup (%3,%1,8), %%xmm0 \n\t" // b0 real, b0 real " vmovddup 8(%3,%1,8), %%xmm1 \n\t" // b0 imag, b0 imag " vmovups (%2,%1,8), %%xmm4 \n\t" // a0 real , a0 imag " vmovups 16(%2,%1,8), %%xmm5 \n\t" // a1 real , a1 imag " vmovddup 16(%3,%1,8), %%xmm2 \n\t" // b1 real, b1 real " vmovddup 24(%3,%1,8), %%xmm3 \n\t" // b1 imag, b1 imag " vfnmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm4 , %%xmm9 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm10, %%xmm0 , %%xmm5 , %%xmm10 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm11, %%xmm1 , %%xmm5 , %%xmm11 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm12, %%xmm2 , %%xmm4 , %%xmm12 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm13, %%xmm3 , %%xmm4 , %%xmm13 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm14, %%xmm2 , %%xmm5 , %%xmm14 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm15, %%xmm3 , %%xmm5 , %%xmm15 \n\t" // a_real * b_imag , a_imag * b_imag " addq $4, %1 \n\t" " cmpq %1, %0 \n\t" " jz 2f \n\t" " vmovddup (%3,%1,8), %%xmm0 \n\t" // b0 real, b0 real " vmovddup 8(%3,%1,8), %%xmm1 \n\t" // b0 imag, b0 imag " vmovups (%2,%1,8), %%xmm4 \n\t" // a0 real , a0 imag " vmovups 16(%2,%1,8), %%xmm5 \n\t" // a1 real , a1 imag " vmovddup 16(%3,%1,8), %%xmm2 \n\t" // b1 real, b1 real " vmovddup 24(%3,%1,8), %%xmm3 \n\t" // b1 imag, b1 imag " vfnmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm4 , %%xmm9 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm10, %%xmm0 , %%xmm5 , %%xmm10 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd 
%%xmm11, %%xmm1 , %%xmm5 , %%xmm11 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm12, %%xmm2 , %%xmm4 , %%xmm12 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm13, %%xmm3 , %%xmm4 , %%xmm13 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm14, %%xmm2 , %%xmm5 , %%xmm14 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm15, %%xmm3 , %%xmm5 , %%xmm15 \n\t" // a_real * b_imag , a_imag * b_imag " addq $4, %1 \n\t" " cmpq %1, %0 \n\t" " jnz 1b \n\t" "2: \n\t" " vshufpd $0x01 , %%xmm9 , %%xmm9, %%xmm9 \n\t" " vshufpd $0x01 , %%xmm11 , %%xmm11 , %%xmm11 \n\t" " vshufpd $0x01 , %%xmm13 , %%xmm13 , %%xmm13 \n\t" " vshufpd $0x01 , %%xmm15 , %%xmm15 , %%xmm15 \n\t" " vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" " vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" " vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t" " vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t" " vxorpd %%xmm7 , %%xmm7 , %%xmm7 \n\t" " vaddsubpd %%xmm8 , %%xmm7 , %%xmm8 \n\t" " vaddsubpd %%xmm10, %%xmm7 , %%xmm10 \n\t" " vaddsubpd %%xmm12, %%xmm7 , %%xmm12 \n\t" " vaddsubpd %%xmm14, %%xmm7 , %%xmm14 \n\t" " vmovups (%4) , %%xmm0 \n\t" " vmovups 16(%4) , %%xmm1 \n\t" " vmovups (%5) , %%xmm4 \n\t" " vmovups 16(%5) , %%xmm5 \n\t" " vaddpd %%xmm0 , %%xmm8 , %%xmm8 \n\t" " vaddpd %%xmm1 , %%xmm10, %%xmm10 \n\t" " vaddpd %%xmm4 , %%xmm12, %%xmm12 \n\t" " vaddpd %%xmm5 , %%xmm14, %%xmm14 \n\t" " vmovups %%xmm8 , (%4) \n\t" " vmovups %%xmm10 ,16(%4) \n\t" " vmovups %%xmm12 , (%5) \n\t" " vmovups %%xmm14 ,16(%5) \n\t" "3: \n\t" " vzeroupper \n\t" : : "r" (n1), // 0 "a" (i), // 1 "r" (a), // 2 "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 "r" (as), // 6 "r" (bs) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #endif #ifndef COMPLEX static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa, bb; int i, j, k; a += (m - 1) * m; b += (m - 1) * n; for (i = m - 1; i >= 0; i--) { aa = *(a + i); for (j = 0; j < n; j ++) { bb = *(c + i + j * ldc); bb *= aa; *b = bb; *(c + i + j * ldc) = bb; b ++; for (k = 0; k < i; k ++){ *(c + k + j * ldc) -= bb * *(a + k); } } a -= m; b -= 2 * n; } } #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa1, aa2; FLOAT bb1, bb2; FLOAT cc1, cc2; int i, j, k; ldc *= 2; a += (m - 1) * m * 2; b += (m - 1) * n * 2; for (i = m - 1; i >= 0; i--) { aa1 = *(a + i * 2 + 0); aa2 = *(a + i * 2 + 1); for (j = 0; j < n; j ++) { bb1 = *(c + i * 2 + 0 + j * ldc); bb2 = *(c + i * 2 + 1 + j * ldc); #ifndef CONJ cc1 = aa1 * bb1 - aa2 * bb2; cc2 = aa1 * bb2 + aa2 * bb1; #else cc1 = aa1 * bb1 + aa2 * bb2; cc2 = aa1 * bb2 - aa2 * bb1; #endif *(b + 0) = cc1; *(b + 1) = cc2; *(c + i * 2 + 0 + j * ldc) = cc1; *(c + i * 2 + 1 + j * ldc) = cc2; b += 2; for (k = 0; k < i; k ++){ #ifndef CONJ *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); #else *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); #endif } } a -= m * 2; b -= 4 * n; } } #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #ifdef COMPLEX FLOAT dummy2, #endif FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ BLASLONG i, j; FLOAT *aa, *cc; BLASLONG kk; #if 0 fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = 
%3ld k = %3ld offset = %3ld\n", m, n, k, offset); #endif j = (n >> GEMM_UNROLL_N_SHIFT); while (j > 0) { kk = m + offset; if (m & (GEMM_UNROLL_M - 1)) { for (i = 1; i < GEMM_UNROLL_M; i *= 2){ if (m & i) { aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; if (k - kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } solve(i, GEMM_UNROLL_N, aa + (kk - i) * i * COMPSIZE, b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); kk -= i; } } } i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; do { #ifdef CONJ if (k - kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); #else ztrsm_LN_solve_opt(k-kk, aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc, aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE); solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); #endif aa -= GEMM_UNROLL_M * k * COMPSIZE; cc -= GEMM_UNROLL_M * COMPSIZE; kk -= GEMM_UNROLL_M; i --; } while (i > 0); } b += GEMM_UNROLL_N * k * COMPSIZE; c += GEMM_UNROLL_N * ldc * COMPSIZE; j --; } if (n & (GEMM_UNROLL_N - 1)) { j = (GEMM_UNROLL_N >> 1); while (j > 0) { if (n & j) { kk = m + offset; if (m & (GEMM_UNROLL_M - 1)) { for (i = 1; i < GEMM_UNROLL_M; i *= 2){ if (m & i) { aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; if (k - kk > 0) { GEMM_KERNEL(i, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, b + j * kk * COMPSIZE, cc, ldc); } solve(i, j, aa + (kk - i) * i * COMPSIZE, b + (kk - i) * j * COMPSIZE, cc, ldc); kk -= i; } } } i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; do { if (k - kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + GEMM_UNROLL_M * kk * COMPSIZE, b + j * kk * COMPSIZE, cc, ldc); } solve(GEMM_UNROLL_M, j, aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_M) * j * COMPSIZE, cc, ldc); aa -= GEMM_UNROLL_M * k * COMPSIZE; cc -= GEMM_UNROLL_M * COMPSIZE; kk -= GEMM_UNROLL_M; i --; } while (i > 0); } b += j * k * COMPSIZE; c += j * ldc * COMPSIZE; } j >>= 1; } } return 0; } OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_LT_1x4_nehalem.S000066400000000000000000000672061313527062700230240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define KK %rdx #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCHSIZE (8 * 1 + 2) #define PREFETCH prefetcht0 #define ADD1 addpd #define ADD2 addpd PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif subq $-16 * SIZE, A subq $-16 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K movq OLD_LDC, LDC movq OLD_OFFSET, KK salq $ZBASE_SHIFT, LDC movq KK, OFFSET negq KK #ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RT movq N, KK subq OFFSET, KK #endif testq M, M jle .L999 movq N, J sarq $2, J NOBRANCH jle .L20 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $2 + ZBASE_SHIFT, %rax subq %rax, B leaq (, LDC, 4), %rax 
subq %rax, C #endif movq C, CO1 leaq (C, LDC, 2), CO2 #ifndef RT leaq (C, LDC, 4), C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif movq K, %rax salq $ZBASE_SHIFT + 2, %rax leaq (B, %rax), BB #ifdef LT movq OFFSET, KK #endif movq M, I ALIGN_4 .L11: #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #else movq B, BO #endif prefetchnta -16 * SIZE(BB) subq $-8 * SIZE, BB xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht0 2 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 2 * SIZE(CO1, LDC) xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 xorps %xmm12, %xmm12 prefetcht0 2 * SIZE(CO2) xorps %xmm13, %xmm13 prefetcht0 2 * SIZE(CO2, LDC) xorps %xmm14, %xmm14 xorps %xmm15, %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm14 movaps -14 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 ADD1 %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 ADD1 %xmm1, %xmm12 movaps -8 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 movaps -14 * SIZE(AO), %xmm0 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm14 movaps -6 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 ADD1 %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -2 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 ADD1 %xmm1, %xmm12 movaps 0 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 movaps -12 * SIZE(AO), %xmm0 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm14 movaps 2 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 ADD1 %xmm1, %xmm8 movaps 4 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps 6 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -10 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm12 movaps 8 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm14 movaps 10 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 ADD1 %xmm1, %xmm8 movaps 12 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps 14 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: ADD1 %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm1 ADD2 
%xmm2, %xmm13 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm14 movaps -14 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 ADD1 %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #endif ADD1 %xmm1, %xmm12 ADD2 %xmm2, %xmm13 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 xorps %xmm0, %xmm12 xorps %xmm0, %xmm14 #else shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 xorps %xmm0, %xmm13 xorps %xmm0, %xmm15 #endif #else #ifndef CONJ shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 xorps %xmm0, %xmm12 xorps %xmm0, %xmm14 #else shufps $0x04, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 xorps %xmm0, %xmm13 xorps %xmm0, %xmm15 #endif #endif haddpd %xmm9, %xmm8 haddpd %xmm11, %xmm10 haddpd %xmm13, %xmm12 haddpd %xmm15, %xmm14 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 movapd -12 * SIZE(BO), %xmm13 movapd -10 * SIZE(BO), %xmm15 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm11 movapd -12 * SIZE(AO), %xmm13 movapd -10 * SIZE(AO), %xmm15 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0x04, %xmm7, %xmm7 #else shufps $0x40, %xmm7, %xmm7 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm11, %xmm10 pshufd $0x4e, %xmm13, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 mulpd %xmm0, %xmm15 mulpd %xmm1, %xmm14 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 addpd %xmm12, %xmm13 addpd %xmm14, %xmm15 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 movddup -14 * SIZE(BO), %xmm2 movddup -13 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm9, %xmm2 mulpd %xmm8, %xmm3 subpd %xmm2, %xmm11 subpd %xmm3, %xmm11 movddup -12 * SIZE(BO), %xmm0 movddup -11 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm9, %xmm0 mulpd %xmm8, %xmm1 subpd %xmm0, %xmm13 subpd %xmm1, %xmm13 movddup -10 * SIZE(BO), %xmm2 movddup -9 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm9, %xmm2 mulpd %xmm8, %xmm3 subpd %xmm2, %xmm15 subpd %xmm3, %xmm15 movddup -6 * SIZE(BO), %xmm0 movddup -5 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm10, %xmm11 movddup -4 * SIZE(BO), %xmm0 movddup -3 * SIZE(BO), %xmm1 pshufd $0x4e, 
%xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm11, %xmm0 mulpd %xmm10, %xmm1 subpd %xmm0, %xmm13 subpd %xmm1, %xmm13 movddup -2 * SIZE(BO), %xmm2 movddup -1 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm11, %xmm2 mulpd %xmm10, %xmm3 subpd %xmm2, %xmm15 subpd %xmm3, %xmm15 movddup 4 * SIZE(BO), %xmm0 movddup 5 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm12, %xmm13 movddup 6 * SIZE(BO), %xmm2 movddup 7 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm13, %xmm2 mulpd %xmm12, %xmm3 subpd %xmm2, %xmm15 subpd %xmm3, %xmm15 movddup 14 * SIZE(BO), %xmm0 movddup 15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm14 mulpd %xmm0, %xmm15 mulpd %xmm1, %xmm14 addpd %xmm14, %xmm15 #endif #ifdef RT movddup 14 * SIZE(BO), %xmm0 movddup 15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm14 mulpd %xmm0, %xmm15 mulpd %xmm1, %xmm14 addpd %xmm14, %xmm15 movddup 12 * SIZE(BO), %xmm2 movddup 13 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm14 mulpd %xmm15, %xmm2 mulpd %xmm14, %xmm3 subpd %xmm2, %xmm13 subpd %xmm3, %xmm13 movddup 10 * SIZE(BO), %xmm0 movddup 11 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm14 mulpd %xmm15, %xmm0 mulpd %xmm14, %xmm1 subpd %xmm0, %xmm11 subpd %xmm1, %xmm11 movddup 8 * SIZE(BO), %xmm2 movddup 9 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm14 mulpd %xmm15, %xmm2 mulpd %xmm14, %xmm3 subpd %xmm2, %xmm9 subpd %xmm3, %xmm9 movddup 4 * SIZE(BO), %xmm0 movddup 5 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm12, %xmm13 movddup 2 * SIZE(BO), %xmm0 movddup 3 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm13, %xmm0 mulpd %xmm12, %xmm1 subpd %xmm0, %xmm11 subpd %xmm1, %xmm11 movddup 0 * SIZE(BO), %xmm2 movddup 1 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm13, %xmm2 mulpd %xmm12, %xmm3 subpd %xmm2, %xmm9 subpd %xmm3, %xmm9 movddup -6 * SIZE(BO), %xmm0 movddup -5 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm10, %xmm11 movddup -8 * SIZE(BO), %xmm2 movddup -7 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm11, %xmm2 mulpd %xmm10, %xmm3 subpd %xmm2, %xmm9 subpd %xmm3, %xmm9 movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO1, LDC) movhpd %xmm11, 1 * SIZE(CO1, LDC) movsd %xmm13, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movsd %xmm15, 0 * SIZE(CO2, LDC) movhpd %xmm15, 1 * SIZE(CO2, LDC) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm11, -14 * SIZE(BO) movapd %xmm13, -12 * SIZE(BO) movapd %xmm15, -10 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) movapd %xmm11, -14 * SIZE(AO) movapd %xmm13, -12 * SIZE(AO) movapd %xmm15, -10 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L11 
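/* End of the loop over M for the current group of four right-hand-side
 * columns (the 1x4 blocking this file is named for).  The code below advances
 * B past the packed four-column panel (LN recomputes the step from K, while
 * LT/RN simply take the running BO pointer) and moves the KK offset by four
 * in the direction of the sweep (RN counts up, RT counts down) before the
 * outer J loop starts the next column group; the N&2 and N&1 tail cases that
 * follow repeat the same pattern with two columns and one column.
 */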
#ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif subq $1, J BRANCH jg .L01 ALIGN_4 .L20: testq $2, N BRANCH jle .L30 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif #ifdef LT movq OFFSET, KK #endif movq M, I ALIGN_4 .L21: #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht0 2 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 2 * SIZE(CO2) xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_3 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -14 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -12 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -8 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -6 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -10 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -2 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L22 ALIGN_3 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: ADD1 %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -14 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_3 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 ADD1 %xmm3, %xmm10 ADD2 %xmm4, %xmm11 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 #else shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 #endif #else #ifndef CONJ shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm8 xorps %xmm0, 
%xmm10 #else shufps $0x04, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 #endif #endif haddpd %xmm9, %xmm8 haddpd %xmm11, %xmm10 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm11 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0x04, %xmm7, %xmm7 #else shufps $0x40, %xmm7, %xmm7 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 movddup -14 * SIZE(BO), %xmm2 movddup -13 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm9, %xmm2 mulpd %xmm8, %xmm3 subpd %xmm2, %xmm11 subpd %xmm3, %xmm11 movddup -10 * SIZE(BO), %xmm0 movddup -9 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm10, %xmm11 #endif #ifdef RT movddup -10 * SIZE(BO), %xmm0 movddup -9 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm10, %xmm11 movddup -12 * SIZE(BO), %xmm2 movddup -11 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm11, %xmm2 mulpd %xmm10, %xmm3 subpd %xmm2, %xmm9 subpd %xmm3, %xmm9 movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO2) movhpd %xmm11, 1 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm11, -14 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) movapd %xmm11, -14 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L21 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L30: testq $1, N BRANCH jle .L999 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif #ifdef LT movq OFFSET, KK #endif movq M, I ALIGN_4 .L31: #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm8, %xmm8 prefetcht0 2 * SIZE(CO1) xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq 
$2, %rax NOBRANCH jle .L35 ALIGN_3 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm10 movaps -14 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm11 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm11 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L32 addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 ALIGN_3 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH je .L38 ALIGN_3 .L36: ADD1 %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_3 .L38: #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm8 #else shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm9 #endif #else #ifndef CONJ shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm8 #else shufps $0x04, %xmm0, %xmm0 xorps %xmm0, %xmm9 #endif #endif haddpd %xmm9, %xmm8 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 subpd %xmm8, %xmm9 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm11 subpd %xmm8, %xmm9 #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0x04, %xmm7, %xmm7 #else shufps $0x40, %xmm7, %xmm7 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #if defined(RN) || defined(RT) movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L31 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 
192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_LT_2x1_atom.S000066400000000000000000000436621313527062700223510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %r13 #define BO %r14 #define CO1 %r15 #define BB %rbx #define KK %rbp #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define KKK 56(%rsp) #define AORIG 64(%rsp) #else #define STACKSIZE 256 #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define KKK 232(%rsp) #define AORIG 240(%rsp) #endif #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 8 + 3) #ifndef CONJ #define ADDSD1 addsd #define ADDSD2 addsd #define ADDSD3 addsd #define ADDSD4 subsd #elif defined(LN) || defined(LT) #define ADDSD1 addsd #define ADDSD2 addsd #define ADDSD3 subsd #define ADDSD4 addsd #else #define ADDSD1 addsd #define ADDSD2 subsd #define ADDSD3 addsd #define ADDSD4 addsd #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #endif movq OLD_LDC, LDC movq OLD_OFFSET, KK movq KK, OFFSET salq $ZBASE_SHIFT, LDC #ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, KK subq OFFSET, KK #endif movq N, J testq N, N jle .L999 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif #ifdef LT movq OFFSET, KK #endif movq K, %rax salq $ZBASE_SHIFT, %rax leaq (B, %rax), BB movq M, I sarq $1, I jle .L20 ALIGN_4 .L10: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #else movq B, BO #endif prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd 1 * SIZE(AO), %xmm4 xorps %xmm5, %xmm5 movsd 2 * SIZE(AO), %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movsd 1 * SIZE(BO), %xmm3 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 prefetcht0 3 * SIZE(CO1) xorps %xmm12, %xmm12 xorps %xmm13, %xmm13 xorps %xmm14, %xmm14 xorps %xmm15, %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L15 ALIGN_4 .L12: ADDSD2 %xmm2, %xmm13 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD3 %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm15 PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO) movaps %xmm4, 
%xmm6 mulsd %xmm1, %xmm4 ADDSD1 %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 ADDSD2 %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 ADDSD3 %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 2 * SIZE(BO), %xmm1 ADDSD1 %xmm5, %xmm12 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 ADDSD2 %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD3 %xmm7, %xmm14 movsd 7 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 ADDSD1 %xmm0, %xmm8 movsd 8 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 ADDSD2 %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 ADDSD3 %xmm4, %xmm10 movsd 9 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 4 * SIZE(BO), %xmm1 ADDSD1 %xmm5, %xmm12 movsd 10 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 5 * SIZE(BO), %xmm3 ADDSD2 %xmm2, %xmm13 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD3 %xmm7, %xmm14 movsd 11 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 ADDSD1 %xmm0, %xmm8 movsd 12 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 ADDSD2 %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 ADDSD3 %xmm4, %xmm10 movsd 13 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 6 * SIZE(BO), %xmm1 ADDSD1 %xmm5, %xmm12 movsd 14 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 7 * SIZE(BO), %xmm3 ADDSD2 %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD3 %xmm7, %xmm14 movsd 15 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 subq $-16 * SIZE, AO ADDSD4 %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 ADDSD1 %xmm0, %xmm8 movsd 0 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 ADDSD2 %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 addq $ 8 * SIZE, BO ADDSD3 %xmm4, %xmm10 movsd 1 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 decq %rax ADDSD4 %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 0 * SIZE(BO), %xmm1 ADDSD1 %xmm5, %xmm12 movsd 2 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 1 * SIZE(BO), %xmm3 jne .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH BRANCH je .L18 ALIGN_4 .L16: ADDSD2 %xmm2, %xmm13 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD3 %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 ADDSD1 %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 ADDSD2 %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 ADDSD3 %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm2 ADDSD4 %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 2 * SIZE(BO), %xmm1 ADDSD1 %xmm5, %xmm12 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 addq $4 * SIZE, AO addq $2 * SIZE, BO decq %rax BRANCH jg .L16 ALIGN_4 .L18: ADDSD2 %xmm2, %xmm13 ADDSD3 %xmm7, %xmm14 ADDSD4 %xmm6, %xmm15 addsd %xmm11, %xmm8 addsd %xmm9, %xmm10 addsd %xmm15, %xmm12 addsd %xmm13, %xmm14 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 movsd 1 * SIZE(BO), %xmm1 movsd 2 * SIZE(BO), %xmm2 movsd 3 * SIZE(BO), %xmm3 #else movsd 0 * SIZE(AO), %xmm0 movsd 1 * SIZE(AO), %xmm1 movsd 2 * SIZE(AO), %xmm2 movsd 3 * SIZE(AO), %xmm3 #endif subsd %xmm8, %xmm0 subsd 
%xmm10, %xmm1 subsd %xmm12, %xmm2 subsd %xmm14, %xmm3 #ifdef LN movsd 6 * SIZE(AO), %xmm6 movsd 7 * SIZE(AO), %xmm7 movaps %xmm2, %xmm5 movaps %xmm3, %xmm4 mulsd %xmm6, %xmm2 mulsd %xmm6, %xmm3 movsd 4 * SIZE(AO), %xmm6 mulsd %xmm7, %xmm5 mulsd %xmm7, %xmm4 movsd 5 * SIZE(AO), %xmm7 ADDSD4 %xmm4, %xmm2 ADDSD3 %xmm5, %xmm3 movaps %xmm2, %xmm4 movaps %xmm3, %xmm5 mulsd %xmm6, %xmm4 mulsd %xmm7, %xmm5 mulsd %xmm3, %xmm6 mulsd %xmm2, %xmm7 subsd %xmm4, %xmm0 subsd %xmm6, %xmm1 movsd 0 * SIZE(AO), %xmm6 ADDSD3 %xmm5, %xmm0 ADDSD4 %xmm7, %xmm1 movsd 1 * SIZE(AO), %xmm7 movaps %xmm0, %xmm5 movaps %xmm1, %xmm4 mulsd %xmm6, %xmm0 mulsd %xmm6, %xmm1 mulsd %xmm7, %xmm5 mulsd %xmm7, %xmm4 ADDSD4 %xmm4, %xmm0 ADDSD3 %xmm5, %xmm1 #endif #ifdef LT movsd 0 * SIZE(AO), %xmm6 movsd 1 * SIZE(AO), %xmm7 movaps %xmm0, %xmm5 movaps %xmm1, %xmm4 mulsd %xmm6, %xmm0 mulsd %xmm6, %xmm1 movsd 2 * SIZE(AO), %xmm6 mulsd %xmm7, %xmm5 mulsd %xmm7, %xmm4 movsd 3 * SIZE(AO), %xmm7 ADDSD4 %xmm4, %xmm0 ADDSD3 %xmm5, %xmm1 movaps %xmm0, %xmm4 movaps %xmm1, %xmm5 mulsd %xmm6, %xmm4 mulsd %xmm7, %xmm5 mulsd %xmm1, %xmm6 mulsd %xmm0, %xmm7 subsd %xmm4, %xmm2 subsd %xmm6, %xmm3 movsd 6 * SIZE(AO), %xmm6 ADDSD3 %xmm5, %xmm2 ADDSD4 %xmm7, %xmm3 movsd 7 * SIZE(AO), %xmm7 movaps %xmm2, %xmm5 movaps %xmm3, %xmm4 mulsd %xmm6, %xmm2 mulsd %xmm6, %xmm3 mulsd %xmm7, %xmm5 mulsd %xmm7, %xmm4 ADDSD4 %xmm4, %xmm2 ADDSD3 %xmm5, %xmm3 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(BO), %xmm8 movaps %xmm0, %xmm5 movsd 1 * SIZE(BO), %xmm9 movaps %xmm1, %xmm4 movaps %xmm2, %xmm7 movaps %xmm3, %xmm6 mulsd %xmm8, %xmm0 mulsd %xmm8, %xmm1 mulsd %xmm9, %xmm5 mulsd %xmm9, %xmm4 ADDSD4 %xmm4, %xmm0 mulsd %xmm8, %xmm2 ADDSD2 %xmm5, %xmm1 mulsd %xmm8, %xmm3 mulsd %xmm9, %xmm7 mulsd %xmm9, %xmm6 ADDSD4 %xmm6, %xmm2 ADDSD2 %xmm7, %xmm3 #endif #ifdef LN subq $4 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 1 * SIZE(CO1) movsd %xmm2, 2 * SIZE(CO1) movsd %xmm3, 3 * SIZE(CO1) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BO) movsd %xmm1, 1 * SIZE(BO) movsd %xmm2, 2 * SIZE(BO) movsd %xmm3, 3 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) movsd %xmm1, 1 * SIZE(AO) movsd %xmm2, 2 * SIZE(AO) movsd %xmm3, 3 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L10 ALIGN_4 .L20: testq $1, M jle .L99 #ifdef LN movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd 1 * SIZE(AO), %xmm4 xorps %xmm5, %xmm5 movsd 2 * SIZE(AO), %xmm5 xorps %xmm6, %xmm6 movsd 3 * SIZE(AO), %xmm7 movsd 0 * SIZE(BO), %xmm1 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 movsd 1 * SIZE(BO), %xmm3 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L25 ALIGN_4 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADDSD2 %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD4 %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 2 * SIZE(BO), %xmm1 ADDSD1 %xmm0, %xmm8 movsd 4 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm2 ADDSD3 %xmm4, %xmm10 movsd 5 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), 
%xmm3 ADDSD2 %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 ADDSD4 %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 4 * SIZE(BO), %xmm1 ADDSD1 %xmm5, %xmm8 movsd 6 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm2 ADDSD3 %xmm7, %xmm10 movsd 7 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm6 movsd 5 * SIZE(BO), %xmm3 ADDSD2 %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD4 %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 6 * SIZE(BO), %xmm1 ADDSD1 %xmm0, %xmm8 movsd 8 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm2 ADDSD3 %xmm4, %xmm10 movsd 9 * SIZE(AO), %xmm4 mulsd %xmm3, %xmm6 movsd 7 * SIZE(BO), %xmm3 ADDSD2 %xmm2, %xmm9 movaps %xmm5, %xmm2 mulsd %xmm1, %xmm5 ADDSD4 %xmm6, %xmm11 movaps %xmm7, %xmm6 mulsd %xmm1, %xmm7 movsd 8 * SIZE(BO), %xmm1 ADDSD1 %xmm5, %xmm8 movsd 10 * SIZE(AO), %xmm5 mulsd %xmm3, %xmm2 ADDSD3 %xmm7, %xmm10 movsd 11 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm6 movsd 9 * SIZE(BO), %xmm3 addq $8 * SIZE, AO addq $8 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH BRANCH je .L29 ALIGN_4 .L26: ADDSD2 %xmm2, %xmm9 movaps %xmm0, %xmm2 mulsd %xmm1, %xmm0 ADDSD4 %xmm6, %xmm11 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 movsd 2 * SIZE(BO), %xmm1 mulsd %xmm3, %xmm2 ADDSD1 %xmm0, %xmm8 movsd 2 * SIZE(AO), %xmm0 mulsd %xmm3, %xmm6 movsd 3 * SIZE(BO), %xmm3 ADDSD3 %xmm4, %xmm10 movsd 3 * SIZE(AO), %xmm4 addq $2 * SIZE, AO addq $2 * SIZE, BO decq %rax BRANCH jg .L26 ALIGN_4 .L29: ADDSD2 %xmm2, %xmm9 ADDSD4 %xmm6, %xmm11 addsd %xmm11, %xmm8 addsd %xmm9, %xmm10 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movsd 0 * SIZE(BO), %xmm0 movsd 1 * SIZE(BO), %xmm1 #else movsd 0 * SIZE(AO), %xmm0 movsd 1 * SIZE(AO), %xmm1 #endif subsd %xmm8, %xmm0 subsd %xmm10, %xmm1 #if defined(LN) || defined(LT) movsd 0 * SIZE(AO), %xmm6 movaps %xmm0, %xmm5 movsd 1 * SIZE(AO), %xmm7 movaps %xmm1, %xmm4 mulsd %xmm6, %xmm0 mulsd %xmm6, %xmm1 mulsd %xmm7, %xmm5 mulsd %xmm7, %xmm4 ADDSD4 %xmm4, %xmm0 ADDSD3 %xmm5, %xmm1 #endif #if defined(RN) || defined(RT) movsd 0 * SIZE(BO), %xmm8 movaps %xmm0, %xmm5 movsd 1 * SIZE(BO), %xmm9 movaps %xmm1, %xmm4 mulsd %xmm8, %xmm0 mulsd %xmm8, %xmm1 mulsd %xmm9, %xmm5 mulsd %xmm9, %xmm4 ADDSD4 %xmm4, %xmm0 ADDSD2 %xmm5, %xmm1 #endif #ifdef LN subq $2 * SIZE, CO1 #endif movsd %xmm0, 0 * SIZE(CO1) movsd %xmm1, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movsd %xmm0, 0 * SIZE(BO) movsd %xmm1, 1 * SIZE(BO) #else movsd %xmm0, 0 * SIZE(AO) movsd %xmm1, 1 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L99: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif decq J # j -- jg .L01 ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), 
%xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_LT_2x2_core2.S000066400000000000000000001126721313527062700224220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define POSINV 0(%rsp) #define J 16(%rsp) #define OFFSET 24(%rsp) #define KK 32(%rsp) #define KKK 40(%rsp) #define AORIG 48(%rsp) #define BORIG 56(%rsp) #define BUFFER 128(%rsp) #define PREFETCH_R (8 * 4 + 0) #define PREFETCH_W (PREFETCH_R) #define PREFETCHSIZE (8 * 17 + 2) #define PREFETCH prefetcht0 #ifndef CONJ #define NN #else #if defined(LN) || defined(LT) #define CN #else #define NC #endif #endif #define ADD1 addpd #define ADD2 addpd PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif movq OLD_LDC, LDC movq OLD_OFFSET, %rax movq %rsp, %r15 # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq %rax, KK movq %rax, OFFSET movq OLD_M, M movq OLD_N, N subq $-16 * SIZE, A subq $-16 * SIZE, B pcmpeqb %xmm15, %xmm15 psllq $63, %xmm15 # Generate mask pxor %xmm2, %xmm2 movlpd %xmm2, 0 + POSINV movlpd %xmm15, 8 + POSINV salq $ZBASE_SHIFT, LDC #ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $1, J # j = (n >> 2) jle .L100 ALIGN_4 .L01: #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq 16 * SIZE + BUFFER, BO #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L03 addq %rax, %rax ALIGN_4 .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 movddup -14 * SIZE(B), %xmm10 movddup -13 * SIZE(B), %xmm11 movddup -12 * SIZE(B), %xmm12 movddup -11 * SIZE(B), %xmm13 movddup -10 * SIZE(B), %xmm14 movddup -9 * SIZE(B), %xmm15 prefetcht0 (PREFETCH_W + 0) * SIZE(BO) movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm10, -12 * SIZE(BO) movapd %xmm11, -10 * SIZE(BO) prefetcht0 (PREFETCH_W + 8) * SIZE(BO) movapd %xmm12, -8 * SIZE(BO) movapd %xmm13, -6 * SIZE(BO) movapd %xmm14, -4 * SIZE(BO) movapd %xmm15, -2 * SIZE(BO) addq $ 8 * SIZE, B subq 
$-16 * SIZE, BO decq %rax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L05 ALIGN_4 .L04: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 movddup -14 * SIZE(B), %xmm10 movddup -13 * SIZE(B), %xmm11 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm10, -12 * SIZE(BO) movapd %xmm11, -10 * SIZE(BO) addq $ 4 * SIZE, B addq $ 8 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L05: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif movq M, I sarq $1, I # i = (m >> 2) jle .L30 ALIGN_4 .L10: leaq (PREFETCH_R + 0) * SIZE(B), BB #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif prefetcht2 0 * SIZE(BB) #ifdef LN pxor %xmm8, %xmm8 prefetcht1 -3 * SIZE(CO1) pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 prefetcht1 -3 * SIZE(CO2) pxor %xmm11, %xmm11 #else pxor %xmm8, %xmm8 prefetcht1 3 * SIZE(CO1) pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 prefetcht1 3 * SIZE(CO2) pxor %xmm11, %xmm11 #endif pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 pxor %xmm14, %xmm14 pxor %xmm15, %xmm15 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 subq $-8 * SIZE, BB #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm10 movapd -16 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm14 movapd %xmm2, %xmm3 movapd -14 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm11 movapd -14 * SIZE(BO), %xmm4 ADD2 %xmm5, %xmm15 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 movapd -12 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm12 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm9 movapd -10 * SIZE(BO), %xmm4 ADD2 %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 movapd -12 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm10 movapd -8 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm14 movapd %xmm2, %xmm3 movapd -10 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm11 ADD2 %xmm5, %xmm15 movapd -6 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 movapd -4 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 movapd -2 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 movapd -8 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm10 movapd 0 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm14 movapd %xmm2, %xmm3 movapd -6 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm11 movapd 2 * SIZE(BO), %xmm4 ADD2 %xmm5, %xmm15 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 movapd 4 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm12 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm9 movapd 6 * SIZE(BO), %xmm4 ADD2 %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 movapd -4 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm10 ADD1 %xmm3, %xmm14 movapd 8 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd 
%xmm0, %xmm2 movapd -2 * SIZE(AO), %xmm1 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm11 movapd 10 * SIZE(BO), %xmm4 ADD2 %xmm5, %xmm15 subq $-32 * SIZE, BO movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 movapd -20 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 subq $-16 * SIZE, AO mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 movapd -18 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 subq $1, %rax BRANCH BRANCH jg .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movapd POSINV, %xmm7 andq $3, %rax BRANCH BRANCH je .L19 ALIGN_4 .L16: ADD1 %xmm2, %xmm10 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm11 ADD2 %xmm5, %xmm15 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -14 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 movapd -16 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm2 movapd -14 * SIZE(AO), %xmm1 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 movapd -12 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -10 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addq $4 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_4 .L19: ADD1 %xmm2, %xmm10 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm11 ADD2 %xmm5, %xmm15 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif SHUFPD_1 %xmm9, %xmm9 SHUFPD_1 %xmm11, %xmm11 SHUFPD_1 %xmm13, %xmm13 SHUFPD_1 %xmm15, %xmm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm7, %xmm9 xorpd %xmm7, %xmm11 xorpd %xmm7, %xmm13 xorpd %xmm7, %xmm15 #else xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm9, %xmm8 subpd %xmm11, %xmm10 subpd %xmm13, %xmm12 subpd %xmm15, %xmm14 #else addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm13, %xmm12 addpd %xmm15, %xmm14 #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm9 movapd -14 * SIZE(B), %xmm11 movapd -12 * SIZE(B), %xmm13 movapd -10 * SIZE(B), %xmm15 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm13 movapd -12 * SIZE(AO), %xmm11 movapd -10 * SIZE(AO), %xmm15 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 #endif #ifndef CONJ SHUFPD_1 %xmm7, %xmm7 #endif #ifdef LN movddup -10 * SIZE(AO), %xmm0 movddup -9 * SIZE(AO), %xmm1 movddup -12 * SIZE(AO), %xmm2 movddup -11 * SIZE(AO), %xmm3 movddup -16 * SIZE(AO), %xmm4 movddup -15 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm13, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 mulpd %xmm0, %xmm15 mulpd %xmm1, %xmm14 addpd %xmm12, %xmm13 addpd %xmm14, %xmm15 movapd %xmm13, %xmm8 movapd %xmm15, %xmm10 pshufd $0x4e, %xmm13, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm12, %xmm9 subpd %xmm14, %xmm11 pshufd $0x4e, %xmm9, %xmm8 
pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 mulpd %xmm4, %xmm11 mulpd %xmm5, %xmm10 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 movddup -14 * SIZE(AO), %xmm2 movddup -13 * SIZE(AO), %xmm3 movddup -10 * SIZE(AO), %xmm4 movddup -9 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 movapd %xmm9, %xmm8 movapd %xmm11, %xmm10 pshufd $0x4e, %xmm9, %xmm12 pshufd $0x4e, %xmm11, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm13 subpd %xmm10, %xmm15 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 pshufd $0x4e, %xmm13, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm4, %xmm13 mulpd %xmm5, %xmm12 mulpd %xmm4, %xmm15 mulpd %xmm5, %xmm14 addpd %xmm12, %xmm13 addpd %xmm14, %xmm15 #endif #ifdef RN movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -10 * SIZE(B), %xmm4 movddup -9 * SIZE(B), %xmm5 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 movapd %xmm9, %xmm8 movapd %xmm13, %xmm10 pshufd $0x4e, %xmm9, %xmm12 pshufd $0x4e, %xmm13, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm11 subpd %xmm10, %xmm15 subpd %xmm12, %xmm11 subpd %xmm14, %xmm15 pshufd $0x4e, %xmm11, %xmm10 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm10 xorpd %xmm7, %xmm14 mulpd %xmm4, %xmm11 mulpd %xmm5, %xmm10 mulpd %xmm4, %xmm15 mulpd %xmm5, %xmm14 addpd %xmm10, %xmm11 addpd %xmm14, %xmm15 #endif #ifdef RT movddup -10 * SIZE(B), %xmm0 movddup -9 * SIZE(B), %xmm1 movddup -12 * SIZE(B), %xmm2 movddup -11 * SIZE(B), %xmm3 movddup -16 * SIZE(B), %xmm4 movddup -15 * SIZE(B), %xmm5 pshufd $0x4e, %xmm11, %xmm10 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm10 xorpd %xmm7, %xmm14 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 mulpd %xmm0, %xmm15 mulpd %xmm1, %xmm14 addpd %xmm10, %xmm11 addpd %xmm14, %xmm15 movapd %xmm11, %xmm8 movapd %xmm15, %xmm10 pshufd $0x4e, %xmm11, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm9 subpd %xmm10, %xmm13 subpd %xmm12, %xmm9 subpd %xmm14, %xmm13 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 mulpd %xmm4, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm13, 2 * SIZE(CO1) movhpd %xmm13, 3 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO2) movhpd %xmm11, 1 * SIZE(CO2) movsd %xmm15, 2 * SIZE(CO2) movhpd %xmm15, 3 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movapd %xmm11, -14 * SIZE(B) movapd %xmm13, -12 * SIZE(B) movapd %xmm15, -10 * SIZE(B) movddup %xmm9, %xmm8 unpckhpd %xmm9, %xmm9 movddup %xmm11, %xmm10 unpckhpd %xmm11, %xmm11 movddup %xmm13, %xmm12 unpckhpd %xmm13, %xmm13 movddup %xmm15, 
%xmm14 unpckhpd %xmm15, %xmm15 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm10, -12 * SIZE(BO) movapd %xmm11, -10 * SIZE(BO) movapd %xmm12, -8 * SIZE(BO) movapd %xmm13, -6 * SIZE(BO) movapd %xmm14, -4 * SIZE(BO) movapd %xmm15, -2 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) movapd %xmm13, -14 * SIZE(AO) movapd %xmm11, -12 * SIZE(AO) movapd %xmm15, -10 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L10 ALIGN_4 .L30: testq $1, M jle .L99 #ifdef LN movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax addq %rax, AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L42 .L41: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 movapd -14 * SIZE(AO), %xmm0 movapd -8 * SIZE(BO), %xmm2 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm4 movapd -2 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 movapd -12 * SIZE(AO), %xmm0 movapd 0 * SIZE(BO), %xmm2 movapd 2 * SIZE(BO), %xmm3 movapd 4 * SIZE(BO), %xmm4 movapd 6 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 movapd -10 * SIZE(AO), %xmm0 movapd 8 * SIZE(BO), %xmm2 movapd 10 * SIZE(BO), %xmm3 movapd 12 * SIZE(BO), %xmm4 movapd 14 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 subq $ -8 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax jne .L41 .L42: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movapd POSINV, %xmm7 andq $3, %rax # if (k & 1) BRANCH jle .L44 .L43: movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax jg .L43 ALIGN_4 .L44: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif SHUFPD_1 %xmm9, %xmm9 SHUFPD_1 %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm7, 
%xmm9 xorpd %xmm7, %xmm11 #else xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm9, %xmm8 subpd %xmm11, %xmm10 #else addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm9 movapd -14 * SIZE(B), %xmm11 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm11 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 #endif #ifndef CONJ SHUFPD_1 %xmm7, %xmm7 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 #endif #ifdef RN movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -10 * SIZE(B), %xmm4 movddup -9 * SIZE(B), %xmm5 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 movapd %xmm9, %xmm8 pshufd $0x4e, %xmm9, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm11 subpd %xmm12, %xmm11 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm4, %xmm11 mulpd %xmm5, %xmm10 addpd %xmm10, %xmm11 #endif #ifdef RT movddup -10 * SIZE(B), %xmm0 movddup -9 * SIZE(B), %xmm1 movddup -12 * SIZE(B), %xmm2 movddup -11 * SIZE(B), %xmm3 movddup -16 * SIZE(B), %xmm4 movddup -15 * SIZE(B), %xmm5 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm10, %xmm11 movapd %xmm11, %xmm8 pshufd $0x4e, %xmm11, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm9 subpd %xmm12, %xmm9 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO2) movhpd %xmm11, 1 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movapd %xmm11, -14 * SIZE(B) movddup %xmm9, %xmm8 unpckhpd %xmm9, %xmm9 movddup %xmm11, %xmm10 unpckhpd %xmm11, %xmm11 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm10, -12 * SIZE(BO) movapd %xmm11, -10 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) movapd %xmm11, -14 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L99: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 2 * COMPSIZE), B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif decq J # j -- jg .L01 .L100: testq $1, N jle .L999 .L101: #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if 
defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L103 ALIGN_4 .L102: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 movddup -14 * SIZE(B), %xmm10 movddup -13 * SIZE(B), %xmm11 movddup -12 * SIZE(B), %xmm12 movddup -11 * SIZE(B), %xmm13 movddup -10 * SIZE(B), %xmm14 movddup -9 * SIZE(B), %xmm15 movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) movapd %xmm10, 4 * SIZE(BO) movapd %xmm11, 6 * SIZE(BO) movapd %xmm12, 8 * SIZE(BO) movapd %xmm13, 10 * SIZE(BO) movapd %xmm14, 12 * SIZE(BO) movapd %xmm15, 14 * SIZE(BO) addq $ 8 * SIZE, B subq $-16 * SIZE, BO decq %rax jne .L102 ALIGN_4 .L103: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L105 ALIGN_4 .L104: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) addq $4 * SIZE, BO addq $2 * SIZE, B decq %rax jne .L104 ALIGN_4 .L105: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif movq M, I sarq $1, I # i = (m >> 2) jle .L130 ALIGN_4 .L110: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 prefetcht0 3 * SIZE(CO1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L112 .L111: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -14 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 movapd -12 * SIZE(AO), %xmm0 movapd -10 * SIZE(AO), %xmm1 movapd -12 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -10 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 movapd -8 * SIZE(AO), %xmm0 movapd -6 * SIZE(AO), %xmm1 movapd -8 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -6 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 movapd -4 * SIZE(AO), %xmm0 movapd -2 * SIZE(AO), %xmm1 movapd -4 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -2 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 subq $-16 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jne .L111 ALIGN_4 .L112: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movapd POSINV, %xmm7 andq $3, %rax # if (k & 1) BRANCH jle .L114 .L113: movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -14 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, 
%rax jg .L113 ALIGN_4 .L114: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif SHUFPD_1 %xmm9, %xmm9 SHUFPD_1 %xmm13, %xmm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm7, %xmm9 xorpd %xmm7, %xmm13 #else xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm9, %xmm8 subpd %xmm13, %xmm12 #else addpd %xmm9, %xmm8 addpd %xmm13, %xmm12 #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm9 movapd -14 * SIZE(B), %xmm13 subpd %xmm8, %xmm9 subpd %xmm12, %xmm13 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm13 subpd %xmm8, %xmm9 subpd %xmm12, %xmm13 #endif #ifndef CONJ SHUFPD_1 %xmm7, %xmm7 #endif #ifdef LN movddup -10 * SIZE(AO), %xmm0 movddup -9 * SIZE(AO), %xmm1 movddup -12 * SIZE(AO), %xmm2 movddup -11 * SIZE(AO), %xmm3 movddup -16 * SIZE(AO), %xmm4 movddup -15 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm12, %xmm13 movapd %xmm13, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm9 subpd %xmm12, %xmm9 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 movddup -14 * SIZE(AO), %xmm2 movddup -13 * SIZE(AO), %xmm3 movddup -10 * SIZE(AO), %xmm4 movddup -9 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 movapd %xmm9, %xmm8 pshufd $0x4e, %xmm9, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm13 subpd %xmm12, %xmm13 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm4, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm13 #endif #ifdef RN movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 #endif #ifdef RT movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 #endif #ifdef LN subq $4 * SIZE, CO1 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm13, 2 * SIZE(CO1) movhpd %xmm13, 3 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movapd %xmm13, -14 * SIZE(B) movddup %xmm9, %xmm8 unpckhpd %xmm9, %xmm9 movddup %xmm13, %xmm12 unpckhpd %xmm13, %xmm13 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm12, -12 * SIZE(BO) movapd %xmm13, -10 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) movapd %xmm13, -14 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + 
ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L110 ALIGN_4 .L130: testq $1, M jle .L199 ALIGN_4 .L140: #ifdef LN movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L142 .L141: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 movapd -12 * SIZE(AO), %xmm0 movapd -10 * SIZE(AO), %xmm1 movapd -8 * SIZE(BO), %xmm2 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm4 movapd -2 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 subq $ -8 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jne .L141 .L142: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movapd POSINV, %xmm7 andq $3, %rax # if (k & 1) BRANCH jle .L144 .L143: movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L143 ALIGN_4 .L144: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif SHUFPD_1 %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm7, %xmm9 #else xorpd %xmm7, %xmm8 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm9, %xmm8 #else addpd %xmm9, %xmm8 #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm9 subpd %xmm8, %xmm9 #else movapd -16 * SIZE(AO), %xmm9 subpd %xmm8, %xmm9 #endif #ifndef CONJ SHUFPD_1 %xmm7, %xmm7 #endif #ifdef LN movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef RN movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef RT movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movddup %xmm9, %xmm8 unpckhpd %xmm9, %xmm9 movapd %xmm8, -16 * 
SIZE(BO) movapd %xmm9, -14 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L199: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 1 * COMPSIZE), B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq %r15, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_LT_2x2_penryn.S000066400000000000000000001047631313527062700227250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define KK %rdx #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCH_R (8 * 4 + 0) #define PREFETCHSIZE (8 * 21 + 6) #define PREFETCH prefetcht0 #define ADD1 addpd #define ADD2 addpd PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif movq OLD_M, M movq OLD_N, N movq OLD_K, K movq OLD_LDC, LDC movq OLD_OFFSET, KK subq $-16 * SIZE, A subq $-16 * SIZE, B salq $ZBASE_SHIFT, LDC movq KK, OFFSET negq KK #ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RT movq N, KK subq OFFSET, KK #endif movq N, J sarq $1, J NOBRANCH jle .L40 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif movq K, %rax salq $ZBASE_SHIFT + 1, %rax leaq (B, %rax), BB #ifdef LT movq OFFSET, KK #endif movq M, I sarq $1, I NOBRANCH jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #else movq B, BO #endif prefetcht2 -16 * SIZE(BB) subq $-8 * SIZE, BB movaps -16 * SIZE(AO), %xmm0 pxor %xmm3, %xmm3 movaps -14 * SIZE(AO), %xmm1 pxor %xmm4, %xmm4 movaps -16 * SIZE(BO), %xmm2 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 #ifdef LN prefetcht0 -4 * SIZE(CO1) movapd %xmm4, %xmm8 movapd %xmm4, %xmm9 prefetcht0 -4 * SIZE(CO2) #else prefetcht0 3 * SIZE(CO1) movapd %xmm4, %xmm8 movapd %xmm4, %xmm9 prefetcht0 3 * SIZE(CO2) #endif movapd %xmm4, %xmm10 movapd %xmm4, %xmm11 movapd %xmm4, %xmm12 movapd %xmm4, %xmm13 movapd %xmm4, %xmm14 movapd %xmm4, %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax NOBRANCH jle .L15 ALIGN_3 .L12: ADD1 %xmm3, %xmm12 movaps -14 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, 
%xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps -10 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps -8 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -6 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps -6 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps -4 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -2 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps -2 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps 0 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 2 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps 2 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps 4 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 6 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps 6 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps 8 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 10 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps 10 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps 12 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 
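/* Descriptive note on the accumulation pattern of this unrolled loop: ADD1 and
   ADD2 are both defined as addpd above, and pshufd $0x4e swaps the two doubles
   of each value loaded from BO, so %xmm8/%xmm10/%xmm12/%xmm14 and
   %xmm9/%xmm11/%xmm13/%xmm15 collect two sets of partial products; they are
   combined after the loop (haddpd plus a sign mask) into the real and
   imaginary parts of the 2x2 complex tile. */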
ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 14 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps 14 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps 16 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 subq $-32 * SIZE, AO ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -16 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -14 * SIZE(AO), %xmm1 subq $-32 * SIZE, BO subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: ADD1 %xmm3, %xmm12 movaps -14 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: #if defined(LN) || defined(RT) movq KK, %rax subq $2, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif ADD1 %xmm3, %xmm12 pcmpeqb %xmm7, %xmm7 ADD1 %xmm4, %xmm14 psllq $63, %xmm7 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 #ifndef CONJ pshufd $0x40, %xmm7, %xmm0 shufps $0x04, %xmm7, %xmm7 pxor %xmm0, %xmm8 pxor %xmm0, %xmm10 pxor %xmm0, %xmm12 pxor %xmm0, %xmm14 #else #if defined(LN) || defined(LT) pshufd $0x40, %xmm7, %xmm0 #else pshufd $0x04, %xmm7, %xmm0 #endif shufps $0x40, %xmm7, %xmm7 pxor %xmm0, %xmm9 pxor %xmm0, %xmm11 pxor %xmm0, %xmm13 pxor %xmm0, %xmm15 #endif haddpd %xmm9, %xmm8 haddpd %xmm11, %xmm10 haddpd %xmm13, %xmm12 haddpd %xmm15, %xmm14 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 movapd -12 * SIZE(BO), %xmm13 movapd -10 * SIZE(BO), %xmm15 subpd %xmm8, %xmm9 subpd %xmm12, %xmm11 subpd %xmm10, %xmm13 subpd %xmm14, %xmm15 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm13 movapd -12 * SIZE(AO), %xmm11 movapd -10 * SIZE(AO), %xmm15 subpd %xmm8, %xmm9 subpd %xmm12, %xmm11 subpd %xmm10, %xmm13 subpd %xmm14, %xmm15 #endif #ifdef LN movddup -10 * SIZE(AO), %xmm0 movddup -9 * SIZE(AO), %xmm1 movddup -12 * SIZE(AO), %xmm2 movddup -11 * SIZE(AO), %xmm3 movddup -16 * SIZE(AO), %xmm4 movddup -15 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm13, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 mulpd %xmm0, %xmm15 mulpd %xmm1, %xmm14 addpd %xmm12, %xmm13 addpd %xmm14, %xmm15 movapd %xmm13, %xmm8 movapd %xmm15, %xmm10 pshufd $0x4e, %xmm13, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm12, %xmm9 subpd %xmm14, %xmm11 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 mulpd %xmm4, 
%xmm11 mulpd %xmm5, %xmm10 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 movddup -14 * SIZE(AO), %xmm2 movddup -13 * SIZE(AO), %xmm3 movddup -10 * SIZE(AO), %xmm4 movddup -9 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 movapd %xmm9, %xmm8 movapd %xmm11, %xmm10 pshufd $0x4e, %xmm9, %xmm12 pshufd $0x4e, %xmm11, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm13 subpd %xmm10, %xmm15 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 pshufd $0x4e, %xmm13, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm4, %xmm13 mulpd %xmm5, %xmm12 mulpd %xmm4, %xmm15 mulpd %xmm5, %xmm14 addpd %xmm12, %xmm13 addpd %xmm14, %xmm15 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 movddup -14 * SIZE(BO), %xmm2 movddup -13 * SIZE(BO), %xmm3 movddup -10 * SIZE(BO), %xmm4 movddup -9 * SIZE(BO), %xmm5 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 movapd %xmm9, %xmm8 movapd %xmm13, %xmm10 pshufd $0x4e, %xmm9, %xmm12 pshufd $0x4e, %xmm13, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm11 subpd %xmm10, %xmm15 subpd %xmm12, %xmm11 subpd %xmm14, %xmm15 pshufd $0x4e, %xmm11, %xmm10 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm10 xorpd %xmm7, %xmm14 mulpd %xmm4, %xmm11 mulpd %xmm5, %xmm10 mulpd %xmm4, %xmm15 mulpd %xmm5, %xmm14 addpd %xmm10, %xmm11 addpd %xmm14, %xmm15 #endif #ifdef RT movddup -10 * SIZE(BO), %xmm0 movddup -9 * SIZE(BO), %xmm1 movddup -12 * SIZE(BO), %xmm2 movddup -11 * SIZE(BO), %xmm3 movddup -16 * SIZE(BO), %xmm4 movddup -15 * SIZE(BO), %xmm5 pshufd $0x4e, %xmm11, %xmm10 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm10 xorpd %xmm7, %xmm14 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 mulpd %xmm0, %xmm15 mulpd %xmm1, %xmm14 addpd %xmm10, %xmm11 addpd %xmm14, %xmm15 movapd %xmm11, %xmm8 movapd %xmm15, %xmm10 pshufd $0x4e, %xmm11, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm9 subpd %xmm10, %xmm13 subpd %xmm12, %xmm9 subpd %xmm14, %xmm13 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 mulpd %xmm4, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm13, 2 * SIZE(CO1) movhpd %xmm13, 3 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO2) movhpd %xmm11, 1 * SIZE(CO2) movsd %xmm15, 2 * SIZE(CO2) movhpd %xmm15, 3 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm11, -14 * SIZE(BO) movapd %xmm13, -12 * SIZE(BO) movapd %xmm15, -10 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) movapd %xmm13, -14 * SIZE(AO) movapd %xmm11, -12 * SIZE(AO) movapd %xmm15, -10 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq 
(,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- BRANCH jg .L11 ALIGN_4 .L20: testq $1, M BRANCH jle .L39 ALIGN_4 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movaps -16 * SIZE(AO), %xmm0 movaps -16 * SIZE(BO), %xmm2 movaps -14 * SIZE(BO), %xmm3 pxor %xmm3, %xmm3 pxor %xmm5, %xmm5 movapd %xmm3, %xmm8 movapd %xmm3, %xmm9 movapd %xmm3, %xmm12 movapd %xmm3, %xmm13 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_4 .L22: ADD1 %xmm3, %xmm12 movaps -14 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm0, %xmm2 ADD2 %xmm5, %xmm13 mulpd %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 ADD2 %xmm7, %xmm9 mulpd %xmm0, %xmm5 movaps -14 * SIZE(AO), %xmm0 ADD1 %xmm3, %xmm12 movaps -10 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 ADD2 %xmm5, %xmm13 mulpd %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -8 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 ADD2 %xmm7, %xmm9 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 ADD1 %xmm3, %xmm12 movaps -6 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 ADD2 %xmm5, %xmm13 mulpd %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -4 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 ADD2 %xmm7, %xmm9 mulpd %xmm0, %xmm5 movaps -10 * SIZE(AO), %xmm0 ADD1 %xmm3, %xmm12 movaps -2 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 ADD2 %xmm5, %xmm13 mulpd %xmm0, %xmm7 subq $ -8 * SIZE, AO ADD1 %xmm2, %xmm8 movaps 0 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 ADD2 %xmm7, %xmm9 mulpd %xmm0, %xmm5 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: ADD1 %xmm3, %xmm12 movaps -14 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 ADD2 %xmm5, %xmm13 mulpd %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 ADD2 %xmm7, %xmm9 mulpd %xmm0, %xmm5 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif ADD1 %xmm3, %xmm12 pcmpeqb %xmm7, %xmm7 ADD2 %xmm5, %xmm13 psllq $63, %xmm7 #ifndef CONJ pshufd $0x40, %xmm7, %xmm0 shufps $0x04, %xmm7, %xmm7 pxor %xmm0, %xmm8 pxor %xmm0, %xmm12 #else #if defined(LN) || defined(LT) pshufd $0x40, %xmm7, %xmm0 #else pshufd $0x04, %xmm7, %xmm0 #endif shufps $0x40, %xmm7, %xmm7 pxor %xmm0, %xmm9 pxor %xmm0, %xmm13 #endif haddpd %xmm9, %xmm8 haddpd %xmm13, %xmm12 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 subpd %xmm8, %xmm9 subpd %xmm12, %xmm11 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm11 subpd %xmm8, %xmm9 subpd %xmm12, %xmm11 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), 
%xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 movddup -14 * SIZE(BO), %xmm2 movddup -13 * SIZE(BO), %xmm3 movddup -10 * SIZE(BO), %xmm4 movddup -9 * SIZE(BO), %xmm5 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 movapd %xmm9, %xmm8 pshufd $0x4e, %xmm9, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm11 subpd %xmm12, %xmm11 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm4, %xmm11 mulpd %xmm5, %xmm10 addpd %xmm10, %xmm11 #endif #ifdef RT movddup -10 * SIZE(BO), %xmm0 movddup -9 * SIZE(BO), %xmm1 movddup -12 * SIZE(BO), %xmm2 movddup -11 * SIZE(BO), %xmm3 movddup -16 * SIZE(BO), %xmm4 movddup -15 * SIZE(BO), %xmm5 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm10, %xmm11 movapd %xmm11, %xmm8 pshufd $0x4e, %xmm11, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm9 subpd %xmm12, %xmm9 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO2) movhpd %xmm11, 1 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm11, -14 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) movapd %xmm11, -14 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif subq $1, J BRANCH jg .L01 ALIGN_4 .L40: testq $1, N BRANCH jle .L999 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif movq K, %rax salq $ZBASE_SHIFT + 1, %rax leaq (B, %rax), BB #ifdef LT movq OFFSET, KK #endif movq M, I sarq $1, I # i = (m >> 2) NOBRANCH jle .L60 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #else movq B, BO #endif prefetcht2 -16 * SIZE(BB) subq $-4 * SIZE, BB movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 movaps -16 * SIZE(BO), %xmm2 prefetcht0 3 * SIZE(CO1) pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_4 .L52: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm8 movaps -14 * SIZE(BO), %xmm2 ADD1 
%xmm4, %xmm12 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm13 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -6 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm12 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm13 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -2 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm8 movaps -10 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm12 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm13 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 2 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm8 movaps -8 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm12 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm13 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax BRANCH jg .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm8 movaps -14 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm12 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm13 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_4 .L58: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ pshufd $0x40, %xmm7, %xmm0 shufps $0x04, %xmm7, %xmm7 pxor %xmm0, %xmm8 pxor %xmm0, %xmm12 #else #if defined(LN) || defined(LT) pshufd $0x40, %xmm7, %xmm0 #else pshufd $0x04, %xmm7, %xmm0 #endif shufps $0x40, %xmm7, %xmm7 pxor %xmm0, %xmm9 pxor %xmm0, %xmm13 #endif haddpd %xmm9, %xmm8 haddpd %xmm13, %xmm12 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm13 subpd %xmm8, %xmm9 subpd %xmm12, %xmm13 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm13 subpd %xmm8, %xmm9 subpd %xmm12, %xmm13 #endif #ifdef LN movddup -10 * SIZE(AO), %xmm0 movddup -9 * SIZE(AO), %xmm1 movddup -12 * SIZE(AO), %xmm2 movddup -11 * SIZE(AO), %xmm3 movddup -16 * SIZE(AO), %xmm4 movddup -15 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm12, %xmm13 movapd %xmm13, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm9 subpd %xmm12, %xmm9 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 movddup -14 * SIZE(AO), %xmm2 movddup -13 * SIZE(AO), %xmm3 movddup -10 * SIZE(AO), %xmm4 movddup -9 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 movapd %xmm9, %xmm8 pshufd $0x4e, %xmm9, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm13 subpd %xmm12, %xmm13 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm4, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm13 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm0 
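/* RN solve for the single remaining column: this movddup pair broadcasts the
   real and imaginary parts of the 1x1 diagonal entry of B, and the
   pshufd/xorpd/mulpd/addpd sequence below applies that value as a complex
   multiply to both accumulators (%xmm9 and %xmm13). */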
movddup -15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 #endif #ifdef RT movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 #endif #ifdef LN subq $4 * SIZE, CO1 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm13, 2 * SIZE(CO1) movhpd %xmm13, 3 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm13, -14 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) movapd %xmm13, -14 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L51 ALIGN_4 .L60: testq $1, M BRANCH jle .L79 ALIGN_4 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 movaps -16 * SIZE(BO), %xmm2 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_4 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -14 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm8 ADD2 %xmm7, %xmm9 movaps -14 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm10 ADD2 %xmm7, %xmm11 movaps -12 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -10 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm8 ADD2 %xmm7, %xmm9 movaps -10 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -8 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm10 ADD2 %xmm7, %xmm11 movaps -8 * SIZE(BO), %xmm2 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -14 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm8 ADD2 %xmm7, %xmm9 movaps -14 * SIZE(BO), %xmm2 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_4 .L68: #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ pshufd $0x40, %xmm7, %xmm0 shufps $0x04, %xmm7, %xmm7 pxor %xmm0, %xmm8 #else #if defined(LN) || defined(LT) pshufd $0x40, %xmm7, %xmm0 #else pshufd $0x04, %xmm7, %xmm0 #endif shufps $0x40, %xmm7, %xmm7 pxor %xmm0, %xmm9 #endif haddpd %xmm9, %xmm8 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm9 subpd %xmm8, %xmm9 #else movapd -16 * SIZE(AO), %xmm9 subpd %xmm8, %xmm9 #endif #ifdef LN movddup -16 * SIZE(AO), %xmm0 
movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef RT movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L79: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S000066400000000000000000001231701313527062700222570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %rbp #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define POSINV 0(%rsp) #define ALPHA_R 16(%rsp) #define ALPHA_I 32(%rsp) #define OFFSET 40(%rsp) #define KK 48(%rsp) #define KKK 56(%rsp) #define AORIG 64(%rsp) #define BORIG 72(%rsp) #define BUFFER 128(%rsp) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta #define PREFETCHSIZE (8 * 6 + 4) #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHNTA prefetchnta #define PREFETCHSIZE (8 * 6 + 4) #endif #define KERNEL1(xx) \ mulpd %xmm8, %xmm9 ;\ addpd %xmm9, %xmm0 ;\ movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm8, %xmm11 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ addpd %xmm11, %xmm1 ;\ movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm8, %xmm13 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ addpd %xmm13, %xmm2 ;\ movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm8, %xmm3 ;\ movapd 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 #define KERNEL2(xx) \ mulpd %xmm10, %xmm9 ;\ addpd %xmm9, %xmm4 ;\ movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm10, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm10, %xmm13 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ addpd %xmm13, %xmm6 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm10, %xmm7 ;\ movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 #define KERNEL3(xx) \ mulpd %xmm12, %xmm15 ;\ addpd %xmm15, %xmm0 ;\ movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm12, %xmm11 ;\ addpd %xmm11, %xmm1 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm12, %xmm13 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ addpd %xmm13, %xmm2 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm12, %xmm3 ;\ movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 #define KERNEL4(xx) \ mulpd %xmm14, %xmm15 ;\ addpd %xmm15, %xmm4 
;\ movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm14, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm14, %xmm13 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ addpd %xmm13, %xmm6 ;\ movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm14, %xmm7 ;\ movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 #define KERNEL5(xx) \ mulpd %xmm8, %xmm9 ;\ addpd %xmm9, %xmm0 ;\ movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm8, %xmm11 ;\ PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ addpd %xmm11, %xmm1 ;\ movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm8, %xmm13 ;\ mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ addpd %xmm13, %xmm2 ;\ movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm8, %xmm3 ;\ movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 #define KERNEL6(xx) \ mulpd %xmm10, %xmm9 ;\ addpd %xmm9, %xmm4 ;\ movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm10, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm10, %xmm13 ;\ mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ addpd %xmm13, %xmm6 ;\ movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm10, %xmm7 ;\ movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 #define KERNEL7(xx) \ mulpd %xmm12, %xmm15 ;\ addpd %xmm15, %xmm0 ;\ movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm12, %xmm11 ;\ addpd %xmm11, %xmm1 ;\ movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm12, %xmm13 ;\ mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ addpd %xmm13, %xmm2 ;\ movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm12, %xmm3 ;\ movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 #define KERNEL8(xx) \ mulpd %xmm14, %xmm15 ;\ addpd %xmm15, %xmm4 ;\ movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm14, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm14, %xmm13 ;\ mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ addpd %xmm13, %xmm6 ;\ movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm14, %xmm7 ;\ movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 #ifndef CONJ #define NN #else #if defined(LN) || defined(LT) #define CN #else #define NC #endif #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 movaps %xmm3, %xmm0 #else movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 #endif movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq OLD_M, M movq OLD_N, N pcmpeqb %xmm15, %xmm15 psllq $63, %xmm15 # Generate mask pxor %xmm2, %xmm2 movlpd %xmm2, 0 + POSINV movlpd %xmm15, 8 + POSINV movlpd %xmm4, OFFSET movlpd %xmm4, KK salq $ZBASE_SHIFT, LDC #ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, 
J sarq $1, J # j = (n >> 2) jle .L100 ALIGN_4 .L01: #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L03 addq %rax, %rax ALIGN_4 .L02: PREFETCHNTA 56 * SIZE(B) movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd 2 * SIZE(B), %xmm2 movlpd 3 * SIZE(B), %xmm3 movlpd 4 * SIZE(B), %xmm4 movlpd 5 * SIZE(B), %xmm5 movlpd 6 * SIZE(B), %xmm6 movlpd 7 * SIZE(B), %xmm7 movlpd %xmm0, 0 * SIZE(BO) movlpd %xmm0, 1 * SIZE(BO) movlpd %xmm1, 2 * SIZE(BO) movlpd %xmm1, 3 * SIZE(BO) movlpd %xmm2, 4 * SIZE(BO) movlpd %xmm2, 5 * SIZE(BO) movlpd %xmm3, 6 * SIZE(BO) movlpd %xmm3, 7 * SIZE(BO) movlpd %xmm4, 8 * SIZE(BO) movlpd %xmm4, 9 * SIZE(BO) movlpd %xmm5, 10 * SIZE(BO) movlpd %xmm5, 11 * SIZE(BO) movlpd %xmm6, 12 * SIZE(BO) movlpd %xmm6, 13 * SIZE(BO) movlpd %xmm7, 14 * SIZE(BO) movlpd %xmm7, 15 * SIZE(BO) subq $-16 * SIZE, BO addq $ 8 * SIZE, B decq %rax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L05 ALIGN_4 .L04: movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd 2 * SIZE(B), %xmm2 movlpd 3 * SIZE(B), %xmm3 movlpd %xmm0, 0 * SIZE(BO) movlpd %xmm0, 1 * SIZE(BO) movlpd %xmm1, 2 * SIZE(BO) movlpd %xmm1, 3 * SIZE(BO) movlpd %xmm2, 4 * SIZE(BO) movlpd %xmm2, 5 * SIZE(BO) movlpd %xmm3, 6 * SIZE(BO) movlpd %xmm3, 7 * SIZE(BO) addq $ 4 * SIZE, B addq $ 8 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L05: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif movq M, I sarq $1, I # i = (m >> 2) jle .L30 ALIGN_4 .L10: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 2 * SIZE(AO), %xmm10 pxor %xmm1, %xmm1 movapd 4 * SIZE(AO), %xmm12 pxor %xmm2, %xmm2 movapd 6 * SIZE(AO), %xmm14 pxor %xmm3, %xmm3 movapd 0 * SIZE(BO), %xmm9 pxor %xmm4, %xmm4 movapd 2 * SIZE(BO), %xmm11 pxor %xmm5, %xmm5 movapd 4 * SIZE(BO), %xmm13 movapd 8 * SIZE(BO), %xmm15 PREFETCHW 4 * SIZE(CO1) pxor %xmm6, %xmm6 PREFETCHW 4 * SIZE(CO2) pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-8, %rax salq $4, %rax je .L15 .L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) cmpq $64 * 2, %rax jle .L12 KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) KERNEL1(16 * 3) KERNEL2(16 * 3) KERNEL3(16 * 3) KERNEL4(16 * 3) KERNEL5(16 * 3) KERNEL6(16 * 3) KERNEL7(16 * 3) KERNEL8(16 * 3) cmpq $64 * 4, %rax jle .L12 KERNEL1(16 * 4) KERNEL2(16 * 4) KERNEL3(16 * 4) KERNEL4(16 * 4) KERNEL5(16 * 4) KERNEL6(16 
* 4) KERNEL7(16 * 4) KERNEL8(16 * 4) KERNEL1(16 * 5) KERNEL2(16 * 5) KERNEL3(16 * 5) KERNEL4(16 * 5) KERNEL5(16 * 5) KERNEL6(16 * 5) KERNEL7(16 * 5) KERNEL8(16 * 5) cmpq $64 * 6, %rax jle .L12 KERNEL1(16 * 6) KERNEL2(16 * 6) KERNEL3(16 * 6) KERNEL4(16 * 6) KERNEL5(16 * 6) KERNEL6(16 * 6) KERNEL7(16 * 6) KERNEL8(16 * 6) KERNEL1(16 * 7) KERNEL2(16 * 7) KERNEL3(16 * 7) KERNEL4(16 * 7) KERNEL5(16 * 7) KERNEL6(16 * 7) KERNEL7(16 * 7) KERNEL8(16 * 7) addq $16 * 8 * SIZE, AO addq $32 * 8 * SIZE, BO subq $64 * 8, %rax jg .L1X .L12: leaq (AO, %rax, 2), AO # * 16 leaq (BO, %rax, 4), BO # * 64 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movapd POSINV, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L19 ALIGN_4 .L16: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm9, %xmm2 movapd 0 * SIZE(BO), %xmm9 addpd %xmm8, %xmm3 movapd 4 * SIZE(AO), %xmm8 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm4 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm5 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 mulpd 6 * SIZE(BO), %xmm10 addpd %xmm9, %xmm6 movapd 8 * SIZE(BO), %xmm9 addpd %xmm10, %xmm7 movapd 6 * SIZE(AO), %xmm10 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L16 ALIGN_4 .L19: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm3, %xmm3 SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm15, %xmm1 xorpd %xmm15, %xmm3 xorpd %xmm15, %xmm5 xorpd %xmm15, %xmm7 #else xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm2 xorpd %xmm15, %xmm4 xorpd %xmm15, %xmm6 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm1, %xmm0 subpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 #else addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm1 movapd 2 * SIZE(B), %xmm3 movapd 4 * SIZE(B), %xmm5 movapd 6 * SIZE(B), %xmm7 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd 0 * SIZE(AO), %xmm1 movapd 2 * SIZE(AO), %xmm5 movapd 4 * SIZE(AO), %xmm3 movapd 6 * SIZE(AO), %xmm7 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #endif #ifndef CONJ SHUFPD_1 %xmm15, %xmm15 #endif #ifdef LN movlpd 6 * SIZE(AO), %xmm8 movhpd 6 * SIZE(AO), %xmm8 movlpd 7 * SIZE(AO), %xmm9 movhpd 7 * SIZE(AO), %xmm9 movlpd 4 * SIZE(AO), %xmm10 movhpd 4 * SIZE(AO), %xmm10 movlpd 5 * SIZE(AO), %xmm11 movhpd 5 * SIZE(AO), %xmm11 movlpd 0 * SIZE(AO), %xmm12 movhpd 0 * SIZE(AO), %xmm12 movlpd 1 * SIZE(AO), %xmm13 movhpd 1 * SIZE(AO), %xmm13 pshufd $0x4e, %xmm5, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm15, %xmm4 xorpd %xmm15, %xmm6 mulpd %xmm8, %xmm5 mulpd %xmm9, %xmm4 mulpd %xmm8, %xmm7 mulpd %xmm9, %xmm6 addpd %xmm4, %xmm5 addpd %xmm6, %xmm7 movapd %xmm5, %xmm0 movapd %xmm7, %xmm2 pshufd $0x4e, %xmm5, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm15, %xmm4 xorpd %xmm15, %xmm6 mulpd %xmm10, %xmm0 mulpd %xmm10, %xmm2 mulpd %xmm11, %xmm4 mulpd %xmm11, %xmm6 subpd %xmm0, %xmm1 
subpd %xmm2, %xmm3 subpd %xmm4, %xmm1 subpd %xmm6, %xmm3 pshufd $0x4e, %xmm1, %xmm0 pshufd $0x4e, %xmm3, %xmm2 xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm2 mulpd %xmm12, %xmm1 mulpd %xmm13, %xmm0 mulpd %xmm12, %xmm3 mulpd %xmm13, %xmm2 addpd %xmm0, %xmm1 addpd %xmm2, %xmm3 #endif #ifdef LT movlpd 0 * SIZE(AO), %xmm8 movhpd 0 * SIZE(AO), %xmm8 movlpd 1 * SIZE(AO), %xmm9 movhpd 1 * SIZE(AO), %xmm9 movlpd 2 * SIZE(AO), %xmm10 movhpd 2 * SIZE(AO), %xmm10 movlpd 3 * SIZE(AO), %xmm11 movhpd 3 * SIZE(AO), %xmm11 movlpd 6 * SIZE(AO), %xmm12 movhpd 6 * SIZE(AO), %xmm12 movlpd 7 * SIZE(AO), %xmm13 movhpd 7 * SIZE(AO), %xmm13 pshufd $0x4e, %xmm1, %xmm0 pshufd $0x4e, %xmm3, %xmm2 xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm2 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 mulpd %xmm8, %xmm3 mulpd %xmm9, %xmm2 addpd %xmm0, %xmm1 addpd %xmm2, %xmm3 movapd %xmm1, %xmm0 movapd %xmm3, %xmm2 pshufd $0x4e, %xmm1, %xmm4 pshufd $0x4e, %xmm3, %xmm6 xorpd %xmm15, %xmm4 xorpd %xmm15, %xmm6 mulpd %xmm10, %xmm0 mulpd %xmm10, %xmm2 mulpd %xmm11, %xmm4 mulpd %xmm11, %xmm6 subpd %xmm0, %xmm5 subpd %xmm2, %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 pshufd $0x4e, %xmm5, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm15, %xmm4 xorpd %xmm15, %xmm6 mulpd %xmm12, %xmm5 mulpd %xmm13, %xmm4 mulpd %xmm12, %xmm7 mulpd %xmm13, %xmm6 addpd %xmm4, %xmm5 addpd %xmm6, %xmm7 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm8 movhpd 0 * SIZE(B), %xmm8 movlpd 1 * SIZE(B), %xmm9 movhpd 1 * SIZE(B), %xmm9 movlpd 2 * SIZE(B), %xmm10 movhpd 2 * SIZE(B), %xmm10 movlpd 3 * SIZE(B), %xmm11 movhpd 3 * SIZE(B), %xmm11 movlpd 6 * SIZE(B), %xmm12 movhpd 6 * SIZE(B), %xmm12 movlpd 7 * SIZE(B), %xmm13 movhpd 7 * SIZE(B), %xmm13 pshufd $0x4e, %xmm1, %xmm0 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm4 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 mulpd %xmm8, %xmm5 mulpd %xmm9, %xmm4 addpd %xmm0, %xmm1 addpd %xmm4, %xmm5 movapd %xmm1, %xmm0 movapd %xmm5, %xmm2 pshufd $0x4e, %xmm1, %xmm4 pshufd $0x4e, %xmm5, %xmm6 xorpd %xmm15, %xmm4 xorpd %xmm15, %xmm6 mulpd %xmm10, %xmm0 mulpd %xmm10, %xmm2 mulpd %xmm11, %xmm4 mulpd %xmm11, %xmm6 subpd %xmm0, %xmm3 subpd %xmm2, %xmm7 subpd %xmm4, %xmm3 subpd %xmm6, %xmm7 pshufd $0x4e, %xmm3, %xmm2 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm15, %xmm2 xorpd %xmm15, %xmm6 mulpd %xmm12, %xmm3 mulpd %xmm13, %xmm2 mulpd %xmm12, %xmm7 mulpd %xmm13, %xmm6 addpd %xmm2, %xmm3 addpd %xmm6, %xmm7 #endif #ifdef RT movlpd 6 * SIZE(B), %xmm8 movhpd 6 * SIZE(B), %xmm8 movlpd 7 * SIZE(B), %xmm9 movhpd 7 * SIZE(B), %xmm9 movlpd 4 * SIZE(B), %xmm10 movhpd 4 * SIZE(B), %xmm10 movlpd 5 * SIZE(B), %xmm11 movhpd 5 * SIZE(B), %xmm11 movlpd 0 * SIZE(B), %xmm12 movhpd 0 * SIZE(B), %xmm12 movlpd 1 * SIZE(B), %xmm13 movhpd 1 * SIZE(B), %xmm13 pshufd $0x4e, %xmm3, %xmm2 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm15, %xmm2 xorpd %xmm15, %xmm6 mulpd %xmm8, %xmm3 mulpd %xmm9, %xmm2 mulpd %xmm8, %xmm7 mulpd %xmm9, %xmm6 addpd %xmm2, %xmm3 addpd %xmm6, %xmm7 movapd %xmm3, %xmm0 movapd %xmm7, %xmm2 pshufd $0x4e, %xmm3, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm15, %xmm4 xorpd %xmm15, %xmm6 mulpd %xmm10, %xmm0 mulpd %xmm10, %xmm2 mulpd %xmm11, %xmm4 mulpd %xmm11, %xmm6 subpd %xmm0, %xmm1 subpd %xmm2, %xmm5 subpd %xmm4, %xmm1 subpd %xmm6, %xmm5 pshufd $0x4e, %xmm1, %xmm0 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm4 mulpd %xmm12, %xmm1 mulpd %xmm13, %xmm0 mulpd %xmm12, %xmm5 mulpd %xmm13, %xmm4 addpd %xmm0, %xmm1 addpd %xmm4, %xmm5 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif movsd %xmm1, 0 * SIZE(CO1) movhpd %xmm1, 1 * SIZE(CO1) movsd %xmm5, 
2 * SIZE(CO1) movhpd %xmm5, 3 * SIZE(CO1) movsd %xmm3, 0 * SIZE(CO2) movhpd %xmm3, 1 * SIZE(CO2) movsd %xmm7, 2 * SIZE(CO2) movhpd %xmm7, 3 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movapd %xmm5, 4 * SIZE(B) movapd %xmm7, 6 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) movlpd %xmm3, 4 * SIZE(BO) movlpd %xmm3, 5 * SIZE(BO) movhpd %xmm3, 6 * SIZE(BO) movhpd %xmm3, 7 * SIZE(BO) movlpd %xmm5, 8 * SIZE(BO) movlpd %xmm5, 9 * SIZE(BO) movhpd %xmm5, 10 * SIZE(BO) movhpd %xmm5, 11 * SIZE(BO) movlpd %xmm7, 12 * SIZE(BO) movlpd %xmm7, 13 * SIZE(BO) movhpd %xmm7, 14 * SIZE(BO) movhpd %xmm7, 15 * SIZE(BO) #else movapd %xmm1, 0 * SIZE(AO) movapd %xmm5, 2 * SIZE(AO) movapd %xmm3, 4 * SIZE(AO) movapd %xmm7, 6 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L10 ALIGN_4 .L30: testq $1, M jle .L99 #ifdef LN movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax addq %rax, AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L42 .L41: movapd 0 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movapd 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm3 movapd 2 * SIZE(AO), %xmm8 movapd 8 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 10 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 12 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movapd 14 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm3 movapd 4 * SIZE(AO), %xmm8 movapd 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 18 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 20 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movapd 22 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm3 movapd 6 * SIZE(AO), %xmm8 movapd 24 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 26 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 28 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movapd 30 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm3 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L41 .L42: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movapd POSINV, %xmm15 andq $3, %rax # if (k & 1) BRANCH jle .L44 .L43: movapd 0 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movapd 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm3 addq $2 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L43 ALIGN_4 .L44: #if defined(LN) || defined(RT) 
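/* LN/RT path: recompute AO, B and BO from KK, adjusted by the unroll of the
   block just processed (1 row for LN, 2 columns for RT), so the triangular
   solve below addresses the correct sub-blocks of the packed A and B buffers. */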
movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm15, %xmm1 xorpd %xmm15, %xmm3 #else xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm2 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm1, %xmm0 subpd %xmm3, %xmm2 #else addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm1 movapd 2 * SIZE(B), %xmm3 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 #else movapd 0 * SIZE(AO), %xmm1 movapd 2 * SIZE(AO), %xmm3 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 #endif #ifndef CONJ SHUFPD_1 %xmm15, %xmm15 #endif #if defined(LN) || defined(LT) movlpd 0 * SIZE(AO), %xmm8 movhpd 0 * SIZE(AO), %xmm8 movlpd 1 * SIZE(AO), %xmm9 movhpd 1 * SIZE(AO), %xmm9 pshufd $0x4e, %xmm1, %xmm0 pshufd $0x4e, %xmm3, %xmm2 xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm2 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 mulpd %xmm8, %xmm3 mulpd %xmm9, %xmm2 addpd %xmm0, %xmm1 addpd %xmm2, %xmm3 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm8 movhpd 0 * SIZE(B), %xmm8 movlpd 1 * SIZE(B), %xmm9 movhpd 1 * SIZE(B), %xmm9 movlpd 2 * SIZE(B), %xmm10 movhpd 2 * SIZE(B), %xmm10 movlpd 3 * SIZE(B), %xmm11 movhpd 3 * SIZE(B), %xmm11 movlpd 6 * SIZE(B), %xmm12 movhpd 6 * SIZE(B), %xmm12 movlpd 7 * SIZE(B), %xmm13 movhpd 7 * SIZE(B), %xmm13 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 addpd %xmm0, %xmm1 movapd %xmm1, %xmm0 pshufd $0x4e, %xmm1, %xmm4 xorpd %xmm15, %xmm4 mulpd %xmm10, %xmm0 mulpd %xmm11, %xmm4 subpd %xmm0, %xmm3 subpd %xmm4, %xmm3 pshufd $0x4e, %xmm3, %xmm2 xorpd %xmm15, %xmm2 mulpd %xmm12, %xmm3 mulpd %xmm13, %xmm2 addpd %xmm2, %xmm3 #endif #ifdef RT movlpd 6 * SIZE(B), %xmm8 movhpd 6 * SIZE(B), %xmm8 movlpd 7 * SIZE(B), %xmm9 movhpd 7 * SIZE(B), %xmm9 movlpd 4 * SIZE(B), %xmm10 movhpd 4 * SIZE(B), %xmm10 movlpd 5 * SIZE(B), %xmm11 movhpd 5 * SIZE(B), %xmm11 movlpd 0 * SIZE(B), %xmm12 movhpd 0 * SIZE(B), %xmm12 movlpd 1 * SIZE(B), %xmm13 movhpd 1 * SIZE(B), %xmm13 pshufd $0x4e, %xmm3, %xmm2 xorpd %xmm15, %xmm2 mulpd %xmm8, %xmm3 mulpd %xmm9, %xmm2 addpd %xmm2, %xmm3 movapd %xmm3, %xmm0 pshufd $0x4e, %xmm3, %xmm4 xorpd %xmm15, %xmm4 mulpd %xmm10, %xmm0 mulpd %xmm11, %xmm4 subpd %xmm0, %xmm1 subpd %xmm4, %xmm1 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm12, %xmm1 mulpd %xmm13, %xmm0 addpd %xmm0, %xmm1 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif movsd %xmm1, 0 * SIZE(CO1) movhpd %xmm1, 1 * SIZE(CO1) movsd %xmm3, 0 * SIZE(CO2) movhpd %xmm3, 1 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) movlpd %xmm3, 4 * SIZE(BO) movlpd %xmm3, 5 * SIZE(BO) movhpd %xmm3, 6 * SIZE(BO) movhpd %xmm3, 7 * SIZE(BO) #else movapd %xmm1, 0 * SIZE(AO) movapd %xmm3, 2 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + 
ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L99: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 2 * COMPSIZE), B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif decq J # j -- jg .L01 .L100: testq $1, N jle .L999 .L101: #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO #ifdef RT movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L103 ALIGN_4 .L102: movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd 2 * SIZE(B), %xmm2 movlpd 3 * SIZE(B), %xmm3 movlpd 4 * SIZE(B), %xmm4 movlpd 5 * SIZE(B), %xmm5 movlpd 6 * SIZE(B), %xmm6 movlpd 7 * SIZE(B), %xmm7 movlpd %xmm0, 0 * SIZE(BO) movlpd %xmm0, 1 * SIZE(BO) movlpd %xmm1, 2 * SIZE(BO) movlpd %xmm1, 3 * SIZE(BO) movlpd %xmm2, 4 * SIZE(BO) movlpd %xmm2, 5 * SIZE(BO) movlpd %xmm3, 6 * SIZE(BO) movlpd %xmm3, 7 * SIZE(BO) movlpd %xmm4, 8 * SIZE(BO) movlpd %xmm4, 9 * SIZE(BO) movlpd %xmm5, 10 * SIZE(BO) movlpd %xmm5, 11 * SIZE(BO) movlpd %xmm6, 12 * SIZE(BO) movlpd %xmm6, 13 * SIZE(BO) movlpd %xmm7, 14 * SIZE(BO) movlpd %xmm7, 15 * SIZE(BO) subq $-16 * SIZE, BO addq $ 8 * SIZE, B decq %rax jne .L102 ALIGN_4 .L103: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L105 ALIGN_4 .L104: movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd %xmm0, 0 * SIZE(BO) movlpd %xmm0, 1 * SIZE(BO) movlpd %xmm1, 2 * SIZE(BO) movlpd %xmm1, 3 * SIZE(BO) addq $4 * SIZE, BO addq $2 * SIZE, B decq %rax jne .L104 ALIGN_4 .L105: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif movq M, I sarq $1, I # i = (m >> 2) jle .L130 ALIGN_4 .L110: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 PREFETCHW 4 * SIZE(CO1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L112 .L111: movapd 0 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 2 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm5 movapd 4 * SIZE(AO), %xmm8 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 6 * SIZE(AO), %xmm8 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm8, %xmm5 movapd 8 * SIZE(AO), %xmm8 movapd 8 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 10 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 10 * SIZE(AO), %xmm8 movapd 8 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 mulpd 10 * SIZE(BO), %xmm8 addpd %xmm8, %xmm5 movapd 12 * SIZE(AO), %xmm8 movapd 12 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd 
%xmm9, %xmm0 mulpd 14 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 14 * SIZE(AO), %xmm8 movapd 12 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 mulpd 14 * SIZE(BO), %xmm8 addpd %xmm8, %xmm5 addq $16 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L111 ALIGN_4 .L112: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movapd POSINV, %xmm15 andq $3, %rax # if (k & 1) BRANCH jle .L114 .L113: movapd 0 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 2 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm5 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L113 ALIGN_4 .L114: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm5, %xmm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm15, %xmm1 xorpd %xmm15, %xmm5 #else xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm4 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm1, %xmm0 subpd %xmm5, %xmm4 #else addpd %xmm1, %xmm0 addpd %xmm5, %xmm4 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm1 movapd 2 * SIZE(B), %xmm5 subpd %xmm0, %xmm1 subpd %xmm4, %xmm5 #else movapd 0 * SIZE(AO), %xmm1 movapd 2 * SIZE(AO), %xmm5 subpd %xmm0, %xmm1 subpd %xmm4, %xmm5 #endif #ifndef CONJ SHUFPD_1 %xmm15, %xmm15 #endif #ifdef LN movlpd 6 * SIZE(AO), %xmm8 movhpd 6 * SIZE(AO), %xmm8 movlpd 7 * SIZE(AO), %xmm9 movhpd 7 * SIZE(AO), %xmm9 movlpd 4 * SIZE(AO), %xmm10 movhpd 4 * SIZE(AO), %xmm10 movlpd 5 * SIZE(AO), %xmm11 movhpd 5 * SIZE(AO), %xmm11 movlpd 0 * SIZE(AO), %xmm12 movhpd 0 * SIZE(AO), %xmm12 movlpd 1 * SIZE(AO), %xmm13 movhpd 1 * SIZE(AO), %xmm13 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm15, %xmm4 mulpd %xmm8, %xmm5 mulpd %xmm9, %xmm4 addpd %xmm4, %xmm5 movapd %xmm5, %xmm0 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm15, %xmm4 mulpd %xmm10, %xmm0 mulpd %xmm11, %xmm4 subpd %xmm0, %xmm1 subpd %xmm4, %xmm1 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm12, %xmm1 mulpd %xmm13, %xmm0 addpd %xmm0, %xmm1 #endif #ifdef LT movlpd 0 * SIZE(AO), %xmm8 movhpd 0 * SIZE(AO), %xmm8 movlpd 1 * SIZE(AO), %xmm9 movhpd 1 * SIZE(AO), %xmm9 movlpd 2 * SIZE(AO), %xmm10 movhpd 2 * SIZE(AO), %xmm10 movlpd 3 * SIZE(AO), %xmm11 movhpd 3 * SIZE(AO), %xmm11 movlpd 6 * SIZE(AO), %xmm12 movhpd 6 * SIZE(AO), %xmm12 movlpd 7 * SIZE(AO), %xmm13 movhpd 7 * SIZE(AO), %xmm13 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 addpd %xmm0, %xmm1 movapd %xmm1, %xmm0 pshufd $0x4e, %xmm1, %xmm4 xorpd %xmm15, %xmm4 mulpd %xmm10, %xmm0 mulpd %xmm11, %xmm4 subpd %xmm0, %xmm5 subpd %xmm4, %xmm5 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm15, %xmm4 mulpd %xmm12, %xmm5 mulpd %xmm13, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm8 movhpd 0 * SIZE(B), %xmm8 movlpd 1 * SIZE(B), %xmm9 movhpd 1 * SIZE(B), %xmm9 pshufd $0x4e, %xmm1, %xmm0 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm4 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 mulpd %xmm8, %xmm5 mulpd %xmm9, %xmm4 addpd %xmm0, %xmm1 addpd %xmm4, %xmm5 #endif #ifdef RT movlpd 0 * 
SIZE(B), %xmm8 movhpd 0 * SIZE(B), %xmm8 movlpd 1 * SIZE(B), %xmm9 movhpd 1 * SIZE(B), %xmm9 pshufd $0x4e, %xmm1, %xmm0 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm4 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 mulpd %xmm8, %xmm5 mulpd %xmm9, %xmm4 addpd %xmm0, %xmm1 addpd %xmm4, %xmm5 #endif #ifdef LN subq $4 * SIZE, CO1 #endif movsd %xmm1, 0 * SIZE(CO1) movhpd %xmm1, 1 * SIZE(CO1) movsd %xmm5, 2 * SIZE(CO1) movhpd %xmm5, 3 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movapd %xmm5, 2 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) movlpd %xmm5, 4 * SIZE(BO) movlpd %xmm5, 5 * SIZE(BO) movhpd %xmm5, 6 * SIZE(BO) movhpd %xmm5, 7 * SIZE(BO) #else movapd %xmm1, 0 * SIZE(AO) movapd %xmm5, 2 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L110 ALIGN_4 .L130: testq $1, M jle .L199 ALIGN_4 .L140: #ifdef LN movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L142 .L141: movapd 0 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 2 * SIZE(AO), %xmm8 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm8, %xmm3 movapd 4 * SIZE(AO), %xmm8 movapd 8 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 10 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 6 * SIZE(AO), %xmm8 movapd 12 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 mulpd 14 * SIZE(BO), %xmm8 addpd %xmm8, %xmm3 addq $8 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L141 .L142: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 movapd POSINV, %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH jle .L144 .L143: movapd 0 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L143 ALIGN_4 .L144: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif SHUFPD_1 %xmm1, %xmm1 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm15, %xmm1 #else xorpd %xmm15, %xmm0 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm1, %xmm0 #else addpd %xmm1, %xmm0 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm1 subpd %xmm0, %xmm1 #else movapd 0 * SIZE(AO), %xmm1 subpd %xmm0, %xmm1 #endif #ifndef CONJ 
SHUFPD_1 %xmm15, %xmm15 #endif #ifdef LN movlpd 0 * SIZE(AO), %xmm8 movhpd 0 * SIZE(AO), %xmm8 movlpd 1 * SIZE(AO), %xmm9 movhpd 1 * SIZE(AO), %xmm9 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 addpd %xmm0, %xmm1 #endif #ifdef LT movlpd 0 * SIZE(AO), %xmm8 movhpd 0 * SIZE(AO), %xmm8 movlpd 1 * SIZE(AO), %xmm9 movhpd 1 * SIZE(AO), %xmm9 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 addpd %xmm0, %xmm1 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm8 movhpd 0 * SIZE(B), %xmm8 movlpd 1 * SIZE(B), %xmm9 movhpd 1 * SIZE(B), %xmm9 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 addpd %xmm0, %xmm1 #endif #ifdef RT movlpd 0 * SIZE(B), %xmm8 movhpd 0 * SIZE(B), %xmm8 movlpd 1 * SIZE(B), %xmm9 movhpd 1 * SIZE(B), %xmm9 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 addpd %xmm0, %xmm1 #endif #ifdef LN subq $2 * SIZE, CO1 #endif movsd %xmm1, 0 * SIZE(CO1) movhpd %xmm1, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) #else movapd %xmm1, 0 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L199: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 1 * COMPSIZE), B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq %rbx, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_LT_2x2_sse3.S000066400000000000000000001277451313527062700222740ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %r13 #define BO %r14 #define CO1 %r15 #define CO2 %rbx #define KK %rbp #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define KKK 56(%rsp) #define AORIG 64(%rsp) #else #define STACKSIZE 256 #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define KKK 232(%rsp) #define AORIG 240(%rsp) #endif #define PREFETCH prefetcht1 #define PREFETCHSIZE (16 * 12 + 3) #define PREFETCH_R (4 * 4 + 0) #ifndef CONJ #define ADD1 addpd #define ADD2 addpd #else #define ADD1 subpd #define ADD2 addpd #endif #define KERNEL1(address) \ mulpd %xmm8, %xmm9;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ ADD1 %xmm9, %xmm0;\ movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD2 %xmm9, %xmm1;\ movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm2;\ movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ ADD2 %xmm9, %xmm3;\ movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL2(address) \ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm4;\ movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD2 %xmm9, %xmm5;\ movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm6;\ movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ ADD2 %xmm9, %xmm7;\ movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL3(address) \ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm0;\ movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD2 %xmm9, %xmm1;\ movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm2;\ movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ ADD2 %xmm9, %xmm3;\ movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL4(address) \ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm4;\ movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD2 %xmm9, %xmm5;\ movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm6;\ movddup 7 * SIZE + (address) * 2 * 
SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ ADD2 %xmm9, %xmm7;\ movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL5(address) \ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm0;\ movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD2 %xmm11, %xmm1;\ movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm2;\ movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ ADD2 %xmm11, %xmm3;\ movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL6(address) \ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm4;\ movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD2 %xmm11, %xmm5;\ movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm6;\ movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ ADD2 %xmm11, %xmm7;\ movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL7(address) \ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm0;\ movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD2 %xmm11, %xmm1;\ movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm2;\ movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ ADD2 %xmm11, %xmm3;\ movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL8(address) \ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm4;\ movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD2 %xmm11, %xmm5;\ movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm6;\ movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ ADD2 %xmm11, %xmm7;\ movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL9(address) \ mulpd %xmm12, %xmm13;\ PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ ADD1 %xmm13, %xmm0;\ movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD2 %xmm13, %xmm1;\ movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm2;\ movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ ADD2 %xmm13, %xmm3;\ movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL10(address) \ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm4;\ movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD2 %xmm13, %xmm5;\ movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm6;\ movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ ADD2 %xmm13, %xmm7;\ movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL11(address) \ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm0;\ movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD2 %xmm13, %xmm1;\ movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm2;\ movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ ADD2 %xmm13, %xmm3;\ movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 
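/* Descriptive note on the KERNEL1..KERNEL16 macros defined above and below:
 * each macro handles one complex element of the packed A panel (kept in
 * %xmm8, %xmm10, %xmm12 or %xmm14), broadcasts the real and imaginary parts
 * of the two B entries for that k step with movddup, and accumulates the
 * four partial products into four of the %xmm0..%xmm7 accumulators via
 * mulpd plus ADD1/ADD2 (both addpd, or subpd/addpd when CONJ is defined).
 * The trailing movapd/movddup loads in each macro stage the A and B
 * operands for the next macro, so the sixteen macros together cover eight
 * unrolled k iterations of the 2x2 complex block. */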
#define KERNEL12(address) \ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm4;\ movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD2 %xmm13, %xmm5;\ movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm6;\ movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ ADD2 %xmm13, %xmm7;\ movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL13(address) \ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm0;\ movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD2 %xmm15, %xmm1;\ movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm2;\ movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ ADD2 %xmm15, %xmm3;\ movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL14(address) \ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm4;\ movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD2 %xmm15, %xmm5;\ movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm6;\ movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ ADD2 %xmm15, %xmm7;\ movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL15(address) \ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm0;\ movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD2 %xmm15, %xmm1;\ movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm2;\ movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ ADD2 %xmm15, %xmm3;\ movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL16(address) \ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm4;\ movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD2 %xmm15, %xmm5;\ movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm6;\ movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ ADD2 %xmm15, %xmm7;\ movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #endif movq OLD_LDC, LDC movq OLD_OFFSET, KK movq KK, OFFSET salq $ZBASE_SHIFT, LDC #ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, KK subq OFFSET, KK #endif movq N, J sarq $1, J # j = (n >> 2) jle .L100 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, 
CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif #ifdef LT movq OFFSET, KK #endif movq M, I sarq $1, I # i = (m >> 2) jle .L30 ALIGN_4 .L10: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movapd 16 * SIZE(AO), %xmm12 movddup 16 * SIZE(BO), %xmm13 movapd 24 * SIZE(AO), %xmm14 movddup 24 * SIZE(BO), %xmm15 prefetchnta 4 * SIZE(CO1) pxor %xmm4, %xmm4 prefetchnta 4 * SIZE(CO2) pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-8, %rax salq $4, %rax je .L12 .L1X: KERNEL1 (16 * 0) KERNEL2 (16 * 0) KERNEL3 (16 * 0) KERNEL4 (16 * 0) KERNEL5 (16 * 0) KERNEL6 (16 * 0) KERNEL7 (16 * 0) KERNEL8 (16 * 0) KERNEL9 (16 * 0) KERNEL10(16 * 0) KERNEL11(16 * 0) KERNEL12(16 * 0) KERNEL13(16 * 0) KERNEL14(16 * 0) KERNEL15(16 * 0) KERNEL16(16 * 0) cmpq $128 * 1, %rax NOBRANCH jle .L11 KERNEL1 (16 * 1) KERNEL2 (16 * 1) KERNEL3 (16 * 1) KERNEL4 (16 * 1) KERNEL5 (16 * 1) KERNEL6 (16 * 1) KERNEL7 (16 * 1) KERNEL8 (16 * 1) KERNEL9 (16 * 1) KERNEL10(16 * 1) KERNEL11(16 * 1) KERNEL12(16 * 1) KERNEL13(16 * 1) KERNEL14(16 * 1) KERNEL15(16 * 1) KERNEL16(16 * 1) cmpq $128 * 2, %rax NOBRANCH jle .L11 KERNEL1 (16 * 2) KERNEL2 (16 * 2) KERNEL3 (16 * 2) KERNEL4 (16 * 2) KERNEL5 (16 * 2) KERNEL6 (16 * 2) KERNEL7 (16 * 2) KERNEL8 (16 * 2) KERNEL9 (16 * 2) KERNEL10(16 * 2) KERNEL11(16 * 2) KERNEL12(16 * 2) KERNEL13(16 * 2) KERNEL14(16 * 2) KERNEL15(16 * 2) KERNEL16(16 * 2) cmpq $128 * 3, %rax NOBRANCH jle .L11 KERNEL1 (16 * 3) KERNEL2 (16 * 3) KERNEL3 (16 * 3) KERNEL4 (16 * 3) KERNEL5 (16 * 3) KERNEL6 (16 * 3) KERNEL7 (16 * 3) KERNEL8 (16 * 3) KERNEL9 (16 * 3) KERNEL10(16 * 3) KERNEL11(16 * 3) KERNEL12(16 * 3) KERNEL13(16 * 3) KERNEL14(16 * 3) KERNEL15(16 * 3) KERNEL16(16 * 3) cmpq $128 * 4, %rax NOBRANCH jle .L11 KERNEL1 (16 * 4) KERNEL2 (16 * 4) KERNEL3 (16 * 4) KERNEL4 (16 * 4) KERNEL5 (16 * 4) KERNEL6 (16 * 4) KERNEL7 (16 * 4) KERNEL8 (16 * 4) KERNEL9 (16 * 4) KERNEL10(16 * 4) KERNEL11(16 * 4) KERNEL12(16 * 4) KERNEL13(16 * 4) KERNEL14(16 * 4) KERNEL15(16 * 4) KERNEL16(16 * 4) cmpq $128 * 5, %rax NOBRANCH jle .L11 KERNEL1 (16 * 5) KERNEL2 (16 * 5) KERNEL3 (16 * 5) KERNEL4 (16 * 5) KERNEL5 (16 * 5) KERNEL6 (16 * 5) KERNEL7 (16 * 5) KERNEL8 (16 * 5) KERNEL9 (16 * 5) KERNEL10(16 * 5) KERNEL11(16 * 5) KERNEL12(16 * 5) KERNEL13(16 * 5) KERNEL14(16 * 5) KERNEL15(16 * 5) KERNEL16(16 * 5) cmpq $128 * 6, %rax NOBRANCH jle .L11 KERNEL1 (16 * 6) KERNEL2 (16 * 6) KERNEL3 (16 * 6) KERNEL4 (16 * 6) KERNEL5 (16 * 6) KERNEL6 (16 * 6) KERNEL7 (16 * 6) KERNEL8 (16 * 6) KERNEL9 (16 * 6) KERNEL10(16 * 6) KERNEL11(16 * 6) KERNEL12(16 * 6) KERNEL13(16 * 6) KERNEL14(16 * 6) KERNEL15(16 * 6) KERNEL16(16 * 6) cmpq $128 * 7, %rax NOBRANCH jle .L11 KERNEL1 (16 * 7) KERNEL2 (16 * 7) KERNEL3 (16 * 7) KERNEL4 (16 * 7) KERNEL5 (16 * 7) KERNEL6 (16 * 7) KERNEL7 (16 * 7) KERNEL8 (16 * 7) KERNEL9 (16 * 7) KERNEL10(16 * 7) KERNEL11(16 * 7) KERNEL12(16 * 7) KERNEL13(16 * 7) KERNEL14(16 * 7) KERNEL15(16 * 7) KERNEL16(16 * 7) addq $32 * 8 * SIZE, AO addq $32 * 8 * SIZE, BO subq $128 * 8, %rax jg .L1X .L11: leaq (AO, 
%rax, 2), AO # * 16 leaq (BO, %rax, 2), BO # * 64 ALIGN_4 .L12: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L14 ALIGN_4 .L13: mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm10 ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 0 * SIZE(BO), %xmm11 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm4 movddup 1 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD2 %xmm11, %xmm5 movddup 2 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm6 movddup 3 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD2 %xmm11, %xmm7 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L13 ALIGN_4 .L14: SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm3, %xmm3 SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 #ifndef CONJ addsubpd %xmm1, %xmm0 addsubpd %xmm3, %xmm2 addsubpd %xmm5, %xmm4 addsubpd %xmm7, %xmm6 #else addsubpd %xmm0, %xmm1 addsubpd %xmm2, %xmm3 addsubpd %xmm4, %xmm5 addsubpd %xmm6, %xmm7 #endif #if defined(LN) || defined(RT) movq KK, %rax subq $2, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm8 movapd 2 * SIZE(BO), %xmm9 movapd 4 * SIZE(BO), %xmm10 movapd 6 * SIZE(BO), %xmm11 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm9 movapd 4 * SIZE(AO), %xmm10 movapd 6 * SIZE(AO), %xmm11 #endif #if (defined(LN) || defined(LT)) && !defined(CONJ) subpd %xmm0, %xmm8 subpd %xmm2, %xmm9 subpd %xmm4, %xmm10 subpd %xmm6, %xmm11 #elif (defined(LN) || defined(LT)) && defined(CONJ) subpd %xmm1, %xmm8 subpd %xmm3, %xmm9 subpd %xmm5, %xmm10 subpd %xmm7, %xmm11 #elif (defined(RN) || defined(RT)) && !defined(CONJ) subpd %xmm0, %xmm8 subpd %xmm4, %xmm9 subpd %xmm2, %xmm10 subpd %xmm6, %xmm11 #else addsubpd %xmm1, %xmm8 addsubpd %xmm5, %xmm9 addsubpd %xmm3, %xmm10 addsubpd %xmm7, %xmm11 #endif #ifdef CONJ pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #endif #if defined(LN) || defined(RT) #ifdef LN movddup 6 * SIZE(AO), %xmm0 movddup 7 * SIZE(AO), %xmm1 movddup 4 * SIZE(AO), %xmm2 movddup 5 * SIZE(AO), %xmm3 movddup 0 * SIZE(AO), %xmm4 movddup 1 * SIZE(AO), %xmm5 #else movddup 6 * SIZE(BO), %xmm0 movddup 7 * SIZE(BO), %xmm1 movddup 4 * SIZE(BO), %xmm2 movddup 5 * SIZE(BO), %xmm3 movddup 0 * SIZE(BO), %xmm4 movddup 1 * SIZE(BO), %xmm5 #endif #ifdef CONJ xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 xorpd %xmm7, %xmm5 #endif movapd %xmm10, %xmm12 movapd %xmm11, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm12 mulpd %xmm1, %xmm13 addsubpd %xmm12, %xmm10 addsubpd %xmm13, %xmm11 movapd %xmm10, %xmm12 movapd %xmm10, %xmm13 movapd %xmm11, %xmm14 movapd %xmm11, %xmm15 SHUFPD_1 %xmm13, %xmm13 SHUFPD_1 %xmm15, %xmm15 mulpd %xmm2, %xmm12 mulpd %xmm2, %xmm14 mulpd %xmm3, %xmm13 mulpd %xmm3, %xmm15 addsubpd %xmm13, %xmm12 addsubpd %xmm15, %xmm14 subpd %xmm12, %xmm8 subpd %xmm14, %xmm9 movapd %xmm8, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm4, %xmm8 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm12 mulpd %xmm5, %xmm13 addsubpd %xmm12, %xmm8 addsubpd %xmm13, %xmm9 #endif #if defined(LT) || defined(RN) #ifdef LT movddup 0 * SIZE(AO), %xmm0 movddup 1 * SIZE(AO), %xmm1 movddup 2 * SIZE(AO), %xmm2 movddup 3 * SIZE(AO), %xmm3 movddup 6 * SIZE(AO), %xmm4 movddup 7 
* SIZE(AO), %xmm5 #else movddup 0 * SIZE(BO), %xmm0 movddup 1 * SIZE(BO), %xmm1 movddup 2 * SIZE(BO), %xmm2 movddup 3 * SIZE(BO), %xmm3 movddup 6 * SIZE(BO), %xmm4 movddup 7 * SIZE(BO), %xmm5 #endif #ifdef CONJ xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 xorpd %xmm7, %xmm5 #endif movapd %xmm8, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm12 mulpd %xmm1, %xmm13 addsubpd %xmm12, %xmm8 addsubpd %xmm13, %xmm9 movapd %xmm8, %xmm12 movapd %xmm8, %xmm13 movapd %xmm9, %xmm14 movapd %xmm9, %xmm15 SHUFPD_1 %xmm13, %xmm13 SHUFPD_1 %xmm15, %xmm15 mulpd %xmm2, %xmm12 mulpd %xmm2, %xmm14 mulpd %xmm3, %xmm13 mulpd %xmm3, %xmm15 addsubpd %xmm13, %xmm12 addsubpd %xmm15, %xmm14 subpd %xmm12, %xmm10 subpd %xmm14, %xmm11 movapd %xmm10, %xmm12 movapd %xmm11, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm4, %xmm10 mulpd %xmm4, %xmm11 mulpd %xmm5, %xmm12 mulpd %xmm5, %xmm13 addsubpd %xmm12, %xmm10 addsubpd %xmm13, %xmm11 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm10, 2 * SIZE(CO1) movhpd %xmm10, 3 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhpd %xmm9, 1 * SIZE(CO2) movsd %xmm11, 2 * SIZE(CO2) movhpd %xmm11, 3 * SIZE(CO2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd %xmm11, 2 * SIZE(CO2) movhpd %xmm11, 3 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) movapd %xmm10, 4 * SIZE(BO) movapd %xmm11, 6 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm9, 2 * SIZE(AO) movapd %xmm10, 4 * SIZE(AO) movapd %xmm11, 6 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L10 ALIGN_4 .L30: testq $1, M jle .L99 #ifdef LN movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L42 .L41: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD2 %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD2 %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm2 movddup 11 * SIZE(BO), 
%xmm11 mulpd %xmm8, %xmm11 movapd 6 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD2 %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 16 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm3 movddup 24 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm0 movddup 17 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD2 %xmm9, %xmm1 movddup 18 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm2 movddup 19 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 10 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm3 movddup 20 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm0 movddup 21 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD2 %xmm9, %xmm1 movddup 22 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm2 movddup 23 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 12 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm3 movddup 32 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm0 movddup 25 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD2 %xmm11, %xmm1 movddup 26 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm2 movddup 27 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm3 movddup 28 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm0 movddup 29 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD2 %xmm11, %xmm1 movddup 30 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm2 movddup 31 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 24 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm3 movddup 40 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L41 .L42: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH jle .L44 .L43: mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L43 ALIGN_4 .L44: SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm3, %xmm3 #ifndef CONJ addsubpd %xmm1, %xmm0 addsubpd %xmm3, %xmm2 #else addsubpd %xmm0, %xmm1 addsubpd %xmm2, %xmm3 #endif #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm8 movapd 2 * SIZE(BO), %xmm9 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm9 #endif #if (defined(LN) || defined(LT)) && !defined(CONJ) subpd %xmm0, %xmm8 subpd %xmm2, %xmm9 #elif (defined(LN) || defined(LT)) && defined(CONJ) subpd %xmm1, %xmm8 subpd %xmm3, %xmm9 #elif (defined(RN) || defined(RT)) && !defined(CONJ) subpd %xmm0, %xmm8 subpd %xmm2, %xmm9 #else addsubpd %xmm1, %xmm8 addsubpd %xmm3, %xmm9 #endif #ifdef CONJ pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #endif #ifdef LN movddup 0 * SIZE(AO), %xmm4 movddup 1 * SIZE(AO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm5 #endif movapd %xmm8, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm4, %xmm8 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm12 mulpd %xmm5, %xmm13 addsubpd %xmm12, %xmm8 addsubpd %xmm13, %xmm9 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 movddup 1 * SIZE(AO), %xmm1 #ifdef CONJ xorpd %xmm7, %xmm1 #endif movapd %xmm8, %xmm12 movapd %xmm9, 
%xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm12 mulpd %xmm1, %xmm13 addsubpd %xmm12, %xmm8 addsubpd %xmm13, %xmm9 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 movddup 1 * SIZE(BO), %xmm1 movddup 2 * SIZE(BO), %xmm2 movddup 3 * SIZE(BO), %xmm3 movddup 6 * SIZE(BO), %xmm4 movddup 7 * SIZE(BO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 xorpd %xmm7, %xmm5 #endif movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm0, %xmm8 mulpd %xmm1, %xmm12 addsubpd %xmm12, %xmm8 movapd %xmm8, %xmm12 movapd %xmm8, %xmm13 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm2, %xmm12 mulpd %xmm3, %xmm13 addsubpd %xmm13, %xmm12 subpd %xmm12, %xmm9 movapd %xmm9, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm12 addsubpd %xmm12, %xmm9 #endif #ifdef RT movddup 6 * SIZE(BO), %xmm0 movddup 7 * SIZE(BO), %xmm1 movddup 4 * SIZE(BO), %xmm2 movddup 5 * SIZE(BO), %xmm3 movddup 0 * SIZE(BO), %xmm4 movddup 1 * SIZE(BO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 xorpd %xmm7, %xmm5 #endif movapd %xmm9, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm12 addsubpd %xmm12, %xmm9 movapd %xmm9, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm2, %xmm12 mulpd %xmm3, %xmm13 addsubpd %xmm13, %xmm12 subpd %xmm12, %xmm8 movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm4, %xmm8 mulpd %xmm5, %xmm12 addsubpd %xmm12, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhpd %xmm9, 1 * SIZE(CO2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhpd %xmm9, 1 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm9, 2 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L99: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif decq J # j -- jg .L01 .L100: testq $1, N jle .L999 .L101: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 # coffset1 = c #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif #ifdef LT movq OFFSET, KK #endif movq M, I sarq $1, I # i = (m >> 2) jle .L130 ALIGN_4 .L110: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm4, %xmm4 movddup 8 * SIZE(BO), %xmm11 pxor %xmm5, %xmm5 prefetchnta 4 * SIZE(CO1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L112 .L111: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm1 movddup 
0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm4 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm5 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm0 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm4 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 16 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm5 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 10 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm1 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm4 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 12 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm5 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm0 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 14 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm4 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 40 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm5 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) ADD1 %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 18 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm1 movddup 8 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm4 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 20 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm5 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm0 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 22 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm4 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 24 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm5 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 26 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm1 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm4 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 28 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm5 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm0 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 30 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm4 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 32 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm5 movddup 24 * SIZE(BO), %xmm11 addq $32 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L111 ALIGN_4 .L112: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH jle .L114 .L113: mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm10 ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 0 * SIZE(BO), %xmm11 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 movapd 4 * SIZE(AO), %xmm8 ADD1 %xmm11, %xmm4 movddup 1 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD2 %xmm11, %xmm5 addq $4 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L113 ALIGN_4 .L114: SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm5, %xmm5 #ifndef CONJ addsubpd %xmm1, %xmm0 addsubpd %xmm5, %xmm4 #else addsubpd %xmm0, %xmm1 addsubpd %xmm4, %xmm5 #endif #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm8 movapd 2 * 
SIZE(BO), %xmm9 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm9 #endif #if (defined(LN) || defined(LT)) && !defined(CONJ) subpd %xmm0, %xmm8 subpd %xmm4, %xmm9 #elif (defined(LN) || defined(LT)) && defined(CONJ) subpd %xmm1, %xmm8 subpd %xmm5, %xmm9 #elif (defined(RN) || defined(RT)) && !defined(CONJ) subpd %xmm0, %xmm8 subpd %xmm4, %xmm9 #else addsubpd %xmm1, %xmm8 addsubpd %xmm5, %xmm9 #endif #ifdef CONJ pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #endif #ifdef LN movddup 6 * SIZE(AO), %xmm0 movddup 7 * SIZE(AO), %xmm1 movddup 4 * SIZE(AO), %xmm2 movddup 5 * SIZE(AO), %xmm3 movddup 0 * SIZE(AO), %xmm4 movddup 1 * SIZE(AO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 xorpd %xmm7, %xmm5 #endif movapd %xmm9, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm12 addsubpd %xmm12, %xmm9 movapd %xmm9, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm2, %xmm12 mulpd %xmm3, %xmm13 addsubpd %xmm13, %xmm12 subpd %xmm12, %xmm8 movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm4, %xmm8 mulpd %xmm5, %xmm12 addsubpd %xmm12, %xmm8 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 movddup 1 * SIZE(AO), %xmm1 movddup 2 * SIZE(AO), %xmm2 movddup 3 * SIZE(AO), %xmm3 movddup 6 * SIZE(AO), %xmm4 movddup 7 * SIZE(AO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 xorpd %xmm7, %xmm5 #endif movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm0, %xmm8 mulpd %xmm1, %xmm12 addsubpd %xmm12, %xmm8 movapd %xmm8, %xmm12 movapd %xmm8, %xmm13 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm2, %xmm12 mulpd %xmm3, %xmm13 addsubpd %xmm13, %xmm12 subpd %xmm12, %xmm9 movapd %xmm9, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm12 addsubpd %xmm12, %xmm9 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 movddup 1 * SIZE(BO), %xmm1 #ifdef CONJ xorpd %xmm7, %xmm1 #endif movapd %xmm8, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm12 mulpd %xmm1, %xmm13 addsubpd %xmm12, %xmm8 addsubpd %xmm13, %xmm9 #endif #ifdef RT movddup 0 * SIZE(BO), %xmm4 movddup 1 * SIZE(BO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm5 #endif movapd %xmm8, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm4, %xmm8 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm12 mulpd %xmm5, %xmm13 addsubpd %xmm12, %xmm8 addsubpd %xmm13, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm9, 2 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L110 ALIGN_4 .L130: testq $1, M jle .L149 ALIGN_4 #ifdef LN movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor 
%xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L142 .L141: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 16 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 10 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 12 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 24 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm3 movddup 24 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L141 .L142: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH jle .L144 .L143: mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L143 ALIGN_4 .L144: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 SHUFPD_1 %xmm1, %xmm1 #ifndef CONJ addsubpd %xmm1, %xmm0 #else addsubpd %xmm0, %xmm1 #endif #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm8 #else movapd 0 * SIZE(AO), %xmm8 #endif #if (defined(LN) || defined(LT)) && !defined(CONJ) subpd %xmm0, %xmm8 #elif (defined(LN) || defined(LT)) && defined(CONJ) subpd %xmm1, %xmm8 #elif (defined(RN) || defined(RT)) && !defined(CONJ) subpd %xmm0, %xmm8 #else addsubpd %xmm1, %xmm8 #endif #ifdef CONJ pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #endif #ifdef LN movddup 0 * SIZE(AO), %xmm4 movddup 1 * SIZE(AO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm5 #endif movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm4, %xmm8 mulpd %xmm5, %xmm12 addsubpd %xmm12, %xmm8 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 movddup 1 * SIZE(AO), %xmm1 #ifdef CONJ xorpd %xmm7, %xmm1 #endif movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm0, %xmm8 mulpd %xmm1, %xmm12 addsubpd %xmm12, %xmm8 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 movddup 1 * SIZE(BO), %xmm1 #ifdef CONJ xorpd %xmm7, %xmm1 #endif movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm0, %xmm8 mulpd %xmm1, %xmm12 addsubpd %xmm12, %xmm8 #endif #ifdef RT movddup 0 * SIZE(BO), %xmm4 movddup 1 * SIZE(BO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm5 #endif movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm4, %xmm8 mulpd %xmm5, %xmm12 addsubpd %xmm12, %xmm8 #endif #ifdef LN subq 
$2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm8, 0 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L149: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_3 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_LT_2x4_nehalem.S000066400000000000000000001433541313527062700230240ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define KK %rdx #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCHSIZE (16 * 1 + 4) #define PREFETCH prefetcht0 #define ADD1 addps #define ADD2 addps PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif subq $-32 * SIZE, A subq $-32 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K movq OLD_LDC, LDC movq OLD_OFFSET, KK salq $ZBASE_SHIFT, LDC movq KK, OFFSET negq KK #ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RT movq N, KK subq OFFSET, KK #endif movq N, J sarq $2, J NOBRANCH jle .L30 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $2 + ZBASE_SHIFT, %rax subq %rax, B leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 2), CO2 #ifndef RT leaq (C, LDC, 4), C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif movq K, %rax salq $ZBASE_SHIFT + 2, %rax leaq (B, %rax), BB #ifdef LT movq OFFSET, KK #endif movq M, I sarq $1, I NOBRANCH jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #else movq B, BO #endif prefetchnta -32 * SIZE(BB) subq $-16 * SIZE, BB xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht2 4 * SIZE(CO1, LDC, 1) xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 xorps %xmm12, %xmm12 prefetcht2 4 * SIZE(CO2) xorps %xmm13, %xmm13 prefetcht2 4 * SIZE(CO2, LDC, 1) xorps %xmm14, %xmm14 xorps %xmm15, %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm12 movaps -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pshufd 
$0xb1, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 ADD1 %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD1 %xmm5, %xmm10 ADD2 %xmm6, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm12 movaps -24 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pshufd $0xb1, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 ADD1 %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD1 %xmm5, %xmm10 ADD2 %xmm6, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pshufd $0xb1, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 ADD1 %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD1 %xmm5, %xmm10 ADD2 %xmm6, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm12 movaps -8 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pshufd $0xb1, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 ADD1 %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 subq $-32 * SIZE, BO pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD1 %xmm5, %xmm10 ADD2 %xmm6, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: ADD1 %xmm1, %xmm12 movaps -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pshufd $0xb1, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 ADD1 %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD1 %xmm5, %xmm10 ADD2 %xmm6, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif ADD1 %xmm1, %xmm12 ADD2 %xmm2, %xmm13 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 xorps %xmm0, %xmm12 xorps %xmm0, %xmm14 #else xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 xorps %xmm0, %xmm13 xorps %xmm0, %xmm15 #endif #else #ifndef CONJ xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 xorps %xmm0, %xmm12 xorps %xmm0, %xmm14 #else shufps $0xb1, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 xorps %xmm0, 
%xmm13 xorps %xmm0, %xmm15 #endif #endif haddps %xmm9, %xmm8 haddps %xmm11, %xmm10 haddps %xmm13, %xmm12 haddps %xmm15, %xmm14 shufps $0xd8, %xmm8, %xmm8 shufps $0xd8, %xmm10, %xmm10 shufps $0xd8, %xmm12, %xmm12 shufps $0xd8, %xmm14, %xmm14 movaps %xmm8, %xmm9 shufps $0xe4, %xmm10, %xmm8 shufps $0xe4, %xmm9, %xmm10 movaps %xmm12, %xmm13 shufps $0xe4, %xmm14, %xmm12 shufps $0xe4, %xmm13, %xmm14 #if defined(LN) || defined(LT) movaps %xmm8, %xmm9 movlhps %xmm10, %xmm8 movhlps %xmm9, %xmm10 movaps %xmm12, %xmm11 movlhps %xmm14, %xmm12 movhlps %xmm11, %xmm14 movaps -32 * SIZE(BO), %xmm9 movaps -28 * SIZE(BO), %xmm13 movaps -24 * SIZE(BO), %xmm11 movaps -20 * SIZE(BO), %xmm15 subps %xmm8, %xmm9 subps %xmm10, %xmm11 subps %xmm12, %xmm13 subps %xmm14, %xmm15 #else movaps -32 * SIZE(AO), %xmm9 movaps -28 * SIZE(AO), %xmm11 movaps -24 * SIZE(AO), %xmm13 movaps -20 * SIZE(AO), %xmm15 subps %xmm8, %xmm9 subps %xmm10, %xmm11 subps %xmm12, %xmm13 subps %xmm14, %xmm15 #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0xb1, %xmm7, %xmm7 #endif #ifdef LN movaps -28 * SIZE(AO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 pshufd $0xb1, %xmm15, %xmm14 xorps %xmm7, %xmm10 xorps %xmm7, %xmm14 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 mulps %xmm0, %xmm15 mulps %xmm1, %xmm14 addps %xmm10, %xmm11 addps %xmm14, %xmm15 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 movaps %xmm15, %xmm5 pshufd $0xb1, %xmm15, %xmm4 xorps %xmm7, %xmm2 xorps %xmm7, %xmm4 mulps %xmm0, %xmm3 mulps %xmm1, %xmm2 mulps %xmm0, %xmm5 mulps %xmm1, %xmm4 subps %xmm3, %xmm9 subps %xmm2, %xmm9 subps %xmm5, %xmm13 subps %xmm4, %xmm13 movaps -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 pshufd $0xb1, %xmm13, %xmm14 xorps %xmm7, %xmm10 xorps %xmm7, %xmm14 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 mulps %xmm0, %xmm13 mulps %xmm1, %xmm14 addps %xmm10, %xmm9 addps %xmm14, %xmm13 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 pshufd $0xb1, %xmm13, %xmm14 xorps %xmm7, %xmm10 xorps %xmm7, %xmm14 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 mulps %xmm0, %xmm13 mulps %xmm1, %xmm14 addps %xmm10, %xmm9 addps %xmm14, %xmm13 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 movaps %xmm9, %xmm3 pshufd $0xb1, %xmm9, %xmm2 movaps %xmm13, %xmm5 pshufd $0xb1, %xmm13, %xmm4 xorps %xmm7, %xmm2 xorps %xmm7, %xmm4 mulps %xmm0, %xmm3 mulps %xmm1, %xmm2 mulps %xmm0, %xmm5 mulps %xmm1, %xmm4 subps %xmm3, %xmm11 subps %xmm2, %xmm11 subps %xmm5, %xmm15 subps %xmm4, %xmm15 movaps -28 * SIZE(AO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 pshufd $0xb1, %xmm15, %xmm14 xorps %xmm7, %xmm10 xorps %xmm7, %xmm14 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 mulps %xmm0, %xmm15 mulps %xmm1, %xmm14 addps %xmm10, %xmm11 addps %xmm14, %xmm15 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 movaps %xmm9, %xmm3 pshufd $0xb1, %xmm9, %xmm2 xorps %xmm7, %xmm2 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 movaps -28 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm13 subps %xmm1, %xmm13 pshufd 
$0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm15 subps %xmm1, %xmm15 movaps -24 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 movaps -20 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm13 subps %xmm1, %xmm13 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm15 subps %xmm1, %xmm15 movaps -12 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm13, %xmm12 xorps %xmm7, %xmm12 mulps %xmm0, %xmm13 mulps %xmm1, %xmm12 addps %xmm12, %xmm13 movaps %xmm13, %xmm3 pshufd $0xb1, %xmm13, %xmm2 xorps %xmm7, %xmm2 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm15 subps %xmm1, %xmm15 movaps -4 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm15, %xmm14 xorps %xmm7, %xmm14 mulps %xmm0, %xmm15 mulps %xmm1, %xmm14 addps %xmm14, %xmm15 #endif #ifdef RT movaps -4 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm15, %xmm14 xorps %xmm7, %xmm14 mulps %xmm0, %xmm15 mulps %xmm1, %xmm14 addps %xmm14, %xmm15 movaps %xmm15, %xmm3 pshufd $0xb1, %xmm15, %xmm2 xorps %xmm7, %xmm2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm13 subps %xmm1, %xmm13 movaps -8 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm13, %xmm12 xorps %xmm7, %xmm12 mulps %xmm0, %xmm13 mulps %xmm1, %xmm12 addps %xmm12, %xmm13 movaps %xmm13, %xmm3 pshufd $0xb1, %xmm13, %xmm2 xorps %xmm7, %xmm2 movaps -16 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -24 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm9, -32 * SIZE(BO) movaps %xmm13, -28 * SIZE(BO) movaps %xmm11, -24 * SIZE(BO) movaps %xmm15, -20 * SIZE(BO) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm11, 2 * SIZE(CO1) movhps %xmm9, 0 * SIZE(CO1, LDC) movhps %xmm11, 2 * SIZE(CO1, LDC) movsd %xmm13, 0 * SIZE(CO2) movsd %xmm15, 2 * SIZE(CO2) movhps %xmm13, 0 * SIZE(CO2, LDC) movhps %xmm15, 2 * SIZE(CO2, LDC) 
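/* End of the LN/LT writeback for this 2x4 block: the solved values were stored back into the packed B buffer (BO) and scattered to C via CO1/CO2 and LDC; the #else branch below is the RN/RT variant, which writes back to the packed A buffer (AO) instead. */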
#else movaps %xmm9, -32 * SIZE(AO) movaps %xmm11, -28 * SIZE(AO) movaps %xmm13, -24 * SIZE(AO) movaps %xmm15, -20 * SIZE(AO) movsd %xmm9, 0 * SIZE(CO1) movhps %xmm9, 2 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO1, LDC) movhps %xmm11, 2 * SIZE(CO1, LDC) movsd %xmm13, 0 * SIZE(CO2) movhps %xmm13, 2 * SIZE(CO2) movsd %xmm15, 0 * SIZE(CO2, LDC) movhps %xmm15, 2 * SIZE(CO2, LDC) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- BRANCH jg .L11 ALIGN_4 .L20: testq $1, M BRANCH jle .L29 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movaps -32 * SIZE(BO), %xmm5 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_3 .L22: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm10 pshufd $0xa0, %xmm5, %xmm3 mulps %xmm0, %xmm3 ADD2 %xmm4, %xmm11 pshufd $0xf5, %xmm5, %xmm4 movaps -24 * SIZE(BO), %xmm5 mulps %xmm0, %xmm4 movddup -30 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm10 pshufd $0xa0, %xmm5, %xmm3 mulps %xmm0, %xmm3 ADD2 %xmm4, %xmm11 pshufd $0xf5, %xmm5, %xmm4 movaps -16 * SIZE(BO), %xmm5 mulps %xmm0, %xmm4 movddup -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -12 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm10 pshufd $0xa0, %xmm5, %xmm3 mulps %xmm0, %xmm3 ADD2 %xmm4, %xmm11 pshufd $0xf5, %xmm5, %xmm4 movaps -8 * SIZE(BO), %xmm5 mulps %xmm0, %xmm4 movddup -26 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -4 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm10 pshufd $0xa0, %xmm5, %xmm3 mulps %xmm0, %xmm3 ADD2 %xmm4, %xmm11 pshufd $0xf5, %xmm5, %xmm4 movaps 0 * SIZE(BO), %xmm5 mulps %xmm0, %xmm4 movddup -24 * SIZE(AO), %xmm0 subq $-32 * SIZE, BO subq $ -8 * SIZE, AO subq $1, %rax BRANCH jg .L22 ALIGN_3 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm10 pshufd $0xa0, %xmm5, %xmm3 mulps %xmm0, %xmm3 ADD2 %xmm4, %xmm11 pshufd $0xf5, %xmm5, %xmm4 movaps -24 * SIZE(BO), %xmm5 mulps %xmm0, %xmm4 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_3 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), 
BO #endif ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 ADD1 %xmm3, %xmm10 ADD2 %xmm4, %xmm11 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 #else xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 #endif #else #ifndef CONJ xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 #else shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 #endif #endif addps %xmm9, %xmm8 addps %xmm11, %xmm10 #if defined(LN) || defined(LT) movaps -32 * SIZE(BO), %xmm9 movaps -28 * SIZE(BO), %xmm11 subps %xmm8, %xmm9 subps %xmm10, %xmm11 #else movaps -32 * SIZE(AO), %xmm9 movaps -28 * SIZE(AO), %xmm13 subps %xmm8, %xmm9 subps %xmm10, %xmm13 movhlps %xmm9, %xmm11 movhlps %xmm13, %xmm15 #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0xb1, %xmm7, %xmm7 #endif #if defined(LN) || defined(LT) movsd -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 pshufd $0xb1, %xmm11, %xmm12 xorps %xmm7, %xmm10 xorps %xmm7, %xmm12 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm12 addps %xmm10, %xmm9 addps %xmm12, %xmm11 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 movaps %xmm9, %xmm3 pshufd $0xb1, %xmm9, %xmm2 xorps %xmm7, %xmm2 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 movaps -28 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm13 subps %xmm1, %xmm13 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm15 subps %xmm1, %xmm15 movaps -24 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 movaps -20 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm13 subps %xmm1, %xmm13 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm15 subps %xmm1, %xmm15 movaps -12 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm13, %xmm12 xorps %xmm7, %xmm12 mulps %xmm0, %xmm13 mulps %xmm1, %xmm12 addps %xmm12, %xmm13 movaps %xmm13, %xmm3 pshufd $0xb1, %xmm13, %xmm2 xorps %xmm7, %xmm2 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm15 subps %xmm1, %xmm15 movaps -4 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm15, %xmm14 xorps %xmm7, %xmm14 mulps %xmm0, %xmm15 mulps %xmm1, %xmm14 addps %xmm14, %xmm15 #endif #ifdef RT movaps -4 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm15, %xmm14 xorps %xmm7, %xmm14 mulps %xmm0, %xmm15 mulps %xmm1, %xmm14 addps %xmm14, %xmm15 movaps %xmm15, %xmm3 pshufd $0xb1, %xmm15, %xmm2 xorps %xmm7, %xmm2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, 
%xmm13 subps %xmm1, %xmm13 movaps -8 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm13, %xmm12 xorps %xmm7, %xmm12 mulps %xmm0, %xmm13 mulps %xmm1, %xmm12 addps %xmm12, %xmm13 movaps %xmm13, %xmm3 pshufd $0xb1, %xmm13, %xmm2 xorps %xmm7, %xmm2 movaps -16 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -24 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm9, -32 * SIZE(BO) movaps %xmm11, -28 * SIZE(BO) movsd %xmm9, (CO1) movhps %xmm9, (CO1, LDC) movsd %xmm11, (CO2) movhps %xmm11, (CO2, LDC) #else movlhps %xmm11, %xmm9 movlhps %xmm15, %xmm13 movaps %xmm9, -32 * SIZE(AO) movaps %xmm13, -28 * SIZE(AO) movlps %xmm9, (CO1) movlps %xmm11, (CO1, LDC) movlps %xmm13, (CO2) movlps %xmm15, (CO2, LDC) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L29: #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif subq $1, J BRANCH jg .L01 ALIGN_4 .L30: testq $2, N BRANCH jle .L50 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif #ifdef LT movq OFFSET, KK #endif movq M, I sarq $1, I NOBRANCH jle .L40 ALIGN_4 .L31: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht2 4 * SIZE(CO2) xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_3 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 ADD2 %xmm2, 
%xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm10 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm10 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm10 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm10 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $-16 * SIZE, AO subq $1, %rax BRANCH jg .L32 ALIGN_3 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: ADD1 %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm10 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_3 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 ADD1 %xmm3, %xmm10 ADD2 %xmm4, %xmm11 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 #else xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 #endif #else #ifndef CONJ xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 #else shufps $0xb1, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 #endif #endif haddps %xmm9, %xmm8 haddps %xmm11, %xmm10 shufps $0xd8, %xmm8, %xmm8 shufps $0xd8, %xmm10, %xmm10 movaps %xmm8, %xmm9 shufps $0xe4, %xmm10, %xmm8 shufps $0xe4, %xmm9, %xmm10 #if defined(LN) || defined(LT) movaps %xmm8, %xmm9 movlhps %xmm10, %xmm8 movhlps %xmm9, %xmm10 movaps -32 * SIZE(BO), %xmm9 movaps -28 * SIZE(BO), %xmm11 subps %xmm8, %xmm9 subps %xmm10, %xmm11 #else movaps -32 * SIZE(AO), %xmm9 movaps -28 * SIZE(AO), %xmm11 subps %xmm8, %xmm9 subps %xmm10, %xmm11 #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0xb1, %xmm7, %xmm7 #endif #ifdef LN movaps -28 * SIZE(AO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm2 subps %xmm3, %xmm9 subps %xmm2, %xmm9 movaps -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 addps %xmm10, %xmm9 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 
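/* Complex multiply by the packed diagonal entry: xmm0/xmm1 hold its broadcast real and imaginary parts, the 0xb1 shuffle above swaps the re/im halves of each operand pair, and the xmm7 sign mask flips one component so the mul/mul/add sequence yields the complex product (one factor conjugated when CONJ is defined). */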
xorps %xmm7, %xmm10 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 addps %xmm10, %xmm9 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 movaps %xmm9, %xmm3 pshufd $0xb1, %xmm9, %xmm2 xorps %xmm7, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm2 subps %xmm3, %xmm11 subps %xmm2, %xmm11 movaps -28 * SIZE(AO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 movaps %xmm9, %xmm3 pshufd $0xb1, %xmm9, %xmm2 xorps %xmm7, %xmm2 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 movaps -28 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 #endif #ifdef RT movaps -28 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm9, -32 * SIZE(BO) movaps %xmm11, -28 * SIZE(BO) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm11, 2 * SIZE(CO1) movhps %xmm9, 0 * SIZE(CO2) movhps %xmm11, 2 * SIZE(CO2) #else movaps %xmm9, -32 * SIZE(AO) movaps %xmm11, -28 * SIZE(AO) movsd %xmm9, 0 * SIZE(CO1) movhps %xmm9, 2 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO2) movhps %xmm11, 2 * SIZE(CO2) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- BRANCH jg .L31 ALIGN_4 .L40: testq $1, M BRANCH jle .L49 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movaps -32 * SIZE(BO), %xmm5 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L45 ALIGN_3 .L42: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -24 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, 
%xmm2 movaps -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -26 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -16 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -24 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $ -8 * SIZE, AO subq $1, %rax BRANCH jg .L42 ALIGN_3 .L45: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L46 ALIGN_3 .L48: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm9 shufps $0xb1, %xmm9, %xmm9 #else xorps %xmm0, %xmm8 shufps $0xb1, %xmm9, %xmm9 #endif #else #ifndef CONJ xorps %xmm0, %xmm9 shufps $0xb1, %xmm9, %xmm9 #else shufps $0xb1, %xmm9, %xmm9 xorps %xmm0, %xmm9 #endif #endif addps %xmm9, %xmm8 #if defined(LN) || defined(LT) movaps -32 * SIZE(BO), %xmm9 subps %xmm8, %xmm9 #else movaps -32 * SIZE(AO), %xmm9 subps %xmm8, %xmm9 movhlps %xmm9, %xmm11 #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0xb1, %xmm7, %xmm7 #endif #if defined(LN) || defined(LT) movsd -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 addps %xmm10, %xmm9 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 movaps %xmm9, %xmm3 pshufd $0xb1, %xmm9, %xmm2 xorps %xmm7, %xmm2 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 movaps -28 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 #endif #ifdef RT movaps -28 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm9, -32 * SIZE(BO) movlps %xmm9, (CO1) movhps %xmm9, (CO2) #else movlps %xmm9, -32 * SIZE(AO) movlps %xmm11, -30 * SIZE(AO) movlps %xmm9, (CO1) movlps %xmm11, (CO2) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax 
addq %rax, AORIG #endif ALIGN_4 .L49: #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L50: testq $1, N BRANCH jle .L999 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif #ifdef LT movq OFFSET, KK #endif movq M, I sarq $1, I NOBRANCH jle .L60 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_3 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 movddup -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movddup -30 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 mulps %xmm0, %xmm2 movaps -24 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movddup -28 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 mulps %xmm0, %xmm2 movaps -20 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movddup -26 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 mulps %xmm0, %xmm2 movaps -16 * SIZE(AO), %xmm0 subq $ -8 * SIZE, BO subq $-16 * SIZE, AO subq $1, %rax BRANCH jg .L52 ALIGN_3 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_3 .L56: ADD1 %xmm1, %xmm8 movddup -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_3 .L58: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm8 #else xorps %xmm0, %xmm9 #endif #else #ifndef CONJ xorps %xmm0, %xmm8 #else shufps $0xb1, %xmm0, %xmm0 xorps %xmm0, %xmm9 #endif #endif haddps %xmm9, %xmm8 shufps $0xd8, %xmm8, %xmm8 #if defined(LN) || defined(LT) movaps -32 * SIZE(BO), %xmm9 subps %xmm8, %xmm9 movhlps %xmm9, %xmm11 #else movaps -32 * SIZE(AO), %xmm9 subps %xmm8, %xmm9 #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0xb1, %xmm7, %xmm7 #endif #ifdef LN movaps -28 * SIZE(AO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm2 subps %xmm3, %xmm9 subps %xmm2, %xmm9 movaps -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, 
%xmm9, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 addps %xmm10, %xmm9 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 addps %xmm10, %xmm9 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 movaps %xmm9, %xmm3 pshufd $0xb1, %xmm9, %xmm2 xorps %xmm7, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm2 subps %xmm3, %xmm11 subps %xmm2, %xmm11 movaps -28 * SIZE(AO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 #endif #if defined(RN) || defined(RT) movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm9, -32 * SIZE(BO) movlps %xmm11, -30 * SIZE(BO) movlps %xmm9, 0 * SIZE(CO1) movlps %xmm11, 2 * SIZE(CO1) #else movaps %xmm9, -32 * SIZE(AO) movlps %xmm9, 0 * SIZE(CO1) movhps %xmm9, 2 * SIZE(CO1) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- BRANCH jg .L51 ALIGN_4 .L60: testq $1, M BRANCH jle .L69 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd -32 * SIZE(BO), %xmm5 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_3 .L62: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movsd -30 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movsd -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movsd -26 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -26 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movsd -24 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -24 * SIZE(AO), %xmm0 subq $-8 * SIZE, BO subq $-8 * SIZE, AO subq $1, %rax BRANCH jg .L62 ALIGN_3 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_3 .L66: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movsd -30 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_3 .L68: #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || 
defined(LT) #ifndef CONJ xorps %xmm0, %xmm9 shufps $0xb1, %xmm9, %xmm9 #else xorps %xmm0, %xmm8 shufps $0xb1, %xmm9, %xmm9 #endif #else #ifndef CONJ xorps %xmm0, %xmm9 shufps $0xb1, %xmm9, %xmm9 #else shufps $0xb1, %xmm9, %xmm9 xorps %xmm0, %xmm9 #endif #endif addps %xmm9, %xmm8 #if defined(LN) || defined(LT) movsd -32 * SIZE(BO), %xmm9 #else movsd -32 * SIZE(AO), %xmm9 #endif subps %xmm8, %xmm9 pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0xb1, %xmm7, %xmm7 #endif #if defined(LN) || defined(LT) movsd -32 * SIZE(AO), %xmm5 #endif #if defined(RN) || defined(RT) movsd -32 * SIZE(BO), %xmm5 #endif pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 addps %xmm10, %xmm9 #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm9, -32 * SIZE(BO) #else movlps %xmm9, -32 * SIZE(AO) #endif movlps %xmm9, (CO1) #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L69: #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S000066400000000000000000002140051313527062700221750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %rbp #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define POSINV 0(%rsp) #define OFFSET 16(%rsp) #define KK 24(%rsp) #define KKK 32(%rsp) #define AORIG 40(%rsp) #define BORIG 48(%rsp) #define BUFFER 128(%rsp) #ifdef OPTERON #define movsd movlps #endif #if defined(PENTIUM4) || defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(ATOM) || defined(NANO) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHNTA prefetchnta #define PREFETCHSIZE (8 * 6 + 4) #endif #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta #define PREFETCHSIZE (8 * 6 + 4) #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHNTA prefetchnta #define PREFETCHSIZE (8 * 6 + 4) #endif #define KERNEL1(xx) \ mulps %xmm8, %xmm9 ;\ addps %xmm9, %xmm0 ;\ movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulps %xmm8, %xmm11 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ addps %xmm11, %xmm1 ;\ movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm8, %xmm13 ;\ mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ addps %xmm13, %xmm2 ;\ movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm8, %xmm3 ;\ movaps 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 #define KERNEL2(xx) \ mulps %xmm10, %xmm9 ;\ addps %xmm9, %xmm4 ;\ movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulps %xmm10, %xmm11 ;\ addps %xmm11, %xmm5 ;\ movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm10, %xmm13 ;\ mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ addps %xmm13, %xmm6 ;\ movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm10, %xmm7 ;\ movaps 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 #define KERNEL3(xx) \ mulps %xmm12, %xmm15 ;\ addps %xmm15, %xmm0 ;\ movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulps %xmm12, %xmm11 ;\ addps %xmm11, %xmm1 ;\ movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm12, %xmm13 ;\ mulps 28 * SIZE + 
2 * (xx) * SIZE(BO), %xmm12 ;\ addps %xmm13, %xmm2 ;\ movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm12, %xmm3 ;\ movaps 24 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 #define KERNEL4(xx) \ mulps %xmm14, %xmm15 ;\ addps %xmm15, %xmm4 ;\ movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulps %xmm14, %xmm11 ;\ addps %xmm11, %xmm5 ;\ movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm14, %xmm13 ;\ mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ addps %xmm13, %xmm6 ;\ movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm14, %xmm7 ;\ movaps 28 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 #define KERNEL5(xx) \ mulps %xmm8, %xmm9 ;\ addps %xmm9, %xmm0 ;\ movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulps %xmm8, %xmm11 ;\ PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ addps %xmm11, %xmm1 ;\ movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm8, %xmm13 ;\ mulps 44 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ addps %xmm13, %xmm2 ;\ movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm8, %xmm3 ;\ movaps 32 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 #define KERNEL6(xx) \ mulps %xmm10, %xmm9 ;\ addps %xmm9, %xmm4 ;\ movaps 64 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulps %xmm10, %xmm11 ;\ addps %xmm11, %xmm5 ;\ movaps 52 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm10, %xmm13 ;\ mulps 44 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ addps %xmm13, %xmm6 ;\ movaps 56 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm10, %xmm7 ;\ movaps 36 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 #define KERNEL7(xx) \ mulps %xmm12, %xmm15 ;\ addps %xmm15, %xmm0 ;\ movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulps %xmm12, %xmm11 ;\ addps %xmm11, %xmm1 ;\ movaps 52 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm12, %xmm13 ;\ mulps 60 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ addps %xmm13, %xmm2 ;\ movaps 56 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm12, %xmm3 ;\ movaps 40 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 #define KERNEL8(xx) \ mulps %xmm14, %xmm15 ;\ addps %xmm15, %xmm4 ;\ movaps 80 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulps %xmm14, %xmm11 ;\ addps %xmm11, %xmm5 ;\ movaps 68 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm14, %xmm13 ;\ mulps 60 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ addps %xmm13, %xmm6 ;\ movaps 72 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm14, %xmm7 ;\ movaps 44 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 #else movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 #endif movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq OLD_M, M movq OLD_N, N pxor %xmm15, %xmm15 cmpeqps %xmm15, %xmm15 pslld $31, %xmm15 # Generate mask pxor %xmm2, %xmm2 #ifndef CONJ movss %xmm15, 0 + POSINV movss %xmm2, 4 + POSINV movss %xmm15, 8 + POSINV movss %xmm2, 12 + POSINV #else movss %xmm2, 0 + POSINV movss %xmm15, 4 + POSINV movss %xmm2, 8 + POSINV movss %xmm15, 12 + POSINV #endif movlpd %xmm4, OFFSET movlpd %xmm4, KK salq $ZBASE_SHIFT, LDC 
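/* LN solves from the last row and RT from the last column: below, the pointers into A and C (for LN) and into B and C (for RT) are advanced past their ends so the loops can walk backwards through the matrices. */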
#ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif movq N, J sarq $1, J # j = (n >> 2) jle .L40 ALIGN_4 .L01: #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B leaq (BO, %rax, 8), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L03 ALIGN_4 .L02: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) movaps 8 * SIZE(B), %xmm3 movaps 12 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 32 * SIZE(BO) movaps %xmm1, 36 * SIZE(BO) movaps %xmm2, 40 * SIZE(BO) movaps %xmm3, 44 * SIZE(BO) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm4, 48 * SIZE(BO) movaps %xmm5, 52 * SIZE(BO) movaps %xmm6, 56 * SIZE(BO) movaps %xmm7, 60 * SIZE(BO) addq $16 * SIZE, B addq $64 * SIZE, BO decq %rax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movaps 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) addq $ 4 * SIZE, B addq $16 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L10: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif movq M, I sarq $2, I # i = (m >> 2) jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(BO), %xmm9 movaps 4 * SIZE(BO), %xmm11 movaps 8 * SIZE(BO), %xmm13 movaps 16 * SIZE(BO), %xmm15 movaps 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movaps 4 * SIZE(AO), %xmm10 pxor %xmm1, %xmm1 movaps 8 * SIZE(AO), %xmm12 pxor %xmm2, %xmm2 movaps 12 * SIZE(AO), %xmm14 pxor %xmm3, %xmm3 PREFETCHW 7 * SIZE(CO1) pxor %xmm4, %xmm4 PREFETCHW 7 * SIZE(CO2) pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-8, %rax salq $4, %rax je .L15 .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) 
KERNEL8(32 * 0) KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) addq $32 * 2 * SIZE, AO addq $64 * 2 * SIZE, BO subq $64 * 2, %rax jg .L1X .L12: leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movaps POSINV, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 8 * SIZE(AO), %xmm8 mulps %xmm10, %xmm9 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm5 movaps 8 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 mulps 12 * SIZE(BO), %xmm10 addps %xmm9, %xmm6 movaps 16 * SIZE(BO), %xmm9 addps %xmm10, %xmm7 movaps 12 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L16 ALIGN_4 .L18: shufps $0xb1, %xmm1, %xmm1 shufps $0xb1, %xmm3, %xmm3 shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 xorps %xmm15, %xmm5 xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm2 xorps %xmm15, %xmm4 xorps %xmm15, %xmm6 #endif #else xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 xorps %xmm15, %xmm5 xorps %xmm15, %xmm7 #endif addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm5, %xmm4 addps %xmm7, %xmm6 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 2), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm1 unpcklpd %xmm2, %xmm0 unpckhpd %xmm2, %xmm1 movaps %xmm4, %xmm5 unpcklpd %xmm6, %xmm4 unpckhpd %xmm6, %xmm5 movaps 0 * SIZE(B), %xmm2 movaps 4 * SIZE(B), %xmm3 movaps 8 * SIZE(B), %xmm6 movaps 12 * SIZE(B), %xmm7 subps %xmm0, %xmm2 subps %xmm1, %xmm3 subps %xmm4, %xmm6 subps %xmm5, %xmm7 #else movaps 0 * SIZE(AO), %xmm1 movaps 4 * SIZE(AO), %xmm3 movaps 8 * SIZE(AO), %xmm5 movaps 12 * SIZE(AO), %xmm7 subps %xmm0, %xmm1 subps %xmm4, %xmm3 subps %xmm2, %xmm5 subps %xmm6, %xmm7 #endif #ifdef LN movaps 28 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm7 addps %xmm0, %xmm7 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm6 subps %xmm1, %xmm6 movaps 24 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 20 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, 
%xmm6 #ifndef CONJ xorps %xmm15, %xmm6 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm6 addps %xmm0, %xmm6 movaps 16 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 8 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 movaps 4 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm6 subps %xmm1, %xmm6 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm7 subps %xmm1, %xmm7 movaps 8 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 movaps 12 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm6 subps %xmm1, %xmm6 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm7 subps %xmm1, %xmm7 movaps 20 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm6 #ifndef CONJ xorps %xmm15, %xmm6 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm6 addps %xmm0, %xmm6 pshufd 
$0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm7 subps %xmm1, %xmm7 movaps 28 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm7 addps %xmm0, %xmm7 #endif #ifdef RN movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm0 pshufd $0xf5, %xmm1, %xmm1 pshufd $0xa0, %xmm3, %xmm2 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm2 #endif mulps %xmm9, %xmm0 mulps %xmm9, %xmm2 mulps %xmm10, %xmm1 mulps %xmm10, %xmm3 addps %xmm0, %xmm1 addps %xmm2, %xmm3 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm0 pshufd $0xf5, %xmm1, %xmm2 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm6 #ifndef CONJ xorps %xmm15, %xmm2 xorps %xmm15, %xmm6 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm0 mulps %xmm9, %xmm4 mulps %xmm10, %xmm2 mulps %xmm10, %xmm6 subps %xmm0, %xmm5 subps %xmm4, %xmm7 subps %xmm2, %xmm5 subps %xmm6, %xmm7 movaps 4 * SIZE(B), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm4 pshufd $0xf5, %xmm5, %xmm5 pshufd $0xa0, %xmm7, %xmm6 pshufd $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps %xmm15, %xmm5 xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm4 xorps %xmm15, %xmm6 #endif mulps %xmm9, %xmm4 mulps %xmm9, %xmm6 mulps %xmm10, %xmm5 mulps %xmm10, %xmm7 addps %xmm4, %xmm5 addps %xmm6, %xmm7 #endif #ifdef RT movaps 4 * SIZE(B), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm0 pshufd $0xf5, %xmm5, %xmm5 pshufd $0xa0, %xmm7, %xmm2 pshufd $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps %xmm15, %xmm5 xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm2 #endif mulps %xmm9, %xmm0 mulps %xmm9, %xmm2 mulps %xmm10, %xmm5 mulps %xmm10, %xmm7 addps %xmm0, %xmm5 addps %xmm2, %xmm7 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm0 pshufd $0xf5, %xmm5, %xmm2 pshufd $0xa0, %xmm7, %xmm4 pshufd $0xf5, %xmm7, %xmm6 #ifndef CONJ xorps %xmm15, %xmm2 xorps %xmm15, %xmm6 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm0 mulps %xmm9, %xmm4 mulps %xmm10, %xmm2 mulps %xmm10, %xmm6 subps %xmm0, %xmm1 subps %xmm4, %xmm3 subps %xmm2, %xmm1 subps %xmm6, %xmm3 movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm4 pshufd $0xf5, %xmm1, %xmm1 pshufd $0xa0, %xmm3, %xmm6 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm4 xorps %xmm15, %xmm6 #endif mulps %xmm9, %xmm4 mulps %xmm9, %xmm6 mulps %xmm10, %xmm1 mulps %xmm10, %xmm3 addps %xmm4, %xmm1 addps %xmm6, %xmm3 #endif #ifdef LN subq $8 * SIZE, CO1 subq $8 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm2, 0 * SIZE(B) movaps %xmm3, 4 * SIZE(B) movaps %xmm6, 8 * SIZE(B) movaps %xmm7, 12 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm5 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm4, 8 * SIZE(BO) movaps %xmm5, 12 * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm4 pshufd $0xff, %xmm3, %xmm5 
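/* Write-back of the solved 4x2 block (continued): on the LN/LT path the solved values are also broadcast into the expanded BO buffer for later passes and each register is split between the two C columns with movlps/movhps; on the RN/RT path the results go back into the packed AO panel and straight to C. */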
movaps %xmm0, 16 * SIZE(BO) movaps %xmm1, 20 * SIZE(BO) movaps %xmm4, 24 * SIZE(BO) movaps %xmm5, 28 * SIZE(BO) pshufd $0x00, %xmm6, %xmm0 pshufd $0x55, %xmm6, %xmm1 pshufd $0xaa, %xmm6, %xmm4 pshufd $0xff, %xmm6, %xmm5 movaps %xmm0, 32 * SIZE(BO) movaps %xmm1, 36 * SIZE(BO) movaps %xmm4, 40 * SIZE(BO) movaps %xmm5, 44 * SIZE(BO) pshufd $0x00, %xmm7, %xmm0 pshufd $0x55, %xmm7, %xmm1 pshufd $0xaa, %xmm7, %xmm4 pshufd $0xff, %xmm7, %xmm5 movaps %xmm0, 48 * SIZE(BO) movaps %xmm1, 52 * SIZE(BO) movaps %xmm4, 56 * SIZE(BO) movaps %xmm5, 60 * SIZE(BO) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) movlps %xmm6, 4 * SIZE(CO1) movlps %xmm7, 6 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO2) movhps %xmm3, 2 * SIZE(CO2) movhps %xmm6, 4 * SIZE(CO2) movhps %xmm7, 6 * SIZE(CO2) #else movaps %xmm1, 0 * SIZE(AO) movaps %xmm3, 4 * SIZE(AO) movaps %xmm5, 8 * SIZE(AO) movaps %xmm7, 12 * SIZE(AO) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm3, 4 * SIZE(CO1) movhps %xmm3, 6 * SIZE(CO1) movlps %xmm5, 0 * SIZE(CO2) movhps %xmm5, 2 * SIZE(CO2) movlps %xmm7, 4 * SIZE(CO2) movhps %xmm7, 6 * SIZE(CO2) #endif #ifndef LN addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO #ifdef LT addq $16 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L11 ALIGN_4 .L20: testq $2, M je .L30 #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 32 * SIZE(AO), %xmm12 movaps 48 * SIZE(AO), %xmm14 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps 8 * SIZE(AO), %xmm8 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 mulps 44 * SIZE(BO), %xmm8 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm8, %xmm3 movaps 12 * SIZE(AO), %xmm8 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 mulps 60 * SIZE(BO), %xmm8 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm8, %xmm3 movaps 32 * SIZE(AO), %xmm8 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) 
* SIZE(AO) #endif mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movaps 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movaps 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 mulps 76 * SIZE(BO), %xmm10 addps %xmm9, %xmm2 movaps 128 * SIZE(BO), %xmm9 addps %xmm10, %xmm3 movaps 20 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movaps 84 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movaps 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 mulps 92 * SIZE(BO), %xmm10 addps %xmm11, %xmm2 movaps 144 * SIZE(BO), %xmm11 addps %xmm10, %xmm3 movaps 24 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movaps 104 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 mulps 108 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 160 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps 28 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movaps 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 mulps 124 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 176 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 48 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movaps POSINV, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 16 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 4 * SIZE(AO), %xmm8 addq $ 4 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L26 ALIGN_4 .L28: shufps $0xb1, %xmm1, %xmm1 shufps $0xb1, %xmm3, %xmm3 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm2 #endif #else xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #endif addps %xmm1, %xmm0 addps %xmm3, %xmm2 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm1 unpcklpd %xmm2, %xmm0 unpckhpd %xmm2, %xmm1 movaps 0 * SIZE(B), %xmm2 movaps 4 * SIZE(B), %xmm3 subps %xmm0, %xmm2 subps %xmm1, %xmm3 #else movaps 0 * SIZE(AO), %xmm1 movaps 4 * SIZE(AO), %xmm5 subps %xmm0, %xmm1 subps %xmm2, %xmm5 #endif #ifdef LN movaps 4 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 
#else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 movaps 4 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 #endif #ifdef RN movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm0 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 addps %xmm0, %xmm1 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm0 pshufd $0xf5, %xmm1, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 subps %xmm0, %xmm5 subps %xmm2, %xmm5 movaps 4 * SIZE(B), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm4 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm15, %xmm5 #else xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm4 mulps %xmm10, %xmm5 addps %xmm4, %xmm5 #endif #ifdef RT movaps 4 * SIZE(B), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm0 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm15, %xmm5 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm5 addps %xmm0, %xmm5 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm0 pshufd $0xf5, %xmm5, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 subps %xmm0, %xmm1 subps %xmm2, %xmm1 movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm4 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm4 mulps %xmm10, %xmm1 addps %xmm4, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm2, 0 * SIZE(B) movaps %xmm3, 4 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm5 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm4, 8 * SIZE(BO) movaps %xmm5, 12 * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm4 pshufd $0xff, %xmm3, %xmm5 movaps %xmm0, 16 * SIZE(BO) movaps %xmm1, 20 * SIZE(BO) movaps %xmm4, 24 * SIZE(BO) movaps %xmm5, 28 * SIZE(BO) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO2) movhps %xmm3, 2 * SIZE(CO2) #else movaps %xmm1, 0 * SIZE(AO) movaps %xmm5, 4 * SIZE(AO) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm5, 0 * SIZE(CO2) movhps %xmm5, 2 * SIZE(CO2) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: testq $1, M je .L39 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) 
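/* M & 1 tail of the two-column panel: for LN/RT, reposition AO to row KK of the packed A data and BO to the matching offset in the expanded B buffer before accumulating. */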
movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movlps 0 * SIZE(AO), %xmm8 movhps 2 * SIZE(AO), %xmm8 movlps 8 * SIZE(AO), %xmm10 movhps 10 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L35 ALIGN_4 .L32: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movaps 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd 4 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movaps 80 * SIZE(BO), %xmm11 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm2 movaps 44 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 movsd 6 * SIZE(AO), %xmm8 addps %xmm13, %xmm3 movaps 96 * SIZE(BO), %xmm13 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm2 movaps 60 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 movsd 16 * SIZE(AO), %xmm8 addps %xmm15, %xmm3 movaps 112 * SIZE(BO), %xmm15 mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movaps 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movaps 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm2 movaps 76 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movsd 10 * SIZE(AO), %xmm10 addps %xmm9, %xmm3 movaps 128 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movaps 84 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movaps 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm2 movaps 92 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd 12 * SIZE(AO), %xmm10 addps %xmm11, %xmm3 movaps 144 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movaps 104 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movaps 108 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd 14 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movaps 160 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movaps 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movaps 124 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd 24 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movaps 176 * SIZE(BO), %xmm15 addq $16 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movaps POSINV, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 
* SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 16 * SIZE(BO), %xmm9 addq $ 2 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L36 ALIGN_4 .L38: shufps $0xb1, %xmm1, %xmm1 shufps $0xb1, %xmm3, %xmm3 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm2 #endif #else xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #endif addps %xmm1, %xmm0 addps %xmm3, %xmm2 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) unpcklpd %xmm2, %xmm0 movaps 0 * SIZE(B), %xmm2 subps %xmm0, %xmm2 #else #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(AO), %xmm1 #ifdef movsd xorps %xmm5, %xmm5 #endif movsd 2 * SIZE(AO), %xmm5 subps %xmm0, %xmm1 subps %xmm2, %xmm5 #endif #ifdef LN movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 #endif #ifdef RN movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm0 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 addps %xmm0, %xmm1 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm0 pshufd $0xf5, %xmm1, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 subps %xmm0, %xmm5 subps %xmm2, %xmm5 movaps 4 * SIZE(B), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm4 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm15, %xmm5 #else xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm4 mulps %xmm10, %xmm5 addps %xmm4, %xmm5 #endif #ifdef RT movaps 4 * SIZE(B), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm0 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm15, %xmm5 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm5 addps %xmm0, %xmm5 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm0 pshufd $0xf5, %xmm5, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 subps %xmm0, %xmm1 subps %xmm2, %xmm1 movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm4 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm4 mulps %xmm10, %xmm1 addps %xmm4, %xmm1 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm2, 0 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm5 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm4, 8 * SIZE(BO) movaps %xmm5, 12 * SIZE(BO) movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO2) #else movlps %xmm1, 0 * 
SIZE(AO) movlps %xmm5, 2 * SIZE(AO) movlps %xmm1, 0 * SIZE(CO1) movlps %xmm5, 0 * SIZE(CO2) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2 * COMPSIZE), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 2 * COMPSIZE), B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif decq J # j -- jg .L01 ALIGN_4 .L40: testq $1, N je .L999 ALIGN_4 #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L43 ALIGN_4 .L42: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L42 ALIGN_4 .L43: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L50 ALIGN_4 .L44: movsd 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) addq $2 * SIZE, B addq $8 * SIZE, BO decq %rax jne .L44 ALIGN_4 .L50: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT subq LDC, C #endif movq C, CO1 # coffset1 = c #ifndef RT addq LDC, C #endif movq M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $2 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 32 * SIZE(AO), %xmm12 movaps 48 * SIZE(AO), %xmm14 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 PREFETCHW 4 * SIZE(CO1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L55 ALIGN_4 .L52: mulps %xmm8, %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 8 * SIZE(AO), %xmm8 mulps 
%xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 12 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 64 * SIZE(AO), %xmm8 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 16 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps 20 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps 24 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps 28 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 80 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps 80 * SIZE(AO), %xmm10 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 32 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 36 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm5 movaps 40 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 44 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 movaps 96 * SIZE(BO), %xmm13 addps %xmm12, %xmm5 movaps 96 * SIZE(AO), %xmm12 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 48 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 52 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 56 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 60 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 movaps 112 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 112 * SIZE(AO), %xmm14 addq $64 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movaps POSINV, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 8 * SIZE(AO), %xmm8 addq $ 8 * SIZE, AO # aoffset += 4 addq $ 8 * SIZE, BO # boffset1 += 8 decq %rax jg .L56 ALIGN_4 .L58: shufps $0xb1, %xmm1, %xmm1 shufps $0xb1, %xmm5, %xmm5 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm15, %xmm1 xorps %xmm15, %xmm5 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm4 #endif #else xorps %xmm15, %xmm1 xorps %xmm15, %xmm5 #endif addps %xmm1, %xmm0 addps %xmm5, %xmm4 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm1 unpcklpd %xmm2, %xmm0 unpckhpd %xmm2, %xmm1 
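/* Interleave the accumulated products into per-row (real, imag) pairs, then load the pending right-hand-side values (from B for LN/LT, from AO for RN/RT) and subtract the update before solving this diagonal block. */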
movaps %xmm4, %xmm5 unpcklpd %xmm6, %xmm4 unpckhpd %xmm6, %xmm5 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 0 * SIZE(B), %xmm2 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 2 * SIZE(B), %xmm3 #ifdef movsd xorps %xmm6, %xmm6 #endif movsd 4 * SIZE(B), %xmm6 #ifdef movsd xorps %xmm7, %xmm7 #endif movsd 6 * SIZE(B), %xmm7 subps %xmm0, %xmm2 subps %xmm1, %xmm3 subps %xmm4, %xmm6 subps %xmm5, %xmm7 #else movaps 0 * SIZE(AO), %xmm1 movaps 4 * SIZE(AO), %xmm3 subps %xmm0, %xmm1 subps %xmm4, %xmm3 #endif #ifdef LN movaps 28 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm7 addps %xmm0, %xmm7 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm6 subps %xmm1, %xmm6 movaps 24 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 20 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm6 #ifndef CONJ xorps %xmm15, %xmm6 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm6 addps %xmm0, %xmm6 movaps 16 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 8 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps 
%xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 movaps 4 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm6 subps %xmm1, %xmm6 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm7 subps %xmm1, %xmm7 movaps 8 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 movaps 12 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm6 subps %xmm1, %xmm6 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm7 subps %xmm1, %xmm7 movaps 20 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm6 #ifndef CONJ xorps %xmm15, %xmm6 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm6 addps %xmm0, %xmm6 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm7 subps %xmm1, %xmm7 movaps 28 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm7 addps %xmm0, %xmm7 #endif #if defined(RN) || defined(RT) movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm4 pshufd $0xf5, %xmm1, %xmm1 pshufd $0xa0, %xmm3, %xmm6 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm4 xorps %xmm15, %xmm6 #endif mulps %xmm9, %xmm4 mulps %xmm9, %xmm6 mulps %xmm10, %xmm1 mulps %xmm10, %xmm3 addps %xmm4, %xmm1 addps %xmm6, %xmm3 #endif #ifdef LN subq $8 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) movlps %xmm3, 2 * SIZE(B) movlps %xmm6, 4 * SIZE(B) movlps %xmm7, 6 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 8 * SIZE(BO) movaps %xmm1, 12 * SIZE(BO) pshufd $0x00, %xmm6, %xmm0 pshufd $0x55, %xmm6, %xmm1 movaps %xmm0, 16 * SIZE(BO) movaps %xmm1, 20 * SIZE(BO) pshufd $0x00, %xmm7, %xmm0 pshufd $0x55, %xmm7, %xmm1 movaps %xmm0, 24 * SIZE(BO) movaps %xmm1, 28 * SIZE(BO) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) movlps %xmm6, 4 * SIZE(CO1) movlps %xmm7, 6 * SIZE(CO1) #else movaps %xmm1, 0 * SIZE(AO) movaps %xmm3, 4 * SIZE(AO) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm3, 4 * SIZE(CO1) movhps %xmm3, 6 * SIZE(CO1) #endif 
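/* Bookkeeping after the 4x1 block: advance CO1 (LN already stepped it backwards), move AO and B past the consumed panel data, and adjust KK by the block height according to the LN/LT/RT variant in use. */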
#ifndef LN addq $8 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L51 ALIGN_4 .L60: testq $2, M je .L70 #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulps %xmm8, %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 8 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 20 * SIZE(BO), %xmm8 addps %xmm11, %xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm8, %xmm1 movaps 12 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps 32 * SIZE(AO), %xmm8 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm13 mulps 36 * SIZE(BO), %xmm10 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm10, %xmm1 movaps 20 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 mulps 44 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps 24 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 52 * SIZE(BO), %xmm10 addps %xmm15, %xmm0 movaps 56 * SIZE(BO), %xmm15 addps %xmm10, %xmm1 movaps 28 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 60 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 48 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movaps POSINV, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L68: addps %xmm2, %xmm0 addps %xmm3, %xmm1 shufps $0xb1, %xmm1, %xmm1 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif #else xorps %xmm15, %xmm1 #endif addps %xmm1, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm1 unpcklpd %xmm2, %xmm0 unpckhpd %xmm2, %xmm1 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 0 * SIZE(B), %xmm2 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 2 * SIZE(B), 
%xmm3 subps %xmm0, %xmm2 subps %xmm1, %xmm3 #else movaps 0 * SIZE(AO), %xmm1 subps %xmm0, %xmm1 #endif #ifdef LN movaps 4 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 movaps 4 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 #endif #if defined(RN) || defined(RT) movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm4 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm4 mulps %xmm10, %xmm1 addps %xmm4, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) movlps %xmm3, 2 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 8 * SIZE(BO) movaps %xmm1, 12 * SIZE(BO) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) #else movaps %xmm1, 0 * SIZE(AO) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L70: testq $1, M je .L79 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movsd 0 * SIZE(AO), %xmm8 movhps 2 * SIZE(AO), %xmm8 movsd 8 * SIZE(AO), %xmm10 movhps 10 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L75 
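/* .L72: k-loop for the M & 1 row of the single-column panel, unrolled 8x; partial products accumulate in xmm0-xmm3 and are folded together after the .L76 remainder loop. */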
ALIGN_4 .L72: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd 6 * SIZE(AO), %xmm8 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movaps 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd 16 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movaps 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd 10 * SIZE(AO), %xmm10 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movaps 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd 12 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movaps 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd 14 * SIZE(AO), %xmm10 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movaps 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd 24 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movaps 112 * SIZE(BO), %xmm15 addq $16 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movaps POSINV, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L76 ALIGN_4 .L78: addps %xmm2, %xmm0 addps %xmm3, %xmm1 shufps $0xb1, %xmm1, %xmm1 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif #else xorps %xmm15, %xmm1 #endif addps %xmm1, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 0 * SIZE(B), %xmm2 subps %xmm0, %xmm2 #else #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(AO), %xmm1 subps %xmm0, %xmm1 #endif #if defined(LN) || defined(LT) movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 #endif #if defined(RN) || defined(RT) movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm4 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm4 mulps %xmm10, %xmm1 addps %xmm4, %xmm1 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movlps %xmm2, 0 * SIZE(CO1) #else movlps %xmm1, 0 * SIZE(AO) movlps %xmm1, 0 * SIZE(CO1) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax 
subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L79: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, COMPSIZE), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, COMPSIZE), B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L999: movq %rbx, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_LT_bulldozer.c000066400000000000000000000315071313527062700227340ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include "common.h" static FLOAT dm1 = -1.; #ifdef CONJ #define GEMM_KERNEL GEMM_KERNEL_L #else #define GEMM_KERNEL GEMM_KERNEL_N #endif #if GEMM_DEFAULT_UNROLL_M == 1 #define GEMM_UNROLL_M_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_M == 2 #define GEMM_UNROLL_M_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_M == 4 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 6 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_M == 16 #define GEMM_UNROLL_M_SHIFT 4 #endif #if GEMM_DEFAULT_UNROLL_N == 1 #define GEMM_UNROLL_N_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_N == 2 #define GEMM_UNROLL_N_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_N == 4 #define GEMM_UNROLL_N_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_N == 8 #define GEMM_UNROLL_N_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_N == 16 #define GEMM_UNROLL_N_SHIFT 4 #endif #ifndef CONJ static void ztrsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); static void ztrsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) { FLOAT *c1 = c + ldc*2 ; BLASLONG n1 = n * 4; BLASLONG i=0; __asm__ __volatile__ ( " vzeroupper \n\t" " prefetcht0 (%4) \n\t" " prefetcht0 (%5) \n\t" " vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" " vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" " vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" " vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" " vxorpd %%xmm12, %%xmm12, %%xmm12 \n\t" " vxorpd %%xmm13, %%xmm13, %%xmm13 \n\t" " vxorpd %%xmm14, %%xmm14, %%xmm14 \n\t" " vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t" " cmpq $0, %0 \n\t" " je 3f \n\t" " .align 16 \n\t" "1: \n\t" " prefetcht0 256(%3,%1,8) \n\t" " prefetcht0 256(%2,%1,8) \n\t" " vmovddup (%3,%1,8), %%xmm0 \n\t" // b0 real, b0 real " vmovddup 8(%3,%1,8), %%xmm1 \n\t" // b0 imag, b0 imag " vmovups (%2,%1,8), %%xmm4 \n\t" // a0 real , a0 imag " vmovups 16(%2,%1,8), %%xmm5 \n\t" // a1 real , a1 imag " vmovddup 16(%3,%1,8), %%xmm2 \n\t" // b1 real, b1 real " vmovddup 24(%3,%1,8), %%xmm3 \n\t" // b1 imag, b1 imag " vfnmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm4 , %%xmm9 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm10, %%xmm0 , %%xmm5 , %%xmm10 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm11, %%xmm1 , %%xmm5 , %%xmm11 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm12, %%xmm2 , %%xmm4 , %%xmm12 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm13, %%xmm3 , %%xmm4 , %%xmm13 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm14, %%xmm2 , %%xmm5 , %%xmm14 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm15, %%xmm3 , %%xmm5 , %%xmm15 \n\t" // a_real * b_imag , a_imag * b_imag " addq $4, %1 \n\t" " cmpq %1, %0 \n\t" " jz 2f \n\t" " vmovddup (%3,%1,8), %%xmm0 \n\t" // b0 real, b0 real " vmovddup 8(%3,%1,8), %%xmm1 \n\t" // b0 imag, b0 imag " vmovups (%2,%1,8), %%xmm4 \n\t" // a0 real , a0 imag " vmovups 16(%2,%1,8), %%xmm5 \n\t" // a1 real , a1 imag " vmovddup 16(%3,%1,8), %%xmm2 \n\t" // b1 real, b1 real " vmovddup 24(%3,%1,8), %%xmm3 \n\t" // b1 imag, b1 imag " vfnmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm4 , %%xmm9 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm10, %%xmm0 , %%xmm5 , %%xmm10 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd 
%%xmm11, %%xmm1 , %%xmm5 , %%xmm11 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm12, %%xmm2 , %%xmm4 , %%xmm12 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm13, %%xmm3 , %%xmm4 , %%xmm13 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm14, %%xmm2 , %%xmm5 , %%xmm14 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm15, %%xmm3 , %%xmm5 , %%xmm15 \n\t" // a_real * b_imag , a_imag * b_imag " addq $4, %1 \n\t" " cmpq %1, %0 \n\t" " jnz 1b \n\t" "2: \n\t" " vshufpd $0x01 , %%xmm9 , %%xmm9, %%xmm9 \n\t" " vshufpd $0x01 , %%xmm11 , %%xmm11 , %%xmm11 \n\t" " vshufpd $0x01 , %%xmm13 , %%xmm13 , %%xmm13 \n\t" " vshufpd $0x01 , %%xmm15 , %%xmm15 , %%xmm15 \n\t" " vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" " vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" " vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t" " vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t" " vxorpd %%xmm7 , %%xmm7 , %%xmm7 \n\t" " vaddsubpd %%xmm8 , %%xmm7 , %%xmm8 \n\t" " vaddsubpd %%xmm10, %%xmm7 , %%xmm10 \n\t" " vaddsubpd %%xmm12, %%xmm7 , %%xmm12 \n\t" " vaddsubpd %%xmm14, %%xmm7 , %%xmm14 \n\t" " vmovups (%4) , %%xmm0 \n\t" " vmovups 16(%4) , %%xmm1 \n\t" " vmovups (%5) , %%xmm4 \n\t" " vmovups 16(%5) , %%xmm5 \n\t" " vaddpd %%xmm0 , %%xmm8 , %%xmm8 \n\t" " vaddpd %%xmm1 , %%xmm10, %%xmm10 \n\t" " vaddpd %%xmm4 , %%xmm12, %%xmm12 \n\t" " vaddpd %%xmm5 , %%xmm14, %%xmm14 \n\t" " vmovups %%xmm8 , (%4) \n\t" " vmovups %%xmm10 ,16(%4) \n\t" " vmovups %%xmm12 , (%5) \n\t" " vmovups %%xmm14 ,16(%5) \n\t" "3: \n\t" " vzeroupper \n\t" : : "r" (n1), // 0 "a" (i), // 1 "r" (a), // 2 "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 "r" (as), // 6 "r" (bs) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #endif #ifndef COMPLEX static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa, bb; int i, j, k; for (i = 0; i < m; i++) { aa = *(a + i); for (j = 0; j < n; j ++) { bb = *(c + i + j * ldc); bb *= aa; *b = bb; *(c + i + j * ldc) = bb; b ++; for (k = i + 1; k < m; k ++){ *(c + k + j * ldc) -= bb * *(a + k); } } a += m; } } #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa1, aa2; FLOAT bb1, bb2; FLOAT cc1, cc2; int i, j, k; ldc *= 2; for (i = 0; i < m; i++) { aa1 = *(a + i * 2 + 0); aa2 = *(a + i * 2 + 1); for (j = 0; j < n; j ++) { bb1 = *(c + i * 2 + 0 + j * ldc); bb2 = *(c + i * 2 + 1 + j * ldc); #ifndef CONJ cc1 = aa1 * bb1 - aa2 * bb2; cc2 = aa1 * bb2 + aa2 * bb1; #else cc1 = aa1 * bb1 + aa2 * bb2; cc2 = aa1 * bb2 - aa2 * bb1; #endif *(b + 0) = cc1; *(b + 1) = cc2; *(c + i * 2 + 0 + j * ldc) = cc1; *(c + i * 2 + 1 + j * ldc) = cc2; b += 2; for (k = i + 1; k < m; k ++){ #ifndef CONJ *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); #else *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); #endif } } a += m * 2; } } #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #ifdef COMPLEX FLOAT dummy2, #endif FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ FLOAT *aa, *cc; BLASLONG kk; BLASLONG i, j, jj; #if 0 fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", m, n, k, offset); #endif jj = 0; j = (n >> GEMM_UNROLL_N_SHIFT); while 
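/*
 * Outer driver, roughly: blocked forward substitution over the packed panels.
 *
 *   for each GEMM_UNROLL_N-wide panel of B/C:              <- this loop
 *     kk = offset;
 *     for each GEMM_UNROLL_M block of rows (top to bottom):
 *       if (kk > 0) C(block,:) -= A(block, 0:kk) * Bsolved(0:kk,:);  // GEMM_KERNEL with dm1, or the fused ztrsm_LT_solve_opt update
 *       solve();   // triangular solve of the diagonal block, written back into b and c
 *       kk += GEMM_UNROLL_M;
 *     remaining rows (m & (GEMM_UNROLL_M - 1)) are handled the same way with smaller blocks.
 */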
(j > 0) { kk = offset; aa = a; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); while (i > 0) { #ifdef CONJ if (kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); #else ztrsm_LT_solve_opt(kk, aa, b, cc, ldc, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE); solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); #endif aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; kk += GEMM_UNROLL_M; i --; } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(i, GEMM_UNROLL_N, aa + kk * i * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; kk += i; } i >>= 1; } } b += GEMM_UNROLL_N * k * COMPSIZE; c += GEMM_UNROLL_N * ldc * COMPSIZE; j --; jj += GEMM_UNROLL_M; } if (n & (GEMM_UNROLL_N - 1)) { j = (GEMM_UNROLL_N >> 1); while (j > 0) { if (n & j) { kk = offset; aa = a; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); while (i > 0) { if (kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(GEMM_UNROLL_M, j, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; kk += GEMM_UNROLL_M; i --; } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { GEMM_KERNEL(i, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(i, j, aa + kk * i * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; kk += i; } i >>= 1; } } b += j * k * COMPSIZE; c += j * ldc * COMPSIZE; } j >>= 1; } } return 0; } OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_RN_bulldozer.c000066400000000000000000000315121313527062700227300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include "common.h" static FLOAT dm1 = -1.; #ifdef CONJ #define GEMM_KERNEL GEMM_KERNEL_R #else #define GEMM_KERNEL GEMM_KERNEL_N #endif #if GEMM_DEFAULT_UNROLL_M == 1 #define GEMM_UNROLL_M_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_M == 2 #define GEMM_UNROLL_M_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_M == 4 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 6 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_M == 16 #define GEMM_UNROLL_M_SHIFT 4 #endif #if GEMM_DEFAULT_UNROLL_N == 1 #define GEMM_UNROLL_N_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_N == 2 #define GEMM_UNROLL_N_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_N == 4 #define GEMM_UNROLL_N_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_N == 8 #define GEMM_UNROLL_N_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_N == 16 #define GEMM_UNROLL_N_SHIFT 4 #endif #ifndef CONJ static void ztrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); static void ztrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) { FLOAT *c1 = c + ldc*2 ; BLASLONG n1 = n * 4; BLASLONG i=0; __asm__ __volatile__ ( " vzeroupper \n\t" " prefetcht0 (%4) \n\t" " prefetcht0 (%5) \n\t" " vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" " vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" " vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" " vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" " vxorpd %%xmm12, %%xmm12, %%xmm12 \n\t" " vxorpd %%xmm13, %%xmm13, %%xmm13 \n\t" " vxorpd %%xmm14, %%xmm14, %%xmm14 \n\t" " vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t" " cmpq $0, %0 \n\t" " je 3f \n\t" " .align 16 \n\t" "1: \n\t" " prefetcht0 256(%3,%1,8) \n\t" " prefetcht0 256(%2,%1,8) \n\t" " vmovddup (%3,%1,8), %%xmm0 \n\t" // b0 real, b0 real " vmovddup 8(%3,%1,8), %%xmm1 \n\t" // b0 imag, b0 imag " vmovups (%2,%1,8), %%xmm4 \n\t" // a0 real , a0 imag " vmovups 16(%2,%1,8), %%xmm5 \n\t" // a1 real , a1 imag " vmovddup 16(%3,%1,8), %%xmm2 \n\t" // b1 real, b1 real " vmovddup 24(%3,%1,8), %%xmm3 \n\t" // b1 imag, b1 imag " vfnmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm4 , %%xmm9 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm10, %%xmm0 , %%xmm5 , %%xmm10 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm11, %%xmm1 , %%xmm5 , %%xmm11 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm12, %%xmm2 , %%xmm4 , %%xmm12 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm13, %%xmm3 , %%xmm4 , %%xmm13 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd 
%%xmm14, %%xmm2 , %%xmm5 , %%xmm14 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm15, %%xmm3 , %%xmm5 , %%xmm15 \n\t" // a_real * b_imag , a_imag * b_imag " addq $4, %1 \n\t" " cmpq %1, %0 \n\t" " jz 2f \n\t" " vmovddup (%3,%1,8), %%xmm0 \n\t" // b0 real, b0 real " vmovddup 8(%3,%1,8), %%xmm1 \n\t" // b0 imag, b0 imag " vmovups (%2,%1,8), %%xmm4 \n\t" // a0 real , a0 imag " vmovups 16(%2,%1,8), %%xmm5 \n\t" // a1 real , a1 imag " vmovddup 16(%3,%1,8), %%xmm2 \n\t" // b1 real, b1 real " vmovddup 24(%3,%1,8), %%xmm3 \n\t" // b1 imag, b1 imag " vfnmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm4 , %%xmm9 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm10, %%xmm0 , %%xmm5 , %%xmm10 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm11, %%xmm1 , %%xmm5 , %%xmm11 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm12, %%xmm2 , %%xmm4 , %%xmm12 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm13, %%xmm3 , %%xmm4 , %%xmm13 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm14, %%xmm2 , %%xmm5 , %%xmm14 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm15, %%xmm3 , %%xmm5 , %%xmm15 \n\t" // a_real * b_imag , a_imag * b_imag " addq $4, %1 \n\t" " cmpq %1, %0 \n\t" " jnz 1b \n\t" "2: \n\t" " vshufpd $0x01 , %%xmm9 , %%xmm9, %%xmm9 \n\t" " vshufpd $0x01 , %%xmm11 , %%xmm11 , %%xmm11 \n\t" " vshufpd $0x01 , %%xmm13 , %%xmm13 , %%xmm13 \n\t" " vshufpd $0x01 , %%xmm15 , %%xmm15 , %%xmm15 \n\t" " vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" " vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" " vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t" " vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t" " vxorpd %%xmm7 , %%xmm7 , %%xmm7 \n\t" " vaddsubpd %%xmm8 , %%xmm7 , %%xmm8 \n\t" " vaddsubpd %%xmm10, %%xmm7 , %%xmm10 \n\t" " vaddsubpd %%xmm12, %%xmm7 , %%xmm12 \n\t" " vaddsubpd %%xmm14, %%xmm7 , %%xmm14 \n\t" " vmovups (%4) , %%xmm0 \n\t" " vmovups 16(%4) , %%xmm1 \n\t" " vmovups (%5) , %%xmm4 \n\t" " vmovups 16(%5) , %%xmm5 \n\t" " vaddpd %%xmm0 , %%xmm8 , %%xmm8 \n\t" " vaddpd %%xmm1 , %%xmm10, %%xmm10 \n\t" " vaddpd %%xmm4 , %%xmm12, %%xmm12 \n\t" " vaddpd %%xmm5 , %%xmm14, %%xmm14 \n\t" " vmovups %%xmm8 , (%4) \n\t" " vmovups %%xmm10 ,16(%4) \n\t" " vmovups %%xmm12 , (%5) \n\t" " vmovups %%xmm14 ,16(%5) \n\t" "3: \n\t" " vzeroupper \n\t" : : "r" (n1), // 0 "a" (i), // 1 "r" (a), // 2 "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 "r" (as), // 6 "r" (bs) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #endif #ifndef COMPLEX static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa, bb; int i, j, k; for (i = 0; i < n; i++) { bb = *(b + i); for (j = 0; j < m; j ++) { aa = *(c + j + i * ldc); aa *= bb; *a = aa; *(c + j + i * ldc) = aa; a ++; for (k = i + 1; k < n; k ++){ *(c + j + k * ldc) -= aa * *(b + k); } } b += n; } } #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa1, aa2; FLOAT bb1, bb2; FLOAT cc1, cc2; int i, j, k; ldc *= 2; for (i = 0; i < n; i++) { bb1 = *(b + i * 2 + 0); bb2 = *(b + i * 2 + 1); for (j = 0; j < m; j ++) { aa1 = *(c + j * 2 + 0 + i * ldc); aa2 = *(c + j * 2 + 1 + i * ldc); #ifndef CONJ cc1 = aa1 * bb1 - aa2 * bb2; cc2 = aa1 * bb2 + aa2 * bb1; #else cc1 = aa1 * bb1 + aa2 * bb2; cc2 = -aa1 * bb2 + aa2 * bb1; #endif *(a + 0) = cc1; *(a + 1) = cc2; 
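/* cc1/cc2 now hold the solved element of this right-hand-side column: the diagonal entry (bb1, bb2) comes from the packed B panel, which the packing routines are expected to store pre-inverted, so the complex multiply above effectively performs the division by b(i,i). The element has just been saved into the packed a buffer; the stores below write it back into C, and the k-loop then eliminates it from every remaining column k > i of this block. */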
*(c + j * 2 + 0 + i * ldc) = cc1; *(c + j * 2 + 1 + i * ldc) = cc2; a += 2; for (k = i + 1; k < n; k ++){ #ifndef CONJ *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); #else *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); #endif } } b += n * 2; } } #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #ifdef COMPLEX FLOAT dummy2, #endif FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ FLOAT *aa, *cc; BLASLONG kk; BLASLONG i, j, jj; #if 0 fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", m, n, k, offset); #endif jj = 0; j = (n >> GEMM_UNROLL_N_SHIFT); kk = -offset; while (j > 0) { aa = a; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { do { #ifndef CONJ ztrsm_RN_solve_opt(kk, aa, b, cc, ldc, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE); solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); #else if (kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); #endif aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } while (i > 0); } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(i, GEMM_UNROLL_N, aa + kk * i * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; } i >>= 1; } } kk += GEMM_UNROLL_N; b += GEMM_UNROLL_N * k * COMPSIZE; c += GEMM_UNROLL_N * ldc * COMPSIZE; j --; jj += GEMM_UNROLL_M; } if (n & (GEMM_UNROLL_N - 1)) { j = (GEMM_UNROLL_N >> 1); while (j > 0) { if (n & j) { aa = a; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); while (i > 0) { if (kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(GEMM_UNROLL_M, j, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { GEMM_KERNEL(i, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, b, cc, ldc); } solve(i, j, aa + kk * i * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; } i >>= 1; } } b += j * k * COMPSIZE; c += j * ldc * COMPSIZE; kk += j; } j >>= 1; } } return 0; } OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_RT_1x4_nehalem.S000066400000000000000000000672061313527062700230320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define KK %rdx #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCHSIZE (8 * 1 + 2) #define PREFETCH prefetcht0 #define ADD1 addpd #define ADD2 addpd PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif subq $-16 * SIZE, A subq $-16 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K movq OLD_LDC, LDC movq OLD_OFFSET, KK salq $ZBASE_SHIFT, LDC movq KK, OFFSET negq KK #ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RT movq N, KK subq OFFSET, KK #endif testq M, M jle .L999 testq $1, N BRANCH jle .L20 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C 
#endif #ifdef LN movq OFFSET, KK addq M, KK #endif #ifdef LT movq OFFSET, KK #endif movq M, I ALIGN_4 .L31: #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm8, %xmm8 prefetcht0 2 * SIZE(CO1) xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_3 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm10 movaps -14 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm11 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -12 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -10 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm10 movaps -10 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm11 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L32 addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 ALIGN_3 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH je .L38 ALIGN_3 .L36: ADD1 %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_3 .L38: #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm8 #else shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm9 #endif #else #ifndef CONJ shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm8 #else shufps $0x04, %xmm0, %xmm0 xorps %xmm0, %xmm9 #endif #endif haddpd %xmm9, %xmm8 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 subpd %xmm8, %xmm9 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm11 subpd %xmm8, %xmm9 #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0x04, %xmm7, %xmm7 #else shufps $0x40, %xmm7, %xmm7 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #if defined(RN) || defined(RT) movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I 
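/* one 1x1 complex tile of the single-column (n & 1) tail is finished; I counts the remaining rows of M, so loop back to .L31 while it is still positive */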
BRANCH jg .L31 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L20: testq $2, N BRANCH jle .L30 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif #ifdef LT movq OFFSET, KK #endif movq M, I ALIGN_4 .L21: #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht0 2 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 2 * SIZE(CO2) xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_3 .L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -14 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -12 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -8 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -6 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -10 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -2 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L22 ALIGN_3 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: ADD1 %xmm1, %xmm8 movaps -16 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -14 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_3 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 ADD1 %xmm3, %xmm10 ADD2 %xmm4, %xmm11 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 #else shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 #endif #else #ifndef CONJ shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 #else 
shufps $0x04, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 #endif #endif haddpd %xmm9, %xmm8 haddpd %xmm11, %xmm10 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm11 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0x04, %xmm7, %xmm7 #else shufps $0x40, %xmm7, %xmm7 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 movddup -14 * SIZE(BO), %xmm2 movddup -13 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm9, %xmm2 mulpd %xmm8, %xmm3 subpd %xmm2, %xmm11 subpd %xmm3, %xmm11 movddup -10 * SIZE(BO), %xmm0 movddup -9 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm10, %xmm11 #endif #ifdef RT movddup -10 * SIZE(BO), %xmm0 movddup -9 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm10, %xmm11 movddup -12 * SIZE(BO), %xmm2 movddup -11 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm11, %xmm2 mulpd %xmm10, %xmm3 subpd %xmm2, %xmm9 subpd %xmm3, %xmm9 movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO2) movhpd %xmm11, 1 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm11, -14 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) movapd %xmm11, -14 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L21 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L30: movq N, J sarq $2, J NOBRANCH jle .L999 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $2 + ZBASE_SHIFT, %rax subq %rax, B leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 2), CO2 #ifndef RT leaq (C, LDC, 4), C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif movq K, %rax salq $ZBASE_SHIFT + 2, %rax leaq (B, %rax), BB #ifdef LT movq OFFSET, KK #endif movq M, I ALIGN_4 .L11: #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #else movq B, BO #endif prefetchnta -16 * SIZE(BB) subq $-8 * SIZE, BB xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 
xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht0 2 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht0 2 * SIZE(CO1, LDC) xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 xorps %xmm12, %xmm12 prefetcht0 2 * SIZE(CO2) xorps %xmm13, %xmm13 prefetcht0 2 * SIZE(CO2, LDC) xorps %xmm14, %xmm14 xorps %xmm15, %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm14 movaps -14 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 ADD1 %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 ADD1 %xmm1, %xmm12 movaps -8 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 movaps -14 * SIZE(AO), %xmm0 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm14 movaps -6 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 ADD1 %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -2 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 ADD1 %xmm1, %xmm12 movaps 0 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 movaps -12 * SIZE(AO), %xmm0 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm14 movaps 2 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 ADD1 %xmm1, %xmm8 movaps 4 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps 6 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -10 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm12 movaps 8 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm14 movaps 10 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 ADD1 %xmm1, %xmm8 movaps 12 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps 14 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -8 * SIZE(AO), %xmm0 subq $-8 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: ADD1 %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm14 movaps -14 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm15 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 ADD1 %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0x4e, %xmm1, %xmm2 mulpd %xmm0, %xmm1 mulpd %xmm0, %xmm2 ADD1 %xmm3, %xmm10 movaps -10 * SIZE(BO), %xmm3 ADD2 %xmm4, %xmm11 pshufd $0x4e, %xmm3, %xmm4 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: #if defined(LN) || 
defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #endif ADD1 %xmm1, %xmm12 ADD2 %xmm2, %xmm13 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 xorps %xmm0, %xmm12 xorps %xmm0, %xmm14 #else shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 xorps %xmm0, %xmm13 xorps %xmm0, %xmm15 #endif #else #ifndef CONJ shufps $0x40, %xmm0, %xmm0 xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 xorps %xmm0, %xmm12 xorps %xmm0, %xmm14 #else shufps $0x04, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 xorps %xmm0, %xmm13 xorps %xmm0, %xmm15 #endif #endif haddpd %xmm9, %xmm8 haddpd %xmm11, %xmm10 haddpd %xmm13, %xmm12 haddpd %xmm15, %xmm14 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 movapd -12 * SIZE(BO), %xmm13 movapd -10 * SIZE(BO), %xmm15 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm11 movapd -12 * SIZE(AO), %xmm13 movapd -10 * SIZE(AO), %xmm15 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0x04, %xmm7, %xmm7 #else shufps $0x40, %xmm7, %xmm7 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm11, %xmm10 pshufd $0x4e, %xmm13, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 mulpd %xmm0, %xmm15 mulpd %xmm1, %xmm14 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 addpd %xmm12, %xmm13 addpd %xmm14, %xmm15 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 movddup -14 * SIZE(BO), %xmm2 movddup -13 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm9, %xmm2 mulpd %xmm8, %xmm3 subpd %xmm2, %xmm11 subpd %xmm3, %xmm11 movddup -12 * SIZE(BO), %xmm0 movddup -11 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm9, %xmm0 mulpd %xmm8, %xmm1 subpd %xmm0, %xmm13 subpd %xmm1, %xmm13 movddup -10 * SIZE(BO), %xmm2 movddup -9 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm9, %xmm2 mulpd %xmm8, %xmm3 subpd %xmm2, %xmm15 subpd %xmm3, %xmm15 movddup -6 * SIZE(BO), %xmm0 movddup -5 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm10, %xmm11 movddup -4 * SIZE(BO), %xmm0 movddup -3 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm11, %xmm0 mulpd %xmm10, %xmm1 subpd %xmm0, %xmm13 subpd %xmm1, %xmm13 movddup -2 * SIZE(BO), %xmm2 movddup -1 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm11, %xmm2 mulpd %xmm10, %xmm3 subpd %xmm2, %xmm15 subpd %xmm3, %xmm15 movddup 4 * SIZE(BO), %xmm0 movddup 5 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm12, %xmm13 movddup 6 * SIZE(BO), %xmm2 movddup 7 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm13, %xmm2 mulpd %xmm12, 
%xmm3 subpd %xmm2, %xmm15 subpd %xmm3, %xmm15 movddup 14 * SIZE(BO), %xmm0 movddup 15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm14 mulpd %xmm0, %xmm15 mulpd %xmm1, %xmm14 addpd %xmm14, %xmm15 #endif #ifdef RT movddup 14 * SIZE(BO), %xmm0 movddup 15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm14 mulpd %xmm0, %xmm15 mulpd %xmm1, %xmm14 addpd %xmm14, %xmm15 movddup 12 * SIZE(BO), %xmm2 movddup 13 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm14 mulpd %xmm15, %xmm2 mulpd %xmm14, %xmm3 subpd %xmm2, %xmm13 subpd %xmm3, %xmm13 movddup 10 * SIZE(BO), %xmm0 movddup 11 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm14 mulpd %xmm15, %xmm0 mulpd %xmm14, %xmm1 subpd %xmm0, %xmm11 subpd %xmm1, %xmm11 movddup 8 * SIZE(BO), %xmm2 movddup 9 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm14 mulpd %xmm15, %xmm2 mulpd %xmm14, %xmm3 subpd %xmm2, %xmm9 subpd %xmm3, %xmm9 movddup 4 * SIZE(BO), %xmm0 movddup 5 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm12, %xmm13 movddup 2 * SIZE(BO), %xmm0 movddup 3 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm13, %xmm0 mulpd %xmm12, %xmm1 subpd %xmm0, %xmm11 subpd %xmm1, %xmm11 movddup 0 * SIZE(BO), %xmm2 movddup 1 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm13, %xmm2 mulpd %xmm12, %xmm3 subpd %xmm2, %xmm9 subpd %xmm3, %xmm9 movddup -6 * SIZE(BO), %xmm0 movddup -5 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm10, %xmm11 movddup -8 * SIZE(BO), %xmm2 movddup -7 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm11, %xmm2 mulpd %xmm10, %xmm3 subpd %xmm2, %xmm9 subpd %xmm3, %xmm9 movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO1, LDC) movhpd %xmm11, 1 * SIZE(CO1, LDC) movsd %xmm13, 0 * SIZE(CO2) movhpd %xmm13, 1 * SIZE(CO2) movsd %xmm15, 0 * SIZE(CO2, LDC) movhpd %xmm15, 1 * SIZE(CO2, LDC) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm11, -14 * SIZE(BO) movapd %xmm13, -12 * SIZE(BO) movapd %xmm15, -10 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) movapd %xmm11, -14 * SIZE(AO) movapd %xmm13, -12 * SIZE(AO) movapd %xmm15, -10 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L11 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif subq $1, J BRANCH jg .L01 ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), 
%xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_RT_2x2_core2.S000066400000000000000000001127021313527062700224220ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define POSINV 0(%rsp) #define J 16(%rsp) #define OFFSET 24(%rsp) #define KK 32(%rsp) #define KKK 40(%rsp) #define AORIG 48(%rsp) #define BORIG 56(%rsp) #define BUFFER 128(%rsp) #define PREFETCH_R (8 * 4 + 0) #define PREFETCH_W (PREFETCH_R) #define PREFETCHSIZE (8 * 17 + 2) #define PREFETCH prefetcht0 #ifndef CONJ #define NN #else #if defined(LN) || defined(LT) #define CN #else #define NC #endif #endif #define ADD1 addpd #define ADD2 addpd PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif movq OLD_LDC, LDC movq OLD_OFFSET, %rax movq %rsp, %r15 # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq %rax, KK movq %rax, OFFSET movq OLD_M, M movq OLD_N, N subq $-16 * SIZE, A subq $-16 * SIZE, B pcmpeqb %xmm15, %xmm15 psllq $63, %xmm15 # Generate mask pxor %xmm2, %xmm2 movlpd %xmm2, 0 + POSINV movlpd %xmm15, 8 + POSINV salq $ZBASE_SHIFT, LDC #ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif testq $1, N jle .L100 .L101: #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L103 ALIGN_4 .L102: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 movddup -14 * SIZE(B), %xmm10 movddup -13 * SIZE(B), %xmm11 movddup -12 * SIZE(B), %xmm12 movddup -11 * SIZE(B), %xmm13 movddup -10 * SIZE(B), %xmm14 movddup -9 * SIZE(B), %xmm15 movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) movapd %xmm10, 4 * SIZE(BO) movapd %xmm11, 6 * SIZE(BO) movapd %xmm12, 8 * SIZE(BO) movapd %xmm13, 10 * SIZE(BO) movapd %xmm14, 12 * SIZE(BO) movapd %xmm15, 14 * SIZE(BO) addq $ 8 * SIZE, B subq $-16 * SIZE, BO decq %rax jne .L102 ALIGN_4 .L103: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L105 ALIGN_4 .L104: 
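/* .L104: tail of the loop that expands the packed B panel into BUFFER - each remaining complex element is split with movddup (real part duplicated, imaginary part duplicated) and stored as two full xmm vectors, so the compute loops below can use plain mulpd without extra shuffles */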
movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) addq $4 * SIZE, BO addq $2 * SIZE, B decq %rax jne .L104 ALIGN_4 .L105: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif movq M, I sarq $1, I # i = (m >> 2) jle .L130 ALIGN_4 .L110: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 prefetcht0 3 * SIZE(CO1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L112 .L111: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -14 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 movapd -12 * SIZE(AO), %xmm0 movapd -10 * SIZE(AO), %xmm1 movapd -12 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -10 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 movapd -8 * SIZE(AO), %xmm0 movapd -6 * SIZE(AO), %xmm1 movapd -8 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -6 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 movapd -4 * SIZE(AO), %xmm0 movapd -2 * SIZE(AO), %xmm1 movapd -4 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -2 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 subq $-16 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jne .L111 ALIGN_4 .L112: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movapd POSINV, %xmm7 andq $3, %rax # if (k & 1) BRANCH jle .L114 .L113: movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -14 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L113 ALIGN_4 .L114: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif SHUFPD_1 %xmm9, %xmm9 SHUFPD_1 %xmm13, %xmm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm7, %xmm9 xorpd %xmm7, %xmm13 #else xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm9, %xmm8 subpd %xmm13, %xmm12 #else addpd %xmm9, %xmm8 addpd %xmm13, %xmm12 #endif #if defined(LN) || defined(LT) 
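/* load the packed right-hand-side values (from B here, from AO in the other branch) and subtract the accumulated products before the small triangular solve */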
movapd -16 * SIZE(B), %xmm9 movapd -14 * SIZE(B), %xmm13 subpd %xmm8, %xmm9 subpd %xmm12, %xmm13 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm13 subpd %xmm8, %xmm9 subpd %xmm12, %xmm13 #endif #ifndef CONJ SHUFPD_1 %xmm7, %xmm7 #endif #ifdef LN movddup -10 * SIZE(AO), %xmm0 movddup -9 * SIZE(AO), %xmm1 movddup -12 * SIZE(AO), %xmm2 movddup -11 * SIZE(AO), %xmm3 movddup -16 * SIZE(AO), %xmm4 movddup -15 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm12, %xmm13 movapd %xmm13, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm9 subpd %xmm12, %xmm9 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 movddup -14 * SIZE(AO), %xmm2 movddup -13 * SIZE(AO), %xmm3 movddup -10 * SIZE(AO), %xmm4 movddup -9 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 movapd %xmm9, %xmm8 pshufd $0x4e, %xmm9, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm13 subpd %xmm12, %xmm13 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm4, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm13 #endif #ifdef RN movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 #endif #ifdef RT movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 #endif #ifdef LN subq $4 * SIZE, CO1 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm13, 2 * SIZE(CO1) movhpd %xmm13, 3 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movapd %xmm13, -14 * SIZE(B) movddup %xmm9, %xmm8 unpckhpd %xmm9, %xmm9 movddup %xmm13, %xmm12 unpckhpd %xmm13, %xmm13 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm12, -12 * SIZE(BO) movapd %xmm13, -10 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) movapd %xmm13, -14 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L110 ALIGN_4 .L130: testq $1, M jle .L199 ALIGN_4 .L140: #ifdef LN movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L142 .L141: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -14 * SIZE(AO), %xmm1 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 movapd 
-12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 movapd -12 * SIZE(AO), %xmm0 movapd -10 * SIZE(AO), %xmm1 movapd -8 * SIZE(BO), %xmm2 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm4 movapd -2 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 subq $ -8 * SIZE, AO subq $-16 * SIZE, BO subq $1, %rax jne .L141 .L142: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movapd POSINV, %xmm7 andq $3, %rax # if (k & 1) BRANCH jle .L144 .L143: movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax jg .L143 ALIGN_4 .L144: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif SHUFPD_1 %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm7, %xmm9 #else xorpd %xmm7, %xmm8 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm9, %xmm8 #else addpd %xmm9, %xmm8 #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm9 subpd %xmm8, %xmm9 #else movapd -16 * SIZE(AO), %xmm9 subpd %xmm8, %xmm9 #endif #ifndef CONJ SHUFPD_1 %xmm7, %xmm7 #endif #ifdef LN movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef RN movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef RT movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movddup %xmm9, %xmm8 unpckhpd %xmm9, %xmm9 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L199: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 1 * COMPSIZE), B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L100: movq N, J sarq $1, J # j = (n >> 2) jle .L999 ALIGN_4 .L01: #ifdef LN movq OFFSET, %rax addq M, 
%rax movq %rax, KK #endif leaq 16 * SIZE + BUFFER, BO #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L03 addq %rax, %rax ALIGN_4 .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 movddup -14 * SIZE(B), %xmm10 movddup -13 * SIZE(B), %xmm11 movddup -12 * SIZE(B), %xmm12 movddup -11 * SIZE(B), %xmm13 movddup -10 * SIZE(B), %xmm14 movddup -9 * SIZE(B), %xmm15 prefetcht0 (PREFETCH_W + 0) * SIZE(BO) movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm10, -12 * SIZE(BO) movapd %xmm11, -10 * SIZE(BO) prefetcht0 (PREFETCH_W + 8) * SIZE(BO) movapd %xmm12, -8 * SIZE(BO) movapd %xmm13, -6 * SIZE(BO) movapd %xmm14, -4 * SIZE(BO) movapd %xmm15, -2 * SIZE(BO) addq $ 8 * SIZE, B subq $-16 * SIZE, BO decq %rax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L05 ALIGN_4 .L04: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 movddup -14 * SIZE(B), %xmm10 movddup -13 * SIZE(B), %xmm11 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm10, -12 * SIZE(BO) movapd %xmm11, -10 * SIZE(BO) addq $ 4 * SIZE, B addq $ 8 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L05: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif movq M, I sarq $1, I # i = (m >> 2) jle .L30 ALIGN_4 .L10: leaq (PREFETCH_R + 0) * SIZE(B), BB #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif prefetcht2 0 * SIZE(BB) #ifdef LN pxor %xmm8, %xmm8 prefetcht1 -3 * SIZE(CO1) pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 prefetcht1 -3 * SIZE(CO2) pxor %xmm11, %xmm11 #else pxor %xmm8, %xmm8 prefetcht1 3 * SIZE(CO1) pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 prefetcht1 3 * SIZE(CO2) pxor %xmm11, %xmm11 #endif pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 pxor %xmm14, %xmm14 pxor %xmm15, %xmm15 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 subq $-8 * SIZE, BB #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_4 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm10 movapd -16 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm14 movapd %xmm2, %xmm3 movapd -14 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm11 movapd -14 * SIZE(BO), %xmm4 ADD2 %xmm5, %xmm15 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 movapd -12 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm12 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm9 movapd -10 * SIZE(BO), %xmm4 ADD2 %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 movapd -12 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm10 movapd -8 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm14 movapd %xmm2, %xmm3 movapd -10 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm4, 
%xmm11 ADD2 %xmm5, %xmm15 movapd -6 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 movapd -4 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 movapd -2 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 movapd -8 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm10 movapd 0 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm14 movapd %xmm2, %xmm3 movapd -6 * SIZE(AO), %xmm1 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm11 movapd 2 * SIZE(BO), %xmm4 ADD2 %xmm5, %xmm15 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 movapd 4 * SIZE(BO), %xmm2 ADD1 %xmm3, %xmm12 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm9 movapd 6 * SIZE(BO), %xmm4 ADD2 %xmm5, %xmm13 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 movapd -4 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm10 ADD1 %xmm3, %xmm14 movapd 8 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 movapd -2 * SIZE(AO), %xmm1 mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm11 movapd 10 * SIZE(BO), %xmm4 ADD2 %xmm5, %xmm15 subq $-32 * SIZE, BO movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 movapd -20 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 mulpd %xmm0, %xmm2 subq $-16 * SIZE, AO mulpd %xmm1, %xmm3 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 movapd -18 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 subq $1, %rax BRANCH BRANCH jg .L12 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movapd POSINV, %xmm7 andq $3, %rax BRANCH BRANCH je .L19 ALIGN_4 .L16: ADD1 %xmm2, %xmm10 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm11 ADD2 %xmm5, %xmm15 movapd -16 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -14 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 movapd -16 * SIZE(AO), %xmm0 mulpd %xmm0, %xmm2 movapd -14 * SIZE(AO), %xmm1 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 ADD1 %xmm2, %xmm8 ADD1 %xmm3, %xmm12 ADD2 %xmm4, %xmm9 ADD2 %xmm5, %xmm13 movapd -12 * SIZE(BO), %xmm2 movapd %xmm2, %xmm3 movapd -10 * SIZE(BO), %xmm4 movapd %xmm4, %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm1, %xmm5 addq $4 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_4 .L19: ADD1 %xmm2, %xmm10 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm11 ADD2 %xmm5, %xmm15 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif SHUFPD_1 %xmm9, %xmm9 SHUFPD_1 %xmm11, %xmm11 SHUFPD_1 %xmm13, %xmm13 SHUFPD_1 %xmm15, %xmm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm7, %xmm9 xorpd %xmm7, %xmm11 xorpd %xmm7, %xmm13 xorpd %xmm7, %xmm15 #else xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm9, %xmm8 subpd %xmm11, %xmm10 subpd %xmm13, %xmm12 subpd %xmm15, %xmm14 #else addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 addpd %xmm13, %xmm12 addpd %xmm15, %xmm14 #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm9 movapd -14 * SIZE(B), %xmm11 movapd -12 * SIZE(B), %xmm13 movapd -10 * SIZE(B), %xmm15 subpd %xmm8, 
%xmm9 subpd %xmm10, %xmm11 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm13 movapd -12 * SIZE(AO), %xmm11 movapd -10 * SIZE(AO), %xmm15 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 #endif #ifndef CONJ SHUFPD_1 %xmm7, %xmm7 #endif #ifdef LN movddup -10 * SIZE(AO), %xmm0 movddup -9 * SIZE(AO), %xmm1 movddup -12 * SIZE(AO), %xmm2 movddup -11 * SIZE(AO), %xmm3 movddup -16 * SIZE(AO), %xmm4 movddup -15 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm13, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 mulpd %xmm0, %xmm15 mulpd %xmm1, %xmm14 addpd %xmm12, %xmm13 addpd %xmm14, %xmm15 movapd %xmm13, %xmm8 movapd %xmm15, %xmm10 pshufd $0x4e, %xmm13, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm12, %xmm9 subpd %xmm14, %xmm11 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 mulpd %xmm4, %xmm11 mulpd %xmm5, %xmm10 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 movddup -14 * SIZE(AO), %xmm2 movddup -13 * SIZE(AO), %xmm3 movddup -10 * SIZE(AO), %xmm4 movddup -9 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 movapd %xmm9, %xmm8 movapd %xmm11, %xmm10 pshufd $0x4e, %xmm9, %xmm12 pshufd $0x4e, %xmm11, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm13 subpd %xmm10, %xmm15 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 pshufd $0x4e, %xmm13, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm4, %xmm13 mulpd %xmm5, %xmm12 mulpd %xmm4, %xmm15 mulpd %xmm5, %xmm14 addpd %xmm12, %xmm13 addpd %xmm14, %xmm15 #endif #ifdef RN movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -10 * SIZE(B), %xmm4 movddup -9 * SIZE(B), %xmm5 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 movapd %xmm9, %xmm8 movapd %xmm13, %xmm10 pshufd $0x4e, %xmm9, %xmm12 pshufd $0x4e, %xmm13, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm11 subpd %xmm10, %xmm15 subpd %xmm12, %xmm11 subpd %xmm14, %xmm15 pshufd $0x4e, %xmm11, %xmm10 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm10 xorpd %xmm7, %xmm14 mulpd %xmm4, %xmm11 mulpd %xmm5, %xmm10 mulpd %xmm4, %xmm15 mulpd %xmm5, %xmm14 addpd %xmm10, %xmm11 addpd %xmm14, %xmm15 #endif #ifdef RT movddup -10 * SIZE(B), %xmm0 movddup -9 * SIZE(B), %xmm1 movddup -12 * SIZE(B), %xmm2 movddup -11 * SIZE(B), %xmm3 movddup -16 * SIZE(B), %xmm4 movddup -15 * SIZE(B), %xmm5 pshufd $0x4e, %xmm11, %xmm10 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm10 xorpd %xmm7, %xmm14 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 mulpd %xmm0, %xmm15 mulpd %xmm1, %xmm14 addpd %xmm10, %xmm11 addpd %xmm14, %xmm15 movapd %xmm11, %xmm8 movapd %xmm15, %xmm10 
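/* RT substitution for the 2x2 tile: xmm11/xmm15 now hold the solved second
   column (they are copied into xmm8/xmm10 just above); the sequence below
   removes their contribution from xmm9/xmm13 through the off-diagonal entry
   of B, then scales the remaining column by the diagonal entry, which the
   trsm packing routine stores pre-inverted so a complex multiply stands in
   for the division. */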
pshufd $0x4e, %xmm11, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm9 subpd %xmm10, %xmm13 subpd %xmm12, %xmm9 subpd %xmm14, %xmm13 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 mulpd %xmm4, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm13, 2 * SIZE(CO1) movhpd %xmm13, 3 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO2) movhpd %xmm11, 1 * SIZE(CO2) movsd %xmm15, 2 * SIZE(CO2) movhpd %xmm15, 3 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movapd %xmm11, -14 * SIZE(B) movapd %xmm13, -12 * SIZE(B) movapd %xmm15, -10 * SIZE(B) movddup %xmm9, %xmm8 unpckhpd %xmm9, %xmm9 movddup %xmm11, %xmm10 unpckhpd %xmm11, %xmm11 movddup %xmm13, %xmm12 unpckhpd %xmm13, %xmm13 movddup %xmm15, %xmm14 unpckhpd %xmm15, %xmm15 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm10, -12 * SIZE(BO) movapd %xmm11, -10 * SIZE(BO) movapd %xmm12, -8 * SIZE(BO) movapd %xmm13, -6 * SIZE(BO) movapd %xmm14, -4 * SIZE(BO) movapd %xmm15, -2 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) movapd %xmm13, -14 * SIZE(AO) movapd %xmm11, -12 * SIZE(AO) movapd %xmm15, -10 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L10 ALIGN_4 .L30: testq $1, M jle .L99 #ifdef LN movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax addq %rax, AO #endif leaq 16 * SIZE + BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L42 .L41: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 movapd -14 * SIZE(AO), %xmm0 movapd -8 * SIZE(BO), %xmm2 movapd -6 * SIZE(BO), %xmm3 movapd -4 * SIZE(BO), %xmm4 movapd -2 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 movapd -12 * SIZE(AO), %xmm0 movapd 0 * SIZE(BO), %xmm2 movapd 2 * SIZE(BO), %xmm3 movapd 4 * SIZE(BO), %xmm4 movapd 6 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 movapd -10 * SIZE(AO), %xmm0 movapd 8 * SIZE(BO), %xmm2 movapd 10 * SIZE(BO), %xmm3 movapd 12 * SIZE(BO), %xmm4 movapd 14 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 
%xmm4, %xmm10 ADD2 %xmm5, %xmm11 subq $ -8 * SIZE, AO subq $-32 * SIZE, BO subq $1, %rax jne .L41 .L42: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movapd POSINV, %xmm7 andq $3, %rax # if (k & 1) BRANCH jle .L44 .L43: movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm2 movapd -14 * SIZE(BO), %xmm3 movapd -12 * SIZE(BO), %xmm4 movapd -10 * SIZE(BO), %xmm5 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm3 mulpd %xmm0, %xmm4 mulpd %xmm0, %xmm5 ADD1 %xmm2, %xmm8 ADD2 %xmm3, %xmm9 ADD1 %xmm4, %xmm10 ADD2 %xmm5, %xmm11 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax jg .L43 ALIGN_4 .L44: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq 16 * SIZE + BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif SHUFPD_1 %xmm9, %xmm9 SHUFPD_1 %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm7, %xmm9 xorpd %xmm7, %xmm11 #else xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm9, %xmm8 subpd %xmm11, %xmm10 #else addpd %xmm9, %xmm8 addpd %xmm11, %xmm10 #endif #if defined(LN) || defined(LT) movapd -16 * SIZE(B), %xmm9 movapd -14 * SIZE(B), %xmm11 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm11 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 #endif #ifndef CONJ SHUFPD_1 %xmm7, %xmm7 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 #endif #ifdef RN movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 movddup -14 * SIZE(B), %xmm2 movddup -13 * SIZE(B), %xmm3 movddup -10 * SIZE(B), %xmm4 movddup -9 * SIZE(B), %xmm5 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 movapd %xmm9, %xmm8 pshufd $0x4e, %xmm9, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm11 subpd %xmm12, %xmm11 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm4, %xmm11 mulpd %xmm5, %xmm10 addpd %xmm10, %xmm11 #endif #ifdef RT movddup -10 * SIZE(B), %xmm0 movddup -9 * SIZE(B), %xmm1 movddup -12 * SIZE(B), %xmm2 movddup -11 * SIZE(B), %xmm3 movddup -16 * SIZE(B), %xmm4 movddup -15 * SIZE(B), %xmm5 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm10, %xmm11 movapd %xmm11, %xmm8 pshufd $0x4e, %xmm11, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm9 subpd %xmm12, %xmm9 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO2) movhpd %xmm11, 1 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(B) movapd %xmm11, -14 * SIZE(B) movddup %xmm9, %xmm8 unpckhpd %xmm9, %xmm9 movddup %xmm11, %xmm10 unpckhpd %xmm11, %xmm11 movapd %xmm8, -16 * SIZE(BO) movapd %xmm9, -14 * SIZE(BO) movapd %xmm10, -12 * SIZE(BO) movapd %xmm11, -10 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) 
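/* RN/RT write-back of the solved 1x2 tile: xmm9 (stored just above) and
   xmm11 (stored just below) go back into the packed A buffer so that the
   trailing update of this panel reuses the solved values instead of the
   original operand. */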
movapd %xmm11, -14 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L99: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 2 * COMPSIZE), B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif decq J # j -- jg .L01 ALIGN_4 .L999: movq %r15, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_RT_2x2_penryn.S000066400000000000000000001046061313527062700227270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define KK %rdx #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 256 #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCH_R (8 * 4 + 0) #define PREFETCHSIZE (8 * 21 + 6) #define PREFETCH prefetcht0 #define ADD1 addpd #define ADD2 addpd PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif movq OLD_M, M movq OLD_N, N movq OLD_K, K movq OLD_LDC, LDC movq OLD_OFFSET, KK subq $-16 * SIZE, A subq $-16 * SIZE, B salq $ZBASE_SHIFT, LDC movq KK, OFFSET negq KK #ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RT movq N, KK subq OFFSET, KK #endif testq $1, N BRANCH jle .L40 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif #ifdef LT movq OFFSET, KK #endif movq M, I sarq $1, I # i = (m >> 2) NOBRANCH jle .L60 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 movaps -16 * SIZE(BO), %xmm2 prefetcht0 3 * SIZE(CO1) pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_4 .L52: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm8 movaps -14 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm12 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm13 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -6 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm8 
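/* .L52 is the unrolled inner product for the 2x1 complex tile of the single
   remaining column: xmm8/xmm9 accumulate the two product permutations of the
   first output element and xmm12/xmm13 those of the second; they are folded
   together with haddpd (and the CONJ sign mask) after the loop. */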
movaps -12 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm12 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm13 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -2 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm8 movaps -10 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm12 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm13 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 2 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm8 movaps -8 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm12 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm13 subq $-16 * SIZE, AO subq $ -8 * SIZE, BO subq $1, %rax BRANCH jg .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 ADD1 %xmm2, %xmm8 movaps -14 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm12 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm13 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_4 .L58: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ pshufd $0x40, %xmm7, %xmm0 shufps $0x04, %xmm7, %xmm7 pxor %xmm0, %xmm8 pxor %xmm0, %xmm12 #else #if defined(LN) || defined(LT) pshufd $0x40, %xmm7, %xmm0 #else pshufd $0x04, %xmm7, %xmm0 #endif shufps $0x40, %xmm7, %xmm7 pxor %xmm0, %xmm9 pxor %xmm0, %xmm13 #endif haddpd %xmm9, %xmm8 haddpd %xmm13, %xmm12 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm13 subpd %xmm8, %xmm9 subpd %xmm12, %xmm13 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm13 subpd %xmm8, %xmm9 subpd %xmm12, %xmm13 #endif #ifdef LN movddup -10 * SIZE(AO), %xmm0 movddup -9 * SIZE(AO), %xmm1 movddup -12 * SIZE(AO), %xmm2 movddup -11 * SIZE(AO), %xmm3 movddup -16 * SIZE(AO), %xmm4 movddup -15 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm12, %xmm13 movapd %xmm13, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm9 subpd %xmm12, %xmm9 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 movddup -14 * SIZE(AO), %xmm2 movddup -13 * SIZE(AO), %xmm3 movddup -10 * SIZE(AO), %xmm4 movddup -9 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 movapd %xmm9, %xmm8 pshufd $0x4e, %xmm9, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm13 subpd %xmm12, %xmm13 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm4, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm13 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 #endif #ifdef RT movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), 
%xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 #endif #ifdef LN subq $4 * SIZE, CO1 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm13, 2 * SIZE(CO1) movhpd %xmm13, 3 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm13, -14 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) movapd %xmm13, -14 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I BRANCH jg .L51 ALIGN_4 .L60: testq $1, M BRANCH jle .L79 ALIGN_4 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 movaps -16 * SIZE(BO), %xmm2 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_4 .L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -14 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm8 ADD2 %xmm7, %xmm9 movaps -14 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -12 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm10 ADD2 %xmm7, %xmm11 movaps -12 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -10 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm8 ADD2 %xmm7, %xmm9 movaps -10 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -8 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm10 ADD2 %xmm7, %xmm11 movaps -8 * SIZE(BO), %xmm2 subq $-8 * SIZE, AO subq $-8 * SIZE, BO subq $1, %rax BRANCH jg .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm7 movaps -14 * SIZE(AO), %xmm0 ADD1 %xmm2, %xmm8 ADD2 %xmm7, %xmm9 movaps -14 * SIZE(BO), %xmm2 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_4 .L68: #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ pshufd $0x40, %xmm7, %xmm0 shufps $0x04, %xmm7, %xmm7 pxor %xmm0, %xmm8 #else #if defined(LN) || defined(LT) pshufd $0x40, %xmm7, %xmm0 #else pshufd $0x04, %xmm7, %xmm0 #endif shufps $0x40, %xmm7, %xmm7 pxor %xmm0, %xmm9 #endif haddpd %xmm9, %xmm8 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm9 subpd %xmm8, %xmm9 #else movapd -16 * SIZE(AO), %xmm9 subpd %xmm8, %xmm9 #endif #ifdef LN movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 
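/* 1x1 tile: LN and LT degenerate to the same operation, a single complex
   multiply of the accumulated result by the pre-inverted diagonal entry of A
   (RN and RT below do the same with the diagonal entry of B). */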
#endif #ifdef RN movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef RT movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L79: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L40: movq N, J sarq $1, J NOBRANCH jle .L999 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif movq K, %rax salq $ZBASE_SHIFT + 1, %rax movq B, BB subq %rax, BB #ifdef LT movq OFFSET, KK #endif movq M, I sarq $1, I NOBRANCH jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #else movq B, BO #endif prefetcht2 -16 * SIZE(BB) subq $-8 * SIZE, BB movaps -16 * SIZE(AO), %xmm0 pxor %xmm3, %xmm3 movaps -14 * SIZE(AO), %xmm1 pxor %xmm4, %xmm4 movaps -16 * SIZE(BO), %xmm2 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 #ifdef LN prefetcht0 -4 * SIZE(CO1) movapd %xmm4, %xmm8 movapd %xmm4, %xmm9 prefetcht0 -4 * SIZE(CO2) #else prefetcht0 3 * SIZE(CO1) movapd %xmm4, %xmm8 movapd %xmm4, %xmm9 prefetcht0 3 * SIZE(CO2) #endif movapd %xmm4, %xmm10 movapd %xmm4, %xmm11 movapd %xmm4, %xmm12 movapd %xmm4, %xmm13 movapd %xmm4, %xmm14 movapd %xmm4, %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax NOBRANCH jle .L15 ALIGN_3 .L12: ADD1 %xmm3, %xmm12 movaps -14 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps -10 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps -8 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 
-6 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps -6 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps -4 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -2 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps -2 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps 0 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 0 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 2 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps 2 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps 4 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 4 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 6 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps 6 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps 8 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 8 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 10 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps 10 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps 12 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps 12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps 14 * SIZE(AO), %xmm1 ADD1 %xmm3, %xmm12 movaps 14 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps 16 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 subq $-32 * SIZE, AO ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -16 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -14 * SIZE(AO), %xmm1 subq $-32 * SIZE, BO subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: ADD1 %xmm3, %xmm12 movaps 
-14 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 mulpd %xmm1, %xmm4 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 movaps %xmm7, %xmm6 mulpd %xmm0, %xmm7 mulpd %xmm1, %xmm6 ADD1 %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 ADD1 %xmm4, %xmm10 movaps %xmm3, %xmm4 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 mulpd %xmm1, %xmm4 ADD2 %xmm7, %xmm9 ADD2 %xmm6, %xmm11 movaps %xmm5, %xmm6 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 mulpd %xmm1, %xmm6 movaps -10 * SIZE(AO), %xmm1 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: #if defined(LN) || defined(RT) movq KK, %rax subq $2, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif ADD1 %xmm3, %xmm12 pcmpeqb %xmm7, %xmm7 ADD1 %xmm4, %xmm14 psllq $63, %xmm7 ADD2 %xmm5, %xmm13 ADD2 %xmm6, %xmm15 #ifndef CONJ pshufd $0x40, %xmm7, %xmm0 shufps $0x04, %xmm7, %xmm7 pxor %xmm0, %xmm8 pxor %xmm0, %xmm10 pxor %xmm0, %xmm12 pxor %xmm0, %xmm14 #else #if defined(LN) || defined(LT) pshufd $0x40, %xmm7, %xmm0 #else pshufd $0x04, %xmm7, %xmm0 #endif shufps $0x40, %xmm7, %xmm7 pxor %xmm0, %xmm9 pxor %xmm0, %xmm11 pxor %xmm0, %xmm13 pxor %xmm0, %xmm15 #endif haddpd %xmm9, %xmm8 haddpd %xmm11, %xmm10 haddpd %xmm13, %xmm12 haddpd %xmm15, %xmm14 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 movapd -12 * SIZE(BO), %xmm13 movapd -10 * SIZE(BO), %xmm15 subpd %xmm8, %xmm9 subpd %xmm12, %xmm11 subpd %xmm10, %xmm13 subpd %xmm14, %xmm15 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm13 movapd -12 * SIZE(AO), %xmm11 movapd -10 * SIZE(AO), %xmm15 subpd %xmm8, %xmm9 subpd %xmm12, %xmm11 subpd %xmm10, %xmm13 subpd %xmm14, %xmm15 #endif #ifdef LN movddup -10 * SIZE(AO), %xmm0 movddup -9 * SIZE(AO), %xmm1 movddup -12 * SIZE(AO), %xmm2 movddup -11 * SIZE(AO), %xmm3 movddup -16 * SIZE(AO), %xmm4 movddup -15 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm13, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 mulpd %xmm0, %xmm15 mulpd %xmm1, %xmm14 addpd %xmm12, %xmm13 addpd %xmm14, %xmm15 movapd %xmm13, %xmm8 movapd %xmm15, %xmm10 pshufd $0x4e, %xmm13, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm9 subpd %xmm10, %xmm11 subpd %xmm12, %xmm9 subpd %xmm14, %xmm11 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 mulpd %xmm4, %xmm11 mulpd %xmm5, %xmm10 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 #endif #ifdef LT movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 movddup -14 * SIZE(AO), %xmm2 movddup -13 * SIZE(AO), %xmm3 movddup -10 * SIZE(AO), %xmm4 movddup -9 * SIZE(AO), %xmm5 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 movapd %xmm9, %xmm8 movapd %xmm11, %xmm10 pshufd $0x4e, %xmm9, %xmm12 pshufd $0x4e, %xmm11, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm13 subpd %xmm10, %xmm15 subpd %xmm12, %xmm13 subpd %xmm14, %xmm15 pshufd $0x4e, %xmm13, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm4, %xmm13 mulpd %xmm5, %xmm12 mulpd %xmm4, %xmm15 
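/* LT: final step of the 2x2 solve; the mulpd/addpd pair around this point
   finishes the complex multiply of the second row (xmm13/xmm15) by the
   pre-inverted diagonal element a22. */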
mulpd %xmm5, %xmm14 addpd %xmm12, %xmm13 addpd %xmm14, %xmm15 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 movddup -14 * SIZE(BO), %xmm2 movddup -13 * SIZE(BO), %xmm3 movddup -10 * SIZE(BO), %xmm4 movddup -9 * SIZE(BO), %xmm5 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm13 mulpd %xmm1, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 movapd %xmm9, %xmm8 movapd %xmm13, %xmm10 pshufd $0x4e, %xmm9, %xmm12 pshufd $0x4e, %xmm13, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm11 subpd %xmm10, %xmm15 subpd %xmm12, %xmm11 subpd %xmm14, %xmm15 pshufd $0x4e, %xmm11, %xmm10 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm10 xorpd %xmm7, %xmm14 mulpd %xmm4, %xmm11 mulpd %xmm5, %xmm10 mulpd %xmm4, %xmm15 mulpd %xmm5, %xmm14 addpd %xmm10, %xmm11 addpd %xmm14, %xmm15 #endif #ifdef RT movddup -10 * SIZE(BO), %xmm0 movddup -9 * SIZE(BO), %xmm1 movddup -12 * SIZE(BO), %xmm2 movddup -11 * SIZE(BO), %xmm3 movddup -16 * SIZE(BO), %xmm4 movddup -15 * SIZE(BO), %xmm5 pshufd $0x4e, %xmm11, %xmm10 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm10 xorpd %xmm7, %xmm14 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 mulpd %xmm0, %xmm15 mulpd %xmm1, %xmm14 addpd %xmm10, %xmm11 addpd %xmm14, %xmm15 movapd %xmm11, %xmm8 movapd %xmm15, %xmm10 pshufd $0x4e, %xmm11, %xmm12 pshufd $0x4e, %xmm15, %xmm14 xorpd %xmm7, %xmm12 xorpd %xmm7, %xmm14 mulpd %xmm2, %xmm8 mulpd %xmm2, %xmm10 mulpd %xmm3, %xmm12 mulpd %xmm3, %xmm14 subpd %xmm8, %xmm9 subpd %xmm10, %xmm13 subpd %xmm12, %xmm9 subpd %xmm14, %xmm13 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm13, %xmm12 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm12 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 mulpd %xmm4, %xmm13 mulpd %xmm5, %xmm12 addpd %xmm8, %xmm9 addpd %xmm12, %xmm13 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm13, 2 * SIZE(CO1) movhpd %xmm13, 3 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO2) movhpd %xmm11, 1 * SIZE(CO2) movsd %xmm15, 2 * SIZE(CO2) movhpd %xmm15, 3 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm11, -14 * SIZE(BO) movapd %xmm13, -12 * SIZE(BO) movapd %xmm15, -10 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) movapd %xmm13, -14 * SIZE(AO) movapd %xmm11, -12 * SIZE(AO) movapd %xmm15, -10 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- BRANCH jg .L11 ALIGN_4 .L20: testq $1, M BRANCH jle .L39 ALIGN_4 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movaps -16 * SIZE(AO), %xmm0 movaps -16 * SIZE(BO), %xmm2 movaps -14 * SIZE(BO), %xmm3 pxor %xmm3, %xmm3 pxor %xmm5, %xmm5 movapd %xmm3, %xmm8 movapd %xmm3, %xmm9 movapd %xmm3, %xmm12 movapd %xmm3, %xmm13 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_4 .L22: ADD1 %xmm3, %xmm12 movaps -14 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 PREFETCH 
(PREFETCHSIZE + 0) * SIZE(AO) mulpd %xmm0, %xmm2 ADD2 %xmm5, %xmm13 mulpd %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 ADD2 %xmm7, %xmm9 mulpd %xmm0, %xmm5 movaps -14 * SIZE(AO), %xmm0 ADD1 %xmm3, %xmm12 movaps -10 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 ADD2 %xmm5, %xmm13 mulpd %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -8 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 ADD2 %xmm7, %xmm9 mulpd %xmm0, %xmm5 movaps -12 * SIZE(AO), %xmm0 ADD1 %xmm3, %xmm12 movaps -6 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 ADD2 %xmm5, %xmm13 mulpd %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -4 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 ADD2 %xmm7, %xmm9 mulpd %xmm0, %xmm5 movaps -10 * SIZE(AO), %xmm0 ADD1 %xmm3, %xmm12 movaps -2 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 ADD2 %xmm5, %xmm13 mulpd %xmm0, %xmm7 subq $ -8 * SIZE, AO ADD1 %xmm2, %xmm8 movaps 0 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 ADD2 %xmm7, %xmm9 mulpd %xmm0, %xmm5 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $1, %rax BRANCH jg .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: ADD1 %xmm3, %xmm12 movaps -14 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 ADD2 %xmm5, %xmm13 mulpd %xmm0, %xmm7 ADD1 %xmm2, %xmm8 movaps -12 * SIZE(BO), %xmm2 pshufd $0x4e, %xmm3, %xmm5 mulpd %xmm0, %xmm3 ADD2 %xmm7, %xmm9 mulpd %xmm0, %xmm5 movaps -14 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_4 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif ADD1 %xmm3, %xmm12 pcmpeqb %xmm7, %xmm7 ADD2 %xmm5, %xmm13 psllq $63, %xmm7 #ifndef CONJ pshufd $0x40, %xmm7, %xmm0 shufps $0x04, %xmm7, %xmm7 pxor %xmm0, %xmm8 pxor %xmm0, %xmm12 #else #if defined(LN) || defined(LT) pshufd $0x40, %xmm7, %xmm0 #else pshufd $0x04, %xmm7, %xmm0 #endif shufps $0x40, %xmm7, %xmm7 pxor %xmm0, %xmm9 pxor %xmm0, %xmm13 #endif haddpd %xmm9, %xmm8 haddpd %xmm13, %xmm12 #if defined(LN) || defined(LT) movapd -16 * SIZE(BO), %xmm9 movapd -14 * SIZE(BO), %xmm11 subpd %xmm8, %xmm9 subpd %xmm12, %xmm11 #else movapd -16 * SIZE(AO), %xmm9 movapd -14 * SIZE(AO), %xmm11 subpd %xmm8, %xmm9 subpd %xmm12, %xmm11 #endif #if defined(LN) || defined(LT) movddup -16 * SIZE(AO), %xmm0 movddup -15 * SIZE(AO), %xmm1 pshufd $0x4e, %xmm9, %xmm8 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm8 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm8, %xmm9 addpd %xmm10, %xmm11 #endif #ifdef RN movddup -16 * SIZE(BO), %xmm0 movddup -15 * SIZE(BO), %xmm1 movddup -14 * SIZE(BO), %xmm2 movddup -13 * SIZE(BO), %xmm3 movddup -10 * SIZE(BO), %xmm4 movddup -9 * SIZE(BO), %xmm5 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm8 addpd %xmm8, %xmm9 movapd %xmm9, %xmm8 pshufd $0x4e, %xmm9, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm11 subpd %xmm12, %xmm11 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm4, %xmm11 mulpd %xmm5, %xmm10 addpd %xmm10, %xmm11 #endif #ifdef RT movddup -10 * SIZE(BO), %xmm0 movddup -9 * SIZE(BO), %xmm1 movddup -12 * SIZE(BO), %xmm2 movddup -11 * SIZE(BO), %xmm3 movddup -16 * 
SIZE(BO), %xmm4 movddup -15 * SIZE(BO), %xmm5 pshufd $0x4e, %xmm11, %xmm10 xorpd %xmm7, %xmm10 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm10 addpd %xmm10, %xmm11 movapd %xmm11, %xmm8 pshufd $0x4e, %xmm11, %xmm12 xorpd %xmm7, %xmm12 mulpd %xmm2, %xmm8 mulpd %xmm3, %xmm12 subpd %xmm8, %xmm9 subpd %xmm12, %xmm9 pshufd $0x4e, %xmm9, %xmm8 xorpd %xmm7, %xmm8 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm8 addpd %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif movsd %xmm9, 0 * SIZE(CO1) movhpd %xmm9, 1 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO2) movhpd %xmm11, 1 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm9, -16 * SIZE(BO) movapd %xmm11, -14 * SIZE(BO) #else movapd %xmm9, -16 * SIZE(AO) movapd %xmm11, -14 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif subq $1, J BRANCH jg .L01 ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S000066400000000000000000001232331313527062700222650ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %rbp #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define POSINV 0(%rsp) #define ALPHA_R 16(%rsp) #define ALPHA_I 32(%rsp) #define OFFSET 40(%rsp) #define KK 48(%rsp) #define KKK 56(%rsp) #define AORIG 64(%rsp) #define BORIG 72(%rsp) #define BUFFER 128(%rsp) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta #define PREFETCHSIZE (8 * 6 + 4) #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHNTA prefetchnta #define PREFETCHSIZE (8 * 6 + 4) #endif #define KERNEL1(xx) \ mulpd %xmm8, %xmm9 ;\ addpd %xmm9, %xmm0 ;\ movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm8, %xmm11 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ addpd %xmm11, %xmm1 ;\ movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm8, %xmm13 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ addpd %xmm13, %xmm2 ;\ movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm8, %xmm3 ;\ movapd 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 #define KERNEL2(xx) \ mulpd %xmm10, %xmm9 ;\ addpd %xmm9, %xmm4 ;\ movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm10, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm10, %xmm13 ;\ mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ addpd %xmm13, %xmm6 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm10, %xmm7 ;\ movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 #define KERNEL3(xx) \ mulpd %xmm12, %xmm15 ;\ addpd %xmm15, %xmm0 ;\ movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm12, %xmm11 ;\ addpd %xmm11, %xmm1 ;\ movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm12, %xmm13 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ addpd %xmm13, %xmm2 ;\ movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm12, %xmm3 ;\ movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 #define KERNEL4(xx) \ mulpd %xmm14, %xmm15 ;\ addpd %xmm15, %xmm4 ;\ movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm14, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm14, %xmm13 ;\ mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ addpd %xmm13, %xmm6 ;\ movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm14, %xmm7 ;\ movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 #define KERNEL5(xx) \ mulpd %xmm8, %xmm9 ;\ addpd %xmm9, %xmm0 ;\ movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm8, %xmm11 ;\ PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ addpd %xmm11, %xmm1 
;\ movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm8, %xmm13 ;\ mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ addpd %xmm13, %xmm2 ;\ movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm8, %xmm3 ;\ movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 #define KERNEL6(xx) \ mulpd %xmm10, %xmm9 ;\ addpd %xmm9, %xmm4 ;\ movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulpd %xmm10, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm10, %xmm13 ;\ mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ addpd %xmm13, %xmm6 ;\ movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm10, %xmm7 ;\ movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 #define KERNEL7(xx) \ mulpd %xmm12, %xmm15 ;\ addpd %xmm15, %xmm0 ;\ movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm12, %xmm11 ;\ addpd %xmm11, %xmm1 ;\ movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm12, %xmm13 ;\ mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ addpd %xmm13, %xmm2 ;\ movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm12, %xmm3 ;\ movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 #define KERNEL8(xx) \ mulpd %xmm14, %xmm15 ;\ addpd %xmm15, %xmm4 ;\ movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulpd %xmm14, %xmm11 ;\ addpd %xmm11, %xmm5 ;\ movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulpd %xmm14, %xmm13 ;\ mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ addpd %xmm13, %xmm6 ;\ movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm14, %xmm7 ;\ movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 #ifndef CONJ #define NN #else #if defined(LN) || defined(LT) #define CN #else #define NC #endif #endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 movaps %xmm3, %xmm0 #else movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 #endif movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq OLD_M, M movq OLD_N, N pcmpeqb %xmm15, %xmm15 psllq $63, %xmm15 # Generate mask pxor %xmm2, %xmm2 movlpd %xmm2, 0 + POSINV movlpd %xmm15, 8 + POSINV movlpd %xmm4, OFFSET movlpd %xmm4, KK salq $ZBASE_SHIFT, LDC #ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif testq $1, N jle .L100 .L101: #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO #ifdef RT movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L103 ALIGN_4 .L102: movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd 2 * SIZE(B), 
%xmm2 movlpd 3 * SIZE(B), %xmm3 movlpd 4 * SIZE(B), %xmm4 movlpd 5 * SIZE(B), %xmm5 movlpd 6 * SIZE(B), %xmm6 movlpd 7 * SIZE(B), %xmm7 movlpd %xmm0, 0 * SIZE(BO) movlpd %xmm0, 1 * SIZE(BO) movlpd %xmm1, 2 * SIZE(BO) movlpd %xmm1, 3 * SIZE(BO) movlpd %xmm2, 4 * SIZE(BO) movlpd %xmm2, 5 * SIZE(BO) movlpd %xmm3, 6 * SIZE(BO) movlpd %xmm3, 7 * SIZE(BO) movlpd %xmm4, 8 * SIZE(BO) movlpd %xmm4, 9 * SIZE(BO) movlpd %xmm5, 10 * SIZE(BO) movlpd %xmm5, 11 * SIZE(BO) movlpd %xmm6, 12 * SIZE(BO) movlpd %xmm6, 13 * SIZE(BO) movlpd %xmm7, 14 * SIZE(BO) movlpd %xmm7, 15 * SIZE(BO) subq $-16 * SIZE, BO addq $ 8 * SIZE, B decq %rax jne .L102 ALIGN_4 .L103: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L105 ALIGN_4 .L104: movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd %xmm0, 0 * SIZE(BO) movlpd %xmm0, 1 * SIZE(BO) movlpd %xmm1, 2 * SIZE(BO) movlpd %xmm1, 3 * SIZE(BO) addq $4 * SIZE, BO addq $2 * SIZE, B decq %rax jne .L104 ALIGN_4 .L105: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif movq M, I sarq $1, I # i = (m >> 2) jle .L130 ALIGN_4 .L110: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 PREFETCHW 4 * SIZE(CO1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L112 .L111: movapd 0 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 2 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm5 movapd 4 * SIZE(AO), %xmm8 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 6 * SIZE(AO), %xmm8 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm8, %xmm5 movapd 8 * SIZE(AO), %xmm8 movapd 8 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 10 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 10 * SIZE(AO), %xmm8 movapd 8 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 mulpd 10 * SIZE(BO), %xmm8 addpd %xmm8, %xmm5 movapd 12 * SIZE(AO), %xmm8 movapd 12 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 14 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 14 * SIZE(AO), %xmm8 movapd 12 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 mulpd 14 * SIZE(BO), %xmm8 addpd %xmm8, %xmm5 addq $16 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L111 ALIGN_4 .L112: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movapd POSINV, %xmm15 andq $3, %rax # if (k & 1) BRANCH jle .L114 .L113: movapd 0 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 2 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm4 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm5 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L113 ALIGN_4 .L114: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif movq AORIG, 
AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm5, %xmm5 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm15, %xmm1 xorpd %xmm15, %xmm5 #else xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm4 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm1, %xmm0 subpd %xmm5, %xmm4 #else addpd %xmm1, %xmm0 addpd %xmm5, %xmm4 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm1 movapd 2 * SIZE(B), %xmm5 subpd %xmm0, %xmm1 subpd %xmm4, %xmm5 #else movapd 0 * SIZE(AO), %xmm1 movapd 2 * SIZE(AO), %xmm5 subpd %xmm0, %xmm1 subpd %xmm4, %xmm5 #endif #ifndef CONJ SHUFPD_1 %xmm15, %xmm15 #endif #ifdef LN movlpd 6 * SIZE(AO), %xmm8 movhpd 6 * SIZE(AO), %xmm8 movlpd 7 * SIZE(AO), %xmm9 movhpd 7 * SIZE(AO), %xmm9 movlpd 4 * SIZE(AO), %xmm10 movhpd 4 * SIZE(AO), %xmm10 movlpd 5 * SIZE(AO), %xmm11 movhpd 5 * SIZE(AO), %xmm11 movlpd 0 * SIZE(AO), %xmm12 movhpd 0 * SIZE(AO), %xmm12 movlpd 1 * SIZE(AO), %xmm13 movhpd 1 * SIZE(AO), %xmm13 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm15, %xmm4 mulpd %xmm8, %xmm5 mulpd %xmm9, %xmm4 addpd %xmm4, %xmm5 movapd %xmm5, %xmm0 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm15, %xmm4 mulpd %xmm10, %xmm0 mulpd %xmm11, %xmm4 subpd %xmm0, %xmm1 subpd %xmm4, %xmm1 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm12, %xmm1 mulpd %xmm13, %xmm0 addpd %xmm0, %xmm1 #endif #ifdef LT movlpd 0 * SIZE(AO), %xmm8 movhpd 0 * SIZE(AO), %xmm8 movlpd 1 * SIZE(AO), %xmm9 movhpd 1 * SIZE(AO), %xmm9 movlpd 2 * SIZE(AO), %xmm10 movhpd 2 * SIZE(AO), %xmm10 movlpd 3 * SIZE(AO), %xmm11 movhpd 3 * SIZE(AO), %xmm11 movlpd 6 * SIZE(AO), %xmm12 movhpd 6 * SIZE(AO), %xmm12 movlpd 7 * SIZE(AO), %xmm13 movhpd 7 * SIZE(AO), %xmm13 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 addpd %xmm0, %xmm1 movapd %xmm1, %xmm0 pshufd $0x4e, %xmm1, %xmm4 xorpd %xmm15, %xmm4 mulpd %xmm10, %xmm0 mulpd %xmm11, %xmm4 subpd %xmm0, %xmm5 subpd %xmm4, %xmm5 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm15, %xmm4 mulpd %xmm12, %xmm5 mulpd %xmm13, %xmm4 addpd %xmm4, %xmm5 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm8 movhpd 0 * SIZE(B), %xmm8 movlpd 1 * SIZE(B), %xmm9 movhpd 1 * SIZE(B), %xmm9 pshufd $0x4e, %xmm1, %xmm0 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm4 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 mulpd %xmm8, %xmm5 mulpd %xmm9, %xmm4 addpd %xmm0, %xmm1 addpd %xmm4, %xmm5 #endif #ifdef RT movlpd 0 * SIZE(B), %xmm8 movhpd 0 * SIZE(B), %xmm8 movlpd 1 * SIZE(B), %xmm9 movhpd 1 * SIZE(B), %xmm9 pshufd $0x4e, %xmm1, %xmm0 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm4 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 mulpd %xmm8, %xmm5 mulpd %xmm9, %xmm4 addpd %xmm0, %xmm1 addpd %xmm4, %xmm5 #endif #ifdef LN subq $4 * SIZE, CO1 #endif movsd %xmm1, 0 * SIZE(CO1) movhpd %xmm1, 1 * SIZE(CO1) movsd %xmm5, 2 * SIZE(CO1) movhpd %xmm5, 3 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movapd %xmm5, 2 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) movlpd %xmm5, 4 * SIZE(BO) movlpd %xmm5, 5 * SIZE(BO) movhpd %xmm5, 6 * SIZE(BO) movhpd %xmm5, 7 * SIZE(BO) #else movapd %xmm1, 0 * SIZE(AO) movapd %xmm5, 2 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq 
$ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L110 ALIGN_4 .L130: testq $1, M jle .L199 ALIGN_4 .L140: #ifdef LN movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L142 .L141: movapd 0 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 2 * SIZE(AO), %xmm8 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm8, %xmm3 movapd 4 * SIZE(AO), %xmm8 movapd 8 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 10 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 movapd 6 * SIZE(AO), %xmm8 movapd 12 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 mulpd 14 * SIZE(BO), %xmm8 addpd %xmm8, %xmm3 addq $8 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L141 .L142: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 movapd POSINV, %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH jle .L144 .L143: movapd 0 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 mulpd 2 * SIZE(BO), %xmm8 addpd %xmm8, %xmm1 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L143 ALIGN_4 .L144: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO #endif SHUFPD_1 %xmm1, %xmm1 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm15, %xmm1 #else xorpd %xmm15, %xmm0 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm1, %xmm0 #else addpd %xmm1, %xmm0 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm1 subpd %xmm0, %xmm1 #else movapd 0 * SIZE(AO), %xmm1 subpd %xmm0, %xmm1 #endif #ifndef CONJ SHUFPD_1 %xmm15, %xmm15 #endif #ifdef LN movlpd 0 * SIZE(AO), %xmm8 movhpd 0 * SIZE(AO), %xmm8 movlpd 1 * SIZE(AO), %xmm9 movhpd 1 * SIZE(AO), %xmm9 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 addpd %xmm0, %xmm1 #endif #ifdef LT movlpd 0 * SIZE(AO), %xmm8 movhpd 0 * SIZE(AO), %xmm8 movlpd 1 * SIZE(AO), %xmm9 movhpd 1 * SIZE(AO), %xmm9 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 addpd %xmm0, %xmm1 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm8 movhpd 0 * SIZE(B), %xmm8 movlpd 1 * SIZE(B), %xmm9 movhpd 1 * SIZE(B), %xmm9 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 addpd %xmm0, %xmm1 #endif #ifdef RT movlpd 0 * SIZE(B), %xmm8 movhpd 0 * SIZE(B), %xmm8 movlpd 1 * SIZE(B), %xmm9 movhpd 1 * SIZE(B), %xmm9 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 addpd %xmm0, 
%xmm1 #endif #ifdef LN subq $2 * SIZE, CO1 #endif movsd %xmm1, 0 * SIZE(CO1) movhpd %xmm1, 1 * SIZE(CO1) #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) #else movapd %xmm1, 0 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L199: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 1 * COMPSIZE), B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L100: movq N, J sarq $1, J # j = (n >> 2) jle .L999 ALIGN_4 .L01: #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif /* Copying to Sub Buffer */ leaq BUFFER, BO #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L03 addq %rax, %rax ALIGN_4 .L02: PREFETCHNTA 56 * SIZE(B) movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd 2 * SIZE(B), %xmm2 movlpd 3 * SIZE(B), %xmm3 movlpd 4 * SIZE(B), %xmm4 movlpd 5 * SIZE(B), %xmm5 movlpd 6 * SIZE(B), %xmm6 movlpd 7 * SIZE(B), %xmm7 movlpd %xmm0, 0 * SIZE(BO) movlpd %xmm0, 1 * SIZE(BO) movlpd %xmm1, 2 * SIZE(BO) movlpd %xmm1, 3 * SIZE(BO) movlpd %xmm2, 4 * SIZE(BO) movlpd %xmm2, 5 * SIZE(BO) movlpd %xmm3, 6 * SIZE(BO) movlpd %xmm3, 7 * SIZE(BO) movlpd %xmm4, 8 * SIZE(BO) movlpd %xmm4, 9 * SIZE(BO) movlpd %xmm5, 10 * SIZE(BO) movlpd %xmm5, 11 * SIZE(BO) movlpd %xmm6, 12 * SIZE(BO) movlpd %xmm6, 13 * SIZE(BO) movlpd %xmm7, 14 * SIZE(BO) movlpd %xmm7, 15 * SIZE(BO) subq $-16 * SIZE, BO addq $ 8 * SIZE, B decq %rax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L05 ALIGN_4 .L04: movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 movlpd 2 * SIZE(B), %xmm2 movlpd 3 * SIZE(B), %xmm3 movlpd %xmm0, 0 * SIZE(BO) movlpd %xmm0, 1 * SIZE(BO) movlpd %xmm1, 2 * SIZE(BO) movlpd %xmm1, 3 * SIZE(BO) movlpd %xmm2, 4 * SIZE(BO) movlpd %xmm2, 5 * SIZE(BO) movlpd %xmm3, 6 * SIZE(BO) movlpd %xmm3, 7 * SIZE(BO) addq $ 4 * SIZE, B addq $ 8 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L05: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif movq M, I sarq $1, I # i = (m >> 2) jle .L30 ALIGN_4 .L10: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movapd 2 * SIZE(AO), %xmm10 pxor %xmm1, %xmm1 movapd 4 * SIZE(AO), %xmm12 pxor %xmm2, %xmm2 movapd 6 * SIZE(AO), %xmm14 pxor %xmm3, %xmm3 movapd 0 * SIZE(BO), %xmm9 pxor %xmm4, %xmm4 movapd 2 * 
SIZE(BO), %xmm11 pxor %xmm5, %xmm5 movapd 4 * SIZE(BO), %xmm13 movapd 8 * SIZE(BO), %xmm15 PREFETCHW 4 * SIZE(CO1) pxor %xmm6, %xmm6 PREFETCHW 4 * SIZE(CO2) pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-8, %rax salq $4, %rax je .L15 .L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) KERNEL4(16 * 0) KERNEL5(16 * 0) KERNEL6(16 * 0) KERNEL7(16 * 0) KERNEL8(16 * 0) KERNEL1(16 * 1) KERNEL2(16 * 1) KERNEL3(16 * 1) KERNEL4(16 * 1) KERNEL5(16 * 1) KERNEL6(16 * 1) KERNEL7(16 * 1) KERNEL8(16 * 1) cmpq $64 * 2, %rax jle .L12 KERNEL1(16 * 2) KERNEL2(16 * 2) KERNEL3(16 * 2) KERNEL4(16 * 2) KERNEL5(16 * 2) KERNEL6(16 * 2) KERNEL7(16 * 2) KERNEL8(16 * 2) KERNEL1(16 * 3) KERNEL2(16 * 3) KERNEL3(16 * 3) KERNEL4(16 * 3) KERNEL5(16 * 3) KERNEL6(16 * 3) KERNEL7(16 * 3) KERNEL8(16 * 3) cmpq $64 * 4, %rax jle .L12 KERNEL1(16 * 4) KERNEL2(16 * 4) KERNEL3(16 * 4) KERNEL4(16 * 4) KERNEL5(16 * 4) KERNEL6(16 * 4) KERNEL7(16 * 4) KERNEL8(16 * 4) KERNEL1(16 * 5) KERNEL2(16 * 5) KERNEL3(16 * 5) KERNEL4(16 * 5) KERNEL5(16 * 5) KERNEL6(16 * 5) KERNEL7(16 * 5) KERNEL8(16 * 5) cmpq $64 * 6, %rax jle .L12 KERNEL1(16 * 6) KERNEL2(16 * 6) KERNEL3(16 * 6) KERNEL4(16 * 6) KERNEL5(16 * 6) KERNEL6(16 * 6) KERNEL7(16 * 6) KERNEL8(16 * 6) KERNEL1(16 * 7) KERNEL2(16 * 7) KERNEL3(16 * 7) KERNEL4(16 * 7) KERNEL5(16 * 7) KERNEL6(16 * 7) KERNEL7(16 * 7) KERNEL8(16 * 7) addq $16 * 8 * SIZE, AO addq $32 * 8 * SIZE, BO subq $64 * 8, %rax jg .L1X .L12: leaq (AO, %rax, 2), AO # * 16 leaq (BO, %rax, 4), BO # * 64 ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movapd POSINV, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L19 ALIGN_4 .L16: mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 mulpd 6 * SIZE(BO), %xmm8 addpd %xmm9, %xmm2 movapd 0 * SIZE(BO), %xmm9 addpd %xmm8, %xmm3 movapd 4 * SIZE(AO), %xmm8 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm4 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 addpd %xmm9, %xmm5 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 mulpd 6 * SIZE(BO), %xmm10 addpd %xmm9, %xmm6 movapd 8 * SIZE(BO), %xmm9 addpd %xmm10, %xmm7 movapd 6 * SIZE(AO), %xmm10 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L16 ALIGN_4 .L19: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm3, %xmm3 SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm15, %xmm1 xorpd %xmm15, %xmm3 xorpd %xmm15, %xmm5 xorpd %xmm15, %xmm7 #else xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm2 xorpd %xmm15, %xmm4 xorpd %xmm15, %xmm6 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm1, %xmm0 subpd %xmm3, %xmm2 subpd %xmm5, %xmm4 subpd %xmm7, %xmm6 #else addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 addpd %xmm5, %xmm4 addpd %xmm7, %xmm6 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm1 movapd 2 * SIZE(B), %xmm3 movapd 4 * SIZE(B), %xmm5 movapd 6 * SIZE(B), %xmm7 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #else movapd 0 * SIZE(AO), %xmm1 movapd 2 * SIZE(AO), %xmm5 
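/* Descriptive note (not in the original source): this is the solve phase for the current 2x2 tile. The values being solved are reloaded from B (LN/LT) or from the packed A tile (RN/RT), and the products accumulated in xmm0/xmm2/xmm4/xmm6 are subtracted from them before the per-case triangular factor is applied in the #ifdef LN/LT/RN/RT sections that follow. */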
movapd 4 * SIZE(AO), %xmm3 movapd 6 * SIZE(AO), %xmm7 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 #endif #ifndef CONJ SHUFPD_1 %xmm15, %xmm15 #endif #ifdef LN movlpd 6 * SIZE(AO), %xmm8 movhpd 6 * SIZE(AO), %xmm8 movlpd 7 * SIZE(AO), %xmm9 movhpd 7 * SIZE(AO), %xmm9 movlpd 4 * SIZE(AO), %xmm10 movhpd 4 * SIZE(AO), %xmm10 movlpd 5 * SIZE(AO), %xmm11 movhpd 5 * SIZE(AO), %xmm11 movlpd 0 * SIZE(AO), %xmm12 movhpd 0 * SIZE(AO), %xmm12 movlpd 1 * SIZE(AO), %xmm13 movhpd 1 * SIZE(AO), %xmm13 pshufd $0x4e, %xmm5, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm15, %xmm4 xorpd %xmm15, %xmm6 mulpd %xmm8, %xmm5 mulpd %xmm9, %xmm4 mulpd %xmm8, %xmm7 mulpd %xmm9, %xmm6 addpd %xmm4, %xmm5 addpd %xmm6, %xmm7 movapd %xmm5, %xmm0 movapd %xmm7, %xmm2 pshufd $0x4e, %xmm5, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm15, %xmm4 xorpd %xmm15, %xmm6 mulpd %xmm10, %xmm0 mulpd %xmm10, %xmm2 mulpd %xmm11, %xmm4 mulpd %xmm11, %xmm6 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 subpd %xmm4, %xmm1 subpd %xmm6, %xmm3 pshufd $0x4e, %xmm1, %xmm0 pshufd $0x4e, %xmm3, %xmm2 xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm2 mulpd %xmm12, %xmm1 mulpd %xmm13, %xmm0 mulpd %xmm12, %xmm3 mulpd %xmm13, %xmm2 addpd %xmm0, %xmm1 addpd %xmm2, %xmm3 #endif #ifdef LT movlpd 0 * SIZE(AO), %xmm8 movhpd 0 * SIZE(AO), %xmm8 movlpd 1 * SIZE(AO), %xmm9 movhpd 1 * SIZE(AO), %xmm9 movlpd 2 * SIZE(AO), %xmm10 movhpd 2 * SIZE(AO), %xmm10 movlpd 3 * SIZE(AO), %xmm11 movhpd 3 * SIZE(AO), %xmm11 movlpd 6 * SIZE(AO), %xmm12 movhpd 6 * SIZE(AO), %xmm12 movlpd 7 * SIZE(AO), %xmm13 movhpd 7 * SIZE(AO), %xmm13 pshufd $0x4e, %xmm1, %xmm0 pshufd $0x4e, %xmm3, %xmm2 xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm2 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 mulpd %xmm8, %xmm3 mulpd %xmm9, %xmm2 addpd %xmm0, %xmm1 addpd %xmm2, %xmm3 movapd %xmm1, %xmm0 movapd %xmm3, %xmm2 pshufd $0x4e, %xmm1, %xmm4 pshufd $0x4e, %xmm3, %xmm6 xorpd %xmm15, %xmm4 xorpd %xmm15, %xmm6 mulpd %xmm10, %xmm0 mulpd %xmm10, %xmm2 mulpd %xmm11, %xmm4 mulpd %xmm11, %xmm6 subpd %xmm0, %xmm5 subpd %xmm2, %xmm7 subpd %xmm4, %xmm5 subpd %xmm6, %xmm7 pshufd $0x4e, %xmm5, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm15, %xmm4 xorpd %xmm15, %xmm6 mulpd %xmm12, %xmm5 mulpd %xmm13, %xmm4 mulpd %xmm12, %xmm7 mulpd %xmm13, %xmm6 addpd %xmm4, %xmm5 addpd %xmm6, %xmm7 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm8 movhpd 0 * SIZE(B), %xmm8 movlpd 1 * SIZE(B), %xmm9 movhpd 1 * SIZE(B), %xmm9 movlpd 2 * SIZE(B), %xmm10 movhpd 2 * SIZE(B), %xmm10 movlpd 3 * SIZE(B), %xmm11 movhpd 3 * SIZE(B), %xmm11 movlpd 6 * SIZE(B), %xmm12 movhpd 6 * SIZE(B), %xmm12 movlpd 7 * SIZE(B), %xmm13 movhpd 7 * SIZE(B), %xmm13 pshufd $0x4e, %xmm1, %xmm0 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm4 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 mulpd %xmm8, %xmm5 mulpd %xmm9, %xmm4 addpd %xmm0, %xmm1 addpd %xmm4, %xmm5 movapd %xmm1, %xmm0 movapd %xmm5, %xmm2 pshufd $0x4e, %xmm1, %xmm4 pshufd $0x4e, %xmm5, %xmm6 xorpd %xmm15, %xmm4 xorpd %xmm15, %xmm6 mulpd %xmm10, %xmm0 mulpd %xmm10, %xmm2 mulpd %xmm11, %xmm4 mulpd %xmm11, %xmm6 subpd %xmm0, %xmm3 subpd %xmm2, %xmm7 subpd %xmm4, %xmm3 subpd %xmm6, %xmm7 pshufd $0x4e, %xmm3, %xmm2 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm15, %xmm2 xorpd %xmm15, %xmm6 mulpd %xmm12, %xmm3 mulpd %xmm13, %xmm2 mulpd %xmm12, %xmm7 mulpd %xmm13, %xmm6 addpd %xmm2, %xmm3 addpd %xmm6, %xmm7 #endif #ifdef RT movlpd 6 * SIZE(B), %xmm8 movhpd 6 * SIZE(B), %xmm8 movlpd 7 * SIZE(B), %xmm9 movhpd 7 * SIZE(B), %xmm9 movlpd 4 * SIZE(B), %xmm10 movhpd 4 * SIZE(B), %xmm10 movlpd 5 * SIZE(B), %xmm11 movhpd 5 * 
SIZE(B), %xmm11 movlpd 0 * SIZE(B), %xmm12 movhpd 0 * SIZE(B), %xmm12 movlpd 1 * SIZE(B), %xmm13 movhpd 1 * SIZE(B), %xmm13 pshufd $0x4e, %xmm3, %xmm2 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm15, %xmm2 xorpd %xmm15, %xmm6 mulpd %xmm8, %xmm3 mulpd %xmm9, %xmm2 mulpd %xmm8, %xmm7 mulpd %xmm9, %xmm6 addpd %xmm2, %xmm3 addpd %xmm6, %xmm7 movapd %xmm3, %xmm0 movapd %xmm7, %xmm2 pshufd $0x4e, %xmm3, %xmm4 pshufd $0x4e, %xmm7, %xmm6 xorpd %xmm15, %xmm4 xorpd %xmm15, %xmm6 mulpd %xmm10, %xmm0 mulpd %xmm10, %xmm2 mulpd %xmm11, %xmm4 mulpd %xmm11, %xmm6 subpd %xmm0, %xmm1 subpd %xmm2, %xmm5 subpd %xmm4, %xmm1 subpd %xmm6, %xmm5 pshufd $0x4e, %xmm1, %xmm0 pshufd $0x4e, %xmm5, %xmm4 xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm4 mulpd %xmm12, %xmm1 mulpd %xmm13, %xmm0 mulpd %xmm12, %xmm5 mulpd %xmm13, %xmm4 addpd %xmm0, %xmm1 addpd %xmm4, %xmm5 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif movsd %xmm1, 0 * SIZE(CO1) movhpd %xmm1, 1 * SIZE(CO1) movsd %xmm5, 2 * SIZE(CO1) movhpd %xmm5, 3 * SIZE(CO1) movsd %xmm3, 0 * SIZE(CO2) movhpd %xmm3, 1 * SIZE(CO2) movsd %xmm7, 2 * SIZE(CO2) movhpd %xmm7, 3 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movapd %xmm5, 4 * SIZE(B) movapd %xmm7, 6 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) movlpd %xmm3, 4 * SIZE(BO) movlpd %xmm3, 5 * SIZE(BO) movhpd %xmm3, 6 * SIZE(BO) movhpd %xmm3, 7 * SIZE(BO) movlpd %xmm5, 8 * SIZE(BO) movlpd %xmm5, 9 * SIZE(BO) movhpd %xmm5, 10 * SIZE(BO) movhpd %xmm5, 11 * SIZE(BO) movlpd %xmm7, 12 * SIZE(BO) movlpd %xmm7, 13 * SIZE(BO) movhpd %xmm7, 14 * SIZE(BO) movhpd %xmm7, 15 * SIZE(BO) #else movapd %xmm1, 0 * SIZE(AO) movapd %xmm5, 2 * SIZE(AO) movapd %xmm3, 4 * SIZE(AO) movapd %xmm7, 6 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L10 ALIGN_4 .L30: testq $1, M jle .L99 #ifdef LN movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax addq %rax, AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO #endif pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax je .L42 .L41: movapd 0 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movapd 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm3 movapd 2 * SIZE(AO), %xmm8 movapd 8 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 10 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 12 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movapd 14 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm3 movapd 4 * SIZE(AO), %xmm8 movapd 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 18 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 20 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movapd 22 * SIZE(BO), %xmm9 mulpd 
%xmm8, %xmm9 addpd %xmm9, %xmm3 movapd 6 * SIZE(AO), %xmm8 movapd 24 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 26 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 28 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movapd 30 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm3 addq $ 8 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L41 .L42: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movapd POSINV, %xmm15 andq $3, %rax # if (k & 1) BRANCH jle .L44 .L43: movapd 0 * SIZE(AO), %xmm8 movapd 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm0 movapd 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm1 movapd 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm2 movapd 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 addpd %xmm9, %xmm3 addq $2 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L43 ALIGN_4 .L44: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO #endif SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) xorpd %xmm15, %xmm1 xorpd %xmm15, %xmm3 #else xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm2 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) subpd %xmm1, %xmm0 subpd %xmm3, %xmm2 #else addpd %xmm1, %xmm0 addpd %xmm3, %xmm2 #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(B), %xmm1 movapd 2 * SIZE(B), %xmm3 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 #else movapd 0 * SIZE(AO), %xmm1 movapd 2 * SIZE(AO), %xmm3 subpd %xmm0, %xmm1 subpd %xmm2, %xmm3 #endif #ifndef CONJ SHUFPD_1 %xmm15, %xmm15 #endif #if defined(LN) || defined(LT) movlpd 0 * SIZE(AO), %xmm8 movhpd 0 * SIZE(AO), %xmm8 movlpd 1 * SIZE(AO), %xmm9 movhpd 1 * SIZE(AO), %xmm9 pshufd $0x4e, %xmm1, %xmm0 pshufd $0x4e, %xmm3, %xmm2 xorpd %xmm15, %xmm0 xorpd %xmm15, %xmm2 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 mulpd %xmm8, %xmm3 mulpd %xmm9, %xmm2 addpd %xmm0, %xmm1 addpd %xmm2, %xmm3 #endif #ifdef RN movlpd 0 * SIZE(B), %xmm8 movhpd 0 * SIZE(B), %xmm8 movlpd 1 * SIZE(B), %xmm9 movhpd 1 * SIZE(B), %xmm9 movlpd 2 * SIZE(B), %xmm10 movhpd 2 * SIZE(B), %xmm10 movlpd 3 * SIZE(B), %xmm11 movhpd 3 * SIZE(B), %xmm11 movlpd 6 * SIZE(B), %xmm12 movhpd 6 * SIZE(B), %xmm12 movlpd 7 * SIZE(B), %xmm13 movhpd 7 * SIZE(B), %xmm13 pshufd $0x4e, %xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm8, %xmm1 mulpd %xmm9, %xmm0 addpd %xmm0, %xmm1 movapd %xmm1, %xmm0 pshufd $0x4e, %xmm1, %xmm4 xorpd %xmm15, %xmm4 mulpd %xmm10, %xmm0 mulpd %xmm11, %xmm4 subpd %xmm0, %xmm3 subpd %xmm4, %xmm3 pshufd $0x4e, %xmm3, %xmm2 xorpd %xmm15, %xmm2 mulpd %xmm12, %xmm3 mulpd %xmm13, %xmm2 addpd %xmm2, %xmm3 #endif #ifdef RT movlpd 6 * SIZE(B), %xmm8 movhpd 6 * SIZE(B), %xmm8 movlpd 7 * SIZE(B), %xmm9 movhpd 7 * SIZE(B), %xmm9 movlpd 4 * SIZE(B), %xmm10 movhpd 4 * SIZE(B), %xmm10 movlpd 5 * SIZE(B), %xmm11 movhpd 5 * SIZE(B), %xmm11 movlpd 0 * SIZE(B), %xmm12 movhpd 0 * SIZE(B), %xmm12 movlpd 1 * SIZE(B), %xmm13 movhpd 1 * SIZE(B), %xmm13 pshufd $0x4e, %xmm3, %xmm2 xorpd %xmm15, %xmm2 mulpd %xmm8, %xmm3 mulpd %xmm9, %xmm2 addpd %xmm2, %xmm3 movapd %xmm3, %xmm0 pshufd $0x4e, %xmm3, %xmm4 xorpd %xmm15, %xmm4 mulpd %xmm10, %xmm0 mulpd %xmm11, %xmm4 subpd %xmm0, %xmm1 subpd %xmm4, %xmm1 pshufd $0x4e, 
%xmm1, %xmm0 xorpd %xmm15, %xmm0 mulpd %xmm12, %xmm1 mulpd %xmm13, %xmm0 addpd %xmm0, %xmm1 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif movsd %xmm1, 0 * SIZE(CO1) movhpd %xmm1, 1 * SIZE(CO1) movsd %xmm3, 0 * SIZE(CO2) movhpd %xmm3, 1 * SIZE(CO2) #if defined(LN) || defined(LT) movapd %xmm1, 0 * SIZE(B) movapd %xmm3, 2 * SIZE(B) movlpd %xmm1, 0 * SIZE(BO) movlpd %xmm1, 1 * SIZE(BO) movhpd %xmm1, 2 * SIZE(BO) movhpd %xmm1, 3 * SIZE(BO) movlpd %xmm3, 4 * SIZE(BO) movlpd %xmm3, 5 * SIZE(BO) movhpd %xmm3, 6 * SIZE(BO) movhpd %xmm3, 7 * SIZE(BO) #else movapd %xmm1, 0 * SIZE(AO) movapd %xmm3, 2 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $0 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L99: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 2 * COMPSIZE), B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif decq J # j -- jg .L01 ALIGN_3 .L999: movq %rbx, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_RT_2x2_sse3.S000066400000000000000000001277471313527062700223040ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define M %rdi #define N %rsi #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %r13 #define BO %r14 #define CO1 %r15 #define CO2 %rbx #define KK %rbp #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define KKK 56(%rsp) #define AORIG 64(%rsp) #else #define STACKSIZE 256 #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define KKK 232(%rsp) #define AORIG 240(%rsp) #endif #define PREFETCH prefetcht1 #define PREFETCHSIZE (16 * 12 + 3) #define PREFETCH_R (4 * 4 + 0) #ifndef CONJ #define ADD1 addpd #define ADD2 addpd #else #define ADD1 subpd #define ADD2 addpd #endif #define KERNEL1(address) \ mulpd %xmm8, %xmm9;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ ADD1 %xmm9, %xmm0;\ movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD2 %xmm9, %xmm1;\ movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm2;\ movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ ADD2 %xmm9, %xmm3;\ movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL2(address) \ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm4;\ movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD2 %xmm9, %xmm5;\ movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm6;\ movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ ADD2 %xmm9, %xmm7;\ movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL3(address) \ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm0;\ movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD2 %xmm9, %xmm1;\ movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm2;\ movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ ADD2 %xmm9, %xmm3;\ movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL4(address) \ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm4;\ movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD2 %xmm9, %xmm5;\ movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ ADD1 %xmm9, %xmm6;\ movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ mulpd %xmm8, %xmm9;\ movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ ADD2 %xmm9, %xmm7;\ movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 #define KERNEL5(address) \ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm0;\ movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD2 %xmm11, %xmm1;\ movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm2;\ movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ ADD2 %xmm11, 
%xmm3;\ movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL6(address) \ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm4;\ movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD2 %xmm11, %xmm5;\ movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm6;\ movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ ADD2 %xmm11, %xmm7;\ movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL7(address) \ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm0;\ movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD2 %xmm11, %xmm1;\ movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm2;\ movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ ADD2 %xmm11, %xmm3;\ movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL8(address) \ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm4;\ movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD2 %xmm11, %xmm5;\ movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ ADD1 %xmm11, %xmm6;\ movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ mulpd %xmm10, %xmm11;\ movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ ADD2 %xmm11, %xmm7;\ movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 #define KERNEL9(address) \ mulpd %xmm12, %xmm13;\ PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ ADD1 %xmm13, %xmm0;\ movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD2 %xmm13, %xmm1;\ movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm2;\ movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ ADD2 %xmm13, %xmm3;\ movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL10(address) \ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm4;\ movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD2 %xmm13, %xmm5;\ movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm6;\ movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ ADD2 %xmm13, %xmm7;\ movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL11(address) \ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm0;\ movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD2 %xmm13, %xmm1;\ movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm2;\ movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ ADD2 %xmm13, %xmm3;\ movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL12(address) \ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm4;\ movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD2 %xmm13, %xmm5;\ movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ ADD1 %xmm13, %xmm6;\ movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ mulpd %xmm12, %xmm13;\ movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ ADD2 %xmm13, %xmm7;\ movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 #define KERNEL13(address) \ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm0;\ movddup 25 * SIZE + (address) 
* 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD2 %xmm15, %xmm1;\ movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm2;\ movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ ADD2 %xmm15, %xmm3;\ movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL14(address) \ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm4;\ movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD2 %xmm15, %xmm5;\ movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm6;\ movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ ADD2 %xmm15, %xmm7;\ movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL15(address) \ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm0;\ movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD2 %xmm15, %xmm1;\ movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm2;\ movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ ADD2 %xmm15, %xmm3;\ movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 #define KERNEL16(address) \ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm4;\ movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD2 %xmm15, %xmm5;\ movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ ADD1 %xmm15, %xmm6;\ movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ mulpd %xmm14, %xmm15;\ movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ ADD2 %xmm15, %xmm7;\ movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, M movq ARG2, N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #endif movq OLD_LDC, LDC movq OLD_OFFSET, KK movq KK, OFFSET salq $ZBASE_SHIFT, LDC #ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, KK subq OFFSET, KK #endif testq $1, N jle .L100 .L101: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 # coffset1 = c #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif #ifdef LT movq OFFSET, KK #endif movq M, I sarq $1, I # i = (m >> 2) jle .L130 ALIGN_4 .L110: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm4, %xmm4 movddup 8 * SIZE(BO), %xmm11 pxor %xmm5, %xmm5 prefetchnta 4 * SIZE(CO1) #if 
defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L112 .L111: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm1 movddup 0 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm4 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm5 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm0 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm4 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 16 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm5 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 10 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm1 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm4 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 12 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm5 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm0 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 14 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm4 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 40 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm5 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) ADD1 %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 18 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm1 movddup 8 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm4 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 20 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm5 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm0 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 22 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm4 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 24 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm5 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 26 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm1 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm4 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 28 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm5 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm0 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 30 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm4 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 32 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm5 movddup 24 * SIZE(BO), %xmm11 addq $32 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L111 ALIGN_4 .L112: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH jle .L114 .L113: mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm10 ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 0 * SIZE(BO), %xmm11 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 movapd 4 * SIZE(AO), %xmm8 ADD1 %xmm11, %xmm4 movddup 1 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD2 %xmm11, %xmm5 addq $4 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L113 ALIGN_4 .L114: SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm5, %xmm5 #ifndef CONJ addsubpd %xmm1, %xmm0 addsubpd %xmm5, %xmm4 #else addsubpd %xmm0, %xmm1 
addsubpd %xmm4, %xmm5 #endif #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm8 movapd 2 * SIZE(BO), %xmm9 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm9 #endif #if (defined(LN) || defined(LT)) && !defined(CONJ) subpd %xmm0, %xmm8 subpd %xmm4, %xmm9 #elif (defined(LN) || defined(LT)) && defined(CONJ) subpd %xmm1, %xmm8 subpd %xmm5, %xmm9 #elif (defined(RN) || defined(RT)) && !defined(CONJ) subpd %xmm0, %xmm8 subpd %xmm4, %xmm9 #else addsubpd %xmm1, %xmm8 addsubpd %xmm5, %xmm9 #endif #ifdef CONJ pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #endif #ifdef LN movddup 6 * SIZE(AO), %xmm0 movddup 7 * SIZE(AO), %xmm1 movddup 4 * SIZE(AO), %xmm2 movddup 5 * SIZE(AO), %xmm3 movddup 0 * SIZE(AO), %xmm4 movddup 1 * SIZE(AO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 xorpd %xmm7, %xmm5 #endif movapd %xmm9, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm12 addsubpd %xmm12, %xmm9 movapd %xmm9, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm2, %xmm12 mulpd %xmm3, %xmm13 addsubpd %xmm13, %xmm12 subpd %xmm12, %xmm8 movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm4, %xmm8 mulpd %xmm5, %xmm12 addsubpd %xmm12, %xmm8 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 movddup 1 * SIZE(AO), %xmm1 movddup 2 * SIZE(AO), %xmm2 movddup 3 * SIZE(AO), %xmm3 movddup 6 * SIZE(AO), %xmm4 movddup 7 * SIZE(AO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 xorpd %xmm7, %xmm5 #endif movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm0, %xmm8 mulpd %xmm1, %xmm12 addsubpd %xmm12, %xmm8 movapd %xmm8, %xmm12 movapd %xmm8, %xmm13 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm2, %xmm12 mulpd %xmm3, %xmm13 addsubpd %xmm13, %xmm12 subpd %xmm12, %xmm9 movapd %xmm9, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm12 addsubpd %xmm12, %xmm9 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 movddup 1 * SIZE(BO), %xmm1 #ifdef CONJ xorpd %xmm7, %xmm1 #endif movapd %xmm8, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm12 mulpd %xmm1, %xmm13 addsubpd %xmm12, %xmm8 addsubpd %xmm13, %xmm9 #endif #ifdef RT movddup 0 * SIZE(BO), %xmm4 movddup 1 * SIZE(BO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm5 #endif movapd %xmm8, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm4, %xmm8 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm12 mulpd %xmm5, %xmm13 addsubpd %xmm12, %xmm8 addsubpd %xmm13, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm9, 2 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L110 ALIGN_4 .L130: testq $1, M jle .L149 ALIGN_4 #ifdef LN movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, AORIG 
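/* Descriptive note (not in the original source): odd-M tail for the single remaining column (N & 1) - one row of A against one column of B. Under LN the packed-A origin is first stepped back by K complex elements (one row of the packed A panel) by the instructions above. */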
#endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L142 .L141: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 6 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 16 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 10 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 12 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 24 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm3 movddup 24 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $16 * SIZE, BO decq %rax jne .L141 .L142: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH jle .L144 .L143: mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $2 * SIZE, BO # boffset1 += 8 decq %rax jg .L143 ALIGN_4 .L144: addpd %xmm2, %xmm0 addpd %xmm3, %xmm1 SHUFPD_1 %xmm1, %xmm1 #ifndef CONJ addsubpd %xmm1, %xmm0 #else addsubpd %xmm0, %xmm1 #endif #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm8 #else movapd 0 * SIZE(AO), %xmm8 #endif #if (defined(LN) || defined(LT)) && !defined(CONJ) subpd %xmm0, %xmm8 #elif (defined(LN) || defined(LT)) && defined(CONJ) subpd %xmm1, %xmm8 #elif (defined(RN) || defined(RT)) && !defined(CONJ) subpd %xmm0, %xmm8 #else addsubpd %xmm1, %xmm8 #endif #ifdef CONJ pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #endif #ifdef LN movddup 0 * SIZE(AO), %xmm4 movddup 1 * SIZE(AO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm5 #endif movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm4, %xmm8 mulpd %xmm5, %xmm12 addsubpd %xmm12, %xmm8 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 movddup 1 * SIZE(AO), %xmm1 #ifdef CONJ xorpd %xmm7, %xmm1 #endif movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm0, %xmm8 mulpd %xmm1, %xmm12 addsubpd %xmm12, %xmm8 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 movddup 1 * SIZE(BO), %xmm1 #ifdef CONJ xorpd %xmm7, %xmm1 #endif movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm0, 
%xmm8 mulpd %xmm1, %xmm12 addsubpd %xmm12, %xmm8 #endif #ifdef RT movddup 0 * SIZE(BO), %xmm4 movddup 1 * SIZE(BO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm5 #endif movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm4, %xmm8 mulpd %xmm5, %xmm12 addsubpd %xmm12, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) #endif #if defined(LN) || defined(LT) movapd %xmm8, 0 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L149: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_3 .L100: movq N, J sarq $1, J # j = (n >> 2) jle .L999 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif #ifdef LT movq OFFSET, KK #endif movq M, I sarq $1, I # i = (m >> 2) jle .L30 ALIGN_4 .L10: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 movapd 16 * SIZE(AO), %xmm12 movddup 16 * SIZE(BO), %xmm13 movapd 24 * SIZE(AO), %xmm14 movddup 24 * SIZE(BO), %xmm15 prefetchnta 4 * SIZE(CO1) pxor %xmm4, %xmm4 prefetchnta 4 * SIZE(CO2) pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-8, %rax salq $4, %rax je .L12 .L1X: KERNEL1 (16 * 0) KERNEL2 (16 * 0) KERNEL3 (16 * 0) KERNEL4 (16 * 0) KERNEL5 (16 * 0) KERNEL6 (16 * 0) KERNEL7 (16 * 0) KERNEL8 (16 * 0) KERNEL9 (16 * 0) KERNEL10(16 * 0) KERNEL11(16 * 0) KERNEL12(16 * 0) KERNEL13(16 * 0) KERNEL14(16 * 0) KERNEL15(16 * 0) KERNEL16(16 * 0) cmpq $128 * 1, %rax NOBRANCH jle .L11 KERNEL1 (16 * 1) KERNEL2 (16 * 1) KERNEL3 (16 * 1) KERNEL4 (16 * 1) KERNEL5 (16 * 1) KERNEL6 (16 * 1) KERNEL7 (16 * 1) KERNEL8 (16 * 1) KERNEL9 (16 * 1) KERNEL10(16 * 1) KERNEL11(16 * 1) KERNEL12(16 * 1) KERNEL13(16 * 1) KERNEL14(16 * 1) KERNEL15(16 * 1) KERNEL16(16 * 1) cmpq $128 * 2, %rax NOBRANCH jle .L11 KERNEL1 (16 * 2) KERNEL2 (16 * 2) KERNEL3 (16 * 2) KERNEL4 (16 * 2) KERNEL5 (16 * 2) KERNEL6 (16 * 2) KERNEL7 (16 * 2) KERNEL8 (16 * 2) KERNEL9 (16 * 2) KERNEL10(16 * 2) KERNEL11(16 * 2) KERNEL12(16 * 2) KERNEL13(16 * 2) KERNEL14(16 * 2) KERNEL15(16 * 2) KERNEL16(16 * 2) cmpq $128 * 3, %rax NOBRANCH jle .L11 KERNEL1 (16 * 3) KERNEL2 (16 * 3) KERNEL3 (16 * 3) KERNEL4 (16 * 3) KERNEL5 (16 * 3) KERNEL6 (16 * 3) KERNEL7 (16 * 3) KERNEL8 (16 * 3) KERNEL9 (16 * 3) KERNEL10(16 * 3) KERNEL11(16 * 3) KERNEL12(16 * 3) KERNEL13(16 * 3) KERNEL14(16 * 3) KERNEL15(16 * 3) KERNEL16(16 * 3) cmpq $128 * 4, %rax 
NOBRANCH jle .L11 KERNEL1 (16 * 4) KERNEL2 (16 * 4) KERNEL3 (16 * 4) KERNEL4 (16 * 4) KERNEL5 (16 * 4) KERNEL6 (16 * 4) KERNEL7 (16 * 4) KERNEL8 (16 * 4) KERNEL9 (16 * 4) KERNEL10(16 * 4) KERNEL11(16 * 4) KERNEL12(16 * 4) KERNEL13(16 * 4) KERNEL14(16 * 4) KERNEL15(16 * 4) KERNEL16(16 * 4) cmpq $128 * 5, %rax NOBRANCH jle .L11 KERNEL1 (16 * 5) KERNEL2 (16 * 5) KERNEL3 (16 * 5) KERNEL4 (16 * 5) KERNEL5 (16 * 5) KERNEL6 (16 * 5) KERNEL7 (16 * 5) KERNEL8 (16 * 5) KERNEL9 (16 * 5) KERNEL10(16 * 5) KERNEL11(16 * 5) KERNEL12(16 * 5) KERNEL13(16 * 5) KERNEL14(16 * 5) KERNEL15(16 * 5) KERNEL16(16 * 5) cmpq $128 * 6, %rax NOBRANCH jle .L11 KERNEL1 (16 * 6) KERNEL2 (16 * 6) KERNEL3 (16 * 6) KERNEL4 (16 * 6) KERNEL5 (16 * 6) KERNEL6 (16 * 6) KERNEL7 (16 * 6) KERNEL8 (16 * 6) KERNEL9 (16 * 6) KERNEL10(16 * 6) KERNEL11(16 * 6) KERNEL12(16 * 6) KERNEL13(16 * 6) KERNEL14(16 * 6) KERNEL15(16 * 6) KERNEL16(16 * 6) cmpq $128 * 7, %rax NOBRANCH jle .L11 KERNEL1 (16 * 7) KERNEL2 (16 * 7) KERNEL3 (16 * 7) KERNEL4 (16 * 7) KERNEL5 (16 * 7) KERNEL6 (16 * 7) KERNEL7 (16 * 7) KERNEL8 (16 * 7) KERNEL9 (16 * 7) KERNEL10(16 * 7) KERNEL11(16 * 7) KERNEL12(16 * 7) KERNEL13(16 * 7) KERNEL14(16 * 7) KERNEL15(16 * 7) KERNEL16(16 * 7) addq $32 * 8 * SIZE, AO addq $32 * 8 * SIZE, BO subq $128 * 8, %rax jg .L1X .L11: leaq (AO, %rax, 2), AO # * 16 leaq (BO, %rax, 2), BO # * 64 ALIGN_4 .L12: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH je .L14 ALIGN_4 .L13: mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm10 ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movddup 0 * SIZE(BO), %xmm11 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm4 movddup 1 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD2 %xmm11, %xmm5 movddup 2 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm6 movddup 3 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD2 %xmm11, %xmm7 addq $4 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L13 ALIGN_4 .L14: SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm3, %xmm3 SHUFPD_1 %xmm5, %xmm5 SHUFPD_1 %xmm7, %xmm7 #ifndef CONJ addsubpd %xmm1, %xmm0 addsubpd %xmm3, %xmm2 addsubpd %xmm5, %xmm4 addsubpd %xmm7, %xmm6 #else addsubpd %xmm0, %xmm1 addsubpd %xmm2, %xmm3 addsubpd %xmm4, %xmm5 addsubpd %xmm6, %xmm7 #endif #if defined(LN) || defined(RT) movq KK, %rax subq $2, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm8 movapd 2 * SIZE(BO), %xmm9 movapd 4 * SIZE(BO), %xmm10 movapd 6 * SIZE(BO), %xmm11 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm9 movapd 4 * SIZE(AO), %xmm10 movapd 6 * SIZE(AO), %xmm11 #endif #if (defined(LN) || defined(LT)) && !defined(CONJ) subpd %xmm0, %xmm8 subpd %xmm2, %xmm9 subpd %xmm4, %xmm10 subpd %xmm6, %xmm11 #elif (defined(LN) || defined(LT)) && defined(CONJ) subpd %xmm1, %xmm8 subpd %xmm3, %xmm9 subpd %xmm5, %xmm10 subpd %xmm7, %xmm11 #elif (defined(RN) || defined(RT)) && !defined(CONJ) subpd %xmm0, %xmm8 subpd %xmm4, %xmm9 subpd %xmm2, %xmm10 subpd %xmm6, %xmm11 #else addsubpd %xmm1, %xmm8 addsubpd %xmm5, %xmm9 addsubpd %xmm3, %xmm10 addsubpd %xmm7, %xmm11 #endif #ifdef CONJ pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #endif #if defined(LN) || defined(RT) #ifdef LN movddup 6 * SIZE(AO), %xmm0 movddup 7 * 
SIZE(AO), %xmm1 movddup 4 * SIZE(AO), %xmm2 movddup 5 * SIZE(AO), %xmm3 movddup 0 * SIZE(AO), %xmm4 movddup 1 * SIZE(AO), %xmm5 #else movddup 6 * SIZE(BO), %xmm0 movddup 7 * SIZE(BO), %xmm1 movddup 4 * SIZE(BO), %xmm2 movddup 5 * SIZE(BO), %xmm3 movddup 0 * SIZE(BO), %xmm4 movddup 1 * SIZE(BO), %xmm5 #endif #ifdef CONJ xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 xorpd %xmm7, %xmm5 #endif movapd %xmm10, %xmm12 movapd %xmm11, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm0, %xmm10 mulpd %xmm0, %xmm11 mulpd %xmm1, %xmm12 mulpd %xmm1, %xmm13 addsubpd %xmm12, %xmm10 addsubpd %xmm13, %xmm11 movapd %xmm10, %xmm12 movapd %xmm10, %xmm13 movapd %xmm11, %xmm14 movapd %xmm11, %xmm15 SHUFPD_1 %xmm13, %xmm13 SHUFPD_1 %xmm15, %xmm15 mulpd %xmm2, %xmm12 mulpd %xmm2, %xmm14 mulpd %xmm3, %xmm13 mulpd %xmm3, %xmm15 addsubpd %xmm13, %xmm12 addsubpd %xmm15, %xmm14 subpd %xmm12, %xmm8 subpd %xmm14, %xmm9 movapd %xmm8, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm4, %xmm8 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm12 mulpd %xmm5, %xmm13 addsubpd %xmm12, %xmm8 addsubpd %xmm13, %xmm9 #endif #if defined(LT) || defined(RN) #ifdef LT movddup 0 * SIZE(AO), %xmm0 movddup 1 * SIZE(AO), %xmm1 movddup 2 * SIZE(AO), %xmm2 movddup 3 * SIZE(AO), %xmm3 movddup 6 * SIZE(AO), %xmm4 movddup 7 * SIZE(AO), %xmm5 #else movddup 0 * SIZE(BO), %xmm0 movddup 1 * SIZE(BO), %xmm1 movddup 2 * SIZE(BO), %xmm2 movddup 3 * SIZE(BO), %xmm3 movddup 6 * SIZE(BO), %xmm4 movddup 7 * SIZE(BO), %xmm5 #endif #ifdef CONJ xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 xorpd %xmm7, %xmm5 #endif movapd %xmm8, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm12 mulpd %xmm1, %xmm13 addsubpd %xmm12, %xmm8 addsubpd %xmm13, %xmm9 movapd %xmm8, %xmm12 movapd %xmm8, %xmm13 movapd %xmm9, %xmm14 movapd %xmm9, %xmm15 SHUFPD_1 %xmm13, %xmm13 SHUFPD_1 %xmm15, %xmm15 mulpd %xmm2, %xmm12 mulpd %xmm2, %xmm14 mulpd %xmm3, %xmm13 mulpd %xmm3, %xmm15 addsubpd %xmm13, %xmm12 addsubpd %xmm15, %xmm14 subpd %xmm12, %xmm10 subpd %xmm14, %xmm11 movapd %xmm10, %xmm12 movapd %xmm11, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm4, %xmm10 mulpd %xmm4, %xmm11 mulpd %xmm5, %xmm12 mulpd %xmm5, %xmm13 addsubpd %xmm12, %xmm10 addsubpd %xmm13, %xmm11 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm10, 2 * SIZE(CO1) movhpd %xmm10, 3 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhpd %xmm9, 1 * SIZE(CO2) movsd %xmm11, 2 * SIZE(CO2) movhpd %xmm11, 3 * SIZE(CO2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 2 * SIZE(CO1) movhpd %xmm9, 3 * SIZE(CO1) movsd %xmm10, 0 * SIZE(CO2) movhpd %xmm10, 1 * SIZE(CO2) movsd %xmm11, 2 * SIZE(CO2) movhpd %xmm11, 3 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) movapd %xmm10, 4 * SIZE(BO) movapd %xmm11, 6 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm9, 2 * SIZE(AO) movapd %xmm10, 4 * SIZE(AO) movapd %xmm11, 6 * SIZE(AO) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L10 ALIGN_4 .L30: testq $1, M jle .L99 #ifdef LN movq K, 
%rax salq $0 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #else movq B, BO #endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movddup 0 * SIZE(BO), %xmm9 pxor %xmm1, %xmm1 movapd 8 * SIZE(AO), %xmm10 pxor %xmm2, %xmm2 movddup 8 * SIZE(BO), %xmm11 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L42 .L41: mulpd %xmm8, %xmm9 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm0 movddup 5 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD2 %xmm9, %xmm1 movddup 6 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 7 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 4 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 16 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm0 movddup 9 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD2 %xmm11, %xmm1 movddup 10 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm2 movddup 11 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 6 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm3 movddup 12 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm0 movddup 13 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD2 %xmm11, %xmm1 movddup 14 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 ADD1 %xmm11, %xmm2 movddup 15 * SIZE(BO), %xmm11 mulpd %xmm8, %xmm11 movapd 16 * SIZE(AO), %xmm8 ADD2 %xmm11, %xmm3 movddup 24 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm0 movddup 17 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD2 %xmm9, %xmm1 movddup 18 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm2 movddup 19 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 10 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm3 movddup 20 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm0 movddup 21 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD2 %xmm9, %xmm1 movddup 22 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 ADD1 %xmm9, %xmm2 movddup 23 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm9 movapd 12 * SIZE(AO), %xmm10 ADD2 %xmm9, %xmm3 movddup 32 * SIZE(BO), %xmm9 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm0 movddup 25 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD2 %xmm11, %xmm1 movddup 26 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm2 movddup 27 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 14 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm3 movddup 28 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm0 movddup 29 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD2 %xmm11, %xmm1 movddup 30 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 ADD1 %xmm11, %xmm2 movddup 31 * SIZE(BO), %xmm11 mulpd %xmm10, %xmm11 movapd 24 * SIZE(AO), %xmm10 ADD2 %xmm11, %xmm3 movddup 40 * SIZE(BO), %xmm11 addq $16 * SIZE, AO addq $32 * SIZE, BO decq %rax jne .L41 .L42: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $7, %rax # if (k & 1) BRANCH jle .L44 .L43: mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm0 movddup 1 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD2 %xmm9, %xmm1 movddup 2 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 ADD1 %xmm9, %xmm2 movddup 3 * SIZE(BO), %xmm9 mulpd %xmm8, %xmm9 movapd 2 * SIZE(AO), %xmm8 ADD2 %xmm9, %xmm3 movddup 4 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $4 * SIZE, BO # boffset1 += 8 decq %rax jg .L43 ALIGN_4 .L44: SHUFPD_1 %xmm1, %xmm1 
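/* Descriptive note (not in the original source): xmm1/xmm3 hold the partial products taken against the imaginary parts of B; SHUFPD_1 swaps their lanes, and the addsubpd that follows merges them with the real-part accumulators xmm0/xmm2 to form the real and imaginary parts of each complex dot product. */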
SHUFPD_1 %xmm3, %xmm3 #ifndef CONJ addsubpd %xmm1, %xmm0 addsubpd %xmm3, %xmm2 #else addsubpd %xmm0, %xmm1 addsubpd %xmm2, %xmm3 #endif #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif leaq (, %rax, SIZE), %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif #if defined(LN) || defined(LT) movapd 0 * SIZE(BO), %xmm8 movapd 2 * SIZE(BO), %xmm9 #else movapd 0 * SIZE(AO), %xmm8 movapd 2 * SIZE(AO), %xmm9 #endif #if (defined(LN) || defined(LT)) && !defined(CONJ) subpd %xmm0, %xmm8 subpd %xmm2, %xmm9 #elif (defined(LN) || defined(LT)) && defined(CONJ) subpd %xmm1, %xmm8 subpd %xmm3, %xmm9 #elif (defined(RN) || defined(RT)) && !defined(CONJ) subpd %xmm0, %xmm8 subpd %xmm2, %xmm9 #else addsubpd %xmm1, %xmm8 addsubpd %xmm3, %xmm9 #endif #ifdef CONJ pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #endif #ifdef LN movddup 0 * SIZE(AO), %xmm4 movddup 1 * SIZE(AO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm5 #endif movapd %xmm8, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm4, %xmm8 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm12 mulpd %xmm5, %xmm13 addsubpd %xmm12, %xmm8 addsubpd %xmm13, %xmm9 #endif #ifdef LT movddup 0 * SIZE(AO), %xmm0 movddup 1 * SIZE(AO), %xmm1 #ifdef CONJ xorpd %xmm7, %xmm1 #endif movapd %xmm8, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm12, %xmm12 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm12 mulpd %xmm1, %xmm13 addsubpd %xmm12, %xmm8 addsubpd %xmm13, %xmm9 #endif #ifdef RN movddup 0 * SIZE(BO), %xmm0 movddup 1 * SIZE(BO), %xmm1 movddup 2 * SIZE(BO), %xmm2 movddup 3 * SIZE(BO), %xmm3 movddup 6 * SIZE(BO), %xmm4 movddup 7 * SIZE(BO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 xorpd %xmm7, %xmm5 #endif movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm0, %xmm8 mulpd %xmm1, %xmm12 addsubpd %xmm12, %xmm8 movapd %xmm8, %xmm12 movapd %xmm8, %xmm13 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm2, %xmm12 mulpd %xmm3, %xmm13 addsubpd %xmm13, %xmm12 subpd %xmm12, %xmm9 movapd %xmm9, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm4, %xmm9 mulpd %xmm5, %xmm12 addsubpd %xmm12, %xmm9 #endif #ifdef RT movddup 6 * SIZE(BO), %xmm0 movddup 7 * SIZE(BO), %xmm1 movddup 4 * SIZE(BO), %xmm2 movddup 5 * SIZE(BO), %xmm3 movddup 0 * SIZE(BO), %xmm4 movddup 1 * SIZE(BO), %xmm5 #ifdef CONJ xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 xorpd %xmm7, %xmm5 #endif movapd %xmm9, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm0, %xmm9 mulpd %xmm1, %xmm12 addsubpd %xmm12, %xmm9 movapd %xmm9, %xmm12 movapd %xmm9, %xmm13 SHUFPD_1 %xmm13, %xmm13 mulpd %xmm2, %xmm12 mulpd %xmm3, %xmm13 addsubpd %xmm13, %xmm12 subpd %xmm12, %xmm8 movapd %xmm8, %xmm12 SHUFPD_1 %xmm12, %xmm12 mulpd %xmm4, %xmm8 mulpd %xmm5, %xmm12 addsubpd %xmm12, %xmm8 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhpd %xmm9, 1 * SIZE(CO2) #else movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) movsd %xmm9, 0 * SIZE(CO2) movhpd %xmm9, 1 * SIZE(CO2) #endif #if defined(LN) || defined(LT) movapd %xmm8, 0 * SIZE(BO) movapd %xmm9, 2 * SIZE(BO) #else movapd %xmm8, 0 * SIZE(AO) movapd %xmm9, 2 * SIZE(AO) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $0 + ZBASE_SHIFT, %rax addq %rax, AORIG 
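/* RT: AORIG += K << ZBASE_SHIFT (K complex elements), stepping past the single A row handled above */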
#endif ALIGN_4 .L99: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif decq J # j -- jg .L01 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_RT_2x4_nehalem.S000066400000000000000000001433541313527062700230320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx #define M %r13 #define N %r14 #define K %r15 #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %rbx #define CO2 %rbp #define KK %rdx #define BB %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #define OFFSET 48(%rsp) #define J 56(%rsp) #define KKK 64(%rsp) #define AORIG 72(%rsp) #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #define OFFSET 224(%rsp) #define J 232(%rsp) #define KKK 240(%rsp) #define AORIG 248(%rsp) #endif #define PREFETCHSIZE (16 * 1 + 4) #define PREFETCH prefetcht0 #define ADD1 addps #define ADD2 addps PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C #endif subq $-32 * SIZE, A subq $-32 * SIZE, B movq OLD_M, M movq OLD_N, N movq OLD_K, K movq OLD_LDC, LDC movq OLD_OFFSET, KK salq $ZBASE_SHIFT, LDC movq KK, OFFSET negq KK #ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RT movq N, KK subq OFFSET, KK #endif testq $1, N BRANCH jle .L30 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B subq LDC, C #endif movq C, CO1 #ifndef RT addq LDC, C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif #ifdef LT movq OFFSET, KK #endif movq M, I sarq $1, I NOBRANCH jle .L60 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L55 ALIGN_3 .L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 movddup -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movddup -30 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 mulps %xmm0, %xmm2 movaps -24 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movddup -28 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 mulps %xmm0, %xmm2 movaps -20 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movddup -26 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, 
%xmm2 mulps %xmm0, %xmm1 mulps %xmm0, %xmm2 movaps -16 * SIZE(AO), %xmm0 subq $ -8 * SIZE, BO subq $-16 * SIZE, AO subq $1, %rax BRANCH jg .L52 ALIGN_3 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L58 ALIGN_3 .L56: ADD1 %xmm1, %xmm8 movddup -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 mulps %xmm0, %xmm2 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L56 ALIGN_3 .L58: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO #endif ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm8 #else xorps %xmm0, %xmm9 #endif #else #ifndef CONJ xorps %xmm0, %xmm8 #else shufps $0xb1, %xmm0, %xmm0 xorps %xmm0, %xmm9 #endif #endif haddps %xmm9, %xmm8 shufps $0xd8, %xmm8, %xmm8 #if defined(LN) || defined(LT) movaps -32 * SIZE(BO), %xmm9 subps %xmm8, %xmm9 movhlps %xmm9, %xmm11 #else movaps -32 * SIZE(AO), %xmm9 subps %xmm8, %xmm9 #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0xb1, %xmm7, %xmm7 #endif #ifdef LN movaps -28 * SIZE(AO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm2 subps %xmm3, %xmm9 subps %xmm2, %xmm9 movaps -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 addps %xmm10, %xmm9 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 addps %xmm10, %xmm9 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 movaps %xmm9, %xmm3 pshufd $0xb1, %xmm9, %xmm2 xorps %xmm7, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm2 subps %xmm3, %xmm11 subps %xmm2, %xmm11 movaps -28 * SIZE(AO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 #endif #if defined(RN) || defined(RT) movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm9, -32 * SIZE(BO) movlps %xmm11, -30 * SIZE(BO) movlps %xmm9, 0 * SIZE(CO1) movlps %xmm11, 2 * SIZE(CO1) #else movaps %xmm9, -32 * SIZE(AO) movlps %xmm9, 0 * SIZE(CO1) movhps %xmm9, 2 * SIZE(CO1) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- BRANCH jg .L51 ALIGN_4 .L60: testq $1, M BRANCH jle .L69 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO 
leaq (B, %rax, 1), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movsd -32 * SIZE(BO), %xmm5 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L65 ALIGN_3 .L62: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movsd -30 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movsd -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movsd -26 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -26 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movsd -24 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -24 * SIZE(AO), %xmm0 subq $-8 * SIZE, BO subq $-8 * SIZE, AO subq $1, %rax BRANCH jg .L62 ALIGN_3 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L68 ALIGN_3 .L66: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movsd -30 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $2 * SIZE, BO subq $1, %rax BRANCH jg .L66 ALIGN_3 .L68: #if defined(LN) || defined(RT) movq KK, %rax subq $1, %rax salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO #endif ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm9 shufps $0xb1, %xmm9, %xmm9 #else xorps %xmm0, %xmm8 shufps $0xb1, %xmm9, %xmm9 #endif #else #ifndef CONJ xorps %xmm0, %xmm9 shufps $0xb1, %xmm9, %xmm9 #else shufps $0xb1, %xmm9, %xmm9 xorps %xmm0, %xmm9 #endif #endif addps %xmm9, %xmm8 #if defined(LN) || defined(LT) movsd -32 * SIZE(BO), %xmm9 #else movsd -32 * SIZE(AO), %xmm9 #endif subps %xmm8, %xmm9 pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0xb1, %xmm7, %xmm7 #endif #if defined(LN) || defined(LT) movsd -32 * SIZE(AO), %xmm5 #endif #if defined(RN) || defined(RT) movsd -32 * SIZE(BO), %xmm5 #endif pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 addps %xmm10, %xmm9 #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm9, -32 * SIZE(BO) #else movlps %xmm9, -32 * SIZE(AO) #endif movlps %xmm9, (CO1) #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L69: #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L30: testq $2, N BRANCH jle .L50 #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif #ifdef LN movq OFFSET, KK addq M, KK 
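/* LN: KK = OFFSET + M, reset before walking this column block */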
#endif #ifdef LT movq OFFSET, KK #endif movq M, I sarq $1, I NOBRANCH jle .L40 ALIGN_4 .L31: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht2 4 * SIZE(CO2) xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L35 ALIGN_3 .L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm10 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm10 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -24 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm10 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm10 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $-16 * SIZE, AO subq $1, %rax BRANCH jg .L32 ALIGN_3 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L38 ALIGN_3 .L36: ADD1 %xmm1, %xmm8 movaps -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 ADD1 %xmm3, %xmm10 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD2 %xmm4, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L36 ALIGN_3 .L38: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO #endif ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 ADD1 %xmm3, %xmm10 ADD2 %xmm4, %xmm11 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 #else xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 #endif #else #ifndef CONJ xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 #else shufps $0xb1, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 #endif #endif haddps %xmm9, %xmm8 haddps %xmm11, %xmm10 shufps $0xd8, %xmm8, %xmm8 shufps $0xd8, %xmm10, %xmm10 movaps %xmm8, %xmm9 shufps $0xe4, %xmm10, %xmm8 shufps $0xe4, %xmm9, %xmm10 #if defined(LN) || defined(LT) movaps %xmm8, %xmm9 movlhps %xmm10, %xmm8 movhlps %xmm9, %xmm10 movaps -32 * SIZE(BO), %xmm9 movaps -28 * SIZE(BO), %xmm11 subps %xmm8, %xmm9 subps %xmm10, %xmm11 #else movaps -32 * SIZE(AO), %xmm9 movaps -28 * SIZE(AO), %xmm11 subps %xmm8, %xmm9 subps %xmm10, %xmm11 #endif pcmpeqb %xmm7, 
%xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0xb1, %xmm7, %xmm7 #endif #ifdef LN movaps -28 * SIZE(AO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm2 subps %xmm3, %xmm9 subps %xmm2, %xmm9 movaps -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 addps %xmm10, %xmm9 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 addps %xmm10, %xmm9 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 movaps %xmm9, %xmm3 pshufd $0xb1, %xmm9, %xmm2 xorps %xmm7, %xmm2 mulps %xmm0, %xmm3 mulps %xmm1, %xmm2 subps %xmm3, %xmm11 subps %xmm2, %xmm11 movaps -28 * SIZE(AO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 movaps %xmm9, %xmm3 pshufd $0xb1, %xmm9, %xmm2 xorps %xmm7, %xmm2 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 movaps -28 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 #endif #ifdef RT movaps -28 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm9, -32 * SIZE(BO) movaps %xmm11, -28 * SIZE(BO) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm11, 2 * SIZE(CO1) movhps %xmm9, 0 * SIZE(CO2) movhps %xmm11, 2 * SIZE(CO2) #else movaps %xmm9, -32 * SIZE(AO) movaps %xmm11, -28 * SIZE(AO) movsd %xmm9, 0 * SIZE(CO1) movhps %xmm9, 2 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO2) movhps %xmm11, 2 * SIZE(CO2) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- BRANCH jg .L31 ALIGN_4 .L40: testq $1, M BRANCH jle .L49 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #else movq B, BO #endif xorps %xmm1, %xmm1 movddup -32 * 
SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movaps -32 * SIZE(BO), %xmm5 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L45 ALIGN_3 .L42: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -24 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -26 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -16 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -24 * SIZE(AO), %xmm0 subq $-16 * SIZE, BO subq $ -8 * SIZE, AO subq $1, %rax BRANCH jg .L42 ALIGN_3 .L45: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L48 ALIGN_3 .L46: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $4 * SIZE, BO subq $1, %rax BRANCH jg .L46 ALIGN_3 .L48: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO #endif ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm9 shufps $0xb1, %xmm9, %xmm9 #else xorps %xmm0, %xmm8 shufps $0xb1, %xmm9, %xmm9 #endif #else #ifndef CONJ xorps %xmm0, %xmm9 shufps $0xb1, %xmm9, %xmm9 #else shufps $0xb1, %xmm9, %xmm9 xorps %xmm0, %xmm9 #endif #endif addps %xmm9, %xmm8 #if defined(LN) || defined(LT) movaps -32 * SIZE(BO), %xmm9 subps %xmm8, %xmm9 #else movaps -32 * SIZE(AO), %xmm9 subps %xmm8, %xmm9 movhlps %xmm9, %xmm11 #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0xb1, %xmm7, %xmm7 #endif #if defined(LN) || defined(LT) movsd -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 addps %xmm10, %xmm9 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 movaps %xmm9, %xmm3 pshufd $0xb1, %xmm9, %xmm2 xorps %xmm7, %xmm2 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 movaps -28 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 #endif #ifdef RT movaps -28 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps 
%xmm0, %xmm9 subps %xmm1, %xmm9 movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm9, -32 * SIZE(BO) movlps %xmm9, (CO1) movhps %xmm9, (CO2) #else movlps %xmm9, -32 * SIZE(AO) movlps %xmm11, -30 * SIZE(AO) movlps %xmm9, (CO1) movlps %xmm11, (CO2) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L49: #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif ALIGN_4 .L50: movq N, J sarq $2, J NOBRANCH jle .L999 ALIGN_4 .L01: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT movq K, %rax salq $2 + ZBASE_SHIFT, %rax subq %rax, B leaq (, LDC, 4), %rax subq %rax, C #endif movq C, CO1 leaq (C, LDC, 2), CO2 #ifndef RT leaq (C, LDC, 4), C #endif #ifdef LN movq OFFSET, KK addq M, KK #endif movq K, %rax salq $ZBASE_SHIFT + 2, %rax leaq (B, %rax), BB #ifdef LT movq OFFSET, KK #endif movq M, I sarq $1, I NOBRANCH jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #else movq B, BO #endif prefetchnta -32 * SIZE(BB) subq $-16 * SIZE, BB xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 prefetcht2 4 * SIZE(CO1) xorps %xmm9, %xmm9 prefetcht2 4 * SIZE(CO1, LDC, 1) xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 xorps %xmm12, %xmm12 prefetcht2 4 * SIZE(CO2) xorps %xmm13, %xmm13 prefetcht2 4 * SIZE(CO2, LDC, 1) xorps %xmm14, %xmm14 xorps %xmm15, %xmm15 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L15 ALIGN_3 .L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm12 movaps -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pshufd $0xb1, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 ADD1 %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD1 %xmm5, %xmm10 ADD2 %xmm6, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm12 movaps -24 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pshufd $0xb1, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 ADD1 %xmm1, %xmm8 movaps -20 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD1 %xmm5, %xmm10 ADD2 %xmm6, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -24 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm12 movaps -16 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0xb1, %xmm1, %xmm2 mulps 
%xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pshufd $0xb1, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 ADD1 %xmm1, %xmm8 movaps -12 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD1 %xmm5, %xmm10 ADD2 %xmm6, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -20 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm12 movaps -8 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pshufd $0xb1, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 ADD1 %xmm1, %xmm8 movaps -4 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 subq $-32 * SIZE, BO pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD1 %xmm5, %xmm10 ADD2 %xmm6, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -16 * SIZE(AO), %xmm0 subq $-16 * SIZE, AO subq $1, %rax BRANCH jg .L12 ALIGN_3 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L18 ALIGN_3 .L16: ADD1 %xmm1, %xmm12 movaps -32 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm13 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pshufd $0xb1, %xmm5, %xmm6 mulps %xmm0, %xmm5 mulps %xmm0, %xmm6 ADD1 %xmm1, %xmm8 movaps -28 * SIZE(BO), %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xb1, %xmm1, %xmm2 mulps %xmm0, %xmm1 pshufd $0x1b, %xmm2, %xmm3 mulps %xmm0, %xmm2 ADD1 %xmm5, %xmm10 ADD2 %xmm6, %xmm11 pshufd $0xb1, %xmm3, %xmm4 mulps %xmm0, %xmm3 mulps %xmm0, %xmm4 movaps -28 * SIZE(AO), %xmm0 addq $4 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L16 ALIGN_3 .L18: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $4, %rax #endif salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO #endif ADD1 %xmm1, %xmm12 ADD2 %xmm2, %xmm13 ADD1 %xmm3, %xmm14 ADD2 %xmm4, %xmm15 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 xorps %xmm0, %xmm12 xorps %xmm0, %xmm14 #else xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 xorps %xmm0, %xmm13 xorps %xmm0, %xmm15 #endif #else #ifndef CONJ xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 xorps %xmm0, %xmm12 xorps %xmm0, %xmm14 #else shufps $0xb1, %xmm0, %xmm0 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 xorps %xmm0, %xmm13 xorps %xmm0, %xmm15 #endif #endif haddps %xmm9, %xmm8 haddps %xmm11, %xmm10 haddps %xmm13, %xmm12 haddps %xmm15, %xmm14 shufps $0xd8, %xmm8, %xmm8 shufps $0xd8, %xmm10, %xmm10 shufps $0xd8, %xmm12, %xmm12 shufps $0xd8, %xmm14, %xmm14 movaps %xmm8, %xmm9 shufps $0xe4, %xmm10, %xmm8 shufps $0xe4, %xmm9, %xmm10 movaps %xmm12, %xmm13 shufps $0xe4, %xmm14, %xmm12 shufps $0xe4, %xmm13, %xmm14 #if defined(LN) || defined(LT) movaps %xmm8, %xmm9 movlhps %xmm10, %xmm8 movhlps %xmm9, %xmm10 movaps %xmm12, %xmm11 movlhps %xmm14, %xmm12 movhlps %xmm11, %xmm14 movaps -32 * SIZE(BO), %xmm9 movaps -28 * SIZE(BO), %xmm13 movaps -24 * SIZE(BO), %xmm11 movaps -20 * SIZE(BO), %xmm15 subps %xmm8, %xmm9 subps %xmm10, %xmm11 subps %xmm12, %xmm13 subps %xmm14, %xmm15 #else movaps -32 * SIZE(AO), %xmm9 movaps -28 * SIZE(AO), %xmm11 movaps -24 * SIZE(AO), %xmm13 movaps -20 * SIZE(AO), %xmm15 subps %xmm8, %xmm9 subps %xmm10, %xmm11 subps %xmm12, %xmm13 subps %xmm14, %xmm15 #endif pcmpeqb %xmm7, %xmm7 
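/* xmm7 is all ones here; the shift below keeps only bit 63 of each quadword, giving the sign mask XORed in to flip selected lanes of the complex terms */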
psllq $63, %xmm7 #ifndef CONJ shufps $0xb1, %xmm7, %xmm7 #endif #ifdef LN movaps -28 * SIZE(AO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 pshufd $0xb1, %xmm15, %xmm14 xorps %xmm7, %xmm10 xorps %xmm7, %xmm14 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 mulps %xmm0, %xmm15 mulps %xmm1, %xmm14 addps %xmm10, %xmm11 addps %xmm14, %xmm15 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 movaps %xmm15, %xmm5 pshufd $0xb1, %xmm15, %xmm4 xorps %xmm7, %xmm2 xorps %xmm7, %xmm4 mulps %xmm0, %xmm3 mulps %xmm1, %xmm2 mulps %xmm0, %xmm5 mulps %xmm1, %xmm4 subps %xmm3, %xmm9 subps %xmm2, %xmm9 subps %xmm5, %xmm13 subps %xmm4, %xmm13 movaps -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 pshufd $0xb1, %xmm13, %xmm14 xorps %xmm7, %xmm10 xorps %xmm7, %xmm14 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 mulps %xmm0, %xmm13 mulps %xmm1, %xmm14 addps %xmm10, %xmm9 addps %xmm14, %xmm13 #endif #ifdef LT movaps -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 pshufd $0xb1, %xmm13, %xmm14 xorps %xmm7, %xmm10 xorps %xmm7, %xmm14 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 mulps %xmm0, %xmm13 mulps %xmm1, %xmm14 addps %xmm10, %xmm9 addps %xmm14, %xmm13 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 movaps %xmm9, %xmm3 pshufd $0xb1, %xmm9, %xmm2 movaps %xmm13, %xmm5 pshufd $0xb1, %xmm13, %xmm4 xorps %xmm7, %xmm2 xorps %xmm7, %xmm4 mulps %xmm0, %xmm3 mulps %xmm1, %xmm2 mulps %xmm0, %xmm5 mulps %xmm1, %xmm4 subps %xmm3, %xmm11 subps %xmm2, %xmm11 subps %xmm5, %xmm15 subps %xmm4, %xmm15 movaps -28 * SIZE(AO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 pshufd $0xb1, %xmm15, %xmm14 xorps %xmm7, %xmm10 xorps %xmm7, %xmm14 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 mulps %xmm0, %xmm15 mulps %xmm1, %xmm14 addps %xmm10, %xmm11 addps %xmm14, %xmm15 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 movaps %xmm9, %xmm3 pshufd $0xb1, %xmm9, %xmm2 xorps %xmm7, %xmm2 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 movaps -28 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm13 subps %xmm1, %xmm13 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm15 subps %xmm1, %xmm15 movaps -24 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 movaps -20 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm13 subps %xmm1, %xmm13 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm15 subps %xmm1, %xmm15 movaps -12 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm13, %xmm12 xorps %xmm7, %xmm12 mulps %xmm0, %xmm13 mulps %xmm1, %xmm12 addps %xmm12, %xmm13 movaps %xmm13, %xmm3 pshufd $0xb1, %xmm13, %xmm2 xorps %xmm7, %xmm2 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps 
%xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm15 subps %xmm1, %xmm15 movaps -4 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm15, %xmm14 xorps %xmm7, %xmm14 mulps %xmm0, %xmm15 mulps %xmm1, %xmm14 addps %xmm14, %xmm15 #endif #ifdef RT movaps -4 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm15, %xmm14 xorps %xmm7, %xmm14 mulps %xmm0, %xmm15 mulps %xmm1, %xmm14 addps %xmm14, %xmm15 movaps %xmm15, %xmm3 pshufd $0xb1, %xmm15, %xmm2 xorps %xmm7, %xmm2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm13 subps %xmm1, %xmm13 movaps -8 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm13, %xmm12 xorps %xmm7, %xmm12 mulps %xmm0, %xmm13 mulps %xmm1, %xmm12 addps %xmm12, %xmm13 movaps %xmm13, %xmm3 pshufd $0xb1, %xmm13, %xmm2 xorps %xmm7, %xmm2 movaps -16 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -24 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm9, -32 * SIZE(BO) movaps %xmm13, -28 * SIZE(BO) movaps %xmm11, -24 * SIZE(BO) movaps %xmm15, -20 * SIZE(BO) movsd %xmm9, 0 * SIZE(CO1) movsd %xmm11, 2 * SIZE(CO1) movhps %xmm9, 0 * SIZE(CO1, LDC) movhps %xmm11, 2 * SIZE(CO1, LDC) movsd %xmm13, 0 * SIZE(CO2) movsd %xmm15, 2 * SIZE(CO2) movhps %xmm13, 0 * SIZE(CO2, LDC) movhps %xmm15, 2 * SIZE(CO2, LDC) #else movaps %xmm9, -32 * SIZE(AO) movaps %xmm11, -28 * SIZE(AO) movaps %xmm13, -24 * SIZE(AO) movaps %xmm15, -20 * SIZE(AO) movsd %xmm9, 0 * SIZE(CO1) movhps %xmm9, 2 * SIZE(CO1) movsd %xmm11, 0 * SIZE(CO1, LDC) movhps %xmm11, 2 * SIZE(CO1, LDC) movsd %xmm13, 0 * SIZE(CO2) movhps %xmm13, 2 * SIZE(CO2) movsd %xmm15, 0 * SIZE(CO2, LDC) movhps %xmm15, 2 * SIZE(CO2, LDC) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $2, KK #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- BRANCH jg .L11 ALIGN_4 .L20: testq $1, M BRANCH jle .L29 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq AORIG, AO movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #else movq B, BO #endif 
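/* leftover single row of M in this four-column block: clear the accumulators and preload the first A/B values */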
xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 movaps -32 * SIZE(BO), %xmm5 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 xorps %xmm10, %xmm10 xorps %xmm11, %xmm11 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax NOBRANCH jle .L25 ALIGN_3 .L22: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm10 pshufd $0xa0, %xmm5, %xmm3 mulps %xmm0, %xmm3 ADD2 %xmm4, %xmm11 pshufd $0xf5, %xmm5, %xmm4 movaps -24 * SIZE(BO), %xmm5 mulps %xmm0, %xmm4 movddup -30 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -20 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm10 pshufd $0xa0, %xmm5, %xmm3 mulps %xmm0, %xmm3 ADD2 %xmm4, %xmm11 pshufd $0xf5, %xmm5, %xmm4 movaps -16 * SIZE(BO), %xmm5 mulps %xmm0, %xmm4 movddup -28 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -12 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm10 pshufd $0xa0, %xmm5, %xmm3 mulps %xmm0, %xmm3 ADD2 %xmm4, %xmm11 pshufd $0xf5, %xmm5, %xmm4 movaps -8 * SIZE(BO), %xmm5 mulps %xmm0, %xmm4 movddup -26 * SIZE(AO), %xmm0 ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -4 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm10 pshufd $0xa0, %xmm5, %xmm3 mulps %xmm0, %xmm3 ADD2 %xmm4, %xmm11 pshufd $0xf5, %xmm5, %xmm4 movaps 0 * SIZE(BO), %xmm5 mulps %xmm0, %xmm4 movddup -24 * SIZE(AO), %xmm0 subq $-32 * SIZE, BO subq $ -8 * SIZE, AO subq $1, %rax BRANCH jg .L22 ALIGN_3 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax # if (k & 1) BRANCH je .L28 ALIGN_3 .L26: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 ADD2 %xmm2, %xmm9 pshufd $0xf5, %xmm5, %xmm2 movaps -28 * SIZE(BO), %xmm5 mulps %xmm0, %xmm2 ADD1 %xmm3, %xmm10 pshufd $0xa0, %xmm5, %xmm3 mulps %xmm0, %xmm3 ADD2 %xmm4, %xmm11 pshufd $0xf5, %xmm5, %xmm4 movaps -24 * SIZE(BO), %xmm5 mulps %xmm0, %xmm4 movddup -30 * SIZE(AO), %xmm0 addq $2 * SIZE, AO addq $8 * SIZE, BO subq $1, %rax BRANCH jg .L26 ALIGN_3 .L28: #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $4, %rax #endif salq $ZBASE_SHIFT, %rax movq AORIG, AO leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO #endif ADD1 %xmm1, %xmm8 ADD2 %xmm2, %xmm9 ADD1 %xmm3, %xmm10 ADD2 %xmm4, %xmm11 pcmpeqb %xmm0, %xmm0 psllq $63, %xmm0 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 #else xorps %xmm0, %xmm8 xorps %xmm0, %xmm10 shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 #endif #else #ifndef CONJ xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 #else shufps $0xb1, %xmm9, %xmm9 shufps $0xb1, %xmm11, %xmm11 xorps %xmm0, %xmm9 xorps %xmm0, %xmm11 #endif #endif addps %xmm9, %xmm8 addps %xmm11, %xmm10 #if defined(LN) || defined(LT) movaps -32 * SIZE(BO), %xmm9 movaps -28 * SIZE(BO), %xmm11 subps %xmm8, %xmm9 subps %xmm10, %xmm11 #else movaps -32 * SIZE(AO), %xmm9 movaps -28 * SIZE(AO), %xmm13 subps %xmm8, %xmm9 subps %xmm10, %xmm13 movhlps %xmm9, %xmm11 movhlps %xmm13, %xmm15 #endif pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 #ifndef CONJ shufps $0xb1, %xmm7, 
%xmm7 #endif #if defined(LN) || defined(LT) movsd -32 * SIZE(AO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm10 pshufd $0xb1, %xmm11, %xmm12 xorps %xmm7, %xmm10 xorps %xmm7, %xmm12 mulps %xmm0, %xmm9 mulps %xmm1, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm12 addps %xmm10, %xmm9 addps %xmm12, %xmm11 #endif #ifdef RN movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 movaps %xmm9, %xmm3 pshufd $0xb1, %xmm9, %xmm2 xorps %xmm7, %xmm2 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 movaps -28 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm13 subps %xmm1, %xmm13 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm15 subps %xmm1, %xmm15 movaps -24 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, %xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 movaps -20 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm13 subps %xmm1, %xmm13 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm15 subps %xmm1, %xmm15 movaps -12 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm13, %xmm12 xorps %xmm7, %xmm12 mulps %xmm0, %xmm13 mulps %xmm1, %xmm12 addps %xmm12, %xmm13 movaps %xmm13, %xmm3 pshufd $0xb1, %xmm13, %xmm2 xorps %xmm7, %xmm2 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm15 subps %xmm1, %xmm15 movaps -4 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm15, %xmm14 xorps %xmm7, %xmm14 mulps %xmm0, %xmm15 mulps %xmm1, %xmm14 addps %xmm14, %xmm15 #endif #ifdef RT movaps -4 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm15, %xmm14 xorps %xmm7, %xmm14 mulps %xmm0, %xmm15 mulps %xmm1, %xmm14 addps %xmm14, %xmm15 movaps %xmm15, %xmm3 pshufd $0xb1, %xmm15, %xmm2 xorps %xmm7, %xmm2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm13 subps %xmm1, %xmm13 movaps -8 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -12 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm13, %xmm12 xorps %xmm7, %xmm12 mulps %xmm0, %xmm13 mulps %xmm1, %xmm12 addps %xmm12, %xmm13 movaps %xmm13, %xmm3 pshufd $0xb1, %xmm13, %xmm2 xorps %xmm7, %xmm2 movaps -16 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm11 subps %xmm1, %xmm11 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -24 * SIZE(BO), %xmm5 pshufd $0xaa, %xmm5, %xmm0 pshufd $0xff, %xmm5, %xmm1 pshufd $0xb1, %xmm11, %xmm10 xorps %xmm7, 
%xmm10 mulps %xmm0, %xmm11 mulps %xmm1, %xmm10 addps %xmm10, %xmm11 movaps %xmm11, %xmm3 pshufd $0xb1, %xmm11, %xmm2 xorps %xmm7, %xmm2 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 mulps %xmm3, %xmm0 mulps %xmm2, %xmm1 subps %xmm0, %xmm9 subps %xmm1, %xmm9 movaps -32 * SIZE(BO), %xmm5 pshufd $0x00, %xmm5, %xmm0 pshufd $0x55, %xmm5, %xmm1 pshufd $0xb1, %xmm9, %xmm8 xorps %xmm7, %xmm8 mulps %xmm0, %xmm9 mulps %xmm1, %xmm8 addps %xmm8, %xmm9 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm9, -32 * SIZE(BO) movaps %xmm11, -28 * SIZE(BO) movsd %xmm9, (CO1) movhps %xmm9, (CO1, LDC) movsd %xmm11, (CO2) movhps %xmm11, (CO2, LDC) #else movlhps %xmm11, %xmm9 movlhps %xmm15, %xmm13 movaps %xmm9, -32 * SIZE(AO) movaps %xmm13, -28 * SIZE(AO) movlps %xmm9, (CO1) movlps %xmm11, (CO1, LDC) movlps %xmm13, (CO2) movlps %xmm15, (CO2, LDC) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO #endif #ifdef LN subq $1, KK #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L29: #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax leaq (B, %rax, 4), B #endif #if defined(LT) || defined(RN) movq BO, B #endif #ifdef RN addq $4, KK #endif #ifdef RT subq $4, KK #endif subq $1, J BRANCH jg .L01 ALIGN_4 .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S000066400000000000000000002140051313527062700222030ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" #define OLD_M %rdi #define OLD_N %rsi #define M %r13 #define N %r14 #define K %rdx #define A %rcx #define B %r8 #define C %r9 #define LDC %r10 #define I %r11 #define J %r12 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %rbp #ifndef WINDOWS_ABI #define STACKSIZE 64 #define OLD_LDC 8 + STACKSIZE(%rsp) #define OLD_OFFSET 16 + STACKSIZE(%rsp) #else #define STACKSIZE 256 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define OLD_LDC 72 + STACKSIZE(%rsp) #define OLD_OFFSET 80 + STACKSIZE(%rsp) #endif #define POSINV 0(%rsp) #define OFFSET 16(%rsp) #define KK 24(%rsp) #define KKK 32(%rsp) #define AORIG 40(%rsp) #define BORIG 48(%rsp) #define BUFFER 128(%rsp) #ifdef OPTERON #define movsd movlps #endif #if defined(PENTIUM4) || defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(ATOM) || defined(NANO) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHNTA prefetchnta #define PREFETCHSIZE (8 * 6 + 4) #endif #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta #define PREFETCHSIZE (8 * 6 + 4) #endif #ifdef GENERIC #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHNTA prefetchnta #define PREFETCHSIZE (8 * 6 + 4) #endif #define KERNEL1(xx) \ mulps %xmm8, %xmm9 ;\ addps %xmm9, %xmm0 ;\ movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulps %xmm8, %xmm11 ;\ PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ addps %xmm11, %xmm1 ;\ movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm8, %xmm13 ;\ mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ addps %xmm13, %xmm2 ;\ movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm8, %xmm3 ;\ movaps 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 #define KERNEL2(xx) \ mulps %xmm10, %xmm9 ;\ addps %xmm9, %xmm4 ;\ movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulps %xmm10, %xmm11 ;\ addps %xmm11, %xmm5 ;\ movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm10, %xmm13 ;\ mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ addps %xmm13, %xmm6 ;\ movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm10, %xmm7 ;\ movaps 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 #define KERNEL3(xx) \ mulps %xmm12, %xmm15 ;\ addps %xmm15, %xmm0 ;\ movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulps %xmm12, %xmm11 ;\ addps %xmm11, %xmm1 ;\ movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm12, %xmm13 ;\ mulps 28 * SIZE + 
2 * (xx) * SIZE(BO), %xmm12 ;\ addps %xmm13, %xmm2 ;\ movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm12, %xmm3 ;\ movaps 24 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 #define KERNEL4(xx) \ mulps %xmm14, %xmm15 ;\ addps %xmm15, %xmm4 ;\ movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulps %xmm14, %xmm11 ;\ addps %xmm11, %xmm5 ;\ movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm14, %xmm13 ;\ mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ addps %xmm13, %xmm6 ;\ movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm14, %xmm7 ;\ movaps 28 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 #define KERNEL5(xx) \ mulps %xmm8, %xmm9 ;\ addps %xmm9, %xmm0 ;\ movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulps %xmm8, %xmm11 ;\ PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ addps %xmm11, %xmm1 ;\ movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm8, %xmm13 ;\ mulps 44 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ addps %xmm13, %xmm2 ;\ movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm8, %xmm3 ;\ movaps 32 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 #define KERNEL6(xx) \ mulps %xmm10, %xmm9 ;\ addps %xmm9, %xmm4 ;\ movaps 64 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ mulps %xmm10, %xmm11 ;\ addps %xmm11, %xmm5 ;\ movaps 52 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm10, %xmm13 ;\ mulps 44 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ addps %xmm13, %xmm6 ;\ movaps 56 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm10, %xmm7 ;\ movaps 36 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 #define KERNEL7(xx) \ mulps %xmm12, %xmm15 ;\ addps %xmm15, %xmm0 ;\ movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulps %xmm12, %xmm11 ;\ addps %xmm11, %xmm1 ;\ movaps 52 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm12, %xmm13 ;\ mulps 60 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ addps %xmm13, %xmm2 ;\ movaps 56 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm12, %xmm3 ;\ movaps 40 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 #define KERNEL8(xx) \ mulps %xmm14, %xmm15 ;\ addps %xmm15, %xmm4 ;\ movaps 80 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ mulps %xmm14, %xmm11 ;\ addps %xmm11, %xmm5 ;\ movaps 68 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ mulps %xmm14, %xmm13 ;\ mulps 60 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ addps %xmm13, %xmm6 ;\ movaps 72 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addps %xmm14, %xmm7 ;\ movaps 44 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 #else movq OLD_LDC, LDC movsd OLD_OFFSET, %xmm4 #endif movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING movq OLD_M, M movq OLD_N, N pxor %xmm15, %xmm15 cmpeqps %xmm15, %xmm15 pslld $31, %xmm15 # Generate mask pxor %xmm2, %xmm2 #ifndef CONJ movss %xmm15, 0 + POSINV movss %xmm2, 4 + POSINV movss %xmm15, 8 + POSINV movss %xmm2, 12 + POSINV #else movss %xmm2, 0 + POSINV movss %xmm15, 4 + POSINV movss %xmm2, 8 + POSINV movss %xmm15, 12 + POSINV #endif movlpd %xmm4, OFFSET movlpd %xmm4, KK salq $ZBASE_SHIFT, LDC 
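/* LDC now counts bytes: one complex element spans 2*SIZE bytes (1 << ZBASE_SHIFT) */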
#ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax addq %rax, C imulq K, %rax addq %rax, A #endif #ifdef RT movq N, %rax salq $ZBASE_SHIFT, %rax imulq K, %rax addq %rax, B movq N, %rax imulq LDC, %rax addq %rax, C #endif #ifdef RN negq KK #endif #ifdef RT movq N, %rax subq OFFSET, %rax movq %rax, KK #endif testq $1, N je .L40 ALIGN_4 #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L43 ALIGN_4 .L42: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) addq $ 8 * SIZE, B addq $32 * SIZE, BO decq %rax jne .L42 ALIGN_4 .L43: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L50 ALIGN_4 .L44: movlps 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) addq $2 * SIZE, B addq $8 * SIZE, BO decq %rax jne .L44 ALIGN_4 .L50: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT subq LDC, C #endif movq C, CO1 # coffset1 = c #ifndef RT addq LDC, C #endif movq M, I sarq $2, I # i = (m >> 2) jle .L60 ALIGN_4 .L51: #ifdef LN movq K, %rax salq $2 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 32 * SIZE(AO), %xmm12 movaps 48 * SIZE(AO), %xmm14 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 PREFETCHW 4 * SIZE(CO1) #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L55 ALIGN_4 .L52: mulps %xmm8, %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 8 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 12 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 64 * SIZE(AO), %xmm8 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 16 * SIZE(BO), %xmm11 
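/* Throughout the .L52 loop: xmm0/xmm4 accumulate A times the broadcast real
   part of b, xmm1/xmm5 accumulate A times the broadcast imaginary part; the
   two halves are combined after the sign fix-up at .L58. */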
addps %xmm10, %xmm1 movaps 20 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 20 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps 24 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, %xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm10, %xmm1 movaps 28 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 mulps 28 * SIZE(BO), %xmm10 addps %xmm11, %xmm4 movaps 80 * SIZE(BO), %xmm11 addps %xmm10, %xmm5 movaps 80 * SIZE(AO), %xmm10 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 32 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 36 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 36 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm5 movaps 40 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm12, %xmm1 movaps 44 * SIZE(AO), %xmm12 mulps %xmm12, %xmm13 mulps 44 * SIZE(BO), %xmm12 addps %xmm13, %xmm4 movaps 96 * SIZE(BO), %xmm13 addps %xmm12, %xmm5 movaps 96 * SIZE(AO), %xmm12 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 48 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 52 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 52 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 56 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm0 movaps 56 * SIZE(BO), %xmm15 addps %xmm14, %xmm1 movaps 60 * SIZE(AO), %xmm14 mulps %xmm14, %xmm15 mulps 60 * SIZE(BO), %xmm14 addps %xmm15, %xmm4 movaps 112 * SIZE(BO), %xmm15 addps %xmm14, %xmm5 movaps 112 * SIZE(AO), %xmm14 addq $64 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L52 ALIGN_4 .L55: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movaps POSINV, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L58 ALIGN_4 .L56: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm5 movaps 8 * SIZE(AO), %xmm8 addq $ 8 * SIZE, AO # aoffset += 4 addq $ 8 * SIZE, BO # boffset1 += 8 decq %rax jg .L56 ALIGN_4 .L58: shufps $0xb1, %xmm1, %xmm1 shufps $0xb1, %xmm5, %xmm5 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm15, %xmm1 xorps %xmm15, %xmm5 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm4 #endif #else xorps %xmm15, %xmm1 xorps %xmm15, %xmm5 #endif addps %xmm1, %xmm0 addps %xmm5, %xmm4 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm1 unpcklpd %xmm2, %xmm0 unpckhpd %xmm2, %xmm1 movaps %xmm4, %xmm5 unpcklpd %xmm6, %xmm4 unpckhpd %xmm6, %xmm5 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 0 * SIZE(B), %xmm2 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 2 * SIZE(B), %xmm3 #ifdef movsd xorps %xmm6, %xmm6 #endif movsd 4 * SIZE(B), %xmm6 #ifdef movsd xorps %xmm7, %xmm7 #endif movsd 6 * SIZE(B), %xmm7 subps %xmm0, %xmm2 subps %xmm1, %xmm3 subps %xmm4, %xmm6 subps %xmm5, %xmm7 #else movaps 0 * SIZE(AO), %xmm1 movaps 4 * SIZE(AO), %xmm3 subps %xmm0, 
%xmm1 subps %xmm4, %xmm3 #endif #ifdef LN movaps 28 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm7 addps %xmm0, %xmm7 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm6 subps %xmm1, %xmm6 movaps 24 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 20 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm6 #ifndef CONJ xorps %xmm15, %xmm6 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm6 addps %xmm0, %xmm6 movaps 16 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 8 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 movaps 4 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm6 subps %xmm1, %xmm6 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd 
$0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm7 subps %xmm1, %xmm7 movaps 8 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 movaps 12 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm6 subps %xmm1, %xmm6 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm7 subps %xmm1, %xmm7 movaps 20 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm6 #ifndef CONJ xorps %xmm15, %xmm6 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm6 addps %xmm0, %xmm6 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm7 subps %xmm1, %xmm7 movaps 28 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm7 addps %xmm0, %xmm7 #endif #if defined(RN) || defined(RT) movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm4 pshufd $0xf5, %xmm1, %xmm1 pshufd $0xa0, %xmm3, %xmm6 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm4 xorps %xmm15, %xmm6 #endif mulps %xmm9, %xmm4 mulps %xmm9, %xmm6 mulps %xmm10, %xmm1 mulps %xmm10, %xmm3 addps %xmm4, %xmm1 addps %xmm6, %xmm3 #endif #ifdef LN subq $8 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) movlps %xmm3, 2 * SIZE(B) movlps %xmm6, 4 * SIZE(B) movlps %xmm7, 6 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 8 * SIZE(BO) movaps %xmm1, 12 * SIZE(BO) pshufd $0x00, %xmm6, %xmm0 pshufd $0x55, %xmm6, %xmm1 movaps %xmm0, 16 * SIZE(BO) movaps %xmm1, 20 * SIZE(BO) pshufd $0x00, %xmm7, %xmm0 pshufd $0x55, %xmm7, %xmm1 movaps %xmm0, 24 * SIZE(BO) movaps %xmm1, 28 * SIZE(BO) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) movlps %xmm6, 4 * SIZE(CO1) movlps %xmm7, 6 * SIZE(CO1) #else movaps %xmm1, 0 * SIZE(AO) movaps %xmm3, 4 * SIZE(AO) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm3, 4 * SIZE(CO1) movhps %xmm3, 6 * SIZE(CO1) #endif #ifndef LN addq $8 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L51 ALIGN_4 .L60: testq $2, M je .L70 #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG 
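/* Two remaining complex rows (M & 2): under LN, AORIG has just stepped back
   over K * 2 complex elements so this row strip is solved from the bottom up. */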
#endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L65 ALIGN_4 .L62: mulps %xmm8, %xmm9 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 8 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 20 * SIZE(BO), %xmm8 addps %xmm11, %xmm0 movaps 24 * SIZE(BO), %xmm11 addps %xmm8, %xmm1 movaps 12 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps 32 * SIZE(AO), %xmm8 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm13 mulps 36 * SIZE(BO), %xmm10 addps %xmm13, %xmm0 movaps 40 * SIZE(BO), %xmm13 addps %xmm10, %xmm1 movaps 20 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 mulps 44 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps 24 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 52 * SIZE(BO), %xmm10 addps %xmm15, %xmm0 movaps 56 * SIZE(BO), %xmm15 addps %xmm10, %xmm1 movaps 28 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 mulps 60 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 48 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L62 ALIGN_4 .L65: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movaps POSINV, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L68 ALIGN_4 .L66: mulps %xmm8, %xmm9 mulps 4 * SIZE(BO), %xmm8 addps %xmm9, %xmm0 movaps 8 * SIZE(BO), %xmm9 addps %xmm8, %xmm1 movaps 4 * SIZE(AO), %xmm8 addq $4 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L66 ALIGN_4 .L68: addps %xmm2, %xmm0 addps %xmm3, %xmm1 shufps $0xb1, %xmm1, %xmm1 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif #else xorps %xmm15, %xmm1 #endif addps %xmm1, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm1 unpcklpd %xmm2, %xmm0 unpckhpd %xmm2, %xmm1 #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 0 * SIZE(B), %xmm2 #ifdef movsd xorps %xmm3, %xmm3 #endif movsd 2 * SIZE(B), %xmm3 subps %xmm0, %xmm2 subps %xmm1, %xmm3 #else movaps 0 * SIZE(AO), %xmm1 subps %xmm0, %xmm1 #endif #ifdef LN movaps 4 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, 
%xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 movaps 4 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 #endif #if defined(RN) || defined(RT) movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm4 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm4 mulps %xmm10, %xmm1 addps %xmm4, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) movlps %xmm3, 2 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 movaps %xmm0, 8 * SIZE(BO) movaps %xmm1, 12 * SIZE(BO) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) #else movaps %xmm1, 0 * SIZE(AO) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) #endif #ifndef LN addq $4 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L70: testq $1, M je .L79 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movsd 0 * SIZE(AO), %xmm8 movhps 2 * SIZE(AO), %xmm8 movsd 8 * SIZE(AO), %xmm10 movhps 10 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L75 ALIGN_4 .L72: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 4 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * 
SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd 6 * SIZE(AO), %xmm8 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movaps 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd 16 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movaps 80 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd 10 * SIZE(AO), %xmm10 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movaps 44 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd 12 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movaps 96 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd 14 * SIZE(AO), %xmm10 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movaps 60 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd 24 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movaps 112 * SIZE(BO), %xmm15 addq $16 * SIZE, AO addq $64 * SIZE, BO decq %rax jne .L72 ALIGN_4 .L75: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movaps POSINV, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L78 ALIGN_4 .L76: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 addq $2 * SIZE, AO # aoffset += 4 addq $8 * SIZE, BO # boffset1 += 8 decq %rax jg .L76 ALIGN_4 .L78: addps %xmm2, %xmm0 addps %xmm3, %xmm1 shufps $0xb1, %xmm1, %xmm1 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif #else xorps %xmm15, %xmm1 #endif addps %xmm1, %xmm0 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $1, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO #endif #if defined(LN) || defined(LT) #ifdef movsd xorps %xmm2, %xmm2 #endif movsd 0 * SIZE(B), %xmm2 subps %xmm0, %xmm2 #else #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(AO), %xmm1 subps %xmm0, %xmm1 #endif #if defined(LN) || defined(LT) movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 #endif #if defined(RN) || defined(RT) movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm4 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm4 mulps %xmm10, %xmm1 addps %xmm4, %xmm1 #endif #ifdef LN subq $2 * SIZE, CO1 #endif #if defined(LN) || defined(LT) movlps %xmm2, 0 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movlps %xmm2, 0 * SIZE(CO1) #else movlps %xmm1, 0 * SIZE(AO) movlps %xmm1, 0 * SIZE(CO1) #endif #ifndef LN addq $2 * SIZE, CO1 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #ifdef LT addq $2 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L79: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, COMPSIZE), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, COMPSIZE), B #endif #ifdef RN 
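/* A single column of B has been fully solved: RN advances the diagonal offset
   KK, RT (just below) retreats it, before the two-columns-at-a-time loop at .L40. */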
addq $1, KK #endif #ifdef RT subq $1, KK #endif ALIGN_4 .L40: movq N, J sarq $1, J # j = (n >> 2) jle .L999 ALIGN_4 .L01: #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK #endif leaq BUFFER, BO #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B #endif #if defined(LN) || defined(RT) movq KK, %rax movq B, BORIG salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B leaq (BO, %rax, 8), BO #endif #if defined(LT) movq OFFSET, %rax movq %rax, KK #endif #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $2, %rax jle .L03 ALIGN_4 .L02: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm4, 16 * SIZE(BO) movaps %xmm5, 20 * SIZE(BO) movaps %xmm6, 24 * SIZE(BO) movaps %xmm7, 28 * SIZE(BO) movaps 8 * SIZE(B), %xmm3 movaps 12 * SIZE(B), %xmm7 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 32 * SIZE(BO) movaps %xmm1, 36 * SIZE(BO) movaps %xmm2, 40 * SIZE(BO) movaps %xmm3, 44 * SIZE(BO) pshufd $0x00, %xmm7, %xmm4 pshufd $0x55, %xmm7, %xmm5 pshufd $0xaa, %xmm7, %xmm6 pshufd $0xff, %xmm7, %xmm7 movaps %xmm4, 48 * SIZE(BO) movaps %xmm5, 52 * SIZE(BO) movaps %xmm6, 56 * SIZE(BO) movaps %xmm7, 60 * SIZE(BO) addq $16 * SIZE, B addq $64 * SIZE, BO decq %rax jne .L02 ALIGN_4 .L03: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $3, %rax BRANCH jle .L10 ALIGN_4 .L04: movaps 0 * SIZE(B), %xmm3 pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm2 pshufd $0xff, %xmm3, %xmm3 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm2, 8 * SIZE(BO) movaps %xmm3, 12 * SIZE(BO) addq $ 4 * SIZE, B addq $16 * SIZE, BO decq %rax jne .L04 ALIGN_4 .L10: #if defined(LT) || defined(RN) movq A, AO #else movq A, AORIG #endif #ifdef RT leaq (, LDC, 2), %rax subq %rax, C #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 #ifndef RT leaq (C, LDC, 2), C #endif movq M, I sarq $2, I # i = (m >> 2) jle .L20 ALIGN_4 .L11: #ifdef LN movq K, %rax salq $2 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(BO), %xmm9 movaps 4 * SIZE(BO), %xmm11 movaps 8 * SIZE(BO), %xmm13 movaps 16 * SIZE(BO), %xmm15 movaps 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 movaps 4 * SIZE(AO), %xmm10 pxor %xmm1, %xmm1 movaps 8 * SIZE(AO), %xmm12 pxor %xmm2, %xmm2 movaps 12 * SIZE(AO), %xmm14 pxor %xmm3, %xmm3 PREFETCHW 7 * SIZE(CO1) pxor %xmm4, %xmm4 PREFETCHW 7 * SIZE(CO2) pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif andq $-8, %rax salq $4, %rax je .L15 .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) KERNEL4(32 * 0) KERNEL5(32 * 0) KERNEL6(32 * 0) KERNEL7(32 * 0) KERNEL8(32 * 0) KERNEL1(32 * 1) KERNEL2(32 * 1) KERNEL3(32 * 1) KERNEL4(32 * 1) KERNEL5(32 * 1) KERNEL6(32 * 1) KERNEL7(32 * 1) KERNEL8(32 * 1) addq $32 * 2 * SIZE, AO addq $64 * 2 * SIZE, BO subq $64 * 2, %rax jg .L1X .L12: leaq (AO, %rax, 
2), AO leaq (BO, %rax, 4), BO ALIGN_4 .L15: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movaps POSINV, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L18 ALIGN_4 .L16: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 0 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 8 * SIZE(AO), %xmm8 mulps %xmm10, %xmm9 addps %xmm9, %xmm4 movaps 4 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm5 movaps 8 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 mulps 12 * SIZE(BO), %xmm10 addps %xmm9, %xmm6 movaps 16 * SIZE(BO), %xmm9 addps %xmm10, %xmm7 movaps 12 * SIZE(AO), %xmm10 addq $ 8 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L16 ALIGN_4 .L18: shufps $0xb1, %xmm1, %xmm1 shufps $0xb1, %xmm3, %xmm3 shufps $0xb1, %xmm5, %xmm5 shufps $0xb1, %xmm7, %xmm7 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 xorps %xmm15, %xmm5 xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm2 xorps %xmm15, %xmm4 xorps %xmm15, %xmm6 #endif #else xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 xorps %xmm15, %xmm5 xorps %xmm15, %xmm7 #endif addps %xmm1, %xmm0 addps %xmm3, %xmm2 addps %xmm5, %xmm4 addps %xmm7, %xmm6 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $4, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 2), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm1 unpcklpd %xmm2, %xmm0 unpckhpd %xmm2, %xmm1 movaps %xmm4, %xmm5 unpcklpd %xmm6, %xmm4 unpckhpd %xmm6, %xmm5 movaps 0 * SIZE(B), %xmm2 movaps 4 * SIZE(B), %xmm3 movaps 8 * SIZE(B), %xmm6 movaps 12 * SIZE(B), %xmm7 subps %xmm0, %xmm2 subps %xmm1, %xmm3 subps %xmm4, %xmm6 subps %xmm5, %xmm7 #else movaps 0 * SIZE(AO), %xmm1 movaps 4 * SIZE(AO), %xmm3 movaps 8 * SIZE(AO), %xmm5 movaps 12 * SIZE(AO), %xmm7 subps %xmm0, %xmm1 subps %xmm4, %xmm3 subps %xmm2, %xmm5 subps %xmm6, %xmm7 #endif #ifdef LN movaps 28 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm7 addps %xmm0, %xmm7 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm6 subps %xmm1, %xmm6 movaps 24 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 20 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm6 #ifndef CONJ xorps %xmm15, %xmm6 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm6 addps %xmm0, %xmm6 movaps 16 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 
pshufd $0xf5, %xmm6, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 8 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 movaps 4 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm6 subps %xmm1, %xmm6 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm7 subps %xmm1, %xmm7 movaps 8 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 movaps 12 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm6 subps %xmm1, %xmm6 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm7 subps %xmm1, %xmm7 movaps 20 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm6 #ifndef CONJ xorps %xmm15, %xmm6 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm6 addps %xmm0, %xmm6 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm6, %xmm0 pshufd $0xf5, %xmm6, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm7 subps %xmm1, 
%xmm7 movaps 28 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm7, %xmm0 pshufd $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm7 addps %xmm0, %xmm7 #endif #ifdef RN movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm0 pshufd $0xf5, %xmm1, %xmm1 pshufd $0xa0, %xmm3, %xmm2 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm2 #endif mulps %xmm9, %xmm0 mulps %xmm9, %xmm2 mulps %xmm10, %xmm1 mulps %xmm10, %xmm3 addps %xmm0, %xmm1 addps %xmm2, %xmm3 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm0 pshufd $0xf5, %xmm1, %xmm2 pshufd $0xa0, %xmm3, %xmm4 pshufd $0xf5, %xmm3, %xmm6 #ifndef CONJ xorps %xmm15, %xmm2 xorps %xmm15, %xmm6 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm0 mulps %xmm9, %xmm4 mulps %xmm10, %xmm2 mulps %xmm10, %xmm6 subps %xmm0, %xmm5 subps %xmm4, %xmm7 subps %xmm2, %xmm5 subps %xmm6, %xmm7 movaps 4 * SIZE(B), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm4 pshufd $0xf5, %xmm5, %xmm5 pshufd $0xa0, %xmm7, %xmm6 pshufd $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps %xmm15, %xmm5 xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm4 xorps %xmm15, %xmm6 #endif mulps %xmm9, %xmm4 mulps %xmm9, %xmm6 mulps %xmm10, %xmm5 mulps %xmm10, %xmm7 addps %xmm4, %xmm5 addps %xmm6, %xmm7 #endif #ifdef RT movaps 4 * SIZE(B), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm0 pshufd $0xf5, %xmm5, %xmm5 pshufd $0xa0, %xmm7, %xmm2 pshufd $0xf5, %xmm7, %xmm7 #ifndef CONJ xorps %xmm15, %xmm5 xorps %xmm15, %xmm7 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm2 #endif mulps %xmm9, %xmm0 mulps %xmm9, %xmm2 mulps %xmm10, %xmm5 mulps %xmm10, %xmm7 addps %xmm0, %xmm5 addps %xmm2, %xmm7 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm0 pshufd $0xf5, %xmm5, %xmm2 pshufd $0xa0, %xmm7, %xmm4 pshufd $0xf5, %xmm7, %xmm6 #ifndef CONJ xorps %xmm15, %xmm2 xorps %xmm15, %xmm6 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm0 mulps %xmm9, %xmm4 mulps %xmm10, %xmm2 mulps %xmm10, %xmm6 subps %xmm0, %xmm1 subps %xmm4, %xmm3 subps %xmm2, %xmm1 subps %xmm6, %xmm3 movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm4 pshufd $0xf5, %xmm1, %xmm1 pshufd $0xa0, %xmm3, %xmm6 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm4 xorps %xmm15, %xmm6 #endif mulps %xmm9, %xmm4 mulps %xmm9, %xmm6 mulps %xmm10, %xmm1 mulps %xmm10, %xmm3 addps %xmm4, %xmm1 addps %xmm6, %xmm3 #endif #ifdef LN subq $8 * SIZE, CO1 subq $8 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm2, 0 * SIZE(B) movaps %xmm3, 4 * SIZE(B) movaps %xmm6, 8 * SIZE(B) movaps %xmm7, 12 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm5 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm4, 8 * SIZE(BO) movaps %xmm5, 12 * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm4 pshufd $0xff, %xmm3, %xmm5 movaps %xmm0, 16 * SIZE(BO) movaps %xmm1, 20 * SIZE(BO) movaps %xmm4, 24 * SIZE(BO) movaps %xmm5, 28 * SIZE(BO) pshufd $0x00, %xmm6, %xmm0 pshufd $0x55, %xmm6, %xmm1 pshufd $0xaa, %xmm6, %xmm4 pshufd $0xff, %xmm6, %xmm5 movaps %xmm0, 32 * 
SIZE(BO) movaps %xmm1, 36 * SIZE(BO) movaps %xmm4, 40 * SIZE(BO) movaps %xmm5, 44 * SIZE(BO) pshufd $0x00, %xmm7, %xmm0 pshufd $0x55, %xmm7, %xmm1 pshufd $0xaa, %xmm7, %xmm4 pshufd $0xff, %xmm7, %xmm5 movaps %xmm0, 48 * SIZE(BO) movaps %xmm1, 52 * SIZE(BO) movaps %xmm4, 56 * SIZE(BO) movaps %xmm5, 60 * SIZE(BO) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) movlps %xmm6, 4 * SIZE(CO1) movlps %xmm7, 6 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO2) movhps %xmm3, 2 * SIZE(CO2) movhps %xmm6, 4 * SIZE(CO2) movhps %xmm7, 6 * SIZE(CO2) #else movaps %xmm1, 0 * SIZE(AO) movaps %xmm3, 4 * SIZE(AO) movaps %xmm5, 8 * SIZE(AO) movaps %xmm7, 12 * SIZE(AO) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm3, 4 * SIZE(CO1) movhps %xmm3, 6 * SIZE(CO1) movlps %xmm5, 0 * SIZE(CO2) movhps %xmm5, 2 * SIZE(CO2) movlps %xmm7, 4 * SIZE(CO2) movhps %xmm7, 6 * SIZE(CO2) #endif #ifndef LN addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO #ifdef LT addq $16 * SIZE, B #endif #endif #ifdef LN subq $4, KK movq BORIG, B #endif #ifdef LT addq $4, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $2 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif decq I # i -- jg .L11 ALIGN_4 .L20: testq $2, M je .L30 #ifdef LN movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 movaps 32 * SIZE(AO), %xmm12 movaps 48 * SIZE(AO), %xmm14 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L25 ALIGN_4 .L22: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 64 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 4 * SIZE(AO), %xmm8 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 mulps 28 * SIZE(BO), %xmm8 addps %xmm11, %xmm2 movaps 80 * SIZE(BO), %xmm11 addps %xmm8, %xmm3 movaps 8 * SIZE(AO), %xmm8 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 mulps 44 * SIZE(BO), %xmm8 addps %xmm13, %xmm2 movaps 96 * SIZE(BO), %xmm13 addps %xmm8, %xmm3 movaps 12 * SIZE(AO), %xmm8 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 mulps 60 * SIZE(BO), %xmm8 addps %xmm15, %xmm2 movaps 112 * SIZE(BO), %xmm15 addps %xmm8, %xmm3 movaps 32 * SIZE(AO), %xmm8 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movaps 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movaps 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 mulps 76 * SIZE(BO), %xmm10 addps %xmm9, %xmm2 movaps 128 * 
SIZE(BO), %xmm9 addps %xmm10, %xmm3 movaps 20 * SIZE(AO), %xmm10 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movaps 84 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movaps 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 mulps 92 * SIZE(BO), %xmm10 addps %xmm11, %xmm2 movaps 144 * SIZE(BO), %xmm11 addps %xmm10, %xmm3 movaps 24 * SIZE(AO), %xmm10 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movaps 104 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 mulps 108 * SIZE(BO), %xmm10 addps %xmm13, %xmm2 movaps 160 * SIZE(BO), %xmm13 addps %xmm10, %xmm3 movaps 28 * SIZE(AO), %xmm10 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movaps 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 mulps 124 * SIZE(BO), %xmm10 addps %xmm15, %xmm2 movaps 176 * SIZE(BO), %xmm15 addps %xmm10, %xmm3 movaps 48 * SIZE(AO), %xmm10 addq $32 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L22 ALIGN_4 .L25: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movaps POSINV, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L28 ALIGN_4 .L26: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 mulps 12 * SIZE(BO), %xmm8 addps %xmm9, %xmm2 movaps 16 * SIZE(BO), %xmm9 addps %xmm8, %xmm3 movaps 4 * SIZE(AO), %xmm8 addq $ 4 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L26 ALIGN_4 .L28: shufps $0xb1, %xmm1, %xmm1 shufps $0xb1, %xmm3, %xmm3 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm2 #endif #else xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #endif addps %xmm1, %xmm0 addps %xmm3, %xmm2 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $2, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) movaps %xmm0, %xmm1 unpcklpd %xmm2, %xmm0 unpckhpd %xmm2, %xmm1 movaps 0 * SIZE(B), %xmm2 movaps 4 * SIZE(B), %xmm3 subps %xmm0, %xmm2 subps %xmm1, %xmm3 #else movaps 0 * SIZE(AO), %xmm1 movaps 4 * SIZE(AO), %xmm5 subps %xmm0, %xmm1 subps %xmm2, %xmm5 #endif #ifdef LN movaps 4 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm2 subps %xmm1, %xmm2 movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm1 #ifndef CONJ xorps %xmm15, 
%xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 subps %xmm0, %xmm3 subps %xmm1, %xmm3 movaps 4 * SIZE(AO), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm3, %xmm0 pshufd $0xf5, %xmm3, %xmm3 #ifndef CONJ xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm3 addps %xmm0, %xmm3 #endif #ifdef RN movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm0 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 addps %xmm0, %xmm1 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm0 pshufd $0xf5, %xmm1, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 subps %xmm0, %xmm5 subps %xmm2, %xmm5 movaps 4 * SIZE(B), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm4 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm15, %xmm5 #else xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm4 mulps %xmm10, %xmm5 addps %xmm4, %xmm5 #endif #ifdef RT movaps 4 * SIZE(B), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm0 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm15, %xmm5 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm5 addps %xmm0, %xmm5 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm0 pshufd $0xf5, %xmm5, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 subps %xmm0, %xmm1 subps %xmm2, %xmm1 movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm4 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm4 mulps %xmm10, %xmm1 addps %xmm4, %xmm1 #endif #ifdef LN subq $4 * SIZE, CO1 subq $4 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm2, 0 * SIZE(B) movaps %xmm3, 4 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm5 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm4, 8 * SIZE(BO) movaps %xmm5, 12 * SIZE(BO) pshufd $0x00, %xmm3, %xmm0 pshufd $0x55, %xmm3, %xmm1 pshufd $0xaa, %xmm3, %xmm4 pshufd $0xff, %xmm3, %xmm5 movaps %xmm0, 16 * SIZE(BO) movaps %xmm1, 20 * SIZE(BO) movaps %xmm4, 24 * SIZE(BO) movaps %xmm5, 28 * SIZE(BO) movlps %xmm2, 0 * SIZE(CO1) movlps %xmm3, 2 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO2) movhps %xmm3, 2 * SIZE(CO2) #else movaps %xmm1, 0 * SIZE(AO) movaps %xmm5, 4 * SIZE(AO) movlps %xmm1, 0 * SIZE(CO1) movhps %xmm1, 2 * SIZE(CO1) movlps %xmm5, 0 * SIZE(CO2) movhps %xmm5, 2 * SIZE(CO2) #endif #ifndef LN addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO #ifdef LT addq $8 * SIZE, B #endif #endif #ifdef LN subq $2, KK movq BORIG, B #endif #ifdef LT addq $2, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L30: testq $1, M je .L39 #ifdef LN movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, AORIG #endif #if defined(LN) || defined(RT) movq KK, %rax movq AORIG, AO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #endif leaq BUFFER, BO #if defined(LN) || defined(RT) movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO #endif movsd 0 * SIZE(AO), %xmm8 
movhps 2 * SIZE(AO), %xmm8 movsd 8 * SIZE(AO), %xmm10 movhps 10 * SIZE(AO), %xmm10 movaps 0 * SIZE(BO), %xmm9 movaps 16 * SIZE(BO), %xmm11 movaps 32 * SIZE(BO), %xmm13 movaps 48 * SIZE(BO), %xmm15 pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif sarq $3, %rax je .L35 ALIGN_4 .L32: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 #if defined(OPTERON) && defined(HAVE_PREFETCH) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 64 * SIZE(BO), %xmm9 mulps %xmm8, %xmm11 addps %xmm11, %xmm0 movaps 20 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm1 movaps 24 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 addps %xmm11, %xmm2 movaps 28 * SIZE(BO), %xmm11 mulps %xmm8, %xmm11 movsd 4 * SIZE(AO), %xmm8 addps %xmm11, %xmm3 movaps 80 * SIZE(BO), %xmm11 mulps %xmm8, %xmm13 addps %xmm13, %xmm0 movaps 36 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm1 movaps 40 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 addps %xmm13, %xmm2 movaps 44 * SIZE(BO), %xmm13 mulps %xmm8, %xmm13 movsd 6 * SIZE(AO), %xmm8 addps %xmm13, %xmm3 movaps 96 * SIZE(BO), %xmm13 mulps %xmm8, %xmm15 addps %xmm15, %xmm0 movaps 52 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm1 movaps 56 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 addps %xmm15, %xmm2 movaps 60 * SIZE(BO), %xmm15 mulps %xmm8, %xmm15 movsd 16 * SIZE(AO), %xmm8 addps %xmm15, %xmm3 movaps 112 * SIZE(BO), %xmm15 mulps %xmm10, %xmm9 addps %xmm9, %xmm0 movaps 68 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm1 movaps 72 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 addps %xmm9, %xmm2 movaps 76 * SIZE(BO), %xmm9 mulps %xmm10, %xmm9 movsd 10 * SIZE(AO), %xmm10 addps %xmm9, %xmm3 movaps 128 * SIZE(BO), %xmm9 mulps %xmm10, %xmm11 addps %xmm11, %xmm0 movaps 84 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm1 movaps 88 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 addps %xmm11, %xmm2 movaps 92 * SIZE(BO), %xmm11 mulps %xmm10, %xmm11 movsd 12 * SIZE(AO), %xmm10 addps %xmm11, %xmm3 movaps 144 * SIZE(BO), %xmm11 mulps %xmm10, %xmm13 addps %xmm13, %xmm0 movaps 100 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm1 movaps 104 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 addps %xmm13, %xmm2 movaps 108 * SIZE(BO), %xmm13 mulps %xmm10, %xmm13 movsd 14 * SIZE(AO), %xmm10 addps %xmm13, %xmm3 movaps 160 * SIZE(BO), %xmm13 mulps %xmm10, %xmm15 addps %xmm15, %xmm0 movaps 116 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm1 movaps 120 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 addps %xmm15, %xmm2 movaps 124 * SIZE(BO), %xmm15 mulps %xmm10, %xmm15 movsd 24 * SIZE(AO), %xmm10 addps %xmm15, %xmm3 movaps 176 * SIZE(BO), %xmm15 addq $16 * SIZE, AO addq $128 * SIZE, BO decq %rax jne .L32 ALIGN_4 .L35: #if defined(LT) || defined(RN) movq KK, %rax #else movq K, %rax subq KK, %rax #endif movaps POSINV, %xmm15 andq $7, %rax # if (k & 1) BRANCH je .L38 ALIGN_4 .L36: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 movaps 4 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm1 movaps 8 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 addps %xmm9, %xmm2 movaps 12 * SIZE(BO), %xmm9 mulps %xmm8, %xmm9 movsd 2 * SIZE(AO), %xmm8 addps %xmm9, %xmm3 movaps 16 * SIZE(BO), %xmm9 addq $ 2 * SIZE, AO # aoffset += 4 addq $16 * SIZE, BO # boffset1 += 8 decq %rax jg .L36 ALIGN_4 .L38: shufps $0xb1, 
%xmm1, %xmm1 shufps $0xb1, %xmm3, %xmm3 #if defined(LN) || defined(LT) #ifndef CONJ xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #else xorps %xmm15, %xmm0 xorps %xmm15, %xmm2 #endif #else xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #endif addps %xmm1, %xmm0 addps %xmm3, %xmm2 #if defined(LN) || defined(RT) movq KK, %rax #ifdef LN subq $1, %rax #else subq $2, %rax #endif movq AORIG, AO movq BORIG, B leaq BUFFER, BO salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), B leaq (BO, %rax, 8), BO #endif #if defined(LN) || defined(LT) unpcklpd %xmm2, %xmm0 movaps 0 * SIZE(B), %xmm2 subps %xmm0, %xmm2 #else #ifdef movsd xorps %xmm1, %xmm1 #endif movsd 0 * SIZE(AO), %xmm1 #ifdef movsd xorps %xmm5, %xmm5 #endif movsd 2 * SIZE(AO), %xmm5 subps %xmm0, %xmm1 subps %xmm2, %xmm5 #endif #ifdef LN movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 #endif #ifdef LT movaps 0 * SIZE(AO), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm2, %xmm0 pshufd $0xf5, %xmm2, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 addps %xmm0, %xmm2 #endif #ifdef RN movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm0 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm1 addps %xmm0, %xmm1 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm0 pshufd $0xf5, %xmm1, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 subps %xmm0, %xmm5 subps %xmm2, %xmm5 movaps 4 * SIZE(B), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm4 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm15, %xmm5 #else xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm4 mulps %xmm10, %xmm5 addps %xmm4, %xmm5 #endif #ifdef RT movaps 4 * SIZE(B), %xmm8 pshufd $0xee, %xmm8, %xmm9 pshufd $0xbb, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm0 pshufd $0xf5, %xmm5, %xmm5 #ifndef CONJ xorps %xmm15, %xmm5 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm5 addps %xmm0, %xmm5 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm5, %xmm0 pshufd $0xf5, %xmm5, %xmm2 #ifndef CONJ xorps %xmm15, %xmm2 #else xorps %xmm15, %xmm0 #endif mulps %xmm9, %xmm0 mulps %xmm10, %xmm2 subps %xmm0, %xmm1 subps %xmm2, %xmm1 movaps 0 * SIZE(B), %xmm8 pshufd $0x44, %xmm8, %xmm9 pshufd $0x11, %xmm8, %xmm10 pshufd $0xa0, %xmm1, %xmm4 pshufd $0xf5, %xmm1, %xmm1 #ifndef CONJ xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm4 #endif mulps %xmm9, %xmm4 mulps %xmm10, %xmm1 addps %xmm4, %xmm1 #endif #ifdef LN subq $2 * SIZE, CO1 subq $2 * SIZE, CO2 #endif #if defined(LN) || defined(LT) movaps %xmm2, 0 * SIZE(B) pshufd $0x00, %xmm2, %xmm0 pshufd $0x55, %xmm2, %xmm1 pshufd $0xaa, %xmm2, %xmm4 pshufd $0xff, %xmm2, %xmm5 movaps %xmm0, 0 * SIZE(BO) movaps %xmm1, 4 * SIZE(BO) movaps %xmm4, 8 * SIZE(BO) movaps %xmm5, 12 * SIZE(BO) movlps %xmm2, 0 * SIZE(CO1) movhps %xmm2, 0 * SIZE(CO2) #else movlps %xmm1, 0 * SIZE(AO) movlps %xmm5, 2 * SIZE(AO) movlps %xmm1, 0 * SIZE(CO1) movlps %xmm5, 0 * SIZE(CO2) #endif #ifndef LN addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax salq 
$ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO #ifdef LT addq $4 * SIZE, B #endif #endif #ifdef LN subq $1, KK movq BORIG, B #endif #ifdef LT addq $1, KK #endif #ifdef RT movq K, %rax movq BORIG, B salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif ALIGN_4 .L39: #ifdef LN leaq (, K, SIZE), %rax leaq (B, %rax, 2 * COMPSIZE), B #endif #if defined(LT) || defined(RN) movq K, %rax subq KK, %rax leaq (,%rax, SIZE), %rax leaq (B, %rax, 2 * COMPSIZE), B #endif #ifdef RN addq $2, KK #endif #ifdef RT subq $2, KK #endif decq J # j -- jg .L01 ALIGN_4 .L999: movq %rbx, %rsp movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE OpenBLAS-0.2.20/kernel/x86_64/ztrsm_kernel_RT_bulldozer.c000066400000000000000000000330511313527062700227360ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include "common.h" static FLOAT dm1 = -1.; #ifdef CONJ #define GEMM_KERNEL GEMM_KERNEL_R #else #define GEMM_KERNEL GEMM_KERNEL_N #endif #if GEMM_DEFAULT_UNROLL_M == 1 #define GEMM_UNROLL_M_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_M == 2 #define GEMM_UNROLL_M_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_M == 4 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 6 #define GEMM_UNROLL_M_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_M == 16 #define GEMM_UNROLL_M_SHIFT 4 #endif #if GEMM_DEFAULT_UNROLL_N == 1 #define GEMM_UNROLL_N_SHIFT 0 #endif #if GEMM_DEFAULT_UNROLL_N == 2 #define GEMM_UNROLL_N_SHIFT 1 #endif #if GEMM_DEFAULT_UNROLL_N == 4 #define GEMM_UNROLL_N_SHIFT 2 #endif #if GEMM_DEFAULT_UNROLL_N == 8 #define GEMM_UNROLL_N_SHIFT 3 #endif #if GEMM_DEFAULT_UNROLL_N == 16 #define GEMM_UNROLL_N_SHIFT 4 #endif #ifndef CONJ static void ztrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); static void ztrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) { FLOAT *c1 = c + ldc*2 ; BLASLONG n1 = n * 4; BLASLONG i=0; __asm__ __volatile__ ( " vzeroupper \n\t" " prefetcht0 (%4) \n\t" " prefetcht0 (%5) \n\t" " vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" " vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" " vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" " vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" " vxorpd %%xmm12, %%xmm12, %%xmm12 \n\t" " vxorpd %%xmm13, %%xmm13, %%xmm13 \n\t" " vxorpd %%xmm14, %%xmm14, %%xmm14 \n\t" " vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t" " cmpq $0, %0 \n\t" " je 3f \n\t" " .align 16 \n\t" "1: \n\t" " prefetcht0 256(%3,%1,8) \n\t" " prefetcht0 256(%2,%1,8) \n\t" " vmovddup (%3,%1,8), %%xmm0 \n\t" // b0 real, b0 real " vmovddup 8(%3,%1,8), %%xmm1 \n\t" // b0 imag, b0 imag " vmovups (%2,%1,8), %%xmm4 \n\t" // a0 real , a0 imag " vmovups 16(%2,%1,8), %%xmm5 \n\t" // a1 real , a1 imag " vmovddup 16(%3,%1,8), %%xmm2 \n\t" // b1 real, b1 real " vmovddup 24(%3,%1,8), %%xmm3 \n\t" // b1 imag, b1 imag " vfnmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm4 , %%xmm9 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm10, %%xmm0 , %%xmm5 , %%xmm10 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm11, %%xmm1 , %%xmm5 , %%xmm11 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm12, %%xmm2 , %%xmm4 , %%xmm12 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm13, %%xmm3 , %%xmm4 , %%xmm13 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm14, %%xmm2 , %%xmm5 , %%xmm14 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm15, %%xmm3 , %%xmm5 , %%xmm15 \n\t" // a_real * b_imag , a_imag * b_imag " addq $4, %1 \n\t" " cmpq %1, %0 \n\t" " jz 2f \n\t" " vmovddup (%3,%1,8), %%xmm0 \n\t" // b0 real, b0 real " vmovddup 8(%3,%1,8), %%xmm1 \n\t" // b0 imag, b0 imag " vmovups (%2,%1,8), %%xmm4 \n\t" // a0 real , a0 imag " vmovups 16(%2,%1,8), %%xmm5 \n\t" // a1 real , a1 imag " vmovddup 16(%3,%1,8), %%xmm2 \n\t" // b1 real, b1 real " vmovddup 24(%3,%1,8), %%xmm3 \n\t" // b1 imag, b1 imag " vfnmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm4 , %%xmm9 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm10, %%xmm0 , %%xmm5 , %%xmm10 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd 
%%xmm11, %%xmm1 , %%xmm5 , %%xmm11 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm12, %%xmm2 , %%xmm4 , %%xmm12 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm13, %%xmm3 , %%xmm4 , %%xmm13 \n\t" // a_real * b_imag , a_imag * b_imag " vfnmaddpd %%xmm14, %%xmm2 , %%xmm5 , %%xmm14 \n\t" // a_real * b_real , a_imag * b_real " vfnmaddpd %%xmm15, %%xmm3 , %%xmm5 , %%xmm15 \n\t" // a_real * b_imag , a_imag * b_imag " addq $4, %1 \n\t" " cmpq %1, %0 \n\t" " jnz 1b \n\t" "2: \n\t" " vshufpd $0x01 , %%xmm9 , %%xmm9, %%xmm9 \n\t" " vshufpd $0x01 , %%xmm11 , %%xmm11 , %%xmm11 \n\t" " vshufpd $0x01 , %%xmm13 , %%xmm13 , %%xmm13 \n\t" " vshufpd $0x01 , %%xmm15 , %%xmm15 , %%xmm15 \n\t" " vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" " vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" " vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t" " vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t" " vxorpd %%xmm7 , %%xmm7 , %%xmm7 \n\t" " vaddsubpd %%xmm8 , %%xmm7 , %%xmm8 \n\t" " vaddsubpd %%xmm10, %%xmm7 , %%xmm10 \n\t" " vaddsubpd %%xmm12, %%xmm7 , %%xmm12 \n\t" " vaddsubpd %%xmm14, %%xmm7 , %%xmm14 \n\t" " vmovups (%4) , %%xmm0 \n\t" " vmovups 16(%4) , %%xmm1 \n\t" " vmovups (%5) , %%xmm4 \n\t" " vmovups 16(%5) , %%xmm5 \n\t" " vaddpd %%xmm0 , %%xmm8 , %%xmm8 \n\t" " vaddpd %%xmm1 , %%xmm10, %%xmm10 \n\t" " vaddpd %%xmm4 , %%xmm12, %%xmm12 \n\t" " vaddpd %%xmm5 , %%xmm14, %%xmm14 \n\t" " vmovups %%xmm8 , (%4) \n\t" " vmovups %%xmm10 ,16(%4) \n\t" " vmovups %%xmm12 , (%5) \n\t" " vmovups %%xmm14 ,16(%5) \n\t" "3: \n\t" " vzeroupper \n\t" : : "r" (n1), // 0 "a" (i), // 1 "r" (a), // 2 "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 "r" (as), // 6 "r" (bs) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); } #endif #ifndef COMPLEX static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa, bb; int i, j, k; a += (n - 1) * m; b += (n - 1) * n; for (i = n - 1; i >= 0; i--) { bb = *(b + i); for (j = 0; j < m; j ++) { aa = *(c + j + i * ldc); aa *= bb; *a = aa; *(c + j + i * ldc) = aa; a ++; for (k = 0; k < i; k ++){ *(c + j + k * ldc) -= aa * *(b + k); } } b -= n; a -= 2 * m; } } #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { FLOAT aa1, aa2; FLOAT bb1, bb2; FLOAT cc1, cc2; int i, j, k; ldc *= 2; a += (n - 1) * m * 2; b += (n - 1) * n * 2; for (i = n - 1; i >= 0; i--) { bb1 = *(b + i * 2 + 0); bb2 = *(b + i * 2 + 1); for (j = 0; j < m; j ++) { aa1 = *(c + j * 2 + 0 + i * ldc); aa2 = *(c + j * 2 + 1 + i * ldc); #ifndef CONJ cc1 = aa1 * bb1 - aa2 * bb2; cc2 = aa1 * bb2 + aa2 * bb1; #else cc1 = aa1 * bb1 + aa2 * bb2; cc2 = - aa1 * bb2 + aa2 * bb1; #endif *(a + 0) = cc1; *(a + 1) = cc2; *(c + j * 2 + 0 + i * ldc) = cc1; *(c + j * 2 + 1 + i * ldc) = cc2; a += 2; for (k = 0; k < i; k ++){ #ifndef CONJ *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); #else *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); #endif } } b -= n * 2; a -= 4 * m; } } #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #ifdef COMPLEX FLOAT dummy2, #endif FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ BLASLONG i, j; FLOAT *aa, *cc; BLASLONG kk; #if 0 fprintf(stderr, "TRSM RT KERNEL m = %3ld n = 
%3ld k = %3ld offset = %3ld\n", m, n, k, offset); #endif kk = n - offset; c += n * ldc * COMPSIZE; b += n * k * COMPSIZE; if (n & (GEMM_UNROLL_N - 1)) { j = 1; while (j < GEMM_UNROLL_N) { if (n & j) { aa = a; b -= j * k * COMPSIZE; c -= j * ldc* COMPSIZE; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { do { if (k - kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + GEMM_UNROLL_M * kk * COMPSIZE, b + j * kk * COMPSIZE, cc, ldc); } solve(GEMM_UNROLL_M, j, aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, b + (kk - j) * j * COMPSIZE, cc, ldc); aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } while (i > 0); } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); do { if (m & i) { if (k - kk > 0) { GEMM_KERNEL(i, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, b + j * kk * COMPSIZE, cc, ldc); } solve(i, j, aa + (kk - j) * i * COMPSIZE, b + (kk - j) * j * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; } i >>= 1; } while (i > 0); } kk -= j; } j <<= 1; } } j = (n >> GEMM_UNROLL_N_SHIFT); if (j > 0) { do { aa = a; b -= GEMM_UNROLL_N * k * COMPSIZE; c -= GEMM_UNROLL_N * ldc * COMPSIZE; cc = c; i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { do { #ifndef CONJ ztrsm_RT_solve_opt(k-kk, aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc, aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE); solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); #else if (k - kk > 0) { GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); #endif aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } while (i > 0); } if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); do { if (m & i) { if (k - kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } solve(i, GEMM_UNROLL_N, aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; } i >>= 1; } while (i > 0); } kk -= GEMM_UNROLL_N; j --; } while (j > 0); } return 0; } OpenBLAS-0.2.20/kernel/zarch/000077500000000000000000000000001313527062700155335ustar00rootroot00000000000000OpenBLAS-0.2.20/kernel/zarch/KERNEL000066400000000000000000000007451313527062700164440ustar00rootroot00000000000000ifndef SCABS_KERNEL SCABS_KERNEL = ../generic/cabs.c endif ifndef DCABS_KERNEL DCABS_KERNEL = ../generic/cabs.c endif ifndef QCABS_KERNEL QCABS_KERNEL = ../generic/cabs.c endif ifndef LSAME_KERNEL LSAME_KERNEL = ../generic/lsame.c endif ifndef SGEMM_BETA SGEMM_BETA = ../generic/gemm_beta.c endif ifndef DGEMM_BETA DGEMM_BETA = ../generic/gemm_beta.c endif ifndef CGEMM_BETA CGEMM_BETA = ../generic/zgemm_beta.c endif ifndef ZGEMM_BETA ZGEMM_BETA = ../generic/zgemm_beta.c endif OpenBLAS-0.2.20/kernel/zarch/KERNEL.Z13000066400000000000000000000073521313527062700170210ustar00rootroot00000000000000SAMAXKERNEL = ../arm/amax.c DAMAXKERNEL = ../arm/amax.c CAMAXKERNEL = ../arm/zamax.c ZAMAXKERNEL = ../arm/zamax.c SAMINKERNEL = ../arm/amin.c DAMINKERNEL = 
../arm/amin.c CAMINKERNEL = ../arm/zamin.c ZAMINKERNEL = ../arm/zamin.c SMAXKERNEL = ../arm/max.c DMAXKERNEL = ../arm/max.c SMINKERNEL = ../arm/min.c DMINKERNEL = ../arm/min.c ISAMAXKERNEL = ../arm/iamax.c IDAMAXKERNEL = ../arm/iamax.c ICAMAXKERNEL = ../arm/izamax.c IZAMAXKERNEL = ../arm/izamax.c ISAMINKERNEL = ../arm/iamin.c IDAMINKERNEL = ../arm/iamin.c ICAMINKERNEL = ../arm/izamin.c IZAMINKERNEL = ../arm/izamin.c ISMAXKERNEL = ../arm/imax.c IDMAXKERNEL = ../arm/imax.c ISMINKERNEL = ../arm/imin.c IDMINKERNEL = ../arm/imin.c SASUMKERNEL = ../arm/asum.c DASUMKERNEL = ../arm/asum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = ../arm/zasum.c SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c ZAXPYKERNEL = ../arm/zaxpy.c SCOPYKERNEL = ../arm/copy.c DCOPYKERNEL = ../arm/copy.c CCOPYKERNEL = ../arm/zcopy.c ZCOPYKERNEL = ../arm/zcopy.c SDOTKERNEL = ../arm/dot.c DDOTKERNEL = ../arm/dot.c CDOTKERNEL = ../arm/zdot.c ZDOTKERNEL = ../arm/zdot.c SNRM2KERNEL = ../arm/nrm2.c DNRM2KERNEL = ../arm/nrm2.c CNRM2KERNEL = ../arm/znrm2.c ZNRM2KERNEL = ../arm/znrm2.c SROTKERNEL = ../arm/rot.c DROTKERNEL = ../arm/rot.c CROTKERNEL = ../arm/zrot.c ZROTKERNEL = ../arm/zrot.c SSCALKERNEL = ../arm/scal.c DSCALKERNEL = ../arm/scal.c CSCALKERNEL = ../arm/zscal.c ZSCALKERNEL = ../arm/zscal.c SSWAPKERNEL = ../arm/swap.c DSWAPKERNEL = ../arm/swap.c CSWAPKERNEL = ../arm/zswap.c ZSWAPKERNEL = ../arm/zswap.c SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = ../arm/gemv_n.c CGEMVNKERNEL = ../arm/zgemv_n.c ZGEMVNKERNEL = ../arm/zgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c DGEMVTKERNEL = ../arm/gemv_t.c CGEMVTKERNEL = ../arm/zgemv_t.c ZGEMVTKERNEL = ../arm/zgemv_t.c STRMMKERNEL = strmm8x4V.S DTRMMKERNEL = trmm8x4V.S CTRMMKERNEL = ctrmm4x4V.S ZTRMMKERNEL = ztrmm4x4V.S SGEMMKERNEL = strmm8x4V.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMINCOPYOBJ = sgemm_incopy.o SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = gemm8x4V.S DGEMMINCOPY = ../generic/gemm_ncopy_8.c DGEMMITCOPY = ../generic/gemm_tcopy_8.c DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMINCOPYOBJ = dgemm_incopy.o DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = ctrmm4x4V.S CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o ZGEMMKERNEL = ztrmm4x4V.S ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c 
OpenBLAS-0.2.20/kernel/zarch/KERNEL.ZARCH_GENERIC000066400000000000000000000070371313527062700204470ustar00rootroot00000000000000SAMAXKERNEL = ../arm/amax.c DAMAXKERNEL = ../arm/amax.c CAMAXKERNEL = ../arm/zamax.c ZAMAXKERNEL = ../arm/zamax.c SAMINKERNEL = ../arm/amin.c DAMINKERNEL = ../arm/amin.c CAMINKERNEL = ../arm/zamin.c ZAMINKERNEL = ../arm/zamin.c SMAXKERNEL = ../arm/max.c DMAXKERNEL = ../arm/max.c SMINKERNEL = ../arm/min.c DMINKERNEL = ../arm/min.c ISAMAXKERNEL = ../arm/iamax.c IDAMAXKERNEL = ../arm/iamax.c ICAMAXKERNEL = ../arm/izamax.c IZAMAXKERNEL = ../arm/izamax.c ISAMINKERNEL = ../arm/iamin.c IDAMINKERNEL = ../arm/iamin.c ICAMINKERNEL = ../arm/izamin.c IZAMINKERNEL = ../arm/izamin.c ISMAXKERNEL = ../arm/imax.c IDMAXKERNEL = ../arm/imax.c ISMINKERNEL = ../arm/imin.c IDMINKERNEL = ../arm/imin.c SASUMKERNEL = ../arm/asum.c DASUMKERNEL = ../arm/asum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = ../arm/zasum.c SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c ZAXPYKERNEL = ../arm/zaxpy.c SCOPYKERNEL = ../arm/copy.c DCOPYKERNEL = ../arm/copy.c CCOPYKERNEL = ../arm/zcopy.c ZCOPYKERNEL = ../arm/zcopy.c SDOTKERNEL = ../arm/dot.c DDOTKERNEL = ../arm/dot.c CDOTKERNEL = ../arm/zdot.c ZDOTKERNEL = ../arm/zdot.c SNRM2KERNEL = ../arm/nrm2.c DNRM2KERNEL = ../arm/nrm2.c CNRM2KERNEL = ../arm/znrm2.c ZNRM2KERNEL = ../arm/znrm2.c SROTKERNEL = ../arm/rot.c DROTKERNEL = ../arm/rot.c CROTKERNEL = ../arm/zrot.c ZROTKERNEL = ../arm/zrot.c SSCALKERNEL = ../arm/scal.c DSCALKERNEL = ../arm/scal.c CSCALKERNEL = ../arm/zscal.c ZSCALKERNEL = ../arm/zscal.c SSWAPKERNEL = ../arm/swap.c DSWAPKERNEL = ../arm/swap.c CSWAPKERNEL = ../arm/zswap.c ZSWAPKERNEL = ../arm/zswap.c SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = ../arm/gemv_n.c CGEMVNKERNEL = ../arm/zgemv_n.c ZGEMVNKERNEL = ../arm/zgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c DGEMVTKERNEL = ../arm/gemv_t.c CGEMVTKERNEL = ../arm/zgemv_t.c ZGEMVTKERNEL = ../arm/zgemv_t.c STRMMKERNEL = ../generic/trmmkernel_2x2.c DTRMMKERNEL = ../generic/trmmkernel_2x2.c CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c SGEMMKERNEL = ../generic/gemmkernel_2x2.c SGEMMONCOPY = ../generic/gemm_ncopy_2.c SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = ../generic/gemmkernel_2x2.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = ../generic/zgemmkernel_2x2.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = 
../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c OpenBLAS-0.2.20/kernel/zarch/Makefile000066400000000000000000000000121313527062700171640ustar00rootroot00000000000000clean :: OpenBLAS-0.2.20/kernel/zarch/ckernelMacrosV.S000066400000000000000000001441601313527062700206030ustar00rootroot00000000000000/****************************************Implementation**Details**********************************************/ /* */ /* Lets denote (a,a1i) complex which is mathematically a+a1*i */ /* Complex number multiplication: (a,a1i)*(b,b1i) */ /* As i*i=-1 .The multiplication result will be: */ /* (a+a1*i)(b+b1*i)=a*b+a1*i*b1*i+ a1*i*b+a*b1*i=a*b-a1*b1 + (a1*b+a*b1)*i which is (ab-a1b1,a1b+ab1) */ /* so let c= ab-a1b1 , ci=a1b+ab1 then */ /* c=c+a*b-a1*b1 => c=a*b-( a1*b1-c) => c= a1*b1-c then c=a*b-c two mseb */ /* ci=ci+a1*b+a*b1 => ci= a1*b+ci then ci= a*b1+ci */ /* For simd real and imaginary parts will be grouped together */ /* such (realA,realK) and (imageA ,imageK) */ /* Simd(0,1)=(a*b,k*b)-((ai*bi,ki*bi)-Simd(0,1)) */ /* SimdI(0,1)=SimdI(0,1)+(a*bi,k*bi)+(ai*b,ki*b) */ /* */ /* */ /* for defined(NR) || defined(NC) || defined(TR) || defined(TC) */ /* (a+a1*I)(b-b1*I)=ab+a1*b1+I(a1b-ab1) */ /* */ /* c=c+ab+a1b1 => c=a1b1+c;c=ab+c */ /* ci=ci+a1b-ab1 => ci=a1*b-(ab1-ci) => ci=ab1-ci; ci=a1*b-ci */ /* */ /* */ /* for defined(RN) || defined(RT) || defined(CN) || defined(CT) */ /* (a-a1*I)(b+b1*I)=ab+a1*b1+I(-a1b+ab1) */ /* */ /* c=c+ab+a1b1 => c=a1b1+c;c=ab+c */ /* ci=ci+a1b-ab1 => ci=a*b1-(a1b-ci) => ci=a1b-ci; ci=a*b1-ci */ /* */ /* */ /* for defined(RR) || defined(RC) || defined(CR) || defined(CC) */ /* (a-a1*I)(b-b1*I)=ab-a1*b1+I(-a1b-ab1) */ /* */ /* c= a1*b1-c then c=a*b-c */ /* ci = ci-a1*b -a*b1; */ /* as ibm z13 only has x*z-m x*z+m instructions implementation will be changed a bit */ /* Assuming ci=0; and cix=cix+a1b+ab1 ; ci=ci-cix will work */ /* cix= a*b1+cix ; cix= a1*b+cix (two madb) ci=ci-cix (sign change if ci=0) */ /* As c=0 then */ /* c=a*b-c then c=a1*b1-c => c=(a1*b1-(a*b-c)) which is -1*( a*b -(a1*b1-c)) */ /* */ /* Values will be equal to (-c) and (-ci) */ /* To change sign it'll be multiplied by -1*(alpha+alpha_i) */ /* This is done once: */ /* lcdbr ALPHA_I,ALPHA_I */ /* lcdbr ALPHA ,ALPHA */ /*************************************************************************************************************/ /*************************Zero vectors***************************************/ /*zero vectors for 4x4 */ .macro ZERO_ZCVEC_4x4 vzero %v16 vzero %v17 vzero %v18 vzero %v19 vzero %v20 vzero %v21 vzero %v22 vzero %v23 vzero %v24 vzero %v25 vzero %v26 vzero %v27 vzero %v28 vzero %v29 vzero %v30 vzero %v31 .endm /*zero vectors for */ .macro ZERO_ZCVEC_2x4 vzero %v16 vzero %v17 vzero %v18 vzero %v19 vzero %v20 vzero %v21 vzero %v22 vzero %v23 .endm /*zero vectors for */ .macro ZERO_ZCVEC_1x4 vzero %v16 vzero %v17 vzero %v18 vzero %v19 .endm /*zero vectors for */ .macro ZERO_ZCVEC_4x2 ZERO_ZCVEC_2x4 .endm .macro ZERO_ZCVEC_4x1 ZERO_ZCVEC_1x4 .endm /*zero vectors for */ .macro ZERO_ZCVEC_2x2 vzero %v16 vzero %v17 vzero %v20 vzero %v21 .endm /*zero vectors for */ .macro ZERO_ZCVEC_1x2 vzero %v16 vzero %v17 .endm /*zero vectors for */ .macro ZERO_ZCVEC_2x1 vzero %v16 vzero %v17 .endm /*zero vectors for 1x1*/ .macro ZERO_ZCVEC_1x1 lzer %f6 lzer %f7 .endm /* Calculate for 4x2 inner */ .macro CalcComplex_4x2 vResR1, vResI1, vResR2, vResI2, vResR3, vResI3, vResR4, vResI4, vr1, vi1, vr2, vi2, vrB, viB,vrB2, viB2 #if defined(NN) || defined(NT) || defined(TN) || 
defined(TT) vfmsdb \vResR1, \vi1, \viB, \vResR1 vfmadb \vResI1, \vr1, \viB, \vResI1 vfmsdb \vResR2, \vi2, \viB, \vResR2 vfmadb \vResI2, \vr2, \viB, \vResI2 vfmsdb \vResR3, \vi1, \viB2, \vResR3 vfmadb \vResI3, \vr1, \viB2, \vResI3 vfmsdb \vResR4, \vi2, \viB2, \vResR4 vfmadb \vResI4, \vr2, \viB2, \vResI4 vfmsdb \vResR1, \vr1, \vrB, \vResR1 vfmadb \vResI1, \vi1, \vrB, \vResI1 vfmsdb \vResR2, \vr2, \vrB, \vResR2 vfmadb \vResI2, \vi2, \vrB, \vResI2 vfmsdb \vResR3, \vr1, \vrB2, \vResR3 vfmadb \vResI3, \vi1, \vrB2, \vResI3 vfmsdb \vResR4, \vr2, \vrB2, \vResR4 vfmadb \vResI4, \vi2, \vrB2, \vResI4 #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) vfmadb \vResR1, \vi1, \viB, \vResR1 vfmsdb \vResI1, \vr1, \viB, \vResI1 vfmadb \vResR2, \vi2, \viB, \vResR2 vfmsdb \vResI2, \vr2, \viB, \vResI2 vfmadb \vResR3, \vi1, \viB2, \vResR3 vfmsdb \vResI3, \vr1, \viB2, \vResI3 vfmadb \vResR4, \vi2, \viB2, \vResR4 vfmsdb \vResI4, \vr2, \viB2, \vResI4 vfmadb \vResR1, \vr1, \vrB, \vResR1 vfmsdb \vResI1, \vi1, \vrB, \vResI1 vfmadb \vResR2, \vr2, \vrB, \vResR2 vfmsdb \vResI2, \vi2, \vrB, \vResI2 vfmadb \vResR3, \vr1, \vrB2, \vResR3 vfmsdb \vResI3, \vi1, \vrB2, \vResI3 vfmadb \vResR4, \vr2, \vrB2, \vResR4 vfmsdb \vResI4, \vi2, \vrB2, \vResI4 #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) vfmadb \vResR1, \vi1, \viB, \vResR1 vfmsdb \vResI1, \vi1, \vrB, \vResI1 vfmadb \vResR2, \vi2, \viB, \vResR2 vfmsdb \vResI2, \vi2, \vrB, \vResI2 vfmadb \vResR3, \vi1, \viB2, \vResR3 vfmsdb \vResI3, \vi1, \vrB2, \vResI3 vfmadb \vResR4, \vi2, \viB2, \vResR4 vfmsdb \vResI4, \vi2, \vrB2, \vResI4 vfmadb \vResR1, \vr1, \vrB, \vResR1 vfmsdb \vResI1, \vr1, \viB, \vResI1 vfmadb \vResR2, \vr2, \vrB, \vResR2 vfmsdb \vResI2, \vr2, \viB, \vResI2 vfmadb \vResR3, \vr1, \vrB2, \vResR3 vfmsdb \vResI3, \vr1, \viB2, \vResI3 vfmadb \vResR4, \vr2, \vrB2, \vResR4 vfmsdb \vResI4, \vr2, \viB2, \vResI4 #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) vfmsdb \vResR1, \vr1, \vrB, \vResR1 vfmadb \vResI1, \vi1, \vrB, \vResI1 vfmsdb \vResR2, \vr2, \vrB, \vResR2 vfmadb \vResI2, \vi2, \vrB, \vResI2 vfmsdb \vResR3, \vr1, \vrB2, \vResR3 vfmadb \vResI3, \vi1, \vrB2, \vResI3 vfmsdb \vResR4, \vr2, \vrB2, \vResR4 vfmadb \vResI4, \vi2, \vrB2, \vResI4 vfmsdb \vResR1, \vi1, \viB, \vResR1 vfmadb \vResI1, \vr1, \viB, \vResI1 vfmsdb \vResR2, \vi2, \viB, \vResR2 vfmadb \vResI2, \vr2, \viB, \vResI2 vfmsdb \vResR3, \vi1, \viB2, \vResR3 vfmadb \vResI3, \vr1, \viB2, \vResI3 vfmsdb \vResR4, \vi2, \viB2, \vResR4 vfmadb \vResI4, \vr2, \viB2, \vResI4 #endif .endm /* Calculate for 2x4 inner */ .macro CalcComplex_2x4 vResR1, vResI1, vResR2, vResI2, vResR3, vResI3, vResR4, vResI4, vr1, vi1, vr2, vi2, vrB, viB,vrB2, viB2 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) vfmsdb \vResR1, \vi1, \viB, \vResR1 vfmadb \vResI1, \vr1, \viB, \vResI1 vfmsdb \vResR2, \vi2, \viB, \vResR2 vfmadb \vResI2, \vr2, \viB, \vResI2 vfmsdb \vResR3, \vi1, \viB2, \vResR3 vfmadb \vResI3, \vr1, \viB2, \vResI3 vfmsdb \vResR4, \vi2, \viB2, \vResR4 vfmadb \vResI4, \vr2, \viB2, \vResI4 vfmsdb \vResR1, \vr1, \vrB, \vResR1 vfmadb \vResI1, \vi1, \vrB, \vResI1 vfmsdb \vResR2, \vr2, \vrB, \vResR2 vfmadb \vResI2, \vi2, \vrB, \vResI2 vfmsdb \vResR3, \vr1, \vrB2, \vResR3 vfmadb \vResI3, \vi1, \vrB2, \vResI3 vfmsdb \vResR4, \vr2, \vrB2, \vResR4 vfmadb \vResI4, \vi2, \vrB2, \vResI4 #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) vfmadb \vResR1, \vi1, \viB, \vResR1 vfmsdb \vResI1, \vr1, \viB, \vResI1 vfmadb \vResR2, \vi2, \viB, \vResR2 vfmsdb 
\vResI2, \vr2, \viB, \vResI2 vfmadb \vResR3, \vi1, \viB2, \vResR3 vfmsdb \vResI3, \vr1, \viB2, \vResI3 vfmadb \vResR4, \vi2, \viB2, \vResR4 vfmsdb \vResI4, \vr2, \viB2, \vResI4 vfmadb \vResR1, \vr1, \vrB, \vResR1 vfmsdb \vResI1, \vi1, \vrB, \vResI1 vfmadb \vResR2, \vr2, \vrB, \vResR2 vfmsdb \vResI2, \vi2, \vrB, \vResI2 vfmadb \vResR3, \vr1, \vrB2, \vResR3 vfmsdb \vResI3, \vi1, \vrB2, \vResI3 vfmadb \vResR4, \vr2, \vrB2, \vResR4 vfmsdb \vResI4, \vi2, \vrB2, \vResI4 #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) vfmadb \vResR1, \vi1, \viB, \vResR1 vfmsdb \vResI1, \vi1, \vrB, \vResI1 vfmadb \vResR2, \vi2, \viB, \vResR2 vfmsdb \vResI2, \vi2, \vrB, \vResI2 vfmadb \vResR3, \vi1, \viB2, \vResR3 vfmsdb \vResI3, \vi1, \vrB2, \vResI3 vfmadb \vResR4, \vi2, \viB2, \vResR4 vfmsdb \vResI4, \vi2, \vrB2, \vResI4 vfmadb \vResR1, \vr1, \vrB, \vResR1 vfmsdb \vResI1, \vr1, \viB, \vResI1 vfmadb \vResR2, \vr2, \vrB, \vResR2 vfmsdb \vResI2, \vr2, \viB, \vResI2 vfmadb \vResR3, \vr1, \vrB2, \vResR3 vfmsdb \vResI3, \vr1, \viB2, \vResI3 vfmadb \vResR4, \vr2, \vrB2, \vResR4 vfmsdb \vResI4, \vr2, \viB2, \vResI4 #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) vfmsdb \vResR1, \vr1, \vrB, \vResR1 vfmadb \vResI1, \vi1, \vrB, \vResI1 vfmsdb \vResR2, \vr2, \vrB, \vResR2 vfmadb \vResI2, \vi2, \vrB, \vResI2 vfmsdb \vResR3, \vr1, \vrB2, \vResR3 vfmadb \vResI3, \vi1, \vrB2, \vResI3 vfmsdb \vResR4, \vr2, \vrB2, \vResR4 vfmadb \vResI4, \vi2, \vrB2, \vResI4 vfmsdb \vResR1, \vi1, \viB, \vResR1 vfmadb \vResI1, \vr1, \viB, \vResI1 vfmsdb \vResR2, \vi2, \viB, \vResR2 vfmadb \vResI2, \vr2, \viB, \vResI2 vfmsdb \vResR3, \vi1, \viB2, \vResR3 vfmadb \vResI3, \vr1, \viB2, \vResI3 vfmsdb \vResR4, \vi2, \viB2, \vResR4 vfmadb \vResI4, \vr2, \viB2, \vResI4 #endif .endm /* Calculate for 2x2 inner */ .macro CalcComplex_2x2 vResR1, vResI1,vResR2, vResI2, vR1, vI1, vRB, vIB, vRB2, vIB2 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) vfmsdb \vResR1, \vI1, \vIB, \vResR1 vfmadb \vResI1, \vR1, \vIB, \vResI1 vfmsdb \vResR2, \vI1, \vIB2, \vResR2 vfmadb \vResI2, \vR1, \vIB2, \vResI2 vfmsdb \vResR1, \vR1, \vRB, \vResR1 vfmadb \vResI1, \vI1, \vRB, \vResI1 vfmsdb \vResR2, \vR1, \vRB2, \vResR2 vfmadb \vResI2, \vI1, \vRB2, \vResI2 #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) vfmadb \vResR1, \vI1, \vIB, \vResR1 vfmsdb \vResI1, \vR1, \vIB, \vResI1 vfmadb \vResR2, \vI1, \vIB2, \vResR2 vfmsdb \vResI2, \vR1, \vIB2, \vResI2 vfmadb \vResR1, \vR1, \vRB, \vResR1 vfmsdb \vResI1, \vI1, \vRB, \vResI1 vfmadb \vResR2, \vR1, \vRB2, \vResR2 vfmsdb \vResI2, \vI1, \vRB2, \vResI2 #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) vfmadb \vResR1, \vI1, \vIB, \vResR1 vfmsdb \vResI1, \vI1, \vRB, \vResI1 vfmadb \vResR2, \vI1, \vIB2, \vResR2 vfmsdb \vResI2, \vI1, \vRB2, \vResI2 vfmadb \vResR1, \vR1, \vRB, \vResR1 vfmsdb \vResI1, \vR1, \vIB, \vResI1 vfmadb \vResR2, \vR1, \vRB2, \vResR2 vfmsdb \vResI2, \vR1, \vIB2, \vResI2 #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) vfmsdb \vResR1, \vR1, \vRB, \vResR1 vfmadb \vResI1, \vI1, \vRB, \vResI1 vfmsdb \vResR2, \vR1, \vRB2, \vResR2 vfmadb \vResI2, \vI1, \vRB2, \vResI2 vfmsdb \vResR1, \vI1, \vIB, \vResR1 vfmadb \vResI1, \vR1, \vIB, \vResI1 vfmsdb \vResR2, \vI1, \vIB2, \vResR2 vfmadb \vResI2, \vR1, \vIB2, \vResI2 #endif .endm /* Calculate for 2x1 inner */ .macro CalcComplex_2x1 vRealResult1, vImageResult1, vReal1, vImage1, vecRealB, vecImageB #if defined(NN) || defined(NT) || defined(TN) || defined(TT) vfmsdb \vRealResult1, 
\vImage1, \vecImageB, \vRealResult1 vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 #endif .endm /* Calculate for 1x2 inner */ .macro CalcComplex_1x2 vRealResult1, vImageResult1, vReal1, vImage1, vecRealB, vecImageB #if defined(NN) || defined(NT) || defined(TN) || defined(TT) vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 #endif #if defined(RN) || defined(CN) || defined(RT) || defined(CT) vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 #endif #if defined(NR) || defined(TR) || defined(NC) || defined(TC) vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 #endif .endm /* Calculate for 4x1 inner */ .macro CalcComplex_4x1 vRealResult1, vImageResult1, vRealResult2, vImageResult2, vReal1, vImage1, vReal2, vImage2, vecRealB, vecImageB #if defined(NN) || defined(NT) || defined(TN) || defined(TT) vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmsdb \vRealResult2, \vImage2, \vecImageB, \vRealResult2 vfmadb \vImageResult2, \vReal2, \vecImageB, \vImageResult2 vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmsdb \vRealResult2, \vReal2, \vecRealB, \vRealResult2 vfmadb \vImageResult2, \vImage2, \vecRealB, \vImageResult2 #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmadb \vRealResult2, \vImage2, \vecImageB, \vRealResult2 vfmsdb \vImageResult2, \vReal2, \vecImageB, \vImageResult2 vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 
vfmadb \vRealResult2, \vReal2, \vecRealB, \vRealResult2 vfmsdb \vImageResult2, \vImage2, \vecRealB, \vImageResult2 #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmadb \vRealResult2, \vImage2, \vecImageB, \vRealResult2 vfmsdb \vImageResult2, \vImage2, \vecRealB, \vImageResult2 vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmadb \vRealResult2, \vReal2, \vecRealB, \vRealResult2 vfmsdb \vImageResult2, \vReal2, \vecImageB, \vImageResult2 #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmsdb \vRealResult2, \vReal2, \vecRealB, \vRealResult2 vfmadb \vImageResult2, \vImage2, \vecRealB, \vImageResult2 vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmsdb \vRealResult2, \vImage2, \vecImageB, \vRealResult2 vfmadb \vImageResult2, \vReal2, \vecImageB, \vImageResult2 #endif .endm /* Calculate for 1x4 inner */ .macro CalcComplex_1x4 vRealResult1, vImageResult1, vRealResult2, vImageResult2, vReal1, vImage1, vReal2, vImage2, vecRealB, vecImageB #if defined(NN) || defined(NT) || defined(TN) || defined(TT) vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmsdb \vRealResult2, \vImage2, \vecImageB, \vRealResult2 vfmadb \vImageResult2, \vReal2, \vecImageB, \vImageResult2 vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmsdb \vRealResult2, \vReal2, \vecRealB, \vRealResult2 vfmadb \vImageResult2, \vImage2, \vecRealB, \vImageResult2 #endif #if defined(RN) || defined(CN) || defined(RT) || defined(CT) vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmadb \vRealResult2, \vImage2, \vecImageB, \vRealResult2 vfmsdb \vImageResult2, \vReal2, \vecImageB, \vImageResult2 vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmadb \vRealResult2, \vReal2, \vecRealB, \vRealResult2 vfmsdb \vImageResult2, \vImage2, \vecRealB, \vImageResult2 #endif #if defined(NR) || defined(TR) || defined(NC) || defined(TC) vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmadb \vRealResult2, \vImage2, \vecImageB, \vRealResult2 vfmsdb \vImageResult2, \vImage2, \vecRealB, \vImageResult2 vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmadb \vRealResult2, \vReal2, \vecRealB, \vRealResult2 vfmsdb \vImageResult2, \vReal2, \vecImageB, \vImageResult2 #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmsdb \vRealResult2, \vReal2, \vecRealB, \vRealResult2 vfmadb \vImageResult2, \vImage2, \vecRealB, \vImageResult2 vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmsdb \vRealResult2, \vImage2, \vecImageB, \vRealResult2 vfmadb \vImageResult2, \vReal2, \vecImageB, \vImageResult2 #endif .endm .macro CalcComplex_1x1 RealResult1, ImageResult1, Real1, 
Image1, RealB, ImageB #if defined(NN) || defined(NT) || defined(TN) || defined(TT) msebr \RealResult1, \Image1, \ImageB maebr \ImageResult1, \Real1, \ImageB msebr \RealResult1, \Real1, \RealB maebr \ImageResult1, \Image1, \RealB #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) maebr \RealResult1, \Image1, \ImageB msebr \ImageResult1, \Real1, \ImageB maebr \RealResult1, \Real1, \RealB msebr \ImageResult1, \Image1, \RealB #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) maebr \RealResult1, \Image1, \ImageB msebr \ImageResult1, \Image1, \RealB maebr \RealResult1, \Real1, \RealB msebr \ImageResult1, \Real1, \ImageB #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) msebr \RealResult1, \Real1, \RealB maebr \ImageResult1, \Image1, \RealB msebr \RealResult1, \Image1, \ImageB maebr \ImageResult1, \Real1, \ImageB #endif .endm #define DISP(ind,stride,disp) (ind*stride+disp) #define DISP64(ind,disp) (ind*32+disp) #define DISP32(ind,disp) (ind*16+disp) #define DISP16(ind,disp) (ind*8+disp) #define unit_size 8 #define DISP(ind,stride,disp) (ind*stride+disp) #define DISP8(ind,disp) (ind*unit_size*8+disp) #define DISP4(ind,disp) (ind*unit_size*4+disp) #define DISP2(ind,disp) (ind*unit_size*2+disp) #define DISP1(ind,disp) (ind*unit_size+disp) #define N8 (8*unit_size) #define N4 (4*unit_size) #define N2 (2*unit_size) #define N1 (1*unit_size) .macro ZCALC_4x4_I PTR_A_REG,PTR_B_REG,Index,IsLast vlef %v1, DISP4(\Index ,0) (\PTR_A_REG),0 vlef %v5, DISP4(\Index ,4) (\PTR_A_REG),0 vlef %v1, DISP4(\Index ,8) (\PTR_A_REG),2 vlef %v5, DISP4(\Index ,12) (\PTR_A_REG),2 vlef %v3, DISP4(\Index ,16) (\PTR_A_REG),0 vlef %v7, DISP4(\Index ,20) (\PTR_A_REG),0 vlef %v3, DISP4(\Index ,24) (\PTR_A_REG),2 vlef %v7, DISP4(\Index ,28) (\PTR_A_REG),2 vlrepf %v9, DISP4(\Index ,0)(\PTR_B_REG) vlrepf %v10 , DISP4(\Index ,4)(\PTR_B_REG) vlrepf %v11, DISP4(\Index ,8)(\PTR_B_REG) vlrepf %v12 , DISP4(\Index ,12)(\PTR_B_REG) vldeb %v1,%v1 vldeb %v5,%v5 vldeb %v3,%v3 vldeb %v7,%v7 vldeb %v9,%v9 vldeb %v10,%v10 vldeb %v11,%v11 vldeb %v12,%v12 CalcComplex_4x2 %v16,%v17,%v18,%v19,%v20,%v21,%v22,%v23,%v1,%v5,%v3,%v7,%v9,%v10,%v11,%v12 vlrepf %v9, DISP4(\Index ,16)(\PTR_B_REG) vlrepf %v10 , DISP4(\Index ,20)(\PTR_B_REG) vlrepf %v11, DISP4(\Index ,24)(\PTR_B_REG) vlrepf %v12 , DISP4(\Index ,28)(\PTR_B_REG) vldeb %v9,%v9 vldeb %v10,%v10 vldeb %v11,%v11 vldeb %v12,%v12 .if \IsLast==1 la \PTR_A_REG, DISP4(\Index ,32)(\PTR_A_REG) .endif CalcComplex_4x2 %v24,%v25,%v26,%v27,%v28,%v29,%v30,%v31,%v1,%v5,%v3,%v7,%v9,%v10,%v11,%v12 .if \IsLast==1 la \PTR_B_REG, DISP4(\Index ,32)(\PTR_B_REG) .endif .endm .macro ZCALC_4x2_I PTR_A_REG,PTR_B_REG,Index,IsLast vlef %v1, DISP4(\Index ,0) (\PTR_A_REG),0 vlef %v5, DISP4(\Index ,4) (\PTR_A_REG),0 vlef %v1, DISP4(\Index ,8) (\PTR_A_REG),2 vlef %v5, DISP4(\Index ,12) (\PTR_A_REG),2 vlef %v3, DISP4(\Index ,16) (\PTR_A_REG),0 vlef %v7, DISP4(\Index ,20) (\PTR_A_REG),0 vlef %v3, DISP4(\Index ,24) (\PTR_A_REG),2 vlef %v7, DISP4(\Index ,28) (\PTR_A_REG),2 vlrepf %v9, DISP2(\Index ,0)(\PTR_B_REG) vlrepf %v10 , DISP2(\Index ,4)(\PTR_B_REG) vlrepf %v11, DISP2(\Index ,8)(\PTR_B_REG) vlrepf %v12 , DISP2(\Index ,12)(\PTR_B_REG) vldeb %v1,%v1 vldeb %v5,%v5 vldeb %v3,%v3 vldeb %v7,%v7 vldeb %v9,%v9 vldeb %v10,%v10 vldeb %v11,%v11 vldeb %v12,%v12 .if \IsLast==1 la \PTR_A_REG, DISP4(\Index ,32)(\PTR_A_REG) .endif CalcComplex_4x2 %v16,%v17,%v18,%v19,%v20,%v21,%v22,%v23,%v1,%v5,%v3,%v7,%v9,%v10,%v11,%v12 .if \IsLast==1 la \PTR_B_REG, DISP2(\Index ,16)(\PTR_B_REG) .endif .endm 
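/*
 * Scalar sketch (illustrative only; assumed helper name, not referenced by the build)
 * of the per-element update performed by the CalcComplex_* macros above, following the
 * derivation in the implementation-details comment at the top of this file.  The four
 * fused ops per element take the running accumulator as their third operand, so no
 * separate add is needed:
 *
 *   static void zfma_acc_nn(double ar, double ai, double br, double bi,
 *                           double acc[2])          // acc[0] = real, acc[1] = imag
 *   {
 *       acc[0] = ai * bi - acc[0];                  // vfmsdb: ai*bi - acc_r
 *       acc[1] = ar * bi + acc[1];                  // vfmadb: ar*bi + acc_i
 *       acc[0] = ar * br - acc[0];                  // acc_r becomes acc_r + ar*br - ai*bi
 *       acc[1] = ai * br + acc[1];                  // acc_i becomes acc_i + ar*bi + ai*br
 *   }
 *
 * i.e. acc += (ar,ai)(br,bi) for the NN/NT/TN/TT variants.  The conjugated variants
 * reorder the same four FMAs; for RR/RC/CR/CC the accumulators end up holding the
 * negated product, which is compensated later by the negated alpha (see the comment
 * at the top of this file).
 */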
.macro ZCALC_2x4_I PTR_A_REG,PTR_B_REG,Index,IsLast vlef %v1, DISP4(\Index ,0) (\PTR_B_REG),0 vlef %v5, DISP4(\Index ,4) (\PTR_B_REG),0 vlef %v1, DISP4(\Index ,8) (\PTR_B_REG),2 vlef %v5, DISP4(\Index ,12) (\PTR_B_REG),2 vlef %v3, DISP4(\Index ,16) (\PTR_B_REG),0 vlef %v7, DISP4(\Index ,20) (\PTR_B_REG),0 vlef %v3, DISP4(\Index ,24) (\PTR_B_REG),2 vlef %v7, DISP4(\Index ,28) (\PTR_B_REG),2 vlrepf %v9, DISP2(\Index ,0)(\PTR_A_REG) vlrepf %v10 , DISP2(\Index ,4)(\PTR_A_REG) vlrepf %v11, DISP2(\Index ,8)(\PTR_A_REG) vlrepf %v12 , DISP2(\Index ,12)(\PTR_A_REG) vldeb %v1,%v1 vldeb %v5,%v5 vldeb %v3,%v3 vldeb %v7,%v7 vldeb %v9,%v9 vldeb %v10,%v10 vldeb %v11,%v11 vldeb %v12,%v12 .if \IsLast==1 la \PTR_B_REG, DISP4(\Index ,32)(\PTR_B_REG) .endif CalcComplex_2x4 %v16,%v17,%v18,%v19,%v20,%v21,%v22,%v23,%v1,%v5,%v3,%v7,%v9,%v10,%v11,%v12 .if \IsLast==1 la \PTR_A_REG, DISP2(\Index ,16)(\PTR_A_REG) .endif .endm .macro ZCALC_4x1_I PTR_A_REG,PTR_B_REG,Index,IsLast vlef %v1, DISP4(\Index ,0) (\PTR_A_REG),0 vlef %v5, DISP4(\Index ,4) (\PTR_A_REG),0 vlef %v1, DISP4(\Index ,8) (\PTR_A_REG),2 vlef %v5, DISP4(\Index ,12) (\PTR_A_REG),2 vlef %v3, DISP4(\Index ,16) (\PTR_A_REG),0 vlef %v7, DISP4(\Index ,20) (\PTR_A_REG),0 vlef %v3, DISP4(\Index ,24) (\PTR_A_REG),2 vlef %v7, DISP4(\Index ,28) (\PTR_A_REG),2 vlrepf %v9, DISP1(\Index ,0)(\PTR_B_REG) vlrepf %v10 , DISP1(\Index ,4)(\PTR_B_REG) vldeb %v1,%v1 vldeb %v5,%v5 vldeb %v3,%v3 vldeb %v7,%v7 vldeb %v9,%v9 vldeb %v10,%v10 .if \IsLast==1 la \PTR_A_REG, DISP4(\Index ,32)(\PTR_A_REG) .endif CalcComplex_4x1 %v16,%v17,%v18,%v19,%v1,%v5,%v3,%v7,%v9,%v10 .if \IsLast==1 la \PTR_B_REG, DISP1(\Index ,8)(\PTR_B_REG) .endif .endm .macro ZCALC_1x4_I PTR_A_REG,PTR_B_REG,Index,IsLast vlef %v1, DISP4(\Index ,0) (\PTR_B_REG),0 vlef %v5, DISP4(\Index ,4) (\PTR_B_REG),0 vlef %v1, DISP4(\Index ,8) (\PTR_B_REG),2 vlef %v5, DISP4(\Index ,12) (\PTR_B_REG),2 vlef %v3, DISP4(\Index ,16) (\PTR_B_REG),0 vlef %v7, DISP4(\Index ,20) (\PTR_B_REG),0 vlef %v3, DISP4(\Index ,24) (\PTR_B_REG),2 vlef %v7, DISP4(\Index ,28) (\PTR_B_REG),2 vlrepf %v9, DISP1(\Index ,0)(\PTR_A_REG) vlrepf %v10 , DISP1(\Index ,4)(\PTR_A_REG) vldeb %v1,%v1 vldeb %v5,%v5 vldeb %v3,%v3 vldeb %v7,%v7 vldeb %v9,%v9 vldeb %v10,%v10 .if \IsLast==1 la \PTR_B_REG, DISP4(\Index ,32)(\PTR_B_REG) .endif CalcComplex_1x4 %v16,%v17,%v18,%v19,%v1,%v5,%v3,%v7,%v9,%v10 .if \IsLast==1 la \PTR_A_REG, DISP1(\Index ,8)(\PTR_A_REG) .endif .endm .macro ZCALC_2x2_I PTR_A_REG,PTR_B_REG ,Index,IsLast vlef %v1, DISP2(\Index ,0) (\PTR_A_REG),0 vlef %v5, DISP2(\Index ,4) (\PTR_A_REG),0 vlef %v1, DISP2(\Index ,8) (\PTR_A_REG),2 vlef %v5, DISP2(\Index ,12) (\PTR_A_REG),2 vlrepf %v9, DISP2(\Index ,0)(\PTR_B_REG) vlrepf %v10 , DISP2(\Index ,4)(\PTR_B_REG) vlrepf %v11, DISP2(\Index ,8)(\PTR_B_REG) vlrepf %v12 , DISP2(\Index ,12)(\PTR_B_REG) vldeb %v1,%v1 vldeb %v5,%v5 vldeb %v9,%v9 vldeb %v10,%v10 vldeb %v11,%v11 vldeb %v12,%v12 .if \IsLast==1 la \PTR_A_REG, DISP2(\Index ,16)(\PTR_A_REG) .endif CalcComplex_2x2 %v16,%v17,%v20,%v21,%v1,%v5, %v9,%v10,%v11,%v12 .if \IsLast==1 la \PTR_B_REG, DISP2(\Index ,16)(\PTR_B_REG) .endif .endm .macro ZCALC_2x1_I PTR_A_REG,PTR_B_REG ,Index,IsLast vlef %v1, DISP2(\Index ,0) (\PTR_A_REG),0 vlef %v5, DISP2(\Index ,4) (\PTR_A_REG),0 vlef %v1, DISP2(\Index ,8) (\PTR_A_REG),2 vlef %v5, DISP2(\Index ,12) (\PTR_A_REG),2 vlrepf %v9, DISP1(\Index ,0)(\PTR_B_REG) vlrepf %v10 , DISP1(\Index ,4)(\PTR_B_REG) vldeb %v1,%v1 vldeb %v5,%v5 vldeb %v9,%v9 vldeb %v10,%v10 .if \IsLast==1 la \PTR_A_REG, DISP2(\Index ,16)(\PTR_A_REG) .endif 
CalcComplex_2x1 %v16,%v17, %v1,%v5, %v9,%v10 .if \IsLast==1 la \PTR_B_REG, DISP1(\Index ,8)(\PTR_B_REG) .endif .endm .macro ZCALC_1x2_I PTR_A_REG,PTR_B_REG ,Index,IsLast vlef %v1, DISP2(\Index ,0) (\PTR_B_REG),0 vlef %v5, DISP2(\Index ,4) (\PTR_B_REG),0 vlef %v1, DISP2(\Index ,8) (\PTR_B_REG),2 vlef %v5, DISP2(\Index ,12) (\PTR_B_REG),2 vlrepf %v9, DISP1(\Index ,0)(\PTR_A_REG) vlrepf %v10 , DISP1(\Index ,4)(\PTR_A_REG) vldeb %v1,%v1 vldeb %v5,%v5 vldeb %v9,%v9 vldeb %v10,%v10 .if \IsLast==1 la \PTR_B_REG, DISP2(\Index ,16)(\PTR_B_REG) .endif CalcComplex_1x2 %v16,%v17, %v1,%v5, %v9,%v10 .if \IsLast==1 la \PTR_A_REG, DISP1(\Index ,8)(\PTR_A_REG) .endif .endm .macro ZCALC_1x1_I PTR_A_REG,PTR_B_REG ,Index,IsLast le %f1 , DISP1(\Index ,0)(\PTR_A_REG) le %f3 , DISP1(\Index ,4)(\PTR_A_REG) le %f4 , DISP1(\Index ,0)(\PTR_B_REG) le %f5 , DISP1(\Index ,4)(\PTR_B_REG) .if \IsLast==1 la \PTR_A_REG, DISP1(\Index ,8)(\PTR_A_REG) .endif CalcComplex_1x1 %f6,%f7,%f1,%f3,%f4,%f5 .if \IsLast==1 la \PTR_B_REG, DISP1(\Index ,8)(\PTR_B_REG) .endif .endm .macro ZCALC_4x4 PTR_A_REG,PTR_B_REG ZCALC_4x4_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro ZCALC_4x2 PTR_A_REG,PTR_B_REG ZCALC_4x2_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro ZCALC_4x1 PTR_A_REG,PTR_B_REG ZCALC_4x1_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro ZCALC_4x4_4 PTR_A_REG,PTR_B_REG ZCALC_4x4_I \PTR_A_REG,\PTR_B_REG,0,0 ZCALC_4x4_I \PTR_A_REG,\PTR_B_REG,1,0 ZCALC_4x4_I \PTR_A_REG,\PTR_B_REG,2,0 ZCALC_4x4_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro ZCALC_4x2_4 PTR_A_REG,PTR_B_REG ZCALC_4x2_I \PTR_A_REG,\PTR_B_REG,0,0 ZCALC_4x2_I \PTR_A_REG,\PTR_B_REG,1,0 ZCALC_4x2_I \PTR_A_REG,\PTR_B_REG,2,0 ZCALC_4x2_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro ZCALC_4x1_4 PTR_A_REG,PTR_B_REG ZCALC_4x1_I \PTR_A_REG,\PTR_B_REG,0,0 ZCALC_4x1_I \PTR_A_REG,\PTR_B_REG,1,0 ZCALC_4x1_I \PTR_A_REG,\PTR_B_REG,2,0 ZCALC_4x1_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro ZCALC_2x4_4 PTR_A_REG,PTR_B_REG ZCALC_2x4_I \PTR_A_REG,\PTR_B_REG,0,0 ZCALC_2x4_I \PTR_A_REG,\PTR_B_REG,1,0 ZCALC_2x4_I \PTR_A_REG,\PTR_B_REG,2,0 ZCALC_2x4_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro ZCALC_2x4 PTR_A_REG,PTR_B_REG ZCALC_2x4_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro ZCALC_1x4_4 PTR_A_REG,PTR_B_REG ZCALC_1x4_I \PTR_A_REG,\PTR_B_REG,0,0 ZCALC_1x4_I \PTR_A_REG,\PTR_B_REG,1,0 ZCALC_1x4_I \PTR_A_REG,\PTR_B_REG,2,0 ZCALC_1x4_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro ZCALC_1x4 PTR_A_REG,PTR_B_REG ZCALC_1x4_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro ZCALC_2x2 PTR_A_REG,PTR_B_REG ZCALC_2x2_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro ZCALC_2x2_4 PTR_A_REG,PTR_B_REG ZCALC_2x2_I \PTR_A_REG,\PTR_B_REG,0,0 ZCALC_2x2_I \PTR_A_REG,\PTR_B_REG,1,0 ZCALC_2x2_I \PTR_A_REG,\PTR_B_REG,2,0 ZCALC_2x2_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro ZCALC_2x1 PTR_A_REG,PTR_B_REG ZCALC_2x1_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro ZCALC_2x1_4 PTR_A_REG,PTR_B_REG ZCALC_2x1_I \PTR_A_REG,\PTR_B_REG,0,0 ZCALC_2x1_I \PTR_A_REG,\PTR_B_REG,1,0 ZCALC_2x1_I \PTR_A_REG,\PTR_B_REG,2,0 ZCALC_2x1_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro ZCALC_1x2_4 PTR_A_REG,PTR_B_REG ZCALC_1x2_I \PTR_A_REG,\PTR_B_REG,0,0 ZCALC_1x2_I \PTR_A_REG,\PTR_B_REG,1,0 ZCALC_1x2_I \PTR_A_REG,\PTR_B_REG,2,0 ZCALC_1x2_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro ZCALC_1x2 PTR_A_REG,PTR_B_REG ZCALC_1x2_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro ZCALC_1x1_4 PTR_A_REG,PTR_B_REG ZCALC_1x1_I \PTR_A_REG,\PTR_B_REG,0,0 ZCALC_1x1_I \PTR_A_REG,\PTR_B_REG,1,0 ZCALC_1x1_I \PTR_A_REG,\PTR_B_REG,2,0 ZCALC_1x1_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro ZCALC_1x1 PTR_A_REG,PTR_B_REG ZCALC_1x1_I \PTR_A_REG,\PTR_B_REG,0,1 .endm 
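/*
 * Scalar sketch (illustrative only; assumed helper name, not referenced by the build)
 * of the alpha scaling done by the CalcMultAlpha_* macros in the STORE RESULTS section
 * below.  The accumulated tile is multiplied by alpha and, unless TRMMKERNEL is defined
 * (in which case C is taken as zero and vfmdb is used instead of the first vfmsdb and
 * vfmadb), added to the C values that were just loaded and widened to double:
 *
 *   static void scale_store(double acc_r, double acc_i,     // accumulated A*B element
 *                           double alpha_r, double alpha_i,
 *                           double c[2])                    // c[0] = real, c[1] = imag
 *   {
 *       double r = acc_i * alpha_i - c[0];                  // vfmsdb
 *       double i = acc_r * alpha_i + c[1];                  // vfmadb
 *       c[0] = acc_r * alpha_r - r;                         // = Re(alpha x acc) + old c[0]
 *       c[1] = acc_i * alpha_r + i;                         // = Im(alpha x acc) + old c[1]
 *   }
 *
 * For the RR/RC/CR/CC kernels the accumulators hold the negated product (see the
 * comment at the top of this file), so alpha and alpha_i are negated once with lcdbr
 * in the kernel prologue before being replicated into the alpha vector registers.
 */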
/*****************************STORE RESULTS************************************/ .macro CalcMultAlpha_4x1 vRealResult1, vImageResult1, vRealResult2, vImageResult2, vReal1, vImage1, vReal2, vImage2, vecRealB, vecImageB #if defined (TRMMKERNEL) vfmdb \vRealResult1, \vImage1, \vecImageB vfmdb \vImageResult1, \vReal1, \vecImageB vfmdb \vRealResult2, \vImage2, \vecImageB vfmdb \vImageResult2, \vReal2, \vecImageB #else vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmsdb \vRealResult2, \vImage2, \vecImageB, \vRealResult2 vfmadb \vImageResult2, \vReal2, \vecImageB, \vImageResult2 #endif vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmsdb \vRealResult2, \vReal2, \vecRealB, \vRealResult2 vfmadb \vImageResult2, \vImage2, \vecRealB, \vImageResult2 .endm .macro CalcMultAlpha_2x1 vRealResult1, vImageResult1, vReal1, vImage1, vecRealB, vecImageB #if defined (TRMMKERNEL) vfmdb \vRealResult1, \vImage1, \vecImageB vfmdb \vImageResult1, \vReal1, \vecImageB #else vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 #endif vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 .endm .macro CalcMultAlpha_1x1 RealResult1, ImageResult1, Real1, Image1, RealB, ImageB msebr \RealResult1, \Image1, \ImageB maebr \ImageResult1, \Real1, \ImageB msebr \RealResult1, \Real1, \RealB maebr \ImageResult1, \Image1, \RealB .endm .macro ZSTORE_4x4 ALPHA_VECREG,ALPHA_VECI,CIJ_REG , LDC_BYTE_ORIGINAL ,LC1,LC2 #if !defined(TRMMKERNEL) vlef %v3, 0(\CIJ_REG),0 vlef %v4, 4(\CIJ_REG),0 vlef %v3, 8(\CIJ_REG),2 vlef %v4, 12(\CIJ_REG),2 vlef %v5, 16(\CIJ_REG),0 vlef %v6, 20(\CIJ_REG),0 vlef %v5, 24(\CIJ_REG),2 vlef %v6, 28(\CIJ_REG),2 vldeb %v3,%v3 vldeb %v4,%v4 vldeb %v5,%v5 vldeb %v6,%v6 #endif la \LC1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) CalcMultAlpha_4x1 %v3,%v4,%v5,%v6,%v16,%v17,%v18,%v19,\ALPHA_VECREG,\ALPHA_VECI vledb %v3, %v3,0,0 vledb %v4, %v4,0,0 vledb %v5, %v5,0,0 vledb %v6, %v6,0,0 vstef %v3, 0(\CIJ_REG),0 vstef %v4, 4(\CIJ_REG),0 vstef %v3, 8(\CIJ_REG),2 vstef %v4, 12(\CIJ_REG),2 vstef %v5, 16(\CIJ_REG),0 vstef %v6, 20(\CIJ_REG),0 vstef %v5, 24(\CIJ_REG),2 vstef %v6, 28(\CIJ_REG),2 la \LC2,0(\LC1,\LDC_BYTE_ORIGINAL ) #if !defined(TRMMKERNEL) vlef %v16, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vlef %v17, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vlef %v16, 8(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vlef %v17, 12(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vlef %v18, 16(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vlef %v19, 20(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vlef %v18, 24(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vlef %v19, 28(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vldeb %v16,%v16 vldeb %v17,%v17 vldeb %v18,%v18 vldeb %v19,%v19 #endif CalcMultAlpha_4x1 %v16,%v17,%v18,%v19,%v20,%v21,%v22,%v23,\ALPHA_VECREG,\ALPHA_VECI vledb %v16, %v16,0,0 vledb %v17, %v17,0,0 vledb %v18, %v18,0,0 vledb %v19, %v19,0,0 vstef %v16, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vstef %v17, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vstef %v16, 8(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vstef %v17, 12(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vstef %v18, 16(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vstef %v19, 20(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vstef %v18, 24(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vstef %v19, 28(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 #if !defined(TRMMKERNEL) vlef %v3, 0(\CIJ_REG, \LC1),0 vlef %v4, 4(\CIJ_REG, \LC1),0 vlef %v3, 8(\CIJ_REG, \LC1),2 vlef %v4, 12(\CIJ_REG, \LC1),2 vlef %v5, 16(\CIJ_REG, \LC1),0 
vlef %v6, 20(\CIJ_REG, \LC1),0 vlef %v5, 24(\CIJ_REG, \LC1),2 vlef %v6, 28(\CIJ_REG, \LC1),2 vldeb %v3,%v3 vldeb %v4,%v4 vldeb %v5,%v5 vldeb %v6,%v6 #endif CalcMultAlpha_4x1 %v3,%v4,%v5,%v6,%v24,%v25,%v26,%v27,\ALPHA_VECREG,\ALPHA_VECI vledb %v3, %v3,0,0 vledb %v4, %v4,0,0 vledb %v5, %v5,0,0 vledb %v6, %v6,0,0 vstef %v3, 0(\CIJ_REG,\LC1),0 vstef %v4, 4(\CIJ_REG,\LC1),0 vstef %v3, 8(\CIJ_REG,\LC1),2 vstef %v4, 12(\CIJ_REG,\LC1),2 vstef %v5, 16(\CIJ_REG,\LC1),0 vstef %v6, 20(\CIJ_REG,\LC1),0 vstef %v5, 24(\CIJ_REG,\LC1),2 vstef %v6, 28(\CIJ_REG,\LC1),2 #if !defined(TRMMKERNEL) vlef %v16, 0(\CIJ_REG,\LC2),0 vlef %v17, 4(\CIJ_REG,\LC2),0 vlef %v16, 8(\CIJ_REG,\LC2),2 vlef %v17, 12(\CIJ_REG,\LC2),2 vlef %v18, 16(\CIJ_REG,\LC2),0 vlef %v19, 20(\CIJ_REG,\LC2),0 vlef %v18, 24(\CIJ_REG,\LC2),2 vlef %v19, 28(\CIJ_REG,\LC2),2 vldeb %v16,%v16 vldeb %v17,%v17 vldeb %v18,%v18 vldeb %v19,%v19 #endif CalcMultAlpha_4x1 %v16,%v17,%v18,%v19,%v28,%v29,%v30,%v31,\ALPHA_VECREG,\ALPHA_VECI vledb %v16, %v16,0,0 vledb %v17, %v17,0,0 vledb %v18, %v18,0,0 vledb %v19, %v19,0,0 vstef %v16, 0(\CIJ_REG,\LC2),0 vstef %v17, 4(\CIJ_REG,\LC2),0 vstef %v16, 8(\CIJ_REG,\LC2),2 vstef %v17, 12(\CIJ_REG,\LC2),2 vstef %v18, 16(\CIJ_REG,\LC2),0 vstef %v19, 20(\CIJ_REG,\LC2),0 vstef %v18, 24(\CIJ_REG,\LC2),2 vstef %v19, 28(\CIJ_REG,\LC2),2 la \CIJ_REG,32(\CIJ_REG) .endm .macro ZSTORE_4x2 ALPHA_VECREG,ALPHA_VECI,CIJ_REG , LDC_BYTE_ORIGINAL #if !defined(TRMMKERNEL) vlef %v3, 0(\CIJ_REG),0 vlef %v4, 4(\CIJ_REG),0 vlef %v3, 8(\CIJ_REG),2 vlef %v4, 12(\CIJ_REG),2 vlef %v5, 16(\CIJ_REG),0 vlef %v6, 20(\CIJ_REG),0 vlef %v5, 24(\CIJ_REG),2 vlef %v6, 28(\CIJ_REG),2 vldeb %v3,%v3 vldeb %v4,%v4 vldeb %v5,%v5 vldeb %v6,%v6 #endif CalcMultAlpha_4x1 %v3,%v4,%v5,%v6,%v16,%v17,%v18,%v19,\ALPHA_VECREG,\ALPHA_VECI vledb %v3, %v3,0,0 vledb %v4, %v4,0,0 vledb %v5, %v5,0,0 vledb %v6, %v6,0,0 vstef %v3, 0(\CIJ_REG),0 vstef %v4, 4(\CIJ_REG),0 vstef %v3, 8(\CIJ_REG),2 vstef %v4, 12(\CIJ_REG),2 vstef %v5, 16(\CIJ_REG),0 vstef %v6, 20(\CIJ_REG),0 vstef %v5, 24(\CIJ_REG),2 vstef %v6, 28(\CIJ_REG),2 #if !defined(TRMMKERNEL) vlef %v16, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vlef %v17, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vlef %v16, 8(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vlef %v17, 12(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vlef %v18, 16(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vlef %v19, 20(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vlef %v18, 24(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vlef %v19, 28(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vldeb %v16,%v16 vldeb %v17,%v17 vldeb %v18,%v18 vldeb %v19,%v19 #endif CalcMultAlpha_4x1 %v16,%v17,%v18,%v19,%v20,%v21,%v22,%v23,\ALPHA_VECREG,\ALPHA_VECI vledb %v16, %v16,0,0 vledb %v17, %v17,0,0 vledb %v18, %v18,0,0 vledb %v19, %v19,0,0 vstef %v16, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vstef %v17, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vstef %v16, 8(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vstef %v17, 12(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vstef %v18, 16(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vstef %v19, 20(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vstef %v18, 24(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vstef %v19, 28(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 la \CIJ_REG,32(\CIJ_REG) .endm .macro ZSTORE_4x1 ALPHA_VECREG,ALPHA_VECI,CIJ_REG , LDC_BYTE_ORIGINAL #if !defined(TRMMKERNEL) vlef %v3, 0(\CIJ_REG),0 vlef %v4, 4(\CIJ_REG),0 vlef %v3, 8(\CIJ_REG),2 vlef %v4, 12(\CIJ_REG),2 vlef %v5, 16(\CIJ_REG),0 vlef %v6, 20(\CIJ_REG),0 vlef %v5, 24(\CIJ_REG),2 vlef %v6, 28(\CIJ_REG),2 vldeb %v3,%v3 vldeb %v4,%v4 vldeb %v5,%v5 vldeb %v6,%v6 #endif CalcMultAlpha_4x1 %v3,%v4,%v5,%v6,%v16,%v17,%v18,%v19,\ALPHA_VECREG,\ALPHA_VECI vledb %v3, %v3,0,0 vledb %v4, 
%v4,0,0 vledb %v5, %v5,0,0 vledb %v6, %v6,0,0 vstef %v3, 0(\CIJ_REG),0 vstef %v4, 4(\CIJ_REG),0 vstef %v3, 8(\CIJ_REG),2 vstef %v4, 12(\CIJ_REG),2 vstef %v5, 16(\CIJ_REG),0 vstef %v6, 20(\CIJ_REG),0 vstef %v5, 24(\CIJ_REG),2 vstef %v6, 28(\CIJ_REG),2 la \CIJ_REG,32(\CIJ_REG) .endm .macro ZSTORE_1x4 ALPHA_VECREG,ALPHA_VECI,CIJ_REG , LDC_BYTE_ORIGINAL,LC1,LC2 #if !defined(TRMMKERNEL) la \LC1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) vlef %v3, 0(\CIJ_REG),0 vlef %v4, 4(\CIJ_REG),0 vlef %v3, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vlef %v4, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 la \LC2,0(\LC1,\LDC_BYTE_ORIGINAL ) vlef %v5, 0(\CIJ_REG,\LC1),0 vlef %v6, 4(\CIJ_REG,\LC1),0 vlef %v5, 0(\CIJ_REG,\LC2),2 vlef %v6, 4(\CIJ_REG,\LC2),2 vldeb %v3,%v3 vldeb %v4,%v4 vldeb %v5,%v5 vldeb %v6,%v6 #else la \LC1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) #endif CalcMultAlpha_4x1 %v3,%v4,%v5,%v6,%v16,%v17,%v18,%v19,\ALPHA_VECREG,\ALPHA_VECI #if defined(TRMMKERNEL) la \LC2,0(\LC1,\LDC_BYTE_ORIGINAL ) #endif vledb %v3, %v3,0,0 vledb %v4, %v4,0,0 vledb %v5, %v5,0,0 vledb %v6, %v6,0,0 vstef %v3, 0(\CIJ_REG),0 vstef %v4, 4(\CIJ_REG),0 vstef %v3, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vstef %v4, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vstef %v5, 0(\CIJ_REG,\LC1),0 vstef %v6, 4(\CIJ_REG,\LC1),0 vstef %v5, 0(\CIJ_REG,\LC2),2 vstef %v6, 4(\CIJ_REG,\LC2),2 la \CIJ_REG,8(\CIJ_REG) .endm .macro ZSTORE_2x4 ALPHA_VECREG,ALPHA_VECI,CIJ_REG , LDC_BYTE_ORIGINAL,LC1,LC2 #if !defined(TRMMKERNEL) la \LC1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) vlef %v3, 0(\CIJ_REG),0 vlef %v4, 4(\CIJ_REG),0 vlef %v24, 8(\CIJ_REG),0 vlef %v25, 12(\CIJ_REG),0 vlef %v3, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vlef %v4, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vlef %v24, 8(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vlef %v25, 12(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 la \LC2,0(\LC1,\LDC_BYTE_ORIGINAL ) vlef %v5, 0(\CIJ_REG,\LC1),0 vlef %v6, 4(\CIJ_REG,\LC1),0 vlef %v26, 8(\CIJ_REG,\LC1),0 vlef %v27, 12(\CIJ_REG,\LC1),0 vlef %v5, 0(\CIJ_REG,\LC2),2 vlef %v6, 4(\CIJ_REG,\LC2),2 vlef %v26, 8(\CIJ_REG,\LC2),2 vlef %v27, 12(\CIJ_REG,\LC2),2 vldeb %v3,%v3 vldeb %v4,%v4 vldeb %v5,%v5 vldeb %v6,%v6 vldeb %v24,%v24 vldeb %v25,%v25 vldeb %v26,%v26 vldeb %v27,%v27 #else la \LC1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) #endif CalcMultAlpha_4x1 %v3,%v4,%v5,%v6,%v16,%v17,%v18,%v19,\ALPHA_VECREG,\ALPHA_VECI CalcMultAlpha_4x1 %v24,%v25,%v26,%v27,%v20,%v21,%v22,%v23,\ALPHA_VECREG,\ALPHA_VECI #if defined(TRMMKERNEL) la \LC2,0(\LC1,\LDC_BYTE_ORIGINAL ) #endif vledb %v3, %v3,0,0 vledb %v4, %v4,0,0 vledb %v5, %v5,0,0 vledb %v6, %v6,0,0 vledb %v24, %v24,0,0 vledb %v25, %v25,0,0 vledb %v26, %v26,0,0 vledb %v27, %v27,0,0 vstef %v3, 0(\CIJ_REG),0 vstef %v4, 4(\CIJ_REG),0 vstef %v24, 8(\CIJ_REG),0 vstef %v25, 12(\CIJ_REG),0 vstef %v3, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vstef %v4, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vstef %v24, 8(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vstef %v25, 12(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vstef %v5, 0(\CIJ_REG,\LC1),0 vstef %v6, 4(\CIJ_REG,\LC1),0 vstef %v26, 8(\CIJ_REG,\LC1),0 vstef %v27, 12(\CIJ_REG,\LC1),0 vstef %v5, 0(\CIJ_REG,\LC2),2 vstef %v6, 4(\CIJ_REG,\LC2),2 vstef %v26, 8(\CIJ_REG,\LC2),2 vstef %v27, 12(\CIJ_REG,\LC2),2 la \CIJ_REG,16(\CIJ_REG) .endm .macro ZSTORE_2x2 ALPHA_VECREG,ALPHA_VECI,CIJ_REG , LDC_BYTE_ORIGINAL #if !defined(TRMMKERNEL) vlef %v3, 0(\CIJ_REG),0 vlef %v4, 4(\CIJ_REG),0 vlef %v3, 8(\CIJ_REG),2 vlef %v4, 12(\CIJ_REG),2 vlef %v5, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vlef %v6, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vlef %v5, 8(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vlef %v6, 12(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 
vldeb %v3,%v3 vldeb %v4,%v4 vldeb %v5,%v5 vldeb %v6,%v6 #endif CalcMultAlpha_2x1 %v3,%v4, %v16,%v17,\ALPHA_VECREG,\ALPHA_VECI CalcMultAlpha_2x1 %v5,%v6, %v20,%v21 ,\ALPHA_VECREG,\ALPHA_VECI vledb %v3, %v3,0,0 vledb %v4, %v4,0,0 vledb %v5, %v5,0,0 vledb %v6, %v6,0,0 vstef %v3, 0(\CIJ_REG),0 vstef %v4, 4(\CIJ_REG),0 vstef %v3, 8(\CIJ_REG),2 vstef %v4, 12(\CIJ_REG),2 vstef %v5, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vstef %v6, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vstef %v5, 8(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vstef %v6, 12(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 la \CIJ_REG,16(\CIJ_REG) .endm .macro ZSTORE_2x1 ALPHA_VECREG,ALPHA_VECI,CIJ_REG , LDC_BYTE_ORIGINAL #if !defined(TRMMKERNEL) vlef %v3, 0(\CIJ_REG),0 vlef %v4, 4(\CIJ_REG),0 vlef %v3, 8(\CIJ_REG),2 vlef %v4, 12(\CIJ_REG),2 vldeb %v3,%v3 vldeb %v4,%v4 #endif CalcMultAlpha_2x1 %v3,%v4, %v16,%v17,\ALPHA_VECREG,\ALPHA_VECI vledb %v3, %v3,0,0 vledb %v4, %v4,0,0 vstef %v3, 0(\CIJ_REG),0 vstef %v4, 4(\CIJ_REG),0 vstef %v3, 8(\CIJ_REG),2 vstef %v4, 12(\CIJ_REG),2 la \CIJ_REG,16(\CIJ_REG) .endm .macro ZSTORE_1x2 ALPHA_VECREG,ALPHA_VECI,CIJ_REG , LDC_BYTE_ORIGINAL #if !defined(TRMMKERNEL) vlef %v3, 0(\CIJ_REG),0 vlef %v4, 4(\CIJ_REG),0 vlef %v3, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vlef %v4, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vldeb %v3,%v3 vldeb %v4,%v4 #endif CalcMultAlpha_2x1 %v3,%v4, %v16,%v17,\ALPHA_VECREG,\ALPHA_VECI vledb %v3, %v3,0,0 vledb %v4, %v4,0,0 vstef %v3, 0(\CIJ_REG),0 vstef %v4, 4(\CIJ_REG),0 vstef %v3, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vstef %v4, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 la \CIJ_REG,8(\CIJ_REG) .endm .macro ZSTORE_1x1 ALPHA_RR,ALPHA_RI ,CIJ_REG #if defined (TRMMKERNEL) lzer %f1 lzer %f3 #else le %f1 , 0(\CIJ_REG) le %f3 , 4(\CIJ_REG ) #endif ledbr %f4,\ALPHA_RR ledbr %f5,\ALPHA_RI CalcMultAlpha_1x1 %f1,%f3, %f6,%f7,%f4,%f5 ste %f1,0(\CIJ_REG) ste %f3,4(\CIJ_REG) la \CIJ_REG,8(\CIJ_REG) .endm /****************************TRMM POINTER REFRESH MACROSES*************************/ .macro RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) /* ptrbb = bb;*/ lgr \PTR_B,\B_VAL /*refresh BPOINT*/ #else /* ptrba =ptrba+ off*C_A; ptrbb = bb + off*C_B;*/ .if \C_B==4 .if \C_A==4 sllg \PTR_B, \OFF_VAL,5 agr \PTR_A,\PTR_B /*ptrba+off*4**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==2 sllg \PTR_B, \OFF_VAL,4 la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*2**/ agr \PTR_B, \PTR_B la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==1 sllg \PTR_B, \OFF_VAL,3 agr \PTR_A,\PTR_B /*ptrba+off*4**/ sllg \PTR_B, \OFF_VAL,5 la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .endif .elseif \C_B==2 .if \C_A==4 sllg \PTR_B, \OFF_VAL,4 la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*2**/ agr \PTR_A,\PTR_B /*ptrba+off*2**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==2 sllg \PTR_B, \OFF_VAL,4 agr \PTR_A,\PTR_B /*ptrba+off*2**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==1 sllg \PTR_B, \OFF_VAL,3 la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*1**/ agr \PTR_B,\PTR_B /* off+off**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .endif .elseif \C_B==1 .if \C_A==4 sllg \PTR_B, \OFF_VAL,5 agr \PTR_A,\PTR_B /*ptrba+off*4**/ sllg \PTR_B, \OFF_VAL,3 la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==2 sllg \PTR_B, \OFF_VAL,3 la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*1**/ agr \PTR_A,\PTR_B /*ptrba+off*1**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==1 sllg \PTR_B, \OFF_VAL,3 agr \PTR_A,\PTR_B /*ptrba+off*1**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .endif .endif 
#endif .endm /**/ .macro RefreshTempBk TEMP_VAL,BK_VAL,OFF_VAL,INCR_A,INCR_B #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) /* temp = bk-off;*/ sgrk \TEMP_VAL,\BK_VAL,\OFF_VAL #elif defined(LEFT) /* temp = off+INCR_A; // number of values in A */ la \TEMP_VAL,\INCR_A(\OFF_VAL) #else /* temp = off+INCR_B // number of values in B*/ la \TEMP_VAL,\INCR_B(\OFF_VAL) #endif .endm .macro RefreshPointersAndOFF TEMP_VAL,BK_VAL,OFF_VAL,PTR_A,C_A,C_B #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) /*temp = bk - off;*/ sgrk \TEMP_VAL,\BK_VAL,\OFF_VAL #ifdef LEFT /*temp -= 8; // number of values in A*/ lay \TEMP_VAL,-\C_A(\TEMP_VAL) #else /*temp -= 4; // number of values in B*/ lay \TEMP_VAL,-\C_B(\TEMP_VAL) #endif /*ptrba += temp*C_A; ptrbb += temp*C_B;*/ .if \C_A==4 sllg \TEMP_VAL, \TEMP_VAL,5 /*temp*4*/ .elseif \C_A==2 sllg \TEMP_VAL, \TEMP_VAL,4 /*temp*2*/ .elseif \C_A==1 sllg \TEMP_VAL, \TEMP_VAL,3 /*temp*1*/ .endif la \PTR_A,0(\PTR_A,\TEMP_VAL) /*ptrba+temp*C_A*/ #endif #ifdef LEFT /*off += \c_A; // number of values in A*/ aghi \OFF_VAL,\C_A #endif .endm OpenBLAS-0.2.20/kernel/zarch/ctrmm4x4V.S000066400000000000000000000446471313527062700175060ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2017/03/12 AbdelRauf (quickwritereader@gmail.com) * BLASTEST : passed * CTEST : passed * TEST : passed **************************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" /* BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb, FLOAT* C,BLASLONG ldc, BLASLONG offset) ##bm=r2,bn=r3, bk=r4, alpha=f0,aplhai=f2, ba=r5,bb=r6,stack[160] ,ldc=stack[168] offset=stack[176] **********************************************************************************************/ /*Note: r0 can not be used as address disp register */ #define BM %r2 #define BM_CUR %r0 #define BN %r3 #define BN_CUR %r10 #define BK %r4 #define LDC_BYTE %r8 #define ALPHA %f0 #define ALPHA_I %f2 #define ALPHA_VECT %v0 #define ALPHA_VECT_I %v2 #define LOCAL_VAR1 %r9 #define LOCAL_VAR2 %r1 #define LOCAL_VAR3 %r11 #define A %r5 #define B %r6 #define CIJ %r7 #define CIJ_LOCAL %r12 #define OFF %r13 #define OFFSET %f8 #define ALIGN_4 .align 32 #define ALIGN_2 .align 16 #define PREFETCH_INS 1 /**************************Include kernel helper macrosses**********************************/ #include "ckernelMacrosV.S" /***********************************CGEMM**4x4*******************************************************/ PROLOGUE #if defined(TRMMKERNEL) std OFFSET ,40(%r15) stmg %r6,%r13,48(%r15) #else stmg %r6,%r12,48(%r15) #endif std %f9, 128(%r15) std %f10,136(%r15) std %f11,144(%r15) std %f12,152(%r15) lg CIJ, 160(%r15) lg LOCAL_VAR1, 168(%r15) #if defined(TRMMKERNEL) lg OFF,176(%r15) ldgr OFFSET ,OFF #endif srlg BN_CUR,BN,2 #if defined(RR) || defined(RC) || defined(CR) || defined(CC) lcdbr ALPHA_I,ALPHA_I lcdbr ALPHA ,ALPHA #endif vrepg ALPHA_VECT,ALPHA_VECT,0 /*replicate alpha which in f0*/ sllg LDC_BYTE, LOCAL_VAR1,3 /*calculate lcd stride with complex=8 x<<4 */ vrepg ALPHA_VECT_I,ALPHA_VECT_I,0 /*replicate alpha which in f0*/ vldeb ALPHA_VECT,ALPHA_VECT vldeb ALPHA_VECT_I,ALPHA_VECT_I #if defined(TRMMKERNEL) && !defined(LEFT) /*off = -offset;*/ lgdr LOCAL_VAR1,OFFSET lcgr OFF,LOCAL_VAR1 #endif cijle BN_CUR,0,.LX2 ALIGN_4 .LX4_BN: #if defined(PREFETCH_INS) pfd 1, 0(A) pfd 1, 0(B) #endif #if defined(TRMMKERNEL) 
&& defined(LEFT) /*off = offset;*/ lgdr OFF,OFFSET #endif srlg BM_CUR,BM,2 lgr LOCAL_VAR3,A lgr CIJ_LOCAL,CIJ cijle BM_CUR,0,.L2x4 ALIGN_4 .L4x4_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,4 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_ZCVEC_4x4 cijle LOCAL_VAR1,0,.L4x4_mod ALIGN_4 .L4x4_4_BK: /*BK_CUR LOOP */ ZCALC_4x4_4 LOCAL_VAR3,LOCAL_VAR2 #if defined(PREFETCH_INS) pfd 1, 128(LOCAL_VAR3) /*256-128*/ pfd 1, 128(LOCAL_VAR2 ) #endif brctg LOCAL_VAR1,.L4x4_4_BK ALIGN_4 .L4x4_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,4,4 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x4_BK_Store ALIGN_4 .L4x4_BK: /*BK_CUR LOOP */ ZCALC_4x4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x4_BK ALIGN_4 .L4x4_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ ZSTORE_4x4 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE,LOCAL_VAR1,LOCAL_VAR2 #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,4,4 #endif brctg BM_CUR,.L4x4_BM ALIGN_2 .L2x4: tmll BM,2 jz .L1x4 ALIGN_4 .L2x4_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,4 RefreshTempBk LOCAL_VAR1,BK,OFF,2,4 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_ZCVEC_2x4 cijle LOCAL_VAR1,0,.L2x4_mod ALIGN_4 .L2x4_4_BK: /*BK_CUR LOOP */ ZCALC_2x4_4 LOCAL_VAR3,LOCAL_VAR2 #if defined(PREFETCH_INS) pfd 1, 128(LOCAL_VAR2) #endif brctg LOCAL_VAR1,.L2x4_4_BK ALIGN_4 .L2x4_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,2,4 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x4_BK_Store ALIGN_4 .L2x4_BK: /*BK_CUR LOOP */ ZCALC_2x4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x4_BK ALIGN_4 .L2x4_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ ZSTORE_2x4 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE ,LOCAL_VAR1,LOCAL_VAR2 #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,2,4 #endif ALIGN_4 .L1x4: tmll BM,1 jz .Lx4_INNER_END ALIGN_4 .L1x4_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,4 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_ZCVEC_1x4 cijle LOCAL_VAR1,0,.L1x4_mod ALIGN_4 .L1x4_4_BK: /*BK_CUR LOOP */ ZCALC_1x4_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x4_4_BK ALIGN_4 .L1x4_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,1,4 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x4_BK_Store ALIGN_4 .L1x4_BK: /*BK_CUR LOOP */ ZCALC_1x4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x4_BK ALIGN_4 .L1x4_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ ZSTORE_1x4 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE,LOCAL_VAR1,LOCAL_VAR2 #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,1,4 #endif ALIGN_2 .Lx4_INNER_END: /*add LDC_BYTE_COPY to new*/ sllg LOCAL_VAR1,LDC_BYTE,2 /*multiply*4 */ #if defined(TRMMKERNEL) && !defined(LEFT) aghi OFF,4 #endif sllg LOCAL_VAR2,BK,5 /*multiply*4*sizeof(complex) =multiply*4*8* 2**5 */ la CIJ,0(CIJ,LOCAL_VAR1) /*refresh CIJ=CIJ+LDC_BYTE*4*/ la 
B,0(B,LOCAL_VAR2) /*refresh B=B+Bk*4*sizeof(complex) */ brctg BN_CUR,.LX4_BN /*********************************X2 SECTION************************************************/ ALIGN_4 .LX2: tmll BN,2 jz .Lx1 ALIGN_4 .Lx2_BN: #if defined(TRMMKERNEL) && defined(LEFT) /*off = offset;*/ lgdr OFF,OFFSET #endif srlg BM_CUR,BM,2 lgr LOCAL_VAR3,A lgr CIJ_LOCAL,CIJ cijle BM_CUR,0,.L2x2 ALIGN_4 .L4x2_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,2 RefreshTempBk LOCAL_VAR1,BK,OFF,4,2 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_ZCVEC_4x2 cijle LOCAL_VAR1,0,.L4x2_mod ALIGN_4 .L4x2_4_BK: /*BK_CUR LOOP */ ZCALC_4x2_4 LOCAL_VAR3,LOCAL_VAR2 #if defined(PREFETCH_INS) pfd 1, 128(LOCAL_VAR3) #endif brctg LOCAL_VAR1,.L4x2_4_BK ALIGN_4 .L4x2_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,4,2 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x2_BK_Store ALIGN_4 .L4x2_BK: /*BK_CUR LOOP */ ZCALC_4x2 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x2_BK ALIGN_4 .L4x2_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ ZSTORE_4x2 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,4,2 #endif ALIGN_4 brctg BM_CUR,.L4x2_BM ALIGN_2 .L2x2: tmll BM,2 jz .L1x2 ALIGN_4 .L2x2_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,2 RefreshTempBk LOCAL_VAR1,BK,OFF,2,2 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_ZCVEC_2x2 cijle LOCAL_VAR1,0,.L2x2_mod ALIGN_4 .L2x2_4_BK: /*BK_CUR LOOP */ ZCALC_2x2_4 LOCAL_VAR3,LOCAL_VAR2 #if defined(PREFETCH_INS) pfd 1, 256(LOCAL_VAR3) pfd 1, 256(LOCAL_VAR2) #endif brctg LOCAL_VAR1,.L2x2_4_BK ALIGN_4 .L2x2_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,2,2 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x2_BK_Store ALIGN_4 .L2x2_BK: /*BK_CUR LOOP */ ZCALC_2x2 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x2_BK ALIGN_4 .L2x2_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ ZSTORE_2x2 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,2,2 #endif ALIGN_2 .L1x2: tmll BM,1 jz .Lx2_INNER_END ALIGN_4 .L1x2_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,2 RefreshTempBk LOCAL_VAR1,BK,OFF,1,2 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_ZCVEC_1x2 cijle LOCAL_VAR1,0,.L1x2_mod ALIGN_4 .L1x2_4_BK: /*BK_CUR LOOP */ ZCALC_1x2_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x2_4_BK ALIGN_4 .L1x2_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,1,2 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x2_BK_Store ALIGN_4 .L1x2_BK: /*BK_CUR LOOP */ ZCALC_1x2 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x2_BK ALIGN_4 .L1x2_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ ZSTORE_1x2 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,1,2 #endif ALIGN_2 .Lx2_INNER_END: /*add LDC_BYTE_COPY to new*/ la LOCAL_VAR1,0(LDC_BYTE,LDC_BYTE) /*multiply*2 */ sllg 
LOCAL_VAR2,BK,4 /*multiply*2*sizeof(complex) =multiply*2*8 2^4 */ la CIJ,0(CIJ,LOCAL_VAR1) /*refresh CIJ=CIJ+LDC_BYTE*2*/ #if defined(TRMMKERNEL) && !defined(LEFT) aghi OFF,2 #endif la B,0(B,LOCAL_VAR2) /*refresh B=B+Bk*2*sizeof(complex) */ /*********************************X1 SECTION************************************************/ ALIGN_2 .Lx1: tmll BN,1 jz .L_FUNC_END ALIGN_4 .Lx1_BN: #if defined(TRMMKERNEL) && defined(LEFT) /*off = offset;*/ lgdr OFF,OFFSET #endif srlg BM_CUR,BM,2 lgr LOCAL_VAR3,A lgr CIJ_LOCAL,CIJ cijle BM_CUR,0,.L2x1 ALIGN_4 .L4x1_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,1 RefreshTempBk LOCAL_VAR1,BK,OFF,4,1 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_ZCVEC_4x1 cijle LOCAL_VAR1,0,.L4x1_mod ALIGN_4 .L4x1_4_BK: /*BK_CUR LOOP */ ZCALC_4x1_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x1_4_BK ALIGN_4 .L4x1_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,4,1 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x1_BK_Store ALIGN_4 .L4x1_BK: /*BK_CUR LOOP */ ZCALC_4x1 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x1_BK ALIGN_4 .L4x1_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ ZSTORE_4x1 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,4,1 #endif ALIGN_4 brctg BM_CUR , .L4x1_BM ALIGN_2 .L2x1: tmll BM,2 jz .L1x1 ALIGN_4 .L2x1_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,1 RefreshTempBk LOCAL_VAR1,BK,OFF,2,1 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_ZCVEC_2x1 cijle LOCAL_VAR1,0,.L2x1_mod ALIGN_4 .L2x1_4_BK: /*BK_CUR LOOP */ ZCALC_2x1_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x1_4_BK ALIGN_4 .L2x1_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,2,1 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x1_BK_Store ALIGN_4 .L2x1_BK: /*BK_CUR LOOP */ ZCALC_2x1 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x1_BK ALIGN_4 .L2x1_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ ZSTORE_2x1 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,2,1 #endif ALIGN_2 .L1x1: tmll BM, 1 jz .Lx1_INNER_END ALIGN_4 .L1x1_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,1 RefreshTempBk LOCAL_VAR1,BK,OFF,1,1 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_ZCVEC_1x1 cijle LOCAL_VAR1,0,.L1x1_mod ALIGN_4 .L1x1_4_BK: /*BK_CUR LOOP */ ZCALC_1x1_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x1_4_BK ALIGN_4 .L1x1_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,1,1 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x1_BK_Store ALIGN_4 .L1x1_BK: /*BK_CUR LOOP */ ZCALC_1x1 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x1_BK ALIGN_4 .L1x1_BK_Store: /*store C and use CIJ_COPY for mem storing*/ ZSTORE_1x1 ALPHA,ALPHA_I ,CIJ_LOCAL #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,1,1 #endif ALIGN_2 .Lx1_INNER_END: /*add LDC_BYTE_COPY to new*/ sllg LOCAL_VAR2,BK,3 /*multiply*1*sizeof(complex) 
=multiply*1*8* 2^3 */ la CIJ,0(CIJ,LDC_BYTE) /*refresh CIJ=CIJ+LDC_BYTE */ #if defined(TRMMKERNEL) && !defined(LEFT) aghi OFF,1 #endif la B,0(B,LOCAL_VAR2) /*refresh B=B+Bk*1*sizeof(complex) */ ALIGN_2 .L_FUNC_END: /*end*/ #if defined(TRMMKERNEL) ld OFFSET,40(%r15) lmg %r6,%r13,48(%r15) #else lmg %r6,%r12,48(%r15) #endif ld %f9, 128(%r15) ld %f10,136(%r15) ld %f11,144(%r15) ld %f12,152(%r15) br %r14 .end OpenBLAS-0.2.20/kernel/zarch/gemm8x4V.S000066400000000000000000000356201313527062700173040ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2017/01/01 AbdelRauf (quickwritereader@gmail.com) * BLASTEST : OK * CTEST : OK * TEST : OK **************************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" /* #BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ##bm=r2,bn=r3, bk=r4, alpha=f0,ba=r5,bb=r6,stack[160] ,ldc=stack[168] **********************************************************************************************/ /*Note: r0 can not be used as address disp register */ #define BM %r2 #define BM_CUR %r0 #define BN %r3 #define BN_CUR %r10 #define BK %r4 #define LDC_BYTE %r8 #define ALPHA %f0 #define ALPHA_VECT %v0 #define LOCAL_VAR1 %r9 #define LOCAL_VAR2 %r1 #define LOCAL_VAR3 %r11 #define A %r5 #define B %r6 #define CIJ %r7 #define CIJ_LOCAL %r12 #define ALIGN_4 .align 16 #define ALIGN_2 .align 8 #define PREFETCH_INS 1 #include "kernelMacros.S" /***********************************DGEMM***********************************************************/ PROLOGUE stmg %r6,%r12,48(%r15) lg CIJ, 160(%r15) lg LOCAL_VAR1, 168(%r15) srlg BN_CUR,BN,2 vrepg ALPHA_VECT,ALPHA_VECT,0 /*replicate alpha which in f0*/ sllg LDC_BYTE, LOCAL_VAR1,3 /*calculate lcd stride with bytes double=8 x<<3 */ cijle BN_CUR,0,.LX2 ALIGN_4 .LX4_BN: #if defined(PREFETCH_INS) pfd 1, 0(A) pfd 1, 256(A) pfd 1, 0(B) pfd 1, 256(B) #endif srlg BM_CUR,BM,3 lgr LOCAL_VAR3,A lgr CIJ_LOCAL,CIJ cijle BM_CUR,0,.L4x4 ALIGN_4 .L8x4_BM: /*BM_CUR LOOP */ srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ ZERO_CVEC_8x4 cijle LOCAL_VAR1,0,.L8x4_mod ALIGN_4 .L8x4_4_BK: /*BK_CUR LOOP */ #if defined(PREFETCH_INS) pfd 1, 512(LOCAL_VAR3) #endif CALC_8x4_4 LOCAL_VAR3,LOCAL_VAR2 #if defined(PREFETCH_INS) pfd 1, 512(LOCAL_VAR2) #endif brctg LOCAL_VAR1,.L8x4_4_BK ALIGN_4 .L8x4_mod: la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L8x4_BK_Store ALIGN_4 .L8x4_BK: /*BK_CUR LOOP */ CALC_8x4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L8x4_BK ALIGN_4 .L8x4_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_8x4 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE brctg BM_CUR,.L8x4_BM ALIGN_4 .L4x4: tmll BM,4 jz .L2x4 ALIGN_4 .L4x4_BM: /*BM start*/ srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ ZERO_CVEC_4x4 cijle LOCAL_VAR1,0,.L4x4_mod ALIGN_4 .L4x4_4_BK: /*BK_CUR LOOP */ CALC_4x4_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x4_4_BK ALIGN_4 .L4x4_mod: la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L4x4_BK_Store ALIGN_4 .L4x4_BK: /*BK_CUR LOOP */ CALC_4x4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x4_BK ALIGN_4 .L4x4_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_4x4 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE ALIGN_2 .L2x4: tmll BM,2 jz .L1x4 ALIGN_4 .L2x4_BM: /*BM start*/ srlg LOCAL_VAR1,BK,2 /*refresh BK*/ 
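/* Loop bookkeeping used by every BM block in this file: LOCAL_VAR1 = BK>>2
   counts groups of four k-iterations handled by the unrolled CALC_*_4 macros,
   while the *_mod/*_BK tail reruns the single-step CALC_* macro for the
   remaining BK&3 iterations (la 3 + NGR with BK). */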
lgr LOCAL_VAR2,B /*refresh BPOINT*/ ZERO_CVEC_2x4 cijle LOCAL_VAR1,0,.L2x4_mod ALIGN_4 .L2x4_4_BK: /*BK_CUR LOOP */ CALC_2x4_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x4_4_BK ALIGN_4 .L2x4_mod: la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L2x4_BK_Store ALIGN_4 .L2x4_BK: /*BK_CUR LOOP */ CALC_2x4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x4_BK ALIGN_4 .L2x4_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_2x4 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE ALIGN_4 .L1x4: tmll BM,1 jz .Lx4_INNER_END ALIGN_4 .L1x4_BM: /*BM start*/ srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ ZERO_CVEC_1x4 cijle LOCAL_VAR1,0,.L1x4_mod ALIGN_4 .L1x4_4_BK: /*BK_CUR LOOP */ CALC_1x4_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x4_4_BK ALIGN_4 .L1x4_mod: la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L1x4_BK_Store ALIGN_4 .L1x4_BK: /*BK_CUR LOOP */ CALC_1x4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x4_BK ALIGN_4 .L1x4_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_1x4 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE ALIGN_2 .Lx4_INNER_END: /*add LDC_BYTE_COPY to new*/ sllg LOCAL_VAR1,LDC_BYTE,2 /*multiply*4 */ sllg LOCAL_VAR2,BK,5 /*muyliply*4*sizeof(double) =multiply*32* 2**5 */ la CIJ,0(CIJ,LOCAL_VAR1) /*refresh CIJ=CIJ+LDC_BYTE*4*/ la B,0(B,LOCAL_VAR2) /*refresh B=B+Bk*4*sizeof(double) */ brctg BN_CUR,.LX4_BN /*********************************X2 SECTION************************************************/ ALIGN_4 .LX2: tmll BN,2 jz .Lx1 ALIGN_4 .Lx2_BN: srlg BM_CUR,BM,3 lgr LOCAL_VAR3,A lgr CIJ_LOCAL,CIJ cijle BM_CUR,0,.L4x2 ALIGN_4 .L8x2_BM: /*BM_CUR LOOP */ srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ ZERO_CVEC_8x2 cijle LOCAL_VAR1,0,.L8x2_mod ALIGN_4 .L8x2_4_BK: /*BK_CUR LOOP */ #if defined(PREFETCH_INS) pfd 1, 256(LOCAL_VAR3) pfd 1,64(LOCAL_VAR2) #endif CALC_8x2_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L8x2_4_BK ALIGN_4 .L8x2_mod: la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L8x2_BK_Store ALIGN_4 .L8x2_BK: /*BK_CUR LOOP */ CALC_8x2 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L8x2_BK ALIGN_4 .L8x2_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_8x2 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE ALIGN_4 brctg BM_CUR,.L8x2_BM ALIGN_2 .L4x2: tmll BM,4 jz .L2x2 ALIGN_4 .L4x2_BM: /*BM start*/ srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ ZERO_CVEC_4x2 cijle LOCAL_VAR1,0,.L4x2_mod ALIGN_4 .L4x2_4_BK: /*BK_CUR LOOP */ CALC_4x2_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x2_4_BK ALIGN_4 .L4x2_mod: la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L4x2_BK_Store ALIGN_4 .L4x2_BK: /*BK_CUR LOOP */ CALC_4x2 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x2_BK ALIGN_4 .L4x2_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_4x2 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE ALIGN_2 .L2x2: tmll BM,2 jz .L1x2 ALIGN_4 .L2x2_BM: /*BM start*/ srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ ZERO_CVEC_2x2 cijle LOCAL_VAR1,0,.L2x2_mod ALIGN_4 .L2x2_4_BK: /*BK_CUR LOOP */ CALC_2x2_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x2_4_BK ALIGN_4 .L2x2_mod: la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L2x2_BK_Store ALIGN_4 .L2x2_BK: /*BK_CUR LOOP */ CALC_2x2 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x2_BK ALIGN_4 .L2x2_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_2x2 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE ALIGN_2 .L1x2: tmll BM,1 jz .Lx2_INNER_END ALIGN_4 .L1x2_BM: /*BM start*/ srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B 
/*refresh BPOINT*/ ZERO_CVEC_1x2 cijle LOCAL_VAR1,0,.L1x2_mod ALIGN_4 .L1x2_4_BK: /*BK_CUR LOOP */ CALC_1x2_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x2_4_BK ALIGN_4 .L1x2_mod: la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L1x2_BK_Store ALIGN_4 .L1x2_BK: /*BK_CUR LOOP */ CALC_1x2 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x2_BK ALIGN_4 .L1x2_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_1x2 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE ALIGN_2 .Lx2_INNER_END: /*add LDC_BYTE_COPY to new*/ la LOCAL_VAR1,0(LDC_BYTE,LDC_BYTE) /*multiply*2 */ sllg LOCAL_VAR2,BK,4 /*muyliply*2*sizeof(double) =multiply*16* 2**4 */ la CIJ,0(CIJ,LOCAL_VAR1) /*refresh CIJ=CIJ+LDC_BYTE*4*/ la B,0(B,LOCAL_VAR2) /*refresh B=B+Bk*4*sizeof(double) */ /*********************************X1 SECTION************************************************/ ALIGN_2 .Lx1: tmll BN,1 jz .L_FUNC_END ALIGN_4 .Lx1_BN: srlg BM_CUR,BM,3 lgr LOCAL_VAR3,A lgr CIJ_LOCAL,CIJ cijle BM_CUR,0,.L4x1 ALIGN_4 .L8x1_BM: /*BM_CUR LOOP */ srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ ZERO_CVEC_8x1 cijle LOCAL_VAR1,0,.L8x1_mod ALIGN_4 .L8x1_4_BK: /*BK_CUR LOOP */ #if defined(PREFETCH_INS) pfd 1, 256(LOCAL_VAR3) #endif CALC_8x1_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L8x1_4_BK ALIGN_4 .L8x1_mod: la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L8x1_BK_Store ALIGN_4 .L8x1_BK: /*BK_CUR LOOP */ CALC_8x1 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L8x1_BK ALIGN_4 .L8x1_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_8x1 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE ALIGN_4 brctg BM_CUR,.L8x1_BM ALIGN_2 .L4x1: tmll BM,4 jz .L2x1 ALIGN_4 .L4x1_BM: /*BM start*/ srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ ZERO_CVEC_4x1 cijle LOCAL_VAR1,0,.L4x1_mod ALIGN_4 .L4x1_4_BK: /*BK_CUR LOOP */ CALC_4x1_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x1_4_BK ALIGN_4 .L4x1_mod: la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L4x1_BK_Store ALIGN_4 .L4x1_BK: /*BK_CUR LOOP */ CALC_4x1 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x1_BK ALIGN_4 .L4x1_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_4x1 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE ALIGN_2 .L2x1: tmll BM,2 jz .L1x1 ALIGN_4 .L2x1_BM: /*BM start*/ srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ ZERO_CVEC_2x1 cijle LOCAL_VAR1,0,.L2x1_mod ALIGN_4 .L2x1_4_BK: /*BK_CUR LOOP */ CALC_2x1_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x1_4_BK ALIGN_4 .L2x1_mod: la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L2x1_BK_Store ALIGN_4 .L2x1_BK: /*BK_CUR LOOP */ CALC_2x1 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x1_BK ALIGN_4 .L2x1_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_2x1 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE ALIGN_2 .L1x1: tmll BM, 1 jz .Lx1_INNER_END ALIGN_4 .L1x1_BM: /*BM start*/ srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ ZERO_CVEC_1x1 cijle LOCAL_VAR1,0,.L1x1_mod ALIGN_4 .L1x1_4_BK: /*BK_CUR LOOP */ CALC_1x1_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x1_4_BK ALIGN_4 .L1x1_mod: la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L1x1_BK_Store ALIGN_4 .L1x1_BK: /*BK_CUR LOOP */ CALC_1x1 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x1_BK ALIGN_4 .L1x1_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_1x1 ALPHA ,CIJ_LOCAL, LDC_BYTE ALIGN_2 .Lx1_INNER_END: /*add LDC_BYTE_COPY to new*/ sllg LOCAL_VAR2,BK,3 /*muyliply*2*sizeof(double) =multiply*8* 2**3 */ la CIJ,0(CIJ,LDC_BYTE) /*refresh CIJ=CIJ+LDC_BYTE */ 
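/* In this BN=1 tail LOCAL_VAR2 = BK<<3 = BK*1*sizeof(double): B advances by
   the one packed column just consumed and CIJ by a single LDC_BYTE column
   (the "*2" in the comment above looks like a leftover from the BN=2 case). */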
la B,0(B,LOCAL_VAR2) /*refresh B=B+Bk*1*sizeof(double) */ ALIGN_2 .L_FUNC_END: /*end*/ lmg %r6,%r12,48(%r15) br %r14 .end OpenBLAS-0.2.20/kernel/zarch/kernelMacros.S000066400000000000000000001275521313527062700203200ustar00rootroot00000000000000/*********************************KERNEL 8x4***********************************************/ /*Zero C block Vectors*/ .macro ZERO_CVEC_8x4 vzero %v16 vzero %v17 vzero %v18 vzero %v19 vzero %v20 vzero %v21 vzero %v22 vzero %v23 vzero %v24 vzero %v25 vzero %v26 vzero %v27 vzero %v28 vzero %v29 vzero %v30 vzero %v31 .endm /*Calculate for 8x4 C blocks*/ .macro CALC_8x4 PTR_A_REG,PTR_B_REG vlrepg %v7, 0(\PTR_B_REG) vlrepg %v1,8(\PTR_B_REG) vl %v2, 0(\PTR_A_REG) vl %v3, 16(\PTR_A_REG) vl %v4, 32(\PTR_A_REG) vl %v5, 48(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v18,%v4,%v7,%v18 vfmadb %v19,%v5,%v7,%v19 vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 vlrepg %v7,16(\PTR_B_REG) vfmadb %v22,%v4,%v1,%v22 vfmadb %v23,%v5,%v1,%v23 vlrepg %v1,24(\PTR_B_REG) vfmadb %v24,%v2,%v7,%v24 vfmadb %v25,%v3,%v7,%v25 vfmadb %v26,%v4,%v7,%v26 la \PTR_A_REG, 64(\PTR_A_REG) vfmadb %v27,%v5,%v7,%v27 vfmadb %v28,%v2,%v1,%v28 vfmadb %v29,%v3,%v1,%v29 la \PTR_B_REG, 32(\PTR_B_REG) vfmadb %v30,%v4,%v1,%v30 vfmadb %v31,%v5,%v1,%v31 .endm /*Calculate for 8x4_4 C blocks*/ .macro CALC_8x4_4 PTR_A_REG,PTR_B_REG vlrepg %v7, 0(\PTR_B_REG) vlrepg %v1,8(\PTR_B_REG) vl %v2, 0(\PTR_A_REG) vl %v3, 16(\PTR_A_REG) vl %v4, 32(\PTR_A_REG) vl %v5, 48(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v18,%v4,%v7,%v18 vfmadb %v19,%v5,%v7,%v19 vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 vlrepg %v7,16(\PTR_B_REG) vfmadb %v22,%v4,%v1,%v22 vfmadb %v23,%v5,%v1,%v23 vlrepg %v1,24(\PTR_B_REG) vfmadb %v24,%v2,%v7,%v24 vfmadb %v25,%v3,%v7,%v25 vfmadb %v26,%v4,%v7,%v26 vfmadb %v27,%v5,%v7,%v27 vfmadb %v28,%v2,%v1,%v28 vfmadb %v29,%v3,%v1,%v29 vfmadb %v30,%v4,%v1,%v30 vfmadb %v31,%v5,%v1,%v31 vlrepg %v7, 32(\PTR_B_REG) vlrepg %v1,40(\PTR_B_REG) vl %v2, 64(\PTR_A_REG) vl %v3, 80(\PTR_A_REG) vl %v4, 96(\PTR_A_REG) vl %v5, 112(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v18,%v4,%v7,%v18 vfmadb %v19,%v5,%v7,%v19 vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 vlrepg %v7,48(\PTR_B_REG) vfmadb %v22,%v4,%v1,%v22 vfmadb %v23,%v5,%v1,%v23 vlrepg %v1,56(\PTR_B_REG) vfmadb %v24,%v2,%v7,%v24 vfmadb %v25,%v3,%v7,%v25 vfmadb %v26,%v4,%v7,%v26 vfmadb %v27,%v5,%v7,%v27 vfmadb %v28,%v2,%v1,%v28 vfmadb %v29,%v3,%v1,%v29 vfmadb %v30,%v4,%v1,%v30 vfmadb %v31,%v5,%v1,%v31 vlrepg %v7, 64(\PTR_B_REG) vlrepg %v1,72(\PTR_B_REG) vl %v2, 128(\PTR_A_REG) vl %v3, 144(\PTR_A_REG) vl %v4, 160(\PTR_A_REG) vl %v5, 176(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v18,%v4,%v7,%v18 vfmadb %v19,%v5,%v7,%v19 vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 vlrepg %v7,80(\PTR_B_REG) vfmadb %v22,%v4,%v1,%v22 vfmadb %v23,%v5,%v1,%v23 vlrepg %v1,88(\PTR_B_REG) vfmadb %v24,%v2,%v7,%v24 vfmadb %v25,%v3,%v7,%v25 vfmadb %v26,%v4,%v7,%v26 vfmadb %v27,%v5,%v7,%v27 vfmadb %v28,%v2,%v1,%v28 vfmadb %v29,%v3,%v1,%v29 vfmadb %v30,%v4,%v1,%v30 vfmadb %v31,%v5,%v1,%v31 vlrepg %v7, 96(\PTR_B_REG) vlrepg %v1,104(\PTR_B_REG) vl %v2, 192(\PTR_A_REG) vl %v3, 208(\PTR_A_REG) vl %v4, 224(\PTR_A_REG) vl %v5, 240(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v18,%v4,%v7,%v18 vfmadb %v19,%v5,%v7,%v19 vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 vlrepg %v7,112(\PTR_B_REG) vfmadb %v22,%v4,%v1,%v22 vfmadb %v23,%v5,%v1,%v23 vlrepg 
%v1,120(\PTR_B_REG) vfmadb %v24,%v2,%v7,%v24 vfmadb %v25,%v3,%v7,%v25 vfmadb %v26,%v4,%v7,%v26 vfmadb %v27,%v5,%v7,%v27 la \PTR_B_REG, 128(\PTR_B_REG) vfmadb %v28,%v2,%v1,%v28 vfmadb %v29,%v3,%v1,%v29 vfmadb %v30,%v4,%v1,%v30 la \PTR_A_REG, 256(\PTR_A_REG) vfmadb %v31,%v5,%v1,%v31 .endm /*STORE C8X4*/ .macro STORE_8x4 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL /*add LDC_BYTE_reg=LDC_BYTE_original<<1 */ la LOCAL_VAR1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) vl %v1,0(\CIJ_REG) vfmadb %v1,%v16,\ALPHA_VECREG,%v1 vst %v1,0(\CIJ_REG) vl %v2,16(\CIJ_REG) vfmadb %v2,%v17,\ALPHA_VECREG,%v2 vst %v2,16(\CIJ_REG) vl %v3,32(\CIJ_REG) vfmadb %v3,%v18,\ALPHA_VECREG,%v3 vst %v3,32(\CIJ_REG) vl %v4,48(\CIJ_REG) vfmadb %v4,%v19,\ALPHA_VECREG,%v4 vst %v4,48(\CIJ_REG) la LOCAL_VAR2,0(LOCAL_VAR1,\LDC_BYTE_ORIGINAL ) /*add c LDC_BYTE*/ vl %v1,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmadb %v1,%v20,\ALPHA_VECREG,%v1 vst %v1,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vl %v2,16(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmadb %v2,%v21,\ALPHA_VECREG,%v2 vst %v2,16(\CIJ_REG,\LDC_BYTE_ORIGINAL) vl %v3,32(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmadb %v3,%v22,\ALPHA_VECREG,%v3 vst %v3,32(\CIJ_REG,\LDC_BYTE_ORIGINAL) vl %v4,48(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmadb %v4,%v23,\ALPHA_VECREG,%v4 vst %v4,48(\CIJ_REG,\LDC_BYTE_ORIGINAL) vl %v1,0(\CIJ_REG,LOCAL_VAR1) vfmadb %v1,%v24,\ALPHA_VECREG,%v1 vst %v1,0(\CIJ_REG,LOCAL_VAR1) vl %v2,16(\CIJ_REG,LOCAL_VAR1) vfmadb %v2,%v25,\ALPHA_VECREG,%v2 vst %v2,16(\CIJ_REG,LOCAL_VAR1) vl %v3,32(\CIJ_REG,LOCAL_VAR1) vfmadb %v3,%v26,\ALPHA_VECREG,%v3 vst %v3,32(\CIJ_REG,LOCAL_VAR1) vl %v4,48(\CIJ_REG,LOCAL_VAR1) vfmadb %v4,%v27,\ALPHA_VECREG,%v4 vst %v4,48(\CIJ_REG,LOCAL_VAR1) vl %v1,0(\CIJ_REG,LOCAL_VAR2) vfmadb %v1,%v28,\ALPHA_VECREG,%v1 vst %v1,0(\CIJ_REG,LOCAL_VAR2) vl %v2,16(\CIJ_REG,LOCAL_VAR2) vfmadb %v2,%v29,\ALPHA_VECREG,%v2 vst %v2,16(\CIJ_REG,LOCAL_VAR2) vl %v3,32(\CIJ_REG,LOCAL_VAR2) vfmadb %v3,%v30,\ALPHA_VECREG,%v3 vst %v3,32(\CIJ_REG,LOCAL_VAR2) vl %v4,48(\CIJ_REG,LOCAL_VAR2) vfmadb %v4,%v31,\ALPHA_VECREG,%v4 vst %v4,48(\CIJ_REG,LOCAL_VAR2) la \CIJ_REG,64(\CIJ_REG) .endm /*STORE TRMM C8X4*/ .macro STORE_TRMM_8x4 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL /*add LDC_BYTE_reg=LDC_BYTE_original<<1 */ la LOCAL_VAR1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) vfmdb %v1,%v16,\ALPHA_VECREG vst %v1,0(\CIJ_REG) vfmdb %v2,%v17,\ALPHA_VECREG vst %v2,16(\CIJ_REG) vfmdb %v3,%v18,\ALPHA_VECREG vst %v3,32(\CIJ_REG) vfmdb %v4,%v19,\ALPHA_VECREG vst %v4,48(\CIJ_REG) la LOCAL_VAR2,0(LOCAL_VAR1,\LDC_BYTE_ORIGINAL ) /*add c LDC_BYTE*/ vfmdb %v1,%v20,\ALPHA_VECREG vst %v1,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmdb %v2,%v21,\ALPHA_VECREG vst %v2,16(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmdb %v3,%v22,\ALPHA_VECREG vst %v3,32(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmdb %v4,%v23,\ALPHA_VECREG vst %v4,48(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmdb %v1,%v24,\ALPHA_VECREG vst %v1,0(\CIJ_REG,LOCAL_VAR1) vfmdb %v2,%v25,\ALPHA_VECREG vst %v2,16(\CIJ_REG,LOCAL_VAR1) vfmdb %v3,%v26,\ALPHA_VECREG vst %v3,32(\CIJ_REG,LOCAL_VAR1) vfmdb %v4,%v27,\ALPHA_VECREG vst %v4,48(\CIJ_REG,LOCAL_VAR1) vfmdb %v1,%v28,\ALPHA_VECREG vst %v1,0(\CIJ_REG,LOCAL_VAR2) vfmdb %v2,%v29,\ALPHA_VECREG vst %v2,16(\CIJ_REG,LOCAL_VAR2) vfmdb %v3,%v30,\ALPHA_VECREG vst %v3,32(\CIJ_REG,LOCAL_VAR2) vfmdb %v4,%v31,\ALPHA_VECREG vst %v4,48(\CIJ_REG,LOCAL_VAR2) la \CIJ_REG,64(\CIJ_REG) .endm /**************************************Kernel4x4*************************************************/ /*Zero C block Vectors*/ .macro ZERO_CVEC_4x4 vzero %v16 vzero %v17 vzero %v20 vzero %v21 vzero %v24 vzero %v25 vzero %v28 vzero %v29 
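/* Accumulator layout for the 4x4 double block: each 128-bit vector register
   holds two rows of C, so every column of B gets a register pair --
   %v16/%v17, %v20/%v21, %v24/%v25 and %v28/%v29 -- which CALC_4x4 below
   updates with vfmadb. */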
.endm /*Calculate for 4x4 C blocks*/ .macro CALC_4x4 PTR_A_REG,PTR_B_REG vlrepg %v7, 0(\PTR_B_REG) vlrepg %v1,8(\PTR_B_REG) vl %v2, 0(\PTR_A_REG) vl %v3, 16(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 vlrepg %v7,16(\PTR_B_REG) vlrepg %v1,24(\PTR_B_REG) vfmadb %v24,%v2,%v7,%v24 vfmadb %v25,%v3,%v7,%v25 la \PTR_A_REG, 32(\PTR_A_REG) vfmadb %v28,%v2,%v1,%v28 vfmadb %v29,%v3,%v1,%v29 la \PTR_B_REG, 32(\PTR_B_REG) .endm .macro CALC_4x4_4 PTR_A_REG,PTR_B_REG vlrepg %v7, 0(\PTR_B_REG) vlrepg %v1,8(\PTR_B_REG) vl %v2, 0(\PTR_A_REG) vl %v3, 16(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 vlrepg %v7,16(\PTR_B_REG) vlrepg %v1,24(\PTR_B_REG) vfmadb %v24,%v2,%v7,%v24 vfmadb %v25,%v3,%v7,%v25 vfmadb %v28,%v2,%v1,%v28 vfmadb %v29,%v3,%v1,%v29 vlrepg %v7, 32(\PTR_B_REG) vlrepg %v1,40(\PTR_B_REG) vl %v2, 32(\PTR_A_REG) vl %v3, 48(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 vlrepg %v7,48(\PTR_B_REG) vlrepg %v1,56(\PTR_B_REG) vfmadb %v24,%v2,%v7,%v24 vfmadb %v25,%v3,%v7,%v25 vfmadb %v28,%v2,%v1,%v28 vfmadb %v29,%v3,%v1,%v29 vlrepg %v7, 64(\PTR_B_REG) vlrepg %v1,72(\PTR_B_REG) vl %v2, 64(\PTR_A_REG) vl %v3, 80(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 vlrepg %v7,80(\PTR_B_REG) vlrepg %v1,88(\PTR_B_REG) vfmadb %v24,%v2,%v7,%v24 vfmadb %v25,%v3,%v7,%v25 vfmadb %v28,%v2,%v1,%v28 vfmadb %v29,%v3,%v1,%v29 vlrepg %v7, 96(\PTR_B_REG) vlrepg %v1,104(\PTR_B_REG) vl %v2, 96(\PTR_A_REG) vl %v3, 112(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 vlrepg %v7,112(\PTR_B_REG) la \PTR_A_REG, 128(\PTR_A_REG) vlrepg %v1,120(\PTR_B_REG) vfmadb %v24,%v2,%v7,%v24 vfmadb %v25,%v3,%v7,%v25 vfmadb %v28,%v2,%v1,%v28 la \PTR_B_REG, 128(\PTR_B_REG) vfmadb %v29,%v3,%v1,%v29 .endm /*STORE C4X4*/ .macro STORE_4x4 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL /*add LDC_BYTE_reg=LDC_BYTE_original<<1 */ la LOCAL_VAR1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) vl %v1,0(\CIJ_REG) vfmadb %v1,%v16,\ALPHA_VECREG,%v1 vst %v1,0(\CIJ_REG) vl %v2,16(\CIJ_REG) vfmadb %v2,%v17,\ALPHA_VECREG,%v2 vst %v2,16(\CIJ_REG) la LOCAL_VAR2,0(LOCAL_VAR1,\LDC_BYTE_ORIGINAL ) /*add c LDC_BYTE*/ vl %v1,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmadb %v1,%v20,\ALPHA_VECREG,%v1 vst %v1,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vl %v2,16(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmadb %v2,%v21,\ALPHA_VECREG,%v2 vst %v2,16(\CIJ_REG,\LDC_BYTE_ORIGINAL) vl %v1,0(\CIJ_REG,LOCAL_VAR1) vfmadb %v1,%v24,\ALPHA_VECREG,%v1 vst %v1,0(\CIJ_REG,LOCAL_VAR1) vl %v2,16(\CIJ_REG,LOCAL_VAR1) vfmadb %v2,%v25,\ALPHA_VECREG,%v2 vst %v2,16(\CIJ_REG,LOCAL_VAR1) vl %v1,0(\CIJ_REG,LOCAL_VAR2) vfmadb %v1,%v28,\ALPHA_VECREG,%v1 vst %v1,0(\CIJ_REG,LOCAL_VAR2) vl %v2,16(\CIJ_REG,LOCAL_VAR2) vfmadb %v2,%v29,\ALPHA_VECREG,%v2 vst %v2,16(\CIJ_REG,LOCAL_VAR2) la \CIJ_REG,32(\CIJ_REG) .endm /*STORE TRMM C4X4*/ .macro STORE_TRMM_4x4 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL /*add LDC_BYTE_reg=LDC_BYTE_original<<1 */ la LOCAL_VAR1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) vfmdb %v1,%v16,\ALPHA_VECREG vst %v1,0(\CIJ_REG) vfmdb %v2,%v17,\ALPHA_VECREG vst %v2,16(\CIJ_REG) la LOCAL_VAR2,0(LOCAL_VAR1,\LDC_BYTE_ORIGINAL ) vfmdb %v1,%v20,\ALPHA_VECREG vst %v1,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmdb %v2,%v21,\ALPHA_VECREG vst %v2,16(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmdb %v1,%v24,\ALPHA_VECREG vst %v1,0(\CIJ_REG,LOCAL_VAR1) 
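/* Unlike the plain STORE_* macros, the TRMM stores never read C back:
   vfmdb computes alpha*acc and vst simply overwrites the destination,
   i.e. C = alpha*A*B with no accumulation into the previous C. */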
vfmdb %v2,%v25,\ALPHA_VECREG vst %v2,16(\CIJ_REG,LOCAL_VAR1) vfmdb %v1,%v28,\ALPHA_VECREG vst %v1,0(\CIJ_REG,LOCAL_VAR2) vfmdb %v2,%v29,\ALPHA_VECREG vst %v2,16(\CIJ_REG,LOCAL_VAR2) la \CIJ_REG,32(\CIJ_REG) .endm /**************************************Kernel2x4*************************************************/ /*Zero C block Vectors*/ .macro ZERO_CVEC_2x4 vzero %v1 /*a1b1 a1b2 */ vzero %v2 /*a1b3 a1b4 */ vzero %v6 /*a2b1 a2b2 */ vzero %v7 /*a2b3 a2b4 */ .endm /*Calculate for 2x4_4 C blocks.This Time BroadCast A. but Load B multiple*/ .macro CALC_2x4_4 PTR_A_REG,PTR_B_REG vl %v4, 0(\PTR_B_REG) vl %v5,16(\PTR_B_REG) vlrepg %v3, 0(\PTR_A_REG) vlrepg %v16, 8(\PTR_A_REG) vfmadb %v1,%v3,%v4,%v1 vfmadb %v2,%v3,%v5,%v2 vfmadb %v6,%v16,%v4,%v6 vfmadb %v7,%v16,%v5,%v7 vl %v4, 32(\PTR_B_REG) vl %v5,48(\PTR_B_REG) vlrepg %v3, 16(\PTR_A_REG) vlrepg %v16, 24(\PTR_A_REG) vfmadb %v1,%v3,%v4,%v1 vfmadb %v2,%v3,%v5,%v2 vfmadb %v6,%v16,%v4,%v6 vfmadb %v7,%v16,%v5,%v7 vl %v4, 64(\PTR_B_REG) vl %v5,80(\PTR_B_REG) vlrepg %v3, 32(\PTR_A_REG) vlrepg %v16, 40(\PTR_A_REG) vfmadb %v1,%v3,%v4,%v1 vfmadb %v2,%v3,%v5,%v2 vfmadb %v6,%v16,%v4,%v6 vfmadb %v7,%v16,%v5,%v7 vl %v4, 96(\PTR_B_REG) vl %v5,112(\PTR_B_REG) vlrepg %v3, 48(\PTR_A_REG) vlrepg %v16, 56(\PTR_A_REG) vfmadb %v1,%v3,%v4,%v1 vfmadb %v2,%v3,%v5,%v2 la \PTR_B_REG, 128(\PTR_B_REG) vfmadb %v6,%v16,%v4,%v6 vfmadb %v7,%v16,%v5,%v7 la \PTR_A_REG, 64(\PTR_A_REG) .endm /*Calculate for 2x4 C blocks.This Time BroadCast A. but Load B multiple*/ .macro CALC_2x4 PTR_A_REG,PTR_B_REG vl %v4, 0(\PTR_B_REG) vl %v5,16(\PTR_B_REG) vlrepg %v3, 0(\PTR_A_REG) vlrepg %v16, 8(\PTR_A_REG) vfmadb %v1,%v3,%v4,%v1 vfmadb %v2,%v3,%v5,%v2 la \PTR_A_REG, 16(\PTR_A_REG) vfmadb %v6,%v16,%v4,%v6 vfmadb %v7,%v16,%v5,%v7 la \PTR_B_REG, 32(\PTR_B_REG) .endm .macro STORE_2x4 ALPHA_REG,CIJ_REG , LDC_BYTE_ORIGINAL /**/ vfmdb %v1,%v1,\ALPHA_REG vfmdb %v2,%v2,\ALPHA_REG vfmdb %v6,%v6,\ALPHA_REG vfmdb %v7,%v7,\ALPHA_REG vrepg %v4,%v1,1 vrepg %v5,%v6,1 la LOCAL_VAR1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) adb %f1, 0(\CIJ_REG) std %f1,0(\CIJ_REG) adb %f6, 8(\CIJ_REG) std %f6,8(\CIJ_REG) adb %f4,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) std %f4,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) adb %f5,8(\CIJ_REG,\LDC_BYTE_ORIGINAL) std %f5,8(\CIJ_REG,\LDC_BYTE_ORIGINAL) /*add LDC_BYTE */ la LOCAL_VAR2,0(LOCAL_VAR1,\LDC_BYTE_ORIGINAL ) vrepg %v4,%v2,1 vrepg %v5,%v7,1 adb %f2,0(\CIJ_REG,LOCAL_VAR1) std %f2,0(\CIJ_REG,LOCAL_VAR1) adb %f7,8(\CIJ_REG,LOCAL_VAR1) std %f7,8(\CIJ_REG,LOCAL_VAR1) adb %f4,0(\CIJ_REG,LOCAL_VAR2) std %f4,0(\CIJ_REG,LOCAL_VAR2) adb %f5,8(\CIJ_REG,LOCAL_VAR2) std %f5,8(\CIJ_REG,LOCAL_VAR2) la \CIJ_REG,16(\CIJ_REG) .endm .macro STORE_TRMM_2x4 ALPHA_REG,CIJ_REG , LDC_BYTE_ORIGINAL /**/ vfmdb %v1,%v1,\ALPHA_REG vfmdb %v2,%v2,\ALPHA_REG vfmdb %v6,%v6,\ALPHA_REG vfmdb %v7,%v7,\ALPHA_REG vrepg %v4,%v1,1 vrepg %v5,%v6,1 la LOCAL_VAR1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) std %f1,0(\CIJ_REG) std %f6,8(\CIJ_REG) std %f4,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) std %f5,8(\CIJ_REG,\LDC_BYTE_ORIGINAL) /*add LDC_BYTE */ la LOCAL_VAR2,0(LOCAL_VAR1,\LDC_BYTE_ORIGINAL ) vrepg %v4,%v2,1 vrepg %v5,%v7,1 std %f2,0(\CIJ_REG,LOCAL_VAR1) std %f7,8(\CIJ_REG,LOCAL_VAR1) std %f4,0(\CIJ_REG,LOCAL_VAR2) std %f5,8(\CIJ_REG,LOCAL_VAR2) la \CIJ_REG,16(\CIJ_REG) .endm /**************************************Kernel1x4*************************************************/ /*Zero C block Vectors*/ .macro ZERO_CVEC_1x4 vzero %v1 vzero %v2 .endm /*Calculate for 1x4 C blocks.This Time BroadCast A. 
but Load B multiple*/ .macro CALC_1x4 PTR_A_REG,PTR_B_REG vl %v4, 0(\PTR_B_REG) vl %v5,16(\PTR_B_REG) vlrepg %v3, 0(\PTR_A_REG) vfmadb %v1,%v3,%v4,%v1 la \PTR_A_REG, 8(\PTR_A_REG) vfmadb %v2,%v3,%v5,%v2 la \PTR_B_REG, 32(\PTR_B_REG) .endm /*Calculate for 1x4_4 C blocks.This Time BroadCast A. but Load B multiple*/ .macro CALC_1x4_4 PTR_A_REG,PTR_B_REG vl %v4, 0(\PTR_B_REG) vl %v5,16(\PTR_B_REG) vlrepg %v3, 0(\PTR_A_REG) vfmadb %v1,%v3,%v4,%v1 vfmadb %v2,%v3,%v5,%v2 vl %v4, 32(\PTR_B_REG) vl %v5,48(\PTR_B_REG) vlrepg %v3, 8(\PTR_A_REG) vfmadb %v1,%v3,%v4,%v1 vfmadb %v2,%v3,%v5,%v2 vl %v4, 64(\PTR_B_REG) vl %v5,80(\PTR_B_REG) vlrepg %v3, 16(\PTR_A_REG) vfmadb %v1,%v3,%v4,%v1 vfmadb %v2,%v3,%v5,%v2 vl %v4, 96(\PTR_B_REG) vl %v5,112(\PTR_B_REG) vlrepg %v3, 24(\PTR_A_REG) vfmadb %v1,%v3,%v4,%v1 vfmadb %v2,%v3,%v5,%v2 la \PTR_A_REG, 32(\PTR_A_REG) la \PTR_B_REG, 128(\PTR_B_REG) .endm .macro STORE_1x4 ALPHA_REG,CIJ_REG , LDC_BYTE_ORIGINAL /**/ vfmdb %v1,%v1,\ALPHA_REG vfmdb %v2,%v2,\ALPHA_REG vrepg %v4,%v1,1 vrepg %v5,%v2,1 la LOCAL_VAR1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) adb %f1, 0(\CIJ_REG) std %f1,0(\CIJ_REG) adb %f4,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) std %f4,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) /*add LDC_BYTE */ la LOCAL_VAR2,0(LOCAL_VAR1,\LDC_BYTE_ORIGINAL ) adb %f2,0(\CIJ_REG,LOCAL_VAR1) std %f2,0(\CIJ_REG,LOCAL_VAR1) adb %f5,0(\CIJ_REG,LOCAL_VAR2) std %f5,0(\CIJ_REG,LOCAL_VAR2) la \CIJ_REG,8(\CIJ_REG) .endm .macro STORE_TRMM_1x4 ALPHA_REG,CIJ_REG , LDC_BYTE_ORIGINAL /**/ vfmdb %v1,%v1,\ALPHA_REG vfmdb %v2,%v2,\ALPHA_REG vrepg %v4,%v1,1 vrepg %v5,%v2,1 la LOCAL_VAR1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) std %f1,0(\CIJ_REG) std %f4,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) /*add LDC_BYTE */ la LOCAL_VAR2,0(LOCAL_VAR1,\LDC_BYTE_ORIGINAL ) std %f2,0(\CIJ_REG,LOCAL_VAR1) std %f5,0(\CIJ_REG,LOCAL_VAR2) la \CIJ_REG,8(\CIJ_REG) .endm /***************************************BN=2 SECTION***************************************/ /*************************************Kernel8x2***************************************************/ /*Zero C block Vectors*/ .macro ZERO_CVEC_8x2 vzero %v16 vzero %v17 vzero %v18 vzero %v19 vzero %v20 vzero %v21 vzero %v22 vzero %v23 .endm /*Calculate for 8x2 C blocks*/ .macro CALC_8x2 PTR_A_REG,PTR_B_REG vlrepg %v7, 0(\PTR_B_REG) vlrepg %v1,8(\PTR_B_REG) vl %v2, 0(\PTR_A_REG) vl %v3, 16(\PTR_A_REG) vl %v4, 32(\PTR_A_REG) vl %v5, 48(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v18,%v4,%v7,%v18 vfmadb %v19,%v5,%v7,%v19 vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 la \PTR_A_REG, 64(\PTR_A_REG) vfmadb %v22,%v4,%v1,%v22 vfmadb %v23,%v5,%v1,%v23 la \PTR_B_REG, 16(\PTR_B_REG) .endm /*Calculate for 8x2_4 C blocks*/ .macro CALC_8x2_4 PTR_A_REG,PTR_B_REG vlrepg %v7, 0(\PTR_B_REG) vlrepg %v1,8(\PTR_B_REG) vl %v2, 0(\PTR_A_REG) vl %v3, 16(\PTR_A_REG) vl %v4, 32(\PTR_A_REG) vl %v5, 48(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v18,%v4,%v7,%v18 vfmadb %v19,%v5,%v7,%v19 vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 vfmadb %v22,%v4,%v1,%v22 vfmadb %v23,%v5,%v1,%v23 vlrepg %v7, 16(\PTR_B_REG) vlrepg %v1,24(\PTR_B_REG) vl %v2, 64(\PTR_A_REG) vl %v3, 80(\PTR_A_REG) vl %v4, 96(\PTR_A_REG) vl %v5, 112(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v18,%v4,%v7,%v18 vfmadb %v19,%v5,%v7,%v19 vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 vfmadb %v22,%v4,%v1,%v22 vfmadb %v23,%v5,%v1,%v23 vlrepg %v7, 32(\PTR_B_REG) vlrepg %v1,40(\PTR_B_REG) vl %v2, 128(\PTR_A_REG) vl %v3, 144(\PTR_A_REG) vl %v4, 160(\PTR_A_REG) vl %v5, 
176(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v18,%v4,%v7,%v18 vfmadb %v19,%v5,%v7,%v19 vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 vfmadb %v22,%v4,%v1,%v22 vfmadb %v23,%v5,%v1,%v23 vlrepg %v7, 48(\PTR_B_REG) vlrepg %v1,56(\PTR_B_REG) vl %v2, 192(\PTR_A_REG) vl %v3, 208(\PTR_A_REG) vl %v4, 224(\PTR_A_REG) vl %v5, 240(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v18,%v4,%v7,%v18 vfmadb %v19,%v5,%v7,%v19 la \PTR_B_REG, 64(\PTR_B_REG) vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 vfmadb %v22,%v4,%v1,%v22 vfmadb %v23,%v5,%v1,%v23 la \PTR_A_REG, 256(\PTR_A_REG) .endm /*STORE C8X2*/ .macro STORE_8x2 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL vl %v1,0(\CIJ_REG) vfmadb %v1,%v16,\ALPHA_VECREG,%v1 vst %v1,0(\CIJ_REG) vl %v2,16(\CIJ_REG) vfmadb %v2,%v17,\ALPHA_VECREG,%v2 vst %v2,16(\CIJ_REG) vl %v3,32(\CIJ_REG) vfmadb %v3,%v18,\ALPHA_VECREG,%v3 vst %v3,32(\CIJ_REG) vl %v4,48(\CIJ_REG) vfmadb %v4,%v19,\ALPHA_VECREG,%v4 vst %v4,48(\CIJ_REG) vl %v1,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmadb %v1,%v20,\ALPHA_VECREG,%v1 vst %v1,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vl %v2,16(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmadb %v2,%v21,\ALPHA_VECREG,%v2 vst %v2,16(\CIJ_REG,\LDC_BYTE_ORIGINAL) vl %v3,32(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmadb %v3,%v22,\ALPHA_VECREG,%v3 vst %v3,32(\CIJ_REG,\LDC_BYTE_ORIGINAL) vl %v4,48(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmadb %v4,%v23,\ALPHA_VECREG,%v4 vst %v4,48(\CIJ_REG,\LDC_BYTE_ORIGINAL) la \CIJ_REG,64(\CIJ_REG) .endm /*STORE TRMM C8X2*/ .macro STORE_TRMM_8x2 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL vfmdb %v1,%v16,\ALPHA_VECREG vst %v1,0(\CIJ_REG) vfmdb %v2,%v17,\ALPHA_VECREG vst %v2,16(\CIJ_REG) vfmdb %v3,%v18,\ALPHA_VECREG vst %v3,32(\CIJ_REG) vfmdb %v4,%v19,\ALPHA_VECREG vst %v4,48(\CIJ_REG) vfmdb %v1,%v20,\ALPHA_VECREG vst %v1,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmdb %v2,%v21,\ALPHA_VECREG vst %v2,16(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmdb %v3,%v22,\ALPHA_VECREG vst %v3,32(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmdb %v4,%v23,\ALPHA_VECREG vst %v4,48(\CIJ_REG,\LDC_BYTE_ORIGINAL) la \CIJ_REG,64(\CIJ_REG) .endm /*************************************Kernel4x2***************************************************/ /*Zero C block Vectors*/ .macro ZERO_CVEC_4x2 vzero %v16 vzero %v17 vzero %v20 vzero %v21 .endm /*Calculate for 4x2 C blocks*/ .macro CALC_4x2 PTR_A_REG,PTR_B_REG vlrepg %v7, 0(\PTR_B_REG) vlrepg %v1,8(\PTR_B_REG) vl %v2, 0(\PTR_A_REG) vl %v3, 16(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 la \PTR_A_REG, 32(\PTR_A_REG) vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 la \PTR_B_REG, 16(\PTR_B_REG) .endm /*Calculate for 4x2_4 C blocks*/ .macro CALC_4x2_4 PTR_A_REG,PTR_B_REG vlrepg %v7, 0(\PTR_B_REG) vlrepg %v1,8(\PTR_B_REG) vl %v2, 0(\PTR_A_REG) vl %v3, 16(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 vlrepg %v7, 16(\PTR_B_REG) vlrepg %v1,24(\PTR_B_REG) vl %v2, 32(\PTR_A_REG) vl %v3, 48(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 vlrepg %v7, 32(\PTR_B_REG) vlrepg %v1,40(\PTR_B_REG) vl %v2, 64(\PTR_A_REG) vl %v3, 80(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 vlrepg %v7, 48(\PTR_B_REG) vlrepg %v1,56(\PTR_B_REG) vl %v2, 96(\PTR_A_REG) vl %v3, 112(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 la \PTR_B_REG, 64(\PTR_B_REG) vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 la \PTR_A_REG, 128(\PTR_A_REG) .endm /*STORE C4x2*/ .macro 
STORE_4x2 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL vl %v1,0(\CIJ_REG) vfmadb %v1,%v16,\ALPHA_VECREG,%v1 vst %v1,0(\CIJ_REG) vl %v2,16(\CIJ_REG) vfmadb %v2,%v17,\ALPHA_VECREG,%v2 vst %v2,16(\CIJ_REG) vl %v1,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmadb %v1,%v20,\ALPHA_VECREG,%v1 vst %v1,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vl %v2,16(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmadb %v2,%v21,\ALPHA_VECREG,%v2 vst %v2,16(\CIJ_REG,\LDC_BYTE_ORIGINAL) la \CIJ_REG,32(\CIJ_REG) .endm /*STORE TRMM C4x2*/ .macro STORE_TRMM_4x2 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL vfmdb %v1,%v16,\ALPHA_VECREG vst %v1,0(\CIJ_REG) vfmdb %v2,%v17,\ALPHA_VECREG vst %v2,16(\CIJ_REG) vfmdb %v1,%v20,\ALPHA_VECREG vst %v1,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmdb %v2,%v21,\ALPHA_VECREG vst %v2,16(\CIJ_REG,\LDC_BYTE_ORIGINAL) la \CIJ_REG,32(\CIJ_REG) .endm /*************************************Kernel2x2***************************************************/ /*Zero C block Vectors*/ .macro ZERO_CVEC_2x2 vzero %v16 vzero %v20 .endm /*Calculate for 2x2 C blocks*/ .macro CALC_2x2 PTR_A_REG,PTR_B_REG vlrepg %v7, 0(\PTR_B_REG) vlrepg %v1,8(\PTR_B_REG) vl %v2, 0(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 la \PTR_A_REG, 16(\PTR_A_REG) vfmadb %v20,%v2,%v1,%v20 la \PTR_B_REG, 16(\PTR_B_REG) .endm /*Calculate for 2x2_4 C blocks*/ .macro CALC_2x2_4 PTR_A_REG,PTR_B_REG vlrepg %v7, 0(\PTR_B_REG) vlrepg %v1,8(\PTR_B_REG) vl %v2, 0(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v20,%v2,%v1,%v20 vlrepg %v7, 16(\PTR_B_REG) vlrepg %v1,24(\PTR_B_REG) vl %v2, 16(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v20,%v2,%v1,%v20 vlrepg %v7, 32(\PTR_B_REG) vlrepg %v1,40(\PTR_B_REG) vl %v2, 32(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v20,%v2,%v1,%v20 vlrepg %v7, 48(\PTR_B_REG) vlrepg %v1,56(\PTR_B_REG) vl %v2, 48(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v20,%v2,%v1,%v20 la \PTR_B_REG, 64(\PTR_B_REG) la \PTR_A_REG, 64(\PTR_A_REG) .endm /*STORE C2x2*/ .macro STORE_2x2 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL vl %v1,0(\CIJ_REG) vfmadb %v1,%v16,\ALPHA_VECREG,%v1 vst %v1,0(\CIJ_REG) vl %v1,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vfmadb %v1,%v20,\ALPHA_VECREG,%v1 vst %v1,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) la \CIJ_REG,16(\CIJ_REG) .endm /*STORE TRMM C2x2*/ .macro STORE_TRMM_2x2 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL vfmdb %v1,%v16,\ALPHA_VECREG vst %v1,0(\CIJ_REG) vfmdb %v1,%v20,\ALPHA_VECREG vst %v1,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) la \CIJ_REG,16(\CIJ_REG) .endm /**************************************Kernel1x2*************************************************/ /*Zero C block Vectors*/ .macro ZERO_CVEC_1x2 vzero %v1 .endm /*Calculate for 1x2 C blocks.This Time BroadCast A. 
but Load B multiple*/ .macro CALC_1x2 PTR_A_REG,PTR_B_REG vl %v4, 0(\PTR_B_REG) vlrepg %v3, 0(\PTR_A_REG) la \PTR_B_REG, 16(\PTR_B_REG) vfmadb %v1,%v3,%v4,%v1 la \PTR_A_REG, 8(\PTR_A_REG) .endm .macro CALC_1x2_4 PTR_A_REG,PTR_B_REG vl %v4, 0(\PTR_B_REG) vlrepg %v3, 0(\PTR_A_REG) vfmadb %v1,%v3,%v4,%v1 vl %v4, 16(\PTR_B_REG) vlrepg %v3, 8(\PTR_A_REG) vfmadb %v1,%v3,%v4,%v1 vl %v4, 32(\PTR_B_REG) vlrepg %v3, 16(\PTR_A_REG) vfmadb %v1,%v3,%v4,%v1 vl %v4, 48(\PTR_B_REG) vlrepg %v3, 24(\PTR_A_REG) vfmadb %v1,%v3,%v4,%v1 la \PTR_B_REG, 64(\PTR_B_REG) la \PTR_A_REG, 32(\PTR_A_REG) .endm .macro STORE_1x2 ALPHA_REG,CIJ_REG , LDC_BYTE_ORIGINAL /**/ vfmdb %v1,%v1,\ALPHA_REG vrepg %v4,%v1,1 adb %f1, 0(\CIJ_REG) std %f1,0(\CIJ_REG) adb %f4,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) std %f4,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) la \CIJ_REG,8(\CIJ_REG) .endm .macro STORE_TRMM_1x2 ALPHA_REG,CIJ_REG , LDC_BYTE_ORIGINAL /**/ vfmdb %v1,%v1,\ALPHA_REG vrepg %v4,%v1,1 std %f1,0(\CIJ_REG) std %f4,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) la \CIJ_REG,8(\CIJ_REG) .endm /**************************************BN=1*******************************************************/ /*************************************Kernel8x1***************************************************/ /*Zero C block Vectors*/ .macro ZERO_CVEC_8x1 vzero %v16 vzero %v17 vzero %v18 vzero %v19 .endm /*Calculate for 8x1 C blocks*/ .macro CALC_8x1 PTR_A_REG,PTR_B_REG vlrepg %v7, 0(\PTR_B_REG) vl %v2, 0(\PTR_A_REG) vl %v3, 16(\PTR_A_REG) vl %v4, 32(\PTR_A_REG) vl %v5, 48(\PTR_A_REG) la \PTR_B_REG, 8(\PTR_B_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v18,%v4,%v7,%v18 la \PTR_A_REG, 64(\PTR_A_REG) vfmadb %v19,%v5,%v7,%v19 .endm /*Calculate for 8x1_4 C blocks*/ .macro CALC_8x1_4 PTR_A_REG,PTR_B_REG vlrepg %v7, 0(\PTR_B_REG) vl %v2, 0(\PTR_A_REG) vl %v3, 16(\PTR_A_REG) vl %v4, 32(\PTR_A_REG) vl %v5, 48(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v18,%v4,%v7,%v18 vfmadb %v19,%v5,%v7,%v19 vlrepg %v7, 8(\PTR_B_REG) vl %v2, 64(\PTR_A_REG) vl %v3, 80(\PTR_A_REG) vl %v4, 96(\PTR_A_REG) vl %v5, 112(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v18,%v4,%v7,%v18 vfmadb %v19,%v5,%v7,%v19 vlrepg %v7, 16(\PTR_B_REG) vl %v2, 128(\PTR_A_REG) vl %v3, 144(\PTR_A_REG) vl %v4, 160(\PTR_A_REG) vl %v5, 176(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v18,%v4,%v7,%v18 vfmadb %v19,%v5,%v7,%v19 vlrepg %v7, 24(\PTR_B_REG) vl %v2, 192(\PTR_A_REG) vl %v3, 208(\PTR_A_REG) vl %v4, 224(\PTR_A_REG) vl %v5, 240(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v18,%v4,%v7,%v18 vfmadb %v19,%v5,%v7,%v19 la \PTR_A_REG, 256(\PTR_A_REG) la \PTR_B_REG, 32(\PTR_B_REG) .endm /*STORE C8X1*/ .macro STORE_8x1 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL vl %v1,0(\CIJ_REG) vfmadb %v1,%v16,\ALPHA_VECREG,%v1 vst %v1,0(\CIJ_REG) vl %v2,16(\CIJ_REG) vfmadb %v2,%v17,\ALPHA_VECREG,%v2 vst %v2,16(\CIJ_REG) vl %v3,32(\CIJ_REG) vfmadb %v3,%v18,\ALPHA_VECREG,%v3 vst %v3,32(\CIJ_REG) vl %v4,48(\CIJ_REG) vfmadb %v4,%v19,\ALPHA_VECREG,%v4 vst %v4,48(\CIJ_REG) la \CIJ_REG,64(\CIJ_REG) .endm /*STORE TRMM C8X1*/ .macro STORE_TRMM_8x1 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL vfmdb %v1,%v16,\ALPHA_VECREG vst %v1,0(\CIJ_REG) vfmdb %v2,%v17,\ALPHA_VECREG vst %v2,16(\CIJ_REG) vfmdb %v3,%v18,\ALPHA_VECREG vst %v3,32(\CIJ_REG) vfmdb %v4,%v19,\ALPHA_VECREG vst %v4,48(\CIJ_REG) la \CIJ_REG,64(\CIJ_REG) .endm /*************************************Kernel4x1***************************************************/ /*Zero C block Vectors*/ .macro 
ZERO_CVEC_4x1 vzero %v16 vzero %v17 .endm /*Calculate for 4x1 C blocks*/ .macro CALC_4x1 PTR_A_REG,PTR_B_REG vlrepg %v7, 0(\PTR_B_REG) vl %v2, 0(\PTR_A_REG) vl %v3, 16(\PTR_A_REG) la \PTR_B_REG, 8(\PTR_B_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 la \PTR_A_REG, 32(\PTR_A_REG) .endm /*Calculate for 4x1_4 C blocks*/ .macro CALC_4x1_4 PTR_A_REG,PTR_B_REG vlrepg %v7, 0(\PTR_B_REG) vl %v2, 0(\PTR_A_REG) vl %v3, 16(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vlrepg %v7, 8(\PTR_B_REG) vl %v2, 32(\PTR_A_REG) vl %v3, 48(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vlrepg %v7, 16(\PTR_B_REG) vl %v2, 64(\PTR_A_REG) vl %v3, 80(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vlrepg %v7, 24(\PTR_B_REG) vl %v2, 96(\PTR_A_REG) vl %v3, 112(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 la \PTR_B_REG, 32(\PTR_B_REG) la \PTR_A_REG, 128(\PTR_A_REG) .endm /*STORE C4X1*/ .macro STORE_4x1 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL vl %v1,0(\CIJ_REG) vfmadb %v1,%v16,\ALPHA_VECREG,%v1 vst %v1,0(\CIJ_REG) vl %v2,16(\CIJ_REG) vfmadb %v2,%v17,\ALPHA_VECREG,%v2 vst %v2,16(\CIJ_REG) la \CIJ_REG,32(\CIJ_REG) .endm /*STORE TRMM C4X1*/ .macro STORE_TRMM_4x1 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL vfmdb %v1,%v16,\ALPHA_VECREG vst %v1,0(\CIJ_REG) vfmdb %v2,%v17,\ALPHA_VECREG vst %v2,16(\CIJ_REG) la \CIJ_REG,32(\CIJ_REG) .endm /*************************************Kernel2x1***************************************************/ /*Zero C block Vectors*/ .macro ZERO_CVEC_2x1 vzero %v16 .endm /*Calculate for 2x1 C blocks*/ .macro CALC_2x1 PTR_A_REG,PTR_B_REG vlrepg %v7, 0(\PTR_B_REG) vl %v2, 0(\PTR_A_REG) la \PTR_B_REG, 8(\PTR_B_REG) vfmadb %v16,%v2,%v7,%v16 la \PTR_A_REG, 16(\PTR_A_REG) .endm /*Calculate for 2x1_4 C blocks*/ .macro CALC_2x1_4 PTR_A_REG,PTR_B_REG vlrepg %v7, 0(\PTR_B_REG) vl %v2, 0(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vlrepg %v7, 8(\PTR_B_REG) vl %v2, 16(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vlrepg %v7, 16(\PTR_B_REG) vl %v2, 32(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 vlrepg %v7, 24(\PTR_B_REG) vl %v2, 48(\PTR_A_REG) vfmadb %v16,%v2,%v7,%v16 la \PTR_B_REG, 32(\PTR_B_REG) la \PTR_A_REG, 64(\PTR_A_REG) .endm /*STORE C2X1*/ .macro STORE_2x1 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL vl %v1,0(\CIJ_REG) vfmadb %v1,%v16,\ALPHA_VECREG,%v1 vst %v1,0(\CIJ_REG) la \CIJ_REG,16(\CIJ_REG) .endm /*STORE TRMM C2X1*/ .macro STORE_TRMM_2x1 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL vfmdb %v1,%v16,\ALPHA_VECREG vst %v1,0(\CIJ_REG) la \CIJ_REG,16(\CIJ_REG) .endm /*************************************Kernel1x1***************************************************/ /*Zero C block Vectors*/ .macro ZERO_CVEC_1x1 LZDR %f1 .endm /*Calculate for 1x1 C blocks*/ .macro CALC_1x1 PTR_A_REG,PTR_B_REG ld %f2,0(\PTR_A_REG) /**a*/ la \PTR_A_REG,8(\PTR_A_REG) madb %f1,%f2,0(\PTR_B_REG) la \PTR_B_REG,8(\PTR_B_REG) .endm /*Calculate for 1x1_4 C blocks*/ .macro CALC_1x1_4 PTR_A_REG,PTR_B_REG ld %f2,0(\PTR_A_REG) /**a*/ madb %f1,%f2,0(\PTR_B_REG) ld %f2,8(\PTR_A_REG) /**a*/ madb %f1,%f2,8(\PTR_B_REG) ld %f2,16(\PTR_A_REG) /**a*/ madb %f1,%f2,16(\PTR_B_REG) ld %f2,24(\PTR_A_REG) /**a*/ madb %f1,%f2,24(\PTR_B_REG) la \PTR_A_REG,32(\PTR_A_REG) la \PTR_B_REG,32(\PTR_B_REG) .endm /*STORE C1X1*/ .macro STORE_1x1 ALPHA_FLOAT,CIJ_REG,LDC_BYTE_ORIGINAL ld %f2,0(CIJ_LOCAL) madbr %f2,%f1,\ALPHA_FLOAT std %f2,0(CIJ_LOCAL) la \CIJ_REG,8(\CIJ_REG) .endm /*STORE C1X1*/ .macro STORE_TRMM_1x1 ALPHA_FLOAT,CIJ_REG,LDC_BYTE_ORIGINAL mdbr %f1,\ALPHA_FLOAT std %f1,0(CIJ_LOCAL) la \CIJ_REG,8(\CIJ_REG) .endm 
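/*--------------------------------------------------------------------------------------
  A consolidated, hedged sketch of what the TRMM pointer-refresh macros below implement,
  written as C-style pseudocode gathered from the inline comments of RefreshPointers,
  RefreshTempBk and RefreshPointersAndOFF. The names ptrba, ptrbb, bb, off, temp and bk
  are taken from those comments; C_A and C_B are the macro arguments (tile sizes in A
  and B). In this double-precision file the shift amounts inside the macros appear to
  fold in the extra multiplication by sizeof(double) = 8.

  // before the inner BK loop (RefreshPointers + RefreshTempBk):
  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
      ptrbb = bb;                      // restart the packed B panel
  #else
      ptrba += off * C_A;              // skip the part of A already covered by the offset
      ptrbb  = bb + off * C_B;         // and the matching part of B
  #endif

  #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
      temp = bk - off;
  #elif defined(LEFT)
      temp = off + C_A;                // number of values in A
  #else
      temp = off + C_B;                // number of values in B
  #endif

  // after the C tile has been stored (RefreshPointersAndOFF):
  #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
      temp = bk - off;
  #ifdef LEFT
      temp -= C_A;                     // number of values in A
  #else
      temp -= C_B;                     // number of values in B
  #endif
      ptrba += temp * C_A;             // ptrbb is not needed any more, so it is not refreshed
  #endif
  #ifdef LEFT
      off += C_A;
  #endif
--------------------------------------------------------------------------------------*/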
/****************************TRMM POINTER REFRESH MACROSES*************************/ .macro RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) /* ptrbb = bb;*/ lgr \PTR_B,\B_VAL /*refresh BPOINT*/ #else /* ptrba =ptrba+ off*C_A; ptrbb = bb + off*C_B;*/ .if \C_B==4 .if \C_A==8 sllg \PTR_B, \OFF_VAL,5 la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*4*/ agr \PTR_A,\PTR_B /*ptrba+off*4**/ la \PTR_B,0(\B_VAL,\PTR_B) .elseif \C_A==4 sllg \PTR_B, \OFF_VAL,5 agr \PTR_A,\PTR_B /*ptrba+off*4**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==2 sllg \PTR_B, \OFF_VAL,4 la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*2**/ agr \PTR_B, \PTR_B la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==1 sllg \PTR_B, \OFF_VAL,3 agr \PTR_A,\PTR_B /*ptrba+off*4**/ sllg \PTR_B, \OFF_VAL,5 la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .endif .elseif \C_B==2 .if \C_A==8 sllg \PTR_B, \OFF_VAL,6 agr \PTR_A,\PTR_B /*ptrba+off*8**/ sllg \PTR_B, \OFF_VAL,4 la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==4 sllg \PTR_B, \OFF_VAL,4 la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*2**/ agr \PTR_A,\PTR_B /*ptrba+off*2**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==2 sllg \PTR_B, \OFF_VAL,4 agr \PTR_A,\PTR_B /*ptrba+off*2**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==1 sllg \PTR_B, \OFF_VAL,3 la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*1**/ agr \PTR_B,\PTR_B /* off+off**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .endif .elseif \C_B==1 .if \C_A==8 sllg \PTR_B, \OFF_VAL,6 agr \PTR_A,\PTR_B /*ptrba+off*8**/ sllg \PTR_B, \OFF_VAL,3 la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==4 sllg \PTR_B, \OFF_VAL,5 agr \PTR_A,\PTR_B /*ptrba+off*4**/ sllg \PTR_B, \OFF_VAL,3 la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==2 sllg \PTR_B, \OFF_VAL,3 la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*1**/ agr \PTR_A,\PTR_B /*ptrba+off*1**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==1 sllg \PTR_B, \OFF_VAL,3 agr \PTR_A,\PTR_B /*ptrba+off*1**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .endif .endif #endif .endm /**/ .macro RefreshTempBk TEMP_VAL,BK_VAL,OFF_VAL,INCR_A,INCR_B #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) /* temp = bk-off;*/ sgrk \TEMP_VAL,\BK_VAL,\OFF_VAL #elif defined(LEFT) /* temp = off+INCR_A; // number of values in A */ la \TEMP_VAL,\INCR_A(\OFF_VAL) #else /* temp = off+INCR_B // number of values in B*/ la \TEMP_VAL,\INCR_B(\OFF_VAL) #endif .endm .macro RefreshPointersAndOFF TEMP_VAL,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) /*temp = bk - off;*/ sgrk \TEMP_VAL,\BK_VAL,\OFF_VAL #ifdef LEFT /*temp -= 8; // number of values in A*/ lay \TEMP_VAL,-\C_A(\TEMP_VAL) #else /*temp -= 4; // number of values in B*/ lay \TEMP_VAL,-\C_B(\TEMP_VAL) #endif /*ptrba += temp*C_A; ptrbb += temp*C_B;*/ .if \C_A==8 sllg \TEMP_VAL, \TEMP_VAL,6 .elseif \C_A==4 sllg \TEMP_VAL, \TEMP_VAL,5 /*temp*4*/ .elseif \C_A==2 sllg \TEMP_VAL, \TEMP_VAL,4 /*temp*2*/ .elseif \C_A==1 sllg \TEMP_VAL, \TEMP_VAL,3 /*temp*1*/ .endif la \PTR_A,0(\PTR_A,\TEMP_VAL) /*ptrba+temp*C_A*/ /*we do not need to refresh ptrbb. 
so lets ignore it*/ #endif #ifdef LEFT /*off += 8; // number of values in A*/ aghi \OFF_VAL,\C_A #endif .endmOpenBLAS-0.2.20/kernel/zarch/skernelMacros.S000066400000000000000000001000161313527062700204650ustar00rootroot00000000000000/**********************************Zero Vectors**************************************************/ .macro ZERO_CVEC_8x4 vzero %v16 vzero %v17 vzero %v18 vzero %v19 vzero %v20 vzero %v21 vzero %v22 vzero %v23 vzero %v24 vzero %v25 vzero %v26 vzero %v27 vzero %v28 vzero %v29 vzero %v30 vzero %v31 .endm .macro ZERO_CVEC_8x2 vzero %v16 vzero %v17 vzero %v18 vzero %v19 vzero %v20 vzero %v21 vzero %v22 vzero %v23 .endm .macro ZERO_CVEC_8x1 vzero %v16 vzero %v17 vzero %v18 vzero %v19 .endm .macro ZERO_CVEC_4x4 vzero %v16 vzero %v17 vzero %v20 vzero %v21 vzero %v24 vzero %v25 vzero %v28 vzero %v29 .endm .macro ZERO_CVEC_4x2 vzero %v16 vzero %v17 vzero %v20 vzero %v21 .endm .macro ZERO_CVEC_4x1 lzer %f1 lzer %f2 lzer %f3 lzer %f4 .endm .macro ZERO_CVEC_2x4 vzero %v16 vzero %v17 vzero %v20 vzero %v21 .endm .macro ZERO_CVEC_2x2 vzero %v16 vzero %v20 .endm .macro ZERO_CVEC_2x1 lzer %f1 lzer %f2 .endm .macro ZERO_CVEC_1x4 lzer %f1 lzer %f2 lzer %f3 lzer %f4 .endm .macro ZERO_CVEC_1x2 lzer %f1 lzer %f2 .endm .macro ZERO_CVEC_1x1 lzer %f1 .endm /***********************************Helper Calculations*************************************/ #define unit_size 4 #define DISP(ind,stride,disp) (ind*stride+disp) #define DISP8(ind,disp) (ind*unit_size*8+disp) #define DISP4(ind,disp) (ind*unit_size*4+disp) #define DISP2(ind,disp) (ind*unit_size*2+disp) #define DISP1(ind,disp) (ind*unit_size+disp) #define N8 (8*unit_size) #define N4 (4*unit_size) #define N2 (2*unit_size) #define N1 (1*unit_size) .macro Calculate_8x4_I PTR_A_REG,PTR_B_REG,Index,IsLast vlm %v1,%v2, DISP8(\Index , 0)(\PTR_A_REG) vmrhf %v3,%v1,%v1 vmrhf %v5,%v2,%v2 vmrlf %v4,%v1,%v1 vmrlf %v6,%v2,%v2 vldeb %v3, %v3 vldeb %v4, %v4 vldeb %v5, %v5 vlrepf %v7, DISP4(\Index ,0)(\PTR_B_REG) vlrepf %v1, DISP4(\Index ,4)(\PTR_B_REG) vldeb %v6, %v6 vldeb %v7, %v7 vldeb %v1, %v1 vfmadb %v16,%v3,%v7,%v16 vfmadb %v17,%v4,%v7,%v17 vfmadb %v18,%v5,%v7,%v18 vfmadb %v19,%v6,%v7,%v19 vfmadb %v20,%v3,%v1,%v20 vfmadb %v21,%v4,%v1,%v21 vfmadb %v22,%v5,%v1,%v22 vfmadb %v23,%v6,%v1,%v23 vlrepf %v2, DISP4(\Index ,8)(\PTR_B_REG) vlrepf %v7, DISP4(\Index ,12)(\PTR_B_REG) vldeb %v2, %v2 vldeb %v7, %v7 .if \IsLast==1 la \PTR_A_REG, DISP8(\Index ,N8)(\PTR_A_REG) .endif vfmadb %v24,%v3,%v2,%v24 vfmadb %v25,%v4,%v2,%v25 vfmadb %v26,%v5,%v2,%v26 vfmadb %v27,%v6,%v2,%v27 vfmadb %v28,%v3,%v7,%v28 vfmadb %v29,%v4,%v7,%v29 vfmadb %v30,%v5,%v7,%v30 vfmadb %v31,%v6,%v7,%v31 .if \IsLast==1 la \PTR_B_REG, DISP4(\Index ,N4)(\PTR_B_REG) .endif .endm .macro Calculate_8x2_I PTR_A_REG,PTR_B_REG,Index,IsLast vlm %v1,%v2, DISP8(\Index , 0)(\PTR_A_REG) vmrhf %v3,%v1,%v1 vmrhf %v5,%v2,%v2 vmrlf %v4,%v1,%v1 vmrlf %v6,%v2,%v2 vldeb %v3, %v3 vldeb %v4, %v4 vldeb %v5, %v5 vlrepf %v7, DISP2(\Index ,0)(\PTR_B_REG) vlrepf %v1, DISP2(\Index ,4)(\PTR_B_REG) vldeb %v6, %v6 vldeb %v7, %v7 vldeb %v1, %v1 vfmadb %v16,%v3,%v7,%v16 vfmadb %v17,%v4,%v7,%v17 vfmadb %v18,%v5,%v7,%v18 vfmadb %v19,%v6,%v7,%v19 vfmadb %v20,%v3,%v1,%v20 vfmadb %v21,%v4,%v1,%v21 .if \IsLast==1 la \PTR_A_REG, DISP8(\Index ,N8)(\PTR_A_REG) .endif vfmadb %v22,%v5,%v1,%v22 vfmadb %v23,%v6,%v1,%v23 .if \IsLast==1 la \PTR_B_REG, DISP2(\Index ,N2)(\PTR_B_REG) .endif .endm .macro Calculate_8x1_I PTR_A_REG,PTR_B_REG,Index,IsLast vlm %v1,%v2, DISP8(\Index , 0)(\PTR_A_REG) vmrhf %v3,%v1,%v1 vmrhf %v5,%v2,%v2 vmrlf 
%v4,%v1,%v1 vmrlf %v6,%v2,%v2 vldeb %v3, %v3 vldeb %v4, %v4 vldeb %v5, %v5 vlrepf %v7, DISP1(\Index ,0)(\PTR_B_REG) vldeb %v6, %v6 vldeb %v7, %v7 vfmadb %v16,%v3,%v7,%v16 .if \IsLast==1 la \PTR_B_REG, DISP1(\Index ,N1)(\PTR_B_REG) .endif vfmadb %v17,%v4,%v7,%v17 vfmadb %v18,%v5,%v7,%v18 vfmadb %v19,%v6,%v7,%v19 .if \IsLast==1 la \PTR_A_REG, DISP8(\Index ,N8)(\PTR_A_REG) .endif .endm .macro Calculate_4x4_I PTR_A_REG,PTR_B_REG,Index,IsLast vl %v5, DISP4(\Index , 0)(\PTR_A_REG) vlrepf %v7, DISP4(\Index ,0)(\PTR_B_REG) vlrepf %v1, DISP4(\Index ,4)(\PTR_B_REG) vmrhf %v2,%v5,%v5 vmrlf %v3,%v5,%v5 vldeb %v2, %v2 vldeb %v3, %v3 vldeb %v7, %v7 vldeb %v1, %v1 vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 vlrepf %v7, DISP4(\Index ,8)(\PTR_B_REG) vlrepf %v1, DISP4(\Index ,12)(\PTR_B_REG) vldeb %v7, %v7 vldeb %v1, %v1 .if \IsLast==1 la \PTR_A_REG, DISP4(\Index ,N4)(\PTR_A_REG) .endif vfmadb %v24,%v2,%v7,%v24 vfmadb %v25,%v3,%v7,%v25 vfmadb %v28,%v2,%v1,%v28 vfmadb %v29,%v3,%v1,%v29 .if \IsLast==1 la \PTR_B_REG, DISP4(\Index ,N4)(\PTR_B_REG) .endif .endm .macro Calculate_4x2_I PTR_A_REG,PTR_B_REG,Index,IsLast vl %v5, DISP4(\Index , 0)(\PTR_A_REG) vlrepf %v7, DISP2(\Index ,0)(\PTR_B_REG) vlrepf %v1, DISP2(\Index ,4)(\PTR_B_REG) vmrhf %v2,%v5,%v5 vmrlf %v3,%v5,%v5 vldeb %v2, %v2 vldeb %v3, %v3 vldeb %v7, %v7 vldeb %v1, %v1 vfmadb %v16,%v2,%v7,%v16 vfmadb %v17,%v3,%v7,%v17 .if \IsLast==1 la \PTR_B_REG, DISP2(\Index ,N2)(\PTR_B_REG) .endif vfmadb %v20,%v2,%v1,%v20 vfmadb %v21,%v3,%v1,%v21 .if \IsLast==1 la \PTR_A_REG, DISP4(\Index ,N4)(\PTR_A_REG) .endif .endm .macro Calculate_4x1_I PTR_A_REG,PTR_B_REG,Index,IsLast le %f5,DISP1(\Index ,0)(\PTR_B_REG) maeb %f1,%f5,DISP4(\Index ,0)(\PTR_A_REG) maeb %f2,%f5,DISP4(\Index ,4)(\PTR_A_REG) .if \IsLast==1 la \PTR_B_REG, DISP1(\Index ,N1)(\PTR_B_REG) .endif maeb %f3,%f5,DISP4(\Index ,8)(\PTR_A_REG) maeb %f4,%f5,DISP4(\Index ,12)(\PTR_A_REG) .if \IsLast==1 la \PTR_A_REG, DISP4(\Index ,N4)(\PTR_A_REG) .endif .endm .macro Calculate_2x2_I PTR_A_REG,PTR_B_REG,Index,IsLast vlrepf %v7, DISP2(\Index ,0)(\PTR_B_REG) vlrepf %v1, DISP2(\Index ,4)(\PTR_B_REG) vlef %v2, DISP2(\Index ,0)(\PTR_A_REG) ,0 vlef %v2, DISP2(\Index ,4)(\PTR_A_REG) ,2 vldeb %v7, %v7 vldeb %v2,%v2 vldeb %v1, %v1 vfmadb %v16,%v2,%v7,%v16 .if \IsLast==1 la \PTR_A_REG, DISP2(\Index ,N2)(\PTR_A_REG) .endif vfmadb %v20,%v2,%v1,%v20 .if \IsLast==1 la \PTR_B_REG, DISP2(\Index ,N2)(\PTR_B_REG) .endif .endm .macro Calculate_2x1_I PTR_A_REG,PTR_B_REG,Index,IsLast le %f3,DISP1(\Index ,0)(\PTR_B_REG) maeb %f1,%f3,DISP2(\Index ,0)(\PTR_A_REG) .if \IsLast==1 la \PTR_B_REG, DISP1(\Index ,N1)(\PTR_B_REG) .endif maeb %f2, %f3,DISP2(\Index ,4)(\PTR_A_REG) .if \IsLast==1 la \PTR_A_REG, DISP2(\Index ,N2)(\PTR_A_REG) .endif .endm .macro Calculate_1x1_I PTR_A_REG,PTR_B_REG,Index,IsLast le %f2,DISP1(\Index ,0)(\PTR_A_REG) /**a*/ .if \IsLast==1 la \PTR_A_REG,DISP1(\Index ,N1)(\PTR_A_REG) .endif maeb %f1,%f2,DISP1(\Index ,0)(\PTR_B_REG) .if \IsLast==1 la \PTR_B_REG,DISP1(\Index ,N1)(\PTR_B_REG) .endif .endm .macro CALC_8x4 PTR_A_REG,PTR_B_REG Calculate_8x4_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro CALC_8x4_4 PTR_A_REG,PTR_B_REG Calculate_8x4_I \PTR_A_REG,\PTR_B_REG,0,0 Calculate_8x4_I \PTR_A_REG,\PTR_B_REG,1,0 Calculate_8x4_I \PTR_A_REG,\PTR_B_REG,2,0 Calculate_8x4_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro CALC_8x2 PTR_A_REG,PTR_B_REG Calculate_8x2_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro CALC_8x2_4 PTR_A_REG,PTR_B_REG Calculate_8x2_I \PTR_A_REG,\PTR_B_REG,0,0 Calculate_8x2_I 
\PTR_A_REG,\PTR_B_REG,1,0 Calculate_8x2_I \PTR_A_REG,\PTR_B_REG,2,0 Calculate_8x2_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro CALC_8x1 PTR_A_REG,PTR_B_REG Calculate_8x1_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro CALC_8x1_4 PTR_A_REG,PTR_B_REG Calculate_8x1_I \PTR_A_REG,\PTR_B_REG,0,0 Calculate_8x1_I \PTR_A_REG,\PTR_B_REG,1,0 Calculate_8x1_I \PTR_A_REG,\PTR_B_REG,2,0 Calculate_8x1_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro CALC_4x4 PTR_A_REG,PTR_B_REG Calculate_4x4_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro CALC_4x4_4 PTR_A_REG,PTR_B_REG Calculate_4x4_I \PTR_A_REG,\PTR_B_REG,0,0 Calculate_4x4_I \PTR_A_REG,\PTR_B_REG,1,0 Calculate_4x4_I \PTR_A_REG,\PTR_B_REG,2,0 Calculate_4x4_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro CALC_4x2 PTR_A_REG,PTR_B_REG Calculate_4x2_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro CALC_4x2_4 PTR_A_REG,PTR_B_REG Calculate_4x2_I \PTR_A_REG,\PTR_B_REG,0,0 Calculate_4x2_I \PTR_A_REG,\PTR_B_REG,1,0 Calculate_4x2_I \PTR_A_REG,\PTR_B_REG,2,0 Calculate_4x2_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro CALC_4x1 PTR_A_REG,PTR_B_REG Calculate_4x1_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro CALC_4x1_4 PTR_A_REG,PTR_B_REG Calculate_4x1_I \PTR_A_REG,\PTR_B_REG,0,0 Calculate_4x1_I \PTR_A_REG,\PTR_B_REG,1,0 Calculate_4x1_I \PTR_A_REG,\PTR_B_REG,2,0 Calculate_4x1_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro CALC_2x4 PTR_A_REG,PTR_B_REG Calculate_4x2_I \PTR_B_REG,\PTR_A_REG,0,1 .endm .macro CALC_2x4_4 PTR_A_REG,PTR_B_REG Calculate_4x2_I \PTR_B_REG,\PTR_A_REG,0,0 Calculate_4x2_I \PTR_B_REG,\PTR_A_REG,1,0 Calculate_4x2_I \PTR_B_REG,\PTR_A_REG,2,0 Calculate_4x2_I \PTR_B_REG,\PTR_A_REG,3,1 .endm .macro CALC_2x2 PTR_A_REG,PTR_B_REG Calculate_2x2_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro CALC_2x2_4 PTR_A_REG,PTR_B_REG Calculate_2x2_I \PTR_A_REG,\PTR_B_REG,0,0 Calculate_2x2_I \PTR_A_REG,\PTR_B_REG,1,0 Calculate_2x2_I \PTR_A_REG,\PTR_B_REG,2,0 Calculate_2x2_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro CALC_2x1 PTR_A_REG,PTR_B_REG Calculate_2x1_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro CALC_2x1_4 PTR_A_REG,PTR_B_REG Calculate_2x1_I \PTR_A_REG,\PTR_B_REG,0,0 Calculate_2x1_I \PTR_A_REG,\PTR_B_REG,1,0 Calculate_2x1_I \PTR_A_REG,\PTR_B_REG,2,0 Calculate_2x1_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro CALC_1x4 PTR_A_REG,PTR_B_REG Calculate_4x1_I \PTR_B_REG,\PTR_A_REG,0,1 .endm .macro CALC_1x4_4 PTR_A_REG,PTR_B_REG Calculate_4x1_I \PTR_B_REG,\PTR_A_REG,0,0 Calculate_4x1_I \PTR_B_REG,\PTR_A_REG,1,0 Calculate_4x1_I \PTR_B_REG,\PTR_A_REG,2,0 Calculate_4x1_I \PTR_B_REG,\PTR_A_REG,3,1 .endm .macro CALC_1x2 PTR_A_REG,PTR_B_REG Calculate_2x1_I \PTR_B_REG,\PTR_A_REG,0,1 .endm .macro CALC_1x2_4 PTR_A_REG,PTR_B_REG Calculate_2x1_I \PTR_B_REG,\PTR_A_REG,0,0 Calculate_2x1_I \PTR_B_REG,\PTR_A_REG,1,0 Calculate_2x1_I \PTR_B_REG,\PTR_A_REG,2,0 Calculate_2x1_I \PTR_B_REG,\PTR_A_REG,3,1 .endm .macro CALC_1x1 PTR_A_REG,PTR_B_REG Calculate_1x1_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro CALC_1x1_4 PTR_A_REG,PTR_B_REG Calculate_1x1_I \PTR_A_REG,\PTR_B_REG,0,0 Calculate_1x1_I \PTR_A_REG,\PTR_B_REG,1,0 Calculate_1x1_I \PTR_A_REG,\PTR_B_REG,2,0 Calculate_1x1_I \PTR_A_REG,\PTR_B_REG,3,1 .endm /**************************************STORAGE*************************************************/ .macro Multiply_8x1 vr1,vr2,vr3,vr4,va1,va2,va3,va4,vb1 #if defined(TRMMKERNEL) vfmdb \vr1,\va1,\vb1 vfmdb \vr2,\va2,\vb1 vfmdb \vr3,\va3,\vb1 vfmdb \vr4,\va4,\vb1 #else vfmadb \vr1,\va1,\vb1,\vr1 vfmadb \vr2,\va2,\vb1,\vr2 vfmadb \vr3,\va3,\vb1,\vr3 vfmadb \vr4,\va4,\vb1,\vr4 #endif .endm .macro Multiply_4x1 vr1,vr2, va1,va2, vb1 #if defined(TRMMKERNEL) vfmdb \vr1,\va1,\vb1 
vfmdb \vr2,\va2,\vb1 #else vfmadb \vr1,\va1,\vb1,\vr1 vfmadb \vr2,\va2,\vb1,\vr2 #endif .endm .macro Multiply_2x1 vr1, va1,vb1 #if defined(TRMMKERNEL) vfmdb \vr1,\va1,\vb1 #else vfmadb \vr1,\va1,\vb1,\vr1 #endif .endm .macro STORE_8x4 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL , LV1 ,LV2 la \LV1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) #if !defined(TRMMKERNEL) vl %v5, 0(\CIJ_REG) vl %v1 , 16(\CIJ_REG) vmrhf %v2,%v5,%v5 vmrhf %v4,%v1,%v1 vmrlf %v3,%v5,%v5 vldeb %v2, %v2 vldeb %v3, %v3 vldeb %v4, %v4 vmrlf %v5,%v1,%v1 vldeb %v5, %v5 #endif Multiply_8x1 %v2,%v3,%v4,%v5, %v16,%v17,%v18,%v19 ,\ALPHA_VECREG vledb %v2, %v2,0,0 vledb %v3, %v3,0,0 vledb %v4, %v4,0,0 vledb %v5, %v5,0,0 vstef %v2, 0(\CIJ_REG),0 vstef %v2, 4(\CIJ_REG),2 vstef %v3, 8(\CIJ_REG),0 vstef %v3, 12(\CIJ_REG),2 vstef %v4, 16(\CIJ_REG),0 vstef %v4, 20(\CIJ_REG),2 vstef %v5, 24(\CIJ_REG),0 vstef %v5, 28(\CIJ_REG),2 la \LV2,0(\LV1,\LDC_BYTE_ORIGINAL ) #if !defined(TRMMKERNEL) vl %v16,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vl %v17,16(\CIJ_REG,\LDC_BYTE_ORIGINAL) vmrhf %v2,%v16,%v16 vmrhf %v4,%v17,%v17 vmrlf %v3,%v16,%v16 vldeb %v2, %v2 vldeb %v3, %v3 vldeb %v4, %v4 vmrlf %v5,%v17,%v17 vldeb %v5, %v5 #endif Multiply_8x1 %v2,%v3,%v4,%v5, %v20,%v21,%v22,%v23 ,\ALPHA_VECREG vledb %v2, %v2,0,0 vledb %v3, %v3,0,0 vledb %v4, %v4,0,0 vledb %v5, %v5,0,0 vstef %v2, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vstef %v2, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vstef %v3, 8(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vstef %v3, 12(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vstef %v4, 16(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vstef %v4, 20(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vstef %v5, 24(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vstef %v5, 28(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 #if !defined(TRMMKERNEL) vl %v17,0(\CIJ_REG,\LV1) vl %v18,16(\CIJ_REG,\LV1) vmrhf %v2,%v17,%v17 vmrhf %v4,%v18,%v18 vmrlf %v3,%v17,%v17 vldeb %v2, %v2 vldeb %v3, %v3 vldeb %v4, %v4 vmrlf %v5,%v18,%v18 vldeb %v5, %v5 #endif Multiply_8x1 %v2,%v3,%v4,%v5, %v24,%v25,%v26,%v27 ,\ALPHA_VECREG vledb %v2, %v2,0,0 vledb %v3, %v3,0,0 vledb %v4, %v4,0,0 vledb %v5, %v5,0,0 vstef %v2, 0(\CIJ_REG,\LV1),0 vstef %v2, 4(\CIJ_REG,\LV1),2 vstef %v3, 8(\CIJ_REG,\LV1),0 vstef %v3, 12(\CIJ_REG,\LV1),2 vstef %v4, 16(\CIJ_REG,\LV1),0 vstef %v4, 20(\CIJ_REG,\LV1),2 vstef %v5, 24(\CIJ_REG,\LV1),0 vstef %v5, 28(\CIJ_REG,\LV1),2 #if !defined(TRMMKERNEL) vl %v16,0(\CIJ_REG,\LV2) vl %v17,16(\CIJ_REG,\LV2) vmrhf %v2,%v16,%v16 vmrhf %v4,%v17,%v17 vmrlf %v3,%v16,%v16 vldeb %v2, %v2 vldeb %v3, %v3 vldeb %v4, %v4 vmrlf %v5,%v17,%v17 vldeb %v5, %v5 #endif Multiply_8x1 %v2,%v3,%v4,%v5, %v28,%v29,%v30,%v31 ,\ALPHA_VECREG vledb %v2, %v2,0,0 vledb %v3, %v3,0,0 vledb %v4, %v4,0,0 vledb %v5, %v5,0,0 vstef %v2, 0(\CIJ_REG,\LV2),0 vstef %v2, 4(\CIJ_REG,\LV2),2 vstef %v3, 8(\CIJ_REG,\LV2),0 vstef %v3, 12(\CIJ_REG,\LV2),2 vstef %v4, 16(\CIJ_REG,\LV2),0 vstef %v4, 20(\CIJ_REG,\LV2),2 vstef %v5, 24(\CIJ_REG,\LV2),0 vstef %v5, 28(\CIJ_REG,\LV2),2 la \CIJ_REG,N8(\CIJ_REG) .endm .macro STORE_8x2 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL #if !defined(TRMMKERNEL) vl %v5, 0(\CIJ_REG) vl %v1 , 16(\CIJ_REG) vmrhf %v2,%v5,%v5 vmrhf %v4,%v1,%v1 vmrlf %v3,%v5,%v5 vldeb %v2, %v2 vldeb %v3, %v3 vldeb %v4, %v4 vmrlf %v5,%v1,%v1 vldeb %v5, %v5 #endif Multiply_8x1 %v2,%v3,%v4,%v5, %v16,%v17,%v18,%v19 ,\ALPHA_VECREG vledb %v2, %v2,0,0 vledb %v3, %v3,0,0 vledb %v4, %v4,0,0 vledb %v5, %v5,0,0 vstef %v2, 0(\CIJ_REG),0 vstef %v2, 4(\CIJ_REG),2 vstef %v3, 8(\CIJ_REG),0 vstef %v3, 12(\CIJ_REG),2 vstef %v4, 16(\CIJ_REG),0 vstef %v4, 20(\CIJ_REG),2 vstef %v5, 24(\CIJ_REG),0 vstef %v5, 28(\CIJ_REG),2 #if 
!defined(TRMMKERNEL) vl %v16,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vl %v17,16(\CIJ_REG,\LDC_BYTE_ORIGINAL) vmrhf %v2,%v16,%v16 vmrhf %v4,%v17,%v17 vmrlf %v3,%v16,%v16 vldeb %v2, %v2 vldeb %v3, %v3 vldeb %v4, %v4 vmrlf %v5,%v17,%v17 vldeb %v5, %v5 #endif Multiply_8x1 %v2,%v3,%v4,%v5, %v20,%v21,%v22,%v23 ,\ALPHA_VECREG vledb %v2, %v2,0,0 vledb %v3, %v3,0,0 vledb %v4, %v4,0,0 vledb %v5, %v5,0,0 vstef %v2, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vstef %v2, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vstef %v3, 8(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vstef %v3, 12(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vstef %v4, 16(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vstef %v4, 20(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 vstef %v5, 24(\CIJ_REG,\LDC_BYTE_ORIGINAL),0 vstef %v5, 28(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 la \CIJ_REG,N8(\CIJ_REG) .endm .macro STORE_8x1 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL #if !defined(TRMMKERNEL) vl %v5, 0(\CIJ_REG) vl %v1 , 16(\CIJ_REG) vmrhf %v2,%v5,%v5 vmrhf %v4,%v1,%v1 vmrlf %v3,%v5,%v5 vldeb %v2, %v2 vldeb %v3, %v3 vldeb %v4, %v4 vmrlf %v5,%v1,%v1 vldeb %v5, %v5 #endif Multiply_8x1 %v2,%v3,%v4,%v5, %v16,%v17,%v18,%v19 ,\ALPHA_VECREG vledb %v2, %v2,0,0 vledb %v3, %v3,0,0 vledb %v4, %v4,0,0 vledb %v5, %v5,0,0 vstef %v2, 0(\CIJ_REG),0 vstef %v2, 4(\CIJ_REG),2 vstef %v3, 8(\CIJ_REG),0 vstef %v3, 12(\CIJ_REG),2 vstef %v4, 16(\CIJ_REG),0 vstef %v4, 20(\CIJ_REG),2 vstef %v5, 24(\CIJ_REG),0 vstef %v5, 28(\CIJ_REG),2 la \CIJ_REG,N8(\CIJ_REG) .endm .macro STORE_4x4 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL, LV1 ,LV2 la \LV1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) #if !defined(TRMMKERNEL) vl %v5, 0(\CIJ_REG) vmrhf %v1,%v5,%v5 vmrlf %v2,%v5,%v5 vldeb %v1, %v1 vldeb %v2, %v2 #endif Multiply_4x1 %v1,%v2 , %v16,%v17 ,\ALPHA_VECREG vledb %v1, %v1,0,0 vledb %v2, %v2,0,0 vstef %v1, 0(\CIJ_REG),0 vstef %v1, 4(\CIJ_REG),2 vstef %v2, 8(\CIJ_REG),0 vstef %v2, 12(\CIJ_REG),2 la \LV2,0(\LV1,\LDC_BYTE_ORIGINAL ) #if !defined(TRMMKERNEL) vl %v5, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL ) vmrhf %v16,%v5,%v5 vmrlf %v17,%v5,%v5 vldeb %v16, %v16 vldeb %v17, %v17 #endif Multiply_4x1 %v16,%v17 , %v20,%v21 ,\ALPHA_VECREG vledb %v1, %v16,0,0 vledb %v2, %v17,0,0 vstef %v1, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL ),0 vstef %v1, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL ),2 vstef %v2, 8(\CIJ_REG,\LDC_BYTE_ORIGINAL ),0 vstef %v2, 12(\CIJ_REG,\LDC_BYTE_ORIGINAL ),2 #if !defined(TRMMKERNEL) vl %v5, 0(\CIJ_REG,\LV1 ) vmrhf %v16,%v5,%v5 vmrlf %v17,%v5,%v5 vldeb %v16, %v16 vldeb %v17, %v17 #endif Multiply_4x1 %v16,%v17 , %v24,%v25 ,\ALPHA_VECREG vledb %v1, %v16,0,0 vledb %v2, %v17,0,0 vstef %v1, 0(\CIJ_REG,\LV1 ),0 vstef %v1, 4(\CIJ_REG,\LV1 ),2 vstef %v2, 8(\CIJ_REG,\LV1 ),0 vstef %v2, 12(\CIJ_REG,\LV1 ),2 #if !defined(TRMMKERNEL) vl %v5, 0(\CIJ_REG,\LV2 ) vmrhf %v16,%v5,%v5 vmrlf %v17,%v5,%v5 vldeb %v16, %v16 vldeb %v17, %v17 #endif Multiply_4x1 %v16,%v17, %v28,%v29 ,\ALPHA_VECREG vledb %v1, %v16,0,0 vledb %v2, %v17,0,0 vstef %v1, 0(\CIJ_REG,\LV2 ),0 vstef %v1, 4(\CIJ_REG,\LV2 ),2 vstef %v2, 8(\CIJ_REG,\LV2 ),0 vstef %v2, 12(\CIJ_REG,\LV2 ),2 la \CIJ_REG,N4(\CIJ_REG) .endm .macro STORE_4x2 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL #if !defined(TRMMKERNEL) vl %v5, 0(\CIJ_REG) vmrhf %v1,%v5,%v5 vmrlf %v2,%v5,%v5 vldeb %v1, %v1 vldeb %v2, %v2 #endif Multiply_4x1 %v1,%v2 , %v16,%v17 ,\ALPHA_VECREG vledb %v1, %v1,0,0 vledb %v2, %v2,0,0 vstef %v1, 0(\CIJ_REG),0 vstef %v1, 4(\CIJ_REG),2 vstef %v2, 8(\CIJ_REG),0 vstef %v2, 12(\CIJ_REG),2 #if !defined(TRMMKERNEL) vl %v5, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL ) vmrhf %v16,%v5,%v5 vmrlf %v17,%v5,%v5 vldeb %v16, %v16 vldeb %v17, %v17 #endif Multiply_4x1 %v16,%v17 , %v20,%v21 
,\ALPHA_VECREG vledb %v1, %v16,0,0 vledb %v2, %v17,0,0 vstef %v1, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL ),0 vstef %v1, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL ),2 vstef %v2, 8(\CIJ_REG,\LDC_BYTE_ORIGINAL ),0 vstef %v2, 12(\CIJ_REG,\LDC_BYTE_ORIGINAL ),2 la \CIJ_REG,N4(\CIJ_REG) .endm .macro STORE_4x1 ALPHA_FLOAT,CIJ_REG , LDC_BYTE_ORIGINAL ledbr %f7,\ALPHA_FLOAT #if defined(TRMMKERNEL) meebr %f1,%f7 meebr %f2,%f7 meebr %f3,%f7 meebr %f4,%f7 ste %f1,0(\CIJ_REG) ste %f2,4(\CIJ_REG ) ste %f3,8(\CIJ_REG ) ste %f4,12(\CIJ_REG) #else le %f5,0(\CIJ_REG) maebr %f5,%f1,%f7 ste %f5,0(\CIJ_REG) le %f6,4(\CIJ_REG ) maebr %f6,%f2,%f7 ste %f6,4(\CIJ_REG ) le %f5,8(\CIJ_REG) maebr %f5,%f3,%f7 ste %f5,8(\CIJ_REG) le %f6,12(\CIJ_REG) maebr %f6,%f4,%f7 ste %f6,12(\CIJ_REG) #endif la \CIJ_REG,N4(\CIJ_REG) .endm .macro STORE_2x2 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL #if !defined(TRMMKERNEL) vlef %v1,0(\CIJ_REG) ,0 vlef %v1,4(\CIJ_REG) ,2 vldeb %v1,%v1 #endif Multiply_2x1 %v1, %v16,\ALPHA_VECREG vledb %v1, %v1,0,0 vstef %v1, 0(\CIJ_REG),0 vstef %v1, 4(\CIJ_REG),2 #if !defined(TRMMKERNEL) vlef %v16,0(\CIJ_REG,\LDC_BYTE_ORIGINAL ) ,0 vlef %v16,4(\CIJ_REG,\LDC_BYTE_ORIGINAL ),2 vldeb %v16,%v16 #endif Multiply_2x1 %v16, %v20,\ALPHA_VECREG vledb %v1, %v16,0,0 vstef %v1, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL ),0 vstef %v1, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL ),2 la \CIJ_REG,N2(\CIJ_REG) .endm .macro STORE_2x1 ALPHA_FLOAT,CIJ_REG , LDC_BYTE_ORIGINAL ledbr %f3,\ALPHA_FLOAT #if defined(TRMMKERNEL) meebr %f1,%f3 meebr %f2,%f3 ste %f1,0(\CIJ_REG) ste %f2,4(\CIJ_REG) #else le %f4,0(\CIJ_REG) le %f5,4(\CIJ_REG) maebr %f4,%f1,%f3 maebr %f5,%f2,%f3 ste %f4,0(\CIJ_REG) ste %f5,4(\CIJ_REG) #endif la \CIJ_REG,N2(\CIJ_REG) .endm /*STORE C1X1*/ .macro STORE_1x1 ALPHA_FLOAT,CIJ_REG,LDC_BYTE_ORIGINAL ledbr %f3,\ALPHA_FLOAT #if defined(TRMMKERNEL) meebr %f1,%f3 ste %f1,0(\CIJ_REG) #else le %f2,0(\CIJ_REG) maebr %f2,%f1,%f3 ste %f2,0(\CIJ_REG) #endif la \CIJ_REG,N1(\CIJ_REG) .endm /*reversed ones*/ .macro STORE_2x4 ALPHA_VECREG,CIJ_REG , LDC_BYTE_ORIGINAL , LV1 ,LV2 #if !defined(TRMMKERNEL) vlef %v1,0(\CIJ_REG) ,0 vlef %v1,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) ,2 vldeb %v1,%v1 #endif la \LV1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) Multiply_2x1 %v1, %v16 ,\ALPHA_VECREG la \LV2,0(\LV1,\LDC_BYTE_ORIGINAL ) vledb %v1, %v1,0,0 vstef %v1, 0(\CIJ_REG),0 vstef %v1, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 #if !defined(TRMMKERNEL) vlef %v16,0(\CIJ_REG,\LV1 ) ,0 vlef %v16,0(\CIJ_REG,\LV2 ),2 vldeb %v16,%v16 #endif Multiply_2x1 %v16, %v17,\ALPHA_VECREG vledb %v1, %v16,0,0 vstef %v1, 0(\CIJ_REG ,\LV1 ),0 vstef %v1, 0(\CIJ_REG,\LV2 ),2 /*2nd*/ #if !defined(TRMMKERNEL) vlef %v1,4(\CIJ_REG) ,0 vlef %v1,4(\CIJ_REG,\LDC_BYTE_ORIGINAL) ,2 vldeb %v1,%v1 #endif Multiply_2x1 %v1, %v20 ,\ALPHA_VECREG vledb %v1, %v1,0,0 vstef %v1, 4(\CIJ_REG),0 vstef %v1,4(\CIJ_REG,\LDC_BYTE_ORIGINAL),2 #if !defined(TRMMKERNEL) vlef %v16,4(\CIJ_REG,\LV1 ) ,0 vlef %v16,4(\CIJ_REG,\LV2 ),2 vldeb %v16,%v16 #endif Multiply_2x1 %v16, %v21,\ALPHA_VECREG vledb %v1, %v16,0,0 vstef %v1, 4(\CIJ_REG ,\LV1 ),0 vstef %v1, 4(\CIJ_REG,\LV2 ),2 la \CIJ_REG,N2(\CIJ_REG) .endm .macro STORE_1x4 ALPHA_FLOAT,CIJ_REG , LDC_BYTE_ORIGINAL , LV1 ,LV2 la \LV1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) ledbr %f7,\ALPHA_FLOAT la \LV2,0(\LV1,\LDC_BYTE_ORIGINAL ) #if defined(TRMMKERNEL) meebr %f1,%f7 meebr %f2,%f7 meebr %f3,%f7 meebr %f4,%f7 ste %f1,0(\CIJ_REG) ste %f2,0(\CIJ_REG, \LDC_BYTE_ORIGINAL) ste %f3,0(\CIJ_REG, \LV1) ste %f4,0(\CIJ_REG, \LV2) #else le %f5,0(\CIJ_REG) maebr %f5,%f1,%f7 ste %f5,0(\CIJ_REG) le 
%f6,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) maebr %f6,%f2,%f7 ste %f6,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) le %f5,0(\CIJ_REG, \LV1) maebr %f5,%f3,%f7 ste %f5,0(\CIJ_REG, \LV1) le %f6,0(\CIJ_REG, \LV2) maebr %f6,%f4,%f7 ste %f6,0(\CIJ_REG, \LV2) #endif la \CIJ_REG,N1(\CIJ_REG) .endm .macro STORE_1x2 ALPHA_FLOAT,CIJ_REG , LDC_BYTE_ORIGINAL ledbr %f3,\ALPHA_FLOAT #if defined(TRMMKERNEL) meebr %f1,%f3 meebr %f2,%f3 ste %f1,0(\CIJ_REG) ste %f2,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) #else le %f4,0(\CIJ_REG) maebr %f4,%f1,%f3 ste %f4,0(\CIJ_REG) le %f5,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) maebr %f5,%f2,%f3 ste %f5,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) #endif la \CIJ_REG,N1(\CIJ_REG) .endm /****************************TRMM POINTER REFRESH MACROSES*************************/ .macro RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) /* ptrbb = bb;*/ lgr \PTR_B,\B_VAL /*refresh BPOINT*/ #else /* ptrba =ptrba+ off*C_A; ptrbb = bb + off*C_B;*/ .if \C_B==4 .if \C_A==8 sllg \PTR_B, \OFF_VAL,4 la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*4*/ agr \PTR_A,\PTR_B /*ptrba+off*4**/ la \PTR_B,0(\B_VAL,\PTR_B) .elseif \C_A==4 sllg \PTR_B, \OFF_VAL,4 agr \PTR_A,\PTR_B /*ptrba+off*4**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==2 sllg \PTR_B, \OFF_VAL,3 la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*2**/ agr \PTR_B, \PTR_B la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==1 sllg \PTR_B, \OFF_VAL,2 agr \PTR_A,\PTR_B /*ptrba+off*4**/ sllg \PTR_B, \OFF_VAL,4 la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .endif .elseif \C_B==2 .if \C_A==8 sllg \PTR_B, \OFF_VAL,5 agr \PTR_A,\PTR_B /*ptrba+off*8**/ sllg \PTR_B, \OFF_VAL,3 la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==4 sllg \PTR_B, \OFF_VAL,3 la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*2**/ agr \PTR_A,\PTR_B /*ptrba+off*2**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==2 sllg \PTR_B, \OFF_VAL,3 agr \PTR_A,\PTR_B /*ptrba+off*2**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==1 sllg \PTR_B, \OFF_VAL,2 la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*1**/ agr \PTR_B,\PTR_B /* off+off**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .endif .elseif \C_B==1 .if \C_A==8 sllg \PTR_B, \OFF_VAL,5 agr \PTR_A,\PTR_B /*ptrba+off*8**/ sllg \PTR_B, \OFF_VAL,2 la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==4 sllg \PTR_B, \OFF_VAL,4 agr \PTR_A,\PTR_B /*ptrba+off*4**/ sllg \PTR_B, \OFF_VAL,2 la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==2 sllg \PTR_B, \OFF_VAL,2 la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*1**/ agr \PTR_A,\PTR_B /*ptrba+off*1**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==1 sllg \PTR_B, \OFF_VAL,2 agr \PTR_A,\PTR_B /*ptrba+off*1**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .endif .endif #endif .endm /**/ .macro RefreshTempBk TEMP_VAL,BK_VAL,OFF_VAL,INCR_A,INCR_B #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) /* temp = bk-off;*/ sgrk \TEMP_VAL,\BK_VAL,\OFF_VAL #elif defined(LEFT) /* temp = off+INCR_A; // number of values in A */ la \TEMP_VAL,\INCR_A(\OFF_VAL) #else /* temp = off+INCR_B // number of values in B*/ la \TEMP_VAL,\INCR_B(\OFF_VAL) #endif .endm .macro RefreshPointersAndOFF TEMP_VAL,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) /*temp = bk - off;*/ sgrk \TEMP_VAL,\BK_VAL,\OFF_VAL #ifdef LEFT /*temp -= 8; // number of values in A*/ lay \TEMP_VAL,-\C_A(\TEMP_VAL) #else /*temp -= 4; // number of values in B*/ lay \TEMP_VAL,-\C_B(\TEMP_VAL) #endif 
/*ptrba += temp*C_A; ptrbb += temp*C_B;*/ .if \C_A==8 sllg \TEMP_VAL, \TEMP_VAL,5 .elseif \C_A==4 sllg \TEMP_VAL, \TEMP_VAL,4 /*temp*4*/ .elseif \C_A==2 sllg \TEMP_VAL, \TEMP_VAL,3 /*temp*2*/ .elseif \C_A==1 sllg \TEMP_VAL, \TEMP_VAL,2 /*temp*1*/ .endif la \PTR_A,0(\PTR_A,\TEMP_VAL) /*ptrba+temp*C_A*/ /*we do not need to refresh ptrbb. so lets ignore it*/ #endif #ifdef LEFT /*off += 8; // number of values in A*/ aghi \OFF_VAL,\C_A #endif .endmOpenBLAS-0.2.20/kernel/zarch/strmm8x4V.S000066400000000000000000000520011313527062700175110ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2017/03/01 AbdelRauf (quickwritereader@gmail.com) * BLASTEST : passed * CTEST : passed * TEST : passed **************************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #define ASSEMBLER #include "common.h" /* #BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ##bm=r2,bn=r3, bk=r4, alpha=f0,ba=r5,bb=r6,stack[160] ,ldc=stack[168] offset=stack[176] **********************************************************************************************/ /*Note: r0 can not be used as address disp register */ #define BM %r2 #define BM_CUR %r0 #define BN %r3 #define BN_CUR %r10 #define BK %r4 #define LDC_BYTE %r8 #define ALPHA %f0 #define ALPHA_VECT %v0 #define LOCAL_VAR1 %r9 #define LOCAL_VAR2 %r1 #define LOCAL_VAR3 %r11 #define A %r5 #define B %r6 #define CIJ %r7 #define CIJ_LOCAL %r12 #define OFF %r13 #define OFFSET %f8 #define ALIGN_4 .align 16 #define ALIGN_2 .align 8 #define PREFETCH_INS 1 /**************************Include kernel helper macrosses**********************************/ #include "skernelMacros.S" /***********************************DGEMM***********************************************************/ PROLOGUE #if defined(TRMMKERNEL) std OFFSET,40(%r15) stmg %r6,%r13,48(%r15) #else stmg %r6,%r12,48(%r15) #endif lg CIJ, 160(%r15) lg LOCAL_VAR1, 168(%r15) #if defined(TRMMKERNEL) lg OFF,176(%r15) ldgr OFFSET ,OFF #endif srlg BN_CUR,BN,2 vrepf ALPHA_VECT,ALPHA_VECT,0 /*replicate alpha which in f0*/ vldeb ALPHA_VECT,ALPHA_VECT sllg LDC_BYTE, LOCAL_VAR1,2 /*calculate lcd stride with bytes float=4 x<<2 */ #if defined(TRMMKERNEL) && !defined(LEFT) /*off = -offset;*/ lgdr LOCAL_VAR1,OFFSET lcgr OFF,LOCAL_VAR1 #endif cijle BN_CUR,0,.LX2 ALIGN_4 .LX4_BN: #if defined(PREFETCH_INS) pfd 1, 0(A) pfd 1, 0(B) #endif #if defined(TRMMKERNEL) && defined(LEFT) /*off = offset;*/ lgdr OFF,OFFSET #endif srlg BM_CUR,BM,3 lgr LOCAL_VAR3,A lgr CIJ_LOCAL,CIJ cijle BM_CUR,0,.L4x4 ALIGN_4 .L8x4_BM: /*BM_CUR LOOP */ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,8,4 RefreshTempBk LOCAL_VAR1,BK,OFF,8,4 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_8x4 cijle LOCAL_VAR1,0,.L8x4_mod ALIGN_4 .L8x4_4_BK: /*BK_CUR LOOP */ #if defined(PREFETCH_INS) pfd 1, 256(LOCAL_VAR3) #endif CALC_8x4_4 LOCAL_VAR3,LOCAL_VAR2 #if defined(PREFETCH_INS) pfd 1, 128(LOCAL_VAR2) #endif brctg LOCAL_VAR1,.L8x4_4_BK ALIGN_4 .L8x4_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,8,4 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L8x4_BK_Store ALIGN_4 .L8x4_BK: /*BK_CUR LOOP */ CALC_8x4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L8x4_BK ALIGN_4 .L8x4_BK_Store: /*store C and use LDC_BYTE AND 
CIJ_COPY for mem storing*/ STORE_8x4 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE , LOCAL_VAR1 ,LOCAL_VAR2 #if defined(TRMMKERNEL) /*RefreshPointersAndOFF TEMP_VAL,BK_VAL,OFF_VAL,L_VAR,PTR_A,C_A*/ RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,8,4 #endif brctg BM_CUR,.L8x4_BM ALIGN_4 .L4x4: tmll BM,4 jz .L2x4 ALIGN_4 .L4x4_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,4 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_4x4 cijle LOCAL_VAR1,0,.L4x4_mod ALIGN_4 .L4x4_4_BK: /*BK_CUR LOOP */ CALC_4x4_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x4_4_BK ALIGN_4 .L4x4_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,4,4 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x4_BK_Store ALIGN_4 .L4x4_BK: /*BK_CUR LOOP */ CALC_4x4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x4_BK ALIGN_4 .L4x4_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_4x4 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE , LOCAL_VAR1 ,LOCAL_VAR2 #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,4,4 #endif ALIGN_2 .L2x4: tmll BM,2 jz .L1x4 ALIGN_4 .L2x4_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,4 RefreshTempBk LOCAL_VAR1,BK,OFF,2,4 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_2x4 cijle LOCAL_VAR1,0,.L2x4_mod ALIGN_4 .L2x4_4_BK: /*BK_CUR LOOP */ CALC_2x4_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x4_4_BK ALIGN_4 .L2x4_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,2,4 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x4_BK_Store ALIGN_4 .L2x4_BK: /*BK_CUR LOOP */ CALC_2x4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x4_BK ALIGN_4 .L2x4_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_2x4 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE , LOCAL_VAR1 ,LOCAL_VAR2 #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,2,4 #endif ALIGN_4 .L1x4: tmll BM,1 jz .Lx4_INNER_END ALIGN_4 .L1x4_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,4 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_1x4 cijle LOCAL_VAR1,0,.L1x4_mod ALIGN_4 .L1x4_4_BK: /*BK_CUR LOOP */ CALC_1x4_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x4_4_BK ALIGN_4 .L1x4_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,1,4 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x4_BK_Store ALIGN_4 .L1x4_BK: /*BK_CUR LOOP */ CALC_1x4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x4_BK ALIGN_4 .L1x4_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_1x4 ALPHA ,CIJ_LOCAL, LDC_BYTE , LOCAL_VAR1 ,LOCAL_VAR2 #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,1,4 #endif ALIGN_2 .Lx4_INNER_END: /*add LDC_BYTE_COPY to new*/ sllg LOCAL_VAR1,LDC_BYTE,2 /*op*4 */ #if defined(TRMMKERNEL) && !defined(LEFT) aghi OFF,4 #endif sllg LOCAL_VAR2,BK,4 /*op*4*sizeof(float) =op*16* 2**4 */ la CIJ,0(CIJ,LOCAL_VAR1) /*refresh CIJ=CIJ+LDC_BYTE*4*/ la B,0(B,LOCAL_VAR2) /*refresh 
B=B+Bk*4*sizeof(float) */ brctg BN_CUR,.LX4_BN /*********************************X2 SECTION************************************************/ ALIGN_4 .LX2: tmll BN,2 jz .Lx1 ALIGN_4 .Lx2_BN: #if defined(TRMMKERNEL) && defined(LEFT) /*off = offset;*/ lgdr OFF,OFFSET #endif srlg BM_CUR,BM,3 lgr LOCAL_VAR3,A lgr CIJ_LOCAL,CIJ cijle BM_CUR,0,.L4x2 ALIGN_4 .L8x2_BM: /*BM_CUR LOOP */ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,8,2 RefreshTempBk LOCAL_VAR1,BK,OFF,8,2 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_8x2 cijle LOCAL_VAR1,0,.L8x2_mod ALIGN_4 .L8x2_4_BK: /*BK_CUR LOOP */ #if defined(PREFETCH_INS) pfd 1, 256(LOCAL_VAR3) #endif CALC_8x2_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L8x2_4_BK ALIGN_4 .L8x2_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,8,2 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L8x2_BK_Store ALIGN_4 .L8x2_BK: /*BK_CUR LOOP */ CALC_8x2 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L8x2_BK ALIGN_4 .L8x2_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_8x2 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,8,2 #endif ALIGN_4 brctg BM_CUR,.L8x2_BM ALIGN_2 .L4x2: tmll BM,4 jz .L2x2 ALIGN_4 .L4x2_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,2 RefreshTempBk LOCAL_VAR1,BK,OFF,4,2 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_4x2 cijle LOCAL_VAR1,0,.L4x2_mod ALIGN_4 .L4x2_4_BK: /*BK_CUR LOOP */ CALC_4x2_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x2_4_BK ALIGN_4 .L4x2_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,4,2 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x2_BK_Store ALIGN_4 .L4x2_BK: /*BK_CUR LOOP */ CALC_4x2 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x2_BK ALIGN_4 .L4x2_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_4x2 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,4,2 #endif ALIGN_2 .L2x2: tmll BM,2 jz .L1x2 ALIGN_4 .L2x2_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,2 RefreshTempBk LOCAL_VAR1,BK,OFF,2,2 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_2x2 cijle LOCAL_VAR1,0,.L2x2_mod ALIGN_4 .L2x2_4_BK: /*BK_CUR LOOP */ CALC_2x2_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x2_4_BK ALIGN_4 .L2x2_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,2,2 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x2_BK_Store ALIGN_4 .L2x2_BK: /*BK_CUR LOOP */ CALC_2x2 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x2_BK ALIGN_4 .L2x2_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_2x2 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,2,2 #endif ALIGN_2 .L1x2: tmll BM,1 jz .Lx2_INNER_END ALIGN_4 .L1x2_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,2 RefreshTempBk LOCAL_VAR1,BK,OFF,1,2 srl 
LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_1x2 cijle LOCAL_VAR1,0,.L1x2_mod ALIGN_4 .L1x2_4_BK: /*BK_CUR LOOP */ CALC_1x2_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x2_4_BK ALIGN_4 .L1x2_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,1,2 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x2_BK_Store ALIGN_4 .L1x2_BK: /*BK_CUR LOOP */ CALC_1x2 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x2_BK ALIGN_4 .L1x2_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_1x2 ALPHA ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,1,2 #endif ALIGN_2 .Lx2_INNER_END: /*add LDC_BYTE_COPY to new*/ la LOCAL_VAR1,0(LDC_BYTE,LDC_BYTE) /*op*2 */ sllg LOCAL_VAR2,BK,3 /*op*2*sizeof(float) =op*8 2**3 */ la CIJ,0(CIJ,LOCAL_VAR1) /*refresh CIJ=CIJ+LDC_BYTE*4*/ #if defined(TRMMKERNEL) && !defined(LEFT) aghi OFF,2 #endif la B,0(B,LOCAL_VAR2) /*refresh B=B+Bk*4*sizeof(float) */ /*********************************X1 SECTION************************************************/ ALIGN_2 .Lx1: tmll BN,1 jz .L_FUNC_END ALIGN_4 .Lx1_BN: #if defined(TRMMKERNEL) && defined(LEFT) /*off = offset;*/ lgdr OFF,OFFSET #endif srlg BM_CUR,BM,3 lgr LOCAL_VAR3,A lgr CIJ_LOCAL,CIJ cijle BM_CUR,0,.L4x1 ALIGN_4 .L8x1_BM: /*BM_CUR LOOP */ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,8,1 RefreshTempBk LOCAL_VAR1,BK,OFF,8,1 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_8x1 cijle LOCAL_VAR1,0,.L8x1_mod ALIGN_4 .L8x1_4_BK: /*BK_CUR LOOP */ #if defined(PREFETCH_INS) pfd 1, 256(LOCAL_VAR3) #endif CALC_8x1_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L8x1_4_BK ALIGN_4 .L8x1_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,8,1 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L8x1_BK_Store ALIGN_4 .L8x1_BK: /*BK_CUR LOOP */ CALC_8x1 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L8x1_BK ALIGN_4 .L8x1_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_8x1 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,8,1 #endif ALIGN_4 brctg BM_CUR,.L8x1_BM ALIGN_2 .L4x1: tmll BM,4 jz .L2x1 ALIGN_4 .L4x1_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,1 RefreshTempBk LOCAL_VAR1,BK,OFF,4,1 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_4x1 cijle LOCAL_VAR1,0,.L4x1_mod ALIGN_4 .L4x1_4_BK: /*BK_CUR LOOP */ CALC_4x1_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x1_4_BK ALIGN_4 .L4x1_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,4,1 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x1_BK_Store ALIGN_4 .L4x1_BK: /*BK_CUR LOOP */ CALC_4x1 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x1_BK ALIGN_4 .L4x1_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_4x1 ALPHA ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,4,1 #endif ALIGN_2 .L2x1: tmll BM,2 jz .L1x1 ALIGN_4 .L2x1_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,1 RefreshTempBk 
LOCAL_VAR1,BK,OFF,2,1 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_2x1 cijle LOCAL_VAR1,0,.L2x1_mod ALIGN_4 .L2x1_4_BK: /*BK_CUR LOOP */ CALC_2x1_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x1_4_BK ALIGN_4 .L2x1_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,2,1 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x1_BK_Store ALIGN_4 .L2x1_BK: /*BK_CUR LOOP */ CALC_2x1 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x1_BK ALIGN_4 .L2x1_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_2x1 ALPHA ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,2,1 #endif ALIGN_2 .L1x1: tmll BM, 1 jz .Lx1_INNER_END ALIGN_4 .L1x1_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,1 RefreshTempBk LOCAL_VAR1,BK,OFF,1,1 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_1x1 cijle LOCAL_VAR1,0,.L1x1_mod ALIGN_4 .L1x1_4_BK: /*BK_CUR LOOP */ CALC_1x1_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x1_4_BK ALIGN_4 .L1x1_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,1,1 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x1_BK_Store ALIGN_4 .L1x1_BK: /*BK_CUR LOOP */ CALC_1x1 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x1_BK ALIGN_4 .L1x1_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_1x1 ALPHA ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,1,1 #endif ALIGN_2 .Lx1_INNER_END: /*add LDC_BYTE_COPY to new*/ sllg LOCAL_VAR2,BK,2 /*op*1*sizeof(float) =op*4 2**2 */ la CIJ,0(CIJ,LDC_BYTE) /*refresh CIJ=CIJ+LDC_BYTE */ #if defined(TRMMKERNEL) && !defined(LEFT) aghi OFF,1 #endif la B,0(B,LOCAL_VAR2) /*refresh B=B+Bk*1*sizeof(float) */ ALIGN_2 .L_FUNC_END: /*end*/ #if defined(TRMMKERNEL) ld OFFSET,40(%r15) lmg %r6,%r13,48(%r15) #else lmg %r6,%r12,48(%r15) #endif br %r14 .end OpenBLAS-0.2.20/kernel/zarch/trmm8x4V.S000066400000000000000000000526371313527062700173450ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2017/01/01 AbdelRauf (quickwritereader@gmail.com) * BLASTEST : OK * CTEST : OK * TEST : OK **************************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" /* #BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ##bm=r2,bn=r3, bk=r4, alpha=f0,ba=r5,bb=r6,stack[160] ,ldc=stack[168] offset=stack[176] **********************************************************************************************/ /*Note: r0 can not be used as address disp register */ #define BM %r2 #define BM_CUR %r0 #define BN %r3 #define BN_CUR %r10 #define BK %r4 #define LDC_BYTE %r8 #define ALPHA %f0 #define ALPHA_VECT %v0 #define LOCAL_VAR1 %r9 #define LOCAL_VAR2 %r1 #define LOCAL_VAR3 %r11 #define A %r5 #define B %r6 #define CIJ %r7 #define CIJ_LOCAL %r12 #define OFF %r13 #define OFFSET %f8 #define ALIGN_4 .align 16 #define ALIGN_2 .align 8 #define PREFETCH_INS 1 /**************************Include kernel helper macrosses**********************************/ #include "kernelMacros.S" #if defined (TRMMKERNEL) #define STORE_8x4 STORE_TRMM_8x4 #define STORE_4x4 STORE_TRMM_4x4 #define STORE_2x4 STORE_TRMM_2x4 #define STORE_1x4 STORE_TRMM_1x4 #define STORE_8x2 STORE_TRMM_8x2 #define STORE_4x2 STORE_TRMM_4x2 #define STORE_2x2 STORE_TRMM_2x2 #define STORE_1x2 STORE_TRMM_1x2 #define STORE_8x1 STORE_TRMM_8x1 #define STORE_4x1 STORE_TRMM_4x1 #define STORE_2x1 STORE_TRMM_2x1 #define STORE_1x1 STORE_TRMM_1x1 #endif /***********************************DGEMM***********************************************************/ PROLOGUE #if defined(TRMMKERNEL) std OFFSET,40(%r15) stmg %r6,%r13,48(%r15) #else stmg %r6,%r12,48(%r15) #endif lg CIJ, 160(%r15) lg LOCAL_VAR1, 168(%r15) #if defined(TRMMKERNEL) lg OFF,176(%r15) ldgr OFFSET ,OFF #endif srlg BN_CUR,BN,2 vrepg ALPHA_VECT,ALPHA_VECT,0 /*replicate alpha which in f0*/ sllg LDC_BYTE, LOCAL_VAR1,3 /*calculate lcd stride with bytes double=8 x<<3 */ #if defined(TRMMKERNEL) && !defined(LEFT) /*off = -offset;*/ lgdr LOCAL_VAR1,OFFSET lcgr OFF,LOCAL_VAR1 #endif cijle BN_CUR,0,.LX2 ALIGN_4 .LX4_BN: #if defined(PREFETCH_INS) pfd 1, 0(A) pfd 1, 256(A) pfd 1, 0(B) pfd 1, 256(B) #endif #if defined(TRMMKERNEL) && defined(LEFT) /*off = offset;*/ lgdr OFF,OFFSET #endif srlg BM_CUR,BM,3 lgr LOCAL_VAR3,A lgr CIJ_LOCAL,CIJ cijle BM_CUR,0,.L4x4 ALIGN_4 .L8x4_BM: /*BM_CUR LOOP */ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,8,4 RefreshTempBk LOCAL_VAR1,BK,OFF,8,4 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_8x4 cijle LOCAL_VAR1,0,.L8x4_mod ALIGN_4 .L8x4_4_BK: /*BK_CUR LOOP */ #if defined(PREFETCH_INS) pfd 1, 512(LOCAL_VAR3) #endif CALC_8x4_4 LOCAL_VAR3,LOCAL_VAR2 #if defined(PREFETCH_INS) pfd 1, 512(LOCAL_VAR2) #endif brctg LOCAL_VAR1,.L8x4_4_BK ALIGN_4 .L8x4_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,8,4 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L8x4_BK_Store ALIGN_4 .L8x4_BK: /*BK_CUR LOOP */ CALC_8x4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L8x4_BK ALIGN_4 .L8x4_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_8x4 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) /*RefreshPointersAndOFF TEMP_VAL,BK_VAL,OFF_VAL,L_VAR,PTR_A,C_A*/ RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,8,4 #endif brctg BM_CUR,.L8x4_BM ALIGN_4 .L4x4: tmll BM,4 jz .L2x4 ALIGN_4 .L4x4_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers 
LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,4 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_4x4 cijle LOCAL_VAR1,0,.L4x4_mod ALIGN_4 .L4x4_4_BK: /*BK_CUR LOOP */ CALC_4x4_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x4_4_BK ALIGN_4 .L4x4_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,4,4 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x4_BK_Store ALIGN_4 .L4x4_BK: /*BK_CUR LOOP */ CALC_4x4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x4_BK ALIGN_4 .L4x4_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_4x4 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,4,4 #endif ALIGN_2 .L2x4: tmll BM,2 jz .L1x4 ALIGN_4 .L2x4_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,4 RefreshTempBk LOCAL_VAR1,BK,OFF,2,4 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_2x4 cijle LOCAL_VAR1,0,.L2x4_mod ALIGN_4 .L2x4_4_BK: /*BK_CUR LOOP */ CALC_2x4_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x4_4_BK ALIGN_4 .L2x4_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,2,4 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x4_BK_Store ALIGN_4 .L2x4_BK: /*BK_CUR LOOP */ CALC_2x4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x4_BK ALIGN_4 .L2x4_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_2x4 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,2,4 #endif ALIGN_4 .L1x4: tmll BM,1 jz .Lx4_INNER_END ALIGN_4 .L1x4_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,4 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_1x4 cijle LOCAL_VAR1,0,.L1x4_mod ALIGN_4 .L1x4_4_BK: /*BK_CUR LOOP */ CALC_1x4_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x4_4_BK ALIGN_4 .L1x4_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,1,4 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x4_BK_Store ALIGN_4 .L1x4_BK: /*BK_CUR LOOP */ CALC_1x4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x4_BK ALIGN_4 .L1x4_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_1x4 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,1,4 #endif ALIGN_2 .Lx4_INNER_END: /*add LDC_BYTE_COPY to new*/ sllg LOCAL_VAR1,LDC_BYTE,2 /*multiply*4 */ #if defined(TRMMKERNEL) && !defined(LEFT) aghi OFF,4 #endif sllg LOCAL_VAR2,BK,5 /*muyliply*4*sizeof(double) =multiply*32* 2**5 */ la CIJ,0(CIJ,LOCAL_VAR1) /*refresh CIJ=CIJ+LDC_BYTE*4*/ la B,0(B,LOCAL_VAR2) /*refresh B=B+Bk*4*sizeof(double) */ brctg BN_CUR,.LX4_BN /*********************************X2 SECTION************************************************/ ALIGN_4 .LX2: tmll BN,2 jz .Lx1 ALIGN_4 .Lx2_BN: #if defined(TRMMKERNEL) && defined(LEFT) /*off = offset;*/ lgdr OFF,OFFSET #endif srlg BM_CUR,BM,3 lgr LOCAL_VAR3,A lgr CIJ_LOCAL,CIJ cijle BM_CUR,0,.L4x2 ALIGN_4 .L8x2_BM: /*BM_CUR LOOP */ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers 
LOCAL_VAR3,LOCAL_VAR2,OFF,B,8,2 RefreshTempBk LOCAL_VAR1,BK,OFF,8,2 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_8x2 cijle LOCAL_VAR1,0,.L8x2_mod ALIGN_4 .L8x2_4_BK: /*BK_CUR LOOP */ #if defined(PREFETCH_INS) pfd 1, 256(LOCAL_VAR3) pfd 1,64(LOCAL_VAR2) #endif CALC_8x2_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L8x2_4_BK ALIGN_4 .L8x2_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,8,2 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L8x2_BK_Store ALIGN_4 .L8x2_BK: /*BK_CUR LOOP */ CALC_8x2 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L8x2_BK ALIGN_4 .L8x2_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_8x2 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,8,2 #endif ALIGN_4 brctg BM_CUR,.L8x2_BM ALIGN_2 .L4x2: tmll BM,4 jz .L2x2 ALIGN_4 .L4x2_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,2 RefreshTempBk LOCAL_VAR1,BK,OFF,4,2 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_4x2 cijle LOCAL_VAR1,0,.L4x2_mod ALIGN_4 .L4x2_4_BK: /*BK_CUR LOOP */ CALC_4x2_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x2_4_BK ALIGN_4 .L4x2_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,4,2 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x2_BK_Store ALIGN_4 .L4x2_BK: /*BK_CUR LOOP */ CALC_4x2 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x2_BK ALIGN_4 .L4x2_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_4x2 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,4,2 #endif ALIGN_2 .L2x2: tmll BM,2 jz .L1x2 ALIGN_4 .L2x2_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,2 RefreshTempBk LOCAL_VAR1,BK,OFF,2,2 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_2x2 cijle LOCAL_VAR1,0,.L2x2_mod ALIGN_4 .L2x2_4_BK: /*BK_CUR LOOP */ CALC_2x2_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x2_4_BK ALIGN_4 .L2x2_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,2,2 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x2_BK_Store ALIGN_4 .L2x2_BK: /*BK_CUR LOOP */ CALC_2x2 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x2_BK ALIGN_4 .L2x2_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_2x2 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,2,2 #endif ALIGN_2 .L1x2: tmll BM,1 jz .Lx2_INNER_END ALIGN_4 .L1x2_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,2 RefreshTempBk LOCAL_VAR1,BK,OFF,1,2 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_1x2 cijle LOCAL_VAR1,0,.L1x2_mod ALIGN_4 .L1x2_4_BK: /*BK_CUR LOOP */ CALC_1x2_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x2_4_BK ALIGN_4 .L1x2_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,1,2 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x2_BK_Store ALIGN_4 .L1x2_BK: /*BK_CUR LOOP */ CALC_1x2 
LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x2_BK ALIGN_4 .L1x2_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_1x2 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,1,2 #endif ALIGN_2 .Lx2_INNER_END: /*add LDC_BYTE_COPY to new*/ la LOCAL_VAR1,0(LDC_BYTE,LDC_BYTE) /*multiply*2 */ sllg LOCAL_VAR2,BK,4 /*muyliply*2*sizeof(double) =multiply*16* 2**4 */ la CIJ,0(CIJ,LOCAL_VAR1) /*refresh CIJ=CIJ+LDC_BYTE*4*/ #if defined(TRMMKERNEL) && !defined(LEFT) aghi OFF,2 #endif la B,0(B,LOCAL_VAR2) /*refresh B=B+Bk*4*sizeof(double) */ /*********************************X1 SECTION************************************************/ ALIGN_2 .Lx1: tmll BN,1 jz .L_FUNC_END ALIGN_4 .Lx1_BN: #if defined(TRMMKERNEL) && defined(LEFT) /*off = offset;*/ lgdr OFF,OFFSET #endif srlg BM_CUR,BM,3 lgr LOCAL_VAR3,A lgr CIJ_LOCAL,CIJ cijle BM_CUR,0,.L4x1 ALIGN_4 .L8x1_BM: /*BM_CUR LOOP */ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,8,1 RefreshTempBk LOCAL_VAR1,BK,OFF,8,1 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_8x1 cijle LOCAL_VAR1,0,.L8x1_mod ALIGN_4 .L8x1_4_BK: /*BK_CUR LOOP */ #if defined(PREFETCH_INS) pfd 1, 256(LOCAL_VAR3) #endif CALC_8x1_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L8x1_4_BK ALIGN_4 .L8x1_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,8,1 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L8x1_BK_Store ALIGN_4 .L8x1_BK: /*BK_CUR LOOP */ CALC_8x1 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L8x1_BK ALIGN_4 .L8x1_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_8x1 ALPHA_VECT,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,8,1 #endif ALIGN_4 brctg BM_CUR,.L8x1_BM ALIGN_2 .L4x1: tmll BM,4 jz .L2x1 ALIGN_4 .L4x1_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,1 RefreshTempBk LOCAL_VAR1,BK,OFF,4,1 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_4x1 cijle LOCAL_VAR1,0,.L4x1_mod ALIGN_4 .L4x1_4_BK: /*BK_CUR LOOP */ CALC_4x1_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x1_4_BK ALIGN_4 .L4x1_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,4,1 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x1_BK_Store ALIGN_4 .L4x1_BK: /*BK_CUR LOOP */ CALC_4x1 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x1_BK ALIGN_4 .L4x1_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_4x1 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,4,1 #endif ALIGN_2 .L2x1: tmll BM,2 jz .L1x1 ALIGN_4 .L2x1_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,1 RefreshTempBk LOCAL_VAR1,BK,OFF,2,1 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_2x1 cijle LOCAL_VAR1,0,.L2x1_mod ALIGN_4 .L2x1_4_BK: /*BK_CUR LOOP */ CALC_2x1_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x1_4_BK ALIGN_4 .L2x1_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,2,1 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz 
.L2x1_BK_Store ALIGN_4 .L2x1_BK: /*BK_CUR LOOP */ CALC_2x1 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x1_BK ALIGN_4 .L2x1_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_2x1 ALPHA_VECT ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,2,1 #endif ALIGN_2 .L1x1: tmll BM, 1 jz .Lx1_INNER_END ALIGN_4 .L1x1_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,1 RefreshTempBk LOCAL_VAR1,BK,OFF,1,1 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_CVEC_1x1 cijle LOCAL_VAR1,0,.L1x1_mod ALIGN_4 .L1x1_4_BK: /*BK_CUR LOOP */ CALC_1x1_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x1_4_BK ALIGN_4 .L1x1_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,1,1 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x1_BK_Store ALIGN_4 .L1x1_BK: /*BK_CUR LOOP */ CALC_1x1 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x1_BK ALIGN_4 .L1x1_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ STORE_1x1 ALPHA ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR2,LOCAL_VAR3,1,1 #endif ALIGN_2 .Lx1_INNER_END: /*add LDC_BYTE_COPY to new*/ sllg LOCAL_VAR2,BK,3 /*muyliply*2*sizeof(double) =multiply*8* 2**3 */ la CIJ,0(CIJ,LDC_BYTE) /*refresh CIJ=CIJ+LDC_BYTE */ #if defined(TRMMKERNEL) && !defined(LEFT) aghi OFF,1 #endif la B,0(B,LOCAL_VAR2) /*refresh B=B+Bk*1*sizeof(double) */ ALIGN_2 .L_FUNC_END: /*end*/ #if defined(TRMMKERNEL) ld OFFSET,40(%r15) lmg %r6,%r13,48(%r15) #else lmg %r6,%r12,48(%r15) #endif br %r14 .end OpenBLAS-0.2.20/kernel/zarch/zkernelMacrosV.S000066400000000000000000001320651313527062700206330ustar00rootroot00000000000000/****************************************Implementation**Details**********************************************/ /* */ /* Lets denote (a,a1i) complex which is mathematically a+a1*i */ /* Complex number multiplication: (a,a1i)*(b,b1i) */ /* As i*i=-1 .The multiplication result will be: */ /* (a+a1*i)(b+b1*i)=a*b+a1*i*b1*i+ a1*i*b+a*b1*i=a*b-a1*b1 + (a1*b+a*b1)*i which is (ab-a1b1,a1b+ab1) */ /* so let c= ab-a1b1 , ci=a1b+ab1 then */ /* c=c+a*b-a1*b1 => c=a*b-( a1*b1-c) => c= a1*b1-c then c=a*b-c two mseb */ /* ci=ci+a1*b+a*b1 => ci= a1*b+ci then ci= a*b1+ci */ /* For simd real and imaginary parts will be grouped together */ /* such (realA,realK) and (imageA ,imageK) */ /* Simd(0,1)=(a*b,k*b)-((ai*bi,ki*bi)-Simd(0,1)) */ /* SimdI(0,1)=SimdI(0,1)+(a*bi,k*bi)+(ai*b,ki*b) */ /* */ /* */ /* for defined(NR) || defined(NC) || defined(TR) || defined(TC) */ /* (a+a1*I)(b-b1*I)=ab+a1*b1+I(a1b-ab1) */ /* */ /* c=c+ab+a1b1 => c=a1b1+c;c=ab+c */ /* ci=ci+a1b-ab1 => ci=a1*b-(ab1-ci) => ci=ab1-ci; ci=a1*b-ci */ /* */ /* */ /* for defined(RN) || defined(RT) || defined(CN) || defined(CT) */ /* (a-a1*I)(b+b1*I)=ab+a1*b1+I(-a1b+ab1) */ /* */ /* c=c+ab+a1b1 => c=a1b1+c;c=ab+c */ /* ci=ci+a1b-ab1 => ci=a*b1-(a1b-ci) => ci=a1b-ci; ci=a*b1-ci */ /* */ /* */ /* for defined(RR) || defined(RC) || defined(CR) || defined(CC) */ /* (a-a1*I)(b-b1*I)=ab-a1*b1+I(-a1b-ab1) */ /* */ /* c= a1*b1-c then c=a*b-c */ /* ci = ci-a1*b -a*b1; */ /* as ibm z13 only has x*z-m x*z+m instructions implementation will be changed a bit */ /* Assuming ci=0; and cix=cix+a1b+ab1 ; ci=ci-cix will work */ /* cix= a*b1+cix ; cix= a1*b+cix (two madb) ci=ci-cix (sign change if ci=0) */ /* As c=0 then */ /* c=a*b-c then 
c=a1*b1-c => c=(a1*b1-(a*b-c)) which is -1*( a*b -(a1*b1-c)) */ /* */ /* Values will be equal to (-c) and (-ci) */ /* To change sign it'll be multiplied by -1*(alpha+alpha_i) */ /* This is done once: */ /* lcdbr ALPHA_I,ALPHA_I */ /* lcdbr ALPHA ,ALPHA */ /*************************************************************************************************************/ /*************************Zero vectors***************************************/ /*zero vectors for 4x4 */ .macro ZERO_ZCVEC_4x4 vzero %v16 vzero %v17 vzero %v18 vzero %v19 vzero %v20 vzero %v21 vzero %v22 vzero %v23 vzero %v24 vzero %v25 vzero %v26 vzero %v27 vzero %v28 vzero %v29 vzero %v30 vzero %v31 .endm /*zero vectors for */ .macro ZERO_ZCVEC_2x4 vzero %v16 vzero %v17 vzero %v18 vzero %v19 vzero %v20 vzero %v21 vzero %v22 vzero %v23 .endm /*zero vectors for */ .macro ZERO_ZCVEC_1x4 vzero %v16 vzero %v17 vzero %v18 vzero %v19 .endm /*zero vectors for */ .macro ZERO_ZCVEC_4x2 ZERO_ZCVEC_2x4 .endm .macro ZERO_ZCVEC_4x1 ZERO_ZCVEC_1x4 .endm /*zero vectors for */ .macro ZERO_ZCVEC_2x2 vzero %v16 vzero %v17 vzero %v20 vzero %v21 .endm /*zero vectors for */ .macro ZERO_ZCVEC_1x2 vzero %v16 vzero %v17 .endm /*zero vectors for */ .macro ZERO_ZCVEC_2x1 vzero %v16 vzero %v17 .endm /*zero vectors for 1x1*/ .macro ZERO_ZCVEC_1x1 lzdr %f6 lzdr %f7 .endm /* Calculate for 4x2 inner */ .macro CalcComplex_4x2 vResR1, vResI1, vResR2, vResI2, vResR3, vResI3, vResR4, vResI4, vr1, vi1, vr2, vi2, vrB, viB,vrB2, viB2 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) vfmsdb \vResR1, \vi1, \viB, \vResR1 vfmadb \vResI1, \vr1, \viB, \vResI1 vfmsdb \vResR2, \vi2, \viB, \vResR2 vfmadb \vResI2, \vr2, \viB, \vResI2 vfmsdb \vResR3, \vi1, \viB2, \vResR3 vfmadb \vResI3, \vr1, \viB2, \vResI3 vfmsdb \vResR4, \vi2, \viB2, \vResR4 vfmadb \vResI4, \vr2, \viB2, \vResI4 vfmsdb \vResR1, \vr1, \vrB, \vResR1 vfmadb \vResI1, \vi1, \vrB, \vResI1 vfmsdb \vResR2, \vr2, \vrB, \vResR2 vfmadb \vResI2, \vi2, \vrB, \vResI2 vfmsdb \vResR3, \vr1, \vrB2, \vResR3 vfmadb \vResI3, \vi1, \vrB2, \vResI3 vfmsdb \vResR4, \vr2, \vrB2, \vResR4 vfmadb \vResI4, \vi2, \vrB2, \vResI4 #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) vfmadb \vResR1, \vi1, \viB, \vResR1 vfmsdb \vResI1, \vr1, \viB, \vResI1 vfmadb \vResR2, \vi2, \viB, \vResR2 vfmsdb \vResI2, \vr2, \viB, \vResI2 vfmadb \vResR3, \vi1, \viB2, \vResR3 vfmsdb \vResI3, \vr1, \viB2, \vResI3 vfmadb \vResR4, \vi2, \viB2, \vResR4 vfmsdb \vResI4, \vr2, \viB2, \vResI4 vfmadb \vResR1, \vr1, \vrB, \vResR1 vfmsdb \vResI1, \vi1, \vrB, \vResI1 vfmadb \vResR2, \vr2, \vrB, \vResR2 vfmsdb \vResI2, \vi2, \vrB, \vResI2 vfmadb \vResR3, \vr1, \vrB2, \vResR3 vfmsdb \vResI3, \vi1, \vrB2, \vResI3 vfmadb \vResR4, \vr2, \vrB2, \vResR4 vfmsdb \vResI4, \vi2, \vrB2, \vResI4 #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) vfmadb \vResR1, \vi1, \viB, \vResR1 vfmsdb \vResI1, \vi1, \vrB, \vResI1 vfmadb \vResR2, \vi2, \viB, \vResR2 vfmsdb \vResI2, \vi2, \vrB, \vResI2 vfmadb \vResR3, \vi1, \viB2, \vResR3 vfmsdb \vResI3, \vi1, \vrB2, \vResI3 vfmadb \vResR4, \vi2, \viB2, \vResR4 vfmsdb \vResI4, \vi2, \vrB2, \vResI4 vfmadb \vResR1, \vr1, \vrB, \vResR1 vfmsdb \vResI1, \vr1, \viB, \vResI1 vfmadb \vResR2, \vr2, \vrB, \vResR2 vfmsdb \vResI2, \vr2, \viB, \vResI2 vfmadb \vResR3, \vr1, \vrB2, \vResR3 vfmsdb \vResI3, \vr1, \viB2, \vResI3 vfmadb \vResR4, \vr2, \vrB2, \vResR4 vfmsdb \vResI4, \vr2, \viB2, \vResI4 #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) vfmsdb \vResR1, \vr1, \vrB, \vResR1 vfmadb \vResI1, 
\vi1, \vrB, \vResI1 vfmsdb \vResR2, \vr2, \vrB, \vResR2 vfmadb \vResI2, \vi2, \vrB, \vResI2 vfmsdb \vResR3, \vr1, \vrB2, \vResR3 vfmadb \vResI3, \vi1, \vrB2, \vResI3 vfmsdb \vResR4, \vr2, \vrB2, \vResR4 vfmadb \vResI4, \vi2, \vrB2, \vResI4 vfmsdb \vResR1, \vi1, \viB, \vResR1 vfmadb \vResI1, \vr1, \viB, \vResI1 vfmsdb \vResR2, \vi2, \viB, \vResR2 vfmadb \vResI2, \vr2, \viB, \vResI2 vfmsdb \vResR3, \vi1, \viB2, \vResR3 vfmadb \vResI3, \vr1, \viB2, \vResI3 vfmsdb \vResR4, \vi2, \viB2, \vResR4 vfmadb \vResI4, \vr2, \viB2, \vResI4 #endif .endm /* Calculate for 2x4 inner */ .macro CalcComplex_2x4 vResR1, vResI1, vResR2, vResI2, vResR3, vResI3, vResR4, vResI4, vr1, vi1, vr2, vi2, vrB, viB,vrB2, viB2 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) vfmsdb \vResR1, \vi1, \viB, \vResR1 vfmadb \vResI1, \vr1, \viB, \vResI1 vfmsdb \vResR2, \vi2, \viB, \vResR2 vfmadb \vResI2, \vr2, \viB, \vResI2 vfmsdb \vResR3, \vi1, \viB2, \vResR3 vfmadb \vResI3, \vr1, \viB2, \vResI3 vfmsdb \vResR4, \vi2, \viB2, \vResR4 vfmadb \vResI4, \vr2, \viB2, \vResI4 vfmsdb \vResR1, \vr1, \vrB, \vResR1 vfmadb \vResI1, \vi1, \vrB, \vResI1 vfmsdb \vResR2, \vr2, \vrB, \vResR2 vfmadb \vResI2, \vi2, \vrB, \vResI2 vfmsdb \vResR3, \vr1, \vrB2, \vResR3 vfmadb \vResI3, \vi1, \vrB2, \vResI3 vfmsdb \vResR4, \vr2, \vrB2, \vResR4 vfmadb \vResI4, \vi2, \vrB2, \vResI4 #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) vfmadb \vResR1, \vi1, \viB, \vResR1 vfmsdb \vResI1, \vr1, \viB, \vResI1 vfmadb \vResR2, \vi2, \viB, \vResR2 vfmsdb \vResI2, \vr2, \viB, \vResI2 vfmadb \vResR3, \vi1, \viB2, \vResR3 vfmsdb \vResI3, \vr1, \viB2, \vResI3 vfmadb \vResR4, \vi2, \viB2, \vResR4 vfmsdb \vResI4, \vr2, \viB2, \vResI4 vfmadb \vResR1, \vr1, \vrB, \vResR1 vfmsdb \vResI1, \vi1, \vrB, \vResI1 vfmadb \vResR2, \vr2, \vrB, \vResR2 vfmsdb \vResI2, \vi2, \vrB, \vResI2 vfmadb \vResR3, \vr1, \vrB2, \vResR3 vfmsdb \vResI3, \vi1, \vrB2, \vResI3 vfmadb \vResR4, \vr2, \vrB2, \vResR4 vfmsdb \vResI4, \vi2, \vrB2, \vResI4 #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) vfmadb \vResR1, \vi1, \viB, \vResR1 vfmsdb \vResI1, \vi1, \vrB, \vResI1 vfmadb \vResR2, \vi2, \viB, \vResR2 vfmsdb \vResI2, \vi2, \vrB, \vResI2 vfmadb \vResR3, \vi1, \viB2, \vResR3 vfmsdb \vResI3, \vi1, \vrB2, \vResI3 vfmadb \vResR4, \vi2, \viB2, \vResR4 vfmsdb \vResI4, \vi2, \vrB2, \vResI4 vfmadb \vResR1, \vr1, \vrB, \vResR1 vfmsdb \vResI1, \vr1, \viB, \vResI1 vfmadb \vResR2, \vr2, \vrB, \vResR2 vfmsdb \vResI2, \vr2, \viB, \vResI2 vfmadb \vResR3, \vr1, \vrB2, \vResR3 vfmsdb \vResI3, \vr1, \viB2, \vResI3 vfmadb \vResR4, \vr2, \vrB2, \vResR4 vfmsdb \vResI4, \vr2, \viB2, \vResI4 #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) vfmsdb \vResR1, \vr1, \vrB, \vResR1 vfmadb \vResI1, \vi1, \vrB, \vResI1 vfmsdb \vResR2, \vr2, \vrB, \vResR2 vfmadb \vResI2, \vi2, \vrB, \vResI2 vfmsdb \vResR3, \vr1, \vrB2, \vResR3 vfmadb \vResI3, \vi1, \vrB2, \vResI3 vfmsdb \vResR4, \vr2, \vrB2, \vResR4 vfmadb \vResI4, \vi2, \vrB2, \vResI4 vfmsdb \vResR1, \vi1, \viB, \vResR1 vfmadb \vResI1, \vr1, \viB, \vResI1 vfmsdb \vResR2, \vi2, \viB, \vResR2 vfmadb \vResI2, \vr2, \viB, \vResI2 vfmsdb \vResR3, \vi1, \viB2, \vResR3 vfmadb \vResI3, \vr1, \viB2, \vResI3 vfmsdb \vResR4, \vi2, \viB2, \vResR4 vfmadb \vResI4, \vr2, \viB2, \vResI4 #endif .endm /* Calculate for 2x2 inner */ .macro CalcComplex_2x2 vResR1, vResI1,vResR2, vResI2, vR1, vI1, vRB, vIB, vRB2, vIB2 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) vfmsdb \vResR1, \vI1, \vIB, \vResR1 vfmadb \vResI1, \vR1, 
\vIB, \vResI1 vfmsdb \vResR2, \vI1, \vIB2, \vResR2 vfmadb \vResI2, \vR1, \vIB2, \vResI2 vfmsdb \vResR1, \vR1, \vRB, \vResR1 vfmadb \vResI1, \vI1, \vRB, \vResI1 vfmsdb \vResR2, \vR1, \vRB2, \vResR2 vfmadb \vResI2, \vI1, \vRB2, \vResI2 #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) vfmadb \vResR1, \vI1, \vIB, \vResR1 vfmsdb \vResI1, \vR1, \vIB, \vResI1 vfmadb \vResR2, \vI1, \vIB2, \vResR2 vfmsdb \vResI2, \vR1, \vIB2, \vResI2 vfmadb \vResR1, \vR1, \vRB, \vResR1 vfmsdb \vResI1, \vI1, \vRB, \vResI1 vfmadb \vResR2, \vR1, \vRB2, \vResR2 vfmsdb \vResI2, \vI1, \vRB2, \vResI2 #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) vfmadb \vResR1, \vI1, \vIB, \vResR1 vfmsdb \vResI1, \vI1, \vRB, \vResI1 vfmadb \vResR2, \vI1, \vIB2, \vResR2 vfmsdb \vResI2, \vI1, \vRB2, \vResI2 vfmadb \vResR1, \vR1, \vRB, \vResR1 vfmsdb \vResI1, \vR1, \vIB, \vResI1 vfmadb \vResR2, \vR1, \vRB2, \vResR2 vfmsdb \vResI2, \vR1, \vIB2, \vResI2 #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) vfmsdb \vResR1, \vR1, \vRB, \vResR1 vfmadb \vResI1, \vI1, \vRB, \vResI1 vfmsdb \vResR2, \vR1, \vRB2, \vResR2 vfmadb \vResI2, \vI1, \vRB2, \vResI2 vfmsdb \vResR1, \vI1, \vIB, \vResR1 vfmadb \vResI1, \vR1, \vIB, \vResI1 vfmsdb \vResR2, \vI1, \vIB2, \vResR2 vfmadb \vResI2, \vR1, \vIB2, \vResI2 #endif .endm /* Calculate for 2x1 inner */ .macro CalcComplex_2x1 vRealResult1, vImageResult1, vReal1, vImage1, vecRealB, vecImageB #if defined(NN) || defined(NT) || defined(TN) || defined(TT) vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 #endif .endm /* Calculate for 1x2 inner */ .macro CalcComplex_1x2 vRealResult1, vImageResult1, vReal1, vImage1, vecRealB, vecImageB #if defined(NN) || defined(NT) || defined(TN) || defined(TT) vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 #endif #if defined(RN) || defined(CN) || defined(RT) || defined(CT) vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 #endif #if defined(NR) || defined(TR) || defined(NC) || defined(TC) vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 
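/* second multiply-add pair below: fold the \vReal1-sourced products into both accumulators (\vReal1*\vecRealB into the real sum, \vReal1*\vecImageB into the imaginary sum) */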
vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 #endif .endm /* Calculate for 4x1 inner */ .macro CalcComplex_4x1 vRealResult1, vImageResult1, vRealResult2, vImageResult2, vReal1, vImage1, vReal2, vImage2, vecRealB, vecImageB #if defined(NN) || defined(NT) || defined(TN) || defined(TT) vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmsdb \vRealResult2, \vImage2, \vecImageB, \vRealResult2 vfmadb \vImageResult2, \vReal2, \vecImageB, \vImageResult2 vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmsdb \vRealResult2, \vReal2, \vecRealB, \vRealResult2 vfmadb \vImageResult2, \vImage2, \vecRealB, \vImageResult2 #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmadb \vRealResult2, \vImage2, \vecImageB, \vRealResult2 vfmsdb \vImageResult2, \vReal2, \vecImageB, \vImageResult2 vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmadb \vRealResult2, \vReal2, \vecRealB, \vRealResult2 vfmsdb \vImageResult2, \vImage2, \vecRealB, \vImageResult2 #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmadb \vRealResult2, \vImage2, \vecImageB, \vRealResult2 vfmsdb \vImageResult2, \vImage2, \vecRealB, \vImageResult2 vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmadb \vRealResult2, \vReal2, \vecRealB, \vRealResult2 vfmsdb \vImageResult2, \vReal2, \vecImageB, \vImageResult2 #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmsdb \vRealResult2, \vReal2, \vecRealB, \vRealResult2 vfmadb \vImageResult2, \vImage2, \vecRealB, \vImageResult2 vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmsdb \vRealResult2, \vImage2, \vecImageB, \vRealResult2 vfmadb \vImageResult2, \vReal2, \vecImageB, \vImageResult2 #endif .endm /* Calculate for 1x4 inner */ .macro CalcComplex_1x4 vRealResult1, vImageResult1, vRealResult2, vImageResult2, vReal1, vImage1, vReal2, vImage2, vecRealB, vecImageB #if defined(NN) || defined(NT) || defined(TN) || defined(TT) vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmsdb \vRealResult2, \vImage2, \vecImageB, \vRealResult2 vfmadb \vImageResult2, \vReal2, \vecImageB, \vImageResult2 vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmsdb \vRealResult2, \vReal2, \vecRealB, \vRealResult2 vfmadb \vImageResult2, \vImage2, \vecRealB, \vImageResult2 #endif #if defined(RN) || defined(CN) || defined(RT) || defined(CT) vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmsdb 
\vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmadb \vRealResult2, \vImage2, \vecImageB, \vRealResult2 vfmsdb \vImageResult2, \vReal2, \vecImageB, \vImageResult2 vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmadb \vRealResult2, \vReal2, \vecRealB, \vRealResult2 vfmsdb \vImageResult2, \vImage2, \vecRealB, \vImageResult2 #endif #if defined(NR) || defined(TR) || defined(NC) || defined(TC) vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmadb \vRealResult2, \vImage2, \vecImageB, \vRealResult2 vfmsdb \vImageResult2, \vImage2, \vecRealB, \vImageResult2 vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmadb \vRealResult2, \vReal2, \vecRealB, \vRealResult2 vfmsdb \vImageResult2, \vReal2, \vecImageB, \vImageResult2 #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmsdb \vRealResult2, \vReal2, \vecRealB, \vRealResult2 vfmadb \vImageResult2, \vImage2, \vecRealB, \vImageResult2 vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmsdb \vRealResult2, \vImage2, \vecImageB, \vRealResult2 vfmadb \vImageResult2, \vReal2, \vecImageB, \vImageResult2 #endif .endm .macro CalcComplex_1x1 RealResult1, ImageResult1, Real1, Image1, RealB, ImageB #if defined(NN) || defined(NT) || defined(TN) || defined(TT) msdbr \RealResult1, \Image1, \ImageB madbr \ImageResult1, \Real1, \ImageB msdbr \RealResult1, \Real1, \RealB madbr \ImageResult1, \Image1, \RealB #endif #if defined(NR) || defined(NC) || defined(TR) || defined(TC) madbr \RealResult1, \Image1, \ImageB msdbr \ImageResult1, \Real1, \ImageB madbr \RealResult1, \Real1, \RealB msdbr \ImageResult1, \Image1, \RealB #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) madbr \RealResult1, \Image1, \ImageB msdbr \ImageResult1, \Image1, \RealB madbr \RealResult1, \Real1, \RealB msdbr \ImageResult1, \Real1, \ImageB #endif #if defined(RR) || defined(RC) || defined(CR) || defined(CC) msdbr \RealResult1, \Real1, \RealB madbr \ImageResult1, \Image1, \RealB msdbr \RealResult1, \Image1, \ImageB madbr \ImageResult1, \Real1, \ImageB #endif .endm #define DISP(ind,stride,disp) (ind*stride+disp) #define DISP64(ind,disp) (ind*64+disp) #define DISP32(ind,disp) (ind*32+disp) #define DISP16(ind,disp) (ind*16+disp) #define USE_VLM 1 .macro ZCALC_4x4_I PTR_A_REG,PTR_B_REG,Index,IsLast #if defined(USE_VLM) vlm %v4,%v7, DISP64(\Index ,0) (\PTR_A_REG) #else vl %v4 , DISP64(\Index ,0) (\PTR_A_REG) vl %v5 , DISP64(\Index ,16)(\PTR_A_REG) vl %v6 , DISP64(\Index ,32)(\PTR_A_REG) vl %v7 , DISP64(\Index ,48)(\PTR_A_REG) #endif vlrepg %v9, DISP64(\Index ,0)(\PTR_B_REG) vlrepg %v10 , DISP64(\Index ,8)(\PTR_B_REG) vlrepg %v11, DISP64(\Index ,16)(\PTR_B_REG) vlrepg %v12 , DISP64(\Index ,24)(\PTR_B_REG) vpdi %v1,%v4,%v5,0 vpdi %v5,%v4,%v5,0b101 vpdi %v3,%v6,%v7,0 vpdi %v7,%v6,%v7,0b101 CalcComplex_4x2 %v16,%v17,%v18,%v19,%v20,%v21,%v22,%v23,%v1,%v5,%v3,%v7,%v9,%v10,%v11,%v12 vlrepg %v9, DISP64(\Index ,32)(\PTR_B_REG) vlrepg %v10 , DISP64(\Index ,40)(\PTR_B_REG) vlrepg %v11, DISP64(\Index ,48)(\PTR_B_REG) vlrepg %v12 , DISP64(\Index ,56)(\PTR_B_REG) .if \IsLast==1 la \PTR_A_REG, DISP64(\Index ,64)(\PTR_A_REG) .endif CalcComplex_4x2 
%v24,%v25,%v26,%v27,%v28,%v29,%v30,%v31,%v1,%v5,%v3,%v7,%v9,%v10,%v11,%v12 .if \IsLast==1 la \PTR_B_REG, DISP64(\Index ,64)(\PTR_B_REG) .endif .endm .macro ZCALC_4x2_I PTR_A_REG,PTR_B_REG,Index,IsLast #if defined(USE_VLM) vlm %v4,%v7, DISP64(\Index ,0) (\PTR_A_REG) #else vl %v4 , DISP64(\Index ,0) (\PTR_A_REG) vl %v5 , DISP64(\Index ,16)(\PTR_A_REG) vl %v6 , DISP64(\Index ,32)(\PTR_A_REG) vl %v7 , DISP64(\Index ,48)(\PTR_A_REG) #endif vlrepg %v9, DISP32(\Index ,0)(\PTR_B_REG) vlrepg %v10 , DISP32(\Index ,8)(\PTR_B_REG) vlrepg %v11, DISP32(\Index ,16)(\PTR_B_REG) vlrepg %v12 , DISP32(\Index ,24)(\PTR_B_REG) vpdi %v1,%v4,%v5,0 vpdi %v5,%v4,%v5,0b101 vpdi %v3,%v6,%v7,0 vpdi %v7,%v6,%v7,0b101 .if \IsLast==1 la \PTR_A_REG, DISP64(\Index ,64)(\PTR_A_REG) .endif CalcComplex_4x2 %v16,%v17,%v18,%v19,%v20,%v21,%v22,%v23,%v1,%v5,%v3,%v7,%v9,%v10,%v11,%v12 .if \IsLast==1 la \PTR_B_REG, DISP32(\Index ,32)(\PTR_B_REG) .endif .endm .macro ZCALC_2x4_I PTR_A_REG,PTR_B_REG,Index,IsLast #if defined(USE_VLM) vlm %v4,%v7, DISP64(\Index ,0) (\PTR_B_REG) #else vl %v4 , DISP64(\Index ,0) (\PTR_B_REG) vl %v5 , DISP64(\Index ,16)(\PTR_B_REG) vl %v6 , DISP64(\Index ,32)(\PTR_B_REG) vl %v7 , DISP64(\Index ,48)(\PTR_B_REG) #endif vlrepg %v9, DISP32(\Index ,0)(\PTR_A_REG) vlrepg %v10 , DISP32(\Index ,8)(\PTR_A_REG) vlrepg %v11, DISP32(\Index ,16)(\PTR_A_REG) vlrepg %v12 , DISP32(\Index ,24)(\PTR_A_REG) vpdi %v1,%v4,%v5,0 vpdi %v5,%v4,%v5,0b101 vpdi %v3,%v6,%v7,0 vpdi %v7,%v6,%v7,0b101 .if \IsLast==1 la \PTR_B_REG, DISP64(\Index ,64)(\PTR_B_REG) .endif CalcComplex_2x4 %v16,%v17,%v18,%v19,%v20,%v21,%v22,%v23,%v1,%v5,%v3,%v7,%v9,%v10,%v11,%v12 .if \IsLast==1 la \PTR_A_REG, DISP32(\Index ,32)(\PTR_A_REG) .endif .endm .macro ZCALC_4x1_I PTR_A_REG,PTR_B_REG,Index,IsLast #if defined(USE_VLM) vlm %v4,%v7, DISP64(\Index ,0) (\PTR_A_REG) #else vl %v4 , DISP64(\Index ,0) (\PTR_A_REG) vl %v5 , DISP64(\Index ,16)(\PTR_A_REG) vl %v6 , DISP64(\Index ,32)(\PTR_A_REG) vl %v7 , DISP64(\Index ,48)(\PTR_A_REG) #endif vlrepg %v9, DISP16(\Index ,0)(\PTR_B_REG) vlrepg %v10 , DISP16(\Index ,8)(\PTR_B_REG) vpdi %v1,%v4,%v5,0 vpdi %v11,%v4,%v5,0b101 vpdi %v3,%v6,%v7,0 vpdi %v12,%v6,%v7,0b101 .if \IsLast==1 la \PTR_A_REG, DISP64(\Index ,64)(\PTR_A_REG) .endif CalcComplex_4x1 %v16,%v17,%v18,%v19,%v1,%v11,%v3,%v12,%v9,%v10 .if \IsLast==1 la \PTR_B_REG, DISP16(\Index ,16)(\PTR_B_REG) .endif .endm .macro ZCALC_1x4_I PTR_A_REG,PTR_B_REG,Index,IsLast #if defined(USE_VLM) vlm %v4,%v7, DISP64(\Index ,0) (\PTR_B_REG) #else vl %v4 , DISP64(\Index ,0) (\PTR_B_REG) vl %v5 , DISP64(\Index ,16)(\PTR_B_REG) vl %v6 , DISP64(\Index ,32)(\PTR_B_REG) vl %v7 , DISP64(\Index ,48)(\PTR_B_REG) #endif vlrepg %v9, DISP16(\Index ,0)(\PTR_A_REG) vlrepg %v10 , DISP16(\Index ,8)(\PTR_A_REG) vpdi %v1,%v4,%v5,0 vpdi %v11,%v4,%v5,0b101 vpdi %v3,%v6,%v7,0 vpdi %v12,%v6,%v7,0b101 .if \IsLast==1 la \PTR_B_REG, DISP64(\Index ,64)(\PTR_B_REG) .endif CalcComplex_1x4 %v16,%v17,%v18,%v19,%v1,%v11,%v3,%v12,%v9,%v10 .if \IsLast==1 la \PTR_A_REG, DISP16(\Index ,16)(\PTR_A_REG) .endif .endm .macro ZCALC_2x2_I PTR_A_REG,PTR_B_REG ,Index,IsLast vl %v1 , DISP32(\Index ,0)(\PTR_A_REG) vl %v3 , DISP32(\Index ,16)(\PTR_A_REG) vlrepg %v9, DISP32(\Index ,0)(\PTR_B_REG) vlrepg %v10 , DISP32(\Index ,8)(\PTR_B_REG) vlrepg %v11, DISP32(\Index ,16)(\PTR_B_REG) vlrepg %v12 , DISP32(\Index ,24)(\PTR_B_REG) vpdi %v5,%v1,%v3,0 vpdi %v6,%v1,%v3,0b101 .if \IsLast==1 la \PTR_A_REG, DISP32(\Index ,32)(\PTR_A_REG) .endif CalcComplex_2x2 %v16,%v17,%v20,%v21,%v5,%v6, %v9,%v10,%v11,%v12 .if \IsLast==1 la 
\PTR_B_REG, DISP32(\Index ,32)(\PTR_B_REG) .endif .endm .macro ZCALC_2x1_I PTR_A_REG,PTR_B_REG ,Index,IsLast vl %v1 , DISP32(\Index ,0)(\PTR_A_REG) vl %v3 , DISP32(\Index ,16)(\PTR_A_REG) vlrepg %v6, DISP16(\Index ,0)(\PTR_B_REG) vlrepg %v7 , DISP16(\Index ,8)(\PTR_B_REG) vpdi %v4,%v1,%v3,0 vpdi %v5,%v1,%v3,0b101 .if \IsLast==1 la \PTR_A_REG, DISP32(\Index ,32)(\PTR_A_REG) .endif CalcComplex_2x1 %v16,%v17,%v4,%v5,%v6,%v7 .if \IsLast==1 la \PTR_B_REG, DISP16(\Index ,16)(\PTR_B_REG) .endif .endm .macro ZCALC_1x2_I PTR_A_REG,PTR_B_REG ,Index,IsLast vl %v1 , DISP32(\Index ,0)(\PTR_B_REG) vl %v3 , DISP32(\Index ,16)(\PTR_B_REG) vlrepg %v6, DISP16(\Index ,0)(\PTR_A_REG) vlrepg %v7 , DISP16(\Index ,8)(\PTR_A_REG) vpdi %v4,%v1,%v3,0 vpdi %v5,%v1,%v3,0b101 .if \IsLast==1 la \PTR_B_REG, DISP32(\Index ,32)(\PTR_B_REG) .endif CalcComplex_1x2 %v16,%v17,%v4,%v5,%v6,%v7 .if \IsLast==1 la \PTR_A_REG, DISP16(\Index ,16)(\PTR_A_REG) .endif .endm .macro ZCALC_1x1_I PTR_A_REG,PTR_B_REG ,Index,IsLast ld %f1 , DISP16(\Index ,0)(\PTR_A_REG) ld %f3 , DISP16(\Index ,8)(\PTR_A_REG) ld %f4 , DISP16(\Index ,0)(\PTR_B_REG) ld %f5 , DISP16(\Index ,8)(\PTR_B_REG) .if \IsLast==1 la \PTR_A_REG, DISP16(\Index ,16)(\PTR_A_REG) .endif CalcComplex_1x1 %f6,%f7,%f1,%f3,%f4,%f5 .if \IsLast==1 la \PTR_B_REG, DISP16(\Index ,16)(\PTR_B_REG) .endif .endm .macro ZCALC_4x4 PTR_A_REG,PTR_B_REG ZCALC_4x4_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro ZCALC_4x2 PTR_A_REG,PTR_B_REG ZCALC_4x2_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro ZCALC_4x1 PTR_A_REG,PTR_B_REG ZCALC_4x1_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro ZCALC_4x4_4 PTR_A_REG,PTR_B_REG ZCALC_4x4_I \PTR_A_REG,\PTR_B_REG,0,0 ZCALC_4x4_I \PTR_A_REG,\PTR_B_REG,1,0 ZCALC_4x4_I \PTR_A_REG,\PTR_B_REG,2,0 ZCALC_4x4_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro ZCALC_4x2_4 PTR_A_REG,PTR_B_REG ZCALC_4x2_I \PTR_A_REG,\PTR_B_REG,0,0 ZCALC_4x2_I \PTR_A_REG,\PTR_B_REG,1,0 ZCALC_4x2_I \PTR_A_REG,\PTR_B_REG,2,0 ZCALC_4x2_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro ZCALC_4x1_4 PTR_A_REG,PTR_B_REG ZCALC_4x1_I \PTR_A_REG,\PTR_B_REG,0,0 ZCALC_4x1_I \PTR_A_REG,\PTR_B_REG,1,0 ZCALC_4x1_I \PTR_A_REG,\PTR_B_REG,2,0 ZCALC_4x1_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro ZCALC_2x4_4 PTR_A_REG,PTR_B_REG ZCALC_2x4_I \PTR_A_REG,\PTR_B_REG,0,0 ZCALC_2x4_I \PTR_A_REG,\PTR_B_REG,1,0 ZCALC_2x4_I \PTR_A_REG,\PTR_B_REG,2,0 ZCALC_2x4_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro ZCALC_2x4 PTR_A_REG,PTR_B_REG ZCALC_2x4_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro ZCALC_1x4_4 PTR_A_REG,PTR_B_REG ZCALC_1x4_I \PTR_A_REG,\PTR_B_REG,0,0 ZCALC_1x4_I \PTR_A_REG,\PTR_B_REG,1,0 ZCALC_1x4_I \PTR_A_REG,\PTR_B_REG,2,0 ZCALC_1x4_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro ZCALC_1x4 PTR_A_REG,PTR_B_REG ZCALC_1x4_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro ZCALC_2x2 PTR_A_REG,PTR_B_REG ZCALC_2x2_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro ZCALC_2x2_4 PTR_A_REG,PTR_B_REG ZCALC_2x2_I \PTR_A_REG,\PTR_B_REG,0,0 ZCALC_2x2_I \PTR_A_REG,\PTR_B_REG,1,0 ZCALC_2x2_I \PTR_A_REG,\PTR_B_REG,2,0 ZCALC_2x2_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro ZCALC_2x1 PTR_A_REG,PTR_B_REG ZCALC_2x1_I \PTR_A_REG,\PTR_B_REG,0,1 .endm .macro ZCALC_2x1_4 PTR_A_REG,PTR_B_REG ZCALC_2x1_I \PTR_A_REG,\PTR_B_REG,0,0 ZCALC_2x1_I \PTR_A_REG,\PTR_B_REG,1,0 ZCALC_2x1_I \PTR_A_REG,\PTR_B_REG,2,0 ZCALC_2x1_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro ZCALC_1x2_4 PTR_A_REG,PTR_B_REG ZCALC_1x2_I \PTR_A_REG,\PTR_B_REG,0,0 ZCALC_1x2_I \PTR_A_REG,\PTR_B_REG,1,0 ZCALC_1x2_I \PTR_A_REG,\PTR_B_REG,2,0 ZCALC_1x2_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro ZCALC_1x2 PTR_A_REG,PTR_B_REG ZCALC_1x2_I \PTR_A_REG,\PTR_B_REG,0,1 
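/* Index=0 with IsLast=1: a single k-step that also advances PTR_A_REG and PTR_B_REG */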
.endm .macro ZCALC_1x1_4 PTR_A_REG,PTR_B_REG ZCALC_1x1_I \PTR_A_REG,\PTR_B_REG,0,0 ZCALC_1x1_I \PTR_A_REG,\PTR_B_REG,1,0 ZCALC_1x1_I \PTR_A_REG,\PTR_B_REG,2,0 ZCALC_1x1_I \PTR_A_REG,\PTR_B_REG,3,1 .endm .macro ZCALC_1x1 PTR_A_REG,PTR_B_REG ZCALC_1x1_I \PTR_A_REG,\PTR_B_REG,0,1 .endm /*****************************STORE RESULTS************************************/ .macro CalcMultAlpha_4x1 vRealResult1, vImageResult1, vRealResult2, vImageResult2, vReal1, vImage1, vReal2, vImage2, vecRealB, vecImageB #if defined (TRMMKERNEL) vfmdb \vRealResult1, \vImage1, \vecImageB vfmdb \vImageResult1, \vReal1, \vecImageB vfmdb \vRealResult2, \vImage2, \vecImageB vfmdb \vImageResult2, \vReal2, \vecImageB #else vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 vfmsdb \vRealResult2, \vImage2, \vecImageB, \vRealResult2 vfmadb \vImageResult2, \vReal2, \vecImageB, \vImageResult2 #endif vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 vfmsdb \vRealResult2, \vReal2, \vecRealB, \vRealResult2 vfmadb \vImageResult2, \vImage2, \vecRealB, \vImageResult2 .endm .macro CalcMultAlpha_2x1 vRealResult1, vImageResult1, vReal1, vImage1, vecRealB, vecImageB #if defined (TRMMKERNEL) vfmdb \vRealResult1, \vImage1, \vecImageB vfmdb \vImageResult1, \vReal1, \vecImageB #else vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1 vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1 #endif vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1 vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1 .endm .macro CalcMultAlpha_1x1 RealResult1, ImageResult1, Real1, Image1, RealB, ImageB msdbr \RealResult1, \Image1, \ImageB madbr \ImageResult1, \Real1, \ImageB msdbr \RealResult1, \Real1, \RealB madbr \ImageResult1, \Image1, \RealB .endm .macro ZSTORE_4x4 ALPHA_VECREG,ALPHA_VECI,CIJ_REG , LDC_BYTE_ORIGINAL ,LC1,LC2 #if !defined(TRMMKERNEL) vl %v1 , 0(\CIJ_REG) vl %v4 , 16(\CIJ_REG) vpdi %v3,%v1,%v4,0 vl %v7 , 32(\CIJ_REG) vpdi %v4,%v1,%v4,0b101 vl %v6 , 48 (\CIJ_REG) vpdi %v1,%v7,%v6,0 vpdi %v6,%v7,%v6,0b101 #endif la \LC1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) CalcMultAlpha_4x1 %v3,%v4,%v1,%v6,%v16,%v17,%v18,%v19,\ALPHA_VECREG,\ALPHA_VECI vpdi %v16, %v3 ,%v4,0 la \LC2,0(\LC1,\LDC_BYTE_ORIGINAL ) vpdi %v17, %v3,%v4,0b0101 vst %v16,0(\CIJ_REG) vpdi %v18, %v1 ,%v6,0 vst %v17,16(\CIJ_REG) vpdi %v19, %v1 ,%v6,0b0101 vst %v18,32(\CIJ_REG) vst %v19,48(\CIJ_REG) #if !defined(TRMMKERNEL) vl %v1 , 0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vl %v4 , 16(\CIJ_REG,\LDC_BYTE_ORIGINAL) vpdi %v3,%v1,%v4,0 vl %v7 , 32(\CIJ_REG,\LDC_BYTE_ORIGINAL) vpdi %v4,%v1,%v4,0b101 vl %v6 , 48 (\CIJ_REG,\LDC_BYTE_ORIGINAL) vpdi %v1,%v7,%v6,0 vpdi %v6,%v7,%v6,0b101 #endif CalcMultAlpha_4x1 %v3,%v4,%v1,%v6,%v20,%v21,%v22,%v23,\ALPHA_VECREG,\ALPHA_VECI vpdi %v16, %v3 ,%v4,0 vpdi %v17, %v3 ,%v4,0b0101 vst %v16,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vpdi %v18, %v1 ,%v6,0 vst %v17,16(\CIJ_REG,\LDC_BYTE_ORIGINAL) vpdi %v19, %v1 ,%v6,0b0101 vst %v18,32(\CIJ_REG,\LDC_BYTE_ORIGINAL) vst %v19,48(\CIJ_REG,\LDC_BYTE_ORIGINAL) #if !defined(TRMMKERNEL) vl %v1 , 0(\CIJ_REG,\LC1) vl %v4 , 16(\CIJ_REG,\LC1) vpdi %v3,%v1,%v4,0 vl %v7 , 32(\CIJ_REG,\LC1) vpdi %v4,%v1,%v4,0b101 vl %v6 , 48 (\CIJ_REG,\LC1) vpdi %v1,%v7,%v6,0 vpdi %v6,%v7,%v6,0b101 #endif CalcMultAlpha_4x1 %v3,%v4,%v1,%v6,%v24,%v25,%v26,%v27,\ALPHA_VECREG,\ALPHA_VECI vpdi %v16, %v3 ,%v4,0 vpdi %v17, %v3 ,%v4,0b0101 vst %v16,0(\CIJ_REG,\LC1) vpdi %v18, %v1 ,%v6,0 vst %v17,16(\CIJ_REG,\LC1) vpdi %v19, 
%v1 ,%v6,0b0101 vst %v18,32(\CIJ_REG,\LC1) vst %v19,48(\CIJ_REG,\LC1) #if !defined(TRMMKERNEL) vl %v1 , 0(\CIJ_REG,\LC2) vl %v4 , 16(\CIJ_REG,\LC2) vpdi %v3,%v1,%v4,0 vl %v7 , 32(\CIJ_REG,\LC2) vpdi %v4,%v1,%v4,0b101 vl %v6 , 48 (\CIJ_REG,\LC2) vpdi %v1,%v7,%v6,0 vpdi %v6,%v7,%v6,0b101 #endif CalcMultAlpha_4x1 %v3,%v4,%v1,%v6,%v28,%v29,%v30,%v31,\ALPHA_VECREG,\ALPHA_VECI vpdi %v16, %v3 ,%v4,0 vpdi %v17, %v3 ,%v4,0b0101 vst %v16,0(\CIJ_REG,\LC2) vpdi %v18, %v1 ,%v6,0 vst %v17,16(\CIJ_REG,\LC2) vpdi %v19, %v1 ,%v6,0b0101 vst %v18,32(\CIJ_REG,\LC2) vst %v19,48(\CIJ_REG,\LC2) la \CIJ_REG,64(\CIJ_REG) .endm .macro ZSTORE_4x2 ALPHA_VECREG,ALPHA_VECI,CIJ_REG , LDC_BYTE_ORIGINAL #if !defined(TRMMKERNEL) vl %v1 , 0(\CIJ_REG) vl %v4 , 16(\CIJ_REG) vpdi %v3,%v1,%v4,0 vl %v7 , 32(\CIJ_REG) vpdi %v4,%v1,%v4,0b101 vl %v6 , 48 (\CIJ_REG) vpdi %v1,%v7,%v6,0 vpdi %v6,%v7,%v6,0b101 #endif CalcMultAlpha_4x1 %v3,%v4,%v1,%v6,%v16,%v17,%v18,%v19,\ALPHA_VECREG,\ALPHA_VECI vpdi %v16, %v3 ,%v4,0 vpdi %v17, %v3,%v4,0b0101 vst %v16,0(\CIJ_REG) vpdi %v18, %v1 ,%v6,0 vst %v17,16(\CIJ_REG) vpdi %v19, %v1 ,%v6,0b0101 vst %v18,32(\CIJ_REG) vst %v19,48(\CIJ_REG) #if !defined(TRMMKERNEL) vl %v1 , 0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vl %v4 , 16(\CIJ_REG,\LDC_BYTE_ORIGINAL) vpdi %v3,%v1,%v4,0 vl %v7 , 32(\CIJ_REG,\LDC_BYTE_ORIGINAL) vpdi %v4,%v1,%v4,0b101 vl %v6 , 48 (\CIJ_REG,\LDC_BYTE_ORIGINAL) vpdi %v1,%v7,%v6,0 vpdi %v6,%v7,%v6,0b101 #endif CalcMultAlpha_4x1 %v3,%v4,%v1,%v6,%v20,%v21,%v22,%v23,\ALPHA_VECREG,\ALPHA_VECI vpdi %v20, %v3 ,%v4,0 vpdi %v21, %v3 ,%v4,0b0101 vst %v20,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vpdi %v22, %v1 ,%v6,0 vst %v21,16(\CIJ_REG,\LDC_BYTE_ORIGINAL) vpdi %v23, %v1 ,%v6,0b0101 vst %v22,32(\CIJ_REG,\LDC_BYTE_ORIGINAL) vst %v23,48(\CIJ_REG,\LDC_BYTE_ORIGINAL) la \CIJ_REG,64(\CIJ_REG) .endm .macro ZSTORE_4x1 ALPHA_VECREG,ALPHA_VECI,CIJ_REG , LDC_BYTE_ORIGINAL #if !defined(TRMMKERNEL) vl %v1 , 0(\CIJ_REG) vl %v4 , 16(\CIJ_REG) vpdi %v3,%v1,%v4,0 vl %v7 , 32(\CIJ_REG) vpdi %v4,%v1,%v4,0b101 vl %v6 , 48 (\CIJ_REG) vpdi %v1,%v7,%v6,0 vpdi %v6,%v7,%v6,0b101 #endif CalcMultAlpha_4x1 %v3,%v4,%v1,%v6,%v16,%v17,%v18,%v19,\ALPHA_VECREG,\ALPHA_VECI vpdi %v16, %v3 ,%v4,0 vpdi %v17, %v3,%v4,0b0101 vst %v16,0(\CIJ_REG) vpdi %v18, %v1 ,%v6,0 vst %v17,16(\CIJ_REG) vpdi %v19, %v1 ,%v6,0b0101 vst %v18,32(\CIJ_REG) vst %v19,48(\CIJ_REG) la \CIJ_REG,64(\CIJ_REG) .endm .macro ZSTORE_1x4 ALPHA_VECREG,ALPHA_VECI,CIJ_REG , LDC_BYTE_ORIGINAL,LC1,LC2 #if !defined(TRMMKERNEL) vl %v1 , 0(\CIJ_REG) la \LC1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) vl %v4 , 0(\CIJ_REG, \LDC_BYTE_ORIGINAL) vpdi %v3,%v1,%v4,0 la \LC2,0(\LC1,\LDC_BYTE_ORIGINAL ) vl %v7 , 0(\CIJ_REG, \LC1) vpdi %v4,%v1,%v4,0b101 vl %v6 , 0 (\CIJ_REG,\LC2) vpdi %v1,%v7,%v6,0 vpdi %v6,%v7,%v6,0b101 #else la \LC1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) #endif CalcMultAlpha_4x1 %v3,%v4,%v1,%v6,%v16,%v17,%v18,%v19,\ALPHA_VECREG,\ALPHA_VECI #if defined(TRMMKERNEL) la \LC2,0(\LC1,\LDC_BYTE_ORIGINAL ) #endif vpdi %v16, %v3 ,%v4,0 vpdi %v17, %v3,%v4,0b0101 vst %v16,0(\CIJ_REG) vpdi %v18, %v1 ,%v6,0 vst %v17,0(\CIJ_REG, \LDC_BYTE_ORIGINAL) vpdi %v19, %v1 ,%v6,0b0101 vst %v18,0(\CIJ_REG, \LC1) vst %v19,0(\CIJ_REG,\LC2) la \CIJ_REG,16(\CIJ_REG) .endm .macro ZSTORE_2x4 ALPHA_VECREG,ALPHA_VECI,CIJ_REG , LDC_BYTE_ORIGINAL,LC1,LC2 #if !defined(TRMMKERNEL) vl %v1 , 0(\CIJ_REG) vl %v26 , 16(\CIJ_REG) la \LC1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) vl %v4 , 0(\CIJ_REG, \LDC_BYTE_ORIGINAL) vl %v25 , 16(\CIJ_REG, \LDC_BYTE_ORIGINAL) vpdi %v3,%v1,%v4,0 vpdi %v24,%v26,%v25,0 la 
\LC2,0(\LC1,\LDC_BYTE_ORIGINAL ) vl %v7 , 0(\CIJ_REG, \LC1) vl %v28 , 16(\CIJ_REG, \LC1) vpdi %v4,%v1,%v4,0b101 vpdi %v25,%v26,%v25,0b101 vl %v6 , 0 (\CIJ_REG,\LC2) vl %v27 , 16 (\CIJ_REG,\LC2) vpdi %v1,%v7,%v6,0 vpdi %v6,%v7,%v6,0b101 vpdi %v26,%v28,%v27,0 vpdi %v27,%v28,%v27,0b101 #else la \LC1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL) #endif CalcMultAlpha_4x1 %v3,%v4,%v1,%v6,%v16,%v17,%v18,%v19,\ALPHA_VECREG,\ALPHA_VECI CalcMultAlpha_4x1 %v24,%v25,%v26,%v27,%v20,%v21,%v22,%v23,\ALPHA_VECREG,\ALPHA_VECI #if defined(TRMMKERNEL) la \LC2,0(\LC1,\LDC_BYTE_ORIGINAL ) #endif vpdi %v16, %v3 ,%v4,0 vpdi %v17, %v3,%v4,0b0101 vpdi %v20, %v24 ,%v25,0 vpdi %v21, %v24,%v25,0b0101 vpdi %v22, %v26 ,%v27,0 vpdi %v23, %v26 ,%v27,0b0101 vst %v16,0(\CIJ_REG) vst %v20,16(\CIJ_REG) vpdi %v18, %v1 ,%v6,0 vst %v17,0(\CIJ_REG, \LDC_BYTE_ORIGINAL) vst %v21,16(\CIJ_REG, \LDC_BYTE_ORIGINAL) vpdi %v19, %v1 ,%v6,0b0101 vst %v18,0(\CIJ_REG, \LC1) vst %v22,16(\CIJ_REG, \LC1) vst %v19,0(\CIJ_REG,\LC2) vst %v23,16(\CIJ_REG,\LC2) la \CIJ_REG,32(\CIJ_REG) .endm .macro ZSTORE_2x2 ALPHA_VECREG,ALPHA_VECI,CIJ_REG , LDC_BYTE_ORIGINAL #if !defined(TRMMKERNEL) vl %v1 , 0(\CIJ_REG) vl %v4 , 16(\CIJ_REG) vpdi %v3,%v1,%v4,0 vpdi %v4,%v1,%v4,0b101 vl %v5 , 0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vl %v7 , 16(\CIJ_REG,\LDC_BYTE_ORIGINAL) vpdi %v6,%v5,%v7,0 vpdi %v7,%v5,%v7,0b101 #endif CalcMultAlpha_2x1 %v3,%v4, %v16,%v17,\ALPHA_VECREG,\ALPHA_VECI CalcMultAlpha_2x1 %v6,%v7, %v20,%v21 ,\ALPHA_VECREG,\ALPHA_VECI vpdi %v16, %v3 ,%v4,0 vpdi %v17, %v3,%v4,0b0101 vst %v16,0(\CIJ_REG) vst %v17,16(\CIJ_REG) vpdi %v20, %v6 ,%v7,0 vpdi %v21, %v6 ,%v7,0b0101 vst %v20,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vst %v21,16(\CIJ_REG,\LDC_BYTE_ORIGINAL) la \CIJ_REG,32(\CIJ_REG) .endm .macro ZSTORE_2x1 ALPHA_VECREG,ALPHA_VECI,CIJ_REG , LDC_BYTE_ORIGINAL #if !defined(TRMMKERNEL) vl %v1 , 0(\CIJ_REG) vl %v4 , 16(\CIJ_REG) vpdi %v3,%v1,%v4,0 vpdi %v4,%v1,%v4,0b101 #endif CalcMultAlpha_2x1 %v3,%v4, %v16,%v17,\ALPHA_VECREG,\ALPHA_VECI vpdi %v16, %v3 ,%v4,0 vpdi %v17, %v3,%v4,0b0101 vst %v16,0(\CIJ_REG) vst %v17,16(\CIJ_REG) la \CIJ_REG,32(\CIJ_REG) .endm .macro ZSTORE_1x2 ALPHA_VECREG,ALPHA_VECI,CIJ_REG , LDC_BYTE_ORIGINAL #if !defined(TRMMKERNEL) vl %v1 , 0(\CIJ_REG) vl %v4 , 0(\CIJ_REG,\LDC_BYTE_ORIGINAL) vpdi %v3,%v1,%v4,0 vpdi %v4,%v1,%v4,0b101 #endif CalcMultAlpha_2x1 %v3,%v4, %v16,%v17,\ALPHA_VECREG,\ALPHA_VECI vpdi %v16, %v3 ,%v4,0 vpdi %v17, %v3,%v4,0b0101 vst %v16,0(\CIJ_REG) vst %v17,0(\CIJ_REG,\LDC_BYTE_ORIGINAL) la \CIJ_REG,16(\CIJ_REG) .endm .macro ZSTORE_1x1 ALPHA_RR,ALPHA_RI ,CIJ_REG #if defined (TRMMKERNEL) lzdr %f1 lzdr %f4 #else ld %f1 , 0(\CIJ_REG) ld %f4 , 8(\CIJ_REG ) #endif CalcMultAlpha_1x1 %f1,%f4, %f6,%f7,\ALPHA_RR,\ALPHA_RI std %f1,0(\CIJ_REG) std %f4,8(\CIJ_REG) la \CIJ_REG,16(\CIJ_REG) .endm /****************************TRMM POINTER REFRESH MACROSES*************************/ .macro RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) /* ptrbb = bb;*/ lgr \PTR_B,\B_VAL /*refresh BPOINT*/ #else /* ptrba =ptrba+ off*C_A; ptrbb = bb + off*C_B;*/ .if \C_B==4 .if \C_A==4 sllg \PTR_B, \OFF_VAL,6 agr \PTR_A,\PTR_B /*ptrba+off*4**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==2 sllg \PTR_B, \OFF_VAL,5 la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*2**/ agr \PTR_B, \PTR_B la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==1 sllg \PTR_B, \OFF_VAL,4 agr \PTR_A,\PTR_B /*ptrba+off*4**/ sllg \PTR_B, \OFF_VAL,6 la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .endif .elseif 
\C_B==2 .if \C_A==4 sllg \PTR_B, \OFF_VAL,5 la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*2**/ agr \PTR_A,\PTR_B /*ptrba+off*2**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==2 sllg \PTR_B, \OFF_VAL,5 agr \PTR_A,\PTR_B /*ptrba+off*2**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==1 sllg \PTR_B, \OFF_VAL,4 la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*1**/ agr \PTR_B,\PTR_B /* off+off**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .endif .elseif \C_B==1 .if \C_A==4 sllg \PTR_B, \OFF_VAL,6 agr \PTR_A,\PTR_B /*ptrba+off*4**/ sllg \PTR_B, \OFF_VAL,4 la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==2 sllg \PTR_B, \OFF_VAL,4 la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*1**/ agr \PTR_A,\PTR_B /*ptrba+off*1**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .elseif \C_A==1 sllg \PTR_B, \OFF_VAL,4 agr \PTR_A,\PTR_B /*ptrba+off*1**/ la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/ .endif .endif #endif .endm /**/ .macro RefreshTempBk TEMP_VAL,BK_VAL,OFF_VAL,INCR_A,INCR_B #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) /* temp = bk-off;*/ sgrk \TEMP_VAL,\BK_VAL,\OFF_VAL #elif defined(LEFT) /* temp = off+INCR_A; // number of values in A */ la \TEMP_VAL,\INCR_A(\OFF_VAL) #else /* temp = off+INCR_B // number of values in B*/ la \TEMP_VAL,\INCR_B(\OFF_VAL) #endif .endm .macro RefreshPointersAndOFF TEMP_VAL,BK_VAL,OFF_VAL,PTR_A,C_A,C_B #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) /*temp = bk - off;*/ sgrk \TEMP_VAL,\BK_VAL,\OFF_VAL #ifdef LEFT /*temp -= 8; // number of values in A*/ lay \TEMP_VAL,-\C_A(\TEMP_VAL) #else /*temp -= 4; // number of values in B*/ lay \TEMP_VAL,-\C_B(\TEMP_VAL) #endif /*ptrba += temp*C_A; ptrbb += temp*C_B;*/ .if \C_A==4 sllg \TEMP_VAL, \TEMP_VAL,6 /*temp*4*/ .elseif \C_A==2 sllg \TEMP_VAL, \TEMP_VAL,5 /*temp*2*/ .elseif \C_A==1 sllg \TEMP_VAL, \TEMP_VAL,4 /*temp*1*/ .endif la \PTR_A,0(\PTR_A,\TEMP_VAL) /*ptrba+temp*C_A*/ #endif #ifdef LEFT /*off += \c_A; // number of values in A*/ aghi \OFF_VAL,\C_A #endif .endm OpenBLAS-0.2.20/kernel/zarch/ztrmm4x4V.S000066400000000000000000000447161313527062700175320ustar00rootroot00000000000000/*************************************************************************** Copyright (c) 2013-2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** * 2017/02/26 AbdelRauf (quickwritereader@gmail.com) * BLASTEST : OK * CTEST : OK * TEST : OK **************************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #define ASSEMBLER #include "common.h" /* BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb, FLOAT* C,BLASLONG ldc, BLASLONG offset) ##bm=r2,bn=r3, bk=r4, alpha=f0,aplhai=f2, ba=r5,bb=r6,stack[160] ,ldc=stack[168] offset=stack[176] **********************************************************************************************/ /*Note: r0 can not be used as address disp register */ #define BM %r2 #define BM_CUR %r0 #define BN %r3 #define BN_CUR %r10 #define BK %r4 #define LDC_BYTE %r8 #define ALPHA %f0 #define ALPHA_I %f2 #define ALPHA_VECT %v0 #define ALPHA_VECT_I %v2 #define LOCAL_VAR1 %r9 #define LOCAL_VAR2 %r1 #define LOCAL_VAR3 %r11 #define A %r5 #define B %r6 #define CIJ %r7 #define CIJ_LOCAL %r12 #define OFF %r13 #define OFFSET %f8 #define ALIGN_4 .align 32 #define ALIGN_2 .align 16 #define PREFETCH_INS 1 /**************************Include kernel helper macrosses**********************************/ #include "zkernelMacrosV.S" /***********************************ZGEMM**4x4*******************************************************/ PROLOGUE #if defined(TRMMKERNEL) std OFFSET ,40(%r15) stmg %r6,%r13,48(%r15) #else stmg %r6,%r12,48(%r15) #endif std %f9, 128(%r15) std %f10,136(%r15) std %f11,144(%r15) std %f12,152(%r15) lg CIJ, 160(%r15) lg LOCAL_VAR1, 168(%r15) #if defined(TRMMKERNEL) lg OFF,176(%r15) ldgr OFFSET ,OFF #endif srlg BN_CUR,BN,2 #if defined(RR) || defined(RC) || defined(CR) || defined(CC) lcdbr ALPHA_I,ALPHA_I lcdbr ALPHA ,ALPHA #endif vrepg ALPHA_VECT,ALPHA_VECT,0 /*replicate alpha which in f0*/ sllg LDC_BYTE, LOCAL_VAR1,4 /*calculate lcd stride with complex=16 x<<4 */ vrepg ALPHA_VECT_I,ALPHA_VECT_I,0 /*replicate alpha which in f0*/ #if defined(TRMMKERNEL) && !defined(LEFT) /*off = -offset;*/ lgdr LOCAL_VAR1,OFFSET lcgr OFF,LOCAL_VAR1 #endif cijle BN_CUR,0,.LX2 ALIGN_4 .LX4_BN: #if defined(PREFETCH_INS) pfd 1, 0(A) pfd 1, 0(B) #endif #if defined(TRMMKERNEL) && defined(LEFT) /*off = offset;*/ lgdr OFF,OFFSET #endif srlg BM_CUR,BM,2 lgr LOCAL_VAR3,A lgr CIJ_LOCAL,CIJ cijle BM_CUR,0,.L2x4 ALIGN_4 .L4x4_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,4 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_ZCVEC_4x4 cijle LOCAL_VAR1,0,.L4x4_mod ALIGN_4 .L4x4_4_BK: /*BK_CUR LOOP */ #if defined(PREFETCH_INS) pfd 1, 256(LOCAL_VAR3) pfd 1, 256(LOCAL_VAR2 ) #endif ZCALC_4x4_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x4_4_BK ALIGN_4 .L4x4_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,4,4 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x4_BK_Store ALIGN_4 .L4x4_BK: /*BK_CUR LOOP */ ZCALC_4x4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x4_BK ALIGN_4 .L4x4_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ ZSTORE_4x4 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE,LOCAL_VAR1,LOCAL_VAR2 #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,4,4 #endif brctg BM_CUR,.L4x4_BM ALIGN_2 .L2x4: tmll BM,2 jz .L1x4 ALIGN_4 .L2x4_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,4 RefreshTempBk LOCAL_VAR1,BK,OFF,2,4 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_ZCVEC_2x4 cijle 
LOCAL_VAR1,0,.L2x4_mod ALIGN_4 .L2x4_4_BK: /*BK_CUR LOOP */ #if defined(PREFETCH_INS) pfd 1, 256(LOCAL_VAR2) #endif ZCALC_2x4_4 LOCAL_VAR3,LOCAL_VAR2 #if defined(PREFETCH_INS) pfd 1, 256(LOCAL_VAR3) #endif brctg LOCAL_VAR1,.L2x4_4_BK ALIGN_4 .L2x4_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,2,4 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x4_BK_Store ALIGN_4 .L2x4_BK: /*BK_CUR LOOP */ ZCALC_2x4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x4_BK ALIGN_4 .L2x4_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ ZSTORE_2x4 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE ,LOCAL_VAR1,LOCAL_VAR2 #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,2,4 #endif ALIGN_4 .L1x4: tmll BM,1 jz .Lx4_INNER_END ALIGN_4 .L1x4_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,4 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_ZCVEC_1x4 cijle LOCAL_VAR1,0,.L1x4_mod ALIGN_4 .L1x4_4_BK: /*BK_CUR LOOP */ ZCALC_1x4_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x4_4_BK ALIGN_4 .L1x4_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,1,4 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x4_BK_Store ALIGN_4 .L1x4_BK: /*BK_CUR LOOP */ ZCALC_1x4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x4_BK ALIGN_4 .L1x4_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ ZSTORE_1x4 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE,LOCAL_VAR1,LOCAL_VAR2 #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,1,4 #endif ALIGN_2 .Lx4_INNER_END: /*add LDC_BYTE_COPY to new*/ sllg LOCAL_VAR1,LDC_BYTE,2 /*multiply*4 */ #if defined(TRMMKERNEL) && !defined(LEFT) aghi OFF,4 #endif sllg LOCAL_VAR2,BK,6 /*multiply*4*sizeof(complex) =multiply*4*16* 2**6 */ la CIJ,0(CIJ,LOCAL_VAR1) /*refresh CIJ=CIJ+LDC_BYTE*4*/ la B,0(B,LOCAL_VAR2) /*refresh B=B+Bk*4*sizeof(complex) */ brctg BN_CUR,.LX4_BN /*********************************X2 SECTION************************************************/ ALIGN_4 .LX2: tmll BN,2 jz .Lx1 ALIGN_4 .Lx2_BN: #if defined(TRMMKERNEL) && defined(LEFT) /*off = offset;*/ lgdr OFF,OFFSET #endif srlg BM_CUR,BM,2 lgr LOCAL_VAR3,A lgr CIJ_LOCAL,CIJ cijle BM_CUR,0,.L2x2 ALIGN_4 .L4x2_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,2 RefreshTempBk LOCAL_VAR1,BK,OFF,4,2 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_ZCVEC_4x2 cijle LOCAL_VAR1,0,.L4x2_mod ALIGN_4 .L4x2_4_BK: /*BK_CUR LOOP */ #if defined(PREFETCH_INS) pfd 1, 256(LOCAL_VAR3) #endif ZCALC_4x2_4 LOCAL_VAR3,LOCAL_VAR2 #if defined(PREFETCH_INS) pfd 1, 256(LOCAL_VAR2 ) #endif brctg LOCAL_VAR1,.L4x2_4_BK ALIGN_4 .L4x2_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,4,2 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x2_BK_Store ALIGN_4 .L4x2_BK: /*BK_CUR LOOP */ ZCALC_4x2 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x2_BK ALIGN_4 .L4x2_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ ZSTORE_4x2 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,4,2 #endif ALIGN_4 brctg BM_CUR,.L4x2_BM ALIGN_2 .L2x2: tmll BM,2 jz .L1x2 ALIGN_4 
.L2x2_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,2 RefreshTempBk LOCAL_VAR1,BK,OFF,2,2 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_ZCVEC_2x2 cijle LOCAL_VAR1,0,.L2x2_mod ALIGN_4 .L2x2_4_BK: /*BK_CUR LOOP */ ZCALC_2x2_4 LOCAL_VAR3,LOCAL_VAR2 #if defined(PREFETCH_INS) pfd 1, 256(LOCAL_VAR3) pfd 1, 256(LOCAL_VAR2) #endif brctg LOCAL_VAR1,.L2x2_4_BK ALIGN_4 .L2x2_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,2,2 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x2_BK_Store ALIGN_4 .L2x2_BK: /*BK_CUR LOOP */ ZCALC_2x2 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x2_BK ALIGN_4 .L2x2_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ ZSTORE_2x2 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,2,2 #endif ALIGN_2 .L1x2: tmll BM,1 jz .Lx2_INNER_END ALIGN_4 .L1x2_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,2 RefreshTempBk LOCAL_VAR1,BK,OFF,1,2 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_ZCVEC_1x2 cijle LOCAL_VAR1,0,.L1x2_mod ALIGN_4 .L1x2_4_BK: /*BK_CUR LOOP */ ZCALC_1x2_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x2_4_BK ALIGN_4 .L1x2_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,1,2 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x2_BK_Store ALIGN_4 .L1x2_BK: /*BK_CUR LOOP */ ZCALC_1x2 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x2_BK ALIGN_4 .L1x2_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ ZSTORE_1x2 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,1,2 #endif ALIGN_2 .Lx2_INNER_END: /*add LDC_BYTE_COPY to new*/ la LOCAL_VAR1,0(LDC_BYTE,LDC_BYTE) /*multiply*2 */ sllg LOCAL_VAR2,BK,5 /*multiply*2*sizeof(complex) =multiply*2*16 2^5 */ la CIJ,0(CIJ,LOCAL_VAR1) /*refresh CIJ=CIJ+LDC_BYTE*2*/ #if defined(TRMMKERNEL) && !defined(LEFT) aghi OFF,2 #endif la B,0(B,LOCAL_VAR2) /*refresh B=B+Bk*2*sizeof(complex) */ /*********************************X1 SECTION************************************************/ ALIGN_2 .Lx1: tmll BN,1 jz .L_FUNC_END ALIGN_4 .Lx1_BN: #if defined(TRMMKERNEL) && defined(LEFT) /*off = offset;*/ lgdr OFF,OFFSET #endif srlg BM_CUR,BM,2 lgr LOCAL_VAR3,A lgr CIJ_LOCAL,CIJ cijle BM_CUR,0,.L2x1 ALIGN_4 .L4x1_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,1 RefreshTempBk LOCAL_VAR1,BK,OFF,4,1 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_ZCVEC_4x1 cijle LOCAL_VAR1,0,.L4x1_mod ALIGN_4 .L4x1_4_BK: /*BK_CUR LOOP */ ZCALC_4x1_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x1_4_BK ALIGN_4 .L4x1_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,4,1 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x1_BK_Store ALIGN_4 .L4x1_BK: /*BK_CUR LOOP */ ZCALC_4x1 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L4x1_BK ALIGN_4 .L4x1_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ ZSTORE_4x1 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF 
LOCAL_VAR1,BK,OFF,LOCAL_VAR3,4,1 #endif ALIGN_4 brctg BM_CUR , .L4x1_BM ALIGN_2 .L2x1: tmll BM,2 jz .L1x1 ALIGN_4 .L2x1_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,1 RefreshTempBk LOCAL_VAR1,BK,OFF,2,1 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_ZCVEC_2x1 cijle LOCAL_VAR1,0,.L2x1_mod ALIGN_4 .L2x1_4_BK: /*BK_CUR LOOP */ ZCALC_2x1_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x1_4_BK ALIGN_4 .L2x1_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,2,1 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x1_BK_Store ALIGN_4 .L2x1_BK: /*BK_CUR LOOP */ ZCALC_2x1 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L2x1_BK ALIGN_4 .L2x1_BK_Store: /*store C and use LDC_BYTE AND CIJ_COPY for mem storing*/ ZSTORE_2x1 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,2,1 #endif ALIGN_2 .L1x1: tmll BM, 1 jz .Lx1_INNER_END ALIGN_4 .L1x1_BM: /*BM start*/ #if defined(TRMMKERNEL) /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */ RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,1 RefreshTempBk LOCAL_VAR1,BK,OFF,1,1 srl LOCAL_VAR1,2 #else srlg LOCAL_VAR1,BK,2 /*refresh BK*/ lgr LOCAL_VAR2,B /*refresh BPOINT*/ #endif ZERO_ZCVEC_1x1 cijle LOCAL_VAR1,0,.L1x1_mod ALIGN_4 .L1x1_4_BK: /*BK_CUR LOOP */ ZCALC_1x1_4 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x1_4_BK ALIGN_4 .L1x1_mod: #if defined(TRMMKERNEL) RefreshTempBk LOCAL_VAR1,BK,OFF,1,1 nill LOCAL_VAR1,3 #else la LOCAL_VAR1,3(0,0) NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x1_BK_Store ALIGN_4 .L1x1_BK: /*BK_CUR LOOP */ ZCALC_1x1 LOCAL_VAR3,LOCAL_VAR2 brctg LOCAL_VAR1,.L1x1_BK ALIGN_4 .L1x1_BK_Store: /*store C and use CIJ_COPY for mem storing*/ ZSTORE_1x1 ALPHA,ALPHA_I ,CIJ_LOCAL #if defined(TRMMKERNEL) RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,1,1 #endif ALIGN_2 .Lx1_INNER_END: /*add LDC_BYTE_COPY to new*/ sllg LOCAL_VAR2,BK,4 /*multiply*1*sizeof(complex) =multiply*1*16* 2^4 */ la CIJ,0(CIJ,LDC_BYTE) /*refresh CIJ=CIJ+LDC_BYTE */ #if defined(TRMMKERNEL) && !defined(LEFT) aghi OFF,1 #endif la B,0(B,LOCAL_VAR2) /*refresh B=B+Bk*1*sizeof(complex) */ ALIGN_2 .L_FUNC_END: /*end*/ #if defined(TRMMKERNEL) ld OFFSET,40(%r15) lmg %r6,%r13,48(%r15) #else lmg %r6,%r12,48(%r15) #endif ld %f9, 128(%r15) ld %f10,136(%r15) ld %f11,144(%r15) ld %f12,152(%r15) br %r14 .end OpenBLAS-0.2.20/l1param.h000066400000000000000000000040601313527062700146520ustar00rootroot00000000000000#if defined(CORE2) || defined(PENRYN) #define ALIGNED_ACCESS #endif #ifdef NEHALEM #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (128 * 12) #define ALIGNED_ACCESS #endif #ifdef SANDYBRIDGE #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (128 * 12) #define ALIGNED_ACCESS #endif #ifdef ATHLON #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (128 * 10) #define ALIGNED_ACCESS #define movsd movlps #endif #ifdef PENTIUM3 #define PREFETCH prefetcht0 #define PREFETCHSIZE (128 * 10) #define ALIGNED_ACCESS #define movsd movlps #endif #ifdef PENTIUM4 #define PREFETCH prefetcht0 #define PREFETCHSIZE (128 * 10) #define FETCH128 #define ALIGNED_ACCESS #define xorps pxor #define xorpd pxor #endif #ifdef ATOM #define ALIGNED_ACCESS #define PREFETCH prefetcht0 #define PREFETCHSIZE ( 64 * 12 + 32) #endif #ifdef OPTERON #define PREFETCH prefetch #define 
PREFETCHW prefetchw #define PREFETCHSIZE (128 * 3) #define movsd movlps #endif #ifdef BARCELONA #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (128 * 5) #define ALIGNED_ACCESS #endif #ifdef SHANGHAI #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (128 * 5) #define ALIGNED_ACCESS #endif #ifdef BOBCAT #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (128 * 5) #define ALIGNED_ACCESS #endif #ifdef BULLDOZER #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (128 * 5) #define ALIGNED_ACCESS #endif #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (128 * 4) #define ALIGNED_ACCESS #endif #define PREOFFSET 128 #ifdef HAVE_SSE2 #define PSHUFD1(A, B) pshufd A, B, B #define PSHUFD2(A, B, C) pshufd A, B, C #else #define PSHUFD1(A, B) shufps A, B, B #define PSHUFD2(A, B, C) movaps B, C; shufps A, C, C #endif #define MOVDDUP1(OFFSET, BASE, REGS) movddup OFFSET(BASE), REGS #define MOVAPS(OFFSET, BASE, REGS) movlps REGS, OFFSET(BASE); movhps REGS, OFFSET + SIZE(BASE) OpenBLAS-0.2.20/l2param.h000066400000000000000000000077301313527062700146620ustar00rootroot00000000000000#ifndef GEMV_PARAM_H #define GEMV_PARAM_H #ifdef movsd #undef movsd #endif #undef movapd #define movapd movaps #ifdef ATHLON #define ALIGNED_ACCESS #define MOVUPS_A movaps #define MOVUPS_XL movaps #define MOVUPS_XS movaps #define MOVUPS_YL movaps #define MOVUPS_YS movaps #define PREFETCH prefetcht0 #define PREFETCHSIZE 64 * 3 #endif #ifdef PENTIUM4 #define ALIGNED_ACCESS #define MOVUPS_A movaps #define MOVUPS_XL movaps #define MOVUPS_XS movaps #define MOVUPS_YL movaps #define MOVUPS_YS movaps #define PREFETCH prefetcht0 #define PREFETCHSIZE 64 * 2 #endif #ifdef CORE2 #define ALIGNED_ACCESS #define MOVUPS_A movaps #define MOVUPS_XL movaps #define MOVUPS_XS movaps #define MOVUPS_YL movaps #define MOVUPS_YS movaps #define PREFETCH prefetcht0 #define PREFETCHSIZE 64 * 4 #endif #ifdef PENRYN #define ALIGNED_ACCESS #define MOVUPS_A movaps #define MOVUPS_XL movaps #define MOVUPS_XS movaps #define MOVUPS_YL movaps #define MOVUPS_YS movaps #define PREFETCH prefetcht0 #define PREFETCHSIZE 64 * 4 #endif #ifdef NEHALEM #define MOVUPS_A movups #define MOVUPS_XL movups #define MOVUPS_XS movups #define MOVUPS_YL movups #define MOVUPS_YS movups #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE 64 * 3 #endif #ifdef SANDYBRIDGE #define MOVUPS_A movups #define MOVUPS_XL movups #define MOVUPS_XS movups #define MOVUPS_YL movups #define MOVUPS_YS movups #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE 64 * 3 #endif #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw #ifndef COMPLEX #define PREFETCHSIZE 64 * 1 #else #define PREFETCHSIZE 64 * 1 #endif #define movsd movlps #endif #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION) #define ALIGNED_ACCESS #define MOVUPS_A movaps #define MOVUPS_XL movaps #define MOVUPS_XS movaps #define MOVUPS_YL movaps #define MOVUPS_YS movaps #define PREFETCH prefetch #define PREFETCHW prefetchw #ifndef COMPLEX #define PREFETCHSIZE 64 * 2 #else #define PREFETCHSIZE 64 * 4 #endif #endif #ifdef NANO #define ALIGNED_ACCESS #define MOVUPS_A movaps #define MOVUPS_XL movaps #define MOVUPS_XS movaps #define MOVUPS_YL movaps #define MOVUPS_YS movaps #define PREFETCH prefetcht0 #ifndef COMPLEX #define PREFETCHSIZE 64 * 1 #else #define PREFETCHSIZE 64 * 2 #endif #endif #ifndef 
PREOFFSET #ifdef L1_DATA_LINESIZE #define PREOFFSET (L1_DATA_LINESIZE >> 1) #else #define PREOFFSET 32 #endif #endif #ifndef GEMV_UNROLL #define GEMV_UNROLL 4 #endif #ifndef ZGEMV_UNROLL #define ZGEMV_UNROLL 4 #endif /* #define COPY_FORCE */ /* Always copy X or Y to the buffer */ /* #define NOCOPY_UNALIGNED */ /* Not copy if X or Y is not aligned */ #ifdef MOVUPS_A #define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS #else #define MOVUPS_A1(OFF, ADDR, REGS) movsd OFF(ADDR), REGS; movhps OFF + 8(ADDR), REGS #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) movsd OFF(ADDR, BASE, SCALE), REGS; movhps OFF + 8(ADDR, BASE, SCALE), REGS #endif #define MOVRPS_A1(OFF, ADDR, REGS) movsd OFF + 8(ADDR), REGS; movhps OFF(ADDR), REGS #define MOVRPS_A2(OFF, ADDR, BASE, SCALE, REGS) movsd OFF + 8(ADDR, BASE, SCALE), REGS; movhps OFF(ADDR, BASE, SCALE), REGS #ifdef MOVUPS_XL #define MOVUPS_XL1(OFF, ADDR, REGS) MOVUPS_XL OFF(ADDR), REGS #else #define MOVUPS_XL1(OFF, ADDR, REGS) movsd OFF(ADDR), REGS; movhps OFF + 8(ADDR), REGS #endif #ifdef MOVUPS_XS #define MOVUPS_XS1(OFF, ADDR, REGS) MOVUPS_XS REGS, OFF(ADDR) #else #define MOVUPS_XS1(OFF, ADDR, REGS) movsd REGS, OFF(ADDR); movhps REGS, OFF + 8(ADDR) #endif #ifdef MOVUPS_YL #define MOVUPS_YL1(OFF, ADDR, REGS) MOVUPS_YL OFF(ADDR), REGS #else #define MOVUPS_YL1(OFF, ADDR, REGS) movsd OFF(ADDR), REGS; movhps OFF + 8(ADDR), REGS #endif #ifdef MOVUPS_YS #define MOVUPS_YS1(OFF, ADDR, REGS) MOVUPS_YS REGS, OFF(ADDR) #else #define MOVUPS_YS1(OFF, ADDR, REGS) movsd REGS, OFF(ADDR); movhps REGS, OFF + 8(ADDR) #endif #endif OpenBLAS-0.2.20/lapack/000077500000000000000000000000001313527062700143775ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/CMakeLists.txt000066400000000000000000000052121313527062700171370ustar00rootroot00000000000000 include_directories(${PROJECT_SOURCE_DIR}) set(LAPACK_SOURCES getrf/getrf_single.c potrf/potrf_U_single.c potrf/potrf_L_single.c lauum/lauum_U_single.c lauum/lauum_L_single.c ) # add a 'z' to filename for complex version set(LAPACK_MANGLED_SOURCES getf2/getf2_k.c lauu2/lauu2_U.c lauu2/lauu2_L.c potf2/potf2_U.c potf2/potf2_L.c ) # sources that need TRANS set # this has a 'z' version set(TRANS_SOURCES getrs/getrs_single.c ) # sources that need UNIT set # these do NOT have a z version set(UNIT_SOURCES trtri/trtri_U_single.c trtri/trtri_L_single.c ) # these have a 'z' version set(UNIT_SOURCES2 trti2/trti2_U.c trti2/trti2_L.c ) GenerateNamedObjects("${LAPACK_SOURCES}") GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" false "" "" false 3) # TODO: laswp needs arch specific code GenerateNamedObjects("laswp/generic/laswp_k.c" "" "laswp_plus" false "" "" false 3) GenerateNamedObjects("laswp/generic/laswp_k.c" "MINUS" "laswp_minus" false "" "" false 3) if (SMP) if (USE_OPENMP) set(GETRF_SRC getrf/getrf_parallel_omp.c) else () set(GETRF_SRC getrf/getrf_parallel.c) endif () # these do not have 'z' versions set(PARALLEL_SOURCES ${GETRF_SRC} lauum/lauum_U_parallel.c lauum/lauum_L_parallel.c potrf/potrf_U_parallel.c potrf/potrf_L_parallel.c ) # this has a z version list(APPEND TRANS_SOURCES getrs/getrs_parallel.c ) # these do NOT have a z version list(APPEND UNIT_SOURCES trtri/trtri_U_parallel.c trtri/trtri_L_parallel.c ) GenerateNamedObjects("${PARALLEL_SOURCES}") endif () foreach (float_type ${FLOAT_TYPES}) if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") foreach (trans_src ${TRANS_SOURCES}) string(REGEX MATCH 
"[a-z]/([a-z]+_)([a-z]+)" op_name ${trans_src}) string(REPLACE "/" "/z" ztrans_src ${trans_src}) GenerateNamedObjects("${ztrans_src}" "TRANS=1" "${CMAKE_MATCH_1}N_${CMAKE_MATCH_2}" false "" "" false ${float_type}) GenerateNamedObjects("${ztrans_src}" "TRANS=2" "${CMAKE_MATCH_1}T_${CMAKE_MATCH_2}" false "" "" false ${float_type}) GenerateNamedObjects("${ztrans_src}" "TRANS=3" "${CMAKE_MATCH_1}R_${CMAKE_MATCH_2}" false "" "" false ${float_type}) GenerateNamedObjects("${ztrans_src}" "TRANS=4" "${CMAKE_MATCH_1}C_${CMAKE_MATCH_2}" false "" "" false ${float_type}) endforeach () else () GenerateCombinationObjects("${TRANS_SOURCES}" "TRANS" "N" "" 4 "" false ${float_type}) endif () endforeach () GenerateCombinationObjects("${UNIT_SOURCES}" "UNIT" "N" "" 4) GenerateCombinationObjects("${UNIT_SOURCES2}" "UNIT" "N" "" 0 "" "" 3) add_library(lapack OBJECT ${OPENBLAS_SRC}) OpenBLAS-0.2.20/lapack/Makefile000066400000000000000000000013221313527062700160350ustar00rootroot00000000000000TOPDIR = .. include ../Makefile.system #SUBDIRS = laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 trtri FLAMEDIRS = laswp getf2 potf2 lauu2 trti2 libs: @for d in $(SUBDIRS) ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ fi; \ done prof: @for d in $(SUBDIRS) ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ (cd $$d; $(MAKE) prof) ; \ fi; \ done flame: @for d in $(FLAMEDIRS) ; \ do if test -d $$d; then \ $(MAKE) -C $$d libs || exit 1 ; \ fi; \ done hpl: hpl_p: clean :: @for d in $(SUBDIRS) tpp ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ fi; \ done OpenBLAS-0.2.20/lapack/getf2/000077500000000000000000000000001313527062700154065ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/getf2/Makefile000066400000000000000000000023731313527062700170530ustar00rootroot00000000000000TOPDIR = ../.. include ../../Makefile.system SBLASOBJS = sgetf2_k.$(SUFFIX) DBLASOBJS = dgetf2_k.$(SUFFIX) QBLASOBJS = qgetf2_k.$(SUFFIX) CBLASOBJS = cgetf2_k.$(SUFFIX) ZBLASOBJS = zgetf2_k.$(SUFFIX) XBLASOBJS = xgetf2_k.$(SUFFIX) sgetf2_k.$(SUFFIX) : getf2_k.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) dgetf2_k.$(SUFFIX) : getf2_k.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) qgetf2_k.$(SUFFIX) : getf2_k.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) cgetf2_k.$(SUFFIX) : zgetf2_k.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) zgetf2_k.$(SUFFIX) : zgetf2_k.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) xgetf2_k.$(SUFFIX) : zgetf2_k.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) sgetf2_k.$(PSUFFIX) : getf2_k.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) dgetf2_k.$(PSUFFIX) : getf2_k.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) qgetf2_k.$(PSUFFIX) : getf2_k.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) cgetf2_k.$(PSUFFIX) : zgetf2_k.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) zgetf2_k.$(PSUFFIX) : zgetf2_k.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) xgetf2_k.$(PSUFFIX) : zgetf2_k.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) include ../../Makefile.tail OpenBLAS-0.2.20/lapack/getf2/getf2_k.c000066400000000000000000000102171313527062700170740ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" static FLOAT dp1 = 1.; static FLOAT dm1 = -1.; blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG m, n, lda; blasint *ipiv, offset; FLOAT *a; FLOAT temp1, temp2; blasint i, j; blasint ip, jp; blasint info; BLASLONG len; FLOAT *b; m = args -> m; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; ipiv = (blasint *)args -> c; offset = 0; if (range_n) { m -= range_n[0]; n = range_n[1] - range_n[0]; offset = range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } info = 0; b = a; for (j = 0; j < n; j++) { len = MIN(j, m); for (i = 0; i < len; i++) { ip = ipiv[i + offset] - 1 - offset; if (ip != i) { temp1 = *(b + i); temp2 = *(b + ip); *(b + i) = temp2; *(b + ip) = temp1; } } for (i = 1; i < len; i++) { b[i] -= DOTU_K(i, a + i, lda, b, 1); } if (j < m) { GEMV_N(m - j, j, 0, dm1, a + j, lda, b, 1, b + j, 1, sb); jp = j + IAMAX_K(m - j, b + j, 1); if (jp>m) jp = m; //avoid out of boundary ipiv[j + offset] = jp + offset; jp--; temp1 = *(b + jp); if (temp1 != ZERO) { temp1 = dp1 / temp1; if (jp != j) { SWAP_K(j + 1, 0, 0, ZERO, a + j, lda, a + jp, lda, NULL, 0); } if (j + 1 < m) { SCAL_K(m - j - 1, 0, 0, temp1, b + j + 1, 1, NULL, 0, NULL, 0); } } else { if (!info) info = j + 1; } } b += lda; } return info; } OpenBLAS-0.2.20/lapack/getf2/zgetf2_k.c000066400000000000000000000112231313527062700172640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" double fabs(double); static FLOAT dp1 = 1.; static FLOAT dm1 = -1.; blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG m, n, lda, offset; blasint *ipiv; FLOAT *a; FLOAT temp1, temp2, temp3, temp4, ratio, den; blasint i, j; blasint ip, jp; blasint info; BLASLONG len; FLOAT *b; m = args -> m; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; ipiv = (blasint *)args -> c; offset = 0; if (range_n) { m -= range_n[0]; n = range_n[1] - range_n[0]; offset = range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } info = 0; b = a; for (j = 0; j < n; j++) { len = MIN(j, m); for (i = 0; i < len; i++) { ip = ipiv[i + offset] - 1 - offset; if (ip != i) { temp1 = *(b + i * 2 + 0); temp2 = *(b + i * 2 + 1); temp3 = *(b + ip * 2 + 0); temp4 = *(b + ip * 2 + 1); *(b + i * 2 + 0) = temp3; *(b + i * 2 + 1) = temp4; *(b + ip * 2 + 0) = temp1; *(b + ip * 2 + 1) = temp2; } } ZTRSV_NLU(len, a, lda, b, 1, sb); if (j < m) { GEMV_N(m - j, j, 0, dm1, ZERO, a + j * 2, lda, b, 1, b + j * 2, 1, sb); jp = j + IAMAX_K(m - j, b + j * 2, 1); if (jp>m) jp = m; //avoid out of boundary ipiv[j + offset] = jp + offset; jp--; temp1 = *(b + jp * 2 + 0); temp2 = *(b + jp * 2 + 1); if ((temp1 != ZERO) || (temp2 != ZERO)) { if (jp != j) { SWAP_K(j + 1, 0, 0, ZERO, ZERO, a + j * 2, lda, a + jp * 2, lda, NULL, 0); } if (fabs(temp1) >= fabs(temp2)){ ratio = temp2 / temp1; den = dp1 /(temp1 * ( 1 + ratio * ratio)); temp3 = den; temp4 = -ratio * den; } else { ratio = temp1 / temp2; den = dp1 /(temp2 * ( 1 + ratio * ratio)); temp3 = ratio * den; temp4 = -den; } if (j + 1 < m) { SCAL_K(m - j - 1, 0, 0, temp3, temp4, b + (j + 1) * 2, 1, NULL, 0, NULL, 0); } } else { if (!info) info = j + 1; } } b += lda * 2; } return info; } 
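[Editor's note — illustrative sketch, not part of the archive.] The pivot-scaling branch near the end of zgetf2_k.c above computes the reciprocal of the complex pivot 1/(temp1 + i*temp2) by dividing through by the larger of the two components first, so that the squared magnitude never overflows or underflows before the division. A minimal standalone restatement of that branch is sketched below; the helper name `crecip` and the use of plain `double` instead of the build-dependent `FLOAT` are assumptions for illustration only.

```c
#include <math.h>

/* Hypothetical standalone helper mirroring the pivot-reciprocal branch in
 * zgetf2_k.c: compute 1/(re + i*im) by scaling with the larger component,
 * avoiding overflow/underflow in re*re + im*im. */
static void crecip(double re, double im, double *out_re, double *out_im)
{
    double ratio, den;

    if (fabs(re) >= fabs(im)) {
        ratio = im / re;                      /* |ratio| <= 1 */
        den   = 1.0 / (re * (1.0 + ratio * ratio));
        *out_re =  den;                       /* real part of 1/(re + i*im) */
        *out_im = -ratio * den;               /* imaginary part */
    } else {
        ratio = re / im;
        den   = 1.0 / (im * (1.0 + ratio * ratio));
        *out_re =  ratio * den;
        *out_im = -den;
    }
}
```

In the kernel itself the resulting pair (temp3, temp4) is then fed to SCAL_K to scale the sub-diagonal entries of the pivot column, which is exactly the multiplication of the column by this reciprocal.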
OpenBLAS-0.2.20/lapack/getrf/000077500000000000000000000000001313527062700155065ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/getrf/Makefile000066400000000000000000000061101313527062700171440ustar00rootroot00000000000000TOPDIR = ../.. include ../../Makefile.system SBLASOBJS = sgetrf_single.$(SUFFIX) DBLASOBJS = dgetrf_single.$(SUFFIX) QBLASOBJS = qgetrf_single.$(SUFFIX) CBLASOBJS = cgetrf_single.$(SUFFIX) ZBLASOBJS = zgetrf_single.$(SUFFIX) XBLASOBJS = xgetrf_single.$(SUFFIX) ifdef SMP SBLASOBJS += sgetrf_parallel.$(SUFFIX) DBLASOBJS += dgetrf_parallel.$(SUFFIX) QBLASOBJS += qgetrf_parallel.$(SUFFIX) CBLASOBJS += cgetrf_parallel.$(SUFFIX) ZBLASOBJS += zgetrf_parallel.$(SUFFIX) XBLASOBJS += xgetrf_parallel.$(SUFFIX) endif ifeq ($(USE_OPENMP), 1) GETRF_SRC = getrf_parallel_omp.c else GETRF_SRC = getrf_parallel.c endif sgetrf_single.$(SUFFIX) : getrf_single.c $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) dgetrf_single.$(SUFFIX) : getrf_single.c $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) qgetrf_single.$(SUFFIX) : getrf_single.c $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) cgetrf_single.$(SUFFIX) : getrf_single.c $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) zgetrf_single.$(SUFFIX) : getrf_single.c $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) xgetrf_single.$(SUFFIX) : getrf_single.c $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) sgetrf_parallel.$(SUFFIX) : $(GETRF_SRC) ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) dgetrf_parallel.$(SUFFIX) : $(GETRF_SRC) ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) qgetrf_parallel.$(SUFFIX) : $(GETRF_SRC) ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) cgetrf_parallel.$(SUFFIX) : $(GETRF_SRC) ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) zgetrf_parallel.$(SUFFIX) : $(GETRF_SRC) ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) xgetrf_parallel.$(SUFFIX) : $(GETRF_SRC) ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) sgetrf_single.$(PSUFFIX) : getrf_single.c $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) dgetrf_single.$(PSUFFIX) : getrf_single.c $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) qgetrf_single.$(PSUFFIX) : getrf_single.c $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) cgetrf_single.$(PSUFFIX) : getrf_single.c $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) zgetrf_single.$(PSUFFIX) : getrf_single.c $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) xgetrf_single.$(PSUFFIX) : getrf_single.c $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) sgetrf_parallel.$(PSUFFIX) : $(GETRF_SRC) $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) dgetrf_parallel.$(PSUFFIX) : $(GETRF_SRC) $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) qgetrf_parallel.$(PSUFFIX) : $(GETRF_SRC) $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) cgetrf_parallel.$(PSUFFIX) : $(GETRF_SRC) $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) zgetrf_parallel.$(PSUFFIX) : $(GETRF_SRC) $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) xgetrf_parallel.$(PSUFFIX) : $(GETRF_SRC) $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) include ../../Makefile.tail OpenBLAS-0.2.20/lapack/getrf/getrf_parallel.c000066400000000000000000000557161313527062700206530ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University 
of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" static FLOAT dm1 = -1.; double sqrt(double); //In this case, the recursive getrf_parallel may overflow the stack. //Instead, use malloc to alloc job_t. #if MAX_CPU_NUMBER > GETRF_MEM_ALLOC_THRESHOLD #define USE_ALLOC_HEAP #endif #ifndef CACHE_LINE_SIZE #define CACHE_LINE_SIZE 8 #endif #ifndef DIVIDE_RATE #define DIVIDE_RATE 2 #endif #define GEMM_PQ MAX(GEMM_P, GEMM_Q) #define REAL_GEMM_R (GEMM_R - GEMM_PQ) #ifndef GETRF_FACTOR #define GETRF_FACTOR 0.75 #endif #undef GETRF_FACTOR #define GETRF_FACTOR 1.00 static __inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) { double m = (double)(M - IS - BK); double n = (double)(N - IS - BK); double b = (double)BK; double a = (double)T; return (BLASLONG)((n + GETRF_FACTOR * m * b * (1. - a) / (b + m)) / a); } #define FORMULA2(M, N, IS, BK, T) (BLASLONG)((double)(N - IS + BK) * (1. - sqrt(1. - 1. 
/ (double)(T)))) static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ BLASLONG is, min_i; BLASLONG js, min_j; BLASLONG jjs, min_jj; BLASLONG m = args -> m; BLASLONG n = args -> n; BLASLONG k = args -> k; BLASLONG lda = args -> lda; BLASLONG off = args -> ldb; FLOAT *b = (FLOAT *)args -> b + (k ) * COMPSIZE; FLOAT *c = (FLOAT *)args -> b + ( k * lda) * COMPSIZE; FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE; FLOAT *sbb = sb; volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; blasint *ipiv = (blasint *)args -> c; if (range_n) { n = range_n[1] - range_n[0]; c += range_n[0] * lda * COMPSIZE; d += range_n[0] * lda * COMPSIZE; } if (args -> a == NULL) { TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); } else { sb = (FLOAT *)args -> a; } for (js = 0; js < n; js += REAL_GEMM_R) { min_j = n - js; if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; for (jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_N){ min_jj = js + min_j - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (0 && GEMM_UNROLL_N <= 8) { LASWP_NCOPY(min_jj, off + 1, off + k, c + (- off + jjs * lda) * COMPSIZE, lda, ipiv, sbb + k * (jjs - js) * COMPSIZE); } else { LASWP_PLUS(min_jj, off + 1, off + k, ZERO, #ifdef COMPLEX ZERO, #endif c + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1); GEMM_ONCOPY (k, min_jj, c + jjs * lda * COMPSIZE, lda, sbb + (jjs - js) * k * COMPSIZE); } for (is = 0; is < k; is += GEMM_P) { min_i = k - is; if (min_i > GEMM_P) min_i = GEMM_P; TRSM_KERNEL_LT(min_i, min_jj, k, dm1, #ifdef COMPLEX ZERO, #endif sb + k * is * COMPSIZE, sbb + (jjs - js) * k * COMPSIZE, c + (is + jjs * lda) * COMPSIZE, lda, is); } } if ((js + REAL_GEMM_R >= n) && (mypos >= 0)) flag[mypos * CACHE_LINE_SIZE] = 0; for (is = 0; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY (k, min_i, b + is * COMPSIZE, lda, sa); GEMM_KERNEL_N(min_i, min_j, k, dm1, #ifdef COMPLEX ZERO, #endif sa, sbb, d + (is + js * lda) * COMPSIZE, lda); } } } /* Non blocking implementation */ typedef struct { volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #ifndef COMPLEX #define KERNEL_OPERATION(M, N, K, SA, SB, C, LDC, X, Y) \ GEMM_KERNEL_N(M, N, K, dm1, SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) #else #define KERNEL_OPERATION(M, N, K, SA, SB, C, LDC, X, Y) \ GEMM_KERNEL_N(M, N, K, dm1, ZERO, SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) #endif static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ job_t *job = (job_t *)args -> common; BLASLONG xxx, bufferside; FLOAT *buffer[DIVIDE_RATE]; BLASLONG jjs, min_jj, div_n; BLASLONG i, current; BLASLONG is, min_i; BLASLONG m, n_from, n_to; BLASLONG k = args -> k; BLASLONG lda = args -> lda; BLASLONG off = args -> ldb; FLOAT *a = (FLOAT *)args -> b + (k ) * COMPSIZE; FLOAT *b = (FLOAT *)args -> b + ( k * lda) * COMPSIZE; FLOAT *c = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE; FLOAT *sbb= sb; blasint *ipiv = (blasint *)args -> c; volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; if (args -> a 
== NULL) { TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); } else { sb = (FLOAT *)args -> a; } m = range_m[1] - range_m[0]; n_from = range_n[mypos + 0]; n_to = range_n[mypos + 1]; a += range_m[0] * COMPSIZE; c += range_m[0] * COMPSIZE; div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; buffer[0] = sbb; for (i = 1; i < DIVIDE_RATE; i++) { buffer[i] = buffer[i - 1] + GEMM_Q * (((div_n + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N) * COMPSIZE; } for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { for (i = 0; i < args -> nthreads; i++) while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {}; for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (0 && GEMM_UNROLL_N <= 8) { printf("helllo\n"); LASWP_NCOPY(min_jj, off + 1, off + k, b + (- off + jjs * lda) * COMPSIZE, lda, ipiv, buffer[bufferside] + (jjs - xxx) * k * COMPSIZE); } else { LASWP_PLUS(min_jj, off + 1, off + k, ZERO, #ifdef COMPLEX ZERO, #endif b + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1); GEMM_ONCOPY (k, min_jj, b + jjs * lda * COMPSIZE, lda, buffer[bufferside] + (jjs - xxx) * k * COMPSIZE); } for (is = 0; is < k; is += GEMM_P) { min_i = k - is; if (min_i > GEMM_P) min_i = GEMM_P; TRSM_KERNEL_LT(min_i, min_jj, k, dm1, #ifdef COMPLEX ZERO, #endif sb + k * is * COMPSIZE, buffer[bufferside] + (jjs - xxx) * k * COMPSIZE, b + (is + jjs * lda) * COMPSIZE, lda, is); } } MB; for (i = 0; i < args -> nthreads; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; } flag[mypos * CACHE_LINE_SIZE] = 0; if (m == 0) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0; } } for(is = 0; is < m; is += min_i){ min_i = m - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = (((min_i + 1) / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; } ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa); current = mypos; do { div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { if ((current != mypos) && (!is)) { while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {}; } KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, lda, is, xxx); MB; if (is + min_i >= m) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; } } current ++; if (current >= args -> nthreads) current = 0; } while (current != mypos); } for (i = 0; i < args -> nthreads; i++) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {}; } } return 0; } #if 1 blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG m, n, mn, lda, offset; BLASLONG init_bk, next_bk, range_n_mine[2], range_n_new[2]; blasint *ipiv, iinfo, info; int mode; blas_arg_t newarg; FLOAT *a, *sbb; FLOAT dummyalpha[2] = {ZERO, ZERO}; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_M[MAX_CPU_NUMBER + 1]; BLASLONG range_N[MAX_CPU_NUMBER + 1]; #ifndef USE_ALLOC_HEAP job_t job[MAX_CPU_NUMBER]; #else job_t * job=NULL; #endif BLASLONG width, nn, mm; BLASLONG i, j, k, is, bk; BLASLONG num_cpu; #ifdef _MSC_VER BLASLONG 
flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE]; #else volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); #endif #ifndef COMPLEX #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif m = args -> m; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; ipiv = (blasint *)args -> c; offset = 0; if (range_n) { m -= range_n[0]; n = range_n[1] - range_n[0]; offset = range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } if (m <= 0 || n <= 0) return 0; newarg.c = ipiv; newarg.lda = lda; info = 0; mn = MIN(m, n); init_bk = ((mn / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; if (init_bk > GEMM_Q) init_bk = GEMM_Q; if (init_bk <= GEMM_UNROLL_N) { info = GETF2(args, NULL, range_n, sa, sb, 0); return info; } next_bk = init_bk; bk = mn; if (bk > next_bk) bk = next_bk; range_n_new[0] = offset; range_n_new[1] = offset + bk; iinfo = CNAME(args, NULL, range_n_new, sa, sb, 0); if (iinfo && !info) info = iinfo; #ifdef USE_ALLOC_HEAP job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); if(job==NULL){ fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); exit(1); } #endif newarg.common = (void *)job; TRSM_ILTCOPY(bk, bk, a, lda, 0, sb); sbb = (FLOAT *)((((BLASULONG)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); is = 0; num_cpu = 0; while (is < mn) { width = ((FORMULA1(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; if (width > mn - is - bk) width = mn - is - bk; if (width < bk) { next_bk = ((FORMULA2(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N)/GEMM_UNROLL_N) * GEMM_UNROLL_N; if (next_bk > bk) next_bk = bk; width = next_bk; if (width > mn - is - bk) width = mn - is - bk; } if (num_cpu > 0) exec_blas_async_wait(num_cpu, &queue[0]); mm = m - bk - is; nn = n - bk - is; newarg.a = sb; newarg.b = a + (is + is * lda) * COMPSIZE; newarg.d = (void *)flag; newarg.m = mm; newarg.n = nn; newarg.k = bk; newarg.ldb = is + offset; nn -= width; range_n_mine[0] = 0; range_n_mine[1] = width; range_N[0] = width; range_M[0] = 0; num_cpu = 0; while (nn > 0){ if (mm >= nn) { width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); if (nn < width) width = nn; nn -= width; range_N[num_cpu + 1] = range_N[num_cpu] + width; width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); if (mm < width) width = mm; if (nn <= 0) width = mm; mm -= width; range_M[num_cpu + 1] = range_M[num_cpu] + width; } else { width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); if (mm < width) width = mm; mm -= width; range_M[num_cpu + 1] = range_M[num_cpu] + width; width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); if (nn < width) width = nn; if (mm <= 0) width = nn; nn -= width; range_N[num_cpu + 1] = range_N[num_cpu] + width; } queue[num_cpu].mode = mode; queue[num_cpu].routine = inner_advanced_thread; queue[num_cpu].args = &newarg; queue[num_cpu].range_m = &range_M[num_cpu]; queue[num_cpu].range_n = &range_N[0]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; flag[num_cpu * CACHE_LINE_SIZE] = 1; num_cpu ++; } newarg.nthreads = num_cpu; if (num_cpu > 0) { for (j = 0; j < num_cpu; j++) { for (i = 
0; i < num_cpu; i++) { for (k = 0; k < DIVIDE_RATE; k++) { job[j].working[i][CACHE_LINE_SIZE * k] = 0; } } } } is += bk; bk = mn - is; if (bk > next_bk) bk = next_bk; range_n_new[0] = offset + is; range_n_new[1] = offset + is + bk; if (num_cpu > 0) { queue[num_cpu - 1].next = NULL; exec_blas_async(0, &queue[0]); inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1); iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); if (iinfo && !info) info = iinfo + is; for (i = 0; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {}; TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb); } else { inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1); iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); if (iinfo && !info) info = iinfo + is; } } next_bk = init_bk; is = 0; while (is < mn) { bk = mn - is; if (bk > next_bk) bk = next_bk; width = ((FORMULA1(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; if (width > mn - is - bk) width = mn - is - bk; if (width < bk) { next_bk = ((FORMULA2(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N)/GEMM_UNROLL_N) * GEMM_UNROLL_N; if (next_bk > bk) next_bk = bk; } blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1, (void *)LASWP_PLUS, args -> nthreads); is += bk; } #ifdef USE_ALLOC_HEAP free(job); #endif return info; } #else blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG m, n, mn, lda, offset; BLASLONG i, is, bk, init_bk, next_bk, range_n_new[2]; blasint *ipiv, iinfo, info; int mode; blas_arg_t newarg; FLOAT *a, *sbb; FLOAT dummyalpha[2] = {ZERO, ZERO}; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range[MAX_CPU_NUMBER + 1]; BLASLONG width, nn, num_cpu; volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); #ifndef COMPLEX #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif m = args -> m; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; ipiv = (blasint *)args -> c; offset = 0; if (range_n) { m -= range_n[0]; n = range_n[1] - range_n[0]; offset = range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } if (m <= 0 || n <= 0) return 0; newarg.c = ipiv; newarg.lda = lda; newarg.common = NULL; newarg.nthreads = args -> nthreads; mn = MIN(m, n); init_bk = ((mn / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; if (init_bk > GEMM_Q) init_bk = GEMM_Q; if (init_bk <= GEMM_UNROLL_N) { info = GETF2(args, NULL, range_n, sa, sb, 0); return info; } width = FORMULA1(m, n, 0, init_bk, args -> nthreads); width = ((width + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; if (width > n - init_bk) width = n - init_bk; if (width < init_bk) { BLASLONG temp; temp = FORMULA2(m, n, 0, init_bk, args -> nthreads); temp = ((temp + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; if (temp < GEMM_UNROLL_N) temp = GEMM_UNROLL_N; if (temp < init_bk) init_bk = temp; } next_bk = init_bk; bk = init_bk; range_n_new[0] = offset; range_n_new[1] = offset + bk; info = CNAME(args, NULL, range_n_new, sa, sb, 0); TRSM_ILTCOPY(bk, bk, a, lda, 0, sb); is = 0; num_cpu = 0; sbb = (FLOAT *)((((BLASULONG)(sb + GEMM_PQ * GEMM_PQ * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + 
GEMM_OFFSET_B); while (is < mn) { width = FORMULA1(m, n, is, bk, args -> nthreads); width = ((width + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; if (width < bk) { next_bk = FORMULA2(m, n, is, bk, args -> nthreads); next_bk = ((next_bk + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; if (next_bk > bk) next_bk = bk; #if 0 if (next_bk < GEMM_UNROLL_N) next_bk = MIN(GEMM_UNROLL_N, mn - bk - is); #else if (next_bk < GEMM_UNROLL_N) next_bk = MAX(GEMM_UNROLL_N, mn - bk - is); #endif width = next_bk; } if (width > mn - is - bk) { next_bk = mn - is - bk; width = next_bk; } nn = n - bk - is; if (width > nn) width = nn; if (num_cpu > 1) exec_blas_async_wait(num_cpu - 1, &queue[1]); range[0] = 0; range[1] = width; num_cpu = 1; nn -= width; newarg.a = sb; newarg.b = a + (is + is * lda) * COMPSIZE; newarg.d = (void *)flag; newarg.m = m - bk - is; newarg.n = n - bk - is; newarg.k = bk; newarg.ldb = is + offset; while (nn > 0){ width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu); nn -= width; if (nn < 0) width = width + nn; range[num_cpu + 1] = range[num_cpu] + width; queue[num_cpu].mode = mode; //queue[num_cpu].routine = inner_advanced_thread; queue[num_cpu].routine = (void *)inner_basic_thread; queue[num_cpu].args = &newarg; queue[num_cpu].range_m = NULL; queue[num_cpu].range_n = &range[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; flag[num_cpu * CACHE_LINE_SIZE] = 1; num_cpu ++; } queue[num_cpu - 1].next = NULL; is += bk; bk = n - is; if (bk > next_bk) bk = next_bk; range_n_new[0] = offset + is; range_n_new[1] = offset + is + bk; if (num_cpu > 1) { exec_blas_async(1, &queue[1]); #if 0 inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, 0); iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); #else if (range[1] >= bk * 4) { BLASLONG myrange[2]; myrange[0] = 0; myrange[1] = bk; inner_basic_thread(&newarg, NULL, &myrange[0], sa, sbb, -1); iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); myrange[0] = bk; myrange[1] = range[1]; inner_basic_thread(&newarg, NULL, &myrange[0], sa, sbb, -1); } else { inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, -1); iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); } #endif for (i = 1; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {}; TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb); } else { inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, -1); iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); } if (iinfo && !info) info = iinfo + is; } next_bk = init_bk; bk = init_bk; is = 0; while (is < mn) { bk = mn - is; if (bk > next_bk) bk = next_bk; width = FORMULA1(m, n, is, bk, args -> nthreads); width = ((width + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; if (width < bk) { next_bk = FORMULA2(m, n, is, bk, args -> nthreads); next_bk = ((next_bk + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; if (next_bk > bk) next_bk = bk; #if 0 if (next_bk < GEMM_UNROLL_N) next_bk = MIN(GEMM_UNROLL_N, mn - bk - is); #else if (next_bk < GEMM_UNROLL_N) next_bk = MAX(GEMM_UNROLL_N, mn - bk - is); #endif } if (width > mn - is - bk) { next_bk = mn - is - bk; width = next_bk; } blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1, (void *)LASWP_PLUS, args -> nthreads); is += bk; } return info; } #endif 
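[Editor's note — illustrative sketch, not part of the archive.] getrf_parallel.c above (and getrf_single.c further below) drive a right-looking blocked LU: factor a jb-wide panel, apply the row interchanges to the trailing columns (LASWP_PLUS / LASWP_NCOPY), solve the U12 block against the packed unit-lower panel (TRSM_KERNEL_LT), then rank-jb update the trailing submatrix (GEMM_KERNEL_N), with FORMULA1/FORMULA2 choosing the look-ahead panel width per thread. The sketch below shows only the serial data flow of one such step on an unpacked column-major matrix; the function name `blocked_lu_step` is hypothetical, pivot row swaps and the packed GEMM_ONCOPY/ITCOPY buffers are omitted, and it is in no way a substitute for the threaded kernels above.

```c
/* Hypothetical reference sketch of one right-looking blocked LU step,
 * assuming the jb-wide panel A[j:m, j:j+jb) has already been factored
 * in place (L11 unit lower / U11 upper, L21 stored below the panel)
 * and the trailing columns have already been row-swapped. */
static void blocked_lu_step(int m, int n, int j, int jb,
                            double *a, int lda)   /* column-major */
{
    int i, k, c;

    /* U12 = L11^{-1} * A12 : unit-lower forward substitution per column
     * (the role of TRSM_KERNEL_LT on the packed panel).                */
    for (c = j + jb; c < n; c++)
        for (k = j; k < j + jb; k++)
            for (i = k + 1; i < j + jb; i++)
                a[i + c * lda] -= a[i + k * lda] * a[k + c * lda];

    /* A22 -= L21 * U12 : the trailing rank-jb update
     * (the role of GEMM_KERNEL_N with alpha = -1).                     */
    for (c = j + jb; c < n; c++)
        for (k = j; k < j + jb; k++)
            for (i = j + jb; i < m; i++)
                a[i + c * lda] -= a[i + k * lda] * a[k + c * lda];
}
```

The parallel driver partitions exactly these two loops across threads (range_M / range_N) and overlaps the factorization of the next panel with the update of the rest of the trailing matrix, which is what the flag[] spin-waits and the job[][] buffers coordinate.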
OpenBLAS-0.2.20/lapack/getrf/getrf_parallel_omp.c000066400000000000000000000163701313527062700215170ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #define GEMM_PQ MAX(GEMM_P, GEMM_Q) #define REAL_GEMM_R (GEMM_R - GEMM_PQ) static FLOAT dm1 = -1.; static void inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ BLASLONG is, min_i; BLASLONG js, min_j; BLASLONG jjs, min_jj; BLASLONG m = args -> m; BLASLONG n = args -> n; BLASLONG k = args -> k; BLASLONG lda = args -> lda; BLASLONG off = args -> ldb; FLOAT *b = (FLOAT *)args -> b + (k ) * COMPSIZE; FLOAT *c = (FLOAT *)args -> b + ( k * lda) * COMPSIZE; FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE; blasint *ipiv = (blasint *)args -> c; if (range_n) { n = range_n[1] - range_n[0]; c += range_n[0] * lda * COMPSIZE; d += range_n[0] * lda * COMPSIZE; } for (js = 0; js < n; js += REAL_GEMM_R) { min_j = n - js; if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; for (jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_N){ min_jj = js + min_j - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #if 0 LASWP_NCOPY(min_jj, off + 1, off + k, c + (- off + jjs * lda) * COMPSIZE, lda, ipiv, sb + k * (jjs - js) * COMPSIZE); #else LASWP_PLUS(min_jj, off + 1, off + k, ZERO, #ifdef COMPLEX ZERO, #endif c + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1); GEMM_ONCOPY (k, min_jj, c + jjs * lda * COMPSIZE, lda, sb + (jjs - js) * k * COMPSIZE); #endif for (is = 0; is < k; is += GEMM_P) { min_i = k - is; if (min_i > GEMM_P) min_i = GEMM_P; TRSM_KERNEL_LT(min_i, min_jj, k, dm1, #ifdef COMPLEX ZERO, #endif (FLOAT *)args -> a + k * is * COMPSIZE, sb + (jjs - js) * k * COMPSIZE, c + (is + jjs * lda) * COMPSIZE, lda, is); } } for (is = 0; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY (k, min_i, b + is * COMPSIZE, lda, sa); GEMM_KERNEL_N(min_i, min_j, k, dm1, #ifdef COMPLEX ZERO, #endif sa, sb, d + (is + js * lda) * COMPSIZE, lda); } } } blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG m, n, lda, offset; blasint *ipiv, iinfo, info; BLASLONG j, jb, mn, blocking; FLOAT *a, *offsetA, *offsetB; BLASLONG range_N[2]; blas_arg_t newarg; int mode; FLOAT *sbb; #ifndef COMPLEX #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif m = args -> m; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; ipiv = (blasint *)args -> c; offset = 0; if (range_n) { m -= range_n[0]; n = range_n[1] - range_n[0]; offset = range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } if (m <= 0 || n <= 0) return 0; mn = MIN(m, n); blocking = ((mn / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; if (blocking > GEMM_Q) blocking = GEMM_Q; #ifdef POWER8 if (blocking <= GEMM_UNROLL_N) { info = GETF2(args, NULL, range_n, sa, sb, 0); return info; } #else if (blocking <= GEMM_UNROLL_N*2) { info = GETF2(args, NULL, range_n, sa, sb, 0); return info; } #endif sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); info = 0; for (j = 0; j < mn; j += blocking) { jb = mn - j; if (jb > blocking) jb = blocking; offsetA = a + j * lda * COMPSIZE; offsetB = a + (j + jb) * lda * COMPSIZE; range_N[0] = offset + j; range_N[1] = offset + j + jb; iinfo = CNAME(args, NULL, 
range_N, sa, sb, 0); if (iinfo && !info) info = iinfo + j; if (j + jb < n) { TRSM_ILTCOPY(jb, jb, offsetA + j * COMPSIZE, lda, 0, sb); newarg.m = m - jb - j; newarg.n = n - jb - j; newarg.k = jb; newarg.a = sb; newarg.lda = lda; newarg.b = a + (j + j * lda) * COMPSIZE; newarg.ldb = j + offset; newarg.c = ipiv; newarg.common = NULL; newarg.nthreads = args -> nthreads; gemm_thread_n(mode, &newarg, NULL, NULL, (void *)inner_thread, sa, sbb, args -> nthreads); } } for (j = 0; j < mn; j += jb) { jb = MIN(mn - j, blocking); LASWP_PLUS(jb, j + jb + offset + 1, mn + offset, ZERO, #ifdef COMPLEX ZERO, #endif a - (offset - j * lda) * COMPSIZE, lda, NULL, 0 , ipiv, 1); } return info; } OpenBLAS-0.2.20/lapack/getrf/getrf_single.c000066400000000000000000000136051313527062700203270ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #define GEMM_PQ MAX(GEMM_P, GEMM_Q) #define REAL_GEMM_R (GEMM_R - GEMM_PQ) static FLOAT dm1 = -1.; blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG m, n, lda, offset; BLASLONG j, js, jmin, is, imin, jc, jcmin; BLASLONG jjs, min_jj; blasint *ipiv, iinfo, info; BLASLONG jb, mn, blocking; FLOAT *a, *offsetA, *offsetB; BLASLONG range_N[2]; FLOAT *sbb; m = args -> m; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; ipiv = (blasint *)args -> c; offset = 0; if (range_n) { m -= range_n[0]; n = range_n[1] - range_n[0]; offset = range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } if (m <= 0 || n <= 0) return 0; mn = MIN(m, n); blocking = ((mn / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; if (blocking > GEMM_Q) blocking = GEMM_Q; #ifdef POWER8 if (blocking <= GEMM_UNROLL_N) { info = GETF2(args, NULL, range_n, sa, sb, 0); return info; } #else if (blocking <= GEMM_UNROLL_N * 2) { info = GETF2(args, NULL, range_n, sa, sb, 0); return info; } #endif sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); info = 0; for (j = 0; j < mn; j += blocking) { jb = mn - j; if (jb > blocking) jb = blocking; offsetA = a + j * lda * COMPSIZE; offsetB = a + (j + jb) * lda * COMPSIZE; range_N[0] = offset + j; range_N[1] = offset + j + jb; iinfo = CNAME(args, NULL, range_N, sa, sb, 0); if (iinfo && !info) info = iinfo + j; if (j + jb < n) { TRSM_ILTCOPY(jb, jb, offsetA + j * COMPSIZE, lda, 0, sb); for (js = j + jb; js < n; js += REAL_GEMM_R){ jmin = n - js; if (jmin > REAL_GEMM_R) jmin = REAL_GEMM_R; for (jjs = js; jjs < js + jmin; jjs += GEMM_UNROLL_N){ min_jj = js + jmin - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #if 1 LASWP_PLUS(min_jj, j + offset + 1, j + jb + offset, ZERO, #ifdef COMPLEX ZERO, #endif a + (- offset + jjs * lda) * COMPSIZE, lda, NULL, 0 , ipiv, 1); GEMM_ONCOPY (jb, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sbb + jb * (jjs - js) * COMPSIZE); #else LASWP_NCOPY(min_jj, j + offset + 1, j + jb + offset, a + (- offset + jjs * lda) * COMPSIZE, lda, ipiv, sbb + jb * (jjs - js) * COMPSIZE); #endif for (jc = 0; jc < jb; jc += GEMM_P) { jcmin = jb - jc; if (jcmin > GEMM_P) jcmin = GEMM_P; TRSM_KERNEL_LT(jcmin, min_jj, jb, dm1, #ifdef COMPLEX ZERO, #endif sb + jb * jc * COMPSIZE, sbb + jb * (jjs - js) * COMPSIZE, a + (j + jc + jjs * lda) * COMPSIZE, lda, jc); } } for (is = j + jb; is < m; is += GEMM_P){ imin = m - is; if (imin > GEMM_P) imin = GEMM_P; GEMM_ITCOPY (jb, imin, offsetA + is * COMPSIZE, lda, sa); GEMM_KERNEL_N(imin, jmin, jb, dm1, #ifdef COMPLEX ZERO, #endif sa, sbb, a + (is + js * lda) * COMPSIZE, lda); } } } } for (j = 0; j < mn; j += jb) { jb = MIN(mn - j, blocking); LASWP_PLUS(jb, j + jb + offset + 1, mn + offset, ZERO, #ifdef COMPLEX ZERO, #endif a - (offset - j * lda) * COMPSIZE, lda, NULL, 0 , ipiv, 1); } return info; } OpenBLAS-0.2.20/lapack/getrs/000077500000000000000000000000001313527062700155235ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/getrs/Makefile000066400000000000000000000214221313527062700171640ustar00rootroot00000000000000TOPDIR = ../.. 
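# Naming scheme for the objects built below:
#   <p>getrs_<t>_<mode>  with  <p>    = s|d|q|c|z|x  (precision),
#                              <t>    = N|T          for the real cases,
#                                       N|T|R|C      for the complex cases,
#                              <mode> = single|parallel (the parallel set is
#                                       only added when SMP is defined).
# Real objects are compiled from getrs_single.c / getrs_parallel.c and complex
# ones from zgetrs_single.c / zgetrs_parallel.c, selecting the variant purely
# through the -D/-U COMPLEX, DOUBLE/XDOUBLE and TRANS preprocessor flags.
# The $(PSUFFIX) rules rebuild the same objects with $(PFLAGS) (typically the
# profiling build).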
include ../../Makefile.system SBLASOBJS = sgetrs_N_single.$(SUFFIX) sgetrs_T_single.$(SUFFIX) DBLASOBJS = dgetrs_N_single.$(SUFFIX) dgetrs_T_single.$(SUFFIX) QBLASOBJS = qgetrs_N_single.$(SUFFIX) qgetrs_T_single.$(SUFFIX) CBLASOBJS = cgetrs_N_single.$(SUFFIX) cgetrs_T_single.$(SUFFIX) cgetrs_R_single.$(SUFFIX) cgetrs_C_single.$(SUFFIX) ZBLASOBJS = zgetrs_N_single.$(SUFFIX) zgetrs_T_single.$(SUFFIX) zgetrs_R_single.$(SUFFIX) zgetrs_C_single.$(SUFFIX) XBLASOBJS = xgetrs_N_single.$(SUFFIX) xgetrs_T_single.$(SUFFIX) xgetrs_R_single.$(SUFFIX) xgetrs_C_single.$(SUFFIX) ifdef SMP SBLASOBJS += sgetrs_N_parallel.$(SUFFIX) sgetrs_T_parallel.$(SUFFIX) DBLASOBJS += dgetrs_N_parallel.$(SUFFIX) dgetrs_T_parallel.$(SUFFIX) QBLASOBJS += qgetrs_N_parallel.$(SUFFIX) qgetrs_T_parallel.$(SUFFIX) CBLASOBJS += cgetrs_N_parallel.$(SUFFIX) cgetrs_T_parallel.$(SUFFIX) cgetrs_R_parallel.$(SUFFIX) cgetrs_C_parallel.$(SUFFIX) ZBLASOBJS += zgetrs_N_parallel.$(SUFFIX) zgetrs_T_parallel.$(SUFFIX) zgetrs_R_parallel.$(SUFFIX) zgetrs_C_parallel.$(SUFFIX) XBLASOBJS += xgetrs_N_parallel.$(SUFFIX) xgetrs_T_parallel.$(SUFFIX) xgetrs_R_parallel.$(SUFFIX) xgetrs_C_parallel.$(SUFFIX) endif sgetrs_N_single.$(SUFFIX) : getrs_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANS $< -o $(@F) sgetrs_T_single.$(SUFFIX) : getrs_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANS $< -o $(@F) sgetrs_N_parallel.$(SUFFIX) : getrs_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANS $< -o $(@F) sgetrs_T_parallel.$(SUFFIX) : getrs_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANS $< -o $(@F) dgetrs_N_single.$(SUFFIX) : getrs_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANS $< -o $(@F) dgetrs_T_single.$(SUFFIX) : getrs_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANS $< -o $(@F) dgetrs_N_parallel.$(SUFFIX) : getrs_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANS $< -o $(@F) dgetrs_T_parallel.$(SUFFIX) : getrs_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANS $< -o $(@F) qgetrs_N_single.$(SUFFIX) : getrs_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANS $< -o $(@F) qgetrs_T_single.$(SUFFIX) : getrs_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANS $< -o $(@F) qgetrs_N_parallel.$(SUFFIX) : getrs_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANS $< -o $(@F) qgetrs_T_parallel.$(SUFFIX) : getrs_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANS $< -o $(@F) cgetrs_N_single.$(SUFFIX) : zgetrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=1 $< -o $(@F) cgetrs_T_single.$(SUFFIX) : zgetrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=2 $< -o $(@F) cgetrs_R_single.$(SUFFIX) : zgetrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=3 $< -o $(@F) cgetrs_C_single.$(SUFFIX) : zgetrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=4 $< -o $(@F) cgetrs_N_parallel.$(SUFFIX) : zgetrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=1 $< -o $(@F) cgetrs_T_parallel.$(SUFFIX) : zgetrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=2 $< -o $(@F) cgetrs_R_parallel.$(SUFFIX) : zgetrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=3 $< -o $(@F) cgetrs_C_parallel.$(SUFFIX) : zgetrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=4 $< -o $(@F) zgetrs_N_single.$(SUFFIX) : zgetrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=1 $< -o $(@F) zgetrs_T_single.$(SUFFIX) : zgetrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=2 $< -o $(@F) zgetrs_R_single.$(SUFFIX) : zgetrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE 
-DTRANS=3 $< -o $(@F) zgetrs_C_single.$(SUFFIX) : zgetrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=4 $< -o $(@F) zgetrs_N_parallel.$(SUFFIX) : zgetrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=1 $< -o $(@F) zgetrs_T_parallel.$(SUFFIX) : zgetrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=2 $< -o $(@F) zgetrs_R_parallel.$(SUFFIX) : zgetrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=3 $< -o $(@F) zgetrs_C_parallel.$(SUFFIX) : zgetrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=4 $< -o $(@F) xgetrs_N_single.$(SUFFIX) : zgetrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=1 $< -o $(@F) xgetrs_T_single.$(SUFFIX) : zgetrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=2 $< -o $(@F) xgetrs_R_single.$(SUFFIX) : zgetrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=3 $< -o $(@F) xgetrs_C_single.$(SUFFIX) : zgetrs_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=4 $< -o $(@F) xgetrs_N_parallel.$(SUFFIX) : zgetrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=1 $< -o $(@F) xgetrs_T_parallel.$(SUFFIX) : zgetrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=2 $< -o $(@F) xgetrs_R_parallel.$(SUFFIX) : zgetrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=3 $< -o $(@F) xgetrs_C_parallel.$(SUFFIX) : zgetrs_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=4 $< -o $(@F) sgetrs_N_single.$(PSUFFIX) : getrs_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANS $< -o $(@F) sgetrs_T_single.$(PSUFFIX) : getrs_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANS $< -o $(@F) sgetrs_N_parallel.$(PSUFFIX) : getrs_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANS $< -o $(@F) sgetrs_T_parallel.$(PSUFFIX) : getrs_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANS $< -o $(@F) dgetrs_N_single.$(PSUFFIX) : getrs_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANS $< -o $(@F) dgetrs_T_single.$(PSUFFIX) : getrs_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANS $< -o $(@F) dgetrs_N_parallel.$(PSUFFIX) : getrs_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANS $< -o $(@F) dgetrs_T_parallel.$(PSUFFIX) : getrs_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANS $< -o $(@F) qgetrs_N_single.$(PSUFFIX) : getrs_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANS $< -o $(@F) qgetrs_T_single.$(PSUFFIX) : getrs_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANS $< -o $(@F) qgetrs_N_parallel.$(PSUFFIX) : getrs_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANS $< -o $(@F) qgetrs_T_parallel.$(PSUFFIX) : getrs_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANS $< -o $(@F) cgetrs_N_single.$(PSUFFIX) : zgetrs_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=1 $< -o $(@F) cgetrs_T_single.$(PSUFFIX) : zgetrs_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=2 $< -o $(@F) cgetrs_R_single.$(PSUFFIX) : zgetrs_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=3 $< -o $(@F) cgetrs_C_single.$(PSUFFIX) : zgetrs_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=4 $< -o $(@F) cgetrs_N_parallel.$(PSUFFIX) : zgetrs_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=1 $< -o $(@F) cgetrs_T_parallel.$(PSUFFIX) : zgetrs_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=2 $< -o $(@F) cgetrs_R_parallel.$(PSUFFIX) : zgetrs_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=3 $< -o $(@F) cgetrs_C_parallel.$(PSUFFIX) : zgetrs_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=4 $< -o $(@F) zgetrs_N_single.$(PSUFFIX) : 
zgetrs_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=1 $< -o $(@F) zgetrs_T_single.$(PSUFFIX) : zgetrs_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=2 $< -o $(@F) zgetrs_R_single.$(PSUFFIX) : zgetrs_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=3 $< -o $(@F) zgetrs_C_single.$(PSUFFIX) : zgetrs_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=4 $< -o $(@F) zgetrs_N_parallel.$(PSUFFIX) : zgetrs_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=1 $< -o $(@F) zgetrs_T_parallel.$(PSUFFIX) : zgetrs_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=2 $< -o $(@F) zgetrs_R_parallel.$(PSUFFIX) : zgetrs_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=3 $< -o $(@F) zgetrs_C_parallel.$(PSUFFIX) : zgetrs_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=4 $< -o $(@F) xgetrs_N_single.$(PSUFFIX) : zgetrs_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=1 $< -o $(@F) xgetrs_T_single.$(PSUFFIX) : zgetrs_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=2 $< -o $(@F) xgetrs_R_single.$(PSUFFIX) : zgetrs_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=3 $< -o $(@F) xgetrs_C_single.$(PSUFFIX) : zgetrs_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=4 $< -o $(@F) xgetrs_N_parallel.$(PSUFFIX) : zgetrs_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=1 $< -o $(@F) xgetrs_T_parallel.$(PSUFFIX) : zgetrs_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=2 $< -o $(@F) xgetrs_R_parallel.$(PSUFFIX) : zgetrs_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=3 $< -o $(@F) xgetrs_C_parallel.$(PSUFFIX) : zgetrs_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=4 $< -o $(@F) include ../../Makefile.tail OpenBLAS-0.2.20/lapack/getrs/getrs_parallel.c000066400000000000000000000112541313527062700206720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { BLASLONG n = args -> n; BLASLONG off = 0; if (range_n) { n = range_n[1] - range_n[0]; off = range_n[0]; } #ifndef TRANS LASWP_PLUS(n, 1, args -> m, ZERO, (FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, 1); TRSM_LNLU (args, range_m, range_n, sa, sb, 0); TRSM_LNUN (args, range_m, range_n, sa, sb, 0); #else TRSM_LTUN (args, range_m, range_n, sa, sb, 0); TRSM_LTLU (args, range_m, range_n, sa, sb, 0); LASWP_MINUS(n, 1, args -> m, ZERO, (FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, -1); #endif return 0; } blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { int mode; #ifndef TRANS if (args -> n == 1){ LASWP_PLUS(1, 1, args -> m, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); TRSV_NUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); } else { #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif gemm_thread_n(mode, args, NULL, NULL, inner_thread, sa, sb, args -> nthreads); } #else if (args -> n == 1){ TRSV_TUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); TRSV_TLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); LASWP_MINUS(1, 1, args -> m, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1); } else { #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL | (1 << BLAS_TRANSA_SHIFT); #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL | (1 << BLAS_TRANSA_SHIFT); #else mode = BLAS_SINGLE | BLAS_REAL | (1 << BLAS_TRANSA_SHIFT); #endif gemm_thread_n(mode, args, NULL, NULL, inner_thread, sa, sb, args -> nthreads); } #endif return 0; } OpenBLAS-0.2.20/lapack/getrs/getrs_single.c000066400000000000000000000070231313527062700203560ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { #ifndef TRANS LASWP_PLUS(args -> n, 1, args -> m, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); if (args -> n == 1){ TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); TRSV_NUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); } else { TRSM_LNLU (args, range_m, range_n, sa, sb, 0); TRSM_LNUN (args, range_m, range_n, sa, sb, 0); } #else if (args -> n == 1){ TRSV_TUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); TRSV_TLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); } else { TRSM_LTUN (args, range_m, range_n, sa, sb, 0); TRSM_LTLU (args, range_m, range_n, sa, sb, 0); } LASWP_MINUS(args -> n, 1, args -> m, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1); #endif return 0; } OpenBLAS-0.2.20/lapack/getrs/zgetrs_parallel.c000066400000000000000000000125131313527062700210630ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { BLASLONG n = args -> n; BLASLONG off = 0; if (range_n) { n = range_n[1] - range_n[0]; off = range_n[0]; } #if TRANS == 1 LASWP_PLUS(n, 1, args -> m, ZERO, ZERO, (FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, 1); TRSM_LNLU (args, range_m, range_n, sa, sb, 0); TRSM_LNUN (args, range_m, range_n, sa, sb, 0); #elif TRANS == 2 TRSM_LTUN (args, range_m, range_n, sa, sb, 0); TRSM_LTLU (args, range_m, range_n, sa, sb, 0); LASWP_MINUS(n, 1, args -> m, ZERO, ZERO, (FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, -1); #elif TRANS == 3 LASWP_PLUS(n, 1, args -> m, ZERO, ZERO, (FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, 1); TRSM_LRLU (args, range_m, range_n, sa, sb, 0); TRSM_LRUN (args, range_m, range_n, sa, sb, 0); #else TRSM_LCUN (args, range_m, range_n, sa, sb, 0); TRSM_LCLU (args, range_m, range_n, sa, sb, 0); LASWP_MINUS(n, 1, args -> m, ZERO, ZERO, (FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, -1); #endif return 0; } blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { int mode; if (args -> n == 1){ #if TRANS == 1 LASWP_PLUS(1, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); ZTRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); ZTRSV_NUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); #elif TRANS == 2 ZTRSV_TUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); ZTRSV_TLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); LASWP_MINUS(1, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1); #elif TRANS == 3 LASWP_PLUS(1, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); ZTRSV_RLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); ZTRSV_RUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); #else ZTRSV_CUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); ZTRSV_CLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); LASWP_MINUS(1, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1); #endif } else { #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif gemm_thread_n(mode, args, NULL, NULL, inner_thread, sa, sb, args -> nthreads); } return 0; } OpenBLAS-0.2.20/lapack/getrs/zgetrs_single.c000066400000000000000000000071741313527062700205570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { #if TRANS == 1 LASWP_PLUS (args -> n, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); TRSM_LNLU(args, range_m, range_n, sa, sb, 0); TRSM_LNUN(args, range_m, range_n, sa, sb, 0); #elif TRANS == 2 TRSM_LTUN(args, range_m, range_n, sa, sb, 0); TRSM_LTLU(args, range_m, range_n, sa, sb, 0); LASWP_MINUS(args -> n, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1); #elif TRANS == 3 LASWP_PLUS (args -> n, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); TRSM_LRLU(args, range_m, range_n, sa, sb, 0); TRSM_LRUN(args, range_m, range_n, sa, sb, 0); #else TRSM_LCUN(args, range_m, range_n, sa, sb, 0); TRSM_LCLU(args, range_m, range_n, sa, sb, 0); LASWP_MINUS(args -> n, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1); #endif return 0; } OpenBLAS-0.2.20/lapack/laswp/000077500000000000000000000000001313527062700155255ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/laswp/Makefile000066400000000000000000000020111313527062700171570ustar00rootroot00000000000000TOPDIR = ../.. 
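# This Makefile only declares the <p>laswp_plus / <p>laswp_minus objects for
# each precision (s, d, q, c, z, x).  The combined rule at the bottom defers
# the actual compilation to the per-architecture subdirectory via
#   cd $(ARCH) && $(MAKE) ../$(@F)
# so that each arch Makefile (alpha/, arm/, arm64/, generic/, ...) can choose
# the laswp kernel variant that suits it.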
include ../../Makefile.system SBLASOBJS = slaswp_plus.$(SUFFIX) slaswp_minus.$(SUFFIX) DBLASOBJS = dlaswp_plus.$(SUFFIX) dlaswp_minus.$(SUFFIX) QBLASOBJS = qlaswp_plus.$(SUFFIX) qlaswp_minus.$(SUFFIX) CBLASOBJS = claswp_plus.$(SUFFIX) claswp_minus.$(SUFFIX) ZBLASOBJS = zlaswp_plus.$(SUFFIX) zlaswp_minus.$(SUFFIX) XBLASOBJS = xlaswp_plus.$(SUFFIX) xlaswp_minus.$(SUFFIX) slaswp_plus.$(SUFFIX) slaswp_minus.$(SUFFIX) dlaswp_plus.$(SUFFIX) dlaswp_minus.$(SUFFIX) \ qlaswp_plus.$(SUFFIX) qlaswp_minus.$(SUFFIX) \ claswp_plus.$(SUFFIX) claswp_minus.$(SUFFIX) zlaswp_plus.$(SUFFIX) zlaswp_minus.$(SUFFIX) \ xlaswp_plus.$(SUFFIX) xlaswp_minus.$(SUFFIX) \ slaswp_plus.$(PSUFFIX) slaswp_minus.$(PSUFFIX) dlaswp_plus.$(PSUFFIX) dlaswp_minus.$(PSUFFIX) \ qlaswp_plus.$(PSUFFIX) qlaswp_minus.$(PSUFFIX) \ claswp_plus.$(PSUFFIX) claswp_minus.$(PSUFFIX) zlaswp_plus.$(PSUFFIX) zlaswp_minus.$(PSUFFIX) \ xlaswp_plus.$(PSUFFIX) xlaswp_minus.$(PSUFFIX) : dummy cd $(ARCH) && $(MAKE) ../$(@F) include ../../Makefile.tail OpenBLAS-0.2.20/lapack/laswp/alpha/000077500000000000000000000000001313527062700166125ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/laswp/alpha/Makefile000066400000000000000000000002221313527062700202460ustar00rootroot00000000000000TOPDIR = ../../.. include ../../../Makefile.system LASWP = ../generic/laswp_k_1.c ZLASWP = ../generic/zlaswp_k_1.c include ../generic/Makefile OpenBLAS-0.2.20/lapack/laswp/arm/000077500000000000000000000000001313527062700163045ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/laswp/arm/Makefile000066400000000000000000000010631313527062700177440ustar00rootroot00000000000000TOPDIR = ../../.. include ../../../Makefile.system ifeq ($(CORE), CORE2) LASWP = ../generic/laswp_k_2.c ZLASWP = ../generic/zlaswp_k_2.c endif ifeq ($(CORE), OPTERON) LASWP = ../generic/laswp_k_1.c ZLASWP = ../generic/zlaswp_k_1.c endif ifeq ($(CORE), PRESCOTT) LASWP = ../generic/laswp_k_1.c ZLASWP = ../generic/zlaswp_k_1.c endif ifeq ($(DYNAMIC_ARCH), 1) LASWP = ../generic/laswp_k_4.c ZLASWP = ../generic/zlaswp_k_4.c endif ifndef LASWP LASWP = ../generic/laswp_k.c endif ifndef ZLASWP ZLASWP = ../generic/zlaswp_k.c endif include ../generic/Makefile OpenBLAS-0.2.20/lapack/laswp/arm64/000077500000000000000000000000001313527062700164565ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/laswp/arm64/Makefile000066400000000000000000000010631313527062700201160ustar00rootroot00000000000000TOPDIR = ../../.. include ../../../Makefile.system ifeq ($(CORE), CORE2) LASWP = ../generic/laswp_k_2.c ZLASWP = ../generic/zlaswp_k_2.c endif ifeq ($(CORE), OPTERON) LASWP = ../generic/laswp_k_1.c ZLASWP = ../generic/zlaswp_k_1.c endif ifeq ($(CORE), PRESCOTT) LASWP = ../generic/laswp_k_1.c ZLASWP = ../generic/zlaswp_k_1.c endif ifeq ($(DYNAMIC_ARCH), 1) LASWP = ../generic/laswp_k_4.c ZLASWP = ../generic/zlaswp_k_4.c endif ifndef LASWP LASWP = ../generic/laswp_k.c endif ifndef ZLASWP ZLASWP = ../generic/zlaswp_k.c endif include ../generic/Makefile OpenBLAS-0.2.20/lapack/laswp/generic/000077500000000000000000000000001313527062700171415ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/laswp/generic/Makefile000066400000000000000000000053341313527062700206060ustar00rootroot00000000000000ifndef INCLUDED TOPDIR = ../../.. 
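# LASWP / ZLASWP may already have been set by the architecture Makefile that
# includes this one (e.g. to one of the fixed-unroll sources laswp_k_1/2/4/8.c);
# if not, they default to the generic dispatchers laswp_k.c / zlaswp_k.c below.
# Every source is built twice per precision: with -UMINUS for the forward
# interchange and with -DMINUS for the variant that walks the pivot list in the
# opposite direction, plus the corresponding $(PSUFFIX) objects using $(PFLAGS).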
include $(TOPDIR)/Makefile.system endif ifndef LASWP LASWP = ../generic/laswp_k.c endif ifndef ZLASWP ZLASWP = ../generic/zlaswp_k.c endif LASWP_DEPS = ../generic/laswp_k_1.c ../generic/laswp_k_2.c \ ../generic/laswp_k_4.c ../generic/laswp_k_8.c ZLASWP_DEPS = ../generic/zlaswp_k_1.c ../generic/zlaswp_k_2.c \ ../generic/zlaswp_k_4.c include ../../../Makefile.tail all: ../slaswp_plus.$(SUFFIX) : $(LASWP) $(LASWP_DEPS) $(CC) -c $(CFLAGS) -UDOUBLE -UMINUS $< -o ../$(@F) ../slaswp_minus.$(SUFFIX) : $(LASWP) $(LASWP_DEPS) $(CC) -c $(CFLAGS) -UDOUBLE -DMINUS $< -o ../$(@F) ../dlaswp_plus.$(SUFFIX) : $(LASWP) $(LASWP_DEPS) $(CC) -c $(CFLAGS) -DDOUBLE -UMINUS $< -o ../$(@F) ../dlaswp_minus.$(SUFFIX) : $(LASWP) $(LASWP_DEPS) $(CC) -c $(CFLAGS) -DDOUBLE -DMINUS $< -o ../$(@F) ../qlaswp_plus.$(SUFFIX) : $(LASWP) $(LASWP_DEPS) $(CC) -c $(CFLAGS) -DXDOUBLE -UMINUS $< -o ../$(@F) ../qlaswp_minus.$(SUFFIX) : $(LASWP) $(LASWP_DEPS) $(CC) -c $(CFLAGS) -DXDOUBLE -DMINUS $< -o ../$(@F) ../claswp_plus.$(SUFFIX) : $(ZLASWP) $(ZLASWP_DEPS) $(CC) -c $(CFLAGS) -UDOUBLE -UMINUS $< -o ../$(@F) ../claswp_minus.$(SUFFIX) : $(ZLASWP) $(ZLASWP_DEPS) $(CC) -c $(CFLAGS) -UDOUBLE -DMINUS $< -o ../$(@F) ../zlaswp_plus.$(SUFFIX) : $(ZLASWP) $(ZLASWP_DEPS) $(CC) -c $(CFLAGS) -DDOUBLE -UMINUS $< -o ../$(@F) ../zlaswp_minus.$(SUFFIX) : $(ZLASWP) $(ZLASWP_DEPS) $(CC) -c $(CFLAGS) -DDOUBLE -DMINUS $< -o ../$(@F) ../xlaswp_plus.$(SUFFIX) : $(ZLASWP) $(ZLASWP_DEPS) $(CC) -c $(CFLAGS) -DXDOUBLE -UMINUS $< -o ../$(@F) ../xlaswp_minus.$(SUFFIX) : $(ZLASWP) $(ZLASWP_DEPS) $(CC) -c $(CFLAGS) -DXDOUBLE -DMINUS $< -o ../$(@F) ../slaswp_plus.$(PSUFFIX) : $(LASWP) $(CC) -c $(PFLAGS) -UDOUBLE -UMINUS $< -o ../$(@F) ../slaswp_minus.$(PSUFFIX) : $(LASWP) $(CC) -c $(PFLAGS) -UDOUBLE -DMINUS $< -o ../$(@F) ../dlaswp_plus.$(PSUFFIX) : $(LASWP) $(CC) -c $(PFLAGS) -DDOUBLE -UMINUS $< -o ../$(@F) ../dlaswp_minus.$(PSUFFIX) : $(LASWP) $(CC) -c $(PFLAGS) -DDOUBLE -DMINUS $< -o ../$(@F) ../qlaswp_plus.$(PSUFFIX) : $(LASWP) $(CC) -c $(PFLAGS) -DXDOUBLE -UMINUS $< -o ../$(@F) ../qlaswp_minus.$(PSUFFIX) : $(LASWP) $(CC) -c $(PFLAGS) -DXDOUBLE -DMINUS $< -o ../$(@F) ../claswp_plus.$(PSUFFIX) : $(ZLASWP) $(CC) -c $(PFLAGS) -UDOUBLE -UMINUS $< -o ../$(@F) ../claswp_minus.$(PSUFFIX) : $(ZLASWP) $(CC) -c $(PFLAGS) -UDOUBLE -DMINUS $< -o ../$(@F) ../zlaswp_plus.$(PSUFFIX) : $(ZLASWP) $(CC) -c $(PFLAGS) -DDOUBLE -UMINUS $< -o ../$(@F) ../zlaswp_minus.$(PSUFFIX) : $(ZLASWP) $(CC) -c $(PFLAGS) -DDOUBLE -DMINUS $< -o ../$(@F) ../xlaswp_plus.$(PSUFFIX) : $(ZLASWP) $(CC) -c $(PFLAGS) -DXDOUBLE -UMINUS $< -o ../$(@F) ../xlaswp_minus.$(PSUFFIX) : $(ZLASWP) $(CC) -c $(PFLAGS) -DXDOUBLE -DMINUS $< -o ../$(@F) OpenBLAS-0.2.20/lapack/laswp/generic/laswp_k.c000066400000000000000000000054621313527062700207540ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include "common.h" #if GEMM_UNROLL_N >= 8 #include "laswp_k_8.c" #elif GEMM_UNROLL_N >= 4 #include "laswp_k_4.c" #elif GEMM_UNROLL_N >= 2 #include "laswp_k_2.c" #else #include "laswp_k_1.c" #endif OpenBLAS-0.2.20/lapack/laswp/generic/laswp_k_1.c000066400000000000000000000140551313527062700211720ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef MINUS #define a2 (a1 + 1) #else #define a2 (a1 - 1) #endif int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1; FLOAT *b1, *b2; FLOAT A1, A2, B1, B2; a--; k1 --; #ifndef MINUS ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif if (n <= 0) return 0; rows = k2-k1; if (rows <=0) return 0; if (rows == 1) { //Only have 1 row ip1 = *ipiv; a1 = a + k1 + 1; b1 = a + ip1; if(a1 == b1) return 0; for(j=0; j 0) { do { piv = ipiv; #ifndef MINUS a1 = a + k1 + 1; #else a1 = a + k2; #endif ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; b1 = a + ip1; b2 = a + ip2; i = (rows >> 1); i--; //Main Loop while (i > 0) { #ifdef OPTERON #ifndef MINUS asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1)); asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(b1)); #else asm volatile("prefetchw -2 * 128(%0)\n" : : "r"(a1)); asm volatile("prefetchw -2 * 128(%0)\n" : : "r"(b1)); #endif #endif #ifdef CORE2 #ifndef MINUS asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(a1)); asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(b1)); asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(b2)); #else asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(a1)); asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(b1)); asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(b2)); #endif #endif A1 = *a1; A2 = *a2; B1 = *b1; B2 = *b2; ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; } else if (b2 != a2) { *a2 = B2; *b2 = A2; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *a1 = A2; *a2 = A1; } else { *a1 = A2; *a2 = B2; *b2 = A1; } } } else { if (b2 == a1) { *a1 = A2; *a2 = B1; *b1 = A1; } else if (b2 == a2) { *a1 = B1; *b1 = A1; } else if (b2 == b1) { *a1 = B1; *a2 = A1; *b1 = A2; } else { *a1 = B1; *a2 = B2; *b1 = A1; *b2 = A2; } } b1 = a + ip1; b2 = a + ip2; #ifndef MINUS a1 += 2; #else a1 -= 2; #endif i --; } //Loop Ending A1 = *a1; A2 = *a2; B1 = *b1; B2 = *b2; if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; } else if (b2 != a2) { *a2 = B2; *b2 = A2; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *a1 = A2; *a2 = A1; } else { *a1 = A2; *a2 = B2; *b2 = A1; } } } else { if (b2 == a1) { *a1 = A2; *a2 = B1; *b1 = A1; } else if (b2 == a2) { *a1 = B1; *b1 = A1; } else if (b2 == b1) { *a1 = B1; *a2 = A1; *b1 = A2; } else { *a1 = B1; *a2 = B2; *b1 = A1; *b2 = A2; } } #ifndef MINUS a1 += 2; #else a1 -= 2; #endif //Remain i = (rows & 1); if (i > 0) { ip1 = *piv; b1 = a + ip1; A1 = *a1; B1 = *b1; *a1 = B1; *b1 = A1; } a += lda; j --; } while (j > 0); } return 0; } OpenBLAS-0.2.20/lapack/laswp/generic/laswp_k_2.c000066400000000000000000000216171313527062700211750ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef MINUS #define a2 (a1 + 1) #define a4 (a3 + 1) #else #define a2 (a1 - 1) #define a4 (a3 - 1) #endif int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1, *a3; FLOAT *b1, *b2, *b3, *b4; FLOAT A1, A2, B1, B2, A3, A4, B3, B4; a--; k1 --; #ifndef MINUS ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif if (n <= 0) return 0; j = (n >> 1); rows = k2-k1; if (rows <=0) return 0; if (rows == 1) { //Only have 1 row ip1 = *ipiv; a1 = a + k1 + 1; b1 = a + ip1; if(a1 == b1) return 0; for(j=0; j 0) { do { piv = ipiv; #ifndef MINUS a1 = a + k1 + 1; #else a1 = a + k2; #endif a3 = a1 + 1 * lda; ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; b1 = a + ip1; b2 = a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; i = ((rows) >> 1); // Loop pipeline i--; //Main Loop while (i > 0) { #ifdef CORE2 #ifndef MINUS asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1)); asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b3)); asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(a1)); asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(a3)); #else asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(b1)); asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(b3)); asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(a1)); asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(a3)); #endif #endif B1 = *b1; B2 = *b2; B3 = *b3; B4 = *b4; A1 = *a1; A2 = *a2; A3 = *a3; A4 = *a4; ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; } else if (b2 != a2) { *a2 = B2; *b2 = A2; *a4 = B4; *b4 = A4; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; } else { *a1 = A2; *a2 = B2; *b2 = A1; *a3 = A4; *a4 = B4; *b4 = A3; } } } else { if (b2 == a1) { *a1 = A2; *a2 = B1; *b1 = A1; *a3 = A4; *a4 = B3; *b3 = A3; } else if (b2 == a2) { *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; } else if (b2 == b1) { *a1 = B1; *a2 = A1; *b1 = A2; *a3 = B3; *a4 = A3; *b3 = A4; } else { *a1 = B1; *a2 = B2; *b1 = A1; *b2 = A2; 
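/* General case: both pivot targets lie outside the current row pair,
   so the two row swaps are independent.  The identical exchange is now
   repeated for the second column (a3/a4 vs. b3/b4) that this
   two-column unrolled kernel carries along; the surrounding if/else
   ladder exists only to handle the aliasing cases (ip1 or ip2 pointing
   back into the current pair) without storing any element twice. */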
*a3 = B3; *a4 = B4; *b3 = A3; *b4 = A4; } } b1 = a + ip1; b2 = a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; #ifndef MINUS a1 += 2; a3 += 2; #else a1 -= 2; a3 -= 2; #endif i --; } //Loop Ending B1 = *b1; B2 = *b2; B3 = *b3; B4 = *b4; A1 = *a1; A2 = *a2; A3 = *a3; A4 = *a4; if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; } else if (b2 != a2) { *a2 = B2; *b2 = A2; *a4 = B4; *b4 = A4; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; } else { *a1 = A2; *a2 = B2; *b2 = A1; *a3 = A4; *a4 = B4; *b4 = A3; } } } else { if (b2 == a1) { *a1 = A2; *a2 = B1; *b1 = A1; *a3 = A4; *a4 = B3; *b3 = A3; } else if (b2 == a2) { *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; } else if (b2 == b1) { *a1 = B1; *a2 = A1; *b1 = A2; *a3 = B3; *a4 = A3; *b3 = A4; } else { *a1 = B1; *a2 = B2; *b1 = A1; *b2 = A2; *a3 = B3; *a4 = B4; *b3 = A3; *b4 = A4; } } #ifndef MINUS a1 += 2; a3 += 2; #else a1 -= 2; a3 -= 2; #endif //Remain i = ((rows) & 1); if (i > 0) { ip1 = *piv; b1 = a + ip1; b3 = b1 + 1 * lda; A1 = *a1; B1 = *b1; A3 = *a3; B3 = *b3; *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; } a += 2 * lda; j --; } while (j > 0); } if (n & 1) { piv = ipiv; #ifndef MINUS a1 = a + k1 + 1; #else a1 = a + k2; #endif ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; b1 = a + ip1; b2 = a + ip2; i = ((rows) >> 1); i --; while (i > 0) { A1 = *a1; A2 = *a2; B1 = *b1; B2 = *b2; ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; } else if (b2 != a2) { *a2 = B2; *b2 = A2; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *a1 = A2; *a2 = A1; } else { *a1 = A2; *a2 = B2; *b2 = A1; } } } else { if (b2 == a1) { *a1 = A2; *a2 = B1; *b1 = A1; } else if (b2 == a2) { *a1 = B1; *b1 = A1; } else if (b2 == b1) { *a1 = B1; *a2 = A1; *b1 = A2; } else { *a1 = B1; *a2 = B2; *b1 = A1; *b2 = A2; } } b1 = a + ip1; b2 = a + ip2; #ifndef MINUS a1 += 2; #else a1 -= 2; #endif i --; } //Loop Ending (n=1) A1 = *a1; A2 = *a2; B1 = *b1; B2 = *b2; if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; } else if (b2 != a2) { *a2 = B2; *b2 = A2; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *a1 = A2; *a2 = A1; } else { *a1 = A2; *a2 = B2; *b2 = A1; } } } else { if (b2 == a1) { *a1 = A2; *a2 = B1; *b1 = A1; } else if (b2 == a2) { *a1 = B1; *b1 = A1; } else if (b2 == b1) { *a1 = B1; *a2 = A1; *b1 = A2; } else { *a1 = B1; *a2 = B2; *b1 = A1; *b2 = A2; } } #ifndef MINUS a1 += 2; #else a1 -= 2; #endif //Remain i = (rows & 1); if (i > 0) { ip1 = *piv; b1 = a + ip1; A1 = *a1; B1 = *b1; *a1 = B1; *b1 = A1; } } return 0; } OpenBLAS-0.2.20/lapack/laswp/generic/laswp_k_4.c000066400000000000000000000323301313527062700211710ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. 
*/ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef MINUS #define a2 (a1 + 1) #define a4 (a3 + 1) #define a6 (a5 + 1) #define a8 (a7 + 1) #else #define a2 (a1 - 1) #define a4 (a3 - 1) #define a6 (a5 - 1) #define a8 (a7 - 1) #endif int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1, *a3, *a5, *a7; FLOAT *b1, *b2, *b3, *b4; FLOAT *b5, *b6, *b7, *b8; FLOAT A1, A2, B1, B2, A3, A4, B3, B4; FLOAT A5, A6, B5, B6, A7, A8, B7, B8; a--; k1 --; #ifndef MINUS ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif if (n <= 0) return 0; rows = k2-k1; if (rows <=0) return 0; if (rows == 1) { //Only have 1 row ip1 = *ipiv; a1 = a + k1 + 1; b1 = a + ip1; if(a1 == b1) return 0; for(j=0; j> 2); if (j > 0) { do { piv = ipiv; #ifndef MINUS a1 = a + k1 + 1; #else a1 = a + k2; #endif a3 = a1 + 1 * lda; a5 = a1 + 2 * lda; a7 = a1 + 3 * lda; ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; b1 = a + ip1; b2 = a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; b5 = b1 + 2 * lda; b6 = b2 + 2 * lda; b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; i = ((k2 - k1) >> 1); i--; //Loop pipeline //Main Loop while (i > 0) { A1 = *a1; A2 = *a2; A3 = *a3; A4 = *a4; A5 = *a5; A6 = *a6; A7 = *a7; A8 = *a8; B1 = *b1; B2 = *b2; B3 = *b3; B4 = *b4; B5 = *b5; B6 = *b6; B7 = *b7; B8 = *b8; ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; *a5 = A6; *a6 = A5; *a7 = A8; *a8 = A7; } else if (b2 != a2) { *a2 = B2; *b2 = A2; *a4 = B4; *b4 = A4; *a6 = B6; *b6 = A6; *a8 = B8; *b8 = A8; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; *a5 = A6; *a6 = A5; *a7 = A8; *a8 = A7; } else { *a1 = A2; *a2 = B2; *b2 = A1; *a3 = A4; *a4 = B4; *b4 = A3; *a5 = A6; *a6 = B6; *b6 = A5; *a7 = A8; *a8 = B8; *b8 = A7; } } } else { if (b2 == a1) { *a1 = A2; *a2 = B1; *b1 = A1; *a3 = A4; *a4 = B3; *b3 = A3; *a5 = A6; *a6 = B5; *b5 = A5; *a7 = A8; *a8 = B7; *b7 = A7; } else if (b2 == a2) { *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; *a5 = B5; *b5 = A5; *a7 = B7; *b7 = A7; } else if (b2 == b1) { *a1 = B1; *a2 = A1; *b1 = A2; *a3 = B3; *a4 = A3; *b3 = A4; *a5 = B5; *a6 = A5; *b5 = A6; *a7 = B7; *a8 = A7; *b7 = A8; } else { *a1 = B1; *a2 = B2; *b1 = A1; *b2 = A2; *a3 = 
B3; *a4 = B4; *b3 = A3; *b4 = A4; *a5 = B5; *a6 = B6; *b5 = A5; *b6 = A6; *a7 = B7; *a8 = B8; *b7 = A7; *b8 = A8; } } b1 = a + ip1; b2 = a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; b5 = b1 + 2 * lda; b6 = b2 + 2 * lda; b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; #ifndef MINUS a1 += 2; a3 += 2; a5 += 2; a7 += 2; #else a1 -= 2; a3 -= 2; a5 -= 2; a7 -= 2; #endif i --; } //Loop Ending A1 = *a1; A2 = *a2; A3 = *a3; A4 = *a4; A5 = *a5; A6 = *a6; A7 = *a7; A8 = *a8; B1 = *b1; B2 = *b2; B3 = *b3; B4 = *b4; B5 = *b5; B6 = *b6; B7 = *b7; B8 = *b8; if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; *a5 = A6; *a6 = A5; *a7 = A8; *a8 = A7; } else if (b2 != a2) { *a2 = B2; *b2 = A2; *a4 = B4; *b4 = A4; *a6 = B6; *b6 = A6; *a8 = B8; *b8 = A8; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; *a5 = A6; *a6 = A5; *a7 = A8; *a8 = A7; } else { *a1 = A2; *a2 = B2; *b2 = A1; *a3 = A4; *a4 = B4; *b4 = A3; *a5 = A6; *a6 = B6; *b6 = A5; *a7 = A8; *a8 = B8; *b8 = A7; } } } else { if (b2 == a1) { *a1 = A2; *a2 = B1; *b1 = A1; *a3 = A4; *a4 = B3; *b3 = A3; *a5 = A6; *a6 = B5; *b5 = A5; *a7 = A8; *a8 = B7; *b7 = A7; } else if (b2 == a2) { *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; *a5 = B5; *b5 = A5; *a7 = B7; *b7 = A7; } else if (b2 == b1) { *a1 = B1; *a2 = A1; *b1 = A2; *a3 = B3; *a4 = A3; *b3 = A4; *a5 = B5; *a6 = A5; *b5 = A6; *a7 = B7; *a8 = A7; *b7 = A8; } else { *a1 = B1; *a2 = B2; *b1 = A1; *b2 = A2; *a3 = B3; *a4 = B4; *b3 = A3; *b4 = A4; *a5 = B5; *a6 = B6; *b5 = A5; *b6 = A6; *a7 = B7; *a8 = B8; *b7 = A7; *b8 = A8; } } #ifndef MINUS a1 += 2; a3 += 2; a5 += 2; a7 += 2; #else a1 -= 2; a3 -= 2; a5 -= 2; a7 -= 2; #endif //Remain i = ((rows) & 1); if (i > 0) { ip1 = *piv; b1 = a + ip1; b3 = b1 + 1 * lda; b5 = b1 + 2 * lda; b7 = b1 + 3 * lda; A1 = *a1; B1 = *b1; A3 = *a3; B3 = *b3; A5 = *a5; B5 = *b5; A7 = *a7; B7 = *b7; *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; *a5 = B5; *b5 = A5; *a7 = B7; *b7 = A7; } a += 4 * lda; j --; } while (j > 0); } if (n & 2) { piv = ipiv; #ifndef MINUS a1 = a + k1 + 1; #else a1 = a + k2; #endif a3 = a1 + 1 * lda; ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; b1 = a + ip1; b2 = a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; i = ((rows) >> 1); i--; while (i > 0) { A1 = *a1; A2 = *a2; A3 = *a3; A4 = *a4; B1 = *b1; B2 = *b2; B3 = *b3; B4 = *b4; ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; } else if (b2 != a2) { *a2 = B2; *b2 = A2; *a4 = B4; *b4 = A4; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; } else { *a1 = A2; *a2 = B2; *b2 = A1; *a3 = A4; *a4 = B4; *b4 = A3; } } } else { if (b2 == a1) { *a1 = A2; *a2 = B1; *b1 = A1; *a3 = A4; *a4 = B3; *b3 = A3; } else if (b2 == a2) { *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; } else if (b2 == b1) { *a1 = B1; *a2 = A1; *b1 = A2; *a3 = B3; *a4 = A3; *b3 = A4; } else { *a1 = B1; *a2 = B2; *b1 = A1; *b2 = A2; *a3 = B3; *a4 = B4; *b3 = A3; *b4 = A4; } } b1 = a + ip1; b2 = a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; #ifndef MINUS a1 += 2; a3 += 2; #else a1 -= 2; a3 -= 2; #endif i --; } //Loop Ending B1 = *b1; B2 = *b2; B3 = *b3; B4 = *b4; A1 = *a1; A2 = *a2; A3 = *a3; A4 = *a4; if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; } else if (b2 != a2) { *a2 = B2; *b2 = A2; *a4 = B4; *b4 = A4; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; } else { *a1 = A2; *a2 = B2; *b2 = A1; *a3 = A4; *a4 = B4; *b4 
= A3; } } } else { if (b2 == a1) { *a1 = A2; *a2 = B1; *b1 = A1; *a3 = A4; *a4 = B3; *b3 = A3; } else if (b2 == a2) { *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; } else if (b2 == b1) { *a1 = B1; *a2 = A1; *b1 = A2; *a3 = B3; *a4 = A3; *b3 = A4; } else { *a1 = B1; *a2 = B2; *b1 = A1; *b2 = A2; *a3 = B3; *a4 = B4; *b3 = A3; *b4 = A4; } } #ifndef MINUS a1 += 2; a3 += 2; #else a1 -= 2; a3 -= 2; #endif i = ((rows) & 1); if (i > 0) { ip1 = *piv; b1 = a + ip1; b3 = b1 + 1 * lda; A1 = *a1; B1 = *b1; A3 = *a3; B3 = *b3; *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; } a += 2 * lda; } if (n & 1) { piv = ipiv; #ifndef MINUS a1 = a + k1 + 1; #else a1 = a + k2; #endif ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; b1 = a + ip1; b2 = a + ip2; i = ((rows) >> 1); i --; while (i > 0) { A1 = *a1; A2 = *a2; B1 = *b1; B2 = *b2; ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; } else if (b2 != a2) { *a2 = B2; *b2 = A2; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *a1 = A2; *a2 = A1; } else { *a1 = A2; *a2 = B2; *b2 = A1; } } } else { if (b2 == a1) { *a1 = A2; *a2 = B1; *b1 = A1; } else if (b2 == a2) { *a1 = B1; *b1 = A1; } else if (b2 == b1) { *a1 = B1; *a2 = A1; *b1 = A2; } else { *a1 = B1; *a2 = B2; *b1 = A1; *b2 = A2; } } b1 = a + ip1; b2 = a + ip2; #ifndef MINUS a1 += 2; #else a1 -= 2; #endif i --; } //Loop Ending (n=1) A1 = *a1; A2 = *a2; B1 = *b1; B2 = *b2; if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; } else if (b2 != a2) { *a2 = B2; *b2 = A2; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *a1 = A2; *a2 = A1; } else { *a1 = A2; *a2 = B2; *b2 = A1; } } } else { if (b2 == a1) { *a1 = A2; *a2 = B1; *b1 = A1; } else if (b2 == a2) { *a1 = B1; *b1 = A1; } else if (b2 == b1) { *a1 = B1; *a2 = A1; *b1 = A2; } else { *a1 = B1; *a2 = B2; *b1 = A1; *b2 = A2; } } #ifndef MINUS a1 += 2; #else a1 -= 2; #endif //Remain i = (rows & 1); if (i > 0) { ip1 = *piv; b1 = a + ip1; A1 = *a1; B1 = *b1; *a1 = B1; *b1 = A1; } } return 0; } OpenBLAS-0.2.20/lapack/laswp/generic/laswp_k_8.c000066400000000000000000000552101313527062700211770ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef MINUS #define a2 (a1 + 1) #define a4 (a3 + 1) #define a6 (a5 + 1) #define a8 (a7 + 1) #define a10 (a9 + 1) #define a12 (a11 + 1) #define a14 (a13 + 1) #define a16 (a15 + 1) #else #define a2 (a1 - 1) #define a4 (a3 - 1) #define a6 (a5 - 1) #define a8 (a7 - 1) #define a10 (a9 - 1) #define a12 (a11 - 1) #define a14 (a13 - 1) #define a16 (a15 - 1) #endif int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1, *a3, *a5, *a7; FLOAT *a9, *a11, *a13, *a15; FLOAT *b1, *b2, *b3, *b4; FLOAT *b5, *b6, *b7, *b8; FLOAT *b9, *b10, *b11, *b12; FLOAT *b13, *b14, *b15, *b16; FLOAT A1, A2, B1, B2, A3, A4, B3, B4; FLOAT A5, A6, B5, B6, A7, A8, B7, B8; FLOAT A9, A10, B9, B10, A11, A12, B11, B12; FLOAT A13, A14, B13, B14, A15, A16, B15, B16; a--; k1 --; #ifndef MINUS ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif if (n <= 0) return 0; rows = k2-k1; if (rows <=0) return 0; if (rows == 1) { //Only have 1 row ip1 = *ipiv; a1 = a + k1 + 1; b1 = a + ip1; if(a1 == b1) return 0; for(j=0; j> 3); if (j > 0) { do { piv = ipiv; #ifndef MINUS a1 = a + k1 + 1; #else a1 = a + k2; #endif a3 = a1 + 1 * lda; a5 = a1 + 2 * lda; a7 = a1 + 3 * lda; a9 = a1 + 4 * lda; a11 = a1 + 5 * lda; a13 = a1 + 6 * lda; a15 = a1 + 7 * lda; ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; b1 = a + ip1; b2 = a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; b5 = b1 + 2 * lda; b6 = b2 + 2 * lda; b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; b9 = b1 + 4 * lda; b10 = b2 + 4 * lda; b11 = b1 + 5 * lda; b12 = b2 + 5 * lda; b13 = b1 + 6 * lda; b14 = b2 + 6 * lda; b15 = b1 + 7 * lda; b16 = b2 + 7 * lda; i = (rows >> 1); i--; //Loop pipeline //Main Loop while (i > 0) { B1 = *b1; B2 = *b2; B3 = *b3; B4 = *b4; B5 = *b5; B6 = *b6; B7 = *b7; B8 = *b8; B9 = *b9; B10 = *b10; B11 = *b11; B12 = *b12; B13 = *b13; B14 = *b14; B15 = *b15; B16 = *b16; A1 = *a1; A2 = *a2; A3 = *a3; A4 = *a4; A5 = *a5; A6 = *a6; A7 = *a7; A8 = *a8; A9 = *a9; A10 = *a10; A11 = *a11; A12 = *a12; A13 = *a13; A14 = *a14; A15 = *a15; A16 = *a16; ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; *a5 = A6; *a6 = A5; *a7 = A8; *a8 = A7; *a9 = A10; *a10 = A9; *a11 = A12; *a12 = A11; *a13 = A14; *a14 = A13; *a15 = A16; *a16 = A15; } else if (b2 != a2) { *a2 = B2; *b2 = A2; *a4 = B4; *b4 = A4; *a6 = B6; *b6 = A6; *a8 = B8; *b8 = A8; *a10 = B10; *b10 = A10; *a12 = B12; *b12 = A12; *a14 = B14; *b14 = A14; *a16 = B16; *b16 = A16; } } else if (b1 == a2) { if 
(b2 != a1) { if (b2 == a2) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; *a5 = A6; *a6 = A5; *a7 = A8; *a8 = A7; *a9 = A10; *a10 = A9; *a11 = A12; *a12 = A11; *a13 = A14; *a14 = A13; *a15 = A16; *a16 = A15; } else { *a1 = A2; *a2 = B2; *b2 = A1; *a3 = A4; *a4 = B4; *b4 = A3; *a5 = A6; *a6 = B6; *b6 = A5; *a7 = A8; *a8 = B8; *b8 = A7; *a9 = A10; *a10 = B10; *b10 = A9; *a11 = A12; *a12 = B12; *b12 = A11; *a13 = A14; *a14 = B14; *b14 = A13; *a15 = A16; *a16 = B16; *b16 = A15; } } } else { if (b2 == a1) { *a1 = A2; *a2 = B1; *b1 = A1; *a3 = A4; *a4 = B3; *b3 = A3; *a5 = A6; *a6 = B5; *b5 = A5; *a7 = A8; *a8 = B7; *b7 = A7; *a9 = A10; *a10 = B9; *b9 = A9; *a11 = A12; *a12 = B11; *b11 = A11; *a13 = A14; *a14 = B13; *b13 = A13; *a15 = A16; *a16 = B15; *b15 = A15; } else if (b2 == a2) { *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; *a5 = B5; *b5 = A5; *a7 = B7; *b7 = A7; *a9 = B9; *b9 = A9; *a11 = B11; *b11 = A11; *a13 = B13; *b13 = A13; *a15 = B15; *b15 = A15; } else if (b2 == b1) { *a1 = B1; *a2 = A1; *b1 = A2; *a3 = B3; *a4 = A3; *b3 = A4; *a5 = B5; *a6 = A5; *b5 = A6; *a7 = B7; *a8 = A7; *b7 = A8; *a9 = B9; *a10 = A9; *b9 = A10; *a11 = B11; *a12 = A11; *b11 = A12; *a13 = B13; *a14 = A13; *b13 = A14; *a15 = B15; *a16 = A15; *b15 = A16; } else { *a1 = B1; *a2 = B2; *b1 = A1; *b2 = A2; *a3 = B3; *a4 = B4; *b3 = A3; *b4 = A4; *a5 = B5; *a6 = B6; *b5 = A5; *b6 = A6; *a7 = B7; *a8 = B8; *b7 = A7; *b8 = A8; *a9 = B9; *a10 = B10; *b9 = A9; *b10 = A10; *a11 = B11; *a12 = B12; *b11 = A11; *b12 = A12; *a13 = B13; *a14 = B14; *b13 = A13; *b14 = A14; *a15 = B15; *a16 = B16; *b15 = A15; *b16 = A16; } } b1 = a + ip1; b2 = a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; b5 = b1 + 2 * lda; b6 = b2 + 2 * lda; b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; b9 = b1 + 4 * lda; b10 = b2 + 4 * lda; b11 = b1 + 5 * lda; b12 = b2 + 5 * lda; b13 = b1 + 6 * lda; b14 = b2 + 6 * lda; b15 = b1 + 7 * lda; b16 = b2 + 7 * lda; #ifndef MINUS a1 += 2; a3 += 2; a5 += 2; a7 += 2; a9 += 2; a11 += 2; a13 += 2; a15 += 2; #else a1 -= 2; a3 -= 2; a5 -= 2; a7 -= 2; a9 -= 2; a11 -= 2; a13 -= 2; a15 -= 2; #endif i --; } //Loop Ending B1 = *b1; B2 = *b2; B3 = *b3; B4 = *b4; B5 = *b5; B6 = *b6; B7 = *b7; B8 = *b8; B9 = *b9; B10 = *b10; B11 = *b11; B12 = *b12; B13 = *b13; B14 = *b14; B15 = *b15; B16 = *b16; A1 = *a1; A2 = *a2; A3 = *a3; A4 = *a4; A5 = *a5; A6 = *a6; A7 = *a7; A8 = *a8; A9 = *a9; A10 = *a10; A11 = *a11; A12 = *a12; A13 = *a13; A14 = *a14; A15 = *a15; A16 = *a16; if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; *a5 = A6; *a6 = A5; *a7 = A8; *a8 = A7; *a9 = A10; *a10 = A9; *a11 = A12; *a12 = A11; *a13 = A14; *a14 = A13; *a15 = A16; *a16 = A15; } else if (b2 != a2) { *a2 = B2; *b2 = A2; *a4 = B4; *b4 = A4; *a6 = B6; *b6 = A6; *a8 = B8; *b8 = A8; *a10 = B10; *b10 = A10; *a12 = B12; *b12 = A12; *a14 = B14; *b14 = A14; *a16 = B16; *b16 = A16; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; *a5 = A6; *a6 = A5; *a7 = A8; *a8 = A7; *a9 = A10; *a10 = A9; *a11 = A12; *a12 = A11; *a13 = A14; *a14 = A13; *a15 = A16; *a16 = A15; } else { *a1 = A2; *a2 = B2; *b2 = A1; *a3 = A4; *a4 = B4; *b4 = A3; *a5 = A6; *a6 = B6; *b6 = A5; *a7 = A8; *a8 = B8; *b8 = A7; *a9 = A10; *a10 = B10; *b10 = A9; *a11 = A12; *a12 = B12; *b12 = A11; *a13 = A14; *a14 = B14; *b14 = A13; *a15 = A16; *a16 = B16; *b16 = A15; } } } else { if (b2 == a1) { *a1 = A2; *a2 = B1; *b1 = A1; *a3 = A4; *a4 = B3; *b3 = A3; *a5 = A6; *a6 = B5; *b5 = A5; *a7 = A8; *a8 = B7; *b7 = A7; *a9 = A10; *a10 = B9; *b9 = A9; *a11 = A12; *a12 
= B11; *b11 = A11; *a13 = A14; *a14 = B13; *b13 = A13; *a15 = A16; *a16 = B15; *b15 = A15; } else if (b2 == a2) { *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; *a5 = B5; *b5 = A5; *a7 = B7; *b7 = A7; *a9 = B9; *b9 = A9; *a11 = B11; *b11 = A11; *a13 = B13; *b13 = A13; *a15 = B15; *b15 = A15; } else if (b2 == b1) { *a1 = B1; *a2 = A1; *b1 = A2; *a3 = B3; *a4 = A3; *b3 = A4; *a5 = B5; *a6 = A5; *b5 = A6; *a7 = B7; *a8 = A7; *b7 = A8; *a9 = B9; *a10 = A9; *b9 = A10; *a11 = B11; *a12 = A11; *b11 = A12; *a13 = B13; *a14 = A13; *b13 = A14; *a15 = B15; *a16 = A15; *b15 = A16; } else { *a1 = B1; *a2 = B2; *b1 = A1; *b2 = A2; *a3 = B3; *a4 = B4; *b3 = A3; *b4 = A4; *a5 = B5; *a6 = B6; *b5 = A5; *b6 = A6; *a7 = B7; *a8 = B8; *b7 = A7; *b8 = A8; *a9 = B9; *a10 = B10; *b9 = A9; *b10 = A10; *a11 = B11; *a12 = B12; *b11 = A11; *b12 = A12; *a13 = B13; *a14 = B14; *b13 = A13; *b14 = A14; *a15 = B15; *a16 = B16; *b15 = A15; *b16 = A16; } } #ifndef MINUS a1 += 2; a3 += 2; a5 += 2; a7 += 2; a9 += 2; a11 += 2; a13 += 2; a15 += 2; #else a1 -= 2; a3 -= 2; a5 -= 2; a7 -= 2; a9 -= 2; a11 -= 2; a13 -= 2; a15 -= 2; #endif //Remain i = (rows & 1); if (i > 0) { ip1 = *piv; b1 = a + ip1; b3 = b1 + 1 * lda; b5 = b1 + 2 * lda; b7 = b1 + 3 * lda; b9 = b1 + 4 * lda; b11 = b1 + 5 * lda; b13 = b1 + 6 * lda; b15 = b1 + 7 * lda; A1 = *a1; B1 = *b1; A3 = *a3; B3 = *b3; A5 = *a5; B5 = *b5; A7 = *a7; B7 = *b7; A9 = *a9; B9 = *b9; A11 = *a11; B11 = *b11; A13 = *a13; B13 = *b13; A15 = *a15; B15 = *b15; *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; *a5 = B5; *b5 = A5; *a7 = B7; *b7 = A7; *a9 = B9; *b9 = A9; *a11 = B11; *b11 = A11; *a13 = B13; *b13 = A13; *a15 = B15; *b15 = A15; } a += 8 * lda; j --; } while (j > 0); } if (n & 4) { piv = ipiv; #ifndef MINUS a1 = a + k1 + 1; #else a1 = a + k2; #endif a3 = a1 + 1 * lda; a5 = a1 + 2 * lda; a7 = a1 + 3 * lda; ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; b1 = a + ip1; b2 = a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; b5 = b1 + 2 * lda; b6 = b2 + 2 * lda; b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; i = (rows >> 1); i --; while (i > 0) { A1 = *a1; A2 = *a2; A3 = *a3; A4 = *a4; A5 = *a5; A6 = *a6; A7 = *a7; A8 = *a8; B1 = *b1; B2 = *b2; B3 = *b3; B4 = *b4; B5 = *b5; B6 = *b6; B7 = *b7; B8 = *b8; ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; *a5 = A6; *a6 = A5; *a7 = A8; *a8 = A7; } else if (b2 != a2) { *a2 = B2; *b2 = A2; *a4 = B4; *b4 = A4; *a6 = B6; *b6 = A6; *a8 = B8; *b8 = A8; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; *a5 = A6; *a6 = A5; *a7 = A8; *a8 = A7; } else { *a1 = A2; *a2 = B2; *b2 = A1; *a3 = A4; *a4 = B4; *b4 = A3; *a5 = A6; *a6 = B6; *b6 = A5; *a7 = A8; *a8 = B8; *b8 = A7; } } } else { if (b2 == a1) { *a1 = A2; *a2 = B1; *b1 = A1; *a3 = A4; *a4 = B3; *b3 = A3; *a5 = A6; *a6 = B5; *b5 = A5; *a7 = A8; *a8 = B7; *b7 = A7; } else if (b2 == a2) { *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; *a5 = B5; *b5 = A5; *a7 = B7; *b7 = A7; } else if (b2 == b1) { *a1 = B1; *a2 = A1; *b1 = A2; *a3 = B3; *a4 = A3; *b3 = A4; *a5 = B5; *a6 = A5; *b5 = A6; *a7 = B7; *a8 = A7; *b7 = A8; } else { *a1 = B1; *a2 = B2; *b1 = A1; *b2 = A2; *a3 = B3; *a4 = B4; *b3 = A3; *b4 = A4; *a5 = B5; *a6 = B6; *b5 = A5; *b6 = A6; *a7 = B7; *a8 = B8; *b7 = A7; *b8 = A8; } } b1 = a + ip1; b2 = a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; b5 = b1 + 2 * lda; b6 = b2 + 2 * lda; b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; #ifndef MINUS a1 += 2; a3 += 2; a5 += 2; a7 += 2; #else a1 -= 2; a3 -= 2; a5 -= 2; a7 -= 2; 
#endif i --; } A1 = *a1; A2 = *a2; A3 = *a3; A4 = *a4; A5 = *a5; A6 = *a6; A7 = *a7; A8 = *a8; B1 = *b1; B2 = *b2; B3 = *b3; B4 = *b4; B5 = *b5; B6 = *b6; B7 = *b7; B8 = *b8; if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; *a5 = A6; *a6 = A5; *a7 = A8; *a8 = A7; } else if (b2 != a2) { *a2 = B2; *b2 = A2; *a4 = B4; *b4 = A4; *a6 = B6; *b6 = A6; *a8 = B8; *b8 = A8; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; *a5 = A6; *a6 = A5; *a7 = A8; *a8 = A7; } else { *a1 = A2; *a2 = B2; *b2 = A1; *a3 = A4; *a4 = B4; *b4 = A3; *a5 = A6; *a6 = B6; *b6 = A5; *a7 = A8; *a8 = B8; *b8 = A7; } } } else { if (b2 == a1) { *a1 = A2; *a2 = B1; *b1 = A1; *a3 = A4; *a4 = B3; *b3 = A3; *a5 = A6; *a6 = B5; *b5 = A5; *a7 = A8; *a8 = B7; *b7 = A7; } else if (b2 == a2) { *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; *a5 = B5; *b5 = A5; *a7 = B7; *b7 = A7; } else if (b2 == b1) { *a1 = B1; *a2 = A1; *b1 = A2; *a3 = B3; *a4 = A3; *b3 = A4; *a5 = B5; *a6 = A5; *b5 = A6; *a7 = B7; *a8 = A7; *b7 = A8; } else { *a1 = B1; *a2 = B2; *b1 = A1; *b2 = A2; *a3 = B3; *a4 = B4; *b3 = A3; *b4 = A4; *a5 = B5; *a6 = B6; *b5 = A5; *b6 = A6; *a7 = B7; *a8 = B8; *b7 = A7; *b8 = A8; } } #ifndef MINUS a1 += 2; a3 += 2; a5 += 2; a7 += 2; #else a1 -= 2; a3 -= 2; a5 -= 2; a7 -= 2; #endif i = (rows & 1); if (i > 0) { ip1 = *piv; b1 = a + ip1; b3 = b1 + 1 * lda; b5 = b1 + 2 * lda; b7 = b1 + 3 * lda; A1 = *a1; B1 = *b1; A3 = *a3; B3 = *b3; A5 = *a5; B5 = *b5; A7 = *a7; B7 = *b7; *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; *a5 = B5; *b5 = A5; *a7 = B7; *b7 = A7; } a += 4 * lda; } if (n & 2) { piv = ipiv; #ifndef MINUS a1 = a + k1 + 1; #else a1 = a + k2; #endif a3 = a1 + 1 * lda; ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; b1 = a + ip1; b2 = a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; i = ((rows) >> 1); i--; while (i > 0) { A1 = *a1; A2 = *a2; A3 = *a3; A4 = *a4; B1 = *b1; B2 = *b2; B3 = *b3; B4 = *b4; ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; } else if (b2 != a2) { *a2 = B2; *b2 = A2; *a4 = B4; *b4 = A4; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; } else { *a1 = A2; *a2 = B2; *b2 = A1; *a3 = A4; *a4 = B4; *b4 = A3; } } } else { if (b2 == a1) { *a1 = A2; *a2 = B1; *b1 = A1; *a3 = A4; *a4 = B3; *b3 = A3; } else if (b2 == a2) { *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; } else if (b2 == b1) { *a1 = B1; *a2 = A1; *b1 = A2; *a3 = B3; *a4 = A3; *b3 = A4; } else { *a1 = B1; *a2 = B2; *b1 = A1; *b2 = A2; *a3 = B3; *a4 = B4; *b3 = A3; *b4 = A4; } } b1 = a + ip1; b2 = a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; #ifndef MINUS a1 += 2; a3 += 2; #else a1 -= 2; a3 -= 2; #endif i --; } //Loop Ending B1 = *b1; B2 = *b2; B3 = *b3; B4 = *b4; A1 = *a1; A2 = *a2; A3 = *a3; A4 = *a4; if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; } else if (b2 != a2) { *a2 = B2; *b2 = A2; *a4 = B4; *b4 = A4; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; } else { *a1 = A2; *a2 = B2; *b2 = A1; *a3 = A4; *a4 = B4; *b4 = A3; } } } else { if (b2 == a1) { *a1 = A2; *a2 = B1; *b1 = A1; *a3 = A4; *a4 = B3; *b3 = A3; } else if (b2 == a2) { *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; } else if (b2 == b1) { *a1 = B1; *a2 = A1; *b1 = A2; *a3 = B3; *a4 = A3; *b3 = A4; } else { *a1 = B1; *a2 = B2; *b1 = A1; *b2 = A2; *a3 = B3; *a4 = B4; *b3 = A3; *b4 = A4; } } #ifndef MINUS a1 += 2; a3 += 2; #else a1 -= 2; a3 -= 2; #endif i 
= ((rows) & 1); if (i > 0) { ip1 = *piv; b1 = a + ip1; b3 = b1 + 1 * lda; A1 = *a1; B1 = *b1; A3 = *a3; B3 = *b3; *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; } a += 2 * lda; } if (n & 1) { piv = ipiv; #ifndef MINUS a1 = a + k1 + 1; #else a1 = a + k2; #endif ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; b1 = a + ip1; b2 = a + ip2; i = ((rows) >> 1); i --; while (i > 0) { A1 = *a1; A2 = *a2; B1 = *b1; B2 = *b2; ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; } else if (b2 != a2) { *a2 = B2; *b2 = A2; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *a1 = A2; *a2 = A1; } else { *a1 = A2; *a2 = B2; *b2 = A1; } } } else { if (b2 == a1) { *a1 = A2; *a2 = B1; *b1 = A1; } else if (b2 == a2) { *a1 = B1; *b1 = A1; } else if (b2 == b1) { *a1 = B1; *a2 = A1; *b1 = A2; } else { *a1 = B1; *a2 = B2; *b1 = A1; *b2 = A2; } } b1 = a + ip1; b2 = a + ip2; #ifndef MINUS a1 += 2; #else a1 -= 2; #endif i --; } //Loop Ending (n=1) A1 = *a1; A2 = *a2; B1 = *b1; B2 = *b2; if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; } else if (b2 != a2) { *a2 = B2; *b2 = A2; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *a1 = A2; *a2 = A1; } else { *a1 = A2; *a2 = B2; *b2 = A1; } } } else { if (b2 == a1) { *a1 = A2; *a2 = B1; *b1 = A1; } else if (b2 == a2) { *a1 = B1; *b1 = A1; } else if (b2 == b1) { *a1 = B1; *a2 = A1; *b1 = A2; } else { *a1 = B1; *a2 = B2; *b1 = A1; *b2 = A2; } } #ifndef MINUS a1 += 2; #else a1 -= 2; #endif //Remain i = (rows & 1); if (i > 0) { ip1 = *piv; b1 = a + ip1; A1 = *a1; B1 = *b1; *a1 = B1; *b1 = A1; } } return 0; } OpenBLAS-0.2.20/lapack/laswp/generic/zlaswp_k.c000066400000000000000000000054051313527062700211430ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include "common.h" #if GEMM_UNROLL_N >= 4 #include "zlaswp_k_4.c" #elif GEMM_UNROLL_N >= 2 #include "zlaswp_k_2.c" #else #include "zlaswp_k_1.c" #endif OpenBLAS-0.2.20/lapack/laswp/generic/zlaswp_k_1.c000066400000000000000000000173651313527062700213730ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef MINUS #define a2 (a1 + 2) #else #define a2 (a1 - 2) #endif int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1; FLOAT *b1, *b2; FLOAT A1, A2, B1, B2, A3, A4, B3, B4; a -= 2; lda *= 2; k1 --; #ifndef MINUS ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif if (n <= 0) return 0; rows = k2-k1; if (rows <=0) return 0; if (rows == 1) { //Only have 1 row ip1 = *ipiv * 2; #ifndef MINUS a1 = a + (k1 + 1) * 2; #else a1 = a + k2 * 2; #endif b1 = a + ip1; if(a1 == b1) return 0; for(j=0; j 0) { do { piv = ipiv; #ifndef MINUS a1 = a + (k1 + 1) * 2; #else a1 = a + k2 * 2; #endif ip1 = *piv * 2; piv += incx; ip2 = *piv * 2; piv += incx; b1 = a + ip1; b2 = a + ip2; i = ((k2 - k1) >> 1); i --; //Loop pipeline //Main Loop while (i > 0) { #ifdef OPTERON #ifndef MINUS asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1)); asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(b1)); #else asm volatile("prefetchw -2 * 128(%0)\n" : : "r"(a1)); asm volatile("prefetchw -2 * 128(%0)\n" : : "r"(b1)); #endif #endif #ifdef CORE2 #ifndef MINUS asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(a1)); asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(b1)); asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(b2)); #else asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(a1)); asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(b1)); asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(b2)); #endif #endif A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); A4 = *(a2 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); ip1 = *piv * 2; piv += incx; ip2 = *piv * 2; piv += incx; if (b1 == a1) { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A3; *(b2 + 1) = A4; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; } else { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A1; *(b2 + 1) = A2; } } } else { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B1; *(a2 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; } else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = A1; *(a2 + 1) = A2; *(b1 + 0) = A3; *(b1 + 1) = A4; } else { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b2 + 0) = A3; *(b2 + 1) = A4; } } b1 = a + ip1; b2 = a + ip2; #ifndef MINUS a1 += 4; #else a1 -= 4; #endif i --; } //Loop Ending A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); A4 = *(a2 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); if (b1 == a1) { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A3; *(b2 + 1) = A4; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; } else { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A1; *(b2 + 1) = A2; } } } else { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B1; *(a2 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; 
} else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = A1; *(a2 + 1) = A2; *(b1 + 0) = A3; *(b1 + 1) = A4; } else { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b2 + 0) = A3; *(b2 + 1) = A4; } } #ifndef MINUS a1 += 4; #else a1 -= 4; #endif //Remain i = (rows & 1); if (i > 0) { ip1 = *piv * 2; b1 = a + ip1; A1 = *(a1 + 0); A2 = *(a1 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); *(a1 + 0) = B1; *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; } a += lda; j --; } while (j > 0); } return 0; } OpenBLAS-0.2.20/lapack/laswp/generic/zlaswp_k_2.c000066400000000000000000000333731313527062700213710ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifndef MINUS #define a2 (a1 + 2) #else #define a2 (a1 - 2) #endif int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1; FLOAT *b1, *b2; FLOAT A1, A2, B1, B2, A3, A4, B3, B4; FLOAT A5, A6, B5, B6, A7, A8, B7, B8; a -= 2; lda *= 2; k1 --; #ifndef MINUS ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif if (n <= 0) return 0; rows = k2-k1; if (rows <=0) return 0; if (rows == 1) { //Only have 1 row ip1 = *ipiv * 2; #ifndef MINUS a1 = a + (k1 + 1) * 2; #else a1 = a + k2 * 2; #endif b1 = a + ip1; if(a1 == b1) return 0; for(j=0; j> 1); if (j > 0) { do { piv = ipiv; #ifndef MINUS a1 = a + (k1 + 1) * 2; #else a1 = a + k2 * 2; #endif ip1 = *piv * 2; piv += incx; ip2 = *piv * 2; piv += incx; b1 = a + ip1; b2 = a + ip2; i = (rows >> 1); i--; //Loop pipeline //Main Loop while (i > 0) { #ifdef CORE2 #ifndef MINUS asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1)); asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1 + lda)); asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(a1)); asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(a1 + lda)); #else asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(b1)); asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(b1 + lda)); asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(a1)); asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(a1 + lda)); #endif #endif A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); A4 = *(a2 + 1); A5 = *(a1 + 0 + lda); A6 = *(a1 + 1 + lda); A7 = *(a2 + 0 + lda); A8 = *(a2 + 1 + lda); B1 = *(b1 + 0); B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); B5 = *(b1 + 0 + lda); B6 = *(b1 + 1 + lda); B7 = *(b2 + 0 + lda); B8 = *(b2 + 1 + lda); ip1 = *piv * 2; piv += incx; ip2 = *piv * 2; piv += incx; if (b1 == a1) { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; *(a1 + 0 + lda) = A7; *(a1 + 1 + lda) = A8; *(a2 + 0 + lda) = A5; *(a2 + 1 + lda) = A6; } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A3; *(b2 + 1) = A4; *(a2 + 0 + lda) = B7; *(a2 + 1 + lda) = B8; *(b2 + 0 + lda) = A7; *(b2 + 1 + lda) = A8; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; *(a1 + 0 + lda) = A7; *(a1 + 1 + lda) = A8; *(a2 + 0 + lda) = A5; *(a2 + 1 + lda) = A6; } else { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A1; *(b2 + 1) = A2; *(a1 + 0 + lda) = A7; *(a1 + 1 + lda) = A8; *(a2 + 0 + lda) = B7; *(a2 + 1 + lda) = B8; *(b2 + 0 + lda) = A5; *(b2 + 1 + lda) = A6; } } } else { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B1; *(a2 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; *(a1 + 0 + lda) = A7; *(a1 + 1 + lda) = A8; *(a2 + 0 + lda) = B5; *(a2 + 1 + lda) = B6; *(b1 + 0 + lda) = A5; *(b1 + 1 + lda) = A6; } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; *(a1 + 0 + lda) = B5; *(a1 + 1 + lda) = B6; *(b1 + 0 + lda) = A5; *(b1 + 1 + lda) = A6; } else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = A1; *(a2 + 1) = A2; *(b1 + 0) = A3; *(b1 + 1) = A4; *(a1 + 0 + lda) = B5; *(a1 + 1 + lda) = B6; *(a2 + 0 + lda) = A5; *(a2 + 1 + lda) = A6; *(b1 + 0 + lda) = A7; *(b1 + 1 + lda) = A8; } else { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b2 + 0) = A3; *(b2 + 1) = A4; *(a1 + 0 + lda) = 
B5; *(a1 + 1 + lda) = B6; *(a2 + 0 + lda) = B7; *(a2 + 1 + lda) = B8; *(b1 + 0 + lda) = A5; *(b1 + 1 + lda) = A6; *(b2 + 0 + lda) = A7; *(b2 + 1 + lda) = A8; } } b1 = a + ip1; b2 = a + ip2; #ifndef MINUS a1 += 4; #else a1 -= 4; #endif i --; } //Loop Ending A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); A4 = *(a2 + 1); A5 = *(a1 + 0 + lda); A6 = *(a1 + 1 + lda); A7 = *(a2 + 0 + lda); A8 = *(a2 + 1 + lda); B1 = *(b1 + 0); B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); B5 = *(b1 + 0 + lda); B6 = *(b1 + 1 + lda); B7 = *(b2 + 0 + lda); B8 = *(b2 + 1 + lda); if (b1 == a1) { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; *(a1 + 0 + lda) = A7; *(a1 + 1 + lda) = A8; *(a2 + 0 + lda) = A5; *(a2 + 1 + lda) = A6; } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A3; *(b2 + 1) = A4; *(a2 + 0 + lda) = B7; *(a2 + 1 + lda) = B8; *(b2 + 0 + lda) = A7; *(b2 + 1 + lda) = A8; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; *(a1 + 0 + lda) = A7; *(a1 + 1 + lda) = A8; *(a2 + 0 + lda) = A5; *(a2 + 1 + lda) = A6; } else { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A1; *(b2 + 1) = A2; *(a1 + 0 + lda) = A7; *(a1 + 1 + lda) = A8; *(a2 + 0 + lda) = B7; *(a2 + 1 + lda) = B8; *(b2 + 0 + lda) = A5; *(b2 + 1 + lda) = A6; } } } else { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B1; *(a2 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; *(a1 + 0 + lda) = A7; *(a1 + 1 + lda) = A8; *(a2 + 0 + lda) = B5; *(a2 + 1 + lda) = B6; *(b1 + 0 + lda) = A5; *(b1 + 1 + lda) = A6; } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; *(a1 + 0 + lda) = B5; *(a1 + 1 + lda) = B6; *(b1 + 0 + lda) = A5; *(b1 + 1 + lda) = A6; } else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = A1; *(a2 + 1) = A2; *(b1 + 0) = A3; *(b1 + 1) = A4; *(a1 + 0 + lda) = B5; *(a1 + 1 + lda) = B6; *(a2 + 0 + lda) = A5; *(a2 + 1 + lda) = A6; *(b1 + 0 + lda) = A7; *(b1 + 1 + lda) = A8; } else { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b2 + 0) = A3; *(b2 + 1) = A4; *(a1 + 0 + lda) = B5; *(a1 + 1 + lda) = B6; *(a2 + 0 + lda) = B7; *(a2 + 1 + lda) = B8; *(b1 + 0 + lda) = A5; *(b1 + 1 + lda) = A6; *(b2 + 0 + lda) = A7; *(b2 + 1 + lda) = A8; } } #ifndef MINUS a1 += 4; #else a1 -= 4; #endif //Remain i = (rows & 1); if (i > 0) { ip1 = *piv * 2; b1 = a + ip1; A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a1 + 0 + lda); A4 = *(a1 + 1 + lda); B1 = *(b1 + 0); B2 = *(b1 + 1); B3 = *(b1 + 0 + lda); B4 = *(b1 + 1 + lda); *(a1 + 0) = B1; *(a1 + 1) = B2; *(a1 + 0 + lda) = B3; *(a1 + 1 + lda) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b1 + 0 + lda) = A3; *(b1 + 1 + lda) = A4; } a += 2 * lda; j --; } while (j > 0); } if (n & 1) { piv = ipiv; #ifndef MINUS a1 = a + (k1 + 1) * 2; #else a1 = a + k2 * 2; #endif ip1 = *piv * 2; piv += incx; ip2 = *piv * 2; piv += incx; b1 = a + ip1; b2 = a + ip2; i = (rows >> 1); i--; //Loop pipeline //Main Loop while (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); A4 = *(a2 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); ip1 = *piv * 2; piv += incx; ip2 = *piv * 2; piv += incx; if (b1 == a1) { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A3; *(b2 + 1) = A4; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = 
A2; } else { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A1; *(b2 + 1) = A2; } } } else { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B1; *(a2 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; } else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = A1; *(a2 + 1) = A2; *(b1 + 0) = A3; *(b1 + 1) = A4; } else { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b2 + 0) = A3; *(b2 + 1) = A4; } } b1 = a + ip1; b2 = a + ip2; #ifndef MINUS a1 += 4; #else a1 -= 4; #endif i --; } //Loop Ending A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); A4 = *(a2 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); if (b1 == a1) { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A3; *(b2 + 1) = A4; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; } else { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A1; *(b2 + 1) = A2; } } } else { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B1; *(a2 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; } else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = A1; *(a2 + 1) = A2; *(b1 + 0) = A3; *(b1 + 1) = A4; } else { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b2 + 0) = A3; *(b2 + 1) = A4; } } #ifndef MINUS a1 += 4; #else a1 -= 4; #endif //Remain i = (rows & 1); if (i > 0) { ip1 = *piv * 2; b1 = a + ip1; A1 = *(a1 + 0); A2 = *(a1 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); *(a1 + 0) = B1; *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; } } return 0; } OpenBLAS-0.2.20/lapack/laswp/generic/zlaswp_k_4.c000066400000000000000000000572711313527062700213760ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef MINUS #define a2 (a1 + 2) #define a4 (a3 + 2) #define a6 (a5 + 2) #define a8 (a7 + 2) #else #define a2 (a1 - 2) #define a4 (a3 - 2) #define a6 (a5 - 2) #define a8 (a7 - 2) #endif int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1, *a3, *a5, *a7; FLOAT *b1, *b2, *b3, *b4; FLOAT *b5, *b6, *b7, *b8; FLOAT A1, A2, B1, B2, A3, A4, B3, B4; FLOAT A5, A6, B5, B6, A7, A8, B7, B8; FLOAT A9, A10, B9, B10, A11, A12, B11, B12; FLOAT A13, A14, B13, B14, A15, A16, B15, B16; a -= 2; lda *= 2; k1 --; #ifndef MINUS ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif if (n <= 0) return 0; rows = k2-k1; if (rows <=0) return 0; if (rows == 1) { //Only have 1 row ip1 = *ipiv * 2; #ifndef MINUS a1 = a + (k1 + 1) * 2; #else a1 = a + k2 * 2; #endif b1 = a + ip1; if(a1 == b1) return 0; for(j=0; j> 2); if (j > 0) { do { piv = ipiv; #ifndef MINUS a1 = a + (k1 + 1) * 2; #else a1 = a + k2 * 2; #endif a3 = a1 + 1 * lda; a5 = a1 + 2 * lda; a7 = a1 + 3 * lda; ip1 = *piv * 2; piv += incx; ip2 = *piv * 2; piv += incx; b1 = a + ip1; b2 = a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; b5 = b1 + 2 * lda; b6 = b2 + 2 * lda; b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; i = (rows >> 1); i--; //Loop pipeline //Main Loop while (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); A4 = *(a2 + 1); A5 = *(a3 + 0); A6 = *(a3 + 1); A7 = *(a4 + 0); A8 = *(a4 + 1); A9 = *(a5 + 0); A10 = *(a5 + 1); A11 = *(a6 + 0); A12 = *(a6 + 1); A13 = *(a7 + 0); A14 = *(a7 + 1); A15 = *(a8 + 0); A16 = *(a8 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); B5 = *(b3 + 0); B6 = *(b3 + 1); B7 = *(b4 + 0); B8 = *(b4 + 1); B9 = *(b5 + 0); B10 = *(b5 + 1); B11 = *(b6 + 0); B12 = *(b6 + 1); B13 = *(b7 + 0); B14 = *(b7 + 1); B15 = *(b8 + 0); B16 = *(b8 + 1); ip1 = *piv * 2; piv += incx; ip2 = *piv * 2; piv += incx; if (b1 == a1) { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; *(a3 + 0) = A7; *(a3 + 1) = A8; *(a4 + 0) = A5; *(a4 + 1) = A6; *(a5 + 0) = A11; *(a5 + 1) = A12; *(a6 + 0) = A9; *(a6 + 1) = A10; *(a7 + 0) = A15; *(a7 + 1) = A16; *(a8 + 0) = A13; *(a8 + 1) = A14; } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A3; *(b2 + 1) = A4; *(a4 + 0) = B7; *(a4 + 1) = B8; *(b4 + 0) = A7; *(b4 + 1) = A8; *(a6 + 0) = B11; *(a6 + 1) = B12; *(b6 + 0) = A11; *(b6 + 1) = A12; *(a8 + 0) = B15; *(a8 + 1) = B16; *(b8 + 0) = A15; *(b8 + 1) = A16; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *(a1 + 0) 
= A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; *(a3 + 0) = A7; *(a3 + 1) = A8; *(a4 + 0) = A5; *(a4 + 1) = A6; *(a5 + 0) = A11; *(a5 + 1) = A12; *(a6 + 0) = A9; *(a6 + 1) = A10; *(a7 + 0) = A15; *(a7 + 1) = A16; *(a8 + 0) = A13; *(a8 + 1) = A14; } else { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A1; *(b2 + 1) = A2; *(a3 + 0) = A7; *(a3 + 1) = A8; *(a4 + 0) = B7; *(a4 + 1) = B8; *(b4 + 0) = A5; *(b4 + 1) = A6; *(a5 + 0) = A11; *(a5 + 1) = A12; *(a6 + 0) = B11; *(a6 + 1) = B12; *(b6 + 0) = A9; *(b6 + 1) = A10; *(a7 + 0) = A15; *(a7 + 1) = A16; *(a8 + 0) = B15; *(a8 + 1) = B16; *(b8 + 0) = A13; *(b8 + 1) = A14; } } } else { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B1; *(a2 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; *(a3 + 0) = A7; *(a3 + 1) = A8; *(a4 + 0) = B5; *(a4 + 1) = B6; *(b3 + 0) = A5; *(b3 + 1) = A6; *(a5 + 0) = A11; *(a5 + 1) = A12; *(a6 + 0) = B9; *(a6 + 1) = B10; *(b5 + 0) = A9; *(b5 + 1) = A10; *(a7 + 0) = A15; *(a7 + 1) = A16; *(a8 + 0) = B13; *(a8 + 1) = B14; *(b7 + 0) = A13; *(b7 + 1) = A14; } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; *(a3 + 0) = B5; *(a3 + 1) = B6; *(b3 + 0) = A5; *(b3 + 1) = A6; *(a5 + 0) = B9; *(a5 + 1) = B10; *(b5 + 0) = A9; *(b5 + 1) = A10; *(a7 + 0) = B13; *(a7 + 1) = B14; *(b7 + 0) = A13; *(b7 + 1) = A14; } else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = A1; *(a2 + 1) = A2; *(b1 + 0) = A3; *(b1 + 1) = A4; *(a3 + 0) = B5; *(a3 + 1) = B6; *(a4 + 0) = A5; *(a4 + 1) = A6; *(b3 + 0) = A7; *(b3 + 1) = A8; *(a5 + 0) = B9; *(a5 + 1) = B10; *(a6 + 0) = A9; *(a6 + 1) = A10; *(b5 + 0) = A11; *(b5 + 1) = A12; *(a7 + 0) = B13; *(a7 + 1) = B14; *(a8 + 0) = A13; *(a8 + 1) = A14; *(b7 + 0) = A15; *(b7 + 1) = A16; } else { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b2 + 0) = A3; *(b2 + 1) = A4; *(a3 + 0) = B5; *(a3 + 1) = B6; *(a4 + 0) = B7; *(a4 + 1) = B8; *(b3 + 0) = A5; *(b3 + 1) = A6; *(b4 + 0) = A7; *(b4 + 1) = A8; *(a5 + 0) = B9; *(a5 + 1) = B10; *(a6 + 0) = B11; *(a6 + 1) = B12; *(b5 + 0) = A9; *(b5 + 1) = A10; *(b6 + 0) = A11; *(b6 + 1) = A12; *(a7 + 0) = B13; *(a7 + 1) = B14; *(a8 + 0) = B15; *(a8 + 1) = B16; *(b7 + 0) = A13; *(b7 + 1) = A14; *(b8 + 0) = A15; *(b8 + 1) = A16; } } b1 = a + ip1; b2 = a + ip2; b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; b5 = b1 + 2 * lda; b6 = b2 + 2 * lda; b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; #ifndef MINUS a1 += 4; a3 += 4; a5 += 4; a7 += 4; #else a1 -= 4; a3 -= 4; a5 -= 4; a7 -= 4; #endif i --; } //Loop Ending A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); A4 = *(a2 + 1); A5 = *(a3 + 0); A6 = *(a3 + 1); A7 = *(a4 + 0); A8 = *(a4 + 1); A9 = *(a5 + 0); A10 = *(a5 + 1); A11 = *(a6 + 0); A12 = *(a6 + 1); A13 = *(a7 + 0); A14 = *(a7 + 1); A15 = *(a8 + 0); A16 = *(a8 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); B5 = *(b3 + 0); B6 = *(b3 + 1); B7 = *(b4 + 0); B8 = *(b4 + 1); B9 = *(b5 + 0); B10 = *(b5 + 1); B11 = *(b6 + 0); B12 = *(b6 + 1); B13 = *(b7 + 0); B14 = *(b7 + 1); B15 = *(b8 + 0); B16 = *(b8 + 1); if (b1 == a1) { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; *(a3 + 0) = A7; *(a3 + 1) = A8; *(a4 + 0) = A5; *(a4 + 1) = A6; *(a5 + 0) = A11; *(a5 + 1) = A12; *(a6 + 0) = A9; *(a6 + 1) = A10; *(a7 + 0) = A15; *(a7 + 1) = A16; *(a8 + 0) = A13; *(a8 + 1) = A14; } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A3; *(b2 + 1) = A4; *(a4 + 0) = B7; *(a4 + 1) = B8; *(b4 + 0) = A7; *(b4 + 1) = 
A8; *(a6 + 0) = B11; *(a6 + 1) = B12; *(b6 + 0) = A11; *(b6 + 1) = A12; *(a8 + 0) = B15; *(a8 + 1) = B16; *(b8 + 0) = A15; *(b8 + 1) = A16; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; *(a3 + 0) = A7; *(a3 + 1) = A8; *(a4 + 0) = A5; *(a4 + 1) = A6; *(a5 + 0) = A11; *(a5 + 1) = A12; *(a6 + 0) = A9; *(a6 + 1) = A10; *(a7 + 0) = A15; *(a7 + 1) = A16; *(a8 + 0) = A13; *(a8 + 1) = A14; } else { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A1; *(b2 + 1) = A2; *(a3 + 0) = A7; *(a3 + 1) = A8; *(a4 + 0) = B7; *(a4 + 1) = B8; *(b4 + 0) = A5; *(b4 + 1) = A6; *(a5 + 0) = A11; *(a5 + 1) = A12; *(a6 + 0) = B11; *(a6 + 1) = B12; *(b6 + 0) = A9; *(b6 + 1) = A10; *(a7 + 0) = A15; *(a7 + 1) = A16; *(a8 + 0) = B15; *(a8 + 1) = B16; *(b8 + 0) = A13; *(b8 + 1) = A14; } } } else { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B1; *(a2 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; *(a3 + 0) = A7; *(a3 + 1) = A8; *(a4 + 0) = B5; *(a4 + 1) = B6; *(b3 + 0) = A5; *(b3 + 1) = A6; *(a5 + 0) = A11; *(a5 + 1) = A12; *(a6 + 0) = B9; *(a6 + 1) = B10; *(b5 + 0) = A9; *(b5 + 1) = A10; *(a7 + 0) = A15; *(a7 + 1) = A16; *(a8 + 0) = B13; *(a8 + 1) = B14; *(b7 + 0) = A13; *(b7 + 1) = A14; } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; *(a3 + 0) = B5; *(a3 + 1) = B6; *(b3 + 0) = A5; *(b3 + 1) = A6; *(a5 + 0) = B9; *(a5 + 1) = B10; *(b5 + 0) = A9; *(b5 + 1) = A10; *(a7 + 0) = B13; *(a7 + 1) = B14; *(b7 + 0) = A13; *(b7 + 1) = A14; } else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = A1; *(a2 + 1) = A2; *(b1 + 0) = A3; *(b1 + 1) = A4; *(a3 + 0) = B5; *(a3 + 1) = B6; *(a4 + 0) = A5; *(a4 + 1) = A6; *(b3 + 0) = A7; *(b3 + 1) = A8; *(a5 + 0) = B9; *(a5 + 1) = B10; *(a6 + 0) = A9; *(a6 + 1) = A10; *(b5 + 0) = A11; *(b5 + 1) = A12; *(a7 + 0) = B13; *(a7 + 1) = B14; *(a8 + 0) = A13; *(a8 + 1) = A14; *(b7 + 0) = A15; *(b7 + 1) = A16; } else { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b2 + 0) = A3; *(b2 + 1) = A4; *(a3 + 0) = B5; *(a3 + 1) = B6; *(a4 + 0) = B7; *(a4 + 1) = B8; *(b3 + 0) = A5; *(b3 + 1) = A6; *(b4 + 0) = A7; *(b4 + 1) = A8; *(a5 + 0) = B9; *(a5 + 1) = B10; *(a6 + 0) = B11; *(a6 + 1) = B12; *(b5 + 0) = A9; *(b5 + 1) = A10; *(b6 + 0) = A11; *(b6 + 1) = A12; *(a7 + 0) = B13; *(a7 + 1) = B14; *(a8 + 0) = B15; *(a8 + 1) = B16; *(b7 + 0) = A13; *(b7 + 1) = A14; *(b8 + 0) = A15; *(b8 + 1) = A16; } } #ifndef MINUS a1 += 4; a3 += 4; a5 += 4; a7 += 4; #else a1 -= 4; a3 -= 4; a5 -= 4; a7 -= 4; #endif //Remain i = (rows & 1); if (i > 0) { ip1 = *piv * 2; b1 = a + ip1; b3 = b1 + 1 * lda; b5 = b1 + 2 * lda; b7 = b1 + 3 * lda; A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a3 + 0); A4 = *(a3 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); B3 = *(b3 + 0); B4 = *(b3 + 1); A5 = *(a5 + 0); A6 = *(a5 + 1); A7 = *(a7 + 0); A8 = *(a7 + 1); B5 = *(b5 + 0); B6 = *(b5 + 1); B7 = *(b7 + 0); B8 = *(b7 + 1); *(a1 + 0) = B1; *(a1 + 1) = B2; *(a3 + 0) = B3; *(a3 + 1) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b3 + 0) = A3; *(b3 + 1) = A4; *(a5 + 0) = B5; *(a5 + 1) = B6; *(a7 + 0) = B7; *(a7 + 1) = B8; *(b5 + 0) = A5; *(b5 + 1) = A6; *(b7 + 0) = A7; *(b7 + 1) = A8; } a += 4 * lda; j --; } while (j > 0); } if (n & 2) { piv = ipiv; #ifndef MINUS a1 = a + (k1 + 1) * 2; #else a1 = a + k2 * 2; #endif a3 = a1 + lda; ip1 = *piv * 2; piv += incx; ip2 = *piv * 2; piv += incx; b1 = a + ip1; b2 = a + ip2; b3 = b1 + lda; b4 = b2 + lda; i = (rows >> 1); i--; 
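/* Descriptive note (added): two-column tail of the four-column complex kernel.
   Rows are exchanged in pairs (a1/a2 against the pivot rows b1/b2) across both
   columns, each element being a real/imaginary pair at offsets +0/+1.  The pivot
   indices for the next pair are fetched while the current pair is being swapped
   (the "loop pipeline" below), and the b1==a1 / b2==a2 / b2==b1 aliasing tests
   collapse the exchange when a pivot points back into the pair already being
   moved, so no element is stored twice. */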
//Loop pipeline //Main Loop while (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); A4 = *(a2 + 1); A5 = *(a3 + 0); A6 = *(a3 + 1); A7 = *(a4 + 0); A8 = *(a4 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); B5 = *(b3 + 0); B6 = *(b3 + 1); B7 = *(b4 + 0); B8 = *(b4 + 1); ip1 = *piv * 2; piv += incx; ip2 = *piv * 2; piv += incx; if (b1 == a1) { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; *(a3 + 0) = A7; *(a3 + 1) = A8; *(a4 + 0) = A5; *(a4 + 1) = A6; } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A3; *(b2 + 1) = A4; *(a4 + 0) = B7; *(a4 + 1) = B8; *(b4 + 0) = A7; *(b4 + 1) = A8; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; *(a3 + 0) = A7; *(a3 + 1) = A8; *(a4 + 0) = A5; *(a4 + 1) = A6; } else { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A1; *(b2 + 1) = A2; *(a3 + 0) = A7; *(a3 + 1) = A8; *(a4 + 0) = B7; *(a4 + 1) = B8; *(b4 + 0) = A5; *(b4 + 1) = A6; } } } else { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B1; *(a2 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; *(a3 + 0) = A7; *(a3 + 1) = A8; *(a4 + 0) = B5; *(a4 + 1) = B6; *(b3 + 0) = A5; *(b3 + 1) = A6; } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; *(a3 + 0) = B5; *(a3 + 1) = B6; *(b3 + 0) = A5; *(b3 + 1) = A6; } else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = A1; *(a2 + 1) = A2; *(b1 + 0) = A3; *(b1 + 1) = A4; *(a3 + 0) = B5; *(a3 + 1) = B6; *(a4 + 0) = A5; *(a4 + 1) = A6; *(b3 + 0) = A7; *(b3 + 1) = A8; } else { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b2 + 0) = A3; *(b2 + 1) = A4; *(a3 + 0) = B5; *(a3 + 1) = B6; *(a4 + 0) = B7; *(a4 + 1) = B8; *(b3 + 0) = A5; *(b3 + 1) = A6; *(b4 + 0) = A7; *(b4 + 1) = A8; } } b1 = a + ip1; b2 = a + ip2; b3 = b1 + lda; b4 = b2 + lda; #ifndef MINUS a1 += 4; a3 += 4; #else a1 -= 4; a3 -= 4; #endif i --; } //Loop Ending A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); A4 = *(a2 + 1); A5 = *(a3 + 0); A6 = *(a3 + 1); A7 = *(a4 + 0); A8 = *(a4 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); B5 = *(b3 + 0); B6 = *(b3 + 1); B7 = *(b4 + 0); B8 = *(b4 + 1); if (b1 == a1) { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; *(a3 + 0) = A7; *(a3 + 1) = A8; *(a4 + 0) = A5; *(a4 + 1) = A6; } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A3; *(b2 + 1) = A4; *(a4 + 0) = B7; *(a4 + 1) = B8; *(b4 + 0) = A7; *(b4 + 1) = A8; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; *(a3 + 0) = A7; *(a3 + 1) = A8; *(a4 + 0) = A5; *(a4 + 1) = A6; } else { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A1; *(b2 + 1) = A2; *(a3 + 0) = A7; *(a3 + 1) = A8; *(a4 + 0) = B7; *(a4 + 1) = B8; *(b4 + 0) = A5; *(b4 + 1) = A6; } } } else { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B1; *(a2 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; *(a3 + 0) = A7; *(a3 + 1) = A8; *(a4 + 0) = B5; *(a4 + 1) = B6; *(b3 + 0) = A5; *(b3 + 1) = A6; } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; *(a3 + 0) = B5; *(a3 + 1) = B6; *(b3 + 0) = A5; *(b3 + 1) = A6; } else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = A1; *(a2 + 1) = A2; *(b1 + 0) = A3; *(b1 + 1) = A4; *(a3 + 0) = B5; *(a3 + 1) = B6; *(a4 + 0) = A5; *(a4 + 
1) = A6; *(b3 + 0) = A7; *(b3 + 1) = A8; } else { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b2 + 0) = A3; *(b2 + 1) = A4; *(a3 + 0) = B5; *(a3 + 1) = B6; *(a4 + 0) = B7; *(a4 + 1) = B8; *(b3 + 0) = A5; *(b3 + 1) = A6; *(b4 + 0) = A7; *(b4 + 1) = A8; } } #ifndef MINUS a1 += 4; a3 += 4; #else a1 -= 4; a3 -= 4; #endif //Remain i = (rows & 1); if (i > 0) { ip1 = *piv * 2; b1 = a + ip1; b3 = b1 + lda; A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a3 + 0); A4 = *(a3 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); B3 = *(b3 + 0); B4 = *(b3 + 1); *(a1 + 0) = B1; *(a1 + 1) = B2; *(a3 + 0) = B3; *(a3 + 1) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b3 + 0) = A3; *(b3 + 1) = A4; } a += 2 * lda; } if (n & 1) { piv = ipiv; #ifndef MINUS a1 = a + (k1 + 1) * 2; #else a1 = a + k2 * 2; #endif ip1 = *piv * 2; piv += incx; ip2 = *piv * 2; piv += incx; b1 = a + ip1; b2 = a + ip2; i = (rows >> 1); i--; //Loop pipeline //Main Loop while (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); A4 = *(a2 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); ip1 = *piv * 2; piv += incx; ip2 = *piv * 2; piv += incx; if (b1 == a1) { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A3; *(b2 + 1) = A4; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; } else { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A1; *(b2 + 1) = A2; } } } else { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B1; *(a2 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; } else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = A1; *(a2 + 1) = A2; *(b1 + 0) = A3; *(b1 + 1) = A4; } else { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b2 + 0) = A3; *(b2 + 1) = A4; } } b1 = a + ip1; b2 = a + ip2; #ifndef MINUS a1 += 4; #else a1 -= 4; #endif i --; } //Loop Ending A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); A4 = *(a2 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); if (b1 == a1) { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A3; *(b2 + 1) = A4; } } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; } else { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A1; *(b2 + 1) = A2; } } } else { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = B1; *(a2 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; } else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = A1; *(a2 + 1) = A2; *(b1 + 0) = A3; *(b1 + 1) = A4; } else { *(a1 + 0) = B1; *(a1 + 1) = B2; *(a2 + 0) = B3; *(a2 + 1) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; *(b2 + 0) = A3; *(b2 + 1) = A4; } } #ifndef MINUS a1 += 4; #else a1 -= 4; #endif //Remain i = (rows & 1); if (i > 0) { ip1 = *piv * 2; b1 = a + ip1; A1 = *(a1 + 0); A2 = *(a1 + 1); B1 = *(b1 + 0); B2 = *(b1 + 1); *(a1 + 0) = B1; *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; } } return 0; } 
OpenBLAS-0.2.20/lapack/laswp/ia64/000077500000000000000000000000001313527062700162705ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/laswp/ia64/Makefile000066400000000000000000000001211313527062700177220ustar00rootroot00000000000000TOPDIR = ../../.. include ../../../Makefile.system include ../generic/Makefile OpenBLAS-0.2.20/lapack/laswp/mips/000077500000000000000000000000001313527062700164755ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/laswp/mips/Makefile000066400000000000000000000002661313527062700201410ustar00rootroot00000000000000TOPDIR = ../../.. include ../../../Makefile.system ifndef LASWP LASWP = ../generic/laswp_k.c endif ifndef ZLASWP ZLASWP = ../generic/zlaswp_k.c endif include ../generic/Makefile OpenBLAS-0.2.20/lapack/laswp/mips64/000077500000000000000000000000001313527062700166475ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/laswp/mips64/Makefile000066400000000000000000000002221313527062700203030ustar00rootroot00000000000000TOPDIR = ../../.. include ../../../Makefile.system LASWP = ../generic/laswp_k_1.c ZLASWP = ../generic/zlaswp_k_1.c include ../generic/Makefile OpenBLAS-0.2.20/lapack/laswp/power/000077500000000000000000000000001313527062700166615ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/laswp/power/Makefile000066400000000000000000000002221313527062700203150ustar00rootroot00000000000000TOPDIR = ../../.. include ../../../Makefile.system LASWP = ../generic/laswp_k_1.c ZLASWP = ../generic/zlaswp_k_1.c include ../generic/Makefile OpenBLAS-0.2.20/lapack/laswp/sparc/000077500000000000000000000000001313527062700166355ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/laswp/sparc/Makefile000066400000000000000000000002221313527062700202710ustar00rootroot00000000000000TOPDIR = ../../.. include ../../../Makefile.system LASWP = ../generic/laswp_k_1.c ZLASWP = ../generic/zlaswp_k_1.c include ../generic/Makefile OpenBLAS-0.2.20/lapack/laswp/x86/000077500000000000000000000000001313527062700161525ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/laswp/x86/Makefile000066400000000000000000000010631313527062700176120ustar00rootroot00000000000000TOPDIR = ../../.. include ../../../Makefile.system ifeq ($(CORE), CORE2) LASWP = ../generic/laswp_k_2.c ZLASWP = ../generic/zlaswp_k_2.c endif ifeq ($(CORE), OPTERON) LASWP = ../generic/laswp_k_1.c ZLASWP = ../generic/zlaswp_k_1.c endif ifeq ($(CORE), PRESCOTT) LASWP = ../generic/laswp_k_1.c ZLASWP = ../generic/zlaswp_k_1.c endif ifeq ($(DYNAMIC_ARCH), 1) LASWP = ../generic/laswp_k_4.c ZLASWP = ../generic/zlaswp_k_4.c endif ifndef LASWP LASWP = ../generic/laswp_k.c endif ifndef ZLASWP ZLASWP = ../generic/zlaswp_k.c endif include ../generic/Makefile OpenBLAS-0.2.20/lapack/laswp/x86_64/000077500000000000000000000000001313527062700164635ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/laswp/x86_64/Makefile000066400000000000000000000012211313527062700201170ustar00rootroot00000000000000TOPDIR = ../../.. 
include ../../../Makefile.system ifeq ($(CORE), PENRYN) LASWP = ../generic/laswp_k_4.c ZLASWP = ../generic/zlaswp_k_4.c endif ifeq ($(CORE), CORE2) LASWP = ../generic/laswp_k_4.c ZLASWP = ../generic/zlaswp_k_4.c endif ifeq ($(CORE), OPTERON) LASWP = ../generic/laswp_k_1.c ZLASWP = ../generic/zlaswp_k_1.c endif ifeq ($(CORE), PRESCOTT) LASWP = ../generic/laswp_k_1.c ZLASWP = ../generic/zlaswp_k_1.c endif ifeq ($(DYNAMIC_ARCH), 1) LASWP = ../generic/laswp_k_4.c ZLASWP = ../generic/zlaswp_k_4.c endif ifndef LASWP LASWP = ../generic/laswp_k.c endif ifndef ZLASWP ZLASWP = ../generic/zlaswp_k.c endif include ../generic/Makefile OpenBLAS-0.2.20/lapack/laswp/zarch/000077500000000000000000000000001313527062700166345ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/laswp/zarch/Makefile000066400000000000000000000002221313527062700202700ustar00rootroot00000000000000TOPDIR = ../../.. include ../../../Makefile.system LASWP = ../generic/laswp_k_1.c ZLASWP = ../generic/zlaswp_k_1.c include ../generic/Makefile OpenBLAS-0.2.20/lapack/lauu2/000077500000000000000000000000001313527062700154275ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/lauu2/Makefile000066400000000000000000000045371313527062700171000ustar00rootroot00000000000000TOPDIR = ../.. include ../../Makefile.system SBLASOBJS = slauu2_U.$(SUFFIX) slauu2_L.$(SUFFIX) DBLASOBJS = dlauu2_U.$(SUFFIX) dlauu2_L.$(SUFFIX) QBLASOBJS = qlauu2_U.$(SUFFIX) qlauu2_L.$(SUFFIX) CBLASOBJS = clauu2_U.$(SUFFIX) clauu2_L.$(SUFFIX) ZBLASOBJS = zlauu2_U.$(SUFFIX) zlauu2_L.$(SUFFIX) XBLASOBJS = xlauu2_U.$(SUFFIX) xlauu2_L.$(SUFFIX) slauu2_U.$(SUFFIX) : lauu2_U.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) slauu2_L.$(SUFFIX) : lauu2_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) dlauu2_U.$(SUFFIX) : lauu2_U.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) dlauu2_L.$(SUFFIX) : lauu2_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) qlauu2_U.$(SUFFIX) : lauu2_U.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) qlauu2_L.$(SUFFIX) : lauu2_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) clauu2_U.$(SUFFIX) : zlauu2_U.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) clauu2_L.$(SUFFIX) : zlauu2_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) zlauu2_U.$(SUFFIX) : zlauu2_U.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) zlauu2_L.$(SUFFIX) : zlauu2_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) xlauu2_U.$(SUFFIX) : zlauu2_U.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) xlauu2_L.$(SUFFIX) : zlauu2_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) slauu2_U.$(PSUFFIX) : lauu2_U.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) slauu2_L.$(PSUFFIX) : lauu2_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) dlauu2_U.$(PSUFFIX) : lauu2_U.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) dlauu2_L.$(PSUFFIX) : lauu2_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) qlauu2_U.$(PSUFFIX) : lauu2_U.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) qlauu2_L.$(PSUFFIX) : lauu2_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) clauu2_U.$(PSUFFIX) : zlauu2_U.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) clauu2_L.$(PSUFFIX) : zlauu2_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) zlauu2_U.$(PSUFFIX) : zlauu2_U.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) zlauu2_L.$(PSUFFIX) : zlauu2_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) xlauu2_U.$(PSUFFIX) : zlauu2_U.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) xlauu2_L.$(PSUFFIX) : zlauu2_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o 
$(@F) include ../../Makefile.tail OpenBLAS-0.2.20/lapack/lauu2/lauu2_L.c000066400000000000000000000066431313527062700171070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" static FLOAT dp1 = 1.; blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; FLOAT aii; BLASLONG i; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } for (i = 0; i < n; i++) { SCAL_K(i + 1, 0, 0, *(a + i + i * lda), a + i, lda, NULL, 0, NULL, 0); if (i < n - 1) { aii = DOTU_K(n - i - 1, a + i + 1 + i * lda, 1, a + i + 1 + i * lda, 1); *(a + i + i * lda) += aii; GEMV_T(n - i - 1, i, 0, dp1, a + (i + 1) , lda, a + (i + 1) + i * lda, 1, a + i , lda, sb); } } return 0; } OpenBLAS-0.2.20/lapack/lauu2/lauu2_U.c000066400000000000000000000066571313527062700171250ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" static FLOAT dp1 = 1.; blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; FLOAT aii; BLASLONG i; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } for (i = 0; i < n; i++) { SCAL_K(i + 1, 0, 0, *(a + i + i * lda), a + i * lda, 1, NULL, 0, NULL, 0); if (i < n - 1) { aii = DOTU_K(n - i - 1, a + i + (i + 1)* lda, lda, a + i + (i + 1) * lda, lda); *(a + i + i * lda) += aii; GEMV_N(i, n - i - 1, 0, dp1, a + (i + 1) * lda, lda, a + i + (i + 1) * lda, lda, a + i * lda, 1, sb); } } return 0; } OpenBLAS-0.2.20/lapack/lauu2/zlauu2_L.c000066400000000000000000000071461313527062700173000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" static FLOAT dp1 = 1.; blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; FLOAT temp; BLASLONG i; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } for (i = 0; i < n; i++) { SCAL_K(i + 1, 0, 0, *(a + (i + i * lda) * COMPSIZE + 0), ZERO, a + i * COMPSIZE, lda, NULL, 0, NULL, 0); if (i < n - 1) { temp = CREAL(DOTC_K(n - i - 1, a + (i + 1 + i * lda) * COMPSIZE, 1, a + (i + 1 + i * lda) * COMPSIZE, 1)); *(a + (i + i * lda) * COMPSIZE + 0) += temp; *(a + (i + i * lda) * COMPSIZE + 1) = ZERO; GEMV_U(n - i - 1, i, 0, dp1, ZERO, a + ((i + 1) ) * COMPSIZE, lda, a + ((i + 1) + i * lda) * COMPSIZE, 1, a + ( i ) * COMPSIZE , lda, sb); } } return 0; } OpenBLAS-0.2.20/lapack/lauu2/zlauu2_U.c000066400000000000000000000071621313527062700173070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" static FLOAT dp1 = 1.; blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; FLOAT temp; BLASLONG i; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } for (i = 0; i < n; i++) { SCAL_K(i + 1, 0, 0, *(a + (i + i * lda) * COMPSIZE + 0), ZERO, a + i * lda * COMPSIZE, 1, NULL, 0, NULL, 0); if (i < n - 1) { temp = CREAL(DOTC_K(n - i - 1, a + (i + (i + 1) * lda) * COMPSIZE, lda, a + (i + (i + 1) * lda) * COMPSIZE, lda)); *(a + (i + i * lda) * COMPSIZE + 0) += temp; *(a + (i + i * lda) * COMPSIZE + 1) = ZERO; GEMV_O(i, n - i - 1, 0, dp1, ZERO, a + ( (i + 1) * lda) * COMPSIZE, lda, a + (i + (i + 1) * lda) * COMPSIZE, lda, a + ( i * lda) * COMPSIZE, 1, sb); } } return 0; } OpenBLAS-0.2.20/lapack/lauum/000077500000000000000000000000001313527062700155225ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/lauum/Makefile000066400000000000000000000130621313527062700171640ustar00rootroot00000000000000TOPDIR = ../.. include ../../Makefile.system SBLASOBJS = slauum_U_single.$(SUFFIX) slauum_L_single.$(SUFFIX) DBLASOBJS = dlauum_U_single.$(SUFFIX) dlauum_L_single.$(SUFFIX) QBLASOBJS = qlauum_U_single.$(SUFFIX) qlauum_L_single.$(SUFFIX) CBLASOBJS = clauum_U_single.$(SUFFIX) clauum_L_single.$(SUFFIX) ZBLASOBJS = zlauum_U_single.$(SUFFIX) zlauum_L_single.$(SUFFIX) XBLASOBJS = xlauum_U_single.$(SUFFIX) xlauum_L_single.$(SUFFIX) ifdef SMP SBLASOBJS += slauum_U_parallel.$(SUFFIX) slauum_L_parallel.$(SUFFIX) DBLASOBJS += dlauum_U_parallel.$(SUFFIX) dlauum_L_parallel.$(SUFFIX) QBLASOBJS += qlauum_U_parallel.$(SUFFIX) qlauum_L_parallel.$(SUFFIX) CBLASOBJS += clauum_U_parallel.$(SUFFIX) clauum_L_parallel.$(SUFFIX) ZBLASOBJS += zlauum_U_parallel.$(SUFFIX) zlauum_L_parallel.$(SUFFIX) XBLASOBJS += xlauum_U_parallel.$(SUFFIX) xlauum_L_parallel.$(SUFFIX) endif slauum_U_single.$(SUFFIX) : lauum_U_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) slauum_L_single.$(SUFFIX) : lauum_L_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) slauum_U_parallel.$(SUFFIX) : lauum_U_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) slauum_L_parallel.$(SUFFIX) : lauum_L_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) dlauum_U_single.$(SUFFIX) : lauum_U_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) dlauum_L_single.$(SUFFIX) : lauum_L_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) dlauum_U_parallel.$(SUFFIX) : lauum_U_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) dlauum_L_parallel.$(SUFFIX) : lauum_L_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) qlauum_U_single.$(SUFFIX) : lauum_U_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) qlauum_L_single.$(SUFFIX) : lauum_L_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) qlauum_U_parallel.$(SUFFIX) : lauum_U_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) qlauum_L_parallel.$(SUFFIX) : lauum_L_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) clauum_U_single.$(SUFFIX) : lauum_U_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o 
$(@F) clauum_L_single.$(SUFFIX) : lauum_L_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) clauum_U_parallel.$(SUFFIX) : lauum_U_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) clauum_L_parallel.$(SUFFIX) : lauum_L_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) zlauum_U_single.$(SUFFIX) : lauum_U_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) zlauum_L_single.$(SUFFIX) : lauum_L_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) zlauum_U_parallel.$(SUFFIX) : lauum_U_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) zlauum_L_parallel.$(SUFFIX) : lauum_L_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) xlauum_U_single.$(SUFFIX) : lauum_U_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) xlauum_L_single.$(SUFFIX) : lauum_L_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) xlauum_U_parallel.$(SUFFIX) : lauum_U_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) xlauum_L_parallel.$(SUFFIX) : lauum_L_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) slauum_U_single.$(PSUFFIX) : lauum_U_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) slauum_L_single.$(PSUFFIX) : lauum_L_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) slauum_U_parallel.$(PSUFFIX) : lauum_U_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) slauum_L_parallel.$(PSUFFIX) : lauum_L_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) dlauum_U_single.$(PSUFFIX) : lauum_U_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) dlauum_L_single.$(PSUFFIX) : lauum_L_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) dlauum_U_parallel.$(PSUFFIX) : lauum_U_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) dlauum_L_parallel.$(PSUFFIX) : lauum_L_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) qlauum_U_single.$(PSUFFIX) : lauum_U_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) qlauum_L_single.$(PSUFFIX) : lauum_L_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) qlauum_U_parallel.$(PSUFFIX) : lauum_U_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) qlauum_L_parallel.$(PSUFFIX) : lauum_L_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) clauum_U_single.$(PSUFFIX) : lauum_U_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) clauum_L_single.$(PSUFFIX) : lauum_L_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) clauum_U_parallel.$(PSUFFIX) : lauum_U_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) clauum_L_parallel.$(PSUFFIX) : lauum_L_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) zlauum_U_single.$(PSUFFIX) : lauum_U_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) zlauum_L_single.$(PSUFFIX) : lauum_L_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) zlauum_U_parallel.$(PSUFFIX) : lauum_U_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) zlauum_L_parallel.$(PSUFFIX) : lauum_L_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) xlauum_U_single.$(PSUFFIX) : lauum_U_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) xlauum_L_single.$(PSUFFIX) : lauum_L_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) xlauum_U_parallel.$(PSUFFIX) : lauum_U_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) xlauum_L_parallel.$(PSUFFIX) : lauum_L_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) include ../../Makefile.tail 
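The lauum Makefile that ends here builds every precision from just four generic sources (lauum_{U,L}_{single,parallel}.c): each rule recompiles the same file with a different combination of -DCOMPLEX / -DDOUBLE / -DXDOUBLE, and the $(PSUFFIX)/$(PFLAGS) rules repeat the pattern for the profiled objects. The fragment below is a hypothetical, self-contained illustration of that single-source scheme; the real type selection lives in common.h, not in this sketch.

#include <stdio.h>

/* Illustrative only: one translation unit, several objects, selected
 * purely by the -U/-D flags the Makefile passes for each target. */
#if defined(XDOUBLE)
typedef long double FLOAT;      /* q/x objects: -DXDOUBLE           */
#elif defined(DOUBLE)
typedef double FLOAT;           /* d/z objects: -DDOUBLE            */
#else
typedef float FLOAT;            /* s/c objects: neither flag set    */
#endif

int main(void)
{
    /* The same source reports a different element size depending only
     * on the compile-time flags. */
    printf("sizeof(FLOAT) = %zu%s\n", sizeof(FLOAT),
#ifdef COMPLEX
           " (two of these per element, interleaved re/im)"
#else
           ""
#endif
    );
    return 0;
}

Compiling this sketch with no flags, with -DDOUBLE, or with -DXDOUBLE reproduces the s/d/q split used throughout the rules above; adding -DCOMPLEX gives the c/z/x counterparts.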
OpenBLAS-0.2.20/lapack/lauum/lauum_L_parallel.c000066400000000000000000000107361313527062700211470ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, bk, i, blocking, lda; int mode; blas_arg_t newarg; FLOAT *a; FLOAT alpha[2] = { ONE, ZERO}; #ifndef COMPLEX #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif if (args -> nthreads == 1) { LAUUM_L_SINGLE(args, NULL, NULL, sa, sb, 0); return 0; } n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) n = range_n[1] - range_n[0]; if (n <= GEMM_UNROLL_N * 2) { LAUUM_L_SINGLE(args, NULL, range_n, sa, sb, 0); return 0; } newarg.lda = lda; newarg.ldb = lda; newarg.ldc = lda; newarg.alpha = alpha; newarg.beta = NULL; newarg.nthreads = args -> nthreads; blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; if (blocking > GEMM_Q) blocking = GEMM_Q; for (i = 0; i < n; i += blocking) { bk = n - i; if (bk > blocking) bk = blocking; newarg.n = i; newarg.k = bk; newarg.a = a + i * COMPSIZE; newarg.c = a; syrk_thread(mode | BLAS_TRANSA_T | BLAS_TRANSB_N | BLAS_UPLO, &newarg, NULL, NULL, (void *)HERK_LC, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = i; newarg.a = a + (i + i * lda) * COMPSIZE; newarg.b = a + (i ) * COMPSIZE; gemm_thread_n(mode | BLAS_TRANSA_T, &newarg, NULL, NULL, (void *)TRMM_LCLN, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = bk; newarg.a = a + (i + i * lda) * COMPSIZE; CNAME(&newarg, NULL, NULL, sa, sb, 0); } return 0; } OpenBLAS-0.2.20/lapack/lauum/lauum_L_single.c000066400000000000000000000151621313527062700206320ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" static FLOAT dp1 = 1.; #ifndef COMPLEX #define TRMM_KERNEL TRMM_KERNEL_LN #define SYRK_KERNEL SYRK_KERNEL_L #else #define TRMM_KERNEL TRMM_KERNEL_LR #ifdef XDOUBLE #define SYRK_KERNEL xherk_kernel_LC #elif defined(DOUBLE) #define SYRK_KERNEL zherk_kernel_LC #else #define SYRK_KERNEL cherk_kernel_LC #endif #endif #if 0 #undef GEMM_P #undef GEMM_Q #undef GEMM_R #define GEMM_P 8 #define GEMM_Q 20 #define GEMM_R 64 #endif #define GEMM_PQ MAX(GEMM_P, GEMM_Q) #define REAL_GEMM_R (GEMM_R - GEMM_PQ) blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; BLASLONG j, bk, blocking; BLASLONG jjs, min_jj; BLASLONG is, ls, ks; BLASLONG min_i, min_l, min_k; BLASLONG range_N[2]; FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); #if 0 FLOAT *aa; #endif n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } if (n <= DTB_ENTRIES) { LAUU2_L(args, NULL, range_n, sa, sb, 0); return 0; } blocking = GEMM_Q; if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; for (j = 0; j < n; j += blocking) { bk = MIN(blocking, n - j); if (j > 0 ){ TRMM_ILNCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, 0, sb); for (ls = 0; ls < j; ls += REAL_GEMM_R) { min_l = j - ls; if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; #if 0 min_i = j - ls; if (min_i > GEMM_P) min_i = GEMM_P; if (ls + min_i >= ls + min_l) { GEMM_INCOPY(bk, min_i, a + (j + ls * lda)* COMPSIZE, lda, sa); aa = sa; } else { aa = sb2; } for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){ min_jj = ls + min_l - jjs; if (min_jj > GEMM_P) min_jj = GEMM_P; GEMM_ONCOPY(bk, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE); SYRK_KERNEL(min_i, min_jj, bk, dp1, aa, sb2 + (jjs - ls) * bk * COMPSIZE, a + (ls + jjs * lda) * COMPSIZE, lda, ls - jjs); } for(is = ls + min_i; is < j ; is += GEMM_P){ min_i = j - is; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_INCOPY(bk, min_i, a + (j + is * lda)* COMPSIZE, lda, sa); SYRK_KERNEL(min_i, min_l, bk, dp1, sa, sb2, a + (is + ls * lda) * COMPSIZE, lda, is - ls); } for (ks = 0; ks < bk; ks += GEMM_P) { min_k = bk - ks; if (min_k > GEMM_P) min_k = GEMM_P; TRMM_KERNEL(min_k, min_l, bk, dp1, #ifdef COMPLEX ZERO, #endif sb + ks * bk * COMPSIZE, sb2, a + (ks + j + ls * lda) * COMPSIZE, lda, ks); } #else min_i = j - ls; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_INCOPY(bk, min_i, a + (j + ls * lda)* COMPSIZE, lda, sa); for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){ min_jj = ls + min_l - jjs; if (min_jj > GEMM_P) min_jj = GEMM_P; GEMM_ONCOPY(bk, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE); SYRK_KERNEL(min_i, min_jj, bk, dp1, sa, sb2 + (jjs - ls) * bk * COMPSIZE, a + (ls + jjs * lda) * COMPSIZE, lda, ls - jjs); } for(is = ls + min_i; is < j ; is += GEMM_P){ min_i = j - is; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_INCOPY(bk, min_i, a + (j + is * lda)* COMPSIZE, lda, sa); SYRK_KERNEL(min_i, min_l, bk, dp1, sa, sb2, a + (is + ls * lda) * COMPSIZE, lda, is - ls); } for (ks = 0; ks < bk; ks += GEMM_P) { min_k = bk - 
ks; if (min_k > GEMM_P) min_k = GEMM_P; TRMM_KERNEL(min_k, min_l, bk, dp1, #ifdef COMPLEX ZERO, #endif sb + ks * bk * COMPSIZE, sb2, a + (ks + j + ls * lda) * COMPSIZE, lda, ks); } #endif } } if (!range_n) { range_N[0] = j; range_N[1] = j + bk; } else { range_N[0] = range_n[0] + j; range_N[1] = range_n[0] + j + bk; } CNAME(args, NULL, range_N, sa, sb, 0); } return 0; } OpenBLAS-0.2.20/lapack/lauum/lauum_U_parallel.c000066400000000000000000000107531313527062700211570ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, bk, i, blocking, lda; int mode; blas_arg_t newarg; FLOAT *a; FLOAT alpha[2] = { ONE, ZERO}; #ifndef COMPLEX #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif if (args -> nthreads == 1) { LAUUM_U_SINGLE(args, NULL, NULL, sa, sb, 0); return 0; } n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) n = range_n[1] - range_n[0]; if (n <= GEMM_UNROLL_N * 2) { LAUUM_U_SINGLE(args, NULL, range_n, sa, sb, 0); return 0; } newarg.lda = lda; newarg.ldb = lda; newarg.ldc = lda; newarg.alpha = alpha; newarg.beta = NULL; newarg.nthreads = args -> nthreads; blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; if (blocking > GEMM_Q) blocking = GEMM_Q; for (i = 0; i < n; i += blocking) { bk = n - i; if (bk > blocking) bk = blocking; newarg.n = i; newarg.k = bk; newarg.a = a + ( i * lda) * COMPSIZE; newarg.c = a; syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, &newarg, NULL, NULL, (void *)HERK_UN, sa, sb, args -> nthreads); newarg.m = i; newarg.n = bk; newarg.a = a + (i + i * lda) * COMPSIZE; newarg.b = a + ( i * lda) * COMPSIZE; gemm_thread_m(mode | BLAS_TRANSA_T | BLAS_RSIDE, &newarg, NULL, NULL, (void *)TRMM_RCUN, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = bk; newarg.a = a + (i + i * lda) * COMPSIZE; CNAME(&newarg, NULL, NULL, sa, sb, 0); } return 0; } OpenBLAS-0.2.20/lapack/lauum/lauum_U_single.c000066400000000000000000000166671313527062700206560ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" static FLOAT dp1 = 1.; #ifndef COMPLEX #define TRMM_KERNEL TRMM_KERNEL_RT #define SYRK_KERNEL SYRK_KERNEL_U #else #define TRMM_KERNEL TRMM_KERNEL_RC #ifdef XDOUBLE #define SYRK_KERNEL xherk_kernel_UN #elif defined(DOUBLE) #define SYRK_KERNEL zherk_kernel_UN #else #define SYRK_KERNEL cherk_kernel_UN #endif #endif #if 0 #undef GEMM_P #undef GEMM_Q #undef GEMM_R #define GEMM_P 8 #define GEMM_Q 20 #define GEMM_R 24 #endif #define GEMM_PQ MAX(GEMM_P, GEMM_Q) #define REAL_GEMM_R (GEMM_R - GEMM_PQ) blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; BLASLONG j, bk, blocking; BLASLONG is, ls, ks; BLASLONG jjs, min_jj; BLASLONG min_i, min_l, min_k; BLASLONG range_N[2]; FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); #if 0 FLOAT *aa; #endif n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } if (n <= DTB_ENTRIES) { LAUU2_U(args, NULL, range_n, sa, sb, 0); return 0; } blocking = GEMM_Q; if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; for (j = 0; j < n; j += blocking) { bk = n - j; if (bk > blocking) bk = blocking; if (j > 0) { TRMM_OUTCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, 0, sb); for (ls = 0; ls < j; ls += REAL_GEMM_R) { min_l = j - ls; #if 0 if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; min_i = ls + min_l; if (min_i > GEMM_P) min_i = GEMM_P; if (ls > 0) { GEMM_ITCOPY(bk, min_i, a + (j * lda) * COMPSIZE, lda, sa); aa = sa; } else { aa = sb2; } for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){ min_jj = ls + min_l - jjs; if (min_jj > GEMM_P) min_jj = GEMM_P; GEMM_OTCOPY(bk, min_jj, a + (jjs + j * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE); SYRK_KERNEL(min_i, min_jj, bk, dp1, aa, sb2 + (jjs - ls) * bk * COMPSIZE, a + (jjs * lda) * COMPSIZE, lda, - jjs); } if (ls + REAL_GEMM_R >= j ) { for (ks = 0; ks < bk; ks += GEMM_P) { min_k = bk - ks; if (min_k > GEMM_P) min_k = GEMM_P; TRMM_KERNEL(min_i, min_k, bk, dp1, #ifdef COMPLEX ZERO, #endif aa, sb + ks * bk * COMPSIZE, a + ((ks + j) * lda) * COMPSIZE, lda, -ks); } } for(is = min_i; is < ls + min_l ; is += GEMM_P){ min_i = ls + min_l - is; if (min_i > GEMM_P) min_i = GEMM_P; if (is < ls) { GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); aa = sa; } else { aa = sb2 + (is - ls) * bk * COMPSIZE; } SYRK_KERNEL(min_i, min_l, bk, dp1, aa, sb2, a + (is + ls * lda) * COMPSIZE, lda, is - ls); if (ls + REAL_GEMM_R >= j ) { for (ks = 0; ks < bk; ks += GEMM_P) { min_k = bk - ks; if (min_k > GEMM_P) min_k = GEMM_P; TRMM_KERNEL(min_i, min_k, bk, dp1, #ifdef COMPLEX ZERO, #endif aa, sb + ks * bk * COMPSIZE, a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks); } } } #else if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; min_i = ls + min_l; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY(bk, min_i, a + (j * lda) * COMPSIZE, lda, sa); for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){ min_jj = ls + min_l - jjs; if (min_jj > GEMM_P) min_jj = GEMM_P; GEMM_OTCOPY(bk, min_jj, a + (jjs + j * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE); 
SYRK_KERNEL(min_i, min_jj, bk, dp1, sa, sb2 + (jjs - ls) * bk * COMPSIZE, a + (jjs * lda) * COMPSIZE, lda, - jjs); } if (ls + REAL_GEMM_R >= j ) { for (ks = 0; ks < bk; ks += GEMM_P) { min_k = bk - ks; if (min_k > GEMM_P) min_k = GEMM_P; TRMM_KERNEL(min_i, min_k, bk, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + ks * bk * COMPSIZE, a + ((ks + j) * lda) * COMPSIZE, lda, -ks); } } for(is = min_i; is < ls + min_l ; is += GEMM_P){ min_i = ls + min_l - is; if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); SYRK_KERNEL(min_i, min_l, bk, dp1, sa, sb2, a + (is + ls * lda) * COMPSIZE, lda, is - ls); if (ls + REAL_GEMM_R >= j ) { for (ks = 0; ks < bk; ks += GEMM_P) { min_k = bk - ks; if (min_k > GEMM_P) min_k = GEMM_P; TRMM_KERNEL(min_i, min_k, bk, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + ks * bk * COMPSIZE, a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks); } } } #endif } /* end of ls */ } if (!range_n) { range_N[0] = j; range_N[1] = j + bk; } else { range_N[0] = range_n[0] + j; range_N[1] = range_n[0] + j + bk; } CNAME(args, NULL, range_N, sa, sb, 0); } return 0; } OpenBLAS-0.2.20/lapack/potf2/000077500000000000000000000000001313527062700154315ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/potf2/Makefile000066400000000000000000000045371313527062700171020ustar00rootroot00000000000000TOPDIR = ../.. include ../../Makefile.system SBLASOBJS = spotf2_U.$(SUFFIX) spotf2_L.$(SUFFIX) DBLASOBJS = dpotf2_U.$(SUFFIX) dpotf2_L.$(SUFFIX) QBLASOBJS = qpotf2_U.$(SUFFIX) qpotf2_L.$(SUFFIX) CBLASOBJS = cpotf2_U.$(SUFFIX) cpotf2_L.$(SUFFIX) ZBLASOBJS = zpotf2_U.$(SUFFIX) zpotf2_L.$(SUFFIX) XBLASOBJS = xpotf2_U.$(SUFFIX) xpotf2_L.$(SUFFIX) spotf2_U.$(SUFFIX) : potf2_U.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) spotf2_L.$(SUFFIX) : potf2_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) dpotf2_U.$(SUFFIX) : potf2_U.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) dpotf2_L.$(SUFFIX) : potf2_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) qpotf2_U.$(SUFFIX) : potf2_U.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) qpotf2_L.$(SUFFIX) : potf2_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) cpotf2_U.$(SUFFIX) : zpotf2_U.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) cpotf2_L.$(SUFFIX) : zpotf2_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) zpotf2_U.$(SUFFIX) : zpotf2_U.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) zpotf2_L.$(SUFFIX) : zpotf2_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) xpotf2_U.$(SUFFIX) : zpotf2_U.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) xpotf2_L.$(SUFFIX) : zpotf2_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) spotf2_U.$(PSUFFIX) : potf2_U.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) spotf2_L.$(PSUFFIX) : potf2_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) dpotf2_U.$(PSUFFIX) : potf2_U.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) dpotf2_L.$(PSUFFIX) : potf2_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) qpotf2_U.$(PSUFFIX) : potf2_U.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) qpotf2_L.$(PSUFFIX) : potf2_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) cpotf2_U.$(PSUFFIX) : zpotf2_U.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) cpotf2_L.$(PSUFFIX) : zpotf2_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) zpotf2_U.$(PSUFFIX) : zpotf2_U.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) zpotf2_L.$(PSUFFIX) : zpotf2_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) xpotf2_U.$(PSUFFIX) : zpotf2_U.c $(CC) -c $(PFLAGS) 
-DCOMPLEX -DXDOUBLE $< -o $(@F) xpotf2_L.$(PSUFFIX) : zpotf2_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) include ../../Makefile.tail OpenBLAS-0.2.20/lapack/potf2/potf2_L.c000066400000000000000000000071431313527062700171070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" static FLOAT dm1 = -1.; static FLOAT dp1 = 1.; #ifndef SQRT #define SQRT(x) sqrt(x) #endif blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; FLOAT ajj; FLOAT *aoffset; BLASLONG i, j; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } aoffset = a; for (j = 0; j < n; j++) { ajj = *(aoffset + j) - DOTU_K(j, a + j, lda, a + j, lda); if (ajj <= 0){ *(aoffset + j) = ajj; return j + 1; } ajj = SQRT(ajj); *(aoffset + j) = ajj; i = n - j - 1; if (i > 0) { GEMV_N(i, j, 0, dm1, a + j + 1, lda, a + j, lda, aoffset + j + 1, 1, sb); SCAL_K(i, 0, 0, dp1 / ajj, aoffset + j + 1, 1, NULL, 0, NULL, 0); } aoffset += lda; } return 0; } OpenBLAS-0.2.20/lapack/potf2/potf2_U.c000066400000000000000000000070231313527062700171150ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. 
Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" static FLOAT dm1 = -1.; static FLOAT dp1 = 1.; #ifndef SQRT #define SQRT(x) sqrt(x) #endif blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; FLOAT ajj; BLASLONG i, j; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } for (j = 0; j < n; j++) { ajj = *(a + j) - DOTU_K(j, a, 1, a, 1); if (ajj <= 0){ *(a + j) = ajj; return j + 1; } ajj = SQRT(ajj); *(a + j) = ajj; i = n - j - 1; if (i > 0) { GEMV_T(j, i, 0, dm1, a + lda, lda, a, 1, a + j + lda, lda, sb); SCAL_K(i, 0, 0, dp1 / ajj, a + j + lda, lda, NULL, 0, NULL, 0); } a += lda; } return 0; } OpenBLAS-0.2.20/lapack/potf2/zpotf2_L.c000066400000000000000000000073601313527062700173020ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" static FLOAT dm1 = -1.; #ifndef SQRT #define SQRT(x) sqrt(x) #endif blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; FLOAT ajj; FLOAT *aoffset; BLASLONG i, j; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } aoffset = a; for (j = 0; j < n; j++) { ajj = CREAL(DOTC_K(j, a + j * 2, lda, a + j * 2, lda)); ajj = *(aoffset + j * 2) - ajj; if (ajj <= 0){ *(aoffset + j * 2 + 0) = ajj; *(aoffset + j * 2 + 1) = ZERO; return j + 1; } ajj = SQRT(ajj); *(aoffset + j * 2 + 0) = ajj; *(aoffset + j * 2 + 1) = ZERO; i = n - j - 1; if (i > 0) { GEMV_O(i, j, 0, dm1, ZERO, a + (j + 1) * 2, lda, a + j * 2, lda, aoffset + (j + 1) * 2, 1, sb); SCAL_K(i, 0, 0, ONE / ajj, ZERO, aoffset + (j + 1) * 2, 1, NULL, 0, NULL, 0); } aoffset += lda * 2; } return 0; } OpenBLAS-0.2.20/lapack/potf2/zpotf2_U.c000066400000000000000000000072061313527062700173120ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include #include "common.h" static FLOAT dm1 = -1.; #ifndef SQRT #define SQRT(x) sqrt(x) #endif blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; FLOAT ajj; BLASLONG i, j; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } for (j = 0; j < n; j++) { ajj = CREAL(DOTC_K(j, a, 1, a, 1)); ajj = *(a + j * 2) - ajj; if (ajj <= 0){ *(a + j * 2 + 0) = ajj; *(a + j * 2 + 1) = ZERO; return j + 1; } ajj = SQRT(ajj); *(a + j * 2 + 0) = ajj; *(a + j * 2 + 1) = ZERO; i = n - j - 1; if (i > 0){ GEMV_U(j, i, 0, dm1, ZERO, a + lda * 2, lda, a, 1, a + (j + lda) * 2, lda, sb); SCAL_K(i, 0, 0, ONE / ajj, ZERO, a + (j + lda) * 2, lda, NULL, 0, NULL, 0); } a += 2 * lda; } return 0; } OpenBLAS-0.2.20/lapack/potrf/000077500000000000000000000000001313527062700155315ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/potrf/Makefile000066400000000000000000000130621313527062700171730ustar00rootroot00000000000000TOPDIR = ../.. 
include ../../Makefile.system SBLASOBJS = spotrf_U_single.$(SUFFIX) spotrf_L_single.$(SUFFIX) DBLASOBJS = dpotrf_U_single.$(SUFFIX) dpotrf_L_single.$(SUFFIX) QBLASOBJS = qpotrf_U_single.$(SUFFIX) qpotrf_L_single.$(SUFFIX) CBLASOBJS = cpotrf_U_single.$(SUFFIX) cpotrf_L_single.$(SUFFIX) ZBLASOBJS = zpotrf_U_single.$(SUFFIX) zpotrf_L_single.$(SUFFIX) XBLASOBJS = xpotrf_U_single.$(SUFFIX) xpotrf_L_single.$(SUFFIX) ifdef SMP SBLASOBJS += spotrf_U_parallel.$(SUFFIX) spotrf_L_parallel.$(SUFFIX) DBLASOBJS += dpotrf_U_parallel.$(SUFFIX) dpotrf_L_parallel.$(SUFFIX) QBLASOBJS += qpotrf_U_parallel.$(SUFFIX) qpotrf_L_parallel.$(SUFFIX) CBLASOBJS += cpotrf_U_parallel.$(SUFFIX) cpotrf_L_parallel.$(SUFFIX) ZBLASOBJS += zpotrf_U_parallel.$(SUFFIX) zpotrf_L_parallel.$(SUFFIX) XBLASOBJS += xpotrf_U_parallel.$(SUFFIX) xpotrf_L_parallel.$(SUFFIX) endif spotrf_U_single.$(SUFFIX) : potrf_U_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) spotrf_L_single.$(SUFFIX) : potrf_L_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) spotrf_U_parallel.$(SUFFIX) : potrf_U_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) spotrf_L_parallel.$(SUFFIX) : potrf_L_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) dpotrf_U_single.$(SUFFIX) : potrf_U_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) dpotrf_L_single.$(SUFFIX) : potrf_L_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) dpotrf_U_parallel.$(SUFFIX) : potrf_U_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) dpotrf_L_parallel.$(SUFFIX) : potrf_L_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) qpotrf_U_single.$(SUFFIX) : potrf_U_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) qpotrf_L_single.$(SUFFIX) : potrf_L_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) qpotrf_U_parallel.$(SUFFIX) : potrf_U_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) qpotrf_L_parallel.$(SUFFIX) : potrf_L_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) cpotrf_U_single.$(SUFFIX) : potrf_U_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) cpotrf_L_single.$(SUFFIX) : potrf_L_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) cpotrf_U_parallel.$(SUFFIX) : potrf_U_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) cpotrf_L_parallel.$(SUFFIX) : potrf_L_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) zpotrf_U_single.$(SUFFIX) : potrf_U_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) zpotrf_L_single.$(SUFFIX) : potrf_L_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) zpotrf_U_parallel.$(SUFFIX) : potrf_U_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) zpotrf_L_parallel.$(SUFFIX) : potrf_L_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) xpotrf_U_single.$(SUFFIX) : potrf_U_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) xpotrf_L_single.$(SUFFIX) : potrf_L_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) xpotrf_U_parallel.$(SUFFIX) : potrf_U_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) xpotrf_L_parallel.$(SUFFIX) : potrf_L_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) spotrf_U_single.$(PSUFFIX) : potrf_U_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) spotrf_L_single.$(PSUFFIX) : potrf_L_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) spotrf_U_parallel.$(PSUFFIX) : potrf_U_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) spotrf_L_parallel.$(PSUFFIX) : potrf_L_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX 
-UDOUBLE $< -o $(@F) dpotrf_U_single.$(PSUFFIX) : potrf_U_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) dpotrf_L_single.$(PSUFFIX) : potrf_L_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) dpotrf_U_parallel.$(PSUFFIX) : potrf_U_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) dpotrf_L_parallel.$(PSUFFIX) : potrf_L_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) qpotrf_U_single.$(PSUFFIX) : potrf_U_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) qpotrf_L_single.$(PSUFFIX) : potrf_L_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) qpotrf_U_parallel.$(PSUFFIX) : potrf_U_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) qpotrf_L_parallel.$(PSUFFIX) : potrf_L_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) cpotrf_U_single.$(PSUFFIX) : potrf_U_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) cpotrf_L_single.$(PSUFFIX) : potrf_L_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) cpotrf_U_parallel.$(PSUFFIX) : potrf_U_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) cpotrf_L_parallel.$(PSUFFIX) : potrf_L_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) zpotrf_U_single.$(PSUFFIX) : potrf_U_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) zpotrf_L_single.$(PSUFFIX) : potrf_L_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) zpotrf_U_parallel.$(PSUFFIX) : potrf_U_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) zpotrf_L_parallel.$(PSUFFIX) : potrf_L_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) xpotrf_U_single.$(PSUFFIX) : potrf_U_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) xpotrf_L_single.$(PSUFFIX) : potrf_L_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) xpotrf_U_parallel.$(PSUFFIX) : potrf_U_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) xpotrf_L_parallel.$(PSUFFIX) : potrf_L_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) include ../../Makefile.tail OpenBLAS-0.2.20/lapack/potrf/potrf_L_parallel.c000066400000000000000000000114731313527062700211640ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, bk, i, blocking, lda; BLASLONG info; int mode; blas_arg_t newarg; FLOAT *a; FLOAT alpha[2] = { -ONE, ZERO}; #ifndef COMPLEX #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif if (args -> nthreads == 1) { info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0); return info; } n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) n = range_n[1] - range_n[0]; if (n <= GEMM_UNROLL_N * 4) { info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0); return info; } newarg.lda = lda; newarg.ldb = lda; newarg.ldc = lda; newarg.alpha = alpha; newarg.beta = NULL; newarg.nthreads = args -> nthreads; blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; if (blocking > GEMM_Q) blocking = GEMM_Q; for (i = 0; i < n; i += blocking) { bk = n - i; if (bk > blocking) bk = blocking; newarg.m = bk; newarg.n = bk; newarg.a = a + (i + i * lda) * COMPSIZE; info = CNAME(&newarg, NULL, NULL, sa, sb, 0); if (info) return info + i; if (n - i - bk > 0) { newarg.m = n - i - bk; newarg.n = bk; newarg.a = a + (i + i * lda) * COMPSIZE; newarg.b = a + (i + bk + i * lda) * COMPSIZE; gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); newarg.n = n - i - bk; newarg.k = bk; newarg.a = a + (i + bk + i * lda) * COMPSIZE; newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE; #ifndef USE_SIMPLE_THREADED_LEVEL3 HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); #else syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); #endif } } return 0; } OpenBLAS-0.2.20/lapack/potrf/potrf_L_single.c000066400000000000000000000146231313527062700206510ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. 
Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" static FLOAT dm1 = -1.; #ifndef COMPLEX #define TRSM_KERNEL TRSM_KERNEL_RN #else #define TRSM_KERNEL TRSM_KERNEL_RR #undef SYRK_KERNEL_L #ifdef XDOUBLE #define SYRK_KERNEL_L xherk_kernel_LN #elif defined(DOUBLE) #define SYRK_KERNEL_L zherk_kernel_LN #else #define SYRK_KERNEL_L cherk_kernel_LN #endif #endif #if 0 #undef GEMM_P #undef GEMM_Q #undef GEMM_R #define GEMM_P 128 #define GEMM_Q 128 #define GEMM_R 4000 #endif #define GEMM_PQ MAX(GEMM_P, GEMM_Q) //leave some space for GEMM_ALIGN in sb2 #define REAL_GEMM_R (GEMM_R - 2*GEMM_PQ) #if 0 #define SHARED_ARRAY #define SA aa #else #undef SHARED_ARRAY #define SA sa #endif blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; BLASLONG info; BLASLONG bk, j, blocking; BLASLONG is, min_i; BLASLONG js, min_j; BLASLONG range_N[2]; FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); #ifdef SHARED_ARRAY FLOAT *aa; #endif n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } if (n <= DTB_ENTRIES / 2) { info = POTF2_L(args, NULL, range_n, sa, sb, 0); return info; } blocking = GEMM_Q; if (n <= 4 * GEMM_Q) blocking = n / 4; for (j = 0; j < n; j += blocking) { bk = n - j; if (bk > blocking) bk = blocking; if (!range_n) { range_N[0] = j; range_N[1] = j + bk; } else { range_N[0] = range_n[0] + j; range_N[1] = range_n[0] + j + bk; } info = CNAME(args, NULL, range_N, sa, sb, 0); if (info) return info + j; if (n - j - bk > 0) { TRSM_OLTCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, sb); /* First tile */ min_j = n - j - bk; if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; for (is = j + bk; is < n; is += GEMM_P) { min_i = n - is; if (min_i > GEMM_P) min_i = GEMM_P; #ifdef SHARED_ARRAY if (is < j + bk + min_j) { aa = sb2 + bk * (is - j - bk) * COMPSIZE; } else { aa = sa; } GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, aa); TRSM_KERNEL(min_i, bk, bk, dm1, #ifdef COMPLEX ZERO, #endif aa, sb, a + (is + j 
* lda) * COMPSIZE, lda, 0); SYRK_KERNEL_L(min_i, min_j, bk, dm1, aa, sb2, a + (is + (j + bk) * lda) * COMPSIZE, lda, is - j - bk); #else GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); TRSM_KERNEL(min_i, bk, bk, dm1, #ifdef COMPLEX ZERO, #endif sa, sb, a + (is + j * lda) * COMPSIZE, lda, 0); if (is < j + bk + min_j) { GEMM_OTCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sb2 + bk * (is - j - bk) * COMPSIZE); } SYRK_KERNEL_L(min_i, min_j, bk, dm1, sa, sb2, a + (is + (j + bk) * lda) * COMPSIZE, lda, is - j - bk); #endif } for(js = j + bk + min_j; js < n; js += REAL_GEMM_R){ min_j = n - js; if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; GEMM_OTCOPY(bk, min_j, a + (js + j * lda) * COMPSIZE, lda, sb2); for (is = js; is < n; is += GEMM_P) { min_i = n - is; if (min_i > GEMM_P) min_i = GEMM_P; #ifdef SHARED_ARRAY if (is + min_i < js + min_j) { aa = sb2 + bk * (is - js) * COMPSIZE; } else { GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); aa = sa; } SYRK_KERNEL_L(min_i, min_j, bk, dm1, aa, sb2, a + (is + js * lda) * COMPSIZE, lda, is - js); #else GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); SYRK_KERNEL_L(min_i, min_j, bk, dm1, sa, sb2, a + (is + js * lda) * COMPSIZE, lda, is - js); #endif } } } } return 0; } OpenBLAS-0.2.20/lapack/potrf/potrf_U_parallel.c000066400000000000000000000114361313527062700211740ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, bk, i, blocking, lda; BLASLONG info; int mode; blas_arg_t newarg; FLOAT *a; FLOAT alpha[2] = { -ONE, ZERO}; #ifndef COMPLEX #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif if (args -> nthreads == 1) { info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0); return info; } n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) n = range_n[1] - range_n[0]; if (n <= GEMM_UNROLL_N * 4) { info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0); return info; } newarg.lda = lda; newarg.ldb = lda; newarg.ldc = lda; newarg.alpha = alpha; newarg.beta = NULL; newarg.nthreads = args -> nthreads; blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; if (blocking > GEMM_Q) blocking = GEMM_Q; for (i = 0; i < n; i += blocking) { bk = n - i; if (bk > blocking) bk = blocking; newarg.m = bk; newarg.n = bk; newarg.a = a + (i + i * lda) * COMPSIZE; info = CNAME(&newarg, NULL, NULL, sa, sb, 0); if (info) return info + i; if (n - i - bk > 0) { newarg.m = bk; newarg.n = n - i - bk; newarg.a = a + (i + i * lda) * COMPSIZE; newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; gemm_thread_n(mode | BLAS_TRANSA_T, &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); newarg.n = n - i - bk; newarg.k = bk; newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE; newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; #ifndef USE_SIMPLE_THREADED_LEVEL3 HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); #else syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads); #endif } } return 0; } OpenBLAS-0.2.20/lapack/potrf/potrf_U_single.c000066400000000000000000000134331313527062700206600ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" static FLOAT dm1 = -1.; #ifndef COMPLEX #define TRSM_KERNEL TRSM_KERNEL_LT #else #define TRSM_KERNEL TRSM_KERNEL_LC #undef SYRK_KERNEL_U #ifdef XDOUBLE #define SYRK_KERNEL_U xherk_kernel_UC #elif defined(DOUBLE) #define SYRK_KERNEL_U zherk_kernel_UC #else #define SYRK_KERNEL_U cherk_kernel_UC #endif #endif #if 0 #undef GEMM_P #undef GEMM_Q #undef GEMM_R #define GEMM_P 8 #define GEMM_Q 20 #define GEMM_R 64 #endif #define GEMM_PQ MAX(GEMM_P, GEMM_Q) #define REAL_GEMM_R (GEMM_R - GEMM_PQ) #if 0 #define SHARED_ARRAY #define SA aa #else #undef SHARED_ARRAY #define SA sa #endif blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; BLASLONG info; BLASLONG bk, blocking; BLASLONG is, min_i; BLASLONG jjs, min_jj; BLASLONG range_N[2]; BLASLONG j, js, min_j; #ifdef SHARED_ARRAY FLOAT *aa; #endif FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } if (n <= DTB_ENTRIES / 2) { info = POTF2_U(args, NULL, range_n, sa, sb, 0); return info; } blocking = GEMM_Q; if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; for (j = 0; j < n; j += blocking) { bk = n - j; if (bk > blocking) bk = blocking; if (!range_n) { range_N[0] = j; range_N[1] = j + bk; } else { range_N[0] = range_n[0] + j; range_N[1] = range_n[0] + j + bk; } info = CNAME(args, NULL, range_N, sa, sb, 0); if (info) return info + j; if (n - j - bk > 0) { TRSM_IUNCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, sb); for(js = j + bk; js < n; js += REAL_GEMM_R) { min_j = n - js; if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; for(jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_N){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; GEMM_ONCOPY(bk, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sb2 + bk * (jjs - js) * COMPSIZE); for (is = 0; is < bk; is += GEMM_P) { min_i = bk - is; if (min_i > GEMM_P) min_i = GEMM_P; TRSM_KERNEL (min_i, min_jj, bk, dm1, #ifdef COMPLEX ZERO, #endif sb + bk * is * COMPSIZE, sb2 + bk * (jjs - js) * COMPSIZE, a + (j + is + jjs * lda) * COMPSIZE, lda, is); } } for (is = j + bk; is < js + min_j; is += min_i) { min_i = js + min_j - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; } #ifdef SHARED_ARRAY if ((is >= js) && (is + min_i <= js + min_j)) { aa = sb2 + bk * (is - js) * COMPSIZE; } else { 
GEMM_INCOPY(bk, min_i, a + (j + is * lda) * COMPSIZE, lda, sa); aa = sa; } #else GEMM_INCOPY(bk, min_i, a + (j + is * lda) * COMPSIZE, lda, sa); #endif SYRK_KERNEL_U(min_i, min_j, bk, dm1, SA, sb2, a + (is + js * lda) * COMPSIZE, lda, is - js); } } } } return 0; } OpenBLAS-0.2.20/lapack/potrf/potrf_parallel.c000066400000000000000000000414501313527062700207070ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifndef USE_SIMPLE_THREADED_LEVEL3 //The array of job_t may overflow the stack. //Instead, use malloc to alloc job_t. 
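/* NOTE: job_t is an array of per-thread flag tables; with a large
 * MAX_CPU_NUMBER the declaration "job_t job[MAX_CPU_NUMBER]" inside
 * thread_driver() would occupy MAX_CPU_NUMBER^2 * CACHE_LINE_SIZE *
 * DIVIDE_RATE BLASLONGs and could overflow the stack.  When MAX_CPU_NUMBER
 * exceeds BLAS3_MEM_ALLOC_THRESHOLD, USE_ALLOC_HEAP is defined below and
 * thread_driver() malloc()s (and later free()s) the array instead. */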
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD #define USE_ALLOC_HEAP #endif static FLOAT dm1 = -1.; #ifndef KERNEL_FUNC #ifndef LOWER #define KERNEL_FUNC SYRK_KERNEL_U #else #define KERNEL_FUNC SYRK_KERNEL_L #endif #endif #ifndef LOWER #ifndef COMPLEX #define TRSM_KERNEL TRSM_KERNEL_LT #else #define TRSM_KERNEL TRSM_KERNEL_LC #endif #else #ifndef COMPLEX #define TRSM_KERNEL TRSM_KERNEL_RN #else #define TRSM_KERNEL TRSM_KERNEL_RR #endif #endif #ifndef CACHE_LINE_SIZE #define CACHE_LINE_SIZE 8 #endif #ifndef DIVIDE_RATE #define DIVIDE_RATE 2 #endif #ifndef SWITCH_RATIO #define SWITCH_RATIO 2 #endif #ifndef LOWER #define TRANS #endif #ifndef SYRK_LOCAL #if !defined(LOWER) && !defined(TRANS) #define SYRK_LOCAL SYRK_UN #elif !defined(LOWER) && defined(TRANS) #define SYRK_LOCAL SYRK_UT #elif defined(LOWER) && !defined(TRANS) #define SYRK_LOCAL SYRK_LN #else #define SYRK_LOCAL SYRK_LT #endif #endif typedef struct { volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; #ifndef KERNEL_OPERATION #ifndef COMPLEX #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) #else #define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) #endif #endif #ifndef ICOPY_OPERATION #ifndef TRANS #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #else #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif #ifndef OCOPY_OPERATION #ifdef TRANS #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #else #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif #ifndef S #define S args -> a #endif #ifndef A #define A args -> b #endif #ifndef C #define C args -> c #endif #ifndef LDA #define LDA args -> lda #endif #ifndef N #define N args -> m #endif #ifndef K #define K args -> k #endif static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ FLOAT *buffer[DIVIDE_RATE]; BLASLONG k, lda; BLASLONG m_from, m_to; FLOAT *alpha; FLOAT *a, *c; job_t *job = (job_t *)args -> common; BLASLONG xxx, bufferside; BLASLONG jjs, min_jj; BLASLONG is, min_i, div_n; BLASLONG i, current; k = K; a = (FLOAT *)A; c = (FLOAT *)C; lda = LDA; alpha = (FLOAT *)args -> alpha; m_from = range_n[mypos + 0]; m_to = range_n[mypos + 1]; #if 0 fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld\n", mypos, m_from, m_to); #endif div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); for (i = 1; i < DIVIDE_RATE; i++) { buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE; } #ifndef LOWER TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb); #else TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb); #endif for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) { for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(m_to, xxx + div_n) - jjs; #ifndef LOWER if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; #else if (min_jj > GEMM_P) 
min_jj = GEMM_P; #endif #ifndef LOWER OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); TRSM_KERNEL (k, min_jj, k, dm1, #ifdef COMPLEX ZERO, #endif sb, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, a + jjs * lda * COMPSIZE, lda, 0); #else ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); TRSM_KERNEL (min_jj, k, k, dm1, #ifdef COMPLEX ZERO, #endif buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, sb, a + jjs * COMPSIZE, lda, 0); #endif } #ifndef LOWER for (i = 0; i <= mypos; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; #else for (i = mypos; i < args -> nthreads; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; #endif WMB; } min_i = m_to - m_from; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; } #ifndef LOWER ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); #else OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); #endif current = mypos; #ifndef LOWER while (current < args -> nthreads) #else while (current >= 0) #endif { div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { /* thread has to wait */ if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, lda, m_from, xxx); if (m_from + min_i >= m_to) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; WMB; } } #ifndef LOWER current ++; #else current --; #endif } for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; } #ifndef LOWER ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa); #else OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa); #endif current = mypos; #ifndef LOWER while (current < args -> nthreads) #else while (current >= 0) #endif { div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, lda, is, xxx); if (is + min_i >= m_to) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; WMB; } } #ifndef LOWER current ++; #else current --; #endif } } for (i = 0; i < args -> nthreads; i++) { if (i != mypos) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; } } } return 0; } static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ blas_arg_t newarg; #ifndef USE_ALLOC_HEAP job_t job[MAX_CPU_NUMBER]; #else job_t * job = NULL; #endif blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range[MAX_CPU_NUMBER + 100]; BLASLONG num_cpu; BLASLONG nthreads = args -> nthreads; BLASLONG width, i, j, k; BLASLONG n, n_from, n_to; int mode, mask; double dnum; #ifndef COMPLEX #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; mask = 
MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; #else mode = BLAS_SINGLE | BLAS_REAL; mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; #endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1; #else mode = BLAS_SINGLE | BLAS_COMPLEX; mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1; #endif #endif newarg.m = args -> m; newarg.k = args -> k; newarg.a = args -> a; newarg.b = args -> b; newarg.c = args -> c; newarg.lda = args -> lda; newarg.alpha = args -> alpha; #ifdef USE_ALLOC_HEAP job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); if(job==NULL){ fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); exit(1); } #endif newarg.common = (void *)job; n_from = 0; n_to = args -> m; #ifndef LOWER range[MAX_CPU_NUMBER] = n_to - n_from; range[0] = 0; num_cpu = 0; i = 0; n = n_to - n_from; dnum = (double)n * (double)n /(double)nthreads; while (i < n){ if (nthreads - num_cpu > 1) { double di = (double)i; width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1)); if ((width > n - i) || (width < mask)) width = n - i; } else { width = n - i; } range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width; queue[num_cpu].mode = mode; queue[num_cpu].routine = inner_thread; queue[num_cpu].args = &newarg; queue[num_cpu].range_m = NULL; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu]; #else range[0] = 0; num_cpu = 0; i = 0; n = n_to - n_from; dnum = (double)n * (double)n /(double)nthreads; while (i < n){ if (nthreads - num_cpu > 1) { double di = (double)i; width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); if ((width > n - i) || (width < mask)) width = n - i; } else { width = n - i; } range[num_cpu + 1] = range[num_cpu] + width; queue[num_cpu].mode = mode; queue[num_cpu].routine = inner_thread; queue[num_cpu].args = &newarg; queue[num_cpu].range_m = NULL; queue[num_cpu].range_n = range; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #endif newarg.nthreads = num_cpu; if (num_cpu) { for (j = 0; j < num_cpu; j++) { for (i = 0; i < num_cpu; i++) { for (k = 0; k < DIVIDE_RATE; k++) { job[j].working[i][CACHE_LINE_SIZE * k] = 0; } } } queue[0].sa = sa; queue[0].sb = sb; queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } #ifdef USE_ALLOC_HEAP free(job); #endif return 0; } #endif blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, bk, i, blocking, lda; BLASLONG info; int mode; blas_arg_t newarg; FLOAT *a; FLOAT alpha[2] = { -ONE, ZERO}; #ifndef COMPLEX #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif if (args -> nthreads == 1) { #ifndef LOWER info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0); #else info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0); #endif return 
info; } n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) n = range_n[1] - range_n[0]; if (n <= GEMM_UNROLL_N * 2) { #ifndef LOWER info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0); #else info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0); #endif return info; } newarg.lda = lda; newarg.ldb = lda; newarg.ldc = lda; newarg.alpha = alpha; newarg.beta = NULL; newarg.nthreads = args -> nthreads; blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; if (blocking > GEMM_Q) blocking = GEMM_Q; for (i = 0; i < n; i += blocking) { bk = n - i; if (bk > blocking) bk = blocking; newarg.m = bk; newarg.n = bk; newarg.a = a + (i + i * lda) * COMPSIZE; info = CNAME(&newarg, NULL, NULL, sa, sb, 0); if (info) return info + i; if (n - i - bk > 0) { #ifndef USE_SIMPLE_THREADED_LEVEL3 newarg.m = n - i - bk; newarg.k = bk; #ifndef LOWER newarg.b = a + ( i + (i + bk) * lda) * COMPSIZE; #else newarg.b = a + ((i + bk) + i * lda) * COMPSIZE; #endif newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; thread_driver(&newarg, sa, sb); #else #ifndef LOWER newarg.m = bk; newarg.n = n - i - bk; newarg.a = a + (i + i * lda) * COMPSIZE; newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; gemm_thread_n(mode | BLAS_TRANSA_T, &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); newarg.n = n - i - bk; newarg.k = bk; newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE; newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; #if 0 HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); #else syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads); #endif #else newarg.m = n - i - bk; newarg.n = bk; newarg.a = a + (i + i * lda) * COMPSIZE; newarg.b = a + (i + bk + i * lda) * COMPSIZE; gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); newarg.n = n - i - bk; newarg.k = bk; newarg.a = a + (i + bk + i * lda) * COMPSIZE; newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE; #if 0 HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); #else syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); #endif #endif #endif } } return 0; } OpenBLAS-0.2.20/lapack/trti2/000077500000000000000000000000001313527062700154435ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/trti2/Makefile000066400000000000000000000117031313527062700171050ustar00rootroot00000000000000TOPDIR = ../.. 
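# Like the other LAPACK helper Makefiles, the TRTI2 (unblocked triangular
# matrix inversion) objects are all generated from two pairs of generic
# sources: trti2_U.c / trti2_L.c for the real types and ztrti2_U.c /
# ztrti2_L.c for the complex types.  The uplo variant comes from the chosen
# source file, the unit/non-unit diagonal variant from -DUNIT / -UUNIT, and
# the precision from -UDOUBLE / -DDOUBLE / -DXDOUBLE, so the rules below
# simply repeat the same recipe for every type/uplo/diag combination.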
include ../../Makefile.system SBLASOBJS = strti2_UU.$(SUFFIX) strti2_UN.$(SUFFIX) strti2_LU.$(SUFFIX) strti2_LN.$(SUFFIX) DBLASOBJS = dtrti2_UU.$(SUFFIX) dtrti2_UN.$(SUFFIX) dtrti2_LU.$(SUFFIX) dtrti2_LN.$(SUFFIX) QBLASOBJS = qtrti2_UU.$(SUFFIX) qtrti2_UN.$(SUFFIX) qtrti2_LU.$(SUFFIX) qtrti2_LN.$(SUFFIX) CBLASOBJS = ctrti2_UU.$(SUFFIX) ctrti2_UN.$(SUFFIX) ctrti2_LU.$(SUFFIX) ctrti2_LN.$(SUFFIX) ZBLASOBJS = ztrti2_UU.$(SUFFIX) ztrti2_UN.$(SUFFIX) ztrti2_LU.$(SUFFIX) ztrti2_LN.$(SUFFIX) XBLASOBJS = xtrti2_UU.$(SUFFIX) xtrti2_UN.$(SUFFIX) xtrti2_LU.$(SUFFIX) xtrti2_LN.$(SUFFIX) strti2_UU.$(SUFFIX) : trti2_U.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) strti2_UN.$(SUFFIX) : trti2_U.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) strti2_LU.$(SUFFIX) : trti2_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) strti2_LN.$(SUFFIX) : trti2_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) dtrti2_UU.$(SUFFIX) : trti2_U.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) dtrti2_UN.$(SUFFIX) : trti2_U.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) dtrti2_LU.$(SUFFIX) : trti2_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) dtrti2_LN.$(SUFFIX) : trti2_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) qtrti2_UU.$(SUFFIX) : trti2_U.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) qtrti2_UN.$(SUFFIX) : trti2_U.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) qtrti2_LU.$(SUFFIX) : trti2_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) qtrti2_LN.$(SUFFIX) : trti2_L.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) ctrti2_UU.$(SUFFIX) : ztrti2_U.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) ctrti2_UN.$(SUFFIX) : ztrti2_U.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) ctrti2_LU.$(SUFFIX) : ztrti2_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) ctrti2_LN.$(SUFFIX) : ztrti2_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) ztrti2_UU.$(SUFFIX) : ztrti2_U.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) ztrti2_UN.$(SUFFIX) : ztrti2_U.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) ztrti2_LU.$(SUFFIX) : ztrti2_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) ztrti2_LN.$(SUFFIX) : ztrti2_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) xtrti2_UU.$(SUFFIX) : ztrti2_U.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) xtrti2_UN.$(SUFFIX) : ztrti2_U.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) xtrti2_LU.$(SUFFIX) : ztrti2_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) xtrti2_LN.$(SUFFIX) : ztrti2_L.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) strti2_UU.$(PSUFFIX) : trti2_U.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) strti2_UN.$(PSUFFIX) : trti2_U.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) strti2_LU.$(PSUFFIX) : trti2_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) strti2_LN.$(PSUFFIX) : trti2_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) dtrti2_UU.$(PSUFFIX) : trti2_U.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) dtrti2_UN.$(PSUFFIX) : trti2_U.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) dtrti2_LU.$(PSUFFIX) : trti2_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) dtrti2_LN.$(PSUFFIX) : trti2_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) qtrti2_UU.$(PSUFFIX) : trti2_U.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) 
qtrti2_UN.$(PSUFFIX) : trti2_U.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) qtrti2_LU.$(PSUFFIX) : trti2_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) qtrti2_LN.$(PSUFFIX) : trti2_L.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) ctrti2_UU.$(PSUFFIX) : ztrti2_U.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) ctrti2_UN.$(PSUFFIX) : ztrti2_U.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) ctrti2_LU.$(PSUFFIX) : ztrti2_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) ctrti2_LN.$(PSUFFIX) : ztrti2_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) ztrti2_UU.$(PSUFFIX) : ztrti2_U.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) ztrti2_UN.$(PSUFFIX) : ztrti2_U.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) ztrti2_LU.$(PSUFFIX) : ztrti2_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) ztrti2_LN.$(PSUFFIX) : ztrti2_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) xtrti2_UU.$(PSUFFIX) : ztrti2_U.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) xtrti2_UN.$(PSUFFIX) : ztrti2_U.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) xtrti2_LU.$(PSUFFIX) : ztrti2_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) xtrti2_LN.$(PSUFFIX) : ztrti2_L.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) include ../../Makefile.tail OpenBLAS-0.2.20/lapack/trti2/trti2_L.c000066400000000000000000000066241313527062700171360ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifdef UNIT #define TRMV TRMV_NLU #else #define TRMV TRMV_NLN #endif blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; FLOAT ajj; BLASLONG j; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } for (j = n - 1; j >= 0; j--) { ajj = ONE; #ifndef UNIT ajj /= *(a + j + j * lda); *(a + j + j * lda) = ajj; #endif TRMV (n - j - 1, a + (j + 1) + (j + 1) * lda, lda, a + (j + 1) + j * lda, 1, sb); SCAL_K(n - j - 1, 0, 0, -ajj, a + (j + 1) + j * lda, 1, NULL, 0, NULL, 0); } return 0; } OpenBLAS-0.2.20/lapack/trti2/trti2_U.c000066400000000000000000000065331313527062700171460ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #include #include "common.h" #ifdef UNIT #define TRMV TRMV_NUU #else #define TRMV TRMV_NUN #endif blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; FLOAT ajj; BLASLONG j; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } for (j = 0; j < n; j++) { ajj = ONE; #ifndef UNIT ajj /= *(a + j + j * lda); *(a + j + j * lda) = ajj; #endif TRMV (j, a , lda, a + j * lda, 1, sb); SCAL_K(j, 0, 0, -ajj, a + j * lda, 1, NULL, 0, NULL, 0); } return 0; } OpenBLAS-0.2.20/lapack/trti2/ztrti2_L.c000066400000000000000000000076501313527062700173300ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef UNIT #define ZTRMV ZTRMV_NLU #else #define ZTRMV ZTRMV_NLN #endif blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; FLOAT ajj_r, ajj_i; #ifndef UNIT FLOAT ratio, den; #endif BLASLONG j; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } for (j = n - 1; j >= 0; j--) { ajj_r = ONE; ajj_i = ZERO; #ifndef UNIT ajj_r = *(a + (j + j * lda) * COMPSIZE + 0); ajj_i = *(a + (j + j * lda) * COMPSIZE + 1); if (fabs(ajj_r) >= fabs(ajj_i)){ ratio = ajj_i / ajj_r; den = 1. / (ajj_r * ( 1 + ratio * ratio)); ajj_r = den; ajj_i = -ratio * den; } else { ratio = ajj_r / ajj_i; den = 1. 
/(ajj_i * ( 1 + ratio * ratio)); ajj_r = ratio * den; ajj_i = -den; } *(a + (j + j * lda) * COMPSIZE + 0) = ajj_r; *(a + (j + j * lda) * COMPSIZE + 1) = ajj_i; #endif ZTRMV (n - j - 1, a + ((j + 1) + (j + 1) * lda) * COMPSIZE, lda, a + ((j + 1) + j * lda) * COMPSIZE, 1, sb); SCAL_K(n - j - 1, 0, 0, -ajj_r, -ajj_i, a + ((j + 1) + j * lda) * COMPSIZE, 1, NULL, 0, NULL, 0); } return 0; } OpenBLAS-0.2.20/lapack/trti2/ztrti2_U.c000066400000000000000000000075521313527062700173420ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef UNIT #define ZTRMV ZTRMV_NUU #else #define ZTRMV ZTRMV_NUN #endif blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; FLOAT ajj_r, ajj_i; #ifndef UNIT FLOAT ratio, den; #endif BLASLONG j; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } for (j = 0; j < n; j++) { ajj_r = ONE; ajj_i = ZERO; #ifndef UNIT ajj_r = *(a + (j + j * lda) * COMPSIZE + 0); ajj_i = *(a + (j + j * lda) * COMPSIZE + 1); if (fabs(ajj_r) >= fabs(ajj_i)){ ratio = ajj_i / ajj_r; den = 1. / (ajj_r * ( 1 + ratio * ratio)); ajj_r = den; ajj_i = -ratio * den; } else { ratio = ajj_r / ajj_i; den = 1. 
/(ajj_i * ( 1 + ratio * ratio)); ajj_r = ratio * den; ajj_i = -den; } *(a + (j + j * lda) * COMPSIZE + 0) = ajj_r; *(a + (j + j * lda) * COMPSIZE + 1) = ajj_i; #endif ZTRMV (j, a , lda, a + j * lda * COMPSIZE, 1, sb); SCAL_K(j, 0, 0, -ajj_r, -ajj_i, a + j * lda * COMPSIZE, 1, NULL, 0, NULL, 0); } return 0; } OpenBLAS-0.2.20/lapack/trtri/000077500000000000000000000000001313527062700155435ustar00rootroot00000000000000OpenBLAS-0.2.20/lapack/trtri/Makefile000066400000000000000000000272501313527062700172110ustar00rootroot00000000000000TOPDIR = ../.. include ../../Makefile.system SBLASOBJS = strtri_UU_single.$(SUFFIX) strtri_UN_single.$(SUFFIX) strtri_LU_single.$(SUFFIX) strtri_LN_single.$(SUFFIX) DBLASOBJS = dtrtri_UU_single.$(SUFFIX) dtrtri_UN_single.$(SUFFIX) dtrtri_LU_single.$(SUFFIX) dtrtri_LN_single.$(SUFFIX) QBLASOBJS = qtrtri_UU_single.$(SUFFIX) qtrtri_UN_single.$(SUFFIX) qtrtri_LU_single.$(SUFFIX) qtrtri_LN_single.$(SUFFIX) CBLASOBJS = ctrtri_UU_single.$(SUFFIX) ctrtri_UN_single.$(SUFFIX) ctrtri_LU_single.$(SUFFIX) ctrtri_LN_single.$(SUFFIX) ZBLASOBJS = ztrtri_UU_single.$(SUFFIX) ztrtri_UN_single.$(SUFFIX) ztrtri_LU_single.$(SUFFIX) ztrtri_LN_single.$(SUFFIX) XBLASOBJS = xtrtri_UU_single.$(SUFFIX) xtrtri_UN_single.$(SUFFIX) xtrtri_LU_single.$(SUFFIX) xtrtri_LN_single.$(SUFFIX) ifdef SMP SBLASOBJS += strtri_UU_parallel.$(SUFFIX) strtri_UN_parallel.$(SUFFIX) strtri_LU_parallel.$(SUFFIX) strtri_LN_parallel.$(SUFFIX) DBLASOBJS += dtrtri_UU_parallel.$(SUFFIX) dtrtri_UN_parallel.$(SUFFIX) dtrtri_LU_parallel.$(SUFFIX) dtrtri_LN_parallel.$(SUFFIX) QBLASOBJS += qtrtri_UU_parallel.$(SUFFIX) qtrtri_UN_parallel.$(SUFFIX) qtrtri_LU_parallel.$(SUFFIX) qtrtri_LN_parallel.$(SUFFIX) CBLASOBJS += ctrtri_UU_parallel.$(SUFFIX) ctrtri_UN_parallel.$(SUFFIX) ctrtri_LU_parallel.$(SUFFIX) ctrtri_LN_parallel.$(SUFFIX) ZBLASOBJS += ztrtri_UU_parallel.$(SUFFIX) ztrtri_UN_parallel.$(SUFFIX) ztrtri_LU_parallel.$(SUFFIX) ztrtri_LN_parallel.$(SUFFIX) XBLASOBJS += xtrtri_UU_parallel.$(SUFFIX) xtrtri_UN_parallel.$(SUFFIX) xtrtri_LU_parallel.$(SUFFIX) xtrtri_LN_parallel.$(SUFFIX) endif strtri_UU_single.$(SUFFIX) : trtri_U_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) strtri_UN_single.$(SUFFIX) : trtri_U_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) strtri_LU_single.$(SUFFIX) : trtri_L_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) strtri_LN_single.$(SUFFIX) : trtri_L_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) strtri_UU_parallel.$(SUFFIX) : trtri_U_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) strtri_UN_parallel.$(SUFFIX) : trtri_U_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) strtri_LU_parallel.$(SUFFIX) : trtri_L_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) strtri_LN_parallel.$(SUFFIX) : trtri_L_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) dtrtri_UU_single.$(SUFFIX) : trtri_U_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) dtrtri_UN_single.$(SUFFIX) : trtri_U_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) dtrtri_LU_single.$(SUFFIX) : trtri_L_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) dtrtri_LN_single.$(SUFFIX) : trtri_L_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) dtrtri_UU_parallel.$(SUFFIX) : trtri_U_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) dtrtri_UN_parallel.$(SUFFIX) : trtri_U_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o 
$(@F) dtrtri_LU_parallel.$(SUFFIX) : trtri_L_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) dtrtri_LN_parallel.$(SUFFIX) : trtri_L_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) qtrtri_UU_single.$(SUFFIX) : trtri_U_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) qtrtri_UN_single.$(SUFFIX) : trtri_U_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) qtrtri_LU_single.$(SUFFIX) : trtri_L_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) qtrtri_LN_single.$(SUFFIX) : trtri_L_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) qtrtri_UU_parallel.$(SUFFIX) : trtri_U_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) qtrtri_UN_parallel.$(SUFFIX) : trtri_U_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) qtrtri_LU_parallel.$(SUFFIX) : trtri_L_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) qtrtri_LN_parallel.$(SUFFIX) : trtri_L_parallel.c $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) ctrtri_UU_single.$(SUFFIX) : trtri_U_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) ctrtri_UN_single.$(SUFFIX) : trtri_U_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) ctrtri_LU_single.$(SUFFIX) : trtri_L_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) ctrtri_LN_single.$(SUFFIX) : trtri_L_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) ctrtri_UU_parallel.$(SUFFIX) : trtri_U_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) ctrtri_UN_parallel.$(SUFFIX) : trtri_U_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) ctrtri_LU_parallel.$(SUFFIX) : trtri_L_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) ctrtri_LN_parallel.$(SUFFIX) : trtri_L_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) ztrtri_UU_single.$(SUFFIX) : trtri_U_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) ztrtri_UN_single.$(SUFFIX) : trtri_U_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) ztrtri_LU_single.$(SUFFIX) : trtri_L_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) ztrtri_LN_single.$(SUFFIX) : trtri_L_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) ztrtri_UU_parallel.$(SUFFIX) : trtri_U_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) ztrtri_UN_parallel.$(SUFFIX) : trtri_U_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) ztrtri_LU_parallel.$(SUFFIX) : trtri_L_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) ztrtri_LN_parallel.$(SUFFIX) : trtri_L_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) xtrtri_UU_single.$(SUFFIX) : trtri_U_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) xtrtri_UN_single.$(SUFFIX) : trtri_U_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) xtrtri_LU_single.$(SUFFIX) : trtri_L_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) xtrtri_LN_single.$(SUFFIX) : trtri_L_single.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) xtrtri_UU_parallel.$(SUFFIX) : trtri_U_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) xtrtri_UN_parallel.$(SUFFIX) : trtri_U_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) xtrtri_LU_parallel.$(SUFFIX) : trtri_L_parallel.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) xtrtri_LN_parallel.$(SUFFIX) : trtri_L_parallel.c $(CC) -c 
$(CFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) strtri_UU_single.$(PSUFFIX) : trtri_U_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) strtri_UN_single.$(PSUFFIX) : trtri_U_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) strtri_LU_single.$(PSUFFIX) : trtri_L_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) strtri_LN_single.$(PSUFFIX) : trtri_L_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) strtri_UU_parallel.$(PSUFFIX) : trtri_U_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) strtri_UN_parallel.$(PSUFFIX) : trtri_U_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) strtri_LU_parallel.$(PSUFFIX) : trtri_L_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) strtri_LN_parallel.$(PSUFFIX) : trtri_L_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) dtrtri_UU_single.$(PSUFFIX) : trtri_U_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) dtrtri_UN_single.$(PSUFFIX) : trtri_U_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) dtrtri_LU_single.$(PSUFFIX) : trtri_L_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) dtrtri_LN_single.$(PSUFFIX) : trtri_L_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) dtrtri_UU_parallel.$(PSUFFIX) : trtri_U_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) dtrtri_UN_parallel.$(PSUFFIX) : trtri_U_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) dtrtri_LU_parallel.$(PSUFFIX) : trtri_L_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) dtrtri_LN_parallel.$(PSUFFIX) : trtri_L_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) qtrtri_UU_single.$(PSUFFIX) : trtri_U_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) qtrtri_UN_single.$(PSUFFIX) : trtri_U_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) qtrtri_LU_single.$(PSUFFIX) : trtri_L_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) qtrtri_LN_single.$(PSUFFIX) : trtri_L_single.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) qtrtri_UU_parallel.$(PSUFFIX) : trtri_U_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) qtrtri_UN_parallel.$(PSUFFIX) : trtri_U_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) qtrtri_LU_parallel.$(PSUFFIX) : trtri_L_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) qtrtri_LN_parallel.$(PSUFFIX) : trtri_L_parallel.c $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) ctrtri_UU_single.$(PSUFFIX) : trtri_U_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) ctrtri_UN_single.$(PSUFFIX) : trtri_U_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) ctrtri_LU_single.$(PSUFFIX) : trtri_L_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) ctrtri_LN_single.$(PSUFFIX) : trtri_L_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) ctrtri_UU_parallel.$(PSUFFIX) : trtri_U_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) ctrtri_UN_parallel.$(PSUFFIX) : trtri_U_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) ctrtri_LU_parallel.$(PSUFFIX) : trtri_L_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) ctrtri_LN_parallel.$(PSUFFIX) : trtri_L_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) ztrtri_UU_single.$(PSUFFIX) : trtri_U_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) 
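# All of the objects in this Makefile come from just four generic sources
# (trtri_U_single.c, trtri_L_single.c, trtri_U_parallel.c, trtri_L_parallel.c);
# the precision and diagonal variants are selected entirely by preprocessor
# flags: -UCOMPLEX/-DCOMPLEX (real vs. complex), -UDOUBLE/-DDOUBLE/-DXDOUBLE
# (single, double or extended precision) and -UUNIT/-DUNIT (non-unit vs. unit
# diagonal).  For example, ztrtri_UN_single.$(SUFFIX) is trtri_U_single.c
# compiled with -DCOMPLEX -DDOUBLE -UUNIT.  The $(CFLAGS)/$(SUFFIX) rules
# build the regular objects; the $(PFLAGS)/$(PSUFFIX) rules repeat the same
# pattern for the profiled build.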
ztrtri_UN_single.$(PSUFFIX) : trtri_U_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) ztrtri_LU_single.$(PSUFFIX) : trtri_L_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) ztrtri_LN_single.$(PSUFFIX) : trtri_L_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) ztrtri_UU_parallel.$(PSUFFIX) : trtri_U_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) ztrtri_UN_parallel.$(PSUFFIX) : trtri_U_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) ztrtri_LU_parallel.$(PSUFFIX) : trtri_L_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) ztrtri_LN_parallel.$(PSUFFIX) : trtri_L_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) xtrtri_UU_single.$(PSUFFIX) : trtri_U_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) xtrtri_UN_single.$(PSUFFIX) : trtri_U_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) xtrtri_LU_single.$(PSUFFIX) : trtri_L_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) xtrtri_LN_single.$(PSUFFIX) : trtri_L_single.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) xtrtri_UU_parallel.$(PSUFFIX) : trtri_U_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) xtrtri_UN_parallel.$(PSUFFIX) : trtri_U_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) xtrtri_LU_parallel.$(PSUFFIX) : trtri_L_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) xtrtri_LN_parallel.$(PSUFFIX) : trtri_L_parallel.c $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) include ../../Makefile.tail OpenBLAS-0.2.20/lapack/trtri/trtri_L_parallel.c000066400000000000000000000120051313527062700212000ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef UNIT #define TRTI2 TRTI2_LU #define TRMM TRMM_LNLU #define TRSM TRSM_RNLU #else #define TRTI2 TRTI2_LN #define TRMM TRMM_LNLN #define TRSM TRSM_RNLN #endif blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { BLASLONG n, info; BLASLONG bk, i, blocking, start_i; int mode; BLASLONG lda, range_N[2]; blas_arg_t newarg; FLOAT *a; FLOAT alpha[2] = { ONE, ZERO}; FLOAT beta [2] = {-ONE, ZERO}; #ifndef COMPLEX #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) n = range_n[1] - range_n[0]; if (n <= DTB_ENTRIES) { info = TRTI2(args, NULL, range_n, sa, sb, 0); return info; } blocking = GEMM_Q; if (n < 4 * GEMM_Q) blocking = (n + 3) / 4; start_i = 0; while (start_i < n) start_i += blocking; start_i -= blocking; for (i = start_i; i >= 0; i -= blocking) { bk = n - i; if (bk > blocking) bk = blocking; range_N[0] = i; range_N[1] = i + bk; newarg.lda = lda; newarg.ldb = lda; newarg.ldc = lda; newarg.alpha = alpha; newarg.m = n - bk - i; newarg.n = bk; newarg.a = a + ( i + i * lda) * COMPSIZE; newarg.b = a + ((i + bk) + i * lda) * COMPSIZE; newarg.beta = beta; newarg.nthreads = args -> nthreads; gemm_thread_m(mode, &newarg, NULL, NULL, TRSM, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = bk; newarg.a = a + (i + i * lda) * COMPSIZE; CNAME (&newarg, NULL, NULL, sa, sb, 0); newarg.m = n - bk - i; newarg.n = i; newarg.k = bk; newarg.a = a + (i + bk + i * lda) * COMPSIZE; newarg.b = a + (i ) * COMPSIZE; newarg.c = a + (i + bk ) * COMPSIZE; newarg.beta = NULL; gemm_thread_n(mode, &newarg, NULL, NULL, GEMM_NN, sa, sb, args -> nthreads); newarg.a = a + (i + i * lda) * COMPSIZE; newarg.b = a + (i ) * COMPSIZE; newarg.m = bk; newarg.n = i; gemm_thread_n(mode, &newarg, NULL, NULL, TRMM, sa, sb, args -> nthreads); } return 0; } OpenBLAS-0.2.20/lapack/trtri/trtri_L_single.c000066400000000000000000000065371313527062700207020ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) 2013, The OpenBLAS Project * All rights reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * 3. Neither the name of the OpenBLAS project nor the names of * its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * *****************************************************************************/ /************************************************************************************** * 2014/05/22 Saar * TEST double precision unblocked : OK * 2014/05/23 Saar * TEST double precision blocked: OK * TEST single precision blocked: OK **************************************************************************************/ #include #include "common.h" // static FLOAT dp1 = 1.; // static FLOAT dm1 = -1.; #ifdef UNIT #define TRTI2 TRTI2_LU #define TRMM TRMM_LNLU #define TRSM TRSM_RNLU #else #define TRTI2 TRTI2_LN #define TRMM TRMM_LNLN #define TRSM TRSM_RNLN #endif blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG j, n, lda; FLOAT *a; // BLASLONG info=0; BLASLONG jb; BLASLONG NB; BLASLONG start_j; FLOAT beta_plus[2] = { ONE, ZERO}; FLOAT beta_minus[2] = {-ONE, ZERO}; n = args -> n; NB = GEMM_Q; if (n < NB) { TRTI2(args, NULL, range_n, sa, sb, 0); return 0; } lda = args -> lda; a = (FLOAT *) args -> a; args -> ldb = lda; args -> ldc = lda; args -> alpha = NULL; start_j = 0; while (start_j < n) start_j += NB; start_j -= NB; for (j = start_j ; j >=0 ; j-= NB) { jb = n - j; if ( jb > NB ) jb = NB; args -> n = jb; args -> m = n-j-jb; args -> a = &a[(j+jb+(j+jb)*lda) * COMPSIZE]; args -> b = &a[(j+jb+j*lda) * COMPSIZE]; args -> beta = beta_plus; TRMM(args, NULL, NULL, sa, sb, 0); args -> a = &a[(j+j*lda) * COMPSIZE]; args -> beta = beta_minus; TRSM(args, NULL, NULL, sa, sb, 0); args -> a = &a[(j+j*lda) * COMPSIZE]; TRTI2(args, NULL, range_n, sa, sb, 0); } return 0; } OpenBLAS-0.2.20/lapack/trtri/trtri_U_parallel.c000066400000000000000000000116401313527062700212150ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #include #include "common.h" #ifdef UNIT #define TRTI2 TRTI2_UU #define TRMM TRMM_LNUU #define TRSM TRSM_RNUU #else #define TRTI2 TRTI2_UN #define TRMM TRMM_LNUN #define TRSM TRSM_RNUN #endif blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { BLASLONG n, info; BLASLONG bk, i, blocking; int mode; BLASLONG lda, range_N[2]; blas_arg_t newarg; FLOAT *a; FLOAT alpha[2] = { ONE, ZERO}; FLOAT beta [2] = {-ONE, ZERO}; #ifndef COMPLEX #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) n = range_n[1] - range_n[0]; if (n <= DTB_ENTRIES) { info = TRTI2(args, NULL, range_n, sa, sb, 0); return info; } blocking = GEMM_Q; if (n < 4 * GEMM_Q) blocking = (n + 3) / 4; for (i = 0; i < n; i += blocking) { bk = n - i; if (bk > blocking) bk = blocking; range_N[0] = i; range_N[1] = i + bk; newarg.lda = lda; newarg.ldb = lda; newarg.ldc = lda; newarg.alpha = alpha; newarg.m = i; newarg.n = bk; newarg.a = a + (i + i * lda) * COMPSIZE; newarg.b = a + ( i * lda) * COMPSIZE; newarg.beta = beta; newarg.nthreads = args -> nthreads; gemm_thread_m(mode, &newarg, NULL, NULL, TRSM, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = bk; newarg.a = a + (i + i * lda) * COMPSIZE; CNAME (&newarg, NULL, NULL, sa, sb, 0); newarg.m = i; newarg.n = n - i - bk; newarg.k = bk; newarg.a = a + ( i * lda) * COMPSIZE; newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; newarg.c = a + ( (i + bk) * lda) * COMPSIZE; newarg.beta = NULL; gemm_thread_n(mode, &newarg, NULL, NULL, GEMM_NN, sa, sb, args -> nthreads); newarg.a = a + (i + i * lda) * COMPSIZE; newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; newarg.m = bk; newarg.n = n - i - bk; gemm_thread_n(mode, &newarg, NULL, NULL, TRMM, sa, sb, args -> nthreads); } return 0; } OpenBLAS-0.2.20/lapack/trtri/trtri_U_single.c000066400000000000000000000063611313527062700207060ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) 2013, The OpenBLAS Project * All rights reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * 3. Neither the name of the OpenBLAS project nor the names of * its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * *****************************************************************************/ /************************************************************************************** * 2014/05/22 Saar * TEST double precision unblocked : OK * TEST double precision blocked : OK * 2014/05/23 * TEST single precision blocked : OK * **************************************************************************************/ #include #include "common.h" // static FLOAT dp1 = 1.; // static FLOAT dm1 = -1.; #ifdef UNIT #define TRTI2 TRTI2_UU #else #define TRTI2 TRTI2_UN #endif #ifdef UNIT #define TRMM TRMM_LNUU #define TRSM TRSM_RNUU #else #define TRMM TRMM_LNUN #define TRSM TRSM_RNUN #endif blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG j, n, lda; FLOAT *a; // BLASLONG info=0; BLASLONG jb; BLASLONG NB; FLOAT beta_plus[2] = { ONE, ZERO}; FLOAT beta_minus[2] = {-ONE, ZERO}; n = args -> n; NB = GEMM_Q; if (n <= NB) { TRTI2(args, NULL, range_n, sa, sb, 0); return 0; } lda = args -> lda; a = (FLOAT *) args -> a; args -> ldb = lda; args -> ldc = lda; args -> alpha = NULL; for (j = 0; j < n; j += NB) { jb = n - j; if ( jb > NB ) jb = NB; args -> n = jb; args -> m = j; args -> a = &a[0]; args -> b = &a[(j*lda) * COMPSIZE]; args -> beta = beta_plus; TRMM(args, NULL, NULL, sa, sb, 0); args -> a = &a[(j+j*lda) * COMPSIZE]; args -> beta = beta_minus; TRSM(args, NULL, NULL, sa, sb, 0); args -> a = &a[(j+j*lda) * COMPSIZE]; TRTI2(args, NULL, range_n, sa, sb, 0); } return 0; } OpenBLAS-0.2.20/make.inc000066400000000000000000000001251313527062700145520ustar00rootroot00000000000000SHELL = /bin/sh PLAT = _LINUX DRVOPTS = $(NOOPT) ARCHFLAGS= -ru #RANLIB = ranlib OpenBLAS-0.2.20/openblas.pc.in000066400000000000000000000003611313527062700157000ustar00rootroot00000000000000Name: openblas Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: ${version} URL: https://github.com/xianyi/OpenBLAS Libs: -L${libdir} -lopenblas Libs.private: ${extralib} Cflags: -I${includedir} OpenBLAS-0.2.20/openblas_config_template.h000066400000000000000000000064461313527062700203520ustar00rootroot00000000000000/*This is only for "make install" target.*/ #if defined(OPENBLAS_OS_WINNT) || defined(OPENBLAS_OS_CYGWIN_NT) || defined(OPENBLAS_OS_INTERIX) #define OPENBLAS_WINDOWS_ABI #define 
OPENBLAS_OS_WINDOWS #ifdef DOUBLE #define DOUBLE_DEFINED DOUBLE #undef DOUBLE #endif #endif #ifdef OPENBLAS_NEEDBUNDERSCORE #define BLASFUNC(FUNC) FUNC##_ #else #define BLASFUNC(FUNC) FUNC #endif #ifdef OPENBLAS_QUAD_PRECISION typedef struct { unsigned long x[2]; } xdouble; #elif defined OPENBLAS_EXPRECISION #define xdouble long double #else #define xdouble double #endif #if defined(OPENBLAS_OS_WINDOWS) && defined(OPENBLAS___64BIT__) typedef long long BLASLONG; typedef unsigned long long BLASULONG; #else typedef long BLASLONG; typedef unsigned long BLASULONG; #endif #ifdef OPENBLAS_USE64BITINT typedef BLASLONG blasint; #else typedef int blasint; #endif #if defined(XDOUBLE) || defined(DOUBLE) #define FLOATRET FLOAT #else #ifdef NEED_F2CCONV #define FLOATRET double #else #define FLOATRET float #endif #endif /* Inclusion of a standard header file is needed for definition of __STDC_* predefined macros with some compilers (e.g. GCC 4.7 on Linux). This occurs as a side effect of including either or . */ #include /* C99 supports complex floating numbers natively, which GCC also offers as an extension since version 3.0. If neither are available, use a compatible structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ #if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ (__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) #define OPENBLAS_COMPLEX_C99 #ifndef __cplusplus #include #endif typedef float _Complex openblas_complex_float; typedef double _Complex openblas_complex_double; typedef xdouble _Complex openblas_complex_xdouble; #define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I)) #define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I)) #define openblas_make_complex_xdouble(real, imag) ((real) + ((imag) * _Complex_I)) #define openblas_complex_float_real(z) (creal(z)) #define openblas_complex_float_imag(z) (cimag(z)) #define openblas_complex_double_real(z) (creal(z)) #define openblas_complex_double_imag(z) (cimag(z)) #define openblas_complex_xdouble_real(z) (creal(z)) #define openblas_complex_xdouble_imag(z) (cimag(z)) #else #define OPENBLAS_COMPLEX_STRUCT typedef struct { float real, imag; } openblas_complex_float; typedef struct { double real, imag; } openblas_complex_double; typedef struct { xdouble real, imag; } openblas_complex_xdouble; #define openblas_make_complex_float(real, imag) {(real), (imag)} #define openblas_make_complex_double(real, imag) {(real), (imag)} #define openblas_make_complex_xdouble(real, imag) {(real), (imag)} #define openblas_complex_float_real(z) ((z).real) #define openblas_complex_float_imag(z) ((z).imag) #define openblas_complex_double_real(z) ((z).real) #define openblas_complex_double_imag(z) ((z).imag) #define openblas_complex_xdouble_real(z) ((z).real) #define openblas_complex_xdouble_imag(z) ((z).imag) #endif OpenBLAS-0.2.20/param.h000066400000000000000000002027041313527062700144220ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ #ifndef PARAM_H #define PARAM_H #ifdef OPTERON #define SNUMOPT 4 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 256 #define GEMM_DEFAULT_ALIGN 0x01ffffUL #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #else #define SGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_M 4 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #endif #define SGEMM_DEFAULT_P sgemm_p #define DGEMM_DEFAULT_P dgemm_p #define QGEMM_DEFAULT_P qgemm_p #define CGEMM_DEFAULT_P cgemm_p #define ZGEMM_DEFAULT_P zgemm_p #define XGEMM_DEFAULT_P xgemm_p #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_R dgemm_r #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r #ifdef ALLOC_HUGETLB #define SGEMM_DEFAULT_Q 248 #define DGEMM_DEFAULT_Q 248 #define QGEMM_DEFAULT_Q 248 #define CGEMM_DEFAULT_Q 248 #define ZGEMM_DEFAULT_Q 248 #define XGEMM_DEFAULT_Q 248 #else #define SGEMM_DEFAULT_Q 240 #define DGEMM_DEFAULT_Q 240 #define QGEMM_DEFAULT_Q 240 #define CGEMM_DEFAULT_Q 240 #define ZGEMM_DEFAULT_Q 240 #define XGEMM_DEFAULT_Q 240 #endif #define SYMV_P 16 #define HAVE_EXCLUSIVE_CACHE #endif #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define SNUMOPT 8 #define DNUMOPT 4 #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 #define GEMM_DEFAULT_ALIGN 0x0fffUL #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #else #define SGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_M 4 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #endif #if 0 #define SGEMM_DEFAULT_P 496 #define DGEMM_DEFAULT_P 248 #define QGEMM_DEFAULT_P 124 #define CGEMM_DEFAULT_P 248 #define ZGEMM_DEFAULT_P 124 #define XGEMM_DEFAULT_P 62 #define SGEMM_DEFAULT_Q 248 #define DGEMM_DEFAULT_Q 248 #define QGEMM_DEFAULT_Q 248 #define CGEMM_DEFAULT_Q 248 #define ZGEMM_DEFAULT_Q 248 #define XGEMM_DEFAULT_Q 248 #else #define SGEMM_DEFAULT_P 448 #define DGEMM_DEFAULT_P 224 #define QGEMM_DEFAULT_P 112 #define CGEMM_DEFAULT_P 224 #define ZGEMM_DEFAULT_P 112 #define XGEMM_DEFAULT_P 56 #define SGEMM_DEFAULT_Q 224 #define DGEMM_DEFAULT_Q 224 #define QGEMM_DEFAULT_Q 224 #define CGEMM_DEFAULT_Q 224 #define ZGEMM_DEFAULT_Q 224 #define XGEMM_DEFAULT_Q 224 #endif #define SGEMM_DEFAULT_R sgemm_r #define QGEMM_DEFAULT_R qgemm_r #define DGEMM_DEFAULT_R dgemm_r #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r #define SYMV_P 16 #define HAVE_EXCLUSIVE_CACHE #define GEMM_THREAD gemm_thread_mn #endif #ifdef BULLDOZER #define SNUMOPT 8 #define DNUMOPT 4 #define 
GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 #define GEMM_DEFAULT_ALIGN 0x0fffUL #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #else #define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_UNROLL_M 16 #define DGEMM_DEFAULT_UNROLL_M 8 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #define CGEMM3M_DEFAULT_UNROLL_N 4 #define CGEMM3M_DEFAULT_UNROLL_M 8 #define ZGEMM3M_DEFAULT_UNROLL_N 4 #define ZGEMM3M_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_MN 16 #define GEMV_UNROLL 8 #endif #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 #define DGEMM_DEFAULT_P 384 #else #define SGEMM_DEFAULT_P 448 #define DGEMM_DEFAULT_P 224 #endif #define QGEMM_DEFAULT_P 112 #define CGEMM_DEFAULT_P 224 #define ZGEMM_DEFAULT_P 112 #define XGEMM_DEFAULT_P 56 #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_Q 168 #define DGEMM_DEFAULT_Q 168 #else #define SGEMM_DEFAULT_Q 224 #define DGEMM_DEFAULT_Q 224 #endif #define QGEMM_DEFAULT_Q 224 #define CGEMM_DEFAULT_Q 224 #define ZGEMM_DEFAULT_Q 224 #define XGEMM_DEFAULT_Q 224 #define CGEMM3M_DEFAULT_P 448 #define ZGEMM3M_DEFAULT_P 224 #define XGEMM3M_DEFAULT_P 112 #define CGEMM3M_DEFAULT_Q 224 #define ZGEMM3M_DEFAULT_Q 224 #define XGEMM3M_DEFAULT_Q 224 #define CGEMM3M_DEFAULT_R 12288 #define ZGEMM3M_DEFAULT_R 12288 #define XGEMM3M_DEFAULT_R 12288 #define SGEMM_DEFAULT_R sgemm_r #define QGEMM_DEFAULT_R qgemm_r #define DGEMM_DEFAULT_R dgemm_r #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r #define SYMV_P 16 #define HAVE_EXCLUSIVE_CACHE #define GEMM_THREAD gemm_thread_mn #endif #ifdef PILEDRIVER #define SNUMOPT 8 #define DNUMOPT 4 #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 #define GEMM_DEFAULT_ALIGN 0x0fffUL #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #else #define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_UNROLL_M 16 #define DGEMM_DEFAULT_UNROLL_M 8 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #define CGEMM3M_DEFAULT_UNROLL_N 4 #define CGEMM3M_DEFAULT_UNROLL_M 8 #define ZGEMM3M_DEFAULT_UNROLL_N 4 #define ZGEMM3M_DEFAULT_UNROLL_M 4 #define GEMV_UNROLL 8 #endif #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 #define DGEMM_DEFAULT_P 768 #define ZGEMM_DEFAULT_P 384 #define CGEMM_DEFAULT_P 768 #else #define SGEMM_DEFAULT_P 448 #define DGEMM_DEFAULT_P 480 #define ZGEMM_DEFAULT_P 112 #define CGEMM_DEFAULT_P 224 #endif #define QGEMM_DEFAULT_P 112 #define XGEMM_DEFAULT_P 56 #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_Q 192 #define DGEMM_DEFAULT_Q 168 #define ZGEMM_DEFAULT_Q 168 #define CGEMM_DEFAULT_Q 168 #else #define SGEMM_DEFAULT_Q 
224 #define DGEMM_DEFAULT_Q 224 #define ZGEMM_DEFAULT_Q 224 #define CGEMM_DEFAULT_Q 224 #endif #define QGEMM_DEFAULT_Q 224 #define XGEMM_DEFAULT_Q 224 #define CGEMM3M_DEFAULT_P 448 #define ZGEMM3M_DEFAULT_P 224 #define XGEMM3M_DEFAULT_P 112 #define CGEMM3M_DEFAULT_Q 224 #define ZGEMM3M_DEFAULT_Q 224 #define XGEMM3M_DEFAULT_Q 224 #define CGEMM3M_DEFAULT_R 12288 #define ZGEMM3M_DEFAULT_R 12288 #define XGEMM3M_DEFAULT_R 12288 #define SGEMM_DEFAULT_R 12288 #define QGEMM_DEFAULT_R qgemm_r #define DGEMM_DEFAULT_R 12288 #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r #define SYMV_P 16 #define HAVE_EXCLUSIVE_CACHE #define GEMM_THREAD gemm_thread_mn #endif #ifdef STEAMROLLER #define SNUMOPT 8 #define DNUMOPT 4 #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 #define GEMM_DEFAULT_ALIGN 0x0fffUL #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #else #define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_UNROLL_M 16 #define DGEMM_DEFAULT_UNROLL_M 8 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #define CGEMM3M_DEFAULT_UNROLL_N 4 #define CGEMM3M_DEFAULT_UNROLL_M 8 #define ZGEMM3M_DEFAULT_UNROLL_N 4 #define ZGEMM3M_DEFAULT_UNROLL_M 4 #define GEMV_UNROLL 8 #endif #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 #define DGEMM_DEFAULT_P 576 #define ZGEMM_DEFAULT_P 288 #define CGEMM_DEFAULT_P 576 #else #define SGEMM_DEFAULT_P 448 #define DGEMM_DEFAULT_P 480 #define ZGEMM_DEFAULT_P 112 #define CGEMM_DEFAULT_P 224 #endif #define QGEMM_DEFAULT_P 112 #define XGEMM_DEFAULT_P 56 #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_Q 192 #define DGEMM_DEFAULT_Q 160 #define ZGEMM_DEFAULT_Q 160 #define CGEMM_DEFAULT_Q 160 #else #define SGEMM_DEFAULT_Q 224 #define DGEMM_DEFAULT_Q 224 #define ZGEMM_DEFAULT_Q 224 #define CGEMM_DEFAULT_Q 224 #endif #define QGEMM_DEFAULT_Q 224 #define XGEMM_DEFAULT_Q 224 #define CGEMM3M_DEFAULT_P 448 #define ZGEMM3M_DEFAULT_P 224 #define XGEMM3M_DEFAULT_P 112 #define CGEMM3M_DEFAULT_Q 224 #define ZGEMM3M_DEFAULT_Q 224 #define XGEMM3M_DEFAULT_Q 224 #define CGEMM3M_DEFAULT_R 12288 #define ZGEMM3M_DEFAULT_R 12288 #define XGEMM3M_DEFAULT_R 12288 #define SGEMM_DEFAULT_R 12288 #define QGEMM_DEFAULT_R qgemm_r #define DGEMM_DEFAULT_R 12288 #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r #define SYMV_P 16 #define HAVE_EXCLUSIVE_CACHE #define GEMM_THREAD gemm_thread_mn #endif #ifdef EXCAVATOR #define SNUMOPT 8 #define DNUMOPT 4 #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 #define GEMM_DEFAULT_ALIGN 0x0fffUL #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #else #define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_N 2 #define 
SGEMM_DEFAULT_UNROLL_M 16 #define DGEMM_DEFAULT_UNROLL_M 8 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #define CGEMM3M_DEFAULT_UNROLL_N 4 #define CGEMM3M_DEFAULT_UNROLL_M 8 #define ZGEMM3M_DEFAULT_UNROLL_N 4 #define ZGEMM3M_DEFAULT_UNROLL_M 4 #define GEMV_UNROLL 8 #endif #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 #define DGEMM_DEFAULT_P 576 #define ZGEMM_DEFAULT_P 288 #define CGEMM_DEFAULT_P 576 #else #define SGEMM_DEFAULT_P 448 #define DGEMM_DEFAULT_P 480 #define ZGEMM_DEFAULT_P 112 #define CGEMM_DEFAULT_P 224 #endif #define QGEMM_DEFAULT_P 112 #define XGEMM_DEFAULT_P 56 #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_Q 192 #define DGEMM_DEFAULT_Q 160 #define ZGEMM_DEFAULT_Q 160 #define CGEMM_DEFAULT_Q 160 #else #define SGEMM_DEFAULT_Q 224 #define DGEMM_DEFAULT_Q 224 #define ZGEMM_DEFAULT_Q 224 #define CGEMM_DEFAULT_Q 224 #endif #define QGEMM_DEFAULT_Q 224 #define XGEMM_DEFAULT_Q 224 #define CGEMM3M_DEFAULT_P 448 #define ZGEMM3M_DEFAULT_P 224 #define XGEMM3M_DEFAULT_P 112 #define CGEMM3M_DEFAULT_Q 224 #define ZGEMM3M_DEFAULT_Q 224 #define XGEMM3M_DEFAULT_Q 224 #define CGEMM3M_DEFAULT_R 12288 #define ZGEMM3M_DEFAULT_R 12288 #define XGEMM3M_DEFAULT_R 12288 #define SGEMM_DEFAULT_R 12288 #define QGEMM_DEFAULT_R qgemm_r #define DGEMM_DEFAULT_R 12288 #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r #define SYMV_P 16 #define HAVE_EXCLUSIVE_CACHE #define GEMM_THREAD gemm_thread_mn #endif #ifdef ZEN #define SNUMOPT 16 #define DNUMOPT 8 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SYMV_P 8 #define SWITCH_RATIO 4 #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #else #define SGEMM_DEFAULT_UNROLL_M 16 #define DGEMM_DEFAULT_UNROLL_M 4 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_M 4 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 8 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #define SGEMM_DEFAULT_UNROLL_MN 32 #define DGEMM_DEFAULT_UNROLL_MN 32 #endif #ifdef ARCH_X86 #define SGEMM_DEFAULT_P 512 #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_R dgemm_r #define QGEMM_DEFAULT_P 504 #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_R 1024 #define ZGEMM_DEFAULT_P 512 #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_P 252 #define XGEMM_DEFAULT_R xgemm_r #define SGEMM_DEFAULT_Q 256 #define DGEMM_DEFAULT_Q 256 #define QGEMM_DEFAULT_Q 128 #define CGEMM_DEFAULT_Q 256 #define ZGEMM_DEFAULT_Q 192 #define XGEMM_DEFAULT_Q 128 #else #define SGEMM_DEFAULT_P 768 #define DGEMM_DEFAULT_P 512 #define CGEMM_DEFAULT_P 384 #define ZGEMM_DEFAULT_P 256 #ifdef WINDOWS_ABI #define SGEMM_DEFAULT_Q 320 #define DGEMM_DEFAULT_Q 128 #else #define SGEMM_DEFAULT_Q 384 #define DGEMM_DEFAULT_Q 256 #endif #define CGEMM_DEFAULT_Q 192 #define ZGEMM_DEFAULT_Q 128 #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_R 13824 
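/* The *GEMM_DEFAULT_P, _Q and _R values are the cache-blocking factors used
   by the level-3 drivers (roughly the M-, K- and N-dimension panel sizes of
   the packed operands), while *GEMM_DEFAULT_UNROLL_M x *GEMM_DEFAULT_UNROLL_N
   is the register tile computed by the assembly micro-kernel.  For 64-bit ZEN,
   for example, DGEMM runs a 4x8 micro-kernel over packed 512x256 (P x Q)
   blocks of A, with Q reduced to 128 under WINDOWS_ABI. */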
#define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define QGEMM_DEFAULT_Q 128 #define QGEMM_DEFAULT_P 504 #define QGEMM_DEFAULT_R qgemm_r #define XGEMM_DEFAULT_P 252 #define XGEMM_DEFAULT_R xgemm_r #define XGEMM_DEFAULT_Q 128 #define CGEMM3M_DEFAULT_UNROLL_N 8 #define CGEMM3M_DEFAULT_UNROLL_M 4 #define ZGEMM3M_DEFAULT_UNROLL_N 8 #define ZGEMM3M_DEFAULT_UNROLL_M 2 #define CGEMM3M_DEFAULT_P 448 #define ZGEMM3M_DEFAULT_P 224 #define XGEMM3M_DEFAULT_P 112 #define CGEMM3M_DEFAULT_Q 224 #define ZGEMM3M_DEFAULT_Q 224 #define XGEMM3M_DEFAULT_Q 224 #define CGEMM3M_DEFAULT_R 12288 #define ZGEMM3M_DEFAULT_R 12288 #define XGEMM3M_DEFAULT_R 12288 #endif #endif #ifdef ATHLON #define SNUMOPT 4 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 384 #define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #define SGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_M 1 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 1 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_R dgemm_r #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r #define SGEMM_DEFAULT_P 208 #define DGEMM_DEFAULT_P 104 #define QGEMM_DEFAULT_P 56 #define CGEMM_DEFAULT_P 104 #define ZGEMM_DEFAULT_P 56 #define XGEMM_DEFAULT_P 28 #define SGEMM_DEFAULT_Q 208 #define DGEMM_DEFAULT_Q 208 #define QGEMM_DEFAULT_Q 208 #define CGEMM_DEFAULT_Q 208 #define ZGEMM_DEFAULT_Q 208 #define XGEMM_DEFAULT_Q 208 #define SYMV_P 16 #define HAVE_EXCLUSIVE_CACHE #endif #ifdef VIAC3 #define SNUMOPT 2 #define DNUMOPT 1 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 256 #define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #define SGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_M 1 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 1 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_R dgemm_r #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 #define QGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_P 128 #define XGEMM_DEFAULT_P 128 #define SGEMM_DEFAULT_Q 512 #define DGEMM_DEFAULT_Q 256 #define QGEMM_DEFAULT_Q 256 #define CGEMM_DEFAULT_Q 256 #define ZGEMM_DEFAULT_Q 128 #define XGEMM_DEFAULT_Q 128 #define SYMV_P 16 #endif #ifdef NANO #define SNUMOPT 4 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 256 #define GEMM_DEFAULT_ALIGN 0x01ffffUL #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #define SGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #else #define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_N 4 #define 
QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #define SGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #endif #define SGEMM_DEFAULT_P 288 #define DGEMM_DEFAULT_P 288 #define QGEMM_DEFAULT_P 288 #define CGEMM_DEFAULT_P 288 #define ZGEMM_DEFAULT_P 288 #define XGEMM_DEFAULT_P 288 #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_R dgemm_r #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r #define SGEMM_DEFAULT_Q 256 #define DGEMM_DEFAULT_Q 128 #define QGEMM_DEFAULT_Q 64 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 64 #define XGEMM_DEFAULT_Q 32 #define SYMV_P 16 #define HAVE_EXCLUSIVE_CACHE #endif #if defined(PENTIUM) || defined(PENTIUM2) || defined(PENTIUM3) #ifdef HAVE_SSE #define SNUMOPT 2 #else #define SNUMOPT 1 #endif #define DNUMOPT 1 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x0ffffUL #ifdef HAVE_SSE #define SGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_M 4 #else #define SGEMM_DEFAULT_UNROLL_M 4 #define CGEMM_DEFAULT_UNROLL_M 2 #endif #define DGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_N 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 1 #define ZGEMM_DEFAULT_UNROLL_M 1 #define ZGEMM_DEFAULT_UNROLL_N 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_N 1 #define SGEMM_DEFAULT_P sgemm_p #define SGEMM_DEFAULT_Q 256 #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_P dgemm_p #define DGEMM_DEFAULT_Q 256 #define DGEMM_DEFAULT_R dgemm_r #define QGEMM_DEFAULT_P qgemm_p #define QGEMM_DEFAULT_Q 256 #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_P cgemm_p #define CGEMM_DEFAULT_Q 256 #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_P zgemm_p #define ZGEMM_DEFAULT_Q 256 #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_P xgemm_p #define XGEMM_DEFAULT_Q 256 #define XGEMM_DEFAULT_R xgemm_r #define SYMV_P 4 #endif #ifdef PENTIUMM #define SNUMOPT 2 #define DNUMOPT 1 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x0ffffUL #ifdef CORE_YONAH #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_N 1 #else #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_N 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_M 4 #define CGEMM_DEFAULT_UNROLL_N 1 #define ZGEMM_DEFAULT_UNROLL_M 1 #define ZGEMM_DEFAULT_UNROLL_N 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_N 1 #endif #define SGEMM_DEFAULT_P sgemm_p #define SGEMM_DEFAULT_Q 256 #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_P dgemm_p #define DGEMM_DEFAULT_Q 256 #define DGEMM_DEFAULT_R dgemm_r #define QGEMM_DEFAULT_P qgemm_p #define QGEMM_DEFAULT_Q 256 #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_P cgemm_p #define CGEMM_DEFAULT_Q 256 #define 
CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_P zgemm_p #define ZGEMM_DEFAULT_Q 256 #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_P xgemm_p #define XGEMM_DEFAULT_Q 256 #define XGEMM_DEFAULT_R xgemm_r #define SYMV_P 4 #endif #ifdef CORE_NORTHWOOD #define SNUMOPT 4 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 32 #define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SYMV_P 8 #define SGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_M 4 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_N 2 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 1 #define ZGEMM_DEFAULT_UNROLL_N 1 #define XGEMM_DEFAULT_UNROLL_N 1 #define SGEMM_DEFAULT_P sgemm_p #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_P dgemm_p #define DGEMM_DEFAULT_R dgemm_r #define QGEMM_DEFAULT_P qgemm_p #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_P cgemm_p #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_P zgemm_p #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_P xgemm_p #define XGEMM_DEFAULT_R xgemm_r #define SGEMM_DEFAULT_Q 128 #define DGEMM_DEFAULT_Q 128 #define QGEMM_DEFAULT_Q 128 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 128 #define XGEMM_DEFAULT_Q 128 #endif #ifdef CORE_PRESCOTT #define SNUMOPT 4 #define DNUMOPT 2 #ifndef __64BIT__ #define GEMM_DEFAULT_OFFSET_A 128 #define GEMM_DEFAULT_OFFSET_B 192 #else #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 256 #endif #define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SYMV_P 8 #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #else #define SGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_M 4 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #endif #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #define SGEMM_DEFAULT_P sgemm_p #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_P dgemm_p #define DGEMM_DEFAULT_R dgemm_r #define QGEMM_DEFAULT_P qgemm_p #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_P cgemm_p #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_P zgemm_p #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_P xgemm_p #define XGEMM_DEFAULT_R xgemm_r #define SGEMM_DEFAULT_Q 128 #define DGEMM_DEFAULT_Q 128 #define QGEMM_DEFAULT_Q 128 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 128 #define XGEMM_DEFAULT_Q 128 #endif #ifdef CORE2 #define SNUMOPT 8 #define DNUMOPT 4 #define GEMM_DEFAULT_OFFSET_A 448 #define GEMM_DEFAULT_OFFSET_B 128 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SYMV_P 8 #define SWITCH_RATIO 4 #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_M 4 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_N 2 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 1 #define ZGEMM_DEFAULT_UNROLL_N 1 #define XGEMM_DEFAULT_UNROLL_N 1 #define MASK(a, b) ((((a) + (b) - 1) / (b)) * (b)) #else #define SGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_M 4 
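/* As in the other CPU sections of this file, the ARCH_X86_64 branch is allowed
   wider unrolling than the 32-bit ARCH_X86 branch above, largely because
   x86_64 exposes sixteen architectural vector registers to the micro-kernels
   instead of eight. */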
#define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #endif #define SGEMM_DEFAULT_P sgemm_p #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_P dgemm_p #define DGEMM_DEFAULT_R dgemm_r #define QGEMM_DEFAULT_P qgemm_p #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_P cgemm_p #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_P zgemm_p #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_P xgemm_p #define XGEMM_DEFAULT_R xgemm_r #define SGEMM_DEFAULT_Q 256 #define DGEMM_DEFAULT_Q 256 #define QGEMM_DEFAULT_Q 256 #define CGEMM_DEFAULT_Q 256 #define ZGEMM_DEFAULT_Q 256 #define XGEMM_DEFAULT_Q 256 #endif #ifdef PENRYN #define SNUMOPT 8 #define DNUMOPT 4 #define GEMM_DEFAULT_OFFSET_A 128 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SYMV_P 8 #define SWITCH_RATIO 4 #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #else #define SGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_M 4 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #endif #define SGEMM_DEFAULT_P sgemm_p #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_P dgemm_p #define DGEMM_DEFAULT_R dgemm_r #define QGEMM_DEFAULT_P qgemm_p #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_P cgemm_p #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_P zgemm_p #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_P xgemm_p #define XGEMM_DEFAULT_R xgemm_r #define SGEMM_DEFAULT_Q 512 #define DGEMM_DEFAULT_Q 256 #define QGEMM_DEFAULT_Q 128 #define CGEMM_DEFAULT_Q 512 #define ZGEMM_DEFAULT_Q 256 #define XGEMM_DEFAULT_Q 128 #define GETRF_FACTOR 0.75 #endif #ifdef DUNNINGTON #define SNUMOPT 8 #define DNUMOPT 4 #define GEMM_DEFAULT_OFFSET_A 128 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SYMV_P 8 #define SWITCH_RATIO 4 #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #else #define SGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_M 4 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #endif #define SGEMM_DEFAULT_P sgemm_p #define 
SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_P dgemm_p #define DGEMM_DEFAULT_R dgemm_r #define QGEMM_DEFAULT_P qgemm_p #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_P cgemm_p #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_P zgemm_p #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_P xgemm_p #define XGEMM_DEFAULT_R xgemm_r #define SGEMM_DEFAULT_Q 768 #define DGEMM_DEFAULT_Q 384 #define QGEMM_DEFAULT_Q 192 #define CGEMM_DEFAULT_Q 768 #define ZGEMM_DEFAULT_Q 384 #define XGEMM_DEFAULT_Q 192 #define GETRF_FACTOR 0.75 #define GEMM_THREAD gemm_thread_mn #endif #ifdef NEHALEM #define SNUMOPT 8 #define DNUMOPT 4 #define GEMM_DEFAULT_OFFSET_A 32 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SYMV_P 8 #define SWITCH_RATIO 4 #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #else #define SGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_N 8 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_N 4 #define XGEMM_DEFAULT_UNROLL_N 1 #endif #define SGEMM_DEFAULT_P 504 #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_P 504 #define DGEMM_DEFAULT_R dgemm_r #define QGEMM_DEFAULT_P 504 #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_P 252 #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_P 252 #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_P 252 #define XGEMM_DEFAULT_R xgemm_r #define SGEMM_DEFAULT_Q 512 #define DGEMM_DEFAULT_Q 256 #define QGEMM_DEFAULT_Q 128 #define CGEMM_DEFAULT_Q 512 #define ZGEMM_DEFAULT_Q 256 #define XGEMM_DEFAULT_Q 128 #define GETRF_FACTOR 0.72 #endif #ifdef SANDYBRIDGE #define SNUMOPT 8 #define DNUMOPT 4 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SYMV_P 8 #define SWITCH_RATIO 4 #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #else #define SGEMM_DEFAULT_UNROLL_M 16 #define DGEMM_DEFAULT_UNROLL_M 8 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 4 #define XGEMM_DEFAULT_UNROLL_N 1 #endif #define SGEMM_DEFAULT_P 768 #define SGEMM_DEFAULT_R sgemm_r //#define SGEMM_DEFAULT_R 1024 #define DGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_R dgemm_r //#define DGEMM_DEFAULT_R 1024 #define QGEMM_DEFAULT_P 504 #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_P 768 #define CGEMM_DEFAULT_R cgemm_r //#define CGEMM_DEFAULT_R 1024 #define ZGEMM_DEFAULT_P 512 #define 
ZGEMM_DEFAULT_R zgemm_r //#define ZGEMM_DEFAULT_R 1024 #define XGEMM_DEFAULT_P 252 #define XGEMM_DEFAULT_R xgemm_r #define SGEMM_DEFAULT_Q 384 #define DGEMM_DEFAULT_Q 256 #define QGEMM_DEFAULT_Q 128 #define CGEMM_DEFAULT_Q 512 #define ZGEMM_DEFAULT_Q 192 #define XGEMM_DEFAULT_Q 128 #define CGEMM3M_DEFAULT_UNROLL_N 8 #define CGEMM3M_DEFAULT_UNROLL_M 4 #define ZGEMM3M_DEFAULT_UNROLL_N 8 #define ZGEMM3M_DEFAULT_UNROLL_M 2 #define CGEMM3M_DEFAULT_P 448 #define ZGEMM3M_DEFAULT_P 224 #define XGEMM3M_DEFAULT_P 112 #define CGEMM3M_DEFAULT_Q 224 #define ZGEMM3M_DEFAULT_Q 224 #define XGEMM3M_DEFAULT_Q 224 #define CGEMM3M_DEFAULT_R 12288 #define ZGEMM3M_DEFAULT_R 12288 #define XGEMM3M_DEFAULT_R 12288 #define GETRF_FACTOR 0.72 #endif #ifdef HASWELL #define SNUMOPT 16 #define DNUMOPT 8 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SYMV_P 8 #define SWITCH_RATIO 4 #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #else #define SGEMM_DEFAULT_UNROLL_M 16 #define DGEMM_DEFAULT_UNROLL_M 4 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_M 4 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 8 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #define SGEMM_DEFAULT_UNROLL_MN 32 #define DGEMM_DEFAULT_UNROLL_MN 32 #endif #ifdef ARCH_X86 #define SGEMM_DEFAULT_P 512 #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_R dgemm_r #define QGEMM_DEFAULT_P 504 #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_R 1024 #define ZGEMM_DEFAULT_P 512 #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_P 252 #define XGEMM_DEFAULT_R xgemm_r #define SGEMM_DEFAULT_Q 256 #define DGEMM_DEFAULT_Q 256 #define QGEMM_DEFAULT_Q 128 #define CGEMM_DEFAULT_Q 256 #define ZGEMM_DEFAULT_Q 192 #define XGEMM_DEFAULT_Q 128 #else #define SGEMM_DEFAULT_P 768 #define DGEMM_DEFAULT_P 512 #define CGEMM_DEFAULT_P 384 #define ZGEMM_DEFAULT_P 256 #ifdef WINDOWS_ABI #define SGEMM_DEFAULT_Q 320 #define DGEMM_DEFAULT_Q 128 #else #define SGEMM_DEFAULT_Q 384 #define DGEMM_DEFAULT_Q 256 #endif #define CGEMM_DEFAULT_Q 192 #define ZGEMM_DEFAULT_Q 128 #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_R 13824 #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define QGEMM_DEFAULT_Q 128 #define QGEMM_DEFAULT_P 504 #define QGEMM_DEFAULT_R qgemm_r #define XGEMM_DEFAULT_P 252 #define XGEMM_DEFAULT_R xgemm_r #define XGEMM_DEFAULT_Q 128 #define CGEMM3M_DEFAULT_UNROLL_N 8 #define CGEMM3M_DEFAULT_UNROLL_M 4 #define ZGEMM3M_DEFAULT_UNROLL_N 8 #define ZGEMM3M_DEFAULT_UNROLL_M 2 #define CGEMM3M_DEFAULT_P 448 #define ZGEMM3M_DEFAULT_P 224 #define XGEMM3M_DEFAULT_P 112 #define CGEMM3M_DEFAULT_Q 224 #define ZGEMM3M_DEFAULT_Q 224 #define XGEMM3M_DEFAULT_Q 224 #define CGEMM3M_DEFAULT_R 12288 #define ZGEMM3M_DEFAULT_R 12288 #define XGEMM3M_DEFAULT_R 12288 #endif #endif #ifdef ATOM #define SNUMOPT 2 #define DNUMOPT 1 #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 
0x0ffffUL #define SYMV_P 8 #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #else #define SGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_M 4 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #endif #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 2 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 1 #define XGEMM_DEFAULT_UNROLL_N 1 #define SGEMM_DEFAULT_P sgemm_p #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_P dgemm_p #define DGEMM_DEFAULT_R dgemm_r #define QGEMM_DEFAULT_P qgemm_p #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_P cgemm_p #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_P zgemm_p #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_P xgemm_p #define XGEMM_DEFAULT_R xgemm_r #define SGEMM_DEFAULT_Q 256 #define DGEMM_DEFAULT_Q 256 #define QGEMM_DEFAULT_Q 256 #define CGEMM_DEFAULT_Q 256 #define ZGEMM_DEFAULT_Q 256 #define XGEMM_DEFAULT_Q 256 #endif #ifdef ITANIUM2 #define SNUMOPT 4 #define DNUMOPT 4 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 128 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 8 #define QGEMM_DEFAULT_UNROLL_M 8 #define QGEMM_DEFAULT_UNROLL_N 8 #define CGEMM_DEFAULT_UNROLL_M 4 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 #define XGEMM_DEFAULT_UNROLL_M 4 #define XGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_P sgemm_p #define DGEMM_DEFAULT_P dgemm_p #define QGEMM_DEFAULT_P qgemm_p #define CGEMM_DEFAULT_P cgemm_p #define ZGEMM_DEFAULT_P zgemm_p #define XGEMM_DEFAULT_P xgemm_p #define SGEMM_DEFAULT_Q 1024 #define DGEMM_DEFAULT_Q 1024 #define QGEMM_DEFAULT_Q 1024 #define CGEMM_DEFAULT_Q 1024 #define ZGEMM_DEFAULT_Q 1024 #define XGEMM_DEFAULT_Q 1024 #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_R dgemm_r #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r #define SYMV_P 16 #define GETRF_FACTOR 0.65 #endif #if defined(EV4) || defined(EV5) || defined(EV6) #ifdef EV4 #define SNUMOPT 1 #define DNUMOPT 1 #else #define SNUMOPT 2 #define DNUMOPT 2 #endif #define GEMM_DEFAULT_OFFSET_A 512 #define GEMM_DEFAULT_OFFSET_B 512 #define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SYMV_P 8 #ifdef EV4 #define SGEMM_DEFAULT_P 32 #define SGEMM_DEFAULT_Q 112 #define SGEMM_DEFAULT_R 256 #define DGEMM_DEFAULT_P 32 #define DGEMM_DEFAULT_Q 56 #define DGEMM_DEFAULT_R 256 #define CGEMM_DEFAULT_P 32 #define CGEMM_DEFAULT_Q 64 #define CGEMM_DEFAULT_R 240 #define ZGEMM_DEFAULT_P 32 #define ZGEMM_DEFAULT_Q 32 #define ZGEMM_DEFAULT_R 240 #endif #ifdef EV5 #define SGEMM_DEFAULT_P 64 #define SGEMM_DEFAULT_Q 256 #define DGEMM_DEFAULT_P 64 #define DGEMM_DEFAULT_Q 128 #define CGEMM_DEFAULT_P 64 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_P 64 #define ZGEMM_DEFAULT_Q 64 #endif #ifdef EV6 #define SGEMM_DEFAULT_P 256 #define SGEMM_DEFAULT_Q 512 
#define DGEMM_DEFAULT_P 256 #define DGEMM_DEFAULT_Q 256 #define CGEMM_DEFAULT_P 256 #define CGEMM_DEFAULT_Q 256 #define ZGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_Q 256 #endif #endif #ifdef CELL #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 8192 #define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_P 128 #define SGEMM_DEFAULT_Q 512 #define DGEMM_DEFAULT_Q 256 #define CGEMM_DEFAULT_Q 256 #define ZGEMM_DEFAULT_Q 128 #define SYMV_P 4 #endif #ifdef PPCG4 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 1024 #define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 256 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_P 64 #define SGEMM_DEFAULT_Q 256 #define DGEMM_DEFAULT_Q 256 #define CGEMM_DEFAULT_Q 256 #define ZGEMM_DEFAULT_Q 256 #define SYMV_P 4 #endif #ifdef PPC970 #define SNUMOPT 4 #define DNUMOPT 4 #define GEMM_DEFAULT_OFFSET_A 2688 #define GEMM_DEFAULT_OFFSET_B 3072 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #ifdef OS_LINUX #if L2_SIZE == 1024976 #define SGEMM_DEFAULT_P 320 #define DGEMM_DEFAULT_P 256 #define CGEMM_DEFAULT_P 256 #define ZGEMM_DEFAULT_P 256 #else #define SGEMM_DEFAULT_P 176 #define DGEMM_DEFAULT_P 176 #define CGEMM_DEFAULT_P 176 #define ZGEMM_DEFAULT_P 176 #endif #endif #define SGEMM_DEFAULT_Q 512 #define DGEMM_DEFAULT_Q 256 #define CGEMM_DEFAULT_Q 256 #define ZGEMM_DEFAULT_Q 128 #define SYMV_P 4 #endif #ifdef PPC440 #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A (32 * 0) #define GEMM_DEFAULT_OFFSET_B (32 * 0) #define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_P 512 #define CGEMM_DEFAULT_P 512 #define ZGEMM_DEFAULT_P 512 #define SGEMM_DEFAULT_Q 1024 #define DGEMM_DEFAULT_Q 512 #define CGEMM_DEFAULT_Q 512 #define ZGEMM_DEFAULT_Q 256 #define SGEMM_DEFAULT_R SGEMM_DEFAULT_P #define DGEMM_DEFAULT_R DGEMM_DEFAULT_P #define CGEMM_DEFAULT_R CGEMM_DEFAULT_P #define ZGEMM_DEFAULT_R ZGEMM_DEFAULT_P #define SYMV_P 4 #endif #ifdef PPC440FP2 #define SNUMOPT 4 #define DNUMOPT 4 #define GEMM_DEFAULT_OFFSET_A (32 * 0) #define GEMM_DEFAULT_OFFSET_B (32 * 0) #define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 4 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 4 
#define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_P 128 #if 1 #define SGEMM_DEFAULT_Q 4096 #define DGEMM_DEFAULT_Q 3072 #define CGEMM_DEFAULT_Q 2048 #define ZGEMM_DEFAULT_Q 1024 #else #define SGEMM_DEFAULT_Q 512 #define DGEMM_DEFAULT_Q 256 #define CGEMM_DEFAULT_Q 256 #define ZGEMM_DEFAULT_Q 128 #endif #define SYMV_P 4 #endif #if defined(POWER3) || defined(POWER4) || defined(POWER5) #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 2048 #define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #ifdef POWER3 #define SNUMOPT 4 #define DNUMOPT 4 #define SGEMM_DEFAULT_P 256 #define SGEMM_DEFAULT_Q 432 #define SGEMM_DEFAULT_R 1012 #define DGEMM_DEFAULT_P 256 #define DGEMM_DEFAULT_Q 216 #define DGEMM_DEFAULT_R 1012 #define ZGEMM_DEFAULT_P 256 #define ZGEMM_DEFAULT_Q 104 #define ZGEMM_DEFAULT_R 1012 #endif #if defined(POWER4) #ifdef ALLOC_HUGETLB #define SGEMM_DEFAULT_P 184 #define DGEMM_DEFAULT_P 184 #define CGEMM_DEFAULT_P 184 #define ZGEMM_DEFAULT_P 184 #else #define SGEMM_DEFAULT_P 144 #define DGEMM_DEFAULT_P 144 #define CGEMM_DEFAULT_P 144 #define ZGEMM_DEFAULT_P 144 #endif #endif #if defined(POWER5) #ifdef ALLOC_HUGETLB #define SGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_P 256 #define CGEMM_DEFAULT_P 256 #define ZGEMM_DEFAULT_P 128 #else #define SGEMM_DEFAULT_P 320 #define DGEMM_DEFAULT_P 160 #define CGEMM_DEFAULT_P 160 #define ZGEMM_DEFAULT_P 80 #endif #define SGEMM_DEFAULT_Q 256 #define CGEMM_DEFAULT_Q 256 #define DGEMM_DEFAULT_Q 256 #define ZGEMM_DEFAULT_Q 256 #endif #define SYMV_P 8 #endif #if defined(POWER6) #define SNUMOPT 4 #define DNUMOPT 4 #define GEMM_DEFAULT_OFFSET_A 384 #define GEMM_DEFAULT_OFFSET_B 1024 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_P 992 #define DGEMM_DEFAULT_P 480 #define CGEMM_DEFAULT_P 488 #define ZGEMM_DEFAULT_P 248 #define SGEMM_DEFAULT_Q 504 #define DGEMM_DEFAULT_Q 504 #define CGEMM_DEFAULT_Q 400 #define ZGEMM_DEFAULT_Q 400 #define SYMV_P 8 #endif #if defined(POWER8) #define SNUMOPT 16 #define DNUMOPT 8 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 #define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 16 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 1280 #define DGEMM_DEFAULT_P 640 #define CGEMM_DEFAULT_P 640 #define ZGEMM_DEFAULT_P 320 #define SGEMM_DEFAULT_Q 640 #define DGEMM_DEFAULT_Q 720 #define CGEMM_DEFAULT_Q 640 #define ZGEMM_DEFAULT_Q 640 #define SYMV_P 8 #endif #if defined(SPARC) && defined(V7) #define SNUMOPT 4 #define DNUMOPT 4 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 2048 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_N 8 #define CGEMM_DEFAULT_UNROLL_M 1 
#define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 1 #define ZGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_P 256 #define DGEMM_DEFAULT_P 256 #define CGEMM_DEFAULT_P 256 #define ZGEMM_DEFAULT_P 256 #define SGEMM_DEFAULT_Q 512 #define DGEMM_DEFAULT_Q 256 #define CGEMM_DEFAULT_Q 256 #define ZGEMM_DEFAULT_Q 128 #define SYMV_P 8 #define GEMM_THREAD gemm_thread_mn #endif #if (defined(SPARC) && defined(V9)) || defined(__sparc_v9__) #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 2048 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_P 512 #define CGEMM_DEFAULT_P 512 #define ZGEMM_DEFAULT_P 512 #define SGEMM_DEFAULT_Q 1024 #define DGEMM_DEFAULT_Q 512 #define CGEMM_DEFAULT_Q 512 #define ZGEMM_DEFAULT_Q 256 #define SYMV_P 8 #endif #ifdef SICORTEX #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_N 8 #define CGEMM_DEFAULT_UNROLL_M 1 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 1 #define ZGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_P 108 #define DGEMM_DEFAULT_P 112 #define CGEMM_DEFAULT_P 108 #define ZGEMM_DEFAULT_P 112 #define SGEMM_DEFAULT_Q 288 #define DGEMM_DEFAULT_Q 144 #define CGEMM_DEFAULT_Q 144 #define ZGEMM_DEFAULT_Q 72 #define SGEMM_DEFAULT_R 2000 #define DGEMM_DEFAULT_R 2000 #define CGEMM_DEFAULT_R 2000 #define ZGEMM_DEFAULT_R 2000 #define SYMV_P 16 #endif #ifdef LOONGSON3A ////Copy from SICORTEX #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 4 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 64 #define DGEMM_DEFAULT_P 44 #define CGEMM_DEFAULT_P 64 #define ZGEMM_DEFAULT_P 32 #define SGEMM_DEFAULT_Q 192 #define DGEMM_DEFAULT_Q 92 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 80 #define SGEMM_DEFAULT_R 640 #define DGEMM_DEFAULT_R dgemm_r #define CGEMM_DEFAULT_R 640 #define ZGEMM_DEFAULT_R 640 #define GEMM_OFFSET_A1 0x10000 #define GEMM_OFFSET_B1 0x100000 #define SYMV_P 16 #endif #ifdef LOONGSON3B #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 64 #define DGEMM_DEFAULT_P 24 #define CGEMM_DEFAULT_P 24 #define ZGEMM_DEFAULT_P 20 #define SGEMM_DEFAULT_Q 192 #define DGEMM_DEFAULT_Q 128 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 64 #define SGEMM_DEFAULT_R 512 #define DGEMM_DEFAULT_R 512 #define CGEMM_DEFAULT_R 512 #define ZGEMM_DEFAULT_R 512 #define GEMM_OFFSET_A1 0x10000 #define GEMM_OFFSET_B1 0x100000 
#define SYMV_P 16 #endif #if defined(P5600) || defined(I6400) || defined(P6600) #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL #ifdef HAVE_MSA #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 #else #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #endif #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 96 #define ZGEMM_DEFAULT_P 64 #define SGEMM_DEFAULT_Q 240 #define DGEMM_DEFAULT_Q 120 #define CGEMM_DEFAULT_Q 120 #define ZGEMM_DEFAULT_Q 120 #define SGEMM_DEFAULT_R 12288 #define DGEMM_DEFAULT_R 8192 #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 #define SYMV_P 16 #endif #ifdef ARMV7 #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 96 #define ZGEMM_DEFAULT_P 64 #define SGEMM_DEFAULT_Q 240 #define DGEMM_DEFAULT_Q 120 #define CGEMM_DEFAULT_Q 120 #define ZGEMM_DEFAULT_Q 120 #define SGEMM_DEFAULT_R 12288 #define DGEMM_DEFAULT_R 8192 #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 #define SYMV_P 16 #endif #if defined(ARMV6) #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 96 #define ZGEMM_DEFAULT_P 64 #define SGEMM_DEFAULT_Q 240 #define DGEMM_DEFAULT_Q 120 #define CGEMM_DEFAULT_Q 120 #define ZGEMM_DEFAULT_Q 120 #define SGEMM_DEFAULT_R 12288 #define DGEMM_DEFAULT_R 8192 #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 #define SYMV_P 16 #endif #if defined(CORTEXA57) #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_P 256 #define CGEMM_DEFAULT_P 256 #define ZGEMM_DEFAULT_P 128 #define SGEMM_DEFAULT_Q 1024 #define DGEMM_DEFAULT_Q 512 #define CGEMM_DEFAULT_Q 512 #define ZGEMM_DEFAULT_Q 512 #define SGEMM_DEFAULT_R 4096 #define DGEMM_DEFAULT_R 4096 #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 2048 #define SYMV_P 16 #endif #if defined(ARMV8) #define SNUMOPT 2 #define DNUMOPT 2 #define 
GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 96 #define ZGEMM_DEFAULT_P 64 #define SGEMM_DEFAULT_Q 240 #define DGEMM_DEFAULT_Q 120 #define CGEMM_DEFAULT_Q 120 #define ZGEMM_DEFAULT_Q 120 #define SGEMM_DEFAULT_R 12288 #define DGEMM_DEFAULT_R 8192 #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 #define SYMV_P 16 #endif #if defined(THUNDERX) #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 96 #define ZGEMM_DEFAULT_P 64 #define SGEMM_DEFAULT_Q 240 #define DGEMM_DEFAULT_Q 120 #define CGEMM_DEFAULT_Q 120 #define ZGEMM_DEFAULT_Q 120 #define SGEMM_DEFAULT_R 12288 #define DGEMM_DEFAULT_R 8192 #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 #define SYMV_P 16 #endif #if defined(THUNDERX2T99) || defined(VULCAN) #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_P sgemm_p #define DGEMM_DEFAULT_P dgemm_p #define CGEMM_DEFAULT_P cgemm_p #define ZGEMM_DEFAULT_P zgemm_p #define SGEMM_DEFAULT_Q sgemm_q #define DGEMM_DEFAULT_Q dgemm_q #define CGEMM_DEFAULT_Q cgemm_q #define ZGEMM_DEFAULT_Q zgemm_q #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_R dgemm_r #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define SYMV_P 16 #endif #if defined(ARMV5) #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 96 #define ZGEMM_DEFAULT_P 64 #define SGEMM_DEFAULT_Q 240 #define DGEMM_DEFAULT_Q 120 #define CGEMM_DEFAULT_Q 120 #define ZGEMM_DEFAULT_Q 120 #define SGEMM_DEFAULT_R 12288 #define DGEMM_DEFAULT_R 8192 #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 #define SYMV_P 16 #endif #ifdef CORTEXA9 #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define 
ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 96 #define ZGEMM_DEFAULT_P 64 #define SGEMM_DEFAULT_Q 240 #define DGEMM_DEFAULT_Q 120 #define CGEMM_DEFAULT_Q 120 #define ZGEMM_DEFAULT_Q 120 #define SGEMM_DEFAULT_R 12288 #define DGEMM_DEFAULT_R 8192 #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 #define SYMV_P 16 #endif #ifdef CORTEXA15 #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 96 #define ZGEMM_DEFAULT_P 64 #define SGEMM_DEFAULT_Q 240 #define DGEMM_DEFAULT_Q 120 #define CGEMM_DEFAULT_Q 120 #define ZGEMM_DEFAULT_Q 120 #define SGEMM_DEFAULT_R 12288 #define DGEMM_DEFAULT_R 8192 #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 #define SYMV_P 16 #endif #if defined(ZARCH_GENERIC) #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 96 #define ZGEMM_DEFAULT_P 64 #define SGEMM_DEFAULT_Q 240 #define DGEMM_DEFAULT_Q 120 #define CGEMM_DEFAULT_Q 120 #define ZGEMM_DEFAULT_Q 120 #define SGEMM_DEFAULT_R 12288 #define DGEMM_DEFAULT_R 8192 #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 #define SYMV_P 16 #endif #if defined(Z13) #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 4 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_P 456 #define DGEMM_DEFAULT_P 320 #define CGEMM_DEFAULT_P 480 #define ZGEMM_DEFAULT_P 224 #define SGEMM_DEFAULT_Q 488 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 352 #define SGEMM_DEFAULT_R 8192 #define DGEMM_DEFAULT_R 4096 #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 2048 #define SYMV_P 16 #endif #ifdef GENERIC #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_N 2 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #else #define SGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #endif #define SGEMM_DEFAULT_P sgemm_p 
#define DGEMM_DEFAULT_P dgemm_p #define QGEMM_DEFAULT_P qgemm_p #define CGEMM_DEFAULT_P cgemm_p #define ZGEMM_DEFAULT_P zgemm_p #define XGEMM_DEFAULT_P xgemm_p #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_R dgemm_r #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r #define SGEMM_DEFAULT_Q 128 #define DGEMM_DEFAULT_Q 128 #define QGEMM_DEFAULT_Q 128 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 128 #define XGEMM_DEFAULT_Q 128 #define SYMV_P 16 #endif #ifndef QGEMM_DEFAULT_UNROLL_M #define QGEMM_DEFAULT_UNROLL_M 2 #endif #ifndef QGEMM_DEFAULT_UNROLL_N #define QGEMM_DEFAULT_UNROLL_N 2 #endif #ifndef XGEMM_DEFAULT_UNROLL_M #define XGEMM_DEFAULT_UNROLL_M 2 #endif #ifndef XGEMM_DEFAULT_UNROLL_N #define XGEMM_DEFAULT_UNROLL_N 2 #endif #ifndef HAVE_SSE2 #define SHUFPD_0 shufps $0x44, #define SHUFPD_1 shufps $0x4e, #define SHUFPD_2 shufps $0xe4, #define SHUFPD_3 shufps $0xee, #endif #ifndef SHUFPD_0 #define SHUFPD_0 shufpd $0, #endif #ifndef SHUFPD_1 #define SHUFPD_1 shufpd $1, #endif #ifndef SHUFPD_2 #define SHUFPD_2 shufpd $2, #endif #ifndef SHUFPD_3 #define SHUFPD_3 shufpd $3, #endif #ifndef SHUFPS_39 #define SHUFPS_39 shufps $0x39, #endif #endif OpenBLAS-0.2.20/quickbuild.32bit000066400000000000000000000000411313527062700161400ustar00rootroot00000000000000#!/bin/bash make -j 2 BINARY=32 OpenBLAS-0.2.20/quickbuild.64bit000066400000000000000000000000341313527062700161470ustar00rootroot00000000000000#!/bin/bash make BINARY=64 OpenBLAS-0.2.20/quickbuild.win32000066400000000000000000000002331313527062700161620ustar00rootroot00000000000000#!/bin/bash echo " Please read https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio " make BINARY=32 CC=gcc FC=gfortran OpenBLAS-0.2.20/quickbuild.win64000066400000000000000000000002331313527062700161670ustar00rootroot00000000000000#!/bin/bash echo " Please read https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio " make BINARY=64 CC=gcc FC=gfortran OpenBLAS-0.2.20/reference/000077500000000000000000000000001313527062700151025ustar00rootroot00000000000000OpenBLAS-0.2.20/reference/LICENSE000066400000000000000000000017031313527062700161100ustar00rootroot00000000000000This directory contains the reference implementation of BLAS which is obtainable at: http://netlib.org/blas/ The license, obtained from http://netlib.org/blas/faq.html#2 on November 3, 2010, is as follows: 2) Are there legal restrictions on the use of BLAS reference implementation software? The reference BLAS is a freely-available software package. It is available from netlib via anonymous ftp and the World Wide Web. Thus, it can be included in commercial software packages (and has been). We only ask that proper credit be given to the authors. Like all software, it is copyrighted. It is not trademarked, but we do ask the following: If you modify the source for these routines we ask that you change the name of the routine and comment the changes made to the original. We will gladly answer any questions regarding the software. If a modification is done, however, it is the responsibility of the person who modified the routine to provide support. OpenBLAS-0.2.20/reference/Makefile000066400000000000000000000143131313527062700165440ustar00rootroot00000000000000TOPDIR = .. 
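# This directory builds the reference (netlib-style) BLAS routines and,
# unless NO_LAPACK=1, a small set of reference LAPACK routines.  The
# level1, level2 and level3 targets below compile the per-precision
# object lists (SBLASOBJS, DBLASOBJS, CBLASOBJS, ZBLASOBJS, ...) and
# archive them into $(TOPDIR)/$(LIBNAME).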
include $(TOPDIR)/Makefile.system ifeq ($(ARCH), x86) SUPPORT_GEMM3M = 1 endif ifeq ($(ARCH), x86_64) SUPPORT_GEMM3M = 1 endif ifeq ($(ARCH), ia64) SUPPORT_GEMM3M = 1 endif ifeq ($(ARCH), MIPS) SUPPORT_GEMM3M = 1 endif SBLAS1OBJS = \ saxpyf.$(SUFFIX) sswapf.$(SUFFIX) \ scopyf.$(SUFFIX) sscalf.$(SUFFIX) \ sdotf.$(SUFFIX) sdsdotf.$(SUFFIX) dsdotf.$(SUFFIX) \ sasumf.$(SUFFIX) snrm2f.$(SUFFIX) \ smaxf.$(SUFFIX) samaxf.$(SUFFIX) ismaxf.$(SUFFIX) isamaxf.$(SUFFIX) \ sminf.$(SUFFIX) saminf.$(SUFFIX) isminf.$(SUFFIX) isaminf.$(SUFFIX) \ srotf.$(SUFFIX) srotgf.$(SUFFIX) srotmf.$(SUFFIX) srotmgf.$(SUFFIX) \ SBLAS2OBJS = \ sgemvf.$(SUFFIX) sgerf.$(SUFFIX) \ strsvf.$(SUFFIX) strmvf.$(SUFFIX) ssymvf.$(SUFFIX) \ ssyrf.$(SUFFIX) ssyr2f.$(SUFFIX) sgbmvf.$(SUFFIX) \ ssbmvf.$(SUFFIX) sspmvf.$(SUFFIX) \ ssprf.$(SUFFIX) sspr2f.$(SUFFIX) \ stbsvf.$(SUFFIX) stbmvf.$(SUFFIX) \ stpsvf.$(SUFFIX) stpmvf.$(SUFFIX) SBLAS3OBJS = \ sgemmf.$(SUFFIX) ssymmf.$(SUFFIX) strmmf.$(SUFFIX) \ strsmf.$(SUFFIX) ssyrkf.$(SUFFIX) ssyr2kf.$(SUFFIX) DBLAS1OBJS = \ daxpyf.$(SUFFIX) dswapf.$(SUFFIX) \ dcopyf.$(SUFFIX) dscalf.$(SUFFIX) \ ddotf.$(SUFFIX) \ dasumf.$(SUFFIX) dnrm2f.$(SUFFIX) \ dmaxf.$(SUFFIX) damaxf.$(SUFFIX) idmaxf.$(SUFFIX) idamaxf.$(SUFFIX) \ dminf.$(SUFFIX) daminf.$(SUFFIX) idminf.$(SUFFIX) idaminf.$(SUFFIX) \ drotf.$(SUFFIX) drotgf.$(SUFFIX) drotmf.$(SUFFIX) drotmgf.$(SUFFIX) \ DBLAS2OBJS = \ dgemvf.$(SUFFIX) dgerf.$(SUFFIX) \ dtrsvf.$(SUFFIX) dtrmvf.$(SUFFIX) dsymvf.$(SUFFIX) \ dsyrf.$(SUFFIX) dsyr2f.$(SUFFIX) dgbmvf.$(SUFFIX) \ dsbmvf.$(SUFFIX) dspmvf.$(SUFFIX) \ dsprf.$(SUFFIX) dspr2f.$(SUFFIX) \ dtbsvf.$(SUFFIX) dtbmvf.$(SUFFIX) \ dtpsvf.$(SUFFIX) dtpmvf.$(SUFFIX) DBLAS3OBJS = \ dgemmf.$(SUFFIX) dsymmf.$(SUFFIX) dtrmmf.$(SUFFIX) \ dtrsmf.$(SUFFIX) dsyrkf.$(SUFFIX) dsyr2kf.$(SUFFIX) CBLAS1OBJS = \ caxpyf.$(SUFFIX) caxpycf.$(SUFFIX) cswapf.$(SUFFIX) \ ccopyf.$(SUFFIX) cscalf.$(SUFFIX) csscalf.$(SUFFIX) \ cdotcf.$(SUFFIX) cdotuf.$(SUFFIX) \ scasumf.$(SUFFIX) scnrm2f.$(SUFFIX) \ scamaxf.$(SUFFIX) icamaxf.$(SUFFIX) \ scaminf.$(SUFFIX) icaminf.$(SUFFIX) \ csrotf.$(SUFFIX) crotgf.$(SUFFIX) \ CBLAS2OBJS = \ cgemvf.$(SUFFIX) cgeruf.$(SUFFIX) cgercf.$(SUFFIX) \ ctrsvf.$(SUFFIX) ctrmvf.$(SUFFIX) csymvf.$(SUFFIX) \ csyrf.$(SUFFIX) csyr2f.$(SUFFIX) cgbmvf.$(SUFFIX) \ csbmvf.$(SUFFIX) cspmvf.$(SUFFIX) \ csprf.$(SUFFIX) cspr2f.$(SUFFIX) \ ctbsvf.$(SUFFIX) ctbmvf.$(SUFFIX) \ ctpsvf.$(SUFFIX) ctpmvf.$(SUFFIX) \ chemvf.$(SUFFIX) chbmvf.$(SUFFIX) \ cherf.$(SUFFIX) cher2f.$(SUFFIX) \ chpmvf.$(SUFFIX) chprf.$(SUFFIX) chpr2f.$(SUFFIX) CBLAS3OBJS = \ cgemmf.$(SUFFIX) csymmf.$(SUFFIX) ctrmmf.$(SUFFIX) \ ctrsmf.$(SUFFIX) csyrkf.$(SUFFIX) csyr2kf.$(SUFFIX) \ chemmf.$(SUFFIX) cherkf.$(SUFFIX) cher2kf.$(SUFFIX) ZBLAS1OBJS = \ zaxpyf.$(SUFFIX) zaxpycf.$(SUFFIX) zswapf.$(SUFFIX) \ zcopyf.$(SUFFIX) zscalf.$(SUFFIX) zdscalf.$(SUFFIX) \ zdotcf.$(SUFFIX) zdotuf.$(SUFFIX) \ dzasumf.$(SUFFIX) dznrm2f.$(SUFFIX) \ dzamaxf.$(SUFFIX) izamaxf.$(SUFFIX) \ dzaminf.$(SUFFIX) izaminf.$(SUFFIX) \ zdrotf.$(SUFFIX) zrotgf.$(SUFFIX) \ ZBLAS2OBJS = \ zgemvf.$(SUFFIX) zgeruf.$(SUFFIX) zgercf.$(SUFFIX) \ ztrsvf.$(SUFFIX) ztrmvf.$(SUFFIX) zsymvf.$(SUFFIX) \ zsyrf.$(SUFFIX) zsyr2f.$(SUFFIX) zgbmvf.$(SUFFIX) \ zsbmvf.$(SUFFIX) zspmvf.$(SUFFIX) \ zsprf.$(SUFFIX) zspr2f.$(SUFFIX) \ ztbsvf.$(SUFFIX) ztbmvf.$(SUFFIX) \ ztpsvf.$(SUFFIX) ztpmvf.$(SUFFIX) \ zhemvf.$(SUFFIX) zhbmvf.$(SUFFIX) \ zherf.$(SUFFIX) zher2f.$(SUFFIX) \ zhpmvf.$(SUFFIX) zhprf.$(SUFFIX) zhpr2f.$(SUFFIX) ZBLAS3OBJS = \ zgemmf.$(SUFFIX) zsymmf.$(SUFFIX) ztrmmf.$(SUFFIX) \ ztrsmf.$(SUFFIX) 
zsyrkf.$(SUFFIX) zsyr2kf.$(SUFFIX) \ zhemmf.$(SUFFIX) zherkf.$(SUFFIX) zher2kf.$(SUFFIX) ifdef SUPPORT_GEMM3M CBLAS3OBJS += cgemm3mf.$(SUFFIX) csymm3mf.$(SUFFIX) chemm3mf.$(SUFFIX) ZBLAS3OBJS += zgemm3mf.$(SUFFIX) zsymm3mf.$(SUFFIX) zhemm3mf.$(SUFFIX) endif SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) ZBLASOBJS = $(ZBLAS1OBJS) $(ZBLAS2OBJS) $(ZBLAS3OBJS) XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) ifneq ($(NO_LAPACK), 1) SBLASOBJS += \ sgetf2f.$(SUFFIX) sgetrff.$(SUFFIX) slauu2f.$(SUFFIX) slauumf.$(SUFFIX) \ spotf2f.$(SUFFIX) spotrff.$(SUFFIX) strti2f.$(SUFFIX) strtrif.$(SUFFIX) \ slaswpf.$(SUFFIX) sgetrsf.$(SUFFIX) sgesvf.$(SUFFIX) spotrif.$(SUFFIX) \ DBLASOBJS += \ dgetf2f.$(SUFFIX) dgetrff.$(SUFFIX) dlauu2f.$(SUFFIX) dlauumf.$(SUFFIX) \ dpotf2f.$(SUFFIX) dpotrff.$(SUFFIX) dtrti2f.$(SUFFIX) dtrtrif.$(SUFFIX) \ dlaswpf.$(SUFFIX) dgetrsf.$(SUFFIX) dgesvf.$(SUFFIX) dpotrif.$(SUFFIX) \ QBLASOBJS += # \ qgetf2f.$(SUFFIX) qgetrff.$(SUFFIX) qlauu2f.$(SUFFIX) qlauumf.$(SUFFIX) \ qpotf2f.$(SUFFIX) qpotrff.$(SUFFIX) qtrti2f.$(SUFFIX) qtrtrif.$(SUFFIX) \ qlaswpf.$(SUFFIX) qgetrsf.$(SUFFIX) qgesvf.$(SUFFIX) qpotrif.$(SUFFIX) \ CBLASOBJS += \ cgetf2f.$(SUFFIX) cgetrff.$(SUFFIX) clauu2f.$(SUFFIX) clauumf.$(SUFFIX) \ cpotf2f.$(SUFFIX) cpotrff.$(SUFFIX) ctrti2f.$(SUFFIX) ctrtrif.$(SUFFIX) \ claswpf.$(SUFFIX) cgetrsf.$(SUFFIX) cgesvf.$(SUFFIX) cpotrif.$(SUFFIX) \ ZBLASOBJS += \ zgetf2f.$(SUFFIX) zgetrff.$(SUFFIX) zlauu2f.$(SUFFIX) zlauumf.$(SUFFIX) \ zpotf2f.$(SUFFIX) zpotrff.$(SUFFIX) ztrti2f.$(SUFFIX) ztrtrif.$(SUFFIX) \ zlaswpf.$(SUFFIX) zgetrsf.$(SUFFIX) zgesvf.$(SUFFIX) zpotrif.$(SUFFIX) \ XBLASOBJS += # \ xgetf2f.$(SUFFIX) xgetrff.$(SUFFIX) xlauu2f.$(SUFFIX) xlauumf.$(SUFFIX) \ xpotf2f.$(SUFFIX) xpotrff.$(SUFFIX) xtrti2f.$(SUFFIX) xtrtrif.$(SUFFIX) \ xlaswpf.$(SUFFIX) xgetrsf.$(SUFFIX) xgesvf.$(SUFFIX) xpotrif.$(SUFFIX) \ endif include $(TOPDIR)/Makefile.tail all :: libs clean :: level1 : $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ level3 : $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ OpenBLAS-0.2.20/reference/caxpycf.f000066400000000000000000000016021313527062700167050ustar00rootroot00000000000000 subroutine caxpycf(n,ca,cx,incx,cy,incy) c c constant times a vector plus a vector. c jack dongarra, linpack, 3/11/78. c modified 12/3/93, array(1) declarations changed to array(*) c complex cx(*),cy(*),ca integer i,incx,incy,ix,iy,n INTRINSIC conjg c if(n.le.0)return if (abs(real(ca)) + abs(aimag(ca)) .eq. 0.0 ) return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments c not equal to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n cy(iy) = cy(iy) + ca*conjg(cx(ix)) ix = ix + incx iy = iy + incy 10 continue return c c code for both increments equal to 1 c 20 do 30 i = 1,n cy(i) = cy(i) + ca*conjg(cx(i)) 30 continue return end OpenBLAS-0.2.20/reference/caxpyf.f000066400000000000000000000015241313527062700165450ustar00rootroot00000000000000 subroutine caxpyf(n,ca,cx,incx,cy,incy) c c constant times a vector plus a vector. 
c jack dongarra, linpack, 3/11/78. c modified 12/3/93, array(1) declarations changed to array(*) c complex cx(*),cy(*),ca integer i,incx,incy,ix,iy,n c if(n.le.0)return if (abs(real(ca)) + abs(aimag(ca)) .eq. 0.0 ) return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments c not equal to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n cy(iy) = cy(iy) + ca*cx(ix) ix = ix + incx iy = iy + incy 10 continue return c c code for both increments equal to 1 c 20 do 30 i = 1,n cy(i) = cy(i) + ca*cx(i) 30 continue return end OpenBLAS-0.2.20/reference/ccopyf.f000066400000000000000000000013721313527062700165370ustar00rootroot00000000000000 subroutine ccopyf(n,cx,incx,cy,incy) c c copies a vector, x, to a vector, y. c jack dongarra, linpack, 3/11/78. c modified 12/3/93, array(1) declarations changed to array(*) c complex cx(*),cy(*) integer i,incx,incy,ix,iy,n c if(n.le.0)return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments c not equal to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n cy(iy) = cx(ix) ix = ix + incx iy = iy + incy 10 continue return c c code for both increments equal to 1 c 20 do 30 i = 1,n cy(i) = cx(i) 30 continue return end OpenBLAS-0.2.20/reference/cdotcf.f000066400000000000000000000016611313527062700165170ustar00rootroot00000000000000 complex function cdotcf(n,cx,incx,cy,incy) c c forms the dot product of two vectors, conjugating the first c vector. c jack dongarra, linpack, 3/11/78. c modified 12/3/93, array(1) declarations changed to array(*) c complex cx(*),cy(*),ctemp integer i,incx,incy,ix,iy,n c ctemp = (0.0,0.0) cdotcf = (0.0,0.0) if(n.le.0)return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments c not equal to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n ctemp = ctemp + conjg(cx(ix))*cy(iy) ix = ix + incx iy = iy + incy 10 continue cdotcf = ctemp return c c code for both increments equal to 1 c 20 do 30 i = 1,n ctemp = ctemp + conjg(cx(i))*cy(i) 30 continue cdotcf = ctemp return end OpenBLAS-0.2.20/reference/cdotuf.f000066400000000000000000000015761313527062700165460ustar00rootroot00000000000000 complex function cdotuf(n,cx,incx,cy,incy) c c forms the dot product of two vectors. c jack dongarra, linpack, 3/11/78. c modified 12/3/93, array(1) declarations changed to array(*) c complex cx(*),cy(*),ctemp integer i,incx,incy,ix,iy,n c ctemp = (0.0,0.0) cdotuf = (0.0,0.0) if(n.le.0)return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments c not equal to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n ctemp = ctemp + cx(ix)*cy(iy) ix = ix + incx iy = iy + incy 10 continue cdotuf = ctemp return c c code for both increments equal to 1 c 20 do 30 i = 1,n ctemp = ctemp + cx(i)*cy(i) 30 continue cdotuf = ctemp return end OpenBLAS-0.2.20/reference/cgbmvf.f000066400000000000000000000335711313527062700165260ustar00rootroot00000000000000 SUBROUTINE CGBMVF( TRANS, M, N, KL, KU, ALPHA, A, LDA, X, INCX, $ BETA, Y, INCY ) * .. Scalar Arguments .. COMPLEX ALPHA, BETA INTEGER INCX, INCY, KL, KU, LDA, M, N CHARACTER*1 TRANS * .. Array Arguments .. COMPLEX A( LDA, * ), X( * ), Y( * ) * .. 
* * Purpose * ======= * * ZGBMV performs one of the matrix-vector operations * * y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, or * * y := alpha*conjg( A' )*x + beta*y, * * where alpha and beta are scalars, x and y are vectors and A is an * m by n band matrix, with kl sub-diagonals and ku super-diagonals. * * Parameters * ========== * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' y := alpha*A*x + beta*y. * * TRANS = 'T' or 't' y := alpha*A'*x + beta*y. * * TRANS = 'C' or 'c' y := alpha*conjg( A' )*x + beta*y. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix A. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * Unchanged on exit. * * KL - INTEGER. * On entry, KL specifies the number of sub-diagonals of the * matrix A. KL must satisfy 0 .le. KL. * Unchanged on exit. * * KU - INTEGER. * On entry, KU specifies the number of super-diagonals of the * matrix A. KU must satisfy 0 .le. KU. * Unchanged on exit. * * ALPHA - COMPLEX*16 . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, n ). * Before entry, the leading ( kl + ku + 1 ) by n part of the * array A must contain the matrix of coefficients, supplied * column by column, with the leading diagonal of the matrix in * row ( ku + 1 ) of the array, the first super-diagonal * starting at position 2 in row ku, the first sub-diagonal * starting at position 1 in row ( ku + 2 ), and so on. * Elements in the array A that do not correspond to elements * in the band matrix (such as the top left ku by ku triangle) * are not referenced. * The following program segment will transfer a band matrix * from conventional full matrix storage to band storage: * * DO 20, J = 1, N * K = KU + 1 - J * DO 10, I = MAX( 1, J - KU ), MIN( M, J + KL ) * A( K + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * ( kl + ku + 1 ). * Unchanged on exit. * * X - COMPLEX*16 array of DIMENSION at least * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. * Before entry, the incremented array X must contain the * vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - COMPLEX*16 . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y - COMPLEX*16 array of DIMENSION at least * ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. * Before entry, the incremented array Y must contain the * vector y. On exit, Y is overwritten by the updated vector y. * * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. 
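*
*     Worked example of the band storage described above (an
*     illustrative sketch only): with M = N = 4, KL = 1 and KU = 1,
*     full-matrix element ( i, j ) is stored in A( KU + 1 + i - j, j ),
*     so the tridiagonal matrix
*
*        ( d1 u1 0  0  )                    row 1:  *  u1 u2 u3
*        ( l1 d2 u2 0  )     is stored as   row 2:  d1 d2 d3 d4
*        ( 0  l2 d3 u3 )                    row 3:  l1 l2 l3  *
*        ( 0  0  l3 d4 )
*
*     where the entries marked * are never referenced.  This is the
*     same mapping performed by the DO 20 / DO 10 program segment
*     shown in the comments above, and it requires LDA .ge. KL+KU+1.
*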
COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. Local Scalars .. COMPLEX*16 TEMP INTEGER I, INFO, IX, IY, J, JX, JY, K, KUP1, KX, KY, $ LENX, LENY LOGICAL NOCONJ, NOTRANS, XCONJ * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC CONJG, MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'R' ).AND. $ .NOT.LSAME( TRANS, 'C' ).AND. $ .NOT.LSAME( TRANS, 'O' ).AND. $ .NOT.LSAME( TRANS, 'U' ).AND. $ .NOT.LSAME( TRANS, 'S' ).AND. $ .NOT.LSAME( TRANS, 'D' ) )THEN INFO = 1 ELSE IF( M.LT.0 )THEN INFO = 2 ELSE IF( N.LT.0 )THEN INFO = 3 ELSE IF( KL.LT.0 )THEN INFO = 4 ELSE IF( KU.LT.0 )THEN INFO = 5 ELSE IF( LDA.LT.( KL + KU + 1 ) )THEN INFO = 8 ELSE IF( INCX.EQ.0 )THEN INFO = 10 ELSE IF( INCY.EQ.0 )THEN INFO = 13 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZGBMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * NOCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'U' )) NOTRANS = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ) $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'S' )) XCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) $ .OR. LSAME( TRANS, 'R' ) .OR. LSAME( TRANS, 'C' )) * * Set LENX and LENY, the lengths of the vectors x and y, and set * up the start points in X and Y. * IF(NOTRANS)THEN LENX = N LENY = M ELSE LENX = M LENY = N END IF IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( LENX - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( LENY - 1 )*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the band part of A. * * First form y := beta*y. * IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, LENY Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, LENY Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, LENY Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, LENY Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN KUP1 = KU + 1 IF(XCONJ)THEN IF(NOTRANS)THEN * * Form y := alpha*A*x + y. * JX = KX IF( INCY.EQ.1 )THEN DO 60, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*X( JX ) K = KUP1 - J IF( NOCONJ )THEN DO 50, I = MAX( 1, J - KU ), MIN( M, J + KL ) Y( I ) = Y( I ) + TEMP*A( K + I, J ) 50 CONTINUE ELSE DO 55, I = MAX( 1, J - KU ), MIN( M, J + KL ) Y( I ) = Y( I ) + TEMP*CONJG(A( K + I, J )) 55 CONTINUE END IF END IF JX = JX + INCX 60 CONTINUE ELSE DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*X( JX ) IY = KY K = KUP1 - J IF( NOCONJ )THEN DO 70, I = MAX( 1, J - KU ), MIN( M, J + KL ) Y( IY ) = Y( IY ) + TEMP*A( K + I, J ) IY = IY + INCY 70 CONTINUE ELSE DO 75, I = MAX( 1, J - KU ), MIN( M, J + KL ) Y( IY ) = Y( IY ) + TEMP*CONJG(A( K + I, J )) IY = IY + INCY 75 CONTINUE END IF END IF JX = JX + INCX IF( J.GT.KU ) $ KY = KY + INCY 80 CONTINUE END IF ELSE * * Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. 
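*     In this branch the vector x has length m and y has length n
*     (LENX = M and LENY = N were set above for the transposed and
*     conjugate-transposed cases).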
* JY = KY IF( INCX.EQ.1 )THEN DO 110, J = 1, N TEMP = ZERO K = KUP1 - J IF( NOCONJ )THEN DO 90, I = MAX( 1, J - KU ), MIN( M, J + KL ) TEMP = TEMP + A( K + I, J )*X( I ) 90 CONTINUE ELSE DO 100, I = MAX( 1, J - KU ), MIN( M, J + KL ) TEMP = TEMP + CONJG( A( K + I, J ) )*X( I ) 100 CONTINUE END IF Y( JY ) = Y( JY ) + ALPHA*TEMP JY = JY + INCY 110 CONTINUE ELSE DO 140, J = 1, N TEMP = ZERO IX = KX K = KUP1 - J IF( NOCONJ )THEN DO 120, I = MAX( 1, J - KU ), MIN( M, J + KL ) TEMP = TEMP + A( K + I, J )*X( IX ) IX = IX + INCX 120 CONTINUE ELSE DO 130, I = MAX( 1, J - KU ), MIN( M, J + KL ) TEMP = TEMP + CONJG( A( K + I, J ) )*X( IX ) IX = IX + INCX 130 CONTINUE END IF Y( JY ) = Y( JY ) + ALPHA*TEMP JY = JY + INCY IF( J.GT.KU ) $ KX = KX + INCX 140 CONTINUE END IF END IF ELSE IF(NOTRANS)THEN * * Form y := alpha*A*x + y. * JX = KX IF( INCY.EQ.1 )THEN DO 160, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*CONJG(X( JX )) K = KUP1 - J IF( NOCONJ )THEN DO 150, I = MAX( 1, J - KU ), MIN( M, J + KL ) Y( I ) = Y( I ) + TEMP*A( K + I, J ) 150 CONTINUE ELSE DO 155, I = MAX( 1, J - KU ), MIN( M, J + KL ) Y( I ) = Y( I ) + TEMP*CONJG(A( K + I, J )) 155 CONTINUE END IF END IF JX = JX + INCX 160 CONTINUE ELSE DO 180, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*CONJG(X( JX )) IY = KY K = KUP1 - J IF( NOCONJ )THEN DO 170, I = MAX( 1, J - KU ), MIN( M, J + KL ) Y( IY ) = Y( IY ) + TEMP*A( K + I, J ) IY = IY + INCY 170 CONTINUE ELSE DO 175, I = MAX( 1, J - KU ), MIN( M, J + KL ) Y( IY ) = Y( IY ) + TEMP*CONJG(A( K + I, J )) IY = IY + INCY 175 CONTINUE END IF END IF JX = JX + INCX IF( J.GT.KU ) $ KY = KY + INCY 180 CONTINUE END IF ELSE * * Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. * JY = KY IF( INCX.EQ.1 )THEN DO 210, J = 1, N TEMP = ZERO K = KUP1 - J IF( NOCONJ )THEN DO 190, I = MAX( 1, J - KU ), MIN( M, J + KL ) TEMP = TEMP + A( K + I, J )*CONJG(X( I )) 190 CONTINUE ELSE DO 200, I = MAX( 1, J - KU ), MIN( M, J + KL ) TEMP = TEMP + CONJG( A( K + I, J ) )*CONJG(X( I )) 200 CONTINUE END IF Y( JY ) = Y( JY ) + ALPHA*TEMP JY = JY + INCY 210 CONTINUE ELSE DO 240, J = 1, N TEMP = ZERO IX = KX K = KUP1 - J IF( NOCONJ )THEN DO 220, I = MAX( 1, J - KU ), MIN( M, J + KL ) TEMP = TEMP + A( K + I, J )*CONJG(X( IX )) IX = IX + INCX 220 CONTINUE ELSE DO 230, I = MAX( 1, J - KU ), MIN( M, J + KL ) TEMP = TEMP + CONJG( A( K + I, J ) )*CONJG(X(IX )) IX = IX + INCX 230 CONTINUE END IF Y( JY ) = Y( JY ) + ALPHA*TEMP JY = JY + INCY IF( J.GT.KU ) $ KX = KX + INCX 240 CONTINUE END IF END IF END IF * RETURN * * End of ZGBMV . * END OpenBLAS-0.2.20/reference/cgemm3mf.f000066400000000000000000000312701313527062700167520ustar00rootroot00000000000000 SUBROUTINE CGEMM3MF(TRA,TRB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) * .. Scalar Arguments .. COMPLEX ALPHA,BETA INTEGER K,LDA,LDB,LDC,M,N CHARACTER TRA,TRB * .. * .. Array Arguments .. COMPLEX A(LDA,*),B(LDB,*),C(LDC,*) * .. * * Purpose * ======= * * CGEMM performs one of the matrix-matrix operations * * C := alpha*op( A )*op( B ) + beta*C, * * where op( X ) is one of * * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), * * alpha and beta are scalars, and A, B and C are matrices, with op( A ) * an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. * * Arguments * ========== * * TRA - CHARACTER*1. * On entry, TRA specifies the form of op( A ) to be used in * the matrix multiplication as follows: * * TRA = 'N' or 'n', op( A ) = A. * * TRA = 'T' or 't', op( A ) = A'. * * TRA = 'C' or 'c', op( A ) = conjg( A' ). * * Unchanged on exit. * * TRB - CHARACTER*1. 
* On entry, TRB specifies the form of op( B ) to be used in * the matrix multiplication as follows: * * TRB = 'N' or 'n', op( B ) = B. * * TRB = 'T' or 't', op( B ) = B'. * * TRB = 'C' or 'c', op( B ) = conjg( B' ). * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix * op( A ) and of the matrix C. M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix * op( B ) and the number of columns of the matrix C. N must be * at least zero. * Unchanged on exit. * * K - INTEGER. * On entry, K specifies the number of columns of the matrix * op( A ) and the number of rows of the matrix op( B ). K must * be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is * k when TRA = 'N' or 'n', and is m otherwise. * Before entry with TRA = 'N' or 'n', the leading m by k * part of the array A must contain the matrix A, otherwise * the leading k by m part of the array A must contain the * matrix A. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When TRA = 'N' or 'n' then * LDA must be at least max( 1, m ), otherwise LDA must be at * least max( 1, k ). * Unchanged on exit. * * B - COMPLEX array of DIMENSION ( LDB, kb ), where kb is * n when TRB = 'N' or 'n', and is k otherwise. * Before entry with TRB = 'N' or 'n', the leading k by n * part of the array B must contain the matrix B, otherwise * the leading n by k part of the array B must contain the * matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. When TRB = 'N' or 'n' then * LDB must be at least max( 1, k ), otherwise LDB must be at * least max( 1, n ). * Unchanged on exit. * * BETA - COMPLEX . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then C need not be set on input. * Unchanged on exit. * * C - COMPLEX array of DIMENSION ( LDC, n ). * Before entry, the leading m by n part of the array C must * contain the matrix C, except when beta is zero, in which * case C need not be set on entry. * On exit, the array C is overwritten by the m by n matrix * ( alpha*op( A )*op( B ) + beta*C ). * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC CONJG,MAX * .. * .. Local Scalars .. COMPLEX TEMP INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB LOGICAL CONJA,CONJB,NOTA,NOTB * .. * .. Parameters .. COMPLEX ONE PARAMETER (ONE= (1.0E+0,0.0E+0)) COMPLEX ZERO PARAMETER (ZERO= (0.0E+0,0.0E+0)) * .. * * Set NOTA and NOTB as true if A and B respectively are not * conjugated or transposed, set CONJA and CONJB as true if A and * B respectively are to be transposed but not conjugated and set * NROWA, NCOLA and NROWB as the number of rows and columns of A * and the number of rows of B respectively. 
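*     Usage sketch (illustrative only): to form
*     C := alpha*A*conjg( B' ) + beta*C, with A m by k and B n by k,
*     the call would be
*
*        CALL CGEMM3MF( 'N', 'C', M, N, K, ALPHA, A, LDA, B, LDB,
*                       BETA, C, LDC )
*
*     with LDA at least max( 1, m ), LDB at least max( 1, n ) and
*     LDC at least max( 1, m ), as enforced by the checks below.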
* NOTA = LSAME(TRA,'N') NOTB = LSAME(TRB,'N') CONJA = LSAME(TRA,'C') CONJB = LSAME(TRB,'C') IF (NOTA) THEN NROWA = M NCOLA = K ELSE NROWA = K NCOLA = M END IF IF (NOTB) THEN NROWB = K ELSE NROWB = N END IF * * Test the input parameters. * INFO = 0 IF ((.NOT.NOTA) .AND. (.NOT.CONJA) .AND. + (.NOT.LSAME(TRA,'T'))) THEN INFO = 1 ELSE IF ((.NOT.NOTB) .AND. (.NOT.CONJB) .AND. + (.NOT.LSAME(TRB,'T'))) THEN INFO = 2 ELSE IF (M.LT.0) THEN INFO = 3 ELSE IF (N.LT.0) THEN INFO = 4 ELSE IF (K.LT.0) THEN INFO = 5 ELSE IF (LDA.LT.MAX(1,NROWA)) THEN INFO = 8 ELSE IF (LDB.LT.MAX(1,NROWB)) THEN INFO = 10 ELSE IF (LDC.LT.MAX(1,M)) THEN INFO = 13 END IF IF (INFO.NE.0) THEN CALL XERBLA('CGEMM ',INFO) RETURN END IF * * Quick return if possible. * IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN * * And when alpha.eq.zero. * IF (ALPHA.EQ.ZERO) THEN IF (BETA.EQ.ZERO) THEN DO 20 J = 1,N DO 10 I = 1,M C(I,J) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1,N DO 30 I = 1,M C(I,J) = BETA*C(I,J) 30 CONTINUE 40 CONTINUE END IF RETURN END IF * * Start the operations. * IF (NOTB) THEN IF (NOTA) THEN * * Form C := alpha*A*B + beta*C. * DO 90 J = 1,N IF (BETA.EQ.ZERO) THEN DO 50 I = 1,M C(I,J) = ZERO 50 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 60 I = 1,M C(I,J) = BETA*C(I,J) 60 CONTINUE END IF DO 80 L = 1,K IF (B(L,J).NE.ZERO) THEN TEMP = ALPHA*B(L,J) DO 70 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 70 CONTINUE END IF 80 CONTINUE 90 CONTINUE ELSE IF (CONJA) THEN * * Form C := alpha*conjg( A' )*B + beta*C. * DO 120 J = 1,N DO 110 I = 1,M TEMP = ZERO DO 100 L = 1,K TEMP = TEMP + CONJG(A(L,I))*B(L,J) 100 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 110 CONTINUE 120 CONTINUE ELSE * * Form C := alpha*A'*B + beta*C * DO 150 J = 1,N DO 140 I = 1,M TEMP = ZERO DO 130 L = 1,K TEMP = TEMP + A(L,I)*B(L,J) 130 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 140 CONTINUE 150 CONTINUE END IF ELSE IF (NOTA) THEN IF (CONJB) THEN * * Form C := alpha*A*conjg( B' ) + beta*C. * DO 200 J = 1,N IF (BETA.EQ.ZERO) THEN DO 160 I = 1,M C(I,J) = ZERO 160 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 170 I = 1,M C(I,J) = BETA*C(I,J) 170 CONTINUE END IF DO 190 L = 1,K IF (B(J,L).NE.ZERO) THEN TEMP = ALPHA*CONJG(B(J,L)) DO 180 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 180 CONTINUE END IF 190 CONTINUE 200 CONTINUE ELSE * * Form C := alpha*A*B' + beta*C * DO 250 J = 1,N IF (BETA.EQ.ZERO) THEN DO 210 I = 1,M C(I,J) = ZERO 210 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 220 I = 1,M C(I,J) = BETA*C(I,J) 220 CONTINUE END IF DO 240 L = 1,K IF (B(J,L).NE.ZERO) THEN TEMP = ALPHA*B(J,L) DO 230 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 230 CONTINUE END IF 240 CONTINUE 250 CONTINUE END IF ELSE IF (CONJA) THEN IF (CONJB) THEN * * Form C := alpha*conjg( A' )*conjg( B' ) + beta*C. 
* DO 280 J = 1,N DO 270 I = 1,M TEMP = ZERO DO 260 L = 1,K TEMP = TEMP + CONJG(A(L,I))*CONJG(B(J,L)) 260 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 270 CONTINUE 280 CONTINUE ELSE * * Form C := alpha*conjg( A' )*B' + beta*C * DO 310 J = 1,N DO 300 I = 1,M TEMP = ZERO DO 290 L = 1,K TEMP = TEMP + CONJG(A(L,I))*B(J,L) 290 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 300 CONTINUE 310 CONTINUE END IF ELSE IF (CONJB) THEN * * Form C := alpha*A'*conjg( B' ) + beta*C * DO 340 J = 1,N DO 330 I = 1,M TEMP = ZERO DO 320 L = 1,K TEMP = TEMP + A(L,I)*CONJG(B(J,L)) 320 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 330 CONTINUE 340 CONTINUE ELSE * * Form C := alpha*A'*B' + beta*C * DO 370 J = 1,N DO 360 I = 1,M TEMP = ZERO DO 350 L = 1,K TEMP = TEMP + A(L,I)*B(J,L) 350 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 360 CONTINUE 370 CONTINUE END IF END IF * RETURN * * End of CGEMM . * END OpenBLAS-0.2.20/reference/cgemmf.f000066400000000000000000000313521313527062700165130ustar00rootroot00000000000000 SUBROUTINE CGEMMF(TRANA,TRANB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) * .. Scalar Arguments .. COMPLEX ALPHA,BETA INTEGER K,LDA,LDB,LDC,M,N CHARACTER TRANA,TRANB * .. * .. Array Arguments .. COMPLEX A(LDA,*),B(LDB,*),C(LDC,*) * .. * * Purpose * ======= * * CGEMM performs one of the matrix-matrix operations * * C := alpha*op( A )*op( B ) + beta*C, * * where op( X ) is one of * * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), * * alpha and beta are scalars, and A, B and C are matrices, with op( A ) * an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. * * Arguments * ========== * * TRANA - CHARACTER*1. * On entry, TRANA specifies the form of op( A ) to be used in * the matrix multiplication as follows: * * TRANA = 'N' or 'n', op( A ) = A. * * TRANA = 'T' or 't', op( A ) = A'. * * TRANA = 'C' or 'c', op( A ) = conjg( A' ). * * Unchanged on exit. * * TRANB - CHARACTER*1. * On entry, TRANB specifies the form of op( B ) to be used in * the matrix multiplication as follows: * * TRANB = 'N' or 'n', op( B ) = B. * * TRANB = 'T' or 't', op( B ) = B'. * * TRANB = 'C' or 'c', op( B ) = conjg( B' ). * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix * op( A ) and of the matrix C. M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix * op( B ) and the number of columns of the matrix C. N must be * at least zero. * Unchanged on exit. * * K - INTEGER. * On entry, K specifies the number of columns of the matrix * op( A ) and the number of rows of the matrix op( B ). K must * be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is * k when TRANA = 'N' or 'n', and is m otherwise. * Before entry with TRANA = 'N' or 'n', the leading m by k * part of the array A must contain the matrix A, otherwise * the leading k by m part of the array A must contain the * matrix A. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When TRANA = 'N' or 'n' then * LDA must be at least max( 1, m ), otherwise LDA must be at * least max( 1, k ). * Unchanged on exit. 
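*
*     [Editor's note -- illustrative example, not part of the
*     reference source.]  The LDA argument described above lets
*     op( A ) be a block of a larger array: the leading dimension
*     passed to CGEMM is that of the parent array, not of the block.
*     A minimal sketch, with arbitrary example data and names:
*
      PROGRAM LDADEM
      COMPLEX BIG(4,4), B(2,2), C(2,2)
      INTEGER I, J
      DO 20 J = 1, 4
         DO 10 I = 1, 4
            BIG(I,J) = CMPLX( REAL(I), REAL(J) )
   10    CONTINUE
   20 CONTINUE
      B(1,1) = (1.0,0.0)
      B(2,1) = (0.0,0.0)
      B(1,2) = (0.0,0.0)
      B(2,2) = (1.0,0.0)
*     Multiply the top-left 2 by 2 block of BIG by B:
*     M = N = K = 2, but LDA = 4, the leading dimension of BIG.
      CALL CGEMM( 'N', 'N', 2, 2, 2, (1.0,0.0), BIG, 4, B, 2,
     $            (0.0,0.0), C, 2 )
      WRITE(*,*) C
      END
*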
* * B - COMPLEX array of DIMENSION ( LDB, kb ), where kb is * n when TRANB = 'N' or 'n', and is k otherwise. * Before entry with TRANB = 'N' or 'n', the leading k by n * part of the array B must contain the matrix B, otherwise * the leading n by k part of the array B must contain the * matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. When TRANB = 'N' or 'n' then * LDB must be at least max( 1, k ), otherwise LDB must be at * least max( 1, n ). * Unchanged on exit. * * BETA - COMPLEX . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then C need not be set on input. * Unchanged on exit. * * C - COMPLEX array of DIMENSION ( LDC, n ). * Before entry, the leading m by n part of the array C must * contain the matrix C, except when beta is zero, in which * case C need not be set on entry. * On exit, the array C is overwritten by the m by n matrix * ( alpha*op( A )*op( B ) + beta*C ). * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC CONJG,MAX * .. * .. Local Scalars .. COMPLEX TEMP INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB LOGICAL CONJA,CONJB,NOTA,NOTB * .. * .. Parameters .. COMPLEX ONE PARAMETER (ONE= (1.0E+0,0.0E+0)) COMPLEX ZERO PARAMETER (ZERO= (0.0E+0,0.0E+0)) * .. * * Set NOTA and NOTB as true if A and B respectively are not * conjugated or transposed, set CONJA and CONJB as true if A and * B respectively are to be transposed but not conjugated and set * NROWA, NCOLA and NROWB as the number of rows and columns of A * and the number of rows of B respectively. * NOTA = LSAME(TRANA,'N') NOTB = LSAME(TRANB,'N') CONJA = LSAME(TRANA,'C') CONJB = LSAME(TRANB,'C') IF (NOTA) THEN NROWA = M NCOLA = K ELSE NROWA = K NCOLA = M END IF IF (NOTB) THEN NROWB = K ELSE NROWB = N END IF * * Test the input parameters. * INFO = 0 IF ((.NOT.NOTA) .AND. (.NOT.CONJA) .AND. + (.NOT.LSAME(TRANA,'T'))) THEN INFO = 1 ELSE IF ((.NOT.NOTB) .AND. (.NOT.CONJB) .AND. + (.NOT.LSAME(TRANB,'T'))) THEN INFO = 2 ELSE IF (M.LT.0) THEN INFO = 3 ELSE IF (N.LT.0) THEN INFO = 4 ELSE IF (K.LT.0) THEN INFO = 5 ELSE IF (LDA.LT.MAX(1,NROWA)) THEN INFO = 8 ELSE IF (LDB.LT.MAX(1,NROWB)) THEN INFO = 10 ELSE IF (LDC.LT.MAX(1,M)) THEN INFO = 13 END IF IF (INFO.NE.0) THEN CALL XERBLA('CGEMM ',INFO) RETURN END IF * * Quick return if possible. * IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN * * And when alpha.eq.zero. * IF (ALPHA.EQ.ZERO) THEN IF (BETA.EQ.ZERO) THEN DO 20 J = 1,N DO 10 I = 1,M C(I,J) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1,N DO 30 I = 1,M C(I,J) = BETA*C(I,J) 30 CONTINUE 40 CONTINUE END IF RETURN END IF * * Start the operations. * IF (NOTB) THEN IF (NOTA) THEN * * Form C := alpha*A*B + beta*C. 
* DO 90 J = 1,N IF (BETA.EQ.ZERO) THEN DO 50 I = 1,M C(I,J) = ZERO 50 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 60 I = 1,M C(I,J) = BETA*C(I,J) 60 CONTINUE END IF DO 80 L = 1,K IF (B(L,J).NE.ZERO) THEN TEMP = ALPHA*B(L,J) DO 70 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 70 CONTINUE END IF 80 CONTINUE 90 CONTINUE ELSE IF (CONJA) THEN * * Form C := alpha*conjg( A' )*B + beta*C. * DO 120 J = 1,N DO 110 I = 1,M TEMP = ZERO DO 100 L = 1,K TEMP = TEMP + CONJG(A(L,I))*B(L,J) 100 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 110 CONTINUE 120 CONTINUE ELSE * * Form C := alpha*A'*B + beta*C * DO 150 J = 1,N DO 140 I = 1,M TEMP = ZERO DO 130 L = 1,K TEMP = TEMP + A(L,I)*B(L,J) 130 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 140 CONTINUE 150 CONTINUE END IF ELSE IF (NOTA) THEN IF (CONJB) THEN * * Form C := alpha*A*conjg( B' ) + beta*C. * DO 200 J = 1,N IF (BETA.EQ.ZERO) THEN DO 160 I = 1,M C(I,J) = ZERO 160 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 170 I = 1,M C(I,J) = BETA*C(I,J) 170 CONTINUE END IF DO 190 L = 1,K IF (B(J,L).NE.ZERO) THEN TEMP = ALPHA*CONJG(B(J,L)) DO 180 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 180 CONTINUE END IF 190 CONTINUE 200 CONTINUE ELSE * * Form C := alpha*A*B' + beta*C * DO 250 J = 1,N IF (BETA.EQ.ZERO) THEN DO 210 I = 1,M C(I,J) = ZERO 210 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 220 I = 1,M C(I,J) = BETA*C(I,J) 220 CONTINUE END IF DO 240 L = 1,K IF (B(J,L).NE.ZERO) THEN TEMP = ALPHA*B(J,L) DO 230 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 230 CONTINUE END IF 240 CONTINUE 250 CONTINUE END IF ELSE IF (CONJA) THEN IF (CONJB) THEN * * Form C := alpha*conjg( A' )*conjg( B' ) + beta*C. * DO 280 J = 1,N DO 270 I = 1,M TEMP = ZERO DO 260 L = 1,K TEMP = TEMP + CONJG(A(L,I))*CONJG(B(J,L)) 260 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 270 CONTINUE 280 CONTINUE ELSE * * Form C := alpha*conjg( A' )*B' + beta*C * DO 310 J = 1,N DO 300 I = 1,M TEMP = ZERO DO 290 L = 1,K TEMP = TEMP + CONJG(A(L,I))*B(J,L) 290 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 300 CONTINUE 310 CONTINUE END IF ELSE IF (CONJB) THEN * * Form C := alpha*A'*conjg( B' ) + beta*C * DO 340 J = 1,N DO 330 I = 1,M TEMP = ZERO DO 320 L = 1,K TEMP = TEMP + A(L,I)*CONJG(B(J,L)) 320 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 330 CONTINUE 340 CONTINUE ELSE * * Form C := alpha*A'*B' + beta*C * DO 370 J = 1,N DO 360 I = 1,M TEMP = ZERO DO 350 L = 1,K TEMP = TEMP + A(L,I)*B(J,L) 350 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 360 CONTINUE 370 CONTINUE END IF END IF * RETURN * * End of CGEMM . * END OpenBLAS-0.2.20/reference/cgemvf.f000066400000000000000000000236171313527062700165310ustar00rootroot00000000000000 SUBROUTINE CGEMVF ( TRANS, M, N, ALPHA, A, LDA, X, INCX, $ BETA, Y, INCY ) * .. Scalar Arguments .. COMPLEX ALPHA, BETA INTEGER INCX, INCY, LDA, M, N CHARACTER*1 TRANS * .. Array Arguments .. COMPLEX A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * CGEMV performs one of the matrix-vector operations * * y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, or * * y := alpha*conjg( A' )*x + beta*y, * * where alpha and beta are scalars, x and y are vectors and A is an * m by n matrix. * * Parameters * ========== * * TRANS - CHARACTER*1. 
* On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' y := alpha*A*x + beta*y. * * TRANS = 'T' or 't' y := alpha*A'*x + beta*y. * * TRANS = 'C' or 'c' y := alpha*conjg( A' )*x + beta*y. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix A. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, n ). * Before entry, the leading m by n part of the array A must * contain the matrix of coefficients. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, m ). * Unchanged on exit. * * X - COMPLEX array of DIMENSION at least * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. * Before entry, the incremented array X must contain the * vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - COMPLEX . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y - COMPLEX array of DIMENSION at least * ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. * Before entry with BETA non-zero, the incremented array Y * must contain the vector y. On exit, Y is overwritten by the * updated vector y. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. Local Scalars .. COMPLEX TEMP INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY, LENX, LENY LOGICAL NOCONJ, NOTRANS, XCONJ * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC CONJG, MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'R' ).AND. $ .NOT.LSAME( TRANS, 'C' ).AND. $ .NOT.LSAME( TRANS, 'O' ).AND. $ .NOT.LSAME( TRANS, 'U' ).AND. $ .NOT.LSAME( TRANS, 'S' ).AND. $ .NOT.LSAME( TRANS, 'D' ) )THEN INFO = 1 ELSE IF( M.LT.0 )THEN INFO = 2 ELSE IF( N.LT.0 )THEN INFO = 3 ELSE IF( LDA.LT.MAX( 1, M ) )THEN INFO = 6 ELSE IF( INCX.EQ.0 )THEN INFO = 8 ELSE IF( INCY.EQ.0 )THEN INFO = 11 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CGEMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * NOCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'U' )) NOTRANS = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ) $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'S' )) XCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) $ .OR. LSAME( TRANS, 'R' ) .OR. 
LSAME( TRANS, 'C' )) * * Set LENX and LENY, the lengths of the vectors x and y, and set * up the start points in X and Y. * IF(NOTRANS)THEN LENX = N LENY = M ELSE LENX = M LENY = N END IF IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( LENX - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( LENY - 1 )*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * * First form y := beta*y. * IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, LENY Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, LENY Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, LENY Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, LENY Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN IF(NOTRANS)THEN * * Form y := alpha*A*x + y. * JX = KX IF( INCY.EQ.1 )THEN DO 60, J = 1, N IF( X( JX ).NE.ZERO )THEN IF (XCONJ) THEN TEMP = ALPHA*X( JX ) ELSE TEMP = ALPHA*CONJG(X( JX )) ENDIF IF (NOCONJ) THEN DO 50, I = 1, M Y( I ) = Y( I ) + TEMP*A( I, J ) 50 CONTINUE ELSE DO 55, I = 1, M Y( I ) = Y( I ) + TEMP*CONJG(A( I, J )) 55 CONTINUE ENDIF END IF JX = JX + INCX 60 CONTINUE ELSE DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN IF (XCONJ) THEN TEMP = ALPHA*X( JX ) ELSE TEMP = ALPHA*CONJG(X( JX )) ENDIF IY = KY IF (NOCONJ) THEN DO 70, I = 1, M Y( IY ) = Y( IY ) + TEMP*A( I, J ) IY = IY + INCY 70 CONTINUE ELSE DO 75, I = 1, M Y( IY ) = Y( IY ) + TEMP* CONJG(A( I, J )) IY = IY + INCY 75 CONTINUE ENDIF END IF JX = JX + INCX 80 CONTINUE END IF ELSE * * Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. * JY = KY IF( INCX.EQ.1 )THEN DO 110, J = 1, N TEMP = ZERO IF( NOCONJ )THEN DO 90, I = 1, M IF (XCONJ) THEN TEMP = TEMP + A( I, J )*X( I ) ELSE TEMP = TEMP + A( I, J )*CONJG(X( I )) ENDIF 90 CONTINUE ELSE DO 100, I = 1, M IF (XCONJ) THEN TEMP = TEMP + CONJG( A( I, J ) )*X( I ) ELSE TEMP = TEMP + CONJG( A( I, J ) )*CONJG(X( I )) ENDIF 100 CONTINUE END IF Y( JY ) = Y( JY ) + ALPHA*TEMP JY = JY + INCY 110 CONTINUE ELSE DO 140, J = 1, N TEMP = ZERO IX = KX IF( NOCONJ )THEN DO 120, I = 1, M IF (XCONJ) THEN TEMP = TEMP + A( I, J )*X( IX ) ELSE TEMP = TEMP + A( I, J )*CONJG(X( IX )) ENDIF IX = IX + INCX 120 CONTINUE ELSE DO 130, I = 1, M IF (XCONJ) THEN TEMP = TEMP + CONJG( A( I, J ) )*X( IX ) ELSE TEMP = TEMP + CONJG( A( I, J ) )*CONJG(X( IX )) ENDIF IX = IX + INCX 130 CONTINUE END IF Y( JY ) = Y( JY ) + ALPHA*TEMP JY = JY + INCY 140 CONTINUE END IF END IF * RETURN * * End of CGEMV . * END OpenBLAS-0.2.20/reference/cgercf.f000066400000000000000000000104751313527062700165110ustar00rootroot00000000000000 SUBROUTINE CGERCF ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA ) * .. Scalar Arguments .. COMPLEX ALPHA INTEGER INCX, INCY, LDA, M, N * .. Array Arguments .. COMPLEX A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * CGERC performs the rank 1 operation * * A := alpha*x*conjg( y' ) + A, * * where alpha is a scalar, x is an m element vector, y is an n element * vector and A is an m by n matrix. * * Parameters * ========== * * M - INTEGER. * On entry, M specifies the number of rows of the matrix A. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - COMPLEX array of dimension at least * ( 1 + ( m - 1 )*abs( INCX ) ). 
* Before entry, the incremented array X must contain the m * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. * Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, n ). * Before entry, the leading m by n part of the array A must * contain the matrix of coefficients. On exit, A is * overwritten by the updated matrix. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, m ). * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. Local Scalars .. COMPLEX TEMP INTEGER I, INFO, IX, J, JY, KX * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC CONJG, MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( M.LT.0 )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( INCY.EQ.0 )THEN INFO = 7 ELSE IF( LDA.LT.MAX( 1, M ) )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CGERC ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * IF( INCY.GT.0 )THEN JY = 1 ELSE JY = 1 - ( N - 1 )*INCY END IF IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( Y( JY ).NE.ZERO )THEN TEMP = ALPHA*CONJG( Y( JY ) ) DO 10, I = 1, M A( I, J ) = A( I, J ) + X( I )*TEMP 10 CONTINUE END IF JY = JY + INCY 20 CONTINUE ELSE IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( M - 1 )*INCX END IF DO 40, J = 1, N IF( Y( JY ).NE.ZERO )THEN TEMP = ALPHA*CONJG( Y( JY ) ) IX = KX DO 30, I = 1, M A( I, J ) = A( I, J ) + X( IX )*TEMP IX = IX + INCX 30 CONTINUE END IF JY = JY + INCY 40 CONTINUE END IF * RETURN * * End of CGERC . * END OpenBLAS-0.2.20/reference/cgeruf.f000066400000000000000000000104331313527062700165250ustar00rootroot00000000000000 SUBROUTINE CGERUF ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA ) * .. Scalar Arguments .. COMPLEX ALPHA INTEGER INCX, INCY, LDA, M, N * .. Array Arguments .. COMPLEX A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * CGERU performs the rank 1 operation * * A := alpha*x*y' + A, * * where alpha is a scalar, x is an m element vector, y is an n element * vector and A is an m by n matrix. * * Parameters * ========== * * M - INTEGER. * On entry, M specifies the number of rows of the matrix A. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - COMPLEX array of dimension at least * ( 1 + ( m - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the m * element vector x. * Unchanged on exit. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. * Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, n ). * Before entry, the leading m by n part of the array A must * contain the matrix of coefficients. On exit, A is * overwritten by the updated matrix. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, m ). * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. Local Scalars .. COMPLEX TEMP INTEGER I, INFO, IX, J, JY, KX * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( M.LT.0 )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( INCY.EQ.0 )THEN INFO = 7 ELSE IF( LDA.LT.MAX( 1, M ) )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CGERU ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * IF( INCY.GT.0 )THEN JY = 1 ELSE JY = 1 - ( N - 1 )*INCY END IF IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( Y( JY ).NE.ZERO )THEN TEMP = ALPHA*Y( JY ) DO 10, I = 1, M A( I, J ) = A( I, J ) + X( I )*TEMP 10 CONTINUE END IF JY = JY + INCY 20 CONTINUE ELSE IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( M - 1 )*INCX END IF DO 40, J = 1, N IF( Y( JY ).NE.ZERO )THEN TEMP = ALPHA*Y( JY ) IX = KX DO 30, I = 1, M A( I, J ) = A( I, J ) + X( IX )*TEMP IX = IX + INCX 30 CONTINUE END IF JY = JY + INCY 40 CONTINUE END IF * RETURN * * End of CGERU . * END OpenBLAS-0.2.20/reference/cgesvf.f000066400000000000000000000063721313527062700165360ustar00rootroot00000000000000 SUBROUTINE CGESVF( N, NRHS, A, LDA, IPIV, B, LDB, INFO ) * * -- LAPACK driver routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER INFO, LDA, LDB, N, NRHS * .. * .. Array Arguments .. INTEGER IPIV( * ) COMPLEX A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * CGESV computes the solution to a complex system of linear equations * A * X = B, * where A is an N-by-N matrix and X and B are N-by-NRHS matrices. * * The LU decomposition with partial pivoting and row interchanges is * used to factor A as * A = P * L * U, * where P is a permutation matrix, L is unit lower triangular, and U is * upper triangular. The factored form of A is then used to solve the * system of equations A * X = B. * * Arguments * ========= * * N (input) INTEGER * The number of linear equations, i.e., the order of the * matrix A. N >= 0. * * NRHS (input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrix B. NRHS >= 0. * * A (input/output) COMPLEX array, dimension (LDA,N) * On entry, the N-by-N coefficient matrix A. 
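*
*     [Editor's note -- illustrative example, not part of the
*     reference source.]  A minimal sketch of solving A*X = B through
*     the exported LAPACK driver CGESV documented above; the 2 by 2
*     system and all variable names are arbitrary example choices.
*
      PROGRAM GESVEX
      COMPLEX A(2,2), B(2,1)
      INTEGER IPIV(2), INFO
      A(1,1) = (2.0,0.0)
      A(2,1) = (1.0,1.0)
      A(1,2) = (0.0,-1.0)
      A(2,2) = (3.0,0.0)
      B(1,1) = (1.0,0.0)
      B(2,1) = (0.0,2.0)
*     On exit A holds its LU factors and B is overwritten by X.
      CALL CGESV( 2, 1, A, 2, IPIV, B, 2, INFO )
      IF( INFO.NE.0 ) THEN
         WRITE(*,*) 'CGESV returned INFO = ', INFO
      ELSE
         WRITE(*,*) 'X = ', B
      END IF
      END
*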
* On exit, the factors L and U from the factorization * A = P*L*U; the unit diagonal elements of L are not stored. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * IPIV (output) INTEGER array, dimension (N) * The pivot indices that define the permutation matrix P; * row i of the matrix was interchanged with row IPIV(i). * * B (input/output) COMPLEX array, dimension (LDB,NRHS) * On entry, the N-by-NRHS matrix of right hand side matrix B. * On exit, if INFO = 0, the N-by-NRHS solution matrix X. * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, U(i,i) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, so the solution could not be computed. * * ===================================================================== * * .. External Subroutines .. EXTERNAL CGETRF, CGETRS, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( N.LT.0 ) THEN INFO = -1 ELSE IF( NRHS.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CGESV ', -INFO ) RETURN END IF * * Compute the LU factorization of A. * CALL CGETRF( N, N, A, LDA, IPIV, INFO ) IF( INFO.EQ.0 ) THEN * * Solve the system A*X = B, overwriting B with X. * CALL CGETRS( 'No transpose', N, NRHS, A, LDA, IPIV, B, LDB, $ INFO ) END IF RETURN * * End of CGESV * END OpenBLAS-0.2.20/reference/cgetf2f.f000066400000000000000000000074231313527062700165770ustar00rootroot00000000000000 SUBROUTINE CGETF2F( M, N, A, LDA, IPIV, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N * .. * .. Array Arguments .. INTEGER IPIV( * ) COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * CGETF2 computes an LU factorization of a general m-by-n matrix A * using partial pivoting with row interchanges. * * The factorization has the form * A = P * L * U * where P is a permutation matrix, L is lower triangular with unit * diagonal elements (lower trapezoidal if m > n), and U is upper * triangular (upper trapezoidal if m < n). * * This is the right-looking Level 2 BLAS version of the algorithm. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (input/output) COMPLEX array, dimension (LDA,N) * On entry, the m by n matrix to be factored. * On exit, the factors L and U from the factorization * A = P*L*U; the unit diagonal elements of L are not stored. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,M). * * IPIV (output) INTEGER array, dimension (min(M,N)) * The pivot indices; for 1 <= i <= min(M,N), row i of the * matrix was interchanged with row IPIV(i). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * > 0: if INFO = k, U(k,k) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * ===================================================================== * * .. 
Parameters .. COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER J, JP * .. * .. External Functions .. INTEGER ICAMAX EXTERNAL ICAMAX * .. * .. External Subroutines .. EXTERNAL CGERU, CSCAL, CSWAP, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CGETF2', -INFO ) RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * DO 10 J = 1, MIN( M, N ) * * Find pivot and test for singularity. * JP = J - 1 + ICAMAX( M-J+1, A( J, J ), 1 ) IPIV( J ) = JP IF( A( JP, J ).NE.ZERO ) THEN * * Apply the interchange to columns 1:N. * IF( JP.NE.J ) $ CALL CSWAP( N, A( J, 1 ), LDA, A( JP, 1 ), LDA ) * * Compute elements J+1:M of J-th column. * IF( J.LT.M ) $ CALL CSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) * ELSE IF( INFO.EQ.0 ) THEN * INFO = J END IF * IF( J.LT.MIN( M, N ) ) THEN * * Update trailing submatrix. * CALL CGERU( M-J, N-J, -ONE, A( J+1, J ), 1, A( J, J+1 ), $ LDA, A( J+1, J+1 ), LDA ) END IF 10 CONTINUE RETURN * * End of CGETF2 * END OpenBLAS-0.2.20/reference/cgetrff.f000066400000000000000000000107701313527062700166760ustar00rootroot00000000000000 SUBROUTINE CGETRFF( M, N, A, LDA, IPIV, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N * .. * .. Array Arguments .. INTEGER IPIV( * ) COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * CGETRF computes an LU factorization of a general M-by-N matrix A * using partial pivoting with row interchanges. * * The factorization has the form * A = P * L * U * where P is a permutation matrix, L is lower triangular with unit * diagonal elements (lower trapezoidal if m > n), and U is upper * triangular (upper trapezoidal if m < n). * * This is the right-looking Level 3 BLAS version of the algorithm. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (input/output) COMPLEX array, dimension (LDA,N) * On entry, the M-by-N matrix to be factored. * On exit, the factors L and U from the factorization * A = P*L*U; the unit diagonal elements of L are not stored. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,M). * * IPIV (output) INTEGER array, dimension (min(M,N)) * The pivot indices; for 1 <= i <= min(M,N), row i of the * matrix was interchanged with row IPIV(i). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, U(i,i) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * ===================================================================== * * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, IINFO, J, JB, NB * .. * .. External Subroutines .. EXTERNAL CGEMM, CGETF2, CLASWP, CTRSM, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. 
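*
*     [Editor's note -- illustrative example, not part of the
*     reference source.]  A sketch of calling the exported CGETRF and
*     checking INFO as documented above; the matrix data and names
*     are arbitrary example choices.
*
      PROGRAM GETRFX
      COMPLEX A(2,2)
      INTEGER IPIV(2), INFO
      A(1,1) = (4.0,0.0)
      A(2,1) = (0.0,1.0)
      A(1,2) = (1.0,0.0)
      A(2,2) = (2.0,0.0)
      CALL CGETRF( 2, 2, A, 2, IPIV, INFO )
      IF( INFO.GT.0 ) THEN
*        U(INFO,INFO) is exactly zero: the factors exist but U is
*        singular, so a subsequent solve would divide by zero.
         WRITE(*,*) 'singular U, INFO = ', INFO
      ELSE IF( INFO.LT.0 ) THEN
         WRITE(*,*) 'illegal argument number ', -INFO
      ELSE
         WRITE(*,*) 'pivot indices: ', IPIV
      END IF
      END
*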
* INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CGETRF', -INFO ) RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Determine the block size for this environment. * NB = 64 IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN * * Use unblocked code. * CALL CGETF2( M, N, A, LDA, IPIV, INFO ) ELSE * * Use blocked code. * DO 20 J = 1, MIN( M, N ), NB JB = MIN( MIN( M, N )-J+1, NB ) * * Factor diagonal and subdiagonal blocks and test for exact * singularity. * CALL CGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) * * Adjust INFO and the pivot indices. * IF( INFO.EQ.0 .AND. IINFO.GT.0 ) $ INFO = IINFO + J - 1 DO 10 I = J, MIN( M, J+JB-1 ) IPIV( I ) = J - 1 + IPIV( I ) 10 CONTINUE * * Apply interchanges to columns 1:J-1. * CALL CLASWP( J-1, A, LDA, J, J+JB-1, IPIV, 1 ) * IF( J+JB.LE.N ) THEN * * Apply interchanges to columns J+JB:N. * CALL CLASWP( N-J-JB+1, A( 1, J+JB ), LDA, J, J+JB-1, $ IPIV, 1 ) * * Compute block row of U. * CALL CTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, $ N-J-JB+1, ONE, A( J, J ), LDA, A( J, J+JB ), $ LDA ) IF( J+JB.LE.M ) THEN * * Update trailing submatrix. * CALL CGEMM( 'No transpose', 'No transpose', M-J-JB+1, $ N-J-JB+1, JB, -ONE, A( J+JB, J ), LDA, $ A( J, J+JB ), LDA, ONE, A( J+JB, J+JB ), $ LDA ) END IF END IF 20 CONTINUE END IF RETURN * * End of CGETRF * END OpenBLAS-0.2.20/reference/cgetrsf.f000066400000000000000000000102141313527062700167040ustar00rootroot00000000000000 SUBROUTINE CGETRSF( TRANS, N, NRHS, A, LDA, IPIV, B, LDB, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER INFO, LDA, LDB, N, NRHS * .. * .. Array Arguments .. INTEGER IPIV( * ) COMPLEX A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * CGETRS solves a system of linear equations * A * X = B, A**T * X = B, or A**H * X = B * with a general N-by-N matrix A using the LU factorization computed * by CGETRF. * * Arguments * ========= * * TRANS (input) CHARACTER*1 * Specifies the form of the system of equations: * = 'N': A * X = B (No transpose) * = 'T': A**T * X = B (Transpose) * = 'C': A**H * X = B (Conjugate transpose) * * N (input) INTEGER * The order of the matrix A. N >= 0. * * NRHS (input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrix B. NRHS >= 0. * * A (input) COMPLEX array, dimension (LDA,N) * The factors L and U from the factorization A = P*L*U * as computed by CGETRF. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * IPIV (input) INTEGER array, dimension (N) * The pivot indices from CGETRF; for 1<=i<=N, row i of the * matrix was interchanged with row IPIV(i). * * B (input/output) COMPLEX array, dimension (LDB,NRHS) * On entry, the right hand side matrix B. * On exit, the solution matrix X. * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL NOTRAN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. 
EXTERNAL CLASWP, CTRSM, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 NOTRAN = LSAME( TRANS, 'N' ) .OR. LSAME(TRANS, 'R') IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. $ LSAME( TRANS, 'C' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( NRHS.LT.0 ) THEN INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CGETRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. NRHS.EQ.0 ) $ RETURN * IF( NOTRAN ) THEN * * Solve A * X = B. * * Apply row interchanges to the right hand sides. * CALL CLASWP( NRHS, B, LDB, 1, N, IPIV, 1 ) * * Solve L*X = B, overwriting B with X. * CALL CTRSM( 'Left', 'Lower', TRANS, 'Unit', N, NRHS, $ ONE, A, LDA, B, LDB ) * * Solve U*X = B, overwriting B with X. * CALL CTRSM( 'Left', 'Upper', TRANS, 'Non-unit', N, $ NRHS, ONE, A, LDA, B, LDB ) ELSE * * Solve A**T * X = B or A**H * X = B. * * Solve U'*X = B, overwriting B with X. * CALL CTRSM( 'Left', 'Upper', TRANS, 'Non-unit', N, NRHS, ONE, $ A, LDA, B, LDB ) * * Solve L'*X = B, overwriting B with X. * CALL CTRSM( 'Left', 'Lower', TRANS, 'Unit', N, NRHS, ONE, A, $ LDA, B, LDB ) * * Apply row interchanges to the solution vectors. * CALL CLASWP( NRHS, B, LDB, 1, N, IPIV, -1 ) END IF * RETURN * * End of CGETRS * END OpenBLAS-0.2.20/reference/chbmvf.f000066400000000000000000000236371313527062700165310ustar00rootroot00000000000000 SUBROUTINE CHBMVF( UPLO, N, K, ALPHA, A, LDA, X, INCX, $ BETA, Y, INCY ) * .. Scalar Arguments .. COMPLEX ALPHA, BETA INTEGER INCX, INCY, K, LDA, N CHARACTER*1 UPLO * .. Array Arguments .. COMPLEX A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * CHBMV performs the matrix-vector operation * * y := alpha*A*x + beta*y, * * where alpha and beta are scalars, x and y are n element vectors and * A is an n by n hermitian band matrix, with k super-diagonals. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the band matrix A is being supplied as * follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * being supplied. * * UPLO = 'L' or 'l' The lower triangular part of A is * being supplied. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * K - INTEGER. * On entry, K specifies the number of super-diagonals of the * matrix A. K must satisfy 0 .le. K. * Unchanged on exit. * * ALPHA - COMPLEX . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) * by n part of the array A must contain the upper triangular * band part of the hermitian matrix, supplied column by * column, with the leading diagonal of the matrix in row * ( k + 1 ) of the array, the first super-diagonal starting at * position 2 in row k, and so on. The top left k by k triangle * of the array A is not referenced. 
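*
*     [Editor's note -- illustrative example, not part of the
*     reference source.]  A sketch of reusing one CGETRF factorization
*     with the exported CGETRS documented above, here solving the
*     conjugate-transposed system A**H * X = B via TRANS = 'C'.  The
*     data and names are arbitrary example choices.
*
      PROGRAM GETRSX
      COMPLEX A(2,2), B(2,1)
      INTEGER IPIV(2), INFO
      A(1,1) = (3.0,0.0)
      A(2,1) = (1.0,-1.0)
      A(1,2) = (0.0,2.0)
      A(2,2) = (5.0,0.0)
      B(1,1) = (1.0,0.0)
      B(2,1) = (2.0,0.0)
*     Factor A = P*L*U once ...
      CALL CGETRF( 2, 2, A, 2, IPIV, INFO )
      IF( INFO.NE.0 ) STOP
*     ... then solve A**H * X = B with the stored factors and pivots.
      CALL CGETRS( 'C', 2, 1, A, 2, IPIV, B, 2, INFO )
      WRITE(*,*) 'X = ', B
      END
*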
* The following program segment will transfer the upper * triangular part of a hermitian band matrix from conventional * full matrix storage to band storage: * * DO 20, J = 1, N * M = K + 1 - J * DO 10, I = MAX( 1, J - K ), J * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) * by n part of the array A must contain the lower triangular * band part of the hermitian matrix, supplied column by * column, with the leading diagonal of the matrix in row 1 of * the array, the first sub-diagonal starting at position 1 in * row 2, and so on. The bottom right k by k triangle of the * array A is not referenced. * The following program segment will transfer the lower * triangular part of a hermitian band matrix from conventional * full matrix storage to band storage: * * DO 20, J = 1, N * M = 1 - J * DO 10, I = J, MIN( N, J + K ) * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Note that the imaginary parts of the diagonal elements need * not be set and are assumed to be zero. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * ( k + 1 ). * Unchanged on exit. * * X - COMPLEX array of DIMENSION at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the * vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - COMPLEX . * On entry, BETA specifies the scalar beta. * Unchanged on exit. * * Y - COMPLEX array of DIMENSION at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the * vector y. On exit, Y is overwritten by the updated vector y. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. Local Scalars .. COMPLEX TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC CONJG, MAX, MIN, REAL * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( K.LT.0 )THEN INFO = 3 ELSE IF( LDA.LT.( K + 1 ) )THEN INFO = 6 ELSE IF( INCX.EQ.0 )THEN INFO = 8 ELSE IF( INCY.EQ.0 )THEN INFO = 11 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CHBMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF * * Start the operations. In this version the elements of the array A * are accessed sequentially with one pass through A. * * First form y := beta*y. 
* IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, N Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, N Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN IF( LSAME( UPLO, 'U' ) )THEN * * Form y when upper triangle of A is stored. * KPLUS1 = K + 1 IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO L = KPLUS1 - J DO 50, I = MAX( 1, J - K ), J - 1 Y( I ) = Y( I ) + TEMP1*A( L + I, J ) TEMP2 = TEMP2 + CONJG( A( L + I, J ) )*X( I ) 50 CONTINUE Y( J ) = Y( J ) + TEMP1*REAL( A( KPLUS1, J ) ) $ + ALPHA*TEMP2 60 CONTINUE ELSE JX = KX JY = KY DO 80, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY L = KPLUS1 - J DO 70, I = MAX( 1, J - K ), J - 1 Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) TEMP2 = TEMP2 + CONJG( A( L + I, J ) )*X( IX ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1*REAL( A( KPLUS1, J ) ) $ + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY IF( J.GT.K )THEN KX = KX + INCX KY = KY + INCY END IF 80 CONTINUE END IF ELSE * * Form y when lower triangle of A is stored. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 100, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*REAL( A( 1, J ) ) L = 1 - J DO 90, I = J + 1, MIN( N, J + K ) Y( I ) = Y( I ) + TEMP1*A( L + I, J ) TEMP2 = TEMP2 + CONJG( A( L + I, J ) )*X( I ) 90 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 100 CONTINUE ELSE JX = KX JY = KY DO 120, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*REAL( A( 1, J ) ) L = 1 - J IX = JX IY = JY DO 110, I = J + 1, MIN( N, J + K ) IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) TEMP2 = TEMP2 + CONJG( A( L + I, J ) )*X( IX ) 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 120 CONTINUE END IF END IF * RETURN * * End of CHBMV . * END OpenBLAS-0.2.20/reference/chemm3mf.f000066400000000000000000000241231313527062700167520ustar00rootroot00000000000000 SUBROUTINE CHEMM3MF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC ) * .. Scalar Arguments .. CHARACTER*1 SIDE, UPLO INTEGER M, N, LDA, LDB, LDC COMPLEX ALPHA, BETA * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) * .. * * Purpose * ======= * * CHEMM performs one of the matrix-matrix operations * * C := alpha*A*B + beta*C, * * or * * C := alpha*B*A + beta*C, * * where alpha and beta are scalars, A is an hermitian matrix and B and * C are m by n matrices. * * Parameters * ========== * * SIDE - CHARACTER*1. * On entry, SIDE specifies whether the hermitian matrix A * appears on the left or right in the operation as follows: * * SIDE = 'L' or 'l' C := alpha*A*B + beta*C, * * SIDE = 'R' or 'r' C := alpha*B*A + beta*C, * * Unchanged on exit. * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the hermitian matrix A is to be * referenced as follows: * * UPLO = 'U' or 'u' Only the upper triangular part of the * hermitian matrix is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of the * hermitian matrix is to be referenced. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix C. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix C. * N must be at least zero. * Unchanged on exit. 
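*
*     [Editor's note -- illustrative example, not part of the
*     reference source.]  A sketch of the band storage that CHBMV
*     (implemented above as CHBMVF) expects with UPLO = 'U': row
*     K + 1 of the array holds the diagonal and row K holds the first
*     super-diagonal, as in the packing loop shown in the comments.
*     The data and names are arbitrary example choices.
*
      PROGRAM HBMVEX
      INTEGER N, K
      PARAMETER ( N = 4, K = 1 )
      COMPLEX AB(K+1,N), X(N), Y(N)
      INTEGER J
      DO 10 J = 1, N
*        Diagonal entries; their imaginary parts are assumed zero.
         AB(K+1,J) = CMPLX( REAL(J), 0.0 )
*        First super-diagonal, stored as AB(K,J) = a(J-1,J).
         IF( J.GT.1 ) AB(K,J) = (0.5,0.5)
         X(J) = (1.0,0.0)
   10 CONTINUE
*     y := 1*A*x + 0*y for the Hermitian band matrix A, LDA = K + 1.
      CALL CHBMV( 'U', N, K, (1.0,0.0), AB, K+1, X, 1,
     $            (0.0,0.0), Y, 1 )
      WRITE(*,*) Y
      END
*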
* * ALPHA - COMPLEX . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is * m when SIDE = 'L' or 'l' and is n otherwise. * Before entry with SIDE = 'L' or 'l', the m by m part of * the array A must contain the hermitian matrix, such that * when UPLO = 'U' or 'u', the leading m by m upper triangular * part of the array A must contain the upper triangular part * of the hermitian matrix and the strictly lower triangular * part of A is not referenced, and when UPLO = 'L' or 'l', * the leading m by m lower triangular part of the array A * must contain the lower triangular part of the hermitian * matrix and the strictly upper triangular part of A is not * referenced. * Before entry with SIDE = 'R' or 'r', the n by n part of * the array A must contain the hermitian matrix, such that * when UPLO = 'U' or 'u', the leading n by n upper triangular * part of the array A must contain the upper triangular part * of the hermitian matrix and the strictly lower triangular * part of A is not referenced, and when UPLO = 'L' or 'l', * the leading n by n lower triangular part of the array A * must contain the lower triangular part of the hermitian * matrix and the strictly upper triangular part of A is not * referenced. * Note that the imaginary parts of the diagonal elements need * not be set, they are assumed to be zero. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When SIDE = 'L' or 'l' then * LDA must be at least max( 1, m ), otherwise LDA must be at * least max( 1, n ). * Unchanged on exit. * * B - COMPLEX array of DIMENSION ( LDB, n ). * Before entry, the leading m by n part of the array B must * contain the matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. LDB must be at least * max( 1, m ). * Unchanged on exit. * * BETA - COMPLEX . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then C need not be set on input. * Unchanged on exit. * * C - COMPLEX array of DIMENSION ( LDC, n ). * Before entry, the leading m by n part of the array C must * contain the matrix C, except when beta is zero, in which * case C need not be set on entry. * On exit, the array C is overwritten by the m by n updated * matrix. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC CONJG, MAX, REAL * .. Local Scalars .. LOGICAL UPPER INTEGER I, INFO, J, K, NROWA COMPLEX TEMP1, TEMP2 * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Executable Statements .. * * Set NROWA as the number of rows of A. * IF( LSAME( SIDE, 'L' ) )THEN NROWA = M ELSE NROWA = N END IF UPPER = LSAME( UPLO, 'U' ) * * Test the input parameters. * INFO = 0 IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.UPPER ).AND. 
$ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN INFO = 2 ELSE IF( M .LT.0 )THEN INFO = 3 ELSE IF( N .LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 7 ELSE IF( LDB.LT.MAX( 1, M ) )THEN INFO = 9 ELSE IF( LDC.LT.MAX( 1, M ) )THEN INFO = 12 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CHEMM3M', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN IF( BETA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, M C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40, J = 1, N DO 30, I = 1, M C( I, J ) = BETA*C( I, J ) 30 CONTINUE 40 CONTINUE END IF RETURN END IF * * Start the operations. * IF( LSAME( SIDE, 'L' ) )THEN * * Form C := alpha*A*B + beta*C. * IF( UPPER )THEN DO 70, J = 1, N DO 60, I = 1, M TEMP1 = ALPHA*B( I, J ) TEMP2 = ZERO DO 50, K = 1, I - 1 C( K, J ) = C( K, J ) + TEMP1*A( K, I ) TEMP2 = TEMP2 + $ B( K, J )*CONJG( A( K, I ) ) 50 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = TEMP1*REAL( A( I, I ) ) + $ ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ TEMP1*REAL( A( I, I ) ) + $ ALPHA*TEMP2 END IF 60 CONTINUE 70 CONTINUE ELSE DO 100, J = 1, N DO 90, I = M, 1, -1 TEMP1 = ALPHA*B( I, J ) TEMP2 = ZERO DO 80, K = I + 1, M C( K, J ) = C( K, J ) + TEMP1*A( K, I ) TEMP2 = TEMP2 + $ B( K, J )*CONJG( A( K, I ) ) 80 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = TEMP1*REAL( A( I, I ) ) + $ ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ TEMP1*REAL( A( I, I ) ) + $ ALPHA*TEMP2 END IF 90 CONTINUE 100 CONTINUE END IF ELSE * * Form C := alpha*B*A + beta*C. * DO 170, J = 1, N TEMP1 = ALPHA*REAL( A( J, J ) ) IF( BETA.EQ.ZERO )THEN DO 110, I = 1, M C( I, J ) = TEMP1*B( I, J ) 110 CONTINUE ELSE DO 120, I = 1, M C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) 120 CONTINUE END IF DO 140, K = 1, J - 1 IF( UPPER )THEN TEMP1 = ALPHA*A( K, J ) ELSE TEMP1 = ALPHA*CONJG( A( J, K ) ) END IF DO 130, I = 1, M C( I, J ) = C( I, J ) + TEMP1*B( I, K ) 130 CONTINUE 140 CONTINUE DO 160, K = J + 1, N IF( UPPER )THEN TEMP1 = ALPHA*CONJG( A( J, K ) ) ELSE TEMP1 = ALPHA*A( K, J ) END IF DO 150, I = 1, M C( I, J ) = C( I, J ) + TEMP1*B( I, K ) 150 CONTINUE 160 CONTINUE 170 CONTINUE END IF * RETURN * * End of CHEMM . * END OpenBLAS-0.2.20/reference/chemmf.f000066400000000000000000000241211313527062700165100ustar00rootroot00000000000000 SUBROUTINE CHEMMF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC ) * .. Scalar Arguments .. CHARACTER*1 SIDE, UPLO INTEGER M, N, LDA, LDB, LDC COMPLEX ALPHA, BETA * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) * .. * * Purpose * ======= * * CHEMM performs one of the matrix-matrix operations * * C := alpha*A*B + beta*C, * * or * * C := alpha*B*A + beta*C, * * where alpha and beta are scalars, A is an hermitian matrix and B and * C are m by n matrices. * * Parameters * ========== * * SIDE - CHARACTER*1. * On entry, SIDE specifies whether the hermitian matrix A * appears on the left or right in the operation as follows: * * SIDE = 'L' or 'l' C := alpha*A*B + beta*C, * * SIDE = 'R' or 'r' C := alpha*B*A + beta*C, * * Unchanged on exit. * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the hermitian matrix A is to be * referenced as follows: * * UPLO = 'U' or 'u' Only the upper triangular part of the * hermitian matrix is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of the * hermitian matrix is to be referenced. * * Unchanged on exit. * * M - INTEGER. 
* On entry, M specifies the number of rows of the matrix C. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix C. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is * m when SIDE = 'L' or 'l' and is n otherwise. * Before entry with SIDE = 'L' or 'l', the m by m part of * the array A must contain the hermitian matrix, such that * when UPLO = 'U' or 'u', the leading m by m upper triangular * part of the array A must contain the upper triangular part * of the hermitian matrix and the strictly lower triangular * part of A is not referenced, and when UPLO = 'L' or 'l', * the leading m by m lower triangular part of the array A * must contain the lower triangular part of the hermitian * matrix and the strictly upper triangular part of A is not * referenced. * Before entry with SIDE = 'R' or 'r', the n by n part of * the array A must contain the hermitian matrix, such that * when UPLO = 'U' or 'u', the leading n by n upper triangular * part of the array A must contain the upper triangular part * of the hermitian matrix and the strictly lower triangular * part of A is not referenced, and when UPLO = 'L' or 'l', * the leading n by n lower triangular part of the array A * must contain the lower triangular part of the hermitian * matrix and the strictly upper triangular part of A is not * referenced. * Note that the imaginary parts of the diagonal elements need * not be set, they are assumed to be zero. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When SIDE = 'L' or 'l' then * LDA must be at least max( 1, m ), otherwise LDA must be at * least max( 1, n ). * Unchanged on exit. * * B - COMPLEX array of DIMENSION ( LDB, n ). * Before entry, the leading m by n part of the array B must * contain the matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. LDB must be at least * max( 1, m ). * Unchanged on exit. * * BETA - COMPLEX . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then C need not be set on input. * Unchanged on exit. * * C - COMPLEX array of DIMENSION ( LDC, n ). * Before entry, the leading m by n part of the array C must * contain the matrix C, except when beta is zero, in which * case C need not be set on entry. * On exit, the array C is overwritten by the m by n updated * matrix. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC CONJG, MAX, REAL * .. Local Scalars .. LOGICAL UPPER INTEGER I, INFO, J, K, NROWA COMPLEX TEMP1, TEMP2 * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Executable Statements .. * * Set NROWA as the number of rows of A. 
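*
*     [Editor's note -- illustrative example, not part of the
*     reference source.]  A sketch of a call through the exported
*     CHEMM entry point documented above.  With UPLO = 'U' only the
*     upper triangle of the Hermitian matrix A is referenced, so its
*     strictly lower part is left unset here.  The data and names are
*     arbitrary example choices.
*
      PROGRAM HEMMEX
      COMPLEX A(2,2), B(2,2), C(2,2)
      A(1,1) = (2.0,0.0)
      A(1,2) = (1.0,1.0)
      A(2,2) = (3.0,0.0)
      B(1,1) = (1.0,0.0)
      B(2,1) = (0.0,1.0)
      B(1,2) = (2.0,0.0)
      B(2,2) = (1.0,-1.0)
*     C := 1*A*B + 0*C with A on the left.
      CALL CHEMM( 'L', 'U', 2, 2, (1.0,0.0), A, 2, B, 2,
     $            (0.0,0.0), C, 2 )
      WRITE(*,*) C
      END
*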
* IF( LSAME( SIDE, 'L' ) )THEN NROWA = M ELSE NROWA = N END IF UPPER = LSAME( UPLO, 'U' ) * * Test the input parameters. * INFO = 0 IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN INFO = 2 ELSE IF( M .LT.0 )THEN INFO = 3 ELSE IF( N .LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 7 ELSE IF( LDB.LT.MAX( 1, M ) )THEN INFO = 9 ELSE IF( LDC.LT.MAX( 1, M ) )THEN INFO = 12 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CHEMM3M', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN IF( BETA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, M C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40, J = 1, N DO 30, I = 1, M C( I, J ) = BETA*C( I, J ) 30 CONTINUE 40 CONTINUE END IF RETURN END IF * * Start the operations. * IF( LSAME( SIDE, 'L' ) )THEN * * Form C := alpha*A*B + beta*C. * IF( UPPER )THEN DO 70, J = 1, N DO 60, I = 1, M TEMP1 = ALPHA*B( I, J ) TEMP2 = ZERO DO 50, K = 1, I - 1 C( K, J ) = C( K, J ) + TEMP1*A( K, I ) TEMP2 = TEMP2 + $ B( K, J )*CONJG( A( K, I ) ) 50 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = TEMP1*REAL( A( I, I ) ) + $ ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ TEMP1*REAL( A( I, I ) ) + $ ALPHA*TEMP2 END IF 60 CONTINUE 70 CONTINUE ELSE DO 100, J = 1, N DO 90, I = M, 1, -1 TEMP1 = ALPHA*B( I, J ) TEMP2 = ZERO DO 80, K = I + 1, M C( K, J ) = C( K, J ) + TEMP1*A( K, I ) TEMP2 = TEMP2 + $ B( K, J )*CONJG( A( K, I ) ) 80 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = TEMP1*REAL( A( I, I ) ) + $ ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ TEMP1*REAL( A( I, I ) ) + $ ALPHA*TEMP2 END IF 90 CONTINUE 100 CONTINUE END IF ELSE * * Form C := alpha*B*A + beta*C. * DO 170, J = 1, N TEMP1 = ALPHA*REAL( A( J, J ) ) IF( BETA.EQ.ZERO )THEN DO 110, I = 1, M C( I, J ) = TEMP1*B( I, J ) 110 CONTINUE ELSE DO 120, I = 1, M C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) 120 CONTINUE END IF DO 140, K = 1, J - 1 IF( UPPER )THEN TEMP1 = ALPHA*A( K, J ) ELSE TEMP1 = ALPHA*CONJG( A( J, K ) ) END IF DO 130, I = 1, M C( I, J ) = C( I, J ) + TEMP1*B( I, K ) 130 CONTINUE 140 CONTINUE DO 160, K = J + 1, N IF( UPPER )THEN TEMP1 = ALPHA*CONJG( A( J, K ) ) ELSE TEMP1 = ALPHA*A( K, J ) END IF DO 150, I = 1, M C( I, J ) = C( I, J ) + TEMP1*B( I, K ) 150 CONTINUE 160 CONTINUE 170 CONTINUE END IF * RETURN * * End of CHEMM . * END OpenBLAS-0.2.20/reference/chemvf.f000066400000000000000000000252471313527062700165330ustar00rootroot00000000000000 SUBROUTINE CHEMVF ( UPLO, N, ALPHA, A, LDA, X, INCX, $ BETA, Y, INCY ) * .. Scalar Arguments .. COMPLEX ALPHA, BETA INTEGER INCX, INCY, LDA, N CHARACTER*1 UPLO * .. Array Arguments .. COMPLEX A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * CHEMV performs the matrix-vector operation * * y := alpha*A*x + beta*y, * * where alpha and beta are scalars, x and y are n element vectors and * A is an n by n hermitian matrix. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array A is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of A * is to be referenced. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX . 
* On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular part of the hermitian matrix and the strictly * lower triangular part of A is not referenced. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular part of the hermitian matrix and the strictly * upper triangular part of A is not referenced. * Note that the imaginary parts of the diagonal elements need * not be set and are assumed to be zero. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * X - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - COMPLEX . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. On exit, Y is overwritten by the updated * vector y. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. Local Scalars .. COMPLEX TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC CONJG, MAX, REAL * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ).AND. $ .NOT.LSAME( UPLO, 'V' ).AND. $ .NOT.LSAME( UPLO, 'M' ))THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 5 ELSE IF( INCX.EQ.0 )THEN INFO = 7 ELSE IF( INCY.EQ.0 )THEN INFO = 10 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CHEMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * * First form y := beta*y. 
* IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, N Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, N Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN IF( LSAME( UPLO, 'U' ) )THEN * * Form y when A is stored in upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO DO 50, I = 1, J - 1 Y( I ) = Y( I ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + CONJG( A( I, J ) )*X( I ) 50 CONTINUE Y( J ) = Y( J ) + TEMP1*REAL( A( J, J ) ) + ALPHA*TEMP2 60 CONTINUE ELSE JX = KX JY = KY DO 80, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY DO 70, I = 1, J - 1 Y( IY ) = Y( IY ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + CONJG( A( I, J ) )*X( IX ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1*REAL( A( J, J ) ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 80 CONTINUE END IF RETURN ENDIF IF( LSAME( UPLO, 'L' ) )THEN * * Form y when A is stored in lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 100, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*REAL( A( J, J ) ) DO 90, I = J + 1, N Y( I ) = Y( I ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + CONJG( A( I, J ) )*X( I ) 90 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 100 CONTINUE ELSE JX = KX JY = KY DO 120, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*REAL( A( J, J ) ) IX = JX IY = JY DO 110, I = J + 1, N IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + CONJG( A( I, J ) )*X( IX ) 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 120 CONTINUE END IF RETURN END IF IF( LSAME( UPLO, 'V' ) )THEN * * Form y when A is stored in upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 160, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO DO 150, I = 1, J - 1 Y( I ) = Y( I ) + TEMP1* CONJG(A( I, J )) TEMP2 = TEMP2 + A( I, J )*X( I ) 150 CONTINUE Y( J ) = Y( J ) + TEMP1*REAL( A( J, J ) ) + ALPHA*TEMP2 160 CONTINUE ELSE JX = KX JY = KY DO 180, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY DO 170, I = 1, J - 1 Y( IY ) = Y( IY ) + TEMP1* CONJG(A( I, J )) TEMP2 = TEMP2 + A( I, J )*X( IX ) IX = IX + INCX IY = IY + INCY 170 CONTINUE Y( JY ) = Y( JY ) + TEMP1*REAL( A( J, J ) ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 180 CONTINUE END IF RETURN ENDIF IF( LSAME( UPLO, 'M' ) )THEN * * Form y when A is stored in lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 200, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*REAL( A( J, J ) ) DO 190, I = J + 1, N Y( I ) = Y( I ) + TEMP1*CONJG(A( I, J )) TEMP2 = TEMP2 + A( I, J )*X( I ) 190 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 200 CONTINUE ELSE JX = KX JY = KY DO 220, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*REAL( A( J, J ) ) IX = JX IY = JY DO 210, I = J + 1, N IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*CONJG(A( I, J )) TEMP2 = TEMP2 + A( I, J )*X( IX ) 210 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 220 CONTINUE END IF RETURN END IF * * * End of CHEMV . * END OpenBLAS-0.2.20/reference/cher2f.f000066400000000000000000000203421313527062700164230ustar00rootroot00000000000000 SUBROUTINE CHER2F ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA ) * .. Scalar Arguments .. COMPLEX ALPHA INTEGER INCX, INCY, LDA, N CHARACTER*1 UPLO * .. 
Array Arguments .. COMPLEX A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * CHER2 performs the hermitian rank 2 operation * * A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A, * * where alpha is a scalar, x and y are n element vectors and A is an n * by n hermitian matrix. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array A is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of A * is to be referenced. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. * Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular part of the hermitian matrix and the strictly * lower triangular part of A is not referenced. On exit, the * upper triangular part of the array A is overwritten by the * upper triangular part of the updated matrix. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular part of the hermitian matrix and the strictly * upper triangular part of A is not referenced. On exit, the * lower triangular part of the array A is overwritten by the * lower triangular part of the updated matrix. * Note that the imaginary parts of the diagonal elements need * not be set, they are assumed to be zero, and on exit they * are set to zero. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. Local Scalars .. COMPLEX TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC CONJG, MAX, REAL * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( INCY.EQ.0 )THEN INFO = 7 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CHER2 ', INFO ) RETURN END IF * * Quick return if possible. 
* IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Set up the start points in X and Y if the increments are not both * unity. * IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF JX = KX JY = KY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * IF( LSAME( UPLO, 'U' ) )THEN * * Form A when A is stored in the upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 20, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*CONJG( Y( J ) ) TEMP2 = CONJG( ALPHA*X( J ) ) DO 10, I = 1, J - 1 A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 10 CONTINUE A( J, J ) = REAL( A( J, J ) ) + $ REAL( X( J )*TEMP1 + Y( J )*TEMP2 ) ELSE A( J, J ) = REAL( A( J, J ) ) END IF 20 CONTINUE ELSE DO 40, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*CONJG( Y( JY ) ) TEMP2 = CONJG( ALPHA*X( JX ) ) IX = KX IY = KY DO 30, I = 1, J - 1 A( I, J ) = A( I, J ) + X( IX )*TEMP1 $ + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 30 CONTINUE A( J, J ) = REAL( A( J, J ) ) + $ REAL( X( JX )*TEMP1 + Y( JY )*TEMP2 ) ELSE A( J, J ) = REAL( A( J, J ) ) END IF JX = JX + INCX JY = JY + INCY 40 CONTINUE END IF ELSE * * Form A when A is stored in the lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*CONJG( Y( J ) ) TEMP2 = CONJG( ALPHA*X( J ) ) A( J, J ) = REAL( A( J, J ) ) + $ REAL( X( J )*TEMP1 + Y( J )*TEMP2 ) DO 50, I = J + 1, N A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 50 CONTINUE ELSE A( J, J ) = REAL( A( J, J ) ) END IF 60 CONTINUE ELSE DO 80, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*CONJG( Y( JY ) ) TEMP2 = CONJG( ALPHA*X( JX ) ) A( J, J ) = REAL( A( J, J ) ) + $ REAL( X( JX )*TEMP1 + Y( JY )*TEMP2 ) IX = JX IY = JY DO 70, I = J + 1, N IX = IX + INCX IY = IY + INCY A( I, J ) = A( I, J ) + X( IX )*TEMP1 $ + Y( IY )*TEMP2 70 CONTINUE ELSE A( J, J ) = REAL( A( J, J ) ) END IF JX = JX + INCX JY = JY + INCY 80 CONTINUE END IF END IF * RETURN * * End of CHER2 . * END OpenBLAS-0.2.20/reference/cher2kf.f000066400000000000000000000323061313527062700166010ustar00rootroot00000000000000 SUBROUTINE CHER2KF( UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC ) * .. Scalar Arguments .. CHARACTER*1 UPLO, TRANS INTEGER N, K, LDA, LDB, LDC REAL BETA COMPLEX ALPHA * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) * .. * * Purpose * ======= * * CHER2K performs one of the hermitian rank 2k operations * * C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) + beta*C, * * or * * C := alpha*conjg( A' )*B + conjg( alpha )*conjg( B' )*A + beta*C, * * where alpha and beta are scalars with beta real, C is an n by n * hermitian matrix and A and B are n by k matrices in the first case * and k by n matrices in the second case. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array C is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of C * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of C * is to be referenced. * * Unchanged on exit. * * TRANS - CHARACTER*1. 
* On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' C := alpha*A*conjg( B' ) + * conjg( alpha )*B*conjg( A' ) + * beta*C. * * TRANS = 'C' or 'c' C := alpha*conjg( A' )*B + * conjg( alpha )*conjg( B' )*A + * beta*C. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix C. N must be * at least zero. * Unchanged on exit. * * K - INTEGER. * On entry with TRANS = 'N' or 'n', K specifies the number * of columns of the matrices A and B, and on entry with * TRANS = 'C' or 'c', K specifies the number of rows of the * matrices A and B. K must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is * k when TRANS = 'N' or 'n', and is n otherwise. * Before entry with TRANS = 'N' or 'n', the leading n by k * part of the array A must contain the matrix A, otherwise * the leading k by n part of the array A must contain the * matrix A. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When TRANS = 'N' or 'n' * then LDA must be at least max( 1, n ), otherwise LDA must * be at least max( 1, k ). * Unchanged on exit. * * B - COMPLEX array of DIMENSION ( LDB, kb ), where kb is * k when TRANS = 'N' or 'n', and is n otherwise. * Before entry with TRANS = 'N' or 'n', the leading n by k * part of the array B must contain the matrix B, otherwise * the leading k by n part of the array B must contain the * matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. When TRANS = 'N' or 'n' * then LDB must be at least max( 1, n ), otherwise LDB must * be at least max( 1, k ). * Unchanged on exit. * * BETA - REAL . * On entry, BETA specifies the scalar beta. * Unchanged on exit. * * C - COMPLEX array of DIMENSION ( LDC, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array C must contain the upper * triangular part of the hermitian matrix and the strictly * lower triangular part of C is not referenced. On exit, the * upper triangular part of the array C is overwritten by the * upper triangular part of the updated matrix. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array C must contain the lower * triangular part of the hermitian matrix and the strictly * upper triangular part of C is not referenced. On exit, the * lower triangular part of the array C is overwritten by the * lower triangular part of the updated matrix. * Note that the imaginary parts of the diagonal elements need * not be set, they are assumed to be zero, and on exit they * are set to zero. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, n ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * -- Modified 8-Nov-93 to set C(J,J) to REAL( C(J,J) ) when BETA = 1. * Ed Anderson, Cray Research Inc. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC CONJG, MAX, REAL * .. Local Scalars .. 
LOGICAL UPPER INTEGER I, INFO, J, L, NROWA COMPLEX TEMP1, TEMP2 * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Executable Statements .. * * Test the input parameters. * IF( LSAME( TRANS, 'N' ) )THEN NROWA = N ELSE NROWA = K END IF UPPER = LSAME( UPLO, 'U' ) * INFO = 0 IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. $ ( .NOT.LSAME( TRANS, 'C' ) ) )THEN INFO = 2 ELSE IF( N .LT.0 )THEN INFO = 3 ELSE IF( K .LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 7 ELSE IF( LDB.LT.MAX( 1, NROWA ) )THEN INFO = 9 ELSE IF( LDC.LT.MAX( 1, N ) )THEN INFO = 12 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CHER2K', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR. $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN IF( UPPER )THEN IF( BETA.EQ.REAL( ZERO ) )THEN DO 20, J = 1, N DO 10, I = 1, J C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40, J = 1, N DO 30, I = 1, J - 1 C( I, J ) = BETA*C( I, J ) 30 CONTINUE C( J, J ) = BETA*REAL( C( J, J ) ) 40 CONTINUE END IF ELSE IF( BETA.EQ.REAL( ZERO ) )THEN DO 60, J = 1, N DO 50, I = J, N C( I, J ) = ZERO 50 CONTINUE 60 CONTINUE ELSE DO 80, J = 1, N C( J, J ) = BETA*REAL( C( J, J ) ) DO 70, I = J + 1, N C( I, J ) = BETA*C( I, J ) 70 CONTINUE 80 CONTINUE END IF END IF RETURN END IF * * Start the operations. * IF( LSAME( TRANS, 'N' ) )THEN * * Form C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) + * C. * IF( UPPER )THEN DO 130, J = 1, N IF( BETA.EQ.REAL( ZERO ) )THEN DO 90, I = 1, J C( I, J ) = ZERO 90 CONTINUE ELSE IF( BETA.NE.ONE )THEN DO 100, I = 1, J - 1 C( I, J ) = BETA*C( I, J ) 100 CONTINUE C( J, J ) = BETA*REAL( C( J, J ) ) ELSE C( J, J ) = REAL( C( J, J ) ) END IF DO 120, L = 1, K IF( ( A( J, L ).NE.ZERO ).OR. $ ( B( J, L ).NE.ZERO ) )THEN TEMP1 = ALPHA*CONJG( B( J, L ) ) TEMP2 = CONJG( ALPHA*A( J, L ) ) DO 110, I = 1, J - 1 C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + $ B( I, L )*TEMP2 110 CONTINUE C( J, J ) = REAL( C( J, J ) ) + $ REAL( A( J, L )*TEMP1 + $ B( J, L )*TEMP2 ) END IF 120 CONTINUE 130 CONTINUE ELSE DO 180, J = 1, N IF( BETA.EQ.REAL( ZERO ) )THEN DO 140, I = J, N C( I, J ) = ZERO 140 CONTINUE ELSE IF( BETA.NE.ONE )THEN DO 150, I = J + 1, N C( I, J ) = BETA*C( I, J ) 150 CONTINUE C( J, J ) = BETA*REAL( C( J, J ) ) ELSE C( J, J ) = REAL( C( J, J ) ) END IF DO 170, L = 1, K IF( ( A( J, L ).NE.ZERO ).OR. $ ( B( J, L ).NE.ZERO ) )THEN TEMP1 = ALPHA*CONJG( B( J, L ) ) TEMP2 = CONJG( ALPHA*A( J, L ) ) DO 160, I = J + 1, N C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + $ B( I, L )*TEMP2 160 CONTINUE C( J, J ) = REAL( C( J, J ) ) + $ REAL( A( J, L )*TEMP1 + $ B( J, L )*TEMP2 ) END IF 170 CONTINUE 180 CONTINUE END IF ELSE * * Form C := alpha*conjg( A' )*B + conjg( alpha )*conjg( B' )*A + * C. 
* IF( UPPER )THEN DO 210, J = 1, N DO 200, I = 1, J TEMP1 = ZERO TEMP2 = ZERO DO 190, L = 1, K TEMP1 = TEMP1 + CONJG( A( L, I ) )*B( L, J ) TEMP2 = TEMP2 + CONJG( B( L, I ) )*A( L, J ) 190 CONTINUE IF( I.EQ.J )THEN IF( BETA.EQ.REAL( ZERO ) )THEN C( J, J ) = REAL( ALPHA *TEMP1 + $ CONJG( ALPHA )*TEMP2 ) ELSE C( J, J ) = BETA*REAL( C( J, J ) ) + $ REAL( ALPHA *TEMP1 + $ CONJG( ALPHA )*TEMP2 ) END IF ELSE IF( BETA.EQ.REAL( ZERO ) )THEN C( I, J ) = ALPHA*TEMP1 + CONJG( ALPHA )*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ ALPHA*TEMP1 + CONJG( ALPHA )*TEMP2 END IF END IF 200 CONTINUE 210 CONTINUE ELSE DO 240, J = 1, N DO 230, I = J, N TEMP1 = ZERO TEMP2 = ZERO DO 220, L = 1, K TEMP1 = TEMP1 + CONJG( A( L, I ) )*B( L, J ) TEMP2 = TEMP2 + CONJG( B( L, I ) )*A( L, J ) 220 CONTINUE IF( I.EQ.J )THEN IF( BETA.EQ.REAL( ZERO ) )THEN C( J, J ) = REAL( ALPHA *TEMP1 + $ CONJG( ALPHA )*TEMP2 ) ELSE C( J, J ) = BETA*REAL( C( J, J ) ) + $ REAL( ALPHA *TEMP1 + $ CONJG( ALPHA )*TEMP2 ) END IF ELSE IF( BETA.EQ.REAL( ZERO ) )THEN C( I, J ) = ALPHA*TEMP1 + CONJG( ALPHA )*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ ALPHA*TEMP1 + CONJG( ALPHA )*TEMP2 END IF END IF 230 CONTINUE 240 CONTINUE END IF END IF * RETURN * * End of CHER2K. * END OpenBLAS-0.2.20/reference/cherf.f000066400000000000000000000152171313527062700163460ustar00rootroot00000000000000 SUBROUTINE CHERF ( UPLO, N, ALPHA, X, INCX, A, LDA ) * .. Scalar Arguments .. REAL ALPHA INTEGER INCX, LDA, N CHARACTER*1 UPLO * .. Array Arguments .. COMPLEX A( LDA, * ), X( * ) * .. * * Purpose * ======= * * CHER performs the hermitian rank 1 operation * * A := alpha*x*conjg( x' ) + A, * * where alpha is a real scalar, x is an n element vector and A is an * n by n hermitian matrix. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array A is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of A * is to be referenced. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - REAL . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular part of the hermitian matrix and the strictly * lower triangular part of A is not referenced. On exit, the * upper triangular part of the array A is overwritten by the * upper triangular part of the updated matrix. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular part of the hermitian matrix and the strictly * upper triangular part of A is not referenced. On exit, the * lower triangular part of the array A is overwritten by the * lower triangular part of the updated matrix. * Note that the imaginary parts of the diagonal elements need * not be set, they are assumed to be zero, and on exit they * are set to zero. * * LDA - INTEGER. 
* On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. Local Scalars .. COMPLEX TEMP INTEGER I, INFO, IX, J, JX, KX * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC CONJG, MAX, REAL * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 7 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CHER ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ALPHA.EQ.REAL( ZERO ) ) ) $ RETURN * * Set the start point in X if the increment is not unity. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * IF( LSAME( UPLO, 'U' ) )THEN * * Form A when A is stored in upper triangle. * IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = ALPHA*CONJG( X( J ) ) DO 10, I = 1, J - 1 A( I, J ) = A( I, J ) + X( I )*TEMP 10 CONTINUE A( J, J ) = REAL( A( J, J ) ) + REAL( X( J )*TEMP ) ELSE A( J, J ) = REAL( A( J, J ) ) END IF 20 CONTINUE ELSE JX = KX DO 40, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*CONJG( X( JX ) ) IX = KX DO 30, I = 1, J - 1 A( I, J ) = A( I, J ) + X( IX )*TEMP IX = IX + INCX 30 CONTINUE A( J, J ) = REAL( A( J, J ) ) + REAL( X( JX )*TEMP ) ELSE A( J, J ) = REAL( A( J, J ) ) END IF JX = JX + INCX 40 CONTINUE END IF ELSE * * Form A when A is stored in lower triangle. * IF( INCX.EQ.1 )THEN DO 60, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = ALPHA*CONJG( X( J ) ) A( J, J ) = REAL( A( J, J ) ) + REAL( TEMP*X( J ) ) DO 50, I = J + 1, N A( I, J ) = A( I, J ) + X( I )*TEMP 50 CONTINUE ELSE A( J, J ) = REAL( A( J, J ) ) END IF 60 CONTINUE ELSE JX = KX DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*CONJG( X( JX ) ) A( J, J ) = REAL( A( J, J ) ) + REAL( TEMP*X( JX ) ) IX = JX DO 70, I = J + 1, N IX = IX + INCX A( I, J ) = A( I, J ) + X( IX )*TEMP 70 CONTINUE ELSE A( J, J ) = REAL( A( J, J ) ) END IF JX = JX + INCX 80 CONTINUE END IF END IF * RETURN * * End of CHER . * END OpenBLAS-0.2.20/reference/cherkf.f000066400000000000000000000253111313527062700165150ustar00rootroot00000000000000 SUBROUTINE CHERKF ( UPLO, TRANS, N, K, ALPHA, A, LDA, $ BETA, C, LDC ) * .. Scalar Arguments .. CHARACTER*1 UPLO, TRANS INTEGER N, K, LDA, LDC REAL ALPHA, BETA * .. Array Arguments .. COMPLEX A( LDA, * ), C( LDC, * ) * .. * * Purpose * ======= * * CHERK performs one of the hermitian rank k operations * * C := alpha*A*conjg( A' ) + beta*C, * * or * * C := alpha*conjg( A' )*A + beta*C, * * where alpha and beta are real scalars, C is an n by n hermitian * matrix and A is an n by k matrix in the first case and a k by n * matrix in the second case. * * Parameters * ========== * * UPLO - CHARACTER*1. 
* On entry, UPLO specifies whether the upper or lower * triangular part of the array C is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of C * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of C * is to be referenced. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' C := alpha*A*conjg( A' ) + beta*C. * * TRANS = 'C' or 'c' C := alpha*conjg( A' )*A + beta*C. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix C. N must be * at least zero. * Unchanged on exit. * * K - INTEGER. * On entry with TRANS = 'N' or 'n', K specifies the number * of columns of the matrix A, and on entry with * TRANS = 'C' or 'c', K specifies the number of rows of the * matrix A. K must be at least zero. * Unchanged on exit. * * ALPHA - REAL . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is * k when TRANS = 'N' or 'n', and is n otherwise. * Before entry with TRANS = 'N' or 'n', the leading n by k * part of the array A must contain the matrix A, otherwise * the leading k by n part of the array A must contain the * matrix A. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When TRANS = 'N' or 'n' * then LDA must be at least max( 1, n ), otherwise LDA must * be at least max( 1, k ). * Unchanged on exit. * * BETA - REAL . * On entry, BETA specifies the scalar beta. * Unchanged on exit. * * C - COMPLEX array of DIMENSION ( LDC, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array C must contain the upper * triangular part of the hermitian matrix and the strictly * lower triangular part of C is not referenced. On exit, the * upper triangular part of the array C is overwritten by the * upper triangular part of the updated matrix. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array C must contain the lower * triangular part of the hermitian matrix and the strictly * upper triangular part of C is not referenced. On exit, the * lower triangular part of the array C is overwritten by the * lower triangular part of the updated matrix. * Note that the imaginary parts of the diagonal elements need * not be set, they are assumed to be zero, and on exit they * are set to zero. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, n ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * -- Modified 8-Nov-93 to set C(J,J) to REAL( C(J,J) ) when BETA = 1. * Ed Anderson, Cray Research Inc. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC CMPLX, CONJG, MAX, REAL * .. Local Scalars .. LOGICAL UPPER INTEGER I, INFO, J, L, NROWA REAL RTEMP COMPLEX TEMP * .. Parameters .. REAL ONE , ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Executable Statements .. * * Test the input parameters. 
* IF( LSAME( TRANS, 'N' ) )THEN NROWA = N ELSE NROWA = K END IF UPPER = LSAME( UPLO, 'U' ) * INFO = 0 IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. $ ( .NOT.LSAME( TRANS, 'C' ) ) )THEN INFO = 2 ELSE IF( N .LT.0 )THEN INFO = 3 ELSE IF( K .LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 7 ELSE IF( LDC.LT.MAX( 1, N ) )THEN INFO = 10 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CHERK ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR. $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN IF( UPPER )THEN IF( BETA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, J C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40, J = 1, N DO 30, I = 1, J - 1 C( I, J ) = BETA*C( I, J ) 30 CONTINUE C( J, J ) = BETA*REAL( C( J, J ) ) 40 CONTINUE END IF ELSE IF( BETA.EQ.ZERO )THEN DO 60, J = 1, N DO 50, I = J, N C( I, J ) = ZERO 50 CONTINUE 60 CONTINUE ELSE DO 80, J = 1, N C( J, J ) = BETA*REAL( C( J, J ) ) DO 70, I = J + 1, N C( I, J ) = BETA*C( I, J ) 70 CONTINUE 80 CONTINUE END IF END IF RETURN END IF * * Start the operations. * IF( LSAME( TRANS, 'N' ) )THEN * * Form C := alpha*A*conjg( A' ) + beta*C. * IF( UPPER )THEN DO 130, J = 1, N IF( BETA.EQ.ZERO )THEN DO 90, I = 1, J C( I, J ) = ZERO 90 CONTINUE ELSE IF( BETA.NE.ONE )THEN DO 100, I = 1, J - 1 C( I, J ) = BETA*C( I, J ) 100 CONTINUE C( J, J ) = BETA*REAL( C( J, J ) ) ELSE C( J, J ) = REAL( C( J, J ) ) END IF DO 120, L = 1, K IF( A( J, L ).NE.CMPLX( ZERO ) )THEN TEMP = ALPHA*CONJG( A( J, L ) ) DO 110, I = 1, J - 1 C( I, J ) = C( I, J ) + TEMP*A( I, L ) 110 CONTINUE C( J, J ) = REAL( C( J, J ) ) + $ REAL( TEMP*A( I, L ) ) END IF 120 CONTINUE 130 CONTINUE ELSE DO 180, J = 1, N IF( BETA.EQ.ZERO )THEN DO 140, I = J, N C( I, J ) = ZERO 140 CONTINUE ELSE IF( BETA.NE.ONE )THEN C( J, J ) = BETA*REAL( C( J, J ) ) DO 150, I = J + 1, N C( I, J ) = BETA*C( I, J ) 150 CONTINUE ELSE C( J, J ) = REAL( C( J, J ) ) END IF DO 170, L = 1, K IF( A( J, L ).NE.CMPLX( ZERO ) )THEN TEMP = ALPHA*CONJG( A( J, L ) ) C( J, J ) = REAL( C( J, J ) ) + $ REAL( TEMP*A( J, L ) ) DO 160, I = J + 1, N C( I, J ) = C( I, J ) + TEMP*A( I, L ) 160 CONTINUE END IF 170 CONTINUE 180 CONTINUE END IF ELSE * * Form C := alpha*conjg( A' )*A + beta*C. * IF( UPPER )THEN DO 220, J = 1, N DO 200, I = 1, J - 1 TEMP = ZERO DO 190, L = 1, K TEMP = TEMP + CONJG( A( L, I ) )*A( L, J ) 190 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = ALPHA*TEMP ELSE C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) END IF 200 CONTINUE RTEMP = ZERO DO 210, L = 1, K RTEMP = RTEMP + CONJG( A( L, J ) )*A( L, J ) 210 CONTINUE IF( BETA.EQ.ZERO )THEN C( J, J ) = ALPHA*RTEMP ELSE C( J, J ) = ALPHA*RTEMP + BETA*REAL( C( J, J ) ) END IF 220 CONTINUE ELSE DO 260, J = 1, N RTEMP = ZERO DO 230, L = 1, K RTEMP = RTEMP + CONJG( A( L, J ) )*A( L, J ) 230 CONTINUE IF( BETA.EQ.ZERO )THEN C( J, J ) = ALPHA*RTEMP ELSE C( J, J ) = ALPHA*RTEMP + BETA*REAL( C( J, J ) ) END IF DO 250, I = J + 1, N TEMP = ZERO DO 240, L = 1, K TEMP = TEMP + CONJG( A( L, I ) )*A( L, J ) 240 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = ALPHA*TEMP ELSE C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) END IF 250 CONTINUE 260 CONTINUE END IF END IF * RETURN * * End of CHERK . * END OpenBLAS-0.2.20/reference/chpmvf.f000066400000000000000000000205741313527062700165440ustar00rootroot00000000000000 SUBROUTINE CHPMVF( UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY ) * .. Scalar Arguments .. 
COMPLEX ALPHA, BETA INTEGER INCX, INCY, N CHARACTER*1 UPLO * .. Array Arguments .. COMPLEX AP( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * CHPMV performs the matrix-vector operation * * y := alpha*A*x + beta*y, * * where alpha and beta are scalars, x and y are n element vectors and * A is an n by n hermitian matrix, supplied in packed form. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the matrix A is supplied in the packed * array AP as follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * supplied in AP. * * UPLO = 'L' or 'l' The lower triangular part of A is * supplied in AP. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * AP - COMPLEX array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). * Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular part of the hermitian matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) * and a( 2, 2 ) respectively, and so on. * Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular part of the hermitian matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) * and a( 3, 1 ) respectively, and so on. * Note that the imaginary parts of the diagonal elements need * not be set and are assumed to be zero. * Unchanged on exit. * * X - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - COMPLEX . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. On exit, Y is overwritten by the updated * vector y. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. Local Scalars .. COMPLEX TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC CONJG, REAL * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 6 ELSE IF( INCY.EQ.0 )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CHPMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. 
* IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF * * Start the operations. In this version the elements of the array AP * are accessed sequentially with one pass through AP. * * First form y := beta*y. * IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, N Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, N Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN KK = 1 IF( LSAME( UPLO, 'U' ) )THEN * * Form y when AP contains the upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO K = KK DO 50, I = 1, J - 1 Y( I ) = Y( I ) + TEMP1*AP( K ) TEMP2 = TEMP2 + CONJG( AP( K ) )*X( I ) K = K + 1 50 CONTINUE Y( J ) = Y( J ) + TEMP1*REAL( AP( KK + J - 1 ) ) $ + ALPHA*TEMP2 KK = KK + J 60 CONTINUE ELSE JX = KX JY = KY DO 80, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY DO 70, K = KK, KK + J - 2 Y( IY ) = Y( IY ) + TEMP1*AP( K ) TEMP2 = TEMP2 + CONJG( AP( K ) )*X( IX ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1*REAL( AP( KK + J - 1 ) ) $ + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY KK = KK + J 80 CONTINUE END IF ELSE * * Form y when AP contains the lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 100, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*REAL( AP( KK ) ) K = KK + 1 DO 90, I = J + 1, N Y( I ) = Y( I ) + TEMP1*AP( K ) TEMP2 = TEMP2 + CONJG( AP( K ) )*X( I ) K = K + 1 90 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 KK = KK + ( N - J + 1 ) 100 CONTINUE ELSE JX = KX JY = KY DO 120, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*REAL( AP( KK ) ) IX = JX IY = JY DO 110, K = KK + 1, KK + N - J IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*AP( K ) TEMP2 = TEMP2 + CONJG( AP( K ) )*X( IX ) 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY KK = KK + ( N - J + 1 ) 120 CONTINUE END IF END IF * RETURN * * End of CHPMV . * END OpenBLAS-0.2.20/reference/chpr2f.f000066400000000000000000000205141313527062700164370ustar00rootroot00000000000000 SUBROUTINE CHPR2F( UPLO, N, ALPHA, X, INCX, Y, INCY, AP ) * .. Scalar Arguments .. COMPLEX ALPHA INTEGER INCX, INCY, N CHARACTER*1 UPLO * .. Array Arguments .. COMPLEX AP( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * CHPR2 performs the hermitian rank 2 operation * * A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A, * * where alpha is a scalar, x and y are n element vectors and A is an * n by n hermitian matrix, supplied in packed form. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the matrix A is supplied in the packed * array AP as follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * supplied in AP. * * UPLO = 'L' or 'l' The lower triangular part of A is * supplied in AP. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. 
* * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. * Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * AP - COMPLEX array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). * Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular part of the hermitian matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) * and a( 2, 2 ) respectively, and so on. On exit, the array * AP is overwritten by the upper triangular part of the * updated matrix. * Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular part of the hermitian matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) * and a( 3, 1 ) respectively, and so on. On exit, the array * AP is overwritten by the lower triangular part of the * updated matrix. * Note that the imaginary parts of the diagonal elements need * not be set, they are assumed to be zero, and on exit they * are set to zero. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. Local Scalars .. COMPLEX TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC CONJG, REAL * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( INCY.EQ.0 )THEN INFO = 7 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CHPR2 ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Set up the start points in X and Y if the increments are not both * unity. * IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF JX = KX JY = KY END IF * * Start the operations. In this version the elements of the array AP * are accessed sequentially with one pass through AP. * KK = 1 IF( LSAME( UPLO, 'U' ) )THEN * * Form A when upper triangle is stored in AP. 
* IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 20, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*CONJG( Y( J ) ) TEMP2 = CONJG( ALPHA*X( J ) ) K = KK DO 10, I = 1, J - 1 AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 K = K + 1 10 CONTINUE AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) + $ REAL( X( J )*TEMP1 + Y( J )*TEMP2 ) ELSE AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) END IF KK = KK + J 20 CONTINUE ELSE DO 40, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*CONJG( Y( JY ) ) TEMP2 = CONJG( ALPHA*X( JX ) ) IX = KX IY = KY DO 30, K = KK, KK + J - 2 AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 30 CONTINUE AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) + $ REAL( X( JX )*TEMP1 + $ Y( JY )*TEMP2 ) ELSE AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) END IF JX = JX + INCX JY = JY + INCY KK = KK + J 40 CONTINUE END IF ELSE * * Form A when lower triangle is stored in AP. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*CONJG( Y( J ) ) TEMP2 = CONJG( ALPHA*X( J ) ) AP( KK ) = REAL( AP( KK ) ) + $ REAL( X( J )*TEMP1 + Y( J )*TEMP2 ) K = KK + 1 DO 50, I = J + 1, N AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 K = K + 1 50 CONTINUE ELSE AP( KK ) = REAL( AP( KK ) ) END IF KK = KK + N - J + 1 60 CONTINUE ELSE DO 80, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*CONJG( Y( JY ) ) TEMP2 = CONJG( ALPHA*X( JX ) ) AP( KK ) = REAL( AP( KK ) ) + $ REAL( X( JX )*TEMP1 + Y( JY )*TEMP2 ) IX = JX IY = JY DO 70, K = KK + 1, KK + N - J IX = IX + INCX IY = IY + INCY AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 70 CONTINUE ELSE AP( KK ) = REAL( AP( KK ) ) END IF JX = JX + INCX JY = JY + INCY KK = KK + N - J + 1 80 CONTINUE END IF END IF * RETURN * * End of CHPR2 . * END OpenBLAS-0.2.20/reference/chprf.f000066400000000000000000000155651313527062700163670ustar00rootroot00000000000000 SUBROUTINE CHPRF ( UPLO, N, ALPHA, X, INCX, AP ) * .. Scalar Arguments .. REAL ALPHA INTEGER INCX, N CHARACTER*1 UPLO * .. Array Arguments .. COMPLEX AP( * ), X( * ) * .. * * Purpose * ======= * * CHPR performs the hermitian rank 1 operation * * A := alpha*x*conjg( x' ) + A, * * where alpha is a real scalar, x is an n element vector and A is an * n by n hermitian matrix, supplied in packed form. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the matrix A is supplied in the packed * array AP as follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * supplied in AP. * * UPLO = 'L' or 'l' The lower triangular part of A is * supplied in AP. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - REAL . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * AP - COMPLEX array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). 
* Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular part of the hermitian matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) * and a( 2, 2 ) respectively, and so on. On exit, the array * AP is overwritten by the upper triangular part of the * updated matrix. * Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular part of the hermitian matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) * and a( 3, 1 ) respectively, and so on. On exit, the array * AP is overwritten by the lower triangular part of the * updated matrix. * Note that the imaginary parts of the diagonal elements need * not be set, they are assumed to be zero, and on exit they * are set to zero. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. Local Scalars .. COMPLEX TEMP INTEGER I, INFO, IX, J, JX, K, KK, KX * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC CONJG, REAL * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CHPR ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ALPHA.EQ.REAL( ZERO ) ) ) $ RETURN * * Set the start point in X if the increment is not unity. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of the array AP * are accessed sequentially with one pass through AP. * KK = 1 IF( LSAME( UPLO, 'U' ) )THEN * * Form A when upper triangle is stored in AP. * IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = ALPHA*CONJG( X( J ) ) K = KK DO 10, I = 1, J - 1 AP( K ) = AP( K ) + X( I )*TEMP K = K + 1 10 CONTINUE AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) $ + REAL( X( J )*TEMP ) ELSE AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) END IF KK = KK + J 20 CONTINUE ELSE JX = KX DO 40, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*CONJG( X( JX ) ) IX = KX DO 30, K = KK, KK + J - 2 AP( K ) = AP( K ) + X( IX )*TEMP IX = IX + INCX 30 CONTINUE AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) $ + REAL( X( JX )*TEMP ) ELSE AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) END IF JX = JX + INCX KK = KK + J 40 CONTINUE END IF ELSE * * Form A when lower triangle is stored in AP. * IF( INCX.EQ.1 )THEN DO 60, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = ALPHA*CONJG( X( J ) ) AP( KK ) = REAL( AP( KK ) ) + REAL( TEMP*X( J ) ) K = KK + 1 DO 50, I = J + 1, N AP( K ) = AP( K ) + X( I )*TEMP K = K + 1 50 CONTINUE ELSE AP( KK ) = REAL( AP( KK ) ) END IF KK = KK + N - J + 1 60 CONTINUE ELSE JX = KX DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*CONJG( X( JX ) ) AP( KK ) = REAL( AP( KK ) ) + REAL( TEMP*X( JX ) ) IX = JX DO 70, K = KK + 1, KK + N - J IX = IX + INCX AP( K ) = AP( K ) + X( IX )*TEMP 70 CONTINUE ELSE AP( KK ) = REAL( AP( KK ) ) END IF JX = JX + INCX KK = KK + N - J + 1 80 CONTINUE END IF END IF * RETURN * * End of CHPR . 
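*
*     Usage note (an illustrative sketch only, not part of the BLAS
*     specification above; N, X and AP are assumed to be declared and
*     filled by the caller): a rank 1 update of the upper triangle of
*     a packed hermitian matrix with unit stride in x would be
*
*        CALL CHPRF( 'U', N, 1.0E0, X, 1, AP )
*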
* END OpenBLAS-0.2.20/reference/claswpf.f000066400000000000000000000063561313527062700167220ustar00rootroot00000000000000 SUBROUTINE CLASWPF( N, A, LDA, K1, K2, IPIV, INCX ) * * -- LAPACK auxiliary routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * June 30, 1999 * * .. Scalar Arguments .. INTEGER INCX, K1, K2, LDA, N * .. * .. Array Arguments .. INTEGER IPIV( * ) COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * CLASWP performs a series of row interchanges on the matrix A. * One row interchange is initiated for each of rows K1 through K2 of A. * * Arguments * ========= * * N (input) INTEGER * The number of columns of the matrix A. * * A (input/output) COMPLEX array, dimension (LDA,N) * On entry, the matrix of column dimension N to which the row * interchanges will be applied. * On exit, the permuted matrix. * * LDA (input) INTEGER * The leading dimension of the array A. * * K1 (input) INTEGER * The first element of IPIV for which a row interchange will * be done. * * K2 (input) INTEGER * The last element of IPIV for which a row interchange will * be done. * * IPIV (input) INTEGER array, dimension (M*abs(INCX)) * The vector of pivot indices. Only the elements in positions * K1 through K2 of IPIV are accessed. * IPIV(K) = L implies rows K and L are to be interchanged. * * INCX (input) INTEGER * The increment between successive values of IPIV. If IPIV * is negative, the pivots are applied in reverse order. * * Further Details * =============== * * Modified by * R. C. Whaley, Computer Science Dept., Univ. of Tenn., Knoxville, USA * * ===================================================================== * * .. Local Scalars .. INTEGER I, I1, I2, INC, IP, IX, IX0, J, K, N32 COMPLEX TEMP * .. * .. Executable Statements .. * * Interchange row I with row IPIV(I) for each of rows K1 through K2. * IF( INCX.GT.0 ) THEN IX0 = K1 I1 = K1 I2 = K2 INC = 1 ELSE IF( INCX.LT.0 ) THEN IX0 = 1 + ( 1-K2 )*INCX I1 = K2 I2 = K1 INC = -1 ELSE RETURN END IF * N32 = ( N / 32 )*32 IF( N32.NE.0 ) THEN DO 30 J = 1, N32, 32 IX = IX0 DO 20 I = I1, I2, INC IP = IPIV( IX ) IF( IP.NE.I ) THEN DO 10 K = J, J + 31 TEMP = A( I, K ) A( I, K ) = A( IP, K ) A( IP, K ) = TEMP 10 CONTINUE END IF IX = IX + INCX 20 CONTINUE 30 CONTINUE END IF IF( N32.NE.N ) THEN N32 = N32 + 1 IX = IX0 DO 50 I = I1, I2, INC IP = IPIV( IX ) IF( IP.NE.I ) THEN DO 40 K = N32, N TEMP = A( I, K ) A( I, K ) = A( IP, K ) A( IP, K ) = TEMP 40 CONTINUE END IF IX = IX + INCX 50 CONTINUE END IF * RETURN * * End of CLASWP * END OpenBLAS-0.2.20/reference/clauu2f.f000066400000000000000000000102321313527062700166100ustar00rootroot00000000000000 SUBROUTINE CLAUU2F( UPLO, N, A, LDA, INFO ) * * -- LAPACK auxiliary routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * CLAUU2 computes the product U * U' or L' * L, where the triangular * factor U or L is stored in the upper or lower triangular part of * the array A. * * If UPLO = 'U' or 'u' then the upper triangle of the result is stored, * overwriting the factor U in A. * If UPLO = 'L' or 'l' then the lower triangle of the result is stored, * overwriting the factor L in A. * * This is the unblocked form of the algorithm, calling Level 2 BLAS. 
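*
*  Example (an illustrative sketch; A, LDA, N and INFO are assumed to
*  be set up by the caller, with the triangular factor U already
*  stored in the upper triangle of A): the call
*
*     CALL CLAUU2F( 'U', N, A, LDA, INFO )
*
*  overwrites that upper triangle in place with U * U'.
*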
* * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether the triangular factor stored in the array A * is upper or lower triangular: * = 'U': Upper triangular * = 'L': Lower triangular * * N (input) INTEGER * The order of the triangular factor U or L. N >= 0. * * A (input/output) COMPLEX array, dimension (LDA,N) * On entry, the triangular factor U or L. * On exit, if UPLO = 'U', the upper triangle of A is * overwritten with the upper triangle of the product U * U'; * if UPLO = 'L', the lower triangle of A is overwritten with * the lower triangle of the product L' * L. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * * ===================================================================== * * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER I REAL AII * .. * .. External Functions .. LOGICAL LSAME COMPLEX CDOTC EXTERNAL LSAME, CDOTC * .. * .. External Subroutines .. EXTERNAL CGEMV, CLACGV, CSSCAL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, REAL * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLAUU2', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( UPPER ) THEN * * Compute the product U * U'. * DO 10 I = 1, N AII = A( I, I ) IF( I.LT.N ) THEN A( I, I ) = AII*AII + REAL( CDOTC( N-I, A( I, I+1 ), LDA, $ A( I, I+1 ), LDA ) ) CALL CLACGV( N-I, A( I, I+1 ), LDA ) CALL CGEMV( 'No transpose', I-1, N-I, ONE, A( 1, I+1 ), $ LDA, A( I, I+1 ), LDA, CMPLX( AII ), $ A( 1, I ), 1 ) CALL CLACGV( N-I, A( I, I+1 ), LDA ) ELSE CALL CSSCAL( I, AII, A( 1, I ), 1 ) END IF 10 CONTINUE * ELSE * * Compute the product L' * L. * DO 20 I = 1, N AII = A( I, I ) IF( I.LT.N ) THEN A( I, I ) = AII*AII + REAL( CDOTC( N-I, A( I+1, I ), 1, $ A( I+1, I ), 1 ) ) CALL CLACGV( I-1, A( I, 1 ), LDA ) CALL CGEMV( 'Conjugate transpose', N-I, I-1, ONE, $ A( I+1, 1 ), LDA, A( I+1, I ), 1, $ CMPLX( AII ), A( I, 1 ), LDA ) CALL CLACGV( I-1, A( I, 1 ), LDA ) ELSE CALL CSSCAL( I, AII, A( I, 1 ), LDA ) END IF 20 CONTINUE END IF * RETURN * * End of CLAUU2 * END OpenBLAS-0.2.20/reference/clauumf.f000066400000000000000000000116671313527062700167200ustar00rootroot00000000000000 SUBROUTINE CLAUUMF( UPLO, N, A, LDA, INFO ) * * -- LAPACK auxiliary routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * CLAUUM computes the product U * U' or L' * L, where the triangular * factor U or L is stored in the upper or lower triangular part of * the array A. * * If UPLO = 'U' or 'u' then the upper triangle of the result is stored, * overwriting the factor U in A. * If UPLO = 'L' or 'l' then the lower triangle of the result is stored, * overwriting the factor L in A. * * This is the blocked form of the algorithm, calling Level 3 BLAS. 
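*
*  A typical use (illustrative sketch, error handling omitted) is the
*  final step of inverting a hermitian positive definite matrix from its
*  Cholesky factor, as performed by CPOTRI:
*
*     CALL CPOTRF( UPLO, N, A, LDA, INFO )
*     CALL CTRTRI( UPLO, 'Non-unit', N, A, LDA, INFO )
*     CALL CLAUUM( UPLO, N, A, LDA, INFO )
*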
* * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether the triangular factor stored in the array A * is upper or lower triangular: * = 'U': Upper triangular * = 'L': Lower triangular * * N (input) INTEGER * The order of the triangular factor U or L. N >= 0. * * A (input/output) COMPLEX array, dimension (LDA,N) * On entry, the triangular factor U or L. * On exit, if UPLO = 'U', the upper triangle of A is * overwritten with the upper triangle of the product U * U'; * if UPLO = 'L', the lower triangle of A is overwritten with * the lower triangle of the product L' * L. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) COMPLEX CONE PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER I, IB, NB * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV EXTERNAL LSAME, ILAENV * .. * .. External Subroutines .. EXTERNAL CGEMM, CHERK, CLAUU2, CTRMM, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLAUUM', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Determine the block size for this environment. * NB = 128 * IF( NB.LE.1 .OR. NB.GE.N ) THEN * * Use unblocked code * CALL CLAUU2( UPLO, N, A, LDA, INFO ) ELSE * * Use blocked code * IF( UPPER ) THEN * * Compute the product U * U'. * DO 10 I = 1, N, NB IB = MIN( NB, N-I+1 ) CALL CTRMM( 'Right', 'Upper', 'Conjugate transpose', $ 'Non-unit', I-1, IB, CONE, A( I, I ), LDA, $ A( 1, I ), LDA ) CALL CLAUU2( 'Upper', IB, A( I, I ), LDA, INFO ) IF( I+IB.LE.N ) THEN CALL CGEMM( 'No transpose', 'Conjugate transpose', $ I-1, IB, N-I-IB+1, CONE, A( 1, I+IB ), $ LDA, A( I, I+IB ), LDA, CONE, A( 1, I ), $ LDA ) CALL CHERK( 'Upper', 'No transpose', IB, N-I-IB+1, $ ONE, A( I, I+IB ), LDA, ONE, A( I, I ), $ LDA ) END IF 10 CONTINUE ELSE * * Compute the product L' * L. * DO 20 I = 1, N, NB IB = MIN( NB, N-I+1 ) CALL CTRMM( 'Left', 'Lower', 'Conjugate transpose', $ 'Non-unit', IB, I-1, CONE, A( I, I ), LDA, $ A( I, 1 ), LDA ) CALL CLAUU2( 'Lower', IB, A( I, I ), LDA, INFO ) IF( I+IB.LE.N ) THEN CALL CGEMM( 'Conjugate transpose', 'No transpose', IB, $ I-1, N-I-IB+1, CONE, A( I+IB, I ), LDA, $ A( I+IB, 1 ), LDA, CONE, A( I, 1 ), LDA ) CALL CHERK( 'Lower', 'Conjugate transpose', IB, $ N-I-IB+1, ONE, A( I+IB, I ), LDA, ONE, $ A( I, I ), LDA ) END IF 20 CONTINUE END IF END IF * RETURN * * End of CLAUUM * END OpenBLAS-0.2.20/reference/cpotf2f.f000066400000000000000000000121331313527062700166140ustar00rootroot00000000000000 SUBROUTINE CPOTF2F( UPLO, N, A, LDA, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * CPOTF2 computes the Cholesky factorization of a complex Hermitian * positive definite matrix A. 
* * The factorization has the form * A = U' * U , if UPLO = 'U', or * A = L * L', if UPLO = 'L', * where U is an upper triangular matrix and L is lower triangular. * * This is the unblocked version of the algorithm, calling Level 2 BLAS. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * Hermitian matrix A is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input/output) COMPLEX array, dimension (LDA,N) * On entry, the Hermitian matrix A. If UPLO = 'U', the leading * n by n upper triangular part of A contains the upper * triangular part of the matrix A, and the strictly lower * triangular part of A is not referenced. If UPLO = 'L', the * leading n by n lower triangular part of A contains the lower * triangular part of the matrix A, and the strictly upper * triangular part of A is not referenced. * * On exit, if INFO = 0, the factor U or L from the Cholesky * factorization A = U'*U or A = L*L'. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * > 0: if INFO = k, the leading minor of order k is not * positive definite, and the factorization could not be * completed. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) COMPLEX CONE PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER J REAL AJJ * .. * .. External Functions .. LOGICAL LSAME COMPLEX CDOTC EXTERNAL LSAME, CDOTC * .. * .. External Subroutines .. EXTERNAL CGEMV, CLACGV, CSSCAL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, REAL, SQRT * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CPOTF2', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( UPPER ) THEN * * Compute the Cholesky factorization A = U'*U. * DO 10 J = 1, N * * Compute U(J,J) and test for non-positive-definiteness. * AJJ = REAL( A( J, J ) ) - CDOTC( J-1, A( 1, J ), 1, $ A( 1, J ), 1 ) IF( AJJ.LE.ZERO ) THEN A( J, J ) = AJJ GO TO 30 END IF AJJ = SQRT( AJJ ) A( J, J ) = AJJ * * Compute elements J+1:N of row J. * IF( J.LT.N ) THEN CALL CLACGV( J-1, A( 1, J ), 1 ) CALL CGEMV( 'Transpose', J-1, N-J, -CONE, A( 1, J+1 ), $ LDA, A( 1, J ), 1, CONE, A( J, J+1 ), LDA ) CALL CLACGV( J-1, A( 1, J ), 1 ) CALL CSSCAL( N-J, ONE / AJJ, A( J, J+1 ), LDA ) END IF 10 CONTINUE ELSE * * Compute the Cholesky factorization A = L*L'. * DO 20 J = 1, N * * Compute L(J,J) and test for non-positive-definiteness. * AJJ = REAL( A( J, J ) ) - CDOTC( J-1, A( J, 1 ), LDA, $ A( J, 1 ), LDA ) IF( AJJ.LE.ZERO ) THEN A( J, J ) = AJJ GO TO 30 END IF AJJ = SQRT( AJJ ) A( J, J ) = AJJ * * Compute elements J+1:N of column J. 
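*           (The CGEMV call below subtracts the contribution of the
*           previously computed columns from A( J+1:N, J ), and CSSCAL
*           then divides the column by the diagonal entry L( J, J ) = AJJ.)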
* IF( J.LT.N ) THEN CALL CLACGV( J-1, A( J, 1 ), LDA ) CALL CGEMV( 'No transpose', N-J, J-1, -CONE, A( J+1, 1 ), $ LDA, A( J, 1 ), LDA, CONE, A( J+1, J ), 1 ) CALL CLACGV( J-1, A( J, 1 ), LDA ) CALL CSSCAL( N-J, ONE / AJJ, A( J+1, J ), 1 ) END IF 20 CONTINUE END IF GO TO 40 * 30 CONTINUE INFO = J * 40 CONTINUE RETURN * * End of CPOTF2 * END OpenBLAS-0.2.20/reference/cpotrff.f000066400000000000000000000132301313527062700167130ustar00rootroot00000000000000 SUBROUTINE CPOTRFF( UPLO, N, A, LDA, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * CPOTRF computes the Cholesky factorization of a complex Hermitian * positive definite matrix A. * * The factorization has the form * A = U**H * U, if UPLO = 'U', or * A = L * L**H, if UPLO = 'L', * where U is an upper triangular matrix and L is lower triangular. * * This is the block version of the algorithm, calling Level 3 BLAS. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * = 'U': Upper triangle of A is stored; * = 'L': Lower triangle of A is stored. * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input/output) COMPLEX array, dimension (LDA,N) * On entry, the Hermitian matrix A. If UPLO = 'U', the leading * N-by-N upper triangular part of A contains the upper * triangular part of the matrix A, and the strictly lower * triangular part of A is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of A contains the lower * triangular part of the matrix A, and the strictly upper * triangular part of A is not referenced. * * On exit, if INFO = 0, the factor U or L from the Cholesky * factorization A = U**H*U or A = L*L**H. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, the leading minor of order i is not * positive definite, and the factorization could not be * completed. * * ===================================================================== * * .. Parameters .. REAL ONE COMPLEX CONE PARAMETER ( ONE = 1.0E+0, CONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER J, JB, NB * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL CGEMM, CHERK, CPOTF2, CTRSM, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CPOTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Determine the block size for this environment. * NB = 56 IF( NB.LE.1 .OR. NB.GE.N ) THEN * * Use unblocked code. * CALL CPOTF2( UPLO, N, A, LDA, INFO ) ELSE * * Use blocked code. * IF( UPPER ) THEN * * Compute the Cholesky factorization A = U'*U. * DO 10 J = 1, N, NB * * Update and factorize the current diagonal block and test * for non-positive-definiteness. 
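*           (JB is the width of the current block column.  The CHERK call
*           updates the JB by JB diagonal block with the contribution of
*           the previously factorized block columns, and CPOTF2 then
*           factorizes the updated diagonal block in place.)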
* JB = MIN( NB, N-J+1 ) CALL CHERK( 'Upper', 'Conjugate transpose', JB, J-1, $ -ONE, A( 1, J ), LDA, ONE, A( J, J ), LDA ) CALL CPOTF2( 'Upper', JB, A( J, J ), LDA, INFO ) IF( INFO.NE.0 ) $ GO TO 30 IF( J+JB.LE.N ) THEN * * Compute the current block row. * CALL CGEMM( 'Conjugate transpose', 'No transpose', JB, $ N-J-JB+1, J-1, -CONE, A( 1, J ), LDA, $ A( 1, J+JB ), LDA, CONE, A( J, J+JB ), $ LDA ) CALL CTRSM( 'Left', 'Upper', 'Conjugate transpose', $ 'Non-unit', JB, N-J-JB+1, CONE, A( J, J ), $ LDA, A( J, J+JB ), LDA ) END IF 10 CONTINUE * ELSE * * Compute the Cholesky factorization A = L*L'. * DO 20 J = 1, N, NB * * Update and factorize the current diagonal block and test * for non-positive-definiteness. * JB = MIN( NB, N-J+1 ) CALL CHERK( 'Lower', 'No transpose', JB, J-1, -ONE, $ A( J, 1 ), LDA, ONE, A( J, J ), LDA ) CALL CPOTF2( 'Lower', JB, A( J, J ), LDA, INFO ) IF( INFO.NE.0 ) $ GO TO 30 IF( J+JB.LE.N ) THEN * * Compute the current block column. * CALL CGEMM( 'No transpose', 'Conjugate transpose', $ N-J-JB+1, JB, J-1, -CONE, A( J+JB, 1 ), $ LDA, A( J, 1 ), LDA, CONE, A( J+JB, J ), $ LDA ) CALL CTRSM( 'Right', 'Lower', 'Conjugate transpose', $ 'Non-unit', N-J-JB+1, JB, CONE, A( J, J ), $ LDA, A( J+JB, J ), LDA ) END IF 20 CONTINUE END IF END IF GO TO 40 * 30 CONTINUE INFO = INFO + J - 1 * 40 CONTINUE RETURN * * End of CPOTRF * END OpenBLAS-0.2.20/reference/cpotrif.f000066400000000000000000000050331313527062700167200ustar00rootroot00000000000000 SUBROUTINE CPOTRIF( UPLO, N, A, LDA, INFO ) * * -- LAPACK routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * CPOTRI computes the inverse of a complex Hermitian positive definite * matrix A using the Cholesky factorization A = U**H*U or A = L*L**H * computed by CPOTRF. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * = 'U': Upper triangle of A is stored; * = 'L': Lower triangle of A is stored. * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input/output) COMPLEX array, dimension (LDA,N) * On entry, the triangular factor U or L from the Cholesky * factorization A = U**H*U or A = L*L**H, as computed by * CPOTRF. * On exit, the upper or lower triangle of the (Hermitian) * inverse of A, overwriting the input factor U or L. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, the (i,i) element of the factor U or L is * zero, and the inverse could not be computed. * * ===================================================================== * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL CLAUUM, CTRTRI, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CPOTRI', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Invert the triangular Cholesky factor U or L. * CALL CTRTRI( UPLO, 'Non-unit', N, A, LDA, INFO ) IF( INFO.GT.0 ) $ RETURN * * Form inv(U)*inv(U)' or inv(L)'*inv(L). 
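*     (On entry to this step A holds inv(U) or inv(L), computed by the
*     CTRTRI call above; CLAUUM multiplies the triangular inverse by its
*     conjugate transpose in place, producing the requested triangle of
*     the inverse of A.)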
* CALL CLAUUM( UPLO, N, A, LDA, INFO ) * RETURN * * End of CPOTRI * END OpenBLAS-0.2.20/reference/crotgf.f000066400000000000000000000010171313527062700165340ustar00rootroot00000000000000 subroutine crotgf(ca,cb,c,s) complex ca,cb,s real c real norm,scale complex alpha if (cabs(ca) .ne. 0.) go to 10 c = 0. s = (1.,0.) ca = cb go to 20 10 continue scale = cabs(ca) + cabs(cb) norm = scale * sqrt((cabs(ca/scale))**2 + (cabs(cb/scale))**2) alpha = ca /cabs(ca) c = cabs(ca) / norm s = alpha * conjg(cb) / norm ca = alpha * norm 20 continue return end OpenBLAS-0.2.20/reference/csbmvf.f000066400000000000000000000226331313527062700165370ustar00rootroot00000000000000 SUBROUTINE CSBMVF(UPLO, N, K, ALPHA, A, LDA, X, INCX, BETA, Y, $ INCY ) * * -- LAPACK auxiliary routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INCX, INCY, K, LDA, N COMPLEX ALPHA, BETA * .. * .. Array Arguments .. COMPLEX A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * CSBMV performs the matrix-vector operation * * y := alpha*A*x + beta*y, * * where alpha and beta are scalars, x and y are n element vectors and * A is an n by n symmetric band matrix, with k super-diagonals. * * Arguments * ========== * * UPLO - CHARACTER*1 * On entry, UPLO specifies whether the upper or lower * triangular part of the band matrix A is being supplied as * follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * being supplied. * * UPLO = 'L' or 'l' The lower triangular part of A is * being supplied. * * Unchanged on exit. * * N - INTEGER * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * K - INTEGER * On entry, K specifies the number of super-diagonals of the * matrix A. K must satisfy 0 .le. K. * Unchanged on exit. * * ALPHA - COMPLEX * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX array, dimension( LDA, N ) * Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) * by n part of the array A must contain the upper triangular * band part of the symmetric matrix, supplied column by * column, with the leading diagonal of the matrix in row * ( k + 1 ) of the array, the first super-diagonal starting at * position 2 in row k, and so on. The top left k by k triangle * of the array A is not referenced. * The following program segment will transfer the upper * triangular part of a symmetric band matrix from conventional * full matrix storage to band storage: * * DO 20, J = 1, N * M = K + 1 - J * DO 10, I = MAX( 1, J - K ), J * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) * by n part of the array A must contain the lower triangular * band part of the symmetric matrix, supplied column by * column, with the leading diagonal of the matrix in row 1 of * the array, the first sub-diagonal starting at position 1 in * row 2, and so on. The bottom right k by k triangle of the * array A is not referenced. * The following program segment will transfer the lower * triangular part of a symmetric band matrix from conventional * full matrix storage to band storage: * * DO 20, J = 1, N * M = 1 - J * DO 10, I = J, MIN( N, J + K ) * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Unchanged on exit. * * LDA - INTEGER * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * ( k + 1 ). * Unchanged on exit. 
* * X - COMPLEX array, dimension at least * ( 1 + ( N - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the * vector x. * Unchanged on exit. * * INCX - INTEGER * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - COMPLEX * On entry, BETA specifies the scalar beta. * Unchanged on exit. * * Y - COMPLEX array, dimension at least * ( 1 + ( N - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the * vector y. On exit, Y is overwritten by the updated vector y. * * INCY - INTEGER * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * ===================================================================== * * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L COMPLEX TEMP1, TEMP2 * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = 1 ELSE IF( N.LT.0 ) THEN INFO = 2 ELSE IF( K.LT.0 ) THEN INFO = 3 ELSE IF( LDA.LT.( K+1 ) ) THEN INFO = 6 ELSE IF( INCX.EQ.0 ) THEN INFO = 8 ELSE IF( INCY.EQ.0 ) THEN INFO = 11 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CSBMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ) .OR. ( ( ALPHA.EQ.ZERO ) .AND. ( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 ) THEN KX = 1 ELSE KX = 1 - ( N-1 )*INCX END IF IF( INCY.GT.0 ) THEN KY = 1 ELSE KY = 1 - ( N-1 )*INCY END IF * * Start the operations. In this version the elements of the array A * are accessed sequentially with one pass through A. * * First form y := beta*y. * IF( BETA.NE.ONE ) THEN IF( INCY.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN DO 10 I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20 I = 1, N Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO ) THEN DO 30 I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40 I = 1, N Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN IF( LSAME( UPLO, 'U' ) ) THEN * * Form y when upper triangle of A is stored. * KPLUS1 = K + 1 IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN DO 60 J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO L = KPLUS1 - J DO 50 I = MAX( 1, J-K ), J - 1 Y( I ) = Y( I ) + TEMP1*A( L+I, J ) TEMP2 = TEMP2 + A( L+I, J )*X( I ) 50 CONTINUE Y( J ) = Y( J ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 60 CONTINUE ELSE JX = KX JY = KY DO 80 J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY L = KPLUS1 - J DO 70 I = MAX( 1, J-K ), J - 1 Y( IY ) = Y( IY ) + TEMP1*A( L+I, J ) TEMP2 = TEMP2 + A( L+I, J )*X( IX ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY IF( J.GT.K ) THEN KX = KX + INCX KY = KY + INCY END IF 80 CONTINUE END IF ELSE * * Form y when lower triangle of A is stored. * IF( ( INCX.EQ.1 ) .AND. 
( INCY.EQ.1 ) ) THEN DO 100 J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*A( 1, J ) L = 1 - J DO 90 I = J + 1, MIN( N, J+K ) Y( I ) = Y( I ) + TEMP1*A( L+I, J ) TEMP2 = TEMP2 + A( L+I, J )*X( I ) 90 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 100 CONTINUE ELSE JX = KX JY = KY DO 120 J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*A( 1, J ) L = 1 - J IX = JX IY = JY DO 110 I = J + 1, MIN( N, J+K ) IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*A( L+I, J ) TEMP2 = TEMP2 + A( L+I, J )*X( IX ) 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 120 CONTINUE END IF END IF * RETURN * * End of CSBMV * END OpenBLAS-0.2.20/reference/cscalf.f000066400000000000000000000011661313527062700165100ustar00rootroot00000000000000 subroutine cscalf(n,ca,cx,incx) c c scales a vector by a constant. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c complex ca,cx(*) integer i,incx,n,nincx c if( n.le.0 .or. incx.le.0 )return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c nincx = n*incx do 10 i = 1,nincx,incx cx(i) = ca*cx(i) 10 continue return c c code for increment equal to 1 c 20 do 30 i = 1,n cx(i) = ca*cx(i) 30 continue return end OpenBLAS-0.2.20/reference/cspmvf.f000066400000000000000000000175551313527062700165640ustar00rootroot00000000000000 SUBROUTINE CSPMVF(UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY ) * * -- LAPACK auxiliary routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INCX, INCY, N COMPLEX ALPHA, BETA * .. * .. Array Arguments .. COMPLEX AP( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * CSPMV performs the matrix-vector operation * * y := alpha*A*x + beta*y, * * where alpha and beta are scalars, x and y are n element vectors and * A is an n by n symmetric matrix, supplied in packed form. * * Arguments * ========== * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies whether the upper or lower * triangular part of the matrix A is supplied in the packed * array AP as follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * supplied in AP. * * UPLO = 'L' or 'l' The lower triangular part of A is * supplied in AP. * * Unchanged on exit. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA (input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * AP (input) COMPLEX array, dimension at least * ( ( N*( N + 1 ) )/2 ). * Before entry, with UPLO = 'U' or 'u', the array AP must * contain the upper triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) * and a( 2, 2 ) respectively, and so on. * Before entry, with UPLO = 'L' or 'l', the array AP must * contain the lower triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) * and a( 3, 1 ) respectively, and so on. * Unchanged on exit. * * X (input) COMPLEX array, dimension at least * ( 1 + ( N - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the N- * element vector x. * Unchanged on exit. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. 
* * BETA (input) COMPLEX * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y (input/output) COMPLEX array, dimension at least * ( 1 + ( N - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. On exit, Y is overwritten by the updated * vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * ===================================================================== * * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY COMPLEX TEMP1, TEMP2 * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = 1 ELSE IF( N.LT.0 ) THEN INFO = 2 ELSE IF( INCX.EQ.0 ) THEN INFO = 6 ELSE IF( INCY.EQ.0 ) THEN INFO = 9 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CSPMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ) .OR. ( ( ALPHA.EQ.ZERO ) .AND. ( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 ) THEN KX = 1 ELSE KX = 1 - ( N-1 )*INCX END IF IF( INCY.GT.0 ) THEN KY = 1 ELSE KY = 1 - ( N-1 )*INCY END IF * * Start the operations. In this version the elements of the array AP * are accessed sequentially with one pass through AP. * * First form y := beta*y. * IF( BETA.NE.ONE ) THEN IF( INCY.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN DO 10 I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20 I = 1, N Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO ) THEN DO 30 I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40 I = 1, N Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN KK = 1 IF( LSAME( UPLO, 'U' ) ) THEN * * Form y when AP contains the upper triangle. * IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN DO 60 J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO K = KK DO 50 I = 1, J - 1 Y( I ) = Y( I ) + TEMP1*AP( K ) TEMP2 = TEMP2 + AP( K )*X( I ) K = K + 1 50 CONTINUE Y( J ) = Y( J ) + TEMP1*AP( KK+J-1 ) + ALPHA*TEMP2 KK = KK + J 60 CONTINUE ELSE JX = KX JY = KY DO 80 J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY DO 70 K = KK, KK + J - 2 Y( IY ) = Y( IY ) + TEMP1*AP( K ) TEMP2 = TEMP2 + AP( K )*X( IX ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1*AP( KK+J-1 ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY KK = KK + J 80 CONTINUE END IF ELSE * * Form y when AP contains the lower triangle. * IF( ( INCX.EQ.1 ) .AND. 
( INCY.EQ.1 ) ) THEN DO 100 J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*AP( KK ) K = KK + 1 DO 90 I = J + 1, N Y( I ) = Y( I ) + TEMP1*AP( K ) TEMP2 = TEMP2 + AP( K )*X( I ) K = K + 1 90 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 KK = KK + ( N-J+1 ) 100 CONTINUE ELSE JX = KX JY = KY DO 120 J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*AP( KK ) IX = JX IY = JY DO 110 K = KK + 1, KK + N - J IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*AP( K ) TEMP2 = TEMP2 + AP( K )*X( IX ) 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY KK = KK + ( N-J+1 ) 120 CONTINUE END IF END IF * RETURN * * End of CSPMV * END OpenBLAS-0.2.20/reference/cspr2f.f000066400000000000000000000162311313527062700164530ustar00rootroot00000000000000 SUBROUTINE CSPR2F( UPLO, N, ALPHA, X, INCX, Y, INCY, AP ) * .. Scalar Arguments .. COMPLEX*8 ALPHA INTEGER INCX, INCY, N CHARACTER*1 UPLO * .. Array Arguments .. COMPLEX*8 AP( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * SSPR2 performs the symmetric rank 2 operation * * A := alpha*x*y' + alpha*y*x' + A, * * where alpha is a scalar, x and y are n element vectors and A is an * n by n symmetric matrix, supplied in packed form. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the matrix A is supplied in the packed * array AP as follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * supplied in AP. * * UPLO = 'L' or 'l' The lower triangular part of A is * supplied in AP. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX*8 . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - COMPLEX*8 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - COMPLEX*8 array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. * Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * AP - COMPLEX*8 array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). * Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) * and a( 2, 2 ) respectively, and so on. On exit, the array * AP is overwritten by the upper triangular part of the * updated matrix. * Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) * and a( 3, 1 ) respectively, and so on. On exit, the array * AP is overwritten by the lower triangular part of the * updated matrix. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX*8 ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. Local Scalars .. 
COMPLEX*8 TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( INCY.EQ.0 )THEN INFO = 7 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'SSPR2 ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Set up the start points in X and Y if the increments are not both * unity. * IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF JX = KX JY = KY END IF * * Start the operations. In this version the elements of the array AP * are accessed sequentially with one pass through AP. * KK = 1 IF( LSAME( UPLO, 'U' ) )THEN * * Form A when upper triangle is stored in AP. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 20, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( J ) TEMP2 = ALPHA*X( J ) K = KK DO 10, I = 1, J AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 K = K + 1 10 CONTINUE END IF KK = KK + J 20 CONTINUE ELSE DO 40, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( JY ) TEMP2 = ALPHA*X( JX ) IX = KX IY = KY DO 30, K = KK, KK + J - 1 AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 30 CONTINUE END IF JX = JX + INCX JY = JY + INCY KK = KK + J 40 CONTINUE END IF ELSE * * Form A when lower triangle is stored in AP. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( J ) TEMP2 = ALPHA*X( J ) K = KK DO 50, I = J, N AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 K = K + 1 50 CONTINUE END IF KK = KK + N - J + 1 60 CONTINUE ELSE DO 80, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( JY ) TEMP2 = ALPHA*X( JX ) IX = JX IY = JY DO 70, K = KK, KK + N - J AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 70 CONTINUE END IF JX = JX + INCX JY = JY + INCY KK = KK + N - J + 1 80 CONTINUE END IF END IF * RETURN * * End of SSPR2 . * END OpenBLAS-0.2.20/reference/csprf.f000066400000000000000000000147701313527062700163770ustar00rootroot00000000000000 SUBROUTINE CSPRF( UPLO, N, ALPHA, X, INCX, AP ) * * -- LAPACK auxiliary routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INCX, N COMPLEX ALPHA * .. * .. Array Arguments .. COMPLEX AP( * ), X( * ) * .. * * Purpose * ======= * * CSPR performs the symmetric rank 1 operation * * A := alpha*x*conjg( x' ) + A, * * where alpha is a complex scalar, x is an n element vector and A is an * n by n symmetric matrix, supplied in packed form. * * Arguments * ========== * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies whether the upper or lower * triangular part of the matrix A is supplied in the packed * array AP as follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * supplied in AP. * * UPLO = 'L' or 'l' The lower triangular part of A is * supplied in AP. * * Unchanged on exit. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. 
* * ALPHA (input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X (input) COMPLEX array, dimension at least * ( 1 + ( N - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the N- * element vector x. * Unchanged on exit. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * AP (input/output) COMPLEX array, dimension at least * ( ( N*( N + 1 ) )/2 ). * Before entry, with UPLO = 'U' or 'u', the array AP must * contain the upper triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) * and a( 2, 2 ) respectively, and so on. On exit, the array * AP is overwritten by the upper triangular part of the * updated matrix. * Before entry, with UPLO = 'L' or 'l', the array AP must * contain the lower triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) * and a( 3, 1 ) respectively, and so on. On exit, the array * AP is overwritten by the lower triangular part of the * updated matrix. * Note that the imaginary parts of the diagonal elements need * not be set, they are assumed to be zero, and on exit they * are set to zero. * * ===================================================================== * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, J, JX, K, KK, KX COMPLEX TEMP * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = 1 ELSE IF( N.LT.0 ) THEN INFO = 2 ELSE IF( INCX.EQ.0 ) THEN INFO = 5 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CSPR ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ) .OR. ( ALPHA.EQ.ZERO ) ) $ RETURN * * Set the start point in X if the increment is not unity. * IF( INCX.LE.0 ) THEN KX = 1 - ( N-1 )*INCX ELSE IF( INCX.NE.1 ) THEN KX = 1 END IF * * Start the operations. In this version the elements of the array AP * are accessed sequentially with one pass through AP. * KK = 1 IF( LSAME( UPLO, 'U' ) ) THEN * * Form A when upper triangle is stored in AP. * IF( INCX.EQ.1 ) THEN DO 20 J = 1, N IF( X( J ).NE.ZERO ) THEN TEMP = ALPHA*X( J ) K = KK DO 10 I = 1, J - 1 AP( K ) = AP( K ) + X( I )*TEMP K = K + 1 10 CONTINUE AP( KK+J-1 ) = AP( KK+J-1 ) + X( J )*TEMP ELSE AP( KK+J-1 ) = AP( KK+J-1 ) END IF KK = KK + J 20 CONTINUE ELSE JX = KX DO 40 J = 1, N IF( X( JX ).NE.ZERO ) THEN TEMP = ALPHA*X( JX ) IX = KX DO 30 K = KK, KK + J - 2 AP( K ) = AP( K ) + X( IX )*TEMP IX = IX + INCX 30 CONTINUE AP( KK+J-1 ) = AP( KK+J-1 ) + X( JX )*TEMP ELSE AP( KK+J-1 ) = AP( KK+J-1 ) END IF JX = JX + INCX KK = KK + J 40 CONTINUE END IF ELSE * * Form A when lower triangle is stored in AP. 
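*        (In packed lower storage, column J occupies AP( KK ) through
*        AP( KK + N - J ), with AP( KK ) holding the diagonal entry
*        a( J, J ); hence KK advances by N - J + 1 after each column.)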
* IF( INCX.EQ.1 ) THEN DO 60 J = 1, N IF( X( J ).NE.ZERO ) THEN TEMP = ALPHA*X( J ) AP( KK ) = AP( KK ) + TEMP*X( J ) K = KK + 1 DO 50 I = J + 1, N AP( K ) = AP( K ) + X( I )*TEMP K = K + 1 50 CONTINUE ELSE AP( KK ) = AP( KK ) END IF KK = KK + N - J + 1 60 CONTINUE ELSE JX = KX DO 80 J = 1, N IF( X( JX ).NE.ZERO ) THEN TEMP = ALPHA*X( JX ) AP( KK ) = AP( KK ) + TEMP*X( JX ) IX = JX DO 70 K = KK + 1, KK + N - J IX = IX + INCX AP( K ) = AP( K ) + X( IX )*TEMP 70 CONTINUE ELSE AP( KK ) = AP( KK ) END IF JX = JX + INCX KK = KK + N - J + 1 80 CONTINUE END IF END IF * RETURN * * End of CSPR * END OpenBLAS-0.2.20/reference/csrotf.f000066400000000000000000000017151313527062700165550ustar00rootroot00000000000000 subroutine csrotf (n,cx,incx,cy,incy,c,s) c c applies a plane rotation, where the cos and sin (c and s) are real c and the vectors cx and cy are complex. c jack dongarra, linpack, 3/11/78. c complex cx(1),cy(1),ctemp real c,s integer i,incx,incy,ix,iy,n c if(n.le.0)return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments not equal c to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n ctemp = c*cx(ix) + s*cy(iy) cy(iy) = c*cy(iy) - s*cx(ix) cx(ix) = ctemp ix = ix + incx iy = iy + incy 10 continue return c c code for both increments equal to 1 c 20 do 30 i = 1,n ctemp = c*cx(i) + s*cy(i) cy(i) = c*cy(i) - s*cx(i) cx(i) = ctemp 30 continue return end OpenBLAS-0.2.20/reference/csscalf.f000066400000000000000000000013101313527062700166620ustar00rootroot00000000000000 subroutine csscalf(n,sa,cx,incx) c c scales a complex vector by a real constant. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c complex cx(*) real sa integer i,incx,n,nincx c if( n.le.0 .or. incx.le.0 )return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c nincx = n*incx do 10 i = 1,nincx,incx cx(i) = cmplx(sa*real(cx(i)),sa*aimag(cx(i))) 10 continue return c c code for increment equal to 1 c 20 do 30 i = 1,n cx(i) = cmplx(sa*real(cx(i)),sa*aimag(cx(i))) 30 continue return end OpenBLAS-0.2.20/reference/cswapf.f000066400000000000000000000015141313527062700165350ustar00rootroot00000000000000 subroutine cswapf (n,cx,incx,cy,incy) c c interchanges two vectors. c jack dongarra, linpack, 3/11/78. c modified 12/3/93, array(1) declarations changed to array(*) c complex cx(*),cy(*),ctemp integer i,incx,incy,ix,iy,n c if(n.le.0)return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments not equal c to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n ctemp = cx(ix) cx(ix) = cy(iy) cy(iy) = ctemp ix = ix + incx iy = iy + incy 10 continue return c c code for both increments equal to 1 20 do 30 i = 1,n ctemp = cx(i) cx(i) = cy(i) cy(i) = ctemp 30 continue return end OpenBLAS-0.2.20/reference/csymm3mf.f000066400000000000000000000232521313527062700170130ustar00rootroot00000000000000 SUBROUTINE CSYMM3MF( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC ) * .. Scalar Arguments .. CHARACTER*1 SIDE, UPLO INTEGER M, N, LDA, LDB, LDC COMPLEX ALPHA, BETA * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) * .. * * Purpose * ======= * * CSYMM performs one of the matrix-matrix operations * * C := alpha*A*B + beta*C, * * or * * C := alpha*B*A + beta*C, * * where alpha and beta are scalars, A is a symmetric matrix and B and * C are m by n matrices. 
* * Parameters * ========== * * SIDE - CHARACTER*1. * On entry, SIDE specifies whether the symmetric matrix A * appears on the left or right in the operation as follows: * * SIDE = 'L' or 'l' C := alpha*A*B + beta*C, * * SIDE = 'R' or 'r' C := alpha*B*A + beta*C, * * Unchanged on exit. * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the symmetric matrix A is to be * referenced as follows: * * UPLO = 'U' or 'u' Only the upper triangular part of the * symmetric matrix is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of the * symmetric matrix is to be referenced. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix C. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix C. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is * m when SIDE = 'L' or 'l' and is n otherwise. * Before entry with SIDE = 'L' or 'l', the m by m part of * the array A must contain the symmetric matrix, such that * when UPLO = 'U' or 'u', the leading m by m upper triangular * part of the array A must contain the upper triangular part * of the symmetric matrix and the strictly lower triangular * part of A is not referenced, and when UPLO = 'L' or 'l', * the leading m by m lower triangular part of the array A * must contain the lower triangular part of the symmetric * matrix and the strictly upper triangular part of A is not * referenced. * Before entry with SIDE = 'R' or 'r', the n by n part of * the array A must contain the symmetric matrix, such that * when UPLO = 'U' or 'u', the leading n by n upper triangular * part of the array A must contain the upper triangular part * of the symmetric matrix and the strictly lower triangular * part of A is not referenced, and when UPLO = 'L' or 'l', * the leading n by n lower triangular part of the array A * must contain the lower triangular part of the symmetric * matrix and the strictly upper triangular part of A is not * referenced. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When SIDE = 'L' or 'l' then * LDA must be at least max( 1, m ), otherwise LDA must be at * least max( 1, n ). * Unchanged on exit. * * B - COMPLEX array of DIMENSION ( LDB, n ). * Before entry, the leading m by n part of the array B must * contain the matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. LDB must be at least * max( 1, m ). * Unchanged on exit. * * BETA - COMPLEX . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then C need not be set on input. * Unchanged on exit. * * C - COMPLEX array of DIMENSION ( LDC, n ). * Before entry, the leading m by n part of the array C must * contain the matrix C, except when beta is zero, in which * case C need not be set on entry. * On exit, the array C is overwritten by the m by n updated * matrix. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. 
* Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. Local Scalars .. LOGICAL UPPER INTEGER I, INFO, J, K, NROWA COMPLEX TEMP1, TEMP2 * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Executable Statements .. * * Set NROWA as the number of rows of A. * IF( LSAME( SIDE, 'L' ) )THEN NROWA = M ELSE NROWA = N END IF UPPER = LSAME( UPLO, 'U' ) * * Test the input parameters. * INFO = 0 IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN INFO = 2 ELSE IF( M .LT.0 )THEN INFO = 3 ELSE IF( N .LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 7 ELSE IF( LDB.LT.MAX( 1, M ) )THEN INFO = 9 ELSE IF( LDC.LT.MAX( 1, M ) )THEN INFO = 12 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CSYMM ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN IF( BETA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, M C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40, J = 1, N DO 30, I = 1, M C( I, J ) = BETA*C( I, J ) 30 CONTINUE 40 CONTINUE END IF RETURN END IF * * Start the operations. * IF( LSAME( SIDE, 'L' ) )THEN * * Form C := alpha*A*B + beta*C. * IF( UPPER )THEN DO 70, J = 1, N DO 60, I = 1, M TEMP1 = ALPHA*B( I, J ) TEMP2 = ZERO DO 50, K = 1, I - 1 C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) TEMP2 = TEMP2 + B( K, J )*A( K, I ) 50 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ TEMP1*A( I, I ) + ALPHA*TEMP2 END IF 60 CONTINUE 70 CONTINUE ELSE DO 100, J = 1, N DO 90, I = M, 1, -1 TEMP1 = ALPHA*B( I, J ) TEMP2 = ZERO DO 80, K = I + 1, M C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) TEMP2 = TEMP2 + B( K, J )*A( K, I ) 80 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ TEMP1*A( I, I ) + ALPHA*TEMP2 END IF 90 CONTINUE 100 CONTINUE END IF ELSE * * Form C := alpha*B*A + beta*C. * DO 170, J = 1, N TEMP1 = ALPHA*A( J, J ) IF( BETA.EQ.ZERO )THEN DO 110, I = 1, M C( I, J ) = TEMP1*B( I, J ) 110 CONTINUE ELSE DO 120, I = 1, M C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) 120 CONTINUE END IF DO 140, K = 1, J - 1 IF( UPPER )THEN TEMP1 = ALPHA*A( K, J ) ELSE TEMP1 = ALPHA*A( J, K ) END IF DO 130, I = 1, M C( I, J ) = C( I, J ) + TEMP1*B( I, K ) 130 CONTINUE 140 CONTINUE DO 160, K = J + 1, N IF( UPPER )THEN TEMP1 = ALPHA*A( J, K ) ELSE TEMP1 = ALPHA*A( K, J ) END IF DO 150, I = 1, M C( I, J ) = C( I, J ) + TEMP1*B( I, K ) 150 CONTINUE 160 CONTINUE 170 CONTINUE END IF * RETURN * * End of CSYMM . * END OpenBLAS-0.2.20/reference/csymmf.f000066400000000000000000000232511313527062700165520ustar00rootroot00000000000000 SUBROUTINE CSYMMF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC ) * .. Scalar Arguments .. CHARACTER*1 SIDE, UPLO INTEGER M, N, LDA, LDB, LDC COMPLEX ALPHA, BETA * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) * .. * * Purpose * ======= * * CSYMM performs one of the matrix-matrix operations * * C := alpha*A*B + beta*C, * * or * * C := alpha*B*A + beta*C, * * where alpha and beta are scalars, A is a symmetric matrix and B and * C are m by n matrices. 
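*
*  For example (illustrative only), the call
*
*     CALL CSYMM( 'L', 'U', M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC )
*
*  computes C := alpha*A*B + beta*C with the m by m symmetric matrix A
*  referenced through its upper triangle.
*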
* * Parameters * ========== * * SIDE - CHARACTER*1. * On entry, SIDE specifies whether the symmetric matrix A * appears on the left or right in the operation as follows: * * SIDE = 'L' or 'l' C := alpha*A*B + beta*C, * * SIDE = 'R' or 'r' C := alpha*B*A + beta*C, * * Unchanged on exit. * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the symmetric matrix A is to be * referenced as follows: * * UPLO = 'U' or 'u' Only the upper triangular part of the * symmetric matrix is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of the * symmetric matrix is to be referenced. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix C. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix C. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is * m when SIDE = 'L' or 'l' and is n otherwise. * Before entry with SIDE = 'L' or 'l', the m by m part of * the array A must contain the symmetric matrix, such that * when UPLO = 'U' or 'u', the leading m by m upper triangular * part of the array A must contain the upper triangular part * of the symmetric matrix and the strictly lower triangular * part of A is not referenced, and when UPLO = 'L' or 'l', * the leading m by m lower triangular part of the array A * must contain the lower triangular part of the symmetric * matrix and the strictly upper triangular part of A is not * referenced. * Before entry with SIDE = 'R' or 'r', the n by n part of * the array A must contain the symmetric matrix, such that * when UPLO = 'U' or 'u', the leading n by n upper triangular * part of the array A must contain the upper triangular part * of the symmetric matrix and the strictly lower triangular * part of A is not referenced, and when UPLO = 'L' or 'l', * the leading n by n lower triangular part of the array A * must contain the lower triangular part of the symmetric * matrix and the strictly upper triangular part of A is not * referenced. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When SIDE = 'L' or 'l' then * LDA must be at least max( 1, m ), otherwise LDA must be at * least max( 1, n ). * Unchanged on exit. * * B - COMPLEX array of DIMENSION ( LDB, n ). * Before entry, the leading m by n part of the array B must * contain the matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. LDB must be at least * max( 1, m ). * Unchanged on exit. * * BETA - COMPLEX . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then C need not be set on input. * Unchanged on exit. * * C - COMPLEX array of DIMENSION ( LDC, n ). * Before entry, the leading m by n part of the array C must * contain the matrix C, except when beta is zero, in which * case C need not be set on entry. * On exit, the array C is overwritten by the m by n updated * matrix. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. 
* Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. Local Scalars .. LOGICAL UPPER INTEGER I, INFO, J, K, NROWA COMPLEX TEMP1, TEMP2 * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Executable Statements .. * * Set NROWA as the number of rows of A. * IF( LSAME( SIDE, 'L' ) )THEN NROWA = M ELSE NROWA = N END IF UPPER = LSAME( UPLO, 'U' ) * * Test the input parameters. * INFO = 0 IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN INFO = 2 ELSE IF( M .LT.0 )THEN INFO = 3 ELSE IF( N .LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 7 ELSE IF( LDB.LT.MAX( 1, M ) )THEN INFO = 9 ELSE IF( LDC.LT.MAX( 1, M ) )THEN INFO = 12 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CSYMM ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN IF( BETA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, M C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40, J = 1, N DO 30, I = 1, M C( I, J ) = BETA*C( I, J ) 30 CONTINUE 40 CONTINUE END IF RETURN END IF * * Start the operations. * IF( LSAME( SIDE, 'L' ) )THEN * * Form C := alpha*A*B + beta*C. * IF( UPPER )THEN DO 70, J = 1, N DO 60, I = 1, M TEMP1 = ALPHA*B( I, J ) TEMP2 = ZERO DO 50, K = 1, I - 1 C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) TEMP2 = TEMP2 + B( K, J )*A( K, I ) 50 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ TEMP1*A( I, I ) + ALPHA*TEMP2 END IF 60 CONTINUE 70 CONTINUE ELSE DO 100, J = 1, N DO 90, I = M, 1, -1 TEMP1 = ALPHA*B( I, J ) TEMP2 = ZERO DO 80, K = I + 1, M C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) TEMP2 = TEMP2 + B( K, J )*A( K, I ) 80 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ TEMP1*A( I, I ) + ALPHA*TEMP2 END IF 90 CONTINUE 100 CONTINUE END IF ELSE * * Form C := alpha*B*A + beta*C. * DO 170, J = 1, N TEMP1 = ALPHA*A( J, J ) IF( BETA.EQ.ZERO )THEN DO 110, I = 1, M C( I, J ) = TEMP1*B( I, J ) 110 CONTINUE ELSE DO 120, I = 1, M C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) 120 CONTINUE END IF DO 140, K = 1, J - 1 IF( UPPER )THEN TEMP1 = ALPHA*A( K, J ) ELSE TEMP1 = ALPHA*A( J, K ) END IF DO 130, I = 1, M C( I, J ) = C( I, J ) + TEMP1*B( I, K ) 130 CONTINUE 140 CONTINUE DO 160, K = J + 1, N IF( UPPER )THEN TEMP1 = ALPHA*A( J, K ) ELSE TEMP1 = ALPHA*A( K, J ) END IF DO 150, I = 1, M C( I, J ) = C( I, J ) + TEMP1*B( I, K ) 150 CONTINUE 160 CONTINUE 170 CONTINUE END IF * RETURN * * End of CSYMM . * END OpenBLAS-0.2.20/reference/csymvf.f000066400000000000000000000174771313527062700166000ustar00rootroot00000000000000 SUBROUTINE CSYMVF(UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY ) * * -- LAPACK auxiliary routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INCX, INCY, LDA, N COMPLEX ALPHA, BETA * .. * .. Array Arguments .. COMPLEX A( LDA, * ), X( * ), Y( * ) * .. 
* * Purpose * ======= * * CSYMV performs the matrix-vector operation * * y := alpha*A*x + beta*y, * * where alpha and beta are scalars, x and y are n element vectors and * A is an n by n symmetric matrix. * * Arguments * ========== * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies whether the upper or lower * triangular part of the array A is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of A * is to be referenced. * * Unchanged on exit. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA (input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A (input) COMPLEX array, dimension ( LDA, N ) * Before entry, with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular part of the symmetric matrix and the strictly * lower triangular part of A is not referenced. * Before entry, with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular part of the symmetric matrix and the strictly * upper triangular part of A is not referenced. * Unchanged on exit. * * LDA (input) INTEGER * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, N ). * Unchanged on exit. * * X (input) COMPLEX array, dimension at least * ( 1 + ( N - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the N- * element vector x. * Unchanged on exit. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA (input) COMPLEX * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y (input/output) COMPLEX array, dimension at least * ( 1 + ( N - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. On exit, Y is overwritten by the updated * vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * ===================================================================== * * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY COMPLEX TEMP1, TEMP2 * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = 1 ELSE IF( N.LT.0 ) THEN INFO = 2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = 5 ELSE IF( INCX.EQ.0 ) THEN INFO = 7 ELSE IF( INCY.EQ.0 ) THEN INFO = 10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CSYMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ) .OR. ( ( ALPHA.EQ.ZERO ) .AND. ( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 ) THEN KX = 1 ELSE KX = 1 - ( N-1 )*INCX END IF IF( INCY.GT.0 ) THEN KY = 1 ELSE KY = 1 - ( N-1 )*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. 
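*
*     In outline, the code below realises the element-wise update
*
*        y( i ) = beta*y( i )
*               + alpha*sum( a( i, j )*x( j ), j = 1, n ),
*
*     where each stored element of the selected triangle supplies both
*     a( i, j ) and a( j, i ) by symmetry, accumulated through the
*     temporaries TEMP1 and TEMP2.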
* * First form y := beta*y. * IF( BETA.NE.ONE ) THEN IF( INCY.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN DO 10 I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20 I = 1, N Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO ) THEN DO 30 I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40 I = 1, N Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN IF( LSAME( UPLO, 'U' ) ) THEN * * Form y when A is stored in upper triangle. * IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN DO 60 J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO DO 50 I = 1, J - 1 Y( I ) = Y( I ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( I ) 50 CONTINUE Y( J ) = Y( J ) + TEMP1*A( J, J ) + ALPHA*TEMP2 60 CONTINUE ELSE JX = KX JY = KY DO 80 J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY DO 70 I = 1, J - 1 Y( IY ) = Y( IY ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( IX ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 80 CONTINUE END IF ELSE * * Form y when A is stored in lower triangle. * IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN DO 100 J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*A( J, J ) DO 90 I = J + 1, N Y( I ) = Y( I ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( I ) 90 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 100 CONTINUE ELSE JX = KX JY = KY DO 120 J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*A( J, J ) IX = JX IY = JY DO 110 I = J + 1, N IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( IX ) 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 120 CONTINUE END IF END IF * RETURN * * End of CSYMV * END OpenBLAS-0.2.20/reference/csyr2f.f000066400000000000000000000163031313527062700164640ustar00rootroot00000000000000 SUBROUTINE CSYR2F ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA ) * .. Scalar Arguments .. COMPLEX*8 ALPHA INTEGER INCX, INCY, LDA, N CHARACTER*1 UPLO * .. Array Arguments .. COMPLEX*8 A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * SSYR2 performs the symmetric rank 2 operation * * A := alpha*x*y' + alpha*y*x' + A, * * where alpha is a scalar, x and y are n element vectors and A is an n * by n symmetric matrix. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array A is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of A * is to be referenced. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - REAL . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. * Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * A - REAL array of DIMENSION ( LDA, n ). 
* Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular part of the symmetric matrix and the strictly * lower triangular part of A is not referenced. On exit, the * upper triangular part of the array A is overwritten by the * upper triangular part of the updated matrix. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular part of the symmetric matrix and the strictly * upper triangular part of A is not referenced. On exit, the * lower triangular part of the array A is overwritten by the * lower triangular part of the updated matrix. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX*8 ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. Local Scalars .. COMPLEX*8 TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( INCY.EQ.0 )THEN INFO = 7 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'SSYR2 ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Set up the start points in X and Y if the increments are not both * unity. * IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF JX = KX JY = KY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * IF( LSAME( UPLO, 'U' ) )THEN * * Form A when A is stored in the upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 20, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( J ) TEMP2 = ALPHA*X( J ) DO 10, I = 1, J A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 10 CONTINUE END IF 20 CONTINUE ELSE DO 40, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( JY ) TEMP2 = ALPHA*X( JX ) IX = KX IY = KY DO 30, I = 1, J A( I, J ) = A( I, J ) + X( IX )*TEMP1 $ + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 30 CONTINUE END IF JX = JX + INCX JY = JY + INCY 40 CONTINUE END IF ELSE * * Form A when A is stored in the lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( J ) TEMP2 = ALPHA*X( J ) DO 50, I = J, N A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 50 CONTINUE END IF 60 CONTINUE ELSE DO 80, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( JY ) TEMP2 = ALPHA*X( JX ) IX = JX IY = JY DO 70, I = J, N A( I, J ) = A( I, J ) + X( IX )*TEMP1 $ + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 70 CONTINUE END IF JX = JX + INCX JY = JY + INCY 80 CONTINUE END IF END IF * RETURN * * End of SSYR2 . 
* END OpenBLAS-0.2.20/reference/csyr2kf.f000066400000000000000000000252521313527062700166420ustar00rootroot00000000000000 SUBROUTINE CSYR2KF( UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC ) * .. Scalar Arguments .. CHARACTER*1 UPLO, TRANS INTEGER N, K, LDA, LDB, LDC COMPLEX ALPHA, BETA * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) * .. * * Purpose * ======= * * CSYR2K performs one of the symmetric rank 2k operations * * C := alpha*A*B' + alpha*B*A' + beta*C, * * or * * C := alpha*A'*B + alpha*B'*A + beta*C, * * where alpha and beta are scalars, C is an n by n symmetric matrix * and A and B are n by k matrices in the first case and k by n * matrices in the second case. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array C is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of C * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of C * is to be referenced. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' C := alpha*A*B' + alpha*B*A' + * beta*C. * * TRANS = 'T' or 't' C := alpha*A'*B + alpha*B'*A + * beta*C. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix C. N must be * at least zero. * Unchanged on exit. * * K - INTEGER. * On entry with TRANS = 'N' or 'n', K specifies the number * of columns of the matrices A and B, and on entry with * TRANS = 'T' or 't', K specifies the number of rows of the * matrices A and B. K must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is * k when TRANS = 'N' or 'n', and is n otherwise. * Before entry with TRANS = 'N' or 'n', the leading n by k * part of the array A must contain the matrix A, otherwise * the leading k by n part of the array A must contain the * matrix A. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When TRANS = 'N' or 'n' * then LDA must be at least max( 1, n ), otherwise LDA must * be at least max( 1, k ). * Unchanged on exit. * * B - COMPLEX array of DIMENSION ( LDB, kb ), where kb is * k when TRANS = 'N' or 'n', and is n otherwise. * Before entry with TRANS = 'N' or 'n', the leading n by k * part of the array B must contain the matrix B, otherwise * the leading k by n part of the array B must contain the * matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. When TRANS = 'N' or 'n' * then LDB must be at least max( 1, n ), otherwise LDB must * be at least max( 1, k ). * Unchanged on exit. * * BETA - COMPLEX . * On entry, BETA specifies the scalar beta. * Unchanged on exit. * * C - COMPLEX array of DIMENSION ( LDC, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array C must contain the upper * triangular part of the symmetric matrix and the strictly * lower triangular part of C is not referenced. On exit, the * upper triangular part of the array C is overwritten by the * upper triangular part of the updated matrix. 
* Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array C must contain the lower * triangular part of the symmetric matrix and the strictly * upper triangular part of C is not referenced. On exit, the * lower triangular part of the array C is overwritten by the * lower triangular part of the updated matrix. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, n ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. Local Scalars .. LOGICAL UPPER INTEGER I, INFO, J, L, NROWA COMPLEX TEMP1, TEMP2 * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Executable Statements .. * * Test the input parameters. * IF( LSAME( TRANS, 'N' ) )THEN NROWA = N ELSE NROWA = K END IF UPPER = LSAME( UPLO, 'U' ) * INFO = 0 IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. $ ( .NOT.LSAME( TRANS, 'T' ) ) )THEN INFO = 2 ELSE IF( N .LT.0 )THEN INFO = 3 ELSE IF( K .LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 7 ELSE IF( LDB.LT.MAX( 1, NROWA ) )THEN INFO = 9 ELSE IF( LDC.LT.MAX( 1, N ) )THEN INFO = 12 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CSYR2K', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR. $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN IF( UPPER )THEN IF( BETA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, J C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40, J = 1, N DO 30, I = 1, J C( I, J ) = BETA*C( I, J ) 30 CONTINUE 40 CONTINUE END IF ELSE IF( BETA.EQ.ZERO )THEN DO 60, J = 1, N DO 50, I = J, N C( I, J ) = ZERO 50 CONTINUE 60 CONTINUE ELSE DO 80, J = 1, N DO 70, I = J, N C( I, J ) = BETA*C( I, J ) 70 CONTINUE 80 CONTINUE END IF END IF RETURN END IF * * Start the operations. * IF( LSAME( TRANS, 'N' ) )THEN * * Form C := alpha*A*B' + alpha*B*A' + C. * IF( UPPER )THEN DO 130, J = 1, N IF( BETA.EQ.ZERO )THEN DO 90, I = 1, J C( I, J ) = ZERO 90 CONTINUE ELSE IF( BETA.NE.ONE )THEN DO 100, I = 1, J C( I, J ) = BETA*C( I, J ) 100 CONTINUE END IF DO 120, L = 1, K IF( ( A( J, L ).NE.ZERO ).OR. $ ( B( J, L ).NE.ZERO ) )THEN TEMP1 = ALPHA*B( J, L ) TEMP2 = ALPHA*A( J, L ) DO 110, I = 1, J C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + $ B( I, L )*TEMP2 110 CONTINUE END IF 120 CONTINUE 130 CONTINUE ELSE DO 180, J = 1, N IF( BETA.EQ.ZERO )THEN DO 140, I = J, N C( I, J ) = ZERO 140 CONTINUE ELSE IF( BETA.NE.ONE )THEN DO 150, I = J, N C( I, J ) = BETA*C( I, J ) 150 CONTINUE END IF DO 170, L = 1, K IF( ( A( J, L ).NE.ZERO ).OR. $ ( B( J, L ).NE.ZERO ) )THEN TEMP1 = ALPHA*B( J, L ) TEMP2 = ALPHA*A( J, L ) DO 160, I = J, N C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + $ B( I, L )*TEMP2 160 CONTINUE END IF 170 CONTINUE 180 CONTINUE END IF ELSE * * Form C := alpha*A'*B + alpha*B'*A + C. 
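*
*        In outline, for each referenced element of C the loops below
*        compute
*
*           C( i, j ) = beta*C( i, j )
*                     + alpha*sum( A( l, i )*B( l, j ), l = 1, k )
*                     + alpha*sum( B( l, i )*A( l, j ), l = 1, k ),
*
*        with the two inner products accumulated in TEMP1 and TEMP2.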
* IF( UPPER )THEN DO 210, J = 1, N DO 200, I = 1, J TEMP1 = ZERO TEMP2 = ZERO DO 190, L = 1, K TEMP1 = TEMP1 + A( L, I )*B( L, J ) TEMP2 = TEMP2 + B( L, I )*A( L, J ) 190 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ ALPHA*TEMP1 + ALPHA*TEMP2 END IF 200 CONTINUE 210 CONTINUE ELSE DO 240, J = 1, N DO 230, I = J, N TEMP1 = ZERO TEMP2 = ZERO DO 220, L = 1, K TEMP1 = TEMP1 + A( L, I )*B( L, J ) TEMP2 = TEMP2 + B( L, I )*A( L, J ) 220 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ ALPHA*TEMP1 + ALPHA*TEMP2 END IF 230 CONTINUE 240 CONTINUE END IF END IF * RETURN * * End of CSYR2K. * END OpenBLAS-0.2.20/reference/csyrf.f000066400000000000000000000135341313527062700164050ustar00rootroot00000000000000 SUBROUTINE CSYRF( UPLO, N, ALPHA, X, INCX, A, LDA ) * * -- LAPACK auxiliary routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INCX, LDA, N COMPLEX ALPHA * .. * .. Array Arguments .. COMPLEX A( LDA, * ), X( * ) * .. * * Purpose * ======= * * CSYR performs the symmetric rank 1 operation * * A := alpha*x*( x' ) + A, * * where alpha is a complex scalar, x is an n element vector and A is an * n by n symmetric matrix. * * Arguments * ========== * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies whether the upper or lower * triangular part of the array A is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of A * is to be referenced. * * Unchanged on exit. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA (input) COMPLEX * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X (input) COMPLEX array, dimension at least * ( 1 + ( N - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the N- * element vector x. * Unchanged on exit. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * A (input/output) COMPLEX array, dimension ( LDA, N ) * Before entry, with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular part of the symmetric matrix and the strictly * lower triangular part of A is not referenced. On exit, the * upper triangular part of the array A is overwritten by the * upper triangular part of the updated matrix. * Before entry, with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular part of the symmetric matrix and the strictly * upper triangular part of A is not referenced. On exit, the * lower triangular part of the array A is overwritten by the * lower triangular part of the updated matrix. * * LDA (input) INTEGER * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, N ). * Unchanged on exit. * * ===================================================================== * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, J, JX, KX COMPLEX TEMP * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. 
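*
*     A minimal illustration of the calling sequence, assuming
*     N .GE. 1, a unit stride vector X of length N and an N by N
*     array A with leading dimension N (these names are placeholders):
*
*        CALL CSYRF( 'U', N, ALPHA, X, 1, A, N )
*
*     overwrites the upper triangle of A with the corresponding part
*     of alpha*x*x' + A.
*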
* .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = 1 ELSE IF( N.LT.0 ) THEN INFO = 2 ELSE IF( INCX.EQ.0 ) THEN INFO = 5 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = 7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CSYR ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ) .OR. ( ALPHA.EQ.ZERO ) ) $ RETURN * * Set the start point in X if the increment is not unity. * IF( INCX.LE.0 ) THEN KX = 1 - ( N-1 )*INCX ELSE IF( INCX.NE.1 ) THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * IF( LSAME( UPLO, 'U' ) ) THEN * * Form A when A is stored in upper triangle. * IF( INCX.EQ.1 ) THEN DO 20 J = 1, N IF( X( J ).NE.ZERO ) THEN TEMP = ALPHA*X( J ) DO 10 I = 1, J A( I, J ) = A( I, J ) + X( I )*TEMP 10 CONTINUE END IF 20 CONTINUE ELSE JX = KX DO 40 J = 1, N IF( X( JX ).NE.ZERO ) THEN TEMP = ALPHA*X( JX ) IX = KX DO 30 I = 1, J A( I, J ) = A( I, J ) + X( IX )*TEMP IX = IX + INCX 30 CONTINUE END IF JX = JX + INCX 40 CONTINUE END IF ELSE * * Form A when A is stored in lower triangle. * IF( INCX.EQ.1 ) THEN DO 60 J = 1, N IF( X( J ).NE.ZERO ) THEN TEMP = ALPHA*X( J ) DO 50 I = J, N A( I, J ) = A( I, J ) + X( I )*TEMP 50 CONTINUE END IF 60 CONTINUE ELSE JX = KX DO 80 J = 1, N IF( X( JX ).NE.ZERO ) THEN TEMP = ALPHA*X( JX ) IX = JX DO 70 I = J, N A( I, J ) = A( I, J ) + X( IX )*TEMP IX = IX + INCX 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE END IF END IF * RETURN * * End of CSYR * END OpenBLAS-0.2.20/reference/csyrkf.f000066400000000000000000000221071313527062700165540ustar00rootroot00000000000000 SUBROUTINE CSYRKF ( UPLO, TRANS, N, K, ALPHA, A, LDA, $ BETA, C, LDC ) * .. Scalar Arguments .. CHARACTER*1 UPLO, TRANS INTEGER N, K, LDA, LDC COMPLEX ALPHA, BETA * .. Array Arguments .. COMPLEX A( LDA, * ), C( LDC, * ) * .. * * Purpose * ======= * * CSYRK performs one of the symmetric rank k operations * * C := alpha*A*A' + beta*C, * * or * * C := alpha*A'*A + beta*C, * * where alpha and beta are scalars, C is an n by n symmetric matrix * and A is an n by k matrix in the first case and a k by n matrix * in the second case. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array C is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of C * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of C * is to be referenced. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' C := alpha*A*A' + beta*C. * * TRANS = 'T' or 't' C := alpha*A'*A + beta*C. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix C. N must be * at least zero. * Unchanged on exit. * * K - INTEGER. * On entry with TRANS = 'N' or 'n', K specifies the number * of columns of the matrix A, and on entry with * TRANS = 'T' or 't', K specifies the number of rows of the * matrix A. K must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is * k when TRANS = 'N' or 'n', and is n otherwise. 
* Before entry with TRANS = 'N' or 'n', the leading n by k * part of the array A must contain the matrix A, otherwise * the leading k by n part of the array A must contain the * matrix A. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When TRANS = 'N' or 'n' * then LDA must be at least max( 1, n ), otherwise LDA must * be at least max( 1, k ). * Unchanged on exit. * * BETA - COMPLEX . * On entry, BETA specifies the scalar beta. * Unchanged on exit. * * C - COMPLEX array of DIMENSION ( LDC, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array C must contain the upper * triangular part of the symmetric matrix and the strictly * lower triangular part of C is not referenced. On exit, the * upper triangular part of the array C is overwritten by the * upper triangular part of the updated matrix. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array C must contain the lower * triangular part of the symmetric matrix and the strictly * upper triangular part of C is not referenced. On exit, the * lower triangular part of the array C is overwritten by the * lower triangular part of the updated matrix. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, n ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. Local Scalars .. LOGICAL UPPER INTEGER I, INFO, J, L, NROWA COMPLEX TEMP * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Executable Statements .. * * Test the input parameters. * IF( LSAME( TRANS, 'N' ) )THEN NROWA = N ELSE NROWA = K END IF UPPER = LSAME( UPLO, 'U' ) * INFO = 0 IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. $ ( .NOT.LSAME( TRANS, 'T' ) ) )THEN INFO = 2 ELSE IF( N .LT.0 )THEN INFO = 3 ELSE IF( K .LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 7 ELSE IF( LDC.LT.MAX( 1, N ) )THEN INFO = 10 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CSYRK ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR. $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN IF( UPPER )THEN IF( BETA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, J C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40, J = 1, N DO 30, I = 1, J C( I, J ) = BETA*C( I, J ) 30 CONTINUE 40 CONTINUE END IF ELSE IF( BETA.EQ.ZERO )THEN DO 60, J = 1, N DO 50, I = J, N C( I, J ) = ZERO 50 CONTINUE 60 CONTINUE ELSE DO 80, J = 1, N DO 70, I = J, N C( I, J ) = BETA*C( I, J ) 70 CONTINUE 80 CONTINUE END IF END IF RETURN END IF * * Start the operations. * IF( LSAME( TRANS, 'N' ) )THEN * * Form C := alpha*A*A' + beta*C. 
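*
*        In outline, for each referenced element of C the loops below
*        compute
*
*           C( i, j ) = beta*C( i, j )
*                     + alpha*sum( A( i, l )*A( j, l ), l = 1, k ),
*
*        organised so that each column of A contributes a rank-one
*        update to the selected triangle of C.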
* IF( UPPER )THEN DO 130, J = 1, N IF( BETA.EQ.ZERO )THEN DO 90, I = 1, J C( I, J ) = ZERO 90 CONTINUE ELSE IF( BETA.NE.ONE )THEN DO 100, I = 1, J C( I, J ) = BETA*C( I, J ) 100 CONTINUE END IF DO 120, L = 1, K IF( A( J, L ).NE.ZERO )THEN TEMP = ALPHA*A( J, L ) DO 110, I = 1, J C( I, J ) = C( I, J ) + TEMP*A( I, L ) 110 CONTINUE END IF 120 CONTINUE 130 CONTINUE ELSE DO 180, J = 1, N IF( BETA.EQ.ZERO )THEN DO 140, I = J, N C( I, J ) = ZERO 140 CONTINUE ELSE IF( BETA.NE.ONE )THEN DO 150, I = J, N C( I, J ) = BETA*C( I, J ) 150 CONTINUE END IF DO 170, L = 1, K IF( A( J, L ).NE.ZERO )THEN TEMP = ALPHA*A( J, L ) DO 160, I = J, N C( I, J ) = C( I, J ) + TEMP*A( I, L ) 160 CONTINUE END IF 170 CONTINUE 180 CONTINUE END IF ELSE * * Form C := alpha*A'*A + beta*C. * IF( UPPER )THEN DO 210, J = 1, N DO 200, I = 1, J TEMP = ZERO DO 190, L = 1, K TEMP = TEMP + A( L, I )*A( L, J ) 190 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = ALPHA*TEMP ELSE C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) END IF 200 CONTINUE 210 CONTINUE ELSE DO 240, J = 1, N DO 230, I = J, N TEMP = ZERO DO 220, L = 1, K TEMP = TEMP + A( L, I )*A( L, J ) 220 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = ALPHA*TEMP ELSE C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) END IF 230 CONTINUE 240 CONTINUE END IF END IF * RETURN * * End of CSYRK . * END OpenBLAS-0.2.20/reference/ctbmvf.f000066400000000000000000000312321313527062700165330ustar00rootroot00000000000000 SUBROUTINE CTBMVF( UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, K, LDA, N CHARACTER*1 DIAG, TRANS, UPLO * .. Array Arguments .. COMPLEX A( LDA, * ), X( * ) * .. * * Purpose * ======= * * CTBMV performs one of the matrix-vector operations * * x := A*x, or x := A'*x, or x := conjg( A' )*x, * * where x is an n element vector and A is an n by n unit, or non-unit, * upper or lower triangular band matrix, with ( k + 1 ) diagonals. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' x := A*x. * * TRANS = 'T' or 't' x := A'*x. * * TRANS = 'C' or 'c' x := conjg( A' )*x. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * K - INTEGER. * On entry with UPLO = 'U' or 'u', K specifies the number of * super-diagonals of the matrix A. * On entry with UPLO = 'L' or 'l', K specifies the number of * sub-diagonals of the matrix A. * K must satisfy 0 .le. K. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) * by n part of the array A must contain the upper triangular * band part of the matrix of coefficients, supplied column by * column, with the leading diagonal of the matrix in row * ( k + 1 ) of the array, the first super-diagonal starting at * position 2 in row k, and so on. The top left k by k triangle * of the array A is not referenced. 
* The following program segment will transfer an upper * triangular band matrix from conventional full matrix storage * to band storage: * * DO 20, J = 1, N * M = K + 1 - J * DO 10, I = MAX( 1, J - K ), J * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) * by n part of the array A must contain the lower triangular * band part of the matrix of coefficients, supplied column by * column, with the leading diagonal of the matrix in row 1 of * the array, the first sub-diagonal starting at position 1 in * row 2, and so on. The bottom right k by k triangle of the * array A is not referenced. * The following program segment will transfer a lower * triangular band matrix from conventional full matrix storage * to band storage: * * DO 20, J = 1, N * M = 1 - J * DO 10, I = J, MIN( N, J + K ) * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Note that when DIAG = 'U' or 'u' the elements of the array A * corresponding to the diagonal elements of the matrix are not * referenced, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * ( k + 1 ). * Unchanged on exit. * * X - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the * tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. Local Scalars .. COMPLEX TEMP INTEGER I, INFO, IX, J, JX, KPLUS1, KX, L LOGICAL NOCONJ, NOUNIT * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC CONJG, MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( K.LT.0 )THEN INFO = 5 ELSE IF( LDA.LT.( K + 1 ) )THEN INFO = 7 ELSE IF( INCX.EQ.0 )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CTBMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * NOCONJ = LSAME( TRANS, 'T' ) NOUNIT = LSAME( DIAG , 'N' ) * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * IF( LSAME( TRANS, 'N' ) )THEN * * Form x := A*x. 
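*
*        In outline, with the band storage described above the stored
*        entry A( l + i, j ) holds matrix element a( i, j ), where
*        l = k + 1 - j for the upper case and l = 1 - j for the lower
*        case, so the loops below form
*
*           x( i ) = sum( a( i, j )*x( j ) )
*
*        over the stored diagonals only, taking the diagonal of A as
*        unity when DIAG = 'U'.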
* IF( LSAME( UPLO, 'U' ) )THEN KPLUS1 = K + 1 IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = X( J ) L = KPLUS1 - J DO 10, I = MAX( 1, J - K ), J - 1 X( I ) = X( I ) + TEMP*A( L + I, J ) 10 CONTINUE IF( NOUNIT ) $ X( J ) = X( J )*A( KPLUS1, J ) END IF 20 CONTINUE ELSE JX = KX DO 40, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX L = KPLUS1 - J DO 30, I = MAX( 1, J - K ), J - 1 X( IX ) = X( IX ) + TEMP*A( L + I, J ) IX = IX + INCX 30 CONTINUE IF( NOUNIT ) $ X( JX ) = X( JX )*A( KPLUS1, J ) END IF JX = JX + INCX IF( J.GT.K ) $ KX = KX + INCX 40 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 60, J = N, 1, -1 IF( X( J ).NE.ZERO )THEN TEMP = X( J ) L = 1 - J DO 50, I = MIN( N, J + K ), J + 1, -1 X( I ) = X( I ) + TEMP*A( L + I, J ) 50 CONTINUE IF( NOUNIT ) $ X( J ) = X( J )*A( 1, J ) END IF 60 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 80, J = N, 1, -1 IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX L = 1 - J DO 70, I = MIN( N, J + K ), J + 1, -1 X( IX ) = X( IX ) + TEMP*A( L + I, J ) IX = IX - INCX 70 CONTINUE IF( NOUNIT ) $ X( JX ) = X( JX )*A( 1, J ) END IF JX = JX - INCX IF( ( N - J ).GE.K ) $ KX = KX - INCX 80 CONTINUE END IF END IF ELSE * * Form x := A'*x or x := conjg( A' )*x. * IF( LSAME( UPLO, 'U' ) )THEN KPLUS1 = K + 1 IF( INCX.EQ.1 )THEN DO 110, J = N, 1, -1 TEMP = X( J ) L = KPLUS1 - J IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*A( KPLUS1, J ) DO 90, I = J - 1, MAX( 1, J - K ), -1 TEMP = TEMP + A( L + I, J )*X( I ) 90 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*CONJG( A( KPLUS1, J ) ) DO 100, I = J - 1, MAX( 1, J - K ), -1 TEMP = TEMP + CONJG( A( L + I, J ) )*X( I ) 100 CONTINUE END IF X( J ) = TEMP 110 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 140, J = N, 1, -1 TEMP = X( JX ) KX = KX - INCX IX = KX L = KPLUS1 - J IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*A( KPLUS1, J ) DO 120, I = J - 1, MAX( 1, J - K ), -1 TEMP = TEMP + A( L + I, J )*X( IX ) IX = IX - INCX 120 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*CONJG( A( KPLUS1, J ) ) DO 130, I = J - 1, MAX( 1, J - K ), -1 TEMP = TEMP + CONJG( A( L + I, J ) )*X( IX ) IX = IX - INCX 130 CONTINUE END IF X( JX ) = TEMP JX = JX - INCX 140 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 170, J = 1, N TEMP = X( J ) L = 1 - J IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*A( 1, J ) DO 150, I = J + 1, MIN( N, J + K ) TEMP = TEMP + A( L + I, J )*X( I ) 150 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*CONJG( A( 1, J ) ) DO 160, I = J + 1, MIN( N, J + K ) TEMP = TEMP + CONJG( A( L + I, J ) )*X( I ) 160 CONTINUE END IF X( J ) = TEMP 170 CONTINUE ELSE JX = KX DO 200, J = 1, N TEMP = X( JX ) KX = KX + INCX IX = KX L = 1 - J IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*A( 1, J ) DO 180, I = J + 1, MIN( N, J + K ) TEMP = TEMP + A( L + I, J )*X( IX ) IX = IX + INCX 180 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*CONJG( A( 1, J ) ) DO 190, I = J + 1, MIN( N, J + K ) TEMP = TEMP + CONJG( A( L + I, J ) )*X( IX ) IX = IX + INCX 190 CONTINUE END IF X( JX ) = TEMP JX = JX + INCX 200 CONTINUE END IF END IF END IF * RETURN * * End of CTBMV . * END OpenBLAS-0.2.20/reference/ctbsvf.f000066400000000000000000000306201313527062700165410ustar00rootroot00000000000000 SUBROUTINE CTBSVF(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) * .. Scalar Arguments .. INTEGER INCX,K,LDA,N CHARACTER DIAG,TRANS,UPLO * .. * .. Array Arguments .. COMPLEX A(LDA,*),X(*) * .. 
* * Purpose * ======= * * CTBSV solves one of the systems of equations * * A*x = b, or A'*x = b, or conjg( A' )*x = b, * * where b and x are n element vectors and A is an n by n unit, or * non-unit, upper or lower triangular band matrix, with ( k + 1 ) * diagonals. * * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Arguments * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the equations to be solved as * follows: * * TRANS = 'N' or 'n' A*x = b. * * TRANS = 'T' or 't' A'*x = b. * * TRANS = 'C' or 'c' conjg( A' )*x = b. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * K - INTEGER. * On entry with UPLO = 'U' or 'u', K specifies the number of * super-diagonals of the matrix A. * On entry with UPLO = 'L' or 'l', K specifies the number of * sub-diagonals of the matrix A. * K must satisfy 0 .le. K. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) * by n part of the array A must contain the upper triangular * band part of the matrix of coefficients, supplied column by * column, with the leading diagonal of the matrix in row * ( k + 1 ) of the array, the first super-diagonal starting at * position 2 in row k, and so on. The top left k by k triangle * of the array A is not referenced. * The following program segment will transfer an upper * triangular band matrix from conventional full matrix storage * to band storage: * * DO 20, J = 1, N * M = K + 1 - J * DO 10, I = MAX( 1, J - K ), J * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) * by n part of the array A must contain the lower triangular * band part of the matrix of coefficients, supplied column by * column, with the leading diagonal of the matrix in row 1 of * the array, the first sub-diagonal starting at position 1 in * row 2, and so on. The bottom right k by k triangle of the * array A is not referenced. * The following program segment will transfer a lower * triangular band matrix from conventional full matrix storage * to band storage: * * DO 20, J = 1, N * M = 1 - J * DO 10, I = J, MIN( N, J + K ) * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Note that when DIAG = 'U' or 'u' the elements of the array A * corresponding to the diagonal elements of the matrix are not * referenced, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * ( k + 1 ). * Unchanged on exit. * * X - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element right-hand side vector b. On exit, X is overwritten * with the solution vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX ZERO PARAMETER (ZERO= (0.0E+0,0.0E+0)) * .. * .. Local Scalars .. COMPLEX TEMP INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L LOGICAL NOCONJ,NOUNIT * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC CONJG,MAX,MIN * .. * * Test the input parameters. * INFO = 0 IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN INFO = 1 ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + .NOT.LSAME(TRANS,'C')) THEN INFO = 2 ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN INFO = 3 ELSE IF (N.LT.0) THEN INFO = 4 ELSE IF (K.LT.0) THEN INFO = 5 ELSE IF (LDA.LT. (K+1)) THEN INFO = 7 ELSE IF (INCX.EQ.0) THEN INFO = 9 END IF IF (INFO.NE.0) THEN CALL XERBLA('CTBSV ',INFO) RETURN END IF * * Quick return if possible. * IF (N.EQ.0) RETURN * NOCONJ = LSAME(TRANS,'T') NOUNIT = LSAME(DIAG,'N') * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF (INCX.LE.0) THEN KX = 1 - (N-1)*INCX ELSE IF (INCX.NE.1) THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed by sequentially with one pass through A. * IF (LSAME(TRANS,'N')) THEN * * Form x := inv( A )*x. * IF (LSAME(UPLO,'U')) THEN KPLUS1 = K + 1 IF (INCX.EQ.1) THEN DO 20 J = N,1,-1 IF (X(J).NE.ZERO) THEN L = KPLUS1 - J IF (NOUNIT) X(J) = X(J)/A(KPLUS1,J) TEMP = X(J) DO 10 I = J - 1,MAX(1,J-K),-1 X(I) = X(I) - TEMP*A(L+I,J) 10 CONTINUE END IF 20 CONTINUE ELSE KX = KX + (N-1)*INCX JX = KX DO 40 J = N,1,-1 KX = KX - INCX IF (X(JX).NE.ZERO) THEN IX = KX L = KPLUS1 - J IF (NOUNIT) X(JX) = X(JX)/A(KPLUS1,J) TEMP = X(JX) DO 30 I = J - 1,MAX(1,J-K),-1 X(IX) = X(IX) - TEMP*A(L+I,J) IX = IX - INCX 30 CONTINUE END IF JX = JX - INCX 40 CONTINUE END IF ELSE IF (INCX.EQ.1) THEN DO 60 J = 1,N IF (X(J).NE.ZERO) THEN L = 1 - J IF (NOUNIT) X(J) = X(J)/A(1,J) TEMP = X(J) DO 50 I = J + 1,MIN(N,J+K) X(I) = X(I) - TEMP*A(L+I,J) 50 CONTINUE END IF 60 CONTINUE ELSE JX = KX DO 80 J = 1,N KX = KX + INCX IF (X(JX).NE.ZERO) THEN IX = KX L = 1 - J IF (NOUNIT) X(JX) = X(JX)/A(1,J) TEMP = X(JX) DO 70 I = J + 1,MIN(N,J+K) X(IX) = X(IX) - TEMP*A(L+I,J) IX = IX + INCX 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE END IF END IF ELSE * * Form x := inv( A' )*x or x := inv( conjg( A') )*x. 
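*
*        In outline, this branch solves the (conjugate) transposed
*        triangular band system by substitution: for each j the dot
*        product of the stored part of column j of A with the already
*        computed components of x is subtracted from x( j ), which is
*        then divided by the stored diagonal entry unless DIAG = 'U'.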
* IF (LSAME(UPLO,'U')) THEN KPLUS1 = K + 1 IF (INCX.EQ.1) THEN DO 110 J = 1,N TEMP = X(J) L = KPLUS1 - J IF (NOCONJ) THEN DO 90 I = MAX(1,J-K),J - 1 TEMP = TEMP - A(L+I,J)*X(I) 90 CONTINUE IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) ELSE DO 100 I = MAX(1,J-K),J - 1 TEMP = TEMP - CONJG(A(L+I,J))*X(I) 100 CONTINUE IF (NOUNIT) TEMP = TEMP/CONJG(A(KPLUS1,J)) END IF X(J) = TEMP 110 CONTINUE ELSE JX = KX DO 140 J = 1,N TEMP = X(JX) IX = KX L = KPLUS1 - J IF (NOCONJ) THEN DO 120 I = MAX(1,J-K),J - 1 TEMP = TEMP - A(L+I,J)*X(IX) IX = IX + INCX 120 CONTINUE IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) ELSE DO 130 I = MAX(1,J-K),J - 1 TEMP = TEMP - CONJG(A(L+I,J))*X(IX) IX = IX + INCX 130 CONTINUE IF (NOUNIT) TEMP = TEMP/CONJG(A(KPLUS1,J)) END IF X(JX) = TEMP JX = JX + INCX IF (J.GT.K) KX = KX + INCX 140 CONTINUE END IF ELSE IF (INCX.EQ.1) THEN DO 170 J = N,1,-1 TEMP = X(J) L = 1 - J IF (NOCONJ) THEN DO 150 I = MIN(N,J+K),J + 1,-1 TEMP = TEMP - A(L+I,J)*X(I) 150 CONTINUE IF (NOUNIT) TEMP = TEMP/A(1,J) ELSE DO 160 I = MIN(N,J+K),J + 1,-1 TEMP = TEMP - CONJG(A(L+I,J))*X(I) 160 CONTINUE IF (NOUNIT) TEMP = TEMP/CONJG(A(1,J)) END IF X(J) = TEMP 170 CONTINUE ELSE KX = KX + (N-1)*INCX JX = KX DO 200 J = N,1,-1 TEMP = X(JX) IX = KX L = 1 - J IF (NOCONJ) THEN DO 180 I = MIN(N,J+K),J + 1,-1 TEMP = TEMP - A(L+I,J)*X(IX) IX = IX - INCX 180 CONTINUE IF (NOUNIT) TEMP = TEMP/A(1,J) ELSE DO 190 I = MIN(N,J+K),J + 1,-1 TEMP = TEMP - CONJG(A(L+I,J))*X(IX) IX = IX - INCX 190 CONTINUE IF (NOUNIT) TEMP = TEMP/CONJG(A(1,J)) END IF X(JX) = TEMP JX = JX - INCX IF ((N-J).GE.K) KX = KX - INCX 200 CONTINUE END IF END IF END IF * RETURN * * End of CTBSV . * END OpenBLAS-0.2.20/reference/ctpmvf.f000066400000000000000000000306441313527062700165570ustar00rootroot00000000000000 SUBROUTINE CTPMVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, N CHARACTER*1 DIAG, TRANS, UPLO * .. Array Arguments .. COMPLEX AP( * ), X( * ) * .. * * Purpose * ======= * * CTPMV performs one of the matrix-vector operations * * x := A*x, or x := A'*x, or x := conjg( A' )*x, * * where x is an n element vector and A is an n by n unit, or non-unit, * upper or lower triangular matrix, supplied in packed form. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' x := A*x. * * TRANS = 'T' or 't' x := A'*x. * * TRANS = 'C' or 'c' x := conjg( A' )*x. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * AP - COMPLEX array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). * Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular matrix packed sequentially, * column by column, so that AP( 1 ) contains a( 1, 1 ), * AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) * respectively, and so on. 
* Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular matrix packed sequentially, * column by column, so that AP( 1 ) contains a( 1, 1 ), * AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) * respectively, and so on. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced, but are assumed to be unity. * Unchanged on exit. * * X - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the * tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. Local Scalars .. COMPLEX TEMP INTEGER I, INFO, IX, J, JX, K, KK, KX LOGICAL NOCONJ, NOUNIT * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC CONJG * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'R' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( INCX.EQ.0 )THEN INFO = 7 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CTPMVF', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) NOUNIT = LSAME( DIAG , 'N' ) * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of AP are * accessed sequentially with one pass through AP. * IF( LSAME( TRANS, 'N' ).OR.LSAME( TRANS, 'R' ))THEN * * Form x:= A*x. 
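*
*        In outline, with the packed storage described above, column j
*        of the upper triangle occupies AP( kk ), ..., AP( kk + j - 1 )
*        with kk = j*( j - 1 )/2 + 1, so the loops below accumulate
*
*           x( i ) = x( i ) + x( j )*a( i, j )
*
*        column by column, applying CONJG to the stored elements when
*        TRANS = 'R' and taking the diagonal as unity when DIAG = 'U'.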
* IF( LSAME( UPLO, 'U' ) )THEN KK = 1 IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = X( J ) K = KK DO 10, I = 1, J - 1 IF( NOCONJ )THEN X( I ) = X( I ) + TEMP*AP( K ) ELSE X( I ) = X( I ) + TEMP*CONJG(AP( K )) END IF K = K + 1 10 CONTINUE IF( NOCONJ )THEN IF( NOUNIT ) $ X( J ) = X( J )*AP( KK + J - 1 ) ELSE IF( NOUNIT ) $ X( J ) = X( J )*CONJG(AP( KK + J-1)) END IF END IF KK = KK + J 20 CONTINUE ELSE JX = KX DO 40, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX DO 30, K = KK, KK + J - 2 IF( NOCONJ )THEN X( IX ) = X( IX ) + TEMP*AP( K ) ELSE X( IX ) = X( IX ) + TEMP*CONJG(AP(K)) END IF IX = IX + INCX 30 CONTINUE IF( NOCONJ )THEN IF( NOUNIT ) $ X( JX ) = X( JX )*AP( KK + J - 1 ) ELSE IF( NOUNIT ) $ X( JX ) = X( JX )*CONJG(AP( KK + J-1)) END IF END IF JX = JX + INCX KK = KK + J 40 CONTINUE END IF ELSE KK = ( N*( N + 1 ) )/2 IF( INCX.EQ.1 )THEN DO 60, J = N, 1, -1 IF( X( J ).NE.ZERO )THEN TEMP = X( J ) K = KK DO 50, I = N, J + 1, -1 IF( NOCONJ )THEN X( I ) = X( I ) + TEMP*AP( K ) ELSE X( I ) = X( I ) + TEMP*CONJG(AP( K )) END IF K = K - 1 50 CONTINUE IF( NOCONJ )THEN IF( NOUNIT ) $ X( J ) = X( J )*AP( KK - N + J ) ELSE IF( NOUNIT ) $ X( J ) = X( J )*CONJG(AP(KK - N+J)) END IF END IF KK = KK - ( N - J + 1 ) 60 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 80, J = N, 1, -1 IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX DO 70, K = KK, KK - ( N - ( J + 1 ) ), -1 IF( NOCONJ )THEN X( IX ) = X( IX ) + TEMP*AP( K ) ELSE X( IX ) = X( IX ) + TEMP*CONJG(AP(K)) ENDIF IX = IX - INCX 70 CONTINUE IF( NOCONJ )THEN IF( NOUNIT ) $ X( JX ) = X( JX )*AP( KK - N + J ) ELSE IF( NOUNIT ) $ X( JX ) = X( JX )*CONJG(AP(KK-N+J)) ENDIF END IF JX = JX - INCX KK = KK - ( N - J + 1 ) 80 CONTINUE END IF END IF ELSE * * Form x := A'*x or x := conjg( A' )*x. * IF( LSAME( UPLO, 'U' ) )THEN KK = ( N*( N + 1 ) )/2 IF( INCX.EQ.1 )THEN DO 110, J = N, 1, -1 TEMP = X( J ) K = KK - 1 IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*AP( KK ) DO 90, I = J - 1, 1, -1 TEMP = TEMP + AP( K )*X( I ) K = K - 1 90 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*CONJG( AP( KK ) ) DO 100, I = J - 1, 1, -1 TEMP = TEMP + CONJG( AP( K ) )*X( I ) K = K - 1 100 CONTINUE END IF X( J ) = TEMP KK = KK - J 110 CONTINUE ELSE JX = KX + ( N - 1 )*INCX DO 140, J = N, 1, -1 TEMP = X( JX ) IX = JX IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*AP( KK ) DO 120, K = KK - 1, KK - J + 1, -1 IX = IX - INCX TEMP = TEMP + AP( K )*X( IX ) 120 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*CONJG( AP( KK ) ) DO 130, K = KK - 1, KK - J + 1, -1 IX = IX - INCX TEMP = TEMP + CONJG( AP( K ) )*X( IX ) 130 CONTINUE END IF X( JX ) = TEMP JX = JX - INCX KK = KK - J 140 CONTINUE END IF ELSE KK = 1 IF( INCX.EQ.1 )THEN DO 170, J = 1, N TEMP = X( J ) K = KK + 1 IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*AP( KK ) DO 150, I = J + 1, N TEMP = TEMP + AP( K )*X( I ) K = K + 1 150 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*CONJG( AP( KK ) ) DO 160, I = J + 1, N TEMP = TEMP + CONJG( AP( K ) )*X( I ) K = K + 1 160 CONTINUE END IF X( J ) = TEMP KK = KK + ( N - J + 1 ) 170 CONTINUE ELSE JX = KX DO 200, J = 1, N TEMP = X( JX ) IX = JX IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*AP( KK ) DO 180, K = KK + 1, KK + N - J IX = IX + INCX TEMP = TEMP + AP( K )*X( IX ) 180 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*CONJG( AP( KK ) ) DO 190, K = KK + 1, KK + N - J IX = IX + INCX TEMP = TEMP + CONJG( AP( K ) )*X( IX ) 190 CONTINUE END IF X( JX ) = TEMP JX = JX + INCX KK = KK + ( N - J + 1 ) 200 CONTINUE END IF END IF END IF * RETURN * * End of CTPMV . 
* END OpenBLAS-0.2.20/reference/ctpsvf.f000066400000000000000000000310601313527062700165560ustar00rootroot00000000000000 SUBROUTINE CTPSVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, N CHARACTER*1 DIAG, TRANS, UPLO * .. Array Arguments .. COMPLEX AP( * ), X( * ) * .. * * Purpose * ======= * * CTPSV solves one of the systems of equations * * A*x = b, or A'*x = b, or conjg( A' )*x = b, * * where b and x are n element vectors and A is an n by n unit, or * non-unit, upper or lower triangular matrix, supplied in packed form. * * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the equations to be solved as * follows: * * TRANS = 'N' or 'n' A*x = b. * * TRANS = 'T' or 't' A'*x = b. * * TRANS = 'C' or 'c' conjg( A' )*x = b. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * AP - COMPLEX array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). * Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular matrix packed sequentially, * column by column, so that AP( 1 ) contains a( 1, 1 ), * AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) * respectively, and so on. * Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular matrix packed sequentially, * column by column, so that AP( 1 ) contains a( 1, 1 ), * AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) * respectively, and so on. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced, but are assumed to be unity. * Unchanged on exit. * * X - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element right-hand side vector b. On exit, X is overwritten * with the solution vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. Local Scalars .. COMPLEX TEMP INTEGER I, INFO, IX, J, JX, K, KK, KX LOGICAL NOCONJ, NOUNIT * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC CONJG * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'R' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. 
$ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( INCX.EQ.0 )THEN INFO = 7 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CTPSV ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) NOUNIT = LSAME( DIAG , 'N' ) * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of AP are * accessed sequentially with one pass through AP. * IF( LSAME( TRANS, 'N' ) .OR.LSAME( TRANS, 'R' ))THEN * * Form x := inv( A )*x. * IF( LSAME( UPLO, 'U' ) )THEN KK = ( N*( N + 1 ) )/2 IF( INCX.EQ.1 )THEN DO 20, J = N, 1, -1 IF( X( J ).NE.ZERO )THEN IF( NOCONJ )THEN IF( NOUNIT ) $ X( J ) = X( J )/AP( KK ) ELSE IF( NOUNIT ) $ X( J ) = X( J )/CONJG(AP( KK )) END IF TEMP = X( J ) K = KK - 1 DO 10, I = J - 1, 1, -1 IF( NOCONJ )THEN X( I ) = X( I ) - TEMP*AP( K ) ELSE X( I ) = X( I ) - TEMP*CONJG(AP( K )) END IF K = K - 1 10 CONTINUE END IF KK = KK - J 20 CONTINUE ELSE JX = KX + ( N - 1 )*INCX DO 40, J = N, 1, -1 IF( X( JX ).NE.ZERO )THEN IF( NOCONJ )THEN IF( NOUNIT ) $ X( JX ) = X( JX )/AP( KK ) ELSE IF( NOUNIT ) $ X( JX ) = X( JX )/CONJG(AP( KK )) END IF TEMP = X( JX ) IX = JX DO 30, K = KK - 1, KK - J + 1, -1 IX = IX - INCX IF( NOCONJ )THEN X( IX ) = X( IX ) - TEMP*AP( K ) ELSE X( IX ) = X( IX ) - TEMP*CONJG(AP( K )) END IF 30 CONTINUE END IF JX = JX - INCX KK = KK - J 40 CONTINUE END IF ELSE KK = 1 IF( INCX.EQ.1 )THEN DO 60, J = 1, N IF( X( J ).NE.ZERO )THEN IF( NOCONJ )THEN IF( NOUNIT ) $ X( J ) = X( J )/AP( KK ) ELSE IF( NOUNIT ) $ X( J ) = X( J )/CONJG(AP( KK )) END IF TEMP = X( J ) K = KK + 1 DO 50, I = J + 1, N IF( NOCONJ )THEN X( I ) = X( I ) - TEMP*AP( K ) ELSE X( I ) = X( I ) - TEMP*CONJG(AP( K )) END IF K = K + 1 50 CONTINUE END IF KK = KK + ( N - J + 1 ) 60 CONTINUE ELSE JX = KX DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN IF( NOCONJ )THEN IF( NOUNIT ) $ X( JX ) = X( JX )/AP( KK ) ELSE IF( NOUNIT ) $ X( JX ) = X( JX )/CONJG(AP( KK )) END IF TEMP = X( JX ) IX = JX DO 70, K = KK + 1, KK + N - J IX = IX + INCX IF( NOCONJ )THEN X( IX ) = X( IX ) - TEMP*AP( K ) ELSE X( IX ) = X( IX ) - TEMP*CONJG(AP( K )) END IF 70 CONTINUE END IF JX = JX + INCX KK = KK + ( N - J + 1 ) 80 CONTINUE END IF END IF ELSE * * Form x := inv( A' )*x or x := inv( conjg( A' ) )*x. 
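*
*        In outline, this branch solves the (conjugate) transposed
*        packed triangular system by substitution: for each j the
*        inner product of the stored part of column j of AP with the
*        already computed components of x is subtracted from x( j ),
*        which is then divided by the stored diagonal element unless
*        DIAG = 'U'.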
* IF( LSAME( UPLO, 'U' ) )THEN KK = 1 IF( INCX.EQ.1 )THEN DO 110, J = 1, N TEMP = X( J ) K = KK IF( NOCONJ )THEN DO 90, I = 1, J - 1 TEMP = TEMP - AP( K )*X( I ) K = K + 1 90 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/AP( KK + J - 1 ) ELSE DO 100, I = 1, J - 1 TEMP = TEMP - CONJG( AP( K ) )*X( I ) K = K + 1 100 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/CONJG( AP( KK + J - 1 ) ) END IF X( J ) = TEMP KK = KK + J 110 CONTINUE ELSE JX = KX DO 140, J = 1, N TEMP = X( JX ) IX = KX IF( NOCONJ )THEN DO 120, K = KK, KK + J - 2 TEMP = TEMP - AP( K )*X( IX ) IX = IX + INCX 120 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/AP( KK + J - 1 ) ELSE DO 130, K = KK, KK + J - 2 TEMP = TEMP - CONJG( AP( K ) )*X( IX ) IX = IX + INCX 130 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/CONJG( AP( KK + J - 1 ) ) END IF X( JX ) = TEMP JX = JX + INCX KK = KK + J 140 CONTINUE END IF ELSE KK = ( N*( N + 1 ) )/2 IF( INCX.EQ.1 )THEN DO 170, J = N, 1, -1 TEMP = X( J ) K = KK IF( NOCONJ )THEN DO 150, I = N, J + 1, -1 TEMP = TEMP - AP( K )*X( I ) K = K - 1 150 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/AP( KK - N + J ) ELSE DO 160, I = N, J + 1, -1 TEMP = TEMP - CONJG( AP( K ) )*X( I ) K = K - 1 160 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/CONJG( AP( KK - N + J ) ) END IF X( J ) = TEMP KK = KK - ( N - J + 1 ) 170 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 200, J = N, 1, -1 TEMP = X( JX ) IX = KX IF( NOCONJ )THEN DO 180, K = KK, KK - ( N - ( J + 1 ) ), -1 TEMP = TEMP - AP( K )*X( IX ) IX = IX - INCX 180 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/AP( KK - N + J ) ELSE DO 190, K = KK, KK - ( N - ( J + 1 ) ), -1 TEMP = TEMP - CONJG( AP( K ) )*X( IX ) IX = IX - INCX 190 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/CONJG( AP( KK - N + J ) ) END IF X( JX ) = TEMP JX = JX - INCX KK = KK - ( N - J + 1 ) 200 CONTINUE END IF END IF END IF * RETURN * * End of CTPSV . * END OpenBLAS-0.2.20/reference/ctrmmf.f000066400000000000000000000343521313527062700165500ustar00rootroot00000000000000 SUBROUTINE CTRMMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, $ B, LDB ) * .. Scalar Arguments .. CHARACTER*1 SIDE, UPLO, TRANSA, DIAG INTEGER M, N, LDA, LDB COMPLEX ALPHA * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * CTRMM performs one of the matrix-matrix operations * * B := alpha*op( A )*B, or B := alpha*B*op( A ) * * where alpha is a scalar, B is an m by n matrix, A is a unit, or * non-unit, upper or lower triangular matrix and op( A ) is one of * * op( A ) = A or op( A ) = A' or op( A ) = conjg( A' ). * * Parameters * ========== * * SIDE - CHARACTER*1. * On entry, SIDE specifies whether op( A ) multiplies B from * the left or right as follows: * * SIDE = 'L' or 'l' B := alpha*op( A )*B. * * SIDE = 'R' or 'r' B := alpha*B*op( A ). * * Unchanged on exit. * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix A is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANSA - CHARACTER*1. * On entry, TRANSA specifies the form of op( A ) to be used in * the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( A ) = A. * * TRANSA = 'T' or 't' op( A ) = A'. * * TRANSA = 'C' or 'c' op( A ) = conjg( A' ). * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit triangular * as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * M - INTEGER. 
* On entry, M specifies the number of rows of B. M must be at * least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of B. N must be * at least zero. * Unchanged on exit. * * ALPHA - COMPLEX . * On entry, ALPHA specifies the scalar alpha. When alpha is * zero then A is not referenced and B need not be set before * entry. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, k ), where k is m * when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. * Before entry with UPLO = 'U' or 'u', the leading k by k * upper triangular part of the array A must contain the upper * triangular matrix and the strictly lower triangular part of * A is not referenced. * Before entry with UPLO = 'L' or 'l', the leading k by k * lower triangular part of the array A must contain the lower * triangular matrix and the strictly upper triangular part of * A is not referenced. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced either, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When SIDE = 'L' or 'l' then * LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' * then LDA must be at least max( 1, n ). * Unchanged on exit. * * B - COMPLEX array of DIMENSION ( LDB, n ). * Before entry, the leading m by n part of the array B must * contain the matrix B, and on exit is overwritten by the * transformed matrix. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. LDB must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC CONJG, MAX * .. Local Scalars .. LOGICAL LSIDE, NOCONJ, NOUNIT, UPPER INTEGER I, INFO, J, K, NROWA COMPLEX TEMP * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Executable Statements .. * * Test the input parameters. * LSIDE = LSAME( SIDE , 'L' ) IF( LSIDE )THEN NROWA = M ELSE NROWA = N END IF NOCONJ = LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'T' ) NOUNIT = LSAME( DIAG , 'N' ) UPPER = LSAME( UPLO , 'U' ) * INFO = 0 IF( ( .NOT.LSIDE ).AND. $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN INFO = 2 ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. $ ( .NOT.LSAME( TRANSA, 'R' ) ).AND. $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN INFO = 3 ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN INFO = 4 ELSE IF( M .LT.0 )THEN INFO = 5 ELSE IF( N .LT.0 )THEN INFO = 6 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 9 ELSE IF( LDB.LT.MAX( 1, M ) )THEN INFO = 11 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CTRMM ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, M B( I, J ) = ZERO 10 CONTINUE 20 CONTINUE RETURN END IF * * Start the operations. * IF( LSIDE )THEN IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ))THEN * * Form B := alpha*A*B. 
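*           Note: the product is formed one column of B at a time.  In
*           the UPLO = 'U' branch below, A( 1:K-1, K ) updates the rows
*           above B( K, J ) first, and only then is B( K, J ) replaced
*           by its scaled (and, for non-unit A, diagonal-scaled) value.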
* IF( UPPER )THEN DO 50, J = 1, N DO 40, K = 1, M IF( B( K, J ).NE.ZERO )THEN TEMP = ALPHA*B( K, J ) IF (NOCONJ) THEN DO 30, I = 1, K - 1 B( I, J ) = B( I, J ) + TEMP*A( I, K ) 30 CONTINUE IF( NOUNIT ) $ TEMP = TEMP*A( K, K ) B( K, J ) = TEMP ELSE DO 35, I = 1, K - 1 B( I, J ) = B( I, J ) + TEMP*CONJG(A( I, K )) 35 CONTINUE IF( NOUNIT ) $ TEMP = TEMP*CONJG(A( K, K )) B( K, J ) = TEMP ENDIF END IF 40 CONTINUE 50 CONTINUE ELSE DO 80, J = 1, N DO 70 K = M, 1, -1 IF( B( K, J ).NE.ZERO )THEN TEMP = ALPHA*B( K, J ) B( K, J ) = TEMP IF (NOCONJ) THEN IF( NOUNIT ) $ B( K, J ) = B( K, J )*A( K, K ) DO 60, I = K + 1, M B( I, J ) = B( I, J ) + TEMP*A( I, K ) 60 CONTINUE ELSE IF( NOUNIT ) $ B( K, J ) = B( K, J )*CONJG(A( K, K )) DO 65, I = K + 1, M B( I, J ) = B( I, J ) + TEMP*CONJG(A( I, K )) 65 CONTINUE ENDIF END IF 70 CONTINUE 80 CONTINUE END IF ELSE * * Form B := alpha*A'*B or B := alpha*conjg( A' )*B. * IF( UPPER )THEN DO 120, J = 1, N DO 110, I = M, 1, -1 TEMP = B( I, J ) IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*A( I, I ) DO 90, K = 1, I - 1 TEMP = TEMP + A( K, I )*B( K, J ) 90 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*CONJG( A( I, I ) ) DO 100, K = 1, I - 1 TEMP = TEMP + CONJG( A( K, I ) )*B( K, J ) 100 CONTINUE END IF B( I, J ) = ALPHA*TEMP 110 CONTINUE 120 CONTINUE ELSE DO 160, J = 1, N DO 150, I = 1, M TEMP = B( I, J ) IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*A( I, I ) DO 130, K = I + 1, M TEMP = TEMP + A( K, I )*B( K, J ) 130 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*CONJG( A( I, I ) ) DO 140, K = I + 1, M TEMP = TEMP + CONJG( A( K, I ) )*B( K, J ) 140 CONTINUE END IF B( I, J ) = ALPHA*TEMP 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ))THEN * * Form B := alpha*B*A. * IF( UPPER )THEN DO 200, J = N, 1, -1 TEMP = ALPHA IF (NOCONJ) THEN IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) ELSE IF( NOUNIT ) $ TEMP = TEMP*CONJG(A( J, J )) ENDIF DO 170, I = 1, M B( I, J ) = TEMP*B( I, J ) 170 CONTINUE DO 190, K = 1, J - 1 IF( A( K, J ).NE.ZERO )THEN IF (NOCONJ) THEN TEMP = ALPHA*A( K, J ) ELSE TEMP = ALPHA*CONJG(A( K, J )) ENDIF DO 180, I = 1, M B( I, J ) = B( I, J ) + TEMP*B( I, K ) 180 CONTINUE END IF 190 CONTINUE 200 CONTINUE ELSE DO 240, J = 1, N TEMP = ALPHA IF (NOCONJ) THEN IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) ELSE IF( NOUNIT ) $ TEMP = TEMP*CONJG(A( J, J )) ENDIF DO 210, I = 1, M B( I, J ) = TEMP*B( I, J ) 210 CONTINUE DO 230, K = J + 1, N IF( A( K, J ).NE.ZERO )THEN IF (NOCONJ) THEN TEMP = ALPHA*A( K, J ) ELSE TEMP = ALPHA*CONJG(A( K, J )) ENDIF DO 220, I = 1, M B( I, J ) = B( I, J ) + TEMP*B( I, K ) 220 CONTINUE END IF 230 CONTINUE 240 CONTINUE END IF ELSE * * Form B := alpha*B*A' or B := alpha*B*conjg( A' ). 
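*           Note: in the UPLO = 'U' branch below, iteration K first adds
*           alpha*A( J, K ) (conjugated when requested) times the still
*           unmodified column B( :, K ) into every earlier column J < K,
*           and only then rescales B( :, K ) itself.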
* IF( UPPER )THEN DO 280, K = 1, N DO 260, J = 1, K - 1 IF( A( J, K ).NE.ZERO )THEN IF( NOCONJ )THEN TEMP = ALPHA*A( J, K ) ELSE TEMP = ALPHA*CONJG( A( J, K ) ) END IF DO 250, I = 1, M B( I, J ) = B( I, J ) + TEMP*B( I, K ) 250 CONTINUE END IF 260 CONTINUE TEMP = ALPHA IF( NOUNIT )THEN IF( NOCONJ )THEN TEMP = TEMP*A( K, K ) ELSE TEMP = TEMP*CONJG( A( K, K ) ) END IF END IF IF( TEMP.NE.ONE )THEN DO 270, I = 1, M B( I, K ) = TEMP*B( I, K ) 270 CONTINUE END IF 280 CONTINUE ELSE DO 320, K = N, 1, -1 DO 300, J = K + 1, N IF( A( J, K ).NE.ZERO )THEN IF( NOCONJ )THEN TEMP = ALPHA*A( J, K ) ELSE TEMP = ALPHA*CONJG( A( J, K ) ) END IF DO 290, I = 1, M B( I, J ) = B( I, J ) + TEMP*B( I, K ) 290 CONTINUE END IF 300 CONTINUE TEMP = ALPHA IF( NOUNIT )THEN IF( NOCONJ )THEN TEMP = TEMP*A( K, K ) ELSE TEMP = TEMP*CONJG( A( K, K ) ) END IF END IF IF( TEMP.NE.ONE )THEN DO 310, I = 1, M B( I, K ) = TEMP*B( I, K ) 310 CONTINUE END IF 320 CONTINUE END IF END IF END IF * RETURN * * End of CTRMM . * END OpenBLAS-0.2.20/reference/ctrmvf.f000066400000000000000000000271321313527062700165570ustar00rootroot00000000000000 SUBROUTINE CTRMVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, LDA, N CHARACTER*1 DIAG, TRANS, UPLO * .. Array Arguments .. COMPLEX A( LDA, * ), X( * ) * .. * * Purpose * ======= * * CTRMV performs one of the matrix-vector operations * * x := A*x, or x := A'*x, or x := conjg( A' )*x, * * where x is an n element vector and A is an n by n unit, or non-unit, * upper or lower triangular matrix. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' x := A*x. * * TRANS = 'T' or 't' x := A'*x. * * TRANS = 'C' or 'c' x := conjg( A' )*x. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular matrix and the strictly lower triangular part of * A is not referenced. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular matrix and the strictly upper triangular part of * A is not referenced. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced either, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * X - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the * tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. 
* * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. Local Scalars .. COMPLEX TEMP INTEGER I, INFO, IX, J, JX, KX LOGICAL NOCONJ, NOUNIT * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC CONJG, MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'R' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 6 ELSE IF( INCX.EQ.0 )THEN INFO = 8 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CTRMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) NOUNIT = LSAME( DIAG , 'N' ) * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * IF( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ))THEN * * Form x := A*x. * IF( LSAME( UPLO, 'U' ) )THEN IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = X( J ) DO 10, I = 1, J - 1 IF (NOCONJ) THEN X( I ) = X( I ) + TEMP*A( I, J ) ELSE X( I ) = X( I ) + TEMP*CONJG(A( I, J )) ENDIF 10 CONTINUE IF (NOCONJ) THEN IF( NOUNIT ) $ X( J ) = X( J )*A( J, J ) ELSE IF( NOUNIT ) $ X( J ) = X( J )*CONJG(A( J, J )) ENDIF END IF 20 CONTINUE ELSE JX = KX DO 40, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX DO 30, I = 1, J - 1 IF (NOCONJ) THEN X( IX ) = X( IX ) + TEMP*A( I, J ) ELSE X( IX ) = X( IX ) + TEMP*CONJG(A( I, J )) ENDIF IX = IX + INCX 30 CONTINUE IF (NOCONJ) THEN IF( NOUNIT ) $ X( JX ) = X( JX )*A( J, J ) ELSE IF( NOUNIT ) $ X( JX ) = X( JX )*CONJG(A( J, J )) ENDIF END IF JX = JX + INCX 40 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 60, J = N, 1, -1 IF( X( J ).NE.ZERO )THEN TEMP = X( J ) DO 50, I = N, J + 1, -1 IF (NOCONJ) THEN X( I ) = X( I ) + TEMP*A( I, J ) ELSE X( I ) = X( I ) + TEMP*CONJG(A( I, J )) ENDIF 50 CONTINUE IF (NOCONJ) THEN IF( NOUNIT ) $ X( J ) = X( J )*A( J, J ) ELSE IF( NOUNIT ) $ X( J ) = X( J )*CONJG(A( J, J )) ENDIF END IF 60 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 80, J = N, 1, -1 IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX DO 70, I = N, J + 1, -1 IF (NOCONJ) THEN X( IX ) = X( IX ) + TEMP*A( I, J ) ELSE X( IX ) = X( IX ) + TEMP*CONJG(A( I, J )) ENDIF IX = IX - INCX 70 CONTINUE IF (NOCONJ) THEN IF( NOUNIT ) $ X( JX ) = X( JX )*A( J, J ) ELSE IF( NOUNIT ) $ X( JX ) = X( JX )*CONJG(A( J, J )) ENDIF END IF JX = JX - INCX 80 CONTINUE END IF END IF ELSE * * Form x := A'*x or x := conjg( A' )*x. 
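*        Note: in the UPLO = 'U' branch below the loop runs J = N, ..., 1,
*        so each X( J ) is rebuilt from A( 1:J, J ) and X( 1:J ) while
*        the lower-indexed entries of X are still unmodified.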
* IF( LSAME( UPLO, 'U' ) )THEN IF( INCX.EQ.1 )THEN DO 110, J = N, 1, -1 TEMP = X( J ) IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) DO 90, I = J - 1, 1, -1 TEMP = TEMP + A( I, J )*X( I ) 90 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*CONJG( A( J, J ) ) DO 100, I = J - 1, 1, -1 TEMP = TEMP + CONJG( A( I, J ) )*X( I ) 100 CONTINUE END IF X( J ) = TEMP 110 CONTINUE ELSE JX = KX + ( N - 1 )*INCX DO 140, J = N, 1, -1 TEMP = X( JX ) IX = JX IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) DO 120, I = J - 1, 1, -1 IX = IX - INCX TEMP = TEMP + A( I, J )*X( IX ) 120 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*CONJG( A( J, J ) ) DO 130, I = J - 1, 1, -1 IX = IX - INCX TEMP = TEMP + CONJG( A( I, J ) )*X( IX ) 130 CONTINUE END IF X( JX ) = TEMP JX = JX - INCX 140 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 170, J = 1, N TEMP = X( J ) IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) DO 150, I = J + 1, N TEMP = TEMP + A( I, J )*X( I ) 150 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*CONJG( A( J, J ) ) DO 160, I = J + 1, N TEMP = TEMP + CONJG( A( I, J ) )*X( I ) 160 CONTINUE END IF X( J ) = TEMP 170 CONTINUE ELSE JX = KX DO 200, J = 1, N TEMP = X( JX ) IX = JX IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) DO 180, I = J + 1, N IX = IX + INCX TEMP = TEMP + A( I, J )*X( IX ) 180 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*CONJG( A( J, J ) ) DO 190, I = J + 1, N IX = IX + INCX TEMP = TEMP + CONJG( A( I, J ) )*X( IX ) 190 CONTINUE END IF X( JX ) = TEMP JX = JX + INCX 200 CONTINUE END IF END IF END IF * RETURN * * End of CTRMV . * END OpenBLAS-0.2.20/reference/ctrsmf.f000066400000000000000000000364401313527062700165560ustar00rootroot00000000000000 SUBROUTINE CTRSMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, $ B, LDB ) * .. Scalar Arguments .. CHARACTER*1 SIDE, UPLO, TRANSA, DIAG INTEGER M, N, LDA, LDB COMPLEX ALPHA * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * CTRSM solves one of the matrix equations * * op( A )*X = alpha*B, or X*op( A ) = alpha*B, * * where alpha is a scalar, X and B are m by n matrices, A is a unit, or * non-unit, upper or lower triangular matrix and op( A ) is one of * * op( A ) = A or op( A ) = A' or op( A ) = conjg( A' ). * * The matrix X is overwritten on B. * * Parameters * ========== * * SIDE - CHARACTER*1. * On entry, SIDE specifies whether op( A ) appears on the left * or right of X as follows: * * SIDE = 'L' or 'l' op( A )*X = alpha*B. * * SIDE = 'R' or 'r' X*op( A ) = alpha*B. * * Unchanged on exit. * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix A is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANSA - CHARACTER*1. * On entry, TRANSA specifies the form of op( A ) to be used in * the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( A ) = A. * * TRANSA = 'T' or 't' op( A ) = A'. * * TRANSA = 'C' or 'c' op( A ) = conjg( A' ). * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit triangular * as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of B. M must be at * least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of B. N must be * at least zero. * Unchanged on exit. * * ALPHA - COMPLEX . 
* On entry, ALPHA specifies the scalar alpha. When alpha is * zero then A is not referenced and B need not be set before * entry. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, k ), where k is m * when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. * Before entry with UPLO = 'U' or 'u', the leading k by k * upper triangular part of the array A must contain the upper * triangular matrix and the strictly lower triangular part of * A is not referenced. * Before entry with UPLO = 'L' or 'l', the leading k by k * lower triangular part of the array A must contain the lower * triangular matrix and the strictly upper triangular part of * A is not referenced. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced either, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When SIDE = 'L' or 'l' then * LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' * then LDA must be at least max( 1, n ). * Unchanged on exit. * * B - COMPLEX array of DIMENSION ( LDB, n ). * Before entry, the leading m by n part of the array B must * contain the right-hand side matrix B, and on exit is * overwritten by the solution matrix X. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. LDB must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC CONJG, MAX * .. Local Scalars .. LOGICAL LSIDE, NOCONJ, NOUNIT, UPPER INTEGER I, INFO, J, K, NROWA COMPLEX TEMP * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Executable Statements .. * * Test the input parameters. * LSIDE = LSAME( SIDE , 'L' ) IF( LSIDE )THEN NROWA = M ELSE NROWA = N END IF NOCONJ = (LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'T' )) NOUNIT = LSAME( DIAG , 'N' ) UPPER = LSAME( UPLO , 'U' ) * INFO = 0 IF( ( .NOT.LSIDE ).AND. $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN INFO = 2 ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. $ ( .NOT.LSAME( TRANSA, 'R' ) ).AND. $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN INFO = 3 ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN INFO = 4 ELSE IF( M .LT.0 )THEN INFO = 5 ELSE IF( N .LT.0 )THEN INFO = 6 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 9 ELSE IF( LDB.LT.MAX( 1, M ) )THEN INFO = 11 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CTRSM ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, M B( I, J ) = ZERO 10 CONTINUE 20 CONTINUE RETURN END IF * * Start the operations. * IF( LSIDE )THEN IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ))THEN * * Form B := alpha*inv( A )*B. 
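*           Note: each right-hand-side column of B is solved by back
*           substitution.  In the UPLO = 'U' branch below, the column is
*           first scaled by alpha, then for K = M, ..., 1 the entry
*           B( K, J ) is divided by A( K, K ) (conjugated when requested)
*           and eliminated from the rows above it.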
* IF( UPPER )THEN DO 60, J = 1, N IF( ALPHA.NE.ONE )THEN DO 30, I = 1, M B( I, J ) = ALPHA*B( I, J ) 30 CONTINUE END IF DO 50, K = M, 1, -1 IF( B( K, J ).NE.ZERO )THEN IF( NOUNIT ) THEN IF (NOCONJ) THEN B( K, J ) = B( K, J )/A( K, K ) ELSE B( K, J ) = B( K, J )/CONJG(A( K, K )) ENDIF ENDIF IF (NOCONJ) THEN DO 40, I = 1, K - 1 B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) 40 CONTINUE ELSE DO 45, I = 1, K - 1 B( I, J ) = B( I, J ) - B( K, J )*CONJG(A( I, K )) 45 CONTINUE ENDIF ENDIF 50 CONTINUE 60 CONTINUE ELSE DO 100, J = 1, N IF( ALPHA.NE.ONE )THEN DO 70, I = 1, M B( I, J ) = ALPHA*B( I, J ) 70 CONTINUE END IF DO 90 K = 1, M IF (NOCONJ) THEN IF( B( K, J ).NE.ZERO )THEN IF( NOUNIT ) $ B( K, J ) = B( K, J )/A( K, K ) DO 80, I = K + 1, M B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) 80 CONTINUE END IF ELSE IF( B( K, J ).NE.ZERO )THEN IF( NOUNIT ) $ B( K, J ) = B( K, J )/CONJG(A( K, K )) DO 85, I = K + 1, M B( I, J ) = B( I, J ) - B( K, J )*CONJG(A( I, K )) 85 CONTINUE END IF ENDIF 90 CONTINUE 100 CONTINUE END IF ELSE * * Form B := alpha*inv( A' )*B * or B := alpha*inv( conjg( A' ) )*B. * IF( UPPER )THEN DO 140, J = 1, N DO 130, I = 1, M TEMP = ALPHA*B( I, J ) IF( NOCONJ )THEN DO 110, K = 1, I - 1 TEMP = TEMP - A( K, I )*B( K, J ) 110 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( I, I ) ELSE DO 120, K = 1, I - 1 TEMP = TEMP - CONJG( A( K, I ) )*B( K, J ) 120 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/CONJG( A( I, I ) ) END IF B( I, J ) = TEMP 130 CONTINUE 140 CONTINUE ELSE DO 180, J = 1, N DO 170, I = M, 1, -1 TEMP = ALPHA*B( I, J ) IF( NOCONJ )THEN DO 150, K = I + 1, M TEMP = TEMP - A( K, I )*B( K, J ) 150 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( I, I ) ELSE DO 160, K = I + 1, M TEMP = TEMP - CONJG( A( K, I ) )*B( K, J ) 160 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/CONJG( A( I, I ) ) END IF B( I, J ) = TEMP 170 CONTINUE 180 CONTINUE END IF END IF ELSE IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ))THEN * * Form B := alpha*B*inv( A ). * IF( UPPER )THEN DO 230, J = 1, N IF( ALPHA.NE.ONE )THEN DO 190, I = 1, M B( I, J ) = ALPHA*B( I, J ) 190 CONTINUE END IF DO 210, K = 1, J - 1 IF( A( K, J ).NE.ZERO )THEN IF (NOCONJ) THEN DO 200, I = 1, M B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) 200 CONTINUE ELSE DO 205, I = 1, M B( I, J ) = B( I, J ) - CONJG(A( K, J ))*B( I, K ) 205 CONTINUE ENDIF END IF 210 CONTINUE IF( NOUNIT )THEN IF (NOCONJ) THEN TEMP = ONE/A( J, J ) ELSE TEMP = ONE/CONJG(A( J, J )) ENDIF DO 220, I = 1, M B( I, J ) = TEMP*B( I, J ) 220 CONTINUE END IF 230 CONTINUE ELSE DO 280, J = N, 1, -1 IF( ALPHA.NE.ONE )THEN DO 240, I = 1, M B( I, J ) = ALPHA*B( I, J ) 240 CONTINUE END IF DO 260, K = J + 1, N IF( A( K, J ).NE.ZERO )THEN IF (NOCONJ) THEN DO 250, I = 1, M B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) 250 CONTINUE ELSE DO 255, I = 1, M B( I, J ) = B( I, J ) - CONJG(A( K, J ))*B( I, K ) 255 CONTINUE ENDIF END IF 260 CONTINUE IF( NOUNIT )THEN IF (NOCONJ) THEN TEMP = ONE/A( J, J ) ELSE TEMP = ONE/CONJG(A( J, J )) ENDIF DO 270, I = 1, M B( I, J ) = TEMP*B( I, J ) 270 CONTINUE END IF 280 CONTINUE END IF ELSE * * Form B := alpha*B*inv( A' ) * or B := alpha*B*inv( conjg( A' ) ). 
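*           Note: in the UPLO = 'U' branch below the columns of B are
*           processed for K = N, ..., 1.  B( :, K ) is divided by
*           A( K, K ) (conjugated when requested), its multiple by
*           A( J, K ) (again conjugated when requested) is subtracted
*           from each earlier column J < K, and finally the column is
*           scaled by alpha.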
* IF( UPPER )THEN DO 330, K = N, 1, -1 IF( NOUNIT )THEN IF( NOCONJ )THEN TEMP = ONE/A( K, K ) ELSE TEMP = ONE/CONJG( A( K, K ) ) END IF DO 290, I = 1, M B( I, K ) = TEMP*B( I, K ) 290 CONTINUE END IF DO 310, J = 1, K - 1 IF( A( J, K ).NE.ZERO )THEN IF( NOCONJ )THEN TEMP = A( J, K ) ELSE TEMP = CONJG( A( J, K ) ) END IF DO 300, I = 1, M B( I, J ) = B( I, J ) - TEMP*B( I, K ) 300 CONTINUE END IF 310 CONTINUE IF( ALPHA.NE.ONE )THEN DO 320, I = 1, M B( I, K ) = ALPHA*B( I, K ) 320 CONTINUE END IF 330 CONTINUE ELSE DO 380, K = 1, N IF( NOUNIT )THEN IF( NOCONJ )THEN TEMP = ONE/A( K, K ) ELSE TEMP = ONE/CONJG( A( K, K ) ) END IF DO 340, I = 1, M B( I, K ) = TEMP*B( I, K ) 340 CONTINUE END IF DO 360, J = K + 1, N IF( A( J, K ).NE.ZERO )THEN IF( NOCONJ )THEN TEMP = A( J, K ) ELSE TEMP = CONJG( A( J, K ) ) END IF DO 350, I = 1, M B( I, J ) = B( I, J ) - TEMP*B( I, K ) 350 CONTINUE END IF 360 CONTINUE IF( ALPHA.NE.ONE )THEN DO 370, I = 1, M B( I, K ) = ALPHA*B( I, K ) 370 CONTINUE END IF 380 CONTINUE END IF END IF END IF * RETURN * * End of CTRSM . * END OpenBLAS-0.2.20/reference/ctrsvf.f000066400000000000000000000274151313527062700165710ustar00rootroot00000000000000 SUBROUTINE CTRSVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, LDA, N CHARACTER*1 DIAG, TRANS, UPLO * .. Array Arguments .. COMPLEX A( LDA, * ), X( * ) * .. * * Purpose * ======= * * CTRSV solves one of the systems of equations * * A*x = b, or A'*x = b, or conjg( A' )*x = b, * * where b and x are n element vectors and A is an n by n unit, or * non-unit, upper or lower triangular matrix. * * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the equations to be solved as * follows: * * TRANS = 'N' or 'n' A*x = b. * * TRANS = 'T' or 't' A'*x = b. * * TRANS = 'C' or 'c' conjg( A' )*x = b. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular matrix and the strictly lower triangular part of * A is not referenced. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular matrix and the strictly upper triangular part of * A is not referenced. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced either, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * X - COMPLEX array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element right-hand side vector b. 
On exit, X is overwritten * with the solution vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. Local Scalars .. COMPLEX TEMP INTEGER I, INFO, IX, J, JX, KX LOGICAL NOCONJ, NOUNIT * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC CONJG, MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'R' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 6 ELSE IF( INCX.EQ.0 )THEN INFO = 8 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CTRSV ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) NOUNIT = LSAME( DIAG , 'N' ) * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * IF( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ))THEN * * Form x := inv( A )*x. * IF( LSAME( UPLO, 'U' ) )THEN IF( INCX.EQ.1 )THEN DO 20, J = N, 1, -1 IF( X( J ).NE.ZERO )THEN IF (NOCONJ) THEN IF( NOUNIT ) $ X( J ) = X( J )/A( J, J ) TEMP = X( J ) DO 10, I = J - 1, 1, -1 X( I ) = X( I ) - TEMP*A( I, J ) 10 CONTINUE ELSE IF( NOUNIT ) $ X( J ) = X( J )/CONJG(A( J, J )) TEMP = X( J ) DO 15, I = J - 1, 1, -1 X( I ) = X( I ) - TEMP*CONJG(A( I, J )) 15 CONTINUE ENDIF END IF 20 CONTINUE ELSE JX = KX + ( N - 1 )*INCX DO 40, J = N, 1, -1 IF( X( JX ).NE.ZERO )THEN IF (NOCONJ) THEN IF( NOUNIT ) $ X( JX ) = X( JX )/A( J, J ) ELSE IF( NOUNIT ) $ X( JX ) = X( JX )/CONJG(A( J, J )) ENDIF TEMP = X( JX ) IX = JX DO 30, I = J - 1, 1, -1 IX = IX - INCX IF (NOCONJ) THEN X( IX ) = X( IX ) - TEMP*A( I, J ) ELSE X( IX ) = X( IX ) - TEMP*CONJG(A( I, J )) ENDIF 30 CONTINUE END IF JX = JX - INCX 40 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 60, J = 1, N IF( X( J ).NE.ZERO )THEN IF (NOCONJ) THEN IF( NOUNIT ) $ X( J ) = X( J )/A( J, J ) TEMP = X( J ) DO 50, I = J + 1, N X( I ) = X( I ) - TEMP*A( I, J ) 50 CONTINUE ELSE IF( NOUNIT ) $ X( J ) = X( J )/CONJG(A( J, J )) TEMP = X( J ) DO 55, I = J + 1, N X( I ) = X( I ) - TEMP*CONJG(A( I, J )) 55 CONTINUE ENDIF END IF 60 CONTINUE ELSE JX = KX DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN IF (NOCONJ) THEN IF( NOUNIT ) $ X( JX ) = X( JX )/A( J, J ) ELSE IF( NOUNIT ) $ X( JX ) = X( JX )/CONJG(A( J, J )) ENDIF TEMP = X( JX ) IX = JX DO 70, I = J + 1, N IX = IX + INCX IF (NOCONJ) THEN X( IX ) = X( IX ) - TEMP*A( I, J ) ELSE X( IX ) = X( IX ) - TEMP*CONJG(A( I, J )) ENDIF 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE END IF END IF ELSE * * Form x := inv( A' )*x or x := inv( conjg( A' ) )*x. 
* IF( LSAME( UPLO, 'U' ) )THEN IF( INCX.EQ.1 )THEN DO 110, J = 1, N TEMP = X( J ) IF( NOCONJ )THEN DO 90, I = 1, J - 1 TEMP = TEMP - A( I, J )*X( I ) 90 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( J, J ) ELSE DO 100, I = 1, J - 1 TEMP = TEMP - CONJG( A( I, J ) )*X( I ) 100 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/CONJG( A( J, J ) ) END IF X( J ) = TEMP 110 CONTINUE ELSE JX = KX DO 140, J = 1, N IX = KX TEMP = X( JX ) IF( NOCONJ )THEN DO 120, I = 1, J - 1 TEMP = TEMP - A( I, J )*X( IX ) IX = IX + INCX 120 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( J, J ) ELSE DO 130, I = 1, J - 1 TEMP = TEMP - CONJG( A( I, J ) )*X( IX ) IX = IX + INCX 130 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/CONJG( A( J, J ) ) END IF X( JX ) = TEMP JX = JX + INCX 140 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 170, J = N, 1, -1 TEMP = X( J ) IF( NOCONJ )THEN DO 150, I = N, J + 1, -1 TEMP = TEMP - A( I, J )*X( I ) 150 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( J, J ) ELSE DO 160, I = N, J + 1, -1 TEMP = TEMP - CONJG( A( I, J ) )*X( I ) 160 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/CONJG( A( J, J ) ) END IF X( J ) = TEMP 170 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 200, J = N, 1, -1 IX = KX TEMP = X( JX ) IF( NOCONJ )THEN DO 180, I = N, J + 1, -1 TEMP = TEMP - A( I, J )*X( IX ) IX = IX - INCX 180 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( J, J ) ELSE DO 190, I = N, J + 1, -1 TEMP = TEMP - CONJG( A( I, J ) )*X( IX ) IX = IX - INCX 190 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/CONJG( A( J, J ) ) END IF X( JX ) = TEMP JX = JX - INCX 200 CONTINUE END IF END IF END IF * RETURN * * End of CTRSV . * END OpenBLAS-0.2.20/reference/ctrti2f.f000066400000000000000000000101601313527062700166240ustar00rootroot00000000000000 SUBROUTINE CTRTI2F( UPLO, DIAG, N, A, LDA, INFO ) * * -- LAPACK routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER DIAG, UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * CTRTI2 computes the inverse of a complex upper or lower triangular * matrix. * * This is the Level 2 BLAS version of the algorithm. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether the matrix A is upper or lower triangular. * = 'U': Upper triangular * = 'L': Lower triangular * * DIAG (input) CHARACTER*1 * Specifies whether or not the matrix A is unit triangular. * = 'N': Non-unit triangular * = 'U': Unit triangular * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input/output) COMPLEX array, dimension (LDA,N) * On entry, the triangular matrix A. If UPLO = 'U', the * leading n by n upper triangular part of the array A contains * the upper triangular matrix, and the strictly lower * triangular part of A is not referenced. If UPLO = 'L', the * leading n by n lower triangular part of the array A contains * the lower triangular matrix, and the strictly upper * triangular part of A is not referenced. If DIAG = 'U', the * diagonal elements of A are also not referenced and are * assumed to be 1. * * On exit, the (triangular) inverse of the original matrix, in * the same storage format. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * * ===================================================================== * * .. Parameters .. COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL NOUNIT, UPPER INTEGER J COMPLEX AJJ * .. * .. 
External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL CSCAL, CTRMV, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) NOUNIT = LSAME( DIAG, 'N' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CTRTI2', -INFO ) RETURN END IF * IF( UPPER ) THEN * * Compute inverse of upper triangular matrix. * DO 10 J = 1, N IF( NOUNIT ) THEN A( J, J ) = ONE / A( J, J ) AJJ = -A( J, J ) ELSE AJJ = -ONE END IF * * Compute elements 1:j-1 of j-th column. * CALL CTRMV( 'Upper', 'No transpose', DIAG, J-1, A, LDA, $ A( 1, J ), 1 ) CALL CSCAL( J-1, AJJ, A( 1, J ), 1 ) 10 CONTINUE ELSE * * Compute inverse of lower triangular matrix. * DO 20 J = N, 1, -1 IF( NOUNIT ) THEN A( J, J ) = ONE / A( J, J ) AJJ = -A( J, J ) ELSE AJJ = -ONE END IF IF( J.LT.N ) THEN * * Compute elements j+1:n of j-th column. * CALL CTRMV( 'Lower', 'No transpose', DIAG, N-J, $ A( J+1, J+1 ), LDA, A( J+1, J ), 1 ) CALL CSCAL( N-J, AJJ, A( J+1, J ), 1 ) END IF 20 CONTINUE END IF * RETURN * * End of CTRTI2 * END OpenBLAS-0.2.20/reference/ctrtrif.f000066400000000000000000000122531313527062700167310ustar00rootroot00000000000000 SUBROUTINE CTRTRIF( UPLO, DIAG, N, A, LDA, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. CHARACTER DIAG, UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. COMPLEX A( LDA, * ) * .. * * Purpose * ======= * * CTRTRI computes the inverse of a complex upper or lower triangular * matrix A. * * This is the Level 3 BLAS version of the algorithm. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * = 'U': A is upper triangular; * = 'L': A is lower triangular. * * DIAG (input) CHARACTER*1 * = 'N': A is non-unit triangular; * = 'U': A is unit triangular. * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input/output) COMPLEX array, dimension (LDA,N) * On entry, the triangular matrix A. If UPLO = 'U', the * leading N-by-N upper triangular part of the array A contains * the upper triangular matrix, and the strictly lower * triangular part of A is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of the array A contains * the lower triangular matrix, and the strictly upper * triangular part of A is not referenced. If DIAG = 'U', the * diagonal elements of A are also not referenced and are * assumed to be 1. * On exit, the (triangular) inverse of the original matrix, in * the same storage format. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, A(i,i) is exactly zero. The triangular * matrix is singular and its inverse can not be computed. * * ===================================================================== * * .. Parameters .. COMPLEX ONE, ZERO PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. LOGICAL NOUNIT, UPPER INTEGER J, JB, NB, NN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. 
EXTERNAL CTRMM, CTRSM, CTRTI2, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) NOUNIT = LSAME( DIAG, 'N' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CTRTRI', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Check for singularity if non-unit. * IF( NOUNIT ) THEN DO 10 INFO = 1, N IF( A( INFO, INFO ).EQ.ZERO ) $ RETURN 10 CONTINUE INFO = 0 END IF * * Determine the block size for this environment. * NB = 128 IF( NB.LE.1 .OR. NB.GE.N ) THEN * * Use unblocked code * CALL CTRTI2( UPLO, DIAG, N, A, LDA, INFO ) ELSE * * Use blocked code * IF( UPPER ) THEN * * Compute inverse of upper triangular matrix * DO 20 J = 1, N, NB JB = MIN( NB, N-J+1 ) * * Compute rows 1:j-1 of current block column * CALL CTRMM( 'Left', 'Upper', 'No transpose', DIAG, J-1, $ JB, ONE, A, LDA, A( 1, J ), LDA ) CALL CTRSM( 'Right', 'Upper', 'No transpose', DIAG, J-1, $ JB, -ONE, A( J, J ), LDA, A( 1, J ), LDA ) * * Compute inverse of current diagonal block * CALL CTRTI2( 'Upper', DIAG, JB, A( J, J ), LDA, INFO ) 20 CONTINUE ELSE * * Compute inverse of lower triangular matrix * NN = ( ( N-1 ) / NB )*NB + 1 DO 30 J = NN, 1, -NB JB = MIN( NB, N-J+1 ) IF( J+JB.LE.N ) THEN * * Compute rows j+jb:n of current block column * CALL CTRMM( 'Left', 'Lower', 'No transpose', DIAG, $ N-J-JB+1, JB, ONE, A( J+JB, J+JB ), LDA, $ A( J+JB, J ), LDA ) CALL CTRSM( 'Right', 'Lower', 'No transpose', DIAG, $ N-J-JB+1, JB, -ONE, A( J, J ), LDA, $ A( J+JB, J ), LDA ) END IF * * Compute inverse of current diagonal block * CALL CTRTI2( 'Lower', DIAG, JB, A( J, J ), LDA, INFO ) 30 CONTINUE END IF END IF * RETURN * * End of CTRTRI * END OpenBLAS-0.2.20/reference/damaxf.f000066400000000000000000000015311313527062700165110ustar00rootroot00000000000000 REAL*8 function damaxf(n,dx,incx) c c finds the index of element having max. absolute value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c double precision dx(*) integer i,incx,ix,n c damaxf = 0 if( n.lt.1 .or. incx.le.0 ) return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 damaxf = dabs(dx(1)) ix = ix + incx do 10 i = 2,n if(dabs(dx(ix)).le.damaxf) go to 5 damaxf = dabs(dx(ix)) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 damaxf = dabs(dx(1)) do 30 i = 2,n if(dabs(dx(i)).le.damaxf) go to 30 damaxf = dabs(dx(i)) 30 continue return end OpenBLAS-0.2.20/reference/daminf.f000066400000000000000000000015311313527062700165070ustar00rootroot00000000000000 REAL*8 function daminf(n,dx,incx) c c finds the index of element having min. absolute value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c double precision dx(*) integer i,incx,ix,n c daminf = 0 if( n.lt.1 .or. 
incx.le.0 ) return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 daminf = dabs(dx(1)) ix = ix + incx do 10 i = 2,n if(dabs(dx(ix)).ge.daminf) go to 5 daminf = dabs(dx(ix)) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 daminf = dabs(dx(1)) do 30 i = 2,n if(dabs(dx(i)).ge.daminf) go to 30 daminf = dabs(dx(i)) 30 continue return end OpenBLAS-0.2.20/reference/dasumf.f000066400000000000000000000020531313527062700165300ustar00rootroot00000000000000 double precision function dasumf(n,dx,incx) c c takes the sum of the absolute values. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c double precision dx(*),dtemp integer i,incx,m,mp1,n,nincx c dasumf = 0.0d0 dtemp = 0.0d0 if( n.le.0 .or. incx.le.0 )return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c nincx = n*incx do 10 i = 1,nincx,incx dtemp = dtemp + dabs(dx(i)) 10 continue dasumf = dtemp return c c code for increment equal to 1 c c c clean-up loop c 20 m = mod(n,6) if( m .eq. 0 ) go to 40 do 30 i = 1,m dtemp = dtemp + dabs(dx(i)) 30 continue if( n .lt. 6 ) go to 60 40 mp1 = m + 1 do 50 i = mp1,n,6 dtemp = dtemp + dabs(dx(i)) + dabs(dx(i + 1)) + dabs(dx(i + 2)) * + dabs(dx(i + 3)) + dabs(dx(i + 4)) + dabs(dx(i + 5)) 50 continue 60 dasumf = dtemp return end OpenBLAS-0.2.20/reference/daxpyf.f000066400000000000000000000023101313527062700165400ustar00rootroot00000000000000 subroutine daxpyf(n,da,dx,incx,dy,incy) c c constant times a vector plus a vector. c uses unrolled loops for increments equal to one. c jack dongarra, linpack, 3/11/78. c modified 12/3/93, array(1) declarations changed to array(*) c double precision dx(*),dy(*),da integer i,incx,incy,ix,iy,m,mp1,n c if(n.le.0)return if (da .eq. 0.0d0) return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments c not equal to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n dy(iy) = dy(iy) + da*dx(ix) ix = ix + incx iy = iy + incy 10 continue return c c code for both increments equal to 1 c c c clean-up loop c 20 m = mod(n,4) if( m .eq. 0 ) go to 40 do 30 i = 1,m dy(i) = dy(i) + da*dx(i) 30 continue if( n .lt. 4 ) return 40 mp1 = m + 1 do 50 i = mp1,n,4 dy(i) = dy(i) + da*dx(i) dy(i + 1) = dy(i + 1) + da*dx(i + 1) dy(i + 2) = dy(i + 2) + da*dx(i + 2) dy(i + 3) = dy(i + 3) + da*dx(i + 3) 50 continue return end OpenBLAS-0.2.20/reference/dcopyf.f000066400000000000000000000022531313527062700165370ustar00rootroot00000000000000 subroutine dcopyf(n,dx,incx,dy,incy) c c copies a vector, x, to a vector, y. c uses unrolled loops for increments equal to one. c jack dongarra, linpack, 3/11/78. c modified 12/3/93, array(1) declarations changed to array(*) c double precision dx(*),dy(*) integer i,incx,incy,ix,iy,m,mp1,n c if(n.le.0)return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments c not equal to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n dy(iy) = dx(ix) ix = ix + incx iy = iy + incy 10 continue return c c code for both increments equal to 1 c c c clean-up loop c 20 m = mod(n,7) if( m .eq. 0 ) go to 40 do 30 i = 1,m dy(i) = dx(i) 30 continue if( n .lt. 
7 ) return 40 mp1 = m + 1 do 50 i = mp1,n,7 dy(i) = dx(i) dy(i + 1) = dx(i + 1) dy(i + 2) = dx(i + 2) dy(i + 3) = dx(i + 3) dy(i + 4) = dx(i + 4) dy(i + 5) = dx(i + 5) dy(i + 6) = dx(i + 6) 50 continue return end OpenBLAS-0.2.20/reference/ddotf.f000066400000000000000000000023511313527062700163520ustar00rootroot00000000000000 double precision function ddotf(n,dx,incx,dy,incy) c c forms the dot product of two vectors. c uses unrolled loops for increments equal to one. c jack dongarra, linpack, 3/11/78. c modified 12/3/93, array(1) declarations changed to array(*) c double precision dx(*),dy(*),dtemp integer i,incx,incy,ix,iy,m,mp1,n c ddotf = 0.0d0 dtemp = 0.0d0 if(n.le.0)return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments c not equal to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n dtemp = dtemp + dx(ix)*dy(iy) ix = ix + incx iy = iy + incy 10 continue ddotf = dtemp return c c code for both increments equal to 1 c c c clean-up loop c 20 m = mod(n,5) if( m .eq. 0 ) go to 40 do 30 i = 1,m dtemp = dtemp + dx(i)*dy(i) 30 continue if( n .lt. 5 ) go to 60 40 mp1 = m + 1 do 50 i = mp1,n,5 dtemp = dtemp + dx(i)*dy(i) + dx(i + 1)*dy(i + 1) + * dx(i + 2)*dy(i + 2) + dx(i + 3)*dy(i + 3) + dx(i + 4)*dy(i + 4) 50 continue 60 ddotf = dtemp return end OpenBLAS-0.2.20/reference/dgbmvf.f000066400000000000000000000221171313527062700165210ustar00rootroot00000000000000 SUBROUTINE DGBMVF( TRANS, M, N, KL, KU, ALPHA, A, LDA, X, INCX, $ BETA, Y, INCY ) * .. Scalar Arguments .. DOUBLE PRECISION ALPHA, BETA INTEGER INCX, INCY, KL, KU, LDA, M, N CHARACTER*1 TRANS * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * DGBMV performs one of the matrix-vector operations * * y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, * * where alpha and beta are scalars, x and y are vectors and A is an * m by n band matrix, with kl sub-diagonals and ku super-diagonals. * * Parameters * ========== * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' y := alpha*A*x + beta*y. * * TRANS = 'T' or 't' y := alpha*A'*x + beta*y. * * TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix A. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * Unchanged on exit. * * KL - INTEGER. * On entry, KL specifies the number of sub-diagonals of the * matrix A. KL must satisfy 0 .le. KL. * Unchanged on exit. * * KU - INTEGER. * On entry, KU specifies the number of super-diagonals of the * matrix A. KU must satisfy 0 .le. KU. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION. * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). * Before entry, the leading ( kl + ku + 1 ) by n part of the * array A must contain the matrix of coefficients, supplied * column by column, with the leading diagonal of the matrix in * row ( ku + 1 ) of the array, the first super-diagonal * starting at position 2 in row ku, the first sub-diagonal * starting at position 1 in row ( ku + 2 ), and so on. * Elements in the array A that do not correspond to elements * in the band matrix (such as the top left ku by ku triangle) * are not referenced. 
* The following program segment will transfer a band matrix * from conventional full matrix storage to band storage: * * DO 20, J = 1, N * K = KU + 1 - J * DO 10, I = MAX( 1, J - KU ), MIN( M, J + KL ) * A( K + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * ( kl + ku + 1 ). * Unchanged on exit. * * X - DOUBLE PRECISION array of DIMENSION at least * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. * Before entry, the incremented array X must contain the * vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - DOUBLE PRECISION. * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y - DOUBLE PRECISION array of DIMENSION at least * ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. * Before entry, the incremented array Y must contain the * vector y. On exit, Y is overwritten by the updated vector y. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * .. Parameters .. DOUBLE PRECISION ONE , ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. Local Scalars .. DOUBLE PRECISION TEMP INTEGER I, INFO, IX, IY, J, JX, JY, K, KUP1, KX, KY, $ LENX, LENY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 1 ELSE IF( M.LT.0 )THEN INFO = 2 ELSE IF( N.LT.0 )THEN INFO = 3 ELSE IF( KL.LT.0 )THEN INFO = 4 ELSE IF( KU.LT.0 )THEN INFO = 5 ELSE IF( LDA.LT.( KL + KU + 1 ) )THEN INFO = 8 ELSE IF( INCX.EQ.0 )THEN INFO = 10 ELSE IF( INCY.EQ.0 )THEN INFO = 13 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DGBMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set LENX and LENY, the lengths of the vectors x and y, and set * up the start points in X and Y. * IF( LSAME( TRANS, 'N' ) )THEN LENX = N LENY = M ELSE LENX = M LENY = N END IF IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( LENX - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( LENY - 1 )*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the band part of A. * * First form y := beta*y. * IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, LENY Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, LENY Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, LENY Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, LENY Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN KUP1 = KU + 1 IF( LSAME( TRANS, 'N' ) )THEN * * Form y := alpha*A*x + y. 
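*     Band storage note: column J of the matrix holds the rows
*     MAX( 1, J - KU ), ..., MIN( M, J + KL ), and element ( I, J )
*     is found at A( KU + 1 - J + I, J ), which is the offset
*     K = KUP1 - J used in the loops below.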
* JX = KX IF( INCY.EQ.1 )THEN DO 60, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*X( JX ) K = KUP1 - J DO 50, I = MAX( 1, J - KU ), MIN( M, J + KL ) Y( I ) = Y( I ) + TEMP*A( K + I, J ) 50 CONTINUE END IF JX = JX + INCX 60 CONTINUE ELSE DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*X( JX ) IY = KY K = KUP1 - J DO 70, I = MAX( 1, J - KU ), MIN( M, J + KL ) Y( IY ) = Y( IY ) + TEMP*A( K + I, J ) IY = IY + INCY 70 CONTINUE END IF JX = JX + INCX IF( J.GT.KU ) $ KY = KY + INCY 80 CONTINUE END IF ELSE * * Form y := alpha*A'*x + y. * JY = KY IF( INCX.EQ.1 )THEN DO 100, J = 1, N TEMP = ZERO K = KUP1 - J DO 90, I = MAX( 1, J - KU ), MIN( M, J + KL ) TEMP = TEMP + A( K + I, J )*X( I ) 90 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP JY = JY + INCY 100 CONTINUE ELSE DO 120, J = 1, N TEMP = ZERO IX = KX K = KUP1 - J DO 110, I = MAX( 1, J - KU ), MIN( M, J + KL ) TEMP = TEMP + A( K + I, J )*X( IX ) IX = IX + INCX 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP JY = JY + INCY IF( J.GT.KU ) $ KX = KX + INCX 120 CONTINUE END IF END IF * RETURN * * End of DGBMV . * END OpenBLAS-0.2.20/reference/dgemmf.f000066400000000000000000000226451313527062700165210ustar00rootroot00000000000000 SUBROUTINE DGEMMF(TRANA,TRANB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) * .. Scalar Arguments .. DOUBLE PRECISION ALPHA,BETA INTEGER K,LDA,LDB,LDC,M,N CHARACTER TRANA,TRANB * .. * .. Array Arguments .. DOUBLE PRECISION A(LDA,*),B(LDB,*),C(LDC,*) * .. * * Purpose * ======= * * DGEMM performs one of the matrix-matrix operations * * C := alpha*op( A )*op( B ) + beta*C, * * where op( X ) is one of * * op( X ) = X or op( X ) = X', * * alpha and beta are scalars, and A, B and C are matrices, with op( A ) * an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. * * Arguments * ========== * * TRANA - CHARACTER*1. * On entry, TRANA specifies the form of op( A ) to be used in * the matrix multiplication as follows: * * TRANA = 'N' or 'n', op( A ) = A. * * TRANA = 'T' or 't', op( A ) = A'. * * TRANA = 'C' or 'c', op( A ) = A'. * * Unchanged on exit. * * TRANB - CHARACTER*1. * On entry, TRANB specifies the form of op( B ) to be used in * the matrix multiplication as follows: * * TRANB = 'N' or 'n', op( B ) = B. * * TRANB = 'T' or 't', op( B ) = B'. * * TRANB = 'C' or 'c', op( B ) = B'. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix * op( A ) and of the matrix C. M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix * op( B ) and the number of columns of the matrix C. N must be * at least zero. * Unchanged on exit. * * K - INTEGER. * On entry, K specifies the number of columns of the matrix * op( A ) and the number of rows of the matrix op( B ). K must * be at least zero. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION. * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is * k when TRANA = 'N' or 'n', and is m otherwise. * Before entry with TRANA = 'N' or 'n', the leading m by k * part of the array A must contain the matrix A, otherwise * the leading k by m part of the array A must contain the * matrix A. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When TRANA = 'N' or 'n' then * LDA must be at least max( 1, m ), otherwise LDA must be at * least max( 1, k ). * Unchanged on exit. 
* * B - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is * n when TRANB = 'N' or 'n', and is k otherwise. * Before entry with TRANB = 'N' or 'n', the leading k by n * part of the array B must contain the matrix B, otherwise * the leading n by k part of the array B must contain the * matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. When TRANB = 'N' or 'n' then * LDB must be at least max( 1, k ), otherwise LDB must be at * least max( 1, n ). * Unchanged on exit. * * BETA - DOUBLE PRECISION. * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then C need not be set on input. * Unchanged on exit. * * C - DOUBLE PRECISION array of DIMENSION ( LDC, n ). * Before entry, the leading m by n part of the array C must * contain the matrix C, except when beta is zero, in which * case C need not be set on entry. * On exit, the array C is overwritten by the m by n matrix * ( alpha*op( A )*op( B ) + beta*C ). * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Local Scalars .. DOUBLE PRECISION TEMP INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB LOGICAL NOTA,NOTB * .. * .. Parameters .. DOUBLE PRECISION ONE,ZERO PARAMETER (ONE=1.0D+0,ZERO=0.0D+0) * .. * * Set NOTA and NOTB as true if A and B respectively are not * transposed and set NROWA, NCOLA and NROWB as the number of rows * and columns of A and the number of rows of B respectively. * NOTA = LSAME(TRANA,'N') NOTB = LSAME(TRANB,'N') IF (NOTA) THEN NROWA = M NCOLA = K ELSE NROWA = K NCOLA = M END IF IF (NOTB) THEN NROWB = K ELSE NROWB = N END IF * * Test the input parameters. * INFO = 0 IF ((.NOT.NOTA) .AND. (.NOT.LSAME(TRANA,'C')) .AND. + (.NOT.LSAME(TRANA,'T'))) THEN INFO = 1 ELSE IF ((.NOT.NOTB) .AND. (.NOT.LSAME(TRANB,'C')) .AND. + (.NOT.LSAME(TRANB,'T'))) THEN INFO = 2 ELSE IF (M.LT.0) THEN INFO = 3 ELSE IF (N.LT.0) THEN INFO = 4 ELSE IF (K.LT.0) THEN INFO = 5 ELSE IF (LDA.LT.MAX(1,NROWA)) THEN INFO = 8 ELSE IF (LDB.LT.MAX(1,NROWB)) THEN INFO = 10 ELSE IF (LDC.LT.MAX(1,M)) THEN INFO = 13 END IF IF (INFO.NE.0) THEN CALL XERBLA('DGEMM ',INFO) RETURN END IF * * Quick return if possible. * IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN * * And if alpha.eq.zero. * IF (ALPHA.EQ.ZERO) THEN IF (BETA.EQ.ZERO) THEN DO 20 J = 1,N DO 10 I = 1,M C(I,J) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1,N DO 30 I = 1,M C(I,J) = BETA*C(I,J) 30 CONTINUE 40 CONTINUE END IF RETURN END IF * * Start the operations. * IF (NOTB) THEN IF (NOTA) THEN * * Form C := alpha*A*B + beta*C. 
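*           The update is organised by columns: column j of C is
*           first scaled by beta and then accumulates the columns
*           of A, each scaled by alpha*B( l, j ).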
* DO 90 J = 1,N IF (BETA.EQ.ZERO) THEN DO 50 I = 1,M C(I,J) = ZERO 50 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 60 I = 1,M C(I,J) = BETA*C(I,J) 60 CONTINUE END IF DO 80 L = 1,K IF (B(L,J).NE.ZERO) THEN TEMP = ALPHA*B(L,J) DO 70 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 70 CONTINUE END IF 80 CONTINUE 90 CONTINUE ELSE * * Form C := alpha*A'*B + beta*C * DO 120 J = 1,N DO 110 I = 1,M TEMP = ZERO DO 100 L = 1,K TEMP = TEMP + A(L,I)*B(L,J) 100 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 110 CONTINUE 120 CONTINUE END IF ELSE IF (NOTA) THEN * * Form C := alpha*A*B' + beta*C * DO 170 J = 1,N IF (BETA.EQ.ZERO) THEN DO 130 I = 1,M C(I,J) = ZERO 130 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 140 I = 1,M C(I,J) = BETA*C(I,J) 140 CONTINUE END IF DO 160 L = 1,K IF (B(J,L).NE.ZERO) THEN TEMP = ALPHA*B(J,L) DO 150 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 150 CONTINUE END IF 160 CONTINUE 170 CONTINUE ELSE * * Form C := alpha*A'*B' + beta*C * DO 200 J = 1,N DO 190 I = 1,M TEMP = ZERO DO 180 L = 1,K TEMP = TEMP + A(L,I)*B(J,L) 180 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 190 CONTINUE 200 CONTINUE END IF END IF * RETURN * * End of DGEMM . * END OpenBLAS-0.2.20/reference/dgemvf.f000066400000000000000000000162741313527062700165330ustar00rootroot00000000000000 SUBROUTINE DGEMVF ( TRANS, M, N, ALPHA, A, LDA, X, INCX, $ BETA, Y, INCY ) * .. Scalar Arguments .. DOUBLE PRECISION ALPHA, BETA INTEGER INCX, INCY, LDA, M, N CHARACTER*1 TRANS * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * DGEMV performs one of the matrix-vector operations * * y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, * * where alpha and beta are scalars, x and y are vectors and A is an * m by n matrix. * * Parameters * ========== * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' y := alpha*A*x + beta*y. * * TRANS = 'T' or 't' y := alpha*A'*x + beta*y. * * TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix A. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION. * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). * Before entry, the leading m by n part of the array A must * contain the matrix of coefficients. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, m ). * Unchanged on exit. * * X - DOUBLE PRECISION array of DIMENSION at least * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. * Before entry, the incremented array X must contain the * vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - DOUBLE PRECISION. * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y - DOUBLE PRECISION array of DIMENSION at least * ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. 
* Before entry with BETA non-zero, the incremented array Y * must contain the vector y. On exit, Y is overwritten by the * updated vector y. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. DOUBLE PRECISION ONE , ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. Local Scalars .. DOUBLE PRECISION TEMP INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY, LENX, LENY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 1 ELSE IF( M.LT.0 )THEN INFO = 2 ELSE IF( N.LT.0 )THEN INFO = 3 ELSE IF( LDA.LT.MAX( 1, M ) )THEN INFO = 6 ELSE IF( INCX.EQ.0 )THEN INFO = 8 ELSE IF( INCY.EQ.0 )THEN INFO = 11 END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set LENX and LENY, the lengths of the vectors x and y, and set * up the start points in X and Y. * IF( LSAME( TRANS, 'N' ) )THEN LENX = N LENY = M ELSE LENX = M LENY = N END IF IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( LENX - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( LENY - 1 )*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * * First form y := beta*y. * IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, LENY Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, LENY Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, LENY Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, LENY Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN IF( LSAME( TRANS, 'N' ) )THEN * * Form y := alpha*A*x + y. * JX = KX IF( INCY.EQ.1 )THEN DO 60, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*X( JX ) DO 50, I = 1, M Y( I ) = Y( I ) + TEMP*A( I, J ) 50 CONTINUE END IF JX = JX + INCX 60 CONTINUE ELSE DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*X( JX ) IY = KY DO 70, I = 1, M Y( IY ) = Y( IY ) + TEMP*A( I, J ) IY = IY + INCY 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE END IF ELSE * * Form y := alpha*A'*x + y. * JY = KY IF( INCX.EQ.1 )THEN DO 100, J = 1, N TEMP = ZERO DO 90, I = 1, M TEMP = TEMP + A( I, J )*X( I ) 90 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP JY = JY + INCY 100 CONTINUE ELSE DO 120, J = 1, N TEMP = ZERO IX = KX DO 110, I = 1, M TEMP = TEMP + A( I, J )*X( IX ) IX = IX + INCX 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP JY = JY + INCY 120 CONTINUE END IF END IF * RETURN * * End of DGEMV . * END OpenBLAS-0.2.20/reference/dgerf.f000066400000000000000000000104201313527062700163350ustar00rootroot00000000000000 SUBROUTINE DGERF ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA ) * .. Scalar Arguments .. DOUBLE PRECISION ALPHA INTEGER INCX, INCY, LDA, M, N * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * DGER performs the rank 1 operation * * A := alpha*x*y' + A, * * where alpha is a scalar, x is an m element vector, y is an n element * vector and A is an m by n matrix. 
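*
*  For example, with INCX = INCY = 1 a call of the form
*
*     CALL DGER( M, N, ALPHA, X, 1, Y, 1, A, LDA )
*
*  adds alpha times the outer product x*y' to the m by n matrix A.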
* * Parameters * ========== * * M - INTEGER. * On entry, M specifies the number of rows of the matrix A. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION. * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - DOUBLE PRECISION array of dimension at least * ( 1 + ( m - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the m * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. * Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). * Before entry, the leading m by n part of the array A must * contain the matrix of coefficients. On exit, A is * overwritten by the updated matrix. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, m ). * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. Local Scalars .. DOUBLE PRECISION TEMP INTEGER I, INFO, IX, J, JY, KX * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( M.LT.0 )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( INCY.EQ.0 )THEN INFO = 7 ELSE IF( LDA.LT.MAX( 1, M ) )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DGER ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * IF( INCY.GT.0 )THEN JY = 1 ELSE JY = 1 - ( N - 1 )*INCY END IF IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( Y( JY ).NE.ZERO )THEN TEMP = ALPHA*Y( JY ) DO 10, I = 1, M A( I, J ) = A( I, J ) + X( I )*TEMP 10 CONTINUE END IF JY = JY + INCY 20 CONTINUE ELSE IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( M - 1 )*INCX END IF DO 40, J = 1, N IF( Y( JY ).NE.ZERO )THEN TEMP = ALPHA*Y( JY ) IX = KX DO 30, I = 1, M A( I, J ) = A( I, J ) + X( IX )*TEMP IX = IX + INCX 30 CONTINUE END IF JY = JY + INCY 40 CONTINUE END IF * RETURN * * End of DGER . * END OpenBLAS-0.2.20/reference/dgesvf.f000066400000000000000000000064111313527062700165310ustar00rootroot00000000000000 SUBROUTINE DGESVF( N, NRHS, A, LDA, IPIV, B, LDB, INFO ) * * -- LAPACK driver routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER INFO, LDA, LDB, N, NRHS * .. * .. Array Arguments .. INTEGER IPIV( * ) DOUBLE PRECISION A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * DGESV computes the solution to a real system of linear equations * A * X = B, * where A is an N-by-N matrix and X and B are N-by-NRHS matrices. 
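*
*  For example, a system with a single right hand side stored in a
*  vector B of length N could be solved by
*
*     CALL DGESV( N, 1, A, LDA, IPIV, B, N, INFO )
*
*  after which, if INFO = 0, B holds the solution and A holds the
*  LU factors described below.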
* * The LU decomposition with partial pivoting and row interchanges is * used to factor A as * A = P * L * U, * where P is a permutation matrix, L is unit lower triangular, and U is * upper triangular. The factored form of A is then used to solve the * system of equations A * X = B. * * Arguments * ========= * * N (input) INTEGER * The number of linear equations, i.e., the order of the * matrix A. N >= 0. * * NRHS (input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrix B. NRHS >= 0. * * A (input/output) DOUBLE PRECISION array, dimension (LDA,N) * On entry, the N-by-N coefficient matrix A. * On exit, the factors L and U from the factorization * A = P*L*U; the unit diagonal elements of L are not stored. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * IPIV (output) INTEGER array, dimension (N) * The pivot indices that define the permutation matrix P; * row i of the matrix was interchanged with row IPIV(i). * * B (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS) * On entry, the N-by-NRHS matrix of right hand side matrix B. * On exit, if INFO = 0, the N-by-NRHS solution matrix X. * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, U(i,i) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, so the solution could not be computed. * * ===================================================================== * * .. External Subroutines .. EXTERNAL DGETRF, DGETRS, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( N.LT.0 ) THEN INFO = -1 ELSE IF( NRHS.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGESV ', -INFO ) RETURN END IF * * Compute the LU factorization of A. * CALL DGETRF( N, N, A, LDA, IPIV, INFO ) IF( INFO.EQ.0 ) THEN * * Solve the system A*X = B, overwriting B with X. * CALL DGETRS( 'No transpose', N, NRHS, A, LDA, IPIV, B, LDB, $ INFO ) END IF RETURN * * End of DGESV * END OpenBLAS-0.2.20/reference/dgetf2f.f000066400000000000000000000073431313527062700166010ustar00rootroot00000000000000 SUBROUTINE DGETF2F( M, N, A, LDA, IPIV, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * June 30, 1992 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N * .. * .. Array Arguments .. INTEGER IPIV( * ) DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * DGETF2 computes an LU factorization of a general m-by-n matrix A * using partial pivoting with row interchanges. * * The factorization has the form * A = P * L * U * where P is a permutation matrix, L is lower triangular with unit * diagonal elements (lower trapezoidal if m > n), and U is upper * triangular (upper trapezoidal if m < n). * * This is the right-looking Level 2 BLAS version of the algorithm. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (input/output) DOUBLE PRECISION array, dimension (LDA,N) * On entry, the m by n matrix to be factored. 
* On exit, the factors L and U from the factorization * A = P*L*U; the unit diagonal elements of L are not stored. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,M). * * IPIV (output) INTEGER array, dimension (min(M,N)) * The pivot indices; for 1 <= i <= min(M,N), row i of the * matrix was interchanged with row IPIV(i). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * > 0: if INFO = k, U(k,k) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER J, JP * .. * .. External Functions .. INTEGER IDAMAX EXTERNAL IDAMAX * .. * .. External Subroutines .. EXTERNAL DGER, DSCAL, DSWAP, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGETF2', -INFO ) RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * DO 10 J = 1, MIN( M, N ) * * Find pivot and test for singularity. * JP = J - 1 + IDAMAX( M-J+1, A( J, J ), 1 ) IPIV( J ) = JP IF( A( JP, J ).NE.ZERO ) THEN * * Apply the interchange to columns 1:N. * IF( JP.NE.J ) $ CALL DSWAP( N, A( J, 1 ), LDA, A( JP, 1 ), LDA ) * * Compute elements J+1:M of J-th column. * IF( J.LT.M ) $ CALL DSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) * ELSE IF( INFO.EQ.0 ) THEN * INFO = J END IF * IF( J.LT.MIN( M, N ) ) THEN * * Update trailing submatrix. * CALL DGER( M-J, N-J, -ONE, A( J+1, J ), 1, A( J, J+1 ), LDA, $ A( J+1, J+1 ), LDA ) END IF 10 CONTINUE RETURN * * End of DGETF2 * END OpenBLAS-0.2.20/reference/dgetrff.f000066400000000000000000000107611313527062700166770ustar00rootroot00000000000000 SUBROUTINE DGETRFF( M, N, A, LDA, IPIV, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * March 31, 1993 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N * .. * .. Array Arguments .. INTEGER IPIV( * ) DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * DGETRF computes an LU factorization of a general M-by-N matrix A * using partial pivoting with row interchanges. * * The factorization has the form * A = P * L * U * where P is a permutation matrix, L is lower triangular with unit * diagonal elements (lower trapezoidal if m > n), and U is upper * triangular (upper trapezoidal if m < n). * * This is the right-looking Level 3 BLAS version of the algorithm. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (input/output) DOUBLE PRECISION array, dimension (LDA,N) * On entry, the M-by-N matrix to be factored. * On exit, the factors L and U from the factorization * A = P*L*U; the unit diagonal elements of L are not stored. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,M). * * IPIV (output) INTEGER array, dimension (min(M,N)) * The pivot indices; for 1 <= i <= min(M,N), row i of the * matrix was interchanged with row IPIV(i). 
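*          For example, IPIV(1) = 3 records that rows 1 and 3 of
*          the matrix were interchanged at the first elimination
*          step.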
* * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, U(i,i) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. INTEGER I, IINFO, J, JB, NB * .. * .. External Subroutines .. EXTERNAL DGEMM, DGETF2, DLASWP, DTRSM, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGETRF', -INFO ) RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Determine the block size for this environment. * NB = 64 IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN * * Use unblocked code. * CALL DGETF2( M, N, A, LDA, IPIV, INFO ) ELSE * * Use blocked code. * DO 20 J = 1, MIN( M, N ), NB JB = MIN( MIN( M, N )-J+1, NB ) * * Factor diagonal and subdiagonal blocks and test for exact * singularity. * CALL DGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) * * Adjust INFO and the pivot indices. * IF( INFO.EQ.0 .AND. IINFO.GT.0 ) $ INFO = IINFO + J - 1 DO 10 I = J, MIN( M, J+JB-1 ) IPIV( I ) = J - 1 + IPIV( I ) 10 CONTINUE * * Apply interchanges to columns 1:J-1. * CALL DLASWP( J-1, A, LDA, J, J+JB-1, IPIV, 1 ) * IF( J+JB.LE.N ) THEN * * Apply interchanges to columns J+JB:N. * CALL DLASWP( N-J-JB+1, A( 1, J+JB ), LDA, J, J+JB-1, $ IPIV, 1 ) * * Compute block row of U. * CALL DTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, $ N-J-JB+1, ONE, A( J, J ), LDA, A( J, J+JB ), $ LDA ) IF( J+JB.LE.M ) THEN * * Update trailing submatrix. * CALL DGEMM( 'No transpose', 'No transpose', M-J-JB+1, $ N-J-JB+1, JB, -ONE, A( J+JB, J ), LDA, $ A( J, J+JB ), LDA, ONE, A( J+JB, J+JB ), $ LDA ) END IF END IF 20 CONTINUE END IF RETURN * * End of DGETRF * END OpenBLAS-0.2.20/reference/dgetrsf.f000066400000000000000000000101631313527062700167100ustar00rootroot00000000000000 SUBROUTINE DGETRSF( TRANS, N, NRHS, A, LDA, IPIV, B, LDB, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * March 31, 1993 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER INFO, LDA, LDB, N, NRHS * .. * .. Array Arguments .. INTEGER IPIV( * ) DOUBLE PRECISION A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * DGETRS solves a system of linear equations * A * X = B or A' * X = B * with a general N-by-N matrix A using the LU factorization computed * by DGETRF. * * Arguments * ========= * * TRANS (input) CHARACTER*1 * Specifies the form of the system of equations: * = 'N': A * X = B (No transpose) * = 'T': A'* X = B (Transpose) * = 'C': A'* X = B (Conjugate transpose = Transpose) * * N (input) INTEGER * The order of the matrix A. N >= 0. * * NRHS (input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrix B. NRHS >= 0. * * A (input) DOUBLE PRECISION array, dimension (LDA,N) * The factors L and U from the factorization A = P*L*U * as computed by DGETRF. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). 
* * IPIV (input) INTEGER array, dimension (N) * The pivot indices from DGETRF; for 1<=i<=N, row i of the * matrix was interchanged with row IPIV(i). * * B (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS) * On entry, the right hand side matrix B. * On exit, the solution matrix X. * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL NOTRAN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL DLASWP, DTRSM, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 NOTRAN = LSAME( TRANS, 'N' ) IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. $ LSAME( TRANS, 'C' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( NRHS.LT.0 ) THEN INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGETRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. NRHS.EQ.0 ) $ RETURN * IF( NOTRAN ) THEN * * Solve A * X = B. * * Apply row interchanges to the right hand sides. * CALL DLASWP( NRHS, B, LDB, 1, N, IPIV, 1 ) * * Solve L*X = B, overwriting B with X. * CALL DTRSM( 'Left', 'Lower', 'No transpose', 'Unit', N, NRHS, $ ONE, A, LDA, B, LDB ) * * Solve U*X = B, overwriting B with X. * CALL DTRSM( 'Left', 'Upper', 'No transpose', 'Non-unit', N, $ NRHS, ONE, A, LDA, B, LDB ) ELSE * * Solve A' * X = B. * * Solve U'*X = B, overwriting B with X. * CALL DTRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', N, NRHS, $ ONE, A, LDA, B, LDB ) * * Solve L'*X = B, overwriting B with X. * CALL DTRSM( 'Left', 'Lower', 'Transpose', 'Unit', N, NRHS, ONE, $ A, LDA, B, LDB ) * * Apply row interchanges to the solution vectors. * CALL DLASWP( NRHS, B, LDB, 1, N, IPIV, -1 ) END IF * RETURN * * End of DGETRS * END OpenBLAS-0.2.20/reference/dlaswpf.f000066400000000000000000000063671313527062700167250ustar00rootroot00000000000000 SUBROUTINE DLASWPF( N, A, LDA, K1, K2, IPIV, INCX ) * * -- LAPACK auxiliary routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * June 30, 1999 * * .. Scalar Arguments .. INTEGER INCX, K1, K2, LDA, N * .. * .. Array Arguments .. INTEGER IPIV( * ) DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * DLASWP performs a series of row interchanges on the matrix A. * One row interchange is initiated for each of rows K1 through K2 of A. * * Arguments * ========= * * N (input) INTEGER * The number of columns of the matrix A. * * A (input/output) DOUBLE PRECISION array, dimension (LDA,N) * On entry, the matrix of column dimension N to which the row * interchanges will be applied. * On exit, the permuted matrix. * * LDA (input) INTEGER * The leading dimension of the array A. * * K1 (input) INTEGER * The first element of IPIV for which a row interchange will * be done. * * K2 (input) INTEGER * The last element of IPIV for which a row interchange will * be done. * * IPIV (input) INTEGER array, dimension (M*abs(INCX)) * The vector of pivot indices. Only the elements in positions * K1 through K2 of IPIV are accessed. 
* IPIV(K) = L implies rows K and L are to be interchanged. * * INCX (input) INTEGER * The increment between successive values of IPIV. If IPIV * is negative, the pivots are applied in reverse order. * * Further Details * =============== * * Modified by * R. C. Whaley, Computer Science Dept., Univ. of Tenn., Knoxville, USA * * ===================================================================== * * .. Local Scalars .. INTEGER I, I1, I2, INC, IP, IX, IX0, J, K, N32 DOUBLE PRECISION TEMP * .. * .. Executable Statements .. * * Interchange row I with row IPIV(I) for each of rows K1 through K2. * IF( INCX.GT.0 ) THEN IX0 = K1 I1 = K1 I2 = K2 INC = 1 ELSE IF( INCX.LT.0 ) THEN IX0 = 1 + ( 1-K2 )*INCX I1 = K2 I2 = K1 INC = -1 ELSE RETURN END IF * N32 = ( N / 32 )*32 IF( N32.NE.0 ) THEN DO 30 J = 1, N32, 32 IX = IX0 DO 20 I = I1, I2, INC IP = IPIV( IX ) IF( IP.NE.I ) THEN DO 10 K = J, J + 31 TEMP = A( I, K ) A( I, K ) = A( IP, K ) A( IP, K ) = TEMP 10 CONTINUE END IF IX = IX + INCX 20 CONTINUE 30 CONTINUE END IF IF( N32.NE.N ) THEN N32 = N32 + 1 IX = IX0 DO 50 I = I1, I2, INC IP = IPIV( IX ) IF( IP.NE.I ) THEN DO 40 K = N32, N TEMP = A( I, K ) A( I, K ) = A( IP, K ) A( IP, K ) = TEMP 40 CONTINUE END IF IX = IX + INCX 50 CONTINUE END IF * RETURN * * End of DLASWP * END OpenBLAS-0.2.20/reference/dlauu2f.f000066400000000000000000000073771313527062700166310ustar00rootroot00000000000000 SUBROUTINE DLAUU2F( UPLO, N, A, LDA, INFO ) * * -- LAPACK auxiliary routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * DLAUU2 computes the product U * U' or L' * L, where the triangular * factor U or L is stored in the upper or lower triangular part of * the array A. * * If UPLO = 'U' or 'u' then the upper triangle of the result is stored, * overwriting the factor U in A. * If UPLO = 'L' or 'l' then the lower triangle of the result is stored, * overwriting the factor L in A. * * This is the unblocked form of the algorithm, calling Level 2 BLAS. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether the triangular factor stored in the array A * is upper or lower triangular: * = 'U': Upper triangular * = 'L': Lower triangular * * N (input) INTEGER * The order of the triangular factor U or L. N >= 0. * * A (input/output) DOUBLE PRECISION array, dimension (LDA,N) * On entry, the triangular factor U or L. * On exit, if UPLO = 'U', the upper triangle of A is * overwritten with the upper triangle of the product U * U'; * if UPLO = 'L', the lower triangle of A is overwritten with * the lower triangle of the product L' * L. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER I DOUBLE PRECISION AII * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DDOT EXTERNAL LSAME, DDOT * .. * .. External Subroutines .. EXTERNAL DGEMV, DSCAL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) IF( .NOT.UPPER .AND. 
.NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLAUU2', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( UPPER ) THEN * * Compute the product U * U'. * DO 10 I = 1, N AII = A( I, I ) IF( I.LT.N ) THEN A( I, I ) = DDOT( N-I+1, A( I, I ), LDA, A( I, I ), LDA ) CALL DGEMV( 'No transpose', I-1, N-I, ONE, A( 1, I+1 ), $ LDA, A( I, I+1 ), LDA, AII, A( 1, I ), 1 ) ELSE CALL DSCAL( I, AII, A( 1, I ), 1 ) END IF 10 CONTINUE * ELSE * * Compute the product L' * L. * DO 20 I = 1, N AII = A( I, I ) IF( I.LT.N ) THEN A( I, I ) = DDOT( N-I+1, A( I, I ), 1, A( I, I ), 1 ) CALL DGEMV( 'Transpose', N-I, I-1, ONE, A( I+1, 1 ), LDA, $ A( I+1, I ), 1, AII, A( I, 1 ), LDA ) ELSE CALL DSCAL( I, AII, A( I, 1 ), LDA ) END IF 20 CONTINUE END IF * RETURN * * End of DLAUU2 * END OpenBLAS-0.2.20/reference/dlauumf.f000066400000000000000000000112631313527062700167110ustar00rootroot00000000000000 SUBROUTINE DLAUUMF( UPLO, N, A, LDA, INFO ) * * -- LAPACK auxiliary routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * February 29, 1992 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * DLAUUM computes the product U * U' or L' * L, where the triangular * factor U or L is stored in the upper or lower triangular part of * the array A. * * If UPLO = 'U' or 'u' then the upper triangle of the result is stored, * overwriting the factor U in A. * If UPLO = 'L' or 'l' then the lower triangle of the result is stored, * overwriting the factor L in A. * * This is the blocked form of the algorithm, calling Level 3 BLAS. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether the triangular factor stored in the array A * is upper or lower triangular: * = 'U': Upper triangular * = 'L': Lower triangular * * N (input) INTEGER * The order of the triangular factor U or L. N >= 0. * * A (input/output) DOUBLE PRECISION array, dimension (LDA,N) * On entry, the triangular factor U or L. * On exit, if UPLO = 'U', the upper triangle of A is * overwritten with the upper triangle of the product U * U'; * if UPLO = 'L', the lower triangle of A is overwritten with * the lower triangle of the product L' * L. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER I, IB, NB * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL DGEMM, DLAUU2, DSYRK, DTRMM, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLAUUM', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Determine the block size for this environment. * NB = 128 * IF( NB.LE.1 .OR. 
NB.GE.N ) THEN * * Use unblocked code * CALL DLAUU2( UPLO, N, A, LDA, INFO ) ELSE * * Use blocked code * IF( UPPER ) THEN * * Compute the product U * U'. * DO 10 I = 1, N, NB IB = MIN( NB, N-I+1 ) CALL DTRMM( 'Right', 'Upper', 'Transpose', 'Non-unit', $ I-1, IB, ONE, A( I, I ), LDA, A( 1, I ), $ LDA ) CALL DLAUU2( 'Upper', IB, A( I, I ), LDA, INFO ) IF( I+IB.LE.N ) THEN CALL DGEMM( 'No transpose', 'Transpose', I-1, IB, $ N-I-IB+1, ONE, A( 1, I+IB ), LDA, $ A( I, I+IB ), LDA, ONE, A( 1, I ), LDA ) CALL DSYRK( 'Upper', 'No transpose', IB, N-I-IB+1, $ ONE, A( I, I+IB ), LDA, ONE, A( I, I ), $ LDA ) END IF 10 CONTINUE ELSE * * Compute the product L' * L. * DO 20 I = 1, N, NB IB = MIN( NB, N-I+1 ) CALL DTRMM( 'Left', 'Lower', 'Transpose', 'Non-unit', IB, $ I-1, ONE, A( I, I ), LDA, A( I, 1 ), LDA ) CALL DLAUU2( 'Lower', IB, A( I, I ), LDA, INFO ) IF( I+IB.LE.N ) THEN CALL DGEMM( 'Transpose', 'No transpose', IB, I-1, $ N-I-IB+1, ONE, A( I+IB, I ), LDA, $ A( I+IB, 1 ), LDA, ONE, A( I, 1 ), LDA ) CALL DSYRK( 'Lower', 'Transpose', IB, N-I-IB+1, ONE, $ A( I+IB, I ), LDA, ONE, A( I, I ), LDA ) END IF 20 CONTINUE END IF END IF * RETURN * * End of DLAUUM * END OpenBLAS-0.2.20/reference/dmaxf.f000066400000000000000000000014551313527062700163550ustar00rootroot00000000000000 REAL*8 function dmaxf(n,dx,incx) c c finds the index of element having max. absolute value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c double precision dx(*) integer i,incx,ix,n c dmaxf = 0 if( n.lt.1 .or. incx.le.0 ) return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 dmaxf = dx(1) ix = ix + incx do 10 i = 2,n if(dx(ix).le.dmaxf) go to 5 dmaxf = dx(ix) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 dmaxf = dx(1) do 30 i = 2,n if(dx(i).le.dmaxf) go to 30 dmaxf = dx(i) 30 continue return end OpenBLAS-0.2.20/reference/dminf.f000066400000000000000000000014551313527062700163530ustar00rootroot00000000000000 REAL*8 function dminf(n,dx,incx) c c finds the index of element having min. absolute value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c double precision dx(*) integer i,incx,ix,n c dminf = 0 if( n.lt.1 .or. incx.le.0 ) return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 dminf = dx(1) ix = ix + incx do 10 i = 2,n if(dx(ix).ge.dminf) go to 5 dminf = dx(ix) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 dminf = dx(1) do 30 i = 2,n if(dx(i).ge.dminf) go to 30 dminf = dx(i) 30 continue return end OpenBLAS-0.2.20/reference/dnrm2f.f000066400000000000000000000031751313527062700164470ustar00rootroot00000000000000 DOUBLE PRECISION FUNCTION DNRM2F ( N, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, N * .. Array Arguments .. DOUBLE PRECISION X( * ) * .. * * DNRM2 returns the euclidean norm of a vector via the function * name, so that * * DNRM2 := sqrt( x'*x ) * * * * -- This version written on 25-October-1982. * Modified on 14-October-1993 to inline the call to DLASSQ. * Sven Hammarling, Nag Ltd. * * * .. Parameters .. DOUBLE PRECISION ONE , ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. Local Scalars .. INTEGER IX DOUBLE PRECISION ABSXI, NORM, SCALE, SSQ * .. Intrinsic Functions .. INTRINSIC ABS, SQRT * .. * .. Executable Statements .. IF( N.LT.1 .OR. 
INCX.LT.1 )THEN NORM = ZERO ELSE IF( N.EQ.1 )THEN NORM = ABS( X( 1 ) ) ELSE SCALE = ZERO SSQ = ONE * The following loop is equivalent to this call to the LAPACK * auxiliary routine: * CALL DLASSQ( N, X, INCX, SCALE, SSQ ) * DO 10, IX = 1, 1 + ( N - 1 )*INCX, INCX IF( X( IX ).NE.ZERO )THEN ABSXI = ABS( X( IX ) ) IF( SCALE.LT.ABSXI )THEN SSQ = ONE + SSQ*( SCALE/ABSXI )**2 SCALE = ABSXI ELSE SSQ = SSQ + ( ABSXI/SCALE )**2 END IF END IF 10 CONTINUE NORM = SCALE * SQRT( SSQ ) END IF * DNRM2F = NORM RETURN * * End of DNRM2. * END OpenBLAS-0.2.20/reference/dpotf2f.f000066400000000000000000000114141313527062700166160ustar00rootroot00000000000000 SUBROUTINE DPOTF2F( UPLO, N, A, LDA, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * February 29, 1992 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * DPOTF2 computes the Cholesky factorization of a real symmetric * positive definite matrix A. * * The factorization has the form * A = U' * U , if UPLO = 'U', or * A = L * L', if UPLO = 'L', * where U is an upper triangular matrix and L is lower triangular. * * This is the unblocked version of the algorithm, calling Level 2 BLAS. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input/output) DOUBLE PRECISION array, dimension (LDA,N) * On entry, the symmetric matrix A. If UPLO = 'U', the leading * n by n upper triangular part of A contains the upper * triangular part of the matrix A, and the strictly lower * triangular part of A is not referenced. If UPLO = 'L', the * leading n by n lower triangular part of A contains the lower * triangular part of the matrix A, and the strictly upper * triangular part of A is not referenced. * * On exit, if INFO = 0, the factor U or L from the Cholesky * factorization A = U'*U or A = L*L'. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * > 0: if INFO = k, the leading minor of order k is not * positive definite, and the factorization could not be * completed. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER J DOUBLE PRECISION AJJ * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DDOT EXTERNAL LSAME, DDOT * .. * .. External Subroutines .. EXTERNAL DGEMV, DSCAL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DPOTF2', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( UPPER ) THEN * * Compute the Cholesky factorization A = U'*U. * DO 10 J = 1, N * * Compute U(J,J) and test for non-positive-definiteness. 
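*           Here AJJ = A( J, J ) - U( 1:J-1, J )'*U( 1:J-1, J ) is
*           the square of U( J, J ); a non-positive value means A
*           is not positive definite.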
* AJJ = A( J, J ) - DDOT( J-1, A( 1, J ), 1, A( 1, J ), 1 ) IF( AJJ.LE.ZERO ) THEN A( J, J ) = AJJ GO TO 30 END IF AJJ = SQRT( AJJ ) A( J, J ) = AJJ * * Compute elements J+1:N of row J. * IF( J.LT.N ) THEN CALL DGEMV( 'Transpose', J-1, N-J, -ONE, A( 1, J+1 ), $ LDA, A( 1, J ), 1, ONE, A( J, J+1 ), LDA ) CALL DSCAL( N-J, ONE / AJJ, A( J, J+1 ), LDA ) END IF 10 CONTINUE ELSE * * Compute the Cholesky factorization A = L*L'. * DO 20 J = 1, N * * Compute L(J,J) and test for non-positive-definiteness. * AJJ = A( J, J ) - DDOT( J-1, A( J, 1 ), LDA, A( J, 1 ), $ LDA ) IF( AJJ.LE.ZERO ) THEN A( J, J ) = AJJ GO TO 30 END IF AJJ = SQRT( AJJ ) A( J, J ) = AJJ * * Compute elements J+1:N of column J. * IF( J.LT.N ) THEN CALL DGEMV( 'No transpose', N-J, J-1, -ONE, A( J+1, 1 ), $ LDA, A( J, 1 ), LDA, ONE, A( J+1, J ), 1 ) CALL DSCAL( N-J, ONE / AJJ, A( J+1, J ), 1 ) END IF 20 CONTINUE END IF GO TO 40 * 30 CONTINUE INFO = J * 40 CONTINUE RETURN * * End of DPOTF2 * END OpenBLAS-0.2.20/reference/dpotrff.f000066400000000000000000000127561313527062700167300ustar00rootroot00000000000000 SUBROUTINE DPOTRFF( UPLO, N, A, LDA, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * March 31, 1993 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * DPOTRF computes the Cholesky factorization of a real symmetric * positive definite matrix A. * * The factorization has the form * A = U**T * U, if UPLO = 'U', or * A = L * L**T, if UPLO = 'L', * where U is an upper triangular matrix and L is lower triangular. * * This is the block version of the algorithm, calling Level 3 BLAS. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * = 'U': Upper triangle of A is stored; * = 'L': Lower triangle of A is stored. * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input/output) DOUBLE PRECISION array, dimension (LDA,N) * On entry, the symmetric matrix A. If UPLO = 'U', the leading * N-by-N upper triangular part of A contains the upper * triangular part of the matrix A, and the strictly lower * triangular part of A is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of A contains the lower * triangular part of the matrix A, and the strictly upper * triangular part of A is not referenced. * * On exit, if INFO = 0, the factor U or L from the Cholesky * factorization A = U**T*U or A = L*L**T. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, the leading minor of order i is not * positive definite, and the factorization could not be * completed. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER J, JB, NB * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL DGEMM, DPOTF2, DSYRK, DTRSM, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) IF( .NOT.UPPER .AND. 
.NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DPOTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Determine the block size for this environment. * NB = 224 IF( NB.LE.1 .OR. NB.GE.N ) THEN * * Use unblocked code. * CALL DPOTF2( UPLO, N, A, LDA, INFO ) ELSE * * Use blocked code. * IF( UPPER ) THEN * * Compute the Cholesky factorization A = U'*U. * DO 10 J = 1, N, NB * * Update and factorize the current diagonal block and test * for non-positive-definiteness. * JB = MIN( NB, N-J+1 ) CALL DSYRK( 'Upper', 'Transpose', JB, J-1, -ONE, $ A( 1, J ), LDA, ONE, A( J, J ), LDA ) CALL DPOTF2( 'Upper', JB, A( J, J ), LDA, INFO ) IF( INFO.NE.0 ) $ GO TO 30 IF( J+JB.LE.N ) THEN * * Compute the current block row. * CALL DGEMM( 'Transpose', 'No transpose', JB, N-J-JB+1, $ J-1, -ONE, A( 1, J ), LDA, A( 1, J+JB ), $ LDA, ONE, A( J, J+JB ), LDA ) CALL DTRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', $ JB, N-J-JB+1, ONE, A( J, J ), LDA, $ A( J, J+JB ), LDA ) END IF 10 CONTINUE * ELSE * * Compute the Cholesky factorization A = L*L'. * DO 20 J = 1, N, NB * * Update and factorize the current diagonal block and test * for non-positive-definiteness. * JB = MIN( NB, N-J+1 ) CALL DSYRK( 'Lower', 'No transpose', JB, J-1, -ONE, $ A( J, 1 ), LDA, ONE, A( J, J ), LDA ) CALL DPOTF2( 'Lower', JB, A( J, J ), LDA, INFO ) IF( INFO.NE.0 ) $ GO TO 30 IF( J+JB.LE.N ) THEN * * Compute the current block column. * CALL DGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, $ J-1, -ONE, A( J+JB, 1 ), LDA, A( J, 1 ), $ LDA, ONE, A( J+JB, J ), LDA ) CALL DTRSM( 'Right', 'Lower', 'Transpose', 'Non-unit', $ N-J-JB+1, JB, ONE, A( J, J ), LDA, $ A( J+JB, J ), LDA ) END IF 20 CONTINUE END IF END IF GO TO 40 * 30 CONTINUE INFO = INFO + J - 1 * 40 CONTINUE RETURN * * End of DPOTRF * END OpenBLAS-0.2.20/reference/dpotrif.f000066400000000000000000000050411313527062700167200ustar00rootroot00000000000000 SUBROUTINE DPOTRIF( UPLO, N, A, LDA, INFO ) * * -- LAPACK routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * DPOTRI computes the inverse of a real symmetric positive definite * matrix A using the Cholesky factorization A = U**T*U or A = L*L**T * computed by DPOTRF. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * = 'U': Upper triangle of A is stored; * = 'L': Lower triangle of A is stored. * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input/output) DOUBLE PRECISION array, dimension (LDA,N) * On entry, the triangular factor U or L from the Cholesky * factorization A = U**T*U or A = L*L**T, as computed by * DPOTRF. * On exit, the upper or lower triangle of the (symmetric) * inverse of A, overwriting the input factor U or L. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, the (i,i) element of the factor U or L is * zero, and the inverse could not be computed. * * ===================================================================== * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL DLAUUM, DTRTRI, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. 
Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DPOTRI', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Invert the triangular Cholesky factor U or L. * CALL DTRTRI( UPLO, 'Non-unit', N, A, LDA, INFO ) IF( INFO.GT.0 ) $ RETURN * * Form inv(U)*inv(U)' or inv(L)'*inv(L). * CALL DLAUUM( UPLO, N, A, LDA, INFO ) * RETURN * * End of DPOTRI * END OpenBLAS-0.2.20/reference/drotf.f000066400000000000000000000016651313527062700163770ustar00rootroot00000000000000 subroutine drotf (n,dx,incx,dy,incy,c,s) c c applies a plane rotation. c jack dongarra, linpack, 3/11/78. c modified 12/3/93, array(1) declarations changed to array(*) c double precision dx(*),dy(*),dtemp,c,s integer i,incx,incy,ix,iy,n c if(n.le.0)return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments not equal c to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n dtemp = c*dx(ix) + s*dy(iy) dy(iy) = c*dy(iy) - s*dx(ix) dx(ix) = dtemp ix = ix + incx iy = iy + incy 10 continue return c c code for both increments equal to 1 c 20 do 30 i = 1,n dtemp = c*dx(i) + s*dy(i) dy(i) = c*dy(i) - s*dx(i) dx(i) = dtemp 30 continue return end OpenBLAS-0.2.20/reference/drotgf.f000066400000000000000000000012761313527062700165440ustar00rootroot00000000000000 subroutine drotgf(da,db,c,s) c c construct givens plane rotation. c jack dongarra, linpack, 3/11/78. c double precision da,db,c,s,roe,scale,r,z c roe = db if( dabs(da) .gt. dabs(db) ) roe = da scale = dabs(da) + dabs(db) if( scale .ne. 0.0d0 ) go to 10 c = 1.0d0 s = 0.0d0 r = 0.0d0 z = 0.0d0 go to 20 10 r = scale*dsqrt((da/scale)**2 + (db/scale)**2) r = dsign(1.0d0,roe)*r c = da/r s = db/r z = 1.0d0 if( dabs(da) .gt. dabs(db) ) z = s if( dabs(db) .ge. dabs(da) .and. c .ne. 0.0d0 ) z = 1.0d0/c 20 da = r db = z return end OpenBLAS-0.2.20/reference/drotmf.f000066400000000000000000000062731313527062700165540ustar00rootroot00000000000000 SUBROUTINE DROTMF (N,DX,INCX,DY,INCY,DPARAM) C C APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX C C (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN C (DY**T) C C DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE C LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY. C WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. C C DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 C C (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) C H=( ) ( ) ( ) ( ) C (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). C SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM. C DOUBLE PRECISION DFLAG,DH12,DH22,DX,TWO,Z,DH11,DH21, 1 DPARAM,DY,W,ZERO DIMENSION DX(1),DY(1),DPARAM(5) DATA ZERO,TWO/0.D0,2.D0/ C DFLAG=DPARAM(1) IF(N .LE. 0 .OR.(DFLAG+TWO.EQ.ZERO)) GO TO 140 IF(.NOT.(INCX.EQ.INCY.AND. INCX .GT.0)) GO TO 70 C NSTEPS=N*INCX IF(DFLAG) 50,10,30 10 CONTINUE DH12=DPARAM(4) DH21=DPARAM(3) DO 20 I=1,NSTEPS,INCX W=DX(I) Z=DY(I) DX(I)=W+Z*DH12 DY(I)=W*DH21+Z 20 CONTINUE GO TO 140 30 CONTINUE DH11=DPARAM(2) DH22=DPARAM(5) DO 40 I=1,NSTEPS,INCX W=DX(I) Z=DY(I) DX(I)=W*DH11+Z DY(I)=-W+DH22*Z 40 CONTINUE GO TO 140 50 CONTINUE DH11=DPARAM(2) DH12=DPARAM(4) DH21=DPARAM(3) DH22=DPARAM(5) DO 60 I=1,NSTEPS,INCX W=DX(I) Z=DY(I) DX(I)=W*DH11+Z*DH12 DY(I)=W*DH21+Z*DH22 60 CONTINUE GO TO 140 70 CONTINUE KX=1 KY=1 IF(INCX .LT. 
0) KX=1+(1-N)*INCX IF(INCY .LT. 0) KY=1+(1-N)*INCY C IF(DFLAG)120,80,100 80 CONTINUE DH12=DPARAM(4) DH21=DPARAM(3) DO 90 I=1,N W=DX(KX) Z=DY(KY) DX(KX)=W+Z*DH12 DY(KY)=W*DH21+Z KX=KX+INCX KY=KY+INCY 90 CONTINUE GO TO 140 100 CONTINUE DH11=DPARAM(2) DH22=DPARAM(5) DO 110 I=1,N W=DX(KX) Z=DY(KY) DX(KX)=W*DH11+Z DY(KY)=-W+DH22*Z KX=KX+INCX KY=KY+INCY 110 CONTINUE GO TO 140 120 CONTINUE DH11=DPARAM(2) DH12=DPARAM(4) DH21=DPARAM(3) DH22=DPARAM(5) DO 130 I=1,N W=DX(KX) Z=DY(KY) DX(KX)=W*DH11+Z*DH12 DY(KY)=W*DH21+Z*DH22 KX=KX+INCX KY=KY+INCY 130 CONTINUE 140 CONTINUE RETURN END OpenBLAS-0.2.20/reference/drotmgf.f000066400000000000000000000114531313527062700167170ustar00rootroot00000000000000 SUBROUTINE DROTMGF (DD1,DD2,DX1,DY1,DPARAM) C C CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS C THE SECOND COMPONENT OF THE 2-VECTOR (DSQRT(DD1)*DX1,DSQRT(DD2)* C DY2)**T. C WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. C C DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 C C (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) C H=( ) ( ) ( ) ( ) C (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). C LOCATIONS 2-4 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22 C RESPECTIVELY. (VALUES OF 1.D0, -1.D0, OR 0.D0 IMPLIED BY THE C VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.) C C THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE C INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE C OF DD1 AND DD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. C DOUBLE PRECISION GAM,ONE,RGAMSQ,DD2,DH11,DH21,DPARAM,DP2, 1 DQ2,DU,DY1,ZERO,GAMSQ,DD1,DFLAG,DH12,DH22,DP1,DQ1, 2 DTEMP,DX1,TWO DIMENSION DPARAM(5) C DATA ZERO,ONE,TWO /0.D0,1.D0,2.D0/ DATA GAM,GAMSQ,RGAMSQ/4096.D0,16777216.D0,5.9604645D-8/ IF(.NOT. DD1 .LT. ZERO) GO TO 10 C GO ZERO-H-D-AND-DX1.. GO TO 60 10 CONTINUE C CASE-DD1-NONNEGATIVE DP2=DD2*DY1 IF(.NOT. DP2 .EQ. ZERO) GO TO 20 DFLAG=-TWO GO TO 260 C REGULAR-CASE.. 20 CONTINUE DP1=DD1*DX1 DQ2=DP2*DY1 DQ1=DP1*DX1 C IF(.NOT. DABS(DQ1) .GT. DABS(DQ2)) GO TO 40 DH21=-DY1/DX1 DH12=DP2/DP1 C DU=ONE-DH12*DH21 C IF(.NOT. DU .LE. ZERO) GO TO 30 C GO ZERO-H-D-AND-DX1.. GO TO 60 30 CONTINUE DFLAG=ZERO DD1=DD1/DU DD2=DD2/DU DX1=DX1*DU C GO SCALE-CHECK.. GO TO 100 40 CONTINUE IF(.NOT. DQ2 .LT. ZERO) GO TO 50 C GO ZERO-H-D-AND-DX1.. GO TO 60 50 CONTINUE DFLAG=ONE DH11=DP1/DP2 DH22=DX1/DY1 DU=ONE+DH11*DH22 DTEMP=DD2/DU DD2=DD1/DU DD1=DTEMP DX1=DY1*DU C GO SCALE-CHECK GO TO 100 C PROCEDURE..ZERO-H-D-AND-DX1.. 60 CONTINUE DFLAG=-ONE DH11=ZERO DH12=ZERO DH21=ZERO DH22=ZERO C DD1=ZERO DD2=ZERO DX1=ZERO C RETURN.. GO TO 220 C PROCEDURE..FIX-H.. 70 CONTINUE IF(.NOT. DFLAG .GE. ZERO) GO TO 90 C IF(.NOT. DFLAG .EQ. ZERO) GO TO 80 DH11=ONE DH22=ONE DFLAG=-ONE GO TO 90 80 CONTINUE DH21=-ONE DH12=ONE DFLAG=-ONE 90 CONTINUE GO TO IGO,(120,150,180,210) C PROCEDURE..SCALE-CHECK 100 CONTINUE 110 CONTINUE IF(.NOT. DD1 .LE. RGAMSQ) GO TO 130 IF(DD1 .EQ. ZERO) GO TO 160 ASSIGN 120 TO IGO C FIX-H.. GO TO 70 120 CONTINUE DD1=DD1*GAM**2 DX1=DX1/GAM DH11=DH11/GAM DH12=DH12/GAM GO TO 110 130 CONTINUE 140 CONTINUE IF(.NOT. DD1 .GE. GAMSQ) GO TO 160 ASSIGN 150 TO IGO C FIX-H.. GO TO 70 150 CONTINUE DD1=DD1/GAM**2 DX1=DX1*GAM DH11=DH11*GAM DH12=DH12*GAM GO TO 140 160 CONTINUE 170 CONTINUE IF(.NOT. DABS(DD2) .LE. RGAMSQ) GO TO 190 IF(DD2 .EQ. ZERO) GO TO 220 ASSIGN 180 TO IGO C FIX-H.. GO TO 70 180 CONTINUE DD2=DD2*GAM**2 DH21=DH21/GAM DH22=DH22/GAM GO TO 170 190 CONTINUE 200 CONTINUE IF(.NOT. DABS(DD2) .GE. GAMSQ) GO TO 220 ASSIGN 210 TO IGO C FIX-H.. 
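C              (THE FIX-H PROCEDURE AT STATEMENT 70 STORES THE
C               ELEMENTS OF H IMPLIED BY DFLAG EXPLICITLY BEFORE
C               H IS RESCALED BY GAM.)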
GO TO 70 210 CONTINUE DD2=DD2/GAM**2 DH21=DH21*GAM DH22=DH22*GAM GO TO 200 220 CONTINUE IF(DFLAG)250,230,240 230 CONTINUE DPARAM(3)=DH21 DPARAM(4)=DH12 GO TO 260 240 CONTINUE DPARAM(2)=DH11 DPARAM(5)=DH22 GO TO 260 250 CONTINUE DPARAM(2)=DH11 DPARAM(3)=DH21 DPARAM(4)=DH12 DPARAM(5)=DH22 260 CONTINUE DPARAM(1)=DFLAG RETURN END OpenBLAS-0.2.20/reference/dsbmvf.f000066400000000000000000000231511313527062700165340ustar00rootroot00000000000000 SUBROUTINE DSBMVF( UPLO, N, K, ALPHA, A, LDA, X, INCX, $ BETA, Y, INCY ) * .. Scalar Arguments .. DOUBLE PRECISION ALPHA, BETA INTEGER INCX, INCY, K, LDA, N CHARACTER*1 UPLO * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * DSBMV performs the matrix-vector operation * * y := alpha*A*x + beta*y, * * where alpha and beta are scalars, x and y are n element vectors and * A is an n by n symmetric band matrix, with k super-diagonals. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the band matrix A is being supplied as * follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * being supplied. * * UPLO = 'L' or 'l' The lower triangular part of A is * being supplied. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * K - INTEGER. * On entry, K specifies the number of super-diagonals of the * matrix A. K must satisfy 0 .le. K. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION. * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) * by n part of the array A must contain the upper triangular * band part of the symmetric matrix, supplied column by * column, with the leading diagonal of the matrix in row * ( k + 1 ) of the array, the first super-diagonal starting at * position 2 in row k, and so on. The top left k by k triangle * of the array A is not referenced. * The following program segment will transfer the upper * triangular part of a symmetric band matrix from conventional * full matrix storage to band storage: * * DO 20, J = 1, N * M = K + 1 - J * DO 10, I = MAX( 1, J - K ), J * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) * by n part of the array A must contain the lower triangular * band part of the symmetric matrix, supplied column by * column, with the leading diagonal of the matrix in row 1 of * the array, the first sub-diagonal starting at position 1 in * row 2, and so on. The bottom right k by k triangle of the * array A is not referenced. * The following program segment will transfer the lower * triangular part of a symmetric band matrix from conventional * full matrix storage to band storage: * * DO 20, J = 1, N * M = 1 - J * DO 10, I = J, MIN( N, J + K ) * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * ( k + 1 ). * Unchanged on exit. * * X - DOUBLE PRECISION array of DIMENSION at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the * vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. 
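*           A negative INCX is permitted, in which case the vector
*           x is traversed in reverse storage order, starting at
*           element 1 + ( n - 1 )*abs( INCX ) of X.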
* * BETA - DOUBLE PRECISION. * On entry, BETA specifies the scalar beta. * Unchanged on exit. * * Y - DOUBLE PRECISION array of DIMENSION at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the * vector y. On exit, Y is overwritten by the updated vector y. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. DOUBLE PRECISION ONE , ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. Local Scalars .. DOUBLE PRECISION TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( K.LT.0 )THEN INFO = 3 ELSE IF( LDA.LT.( K + 1 ) )THEN INFO = 6 ELSE IF( INCX.EQ.0 )THEN INFO = 8 ELSE IF( INCY.EQ.0 )THEN INFO = 11 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DSBMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF * * Start the operations. In this version the elements of the array A * are accessed sequentially with one pass through A. * * First form y := beta*y. * IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, N Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, N Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN IF( LSAME( UPLO, 'U' ) )THEN * * Form y when upper triangle of A is stored. * KPLUS1 = K + 1 IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO L = KPLUS1 - J DO 50, I = MAX( 1, J - K ), J - 1 Y( I ) = Y( I ) + TEMP1*A( L + I, J ) TEMP2 = TEMP2 + A( L + I, J )*X( I ) 50 CONTINUE Y( J ) = Y( J ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 60 CONTINUE ELSE JX = KX JY = KY DO 80, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY L = KPLUS1 - J DO 70, I = MAX( 1, J - K ), J - 1 Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) TEMP2 = TEMP2 + A( L + I, J )*X( IX ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY IF( J.GT.K )THEN KX = KX + INCX KY = KY + INCY END IF 80 CONTINUE END IF ELSE * * Form y when lower triangle of A is stored. 
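*        ( Here the diagonal element a( j, j ) is held in A( 1, J )
*          and, with L = 1 - J, the sub-diagonal element a( i, j ),
*          j .lt. i .le. min( n, j + k ), is held in A( L + I, J ). )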
* IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 100, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*A( 1, J ) L = 1 - J DO 90, I = J + 1, MIN( N, J + K ) Y( I ) = Y( I ) + TEMP1*A( L + I, J ) TEMP2 = TEMP2 + A( L + I, J )*X( I ) 90 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 100 CONTINUE ELSE JX = KX JY = KY DO 120, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*A( 1, J ) L = 1 - J IX = JX IY = JY DO 110, I = J + 1, MIN( N, J + K ) IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) TEMP2 = TEMP2 + A( L + I, J )*X( IX ) 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 120 CONTINUE END IF END IF * RETURN * * End of DSBMV . * END OpenBLAS-0.2.20/reference/dscalf.f000066400000000000000000000017701313527062700165120ustar00rootroot00000000000000 subroutine dscalf(n,da,dx,incx) c c scales a vector by a constant. c uses unrolled loops for increment equal to one. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c double precision da,dx(*) integer i,incx,m,mp1,n,nincx c if( n.le.0 .or. incx.le.0 )return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c nincx = n*incx do 10 i = 1,nincx,incx dx(i) = da*dx(i) 10 continue return c c code for increment equal to 1 c c c clean-up loop c 20 m = mod(n,5) if( m .eq. 0 ) go to 40 do 30 i = 1,m dx(i) = da*dx(i) 30 continue if( n .lt. 5 ) return 40 mp1 = m + 1 do 50 i = mp1,n,5 dx(i) = da*dx(i) dx(i + 1) = da*dx(i + 1) dx(i + 2) = da*dx(i + 2) dx(i + 3) = da*dx(i + 3) dx(i + 4) = da*dx(i + 4) 50 continue return end OpenBLAS-0.2.20/reference/dsdotf.f000066400000000000000000000047751313527062700165510ustar00rootroot00000000000000*DECK DSDOT DOUBLE PRECISION FUNCTION DSDOTF (N, SX, INCX, SY, INCY) C***BEGIN PROLOGUE DSDOT C***PURPOSE Compute the inner product of two vectors with extended C precision accumulation and result. C***LIBRARY SLATEC (BLAS) C***CATEGORY D1A4 C***TYPE DOUBLE PRECISION (DSDOT-D, DCDOT-C) C***KEYWORDS BLAS, COMPLEX VECTORS, DOT PRODUCT, INNER PRODUCT, C LINEAR ALGEBRA, VECTOR C***AUTHOR Lawson, C. L., (JPL) C Hanson, R. J., (SNLA) C Kincaid, D. R., (U. of Texas) C Krogh, F. T., (JPL) C***DESCRIPTION C C B L A S Subprogram C Description of Parameters C C --Input-- C N number of elements in input vector(s) C SX single precision vector with N elements C INCX storage spacing between elements of SX C SY single precision vector with N elements C INCY storage spacing between elements of SY C C --Output-- C DSDOT double precision dot product (zero if N.LE.0) C C Returns D.P. dot product accumulated in D.P., for S.P. SX and SY C DSDOT = sum for I = 0 to N-1 of SX(LX+I*INCX) * SY(LY+I*INCY), C where LX = 1 if INCX .GE. 0, else LX = 1+(1-N)*INCX, and LY is C defined in a similar way using INCY. C C***REFERENCES C. L. Lawson, R. J. Hanson, D. R. Kincaid and F. T. C Krogh, Basic linear algebra subprograms for Fortran C usage, Algorithm No. 539, Transactions on Mathematical C Software 5, 3 (September 1979), pp. 308-323. C***ROUTINES CALLED (NONE) C***REVISION HISTORY (YYMMDD) C 791001 DATE WRITTEN C 890831 Modified array declarations. (WRB) C 890831 REVISION DATE from Version 3.2 C 891214 Prologue converted to Version 4.0 format. (BAB) C 920310 Corrected definition of LX in DESCRIPTION. (WRB) C 920501 Reformatted the REFERENCES section. (WRB) C***END PROLOGUE DSDOT REAL SX(*),SY(*) C***FIRST EXECUTABLE STATEMENT DSDOT DSDOTF = 0.0D0 IF (N .LE. 0) RETURN IF (INCX.EQ.INCY .AND. 
INCX.GT.0) GO TO 20 C C Code for unequal or nonpositive increments. C KX = 1 KY = 1 IF (INCX .LT. 0) KX = 1+(1-N)*INCX IF (INCY .LT. 0) KY = 1+(1-N)*INCY DO 10 I = 1,N DSDOTF = DSDOTF + DBLE(SX(KX))*DBLE(SY(KY)) KX = KX + INCX KY = KY + INCY 10 CONTINUE RETURN C C Code for equal, positive, non-unit increments. C 20 NS = N*INCX DO 30 I = 1,NS,INCX DSDOTF = DSDOTF + DBLE(SX(I))*DBLE(SY(I)) 30 CONTINUE RETURN END OpenBLAS-0.2.20/reference/dspmvf.f000066400000000000000000000200101313527062700165410ustar00rootroot00000000000000 SUBROUTINE DSPMVF( UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY ) * .. Scalar Arguments .. DOUBLE PRECISION ALPHA, BETA INTEGER INCX, INCY, N CHARACTER*1 UPLO * .. Array Arguments .. DOUBLE PRECISION AP( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * DSPMV performs the matrix-vector operation * * y := alpha*A*x + beta*y, * * where alpha and beta are scalars, x and y are n element vectors and * A is an n by n symmetric matrix, supplied in packed form. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the matrix A is supplied in the packed * array AP as follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * supplied in AP. * * UPLO = 'L' or 'l' The lower triangular part of A is * supplied in AP. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION. * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * AP - DOUBLE PRECISION array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). * Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) * and a( 2, 2 ) respectively, and so on. * Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) * and a( 3, 1 ) respectively, and so on. * Unchanged on exit. * * X - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - DOUBLE PRECISION. * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. On exit, Y is overwritten by the updated * vector y. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. DOUBLE PRECISION ONE , ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. Local Scalars .. DOUBLE PRECISION TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. 
* .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 6 ELSE IF( INCY.EQ.0 )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DSPMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF * * Start the operations. In this version the elements of the array AP * are accessed sequentially with one pass through AP. * * First form y := beta*y. * IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, N Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, N Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN KK = 1 IF( LSAME( UPLO, 'U' ) )THEN * * Form y when AP contains the upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO K = KK DO 50, I = 1, J - 1 Y( I ) = Y( I ) + TEMP1*AP( K ) TEMP2 = TEMP2 + AP( K )*X( I ) K = K + 1 50 CONTINUE Y( J ) = Y( J ) + TEMP1*AP( KK + J - 1 ) + ALPHA*TEMP2 KK = KK + J 60 CONTINUE ELSE JX = KX JY = KY DO 80, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY DO 70, K = KK, KK + J - 2 Y( IY ) = Y( IY ) + TEMP1*AP( K ) TEMP2 = TEMP2 + AP( K )*X( IX ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1*AP( KK + J - 1 ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY KK = KK + J 80 CONTINUE END IF ELSE * * Form y when AP contains the lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 100, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*AP( KK ) K = KK + 1 DO 90, I = J + 1, N Y( I ) = Y( I ) + TEMP1*AP( K ) TEMP2 = TEMP2 + AP( K )*X( I ) K = K + 1 90 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 KK = KK + ( N - J + 1 ) 100 CONTINUE ELSE JX = KX JY = KY DO 120, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*AP( KK ) IX = JX IY = JY DO 110, K = KK + 1, KK + N - J IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*AP( K ) TEMP2 = TEMP2 + AP( K )*X( IX ) 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY KK = KK + ( N - J + 1 ) 120 CONTINUE END IF END IF * RETURN * * End of DSPMV . * END OpenBLAS-0.2.20/reference/dspr2f.f000066400000000000000000000161611313527062700164560ustar00rootroot00000000000000 SUBROUTINE DSPR2F( UPLO, N, ALPHA, X, INCX, Y, INCY, AP ) * .. Scalar Arguments .. DOUBLE PRECISION ALPHA INTEGER INCX, INCY, N CHARACTER*1 UPLO * .. Array Arguments .. DOUBLE PRECISION AP( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * DSPR2 performs the symmetric rank 2 operation * * A := alpha*x*y' + alpha*y*x' + A, * * where alpha is a scalar, x and y are n element vectors and A is an * n by n symmetric matrix, supplied in packed form. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the matrix A is supplied in the packed * array AP as follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * supplied in AP. * * UPLO = 'L' or 'l' The lower triangular part of A is * supplied in AP. * * Unchanged on exit. * * N - INTEGER. 
* On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION. * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. * Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * AP - DOUBLE PRECISION array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). * Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) * and a( 2, 2 ) respectively, and so on. On exit, the array * AP is overwritten by the upper triangular part of the * updated matrix. * Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) * and a( 3, 1 ) respectively, and so on. On exit, the array * AP is overwritten by the lower triangular part of the * updated matrix. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. Local Scalars .. DOUBLE PRECISION TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( INCY.EQ.0 )THEN INFO = 7 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DSPR2 ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Set up the start points in X and Y if the increments are not both * unity. * IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF JX = KX JY = KY END IF * * Start the operations. In this version the elements of the array AP * are accessed sequentially with one pass through AP. * KK = 1 IF( LSAME( UPLO, 'U' ) )THEN * * Form A when upper triangle is stored in AP. 
* IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 20, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( J ) TEMP2 = ALPHA*X( J ) K = KK DO 10, I = 1, J AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 K = K + 1 10 CONTINUE END IF KK = KK + J 20 CONTINUE ELSE DO 40, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( JY ) TEMP2 = ALPHA*X( JX ) IX = KX IY = KY DO 30, K = KK, KK + J - 1 AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 30 CONTINUE END IF JX = JX + INCX JY = JY + INCY KK = KK + J 40 CONTINUE END IF ELSE * * Form A when lower triangle is stored in AP. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( J ) TEMP2 = ALPHA*X( J ) K = KK DO 50, I = J, N AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 K = K + 1 50 CONTINUE END IF KK = KK + N - J + 1 60 CONTINUE ELSE DO 80, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( JY ) TEMP2 = ALPHA*X( JX ) IX = JX IY = JY DO 70, K = KK, KK + N - J AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 70 CONTINUE END IF JX = JX + INCX JY = JY + INCY KK = KK + N - J + 1 80 CONTINUE END IF END IF * RETURN * * End of DSPR2 . * END OpenBLAS-0.2.20/reference/dsprf.f000066400000000000000000000135641313527062700164000ustar00rootroot00000000000000 SUBROUTINE DSPRF ( UPLO, N, ALPHA, X, INCX, AP ) * .. Scalar Arguments .. DOUBLE PRECISION ALPHA INTEGER INCX, N CHARACTER*1 UPLO * .. Array Arguments .. DOUBLE PRECISION AP( * ), X( * ) * .. * * Purpose * ======= * * DSPR performs the symmetric rank 1 operation * * A := alpha*x*x' + A, * * where alpha is a real scalar, x is an n element vector and A is an * n by n symmetric matrix, supplied in packed form. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the matrix A is supplied in the packed * array AP as follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * supplied in AP. * * UPLO = 'L' or 'l' The lower triangular part of A is * supplied in AP. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION. * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * AP - DOUBLE PRECISION array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). * Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) * and a( 2, 2 ) respectively, and so on. On exit, the array * AP is overwritten by the upper triangular part of the * updated matrix. * Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) * and a( 3, 1 ) respectively, and so on. On exit, the array * AP is overwritten by the lower triangular part of the * updated matrix. 
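*
*  Equivalently, with UPLO = 'U' or 'u' the element a( i, j ),
*  i .le. j, is held in AP( i + ( j - 1 )*j/2 ), and with
*  UPLO = 'L' or 'l' the element a( i, j ), i .ge. j, is held in
*  AP( i + ( j - 1 )*( 2*n - j )/2 ).
*
*  A minimal illustrative call of the interface described above
*  (a sketch only; N, X and AP are placeholder names, with AP
*  dimensioned at least ( n*( n + 1 ) )/2 and X at least n):
*
*     CALL DSPR( 'U', N, 1.0D0, X, 1, AP )
*
*  which performs the rank 1 update A := x*x' + A on the packed
*  upper triangle.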
* * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. Local Scalars .. DOUBLE PRECISION TEMP INTEGER I, INFO, IX, J, JX, K, KK, KX * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DSPR ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Set the start point in X if the increment is not unity. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of the array AP * are accessed sequentially with one pass through AP. * KK = 1 IF( LSAME( UPLO, 'U' ) )THEN * * Form A when upper triangle is stored in AP. * IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = ALPHA*X( J ) K = KK DO 10, I = 1, J AP( K ) = AP( K ) + X( I )*TEMP K = K + 1 10 CONTINUE END IF KK = KK + J 20 CONTINUE ELSE JX = KX DO 40, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*X( JX ) IX = KX DO 30, K = KK, KK + J - 1 AP( K ) = AP( K ) + X( IX )*TEMP IX = IX + INCX 30 CONTINUE END IF JX = JX + INCX KK = KK + J 40 CONTINUE END IF ELSE * * Form A when lower triangle is stored in AP. * IF( INCX.EQ.1 )THEN DO 60, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = ALPHA*X( J ) K = KK DO 50, I = J, N AP( K ) = AP( K ) + X( I )*TEMP K = K + 1 50 CONTINUE END IF KK = KK + N - J + 1 60 CONTINUE ELSE JX = KX DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*X( JX ) IX = JX DO 70, K = KK, KK + N - J AP( K ) = AP( K ) + X( IX )*TEMP IX = IX + INCX 70 CONTINUE END IF JX = JX + INCX KK = KK + N - J + 1 80 CONTINUE END IF END IF * RETURN * * End of DSPR . * END OpenBLAS-0.2.20/reference/dswapf.f000066400000000000000000000024271313527062700165420ustar00rootroot00000000000000 subroutine dswapf (n,dx,incx,dy,incy) c c interchanges two vectors. c uses unrolled loops for increments equal one. c jack dongarra, linpack, 3/11/78. c modified 12/3/93, array(1) declarations changed to array(*) c double precision dx(*),dy(*),dtemp integer i,incx,incy,ix,iy,m,mp1,n c if(n.le.0)return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments not equal c to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n dtemp = dx(ix) dx(ix) = dy(iy) dy(iy) = dtemp ix = ix + incx iy = iy + incy 10 continue return c c code for both increments equal to 1 c c c clean-up loop c 20 m = mod(n,3) if( m .eq. 0 ) go to 40 do 30 i = 1,m dtemp = dx(i) dx(i) = dy(i) dy(i) = dtemp 30 continue if( n .lt. 3 ) return 40 mp1 = m + 1 do 50 i = mp1,n,3 dtemp = dx(i) dx(i) = dy(i) dy(i) = dtemp dtemp = dx(i + 1) dx(i + 1) = dy(i + 1) dy(i + 1) = dtemp dtemp = dx(i + 2) dx(i + 2) = dy(i + 2) dy(i + 2) = dtemp 50 continue return end OpenBLAS-0.2.20/reference/dsymmf.f000066400000000000000000000231511313527062700165520ustar00rootroot00000000000000 SUBROUTINE DSYMMF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC ) * .. Scalar Arguments .. 
CHARACTER*1 SIDE, UPLO INTEGER M, N, LDA, LDB, LDC DOUBLE PRECISION ALPHA, BETA * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), B( LDB, * ), C( LDC, * ) * .. * * Purpose * ======= * * DSYMM performs one of the matrix-matrix operations * * C := alpha*A*B + beta*C, * * or * * C := alpha*B*A + beta*C, * * where alpha and beta are scalars, A is a symmetric matrix and B and * C are m by n matrices. * * Parameters * ========== * * SIDE - CHARACTER*1. * On entry, SIDE specifies whether the symmetric matrix A * appears on the left or right in the operation as follows: * * SIDE = 'L' or 'l' C := alpha*A*B + beta*C, * * SIDE = 'R' or 'r' C := alpha*B*A + beta*C, * * Unchanged on exit. * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the symmetric matrix A is to be * referenced as follows: * * UPLO = 'U' or 'u' Only the upper triangular part of the * symmetric matrix is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of the * symmetric matrix is to be referenced. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix C. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix C. * N must be at least zero. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION. * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is * m when SIDE = 'L' or 'l' and is n otherwise. * Before entry with SIDE = 'L' or 'l', the m by m part of * the array A must contain the symmetric matrix, such that * when UPLO = 'U' or 'u', the leading m by m upper triangular * part of the array A must contain the upper triangular part * of the symmetric matrix and the strictly lower triangular * part of A is not referenced, and when UPLO = 'L' or 'l', * the leading m by m lower triangular part of the array A * must contain the lower triangular part of the symmetric * matrix and the strictly upper triangular part of A is not * referenced. * Before entry with SIDE = 'R' or 'r', the n by n part of * the array A must contain the symmetric matrix, such that * when UPLO = 'U' or 'u', the leading n by n upper triangular * part of the array A must contain the upper triangular part * of the symmetric matrix and the strictly lower triangular * part of A is not referenced, and when UPLO = 'L' or 'l', * the leading n by n lower triangular part of the array A * must contain the lower triangular part of the symmetric * matrix and the strictly upper triangular part of A is not * referenced. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When SIDE = 'L' or 'l' then * LDA must be at least max( 1, m ), otherwise LDA must be at * least max( 1, n ). * Unchanged on exit. * * B - DOUBLE PRECISION array of DIMENSION ( LDB, n ). * Before entry, the leading m by n part of the array B must * contain the matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. LDB must be at least * max( 1, m ). * Unchanged on exit. * * BETA - DOUBLE PRECISION. * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then C need not be set on input. * Unchanged on exit. * * C - DOUBLE PRECISION array of DIMENSION ( LDC, n ). 
* Before entry, the leading m by n part of the array C must * contain the matrix C, except when beta is zero, in which * case C need not be set on entry. * On exit, the array C is overwritten by the m by n updated * matrix. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. Local Scalars .. LOGICAL UPPER INTEGER I, INFO, J, K, NROWA DOUBLE PRECISION TEMP1, TEMP2 * .. Parameters .. DOUBLE PRECISION ONE , ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Executable Statements .. * * Set NROWA as the number of rows of A. * IF( LSAME( SIDE, 'L' ) )THEN NROWA = M ELSE NROWA = N END IF UPPER = LSAME( UPLO, 'U' ) * * Test the input parameters. * INFO = 0 IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN INFO = 2 ELSE IF( M .LT.0 )THEN INFO = 3 ELSE IF( N .LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 7 ELSE IF( LDB.LT.MAX( 1, M ) )THEN INFO = 9 ELSE IF( LDC.LT.MAX( 1, M ) )THEN INFO = 12 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DSYMM ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN IF( BETA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, M C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40, J = 1, N DO 30, I = 1, M C( I, J ) = BETA*C( I, J ) 30 CONTINUE 40 CONTINUE END IF RETURN END IF * * Start the operations. * IF( LSAME( SIDE, 'L' ) )THEN * * Form C := alpha*A*B + beta*C. * IF( UPPER )THEN DO 70, J = 1, N DO 60, I = 1, M TEMP1 = ALPHA*B( I, J ) TEMP2 = ZERO DO 50, K = 1, I - 1 C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) TEMP2 = TEMP2 + B( K, J )*A( K, I ) 50 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ TEMP1*A( I, I ) + ALPHA*TEMP2 END IF 60 CONTINUE 70 CONTINUE ELSE DO 100, J = 1, N DO 90, I = M, 1, -1 TEMP1 = ALPHA*B( I, J ) TEMP2 = ZERO DO 80, K = I + 1, M C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) TEMP2 = TEMP2 + B( K, J )*A( K, I ) 80 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ TEMP1*A( I, I ) + ALPHA*TEMP2 END IF 90 CONTINUE 100 CONTINUE END IF ELSE * * Form C := alpha*B*A + beta*C. * DO 170, J = 1, N TEMP1 = ALPHA*A( J, J ) IF( BETA.EQ.ZERO )THEN DO 110, I = 1, M C( I, J ) = TEMP1*B( I, J ) 110 CONTINUE ELSE DO 120, I = 1, M C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) 120 CONTINUE END IF DO 140, K = 1, J - 1 IF( UPPER )THEN TEMP1 = ALPHA*A( K, J ) ELSE TEMP1 = ALPHA*A( J, K ) END IF DO 130, I = 1, M C( I, J ) = C( I, J ) + TEMP1*B( I, K ) 130 CONTINUE 140 CONTINUE DO 160, K = J + 1, N IF( UPPER )THEN TEMP1 = ALPHA*A( J, K ) ELSE TEMP1 = ALPHA*A( K, J ) END IF DO 150, I = 1, M C( I, J ) = C( I, J ) + TEMP1*B( I, K ) 150 CONTINUE 160 CONTINUE 170 CONTINUE END IF * RETURN * * End of DSYMM . 
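*
*     A minimal illustrative call (a sketch only; M, N, A, B, C and
*     the leading dimensions are placeholder names):
*
*        CALL DSYMM( 'L', 'U', M, N, 1.0D0, A, LDA,
*     $              B, LDB, 0.0D0, C, LDC )
*
*     which forms C := A*B, referencing the symmetric matrix A
*     through its upper triangle.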
* END OpenBLAS-0.2.20/reference/dsymvf.f000066400000000000000000000176111313527062700165670ustar00rootroot00000000000000 SUBROUTINE DSYMVF ( UPLO, N, ALPHA, A, LDA, X, INCX, $ BETA, Y, INCY ) * .. Scalar Arguments .. DOUBLE PRECISION ALPHA, BETA INTEGER INCX, INCY, LDA, N CHARACTER*1 UPLO * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * DSYMV performs the matrix-vector operation * * y := alpha*A*x + beta*y, * * where alpha and beta are scalars, x and y are n element vectors and * A is an n by n symmetric matrix. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array A is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of A * is to be referenced. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION. * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular part of the symmetric matrix and the strictly * lower triangular part of A is not referenced. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular part of the symmetric matrix and the strictly * upper triangular part of A is not referenced. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * X - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - DOUBLE PRECISION. * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. On exit, Y is overwritten by the updated * vector y. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. DOUBLE PRECISION ONE , ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. Local Scalars .. DOUBLE PRECISION TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. 
$ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 5 ELSE IF( INCX.EQ.0 )THEN INFO = 7 ELSE IF( INCY.EQ.0 )THEN INFO = 10 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DSYMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * * First form y := beta*y. * IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, N Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, N Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN IF( LSAME( UPLO, 'U' ) )THEN * * Form y when A is stored in upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO DO 50, I = 1, J - 1 Y( I ) = Y( I ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( I ) 50 CONTINUE Y( J ) = Y( J ) + TEMP1*A( J, J ) + ALPHA*TEMP2 60 CONTINUE ELSE JX = KX JY = KY DO 80, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY DO 70, I = 1, J - 1 Y( IY ) = Y( IY ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( IX ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 80 CONTINUE END IF ELSE * * Form y when A is stored in lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 100, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*A( J, J ) DO 90, I = J + 1, N Y( I ) = Y( I ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( I ) 90 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 100 CONTINUE ELSE JX = KX JY = KY DO 120, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*A( J, J ) IX = JX IY = JY DO 110, I = J + 1, N IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( IX ) 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 120 CONTINUE END IF END IF * RETURN * * End of DSYMV . * END OpenBLAS-0.2.20/reference/dsyr2f.f000066400000000000000000000162571313527062700164750ustar00rootroot00000000000000 SUBROUTINE DSYR2F ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA ) * .. Scalar Arguments .. DOUBLE PRECISION ALPHA INTEGER INCX, INCY, LDA, N CHARACTER*1 UPLO * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * DSYR2 performs the symmetric rank 2 operation * * A := alpha*x*y' + alpha*y*x' + A, * * where alpha is a scalar, x and y are n element vectors and A is an n * by n symmetric matrix. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array A is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of A * is to be referenced. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION. * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. 
* * X - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. * Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular part of the symmetric matrix and the strictly * lower triangular part of A is not referenced. On exit, the * upper triangular part of the array A is overwritten by the * upper triangular part of the updated matrix. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular part of the symmetric matrix and the strictly * upper triangular part of A is not referenced. On exit, the * lower triangular part of the array A is overwritten by the * lower triangular part of the updated matrix. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. Local Scalars .. DOUBLE PRECISION TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( INCY.EQ.0 )THEN INFO = 7 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DSYR2 ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Set up the start points in X and Y if the increments are not both * unity. * IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF JX = KX JY = KY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * IF( LSAME( UPLO, 'U' ) )THEN * * Form A when A is stored in the upper triangle. 
* IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 20, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( J ) TEMP2 = ALPHA*X( J ) DO 10, I = 1, J A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 10 CONTINUE END IF 20 CONTINUE ELSE DO 40, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( JY ) TEMP2 = ALPHA*X( JX ) IX = KX IY = KY DO 30, I = 1, J A( I, J ) = A( I, J ) + X( IX )*TEMP1 $ + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 30 CONTINUE END IF JX = JX + INCX JY = JY + INCY 40 CONTINUE END IF ELSE * * Form A when A is stored in the lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( J ) TEMP2 = ALPHA*X( J ) DO 50, I = J, N A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 50 CONTINUE END IF 60 CONTINUE ELSE DO 80, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( JY ) TEMP2 = ALPHA*X( JX ) IX = JX IY = JY DO 70, I = J, N A( I, J ) = A( I, J ) + X( IX )*TEMP1 $ + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 70 CONTINUE END IF JX = JX + INCX JY = JY + INCY 80 CONTINUE END IF END IF * RETURN * * End of DSYR2 . * END OpenBLAS-0.2.20/reference/dsyr2kf.f000066400000000000000000000254051313527062700166430ustar00rootroot00000000000000 SUBROUTINE DSYR2KF( UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC ) * .. Scalar Arguments .. CHARACTER*1 UPLO, TRANS INTEGER N, K, LDA, LDB, LDC DOUBLE PRECISION ALPHA, BETA * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), B( LDB, * ), C( LDC, * ) * .. * * Purpose * ======= * * DSYR2K performs one of the symmetric rank 2k operations * * C := alpha*A*B' + alpha*B*A' + beta*C, * * or * * C := alpha*A'*B + alpha*B'*A + beta*C, * * where alpha and beta are scalars, C is an n by n symmetric matrix * and A and B are n by k matrices in the first case and k by n * matrices in the second case. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array C is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of C * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of C * is to be referenced. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' C := alpha*A*B' + alpha*B*A' + * beta*C. * * TRANS = 'T' or 't' C := alpha*A'*B + alpha*B'*A + * beta*C. * * TRANS = 'C' or 'c' C := alpha*A'*B + alpha*B'*A + * beta*C. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix C. N must be * at least zero. * Unchanged on exit. * * K - INTEGER. * On entry with TRANS = 'N' or 'n', K specifies the number * of columns of the matrices A and B, and on entry with * TRANS = 'T' or 't' or 'C' or 'c', K specifies the number * of rows of the matrices A and B. K must be at least zero. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION. * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is * k when TRANS = 'N' or 'n', and is n otherwise. * Before entry with TRANS = 'N' or 'n', the leading n by k * part of the array A must contain the matrix A, otherwise * the leading k by n part of the array A must contain the * matrix A. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. 
When TRANS = 'N' or 'n' * then LDA must be at least max( 1, n ), otherwise LDA must * be at least max( 1, k ). * Unchanged on exit. * * B - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is * k when TRANS = 'N' or 'n', and is n otherwise. * Before entry with TRANS = 'N' or 'n', the leading n by k * part of the array B must contain the matrix B, otherwise * the leading k by n part of the array B must contain the * matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. When TRANS = 'N' or 'n' * then LDB must be at least max( 1, n ), otherwise LDB must * be at least max( 1, k ). * Unchanged on exit. * * BETA - DOUBLE PRECISION. * On entry, BETA specifies the scalar beta. * Unchanged on exit. * * C - DOUBLE PRECISION array of DIMENSION ( LDC, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array C must contain the upper * triangular part of the symmetric matrix and the strictly * lower triangular part of C is not referenced. On exit, the * upper triangular part of the array C is overwritten by the * upper triangular part of the updated matrix. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array C must contain the lower * triangular part of the symmetric matrix and the strictly * upper triangular part of C is not referenced. On exit, the * lower triangular part of the array C is overwritten by the * lower triangular part of the updated matrix. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, n ). * Unchanged on exit. * * * Level 3 Blas routine. * * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. Local Scalars .. LOGICAL UPPER INTEGER I, INFO, J, L, NROWA DOUBLE PRECISION TEMP1, TEMP2 * .. Parameters .. DOUBLE PRECISION ONE , ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Executable Statements .. * * Test the input parameters. * IF( LSAME( TRANS, 'N' ) )THEN NROWA = N ELSE NROWA = K END IF UPPER = LSAME( UPLO, 'U' ) * INFO = 0 IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. $ ( .NOT.LSAME( TRANS, 'T' ) ).AND. $ ( .NOT.LSAME( TRANS, 'C' ) ) )THEN INFO = 2 ELSE IF( N .LT.0 )THEN INFO = 3 ELSE IF( K .LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 7 ELSE IF( LDB.LT.MAX( 1, NROWA ) )THEN INFO = 9 ELSE IF( LDC.LT.MAX( 1, N ) )THEN INFO = 12 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DSYR2K', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR. $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * And when alpha.eq.zero. 
* IF( ALPHA.EQ.ZERO )THEN IF( UPPER )THEN IF( BETA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, J C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40, J = 1, N DO 30, I = 1, J C( I, J ) = BETA*C( I, J ) 30 CONTINUE 40 CONTINUE END IF ELSE IF( BETA.EQ.ZERO )THEN DO 60, J = 1, N DO 50, I = J, N C( I, J ) = ZERO 50 CONTINUE 60 CONTINUE ELSE DO 80, J = 1, N DO 70, I = J, N C( I, J ) = BETA*C( I, J ) 70 CONTINUE 80 CONTINUE END IF END IF RETURN END IF * * Start the operations. * IF( LSAME( TRANS, 'N' ) )THEN * * Form C := alpha*A*B' + alpha*B*A' + C. * IF( UPPER )THEN DO 130, J = 1, N IF( BETA.EQ.ZERO )THEN DO 90, I = 1, J C( I, J ) = ZERO 90 CONTINUE ELSE IF( BETA.NE.ONE )THEN DO 100, I = 1, J C( I, J ) = BETA*C( I, J ) 100 CONTINUE END IF DO 120, L = 1, K IF( ( A( J, L ).NE.ZERO ).OR. $ ( B( J, L ).NE.ZERO ) )THEN TEMP1 = ALPHA*B( J, L ) TEMP2 = ALPHA*A( J, L ) DO 110, I = 1, J C( I, J ) = C( I, J ) + $ A( I, L )*TEMP1 + B( I, L )*TEMP2 110 CONTINUE END IF 120 CONTINUE 130 CONTINUE ELSE DO 180, J = 1, N IF( BETA.EQ.ZERO )THEN DO 140, I = J, N C( I, J ) = ZERO 140 CONTINUE ELSE IF( BETA.NE.ONE )THEN DO 150, I = J, N C( I, J ) = BETA*C( I, J ) 150 CONTINUE END IF DO 170, L = 1, K IF( ( A( J, L ).NE.ZERO ).OR. $ ( B( J, L ).NE.ZERO ) )THEN TEMP1 = ALPHA*B( J, L ) TEMP2 = ALPHA*A( J, L ) DO 160, I = J, N C( I, J ) = C( I, J ) + $ A( I, L )*TEMP1 + B( I, L )*TEMP2 160 CONTINUE END IF 170 CONTINUE 180 CONTINUE END IF ELSE * * Form C := alpha*A'*B + alpha*B'*A + C. * IF( UPPER )THEN DO 210, J = 1, N DO 200, I = 1, J TEMP1 = ZERO TEMP2 = ZERO DO 190, L = 1, K TEMP1 = TEMP1 + A( L, I )*B( L, J ) TEMP2 = TEMP2 + B( L, I )*A( L, J ) 190 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ ALPHA*TEMP1 + ALPHA*TEMP2 END IF 200 CONTINUE 210 CONTINUE ELSE DO 240, J = 1, N DO 230, I = J, N TEMP1 = ZERO TEMP2 = ZERO DO 220, L = 1, K TEMP1 = TEMP1 + A( L, I )*B( L, J ) TEMP2 = TEMP2 + B( L, I )*A( L, J ) 220 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ ALPHA*TEMP1 + ALPHA*TEMP2 END IF 230 CONTINUE 240 CONTINUE END IF END IF * RETURN * * End of DSYR2K. * END OpenBLAS-0.2.20/reference/dsyrf.f000066400000000000000000000135151313527062700164050ustar00rootroot00000000000000 SUBROUTINE DSYRF ( UPLO, N, ALPHA, X, INCX, A, LDA ) * .. Scalar Arguments .. DOUBLE PRECISION ALPHA INTEGER INCX, LDA, N CHARACTER*1 UPLO * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), X( * ) * .. * * Purpose * ======= * * DSYR performs the symmetric rank 1 operation * * A := alpha*x*x' + A, * * where alpha is a real scalar, x is an n element vector and A is an * n by n symmetric matrix. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array A is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of A * is to be referenced. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION. * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. 
INCX must not be zero. * Unchanged on exit. * * A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular part of the symmetric matrix and the strictly * lower triangular part of A is not referenced. On exit, the * upper triangular part of the array A is overwritten by the * upper triangular part of the updated matrix. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular part of the symmetric matrix and the strictly * upper triangular part of A is not referenced. On exit, the * lower triangular part of the array A is overwritten by the * lower triangular part of the updated matrix. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. Local Scalars .. DOUBLE PRECISION TEMP INTEGER I, INFO, IX, J, JX, KX * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 7 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DSYR ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Set the start point in X if the increment is not unity. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * IF( LSAME( UPLO, 'U' ) )THEN * * Form A when A is stored in upper triangle. * IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = ALPHA*X( J ) DO 10, I = 1, J A( I, J ) = A( I, J ) + X( I )*TEMP 10 CONTINUE END IF 20 CONTINUE ELSE JX = KX DO 40, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*X( JX ) IX = KX DO 30, I = 1, J A( I, J ) = A( I, J ) + X( IX )*TEMP IX = IX + INCX 30 CONTINUE END IF JX = JX + INCX 40 CONTINUE END IF ELSE * * Form A when A is stored in lower triangle. * IF( INCX.EQ.1 )THEN DO 60, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = ALPHA*X( J ) DO 50, I = J, N A( I, J ) = A( I, J ) + X( I )*TEMP 50 CONTINUE END IF 60 CONTINUE ELSE JX = KX DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*X( JX ) IX = JX DO 70, I = J, N A( I, J ) = A( I, J ) + X( IX )*TEMP IX = IX + INCX 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE END IF END IF * RETURN * * End of DSYR . * END OpenBLAS-0.2.20/reference/dsyrkf.f000066400000000000000000000222061313527062700165550ustar00rootroot00000000000000 SUBROUTINE DSYRKF ( UPLO, TRANS, N, K, ALPHA, A, LDA, $ BETA, C, LDC ) * .. Scalar Arguments .. CHARACTER*1 UPLO, TRANS INTEGER N, K, LDA, LDC DOUBLE PRECISION ALPHA, BETA * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), C( LDC, * ) * .. 
* * Purpose * ======= * * DSYRK performs one of the symmetric rank k operations * * C := alpha*A*A' + beta*C, * * or * * C := alpha*A'*A + beta*C, * * where alpha and beta are scalars, C is an n by n symmetric matrix * and A is an n by k matrix in the first case and a k by n matrix * in the second case. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array C is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of C * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of C * is to be referenced. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' C := alpha*A*A' + beta*C. * * TRANS = 'T' or 't' C := alpha*A'*A + beta*C. * * TRANS = 'C' or 'c' C := alpha*A'*A + beta*C. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix C. N must be * at least zero. * Unchanged on exit. * * K - INTEGER. * On entry with TRANS = 'N' or 'n', K specifies the number * of columns of the matrix A, and on entry with * TRANS = 'T' or 't' or 'C' or 'c', K specifies the number * of rows of the matrix A. K must be at least zero. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION. * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is * k when TRANS = 'N' or 'n', and is n otherwise. * Before entry with TRANS = 'N' or 'n', the leading n by k * part of the array A must contain the matrix A, otherwise * the leading k by n part of the array A must contain the * matrix A. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When TRANS = 'N' or 'n' * then LDA must be at least max( 1, n ), otherwise LDA must * be at least max( 1, k ). * Unchanged on exit. * * BETA - DOUBLE PRECISION. * On entry, BETA specifies the scalar beta. * Unchanged on exit. * * C - DOUBLE PRECISION array of DIMENSION ( LDC, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array C must contain the upper * triangular part of the symmetric matrix and the strictly * lower triangular part of C is not referenced. On exit, the * upper triangular part of the array C is overwritten by the * upper triangular part of the updated matrix. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array C must contain the lower * triangular part of the symmetric matrix and the strictly * upper triangular part of C is not referenced. On exit, the * lower triangular part of the array C is overwritten by the * lower triangular part of the updated matrix. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, n ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. Local Scalars .. LOGICAL UPPER INTEGER I, INFO, J, L, NROWA DOUBLE PRECISION TEMP * .. Parameters .. DOUBLE PRECISION ONE , ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. 
* .. Executable Statements .. * * Test the input parameters. * IF( LSAME( TRANS, 'N' ) )THEN NROWA = N ELSE NROWA = K END IF UPPER = LSAME( UPLO, 'U' ) * INFO = 0 IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. $ ( .NOT.LSAME( TRANS, 'T' ) ).AND. $ ( .NOT.LSAME( TRANS, 'C' ) ) )THEN INFO = 2 ELSE IF( N .LT.0 )THEN INFO = 3 ELSE IF( K .LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 7 ELSE IF( LDC.LT.MAX( 1, N ) )THEN INFO = 10 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DSYRK ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR. $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN IF( UPPER )THEN IF( BETA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, J C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40, J = 1, N DO 30, I = 1, J C( I, J ) = BETA*C( I, J ) 30 CONTINUE 40 CONTINUE END IF ELSE IF( BETA.EQ.ZERO )THEN DO 60, J = 1, N DO 50, I = J, N C( I, J ) = ZERO 50 CONTINUE 60 CONTINUE ELSE DO 80, J = 1, N DO 70, I = J, N C( I, J ) = BETA*C( I, J ) 70 CONTINUE 80 CONTINUE END IF END IF RETURN END IF * * Start the operations. * IF( LSAME( TRANS, 'N' ) )THEN * * Form C := alpha*A*A' + beta*C. * IF( UPPER )THEN DO 130, J = 1, N IF( BETA.EQ.ZERO )THEN DO 90, I = 1, J C( I, J ) = ZERO 90 CONTINUE ELSE IF( BETA.NE.ONE )THEN DO 100, I = 1, J C( I, J ) = BETA*C( I, J ) 100 CONTINUE END IF DO 120, L = 1, K IF( A( J, L ).NE.ZERO )THEN TEMP = ALPHA*A( J, L ) DO 110, I = 1, J C( I, J ) = C( I, J ) + TEMP*A( I, L ) 110 CONTINUE END IF 120 CONTINUE 130 CONTINUE ELSE DO 180, J = 1, N IF( BETA.EQ.ZERO )THEN DO 140, I = J, N C( I, J ) = ZERO 140 CONTINUE ELSE IF( BETA.NE.ONE )THEN DO 150, I = J, N C( I, J ) = BETA*C( I, J ) 150 CONTINUE END IF DO 170, L = 1, K IF( A( J, L ).NE.ZERO )THEN TEMP = ALPHA*A( J, L ) DO 160, I = J, N C( I, J ) = C( I, J ) + TEMP*A( I, L ) 160 CONTINUE END IF 170 CONTINUE 180 CONTINUE END IF ELSE * * Form C := alpha*A'*A + beta*C. * IF( UPPER )THEN DO 210, J = 1, N DO 200, I = 1, J TEMP = ZERO DO 190, L = 1, K TEMP = TEMP + A( L, I )*A( L, J ) 190 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = ALPHA*TEMP ELSE C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) END IF 200 CONTINUE 210 CONTINUE ELSE DO 240, J = 1, N DO 230, I = J, N TEMP = ZERO DO 220, L = 1, K TEMP = TEMP + A( L, I )*A( L, J ) 220 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = ALPHA*TEMP ELSE C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) END IF 230 CONTINUE 240 CONTINUE END IF END IF * RETURN * * End of DSYRK . * END OpenBLAS-0.2.20/reference/dtbmvf.f000066400000000000000000000261331313527062700165400ustar00rootroot00000000000000 SUBROUTINE DTBMVF( UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, K, LDA, N CHARACTER*1 DIAG, TRANS, UPLO * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), X( * ) * .. * * Purpose * ======= * * DTBMV performs one of the matrix-vector operations * * x := A*x, or x := A'*x, * * where x is an n element vector and A is an n by n unit, or non-unit, * upper or lower triangular band matrix, with ( k + 1 ) diagonals. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. 
* On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' x := A*x. * * TRANS = 'T' or 't' x := A'*x. * * TRANS = 'C' or 'c' x := A'*x. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * K - INTEGER. * On entry with UPLO = 'U' or 'u', K specifies the number of * super-diagonals of the matrix A. * On entry with UPLO = 'L' or 'l', K specifies the number of * sub-diagonals of the matrix A. * K must satisfy 0 .le. K. * Unchanged on exit. * * A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) * by n part of the array A must contain the upper triangular * band part of the matrix of coefficients, supplied column by * column, with the leading diagonal of the matrix in row * ( k + 1 ) of the array, the first super-diagonal starting at * position 2 in row k, and so on. The top left k by k triangle * of the array A is not referenced. * The following program segment will transfer an upper * triangular band matrix from conventional full matrix storage * to band storage: * * DO 20, J = 1, N * M = K + 1 - J * DO 10, I = MAX( 1, J - K ), J * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) * by n part of the array A must contain the lower triangular * band part of the matrix of coefficients, supplied column by * column, with the leading diagonal of the matrix in row 1 of * the array, the first sub-diagonal starting at position 1 in * row 2, and so on. The bottom right k by k triangle of the * array A is not referenced. * The following program segment will transfer a lower * triangular band matrix from conventional full matrix storage * to band storage: * * DO 20, J = 1, N * M = 1 - J * DO 10, I = J, MIN( N, J + K ) * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Note that when DIAG = 'U' or 'u' the elements of the array A * corresponding to the diagonal elements of the matrix are not * referenced, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * ( k + 1 ). * Unchanged on exit. * * X - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the * tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. Local Scalars .. DOUBLE PRECISION TEMP INTEGER I, INFO, IX, J, JX, KPLUS1, KX, L LOGICAL NOUNIT * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. 
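*     (INFO is set to the position of the first illegal argument, if
*     any, XERBLA is called to report it, and the routine returns
*     without touching X.)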
* INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( K.LT.0 )THEN INFO = 5 ELSE IF( LDA.LT.( K + 1 ) )THEN INFO = 7 ELSE IF( INCX.EQ.0 )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DTBMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * NOUNIT = LSAME( DIAG, 'N' ) * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * IF( LSAME( TRANS, 'N' ) )THEN * * Form x := A*x. * IF( LSAME( UPLO, 'U' ) )THEN KPLUS1 = K + 1 IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = X( J ) L = KPLUS1 - J DO 10, I = MAX( 1, J - K ), J - 1 X( I ) = X( I ) + TEMP*A( L + I, J ) 10 CONTINUE IF( NOUNIT ) $ X( J ) = X( J )*A( KPLUS1, J ) END IF 20 CONTINUE ELSE JX = KX DO 40, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX L = KPLUS1 - J DO 30, I = MAX( 1, J - K ), J - 1 X( IX ) = X( IX ) + TEMP*A( L + I, J ) IX = IX + INCX 30 CONTINUE IF( NOUNIT ) $ X( JX ) = X( JX )*A( KPLUS1, J ) END IF JX = JX + INCX IF( J.GT.K ) $ KX = KX + INCX 40 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 60, J = N, 1, -1 IF( X( J ).NE.ZERO )THEN TEMP = X( J ) L = 1 - J DO 50, I = MIN( N, J + K ), J + 1, -1 X( I ) = X( I ) + TEMP*A( L + I, J ) 50 CONTINUE IF( NOUNIT ) $ X( J ) = X( J )*A( 1, J ) END IF 60 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 80, J = N, 1, -1 IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX L = 1 - J DO 70, I = MIN( N, J + K ), J + 1, -1 X( IX ) = X( IX ) + TEMP*A( L + I, J ) IX = IX - INCX 70 CONTINUE IF( NOUNIT ) $ X( JX ) = X( JX )*A( 1, J ) END IF JX = JX - INCX IF( ( N - J ).GE.K ) $ KX = KX - INCX 80 CONTINUE END IF END IF ELSE * * Form x := A'*x. * IF( LSAME( UPLO, 'U' ) )THEN KPLUS1 = K + 1 IF( INCX.EQ.1 )THEN DO 100, J = N, 1, -1 TEMP = X( J ) L = KPLUS1 - J IF( NOUNIT ) $ TEMP = TEMP*A( KPLUS1, J ) DO 90, I = J - 1, MAX( 1, J - K ), -1 TEMP = TEMP + A( L + I, J )*X( I ) 90 CONTINUE X( J ) = TEMP 100 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 120, J = N, 1, -1 TEMP = X( JX ) KX = KX - INCX IX = KX L = KPLUS1 - J IF( NOUNIT ) $ TEMP = TEMP*A( KPLUS1, J ) DO 110, I = J - 1, MAX( 1, J - K ), -1 TEMP = TEMP + A( L + I, J )*X( IX ) IX = IX - INCX 110 CONTINUE X( JX ) = TEMP JX = JX - INCX 120 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 140, J = 1, N TEMP = X( J ) L = 1 - J IF( NOUNIT ) $ TEMP = TEMP*A( 1, J ) DO 130, I = J + 1, MIN( N, J + K ) TEMP = TEMP + A( L + I, J )*X( I ) 130 CONTINUE X( J ) = TEMP 140 CONTINUE ELSE JX = KX DO 160, J = 1, N TEMP = X( JX ) KX = KX + INCX IX = KX L = 1 - J IF( NOUNIT ) $ TEMP = TEMP*A( 1, J ) DO 150, I = J + 1, MIN( N, J + K ) TEMP = TEMP + A( L + I, J )*X( IX ) IX = IX + INCX 150 CONTINUE X( JX ) = TEMP JX = JX + INCX 160 CONTINUE END IF END IF END IF * RETURN * * End of DTBMV . * END OpenBLAS-0.2.20/reference/dtbsvf.f000066400000000000000000000256321313527062700165510ustar00rootroot00000000000000 SUBROUTINE DTBSVF(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) * .. Scalar Arguments .. INTEGER INCX,K,LDA,N CHARACTER DIAG,TRANS,UPLO * .. * .. Array Arguments .. 
DOUBLE PRECISION A(LDA,*),X(*) * .. * * Purpose * ======= * * DTBSV solves one of the systems of equations * * A*x = b, or A'*x = b, * * where b and x are n element vectors and A is an n by n unit, or * non-unit, upper or lower triangular band matrix, with ( k + 1 ) * diagonals. * * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Arguments * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the equations to be solved as * follows: * * TRANS = 'N' or 'n' A*x = b. * * TRANS = 'T' or 't' A'*x = b. * * TRANS = 'C' or 'c' A'*x = b. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * K - INTEGER. * On entry with UPLO = 'U' or 'u', K specifies the number of * super-diagonals of the matrix A. * On entry with UPLO = 'L' or 'l', K specifies the number of * sub-diagonals of the matrix A. * K must satisfy 0 .le. K. * Unchanged on exit. * * A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) * by n part of the array A must contain the upper triangular * band part of the matrix of coefficients, supplied column by * column, with the leading diagonal of the matrix in row * ( k + 1 ) of the array, the first super-diagonal starting at * position 2 in row k, and so on. The top left k by k triangle * of the array A is not referenced. * The following program segment will transfer an upper * triangular band matrix from conventional full matrix storage * to band storage: * * DO 20, J = 1, N * M = K + 1 - J * DO 10, I = MAX( 1, J - K ), J * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) * by n part of the array A must contain the lower triangular * band part of the matrix of coefficients, supplied column by * column, with the leading diagonal of the matrix in row 1 of * the array, the first sub-diagonal starting at position 1 in * row 2, and so on. The bottom right k by k triangle of the * array A is not referenced. * The following program segment will transfer a lower * triangular band matrix from conventional full matrix storage * to band storage: * * DO 20, J = 1, N * M = 1 - J * DO 10, I = J, MIN( N, J + K ) * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Note that when DIAG = 'U' or 'u' the elements of the array A * corresponding to the diagonal elements of the matrix are not * referenced, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * ( k + 1 ). * Unchanged on exit. * * X - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element right-hand side vector b. On exit, X is overwritten * with the solution vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER (ZERO=0.0D+0) * .. * .. Local Scalars .. DOUBLE PRECISION TEMP INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L LOGICAL NOUNIT * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX,MIN * .. * * Test the input parameters. * INFO = 0 IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN INFO = 1 ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + .NOT.LSAME(TRANS,'C')) THEN INFO = 2 ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN INFO = 3 ELSE IF (N.LT.0) THEN INFO = 4 ELSE IF (K.LT.0) THEN INFO = 5 ELSE IF (LDA.LT. (K+1)) THEN INFO = 7 ELSE IF (INCX.EQ.0) THEN INFO = 9 END IF IF (INFO.NE.0) THEN CALL XERBLA('DTBSV ',INFO) RETURN END IF * * Quick return if possible. * IF (N.EQ.0) RETURN * NOUNIT = LSAME(DIAG,'N') * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF (INCX.LE.0) THEN KX = 1 - (N-1)*INCX ELSE IF (INCX.NE.1) THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed by sequentially with one pass through A. * IF (LSAME(TRANS,'N')) THEN * * Form x := inv( A )*x. * IF (LSAME(UPLO,'U')) THEN KPLUS1 = K + 1 IF (INCX.EQ.1) THEN DO 20 J = N,1,-1 IF (X(J).NE.ZERO) THEN L = KPLUS1 - J IF (NOUNIT) X(J) = X(J)/A(KPLUS1,J) TEMP = X(J) DO 10 I = J - 1,MAX(1,J-K),-1 X(I) = X(I) - TEMP*A(L+I,J) 10 CONTINUE END IF 20 CONTINUE ELSE KX = KX + (N-1)*INCX JX = KX DO 40 J = N,1,-1 KX = KX - INCX IF (X(JX).NE.ZERO) THEN IX = KX L = KPLUS1 - J IF (NOUNIT) X(JX) = X(JX)/A(KPLUS1,J) TEMP = X(JX) DO 30 I = J - 1,MAX(1,J-K),-1 X(IX) = X(IX) - TEMP*A(L+I,J) IX = IX - INCX 30 CONTINUE END IF JX = JX - INCX 40 CONTINUE END IF ELSE IF (INCX.EQ.1) THEN DO 60 J = 1,N IF (X(J).NE.ZERO) THEN L = 1 - J IF (NOUNIT) X(J) = X(J)/A(1,J) TEMP = X(J) DO 50 I = J + 1,MIN(N,J+K) X(I) = X(I) - TEMP*A(L+I,J) 50 CONTINUE END IF 60 CONTINUE ELSE JX = KX DO 80 J = 1,N KX = KX + INCX IF (X(JX).NE.ZERO) THEN IX = KX L = 1 - J IF (NOUNIT) X(JX) = X(JX)/A(1,J) TEMP = X(JX) DO 70 I = J + 1,MIN(N,J+K) X(IX) = X(IX) - TEMP*A(L+I,J) IX = IX + INCX 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE END IF END IF ELSE * * Form x := inv( A')*x. 
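*
*        (With A upper triangular, A' is lower triangular, so the
*        transposed system is solved by forward substitution,
*        J = 1, ..., N; the lower triangular case below is the mirror
*        image and runs J = N, ..., 1.)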
* IF (LSAME(UPLO,'U')) THEN KPLUS1 = K + 1 IF (INCX.EQ.1) THEN DO 100 J = 1,N TEMP = X(J) L = KPLUS1 - J DO 90 I = MAX(1,J-K),J - 1 TEMP = TEMP - A(L+I,J)*X(I) 90 CONTINUE IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) X(J) = TEMP 100 CONTINUE ELSE JX = KX DO 120 J = 1,N TEMP = X(JX) IX = KX L = KPLUS1 - J DO 110 I = MAX(1,J-K),J - 1 TEMP = TEMP - A(L+I,J)*X(IX) IX = IX + INCX 110 CONTINUE IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) X(JX) = TEMP JX = JX + INCX IF (J.GT.K) KX = KX + INCX 120 CONTINUE END IF ELSE IF (INCX.EQ.1) THEN DO 140 J = N,1,-1 TEMP = X(J) L = 1 - J DO 130 I = MIN(N,J+K),J + 1,-1 TEMP = TEMP - A(L+I,J)*X(I) 130 CONTINUE IF (NOUNIT) TEMP = TEMP/A(1,J) X(J) = TEMP 140 CONTINUE ELSE KX = KX + (N-1)*INCX JX = KX DO 160 J = N,1,-1 TEMP = X(JX) IX = KX L = 1 - J DO 150 I = MIN(N,J+K),J + 1,-1 TEMP = TEMP - A(L+I,J)*X(IX) IX = IX - INCX 150 CONTINUE IF (NOUNIT) TEMP = TEMP/A(1,J) X(JX) = TEMP JX = JX - INCX IF ((N-J).GE.K) KX = KX - INCX 160 CONTINUE END IF END IF END IF * RETURN * * End of DTBSV . * END OpenBLAS-0.2.20/reference/dtpmvf.f000066400000000000000000000223441313527062700165560ustar00rootroot00000000000000 SUBROUTINE DTPMVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, N CHARACTER*1 DIAG, TRANS, UPLO * .. Array Arguments .. DOUBLE PRECISION AP( * ), X( * ) * .. * * Purpose * ======= * * DTPMV performs one of the matrix-vector operations * * x := A*x, or x := A'*x, * * where x is an n element vector and A is an n by n unit, or non-unit, * upper or lower triangular matrix, supplied in packed form. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' x := A*x. * * TRANS = 'T' or 't' x := A'*x. * * TRANS = 'C' or 'c' x := A'*x. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * AP - DOUBLE PRECISION array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). * Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular matrix packed sequentially, * column by column, so that AP( 1 ) contains a( 1, 1 ), * AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) * respectively, and so on. * Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular matrix packed sequentially, * column by column, so that AP( 1 ) contains a( 1, 1 ), * AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) * respectively, and so on. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced, but are assumed to be unity. * Unchanged on exit. * * X - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the * tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. 
* * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. Local Scalars .. DOUBLE PRECISION TEMP INTEGER I, INFO, IX, J, JX, K, KK, KX LOGICAL NOUNIT * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( INCX.EQ.0 )THEN INFO = 7 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DTPMVF', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * NOUNIT = LSAME( DIAG, 'N' ) * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of AP are * accessed sequentially with one pass through AP. * IF( LSAME( TRANS, 'N' ) )THEN * * Form x:= A*x. * IF( LSAME( UPLO, 'U' ) )THEN KK =1 IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = X( J ) K = KK DO 10, I = 1, J - 1 X( I ) = X( I ) + TEMP*AP( K ) K = K + 1 10 CONTINUE IF( NOUNIT ) $ X( J ) = X( J )*AP( KK + J - 1 ) END IF KK = KK + J 20 CONTINUE ELSE JX = KX DO 40, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX DO 30, K = KK, KK + J - 2 X( IX ) = X( IX ) + TEMP*AP( K ) IX = IX + INCX 30 CONTINUE IF( NOUNIT ) $ X( JX ) = X( JX )*AP( KK + J - 1 ) END IF JX = JX + INCX KK = KK + J 40 CONTINUE END IF ELSE KK = ( N*( N + 1 ) )/2 IF( INCX.EQ.1 )THEN DO 60, J = N, 1, -1 IF( X( J ).NE.ZERO )THEN TEMP = X( J ) K = KK DO 50, I = N, J + 1, -1 X( I ) = X( I ) + TEMP*AP( K ) K = K - 1 50 CONTINUE IF( NOUNIT ) $ X( J ) = X( J )*AP( KK - N + J ) END IF KK = KK - ( N - J + 1 ) 60 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 80, J = N, 1, -1 IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX DO 70, K = KK, KK - ( N - ( J + 1 ) ), -1 X( IX ) = X( IX ) + TEMP*AP( K ) IX = IX - INCX 70 CONTINUE IF( NOUNIT ) $ X( JX ) = X( JX )*AP( KK - N + J ) END IF JX = JX - INCX KK = KK - ( N - J + 1 ) 80 CONTINUE END IF END IF ELSE * * Form x := A'*x. 
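*
*        (For the transposed product the columns are visited in
*        reverse order in the upper triangular case, so the original
*        values of X( 1 ), ..., X( J-1 ) are still available when
*        X( J ) is formed; AP is traversed in a single sequential
*        pass, as noted above.)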
* IF( LSAME( UPLO, 'U' ) )THEN KK = ( N*( N + 1 ) )/2 IF( INCX.EQ.1 )THEN DO 100, J = N, 1, -1 TEMP = X( J ) IF( NOUNIT ) $ TEMP = TEMP*AP( KK ) K = KK - 1 DO 90, I = J - 1, 1, -1 TEMP = TEMP + AP( K )*X( I ) K = K - 1 90 CONTINUE X( J ) = TEMP KK = KK - J 100 CONTINUE ELSE JX = KX + ( N - 1 )*INCX DO 120, J = N, 1, -1 TEMP = X( JX ) IX = JX IF( NOUNIT ) $ TEMP = TEMP*AP( KK ) DO 110, K = KK - 1, KK - J + 1, -1 IX = IX - INCX TEMP = TEMP + AP( K )*X( IX ) 110 CONTINUE X( JX ) = TEMP JX = JX - INCX KK = KK - J 120 CONTINUE END IF ELSE KK = 1 IF( INCX.EQ.1 )THEN DO 140, J = 1, N TEMP = X( J ) IF( NOUNIT ) $ TEMP = TEMP*AP( KK ) K = KK + 1 DO 130, I = J + 1, N TEMP = TEMP + AP( K )*X( I ) K = K + 1 130 CONTINUE X( J ) = TEMP KK = KK + ( N - J + 1 ) 140 CONTINUE ELSE JX = KX DO 160, J = 1, N TEMP = X( JX ) IX = JX IF( NOUNIT ) $ TEMP = TEMP*AP( KK ) DO 150, K = KK + 1, KK + N - J IX = IX + INCX TEMP = TEMP + AP( K )*X( IX ) 150 CONTINUE X( JX ) = TEMP JX = JX + INCX KK = KK + ( N - J + 1 ) 160 CONTINUE END IF END IF END IF * RETURN * * End of DTPMV . * END OpenBLAS-0.2.20/reference/dtpsvf.f000066400000000000000000000226151313527062700165650ustar00rootroot00000000000000 SUBROUTINE DTPSVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, N CHARACTER*1 DIAG, TRANS, UPLO * .. Array Arguments .. DOUBLE PRECISION AP( * ), X( * ) * .. * * Purpose * ======= * * DTPSV solves one of the systems of equations * * A*x = b, or A'*x = b, * * where b and x are n element vectors and A is an n by n unit, or * non-unit, upper or lower triangular matrix, supplied in packed form. * * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the equations to be solved as * follows: * * TRANS = 'N' or 'n' A*x = b. * * TRANS = 'T' or 't' A'*x = b. * * TRANS = 'C' or 'c' A'*x = b. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * AP - DOUBLE PRECISION array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). * Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular matrix packed sequentially, * column by column, so that AP( 1 ) contains a( 1, 1 ), * AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) * respectively, and so on. * Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular matrix packed sequentially, * column by column, so that AP( 1 ) contains a( 1, 1 ), * AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) * respectively, and so on. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced, but are assumed to be unity. * Unchanged on exit. * * X - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element right-hand side vector b. 
On exit, X is overwritten * with the solution vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. Local Scalars .. DOUBLE PRECISION TEMP INTEGER I, INFO, IX, J, JX, K, KK, KX LOGICAL NOUNIT * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( INCX.EQ.0 )THEN INFO = 7 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DTPSV ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * NOUNIT = LSAME( DIAG, 'N' ) * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of AP are * accessed sequentially with one pass through AP. * IF( LSAME( TRANS, 'N' ) )THEN * * Form x := inv( A )*x. * IF( LSAME( UPLO, 'U' ) )THEN KK = ( N*( N + 1 ) )/2 IF( INCX.EQ.1 )THEN DO 20, J = N, 1, -1 IF( X( J ).NE.ZERO )THEN IF( NOUNIT ) $ X( J ) = X( J )/AP( KK ) TEMP = X( J ) K = KK - 1 DO 10, I = J - 1, 1, -1 X( I ) = X( I ) - TEMP*AP( K ) K = K - 1 10 CONTINUE END IF KK = KK - J 20 CONTINUE ELSE JX = KX + ( N - 1 )*INCX DO 40, J = N, 1, -1 IF( X( JX ).NE.ZERO )THEN IF( NOUNIT ) $ X( JX ) = X( JX )/AP( KK ) TEMP = X( JX ) IX = JX DO 30, K = KK - 1, KK - J + 1, -1 IX = IX - INCX X( IX ) = X( IX ) - TEMP*AP( K ) 30 CONTINUE END IF JX = JX - INCX KK = KK - J 40 CONTINUE END IF ELSE KK = 1 IF( INCX.EQ.1 )THEN DO 60, J = 1, N IF( X( J ).NE.ZERO )THEN IF( NOUNIT ) $ X( J ) = X( J )/AP( KK ) TEMP = X( J ) K = KK + 1 DO 50, I = J + 1, N X( I ) = X( I ) - TEMP*AP( K ) K = K + 1 50 CONTINUE END IF KK = KK + ( N - J + 1 ) 60 CONTINUE ELSE JX = KX DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN IF( NOUNIT ) $ X( JX ) = X( JX )/AP( KK ) TEMP = X( JX ) IX = JX DO 70, K = KK + 1, KK + N - J IX = IX + INCX X( IX ) = X( IX ) - TEMP*AP( K ) 70 CONTINUE END IF JX = JX + INCX KK = KK + ( N - J + 1 ) 80 CONTINUE END IF END IF ELSE * * Form x := inv( A' )*x. 
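*
*        (As in DTBSV, the transposed system with an upper triangular
*        A is solved by forward substitution, J = 1, ..., N, and with
*        a lower triangular A by backward substitution,
*        J = N, ..., 1, again with one sequential pass through the
*        packed array AP.)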
* IF( LSAME( UPLO, 'U' ) )THEN KK = 1 IF( INCX.EQ.1 )THEN DO 100, J = 1, N TEMP = X( J ) K = KK DO 90, I = 1, J - 1 TEMP = TEMP - AP( K )*X( I ) K = K + 1 90 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/AP( KK + J - 1 ) X( J ) = TEMP KK = KK + J 100 CONTINUE ELSE JX = KX DO 120, J = 1, N TEMP = X( JX ) IX = KX DO 110, K = KK, KK + J - 2 TEMP = TEMP - AP( K )*X( IX ) IX = IX + INCX 110 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/AP( KK + J - 1 ) X( JX ) = TEMP JX = JX + INCX KK = KK + J 120 CONTINUE END IF ELSE KK = ( N*( N + 1 ) )/2 IF( INCX.EQ.1 )THEN DO 140, J = N, 1, -1 TEMP = X( J ) K = KK DO 130, I = N, J + 1, -1 TEMP = TEMP - AP( K )*X( I ) K = K - 1 130 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/AP( KK - N + J ) X( J ) = TEMP KK = KK - ( N - J + 1 ) 140 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 160, J = N, 1, -1 TEMP = X( JX ) IX = KX DO 150, K = KK, KK - ( N - ( J + 1 ) ), -1 TEMP = TEMP - AP( K )*X( IX ) IX = IX - INCX 150 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/AP( KK - N + J ) X( JX ) = TEMP JX = JX - INCX KK = KK - (N - J + 1 ) 160 CONTINUE END IF END IF END IF * RETURN * * End of DTPSV . * END OpenBLAS-0.2.20/reference/dtrmmf.f000066400000000000000000000263261313527062700165530ustar00rootroot00000000000000 SUBROUTINE DTRMMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, $ B, LDB ) * .. Scalar Arguments .. CHARACTER*1 SIDE, UPLO, TRANSA, DIAG INTEGER M, N, LDA, LDB DOUBLE PRECISION ALPHA * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * DTRMM performs one of the matrix-matrix operations * * B := alpha*op( A )*B, or B := alpha*B*op( A ), * * where alpha is a scalar, B is an m by n matrix, A is a unit, or * non-unit, upper or lower triangular matrix and op( A ) is one of * * op( A ) = A or op( A ) = A'. * * Parameters * ========== * * SIDE - CHARACTER*1. * On entry, SIDE specifies whether op( A ) multiplies B from * the left or right as follows: * * SIDE = 'L' or 'l' B := alpha*op( A )*B. * * SIDE = 'R' or 'r' B := alpha*B*op( A ). * * Unchanged on exit. * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix A is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANSA - CHARACTER*1. * On entry, TRANSA specifies the form of op( A ) to be used in * the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( A ) = A. * * TRANSA = 'T' or 't' op( A ) = A'. * * TRANSA = 'C' or 'c' op( A ) = A'. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit triangular * as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of B. M must be at * least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of B. N must be * at least zero. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION. * On entry, ALPHA specifies the scalar alpha. When alpha is * zero then A is not referenced and B need not be set before * entry. * Unchanged on exit. * * A - DOUBLE PRECISION array of DIMENSION ( LDA, k ), where k is m * when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. * Before entry with UPLO = 'U' or 'u', the leading k by k * upper triangular part of the array A must contain the upper * triangular matrix and the strictly lower triangular part of * A is not referenced. 
* Before entry with UPLO = 'L' or 'l', the leading k by k * lower triangular part of the array A must contain the lower * triangular matrix and the strictly upper triangular part of * A is not referenced. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced either, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When SIDE = 'L' or 'l' then * LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' * then LDA must be at least max( 1, n ). * Unchanged on exit. * * B - DOUBLE PRECISION array of DIMENSION ( LDB, n ). * Before entry, the leading m by n part of the array B must * contain the matrix B, and on exit is overwritten by the * transformed matrix. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. LDB must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. Local Scalars .. LOGICAL LSIDE, NOUNIT, UPPER INTEGER I, INFO, J, K, NROWA DOUBLE PRECISION TEMP * .. Parameters .. DOUBLE PRECISION ONE , ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Executable Statements .. * * Test the input parameters. * LSIDE = LSAME( SIDE , 'L' ) IF( LSIDE )THEN NROWA = M ELSE NROWA = N END IF NOUNIT = LSAME( DIAG , 'N' ) UPPER = LSAME( UPLO , 'U' ) * INFO = 0 IF( ( .NOT.LSIDE ).AND. $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN INFO = 2 ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN INFO = 3 ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN INFO = 4 ELSE IF( M .LT.0 )THEN INFO = 5 ELSE IF( N .LT.0 )THEN INFO = 6 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 9 ELSE IF( LDB.LT.MAX( 1, M ) )THEN INFO = 11 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DTRMM ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, M B( I, J ) = ZERO 10 CONTINUE 20 CONTINUE RETURN END IF * * Start the operations. * IF( LSIDE )THEN IF( LSAME( TRANSA, 'N' ) )THEN * * Form B := alpha*A*B. * IF( UPPER )THEN DO 50, J = 1, N DO 40, K = 1, M IF( B( K, J ).NE.ZERO )THEN TEMP = ALPHA*B( K, J ) DO 30, I = 1, K - 1 B( I, J ) = B( I, J ) + TEMP*A( I, K ) 30 CONTINUE IF( NOUNIT ) $ TEMP = TEMP*A( K, K ) B( K, J ) = TEMP END IF 40 CONTINUE 50 CONTINUE ELSE DO 80, J = 1, N DO 70 K = M, 1, -1 IF( B( K, J ).NE.ZERO )THEN TEMP = ALPHA*B( K, J ) B( K, J ) = TEMP IF( NOUNIT ) $ B( K, J ) = B( K, J )*A( K, K ) DO 60, I = K + 1, M B( I, J ) = B( I, J ) + TEMP*A( I, K ) 60 CONTINUE END IF 70 CONTINUE 80 CONTINUE END IF ELSE * * Form B := alpha*A'*B. 
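*
*           (In the upper triangular case the rows of B are updated
*           from the bottom up, so the rows above row I, which
*           contribute to A'*B, are still unmodified when B( I, J )
*           is overwritten; the lower triangular case runs top down
*           for the same reason.)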
* IF( UPPER )THEN DO 110, J = 1, N DO 100, I = M, 1, -1 TEMP = B( I, J ) IF( NOUNIT ) $ TEMP = TEMP*A( I, I ) DO 90, K = 1, I - 1 TEMP = TEMP + A( K, I )*B( K, J ) 90 CONTINUE B( I, J ) = ALPHA*TEMP 100 CONTINUE 110 CONTINUE ELSE DO 140, J = 1, N DO 130, I = 1, M TEMP = B( I, J ) IF( NOUNIT ) $ TEMP = TEMP*A( I, I ) DO 120, K = I + 1, M TEMP = TEMP + A( K, I )*B( K, J ) 120 CONTINUE B( I, J ) = ALPHA*TEMP 130 CONTINUE 140 CONTINUE END IF END IF ELSE IF( LSAME( TRANSA, 'N' ) )THEN * * Form B := alpha*B*A. * IF( UPPER )THEN DO 180, J = N, 1, -1 TEMP = ALPHA IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) DO 150, I = 1, M B( I, J ) = TEMP*B( I, J ) 150 CONTINUE DO 170, K = 1, J - 1 IF( A( K, J ).NE.ZERO )THEN TEMP = ALPHA*A( K, J ) DO 160, I = 1, M B( I, J ) = B( I, J ) + TEMP*B( I, K ) 160 CONTINUE END IF 170 CONTINUE 180 CONTINUE ELSE DO 220, J = 1, N TEMP = ALPHA IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) DO 190, I = 1, M B( I, J ) = TEMP*B( I, J ) 190 CONTINUE DO 210, K = J + 1, N IF( A( K, J ).NE.ZERO )THEN TEMP = ALPHA*A( K, J ) DO 200, I = 1, M B( I, J ) = B( I, J ) + TEMP*B( I, K ) 200 CONTINUE END IF 210 CONTINUE 220 CONTINUE END IF ELSE * * Form B := alpha*B*A'. * IF( UPPER )THEN DO 260, K = 1, N DO 240, J = 1, K - 1 IF( A( J, K ).NE.ZERO )THEN TEMP = ALPHA*A( J, K ) DO 230, I = 1, M B( I, J ) = B( I, J ) + TEMP*B( I, K ) 230 CONTINUE END IF 240 CONTINUE TEMP = ALPHA IF( NOUNIT ) $ TEMP = TEMP*A( K, K ) IF( TEMP.NE.ONE )THEN DO 250, I = 1, M B( I, K ) = TEMP*B( I, K ) 250 CONTINUE END IF 260 CONTINUE ELSE DO 300, K = N, 1, -1 DO 280, J = K + 1, N IF( A( J, K ).NE.ZERO )THEN TEMP = ALPHA*A( J, K ) DO 270, I = 1, M B( I, J ) = B( I, J ) + TEMP*B( I, K ) 270 CONTINUE END IF 280 CONTINUE TEMP = ALPHA IF( NOUNIT ) $ TEMP = TEMP*A( K, K ) IF( TEMP.NE.ONE )THEN DO 290, I = 1, M B( I, K ) = TEMP*B( I, K ) 290 CONTINUE END IF 300 CONTINUE END IF END IF END IF * RETURN * * End of DTRMM . * END OpenBLAS-0.2.20/reference/dtrmvf.f000066400000000000000000000212371313527062700165600ustar00rootroot00000000000000 SUBROUTINE DTRMVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, LDA, N CHARACTER*1 DIAG, TRANS, UPLO * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), X( * ) * .. * * Purpose * ======= * * DTRMV performs one of the matrix-vector operations * * x := A*x, or x := A'*x, * * where x is an n element vector and A is an n by n unit, or non-unit, * upper or lower triangular matrix. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' x := A*x. * * TRANS = 'T' or 't' x := A'*x. * * TRANS = 'C' or 'c' x := A'*x. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). 
* Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular matrix and the strictly lower triangular part of * A is not referenced. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular matrix and the strictly upper triangular part of * A is not referenced. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced either, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * X - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the * tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. Local Scalars .. DOUBLE PRECISION TEMP INTEGER I, INFO, IX, J, JX, KX LOGICAL NOUNIT * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 6 ELSE IF( INCX.EQ.0 )THEN INFO = 8 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DTRMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * NOUNIT = LSAME( DIAG, 'N' ) * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * IF( LSAME( TRANS, 'N' ) )THEN * * Form x := A*x. 
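*
*        (The product is formed column by column; when column J of
*        the upper triangular A is processed, X( J ) still holds its
*        original value, so it can be scattered into
*        X( 1 ), ..., X( J-1 ) before X( J ) itself is scaled by the
*        diagonal entry.  The lower triangular case visits the
*        columns in reverse order.)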
* IF( LSAME( UPLO, 'U' ) )THEN IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = X( J ) DO 10, I = 1, J - 1 X( I ) = X( I ) + TEMP*A( I, J ) 10 CONTINUE IF( NOUNIT ) $ X( J ) = X( J )*A( J, J ) END IF 20 CONTINUE ELSE JX = KX DO 40, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX DO 30, I = 1, J - 1 X( IX ) = X( IX ) + TEMP*A( I, J ) IX = IX + INCX 30 CONTINUE IF( NOUNIT ) $ X( JX ) = X( JX )*A( J, J ) END IF JX = JX + INCX 40 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 60, J = N, 1, -1 IF( X( J ).NE.ZERO )THEN TEMP = X( J ) DO 50, I = N, J + 1, -1 X( I ) = X( I ) + TEMP*A( I, J ) 50 CONTINUE IF( NOUNIT ) $ X( J ) = X( J )*A( J, J ) END IF 60 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 80, J = N, 1, -1 IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX DO 70, I = N, J + 1, -1 X( IX ) = X( IX ) + TEMP*A( I, J ) IX = IX - INCX 70 CONTINUE IF( NOUNIT ) $ X( JX ) = X( JX )*A( J, J ) END IF JX = JX - INCX 80 CONTINUE END IF END IF ELSE * * Form x := A'*x. * IF( LSAME( UPLO, 'U' ) )THEN IF( INCX.EQ.1 )THEN DO 100, J = N, 1, -1 TEMP = X( J ) IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) DO 90, I = J - 1, 1, -1 TEMP = TEMP + A( I, J )*X( I ) 90 CONTINUE X( J ) = TEMP 100 CONTINUE ELSE JX = KX + ( N - 1 )*INCX DO 120, J = N, 1, -1 TEMP = X( JX ) IX = JX IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) DO 110, I = J - 1, 1, -1 IX = IX - INCX TEMP = TEMP + A( I, J )*X( IX ) 110 CONTINUE X( JX ) = TEMP JX = JX - INCX 120 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 140, J = 1, N TEMP = X( J ) IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) DO 130, I = J + 1, N TEMP = TEMP + A( I, J )*X( I ) 130 CONTINUE X( J ) = TEMP 140 CONTINUE ELSE JX = KX DO 160, J = 1, N TEMP = X( JX ) IX = JX IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) DO 150, I = J + 1, N IX = IX + INCX TEMP = TEMP + A( I, J )*X( IX ) 150 CONTINUE X( JX ) = TEMP JX = JX + INCX 160 CONTINUE END IF END IF END IF * RETURN * * End of DTRMV . * END OpenBLAS-0.2.20/reference/dtrsmf.f000066400000000000000000000277721313527062700165670ustar00rootroot00000000000000 SUBROUTINE DTRSMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, $ B, LDB ) * .. Scalar Arguments .. CHARACTER*1 SIDE, UPLO, TRANSA, DIAG INTEGER M, N, LDA, LDB DOUBLE PRECISION ALPHA * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * DTRSM solves one of the matrix equations * * op( A )*X = alpha*B, or X*op( A ) = alpha*B, * * where alpha is a scalar, X and B are m by n matrices, A is a unit, or * non-unit, upper or lower triangular matrix and op( A ) is one of * * op( A ) = A or op( A ) = A'. * * The matrix X is overwritten on B. * * Parameters * ========== * * SIDE - CHARACTER*1. * On entry, SIDE specifies whether op( A ) appears on the left * or right of X as follows: * * SIDE = 'L' or 'l' op( A )*X = alpha*B. * * SIDE = 'R' or 'r' X*op( A ) = alpha*B. * * Unchanged on exit. * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix A is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANSA - CHARACTER*1. * On entry, TRANSA specifies the form of op( A ) to be used in * the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( A ) = A. * * TRANSA = 'T' or 't' op( A ) = A'. * * TRANSA = 'C' or 'c' op( A ) = A'. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit triangular * as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. 
* * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of B. M must be at * least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of B. N must be * at least zero. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION. * On entry, ALPHA specifies the scalar alpha. When alpha is * zero then A is not referenced and B need not be set before * entry. * Unchanged on exit. * * A - DOUBLE PRECISION array of DIMENSION ( LDA, k ), where k is m * when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. * Before entry with UPLO = 'U' or 'u', the leading k by k * upper triangular part of the array A must contain the upper * triangular matrix and the strictly lower triangular part of * A is not referenced. * Before entry with UPLO = 'L' or 'l', the leading k by k * lower triangular part of the array A must contain the lower * triangular matrix and the strictly upper triangular part of * A is not referenced. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced either, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When SIDE = 'L' or 'l' then * LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' * then LDA must be at least max( 1, n ). * Unchanged on exit. * * B - DOUBLE PRECISION array of DIMENSION ( LDB, n ). * Before entry, the leading m by n part of the array B must * contain the right-hand side matrix B, and on exit is * overwritten by the solution matrix X. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. LDB must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. Local Scalars .. LOGICAL LSIDE, NOUNIT, UPPER INTEGER I, INFO, J, K, NROWA DOUBLE PRECISION TEMP * .. Parameters .. DOUBLE PRECISION ONE , ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Executable Statements .. * * Test the input parameters. * LSIDE = LSAME( SIDE , 'L' ) IF( LSIDE )THEN NROWA = M ELSE NROWA = N END IF NOUNIT = LSAME( DIAG , 'N' ) UPPER = LSAME( UPLO , 'U' ) * INFO = 0 IF( ( .NOT.LSIDE ).AND. $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN INFO = 2 ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN INFO = 3 ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN INFO = 4 ELSE IF( M .LT.0 )THEN INFO = 5 ELSE IF( N .LT.0 )THEN INFO = 6 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 9 ELSE IF( LDB.LT.MAX( 1, M ) )THEN INFO = 11 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DTRSM ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, M B( I, J ) = ZERO 10 CONTINUE 20 CONTINUE RETURN END IF * * Start the operations. * IF( LSIDE )THEN IF( LSAME( TRANSA, 'N' ) )THEN * * Form B := alpha*inv( A )*B. 
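*
*           (Each column of B is first scaled by alpha and then
*           solved in place, by backward substitution over
*           K = M, ..., 1 when A is upper triangular and by forward
*           substitution over K = 1, ..., M when A is lower
*           triangular.)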
* IF( UPPER )THEN DO 60, J = 1, N IF( ALPHA.NE.ONE )THEN DO 30, I = 1, M B( I, J ) = ALPHA*B( I, J ) 30 CONTINUE END IF DO 50, K = M, 1, -1 IF( B( K, J ).NE.ZERO )THEN IF( NOUNIT ) $ B( K, J ) = B( K, J )/A( K, K ) DO 40, I = 1, K - 1 B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) 40 CONTINUE END IF 50 CONTINUE 60 CONTINUE ELSE DO 100, J = 1, N IF( ALPHA.NE.ONE )THEN DO 70, I = 1, M B( I, J ) = ALPHA*B( I, J ) 70 CONTINUE END IF DO 90 K = 1, M IF( B( K, J ).NE.ZERO )THEN IF( NOUNIT ) $ B( K, J ) = B( K, J )/A( K, K ) DO 80, I = K + 1, M B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) 80 CONTINUE END IF 90 CONTINUE 100 CONTINUE END IF ELSE * * Form B := alpha*inv( A' )*B. * IF( UPPER )THEN DO 130, J = 1, N DO 120, I = 1, M TEMP = ALPHA*B( I, J ) DO 110, K = 1, I - 1 TEMP = TEMP - A( K, I )*B( K, J ) 110 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( I, I ) B( I, J ) = TEMP 120 CONTINUE 130 CONTINUE ELSE DO 160, J = 1, N DO 150, I = M, 1, -1 TEMP = ALPHA*B( I, J ) DO 140, K = I + 1, M TEMP = TEMP - A( K, I )*B( K, J ) 140 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( I, I ) B( I, J ) = TEMP 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( LSAME( TRANSA, 'N' ) )THEN * * Form B := alpha*B*inv( A ). * IF( UPPER )THEN DO 210, J = 1, N IF( ALPHA.NE.ONE )THEN DO 170, I = 1, M B( I, J ) = ALPHA*B( I, J ) 170 CONTINUE END IF DO 190, K = 1, J - 1 IF( A( K, J ).NE.ZERO )THEN DO 180, I = 1, M B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) 180 CONTINUE END IF 190 CONTINUE IF( NOUNIT )THEN TEMP = ONE/A( J, J ) DO 200, I = 1, M B( I, J ) = TEMP*B( I, J ) 200 CONTINUE END IF 210 CONTINUE ELSE DO 260, J = N, 1, -1 IF( ALPHA.NE.ONE )THEN DO 220, I = 1, M B( I, J ) = ALPHA*B( I, J ) 220 CONTINUE END IF DO 240, K = J + 1, N IF( A( K, J ).NE.ZERO )THEN DO 230, I = 1, M B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) 230 CONTINUE END IF 240 CONTINUE IF( NOUNIT )THEN TEMP = ONE/A( J, J ) DO 250, I = 1, M B( I, J ) = TEMP*B( I, J ) 250 CONTINUE END IF 260 CONTINUE END IF ELSE * * Form B := alpha*B*inv( A' ). * IF( UPPER )THEN DO 310, K = N, 1, -1 IF( NOUNIT )THEN TEMP = ONE/A( K, K ) DO 270, I = 1, M B( I, K ) = TEMP*B( I, K ) 270 CONTINUE END IF DO 290, J = 1, K - 1 IF( A( J, K ).NE.ZERO )THEN TEMP = A( J, K ) DO 280, I = 1, M B( I, J ) = B( I, J ) - TEMP*B( I, K ) 280 CONTINUE END IF 290 CONTINUE IF( ALPHA.NE.ONE )THEN DO 300, I = 1, M B( I, K ) = ALPHA*B( I, K ) 300 CONTINUE END IF 310 CONTINUE ELSE DO 360, K = 1, N IF( NOUNIT )THEN TEMP = ONE/A( K, K ) DO 320, I = 1, M B( I, K ) = TEMP*B( I, K ) 320 CONTINUE END IF DO 340, J = K + 1, N IF( A( J, K ).NE.ZERO )THEN TEMP = A( J, K ) DO 330, I = 1, M B( I, J ) = B( I, J ) - TEMP*B( I, K ) 330 CONTINUE END IF 340 CONTINUE IF( ALPHA.NE.ONE )THEN DO 350, I = 1, M B( I, K ) = ALPHA*B( I, K ) 350 CONTINUE END IF 360 CONTINUE END IF END IF END IF * RETURN * * End of DTRSM . * END OpenBLAS-0.2.20/reference/dtrsvf.f000066400000000000000000000214741313527062700165710ustar00rootroot00000000000000 SUBROUTINE DTRSVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, LDA, N CHARACTER*1 DIAG, TRANS, UPLO * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), X( * ) * .. * * Purpose * ======= * * DTRSV solves one of the systems of equations * * A*x = b, or A'*x = b, * * where b and x are n element vectors and A is an n by n unit, or * non-unit, upper or lower triangular matrix. * * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Parameters * ========== * * UPLO - CHARACTER*1. 
* On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the equations to be solved as * follows: * * TRANS = 'N' or 'n' A*x = b. * * TRANS = 'T' or 't' A'*x = b. * * TRANS = 'C' or 'c' A'*x = b. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular matrix and the strictly lower triangular part of * A is not referenced. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular matrix and the strictly upper triangular part of * A is not referenced. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced either, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * X - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element right-hand side vector b. On exit, X is overwritten * with the solution vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. Local Scalars .. DOUBLE PRECISION TEMP INTEGER I, INFO, IX, J, JX, KX LOGICAL NOUNIT * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 6 ELSE IF( INCX.EQ.0 )THEN INFO = 8 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DTRSV ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * NOUNIT = LSAME( DIAG, 'N' ) * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * IF( LSAME( TRANS, 'N' ) )THEN * * Form x := inv( A )*x. 
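*
*        (This is a column-oriented triangular solve: backward
*        substitution over J = N, ..., 1 for an upper triangular A
*        and forward substitution over J = 1, ..., N for a lower
*        triangular A; as stated in the Purpose section, no
*        singularity test is performed.)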
* IF( LSAME( UPLO, 'U' ) )THEN IF( INCX.EQ.1 )THEN DO 20, J = N, 1, -1 IF( X( J ).NE.ZERO )THEN IF( NOUNIT ) $ X( J ) = X( J )/A( J, J ) TEMP = X( J ) DO 10, I = J - 1, 1, -1 X( I ) = X( I ) - TEMP*A( I, J ) 10 CONTINUE END IF 20 CONTINUE ELSE JX = KX + ( N - 1 )*INCX DO 40, J = N, 1, -1 IF( X( JX ).NE.ZERO )THEN IF( NOUNIT ) $ X( JX ) = X( JX )/A( J, J ) TEMP = X( JX ) IX = JX DO 30, I = J - 1, 1, -1 IX = IX - INCX X( IX ) = X( IX ) - TEMP*A( I, J ) 30 CONTINUE END IF JX = JX - INCX 40 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 60, J = 1, N IF( X( J ).NE.ZERO )THEN IF( NOUNIT ) $ X( J ) = X( J )/A( J, J ) TEMP = X( J ) DO 50, I = J + 1, N X( I ) = X( I ) - TEMP*A( I, J ) 50 CONTINUE END IF 60 CONTINUE ELSE JX = KX DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN IF( NOUNIT ) $ X( JX ) = X( JX )/A( J, J ) TEMP = X( JX ) IX = JX DO 70, I = J + 1, N IX = IX + INCX X( IX ) = X( IX ) - TEMP*A( I, J ) 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE END IF END IF ELSE * * Form x := inv( A' )*x. * IF( LSAME( UPLO, 'U' ) )THEN IF( INCX.EQ.1 )THEN DO 100, J = 1, N TEMP = X( J ) DO 90, I = 1, J - 1 TEMP = TEMP - A( I, J )*X( I ) 90 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( J, J ) X( J ) = TEMP 100 CONTINUE ELSE JX = KX DO 120, J = 1, N TEMP = X( JX ) IX = KX DO 110, I = 1, J - 1 TEMP = TEMP - A( I, J )*X( IX ) IX = IX + INCX 110 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( J, J ) X( JX ) = TEMP JX = JX + INCX 120 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 140, J = N, 1, -1 TEMP = X( J ) DO 130, I = N, J + 1, -1 TEMP = TEMP - A( I, J )*X( I ) 130 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( J, J ) X( J ) = TEMP 140 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 160, J = N, 1, -1 TEMP = X( JX ) IX = KX DO 150, I = N, J + 1, -1 TEMP = TEMP - A( I, J )*X( IX ) IX = IX - INCX 150 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( J, J ) X( JX ) = TEMP JX = JX - INCX 160 CONTINUE END IF END IF END IF * RETURN * * End of DTRSV . * END OpenBLAS-0.2.20/reference/dtrti2f.f000066400000000000000000000101521313527062700166260ustar00rootroot00000000000000 SUBROUTINE DTRTI2F( UPLO, DIAG, N, A, LDA, INFO ) * * -- LAPACK routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER DIAG, UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * DTRTI2 computes the inverse of a real upper or lower triangular * matrix. * * This is the Level 2 BLAS version of the algorithm. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether the matrix A is upper or lower triangular. * = 'U': Upper triangular * = 'L': Lower triangular * * DIAG (input) CHARACTER*1 * Specifies whether or not the matrix A is unit triangular. * = 'N': Non-unit triangular * = 'U': Unit triangular * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input/output) DOUBLE PRECISION array, dimension (LDA,N) * On entry, the triangular matrix A. If UPLO = 'U', the * leading n by n upper triangular part of the array A contains * the upper triangular matrix, and the strictly lower * triangular part of A is not referenced. If UPLO = 'L', the * leading n by n lower triangular part of the array A contains * the lower triangular matrix, and the strictly upper * triangular part of A is not referenced. If DIAG = 'U', the * diagonal elements of A are also not referenced and are * assumed to be 1. * * On exit, the (triangular) inverse of the original matrix, in * the same storage format. 
* * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) * .. * .. Local Scalars .. LOGICAL NOUNIT, UPPER INTEGER J DOUBLE PRECISION AJJ * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL DSCAL, DTRMV, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) NOUNIT = LSAME( DIAG, 'N' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DTRTI2', -INFO ) RETURN END IF * IF( UPPER ) THEN * * Compute inverse of upper triangular matrix. * DO 10 J = 1, N IF( NOUNIT ) THEN A( J, J ) = ONE / A( J, J ) AJJ = -A( J, J ) ELSE AJJ = -ONE END IF * * Compute elements 1:j-1 of j-th column. * CALL DTRMV( 'Upper', 'No transpose', DIAG, J-1, A, LDA, $ A( 1, J ), 1 ) CALL DSCAL( J-1, AJJ, A( 1, J ), 1 ) 10 CONTINUE ELSE * * Compute inverse of lower triangular matrix. * DO 20 J = N, 1, -1 IF( NOUNIT ) THEN A( J, J ) = ONE / A( J, J ) AJJ = -A( J, J ) ELSE AJJ = -ONE END IF IF( J.LT.N ) THEN * * Compute elements j+1:n of j-th column. * CALL DTRMV( 'Lower', 'No transpose', DIAG, N-J, $ A( J+1, J+1 ), LDA, A( J+1, J ), 1 ) CALL DSCAL( N-J, AJJ, A( J+1, J ), 1 ) END IF 20 CONTINUE END IF * RETURN * * End of DTRTI2 * END OpenBLAS-0.2.20/reference/dtrtrif.f000066400000000000000000000121741313527062700167340ustar00rootroot00000000000000 SUBROUTINE DTRTRIF( UPLO, DIAG, N, A, LDA, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * March 31, 1993 * * .. Scalar Arguments .. CHARACTER DIAG, UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ) * .. * * Purpose * ======= * * DTRTRI computes the inverse of a real upper or lower triangular * matrix A. * * This is the Level 3 BLAS version of the algorithm. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * = 'U': A is upper triangular; * = 'L': A is lower triangular. * * DIAG (input) CHARACTER*1 * = 'N': A is non-unit triangular; * = 'U': A is unit triangular. * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input/output) DOUBLE PRECISION array, dimension (LDA,N) * On entry, the triangular matrix A. If UPLO = 'U', the * leading N-by-N upper triangular part of the array A contains * the upper triangular matrix, and the strictly lower * triangular part of A is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of the array A contains * the lower triangular matrix, and the strictly upper * triangular part of A is not referenced. If DIAG = 'U', the * diagonal elements of A are also not referenced and are * assumed to be 1. * On exit, the (triangular) inverse of the original matrix, in * the same storage format. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, A(i,i) is exactly zero. 
The triangular * matrix is singular and its inverse can not be computed. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Local Scalars .. LOGICAL NOUNIT, UPPER INTEGER J, JB, NB, NN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL DTRMM, DTRSM, DTRTI2, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) NOUNIT = LSAME( DIAG, 'N' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DTRTRI', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Check for singularity if non-unit. * IF( NOUNIT ) THEN DO 10 INFO = 1, N IF( A( INFO, INFO ).EQ.ZERO ) $ RETURN 10 CONTINUE INFO = 0 END IF * * Determine the block size for this environment. * NB = 128 IF( NB.LE.1 .OR. NB.GE.N ) THEN * * Use unblocked code * CALL DTRTI2( UPLO, DIAG, N, A, LDA, INFO ) ELSE * * Use blocked code * IF( UPPER ) THEN * * Compute inverse of upper triangular matrix * DO 20 J = 1, N, NB JB = MIN( NB, N-J+1 ) * * Compute rows 1:j-1 of current block column * CALL DTRMM( 'Left', 'Upper', 'No transpose', DIAG, J-1, $ JB, ONE, A, LDA, A( 1, J ), LDA ) CALL DTRSM( 'Right', 'Upper', 'No transpose', DIAG, J-1, $ JB, -ONE, A( J, J ), LDA, A( 1, J ), LDA ) * * Compute inverse of current diagonal block * CALL DTRTI2( 'Upper', DIAG, JB, A( J, J ), LDA, INFO ) 20 CONTINUE ELSE * * Compute inverse of lower triangular matrix * NN = ( ( N-1 ) / NB )*NB + 1 DO 30 J = NN, 1, -NB JB = MIN( NB, N-J+1 ) IF( J+JB.LE.N ) THEN * * Compute rows j+jb:n of current block column * CALL DTRMM( 'Left', 'Lower', 'No transpose', DIAG, $ N-J-JB+1, JB, ONE, A( J+JB, J+JB ), LDA, $ A( J+JB, J ), LDA ) CALL DTRSM( 'Right', 'Lower', 'No transpose', DIAG, $ N-J-JB+1, JB, -ONE, A( J, J ), LDA, $ A( J+JB, J ), LDA ) END IF * * Compute inverse of current diagonal block * CALL DTRTI2( 'Lower', DIAG, JB, A( J, J ), LDA, INFO ) 30 CONTINUE END IF END IF * RETURN * * End of DTRTRI * END OpenBLAS-0.2.20/reference/dzamaxf.f000066400000000000000000000017321313527062700167060ustar00rootroot00000000000000 REAL*8 function dzamaxf(n,zx,incx) c c finds the index of element having max. absolute value. c jack dongarra, 1/15/85. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c COMPLEX*16 zx(*) integer i,incx,ix,n double precision dcabs1 c dzamaxf = 0. if( n.lt.1 .or. incx.le.0 )return dzamaxf = dcabs1(zx(1)) if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 dzamaxf = dcabs1(zx(1)) ix = ix + incx do 10 i = 2,n if(dcabs1(zx(ix)).le.dzamaxf) go to 5 dzamaxf = i dzamaxf = dcabs1(zx(ix)) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 dzamaxf = dcabs1(zx(1)) do 30 i = 2,n if(dcabs1(zx(i)).le.dzamaxf) go to 30 dzamaxf = i dzamaxf = dcabs1(zx(i)) 30 continue return end OpenBLAS-0.2.20/reference/dzaminf.f000066400000000000000000000016601313527062700167040ustar00rootroot00000000000000 REAL*8 function dzaminf(n,zx,incx) c c finds the index of element having min. absolute value. c jack dongarra, 1/15/85. c modified 3/93 to return if incx .le. 0. 
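c
c     added note: as in dzamaxf above, this description is inherited
c     from the index-returning izamin/izamax routines; the function
c     actually returns the minimum absolute value itself.  the index
c     version, izaminf, appears later in this directory.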
c modified 12/3/93, array(1) declarations changed to array(*) c COMPLEX*16 zx(*) integer i,incx,ix,n double precision dcabs1 c dzaminf = 0. if( n.lt.1 .or. incx.le.0 )return dzaminf = dcabs1(zx(1)) if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 dzaminf = dcabs1(zx(1)) ix = ix + incx do 10 i = 2,n if(dcabs1(zx(ix)).ge.dzaminf) go to 5 dzaminf = dcabs1(zx(ix)) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 dzaminf = dcabs1(zx(1)) do 30 i = 2,n if(dcabs1(zx(i)).ge.dzaminf) go to 30 dzaminf = dcabs1(zx(i)) 30 continue return end OpenBLAS-0.2.20/reference/dzasumf.f000066400000000000000000000014331313527062700167230ustar00rootroot00000000000000 double precision function dzasumf(n,zx,incx) c c takes the sum of the absolute values. c jack dongarra, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c double complex zx(*) double precision stemp,dcabs1 integer i,incx,ix,n c dzasumf = 0.0d0 stemp = 0.0d0 if( n.le.0 .or. incx.le.0 )return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 do 10 i = 1,n stemp = stemp + dcabs1(zx(ix)) ix = ix + incx 10 continue dzasumf = stemp return c c code for increment equal to 1 c 20 do 30 i = 1,n stemp = stemp + dcabs1(zx(i)) 30 continue dzasumf = stemp return end OpenBLAS-0.2.20/reference/dznrm2f.f000066400000000000000000000036651313527062700166450ustar00rootroot00000000000000 DOUBLE PRECISION FUNCTION DZNRM2F( N, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, N * .. Array Arguments .. COMPLEX*16 X( * ) * .. * * DZNRM2 returns the euclidean norm of a vector via the function * name, so that * * DZNRM2 := sqrt( conjg( x' )*x ) * * * * -- This version written on 25-October-1982. * Modified on 14-October-1993 to inline the call to ZLASSQ. * Sven Hammarling, Nag Ltd. * * * .. Parameters .. DOUBLE PRECISION ONE , ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. Local Scalars .. INTEGER IX DOUBLE PRECISION NORM, SCALE, SSQ, TEMP * .. Intrinsic Functions .. INTRINSIC ABS, DIMAG, DBLE, SQRT * .. * .. Executable Statements .. IF( N.LT.1 .OR. INCX.LT.1 )THEN NORM = ZERO ELSE SCALE = ZERO SSQ = ONE * The following loop is equivalent to this call to the LAPACK * auxiliary routine: * CALL ZLASSQ( N, X, INCX, SCALE, SSQ ) * DO 10, IX = 1, 1 + ( N - 1 )*INCX, INCX IF( DBLE( X( IX ) ).NE.ZERO )THEN TEMP = ABS( DBLE( X( IX ) ) ) IF( SCALE.LT.TEMP )THEN SSQ = ONE + SSQ*( SCALE/TEMP )**2 SCALE = TEMP ELSE SSQ = SSQ + ( TEMP/SCALE )**2 END IF END IF IF( DIMAG( X( IX ) ).NE.ZERO )THEN TEMP = ABS( DIMAG( X( IX ) ) ) IF( SCALE.LT.TEMP )THEN SSQ = ONE + SSQ*( SCALE/TEMP )**2 SCALE = TEMP ELSE SSQ = SSQ + ( TEMP/SCALE )**2 END IF END IF 10 CONTINUE NORM = SCALE * SQRT( SSQ ) END IF * DZNRM2F = NORM RETURN * * End of DZNRM2. * END OpenBLAS-0.2.20/reference/icamaxf.f000066400000000000000000000017071313527062700166660ustar00rootroot00000000000000 integer function icamaxf(n,cx,incx) c c finds the index of element having max. absolute value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c complex cx(*) real smax integer i,incx,ix,n real scabs1 c icamaxf = 0 if( n.lt.1 .or. 
incx.le.0 ) return icamaxf = 1 if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 smax = scabs1(cx(1)) ix = ix + incx do 10 i = 2,n if(scabs1(cx(ix)).le.smax) go to 5 icamaxf = i smax = scabs1(cx(ix)) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 smax = scabs1(cx(1)) do 30 i = 2,n if(scabs1(cx(i)).le.smax) go to 30 icamaxf = i smax = scabs1(cx(i)) 30 continue return end OpenBLAS-0.2.20/reference/icaminf.f000066400000000000000000000017071313527062700166640ustar00rootroot00000000000000 integer function icaminf(n,cx,incx) c c finds the index of element having min. absolute value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c complex cx(*) real smin integer i,incx,ix,n real scabs1 c icaminf = 0 if( n.lt.1 .or. incx.le.0 ) return icaminf = 1 if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 smin = scabs1(cx(1)) ix = ix + incx do 10 i = 2,n if(scabs1(cx(ix)).ge.smin) go to 5 icaminf = i smin = scabs1(cx(ix)) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 smin = scabs1(cx(1)) do 30 i = 2,n if(scabs1(cx(i)).ge.smin) go to 30 icaminf = i smin = scabs1(cx(i)) 30 continue return end OpenBLAS-0.2.20/reference/idamaxf.f000066400000000000000000000016471313527062700166720ustar00rootroot00000000000000 integer function idamaxf(n,dx,incx) c c finds the index of element having max. absolute value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c double precision dx(*),dmax integer i,incx,ix,n c idamaxf = 0 if( n.lt.1 .or. incx.le.0 ) return idamaxf = 1 if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 dmax = dabs(dx(1)) ix = ix + incx do 10 i = 2,n if(dabs(dx(ix)).le.dmax) go to 5 idamaxf = i dmax = dabs(dx(ix)) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 dmax = dabs(dx(1)) do 30 i = 2,n if(dabs(dx(i)).le.dmax) go to 30 idamaxf = i dmax = dabs(dx(i)) 30 continue return end OpenBLAS-0.2.20/reference/idaminf.f000066400000000000000000000016471313527062700166700ustar00rootroot00000000000000 integer function idaminf(n,dx,incx) c c finds the index of element having min. absolute value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c double precision dx(*),dmin integer i,incx,ix,n c idaminf = 0 if( n.lt.1 .or. incx.le.0 ) return idaminf = 1 if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 dmin = dabs(dx(1)) ix = ix + incx do 10 i = 2,n if(dabs(dx(ix)).ge.dmin) go to 5 idaminf = i dmin = dabs(dx(ix)) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 dmin = dabs(dx(1)) do 30 i = 2,n if(dabs(dx(i)).ge.dmin) go to 30 idaminf = i dmin = dabs(dx(i)) 30 continue return end OpenBLAS-0.2.20/reference/idmaxf.f000066400000000000000000000015651313527062700165300ustar00rootroot00000000000000 integer function idmaxf(n,dx,incx) c c finds the index of element having max. value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c double precision dx(*),dmax integer i,incx,ix,n c idmaxf = 0 if( n.lt.1 .or. 
incx.le.0 ) return idmaxf = 1 if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 dmax = dx(1) ix = ix + incx do 10 i = 2,n if(dx(ix).le.dmax) go to 5 idmaxf = i dmax = dx(ix) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 dmax = dx(1) do 30 i = 2,n if(dx(i).le.dmax) go to 30 idmaxf = i dmax = dx(i) 30 continue return end OpenBLAS-0.2.20/reference/idminf.f000066400000000000000000000015651313527062700165260ustar00rootroot00000000000000 integer function idminf(n,dx,incx) c c finds the index of element having min. value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c double precision dx(*),dmin integer i,incx,ix,n c idminf = 0 if( n.lt.1 .or. incx.le.0 ) return idminf = 1 if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 dmin = dx(1) ix = ix + incx do 10 i = 2,n if(dx(ix).ge.dmin) go to 5 idminf = i dmin = dx(ix) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 dmin = dx(1) do 30 i = 2,n if(dx(i).ge.dmin) go to 30 idminf = i dmin = dx(i) 30 continue return end OpenBLAS-0.2.20/reference/iqamaxf.f000066400000000000000000000020431313527062700166760ustar00rootroot00000000000000 REAL*10 function qabs(dx) REAL*10 dx qabs = dx if (dx >= 0) return qabs = -dx return end integer function iqamaxf(n,dx,incx) c c finds the index of element having max. absolute value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c real*10 dx(*),dmax integer i,incx,ix,n c iqamaxf = 0 if( n.lt.1 .or. incx.le.0 ) return iqamaxf = 1 if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 dmax = qabs(dx(1)) ix = ix + incx do 10 i = 2,n if(qabs(dx(ix)).le.dmax) go to 5 iqamaxf = i dmax = qabs(dx(ix)) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 dmax = qabs(dx(1)) do 30 i = 2,n if(qabs(dx(i)).le.dmax) go to 30 iqamaxf = i dmax = qabs(dx(i)) 30 continue return end OpenBLAS-0.2.20/reference/iqaminf.f000066400000000000000000000020441313527062700166750ustar00rootroot00000000000000 REAL*10 function qabs(dx) REAL*10 dx qabs = dx if (dx >= 0) return qabs = -dx return end integer function iqaminf(n,dx,incx) c c finds the index of element having min. absolute value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c real*10 dx(*),dmin integer i,incx,ix,n c iqaminf = 0 if( n.lt.1 .or. incx.le.0 ) return iqaminf = 1 if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 dmin = qabs(dx(1)) ix = ix + incx do 10 i = 2,n if(qabs(dx(ix)).ge.dmin) go to 5 iqaminf = i dmin = qabs(dx(ix)) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 dmin = qabs(dx(1)) do 30 i = 2,n if(qabs(dx(i)).ge.dmin) go to 30 iqaminf = i dmin = qabs(dx(i)) 30 continue return end OpenBLAS-0.2.20/reference/iqmaxf.f000066400000000000000000000015541313527062700165430ustar00rootroot00000000000000 integer function iqmaxf(n,dx,incx) c c finds the index of element having max. value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c real*10 dx(*),dmax integer i,incx,ix,n c iqmaxf = 0 if( n.lt.1 .or. 
incx.le.0 ) return iqmaxf = 1 if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 dmax = dx(1) ix = ix + incx do 10 i = 2,n if(dx(ix).le.dmax) go to 5 iqmaxf = i dmax = dx(ix) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 dmax = dx(1) do 30 i = 2,n if(dx(i).le.dmax) go to 30 iqmaxf = i dmax = dx(i) 30 continue return end OpenBLAS-0.2.20/reference/iqminf.f000066400000000000000000000015541313527062700165410ustar00rootroot00000000000000 integer function iqminf(n,dx,incx) c c finds the index of element having min. value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c real*10 dx(*),dmin integer i,incx,ix,n c iqminf = 0 if( n.lt.1 .or. incx.le.0 ) return iqminf = 1 if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 dmin = dx(1) ix = ix + incx do 10 i = 2,n if(dx(ix).ge.dmin) go to 5 iqminf = i dmin = dx(ix) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 dmin = dx(1) do 30 i = 2,n if(dx(i).ge.dmin) go to 30 iqminf = i dmin = dx(i) 30 continue return end OpenBLAS-0.2.20/reference/isamaxf.f000066400000000000000000000016251313527062700167050ustar00rootroot00000000000000 integer function isamaxf(n,sx,incx) c c finds the index of element having max. absolute value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c real sx(*),smax integer i,incx,ix,n c isamaxf = 0 if( n.lt.1 .or. incx.le.0 ) return isamaxf = 1 if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 smax = abs(sx(1)) ix = ix + incx do 10 i = 2,n if(abs(sx(ix)).le.smax) go to 5 isamaxf = i smax = abs(sx(ix)) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 smax = abs(sx(1)) do 30 i = 2,n if(abs(sx(i)).le.smax) go to 30 isamaxf = i smax = abs(sx(i)) 30 continue return end OpenBLAS-0.2.20/reference/isaminf.f000066400000000000000000000016251313527062700167030ustar00rootroot00000000000000 integer function isaminf(n,sx,incx) c c finds the index of element having min. absolute value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c real sx(*),smin integer i,incx,ix,n c isaminf = 0 if( n.lt.1 .or. incx.le.0 ) return isaminf = 1 if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 smin = abs(sx(1)) ix = ix + incx do 10 i = 2,n if(abs(sx(ix)).ge.smin) go to 5 isaminf = i smin = abs(sx(ix)) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 smin = abs(sx(1)) do 30 i = 2,n if(abs(sx(i)).ge.smin) go to 30 isaminf = i smin = abs(sx(i)) 30 continue return end OpenBLAS-0.2.20/reference/ismaxf.f000066400000000000000000000015511313527062700165420ustar00rootroot00000000000000 integer function ismaxf(n,sx,incx) c c finds the index of element having max. value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c real sx(*),smax integer i,incx,ix,n c ismaxf = 0 if( n.lt.1 .or. 
incx.le.0 ) return ismaxf = 1 if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 smax = sx(1) ix = ix + incx do 10 i = 2,n if(sx(ix).le.smax) go to 5 ismaxf = i smax = sx(ix) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 smax = sx(1) do 30 i = 2,n if(sx(i).le.smax) go to 30 ismaxf = i smax = sx(i) 30 continue return end OpenBLAS-0.2.20/reference/isminf.f000066400000000000000000000015511313527062700165400ustar00rootroot00000000000000 integer function isminf(n,sx,incx) c c finds the index of element having min. value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c real sx(*),smin integer i,incx,ix,n c isminf = 0 if( n.lt.1 .or. incx.le.0 ) return isminf = 1 if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 smin = sx(1) ix = ix + incx do 10 i = 2,n if(sx(ix).ge.smin) go to 5 isminf = i smin = sx(ix) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 smin = sx(1) do 30 i = 2,n if(sx(i).ge.smin) go to 30 isminf = i smin = sx(i) 30 continue return end OpenBLAS-0.2.20/reference/ixamaxf.f000066400000000000000000000017061313527062700167120ustar00rootroot00000000000000 integer function ixamaxf(n,zx,incx) c c finds the index of element having max. absolute value. c jack dongarra, 1/15/85. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c complex*20 zx(*) real*10 smax integer i,incx,ix,n real*10 qcabs1 c ixamaxf = 0 if( n.lt.1 .or. incx.le.0 )return ixamaxf = 1 if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 smax = qcabs1(zx(1)) ix = ix + incx do 10 i = 2,n if(qcabs1(zx(ix)).le.smax) go to 5 ixamaxf = i smax = qcabs1(zx(ix)) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 smax = qcabs1(zx(1)) do 30 i = 2,n if(qcabs1(zx(i)).le.smax) go to 30 ixamaxf = i smax = qcabs1(zx(i)) 30 continue return end OpenBLAS-0.2.20/reference/ixaminf.f000066400000000000000000000017061313527062700167100ustar00rootroot00000000000000 integer function ixaminf(n,zx,incx) c c finds the index of element having min. absolute value. c jack dongarra, 1/15/85. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c complex*20 zx(*) real*10 smin integer i,incx,ix,n real*10 qcabs1 c ixaminf = 0 if( n.lt.1 .or. incx.le.0 )return ixaminf = 1 if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 smin = qcabs1(zx(1)) ix = ix + incx do 10 i = 2,n if(qcabs1(zx(ix)).ge.smin) go to 5 ixaminf = i smin = qcabs1(zx(ix)) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 smin = qcabs1(zx(1)) do 30 i = 2,n if(qcabs1(zx(i)).ge.smin) go to 30 ixaminf = i smin = qcabs1(zx(i)) 30 continue return end OpenBLAS-0.2.20/reference/izamaxf.f000066400000000000000000000017341313527062700167150ustar00rootroot00000000000000 integer function izamaxf(n,zx,incx) c c finds the index of element having max. absolute value. c jack dongarra, 1/15/85. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c double complex zx(*) double precision smax integer i,incx,ix,n double precision dcabs1 c izamaxf = 0 if( n.lt.1 .or. 
incx.le.0 )return izamaxf = 1 if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 smax = dcabs1(zx(1)) ix = ix + incx do 10 i = 2,n if(dcabs1(zx(ix)).le.smax) go to 5 izamaxf = i smax = dcabs1(zx(ix)) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 smax = dcabs1(zx(1)) do 30 i = 2,n if(dcabs1(zx(i)).le.smax) go to 30 izamaxf = i smax = dcabs1(zx(i)) 30 continue return end OpenBLAS-0.2.20/reference/izaminf.f000066400000000000000000000017341313527062700167130ustar00rootroot00000000000000 integer function izaminf(n,zx,incx) c c finds the index of element having min. absolute value. c jack dongarra, 1/15/85. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c double complex zx(*) double precision smin integer i,incx,ix,n double precision dcabs1 c izaminf = 0 if( n.lt.1 .or. incx.le.0 )return izaminf = 1 if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 smin = dcabs1(zx(1)) ix = ix + incx do 10 i = 2,n if(dcabs1(zx(ix)).ge.smin) go to 5 izaminf = i smin = dcabs1(zx(ix)) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 smin = dcabs1(zx(1)) do 30 i = 2,n if(dcabs1(zx(i)).ge.smin) go to 30 izaminf = i smin = dcabs1(zx(i)) 30 continue return end OpenBLAS-0.2.20/reference/lsamef.f000066400000000000000000000046561313527062700165330ustar00rootroot00000000000000 LOGICAL FUNCTION LSAME( CA, CB ) * * -- LAPACK auxiliary routine (version 2.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * January 31, 1994 * * .. Scalar Arguments .. CHARACTER CA, CB * .. * * Purpose * ======= * * LSAME returns .TRUE. if CA is the same letter as CB regardless of * case. * * Arguments * ========= * * CA (input) CHARACTER*1 * CB (input) CHARACTER*1 * CA and CB specify the single characters to be compared. * * ===================================================================== * * .. Intrinsic Functions .. INTRINSIC ICHAR * .. * .. Local Scalars .. INTEGER INTA, INTB, ZCODE * .. * .. Executable Statements .. * * Test if the characters are equal * LSAME = CA.EQ.CB IF( LSAME ) $ RETURN * * Now test for equivalence if both characters are alphabetic. * ZCODE = ICHAR( 'Z' ) * * Use 'Z' rather than 'A' so that ASCII can be detected on Prime * machines, on which ICHAR returns a value with bit 8 set. * ICHAR('A') on Prime machines returns 193 which is the same as * ICHAR('A') on an EBCDIC machine. * INTA = ICHAR( CA ) INTB = ICHAR( CB ) * IF( ZCODE.EQ.90 .OR. ZCODE.EQ.122 ) THEN * * ASCII is assumed - ZCODE is the ASCII code of either lower or * upper case 'Z'. * IF( INTA.GE.97 .AND. INTA.LE.122 ) INTA = INTA - 32 IF( INTB.GE.97 .AND. INTB.LE.122 ) INTB = INTB - 32 * ELSE IF( ZCODE.EQ.233 .OR. ZCODE.EQ.169 ) THEN * * EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or * upper case 'Z'. * IF( INTA.GE.129 .AND. INTA.LE.137 .OR. $ INTA.GE.145 .AND. INTA.LE.153 .OR. $ INTA.GE.162 .AND. INTA.LE.169 ) INTA = INTA + 64 IF( INTB.GE.129 .AND. INTB.LE.137 .OR. $ INTB.GE.145 .AND. INTB.LE.153 .OR. $ INTB.GE.162 .AND. INTB.LE.169 ) INTB = INTB + 64 * ELSE IF( ZCODE.EQ.218 .OR. ZCODE.EQ.250 ) THEN * * ASCII is assumed, on Prime machines - ZCODE is the ASCII code * plus 128 of either lower or upper case 'Z'. * IF( INTA.GE.225 .AND. INTA.LE.250 ) INTA = INTA - 32 IF( INTB.GE.225 .AND. 
INTB.LE.250 ) INTB = INTB - 32 END IF LSAME = INTA.EQ.INTB * * RETURN * * End of LSAME * END OpenBLAS-0.2.20/reference/samaxf.f000066400000000000000000000015121313527062700165270ustar00rootroot00000000000000 REAL*4 function samaxf(n,dx,incx) c c finds the index of element having max. absolute value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c REAL*4 dx(*) integer i,incx,ix,n c samaxf = 0. if( n.lt.1 .or. incx.le.0 ) return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 samaxf = abs(dx(1)) ix = ix + incx do 10 i = 2,n if(abs(dx(ix)).le.samaxf) go to 5 samaxf = abs(dx(ix)) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 samaxf = abs(dx(1)) do 30 i = 2,n if(abs(dx(i)).le.samaxf) go to 30 samaxf = abs(dx(i)) 30 continue return end OpenBLAS-0.2.20/reference/saminf.f000066400000000000000000000015111313527062700165240ustar00rootroot00000000000000 REAL*4 function saminf(n,dx,incx) c c finds the index of element having min. absolute value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c REAL*4 dx(*) integer i,incx,ix,n c saminf = 0 if( n.lt.1 .or. incx.le.0 ) return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 saminf = abs(dx(1)) ix = ix + incx do 10 i = 2,n if(abs(dx(ix)).ge.saminf) go to 5 saminf = abs(dx(ix)) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 saminf = abs(dx(1)) do 30 i = 2,n if(abs(dx(i)).ge.saminf) go to 30 saminf = abs(dx(i)) 30 continue return end OpenBLAS-0.2.20/reference/sasumf.f000066400000000000000000000021011313527062700165410ustar00rootroot00000000000000 real function sasumf(n,sx,incx) c c takes the sum of the absolute values. c uses unrolled loops for increment equal to one. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c real sx(*),stemp integer i,incx,m,mp1,n,nincx c sasumf = 0.0e0 stemp = 0.0e0 if( n.le.0 .or. incx.le.0 )return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c nincx = n*incx do 10 i = 1,nincx,incx stemp = stemp + abs(sx(i)) 10 continue sasumf = stemp return c c code for increment equal to 1 c c c clean-up loop c 20 m = mod(n,6) if( m .eq. 0 ) go to 40 do 30 i = 1,m stemp = stemp + abs(sx(i)) 30 continue if( n .lt. 6 ) go to 60 40 mp1 = m + 1 do 50 i = mp1,n,6 stemp = stemp + abs(sx(i)) + abs(sx(i + 1)) + abs(sx(i + 2)) * + abs(sx(i + 3)) + abs(sx(i + 4)) + abs(sx(i + 5)) 50 continue 60 sasumf = stemp return end OpenBLAS-0.2.20/reference/saxpyf.f000066400000000000000000000022711313527062700165650ustar00rootroot00000000000000 subroutine saxpyf(n,sa,sx,incx,sy,incy) c c constant times a vector plus a vector. c uses unrolled loop for increments equal to one. c jack dongarra, linpack, 3/11/78. c modified 12/3/93, array(1) declarations changed to array(*) c real sx(*),sy(*),sa integer i,incx,incy,ix,iy,m,mp1,n c if(n.le.0)return if (sa .eq. 0.0) return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments c not equal to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n sy(iy) = sy(iy) + sa*sx(ix) ix = ix + incx iy = iy + incy 10 continue return c c code for both increments equal to 1 c c c clean-up loop c 20 m = mod(n,4) if( m .eq. 
0 ) go to 40 do 30 i = 1,m sy(i) = sy(i) + sa*sx(i) 30 continue if( n .lt. 4 ) return 40 mp1 = m + 1 do 50 i = mp1,n,4 sy(i) = sy(i) + sa*sx(i) sy(i + 1) = sy(i + 1) + sa*sx(i + 1) sy(i + 2) = sy(i + 2) + sa*sx(i + 2) sy(i + 3) = sy(i + 3) + sa*sx(i + 3) 50 continue return end OpenBLAS-0.2.20/reference/scamaxf.f000066400000000000000000000017171313527062700167010ustar00rootroot00000000000000 REAL*4 function scamaxf(n,zx,incx) c c finds the index of element having max. absolute value. c jack dongarra, 1/15/85. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c COMPLEX*8 zx(*) integer i,incx,ix,n REAL*4 scabs1 c scamaxf = 0. if( n.lt.1 .or. incx.le.0 )return scamaxf = scabs1(zx(1)) if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 scamaxf = scabs1(zx(1)) ix = ix + incx do 10 i = 2,n if(scabs1(zx(ix)).le.scamaxf) go to 5 scamaxf = i scamaxf = scabs1(zx(ix)) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 scamaxf = scabs1(zx(1)) do 30 i = 2,n if(scabs1(zx(i)).le.scamaxf) go to 30 scamaxf = i scamaxf = scabs1(zx(i)) 30 continue return end OpenBLAS-0.2.20/reference/scaminf.f000066400000000000000000000016451313527062700166770ustar00rootroot00000000000000 REAL*4 function scaminf(n,zx,incx) c c finds the index of element having min. absolute value. c jack dongarra, 1/15/85. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c COMPLEX*8 zx(*) integer i,incx,ix,n REAL*4 scabs1 c scaminf = 0. if( n.lt.1 .or. incx.le.0 )return scaminf = scabs1(zx(1)) if(n.eq.1)return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 scaminf = scabs1(zx(1)) ix = ix + incx do 10 i = 2,n if(scabs1(zx(ix)).ge.scaminf) go to 5 scaminf = scabs1(zx(ix)) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 scaminf = scabs1(zx(1)) do 30 i = 2,n if(scabs1(zx(i)).ge.scaminf) go to 30 scaminf = scabs1(zx(i)) 30 continue return end OpenBLAS-0.2.20/reference/scasumf.f000066400000000000000000000015501313527062700167130ustar00rootroot00000000000000 real function scasumf(n,cx,incx) c c takes the sum of the absolute values of a complex vector and c returns a single precision result. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c complex cx(*) real stemp integer i,incx,n,nincx c scasumf = 0.0e0 stemp = 0.0e0 if( n.le.0 .or. incx.le.0 )return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c nincx = n*incx do 10 i = 1,nincx,incx stemp = stemp + abs(real(cx(i))) + abs(aimag(cx(i))) 10 continue scasumf = stemp return c c code for increment equal to 1 c 20 do 30 i = 1,n stemp = stemp + abs(real(cx(i))) + abs(aimag(cx(i))) 30 continue scasumf = stemp return end OpenBLAS-0.2.20/reference/scnrm2f.f000066400000000000000000000036651313527062700166350ustar00rootroot00000000000000 REAL FUNCTION SCNRM2F( N, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, N * .. Array Arguments .. COMPLEX X( * ) * .. * * SCNRM2 returns the euclidean norm of a vector via the function * name, so that * * SCNRM2 := sqrt( conjg( x' )*x ) * * * * -- This version written on 25-October-1982. * Modified on 14-October-1993 to inline the call to CLASSQ. * Sven Hammarling, Nag Ltd. * * * .. Parameters .. REAL ONE , ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. Local Scalars .. INTEGER IX REAL NORM, SCALE, SSQ, TEMP * .. Intrinsic Functions .. 
INTRINSIC ABS, AIMAG, REAL, SQRT * .. * .. Executable Statements .. IF( N.LT.1 .OR. INCX.LT.1 )THEN NORM = ZERO ELSE SCALE = ZERO SSQ = ONE * The following loop is equivalent to this call to the LAPACK * auxiliary routine: * CALL CLASSQ( N, X, INCX, SCALE, SSQ ) * DO 10, IX = 1, 1 + ( N - 1 )*INCX, INCX IF( REAL( X( IX ) ).NE.ZERO )THEN TEMP = ABS( REAL( X( IX ) ) ) IF( SCALE.LT.TEMP )THEN SSQ = ONE + SSQ*( SCALE/TEMP )**2 SCALE = TEMP ELSE SSQ = SSQ + ( TEMP/SCALE )**2 END IF END IF IF( AIMAG( X( IX ) ).NE.ZERO )THEN TEMP = ABS( AIMAG( X( IX ) ) ) IF( SCALE.LT.TEMP )THEN SSQ = ONE + SSQ*( SCALE/TEMP )**2 SCALE = TEMP ELSE SSQ = SSQ + ( TEMP/SCALE )**2 END IF END IF 10 CONTINUE NORM = SCALE * SQRT( SSQ ) END IF * SCNRM2F = NORM RETURN * * End of SCNRM2. * END OpenBLAS-0.2.20/reference/scopyf.f000066400000000000000000000022341313527062700165550ustar00rootroot00000000000000 subroutine scopyf(n,sx,incx,sy,incy) c c copies a vector, x, to a vector, y. c uses unrolled loops for increments equal to 1. c jack dongarra, linpack, 3/11/78. c modified 12/3/93, array(1) declarations changed to array(*) c real sx(*),sy(*) integer i,incx,incy,ix,iy,m,mp1,n c if(n.le.0)return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments c not equal to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n sy(iy) = sx(ix) ix = ix + incx iy = iy + incy 10 continue return c c code for both increments equal to 1 c c c clean-up loop c 20 m = mod(n,7) if( m .eq. 0 ) go to 40 do 30 i = 1,m sy(i) = sx(i) 30 continue if( n .lt. 7 ) return 40 mp1 = m + 1 do 50 i = mp1,n,7 sy(i) = sx(i) sy(i + 1) = sx(i + 1) sy(i + 2) = sx(i + 2) sy(i + 3) = sx(i + 3) sy(i + 4) = sx(i + 4) sy(i + 5) = sx(i + 5) sy(i + 6) = sx(i + 6) 50 continue return end OpenBLAS-0.2.20/reference/sdotf.f000066400000000000000000000023211313527062700163660ustar00rootroot00000000000000 real function sdotf(n,sx,incx,sy,incy) c c forms the dot product of two vectors. c uses unrolled loops for increments equal to one. c jack dongarra, linpack, 3/11/78. c modified 12/3/93, array(1) declarations changed to array(*) c real sx(*),sy(*),stemp integer i,incx,incy,ix,iy,m,mp1,n c stemp = 0.0e0 sdotf = 0.0e0 if(n.le.0)return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments c not equal to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n stemp = stemp + sx(ix)*sy(iy) ix = ix + incx iy = iy + incy 10 continue sdotf = stemp return c c code for both increments equal to 1 c c c clean-up loop c 20 m = mod(n,5) if( m .eq. 0 ) go to 40 do 30 i = 1,m stemp = stemp + sx(i)*sy(i) 30 continue if( n .lt. 5 ) go to 60 40 mp1 = m + 1 do 50 i = mp1,n,5 stemp = stemp + sx(i)*sy(i) + sx(i + 1)*sy(i + 1) + * sx(i + 2)*sy(i + 2) + sx(i + 3)*sy(i + 3) + sx(i + 4)*sy(i + 4) 50 continue 60 sdotf = stemp return end OpenBLAS-0.2.20/reference/sdsdotf.f000066400000000000000000000052171313527062700167240ustar00rootroot00000000000000*DECK SDSDOTF REAL FUNCTION SDSDOTF (N, SB, SX, INCX, SY, INCY) C***BEGIN PROLOGUE SDSDOT C***PURPOSE Compute the inner product of two vectors with extended C precision accumulation. C***LIBRARY SLATEC (BLAS) C***CATEGORY D1A4 C***TYPE SINGLE PRECISION (SDSDOT-S, CDCDOT-C) C***KEYWORDS BLAS, DOT PRODUCT, INNER PRODUCT, LINEAR ALGEBRA, VECTOR C***AUTHOR Lawson, C. L., (JPL) C Hanson, R. J., (SNLA) C Kincaid, D. R., (U. of Texas) C Krogh, F. 
T., (JPL) C***DESCRIPTION C C B L A S Subprogram C Description of Parameters C C --Input-- C N number of elements in input vector(s) C SB single precision scalar to be added to inner product C SX single precision vector with N elements C INCX storage spacing between elements of SX C SY single precision vector with N elements C INCY storage spacing between elements of SY C C --Output-- C SDSDOT single precision dot product (SB if N .LE. 0) C C Returns S.P. result with dot product accumulated in D.P. C SDSDOT = SB + sum for I = 0 to N-1 of SX(LX+I*INCX)*SY(LY+I*INCY), C where LX = 1 if INCX .GE. 0, else LX = 1+(1-N)*INCX, and LY is C defined in a similar way using INCY. C C***REFERENCES C. L. Lawson, R. J. Hanson, D. R. Kincaid and F. T. C Krogh, Basic linear algebra subprograms for Fortran C usage, Algorithm No. 539, Transactions on Mathematical C Software 5, 3 (September 1979), pp. 308-323. C***ROUTINES CALLED (NONE) C***REVISION HISTORY (YYMMDD) C 791001 DATE WRITTEN C 890531 Changed all specific intrinsics to generic. (WRB) C 890831 Modified array declarations. (WRB) C 890831 REVISION DATE from Version 3.2 C 891214 Prologue converted to Version 4.0 format. (BAB) C 920310 Corrected definition of LX in DESCRIPTION. (WRB) C 920501 Reformatted the REFERENCES section. (WRB) C***END PROLOGUE SDSDOT REAL SX(*), SY(*), SB DOUBLE PRECISION DSDOT C***FIRST EXECUTABLE STATEMENT SDSDOT DSDOT = SB IF (N .LE. 0) GO TO 30 IF (INCX.EQ.INCY .AND. INCX.GT.0) GO TO 40 C C Code for unequal or nonpositive increments. C KX = 1 KY = 1 IF (INCX .LT. 0) KX = 1+(1-N)*INCX IF (INCY .LT. 0) KY = 1+(1-N)*INCY DO 10 I = 1,N DSDOT = DSDOT + DBLE(SX(KX))*DBLE(SY(KY)) KX = KX + INCX KY = KY + INCY 10 CONTINUE 30 SDSDOTF = DSDOT RETURN C C Code for equal and positive increments. C 40 NS = N*INCX DO 50 I = 1,NS,INCX DSDOT = DSDOT + DBLE(SX(I))*DBLE(SY(I)) 50 CONTINUE SDSDOTF = DSDOT RETURN END OpenBLAS-0.2.20/reference/sgbmvf.f000066400000000000000000000221171313527062700165400ustar00rootroot00000000000000 SUBROUTINE SGBMVF( TRANS, M, N, KL, KU, ALPHA, A, LDA, X, INCX, $ BETA, Y, INCY ) * .. Scalar Arguments .. REAL ALPHA, BETA INTEGER INCX, INCY, KL, KU, LDA, M, N CHARACTER*1 TRANS * .. Array Arguments .. REAL A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * SGBMV performs one of the matrix-vector operations * * y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, * * where alpha and beta are scalars, x and y are vectors and A is an * m by n band matrix, with kl sub-diagonals and ku super-diagonals. * * Parameters * ========== * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' y := alpha*A*x + beta*y. * * TRANS = 'T' or 't' y := alpha*A'*x + beta*y. * * TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix A. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * Unchanged on exit. * * KL - INTEGER. * On entry, KL specifies the number of sub-diagonals of the * matrix A. KL must satisfy 0 .le. KL. * Unchanged on exit. * * KU - INTEGER. * On entry, KU specifies the number of super-diagonals of the * matrix A. KU must satisfy 0 .le. KU. * Unchanged on exit. * * ALPHA - REAL . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - REAL array of DIMENSION ( LDA, n ). 
* Before entry, the leading ( kl + ku + 1 ) by n part of the * array A must contain the matrix of coefficients, supplied * column by column, with the leading diagonal of the matrix in * row ( ku + 1 ) of the array, the first super-diagonal * starting at position 2 in row ku, the first sub-diagonal * starting at position 1 in row ( ku + 2 ), and so on. * Elements in the array A that do not correspond to elements * in the band matrix (such as the top left ku by ku triangle) * are not referenced. * The following program segment will transfer a band matrix * from conventional full matrix storage to band storage: * * DO 20, J = 1, N * K = KU + 1 - J * DO 10, I = MAX( 1, J - KU ), MIN( M, J + KL ) * A( K + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * ( kl + ku + 1 ). * Unchanged on exit. * * X - REAL array of DIMENSION at least * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. * Before entry, the incremented array X must contain the * vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - REAL . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y - REAL array of DIMENSION at least * ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. * Before entry, the incremented array Y must contain the * vector y. On exit, Y is overwritten by the updated vector y. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * .. Parameters .. REAL ONE , ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. Local Scalars .. REAL TEMP INTEGER I, INFO, IX, IY, J, JX, JY, K, KUP1, KX, KY, $ LENX, LENY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 1 ELSE IF( M.LT.0 )THEN INFO = 2 ELSE IF( N.LT.0 )THEN INFO = 3 ELSE IF( KL.LT.0 )THEN INFO = 4 ELSE IF( KU.LT.0 )THEN INFO = 5 ELSE IF( LDA.LT.( KL + KU + 1 ) )THEN INFO = 8 ELSE IF( INCX.EQ.0 )THEN INFO = 10 ELSE IF( INCY.EQ.0 )THEN INFO = 13 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'SGBMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set LENX and LENY, the lengths of the vectors x and y, and set * up the start points in X and Y. * IF( LSAME( TRANS, 'N' ) )THEN LENX = N LENY = M ELSE LENX = M LENY = N END IF IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( LENX - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( LENY - 1 )*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the band part of A. * * First form y := beta*y. 
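*
*     Added note: with the band storage described for A above, a
*     full-matrix element matrix( I, J ) inside the band is held in
*     A( KU + 1 + I - J, J ), which is why the loops further down set
*     K = KUP1 - J and run over
*     I = MAX( 1, J - KU ), ..., MIN( M, J + KL ).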
* IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, LENY Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, LENY Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, LENY Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, LENY Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN KUP1 = KU + 1 IF( LSAME( TRANS, 'N' ) )THEN * * Form y := alpha*A*x + y. * JX = KX IF( INCY.EQ.1 )THEN DO 60, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*X( JX ) K = KUP1 - J DO 50, I = MAX( 1, J - KU ), MIN( M, J + KL ) Y( I ) = Y( I ) + TEMP*A( K + I, J ) 50 CONTINUE END IF JX = JX + INCX 60 CONTINUE ELSE DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*X( JX ) IY = KY K = KUP1 - J DO 70, I = MAX( 1, J - KU ), MIN( M, J + KL ) Y( IY ) = Y( IY ) + TEMP*A( K + I, J ) IY = IY + INCY 70 CONTINUE END IF JX = JX + INCX IF( J.GT.KU ) $ KY = KY + INCY 80 CONTINUE END IF ELSE * * Form y := alpha*A'*x + y. * JY = KY IF( INCX.EQ.1 )THEN DO 100, J = 1, N TEMP = ZERO K = KUP1 - J DO 90, I = MAX( 1, J - KU ), MIN( M, J + KL ) TEMP = TEMP + A( K + I, J )*X( I ) 90 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP JY = JY + INCY 100 CONTINUE ELSE DO 120, J = 1, N TEMP = ZERO IX = KX K = KUP1 - J DO 110, I = MAX( 1, J - KU ), MIN( M, J + KL ) TEMP = TEMP + A( K + I, J )*X( IX ) IX = IX + INCX 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP JY = JY + INCY IF( J.GT.KU ) $ KX = KX + INCX 120 CONTINUE END IF END IF * RETURN * * End of SGBMV . * END OpenBLAS-0.2.20/reference/sgemmf.f000066400000000000000000000225651313527062700165410ustar00rootroot00000000000000 SUBROUTINE SGEMMF(TRANA,TRANB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) * .. Scalar Arguments .. REAL ALPHA,BETA INTEGER K,LDA,LDB,LDC,M,N CHARACTER TRANA,TRANB * .. * .. Array Arguments .. REAL A(LDA,*),B(LDB,*),C(LDC,*) * .. * * Purpose * ======= * * SGEMM performs one of the matrix-matrix operations * * C := alpha*op( A )*op( B ) + beta*C, * * where op( X ) is one of * * op( X ) = X or op( X ) = X', * * alpha and beta are scalars, and A, B and C are matrices, with op( A ) * an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. * * Arguments * ========== * * TRANA - CHARACTER*1. * On entry, TRANA specifies the form of op( A ) to be used in * the matrix multiplication as follows: * * TRANA = 'N' or 'n', op( A ) = A. * * TRANA = 'T' or 't', op( A ) = A'. * * TRANA = 'C' or 'c', op( A ) = A'. * * Unchanged on exit. * * TRANB - CHARACTER*1. * On entry, TRANB specifies the form of op( B ) to be used in * the matrix multiplication as follows: * * TRANB = 'N' or 'n', op( B ) = B. * * TRANB = 'T' or 't', op( B ) = B'. * * TRANB = 'C' or 'c', op( B ) = B'. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix * op( A ) and of the matrix C. M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix * op( B ) and the number of columns of the matrix C. N must be * at least zero. * Unchanged on exit. * * K - INTEGER. * On entry, K specifies the number of columns of the matrix * op( A ) and the number of rows of the matrix op( B ). K must * be at least zero. * Unchanged on exit. * * ALPHA - REAL . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - REAL array of DIMENSION ( LDA, ka ), where ka is * k when TRANA = 'N' or 'n', and is m otherwise. 
* Before entry with TRANA = 'N' or 'n', the leading m by k * part of the array A must contain the matrix A, otherwise * the leading k by m part of the array A must contain the * matrix A. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When TRANA = 'N' or 'n' then * LDA must be at least max( 1, m ), otherwise LDA must be at * least max( 1, k ). * Unchanged on exit. * * B - REAL array of DIMENSION ( LDB, kb ), where kb is * n when TRANB = 'N' or 'n', and is k otherwise. * Before entry with TRANB = 'N' or 'n', the leading k by n * part of the array B must contain the matrix B, otherwise * the leading n by k part of the array B must contain the * matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. When TRANB = 'N' or 'n' then * LDB must be at least max( 1, k ), otherwise LDB must be at * least max( 1, n ). * Unchanged on exit. * * BETA - REAL . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then C need not be set on input. * Unchanged on exit. * * C - REAL array of DIMENSION ( LDC, n ). * Before entry, the leading m by n part of the array C must * contain the matrix C, except when beta is zero, in which * case C need not be set on entry. * On exit, the array C is overwritten by the m by n matrix * ( alpha*op( A )*op( B ) + beta*C ). * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Local Scalars .. REAL TEMP INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB LOGICAL NOTA,NOTB * .. * .. Parameters .. REAL ONE,ZERO PARAMETER (ONE=1.0E+0,ZERO=0.0E+0) * .. * * Set NOTA and NOTB as true if A and B respectively are not * transposed and set NROWA, NCOLA and NROWB as the number of rows * and columns of A and the number of rows of B respectively. * NOTA = LSAME(TRANA,'N') NOTB = LSAME(TRANB,'N') IF (NOTA) THEN NROWA = M NCOLA = K ELSE NROWA = K NCOLA = M END IF IF (NOTB) THEN NROWB = K ELSE NROWB = N END IF * * Test the input parameters. * INFO = 0 IF ((.NOT.NOTA) .AND. (.NOT.LSAME(TRANA,'C')) .AND. + (.NOT.LSAME(TRANA,'T'))) THEN INFO = 1 ELSE IF ((.NOT.NOTB) .AND. (.NOT.LSAME(TRANB,'C')) .AND. + (.NOT.LSAME(TRANB,'T'))) THEN INFO = 2 ELSE IF (M.LT.0) THEN INFO = 3 ELSE IF (N.LT.0) THEN INFO = 4 ELSE IF (K.LT.0) THEN INFO = 5 ELSE IF (LDA.LT.MAX(1,NROWA)) THEN INFO = 8 ELSE IF (LDB.LT.MAX(1,NROWB)) THEN INFO = 10 ELSE IF (LDC.LT.MAX(1,M)) THEN INFO = 13 END IF IF (INFO.NE.0) THEN CALL XERBLA('SGEMM ',INFO) RETURN END IF * * Quick return if possible. * IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN * * And if alpha.eq.zero. * IF (ALPHA.EQ.ZERO) THEN IF (BETA.EQ.ZERO) THEN DO 20 J = 1,N DO 10 I = 1,M C(I,J) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1,N DO 30 I = 1,M C(I,J) = BETA*C(I,J) 30 CONTINUE 40 CONTINUE END IF RETURN END IF * * Start the operations. * IF (NOTB) THEN IF (NOTA) THEN * * Form C := alpha*A*B + beta*C. 
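*
*           Added note: this branch uses the saxpy-style j-l-i loop
*           ordering.  Each column C( :, j ) is first scaled by beta
*           and then updated, for l = 1, ..., k, by
*
*              C( 1:m, j ) := C( 1:m, j ) + alpha*B( l, j )*A( 1:m, l )
*
*           so that A and C are traversed down their columns, i.e.
*           with stride one in Fortran column-major storage.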
* DO 90 J = 1,N IF (BETA.EQ.ZERO) THEN DO 50 I = 1,M C(I,J) = ZERO 50 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 60 I = 1,M C(I,J) = BETA*C(I,J) 60 CONTINUE END IF DO 80 L = 1,K IF (B(L,J).NE.ZERO) THEN TEMP = ALPHA*B(L,J) DO 70 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 70 CONTINUE END IF 80 CONTINUE 90 CONTINUE ELSE * * Form C := alpha*A'*B + beta*C * DO 120 J = 1,N DO 110 I = 1,M TEMP = ZERO DO 100 L = 1,K TEMP = TEMP + A(L,I)*B(L,J) 100 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 110 CONTINUE 120 CONTINUE END IF ELSE IF (NOTA) THEN * * Form C := alpha*A*B' + beta*C * DO 170 J = 1,N IF (BETA.EQ.ZERO) THEN DO 130 I = 1,M C(I,J) = ZERO 130 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 140 I = 1,M C(I,J) = BETA*C(I,J) 140 CONTINUE END IF DO 160 L = 1,K IF (B(J,L).NE.ZERO) THEN TEMP = ALPHA*B(J,L) DO 150 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 150 CONTINUE END IF 160 CONTINUE 170 CONTINUE ELSE * * Form C := alpha*A'*B' + beta*C * DO 200 J = 1,N DO 190 I = 1,M TEMP = ZERO DO 180 L = 1,K TEMP = TEMP + A(L,I)*B(J,L) 180 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 190 CONTINUE 200 CONTINUE END IF END IF * RETURN * * End of SGEMM . * END OpenBLAS-0.2.20/reference/sgemvf.f000066400000000000000000000162751313527062700165530ustar00rootroot00000000000000 SUBROUTINE SGEMVF ( TRANS, M, N, ALPHA, A, LDA, X, INCX, $ BETA, Y, INCY ) * .. Scalar Arguments .. REAL ALPHA, BETA INTEGER INCX, INCY, LDA, M, N CHARACTER*1 TRANS * .. Array Arguments .. REAL A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * SGEMV performs one of the matrix-vector operations * * y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, * * where alpha and beta are scalars, x and y are vectors and A is an * m by n matrix. * * Parameters * ========== * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' y := alpha*A*x + beta*y. * * TRANS = 'T' or 't' y := alpha*A'*x + beta*y. * * TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix A. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - REAL . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - REAL array of DIMENSION ( LDA, n ). * Before entry, the leading m by n part of the array A must * contain the matrix of coefficients. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, m ). * Unchanged on exit. * * X - REAL array of DIMENSION at least * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. * Before entry, the incremented array X must contain the * vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - REAL . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y - REAL array of DIMENSION at least * ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. * Before entry with BETA non-zero, the incremented array Y * must contain the vector y. 
On exit, Y is overwritten by the * updated vector y. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. REAL ONE , ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. Local Scalars .. REAL TEMP INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY, LENX, LENY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 1 ELSE IF( M.LT.0 )THEN INFO = 2 ELSE IF( N.LT.0 )THEN INFO = 3 ELSE IF( LDA.LT.MAX( 1, M ) )THEN INFO = 6 ELSE IF( INCX.EQ.0 )THEN INFO = 8 ELSE IF( INCY.EQ.0 )THEN INFO = 11 END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set LENX and LENY, the lengths of the vectors x and y, and set * up the start points in X and Y. * IF( LSAME( TRANS, 'N' ) )THEN LENX = N LENY = M ELSE LENX = M LENY = N END IF IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( LENX - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( LENY - 1 )*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * * First form y := beta*y. * IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, LENY Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, LENY Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, LENY Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, LENY Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN IF( LSAME( TRANS, 'N' ) )THEN * * Form y := alpha*A*x + y. * JX = KX IF( INCY.EQ.1 )THEN DO 60, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*X( JX ) DO 50, I = 1, M Y( I ) = Y( I ) + TEMP*A( I, J ) 50 CONTINUE END IF JX = JX + INCX 60 CONTINUE ELSE DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*X( JX ) IY = KY DO 70, I = 1, M Y( IY ) = Y( IY ) + TEMP*A( I, J ) IY = IY + INCY 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE END IF ELSE * * Form y := alpha*A'*x + y. * JY = KY IF( INCX.EQ.1 )THEN DO 100, J = 1, N TEMP = ZERO DO 90, I = 1, M TEMP = TEMP + A( I, J )*X( I ) 90 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP JY = JY + INCY 100 CONTINUE ELSE DO 120, J = 1, N TEMP = ZERO IX = KX DO 110, I = 1, M TEMP = TEMP + A( I, J )*X( IX ) IX = IX + INCX 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP JY = JY + INCY 120 CONTINUE END IF END IF * RETURN * * End of SGEMV . * END OpenBLAS-0.2.20/reference/sgerf.f000066400000000000000000000104171313527062700163620ustar00rootroot00000000000000 SUBROUTINE SGERF ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA ) * .. Scalar Arguments .. REAL ALPHA INTEGER INCX, INCY, LDA, M, N * .. Array Arguments .. REAL A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * SGER performs the rank 1 operation * * A := alpha*x*y' + A, * * where alpha is a scalar, x is an m element vector, y is an n element * vector and A is an m by n matrix. * * Parameters * ========== * * M - INTEGER. * On entry, M specifies the number of rows of the matrix A. * M must be at least zero. 
* Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - REAL . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - REAL array of dimension at least * ( 1 + ( m - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the m * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. * Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * A - REAL array of DIMENSION ( LDA, n ). * Before entry, the leading m by n part of the array A must * contain the matrix of coefficients. On exit, A is * overwritten by the updated matrix. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, m ). * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. Local Scalars .. REAL TEMP INTEGER I, INFO, IX, J, JY, KX * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( M.LT.0 )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( INCY.EQ.0 )THEN INFO = 7 ELSE IF( LDA.LT.MAX( 1, M ) )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'SGER ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * IF( INCY.GT.0 )THEN JY = 1 ELSE JY = 1 - ( N - 1 )*INCY END IF IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( Y( JY ).NE.ZERO )THEN TEMP = ALPHA*Y( JY ) DO 10, I = 1, M A( I, J ) = A( I, J ) + X( I )*TEMP 10 CONTINUE END IF JY = JY + INCY 20 CONTINUE ELSE IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( M - 1 )*INCX END IF DO 40, J = 1, N IF( Y( JY ).NE.ZERO )THEN TEMP = ALPHA*Y( JY ) IX = KX DO 30, I = 1, M A( I, J ) = A( I, J ) + X( IX )*TEMP IX = IX + INCX 30 CONTINUE END IF JY = JY + INCY 40 CONTINUE END IF * RETURN * * End of SGER . * END OpenBLAS-0.2.20/reference/sgesvf.f000066400000000000000000000063611313527062700165540ustar00rootroot00000000000000 SUBROUTINE SGESVF( N, NRHS, A, LDA, IPIV, B, LDB, INFO ) * * -- LAPACK driver routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER INFO, LDA, LDB, N, NRHS * .. * .. Array Arguments .. INTEGER IPIV( * ) REAL A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * SGESV computes the solution to a real system of linear equations * A * X = B, * where A is an N-by-N matrix and X and B are N-by-NRHS matrices. * * The LU decomposition with partial pivoting and row interchanges is * used to factor A as * A = P * L * U, * where P is a permutation matrix, L is unit lower triangular, and U is * upper triangular. 
The factored form of A is then used to solve the * system of equations A * X = B. * * Arguments * ========= * * N (input) INTEGER * The number of linear equations, i.e., the order of the * matrix A. N >= 0. * * NRHS (input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrix B. NRHS >= 0. * * A (input/output) REAL array, dimension (LDA,N) * On entry, the N-by-N coefficient matrix A. * On exit, the factors L and U from the factorization * A = P*L*U; the unit diagonal elements of L are not stored. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * IPIV (output) INTEGER array, dimension (N) * The pivot indices that define the permutation matrix P; * row i of the matrix was interchanged with row IPIV(i). * * B (input/output) REAL array, dimension (LDB,NRHS) * On entry, the N-by-NRHS matrix of right hand side matrix B. * On exit, if INFO = 0, the N-by-NRHS solution matrix X. * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, U(i,i) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, so the solution could not be computed. * * ===================================================================== * * .. External Subroutines .. EXTERNAL SGETRF, SGETRS, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( N.LT.0 ) THEN INFO = -1 ELSE IF( NRHS.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGESV ', -INFO ) RETURN END IF * * Compute the LU factorization of A. * CALL SGETRF( N, N, A, LDA, IPIV, INFO ) IF( INFO.EQ.0 ) THEN * * Solve the system A*X = B, overwriting B with X. * CALL SGETRS( 'No transpose', N, NRHS, A, LDA, IPIV, B, LDB, $ INFO ) END IF RETURN * * End of SGESV * END OpenBLAS-0.2.20/reference/sgetf2f.f000066400000000000000000000073271313527062700166220ustar00rootroot00000000000000 SUBROUTINE SGETF2F( M, N, A, LDA, IPIV, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * June 30, 1992 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N * .. * .. Array Arguments .. INTEGER IPIV( * ) REAL A( LDA, * ) * .. * * Purpose * ======= * * SGETF2 computes an LU factorization of a general m-by-n matrix A * using partial pivoting with row interchanges. * * The factorization has the form * A = P * L * U * where P is a permutation matrix, L is lower triangular with unit * diagonal elements (lower trapezoidal if m > n), and U is upper * triangular (upper trapezoidal if m < n). * * This is the right-looking Level 2 BLAS version of the algorithm. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (input/output) REAL array, dimension (LDA,N) * On entry, the m by n matrix to be factored. * On exit, the factors L and U from the factorization * A = P*L*U; the unit diagonal elements of L are not stored. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,M). 
* * IPIV (output) INTEGER array, dimension (min(M,N)) * The pivot indices; for 1 <= i <= min(M,N), row i of the * matrix was interchanged with row IPIV(i). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * > 0: if INFO = k, U(k,k) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. INTEGER J, JP * .. * .. External Functions .. INTEGER ISAMAX EXTERNAL ISAMAX * .. * .. External Subroutines .. EXTERNAL SGER, SSCAL, SSWAP, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGETF2', -INFO ) RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * DO 10 J = 1, MIN( M, N ) * * Find pivot and test for singularity. * JP = J - 1 + ISAMAX( M-J+1, A( J, J ), 1 ) IPIV( J ) = JP IF( A( JP, J ).NE.ZERO ) THEN * * Apply the interchange to columns 1:N. * IF( JP.NE.J ) $ CALL SSWAP( N, A( J, 1 ), LDA, A( JP, 1 ), LDA ) * * Compute elements J+1:M of J-th column. * IF( J.LT.M ) $ CALL SSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) * ELSE IF( INFO.EQ.0 ) THEN * INFO = J END IF * IF( J.LT.MIN( M, N ) ) THEN * * Update trailing submatrix. * CALL SGER( M-J, N-J, -ONE, A( J+1, J ), 1, A( J, J+1 ), LDA, $ A( J+1, J+1 ), LDA ) END IF 10 CONTINUE RETURN * * End of SGETF2 * END OpenBLAS-0.2.20/reference/sgetrff.f000066400000000000000000000107451313527062700167200ustar00rootroot00000000000000 SUBROUTINE SGETRFF( M, N, A, LDA, IPIV, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * March 31, 1993 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N * .. * .. Array Arguments .. INTEGER IPIV( * ) REAL A( LDA, * ) * .. * * Purpose * ======= * * SGETRF computes an LU factorization of a general M-by-N matrix A * using partial pivoting with row interchanges. * * The factorization has the form * A = P * L * U * where P is a permutation matrix, L is lower triangular with unit * diagonal elements (lower trapezoidal if m > n), and U is upper * triangular (upper trapezoidal if m < n). * * This is the right-looking Level 3 BLAS version of the algorithm. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (input/output) REAL array, dimension (LDA,N) * On entry, the M-by-N matrix to be factored. * On exit, the factors L and U from the factorization * A = P*L*U; the unit diagonal elements of L are not stored. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,M). * * IPIV (output) INTEGER array, dimension (min(M,N)) * The pivot indices; for 1 <= i <= min(M,N), row i of the * matrix was interchanged with row IPIV(i). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, U(i,i) is exactly zero. 
The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. INTEGER I, IINFO, J, JB, NB * .. * .. External Subroutines .. EXTERNAL SGEMM, SGETF2, SLASWP, STRSM, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGETRF', -INFO ) RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Determine the block size for this environment. * NB = 64 IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN * * Use unblocked code. * CALL SGETF2( M, N, A, LDA, IPIV, INFO ) ELSE * * Use blocked code. * DO 20 J = 1, MIN( M, N ), NB JB = MIN( MIN( M, N )-J+1, NB ) * * Factor diagonal and subdiagonal blocks and test for exact * singularity. * CALL SGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) * * Adjust INFO and the pivot indices. * IF( INFO.EQ.0 .AND. IINFO.GT.0 ) $ INFO = IINFO + J - 1 DO 10 I = J, MIN( M, J+JB-1 ) IPIV( I ) = J - 1 + IPIV( I ) 10 CONTINUE * * Apply interchanges to columns 1:J-1. * CALL SLASWP( J-1, A, LDA, J, J+JB-1, IPIV, 1 ) * IF( J+JB.LE.N ) THEN * * Apply interchanges to columns J+JB:N. * CALL SLASWP( N-J-JB+1, A( 1, J+JB ), LDA, J, J+JB-1, $ IPIV, 1 ) * * Compute block row of U. * CALL STRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, $ N-J-JB+1, ONE, A( J, J ), LDA, A( J, J+JB ), $ LDA ) IF( J+JB.LE.M ) THEN * * Update trailing submatrix. * CALL SGEMM( 'No transpose', 'No transpose', M-J-JB+1, $ N-J-JB+1, JB, -ONE, A( J+JB, J ), LDA, $ A( J, J+JB ), LDA, ONE, A( J+JB, J+JB ), $ LDA ) END IF END IF 20 CONTINUE END IF RETURN * * End of SGETRF * END OpenBLAS-0.2.20/reference/sgetrsf.f000066400000000000000000000101331313527062700167240ustar00rootroot00000000000000 SUBROUTINE SGETRSF( TRANS, N, NRHS, A, LDA, IPIV, B, LDB, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * March 31, 1993 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER INFO, LDA, LDB, N, NRHS * .. * .. Array Arguments .. INTEGER IPIV( * ) REAL A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * SGETRS solves a system of linear equations * A * X = B or A' * X = B * with a general N-by-N matrix A using the LU factorization computed * by SGETRF. * * Arguments * ========= * * TRANS (input) CHARACTER*1 * Specifies the form of the system of equations: * = 'N': A * X = B (No transpose) * = 'T': A'* X = B (Transpose) * = 'C': A'* X = B (Conjugate transpose = Transpose) * * N (input) INTEGER * The order of the matrix A. N >= 0. * * NRHS (input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrix B. NRHS >= 0. * * A (input) REAL array, dimension (LDA,N) * The factors L and U from the factorization A = P*L*U * as computed by SGETRF. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * IPIV (input) INTEGER array, dimension (N) * The pivot indices from SGETRF; for 1<=i<=N, row i of the * matrix was interchanged with row IPIV(i). * * B (input/output) REAL array, dimension (LDB,NRHS) * On entry, the right hand side matrix B. 
* On exit, the solution matrix X. * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL NOTRAN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL SLASWP, STRSM, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 NOTRAN = LSAME( TRANS, 'N' ) IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. $ LSAME( TRANS, 'C' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( NRHS.LT.0 ) THEN INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGETRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. NRHS.EQ.0 ) $ RETURN * IF( NOTRAN ) THEN * * Solve A * X = B. * * Apply row interchanges to the right hand sides. * CALL SLASWP( NRHS, B, LDB, 1, N, IPIV, 1 ) * * Solve L*X = B, overwriting B with X. * CALL STRSM( 'Left', 'Lower', 'No transpose', 'Unit', N, NRHS, $ ONE, A, LDA, B, LDB ) * * Solve U*X = B, overwriting B with X. * CALL STRSM( 'Left', 'Upper', 'No transpose', 'Non-unit', N, $ NRHS, ONE, A, LDA, B, LDB ) ELSE * * Solve A' * X = B. * * Solve U'*X = B, overwriting B with X. * CALL STRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', N, NRHS, $ ONE, A, LDA, B, LDB ) * * Solve L'*X = B, overwriting B with X. * CALL STRSM( 'Left', 'Lower', 'Transpose', 'Unit', N, NRHS, ONE, $ A, LDA, B, LDB ) * * Apply row interchanges to the solution vectors. * CALL SLASWP( NRHS, B, LDB, 1, N, IPIV, -1 ) END IF * RETURN * * End of SGETRS * END OpenBLAS-0.2.20/reference/slaswpf.f000066400000000000000000000063531313527062700167370ustar00rootroot00000000000000 SUBROUTINE SLASWPF( N, A, LDA, K1, K2, IPIV, INCX ) * * -- LAPACK auxiliary routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * June 30, 1999 * * .. Scalar Arguments .. INTEGER INCX, K1, K2, LDA, N * .. * .. Array Arguments .. INTEGER IPIV( * ) REAL A( LDA, * ) * .. * * Purpose * ======= * * SLASWP performs a series of row interchanges on the matrix A. * One row interchange is initiated for each of rows K1 through K2 of A. * * Arguments * ========= * * N (input) INTEGER * The number of columns of the matrix A. * * A (input/output) REAL array, dimension (LDA,N) * On entry, the matrix of column dimension N to which the row * interchanges will be applied. * On exit, the permuted matrix. * * LDA (input) INTEGER * The leading dimension of the array A. * * K1 (input) INTEGER * The first element of IPIV for which a row interchange will * be done. * * K2 (input) INTEGER * The last element of IPIV for which a row interchange will * be done. * * IPIV (input) INTEGER array, dimension (M*abs(INCX)) * The vector of pivot indices. Only the elements in positions * K1 through K2 of IPIV are accessed. * IPIV(K) = L implies rows K and L are to be interchanged. * * INCX (input) INTEGER * The increment between successive values of IPIV. If IPIV * is negative, the pivots are applied in reverse order. * * Further Details * =============== * * Modified by * R. C. Whaley, Computer Science Dept., Univ. 
of Tenn., Knoxville, USA * * ===================================================================== * * .. Local Scalars .. INTEGER I, I1, I2, INC, IP, IX, IX0, J, K, N32 REAL TEMP * .. * .. Executable Statements .. * * Interchange row I with row IPIV(I) for each of rows K1 through K2. * IF( INCX.GT.0 ) THEN IX0 = K1 I1 = K1 I2 = K2 INC = 1 ELSE IF( INCX.LT.0 ) THEN IX0 = 1 + ( 1-K2 )*INCX I1 = K2 I2 = K1 INC = -1 ELSE RETURN END IF * N32 = ( N / 32 )*32 IF( N32.NE.0 ) THEN DO 30 J = 1, N32, 32 IX = IX0 DO 20 I = I1, I2, INC IP = IPIV( IX ) IF( IP.NE.I ) THEN DO 10 K = J, J + 31 TEMP = A( I, K ) A( I, K ) = A( IP, K ) A( IP, K ) = TEMP 10 CONTINUE END IF IX = IX + INCX 20 CONTINUE 30 CONTINUE END IF IF( N32.NE.N ) THEN N32 = N32 + 1 IX = IX0 DO 50 I = I1, I2, INC IP = IPIV( IX ) IF( IP.NE.I ) THEN DO 40 K = N32, N TEMP = A( I, K ) A( I, K ) = A( IP, K ) A( IP, K ) = TEMP 40 CONTINUE END IF IX = IX + INCX 50 CONTINUE END IF * RETURN * * End of SLASWP * END OpenBLAS-0.2.20/reference/slauu2f.f000066400000000000000000000073631313527062700166430ustar00rootroot00000000000000 SUBROUTINE SLAUU2F( UPLO, N, A, LDA, INFO ) * * -- LAPACK auxiliary routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. REAL A( LDA, * ) * .. * * Purpose * ======= * * SLAUU2 computes the product U * U' or L' * L, where the triangular * factor U or L is stored in the upper or lower triangular part of * the array A. * * If UPLO = 'U' or 'u' then the upper triangle of the result is stored, * overwriting the factor U in A. * If UPLO = 'L' or 'l' then the lower triangle of the result is stored, * overwriting the factor L in A. * * This is the unblocked form of the algorithm, calling Level 2 BLAS. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether the triangular factor stored in the array A * is upper or lower triangular: * = 'U': Upper triangular * = 'L': Lower triangular * * N (input) INTEGER * The order of the triangular factor U or L. N >= 0. * * A (input/output) REAL array, dimension (LDA,N) * On entry, the triangular factor U or L. * On exit, if UPLO = 'U', the upper triangle of A is * overwritten with the upper triangle of the product U * U'; * if UPLO = 'L', the lower triangle of A is overwritten with * the lower triangle of the product L' * L. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER I REAL AII * .. * .. External Functions .. LOGICAL LSAME REAL SDOT EXTERNAL LSAME, SDOT * .. * .. External Subroutines .. EXTERNAL SGEMV, SSCAL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLAUU2', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( UPPER ) THEN * * Compute the product U * U'. 
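*           Column I of the product is formed in place: SDOT replaces
*           the diagonal entry with the dot product of row I of U with
*           itself, and SGEMV adds the contribution of columns I+1:N
*           to the first I-1 entries of column I, after scaling them
*           by the old diagonal value AII.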
* DO 10 I = 1, N AII = A( I, I ) IF( I.LT.N ) THEN A( I, I ) = SDOT( N-I+1, A( I, I ), LDA, A( I, I ), LDA ) CALL SGEMV( 'No transpose', I-1, N-I, ONE, A( 1, I+1 ), $ LDA, A( I, I+1 ), LDA, AII, A( 1, I ), 1 ) ELSE CALL SSCAL( I, AII, A( 1, I ), 1 ) END IF 10 CONTINUE * ELSE * * Compute the product L' * L. * DO 20 I = 1, N AII = A( I, I ) IF( I.LT.N ) THEN A( I, I ) = SDOT( N-I+1, A( I, I ), 1, A( I, I ), 1 ) CALL SGEMV( 'Transpose', N-I, I-1, ONE, A( I+1, 1 ), LDA, $ A( I+1, I ), 1, AII, A( I, 1 ), LDA ) ELSE CALL SSCAL( I, AII, A( I, 1 ), LDA ) END IF 20 CONTINUE END IF * RETURN * * End of SLAUU2 * END OpenBLAS-0.2.20/reference/slauumf.f000066400000000000000000000113171313527062700167300ustar00rootroot00000000000000 SUBROUTINE SLAUUMF( UPLO, N, A, LDA, INFO ) * * -- LAPACK auxiliary routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * February 29, 1992 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. REAL A( LDA, * ) * .. * * Purpose * ======= * * SLAUUM computes the product U * U' or L' * L, where the triangular * factor U or L is stored in the upper or lower triangular part of * the array A. * * If UPLO = 'U' or 'u' then the upper triangle of the result is stored, * overwriting the factor U in A. * If UPLO = 'L' or 'l' then the lower triangle of the result is stored, * overwriting the factor L in A. * * This is the blocked form of the algorithm, calling Level 3 BLAS. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether the triangular factor stored in the array A * is upper or lower triangular: * = 'U': Upper triangular * = 'L': Lower triangular * * N (input) INTEGER * The order of the triangular factor U or L. N >= 0. * * A (input/output) REAL array, dimension (LDA,N) * On entry, the triangular factor U or L. * On exit, if UPLO = 'U', the upper triangle of A is * overwritten with the upper triangle of the product U * U'; * if UPLO = 'L', the lower triangle of A is overwritten with * the lower triangle of the product L' * L. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER I, IB, NB * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV EXTERNAL LSAME, ILAENV * .. * .. External Subroutines .. EXTERNAL SGEMM, SLAUU2, SSYRK, STRMM, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLAUUM', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Determine the block size for this environment. * NB = 128 * IF( NB.LE.1 .OR. NB.GE.N ) THEN * * Use unblocked code * CALL SLAUU2( UPLO, N, A, LDA, INFO ) ELSE * * Use blocked code * IF( UPPER ) THEN * * Compute the product U * U'. 
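*           Each pass updates block column I (width IB): STRMM applies
*           the transpose of the diagonal block to the rows above it,
*           SLAUU2 forms the diagonal block product, and any trailing
*           columns are folded in with SGEMM and SSYRK.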
* DO 10 I = 1, N, NB IB = MIN( NB, N-I+1 ) CALL STRMM( 'Right', 'Upper', 'Transpose', 'Non-unit', $ I-1, IB, ONE, A( I, I ), LDA, A( 1, I ), $ LDA ) CALL SLAUU2( 'Upper', IB, A( I, I ), LDA, INFO ) IF( I+IB.LE.N ) THEN CALL SGEMM( 'No transpose', 'Transpose', I-1, IB, $ N-I-IB+1, ONE, A( 1, I+IB ), LDA, $ A( I, I+IB ), LDA, ONE, A( 1, I ), LDA ) CALL SSYRK( 'Upper', 'No transpose', IB, N-I-IB+1, $ ONE, A( I, I+IB ), LDA, ONE, A( I, I ), $ LDA ) END IF 10 CONTINUE ELSE * * Compute the product L' * L. * DO 20 I = 1, N, NB IB = MIN( NB, N-I+1 ) CALL STRMM( 'Left', 'Lower', 'Transpose', 'Non-unit', IB, $ I-1, ONE, A( I, I ), LDA, A( I, 1 ), LDA ) CALL SLAUU2( 'Lower', IB, A( I, I ), LDA, INFO ) IF( I+IB.LE.N ) THEN CALL SGEMM( 'Transpose', 'No transpose', IB, I-1, $ N-I-IB+1, ONE, A( I+IB, I ), LDA, $ A( I+IB, 1 ), LDA, ONE, A( I, 1 ), LDA ) CALL SSYRK( 'Lower', 'Transpose', IB, N-I-IB+1, ONE, $ A( I+IB, I ), LDA, ONE, A( I, I ), LDA ) END IF 20 CONTINUE END IF END IF * RETURN * * End of SLAUUM * END OpenBLAS-0.2.20/reference/smaxf.f000066400000000000000000000014431313527062700163710ustar00rootroot00000000000000 REAL*4 function smaxf(n,dx,incx) c c finds the index of element having max. absolute value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c REAL*4 dx(*) integer i,incx,ix,n c smaxf = 0 if( n.lt.1 .or. incx.le.0 ) return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 smaxf = dx(1) ix = ix + incx do 10 i = 2,n if(dx(ix).le.smaxf) go to 5 smaxf = dx(ix) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 smaxf = dx(1) do 30 i = 2,n if(dx(i).le.smaxf) go to 30 smaxf = dx(i) 30 continue return end OpenBLAS-0.2.20/reference/sminf.f000066400000000000000000000014431313527062700163670ustar00rootroot00000000000000 REAL*4 function sminf(n,dx,incx) c c finds the index of element having min. absolute value. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c REAL*4 dx(*) integer i,incx,ix,n c sminf = 0 if( n.lt.1 .or. incx.le.0 ) return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 sminf = dx(1) ix = ix + incx do 10 i = 2,n if(dx(ix).ge.sminf) go to 5 sminf = dx(ix) 5 ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 sminf = dx(1) do 30 i = 2,n if(dx(i).ge.sminf) go to 30 sminf = dx(i) 30 continue return end OpenBLAS-0.2.20/reference/snrm2f.f000066400000000000000000000031741313527062700164650ustar00rootroot00000000000000 REAL FUNCTION SNRM2F ( N, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, N * .. Array Arguments .. REAL X( * ) * .. * * SNRM2 returns the euclidean norm of a vector via the function * name, so that * * SNRM2 := sqrt( x'*x ) * * * * -- This version written on 25-October-1982. * Modified on 14-October-1993 to inline the call to SLASSQ. * Sven Hammarling, Nag Ltd. * * * .. Parameters .. REAL ONE , ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. Local Scalars .. INTEGER IX REAL ABSXI, NORM, SCALE, SSQ * .. Intrinsic Functions .. INTRINSIC ABS, SQRT * .. * .. Executable Statements .. IF( N.LT.1 .OR. 
INCX.LT.1 )THEN NORM = ZERO ELSE IF( N.EQ.1 )THEN NORM = ABS( X( 1 ) ) ELSE SCALE = ZERO SSQ = ONE * The following loop is equivalent to this call to the LAPACK * auxiliary routine: * CALL SLASSQ( N, X, INCX, SCALE, SSQ ) * DO 10, IX = 1, 1 + ( N - 1 )*INCX, INCX IF( X( IX ).NE.ZERO )THEN ABSXI = ABS( X( IX ) ) IF( SCALE.LT.ABSXI )THEN SSQ = ONE + SSQ*( SCALE/ABSXI )**2 SCALE = ABSXI ELSE SSQ = SSQ + ( ABSXI/SCALE )**2 END IF END IF 10 CONTINUE NORM = SCALE * SQRT( SSQ ) END IF * SNRM2F = NORM RETURN * * End of SNRM2. * END OpenBLAS-0.2.20/reference/spotf2f.f000066400000000000000000000114001313527062700166300ustar00rootroot00000000000000 SUBROUTINE SPOTF2F( UPLO, N, A, LDA, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * February 29, 1992 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. REAL A( LDA, * ) * .. * * Purpose * ======= * * SPOTF2 computes the Cholesky factorization of a real symmetric * positive definite matrix A. * * The factorization has the form * A = U' * U , if UPLO = 'U', or * A = L * L', if UPLO = 'L', * where U is an upper triangular matrix and L is lower triangular. * * This is the unblocked version of the algorithm, calling Level 2 BLAS. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * symmetric matrix A is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input/output) REAL array, dimension (LDA,N) * On entry, the symmetric matrix A. If UPLO = 'U', the leading * n by n upper triangular part of A contains the upper * triangular part of the matrix A, and the strictly lower * triangular part of A is not referenced. If UPLO = 'L', the * leading n by n lower triangular part of A contains the lower * triangular part of the matrix A, and the strictly upper * triangular part of A is not referenced. * * On exit, if INFO = 0, the factor U or L from the Cholesky * factorization A = U'*U or A = L*L'. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * > 0: if INFO = k, the leading minor of order k is not * positive definite, and the factorization could not be * completed. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER J REAL AJJ * .. * .. External Functions .. LOGICAL LSAME REAL SDOT EXTERNAL LSAME, SDOT * .. * .. External Subroutines .. EXTERNAL SGEMV, SSCAL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SPOTF2', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( UPPER ) THEN * * Compute the Cholesky factorization A = U'*U. * DO 10 J = 1, N * * Compute U(J,J) and test for non-positive-definiteness. 
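*           The pivot is A(J,J) minus the squared norm of the already
*           factored part of column J; a non-positive value means the
*           leading minor of order J is not positive definite, and the
*           routine exits with INFO = J.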
* AJJ = A( J, J ) - SDOT( J-1, A( 1, J ), 1, A( 1, J ), 1 ) IF( AJJ.LE.ZERO ) THEN A( J, J ) = AJJ GO TO 30 END IF AJJ = SQRT( AJJ ) A( J, J ) = AJJ * * Compute elements J+1:N of row J. * IF( J.LT.N ) THEN CALL SGEMV( 'Transpose', J-1, N-J, -ONE, A( 1, J+1 ), $ LDA, A( 1, J ), 1, ONE, A( J, J+1 ), LDA ) CALL SSCAL( N-J, ONE / AJJ, A( J, J+1 ), LDA ) END IF 10 CONTINUE ELSE * * Compute the Cholesky factorization A = L*L'. * DO 20 J = 1, N * * Compute L(J,J) and test for non-positive-definiteness. * AJJ = A( J, J ) - SDOT( J-1, A( J, 1 ), LDA, A( J, 1 ), $ LDA ) IF( AJJ.LE.ZERO ) THEN A( J, J ) = AJJ GO TO 30 END IF AJJ = SQRT( AJJ ) A( J, J ) = AJJ * * Compute elements J+1:N of column J. * IF( J.LT.N ) THEN CALL SGEMV( 'No transpose', N-J, J-1, -ONE, A( J+1, 1 ), $ LDA, A( J, 1 ), LDA, ONE, A( J+1, J ), 1 ) CALL SSCAL( N-J, ONE / AJJ, A( J+1, J ), 1 ) END IF 20 CONTINUE END IF GO TO 40 * 30 CONTINUE INFO = J * 40 CONTINUE RETURN * * End of SPOTF2 * END OpenBLAS-0.2.20/reference/spotrff.f000066400000000000000000000127411313527062700167410ustar00rootroot00000000000000 SUBROUTINE SPOTRFF( UPLO, N, A, LDA, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * March 31, 1993 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. REAL A( LDA, * ) * .. * * Purpose * ======= * * SPOTRF computes the Cholesky factorization of a real symmetric * positive definite matrix A. * * The factorization has the form * A = U**T * U, if UPLO = 'U', or * A = L * L**T, if UPLO = 'L', * where U is an upper triangular matrix and L is lower triangular. * * This is the block version of the algorithm, calling Level 3 BLAS. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * = 'U': Upper triangle of A is stored; * = 'L': Lower triangle of A is stored. * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input/output) REAL array, dimension (LDA,N) * On entry, the symmetric matrix A. If UPLO = 'U', the leading * N-by-N upper triangular part of A contains the upper * triangular part of the matrix A, and the strictly lower * triangular part of A is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of A contains the lower * triangular part of the matrix A, and the strictly upper * triangular part of A is not referenced. * * On exit, if INFO = 0, the factor U or L from the Cholesky * factorization A = U**T*U or A = L*L**T. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, the leading minor of order i is not * positive definite, and the factorization could not be * completed. * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER J, JB, NB * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL SGEMM, SPOTF2, SSYRK, STRSM, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) IF( .NOT.UPPER .AND. 
.NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SPOTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Determine the block size for this environment. * NB = 56 IF( NB.LE.1 .OR. NB.GE.N ) THEN * * Use unblocked code. * CALL SPOTF2( UPLO, N, A, LDA, INFO ) ELSE * * Use blocked code. * IF( UPPER ) THEN * * Compute the Cholesky factorization A = U'*U. * DO 10 J = 1, N, NB * * Update and factorize the current diagonal block and test * for non-positive-definiteness. * JB = MIN( NB, N-J+1 ) CALL SSYRK( 'Upper', 'Transpose', JB, J-1, -ONE, $ A( 1, J ), LDA, ONE, A( J, J ), LDA ) CALL SPOTF2( 'Upper', JB, A( J, J ), LDA, INFO ) IF( INFO.NE.0 ) $ GO TO 30 IF( J+JB.LE.N ) THEN * * Compute the current block row. * CALL SGEMM( 'Transpose', 'No transpose', JB, N-J-JB+1, $ J-1, -ONE, A( 1, J ), LDA, A( 1, J+JB ), $ LDA, ONE, A( J, J+JB ), LDA ) CALL STRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', $ JB, N-J-JB+1, ONE, A( J, J ), LDA, $ A( J, J+JB ), LDA ) END IF 10 CONTINUE * ELSE * * Compute the Cholesky factorization A = L*L'. * DO 20 J = 1, N, NB * * Update and factorize the current diagonal block and test * for non-positive-definiteness. * JB = MIN( NB, N-J+1 ) CALL SSYRK( 'Lower', 'No transpose', JB, J-1, -ONE, $ A( J, 1 ), LDA, ONE, A( J, J ), LDA ) CALL SPOTF2( 'Lower', JB, A( J, J ), LDA, INFO ) IF( INFO.NE.0 ) $ GO TO 30 IF( J+JB.LE.N ) THEN * * Compute the current block column. * CALL SGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, $ J-1, -ONE, A( J+JB, 1 ), LDA, A( J, 1 ), $ LDA, ONE, A( J+JB, J ), LDA ) CALL STRSM( 'Right', 'Lower', 'Transpose', 'Non-unit', $ N-J-JB+1, JB, ONE, A( J, J ), LDA, $ A( J+JB, J ), LDA ) END IF 20 CONTINUE END IF END IF GO TO 40 * 30 CONTINUE INFO = INFO + J - 1 * 40 CONTINUE RETURN * * End of SPOTRF * END OpenBLAS-0.2.20/reference/spotrif.f000066400000000000000000000050251313527062700167410ustar00rootroot00000000000000 SUBROUTINE SPOTRIF( UPLO, N, A, LDA, INFO ) * * -- LAPACK routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. REAL A( LDA, * ) * .. * * Purpose * ======= * * SPOTRI computes the inverse of a real symmetric positive definite * matrix A using the Cholesky factorization A = U**T*U or A = L*L**T * computed by SPOTRF. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * = 'U': Upper triangle of A is stored; * = 'L': Lower triangle of A is stored. * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input/output) REAL array, dimension (LDA,N) * On entry, the triangular factor U or L from the Cholesky * factorization A = U**T*U or A = L*L**T, as computed by * SPOTRF. * On exit, the upper or lower triangle of the (symmetric) * inverse of A, overwriting the input factor U or L. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, the (i,i) element of the factor U or L is * zero, and the inverse could not be computed. * * ===================================================================== * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL SLAUUM, STRTRI, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. 
* * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SPOTRI', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Invert the triangular Cholesky factor U or L. * CALL STRTRI( UPLO, 'Non-unit', N, A, LDA, INFO ) IF( INFO.GT.0 ) $ RETURN * * Form inv(U)*inv(U)' or inv(L)'*inv(L). * CALL SLAUUM( UPLO, N, A, LDA, INFO ) * RETURN * * End of SPOTRI * END OpenBLAS-0.2.20/reference/srotf.f000066400000000000000000000016501313527062700164100ustar00rootroot00000000000000 subroutine srotf (n,sx,incx,sy,incy,c,s) c c applies a plane rotation. c jack dongarra, linpack, 3/11/78. c modified 12/3/93, array(1) declarations changed to array(*) c real sx(*),sy(*),stemp,c,s integer i,incx,incy,ix,iy,n c if(n.le.0)return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments not equal c to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n stemp = c*sx(ix) + s*sy(iy) sy(iy) = c*sy(iy) - s*sx(ix) sx(ix) = stemp ix = ix + incx iy = iy + incy 10 continue return c c code for both increments equal to 1 c 20 do 30 i = 1,n stemp = c*sx(i) + s*sy(i) sy(i) = c*sy(i) - s*sx(i) sx(i) = stemp 30 continue return end OpenBLAS-0.2.20/reference/srotgf.f000066400000000000000000000012261313527062700165560ustar00rootroot00000000000000 subroutine srotgf(sa,sb,c,s) c c construct givens plane rotation. c jack dongarra, linpack, 3/11/78. c real sa,sb,c,s,roe,scale,r,z c roe = sb if( abs(sa) .gt. abs(sb) ) roe = sa scale = abs(sa) + abs(sb) if( scale .ne. 0.0 ) go to 10 c = 1.0 s = 0.0 r = 0.0 z = 0.0 go to 20 10 r = scale*sqrt((sa/scale)**2 + (sb/scale)**2) r = sign(1.0,roe)*r c = sa/r s = sb/r z = 1.0 if( abs(sa) .gt. abs(sb) ) z = s if( abs(sb) .ge. abs(sa) .and. c .ne. 0.0 ) z = 1.0/c 20 sa = r sb = z return end OpenBLAS-0.2.20/reference/srotmf.f000066400000000000000000000061571313527062700165740ustar00rootroot00000000000000 SUBROUTINE SROTMF (N,SX,INCX,SY,INCY,SPARAM) C C APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX C C (SX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF SX ARE IN C (DX**T) C C SX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE C LX = (-INCX)*N, AND SIMILARLY FOR SY USING USING LY AND INCY. C WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. C C SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 C C (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) C H=( ) ( ) ( ) ( ) C (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). C SEE SROTMG FOR A DESCRIPTION OF DATA STORAGE IN SPARAM. C DIMENSION SX(1),SY(1),SPARAM(5) DATA ZERO,TWO/0.E0,2.E0/ C SFLAG=SPARAM(1) IF(N .LE. 0 .OR.(SFLAG+TWO.EQ.ZERO)) GO TO 140 IF(.NOT.(INCX.EQ.INCY.AND. INCX .GT.0)) GO TO 70 C NSTEPS=N*INCX IF(SFLAG) 50,10,30 10 CONTINUE SH12=SPARAM(4) SH21=SPARAM(3) DO 20 I=1,NSTEPS,INCX W=SX(I) Z=SY(I) SX(I)=W+Z*SH12 SY(I)=W*SH21+Z 20 CONTINUE GO TO 140 30 CONTINUE SH11=SPARAM(2) SH22=SPARAM(5) DO 40 I=1,NSTEPS,INCX W=SX(I) Z=SY(I) SX(I)=W*SH11+Z SY(I)=-W+SH22*Z 40 CONTINUE GO TO 140 50 CONTINUE SH11=SPARAM(2) SH12=SPARAM(4) SH21=SPARAM(3) SH22=SPARAM(5) DO 60 I=1,NSTEPS,INCX W=SX(I) Z=SY(I) SX(I)=W*SH11+Z*SH12 SY(I)=W*SH21+Z*SH22 60 CONTINUE GO TO 140 70 CONTINUE KX=1 KY=1 IF(INCX .LT. 0) KX=1+(1-N)*INCX IF(INCY .LT. 
0) KY=1+(1-N)*INCY C IF(SFLAG)120,80,100 80 CONTINUE SH12=SPARAM(4) SH21=SPARAM(3) DO 90 I=1,N W=SX(KX) Z=SY(KY) SX(KX)=W+Z*SH12 SY(KY)=W*SH21+Z KX=KX+INCX KY=KY+INCY 90 CONTINUE GO TO 140 100 CONTINUE SH11=SPARAM(2) SH22=SPARAM(5) DO 110 I=1,N W=SX(KX) Z=SY(KY) SX(KX)=W*SH11+Z SY(KY)=-W+SH22*Z KX=KX+INCX KY=KY+INCY 110 CONTINUE GO TO 140 120 CONTINUE SH11=SPARAM(2) SH12=SPARAM(4) SH21=SPARAM(3) SH22=SPARAM(5) DO 130 I=1,N W=SX(KX) Z=SY(KY) SX(KX)=W*SH11+Z*SH12 SY(KY)=W*SH21+Z*SH22 KX=KX+INCX KY=KY+INCY 130 CONTINUE 140 CONTINUE RETURN END OpenBLAS-0.2.20/reference/srotmgf.f000066400000000000000000000112151313527062700167320ustar00rootroot00000000000000 SUBROUTINE SROTMGF (SD1,SD2,SX1,SY1,SPARAM) C C CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS C THE SECOND COMPONENT OF THE 2-VECTOR (SQRT(SD1)*SX1,SQRT(SD2)* C SY2)**T. C WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. C C SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 C C (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) C H=( ) ( ) ( ) ( ) C (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). C LOCATIONS 2-4 OF SPARAM CONTAIN SH11,SH21,SH12, AND SH22 C RESPECTIVELY. (VALUES OF 1.E0, -1.E0, OR 0.E0 IMPLIED BY THE C VALUE OF SPARAM(1) ARE NOT STORED IN SPARAM.) C C THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE C INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE C OF SD1 AND SD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. C DIMENSION SPARAM(5) C DATA ZERO,ONE,TWO /0.E0,1.E0,2.E0/ DATA GAM,GAMSQ,RGAMSQ/4096.E0,1.67772E7,5.96046E-8/ IF(.NOT. SD1 .LT. ZERO) GO TO 10 C GO ZERO-H-D-AND-SX1.. GO TO 60 10 CONTINUE C CASE-SD1-NONNEGATIVE SP2=SD2*SY1 IF(.NOT. SP2 .EQ. ZERO) GO TO 20 SFLAG=-TWO GO TO 260 C REGULAR-CASE.. 20 CONTINUE SP1=SD1*SX1 SQ2=SP2*SY1 SQ1=SP1*SX1 C IF(.NOT. ABS(SQ1) .GT. ABS(SQ2)) GO TO 40 SH21=-SY1/SX1 SH12=SP2/SP1 C SU=ONE-SH12*SH21 C IF(.NOT. SU .LE. ZERO) GO TO 30 C GO ZERO-H-D-AND-SX1.. GO TO 60 30 CONTINUE SFLAG=ZERO SD1=SD1/SU SD2=SD2/SU SX1=SX1*SU C GO SCALE-CHECK.. GO TO 100 40 CONTINUE IF(.NOT. SQ2 .LT. ZERO) GO TO 50 C GO ZERO-H-D-AND-SX1.. GO TO 60 50 CONTINUE SFLAG=ONE SH11=SP1/SP2 SH22=SX1/SY1 SU=ONE+SH11*SH22 STEMP=SD2/SU SD2=SD1/SU SD1=STEMP SX1=SY1*SU C GO SCALE-CHECK GO TO 100 C PROCEDURE..ZERO-H-D-AND-SX1.. 60 CONTINUE SFLAG=-ONE SH11=ZERO SH12=ZERO SH21=ZERO SH22=ZERO C SD1=ZERO SD2=ZERO SX1=ZERO C RETURN.. GO TO 220 C PROCEDURE..FIX-H.. 70 CONTINUE IF(.NOT. SFLAG .GE. ZERO) GO TO 90 C IF(.NOT. SFLAG .EQ. ZERO) GO TO 80 SH11=ONE SH22=ONE SFLAG=-ONE GO TO 90 80 CONTINUE SH21=-ONE SH12=ONE SFLAG=-ONE 90 CONTINUE GO TO IGO,(120,150,180,210) C PROCEDURE..SCALE-CHECK 100 CONTINUE 110 CONTINUE IF(.NOT. SD1 .LE. RGAMSQ) GO TO 130 IF(SD1 .EQ. ZERO) GO TO 160 ASSIGN 120 TO IGO C FIX-H.. GO TO 70 120 CONTINUE SD1=SD1*GAM**2 SX1=SX1/GAM SH11=SH11/GAM SH12=SH12/GAM GO TO 110 130 CONTINUE 140 CONTINUE IF(.NOT. SD1 .GE. GAMSQ) GO TO 160 ASSIGN 150 TO IGO C FIX-H.. GO TO 70 150 CONTINUE SD1=SD1/GAM**2 SX1=SX1*GAM SH11=SH11*GAM SH12=SH12*GAM GO TO 140 160 CONTINUE 170 CONTINUE IF(.NOT. ABS(SD2) .LE. RGAMSQ) GO TO 190 IF(SD2 .EQ. ZERO) GO TO 220 ASSIGN 180 TO IGO C FIX-H.. GO TO 70 180 CONTINUE SD2=SD2*GAM**2 SH21=SH21/GAM SH22=SH22/GAM GO TO 170 190 CONTINUE 200 CONTINUE IF(.NOT. ABS(SD2) .GE. GAMSQ) GO TO 220 ASSIGN 210 TO IGO C FIX-H.. 
GO TO 70 210 CONTINUE SD2=SD2/GAM**2 SH21=SH21*GAM SH22=SH22*GAM GO TO 200 220 CONTINUE IF(SFLAG)250,230,240 230 CONTINUE SPARAM(3)=SH21 SPARAM(4)=SH12 GO TO 260 240 CONTINUE SPARAM(2)=SH11 SPARAM(5)=SH22 GO TO 260 250 CONTINUE SPARAM(2)=SH11 SPARAM(3)=SH21 SPARAM(4)=SH12 SPARAM(5)=SH22 260 CONTINUE SPARAM(1)=SFLAG RETURN END OpenBLAS-0.2.20/reference/ssbmvf.f000066400000000000000000000231511313527062700165530ustar00rootroot00000000000000 SUBROUTINE SSBMVF( UPLO, N, K, ALPHA, A, LDA, X, INCX, $ BETA, Y, INCY ) * .. Scalar Arguments .. REAL ALPHA, BETA INTEGER INCX, INCY, K, LDA, N CHARACTER*1 UPLO * .. Array Arguments .. REAL A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * SSBMV performs the matrix-vector operation * * y := alpha*A*x + beta*y, * * where alpha and beta are scalars, x and y are n element vectors and * A is an n by n symmetric band matrix, with k super-diagonals. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the band matrix A is being supplied as * follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * being supplied. * * UPLO = 'L' or 'l' The lower triangular part of A is * being supplied. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * K - INTEGER. * On entry, K specifies the number of super-diagonals of the * matrix A. K must satisfy 0 .le. K. * Unchanged on exit. * * ALPHA - REAL . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - REAL array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) * by n part of the array A must contain the upper triangular * band part of the symmetric matrix, supplied column by * column, with the leading diagonal of the matrix in row * ( k + 1 ) of the array, the first super-diagonal starting at * position 2 in row k, and so on. The top left k by k triangle * of the array A is not referenced. * The following program segment will transfer the upper * triangular part of a symmetric band matrix from conventional * full matrix storage to band storage: * * DO 20, J = 1, N * M = K + 1 - J * DO 10, I = MAX( 1, J - K ), J * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) * by n part of the array A must contain the lower triangular * band part of the symmetric matrix, supplied column by * column, with the leading diagonal of the matrix in row 1 of * the array, the first sub-diagonal starting at position 1 in * row 2, and so on. The bottom right k by k triangle of the * array A is not referenced. * The following program segment will transfer the lower * triangular part of a symmetric band matrix from conventional * full matrix storage to band storage: * * DO 20, J = 1, N * M = 1 - J * DO 10, I = J, MIN( N, J + K ) * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * ( k + 1 ). * Unchanged on exit. * * X - REAL array of DIMENSION at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the * vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - REAL . * On entry, BETA specifies the scalar beta. 
* Unchanged on exit. * * Y - REAL array of DIMENSION at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the * vector y. On exit, Y is overwritten by the updated vector y. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. REAL ONE , ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. Local Scalars .. REAL TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( K.LT.0 )THEN INFO = 3 ELSE IF( LDA.LT.( K + 1 ) )THEN INFO = 6 ELSE IF( INCX.EQ.0 )THEN INFO = 8 ELSE IF( INCY.EQ.0 )THEN INFO = 11 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'SSBMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF * * Start the operations. In this version the elements of the array A * are accessed sequentially with one pass through A. * * First form y := beta*y. * IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, N Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, N Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN IF( LSAME( UPLO, 'U' ) )THEN * * Form y when upper triangle of A is stored. * KPLUS1 = K + 1 IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO L = KPLUS1 - J DO 50, I = MAX( 1, J - K ), J - 1 Y( I ) = Y( I ) + TEMP1*A( L + I, J ) TEMP2 = TEMP2 + A( L + I, J )*X( I ) 50 CONTINUE Y( J ) = Y( J ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 60 CONTINUE ELSE JX = KX JY = KY DO 80, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY L = KPLUS1 - J DO 70, I = MAX( 1, J - K ), J - 1 Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) TEMP2 = TEMP2 + A( L + I, J )*X( IX ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY IF( J.GT.K )THEN KX = KX + INCX KY = KY + INCY END IF 80 CONTINUE END IF ELSE * * Form y when lower triangle of A is stored. 
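*        In the lower band layout the diagonal is stored in row 1 of
*        A, so with the shift L = 1 - J the matrix entry (I,J) is
*        found at A( L + I, J ) for J .LE. I .LE. MIN( N, J + K ).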
* IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 100, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*A( 1, J ) L = 1 - J DO 90, I = J + 1, MIN( N, J + K ) Y( I ) = Y( I ) + TEMP1*A( L + I, J ) TEMP2 = TEMP2 + A( L + I, J )*X( I ) 90 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 100 CONTINUE ELSE JX = KX JY = KY DO 120, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*A( 1, J ) L = 1 - J IX = JX IY = JY DO 110, I = J + 1, MIN( N, J + K ) IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) TEMP2 = TEMP2 + A( L + I, J )*X( IX ) 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 120 CONTINUE END IF END IF * RETURN * * End of SSBMV . * END OpenBLAS-0.2.20/reference/sscalf.f000066400000000000000000000017511313527062700165300ustar00rootroot00000000000000 subroutine sscalf(n,sa,sx,incx) c c scales a vector by a constant. c uses unrolled loops for increment equal to 1. c jack dongarra, linpack, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c real sa,sx(*) integer i,incx,m,mp1,n,nincx c if( n.le.0 .or. incx.le.0 )return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c nincx = n*incx do 10 i = 1,nincx,incx sx(i) = sa*sx(i) 10 continue return c c code for increment equal to 1 c c c clean-up loop c 20 m = mod(n,5) if( m .eq. 0 ) go to 40 do 30 i = 1,m sx(i) = sa*sx(i) 30 continue if( n .lt. 5 ) return 40 mp1 = m + 1 do 50 i = mp1,n,5 sx(i) = sa*sx(i) sx(i + 1) = sa*sx(i + 1) sx(i + 2) = sa*sx(i + 2) sx(i + 3) = sa*sx(i + 3) sx(i + 4) = sa*sx(i + 4) 50 continue return end OpenBLAS-0.2.20/reference/sspmvf.f000066400000000000000000000200101313527062700165600ustar00rootroot00000000000000 SUBROUTINE SSPMVF( UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY ) * .. Scalar Arguments .. REAL ALPHA, BETA INTEGER INCX, INCY, N CHARACTER*1 UPLO * .. Array Arguments .. REAL AP( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * SSPMV performs the matrix-vector operation * * y := alpha*A*x + beta*y, * * where alpha and beta are scalars, x and y are n element vectors and * A is an n by n symmetric matrix, supplied in packed form. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the matrix A is supplied in the packed * array AP as follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * supplied in AP. * * UPLO = 'L' or 'l' The lower triangular part of A is * supplied in AP. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - REAL . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * AP - REAL array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). * Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) * and a( 2, 2 ) respectively, and so on. * Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) * and a( 3, 1 ) respectively, and so on. * Unchanged on exit. * * X - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. 
* Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - REAL . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. On exit, Y is overwritten by the updated * vector y. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. REAL ONE , ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. Local Scalars .. REAL TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 6 ELSE IF( INCY.EQ.0 )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'SSPMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF * * Start the operations. In this version the elements of the array AP * are accessed sequentially with one pass through AP. * * First form y := beta*y. * IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, N Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, N Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN KK = 1 IF( LSAME( UPLO, 'U' ) )THEN * * Form y when AP contains the upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO K = KK DO 50, I = 1, J - 1 Y( I ) = Y( I ) + TEMP1*AP( K ) TEMP2 = TEMP2 + AP( K )*X( I ) K = K + 1 50 CONTINUE Y( J ) = Y( J ) + TEMP1*AP( KK + J - 1 ) + ALPHA*TEMP2 KK = KK + J 60 CONTINUE ELSE JX = KX JY = KY DO 80, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY DO 70, K = KK, KK + J - 2 Y( IY ) = Y( IY ) + TEMP1*AP( K ) TEMP2 = TEMP2 + AP( K )*X( IX ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1*AP( KK + J - 1 ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY KK = KK + J 80 CONTINUE END IF ELSE * * Form y when AP contains the lower triangle. 
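*        In lower packed storage column J begins at AP( KK ) with its
*        diagonal entry and occupies N - J + 1 consecutive elements,
*        which is why KK is advanced by N - J + 1 after each column.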
* IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 100, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*AP( KK ) K = KK + 1 DO 90, I = J + 1, N Y( I ) = Y( I ) + TEMP1*AP( K ) TEMP2 = TEMP2 + AP( K )*X( I ) K = K + 1 90 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 KK = KK + ( N - J + 1 ) 100 CONTINUE ELSE JX = KX JY = KY DO 120, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*AP( KK ) IX = JX IY = JY DO 110, K = KK + 1, KK + N - J IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*AP( K ) TEMP2 = TEMP2 + AP( K )*X( IX ) 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY KK = KK + ( N - J + 1 ) 120 CONTINUE END IF END IF * RETURN * * End of SSPMV . * END OpenBLAS-0.2.20/reference/sspr2f.f000066400000000000000000000161611313527062700164750ustar00rootroot00000000000000 SUBROUTINE SSPR2F( UPLO, N, ALPHA, X, INCX, Y, INCY, AP ) * .. Scalar Arguments .. REAL ALPHA INTEGER INCX, INCY, N CHARACTER*1 UPLO * .. Array Arguments .. REAL AP( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * SSPR2 performs the symmetric rank 2 operation * * A := alpha*x*y' + alpha*y*x' + A, * * where alpha is a scalar, x and y are n element vectors and A is an * n by n symmetric matrix, supplied in packed form. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the matrix A is supplied in the packed * array AP as follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * supplied in AP. * * UPLO = 'L' or 'l' The lower triangular part of A is * supplied in AP. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - REAL . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. * Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * AP - REAL array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). * Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) * and a( 2, 2 ) respectively, and so on. On exit, the array * AP is overwritten by the upper triangular part of the * updated matrix. * Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) * and a( 3, 1 ) respectively, and so on. On exit, the array * AP is overwritten by the lower triangular part of the * updated matrix. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. Local Scalars .. 
REAL TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( INCY.EQ.0 )THEN INFO = 7 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'SSPR2 ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Set up the start points in X and Y if the increments are not both * unity. * IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF JX = KX JY = KY END IF * * Start the operations. In this version the elements of the array AP * are accessed sequentially with one pass through AP. * KK = 1 IF( LSAME( UPLO, 'U' ) )THEN * * Form A when upper triangle is stored in AP. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 20, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( J ) TEMP2 = ALPHA*X( J ) K = KK DO 10, I = 1, J AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 K = K + 1 10 CONTINUE END IF KK = KK + J 20 CONTINUE ELSE DO 40, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( JY ) TEMP2 = ALPHA*X( JX ) IX = KX IY = KY DO 30, K = KK, KK + J - 1 AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 30 CONTINUE END IF JX = JX + INCX JY = JY + INCY KK = KK + J 40 CONTINUE END IF ELSE * * Form A when lower triangle is stored in AP. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( J ) TEMP2 = ALPHA*X( J ) K = KK DO 50, I = J, N AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 K = K + 1 50 CONTINUE END IF KK = KK + N - J + 1 60 CONTINUE ELSE DO 80, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( JY ) TEMP2 = ALPHA*X( JX ) IX = JX IY = JY DO 70, K = KK, KK + N - J AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 70 CONTINUE END IF JX = JX + INCX JY = JY + INCY KK = KK + N - J + 1 80 CONTINUE END IF END IF * RETURN * * End of SSPR2 . * END OpenBLAS-0.2.20/reference/ssprf.f000066400000000000000000000135641313527062700164170ustar00rootroot00000000000000 SUBROUTINE SSPRF ( UPLO, N, ALPHA, X, INCX, AP ) * .. Scalar Arguments .. REAL ALPHA INTEGER INCX, N CHARACTER*1 UPLO * .. Array Arguments .. REAL AP( * ), X( * ) * .. * * Purpose * ======= * * SSPR performs the symmetric rank 1 operation * * A := alpha*x*x' + A, * * where alpha is a real scalar, x is an n element vector and A is an * n by n symmetric matrix, supplied in packed form. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the matrix A is supplied in the packed * array AP as follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * supplied in AP. * * UPLO = 'L' or 'l' The lower triangular part of A is * supplied in AP. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - REAL . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). 
* Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * AP - REAL array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). * Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) * and a( 2, 2 ) respectively, and so on. On exit, the array * AP is overwritten by the upper triangular part of the * updated matrix. * Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) * and a( 3, 1 ) respectively, and so on. On exit, the array * AP is overwritten by the lower triangular part of the * updated matrix. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. Local Scalars .. REAL TEMP INTEGER I, INFO, IX, J, JX, K, KK, KX * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'SSPR ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Set the start point in X if the increment is not unity. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of the array AP * are accessed sequentially with one pass through AP. * KK = 1 IF( LSAME( UPLO, 'U' ) )THEN * * Form A when upper triangle is stored in AP. * IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = ALPHA*X( J ) K = KK DO 10, I = 1, J AP( K ) = AP( K ) + X( I )*TEMP K = K + 1 10 CONTINUE END IF KK = KK + J 20 CONTINUE ELSE JX = KX DO 40, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*X( JX ) IX = KX DO 30, K = KK, KK + J - 1 AP( K ) = AP( K ) + X( IX )*TEMP IX = IX + INCX 30 CONTINUE END IF JX = JX + INCX KK = KK + J 40 CONTINUE END IF ELSE * * Form A when lower triangle is stored in AP. * IF( INCX.EQ.1 )THEN DO 60, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = ALPHA*X( J ) K = KK DO 50, I = J, N AP( K ) = AP( K ) + X( I )*TEMP K = K + 1 50 CONTINUE END IF KK = KK + N - J + 1 60 CONTINUE ELSE JX = KX DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*X( JX ) IX = JX DO 70, K = KK, KK + N - J AP( K ) = AP( K ) + X( IX )*TEMP IX = IX + INCX 70 CONTINUE END IF JX = JX + INCX KK = KK + N - J + 1 80 CONTINUE END IF END IF * RETURN * * End of SSPR . * END OpenBLAS-0.2.20/reference/sswapf.f000066400000000000000000000024131313527062700165540ustar00rootroot00000000000000 subroutine sswapf (n,sx,incx,sy,incy) c c interchanges two vectors. c uses unrolled loops for increments equal to 1. c jack dongarra, linpack, 3/11/78. 
c modified 12/3/93, array(1) declarations changed to array(*) c real sx(*),sy(*),stemp integer i,incx,incy,ix,iy,m,mp1,n c if(n.le.0)return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments not equal c to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n stemp = sx(ix) sx(ix) = sy(iy) sy(iy) = stemp ix = ix + incx iy = iy + incy 10 continue return c c code for both increments equal to 1 c c c clean-up loop c 20 m = mod(n,3) if( m .eq. 0 ) go to 40 do 30 i = 1,m stemp = sx(i) sx(i) = sy(i) sy(i) = stemp 30 continue if( n .lt. 3 ) return 40 mp1 = m + 1 do 50 i = mp1,n,3 stemp = sx(i) sx(i) = sy(i) sy(i) = stemp stemp = sx(i + 1) sx(i + 1) = sy(i + 1) sy(i + 1) = stemp stemp = sx(i + 2) sx(i + 2) = sy(i + 2) sy(i + 2) = stemp 50 continue return end OpenBLAS-0.2.20/reference/ssymmf.f000066400000000000000000000231511313527062700165710ustar00rootroot00000000000000 SUBROUTINE SSYMMF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC ) * .. Scalar Arguments .. CHARACTER*1 SIDE, UPLO INTEGER M, N, LDA, LDB, LDC REAL ALPHA, BETA * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ), C( LDC, * ) * .. * * Purpose * ======= * * SSYMM performs one of the matrix-matrix operations * * C := alpha*A*B + beta*C, * * or * * C := alpha*B*A + beta*C, * * where alpha and beta are scalars, A is a symmetric matrix and B and * C are m by n matrices. * * Parameters * ========== * * SIDE - CHARACTER*1. * On entry, SIDE specifies whether the symmetric matrix A * appears on the left or right in the operation as follows: * * SIDE = 'L' or 'l' C := alpha*A*B + beta*C, * * SIDE = 'R' or 'r' C := alpha*B*A + beta*C, * * Unchanged on exit. * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the symmetric matrix A is to be * referenced as follows: * * UPLO = 'U' or 'u' Only the upper triangular part of the * symmetric matrix is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of the * symmetric matrix is to be referenced. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix C. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix C. * N must be at least zero. * Unchanged on exit. * * ALPHA - REAL . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - REAL array of DIMENSION ( LDA, ka ), where ka is * m when SIDE = 'L' or 'l' and is n otherwise. * Before entry with SIDE = 'L' or 'l', the m by m part of * the array A must contain the symmetric matrix, such that * when UPLO = 'U' or 'u', the leading m by m upper triangular * part of the array A must contain the upper triangular part * of the symmetric matrix and the strictly lower triangular * part of A is not referenced, and when UPLO = 'L' or 'l', * the leading m by m lower triangular part of the array A * must contain the lower triangular part of the symmetric * matrix and the strictly upper triangular part of A is not * referenced. 
* Before entry with SIDE = 'R' or 'r', the n by n part of * the array A must contain the symmetric matrix, such that * when UPLO = 'U' or 'u', the leading n by n upper triangular * part of the array A must contain the upper triangular part * of the symmetric matrix and the strictly lower triangular * part of A is not referenced, and when UPLO = 'L' or 'l', * the leading n by n lower triangular part of the array A * must contain the lower triangular part of the symmetric * matrix and the strictly upper triangular part of A is not * referenced. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When SIDE = 'L' or 'l' then * LDA must be at least max( 1, m ), otherwise LDA must be at * least max( 1, n ). * Unchanged on exit. * * B - REAL array of DIMENSION ( LDB, n ). * Before entry, the leading m by n part of the array B must * contain the matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. LDB must be at least * max( 1, m ). * Unchanged on exit. * * BETA - REAL . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then C need not be set on input. * Unchanged on exit. * * C - REAL array of DIMENSION ( LDC, n ). * Before entry, the leading m by n part of the array C must * contain the matrix C, except when beta is zero, in which * case C need not be set on entry. * On exit, the array C is overwritten by the m by n updated * matrix. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. Local Scalars .. LOGICAL UPPER INTEGER I, INFO, J, K, NROWA REAL TEMP1, TEMP2 * .. Parameters .. REAL ONE , ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Executable Statements .. * * Set NROWA as the number of rows of A. * IF( LSAME( SIDE, 'L' ) )THEN NROWA = M ELSE NROWA = N END IF UPPER = LSAME( UPLO, 'U' ) * * Test the input parameters. * INFO = 0 IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN INFO = 2 ELSE IF( M .LT.0 )THEN INFO = 3 ELSE IF( N .LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 7 ELSE IF( LDB.LT.MAX( 1, M ) )THEN INFO = 9 ELSE IF( LDC.LT.MAX( 1, M ) )THEN INFO = 12 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'SSYMM ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN IF( BETA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, M C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40, J = 1, N DO 30, I = 1, M C( I, J ) = BETA*C( I, J ) 30 CONTINUE 40 CONTINUE END IF RETURN END IF * * Start the operations. * IF( LSAME( SIDE, 'L' ) )THEN * * Form C := alpha*A*B + beta*C. 
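*
*        Note on the loops below: only the triangle of A selected by
*        UPLO is read.  With UPLO = 'U', the stored element A( K, I ),
*        K .LT. I, is used twice per column of B: once as A( K, I )
*        itself in the update of C( K, J ), and once, via symmetry, in
*        place of the unstored A( I, K ) in the TEMP2 accumulation, so
*        each stored entry is loaded only once per column J.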
* IF( UPPER )THEN DO 70, J = 1, N DO 60, I = 1, M TEMP1 = ALPHA*B( I, J ) TEMP2 = ZERO DO 50, K = 1, I - 1 C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) TEMP2 = TEMP2 + B( K, J )*A( K, I ) 50 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ TEMP1*A( I, I ) + ALPHA*TEMP2 END IF 60 CONTINUE 70 CONTINUE ELSE DO 100, J = 1, N DO 90, I = M, 1, -1 TEMP1 = ALPHA*B( I, J ) TEMP2 = ZERO DO 80, K = I + 1, M C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) TEMP2 = TEMP2 + B( K, J )*A( K, I ) 80 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ TEMP1*A( I, I ) + ALPHA*TEMP2 END IF 90 CONTINUE 100 CONTINUE END IF ELSE * * Form C := alpha*B*A + beta*C. * DO 170, J = 1, N TEMP1 = ALPHA*A( J, J ) IF( BETA.EQ.ZERO )THEN DO 110, I = 1, M C( I, J ) = TEMP1*B( I, J ) 110 CONTINUE ELSE DO 120, I = 1, M C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) 120 CONTINUE END IF DO 140, K = 1, J - 1 IF( UPPER )THEN TEMP1 = ALPHA*A( K, J ) ELSE TEMP1 = ALPHA*A( J, K ) END IF DO 130, I = 1, M C( I, J ) = C( I, J ) + TEMP1*B( I, K ) 130 CONTINUE 140 CONTINUE DO 160, K = J + 1, N IF( UPPER )THEN TEMP1 = ALPHA*A( J, K ) ELSE TEMP1 = ALPHA*A( K, J ) END IF DO 150, I = 1, M C( I, J ) = C( I, J ) + TEMP1*B( I, K ) 150 CONTINUE 160 CONTINUE 170 CONTINUE END IF * RETURN * * End of SSYMM . * END OpenBLAS-0.2.20/reference/ssymvf.f000066400000000000000000000176111313527062700166060ustar00rootroot00000000000000 SUBROUTINE SSYMVF ( UPLO, N, ALPHA, A, LDA, X, INCX, $ BETA, Y, INCY ) * .. Scalar Arguments .. REAL ALPHA, BETA INTEGER INCX, INCY, LDA, N CHARACTER*1 UPLO * .. Array Arguments .. REAL A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * SSYMV performs the matrix-vector operation * * y := alpha*A*x + beta*y, * * where alpha and beta are scalars, x and y are n element vectors and * A is an n by n symmetric matrix. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array A is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of A * is to be referenced. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - REAL . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - REAL array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular part of the symmetric matrix and the strictly * lower triangular part of A is not referenced. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular part of the symmetric matrix and the strictly * upper triangular part of A is not referenced. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * X - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - REAL . * On entry, BETA specifies the scalar beta. 
When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. On exit, Y is overwritten by the updated * vector y. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. REAL ONE , ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. Local Scalars .. REAL TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 5 ELSE IF( INCX.EQ.0 )THEN INFO = 7 ELSE IF( INCY.EQ.0 )THEN INFO = 10 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'SSYMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * * First form y := beta*y. * IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, N Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, N Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN IF( LSAME( UPLO, 'U' ) )THEN * * Form y when A is stored in upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO DO 50, I = 1, J - 1 Y( I ) = Y( I ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( I ) 50 CONTINUE Y( J ) = Y( J ) + TEMP1*A( J, J ) + ALPHA*TEMP2 60 CONTINUE ELSE JX = KX JY = KY DO 80, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY DO 70, I = 1, J - 1 Y( IY ) = Y( IY ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( IX ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 80 CONTINUE END IF ELSE * * Form y when A is stored in lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 100, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*A( J, J ) DO 90, I = J + 1, N Y( I ) = Y( I ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( I ) 90 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 100 CONTINUE ELSE JX = KX JY = KY DO 120, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*A( J, J ) IX = JX IY = JY DO 110, I = J + 1, N IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( IX ) 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 120 CONTINUE END IF END IF * RETURN * * End of SSYMV . 
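*
*     Example of use: with the symmetric matrix held in the upper
*     triangle of A and unit strides,
*
*        CALL SSYMVF( 'U', N, 1.0E0, A, LDA, X, 1, 0.0E0, Y, 1 )
*
*     returns y = A*x in Y; the strictly lower triangle of A is not
*     referenced, and Y need not be set on entry since BETA is zero.
*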
* END OpenBLAS-0.2.20/reference/ssyr2f.f000066400000000000000000000162571313527062700165140ustar00rootroot00000000000000 SUBROUTINE SSYR2F ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA ) * .. Scalar Arguments .. REAL ALPHA INTEGER INCX, INCY, LDA, N CHARACTER*1 UPLO * .. Array Arguments .. REAL A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * SSYR2 performs the symmetric rank 2 operation * * A := alpha*x*y' + alpha*y*x' + A, * * where alpha is a scalar, x and y are n element vectors and A is an n * by n symmetric matrix. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array A is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of A * is to be referenced. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - REAL . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. * Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * A - REAL array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular part of the symmetric matrix and the strictly * lower triangular part of A is not referenced. On exit, the * upper triangular part of the array A is overwritten by the * upper triangular part of the updated matrix. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular part of the symmetric matrix and the strictly * upper triangular part of A is not referenced. On exit, the * lower triangular part of the array A is overwritten by the * lower triangular part of the updated matrix. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. Local Scalars .. REAL TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( INCY.EQ.0 )THEN INFO = 7 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'SSYR2 ', INFO ) RETURN END IF * * Quick return if possible. 
* IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Set up the start points in X and Y if the increments are not both * unity. * IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF JX = KX JY = KY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * IF( LSAME( UPLO, 'U' ) )THEN * * Form A when A is stored in the upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 20, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( J ) TEMP2 = ALPHA*X( J ) DO 10, I = 1, J A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 10 CONTINUE END IF 20 CONTINUE ELSE DO 40, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( JY ) TEMP2 = ALPHA*X( JX ) IX = KX IY = KY DO 30, I = 1, J A( I, J ) = A( I, J ) + X( IX )*TEMP1 $ + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 30 CONTINUE END IF JX = JX + INCX JY = JY + INCY 40 CONTINUE END IF ELSE * * Form A when A is stored in the lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( J ) TEMP2 = ALPHA*X( J ) DO 50, I = J, N A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 50 CONTINUE END IF 60 CONTINUE ELSE DO 80, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( JY ) TEMP2 = ALPHA*X( JX ) IX = JX IY = JY DO 70, I = J, N A( I, J ) = A( I, J ) + X( IX )*TEMP1 $ + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 70 CONTINUE END IF JX = JX + INCX JY = JY + INCY 80 CONTINUE END IF END IF * RETURN * * End of SSYR2 . * END OpenBLAS-0.2.20/reference/ssyr2kf.f000066400000000000000000000254051313527062700166620ustar00rootroot00000000000000 SUBROUTINE SSYR2KF( UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC ) * .. Scalar Arguments .. CHARACTER*1 UPLO, TRANS INTEGER N, K, LDA, LDB, LDC REAL ALPHA, BETA * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ), C( LDC, * ) * .. * * Purpose * ======= * * SSYR2K performs one of the symmetric rank 2k operations * * C := alpha*A*B' + alpha*B*A' + beta*C, * * or * * C := alpha*A'*B + alpha*B'*A + beta*C, * * where alpha and beta are scalars, C is an n by n symmetric matrix * and A and B are n by k matrices in the first case and k by n * matrices in the second case. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array C is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of C * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of C * is to be referenced. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' C := alpha*A*B' + alpha*B*A' + * beta*C. * * TRANS = 'T' or 't' C := alpha*A'*B + alpha*B'*A + * beta*C. * * TRANS = 'C' or 'c' C := alpha*A'*B + alpha*B'*A + * beta*C. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix C. N must be * at least zero. * Unchanged on exit. * * K - INTEGER. * On entry with TRANS = 'N' or 'n', K specifies the number * of columns of the matrices A and B, and on entry with * TRANS = 'T' or 't' or 'C' or 'c', K specifies the number * of rows of the matrices A and B. K must be at least zero. * Unchanged on exit. * * ALPHA - REAL . 
* On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - REAL array of DIMENSION ( LDA, ka ), where ka is * k when TRANS = 'N' or 'n', and is n otherwise. * Before entry with TRANS = 'N' or 'n', the leading n by k * part of the array A must contain the matrix A, otherwise * the leading k by n part of the array A must contain the * matrix A. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When TRANS = 'N' or 'n' * then LDA must be at least max( 1, n ), otherwise LDA must * be at least max( 1, k ). * Unchanged on exit. * * B - REAL array of DIMENSION ( LDB, kb ), where kb is * k when TRANS = 'N' or 'n', and is n otherwise. * Before entry with TRANS = 'N' or 'n', the leading n by k * part of the array B must contain the matrix B, otherwise * the leading k by n part of the array B must contain the * matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. When TRANS = 'N' or 'n' * then LDB must be at least max( 1, n ), otherwise LDB must * be at least max( 1, k ). * Unchanged on exit. * * BETA - REAL . * On entry, BETA specifies the scalar beta. * Unchanged on exit. * * C - REAL array of DIMENSION ( LDC, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array C must contain the upper * triangular part of the symmetric matrix and the strictly * lower triangular part of C is not referenced. On exit, the * upper triangular part of the array C is overwritten by the * upper triangular part of the updated matrix. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array C must contain the lower * triangular part of the symmetric matrix and the strictly * upper triangular part of C is not referenced. On exit, the * lower triangular part of the array C is overwritten by the * lower triangular part of the updated matrix. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, n ). * Unchanged on exit. * * * Level 3 Blas routine. * * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. Local Scalars .. LOGICAL UPPER INTEGER I, INFO, J, L, NROWA REAL TEMP1, TEMP2 * .. Parameters .. REAL ONE , ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Executable Statements .. * * Test the input parameters. * IF( LSAME( TRANS, 'N' ) )THEN NROWA = N ELSE NROWA = K END IF UPPER = LSAME( UPLO, 'U' ) * INFO = 0 IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. $ ( .NOT.LSAME( TRANS, 'T' ) ).AND. $ ( .NOT.LSAME( TRANS, 'C' ) ) )THEN INFO = 2 ELSE IF( N .LT.0 )THEN INFO = 3 ELSE IF( K .LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 7 ELSE IF( LDB.LT.MAX( 1, NROWA ) )THEN INFO = 9 ELSE IF( LDC.LT.MAX( 1, N ) )THEN INFO = 12 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'SSYR2K', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR. $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * And when alpha.eq.zero. 
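*
*        When ALPHA is zero both rank-2k terms vanish and the update
*        reduces to C := beta*C (or to zeroing C when BETA is also
*        zero), applied only to the triangle of C selected by UPLO;
*        the opposite triangle of C is left untouched.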
* IF( ALPHA.EQ.ZERO )THEN IF( UPPER )THEN IF( BETA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, J C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40, J = 1, N DO 30, I = 1, J C( I, J ) = BETA*C( I, J ) 30 CONTINUE 40 CONTINUE END IF ELSE IF( BETA.EQ.ZERO )THEN DO 60, J = 1, N DO 50, I = J, N C( I, J ) = ZERO 50 CONTINUE 60 CONTINUE ELSE DO 80, J = 1, N DO 70, I = J, N C( I, J ) = BETA*C( I, J ) 70 CONTINUE 80 CONTINUE END IF END IF RETURN END IF * * Start the operations. * IF( LSAME( TRANS, 'N' ) )THEN * * Form C := alpha*A*B' + alpha*B*A' + C. * IF( UPPER )THEN DO 130, J = 1, N IF( BETA.EQ.ZERO )THEN DO 90, I = 1, J C( I, J ) = ZERO 90 CONTINUE ELSE IF( BETA.NE.ONE )THEN DO 100, I = 1, J C( I, J ) = BETA*C( I, J ) 100 CONTINUE END IF DO 120, L = 1, K IF( ( A( J, L ).NE.ZERO ).OR. $ ( B( J, L ).NE.ZERO ) )THEN TEMP1 = ALPHA*B( J, L ) TEMP2 = ALPHA*A( J, L ) DO 110, I = 1, J C( I, J ) = C( I, J ) + $ A( I, L )*TEMP1 + B( I, L )*TEMP2 110 CONTINUE END IF 120 CONTINUE 130 CONTINUE ELSE DO 180, J = 1, N IF( BETA.EQ.ZERO )THEN DO 140, I = J, N C( I, J ) = ZERO 140 CONTINUE ELSE IF( BETA.NE.ONE )THEN DO 150, I = J, N C( I, J ) = BETA*C( I, J ) 150 CONTINUE END IF DO 170, L = 1, K IF( ( A( J, L ).NE.ZERO ).OR. $ ( B( J, L ).NE.ZERO ) )THEN TEMP1 = ALPHA*B( J, L ) TEMP2 = ALPHA*A( J, L ) DO 160, I = J, N C( I, J ) = C( I, J ) + $ A( I, L )*TEMP1 + B( I, L )*TEMP2 160 CONTINUE END IF 170 CONTINUE 180 CONTINUE END IF ELSE * * Form C := alpha*A'*B + alpha*B'*A + C. * IF( UPPER )THEN DO 210, J = 1, N DO 200, I = 1, J TEMP1 = ZERO TEMP2 = ZERO DO 190, L = 1, K TEMP1 = TEMP1 + A( L, I )*B( L, J ) TEMP2 = TEMP2 + B( L, I )*A( L, J ) 190 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ ALPHA*TEMP1 + ALPHA*TEMP2 END IF 200 CONTINUE 210 CONTINUE ELSE DO 240, J = 1, N DO 230, I = J, N TEMP1 = ZERO TEMP2 = ZERO DO 220, L = 1, K TEMP1 = TEMP1 + A( L, I )*B( L, J ) TEMP2 = TEMP2 + B( L, I )*A( L, J ) 220 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ ALPHA*TEMP1 + ALPHA*TEMP2 END IF 230 CONTINUE 240 CONTINUE END IF END IF * RETURN * * End of SSYR2K. * END OpenBLAS-0.2.20/reference/ssyrf.f000066400000000000000000000135151313527062700164240ustar00rootroot00000000000000 SUBROUTINE SSYRF ( UPLO, N, ALPHA, X, INCX, A, LDA ) * .. Scalar Arguments .. REAL ALPHA INTEGER INCX, LDA, N CHARACTER*1 UPLO * .. Array Arguments .. REAL A( LDA, * ), X( * ) * .. * * Purpose * ======= * * SSYR performs the symmetric rank 1 operation * * A := alpha*x*x' + A, * * where alpha is a real scalar, x is an n element vector and A is an * n by n symmetric matrix. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array A is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of A * is to be referenced. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - REAL . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. 
* * A - REAL array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular part of the symmetric matrix and the strictly * lower triangular part of A is not referenced. On exit, the * upper triangular part of the array A is overwritten by the * upper triangular part of the updated matrix. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular part of the symmetric matrix and the strictly * upper triangular part of A is not referenced. On exit, the * lower triangular part of the array A is overwritten by the * lower triangular part of the updated matrix. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. Local Scalars .. REAL TEMP INTEGER I, INFO, IX, J, JX, KX * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 7 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'SSYR ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Set the start point in X if the increment is not unity. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * IF( LSAME( UPLO, 'U' ) )THEN * * Form A when A is stored in upper triangle. * IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = ALPHA*X( J ) DO 10, I = 1, J A( I, J ) = A( I, J ) + X( I )*TEMP 10 CONTINUE END IF 20 CONTINUE ELSE JX = KX DO 40, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*X( JX ) IX = KX DO 30, I = 1, J A( I, J ) = A( I, J ) + X( IX )*TEMP IX = IX + INCX 30 CONTINUE END IF JX = JX + INCX 40 CONTINUE END IF ELSE * * Form A when A is stored in lower triangle. * IF( INCX.EQ.1 )THEN DO 60, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = ALPHA*X( J ) DO 50, I = J, N A( I, J ) = A( I, J ) + X( I )*TEMP 50 CONTINUE END IF 60 CONTINUE ELSE JX = KX DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*X( JX ) IX = JX DO 70, I = J, N A( I, J ) = A( I, J ) + X( IX )*TEMP IX = IX + INCX 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE END IF END IF * RETURN * * End of SSYR . * END OpenBLAS-0.2.20/reference/ssyrkf.f000066400000000000000000000222061313527062700165740ustar00rootroot00000000000000 SUBROUTINE SSYRKF ( UPLO, TRANS, N, K, ALPHA, A, LDA, $ BETA, C, LDC ) * .. Scalar Arguments .. CHARACTER*1 UPLO, TRANS INTEGER N, K, LDA, LDC REAL ALPHA, BETA * .. Array Arguments .. REAL A( LDA, * ), C( LDC, * ) * .. 
* * Purpose * ======= * * SSYRK performs one of the symmetric rank k operations * * C := alpha*A*A' + beta*C, * * or * * C := alpha*A'*A + beta*C, * * where alpha and beta are scalars, C is an n by n symmetric matrix * and A is an n by k matrix in the first case and a k by n matrix * in the second case. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array C is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of C * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of C * is to be referenced. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' C := alpha*A*A' + beta*C. * * TRANS = 'T' or 't' C := alpha*A'*A + beta*C. * * TRANS = 'C' or 'c' C := alpha*A'*A + beta*C. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix C. N must be * at least zero. * Unchanged on exit. * * K - INTEGER. * On entry with TRANS = 'N' or 'n', K specifies the number * of columns of the matrix A, and on entry with * TRANS = 'T' or 't' or 'C' or 'c', K specifies the number * of rows of the matrix A. K must be at least zero. * Unchanged on exit. * * ALPHA - REAL . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - REAL array of DIMENSION ( LDA, ka ), where ka is * k when TRANS = 'N' or 'n', and is n otherwise. * Before entry with TRANS = 'N' or 'n', the leading n by k * part of the array A must contain the matrix A, otherwise * the leading k by n part of the array A must contain the * matrix A. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When TRANS = 'N' or 'n' * then LDA must be at least max( 1, n ), otherwise LDA must * be at least max( 1, k ). * Unchanged on exit. * * BETA - REAL . * On entry, BETA specifies the scalar beta. * Unchanged on exit. * * C - REAL array of DIMENSION ( LDC, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array C must contain the upper * triangular part of the symmetric matrix and the strictly * lower triangular part of C is not referenced. On exit, the * upper triangular part of the array C is overwritten by the * upper triangular part of the updated matrix. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array C must contain the lower * triangular part of the symmetric matrix and the strictly * upper triangular part of C is not referenced. On exit, the * lower triangular part of the array C is overwritten by the * lower triangular part of the updated matrix. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, n ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. Local Scalars .. LOGICAL UPPER INTEGER I, INFO, J, L, NROWA REAL TEMP * .. Parameters .. REAL ONE , ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Executable Statements .. * * Test the input parameters. 
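*
*        NROWA is the number of rows of A as supplied by the caller:
*        N when TRANS = 'N' (A is n by k) and K otherwise (A is k by
*        n).  It is used only to validate LDA.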
* IF( LSAME( TRANS, 'N' ) )THEN NROWA = N ELSE NROWA = K END IF UPPER = LSAME( UPLO, 'U' ) * INFO = 0 IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. $ ( .NOT.LSAME( TRANS, 'T' ) ).AND. $ ( .NOT.LSAME( TRANS, 'C' ) ) )THEN INFO = 2 ELSE IF( N .LT.0 )THEN INFO = 3 ELSE IF( K .LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 7 ELSE IF( LDC.LT.MAX( 1, N ) )THEN INFO = 10 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'SSYRK ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR. $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN IF( UPPER )THEN IF( BETA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, J C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40, J = 1, N DO 30, I = 1, J C( I, J ) = BETA*C( I, J ) 30 CONTINUE 40 CONTINUE END IF ELSE IF( BETA.EQ.ZERO )THEN DO 60, J = 1, N DO 50, I = J, N C( I, J ) = ZERO 50 CONTINUE 60 CONTINUE ELSE DO 80, J = 1, N DO 70, I = J, N C( I, J ) = BETA*C( I, J ) 70 CONTINUE 80 CONTINUE END IF END IF RETURN END IF * * Start the operations. * IF( LSAME( TRANS, 'N' ) )THEN * * Form C := alpha*A*A' + beta*C. * IF( UPPER )THEN DO 130, J = 1, N IF( BETA.EQ.ZERO )THEN DO 90, I = 1, J C( I, J ) = ZERO 90 CONTINUE ELSE IF( BETA.NE.ONE )THEN DO 100, I = 1, J C( I, J ) = BETA*C( I, J ) 100 CONTINUE END IF DO 120, L = 1, K IF( A( J, L ).NE.ZERO )THEN TEMP = ALPHA*A( J, L ) DO 110, I = 1, J C( I, J ) = C( I, J ) + TEMP*A( I, L ) 110 CONTINUE END IF 120 CONTINUE 130 CONTINUE ELSE DO 180, J = 1, N IF( BETA.EQ.ZERO )THEN DO 140, I = J, N C( I, J ) = ZERO 140 CONTINUE ELSE IF( BETA.NE.ONE )THEN DO 150, I = J, N C( I, J ) = BETA*C( I, J ) 150 CONTINUE END IF DO 170, L = 1, K IF( A( J, L ).NE.ZERO )THEN TEMP = ALPHA*A( J, L ) DO 160, I = J, N C( I, J ) = C( I, J ) + TEMP*A( I, L ) 160 CONTINUE END IF 170 CONTINUE 180 CONTINUE END IF ELSE * * Form C := alpha*A'*A + beta*C. * IF( UPPER )THEN DO 210, J = 1, N DO 200, I = 1, J TEMP = ZERO DO 190, L = 1, K TEMP = TEMP + A( L, I )*A( L, J ) 190 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = ALPHA*TEMP ELSE C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) END IF 200 CONTINUE 210 CONTINUE ELSE DO 240, J = 1, N DO 230, I = J, N TEMP = ZERO DO 220, L = 1, K TEMP = TEMP + A( L, I )*A( L, J ) 220 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = ALPHA*TEMP ELSE C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) END IF 230 CONTINUE 240 CONTINUE END IF END IF * RETURN * * End of SSYRK . * END OpenBLAS-0.2.20/reference/stbmvf.f000066400000000000000000000261331313527062700165570ustar00rootroot00000000000000 SUBROUTINE STBMVF( UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, K, LDA, N CHARACTER*1 DIAG, TRANS, UPLO * .. Array Arguments .. REAL A( LDA, * ), X( * ) * .. * * Purpose * ======= * * STBMV performs one of the matrix-vector operations * * x := A*x, or x := A'*x, * * where x is an n element vector and A is an n by n unit, or non-unit, * upper or lower triangular band matrix, with ( k + 1 ) diagonals. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' x := A*x. * * TRANS = 'T' or 't' x := A'*x. 
* * TRANS = 'C' or 'c' x := A'*x. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * K - INTEGER. * On entry with UPLO = 'U' or 'u', K specifies the number of * super-diagonals of the matrix A. * On entry with UPLO = 'L' or 'l', K specifies the number of * sub-diagonals of the matrix A. * K must satisfy 0 .le. K. * Unchanged on exit. * * A - REAL array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) * by n part of the array A must contain the upper triangular * band part of the matrix of coefficients, supplied column by * column, with the leading diagonal of the matrix in row * ( k + 1 ) of the array, the first super-diagonal starting at * position 2 in row k, and so on. The top left k by k triangle * of the array A is not referenced. * The following program segment will transfer an upper * triangular band matrix from conventional full matrix storage * to band storage: * * DO 20, J = 1, N * M = K + 1 - J * DO 10, I = MAX( 1, J - K ), J * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) * by n part of the array A must contain the lower triangular * band part of the matrix of coefficients, supplied column by * column, with the leading diagonal of the matrix in row 1 of * the array, the first sub-diagonal starting at position 1 in * row 2, and so on. The bottom right k by k triangle of the * array A is not referenced. * The following program segment will transfer a lower * triangular band matrix from conventional full matrix storage * to band storage: * * DO 20, J = 1, N * M = 1 - J * DO 10, I = J, MIN( N, J + K ) * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Note that when DIAG = 'U' or 'u' the elements of the array A * corresponding to the diagonal elements of the matrix are not * referenced, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * ( k + 1 ). * Unchanged on exit. * * X - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the * tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. Local Scalars .. REAL TEMP INTEGER I, INFO, IX, J, JX, KPLUS1, KX, L LOGICAL NOUNIT * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. 
$ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( K.LT.0 )THEN INFO = 5 ELSE IF( LDA.LT.( K + 1 ) )THEN INFO = 7 ELSE IF( INCX.EQ.0 )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'STBMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * NOUNIT = LSAME( DIAG, 'N' ) * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * IF( LSAME( TRANS, 'N' ) )THEN * * Form x := A*x. * IF( LSAME( UPLO, 'U' ) )THEN KPLUS1 = K + 1 IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = X( J ) L = KPLUS1 - J DO 10, I = MAX( 1, J - K ), J - 1 X( I ) = X( I ) + TEMP*A( L + I, J ) 10 CONTINUE IF( NOUNIT ) $ X( J ) = X( J )*A( KPLUS1, J ) END IF 20 CONTINUE ELSE JX = KX DO 40, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX L = KPLUS1 - J DO 30, I = MAX( 1, J - K ), J - 1 X( IX ) = X( IX ) + TEMP*A( L + I, J ) IX = IX + INCX 30 CONTINUE IF( NOUNIT ) $ X( JX ) = X( JX )*A( KPLUS1, J ) END IF JX = JX + INCX IF( J.GT.K ) $ KX = KX + INCX 40 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 60, J = N, 1, -1 IF( X( J ).NE.ZERO )THEN TEMP = X( J ) L = 1 - J DO 50, I = MIN( N, J + K ), J + 1, -1 X( I ) = X( I ) + TEMP*A( L + I, J ) 50 CONTINUE IF( NOUNIT ) $ X( J ) = X( J )*A( 1, J ) END IF 60 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 80, J = N, 1, -1 IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX L = 1 - J DO 70, I = MIN( N, J + K ), J + 1, -1 X( IX ) = X( IX ) + TEMP*A( L + I, J ) IX = IX - INCX 70 CONTINUE IF( NOUNIT ) $ X( JX ) = X( JX )*A( 1, J ) END IF JX = JX - INCX IF( ( N - J ).GE.K ) $ KX = KX - INCX 80 CONTINUE END IF END IF ELSE * * Form x := A'*x. * IF( LSAME( UPLO, 'U' ) )THEN KPLUS1 = K + 1 IF( INCX.EQ.1 )THEN DO 100, J = N, 1, -1 TEMP = X( J ) L = KPLUS1 - J IF( NOUNIT ) $ TEMP = TEMP*A( KPLUS1, J ) DO 90, I = J - 1, MAX( 1, J - K ), -1 TEMP = TEMP + A( L + I, J )*X( I ) 90 CONTINUE X( J ) = TEMP 100 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 120, J = N, 1, -1 TEMP = X( JX ) KX = KX - INCX IX = KX L = KPLUS1 - J IF( NOUNIT ) $ TEMP = TEMP*A( KPLUS1, J ) DO 110, I = J - 1, MAX( 1, J - K ), -1 TEMP = TEMP + A( L + I, J )*X( IX ) IX = IX - INCX 110 CONTINUE X( JX ) = TEMP JX = JX - INCX 120 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 140, J = 1, N TEMP = X( J ) L = 1 - J IF( NOUNIT ) $ TEMP = TEMP*A( 1, J ) DO 130, I = J + 1, MIN( N, J + K ) TEMP = TEMP + A( L + I, J )*X( I ) 130 CONTINUE X( J ) = TEMP 140 CONTINUE ELSE JX = KX DO 160, J = 1, N TEMP = X( JX ) KX = KX + INCX IX = KX L = 1 - J IF( NOUNIT ) $ TEMP = TEMP*A( 1, J ) DO 150, I = J + 1, MIN( N, J + K ) TEMP = TEMP + A( L + I, J )*X( IX ) IX = IX + INCX 150 CONTINUE X( JX ) = TEMP JX = JX + INCX 160 CONTINUE END IF END IF END IF * RETURN * * End of STBMV . * END OpenBLAS-0.2.20/reference/stbsvf.f000066400000000000000000000255661313527062700165760ustar00rootroot00000000000000 SUBROUTINE STBSVF(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) * .. Scalar Arguments .. INTEGER INCX,K,LDA,N CHARACTER DIAG,TRANS,UPLO * .. * .. Array Arguments .. REAL A(LDA,*),X(*) * .. 
* * Purpose * ======= * * STBSV solves one of the systems of equations * * A*x = b, or A'*x = b, * * where b and x are n element vectors and A is an n by n unit, or * non-unit, upper or lower triangular band matrix, with ( k + 1 ) * diagonals. * * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Arguments * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the equations to be solved as * follows: * * TRANS = 'N' or 'n' A*x = b. * * TRANS = 'T' or 't' A'*x = b. * * TRANS = 'C' or 'c' A'*x = b. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * K - INTEGER. * On entry with UPLO = 'U' or 'u', K specifies the number of * super-diagonals of the matrix A. * On entry with UPLO = 'L' or 'l', K specifies the number of * sub-diagonals of the matrix A. * K must satisfy 0 .le. K. * Unchanged on exit. * * A - REAL array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) * by n part of the array A must contain the upper triangular * band part of the matrix of coefficients, supplied column by * column, with the leading diagonal of the matrix in row * ( k + 1 ) of the array, the first super-diagonal starting at * position 2 in row k, and so on. The top left k by k triangle * of the array A is not referenced. * The following program segment will transfer an upper * triangular band matrix from conventional full matrix storage * to band storage: * * DO 20, J = 1, N * M = K + 1 - J * DO 10, I = MAX( 1, J - K ), J * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) * by n part of the array A must contain the lower triangular * band part of the matrix of coefficients, supplied column by * column, with the leading diagonal of the matrix in row 1 of * the array, the first sub-diagonal starting at position 1 in * row 2, and so on. The bottom right k by k triangle of the * array A is not referenced. * The following program segment will transfer a lower * triangular band matrix from conventional full matrix storage * to band storage: * * DO 20, J = 1, N * M = 1 - J * DO 10, I = J, MIN( N, J + K ) * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Note that when DIAG = 'U' or 'u' the elements of the array A * corresponding to the diagonal elements of the matrix are not * referenced, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * ( k + 1 ). * Unchanged on exit. * * X - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element right-hand side vector b. On exit, X is overwritten * with the solution vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. 
INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. REAL ZERO PARAMETER (ZERO=0.0E+0) * .. * .. Local Scalars .. REAL TEMP INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L LOGICAL NOUNIT * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX,MIN * .. * * Test the input parameters. * INFO = 0 IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN INFO = 1 ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + .NOT.LSAME(TRANS,'C')) THEN INFO = 2 ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN INFO = 3 ELSE IF (N.LT.0) THEN INFO = 4 ELSE IF (K.LT.0) THEN INFO = 5 ELSE IF (LDA.LT. (K+1)) THEN INFO = 7 ELSE IF (INCX.EQ.0) THEN INFO = 9 END IF IF (INFO.NE.0) THEN CALL XERBLA('STBSV ',INFO) RETURN END IF * * Quick return if possible. * IF (N.EQ.0) RETURN * NOUNIT = LSAME(DIAG,'N') * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF (INCX.LE.0) THEN KX = 1 - (N-1)*INCX ELSE IF (INCX.NE.1) THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed by sequentially with one pass through A. * IF (LSAME(TRANS,'N')) THEN * * Form x := inv( A )*x. * IF (LSAME(UPLO,'U')) THEN KPLUS1 = K + 1 IF (INCX.EQ.1) THEN DO 20 J = N,1,-1 IF (X(J).NE.ZERO) THEN L = KPLUS1 - J IF (NOUNIT) X(J) = X(J)/A(KPLUS1,J) TEMP = X(J) DO 10 I = J - 1,MAX(1,J-K),-1 X(I) = X(I) - TEMP*A(L+I,J) 10 CONTINUE END IF 20 CONTINUE ELSE KX = KX + (N-1)*INCX JX = KX DO 40 J = N,1,-1 KX = KX - INCX IF (X(JX).NE.ZERO) THEN IX = KX L = KPLUS1 - J IF (NOUNIT) X(JX) = X(JX)/A(KPLUS1,J) TEMP = X(JX) DO 30 I = J - 1,MAX(1,J-K),-1 X(IX) = X(IX) - TEMP*A(L+I,J) IX = IX - INCX 30 CONTINUE END IF JX = JX - INCX 40 CONTINUE END IF ELSE IF (INCX.EQ.1) THEN DO 60 J = 1,N IF (X(J).NE.ZERO) THEN L = 1 - J IF (NOUNIT) X(J) = X(J)/A(1,J) TEMP = X(J) DO 50 I = J + 1,MIN(N,J+K) X(I) = X(I) - TEMP*A(L+I,J) 50 CONTINUE END IF 60 CONTINUE ELSE JX = KX DO 80 J = 1,N KX = KX + INCX IF (X(JX).NE.ZERO) THEN IX = KX L = 1 - J IF (NOUNIT) X(JX) = X(JX)/A(1,J) TEMP = X(JX) DO 70 I = J + 1,MIN(N,J+K) X(IX) = X(IX) - TEMP*A(L+I,J) IX = IX + INCX 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE END IF END IF ELSE * * Form x := inv( A')*x. 
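*
*     Usage sketch (illustrative only, not part of the reference BLAS):
*     the hypothetical driver below assumes it is compiled together
*     with this file and the LSAME/XERBLA auxiliaries.  It stores the
*     upper triangular band matrix T = [ 2 1 0 ; 0 2 1 ; 0 0 2 ] with
*     K = 1 super-diagonal (diagonal in row K+1, super-diagonal in
*     row K, A( 1, 1 ) unused) and solves T*x = b in place:
*
*           PROGRAM TBSVEX
*           REAL A( 2, 3 ), X( 3 )
*           INTEGER J
*           DATA A / 0.0E0, 2.0E0, 1.0E0, 2.0E0, 1.0E0, 2.0E0 /
*           DATA X / 4.0E0, 5.0E0, 6.0E0 /
*           CALL STBSVF( 'U', 'N', 'N', 3, 1, A, 2, X, 1 )
*           WRITE( *, * ) ( X( J ), J = 1, 3 )
*           END
*
*     On exit X holds the solution ( 1.5, 1.0, 3.0 ).
*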
* IF (LSAME(UPLO,'U')) THEN KPLUS1 = K + 1 IF (INCX.EQ.1) THEN DO 100 J = 1,N TEMP = X(J) L = KPLUS1 - J DO 90 I = MAX(1,J-K),J - 1 TEMP = TEMP - A(L+I,J)*X(I) 90 CONTINUE IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) X(J) = TEMP 100 CONTINUE ELSE JX = KX DO 120 J = 1,N TEMP = X(JX) IX = KX L = KPLUS1 - J DO 110 I = MAX(1,J-K),J - 1 TEMP = TEMP - A(L+I,J)*X(IX) IX = IX + INCX 110 CONTINUE IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) X(JX) = TEMP JX = JX + INCX IF (J.GT.K) KX = KX + INCX 120 CONTINUE END IF ELSE IF (INCX.EQ.1) THEN DO 140 J = N,1,-1 TEMP = X(J) L = 1 - J DO 130 I = MIN(N,J+K),J + 1,-1 TEMP = TEMP - A(L+I,J)*X(I) 130 CONTINUE IF (NOUNIT) TEMP = TEMP/A(1,J) X(J) = TEMP 140 CONTINUE ELSE KX = KX + (N-1)*INCX JX = KX DO 160 J = N,1,-1 TEMP = X(JX) IX = KX L = 1 - J DO 150 I = MIN(N,J+K),J + 1,-1 TEMP = TEMP - A(L+I,J)*X(IX) IX = IX - INCX 150 CONTINUE IF (NOUNIT) TEMP = TEMP/A(1,J) X(JX) = TEMP JX = JX - INCX IF ((N-J).GE.K) KX = KX - INCX 160 CONTINUE END IF END IF END IF * RETURN * * End of STBSV . * END OpenBLAS-0.2.20/reference/stpmvf.f000066400000000000000000000223441313527062700165750ustar00rootroot00000000000000 SUBROUTINE STPMVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, N CHARACTER*1 DIAG, TRANS, UPLO * .. Array Arguments .. REAL AP( * ), X( * ) * .. * * Purpose * ======= * * STPMV performs one of the matrix-vector operations * * x := A*x, or x := A'*x, * * where x is an n element vector and A is an n by n unit, or non-unit, * upper or lower triangular matrix, supplied in packed form. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' x := A*x. * * TRANS = 'T' or 't' x := A'*x. * * TRANS = 'C' or 'c' x := A'*x. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * AP - REAL array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). * Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular matrix packed sequentially, * column by column, so that AP( 1 ) contains a( 1, 1 ), * AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) * respectively, and so on. * Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular matrix packed sequentially, * column by column, so that AP( 1 ) contains a( 1, 1 ), * AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) * respectively, and so on. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced, but are assumed to be unity. * Unchanged on exit. * * X - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the * tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. 
* Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. Local Scalars .. REAL TEMP INTEGER I, INFO, IX, J, JX, K, KK, KX LOGICAL NOUNIT * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( INCX.EQ.0 )THEN INFO = 7 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'STPMVF', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * NOUNIT = LSAME( DIAG, 'N' ) * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of AP are * accessed sequentially with one pass through AP. * IF( LSAME( TRANS, 'N' ) )THEN * * Form x:= A*x. * IF( LSAME( UPLO, 'U' ) )THEN KK =1 IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = X( J ) K = KK DO 10, I = 1, J - 1 X( I ) = X( I ) + TEMP*AP( K ) K = K + 1 10 CONTINUE IF( NOUNIT ) $ X( J ) = X( J )*AP( KK + J - 1 ) END IF KK = KK + J 20 CONTINUE ELSE JX = KX DO 40, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX DO 30, K = KK, KK + J - 2 X( IX ) = X( IX ) + TEMP*AP( K ) IX = IX + INCX 30 CONTINUE IF( NOUNIT ) $ X( JX ) = X( JX )*AP( KK + J - 1 ) END IF JX = JX + INCX KK = KK + J 40 CONTINUE END IF ELSE KK = ( N*( N + 1 ) )/2 IF( INCX.EQ.1 )THEN DO 60, J = N, 1, -1 IF( X( J ).NE.ZERO )THEN TEMP = X( J ) K = KK DO 50, I = N, J + 1, -1 X( I ) = X( I ) + TEMP*AP( K ) K = K - 1 50 CONTINUE IF( NOUNIT ) $ X( J ) = X( J )*AP( KK - N + J ) END IF KK = KK - ( N - J + 1 ) 60 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 80, J = N, 1, -1 IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX DO 70, K = KK, KK - ( N - ( J + 1 ) ), -1 X( IX ) = X( IX ) + TEMP*AP( K ) IX = IX - INCX 70 CONTINUE IF( NOUNIT ) $ X( JX ) = X( JX )*AP( KK - N + J ) END IF JX = JX - INCX KK = KK - ( N - J + 1 ) 80 CONTINUE END IF END IF ELSE * * Form x := A'*x. 
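*
*     Usage sketch (illustrative only, not part of the reference BLAS):
*     a hypothetical driver for STPMVF, assuming it is compiled with
*     this file and the LSAME/XERBLA auxiliaries.  The upper triangular
*     matrix T = [ 1 2 3 ; 0 4 5 ; 0 0 6 ] is packed column by column
*     as described above, and x := T*x is formed in place:
*
*           PROGRAM TPMVEX
*           REAL AP( 6 ), X( 3 )
*           INTEGER J
*           DATA AP / 1.0E0, 2.0E0, 4.0E0, 3.0E0, 5.0E0, 6.0E0 /
*           DATA X / 1.0E0, 1.0E0, 1.0E0 /
*           CALL STPMVF( 'U', 'N', 'N', 3, AP, X, 1 )
*           WRITE( *, * ) ( X( J ), J = 1, 3 )
*           END
*
*     On exit X holds T*x = ( 6.0, 9.0, 6.0 ).
*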
* IF( LSAME( UPLO, 'U' ) )THEN KK = ( N*( N + 1 ) )/2 IF( INCX.EQ.1 )THEN DO 100, J = N, 1, -1 TEMP = X( J ) IF( NOUNIT ) $ TEMP = TEMP*AP( KK ) K = KK - 1 DO 90, I = J - 1, 1, -1 TEMP = TEMP + AP( K )*X( I ) K = K - 1 90 CONTINUE X( J ) = TEMP KK = KK - J 100 CONTINUE ELSE JX = KX + ( N - 1 )*INCX DO 120, J = N, 1, -1 TEMP = X( JX ) IX = JX IF( NOUNIT ) $ TEMP = TEMP*AP( KK ) DO 110, K = KK - 1, KK - J + 1, -1 IX = IX - INCX TEMP = TEMP + AP( K )*X( IX ) 110 CONTINUE X( JX ) = TEMP JX = JX - INCX KK = KK - J 120 CONTINUE END IF ELSE KK = 1 IF( INCX.EQ.1 )THEN DO 140, J = 1, N TEMP = X( J ) IF( NOUNIT ) $ TEMP = TEMP*AP( KK ) K = KK + 1 DO 130, I = J + 1, N TEMP = TEMP + AP( K )*X( I ) K = K + 1 130 CONTINUE X( J ) = TEMP KK = KK + ( N - J + 1 ) 140 CONTINUE ELSE JX = KX DO 160, J = 1, N TEMP = X( JX ) IX = JX IF( NOUNIT ) $ TEMP = TEMP*AP( KK ) DO 150, K = KK + 1, KK + N - J IX = IX + INCX TEMP = TEMP + AP( K )*X( IX ) 150 CONTINUE X( JX ) = TEMP JX = JX + INCX KK = KK + ( N - J + 1 ) 160 CONTINUE END IF END IF END IF * RETURN * * End of STPMV . * END OpenBLAS-0.2.20/reference/stpsvf.f000066400000000000000000000226151313527062700166040ustar00rootroot00000000000000 SUBROUTINE STPSVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, N CHARACTER*1 DIAG, TRANS, UPLO * .. Array Arguments .. REAL AP( * ), X( * ) * .. * * Purpose * ======= * * STPSV solves one of the systems of equations * * A*x = b, or A'*x = b, * * where b and x are n element vectors and A is an n by n unit, or * non-unit, upper or lower triangular matrix, supplied in packed form. * * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the equations to be solved as * follows: * * TRANS = 'N' or 'n' A*x = b. * * TRANS = 'T' or 't' A'*x = b. * * TRANS = 'C' or 'c' A'*x = b. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * AP - REAL array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). * Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular matrix packed sequentially, * column by column, so that AP( 1 ) contains a( 1, 1 ), * AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) * respectively, and so on. * Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular matrix packed sequentially, * column by column, so that AP( 1 ) contains a( 1, 1 ), * AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) * respectively, and so on. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced, but are assumed to be unity. * Unchanged on exit. * * X - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element right-hand side vector b. On exit, X is overwritten * with the solution vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. Local Scalars .. REAL TEMP INTEGER I, INFO, IX, J, JX, K, KK, KX LOGICAL NOUNIT * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( INCX.EQ.0 )THEN INFO = 7 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'STPSV ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * NOUNIT = LSAME( DIAG, 'N' ) * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of AP are * accessed sequentially with one pass through AP. * IF( LSAME( TRANS, 'N' ) )THEN * * Form x := inv( A )*x. * IF( LSAME( UPLO, 'U' ) )THEN KK = ( N*( N + 1 ) )/2 IF( INCX.EQ.1 )THEN DO 20, J = N, 1, -1 IF( X( J ).NE.ZERO )THEN IF( NOUNIT ) $ X( J ) = X( J )/AP( KK ) TEMP = X( J ) K = KK - 1 DO 10, I = J - 1, 1, -1 X( I ) = X( I ) - TEMP*AP( K ) K = K - 1 10 CONTINUE END IF KK = KK - J 20 CONTINUE ELSE JX = KX + ( N - 1 )*INCX DO 40, J = N, 1, -1 IF( X( JX ).NE.ZERO )THEN IF( NOUNIT ) $ X( JX ) = X( JX )/AP( KK ) TEMP = X( JX ) IX = JX DO 30, K = KK - 1, KK - J + 1, -1 IX = IX - INCX X( IX ) = X( IX ) - TEMP*AP( K ) 30 CONTINUE END IF JX = JX - INCX KK = KK - J 40 CONTINUE END IF ELSE KK = 1 IF( INCX.EQ.1 )THEN DO 60, J = 1, N IF( X( J ).NE.ZERO )THEN IF( NOUNIT ) $ X( J ) = X( J )/AP( KK ) TEMP = X( J ) K = KK + 1 DO 50, I = J + 1, N X( I ) = X( I ) - TEMP*AP( K ) K = K + 1 50 CONTINUE END IF KK = KK + ( N - J + 1 ) 60 CONTINUE ELSE JX = KX DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN IF( NOUNIT ) $ X( JX ) = X( JX )/AP( KK ) TEMP = X( JX ) IX = JX DO 70, K = KK + 1, KK + N - J IX = IX + INCX X( IX ) = X( IX ) - TEMP*AP( K ) 70 CONTINUE END IF JX = JX + INCX KK = KK + ( N - J + 1 ) 80 CONTINUE END IF END IF ELSE * * Form x := inv( A' )*x. 
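*
*     Usage sketch (illustrative only, not part of the reference BLAS):
*     a hypothetical driver for STPSVF, assuming it is compiled with
*     this file and the LSAME/XERBLA auxiliaries.  The upper triangular
*     T = [ 1 2 3 ; 0 4 5 ; 0 0 6 ] is packed column by column, and
*     T*x = b is solved for b = ( 6, 9, 6 ), so the expected solution
*     overwriting X is all ones:
*
*           PROGRAM TPSVEX
*           REAL AP( 6 ), X( 3 )
*           INTEGER J
*           DATA AP / 1.0E0, 2.0E0, 4.0E0, 3.0E0, 5.0E0, 6.0E0 /
*           DATA X / 6.0E0, 9.0E0, 6.0E0 /
*           CALL STPSVF( 'U', 'N', 'N', 3, AP, X, 1 )
*           WRITE( *, * ) ( X( J ), J = 1, 3 )
*           END
*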
* IF( LSAME( UPLO, 'U' ) )THEN KK = 1 IF( INCX.EQ.1 )THEN DO 100, J = 1, N TEMP = X( J ) K = KK DO 90, I = 1, J - 1 TEMP = TEMP - AP( K )*X( I ) K = K + 1 90 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/AP( KK + J - 1 ) X( J ) = TEMP KK = KK + J 100 CONTINUE ELSE JX = KX DO 120, J = 1, N TEMP = X( JX ) IX = KX DO 110, K = KK, KK + J - 2 TEMP = TEMP - AP( K )*X( IX ) IX = IX + INCX 110 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/AP( KK + J - 1 ) X( JX ) = TEMP JX = JX + INCX KK = KK + J 120 CONTINUE END IF ELSE KK = ( N*( N + 1 ) )/2 IF( INCX.EQ.1 )THEN DO 140, J = N, 1, -1 TEMP = X( J ) K = KK DO 130, I = N, J + 1, -1 TEMP = TEMP - AP( K )*X( I ) K = K - 1 130 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/AP( KK - N + J ) X( J ) = TEMP KK = KK - ( N - J + 1 ) 140 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 160, J = N, 1, -1 TEMP = X( JX ) IX = KX DO 150, K = KK, KK - ( N - ( J + 1 ) ), -1 TEMP = TEMP - AP( K )*X( IX ) IX = IX - INCX 150 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/AP( KK - N + J ) X( JX ) = TEMP JX = JX - INCX KK = KK - (N - J + 1 ) 160 CONTINUE END IF END IF END IF * RETURN * * End of STPSV . * END OpenBLAS-0.2.20/reference/strmmf.f000066400000000000000000000263261313527062700165720ustar00rootroot00000000000000 SUBROUTINE STRMMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, $ B, LDB ) * .. Scalar Arguments .. CHARACTER*1 SIDE, UPLO, TRANSA, DIAG INTEGER M, N, LDA, LDB REAL ALPHA * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * STRMM performs one of the matrix-matrix operations * * B := alpha*op( A )*B, or B := alpha*B*op( A ), * * where alpha is a scalar, B is an m by n matrix, A is a unit, or * non-unit, upper or lower triangular matrix and op( A ) is one of * * op( A ) = A or op( A ) = A'. * * Parameters * ========== * * SIDE - CHARACTER*1. * On entry, SIDE specifies whether op( A ) multiplies B from * the left or right as follows: * * SIDE = 'L' or 'l' B := alpha*op( A )*B. * * SIDE = 'R' or 'r' B := alpha*B*op( A ). * * Unchanged on exit. * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix A is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANSA - CHARACTER*1. * On entry, TRANSA specifies the form of op( A ) to be used in * the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( A ) = A. * * TRANSA = 'T' or 't' op( A ) = A'. * * TRANSA = 'C' or 'c' op( A ) = A'. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit triangular * as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of B. M must be at * least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of B. N must be * at least zero. * Unchanged on exit. * * ALPHA - REAL . * On entry, ALPHA specifies the scalar alpha. When alpha is * zero then A is not referenced and B need not be set before * entry. * Unchanged on exit. * * A - REAL array of DIMENSION ( LDA, k ), where k is m * when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. * Before entry with UPLO = 'U' or 'u', the leading k by k * upper triangular part of the array A must contain the upper * triangular matrix and the strictly lower triangular part of * A is not referenced. 
* Before entry with UPLO = 'L' or 'l', the leading k by k * lower triangular part of the array A must contain the lower * triangular matrix and the strictly upper triangular part of * A is not referenced. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced either, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When SIDE = 'L' or 'l' then * LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' * then LDA must be at least max( 1, n ). * Unchanged on exit. * * B - REAL array of DIMENSION ( LDB, n ). * Before entry, the leading m by n part of the array B must * contain the matrix B, and on exit is overwritten by the * transformed matrix. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. LDB must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. Local Scalars .. LOGICAL LSIDE, NOUNIT, UPPER INTEGER I, INFO, J, K, NROWA REAL TEMP * .. Parameters .. REAL ONE , ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Executable Statements .. * * Test the input parameters. * LSIDE = LSAME( SIDE , 'L' ) IF( LSIDE )THEN NROWA = M ELSE NROWA = N END IF NOUNIT = LSAME( DIAG , 'N' ) UPPER = LSAME( UPLO , 'U' ) * INFO = 0 IF( ( .NOT.LSIDE ).AND. $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN INFO = 2 ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN INFO = 3 ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN INFO = 4 ELSE IF( M .LT.0 )THEN INFO = 5 ELSE IF( N .LT.0 )THEN INFO = 6 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 9 ELSE IF( LDB.LT.MAX( 1, M ) )THEN INFO = 11 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'STRMM ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, M B( I, J ) = ZERO 10 CONTINUE 20 CONTINUE RETURN END IF * * Start the operations. * IF( LSIDE )THEN IF( LSAME( TRANSA, 'N' ) )THEN * * Form B := alpha*A*B. * IF( UPPER )THEN DO 50, J = 1, N DO 40, K = 1, M IF( B( K, J ).NE.ZERO )THEN TEMP = ALPHA*B( K, J ) DO 30, I = 1, K - 1 B( I, J ) = B( I, J ) + TEMP*A( I, K ) 30 CONTINUE IF( NOUNIT ) $ TEMP = TEMP*A( K, K ) B( K, J ) = TEMP END IF 40 CONTINUE 50 CONTINUE ELSE DO 80, J = 1, N DO 70 K = M, 1, -1 IF( B( K, J ).NE.ZERO )THEN TEMP = ALPHA*B( K, J ) B( K, J ) = TEMP IF( NOUNIT ) $ B( K, J ) = B( K, J )*A( K, K ) DO 60, I = K + 1, M B( I, J ) = B( I, J ) + TEMP*A( I, K ) 60 CONTINUE END IF 70 CONTINUE 80 CONTINUE END IF ELSE * * Form B := alpha*A'*B. 
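*
*     Usage sketch (illustrative only, not part of the reference BLAS):
*     a hypothetical driver for STRMMF, assuming it is compiled with
*     this file and the LSAME/XERBLA auxiliaries.  With the upper
*     triangular A = [ 1 2 ; 0 3 ] applied from the left and ALPHA = 1,
*     B := A*B overwrites the all-ones B with [ 3 3 ; 3 3 ]:
*
*           PROGRAM TRMMEX
*           REAL A( 2, 2 ), B( 2, 2 )
*           INTEGER I, J
*           DATA A / 1.0E0, 0.0E0, 2.0E0, 3.0E0 /
*           DATA B / 1.0E0, 1.0E0, 1.0E0, 1.0E0 /
*           CALL STRMMF( 'L', 'U', 'N', 'N', 2, 2, 1.0E0, A, 2, B, 2 )
*           WRITE( *, * ) ( ( B( I, J ), J = 1, 2 ), I = 1, 2 )
*           END
*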
* IF( UPPER )THEN DO 110, J = 1, N DO 100, I = M, 1, -1 TEMP = B( I, J ) IF( NOUNIT ) $ TEMP = TEMP*A( I, I ) DO 90, K = 1, I - 1 TEMP = TEMP + A( K, I )*B( K, J ) 90 CONTINUE B( I, J ) = ALPHA*TEMP 100 CONTINUE 110 CONTINUE ELSE DO 140, J = 1, N DO 130, I = 1, M TEMP = B( I, J ) IF( NOUNIT ) $ TEMP = TEMP*A( I, I ) DO 120, K = I + 1, M TEMP = TEMP + A( K, I )*B( K, J ) 120 CONTINUE B( I, J ) = ALPHA*TEMP 130 CONTINUE 140 CONTINUE END IF END IF ELSE IF( LSAME( TRANSA, 'N' ) )THEN * * Form B := alpha*B*A. * IF( UPPER )THEN DO 180, J = N, 1, -1 TEMP = ALPHA IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) DO 150, I = 1, M B( I, J ) = TEMP*B( I, J ) 150 CONTINUE DO 170, K = 1, J - 1 IF( A( K, J ).NE.ZERO )THEN TEMP = ALPHA*A( K, J ) DO 160, I = 1, M B( I, J ) = B( I, J ) + TEMP*B( I, K ) 160 CONTINUE END IF 170 CONTINUE 180 CONTINUE ELSE DO 220, J = 1, N TEMP = ALPHA IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) DO 190, I = 1, M B( I, J ) = TEMP*B( I, J ) 190 CONTINUE DO 210, K = J + 1, N IF( A( K, J ).NE.ZERO )THEN TEMP = ALPHA*A( K, J ) DO 200, I = 1, M B( I, J ) = B( I, J ) + TEMP*B( I, K ) 200 CONTINUE END IF 210 CONTINUE 220 CONTINUE END IF ELSE * * Form B := alpha*B*A'. * IF( UPPER )THEN DO 260, K = 1, N DO 240, J = 1, K - 1 IF( A( J, K ).NE.ZERO )THEN TEMP = ALPHA*A( J, K ) DO 230, I = 1, M B( I, J ) = B( I, J ) + TEMP*B( I, K ) 230 CONTINUE END IF 240 CONTINUE TEMP = ALPHA IF( NOUNIT ) $ TEMP = TEMP*A( K, K ) IF( TEMP.NE.ONE )THEN DO 250, I = 1, M B( I, K ) = TEMP*B( I, K ) 250 CONTINUE END IF 260 CONTINUE ELSE DO 300, K = N, 1, -1 DO 280, J = K + 1, N IF( A( J, K ).NE.ZERO )THEN TEMP = ALPHA*A( J, K ) DO 270, I = 1, M B( I, J ) = B( I, J ) + TEMP*B( I, K ) 270 CONTINUE END IF 280 CONTINUE TEMP = ALPHA IF( NOUNIT ) $ TEMP = TEMP*A( K, K ) IF( TEMP.NE.ONE )THEN DO 290, I = 1, M B( I, K ) = TEMP*B( I, K ) 290 CONTINUE END IF 300 CONTINUE END IF END IF END IF * RETURN * * End of STRMM . * END OpenBLAS-0.2.20/reference/strmvf.f000066400000000000000000000212371313527062700165770ustar00rootroot00000000000000 SUBROUTINE STRMVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, LDA, N CHARACTER*1 DIAG, TRANS, UPLO * .. Array Arguments .. REAL A( LDA, * ), X( * ) * .. * * Purpose * ======= * * STRMV performs one of the matrix-vector operations * * x := A*x, or x := A'*x, * * where x is an n element vector and A is an n by n unit, or non-unit, * upper or lower triangular matrix. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' x := A*x. * * TRANS = 'T' or 't' x := A'*x. * * TRANS = 'C' or 'c' x := A'*x. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * A - REAL array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular matrix and the strictly lower triangular part of * A is not referenced. 
* Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular matrix and the strictly upper triangular part of * A is not referenced. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced either, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * X - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the * tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. Local Scalars .. REAL TEMP INTEGER I, INFO, IX, J, JX, KX LOGICAL NOUNIT * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 6 ELSE IF( INCX.EQ.0 )THEN INFO = 8 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'STRMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * NOUNIT = LSAME( DIAG, 'N' ) * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * IF( LSAME( TRANS, 'N' ) )THEN * * Form x := A*x. * IF( LSAME( UPLO, 'U' ) )THEN IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = X( J ) DO 10, I = 1, J - 1 X( I ) = X( I ) + TEMP*A( I, J ) 10 CONTINUE IF( NOUNIT ) $ X( J ) = X( J )*A( J, J ) END IF 20 CONTINUE ELSE JX = KX DO 40, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX DO 30, I = 1, J - 1 X( IX ) = X( IX ) + TEMP*A( I, J ) IX = IX + INCX 30 CONTINUE IF( NOUNIT ) $ X( JX ) = X( JX )*A( J, J ) END IF JX = JX + INCX 40 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 60, J = N, 1, -1 IF( X( J ).NE.ZERO )THEN TEMP = X( J ) DO 50, I = N, J + 1, -1 X( I ) = X( I ) + TEMP*A( I, J ) 50 CONTINUE IF( NOUNIT ) $ X( J ) = X( J )*A( J, J ) END IF 60 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 80, J = N, 1, -1 IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX DO 70, I = N, J + 1, -1 X( IX ) = X( IX ) + TEMP*A( I, J ) IX = IX - INCX 70 CONTINUE IF( NOUNIT ) $ X( JX ) = X( JX )*A( J, J ) END IF JX = JX - INCX 80 CONTINUE END IF END IF ELSE * * Form x := A'*x. 
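*
*     Usage sketch (illustrative only, not part of the reference BLAS):
*     a hypothetical driver for STRMVF, assuming it is compiled with
*     this file and the LSAME/XERBLA auxiliaries.  It forms the
*     transposed product x := A'*x for the upper triangular
*     A = [ 1 2 ; 0 3 ]:
*
*           PROGRAM TRMVEX
*           REAL A( 2, 2 ), X( 2 )
*           INTEGER J
*           DATA A / 1.0E0, 0.0E0, 2.0E0, 3.0E0 /
*           DATA X / 1.0E0, 1.0E0 /
*           CALL STRMVF( 'U', 'T', 'N', 2, A, 2, X, 1 )
*           WRITE( *, * ) ( X( J ), J = 1, 2 )
*           END
*
*     On exit X holds A'*x = ( 1.0, 5.0 ).
*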
* IF( LSAME( UPLO, 'U' ) )THEN IF( INCX.EQ.1 )THEN DO 100, J = N, 1, -1 TEMP = X( J ) IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) DO 90, I = J - 1, 1, -1 TEMP = TEMP + A( I, J )*X( I ) 90 CONTINUE X( J ) = TEMP 100 CONTINUE ELSE JX = KX + ( N - 1 )*INCX DO 120, J = N, 1, -1 TEMP = X( JX ) IX = JX IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) DO 110, I = J - 1, 1, -1 IX = IX - INCX TEMP = TEMP + A( I, J )*X( IX ) 110 CONTINUE X( JX ) = TEMP JX = JX - INCX 120 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 140, J = 1, N TEMP = X( J ) IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) DO 130, I = J + 1, N TEMP = TEMP + A( I, J )*X( I ) 130 CONTINUE X( J ) = TEMP 140 CONTINUE ELSE JX = KX DO 160, J = 1, N TEMP = X( JX ) IX = JX IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) DO 150, I = J + 1, N IX = IX + INCX TEMP = TEMP + A( I, J )*X( IX ) 150 CONTINUE X( JX ) = TEMP JX = JX + INCX 160 CONTINUE END IF END IF END IF * RETURN * * End of STRMV . * END OpenBLAS-0.2.20/reference/strsmf.f000066400000000000000000000277721313527062700166060ustar00rootroot00000000000000 SUBROUTINE STRSMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, $ B, LDB ) * .. Scalar Arguments .. CHARACTER*1 SIDE, UPLO, TRANSA, DIAG INTEGER M, N, LDA, LDB REAL ALPHA * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * STRSM solves one of the matrix equations * * op( A )*X = alpha*B, or X*op( A ) = alpha*B, * * where alpha is a scalar, X and B are m by n matrices, A is a unit, or * non-unit, upper or lower triangular matrix and op( A ) is one of * * op( A ) = A or op( A ) = A'. * * The matrix X is overwritten on B. * * Parameters * ========== * * SIDE - CHARACTER*1. * On entry, SIDE specifies whether op( A ) appears on the left * or right of X as follows: * * SIDE = 'L' or 'l' op( A )*X = alpha*B. * * SIDE = 'R' or 'r' X*op( A ) = alpha*B. * * Unchanged on exit. * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix A is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANSA - CHARACTER*1. * On entry, TRANSA specifies the form of op( A ) to be used in * the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( A ) = A. * * TRANSA = 'T' or 't' op( A ) = A'. * * TRANSA = 'C' or 'c' op( A ) = A'. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit triangular * as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of B. M must be at * least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of B. N must be * at least zero. * Unchanged on exit. * * ALPHA - REAL . * On entry, ALPHA specifies the scalar alpha. When alpha is * zero then A is not referenced and B need not be set before * entry. * Unchanged on exit. * * A - REAL array of DIMENSION ( LDA, k ), where k is m * when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. * Before entry with UPLO = 'U' or 'u', the leading k by k * upper triangular part of the array A must contain the upper * triangular matrix and the strictly lower triangular part of * A is not referenced. * Before entry with UPLO = 'L' or 'l', the leading k by k * lower triangular part of the array A must contain the lower * triangular matrix and the strictly upper triangular part of * A is not referenced. 
* Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced either, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When SIDE = 'L' or 'l' then * LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' * then LDA must be at least max( 1, n ). * Unchanged on exit. * * B - REAL array of DIMENSION ( LDB, n ). * Before entry, the leading m by n part of the array B must * contain the right-hand side matrix B, and on exit is * overwritten by the solution matrix X. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. LDB must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. Local Scalars .. LOGICAL LSIDE, NOUNIT, UPPER INTEGER I, INFO, J, K, NROWA REAL TEMP * .. Parameters .. REAL ONE , ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Executable Statements .. * * Test the input parameters. * LSIDE = LSAME( SIDE , 'L' ) IF( LSIDE )THEN NROWA = M ELSE NROWA = N END IF NOUNIT = LSAME( DIAG , 'N' ) UPPER = LSAME( UPLO , 'U' ) * INFO = 0 IF( ( .NOT.LSIDE ).AND. $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN INFO = 2 ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN INFO = 3 ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN INFO = 4 ELSE IF( M .LT.0 )THEN INFO = 5 ELSE IF( N .LT.0 )THEN INFO = 6 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 9 ELSE IF( LDB.LT.MAX( 1, M ) )THEN INFO = 11 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'STRSM ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, M B( I, J ) = ZERO 10 CONTINUE 20 CONTINUE RETURN END IF * * Start the operations. * IF( LSIDE )THEN IF( LSAME( TRANSA, 'N' ) )THEN * * Form B := alpha*inv( A )*B. * IF( UPPER )THEN DO 60, J = 1, N IF( ALPHA.NE.ONE )THEN DO 30, I = 1, M B( I, J ) = ALPHA*B( I, J ) 30 CONTINUE END IF DO 50, K = M, 1, -1 IF( B( K, J ).NE.ZERO )THEN IF( NOUNIT ) $ B( K, J ) = B( K, J )/A( K, K ) DO 40, I = 1, K - 1 B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) 40 CONTINUE END IF 50 CONTINUE 60 CONTINUE ELSE DO 100, J = 1, N IF( ALPHA.NE.ONE )THEN DO 70, I = 1, M B( I, J ) = ALPHA*B( I, J ) 70 CONTINUE END IF DO 90 K = 1, M IF( B( K, J ).NE.ZERO )THEN IF( NOUNIT ) $ B( K, J ) = B( K, J )/A( K, K ) DO 80, I = K + 1, M B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) 80 CONTINUE END IF 90 CONTINUE 100 CONTINUE END IF ELSE * * Form B := alpha*inv( A' )*B. 
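*
*     Usage sketch (illustrative only, not part of the reference BLAS):
*     a hypothetical driver for STRSMF, assuming it is compiled with
*     this file and the LSAME/XERBLA auxiliaries.  It solves A*X = B
*     from the left for the upper triangular A = [ 1 2 ; 0 3 ] with
*     two right-hand side columns; since B = A*[ 1 1 ; 1 1 ], the
*     solution written back into B is the all-ones matrix:
*
*           PROGRAM TRSMEX
*           REAL A( 2, 2 ), B( 2, 2 )
*           INTEGER I, J
*           DATA A / 1.0E0, 0.0E0, 2.0E0, 3.0E0 /
*           DATA B / 3.0E0, 3.0E0, 3.0E0, 3.0E0 /
*           CALL STRSMF( 'L', 'U', 'N', 'N', 2, 2, 1.0E0, A, 2, B, 2 )
*           WRITE( *, * ) ( ( B( I, J ), J = 1, 2 ), I = 1, 2 )
*           END
*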
* IF( UPPER )THEN DO 130, J = 1, N DO 120, I = 1, M TEMP = ALPHA*B( I, J ) DO 110, K = 1, I - 1 TEMP = TEMP - A( K, I )*B( K, J ) 110 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( I, I ) B( I, J ) = TEMP 120 CONTINUE 130 CONTINUE ELSE DO 160, J = 1, N DO 150, I = M, 1, -1 TEMP = ALPHA*B( I, J ) DO 140, K = I + 1, M TEMP = TEMP - A( K, I )*B( K, J ) 140 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( I, I ) B( I, J ) = TEMP 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( LSAME( TRANSA, 'N' ) )THEN * * Form B := alpha*B*inv( A ). * IF( UPPER )THEN DO 210, J = 1, N IF( ALPHA.NE.ONE )THEN DO 170, I = 1, M B( I, J ) = ALPHA*B( I, J ) 170 CONTINUE END IF DO 190, K = 1, J - 1 IF( A( K, J ).NE.ZERO )THEN DO 180, I = 1, M B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) 180 CONTINUE END IF 190 CONTINUE IF( NOUNIT )THEN TEMP = ONE/A( J, J ) DO 200, I = 1, M B( I, J ) = TEMP*B( I, J ) 200 CONTINUE END IF 210 CONTINUE ELSE DO 260, J = N, 1, -1 IF( ALPHA.NE.ONE )THEN DO 220, I = 1, M B( I, J ) = ALPHA*B( I, J ) 220 CONTINUE END IF DO 240, K = J + 1, N IF( A( K, J ).NE.ZERO )THEN DO 230, I = 1, M B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) 230 CONTINUE END IF 240 CONTINUE IF( NOUNIT )THEN TEMP = ONE/A( J, J ) DO 250, I = 1, M B( I, J ) = TEMP*B( I, J ) 250 CONTINUE END IF 260 CONTINUE END IF ELSE * * Form B := alpha*B*inv( A' ). * IF( UPPER )THEN DO 310, K = N, 1, -1 IF( NOUNIT )THEN TEMP = ONE/A( K, K ) DO 270, I = 1, M B( I, K ) = TEMP*B( I, K ) 270 CONTINUE END IF DO 290, J = 1, K - 1 IF( A( J, K ).NE.ZERO )THEN TEMP = A( J, K ) DO 280, I = 1, M B( I, J ) = B( I, J ) - TEMP*B( I, K ) 280 CONTINUE END IF 290 CONTINUE IF( ALPHA.NE.ONE )THEN DO 300, I = 1, M B( I, K ) = ALPHA*B( I, K ) 300 CONTINUE END IF 310 CONTINUE ELSE DO 360, K = 1, N IF( NOUNIT )THEN TEMP = ONE/A( K, K ) DO 320, I = 1, M B( I, K ) = TEMP*B( I, K ) 320 CONTINUE END IF DO 340, J = K + 1, N IF( A( J, K ).NE.ZERO )THEN TEMP = A( J, K ) DO 330, I = 1, M B( I, J ) = B( I, J ) - TEMP*B( I, K ) 330 CONTINUE END IF 340 CONTINUE IF( ALPHA.NE.ONE )THEN DO 350, I = 1, M B( I, K ) = ALPHA*B( I, K ) 350 CONTINUE END IF 360 CONTINUE END IF END IF END IF * RETURN * * End of STRSM . * END OpenBLAS-0.2.20/reference/strsvf.f000066400000000000000000000214741313527062700166100ustar00rootroot00000000000000 SUBROUTINE STRSVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, LDA, N CHARACTER*1 DIAG, TRANS, UPLO * .. Array Arguments .. REAL A( LDA, * ), X( * ) * .. * * Purpose * ======= * * STRSV solves one of the systems of equations * * A*x = b, or A'*x = b, * * where b and x are n element vectors and A is an n by n unit, or * non-unit, upper or lower triangular matrix. * * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the equations to be solved as * follows: * * TRANS = 'N' or 'n' A*x = b. * * TRANS = 'T' or 't' A'*x = b. * * TRANS = 'C' or 'c' A'*x = b. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. 
* * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * A - REAL array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular matrix and the strictly lower triangular part of * A is not referenced. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular matrix and the strictly upper triangular part of * A is not referenced. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced either, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * X - REAL array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element right-hand side vector b. On exit, X is overwritten * with the solution vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0E+0 ) * .. Local Scalars .. REAL TEMP INTEGER I, INFO, IX, J, JX, KX LOGICAL NOUNIT * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 6 ELSE IF( INCX.EQ.0 )THEN INFO = 8 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'STRSV ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * NOUNIT = LSAME( DIAG, 'N' ) * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * IF( LSAME( TRANS, 'N' ) )THEN * * Form x := inv( A )*x. 
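*
*     Usage sketch (illustrative only, not part of the reference BLAS):
*     a hypothetical driver for STRSVF, assuming it is compiled with
*     this file and the LSAME/XERBLA auxiliaries.  It solves A*x = b
*     for the upper triangular A = [ 1 2 ; 0 3 ] with b = ( 3, 3 ), so
*     the solution overwriting X is ( 1, 1 ):
*
*           PROGRAM TRSVEX
*           REAL A( 2, 2 ), X( 2 )
*           INTEGER J
*           DATA A / 1.0E0, 0.0E0, 2.0E0, 3.0E0 /
*           DATA X / 3.0E0, 3.0E0 /
*           CALL STRSVF( 'U', 'N', 'N', 2, A, 2, X, 1 )
*           WRITE( *, * ) ( X( J ), J = 1, 2 )
*           END
*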
* IF( LSAME( UPLO, 'U' ) )THEN IF( INCX.EQ.1 )THEN DO 20, J = N, 1, -1 IF( X( J ).NE.ZERO )THEN IF( NOUNIT ) $ X( J ) = X( J )/A( J, J ) TEMP = X( J ) DO 10, I = J - 1, 1, -1 X( I ) = X( I ) - TEMP*A( I, J ) 10 CONTINUE END IF 20 CONTINUE ELSE JX = KX + ( N - 1 )*INCX DO 40, J = N, 1, -1 IF( X( JX ).NE.ZERO )THEN IF( NOUNIT ) $ X( JX ) = X( JX )/A( J, J ) TEMP = X( JX ) IX = JX DO 30, I = J - 1, 1, -1 IX = IX - INCX X( IX ) = X( IX ) - TEMP*A( I, J ) 30 CONTINUE END IF JX = JX - INCX 40 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 60, J = 1, N IF( X( J ).NE.ZERO )THEN IF( NOUNIT ) $ X( J ) = X( J )/A( J, J ) TEMP = X( J ) DO 50, I = J + 1, N X( I ) = X( I ) - TEMP*A( I, J ) 50 CONTINUE END IF 60 CONTINUE ELSE JX = KX DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN IF( NOUNIT ) $ X( JX ) = X( JX )/A( J, J ) TEMP = X( JX ) IX = JX DO 70, I = J + 1, N IX = IX + INCX X( IX ) = X( IX ) - TEMP*A( I, J ) 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE END IF END IF ELSE * * Form x := inv( A' )*x. * IF( LSAME( UPLO, 'U' ) )THEN IF( INCX.EQ.1 )THEN DO 100, J = 1, N TEMP = X( J ) DO 90, I = 1, J - 1 TEMP = TEMP - A( I, J )*X( I ) 90 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( J, J ) X( J ) = TEMP 100 CONTINUE ELSE JX = KX DO 120, J = 1, N TEMP = X( JX ) IX = KX DO 110, I = 1, J - 1 TEMP = TEMP - A( I, J )*X( IX ) IX = IX + INCX 110 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( J, J ) X( JX ) = TEMP JX = JX + INCX 120 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 140, J = N, 1, -1 TEMP = X( J ) DO 130, I = N, J + 1, -1 TEMP = TEMP - A( I, J )*X( I ) 130 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( J, J ) X( J ) = TEMP 140 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 160, J = N, 1, -1 TEMP = X( JX ) IX = KX DO 150, I = N, J + 1, -1 TEMP = TEMP - A( I, J )*X( IX ) IX = IX - INCX 150 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( J, J ) X( JX ) = TEMP JX = JX - INCX 160 CONTINUE END IF END IF END IF * RETURN * * End of STRSV . * END OpenBLAS-0.2.20/reference/strti2f.f000066400000000000000000000101361313527062700166470ustar00rootroot00000000000000 SUBROUTINE STRTI2F( UPLO, DIAG, N, A, LDA, INFO ) * * -- LAPACK routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER DIAG, UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. REAL A( LDA, * ) * .. * * Purpose * ======= * * STRTI2 computes the inverse of a real upper or lower triangular * matrix. * * This is the Level 2 BLAS version of the algorithm. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether the matrix A is upper or lower triangular. * = 'U': Upper triangular * = 'L': Lower triangular * * DIAG (input) CHARACTER*1 * Specifies whether or not the matrix A is unit triangular. * = 'N': Non-unit triangular * = 'U': Unit triangular * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input/output) REAL array, dimension (LDA,N) * On entry, the triangular matrix A. If UPLO = 'U', the * leading n by n upper triangular part of the array A contains * the upper triangular matrix, and the strictly lower * triangular part of A is not referenced. If UPLO = 'L', the * leading n by n lower triangular part of the array A contains * the lower triangular matrix, and the strictly upper * triangular part of A is not referenced. If DIAG = 'U', the * diagonal elements of A are also not referenced and are * assumed to be 1. * * On exit, the (triangular) inverse of the original matrix, in * the same storage format. * * LDA (input) INTEGER * The leading dimension of the array A. 
LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * * ===================================================================== * * .. Parameters .. REAL ONE PARAMETER ( ONE = 1.0E+0 ) * .. * .. Local Scalars .. LOGICAL NOUNIT, UPPER INTEGER J REAL AJJ * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL SSCAL, STRMV, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) NOUNIT = LSAME( DIAG, 'N' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'STRTI2', -INFO ) RETURN END IF * IF( UPPER ) THEN * * Compute inverse of upper triangular matrix. * DO 10 J = 1, N IF( NOUNIT ) THEN A( J, J ) = ONE / A( J, J ) AJJ = -A( J, J ) ELSE AJJ = -ONE END IF * * Compute elements 1:j-1 of j-th column. * CALL STRMV( 'Upper', 'No transpose', DIAG, J-1, A, LDA, $ A( 1, J ), 1 ) CALL SSCAL( J-1, AJJ, A( 1, J ), 1 ) 10 CONTINUE ELSE * * Compute inverse of lower triangular matrix. * DO 20 J = N, 1, -1 IF( NOUNIT ) THEN A( J, J ) = ONE / A( J, J ) AJJ = -A( J, J ) ELSE AJJ = -ONE END IF IF( J.LT.N ) THEN * * Compute elements j+1:n of j-th column. * CALL STRMV( 'Lower', 'No transpose', DIAG, N-J, $ A( J+1, J+1 ), LDA, A( J+1, J ), 1 ) CALL SSCAL( N-J, AJJ, A( J+1, J ), 1 ) END IF 20 CONTINUE END IF * RETURN * * End of STRTI2 * END OpenBLAS-0.2.20/reference/strtrif.f000066400000000000000000000121601313527062700167460ustar00rootroot00000000000000 SUBROUTINE STRTRIF( UPLO, DIAG, N, A, LDA, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * March 31, 1993 * * .. Scalar Arguments .. CHARACTER DIAG, UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. REAL A( LDA, * ) * .. * * Purpose * ======= * * STRTRI computes the inverse of a real upper or lower triangular * matrix A. * * This is the Level 3 BLAS version of the algorithm. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * = 'U': A is upper triangular; * = 'L': A is lower triangular. * * DIAG (input) CHARACTER*1 * = 'N': A is non-unit triangular; * = 'U': A is unit triangular. * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input/output) REAL array, dimension (LDA,N) * On entry, the triangular matrix A. If UPLO = 'U', the * leading N-by-N upper triangular part of the array A contains * the upper triangular matrix, and the strictly lower * triangular part of A is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of the array A contains * the lower triangular matrix, and the strictly upper * triangular part of A is not referenced. If DIAG = 'U', the * diagonal elements of A are also not referenced and are * assumed to be 1. * On exit, the (triangular) inverse of the original matrix, in * the same storage format. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, A(i,i) is exactly zero. The triangular * matrix is singular and its inverse can not be computed. 
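*
*     Usage sketch (illustrative only, not part of the reference
*     LAPACK): a hypothetical driver for STRTRIF, assuming it is
*     linked with this file and the auxiliaries it calls (STRTI2,
*     STRMM, STRSM, and through them STRMV, SSCAL, LSAME, XERBLA).
*     It inverts the upper triangular A = [ 2 1 ; 0 4 ] in place;
*     on success INFO = 0 and A holds
*     inv(A) = [ 0.5 -0.125 ; 0.0 0.25 ]:
*
*           PROGRAM TRTRIEX
*           REAL A( 2, 2 )
*           INTEGER INFO, I, J
*           DATA A / 2.0E0, 0.0E0, 1.0E0, 4.0E0 /
*           CALL STRTRIF( 'U', 'N', 2, A, 2, INFO )
*           WRITE( *, * ) INFO, ( ( A( I, J ), J = 1, 2 ), I = 1, 2 )
*           END
*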
* * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL NOUNIT, UPPER INTEGER J, JB, NB, NN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL STRMM, STRSM, STRTI2, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) NOUNIT = LSAME( DIAG, 'N' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'STRTRI', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Check for singularity if non-unit. * IF( NOUNIT ) THEN DO 10 INFO = 1, N IF( A( INFO, INFO ).EQ.ZERO ) $ RETURN 10 CONTINUE INFO = 0 END IF * * Determine the block size for this environment. * NB = 128 IF( NB.LE.1 .OR. NB.GE.N ) THEN * * Use unblocked code * CALL STRTI2( UPLO, DIAG, N, A, LDA, INFO ) ELSE * * Use blocked code * IF( UPPER ) THEN * * Compute inverse of upper triangular matrix * DO 20 J = 1, N, NB JB = MIN( NB, N-J+1 ) * * Compute rows 1:j-1 of current block column * CALL STRMM( 'Left', 'Upper', 'No transpose', DIAG, J-1, $ JB, ONE, A, LDA, A( 1, J ), LDA ) CALL STRSM( 'Right', 'Upper', 'No transpose', DIAG, J-1, $ JB, -ONE, A( J, J ), LDA, A( 1, J ), LDA ) * * Compute inverse of current diagonal block * CALL STRTI2( 'Upper', DIAG, JB, A( J, J ), LDA, INFO ) 20 CONTINUE ELSE * * Compute inverse of lower triangular matrix * NN = ( ( N-1 ) / NB )*NB + 1 DO 30 J = NN, 1, -NB JB = MIN( NB, N-J+1 ) IF( J+JB.LE.N ) THEN * * Compute rows j+jb:n of current block column * CALL STRMM( 'Left', 'Lower', 'No transpose', DIAG, $ N-J-JB+1, JB, ONE, A( J+JB, J+JB ), LDA, $ A( J+JB, J ), LDA ) CALL STRSM( 'Right', 'Lower', 'No transpose', DIAG, $ N-J-JB+1, JB, -ONE, A( J, J ), LDA, $ A( J+JB, J ), LDA ) END IF * * Compute inverse of current diagonal block * CALL STRTI2( 'Lower', DIAG, JB, A( J, J ), LDA, INFO ) 30 CONTINUE END IF END IF * RETURN * * End of STRTRI * END OpenBLAS-0.2.20/reference/zaxpycf.f000066400000000000000000000016161313527062700167410ustar00rootroot00000000000000 subroutine zaxpycf(n,za,zx,incx,zy,incy) c c constant times a vector plus a vector. c jack dongarra, 3/11/78. c modified 12/3/93, array(1) declarations changed to array(*) c double complex zx(*),zy(*),za integer i,incx,incy,ix,iy,n double precision dcabs1 INTRINSIC dconjg if(n.le.0)return if (dcabs1(za) .eq. 0.0d0) return if (incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments c not equal to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n zy(iy) = zy(iy) + za*dconjg(zx(ix)) ix = ix + incx iy = iy + incy 10 continue return c c code for both increments equal to 1 c 20 do 30 i = 1,n zy(i) = zy(i) + za*dconjg(zx(i)) 30 continue return end OpenBLAS-0.2.20/reference/zaxpyf.f000066400000000000000000000015341313527062700165750ustar00rootroot00000000000000 subroutine zaxpyf(n,za,zx,incx,zy,incy) c c constant times a vector plus a vector. c jack dongarra, 3/11/78. c modified 12/3/93, array(1) declarations changed to array(*) c double complex zx(*),zy(*),za integer i,incx,incy,ix,iy,n double precision dcabs1 if(n.le.0)return if (dcabs1(za) .eq. 
0.0d0) return if (incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments c not equal to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n zy(iy) = zy(iy) + za*zx(ix) ix = ix + incx iy = iy + incy 10 continue return c c code for both increments equal to 1 c 20 do 30 i = 1,n zy(i) = zy(i) + za*zx(i) 30 continue return end OpenBLAS-0.2.20/reference/zcopyf.f000066400000000000000000000014011313527062700165570ustar00rootroot00000000000000 subroutine zcopyf(n,zx,incx,zy,incy) c c copies a vector, x, to a vector, y. c jack dongarra, linpack, 4/11/78. c modified 12/3/93, array(1) declarations changed to array(*) c double complex zx(*),zy(*) integer i,incx,incy,ix,iy,n c if(n.le.0)return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments c not equal to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n zy(iy) = zx(ix) ix = ix + incx iy = iy + incy 10 continue return c c code for both increments equal to 1 c 20 do 30 i = 1,n zy(i) = zx(i) 30 continue return end OpenBLAS-0.2.20/reference/zdotcf.f000066400000000000000000000016261313527062700165470ustar00rootroot00000000000000 double complex function zdotcf(n,zx,incx,zy,incy) c c forms the dot product of a vector. c jack dongarra, 3/11/78. c modified 12/3/93, array(1) declarations changed to array(*) c double complex zx(*),zy(*),ztemp integer i,incx,incy,ix,iy,n ztemp = (0.0d0,0.0d0) zdotcf = (0.0d0,0.0d0) if(n.le.0)return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments c not equal to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n ztemp = ztemp + dconjg(zx(ix))*zy(iy) ix = ix + incx iy = iy + incy 10 continue zdotcf = ztemp return c c code for both increments equal to 1 c 20 do 30 i = 1,n ztemp = ztemp + dconjg(zx(i))*zy(i) 30 continue zdotcf = ztemp return end OpenBLAS-0.2.20/reference/zdotuf.f000066400000000000000000000016111313527062700165630ustar00rootroot00000000000000 double complex function zdotuf(n,zx,incx,zy,incy) c c forms the dot product of two vectors. c jack dongarra, 3/11/78. c modified 12/3/93, array(1) declarations changed to array(*) c double complex zx(*),zy(*),ztemp integer i,incx,incy,ix,iy,n ztemp = (0.0d0,0.0d0) zdotuf = (0.0d0,0.0d0) if(n.le.0)return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments c not equal to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n ztemp = ztemp + zx(ix)*zy(iy) ix = ix + incx iy = iy + incy 10 continue zdotuf = ztemp return c c code for both increments equal to 1 c 20 do 30 i = 1,n ztemp = ztemp + zx(i)*zy(i) 30 continue zdotuf = ztemp return end OpenBLAS-0.2.20/reference/zdrotf.f000066400000000000000000000017631313527062700165700ustar00rootroot00000000000000 subroutine zdrotf (n,zx,incx,zy,incy,c,s) c c applies a plane rotation, where the cos and sin (c and s) are c double precision and the vectors zx and zy are double complex. c jack dongarra, linpack, 3/11/78. 
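c
c     usage sketch (illustrative only, not part of the reference blas):
c     a hypothetical driver for the complex level 1 routines zaxpyf and
c     zdotcf above, assuming it is compiled with zaxpyf.f, zdotcf.f and
c     the dcabs1 auxiliary that zaxpyf references.
c
c           program zl1ex
c           double complex zx(2),zy(2),za,zdotcf
c           external zdotcf
c           data zx /(1.0d0,1.0d0),(2.0d0,0.0d0)/
c           data zy /(0.0d0,0.0d0),(1.0d0,-1.0d0)/
c           za = (2.0d0,0.0d0)
c           call zaxpyf(2,za,zx,1,zy,1)
c           write(*,*) zdotcf(2,zx,1,zy,1)
c           end
c
c     the call updates zy := za*zx + zy, and the printed dot product
c     conjg(zx(1))*zy(1) + conjg(zx(2))*zy(2) equals (14.0,-2.0).
c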
c double complex zx(1),zy(1),ztemp double precision c,s integer i,incx,incy,ix,iy,n c if(n.le.0)return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments not equal c to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n ztemp = c*zx(ix) + s*zy(iy) zy(iy) = c*zy(iy) - s*zx(ix) zx(ix) = ztemp ix = ix + incx iy = iy + incy 10 continue return c c code for both increments equal to 1 c 20 do 30 i = 1,n ztemp = c*zx(i) + s*zy(i) zy(i) = c*zy(i) - s*zx(i) zx(i) = ztemp 30 continue return end OpenBLAS-0.2.20/reference/zdscalf.f000066400000000000000000000012541313527062700167010ustar00rootroot00000000000000 subroutine zdscalf(n,da,zx,incx) c c scales a vector by a constant. c jack dongarra, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c double complex zx(*) double precision da integer i,incx,ix,n c if( n.le.0 .or. incx.le.0 )return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 do 10 i = 1,n zx(ix) = dcmplx(da,0.0d0)*zx(ix) ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 do 30 i = 1,n zx(i) = dcmplx(da,0.0d0)*zx(i) 30 continue return end OpenBLAS-0.2.20/reference/zgbmvf.f000066400000000000000000000336171313527062700165560ustar00rootroot00000000000000 SUBROUTINE ZGBMVF( TRANS, M, N, KL, KU, ALPHA, A, LDA, X, INCX, $ BETA, Y, INCY ) * .. Scalar Arguments .. COMPLEX*16 ALPHA, BETA INTEGER INCX, INCY, KL, KU, LDA, M, N CHARACTER*1 TRANS * .. Array Arguments .. COMPLEX*16 A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * ZGBMV performs one of the matrix-vector operations * * y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, or * * y := alpha*conjg( A' )*x + beta*y, * * where alpha and beta are scalars, x and y are vectors and A is an * m by n band matrix, with kl sub-diagonals and ku super-diagonals. * * Parameters * ========== * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' y := alpha*A*x + beta*y. * * TRANS = 'T' or 't' y := alpha*A'*x + beta*y. * * TRANS = 'C' or 'c' y := alpha*conjg( A' )*x + beta*y. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix A. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * Unchanged on exit. * * KL - INTEGER. * On entry, KL specifies the number of sub-diagonals of the * matrix A. KL must satisfy 0 .le. KL. * Unchanged on exit. * * KU - INTEGER. * On entry, KU specifies the number of super-diagonals of the * matrix A. KU must satisfy 0 .le. KU. * Unchanged on exit. * * ALPHA - COMPLEX*16 . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, n ). * Before entry, the leading ( kl + ku + 1 ) by n part of the * array A must contain the matrix of coefficients, supplied * column by column, with the leading diagonal of the matrix in * row ( ku + 1 ) of the array, the first super-diagonal * starting at position 2 in row ku, the first sub-diagonal * starting at position 1 in row ( ku + 2 ), and so on. * Elements in the array A that do not correspond to elements * in the band matrix (such as the top left ku by ku triangle) * are not referenced. 
* The following program segment will transfer a band matrix * from conventional full matrix storage to band storage: * * DO 20, J = 1, N * K = KU + 1 - J * DO 10, I = MAX( 1, J - KU ), MIN( M, J + KL ) * A( K + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * ( kl + ku + 1 ). * Unchanged on exit. * * X - COMPLEX*16 array of DIMENSION at least * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. * Before entry, the incremented array X must contain the * vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - COMPLEX*16 . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y - COMPLEX*16 array of DIMENSION at least * ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. * Before entry, the incremented array Y must contain the * vector y. On exit, Y is overwritten by the updated vector y. * * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. Local Scalars .. COMPLEX*16 TEMP INTEGER I, INFO, IX, IY, J, JX, JY, K, KUP1, KX, KY, $ LENX, LENY LOGICAL NOCONJ, NOTRANS, XCONJ * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC DCONJG, MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'R' ).AND. $ .NOT.LSAME( TRANS, 'C' ).AND. $ .NOT.LSAME( TRANS, 'O' ).AND. $ .NOT.LSAME( TRANS, 'U' ).AND. $ .NOT.LSAME( TRANS, 'S' ).AND. $ .NOT.LSAME( TRANS, 'D' ) )THEN INFO = 1 ELSE IF( M.LT.0 )THEN INFO = 2 ELSE IF( N.LT.0 )THEN INFO = 3 ELSE IF( KL.LT.0 )THEN INFO = 4 ELSE IF( KU.LT.0 )THEN INFO = 5 ELSE IF( LDA.LT.( KL + KU + 1 ) )THEN INFO = 8 ELSE IF( INCX.EQ.0 )THEN INFO = 10 ELSE IF( INCY.EQ.0 )THEN INFO = 13 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZGBMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * NOCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'U' )) NOTRANS = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ) $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'S' )) XCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) $ .OR. LSAME( TRANS, 'R' ) .OR. LSAME( TRANS, 'C' )) * * Set LENX and LENY, the lengths of the vectors x and y, and set * up the start points in X and Y. * IF(NOTRANS)THEN LENX = N LENY = M ELSE LENX = M LENY = N END IF IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( LENX - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( LENY - 1 )*INCY END IF * * Start the operations. 
In this version the elements of A are * accessed sequentially with one pass through the band part of A. * * First form y := beta*y. * IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, LENY Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, LENY Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, LENY Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, LENY Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN KUP1 = KU + 1 IF(XCONJ)THEN IF(NOTRANS)THEN * * Form y := alpha*A*x + y. * JX = KX IF( INCY.EQ.1 )THEN DO 60, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*X( JX ) K = KUP1 - J IF( NOCONJ )THEN DO 50, I = MAX( 1, J - KU ), MIN( M, J + KL ) Y( I ) = Y( I ) + TEMP*A( K + I, J ) 50 CONTINUE ELSE DO 55, I = MAX( 1, J - KU ), MIN( M, J + KL ) Y( I ) = Y( I ) + TEMP*DCONJG(A( K + I, J )) 55 CONTINUE END IF END IF JX = JX + INCX 60 CONTINUE ELSE DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*X( JX ) IY = KY K = KUP1 - J IF( NOCONJ )THEN DO 70, I = MAX( 1, J - KU ), MIN( M, J + KL ) Y( IY ) = Y( IY ) + TEMP*A( K + I, J ) IY = IY + INCY 70 CONTINUE ELSE DO 75, I = MAX( 1, J - KU ), MIN( M, J + KL ) Y( IY ) = Y( IY ) + TEMP*DCONJG(A( K + I, J )) IY = IY + INCY 75 CONTINUE END IF END IF JX = JX + INCX IF( J.GT.KU ) $ KY = KY + INCY 80 CONTINUE END IF ELSE * * Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. * JY = KY IF( INCX.EQ.1 )THEN DO 110, J = 1, N TEMP = ZERO K = KUP1 - J IF( NOCONJ )THEN DO 90, I = MAX( 1, J - KU ), MIN( M, J + KL ) TEMP = TEMP + A( K + I, J )*X( I ) 90 CONTINUE ELSE DO 100, I = MAX( 1, J - KU ), MIN( M, J + KL ) TEMP = TEMP + DCONJG( A( K + I, J ) )*X( I ) 100 CONTINUE END IF Y( JY ) = Y( JY ) + ALPHA*TEMP JY = JY + INCY 110 CONTINUE ELSE DO 140, J = 1, N TEMP = ZERO IX = KX K = KUP1 - J IF( NOCONJ )THEN DO 120, I = MAX( 1, J - KU ), MIN( M, J + KL ) TEMP = TEMP + A( K + I, J )*X( IX ) IX = IX + INCX 120 CONTINUE ELSE DO 130, I = MAX( 1, J - KU ), MIN( M, J + KL ) TEMP = TEMP + DCONJG( A( K + I, J ) )*X( IX ) IX = IX + INCX 130 CONTINUE END IF Y( JY ) = Y( JY ) + ALPHA*TEMP JY = JY + INCY IF( J.GT.KU ) $ KX = KX + INCX 140 CONTINUE END IF END IF ELSE IF(NOTRANS)THEN * * Form y := alpha*A*x + y. * JX = KX IF( INCY.EQ.1 )THEN DO 160, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*DCONJG(X( JX )) K = KUP1 - J IF( NOCONJ )THEN DO 150, I = MAX( 1, J - KU ), MIN( M, J + KL ) Y( I ) = Y( I ) + TEMP*A( K + I, J ) 150 CONTINUE ELSE DO 155, I = MAX( 1, J - KU ), MIN( M, J + KL ) Y( I ) = Y( I ) + TEMP*DCONJG(A( K + I, J )) 155 CONTINUE END IF END IF JX = JX + INCX 160 CONTINUE ELSE DO 180, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*DCONJG(X( JX )) IY = KY K = KUP1 - J IF( NOCONJ )THEN DO 170, I = MAX( 1, J - KU ), MIN( M, J + KL ) Y( IY ) = Y( IY ) + TEMP*A( K + I, J ) IY = IY + INCY 170 CONTINUE ELSE DO 175, I = MAX( 1, J - KU ), MIN( M, J + KL ) Y( IY ) = Y( IY ) + TEMP*DCONJG(A( K + I, J )) IY = IY + INCY 175 CONTINUE END IF END IF JX = JX + INCX IF( J.GT.KU ) $ KY = KY + INCY 180 CONTINUE END IF ELSE * * Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. 
* JY = KY IF( INCX.EQ.1 )THEN DO 210, J = 1, N TEMP = ZERO K = KUP1 - J IF( NOCONJ )THEN DO 190, I = MAX( 1, J - KU ), MIN( M, J + KL ) TEMP = TEMP + A( K + I, J )*DCONJG(X( I )) 190 CONTINUE ELSE DO 200, I = MAX( 1, J - KU ), MIN( M, J + KL ) TEMP = TEMP + DCONJG( A( K + I, J ) )*DCONJG(X( I )) 200 CONTINUE END IF Y( JY ) = Y( JY ) + ALPHA*TEMP JY = JY + INCY 210 CONTINUE ELSE DO 240, J = 1, N TEMP = ZERO IX = KX K = KUP1 - J IF( NOCONJ )THEN DO 220, I = MAX( 1, J - KU ), MIN( M, J + KL ) TEMP = TEMP + A( K + I, J )*DCONJG(X( IX )) IX = IX + INCX 220 CONTINUE ELSE DO 230, I = MAX( 1, J - KU ), MIN( M, J + KL ) TEMP = TEMP + DCONJG( A( K + I, J ) )*DCONJG(X(IX )) IX = IX + INCX 230 CONTINUE END IF Y( JY ) = Y( JY ) + ALPHA*TEMP JY = JY + INCY IF( J.GT.KU ) $ KX = KX + INCX 240 CONTINUE END IF END IF END IF * RETURN * * End of ZGBMV . * END OpenBLAS-0.2.20/reference/zgemm3mf.f000066400000000000000000000313421313527062700170010ustar00rootroot00000000000000 SUBROUTINE ZGEMM3MF(TRA,TRB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) * .. Scalar Arguments .. DOUBLE COMPLEX ALPHA,BETA INTEGER K,LDA,LDB,LDC,M,N CHARACTER TRA,TRB * .. * .. Array Arguments .. DOUBLE COMPLEX A(LDA,*),B(LDB,*),C(LDC,*) * .. * * Purpose * ======= * * ZGEMM performs one of the matrix-matrix operations * * C := alpha*op( A )*op( B ) + beta*C, * * where op( X ) is one of * * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), * * alpha and beta are scalars, and A, B and C are matrices, with op( A ) * an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. * * Arguments * ========== * * TRA - CHARACTER*1. * On entry, TRA specifies the form of op( A ) to be used in * the matrix multiplication as follows: * * TRA = 'N' or 'n', op( A ) = A. * * TRA = 'T' or 't', op( A ) = A'. * * TRA = 'C' or 'c', op( A ) = conjg( A' ). * * Unchanged on exit. * * TRB - CHARACTER*1. * On entry, TRB specifies the form of op( B ) to be used in * the matrix multiplication as follows: * * TRB = 'N' or 'n', op( B ) = B. * * TRB = 'T' or 't', op( B ) = B'. * * TRB = 'C' or 'c', op( B ) = conjg( B' ). * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix * op( A ) and of the matrix C. M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix * op( B ) and the number of columns of the matrix C. N must be * at least zero. * Unchanged on exit. * * K - INTEGER. * On entry, K specifies the number of columns of the matrix * op( A ) and the number of rows of the matrix op( B ). K must * be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX*16 . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is * k when TRA = 'N' or 'n', and is m otherwise. * Before entry with TRA = 'N' or 'n', the leading m by k * part of the array A must contain the matrix A, otherwise * the leading k by m part of the array A must contain the * matrix A. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When TRA = 'N' or 'n' then * LDA must be at least max( 1, m ), otherwise LDA must be at * least max( 1, k ). * Unchanged on exit. * * B - COMPLEX*16 array of DIMENSION ( LDB, kb ), where kb is * n when TRB = 'N' or 'n', and is k otherwise. 
* Before entry with TRB = 'N' or 'n', the leading k by n * part of the array B must contain the matrix B, otherwise * the leading n by k part of the array B must contain the * matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. When TRB = 'N' or 'n' then * LDB must be at least max( 1, k ), otherwise LDB must be at * least max( 1, n ). * Unchanged on exit. * * BETA - COMPLEX*16 . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then C need not be set on input. * Unchanged on exit. * * C - COMPLEX*16 array of DIMENSION ( LDC, n ). * Before entry, the leading m by n part of the array C must * contain the matrix C, except when beta is zero, in which * case C need not be set on entry. * On exit, the array C is overwritten by the m by n matrix * ( alpha*op( A )*op( B ) + beta*C ). * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC DCONJG,MAX * .. * .. Local Scalars .. DOUBLE COMPLEX TEMP INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB LOGICAL CONJA,CONJB,NOTA,NOTB * .. * .. Parameters .. DOUBLE COMPLEX ONE PARAMETER (ONE= (1.0D+0,0.0D+0)) DOUBLE COMPLEX ZERO PARAMETER (ZERO= (0.0D+0,0.0D+0)) * .. * * Set NOTA and NOTB as true if A and B respectively are not * conjugated or transposed, set CONJA and CONJB as true if A and * B respectively are to be transposed but not conjugated and set * NROWA, NCOLA and NROWB as the number of rows and columns of A * and the number of rows of B respectively. * NOTA = LSAME(TRA,'N') NOTB = LSAME(TRB,'N') CONJA = LSAME(TRA,'C') CONJB = LSAME(TRB,'C') IF (NOTA) THEN NROWA = M NCOLA = K ELSE NROWA = K NCOLA = M END IF IF (NOTB) THEN NROWB = K ELSE NROWB = N END IF * * Test the input parameters. * INFO = 0 IF ((.NOT.NOTA) .AND. (.NOT.CONJA) .AND. + (.NOT.LSAME(TRA,'T'))) THEN INFO = 1 ELSE IF ((.NOT.NOTB) .AND. (.NOT.CONJB) .AND. + (.NOT.LSAME(TRB,'T'))) THEN INFO = 2 ELSE IF (M.LT.0) THEN INFO = 3 ELSE IF (N.LT.0) THEN INFO = 4 ELSE IF (K.LT.0) THEN INFO = 5 ELSE IF (LDA.LT.MAX(1,NROWA)) THEN INFO = 8 ELSE IF (LDB.LT.MAX(1,NROWB)) THEN INFO = 10 ELSE IF (LDC.LT.MAX(1,M)) THEN INFO = 13 END IF IF (INFO.NE.0) THEN CALL XERBLA('ZGEMM ',INFO) RETURN END IF * * Quick return if possible. * IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN * * And when alpha.eq.zero. * IF (ALPHA.EQ.ZERO) THEN IF (BETA.EQ.ZERO) THEN DO 20 J = 1,N DO 10 I = 1,M C(I,J) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1,N DO 30 I = 1,M C(I,J) = BETA*C(I,J) 30 CONTINUE 40 CONTINUE END IF RETURN END IF * * Start the operations. * IF (NOTB) THEN IF (NOTA) THEN * * Form C := alpha*A*B + beta*C. 
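*
*              Explanatory note (a reader's sketch, not part of the
*              original netlib text): column J of C is first scaled by
*              beta, and the product is then accumulated as axpy updates
*
*                 C( 1:M, J ) = C( 1:M, J ) + ( ALPHA*B( L, J ) )*A( 1:M, L )
*
*              so the innermost loop walks down a column with unit
*              stride in column-major storage.
*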
* DO 90 J = 1,N IF (BETA.EQ.ZERO) THEN DO 50 I = 1,M C(I,J) = ZERO 50 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 60 I = 1,M C(I,J) = BETA*C(I,J) 60 CONTINUE END IF DO 80 L = 1,K IF (B(L,J).NE.ZERO) THEN TEMP = ALPHA*B(L,J) DO 70 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 70 CONTINUE END IF 80 CONTINUE 90 CONTINUE ELSE IF (CONJA) THEN * * Form C := alpha*conjg( A' )*B + beta*C. * DO 120 J = 1,N DO 110 I = 1,M TEMP = ZERO DO 100 L = 1,K TEMP = TEMP + DCONJG(A(L,I))*B(L,J) 100 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 110 CONTINUE 120 CONTINUE ELSE * * Form C := alpha*A'*B + beta*C * DO 150 J = 1,N DO 140 I = 1,M TEMP = ZERO DO 130 L = 1,K TEMP = TEMP + A(L,I)*B(L,J) 130 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 140 CONTINUE 150 CONTINUE END IF ELSE IF (NOTA) THEN IF (CONJB) THEN * * Form C := alpha*A*conjg( B' ) + beta*C. * DO 200 J = 1,N IF (BETA.EQ.ZERO) THEN DO 160 I = 1,M C(I,J) = ZERO 160 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 170 I = 1,M C(I,J) = BETA*C(I,J) 170 CONTINUE END IF DO 190 L = 1,K IF (B(J,L).NE.ZERO) THEN TEMP = ALPHA*DCONJG(B(J,L)) DO 180 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 180 CONTINUE END IF 190 CONTINUE 200 CONTINUE ELSE * * Form C := alpha*A*B' + beta*C * DO 250 J = 1,N IF (BETA.EQ.ZERO) THEN DO 210 I = 1,M C(I,J) = ZERO 210 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 220 I = 1,M C(I,J) = BETA*C(I,J) 220 CONTINUE END IF DO 240 L = 1,K IF (B(J,L).NE.ZERO) THEN TEMP = ALPHA*B(J,L) DO 230 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 230 CONTINUE END IF 240 CONTINUE 250 CONTINUE END IF ELSE IF (CONJA) THEN IF (CONJB) THEN * * Form C := alpha*conjg( A' )*conjg( B' ) + beta*C. * DO 280 J = 1,N DO 270 I = 1,M TEMP = ZERO DO 260 L = 1,K TEMP = TEMP + DCONJG(A(L,I))*DCONJG(B(J,L)) 260 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 270 CONTINUE 280 CONTINUE ELSE * * Form C := alpha*conjg( A' )*B' + beta*C * DO 310 J = 1,N DO 300 I = 1,M TEMP = ZERO DO 290 L = 1,K TEMP = TEMP + DCONJG(A(L,I))*B(J,L) 290 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 300 CONTINUE 310 CONTINUE END IF ELSE IF (CONJB) THEN * * Form C := alpha*A'*conjg( B' ) + beta*C * DO 340 J = 1,N DO 330 I = 1,M TEMP = ZERO DO 320 L = 1,K TEMP = TEMP + A(L,I)*DCONJG(B(J,L)) 320 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 330 CONTINUE 340 CONTINUE ELSE * * Form C := alpha*A'*B' + beta*C * DO 370 J = 1,N DO 360 I = 1,M TEMP = ZERO DO 350 L = 1,K TEMP = TEMP + A(L,I)*B(J,L) 350 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 360 CONTINUE 370 CONTINUE END IF END IF * RETURN * * End of ZGEMM . * END OpenBLAS-0.2.20/reference/zgemmf.f000066400000000000000000000314241313527062700165420ustar00rootroot00000000000000 SUBROUTINE ZGEMMF(TRANA,TRANB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) * .. Scalar Arguments .. DOUBLE COMPLEX ALPHA,BETA INTEGER K,LDA,LDB,LDC,M,N CHARACTER TRANA,TRANB * .. * .. Array Arguments .. DOUBLE COMPLEX A(LDA,*),B(LDB,*),C(LDC,*) * .. * * Purpose * ======= * * ZGEMM performs one of the matrix-matrix operations * * C := alpha*op( A )*op( B ) + beta*C, * * where op( X ) is one of * * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), * * alpha and beta are scalars, and A, B and C are matrices, with op( A ) * an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. 
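*
*  Illustrative call (a reader's sketch, not part of the original
*  netlib header; the copy in this file is named ZGEMMF, while the
*  standard BLAS entry point is ZGEMM). Assuming the caller has
*  declared double complex arrays A(2,2), B(2,2), C(2,2), then
*
*     CALL ZGEMM( 'N', 'N', 2, 2, 2, (1.0D0,0.0D0), A, 2, B, 2,
*    $            (0.0D0,0.0D0), C, 2 )
*
*  overwrites C with the product A*B (alpha = 1, beta = 0, no
*  transposition of either operand).
*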
* * Arguments * ========== * * TRANA - CHARACTER*1. * On entry, TRANA specifies the form of op( A ) to be used in * the matrix multiplication as follows: * * TRANA = 'N' or 'n', op( A ) = A. * * TRANA = 'T' or 't', op( A ) = A'. * * TRANA = 'C' or 'c', op( A ) = conjg( A' ). * * Unchanged on exit. * * TRANB - CHARACTER*1. * On entry, TRANB specifies the form of op( B ) to be used in * the matrix multiplication as follows: * * TRANB = 'N' or 'n', op( B ) = B. * * TRANB = 'T' or 't', op( B ) = B'. * * TRANB = 'C' or 'c', op( B ) = conjg( B' ). * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix * op( A ) and of the matrix C. M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix * op( B ) and the number of columns of the matrix C. N must be * at least zero. * Unchanged on exit. * * K - INTEGER. * On entry, K specifies the number of columns of the matrix * op( A ) and the number of rows of the matrix op( B ). K must * be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX*16 . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is * k when TRANA = 'N' or 'n', and is m otherwise. * Before entry with TRANA = 'N' or 'n', the leading m by k * part of the array A must contain the matrix A, otherwise * the leading k by m part of the array A must contain the * matrix A. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When TRANA = 'N' or 'n' then * LDA must be at least max( 1, m ), otherwise LDA must be at * least max( 1, k ). * Unchanged on exit. * * B - COMPLEX*16 array of DIMENSION ( LDB, kb ), where kb is * n when TRANB = 'N' or 'n', and is k otherwise. * Before entry with TRANB = 'N' or 'n', the leading k by n * part of the array B must contain the matrix B, otherwise * the leading n by k part of the array B must contain the * matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. When TRANB = 'N' or 'n' then * LDB must be at least max( 1, k ), otherwise LDB must be at * least max( 1, n ). * Unchanged on exit. * * BETA - COMPLEX*16 . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then C need not be set on input. * Unchanged on exit. * * C - COMPLEX*16 array of DIMENSION ( LDC, n ). * Before entry, the leading m by n part of the array C must * contain the matrix C, except when beta is zero, in which * case C need not be set on entry. * On exit, the array C is overwritten by the m by n matrix * ( alpha*op( A )*op( B ) + beta*C ). * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC DCONJG,MAX * .. * .. Local Scalars .. DOUBLE COMPLEX TEMP INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB LOGICAL CONJA,CONJB,NOTA,NOTB * .. * .. Parameters .. 
DOUBLE COMPLEX ONE PARAMETER (ONE= (1.0D+0,0.0D+0)) DOUBLE COMPLEX ZERO PARAMETER (ZERO= (0.0D+0,0.0D+0)) * .. * * Set NOTA and NOTB as true if A and B respectively are not * conjugated or transposed, set CONJA and CONJB as true if A and * B respectively are to be transposed but not conjugated and set * NROWA, NCOLA and NROWB as the number of rows and columns of A * and the number of rows of B respectively. * NOTA = LSAME(TRANA,'N') NOTB = LSAME(TRANB,'N') CONJA = LSAME(TRANA,'C') CONJB = LSAME(TRANB,'C') IF (NOTA) THEN NROWA = M NCOLA = K ELSE NROWA = K NCOLA = M END IF IF (NOTB) THEN NROWB = K ELSE NROWB = N END IF * * Test the input parameters. * INFO = 0 IF ((.NOT.NOTA) .AND. (.NOT.CONJA) .AND. + (.NOT.LSAME(TRANA,'T'))) THEN INFO = 1 ELSE IF ((.NOT.NOTB) .AND. (.NOT.CONJB) .AND. + (.NOT.LSAME(TRANB,'T'))) THEN INFO = 2 ELSE IF (M.LT.0) THEN INFO = 3 ELSE IF (N.LT.0) THEN INFO = 4 ELSE IF (K.LT.0) THEN INFO = 5 ELSE IF (LDA.LT.MAX(1,NROWA)) THEN INFO = 8 ELSE IF (LDB.LT.MAX(1,NROWB)) THEN INFO = 10 ELSE IF (LDC.LT.MAX(1,M)) THEN INFO = 13 END IF IF (INFO.NE.0) THEN CALL XERBLA('ZGEMM ',INFO) RETURN END IF * * Quick return if possible. * IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN * * And when alpha.eq.zero. * IF (ALPHA.EQ.ZERO) THEN IF (BETA.EQ.ZERO) THEN DO 20 J = 1,N DO 10 I = 1,M C(I,J) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1,N DO 30 I = 1,M C(I,J) = BETA*C(I,J) 30 CONTINUE 40 CONTINUE END IF RETURN END IF * * Start the operations. * IF (NOTB) THEN IF (NOTA) THEN * * Form C := alpha*A*B + beta*C. * DO 90 J = 1,N IF (BETA.EQ.ZERO) THEN DO 50 I = 1,M C(I,J) = ZERO 50 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 60 I = 1,M C(I,J) = BETA*C(I,J) 60 CONTINUE END IF DO 80 L = 1,K IF (B(L,J).NE.ZERO) THEN TEMP = ALPHA*B(L,J) DO 70 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 70 CONTINUE END IF 80 CONTINUE 90 CONTINUE ELSE IF (CONJA) THEN * * Form C := alpha*conjg( A' )*B + beta*C. * DO 120 J = 1,N DO 110 I = 1,M TEMP = ZERO DO 100 L = 1,K TEMP = TEMP + DCONJG(A(L,I))*B(L,J) 100 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 110 CONTINUE 120 CONTINUE ELSE * * Form C := alpha*A'*B + beta*C * DO 150 J = 1,N DO 140 I = 1,M TEMP = ZERO DO 130 L = 1,K TEMP = TEMP + A(L,I)*B(L,J) 130 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 140 CONTINUE 150 CONTINUE END IF ELSE IF (NOTA) THEN IF (CONJB) THEN * * Form C := alpha*A*conjg( B' ) + beta*C. * DO 200 J = 1,N IF (BETA.EQ.ZERO) THEN DO 160 I = 1,M C(I,J) = ZERO 160 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 170 I = 1,M C(I,J) = BETA*C(I,J) 170 CONTINUE END IF DO 190 L = 1,K IF (B(J,L).NE.ZERO) THEN TEMP = ALPHA*DCONJG(B(J,L)) DO 180 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 180 CONTINUE END IF 190 CONTINUE 200 CONTINUE ELSE * * Form C := alpha*A*B' + beta*C * DO 250 J = 1,N IF (BETA.EQ.ZERO) THEN DO 210 I = 1,M C(I,J) = ZERO 210 CONTINUE ELSE IF (BETA.NE.ONE) THEN DO 220 I = 1,M C(I,J) = BETA*C(I,J) 220 CONTINUE END IF DO 240 L = 1,K IF (B(J,L).NE.ZERO) THEN TEMP = ALPHA*B(J,L) DO 230 I = 1,M C(I,J) = C(I,J) + TEMP*A(I,L) 230 CONTINUE END IF 240 CONTINUE 250 CONTINUE END IF ELSE IF (CONJA) THEN IF (CONJB) THEN * * Form C := alpha*conjg( A' )*conjg( B' ) + beta*C. 
* DO 280 J = 1,N DO 270 I = 1,M TEMP = ZERO DO 260 L = 1,K TEMP = TEMP + DCONJG(A(L,I))*DCONJG(B(J,L)) 260 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 270 CONTINUE 280 CONTINUE ELSE * * Form C := alpha*conjg( A' )*B' + beta*C * DO 310 J = 1,N DO 300 I = 1,M TEMP = ZERO DO 290 L = 1,K TEMP = TEMP + DCONJG(A(L,I))*B(J,L) 290 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 300 CONTINUE 310 CONTINUE END IF ELSE IF (CONJB) THEN * * Form C := alpha*A'*conjg( B' ) + beta*C * DO 340 J = 1,N DO 330 I = 1,M TEMP = ZERO DO 320 L = 1,K TEMP = TEMP + A(L,I)*DCONJG(B(J,L)) 320 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 330 CONTINUE 340 CONTINUE ELSE * * Form C := alpha*A'*B' + beta*C * DO 370 J = 1,N DO 360 I = 1,M TEMP = ZERO DO 350 L = 1,K TEMP = TEMP + A(L,I)*B(J,L) 350 CONTINUE IF (BETA.EQ.ZERO) THEN C(I,J) = ALPHA*TEMP ELSE C(I,J) = ALPHA*TEMP + BETA*C(I,J) END IF 360 CONTINUE 370 CONTINUE END IF END IF * RETURN * * End of ZGEMM . * END OpenBLAS-0.2.20/reference/zgemvf.f000066400000000000000000000236601313527062700165560ustar00rootroot00000000000000 SUBROUTINE ZGEMVF ( TRANS, M, N, ALPHA, A, LDA, X, INCX, $ BETA, Y, INCY ) * .. Scalar Arguments .. DOUBLE COMPLEX ALPHA, BETA INTEGER INCX, INCY, LDA, M, N CHARACTER*1 TRANS * .. Array Arguments .. DOUBLE COMPLEX A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * CGEMV performs one of the matrix-vector operations * * y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, or * * y := alpha*conjg( A' )*x + beta*y, * * where alpha and beta are scalars, x and y are vectors and A is an * m by n matrix. * * Parameters * ========== * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' y := alpha*A*x + beta*y. * * TRANS = 'T' or 't' y := alpha*A'*x + beta*y. * * TRANS = 'C' or 'c' y := alpha*conjg( A' )*x + beta*y. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix A. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX array of DIMENSION ( LDA, n ). * Before entry, the leading m by n part of the array A must * contain the matrix of coefficients. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, m ). * Unchanged on exit. * * X - COMPLEX array of DIMENSION at least * ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. * Before entry, the incremented array X must contain the * vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - COMPLEX . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y - COMPLEX array of DIMENSION at least * ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' * and at least * ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. * Before entry with BETA non-zero, the incremented array Y * must contain the vector y. On exit, Y is overwritten by the * updated vector y. 
* * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. DOUBLE COMPLEX ONE PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) DOUBLE COMPLEX ZERO PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. Local Scalars .. DOUBLE COMPLEX TEMP INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY, LENX, LENY LOGICAL NOCONJ, NOTRANS, XCONJ * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC DCONJG, MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'R' ).AND. $ .NOT.LSAME( TRANS, 'C' ).AND. $ .NOT.LSAME( TRANS, 'O' ).AND. $ .NOT.LSAME( TRANS, 'U' ).AND. $ .NOT.LSAME( TRANS, 'S' ).AND. $ .NOT.LSAME( TRANS, 'D' ) )THEN INFO = 1 ELSE IF( M.LT.0 )THEN INFO = 2 ELSE IF( N.LT.0 )THEN INFO = 3 ELSE IF( LDA.LT.MAX( 1, M ) )THEN INFO = 6 ELSE IF( INCX.EQ.0 )THEN INFO = 8 ELSE IF( INCY.EQ.0 )THEN INFO = 11 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'CGEMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * NOCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'U' )) NOTRANS = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ) $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'S' )) XCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) $ .OR. LSAME( TRANS, 'R' ) .OR. LSAME( TRANS, 'C' )) * * Set LENX and LENY, the lengths of the vectors x and y, and set * up the start points in X and Y. * IF(NOTRANS)THEN LENX = N LENY = M ELSE LENX = M LENY = N END IF IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( LENX - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( LENY - 1 )*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * * First form y := beta*y. * IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, LENY Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, LENY Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, LENY Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, LENY Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN IF(NOTRANS)THEN * * Form y := alpha*A*x + y. * JX = KX IF( INCY.EQ.1 )THEN DO 60, J = 1, N IF( X( JX ).NE.ZERO )THEN IF (XCONJ) THEN TEMP = ALPHA*X( JX ) ELSE TEMP = ALPHA*DCONJG(X( JX )) ENDIF IF (NOCONJ) THEN DO 50, I = 1, M Y( I ) = Y( I ) + TEMP*A( I, J ) 50 CONTINUE ELSE DO 55, I = 1, M Y( I ) = Y( I ) + TEMP*DCONJG(A( I, J )) 55 CONTINUE ENDIF END IF JX = JX + INCX 60 CONTINUE ELSE DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN IF (XCONJ) THEN TEMP = ALPHA*X( JX ) ELSE TEMP = ALPHA*DCONJG(X( JX )) ENDIF IY = KY IF (NOCONJ) THEN DO 70, I = 1, M Y( IY ) = Y( IY ) + TEMP*A( I, J ) IY = IY + INCY 70 CONTINUE ELSE DO 75, I = 1, M Y( IY ) = Y( IY ) + TEMP* DCONJG(A( I, J )) IY = IY + INCY 75 CONTINUE ENDIF END IF JX = JX + INCX 80 CONTINUE END IF ELSE * * Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. 
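*
*        Explanatory note (a reader's sketch, not part of the original
*        netlib text): in this (conjugate-)transposed branch each
*        Y( JY ) receives ALPHA times a dot product of column J of A
*        with the vector x, each factor conjugated or not according to
*        TRANS, so TEMP accumulates that dot product before the single
*        update of Y( JY ).
*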
* JY = KY IF( INCX.EQ.1 )THEN DO 110, J = 1, N TEMP = ZERO IF( NOCONJ )THEN DO 90, I = 1, M IF (XCONJ) THEN TEMP = TEMP + A( I, J )*X( I ) ELSE TEMP = TEMP + A( I, J )*DCONJG(X( I )) ENDIF 90 CONTINUE ELSE DO 100, I = 1, M IF (XCONJ) THEN TEMP = TEMP + DCONJG( A( I, J ) )*X( I ) ELSE TEMP = TEMP + DCONJG( A( I, J ) )*DCONJG(X( I )) ENDIF 100 CONTINUE END IF Y( JY ) = Y( JY ) + ALPHA*TEMP JY = JY + INCY 110 CONTINUE ELSE DO 140, J = 1, N TEMP = ZERO IX = KX IF( NOCONJ )THEN DO 120, I = 1, M IF (XCONJ) THEN TEMP = TEMP + A( I, J )*X( IX ) ELSE TEMP = TEMP + A( I, J )*DCONJG(X( IX )) ENDIF IX = IX + INCX 120 CONTINUE ELSE DO 130, I = 1, M IF (XCONJ) THEN TEMP = TEMP + DCONJG( A( I, J ) )*X( IX ) ELSE TEMP = TEMP + DCONJG( A( I, J ) )*DCONJG(X( IX )) ENDIF IX = IX + INCX 130 CONTINUE END IF Y( JY ) = Y( JY ) + ALPHA*TEMP JY = JY + INCY 140 CONTINUE END IF END IF * RETURN * * End of CGEMV . * END OpenBLAS-0.2.20/reference/zgercf.f000066400000000000000000000105001313527062700165250ustar00rootroot00000000000000 SUBROUTINE ZGERCF ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA ) * .. Scalar Arguments .. COMPLEX*16 ALPHA INTEGER INCX, INCY, LDA, M, N * .. Array Arguments .. COMPLEX*16 A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * ZGERC performs the rank 1 operation * * A := alpha*x*conjg( y' ) + A, * * where alpha is a scalar, x is an m element vector, y is an n element * vector and A is an m by n matrix. * * Parameters * ========== * * M - INTEGER. * On entry, M specifies the number of rows of the matrix A. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX*16 . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - COMPLEX*16 array of dimension at least * ( 1 + ( m - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the m * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. * Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, n ). * Before entry, the leading m by n part of the array A must * contain the matrix of coefficients. On exit, A is * overwritten by the updated matrix. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, m ). * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. Local Scalars .. COMPLEX*16 TEMP INTEGER I, INFO, IX, J, JY, KX * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC DCONJG, MAX * .. * .. Executable Statements .. * * Test the input parameters. 
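*
*     Explanatory note (a reader's sketch, not part of the original
*     netlib text): the checks below store in INFO the position of the
*     first illegal argument; if any check fails, XERBLA is called with
*     that position and the routine returns with A unchanged.
*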
* INFO = 0 IF ( M.LT.0 )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( INCY.EQ.0 )THEN INFO = 7 ELSE IF( LDA.LT.MAX( 1, M ) )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZGERC ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * IF( INCY.GT.0 )THEN JY = 1 ELSE JY = 1 - ( N - 1 )*INCY END IF IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( Y( JY ).NE.ZERO )THEN TEMP = ALPHA*DCONJG( Y( JY ) ) DO 10, I = 1, M A( I, J ) = A( I, J ) + X( I )*TEMP 10 CONTINUE END IF JY = JY + INCY 20 CONTINUE ELSE IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( M - 1 )*INCX END IF DO 40, J = 1, N IF( Y( JY ).NE.ZERO )THEN TEMP = ALPHA*DCONJG( Y( JY ) ) IX = KX DO 30, I = 1, M A( I, J ) = A( I, J ) + X( IX )*TEMP IX = IX + INCX 30 CONTINUE END IF JY = JY + INCY 40 CONTINUE END IF * RETURN * * End of ZGERC . * END OpenBLAS-0.2.20/reference/zgeruf.f000066400000000000000000000104331313527062700165540ustar00rootroot00000000000000 SUBROUTINE ZGERUF ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA ) * .. Scalar Arguments .. COMPLEX*16 ALPHA INTEGER INCX, INCY, LDA, M, N * .. Array Arguments .. COMPLEX*16 A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * ZGERU performs the rank 1 operation * * A := alpha*x*y' + A, * * where alpha is a scalar, x is an m element vector, y is an n element * vector and A is an m by n matrix. * * Parameters * ========== * * M - INTEGER. * On entry, M specifies the number of rows of the matrix A. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX*16 . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - COMPLEX*16 array of dimension at least * ( 1 + ( m - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the m * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. * Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, n ). * Before entry, the leading m by n part of the array A must * contain the matrix of coefficients. On exit, A is * overwritten by the updated matrix. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, m ). * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. Local Scalars .. COMPLEX*16 TEMP INTEGER I, INFO, IX, J, JY, KX * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. 
* INFO = 0 IF ( M.LT.0 )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( INCY.EQ.0 )THEN INFO = 7 ELSE IF( LDA.LT.MAX( 1, M ) )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZGERU ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * IF( INCY.GT.0 )THEN JY = 1 ELSE JY = 1 - ( N - 1 )*INCY END IF IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( Y( JY ).NE.ZERO )THEN TEMP = ALPHA*Y( JY ) DO 10, I = 1, M A( I, J ) = A( I, J ) + X( I )*TEMP 10 CONTINUE END IF JY = JY + INCY 20 CONTINUE ELSE IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( M - 1 )*INCX END IF DO 40, J = 1, N IF( Y( JY ).NE.ZERO )THEN TEMP = ALPHA*Y( JY ) IX = KX DO 30, I = 1, M A( I, J ) = A( I, J ) + X( IX )*TEMP IX = IX + INCX 30 CONTINUE END IF JY = JY + INCY 40 CONTINUE END IF * RETURN * * End of ZGERU . * END OpenBLAS-0.2.20/reference/zgesvf.f000066400000000000000000000064001313527062700165550ustar00rootroot00000000000000 SUBROUTINE ZGESVF( N, NRHS, A, LDA, IPIV, B, LDB, INFO ) * * -- LAPACK driver routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. INTEGER INFO, LDA, LDB, N, NRHS * .. * .. Array Arguments .. INTEGER IPIV( * ) COMPLEX*16 A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * ZGESV computes the solution to a complex system of linear equations * A * X = B, * where A is an N-by-N matrix and X and B are N-by-NRHS matrices. * * The LU decomposition with partial pivoting and row interchanges is * used to factor A as * A = P * L * U, * where P is a permutation matrix, L is unit lower triangular, and U is * upper triangular. The factored form of A is then used to solve the * system of equations A * X = B. * * Arguments * ========= * * N (input) INTEGER * The number of linear equations, i.e., the order of the * matrix A. N >= 0. * * NRHS (input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrix B. NRHS >= 0. * * A (input/output) COMPLEX*16 array, dimension (LDA,N) * On entry, the N-by-N coefficient matrix A. * On exit, the factors L and U from the factorization * A = P*L*U; the unit diagonal elements of L are not stored. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * IPIV (output) INTEGER array, dimension (N) * The pivot indices that define the permutation matrix P; * row i of the matrix was interchanged with row IPIV(i). * * B (input/output) COMPLEX*16 array, dimension (LDB,NRHS) * On entry, the N-by-NRHS matrix of right hand side matrix B. * On exit, if INFO = 0, the N-by-NRHS solution matrix X. * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, U(i,i) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, so the solution could not be computed. * * ===================================================================== * * .. External Subroutines .. EXTERNAL XERBLA, ZGETRF, ZGETRS * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. 
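*
*     Explanatory note (a reader's sketch, not part of the original
*     netlib text): after the argument checks below, this driver calls
*     ZGETRF to factor A = P*L*U and, when that factorization reports
*     INFO = 0, calls ZGETRS with TRANS = 'N' so that B is overwritten
*     with the solution X of A*X = B.
*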
* INFO = 0 IF( N.LT.0 ) THEN INFO = -1 ELSE IF( NRHS.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZGESV ', -INFO ) RETURN END IF * * Compute the LU factorization of A. * CALL ZGETRF( N, N, A, LDA, IPIV, INFO ) IF( INFO.EQ.0 ) THEN * * Solve the system A*X = B, overwriting B with X. * CALL ZGETRS( 'No transpose', N, NRHS, A, LDA, IPIV, B, LDB, $ INFO ) END IF RETURN * * End of ZGESV * END OpenBLAS-0.2.20/reference/zgetf2f.f000066400000000000000000000074261313527062700166310ustar00rootroot00000000000000 SUBROUTINE ZGETF2F( M, N, A, LDA, IPIV, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N * .. * .. Array Arguments .. INTEGER IPIV( * ) COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * ZGETF2 computes an LU factorization of a general m-by-n matrix A * using partial pivoting with row interchanges. * * The factorization has the form * A = P * L * U * where P is a permutation matrix, L is lower triangular with unit * diagonal elements (lower trapezoidal if m > n), and U is upper * triangular (upper trapezoidal if m < n). * * This is the right-looking Level 2 BLAS version of the algorithm. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (input/output) COMPLEX*16 array, dimension (LDA,N) * On entry, the m by n matrix to be factored. * On exit, the factors L and U from the factorization * A = P*L*U; the unit diagonal elements of L are not stored. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,M). * * IPIV (output) INTEGER array, dimension (min(M,N)) * The pivot indices; for 1 <= i <= min(M,N), row i of the * matrix was interchanged with row IPIV(i). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * > 0: if INFO = k, U(k,k) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER J, JP * .. * .. External Functions .. INTEGER IZAMAX EXTERNAL IZAMAX * .. * .. External Subroutines .. EXTERNAL XERBLA, ZGERU, ZSCAL, ZSWAP * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZGETF2', -INFO ) RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * DO 10 J = 1, MIN( M, N ) * * Find pivot and test for singularity. * JP = J - 1 + IZAMAX( M-J+1, A( J, J ), 1 ) IPIV( J ) = JP IF( A( JP, J ).NE.ZERO ) THEN * * Apply the interchange to columns 1:N. * IF( JP.NE.J ) $ CALL ZSWAP( N, A( J, 1 ), LDA, A( JP, 1 ), LDA ) * * Compute elements J+1:M of J-th column. 
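*
*           Explanatory note (a reader's sketch, not part of the
*           original netlib text): the ZSCAL below multiplies the
*           entries under the pivot by ONE / A( J, J ), turning
*           column J below the diagonal into the multipliers that
*           become column J of the unit lower triangular factor L.
*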
* IF( J.LT.M ) $ CALL ZSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) * ELSE IF( INFO.EQ.0 ) THEN * INFO = J END IF * IF( J.LT.MIN( M, N ) ) THEN * * Update trailing submatrix. * CALL ZGERU( M-J, N-J, -ONE, A( J+1, J ), 1, A( J, J+1 ), $ LDA, A( J+1, J+1 ), LDA ) END IF 10 CONTINUE RETURN * * End of ZGETF2 * END OpenBLAS-0.2.20/reference/zgetrff.f000066400000000000000000000107731313527062700167300ustar00rootroot00000000000000 SUBROUTINE ZGETRFF( M, N, A, LDA, IPIV, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N * .. * .. Array Arguments .. INTEGER IPIV( * ) COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * ZGETRF computes an LU factorization of a general M-by-N matrix A * using partial pivoting with row interchanges. * * The factorization has the form * A = P * L * U * where P is a permutation matrix, L is lower triangular with unit * diagonal elements (lower trapezoidal if m > n), and U is upper * triangular (upper trapezoidal if m < n). * * This is the right-looking Level 3 BLAS version of the algorithm. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (input/output) COMPLEX*16 array, dimension (LDA,N) * On entry, the M-by-N matrix to be factored. * On exit, the factors L and U from the factorization * A = P*L*U; the unit diagonal elements of L are not stored. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,M). * * IPIV (output) INTEGER array, dimension (min(M,N)) * The pivot indices; for 1 <= i <= min(M,N), row i of the * matrix was interchanged with row IPIV(i). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, U(i,i) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, IINFO, J, JB, NB * .. * .. External Subroutines .. EXTERNAL XERBLA, ZGEMM, ZGETF2, ZLASWP, ZTRSM * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZGETRF', -INFO ) RETURN END IF * * Quick return if possible * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN * * Determine the block size for this environment. * NB = 64 IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN * * Use unblocked code. * CALL ZGETF2( M, N, A, LDA, IPIV, INFO ) ELSE * * Use blocked code. * DO 20 J = 1, MIN( M, N ), NB JB = MIN( MIN( M, N )-J+1, NB ) * * Factor diagonal and subdiagonal blocks and test for exact * singularity. * CALL ZGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) * * Adjust INFO and the pivot indices. * IF( INFO.EQ.0 .AND. IINFO.GT.0 ) $ INFO = IINFO + J - 1 DO 10 I = J, MIN( M, J+JB-1 ) IPIV( I ) = J - 1 + IPIV( I ) 10 CONTINUE * * Apply interchanges to columns 1:J-1. * CALL ZLASWP( J-1, A, LDA, J, J+JB-1, IPIV, 1 ) * IF( J+JB.LE.N ) THEN * * Apply interchanges to columns J+JB:N. 
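*
*           Explanatory note (a reader's sketch, not part of the
*           original netlib text): this is the right-looking blocked
*           update. The row interchanges from the current panel are
*           applied to the trailing columns, ZTRSM then solves the
*           triangular system for the block row of U, and ZGEMM
*           subtracts the product of the panel of L with that block
*           row from the trailing submatrix.
*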
* CALL ZLASWP( N-J-JB+1, A( 1, J+JB ), LDA, J, J+JB-1, $ IPIV, 1 ) * * Compute block row of U. * CALL ZTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, $ N-J-JB+1, ONE, A( J, J ), LDA, A( J, J+JB ), $ LDA ) IF( J+JB.LE.M ) THEN * * Update trailing submatrix. * CALL ZGEMM( 'No transpose', 'No transpose', M-J-JB+1, $ N-J-JB+1, JB, -ONE, A( J+JB, J ), LDA, $ A( J, J+JB ), LDA, ONE, A( J+JB, J+JB ), $ LDA ) END IF END IF 20 CONTINUE END IF RETURN * * End of ZGETRF * END OpenBLAS-0.2.20/reference/zgetrsf.f000066400000000000000000000102221313527062700167320ustar00rootroot00000000000000 SUBROUTINE ZGETRSF( TRANS, N, NRHS, A, LDA, IPIV, B, LDB, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. CHARACTER TRANS INTEGER INFO, LDA, LDB, N, NRHS * .. * .. Array Arguments .. INTEGER IPIV( * ) COMPLEX*16 A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * ZGETRS solves a system of linear equations * A * X = B, A**T * X = B, or A**H * X = B * with a general N-by-N matrix A using the LU factorization computed * by ZGETRF. * * Arguments * ========= * * TRANS (input) CHARACTER*1 * Specifies the form of the system of equations: * = 'N': A * X = B (No transpose) * = 'T': A**T * X = B (Transpose) * = 'C': A**H * X = B (Conjugate transpose) * * N (input) INTEGER * The order of the matrix A. N >= 0. * * NRHS (input) INTEGER * The number of right hand sides, i.e., the number of columns * of the matrix B. NRHS >= 0. * * A (input) COMPLEX*16 array, dimension (LDA,N) * The factors L and U from the factorization A = P*L*U * as computed by ZGETRF. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * IPIV (input) INTEGER array, dimension (N) * The pivot indices from ZGETRF; for 1<=i<=N, row i of the * matrix was interchanged with row IPIV(i). * * B (input/output) COMPLEX*16 array, dimension (LDB,NRHS) * On entry, the right hand side matrix B. * On exit, the solution matrix X. * * LDB (input) INTEGER * The leading dimension of the array B. LDB >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL NOTRAN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA, ZLASWP, ZTRSM * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 NOTRAN = LSAME( TRANS, 'N' ) .OR. LSAME(TRANS, 'R') IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. $ LSAME( TRANS, 'C' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( NRHS.LT.0 ) THEN INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZGETRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 .OR. NRHS.EQ.0 ) $ RETURN * IF( NOTRAN ) THEN * * Solve A * X = B. * * Apply row interchanges to the right hand sides. * CALL ZLASWP( NRHS, B, LDB, 1, N, IPIV, 1 ) * * Solve L*X = B, overwriting B with X. * CALL ZTRSM( 'Left', 'Lower', TRANS, 'Unit', N, NRHS, $ ONE, A, LDA, B, LDB ) * * Solve U*X = B, overwriting B with X. 
* CALL ZTRSM( 'Left', 'Upper', TRANS, 'Non-unit', N, $ NRHS, ONE, A, LDA, B, LDB ) ELSE * * Solve A**T * X = B or A**H * X = B. * * Solve U'*X = B, overwriting B with X. * CALL ZTRSM( 'Left', 'Upper', TRANS, 'Non-unit', N, NRHS, ONE, $ A, LDA, B, LDB ) * * Solve L'*X = B, overwriting B with X. * CALL ZTRSM( 'Left', 'Lower', TRANS, 'Unit', N, NRHS, ONE, A, $ LDA, B, LDB ) * * Apply row interchanges to the solution vectors. * CALL ZLASWP( NRHS, B, LDB, 1, N, IPIV, -1 ) END IF * RETURN * * End of ZGETRS * END OpenBLAS-0.2.20/reference/zhbmvf.f000066400000000000000000000316151313527062700165530ustar00rootroot00000000000000 SUBROUTINE ZHBMVF( UPLO, N, K, ALPHA, A, LDA, X, INCX, $ BETA, Y, INCY ) * .. Scalar Arguments .. COMPLEX*16 ALPHA, BETA INTEGER INCX, INCY, K, LDA, N CHARACTER*1 UPLO * .. Array Arguments .. COMPLEX*16 A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * ZHBMV performs the matrix-vector operation * * y := alpha*A*x + beta*y, * * where alpha and beta are scalars, x and y are n element vectors and * A is an n by n hermitian band matrix, with k super-diagonals. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the band matrix A is being supplied as * follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * being supplied. * * UPLO = 'L' or 'l' The lower triangular part of A is * being supplied. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * K - INTEGER. * On entry, K specifies the number of super-diagonals of the * matrix A. K must satisfy 0 .le. K. * Unchanged on exit. * * ALPHA - COMPLEX*16 . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) * by n part of the array A must contain the upper triangular * band part of the hermitian matrix, supplied column by * column, with the leading diagonal of the matrix in row * ( k + 1 ) of the array, the first super-diagonal starting at * position 2 in row k, and so on. The top left k by k triangle * of the array A is not referenced. * The following program segment will transfer the upper * triangular part of a hermitian band matrix from conventional * full matrix storage to band storage: * * DO 20, J = 1, N * M = K + 1 - J * DO 10, I = MAX( 1, J - K ), J * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) * by n part of the array A must contain the lower triangular * band part of the hermitian matrix, supplied column by * column, with the leading diagonal of the matrix in row 1 of * the array, the first sub-diagonal starting at position 1 in * row 2, and so on. The bottom right k by k triangle of the * array A is not referenced. * The following program segment will transfer the lower * triangular part of a hermitian band matrix from conventional * full matrix storage to band storage: * * DO 20, J = 1, N * M = 1 - J * DO 10, I = J, MIN( N, J + K ) * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Note that the imaginary parts of the diagonal elements need * not be set and are assumed to be zero. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * ( k + 1 ). * Unchanged on exit. 
* * X - COMPLEX*16 array of DIMENSION at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the * vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - COMPLEX*16 . * On entry, BETA specifies the scalar beta. * Unchanged on exit. * * Y - COMPLEX*16 array of DIMENSION at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the * vector y. On exit, Y is overwritten by the updated vector y. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. Local Scalars .. COMPLEX*16 TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC DCONJG, MAX, MIN, DBLE * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ).AND. $ .NOT.LSAME( UPLO, 'V' ).AND. $ .NOT.LSAME( UPLO, 'M' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( K.LT.0 )THEN INFO = 3 ELSE IF( LDA.LT.( K + 1 ) )THEN INFO = 6 ELSE IF( INCX.EQ.0 )THEN INFO = 8 ELSE IF( INCY.EQ.0 )THEN INFO = 11 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZHBMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF * * Start the operations. In this version the elements of the array A * are accessed sequentially with one pass through A. * * First form y := beta*y. * IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, N Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, N Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN * * Form y when upper triangle of A is stored. * IF( LSAME( UPLO, 'U' ) )THEN KPLUS1 = K + 1 IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO L = KPLUS1 - J DO 50, I = MAX( 1, J - K ), J - 1 Y( I ) = Y( I ) + TEMP1*A( L + I, J ) TEMP2 = TEMP2 + DCONJG( A( L + I, J ) )*X( I ) 50 CONTINUE Y( J ) = Y( J ) + TEMP1*DBLE( A( KPLUS1, J ) ) $ + ALPHA*TEMP2 60 CONTINUE ELSE JX = KX JY = KY DO 80, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY L = KPLUS1 - J DO 70, I = MAX( 1, J - K ), J - 1 Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) TEMP2 = TEMP2 + DCONJG( A( L + I, J ) )*X( IX ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1*DBLE( A( KPLUS1, J ) ) $ + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY IF( J.GT.K )THEN KX = KX + INCX KY = KY + INCY END IF 80 CONTINUE END IF RETURN ENDIF * * Form y when lower triangle of A is stored. 
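*
*     Explanatory note (a reader's sketch, not part of the original
*     netlib text): with UPLO = 'L' the diagonal of column J is held
*     in A( 1, J ) and the sub-diagonals follow beneath it, so the
*     loop below adds the real diagonal term first, spreads
*     TEMP1 = ALPHA*X( J ) down column J, and gathers in TEMP2 the
*     conjugated column times x for the Hermitian contribution that
*     is added to Y( J ) afterwards.
*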
* IF( LSAME( UPLO, 'L' ) )THEN IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 100, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*DBLE( A( 1, J ) ) L = 1 - J DO 90, I = J + 1, MIN( N, J + K ) Y( I ) = Y( I ) + TEMP1*A( L + I, J ) TEMP2 = TEMP2 + DCONJG( A( L + I, J ) )*X( I ) 90 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 100 CONTINUE ELSE JX = KX JY = KY DO 120, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*DBLE( A( 1, J ) ) L = 1 - J IX = JX IY = JY DO 110, I = J + 1, MIN( N, J + K ) IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) TEMP2 = TEMP2 + DCONJG( A( L + I, J ) )*X( IX ) 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 120 CONTINUE END IF RETURN END IF * * Form y when upper triangle of A is stored. * IF( LSAME( UPLO, 'V' ) )THEN KPLUS1 = K + 1 IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 160, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO L = KPLUS1 - J DO 150, I = MAX( 1, J - K ), J - 1 Y( I ) = Y( I ) + TEMP1*DCONJG(A( L + I, J )) TEMP2 = TEMP2 + A( L + I, J )*X( I ) 150 CONTINUE Y( J ) = Y( J ) + TEMP1*DBLE( A( KPLUS1, J ) ) $ + ALPHA*TEMP2 160 CONTINUE ELSE JX = KX JY = KY DO 180, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY L = KPLUS1 - J DO 170, I = MAX( 1, J - K ), J - 1 Y( IY ) = Y( IY ) + TEMP1*DCONJG(A( L + I, J )) TEMP2 = TEMP2 + A( L + I, J )*X( IX ) IX = IX + INCX IY = IY + INCY 170 CONTINUE Y( JY ) = Y( JY ) + TEMP1*DBLE( A( KPLUS1, J ) ) $ + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY IF( J.GT.K )THEN KX = KX + INCX KY = KY + INCY END IF 180 CONTINUE END IF RETURN ENDIF * * Form y when lower triangle of A is stored. * IF( LSAME( UPLO, 'M' ) )THEN IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 200, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*DBLE( A( 1, J ) ) L = 1 - J DO 190, I = J + 1, MIN( N, J + K ) Y( I ) = Y( I ) + TEMP1*DCONJG(A( L + I, J )) TEMP2 = TEMP2 + A( L + I, J )*X( I ) 190 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 200 CONTINUE ELSE JX = KX JY = KY DO 220, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*DBLE( A( 1, J ) ) L = 1 - J IX = JX IY = JY DO 210, I = J + 1, MIN( N, J + K ) IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*DCONJG(A( L + I, J )) TEMP2 = TEMP2 + A( L + I, J )*X( IX ) 210 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 220 CONTINUE END IF RETURN END IF * RETURN * * End of ZHBMV . * END OpenBLAS-0.2.20/reference/zhemm3mf.f000066400000000000000000000241261313527062700170040ustar00rootroot00000000000000 SUBROUTINE ZHEMM3MF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC ) * .. Scalar Arguments .. CHARACTER*1 SIDE, UPLO INTEGER M, N, LDA, LDB, LDC COMPLEX*16 ALPHA, BETA * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) * .. * * Purpose * ======= * * ZHEMM performs one of the matrix-matrix operations * * C := alpha*A*B + beta*C, * * or * * C := alpha*B*A + beta*C, * * where alpha and beta are scalars, A is an hermitian matrix and B and * C are m by n matrices. * * Parameters * ========== * * SIDE - CHARACTER*1. * On entry, SIDE specifies whether the hermitian matrix A * appears on the left or right in the operation as follows: * * SIDE = 'L' or 'l' C := alpha*A*B + beta*C, * * SIDE = 'R' or 'r' C := alpha*B*A + beta*C, * * Unchanged on exit. * * UPLO - CHARACTER*1. 
* On entry, UPLO specifies whether the upper or lower * triangular part of the hermitian matrix A is to be * referenced as follows: * * UPLO = 'U' or 'u' Only the upper triangular part of the * hermitian matrix is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of the * hermitian matrix is to be referenced. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix C. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix C. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX*16 . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is * m when SIDE = 'L' or 'l' and is n otherwise. * Before entry with SIDE = 'L' or 'l', the m by m part of * the array A must contain the hermitian matrix, such that * when UPLO = 'U' or 'u', the leading m by m upper triangular * part of the array A must contain the upper triangular part * of the hermitian matrix and the strictly lower triangular * part of A is not referenced, and when UPLO = 'L' or 'l', * the leading m by m lower triangular part of the array A * must contain the lower triangular part of the hermitian * matrix and the strictly upper triangular part of A is not * referenced. * Before entry with SIDE = 'R' or 'r', the n by n part of * the array A must contain the hermitian matrix, such that * when UPLO = 'U' or 'u', the leading n by n upper triangular * part of the array A must contain the upper triangular part * of the hermitian matrix and the strictly lower triangular * part of A is not referenced, and when UPLO = 'L' or 'l', * the leading n by n lower triangular part of the array A * must contain the lower triangular part of the hermitian * matrix and the strictly upper triangular part of A is not * referenced. * Note that the imaginary parts of the diagonal elements need * not be set, they are assumed to be zero. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When SIDE = 'L' or 'l' then * LDA must be at least max( 1, m ), otherwise LDA must be at * least max( 1, n ). * Unchanged on exit. * * B - COMPLEX*16 array of DIMENSION ( LDB, n ). * Before entry, the leading m by n part of the array B must * contain the matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. LDB must be at least * max( 1, m ). * Unchanged on exit. * * BETA - COMPLEX*16 . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then C need not be set on input. * Unchanged on exit. * * C - COMPLEX*16 array of DIMENSION ( LDC, n ). * Before entry, the leading m by n part of the array C must * contain the matrix C, except when beta is zero, in which * case C need not be set on entry. * On exit, the array C is overwritten by the m by n updated * matrix. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. 
Intrinsic Functions .. INTRINSIC DCONJG, MAX, DBLE * .. Local Scalars .. LOGICAL UPPER INTEGER I, INFO, J, K, NROWA COMPLEX*16 TEMP1, TEMP2 * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Executable Statements .. * * Set NROWA as the number of rows of A. * IF( LSAME( SIDE, 'L' ) )THEN NROWA = M ELSE NROWA = N END IF UPPER = LSAME( UPLO, 'U' ) * * Test the input parameters. * INFO = 0 IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN INFO = 2 ELSE IF( M .LT.0 )THEN INFO = 3 ELSE IF( N .LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 7 ELSE IF( LDB.LT.MAX( 1, M ) )THEN INFO = 9 ELSE IF( LDC.LT.MAX( 1, M ) )THEN INFO = 12 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZHEMM3M', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN IF( BETA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, M C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40, J = 1, N DO 30, I = 1, M C( I, J ) = BETA*C( I, J ) 30 CONTINUE 40 CONTINUE END IF RETURN END IF * * Start the operations. * IF( LSAME( SIDE, 'L' ) )THEN * * Form C := alpha*A*B + beta*C. * IF( UPPER )THEN DO 70, J = 1, N DO 60, I = 1, M TEMP1 = ALPHA*B( I, J ) TEMP2 = ZERO DO 50, K = 1, I - 1 C( K, J ) = C( K, J ) + TEMP1*A( K, I ) TEMP2 = TEMP2 + $ B( K, J )*DCONJG( A( K, I ) ) 50 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = TEMP1*DBLE( A( I, I ) ) + $ ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ TEMP1*DBLE( A( I, I ) ) + $ ALPHA*TEMP2 END IF 60 CONTINUE 70 CONTINUE ELSE DO 100, J = 1, N DO 90, I = M, 1, -1 TEMP1 = ALPHA*B( I, J ) TEMP2 = ZERO DO 80, K = I + 1, M C( K, J ) = C( K, J ) + TEMP1*A( K, I ) TEMP2 = TEMP2 + $ B( K, J )*DCONJG( A( K, I ) ) 80 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = TEMP1*DBLE( A( I, I ) ) + $ ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ TEMP1*DBLE( A( I, I ) ) + $ ALPHA*TEMP2 END IF 90 CONTINUE 100 CONTINUE END IF ELSE * * Form C := alpha*B*A + beta*C. * DO 170, J = 1, N TEMP1 = ALPHA*DBLE( A( J, J ) ) IF( BETA.EQ.ZERO )THEN DO 110, I = 1, M C( I, J ) = TEMP1*B( I, J ) 110 CONTINUE ELSE DO 120, I = 1, M C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) 120 CONTINUE END IF DO 140, K = 1, J - 1 IF( UPPER )THEN TEMP1 = ALPHA*A( K, J ) ELSE TEMP1 = ALPHA*DCONJG( A( J, K ) ) END IF DO 130, I = 1, M C( I, J ) = C( I, J ) + TEMP1*B( I, K ) 130 CONTINUE 140 CONTINUE DO 160, K = J + 1, N IF( UPPER )THEN TEMP1 = ALPHA*DCONJG( A( J, K ) ) ELSE TEMP1 = ALPHA*A( K, J ) END IF DO 150, I = 1, M C( I, J ) = C( I, J ) + TEMP1*B( I, K ) 150 CONTINUE 160 CONTINUE 170 CONTINUE END IF * RETURN * * End of ZHEMM . * END OpenBLAS-0.2.20/reference/zhemmf.f000066400000000000000000000241241313527062700165420ustar00rootroot00000000000000 SUBROUTINE ZHEMMF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC ) * .. Scalar Arguments .. CHARACTER*1 SIDE, UPLO INTEGER M, N, LDA, LDB, LDC COMPLEX*16 ALPHA, BETA * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) * .. * * Purpose * ======= * * ZHEMM performs one of the matrix-matrix operations * * C := alpha*A*B + beta*C, * * or * * C := alpha*B*A + beta*C, * * where alpha and beta are scalars, A is an hermitian matrix and B and * C are m by n matrices. * * Parameters * ========== * * SIDE - CHARACTER*1. 
* On entry, SIDE specifies whether the hermitian matrix A * appears on the left or right in the operation as follows: * * SIDE = 'L' or 'l' C := alpha*A*B + beta*C, * * SIDE = 'R' or 'r' C := alpha*B*A + beta*C, * * Unchanged on exit. * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the hermitian matrix A is to be * referenced as follows: * * UPLO = 'U' or 'u' Only the upper triangular part of the * hermitian matrix is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of the * hermitian matrix is to be referenced. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix C. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix C. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX*16 . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is * m when SIDE = 'L' or 'l' and is n otherwise. * Before entry with SIDE = 'L' or 'l', the m by m part of * the array A must contain the hermitian matrix, such that * when UPLO = 'U' or 'u', the leading m by m upper triangular * part of the array A must contain the upper triangular part * of the hermitian matrix and the strictly lower triangular * part of A is not referenced, and when UPLO = 'L' or 'l', * the leading m by m lower triangular part of the array A * must contain the lower triangular part of the hermitian * matrix and the strictly upper triangular part of A is not * referenced. * Before entry with SIDE = 'R' or 'r', the n by n part of * the array A must contain the hermitian matrix, such that * when UPLO = 'U' or 'u', the leading n by n upper triangular * part of the array A must contain the upper triangular part * of the hermitian matrix and the strictly lower triangular * part of A is not referenced, and when UPLO = 'L' or 'l', * the leading n by n lower triangular part of the array A * must contain the lower triangular part of the hermitian * matrix and the strictly upper triangular part of A is not * referenced. * Note that the imaginary parts of the diagonal elements need * not be set, they are assumed to be zero. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When SIDE = 'L' or 'l' then * LDA must be at least max( 1, m ), otherwise LDA must be at * least max( 1, n ). * Unchanged on exit. * * B - COMPLEX*16 array of DIMENSION ( LDB, n ). * Before entry, the leading m by n part of the array B must * contain the matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. LDB must be at least * max( 1, m ). * Unchanged on exit. * * BETA - COMPLEX*16 . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then C need not be set on input. * Unchanged on exit. * * C - COMPLEX*16 array of DIMENSION ( LDC, n ). * Before entry, the leading m by n part of the array C must * contain the matrix C, except when beta is zero, in which * case C need not be set on entry. * On exit, the array C is overwritten by the m by n updated * matrix. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. 
* Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC DCONJG, MAX, DBLE * .. Local Scalars .. LOGICAL UPPER INTEGER I, INFO, J, K, NROWA COMPLEX*16 TEMP1, TEMP2 * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Executable Statements .. * * Set NROWA as the number of rows of A. * IF( LSAME( SIDE, 'L' ) )THEN NROWA = M ELSE NROWA = N END IF UPPER = LSAME( UPLO, 'U' ) * * Test the input parameters. * INFO = 0 IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN INFO = 2 ELSE IF( M .LT.0 )THEN INFO = 3 ELSE IF( N .LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 7 ELSE IF( LDB.LT.MAX( 1, M ) )THEN INFO = 9 ELSE IF( LDC.LT.MAX( 1, M ) )THEN INFO = 12 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZHEMM3M', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN IF( BETA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, M C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40, J = 1, N DO 30, I = 1, M C( I, J ) = BETA*C( I, J ) 30 CONTINUE 40 CONTINUE END IF RETURN END IF * * Start the operations. * IF( LSAME( SIDE, 'L' ) )THEN * * Form C := alpha*A*B + beta*C. * IF( UPPER )THEN DO 70, J = 1, N DO 60, I = 1, M TEMP1 = ALPHA*B( I, J ) TEMP2 = ZERO DO 50, K = 1, I - 1 C( K, J ) = C( K, J ) + TEMP1*A( K, I ) TEMP2 = TEMP2 + $ B( K, J )*DCONJG( A( K, I ) ) 50 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = TEMP1*DBLE( A( I, I ) ) + $ ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ TEMP1*DBLE( A( I, I ) ) + $ ALPHA*TEMP2 END IF 60 CONTINUE 70 CONTINUE ELSE DO 100, J = 1, N DO 90, I = M, 1, -1 TEMP1 = ALPHA*B( I, J ) TEMP2 = ZERO DO 80, K = I + 1, M C( K, J ) = C( K, J ) + TEMP1*A( K, I ) TEMP2 = TEMP2 + $ B( K, J )*DCONJG( A( K, I ) ) 80 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = TEMP1*DBLE( A( I, I ) ) + $ ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ TEMP1*DBLE( A( I, I ) ) + $ ALPHA*TEMP2 END IF 90 CONTINUE 100 CONTINUE END IF ELSE * * Form C := alpha*B*A + beta*C. * DO 170, J = 1, N TEMP1 = ALPHA*DBLE( A( J, J ) ) IF( BETA.EQ.ZERO )THEN DO 110, I = 1, M C( I, J ) = TEMP1*B( I, J ) 110 CONTINUE ELSE DO 120, I = 1, M C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) 120 CONTINUE END IF DO 140, K = 1, J - 1 IF( UPPER )THEN TEMP1 = ALPHA*A( K, J ) ELSE TEMP1 = ALPHA*DCONJG( A( J, K ) ) END IF DO 130, I = 1, M C( I, J ) = C( I, J ) + TEMP1*B( I, K ) 130 CONTINUE 140 CONTINUE DO 160, K = J + 1, N IF( UPPER )THEN TEMP1 = ALPHA*DCONJG( A( J, K ) ) ELSE TEMP1 = ALPHA*A( K, J ) END IF DO 150, I = 1, M C( I, J ) = C( I, J ) + TEMP1*B( I, K ) 150 CONTINUE 160 CONTINUE 170 CONTINUE END IF * RETURN * * End of ZHEMM . * END OpenBLAS-0.2.20/reference/zhemvf.f000066400000000000000000000253011313527062700165510ustar00rootroot00000000000000 SUBROUTINE ZHEMVF ( UPLO, N, ALPHA, A, LDA, X, INCX, $ BETA, Y, INCY ) * .. Scalar Arguments .. COMPLEX*16 ALPHA, BETA INTEGER INCX, INCY, LDA, N CHARACTER*1 UPLO * .. Array Arguments .. COMPLEX*16 A( LDA, * ), X( * ), Y( * ) * .. 
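*
*  A calling sketch, for orientation only (the names, sizes and
*  values below are illustrative, not part of the interface, which
*  is documented in full under Purpose and Parameters):
*
*     COMPLEX*16 A( 4, 4 ), X( 4 ), Y( 4 ), ALPHA, BETA
*     ALPHA = ( 1.0D+0, 0.0D+0 )
*     BETA  = ( 0.0D+0, 0.0D+0 )
*     CALL ZHEMVF( 'U', 4, ALPHA, A, 4, X, 1, BETA, Y, 1 )
*
*  computes y := A*x from the upper triangle of the Hermitian
*  matrix held in A, assuming A and X have already been filled in.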
* * Purpose * ======= * * ZHEMV performs the matrix-vector operation * * y := alpha*A*x + beta*y, * * where alpha and beta are scalars, x and y are n element vectors and * A is an n by n hermitian matrix. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array A is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of A * is to be referenced. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX*16 . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular part of the hermitian matrix and the strictly * lower triangular part of A is not referenced. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular part of the hermitian matrix and the strictly * upper triangular part of A is not referenced. * Note that the imaginary parts of the diagonal elements need * not be set and are assumed to be zero. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * X - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - COMPLEX*16 . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. On exit, Y is overwritten by the updated * vector y. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. Local Scalars .. COMPLEX*16 TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC DCONJG, MAX, DBLE * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ).AND. $ .NOT.LSAME( UPLO, 'V' ).AND. $ .NOT.LSAME( UPLO, 'M' ))THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 5 ELSE IF( INCX.EQ.0 )THEN INFO = 7 ELSE IF( INCY.EQ.0 )THEN INFO = 10 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZHEMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. 
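*     (For a negative increment the vector is stored backwards in
*     the array; e.g. with N = 4 and INCX = -2, KX = 1 - 3*(-2) = 7,
*     so x( 1 ) is read from X( 7 ), x( 2 ) from X( 5 ), and x( 4 )
*     from X( 1 ).)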
* IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * * First form y := beta*y. * IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, N Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, N Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN IF( LSAME( UPLO, 'U' ) )THEN * * Form y when A is stored in upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO DO 50, I = 1, J - 1 Y( I ) = Y( I ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + DCONJG( A( I, J ) )*X( I ) 50 CONTINUE Y( J ) = Y( J ) + TEMP1*DBLE( A( J, J ) ) + ALPHA*TEMP2 60 CONTINUE ELSE JX = KX JY = KY DO 80, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY DO 70, I = 1, J - 1 Y( IY ) = Y( IY ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + DCONJG( A( I, J ) )*X( IX ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1*DBLE( A( J, J ) ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 80 CONTINUE END IF RETURN ENDIF IF( LSAME( UPLO, 'L' ) )THEN * * Form y when A is stored in lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 100, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*DBLE( A( J, J ) ) DO 90, I = J + 1, N Y( I ) = Y( I ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + DCONJG( A( I, J ) )*X( I ) 90 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 100 CONTINUE ELSE JX = KX JY = KY DO 120, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*DBLE( A( J, J ) ) IX = JX IY = JY DO 110, I = J + 1, N IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + DCONJG( A( I, J ) )*X( IX ) 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 120 CONTINUE END IF RETURN END IF IF( LSAME( UPLO, 'V' ) )THEN * * Form y when A is stored in upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 160, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO DO 150, I = 1, J - 1 Y( I ) = Y( I ) + TEMP1* DCONJG(A( I, J )) TEMP2 = TEMP2 + A( I, J )*X( I ) 150 CONTINUE Y( J ) = Y( J ) + TEMP1*DBLE( A( J, J ) ) + ALPHA*TEMP2 160 CONTINUE ELSE JX = KX JY = KY DO 180, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY DO 170, I = 1, J - 1 Y( IY ) = Y( IY ) + TEMP1* DCONJG(A( I, J )) TEMP2 = TEMP2 + A( I, J )*X( IX ) IX = IX + INCX IY = IY + INCY 170 CONTINUE Y( JY ) = Y( JY ) + TEMP1*DBLE( A( J, J ) ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 180 CONTINUE END IF RETURN ENDIF IF( LSAME( UPLO, 'M' ) )THEN * * Form y when A is stored in lower triangle. 
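*     (UPLO = 'M' is not part of the standard ZHEMV interface: it
*     mirrors the 'L' case except that the stored strictly lower
*     elements enter the column update through DCONJG( A( I, J ) )
*     and the row update through A( I, J ), i.e. the roles of the
*     element and its conjugate are swapped relative to UPLO = 'L';
*     the diagonal is still taken as DBLE( A( J, J ) ).)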
* IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 200, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*DBLE( A( J, J ) ) DO 190, I = J + 1, N Y( I ) = Y( I ) + TEMP1*DCONJG(A( I, J )) TEMP2 = TEMP2 + A( I, J )*X( I ) 190 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 200 CONTINUE ELSE JX = KX JY = KY DO 220, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*DBLE( A( J, J ) ) IX = JX IY = JY DO 210, I = J + 1, N IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*DCONJG(A( I, J )) TEMP2 = TEMP2 + A( I, J )*X( IX ) 210 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 220 CONTINUE END IF RETURN END IF * RETURN * * End of ZHEMV . * END OpenBLAS-0.2.20/reference/zher2f.f000066400000000000000000000203531313527062700164540ustar00rootroot00000000000000 SUBROUTINE ZHER2F ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA ) * .. Scalar Arguments .. COMPLEX*16 ALPHA INTEGER INCX, INCY, LDA, N CHARACTER*1 UPLO * .. Array Arguments .. COMPLEX*16 A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * ZHER2 performs the hermitian rank 2 operation * * A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A, * * where alpha is a scalar, x and y are n element vectors and A is an n * by n hermitian matrix. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array A is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of A * is to be referenced. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX*16 . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. * Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular part of the hermitian matrix and the strictly * lower triangular part of A is not referenced. On exit, the * upper triangular part of the array A is overwritten by the * upper triangular part of the updated matrix. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular part of the hermitian matrix and the strictly * upper triangular part of A is not referenced. On exit, the * lower triangular part of the array A is overwritten by the * lower triangular part of the updated matrix. * Note that the imaginary parts of the diagonal elements need * not be set, they are assumed to be zero, and on exit they * are set to zero. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * * Level 2 Blas routine. 
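*
*  As an illustration only (the names and sizes below are examples,
*  not part of the interface), a unit-stride update of the upper
*  triangle of a 3 by 3 matrix could be written as
*
*     COMPLEX*16 A( 3, 3 ), X( 3 ), Y( 3 ), ALPHA
*     ALPHA = ( 2.0D+0, 1.0D+0 )
*     CALL ZHER2F( 'U', 3, ALPHA, X, 1, Y, 1, A, 3 )
*
*  which overwrites the upper triangle of A with
*  alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A, assuming
*  A, X and Y have been set beforehand.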
* * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. Local Scalars .. COMPLEX*16 TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC DCONJG, MAX, DBLE * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( INCY.EQ.0 )THEN INFO = 7 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZHER2 ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Set up the start points in X and Y if the increments are not both * unity. * IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF JX = KX JY = KY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * IF( LSAME( UPLO, 'U' ) )THEN * * Form A when A is stored in the upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 20, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*DCONJG( Y( J ) ) TEMP2 = DCONJG( ALPHA*X( J ) ) DO 10, I = 1, J - 1 A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 10 CONTINUE A( J, J ) = DBLE( A( J, J ) ) + $ DBLE( X( J )*TEMP1 + Y( J )*TEMP2 ) ELSE A( J, J ) = DBLE( A( J, J ) ) END IF 20 CONTINUE ELSE DO 40, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*DCONJG( Y( JY ) ) TEMP2 = DCONJG( ALPHA*X( JX ) ) IX = KX IY = KY DO 30, I = 1, J - 1 A( I, J ) = A( I, J ) + X( IX )*TEMP1 $ + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 30 CONTINUE A( J, J ) = DBLE( A( J, J ) ) + $ DBLE( X( JX )*TEMP1 + Y( JY )*TEMP2 ) ELSE A( J, J ) = DBLE( A( J, J ) ) END IF JX = JX + INCX JY = JY + INCY 40 CONTINUE END IF ELSE * * Form A when A is stored in the lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*DCONJG( Y( J ) ) TEMP2 = DCONJG( ALPHA*X( J ) ) A( J, J ) = DBLE( A( J, J ) ) + $ DBLE( X( J )*TEMP1 + Y( J )*TEMP2 ) DO 50, I = J + 1, N A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 50 CONTINUE ELSE A( J, J ) = DBLE( A( J, J ) ) END IF 60 CONTINUE ELSE DO 80, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*DCONJG( Y( JY ) ) TEMP2 = DCONJG( ALPHA*X( JX ) ) A( J, J ) = DBLE( A( J, J ) ) + $ DBLE( X( JX )*TEMP1 + Y( JY )*TEMP2 ) IX = JX IY = JY DO 70, I = J + 1, N IX = IX + INCX IY = IY + INCY A( I, J ) = A( I, J ) + X( IX )*TEMP1 $ + Y( IY )*TEMP2 70 CONTINUE ELSE A( J, J ) = DBLE( A( J, J ) ) END IF JX = JX + INCX JY = JY + INCY 80 CONTINUE END IF END IF * RETURN * * End of ZHER2 . * END OpenBLAS-0.2.20/reference/zher2kf.f000066400000000000000000000317541313527062700166360ustar00rootroot00000000000000 SUBROUTINE ZHER2KF( UPLO, TRANS, N, K, ALPHA, A, LDA, B,LDB, BETA, $ C, LDC ) * .. Scalar Arguments .. CHARACTER TRANS, UPLO INTEGER K, LDA, LDB, LDC, N DOUBLE PRECISION BETA COMPLEX*16 ALPHA * .. * .. Array Arguments .. 
COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) * .. * * Purpose * ======= * * ZHER2K performs one of the hermitian rank 2k operations * * C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) + beta*C, * * or * * C := alpha*conjg( A' )*B + conjg( alpha )*conjg( B' )*A + beta*C, * * where alpha and beta are scalars with beta real, C is an n by n * hermitian matrix and A and B are n by k matrices in the first case * and k by n matrices in the second case. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array C is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of C * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of C * is to be referenced. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' C := alpha*A*conjg( B' ) + * conjg( alpha )*B*conjg( A' ) + * beta*C. * * TRANS = 'C' or 'c' C := alpha*conjg( A' )*B + * conjg( alpha )*conjg( B' )*A + * beta*C. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix C. N must be * at least zero. * Unchanged on exit. * * K - INTEGER. * On entry with TRANS = 'N' or 'n', K specifies the number * of columns of the matrices A and B, and on entry with * TRANS = 'C' or 'c', K specifies the number of rows of the * matrices A and B. K must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX*16 . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is * k when TRANS = 'N' or 'n', and is n otherwise. * Before entry with TRANS = 'N' or 'n', the leading n by k * part of the array A must contain the matrix A, otherwise * the leading k by n part of the array A must contain the * matrix A. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When TRANS = 'N' or 'n' * then LDA must be at least max( 1, n ), otherwise LDA must * be at least max( 1, k ). * Unchanged on exit. * * B - COMPLEX*16 array of DIMENSION ( LDB, kb ), where kb is * k when TRANS = 'N' or 'n', and is n otherwise. * Before entry with TRANS = 'N' or 'n', the leading n by k * part of the array B must contain the matrix B, otherwise * the leading k by n part of the array B must contain the * matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. When TRANS = 'N' or 'n' * then LDB must be at least max( 1, n ), otherwise LDB must * be at least max( 1, k ). * Unchanged on exit. * * BETA - DOUBLE PRECISION . * On entry, BETA specifies the scalar beta. * Unchanged on exit. * * C - COMPLEX*16 array of DIMENSION ( LDC, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array C must contain the upper * triangular part of the hermitian matrix and the strictly * lower triangular part of C is not referenced. On exit, the * upper triangular part of the array C is overwritten by the * upper triangular part of the updated matrix. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array C must contain the lower * triangular part of the hermitian matrix and the strictly * upper triangular part of C is not referenced. 
On exit, the * lower triangular part of the array C is overwritten by the * lower triangular part of the updated matrix. * Note that the imaginary parts of the diagonal elements need * not be set, they are assumed to be zero, and on exit they * are set to zero. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, n ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * -- Modified 8-Nov-93 to set C(J,J) to DBLE( C(J,J) ) when BETA = 1. * Ed Anderson, Cray Research Inc. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCONJG, MAX * .. * .. Local Scalars .. LOGICAL UPPER INTEGER I, INFO, J, L, NROWA COMPLEX*16 TEMP1, TEMP2 * .. * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Executable Statements .. * * Test the input parameters. * IF( LSAME( TRANS, 'N' ) ) THEN NROWA = N ELSE NROWA = K END IF UPPER = LSAME( UPLO, 'U' ) * INFO = 0 IF( ( .NOT.UPPER ) .AND. ( .NOT.LSAME( UPLO, 'L' ) ) ) THEN INFO = 1 ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ) .AND. $ ( .NOT.LSAME( TRANS, 'C' ) ) ) THEN INFO = 2 ELSE IF( N.LT.0 ) THEN INFO = 3 ELSE IF( K.LT.0 ) THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) ) THEN INFO = 7 ELSE IF( LDB.LT.MAX( 1, NROWA ) ) THEN INFO = 9 ELSE IF( LDC.LT.MAX( 1, N ) ) THEN INFO = 12 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHER2K', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ) .OR. ( ( ( ALPHA.EQ.ZERO ) .OR. ( K.EQ.0 ) ) .AND. $ ( BETA.EQ.ONE ) ) )RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO ) THEN IF( UPPER ) THEN IF( BETA.EQ.DBLE( ZERO ) ) THEN DO 20 J = 1, N DO 10 I = 1, J C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1, N DO 30 I = 1, J - 1 C( I, J ) = BETA*C( I, J ) 30 CONTINUE C( J, J ) = BETA*DBLE( C( J, J ) ) 40 CONTINUE END IF ELSE IF( BETA.EQ.DBLE( ZERO ) ) THEN DO 60 J = 1, N DO 50 I = J, N C( I, J ) = ZERO 50 CONTINUE 60 CONTINUE ELSE DO 80 J = 1, N C( J, J ) = BETA*DBLE( C( J, J ) ) DO 70 I = J + 1, N C( I, J ) = BETA*C( I, J ) 70 CONTINUE 80 CONTINUE END IF END IF RETURN END IF * * Start the operations. * IF( LSAME( TRANS, 'N' ) ) THEN * * Form C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) + * C. * IF( UPPER ) THEN DO 130 J = 1, N IF( BETA.EQ.DBLE( ZERO ) ) THEN DO 90 I = 1, J C( I, J ) = ZERO 90 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 100 I = 1, J - 1 C( I, J ) = BETA*C( I, J ) 100 CONTINUE C( J, J ) = BETA*DBLE( C( J, J ) ) ELSE C( J, J ) = DBLE( C( J, J ) ) END IF DO 120 L = 1, K IF( ( A( J, L ).NE.ZERO ) .OR. ( B( J, L ).NE.ZERO ) ) $ THEN TEMP1 = ALPHA*DCONJG( B( J, L ) ) TEMP2 = DCONJG( ALPHA*A( J, L ) ) DO 110 I = 1, J - 1 C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + $ B( I, L )*TEMP2 110 CONTINUE C( J, J ) = DBLE( C( J, J ) ) + $ DBLE( A( J, L )*TEMP1+B( J, L )*TEMP2 ) END IF 120 CONTINUE 130 CONTINUE ELSE DO 180 J = 1, N IF( BETA.EQ.DBLE( ZERO ) ) THEN DO 140 I = J, N C( I, J ) = ZERO 140 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 150 I = J + 1, N C( I, J ) = BETA*C( I, J ) 150 CONTINUE C( J, J ) = BETA*DBLE( C( J, J ) ) ELSE C( J, J ) = DBLE( C( J, J ) ) END IF DO 170 L = 1, K IF( ( A( J, L ).NE.ZERO ) .OR. 
( B( J, L ).NE.ZERO ) ) $ THEN TEMP1 = ALPHA*DCONJG( B( J, L ) ) TEMP2 = DCONJG( ALPHA*A( J, L ) ) DO 160 I = J + 1, N C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + $ B( I, L )*TEMP2 160 CONTINUE C( J, J ) = DBLE( C( J, J ) ) + $ DBLE( A( J, L )*TEMP1+B( J, L )*TEMP2 ) END IF 170 CONTINUE 180 CONTINUE END IF ELSE * * Form C := alpha*conjg( A' )*B + conjg( alpha )*conjg( B' )*A + * C. * IF( UPPER ) THEN DO 210 J = 1, N DO 200 I = 1, J TEMP1 = ZERO TEMP2 = ZERO DO 190 L = 1, K TEMP1 = TEMP1 + DCONJG( A( L, I ) )*B( L, J ) TEMP2 = TEMP2 + DCONJG( B( L, I ) )*A( L, J ) 190 CONTINUE IF( I.EQ.J ) THEN IF( BETA.EQ.DBLE( ZERO ) ) THEN C( J, J ) = DBLE( ALPHA*TEMP1+DCONJG( ALPHA )* $ TEMP2 ) ELSE C( J, J ) = BETA*DBLE( C( J, J ) ) + $ DBLE( ALPHA*TEMP1+DCONJG( ALPHA )* $ TEMP2 ) END IF ELSE IF( BETA.EQ.DBLE( ZERO ) ) THEN C( I, J ) = ALPHA*TEMP1 + DCONJG( ALPHA )*TEMP2 ELSE C( I, J ) = BETA*C( I, J ) + ALPHA*TEMP1 + $ DCONJG( ALPHA )*TEMP2 END IF END IF 200 CONTINUE 210 CONTINUE ELSE DO 240 J = 1, N DO 230 I = J, N TEMP1 = ZERO TEMP2 = ZERO DO 220 L = 1, K TEMP1 = TEMP1 + DCONJG( A( L, I ) )*B( L, J ) TEMP2 = TEMP2 + DCONJG( B( L, I ) )*A( L, J ) 220 CONTINUE IF( I.EQ.J ) THEN IF( BETA.EQ.DBLE( ZERO ) ) THEN C( J, J ) = DBLE( ALPHA*TEMP1+DCONJG( ALPHA )* $ TEMP2 ) ELSE C( J, J ) = BETA*DBLE( C( J, J ) ) + $ DBLE( ALPHA*TEMP1+DCONJG( ALPHA )* $ TEMP2 ) END IF ELSE IF( BETA.EQ.DBLE( ZERO ) ) THEN C( I, J ) = ALPHA*TEMP1 + DCONJG( ALPHA )*TEMP2 ELSE C( I, J ) = BETA*C( I, J ) + ALPHA*TEMP1 + $ DCONJG( ALPHA )*TEMP2 END IF END IF 230 CONTINUE 240 CONTINUE END IF END IF * RETURN * * End of ZHER2K. * END OpenBLAS-0.2.20/reference/zherf.f000066400000000000000000000152241313527062700163730ustar00rootroot00000000000000 SUBROUTINE ZHERF ( UPLO, N, ALPHA, X, INCX, A, LDA ) * .. Scalar Arguments .. DOUBLE PRECISION ALPHA INTEGER INCX, LDA, N CHARACTER*1 UPLO * .. Array Arguments .. COMPLEX*16 A( LDA, * ), X( * ) * .. * * Purpose * ======= * * ZHER performs the hermitian rank 1 operation * * A := alpha*x*conjg( x' ) + A, * * where alpha is a real scalar, x is an n element vector and A is an * n by n hermitian matrix. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array A is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of A * is to be referenced. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION. * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular part of the hermitian matrix and the strictly * lower triangular part of A is not referenced. On exit, the * upper triangular part of the array A is overwritten by the * upper triangular part of the updated matrix. 
* Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular part of the hermitian matrix and the strictly * upper triangular part of A is not referenced. On exit, the * lower triangular part of the array A is overwritten by the * lower triangular part of the updated matrix. * Note that the imaginary parts of the diagonal elements need * not be set, they are assumed to be zero, and on exit they * are set to zero. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. Local Scalars .. COMPLEX*16 TEMP INTEGER I, INFO, IX, J, JX, KX * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC DCONJG, MAX, DBLE * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 7 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZHER ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ALPHA.EQ.DBLE( ZERO ) ) ) $ RETURN * * Set the start point in X if the increment is not unity. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * IF( LSAME( UPLO, 'U' ) )THEN * * Form A when A is stored in upper triangle. * IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = ALPHA*DCONJG( X( J ) ) DO 10, I = 1, J - 1 A( I, J ) = A( I, J ) + X( I )*TEMP 10 CONTINUE A( J, J ) = DBLE( A( J, J ) ) + DBLE( X( J )*TEMP ) ELSE A( J, J ) = DBLE( A( J, J ) ) END IF 20 CONTINUE ELSE JX = KX DO 40, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*DCONJG( X( JX ) ) IX = KX DO 30, I = 1, J - 1 A( I, J ) = A( I, J ) + X( IX )*TEMP IX = IX + INCX 30 CONTINUE A( J, J ) = DBLE( A( J, J ) ) + DBLE( X( JX )*TEMP ) ELSE A( J, J ) = DBLE( A( J, J ) ) END IF JX = JX + INCX 40 CONTINUE END IF ELSE * * Form A when A is stored in lower triangle. * IF( INCX.EQ.1 )THEN DO 60, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = ALPHA*DCONJG( X( J ) ) A( J, J ) = DBLE( A( J, J ) ) + DBLE( TEMP*X( J ) ) DO 50, I = J + 1, N A( I, J ) = A( I, J ) + X( I )*TEMP 50 CONTINUE ELSE A( J, J ) = DBLE( A( J, J ) ) END IF 60 CONTINUE ELSE JX = KX DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*DCONJG( X( JX ) ) A( J, J ) = DBLE( A( J, J ) ) + DBLE( TEMP*X( JX ) ) IX = JX DO 70, I = J + 1, N IX = IX + INCX A( I, J ) = A( I, J ) + X( IX )*TEMP 70 CONTINUE ELSE A( J, J ) = DBLE( A( J, J ) ) END IF JX = JX + INCX 80 CONTINUE END IF END IF * RETURN * * End of ZHER . * END OpenBLAS-0.2.20/reference/zherkf.f000066400000000000000000000252071313527062700165500ustar00rootroot00000000000000 SUBROUTINE ZHERKF( UPLO,TRANS, N, K, ALPHA, A, LDA, BETA, C, LDC ) * .. Scalar Arguments .. CHARACTER TRANS, UPLO INTEGER K, LDA, LDC, N DOUBLE PRECISION ALPHA, BETA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), C( LDC, * ) * .. 
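*
*  A calling sketch, for orientation only (names, sizes and values
*  are illustrative; the interface is documented in full below):
*
*     DOUBLE PRECISION ALPHA, BETA
*     COMPLEX*16 A( 5, 3 ), C( 5, 5 )
*     ALPHA = 1.0D+0
*     BETA  = 0.0D+0
*     CALL ZHERKF( 'L', 'N', 5, 3, ALPHA, A, 5, BETA, C, 5 )
*
*  forms the lower triangle of the 5 by 5 Hermitian matrix
*  C := A*conjg( A' ) from a 5 by 3 matrix A, assuming A has been
*  filled in.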
* * Purpose * ======= * * ZHERK performs one of the hermitian rank k operations * * C := alpha*A*conjg( A' ) + beta*C, * * or * * C := alpha*conjg( A' )*A + beta*C, * * where alpha and beta are real scalars, C is an n by n hermitian * matrix and A is an n by k matrix in the first case and a k by n * matrix in the second case. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array C is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of C * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of C * is to be referenced. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' C := alpha*A*conjg( A' ) + beta*C. * * TRANS = 'C' or 'c' C := alpha*conjg( A' )*A + beta*C. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix C. N must be * at least zero. * Unchanged on exit. * * K - INTEGER. * On entry with TRANS = 'N' or 'n', K specifies the number * of columns of the matrix A, and on entry with * TRANS = 'C' or 'c', K specifies the number of rows of the * matrix A. K must be at least zero. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is * k when TRANS = 'N' or 'n', and is n otherwise. * Before entry with TRANS = 'N' or 'n', the leading n by k * part of the array A must contain the matrix A, otherwise * the leading k by n part of the array A must contain the * matrix A. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When TRANS = 'N' or 'n' * then LDA must be at least max( 1, n ), otherwise LDA must * be at least max( 1, k ). * Unchanged on exit. * * BETA - DOUBLE PRECISION. * On entry, BETA specifies the scalar beta. * Unchanged on exit. * * C - COMPLEX*16 array of DIMENSION ( LDC, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array C must contain the upper * triangular part of the hermitian matrix and the strictly * lower triangular part of C is not referenced. On exit, the * upper triangular part of the array C is overwritten by the * upper triangular part of the updated matrix. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array C must contain the lower * triangular part of the hermitian matrix and the strictly * upper triangular part of C is not referenced. On exit, the * lower triangular part of the array C is overwritten by the * lower triangular part of the updated matrix. * Note that the imaginary parts of the diagonal elements need * not be set, they are assumed to be zero, and on exit they * are set to zero. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, n ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * -- Modified 8-Nov-93 to set C(J,J) to DBLE( C(J,J) ) when BETA = 1. * Ed Anderson, Cray Research Inc. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. 
* .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, DCONJG, MAX * .. * .. Local Scalars .. LOGICAL UPPER INTEGER I, INFO, J, L, NROWA DOUBLE PRECISION RTEMP COMPLEX*16 TEMP * .. * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) * .. * .. Executable Statements .. * * Test the input parameters. * IF( LSAME( TRANS, 'N' ) ) THEN NROWA = N ELSE NROWA = K END IF UPPER = LSAME( UPLO, 'U' ) * INFO = 0 IF( ( .NOT.UPPER ) .AND. ( .NOT.LSAME( UPLO, 'L' ) ) ) THEN INFO = 1 ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ) .AND. $ ( .NOT.LSAME( TRANS, 'C' ) ) ) THEN INFO = 2 ELSE IF( N.LT.0 ) THEN INFO = 3 ELSE IF( K.LT.0 ) THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) ) THEN INFO = 7 ELSE IF( LDC.LT.MAX( 1, N ) ) THEN INFO = 10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHERK ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ) .OR. ( ( ( ALPHA.EQ.ZERO ) .OR. ( K.EQ.0 ) ) .AND. $ ( BETA.EQ.ONE ) ) )RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO ) THEN IF( UPPER ) THEN IF( BETA.EQ.ZERO ) THEN DO 20 J = 1, N DO 10 I = 1, J C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40 J = 1, N DO 30 I = 1, J - 1 C( I, J ) = BETA*C( I, J ) 30 CONTINUE C( J, J ) = BETA*DBLE( C( J, J ) ) 40 CONTINUE END IF ELSE IF( BETA.EQ.ZERO ) THEN DO 60 J = 1, N DO 50 I = J, N C( I, J ) = ZERO 50 CONTINUE 60 CONTINUE ELSE DO 80 J = 1, N C( J, J ) = BETA*DBLE( C( J, J ) ) DO 70 I = J + 1, N C( I, J ) = BETA*C( I, J ) 70 CONTINUE 80 CONTINUE END IF END IF RETURN END IF * * Start the operations. * IF( LSAME( TRANS, 'N' ) ) THEN * * Form C := alpha*A*conjg( A' ) + beta*C. * IF( UPPER ) THEN DO 130 J = 1, N IF( BETA.EQ.ZERO ) THEN DO 90 I = 1, J C( I, J ) = ZERO 90 CONTINUE ELSE IF( BETA.NE.ONE ) THEN DO 100 I = 1, J - 1 C( I, J ) = BETA*C( I, J ) 100 CONTINUE C( J, J ) = BETA*DBLE( C( J, J ) ) ELSE C( J, J ) = DBLE( C( J, J ) ) END IF DO 120 L = 1, K IF( A( J, L ).NE.DCMPLX( ZERO ) ) THEN TEMP = ALPHA*DCONJG( A( J, L ) ) DO 110 I = 1, J - 1 C( I, J ) = C( I, J ) + TEMP*A( I, L ) 110 CONTINUE C( J, J ) = DBLE( C( J, J ) ) + $ DBLE( TEMP*A( I, L ) ) END IF 120 CONTINUE 130 CONTINUE ELSE DO 180 J = 1, N IF( BETA.EQ.ZERO ) THEN DO 140 I = J, N C( I, J ) = ZERO 140 CONTINUE ELSE IF( BETA.NE.ONE ) THEN C( J, J ) = BETA*DBLE( C( J, J ) ) DO 150 I = J + 1, N C( I, J ) = BETA*C( I, J ) 150 CONTINUE ELSE C( J, J ) = DBLE( C( J, J ) ) END IF DO 170 L = 1, K IF( A( J, L ).NE.DCMPLX( ZERO ) ) THEN TEMP = ALPHA*DCONJG( A( J, L ) ) C( J, J ) = DBLE( C( J, J ) ) + $ DBLE( TEMP*A( J, L ) ) DO 160 I = J + 1, N C( I, J ) = C( I, J ) + TEMP*A( I, L ) 160 CONTINUE END IF 170 CONTINUE 180 CONTINUE END IF ELSE * * Form C := alpha*conjg( A' )*A + beta*C. 
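*        (In this branch A is k by n and each entry of C is a k-term
*        inner product: C( I, J ) receives alpha times the sum over
*        l = 1, k of DCONJG( A( l, I ) )*A( l, J ), plus
*        beta*C( I, J ); the diagonal terms are accumulated in the
*        real variable RTEMP so that C( J, J ) remains real.)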
* IF( UPPER ) THEN DO 220 J = 1, N DO 200 I = 1, J - 1 TEMP = ZERO DO 190 L = 1, K TEMP = TEMP + DCONJG( A( L, I ) )*A( L, J ) 190 CONTINUE IF( BETA.EQ.ZERO ) THEN C( I, J ) = ALPHA*TEMP ELSE C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) END IF 200 CONTINUE RTEMP = ZERO DO 210 L = 1, K RTEMP = RTEMP + DCONJG( A( L, J ) )*A( L, J ) 210 CONTINUE IF( BETA.EQ.ZERO ) THEN C( J, J ) = ALPHA*RTEMP ELSE C( J, J ) = ALPHA*RTEMP + BETA*DBLE( C( J, J ) ) END IF 220 CONTINUE ELSE DO 260 J = 1, N RTEMP = ZERO DO 230 L = 1, K RTEMP = RTEMP + DCONJG( A( L, J ) )*A( L, J ) 230 CONTINUE IF( BETA.EQ.ZERO ) THEN C( J, J ) = ALPHA*RTEMP ELSE C( J, J ) = ALPHA*RTEMP + BETA*DBLE( C( J, J ) ) END IF DO 250 I = J + 1, N TEMP = ZERO DO 240 L = 1, K TEMP = TEMP + DCONJG( A( L, I ) )*A( L, J ) 240 CONTINUE IF( BETA.EQ.ZERO ) THEN C( I, J ) = ALPHA*TEMP ELSE C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) END IF 250 CONTINUE 260 CONTINUE END IF END IF * RETURN * * End of ZHERK . * END OpenBLAS-0.2.20/reference/zhpmvf.f000066400000000000000000000206011313527062700165620ustar00rootroot00000000000000 SUBROUTINE ZHPMVF( UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY ) * .. Scalar Arguments .. COMPLEX*16 ALPHA, BETA INTEGER INCX, INCY, N CHARACTER*1 UPLO * .. Array Arguments .. COMPLEX*16 AP( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * ZHPMV performs the matrix-vector operation * * y := alpha*A*x + beta*y, * * where alpha and beta are scalars, x and y are n element vectors and * A is an n by n hermitian matrix, supplied in packed form. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the matrix A is supplied in the packed * array AP as follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * supplied in AP. * * UPLO = 'L' or 'l' The lower triangular part of A is * supplied in AP. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX*16 . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * AP - COMPLEX*16 array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). * Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular part of the hermitian matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) * and a( 2, 2 ) respectively, and so on. * Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular part of the hermitian matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) * and a( 3, 1 ) respectively, and so on. * Note that the imaginary parts of the diagonal elements need * not be set and are assumed to be zero. * Unchanged on exit. * * X - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - COMPLEX*16 . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. On exit, Y is overwritten by the updated * vector y. * * INCY - INTEGER. 
* On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. Local Scalars .. COMPLEX*16 TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC DCONJG, DBLE * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 6 ELSE IF( INCY.EQ.0 )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZHPMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF * * Start the operations. In this version the elements of the array AP * are accessed sequentially with one pass through AP. * * First form y := beta*y. * IF( BETA.NE.ONE )THEN IF( INCY.EQ.1 )THEN IF( BETA.EQ.ZERO )THEN DO 10, I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20, I = 1, N Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO )THEN DO 30, I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40, I = 1, N Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN KK = 1 IF( LSAME( UPLO, 'U' ) )THEN * * Form y when AP contains the upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO K = KK DO 50, I = 1, J - 1 Y( I ) = Y( I ) + TEMP1*AP( K ) TEMP2 = TEMP2 + DCONJG( AP( K ) )*X( I ) K = K + 1 50 CONTINUE Y( J ) = Y( J ) + TEMP1*DBLE( AP( KK + J - 1 ) ) $ + ALPHA*TEMP2 KK = KK + J 60 CONTINUE ELSE JX = KX JY = KY DO 80, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY DO 70, K = KK, KK + J - 2 Y( IY ) = Y( IY ) + TEMP1*AP( K ) TEMP2 = TEMP2 + DCONJG( AP( K ) )*X( IX ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1*DBLE( AP( KK + J - 1 ) ) $ + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY KK = KK + J 80 CONTINUE END IF ELSE * * Form y when AP contains the lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 100, J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*DBLE( AP( KK ) ) K = KK + 1 DO 90, I = J + 1, N Y( I ) = Y( I ) + TEMP1*AP( K ) TEMP2 = TEMP2 + DCONJG( AP( K ) )*X( I ) K = K + 1 90 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 KK = KK + ( N - J + 1 ) 100 CONTINUE ELSE JX = KX JY = KY DO 120, J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*DBLE( AP( KK ) ) IX = JX IY = JY DO 110, K = KK + 1, KK + N - J IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*AP( K ) TEMP2 = TEMP2 + DCONJG( AP( K ) )*X( IX ) 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY KK = KK + ( N - J + 1 ) 120 CONTINUE END IF END IF * RETURN * * End of ZHPMV . 
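*
*     A calling sketch (names, sizes and values are illustrative
*     only): for a 4 by 4 Hermitian matrix packed columnwise,
*
*        COMPLEX*16 AP( 10 ), X( 4 ), Y( 4 ), ALPHA, BETA
*        ALPHA = ( 1.0D+0, 0.0D+0 )
*        BETA  = ( 1.0D+0, 0.0D+0 )
*        CALL ZHPMVF( 'U', 4, ALPHA, AP, X, 1, BETA, Y, 1 )
*
*     updates y := A*x + y using the upper triangle packed so that
*     AP( 1 ) = a( 1, 1 ), AP( 2 ) = a( 1, 2 ), AP( 3 ) = a( 2, 2 ),
*     and so on, n*( n + 1 )/2 = 10 elements in all.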
* END OpenBLAS-0.2.20/reference/zhpr2f.f000066400000000000000000000205251313527062700164700ustar00rootroot00000000000000 SUBROUTINE ZHPR2F( UPLO, N, ALPHA, X, INCX, Y, INCY, AP ) * .. Scalar Arguments .. COMPLEX*16 ALPHA INTEGER INCX, INCY, N CHARACTER*1 UPLO * .. Array Arguments .. COMPLEX*16 AP( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * ZHPR2 performs the hermitian rank 2 operation * * A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A, * * where alpha is a scalar, x and y are n element vectors and A is an * n by n hermitian matrix, supplied in packed form. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the matrix A is supplied in the packed * array AP as follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * supplied in AP. * * UPLO = 'L' or 'l' The lower triangular part of A is * supplied in AP. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX*16 . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. * Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * AP - COMPLEX*16 array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). * Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular part of the hermitian matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) * and a( 2, 2 ) respectively, and so on. On exit, the array * AP is overwritten by the upper triangular part of the * updated matrix. * Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular part of the hermitian matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) * and a( 3, 1 ) respectively, and so on. On exit, the array * AP is overwritten by the lower triangular part of the * updated matrix. * Note that the imaginary parts of the diagonal elements need * not be set, they are assumed to be zero, and on exit they * are set to zero. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. Local Scalars .. COMPLEX*16 TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC DCONJG, DBLE * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. 
$ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( INCY.EQ.0 )THEN INFO = 7 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZHPR2 ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Set up the start points in X and Y if the increments are not both * unity. * IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF JX = KX JY = KY END IF * * Start the operations. In this version the elements of the array AP * are accessed sequentially with one pass through AP. * KK = 1 IF( LSAME( UPLO, 'U' ) )THEN * * Form A when upper triangle is stored in AP. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 20, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*DCONJG( Y( J ) ) TEMP2 = DCONJG( ALPHA*X( J ) ) K = KK DO 10, I = 1, J - 1 AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 K = K + 1 10 CONTINUE AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) + $ DBLE( X( J )*TEMP1 + Y( J )*TEMP2 ) ELSE AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) END IF KK = KK + J 20 CONTINUE ELSE DO 40, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*DCONJG( Y( JY ) ) TEMP2 = DCONJG( ALPHA*X( JX ) ) IX = KX IY = KY DO 30, K = KK, KK + J - 2 AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 30 CONTINUE AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) + $ DBLE( X( JX )*TEMP1 + $ Y( JY )*TEMP2 ) ELSE AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) END IF JX = JX + INCX JY = JY + INCY KK = KK + J 40 CONTINUE END IF ELSE * * Form A when lower triangle is stored in AP. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*DCONJG( Y( J ) ) TEMP2 = DCONJG( ALPHA*X( J ) ) AP( KK ) = DBLE( AP( KK ) ) + $ DBLE( X( J )*TEMP1 + Y( J )*TEMP2 ) K = KK + 1 DO 50, I = J + 1, N AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 K = K + 1 50 CONTINUE ELSE AP( KK ) = DBLE( AP( KK ) ) END IF KK = KK + N - J + 1 60 CONTINUE ELSE DO 80, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*DCONJG( Y( JY ) ) TEMP2 = DCONJG( ALPHA*X( JX ) ) AP( KK ) = DBLE( AP( KK ) ) + $ DBLE( X( JX )*TEMP1 + Y( JY )*TEMP2 ) IX = JX IY = JY DO 70, K = KK + 1, KK + N - J IX = IX + INCX IY = IY + INCY AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 70 CONTINUE ELSE AP( KK ) = DBLE( AP( KK ) ) END IF JX = JX + INCX JY = JY + INCY KK = KK + N - J + 1 80 CONTINUE END IF END IF * RETURN * * End of ZHPR2 . * END OpenBLAS-0.2.20/reference/zhprf.f000066400000000000000000000155721313527062700164140ustar00rootroot00000000000000 SUBROUTINE ZHPRF ( UPLO, N, ALPHA, X, INCX, AP ) * .. Scalar Arguments .. DOUBLE PRECISION ALPHA INTEGER INCX, N CHARACTER*1 UPLO * .. Array Arguments .. COMPLEX*16 AP( * ), X( * ) * .. * * Purpose * ======= * * ZHPR performs the hermitian rank 1 operation * * A := alpha*x*conjg( x' ) + A, * * where alpha is a real scalar, x is an n element vector and A is an * n by n hermitian matrix, supplied in packed form. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the matrix A is supplied in the packed * array AP as follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * supplied in AP. * * UPLO = 'L' or 'l' The lower triangular part of A is * supplied in AP. * * Unchanged on exit. * * N - INTEGER. 
* On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION. * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * AP - COMPLEX*16 array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). * Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular part of the hermitian matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) * and a( 2, 2 ) respectively, and so on. On exit, the array * AP is overwritten by the upper triangular part of the * updated matrix. * Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular part of the hermitian matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) * and a( 3, 1 ) respectively, and so on. On exit, the array * AP is overwritten by the lower triangular part of the * updated matrix. * Note that the imaginary parts of the diagonal elements need * not be set, they are assumed to be zero, and on exit they * are set to zero. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. Local Scalars .. COMPLEX*16 TEMP INTEGER I, INFO, IX, J, JX, K, KK, KX * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC DCONJG, DBLE * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZHPR ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ALPHA.EQ.DBLE( ZERO ) ) ) $ RETURN * * Set the start point in X if the increment is not unity. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of the array AP * are accessed sequentially with one pass through AP. * KK = 1 IF( LSAME( UPLO, 'U' ) )THEN * * Form A when upper triangle is stored in AP. * IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = ALPHA*DCONJG( X( J ) ) K = KK DO 10, I = 1, J - 1 AP( K ) = AP( K ) + X( I )*TEMP K = K + 1 10 CONTINUE AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) $ + DBLE( X( J )*TEMP ) ELSE AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) END IF KK = KK + J 20 CONTINUE ELSE JX = KX DO 40, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*DCONJG( X( JX ) ) IX = KX DO 30, K = KK, KK + J - 2 AP( K ) = AP( K ) + X( IX )*TEMP IX = IX + INCX 30 CONTINUE AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) $ + DBLE( X( JX )*TEMP ) ELSE AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) END IF JX = JX + INCX KK = KK + J 40 CONTINUE END IF ELSE * * Form A when lower triangle is stored in AP. 
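*
*        In packed lower storage column J starts at AP( KK ):
*        AP( KK ) holds the (real) diagonal element a( J, J ) and
*        AP( KK + 1 ), ..., AP( KK + N - J ) hold a( J+1, J ), ...,
*        a( N, J ), so KK advances by N - J + 1 after each column.
*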
* IF( INCX.EQ.1 )THEN DO 60, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = ALPHA*DCONJG( X( J ) ) AP( KK ) = DBLE( AP( KK ) ) + DBLE( TEMP*X( J ) ) K = KK + 1 DO 50, I = J + 1, N AP( K ) = AP( K ) + X( I )*TEMP K = K + 1 50 CONTINUE ELSE AP( KK ) = DBLE( AP( KK ) ) END IF KK = KK + N - J + 1 60 CONTINUE ELSE JX = KX DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = ALPHA*DCONJG( X( JX ) ) AP( KK ) = DBLE( AP( KK ) ) + DBLE( TEMP*X( JX ) ) IX = JX DO 70, K = KK + 1, KK + N - J IX = IX + INCX AP( K ) = AP( K ) + X( IX )*TEMP 70 CONTINUE ELSE AP( KK ) = DBLE( AP( KK ) ) END IF JX = JX + INCX KK = KK + N - J + 1 80 CONTINUE END IF END IF * RETURN * * End of ZHPR . * END OpenBLAS-0.2.20/reference/zlaswpf.f000066400000000000000000000063611313527062700167450ustar00rootroot00000000000000 SUBROUTINE ZLASWPF( N, A, LDA, K1, K2, IPIV, INCX ) * * -- LAPACK auxiliary routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * June 30, 1999 * * .. Scalar Arguments .. INTEGER INCX, K1, K2, LDA, N * .. * .. Array Arguments .. INTEGER IPIV( * ) COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * ZLASWP performs a series of row interchanges on the matrix A. * One row interchange is initiated for each of rows K1 through K2 of A. * * Arguments * ========= * * N (input) INTEGER * The number of columns of the matrix A. * * A (input/output) COMPLEX*16 array, dimension (LDA,N) * On entry, the matrix of column dimension N to which the row * interchanges will be applied. * On exit, the permuted matrix. * * LDA (input) INTEGER * The leading dimension of the array A. * * K1 (input) INTEGER * The first element of IPIV for which a row interchange will * be done. * * K2 (input) INTEGER * The last element of IPIV for which a row interchange will * be done. * * IPIV (input) INTEGER array, dimension (M*abs(INCX)) * The vector of pivot indices. Only the elements in positions * K1 through K2 of IPIV are accessed. * IPIV(K) = L implies rows K and L are to be interchanged. * * INCX (input) INTEGER * The increment between successive values of IPIV. If IPIV * is negative, the pivots are applied in reverse order. * * Further Details * =============== * * Modified by * R. C. Whaley, Computer Science Dept., Univ. of Tenn., Knoxville, USA * * ===================================================================== * * .. Local Scalars .. INTEGER I, I1, I2, INC, IP, IX, IX0, J, K, N32 COMPLEX*16 TEMP * .. * .. Executable Statements .. * * Interchange row I with row IPIV(I) for each of rows K1 through K2. * IF( INCX.GT.0 ) THEN IX0 = K1 I1 = K1 I2 = K2 INC = 1 ELSE IF( INCX.LT.0 ) THEN IX0 = 1 + ( 1-K2 )*INCX I1 = K2 I2 = K1 INC = -1 ELSE RETURN END IF * N32 = ( N / 32 )*32 IF( N32.NE.0 ) THEN DO 30 J = 1, N32, 32 IX = IX0 DO 20 I = I1, I2, INC IP = IPIV( IX ) IF( IP.NE.I ) THEN DO 10 K = J, J + 31 TEMP = A( I, K ) A( I, K ) = A( IP, K ) A( IP, K ) = TEMP 10 CONTINUE END IF IX = IX + INCX 20 CONTINUE 30 CONTINUE END IF IF( N32.NE.N ) THEN N32 = N32 + 1 IX = IX0 DO 50 I = I1, I2, INC IP = IPIV( IX ) IF( IP.NE.I ) THEN DO 40 K = N32, N TEMP = A( I, K ) A( I, K ) = A( IP, K ) A( IP, K ) = TEMP 40 CONTINUE END IF IX = IX + INCX 50 CONTINUE END IF * RETURN * * End of ZLASWP * END OpenBLAS-0.2.20/reference/zlauu2f.f000066400000000000000000000102401313527062700166360ustar00rootroot00000000000000 SUBROUTINE ZLAUU2F( UPLO, N, A, LDA, INFO ) * * -- LAPACK auxiliary routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. 
* November 2006 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * ZLAUU2 computes the product U * U' or L' * L, where the triangular * factor U or L is stored in the upper or lower triangular part of * the array A. * * If UPLO = 'U' or 'u' then the upper triangle of the result is stored, * overwriting the factor U in A. * If UPLO = 'L' or 'l' then the lower triangle of the result is stored, * overwriting the factor L in A. * * This is the unblocked form of the algorithm, calling Level 2 BLAS. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether the triangular factor stored in the array A * is upper or lower triangular: * = 'U': Upper triangular * = 'L': Lower triangular * * N (input) INTEGER * The order of the triangular factor U or L. N >= 0. * * A (input/output) COMPLEX*16 array, dimension (LDA,N) * On entry, the triangular factor U or L. * On exit, if UPLO = 'U', the upper triangle of A is * overwritten with the upper triangle of the product U * U'; * if UPLO = 'L', the lower triangle of A is overwritten with * the lower triangle of the product L' * L. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER I DOUBLE PRECISION AII * .. * .. External Functions .. LOGICAL LSAME COMPLEX*16 ZDOTC EXTERNAL LSAME, ZDOTC * .. * .. External Subroutines .. EXTERNAL XERBLA, ZDSCAL, ZGEMV, ZLACGV * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLAUU2', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( UPPER ) THEN * * Compute the product U * U'. * DO 10 I = 1, N AII = A( I, I ) IF( I.LT.N ) THEN A( I, I ) = AII*AII + DBLE( ZDOTC( N-I, A( I, I+1 ), LDA, $ A( I, I+1 ), LDA ) ) CALL ZLACGV( N-I, A( I, I+1 ), LDA ) CALL ZGEMV( 'No transpose', I-1, N-I, ONE, A( 1, I+1 ), $ LDA, A( I, I+1 ), LDA, DCMPLX( AII ), $ A( 1, I ), 1 ) CALL ZLACGV( N-I, A( I, I+1 ), LDA ) ELSE CALL ZDSCAL( I, AII, A( 1, I ), 1 ) END IF 10 CONTINUE * ELSE * * Compute the product L' * L. * DO 20 I = 1, N AII = A( I, I ) IF( I.LT.N ) THEN A( I, I ) = AII*AII + DBLE( ZDOTC( N-I, A( I+1, I ), 1, $ A( I+1, I ), 1 ) ) CALL ZLACGV( I-1, A( I, 1 ), LDA ) CALL ZGEMV( 'Conjugate transpose', N-I, I-1, ONE, $ A( I+1, 1 ), LDA, A( I+1, I ), 1, $ DCMPLX( AII ), A( I, 1 ), LDA ) CALL ZLACGV( I-1, A( I, 1 ), LDA ) ELSE CALL ZDSCAL( I, AII, A( I, 1 ), LDA ) END IF 20 CONTINUE END IF * RETURN * * End of ZLAUU2 * END OpenBLAS-0.2.20/reference/zlauumf.f000066400000000000000000000116221313527062700167360ustar00rootroot00000000000000 SUBROUTINE ZLAUUMF( UPLO, N, A, LDA, INFO ) * * -- LAPACK auxiliary routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ) * .. 
* * Purpose * ======= * * ZLAUUM computes the product U * U' or L' * L, where the triangular * factor U or L is stored in the upper or lower triangular part of * the array A. * * If UPLO = 'U' or 'u' then the upper triangle of the result is stored, * overwriting the factor U in A. * If UPLO = 'L' or 'l' then the lower triangle of the result is stored, * overwriting the factor L in A. * * This is the blocked form of the algorithm, calling Level 3 BLAS. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether the triangular factor stored in the array A * is upper or lower triangular: * = 'U': Upper triangular * = 'L': Lower triangular * * N (input) INTEGER * The order of the triangular factor U or L. N >= 0. * * A (input/output) COMPLEX*16 array, dimension (LDA,N) * On entry, the triangular factor U or L. * On exit, if UPLO = 'U', the upper triangle of A is * overwritten with the upper triangle of the product U * U'; * if UPLO = 'L', the lower triangle of A is overwritten with * the lower triangle of the product L' * L. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) COMPLEX*16 CONE PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER I, IB, NB * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA, ZGEMM, ZHERK, ZLAUU2, ZTRMM * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLAUUM', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Determine the block size for this environment. * NB = 128 * IF( NB.LE.1 .OR. NB.GE.N ) THEN * * Use unblocked code * CALL ZLAUU2( UPLO, N, A, LDA, INFO ) ELSE * * Use blocked code * IF( UPPER ) THEN * * Compute the product U * U'. * DO 10 I = 1, N, NB IB = MIN( NB, N-I+1 ) CALL ZTRMM( 'Right', 'Upper', 'Conjugate transpose', $ 'Non-unit', I-1, IB, CONE, A( I, I ), LDA, $ A( 1, I ), LDA ) CALL ZLAUU2( 'Upper', IB, A( I, I ), LDA, INFO ) IF( I+IB.LE.N ) THEN CALL ZGEMM( 'No transpose', 'Conjugate transpose', $ I-1, IB, N-I-IB+1, CONE, A( 1, I+IB ), $ LDA, A( I, I+IB ), LDA, CONE, A( 1, I ), $ LDA ) CALL ZHERK( 'Upper', 'No transpose', IB, N-I-IB+1, $ ONE, A( I, I+IB ), LDA, ONE, A( I, I ), $ LDA ) END IF 10 CONTINUE ELSE * * Compute the product L' * L. 
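*
*           Blocked update: for each diagonal block of order
*           IB = MIN( NB, N-I+1 ), ZTRMM applies the conjugate
*           transpose of the diagonal block to the panel
*           A( I:I+IB-1, 1:I-1 ), ZLAUU2 forms L'*L for the
*           diagonal block in place, and, when I+IB .LE. N, ZGEMM
*           and ZHERK add the contributions of the rows below the
*           block.
*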
* DO 20 I = 1, N, NB IB = MIN( NB, N-I+1 ) CALL ZTRMM( 'Left', 'Lower', 'Conjugate transpose', $ 'Non-unit', IB, I-1, CONE, A( I, I ), LDA, $ A( I, 1 ), LDA ) CALL ZLAUU2( 'Lower', IB, A( I, I ), LDA, INFO ) IF( I+IB.LE.N ) THEN CALL ZGEMM( 'Conjugate transpose', 'No transpose', IB, $ I-1, N-I-IB+1, CONE, A( I+IB, I ), LDA, $ A( I+IB, 1 ), LDA, CONE, A( I, 1 ), LDA ) CALL ZHERK( 'Lower', 'Conjugate transpose', IB, $ N-I-IB+1, ONE, A( I+IB, I ), LDA, ONE, $ A( I, I ), LDA ) END IF 20 CONTINUE END IF END IF * RETURN * * End of ZLAUUM * END OpenBLAS-0.2.20/reference/zpotf2f.f000066400000000000000000000121361313527062700166460ustar00rootroot00000000000000 SUBROUTINE ZPOTF2F( UPLO, N, A, LDA, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * ZPOTF2 computes the Cholesky factorization of a complex Hermitian * positive definite matrix A. * * The factorization has the form * A = U' * U , if UPLO = 'U', or * A = L * L', if UPLO = 'L', * where U is an upper triangular matrix and L is lower triangular. * * This is the unblocked version of the algorithm, calling Level 2 BLAS. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether the upper or lower triangular part of the * Hermitian matrix A is stored. * = 'U': Upper triangular * = 'L': Lower triangular * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input/output) COMPLEX*16 array, dimension (LDA,N) * On entry, the Hermitian matrix A. If UPLO = 'U', the leading * n by n upper triangular part of A contains the upper * triangular part of the matrix A, and the strictly lower * triangular part of A is not referenced. If UPLO = 'L', the * leading n by n lower triangular part of A contains the lower * triangular part of the matrix A, and the strictly upper * triangular part of A is not referenced. * * On exit, if INFO = 0, the factor U or L from the Cholesky * factorization A = U'*U or A = L*L'. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * > 0: if INFO = k, the leading minor of order k is not * positive definite, and the factorization could not be * completed. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) COMPLEX*16 CONE PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER J DOUBLE PRECISION AJJ * .. * .. External Functions .. LOGICAL LSAME COMPLEX*16 ZDOTC EXTERNAL LSAME, ZDOTC * .. * .. External Subroutines .. EXTERNAL XERBLA, ZDSCAL, ZGEMV, ZLACGV * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, SQRT * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZPOTF2', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * IF( UPPER ) THEN * * Compute the Cholesky factorization A = U'*U. * DO 10 J = 1, N * * Compute U(J,J) and test for non-positive-definiteness. 
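*
*           A( 1:J-1, J ) already holds U( 1:J-1, J ), so the code
*           below computes
*
*              U( J, J ) = SQRT( A( J, J ) -
*                                SUM( ABS( U( K, J ) )**2, K=1,J-1 ) )
*
*           and a non-positive intermediate value means the leading
*           minor of order J is not positive definite (INFO = J).
*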
* AJJ = DBLE( A( J, J ) ) - ZDOTC( J-1, A( 1, J ), 1, $ A( 1, J ), 1 ) IF( AJJ.LE.ZERO ) THEN A( J, J ) = AJJ GO TO 30 END IF AJJ = SQRT( AJJ ) A( J, J ) = AJJ * * Compute elements J+1:N of row J. * IF( J.LT.N ) THEN CALL ZLACGV( J-1, A( 1, J ), 1 ) CALL ZGEMV( 'Transpose', J-1, N-J, -CONE, A( 1, J+1 ), $ LDA, A( 1, J ), 1, CONE, A( J, J+1 ), LDA ) CALL ZLACGV( J-1, A( 1, J ), 1 ) CALL ZDSCAL( N-J, ONE / AJJ, A( J, J+1 ), LDA ) END IF 10 CONTINUE ELSE * * Compute the Cholesky factorization A = L*L'. * DO 20 J = 1, N * * Compute L(J,J) and test for non-positive-definiteness. * AJJ = DBLE( A( J, J ) ) - ZDOTC( J-1, A( J, 1 ), LDA, $ A( J, 1 ), LDA ) IF( AJJ.LE.ZERO ) THEN A( J, J ) = AJJ GO TO 30 END IF AJJ = SQRT( AJJ ) A( J, J ) = AJJ * * Compute elements J+1:N of column J. * IF( J.LT.N ) THEN CALL ZLACGV( J-1, A( J, 1 ), LDA ) CALL ZGEMV( 'No transpose', N-J, J-1, -CONE, A( J+1, 1 ), $ LDA, A( J, 1 ), LDA, CONE, A( J+1, J ), 1 ) CALL ZLACGV( J-1, A( J, 1 ), LDA ) CALL ZDSCAL( N-J, ONE / AJJ, A( J+1, J ), 1 ) END IF 20 CONTINUE END IF GO TO 40 * 30 CONTINUE INFO = J * 40 CONTINUE RETURN * * End of ZPOTF2 * END OpenBLAS-0.2.20/reference/zpotrff.f000066400000000000000000000132331313527062700167450ustar00rootroot00000000000000 SUBROUTINE ZPOTRFF( UPLO, N, A, LDA, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * ZPOTRF computes the Cholesky factorization of a complex Hermitian * positive definite matrix A. * * The factorization has the form * A = U**H * U, if UPLO = 'U', or * A = L * L**H, if UPLO = 'L', * where U is an upper triangular matrix and L is lower triangular. * * This is the block version of the algorithm, calling Level 3 BLAS. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * = 'U': Upper triangle of A is stored; * = 'L': Lower triangle of A is stored. * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input/output) COMPLEX*16 array, dimension (LDA,N) * On entry, the Hermitian matrix A. If UPLO = 'U', the leading * N-by-N upper triangular part of A contains the upper * triangular part of the matrix A, and the strictly lower * triangular part of A is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of A contains the lower * triangular part of the matrix A, and the strictly upper * triangular part of A is not referenced. * * On exit, if INFO = 0, the factor U or L from the Cholesky * factorization A = U**H*U or A = L*L**H. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, the leading minor of order i is not * positive definite, and the factorization could not be * completed. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ONE COMPLEX*16 CONE PARAMETER ( ONE = 1.0D+0, CONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL UPPER INTEGER J, JB, NB * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA, ZGEMM, ZHERK, ZPOTF2, ZTRSM * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. 
* INFO = 0 UPPER = LSAME( UPLO, 'U' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZPOTRF', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Determine the block size for this environment. * NB = 56 IF( NB.LE.1 .OR. NB.GE.N ) THEN * * Use unblocked code. * CALL ZPOTF2( UPLO, N, A, LDA, INFO ) ELSE * * Use blocked code. * IF( UPPER ) THEN * * Compute the Cholesky factorization A = U'*U. * DO 10 J = 1, N, NB * * Update and factorize the current diagonal block and test * for non-positive-definiteness. * JB = MIN( NB, N-J+1 ) CALL ZHERK( 'Upper', 'Conjugate transpose', JB, J-1, $ -ONE, A( 1, J ), LDA, ONE, A( J, J ), LDA ) CALL ZPOTF2( 'Upper', JB, A( J, J ), LDA, INFO ) IF( INFO.NE.0 ) $ GO TO 30 IF( J+JB.LE.N ) THEN * * Compute the current block row. * CALL ZGEMM( 'Conjugate transpose', 'No transpose', JB, $ N-J-JB+1, J-1, -CONE, A( 1, J ), LDA, $ A( 1, J+JB ), LDA, CONE, A( J, J+JB ), $ LDA ) CALL ZTRSM( 'Left', 'Upper', 'Conjugate transpose', $ 'Non-unit', JB, N-J-JB+1, CONE, A( J, J ), $ LDA, A( J, J+JB ), LDA ) END IF 10 CONTINUE * ELSE * * Compute the Cholesky factorization A = L*L'. * DO 20 J = 1, N, NB * * Update and factorize the current diagonal block and test * for non-positive-definiteness. * JB = MIN( NB, N-J+1 ) CALL ZHERK( 'Lower', 'No transpose', JB, J-1, -ONE, $ A( J, 1 ), LDA, ONE, A( J, J ), LDA ) CALL ZPOTF2( 'Lower', JB, A( J, J ), LDA, INFO ) IF( INFO.NE.0 ) $ GO TO 30 IF( J+JB.LE.N ) THEN * * Compute the current block column. * CALL ZGEMM( 'No transpose', 'Conjugate transpose', $ N-J-JB+1, JB, J-1, -CONE, A( J+JB, 1 ), $ LDA, A( J, 1 ), LDA, CONE, A( J+JB, J ), $ LDA ) CALL ZTRSM( 'Right', 'Lower', 'Conjugate transpose', $ 'Non-unit', N-J-JB+1, JB, CONE, A( J, J ), $ LDA, A( J+JB, J ), LDA ) END IF 20 CONTINUE END IF END IF GO TO 40 * 30 CONTINUE INFO = INFO + J - 1 * 40 CONTINUE RETURN * * End of ZPOTRF * END OpenBLAS-0.2.20/reference/zpotrif.f000066400000000000000000000050361313527062700167520ustar00rootroot00000000000000 SUBROUTINE ZPOTRIF( UPLO, N, A, LDA, INFO ) * * -- LAPACK routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * ZPOTRI computes the inverse of a complex Hermitian positive definite * matrix A using the Cholesky factorization A = U**H*U or A = L*L**H * computed by ZPOTRF. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * = 'U': Upper triangle of A is stored; * = 'L': Lower triangle of A is stored. * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input/output) COMPLEX*16 array, dimension (LDA,N) * On entry, the triangular factor U or L from the Cholesky * factorization A = U**H*U or A = L*L**H, as computed by * ZPOTRF. * On exit, the upper or lower triangle of the (Hermitian) * inverse of A, overwriting the input factor U or L. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, the (i,i) element of the factor U or L is * zero, and the inverse could not be computed. * * ===================================================================== * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. 
External Subroutines .. EXTERNAL XERBLA, ZLAUUM, ZTRTRI * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZPOTRI', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Invert the triangular Cholesky factor U or L. * CALL ZTRTRI( UPLO, 'Non-unit', N, A, LDA, INFO ) IF( INFO.GT.0 ) $ RETURN * * Form inv(U)*inv(U)' or inv(L)'*inv(L). * CALL ZLAUUM( UPLO, N, A, LDA, INFO ) * RETURN * * End of ZPOTRI * END OpenBLAS-0.2.20/reference/zrotgf.f000066400000000000000000000012051313527062700165620ustar00rootroot00000000000000 subroutine zrotgf(ca,cb,c,s) double complex ca,cb,s double precision c double precision norm,scale double complex alpha if (cdabs(ca) .ne. 0.0d0) go to 10 c = 0.0d0 s = (1.0d0,0.0d0) ca = cb go to 20 10 continue scale = cdabs(ca) + cdabs(cb) norm = scale*dsqrt((cdabs(ca/dcmplx(scale,0.0d0)))**2 + * (cdabs(cb/dcmplx(scale,0.0d0)))**2) alpha = ca /cdabs(ca) c = cdabs(ca) / norm s = alpha * dconjg(cb) / norm ca = alpha * norm 20 continue return end OpenBLAS-0.2.20/reference/zsbmvf.f000066400000000000000000000226521313527062700165670ustar00rootroot00000000000000 SUBROUTINE ZSBMVF(UPLO, N, K, ALPHA, A, LDA, X, INCX, BETA, Y, $ INCY ) * * -- LAPACK auxiliary routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INCX, INCY, K, LDA, N COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * ZSBMV performs the matrix-vector operation * * y := alpha*A*x + beta*y, * * where alpha and beta are scalars, x and y are n element vectors and * A is an n by n symmetric band matrix, with k super-diagonals. * * Arguments * ========== * * UPLO - CHARACTER*1 * On entry, UPLO specifies whether the upper or lower * triangular part of the band matrix A is being supplied as * follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * being supplied. * * UPLO = 'L' or 'l' The lower triangular part of A is * being supplied. * * Unchanged on exit. * * N - INTEGER * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * K - INTEGER * On entry, K specifies the number of super-diagonals of the * matrix A. K must satisfy 0 .le. K. * Unchanged on exit. * * ALPHA - COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX*16 array, dimension( LDA, N ) * Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) * by n part of the array A must contain the upper triangular * band part of the symmetric matrix, supplied column by * column, with the leading diagonal of the matrix in row * ( k + 1 ) of the array, the first super-diagonal starting at * position 2 in row k, and so on. The top left k by k triangle * of the array A is not referenced. 
* The following program segment will transfer the upper * triangular part of a symmetric band matrix from conventional * full matrix storage to band storage: * * DO 20, J = 1, N * M = K + 1 - J * DO 10, I = MAX( 1, J - K ), J * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) * by n part of the array A must contain the lower triangular * band part of the symmetric matrix, supplied column by * column, with the leading diagonal of the matrix in row 1 of * the array, the first sub-diagonal starting at position 1 in * row 2, and so on. The bottom right k by k triangle of the * array A is not referenced. * The following program segment will transfer the lower * triangular part of a symmetric band matrix from conventional * full matrix storage to band storage: * * DO 20, J = 1, N * M = 1 - J * DO 10, I = J, MIN( N, J + K ) * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Unchanged on exit. * * LDA - INTEGER * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * ( k + 1 ). * Unchanged on exit. * * X - COMPLEX*16 array, dimension at least * ( 1 + ( N - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the * vector x. * Unchanged on exit. * * INCX - INTEGER * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA - COMPLEX*16 * On entry, BETA specifies the scalar beta. * Unchanged on exit. * * Y - COMPLEX*16 array, dimension at least * ( 1 + ( N - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the * vector y. On exit, Y is overwritten by the updated vector y. * * INCY - INTEGER * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L COMPLEX*16 TEMP1, TEMP2 * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = 1 ELSE IF( N.LT.0 ) THEN INFO = 2 ELSE IF( K.LT.0 ) THEN INFO = 3 ELSE IF( LDA.LT.( K+1 ) ) THEN INFO = 6 ELSE IF( INCX.EQ.0 ) THEN INFO = 8 ELSE IF( INCY.EQ.0 ) THEN INFO = 11 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZSBMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ) .OR. ( ( ALPHA.EQ.ZERO ) .AND. ( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 ) THEN KX = 1 ELSE KX = 1 - ( N-1 )*INCX END IF IF( INCY.GT.0 ) THEN KY = 1 ELSE KY = 1 - ( N-1 )*INCY END IF * * Start the operations. In this version the elements of the array A * are accessed sequentially with one pass through A. * * First form y := beta*y. 
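*
*     When BETA is zero Y is set to zero explicitly rather than
*     scaled, so Y need not be defined on entry in that case; the
*     banded products with A are then accumulated into Y.
*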
* IF( BETA.NE.ONE ) THEN IF( INCY.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN DO 10 I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20 I = 1, N Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO ) THEN DO 30 I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40 I = 1, N Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN IF( LSAME( UPLO, 'U' ) ) THEN * * Form y when upper triangle of A is stored. * KPLUS1 = K + 1 IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN DO 60 J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO L = KPLUS1 - J DO 50 I = MAX( 1, J-K ), J - 1 Y( I ) = Y( I ) + TEMP1*A( L+I, J ) TEMP2 = TEMP2 + A( L+I, J )*X( I ) 50 CONTINUE Y( J ) = Y( J ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 60 CONTINUE ELSE JX = KX JY = KY DO 80 J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY L = KPLUS1 - J DO 70 I = MAX( 1, J-K ), J - 1 Y( IY ) = Y( IY ) + TEMP1*A( L+I, J ) TEMP2 = TEMP2 + A( L+I, J )*X( IX ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY IF( J.GT.K ) THEN KX = KX + INCX KY = KY + INCY END IF 80 CONTINUE END IF ELSE * * Form y when lower triangle of A is stored. * IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN DO 100 J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*A( 1, J ) L = 1 - J DO 90 I = J + 1, MIN( N, J+K ) Y( I ) = Y( I ) + TEMP1*A( L+I, J ) TEMP2 = TEMP2 + A( L+I, J )*X( I ) 90 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 100 CONTINUE ELSE JX = KX JY = KY DO 120 J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*A( 1, J ) L = 1 - J IX = JX IY = JY DO 110 I = J + 1, MIN( N, J+K ) IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*A( L+I, J ) TEMP2 = TEMP2 + A( L+I, J )*X( IX ) 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 120 CONTINUE END IF END IF * RETURN * * End of ZSBMV * END OpenBLAS-0.2.20/reference/zscalf.f000066400000000000000000000011701313527062700165320ustar00rootroot00000000000000 subroutine zscalf(n,za,zx,incx) c c scales a vector by a constant. c jack dongarra, 3/11/78. c modified 3/93 to return if incx .le. 0. c modified 12/3/93, array(1) declarations changed to array(*) c double complex za,zx(*) integer i,incx,ix,n c if( n.le.0 .or. incx.le.0 )return if(incx.eq.1)go to 20 c c code for increment not equal to 1 c ix = 1 do 10 i = 1,n zx(ix) = za*zx(ix) ix = ix + incx 10 continue return c c code for increment equal to 1 c 20 do 30 i = 1,n zx(i) = za*zx(i) 30 continue return end OpenBLAS-0.2.20/reference/zspmvf.f000066400000000000000000000175741313527062700166140ustar00rootroot00000000000000 SUBROUTINE ZSPMVF(UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY ) * * -- LAPACK auxiliary routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INCX, INCY, N COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. COMPLEX*16 AP( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * ZSPMV performs the matrix-vector operation * * y := alpha*A*x + beta*y, * * where alpha and beta are scalars, x and y are n element vectors and * A is an n by n symmetric matrix, supplied in packed form. * * Arguments * ========== * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies whether the upper or lower * triangular part of the matrix A is supplied in the packed * array AP as follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * supplied in AP. 
* * UPLO = 'L' or 'l' The lower triangular part of A is * supplied in AP. * * Unchanged on exit. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA (input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * AP (input) COMPLEX*16 array, dimension at least * ( ( N*( N + 1 ) )/2 ). * Before entry, with UPLO = 'U' or 'u', the array AP must * contain the upper triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) * and a( 2, 2 ) respectively, and so on. * Before entry, with UPLO = 'L' or 'l', the array AP must * contain the lower triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) * and a( 3, 1 ) respectively, and so on. * Unchanged on exit. * * X (input) COMPLEX*16 array, dimension at least * ( 1 + ( N - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the N- * element vector x. * Unchanged on exit. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA (input) COMPLEX*16 * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y (input/output) COMPLEX*16 array, dimension at least * ( 1 + ( N - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. On exit, Y is overwritten by the updated * vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY COMPLEX*16 TEMP1, TEMP2 * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = 1 ELSE IF( N.LT.0 ) THEN INFO = 2 ELSE IF( INCX.EQ.0 ) THEN INFO = 6 ELSE IF( INCY.EQ.0 ) THEN INFO = 9 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZSPMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ) .OR. ( ( ALPHA.EQ.ZERO ) .AND. ( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 ) THEN KX = 1 ELSE KX = 1 - ( N-1 )*INCX END IF IF( INCY.GT.0 ) THEN KY = 1 ELSE KY = 1 - ( N-1 )*INCY END IF * * Start the operations. In this version the elements of the array AP * are accessed sequentially with one pass through AP. * * First form y := beta*y. * IF( BETA.NE.ONE ) THEN IF( INCY.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN DO 10 I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20 I = 1, N Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO ) THEN DO 30 I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40 I = 1, N Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN KK = 1 IF( LSAME( UPLO, 'U' ) ) THEN * * Form y when AP contains the upper triangle. * IF( ( INCX.EQ.1 ) .AND. 
( INCY.EQ.1 ) ) THEN DO 60 J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO K = KK DO 50 I = 1, J - 1 Y( I ) = Y( I ) + TEMP1*AP( K ) TEMP2 = TEMP2 + AP( K )*X( I ) K = K + 1 50 CONTINUE Y( J ) = Y( J ) + TEMP1*AP( KK+J-1 ) + ALPHA*TEMP2 KK = KK + J 60 CONTINUE ELSE JX = KX JY = KY DO 80 J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY DO 70 K = KK, KK + J - 2 Y( IY ) = Y( IY ) + TEMP1*AP( K ) TEMP2 = TEMP2 + AP( K )*X( IX ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1*AP( KK+J-1 ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY KK = KK + J 80 CONTINUE END IF ELSE * * Form y when AP contains the lower triangle. * IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN DO 100 J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*AP( KK ) K = KK + 1 DO 90 I = J + 1, N Y( I ) = Y( I ) + TEMP1*AP( K ) TEMP2 = TEMP2 + AP( K )*X( I ) K = K + 1 90 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 KK = KK + ( N-J+1 ) 100 CONTINUE ELSE JX = KX JY = KY DO 120 J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*AP( KK ) IX = JX IY = JY DO 110 K = KK + 1, KK + N - J IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*AP( K ) TEMP2 = TEMP2 + AP( K )*X( IX ) 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY KK = KK + ( N-J+1 ) 120 CONTINUE END IF END IF * RETURN * * End of ZSPMV * END OpenBLAS-0.2.20/reference/zspr2f.f000066400000000000000000000161311313527062700165010ustar00rootroot00000000000000 SUBROUTINE ZSPR2F( UPLO, N, ALPHA, X, INCX, Y, INCY, AP ) * .. Scalar Arguments .. COMPLEX*16 ALPHA INTEGER INCX, INCY, N CHARACTER*1 UPLO * .. Array Arguments .. COMPLEX*16 AP( * ), X( * ), Y( * ) * .. * * Purpose * ======= * * DSPR2 performs the symmetric rank 2 operation * * A := alpha*x*y' + alpha*y*x' + A, * * where alpha is a scalar, x and y are n element vectors and A is an * n by n symmetric matrix, supplied in packed form. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the matrix A is supplied in the packed * array AP as follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * supplied in AP. * * UPLO = 'L' or 'l' The lower triangular part of A is * supplied in AP. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION. * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. * Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * AP - DOUBLE PRECISION array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). * Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) * and a( 2, 2 ) respectively, and so on. 
On exit, the array * AP is overwritten by the upper triangular part of the * updated matrix. * Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) * and a( 3, 1 ) respectively, and so on. On exit, the array * AP is overwritten by the lower triangular part of the * updated matrix. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. Local Scalars .. COMPLEX*16 TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( INCY.EQ.0 )THEN INFO = 7 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DSPR2 ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Set up the start points in X and Y if the increments are not both * unity. * IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF JX = KX JY = KY END IF * * Start the operations. In this version the elements of the array AP * are accessed sequentially with one pass through AP. * KK = 1 IF( LSAME( UPLO, 'U' ) )THEN * * Form A when upper triangle is stored in AP. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 20, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( J ) TEMP2 = ALPHA*X( J ) K = KK DO 10, I = 1, J AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 K = K + 1 10 CONTINUE END IF KK = KK + J 20 CONTINUE ELSE DO 40, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( JY ) TEMP2 = ALPHA*X( JX ) IX = KX IY = KY DO 30, K = KK, KK + J - 1 AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 30 CONTINUE END IF JX = JX + INCX JY = JY + INCY KK = KK + J 40 CONTINUE END IF ELSE * * Form A when lower triangle is stored in AP. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( J ) TEMP2 = ALPHA*X( J ) K = KK DO 50, I = J, N AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 K = K + 1 50 CONTINUE END IF KK = KK + N - J + 1 60 CONTINUE ELSE DO 80, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( JY ) TEMP2 = ALPHA*X( JX ) IX = JX IY = JY DO 70, K = KK, KK + N - J AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 70 CONTINUE END IF JX = JX + INCX JY = JY + INCY KK = KK + N - J + 1 80 CONTINUE END IF END IF * RETURN * * End of DSPR2 . * END OpenBLAS-0.2.20/reference/zsprf.f000066400000000000000000000150011313527062700164120ustar00rootroot00000000000000 SUBROUTINE ZSPRF( UPLO, N, ALPHA, X, INCX, AP ) * * -- LAPACK auxiliary routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INCX, N COMPLEX*16 ALPHA * .. * .. Array Arguments .. 
COMPLEX*16 AP( * ), X( * ) * .. * * Purpose * ======= * * ZSPR performs the symmetric rank 1 operation * * A := alpha*x*conjg( x' ) + A, * * where alpha is a complex scalar, x is an n element vector and A is an * n by n symmetric matrix, supplied in packed form. * * Arguments * ========== * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies whether the upper or lower * triangular part of the matrix A is supplied in the packed * array AP as follows: * * UPLO = 'U' or 'u' The upper triangular part of A is * supplied in AP. * * UPLO = 'L' or 'l' The lower triangular part of A is * supplied in AP. * * Unchanged on exit. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA (input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X (input) COMPLEX*16 array, dimension at least * ( 1 + ( N - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the N- * element vector x. * Unchanged on exit. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * AP (input/output) COMPLEX*16 array, dimension at least * ( ( N*( N + 1 ) )/2 ). * Before entry, with UPLO = 'U' or 'u', the array AP must * contain the upper triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) * and a( 2, 2 ) respectively, and so on. On exit, the array * AP is overwritten by the upper triangular part of the * updated matrix. * Before entry, with UPLO = 'L' or 'l', the array AP must * contain the lower triangular part of the symmetric matrix * packed sequentially, column by column, so that AP( 1 ) * contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) * and a( 3, 1 ) respectively, and so on. On exit, the array * AP is overwritten by the lower triangular part of the * updated matrix. * Note that the imaginary parts of the diagonal elements need * not be set, they are assumed to be zero, and on exit they * are set to zero. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, J, JX, K, KK, KX COMPLEX*16 TEMP * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = 1 ELSE IF( N.LT.0 ) THEN INFO = 2 ELSE IF( INCX.EQ.0 ) THEN INFO = 5 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZSPR ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ) .OR. ( ALPHA.EQ.ZERO ) ) $ RETURN * * Set the start point in X if the increment is not unity. * IF( INCX.LE.0 ) THEN KX = 1 - ( N-1 )*INCX ELSE IF( INCX.NE.1 ) THEN KX = 1 END IF * * Start the operations. In this version the elements of the array AP * are accessed sequentially with one pass through AP. * KK = 1 IF( LSAME( UPLO, 'U' ) ) THEN * * Form A when upper triangle is stored in AP. 
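*
*     Note that TEMP = ALPHA*X( J ) below is used without DCONJG
*     (compare ZHPR), so AP receives alpha*x( i )*x( j ) for
*     i .le. j, i.e. a complex symmetric rather than Hermitian
*     rank 1 update.
*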
* IF( INCX.EQ.1 ) THEN DO 20 J = 1, N IF( X( J ).NE.ZERO ) THEN TEMP = ALPHA*X( J ) K = KK DO 10 I = 1, J - 1 AP( K ) = AP( K ) + X( I )*TEMP K = K + 1 10 CONTINUE AP( KK+J-1 ) = AP( KK+J-1 ) + X( J )*TEMP ELSE AP( KK+J-1 ) = AP( KK+J-1 ) END IF KK = KK + J 20 CONTINUE ELSE JX = KX DO 40 J = 1, N IF( X( JX ).NE.ZERO ) THEN TEMP = ALPHA*X( JX ) IX = KX DO 30 K = KK, KK + J - 2 AP( K ) = AP( K ) + X( IX )*TEMP IX = IX + INCX 30 CONTINUE AP( KK+J-1 ) = AP( KK+J-1 ) + X( JX )*TEMP ELSE AP( KK+J-1 ) = AP( KK+J-1 ) END IF JX = JX + INCX KK = KK + J 40 CONTINUE END IF ELSE * * Form A when lower triangle is stored in AP. * IF( INCX.EQ.1 ) THEN DO 60 J = 1, N IF( X( J ).NE.ZERO ) THEN TEMP = ALPHA*X( J ) AP( KK ) = AP( KK ) + TEMP*X( J ) K = KK + 1 DO 50 I = J + 1, N AP( K ) = AP( K ) + X( I )*TEMP K = K + 1 50 CONTINUE ELSE AP( KK ) = AP( KK ) END IF KK = KK + N - J + 1 60 CONTINUE ELSE JX = KX DO 80 J = 1, N IF( X( JX ).NE.ZERO ) THEN TEMP = ALPHA*X( JX ) AP( KK ) = AP( KK ) + TEMP*X( JX ) IX = JX DO 70 K = KK + 1, KK + N - J IX = IX + INCX AP( K ) = AP( K ) + X( IX )*TEMP 70 CONTINUE ELSE AP( KK ) = AP( KK ) END IF JX = JX + INCX KK = KK + N - J + 1 80 CONTINUE END IF END IF * RETURN * * End of ZSPR * END OpenBLAS-0.2.20/reference/zswapf.f000066400000000000000000000015121313527062700165620ustar00rootroot00000000000000 subroutine zswapf (n,zx,incx,zy,incy) c c interchanges two vectors. c jack dongarra, 3/11/78. c modified 12/3/93, array(1) declarations changed to array(*) c double complex zx(*),zy(*),ztemp integer i,incx,incy,ix,iy,n c if(n.le.0)return if(incx.eq.1.and.incy.eq.1)go to 20 c c code for unequal increments or equal increments not equal c to 1 c ix = 1 iy = 1 if(incx.lt.0)ix = (-n+1)*incx + 1 if(incy.lt.0)iy = (-n+1)*incy + 1 do 10 i = 1,n ztemp = zx(ix) zx(ix) = zy(iy) zy(iy) = ztemp ix = ix + incx iy = iy + incy 10 continue return c c code for both increments equal to 1 20 do 30 i = 1,n ztemp = zx(i) zx(i) = zy(i) zy(i) = ztemp 30 continue return end OpenBLAS-0.2.20/reference/zsymm3mf.f000066400000000000000000000232521313527062700170420ustar00rootroot00000000000000 SUBROUTINE ZSYMM3MF( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC ) * .. Scalar Arguments .. CHARACTER*1 SIDE, UPLO INTEGER M, N, LDA, LDB, LDC COMPLEX*16 ALPHA, BETA * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) * .. * * Purpose * ======= * * ZSYMM performs one of the matrix-matrix operations * * C := alpha*A*B + beta*C, * * or * * C := alpha*B*A + beta*C, * * where alpha and beta are scalars, A is a symmetric matrix and B and * C are m by n matrices. * * Parameters * ========== * * SIDE - CHARACTER*1. * On entry, SIDE specifies whether the symmetric matrix A * appears on the left or right in the operation as follows: * * SIDE = 'L' or 'l' C := alpha*A*B + beta*C, * * SIDE = 'R' or 'r' C := alpha*B*A + beta*C, * * Unchanged on exit. * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the symmetric matrix A is to be * referenced as follows: * * UPLO = 'U' or 'u' Only the upper triangular part of the * symmetric matrix is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of the * symmetric matrix is to be referenced. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix C. * M must be at least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix C. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX*16 . 
* On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is * m when SIDE = 'L' or 'l' and is n otherwise. * Before entry with SIDE = 'L' or 'l', the m by m part of * the array A must contain the symmetric matrix, such that * when UPLO = 'U' or 'u', the leading m by m upper triangular * part of the array A must contain the upper triangular part * of the symmetric matrix and the strictly lower triangular * part of A is not referenced, and when UPLO = 'L' or 'l', * the leading m by m lower triangular part of the array A * must contain the lower triangular part of the symmetric * matrix and the strictly upper triangular part of A is not * referenced. * Before entry with SIDE = 'R' or 'r', the n by n part of * the array A must contain the symmetric matrix, such that * when UPLO = 'U' or 'u', the leading n by n upper triangular * part of the array A must contain the upper triangular part * of the symmetric matrix and the strictly lower triangular * part of A is not referenced, and when UPLO = 'L' or 'l', * the leading n by n lower triangular part of the array A * must contain the lower triangular part of the symmetric * matrix and the strictly upper triangular part of A is not * referenced. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When SIDE = 'L' or 'l' then * LDA must be at least max( 1, m ), otherwise LDA must be at * least max( 1, n ). * Unchanged on exit. * * B - COMPLEX*16 array of DIMENSION ( LDB, n ). * Before entry, the leading m by n part of the array B must * contain the matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. LDB must be at least * max( 1, m ). * Unchanged on exit. * * BETA - COMPLEX*16 . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then C need not be set on input. * Unchanged on exit. * * C - COMPLEX*16 array of DIMENSION ( LDC, n ). * Before entry, the leading m by n part of the array C must * contain the matrix C, except when beta is zero, in which * case C need not be set on entry. * On exit, the array C is overwritten by the m by n updated * matrix. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. Local Scalars .. LOGICAL UPPER INTEGER I, INFO, J, K, NROWA COMPLEX*16 TEMP1, TEMP2 * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Executable Statements .. * * Set NROWA as the number of rows of A. * IF( LSAME( SIDE, 'L' ) )THEN NROWA = M ELSE NROWA = N END IF UPPER = LSAME( UPLO, 'U' ) * * Test the input parameters. * INFO = 0 IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.UPPER ).AND. 
$ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN INFO = 2 ELSE IF( M .LT.0 )THEN INFO = 3 ELSE IF( N .LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 7 ELSE IF( LDB.LT.MAX( 1, M ) )THEN INFO = 9 ELSE IF( LDC.LT.MAX( 1, M ) )THEN INFO = 12 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZSYMM ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN IF( BETA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, M C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40, J = 1, N DO 30, I = 1, M C( I, J ) = BETA*C( I, J ) 30 CONTINUE 40 CONTINUE END IF RETURN END IF * * Start the operations. * IF( LSAME( SIDE, 'L' ) )THEN * * Form C := alpha*A*B + beta*C. * IF( UPPER )THEN DO 70, J = 1, N DO 60, I = 1, M TEMP1 = ALPHA*B( I, J ) TEMP2 = ZERO DO 50, K = 1, I - 1 C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) TEMP2 = TEMP2 + B( K, J )*A( K, I ) 50 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ TEMP1*A( I, I ) + ALPHA*TEMP2 END IF 60 CONTINUE 70 CONTINUE ELSE DO 100, J = 1, N DO 90, I = M, 1, -1 TEMP1 = ALPHA*B( I, J ) TEMP2 = ZERO DO 80, K = I + 1, M C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) TEMP2 = TEMP2 + B( K, J )*A( K, I ) 80 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ TEMP1*A( I, I ) + ALPHA*TEMP2 END IF 90 CONTINUE 100 CONTINUE END IF ELSE * * Form C := alpha*B*A + beta*C. * DO 170, J = 1, N TEMP1 = ALPHA*A( J, J ) IF( BETA.EQ.ZERO )THEN DO 110, I = 1, M C( I, J ) = TEMP1*B( I, J ) 110 CONTINUE ELSE DO 120, I = 1, M C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) 120 CONTINUE END IF DO 140, K = 1, J - 1 IF( UPPER )THEN TEMP1 = ALPHA*A( K, J ) ELSE TEMP1 = ALPHA*A( J, K ) END IF DO 130, I = 1, M C( I, J ) = C( I, J ) + TEMP1*B( I, K ) 130 CONTINUE 140 CONTINUE DO 160, K = J + 1, N IF( UPPER )THEN TEMP1 = ALPHA*A( J, K ) ELSE TEMP1 = ALPHA*A( K, J ) END IF DO 150, I = 1, M C( I, J ) = C( I, J ) + TEMP1*B( I, K ) 150 CONTINUE 160 CONTINUE 170 CONTINUE END IF * RETURN * * End of ZSYMM . * END OpenBLAS-0.2.20/reference/zsymmf.f000066400000000000000000000232511313527062700166010ustar00rootroot00000000000000 SUBROUTINE ZSYMMF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC ) * .. Scalar Arguments .. CHARACTER*1 SIDE, UPLO INTEGER M, N, LDA, LDB, LDC COMPLEX*16 ALPHA, BETA * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) * .. * * Purpose * ======= * * ZSYMM performs one of the matrix-matrix operations * * C := alpha*A*B + beta*C, * * or * * C := alpha*B*A + beta*C, * * where alpha and beta are scalars, A is a symmetric matrix and B and * C are m by n matrices. * * Parameters * ========== * * SIDE - CHARACTER*1. * On entry, SIDE specifies whether the symmetric matrix A * appears on the left or right in the operation as follows: * * SIDE = 'L' or 'l' C := alpha*A*B + beta*C, * * SIDE = 'R' or 'r' C := alpha*B*A + beta*C, * * Unchanged on exit. * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the symmetric matrix A is to be * referenced as follows: * * UPLO = 'U' or 'u' Only the upper triangular part of the * symmetric matrix is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of the * symmetric matrix is to be referenced. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of the matrix C. * M must be at least zero. 
* Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of the matrix C. * N must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX*16 . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is * m when SIDE = 'L' or 'l' and is n otherwise. * Before entry with SIDE = 'L' or 'l', the m by m part of * the array A must contain the symmetric matrix, such that * when UPLO = 'U' or 'u', the leading m by m upper triangular * part of the array A must contain the upper triangular part * of the symmetric matrix and the strictly lower triangular * part of A is not referenced, and when UPLO = 'L' or 'l', * the leading m by m lower triangular part of the array A * must contain the lower triangular part of the symmetric * matrix and the strictly upper triangular part of A is not * referenced. * Before entry with SIDE = 'R' or 'r', the n by n part of * the array A must contain the symmetric matrix, such that * when UPLO = 'U' or 'u', the leading n by n upper triangular * part of the array A must contain the upper triangular part * of the symmetric matrix and the strictly lower triangular * part of A is not referenced, and when UPLO = 'L' or 'l', * the leading n by n lower triangular part of the array A * must contain the lower triangular part of the symmetric * matrix and the strictly upper triangular part of A is not * referenced. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When SIDE = 'L' or 'l' then * LDA must be at least max( 1, m ), otherwise LDA must be at * least max( 1, n ). * Unchanged on exit. * * B - COMPLEX*16 array of DIMENSION ( LDB, n ). * Before entry, the leading m by n part of the array B must * contain the matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. LDB must be at least * max( 1, m ). * Unchanged on exit. * * BETA - COMPLEX*16 . * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then C need not be set on input. * Unchanged on exit. * * C - COMPLEX*16 array of DIMENSION ( LDC, n ). * Before entry, the leading m by n part of the array C must * contain the matrix C, except when beta is zero, in which * case C need not be set on entry. * On exit, the array C is overwritten by the m by n updated * matrix. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. Local Scalars .. LOGICAL UPPER INTEGER I, INFO, J, K, NROWA COMPLEX*16 TEMP1, TEMP2 * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Executable Statements .. * * Set NROWA as the number of rows of A. * IF( LSAME( SIDE, 'L' ) )THEN NROWA = M ELSE NROWA = N END IF UPPER = LSAME( UPLO, 'U' ) * * Test the input parameters. * INFO = 0 IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. 
$ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN INFO = 2 ELSE IF( M .LT.0 )THEN INFO = 3 ELSE IF( N .LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 7 ELSE IF( LDB.LT.MAX( 1, M ) )THEN INFO = 9 ELSE IF( LDC.LT.MAX( 1, M ) )THEN INFO = 12 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZSYMM ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN IF( BETA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, M C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40, J = 1, N DO 30, I = 1, M C( I, J ) = BETA*C( I, J ) 30 CONTINUE 40 CONTINUE END IF RETURN END IF * * Start the operations. * IF( LSAME( SIDE, 'L' ) )THEN * * Form C := alpha*A*B + beta*C. * IF( UPPER )THEN DO 70, J = 1, N DO 60, I = 1, M TEMP1 = ALPHA*B( I, J ) TEMP2 = ZERO DO 50, K = 1, I - 1 C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) TEMP2 = TEMP2 + B( K, J )*A( K, I ) 50 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ TEMP1*A( I, I ) + ALPHA*TEMP2 END IF 60 CONTINUE 70 CONTINUE ELSE DO 100, J = 1, N DO 90, I = M, 1, -1 TEMP1 = ALPHA*B( I, J ) TEMP2 = ZERO DO 80, K = I + 1, M C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) TEMP2 = TEMP2 + B( K, J )*A( K, I ) 80 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ TEMP1*A( I, I ) + ALPHA*TEMP2 END IF 90 CONTINUE 100 CONTINUE END IF ELSE * * Form C := alpha*B*A + beta*C. * DO 170, J = 1, N TEMP1 = ALPHA*A( J, J ) IF( BETA.EQ.ZERO )THEN DO 110, I = 1, M C( I, J ) = TEMP1*B( I, J ) 110 CONTINUE ELSE DO 120, I = 1, M C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) 120 CONTINUE END IF DO 140, K = 1, J - 1 IF( UPPER )THEN TEMP1 = ALPHA*A( K, J ) ELSE TEMP1 = ALPHA*A( J, K ) END IF DO 130, I = 1, M C( I, J ) = C( I, J ) + TEMP1*B( I, K ) 130 CONTINUE 140 CONTINUE DO 160, K = J + 1, N IF( UPPER )THEN TEMP1 = ALPHA*A( J, K ) ELSE TEMP1 = ALPHA*A( K, J ) END IF DO 150, I = 1, M C( I, J ) = C( I, J ) + TEMP1*B( I, K ) 150 CONTINUE 160 CONTINUE 170 CONTINUE END IF * RETURN * * End of ZSYMM . * END OpenBLAS-0.2.20/reference/zsymvf.f000066400000000000000000000175161313527062700166210ustar00rootroot00000000000000 SUBROUTINE ZSYMVF(UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY ) * * -- LAPACK auxiliary routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INCX, INCY, LDA, N COMPLEX*16 ALPHA, BETA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * ZSYMV performs the matrix-vector operation * * y := alpha*A*x + beta*y, * * where alpha and beta are scalars, x and y are n element vectors and * A is an n by n symmetric matrix. * * Arguments * ========== * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies whether the upper or lower * triangular part of the array A is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of A * is to be referenced. * * Unchanged on exit. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA (input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. 
* * A (input) COMPLEX*16 array, dimension ( LDA, N ) * Before entry, with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular part of the symmetric matrix and the strictly * lower triangular part of A is not referenced. * Before entry, with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular part of the symmetric matrix and the strictly * upper triangular part of A is not referenced. * Unchanged on exit. * * LDA (input) INTEGER * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, N ). * Unchanged on exit. * * X (input) COMPLEX*16 array, dimension at least * ( 1 + ( N - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the N- * element vector x. * Unchanged on exit. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * BETA (input) COMPLEX*16 * On entry, BETA specifies the scalar beta. When BETA is * supplied as zero then Y need not be set on input. * Unchanged on exit. * * Y (input/output) COMPLEX*16 array, dimension at least * ( 1 + ( N - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. On exit, Y is overwritten by the updated * vector y. * * INCY (input) INTEGER * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY COMPLEX*16 TEMP1, TEMP2 * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = 1 ELSE IF( N.LT.0 ) THEN INFO = 2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = 5 ELSE IF( INCX.EQ.0 ) THEN INFO = 7 ELSE IF( INCY.EQ.0 ) THEN INFO = 10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZSYMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ) .OR. ( ( ALPHA.EQ.ZERO ) .AND. ( BETA.EQ.ONE ) ) ) $ RETURN * * Set up the start points in X and Y. * IF( INCX.GT.0 ) THEN KX = 1 ELSE KX = 1 - ( N-1 )*INCX END IF IF( INCY.GT.0 ) THEN KY = 1 ELSE KY = 1 - ( N-1 )*INCY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * * First form y := beta*y. * IF( BETA.NE.ONE ) THEN IF( INCY.EQ.1 ) THEN IF( BETA.EQ.ZERO ) THEN DO 10 I = 1, N Y( I ) = ZERO 10 CONTINUE ELSE DO 20 I = 1, N Y( I ) = BETA*Y( I ) 20 CONTINUE END IF ELSE IY = KY IF( BETA.EQ.ZERO ) THEN DO 30 I = 1, N Y( IY ) = ZERO IY = IY + INCY 30 CONTINUE ELSE DO 40 I = 1, N Y( IY ) = BETA*Y( IY ) IY = IY + INCY 40 CONTINUE END IF END IF END IF IF( ALPHA.EQ.ZERO ) $ RETURN IF( LSAME( UPLO, 'U' ) ) THEN * * Form y when A is stored in upper triangle. * IF( ( INCX.EQ.1 ) .AND. 
( INCY.EQ.1 ) ) THEN DO 60 J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO DO 50 I = 1, J - 1 Y( I ) = Y( I ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( I ) 50 CONTINUE Y( J ) = Y( J ) + TEMP1*A( J, J ) + ALPHA*TEMP2 60 CONTINUE ELSE JX = KX JY = KY DO 80 J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO IX = KX IY = KY DO 70 I = 1, J - 1 Y( IY ) = Y( IY ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( IX ) IX = IX + INCX IY = IY + INCY 70 CONTINUE Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 80 CONTINUE END IF ELSE * * Form y when A is stored in lower triangle. * IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN DO 100 J = 1, N TEMP1 = ALPHA*X( J ) TEMP2 = ZERO Y( J ) = Y( J ) + TEMP1*A( J, J ) DO 90 I = J + 1, N Y( I ) = Y( I ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( I ) 90 CONTINUE Y( J ) = Y( J ) + ALPHA*TEMP2 100 CONTINUE ELSE JX = KX JY = KY DO 120 J = 1, N TEMP1 = ALPHA*X( JX ) TEMP2 = ZERO Y( JY ) = Y( JY ) + TEMP1*A( J, J ) IX = JX IY = JY DO 110 I = J + 1, N IX = IX + INCX IY = IY + INCY Y( IY ) = Y( IY ) + TEMP1*A( I, J ) TEMP2 = TEMP2 + A( I, J )*X( IX ) 110 CONTINUE Y( JY ) = Y( JY ) + ALPHA*TEMP2 JX = JX + INCX JY = JY + INCY 120 CONTINUE END IF END IF * RETURN * * End of ZSYMV * END OpenBLAS-0.2.20/reference/zsyr2f.f000066400000000000000000000162271313527062700165200ustar00rootroot00000000000000 SUBROUTINE ZSYR2F ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA ) * .. Scalar Arguments .. COMPLEX*16 ALPHA INTEGER INCX, INCY, LDA, N CHARACTER*1 UPLO * .. Array Arguments .. COMPLEX*16 A( LDA, * ), X( * ), Y( * ) * .. * * Purpose * ======= * * DSYR2 performs the symmetric rank 2 operation * * A := alpha*x*y' + alpha*y*x' + A, * * where alpha is a scalar, x and y are n element vectors and A is an n * by n symmetric matrix. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array A is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of A * is to be referenced. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA - DOUBLE PRECISION. * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. * Unchanged on exit. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * Y - DOUBLE PRECISION array of dimension at least * ( 1 + ( n - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. * Unchanged on exit. * * INCY - INTEGER. * On entry, INCY specifies the increment for the elements of * Y. INCY must not be zero. * Unchanged on exit. * * A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular part of the symmetric matrix and the strictly * lower triangular part of A is not referenced. On exit, the * upper triangular part of the array A is overwritten by the * upper triangular part of the updated matrix. 
* Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular part of the symmetric matrix and the strictly * upper triangular part of A is not referenced. On exit, the * lower triangular part of the array A is overwritten by the * lower triangular part of the updated matrix. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. Local Scalars .. COMPLEX*16 TEMP1, TEMP2 INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO, 'U' ).AND. $ .NOT.LSAME( UPLO, 'L' ) )THEN INFO = 1 ELSE IF( N.LT.0 )THEN INFO = 2 ELSE IF( INCX.EQ.0 )THEN INFO = 5 ELSE IF( INCY.EQ.0 )THEN INFO = 7 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'DSYR2 ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) $ RETURN * * Set up the start points in X and Y if the increments are not both * unity. * IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN IF( INCX.GT.0 )THEN KX = 1 ELSE KX = 1 - ( N - 1 )*INCX END IF IF( INCY.GT.0 )THEN KY = 1 ELSE KY = 1 - ( N - 1 )*INCY END IF JX = KX JY = KY END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * IF( LSAME( UPLO, 'U' ) )THEN * * Form A when A is stored in the upper triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 20, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( J ) TEMP2 = ALPHA*X( J ) DO 10, I = 1, J A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 10 CONTINUE END IF 20 CONTINUE ELSE DO 40, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( JY ) TEMP2 = ALPHA*X( JX ) IX = KX IY = KY DO 30, I = 1, J A( I, J ) = A( I, J ) + X( IX )*TEMP1 $ + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 30 CONTINUE END IF JX = JX + INCX JY = JY + INCY 40 CONTINUE END IF ELSE * * Form A when A is stored in the lower triangle. * IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN DO 60, J = 1, N IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( J ) TEMP2 = ALPHA*X( J ) DO 50, I = J, N A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 50 CONTINUE END IF 60 CONTINUE ELSE DO 80, J = 1, N IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN TEMP1 = ALPHA*Y( JY ) TEMP2 = ALPHA*X( JX ) IX = JX IY = JY DO 70, I = J, N A( I, J ) = A( I, J ) + X( IX )*TEMP1 $ + Y( IY )*TEMP2 IX = IX + INCX IY = IY + INCY 70 CONTINUE END IF JX = JX + INCX JY = JY + INCY 80 CONTINUE END IF END IF * RETURN * * End of DSYR2 . * END OpenBLAS-0.2.20/reference/zsyr2kf.f000066400000000000000000000252521313527062700166710ustar00rootroot00000000000000 SUBROUTINE ZSYR2KF( UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC ) * .. Scalar Arguments .. CHARACTER*1 UPLO, TRANS INTEGER N, K, LDA, LDB, LDC COMPLEX*16 ALPHA, BETA * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) * .. 
* * Purpose * ======= * * ZSYR2K performs one of the symmetric rank 2k operations * * C := alpha*A*B' + alpha*B*A' + beta*C, * * or * * C := alpha*A'*B + alpha*B'*A + beta*C, * * where alpha and beta are scalars, C is an n by n symmetric matrix * and A and B are n by k matrices in the first case and k by n * matrices in the second case. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array C is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of C * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of C * is to be referenced. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' C := alpha*A*B' + alpha*B*A' + * beta*C. * * TRANS = 'T' or 't' C := alpha*A'*B + alpha*B'*A + * beta*C. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix C. N must be * at least zero. * Unchanged on exit. * * K - INTEGER. * On entry with TRANS = 'N' or 'n', K specifies the number * of columns of the matrices A and B, and on entry with * TRANS = 'T' or 't', K specifies the number of rows of the * matrices A and B. K must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX*16 . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is * k when TRANS = 'N' or 'n', and is n otherwise. * Before entry with TRANS = 'N' or 'n', the leading n by k * part of the array A must contain the matrix A, otherwise * the leading k by n part of the array A must contain the * matrix A. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When TRANS = 'N' or 'n' * then LDA must be at least max( 1, n ), otherwise LDA must * be at least max( 1, k ). * Unchanged on exit. * * B - COMPLEX*16 array of DIMENSION ( LDB, kb ), where kb is * k when TRANS = 'N' or 'n', and is n otherwise. * Before entry with TRANS = 'N' or 'n', the leading n by k * part of the array B must contain the matrix B, otherwise * the leading k by n part of the array B must contain the * matrix B. * Unchanged on exit. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. When TRANS = 'N' or 'n' * then LDB must be at least max( 1, n ), otherwise LDB must * be at least max( 1, k ). * Unchanged on exit. * * BETA - COMPLEX*16 . * On entry, BETA specifies the scalar beta. * Unchanged on exit. * * C - COMPLEX*16 array of DIMENSION ( LDC, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array C must contain the upper * triangular part of the symmetric matrix and the strictly * lower triangular part of C is not referenced. On exit, the * upper triangular part of the array C is overwritten by the * upper triangular part of the updated matrix. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array C must contain the lower * triangular part of the symmetric matrix and the strictly * upper triangular part of C is not referenced. On exit, the * lower triangular part of the array C is overwritten by the * lower triangular part of the updated matrix. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, n ). * Unchanged on exit. 
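*
*  Example
*  =======
*
*  A minimal illustrative call, assuming column-major arrays
*  dimensioned A( LDA, K ), B( LDB, K ) and C( LDC, N ) with
*  LDA, LDB and LDC all at least max( 1, N ):
*
*     CALL ZSYR2KF( 'U', 'N', N, K, ALPHA, A, LDA, B, LDB,
*    $              BETA, C, LDC )
*
*  This overwrites the upper triangle of C with
*  alpha*A*B' + alpha*B*A' + beta*C; the strictly lower
*  triangle of C is not referenced.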
* * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. Local Scalars .. LOGICAL UPPER INTEGER I, INFO, J, L, NROWA COMPLEX*16 TEMP1, TEMP2 * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Executable Statements .. * * Test the input parameters. * IF( LSAME( TRANS, 'N' ) )THEN NROWA = N ELSE NROWA = K END IF UPPER = LSAME( UPLO, 'U' ) * INFO = 0 IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. $ ( .NOT.LSAME( TRANS, 'T' ) ) )THEN INFO = 2 ELSE IF( N .LT.0 )THEN INFO = 3 ELSE IF( K .LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 7 ELSE IF( LDB.LT.MAX( 1, NROWA ) )THEN INFO = 9 ELSE IF( LDC.LT.MAX( 1, N ) )THEN INFO = 12 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZSYR2K', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR. $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN IF( UPPER )THEN IF( BETA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, J C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40, J = 1, N DO 30, I = 1, J C( I, J ) = BETA*C( I, J ) 30 CONTINUE 40 CONTINUE END IF ELSE IF( BETA.EQ.ZERO )THEN DO 60, J = 1, N DO 50, I = J, N C( I, J ) = ZERO 50 CONTINUE 60 CONTINUE ELSE DO 80, J = 1, N DO 70, I = J, N C( I, J ) = BETA*C( I, J ) 70 CONTINUE 80 CONTINUE END IF END IF RETURN END IF * * Start the operations. * IF( LSAME( TRANS, 'N' ) )THEN * * Form C := alpha*A*B' + alpha*B*A' + C. * IF( UPPER )THEN DO 130, J = 1, N IF( BETA.EQ.ZERO )THEN DO 90, I = 1, J C( I, J ) = ZERO 90 CONTINUE ELSE IF( BETA.NE.ONE )THEN DO 100, I = 1, J C( I, J ) = BETA*C( I, J ) 100 CONTINUE END IF DO 120, L = 1, K IF( ( A( J, L ).NE.ZERO ).OR. $ ( B( J, L ).NE.ZERO ) )THEN TEMP1 = ALPHA*B( J, L ) TEMP2 = ALPHA*A( J, L ) DO 110, I = 1, J C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + $ B( I, L )*TEMP2 110 CONTINUE END IF 120 CONTINUE 130 CONTINUE ELSE DO 180, J = 1, N IF( BETA.EQ.ZERO )THEN DO 140, I = J, N C( I, J ) = ZERO 140 CONTINUE ELSE IF( BETA.NE.ONE )THEN DO 150, I = J, N C( I, J ) = BETA*C( I, J ) 150 CONTINUE END IF DO 170, L = 1, K IF( ( A( J, L ).NE.ZERO ).OR. $ ( B( J, L ).NE.ZERO ) )THEN TEMP1 = ALPHA*B( J, L ) TEMP2 = ALPHA*A( J, L ) DO 160, I = J, N C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + $ B( I, L )*TEMP2 160 CONTINUE END IF 170 CONTINUE 180 CONTINUE END IF ELSE * * Form C := alpha*A'*B + alpha*B'*A + C. * IF( UPPER )THEN DO 210, J = 1, N DO 200, I = 1, J TEMP1 = ZERO TEMP2 = ZERO DO 190, L = 1, K TEMP1 = TEMP1 + A( L, I )*B( L, J ) TEMP2 = TEMP2 + B( L, I )*A( L, J ) 190 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ ALPHA*TEMP1 + ALPHA*TEMP2 END IF 200 CONTINUE 210 CONTINUE ELSE DO 240, J = 1, N DO 230, I = J, N TEMP1 = ZERO TEMP2 = ZERO DO 220, L = 1, K TEMP1 = TEMP1 + A( L, I )*B( L, J ) TEMP2 = TEMP2 + B( L, I )*A( L, J ) 220 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 ELSE C( I, J ) = BETA *C( I, J ) + $ ALPHA*TEMP1 + ALPHA*TEMP2 END IF 230 CONTINUE 240 CONTINUE END IF END IF * RETURN * * End of ZSYR2K. 
* END OpenBLAS-0.2.20/reference/zsyrf.f000066400000000000000000000135451313527062700164360ustar00rootroot00000000000000 SUBROUTINE ZSYRF( UPLO, N, ALPHA, X, INCX, A, LDA ) * * -- LAPACK auxiliary routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. CHARACTER UPLO INTEGER INCX, LDA, N COMPLEX*16 ALPHA * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), X( * ) * .. * * Purpose * ======= * * ZSYR performs the symmetric rank 1 operation * * A := alpha*x*( x' ) + A, * * where alpha is a complex scalar, x is an n element vector and A is an * n by n symmetric matrix. * * Arguments * ========== * * UPLO (input) CHARACTER*1 * On entry, UPLO specifies whether the upper or lower * triangular part of the array A is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of A * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of A * is to be referenced. * * Unchanged on exit. * * N (input) INTEGER * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * ALPHA (input) COMPLEX*16 * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * X (input) COMPLEX*16 array, dimension at least * ( 1 + ( N - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the N- * element vector x. * Unchanged on exit. * * INCX (input) INTEGER * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * A (input/output) COMPLEX*16 array, dimension ( LDA, N ) * Before entry, with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular part of the symmetric matrix and the strictly * lower triangular part of A is not referenced. On exit, the * upper triangular part of the array A is overwritten by the * upper triangular part of the updated matrix. * Before entry, with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular part of the symmetric matrix and the strictly * upper triangular part of A is not referenced. On exit, the * lower triangular part of the array A is overwritten by the * lower triangular part of the updated matrix. * * LDA (input) INTEGER * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, N ). * Unchanged on exit. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, INFO, IX, J, JX, KX COMPLEX*16 TEMP * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = 1 ELSE IF( N.LT.0 ) THEN INFO = 2 ELSE IF( INCX.EQ.0 ) THEN INFO = 5 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = 7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZSYR ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ) .OR. ( ALPHA.EQ.ZERO ) ) $ RETURN * * Set the start point in X if the increment is not unity. * IF( INCX.LE.0 ) THEN KX = 1 - ( N-1 )*INCX ELSE IF( INCX.NE.1 ) THEN KX = 1 END IF * * Start the operations. 
In this version the elements of A are * accessed sequentially with one pass through the triangular part * of A. * IF( LSAME( UPLO, 'U' ) ) THEN * * Form A when A is stored in upper triangle. * IF( INCX.EQ.1 ) THEN DO 20 J = 1, N IF( X( J ).NE.ZERO ) THEN TEMP = ALPHA*X( J ) DO 10 I = 1, J A( I, J ) = A( I, J ) + X( I )*TEMP 10 CONTINUE END IF 20 CONTINUE ELSE JX = KX DO 40 J = 1, N IF( X( JX ).NE.ZERO ) THEN TEMP = ALPHA*X( JX ) IX = KX DO 30 I = 1, J A( I, J ) = A( I, J ) + X( IX )*TEMP IX = IX + INCX 30 CONTINUE END IF JX = JX + INCX 40 CONTINUE END IF ELSE * * Form A when A is stored in lower triangle. * IF( INCX.EQ.1 ) THEN DO 60 J = 1, N IF( X( J ).NE.ZERO ) THEN TEMP = ALPHA*X( J ) DO 50 I = J, N A( I, J ) = A( I, J ) + X( I )*TEMP 50 CONTINUE END IF 60 CONTINUE ELSE JX = KX DO 80 J = 1, N IF( X( JX ).NE.ZERO ) THEN TEMP = ALPHA*X( JX ) IX = JX DO 70 I = J, N A( I, J ) = A( I, J ) + X( IX )*TEMP IX = IX + INCX 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE END IF END IF * RETURN * * End of ZSYR * END OpenBLAS-0.2.20/reference/zsyrkf.f000066400000000000000000000221071313527062700166030ustar00rootroot00000000000000 SUBROUTINE ZSYRKF ( UPLO, TRANS, N, K, ALPHA, A, LDA, $ BETA, C, LDC ) * .. Scalar Arguments .. CHARACTER*1 UPLO, TRANS INTEGER N, K, LDA, LDC COMPLEX*16 ALPHA, BETA * .. Array Arguments .. COMPLEX*16 A( LDA, * ), C( LDC, * ) * .. * * Purpose * ======= * * ZSYRK performs one of the symmetric rank k operations * * C := alpha*A*A' + beta*C, * * or * * C := alpha*A'*A + beta*C, * * where alpha and beta are scalars, C is an n by n symmetric matrix * and A is an n by k matrix in the first case and a k by n matrix * in the second case. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the upper or lower * triangular part of the array C is to be referenced as * follows: * * UPLO = 'U' or 'u' Only the upper triangular part of C * is to be referenced. * * UPLO = 'L' or 'l' Only the lower triangular part of C * is to be referenced. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' C := alpha*A*A' + beta*C. * * TRANS = 'T' or 't' C := alpha*A'*A + beta*C. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix C. N must be * at least zero. * Unchanged on exit. * * K - INTEGER. * On entry with TRANS = 'N' or 'n', K specifies the number * of columns of the matrix A, and on entry with * TRANS = 'T' or 't', K specifies the number of rows of the * matrix A. K must be at least zero. * Unchanged on exit. * * ALPHA - COMPLEX*16 . * On entry, ALPHA specifies the scalar alpha. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is * k when TRANS = 'N' or 'n', and is n otherwise. * Before entry with TRANS = 'N' or 'n', the leading n by k * part of the array A must contain the matrix A, otherwise * the leading k by n part of the array A must contain the * matrix A. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When TRANS = 'N' or 'n' * then LDA must be at least max( 1, n ), otherwise LDA must * be at least max( 1, k ). * Unchanged on exit. * * BETA - COMPLEX*16 . * On entry, BETA specifies the scalar beta. * Unchanged on exit. * * C - COMPLEX*16 array of DIMENSION ( LDC, n ). 
* Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array C must contain the upper * triangular part of the symmetric matrix and the strictly * lower triangular part of C is not referenced. On exit, the * upper triangular part of the array C is overwritten by the * upper triangular part of the updated matrix. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array C must contain the lower * triangular part of the symmetric matrix and the strictly * upper triangular part of C is not referenced. On exit, the * lower triangular part of the array C is overwritten by the * lower triangular part of the updated matrix. * * LDC - INTEGER. * On entry, LDC specifies the first dimension of C as declared * in the calling (sub) program. LDC must be at least * max( 1, n ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC MAX * .. Local Scalars .. LOGICAL UPPER INTEGER I, INFO, J, L, NROWA COMPLEX*16 TEMP * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Executable Statements .. * * Test the input parameters. * IF( LSAME( TRANS, 'N' ) )THEN NROWA = N ELSE NROWA = K END IF UPPER = LSAME( UPLO, 'U' ) * INFO = 0 IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. $ ( .NOT.LSAME( TRANS, 'T' ) ) )THEN INFO = 2 ELSE IF( N .LT.0 )THEN INFO = 3 ELSE IF( K .LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 7 ELSE IF( LDC.LT.MAX( 1, N ) )THEN INFO = 10 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZSYRK ', INFO ) RETURN END IF * * Quick return if possible. * IF( ( N.EQ.0 ).OR. $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN IF( UPPER )THEN IF( BETA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, J C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE ELSE DO 40, J = 1, N DO 30, I = 1, J C( I, J ) = BETA*C( I, J ) 30 CONTINUE 40 CONTINUE END IF ELSE IF( BETA.EQ.ZERO )THEN DO 60, J = 1, N DO 50, I = J, N C( I, J ) = ZERO 50 CONTINUE 60 CONTINUE ELSE DO 80, J = 1, N DO 70, I = J, N C( I, J ) = BETA*C( I, J ) 70 CONTINUE 80 CONTINUE END IF END IF RETURN END IF * * Start the operations. * IF( LSAME( TRANS, 'N' ) )THEN * * Form C := alpha*A*A' + beta*C. * IF( UPPER )THEN DO 130, J = 1, N IF( BETA.EQ.ZERO )THEN DO 90, I = 1, J C( I, J ) = ZERO 90 CONTINUE ELSE IF( BETA.NE.ONE )THEN DO 100, I = 1, J C( I, J ) = BETA*C( I, J ) 100 CONTINUE END IF DO 120, L = 1, K IF( A( J, L ).NE.ZERO )THEN TEMP = ALPHA*A( J, L ) DO 110, I = 1, J C( I, J ) = C( I, J ) + TEMP*A( I, L ) 110 CONTINUE END IF 120 CONTINUE 130 CONTINUE ELSE DO 180, J = 1, N IF( BETA.EQ.ZERO )THEN DO 140, I = J, N C( I, J ) = ZERO 140 CONTINUE ELSE IF( BETA.NE.ONE )THEN DO 150, I = J, N C( I, J ) = BETA*C( I, J ) 150 CONTINUE END IF DO 170, L = 1, K IF( A( J, L ).NE.ZERO )THEN TEMP = ALPHA*A( J, L ) DO 160, I = J, N C( I, J ) = C( I, J ) + TEMP*A( I, L ) 160 CONTINUE END IF 170 CONTINUE 180 CONTINUE END IF ELSE * * Form C := alpha*A'*A + beta*C. 
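*
*        Note: in this branch each referenced entry of C is formed
*        from a dot product of two columns of A,
*
*           C( I, J ) = ALPHA*SUM( A( L, I )*A( L, J ), L = 1, K )
*    $                + BETA*C( I, J ),
*
*        which the loops below accumulate in TEMP.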
* IF( UPPER )THEN DO 210, J = 1, N DO 200, I = 1, J TEMP = ZERO DO 190, L = 1, K TEMP = TEMP + A( L, I )*A( L, J ) 190 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = ALPHA*TEMP ELSE C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) END IF 200 CONTINUE 210 CONTINUE ELSE DO 240, J = 1, N DO 230, I = J, N TEMP = ZERO DO 220, L = 1, K TEMP = TEMP + A( L, I )*A( L, J ) 220 CONTINUE IF( BETA.EQ.ZERO )THEN C( I, J ) = ALPHA*TEMP ELSE C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) END IF 230 CONTINUE 240 CONTINUE END IF END IF * RETURN * * End of ZSYRK . * END OpenBLAS-0.2.20/reference/ztbmvf.f000066400000000000000000000314001313527062700165570ustar00rootroot00000000000000 SUBROUTINE ZTBMVF( UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, K, LDA, N CHARACTER*1 DIAG, TRANS, UPLO * .. Array Arguments .. COMPLEX*16 A( LDA, * ), X( * ) * .. * * Purpose * ======= * * ZTBMV performs one of the matrix-vector operations * * x := A*x, or x := A'*x, or x := conjg( A' )*x, * * where x is an n element vector and A is an n by n unit, or non-unit, * upper or lower triangular band matrix, with ( k + 1 ) diagonals. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' x := A*x. * * TRANS = 'T' or 't' x := A'*x. * * TRANS = 'C' or 'c' x := conjg( A' )*x. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * K - INTEGER. * On entry with UPLO = 'U' or 'u', K specifies the number of * super-diagonals of the matrix A. * On entry with UPLO = 'L' or 'l', K specifies the number of * sub-diagonals of the matrix A. * K must satisfy 0 .le. K. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) * by n part of the array A must contain the upper triangular * band part of the matrix of coefficients, supplied column by * column, with the leading diagonal of the matrix in row * ( k + 1 ) of the array, the first super-diagonal starting at * position 2 in row k, and so on. The top left k by k triangle * of the array A is not referenced. * The following program segment will transfer an upper * triangular band matrix from conventional full matrix storage * to band storage: * * DO 20, J = 1, N * M = K + 1 - J * DO 10, I = MAX( 1, J - K ), J * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) * by n part of the array A must contain the lower triangular * band part of the matrix of coefficients, supplied column by * column, with the leading diagonal of the matrix in row 1 of * the array, the first sub-diagonal starting at position 1 in * row 2, and so on. The bottom right k by k triangle of the * array A is not referenced. 
* The following program segment will transfer a lower * triangular band matrix from conventional full matrix storage * to band storage: * * DO 20, J = 1, N * M = 1 - J * DO 10, I = J, MIN( N, J + K ) * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Note that when DIAG = 'U' or 'u' the elements of the array A * corresponding to the diagonal elements of the matrix are not * referenced, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * ( k + 1 ). * Unchanged on exit. * * X - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the * tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. Local Scalars .. COMPLEX*16 TEMP INTEGER I, INFO, IX, J, JX, KPLUS1, KX, L LOGICAL NOCONJ, NOUNIT * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC DCONJG, MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'R' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( K.LT.0 )THEN INFO = 5 ELSE IF( LDA.LT.( K + 1 ) )THEN INFO = 7 ELSE IF( INCX.EQ.0 )THEN INFO = 9 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZTBMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) NOUNIT = LSAME( DIAG , 'N' ) * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * IF( LSAME( TRANS, 'N' ).OR.LSAME( TRANS, 'R' ) )THEN * * Form x := A*x. 
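*
*        Note: with UPLO = 'U' the loops below set L = KPLUS1 - J, so
*        that A( L + I, J ) addresses the matrix element ( I, J ) of
*        the band, in agreement with the storage scheme described
*        above ( A( M + I, J ) = matrix( I, J ) with M = K + 1 - J ).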
* IF( LSAME( UPLO, 'U' ) )THEN KPLUS1 = K + 1 IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = X( J ) L = KPLUS1 - J DO 10, I = MAX( 1, J - K ), J - 1 X( I ) = X( I ) + TEMP*A( L + I, J ) 10 CONTINUE IF( NOUNIT ) $ X( J ) = X( J )*A( KPLUS1, J ) END IF 20 CONTINUE ELSE JX = KX DO 40, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX L = KPLUS1 - J DO 30, I = MAX( 1, J - K ), J - 1 X( IX ) = X( IX ) + TEMP*A( L + I, J ) IX = IX + INCX 30 CONTINUE IF( NOUNIT ) $ X( JX ) = X( JX )*A( KPLUS1, J ) END IF JX = JX + INCX IF( J.GT.K ) $ KX = KX + INCX 40 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 60, J = N, 1, -1 IF( X( J ).NE.ZERO )THEN TEMP = X( J ) L = 1 - J DO 50, I = MIN( N, J + K ), J + 1, -1 X( I ) = X( I ) + TEMP*A( L + I, J ) 50 CONTINUE IF( NOUNIT ) $ X( J ) = X( J )*A( 1, J ) END IF 60 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 80, J = N, 1, -1 IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX L = 1 - J DO 70, I = MIN( N, J + K ), J + 1, -1 X( IX ) = X( IX ) + TEMP*A( L + I, J ) IX = IX - INCX 70 CONTINUE IF( NOUNIT ) $ X( JX ) = X( JX )*A( 1, J ) END IF JX = JX - INCX IF( ( N - J ).GE.K ) $ KX = KX - INCX 80 CONTINUE END IF END IF ELSE * * Form x := A'*x or x := conjg( A' )*x. * IF( LSAME( UPLO, 'U' ) )THEN KPLUS1 = K + 1 IF( INCX.EQ.1 )THEN DO 110, J = N, 1, -1 TEMP = X( J ) L = KPLUS1 - J IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*A( KPLUS1, J ) DO 90, I = J - 1, MAX( 1, J - K ), -1 TEMP = TEMP + A( L + I, J )*X( I ) 90 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*DCONJG( A( KPLUS1, J ) ) DO 100, I = J - 1, MAX( 1, J - K ), -1 TEMP = TEMP + DCONJG( A( L + I, J ) )*X( I ) 100 CONTINUE END IF X( J ) = TEMP 110 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 140, J = N, 1, -1 TEMP = X( JX ) KX = KX - INCX IX = KX L = KPLUS1 - J IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*A( KPLUS1, J ) DO 120, I = J - 1, MAX( 1, J - K ), -1 TEMP = TEMP + A( L + I, J )*X( IX ) IX = IX - INCX 120 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*DCONJG( A( KPLUS1, J ) ) DO 130, I = J - 1, MAX( 1, J - K ), -1 TEMP = TEMP + DCONJG( A( L + I, J ) )*X( IX ) IX = IX - INCX 130 CONTINUE END IF X( JX ) = TEMP JX = JX - INCX 140 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 170, J = 1, N TEMP = X( J ) L = 1 - J IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*A( 1, J ) DO 150, I = J + 1, MIN( N, J + K ) TEMP = TEMP + A( L + I, J )*X( I ) 150 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*DCONJG( A( 1, J ) ) DO 160, I = J + 1, MIN( N, J + K ) TEMP = TEMP + DCONJG( A( L + I, J ) )*X( I ) 160 CONTINUE END IF X( J ) = TEMP 170 CONTINUE ELSE JX = KX DO 200, J = 1, N TEMP = X( JX ) KX = KX + INCX IX = KX L = 1 - J IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*A( 1, J ) DO 180, I = J + 1, MIN( N, J + K ) TEMP = TEMP + A( L + I, J )*X( IX ) IX = IX + INCX 180 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*DCONJG( A( 1, J ) ) DO 190, I = J + 1, MIN( N, J + K ) TEMP = TEMP + DCONJG( A( L + I, J ) )*X( IX ) IX = IX + INCX 190 CONTINUE END IF X( JX ) = TEMP JX = JX + INCX 200 CONTINUE END IF END IF END IF * RETURN * * End of ZTBMV . * END OpenBLAS-0.2.20/reference/ztbsvf.f000066400000000000000000000306561313527062700166010ustar00rootroot00000000000000 SUBROUTINE ZTBSVF(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) * .. Scalar Arguments .. INTEGER INCX,K,LDA,N CHARACTER DIAG,TRANS,UPLO * .. * .. Array Arguments .. DOUBLE COMPLEX A(LDA,*),X(*) * .. 
* * Purpose * ======= * * ZTBSV solves one of the systems of equations * * A*x = b, or A'*x = b, or conjg( A' )*x = b, * * where b and x are n element vectors and A is an n by n unit, or * non-unit, upper or lower triangular band matrix, with ( k + 1 ) * diagonals. * * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Arguments * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the equations to be solved as * follows: * * TRANS = 'N' or 'n' A*x = b. * * TRANS = 'T' or 't' A'*x = b. * * TRANS = 'C' or 'c' conjg( A' )*x = b. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * K - INTEGER. * On entry with UPLO = 'U' or 'u', K specifies the number of * super-diagonals of the matrix A. * On entry with UPLO = 'L' or 'l', K specifies the number of * sub-diagonals of the matrix A. * K must satisfy 0 .le. K. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) * by n part of the array A must contain the upper triangular * band part of the matrix of coefficients, supplied column by * column, with the leading diagonal of the matrix in row * ( k + 1 ) of the array, the first super-diagonal starting at * position 2 in row k, and so on. The top left k by k triangle * of the array A is not referenced. * The following program segment will transfer an upper * triangular band matrix from conventional full matrix storage * to band storage: * * DO 20, J = 1, N * M = K + 1 - J * DO 10, I = MAX( 1, J - K ), J * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) * by n part of the array A must contain the lower triangular * band part of the matrix of coefficients, supplied column by * column, with the leading diagonal of the matrix in row 1 of * the array, the first sub-diagonal starting at position 1 in * row 2, and so on. The bottom right k by k triangle of the * array A is not referenced. * The following program segment will transfer a lower * triangular band matrix from conventional full matrix storage * to band storage: * * DO 20, J = 1, N * M = 1 - J * DO 10, I = J, MIN( N, J + K ) * A( M + I, J ) = matrix( I, J ) * 10 CONTINUE * 20 CONTINUE * * Note that when DIAG = 'U' or 'u' the elements of the array A * corresponding to the diagonal elements of the matrix are not * referenced, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * ( k + 1 ). * Unchanged on exit. * * X - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element right-hand side vector b. On exit, X is overwritten * with the solution vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. DOUBLE COMPLEX ZERO PARAMETER (ZERO= (0.0D+0,0.0D+0)) * .. * .. Local Scalars .. DOUBLE COMPLEX TEMP INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L LOGICAL NOCONJ,NOUNIT * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA * .. * .. Intrinsic Functions .. INTRINSIC DCONJG,MAX,MIN * .. * * Test the input parameters. * INFO = 0 IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN INFO = 1 ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + .NOT.LSAME(TRANS,'C')) THEN INFO = 2 ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN INFO = 3 ELSE IF (N.LT.0) THEN INFO = 4 ELSE IF (K.LT.0) THEN INFO = 5 ELSE IF (LDA.LT. (K+1)) THEN INFO = 7 ELSE IF (INCX.EQ.0) THEN INFO = 9 END IF IF (INFO.NE.0) THEN CALL XERBLA('ZTBSV ',INFO) RETURN END IF * * Quick return if possible. * IF (N.EQ.0) RETURN * NOCONJ = LSAME(TRANS,'T') NOUNIT = LSAME(DIAG,'N') * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF (INCX.LE.0) THEN KX = 1 - (N-1)*INCX ELSE IF (INCX.NE.1) THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed by sequentially with one pass through A. * IF (LSAME(TRANS,'N')) THEN * * Form x := inv( A )*x. * IF (LSAME(UPLO,'U')) THEN KPLUS1 = K + 1 IF (INCX.EQ.1) THEN DO 20 J = N,1,-1 IF (X(J).NE.ZERO) THEN L = KPLUS1 - J IF (NOUNIT) X(J) = X(J)/A(KPLUS1,J) TEMP = X(J) DO 10 I = J - 1,MAX(1,J-K),-1 X(I) = X(I) - TEMP*A(L+I,J) 10 CONTINUE END IF 20 CONTINUE ELSE KX = KX + (N-1)*INCX JX = KX DO 40 J = N,1,-1 KX = KX - INCX IF (X(JX).NE.ZERO) THEN IX = KX L = KPLUS1 - J IF (NOUNIT) X(JX) = X(JX)/A(KPLUS1,J) TEMP = X(JX) DO 30 I = J - 1,MAX(1,J-K),-1 X(IX) = X(IX) - TEMP*A(L+I,J) IX = IX - INCX 30 CONTINUE END IF JX = JX - INCX 40 CONTINUE END IF ELSE IF (INCX.EQ.1) THEN DO 60 J = 1,N IF (X(J).NE.ZERO) THEN L = 1 - J IF (NOUNIT) X(J) = X(J)/A(1,J) TEMP = X(J) DO 50 I = J + 1,MIN(N,J+K) X(I) = X(I) - TEMP*A(L+I,J) 50 CONTINUE END IF 60 CONTINUE ELSE JX = KX DO 80 J = 1,N KX = KX + INCX IF (X(JX).NE.ZERO) THEN IX = KX L = 1 - J IF (NOUNIT) X(JX) = X(JX)/A(1,J) TEMP = X(JX) DO 70 I = J + 1,MIN(N,J+K) X(IX) = X(IX) - TEMP*A(L+I,J) IX = IX + INCX 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE END IF END IF ELSE * * Form x := inv( A' )*x or x := inv( conjg( A') )*x. 
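*
*        Note: for UPLO = 'U' this is a forward substitution,
*        computing X( J ) for J = 1, ..., N as
*
*           X( J ) = ( X( J ) - SUM( A( L + I, J )*X( I ),
*    $                I = MAX( 1, J - K ), J - 1 ) ) / A( KPLUS1, J ),
*
*        with conjugated entries of A when TRANS = 'C' and with the
*        final division omitted when DIAG = 'U'.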
* IF (LSAME(UPLO,'U')) THEN KPLUS1 = K + 1 IF (INCX.EQ.1) THEN DO 110 J = 1,N TEMP = X(J) L = KPLUS1 - J IF (NOCONJ) THEN DO 90 I = MAX(1,J-K),J - 1 TEMP = TEMP - A(L+I,J)*X(I) 90 CONTINUE IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) ELSE DO 100 I = MAX(1,J-K),J - 1 TEMP = TEMP - DCONJG(A(L+I,J))*X(I) 100 CONTINUE IF (NOUNIT) TEMP = TEMP/DCONJG(A(KPLUS1,J)) END IF X(J) = TEMP 110 CONTINUE ELSE JX = KX DO 140 J = 1,N TEMP = X(JX) IX = KX L = KPLUS1 - J IF (NOCONJ) THEN DO 120 I = MAX(1,J-K),J - 1 TEMP = TEMP - A(L+I,J)*X(IX) IX = IX + INCX 120 CONTINUE IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) ELSE DO 130 I = MAX(1,J-K),J - 1 TEMP = TEMP - DCONJG(A(L+I,J))*X(IX) IX = IX + INCX 130 CONTINUE IF (NOUNIT) TEMP = TEMP/DCONJG(A(KPLUS1,J)) END IF X(JX) = TEMP JX = JX + INCX IF (J.GT.K) KX = KX + INCX 140 CONTINUE END IF ELSE IF (INCX.EQ.1) THEN DO 170 J = N,1,-1 TEMP = X(J) L = 1 - J IF (NOCONJ) THEN DO 150 I = MIN(N,J+K),J + 1,-1 TEMP = TEMP - A(L+I,J)*X(I) 150 CONTINUE IF (NOUNIT) TEMP = TEMP/A(1,J) ELSE DO 160 I = MIN(N,J+K),J + 1,-1 TEMP = TEMP - DCONJG(A(L+I,J))*X(I) 160 CONTINUE IF (NOUNIT) TEMP = TEMP/DCONJG(A(1,J)) END IF X(J) = TEMP 170 CONTINUE ELSE KX = KX + (N-1)*INCX JX = KX DO 200 J = N,1,-1 TEMP = X(JX) IX = KX L = 1 - J IF (NOCONJ) THEN DO 180 I = MIN(N,J+K),J + 1,-1 TEMP = TEMP - A(L+I,J)*X(IX) IX = IX - INCX 180 CONTINUE IF (NOUNIT) TEMP = TEMP/A(1,J) ELSE DO 190 I = MIN(N,J+K),J + 1,-1 TEMP = TEMP - DCONJG(A(L+I,J))*X(IX) IX = IX - INCX 190 CONTINUE IF (NOUNIT) TEMP = TEMP/DCONJG(A(1,J)) END IF X(JX) = TEMP JX = JX - INCX IF ((N-J).GE.K) KX = KX - INCX 200 CONTINUE END IF END IF END IF * RETURN * * End of ZTBSV . * END OpenBLAS-0.2.20/reference/ztpmvf.f000066400000000000000000000306711313527062700166060ustar00rootroot00000000000000 SUBROUTINE ZTPMVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, N CHARACTER*1 DIAG, TRANS, UPLO * .. Array Arguments .. COMPLEX*16 AP( * ), X( * ) * .. * * Purpose * ======= * * ZTPMV performs one of the matrix-vector operations * * x := A*x, or x := A'*x, or x := conjg( A' )*x, * * where x is an n element vector and A is an n by n unit, or non-unit, * upper or lower triangular matrix, supplied in packed form. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' x := A*x. * * TRANS = 'T' or 't' x := A'*x. * * TRANS = 'C' or 'c' x := conjg( A' )*x. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * AP - COMPLEX*16 array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). * Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular matrix packed sequentially, * column by column, so that AP( 1 ) contains a( 1, 1 ), * AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) * respectively, and so on. 
* Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular matrix packed sequentially, * column by column, so that AP( 1 ) contains a( 1, 1 ), * AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) * respectively, and so on. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced, but are assumed to be unity. * Unchanged on exit. * * X - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the * tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. Local Scalars .. COMPLEX*16 TEMP INTEGER I, INFO, IX, J, JX, K, KK, KX LOGICAL NOCONJ, NOUNIT * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC DCONJG * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'R' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( INCX.EQ.0 )THEN INFO = 7 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZTPMVF', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) NOUNIT = LSAME( DIAG , 'N' ) * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of AP are * accessed sequentially with one pass through AP. * IF( LSAME( TRANS, 'N' ).OR.LSAME( TRANS, 'R' ))THEN * * Form x:= A*x. 
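*
*     Illustrative note (an added sketch, not part of the original
*     comments): with UPLO = 'U' the packed column J occupies
*     AP( KK ), ..., AP( KK + J - 1 ), the diagonal element a(j,j)
*     is AP( KK + J - 1 ), and KK advances by J after each column.
*     For N = 3 the packed order is therefore
*
*        AP = ( a(1,1), a(1,2), a(2,2), a(1,3), a(2,3), a(3,3) )
*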
* IF( LSAME( UPLO, 'U' ) )THEN KK = 1 IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = X( J ) K = KK DO 10, I = 1, J - 1 IF( NOCONJ )THEN X( I ) = X( I ) + TEMP*AP( K ) ELSE X( I ) = X( I ) + TEMP*DCONJG(AP( K )) END IF K = K + 1 10 CONTINUE IF( NOCONJ )THEN IF( NOUNIT ) $ X( J ) = X( J )*AP( KK + J - 1 ) ELSE IF( NOUNIT ) $ X( J ) = X( J )*DCONJG(AP( KK + J-1)) END IF END IF KK = KK + J 20 CONTINUE ELSE JX = KX DO 40, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX DO 30, K = KK, KK + J - 2 IF( NOCONJ )THEN X( IX ) = X( IX ) + TEMP*AP( K ) ELSE X( IX ) = X( IX ) + TEMP*DCONJG(AP(K)) END IF IX = IX + INCX 30 CONTINUE IF( NOCONJ )THEN IF( NOUNIT ) $ X( JX ) = X( JX )*AP( KK + J - 1 ) ELSE IF( NOUNIT ) $ X( JX ) = X( JX )*DCONJG(AP( KK + J-1)) END IF END IF JX = JX + INCX KK = KK + J 40 CONTINUE END IF ELSE KK = ( N*( N + 1 ) )/2 IF( INCX.EQ.1 )THEN DO 60, J = N, 1, -1 IF( X( J ).NE.ZERO )THEN TEMP = X( J ) K = KK DO 50, I = N, J + 1, -1 IF( NOCONJ )THEN X( I ) = X( I ) + TEMP*AP( K ) ELSE X( I ) = X( I ) + TEMP*DCONJG(AP( K )) END IF K = K - 1 50 CONTINUE IF( NOCONJ )THEN IF( NOUNIT ) $ X( J ) = X( J )*AP( KK - N + J ) ELSE IF( NOUNIT ) $ X( J ) = X( J )*DCONJG(AP(KK - N+J)) END IF END IF KK = KK - ( N - J + 1 ) 60 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 80, J = N, 1, -1 IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX DO 70, K = KK, KK - ( N - ( J + 1 ) ), -1 IF( NOCONJ )THEN X( IX ) = X( IX ) + TEMP*AP( K ) ELSE X( IX ) = X( IX ) + TEMP*DCONJG(AP(K)) ENDIF IX = IX - INCX 70 CONTINUE IF( NOCONJ )THEN IF( NOUNIT ) $ X( JX ) = X( JX )*AP( KK - N + J ) ELSE IF( NOUNIT ) $ X( JX ) = X( JX )*DCONJG(AP(KK-N+J)) ENDIF END IF JX = JX - INCX KK = KK - ( N - J + 1 ) 80 CONTINUE END IF END IF ELSE * * Form x := A'*x or x := conjg( A' )*x. * IF( LSAME( UPLO, 'U' ) )THEN KK = ( N*( N + 1 ) )/2 IF( INCX.EQ.1 )THEN DO 110, J = N, 1, -1 TEMP = X( J ) K = KK - 1 IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*AP( KK ) DO 90, I = J - 1, 1, -1 TEMP = TEMP + AP( K )*X( I ) K = K - 1 90 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*DCONJG( AP( KK ) ) DO 100, I = J - 1, 1, -1 TEMP = TEMP + DCONJG( AP( K ) )*X( I ) K = K - 1 100 CONTINUE END IF X( J ) = TEMP KK = KK - J 110 CONTINUE ELSE JX = KX + ( N - 1 )*INCX DO 140, J = N, 1, -1 TEMP = X( JX ) IX = JX IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*AP( KK ) DO 120, K = KK - 1, KK - J + 1, -1 IX = IX - INCX TEMP = TEMP + AP( K )*X( IX ) 120 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*DCONJG( AP( KK ) ) DO 130, K = KK - 1, KK - J + 1, -1 IX = IX - INCX TEMP = TEMP + DCONJG( AP( K ) )*X( IX ) 130 CONTINUE END IF X( JX ) = TEMP JX = JX - INCX KK = KK - J 140 CONTINUE END IF ELSE KK = 1 IF( INCX.EQ.1 )THEN DO 170, J = 1, N TEMP = X( J ) K = KK + 1 IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*AP( KK ) DO 150, I = J + 1, N TEMP = TEMP + AP( K )*X( I ) K = K + 1 150 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*DCONJG( AP( KK ) ) DO 160, I = J + 1, N TEMP = TEMP + DCONJG( AP( K ) )*X( I ) K = K + 1 160 CONTINUE END IF X( J ) = TEMP KK = KK + ( N - J + 1 ) 170 CONTINUE ELSE JX = KX DO 200, J = 1, N TEMP = X( JX ) IX = JX IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*AP( KK ) DO 180, K = KK + 1, KK + N - J IX = IX + INCX TEMP = TEMP + AP( K )*X( IX ) 180 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*DCONJG( AP( KK ) ) DO 190, K = KK + 1, KK + N - J IX = IX + INCX TEMP = TEMP + DCONJG( AP( K ) )*X( IX ) 190 CONTINUE END IF X( JX ) = TEMP JX = JX + INCX KK = KK + ( N - J + 1 ) 200 CONTINUE END IF END IF END IF * RETURN * * End of ZTPMV . 
* END OpenBLAS-0.2.20/reference/ztpsvf.f000066400000000000000000000311011313527062700166010ustar00rootroot00000000000000 SUBROUTINE ZTPSVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, N CHARACTER*1 DIAG, TRANS, UPLO * .. Array Arguments .. COMPLEX*16 AP( * ), X( * ) * .. * * Purpose * ======= * * ZTPSV solves one of the systems of equations * * A*x = b, or A'*x = b, or conjg( A' )*x = b, * * where b and x are n element vectors and A is an n by n unit, or * non-unit, upper or lower triangular matrix, supplied in packed form. * * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the equations to be solved as * follows: * * TRANS = 'N' or 'n' A*x = b. * * TRANS = 'T' or 't' A'*x = b. * * TRANS = 'C' or 'c' conjg( A' )*x = b. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * AP - COMPLEX*16 array of DIMENSION at least * ( ( n*( n + 1 ) )/2 ). * Before entry with UPLO = 'U' or 'u', the array AP must * contain the upper triangular matrix packed sequentially, * column by column, so that AP( 1 ) contains a( 1, 1 ), * AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) * respectively, and so on. * Before entry with UPLO = 'L' or 'l', the array AP must * contain the lower triangular matrix packed sequentially, * column by column, so that AP( 1 ) contains a( 1, 1 ), * AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) * respectively, and so on. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced, but are assumed to be unity. * Unchanged on exit. * * X - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element right-hand side vector b. On exit, X is overwritten * with the solution vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. Local Scalars .. COMPLEX*16 TEMP INTEGER I, INFO, IX, J, JX, K, KK, KX LOGICAL NOCONJ, NOUNIT * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC DCONJG * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'R' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. 
$ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( INCX.EQ.0 )THEN INFO = 7 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZTPSV ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) NOUNIT = LSAME( DIAG , 'N' ) * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of AP are * accessed sequentially with one pass through AP. * IF( LSAME( TRANS, 'N' ) .OR.LSAME( TRANS, 'R' ))THEN * * Form x := inv( A )*x. * IF( LSAME( UPLO, 'U' ) )THEN KK = ( N*( N + 1 ) )/2 IF( INCX.EQ.1 )THEN DO 20, J = N, 1, -1 IF( X( J ).NE.ZERO )THEN IF( NOCONJ )THEN IF( NOUNIT ) $ X( J ) = X( J )/AP( KK ) ELSE IF( NOUNIT ) $ X( J ) = X( J )/DCONJG(AP( KK )) END IF TEMP = X( J ) K = KK - 1 DO 10, I = J - 1, 1, -1 IF( NOCONJ )THEN X( I ) = X( I ) - TEMP*AP( K ) ELSE X( I ) = X( I ) - TEMP*DCONJG(AP( K )) END IF K = K - 1 10 CONTINUE END IF KK = KK - J 20 CONTINUE ELSE JX = KX + ( N - 1 )*INCX DO 40, J = N, 1, -1 IF( X( JX ).NE.ZERO )THEN IF( NOCONJ )THEN IF( NOUNIT ) $ X( JX ) = X( JX )/AP( KK ) ELSE IF( NOUNIT ) $ X( JX ) = X( JX )/DCONJG(AP( KK )) END IF TEMP = X( JX ) IX = JX DO 30, K = KK - 1, KK - J + 1, -1 IX = IX - INCX IF( NOCONJ )THEN X( IX ) = X( IX ) - TEMP*AP( K ) ELSE X( IX ) = X( IX ) - TEMP*DCONJG(AP( K )) END IF 30 CONTINUE END IF JX = JX - INCX KK = KK - J 40 CONTINUE END IF ELSE KK = 1 IF( INCX.EQ.1 )THEN DO 60, J = 1, N IF( X( J ).NE.ZERO )THEN IF( NOCONJ )THEN IF( NOUNIT ) $ X( J ) = X( J )/AP( KK ) ELSE IF( NOUNIT ) $ X( J ) = X( J )/DCONJG(AP( KK )) END IF TEMP = X( J ) K = KK + 1 DO 50, I = J + 1, N IF( NOCONJ )THEN X( I ) = X( I ) - TEMP*AP( K ) ELSE X( I ) = X( I ) - TEMP*DCONJG(AP( K )) END IF K = K + 1 50 CONTINUE END IF KK = KK + ( N - J + 1 ) 60 CONTINUE ELSE JX = KX DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN IF( NOCONJ )THEN IF( NOUNIT ) $ X( JX ) = X( JX )/AP( KK ) ELSE IF( NOUNIT ) $ X( JX ) = X( JX )/DCONJG(AP( KK )) END IF TEMP = X( JX ) IX = JX DO 70, K = KK + 1, KK + N - J IX = IX + INCX IF( NOCONJ )THEN X( IX ) = X( IX ) - TEMP*AP( K ) ELSE X( IX ) = X( IX ) - TEMP*DCONJG(AP( K )) END IF 70 CONTINUE END IF JX = JX + INCX KK = KK + ( N - J + 1 ) 80 CONTINUE END IF END IF ELSE * * Form x := inv( A' )*x or x := inv( conjg( A' ) )*x. 
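*
*     Illustrative note (an added sketch, not part of the original
*     comments): for UPLO = 'U' the packed column J, AP( KK ), ...,
*     AP( KK + J - 1 ), supplies the coefficients of a forward
*     substitution,
*
*        x(j) = ( x(j) - sum of a(i,j)*x(i), i = 1, ..., j-1 ) / a(j,j)
*
*     (conjugated when TRANS = 'C'), so the whole solve is a single
*     sequential pass through AP.
*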
* IF( LSAME( UPLO, 'U' ) )THEN KK = 1 IF( INCX.EQ.1 )THEN DO 110, J = 1, N TEMP = X( J ) K = KK IF( NOCONJ )THEN DO 90, I = 1, J - 1 TEMP = TEMP - AP( K )*X( I ) K = K + 1 90 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/AP( KK + J - 1 ) ELSE DO 100, I = 1, J - 1 TEMP = TEMP - DCONJG( AP( K ) )*X( I ) K = K + 1 100 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/DCONJG( AP( KK + J - 1 ) ) END IF X( J ) = TEMP KK = KK + J 110 CONTINUE ELSE JX = KX DO 140, J = 1, N TEMP = X( JX ) IX = KX IF( NOCONJ )THEN DO 120, K = KK, KK + J - 2 TEMP = TEMP - AP( K )*X( IX ) IX = IX + INCX 120 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/AP( KK + J - 1 ) ELSE DO 130, K = KK, KK + J - 2 TEMP = TEMP - DCONJG( AP( K ) )*X( IX ) IX = IX + INCX 130 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/DCONJG( AP( KK + J - 1 ) ) END IF X( JX ) = TEMP JX = JX + INCX KK = KK + J 140 CONTINUE END IF ELSE KK = ( N*( N + 1 ) )/2 IF( INCX.EQ.1 )THEN DO 170, J = N, 1, -1 TEMP = X( J ) K = KK IF( NOCONJ )THEN DO 150, I = N, J + 1, -1 TEMP = TEMP - AP( K )*X( I ) K = K - 1 150 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/AP( KK - N + J ) ELSE DO 160, I = N, J + 1, -1 TEMP = TEMP - DCONJG( AP( K ) )*X( I ) K = K - 1 160 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/DCONJG( AP( KK - N + J ) ) END IF X( J ) = TEMP KK = KK - ( N - J + 1 ) 170 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 200, J = N, 1, -1 TEMP = X( JX ) IX = KX IF( NOCONJ )THEN DO 180, K = KK, KK - ( N - ( J + 1 ) ), -1 TEMP = TEMP - AP( K )*X( IX ) IX = IX - INCX 180 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/AP( KK - N + J ) ELSE DO 190, K = KK, KK - ( N - ( J + 1 ) ), -1 TEMP = TEMP - DCONJG( AP( K ) )*X( IX ) IX = IX - INCX 190 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/DCONJG( AP( KK - N + J ) ) END IF X( JX ) = TEMP JX = JX - INCX KK = KK - ( N - J + 1 ) 200 CONTINUE END IF END IF END IF * RETURN * * End of ZTPSV . * END OpenBLAS-0.2.20/reference/ztrmmf.f000066400000000000000000000343731313527062700166020ustar00rootroot00000000000000 SUBROUTINE ZTRMMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, $ B, LDB ) * .. Scalar Arguments .. CHARACTER*1 SIDE, UPLO, TRANSA, DIAG INTEGER M, N, LDA, LDB COMPLEX*16 ALPHA * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * ZTRMM performs one of the matrix-matrix operations * * B := alpha*op( A )*B, or B := alpha*B*op( A ) * * where alpha is a scalar, B is an m by n matrix, A is a unit, or * non-unit, upper or lower triangular matrix and op( A ) is one of * * op( A ) = A or op( A ) = A' or op( A ) = conjg( A' ). * * Parameters * ========== * * SIDE - CHARACTER*1. * On entry, SIDE specifies whether op( A ) multiplies B from * the left or right as follows: * * SIDE = 'L' or 'l' B := alpha*op( A )*B. * * SIDE = 'R' or 'r' B := alpha*B*op( A ). * * Unchanged on exit. * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix A is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANSA - CHARACTER*1. * On entry, TRANSA specifies the form of op( A ) to be used in * the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( A ) = A. * * TRANSA = 'T' or 't' op( A ) = A'. * * TRANSA = 'C' or 'c' op( A ) = conjg( A' ). * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit triangular * as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * M - INTEGER. 
* On entry, M specifies the number of rows of B. M must be at * least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of B. N must be * at least zero. * Unchanged on exit. * * ALPHA - COMPLEX*16 . * On entry, ALPHA specifies the scalar alpha. When alpha is * zero then A is not referenced and B need not be set before * entry. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, k ), where k is m * when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. * Before entry with UPLO = 'U' or 'u', the leading k by k * upper triangular part of the array A must contain the upper * triangular matrix and the strictly lower triangular part of * A is not referenced. * Before entry with UPLO = 'L' or 'l', the leading k by k * lower triangular part of the array A must contain the lower * triangular matrix and the strictly upper triangular part of * A is not referenced. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced either, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When SIDE = 'L' or 'l' then * LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' * then LDA must be at least max( 1, n ). * Unchanged on exit. * * B - COMPLEX*16 array of DIMENSION ( LDB, n ). * Before entry, the leading m by n part of the array B must * contain the matrix B, and on exit is overwritten by the * transformed matrix. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. LDB must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC DCONJG, MAX * .. Local Scalars .. LOGICAL LSIDE, NOCONJ, NOUNIT, UPPER INTEGER I, INFO, J, K, NROWA COMPLEX*16 TEMP * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Executable Statements .. * * Test the input parameters. * LSIDE = LSAME( SIDE , 'L' ) IF( LSIDE )THEN NROWA = M ELSE NROWA = N END IF NOCONJ = LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'T' ) NOUNIT = LSAME( DIAG , 'N' ) UPPER = LSAME( UPLO , 'U' ) * INFO = 0 IF( ( .NOT.LSIDE ).AND. $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN INFO = 2 ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. $ ( .NOT.LSAME( TRANSA, 'R' ) ).AND. $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN INFO = 3 ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN INFO = 4 ELSE IF( M .LT.0 )THEN INFO = 5 ELSE IF( N .LT.0 )THEN INFO = 6 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 9 ELSE IF( LDB.LT.MAX( 1, M ) )THEN INFO = 11 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZTRMM ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, M B( I, J ) = ZERO 10 CONTINUE 20 CONTINUE RETURN END IF * * Start the operations. * IF( LSIDE )THEN IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ))THEN * * Form B := alpha*A*B. 
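*
*     Illustrative note (an added sketch, not part of the original
*     comments): each column of B is overwritten in place.  Because
*     A is triangular, the term alpha*B( K, J ) contributes only to
*     rows 1, ..., K of the product when UPLO = 'U', so visiting
*     K = 1, ..., M (and K = M, ..., 1 in the lower case) needs no
*     extra workspace.
*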
* IF( UPPER )THEN DO 50, J = 1, N DO 40, K = 1, M IF( B( K, J ).NE.ZERO )THEN TEMP = ALPHA*B( K, J ) IF (NOCONJ) THEN DO 30, I = 1, K - 1 B( I, J ) = B( I, J ) + TEMP*A( I, K ) 30 CONTINUE IF( NOUNIT ) $ TEMP = TEMP*A( K, K ) B( K, J ) = TEMP ELSE DO 35, I = 1, K - 1 B( I, J ) = B( I, J ) + TEMP*DCONJG(A( I, K )) 35 CONTINUE IF( NOUNIT ) $ TEMP = TEMP*DCONJG(A( K, K )) B( K, J ) = TEMP ENDIF END IF 40 CONTINUE 50 CONTINUE ELSE DO 80, J = 1, N DO 70 K = M, 1, -1 IF( B( K, J ).NE.ZERO )THEN TEMP = ALPHA*B( K, J ) B( K, J ) = TEMP IF (NOCONJ) THEN IF( NOUNIT ) $ B( K, J ) = B( K, J )*A( K, K ) DO 60, I = K + 1, M B( I, J ) = B( I, J ) + TEMP*A( I, K ) 60 CONTINUE ELSE IF( NOUNIT ) $ B( K, J ) = B( K, J )*DCONJG(A( K, K )) DO 65, I = K + 1, M B( I, J ) = B( I, J ) + TEMP*DCONJG(A( I, K )) 65 CONTINUE ENDIF END IF 70 CONTINUE 80 CONTINUE END IF ELSE * * Form B := alpha*A'*B or B := alpha*conjg( A' )*B. * IF( UPPER )THEN DO 120, J = 1, N DO 110, I = M, 1, -1 TEMP = B( I, J ) IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*A( I, I ) DO 90, K = 1, I - 1 TEMP = TEMP + A( K, I )*B( K, J ) 90 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*DCONJG( A( I, I ) ) DO 100, K = 1, I - 1 TEMP = TEMP + DCONJG( A( K, I ) )*B( K, J ) 100 CONTINUE END IF B( I, J ) = ALPHA*TEMP 110 CONTINUE 120 CONTINUE ELSE DO 160, J = 1, N DO 150, I = 1, M TEMP = B( I, J ) IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*A( I, I ) DO 130, K = I + 1, M TEMP = TEMP + A( K, I )*B( K, J ) 130 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*DCONJG( A( I, I ) ) DO 140, K = I + 1, M TEMP = TEMP + DCONJG( A( K, I ) )*B( K, J ) 140 CONTINUE END IF B( I, J ) = ALPHA*TEMP 150 CONTINUE 160 CONTINUE END IF END IF ELSE IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ))THEN * * Form B := alpha*B*A. * IF( UPPER )THEN DO 200, J = N, 1, -1 TEMP = ALPHA IF (NOCONJ) THEN IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) ELSE IF( NOUNIT ) $ TEMP = TEMP*DCONJG(A( J, J )) ENDIF DO 170, I = 1, M B( I, J ) = TEMP*B( I, J ) 170 CONTINUE DO 190, K = 1, J - 1 IF( A( K, J ).NE.ZERO )THEN IF (NOCONJ) THEN TEMP = ALPHA*A( K, J ) ELSE TEMP = ALPHA*DCONJG(A( K, J )) ENDIF DO 180, I = 1, M B( I, J ) = B( I, J ) + TEMP*B( I, K ) 180 CONTINUE END IF 190 CONTINUE 200 CONTINUE ELSE DO 240, J = 1, N TEMP = ALPHA IF (NOCONJ) THEN IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) ELSE IF( NOUNIT ) $ TEMP = TEMP*DCONJG(A( J, J )) ENDIF DO 210, I = 1, M B( I, J ) = TEMP*B( I, J ) 210 CONTINUE DO 230, K = J + 1, N IF( A( K, J ).NE.ZERO )THEN IF (NOCONJ) THEN TEMP = ALPHA*A( K, J ) ELSE TEMP = ALPHA*DCONJG(A( K, J )) ENDIF DO 220, I = 1, M B( I, J ) = B( I, J ) + TEMP*B( I, K ) 220 CONTINUE END IF 230 CONTINUE 240 CONTINUE END IF ELSE * * Form B := alpha*B*A' or B := alpha*B*conjg( A' ). 
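*
*     Illustrative note (an added sketch, not part of the original
*     comments): for UPLO = 'U' the loop visits K = 1, ..., N, adds
*     alpha*a( j, k )*B( :, K ) (conjugated when requested) into the
*     columns J < K, and only then scales column K itself, so every
*     column of B is consumed while it still holds its input values.
*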
* IF( UPPER )THEN DO 280, K = 1, N DO 260, J = 1, K - 1 IF( A( J, K ).NE.ZERO )THEN IF( NOCONJ )THEN TEMP = ALPHA*A( J, K ) ELSE TEMP = ALPHA*DCONJG( A( J, K ) ) END IF DO 250, I = 1, M B( I, J ) = B( I, J ) + TEMP*B( I, K ) 250 CONTINUE END IF 260 CONTINUE TEMP = ALPHA IF( NOUNIT )THEN IF( NOCONJ )THEN TEMP = TEMP*A( K, K ) ELSE TEMP = TEMP*DCONJG( A( K, K ) ) END IF END IF IF( TEMP.NE.ONE )THEN DO 270, I = 1, M B( I, K ) = TEMP*B( I, K ) 270 CONTINUE END IF 280 CONTINUE ELSE DO 320, K = N, 1, -1 DO 300, J = K + 1, N IF( A( J, K ).NE.ZERO )THEN IF( NOCONJ )THEN TEMP = ALPHA*A( J, K ) ELSE TEMP = ALPHA*DCONJG( A( J, K ) ) END IF DO 290, I = 1, M B( I, J ) = B( I, J ) + TEMP*B( I, K ) 290 CONTINUE END IF 300 CONTINUE TEMP = ALPHA IF( NOUNIT )THEN IF( NOCONJ )THEN TEMP = TEMP*A( K, K ) ELSE TEMP = TEMP*DCONJG( A( K, K ) ) END IF END IF IF( TEMP.NE.ONE )THEN DO 310, I = 1, M B( I, K ) = TEMP*B( I, K ) 310 CONTINUE END IF 320 CONTINUE END IF END IF END IF * RETURN * * End of ZTRMM . * END OpenBLAS-0.2.20/reference/ztrmvf.f000066400000000000000000000271501313527062700166060ustar00rootroot00000000000000 SUBROUTINE ZTRMVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, LDA, N CHARACTER*1 DIAG, TRANS, UPLO * .. Array Arguments .. COMPLEX*16 A( LDA, * ), X( * ) * .. * * Purpose * ======= * * ZTRMV performs one of the matrix-vector operations * * x := A*x, or x := A'*x, or x := conjg( A' )*x, * * where x is an n element vector and A is an n by n unit, or non-unit, * upper or lower triangular matrix. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the operation to be performed as * follows: * * TRANS = 'N' or 'n' x := A*x. * * TRANS = 'T' or 't' x := A'*x. * * TRANS = 'C' or 'c' x := conjg( A' )*x. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular matrix and the strictly lower triangular part of * A is not referenced. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular matrix and the strictly upper triangular part of * A is not referenced. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced either, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * X - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the * tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. 
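*
*  Example (an illustrative addition, not part of the original
*  documentation; the array names and sizes are assumed for the
*  sketch):  to multiply a vector of length 4 by a non-unit upper
*  triangular matrix stored in a 4 by 4 array, with unit stride in X:
*
*     COMPLEX*16 A( 4, 4 ), X( 4 )
*     ...
*     CALL ZTRMVF( 'U', 'N', 'N', 4, A, 4, X, 1 )
*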
* * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. Local Scalars .. COMPLEX*16 TEMP INTEGER I, INFO, IX, J, JX, KX LOGICAL NOCONJ, NOUNIT * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC DCONJG, MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'R' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 6 ELSE IF( INCX.EQ.0 )THEN INFO = 8 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZTRMV ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) NOUNIT = LSAME( DIAG , 'N' ) * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * IF( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ))THEN * * Form x := A*x. * IF( LSAME( UPLO, 'U' ) )THEN IF( INCX.EQ.1 )THEN DO 20, J = 1, N IF( X( J ).NE.ZERO )THEN TEMP = X( J ) DO 10, I = 1, J - 1 IF (NOCONJ) THEN X( I ) = X( I ) + TEMP*A( I, J ) ELSE X( I ) = X( I ) + TEMP*DCONJG(A( I, J )) ENDIF 10 CONTINUE IF (NOCONJ) THEN IF( NOUNIT ) $ X( J ) = X( J )*A( J, J ) ELSE IF( NOUNIT ) $ X( J ) = X( J )*DCONJG(A( J, J )) ENDIF END IF 20 CONTINUE ELSE JX = KX DO 40, J = 1, N IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX DO 30, I = 1, J - 1 IF (NOCONJ) THEN X( IX ) = X( IX ) + TEMP*A( I, J ) ELSE X( IX ) = X( IX ) + TEMP*DCONJG(A( I, J )) ENDIF IX = IX + INCX 30 CONTINUE IF (NOCONJ) THEN IF( NOUNIT ) $ X( JX ) = X( JX )*A( J, J ) ELSE IF( NOUNIT ) $ X( JX ) = X( JX )*DCONJG(A( J, J )) ENDIF END IF JX = JX + INCX 40 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 60, J = N, 1, -1 IF( X( J ).NE.ZERO )THEN TEMP = X( J ) DO 50, I = N, J + 1, -1 IF (NOCONJ) THEN X( I ) = X( I ) + TEMP*A( I, J ) ELSE X( I ) = X( I ) + TEMP*DCONJG(A( I, J )) ENDIF 50 CONTINUE IF (NOCONJ) THEN IF( NOUNIT ) $ X( J ) = X( J )*A( J, J ) ELSE IF( NOUNIT ) $ X( J ) = X( J )*DCONJG(A( J, J )) ENDIF END IF 60 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 80, J = N, 1, -1 IF( X( JX ).NE.ZERO )THEN TEMP = X( JX ) IX = KX DO 70, I = N, J + 1, -1 IF (NOCONJ) THEN X( IX ) = X( IX ) + TEMP*A( I, J ) ELSE X( IX ) = X( IX ) + TEMP*DCONJG(A( I, J )) ENDIF IX = IX - INCX 70 CONTINUE IF (NOCONJ) THEN IF( NOUNIT ) $ X( JX ) = X( JX )*A( J, J ) ELSE IF( NOUNIT ) $ X( JX ) = X( JX )*DCONJG(A( J, J )) ENDIF END IF JX = JX - INCX 80 CONTINUE END IF END IF ELSE * * Form x := A'*x or x := conjg( A' )*x. 
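*
*     Illustrative note (an added sketch, not part of the original
*     comments): each x(j) is replaced by the dot product of column J
*     of A (conjugated when requested) with x.  For UPLO = 'U' the
*     columns are processed for J = N, N-1, ..., 1, so every dot
*     product still reads the not-yet-overwritten entries
*     x(1), ..., x(j-1).
*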
* IF( LSAME( UPLO, 'U' ) )THEN IF( INCX.EQ.1 )THEN DO 110, J = N, 1, -1 TEMP = X( J ) IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) DO 90, I = J - 1, 1, -1 TEMP = TEMP + A( I, J )*X( I ) 90 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*DCONJG( A( J, J ) ) DO 100, I = J - 1, 1, -1 TEMP = TEMP + DCONJG( A( I, J ) )*X( I ) 100 CONTINUE END IF X( J ) = TEMP 110 CONTINUE ELSE JX = KX + ( N - 1 )*INCX DO 140, J = N, 1, -1 TEMP = X( JX ) IX = JX IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) DO 120, I = J - 1, 1, -1 IX = IX - INCX TEMP = TEMP + A( I, J )*X( IX ) 120 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*DCONJG( A( J, J ) ) DO 130, I = J - 1, 1, -1 IX = IX - INCX TEMP = TEMP + DCONJG( A( I, J ) )*X( IX ) 130 CONTINUE END IF X( JX ) = TEMP JX = JX - INCX 140 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 170, J = 1, N TEMP = X( J ) IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) DO 150, I = J + 1, N TEMP = TEMP + A( I, J )*X( I ) 150 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*DCONJG( A( J, J ) ) DO 160, I = J + 1, N TEMP = TEMP + DCONJG( A( I, J ) )*X( I ) 160 CONTINUE END IF X( J ) = TEMP 170 CONTINUE ELSE JX = KX DO 200, J = 1, N TEMP = X( JX ) IX = JX IF( NOCONJ )THEN IF( NOUNIT ) $ TEMP = TEMP*A( J, J ) DO 180, I = J + 1, N IX = IX + INCX TEMP = TEMP + A( I, J )*X( IX ) 180 CONTINUE ELSE IF( NOUNIT ) $ TEMP = TEMP*DCONJG( A( J, J ) ) DO 190, I = J + 1, N IX = IX + INCX TEMP = TEMP + DCONJG( A( I, J ) )*X( IX ) 190 CONTINUE END IF X( JX ) = TEMP JX = JX + INCX 200 CONTINUE END IF END IF END IF * RETURN * * End of ZTRMV . * END OpenBLAS-0.2.20/reference/ztrsmf.f000066400000000000000000000364771313527062700166170ustar00rootroot00000000000000 SUBROUTINE ZTRSMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, $ B, LDB ) * .. Scalar Arguments .. IMPLICIT NONE CHARACTER*1 SIDE, UPLO, TRANSA, DIAG INTEGER M, N, LDA, LDB COMPLEX*16 ALPHA * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ) * .. * * Purpose * ======= * * ZTRSM solves one of the matrix equations * * op( A )*X = alpha*B, or X*op( A ) = alpha*B, * * where alpha is a scalar, X and B are m by n matrices, A is a unit, or * non-unit, upper or lower triangular matrix and op( A ) is one of * * op( A ) = A or op( A ) = A' or op( A ) = conjg( A' ). * * The matrix X is overwritten on B. * * Parameters * ========== * * SIDE - CHARACTER*1. * On entry, SIDE specifies whether op( A ) appears on the left * or right of X as follows: * * SIDE = 'L' or 'l' op( A )*X = alpha*B. * * SIDE = 'R' or 'r' X*op( A ) = alpha*B. * * Unchanged on exit. * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix A is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANSA - CHARACTER*1. * On entry, TRANSA specifies the form of op( A ) to be used in * the matrix multiplication as follows: * * TRANSA = 'N' or 'n' op( A ) = A. * * TRANSA = 'T' or 't' op( A ) = A'. * * TRANSA = 'C' or 'c' op( A ) = conjg( A' ). * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit triangular * as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * M - INTEGER. * On entry, M specifies the number of rows of B. M must be at * least zero. * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the number of columns of B. N must be * at least zero. * Unchanged on exit. 
* * ALPHA - COMPLEX*16 . * On entry, ALPHA specifies the scalar alpha. When alpha is * zero then A is not referenced and B need not be set before * entry. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, k ), where k is m * when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. * Before entry with UPLO = 'U' or 'u', the leading k by k * upper triangular part of the array A must contain the upper * triangular matrix and the strictly lower triangular part of * A is not referenced. * Before entry with UPLO = 'L' or 'l', the leading k by k * lower triangular part of the array A must contain the lower * triangular matrix and the strictly upper triangular part of * A is not referenced. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced either, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. When SIDE = 'L' or 'l' then * LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' * then LDA must be at least max( 1, n ). * Unchanged on exit. * * B - COMPLEX*16 array of DIMENSION ( LDB, n ). * Before entry, the leading m by n part of the array B must * contain the right-hand side matrix B, and on exit is * overwritten by the solution matrix X. * * LDB - INTEGER. * On entry, LDB specifies the first dimension of B as declared * in the calling (sub) program. LDB must be at least * max( 1, m ). * Unchanged on exit. * * * Level 3 Blas routine. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC DCONJG, MAX * .. Local Scalars .. LOGICAL LSIDE, NOCONJ, NOUNIT, UPPER INTEGER I, INFO, J, K, NROWA COMPLEX*16 TEMP * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Executable Statements .. * * Test the input parameters. * LSIDE = LSAME( SIDE , 'L' ) IF( LSIDE )THEN NROWA = M ELSE NROWA = N END IF NOCONJ = (LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'T' )) NOUNIT = LSAME( DIAG , 'N' ) UPPER = LSAME( UPLO , 'U' ) * INFO = 0 IF( ( .NOT.LSIDE ).AND. $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN INFO = 1 ELSE IF( ( .NOT.UPPER ).AND. $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN INFO = 2 ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. $ ( .NOT.LSAME( TRANSA, 'R' ) ).AND. $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN INFO = 3 ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN INFO = 4 ELSE IF( M .LT.0 )THEN INFO = 5 ELSE IF( N .LT.0 )THEN INFO = 6 ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN INFO = 9 ELSE IF( LDB.LT.MAX( 1, M ) )THEN INFO = 11 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZTRSM ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * * And when alpha.eq.zero. * IF( ALPHA.EQ.ZERO )THEN DO 20, J = 1, N DO 10, I = 1, M B( I, J ) = ZERO 10 CONTINUE 20 CONTINUE RETURN END IF * * Start the operations. * IF( LSIDE )THEN IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ) )THEN * * Form B := alpha*inv( A )*B. 
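*
*     Illustrative note (an added sketch, not part of the original
*     comments): every column of B is treated as an independent
*     right-hand side.  It is first scaled by alpha and then, for
*     UPLO = 'U', solved by back substitution with K = M, M-1, ..., 1,
*     each solved component being eliminated from the rows above it.
*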
* IF( UPPER )THEN DO 60, J = 1, N IF( ALPHA.NE.ONE )THEN DO 30, I = 1, M B( I, J ) = ALPHA*B( I, J ) 30 CONTINUE END IF DO 50, K = M, 1, -1 IF( B( K, J ).NE.ZERO )THEN IF( NOUNIT ) THEN IF (NOCONJ) THEN B( K, J ) = B( K, J )/A( K, K ) ELSE B( K, J ) = B( K, J )/DCONJG(A( K, K )) ENDIF ENDIF IF (NOCONJ) THEN DO 40, I = 1, K - 1 B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) 40 CONTINUE ELSE DO 45, I = 1, K - 1 B( I, J ) = B( I, J ) - B( K, J )*DCONJG(A( I, K )) 45 CONTINUE ENDIF END IF 50 CONTINUE 60 CONTINUE ELSE DO 100, J = 1, N IF( ALPHA.NE.ONE )THEN DO 70, I = 1, M B( I, J ) = ALPHA*B( I, J ) 70 CONTINUE END IF DO 90 K = 1, M IF (NOCONJ) THEN IF( B( K, J ).NE.ZERO )THEN IF( NOUNIT ) $ B( K, J ) = B( K, J )/A( K, K ) DO 80, I = K + 1, M B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) 80 CONTINUE END IF ELSE IF( B( K, J ).NE.ZERO )THEN IF( NOUNIT ) $ B( K, J ) = B( K, J )/DCONJG(A( K, K )) DO 85, I = K + 1, M B( I, J ) = B( I, J ) - B( K, J )*DCONJG(A( I, K )) 85 CONTINUE END IF ENDIF 90 CONTINUE 100 CONTINUE END IF ELSE * * Form B := alpha*inv( A' )*B * or B := alpha*inv( conjg( A' ) )*B. * IF( UPPER )THEN DO 140, J = 1, N DO 130, I = 1, M TEMP = ALPHA*B( I, J ) IF( NOCONJ )THEN DO 110, K = 1, I - 1 TEMP = TEMP - A( K, I )*B( K, J ) 110 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( I, I ) ELSE DO 120, K = 1, I - 1 TEMP = TEMP - DCONJG( A( K, I ) )*B( K, J ) 120 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/DCONJG( A( I, I ) ) END IF B( I, J ) = TEMP 130 CONTINUE 140 CONTINUE ELSE DO 180, J = 1, N DO 170, I = M, 1, -1 TEMP = ALPHA*B( I, J ) IF( NOCONJ )THEN DO 150, K = I + 1, M TEMP = TEMP - A( K, I )*B( K, J ) 150 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( I, I ) ELSE DO 160, K = I + 1, M TEMP = TEMP - DCONJG( A( K, I ) )*B( K, J ) 160 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/DCONJG( A( I, I ) ) END IF B( I, J ) = TEMP 170 CONTINUE 180 CONTINUE END IF END IF ELSE IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ) )THEN * * Form B := alpha*B*inv( A ). * IF( UPPER )THEN DO 230, J = 1, N IF( ALPHA.NE.ONE )THEN DO 190, I = 1, M B( I, J ) = ALPHA*B( I, J ) 190 CONTINUE END IF DO 210, K = 1, J - 1 IF( A( K, J ).NE.ZERO )THEN IF (NOCONJ) THEN DO 200, I = 1, M B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) 200 CONTINUE ELSE DO 205, I = 1, M B( I, J ) = B( I, J ) - DCONJG(A( K, J ))*B( I, K ) 205 CONTINUE ENDIF END IF 210 CONTINUE IF( NOUNIT )THEN IF (NOCONJ) THEN TEMP = ONE/A( J, J ) ELSE TEMP = ONE/DCONJG(A( J, J )) ENDIF DO 220, I = 1, M B( I, J ) = TEMP*B( I, J ) 220 CONTINUE END IF 230 CONTINUE ELSE DO 280, J = N, 1, -1 IF( ALPHA.NE.ONE )THEN DO 240, I = 1, M B( I, J ) = ALPHA*B( I, J ) 240 CONTINUE END IF DO 260, K = J + 1, N IF( A( K, J ).NE.ZERO )THEN IF (NOCONJ) THEN DO 250, I = 1, M B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) 250 CONTINUE ELSE DO 255, I = 1, M B( I, J ) = B( I, J ) - DCONJG(A( K, J ))*B( I, K ) 255 CONTINUE ENDIF END IF 260 CONTINUE IF( NOUNIT )THEN IF (NOCONJ) THEN TEMP = ONE/A( J, J ) ELSE TEMP = ONE/DCONJG(A( J, J )) ENDIF DO 270, I = 1, M B( I, J ) = TEMP*B( I, J ) 270 CONTINUE END IF 280 CONTINUE END IF ELSE * * Form B := alpha*B*inv( A' ) * or B := alpha*B*inv( conjg( A' ) ). 
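*
*     Illustrative note (an added sketch, not part of the original
*     comments): for UPLO = 'U' the columns of B are processed for
*     K = N, N-1, ..., 1; column K is divided by a( k, k )
*     (conjugated when requested), the multiples a( j, k )*B( :, K )
*     are subtracted from the columns J < K, and the column is
*     finally scaled by alpha.
*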
* IF( UPPER )THEN DO 330, K = N, 1, -1 IF( NOUNIT )THEN IF( NOCONJ )THEN TEMP = ONE/A( K, K ) ELSE TEMP = ONE/DCONJG( A( K, K ) ) END IF DO 290, I = 1, M B( I, K ) = TEMP*B( I, K ) 290 CONTINUE END IF DO 310, J = 1, K - 1 IF( A( J, K ).NE.ZERO )THEN IF( NOCONJ )THEN TEMP = A( J, K ) ELSE TEMP = DCONJG( A( J, K ) ) END IF DO 300, I = 1, M B( I, J ) = B( I, J ) - TEMP*B( I, K ) 300 CONTINUE END IF 310 CONTINUE IF( ALPHA.NE.ONE )THEN DO 320, I = 1, M B( I, K ) = ALPHA*B( I, K ) 320 CONTINUE END IF 330 CONTINUE ELSE DO 380, K = 1, N IF( NOUNIT )THEN IF( NOCONJ )THEN TEMP = ONE/A( K, K ) ELSE TEMP = ONE/DCONJG( A( K, K ) ) END IF DO 340, I = 1, M B( I, K ) = TEMP*B( I, K ) 340 CONTINUE END IF DO 360, J = K + 1, N IF( A( J, K ).NE.ZERO )THEN IF( NOCONJ )THEN TEMP = A( J, K ) ELSE TEMP = DCONJG( A( J, K ) ) END IF DO 350, I = 1, M B( I, J ) = B( I, J ) - TEMP*B( I, K ) 350 CONTINUE END IF 360 CONTINUE IF( ALPHA.NE.ONE )THEN DO 370, I = 1, M B( I, K ) = ALPHA*B( I, K ) 370 CONTINUE END IF 380 CONTINUE END IF END IF END IF * RETURN * * End of ZTRSM . * END OpenBLAS-0.2.20/reference/ztrsvf.f000066400000000000000000000274401313527062700166160ustar00rootroot00000000000000 SUBROUTINE ZTRSVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) * .. Scalar Arguments .. INTEGER INCX, LDA, N CHARACTER*1 DIAG, TRANS, UPLO * .. Array Arguments .. COMPLEX*16 A( LDA, * ), X( * ) * .. * * Purpose * ======= * * ZTRSV solves one of the systems of equations * * A*x = b, or A'*x = b, or conjg( A' )*x = b, * * where b and x are n element vectors and A is an n by n unit, or * non-unit, upper or lower triangular matrix. * * No test for singularity or near-singularity is included in this * routine. Such tests must be performed before calling this routine. * * Parameters * ========== * * UPLO - CHARACTER*1. * On entry, UPLO specifies whether the matrix is an upper or * lower triangular matrix as follows: * * UPLO = 'U' or 'u' A is an upper triangular matrix. * * UPLO = 'L' or 'l' A is a lower triangular matrix. * * Unchanged on exit. * * TRANS - CHARACTER*1. * On entry, TRANS specifies the equations to be solved as * follows: * * TRANS = 'N' or 'n' A*x = b. * * TRANS = 'T' or 't' A'*x = b. * * TRANS = 'C' or 'c' conjg( A' )*x = b. * * Unchanged on exit. * * DIAG - CHARACTER*1. * On entry, DIAG specifies whether or not A is unit * triangular as follows: * * DIAG = 'U' or 'u' A is assumed to be unit triangular. * * DIAG = 'N' or 'n' A is not assumed to be unit * triangular. * * Unchanged on exit. * * N - INTEGER. * On entry, N specifies the order of the matrix A. * N must be at least zero. * Unchanged on exit. * * A - COMPLEX*16 array of DIMENSION ( LDA, n ). * Before entry with UPLO = 'U' or 'u', the leading n by n * upper triangular part of the array A must contain the upper * triangular matrix and the strictly lower triangular part of * A is not referenced. * Before entry with UPLO = 'L' or 'l', the leading n by n * lower triangular part of the array A must contain the lower * triangular matrix and the strictly upper triangular part of * A is not referenced. * Note that when DIAG = 'U' or 'u', the diagonal elements of * A are not referenced either, but are assumed to be unity. * Unchanged on exit. * * LDA - INTEGER. * On entry, LDA specifies the first dimension of A as declared * in the calling (sub) program. LDA must be at least * max( 1, n ). * Unchanged on exit. * * X - COMPLEX*16 array of dimension at least * ( 1 + ( n - 1 )*abs( INCX ) ). 
* Before entry, the incremented array X must contain the n * element right-hand side vector b. On exit, X is overwritten * with the solution vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of * X. INCX must not be zero. * Unchanged on exit. * * * Level 2 Blas routine. * * -- Written on 22-October-1986. * Jack Dongarra, Argonne National Lab. * Jeremy Du Croz, Nag Central Office. * Sven Hammarling, Nag Central Office. * Richard Hanson, Sandia National Labs. * * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. Local Scalars .. COMPLEX*16 TEMP INTEGER I, INFO, IX, J, JX, KX LOGICAL NOCONJ, NOUNIT * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. External Subroutines .. EXTERNAL XERBLA * .. Intrinsic Functions .. INTRINSIC DCONJG, MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 IF ( .NOT.LSAME( UPLO , 'U' ).AND. $ .NOT.LSAME( UPLO , 'L' ) )THEN INFO = 1 ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. $ .NOT.LSAME( TRANS, 'T' ).AND. $ .NOT.LSAME( TRANS, 'R' ).AND. $ .NOT.LSAME( TRANS, 'C' ) )THEN INFO = 2 ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. $ .NOT.LSAME( DIAG , 'N' ) )THEN INFO = 3 ELSE IF( N.LT.0 )THEN INFO = 4 ELSE IF( LDA.LT.MAX( 1, N ) )THEN INFO = 6 ELSE IF( INCX.EQ.0 )THEN INFO = 8 END IF IF( INFO.NE.0 )THEN CALL XERBLA( 'ZTRSV ', INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) NOUNIT = LSAME( DIAG , 'N' ) * * Set up the start point in X if the increment is not unity. This * will be ( N - 1 )*INCX too small for descending loops. * IF( INCX.LE.0 )THEN KX = 1 - ( N - 1 )*INCX ELSE IF( INCX.NE.1 )THEN KX = 1 END IF * * Start the operations. In this version the elements of A are * accessed sequentially with one pass through A. * IF( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ) ) THEN * * Form x := inv( A )*x. 
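*
*     Illustrative note (an added sketch, not part of the original
*     comments): for UPLO = 'U' this is column-oriented back
*     substitution.  With N = 2 and DIAG = 'N' it reduces to
*
*        x(2) = x(2) / a(2,2)
*        x(1) = ( x(1) - a(1,2)*x(2) ) / a(1,1)
*
*     with a(i,j) conjugated throughout when TRANS = 'R'.
*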
* IF( LSAME( UPLO, 'U' ) )THEN IF( INCX.EQ.1 )THEN DO 20, J = N, 1, -1 IF( X( J ).NE.ZERO )THEN IF (NOCONJ) THEN IF( NOUNIT ) $ X( J ) = X( J )/A( J, J ) TEMP = X( J ) DO 10, I = J - 1, 1, -1 X( I ) = X( I ) - TEMP*A( I, J ) 10 CONTINUE ELSE IF( NOUNIT ) $ X( J ) = X( J )/DCONJG(A( J, J )) TEMP = X( J ) DO 15, I = J - 1, 1, -1 X( I ) = X( I ) - TEMP*DCONJG(A( I, J )) 15 CONTINUE ENDIF END IF 20 CONTINUE ELSE JX = KX + ( N - 1 )*INCX DO 40, J = N, 1, -1 IF( X( JX ).NE.ZERO )THEN IF (NOCONJ) THEN IF( NOUNIT ) $ X( JX ) = X( JX )/A( J, J ) ELSE IF( NOUNIT ) $ X( JX ) = X( JX )/DCONJG(A( J, J )) ENDIF TEMP = X( JX ) IX = JX DO 30, I = J - 1, 1, -1 IX = IX - INCX IF (NOCONJ) THEN X( IX ) = X( IX ) - TEMP*A( I, J ) ELSE X( IX ) = X( IX ) - TEMP*DCONJG(A( I, J )) ENDIF 30 CONTINUE END IF JX = JX - INCX 40 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 60, J = 1, N IF( X( J ).NE.ZERO )THEN IF (NOCONJ) THEN IF( NOUNIT ) $ X( J ) = X( J )/A( J, J ) TEMP = X( J ) DO 50, I = J + 1, N X( I ) = X( I ) - TEMP*A( I, J ) 50 CONTINUE ELSE IF( NOUNIT ) $ X( J ) = X( J )/DCONJG(A( J, J )) TEMP = X( J ) DO 55, I = J + 1, N X( I ) = X( I ) - TEMP*DCONJG(A( I, J )) 55 CONTINUE ENDIF END IF 60 CONTINUE ELSE JX = KX DO 80, J = 1, N IF( X( JX ).NE.ZERO )THEN IF (NOCONJ) THEN IF( NOUNIT ) $ X( JX ) = X( JX )/A( J, J ) ELSE IF( NOUNIT ) $ X( JX ) = X( JX )/DCONJG(A( J, J )) ENDIF TEMP = X( JX ) IX = JX DO 70, I = J + 1, N IX = IX + INCX IF (NOCONJ) THEN X( IX ) = X( IX ) - TEMP*A( I, J ) ELSE X( IX ) = X( IX ) - TEMP*DCONJG(A( I, J )) ENDIF 70 CONTINUE END IF JX = JX + INCX 80 CONTINUE END IF END IF ELSE * * Form x := inv( A' )*x or x := inv( conjg( A' ) )*x. * IF( LSAME( UPLO, 'U' ) )THEN IF( INCX.EQ.1 )THEN DO 110, J = 1, N TEMP = X( J ) IF( NOCONJ )THEN DO 90, I = 1, J - 1 TEMP = TEMP - A( I, J )*X( I ) 90 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( J, J ) ELSE DO 100, I = 1, J - 1 TEMP = TEMP - DCONJG( A( I, J ) )*X( I ) 100 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/DCONJG( A( J, J ) ) END IF X( J ) = TEMP 110 CONTINUE ELSE JX = KX DO 140, J = 1, N IX = KX TEMP = X( JX ) IF( NOCONJ )THEN DO 120, I = 1, J - 1 TEMP = TEMP - A( I, J )*X( IX ) IX = IX + INCX 120 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( J, J ) ELSE DO 130, I = 1, J - 1 TEMP = TEMP - DCONJG( A( I, J ) )*X( IX ) IX = IX + INCX 130 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/DCONJG( A( J, J ) ) END IF X( JX ) = TEMP JX = JX + INCX 140 CONTINUE END IF ELSE IF( INCX.EQ.1 )THEN DO 170, J = N, 1, -1 TEMP = X( J ) IF( NOCONJ )THEN DO 150, I = N, J + 1, -1 TEMP = TEMP - A( I, J )*X( I ) 150 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( J, J ) ELSE DO 160, I = N, J + 1, -1 TEMP = TEMP - DCONJG( A( I, J ) )*X( I ) 160 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/DCONJG( A( J, J ) ) END IF X( J ) = TEMP 170 CONTINUE ELSE KX = KX + ( N - 1 )*INCX JX = KX DO 200, J = N, 1, -1 IX = KX TEMP = X( JX ) IF( NOCONJ )THEN DO 180, I = N, J + 1, -1 TEMP = TEMP - A( I, J )*X( IX ) IX = IX - INCX 180 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/A( J, J ) ELSE DO 190, I = N, J + 1, -1 TEMP = TEMP - DCONJG( A( I, J ) )*X( IX ) IX = IX - INCX 190 CONTINUE IF( NOUNIT ) $ TEMP = TEMP/DCONJG( A( J, J ) ) END IF X( JX ) = TEMP JX = JX - INCX 200 CONTINUE END IF END IF END IF * RETURN * * End of ZTRSV . * END OpenBLAS-0.2.20/reference/ztrti2f.f000066400000000000000000000101631313527062700166560ustar00rootroot00000000000000 SUBROUTINE ZTRTI2F( UPLO, DIAG, N, A, LDA, INFO ) * * -- LAPACK routine (version 3.1) -- * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. * November 2006 * * .. Scalar Arguments .. 
CHARACTER DIAG, UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ) * .. * * Purpose * ======= * * ZTRTI2 computes the inverse of a complex upper or lower triangular * matrix. * * This is the Level 2 BLAS version of the algorithm. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * Specifies whether the matrix A is upper or lower triangular. * = 'U': Upper triangular * = 'L': Lower triangular * * DIAG (input) CHARACTER*1 * Specifies whether or not the matrix A is unit triangular. * = 'N': Non-unit triangular * = 'U': Unit triangular * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input/output) COMPLEX*16 array, dimension (LDA,N) * On entry, the triangular matrix A. If UPLO = 'U', the * leading n by n upper triangular part of the array A contains * the upper triangular matrix, and the strictly lower * triangular part of A is not referenced. If UPLO = 'L', the * leading n by n lower triangular part of the array A contains * the lower triangular matrix, and the strictly upper * triangular part of A is not referenced. If DIAG = 'U', the * diagonal elements of A are also not referenced and are * assumed to be 1. * * On exit, the (triangular) inverse of the original matrix, in * the same storage format. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL NOUNIT, UPPER INTEGER J COMPLEX*16 AJJ * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA, ZSCAL, ZTRMV * .. * .. Intrinsic Functions .. INTRINSIC MAX * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) NOUNIT = LSAME( DIAG, 'N' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZTRTI2', -INFO ) RETURN END IF * IF( UPPER ) THEN * * Compute inverse of upper triangular matrix. * DO 10 J = 1, N IF( NOUNIT ) THEN A( J, J ) = ONE / A( J, J ) AJJ = -A( J, J ) ELSE AJJ = -ONE END IF * * Compute elements 1:j-1 of j-th column. * CALL ZTRMV( 'Upper', 'No transpose', DIAG, J-1, A, LDA, $ A( 1, J ), 1 ) CALL ZSCAL( J-1, AJJ, A( 1, J ), 1 ) 10 CONTINUE ELSE * * Compute inverse of lower triangular matrix. * DO 20 J = N, 1, -1 IF( NOUNIT ) THEN A( J, J ) = ONE / A( J, J ) AJJ = -A( J, J ) ELSE AJJ = -ONE END IF IF( J.LT.N ) THEN * * Compute elements j+1:n of j-th column. * CALL ZTRMV( 'Lower', 'No transpose', DIAG, N-J, $ A( J+1, J+1 ), LDA, A( J+1, J ), 1 ) CALL ZSCAL( N-J, AJJ, A( J+1, J ), 1 ) END IF 20 CONTINUE END IF * RETURN * * End of ZTRTI2 * END OpenBLAS-0.2.20/reference/ztrtrif.f000066400000000000000000000122561313527062700167630ustar00rootroot00000000000000 SUBROUTINE ZTRTRIF( UPLO, DIAG, N, A, LDA, INFO ) * * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * September 30, 1994 * * .. Scalar Arguments .. CHARACTER DIAG, UPLO INTEGER INFO, LDA, N * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ) * .. 
* * Purpose * ======= * * ZTRTRI computes the inverse of a complex upper or lower triangular * matrix A. * * This is the Level 3 BLAS version of the algorithm. * * Arguments * ========= * * UPLO (input) CHARACTER*1 * = 'U': A is upper triangular; * = 'L': A is lower triangular. * * DIAG (input) CHARACTER*1 * = 'N': A is non-unit triangular; * = 'U': A is unit triangular. * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input/output) COMPLEX*16 array, dimension (LDA,N) * On entry, the triangular matrix A. If UPLO = 'U', the * leading N-by-N upper triangular part of the array A contains * the upper triangular matrix, and the strictly lower * triangular part of A is not referenced. If UPLO = 'L', the * leading N-by-N lower triangular part of the array A contains * the lower triangular matrix, and the strictly upper * triangular part of A is not referenced. If DIAG = 'U', the * diagonal elements of A are also not referenced and are * assumed to be 1. * On exit, the (triangular) inverse of the original matrix, in * the same storage format. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * > 0: if INFO = i, A(i,i) is exactly zero. The triangular * matrix is singular and its inverse can not be computed. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL NOUNIT, UPPER INTEGER J, JB, NB, NN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL XERBLA, ZTRMM, ZTRSM, ZTRTI2 * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Test the input parameters. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) NOUNIT = LSAME( DIAG, 'N' ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZTRTRI', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Check for singularity if non-unit. * IF( NOUNIT ) THEN DO 10 INFO = 1, N IF( A( INFO, INFO ).EQ.ZERO ) $ RETURN 10 CONTINUE INFO = 0 END IF * * Determine the block size for this environment. * NB = 128 IF( NB.LE.1 .OR. 
NB.GE.N ) THEN * * Use unblocked code * CALL ZTRTI2( UPLO, DIAG, N, A, LDA, INFO ) ELSE * * Use blocked code * IF( UPPER ) THEN * * Compute inverse of upper triangular matrix * DO 20 J = 1, N, NB JB = MIN( NB, N-J+1 ) * * Compute rows 1:j-1 of current block column * CALL ZTRMM( 'Left', 'Upper', 'No transpose', DIAG, J-1, $ JB, ONE, A, LDA, A( 1, J ), LDA ) CALL ZTRSM( 'Right', 'Upper', 'No transpose', DIAG, J-1, $ JB, -ONE, A( J, J ), LDA, A( 1, J ), LDA ) * * Compute inverse of current diagonal block * CALL ZTRTI2( 'Upper', DIAG, JB, A( J, J ), LDA, INFO ) 20 CONTINUE ELSE * * Compute inverse of lower triangular matrix * NN = ( ( N-1 ) / NB )*NB + 1 DO 30 J = NN, 1, -NB JB = MIN( NB, N-J+1 ) IF( J+JB.LE.N ) THEN * * Compute rows j+jb:n of current block column * CALL ZTRMM( 'Left', 'Lower', 'No transpose', DIAG, $ N-J-JB+1, JB, ONE, A( J+JB, J+JB ), LDA, $ A( J+JB, J ), LDA ) CALL ZTRSM( 'Right', 'Lower', 'No transpose', DIAG, $ N-J-JB+1, JB, -ONE, A( J, J ), LDA, $ A( J+JB, J ), LDA ) END IF * * Compute inverse of current diagonal block * CALL ZTRTI2( 'Lower', DIAG, JB, A( J, J ), LDA, INFO ) 30 CONTINUE END IF END IF * RETURN * * End of ZTRTRI * END OpenBLAS-0.2.20/segfaults.patch000066400000000000000000000006031313527062700161610ustar00rootroot00000000000000diff -ruN common_linux.h.orig common_linux.h --- common_linux.h.orig 2012-04-23 11:27:55.000000000 +0800 +++ common_linux.h 2012-05-08 23:43:00.000000000 +0800 @@ -77,7 +77,7 @@ #else //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 // unsigned long null_nodemask=0; - return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); + return 0; #endif } OpenBLAS-0.2.20/symcopy.h000066400000000000000000000727251313527062700150350ustar00rootroot00000000000000/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. 
*/ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ /* This implementation is completely wrong. I'll rewrite this */ #ifndef SYMCOPY_H #define SYMCOPY_H #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) static __inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; FLOAT *b1, *b2; FLOAT *bb1, *bb2; FLOAT *cc1, *cc2; FLOAT a11, a12; FLOAT a21, a22; b1 = b; b2 = b; for (js = 0; js < m; js += 2){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda + 2; bb1 = b1 + 0 * m; bb2 = b1 + 1 * m; b1 += 2 * m + 2; cc1 = b2 + 0 * m; cc2 = b2 + 1 * m; b2 += 2 * m + 2; if (m - js >= 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a22 = *(aa2 + 1); *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a21; *(bb2 + 1) = a22; aa1 += 2; aa2 += 2; bb1 += 2; bb2 += 2; cc1 += 2 * m; cc2 += 2 * m; is = ((m - js - 2) >> 1); while (is > 0){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); aa1 += 2; aa2 += 2; *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(cc1 + 0) = a11; *(cc1 + 1) = a12; *(cc2 + 0) = a21; *(cc2 + 1) = a22; bb1 += 2; bb2 += 2; cc1 += 2 * m; cc2 += 2 * m; is --; } is = ((m - js - 2) & 1); if (is == 1){ a11 = *(aa1 + 0); a12 = *(aa2 + 0); *(bb1 + 0) = a11; *(bb2 + 0) = a12; *(cc1 + 0) = a11; *(cc1 + 1) = a12; } } if (m - js == 1){ a11 = *(aa1 + 0); *(bb1 + 0) = a11; } } } static __inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; FLOAT *b1, *b2; FLOAT *bb1, *bb2; FLOAT *cc1, *cc2; FLOAT a11, a12; FLOAT a21, a22; b1 = b; b2 = b; for (js = 0; js < m; js += 2){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda; bb1 = b1 + 0 * m; bb2 = b1 + 1 * m; b1 += 2 * m; cc1 = b2 + 0 * m; cc2 = b2 + 1 * m; b2 += 2; if (m - js >= 2){ for (is = 0; is < js; is += 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); aa1 += 2; aa2 += 2; *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(cc1 + 0) = a11; *(cc1 + 1) = a12; *(cc2 + 0) = a21; *(cc2 + 1) = a22; bb1 += 2; bb2 += 2; cc1 += 2 * m; cc2 += 2 * m; } a11 = *(aa1 + 0); a12 = *(aa2 + 0); a22 = *(aa2 + 1); *(bb1 + 0) = a11; *(bb1 + 1) = a12; *(bb2 + 0) = a12; *(bb2 + 1) = a22; } if (m - js == 1){ for (is = 0; is < js; is += 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); aa1 += 2; *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(cc1 + 0) = a11; *(cc2 + 0) = a21; bb1 += 2; cc1 += 2 * m; cc2 += 2 * m; } a11 = *(aa1 + 0); *(bb1 + 0) = a11; } } } static __inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; FLOAT *b1, *b2; FLOAT *bb1, *bb2; FLOAT *cc1, *cc2; FLOAT a11, a21, a31, a41; FLOAT a12, a22, a32, a42; b1 = b; b2 = b; lda *= 2; for (js = 0; js < m; js += 2){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda + 4; bb1 = b1 + 0 * m; bb2 = b1 + 2 * m; b1 += 4 * m + 4; cc1 = b2 + 0 * m; cc2 = b2 + 2 * m; b2 += 4 * m + 4; if (m - js >= 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); a12 = *(aa2 + 2); a22 = *(aa2 + 3); *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; *(bb1 + 3) = a41; *(bb2 + 0) = a31; *(bb2 + 1) = a41; *(bb2 + 2) = a12; *(bb2 + 3) = a22; aa1 += 4; aa2 += 4; bb1 += 4; bb2 += 4; cc1 += 4 * m; cc2 += 4 * m; is = ((m - js - 2) >> 1); 
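      /* Added explanatory comment (not part of the original source): "is"
         counts the row pairs remaining below the 2x2 complex diagonal block
         of this column pair.  The loop below copies each pair of complex
         elements twice: contiguously down the current packed columns through
         bb1/bb2, and into the mirrored positions through cc1/cc2 (which step
         by whole buffer columns, 4 * m reals), so that the full symmetric
         matrix is rebuilt in b from the stored lower triangle. */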
while (is > 0){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); a12 = *(aa2 + 0); a22 = *(aa2 + 1); a32 = *(aa2 + 2); a42 = *(aa2 + 3); aa1 += 4; aa2 += 4; *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; *(bb1 + 3) = a41; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(bb2 + 2) = a32; *(bb2 + 3) = a42; *(cc1 + 0) = a11; *(cc1 + 1) = a21; *(cc1 + 2) = a12; *(cc1 + 3) = a22; *(cc2 + 0) = a31; *(cc2 + 1) = a41; *(cc2 + 2) = a32; *(cc2 + 3) = a42; bb1 += 4; bb2 += 4; cc1 += 4 * m; cc2 += 4 * m; is --; } if (m & 1){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(cc1 + 0) = a11; *(cc1 + 1) = a21; *(cc1 + 2) = a12; *(cc1 + 3) = a22; } } if (m - js == 1){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); *(bb1 + 0) = a11; *(bb1 + 1) = a21; } } } static __inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; FLOAT *b1, *b2; FLOAT *bb1, *bb2; FLOAT *cc1, *cc2; FLOAT a11, a21, a31, a41; FLOAT a12, a22, a32, a42; b1 = b; b2 = b; lda *= 2; for (js = 0; js < m; js += 2){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda; bb1 = b1 + 0 * m; bb2 = b1 + 2 * m; b1 += 4 * m; cc1 = b2 + 0 * m; cc2 = b2 + 2 * m; b2 += 4; if (m - js >= 2){ for (is = 0; is < js; is += 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); a12 = *(aa2 + 0); a22 = *(aa2 + 1); a32 = *(aa2 + 2); a42 = *(aa2 + 3); aa1 += 4; aa2 += 4; *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; *(bb1 + 3) = a41; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(bb2 + 2) = a32; *(bb2 + 3) = a42; *(cc1 + 0) = a11; *(cc1 + 1) = a21; *(cc1 + 2) = a12; *(cc1 + 3) = a22; *(cc2 + 0) = a31; *(cc2 + 1) = a41; *(cc2 + 2) = a32; *(cc2 + 3) = a42; bb1 += 4; bb2 += 4; cc1 += 4 * m; cc2 += 4 * m; } a11 = *(aa1 + 0); a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); a32 = *(aa2 + 2); a42 = *(aa2 + 3); *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a12; *(bb1 + 3) = a22; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(bb2 + 2) = a32; *(bb2 + 3) = a42; } if (m - js == 1){ for (is = 0; is < js; is += 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); aa1 += 4; *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; *(bb1 + 3) = a41; *(cc1 + 0) = a11; *(cc1 + 1) = a21; *(cc2 + 0) = a31; *(cc2 + 1) = a41; bb1 += 4; cc1 += 4 * m; cc2 += 4 * m; } a11 = *(aa1 + 0); a21 = *(aa1 + 1); *(bb1 + 0) = a11; *(bb1 + 1) = a21; } } } static __inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; FLOAT *b1, *b2; FLOAT *bb1, *bb2; FLOAT *cc1, *cc2; FLOAT a11, a21, a31, a41; FLOAT a12, a22, a32, a42; b1 = b; b2 = b; lda *= 2; for (js = 0; js < m; js += 2){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda + 4; bb1 = b1 + 0 * m; bb2 = b1 + 2 * m; b1 += 4 * m + 4; cc1 = b2 + 0 * m; cc2 = b2 + 2 * m; b2 += 4 * m + 4; if (m - js >= 2){ a11 = *(aa1 + 0); a31 = *(aa1 + 2); a41 = *(aa1 + 3); a12 = *(aa2 + 2); *(bb1 + 0) = a11; *(bb1 + 1) = 0.; *(bb1 + 2) = a31; *(bb1 + 3) = a41; *(bb2 + 0) = a31; *(bb2 + 1) = -a41; *(bb2 + 2) = a12; *(bb2 + 3) = 0.; aa1 += 4; aa2 += 4; bb1 += 4; bb2 += 4; cc1 += 4 * m; cc2 += 4 * m; is = ((m - js - 2) >> 1); while (is > 0){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); a12 = *(aa2 + 0); a22 = *(aa2 + 1); a32 = *(aa2 + 2); a42 = *(aa2 + 3); aa1 += 4; aa2 += 4; *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; *(bb1 + 3) = a41; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(bb2 + 2) = a32; *(bb2 + 3) 
= a42; *(cc1 + 0) = a11; *(cc1 + 1) = -a21; *(cc1 + 2) = a12; *(cc1 + 3) = -a22; *(cc2 + 0) = a31; *(cc2 + 1) = -a41; *(cc2 + 2) = a32; *(cc2 + 3) = -a42; bb1 += 4; bb2 += 4; cc1 += 4 * m; cc2 += 4 * m; is --; } if (m & 1){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(cc1 + 0) = a11; *(cc1 + 1) = -a21; *(cc1 + 2) = a12; *(cc1 + 3) = -a22; } } if (m - js == 1){ a11 = *(aa1 + 0); *(bb1 + 0) = a11; *(bb1 + 1) = 0.; } } } static __inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; FLOAT *b1, *b2; FLOAT *bb1, *bb2; FLOAT *cc1, *cc2; FLOAT a11, a21, a31, a41; FLOAT a12, a22, a32, a42; b1 = b; b2 = b; lda *= 2; for (js = 0; js < m; js += 2){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda; bb1 = b1 + 0 * m; bb2 = b1 + 2 * m; b1 += 4 * m; cc1 = b2 + 0 * m; cc2 = b2 + 2 * m; b2 += 4; if (m - js >= 2){ for (is = 0; is < js; is += 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); a12 = *(aa2 + 0); a22 = *(aa2 + 1); a32 = *(aa2 + 2); a42 = *(aa2 + 3); aa1 += 4; aa2 += 4; *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; *(bb1 + 3) = a41; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(bb2 + 2) = a32; *(bb2 + 3) = a42; *(cc1 + 0) = a11; *(cc1 + 1) = -a21; *(cc1 + 2) = a12; *(cc1 + 3) = -a22; *(cc2 + 0) = a31; *(cc2 + 1) = -a41; *(cc2 + 2) = a32; *(cc2 + 3) = -a42; bb1 += 4; bb2 += 4; cc1 += 4 * m; cc2 += 4 * m; } a11 = *(aa1 + 0); a12 = *(aa2 + 0); a22 = *(aa2 + 1); a32 = *(aa2 + 2); *(bb1 + 0) = a11; *(bb1 + 1) = 0.; *(bb1 + 2) = a12; *(bb1 + 3) = -a22; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(bb2 + 2) = a32; *(bb2 + 3) = 0.; } if (m - js == 1){ for (is = 0; is < js; is += 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); aa1 += 4; *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; *(bb1 + 3) = a41; *(cc1 + 0) = a11; *(cc1 + 1) = -a21; *(cc2 + 0) = a31; *(cc2 + 1) = -a41; bb1 += 4; cc1 += 4 * m; cc2 += 4 * m; } a11 = *(aa1 + 0); *(bb1 + 0) = a11; *(bb1 + 1) = 0.; } } } static __inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; FLOAT *b1, *b2; FLOAT *bb1, *bb2; FLOAT *cc1, *cc2; FLOAT a11, a21, a31, a41; FLOAT a12, a22, a32, a42; b1 = b; b2 = b; lda *= 2; for (js = 0; js < m; js += 2){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda + 4; bb1 = b1 + 0 * m; bb2 = b1 + 2 * m; b1 += 4 * m + 4; cc1 = b2 + 0 * m; cc2 = b2 + 2 * m; b2 += 4 * m + 4; if (m - js >= 2){ a11 = *(aa1 + 0); a31 = *(aa1 + 2); a41 = *(aa1 + 3); a12 = *(aa2 + 2); *(bb1 + 0) = a11; *(bb1 + 1) = 0.; *(bb1 + 2) = a31; *(bb1 + 3) = -a41; *(bb2 + 0) = a31; *(bb2 + 1) = a41; *(bb2 + 2) = a12; *(bb2 + 3) = 0.; aa1 += 4; aa2 += 4; bb1 += 4; bb2 += 4; cc1 += 4 * m; cc2 += 4 * m; is = ((m - js - 2) >> 1); while (is > 0){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); a12 = *(aa2 + 0); a22 = *(aa2 + 1); a32 = *(aa2 + 2); a42 = *(aa2 + 3); aa1 += 4; aa2 += 4; *(bb1 + 0) = a11; *(bb1 + 1) = -a21; *(bb1 + 2) = a31; *(bb1 + 3) = -a41; *(bb2 + 0) = a12; *(bb2 + 1) = -a22; *(bb2 + 2) = a32; *(bb2 + 3) = -a42; *(cc1 + 0) = a11; *(cc1 + 1) = a21; *(cc1 + 2) = a12; *(cc1 + 3) = a22; *(cc2 + 0) = a31; *(cc2 + 1) = a41; *(cc2 + 2) = a32; *(cc2 + 3) = a42; bb1 += 4; bb2 += 4; cc1 += 4 * m; cc2 += 4 * m; is --; } if (m & 1){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); *(bb1 + 0) = a11; *(bb1 + 1) = -a21; *(bb2 + 0) = a12; *(bb2 + 1) = -a22; *(cc1 + 
0) = a11; *(cc1 + 1) = a21; *(cc1 + 2) = a12; *(cc1 + 3) = a22; } } if (m - js == 1){ a11 = *(aa1 + 0); *(bb1 + 0) = a11; *(bb1 + 1) = 0.; } } } static __inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; FLOAT *b1, *b2; FLOAT *bb1, *bb2; FLOAT *cc1, *cc2; FLOAT a11, a21, a31, a41; FLOAT a12, a22, a32, a42; b1 = b; b2 = b; lda *= 2; for (js = 0; js < m; js += 2){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda; bb1 = b1 + 0 * m; bb2 = b1 + 2 * m; b1 += 4 * m; cc1 = b2 + 0 * m; cc2 = b2 + 2 * m; b2 += 4; if (m - js >= 2){ for (is = 0; is < js; is += 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); a12 = *(aa2 + 0); a22 = *(aa2 + 1); a32 = *(aa2 + 2); a42 = *(aa2 + 3); aa1 += 4; aa2 += 4; *(bb1 + 0) = a11; *(bb1 + 1) = -a21; *(bb1 + 2) = a31; *(bb1 + 3) = -a41; *(bb2 + 0) = a12; *(bb2 + 1) = -a22; *(bb2 + 2) = a32; *(bb2 + 3) = -a42; *(cc1 + 0) = a11; *(cc1 + 1) = a21; *(cc1 + 2) = a12; *(cc1 + 3) = a22; *(cc2 + 0) = a31; *(cc2 + 1) = a41; *(cc2 + 2) = a32; *(cc2 + 3) = a42; bb1 += 4; bb2 += 4; cc1 += 4 * m; cc2 += 4 * m; } a11 = *(aa1 + 0); a12 = *(aa2 + 0); a22 = *(aa2 + 1); a32 = *(aa2 + 2); *(bb1 + 0) = a11; *(bb1 + 1) = 0.; *(bb1 + 2) = a12; *(bb1 + 3) = a22; *(bb2 + 0) = a12; *(bb2 + 1) = -a22; *(bb2 + 2) = a32; *(bb2 + 3) = 0.; } if (m - js == 1){ for (is = 0; is < js; is += 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); aa1 += 4; *(bb1 + 0) = a11; *(bb1 + 1) = -a21; *(bb1 + 2) = a31; *(bb1 + 3) = -a41; *(cc1 + 0) = a11; *(cc1 + 1) = a21; *(cc2 + 0) = a31; *(cc2 + 1) = a41; bb1 += 4; cc1 += 4 * m; cc2 += 4 * m; } a11 = *(aa1 + 0); *(bb1 + 0) = a11; *(bb1 + 1) = 0.; } } } static __inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; FLOAT *b1, *b2; FLOAT *bb1, *bb2; FLOAT *cc1, *cc2; FLOAT a11, a12; FLOAT a21, a22; b1 = b; b2 = b; for (js = 0; js < m; js += 2){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda + 2; bb1 = b1 + 0 * m; bb2 = b1 + 1 * m; b1 += 2 * m + 2; cc1 = b2 + 0 * m; cc2 = b2 + 1 * m; b2 += 2 * m + 2; if (m - js >= 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a22 = *(aa2 + 1); *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a21; *(bb2 + 1) = a22; aa1 += 2; aa2 += 2; bb1 += 2; bb2 += 2; cc1 += 2 * m; cc2 += 2 * m; is = ((m - js - 2) >> 1); while (is > 0){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); aa1 += 2; aa2 += 2; *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(cc1 + 0) = a11; *(cc1 + 1) = a12; *(cc2 + 0) = a21; *(cc2 + 1) = a22; bb1 += 2; bb2 += 2; cc1 += 2 * m; cc2 += 2 * m; is --; } is = ((m - js - 2) & 1); if (is == 1){ a11 = *(aa1 + 0); a12 = *(aa2 + 0); *(bb1 + 0) = a11; *(bb2 + 0) = a12; *(cc1 + 0) = a11; *(cc1 + 1) = a12; } } if (m - js == 1){ a11 = *(aa1 + 0); *(bb1 + 0) = a11; } } } static __inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; FLOAT *b1, *b2; FLOAT *bb1, *bb2; FLOAT *cc1, *cc2; FLOAT a11, a12; FLOAT a21, a22; b1 = b; b2 = b; for (js = 0; js < m; js += 2){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda + 2; bb1 = b1 + 0 * m; bb2 = b1 + 1 * m; b1 += 2 * m + 2; cc1 = b2 + 0 * m; cc2 = b2 + 1 * m; b2 += 2 * m + 2; if (m - js >= 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a22 = *(aa2 + 1); *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a21; *(bb2 + 1) = a22; aa1 += 2; aa2 += 2; bb1 += 2; bb2 += 2; cc1 += 2 * m; cc2 += 2 * m; is = ((m - js - 2) >> 1); while (is > 0){ a11 = *(aa1 
+ 0); a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); aa1 += 2; aa2 += 2; *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(cc1 + 0) = a11; *(cc1 + 1) = a12; *(cc2 + 0) = a21; *(cc2 + 1) = a22; bb1 += 2; bb2 += 2; cc1 += 2 * m; cc2 += 2 * m; is --; } is = ((m - js - 2) & 1); if (is == 1){ a11 = *(aa1 + 0); a12 = *(aa2 + 0); *(bb1 + 0) = a11; *(bb2 + 0) = a12; *(cc1 + 0) = a11; *(cc1 + 1) = a12; } } if (m - js == 1){ a11 = *(aa1 + 0); *(bb1 + 0) = a11; } } } static __inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; FLOAT *b1, *b2; FLOAT *bb1, *bb2; FLOAT *cc1, *cc2; FLOAT a11, a12; FLOAT a21, a22; b1 = b; b2 = b; for (js = 0; js < m; js += 2){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda; bb1 = b1 + 0 * m; bb2 = b1 + 1 * m; b1 += 2 * m; cc1 = b2 + 0 * m; cc2 = b2 + 1 * m; b2 += 2; if (m - js >= 2){ for (is = 0; is < js; is += 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); aa1 += 2; aa2 += 2; *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(cc1 + 0) = a11; *(cc1 + 1) = a12; *(cc2 + 0) = a21; *(cc2 + 1) = a22; bb1 += 2; bb2 += 2; cc1 += 2 * m; cc2 += 2 * m; } a11 = *(aa1 + 0); a12 = *(aa2 + 0); a22 = *(aa2 + 1); *(bb1 + 0) = a11; *(bb1 + 1) = a12; *(bb2 + 0) = a12; *(bb2 + 1) = a22; } if (m - js == 1){ for (is = 0; is < js; is += 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); aa1 += 2; *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(cc1 + 0) = a11; *(cc2 + 0) = a21; bb1 += 2; cc1 += 2 * m; cc2 += 2 * m; } a11 = *(aa1 + 0); *(bb1 + 0) = a11; } } } static __inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; FLOAT *b1, *b2; FLOAT *bb1, *bb2; FLOAT *cc1, *cc2; FLOAT a11, a12; FLOAT a21, a22; b1 = b; b2 = b; for (js = 0; js < m; js += 2){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda; bb1 = b1 + 0 * m; bb2 = b1 + 1 * m; b1 += 2 * m; cc1 = b2 + 0 * m; cc2 = b2 + 1 * m; b2 += 2; if (m - js >= 2){ for (is = 0; is < js; is += 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); aa1 += 2; aa2 += 2; *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(cc1 + 0) = a11; *(cc1 + 1) = a12; *(cc2 + 0) = a21; *(cc2 + 1) = a22; bb1 += 2; bb2 += 2; cc1 += 2 * m; cc2 += 2 * m; } a11 = *(aa1 + 0); a12 = *(aa2 + 0); a22 = *(aa2 + 1); *(bb1 + 0) = a11; *(bb1 + 1) = a12; *(bb2 + 0) = a12; *(bb2 + 1) = a22; } if (m - js == 1){ for (is = 0; is < js; is += 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); aa1 += 2; *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(cc1 + 0) = a11; *(cc2 + 0) = a21; bb1 += 2; cc1 += 2 * m; cc2 += 2 * m; } a11 = *(aa1 + 0); *(bb1 + 0) = a11; } } } static __inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; FLOAT *b1, *b2; FLOAT *bb1, *bb2; FLOAT *cc1, *cc2; FLOAT a11, a21, a31, a41; FLOAT a12, a22, a32, a42; b1 = b; b2 = b; lda *= 2; for (js = 0; js < m; js += 2){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda + 4; bb1 = b1 + 0 * m; bb2 = b1 + 2 * m; b1 += 4 * m + 4; cc1 = b2 + 0 * m; cc2 = b2 + 2 * m; b2 += 4 * m + 4; if (m - js >= 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); a12 = *(aa2 + 2); a22 = *(aa2 + 3); *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; *(bb1 + 3) = a41; *(bb2 + 0) = a31; *(bb2 + 1) = a41; *(bb2 + 2) = a12; *(bb2 + 3) = a22; aa1 += 4; aa2 += 4; bb1 += 4; bb2 += 4; cc1 += 4 * m; cc2 += 4 * m; is = ((m - js - 2) >> 1); while (is > 0){ a11 = *(aa1 + 0); a21 = *(aa1 + 
1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); a12 = *(aa2 + 0); a22 = *(aa2 + 1); a32 = *(aa2 + 2); a42 = *(aa2 + 3); aa1 += 4; aa2 += 4; *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; *(bb1 + 3) = a41; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(bb2 + 2) = a32; *(bb2 + 3) = a42; *(cc1 + 0) = a11; *(cc1 + 1) = a21; *(cc1 + 2) = a12; *(cc1 + 3) = a22; *(cc2 + 0) = a31; *(cc2 + 1) = a41; *(cc2 + 2) = a32; *(cc2 + 3) = a42; bb1 += 4; bb2 += 4; cc1 += 4 * m; cc2 += 4 * m; is --; } if (m & 1){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(cc1 + 0) = a11; *(cc1 + 1) = a21; *(cc1 + 2) = a12; *(cc1 + 3) = a22; } } if (m - js == 1){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); *(bb1 + 0) = a11; *(bb1 + 1) = a21; } } } static __inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; FLOAT *b1, *b2; FLOAT *bb1, *bb2; FLOAT *cc1, *cc2; FLOAT a11, a21, a31, a41; FLOAT a12, a22, a32, a42; b1 = b; b2 = b; lda *= 2; for (js = 0; js < m; js += 2){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda + 4; bb1 = b1 + 0 * m; bb2 = b1 + 2 * m; b1 += 4 * m + 4; cc1 = b2 + 0 * m; cc2 = b2 + 2 * m; b2 += 4 * m + 4; if (m - js >= 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); a12 = *(aa2 + 2); a22 = *(aa2 + 3); *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; *(bb1 + 3) = a41; *(bb2 + 0) = a31; *(bb2 + 1) = a41; *(bb2 + 2) = a12; *(bb2 + 3) = a22; aa1 += 4; aa2 += 4; bb1 += 4; bb2 += 4; cc1 += 4 * m; cc2 += 4 * m; is = ((m - js - 2) >> 1); while (is > 0){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); a12 = *(aa2 + 0); a22 = *(aa2 + 1); a32 = *(aa2 + 2); a42 = *(aa2 + 3); aa1 += 4; aa2 += 4; *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; *(bb1 + 3) = a41; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(bb2 + 2) = a32; *(bb2 + 3) = a42; *(cc1 + 0) = a11; *(cc1 + 1) = a21; *(cc1 + 2) = a12; *(cc1 + 3) = a22; *(cc2 + 0) = a31; *(cc2 + 1) = a41; *(cc2 + 2) = a32; *(cc2 + 3) = a42; bb1 += 4; bb2 += 4; cc1 += 4 * m; cc2 += 4 * m; is --; } if (m & 1){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(cc1 + 0) = a11; *(cc1 + 1) = a21; *(cc1 + 2) = a12; *(cc1 + 3) = a22; } } if (m - js == 1){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); *(bb1 + 0) = a11; *(bb1 + 1) = a21; } } } static __inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; FLOAT *b1, *b2; FLOAT *bb1, *bb2; FLOAT *cc1, *cc2; FLOAT a11, a21, a31, a41; FLOAT a12, a22, a32, a42; b1 = b; b2 = b; lda *= 2; for (js = 0; js < m; js += 2){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda; bb1 = b1 + 0 * m; bb2 = b1 + 2 * m; b1 += 4 * m; cc1 = b2 + 0 * m; cc2 = b2 + 2 * m; b2 += 4; if (m - js >= 2){ for (is = 0; is < js; is += 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); a12 = *(aa2 + 0); a22 = *(aa2 + 1); a32 = *(aa2 + 2); a42 = *(aa2 + 3); aa1 += 4; aa2 += 4; *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; *(bb1 + 3) = a41; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(bb2 + 2) = a32; *(bb2 + 3) = a42; *(cc1 + 0) = a11; *(cc1 + 1) = a21; *(cc1 + 2) = a12; *(cc1 + 3) = a22; *(cc2 + 0) = a31; *(cc2 + 1) = a41; *(cc2 + 2) = a32; *(cc2 + 3) = a42; bb1 += 4; bb2 += 4; cc1 += 4 * m; cc2 += 4 * m; } a11 = *(aa1 + 0); a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); a32 = *(aa2 + 2); a42 = *(aa2 + 3); *(bb1 + 0) = 
a11; *(bb1 + 1) = a21; *(bb1 + 2) = a12; *(bb1 + 3) = a22; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(bb2 + 2) = a32; *(bb2 + 3) = a42; } if (m - js == 1){ for (is = 0; is < js; is += 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); aa1 += 4; *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; *(bb1 + 3) = a41; *(cc1 + 0) = a11; *(cc1 + 1) = a21; *(cc2 + 0) = a31; *(cc2 + 1) = a41; bb1 += 4; cc1 += 4 * m; cc2 += 4 * m; } a11 = *(aa1 + 0); a21 = *(aa1 + 1); *(bb1 + 0) = a11; *(bb1 + 1) = a21; } } } static __inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG is, js; FLOAT *aa1, *aa2; FLOAT *b1, *b2; FLOAT *bb1, *bb2; FLOAT *cc1, *cc2; FLOAT a11, a21, a31, a41; FLOAT a12, a22, a32, a42; b1 = b; b2 = b; lda *= 2; for (js = 0; js < m; js += 2){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda; bb1 = b1 + 0 * m; bb2 = b1 + 2 * m; b1 += 4 * m; cc1 = b2 + 0 * m; cc2 = b2 + 2 * m; b2 += 4; if (m - js >= 2){ for (is = 0; is < js; is += 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); a12 = *(aa2 + 0); a22 = *(aa2 + 1); a32 = *(aa2 + 2); a42 = *(aa2 + 3); aa1 += 4; aa2 += 4; *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; *(bb1 + 3) = a41; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(bb2 + 2) = a32; *(bb2 + 3) = a42; *(cc1 + 0) = a11; *(cc1 + 1) = a21; *(cc1 + 2) = a12; *(cc1 + 3) = a22; *(cc2 + 0) = a31; *(cc2 + 1) = a41; *(cc2 + 2) = a32; *(cc2 + 3) = a42; bb1 += 4; bb2 += 4; cc1 += 4 * m; cc2 += 4 * m; } a11 = *(aa1 + 0); a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); a32 = *(aa2 + 2); a42 = *(aa2 + 3); *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a12; *(bb1 + 3) = a22; *(bb2 + 0) = a12; *(bb2 + 1) = a22; *(bb2 + 2) = a32; *(bb2 + 3) = a42; } if (m - js == 1){ for (is = 0; is < js; is += 2){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); aa1 += 4; *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; *(bb1 + 3) = a41; *(cc1 + 0) = a11; *(cc1 + 1) = a21; *(cc2 + 0) = a31; *(cc2 + 1) = a41; bb1 += 4; cc1 += 4 * m; cc2 += 4 * m; } a11 = *(aa1 + 0); a21 = *(aa1 + 1); *(bb1 + 0) = a11; *(bb1 + 1) = a21; } } } #endif #endif OpenBLAS-0.2.20/test/000077500000000000000000000000001313527062700141235ustar00rootroot00000000000000OpenBLAS-0.2.20/test/CMakeLists.txt000066400000000000000000000023101313527062700166570ustar00rootroot00000000000000include_directories(${PROJECT_SOURCE_DIR}) enable_language(Fortran) set(OpenBLAS_Tests sblat1 sblat2 sblat3 dblat1 dblat2 dblat3 cblat1 cblat2 cblat3 zblat1 zblat2 zblat3) foreach(test_bin ${OpenBLAS_Tests}) add_executable(${test_bin} ${test_bin}.f) target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME}_static) endforeach() # $1 exec, $2 input, $3 output_result FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh "rm -f $3\n" "$1 < $2\n" "grep -q FATAL $3\n" "if [ $? 
-eq 0 ]; then\n" "echo Error\n" "exit 1\n" "else\n" "exit 0\n" "fi\n" ) set(float_types s d c z) foreach(float_type ${float_types}) string(TOUPPER ${float_type} float_type_upper) add_test(NAME "${float_type}blas1" COMMAND "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat1") add_test(NAME "${float_type}blas2" COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat2" "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM) add_test(NAME "${float_type}blas3" COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat3" "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM) endforeach()OpenBLAS-0.2.20/test/LICENSE000066400000000000000000000017031313527062700151310ustar00rootroot00000000000000This directory contains the reference implementation of BLAS which is obtainable at: http://netlib.org/blas/ The license, obtained from http://netlib.org/blas/faq.html#2 on November 3, 2010, is as follows: 2) Are there legal restrictions on the use of BLAS reference implementation software? The reference BLAS is a freely-available software package. It is available from netlib via anonymous ftp and the World Wide Web. Thus, it can be included in commercial software packages (and has been). We only ask that proper credit be given to the authors. Like all software, it is copyrighted. It is not trademarked, but we do ask the following: If you modify the source for these routines we ask that you change the name of the routine and comment the changes made to the original. We will gladly answer any questions regarding the software. If a modification is done, however, it is the responsibility of the person who modified the routine to provide support. OpenBLAS-0.2.20/test/Makefile000066400000000000000000000154021313527062700155650ustar00rootroot00000000000000TOPDIR = .. 
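# Added explanatory comment (not in the original Makefile): the level1/level2/
# level3 targets below build the reference BLAS test drivers and run them with
# one thread and, when SMP is enabled, again with two threads.  A roughly
# equivalent manual invocation of a single level-2 test, mirroring the recipes
# below (illustrative only), would be:
#
#   OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat2 < ./dblat2.dat
#   grep -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0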
include ../Makefile.system all :: level1 level2 level3 level1 : sblat1 dblat1 cblat1 zblat1 ifndef CROSS OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat1 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat1 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat1 ifdef SMP ifeq ($(USE_OPENMP), 1) OMP_NUM_THREADS=2 ./sblat1 OMP_NUM_THREADS=2 ./dblat1 OMP_NUM_THREADS=2 ./cblat1 OMP_NUM_THREADS=2 ./zblat1 else OPENBLAS_NUM_THREADS=2 ./sblat1 OPENBLAS_NUM_THREADS=2 ./dblat1 OPENBLAS_NUM_THREADS=2 ./cblat1 OPENBLAS_NUM_THREADS=2 ./zblat1 endif endif endif level2 : sblat2 dblat2 cblat2 zblat2 ifndef CROSS rm -f ?BLAT2.SUMM OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat2 < ./sblat2.dat @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat2 < ./dblat2.dat @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat2 < ./cblat2.dat @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat2 < ./zblat2.dat @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 ifdef SMP rm -f ?BLAT2.SUMM ifeq ($(USE_OPENMP), 1) OMP_NUM_THREADS=2 ./sblat2 < ./sblat2.dat @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 OMP_NUM_THREADS=2 ./dblat2 < ./dblat2.dat @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 OMP_NUM_THREADS=2 ./cblat2 < ./cblat2.dat @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 OMP_NUM_THREADS=2 ./zblat2 < ./zblat2.dat @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 else OPENBLAS_NUM_THREADS=2 ./sblat2 < ./sblat2.dat @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 OPENBLAS_NUM_THREADS=2 ./dblat2 < ./dblat2.dat @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 OPENBLAS_NUM_THREADS=2 ./cblat2 < ./cblat2.dat @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 OPENBLAS_NUM_THREADS=2 ./zblat2 < ./zblat2.dat @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 endif endif endif level3 : sblat3 dblat3 cblat3 zblat3 ifndef CROSS rm -f ?BLAT3.SUMM OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat3 < ./dblat3.dat @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3 < ./cblat3.dat @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat3 < ./zblat3.dat @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 ifdef SMP rm -f ?BLAT3.SUMM ifeq ($(USE_OPENMP), 1) OMP_NUM_THREADS=2 ./sblat3 < ./sblat3.dat @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 OMP_NUM_THREADS=2 ./dblat3 < ./dblat3.dat @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 OMP_NUM_THREADS=2 ./cblat3 < ./cblat3.dat @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 OMP_NUM_THREADS=2 ./zblat3 < ./zblat3.dat @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 else OPENBLAS_NUM_THREADS=2 ./sblat3 < ./sblat3.dat @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 OPENBLAS_NUM_THREADS=2 ./dblat3 < ./dblat3.dat @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 OPENBLAS_NUM_THREADS=2 ./cblat3 < ./cblat3.dat @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 OPENBLAS_NUM_THREADS=2 ./zblat3 < ./zblat3.dat @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 endif endif endif level3_3m : zblat3_3m 
cblat3_3m ifndef CROSS rm -f ?BLAT3_3M.SUMM OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat3_3m < ./zblat3_3m.dat @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 ifdef SMP rm -f ?BLAT3_3M.SUMM ifeq ($(USE_OPENMP), 1) OMP_NUM_THREADS=2 ./cblat3_3m < ./cblat3_3m.dat @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0 OMP_NUM_THREADS=2 ./zblat3_3m < ./zblat3_3m.dat @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 else OPENBLAS_NUM_THREADS=2 ./cblat3_3m < ./cblat3_3m.dat @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0 OPENBLAS_NUM_THREADS=2 ./zblat3_3m < ./zblat3_3m.dat @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 endif endif endif FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) CEXTRALIB = sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o sblat1 sblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) dblat1 : dblat1.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o dblat1 dblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) qblat1 : qblat1.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o qblat1 qblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) cblat1 : cblat1.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o cblat1 cblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) zblat1 : zblat1.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o zblat1 zblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) sblat2 : sblat2.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o sblat2 sblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) dblat2 : dblat2.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o dblat2 dblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) cblat2 : cblat2.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o cblat2 cblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) zblat2 : zblat2.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o zblat2 zblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) sblat3 : sblat3.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o sblat3 sblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) dblat3 : dblat3.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o dblat3 dblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) cblat3 : cblat3.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o cblat3 cblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) zblat3 : zblat3.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o zblat3 zblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) cblat3_3m : cblat3_3m.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o cblat3_3m cblat3_3m.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) zblat3_3m : zblat3_3m.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o zblat3_3m zblat3_3m.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) clean: @rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \ sblat1 dblat1 cblat1 zblat1 \ sblat2 dblat2 cblat2 zblat2 \ sblat3 dblat3 cblat3 zblat3 \ sblat1p dblat1p cblat1p zblat1p \ sblat2p dblat2p cblat2p zblat2p \ sblat3p dblat3p cblat3p zblat3p \ zblat3_3m zblat3_3mp \ cblat3_3m cblat3_3mp \ *.stackdump *.dll libs: prof: quick : $(MAKE) -C $(TOPDIR) libs # include ../Makefile.tail OpenBLAS-0.2.20/test/cblat1.f000066400000000000000000000747241313527062700154560ustar00rootroot00000000000000 PROGRAM CBLAT1 * Test program for the COMPLEX Level 1 BLAS. * Based upon the original BLAS test routine together with: * F06GAF Example Program Text * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalars in Common .. 
INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. REAL SFAC INTEGER IC * .. External Subroutines .. EXTERNAL CHECK1, CHECK2, HEADER * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA SFAC/9.765625E-4/ * .. Executable Statements .. WRITE (NOUT,99999) DO 20 IC = 1, 10 ICASE = IC CALL HEADER * * Initialize PASS, INCX, INCY, and MODE for a new case. * The value 9999 for INCX, INCY or MODE will appear in the * detailed output, if any, for cases that do not involve * these parameters. * PASS = .TRUE. INCX = 9999 INCY = 9999 MODE = 9999 IF (ICASE.LE.5) THEN CALL CHECK2(SFAC) ELSE IF (ICASE.GE.6) THEN CALL CHECK1(SFAC) END IF * -- Print IF (PASS) WRITE (NOUT,99998) 20 CONTINUE STOP * 99999 FORMAT (' Complex BLAS Test Program Results',/1X) 99998 FORMAT (' ----- PASS -----') END SUBROUTINE HEADER * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Arrays .. CHARACTER*6 L(10) * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA L(1)/'CDOTC '/ DATA L(2)/'CDOTU '/ DATA L(3)/'CAXPY '/ DATA L(4)/'CCOPY '/ DATA L(5)/'CSWAP '/ DATA L(6)/'SCNRM2'/ DATA L(7)/'SCASUM'/ DATA L(8)/'CSCAL '/ DATA L(9)/'CSSCAL'/ DATA L(10)/'ICAMAX'/ * .. Executable Statements .. WRITE (NOUT,99999) ICASE, L(ICASE) RETURN * 99999 FORMAT (/' Test of subprogram number',I3,12X,A6) END SUBROUTINE CHECK1(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. REAL SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. COMPLEX CA REAL SA INTEGER I, J, LEN, NP1 * .. Local Arrays .. COMPLEX CTRUE5(8,5,2), CTRUE6(8,5,2), CV(8,5,2), CX(8), + MWPCS(5), MWPCT(5) REAL STRUE2(5), STRUE4(5) INTEGER ITRUE3(5) * .. External Functions .. REAL SCASUM, SCNRM2 INTEGER ICAMAX EXTERNAL SCASUM, SCNRM2, ICAMAX * .. External Subroutines .. EXTERNAL CSCAL, CSSCAL, CTEST, ITEST1, STEST1 * .. Intrinsic Functions .. INTRINSIC MAX * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. 
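*     Added explanatory comment (not in the original test source):
*     CV holds the input vectors for the two increment values, STRUE2 and
*     STRUE4 the expected SCNRM2 and SCASUM results, CTRUE5 and CTRUE6 the
*     expected vectors after CSCAL and CSSCAL, and ITRUE3 the expected
*     ICAMAX indices.  SFAC (2**(-10), set in the main program) is the
*     factor by which STEST/CTEST multiply the term-by-term differences
*     before testing them for negligibility.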
DATA SA, CA/0.3E0, (0.4E0,-0.7E0)/ DATA ((CV(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (0.3E0,-0.4E0), (3.0E0,4.0E0), + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + (0.1E0,-0.3E0), (0.5E0,-0.1E0), (5.0E0,6.0E0), + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + (5.0E0,6.0E0), (5.0E0,6.0E0), (0.1E0,0.1E0), + (-0.6E0,0.1E0), (0.1E0,-0.3E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (0.3E0,0.1E0), (0.1E0,0.4E0), + (0.4E0,0.1E0), (0.1E0,0.2E0), (2.0E0,3.0E0), + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + (4.0E0,5.0E0), (0.3E0,-0.4E0), (6.0E0,7.0E0), + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + (0.1E0,-0.3E0), (8.0E0,9.0E0), (0.5E0,-0.1E0), + (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + (2.0E0,5.0E0), (2.0E0,5.0E0), (0.1E0,0.1E0), + (3.0E0,6.0E0), (-0.6E0,0.1E0), (4.0E0,7.0E0), + (0.1E0,-0.3E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + (7.0E0,2.0E0), (0.3E0,0.1E0), (5.0E0,8.0E0), + (0.1E0,0.4E0), (6.0E0,9.0E0), (0.4E0,0.1E0), + (8.0E0,3.0E0), (0.1E0,0.2E0), (9.0E0,4.0E0)/ DATA STRUE2/0.0E0, 0.5E0, 0.6E0, 0.7E0, 0.7E0/ DATA STRUE4/0.0E0, 0.7E0, 1.0E0, 1.3E0, 1.7E0/ DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (-0.16E0,-0.37E0), (3.0E0,4.0E0), + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + (-0.17E0,-0.19E0), (0.13E0,-0.39E0), + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + (0.11E0,-0.03E0), (-0.17E0,0.46E0), + (-0.17E0,-0.19E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + (0.19E0,-0.17E0), (0.32E0,0.09E0), + (0.23E0,-0.24E0), (0.18E0,0.01E0), + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0), + (2.0E0,3.0E0)/ DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + (4.0E0,5.0E0), (-0.16E0,-0.37E0), (6.0E0,7.0E0), + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + (-0.17E0,-0.19E0), (8.0E0,9.0E0), + (0.13E0,-0.39E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + (0.11E0,-0.03E0), (3.0E0,6.0E0), + (-0.17E0,0.46E0), (4.0E0,7.0E0), + (-0.17E0,-0.19E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + (7.0E0,2.0E0), (0.19E0,-0.17E0), (5.0E0,8.0E0), + (0.32E0,0.09E0), (6.0E0,9.0E0), + (0.23E0,-0.24E0), (8.0E0,3.0E0), + (0.18E0,0.01E0), (9.0E0,4.0E0)/ DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (0.09E0,-0.12E0), (3.0E0,4.0E0), + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + (0.03E0,-0.09E0), (0.15E0,-0.03E0), + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + (0.03E0,0.03E0), (-0.18E0,0.03E0), + (0.03E0,-0.09E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + (0.09E0,0.03E0), (0.03E0,0.12E0), + (0.12E0,0.03E0), (0.03E0,0.06E0), (2.0E0,3.0E0), + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ DATA 
((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + (4.0E0,5.0E0), (0.09E0,-0.12E0), (6.0E0,7.0E0), + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + (0.03E0,-0.09E0), (8.0E0,9.0E0), + (0.15E0,-0.03E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + (0.03E0,0.03E0), (3.0E0,6.0E0), + (-0.18E0,0.03E0), (4.0E0,7.0E0), + (0.03E0,-0.09E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + (7.0E0,2.0E0), (0.09E0,0.03E0), (5.0E0,8.0E0), + (0.03E0,0.12E0), (6.0E0,9.0E0), (0.12E0,0.03E0), + (8.0E0,3.0E0), (0.03E0,0.06E0), (9.0E0,4.0E0)/ DATA ITRUE3/0, 1, 2, 2, 2/ * .. Executable Statements .. DO 60 INCX = 1, 2 DO 40 NP1 = 1, 5 N = NP1 - 1 LEN = 2*MAX(N,1) * .. Set vector arguments .. DO 20 I = 1, LEN CX(I) = CV(I,NP1,INCX) 20 CONTINUE IF (ICASE.EQ.6) THEN * .. SCNRM2 .. CALL STEST1(SCNRM2(N,CX,INCX),STRUE2(NP1),STRUE2(NP1), + SFAC) ELSE IF (ICASE.EQ.7) THEN * .. SCASUM .. CALL STEST1(SCASUM(N,CX,INCX),STRUE4(NP1),STRUE4(NP1), + SFAC) ELSE IF (ICASE.EQ.8) THEN * .. CSCAL .. CALL CSCAL(N,CA,CX,INCX) CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), + SFAC) ELSE IF (ICASE.EQ.9) THEN * .. CSSCAL .. CALL CSSCAL(N,SA,CX,INCX) CALL CTEST(LEN,CX,CTRUE6(1,NP1,INCX),CTRUE6(1,NP1,INCX), + SFAC) ELSE IF (ICASE.EQ.10) THEN * .. ICAMAX .. CALL ITEST1(ICAMAX(N,CX,INCX),ITRUE3(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' STOP END IF * 40 CONTINUE 60 CONTINUE * INCX = 1 IF (ICASE.EQ.8) THEN * CSCAL * Add a test for alpha equal to zero. CA = (0.0E0,0.0E0) DO 80 I = 1, 5 MWPCT(I) = (0.0E0,0.0E0) MWPCS(I) = (1.0E0,1.0E0) 80 CONTINUE CALL CSCAL(5,CA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) ELSE IF (ICASE.EQ.9) THEN * CSSCAL * Add a test for alpha equal to zero. SA = 0.0E0 DO 100 I = 1, 5 MWPCT(I) = (0.0E0,0.0E0) MWPCS(I) = (1.0E0,1.0E0) 100 CONTINUE CALL CSSCAL(5,SA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) * Add a test for alpha equal to one. SA = 1.0E0 DO 120 I = 1, 5 MWPCT(I) = CX(I) MWPCS(I) = CX(I) 120 CONTINUE CALL CSSCAL(5,SA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) * Add a test for alpha equal to minus one. SA = -1.0E0 DO 140 I = 1, 5 MWPCT(I) = -CX(I) MWPCS(I) = -CX(I) 140 CONTINUE CALL CSSCAL(5,SA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) END IF RETURN END SUBROUTINE CHECK2(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. REAL SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. COMPLEX CA INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY * .. Local Arrays .. COMPLEX CDOT(1), CSIZE1(4), CSIZE2(7,2), CSIZE3(14), + CT10X(7,4,4), CT10Y(7,4,4), CT6(4,4), CT7(4,4), + CT8(7,4,4), CX(7), CX1(7), CY(7), CY1(7) INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) * .. External Functions .. COMPLEX CDOTC, CDOTU EXTERNAL CDOTC, CDOTU * .. External Subroutines .. EXTERNAL CAXPY, CCOPY, CSWAP, CTEST * .. Intrinsic Functions .. INTRINSIC ABS, MIN * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. 
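*     Added explanatory comment (not in the original test source):
*     CX1 and CY1 are the base input vectors; INCXS/INCYS, NS and LENS
*     define the increment, length and storage-length combinations tested.
*     CT6 and CT7 hold the expected CDOTC and CDOTU results, CT8 the
*     expected Y after CAXPY, CT10Y the expected Y after CCOPY and CSWAP,
*     and CT10X the expected X after CSWAP.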
DATA CA/(0.4E0,-0.7E0)/ DATA INCXS/1, 2, -2, -1/ DATA INCYS/1, -2, 1, -2/ DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ DATA NS/0, 1, 2, 4/ DATA CX1/(0.7E0,-0.8E0), (-0.4E0,-0.7E0), + (-0.1E0,-0.9E0), (0.2E0,-0.8E0), + (-0.9E0,-0.4E0), (0.1E0,0.4E0), (-0.6E0,0.6E0)/ DATA CY1/(0.6E0,-0.6E0), (-0.9E0,0.5E0), + (0.7E0,-0.6E0), (0.1E0,-0.5E0), (-0.1E0,-0.2E0), + (-0.5E0,-0.3E0), (0.8E0,-0.7E0)/ DATA ((CT8(I,J,1),I=1,7),J=1,4)/(0.6E0,-0.6E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.32E0,-1.41E0), + (-1.55E0,0.5E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.32E0,-1.41E0), (-1.55E0,0.5E0), + (0.03E0,-0.89E0), (-0.38E0,-0.96E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ DATA ((CT8(I,J,2),I=1,7),J=1,4)/(0.6E0,-0.6E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (-0.07E0,-0.89E0), + (-0.9E0,0.5E0), (0.42E0,-1.41E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.78E0,0.06E0), (-0.9E0,0.5E0), + (0.06E0,-0.13E0), (0.1E0,-0.5E0), + (-0.77E0,-0.49E0), (-0.5E0,-0.3E0), + (0.52E0,-1.51E0)/ DATA ((CT8(I,J,3),I=1,7),J=1,4)/(0.6E0,-0.6E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (-0.07E0,-0.89E0), + (-1.18E0,-0.31E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.78E0,0.06E0), (-1.54E0,0.97E0), + (0.03E0,-0.89E0), (-0.18E0,-1.31E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ DATA ((CT8(I,J,4),I=1,7),J=1,4)/(0.6E0,-0.6E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.32E0,-1.41E0), (-0.9E0,0.5E0), + (0.05E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.32E0,-1.41E0), + (-0.9E0,0.5E0), (0.05E0,-0.6E0), (0.1E0,-0.5E0), + (-0.77E0,-0.49E0), (-0.5E0,-0.3E0), + (0.32E0,-1.16E0)/ DATA CT7/(0.0E0,0.0E0), (-0.06E0,-0.90E0), + (0.65E0,-0.47E0), (-0.34E0,-1.22E0), + (0.0E0,0.0E0), (-0.06E0,-0.90E0), + (-0.59E0,-1.46E0), (-1.04E0,-0.04E0), + (0.0E0,0.0E0), (-0.06E0,-0.90E0), + (-0.83E0,0.59E0), (0.07E0,-0.37E0), + (0.0E0,0.0E0), (-0.06E0,-0.90E0), + (-0.76E0,-1.15E0), (-1.33E0,-1.82E0)/ DATA CT6/(0.0E0,0.0E0), (0.90E0,0.06E0), + (0.91E0,-0.77E0), (1.80E0,-0.10E0), + (0.0E0,0.0E0), (0.90E0,0.06E0), (1.45E0,0.74E0), + (0.20E0,0.90E0), (0.0E0,0.0E0), (0.90E0,0.06E0), + (-0.55E0,0.23E0), (0.83E0,-0.39E0), + (0.0E0,0.0E0), (0.90E0,0.06E0), (1.04E0,0.79E0), + (1.95E0,1.22E0)/ DATA ((CT10X(I,J,1),I=1,7),J=1,4)/(0.7E0,-0.8E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.6E0,-0.6E0), (-0.9E0,0.5E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.6E0,-0.6E0), + (-0.9E0,0.5E0), (0.7E0,-0.6E0), (0.1E0,-0.5E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ DATA ((CT10X(I,J,2),I=1,7),J=1,4)/(0.7E0,-0.8E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + 
(0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.7E0,-0.6E0), (-0.4E0,-0.7E0), + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.8E0,-0.7E0), + (-0.4E0,-0.7E0), (-0.1E0,-0.2E0), + (0.2E0,-0.8E0), (0.7E0,-0.6E0), (0.1E0,0.4E0), + (0.6E0,-0.6E0)/ DATA ((CT10X(I,J,3),I=1,7),J=1,4)/(0.7E0,-0.8E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (-0.9E0,0.5E0), (-0.4E0,-0.7E0), + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.1E0,-0.5E0), + (-0.4E0,-0.7E0), (0.7E0,-0.6E0), (0.2E0,-0.8E0), + (-0.9E0,0.5E0), (0.1E0,0.4E0), (0.6E0,-0.6E0)/ DATA ((CT10X(I,J,4),I=1,7),J=1,4)/(0.7E0,-0.8E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.6E0,-0.6E0), (0.7E0,-0.6E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.6E0,-0.6E0), + (0.7E0,-0.6E0), (-0.1E0,-0.2E0), (0.8E0,-0.7E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ DATA ((CT10Y(I,J,1),I=1,7),J=1,4)/(0.6E0,-0.6E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.7E0,-0.8E0), (-0.4E0,-0.7E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.7E0,-0.8E0), + (-0.4E0,-0.7E0), (-0.1E0,-0.9E0), + (0.2E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0)/ DATA ((CT10Y(I,J,2),I=1,7),J=1,4)/(0.6E0,-0.6E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (-0.1E0,-0.9E0), (-0.9E0,0.5E0), + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (-0.6E0,0.6E0), + (-0.9E0,0.5E0), (-0.9E0,-0.4E0), (0.1E0,-0.5E0), + (-0.1E0,-0.9E0), (-0.5E0,-0.3E0), + (0.7E0,-0.8E0)/ DATA ((CT10Y(I,J,3),I=1,7),J=1,4)/(0.6E0,-0.6E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (-0.1E0,-0.9E0), (0.7E0,-0.8E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (-0.6E0,0.6E0), + (-0.9E0,-0.4E0), (-0.1E0,-0.9E0), + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0)/ DATA ((CT10Y(I,J,4),I=1,7),J=1,4)/(0.6E0,-0.6E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.7E0,-0.8E0), (-0.9E0,0.5E0), + (-0.4E0,-0.7E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.7E0,-0.8E0), + (-0.9E0,0.5E0), (-0.4E0,-0.7E0), (0.1E0,-0.5E0), + (-0.1E0,-0.9E0), (-0.5E0,-0.3E0), + (0.2E0,-0.8E0)/ DATA CSIZE1/(0.0E0,0.0E0), (0.9E0,0.9E0), + (1.63E0,1.73E0), (2.90E0,2.78E0)/ DATA CSIZE3/(0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (1.17E0,1.17E0), + (1.17E0,1.17E0), 
(1.17E0,1.17E0), + (1.17E0,1.17E0), (1.17E0,1.17E0), + (1.17E0,1.17E0), (1.17E0,1.17E0)/ DATA CSIZE2/(0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + (0.0E0,0.0E0), (0.0E0,0.0E0), (1.54E0,1.54E0), + (1.54E0,1.54E0), (1.54E0,1.54E0), + (1.54E0,1.54E0), (1.54E0,1.54E0), + (1.54E0,1.54E0), (1.54E0,1.54E0)/ * .. Executable Statements .. DO 60 KI = 1, 4 INCX = INCXS(KI) INCY = INCYS(KI) MX = ABS(INCX) MY = ABS(INCY) * DO 40 KN = 1, 4 N = NS(KN) KSIZE = MIN(2,KN) LENX = LENS(KN,MX) LENY = LENS(KN,MY) * .. initialize all argument arrays .. DO 20 I = 1, 7 CX(I) = CX1(I) CY(I) = CY1(I) 20 CONTINUE IF (ICASE.EQ.1) THEN * .. CDOTC .. CDOT(1) = CDOTC(N,CX,INCX,CY,INCY) CALL CTEST(1,CDOT,CT6(KN,KI),CSIZE1(KN),SFAC) ELSE IF (ICASE.EQ.2) THEN * .. CDOTU .. CDOT(1) = CDOTU(N,CX,INCX,CY,INCY) CALL CTEST(1,CDOT,CT7(KN,KI),CSIZE1(KN),SFAC) ELSE IF (ICASE.EQ.3) THEN * .. CAXPY .. CALL CAXPY(N,CA,CX,INCX,CY,INCY) CALL CTEST(LENY,CY,CT8(1,KN,KI),CSIZE2(1,KSIZE),SFAC) ELSE IF (ICASE.EQ.4) THEN * .. CCOPY .. CALL CCOPY(N,CX,INCX,CY,INCY) CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0) ELSE IF (ICASE.EQ.5) THEN * .. CSWAP .. CALL CSWAP(N,CX,INCX,CY,INCY) CALL CTEST(LENX,CX,CT10X(1,KN,KI),CSIZE3,1.0E0) CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' STOP END IF * 40 CONTINUE 60 CONTINUE RETURN END SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) * ********************************* STEST ************************** * * THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO * SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE * NEGLIGIBLE. * * C. L. LAWSON, JPL, 1974 DEC 10 * * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. REAL SFAC INTEGER LEN * .. Array Arguments .. REAL SCOMP(LEN), SSIZE(LEN), STRUE(LEN) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. REAL SD INTEGER I * .. External Functions .. REAL SDIFF EXTERNAL SDIFF * .. Intrinsic Functions .. INTRINSIC ABS * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Executable Statements .. * DO 40 I = 1, LEN SD = SCOMP(I) - STRUE(I) IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0E0) + GO TO 40 * * HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). * IF ( .NOT. PASS) GO TO 20 * PRINT FAIL MESSAGE AND HEADER. PASS = .FALSE. WRITE (NOUT,99999) WRITE (NOUT,99998) 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + STRUE(I), SD, SSIZE(I) 40 CONTINUE RETURN * 99999 FORMAT (' FAIL') 99998 FORMAT (/' CASE N INCX INCY MODE I ', + ' COMP(I) TRUE(I) DIFFERENCE', + ' SIZE(I)',/1X) 99997 FORMAT (1X,I4,I3,3I5,I3,2E36.8,2E12.4) END SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * * THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * * C.L. LAWSON, JPL, 1978 DEC 6 * * .. Scalar Arguments .. REAL SCOMP1, SFAC, STRUE1 * .. Array Arguments .. REAL SSIZE(*) * .. Local Arrays .. REAL SCOMP(1), STRUE(1) * .. External Subroutines .. EXTERNAL STEST * .. Executable Statements .. * SCOMP(1) = SCOMP1 STRUE(1) = STRUE1 CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) * RETURN END REAL FUNCTION SDIFF(SA,SB) * ********************************* SDIFF ************************** * COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 * * .. Scalar Arguments .. REAL SA, SB * .. Executable Statements .. 
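*     Added explanatory comment (not in the original test source): keeping
*     this trivial subtraction in a separate external function forces the
*     operands through REAL working precision, which is presumably why the
*     comparisons in STEST call SDIFF rather than subtracting in line.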
SDIFF = SA - SB RETURN END SUBROUTINE CTEST(LEN,CCOMP,CTRUE,CSIZE,SFAC) * **************************** CTEST ***************************** * * C.L. LAWSON, JPL, 1978 DEC 6 * * .. Scalar Arguments .. REAL SFAC INTEGER LEN * .. Array Arguments .. COMPLEX CCOMP(LEN), CSIZE(LEN), CTRUE(LEN) * .. Local Scalars .. INTEGER I * .. Local Arrays .. REAL SCOMP(20), SSIZE(20), STRUE(20) * .. External Subroutines .. EXTERNAL STEST * .. Intrinsic Functions .. INTRINSIC AIMAG, REAL * .. Executable Statements .. DO 20 I = 1, LEN SCOMP(2*I-1) = REAL(CCOMP(I)) SCOMP(2*I) = AIMAG(CCOMP(I)) STRUE(2*I-1) = REAL(CTRUE(I)) STRUE(2*I) = AIMAG(CTRUE(I)) SSIZE(2*I-1) = REAL(CSIZE(I)) SSIZE(2*I) = AIMAG(CSIZE(I)) 20 CONTINUE * CALL STEST(2*LEN,SCOMP,STRUE,SSIZE,SFAC) RETURN END SUBROUTINE ITEST1(ICOMP,ITRUE) * ********************************* ITEST1 ************************* * * THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR * EQUALITY. * C. L. LAWSON, JPL, 1974 DEC 10 * * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. INTEGER ICOMP, ITRUE * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. INTEGER ID * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Executable Statements .. IF (ICOMP.EQ.ITRUE) GO TO 40 * * HERE ICOMP IS NOT EQUAL TO ITRUE. * IF ( .NOT. PASS) GO TO 20 * PRINT FAIL MESSAGE AND HEADER. PASS = .FALSE. WRITE (NOUT,99999) WRITE (NOUT,99998) 20 ID = ICOMP - ITRUE WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID 40 CONTINUE RETURN * 99999 FORMAT (' FAIL') 99998 FORMAT (/' CASE N INCX INCY MODE ', + ' COMP TRUE DIFFERENCE', + /1X) 99997 FORMAT (1X,I4,I3,3I5,2I36,I12) END OpenBLAS-0.2.20/test/cblat2.dat000066400000000000000000000030121313527062700157600ustar00rootroot00000000000000'CBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE 6 UNIT NUMBER OF SUMMARY FILE 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. F LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 16.0 THRESHOLD VALUE OF TEST RATIO 7 NUMBER OF VALUES OF N 0 1 2 3 7 31 63 VALUES OF N 4 NUMBER OF VALUES OF K 0 1 2 4 VALUES OF K 4 NUMBER OF VALUES OF INCX AND INCY 1 2 -1 -2 VALUES OF INCX AND INCY 3 NUMBER OF VALUES OF ALPHA (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA 3 NUMBER OF VALUES OF BETA (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA CGEMV T PUT F FOR NO TEST. SAME COLUMNS. CGBMV T PUT F FOR NO TEST. SAME COLUMNS. CHEMV T PUT F FOR NO TEST. SAME COLUMNS. CHBMV T PUT F FOR NO TEST. SAME COLUMNS. CHPMV T PUT F FOR NO TEST. SAME COLUMNS. CTRMV T PUT F FOR NO TEST. SAME COLUMNS. CTBMV T PUT F FOR NO TEST. SAME COLUMNS. CTPMV T PUT F FOR NO TEST. SAME COLUMNS. CTRSV T PUT F FOR NO TEST. SAME COLUMNS. CTBSV T PUT F FOR NO TEST. SAME COLUMNS. CTPSV T PUT F FOR NO TEST. SAME COLUMNS. CGERC T PUT F FOR NO TEST. SAME COLUMNS. CGERU T PUT F FOR NO TEST. SAME COLUMNS. CHER T PUT F FOR NO TEST. SAME COLUMNS. CHPR T PUT F FOR NO TEST. SAME COLUMNS. CHER2 T PUT F FOR NO TEST. SAME COLUMNS. CHPR2 T PUT F FOR NO TEST. SAME COLUMNS. OpenBLAS-0.2.20/test/cblat2.f000066400000000000000000003420241313527062700154460ustar00rootroot00000000000000 PROGRAM CBLAT2 * * Test program for the COMPLEX Level 2 Blas. * * The program must be driven by a short data file. The first 18 records * of the file are read using list-directed input, the last 17 records * are read using the format ( A6, L2 ). 
An annotated example of a data * file can be obtained by deleting the first 3 characters from the * following 35 lines: * 'CBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE * 6 UNIT NUMBER OF SUMMARY FILE * 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 16.0 THRESHOLD VALUE OF TEST RATIO * 6 NUMBER OF VALUES OF N * 0 1 2 3 5 9 VALUES OF N * 4 NUMBER OF VALUES OF K * 0 1 2 4 VALUES OF K * 4 NUMBER OF VALUES OF INCX AND INCY * 1 2 -1 -2 VALUES OF INCX AND INCY * 3 NUMBER OF VALUES OF ALPHA * (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA * 3 NUMBER OF VALUES OF BETA * (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA * CGEMV T PUT F FOR NO TEST. SAME COLUMNS. * CGBMV T PUT F FOR NO TEST. SAME COLUMNS. * CHEMV T PUT F FOR NO TEST. SAME COLUMNS. * CHBMV T PUT F FOR NO TEST. SAME COLUMNS. * CHPMV T PUT F FOR NO TEST. SAME COLUMNS. * CTRMV T PUT F FOR NO TEST. SAME COLUMNS. * CTBMV T PUT F FOR NO TEST. SAME COLUMNS. * CTPMV T PUT F FOR NO TEST. SAME COLUMNS. * CTRSV T PUT F FOR NO TEST. SAME COLUMNS. * CTBSV T PUT F FOR NO TEST. SAME COLUMNS. * CTPSV T PUT F FOR NO TEST. SAME COLUMNS. * CGERC T PUT F FOR NO TEST. SAME COLUMNS. * CGERU T PUT F FOR NO TEST. SAME COLUMNS. * CHER T PUT F FOR NO TEST. SAME COLUMNS. * CHPR T PUT F FOR NO TEST. SAME COLUMNS. * CHER2 T PUT F FOR NO TEST. SAME COLUMNS. * CHPR2 T PUT F FOR NO TEST. SAME COLUMNS. * * See: * * Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. * An extended set of Fortran Basic Linear Algebra Subprograms. * * Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics * and Computer Science Division, Argonne National Laboratory, * 9700 South Cass Avenue, Argonne, Illinois 60439, US. * * Or * * NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms * Group Ltd., NAG Central Office, 256 Banbury Road, Oxford * OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st * Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. * * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 17 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) REAL RZERO, RHALF, RONE PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, $ NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. REAL EPS, ERR, THRESH INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, $ NOUT, NTRA LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, $ TSTERR CHARACTER*1 TRANS CHARACTER*6 SNAMET CHARACTER*32 SNAPS, SUMMRY * .. Local Arrays .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), $ X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( 2*NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) LOGICAL LTEST( NSUBS ) CHARACTER*6 SNAMES( NSUBS ) * .. External Functions .. REAL SDIFF LOGICAL LCE EXTERNAL SDIFF, LCE * .. External Subroutines .. EXTERNAL CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CCHK6, $ CCHKE, CMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. 
Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR COMMON /SRNAMC/SRNAMT * .. Data statements .. DATA SNAMES/'CGEMV ', 'CGBMV ', 'CHEMV ', 'CHBMV ', $ 'CHPMV ', 'CTRMV ', 'CTBMV ', 'CTPMV ', $ 'CTRSV ', 'CTBSV ', 'CTPSV ', 'CGERC ', $ 'CGERU ', 'CHER ', 'CHPR ', 'CHER2 ', $ 'CHPR2 '/ * .. Executable Statements .. * * Read name and unit number for summary output file and open file. * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. * READ( NIN, FMT = * )SNAPS READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI REWI = REWI.AND.TRACE * Read the flag that directs stopping on any failure. READ( NIN, FMT = * )SFATAL * Read the flag that indicates whether error exits are to be tested. READ( NIN, FMT = * )TSTERR * Read the threshold value of the test ratio READ( NIN, FMT = * )THRESH * * Read and check the parameter values for the tests. * * Values of N READ( NIN, FMT = * )NIDIM IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN WRITE( NOUT, FMT = 9997 )'N', NIDMAX GO TO 230 END IF READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) DO 10 I = 1, NIDIM IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN WRITE( NOUT, FMT = 9996 )NMAX GO TO 230 END IF 10 CONTINUE * Values of K READ( NIN, FMT = * )NKB IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN WRITE( NOUT, FMT = 9997 )'K', NKBMAX GO TO 230 END IF READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) DO 20 I = 1, NKB IF( KB( I ).LT.0 )THEN WRITE( NOUT, FMT = 9995 ) GO TO 230 END IF 20 CONTINUE * Values of INCX and INCY READ( NIN, FMT = * )NINC IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX GO TO 230 END IF READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) DO 30 I = 1, NINC IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN WRITE( NOUT, FMT = 9994 )INCMAX GO TO 230 END IF 30 CONTINUE * Values of ALPHA READ( NIN, FMT = * )NALF IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX GO TO 230 END IF READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) * Values of BETA READ( NIN, FMT = * )NBET IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX GO TO 230 END IF READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) * * Report values of parameters. * WRITE( NOUT, FMT = 9993 ) WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) IF( .NOT.TSTERR )THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )THRESH WRITE( NOUT, FMT = * ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 40 I = 1, NSUBS LTEST( I ) = .FALSE. 40 CONTINUE 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT DO 60 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET STOP 70 LTEST( I ) = LTESTT GO TO 50 * 80 CONTINUE CLOSE ( NIN ) * * Compute EPS (the machine precision). * EPS = RONE 90 CONTINUE IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) $ GO TO 100 EPS = RHALF*EPS GO TO 90 100 CONTINUE EPS = EPS + EPS WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of CMVCH using exact data. 
* N = MIN( 32, NMAX ) DO 120 J = 1, N DO 110 I = 1, N A( I, J ) = MAX( I - J + 1, 0 ) 110 CONTINUE X( J ) = J Y( J ) = ZERO 120 CONTINUE DO 130 J = 1, N YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 130 CONTINUE * YY holds the exact result. On exit from CMVCH YT holds * the result computed by CMVCH. TRANS = 'N' CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LCE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR STOP END IF TRANS = 'T' CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LCE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR STOP END IF * * Test each subroutine in turn. * DO 210 ISNUM = 1, NSUBS WRITE( NOUT, FMT = * ) IF( .NOT.LTEST( ISNUM ) )THEN * Subprogram is not to be tested. WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) ELSE SRNAMT = SNAMES( ISNUM ) * Test error exits. IF( TSTERR )THEN CALL CCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) WRITE( NOUT, FMT = * ) END IF * Test computations. INFOT = 0 OK = .TRUE. FATAL = .FALSE. GO TO ( 140, 140, 150, 150, 150, 160, 160, $ 160, 160, 160, 160, 170, 170, 180, $ 180, 190, 190 )ISNUM * Test CGEMV, 01, and CGBMV, 02. 140 CALL CCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G ) GO TO 200 * Test CHEMV, 03, CHBMV, 04, and CHPMV, 05. 150 CALL CCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G ) GO TO 200 * Test CTRMV, 06, CTBMV, 07, CTPMV, 08, * CTRSV, 09, CTBSV, 10, and CTPSV, 11. 160 CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z ) GO TO 200 * Test CGERC, 12, CGERU, 13. 170 CALL CCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z ) GO TO 200 * Test CHER, 14, and CHPR, 15. 180 CALL CCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z ) GO TO 200 * Test CHER2, 16, and CHPR2, 17. 
190 CALL CCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z ) * 200 IF( FATAL.AND.SFATAL ) $ GO TO 220 END IF 210 CONTINUE WRITE( NOUT, FMT = 9982 ) GO TO 240 * 220 CONTINUE WRITE( NOUT, FMT = 9981 ) GO TO 240 * 230 CONTINUE WRITE( NOUT, FMT = 9987 ) * 240 CONTINUE IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) STOP * 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', $ 'S THAN', F8.2 ) 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', $ 'THAN ', I2 ) 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', $ I2 ) 9993 FORMAT( ' TESTS OF THE COMPLEX LEVEL 2 BLAS', //' THE F', $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) 9992 FORMAT( ' FOR N ', 9I6 ) 9991 FORMAT( ' FOR K ', 7I6 ) 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) 9989 FORMAT( ' FOR ALPHA ', $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) 9988 FORMAT( ' FOR BETA ', $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', $ /' ******* TESTS ABANDONED *******' ) 9986 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', $ 'ESTS ABANDONED *******' ) 9985 FORMAT( ' ERROR IN CMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', $ 'ATED WRONGLY.', /' CMVCH WAS CALLED WITH TRANS = ', A1, $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' $ , /' ******* TESTS ABANDONED *******' ) 9984 FORMAT( A6, L2 ) 9983 FORMAT( 1X, A6, ' WAS NOT TESTED' ) 9982 FORMAT( /' END OF TESTS' ) 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) * * End of CBLAT2. * END SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, $ XS, Y, YY, YS, YT, G ) * * Tests CGEMV and CGBMV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX ZERO, HALF PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, $ NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. COMPLEX ALPHA, ALS, BETA, BLS, TRANSL REAL ERR, ERRMAX INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, $ NL, NS LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN CHARACTER*1 TRANS, TRANSS CHARACTER*3 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CGBMV, CGEMV, CMAKE, CMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. 
COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'NTC'/ * .. Executable Statements .. FULL = SNAME( 3: 3 ).EQ.'E' BANDED = SNAME( 3: 3 ).EQ.'B' * Define the number of arguments. IF( FULL )THEN NARGS = 11 ELSE IF( BANDED )THEN NARGS = 13 END IF * NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 120 IN = 1, NIDIM N = IDIM( IN ) ND = N/2 + 1 * DO 110 IM = 1, 2 IF( IM.EQ.1 ) $ M = MAX( N - ND, 0 ) IF( IM.EQ.2 ) $ M = MIN( N + ND, NMAX ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IKU = 1, NK IF( BANDED )THEN KU = KB( IKU ) KL = MAX( KU - 1, 0 ) ELSE KU = N - 1 KL = M - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = KL + KU + 1 ELSE LDA = M END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 LAA = LDA*N NULL = N.LE.0.OR.M.LE.0 * * Generate the matrix A. * TRANSL = ZERO CALL CMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, AA, $ LDA, KL, KU, RESET, TRANSL ) * DO 90 IC = 1, 3 TRANS = ICH( IC: IC ) TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' * IF( TRAN )THEN ML = N NL = M ELSE ML = M NL = N END IF * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*NL * * Generate the vector X. * TRANSL = HALF CALL CMAKE( 'GE', ' ', ' ', 1, NL, X, 1, XX, $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) IF( NL.GT.1 )THEN X( NL/2 ) = ZERO XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO END IF * DO 70 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*ML * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the vector Y. * TRANSL = ZERO CALL CMAKE( 'GE', ' ', ' ', 1, ML, Y, 1, $ YY, ABS( INCY ), 0, ML - 1, $ RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * TRANSS = TRANS MS = M NS = N KLS = KL KUS = KU ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX BLS = BETA DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ TRANS, M, N, ALPHA, LDA, INCX, BETA, $ INCY IF( REWI ) $ REWIND NTRA CALL CGEMV( TRANS, M, N, ALPHA, AA, $ LDA, XX, INCX, BETA, YY, $ INCY ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ TRANS, M, N, KL, KU, ALPHA, LDA, $ INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL CGBMV( TRANS, M, N, KL, KU, ALPHA, $ AA, LDA, XX, INCX, BETA, $ YY, INCY ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 130 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = TRANS.EQ.TRANSS ISAME( 2 ) = MS.EQ.M ISAME( 3 ) = NS.EQ.N IF( FULL )THEN ISAME( 4 ) = ALS.EQ.ALPHA ISAME( 5 ) = LCE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA ISAME( 7 ) = LCE( XS, XX, LX ) ISAME( 8 ) = INCXS.EQ.INCX ISAME( 9 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 10 ) = LCE( YS, YY, LY ) ELSE ISAME( 10 ) = LCERES( 'GE', ' ', 1, $ ML, YS, YY, $ ABS( INCY ) ) END IF ISAME( 11 ) = INCYS.EQ.INCY ELSE IF( BANDED )THEN ISAME( 4 ) = KLS.EQ.KL ISAME( 5 ) = KUS.EQ.KU ISAME( 6 ) = ALS.EQ.ALPHA ISAME( 7 ) = LCE( AS, AA, LAA ) ISAME( 8 ) = LDAS.EQ.LDA ISAME( 9 ) = LCE( XS, XX, LX ) ISAME( 10 ) = INCXS.EQ.INCX ISAME( 11 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 12 ) = LCE( YS, YY, LY ) ELSE ISAME( 12 ) = LCERES( 'GE', ' ', 1, $ ML, YS, YY, $ ABS( INCY ) ) END IF ISAME( 13 ) = INCYS.EQ.INCY END IF * * If data was incorrectly changed, report * and return. * SAME = .TRUE. 
DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 130 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL CMVCH( TRANS, M, N, ALPHA, A, $ NMAX, X, INCX, BETA, Y, $ INCY, YT, G, YY, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 130 ELSE * Avoid repeating tests with M.le.0 or * N.le.0. GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 140 * 130 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, TRANS, M, N, ALPHA, LDA, $ INCX, BETA, INCY ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANS, M, N, KL, KU, $ ALPHA, LDA, INCX, BETA, INCY END IF * 140 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 4( I3, ',' ), '(', $ F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', $ F4.1, '), Y,', I2, ') .' ) 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), '(', $ F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', $ F4.1, '), Y,', I2, ') .' ) 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK1. * END SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, $ XS, Y, YY, YS, YT, G ) * * Tests CHEMV, CHBMV and CHPMV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX ZERO, HALF PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, $ NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. COMPLEX ALPHA, ALS, BETA, BLS, TRANSL REAL ERR, ERRMAX INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, $ N, NARGS, NC, NK, NS LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME CHARACTER*1 UPLO, UPLOS CHARACTER*2 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CHBMV, CHEMV, CHPMV, CMAKE, CMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. 
FULL = SNAME( 3: 3 ).EQ.'E' BANDED = SNAME( 3: 3 ).EQ.'B' PACKED = SNAME( 3: 3 ).EQ.'P' * Define the number of arguments. IF( FULL )THEN NARGS = 10 ELSE IF( BANDED )THEN NARGS = 11 ELSE IF( PACKED )THEN NARGS = 9 END IF * NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 110 IN = 1, NIDIM N = IDIM( IN ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IK = 1, NK IF( BANDED )THEN K = KB( IK ) ELSE K = N - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = K + 1 ELSE LDA = N END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF NULL = N.LE.0 * DO 90 IC = 1, 2 UPLO = ICH( IC: IC ) * * Generate the matrix A. * TRANSL = ZERO CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, AA, $ LDA, K, K, RESET, TRANSL ) * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL CMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 70 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the vector Y. * TRANSL = ZERO CALL CMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, $ TRANSL ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * UPLOS = UPLO NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX BLS = BETA DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ UPLO, N, ALPHA, LDA, INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL CHEMV( UPLO, N, ALPHA, AA, LDA, XX, $ INCX, BETA, YY, INCY ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ UPLO, N, K, ALPHA, LDA, INCX, BETA, $ INCY IF( REWI ) $ REWIND NTRA CALL CHBMV( UPLO, N, K, ALPHA, AA, LDA, $ XX, INCX, BETA, YY, INCY ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ UPLO, N, ALPHA, INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL CHPMV( UPLO, N, ALPHA, AA, XX, INCX, $ BETA, YY, INCY ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. 
* ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N IF( FULL )THEN ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LCE( AS, AA, LAA ) ISAME( 5 ) = LDAS.EQ.LDA ISAME( 6 ) = LCE( XS, XX, LX ) ISAME( 7 ) = INCXS.EQ.INCX ISAME( 8 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 9 ) = LCE( YS, YY, LY ) ELSE ISAME( 9 ) = LCERES( 'GE', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 10 ) = INCYS.EQ.INCY ELSE IF( BANDED )THEN ISAME( 3 ) = KS.EQ.K ISAME( 4 ) = ALS.EQ.ALPHA ISAME( 5 ) = LCE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA ISAME( 7 ) = LCE( XS, XX, LX ) ISAME( 8 ) = INCXS.EQ.INCX ISAME( 9 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 10 ) = LCE( YS, YY, LY ) ELSE ISAME( 10 ) = LCERES( 'GE', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 11 ) = INCYS.EQ.INCY ELSE IF( PACKED )THEN ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LCE( AS, AA, LAA ) ISAME( 5 ) = LCE( XS, XX, LX ) ISAME( 6 ) = INCXS.EQ.INCX ISAME( 7 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 8 ) = LCE( YS, YY, LY ) ELSE ISAME( 8 ) = LCERES( 'GE', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 9 ) = INCYS.EQ.INCY END IF * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL CMVCH( 'N', N, N, ALPHA, A, NMAX, X, $ INCX, BETA, Y, INCY, YT, G, $ YY, EPS, ERR, FATAL, NOUT, $ .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 120 ELSE * Avoid repeating tests with N.le.0 GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, LDA, INCX, $ BETA, INCY ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, K, ALPHA, LDA, $ INCX, BETA, INCY ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, N, ALPHA, INCX, $ BETA, INCY END IF * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', $ F4.1, '), AP, X,', I2, ',(', F4.1, ',', F4.1, '), Y,', I2, $ ') .' ) 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), '(', $ F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', $ F4.1, '), Y,', I2, ') .' ) 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', $ F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', F4.1, '), ', $ 'Y,', I2, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK2. * END SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z ) * * Tests CTRMV, CTBMV, CTPMV, CTRSV, CTBSV and CTPSV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. 
* * .. Parameters .. COMPLEX ZERO, HALF, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), $ ONE = ( 1.0, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), $ XT( NMAX ), XX( NMAX*INCMAX ), Z( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. COMPLEX TRANSL REAL ERR, ERRMAX INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS CHARACTER*2 ICHD, ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CMAKE, CMVCH, CTBMV, CTBSV, CTPMV, CTPSV, $ CTRMV, CTRSV * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ * .. Executable Statements .. FULL = SNAME( 3: 3 ).EQ.'R' BANDED = SNAME( 3: 3 ).EQ.'B' PACKED = SNAME( 3: 3 ).EQ.'P' * Define the number of arguments. IF( FULL )THEN NARGS = 8 ELSE IF( BANDED )THEN NARGS = 9 ELSE IF( PACKED )THEN NARGS = 7 END IF * NC = 0 RESET = .TRUE. ERRMAX = RZERO * Set up zero vector for CMVCH. DO 10 I = 1, NMAX Z( I ) = ZERO 10 CONTINUE * DO 110 IN = 1, NIDIM N = IDIM( IN ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IK = 1, NK IF( BANDED )THEN K = KB( IK ) ELSE K = N - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = K + 1 ELSE LDA = N END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF NULL = N.LE.0 * DO 90 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * DO 80 ICT = 1, 3 TRANS = ICHT( ICT: ICT ) * DO 70 ICD = 1, 2 DIAG = ICHD( ICD: ICD ) * * Generate the matrix A. * TRANSL = ZERO CALL CMAKE( SNAME( 2: 3 ), UPLO, DIAG, N, N, A, $ NMAX, AA, LDA, K, K, RESET, TRANSL ) * DO 60 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL CMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, $ ABS( INCX ), 0, N - 1, RESET, $ TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS DIAGS = DIAG NS = N KS = K DO 20 I = 1, LAA AS( I ) = AA( I ) 20 CONTINUE LDAS = LDA DO 30 I = 1, LX XS( I ) = XX( I ) 30 CONTINUE INCXS = INCX * * Call the subroutine. 
* IF( SNAME( 4: 5 ).EQ.'MV' )THEN IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ UPLO, TRANS, DIAG, N, LDA, INCX IF( REWI ) $ REWIND NTRA CALL CTRMV( UPLO, TRANS, DIAG, N, AA, LDA, $ XX, INCX ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ UPLO, TRANS, DIAG, N, K, LDA, INCX IF( REWI ) $ REWIND NTRA CALL CTBMV( UPLO, TRANS, DIAG, N, K, AA, $ LDA, XX, INCX ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ UPLO, TRANS, DIAG, N, INCX IF( REWI ) $ REWIND NTRA CALL CTPMV( UPLO, TRANS, DIAG, N, AA, XX, $ INCX ) END IF ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ UPLO, TRANS, DIAG, N, LDA, INCX IF( REWI ) $ REWIND NTRA CALL CTRSV( UPLO, TRANS, DIAG, N, AA, LDA, $ XX, INCX ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ UPLO, TRANS, DIAG, N, K, LDA, INCX IF( REWI ) $ REWIND NTRA CALL CTBSV( UPLO, TRANS, DIAG, N, K, AA, $ LDA, XX, INCX ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ UPLO, TRANS, DIAG, N, INCX IF( REWI ) $ REWIND NTRA CALL CTPSV( UPLO, TRANS, DIAG, N, AA, XX, $ INCX ) END IF END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = TRANS.EQ.TRANSS ISAME( 3 ) = DIAG.EQ.DIAGS ISAME( 4 ) = NS.EQ.N IF( FULL )THEN ISAME( 5 ) = LCE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 7 ) = LCE( XS, XX, LX ) ELSE ISAME( 7 ) = LCERES( 'GE', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 8 ) = INCXS.EQ.INCX ELSE IF( BANDED )THEN ISAME( 5 ) = KS.EQ.K ISAME( 6 ) = LCE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 8 ) = LCE( XS, XX, LX ) ELSE ISAME( 8 ) = LCERES( 'GE', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 9 ) = INCXS.EQ.INCX ELSE IF( PACKED )THEN ISAME( 5 ) = LCE( AS, AA, LAA ) IF( NULL )THEN ISAME( 6 ) = LCE( XS, XX, LX ) ELSE ISAME( 6 ) = LCERES( 'GE', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 7 ) = INCXS.EQ.INCX END IF * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN IF( SNAME( 4: 5 ).EQ.'MV' )THEN * * Check the result. * CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, $ INCX, ZERO, Z, INCX, XT, G, $ XX, EPS, ERR, FATAL, NOUT, $ .TRUE. ) ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN * * Compute approximation to original vector. * DO 50 I = 1, N Z( I ) = XX( 1 + ( I - 1 )* $ ABS( INCX ) ) XX( 1 + ( I - 1 )*ABS( INCX ) ) $ = X( I ) 50 CONTINUE CALL CMVCH( TRANS, N, N, ONE, A, NMAX, Z, $ INCX, ZERO, X, INCX, XT, G, $ XX, EPS, ERR, FATAL, NOUT, $ .FALSE. ) END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 120 ELSE * Avoid repeating tests with N.le.0. GO TO 110 END IF * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, DIAG, N, LDA, $ INCX ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, DIAG, N, K, $ LDA, INCX ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, TRANS, DIAG, N, INCX END IF * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', AP, ', $ 'X,', I2, ') .' ) 9994 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), 2( I3, ',' ), $ ' A,', I3, ', X,', I2, ') .' ) 9993 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', A,', $ I3, ', X,', I2, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK3. * END SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z ) * * Tests CGERC and CGERU. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX ZERO, HALF, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), $ ONE = ( 1.0, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. COMPLEX ALPHA, ALS, TRANSL REAL ERR, ERRMAX INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, $ NC, ND, NS LOGICAL CONJ, NULL, RESET, SAME * .. Local Arrays .. COMPLEX W( 1 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CGERC, CGERU, CMAKE, CMVCH * .. Intrinsic Functions .. INTRINSIC ABS, CONJG, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Executable Statements .. CONJ = SNAME( 5: 5 ).EQ.'C' * Define the number of arguments. NARGS = 9 * NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 120 IN = 1, NIDIM N = IDIM( IN ) ND = N/2 + 1 * DO 110 IM = 1, 2 IF( IM.EQ.1 ) $ M = MAX( N - ND, 0 ) IF( IM.EQ.2 ) $ M = MIN( N + ND, NMAX ) * * Set LDA to 1 more than minimum value if room. LDA = M IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 110 LAA = LDA*N NULL = N.LE.0.OR.M.LE.0 * DO 100 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*M * * Generate the vector X. 
* TRANSL = HALF CALL CMAKE( 'GE', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), $ 0, M - 1, RESET, TRANSL ) IF( M.GT.1 )THEN X( M/2 ) = ZERO XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO END IF * DO 90 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * * Generate the vector Y. * TRANSL = ZERO CALL CMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN Y( N/2 ) = ZERO YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO END IF * DO 80 IA = 1, NALF ALPHA = ALF( IA ) * * Generate the matrix A. * TRANSL = ZERO CALL CMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * MS = M NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, $ ALPHA, INCX, INCY, LDA IF( CONJ )THEN IF( REWI ) $ REWIND NTRA CALL CGERC( M, N, ALPHA, XX, INCX, YY, INCY, AA, $ LDA ) ELSE IF( REWI ) $ REWIND NTRA CALL CGERU( M, N, ALPHA, XX, INCX, YY, INCY, AA, $ LDA ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 140 END IF * * See what data changed inside subroutine. * ISAME( 1 ) = MS.EQ.M ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LCE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX ISAME( 6 ) = LCE( YS, YY, LY ) ISAME( 7 ) = INCYS.EQ.INCY IF( NULL )THEN ISAME( 8 ) = LCE( AS, AA, LAA ) ELSE ISAME( 8 ) = LCERES( 'GE', ' ', M, N, AS, AA, $ LDA ) END IF ISAME( 9 ) = LDAS.EQ.LDA * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 140 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( INCX.GT.0 )THEN DO 50 I = 1, M Z( I ) = X( I ) 50 CONTINUE ELSE DO 60 I = 1, M Z( I ) = X( M - I + 1 ) 60 CONTINUE END IF DO 70 J = 1, N IF( INCY.GT.0 )THEN W( 1 ) = Y( J ) ELSE W( 1 ) = Y( N - J + 1 ) END IF IF( CONJ ) $ W( 1 ) = CONJG( W( 1 ) ) CALL CMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, $ ONE, A( 1, J ), 1, YT, G, $ AA( 1 + ( J - 1 )*LDA ), EPS, $ ERR, FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 130 70 CONTINUE ELSE * Avoid repeating tests with M.le.0 or N.le.0. GO TO 110 END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 150 * 130 CONTINUE WRITE( NOUT, FMT = 9995 )J * 140 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA * 150 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( I3, ',' ), '(', F4.1, ',', F4.1, $ '), X,', I2, ', Y,', I2, ', A,', I3, ') ', $ ' .' 
) 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK4. * END SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z ) * * Tests CHER and CHPR. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX ZERO, HALF, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), $ ONE = ( 1.0, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. COMPLEX ALPHA, TRANSL REAL ERR, ERRMAX, RALPHA, RALS INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER CHARACTER*1 UPLO, UPLOS CHARACTER*2 ICH * .. Local Arrays .. COMPLEX W( 1 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CHER, CHPR, CMAKE, CMVCH * .. Intrinsic Functions .. INTRINSIC ABS, CMPLX, CONJG, MAX, REAL * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 3: 3 ).EQ.'E' PACKED = SNAME( 3: 3 ).EQ.'P' * Define the number of arguments. IF( FULL )THEN NARGS = 7 ELSE IF( PACKED )THEN NARGS = 6 END IF * NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDA to 1 more than minimum value if room. LDA = N IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF * DO 90 IC = 1, 2 UPLO = ICH( IC: IC ) UPPER = UPLO.EQ.'U' * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL CMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), $ 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 70 IA = 1, NALF RALPHA = REAL( ALF( IA ) ) ALPHA = CMPLX( RALPHA, RZERO ) NULL = N.LE.0.OR.RALPHA.EQ.RZERO * * Generate the matrix A. * TRANSL = ZERO CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO NS = N RALS = RALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, $ RALPHA, INCX, LDA IF( REWI ) $ REWIND NTRA CALL CHER( UPLO, N, RALPHA, XX, INCX, AA, LDA ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, $ RALPHA, INCX IF( REWI ) $ REWIND NTRA CALL CHPR( UPLO, N, RALPHA, XX, INCX, AA ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. 
* ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = RALS.EQ.RALPHA ISAME( 4 ) = LCE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX IF( NULL )THEN ISAME( 6 ) = LCE( AS, AA, LAA ) ELSE ISAME( 6 ) = LCERES( SNAME( 2: 3 ), UPLO, N, N, AS, $ AA, LDA ) END IF IF( .NOT.PACKED )THEN ISAME( 7 ) = LDAS.EQ.LDA END IF * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 30 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 30 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( INCX.GT.0 )THEN DO 40 I = 1, N Z( I ) = X( I ) 40 CONTINUE ELSE DO 50 I = 1, N Z( I ) = X( N - I + 1 ) 50 CONTINUE END IF JA = 1 DO 60 J = 1, N W( 1 ) = CONJG( Z( J ) ) IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF CALL CMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, $ 1, ONE, A( JJ, J ), 1, YT, G, $ AA( JA ), EPS, ERR, FATAL, NOUT, $ .TRUE. ) IF( FULL )THEN IF( UPPER )THEN JA = JA + LDA ELSE JA = JA + LDA + 1 END IF ELSE JA = JA + LJ END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 110 60 CONTINUE ELSE * Avoid repeating tests if N.le.0. IF( N.LE.0 ) $ GO TO 100 END IF * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 110 CONTINUE WRITE( NOUT, FMT = 9995 )J * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, RALPHA, INCX, LDA ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, RALPHA, INCX END IF * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', $ I2, ', AP) .' ) 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', $ I2, ', A,', I3, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK5. * END SUBROUTINE CCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z ) * * Tests CHER2 and CHPR2. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX ZERO, HALF, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), $ ONE = ( 1.0, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) REAL G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. 
COMPLEX ALPHA, ALS, TRANSL REAL ERR, ERRMAX INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, $ NARGS, NC, NS LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER CHARACTER*1 UPLO, UPLOS CHARACTER*2 ICH * .. Local Arrays .. COMPLEX W( 2 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CHER2, CHPR2, CMAKE, CMVCH * .. Intrinsic Functions .. INTRINSIC ABS, CONJG, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 3: 3 ).EQ.'E' PACKED = SNAME( 3: 3 ).EQ.'P' * Define the number of arguments. IF( FULL )THEN NARGS = 9 ELSE IF( PACKED )THEN NARGS = 8 END IF * NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 140 IN = 1, NIDIM N = IDIM( IN ) * Set LDA to 1 more than minimum value if room. LDA = N IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 140 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF * DO 130 IC = 1, 2 UPLO = ICH( IC: IC ) UPPER = UPLO.EQ.'U' * DO 120 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL CMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), $ 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 110 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * * Generate the vector Y. * TRANSL = ZERO CALL CMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN Y( N/2 ) = ZERO YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO END IF * DO 100 IA = 1, NALF ALPHA = ALF( IA ) NULL = N.LE.0.OR.ALPHA.EQ.ZERO * * Generate the matrix A. * TRANSL = ZERO CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, $ NMAX, AA, LDA, N - 1, N - 1, RESET, $ TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, $ ALPHA, INCX, INCY, LDA IF( REWI ) $ REWIND NTRA CALL CHER2( UPLO, N, ALPHA, XX, INCX, YY, INCY, $ AA, LDA ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, $ ALPHA, INCX, INCY IF( REWI ) $ REWIND NTRA CALL CHPR2( UPLO, N, ALPHA, XX, INCX, YY, INCY, $ AA ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 160 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LCE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX ISAME( 6 ) = LCE( YS, YY, LY ) ISAME( 7 ) = INCYS.EQ.INCY IF( NULL )THEN ISAME( 8 ) = LCE( AS, AA, LAA ) ELSE ISAME( 8 ) = LCERES( SNAME( 2: 3 ), UPLO, N, N, $ AS, AA, LDA ) END IF IF( .NOT.PACKED )THEN ISAME( 9 ) = LDAS.EQ.LDA END IF * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 160 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. 
* IF( INCX.GT.0 )THEN DO 50 I = 1, N Z( I, 1 ) = X( I ) 50 CONTINUE ELSE DO 60 I = 1, N Z( I, 1 ) = X( N - I + 1 ) 60 CONTINUE END IF IF( INCY.GT.0 )THEN DO 70 I = 1, N Z( I, 2 ) = Y( I ) 70 CONTINUE ELSE DO 80 I = 1, N Z( I, 2 ) = Y( N - I + 1 ) 80 CONTINUE END IF JA = 1 DO 90 J = 1, N W( 1 ) = ALPHA*CONJG( Z( J, 2 ) ) W( 2 ) = CONJG( ALPHA )*CONJG( Z( J, 1 ) ) IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF CALL CMVCH( 'N', LJ, 2, ONE, Z( JJ, 1 ), $ NMAX, W, 1, ONE, A( JJ, J ), 1, $ YT, G, AA( JA ), EPS, ERR, FATAL, $ NOUT, .TRUE. ) IF( FULL )THEN IF( UPPER )THEN JA = JA + LDA ELSE JA = JA + LDA + 1 END IF ELSE JA = JA + LJ END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 150 90 CONTINUE ELSE * Avoid repeating tests with N.le.0. IF( N.LE.0 ) $ GO TO 140 END IF * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * 140 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 170 * 150 CONTINUE WRITE( NOUT, FMT = 9995 )J * 160 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX, $ INCY, LDA ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX, INCY END IF * 170 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', $ F4.1, '), X,', I2, ', Y,', I2, ', AP) ', $ ' .' ) 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', $ F4.1, '), X,', I2, ', Y,', I2, ', A,', I3, ') ', $ ' .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK6. * END SUBROUTINE CCHKE( ISNUM, SRNAMT, NOUT ) * * Tests the error exits from the Level 2 Blas. * Requires a special version of the error-handling routine XERBLA. * ALPHA, RALPHA, BETA, A, X and Y should not need to be defined. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Local Scalars .. COMPLEX ALPHA, BETA REAL RALPHA * .. Local Arrays .. COMPLEX A( 1, 1 ), X( 1 ), Y( 1 ) * .. External Subroutines .. EXTERNAL CGBMV, CGEMV, CGERC, CGERU, CHBMV, CHEMV, CHER, $ CHER2, CHKXER, CHPMV, CHPR, CHPR2, CTBMV, $ CTBSV, CTPMV, CTPSV, CTRMV, CTRSV * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Executable Statements .. * OK is set to .FALSE. by the special version of XERBLA or by CHKXER * if anything is wrong. OK = .TRUE. * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. 
GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, $ 90, 100, 110, 120, 130, 140, 150, 160, $ 170 )ISNUM 10 INFOT = 1 CALL CGEMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CGEMV( 'N', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CGEMV( 'N', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CGEMV( 'N', 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CGEMV( 'N', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CGEMV( 'N', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 20 INFOT = 1 CALL CGBMV( '/', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CGBMV( 'N', -1, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CGBMV( 'N', 0, -1, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CGBMV( 'N', 0, 0, -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CGBMV( 'N', 2, 0, 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CGBMV( 'N', 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 30 INFOT = 1 CALL CHEMV( '/', 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CHEMV( 'U', -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CHEMV( 'U', 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHEMV( 'U', 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CHEMV( 'U', 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 40 INFOT = 1 CALL CHBMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CHBMV( 'U', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHBMV( 'U', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CHBMV( 'U', 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CHBMV( 'U', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CHBMV( 'U', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 50 INFOT = 1 CALL CHPMV( '/', 0, ALPHA, A, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CHPMV( 'U', -1, ALPHA, A, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CHPMV( 'U', 0, ALPHA, A, X, 0, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHPMV( 'U', 0, ALPHA, A, X, 1, BETA, Y, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 60 INFOT = 1 CALL CTRMV( '/', 'N', 'N', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CTRMV( 'U', '/', 'N', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, 
NOUT, LERR, OK ) INFOT = 3 CALL CTRMV( 'U', 'N', '/', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CTRMV( 'U', 'N', 'N', -1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMV( 'U', 'N', 'N', 2, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CTRMV( 'U', 'N', 'N', 0, A, 1, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 70 INFOT = 1 CALL CTBMV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CTBMV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CTBMV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CTBMV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTBMV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CTBMV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTBMV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 80 INFOT = 1 CALL CTPMV( '/', 'N', 'N', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CTPMV( 'U', '/', 'N', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CTPMV( 'U', 'N', '/', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CTPMV( 'U', 'N', 'N', -1, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CTPMV( 'U', 'N', 'N', 0, A, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 90 INFOT = 1 CALL CTRSV( '/', 'N', 'N', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CTRSV( 'U', '/', 'N', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CTRSV( 'U', 'N', '/', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CTRSV( 'U', 'N', 'N', -1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSV( 'U', 'N', 'N', 2, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CTRSV( 'U', 'N', 'N', 0, A, 1, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 100 INFOT = 1 CALL CTBSV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CTBSV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CTBSV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CTBSV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTBSV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CTBSV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTBSV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 110 INFOT = 1 CALL CTPSV( '/', 'N', 'N', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CTPSV( 'U', '/', 'N', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CTPSV( 'U', 'N', '/', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CTPSV( 'U', 'N', 'N', -1, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CTPSV( 'U', 'N', 'N', 0, A, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 120 INFOT = 1 CALL CGERC( -1, 0, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) 
INFOT = 2 CALL CGERC( 0, -1, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CGERC( 0, 0, ALPHA, X, 0, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CGERC( 0, 0, ALPHA, X, 1, Y, 0, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CGERC( 2, 0, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 130 INFOT = 1 CALL CGERU( -1, 0, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CGERU( 0, -1, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CGERU( 0, 0, ALPHA, X, 0, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CGERU( 0, 0, ALPHA, X, 1, Y, 0, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CGERU( 2, 0, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 140 INFOT = 1 CALL CHER( '/', 0, RALPHA, X, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CHER( 'U', -1, RALPHA, X, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CHER( 'U', 0, RALPHA, X, 0, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHER( 'U', 2, RALPHA, X, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 150 INFOT = 1 CALL CHPR( '/', 0, RALPHA, X, 1, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CHPR( 'U', -1, RALPHA, X, 1, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CHPR( 'U', 0, RALPHA, X, 0, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 160 INFOT = 1 CALL CHER2( '/', 0, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CHER2( 'U', -1, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CHER2( 'U', 0, ALPHA, X, 0, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHER2( 'U', 0, ALPHA, X, 1, Y, 0, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHER2( 'U', 2, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 170 INFOT = 1 CALL CHPR2( '/', 0, ALPHA, X, 1, Y, 1, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CHPR2( 'U', -1, ALPHA, X, 1, Y, 1, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CHPR2( 'U', 0, ALPHA, X, 0, Y, 1, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHPR2( 'U', 0, ALPHA, X, 1, Y, 0, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) * 180 IF( OK )THEN WRITE( NOUT, FMT = 9999 )SRNAMT ELSE WRITE( NOUT, FMT = 9998 )SRNAMT END IF RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', $ '**' ) * * End of CCHKE. * END SUBROUTINE CMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, $ KU, RESET, TRANSL ) * * Generates values for an M by N matrix A within the bandwidth * defined by KL and KU. * Stores the values in the array AA in the data structure required * by the routine, with unwanted elements set to rogue value. * * TYPE is 'GE', 'GB', 'HE', 'HB', 'HP', 'TR', 'TB' OR 'TP'. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. 
COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) COMPLEX ROGUE PARAMETER ( ROGUE = ( -1.0E10, 1.0E10 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) REAL RROGUE PARAMETER ( RROGUE = -1.0E10 ) * .. Scalar Arguments .. COMPLEX TRANSL INTEGER KL, KU, LDA, M, N, NMAX LOGICAL RESET CHARACTER*1 DIAG, UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX A( NMAX, * ), AA( * ) * .. Local Scalars .. INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, JJ, KK LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER * .. External Functions .. COMPLEX CBEG EXTERNAL CBEG * .. Intrinsic Functions .. INTRINSIC CMPLX, CONJG, MAX, MIN, REAL * .. Executable Statements .. GEN = TYPE( 1: 1 ).EQ.'G' SYM = TYPE( 1: 1 ).EQ.'H' TRI = TYPE( 1: 1 ).EQ.'T' UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' UNIT = TRI.AND.DIAG.EQ.'U' * * Generate data in array A. * DO 20 J = 1, N DO 10 I = 1, M IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) $ THEN IF( ( I.LE.J.AND.J - I.LE.KU ).OR. $ ( I.GE.J.AND.I - J.LE.KL ) )THEN A( I, J ) = CBEG( RESET ) + TRANSL ELSE A( I, J ) = ZERO END IF IF( I.NE.J )THEN IF( SYM )THEN A( J, I ) = CONJG( A( I, J ) ) ELSE IF( TRI )THEN A( J, I ) = ZERO END IF END IF END IF 10 CONTINUE IF( SYM ) $ A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO ) IF( TRI ) $ A( J, J ) = A( J, J ) + ONE IF( UNIT ) $ A( J, J ) = ONE 20 CONTINUE * * Store elements in array AS in data structure required by routine. * IF( TYPE.EQ.'GE' )THEN DO 50 J = 1, N DO 30 I = 1, M AA( I + ( J - 1 )*LDA ) = A( I, J ) 30 CONTINUE DO 40 I = M + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 40 CONTINUE 50 CONTINUE ELSE IF( TYPE.EQ.'GB' )THEN DO 90 J = 1, N DO 60 I1 = 1, KU + 1 - J AA( I1 + ( J - 1 )*LDA ) = ROGUE 60 CONTINUE DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) 70 CONTINUE DO 80 I3 = I2, LDA AA( I3 + ( J - 1 )*LDA ) = ROGUE 80 CONTINUE 90 CONTINUE ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'TR' )THEN DO 130 J = 1, N IF( UPPER )THEN IBEG = 1 IF( UNIT )THEN IEND = J - 1 ELSE IEND = J END IF ELSE IF( UNIT )THEN IBEG = J + 1 ELSE IBEG = J END IF IEND = N END IF DO 100 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 100 CONTINUE DO 110 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I, J ) 110 CONTINUE DO 120 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 120 CONTINUE IF( SYM )THEN JJ = J + ( J - 1 )*LDA AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) END IF 130 CONTINUE ELSE IF( TYPE.EQ.'HB'.OR.TYPE.EQ.'TB' )THEN DO 170 J = 1, N IF( UPPER )THEN KK = KL + 1 IBEG = MAX( 1, KL + 2 - J ) IF( UNIT )THEN IEND = KL ELSE IEND = KL + 1 END IF ELSE KK = 1 IF( UNIT )THEN IBEG = 2 ELSE IBEG = 1 END IF IEND = MIN( KL + 1, 1 + M - J ) END IF DO 140 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 140 CONTINUE DO 150 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) 150 CONTINUE DO 160 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 160 CONTINUE IF( SYM )THEN JJ = KK + ( J - 1 )*LDA AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) END IF 170 CONTINUE ELSE IF( TYPE.EQ.'HP'.OR.TYPE.EQ.'TP' )THEN IOFF = 0 DO 190 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 180 I = IBEG, IEND IOFF = IOFF + 1 AA( IOFF ) = A( I, J ) IF( I.EQ.J )THEN IF( UNIT ) $ AA( IOFF ) = ROGUE IF( SYM ) $ AA( IOFF ) = CMPLX( REAL( AA( IOFF ) ), RROGUE ) END IF 180 CONTINUE 190 CONTINUE END IF RETURN * * End of CMAKE. 
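*
*     Informal note on the band storage handled above (a gloss on the
*     index mapping only, assuming LDA .GE. KL + KU + 1; it adds no
*     test logic): for TYPE = 'GB' the loop over I2 stores each
*     in-band element A( i, j ) of the full matrix at packed position
*
*        AA( KU + 1 + i - j + ( j - 1 )*LDA )
*
*     so column j of AA holds the band column with the diagonal in
*     row KU + 1.  For example, with KU = 1 the superdiagonal element
*     A( 1, 2 ) lands in AA( 1 + LDA ).  Positions of AA outside the
*     band are filled with ROGUE, which helps the LCE and LCERES
*     comparisons in the check routines detect writes outside the
*     region a routine is allowed to touch.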
* END SUBROUTINE CMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) * * Checks the results of the computational tests. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0, 0.0 ) ) REAL RZERO, RONE PARAMETER ( RZERO = 0.0, RONE = 1.0 ) * .. Scalar Arguments .. COMPLEX ALPHA, BETA REAL EPS, ERR INTEGER INCX, INCY, M, N, NMAX, NOUT LOGICAL FATAL, MV CHARACTER*1 TRANS * .. Array Arguments .. COMPLEX A( NMAX, * ), X( * ), Y( * ), YT( * ), YY( * ) REAL G( * ) * .. Local Scalars .. COMPLEX C REAL ERRI INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL LOGICAL CTRAN, TRAN * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CONJG, MAX, REAL, SQRT * .. Statement Functions .. REAL ABS1 * .. Statement Function definitions .. ABS1( C ) = ABS( REAL( C ) ) + ABS( AIMAG( C ) ) * .. Executable Statements .. TRAN = TRANS.EQ.'T' CTRAN = TRANS.EQ.'C' IF( TRAN.OR.CTRAN )THEN ML = N NL = M ELSE ML = M NL = N END IF IF( INCX.LT.0 )THEN KX = NL INCXL = -1 ELSE KX = 1 INCXL = 1 END IF IF( INCY.LT.0 )THEN KY = ML INCYL = -1 ELSE KY = 1 INCYL = 1 END IF * * Compute expected result in YT using data in A, X and Y. * Compute gauges in G. * IY = KY DO 40 I = 1, ML YT( IY ) = ZERO G( IY ) = RZERO JX = KX IF( TRAN )THEN DO 10 J = 1, NL YT( IY ) = YT( IY ) + A( J, I )*X( JX ) G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) JX = JX + INCXL 10 CONTINUE ELSE IF( CTRAN )THEN DO 20 J = 1, NL YT( IY ) = YT( IY ) + CONJG( A( J, I ) )*X( JX ) G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) JX = JX + INCXL 20 CONTINUE ELSE DO 30 J = 1, NL YT( IY ) = YT( IY ) + A( I, J )*X( JX ) G( IY ) = G( IY ) + ABS1( A( I, J ) )*ABS1( X( JX ) ) JX = JX + INCXL 30 CONTINUE END IF YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) G( IY ) = ABS1( ALPHA )*G( IY ) + ABS1( BETA )*ABS1( Y( IY ) ) IY = IY + INCYL 40 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO DO 50 I = 1, ML ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS IF( G( I ).NE.RZERO ) $ ERRI = ERRI/G( I ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ GO TO 60 50 CONTINUE * If the loop completes, all results are at least half accurate. GO TO 80 * * Report fatal error. * 60 FATAL = .TRUE. WRITE( NOUT, FMT = 9999 ) DO 70 I = 1, ML IF( MV )THEN WRITE( NOUT, FMT = 9998 )I, YT( I ), $ YY( 1 + ( I - 1 )*ABS( INCY ) ) ELSE WRITE( NOUT, FMT = 9998 )I, $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT( I ) END IF 70 CONTINUE * 80 CONTINUE RETURN * 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', $ 'F ACCURATE *******', /' EXPECTED RE', $ 'SULT COMPUTED RESULT' ) 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) * * End of CMVCH. * END LOGICAL FUNCTION LCE( RI, RJ, LR ) * * Tests if two arrays are identical. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER LR * .. Array Arguments .. COMPLEX RI( * ), RJ( * ) * .. Local Scalars .. INTEGER I * .. Executable Statements .. DO 10 I = 1, LR IF( RI( I ).NE.RJ( I ) ) $ GO TO 20 10 CONTINUE LCE = .TRUE. GO TO 30 20 CONTINUE LCE = .FALSE. 30 RETURN * * End of LCE. * END LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA ) * * Tests if selected elements in two arrays are equal. * * TYPE is 'GE', 'HE' or 'HP'. 
* * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER LDA, M, N CHARACTER*1 UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX AA( LDA, * ), AS( LDA, * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL UPPER * .. Executable Statements .. UPPER = UPLO.EQ.'U' IF( TYPE.EQ.'GE' )THEN DO 20 J = 1, N DO 10 I = M + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 10 CONTINUE 20 CONTINUE ELSE IF( TYPE.EQ.'HE' )THEN DO 50 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 30 I = 1, IBEG - 1 IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 30 CONTINUE DO 40 I = IEND + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 40 CONTINUE 50 CONTINUE END IF * 60 CONTINUE LCERES = .TRUE. GO TO 80 70 CONTINUE LCERES = .FALSE. 80 RETURN * * End of LCERES. * END COMPLEX FUNCTION CBEG( RESET ) * * Generates complex numbers as pairs of random numbers uniformly * distributed between -0.5 and 0.5. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. LOGICAL RESET * .. Local Scalars .. INTEGER I, IC, J, MI, MJ * .. Save statement .. SAVE I, IC, J, MI, MJ * .. Intrinsic Functions .. INTRINSIC CMPLX * .. Executable Statements .. IF( RESET )THEN * Initialize local variables. MI = 891 MJ = 457 I = 7 J = 7 IC = 0 RESET = .FALSE. END IF * * The sequence of values of I or J is bounded between 1 and 999. * If initial I or J = 1,2,3,6,7 or 9, the period will be 50. * If initial I or J = 4 or 8, the period will be 25. * If initial I or J = 5, the period will be 10. * IC is used to break up the period by skipping 1 value of I or J * in 6. * IC = IC + 1 10 I = I*MI J = J*MJ I = I - 1000*( I/1000 ) J = J - 1000*( J/1000 ) IF( IC.GE.5 )THEN IC = 0 GO TO 10 END IF CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) RETURN * * End of CBEG. * END REAL FUNCTION SDIFF( X, Y ) * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * * .. Scalar Arguments .. REAL X, Y * .. Executable Statements .. SDIFF = X - Y RETURN * * End of SDIFF. * END SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) * * Tests whether XERBLA has detected an error when it should. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER INFOT, NOUT LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Executable Statements .. IF( .NOT.LERR )THEN WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT OK = .FALSE. END IF LERR = .FALSE. RETURN * 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', $ 'ETECTED BY ', A6, ' *****' ) * * End of CHKXER. * END SUBROUTINE XERBLA( SRNAME, INFO ) * * This is a special version of XERBLA to be used only as part of * the test program for testing error exits from the Level 2 BLAS * routines. * * XERBLA is an error handler for the Level 2 BLAS routines. * * It is called by the Level 2 BLAS routines if an input parameter is * invalid. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER INFO CHARACTER*6 SRNAME * .. Scalars in Common .. 
INTEGER INFOT, NOUT LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUT, OK, LERR COMMON /SRNAMC/SRNAMT * .. Executable Statements .. LERR = .TRUE. IF( INFO.NE.INFOT )THEN IF( INFOT.NE.0 )THEN WRITE( NOUT, FMT = 9999 )INFO, INFOT ELSE WRITE( NOUT, FMT = 9997 )INFO END IF OK = .FALSE. END IF IF( SRNAME.NE.SRNAMT )THEN WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT OK = .FALSE. END IF RETURN * 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', $ ' OF ', I2, ' *******' ) 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', $ 'AD OF ', A6, ' *******' ) 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, $ ' *******' ) * * End of XERBLA * END OpenBLAS-0.2.20/test/cblat3.dat000066400000000000000000000020261313527062700157650ustar00rootroot00000000000000'CBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE 6 UNIT NUMBER OF SUMMARY FILE 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. F LOGICAL FLAG, T TO STOP ON FAILURES. F LOGICAL FLAG, T TO TEST ERROR EXITS. 16.0 THRESHOLD VALUE OF TEST RATIO 6 NUMBER OF VALUES OF N 0 1 2 3 7 31 63 VALUES OF N 3 NUMBER OF VALUES OF ALPHA (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA 3 NUMBER OF VALUES OF BETA (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA CGEMM T PUT F FOR NO TEST. SAME COLUMNS. CHEMM T PUT F FOR NO TEST. SAME COLUMNS. CSYMM T PUT F FOR NO TEST. SAME COLUMNS. CTRMM T PUT F FOR NO TEST. SAME COLUMNS. CTRSM T PUT F FOR NO TEST. SAME COLUMNS. CHERK T PUT F FOR NO TEST. SAME COLUMNS. CSYRK T PUT F FOR NO TEST. SAME COLUMNS. CHER2K T PUT F FOR NO TEST. SAME COLUMNS. CSYR2K T PUT F FOR NO TEST. SAME COLUMNS. OpenBLAS-0.2.20/test/cblat3.f000066400000000000000000003764041313527062700154600ustar00rootroot00000000000000 PROGRAM CBLAT3 * * Test program for the COMPLEX Level 3 Blas. * * The program must be driven by a short data file. The first 14 records * of the file are read using list-directed input, the last 9 records * are read using the format ( A6, L2 ). An annotated example of a data * file can be obtained by deleting the first 3 characters from the * following 23 lines: * 'CBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE * 6 UNIT NUMBER OF SUMMARY FILE * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 16.0 THRESHOLD VALUE OF TEST RATIO * 6 NUMBER OF VALUES OF N * 0 1 2 3 5 9 VALUES OF N * 3 NUMBER OF VALUES OF ALPHA * (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA * 3 NUMBER OF VALUES OF BETA * (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA * CGEMM T PUT F FOR NO TEST. SAME COLUMNS. * CHEMM T PUT F FOR NO TEST. SAME COLUMNS. * CSYMM T PUT F FOR NO TEST. SAME COLUMNS. * CTRMM T PUT F FOR NO TEST. SAME COLUMNS. * CTRSM T PUT F FOR NO TEST. SAME COLUMNS. * CHERK T PUT F FOR NO TEST. SAME COLUMNS. * CSYRK T PUT F FOR NO TEST. SAME COLUMNS. * CHER2K T PUT F FOR NO TEST. SAME COLUMNS. * CSYR2K T PUT F FOR NO TEST. SAME COLUMNS. * * See: * * Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. * A Set of Level 3 Basic Linear Algebra Subprograms. * * Technical Memorandum No.88 (Revision 1), Mathematics and * Computer Science Division, Argonne National Laboratory, 9700 * South Cass Avenue, Argonne, Illinois 60439, US. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. 
* Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 9 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) REAL RZERO, RHALF, RONE PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. REAL EPS, ERR, THRESH INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, $ TSTERR CHARACTER*1 TRANSA, TRANSB CHARACTER*6 SNAMET CHARACTER*32 SNAPS, SUMMRY * .. Local Arrays .. COMPLEX AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), $ ALF( NALMAX ), AS( NMAX*NMAX ), $ BB( NMAX*NMAX ), BET( NBEMAX ), $ BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ W( 2*NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDMAX ) LOGICAL LTEST( NSUBS ) CHARACTER*6 SNAMES( NSUBS ) * .. External Functions .. REAL SDIFF LOGICAL LCE EXTERNAL SDIFF, LCE * .. External Subroutines .. EXTERNAL CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CCHKE, CMMCH * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR COMMON /SRNAMC/SRNAMT * .. Data statements .. DATA SNAMES/'CGEMM ', 'CHEMM ', 'CSYMM ', 'CTRMM ', $ 'CTRSM ', 'CHERK ', 'CSYRK ', 'CHER2K', $ 'CSYR2K'/ * .. Executable Statements .. * * Read name and unit number for summary output file and open file. * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. * READ( NIN, FMT = * )SNAPS READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI REWI = REWI.AND.TRACE * Read the flag that directs stopping on any failure. READ( NIN, FMT = * )SFATAL * Read the flag that indicates whether error exits are to be tested. READ( NIN, FMT = * )TSTERR * Read the threshold value of the test ratio READ( NIN, FMT = * )THRESH * * Read and check the parameter values for the tests. * * Values of N READ( NIN, FMT = * )NIDIM IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN WRITE( NOUT, FMT = 9997 )'N', NIDMAX GO TO 220 END IF READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) DO 10 I = 1, NIDIM IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN WRITE( NOUT, FMT = 9996 )NMAX GO TO 220 END IF 10 CONTINUE * Values of ALPHA READ( NIN, FMT = * )NALF IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX GO TO 220 END IF READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) * Values of BETA READ( NIN, FMT = * )NBET IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX GO TO 220 END IF READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) * * Report values of parameters. * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) IF( .NOT.TSTERR )THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9984 ) END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )THRESH WRITE( NOUT, FMT = * ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 20 I = 1, NSUBS LTEST( I ) = .FALSE. 
20 CONTINUE 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT DO 40 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET STOP 50 LTEST( I ) = LTESTT GO TO 30 * 60 CONTINUE CLOSE ( NIN ) * * Compute EPS (the machine precision). * EPS = RONE 70 CONTINUE IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) $ GO TO 80 EPS = RHALF*EPS GO TO 70 80 CONTINUE EPS = EPS + EPS WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of CMMCH using exact data. * N = MIN( 32, NMAX ) DO 100 J = 1, N DO 90 I = 1, N AB( I, J ) = MAX( I - J + 1, 0 ) 90 CONTINUE AB( J, NMAX + 1 ) = J AB( 1, NMAX + J ) = J C( J, 1 ) = ZERO 100 CONTINUE DO 110 J = 1, N CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 110 CONTINUE * CC holds the exact result. On exit from CMMCH CT holds * the result computed by CMMCH. TRANSA = 'N' TRANSB = 'N' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 AB( 1, NMAX + J ) = N - J + 1 120 CONTINUE DO 130 J = 1, N CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - $ ( ( J + 1 )*J*( J - 1 ) )/3 130 CONTINUE TRANSA = 'C' TRANSB = 'N' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF * * Test each subroutine in turn. * DO 200 ISNUM = 1, NSUBS WRITE( NOUT, FMT = * ) IF( .NOT.LTEST( ISNUM ) )THEN * Subprogram is not to be tested. WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) ELSE SRNAMT = SNAMES( ISNUM ) * Test error exits. IF( TSTERR )THEN CALL CCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) WRITE( NOUT, FMT = * ) END IF * Test computations. INFOT = 0 OK = .TRUE. FATAL = .FALSE. GO TO ( 140, 150, 150, 160, 160, 170, 170, $ 180, 180 )ISNUM * Test CGEMM, 01. 140 CALL CCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G ) GO TO 190 * Test CHEMM, 02, CSYMM, 03. 150 CALL CCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G ) GO TO 190 * Test CTRMM, 04, CTRSM, 05. 160 CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C ) GO TO 190 * Test CHERK, 06, CSYRK, 07. 170 CALL CCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G ) GO TO 190 * Test CHER2K, 08, CSYR2K, 09. 
180 CALL CCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) GO TO 190 * 190 IF( FATAL.AND.SFATAL ) $ GO TO 210 END IF 200 CONTINUE WRITE( NOUT, FMT = 9986 ) GO TO 230 * 210 CONTINUE WRITE( NOUT, FMT = 9985 ) GO TO 230 * 220 CONTINUE WRITE( NOUT, FMT = 9991 ) * 230 CONTINUE IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) STOP * 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', $ 'S THAN', F8.2 ) 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', $ 'THAN ', I2 ) 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) 9995 FORMAT( ' TESTS OF THE COMPLEX LEVEL 3 BLAS', //' THE F', $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) 9994 FORMAT( ' FOR N ', 9I6 ) 9993 FORMAT( ' FOR ALPHA ', $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) 9992 FORMAT( ' FOR BETA ', $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', $ /' ******* TESTS ABANDONED *******' ) 9990 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', $ 'ESTS ABANDONED *******' ) 9989 FORMAT( ' ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', $ 'ATED WRONGLY.', /' CMMCH WAS CALLED WITH TRANSA = ', A1, $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', $ '*******' ) 9988 FORMAT( A6, L2 ) 9987 FORMAT( 1X, A6, ' WAS NOT TESTED' ) 9986 FORMAT( /' END OF TESTS' ) 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) * * End of CBLAT3. * END SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) * * Tests CGEMM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX ALPHA, ALS, BETA, BLS REAL ERR, ERRMAX INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, $ MA, MB, MS, N, NA, NARGS, NB, NC, NS LOGICAL NULL, RESET, SAME, TRANA, TRANB CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB CHARACTER*3 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CGEMM, CMAKE, CMMCH * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'NTC'/ * .. Executable Statements .. * NARGS = 13 NC = 0 RESET = .TRUE. 
ERRMAX = RZERO * DO 110 IM = 1, NIDIM M = IDIM( IM ) * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICA = 1, 3 TRANSA = ICH( ICA: ICA ) TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' * IF( TRANA )THEN MA = K NA = M ELSE MA = M NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. * CALL CMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICB = 1, 3 TRANSB = ICH( ICB: ICB ) TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' * IF( TRANB )THEN MB = N NB = K ELSE MB = K NB = N END IF * Set LDB to 1 more than minimum value if room. LDB = MB IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 70 LBB = LDB*NB * * Generate the matrix B. * CALL CMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, $ LDB, RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL CMAKE( 'GE', ' ', ' ', M, N, C, NMAX, $ CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * TRANAS = TRANSA TRANBS = TRANSB MS = M NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB, $ BETA, LDC IF( REWI ) $ REWIND NTRA CALL CGEMM( TRANSA, TRANSB, M, N, K, ALPHA, $ AA, LDA, BB, LDB, BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = TRANSA.EQ.TRANAS ISAME( 2 ) = TRANSB.EQ.TRANBS ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = KS.EQ.K ISAME( 6 ) = ALS.EQ.ALPHA ISAME( 7 ) = LCE( AS, AA, LAA ) ISAME( 8 ) = LDAS.EQ.LDA ISAME( 9 ) = LCE( BS, BB, LBB ) ISAME( 10 ) = LDBS.EQ.LDB ISAME( 11 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 12 ) = LCE( CS, CC, LCC ) ELSE ISAME( 12 ) = LCERES( 'GE', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 13 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report * and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL CMMCH( TRANSA, TRANSB, M, N, K, $ ALPHA, A, NMAX, B, NMAX, BETA, $ C, NMAX, CT, G, CC, LDC, EPS, $ ERR, FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 120 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K, $ ALPHA, LDA, LDB, BETA, LDC * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',''', A1, ''',', $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK1. * END SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) * * Tests CHEMM and CSYMM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX ALPHA, ALS, BETA, BLS REAL ERR, ERRMAX INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, $ NARGS, NC, NS LOGICAL CONJ, LEFT, NULL, RESET, SAME CHARACTER*1 SIDE, SIDES, UPLO, UPLOS CHARACTER*2 ICHS, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CHEMM, CMAKE, CMMCH, CSYMM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHS/'LR'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 2: 3 ).EQ.'HE' * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 100 IM = 1, NIDIM M = IDIM( IM ) * DO 90 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 90 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 90 LBB = LDB*N * * Generate the matrix B. * CALL CMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, $ ZERO ) * DO 80 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' * IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. 
IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * * Generate the hermitian or symmetric matrix A. * CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', NA, NA, A, NMAX, $ AA, LDA, RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL CMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, $ LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO MS = M NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE, $ UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC IF( REWI ) $ REWIND NTRA IF( CONJ )THEN CALL CHEMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, $ BB, LDB, BETA, CC, LDC ) ELSE CALL CSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, $ BB, LDB, BETA, CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 110 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LCE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LCE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB ISAME( 10 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 11 ) = LCE( CS, CC, LCC ) ELSE ISAME( 11 ) = LCERES( 'GE', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 110 END IF * IF( .NOT.NULL )THEN * * Check the result. * IF( LEFT )THEN CALL CMMCH( 'N', 'N', M, N, M, ALPHA, A, $ NMAX, B, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL CMMCH( 'N', 'N', M, N, N, ALPHA, B, $ NMAX, A, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 120 * 110 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA, $ LDB, BETA, LDC * 120 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, $ ',', F4.1, '), C,', I3, ') .' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK2. * END SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, $ B, BB, BS, CT, G, C ) * * Tests CTRMM and CTRSM. * * Auxiliary routine for test program for Level 3 Blas. 
* * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CT( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX ALPHA, ALS REAL ERR, ERRMAX INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, $ NS LOGICAL LEFT, NULL, RESET, SAME CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, $ UPLOS CHARACTER*2 ICHD, ICHS, ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CMAKE, CMMCH, CTRMM, CTRSM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ * .. Executable Statements .. * NARGS = 11 NC = 0 RESET = .TRUE. ERRMAX = RZERO * Set up zero matrix for CMMCH. DO 20 J = 1, NMAX DO 10 I = 1, NMAX C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE * DO 140 IM = 1, NIDIM M = IDIM( IM ) * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 130 LBB = LDB*N NULL = M.LE.0.OR.N.LE.0 * DO 120 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 130 LAA = LDA*NA * DO 110 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * DO 100 ICT = 1, 3 TRANSA = ICHT( ICT: ICT ) * DO 90 ICD = 1, 2 DIAG = ICHD( ICD: ICD ) * DO 80 IA = 1, NALF ALPHA = ALF( IA ) * * Generate the matrix A. * CALL CMAKE( 'TR', UPLO, DIAG, NA, NA, A, $ NMAX, AA, LDA, RESET, ZERO ) * * Generate the matrix B. * CALL CMAKE( 'GE', ' ', ' ', M, N, B, NMAX, $ BB, LDB, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO TRANAS = TRANSA DIAGS = DIAG MS = M NS = N ALS = ALPHA DO 30 I = 1, LAA AS( I ) = AA( I ) 30 CONTINUE LDAS = LDA DO 40 I = 1, LBB BS( I ) = BB( I ) 40 CONTINUE LDBS = LDB * * Call the subroutine. * IF( SNAME( 4: 5 ).EQ.'MM' )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB IF( REWI ) $ REWIND NTRA CALL CTRMM( SIDE, UPLO, TRANSA, DIAG, M, $ N, ALPHA, AA, LDA, BB, LDB ) ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB IF( REWI ) $ REWIND NTRA CALL CTRSM( SIDE, UPLO, TRANSA, DIAG, M, $ N, ALPHA, AA, LDA, BB, LDB ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. 
* ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = TRANAS.EQ.TRANSA ISAME( 4 ) = DIAGS.EQ.DIAG ISAME( 5 ) = MS.EQ.M ISAME( 6 ) = NS.EQ.N ISAME( 7 ) = ALS.EQ.ALPHA ISAME( 8 ) = LCE( AS, AA, LAA ) ISAME( 9 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 10 ) = LCE( BS, BB, LBB ) ELSE ISAME( 10 ) = LCERES( 'GE', ' ', M, N, BS, $ BB, LDB ) END IF ISAME( 11 ) = LDBS.EQ.LDB * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 50 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 50 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN IF( SNAME( 4: 5 ).EQ.'MM' )THEN * * Check the result. * IF( LEFT )THEN CALL CMMCH( TRANSA, 'N', M, N, M, $ ALPHA, A, NMAX, B, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL CMMCH( 'N', TRANSA, M, N, N, $ ALPHA, B, NMAX, A, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN * * Compute approximation to original * matrix. * DO 70 J = 1, N DO 60 I = 1, M C( I, J ) = BB( I + ( J - 1 )* $ LDB ) BB( I + ( J - 1 )*LDB ) = ALPHA* $ B( I, J ) 60 CONTINUE 70 CONTINUE * IF( LEFT )THEN CALL CMMCH( TRANSA, 'N', M, N, M, $ ONE, A, NMAX, C, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) ELSE CALL CMMCH( 'N', TRANSA, M, N, N, $ ONE, C, NMAX, A, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) END IF END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 150 END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * 140 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 160 * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M, $ N, ALPHA, LDA, LDB * 160 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(', 4( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', $ ' .' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK3. * END SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) * * Tests CHERK and CSYRK. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0, 0.0 ) ) REAL RONE, RZERO PARAMETER ( RONE = 1.0, RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. 
COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX ALPHA, ALS, BETA, BETS REAL ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, $ NARGS, NC, NS LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS CHARACTER*2 ICHT, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CHERK, CMAKE, CMMCH, CSYRK * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, REAL * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHT/'NC'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 2: 3 ).EQ.'HE' * NARGS = 10 NC = 0 RESET = .TRUE. ERRMAX = RZERO RALS = RONE RBETS = RONE * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICT = 1, 2 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'C' IF( TRAN.AND..NOT.CONJ ) $ TRANS = 'T' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. * CALL CMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 60 IA = 1, NALF ALPHA = ALF( IA ) IF( CONJ )THEN RALPHA = REAL( ALPHA ) ALPHA = CMPLX( RALPHA, RZERO ) END IF * DO 50 IB = 1, NBET BETA = BET( IB ) IF( CONJ )THEN RBETA = REAL( BETA ) BETA = CMPLX( RBETA, RZERO ) END IF NULL = N.LE.0 IF( CONJ ) $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. $ RZERO ).AND.RBETA.EQ.RONE ) * * Generate the matrix C. * CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C, $ NMAX, CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K IF( CONJ )THEN RALS = RALPHA ELSE ALS = ALPHA END IF DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA IF( CONJ )THEN RBETS = RBETA ELSE BETS = BETA END IF DO 20 I = 1, LCC CS( I ) = CC( I ) 20 CONTINUE LDCS = LDC * * Call the subroutine. * IF( CONJ )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, $ TRANS, N, K, RALPHA, LDA, RBETA, LDC IF( REWI ) $ REWIND NTRA CALL CHERK( UPLO, TRANS, N, K, RALPHA, AA, $ LDA, RBETA, CC, LDC ) ELSE IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, $ TRANS, N, K, ALPHA, LDA, BETA, LDC IF( REWI ) $ REWIND NTRA CALL CSYRK( UPLO, TRANS, N, K, ALPHA, AA, $ LDA, BETA, CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. 
* ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K IF( CONJ )THEN ISAME( 5 ) = RALS.EQ.RALPHA ELSE ISAME( 5 ) = ALS.EQ.ALPHA END IF ISAME( 6 ) = LCE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA IF( CONJ )THEN ISAME( 8 ) = RBETS.EQ.RBETA ELSE ISAME( 8 ) = BETS.EQ.BETA END IF IF( NULL )THEN ISAME( 9 ) = LCE( CS, CC, LCC ) ELSE ISAME( 9 ) = LCERES( SNAME( 2: 3 ), UPLO, N, $ N, CS, CC, LDC ) END IF ISAME( 10 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 30 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 30 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( CONJ )THEN TRANST = 'C' ELSE TRANST = 'T' END IF JC = 1 DO 40 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN CALL CMMCH( TRANST, 'N', LJ, 1, K, $ ALPHA, A( 1, JJ ), NMAX, $ A( 1, J ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL CMMCH( 'N', TRANST, LJ, 1, K, $ ALPHA, A( JJ, 1 ), NMAX, $ A( J, 1 ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 40 CONTINUE END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 110 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( CONJ )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, RALPHA, $ LDA, RBETA, LDC ELSE WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, $ LDA, BETA, LDC END IF * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', $ ' .' ) 9993 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, $ '), C,', I3, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK4. * END SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) * * Tests CHER2K and CSYR2K. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) REAL RONE, RZERO PARAMETER ( RONE = 1.0, RZERO = 0.0 ) * .. Scalar Arguments .. 
REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. COMPLEX AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ W( 2*NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX ALPHA, ALS, BETA, BETS REAL ERR, ERRMAX, RBETA, RBETS INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS CHARACTER*2 ICHT, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CHER2K, CMAKE, CMMCH, CSYR2K * .. Intrinsic Functions .. INTRINSIC CMPLX, CONJG, MAX, REAL * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHT/'NC'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 2: 3 ).EQ.'HE' * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 130 LCC = LDC*N * DO 120 IK = 1, NIDIM K = IDIM( IK ) * DO 110 ICT = 1, 2 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'C' IF( TRAN.AND..NOT.CONJ ) $ TRANS = 'T' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 110 LAA = LDA*NA * * Generate the matrix A. * IF( TRAN )THEN CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, $ LDA, RESET, ZERO ) ELSE CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, $ RESET, ZERO ) END IF * * Generate the matrix B. * LDB = LDA LBB = LAA IF( TRAN )THEN CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), $ 2*NMAX, BB, LDB, RESET, ZERO ) ELSE CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), $ NMAX, BB, LDB, RESET, ZERO ) END IF * DO 100 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 90 IA = 1, NALF ALPHA = ALF( IA ) * DO 80 IB = 1, NBET BETA = BET( IB ) IF( CONJ )THEN RBETA = REAL( BETA ) BETA = CMPLX( RBETA, RZERO ) END IF NULL = N.LE.0 IF( CONJ ) $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. $ ZERO ).AND.RBETA.EQ.RONE ) * * Generate the matrix C. * CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C, $ NMAX, CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB IF( CONJ )THEN RBETS = RBETA ELSE BETS = BETA END IF DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( CONJ )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, $ TRANS, N, K, ALPHA, LDA, LDB, RBETA, LDC IF( REWI ) $ REWIND NTRA CALL CHER2K( UPLO, TRANS, N, K, ALPHA, AA, $ LDA, BB, LDB, RBETA, CC, LDC ) ELSE IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC IF( REWI ) $ REWIND NTRA CALL CSYR2K( UPLO, TRANS, N, K, ALPHA, AA, $ LDA, BB, LDB, BETA, CC, LDC ) END IF * * Check if error-exit was taken incorrectly. 
* IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LCE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LCE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB IF( CONJ )THEN ISAME( 10 ) = RBETS.EQ.RBETA ELSE ISAME( 10 ) = BETS.EQ.BETA END IF IF( NULL )THEN ISAME( 11 ) = LCE( CS, CC, LCC ) ELSE ISAME( 11 ) = LCERES( 'HE', UPLO, N, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( CONJ )THEN TRANST = 'C' ELSE TRANST = 'T' END IF JJAB = 1 JC = 1 DO 70 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN DO 50 I = 1, K W( I ) = ALPHA*AB( ( J - 1 )*2* $ NMAX + K + I ) IF( CONJ )THEN W( K + I ) = CONJG( ALPHA )* $ AB( ( J - 1 )*2* $ NMAX + I ) ELSE W( K + I ) = ALPHA* $ AB( ( J - 1 )*2* $ NMAX + I ) END IF 50 CONTINUE CALL CMMCH( TRANST, 'N', LJ, 1, 2*K, $ ONE, AB( JJAB ), 2*NMAX, W, $ 2*NMAX, BETA, C( JJ, J ), $ NMAX, CT, G, CC( JC ), LDC, $ EPS, ERR, FATAL, NOUT, $ .TRUE. ) ELSE DO 60 I = 1, K IF( CONJ )THEN W( I ) = ALPHA*CONJG( AB( ( K + $ I - 1 )*NMAX + J ) ) W( K + I ) = CONJG( ALPHA* $ AB( ( I - 1 )*NMAX + $ J ) ) ELSE W( I ) = ALPHA*AB( ( K + I - 1 )* $ NMAX + J ) W( K + I ) = ALPHA* $ AB( ( I - 1 )*NMAX + $ J ) END IF 60 CONTINUE CALL CMMCH( 'N', 'N', LJ, 1, 2*K, ONE, $ AB( JJ ), NMAX, W, 2*NMAX, $ BETA, C( JJ, J ), NMAX, CT, $ G, CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 IF( TRAN ) $ JJAB = JJAB + 2*NMAX END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 140 70 CONTINUE END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 160 * 140 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( CONJ )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, $ LDA, LDB, RBETA, LDC ELSE WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, $ LDA, LDB, BETA, LDC END IF * 160 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, $ ', C,', I3, ') .' ) 9993 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, $ ',', F4.1, '), C,', I3, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK5. 
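*
*     Informal note on the pass/fail criterion used by CCHK1 - CCHK5
*     (a gloss on the source, not additional test logic): each call is
*     scored by the test ratio returned from CMMCH which, in the same
*     spirit as CMVCH in cblat2.f, is essentially
*
*        ERR = MAX over i of   ABS( expected(i) - computed(i) )
*                              ---------------------------------
*                                       EPS * G( i )
*
*     where G( i ) accumulates ABS1( ALPHA ) times the sum of
*     ABS1( A )*ABS1( B ) terms plus ABS1( BETA )*ABS1( C ), i.e. a
*     bound on the rounding an acceptable implementation may commit.
*     A routine passes when the largest ratio stays below THRESH
*     (16.0 in the supplied cblat3.dat); a ratio with
*     ERR*SQRT( EPS ) .GE. 1 is reported as the fatal "less than half
*     accurate" case.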
* END SUBROUTINE CCHKE( ISNUM, SRNAMT, NOUT ) * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. * ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Local Scalars .. COMPLEX ALPHA, BETA REAL RALPHA, RBETA * .. Local Arrays .. COMPLEX A( 2, 1 ), B( 2, 1 ), C( 2, 1 ) * .. External Subroutines .. EXTERNAL CGEMM, CHEMM, CHER2K, CHERK, CHKXER, CSYMM, $ CSYR2K, CSYRK, CTRMM, CTRSM * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Executable Statements .. * OK is set to .FALSE. by the special version of XERBLA or by CHKXER * if anything is wrong. OK = .TRUE. * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, $ 90 )ISNUM 10 INFOT = 1 CALL CGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 1 CALL CGEMM( '/', 'C', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 1 CALL CGEMM( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CGEMM( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CGEMM( 'C', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CGEMM( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CGEMM( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CGEMM( 'N', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CGEMM( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CGEMM( 'C', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CGEMM( 'C', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CGEMM( 'C', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CGEMM( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CGEMM( 'T', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CGEMM( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CGEMM( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CGEMM( 'N', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CGEMM( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CGEMM( 'C', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CGEMM( 'C', 'C', 0, -1, 0, 
ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CGEMM( 'C', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CGEMM( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CGEMM( 'T', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CGEMM( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CGEMM( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CGEMM( 'N', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CGEMM( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CGEMM( 'C', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CGEMM( 'C', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CGEMM( 'C', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CGEMM( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CGEMM( 'T', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CGEMM( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CGEMM( 'N', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CGEMM( 'C', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CGEMM( 'C', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CGEMM( 'C', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CGEMM( 'T', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CGEMM( 'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGEMM( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGEMM( 'C', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGEMM( 'N', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGEMM( 'C', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGEMM( 'T', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGEMM( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGEMM( 'C', 'T', 0, 
2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGEMM( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CGEMM( 'N', 'C', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CGEMM( 'C', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CGEMM( 'C', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CGEMM( 'C', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CGEMM( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CGEMM( 'T', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CGEMM( 'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 20 INFOT = 1 CALL CHEMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CHEMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHEMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHEMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHEMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHEMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHEMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHEMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHEMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHEMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHEMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) 
CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 30 INFOT = 1 CALL CSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 40 INFOT = 1 CALL CTRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CTRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CTRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CTRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'L', 
'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, 
A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 50 INFOT = 1 CALL CTRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CTRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CTRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CTRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) 
CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, 
NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 60 INFOT = 1 CALL CHERK( '/', 'N', 0, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CHERK( 'U', 'T', 0, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHERK( 'U', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHERK( 'U', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHERK( 'L', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHERK( 'L', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHERK( 'U', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHERK( 'U', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHERK( 'L', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHERK( 'L', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHERK( 'U', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHERK( 'U', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHERK( 'L', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHERK( 'L', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CHERK( 'U', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CHERK( 'U', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CHERK( 'L', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CHERK( 'L', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 70 INFOT = 1 CALL CSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CSYRK( 'U', 'C', 0, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYRK( 'L', 'T', 0, -1, ALPHA, A, 1, BETA, 
C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 80 INFOT = 1 CALL CHER2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CHER2K( 'U', 'T', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHER2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHER2K( 'U', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHER2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHER2K( 'L', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHER2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHER2K( 'U', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHER2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHER2K( 'L', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHER2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHER2K( 'U', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHER2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHER2K( 'L', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHER2K( 'U', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHER2K( 'L', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHER2K( 'U', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHER2K( 'L', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 90 INFOT = 1 CALL 
CSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CSYR2K( 'U', 'C', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYR2K( 'L', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYR2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYR2K( 'L', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) * 100 IF( OK )THEN WRITE( NOUT, FMT = 9999 )SRNAMT ELSE WRITE( NOUT, FMT = 9998 )SRNAMT END IF RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', $ '**' ) * * End of CCHKE. * END SUBROUTINE CMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, $ TRANSL ) * * Generates values for an M by N matrix A. * Stores the values in the array AA in the data structure required * by the routine, with unwanted elements set to rogue value. * * TYPE is 'GE', 'HE', 'SY' or 'TR'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. 
COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) COMPLEX ROGUE PARAMETER ( ROGUE = ( -1.0E10, 1.0E10 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) REAL RROGUE PARAMETER ( RROGUE = -1.0E10 ) * .. Scalar Arguments .. COMPLEX TRANSL INTEGER LDA, M, N, NMAX LOGICAL RESET CHARACTER*1 DIAG, UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX A( NMAX, * ), AA( * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J, JJ LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER * .. External Functions .. COMPLEX CBEG EXTERNAL CBEG * .. Intrinsic Functions .. INTRINSIC CMPLX, CONJG, REAL * .. Executable Statements .. GEN = TYPE.EQ.'GE' HER = TYPE.EQ.'HE' SYM = TYPE.EQ.'SY' TRI = TYPE.EQ.'TR' UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' UNIT = TRI.AND.DIAG.EQ.'U' * * Generate data in array A. * DO 20 J = 1, N DO 10 I = 1, M IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) $ THEN A( I, J ) = CBEG( RESET ) + TRANSL IF( I.NE.J )THEN * Set some elements to zero IF( N.GT.3.AND.J.EQ.N/2 ) $ A( I, J ) = ZERO IF( HER )THEN A( J, I ) = CONJG( A( I, J ) ) ELSE IF( SYM )THEN A( J, I ) = A( I, J ) ELSE IF( TRI )THEN A( J, I ) = ZERO END IF END IF END IF 10 CONTINUE IF( HER ) $ A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO ) IF( TRI ) $ A( J, J ) = A( J, J ) + ONE IF( UNIT ) $ A( J, J ) = ONE 20 CONTINUE * * Store elements in array AS in data structure required by routine. * IF( TYPE.EQ.'GE' )THEN DO 50 J = 1, N DO 30 I = 1, M AA( I + ( J - 1 )*LDA ) = A( I, J ) 30 CONTINUE DO 40 I = M + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 40 CONTINUE 50 CONTINUE ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN DO 90 J = 1, N IF( UPPER )THEN IBEG = 1 IF( UNIT )THEN IEND = J - 1 ELSE IEND = J END IF ELSE IF( UNIT )THEN IBEG = J + 1 ELSE IBEG = J END IF IEND = N END IF DO 60 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 60 CONTINUE DO 70 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I, J ) 70 CONTINUE DO 80 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 80 CONTINUE IF( HER )THEN JJ = J + ( J - 1 )*LDA AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) END IF 90 CONTINUE END IF RETURN * * End of CMAKE. * END SUBROUTINE CMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, $ NOUT, MV ) * * Checks the results of the computational tests. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0, 0.0 ) ) REAL RZERO, RONE PARAMETER ( RZERO = 0.0, RONE = 1.0 ) * .. Scalar Arguments .. COMPLEX ALPHA, BETA REAL EPS, ERR INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT LOGICAL FATAL, MV CHARACTER*1 TRANSA, TRANSB * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ), $ CC( LDCC, * ), CT( * ) REAL G( * ) * .. Local Scalars .. COMPLEX CL REAL ERRI INTEGER I, J, K LOGICAL CTRANA, CTRANB, TRANA, TRANB * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CONJG, MAX, REAL, SQRT * .. Statement Functions .. REAL ABS1 * .. Statement Function definitions .. ABS1( CL ) = ABS( REAL( CL ) ) + ABS( AIMAG( CL ) ) * .. Executable Statements .. TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' CTRANA = TRANSA.EQ.'C' CTRANB = TRANSB.EQ.'C' * * Compute expected result, one column at a time, in CT using data * in A, B and C. 
* Compute gauges in G. * DO 220 J = 1, N * DO 10 I = 1, M CT( I ) = ZERO G( I ) = RZERO 10 CONTINUE IF( .NOT.TRANA.AND..NOT.TRANB )THEN DO 30 K = 1, KK DO 20 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( K, J ) G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRANA.AND..NOT.TRANB )THEN IF( CTRANA )THEN DO 50 K = 1, KK DO 40 I = 1, M CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( K, J ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( K, J ) ) 40 CONTINUE 50 CONTINUE ELSE DO 70 K = 1, KK DO 60 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( K, J ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( K, J ) ) 60 CONTINUE 70 CONTINUE END IF ELSE IF( .NOT.TRANA.AND.TRANB )THEN IF( CTRANB )THEN DO 90 K = 1, KK DO 80 I = 1, M CT( I ) = CT( I ) + A( I, K )*CONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( I, K ) )* $ ABS1( B( J, K ) ) 80 CONTINUE 90 CONTINUE ELSE DO 110 K = 1, KK DO 100 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( J, K ) G( I ) = G( I ) + ABS1( A( I, K ) )* $ ABS1( B( J, K ) ) 100 CONTINUE 110 CONTINUE END IF ELSE IF( TRANA.AND.TRANB )THEN IF( CTRANA )THEN IF( CTRANB )THEN DO 130 K = 1, KK DO 120 I = 1, M CT( I ) = CT( I ) + CONJG( A( K, I ) )* $ CONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 120 CONTINUE 130 CONTINUE ELSE DO 150 K = 1, KK DO 140 I = 1, M CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( J, K ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 140 CONTINUE 150 CONTINUE END IF ELSE IF( CTRANB )THEN DO 170 K = 1, KK DO 160 I = 1, M CT( I ) = CT( I ) + A( K, I )*CONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 160 CONTINUE 170 CONTINUE ELSE DO 190 K = 1, KK DO 180 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( J, K ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 180 CONTINUE 190 CONTINUE END IF END IF END IF DO 200 I = 1, M CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) G( I ) = ABS1( ALPHA )*G( I ) + $ ABS1( BETA )*ABS1( C( I, J ) ) 200 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO DO 210 I = 1, M ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS IF( G( I ).NE.RZERO ) $ ERRI = ERRI/G( I ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ GO TO 230 210 CONTINUE * 220 CONTINUE * * If the loop completes, all results are at least half accurate. GO TO 250 * * Report fatal error. * 230 FATAL = .TRUE. WRITE( NOUT, FMT = 9999 ) DO 240 I = 1, M IF( MV )THEN WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) ELSE WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) END IF 240 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9997 )J * 250 CONTINUE RETURN * 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', $ 'F ACCURATE *******', /' EXPECTED RE', $ 'SULT COMPUTED RESULT' ) 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) * * End of CMMCH. * END LOGICAL FUNCTION LCE( RI, RJ, LR ) * * Tests if two arrays are identical. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LR * .. Array Arguments .. COMPLEX RI( * ), RJ( * ) * .. Local Scalars .. INTEGER I * .. Executable Statements .. DO 10 I = 1, LR IF( RI( I ).NE.RJ( I ) ) $ GO TO 20 10 CONTINUE LCE = .TRUE. GO TO 30 20 CONTINUE LCE = .FALSE. 30 RETURN * * End of LCE. 
* END LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA ) * * Tests if selected elements in two arrays are equal. * * TYPE is 'GE' or 'HE' or 'SY'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LDA, M, N CHARACTER*1 UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX AA( LDA, * ), AS( LDA, * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL UPPER * .. Executable Statements .. UPPER = UPLO.EQ.'U' IF( TYPE.EQ.'GE' )THEN DO 20 J = 1, N DO 10 I = M + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 10 CONTINUE 20 CONTINUE ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY' )THEN DO 50 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 30 I = 1, IBEG - 1 IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 30 CONTINUE DO 40 I = IEND + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 40 CONTINUE 50 CONTINUE END IF * 60 CONTINUE LCERES = .TRUE. GO TO 80 70 CONTINUE LCERES = .FALSE. 80 RETURN * * End of LCERES. * END COMPLEX FUNCTION CBEG( RESET ) * * Generates complex numbers as pairs of random numbers uniformly * distributed between -0.5 and 0.5. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. LOGICAL RESET * .. Local Scalars .. INTEGER I, IC, J, MI, MJ * .. Save statement .. SAVE I, IC, J, MI, MJ * .. Intrinsic Functions .. INTRINSIC CMPLX * .. Executable Statements .. IF( RESET )THEN * Initialize local variables. MI = 891 MJ = 457 I = 7 J = 7 IC = 0 RESET = .FALSE. END IF * * The sequence of values of I or J is bounded between 1 and 999. * If initial I or J = 1,2,3,6,7 or 9, the period will be 50. * If initial I or J = 4 or 8, the period will be 25. * If initial I or J = 5, the period will be 10. * IC is used to break up the period by skipping 1 value of I or J * in 6. * IC = IC + 1 10 I = I*MI J = J*MJ I = I - 1000*( I/1000 ) J = J - 1000*( J/1000 ) IF( IC.GE.5 )THEN IC = 0 GO TO 10 END IF CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) RETURN * * End of CBEG. * END REAL FUNCTION SDIFF( X, Y ) * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. REAL X, Y * .. Executable Statements .. SDIFF = X - Y RETURN * * End of SDIFF. * END SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) * * Tests whether XERBLA has detected an error when it should. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER INFOT, NOUT LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Executable Statements .. IF( .NOT.LERR )THEN WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT OK = .FALSE. END IF LERR = .FALSE. RETURN * 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', $ 'ETECTED BY ', A6, ' *****' ) * * End of CHKXER. 
*
      END
      SUBROUTINE XERBLA( SRNAME, INFO )
*
*  This is a special version of XERBLA to be used only as part of
*  the test program for testing error exits from the Level 3 BLAS
*  routines.
*
*  XERBLA  is an error handler for the Level 3 BLAS routines.
*
*  It is called by the Level 3 BLAS routines if an input parameter is
*  invalid.
*
*  Auxiliary routine for test program for Level 3 Blas.
*
*  -- Written on 8-February-1989.
*     Jack Dongarra, Argonne National Laboratory.
*     Iain Duff, AERE Harwell.
*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
*     Sven Hammarling, Numerical Algorithms Group Ltd.
*
*     .. Scalar Arguments ..
      INTEGER            INFO
      CHARACTER*6        SRNAME
*     .. Scalars in Common ..
      INTEGER            INFOT, NOUT
      LOGICAL            LERR, OK
      CHARACTER*6        SRNAMT
*     .. Common blocks ..
      COMMON             /INFOC/INFOT, NOUT, OK, LERR
      COMMON             /SRNAMC/SRNAMT
*     .. Executable Statements ..
      LERR = .TRUE.
      IF( INFO.NE.INFOT )THEN
         IF( INFOT.NE.0 )THEN
            WRITE( NOUT, FMT = 9999 )INFO, INFOT
         ELSE
            WRITE( NOUT, FMT = 9997 )INFO
         END IF
         OK = .FALSE.
      END IF
      IF( SRNAME.NE.SRNAMT )THEN
         WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT
         OK = .FALSE.
      END IF
      RETURN
*
 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD',
     $      ' OF ', I2, ' *******' )
 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE',
     $      'AD OF ', A6, ' *******' )
 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6,
     $      ' *******' )
*
*     End of XERBLA
*
      END
OpenBLAS-0.2.20/test/cblat3_3m.dat000066400000000000000000000020421313527062700163620ustar00rootroot00000000000000'CBLAT3_3M.SUMM'      NAME OF SUMMARY OUTPUT FILE
6                     UNIT NUMBER OF SUMMARY FILE
'CBLAT3.SNAP'         NAME OF SNAPSHOT OUTPUT FILE
-1                    UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
F        LOGICAL FLAG, T TO STOP ON FAILURES.
F        LOGICAL FLAG, T TO TEST ERROR EXITS.
16.0     THRESHOLD VALUE OF TEST RATIO
6                     NUMBER OF VALUES OF N
0 1 2 3 7 31 63       VALUES OF N
3                     NUMBER OF VALUES OF ALPHA
(0.0,0.0) (1.0,0.0) (0.7,-0.9)   VALUES OF ALPHA
3                     NUMBER OF VALUES OF BETA
(0.0,0.0) (1.0,0.0) (1.3,-1.1)   VALUES OF BETA
CGEMM3M  T PUT F FOR NO TEST. SAME COLUMNS.
CHEMM    F PUT F FOR NO TEST. SAME COLUMNS.
CSYMM    F PUT F FOR NO TEST. SAME COLUMNS.
CTRMM    F PUT F FOR NO TEST. SAME COLUMNS.
CTRSM    F PUT F FOR NO TEST. SAME COLUMNS.
CHERK    F PUT F FOR NO TEST. SAME COLUMNS.
CSYRK    F PUT F FOR NO TEST. SAME COLUMNS.
CHER2K   F PUT F FOR NO TEST. SAME COLUMNS.
CSYR2K   F PUT F FOR NO TEST. SAME COLUMNS.
OpenBLAS-0.2.20/test/cblat3_3m.f000066400000000000000000003766431313527062700160620ustar00rootroot00000000000000      PROGRAM CBLAT3
*
*  Test program for the COMPLEX Level 3 Blas.
*
*  The program must be driven by a short data file. The first 14 records
*  of the file are read using list-directed input, the last 9 records
*  are read using the format ( A8, L2 ). An annotated example of a data
*  file can be obtained by deleting the first 3 characters from the
*  following 23 lines:
*  'CBLAT3.SUMM'     NAME OF SUMMARY OUTPUT FILE
*  6                 UNIT NUMBER OF SUMMARY FILE
*  'CBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
*  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
*  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
*  F        LOGICAL FLAG, T TO STOP ON FAILURES.
*  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
*  16.0     THRESHOLD VALUE OF TEST RATIO
*  6                 NUMBER OF VALUES OF N
*  0 1 2 3 5 9       VALUES OF N
*  3                 NUMBER OF VALUES OF ALPHA
*  (0.0,0.0) (1.0,0.0) (0.7,-0.9)   VALUES OF ALPHA
*  3                 NUMBER OF VALUES OF BETA
*  (0.0,0.0) (1.0,0.0) (1.3,-1.1)   VALUES OF BETA
*  CGEMM3M  T PUT F FOR NO TEST. SAME COLUMNS.
*  CHEMM    T PUT F FOR NO TEST.
SAME COLUMNS. * CSYMM T PUT F FOR NO TEST. SAME COLUMNS. * CTRMM T PUT F FOR NO TEST. SAME COLUMNS. * CTRSM T PUT F FOR NO TEST. SAME COLUMNS. * CHERK T PUT F FOR NO TEST. SAME COLUMNS. * CSYRK T PUT F FOR NO TEST. SAME COLUMNS. * CHER2K T PUT F FOR NO TEST. SAME COLUMNS. * CSYR2K T PUT F FOR NO TEST. SAME COLUMNS. * * See: * * Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. * A Set of Level 3 Basic Linear Algebra Subprograms. * * Technical Memorandum No.88 (Revision 1), Mathematics and * Computer Science Division, Argonne National Laboratory, 9700 * South Cass Avenue, Argonne, Illinois 60439, US. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 9 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) REAL RZERO, RHALF, RONE PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. REAL EPS, ERR, THRESH INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, $ TSTERR CHARACTER*1 TRANSA, TRANSB CHARACTER*8 SNAMET CHARACTER*32 SNAPS, SUMMRY * .. Local Arrays .. COMPLEX AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), $ ALF( NALMAX ), AS( NMAX*NMAX ), $ BB( NMAX*NMAX ), BET( NBEMAX ), $ BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ W( 2*NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDMAX ) LOGICAL LTEST( NSUBS ) CHARACTER*8 SNAMES( NSUBS ) * .. External Functions .. REAL SDIFF LOGICAL LCE EXTERNAL SDIFF, LCE * .. External Subroutines .. EXTERNAL CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CCHKE, CMMCH * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK CHARACTER*8 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR COMMON /SRNAMC/SRNAMT * .. Data statements .. DATA SNAMES/'CGEMM3M ', 'CHEMM ', 'CSYMM ', $ 'CTRMM ', $ 'CTRSM ', 'CHERK ', 'CSYRK ', 'CHER2K', $ 'CSYR2K'/ * .. Executable Statements .. * * Read name and unit number for summary output file and open file. * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. * READ( NIN, FMT = * )SNAPS READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI REWI = REWI.AND.TRACE * Read the flag that directs stopping on any failure. READ( NIN, FMT = * )SFATAL * Read the flag that indicates whether error exits are to be tested. READ( NIN, FMT = * )TSTERR * Read the threshold value of the test ratio READ( NIN, FMT = * )THRESH * * Read and check the parameter values for the tests. 
* * Values of N READ( NIN, FMT = * )NIDIM IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN WRITE( NOUT, FMT = 9997 )'N', NIDMAX GO TO 220 END IF READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) DO 10 I = 1, NIDIM IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN WRITE( NOUT, FMT = 9996 )NMAX GO TO 220 END IF 10 CONTINUE * Values of ALPHA READ( NIN, FMT = * )NALF IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX GO TO 220 END IF READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) * Values of BETA READ( NIN, FMT = * )NBET IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX GO TO 220 END IF READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) * * Report values of parameters. * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) IF( .NOT.TSTERR )THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9984 ) END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )THRESH WRITE( NOUT, FMT = * ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 20 I = 1, NSUBS LTEST( I ) = .FALSE. 20 CONTINUE 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT DO 40 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET STOP 50 LTEST( I ) = LTESTT GO TO 30 * 60 CONTINUE CLOSE ( NIN ) * * Compute EPS (the machine precision). * EPS = RONE 70 CONTINUE IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) $ GO TO 80 EPS = RHALF*EPS GO TO 70 80 CONTINUE EPS = EPS + EPS WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of CMMCH using exact data. * N = MIN( 32, NMAX ) DO 100 J = 1, N DO 90 I = 1, N AB( I, J ) = MAX( I - J + 1, 0 ) 90 CONTINUE AB( J, NMAX + 1 ) = J AB( 1, NMAX + J ) = J C( J, 1 ) = ZERO 100 CONTINUE DO 110 J = 1, N CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 110 CONTINUE * CC holds the exact result. On exit from CMMCH CT holds * the result computed by CMMCH. TRANSA = 'N' TRANSB = 'N' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 AB( 1, NMAX + J ) = N - J + 1 120 CONTINUE DO 130 J = 1, N CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - $ ( ( J + 1 )*J*( J - 1 ) )/3 130 CONTINUE TRANSA = 'C' TRANSB = 'N' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF * * Test each subroutine in turn. * DO 200 ISNUM = 1, NSUBS WRITE( NOUT, FMT = * ) IF( .NOT.LTEST( ISNUM ) )THEN * Subprogram is not to be tested. 
WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) ELSE SRNAMT = SNAMES( ISNUM ) * Test error exits. IF( TSTERR )THEN CALL CCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) WRITE( NOUT, FMT = * ) END IF * Test computations. INFOT = 0 OK = .TRUE. FATAL = .FALSE. GO TO ( 140, 150, 150, 160, 160, 170, 170, $ 180, 180 )ISNUM * Test CGEMM3M, 01. 140 CALL CCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G ) GO TO 190 * Test CHEMM, 02, CSYMM, 03. 150 CALL CCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G ) GO TO 190 * Test CTRMM, 04, CTRSM, 05. 160 CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C ) GO TO 190 * Test CHERK, 06, CSYRK, 07. 170 CALL CCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G ) GO TO 190 * Test CHER2K, 08, CSYR2K, 09. 180 CALL CCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) GO TO 190 * 190 IF( FATAL.AND.SFATAL ) $ GO TO 210 END IF 200 CONTINUE WRITE( NOUT, FMT = 9986 ) GO TO 230 * 210 CONTINUE WRITE( NOUT, FMT = 9985 ) GO TO 230 * 220 CONTINUE WRITE( NOUT, FMT = 9991 ) * 230 CONTINUE IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) STOP * 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', $ 'S THAN', F8.2 ) 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', $ 'THAN ', I2 ) 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) 9995 FORMAT( ' TESTS OF THE COMPLEX LEVEL 3 BLAS', //' THE F', $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) 9994 FORMAT( ' FOR N ', 9I6 ) 9993 FORMAT( ' FOR ALPHA ', $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) 9992 FORMAT( ' FOR BETA ', $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', $ /' ******* TESTS ABANDONED *******' ) 9990 FORMAT( ' SUBPROGRAM NAME ', A8, ' NOT RECOGNIZED', /' ******* T', $ 'ESTS ABANDONED *******' ) 9989 FORMAT( ' ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', $ 'ATED WRONGLY.', /' CMMCH WAS CALLED WITH TRANSA = ', A1, $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', $ '*******' ) 9988 FORMAT( A8, L2 ) 9987 FORMAT( 1X, A8, ' WAS NOT TESTED' ) 9986 FORMAT( /' END OF TESTS' ) 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) * * End of CBLAT3. * END SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) * * Tests CGEMM3M. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. 
REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*8 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX ALPHA, ALS, BETA, BLS REAL ERR, ERRMAX INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, $ MA, MB, MS, N, NA, NARGS, NB, NC, NS LOGICAL NULL, RESET, SAME, TRANA, TRANB CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB CHARACTER*3 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CGEMM3M, CMAKE, CMMCH * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'NTC'/ * .. Executable Statements .. * NARGS = 13 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 110 IM = 1, NIDIM M = IDIM( IM ) * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICA = 1, 3 TRANSA = ICH( ICA: ICA ) TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' * IF( TRANA )THEN MA = K NA = M ELSE MA = M NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. * CALL CMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICB = 1, 3 TRANSB = ICH( ICB: ICB ) TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' * IF( TRANB )THEN MB = N NB = K ELSE MB = K NB = N END IF * Set LDB to 1 more than minimum value if room. LDB = MB IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 70 LBB = LDB*NB * * Generate the matrix B. * CALL CMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, $ LDB, RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL CMAKE( 'GE', ' ', ' ', M, N, C, NMAX, $ CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * TRANAS = TRANSA TRANBS = TRANSB MS = M NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB, $ BETA, LDC IF( REWI ) $ REWIND NTRA CALL CGEMM3M( TRANSA, TRANSB, M, N, K, ALPHA, $ AA, LDA, BB, LDB, BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. 
* ISAME( 1 ) = TRANSA.EQ.TRANAS ISAME( 2 ) = TRANSB.EQ.TRANBS ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = KS.EQ.K ISAME( 6 ) = ALS.EQ.ALPHA ISAME( 7 ) = LCE( AS, AA, LAA ) ISAME( 8 ) = LDAS.EQ.LDA ISAME( 9 ) = LCE( BS, BB, LBB ) ISAME( 10 ) = LDBS.EQ.LDB ISAME( 11 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 12 ) = LCE( CS, CC, LCC ) ELSE ISAME( 12 ) = LCERES( 'GE', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 13 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report * and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL CMMCH( TRANSA, TRANSB, M, N, K, $ ALPHA, A, NMAX, B, NMAX, BETA, $ C, NMAX, CT, G, CC, LDC, EPS, $ ERR, FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 120 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K, $ ALPHA, LDA, LDB, BETA, LDC * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A8, '(''', A1, ''',''', A1, ''',', $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK1. * END SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) * * Tests CHEMM and CSYMM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*8 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX ALPHA, ALS, BETA, BLS REAL ERR, ERRMAX INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, $ NARGS, NC, NS LOGICAL CONJ, LEFT, NULL, RESET, SAME CHARACTER*1 SIDE, SIDES, UPLO, UPLOS CHARACTER*2 ICHS, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CHEMM, CMAKE, CMMCH, CSYMM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. 
INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHS/'LR'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 2: 3 ).EQ.'HE' * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 100 IM = 1, NIDIM M = IDIM( IM ) * DO 90 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 90 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 90 LBB = LDB*N * * Generate the matrix B. * CALL CMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, $ ZERO ) * DO 80 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' * IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * * Generate the hermitian or symmetric matrix A. * CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', NA, NA, A, NMAX, $ AA, LDA, RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL CMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, $ LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO MS = M NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE, $ UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC IF( REWI ) $ REWIND NTRA IF( CONJ )THEN CALL CHEMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, $ BB, LDB, BETA, CC, LDC ) ELSE CALL CSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, $ BB, LDB, BETA, CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 110 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LCE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LCE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB ISAME( 10 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 11 ) = LCE( CS, CC, LCC ) ELSE ISAME( 11 ) = LCERES( 'GE', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 110 END IF * IF( .NOT.NULL )THEN * * Check the result. * IF( LEFT )THEN CALL CMMCH( 'N', 'N', M, N, M, ALPHA, A, $ NMAX, B, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL CMMCH( 'N', 'N', M, N, N, ALPHA, B, $ NMAX, A, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. 
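*
*     ERRMAX is the largest test ratio returned by CMMCH over all of
*     the calls above; the routine is reported as passing only when
*     this maximum stays below the caller-supplied threshold THRESH.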
* IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 120 * 110 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA, $ LDB, BETA, LDC * 120 CONTINUE RETURN * 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, $ ',', F4.1, '), C,', I3, ') .' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK2. * END SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, $ B, BB, BS, CT, G, C ) * * Tests CTRMM and CTRSM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*8 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CT( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX ALPHA, ALS REAL ERR, ERRMAX INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, $ NS LOGICAL LEFT, NULL, RESET, SAME CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, $ UPLOS CHARACTER*2 ICHD, ICHS, ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CMAKE, CMMCH, CTRMM, CTRSM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ * .. Executable Statements .. * NARGS = 11 NC = 0 RESET = .TRUE. ERRMAX = RZERO * Set up zero matrix for CMMCH. DO 20 J = 1, NMAX DO 10 I = 1, NMAX C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE * DO 140 IM = 1, NIDIM M = IDIM( IM ) * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 130 LBB = LDB*N NULL = M.LE.0.OR.N.LE.0 * DO 120 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 130 LAA = LDA*NA * DO 110 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * DO 100 ICT = 1, 3 TRANSA = ICHT( ICT: ICT ) * DO 90 ICD = 1, 2 DIAG = ICHD( ICD: ICD ) * DO 80 IA = 1, NALF ALPHA = ALF( IA ) * * Generate the matrix A. 
* CALL CMAKE( 'TR', UPLO, DIAG, NA, NA, A, $ NMAX, AA, LDA, RESET, ZERO ) * * Generate the matrix B. * CALL CMAKE( 'GE', ' ', ' ', M, N, B, NMAX, $ BB, LDB, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO TRANAS = TRANSA DIAGS = DIAG MS = M NS = N ALS = ALPHA DO 30 I = 1, LAA AS( I ) = AA( I ) 30 CONTINUE LDAS = LDA DO 40 I = 1, LBB BS( I ) = BB( I ) 40 CONTINUE LDBS = LDB * * Call the subroutine. * IF( SNAME( 4: 5 ).EQ.'MM' )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB IF( REWI ) $ REWIND NTRA CALL CTRMM( SIDE, UPLO, TRANSA, DIAG, M, $ N, ALPHA, AA, LDA, BB, LDB ) ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB IF( REWI ) $ REWIND NTRA CALL CTRSM( SIDE, UPLO, TRANSA, DIAG, M, $ N, ALPHA, AA, LDA, BB, LDB ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = TRANAS.EQ.TRANSA ISAME( 4 ) = DIAGS.EQ.DIAG ISAME( 5 ) = MS.EQ.M ISAME( 6 ) = NS.EQ.N ISAME( 7 ) = ALS.EQ.ALPHA ISAME( 8 ) = LCE( AS, AA, LAA ) ISAME( 9 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 10 ) = LCE( BS, BB, LBB ) ELSE ISAME( 10 ) = LCERES( 'GE', ' ', M, N, BS, $ BB, LDB ) END IF ISAME( 11 ) = LDBS.EQ.LDB * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 50 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 50 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN IF( SNAME( 4: 5 ).EQ.'MM' )THEN * * Check the result. * IF( LEFT )THEN CALL CMMCH( TRANSA, 'N', M, N, M, $ ALPHA, A, NMAX, B, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL CMMCH( 'N', TRANSA, M, N, N, $ ALPHA, B, NMAX, A, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN * * Compute approximation to original * matrix. * DO 70 J = 1, N DO 60 I = 1, M C( I, J ) = BB( I + ( J - 1 )* $ LDB ) BB( I + ( J - 1 )*LDB ) = ALPHA* $ B( I, J ) 60 CONTINUE 70 CONTINUE * IF( LEFT )THEN CALL CMMCH( TRANSA, 'N', M, N, M, $ ONE, A, NMAX, C, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) ELSE CALL CMMCH( 'N', TRANSA, M, N, N, $ ONE, C, NMAX, A, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) END IF END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 150 END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * 140 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 160 * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M, $ N, ALPHA, LDA, LDB * 160 CONTINUE RETURN * 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A8, '(', 4( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', $ ' .' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK3. * END SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) * * Tests CHERK and CSYRK. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0, 0.0 ) ) REAL RONE, RZERO PARAMETER ( RONE = 1.0, RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*8 SNAME * .. Array Arguments .. COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX ALPHA, ALS, BETA, BETS REAL ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, $ NARGS, NC, NS LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS CHARACTER*2 ICHT, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CHERK, CMAKE, CMMCH, CSYRK * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, REAL * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHT/'NC'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 2: 3 ).EQ.'HE' * NARGS = 10 NC = 0 RESET = .TRUE. ERRMAX = RZERO RALS = RONE RBETS = RONE * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICT = 1, 2 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'C' IF( TRAN.AND..NOT.CONJ ) $ TRANS = 'T' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. 
* CALL CMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 60 IA = 1, NALF ALPHA = ALF( IA ) IF( CONJ )THEN RALPHA = REAL( ALPHA ) ALPHA = CMPLX( RALPHA, RZERO ) END IF * DO 50 IB = 1, NBET BETA = BET( IB ) IF( CONJ )THEN RBETA = REAL( BETA ) BETA = CMPLX( RBETA, RZERO ) END IF NULL = N.LE.0 IF( CONJ ) $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. $ RZERO ).AND.RBETA.EQ.RONE ) * * Generate the matrix C. * CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C, $ NMAX, CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K IF( CONJ )THEN RALS = RALPHA ELSE ALS = ALPHA END IF DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA IF( CONJ )THEN RBETS = RBETA ELSE BETS = BETA END IF DO 20 I = 1, LCC CS( I ) = CC( I ) 20 CONTINUE LDCS = LDC * * Call the subroutine. * IF( CONJ )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, $ TRANS, N, K, RALPHA, LDA, RBETA, LDC IF( REWI ) $ REWIND NTRA CALL CHERK( UPLO, TRANS, N, K, RALPHA, AA, $ LDA, RBETA, CC, LDC ) ELSE IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, $ TRANS, N, K, ALPHA, LDA, BETA, LDC IF( REWI ) $ REWIND NTRA CALL CSYRK( UPLO, TRANS, N, K, ALPHA, AA, $ LDA, BETA, CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K IF( CONJ )THEN ISAME( 5 ) = RALS.EQ.RALPHA ELSE ISAME( 5 ) = ALS.EQ.ALPHA END IF ISAME( 6 ) = LCE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA IF( CONJ )THEN ISAME( 8 ) = RBETS.EQ.RBETA ELSE ISAME( 8 ) = BETS.EQ.BETA END IF IF( NULL )THEN ISAME( 9 ) = LCE( CS, CC, LCC ) ELSE ISAME( 9 ) = LCERES( SNAME( 2: 3 ), UPLO, N, $ N, CS, CC, LDC ) END IF ISAME( 10 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 30 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 30 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( CONJ )THEN TRANST = 'C' ELSE TRANST = 'T' END IF JC = 1 DO 40 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN CALL CMMCH( TRANST, 'N', LJ, 1, K, $ ALPHA, A( 1, JJ ), NMAX, $ A( 1, J ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL CMMCH( 'N', TRANST, LJ, 1, K, $ ALPHA, A( JJ, 1 ), NMAX, $ A( J, 1 ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 40 CONTINUE END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 110 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( CONJ )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, RALPHA, $ LDA, RBETA, LDC ELSE WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, $ LDA, BETA, LDC END IF * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', $ ' .' ) 9993 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, $ '), C,', I3, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK4. * END SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) * * Tests CHER2K and CSYR2K. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) REAL RONE, RZERO PARAMETER ( RONE = 1.0, RZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*8 SNAME * .. Array Arguments .. COMPLEX AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ W( 2*NMAX ) REAL G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX ALPHA, ALS, BETA, BETS REAL ERR, ERRMAX, RBETA, RBETS INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS CHARACTER*2 ICHT, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LCE, LCERES EXTERNAL LCE, LCERES * .. External Subroutines .. EXTERNAL CHER2K, CMAKE, CMMCH, CSYR2K * .. Intrinsic Functions .. INTRINSIC CMPLX, CONJG, MAX, REAL * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHT/'NC'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 2: 3 ).EQ.'HE' * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. 
IF( LDC.GT.NMAX ) $ GO TO 130 LCC = LDC*N * DO 120 IK = 1, NIDIM K = IDIM( IK ) * DO 110 ICT = 1, 2 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'C' IF( TRAN.AND..NOT.CONJ ) $ TRANS = 'T' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 110 LAA = LDA*NA * * Generate the matrix A. * IF( TRAN )THEN CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, $ LDA, RESET, ZERO ) ELSE CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, $ RESET, ZERO ) END IF * * Generate the matrix B. * LDB = LDA LBB = LAA IF( TRAN )THEN CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), $ 2*NMAX, BB, LDB, RESET, ZERO ) ELSE CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), $ NMAX, BB, LDB, RESET, ZERO ) END IF * DO 100 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 90 IA = 1, NALF ALPHA = ALF( IA ) * DO 80 IB = 1, NBET BETA = BET( IB ) IF( CONJ )THEN RBETA = REAL( BETA ) BETA = CMPLX( RBETA, RZERO ) END IF NULL = N.LE.0 IF( CONJ ) $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. $ ZERO ).AND.RBETA.EQ.RONE ) * * Generate the matrix C. * CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C, $ NMAX, CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB IF( CONJ )THEN RBETS = RBETA ELSE BETS = BETA END IF DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( CONJ )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, $ TRANS, N, K, ALPHA, LDA, LDB, RBETA, LDC IF( REWI ) $ REWIND NTRA CALL CHER2K( UPLO, TRANS, N, K, ALPHA, AA, $ LDA, BB, LDB, RBETA, CC, LDC ) ELSE IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC IF( REWI ) $ REWIND NTRA CALL CSYR2K( UPLO, TRANS, N, K, ALPHA, AA, $ LDA, BB, LDB, BETA, CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LCE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LCE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB IF( CONJ )THEN ISAME( 10 ) = RBETS.EQ.RBETA ELSE ISAME( 10 ) = BETS.EQ.BETA END IF IF( NULL )THEN ISAME( 11 ) = LCE( CS, CC, LCC ) ELSE ISAME( 11 ) = LCERES( 'HE', UPLO, N, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( CONJ )THEN TRANST = 'C' ELSE TRANST = 'T' END IF JJAB = 1 JC = 1 DO 70 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN DO 50 I = 1, K W( I ) = ALPHA*AB( ( J - 1 )*2* $ NMAX + K + I ) IF( CONJ )THEN W( K + I ) = CONJG( ALPHA )* $ AB( ( J - 1 )*2* $ NMAX + I ) ELSE W( K + I ) = ALPHA* $ AB( ( J - 1 )*2* $ NMAX + I ) END IF 50 CONTINUE CALL CMMCH( TRANST, 'N', LJ, 1, 2*K, $ ONE, AB( JJAB ), 2*NMAX, W, $ 2*NMAX, BETA, C( JJ, J ), $ NMAX, CT, G, CC( JC ), LDC, $ EPS, ERR, FATAL, NOUT, $ .TRUE. 
) ELSE DO 60 I = 1, K IF( CONJ )THEN W( I ) = ALPHA*CONJG( AB( ( K + $ I - 1 )*NMAX + J ) ) W( K + I ) = CONJG( ALPHA* $ AB( ( I - 1 )*NMAX + $ J ) ) ELSE W( I ) = ALPHA*AB( ( K + I - 1 )* $ NMAX + J ) W( K + I ) = ALPHA* $ AB( ( I - 1 )*NMAX + $ J ) END IF 60 CONTINUE CALL CMMCH( 'N', 'N', LJ, 1, 2*K, ONE, $ AB( JJ ), NMAX, W, 2*NMAX, $ BETA, C( JJ, J ), NMAX, CT, $ G, CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 IF( TRAN ) $ JJAB = JJAB + 2*NMAX END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 140 70 CONTINUE END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 160 * 140 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( CONJ )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, $ LDA, LDB, RBETA, LDC ELSE WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, $ LDA, LDB, BETA, LDC END IF * 160 CONTINUE RETURN * 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, $ ', C,', I3, ') .' ) 9993 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, $ ',', F4.1, '), C,', I3, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of CCHK5. * END SUBROUTINE CCHKE( ISNUM, SRNAMT, NOUT ) * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. * ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*8 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Local Scalars .. COMPLEX ALPHA, BETA REAL RALPHA, RBETA * .. Local Arrays .. COMPLEX A( 2, 1 ), B( 2, 1 ), C( 2, 1 ) * .. External Subroutines .. EXTERNAL CGEMM3M, CHEMM, CHER2K, CHERK, CHKXER, CSYMM, $ CSYR2K, CSYRK, CTRMM, CTRSM * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Executable Statements .. * OK is set to .FALSE. by the special version of XERBLA or by CHKXER * if anything is wrong. OK = .TRUE. * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. 
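*
*     Branch on ISNUM: labels 10 to 90 exercise the error exits of
*     CGEMM3M, CHEMM, CSYMM, CTRMM, CTRSM, CHERK, CSYRK, CHER2K and
*     CSYR2K in that order.  Before each deliberately invalid call,
*     INFOT is set to the position of the offending argument and
*     CHKXER checks that the special XERBLA flagged exactly that
*     argument.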
GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, $ 90 )ISNUM 10 INFOT = 1 CALL CGEMM3M( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 1 CALL CGEMM3M( '/', 'C', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 1 CALL CGEMM3M( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CGEMM3M( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CGEMM3M( 'C', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CGEMM3M( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CGEMM3M( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CGEMM3M( 'N', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CGEMM3M( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CGEMM3M( 'C', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CGEMM3M( 'C', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CGEMM3M( 'C', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CGEMM3M( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CGEMM3M( 'T', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CGEMM3M( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CGEMM3M( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CGEMM3M( 'N', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CGEMM3M( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CGEMM3M( 'C', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CGEMM3M( 'C', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CGEMM3M( 'C', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CGEMM3M( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CGEMM3M( 'T', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CGEMM3M( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CGEMM3M( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CGEMM3M( 'N', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CGEMM3M( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CGEMM3M( 'C', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CGEMM3M( 'C', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, 
OK ) INFOT = 5 CALL CGEMM3M( 'C', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CGEMM3M( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CGEMM3M( 'T', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CGEMM3M( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CGEMM3M( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CGEMM3M( 'N', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CGEMM3M( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CGEMM3M( 'C', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CGEMM3M( 'C', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CGEMM3M( 'C', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CGEMM3M( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CGEMM3M( 'T', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CGEMM3M( 'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGEMM3M( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGEMM3M( 'C', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGEMM3M( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGEMM3M( 'N', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGEMM3M( 'C', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGEMM3M( 'T', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGEMM3M( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGEMM3M( 'C', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGEMM3M( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CGEMM3M( 'N', 'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CGEMM3M( 'N', 'C', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CGEMM3M( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CGEMM3M( 'C', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CGEMM3M( 'C', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CGEMM3M( 'C', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CGEMM3M( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CGEMM3M( 'T', 'C', 2, 0, 0, 
ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CGEMM3M( 'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 20 INFOT = 1 CALL CHEMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CHEMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHEMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHEMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHEMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHEMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHEMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHEMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHEMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHEMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHEMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 30 INFOT = 1 CALL CSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, 
INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 40 INFOT = 1 CALL CTRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CTRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CTRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CTRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL 
CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRMM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, 
LERR, OK ) INFOT = 11 CALL CTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 50 INFOT = 1 CALL CTRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CTRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CTRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CTRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CTRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) 
INFOT = 6 CALL CTRSM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CTRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CTRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CTRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 60 INFOT = 1 CALL CHERK( '/', 'N', 0, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CHERK( 'U', 'T', 0, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHERK( 'U', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHERK( 'U', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHERK( 'L', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHERK( 'L', 'C', 
-1, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHERK( 'U', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHERK( 'U', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHERK( 'L', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHERK( 'L', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHERK( 'U', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHERK( 'U', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHERK( 'L', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHERK( 'L', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CHERK( 'U', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CHERK( 'U', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CHERK( 'L', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CHERK( 'L', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 70 INFOT = 1 CALL CSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CSYRK( 'U', 'C', 0, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYRK( 'L', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 80 INFOT = 1 CALL CHER2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL 
CHER2K( 'U', 'T', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHER2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHER2K( 'U', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHER2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHER2K( 'L', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHER2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHER2K( 'U', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHER2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHER2K( 'L', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHER2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHER2K( 'U', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHER2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHER2K( 'L', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHER2K( 'U', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHER2K( 'L', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHER2K( 'U', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHER2K( 'L', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 90 INFOT = 1 CALL CSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CSYR2K( 'U', 'C', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CSYR2K( 'L', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYR2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CSYR2K( 'L', 'T', 0, 
-1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) * 100 IF( OK )THEN WRITE( NOUT, FMT = 9999 )SRNAMT ELSE WRITE( NOUT, FMT = 9998 )SRNAMT END IF RETURN * 9999 FORMAT( ' ', A8, ' PASSED THE TESTS OF ERROR-EXITS' ) 9998 FORMAT( ' ******* ', A8, ' FAILED THE TESTS OF ERROR-EXITS *****', $ '**' ) * * End of CCHKE. * END SUBROUTINE CMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, $ TRANSL ) * * Generates values for an M by N matrix A. * Stores the values in the array AA in the data structure required * by the routine, with unwanted elements set to rogue value. * * TYPE is 'GE', 'HE', 'SY' or 'TR'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) COMPLEX ROGUE PARAMETER ( ROGUE = ( -1.0E10, 1.0E10 ) ) REAL RZERO PARAMETER ( RZERO = 0.0 ) REAL RROGUE PARAMETER ( RROGUE = -1.0E10 ) * .. Scalar Arguments .. COMPLEX TRANSL INTEGER LDA, M, N, NMAX LOGICAL RESET CHARACTER*1 DIAG, UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX A( NMAX, * ), AA( * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J, JJ LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER * .. External Functions .. COMPLEX CBEG EXTERNAL CBEG * .. Intrinsic Functions .. INTRINSIC CMPLX, CONJG, REAL * .. Executable Statements .. GEN = TYPE.EQ.'GE' HER = TYPE.EQ.'HE' SYM = TYPE.EQ.'SY' TRI = TYPE.EQ.'TR' UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' UNIT = TRI.AND.DIAG.EQ.'U' * * Generate data in array A. 
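*
*     A is filled with CBEG values, with Hermitian, symmetric or
*     triangular structure imposed as requested.  The values are then
*     copied into AA in the storage format used by the routine under
*     test, and every element of AA that the routine should never
*     reference is set to the rogue value ROGUE (for Hermitian
*     matrices the imaginary part of the diagonal entries of AA is
*     set to RROGUE), so that LCERES can later detect any write
*     outside the referenced part of the array.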
* DO 20 J = 1, N DO 10 I = 1, M IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) $ THEN A( I, J ) = CBEG( RESET ) + TRANSL IF( I.NE.J )THEN * Set some elements to zero IF( N.GT.3.AND.J.EQ.N/2 ) $ A( I, J ) = ZERO IF( HER )THEN A( J, I ) = CONJG( A( I, J ) ) ELSE IF( SYM )THEN A( J, I ) = A( I, J ) ELSE IF( TRI )THEN A( J, I ) = ZERO END IF END IF END IF 10 CONTINUE IF( HER ) $ A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO ) IF( TRI ) $ A( J, J ) = A( J, J ) + ONE IF( UNIT ) $ A( J, J ) = ONE 20 CONTINUE * * Store elements in array AS in data structure required by routine. * IF( TYPE.EQ.'GE' )THEN DO 50 J = 1, N DO 30 I = 1, M AA( I + ( J - 1 )*LDA ) = A( I, J ) 30 CONTINUE DO 40 I = M + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 40 CONTINUE 50 CONTINUE ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN DO 90 J = 1, N IF( UPPER )THEN IBEG = 1 IF( UNIT )THEN IEND = J - 1 ELSE IEND = J END IF ELSE IF( UNIT )THEN IBEG = J + 1 ELSE IBEG = J END IF IEND = N END IF DO 60 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 60 CONTINUE DO 70 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I, J ) 70 CONTINUE DO 80 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 80 CONTINUE IF( HER )THEN JJ = J + ( J - 1 )*LDA AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) END IF 90 CONTINUE END IF RETURN * * End of CMAKE. * END SUBROUTINE CMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, $ NOUT, MV ) * * Checks the results of the computational tests. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX ZERO PARAMETER ( ZERO = ( 0.0, 0.0 ) ) REAL RZERO, RONE PARAMETER ( RZERO = 0.0, RONE = 1.0 ) * .. Scalar Arguments .. COMPLEX ALPHA, BETA REAL EPS, ERR INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT LOGICAL FATAL, MV CHARACTER*1 TRANSA, TRANSB * .. Array Arguments .. COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ), $ CC( LDCC, * ), CT( * ) REAL G( * ) * .. Local Scalars .. COMPLEX CL REAL ERRI INTEGER I, J, K LOGICAL CTRANA, CTRANB, TRANA, TRANB * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CONJG, MAX, REAL, SQRT * .. Statement Functions .. REAL ABS1 * .. Statement Function definitions .. ABS1( CL ) = ABS( REAL( CL ) ) + ABS( AIMAG( CL ) ) * .. Executable Statements .. TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' CTRANA = TRANSA.EQ.'C' CTRANB = TRANSB.EQ.'C' * * Compute expected result, one column at a time, in CT using data * in A, B and C. * Compute gauges in G. 
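*
*     The gauge G( I ) accumulates the absolute values of the terms
*     contributing to CT( I ), so that the test ratio
*        ERR = max over I of ABS1( CT( I ) - CC( I, J ) )/( EPS*G( I ) )
*     measures the error relative to the rounding that the computation
*     could legitimately incur.  A result is reported as fatal (less
*     than half accurate) when ERR*SQRT( EPS ) reaches one.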
* DO 220 J = 1, N * DO 10 I = 1, M CT( I ) = ZERO G( I ) = RZERO 10 CONTINUE IF( .NOT.TRANA.AND..NOT.TRANB )THEN DO 30 K = 1, KK DO 20 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( K, J ) G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRANA.AND..NOT.TRANB )THEN IF( CTRANA )THEN DO 50 K = 1, KK DO 40 I = 1, M CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( K, J ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( K, J ) ) 40 CONTINUE 50 CONTINUE ELSE DO 70 K = 1, KK DO 60 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( K, J ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( K, J ) ) 60 CONTINUE 70 CONTINUE END IF ELSE IF( .NOT.TRANA.AND.TRANB )THEN IF( CTRANB )THEN DO 90 K = 1, KK DO 80 I = 1, M CT( I ) = CT( I ) + A( I, K )*CONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( I, K ) )* $ ABS1( B( J, K ) ) 80 CONTINUE 90 CONTINUE ELSE DO 110 K = 1, KK DO 100 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( J, K ) G( I ) = G( I ) + ABS1( A( I, K ) )* $ ABS1( B( J, K ) ) 100 CONTINUE 110 CONTINUE END IF ELSE IF( TRANA.AND.TRANB )THEN IF( CTRANA )THEN IF( CTRANB )THEN DO 130 K = 1, KK DO 120 I = 1, M CT( I ) = CT( I ) + CONJG( A( K, I ) )* $ CONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 120 CONTINUE 130 CONTINUE ELSE DO 150 K = 1, KK DO 140 I = 1, M CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( J, K ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 140 CONTINUE 150 CONTINUE END IF ELSE IF( CTRANB )THEN DO 170 K = 1, KK DO 160 I = 1, M CT( I ) = CT( I ) + A( K, I )*CONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 160 CONTINUE 170 CONTINUE ELSE DO 190 K = 1, KK DO 180 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( J, K ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 180 CONTINUE 190 CONTINUE END IF END IF END IF DO 200 I = 1, M CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) G( I ) = ABS1( ALPHA )*G( I ) + $ ABS1( BETA )*ABS1( C( I, J ) ) 200 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO DO 210 I = 1, M ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS IF( G( I ).NE.RZERO ) $ ERRI = ERRI/G( I ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ GO TO 230 210 CONTINUE * 220 CONTINUE * * If the loop completes, all results are at least half accurate. GO TO 250 * * Report fatal error. * 230 FATAL = .TRUE. WRITE( NOUT, FMT = 9999 ) DO 240 I = 1, M IF( MV )THEN WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) ELSE WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) END IF 240 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9997 )J * 250 CONTINUE RETURN * 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', $ 'F ACCURATE *******', /' EXPECTED RE', $ 'SULT COMPUTED RESULT' ) 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) * * End of CMMCH. * END LOGICAL FUNCTION LCE( RI, RJ, LR ) * * Tests if two arrays are identical. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LR * .. Array Arguments .. COMPLEX RI( * ), RJ( * ) * .. Local Scalars .. INTEGER I * .. Executable Statements .. DO 10 I = 1, LR IF( RI( I ).NE.RJ( I ) ) $ GO TO 20 10 CONTINUE LCE = .TRUE. GO TO 30 20 CONTINUE LCE = .FALSE. 30 RETURN * * End of LCE. 
* END LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA ) * * Tests if selected elements in two arrays are equal. * * TYPE is 'GE' or 'HE' or 'SY'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LDA, M, N CHARACTER*1 UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX AA( LDA, * ), AS( LDA, * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL UPPER * .. Executable Statements .. UPPER = UPLO.EQ.'U' IF( TYPE.EQ.'GE' )THEN DO 20 J = 1, N DO 10 I = M + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 10 CONTINUE 20 CONTINUE ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY' )THEN DO 50 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 30 I = 1, IBEG - 1 IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 30 CONTINUE DO 40 I = IEND + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 40 CONTINUE 50 CONTINUE END IF * 60 CONTINUE LCERES = .TRUE. GO TO 80 70 CONTINUE LCERES = .FALSE. 80 RETURN * * End of LCERES. * END COMPLEX FUNCTION CBEG( RESET ) * * Generates complex numbers as pairs of random numbers uniformly * distributed between -0.5 and 0.5. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. LOGICAL RESET * .. Local Scalars .. INTEGER I, IC, J, MI, MJ * .. Save statement .. SAVE I, IC, J, MI, MJ * .. Intrinsic Functions .. INTRINSIC CMPLX * .. Executable Statements .. IF( RESET )THEN * Initialize local variables. MI = 891 MJ = 457 I = 7 J = 7 IC = 0 RESET = .FALSE. END IF * * The sequence of values of I or J is bounded between 1 and 999. * If initial I or J = 1,2,3,6,7 or 9, the period will be 50. * If initial I or J = 4 or 8, the period will be 25. * If initial I or J = 5, the period will be 10. * IC is used to break up the period by skipping 1 value of I or J * in 6. * IC = IC + 1 10 I = I*MI J = J*MJ I = I - 1000*( I/1000 ) J = J - 1000*( J/1000 ) IF( IC.GE.5 )THEN IC = 0 GO TO 10 END IF CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) RETURN * * End of CBEG. * END REAL FUNCTION SDIFF( X, Y ) * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. REAL X, Y * .. Executable Statements .. SDIFF = X - Y RETURN * * End of SDIFF. * END SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) * * Tests whether XERBLA has detected an error when it should. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER INFOT, NOUT LOGICAL LERR, OK CHARACTER*8 SRNAMT * .. Executable Statements .. IF( .NOT.LERR )THEN WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT OK = .FALSE. END IF LERR = .FALSE. RETURN * 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', $ 'ETECTED BY ', A8, ' *****' ) * * End of CHKXER. 
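*
*     CHKXER works together with the special XERBLA that follows: the
*     test driver sets SRNAMT and INFOT to the routine name and the
*     position of the deliberately invalid argument, the stub XERBLA
*     records that it was called (LERR) and compares INFO and SRNAME
*     with the expected values, and CHKXER reports the case in which
*     no error was flagged at all.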
* END SUBROUTINE XERBLA( SRNAME, INFO ) * * This is a special version of XERBLA to be used only as part of * the test program for testing error exits from the Level 3 BLAS * routines. * * XERBLA is an error handler for the Level 3 BLAS routines. * * It is called by the Level 3 BLAS routines if an input parameter is * invalid. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER INFO CHARACTER*8 SRNAME * .. Scalars in Common .. INTEGER INFOT, NOUT LOGICAL LERR, OK CHARACTER*8 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUT, OK, LERR COMMON /SRNAMC/SRNAMT * .. Executable Statements .. LERR = .TRUE. IF( INFO.NE.INFOT )THEN IF( INFOT.NE.0 )THEN WRITE( NOUT, FMT = 9999 )INFO, INFOT ELSE WRITE( NOUT, FMT = 9997 )INFO END IF OK = .FALSE. END IF IF( SRNAME.NE.SRNAMT )THEN WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT OK = .FALSE. END IF RETURN * 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', $ ' OF ', I2, ' *******' ) 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A8, ' INSTE', $ 'AD OF ', A8, ' *******' ) 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, $ ' *******' ) * * End of XERBLA * END OpenBLAS-0.2.20/test/dblat1.f000066400000000000000000000747431313527062700154600ustar00rootroot00000000000000 PROGRAM DBLAT1 * Test program for the DOUBLE PRECISION Level 1 BLAS. * Based upon the original BLAS test routine together with: * F06EAF Example Program Text * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. DOUBLE PRECISION SFAC INTEGER IC * .. External Subroutines .. EXTERNAL CHECK0, CHECK1, CHECK2, CHECK3, HEADER * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA SFAC/9.765625D-4/ * .. Executable Statements .. WRITE (NOUT,99999) DO 20 IC = 1, 10 ICASE = IC CALL HEADER * * .. Initialize PASS, INCX, INCY, and MODE for a new case. .. * .. the value 9999 for INCX, INCY or MODE will appear in the .. * .. detailed output, if any, for cases that do not involve .. * .. these parameters .. * PASS = .TRUE. INCX = 9999 INCY = 9999 MODE = 9999 IF (ICASE.EQ.3) THEN CALL CHECK0(SFAC) ELSE IF (ICASE.EQ.7 .OR. ICASE.EQ.8 .OR. ICASE.EQ.9 .OR. + ICASE.EQ.10) THEN CALL CHECK1(SFAC) ELSE IF (ICASE.EQ.1 .OR. ICASE.EQ.2 .OR. ICASE.EQ.5 .OR. + ICASE.EQ.6) THEN CALL CHECK2(SFAC) ELSE IF (ICASE.EQ.4) THEN CALL CHECK3(SFAC) END IF * -- Print IF (PASS) WRITE (NOUT,99998) 20 CONTINUE STOP * 99999 FORMAT (' Real BLAS Test Program Results',/1X) 99998 FORMAT (' ----- PASS -----') END SUBROUTINE HEADER * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Arrays .. CHARACTER*6 L(10) * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA L(1)/' DDOT '/ DATA L(2)/'DAXPY '/ DATA L(3)/'DROTG '/ DATA L(4)/' DROT '/ DATA L(5)/'DCOPY '/ DATA L(6)/'DSWAP '/ DATA L(7)/'DNRM2 '/ DATA L(8)/'DASUM '/ DATA L(9)/'DSCAL '/ DATA L(10)/'IDAMAX'/ * .. Executable Statements .. WRITE (NOUT,99999) ICASE, L(ICASE) RETURN * 99999 FORMAT (/' Test of subprogram number',I3,12X,A6) END SUBROUTINE CHECK0(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. 
DOUBLE PRECISION SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. DOUBLE PRECISION D12, SA, SB, SC, SS INTEGER K * .. Local Arrays .. DOUBLE PRECISION DA1(8), DATRUE(8), DB1(8), DBTRUE(8), DC1(8), + DS1(8) * .. External Subroutines .. EXTERNAL DROTG, STEST1 * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA DA1/0.3D0, 0.4D0, -0.3D0, -0.4D0, -0.3D0, 0.0D0, + 0.0D0, 1.0D0/ DATA DB1/0.4D0, 0.3D0, 0.4D0, 0.3D0, -0.4D0, 0.0D0, + 1.0D0, 0.0D0/ DATA DC1/0.6D0, 0.8D0, -0.6D0, 0.8D0, 0.6D0, 1.0D0, + 0.0D0, 1.0D0/ DATA DS1/0.8D0, 0.6D0, 0.8D0, -0.6D0, 0.8D0, 0.0D0, + 1.0D0, 0.0D0/ DATA DATRUE/0.5D0, 0.5D0, 0.5D0, -0.5D0, -0.5D0, + 0.0D0, 1.0D0, 1.0D0/ DATA DBTRUE/0.0D0, 0.6D0, 0.0D0, -0.6D0, 0.0D0, + 0.0D0, 1.0D0, 0.0D0/ DATA D12/4096.0D0/ * .. Executable Statements .. * * Compute true values which cannot be prestored * in decimal notation * DBTRUE(1) = 1.0D0/0.6D0 DBTRUE(3) = -1.0D0/0.6D0 DBTRUE(5) = 1.0D0/0.6D0 * DO 20 K = 1, 8 * .. Set N=K for identification in output if any .. N = K IF (ICASE.EQ.3) THEN * .. DROTG .. IF (K.GT.8) GO TO 40 SA = DA1(K) SB = DB1(K) CALL DROTG(SA,SB,SC,SS) CALL STEST1(SA,DATRUE(K),DATRUE(K),SFAC) CALL STEST1(SB,DBTRUE(K),DBTRUE(K),SFAC) CALL STEST1(SC,DC1(K),DC1(K),SFAC) CALL STEST1(SS,DS1(K),DS1(K),SFAC) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' STOP END IF 20 CONTINUE 40 RETURN END SUBROUTINE CHECK1(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. DOUBLE PRECISION SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. INTEGER I, LEN, NP1 * .. Local Arrays .. DOUBLE PRECISION DTRUE1(5), DTRUE3(5), DTRUE5(8,5,2), DV(8,5,2), + SA(10), STEMP(1), STRUE(8), SX(8) INTEGER ITRUE2(5) * .. External Functions .. DOUBLE PRECISION DASUM, DNRM2 INTEGER IDAMAX EXTERNAL DASUM, DNRM2, IDAMAX * .. External Subroutines .. EXTERNAL ITEST1, DSCAL, STEST, STEST1 * .. Intrinsic Functions .. INTRINSIC MAX * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA SA/0.3D0, -1.0D0, 0.0D0, 1.0D0, 0.3D0, 0.3D0, + 0.3D0, 0.3D0, 0.3D0, 0.3D0/ DATA DV/0.1D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, + 2.0D0, 2.0D0, 0.3D0, 3.0D0, 3.0D0, 3.0D0, 3.0D0, + 3.0D0, 3.0D0, 3.0D0, 0.3D0, -0.4D0, 4.0D0, + 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, 0.2D0, + -0.6D0, 0.3D0, 5.0D0, 5.0D0, 5.0D0, 5.0D0, + 5.0D0, 0.1D0, -0.3D0, 0.5D0, -0.1D0, 6.0D0, + 6.0D0, 6.0D0, 6.0D0, 0.1D0, 8.0D0, 8.0D0, 8.0D0, + 8.0D0, 8.0D0, 8.0D0, 8.0D0, 0.3D0, 9.0D0, 9.0D0, + 9.0D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, 0.3D0, 2.0D0, + -0.4D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, + 0.2D0, 3.0D0, -0.6D0, 5.0D0, 0.3D0, 2.0D0, + 2.0D0, 2.0D0, 0.1D0, 4.0D0, -0.3D0, 6.0D0, + -0.5D0, 7.0D0, -0.1D0, 3.0D0/ DATA DTRUE1/0.0D0, 0.3D0, 0.5D0, 0.7D0, 0.6D0/ DATA DTRUE3/0.0D0, 0.3D0, 0.7D0, 1.1D0, 1.0D0/ DATA DTRUE5/0.10D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, + 2.0D0, 2.0D0, 2.0D0, -0.3D0, 3.0D0, 3.0D0, + 3.0D0, 3.0D0, 3.0D0, 3.0D0, 3.0D0, 0.0D0, 0.0D0, + 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, + 0.20D0, -0.60D0, 0.30D0, 5.0D0, 5.0D0, 5.0D0, + 5.0D0, 5.0D0, 0.03D0, -0.09D0, 0.15D0, -0.03D0, + 6.0D0, 6.0D0, 6.0D0, 6.0D0, 0.10D0, 8.0D0, + 8.0D0, 8.0D0, 8.0D0, 8.0D0, 8.0D0, 8.0D0, + 0.09D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, + 9.0D0, 9.0D0, 0.09D0, 2.0D0, -0.12D0, 2.0D0, + 2.0D0, 2.0D0, 2.0D0, 2.0D0, 0.06D0, 3.0D0, + -0.18D0, 5.0D0, 0.09D0, 2.0D0, 2.0D0, 2.0D0, + 0.03D0, 4.0D0, -0.09D0, 6.0D0, -0.15D0, 7.0D0, + -0.03D0, 3.0D0/ DATA ITRUE2/0, 1, 2, 2, 3/ * .. 
Executable Statements .. DO 80 INCX = 1, 2 DO 60 NP1 = 1, 5 N = NP1 - 1 LEN = 2*MAX(N,1) * .. Set vector arguments .. DO 20 I = 1, LEN SX(I) = DV(I,NP1,INCX) 20 CONTINUE * IF (ICASE.EQ.7) THEN * .. DNRM2 .. STEMP(1) = DTRUE1(NP1) CALL STEST1(DNRM2(N,SX,INCX),STEMP,STEMP,SFAC) ELSE IF (ICASE.EQ.8) THEN * .. DASUM .. STEMP(1) = DTRUE3(NP1) CALL STEST1(DASUM(N,SX,INCX),STEMP,STEMP,SFAC) ELSE IF (ICASE.EQ.9) THEN * .. DSCAL .. CALL DSCAL(N,SA((INCX-1)*5+NP1),SX,INCX) DO 40 I = 1, LEN STRUE(I) = DTRUE5(I,NP1,INCX) 40 CONTINUE CALL STEST(LEN,SX,STRUE,STRUE,SFAC) ELSE IF (ICASE.EQ.10) THEN * .. IDAMAX .. CALL ITEST1(IDAMAX(N,SX,INCX),ITRUE2(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' STOP END IF 60 CONTINUE 80 CONTINUE RETURN END SUBROUTINE CHECK2(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. DOUBLE PRECISION SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. DOUBLE PRECISION SA, SC, SS INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY * .. Local Arrays .. DOUBLE PRECISION DT10X(7,4,4), DT10Y(7,4,4), DT7(4,4), + DT8(7,4,4), DT9X(7,4,4), DT9Y(7,4,4), DX1(7), + DY1(7), SSIZE1(4), SSIZE2(14,2), STX(7), STY(7), + SX(7), SY(7) INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) * .. External Functions .. DOUBLE PRECISION DDOT EXTERNAL DDOT * .. External Subroutines .. EXTERNAL DAXPY, DCOPY, DSWAP, STEST, STEST1 * .. Intrinsic Functions .. INTRINSIC ABS, MIN * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA SA/0.3D0/ DATA INCXS/1, 2, -2, -1/ DATA INCYS/1, -2, 1, -2/ DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ DATA NS/0, 1, 2, 4/ DATA DX1/0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.9D0, -0.3D0, + -0.4D0/ DATA DY1/0.5D0, -0.9D0, 0.3D0, 0.7D0, -0.6D0, 0.2D0, + 0.8D0/ DATA SC, SS/0.8D0, 0.6D0/ DATA DT7/0.0D0, 0.30D0, 0.21D0, 0.62D0, 0.0D0, + 0.30D0, -0.07D0, 0.85D0, 0.0D0, 0.30D0, -0.79D0, + -0.74D0, 0.0D0, 0.30D0, 0.33D0, 1.27D0/ DATA DT8/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.68D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.68D0, -0.87D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.68D0, -0.87D0, 0.15D0, + 0.94D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.68D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.35D0, -0.9D0, 0.48D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.38D0, -0.9D0, 0.57D0, 0.7D0, -0.75D0, + 0.2D0, 0.98D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.68D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.35D0, -0.72D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.38D0, + -0.63D0, 0.15D0, 0.88D0, 0.0D0, 0.0D0, 0.0D0, + 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.68D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.68D0, -0.9D0, 0.33D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.68D0, -0.9D0, 0.33D0, 0.7D0, + -0.75D0, 0.2D0, 1.04D0/ DATA DT9X/0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.78D0, -0.46D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.78D0, -0.46D0, -0.22D0, + 1.06D0, 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.78D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.66D0, 0.1D0, -0.1D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.96D0, 0.1D0, -0.76D0, 0.8D0, 0.90D0, + -0.3D0, -0.02D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.78D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.06D0, 0.1D0, + -0.1D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.90D0, + 0.1D0, -0.22D0, 0.8D0, 0.18D0, -0.3D0, -0.02D0, + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.78D0, 0.0D0, 
0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.78D0, 0.26D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.78D0, 0.26D0, -0.76D0, 1.12D0, + 0.0D0, 0.0D0, 0.0D0/ DATA DT9Y/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.54D0, + 0.08D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.04D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.7D0, + -0.9D0, -0.12D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.64D0, -0.9D0, -0.30D0, 0.7D0, -0.18D0, 0.2D0, + 0.28D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.7D0, -1.08D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.64D0, -1.26D0, + 0.54D0, 0.20D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.7D0, + -0.18D0, 0.2D0, 0.16D0/ DATA DT10X/0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.5D0, -0.9D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.5D0, -0.9D0, 0.3D0, 0.7D0, + 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.3D0, 0.1D0, 0.5D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.8D0, 0.1D0, -0.6D0, + 0.8D0, 0.3D0, -0.3D0, 0.5D0, 0.6D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.9D0, + 0.1D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.7D0, + 0.1D0, 0.3D0, 0.8D0, -0.9D0, -0.3D0, 0.5D0, + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.5D0, 0.3D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.5D0, 0.3D0, -0.6D0, 0.8D0, 0.0D0, 0.0D0, + 0.0D0/ DATA DT10Y/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.6D0, 0.1D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.0D0, + 0.0D0, 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, -0.5D0, -0.9D0, 0.6D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, -0.4D0, -0.9D0, 0.9D0, + 0.7D0, -0.5D0, 0.2D0, 0.6D0, 0.5D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.5D0, + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + -0.4D0, 0.9D0, -0.5D0, 0.6D0, 0.0D0, 0.0D0, + 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.6D0, -0.9D0, 0.1D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.6D0, -0.9D0, 0.1D0, 0.7D0, + -0.5D0, 0.2D0, 0.8D0/ DATA SSIZE1/0.0D0, 0.3D0, 1.6D0, 3.2D0/ DATA SSIZE2/0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + 1.17D0, 1.17D0, 1.17D0/ * .. Executable Statements .. * DO 120 KI = 1, 4 INCX = INCXS(KI) INCY = INCYS(KI) MX = ABS(INCX) MY = ABS(INCY) * DO 100 KN = 1, 4 N = NS(KN) KSIZE = MIN(2,KN) LENX = LENS(KN,MX) LENY = LENS(KN,MY) * .. Initialize all argument arrays .. DO 20 I = 1, 7 SX(I) = DX1(I) SY(I) = DY1(I) 20 CONTINUE * IF (ICASE.EQ.1) THEN * .. DDOT .. CALL STEST1(DDOT(N,SX,INCX,SY,INCY),DT7(KN,KI),SSIZE1(KN) + ,SFAC) ELSE IF (ICASE.EQ.2) THEN * .. DAXPY .. 
CALL DAXPY(N,SA,SX,INCX,SY,INCY) DO 40 J = 1, LENY STY(J) = DT8(J,KN,KI) 40 CONTINUE CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) ELSE IF (ICASE.EQ.5) THEN * .. DCOPY .. DO 60 I = 1, 7 STY(I) = DT10Y(I,KN,KI) 60 CONTINUE CALL DCOPY(N,SX,INCX,SY,INCY) CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0) ELSE IF (ICASE.EQ.6) THEN * .. DSWAP .. CALL DSWAP(N,SX,INCX,SY,INCY) DO 80 I = 1, 7 STX(I) = DT10X(I,KN,KI) STY(I) = DT10Y(I,KN,KI) 80 CONTINUE CALL STEST(LENX,SX,STX,SSIZE2(1,1),1.0D0) CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' STOP END IF 100 CONTINUE 120 CONTINUE RETURN END SUBROUTINE CHECK3(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. DOUBLE PRECISION SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. DOUBLE PRECISION SA, SC, SS INTEGER I, K, KI, KN, KSIZE, LENX, LENY, MX, MY * .. Local Arrays .. DOUBLE PRECISION COPYX(5), COPYY(5), DT9X(7,4,4), DT9Y(7,4,4), + DX1(7), DY1(7), MWPC(11), MWPS(11), MWPSTX(5), + MWPSTY(5), MWPTX(11,5), MWPTY(11,5), MWPX(5), + MWPY(5), SSIZE2(14,2), STX(7), STY(7), SX(7), + SY(7) INTEGER INCXS(4), INCYS(4), LENS(4,2), MWPINX(11), + MWPINY(11), MWPN(11), NS(4) * .. External Subroutines .. EXTERNAL DROT, STEST * .. Intrinsic Functions .. INTRINSIC ABS, MIN * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA SA/0.3D0/ DATA INCXS/1, 2, -2, -1/ DATA INCYS/1, -2, 1, -2/ DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ DATA NS/0, 1, 2, 4/ DATA DX1/0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.9D0, -0.3D0, + -0.4D0/ DATA DY1/0.5D0, -0.9D0, 0.3D0, 0.7D0, -0.6D0, 0.2D0, + 0.8D0/ DATA SC, SS/0.8D0, 0.6D0/ DATA DT9X/0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.78D0, -0.46D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.78D0, -0.46D0, -0.22D0, + 1.06D0, 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.78D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.66D0, 0.1D0, -0.1D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.96D0, 0.1D0, -0.76D0, 0.8D0, 0.90D0, + -0.3D0, -0.02D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.78D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.06D0, 0.1D0, + -0.1D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.90D0, + 0.1D0, -0.22D0, 0.8D0, 0.18D0, -0.3D0, -0.02D0, + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.78D0, 0.26D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.78D0, 0.26D0, -0.76D0, 1.12D0, + 0.0D0, 0.0D0, 0.0D0/ DATA DT9Y/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.54D0, + 0.08D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.04D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.7D0, + -0.9D0, -0.12D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.64D0, -0.9D0, -0.30D0, 0.7D0, -0.18D0, 0.2D0, + 0.28D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.7D0, -1.08D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.64D0, -1.26D0, + 0.54D0, 0.20D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.7D0, + -0.18D0, 0.2D0, 0.16D0/ DATA SSIZE2/0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + 0.0D0, 1.17D0, 
1.17D0, 1.17D0, 1.17D0, 1.17D0, + 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + 1.17D0, 1.17D0, 1.17D0/ * .. Executable Statements .. * DO 60 KI = 1, 4 INCX = INCXS(KI) INCY = INCYS(KI) MX = ABS(INCX) MY = ABS(INCY) * DO 40 KN = 1, 4 N = NS(KN) KSIZE = MIN(2,KN) LENX = LENS(KN,MX) LENY = LENS(KN,MY) * IF (ICASE.EQ.4) THEN * .. DROT .. DO 20 I = 1, 7 SX(I) = DX1(I) SY(I) = DY1(I) STX(I) = DT9X(I,KN,KI) STY(I) = DT9Y(I,KN,KI) 20 CONTINUE CALL DROT(N,SX,INCX,SY,INCY,SC,SS) CALL STEST(LENX,SX,STX,SSIZE2(1,KSIZE),SFAC) CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' STOP END IF 40 CONTINUE 60 CONTINUE * MWPC(1) = 1 DO 80 I = 2, 11 MWPC(I) = 0 80 CONTINUE MWPS(1) = 0 DO 100 I = 2, 6 MWPS(I) = 1 100 CONTINUE DO 120 I = 7, 11 MWPS(I) = -1 120 CONTINUE MWPINX(1) = 1 MWPINX(2) = 1 MWPINX(3) = 1 MWPINX(4) = -1 MWPINX(5) = 1 MWPINX(6) = -1 MWPINX(7) = 1 MWPINX(8) = 1 MWPINX(9) = -1 MWPINX(10) = 1 MWPINX(11) = -1 MWPINY(1) = 1 MWPINY(2) = 1 MWPINY(3) = -1 MWPINY(4) = -1 MWPINY(5) = 2 MWPINY(6) = 1 MWPINY(7) = 1 MWPINY(8) = -1 MWPINY(9) = -1 MWPINY(10) = 2 MWPINY(11) = 1 DO 140 I = 1, 11 MWPN(I) = 5 140 CONTINUE MWPN(5) = 3 MWPN(10) = 3 DO 160 I = 1, 5 MWPX(I) = I MWPY(I) = I MWPTX(1,I) = I MWPTY(1,I) = I MWPTX(2,I) = I MWPTY(2,I) = -I MWPTX(3,I) = 6 - I MWPTY(3,I) = I - 6 MWPTX(4,I) = I MWPTY(4,I) = -I MWPTX(6,I) = 6 - I MWPTY(6,I) = I - 6 MWPTX(7,I) = -I MWPTY(7,I) = I MWPTX(8,I) = I - 6 MWPTY(8,I) = 6 - I MWPTX(9,I) = -I MWPTY(9,I) = I MWPTX(11,I) = I - 6 MWPTY(11,I) = 6 - I 160 CONTINUE MWPTX(5,1) = 1 MWPTX(5,2) = 3 MWPTX(5,3) = 5 MWPTX(5,4) = 4 MWPTX(5,5) = 5 MWPTY(5,1) = -1 MWPTY(5,2) = 2 MWPTY(5,3) = -2 MWPTY(5,4) = 4 MWPTY(5,5) = -3 MWPTX(10,1) = -1 MWPTX(10,2) = -3 MWPTX(10,3) = -5 MWPTX(10,4) = 4 MWPTX(10,5) = 5 MWPTY(10,1) = 1 MWPTY(10,2) = 2 MWPTY(10,3) = 2 MWPTY(10,4) = 4 MWPTY(10,5) = 3 DO 200 I = 1, 11 INCX = MWPINX(I) INCY = MWPINY(I) DO 180 K = 1, 5 COPYX(K) = MWPX(K) COPYY(K) = MWPY(K) MWPSTX(K) = MWPTX(I,K) MWPSTY(K) = MWPTY(I,K) 180 CONTINUE CALL DROT(MWPN(I),COPYX,INCX,COPYY,INCY,MWPC(I),MWPS(I)) CALL STEST(5,COPYX,MWPSTX,MWPSTX,SFAC) CALL STEST(5,COPYY,MWPSTY,MWPSTY,SFAC) 200 CONTINUE RETURN END SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) * ********************************* STEST ************************** * * THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO * SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE * NEGLIGIBLE. * * C. L. LAWSON, JPL, 1974 DEC 10 * * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. DOUBLE PRECISION SFAC INTEGER LEN * .. Array Arguments .. DOUBLE PRECISION SCOMP(LEN), SSIZE(LEN), STRUE(LEN) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. DOUBLE PRECISION SD INTEGER I * .. External Functions .. DOUBLE PRECISION SDIFF EXTERNAL SDIFF * .. Intrinsic Functions .. INTRINSIC ABS * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Executable Statements .. * DO 40 I = 1, LEN SD = SCOMP(I) - STRUE(I) IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0D0) + GO TO 40 * * HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). * IF ( .NOT. PASS) GO TO 20 * PRINT FAIL MESSAGE AND HEADER. PASS = .FALSE. 
WRITE (NOUT,99999) WRITE (NOUT,99998) 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + STRUE(I), SD, SSIZE(I) 40 CONTINUE RETURN * 99999 FORMAT (' FAIL') 99998 FORMAT (/' CASE N INCX INCY MODE I ', + ' COMP(I) TRUE(I) DIFFERENCE', + ' SIZE(I)',/1X) 99997 FORMAT (1X,I4,I3,3I5,I3,2D36.8,2D12.4) END SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * * THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * * C.L. LAWSON, JPL, 1978 DEC 6 * * .. Scalar Arguments .. DOUBLE PRECISION SCOMP1, SFAC, STRUE1 * .. Array Arguments .. DOUBLE PRECISION SSIZE(*) * .. Local Arrays .. DOUBLE PRECISION SCOMP(1), STRUE(1) * .. External Subroutines .. EXTERNAL STEST * .. Executable Statements .. * SCOMP(1) = SCOMP1 STRUE(1) = STRUE1 CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) * RETURN END DOUBLE PRECISION FUNCTION SDIFF(SA,SB) * ********************************* SDIFF ************************** * COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 * * .. Scalar Arguments .. DOUBLE PRECISION SA, SB * .. Executable Statements .. SDIFF = SA - SB RETURN END SUBROUTINE ITEST1(ICOMP,ITRUE) * ********************************* ITEST1 ************************* * * THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR * EQUALITY. * C. L. LAWSON, JPL, 1974 DEC 10 * * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. INTEGER ICOMP, ITRUE * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. INTEGER ID * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Executable Statements .. * IF (ICOMP.EQ.ITRUE) GO TO 40 * * HERE ICOMP IS NOT EQUAL TO ITRUE. * IF ( .NOT. PASS) GO TO 20 * PRINT FAIL MESSAGE AND HEADER. PASS = .FALSE. WRITE (NOUT,99999) WRITE (NOUT,99998) 20 ID = ICOMP - ITRUE WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID 40 CONTINUE RETURN * 99999 FORMAT (' FAIL') 99998 FORMAT (/' CASE N INCX INCY MODE ', + ' COMP TRUE DIFFERENCE', + /1X) 99997 FORMAT (1X,I4,I3,3I5,2I36,I12) END OpenBLAS-0.2.20/test/dblat2.dat000066400000000000000000000026721313527062700157740ustar00rootroot00000000000000'DBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE 6 UNIT NUMBER OF SUMMARY FILE 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. F LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 16.0 THRESHOLD VALUE OF TEST RATIO 7 NUMBER OF VALUES OF N 0 1 2 3 7 31 63 VALUES OF N 4 NUMBER OF VALUES OF K 0 1 2 4 VALUES OF K 4 NUMBER OF VALUES OF INCX AND INCY 1 2 -1 -2 VALUES OF INCX AND INCY 3 NUMBER OF VALUES OF ALPHA 0.0 1.0 0.7 VALUES OF ALPHA 3 NUMBER OF VALUES OF BETA 0.0 1.0 0.9 VALUES OF BETA DGEMV T PUT F FOR NO TEST. SAME COLUMNS. DGBMV T PUT F FOR NO TEST. SAME COLUMNS. DSYMV T PUT F FOR NO TEST. SAME COLUMNS. DSBMV T PUT F FOR NO TEST. SAME COLUMNS. DSPMV T PUT F FOR NO TEST. SAME COLUMNS. DTRMV T PUT F FOR NO TEST. SAME COLUMNS. DTBMV T PUT F FOR NO TEST. SAME COLUMNS. DTPMV T PUT F FOR NO TEST. SAME COLUMNS. DTRSV T PUT F FOR NO TEST. SAME COLUMNS. DTBSV T PUT F FOR NO TEST. SAME COLUMNS. DTPSV T PUT F FOR NO TEST. SAME COLUMNS. DGER T PUT F FOR NO TEST. SAME COLUMNS. DSYR T PUT F FOR NO TEST. SAME COLUMNS. DSPR T PUT F FOR NO TEST. SAME COLUMNS. DSYR2 T PUT F FOR NO TEST. SAME COLUMNS. 
DSPR2 T PUT F FOR NO TEST. SAME COLUMNS. OpenBLAS-0.2.20/test/dblat2.f000066400000000000000000003314341313527062700154520ustar00rootroot00000000000000 PROGRAM DBLAT2 * * Test program for the DOUBLE PRECISION Level 2 Blas. * * The program must be driven by a short data file. The first 18 records * of the file are read using list-directed input, the last 16 records * are read using the format ( A6, L2 ). An annotated example of a data * file can be obtained by deleting the first 3 characters from the * following 34 lines: * 'DBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE * 6 UNIT NUMBER OF SUMMARY FILE * 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 16.0 THRESHOLD VALUE OF TEST RATIO * 6 NUMBER OF VALUES OF N * 0 1 2 3 5 9 VALUES OF N * 4 NUMBER OF VALUES OF K * 0 1 2 4 VALUES OF K * 4 NUMBER OF VALUES OF INCX AND INCY * 1 2 -1 -2 VALUES OF INCX AND INCY * 3 NUMBER OF VALUES OF ALPHA * 0.0 1.0 0.7 VALUES OF ALPHA * 3 NUMBER OF VALUES OF BETA * 0.0 1.0 0.9 VALUES OF BETA * DGEMV T PUT F FOR NO TEST. SAME COLUMNS. * DGBMV T PUT F FOR NO TEST. SAME COLUMNS. * DSYMV T PUT F FOR NO TEST. SAME COLUMNS. * DSBMV T PUT F FOR NO TEST. SAME COLUMNS. * DSPMV T PUT F FOR NO TEST. SAME COLUMNS. * DTRMV T PUT F FOR NO TEST. SAME COLUMNS. * DTBMV T PUT F FOR NO TEST. SAME COLUMNS. * DTPMV T PUT F FOR NO TEST. SAME COLUMNS. * DTRSV T PUT F FOR NO TEST. SAME COLUMNS. * DTBSV T PUT F FOR NO TEST. SAME COLUMNS. * DTPSV T PUT F FOR NO TEST. SAME COLUMNS. * DGER T PUT F FOR NO TEST. SAME COLUMNS. * DSYR T PUT F FOR NO TEST. SAME COLUMNS. * DSPR T PUT F FOR NO TEST. SAME COLUMNS. * DSYR2 T PUT F FOR NO TEST. SAME COLUMNS. * DSPR2 T PUT F FOR NO TEST. SAME COLUMNS. * * See: * * Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. * An extended set of Fortran Basic Linear Algebra Subprograms. * * Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics * and Computer Science Division, Argonne National Laboratory, * 9700 South Cass Avenue, Argonne, Illinois 60439, US. * * Or * * NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms * Group Ltd., NAG Central Office, 256 Banbury Road, Oxford * OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st * Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. * * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 16 ) DOUBLE PRECISION ZERO, HALF, ONE PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, $ NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. DOUBLE PRECISION EPS, ERR, THRESH INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, $ NOUT, NTRA LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, $ TSTERR CHARACTER*1 TRANS CHARACTER*6 SNAMET CHARACTER*32 SNAPS, SUMMRY * .. Local Arrays .. DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), $ G( NMAX ), X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( 2*NMAX ) INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) LOGICAL LTEST( NSUBS ) CHARACTER*6 SNAMES( NSUBS ) * .. External Functions .. 
DOUBLE PRECISION DDIFF LOGICAL LDE EXTERNAL DDIFF, LDE * .. External Subroutines .. EXTERNAL DCHK1, DCHK2, DCHK3, DCHK4, DCHK5, DCHK6, $ DCHKE, DMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR COMMON /SRNAMC/SRNAMT * .. Data statements .. DATA SNAMES/'DGEMV ', 'DGBMV ', 'DSYMV ', 'DSBMV ', $ 'DSPMV ', 'DTRMV ', 'DTBMV ', 'DTPMV ', $ 'DTRSV ', 'DTBSV ', 'DTPSV ', 'DGER ', $ 'DSYR ', 'DSPR ', 'DSYR2 ', 'DSPR2 '/ * .. Executable Statements .. * * Read name and unit number for summary output file and open file. * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. * READ( NIN, FMT = * )SNAPS READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI REWI = REWI.AND.TRACE * Read the flag that directs stopping on any failure. READ( NIN, FMT = * )SFATAL * Read the flag that indicates whether error exits are to be tested. READ( NIN, FMT = * )TSTERR * Read the threshold value of the test ratio READ( NIN, FMT = * )THRESH * * Read and check the parameter values for the tests. * * Values of N READ( NIN, FMT = * )NIDIM IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN WRITE( NOUT, FMT = 9997 )'N', NIDMAX GO TO 230 END IF READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) DO 10 I = 1, NIDIM IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN WRITE( NOUT, FMT = 9996 )NMAX GO TO 230 END IF 10 CONTINUE * Values of K READ( NIN, FMT = * )NKB IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN WRITE( NOUT, FMT = 9997 )'K', NKBMAX GO TO 230 END IF READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) DO 20 I = 1, NKB IF( KB( I ).LT.0 )THEN WRITE( NOUT, FMT = 9995 ) GO TO 230 END IF 20 CONTINUE * Values of INCX and INCY READ( NIN, FMT = * )NINC IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX GO TO 230 END IF READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) DO 30 I = 1, NINC IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN WRITE( NOUT, FMT = 9994 )INCMAX GO TO 230 END IF 30 CONTINUE * Values of ALPHA READ( NIN, FMT = * )NALF IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX GO TO 230 END IF READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) * Values of BETA READ( NIN, FMT = * )NBET IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX GO TO 230 END IF READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) * * Report values of parameters. * WRITE( NOUT, FMT = 9993 ) WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) IF( .NOT.TSTERR )THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )THRESH WRITE( NOUT, FMT = * ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 40 I = 1, NSUBS LTEST( I ) = .FALSE. 40 CONTINUE 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT DO 60 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET STOP 70 LTEST( I ) = LTESTT GO TO 50 * 80 CONTINUE CLOSE ( NIN ) * * Compute EPS (the machine precision). 
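*
*     EPS is found by repeatedly halving a trial value until ONE + EPS
*     is no longer distinguishable from ONE, and then doubling back.
*     The subtraction is performed in the separate function DDIFF,
*     apparently so that the comparison cannot be simplified away by
*     the compiler.  The resulting value is used as the relative
*     machine precision in all test ratios compared against THRESH.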
* EPS = ONE 90 CONTINUE IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) $ GO TO 100 EPS = HALF*EPS GO TO 90 100 CONTINUE EPS = EPS + EPS WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of DMVCH using exact data. * N = MIN( 32, NMAX ) DO 120 J = 1, N DO 110 I = 1, N A( I, J ) = MAX( I - J + 1, 0 ) 110 CONTINUE X( J ) = J Y( J ) = ZERO 120 CONTINUE DO 130 J = 1, N YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 130 CONTINUE * YY holds the exact result. On exit from DMVCH YT holds * the result computed by DMVCH. TRANS = 'N' CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LDE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR STOP END IF TRANS = 'T' CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LDE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR STOP END IF * * Test each subroutine in turn. * DO 210 ISNUM = 1, NSUBS WRITE( NOUT, FMT = * ) IF( .NOT.LTEST( ISNUM ) )THEN * Subprogram is not to be tested. WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) ELSE SRNAMT = SNAMES( ISNUM ) * Test error exits. IF( TSTERR )THEN CALL DCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) WRITE( NOUT, FMT = * ) END IF * Test computations. INFOT = 0 OK = .TRUE. FATAL = .FALSE. GO TO ( 140, 140, 150, 150, 150, 160, 160, $ 160, 160, 160, 160, 170, 180, 180, $ 190, 190 )ISNUM * Test DGEMV, 01, and DGBMV, 02. 140 CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G ) GO TO 200 * Test DSYMV, 03, DSBMV, 04, and DSPMV, 05. 150 CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G ) GO TO 200 * Test DTRMV, 06, DTBMV, 07, DTPMV, 08, * DTRSV, 09, DTBSV, 10, and DTPSV, 11. 160 CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z ) GO TO 200 * Test DGER, 12. 170 CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z ) GO TO 200 * Test DSYR, 13, and DSPR, 14. 180 CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z ) GO TO 200 * Test DSYR2, 15, and DSPR2, 16. 
190 CALL DCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z ) * 200 IF( FATAL.AND.SFATAL ) $ GO TO 220 END IF 210 CONTINUE WRITE( NOUT, FMT = 9982 ) GO TO 240 * 220 CONTINUE WRITE( NOUT, FMT = 9981 ) GO TO 240 * 230 CONTINUE WRITE( NOUT, FMT = 9987 ) * 240 CONTINUE IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) STOP * 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', $ 'S THAN', F8.2 ) 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 ) 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', $ 'THAN ', I2 ) 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', $ I2 ) 9993 FORMAT( ' TESTS OF THE DOUBLE PRECISION LEVEL 2 BLAS', //' THE F', $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) 9992 FORMAT( ' FOR N ', 9I6 ) 9991 FORMAT( ' FOR K ', 7I6 ) 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) 9989 FORMAT( ' FOR ALPHA ', 7F6.1 ) 9988 FORMAT( ' FOR BETA ', 7F6.1 ) 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', $ /' ******* TESTS ABANDONED *******' ) 9986 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', $ 'ESTS ABANDONED *******' ) 9985 FORMAT( ' ERROR IN DMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', $ 'ATED WRONGLY.', /' DMVCH WAS CALLED WITH TRANS = ', A1, $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' $ , /' ******* TESTS ABANDONED *******' ) 9984 FORMAT( A6, L2 ) 9983 FORMAT( 1X, A6, ' WAS NOT TESTED' ) 9982 FORMAT( /' END OF TESTS' ) 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) * * End of DBLAT2. * END SUBROUTINE DCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, $ XS, Y, YY, YS, YT, G ) * * Tests DGEMV and DGBMV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. DOUBLE PRECISION ZERO, HALF PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, $ NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), $ X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, $ NL, NS LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN CHARACTER*1 TRANS, TRANSS CHARACTER*3 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL DGBMV, DGEMV, DMAKE, DMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'NTC'/ * .. 
Executable Statements .. FULL = SNAME( 3: 3 ).EQ.'E' BANDED = SNAME( 3: 3 ).EQ.'B' * Define the number of arguments. IF( FULL )THEN NARGS = 11 ELSE IF( BANDED )THEN NARGS = 13 END IF * NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 120 IN = 1, NIDIM N = IDIM( IN ) ND = N/2 + 1 * DO 110 IM = 1, 2 IF( IM.EQ.1 ) $ M = MAX( N - ND, 0 ) IF( IM.EQ.2 ) $ M = MIN( N + ND, NMAX ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IKU = 1, NK IF( BANDED )THEN KU = KB( IKU ) KL = MAX( KU - 1, 0 ) ELSE KU = N - 1 KL = M - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = KL + KU + 1 ELSE LDA = M END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 LAA = LDA*N NULL = N.LE.0.OR.M.LE.0 * * Generate the matrix A. * TRANSL = ZERO CALL DMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, AA, $ LDA, KL, KU, RESET, TRANSL ) * DO 90 IC = 1, 3 TRANS = ICH( IC: IC ) TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' * IF( TRAN )THEN ML = N NL = M ELSE ML = M NL = N END IF * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*NL * * Generate the vector X. * TRANSL = HALF CALL DMAKE( 'GE', ' ', ' ', 1, NL, X, 1, XX, $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) IF( NL.GT.1 )THEN X( NL/2 ) = ZERO XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO END IF * DO 70 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*ML * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the vector Y. * TRANSL = ZERO CALL DMAKE( 'GE', ' ', ' ', 1, ML, Y, 1, $ YY, ABS( INCY ), 0, ML - 1, $ RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * TRANSS = TRANS MS = M NS = N KLS = KL KUS = KU ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX BLS = BETA DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ TRANS, M, N, ALPHA, LDA, INCX, BETA, $ INCY IF( REWI ) $ REWIND NTRA CALL DGEMV( TRANS, M, N, ALPHA, AA, $ LDA, XX, INCX, BETA, YY, $ INCY ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ TRANS, M, N, KL, KU, ALPHA, LDA, $ INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL DGBMV( TRANS, M, N, KL, KU, ALPHA, $ AA, LDA, XX, INCX, BETA, $ YY, INCY ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 130 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = TRANS.EQ.TRANSS ISAME( 2 ) = MS.EQ.M ISAME( 3 ) = NS.EQ.N IF( FULL )THEN ISAME( 4 ) = ALS.EQ.ALPHA ISAME( 5 ) = LDE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA ISAME( 7 ) = LDE( XS, XX, LX ) ISAME( 8 ) = INCXS.EQ.INCX ISAME( 9 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 10 ) = LDE( YS, YY, LY ) ELSE ISAME( 10 ) = LDERES( 'GE', ' ', 1, $ ML, YS, YY, $ ABS( INCY ) ) END IF ISAME( 11 ) = INCYS.EQ.INCY ELSE IF( BANDED )THEN ISAME( 4 ) = KLS.EQ.KL ISAME( 5 ) = KUS.EQ.KU ISAME( 6 ) = ALS.EQ.ALPHA ISAME( 7 ) = LDE( AS, AA, LAA ) ISAME( 8 ) = LDAS.EQ.LDA ISAME( 9 ) = LDE( XS, XX, LX ) ISAME( 10 ) = INCXS.EQ.INCX ISAME( 11 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 12 ) = LDE( YS, YY, LY ) ELSE ISAME( 12 ) = LDERES( 'GE', ' ', 1, $ ML, YS, YY, $ ABS( INCY ) ) END IF ISAME( 13 ) = INCYS.EQ.INCY END IF * * If data was incorrectly changed, report * and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. 
GO TO 130 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL DMVCH( TRANS, M, N, ALPHA, A, $ NMAX, X, INCX, BETA, Y, $ INCY, YT, G, YY, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 130 ELSE * Avoid repeating tests with M.le.0 or * N.le.0. GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 140 * 130 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, TRANS, M, N, ALPHA, LDA, $ INCX, BETA, INCY ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANS, M, N, KL, KU, $ ALPHA, LDA, INCX, BETA, INCY END IF * 140 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 4( I3, ',' ), F4.1, $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), F4.1, $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, $ ') .' ) 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK1. * END SUBROUTINE DCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, $ XS, Y, YY, YS, YT, G ) * * Tests DSYMV, DSBMV and DSPMV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. DOUBLE PRECISION ZERO, HALF PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, $ NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), $ X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, $ N, NARGS, NC, NK, NS LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME CHARACTER*1 UPLO, UPLOS CHARACTER*2 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL DMAKE, DMVCH, DSBMV, DSPMV, DSYMV * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 3: 3 ).EQ.'Y' BANDED = SNAME( 3: 3 ).EQ.'B' PACKED = SNAME( 3: 3 ).EQ.'P' * Define the number of arguments. IF( FULL )THEN NARGS = 10 ELSE IF( BANDED )THEN NARGS = 11 ELSE IF( PACKED )THEN NARGS = 9 END IF * NC = 0 RESET = .TRUE. 
ERRMAX = ZERO * DO 110 IN = 1, NIDIM N = IDIM( IN ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IK = 1, NK IF( BANDED )THEN K = KB( IK ) ELSE K = N - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = K + 1 ELSE LDA = N END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF NULL = N.LE.0 * DO 90 IC = 1, 2 UPLO = ICH( IC: IC ) * * Generate the matrix A. * TRANSL = ZERO CALL DMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, AA, $ LDA, K, K, RESET, TRANSL ) * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL DMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 70 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the vector Y. * TRANSL = ZERO CALL DMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, $ TRANSL ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * UPLOS = UPLO NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX BLS = BETA DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ UPLO, N, ALPHA, LDA, INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL DSYMV( UPLO, N, ALPHA, AA, LDA, XX, $ INCX, BETA, YY, INCY ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ UPLO, N, K, ALPHA, LDA, INCX, BETA, $ INCY IF( REWI ) $ REWIND NTRA CALL DSBMV( UPLO, N, K, ALPHA, AA, LDA, $ XX, INCX, BETA, YY, INCY ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ UPLO, N, ALPHA, INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL DSPMV( UPLO, N, ALPHA, AA, XX, INCX, $ BETA, YY, INCY ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N IF( FULL )THEN ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LDE( AS, AA, LAA ) ISAME( 5 ) = LDAS.EQ.LDA ISAME( 6 ) = LDE( XS, XX, LX ) ISAME( 7 ) = INCXS.EQ.INCX ISAME( 8 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 9 ) = LDE( YS, YY, LY ) ELSE ISAME( 9 ) = LDERES( 'GE', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 10 ) = INCYS.EQ.INCY ELSE IF( BANDED )THEN ISAME( 3 ) = KS.EQ.K ISAME( 4 ) = ALS.EQ.ALPHA ISAME( 5 ) = LDE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA ISAME( 7 ) = LDE( XS, XX, LX ) ISAME( 8 ) = INCXS.EQ.INCX ISAME( 9 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 10 ) = LDE( YS, YY, LY ) ELSE ISAME( 10 ) = LDERES( 'GE', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 11 ) = INCYS.EQ.INCY ELSE IF( PACKED )THEN ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LDE( AS, AA, LAA ) ISAME( 5 ) = LDE( XS, XX, LX ) ISAME( 6 ) = INCXS.EQ.INCX ISAME( 7 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 8 ) = LDE( YS, YY, LY ) ELSE ISAME( 8 ) = LDERES( 'GE', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 9 ) = INCYS.EQ.INCY END IF * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. 
GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL DMVCH( 'N', N, N, ALPHA, A, NMAX, X, $ INCX, BETA, Y, INCY, YT, G, $ YY, EPS, ERR, FATAL, NOUT, $ .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 120 ELSE * Avoid repeating tests with N.le.0 GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, LDA, INCX, $ BETA, INCY ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, K, ALPHA, LDA, $ INCX, BETA, INCY ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, N, ALPHA, INCX, $ BETA, INCY END IF * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', AP', $ ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), F4.1, $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, $ ') .' ) 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', A,', $ I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK2. * END SUBROUTINE DCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z ) * * Tests DTRMV, DTBMV, DTPMV, DTRSV, DTBSV and DTPSV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. DOUBLE PRECISION ZERO, HALF, ONE PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), $ XS( NMAX*INCMAX ), XT( NMAX ), $ XX( NMAX*INCMAX ), Z( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. DOUBLE PRECISION ERR, ERRMAX, TRANSL INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS CHARACTER*2 ICHD, ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL DMAKE, DMVCH, DTBMV, DTBSV, DTPMV, DTPSV, $ DTRMV, DTRSV * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ * .. Executable Statements .. 
FULL = SNAME( 3: 3 ).EQ.'R' BANDED = SNAME( 3: 3 ).EQ.'B' PACKED = SNAME( 3: 3 ).EQ.'P' * Define the number of arguments. IF( FULL )THEN NARGS = 8 ELSE IF( BANDED )THEN NARGS = 9 ELSE IF( PACKED )THEN NARGS = 7 END IF * NC = 0 RESET = .TRUE. ERRMAX = ZERO * Set up zero vector for DMVCH. DO 10 I = 1, NMAX Z( I ) = ZERO 10 CONTINUE * DO 110 IN = 1, NIDIM N = IDIM( IN ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IK = 1, NK IF( BANDED )THEN K = KB( IK ) ELSE K = N - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = K + 1 ELSE LDA = N END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF NULL = N.LE.0 * DO 90 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * DO 80 ICT = 1, 3 TRANS = ICHT( ICT: ICT ) * DO 70 ICD = 1, 2 DIAG = ICHD( ICD: ICD ) * * Generate the matrix A. * TRANSL = ZERO CALL DMAKE( SNAME( 2: 3 ), UPLO, DIAG, N, N, A, $ NMAX, AA, LDA, K, K, RESET, TRANSL ) * DO 60 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL DMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, $ ABS( INCX ), 0, N - 1, RESET, $ TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS DIAGS = DIAG NS = N KS = K DO 20 I = 1, LAA AS( I ) = AA( I ) 20 CONTINUE LDAS = LDA DO 30 I = 1, LX XS( I ) = XX( I ) 30 CONTINUE INCXS = INCX * * Call the subroutine. * IF( SNAME( 4: 5 ).EQ.'MV' )THEN IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ UPLO, TRANS, DIAG, N, LDA, INCX IF( REWI ) $ REWIND NTRA CALL DTRMV( UPLO, TRANS, DIAG, N, AA, LDA, $ XX, INCX ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ UPLO, TRANS, DIAG, N, K, LDA, INCX IF( REWI ) $ REWIND NTRA CALL DTBMV( UPLO, TRANS, DIAG, N, K, AA, $ LDA, XX, INCX ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ UPLO, TRANS, DIAG, N, INCX IF( REWI ) $ REWIND NTRA CALL DTPMV( UPLO, TRANS, DIAG, N, AA, XX, $ INCX ) END IF ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ UPLO, TRANS, DIAG, N, LDA, INCX IF( REWI ) $ REWIND NTRA CALL DTRSV( UPLO, TRANS, DIAG, N, AA, LDA, $ XX, INCX ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ UPLO, TRANS, DIAG, N, K, LDA, INCX IF( REWI ) $ REWIND NTRA CALL DTBSV( UPLO, TRANS, DIAG, N, K, AA, $ LDA, XX, INCX ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ UPLO, TRANS, DIAG, N, INCX IF( REWI ) $ REWIND NTRA CALL DTPSV( UPLO, TRANS, DIAG, N, AA, XX, $ INCX ) END IF END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. 
* ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = TRANS.EQ.TRANSS ISAME( 3 ) = DIAG.EQ.DIAGS ISAME( 4 ) = NS.EQ.N IF( FULL )THEN ISAME( 5 ) = LDE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 7 ) = LDE( XS, XX, LX ) ELSE ISAME( 7 ) = LDERES( 'GE', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 8 ) = INCXS.EQ.INCX ELSE IF( BANDED )THEN ISAME( 5 ) = KS.EQ.K ISAME( 6 ) = LDE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 8 ) = LDE( XS, XX, LX ) ELSE ISAME( 8 ) = LDERES( 'GE', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 9 ) = INCXS.EQ.INCX ELSE IF( PACKED )THEN ISAME( 5 ) = LDE( AS, AA, LAA ) IF( NULL )THEN ISAME( 6 ) = LDE( XS, XX, LX ) ELSE ISAME( 6 ) = LDERES( 'GE', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 7 ) = INCXS.EQ.INCX END IF * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN IF( SNAME( 4: 5 ).EQ.'MV' )THEN * * Check the result. * CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, $ INCX, ZERO, Z, INCX, XT, G, $ XX, EPS, ERR, FATAL, NOUT, $ .TRUE. ) ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN * * Compute approximation to original vector. * DO 50 I = 1, N Z( I ) = XX( 1 + ( I - 1 )* $ ABS( INCX ) ) XX( 1 + ( I - 1 )*ABS( INCX ) ) $ = X( I ) 50 CONTINUE CALL DMVCH( TRANS, N, N, ONE, A, NMAX, Z, $ INCX, ZERO, X, INCX, XT, G, $ XX, EPS, ERR, FATAL, NOUT, $ .FALSE. ) END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 120 ELSE * Avoid repeating tests with N.le.0. GO TO 110 END IF * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, DIAG, N, LDA, $ INCX ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, DIAG, N, K, $ LDA, INCX ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, TRANS, DIAG, N, INCX END IF * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', AP, ', $ 'X,', I2, ') .' ) 9994 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), 2( I3, ',' ), $ ' A,', I3, ', X,', I2, ') .' ) 9993 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', A,', $ I3, ', X,', I2, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK3. * END SUBROUTINE DCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z ) * * Tests DGER. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. DOUBLE PRECISION ZERO, HALF, ONE PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) * .. Scalar Arguments .. 
DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX, TRANSL INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, $ NC, ND, NS LOGICAL NULL, RESET, SAME * .. Local Arrays .. DOUBLE PRECISION W( 1 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL DGER, DMAKE, DMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Executable Statements .. * Define the number of arguments. NARGS = 9 * NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 120 IN = 1, NIDIM N = IDIM( IN ) ND = N/2 + 1 * DO 110 IM = 1, 2 IF( IM.EQ.1 ) $ M = MAX( N - ND, 0 ) IF( IM.EQ.2 ) $ M = MIN( N + ND, NMAX ) * * Set LDA to 1 more than minimum value if room. LDA = M IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 110 LAA = LDA*N NULL = N.LE.0.OR.M.LE.0 * DO 100 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*M * * Generate the vector X. * TRANSL = HALF CALL DMAKE( 'GE', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), $ 0, M - 1, RESET, TRANSL ) IF( M.GT.1 )THEN X( M/2 ) = ZERO XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO END IF * DO 90 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * * Generate the vector Y. * TRANSL = ZERO CALL DMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN Y( N/2 ) = ZERO YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO END IF * DO 80 IA = 1, NALF ALPHA = ALF( IA ) * * Generate the matrix A. * TRANSL = ZERO CALL DMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * MS = M NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, $ ALPHA, INCX, INCY, LDA IF( REWI ) $ REWIND NTRA CALL DGER( M, N, ALPHA, XX, INCX, YY, INCY, AA, $ LDA ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 140 END IF * * See what data changed inside subroutine. * ISAME( 1 ) = MS.EQ.M ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LDE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX ISAME( 6 ) = LDE( YS, YY, LY ) ISAME( 7 ) = INCYS.EQ.INCY IF( NULL )THEN ISAME( 8 ) = LDE( AS, AA, LAA ) ELSE ISAME( 8 ) = LDERES( 'GE', ' ', M, N, AS, AA, $ LDA ) END IF ISAME( 9 ) = LDAS.EQ.LDA * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 140 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. 
* IF( INCX.GT.0 )THEN DO 50 I = 1, M Z( I ) = X( I ) 50 CONTINUE ELSE DO 60 I = 1, M Z( I ) = X( M - I + 1 ) 60 CONTINUE END IF DO 70 J = 1, N IF( INCY.GT.0 )THEN W( 1 ) = Y( J ) ELSE W( 1 ) = Y( N - J + 1 ) END IF CALL DMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, $ ONE, A( 1, J ), 1, YT, G, $ AA( 1 + ( J - 1 )*LDA ), EPS, $ ERR, FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 130 70 CONTINUE ELSE * Avoid repeating tests with M.le.0 or N.le.0. GO TO 110 END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 150 * 130 CONTINUE WRITE( NOUT, FMT = 9995 )J * 140 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA * 150 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( I3, ',' ), F4.1, ', X,', I2, $ ', Y,', I2, ', A,', I3, ') .' ) 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK4. * END SUBROUTINE DCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z ) * * Tests DSYR and DSPR. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. DOUBLE PRECISION ZERO, HALF, ONE PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX, TRANSL INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER CHARACTER*1 UPLO, UPLOS CHARACTER*2 ICH * .. Local Arrays .. DOUBLE PRECISION W( 1 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL DMAKE, DMVCH, DSPR, DSYR * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 3: 3 ).EQ.'Y' PACKED = SNAME( 3: 3 ).EQ.'P' * Define the number of arguments. IF( FULL )THEN NARGS = 7 ELSE IF( PACKED )THEN NARGS = 6 END IF * NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDA to 1 more than minimum value if room. LDA = N IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. 
IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF * DO 90 IC = 1, 2 UPLO = ICH( IC: IC ) UPPER = UPLO.EQ.'U' * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL DMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), $ 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 70 IA = 1, NALF ALPHA = ALF( IA ) NULL = N.LE.0.OR.ALPHA.EQ.ZERO * * Generate the matrix A. * TRANSL = ZERO CALL DMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, $ ALPHA, INCX, LDA IF( REWI ) $ REWIND NTRA CALL DSYR( UPLO, N, ALPHA, XX, INCX, AA, LDA ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, $ ALPHA, INCX IF( REWI ) $ REWIND NTRA CALL DSPR( UPLO, N, ALPHA, XX, INCX, AA ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LDE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX IF( NULL )THEN ISAME( 6 ) = LDE( AS, AA, LAA ) ELSE ISAME( 6 ) = LDERES( SNAME( 2: 3 ), UPLO, N, N, AS, $ AA, LDA ) END IF IF( .NOT.PACKED )THEN ISAME( 7 ) = LDAS.EQ.LDA END IF * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 30 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 30 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( INCX.GT.0 )THEN DO 40 I = 1, N Z( I ) = X( I ) 40 CONTINUE ELSE DO 50 I = 1, N Z( I ) = X( N - I + 1 ) 50 CONTINUE END IF JA = 1 DO 60 J = 1, N W( 1 ) = Z( J ) IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF CALL DMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, $ 1, ONE, A( JJ, J ), 1, YT, G, $ AA( JA ), EPS, ERR, FATAL, NOUT, $ .TRUE. ) IF( FULL )THEN IF( UPPER )THEN JA = JA + LDA ELSE JA = JA + LDA + 1 END IF ELSE JA = JA + LJ END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 110 60 CONTINUE ELSE * Avoid repeating tests if N.le.0. IF( N.LE.0 ) $ GO TO 100 END IF * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 110 CONTINUE WRITE( NOUT, FMT = 9995 )J * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX, LDA ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX END IF * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', $ I2, ', AP) .' ) 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', $ I2, ', A,', I3, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK5. * END SUBROUTINE DCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z ) * * Tests DSYR2 and DSPR2. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. DOUBLE PRECISION ZERO, HALF, ONE PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX, TRANSL INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, $ NARGS, NC, NS LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER CHARACTER*1 UPLO, UPLOS CHARACTER*2 ICH * .. Local Arrays .. DOUBLE PRECISION W( 2 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL DMAKE, DMVCH, DSPR2, DSYR2 * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 3: 3 ).EQ.'Y' PACKED = SNAME( 3: 3 ).EQ.'P' * Define the number of arguments. IF( FULL )THEN NARGS = 9 ELSE IF( PACKED )THEN NARGS = 8 END IF * NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 140 IN = 1, NIDIM N = IDIM( IN ) * Set LDA to 1 more than minimum value if room. LDA = N IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 140 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF * DO 130 IC = 1, 2 UPLO = ICH( IC: IC ) UPPER = UPLO.EQ.'U' * DO 120 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. 
* TRANSL = HALF CALL DMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), $ 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 110 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * * Generate the vector Y. * TRANSL = ZERO CALL DMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN Y( N/2 ) = ZERO YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO END IF * DO 100 IA = 1, NALF ALPHA = ALF( IA ) NULL = N.LE.0.OR.ALPHA.EQ.ZERO * * Generate the matrix A. * TRANSL = ZERO CALL DMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, $ NMAX, AA, LDA, N - 1, N - 1, RESET, $ TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, $ ALPHA, INCX, INCY, LDA IF( REWI ) $ REWIND NTRA CALL DSYR2( UPLO, N, ALPHA, XX, INCX, YY, INCY, $ AA, LDA ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, $ ALPHA, INCX, INCY IF( REWI ) $ REWIND NTRA CALL DSPR2( UPLO, N, ALPHA, XX, INCX, YY, INCY, $ AA ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 160 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LDE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX ISAME( 6 ) = LDE( YS, YY, LY ) ISAME( 7 ) = INCYS.EQ.INCY IF( NULL )THEN ISAME( 8 ) = LDE( AS, AA, LAA ) ELSE ISAME( 8 ) = LDERES( SNAME( 2: 3 ), UPLO, N, N, $ AS, AA, LDA ) END IF IF( .NOT.PACKED )THEN ISAME( 9 ) = LDAS.EQ.LDA END IF * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 160 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( INCX.GT.0 )THEN DO 50 I = 1, N Z( I, 1 ) = X( I ) 50 CONTINUE ELSE DO 60 I = 1, N Z( I, 1 ) = X( N - I + 1 ) 60 CONTINUE END IF IF( INCY.GT.0 )THEN DO 70 I = 1, N Z( I, 2 ) = Y( I ) 70 CONTINUE ELSE DO 80 I = 1, N Z( I, 2 ) = Y( N - I + 1 ) 80 CONTINUE END IF JA = 1 DO 90 J = 1, N W( 1 ) = Z( J, 2 ) W( 2 ) = Z( J, 1 ) IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF CALL DMVCH( 'N', LJ, 2, ALPHA, Z( JJ, 1 ), $ NMAX, W, 1, ONE, A( JJ, J ), 1, $ YT, G, AA( JA ), EPS, ERR, FATAL, $ NOUT, .TRUE. ) IF( FULL )THEN IF( UPPER )THEN JA = JA + LDA ELSE JA = JA + LDA + 1 END IF ELSE JA = JA + LJ END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 150 90 CONTINUE ELSE * Avoid repeating tests with N.le.0. IF( N.LE.0 ) $ GO TO 140 END IF * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * 140 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 170 * 150 CONTINUE WRITE( NOUT, FMT = 9995 )J * 160 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX, $ INCY, LDA ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX, INCY END IF * 170 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', $ I2, ', Y,', I2, ', AP) .' ) 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', $ I2, ', Y,', I2, ', A,', I3, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK6. * END SUBROUTINE DCHKE( ISNUM, SRNAMT, NOUT ) * * Tests the error exits from the Level 2 Blas. * Requires a special version of the error-handling routine XERBLA. * ALPHA, BETA, A, X and Y should not need to be defined. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Local Scalars .. DOUBLE PRECISION ALPHA, BETA * .. Local Arrays .. DOUBLE PRECISION A( 1, 1 ), X( 1 ), Y( 1 ) * .. External Subroutines .. EXTERNAL CHKXER, DGBMV, DGEMV, DGER, DSBMV, DSPMV, DSPR, $ DSPR2, DSYMV, DSYR, DSYR2, DTBMV, DTBSV, DTPMV, $ DTPSV, DTRMV, DTRSV * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Executable Statements .. * OK is set to .FALSE. by the special version of XERBLA or by CHKXER * if anything is wrong. OK = .TRUE. * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. 
GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, $ 90, 100, 110, 120, 130, 140, 150, $ 160 )ISNUM 10 INFOT = 1 CALL DGEMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DGEMV( 'N', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DGEMV( 'N', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL DGEMV( 'N', 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL DGEMV( 'N', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL DGEMV( 'N', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 20 INFOT = 1 CALL DGBMV( '/', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DGBMV( 'N', -1, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DGBMV( 'N', 0, -1, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DGBMV( 'N', 0, 0, -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DGBMV( 'N', 2, 0, 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL DGBMV( 'N', 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL DGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL DGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 30 INFOT = 1 CALL DSYMV( '/', 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DSYMV( 'U', -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DSYMV( 'U', 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL DSYMV( 'U', 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL DSYMV( 'U', 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 40 INFOT = 1 CALL DSBMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DSBMV( 'U', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DSBMV( 'U', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL DSBMV( 'U', 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL DSBMV( 'U', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL DSBMV( 'U', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 50 INFOT = 1 CALL DSPMV( '/', 0, ALPHA, A, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DSPMV( 'U', -1, ALPHA, A, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL DSPMV( 'U', 0, ALPHA, A, X, 0, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DSPMV( 'U', 0, ALPHA, A, X, 1, BETA, Y, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 60 INFOT = 1 CALL DTRMV( '/', 'N', 'N', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DTRMV( 'U', '/', 'N', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, 
NOUT, LERR, OK ) INFOT = 3 CALL DTRMV( 'U', 'N', '/', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DTRMV( 'U', 'N', 'N', -1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL DTRMV( 'U', 'N', 'N', 2, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL DTRMV( 'U', 'N', 'N', 0, A, 1, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 70 INFOT = 1 CALL DTBMV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DTBMV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DTBMV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DTBMV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DTBMV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL DTBMV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DTBMV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 80 INFOT = 1 CALL DTPMV( '/', 'N', 'N', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DTPMV( 'U', '/', 'N', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DTPMV( 'U', 'N', '/', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DTPMV( 'U', 'N', 'N', -1, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL DTPMV( 'U', 'N', 'N', 0, A, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 90 INFOT = 1 CALL DTRSV( '/', 'N', 'N', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DTRSV( 'U', '/', 'N', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DTRSV( 'U', 'N', '/', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DTRSV( 'U', 'N', 'N', -1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL DTRSV( 'U', 'N', 'N', 2, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL DTRSV( 'U', 'N', 'N', 0, A, 1, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 100 INFOT = 1 CALL DTBSV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DTBSV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DTBSV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DTBSV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DTBSV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL DTBSV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DTBSV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 110 INFOT = 1 CALL DTPSV( '/', 'N', 'N', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DTPSV( 'U', '/', 'N', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DTPSV( 'U', 'N', '/', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DTPSV( 'U', 'N', 'N', -1, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL DTPSV( 'U', 'N', 'N', 0, A, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 120 INFOT = 1 CALL DGER( -1, 0, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT 
= 2 CALL DGER( 0, -1, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DGER( 0, 0, ALPHA, X, 0, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL DGER( 0, 0, ALPHA, X, 1, Y, 0, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DGER( 2, 0, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 130 INFOT = 1 CALL DSYR( '/', 0, ALPHA, X, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DSYR( 'U', -1, ALPHA, X, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DSYR( 'U', 0, ALPHA, X, 0, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL DSYR( 'U', 2, ALPHA, X, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 140 INFOT = 1 CALL DSPR( '/', 0, ALPHA, X, 1, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DSPR( 'U', -1, ALPHA, X, 1, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DSPR( 'U', 0, ALPHA, X, 0, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 150 INFOT = 1 CALL DSYR2( '/', 0, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DSYR2( 'U', -1, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DSYR2( 'U', 0, ALPHA, X, 0, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL DSYR2( 'U', 0, ALPHA, X, 1, Y, 0, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DSYR2( 'U', 2, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 160 INFOT = 1 CALL DSPR2( '/', 0, ALPHA, X, 1, Y, 1, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DSPR2( 'U', -1, ALPHA, X, 1, Y, 1, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DSPR2( 'U', 0, ALPHA, X, 0, Y, 1, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL DSPR2( 'U', 0, ALPHA, X, 1, Y, 0, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) * 170 IF( OK )THEN WRITE( NOUT, FMT = 9999 )SRNAMT ELSE WRITE( NOUT, FMT = 9998 )SRNAMT END IF RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', $ '**' ) * * End of DCHKE. * END SUBROUTINE DMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, $ KU, RESET, TRANSL ) * * Generates values for an M by N matrix A within the bandwidth * defined by KL and KU. * Stores the values in the array AA in the data structure required * by the routine, with unwanted elements set to rogue value. * * TYPE is 'GE', 'GB', 'SY', 'SB', 'SP', 'TR', 'TB' OR 'TP'. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) DOUBLE PRECISION ROGUE PARAMETER ( ROGUE = -1.0D10 ) * .. Scalar Arguments .. DOUBLE PRECISION TRANSL INTEGER KL, KU, LDA, M, N, NMAX LOGICAL RESET CHARACTER*1 DIAG, UPLO CHARACTER*2 TYPE * .. Array Arguments .. DOUBLE PRECISION A( NMAX, * ), AA( * ) * .. Local Scalars .. INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, KK LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER * .. External Functions .. DOUBLE PRECISION DBEG EXTERNAL DBEG * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. Executable Statements .. 
GEN = TYPE( 1: 1 ).EQ.'G' SYM = TYPE( 1: 1 ).EQ.'S' TRI = TYPE( 1: 1 ).EQ.'T' UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' UNIT = TRI.AND.DIAG.EQ.'U' * * Generate data in array A. * DO 20 J = 1, N DO 10 I = 1, M IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) $ THEN IF( ( I.LE.J.AND.J - I.LE.KU ).OR. $ ( I.GE.J.AND.I - J.LE.KL ) )THEN A( I, J ) = DBEG( RESET ) + TRANSL ELSE A( I, J ) = ZERO END IF IF( I.NE.J )THEN IF( SYM )THEN A( J, I ) = A( I, J ) ELSE IF( TRI )THEN A( J, I ) = ZERO END IF END IF END IF 10 CONTINUE IF( TRI ) $ A( J, J ) = A( J, J ) + ONE IF( UNIT ) $ A( J, J ) = ONE 20 CONTINUE * * Store elements in array AS in data structure required by routine. * IF( TYPE.EQ.'GE' )THEN DO 50 J = 1, N DO 30 I = 1, M AA( I + ( J - 1 )*LDA ) = A( I, J ) 30 CONTINUE DO 40 I = M + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 40 CONTINUE 50 CONTINUE ELSE IF( TYPE.EQ.'GB' )THEN DO 90 J = 1, N DO 60 I1 = 1, KU + 1 - J AA( I1 + ( J - 1 )*LDA ) = ROGUE 60 CONTINUE DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) 70 CONTINUE DO 80 I3 = I2, LDA AA( I3 + ( J - 1 )*LDA ) = ROGUE 80 CONTINUE 90 CONTINUE ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN DO 130 J = 1, N IF( UPPER )THEN IBEG = 1 IF( UNIT )THEN IEND = J - 1 ELSE IEND = J END IF ELSE IF( UNIT )THEN IBEG = J + 1 ELSE IBEG = J END IF IEND = N END IF DO 100 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 100 CONTINUE DO 110 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I, J ) 110 CONTINUE DO 120 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 120 CONTINUE 130 CONTINUE ELSE IF( TYPE.EQ.'SB'.OR.TYPE.EQ.'TB' )THEN DO 170 J = 1, N IF( UPPER )THEN KK = KL + 1 IBEG = MAX( 1, KL + 2 - J ) IF( UNIT )THEN IEND = KL ELSE IEND = KL + 1 END IF ELSE KK = 1 IF( UNIT )THEN IBEG = 2 ELSE IBEG = 1 END IF IEND = MIN( KL + 1, 1 + M - J ) END IF DO 140 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 140 CONTINUE DO 150 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) 150 CONTINUE DO 160 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 160 CONTINUE 170 CONTINUE ELSE IF( TYPE.EQ.'SP'.OR.TYPE.EQ.'TP' )THEN IOFF = 0 DO 190 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 180 I = IBEG, IEND IOFF = IOFF + 1 AA( IOFF ) = A( I, J ) IF( I.EQ.J )THEN IF( UNIT ) $ AA( IOFF ) = ROGUE END IF 180 CONTINUE 190 CONTINUE END IF RETURN * * End of DMAKE. * END SUBROUTINE DMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) * * Checks the results of the computational tests. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION ALPHA, BETA, EPS, ERR INTEGER INCX, INCY, M, N, NMAX, NOUT LOGICAL FATAL, MV CHARACTER*1 TRANS * .. Array Arguments .. DOUBLE PRECISION A( NMAX, * ), G( * ), X( * ), Y( * ), YT( * ), $ YY( * ) * .. Local Scalars .. DOUBLE PRECISION ERRI INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL LOGICAL TRAN * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT * .. Executable Statements .. 
TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' IF( TRAN )THEN ML = N NL = M ELSE ML = M NL = N END IF IF( INCX.LT.0 )THEN KX = NL INCXL = -1 ELSE KX = 1 INCXL = 1 END IF IF( INCY.LT.0 )THEN KY = ML INCYL = -1 ELSE KY = 1 INCYL = 1 END IF * * Compute expected result in YT using data in A, X and Y. * Compute gauges in G. * IY = KY DO 30 I = 1, ML YT( IY ) = ZERO G( IY ) = ZERO JX = KX IF( TRAN )THEN DO 10 J = 1, NL YT( IY ) = YT( IY ) + A( J, I )*X( JX ) G( IY ) = G( IY ) + ABS( A( J, I )*X( JX ) ) JX = JX + INCXL 10 CONTINUE ELSE DO 20 J = 1, NL YT( IY ) = YT( IY ) + A( I, J )*X( JX ) G( IY ) = G( IY ) + ABS( A( I, J )*X( JX ) ) JX = JX + INCXL 20 CONTINUE END IF YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) G( IY ) = ABS( ALPHA )*G( IY ) + ABS( BETA*Y( IY ) ) IY = IY + INCYL 30 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO DO 40 I = 1, ML ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS IF( G( I ).NE.ZERO ) $ ERRI = ERRI/G( I ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ GO TO 50 40 CONTINUE * If the loop completes, all results are at least half accurate. GO TO 70 * * Report fatal error. * 50 FATAL = .TRUE. WRITE( NOUT, FMT = 9999 ) DO 60 I = 1, ML IF( MV )THEN WRITE( NOUT, FMT = 9998 )I, YT( I ), $ YY( 1 + ( I - 1 )*ABS( INCY ) ) ELSE WRITE( NOUT, FMT = 9998 )I, $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT( I ) END IF 60 CONTINUE * 70 CONTINUE RETURN * 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', $ 'TED RESULT' ) 9998 FORMAT( 1X, I7, 2G18.6 ) * * End of DMVCH. * END LOGICAL FUNCTION LDE( RI, RJ, LR ) * * Tests if two arrays are identical. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER LR * .. Array Arguments .. DOUBLE PRECISION RI( * ), RJ( * ) * .. Local Scalars .. INTEGER I * .. Executable Statements .. DO 10 I = 1, LR IF( RI( I ).NE.RJ( I ) ) $ GO TO 20 10 CONTINUE LDE = .TRUE. GO TO 30 20 CONTINUE LDE = .FALSE. 30 RETURN * * End of LDE. * END LOGICAL FUNCTION LDERES( TYPE, UPLO, M, N, AA, AS, LDA ) * * Tests if selected elements in two arrays are equal. * * TYPE is 'GE', 'SY' or 'SP'. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER LDA, M, N CHARACTER*1 UPLO CHARACTER*2 TYPE * .. Array Arguments .. DOUBLE PRECISION AA( LDA, * ), AS( LDA, * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL UPPER * .. Executable Statements .. UPPER = UPLO.EQ.'U' IF( TYPE.EQ.'GE' )THEN DO 20 J = 1, N DO 10 I = M + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 10 CONTINUE 20 CONTINUE ELSE IF( TYPE.EQ.'SY' )THEN DO 50 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 30 I = 1, IBEG - 1 IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 30 CONTINUE DO 40 I = IEND + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 40 CONTINUE 50 CONTINUE END IF * 60 CONTINUE LDERES = .TRUE. GO TO 80 70 CONTINUE LDERES = .FALSE. 80 RETURN * * End of LDERES. * END DOUBLE PRECISION FUNCTION DBEG( RESET ) * * Generates random numbers uniformly distributed between -0.5 and 0.5. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. LOGICAL RESET * .. 
Local Scalars .. INTEGER I, IC, MI * .. Save statement .. SAVE I, IC, MI * .. Intrinsic Functions .. INTRINSIC DBLE * .. Executable Statements .. IF( RESET )THEN * Initialize local variables. MI = 891 I = 7 IC = 0 RESET = .FALSE. END IF * * The sequence of values of I is bounded between 1 and 999. * If initial I = 1,2,3,6,7 or 9, the period will be 50. * If initial I = 4 or 8, the period will be 25. * If initial I = 5, the period will be 10. * IC is used to break up the period by skipping 1 value of I in 6. * IC = IC + 1 10 I = I*MI I = I - 1000*( I/1000 ) IF( IC.GE.5 )THEN IC = 0 GO TO 10 END IF DBEG = DBLE( I - 500 )/1001.0D0 RETURN * * End of DBEG. * END DOUBLE PRECISION FUNCTION DDIFF( X, Y ) * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * * .. Scalar Arguments .. DOUBLE PRECISION X, Y * .. Executable Statements .. DDIFF = X - Y RETURN * * End of DDIFF. * END SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) * * Tests whether XERBLA has detected an error when it should. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER INFOT, NOUT LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Executable Statements .. IF( .NOT.LERR )THEN WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT OK = .FALSE. END IF LERR = .FALSE. RETURN * 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', $ 'ETECTED BY ', A6, ' *****' ) * * End of CHKXER. * END SUBROUTINE XERBLA( SRNAME, INFO ) * * This is a special version of XERBLA to be used only as part of * the test program for testing error exits from the Level 2 BLAS * routines. * * XERBLA is an error handler for the Level 2 BLAS routines. * * It is called by the Level 2 BLAS routines if an input parameter is * invalid. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER INFO CHARACTER*6 SRNAME * .. Scalars in Common .. INTEGER INFOT, NOUT LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUT, OK, LERR COMMON /SRNAMC/SRNAMT * .. Executable Statements .. LERR = .TRUE. IF( INFO.NE.INFOT )THEN IF( INFOT.NE.0 )THEN WRITE( NOUT, FMT = 9999 )INFO, INFOT ELSE WRITE( NOUT, FMT = 9997 )INFO END IF OK = .FALSE. END IF IF( SRNAME.NE.SRNAMT )THEN WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT OK = .FALSE. END IF RETURN * 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', $ ' OF ', I2, ' *******' ) 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', $ 'AD OF ', A6, ' *******' ) 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, $ ' *******' ) * * End of XERBLA * END OpenBLAS-0.2.20/test/dblat3.dat000066400000000000000000000015621313527062700157720ustar00rootroot00000000000000'DBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE 6 UNIT NUMBER OF SUMMARY FILE 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. F LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 16.0 THRESHOLD VALUE OF TEST RATIO 6 NUMBER OF VALUES OF N 0 1 2 3 7 31 63 VALUES OF N 3 NUMBER OF VALUES OF ALPHA 0.0 1.0 0.7 VALUES OF ALPHA 3 NUMBER OF VALUES OF BETA 0.0 1.0 1.3 VALUES OF BETA DGEMM T PUT F FOR NO TEST. SAME COLUMNS. 
DSYMM T PUT F FOR NO TEST. SAME COLUMNS. DTRMM T PUT F FOR NO TEST. SAME COLUMNS. DTRSM T PUT F FOR NO TEST. SAME COLUMNS. DSYRK T PUT F FOR NO TEST. SAME COLUMNS. DSYR2K T PUT F FOR NO TEST. SAME COLUMNS. OpenBLAS-0.2.20/test/dblat3.f000066400000000000000000003111651313527062700154520ustar00rootroot00000000000000 PROGRAM DBLAT3 * * Test program for the DOUBLE PRECISION Level 3 Blas. * * The program must be driven by a short data file. The first 14 records * of the file are read using list-directed input, the last 6 records * are read using the format ( A6, L2 ). An annotated example of a data * file can be obtained by deleting the first 3 characters from the * following 20 lines: * 'DBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE * 6 UNIT NUMBER OF SUMMARY FILE * 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 16.0 THRESHOLD VALUE OF TEST RATIO * 6 NUMBER OF VALUES OF N * 0 1 2 3 5 9 VALUES OF N * 3 NUMBER OF VALUES OF ALPHA * 0.0 1.0 0.7 VALUES OF ALPHA * 3 NUMBER OF VALUES OF BETA * 0.0 1.0 1.3 VALUES OF BETA * DGEMM T PUT F FOR NO TEST. SAME COLUMNS. * DSYMM T PUT F FOR NO TEST. SAME COLUMNS. * DTRMM T PUT F FOR NO TEST. SAME COLUMNS. * DTRSM T PUT F FOR NO TEST. SAME COLUMNS. * DSYRK T PUT F FOR NO TEST. SAME COLUMNS. * DSYR2K T PUT F FOR NO TEST. SAME COLUMNS. * * See: * * Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. * A Set of Level 3 Basic Linear Algebra Subprograms. * * Technical Memorandum No.88 (Revision 1), Mathematics and * Computer Science Division, Argonne National Laboratory, 9700 * South Cass Avenue, Argonne, Illinois 60439, US. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 6 ) DOUBLE PRECISION ZERO, HALF, ONE PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. DOUBLE PRECISION EPS, ERR, THRESH INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, $ TSTERR CHARACTER*1 TRANSA, TRANSB CHARACTER*6 SNAMET CHARACTER*32 SNAPS, SUMMRY * .. Local Arrays .. DOUBLE PRECISION AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), $ ALF( NALMAX ), AS( NMAX*NMAX ), $ BB( NMAX*NMAX ), BET( NBEMAX ), $ BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ G( NMAX ), W( 2*NMAX ) INTEGER IDIM( NIDMAX ) LOGICAL LTEST( NSUBS ) CHARACTER*6 SNAMES( NSUBS ) * .. External Functions .. DOUBLE PRECISION DDIFF LOGICAL LDE EXTERNAL DDIFF, LDE * .. External Subroutines .. EXTERNAL DCHK1, DCHK2, DCHK3, DCHK4, DCHK5, DCHKE, DMMCH * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR COMMON /SRNAMC/SRNAMT * .. Data statements .. DATA SNAMES/'DGEMM ', 'DSYMM ', 'DTRMM ', 'DTRSM ', $ 'DSYRK ', 'DSYR2K'/ * .. Executable Statements .. * * Read name and unit number for summary output file and open file. 
* READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. * READ( NIN, FMT = * )SNAPS READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI REWI = REWI.AND.TRACE * Read the flag that directs stopping on any failure. READ( NIN, FMT = * )SFATAL * Read the flag that indicates whether error exits are to be tested. READ( NIN, FMT = * )TSTERR * Read the threshold value of the test ratio READ( NIN, FMT = * )THRESH * * Read and check the parameter values for the tests. * * Values of N READ( NIN, FMT = * )NIDIM IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN WRITE( NOUT, FMT = 9997 )'N', NIDMAX GO TO 220 END IF READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) DO 10 I = 1, NIDIM IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN WRITE( NOUT, FMT = 9996 )NMAX GO TO 220 END IF 10 CONTINUE * Values of ALPHA READ( NIN, FMT = * )NALF IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX GO TO 220 END IF READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) * Values of BETA READ( NIN, FMT = * )NBET IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX GO TO 220 END IF READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) * * Report values of parameters. * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) IF( .NOT.TSTERR )THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9984 ) END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )THRESH WRITE( NOUT, FMT = * ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 20 I = 1, NSUBS LTEST( I ) = .FALSE. 20 CONTINUE 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT DO 40 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET STOP 50 LTEST( I ) = LTESTT GO TO 30 * 60 CONTINUE CLOSE ( NIN ) * * Compute EPS (the machine precision). * EPS = ONE 70 CONTINUE IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) $ GO TO 80 EPS = HALF*EPS GO TO 70 80 CONTINUE EPS = EPS + EPS WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of DMMCH using exact data. * N = MIN( 32, NMAX ) DO 100 J = 1, N DO 90 I = 1, N AB( I, J ) = MAX( I - J + 1, 0 ) 90 CONTINUE AB( J, NMAX + 1 ) = J AB( 1, NMAX + J ) = J C( J, 1 ) = ZERO 100 CONTINUE DO 110 J = 1, N CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 110 CONTINUE * CC holds the exact result. On exit from DMMCH CT holds * the result computed by DMMCH. TRANSA = 'N' TRANSB = 'N' CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'T' CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. 
) SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 AB( 1, NMAX + J ) = N - J + 1 120 CONTINUE DO 130 J = 1, N CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - $ ( ( J + 1 )*J*( J - 1 ) )/3 130 CONTINUE TRANSA = 'T' TRANSB = 'N' CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'T' CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF * * Test each subroutine in turn. * DO 200 ISNUM = 1, NSUBS WRITE( NOUT, FMT = * ) IF( .NOT.LTEST( ISNUM ) )THEN * Subprogram is not to be tested. WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) ELSE SRNAMT = SNAMES( ISNUM ) * Test error exits. IF( TSTERR )THEN CALL DCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) WRITE( NOUT, FMT = * ) END IF * Test computations. INFOT = 0 OK = .TRUE. FATAL = .FALSE. GO TO ( 140, 150, 160, 160, 170, 180 )ISNUM * Test DGEMM, 01. 140 CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G ) GO TO 190 * Test DSYMM, 02. 150 CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G ) GO TO 190 * Test DTRMM, 03, DTRSM, 04. 160 CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C ) GO TO 190 * Test DSYRK, 05. 170 CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G ) GO TO 190 * Test DSYR2K, 06. 
180 CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) GO TO 190 * 190 IF( FATAL.AND.SFATAL ) $ GO TO 210 END IF 200 CONTINUE WRITE( NOUT, FMT = 9986 ) GO TO 230 * 210 CONTINUE WRITE( NOUT, FMT = 9985 ) GO TO 230 * 220 CONTINUE WRITE( NOUT, FMT = 9991 ) * 230 CONTINUE IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) STOP * 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', $ 'S THAN', F8.2 ) 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 ) 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', $ 'THAN ', I2 ) 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) 9995 FORMAT( ' TESTS OF THE DOUBLE PRECISION LEVEL 3 BLAS', //' THE F', $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) 9994 FORMAT( ' FOR N ', 9I6 ) 9993 FORMAT( ' FOR ALPHA ', 7F6.1 ) 9992 FORMAT( ' FOR BETA ', 7F6.1 ) 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', $ /' ******* TESTS ABANDONED *******' ) 9990 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', $ 'ESTS ABANDONED *******' ) 9989 FORMAT( ' ERROR IN DMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', $ 'ATED WRONGLY.', /' DMMCH WAS CALLED WITH TRANSA = ', A1, $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', $ '*******' ) 9988 FORMAT( A6, L2 ) 9987 FORMAT( 1X, A6, ' WAS NOT TESTED' ) 9986 FORMAT( /' END OF TESTS' ) 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) * * End of DBLAT3. * END SUBROUTINE DCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) * * Tests DGEMM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, $ MA, MB, MS, N, NA, NARGS, NB, NC, NS LOGICAL NULL, RESET, SAME, TRANA, TRANB CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB CHARACTER*3 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL DGEMM, DMAKE, DMMCH * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'NTC'/ * .. Executable Statements .. * NARGS = 13 NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 110 IM = 1, NIDIM M = IDIM( IM ) * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. 
LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICA = 1, 3 TRANSA = ICH( ICA: ICA ) TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' * IF( TRANA )THEN MA = K NA = M ELSE MA = M NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. * CALL DMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICB = 1, 3 TRANSB = ICH( ICB: ICB ) TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' * IF( TRANB )THEN MB = N NB = K ELSE MB = K NB = N END IF * Set LDB to 1 more than minimum value if room. LDB = MB IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 70 LBB = LDB*NB * * Generate the matrix B. * CALL DMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, $ LDB, RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL DMAKE( 'GE', ' ', ' ', M, N, C, NMAX, $ CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * TRANAS = TRANSA TRANBS = TRANSB MS = M NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB, $ BETA, LDC IF( REWI ) $ REWIND NTRA CALL DGEMM( TRANSA, TRANSB, M, N, K, ALPHA, $ AA, LDA, BB, LDB, BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = TRANSA.EQ.TRANAS ISAME( 2 ) = TRANSB.EQ.TRANBS ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = KS.EQ.K ISAME( 6 ) = ALS.EQ.ALPHA ISAME( 7 ) = LDE( AS, AA, LAA ) ISAME( 8 ) = LDAS.EQ.LDA ISAME( 9 ) = LDE( BS, BB, LBB ) ISAME( 10 ) = LDBS.EQ.LDB ISAME( 11 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 12 ) = LDE( CS, CC, LCC ) ELSE ISAME( 12 ) = LDERES( 'GE', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 13 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report * and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL DMMCH( TRANSA, TRANSB, M, N, K, $ ALPHA, A, NMAX, B, NMAX, BETA, $ C, NMAX, CT, G, CC, LDC, EPS, $ ERR, FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 120 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. 
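*     ERRMAX is the largest test ratio returned by DMMCH over all of
*     the DGEMM calls made above.  The ratio reported by DMMCH is
*     ABS( CT( I ) - CC( I, J ) )/( EPS*G( I ) ) (or the difference
*     over EPS alone when the gauge G( I ) is zero), so the tests pass
*     when every element of the computed result agrees with the
*     in-line product to within roughly THRESH units of rounding
*     error.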
* IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K, $ ALPHA, LDA, LDB, BETA, LDC * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',''', A1, ''',', $ 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', ', $ 'C,', I3, ').' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK1. * END SUBROUTINE DCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) * * Tests DSYMM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, $ NARGS, NC, NS LOGICAL LEFT, NULL, RESET, SAME CHARACTER*1 SIDE, SIDES, UPLO, UPLOS CHARACTER*2 ICHS, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL DMAKE, DMMCH, DSYMM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHS/'LR'/, ICHU/'UL'/ * .. Executable Statements .. * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 100 IM = 1, NIDIM M = IDIM( IM ) * DO 90 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 90 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 90 LBB = LDB*N * * Generate the matrix B. * CALL DMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, $ ZERO ) * DO 80 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' * IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * * Generate the symmetric matrix A. 
* CALL DMAKE( 'SY', UPLO, ' ', NA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL DMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, $ LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO MS = M NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE, $ UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC IF( REWI ) $ REWIND NTRA CALL DSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, $ BB, LDB, BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 110 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LDE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LDE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB ISAME( 10 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 11 ) = LDE( CS, CC, LCC ) ELSE ISAME( 11 ) = LDERES( 'GE', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 110 END IF * IF( .NOT.NULL )THEN * * Check the result. * IF( LEFT )THEN CALL DMMCH( 'N', 'N', M, N, M, ALPHA, A, $ NMAX, B, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL DMMCH( 'N', 'N', M, N, N, ALPHA, B, $ NMAX, A, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 120 * 110 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA, $ LDB, BETA, LDC * 120 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', $ ' .' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK2. * END SUBROUTINE DCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, $ B, BB, BS, CT, G, C ) * * Tests DTRMM and DTRSM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. 
DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CT( NMAX ), G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, $ NS LOGICAL LEFT, NULL, RESET, SAME CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, $ UPLOS CHARACTER*2 ICHD, ICHS, ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL DMAKE, DMMCH, DTRMM, DTRSM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ * .. Executable Statements .. * NARGS = 11 NC = 0 RESET = .TRUE. ERRMAX = ZERO * Set up zero matrix for DMMCH. DO 20 J = 1, NMAX DO 10 I = 1, NMAX C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE * DO 140 IM = 1, NIDIM M = IDIM( IM ) * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 130 LBB = LDB*N NULL = M.LE.0.OR.N.LE.0 * DO 120 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 130 LAA = LDA*NA * DO 110 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * DO 100 ICT = 1, 3 TRANSA = ICHT( ICT: ICT ) * DO 90 ICD = 1, 2 DIAG = ICHD( ICD: ICD ) * DO 80 IA = 1, NALF ALPHA = ALF( IA ) * * Generate the matrix A. * CALL DMAKE( 'TR', UPLO, DIAG, NA, NA, A, $ NMAX, AA, LDA, RESET, ZERO ) * * Generate the matrix B. * CALL DMAKE( 'GE', ' ', ' ', M, N, B, NMAX, $ BB, LDB, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO TRANAS = TRANSA DIAGS = DIAG MS = M NS = N ALS = ALPHA DO 30 I = 1, LAA AS( I ) = AA( I ) 30 CONTINUE LDAS = LDA DO 40 I = 1, LBB BS( I ) = BB( I ) 40 CONTINUE LDBS = LDB * * Call the subroutine. * IF( SNAME( 4: 5 ).EQ.'MM' )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB IF( REWI ) $ REWIND NTRA CALL DTRMM( SIDE, UPLO, TRANSA, DIAG, M, $ N, ALPHA, AA, LDA, BB, LDB ) ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB IF( REWI ) $ REWIND NTRA CALL DTRSM( SIDE, UPLO, TRANSA, DIAG, M, $ N, ALPHA, AA, LDA, BB, LDB ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. 
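*     ISAME( I ) records whether the I-th argument of the call is
*     unchanged on return.  Input scalars and the matrix A must be
*     identical to the saved copies (checked with LDE); for the
*     updated matrix B only the elements the routine is not permitted
*     to touch are compared, which is what LDERES checks when NULL is
*     .FALSE..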
* ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = TRANAS.EQ.TRANSA ISAME( 4 ) = DIAGS.EQ.DIAG ISAME( 5 ) = MS.EQ.M ISAME( 6 ) = NS.EQ.N ISAME( 7 ) = ALS.EQ.ALPHA ISAME( 8 ) = LDE( AS, AA, LAA ) ISAME( 9 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 10 ) = LDE( BS, BB, LBB ) ELSE ISAME( 10 ) = LDERES( 'GE', ' ', M, N, BS, $ BB, LDB ) END IF ISAME( 11 ) = LDBS.EQ.LDB * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 50 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 50 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN IF( SNAME( 4: 5 ).EQ.'MM' )THEN * * Check the result. * IF( LEFT )THEN CALL DMMCH( TRANSA, 'N', M, N, M, $ ALPHA, A, NMAX, B, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL DMMCH( 'N', TRANSA, M, N, N, $ ALPHA, B, NMAX, A, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN * * Compute approximation to original * matrix. * DO 70 J = 1, N DO 60 I = 1, M C( I, J ) = BB( I + ( J - 1 )* $ LDB ) BB( I + ( J - 1 )*LDB ) = ALPHA* $ B( I, J ) 60 CONTINUE 70 CONTINUE * IF( LEFT )THEN CALL DMMCH( TRANSA, 'N', M, N, M, $ ONE, A, NMAX, C, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) ELSE CALL DMMCH( 'N', TRANSA, M, N, N, $ ONE, C, NMAX, A, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) END IF END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 150 END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * 140 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 160 * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M, $ N, ALPHA, LDA, LDB * 160 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(', 4( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ', B,', I3, ') .' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK3. * END SUBROUTINE DCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) * * Tests DSYRK. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. 
DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, ALS, BETA, BETS, ERR, ERRMAX INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, $ NARGS, NC, NS LOGICAL NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS CHARACTER*2 ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL DMAKE, DMMCH, DSYRK * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHT/'NTC'/, ICHU/'UL'/ * .. Executable Statements .. * NARGS = 10 NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N NULL = N.LE.0 * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICT = 1, 3 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. * CALL DMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL DMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, $ LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA BETS = BETA DO 20 I = 1, LCC CS( I ) = CC( I ) 20 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, $ TRANS, N, K, ALPHA, LDA, BETA, LDC IF( REWI ) $ REWIND NTRA CALL DSYRK( UPLO, TRANS, N, K, ALPHA, AA, LDA, $ BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LDE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = BETS.EQ.BETA IF( NULL )THEN ISAME( 9 ) = LDE( CS, CC, LCC ) ELSE ISAME( 9 ) = LDERES( 'SY', UPLO, N, N, CS, $ CC, LDC ) END IF ISAME( 10 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 30 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 30 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * JC = 1 DO 40 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN CALL DMMCH( 'T', 'N', LJ, 1, K, ALPHA, $ A( 1, JJ ), NMAX, $ A( 1, J ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. 
) ELSE CALL DMMCH( 'N', 'T', LJ, 1, K, ALPHA, $ A( JJ, 1 ), NMAX, $ A( J, 1 ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 40 CONTINUE END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 110 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, $ LDA, BETA, LDC * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') .' ) 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK4. * END SUBROUTINE DCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) * * Tests DSYR2K. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. DOUBLE PRECISION AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ G( NMAX ), W( 2*NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, ALS, BETA, BETS, ERR, ERRMAX INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS LOGICAL NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS CHARACTER*2 ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LDE, LDERES EXTERNAL LDE, LDERES * .. External Subroutines .. EXTERNAL DMAKE, DMMCH, DSYR2K * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHT/'NTC'/, ICHU/'UL'/ * .. Executable Statements .. * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. 
IF( LDC.GT.NMAX ) $ GO TO 130 LCC = LDC*N NULL = N.LE.0 * DO 120 IK = 1, NIDIM K = IDIM( IK ) * DO 110 ICT = 1, 3 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 110 LAA = LDA*NA * * Generate the matrix A. * IF( TRAN )THEN CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, $ LDA, RESET, ZERO ) ELSE CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, $ RESET, ZERO ) END IF * * Generate the matrix B. * LDB = LDA LBB = LAA IF( TRAN )THEN CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), $ 2*NMAX, BB, LDB, RESET, ZERO ) ELSE CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), $ NMAX, BB, LDB, RESET, ZERO ) END IF * DO 100 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 90 IA = 1, NALF ALPHA = ALF( IA ) * DO 80 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL DMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, $ LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BETS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC IF( REWI ) $ REWIND NTRA CALL DSYR2K( UPLO, TRANS, N, K, ALPHA, AA, LDA, $ BB, LDB, BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LDE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LDE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB ISAME( 10 ) = BETS.EQ.BETA IF( NULL )THEN ISAME( 11 ) = LDE( CS, CC, LCC ) ELSE ISAME( 11 ) = LDERES( 'SY', UPLO, N, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * JJAB = 1 JC = 1 DO 70 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN DO 50 I = 1, K W( I ) = AB( ( J - 1 )*2*NMAX + K + $ I ) W( K + I ) = AB( ( J - 1 )*2*NMAX + $ I ) 50 CONTINUE CALL DMMCH( 'T', 'N', LJ, 1, 2*K, $ ALPHA, AB( JJAB ), 2*NMAX, $ W, 2*NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE DO 60 I = 1, K W( I ) = AB( ( K + I - 1 )*NMAX + $ J ) W( K + I ) = AB( ( I - 1 )*NMAX + $ J ) 60 CONTINUE CALL DMMCH( 'N', 'N', LJ, 1, 2*K, $ ALPHA, AB( JJ ), NMAX, W, $ 2*NMAX, BETA, C( JJ, J ), $ NMAX, CT, G, CC( JC ), LDC, $ EPS, ERR, FATAL, NOUT, $ .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 IF( TRAN ) $ JJAB = JJAB + 2*NMAX END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 140 70 CONTINUE END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 160 * 140 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, $ LDA, LDB, BETA, LDC * 160 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', $ ' .' ) 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of DCHK5. * END SUBROUTINE DCHKE( ISNUM, SRNAMT, NOUT ) * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. * ALPHA, BETA, A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Local Scalars .. DOUBLE PRECISION ALPHA, BETA * .. Local Arrays .. DOUBLE PRECISION A( 2, 1 ), B( 2, 1 ), C( 2, 1 ) * .. External Subroutines .. EXTERNAL CHKXER, DGEMM, DSYMM, DSYR2K, DSYRK, DTRMM, $ DTRSM * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Executable Statements .. * OK is set to .FALSE. by the special version of XERBLA or by CHKXER * if anything is wrong. OK = .TRUE. * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. 
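*     Each block reached through the computed GO TO below makes one
*     call per argument that can be diagnosed as illegal: INFOT is set
*     to the expected argument number and the routine is called with
*     that argument invalid.  The special XERBLA records the call and
*     compares INFO and SRNAME with INFOT and SRNAMT through the
*     common blocks INFOC and SRNAMC; CHKXER then reports a failure if
*     XERBLA was not entered at all, and clears LERR for the next
*     call.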
GO TO ( 10, 20, 30, 40, 50, 60 )ISNUM 10 INFOT = 1 CALL DGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 1 CALL DGEMM( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DGEMM( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DGEMM( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DGEMM( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DGEMM( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DGEMM( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DGEMM( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DGEMM( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DGEMM( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DGEMM( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DGEMM( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DGEMM( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DGEMM( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DGEMM( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DGEMM( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL DGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL DGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL DGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL DGEMM( 'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL DGEMM( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL DGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL DGEMM( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL DGEMM( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL DGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL DGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL DGEMM( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL DGEMM( 'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 70 20 INFOT = 1 CALL DSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, 
C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL DSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL DSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL DSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL DSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 70 30 INFOT = 1 CALL DTRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DTRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DTRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DTRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DTRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DTRMM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DTRMM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DTRMM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DTRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DTRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DTRMM( 'R', 'L', 'N', 'N', 
-1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DTRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL DTRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL DTRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL DTRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL DTRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL DTRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL DTRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL DTRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL DTRMM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DTRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DTRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DTRMM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DTRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL DTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL DTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL DTRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL DTRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL DTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL DTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL DTRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL DTRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 70 40 INFOT = 1 CALL DTRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DTRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DTRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DTRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DTRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DTRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, 
B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DTRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DTRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DTRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DTRSM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DTRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DTRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL DTRSM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL DTRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL DTRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL DTRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL DTRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL DTRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL DTRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL DTRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DTRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DTRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DTRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DTRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL DTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL DTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL DTRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL DTRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL DTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL DTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL DTRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL DTRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 70 50 INFOT = 1 CALL DSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( 
SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DSYRK( 'U', '/', 0, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DSYRK( 'L', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL DSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL DSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL DSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL DSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL DSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL DSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL DSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL DSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 70 60 INFOT = 1 CALL DSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL DSYR2K( 'U', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL DSYR2K( 'L', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DSYR2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL DSYR2K( 'L', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL DSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL DSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL DSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL DSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 
CALL DSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL DSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL DSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL DSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL DSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL DSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) * 70 IF( OK )THEN WRITE( NOUT, FMT = 9999 )SRNAMT ELSE WRITE( NOUT, FMT = 9998 )SRNAMT END IF RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', $ '**' ) * * End of DCHKE. * END SUBROUTINE DMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, $ TRANSL ) * * Generates values for an M by N matrix A. * Stores the values in the array AA in the data structure required * by the routine, with unwanted elements set to rogue value. * * TYPE is 'GE', 'SY' or 'TR'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) DOUBLE PRECISION ROGUE PARAMETER ( ROGUE = -1.0D10 ) * .. Scalar Arguments .. DOUBLE PRECISION TRANSL INTEGER LDA, M, N, NMAX LOGICAL RESET CHARACTER*1 DIAG, UPLO CHARACTER*2 TYPE * .. Array Arguments .. DOUBLE PRECISION A( NMAX, * ), AA( * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER * .. External Functions .. DOUBLE PRECISION DBEG EXTERNAL DBEG * .. Executable Statements .. GEN = TYPE.EQ.'GE' SYM = TYPE.EQ.'SY' TRI = TYPE.EQ.'TR' UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' UNIT = TRI.AND.DIAG.EQ.'U' * * Generate data in array A. * DO 20 J = 1, N DO 10 I = 1, M IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) $ THEN A( I, J ) = DBEG( RESET ) + TRANSL IF( I.NE.J )THEN * Set some elements to zero IF( N.GT.3.AND.J.EQ.N/2 ) $ A( I, J ) = ZERO IF( SYM )THEN A( J, I ) = A( I, J ) ELSE IF( TRI )THEN A( J, I ) = ZERO END IF END IF END IF 10 CONTINUE IF( TRI ) $ A( J, J ) = A( J, J ) + ONE IF( UNIT ) $ A( J, J ) = ONE 20 CONTINUE * * Store elements in array AS in data structure required by routine. * IF( TYPE.EQ.'GE' )THEN DO 50 J = 1, N DO 30 I = 1, M AA( I + ( J - 1 )*LDA ) = A( I, J ) 30 CONTINUE DO 40 I = M + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 40 CONTINUE 50 CONTINUE ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN DO 90 J = 1, N IF( UPPER )THEN IBEG = 1 IF( UNIT )THEN IEND = J - 1 ELSE IEND = J END IF ELSE IF( UNIT )THEN IBEG = J + 1 ELSE IBEG = J END IF IEND = N END IF DO 60 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 60 CONTINUE DO 70 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I, J ) 70 CONTINUE DO 80 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 80 CONTINUE 90 CONTINUE END IF RETURN * * End of DMAKE. 
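*     Storage elements of AA that lie outside the part of A the tested
*     routine may reference are filled with ROGUE (-1.0D10) above.
*     Because the saved copies taken in the DCHK routines therefore
*     also hold ROGUE in those positions, LDE and LDERES will flag any
*     routine that writes outside its permitted storage area.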
* END SUBROUTINE DMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, $ NOUT, MV ) * * Checks the results of the computational tests. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION ALPHA, BETA, EPS, ERR INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT LOGICAL FATAL, MV CHARACTER*1 TRANSA, TRANSB * .. Array Arguments .. DOUBLE PRECISION A( LDA, * ), B( LDB, * ), C( LDC, * ), $ CC( LDCC, * ), CT( * ), G( * ) * .. Local Scalars .. DOUBLE PRECISION ERRI INTEGER I, J, K LOGICAL TRANA, TRANB * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT * .. Executable Statements .. TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' * * Compute expected result, one column at a time, in CT using data * in A, B and C. * Compute gauges in G. * DO 120 J = 1, N * DO 10 I = 1, M CT( I ) = ZERO G( I ) = ZERO 10 CONTINUE IF( .NOT.TRANA.AND..NOT.TRANB )THEN DO 30 K = 1, KK DO 20 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( K, J ) G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( K, J ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRANA.AND..NOT.TRANB )THEN DO 50 K = 1, KK DO 40 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( K, J ) G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( K, J ) ) 40 CONTINUE 50 CONTINUE ELSE IF( .NOT.TRANA.AND.TRANB )THEN DO 70 K = 1, KK DO 60 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( J, K ) G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( J, K ) ) 60 CONTINUE 70 CONTINUE ELSE IF( TRANA.AND.TRANB )THEN DO 90 K = 1, KK DO 80 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( J, K ) G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( J, K ) ) 80 CONTINUE 90 CONTINUE END IF DO 100 I = 1, M CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) G( I ) = ABS( ALPHA )*G( I ) + ABS( BETA )*ABS( C( I, J ) ) 100 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO DO 110 I = 1, M ERRI = ABS( CT( I ) - CC( I, J ) )/EPS IF( G( I ).NE.ZERO ) $ ERRI = ERRI/G( I ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ GO TO 130 110 CONTINUE * 120 CONTINUE * * If the loop completes, all results are at least half accurate. GO TO 150 * * Report fatal error. * 130 FATAL = .TRUE. WRITE( NOUT, FMT = 9999 ) DO 140 I = 1, M IF( MV )THEN WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) ELSE WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) END IF 140 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9997 )J * 150 CONTINUE RETURN * 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', $ 'TED RESULT' ) 9998 FORMAT( 1X, I7, 2G18.6 ) 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) * * End of DMMCH. * END LOGICAL FUNCTION LDE( RI, RJ, LR ) * * Tests if two arrays are identical. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LR * .. Array Arguments .. DOUBLE PRECISION RI( * ), RJ( * ) * .. Local Scalars .. INTEGER I * .. Executable Statements .. DO 10 I = 1, LR IF( RI( I ).NE.RJ( I ) ) $ GO TO 20 10 CONTINUE LDE = .TRUE. 
GO TO 30 20 CONTINUE LDE = .FALSE. 30 RETURN * * End of LDE. * END LOGICAL FUNCTION LDERES( TYPE, UPLO, M, N, AA, AS, LDA ) * * Tests if selected elements in two arrays are equal. * * TYPE is 'GE' or 'SY'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LDA, M, N CHARACTER*1 UPLO CHARACTER*2 TYPE * .. Array Arguments .. DOUBLE PRECISION AA( LDA, * ), AS( LDA, * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL UPPER * .. Executable Statements .. UPPER = UPLO.EQ.'U' IF( TYPE.EQ.'GE' )THEN DO 20 J = 1, N DO 10 I = M + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 10 CONTINUE 20 CONTINUE ELSE IF( TYPE.EQ.'SY' )THEN DO 50 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 30 I = 1, IBEG - 1 IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 30 CONTINUE DO 40 I = IEND + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 40 CONTINUE 50 CONTINUE END IF * 60 CONTINUE LDERES = .TRUE. GO TO 80 70 CONTINUE LDERES = .FALSE. 80 RETURN * * End of LDERES. * END DOUBLE PRECISION FUNCTION DBEG( RESET ) * * Generates random numbers uniformly distributed between -0.5 and 0.5. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. LOGICAL RESET * .. Local Scalars .. INTEGER I, IC, MI * .. Save statement .. SAVE I, IC, MI * .. Executable Statements .. IF( RESET )THEN * Initialize local variables. MI = 891 I = 7 IC = 0 RESET = .FALSE. END IF * * The sequence of values of I is bounded between 1 and 999. * If initial I = 1,2,3,6,7 or 9, the period will be 50. * If initial I = 4 or 8, the period will be 25. * If initial I = 5, the period will be 10. * IC is used to break up the period by skipping 1 value of I in 6. * IC = IC + 1 10 I = I*MI I = I - 1000*( I/1000 ) IF( IC.GE.5 )THEN IC = 0 GO TO 10 END IF DBEG = ( I - 500 )/1001.0D0 RETURN * * End of DBEG. * END DOUBLE PRECISION FUNCTION DDIFF( X, Y ) * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. DOUBLE PRECISION X, Y * .. Executable Statements .. DDIFF = X - Y RETURN * * End of DDIFF. * END SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) * * Tests whether XERBLA has detected an error when it should. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER INFOT, NOUT LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Executable Statements .. IF( .NOT.LERR )THEN WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT OK = .FALSE. END IF LERR = .FALSE. RETURN * 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', $ 'ETECTED BY ', A6, ' *****' ) * * End of CHKXER. 
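*     Note: CHKXER works together with the special version of XERBLA
*     that follows.  That XERBLA flags every error report by setting
*     LERR (and checks INFO and SRNAME against the expected values held
*     in COMMON), and CHKXER then verifies that LERR was indeed set for
*     the deliberately invalid call before clearing it again.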
* END SUBROUTINE XERBLA( SRNAME, INFO ) * * This is a special version of XERBLA to be used only as part of * the test program for testing error exits from the Level 3 BLAS * routines. * * XERBLA is an error handler for the Level 3 BLAS routines. * * It is called by the Level 3 BLAS routines if an input parameter is * invalid. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER INFO CHARACTER*6 SRNAME * .. Scalars in Common .. INTEGER INFOT, NOUT LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUT, OK, LERR COMMON /SRNAMC/SRNAMT * .. Executable Statements .. LERR = .TRUE. IF( INFO.NE.INFOT )THEN IF( INFOT.NE.0 )THEN WRITE( NOUT, FMT = 9999 )INFO, INFOT ELSE WRITE( NOUT, FMT = 9997 )INFO END IF OK = .FALSE. END IF IF( SRNAME.NE.SRNAMT )THEN WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT OK = .FALSE. END IF RETURN * 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', $ ' OF ', I2, ' *******' ) 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', $ 'AD OF ', A6, ' *******' ) 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, $ ' *******' ) * * End of XERBLA * END OpenBLAS-0.2.20/test/get_threading_model.c000066400000000000000000000006071313527062700202560ustar00rootroot00000000000000#include "../cblas.h" int main() { int th_model = openblas_get_parallel(); switch(th_model) { case OPENBLAS_SEQUENTIAL: printf("OpenBLAS is compiled sequentially.\n"); break; case OPENBLAS_THREAD: printf("OpenBLAS is compiled using the normal threading model\n"); break; case OPENBLAS_OPENMP: printf("OpenBLAS is compiled using OpenMP\n"); break; } return 0; } OpenBLAS-0.2.20/test/sblat1.f000066400000000000000000000747431313527062700154770ustar00rootroot00000000000000 PROGRAM SBLAT1 * Test program for the REAL Level 1 BLAS. * Based upon the original BLAS test routine together with: * F06EAF Example Program Text * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. REAL SFAC INTEGER IC * .. External Subroutines .. EXTERNAL CHECK0, CHECK1, CHECK2, CHECK3, HEADER * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA SFAC/9.765625E-4/ * .. Executable Statements .. WRITE (NOUT,99999) DO 20 IC = 1, 10 ICASE = IC CALL HEADER * * .. Initialize PASS, INCX, INCY, and MODE for a new case. .. * .. the value 9999 for INCX, INCY or MODE will appear in the .. * .. detailed output, if any, for cases that do not involve .. * .. these parameters .. * PASS = .TRUE. INCX = 9999 INCY = 9999 MODE = 9999 IF (ICASE.EQ.3) THEN CALL CHECK0(SFAC) ELSE IF (ICASE.EQ.7 .OR. ICASE.EQ.8 .OR. ICASE.EQ.9 .OR. + ICASE.EQ.10) THEN CALL CHECK1(SFAC) ELSE IF (ICASE.EQ.1 .OR. ICASE.EQ.2 .OR. ICASE.EQ.5 .OR. + ICASE.EQ.6) THEN CALL CHECK2(SFAC) ELSE IF (ICASE.EQ.4) THEN CALL CHECK3(SFAC) END IF * -- Print IF (PASS) WRITE (NOUT,99998) 20 CONTINUE STOP * 99999 FORMAT (' Real BLAS Test Program Results',/1X) 99998 FORMAT (' ----- PASS -----') END SUBROUTINE HEADER * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Arrays .. CHARACTER*6 L(10) * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. 
Data statements .. DATA L(1)/' SDOT '/ DATA L(2)/'SAXPY '/ DATA L(3)/'SROTG '/ DATA L(4)/' SROT '/ DATA L(5)/'SCOPY '/ DATA L(6)/'SSWAP '/ DATA L(7)/'SNRM2 '/ DATA L(8)/'SASUM '/ DATA L(9)/'SSCAL '/ DATA L(10)/'ISAMAX'/ * .. Executable Statements .. WRITE (NOUT,99999) ICASE, L(ICASE) RETURN * 99999 FORMAT (/' Test of subprogram number',I3,12X,A6) END SUBROUTINE CHECK0(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. REAL SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. REAL D12, SA, SB, SC, SS INTEGER K * .. Local Arrays .. REAL DA1(8), DATRUE(8), DB1(8), DBTRUE(8), DC1(8), + DS1(8) * .. External Subroutines .. EXTERNAL SROTG, STEST1 * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA DA1/0.3E0, 0.4E0, -0.3E0, -0.4E0, -0.3E0, 0.0E0, + 0.0E0, 1.0E0/ DATA DB1/0.4E0, 0.3E0, 0.4E0, 0.3E0, -0.4E0, 0.0E0, + 1.0E0, 0.0E0/ DATA DC1/0.6E0, 0.8E0, -0.6E0, 0.8E0, 0.6E0, 1.0E0, + 0.0E0, 1.0E0/ DATA DS1/0.8E0, 0.6E0, 0.8E0, -0.6E0, 0.8E0, 0.0E0, + 1.0E0, 0.0E0/ DATA DATRUE/0.5E0, 0.5E0, 0.5E0, -0.5E0, -0.5E0, + 0.0E0, 1.0E0, 1.0E0/ DATA DBTRUE/0.0E0, 0.6E0, 0.0E0, -0.6E0, 0.0E0, + 0.0E0, 1.0E0, 0.0E0/ DATA D12/4096.0E0/ * .. Executable Statements .. * * Compute true values which cannot be prestored * in decimal notation * DBTRUE(1) = 1.0E0/0.6E0 DBTRUE(3) = -1.0E0/0.6E0 DBTRUE(5) = 1.0E0/0.6E0 * DO 20 K = 1, 8 * .. Set N=K for identification in output if any .. N = K IF (ICASE.EQ.3) THEN * .. SROTG .. IF (K.GT.8) GO TO 40 SA = DA1(K) SB = DB1(K) CALL SROTG(SA,SB,SC,SS) CALL STEST1(SA,DATRUE(K),DATRUE(K),SFAC) CALL STEST1(SB,DBTRUE(K),DBTRUE(K),SFAC) CALL STEST1(SC,DC1(K),DC1(K),SFAC) CALL STEST1(SS,DS1(K),DS1(K),SFAC) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' STOP END IF 20 CONTINUE 40 RETURN END SUBROUTINE CHECK1(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. REAL SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. INTEGER I, LEN, NP1 * .. Local Arrays .. REAL DTRUE1(5), DTRUE3(5), DTRUE5(8,5,2), DV(8,5,2), + SA(10), STEMP(1), STRUE(8), SX(8) INTEGER ITRUE2(5) * .. External Functions .. REAL SASUM, SNRM2 INTEGER ISAMAX EXTERNAL SASUM, SNRM2, ISAMAX * .. External Subroutines .. EXTERNAL ITEST1, SSCAL, STEST, STEST1 * .. Intrinsic Functions .. INTRINSIC MAX * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. 
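*     The tables below drive CHECK1: DV holds the input vectors for
*     each value of N (0 to 4) and each increment (1 and 2), SA supplies
*     the SSCAL scale factors, DTRUE1 and DTRUE3 are the true SNRM2 and
*     SASUM results, DTRUE5 gives the expected vectors after SSCAL, and
*     ITRUE2 the expected ISAMAX indices.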
DATA SA/0.3E0, -1.0E0, 0.0E0, 1.0E0, 0.3E0, 0.3E0, + 0.3E0, 0.3E0, 0.3E0, 0.3E0/ DATA DV/0.1E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, + 2.0E0, 2.0E0, 0.3E0, 3.0E0, 3.0E0, 3.0E0, 3.0E0, + 3.0E0, 3.0E0, 3.0E0, 0.3E0, -0.4E0, 4.0E0, + 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, 0.2E0, + -0.6E0, 0.3E0, 5.0E0, 5.0E0, 5.0E0, 5.0E0, + 5.0E0, 0.1E0, -0.3E0, 0.5E0, -0.1E0, 6.0E0, + 6.0E0, 6.0E0, 6.0E0, 0.1E0, 8.0E0, 8.0E0, 8.0E0, + 8.0E0, 8.0E0, 8.0E0, 8.0E0, 0.3E0, 9.0E0, 9.0E0, + 9.0E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, 0.3E0, 2.0E0, + -0.4E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, + 0.2E0, 3.0E0, -0.6E0, 5.0E0, 0.3E0, 2.0E0, + 2.0E0, 2.0E0, 0.1E0, 4.0E0, -0.3E0, 6.0E0, + -0.5E0, 7.0E0, -0.1E0, 3.0E0/ DATA DTRUE1/0.0E0, 0.3E0, 0.5E0, 0.7E0, 0.6E0/ DATA DTRUE3/0.0E0, 0.3E0, 0.7E0, 1.1E0, 1.0E0/ DATA DTRUE5/0.10E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, + 2.0E0, 2.0E0, 2.0E0, -0.3E0, 3.0E0, 3.0E0, + 3.0E0, 3.0E0, 3.0E0, 3.0E0, 3.0E0, 0.0E0, 0.0E0, + 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, + 0.20E0, -0.60E0, 0.30E0, 5.0E0, 5.0E0, 5.0E0, + 5.0E0, 5.0E0, 0.03E0, -0.09E0, 0.15E0, -0.03E0, + 6.0E0, 6.0E0, 6.0E0, 6.0E0, 0.10E0, 8.0E0, + 8.0E0, 8.0E0, 8.0E0, 8.0E0, 8.0E0, 8.0E0, + 0.09E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, + 9.0E0, 9.0E0, 0.09E0, 2.0E0, -0.12E0, 2.0E0, + 2.0E0, 2.0E0, 2.0E0, 2.0E0, 0.06E0, 3.0E0, + -0.18E0, 5.0E0, 0.09E0, 2.0E0, 2.0E0, 2.0E0, + 0.03E0, 4.0E0, -0.09E0, 6.0E0, -0.15E0, 7.0E0, + -0.03E0, 3.0E0/ DATA ITRUE2/0, 1, 2, 2, 3/ * .. Executable Statements .. DO 80 INCX = 1, 2 DO 60 NP1 = 1, 5 N = NP1 - 1 LEN = 2*MAX(N,1) * .. Set vector arguments .. DO 20 I = 1, LEN SX(I) = DV(I,NP1,INCX) 20 CONTINUE * IF (ICASE.EQ.7) THEN * .. SNRM2 .. STEMP(1) = DTRUE1(NP1) CALL STEST1(SNRM2(N,SX,INCX),STEMP,STEMP,SFAC) ELSE IF (ICASE.EQ.8) THEN * .. SASUM .. STEMP(1) = DTRUE3(NP1) CALL STEST1(SASUM(N,SX,INCX),STEMP,STEMP,SFAC) ELSE IF (ICASE.EQ.9) THEN * .. SSCAL .. CALL SSCAL(N,SA((INCX-1)*5+NP1),SX,INCX) DO 40 I = 1, LEN STRUE(I) = DTRUE5(I,NP1,INCX) 40 CONTINUE CALL STEST(LEN,SX,STRUE,STRUE,SFAC) ELSE IF (ICASE.EQ.10) THEN * .. ISAMAX .. CALL ITEST1(ISAMAX(N,SX,INCX),ITRUE2(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' STOP END IF 60 CONTINUE 80 CONTINUE RETURN END SUBROUTINE CHECK2(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. REAL SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. REAL SA, SC, SS INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY * .. Local Arrays .. REAL DT10X(7,4,4), DT10Y(7,4,4), DT7(4,4), + DT8(7,4,4), DT9X(7,4,4), DT9Y(7,4,4), DX1(7), + DY1(7), SSIZE1(4), SSIZE2(14,2), STX(7), STY(7), + SX(7), SY(7) INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) * .. External Functions .. REAL SDOT EXTERNAL SDOT * .. External Subroutines .. EXTERNAL SAXPY, SCOPY, SSWAP, STEST, STEST1 * .. Intrinsic Functions .. INTRINSIC ABS, MIN * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. 
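*     The tables below drive CHECK2: DX1 and DY1 are the base X and Y
*     vectors, NS, INCXS and INCYS give the N and increment
*     combinations, DT7 holds the true SDOT values, DT8 the expected
*     SAXPY results, DT10X and DT10Y the expected vectors after SSWAP
*     and SCOPY, and SSIZE1/SSIZE2 supply the scaling arrays passed to
*     STEST.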
DATA SA/0.3E0/ DATA INCXS/1, 2, -2, -1/ DATA INCYS/1, -2, 1, -2/ DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ DATA NS/0, 1, 2, 4/ DATA DX1/0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.9E0, -0.3E0, + -0.4E0/ DATA DY1/0.5E0, -0.9E0, 0.3E0, 0.7E0, -0.6E0, 0.2E0, + 0.8E0/ DATA SC, SS/0.8E0, 0.6E0/ DATA DT7/0.0E0, 0.30E0, 0.21E0, 0.62E0, 0.0E0, + 0.30E0, -0.07E0, 0.85E0, 0.0E0, 0.30E0, -0.79E0, + -0.74E0, 0.0E0, 0.30E0, 0.33E0, 1.27E0/ DATA DT8/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.68E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.68E0, -0.87E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.68E0, -0.87E0, 0.15E0, + 0.94E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.68E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.35E0, -0.9E0, 0.48E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.38E0, -0.9E0, 0.57E0, 0.7E0, -0.75E0, + 0.2E0, 0.98E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.68E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.35E0, -0.72E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.38E0, + -0.63E0, 0.15E0, 0.88E0, 0.0E0, 0.0E0, 0.0E0, + 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.68E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.68E0, -0.9E0, 0.33E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.68E0, -0.9E0, 0.33E0, 0.7E0, + -0.75E0, 0.2E0, 1.04E0/ DATA DT9X/0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.78E0, -0.46E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.78E0, -0.46E0, -0.22E0, + 1.06E0, 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.78E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.66E0, 0.1E0, -0.1E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.96E0, 0.1E0, -0.76E0, 0.8E0, 0.90E0, + -0.3E0, -0.02E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.78E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.06E0, 0.1E0, + -0.1E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.90E0, + 0.1E0, -0.22E0, 0.8E0, 0.18E0, -0.3E0, -0.02E0, + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.78E0, 0.26E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.78E0, 0.26E0, -0.76E0, 1.12E0, + 0.0E0, 0.0E0, 0.0E0/ DATA DT9Y/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.54E0, + 0.08E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.04E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.7E0, + -0.9E0, -0.12E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.64E0, -0.9E0, -0.30E0, 0.7E0, -0.18E0, 0.2E0, + 0.28E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.7E0, -1.08E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.64E0, -1.26E0, + 0.54E0, 0.20E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.7E0, + -0.18E0, 0.2E0, 0.16E0/ DATA DT10X/0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.5E0, -0.9E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.5E0, -0.9E0, 0.3E0, 0.7E0, + 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.3E0, 0.1E0, 0.5E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.8E0, 0.1E0, -0.6E0, + 0.8E0, 0.3E0, -0.3E0, 0.5E0, 0.6E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.9E0, + 
0.1E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.7E0, + 0.1E0, 0.3E0, 0.8E0, -0.9E0, -0.3E0, 0.5E0, + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.5E0, 0.3E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.5E0, 0.3E0, -0.6E0, 0.8E0, 0.0E0, 0.0E0, + 0.0E0/ DATA DT10Y/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.6E0, 0.1E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.0E0, + 0.0E0, 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, -0.5E0, -0.9E0, 0.6E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, -0.4E0, -0.9E0, 0.9E0, + 0.7E0, -0.5E0, 0.2E0, 0.6E0, 0.5E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.5E0, + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + -0.4E0, 0.9E0, -0.5E0, 0.6E0, 0.0E0, 0.0E0, + 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.6E0, -0.9E0, 0.1E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.6E0, -0.9E0, 0.1E0, 0.7E0, + -0.5E0, 0.2E0, 0.8E0/ DATA SSIZE1/0.0E0, 0.3E0, 1.6E0, 3.2E0/ DATA SSIZE2/0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + 1.17E0, 1.17E0, 1.17E0/ * .. Executable Statements .. * DO 120 KI = 1, 4 INCX = INCXS(KI) INCY = INCYS(KI) MX = ABS(INCX) MY = ABS(INCY) * DO 100 KN = 1, 4 N = NS(KN) KSIZE = MIN(2,KN) LENX = LENS(KN,MX) LENY = LENS(KN,MY) * .. Initialize all argument arrays .. DO 20 I = 1, 7 SX(I) = DX1(I) SY(I) = DY1(I) 20 CONTINUE * IF (ICASE.EQ.1) THEN * .. SDOT .. CALL STEST1(SDOT(N,SX,INCX,SY,INCY),DT7(KN,KI),SSIZE1(KN) + ,SFAC) ELSE IF (ICASE.EQ.2) THEN * .. SAXPY .. CALL SAXPY(N,SA,SX,INCX,SY,INCY) DO 40 J = 1, LENY STY(J) = DT8(J,KN,KI) 40 CONTINUE CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) ELSE IF (ICASE.EQ.5) THEN * .. SCOPY .. DO 60 I = 1, 7 STY(I) = DT10Y(I,KN,KI) 60 CONTINUE CALL SCOPY(N,SX,INCX,SY,INCY) CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0) ELSE IF (ICASE.EQ.6) THEN * .. SSWAP .. CALL SSWAP(N,SX,INCX,SY,INCY) DO 80 I = 1, 7 STX(I) = DT10X(I,KN,KI) STY(I) = DT10Y(I,KN,KI) 80 CONTINUE CALL STEST(LENX,SX,STX,SSIZE2(1,1),1.0E0) CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' STOP END IF 100 CONTINUE 120 CONTINUE RETURN END SUBROUTINE CHECK3(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. REAL SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. REAL SA, SC, SS INTEGER I, K, KI, KN, KSIZE, LENX, LENY, MX, MY * .. Local Arrays .. REAL COPYX(5), COPYY(5), DT9X(7,4,4), DT9Y(7,4,4), + DX1(7), DY1(7), MWPC(11), MWPS(11), MWPSTX(5), + MWPSTY(5), MWPTX(11,5), MWPTY(11,5), MWPX(5), + MWPY(5), SSIZE2(14,2), STX(7), STY(7), SX(7), + SY(7) INTEGER INCXS(4), INCYS(4), LENS(4,2), MWPINX(11), + MWPINY(11), MWPN(11), NS(4) * .. External Subroutines .. EXTERNAL SROT, STEST * .. Intrinsic Functions .. INTRINSIC ABS, MIN * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. 
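*     The tables below drive CHECK3: DT9X and DT9Y hold the expected
*     results of SROT with C = 0.8 and S = 0.6 for each N and increment
*     combination, and the MWP* arrays set up after the main loop
*     define further rotation cases with (C,S) equal to (1,0), (0,1)
*     and (0,-1), mixed increments, and their expected outputs.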
DATA SA/0.3E0/ DATA INCXS/1, 2, -2, -1/ DATA INCYS/1, -2, 1, -2/ DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ DATA NS/0, 1, 2, 4/ DATA DX1/0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.9E0, -0.3E0, + -0.4E0/ DATA DY1/0.5E0, -0.9E0, 0.3E0, 0.7E0, -0.6E0, 0.2E0, + 0.8E0/ DATA SC, SS/0.8E0, 0.6E0/ DATA DT9X/0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.78E0, -0.46E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.78E0, -0.46E0, -0.22E0, + 1.06E0, 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.78E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.66E0, 0.1E0, -0.1E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.96E0, 0.1E0, -0.76E0, 0.8E0, 0.90E0, + -0.3E0, -0.02E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.78E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.06E0, 0.1E0, + -0.1E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.90E0, + 0.1E0, -0.22E0, 0.8E0, 0.18E0, -0.3E0, -0.02E0, + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.78E0, 0.26E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.78E0, 0.26E0, -0.76E0, 1.12E0, + 0.0E0, 0.0E0, 0.0E0/ DATA DT9Y/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.54E0, + 0.08E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.04E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.7E0, + -0.9E0, -0.12E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.64E0, -0.9E0, -0.30E0, 0.7E0, -0.18E0, 0.2E0, + 0.28E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.7E0, -1.08E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.64E0, -1.26E0, + 0.54E0, 0.20E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.7E0, + -0.18E0, 0.2E0, 0.16E0/ DATA SSIZE2/0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + 1.17E0, 1.17E0, 1.17E0/ * .. Executable Statements .. * DO 60 KI = 1, 4 INCX = INCXS(KI) INCY = INCYS(KI) MX = ABS(INCX) MY = ABS(INCY) * DO 40 KN = 1, 4 N = NS(KN) KSIZE = MIN(2,KN) LENX = LENS(KN,MX) LENY = LENS(KN,MY) * IF (ICASE.EQ.4) THEN * .. SROT .. 
DO 20 I = 1, 7 SX(I) = DX1(I) SY(I) = DY1(I) STX(I) = DT9X(I,KN,KI) STY(I) = DT9Y(I,KN,KI) 20 CONTINUE CALL SROT(N,SX,INCX,SY,INCY,SC,SS) CALL STEST(LENX,SX,STX,SSIZE2(1,KSIZE),SFAC) CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' STOP END IF 40 CONTINUE 60 CONTINUE * MWPC(1) = 1 DO 80 I = 2, 11 MWPC(I) = 0 80 CONTINUE MWPS(1) = 0 DO 100 I = 2, 6 MWPS(I) = 1 100 CONTINUE DO 120 I = 7, 11 MWPS(I) = -1 120 CONTINUE MWPINX(1) = 1 MWPINX(2) = 1 MWPINX(3) = 1 MWPINX(4) = -1 MWPINX(5) = 1 MWPINX(6) = -1 MWPINX(7) = 1 MWPINX(8) = 1 MWPINX(9) = -1 MWPINX(10) = 1 MWPINX(11) = -1 MWPINY(1) = 1 MWPINY(2) = 1 MWPINY(3) = -1 MWPINY(4) = -1 MWPINY(5) = 2 MWPINY(6) = 1 MWPINY(7) = 1 MWPINY(8) = -1 MWPINY(9) = -1 MWPINY(10) = 2 MWPINY(11) = 1 DO 140 I = 1, 11 MWPN(I) = 5 140 CONTINUE MWPN(5) = 3 MWPN(10) = 3 DO 160 I = 1, 5 MWPX(I) = I MWPY(I) = I MWPTX(1,I) = I MWPTY(1,I) = I MWPTX(2,I) = I MWPTY(2,I) = -I MWPTX(3,I) = 6 - I MWPTY(3,I) = I - 6 MWPTX(4,I) = I MWPTY(4,I) = -I MWPTX(6,I) = 6 - I MWPTY(6,I) = I - 6 MWPTX(7,I) = -I MWPTY(7,I) = I MWPTX(8,I) = I - 6 MWPTY(8,I) = 6 - I MWPTX(9,I) = -I MWPTY(9,I) = I MWPTX(11,I) = I - 6 MWPTY(11,I) = 6 - I 160 CONTINUE MWPTX(5,1) = 1 MWPTX(5,2) = 3 MWPTX(5,3) = 5 MWPTX(5,4) = 4 MWPTX(5,5) = 5 MWPTY(5,1) = -1 MWPTY(5,2) = 2 MWPTY(5,3) = -2 MWPTY(5,4) = 4 MWPTY(5,5) = -3 MWPTX(10,1) = -1 MWPTX(10,2) = -3 MWPTX(10,3) = -5 MWPTX(10,4) = 4 MWPTX(10,5) = 5 MWPTY(10,1) = 1 MWPTY(10,2) = 2 MWPTY(10,3) = 2 MWPTY(10,4) = 4 MWPTY(10,5) = 3 DO 200 I = 1, 11 INCX = MWPINX(I) INCY = MWPINY(I) DO 180 K = 1, 5 COPYX(K) = MWPX(K) COPYY(K) = MWPY(K) MWPSTX(K) = MWPTX(I,K) MWPSTY(K) = MWPTY(I,K) 180 CONTINUE CALL SROT(MWPN(I),COPYX,INCX,COPYY,INCY,MWPC(I),MWPS(I)) CALL STEST(5,COPYX,MWPSTX,MWPSTX,SFAC) CALL STEST(5,COPYY,MWPSTY,MWPSTY,SFAC) 200 CONTINUE RETURN END SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) * ********************************* STEST ************************** * * THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO * SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE * NEGLIGIBLE. * * C. L. LAWSON, JPL, 1974 DEC 10 * * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. REAL SFAC INTEGER LEN * .. Array Arguments .. REAL SCOMP(LEN), SSIZE(LEN), STRUE(LEN) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. REAL SD INTEGER I * .. External Functions .. REAL SDIFF EXTERNAL SDIFF * .. Intrinsic Functions .. INTRINSIC ABS * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Executable Statements .. * DO 40 I = 1, LEN SD = SCOMP(I) - STRUE(I) IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0E0) + GO TO 40 * * HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). * IF ( .NOT. PASS) GO TO 20 * PRINT FAIL MESSAGE AND HEADER. PASS = .FALSE. WRITE (NOUT,99999) WRITE (NOUT,99998) 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + STRUE(I), SD, SSIZE(I) 40 CONTINUE RETURN * 99999 FORMAT (' FAIL') 99998 FORMAT (/' CASE N INCX INCY MODE I ', + ' COMP(I) TRUE(I) DIFFERENCE', + ' SIZE(I)',/1X) 99997 FORMAT (1X,I4,I3,3I5,I3,2E36.8,2E12.4) END SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * * THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * * C.L. LAWSON, JPL, 1978 DEC 6 * * .. Scalar Arguments .. REAL SCOMP1, SFAC, STRUE1 * .. 
Array Arguments .. REAL SSIZE(*) * .. Local Arrays .. REAL SCOMP(1), STRUE(1) * .. External Subroutines .. EXTERNAL STEST * .. Executable Statements .. * SCOMP(1) = SCOMP1 STRUE(1) = STRUE1 CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) * RETURN END REAL FUNCTION SDIFF(SA,SB) * ********************************* SDIFF ************************** * COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 * * .. Scalar Arguments .. REAL SA, SB * .. Executable Statements .. SDIFF = SA - SB RETURN END SUBROUTINE ITEST1(ICOMP,ITRUE) * ********************************* ITEST1 ************************* * * THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR * EQUALITY. * C. L. LAWSON, JPL, 1974 DEC 10 * * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. INTEGER ICOMP, ITRUE * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. INTEGER ID * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Executable Statements .. * IF (ICOMP.EQ.ITRUE) GO TO 40 * * HERE ICOMP IS NOT EQUAL TO ITRUE. * IF ( .NOT. PASS) GO TO 20 * PRINT FAIL MESSAGE AND HEADER. PASS = .FALSE. WRITE (NOUT,99999) WRITE (NOUT,99998) 20 ID = ICOMP - ITRUE WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID 40 CONTINUE RETURN * 99999 FORMAT (' FAIL') 99998 FORMAT (/' CASE N INCX INCY MODE ', + ' COMP TRUE DIFFERENCE', + /1X) 99997 FORMAT (1X,I4,I3,3I5,2I36,I12) END OpenBLAS-0.2.20/test/sblat2.dat000066400000000000000000000026721313527062700160130ustar00rootroot00000000000000'SBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE 6 UNIT NUMBER OF SUMMARY FILE 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. F LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 16.0 THRESHOLD VALUE OF TEST RATIO 7 NUMBER OF VALUES OF N 0 1 2 3 7 31 63 VALUES OF N 4 NUMBER OF VALUES OF K 0 1 2 4 VALUES OF K 4 NUMBER OF VALUES OF INCX AND INCY 1 2 -1 -2 VALUES OF INCX AND INCY 3 NUMBER OF VALUES OF ALPHA 0.0 1.0 0.7 VALUES OF ALPHA 3 NUMBER OF VALUES OF BETA 0.0 1.0 0.9 VALUES OF BETA SGEMV T PUT F FOR NO TEST. SAME COLUMNS. SGBMV T PUT F FOR NO TEST. SAME COLUMNS. SSYMV T PUT F FOR NO TEST. SAME COLUMNS. SSBMV T PUT F FOR NO TEST. SAME COLUMNS. SSPMV T PUT F FOR NO TEST. SAME COLUMNS. STRMV T PUT F FOR NO TEST. SAME COLUMNS. STBMV T PUT F FOR NO TEST. SAME COLUMNS. STPMV T PUT F FOR NO TEST. SAME COLUMNS. STRSV T PUT F FOR NO TEST. SAME COLUMNS. STBSV T PUT F FOR NO TEST. SAME COLUMNS. STPSV T PUT F FOR NO TEST. SAME COLUMNS. SGER T PUT F FOR NO TEST. SAME COLUMNS. SSYR T PUT F FOR NO TEST. SAME COLUMNS. SSPR T PUT F FOR NO TEST. SAME COLUMNS. SSYR2 T PUT F FOR NO TEST. SAME COLUMNS. SSPR2 T PUT F FOR NO TEST. SAME COLUMNS. OpenBLAS-0.2.20/test/sblat2.f000066400000000000000000003313221313527062700154650ustar00rootroot00000000000000 PROGRAM SBLAT2 * * Test program for the REAL Level 2 Blas. * * The program must be driven by a short data file. The first 18 records * of the file are read using list-directed input, the last 16 records * are read using the format ( A6, L2 ). An annotated example of a data * file can be obtained by deleting the first 3 characters from the * following 34 lines: * 'SBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE * 6 UNIT NUMBER OF SUMMARY FILE * 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. 
* F LOGICAL FLAG, T TO STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 16.0 THRESHOLD VALUE OF TEST RATIO * 6 NUMBER OF VALUES OF N * 0 1 2 3 5 9 VALUES OF N * 4 NUMBER OF VALUES OF K * 0 1 2 4 VALUES OF K * 4 NUMBER OF VALUES OF INCX AND INCY * 1 2 -1 -2 VALUES OF INCX AND INCY * 3 NUMBER OF VALUES OF ALPHA * 0.0 1.0 0.7 VALUES OF ALPHA * 3 NUMBER OF VALUES OF BETA * 0.0 1.0 0.9 VALUES OF BETA * SGEMV T PUT F FOR NO TEST. SAME COLUMNS. * SGBMV T PUT F FOR NO TEST. SAME COLUMNS. * SSYMV T PUT F FOR NO TEST. SAME COLUMNS. * SSBMV T PUT F FOR NO TEST. SAME COLUMNS. * SSPMV T PUT F FOR NO TEST. SAME COLUMNS. * STRMV T PUT F FOR NO TEST. SAME COLUMNS. * STBMV T PUT F FOR NO TEST. SAME COLUMNS. * STPMV T PUT F FOR NO TEST. SAME COLUMNS. * STRSV T PUT F FOR NO TEST. SAME COLUMNS. * STBSV T PUT F FOR NO TEST. SAME COLUMNS. * STPSV T PUT F FOR NO TEST. SAME COLUMNS. * SGER T PUT F FOR NO TEST. SAME COLUMNS. * SSYR T PUT F FOR NO TEST. SAME COLUMNS. * SSPR T PUT F FOR NO TEST. SAME COLUMNS. * SSYR2 T PUT F FOR NO TEST. SAME COLUMNS. * SSPR2 T PUT F FOR NO TEST. SAME COLUMNS. * * See: * * Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. * An extended set of Fortran Basic Linear Algebra Subprograms. * * Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics * and Computer Science Division, Argonne National Laboratory, * 9700 South Cass Avenue, Argonne, Illinois 60439, US. * * Or * * NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms * Group Ltd., NAG Central Office, 256 Banbury Road, Oxford * OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st * Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. * * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 16 ) REAL ZERO, HALF, ONE PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, $ NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. REAL EPS, ERR, THRESH INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, $ NOUT, NTRA LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, $ TSTERR CHARACTER*1 TRANS CHARACTER*6 SNAMET CHARACTER*32 SNAPS, SUMMRY * .. Local Arrays .. REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), $ G( NMAX ), X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( 2*NMAX ) INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) LOGICAL LTEST( NSUBS ) CHARACTER*6 SNAMES( NSUBS ) * .. External Functions .. REAL SDIFF LOGICAL LSE EXTERNAL SDIFF, LSE * .. External Subroutines .. EXTERNAL SCHK1, SCHK2, SCHK3, SCHK4, SCHK5, SCHK6, $ SCHKE, SMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR COMMON /SRNAMC/SRNAMT * .. Data statements .. DATA SNAMES/'SGEMV ', 'SGBMV ', 'SSYMV ', 'SSBMV ', $ 'SSPMV ', 'STRMV ', 'STBMV ', 'STPMV ', $ 'STRSV ', 'STBSV ', 'STPSV ', 'SGER ', $ 'SSYR ', 'SSPR ', 'SSYR2 ', 'SSPR2 '/ * .. Executable Statements .. * * Read name and unit number for summary output file and open file. 
* READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. * READ( NIN, FMT = * )SNAPS READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI REWI = REWI.AND.TRACE * Read the flag that directs stopping on any failure. READ( NIN, FMT = * )SFATAL * Read the flag that indicates whether error exits are to be tested. READ( NIN, FMT = * )TSTERR * Read the threshold value of the test ratio READ( NIN, FMT = * )THRESH * * Read and check the parameter values for the tests. * * Values of N READ( NIN, FMT = * )NIDIM IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN WRITE( NOUT, FMT = 9997 )'N', NIDMAX GO TO 230 END IF READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) DO 10 I = 1, NIDIM IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN WRITE( NOUT, FMT = 9996 )NMAX GO TO 230 END IF 10 CONTINUE * Values of K READ( NIN, FMT = * )NKB IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN WRITE( NOUT, FMT = 9997 )'K', NKBMAX GO TO 230 END IF READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) DO 20 I = 1, NKB IF( KB( I ).LT.0 )THEN WRITE( NOUT, FMT = 9995 ) GO TO 230 END IF 20 CONTINUE * Values of INCX and INCY READ( NIN, FMT = * )NINC IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX GO TO 230 END IF READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) DO 30 I = 1, NINC IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN WRITE( NOUT, FMT = 9994 )INCMAX GO TO 230 END IF 30 CONTINUE * Values of ALPHA READ( NIN, FMT = * )NALF IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX GO TO 230 END IF READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) * Values of BETA READ( NIN, FMT = * )NBET IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX GO TO 230 END IF READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) * * Report values of parameters. * WRITE( NOUT, FMT = 9993 ) WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) IF( .NOT.TSTERR )THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )THRESH WRITE( NOUT, FMT = * ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 40 I = 1, NSUBS LTEST( I ) = .FALSE. 40 CONTINUE 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT DO 60 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET STOP 70 LTEST( I ) = LTESTT GO TO 50 * 80 CONTINUE CLOSE ( NIN ) * * Compute EPS (the machine precision). * EPS = ONE 90 CONTINUE IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) $ GO TO 100 EPS = HALF*EPS GO TO 90 100 CONTINUE EPS = EPS + EPS WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of SMVCH using exact data. * N = MIN( 32, NMAX ) DO 120 J = 1, N DO 110 I = 1, N A( I, J ) = MAX( I - J + 1, 0 ) 110 CONTINUE X( J ) = J Y( J ) = ZERO 120 CONTINUE DO 130 J = 1, N YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 130 CONTINUE * YY holds the exact result. On exit from SMVCH YT holds * the result computed by SMVCH. TRANS = 'N' CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, $ YY, EPS, ERR, FATAL, NOUT, .TRUE. 
) SAME = LSE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR STOP END IF TRANS = 'T' CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LSE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR STOP END IF * * Test each subroutine in turn. * DO 210 ISNUM = 1, NSUBS WRITE( NOUT, FMT = * ) IF( .NOT.LTEST( ISNUM ) )THEN * Subprogram is not to be tested. WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) ELSE SRNAMT = SNAMES( ISNUM ) * Test error exits. IF( TSTERR )THEN CALL SCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) WRITE( NOUT, FMT = * ) END IF * Test computations. INFOT = 0 OK = .TRUE. FATAL = .FALSE. GO TO ( 140, 140, 150, 150, 150, 160, 160, $ 160, 160, 160, 160, 170, 180, 180, $ 190, 190 )ISNUM * Test SGEMV, 01, and SGBMV, 02. 140 CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G ) GO TO 200 * Test SSYMV, 03, SSBMV, 04, and SSPMV, 05. 150 CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G ) GO TO 200 * Test STRMV, 06, STBMV, 07, STPMV, 08, * STRSV, 09, STBSV, 10, and STPSV, 11. 160 CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z ) GO TO 200 * Test SGER, 12. 170 CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z ) GO TO 200 * Test SSYR, 13, and SSPR, 14. 180 CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z ) GO TO 200 * Test SSYR2, 15, and SSPR2, 16. 
190 CALL SCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z ) * 200 IF( FATAL.AND.SFATAL ) $ GO TO 220 END IF 210 CONTINUE WRITE( NOUT, FMT = 9982 ) GO TO 240 * 220 CONTINUE WRITE( NOUT, FMT = 9981 ) GO TO 240 * 230 CONTINUE WRITE( NOUT, FMT = 9987 ) * 240 CONTINUE IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) STOP * 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', $ 'S THAN', F8.2 ) 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', $ 'THAN ', I2 ) 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', $ I2 ) 9993 FORMAT( ' TESTS OF THE REAL LEVEL 2 BLAS', //' THE F', $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) 9992 FORMAT( ' FOR N ', 9I6 ) 9991 FORMAT( ' FOR K ', 7I6 ) 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) 9989 FORMAT( ' FOR ALPHA ', 7F6.1 ) 9988 FORMAT( ' FOR BETA ', 7F6.1 ) 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', $ /' ******* TESTS ABANDONED *******' ) 9986 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', $ 'ESTS ABANDONED *******' ) 9985 FORMAT( ' ERROR IN SMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', $ 'ATED WRONGLY.', /' SMVCH WAS CALLED WITH TRANS = ', A1, $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' $ , /' ******* TESTS ABANDONED *******' ) 9984 FORMAT( A6, L2 ) 9983 FORMAT( 1X, A6, ' WAS NOT TESTED' ) 9982 FORMAT( /' END OF TESTS' ) 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) * * End of SBLAT2. * END SUBROUTINE SCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, $ XS, Y, YY, YS, YT, G ) * * Tests SGEMV and SGBMV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. REAL ZERO, HALF PARAMETER ( ZERO = 0.0, HALF = 0.5 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, $ NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), $ X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, $ NL, NS LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN CHARACTER*1 TRANS, TRANSS CHARACTER*3 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL SGBMV, SGEMV, SMAKE, SMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'NTC'/ * .. Executable Statements .. 
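*     For every combination of dimensions, band widths, TRANS value,
*     increments, ALPHA and BETA generated below, the input arguments
*     are saved, SGEMV or SGBMV is called, LSE and LSERES check that
*     only Y was modified, and SMVCH recomputes the product in-line and
*     compares the test ratio against THRESH.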
FULL = SNAME( 3: 3 ).EQ.'E' BANDED = SNAME( 3: 3 ).EQ.'B' * Define the number of arguments. IF( FULL )THEN NARGS = 11 ELSE IF( BANDED )THEN NARGS = 13 END IF * NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 120 IN = 1, NIDIM N = IDIM( IN ) ND = N/2 + 1 * DO 110 IM = 1, 2 IF( IM.EQ.1 ) $ M = MAX( N - ND, 0 ) IF( IM.EQ.2 ) $ M = MIN( N + ND, NMAX ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IKU = 1, NK IF( BANDED )THEN KU = KB( IKU ) KL = MAX( KU - 1, 0 ) ELSE KU = N - 1 KL = M - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = KL + KU + 1 ELSE LDA = M END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 LAA = LDA*N NULL = N.LE.0.OR.M.LE.0 * * Generate the matrix A. * TRANSL = ZERO CALL SMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, AA, $ LDA, KL, KU, RESET, TRANSL ) * DO 90 IC = 1, 3 TRANS = ICH( IC: IC ) TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' * IF( TRAN )THEN ML = N NL = M ELSE ML = M NL = N END IF * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*NL * * Generate the vector X. * TRANSL = HALF CALL SMAKE( 'GE', ' ', ' ', 1, NL, X, 1, XX, $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) IF( NL.GT.1 )THEN X( NL/2 ) = ZERO XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO END IF * DO 70 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*ML * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the vector Y. * TRANSL = ZERO CALL SMAKE( 'GE', ' ', ' ', 1, ML, Y, 1, $ YY, ABS( INCY ), 0, ML - 1, $ RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * TRANSS = TRANS MS = M NS = N KLS = KL KUS = KU ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX BLS = BETA DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ TRANS, M, N, ALPHA, LDA, INCX, BETA, $ INCY IF( REWI ) $ REWIND NTRA CALL SGEMV( TRANS, M, N, ALPHA, AA, $ LDA, XX, INCX, BETA, YY, $ INCY ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ TRANS, M, N, KL, KU, ALPHA, LDA, $ INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL SGBMV( TRANS, M, N, KL, KU, ALPHA, $ AA, LDA, XX, INCX, BETA, $ YY, INCY ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 130 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = TRANS.EQ.TRANSS ISAME( 2 ) = MS.EQ.M ISAME( 3 ) = NS.EQ.N IF( FULL )THEN ISAME( 4 ) = ALS.EQ.ALPHA ISAME( 5 ) = LSE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA ISAME( 7 ) = LSE( XS, XX, LX ) ISAME( 8 ) = INCXS.EQ.INCX ISAME( 9 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 10 ) = LSE( YS, YY, LY ) ELSE ISAME( 10 ) = LSERES( 'GE', ' ', 1, $ ML, YS, YY, $ ABS( INCY ) ) END IF ISAME( 11 ) = INCYS.EQ.INCY ELSE IF( BANDED )THEN ISAME( 4 ) = KLS.EQ.KL ISAME( 5 ) = KUS.EQ.KU ISAME( 6 ) = ALS.EQ.ALPHA ISAME( 7 ) = LSE( AS, AA, LAA ) ISAME( 8 ) = LDAS.EQ.LDA ISAME( 9 ) = LSE( XS, XX, LX ) ISAME( 10 ) = INCXS.EQ.INCX ISAME( 11 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 12 ) = LSE( YS, YY, LY ) ELSE ISAME( 12 ) = LSERES( 'GE', ' ', 1, $ ML, YS, YY, $ ABS( INCY ) ) END IF ISAME( 13 ) = INCYS.EQ.INCY END IF * * If data was incorrectly changed, report * and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. 
GO TO 130 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL SMVCH( TRANS, M, N, ALPHA, A, $ NMAX, X, INCX, BETA, Y, $ INCY, YT, G, YY, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 130 ELSE * Avoid repeating tests with M.le.0 or * N.le.0. GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 140 * 130 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, TRANS, M, N, ALPHA, LDA, $ INCX, BETA, INCY ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANS, M, N, KL, KU, $ ALPHA, LDA, INCX, BETA, INCY END IF * 140 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 4( I3, ',' ), F4.1, $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), F4.1, $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, $ ') .' ) 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK1. * END SUBROUTINE SCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, $ XS, Y, YY, YS, YT, G ) * * Tests SSYMV, SSBMV and SSPMV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. REAL ZERO, HALF PARAMETER ( ZERO = 0.0, HALF = 0.5 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, $ NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), $ X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, $ N, NARGS, NC, NK, NS LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME CHARACTER*1 UPLO, UPLOS CHARACTER*2 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL SMAKE, SMVCH, SSBMV, SSPMV, SSYMV * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 3: 3 ).EQ.'Y' BANDED = SNAME( 3: 3 ).EQ.'B' PACKED = SNAME( 3: 3 ).EQ.'P' * Define the number of arguments. IF( FULL )THEN NARGS = 10 ELSE IF( BANDED )THEN NARGS = 11 ELSE IF( PACKED )THEN NARGS = 9 END IF * NC = 0 RESET = .TRUE. 
ERRMAX = ZERO * DO 110 IN = 1, NIDIM N = IDIM( IN ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IK = 1, NK IF( BANDED )THEN K = KB( IK ) ELSE K = N - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = K + 1 ELSE LDA = N END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF NULL = N.LE.0 * DO 90 IC = 1, 2 UPLO = ICH( IC: IC ) * * Generate the matrix A. * TRANSL = ZERO CALL SMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, AA, $ LDA, K, K, RESET, TRANSL ) * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL SMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 70 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the vector Y. * TRANSL = ZERO CALL SMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, $ TRANSL ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * UPLOS = UPLO NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX BLS = BETA DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ UPLO, N, ALPHA, LDA, INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL SSYMV( UPLO, N, ALPHA, AA, LDA, XX, $ INCX, BETA, YY, INCY ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ UPLO, N, K, ALPHA, LDA, INCX, BETA, $ INCY IF( REWI ) $ REWIND NTRA CALL SSBMV( UPLO, N, K, ALPHA, AA, LDA, $ XX, INCX, BETA, YY, INCY ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ UPLO, N, ALPHA, INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL SSPMV( UPLO, N, ALPHA, AA, XX, INCX, $ BETA, YY, INCY ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N IF( FULL )THEN ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LSE( AS, AA, LAA ) ISAME( 5 ) = LDAS.EQ.LDA ISAME( 6 ) = LSE( XS, XX, LX ) ISAME( 7 ) = INCXS.EQ.INCX ISAME( 8 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 9 ) = LSE( YS, YY, LY ) ELSE ISAME( 9 ) = LSERES( 'GE', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 10 ) = INCYS.EQ.INCY ELSE IF( BANDED )THEN ISAME( 3 ) = KS.EQ.K ISAME( 4 ) = ALS.EQ.ALPHA ISAME( 5 ) = LSE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA ISAME( 7 ) = LSE( XS, XX, LX ) ISAME( 8 ) = INCXS.EQ.INCX ISAME( 9 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 10 ) = LSE( YS, YY, LY ) ELSE ISAME( 10 ) = LSERES( 'GE', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 11 ) = INCYS.EQ.INCY ELSE IF( PACKED )THEN ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LSE( AS, AA, LAA ) ISAME( 5 ) = LSE( XS, XX, LX ) ISAME( 6 ) = INCXS.EQ.INCX ISAME( 7 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 8 ) = LSE( YS, YY, LY ) ELSE ISAME( 8 ) = LSERES( 'GE', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 9 ) = INCYS.EQ.INCY END IF * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. 
GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL SMVCH( 'N', N, N, ALPHA, A, NMAX, X, $ INCX, BETA, Y, INCY, YT, G, $ YY, EPS, ERR, FATAL, NOUT, $ .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 120 ELSE * Avoid repeating tests with N.le.0 GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, LDA, INCX, $ BETA, INCY ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, K, ALPHA, LDA, $ INCX, BETA, INCY ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, N, ALPHA, INCX, $ BETA, INCY END IF * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', AP', $ ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), F4.1, $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, $ ') .' ) 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', A,', $ I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK2. * END SUBROUTINE SCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z ) * * Tests STRMV, STBMV, STPMV, STRSV, STBSV and STPSV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. REAL ZERO, HALF, ONE PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), $ XS( NMAX*INCMAX ), XT( NMAX ), $ XX( NMAX*INCMAX ), Z( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. REAL ERR, ERRMAX, TRANSL INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS CHARACTER*2 ICHD, ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL SMAKE, SMVCH, STBMV, STBSV, STPMV, STPSV, $ STRMV, STRSV * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ * .. Executable Statements .. FULL = SNAME( 3: 3 ).EQ.'R' BANDED = SNAME( 3: 3 ).EQ.'B' PACKED = SNAME( 3: 3 ).EQ.'P' * Define the number of arguments. 
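*     NARGS is the argument count of the routine under test: 8 for the
*     full triangular routines (STRMV, STRSV), 9 for the banded ones
*     (STBMV, STBSV) and 7 for the packed ones (STPMV, STPSV).  It sets
*     how many saved arguments are compared after each call.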
IF( FULL )THEN NARGS = 8 ELSE IF( BANDED )THEN NARGS = 9 ELSE IF( PACKED )THEN NARGS = 7 END IF * NC = 0 RESET = .TRUE. ERRMAX = ZERO * Set up zero vector for SMVCH. DO 10 I = 1, NMAX Z( I ) = ZERO 10 CONTINUE * DO 110 IN = 1, NIDIM N = IDIM( IN ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IK = 1, NK IF( BANDED )THEN K = KB( IK ) ELSE K = N - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = K + 1 ELSE LDA = N END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF NULL = N.LE.0 * DO 90 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * DO 80 ICT = 1, 3 TRANS = ICHT( ICT: ICT ) * DO 70 ICD = 1, 2 DIAG = ICHD( ICD: ICD ) * * Generate the matrix A. * TRANSL = ZERO CALL SMAKE( SNAME( 2: 3 ), UPLO, DIAG, N, N, A, $ NMAX, AA, LDA, K, K, RESET, TRANSL ) * DO 60 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL SMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, $ ABS( INCX ), 0, N - 1, RESET, $ TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS DIAGS = DIAG NS = N KS = K DO 20 I = 1, LAA AS( I ) = AA( I ) 20 CONTINUE LDAS = LDA DO 30 I = 1, LX XS( I ) = XX( I ) 30 CONTINUE INCXS = INCX * * Call the subroutine. * IF( SNAME( 4: 5 ).EQ.'MV' )THEN IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ UPLO, TRANS, DIAG, N, LDA, INCX IF( REWI ) $ REWIND NTRA CALL STRMV( UPLO, TRANS, DIAG, N, AA, LDA, $ XX, INCX ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ UPLO, TRANS, DIAG, N, K, LDA, INCX IF( REWI ) $ REWIND NTRA CALL STBMV( UPLO, TRANS, DIAG, N, K, AA, $ LDA, XX, INCX ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ UPLO, TRANS, DIAG, N, INCX IF( REWI ) $ REWIND NTRA CALL STPMV( UPLO, TRANS, DIAG, N, AA, XX, $ INCX ) END IF ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ UPLO, TRANS, DIAG, N, LDA, INCX IF( REWI ) $ REWIND NTRA CALL STRSV( UPLO, TRANS, DIAG, N, AA, LDA, $ XX, INCX ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ UPLO, TRANS, DIAG, N, K, LDA, INCX IF( REWI ) $ REWIND NTRA CALL STBSV( UPLO, TRANS, DIAG, N, K, AA, $ LDA, XX, INCX ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ UPLO, TRANS, DIAG, N, INCX IF( REWI ) $ REWIND NTRA CALL STPSV( UPLO, TRANS, DIAG, N, AA, XX, $ INCX ) END IF END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. 
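*     Every input argument was copied before the call (UPLOS, TRANSS,
*     DIAGS, NS, KS, AS, LDAS, XS, INCXS).  Arguments that must not be
*     modified are compared exactly (LSE for arrays, .EQ. for scalars);
*     for the in/out vector X, LSERES checks only the elements of XX
*     outside the computed result (the gaps left by a stride larger
*     than one), while for N.LE.0 the whole of XX must be unchanged.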
* ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = TRANS.EQ.TRANSS ISAME( 3 ) = DIAG.EQ.DIAGS ISAME( 4 ) = NS.EQ.N IF( FULL )THEN ISAME( 5 ) = LSE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 7 ) = LSE( XS, XX, LX ) ELSE ISAME( 7 ) = LSERES( 'GE', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 8 ) = INCXS.EQ.INCX ELSE IF( BANDED )THEN ISAME( 5 ) = KS.EQ.K ISAME( 6 ) = LSE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 8 ) = LSE( XS, XX, LX ) ELSE ISAME( 8 ) = LSERES( 'GE', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 9 ) = INCXS.EQ.INCX ELSE IF( PACKED )THEN ISAME( 5 ) = LSE( AS, AA, LAA ) IF( NULL )THEN ISAME( 6 ) = LSE( XS, XX, LX ) ELSE ISAME( 6 ) = LSERES( 'GE', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 7 ) = INCXS.EQ.INCX END IF * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN IF( SNAME( 4: 5 ).EQ.'MV' )THEN * * Check the result. * CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, $ INCX, ZERO, Z, INCX, XT, G, $ XX, EPS, ERR, FATAL, NOUT, $ .TRUE. ) ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN * * Compute approximation to original vector. * DO 50 I = 1, N Z( I ) = XX( 1 + ( I - 1 )* $ ABS( INCX ) ) XX( 1 + ( I - 1 )*ABS( INCX ) ) $ = X( I ) 50 CONTINUE CALL SMVCH( TRANS, N, N, ONE, A, NMAX, Z, $ INCX, ZERO, X, INCX, XT, G, $ XX, EPS, ERR, FATAL, NOUT, $ .FALSE. ) END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 120 ELSE * Avoid repeating tests with N.le.0. GO TO 110 END IF * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, DIAG, N, LDA, $ INCX ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, DIAG, N, K, $ LDA, INCX ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, TRANS, DIAG, N, INCX END IF * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', AP, ', $ 'X,', I2, ') .' ) 9994 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), 2( I3, ',' ), $ ' A,', I3, ', X,', I2, ') .' ) 9993 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', A,', $ I3, ', X,', I2, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK3. * END SUBROUTINE SCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z ) * * Tests SGER. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. REAL ZERO, HALF, ONE PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) * .. Scalar Arguments .. 
REAL EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. REAL ALPHA, ALS, ERR, ERRMAX, TRANSL INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, $ NC, ND, NS LOGICAL NULL, RESET, SAME * .. Local Arrays .. REAL W( 1 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL SGER, SMAKE, SMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Executable Statements .. * Define the number of arguments. NARGS = 9 * NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 120 IN = 1, NIDIM N = IDIM( IN ) ND = N/2 + 1 * DO 110 IM = 1, 2 IF( IM.EQ.1 ) $ M = MAX( N - ND, 0 ) IF( IM.EQ.2 ) $ M = MIN( N + ND, NMAX ) * * Set LDA to 1 more than minimum value if room. LDA = M IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 110 LAA = LDA*N NULL = N.LE.0.OR.M.LE.0 * DO 100 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*M * * Generate the vector X. * TRANSL = HALF CALL SMAKE( 'GE', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), $ 0, M - 1, RESET, TRANSL ) IF( M.GT.1 )THEN X( M/2 ) = ZERO XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO END IF * DO 90 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * * Generate the vector Y. * TRANSL = ZERO CALL SMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN Y( N/2 ) = ZERO YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO END IF * DO 80 IA = 1, NALF ALPHA = ALF( IA ) * * Generate the matrix A. * TRANSL = ZERO CALL SMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * MS = M NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, $ ALPHA, INCX, INCY, LDA IF( REWI ) $ REWIND NTRA CALL SGER( M, N, ALPHA, XX, INCX, YY, INCY, AA, $ LDA ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 140 END IF * * See what data changed inside subroutine. * ISAME( 1 ) = MS.EQ.M ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LSE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX ISAME( 6 ) = LSE( YS, YY, LY ) ISAME( 7 ) = INCYS.EQ.INCY IF( NULL )THEN ISAME( 8 ) = LSE( AS, AA, LAA ) ELSE ISAME( 8 ) = LSERES( 'GE', ' ', M, N, AS, AA, $ LDA ) END IF ISAME( 9 ) = LDAS.EQ.LDA * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 140 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. 
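*     SGER performs A := alpha*x*y' + A, so column j of the result
*     should equal alpha*y(j)*x + A(:,j).  Each column is therefore
*     verified with SMVCH as an M-by-1 matrix-vector product: Z holds
*     x in storage order (reversed when INCX < 0), W(1) holds the
*     element of y belonging to that column (taken from the far end
*     when INCY < 0), and the saved column A(1,J) enters with
*     BETA = ONE.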
* IF( INCX.GT.0 )THEN DO 50 I = 1, M Z( I ) = X( I ) 50 CONTINUE ELSE DO 60 I = 1, M Z( I ) = X( M - I + 1 ) 60 CONTINUE END IF DO 70 J = 1, N IF( INCY.GT.0 )THEN W( 1 ) = Y( J ) ELSE W( 1 ) = Y( N - J + 1 ) END IF CALL SMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, $ ONE, A( 1, J ), 1, YT, G, $ AA( 1 + ( J - 1 )*LDA ), EPS, $ ERR, FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 130 70 CONTINUE ELSE * Avoid repeating tests with M.le.0 or N.le.0. GO TO 110 END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 150 * 130 CONTINUE WRITE( NOUT, FMT = 9995 )J * 140 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA * 150 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( I3, ',' ), F4.1, ', X,', I2, $ ', Y,', I2, ', A,', I3, ') .' ) 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK4. * END SUBROUTINE SCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z ) * * Tests SSYR and SSPR. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. REAL ZERO, HALF, ONE PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. REAL ALPHA, ALS, ERR, ERRMAX, TRANSL INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER CHARACTER*1 UPLO, UPLOS CHARACTER*2 ICH * .. Local Arrays .. REAL W( 1 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL SMAKE, SMVCH, SSPR, SSYR * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 3: 3 ).EQ.'Y' PACKED = SNAME( 3: 3 ).EQ.'P' * Define the number of arguments. IF( FULL )THEN NARGS = 7 ELSE IF( PACKED )THEN NARGS = 6 END IF * NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDA to 1 more than minimum value if room. LDA = N IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. 
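*     For packed ('SP') storage only one triangle of A is stored, so
*     the length of AA that is saved and compared is N*(N+1)/2; for
*     full storage it is the whole LDA*N rectangle.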
IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF * DO 90 IC = 1, 2 UPLO = ICH( IC: IC ) UPPER = UPLO.EQ.'U' * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL SMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), $ 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 70 IA = 1, NALF ALPHA = ALF( IA ) NULL = N.LE.0.OR.ALPHA.EQ.ZERO * * Generate the matrix A. * TRANSL = ZERO CALL SMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, $ ALPHA, INCX, LDA IF( REWI ) $ REWIND NTRA CALL SSYR( UPLO, N, ALPHA, XX, INCX, AA, LDA ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, $ ALPHA, INCX IF( REWI ) $ REWIND NTRA CALL SSPR( UPLO, N, ALPHA, XX, INCX, AA ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LSE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX IF( NULL )THEN ISAME( 6 ) = LSE( AS, AA, LAA ) ELSE ISAME( 6 ) = LSERES( SNAME( 2: 3 ), UPLO, N, N, AS, $ AA, LDA ) END IF IF( .NOT.PACKED )THEN ISAME( 7 ) = LDAS.EQ.LDA END IF * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 30 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 30 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( INCX.GT.0 )THEN DO 40 I = 1, N Z( I ) = X( I ) 40 CONTINUE ELSE DO 50 I = 1, N Z( I ) = X( N - I + 1 ) 50 CONTINUE END IF JA = 1 DO 60 J = 1, N W( 1 ) = Z( J ) IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF CALL SMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, $ 1, ONE, A( JJ, J ), 1, YT, G, $ AA( JA ), EPS, ERR, FATAL, NOUT, $ .TRUE. ) IF( FULL )THEN IF( UPPER )THEN JA = JA + LDA ELSE JA = JA + LDA + 1 END IF ELSE JA = JA + LJ END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 110 60 CONTINUE ELSE * Avoid repeating tests if N.le.0. IF( N.LE.0 ) $ GO TO 100 END IF * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. 
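*     ERRMAX is the largest test ratio returned by SMVCH over the NC
*     calls made above.  The routine passes when ERRMAX stays below the
*     threshold THRESH read from the data file; otherwise the maximum
*     ratio is printed and the result is flagged as suspect.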
* IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 110 CONTINUE WRITE( NOUT, FMT = 9995 )J * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX, LDA ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX END IF * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', $ I2, ', AP) .' ) 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', $ I2, ', A,', I3, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK5. * END SUBROUTINE SCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z ) * * Tests SSYR2 and SSPR2. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. REAL ZERO, HALF, ONE PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. REAL ALPHA, ALS, ERR, ERRMAX, TRANSL INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, $ NARGS, NC, NS LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER CHARACTER*1 UPLO, UPLOS CHARACTER*2 ICH * .. Local Arrays .. REAL W( 2 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL SMAKE, SMVCH, SSPR2, SSYR2 * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 3: 3 ).EQ.'Y' PACKED = SNAME( 3: 3 ).EQ.'P' * Define the number of arguments. IF( FULL )THEN NARGS = 9 ELSE IF( PACKED )THEN NARGS = 8 END IF * NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 140 IN = 1, NIDIM N = IDIM( IN ) * Set LDA to 1 more than minimum value if room. LDA = N IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 140 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF * DO 130 IC = 1, 2 UPLO = ICH( IC: IC ) UPPER = UPLO.EQ.'U' * DO 120 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. 
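*     Test vectors are built by SMAKE as a 1-by-N 'GE' matrix: X holds
*     the column form and XX the strided form with increment ABS(INCX)
*     (band parameters 0 and N-1 keep every element inside the band).
*     TRANSL = HALF is added to each random value from SBEG, and for
*     N > 1 element N/2 is then zeroed in both copies.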
* TRANSL = HALF CALL SMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), $ 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 110 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * * Generate the vector Y. * TRANSL = ZERO CALL SMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN Y( N/2 ) = ZERO YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO END IF * DO 100 IA = 1, NALF ALPHA = ALF( IA ) NULL = N.LE.0.OR.ALPHA.EQ.ZERO * * Generate the matrix A. * TRANSL = ZERO CALL SMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, $ NMAX, AA, LDA, N - 1, N - 1, RESET, $ TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, $ ALPHA, INCX, INCY, LDA IF( REWI ) $ REWIND NTRA CALL SSYR2( UPLO, N, ALPHA, XX, INCX, YY, INCY, $ AA, LDA ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, $ ALPHA, INCX, INCY IF( REWI ) $ REWIND NTRA CALL SSPR2( UPLO, N, ALPHA, XX, INCX, YY, INCY, $ AA ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 160 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LSE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX ISAME( 6 ) = LSE( YS, YY, LY ) ISAME( 7 ) = INCYS.EQ.INCY IF( NULL )THEN ISAME( 8 ) = LSE( AS, AA, LAA ) ELSE ISAME( 8 ) = LSERES( SNAME( 2: 3 ), UPLO, N, N, $ AS, AA, LDA ) END IF IF( .NOT.PACKED )THEN ISAME( 9 ) = LDAS.EQ.LDA END IF * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 160 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( INCX.GT.0 )THEN DO 50 I = 1, N Z( I, 1 ) = X( I ) 50 CONTINUE ELSE DO 60 I = 1, N Z( I, 1 ) = X( N - I + 1 ) 60 CONTINUE END IF IF( INCY.GT.0 )THEN DO 70 I = 1, N Z( I, 2 ) = Y( I ) 70 CONTINUE ELSE DO 80 I = 1, N Z( I, 2 ) = Y( N - I + 1 ) 80 CONTINUE END IF JA = 1 DO 90 J = 1, N W( 1 ) = Z( J, 2 ) W( 2 ) = Z( J, 1 ) IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF CALL SMVCH( 'N', LJ, 2, ALPHA, Z( JJ, 1 ), $ NMAX, W, 1, ONE, A( JJ, J ), 1, $ YT, G, AA( JA ), EPS, ERR, FATAL, $ NOUT, .TRUE. ) IF( FULL )THEN IF( UPPER )THEN JA = JA + LDA ELSE JA = JA + LDA + 1 END IF ELSE JA = JA + LJ END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 150 90 CONTINUE ELSE * Avoid repeating tests with N.le.0. IF( N.LE.0 ) $ GO TO 140 END IF * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * 140 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 170 * 150 CONTINUE WRITE( NOUT, FMT = 9995 )J * 160 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX, $ INCY, LDA ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX, INCY END IF * 170 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', $ I2, ', Y,', I2, ', AP) .' ) 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', $ I2, ', Y,', I2, ', A,', I3, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK6. * END SUBROUTINE SCHKE( ISNUM, SRNAMT, NOUT ) * * Tests the error exits from the Level 2 Blas. * Requires a special version of the error-handling routine XERBLA. * ALPHA, BETA, A, X and Y should not need to be defined. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Local Scalars .. REAL ALPHA, BETA * .. Local Arrays .. REAL A( 1, 1 ), X( 1 ), Y( 1 ) * .. External Subroutines .. EXTERNAL CHKXER, SGBMV, SGEMV, SGER, SSBMV, SSPMV, SSPR, $ SSPR2, SSYMV, SSYR, SSYR2, STBMV, STBSV, STPMV, $ STPSV, STRMV, STRSV * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Executable Statements .. * OK is set to .FALSE. by the special version of XERBLA or by CHKXER * if anything is wrong. OK = .TRUE. * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. 
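*     Error-exit protocol: the computed GO TO below selects the block
*     for routine number ISNUM.  In each block one argument at a time
*     is given an invalid value, INFOT is set to the position of that
*     argument, and the routine is called.  The special XERBLA at the
*     end of this file sets LERR and compares INFO and SRNAME against
*     INFOT and SRNAMT; CHKXER then reports if no error was flagged
*     and resets LERR for the next call.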
GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, $ 90, 100, 110, 120, 130, 140, 150, $ 160 )ISNUM 10 INFOT = 1 CALL SGEMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL SGEMV( 'N', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL SGEMV( 'N', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL SGEMV( 'N', 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL SGEMV( 'N', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL SGEMV( 'N', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 20 INFOT = 1 CALL SGBMV( '/', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL SGBMV( 'N', -1, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL SGBMV( 'N', 0, -1, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL SGBMV( 'N', 0, 0, -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL SGBMV( 'N', 2, 0, 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL SGBMV( 'N', 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL SGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL SGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 30 INFOT = 1 CALL SSYMV( '/', 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL SSYMV( 'U', -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL SSYMV( 'U', 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL SSYMV( 'U', 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL SSYMV( 'U', 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 40 INFOT = 1 CALL SSBMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL SSBMV( 'U', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL SSBMV( 'U', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL SSBMV( 'U', 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL SSBMV( 'U', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL SSBMV( 'U', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 50 INFOT = 1 CALL SSPMV( '/', 0, ALPHA, A, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL SSPMV( 'U', -1, ALPHA, A, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL SSPMV( 'U', 0, ALPHA, A, X, 0, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL SSPMV( 'U', 0, ALPHA, A, X, 1, BETA, Y, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 60 INFOT = 1 CALL STRMV( '/', 'N', 'N', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL STRMV( 'U', '/', 'N', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, 
NOUT, LERR, OK ) INFOT = 3 CALL STRMV( 'U', 'N', '/', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL STRMV( 'U', 'N', 'N', -1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL STRMV( 'U', 'N', 'N', 2, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL STRMV( 'U', 'N', 'N', 0, A, 1, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 70 INFOT = 1 CALL STBMV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL STBMV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL STBMV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL STBMV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL STBMV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL STBMV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL STBMV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 80 INFOT = 1 CALL STPMV( '/', 'N', 'N', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL STPMV( 'U', '/', 'N', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL STPMV( 'U', 'N', '/', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL STPMV( 'U', 'N', 'N', -1, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL STPMV( 'U', 'N', 'N', 0, A, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 90 INFOT = 1 CALL STRSV( '/', 'N', 'N', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL STRSV( 'U', '/', 'N', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL STRSV( 'U', 'N', '/', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL STRSV( 'U', 'N', 'N', -1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL STRSV( 'U', 'N', 'N', 2, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL STRSV( 'U', 'N', 'N', 0, A, 1, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 100 INFOT = 1 CALL STBSV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL STBSV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL STBSV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL STBSV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL STBSV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL STBSV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL STBSV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 110 INFOT = 1 CALL STPSV( '/', 'N', 'N', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL STPSV( 'U', '/', 'N', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL STPSV( 'U', 'N', '/', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL STPSV( 'U', 'N', 'N', -1, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL STPSV( 'U', 'N', 'N', 0, A, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 120 INFOT = 1 CALL SGER( -1, 0, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT 
= 2 CALL SGER( 0, -1, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL SGER( 0, 0, ALPHA, X, 0, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL SGER( 0, 0, ALPHA, X, 1, Y, 0, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL SGER( 2, 0, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 130 INFOT = 1 CALL SSYR( '/', 0, ALPHA, X, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL SSYR( 'U', -1, ALPHA, X, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL SSYR( 'U', 0, ALPHA, X, 0, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL SSYR( 'U', 2, ALPHA, X, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 140 INFOT = 1 CALL SSPR( '/', 0, ALPHA, X, 1, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL SSPR( 'U', -1, ALPHA, X, 1, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL SSPR( 'U', 0, ALPHA, X, 0, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 150 INFOT = 1 CALL SSYR2( '/', 0, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL SSYR2( 'U', -1, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL SSYR2( 'U', 0, ALPHA, X, 0, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL SSYR2( 'U', 0, ALPHA, X, 1, Y, 0, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL SSYR2( 'U', 2, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 170 160 INFOT = 1 CALL SSPR2( '/', 0, ALPHA, X, 1, Y, 1, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL SSPR2( 'U', -1, ALPHA, X, 1, Y, 1, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL SSPR2( 'U', 0, ALPHA, X, 0, Y, 1, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL SSPR2( 'U', 0, ALPHA, X, 1, Y, 0, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) * 170 IF( OK )THEN WRITE( NOUT, FMT = 9999 )SRNAMT ELSE WRITE( NOUT, FMT = 9998 )SRNAMT END IF RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', $ '**' ) * * End of SCHKE. * END SUBROUTINE SMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, $ KU, RESET, TRANSL ) * * Generates values for an M by N matrix A within the bandwidth * defined by KL and KU. * Stores the values in the array AA in the data structure required * by the routine, with unwanted elements set to rogue value. * * TYPE is 'GE', 'GB', 'SY', 'SB', 'SP', 'TR', 'TB' OR 'TP'. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. REAL ZERO, ONE PARAMETER ( ZERO = 0.0, ONE = 1.0 ) REAL ROGUE PARAMETER ( ROGUE = -1.0E10 ) * .. Scalar Arguments .. REAL TRANSL INTEGER KL, KU, LDA, M, N, NMAX LOGICAL RESET CHARACTER*1 DIAG, UPLO CHARACTER*2 TYPE * .. Array Arguments .. REAL A( NMAX, * ), AA( * ) * .. Local Scalars .. INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, KK LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER * .. External Functions .. REAL SBEG EXTERNAL SBEG * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. Executable Statements .. GEN = TYPE( 1: 1 ).EQ.'G' SYM = TYPE( 1: 1 ).EQ.'S' TRI = TYPE( 1: 1 ).EQ.'T' UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' UNIT = TRI.AND.DIAG.EQ.'U' * * Generate data in array A. 
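*     A(I,J) is set to SBEG( RESET ) + TRANSL inside the band defined
*     by KL and KU and to ZERO outside it.  Symmetric types mirror the
*     value into A(J,I); triangular types zero the opposite triangle,
*     add ONE on the diagonal, and force the diagonal to ONE when the
*     matrix is unit triangular.  The values are then copied into AA
*     in the storage scheme selected by TYPE, with unused positions
*     set to the rogue value -1.0E10 so that stray accesses outside
*     the defined part are detectable.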
* DO 20 J = 1, N DO 10 I = 1, M IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) $ THEN IF( ( I.LE.J.AND.J - I.LE.KU ).OR. $ ( I.GE.J.AND.I - J.LE.KL ) )THEN A( I, J ) = SBEG( RESET ) + TRANSL ELSE A( I, J ) = ZERO END IF IF( I.NE.J )THEN IF( SYM )THEN A( J, I ) = A( I, J ) ELSE IF( TRI )THEN A( J, I ) = ZERO END IF END IF END IF 10 CONTINUE IF( TRI ) $ A( J, J ) = A( J, J ) + ONE IF( UNIT ) $ A( J, J ) = ONE 20 CONTINUE * * Store elements in array AS in data structure required by routine. * IF( TYPE.EQ.'GE' )THEN DO 50 J = 1, N DO 30 I = 1, M AA( I + ( J - 1 )*LDA ) = A( I, J ) 30 CONTINUE DO 40 I = M + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 40 CONTINUE 50 CONTINUE ELSE IF( TYPE.EQ.'GB' )THEN DO 90 J = 1, N DO 60 I1 = 1, KU + 1 - J AA( I1 + ( J - 1 )*LDA ) = ROGUE 60 CONTINUE DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) 70 CONTINUE DO 80 I3 = I2, LDA AA( I3 + ( J - 1 )*LDA ) = ROGUE 80 CONTINUE 90 CONTINUE ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN DO 130 J = 1, N IF( UPPER )THEN IBEG = 1 IF( UNIT )THEN IEND = J - 1 ELSE IEND = J END IF ELSE IF( UNIT )THEN IBEG = J + 1 ELSE IBEG = J END IF IEND = N END IF DO 100 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 100 CONTINUE DO 110 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I, J ) 110 CONTINUE DO 120 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 120 CONTINUE 130 CONTINUE ELSE IF( TYPE.EQ.'SB'.OR.TYPE.EQ.'TB' )THEN DO 170 J = 1, N IF( UPPER )THEN KK = KL + 1 IBEG = MAX( 1, KL + 2 - J ) IF( UNIT )THEN IEND = KL ELSE IEND = KL + 1 END IF ELSE KK = 1 IF( UNIT )THEN IBEG = 2 ELSE IBEG = 1 END IF IEND = MIN( KL + 1, 1 + M - J ) END IF DO 140 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 140 CONTINUE DO 150 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) 150 CONTINUE DO 160 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 160 CONTINUE 170 CONTINUE ELSE IF( TYPE.EQ.'SP'.OR.TYPE.EQ.'TP' )THEN IOFF = 0 DO 190 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 180 I = IBEG, IEND IOFF = IOFF + 1 AA( IOFF ) = A( I, J ) IF( I.EQ.J )THEN IF( UNIT ) $ AA( IOFF ) = ROGUE END IF 180 CONTINUE 190 CONTINUE END IF RETURN * * End of SMAKE. * END SUBROUTINE SMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) * * Checks the results of the computational tests. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. REAL ZERO, ONE PARAMETER ( ZERO = 0.0, ONE = 1.0 ) * .. Scalar Arguments .. REAL ALPHA, BETA, EPS, ERR INTEGER INCX, INCY, M, N, NMAX, NOUT LOGICAL FATAL, MV CHARACTER*1 TRANS * .. Array Arguments .. REAL A( NMAX, * ), G( * ), X( * ), Y( * ), YT( * ), $ YY( * ) * .. Local Scalars .. REAL ERRI INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL LOGICAL TRAN * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT * .. Executable Statements .. TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' IF( TRAN )THEN ML = N NL = M ELSE ML = M NL = N END IF IF( INCX.LT.0 )THEN KX = NL INCXL = -1 ELSE KX = 1 INCXL = 1 END IF IF( INCY.LT.0 )THEN KY = ML INCYL = -1 ELSE KY = 1 INCYL = 1 END IF * * Compute expected result in YT using data in A, X and Y. * Compute gauges in G. 
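*     For each element the expected value and a rounding gauge are
*     accumulated in the same loop:
*        yt(i) = alpha*sum_j a(i,j)*x(j) + beta*y(i)
*        g(i)  = abs(alpha)*sum_j abs(a(i,j)*x(j)) + abs(beta*y(i))
*     (with A transposed when TRANS is 'T' or 'C').  The test ratio is
*        err = max_i abs(yt(i) - yy(i)) / (eps*g(i)),
*     skipping the division when g(i) is zero, and the result is
*     reported as less than half accurate when err*sqrt(eps) reaches
*     one.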
* IY = KY DO 30 I = 1, ML YT( IY ) = ZERO G( IY ) = ZERO JX = KX IF( TRAN )THEN DO 10 J = 1, NL YT( IY ) = YT( IY ) + A( J, I )*X( JX ) G( IY ) = G( IY ) + ABS( A( J, I )*X( JX ) ) JX = JX + INCXL 10 CONTINUE ELSE DO 20 J = 1, NL YT( IY ) = YT( IY ) + A( I, J )*X( JX ) G( IY ) = G( IY ) + ABS( A( I, J )*X( JX ) ) JX = JX + INCXL 20 CONTINUE END IF YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) G( IY ) = ABS( ALPHA )*G( IY ) + ABS( BETA*Y( IY ) ) IY = IY + INCYL 30 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO DO 40 I = 1, ML ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS IF( G( I ).NE.ZERO ) $ ERRI = ERRI/G( I ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ GO TO 50 40 CONTINUE * If the loop completes, all results are at least half accurate. GO TO 70 * * Report fatal error. * 50 FATAL = .TRUE. WRITE( NOUT, FMT = 9999 ) DO 60 I = 1, ML IF( MV )THEN WRITE( NOUT, FMT = 9998 )I, YT( I ), $ YY( 1 + ( I - 1 )*ABS( INCY ) ) ELSE WRITE( NOUT, FMT = 9998 )I, $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT(I) END IF 60 CONTINUE * 70 CONTINUE RETURN * 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', $ 'TED RESULT' ) 9998 FORMAT( 1X, I7, 2G18.6 ) * * End of SMVCH. * END LOGICAL FUNCTION LSE( RI, RJ, LR ) * * Tests if two arrays are identical. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER LR * .. Array Arguments .. REAL RI( * ), RJ( * ) * .. Local Scalars .. INTEGER I * .. Executable Statements .. DO 10 I = 1, LR IF( RI( I ).NE.RJ( I ) ) $ GO TO 20 10 CONTINUE LSE = .TRUE. GO TO 30 20 CONTINUE LSE = .FALSE. 30 RETURN * * End of LSE. * END LOGICAL FUNCTION LSERES( TYPE, UPLO, M, N, AA, AS, LDA ) * * Tests if selected elements in two arrays are equal. * * TYPE is 'GE', 'SY' or 'SP'. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER LDA, M, N CHARACTER*1 UPLO CHARACTER*2 TYPE * .. Array Arguments .. REAL AA( LDA, * ), AS( LDA, * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL UPPER * .. Executable Statements .. UPPER = UPLO.EQ.'U' IF( TYPE.EQ.'GE' )THEN DO 20 J = 1, N DO 10 I = M + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 10 CONTINUE 20 CONTINUE ELSE IF( TYPE.EQ.'SY' )THEN DO 50 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 30 I = 1, IBEG - 1 IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 30 CONTINUE DO 40 I = IEND + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 40 CONTINUE 50 CONTINUE END IF * 60 CONTINUE LSERES = .TRUE. GO TO 80 70 CONTINUE LSERES = .FALSE. 80 RETURN * * End of LSERES. * END REAL FUNCTION SBEG( RESET ) * * Generates random numbers uniformly distributed between -0.5 and 0.5. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. LOGICAL RESET * .. Local Scalars .. INTEGER I, IC, MI * .. Save statement .. SAVE I, IC, MI * .. Intrinsic Functions .. INTRINSIC REAL * .. Executable Statements .. IF( RESET )THEN * Initialize local variables. MI = 891 I = 7 IC = 0 RESET = .FALSE. END IF * * The sequence of values of I is bounded between 1 and 999. * If initial I = 1,2,3,6,7 or 9, the period will be 50. 
* If initial I = 4 or 8, the period will be 25. * If initial I = 5, the period will be 10. * IC is used to break up the period by skipping 1 value of I in 6. * IC = IC + 1 10 I = I*MI I = I - 1000*( I/1000 ) IF( IC.GE.5 )THEN IC = 0 GO TO 10 END IF SBEG = REAL( I - 500 )/1001.0 RETURN * * End of SBEG. * END REAL FUNCTION SDIFF( X, Y ) * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * * .. Scalar Arguments .. REAL X, Y * .. Executable Statements .. SDIFF = X - Y RETURN * * End of SDIFF. * END SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) * * Tests whether XERBLA has detected an error when it should. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER INFOT, NOUT LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Executable Statements .. IF( .NOT.LERR )THEN WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT OK = .FALSE. END IF LERR = .FALSE. RETURN * 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', $ 'ETECTED BY ', A6, ' *****' ) * * End of CHKXER. * END SUBROUTINE XERBLA( SRNAME, INFO ) * * This is a special version of XERBLA to be used only as part of * the test program for testing error exits from the Level 2 BLAS * routines. * * XERBLA is an error handler for the Level 2 BLAS routines. * * It is called by the Level 2 BLAS routines if an input parameter is * invalid. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER INFO CHARACTER*6 SRNAME * .. Scalars in Common .. INTEGER INFOT, NOUT LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUT, OK, LERR COMMON /SRNAMC/SRNAMT * .. Executable Statements .. LERR = .TRUE. IF( INFO.NE.INFOT )THEN IF( INFOT.NE.0 )THEN WRITE( NOUT, FMT = 9999 )INFO, INFOT ELSE WRITE( NOUT, FMT = 9997 )INFO END IF OK = .FALSE. END IF IF( SRNAME.NE.SRNAMT )THEN WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT OK = .FALSE. END IF RETURN * 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', $ ' OF ', I2, ' *******' ) 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', $ 'AD OF ', A6, ' *******' ) 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, $ ' *******' ) * * End of XERBLA * END OpenBLAS-0.2.20/test/sblat3.dat000066400000000000000000000015621313527062700160110ustar00rootroot00000000000000'SBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE 6 UNIT NUMBER OF SUMMARY FILE 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. F LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 16.0 THRESHOLD VALUE OF TEST RATIO 6 NUMBER OF VALUES OF N 0 1 2 3 7 31 63 VALUES OF N 3 NUMBER OF VALUES OF ALPHA 0.0 1.0 0.7 VALUES OF ALPHA 3 NUMBER OF VALUES OF BETA 0.0 1.0 1.3 VALUES OF BETA SGEMM T PUT F FOR NO TEST. SAME COLUMNS. SSYMM T PUT F FOR NO TEST. SAME COLUMNS. STRMM T PUT F FOR NO TEST. SAME COLUMNS. STRSM T PUT F FOR NO TEST. SAME COLUMNS. SSYRK T PUT F FOR NO TEST. SAME COLUMNS. SSYR2K T PUT F FOR NO TEST. SAME COLUMNS. OpenBLAS-0.2.20/test/sblat3.f000066400000000000000000003111011313527062700154570ustar00rootroot00000000000000 PROGRAM SBLAT3 * * Test program for the REAL Level 3 Blas. 
* * The program must be driven by a short data file. The first 14 records * of the file are read using list-directed input, the last 6 records * are read using the format ( A6, L2 ). An annotated example of a data * file can be obtained by deleting the first 3 characters from the * following 20 lines: * 'SBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE * 6 UNIT NUMBER OF SUMMARY FILE * 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 16.0 THRESHOLD VALUE OF TEST RATIO * 6 NUMBER OF VALUES OF N * 0 1 2 3 5 9 VALUES OF N * 3 NUMBER OF VALUES OF ALPHA * 0.0 1.0 0.7 VALUES OF ALPHA * 3 NUMBER OF VALUES OF BETA * 0.0 1.0 1.3 VALUES OF BETA * SGEMM T PUT F FOR NO TEST. SAME COLUMNS. * SSYMM T PUT F FOR NO TEST. SAME COLUMNS. * STRMM T PUT F FOR NO TEST. SAME COLUMNS. * STRSM T PUT F FOR NO TEST. SAME COLUMNS. * SSYRK T PUT F FOR NO TEST. SAME COLUMNS. * SSYR2K T PUT F FOR NO TEST. SAME COLUMNS. * * See: * * Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. * A Set of Level 3 Basic Linear Algebra Subprograms. * * Technical Memorandum No.88 (Revision 1), Mathematics and * Computer Science Division, Argonne National Laboratory, 9700 * South Cass Avenue, Argonne, Illinois 60439, US. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 6 ) REAL ZERO, HALF, ONE PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. REAL EPS, ERR, THRESH INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, $ TSTERR CHARACTER*1 TRANSA, TRANSB CHARACTER*6 SNAMET CHARACTER*32 SNAPS, SUMMRY * .. Local Arrays .. REAL AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), $ ALF( NALMAX ), AS( NMAX*NMAX ), $ BB( NMAX*NMAX ), BET( NBEMAX ), $ BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ G( NMAX ), W( 2*NMAX ) INTEGER IDIM( NIDMAX ) LOGICAL LTEST( NSUBS ) CHARACTER*6 SNAMES( NSUBS ) * .. External Functions .. REAL SDIFF LOGICAL LSE EXTERNAL SDIFF, LSE * .. External Subroutines .. EXTERNAL SCHK1, SCHK2, SCHK3, SCHK4, SCHK5, SCHKE, SMMCH * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR COMMON /SRNAMC/SRNAMT * .. Data statements .. DATA SNAMES/'SGEMM ', 'SSYMM ', 'STRMM ', 'STRSM ', $ 'SSYRK ', 'SSYR2K'/ * .. Executable Statements .. * * Read name and unit number for summary output file and open file. * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. * READ( NIN, FMT = * )SNAPS READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI REWI = REWI.AND.TRACE * Read the flag that directs stopping on any failure. 
READ( NIN, FMT = * )SFATAL * Read the flag that indicates whether error exits are to be tested. READ( NIN, FMT = * )TSTERR * Read the threshold value of the test ratio READ( NIN, FMT = * )THRESH * * Read and check the parameter values for the tests. * * Values of N READ( NIN, FMT = * )NIDIM IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN WRITE( NOUT, FMT = 9997 )'N', NIDMAX GO TO 220 END IF READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) DO 10 I = 1, NIDIM IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN WRITE( NOUT, FMT = 9996 )NMAX GO TO 220 END IF 10 CONTINUE * Values of ALPHA READ( NIN, FMT = * )NALF IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX GO TO 220 END IF READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) * Values of BETA READ( NIN, FMT = * )NBET IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX GO TO 220 END IF READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) * * Report values of parameters. * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) IF( .NOT.TSTERR )THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9984 ) END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )THRESH WRITE( NOUT, FMT = * ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 20 I = 1, NSUBS LTEST( I ) = .FALSE. 20 CONTINUE 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT DO 40 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET STOP 50 LTEST( I ) = LTESTT GO TO 30 * 60 CONTINUE CLOSE ( NIN ) * * Compute EPS (the machine precision). * EPS = ONE 70 CONTINUE IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) $ GO TO 80 EPS = HALF*EPS GO TO 70 80 CONTINUE EPS = EPS + EPS WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of SMMCH using exact data. * N = MIN( 32, NMAX ) DO 100 J = 1, N DO 90 I = 1, N AB( I, J ) = MAX( I - J + 1, 0 ) 90 CONTINUE AB( J, NMAX + 1 ) = J AB( 1, NMAX + J ) = J C( J, 1 ) = ZERO 100 CONTINUE DO 110 J = 1, N CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 110 CONTINUE * CC holds the exact result. On exit from SMMCH CT holds * the result computed by SMMCH. TRANSA = 'N' TRANSB = 'N' CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'T' CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 AB( 1, NMAX + J ) = N - J + 1 120 CONTINUE DO 130 J = 1, N CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - $ ( ( J + 1 )*J*( J - 1 ) )/3 130 CONTINUE TRANSA = 'T' TRANSB = 'N' CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'T' CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. 
) SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF * * Test each subroutine in turn. * DO 200 ISNUM = 1, NSUBS WRITE( NOUT, FMT = * ) IF( .NOT.LTEST( ISNUM ) )THEN * Subprogram is not to be tested. WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) ELSE SRNAMT = SNAMES( ISNUM ) * Test error exits. IF( TSTERR )THEN CALL SCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) WRITE( NOUT, FMT = * ) END IF * Test computations. INFOT = 0 OK = .TRUE. FATAL = .FALSE. GO TO ( 140, 150, 160, 160, 170, 180 )ISNUM * Test SGEMM, 01. 140 CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G ) GO TO 190 * Test SSYMM, 02. 150 CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G ) GO TO 190 * Test STRMM, 03, STRSM, 04. 160 CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C ) GO TO 190 * Test SSYRK, 05. 170 CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G ) GO TO 190 * Test SSYR2K, 06. 180 CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) GO TO 190 * 190 IF( FATAL.AND.SFATAL ) $ GO TO 210 END IF 200 CONTINUE WRITE( NOUT, FMT = 9986 ) GO TO 230 * 210 CONTINUE WRITE( NOUT, FMT = 9985 ) GO TO 230 * 220 CONTINUE WRITE( NOUT, FMT = 9991 ) * 230 CONTINUE IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) STOP * 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', $ 'S THAN', F8.2 ) 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', $ 'THAN ', I2 ) 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) 9995 FORMAT( ' TESTS OF THE REAL LEVEL 3 BLAS', //' THE F', $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) 9994 FORMAT( ' FOR N ', 9I6 ) 9993 FORMAT( ' FOR ALPHA ', 7F6.1 ) 9992 FORMAT( ' FOR BETA ', 7F6.1 ) 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', $ /' ******* TESTS ABANDONED *******' ) 9990 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', $ 'ESTS ABANDONED *******' ) 9989 FORMAT( ' ERROR IN SMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', $ 'ATED WRONGLY.', /' SMMCH WAS CALLED WITH TRANSA = ', A1, $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', $ '*******' ) 9988 FORMAT( A6, L2 ) 9987 FORMAT( 1X, A6, ' WAS NOT TESTED' ) 9986 FORMAT( /' END OF TESTS' ) 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) * * End of SBLAT3. * END SUBROUTINE SCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) * * Tests SGEMM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. 
* Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, $ MA, MB, MS, N, NA, NARGS, NB, NC, NS LOGICAL NULL, RESET, SAME, TRANA, TRANB CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB CHARACTER*3 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL SGEMM, SMAKE, SMMCH * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'NTC'/ * .. Executable Statements .. * NARGS = 13 NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 110 IM = 1, NIDIM M = IDIM( IM ) * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICA = 1, 3 TRANSA = ICH( ICA: ICA ) TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' * IF( TRANA )THEN MA = K NA = M ELSE MA = M NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. * CALL SMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICB = 1, 3 TRANSB = ICH( ICB: ICB ) TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' * IF( TRANB )THEN MB = N NB = K ELSE MB = K NB = N END IF * Set LDB to 1 more than minimum value if room. LDB = MB IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 70 LBB = LDB*NB * * Generate the matrix B. * CALL SMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, $ LDB, RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL SMAKE( 'GE', ' ', ' ', M, N, C, NMAX, $ CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * TRANAS = TRANSA TRANBS = TRANSB MS = M NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB, $ BETA, LDC IF( REWI ) $ REWIND NTRA CALL SGEMM( TRANSA, TRANSB, M, N, K, ALPHA, $ AA, LDA, BB, LDB, BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. 
* ISAME( 1 ) = TRANSA.EQ.TRANAS ISAME( 2 ) = TRANSB.EQ.TRANBS ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = KS.EQ.K ISAME( 6 ) = ALS.EQ.ALPHA ISAME( 7 ) = LSE( AS, AA, LAA ) ISAME( 8 ) = LDAS.EQ.LDA ISAME( 9 ) = LSE( BS, BB, LBB ) ISAME( 10 ) = LDBS.EQ.LDB ISAME( 11 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 12 ) = LSE( CS, CC, LCC ) ELSE ISAME( 12 ) = LSERES( 'GE', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 13 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report * and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL SMMCH( TRANSA, TRANSB, M, N, K, $ ALPHA, A, NMAX, B, NMAX, BETA, $ C, NMAX, CT, G, CC, LDC, EPS, $ ERR, FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 120 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K, $ ALPHA, LDA, LDB, BETA, LDC * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',''', A1, ''',', $ 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', ', $ 'C,', I3, ').' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK1. * END SUBROUTINE SCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) * * Tests SSYMM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, $ NARGS, NC, NS LOGICAL LEFT, NULL, RESET, SAME CHARACTER*1 SIDE, SIDES, UPLO, UPLOS CHARACTER*2 ICHS, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL SMAKE, SMMCH, SSYMM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. 
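*     ICHS holds the SIDE options ('L', 'R') and ICHU the UPLO
*     options ('U', 'L') cycled through by the loops below.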
DATA ICHS/'LR'/, ICHU/'UL'/ * .. Executable Statements .. * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 100 IM = 1, NIDIM M = IDIM( IM ) * DO 90 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 90 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 90 LBB = LDB*N * * Generate the matrix B. * CALL SMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, $ ZERO ) * DO 80 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' * IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * * Generate the symmetric matrix A. * CALL SMAKE( 'SY', UPLO, ' ', NA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL SMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, $ LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO MS = M NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE, $ UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC IF( REWI ) $ REWIND NTRA CALL SSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, $ BB, LDB, BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 110 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LSE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LSE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB ISAME( 10 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 11 ) = LSE( CS, CC, LCC ) ELSE ISAME( 11 ) = LSERES( 'GE', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 110 END IF * IF( .NOT.NULL )THEN * * Check the result. * IF( LEFT )THEN CALL SMMCH( 'N', 'N', M, N, M, ALPHA, A, $ NMAX, B, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL SMMCH( 'N', 'N', M, N, N, ALPHA, B, $ NMAX, A, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 120 * 110 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA, $ LDB, BETA, LDC * 120 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', $ ' .' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK2. * END SUBROUTINE SCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, $ B, BB, BS, CT, G, C ) * * Tests STRMM and STRSM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. REAL ZERO, ONE PARAMETER ( ZERO = 0.0, ONE = 1.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CT( NMAX ), G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. REAL ALPHA, ALS, ERR, ERRMAX INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, $ NS LOGICAL LEFT, NULL, RESET, SAME CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, $ UPLOS CHARACTER*2 ICHD, ICHS, ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL SMAKE, SMMCH, STRMM, STRSM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ * .. Executable Statements .. * NARGS = 11 NC = 0 RESET = .TRUE. ERRMAX = ZERO * Set up zero matrix for SMMCH. DO 20 J = 1, NMAX DO 10 I = 1, NMAX C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE * DO 140 IM = 1, NIDIM M = IDIM( IM ) * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 130 LBB = LDB*N NULL = M.LE.0.OR.N.LE.0 * DO 120 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 130 LAA = LDA*NA * DO 110 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * DO 100 ICT = 1, 3 TRANSA = ICHT( ICT: ICT ) * DO 90 ICD = 1, 2 DIAG = ICHD( ICD: ICD ) * DO 80 IA = 1, NALF ALPHA = ALF( IA ) * * Generate the matrix A. * CALL SMAKE( 'TR', UPLO, DIAG, NA, NA, A, $ NMAX, AA, LDA, RESET, ZERO ) * * Generate the matrix B. 
* CALL SMAKE( 'GE', ' ', ' ', M, N, B, NMAX, $ BB, LDB, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO TRANAS = TRANSA DIAGS = DIAG MS = M NS = N ALS = ALPHA DO 30 I = 1, LAA AS( I ) = AA( I ) 30 CONTINUE LDAS = LDA DO 40 I = 1, LBB BS( I ) = BB( I ) 40 CONTINUE LDBS = LDB * * Call the subroutine. * IF( SNAME( 4: 5 ).EQ.'MM' )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB IF( REWI ) $ REWIND NTRA CALL STRMM( SIDE, UPLO, TRANSA, DIAG, M, $ N, ALPHA, AA, LDA, BB, LDB ) ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB IF( REWI ) $ REWIND NTRA CALL STRSM( SIDE, UPLO, TRANSA, DIAG, M, $ N, ALPHA, AA, LDA, BB, LDB ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = TRANAS.EQ.TRANSA ISAME( 4 ) = DIAGS.EQ.DIAG ISAME( 5 ) = MS.EQ.M ISAME( 6 ) = NS.EQ.N ISAME( 7 ) = ALS.EQ.ALPHA ISAME( 8 ) = LSE( AS, AA, LAA ) ISAME( 9 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 10 ) = LSE( BS, BB, LBB ) ELSE ISAME( 10 ) = LSERES( 'GE', ' ', M, N, BS, $ BB, LDB ) END IF ISAME( 11 ) = LDBS.EQ.LDB * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 50 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 50 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN IF( SNAME( 4: 5 ).EQ.'MM' )THEN * * Check the result. * IF( LEFT )THEN CALL SMMCH( TRANSA, 'N', M, N, M, $ ALPHA, A, NMAX, B, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL SMMCH( 'N', TRANSA, M, N, N, $ ALPHA, B, NMAX, A, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN * * Compute approximation to original * matrix. * DO 70 J = 1, N DO 60 I = 1, M C( I, J ) = BB( I + ( J - 1 )* $ LDB ) BB( I + ( J - 1 )*LDB ) = ALPHA* $ B( I, J ) 60 CONTINUE 70 CONTINUE * IF( LEFT )THEN CALL SMMCH( TRANSA, 'N', M, N, M, $ ONE, A, NMAX, C, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) ELSE CALL SMMCH( 'N', TRANSA, M, N, N, $ ONE, C, NMAX, A, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) END IF END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 150 END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * 140 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 160 * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M, $ N, ALPHA, LDA, LDB * 160 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(', 4( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ', B,', I3, ') .' 
) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK3. * END SUBROUTINE SCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) * * Tests SSYRK. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. REAL ALPHA, ALS, BETA, BETS, ERR, ERRMAX INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, $ NARGS, NC, NS LOGICAL NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS CHARACTER*2 ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL SMAKE, SMMCH, SSYRK * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHT/'NTC'/, ICHU/'UL'/ * .. Executable Statements .. * NARGS = 10 NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N NULL = N.LE.0 * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICT = 1, 3 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. * CALL SMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL SMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, $ LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA BETS = BETA DO 20 I = 1, LCC CS( I ) = CC( I ) 20 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, $ TRANS, N, K, ALPHA, LDA, BETA, LDC IF( REWI ) $ REWIND NTRA CALL SSYRK( UPLO, TRANS, N, K, ALPHA, AA, LDA, $ BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. 
* ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LSE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = BETS.EQ.BETA IF( NULL )THEN ISAME( 9 ) = LSE( CS, CC, LCC ) ELSE ISAME( 9 ) = LSERES( 'SY', UPLO, N, N, CS, $ CC, LDC ) END IF ISAME( 10 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 30 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 30 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * JC = 1 DO 40 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN CALL SMMCH( 'T', 'N', LJ, 1, K, ALPHA, $ A( 1, JJ ), NMAX, $ A( 1, J ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL SMMCH( 'N', 'T', LJ, 1, K, ALPHA, $ A( JJ, 1 ), NMAX, $ A( J, 1 ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 40 CONTINUE END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 110 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, $ LDA, BETA, LDC * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') .' ) 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK4. * END SUBROUTINE SCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) * * Tests SSYR2K. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. REAL ZERO PARAMETER ( ZERO = 0.0 ) * .. Scalar Arguments .. REAL EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. REAL AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ G( NMAX ), W( 2*NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. 
REAL ALPHA, ALS, BETA, BETS, ERR, ERRMAX INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS LOGICAL NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS CHARACTER*2 ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LSE, LSERES EXTERNAL LSE, LSERES * .. External Subroutines .. EXTERNAL SMAKE, SMMCH, SSYR2K * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHT/'NTC'/, ICHU/'UL'/ * .. Executable Statements .. * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = ZERO * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 130 LCC = LDC*N NULL = N.LE.0 * DO 120 IK = 1, NIDIM K = IDIM( IK ) * DO 110 ICT = 1, 3 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 110 LAA = LDA*NA * * Generate the matrix A. * IF( TRAN )THEN CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, $ LDA, RESET, ZERO ) ELSE CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, $ RESET, ZERO ) END IF * * Generate the matrix B. * LDB = LDA LBB = LAA IF( TRAN )THEN CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), $ 2*NMAX, BB, LDB, RESET, ZERO ) ELSE CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), $ NMAX, BB, LDB, RESET, ZERO ) END IF * DO 100 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 90 IA = 1, NALF ALPHA = ALF( IA ) * DO 80 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL SMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, $ LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BETS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC IF( REWI ) $ REWIND NTRA CALL SSYR2K( UPLO, TRANS, N, K, ALPHA, AA, LDA, $ BB, LDB, BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LSE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LSE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB ISAME( 10 ) = BETS.EQ.BETA IF( NULL )THEN ISAME( 11 ) = LSE( CS, CC, LCC ) ELSE ISAME( 11 ) = LSERES( 'SY', UPLO, N, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. 
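*                       For column J only the LJ stored elements of
*                       the selected triangle, starting at row JJ,
*                       are checked.  That column of the SSYR2K
*                       result  alpha*A*B' + alpha*B*A' + beta*C  is
*                       reconstructed as a single product of length
*                       2*K by packing the J-th row (or column, in
*                       the transposed case) of B followed by that
*                       of A into the work vector W, so SMMCH can
*                       verify it in one call.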
* JJAB = 1 JC = 1 DO 70 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN DO 50 I = 1, K W( I ) = AB( ( J - 1 )*2*NMAX + K + $ I ) W( K + I ) = AB( ( J - 1 )*2*NMAX + $ I ) 50 CONTINUE CALL SMMCH( 'T', 'N', LJ, 1, 2*K, $ ALPHA, AB( JJAB ), 2*NMAX, $ W, 2*NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE DO 60 I = 1, K W( I ) = AB( ( K + I - 1 )*NMAX + $ J ) W( K + I ) = AB( ( I - 1 )*NMAX + $ J ) 60 CONTINUE CALL SMMCH( 'N', 'N', LJ, 1, 2*K, $ ALPHA, AB( JJ ), NMAX, W, $ 2*NMAX, BETA, C( JJ, J ), $ NMAX, CT, G, CC( JC ), LDC, $ EPS, ERR, FATAL, NOUT, $ .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 IF( TRAN ) $ JJAB = JJAB + 2*NMAX END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 140 70 CONTINUE END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 160 * 140 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, $ LDA, LDB, BETA, LDC * 160 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', $ ' .' ) 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of SCHK5. * END SUBROUTINE SCHKE( ISNUM, SRNAMT, NOUT ) * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. * ALPHA, BETA, A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Local Scalars .. REAL ALPHA, BETA * .. Local Arrays .. REAL A( 2, 1 ), B( 2, 1 ), C( 2, 1 ) * .. External Subroutines .. EXTERNAL CHKXER, SGEMM, SSYMM, SSYR2K, SSYRK, STRMM, $ STRSM * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Executable Statements .. * OK is set to .FALSE. by the special version of XERBLA or by CHKXER * if anything is wrong. OK = .TRUE. * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. 
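*     Each group of tests below sets INFOT to the position of the
*     argument that is deliberately invalid and then calls the
*     routine.  The special version of XERBLA records that it was
*     called (LERR) and checks that it received this argument number
*     and the expected routine name; CHKXER then reports any call
*     for which XERBLA was not invoked or disagreed.  The computed
*     GO TO dispatches to the block for SGEMM, SSYMM, STRMM, STRSM,
*     SSYRK or SSYR2K according to ISNUM.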
GO TO ( 10, 20, 30, 40, 50, 60 )ISNUM 10 INFOT = 1 CALL SGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 1 CALL SGEMM( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL SGEMM( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL SGEMM( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL SGEMM( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL SGEMM( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL SGEMM( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL SGEMM( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL SGEMM( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL SGEMM( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL SGEMM( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL SGEMM( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL SGEMM( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL SGEMM( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL SGEMM( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL SGEMM( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL SGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL SGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL SGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL SGEMM( 'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL SGEMM( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL SGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL SGEMM( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL SGEMM( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL SGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL SGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL SGEMM( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL SGEMM( 'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 70 20 INFOT = 1 CALL SSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL SSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, 
C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL SSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL SSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL SSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL SSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL SSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL SSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL SSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL SSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL SSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL SSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL SSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL SSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL SSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL SSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 70 30 INFOT = 1 CALL STRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL STRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL STRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL STRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL STRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL STRMM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL STRMM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL STRMM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL STRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL STRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL STRMM( 'R', 'L', 'N', 'N', 
-1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL STRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL STRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL STRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL STRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL STRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL STRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL STRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL STRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL STRMM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL STRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL STRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL STRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL STRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL STRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL STRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL STRMM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL STRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL STRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL STRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL STRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL STRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL STRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL STRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL STRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL STRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 70 40 INFOT = 1 CALL STRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL STRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL STRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL STRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL STRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL STRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, 
B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL STRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL STRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL STRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL STRSM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL STRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL STRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL STRSM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL STRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL STRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL STRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL STRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL STRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL STRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL STRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL STRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL STRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL STRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL STRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL STRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL STRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL STRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL STRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL STRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL STRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL STRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL STRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL STRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL STRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL STRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL STRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 70 50 INFOT = 1 CALL SSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( 
SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL SSYRK( 'U', '/', 0, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL SSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL SSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL SSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL SSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL SSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL SSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL SSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL SSYRK( 'L', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL SSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL SSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL SSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL SSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL SSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL SSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL SSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL SSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 70 60 INFOT = 1 CALL SSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL SSYR2K( 'U', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL SSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL SSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL SSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL SSYR2K( 'L', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL SSYR2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL SSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL SSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL SSYR2K( 'L', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL SSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL SSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL SSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL SSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 
CALL SSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL SSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL SSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL SSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL SSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL SSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL SSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL SSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) * 70 IF( OK )THEN WRITE( NOUT, FMT = 9999 )SRNAMT ELSE WRITE( NOUT, FMT = 9998 )SRNAMT END IF RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', $ '**' ) * * End of SCHKE. * END SUBROUTINE SMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, $ TRANSL ) * * Generates values for an M by N matrix A. * Stores the values in the array AA in the data structure required * by the routine, with unwanted elements set to rogue value. * * TYPE is 'GE', 'SY' or 'TR'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. REAL ZERO, ONE PARAMETER ( ZERO = 0.0, ONE = 1.0 ) REAL ROGUE PARAMETER ( ROGUE = -1.0E10 ) * .. Scalar Arguments .. REAL TRANSL INTEGER LDA, M, N, NMAX LOGICAL RESET CHARACTER*1 DIAG, UPLO CHARACTER*2 TYPE * .. Array Arguments .. REAL A( NMAX, * ), AA( * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER * .. External Functions .. REAL SBEG EXTERNAL SBEG * .. Executable Statements .. GEN = TYPE.EQ.'GE' SYM = TYPE.EQ.'SY' TRI = TYPE.EQ.'TR' UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' UNIT = TRI.AND.DIAG.EQ.'U' * * Generate data in array A. * DO 20 J = 1, N DO 10 I = 1, M IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) $ THEN A( I, J ) = SBEG( RESET ) + TRANSL IF( I.NE.J )THEN * Set some elements to zero IF( N.GT.3.AND.J.EQ.N/2 ) $ A( I, J ) = ZERO IF( SYM )THEN A( J, I ) = A( I, J ) ELSE IF( TRI )THEN A( J, I ) = ZERO END IF END IF END IF 10 CONTINUE IF( TRI ) $ A( J, J ) = A( J, J ) + ONE IF( UNIT ) $ A( J, J ) = ONE 20 CONTINUE * * Store elements in array AS in data structure required by routine. * IF( TYPE.EQ.'GE' )THEN DO 50 J = 1, N DO 30 I = 1, M AA( I + ( J - 1 )*LDA ) = A( I, J ) 30 CONTINUE DO 40 I = M + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 40 CONTINUE 50 CONTINUE ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN DO 90 J = 1, N IF( UPPER )THEN IBEG = 1 IF( UNIT )THEN IEND = J - 1 ELSE IEND = J END IF ELSE IF( UNIT )THEN IBEG = J + 1 ELSE IBEG = J END IF IEND = N END IF DO 60 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 60 CONTINUE DO 70 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I, J ) 70 CONTINUE DO 80 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 80 CONTINUE 90 CONTINUE END IF RETURN * * End of SMAKE. 
* END SUBROUTINE SMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, $ NOUT, MV ) * * Checks the results of the computational tests. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. REAL ZERO, ONE PARAMETER ( ZERO = 0.0, ONE = 1.0 ) * .. Scalar Arguments .. REAL ALPHA, BETA, EPS, ERR INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT LOGICAL FATAL, MV CHARACTER*1 TRANSA, TRANSB * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ), C( LDC, * ), $ CC( LDCC, * ), CT( * ), G( * ) * .. Local Scalars .. REAL ERRI INTEGER I, J, K LOGICAL TRANA, TRANB * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT * .. Executable Statements .. TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' * * Compute expected result, one column at a time, in CT using data * in A, B and C. * Compute gauges in G. * DO 120 J = 1, N * DO 10 I = 1, M CT( I ) = ZERO G( I ) = ZERO 10 CONTINUE IF( .NOT.TRANA.AND..NOT.TRANB )THEN DO 30 K = 1, KK DO 20 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( K, J ) G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( K, J ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRANA.AND..NOT.TRANB )THEN DO 50 K = 1, KK DO 40 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( K, J ) G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( K, J ) ) 40 CONTINUE 50 CONTINUE ELSE IF( .NOT.TRANA.AND.TRANB )THEN DO 70 K = 1, KK DO 60 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( J, K ) G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( J, K ) ) 60 CONTINUE 70 CONTINUE ELSE IF( TRANA.AND.TRANB )THEN DO 90 K = 1, KK DO 80 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( J, K ) G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( J, K ) ) 80 CONTINUE 90 CONTINUE END IF DO 100 I = 1, M CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) G( I ) = ABS( ALPHA )*G( I ) + ABS( BETA )*ABS( C( I, J ) ) 100 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO DO 110 I = 1, M ERRI = ABS( CT( I ) - CC( I, J ) )/EPS IF( G( I ).NE.ZERO ) $ ERRI = ERRI/G( I ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.ONE ) $ GO TO 130 110 CONTINUE * 120 CONTINUE * * If the loop completes, all results are at least half accurate. GO TO 150 * * Report fatal error. * 130 FATAL = .TRUE. WRITE( NOUT, FMT = 9999 ) DO 140 I = 1, M IF( MV )THEN WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) ELSE WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) END IF 140 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9997 )J * 150 CONTINUE RETURN * 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', $ 'TED RESULT' ) 9998 FORMAT( 1X, I7, 2G18.6 ) 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) * * End of SMMCH. * END LOGICAL FUNCTION LSE( RI, RJ, LR ) * * Tests if two arrays are identical. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LR * .. Array Arguments .. REAL RI( * ), RJ( * ) * .. Local Scalars .. INTEGER I * .. Executable Statements .. DO 10 I = 1, LR IF( RI( I ).NE.RJ( I ) ) $ GO TO 20 10 CONTINUE LSE = .TRUE. GO TO 30 20 CONTINUE LSE = .FALSE. 30 RETURN * * End of LSE. 
* END LOGICAL FUNCTION LSERES( TYPE, UPLO, M, N, AA, AS, LDA ) * * Tests if selected elements in two arrays are equal. * * TYPE is 'GE' or 'SY'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LDA, M, N CHARACTER*1 UPLO CHARACTER*2 TYPE * .. Array Arguments .. REAL AA( LDA, * ), AS( LDA, * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL UPPER * .. Executable Statements .. UPPER = UPLO.EQ.'U' IF( TYPE.EQ.'GE' )THEN DO 20 J = 1, N DO 10 I = M + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 10 CONTINUE 20 CONTINUE ELSE IF( TYPE.EQ.'SY' )THEN DO 50 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 30 I = 1, IBEG - 1 IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 30 CONTINUE DO 40 I = IEND + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 40 CONTINUE 50 CONTINUE END IF * 60 CONTINUE LSERES = .TRUE. GO TO 80 70 CONTINUE LSERES = .FALSE. 80 RETURN * * End of LSERES. * END REAL FUNCTION SBEG( RESET ) * * Generates random numbers uniformly distributed between -0.5 and 0.5. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. LOGICAL RESET * .. Local Scalars .. INTEGER I, IC, MI * .. Save statement .. SAVE I, IC, MI * .. Executable Statements .. IF( RESET )THEN * Initialize local variables. MI = 891 I = 7 IC = 0 RESET = .FALSE. END IF * * The sequence of values of I is bounded between 1 and 999. * If initial I = 1,2,3,6,7 or 9, the period will be 50. * If initial I = 4 or 8, the period will be 25. * If initial I = 5, the period will be 10. * IC is used to break up the period by skipping 1 value of I in 6. * IC = IC + 1 10 I = I*MI I = I - 1000*( I/1000 ) IF( IC.GE.5 )THEN IC = 0 GO TO 10 END IF SBEG = ( I - 500 )/1001.0 RETURN * * End of SBEG. * END REAL FUNCTION SDIFF( X, Y ) * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. REAL X, Y * .. Executable Statements .. SDIFF = X - Y RETURN * * End of SDIFF. * END SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) * * Tests whether XERBLA has detected an error when it should. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER INFOT, NOUT LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Executable Statements .. IF( .NOT.LERR )THEN WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT OK = .FALSE. END IF LERR = .FALSE. RETURN * 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', $ 'ETECTED BY ', A6, ' *****' ) * * End of CHKXER. * END SUBROUTINE XERBLA( SRNAME, INFO ) * * This is a special version of XERBLA to be used only as part of * the test program for testing error exits from the Level 3 BLAS * routines. 
* * XERBLA is an error handler for the Level 3 BLAS routines. * * It is called by the Level 3 BLAS routines if an input parameter is * invalid. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER INFO CHARACTER*6 SRNAME * .. Scalars in Common .. INTEGER INFOT, NOUT LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUT, OK, LERR COMMON /SRNAMC/SRNAMT * .. Executable Statements .. LERR = .TRUE. IF( INFO.NE.INFOT )THEN IF( INFOT.NE.0 )THEN WRITE( NOUT, FMT = 9999 )INFO, INFOT ELSE WRITE( NOUT, FMT = 9997 )INFO END IF OK = .FALSE. END IF IF( SRNAME.NE.SRNAMT )THEN WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT OK = .FALSE. END IF RETURN * 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', $ ' OF ', I2, ' *******' ) 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', $ 'AD OF ', A6, ' *******' ) 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, $ ' *******' ) * * End of XERBLA * END OpenBLAS-0.2.20/test/zblat1.f000066400000000000000000000747241313527062700155050ustar00rootroot00000000000000 PROGRAM ZBLAT1 * Test program for the COMPLEX*16 Level 1 BLAS. * Based upon the original BLAS test routine together with: * F06GAF Example Program Text * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. DOUBLE PRECISION SFAC INTEGER IC * .. External Subroutines .. EXTERNAL CHECK1, CHECK2, HEADER * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA SFAC/9.765625D-4/ * .. Executable Statements .. WRITE (NOUT,99999) DO 20 IC = 1, 10 ICASE = IC CALL HEADER * * Initialize PASS, INCX, INCY, and MODE for a new case. * The value 9999 for INCX, INCY or MODE will appear in the * detailed output, if any, for cases that do not involve * these parameters. * PASS = .TRUE. INCX = 9999 INCY = 9999 MODE = 9999 IF (ICASE.LE.5) THEN CALL CHECK2(SFAC) ELSE IF (ICASE.GE.6) THEN CALL CHECK1(SFAC) END IF * -- Print IF (PASS) WRITE (NOUT,99998) 20 CONTINUE STOP * 99999 FORMAT (' Complex BLAS Test Program Results',/1X) 99998 FORMAT (' ----- PASS -----') END SUBROUTINE HEADER * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Arrays .. CHARACTER*6 L(10) * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA L(1)/'ZDOTC '/ DATA L(2)/'ZDOTU '/ DATA L(3)/'ZAXPY '/ DATA L(4)/'ZCOPY '/ DATA L(5)/'ZSWAP '/ DATA L(6)/'DZNRM2'/ DATA L(7)/'DZASUM'/ DATA L(8)/'ZSCAL '/ DATA L(9)/'ZDSCAL'/ DATA L(10)/'IZAMAX'/ * .. Executable Statements .. WRITE (NOUT,99999) ICASE, L(ICASE) RETURN * 99999 FORMAT (/' Test of subprogram number',I3,12X,A6) END SUBROUTINE CHECK1(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. DOUBLE PRECISION SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. COMPLEX*16 CA DOUBLE PRECISION SA INTEGER I, J, LEN, NP1 * .. Local Arrays .. COMPLEX*16 CTRUE5(8,5,2), CTRUE6(8,5,2), CV(8,5,2), CX(8), + MWPCS(5), MWPCT(5) DOUBLE PRECISION STRUE2(5), STRUE4(5) INTEGER ITRUE3(5) * .. External Functions .. 
DOUBLE PRECISION DZASUM, DZNRM2 INTEGER IZAMAX EXTERNAL DZASUM, DZNRM2, IZAMAX * .. External Subroutines .. EXTERNAL ZSCAL, ZDSCAL, CTEST, ITEST1, STEST1 * .. Intrinsic Functions .. INTRINSIC MAX * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. DATA SA, CA/0.3D0, (0.4D0,-0.7D0)/ DATA ((CV(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), (0.3D0,-0.4D0), (3.0D0,4.0D0), + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + (0.1D0,-0.3D0), (0.5D0,-0.1D0), (5.0D0,6.0D0), + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + (5.0D0,6.0D0), (5.0D0,6.0D0), (0.1D0,0.1D0), + (-0.6D0,0.1D0), (0.1D0,-0.3D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (0.3D0,0.1D0), (0.1D0,0.4D0), + (0.4D0,0.1D0), (0.1D0,0.2D0), (2.0D0,3.0D0), + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + (4.0D0,5.0D0), (0.3D0,-0.4D0), (6.0D0,7.0D0), + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + (0.1D0,-0.3D0), (8.0D0,9.0D0), (0.5D0,-0.1D0), + (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + (2.0D0,5.0D0), (2.0D0,5.0D0), (0.1D0,0.1D0), + (3.0D0,6.0D0), (-0.6D0,0.1D0), (4.0D0,7.0D0), + (0.1D0,-0.3D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + (7.0D0,2.0D0), (0.3D0,0.1D0), (5.0D0,8.0D0), + (0.1D0,0.4D0), (6.0D0,9.0D0), (0.4D0,0.1D0), + (8.0D0,3.0D0), (0.1D0,0.2D0), (9.0D0,4.0D0)/ DATA STRUE2/0.0D0, 0.5D0, 0.6D0, 0.7D0, 0.7D0/ DATA STRUE4/0.0D0, 0.7D0, 1.0D0, 1.3D0, 1.7D0/ DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), (-0.16D0,-0.37D0), (3.0D0,4.0D0), + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + (-0.17D0,-0.19D0), (0.13D0,-0.39D0), + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + (0.11D0,-0.03D0), (-0.17D0,0.46D0), + (-0.17D0,-0.19D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + (0.19D0,-0.17D0), (0.32D0,0.09D0), + (0.23D0,-0.24D0), (0.18D0,0.01D0), + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0), + (2.0D0,3.0D0)/ DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + (4.0D0,5.0D0), (-0.16D0,-0.37D0), (6.0D0,7.0D0), + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + (-0.17D0,-0.19D0), (8.0D0,9.0D0), + (0.13D0,-0.39D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + (0.11D0,-0.03D0), (3.0D0,6.0D0), + (-0.17D0,0.46D0), (4.0D0,7.0D0), + (-0.17D0,-0.19D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + (7.0D0,2.0D0), (0.19D0,-0.17D0), (5.0D0,8.0D0), + (0.32D0,0.09D0), (6.0D0,9.0D0), + (0.23D0,-0.24D0), (8.0D0,3.0D0), + (0.18D0,0.01D0), (9.0D0,4.0D0)/ DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), (0.09D0,-0.12D0), (3.0D0,4.0D0), + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + (0.03D0,-0.09D0), (0.15D0,-0.03D0), + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), 
+ (0.03D0,0.03D0), (-0.18D0,0.03D0), + (0.03D0,-0.09D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + (0.09D0,0.03D0), (0.03D0,0.12D0), + (0.12D0,0.03D0), (0.03D0,0.06D0), (2.0D0,3.0D0), + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ DATA ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + (4.0D0,5.0D0), (0.09D0,-0.12D0), (6.0D0,7.0D0), + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + (0.03D0,-0.09D0), (8.0D0,9.0D0), + (0.15D0,-0.03D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + (0.03D0,0.03D0), (3.0D0,6.0D0), + (-0.18D0,0.03D0), (4.0D0,7.0D0), + (0.03D0,-0.09D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + (7.0D0,2.0D0), (0.09D0,0.03D0), (5.0D0,8.0D0), + (0.03D0,0.12D0), (6.0D0,9.0D0), (0.12D0,0.03D0), + (8.0D0,3.0D0), (0.03D0,0.06D0), (9.0D0,4.0D0)/ DATA ITRUE3/0, 1, 2, 2, 2/ * .. Executable Statements .. DO 60 INCX = 1, 2 DO 40 NP1 = 1, 5 N = NP1 - 1 LEN = 2*MAX(N,1) * .. Set vector arguments .. DO 20 I = 1, LEN CX(I) = CV(I,NP1,INCX) 20 CONTINUE IF (ICASE.EQ.6) THEN * .. DZNRM2 .. CALL STEST1(DZNRM2(N,CX,INCX),STRUE2(NP1),STRUE2(NP1), + SFAC) ELSE IF (ICASE.EQ.7) THEN * .. DZASUM .. CALL STEST1(DZASUM(N,CX,INCX),STRUE4(NP1),STRUE4(NP1), + SFAC) ELSE IF (ICASE.EQ.8) THEN * .. ZSCAL .. CALL ZSCAL(N,CA,CX,INCX) CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), + SFAC) ELSE IF (ICASE.EQ.9) THEN * .. ZDSCAL .. CALL ZDSCAL(N,SA,CX,INCX) CALL CTEST(LEN,CX,CTRUE6(1,NP1,INCX),CTRUE6(1,NP1,INCX), + SFAC) ELSE IF (ICASE.EQ.10) THEN * .. IZAMAX .. CALL ITEST1(IZAMAX(N,CX,INCX),ITRUE3(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' STOP END IF * 40 CONTINUE 60 CONTINUE * INCX = 1 IF (ICASE.EQ.8) THEN * ZSCAL * Add a test for alpha equal to zero. CA = (0.0D0,0.0D0) DO 80 I = 1, 5 MWPCT(I) = (0.0D0,0.0D0) MWPCS(I) = (1.0D0,1.0D0) 80 CONTINUE CALL ZSCAL(5,CA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) ELSE IF (ICASE.EQ.9) THEN * ZDSCAL * Add a test for alpha equal to zero. SA = 0.0D0 DO 100 I = 1, 5 MWPCT(I) = (0.0D0,0.0D0) MWPCS(I) = (1.0D0,1.0D0) 100 CONTINUE CALL ZDSCAL(5,SA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) * Add a test for alpha equal to one. SA = 1.0D0 DO 120 I = 1, 5 MWPCT(I) = CX(I) MWPCS(I) = CX(I) 120 CONTINUE CALL ZDSCAL(5,SA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) * Add a test for alpha equal to minus one. SA = -1.0D0 DO 140 I = 1, 5 MWPCT(I) = -CX(I) MWPCS(I) = -CX(I) 140 CONTINUE CALL ZDSCAL(5,SA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) END IF RETURN END SUBROUTINE CHECK2(SFAC) * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. DOUBLE PRECISION SFAC * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. COMPLEX*16 CA INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY * .. Local Arrays .. COMPLEX*16 CDOT(1), CSIZE1(4), CSIZE2(7,2), CSIZE3(14), + CT10X(7,4,4), CT10Y(7,4,4), CT6(4,4), CT7(4,4), + CT8(7,4,4), CX(7), CX1(7), CY(7), CY1(7) INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) * .. External Functions .. COMPLEX*16 ZDOTC, ZDOTU EXTERNAL ZDOTC, ZDOTU * .. External Subroutines .. EXTERNAL ZAXPY, ZCOPY, ZSWAP, CTEST * .. Intrinsic Functions .. INTRINSIC ABS, MIN * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. 
DATA CA/(0.4D0,-0.7D0)/ DATA INCXS/1, 2, -2, -1/ DATA INCYS/1, -2, 1, -2/ DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ DATA NS/0, 1, 2, 4/ DATA CX1/(0.7D0,-0.8D0), (-0.4D0,-0.7D0), + (-0.1D0,-0.9D0), (0.2D0,-0.8D0), + (-0.9D0,-0.4D0), (0.1D0,0.4D0), (-0.6D0,0.6D0)/ DATA CY1/(0.6D0,-0.6D0), (-0.9D0,0.5D0), + (0.7D0,-0.6D0), (0.1D0,-0.5D0), (-0.1D0,-0.2D0), + (-0.5D0,-0.3D0), (0.8D0,-0.7D0)/ DATA ((CT8(I,J,1),I=1,7),J=1,4)/(0.6D0,-0.6D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.32D0,-1.41D0), + (-1.55D0,0.5D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.32D0,-1.41D0), (-1.55D0,0.5D0), + (0.03D0,-0.89D0), (-0.38D0,-0.96D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ DATA ((CT8(I,J,2),I=1,7),J=1,4)/(0.6D0,-0.6D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (-0.07D0,-0.89D0), + (-0.9D0,0.5D0), (0.42D0,-1.41D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.78D0,0.06D0), (-0.9D0,0.5D0), + (0.06D0,-0.13D0), (0.1D0,-0.5D0), + (-0.77D0,-0.49D0), (-0.5D0,-0.3D0), + (0.52D0,-1.51D0)/ DATA ((CT8(I,J,3),I=1,7),J=1,4)/(0.6D0,-0.6D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (-0.07D0,-0.89D0), + (-1.18D0,-0.31D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.78D0,0.06D0), (-1.54D0,0.97D0), + (0.03D0,-0.89D0), (-0.18D0,-1.31D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ DATA ((CT8(I,J,4),I=1,7),J=1,4)/(0.6D0,-0.6D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.32D0,-1.41D0), (-0.9D0,0.5D0), + (0.05D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.32D0,-1.41D0), + (-0.9D0,0.5D0), (0.05D0,-0.6D0), (0.1D0,-0.5D0), + (-0.77D0,-0.49D0), (-0.5D0,-0.3D0), + (0.32D0,-1.16D0)/ DATA CT7/(0.0D0,0.0D0), (-0.06D0,-0.90D0), + (0.65D0,-0.47D0), (-0.34D0,-1.22D0), + (0.0D0,0.0D0), (-0.06D0,-0.90D0), + (-0.59D0,-1.46D0), (-1.04D0,-0.04D0), + (0.0D0,0.0D0), (-0.06D0,-0.90D0), + (-0.83D0,0.59D0), (0.07D0,-0.37D0), + (0.0D0,0.0D0), (-0.06D0,-0.90D0), + (-0.76D0,-1.15D0), (-1.33D0,-1.82D0)/ DATA CT6/(0.0D0,0.0D0), (0.90D0,0.06D0), + (0.91D0,-0.77D0), (1.80D0,-0.10D0), + (0.0D0,0.0D0), (0.90D0,0.06D0), (1.45D0,0.74D0), + (0.20D0,0.90D0), (0.0D0,0.0D0), (0.90D0,0.06D0), + (-0.55D0,0.23D0), (0.83D0,-0.39D0), + (0.0D0,0.0D0), (0.90D0,0.06D0), (1.04D0,0.79D0), + (1.95D0,1.22D0)/ DATA ((CT10X(I,J,1),I=1,7),J=1,4)/(0.7D0,-0.8D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.6D0,-0.6D0), (-0.9D0,0.5D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.6D0,-0.6D0), + (-0.9D0,0.5D0), (0.7D0,-0.6D0), (0.1D0,-0.5D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ DATA ((CT10X(I,J,2),I=1,7),J=1,4)/(0.7D0,-0.8D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + 
(0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.7D0,-0.6D0), (-0.4D0,-0.7D0), + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.8D0,-0.7D0), + (-0.4D0,-0.7D0), (-0.1D0,-0.2D0), + (0.2D0,-0.8D0), (0.7D0,-0.6D0), (0.1D0,0.4D0), + (0.6D0,-0.6D0)/ DATA ((CT10X(I,J,3),I=1,7),J=1,4)/(0.7D0,-0.8D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (-0.9D0,0.5D0), (-0.4D0,-0.7D0), + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.1D0,-0.5D0), + (-0.4D0,-0.7D0), (0.7D0,-0.6D0), (0.2D0,-0.8D0), + (-0.9D0,0.5D0), (0.1D0,0.4D0), (0.6D0,-0.6D0)/ DATA ((CT10X(I,J,4),I=1,7),J=1,4)/(0.7D0,-0.8D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.6D0,-0.6D0), (0.7D0,-0.6D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.6D0,-0.6D0), + (0.7D0,-0.6D0), (-0.1D0,-0.2D0), (0.8D0,-0.7D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ DATA ((CT10Y(I,J,1),I=1,7),J=1,4)/(0.6D0,-0.6D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.7D0,-0.8D0), (-0.4D0,-0.7D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.7D0,-0.8D0), + (-0.4D0,-0.7D0), (-0.1D0,-0.9D0), + (0.2D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0)/ DATA ((CT10Y(I,J,2),I=1,7),J=1,4)/(0.6D0,-0.6D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (-0.1D0,-0.9D0), (-0.9D0,0.5D0), + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (-0.6D0,0.6D0), + (-0.9D0,0.5D0), (-0.9D0,-0.4D0), (0.1D0,-0.5D0), + (-0.1D0,-0.9D0), (-0.5D0,-0.3D0), + (0.7D0,-0.8D0)/ DATA ((CT10Y(I,J,3),I=1,7),J=1,4)/(0.6D0,-0.6D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (-0.1D0,-0.9D0), (0.7D0,-0.8D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (-0.6D0,0.6D0), + (-0.9D0,-0.4D0), (-0.1D0,-0.9D0), + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0)/ DATA ((CT10Y(I,J,4),I=1,7),J=1,4)/(0.6D0,-0.6D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.7D0,-0.8D0), (-0.9D0,0.5D0), + (-0.4D0,-0.7D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.7D0,-0.8D0), + (-0.9D0,0.5D0), (-0.4D0,-0.7D0), (0.1D0,-0.5D0), + (-0.1D0,-0.9D0), (-0.5D0,-0.3D0), + (0.2D0,-0.8D0)/ DATA CSIZE1/(0.0D0,0.0D0), (0.9D0,0.9D0), + (1.63D0,1.73D0), (2.90D0,2.78D0)/ DATA CSIZE3/(0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (1.17D0,1.17D0), + (1.17D0,1.17D0), 
(1.17D0,1.17D0), + (1.17D0,1.17D0), (1.17D0,1.17D0), + (1.17D0,1.17D0), (1.17D0,1.17D0)/ DATA CSIZE2/(0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + (0.0D0,0.0D0), (0.0D0,0.0D0), (1.54D0,1.54D0), + (1.54D0,1.54D0), (1.54D0,1.54D0), + (1.54D0,1.54D0), (1.54D0,1.54D0), + (1.54D0,1.54D0), (1.54D0,1.54D0)/ * .. Executable Statements .. DO 60 KI = 1, 4 INCX = INCXS(KI) INCY = INCYS(KI) MX = ABS(INCX) MY = ABS(INCY) * DO 40 KN = 1, 4 N = NS(KN) KSIZE = MIN(2,KN) LENX = LENS(KN,MX) LENY = LENS(KN,MY) * .. initialize all argument arrays .. DO 20 I = 1, 7 CX(I) = CX1(I) CY(I) = CY1(I) 20 CONTINUE IF (ICASE.EQ.1) THEN * .. ZDOTC .. CDOT(1) = ZDOTC(N,CX,INCX,CY,INCY) CALL CTEST(1,CDOT,CT6(KN,KI),CSIZE1(KN),SFAC) ELSE IF (ICASE.EQ.2) THEN * .. ZDOTU .. CDOT(1) = ZDOTU(N,CX,INCX,CY,INCY) CALL CTEST(1,CDOT,CT7(KN,KI),CSIZE1(KN),SFAC) ELSE IF (ICASE.EQ.3) THEN * .. ZAXPY .. CALL ZAXPY(N,CA,CX,INCX,CY,INCY) CALL CTEST(LENY,CY,CT8(1,KN,KI),CSIZE2(1,KSIZE),SFAC) ELSE IF (ICASE.EQ.4) THEN * .. ZCOPY .. CALL ZCOPY(N,CX,INCX,CY,INCY) CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0) ELSE IF (ICASE.EQ.5) THEN * .. ZSWAP .. CALL ZSWAP(N,CX,INCX,CY,INCY) CALL CTEST(LENX,CX,CT10X(1,KN,KI),CSIZE3,1.0D0) CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' STOP END IF * 40 CONTINUE 60 CONTINUE RETURN END SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) * ********************************* STEST ************************** * * THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO * SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE * NEGLIGIBLE. * * C. L. LAWSON, JPL, 1974 DEC 10 * * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. DOUBLE PRECISION SFAC INTEGER LEN * .. Array Arguments .. DOUBLE PRECISION SCOMP(LEN), SSIZE(LEN), STRUE(LEN) * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. DOUBLE PRECISION SD INTEGER I * .. External Functions .. DOUBLE PRECISION SDIFF EXTERNAL SDIFF * .. Intrinsic Functions .. INTRINSIC ABS * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Executable Statements .. * DO 40 I = 1, LEN SD = SCOMP(I) - STRUE(I) IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0D0) + GO TO 40 * * HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). * IF ( .NOT. PASS) GO TO 20 * PRINT FAIL MESSAGE AND HEADER. PASS = .FALSE. WRITE (NOUT,99999) WRITE (NOUT,99998) 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + STRUE(I), SD, SSIZE(I) 40 CONTINUE RETURN * 99999 FORMAT (' FAIL') 99998 FORMAT (/' CASE N INCX INCY MODE I ', + ' COMP(I) TRUE(I) DIFFERENCE', + ' SIZE(I)',/1X) 99997 FORMAT (1X,I4,I3,3I5,I3,2D36.8,2D12.4) END SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * * THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * * C.L. LAWSON, JPL, 1978 DEC 6 * * .. Scalar Arguments .. DOUBLE PRECISION SCOMP1, SFAC, STRUE1 * .. Array Arguments .. DOUBLE PRECISION SSIZE(*) * .. Local Arrays .. DOUBLE PRECISION SCOMP(1), STRUE(1) * .. External Subroutines .. EXTERNAL STEST * .. Executable Statements .. * SCOMP(1) = SCOMP1 STRUE(1) = STRUE1 CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) * RETURN END DOUBLE PRECISION FUNCTION SDIFF(SA,SB) * ********************************* SDIFF ************************** * COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. 
LAWSON, JPL 1974 FEB 15 * * .. Scalar Arguments .. DOUBLE PRECISION SA, SB * .. Executable Statements .. SDIFF = SA - SB RETURN END SUBROUTINE CTEST(LEN,CCOMP,CTRUE,CSIZE,SFAC) * **************************** CTEST ***************************** * * C.L. LAWSON, JPL, 1978 DEC 6 * * .. Scalar Arguments .. DOUBLE PRECISION SFAC INTEGER LEN * .. Array Arguments .. COMPLEX*16 CCOMP(LEN), CSIZE(LEN), CTRUE(LEN) * .. Local Scalars .. INTEGER I * .. Local Arrays .. DOUBLE PRECISION SCOMP(20), SSIZE(20), STRUE(20) * .. External Subroutines .. EXTERNAL STEST * .. Intrinsic Functions .. INTRINSIC DIMAG, DBLE * .. Executable Statements .. DO 20 I = 1, LEN SCOMP(2*I-1) = DBLE(CCOMP(I)) SCOMP(2*I) = DIMAG(CCOMP(I)) STRUE(2*I-1) = DBLE(CTRUE(I)) STRUE(2*I) = DIMAG(CTRUE(I)) SSIZE(2*I-1) = DBLE(CSIZE(I)) SSIZE(2*I) = DIMAG(CSIZE(I)) 20 CONTINUE * CALL STEST(2*LEN,SCOMP,STRUE,SSIZE,SFAC) RETURN END SUBROUTINE ITEST1(ICOMP,ITRUE) * ********************************* ITEST1 ************************* * * THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR * EQUALITY. * C. L. LAWSON, JPL, 1974 DEC 10 * * .. Parameters .. INTEGER NOUT PARAMETER (NOUT=6) * .. Scalar Arguments .. INTEGER ICOMP, ITRUE * .. Scalars in Common .. INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Scalars .. INTEGER ID * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Executable Statements .. IF (ICOMP.EQ.ITRUE) GO TO 40 * * HERE ICOMP IS NOT EQUAL TO ITRUE. * IF ( .NOT. PASS) GO TO 20 * PRINT FAIL MESSAGE AND HEADER. PASS = .FALSE. WRITE (NOUT,99999) WRITE (NOUT,99998) 20 ID = ICOMP - ITRUE WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID 40 CONTINUE RETURN * 99999 FORMAT (' FAIL') 99998 FORMAT (/' CASE N INCX INCY MODE ', + ' COMP TRUE DIFFERENCE', + /1X) 99997 FORMAT (1X,I4,I3,3I5,2I36,I12) END OpenBLAS-0.2.20/test/zblat2.dat000066400000000000000000000030121313527062700160070ustar00rootroot00000000000000'ZBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE 6 UNIT NUMBER OF SUMMARY FILE 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. F LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 16.0 THRESHOLD VALUE OF TEST RATIO 7 NUMBER OF VALUES OF N 0 1 2 3 7 31 63 VALUES OF N 4 NUMBER OF VALUES OF K 0 1 2 4 VALUES OF K 4 NUMBER OF VALUES OF INCX AND INCY 1 2 -1 -2 VALUES OF INCX AND INCY 3 NUMBER OF VALUES OF ALPHA (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA 3 NUMBER OF VALUES OF BETA (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA ZGEMV T PUT F FOR NO TEST. SAME COLUMNS. ZGBMV T PUT F FOR NO TEST. SAME COLUMNS. ZHEMV T PUT F FOR NO TEST. SAME COLUMNS. ZHBMV T PUT F FOR NO TEST. SAME COLUMNS. ZHPMV T PUT F FOR NO TEST. SAME COLUMNS. ZTRMV T PUT F FOR NO TEST. SAME COLUMNS. ZTBMV T PUT F FOR NO TEST. SAME COLUMNS. ZTPMV T PUT F FOR NO TEST. SAME COLUMNS. ZTRSV T PUT F FOR NO TEST. SAME COLUMNS. ZTBSV T PUT F FOR NO TEST. SAME COLUMNS. ZTPSV T PUT F FOR NO TEST. SAME COLUMNS. ZGERC T PUT F FOR NO TEST. SAME COLUMNS. ZGERU T PUT F FOR NO TEST. SAME COLUMNS. ZHER T PUT F FOR NO TEST. SAME COLUMNS. ZHPR T PUT F FOR NO TEST. SAME COLUMNS. ZHER2 T PUT F FOR NO TEST. SAME COLUMNS. ZHPR2 T PUT F FOR NO TEST. SAME COLUMNS. OpenBLAS-0.2.20/test/zblat2.f000066400000000000000000003425601313527062700155020ustar00rootroot00000000000000 PROGRAM ZBLAT2 * * Test program for the COMPLEX*16 Level 2 Blas. * * The program must be driven by a short data file. 
The first 18 records * of the file are read using list-directed input, the last 17 records * are read using the format ( A6, L2 ). An annotated example of a data * file can be obtained by deleting the first 3 characters from the * following 35 lines: * 'ZBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE * 6 UNIT NUMBER OF SUMMARY FILE * 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 16.0 THRESHOLD VALUE OF TEST RATIO * 6 NUMBER OF VALUES OF N * 0 1 2 3 5 9 VALUES OF N * 4 NUMBER OF VALUES OF K * 0 1 2 4 VALUES OF K * 4 NUMBER OF VALUES OF INCX AND INCY * 1 2 -1 -2 VALUES OF INCX AND INCY * 3 NUMBER OF VALUES OF ALPHA * (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA * 3 NUMBER OF VALUES OF BETA * (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA * ZGEMV T PUT F FOR NO TEST. SAME COLUMNS. * ZGBMV T PUT F FOR NO TEST. SAME COLUMNS. * ZHEMV T PUT F FOR NO TEST. SAME COLUMNS. * ZHBMV T PUT F FOR NO TEST. SAME COLUMNS. * ZHPMV T PUT F FOR NO TEST. SAME COLUMNS. * ZTRMV T PUT F FOR NO TEST. SAME COLUMNS. * ZTBMV T PUT F FOR NO TEST. SAME COLUMNS. * ZTPMV T PUT F FOR NO TEST. SAME COLUMNS. * ZTRSV T PUT F FOR NO TEST. SAME COLUMNS. * ZTBSV T PUT F FOR NO TEST. SAME COLUMNS. * ZTPSV T PUT F FOR NO TEST. SAME COLUMNS. * ZGERC T PUT F FOR NO TEST. SAME COLUMNS. * ZGERU T PUT F FOR NO TEST. SAME COLUMNS. * ZHER T PUT F FOR NO TEST. SAME COLUMNS. * ZHPR T PUT F FOR NO TEST. SAME COLUMNS. * ZHER2 T PUT F FOR NO TEST. SAME COLUMNS. * ZHPR2 T PUT F FOR NO TEST. SAME COLUMNS. * * See: * * Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. * An extended set of Fortran Basic Linear Algebra Subprograms. * * Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics * and Computer Science Division, Argonne National Laboratory, * 9700 South Cass Avenue, Argonne, Illinois 60439, US. * * Or * * NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms * Group Ltd., NAG Central Office, 256 Banbury Road, Oxford * OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st * Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. * * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 17 ) COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO, RHALF, RONE PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, $ NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. DOUBLE PRECISION EPS, ERR, THRESH INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, $ NOUT, NTRA LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, $ TSTERR CHARACTER*1 TRANS CHARACTER*6 SNAMET CHARACTER*32 SNAPS, SUMMRY * .. Local Arrays .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), $ X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( 2*NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) LOGICAL LTEST( NSUBS ) CHARACTER*6 SNAMES( NSUBS ) * .. External Functions .. DOUBLE PRECISION DDIFF LOGICAL LZE EXTERNAL DDIFF, LZE * .. External Subroutines .. 
EXTERNAL ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5, ZCHK6, $ ZCHKE, ZMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR COMMON /SRNAMC/SRNAMT * .. Data statements .. DATA SNAMES/'ZGEMV ', 'ZGBMV ', 'ZHEMV ', 'ZHBMV ', $ 'ZHPMV ', 'ZTRMV ', 'ZTBMV ', 'ZTPMV ', $ 'ZTRSV ', 'ZTBSV ', 'ZTPSV ', 'ZGERC ', $ 'ZGERU ', 'ZHER ', 'ZHPR ', 'ZHER2 ', $ 'ZHPR2 '/ * .. Executable Statements .. * * Read name and unit number for summary output file and open file. * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. * READ( NIN, FMT = * )SNAPS READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI REWI = REWI.AND.TRACE * Read the flag that directs stopping on any failure. READ( NIN, FMT = * )SFATAL * Read the flag that indicates whether error exits are to be tested. READ( NIN, FMT = * )TSTERR * Read the threshold value of the test ratio READ( NIN, FMT = * )THRESH * * Read and check the parameter values for the tests. * * Values of N READ( NIN, FMT = * )NIDIM IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN WRITE( NOUT, FMT = 9997 )'N', NIDMAX GO TO 230 END IF READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) DO 10 I = 1, NIDIM IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN WRITE( NOUT, FMT = 9996 )NMAX GO TO 230 END IF 10 CONTINUE * Values of K READ( NIN, FMT = * )NKB IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN WRITE( NOUT, FMT = 9997 )'K', NKBMAX GO TO 230 END IF READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) DO 20 I = 1, NKB IF( KB( I ).LT.0 )THEN WRITE( NOUT, FMT = 9995 ) GO TO 230 END IF 20 CONTINUE * Values of INCX and INCY READ( NIN, FMT = * )NINC IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX GO TO 230 END IF READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) DO 30 I = 1, NINC IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN WRITE( NOUT, FMT = 9994 )INCMAX GO TO 230 END IF 30 CONTINUE * Values of ALPHA READ( NIN, FMT = * )NALF IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX GO TO 230 END IF READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) * Values of BETA READ( NIN, FMT = * )NBET IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX GO TO 230 END IF READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) * * Report values of parameters. * WRITE( NOUT, FMT = 9993 ) WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) IF( .NOT.TSTERR )THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9980 ) END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )THRESH WRITE( NOUT, FMT = * ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 40 I = 1, NSUBS LTEST( I ) = .FALSE. 40 CONTINUE 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT DO 60 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET STOP 70 LTEST( I ) = LTESTT GO TO 50 * 80 CONTINUE CLOSE ( NIN ) * * Compute EPS (the machine precision). 
* EPS = RONE 90 CONTINUE IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) $ GO TO 100 EPS = RHALF*EPS GO TO 90 100 CONTINUE EPS = EPS + EPS WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of ZMVCH using exact data. * N = MIN( 32, NMAX ) DO 120 J = 1, N DO 110 I = 1, N A( I, J ) = MAX( I - J + 1, 0 ) 110 CONTINUE X( J ) = J Y( J ) = ZERO 120 CONTINUE DO 130 J = 1, N YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 130 CONTINUE * YY holds the exact result. On exit from ZMVCH YT holds * the result computed by ZMVCH. TRANS = 'N' CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LZE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR STOP END IF TRANS = 'T' CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LZE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR STOP END IF * * Test each subroutine in turn. * DO 210 ISNUM = 1, NSUBS WRITE( NOUT, FMT = * ) IF( .NOT.LTEST( ISNUM ) )THEN * Subprogram is not to be tested. WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) ELSE SRNAMT = SNAMES( ISNUM ) * Test error exits. IF( TSTERR )THEN CALL ZCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) WRITE( NOUT, FMT = * ) END IF * Test computations. INFOT = 0 OK = .TRUE. FATAL = .FALSE. GO TO ( 140, 140, 150, 150, 150, 160, 160, $ 160, 160, 160, 160, 170, 170, 180, $ 180, 190, 190 )ISNUM * Test ZGEMV, 01, and ZGBMV, 02. 140 CALL ZCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G ) GO TO 200 * Test ZHEMV, 03, ZHBMV, 04, and ZHPMV, 05. 150 CALL ZCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, $ X, XX, XS, Y, YY, YS, YT, G ) GO TO 200 * Test ZTRMV, 06, ZTBMV, 07, ZTPMV, 08, * ZTRSV, 09, ZTBSV, 10, and ZTPSV, 11. 160 CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z ) GO TO 200 * Test ZGERC, 12, ZGERU, 13. 170 CALL ZCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z ) GO TO 200 * Test ZHER, 14, and ZHPR, 15. 180 CALL ZCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z ) GO TO 200 * Test ZHER2, 16, and ZHPR2, 17. 
190 CALL ZCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, $ YT, G, Z ) * 200 IF( FATAL.AND.SFATAL ) $ GO TO 220 END IF 210 CONTINUE WRITE( NOUT, FMT = 9982 ) GO TO 240 * 220 CONTINUE WRITE( NOUT, FMT = 9981 ) GO TO 240 * 230 CONTINUE WRITE( NOUT, FMT = 9987 ) * 240 CONTINUE IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) STOP * 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', $ 'S THAN', F8.2 ) 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 ) 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', $ 'THAN ', I2 ) 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', $ I2 ) 9993 FORMAT( ' TESTS OF THE COMPLEX*16 LEVEL 2 BLAS', //' THE F', $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) 9992 FORMAT( ' FOR N ', 9I6 ) 9991 FORMAT( ' FOR K ', 7I6 ) 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) 9989 FORMAT( ' FOR ALPHA ', $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) 9988 FORMAT( ' FOR BETA ', $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', $ /' ******* TESTS ABANDONED *******' ) 9986 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', $ 'ESTS ABANDONED *******' ) 9985 FORMAT( ' ERROR IN ZMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', $ 'ATED WRONGLY.', /' ZMVCH WAS CALLED WITH TRANS = ', A1, $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' $ , /' ******* TESTS ABANDONED *******' ) 9984 FORMAT( A6, L2 ) 9983 FORMAT( 1X, A6, ' WAS NOT TESTED' ) 9982 FORMAT( /' END OF TESTS' ) 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) * * End of ZBLAT2. * END SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, $ XS, Y, YY, YS, YT, G ) * * Tests ZGEMV and ZGBMV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX*16 ZERO, HALF PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ HALF = ( 0.5D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, $ NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS, BETA, BLS, TRANSL DOUBLE PRECISION ERR, ERRMAX INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, $ NL, NS LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN CHARACTER*1 TRANS, TRANSS CHARACTER*3 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL ZGBMV, ZGEMV, ZMAKE, ZMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN * .. Scalars in Common .. 
INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'NTC'/ * .. Executable Statements .. FULL = SNAME( 3: 3 ).EQ.'E' BANDED = SNAME( 3: 3 ).EQ.'B' * Define the number of arguments. IF( FULL )THEN NARGS = 11 ELSE IF( BANDED )THEN NARGS = 13 END IF * NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 120 IN = 1, NIDIM N = IDIM( IN ) ND = N/2 + 1 * DO 110 IM = 1, 2 IF( IM.EQ.1 ) $ M = MAX( N - ND, 0 ) IF( IM.EQ.2 ) $ M = MIN( N + ND, NMAX ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IKU = 1, NK IF( BANDED )THEN KU = KB( IKU ) KL = MAX( KU - 1, 0 ) ELSE KU = N - 1 KL = M - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = KL + KU + 1 ELSE LDA = M END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 LAA = LDA*N NULL = N.LE.0.OR.M.LE.0 * * Generate the matrix A. * TRANSL = ZERO CALL ZMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, AA, $ LDA, KL, KU, RESET, TRANSL ) * DO 90 IC = 1, 3 TRANS = ICH( IC: IC ) TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' * IF( TRAN )THEN ML = N NL = M ELSE ML = M NL = N END IF * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*NL * * Generate the vector X. * TRANSL = HALF CALL ZMAKE( 'GE', ' ', ' ', 1, NL, X, 1, XX, $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) IF( NL.GT.1 )THEN X( NL/2 ) = ZERO XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO END IF * DO 70 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*ML * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the vector Y. * TRANSL = ZERO CALL ZMAKE( 'GE', ' ', ' ', 1, ML, Y, 1, $ YY, ABS( INCY ), 0, ML - 1, $ RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * TRANSS = TRANS MS = M NS = N KLS = KL KUS = KU ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX BLS = BETA DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ TRANS, M, N, ALPHA, LDA, INCX, BETA, $ INCY IF( REWI ) $ REWIND NTRA CALL ZGEMV( TRANS, M, N, ALPHA, AA, $ LDA, XX, INCX, BETA, YY, $ INCY ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ TRANS, M, N, KL, KU, ALPHA, LDA, $ INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL ZGBMV( TRANS, M, N, KL, KU, ALPHA, $ AA, LDA, XX, INCX, BETA, $ YY, INCY ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 130 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = TRANS.EQ.TRANSS ISAME( 2 ) = MS.EQ.M ISAME( 3 ) = NS.EQ.N IF( FULL )THEN ISAME( 4 ) = ALS.EQ.ALPHA ISAME( 5 ) = LZE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA ISAME( 7 ) = LZE( XS, XX, LX ) ISAME( 8 ) = INCXS.EQ.INCX ISAME( 9 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 10 ) = LZE( YS, YY, LY ) ELSE ISAME( 10 ) = LZERES( 'GE', ' ', 1, $ ML, YS, YY, $ ABS( INCY ) ) END IF ISAME( 11 ) = INCYS.EQ.INCY ELSE IF( BANDED )THEN ISAME( 4 ) = KLS.EQ.KL ISAME( 5 ) = KUS.EQ.KU ISAME( 6 ) = ALS.EQ.ALPHA ISAME( 7 ) = LZE( AS, AA, LAA ) ISAME( 8 ) = LDAS.EQ.LDA ISAME( 9 ) = LZE( XS, XX, LX ) ISAME( 10 ) = INCXS.EQ.INCX ISAME( 11 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 12 ) = LZE( YS, YY, LY ) ELSE ISAME( 12 ) = LZERES( 'GE', ' ', 1, $ ML, YS, YY, $ ABS( INCY ) ) END IF ISAME( 13 ) = INCYS.EQ.INCY END IF * * If data was incorrectly changed, report * and return. * SAME = .TRUE. 
DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 130 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL ZMVCH( TRANS, M, N, ALPHA, A, $ NMAX, X, INCX, BETA, Y, $ INCY, YT, G, YY, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 130 ELSE * Avoid repeating tests with M.le.0 or * N.le.0. GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 140 * 130 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, TRANS, M, N, ALPHA, LDA, $ INCX, BETA, INCY ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANS, M, N, KL, KU, $ ALPHA, LDA, INCX, BETA, INCY END IF * 140 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 4( I3, ',' ), '(', $ F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', $ F4.1, '), Y,', I2, ') .' ) 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), '(', $ F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', $ F4.1, '), Y,', I2, ') .' ) 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK1. * END SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, $ XS, Y, YY, YS, YT, G ) * * Tests ZHEMV, ZHBMV and ZHPMV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX*16 ZERO, HALF PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ HALF = ( 0.5D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, $ NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS, BETA, BLS, TRANSL DOUBLE PRECISION ERR, ERRMAX INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, $ N, NARGS, NC, NK, NS LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME CHARACTER*1 UPLO, UPLOS CHARACTER*2 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL ZHBMV, ZHEMV, ZHPMV, ZMAKE, ZMVCH * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. 
DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 3: 3 ).EQ.'E' BANDED = SNAME( 3: 3 ).EQ.'B' PACKED = SNAME( 3: 3 ).EQ.'P' * Define the number of arguments. IF( FULL )THEN NARGS = 10 ELSE IF( BANDED )THEN NARGS = 11 ELSE IF( PACKED )THEN NARGS = 9 END IF * NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 110 IN = 1, NIDIM N = IDIM( IN ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IK = 1, NK IF( BANDED )THEN K = KB( IK ) ELSE K = N - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = K + 1 ELSE LDA = N END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF NULL = N.LE.0 * DO 90 IC = 1, 2 UPLO = ICH( IC: IC ) * * Generate the matrix A. * TRANSL = ZERO CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, AA, $ LDA, K, K, RESET, TRANSL ) * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL ZMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 70 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the vector Y. * TRANSL = ZERO CALL ZMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, $ TRANSL ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * UPLOS = UPLO NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX BLS = BETA DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ UPLO, N, ALPHA, LDA, INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL ZHEMV( UPLO, N, ALPHA, AA, LDA, XX, $ INCX, BETA, YY, INCY ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ UPLO, N, K, ALPHA, LDA, INCX, BETA, $ INCY IF( REWI ) $ REWIND NTRA CALL ZHBMV( UPLO, N, K, ALPHA, AA, LDA, $ XX, INCX, BETA, YY, INCY ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ UPLO, N, ALPHA, INCX, BETA, INCY IF( REWI ) $ REWIND NTRA CALL ZHPMV( UPLO, N, ALPHA, AA, XX, INCX, $ BETA, YY, INCY ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. 
* ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N IF( FULL )THEN ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LZE( AS, AA, LAA ) ISAME( 5 ) = LDAS.EQ.LDA ISAME( 6 ) = LZE( XS, XX, LX ) ISAME( 7 ) = INCXS.EQ.INCX ISAME( 8 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 9 ) = LZE( YS, YY, LY ) ELSE ISAME( 9 ) = LZERES( 'GE', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 10 ) = INCYS.EQ.INCY ELSE IF( BANDED )THEN ISAME( 3 ) = KS.EQ.K ISAME( 4 ) = ALS.EQ.ALPHA ISAME( 5 ) = LZE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA ISAME( 7 ) = LZE( XS, XX, LX ) ISAME( 8 ) = INCXS.EQ.INCX ISAME( 9 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 10 ) = LZE( YS, YY, LY ) ELSE ISAME( 10 ) = LZERES( 'GE', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 11 ) = INCYS.EQ.INCY ELSE IF( PACKED )THEN ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LZE( AS, AA, LAA ) ISAME( 5 ) = LZE( XS, XX, LX ) ISAME( 6 ) = INCXS.EQ.INCX ISAME( 7 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 8 ) = LZE( YS, YY, LY ) ELSE ISAME( 8 ) = LZERES( 'GE', ' ', 1, N, $ YS, YY, ABS( INCY ) ) END IF ISAME( 9 ) = INCYS.EQ.INCY END IF * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL ZMVCH( 'N', N, N, ALPHA, A, NMAX, X, $ INCX, BETA, Y, INCY, YT, G, $ YY, EPS, ERR, FATAL, NOUT, $ .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 120 ELSE * Avoid repeating tests with N.le.0 GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, LDA, INCX, $ BETA, INCY ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, K, ALPHA, LDA, $ INCX, BETA, INCY ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, N, ALPHA, INCX, $ BETA, INCY END IF * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', $ F4.1, '), AP, X,', I2, ',(', F4.1, ',', F4.1, '), Y,', I2, $ ') .' ) 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), '(', $ F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', $ F4.1, '), Y,', I2, ') .' ) 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', $ F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', F4.1, '), ', $ 'Y,', I2, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK2. * END SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z ) * * Tests ZTRMV, ZTBMV, ZTPMV, ZTRSV, ZTBSV and ZTPSV. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. 
* * .. Parameters .. COMPLEX*16 ZERO, HALF, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ HALF = ( 0.5D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), $ XT( NMAX ), XX( NMAX*INCMAX ), Z( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) * .. Local Scalars .. COMPLEX*16 TRANSL DOUBLE PRECISION ERR, ERRMAX INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS CHARACTER*2 ICHD, ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL ZMAKE, ZMVCH, ZTBMV, ZTBSV, ZTPMV, ZTPSV, $ ZTRMV, ZTRSV * .. Intrinsic Functions .. INTRINSIC ABS, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ * .. Executable Statements .. FULL = SNAME( 3: 3 ).EQ.'R' BANDED = SNAME( 3: 3 ).EQ.'B' PACKED = SNAME( 3: 3 ).EQ.'P' * Define the number of arguments. IF( FULL )THEN NARGS = 8 ELSE IF( BANDED )THEN NARGS = 9 ELSE IF( PACKED )THEN NARGS = 7 END IF * NC = 0 RESET = .TRUE. ERRMAX = RZERO * Set up zero vector for ZMVCH. DO 10 I = 1, NMAX Z( I ) = ZERO 10 CONTINUE * DO 110 IN = 1, NIDIM N = IDIM( IN ) * IF( BANDED )THEN NK = NKB ELSE NK = 1 END IF DO 100 IK = 1, NK IF( BANDED )THEN K = KB( IK ) ELSE K = N - 1 END IF * Set LDA to 1 more than minimum value if room. IF( BANDED )THEN LDA = K + 1 ELSE LDA = N END IF IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF NULL = N.LE.0 * DO 90 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * DO 80 ICT = 1, 3 TRANS = ICHT( ICT: ICT ) * DO 70 ICD = 1, 2 DIAG = ICHD( ICD: ICD ) * * Generate the matrix A. * TRANSL = ZERO CALL ZMAKE( SNAME( 2: 3 ), UPLO, DIAG, N, N, A, $ NMAX, AA, LDA, K, K, RESET, TRANSL ) * DO 60 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL ZMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, $ ABS( INCX ), 0, N - 1, RESET, $ TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS DIAGS = DIAG NS = N KS = K DO 20 I = 1, LAA AS( I ) = AA( I ) 20 CONTINUE LDAS = LDA DO 30 I = 1, LX XS( I ) = XX( I ) 30 CONTINUE INCXS = INCX * * Call the subroutine. 
* IF( SNAME( 4: 5 ).EQ.'MV' )THEN IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ UPLO, TRANS, DIAG, N, LDA, INCX IF( REWI ) $ REWIND NTRA CALL ZTRMV( UPLO, TRANS, DIAG, N, AA, LDA, $ XX, INCX ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ UPLO, TRANS, DIAG, N, K, LDA, INCX IF( REWI ) $ REWIND NTRA CALL ZTBMV( UPLO, TRANS, DIAG, N, K, AA, $ LDA, XX, INCX ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ UPLO, TRANS, DIAG, N, INCX IF( REWI ) $ REWIND NTRA CALL ZTPMV( UPLO, TRANS, DIAG, N, AA, XX, $ INCX ) END IF ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, $ UPLO, TRANS, DIAG, N, LDA, INCX IF( REWI ) $ REWIND NTRA CALL ZTRSV( UPLO, TRANS, DIAG, N, AA, LDA, $ XX, INCX ) ELSE IF( BANDED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, $ UPLO, TRANS, DIAG, N, K, LDA, INCX IF( REWI ) $ REWIND NTRA CALL ZTBSV( UPLO, TRANS, DIAG, N, K, AA, $ LDA, XX, INCX ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ UPLO, TRANS, DIAG, N, INCX IF( REWI ) $ REWIND NTRA CALL ZTPSV( UPLO, TRANS, DIAG, N, AA, XX, $ INCX ) END IF END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = TRANS.EQ.TRANSS ISAME( 3 ) = DIAG.EQ.DIAGS ISAME( 4 ) = NS.EQ.N IF( FULL )THEN ISAME( 5 ) = LZE( AS, AA, LAA ) ISAME( 6 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 7 ) = LZE( XS, XX, LX ) ELSE ISAME( 7 ) = LZERES( 'GE', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 8 ) = INCXS.EQ.INCX ELSE IF( BANDED )THEN ISAME( 5 ) = KS.EQ.K ISAME( 6 ) = LZE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 8 ) = LZE( XS, XX, LX ) ELSE ISAME( 8 ) = LZERES( 'GE', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 9 ) = INCXS.EQ.INCX ELSE IF( PACKED )THEN ISAME( 5 ) = LZE( AS, AA, LAA ) IF( NULL )THEN ISAME( 6 ) = LZE( XS, XX, LX ) ELSE ISAME( 6 ) = LZERES( 'GE', ' ', 1, N, XS, $ XX, ABS( INCX ) ) END IF ISAME( 7 ) = INCXS.EQ.INCX END IF * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN IF( SNAME( 4: 5 ).EQ.'MV' )THEN * * Check the result. * CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, $ INCX, ZERO, Z, INCX, XT, G, $ XX, EPS, ERR, FATAL, NOUT, $ .TRUE. ) ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN * * Compute approximation to original vector. * DO 50 I = 1, N Z( I ) = XX( 1 + ( I - 1 )* $ ABS( INCX ) ) XX( 1 + ( I - 1 )*ABS( INCX ) ) $ = X( I ) 50 CONTINUE CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, Z, $ INCX, ZERO, X, INCX, XT, G, $ XX, EPS, ERR, FATAL, NOUT, $ .FALSE. ) END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 120 ELSE * Avoid repeating tests with N.le.0. GO TO 110 END IF * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, DIAG, N, LDA, $ INCX ELSE IF( BANDED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, DIAG, N, K, $ LDA, INCX ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, TRANS, DIAG, N, INCX END IF * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', AP, ', $ 'X,', I2, ') .' ) 9994 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), 2( I3, ',' ), $ ' A,', I3, ', X,', I2, ') .' ) 9993 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', A,', $ I3, ', X,', I2, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK3. * END SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z ) * * Tests ZGERC and ZGERU. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX*16 ZERO, HALF, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ HALF = ( 0.5D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS, TRANSL DOUBLE PRECISION ERR, ERRMAX INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, $ NC, ND, NS LOGICAL CONJ, NULL, RESET, SAME * .. Local Arrays .. COMPLEX*16 W( 1 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL ZGERC, ZGERU, ZMAKE, ZMVCH * .. Intrinsic Functions .. INTRINSIC ABS, DCONJG, MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Executable Statements .. CONJ = SNAME( 5: 5 ).EQ.'C' * Define the number of arguments. NARGS = 9 * NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 120 IN = 1, NIDIM N = IDIM( IN ) ND = N/2 + 1 * DO 110 IM = 1, 2 IF( IM.EQ.1 ) $ M = MAX( N - ND, 0 ) IF( IM.EQ.2 ) $ M = MIN( N + ND, NMAX ) * * Set LDA to 1 more than minimum value if room. LDA = M IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 110 LAA = LDA*N NULL = N.LE.0.OR.M.LE.0 * DO 100 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*M * * Generate the vector X. 
* TRANSL = HALF CALL ZMAKE( 'GE', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), $ 0, M - 1, RESET, TRANSL ) IF( M.GT.1 )THEN X( M/2 ) = ZERO XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO END IF * DO 90 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * * Generate the vector Y. * TRANSL = ZERO CALL ZMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN Y( N/2 ) = ZERO YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO END IF * DO 80 IA = 1, NALF ALPHA = ALF( IA ) * * Generate the matrix A. * TRANSL = ZERO CALL ZMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * MS = M NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, $ ALPHA, INCX, INCY, LDA IF( CONJ )THEN IF( REWI ) $ REWIND NTRA CALL ZGERC( M, N, ALPHA, XX, INCX, YY, INCY, AA, $ LDA ) ELSE IF( REWI ) $ REWIND NTRA CALL ZGERU( M, N, ALPHA, XX, INCX, YY, INCY, AA, $ LDA ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9993 ) FATAL = .TRUE. GO TO 140 END IF * * See what data changed inside subroutine. * ISAME( 1 ) = MS.EQ.M ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LZE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX ISAME( 6 ) = LZE( YS, YY, LY ) ISAME( 7 ) = INCYS.EQ.INCY IF( NULL )THEN ISAME( 8 ) = LZE( AS, AA, LAA ) ELSE ISAME( 8 ) = LZERES( 'GE', ' ', M, N, AS, AA, $ LDA ) END IF ISAME( 9 ) = LDAS.EQ.LDA * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 140 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( INCX.GT.0 )THEN DO 50 I = 1, M Z( I ) = X( I ) 50 CONTINUE ELSE DO 60 I = 1, M Z( I ) = X( M - I + 1 ) 60 CONTINUE END IF DO 70 J = 1, N IF( INCY.GT.0 )THEN W( 1 ) = Y( J ) ELSE W( 1 ) = Y( N - J + 1 ) END IF IF( CONJ ) $ W( 1 ) = DCONJG( W( 1 ) ) CALL ZMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, $ ONE, A( 1, J ), 1, YT, G, $ AA( 1 + ( J - 1 )*LDA ), EPS, $ ERR, FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 130 70 CONTINUE ELSE * Avoid repeating tests with M.le.0 or N.le.0. GO TO 110 END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 150 * 130 CONTINUE WRITE( NOUT, FMT = 9995 )J * 140 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA * 150 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( I3, ',' ), '(', F4.1, ',', F4.1, $ '), X,', I2, ', Y,', I2, ', A,', I3, ') ', $ ' .' 
) 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK4. * END SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z ) * * Tests ZHER and ZHPR. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX*16 ZERO, HALF, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ HALF = ( 0.5D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. Local Scalars .. COMPLEX*16 ALPHA, TRANSL DOUBLE PRECISION ERR, ERRMAX, RALPHA, RALS INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER CHARACTER*1 UPLO, UPLOS CHARACTER*2 ICH * .. Local Arrays .. COMPLEX*16 W( 1 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL ZHER, ZHPR, ZMAKE, ZMVCH * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DCONJG, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 3: 3 ).EQ.'E' PACKED = SNAME( 3: 3 ).EQ.'P' * Define the number of arguments. IF( FULL )THEN NARGS = 7 ELSE IF( PACKED )THEN NARGS = 6 END IF * NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDA to 1 more than minimum value if room. LDA = N IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 100 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF * DO 90 IC = 1, 2 UPLO = ICH( IC: IC ) UPPER = UPLO.EQ.'U' * DO 80 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL ZMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), $ 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 70 IA = 1, NALF RALPHA = DBLE( ALF( IA ) ) ALPHA = DCMPLX( RALPHA, RZERO ) NULL = N.LE.0.OR.RALPHA.EQ.RZERO * * Generate the matrix A. * TRANSL = ZERO CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO NS = N RALS = RALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, $ RALPHA, INCX, LDA IF( REWI ) $ REWIND NTRA CALL ZHER( UPLO, N, RALPHA, XX, INCX, AA, LDA ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, $ RALPHA, INCX IF( REWI ) $ REWIND NTRA CALL ZHPR( UPLO, N, RALPHA, XX, INCX, AA ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. 
GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = RALS.EQ.RALPHA ISAME( 4 ) = LZE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX IF( NULL )THEN ISAME( 6 ) = LZE( AS, AA, LAA ) ELSE ISAME( 6 ) = LZERES( SNAME( 2: 3 ), UPLO, N, N, AS, $ AA, LDA ) END IF IF( .NOT.PACKED )THEN ISAME( 7 ) = LDAS.EQ.LDA END IF * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 30 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 30 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( INCX.GT.0 )THEN DO 40 I = 1, N Z( I ) = X( I ) 40 CONTINUE ELSE DO 50 I = 1, N Z( I ) = X( N - I + 1 ) 50 CONTINUE END IF JA = 1 DO 60 J = 1, N W( 1 ) = DCONJG( Z( J ) ) IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF CALL ZMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, $ 1, ONE, A( JJ, J ), 1, YT, G, $ AA( JA ), EPS, ERR, FATAL, NOUT, $ .TRUE. ) IF( FULL )THEN IF( UPPER )THEN JA = JA + LDA ELSE JA = JA + LDA + 1 END IF ELSE JA = JA + LJ END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 110 60 CONTINUE ELSE * Avoid repeating tests if N.le.0. IF( N.LE.0 ) $ GO TO 100 END IF * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 110 CONTINUE WRITE( NOUT, FMT = 9995 )J * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, RALPHA, INCX, LDA ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, RALPHA, INCX END IF * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', $ I2, ', AP) .' ) 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', $ I2, ', A,', I3, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK5. * END SUBROUTINE ZCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, $ Z ) * * Tests ZHER2 and ZHPR2. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX*16 ZERO, HALF, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ HALF = ( 0.5D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), $ XX( NMAX*INCMAX ), Y( NMAX ), $ YS( NMAX*INCMAX ), YT( NMAX ), $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ), INC( NINC ) * .. 
Local Scalars .. COMPLEX*16 ALPHA, ALS, TRANSL DOUBLE PRECISION ERR, ERRMAX INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, $ NARGS, NC, NS LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER CHARACTER*1 UPLO, UPLOS CHARACTER*2 ICH * .. Local Arrays .. COMPLEX*16 W( 2 ) LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL ZHER2, ZHPR2, ZMAKE, ZMVCH * .. Intrinsic Functions .. INTRINSIC ABS, DCONJG, MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'UL'/ * .. Executable Statements .. FULL = SNAME( 3: 3 ).EQ.'E' PACKED = SNAME( 3: 3 ).EQ.'P' * Define the number of arguments. IF( FULL )THEN NARGS = 9 ELSE IF( PACKED )THEN NARGS = 8 END IF * NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 140 IN = 1, NIDIM N = IDIM( IN ) * Set LDA to 1 more than minimum value if room. LDA = N IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 140 IF( PACKED )THEN LAA = ( N*( N + 1 ) )/2 ELSE LAA = LDA*N END IF * DO 130 IC = 1, 2 UPLO = ICH( IC: IC ) UPPER = UPLO.EQ.'U' * DO 120 IX = 1, NINC INCX = INC( IX ) LX = ABS( INCX )*N * * Generate the vector X. * TRANSL = HALF CALL ZMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), $ 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN X( N/2 ) = ZERO XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO END IF * DO 110 IY = 1, NINC INCY = INC( IY ) LY = ABS( INCY )*N * * Generate the vector Y. * TRANSL = ZERO CALL ZMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) IF( N.GT.1 )THEN Y( N/2 ) = ZERO YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO END IF * DO 100 IA = 1, NALF ALPHA = ALF( IA ) NULL = N.LE.0.OR.ALPHA.EQ.ZERO * * Generate the matrix A. * TRANSL = ZERO CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, $ NMAX, AA, LDA, N - 1, N - 1, RESET, $ TRANSL ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LX XS( I ) = XX( I ) 20 CONTINUE INCXS = INCX DO 30 I = 1, LY YS( I ) = YY( I ) 30 CONTINUE INCYS = INCY * * Call the subroutine. * IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, $ ALPHA, INCX, INCY, LDA IF( REWI ) $ REWIND NTRA CALL ZHER2( UPLO, N, ALPHA, XX, INCX, YY, INCY, $ AA, LDA ) ELSE IF( PACKED )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, $ ALPHA, INCX, INCY IF( REWI ) $ REWIND NTRA CALL ZHPR2( UPLO, N, ALPHA, XX, INCX, YY, INCY, $ AA ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 160 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLO.EQ.UPLOS ISAME( 2 ) = NS.EQ.N ISAME( 3 ) = ALS.EQ.ALPHA ISAME( 4 ) = LZE( XS, XX, LX ) ISAME( 5 ) = INCXS.EQ.INCX ISAME( 6 ) = LZE( YS, YY, LY ) ISAME( 7 ) = INCYS.EQ.INCY IF( NULL )THEN ISAME( 8 ) = LZE( AS, AA, LAA ) ELSE ISAME( 8 ) = LZERES( SNAME( 2: 3 ), UPLO, N, N, $ AS, AA, LDA ) END IF IF( .NOT.PACKED )THEN ISAME( 9 ) = LDAS.EQ.LDA END IF * * If data was incorrectly changed, report and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 160 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. 
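*     For each column J, ZMVCH recomputes the rank-2 update of that
*     column from the saved X and Y (held in the two columns of Z)
*     and the weights W( 1 ) = ALPHA*DCONJG( Y( J ) ) and
*     W( 2 ) = DCONJG( ALPHA )*DCONJG( X( J ) ), plus column J of
*     the original A, and compares it with the column the routine
*     produced.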
* IF( INCX.GT.0 )THEN DO 50 I = 1, N Z( I, 1 ) = X( I ) 50 CONTINUE ELSE DO 60 I = 1, N Z( I, 1 ) = X( N - I + 1 ) 60 CONTINUE END IF IF( INCY.GT.0 )THEN DO 70 I = 1, N Z( I, 2 ) = Y( I ) 70 CONTINUE ELSE DO 80 I = 1, N Z( I, 2 ) = Y( N - I + 1 ) 80 CONTINUE END IF JA = 1 DO 90 J = 1, N W( 1 ) = ALPHA*DCONJG( Z( J, 2 ) ) W( 2 ) = DCONJG( ALPHA )*DCONJG( Z( J, 1 ) ) IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF CALL ZMVCH( 'N', LJ, 2, ONE, Z( JJ, 1 ), $ NMAX, W, 1, ONE, A( JJ, J ), 1, $ YT, G, AA( JA ), EPS, ERR, FATAL, $ NOUT, .TRUE. ) IF( FULL )THEN IF( UPPER )THEN JA = JA + LDA ELSE JA = JA + LDA + 1 END IF ELSE JA = JA + LJ END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and return. IF( FATAL ) $ GO TO 150 90 CONTINUE ELSE * Avoid repeating tests with N.le.0. IF( N.LE.0 ) $ GO TO 140 END IF * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * 140 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 170 * 150 CONTINUE WRITE( NOUT, FMT = 9995 )J * 160 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( FULL )THEN WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX, $ INCY, LDA ELSE IF( PACKED )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX, INCY END IF * 170 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', $ F4.1, '), X,', I2, ', Y,', I2, ', AP) ', $ ' .' ) 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', $ F4.1, '), X,', I2, ', Y,', I2, ', A,', I3, ') ', $ ' .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK6. * END SUBROUTINE ZCHKE( ISNUM, SRNAMT, NOUT ) * * Tests the error exits from the Level 2 Blas. * Requires a special version of the error-handling routine XERBLA. * ALPHA, RALPHA, BETA, A, X and Y should not need to be defined. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Local Scalars .. COMPLEX*16 ALPHA, BETA DOUBLE PRECISION RALPHA * .. Local Arrays .. COMPLEX*16 A( 1, 1 ), X( 1 ), Y( 1 ) * .. External Subroutines .. EXTERNAL CHKXER, ZGBMV, ZGEMV, ZGERC, ZGERU, ZHBMV, $ ZHEMV, ZHER, ZHER2, ZHPMV, ZHPR, ZHPR2, ZTBMV, $ ZTBSV, ZTPMV, ZTPSV, ZTRMV, ZTRSV * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Executable Statements .. * OK is set to .FALSE. by the special version of XERBLA or by CHKXER * if anything is wrong. OK = .TRUE. * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. 
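*     The computed GO TO below branches to the block of deliberately
*     invalid calls for the routine selected by ISNUM; after each
*     call, the special XERBLA and CHKXER together check that the
*     error exit was taken with the parameter number held in INFOT.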
GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, $ 90, 100, 110, 120, 130, 140, 150, 160, $ 170 )ISNUM 10 INFOT = 1 CALL ZGEMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZGEMV( 'N', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZGEMV( 'N', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZGEMV( 'N', 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZGEMV( 'N', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZGEMV( 'N', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 20 INFOT = 1 CALL ZGBMV( '/', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZGBMV( 'N', -1, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZGBMV( 'N', 0, -1, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZGBMV( 'N', 0, 0, -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZGBMV( 'N', 2, 0, 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZGBMV( 'N', 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL ZGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 30 INFOT = 1 CALL ZHEMV( '/', 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZHEMV( 'U', -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZHEMV( 'U', 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHEMV( 'U', 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZHEMV( 'U', 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 40 INFOT = 1 CALL ZHBMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZHBMV( 'U', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHBMV( 'U', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZHBMV( 'U', 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZHBMV( 'U', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZHBMV( 'U', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 50 INFOT = 1 CALL ZHPMV( '/', 0, ALPHA, A, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZHPMV( 'U', -1, ALPHA, A, X, 1, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZHPMV( 'U', 0, ALPHA, A, X, 0, BETA, Y, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZHPMV( 'U', 0, ALPHA, A, X, 1, BETA, Y, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 60 INFOT = 1 CALL ZTRMV( '/', 'N', 'N', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZTRMV( 'U', '/', 'N', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, 
NOUT, LERR, OK ) INFOT = 3 CALL ZTRMV( 'U', 'N', '/', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZTRMV( 'U', 'N', 'N', -1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMV( 'U', 'N', 'N', 2, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZTRMV( 'U', 'N', 'N', 0, A, 1, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 70 INFOT = 1 CALL ZTBMV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZTBMV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZTBMV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZTBMV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTBMV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZTBMV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTBMV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 80 INFOT = 1 CALL ZTPMV( '/', 'N', 'N', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZTPMV( 'U', '/', 'N', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZTPMV( 'U', 'N', '/', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZTPMV( 'U', 'N', 'N', -1, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZTPMV( 'U', 'N', 'N', 0, A, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 90 INFOT = 1 CALL ZTRSV( '/', 'N', 'N', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZTRSV( 'U', '/', 'N', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZTRSV( 'U', 'N', '/', 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZTRSV( 'U', 'N', 'N', -1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSV( 'U', 'N', 'N', 2, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZTRSV( 'U', 'N', 'N', 0, A, 1, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 100 INFOT = 1 CALL ZTBSV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZTBSV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZTBSV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZTBSV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTBSV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZTBSV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTBSV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 110 INFOT = 1 CALL ZTPSV( '/', 'N', 'N', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZTPSV( 'U', '/', 'N', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZTPSV( 'U', 'N', '/', 0, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZTPSV( 'U', 'N', 'N', -1, A, X, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZTPSV( 'U', 'N', 'N', 0, A, X, 0 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 120 INFOT = 1 CALL ZGERC( -1, 0, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) 
INFOT = 2 CALL ZGERC( 0, -1, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZGERC( 0, 0, ALPHA, X, 0, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZGERC( 0, 0, ALPHA, X, 1, Y, 0, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZGERC( 2, 0, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 130 INFOT = 1 CALL ZGERU( -1, 0, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZGERU( 0, -1, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZGERU( 0, 0, ALPHA, X, 0, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZGERU( 0, 0, ALPHA, X, 1, Y, 0, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZGERU( 2, 0, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 140 INFOT = 1 CALL ZHER( '/', 0, RALPHA, X, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZHER( 'U', -1, RALPHA, X, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZHER( 'U', 0, RALPHA, X, 0, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHER( 'U', 2, RALPHA, X, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 150 INFOT = 1 CALL ZHPR( '/', 0, RALPHA, X, 1, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZHPR( 'U', -1, RALPHA, X, 1, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZHPR( 'U', 0, RALPHA, X, 0, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 160 INFOT = 1 CALL ZHER2( '/', 0, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZHER2( 'U', -1, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZHER2( 'U', 0, ALPHA, X, 0, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHER2( 'U', 0, ALPHA, X, 1, Y, 0, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZHER2( 'U', 2, ALPHA, X, 1, Y, 1, A, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 180 170 INFOT = 1 CALL ZHPR2( '/', 0, ALPHA, X, 1, Y, 1, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZHPR2( 'U', -1, ALPHA, X, 1, Y, 1, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZHPR2( 'U', 0, ALPHA, X, 0, Y, 1, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHPR2( 'U', 0, ALPHA, X, 1, Y, 0, A ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) * 180 IF( OK )THEN WRITE( NOUT, FMT = 9999 )SRNAMT ELSE WRITE( NOUT, FMT = 9998 )SRNAMT END IF RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', $ '**' ) * * End of ZCHKE. * END SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, $ KU, RESET, TRANSL ) * * Generates values for an M by N matrix A within the bandwidth * defined by KL and KU. * Stores the values in the array AA in the data structure required * by the routine, with unwanted elements set to rogue value. * * TYPE is 'GE', 'GB', 'HE', 'HB', 'HP', 'TR', 'TB' OR 'TP'. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. 
COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) COMPLEX*16 ROGUE PARAMETER ( ROGUE = ( -1.0D10, 1.0D10 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) DOUBLE PRECISION RROGUE PARAMETER ( RROGUE = -1.0D10 ) * .. Scalar Arguments .. COMPLEX*16 TRANSL INTEGER KL, KU, LDA, M, N, NMAX LOGICAL RESET CHARACTER*1 DIAG, UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX*16 A( NMAX, * ), AA( * ) * .. Local Scalars .. INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, JJ, KK LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER * .. External Functions .. COMPLEX*16 ZBEG EXTERNAL ZBEG * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, DCONJG, MAX, MIN * .. Executable Statements .. GEN = TYPE( 1: 1 ).EQ.'G' SYM = TYPE( 1: 1 ).EQ.'H' TRI = TYPE( 1: 1 ).EQ.'T' UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' UNIT = TRI.AND.DIAG.EQ.'U' * * Generate data in array A. * DO 20 J = 1, N DO 10 I = 1, M IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) $ THEN IF( ( I.LE.J.AND.J - I.LE.KU ).OR. $ ( I.GE.J.AND.I - J.LE.KL ) )THEN A( I, J ) = ZBEG( RESET ) + TRANSL ELSE A( I, J ) = ZERO END IF IF( I.NE.J )THEN IF( SYM )THEN A( J, I ) = DCONJG( A( I, J ) ) ELSE IF( TRI )THEN A( J, I ) = ZERO END IF END IF END IF 10 CONTINUE IF( SYM ) $ A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO ) IF( TRI ) $ A( J, J ) = A( J, J ) + ONE IF( UNIT ) $ A( J, J ) = ONE 20 CONTINUE * * Store elements in array AS in data structure required by routine. * IF( TYPE.EQ.'GE' )THEN DO 50 J = 1, N DO 30 I = 1, M AA( I + ( J - 1 )*LDA ) = A( I, J ) 30 CONTINUE DO 40 I = M + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 40 CONTINUE 50 CONTINUE ELSE IF( TYPE.EQ.'GB' )THEN DO 90 J = 1, N DO 60 I1 = 1, KU + 1 - J AA( I1 + ( J - 1 )*LDA ) = ROGUE 60 CONTINUE DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) 70 CONTINUE DO 80 I3 = I2, LDA AA( I3 + ( J - 1 )*LDA ) = ROGUE 80 CONTINUE 90 CONTINUE ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'TR' )THEN DO 130 J = 1, N IF( UPPER )THEN IBEG = 1 IF( UNIT )THEN IEND = J - 1 ELSE IEND = J END IF ELSE IF( UNIT )THEN IBEG = J + 1 ELSE IBEG = J END IF IEND = N END IF DO 100 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 100 CONTINUE DO 110 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I, J ) 110 CONTINUE DO 120 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 120 CONTINUE IF( SYM )THEN JJ = J + ( J - 1 )*LDA AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) END IF 130 CONTINUE ELSE IF( TYPE.EQ.'HB'.OR.TYPE.EQ.'TB' )THEN DO 170 J = 1, N IF( UPPER )THEN KK = KL + 1 IBEG = MAX( 1, KL + 2 - J ) IF( UNIT )THEN IEND = KL ELSE IEND = KL + 1 END IF ELSE KK = 1 IF( UNIT )THEN IBEG = 2 ELSE IBEG = 1 END IF IEND = MIN( KL + 1, 1 + M - J ) END IF DO 140 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 140 CONTINUE DO 150 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) 150 CONTINUE DO 160 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 160 CONTINUE IF( SYM )THEN JJ = KK + ( J - 1 )*LDA AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) END IF 170 CONTINUE ELSE IF( TYPE.EQ.'HP'.OR.TYPE.EQ.'TP' )THEN IOFF = 0 DO 190 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 180 I = IBEG, IEND IOFF = IOFF + 1 AA( IOFF ) = A( I, J ) IF( I.EQ.J )THEN IF( UNIT ) $ AA( IOFF ) = ROGUE IF( SYM ) $ AA( IOFF ) = DCMPLX( DBLE( AA( IOFF ) ), RROGUE ) END IF 180 CONTINUE 190 CONTINUE END IF RETURN * * End of ZMAKE. 
* END SUBROUTINE ZMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) * * Checks the results of the computational tests. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO, RONE PARAMETER ( RZERO = 0.0D0, RONE = 1.0D0 ) * .. Scalar Arguments .. COMPLEX*16 ALPHA, BETA DOUBLE PRECISION EPS, ERR INTEGER INCX, INCY, M, N, NMAX, NOUT LOGICAL FATAL, MV CHARACTER*1 TRANS * .. Array Arguments .. COMPLEX*16 A( NMAX, * ), X( * ), Y( * ), YT( * ), YY( * ) DOUBLE PRECISION G( * ) * .. Local Scalars .. COMPLEX*16 C DOUBLE PRECISION ERRI INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL LOGICAL CTRAN, TRAN * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, SQRT * .. Statement Functions .. DOUBLE PRECISION ABS1 * .. Statement Function definitions .. ABS1( C ) = ABS( DBLE( C ) ) + ABS( DIMAG( C ) ) * .. Executable Statements .. TRAN = TRANS.EQ.'T' CTRAN = TRANS.EQ.'C' IF( TRAN.OR.CTRAN )THEN ML = N NL = M ELSE ML = M NL = N END IF IF( INCX.LT.0 )THEN KX = NL INCXL = -1 ELSE KX = 1 INCXL = 1 END IF IF( INCY.LT.0 )THEN KY = ML INCYL = -1 ELSE KY = 1 INCYL = 1 END IF * * Compute expected result in YT using data in A, X and Y. * Compute gauges in G. * IY = KY DO 40 I = 1, ML YT( IY ) = ZERO G( IY ) = RZERO JX = KX IF( TRAN )THEN DO 10 J = 1, NL YT( IY ) = YT( IY ) + A( J, I )*X( JX ) G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) JX = JX + INCXL 10 CONTINUE ELSE IF( CTRAN )THEN DO 20 J = 1, NL YT( IY ) = YT( IY ) + DCONJG( A( J, I ) )*X( JX ) G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) JX = JX + INCXL 20 CONTINUE ELSE DO 30 J = 1, NL YT( IY ) = YT( IY ) + A( I, J )*X( JX ) G( IY ) = G( IY ) + ABS1( A( I, J ) )*ABS1( X( JX ) ) JX = JX + INCXL 30 CONTINUE END IF YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) G( IY ) = ABS1( ALPHA )*G( IY ) + ABS1( BETA )*ABS1( Y( IY ) ) IY = IY + INCYL 40 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO DO 50 I = 1, ML ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS IF( G( I ).NE.RZERO ) $ ERRI = ERRI/G( I ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ GO TO 60 50 CONTINUE * If the loop completes, all results are at least half accurate. GO TO 80 * * Report fatal error. * 60 FATAL = .TRUE. WRITE( NOUT, FMT = 9999 ) DO 70 I = 1, ML IF( MV )THEN WRITE( NOUT, FMT = 9998 )I, YT( I ), $ YY( 1 + ( I - 1 )*ABS( INCY ) ) ELSE WRITE( NOUT, FMT = 9998 )I, $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT( I ) END IF 70 CONTINUE * 80 CONTINUE RETURN * 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', $ 'F ACCURATE *******', /' EXPECTED RE', $ 'SULT COMPUTED RESULT' ) 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) * * End of ZMVCH. * END LOGICAL FUNCTION LZE( RI, RJ, LR ) * * Tests if two arrays are identical. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER LR * .. Array Arguments .. COMPLEX*16 RI( * ), RJ( * ) * .. Local Scalars .. INTEGER I * .. Executable Statements .. DO 10 I = 1, LR IF( RI( I ).NE.RJ( I ) ) $ GO TO 20 10 CONTINUE LZE = .TRUE. GO TO 30 20 CONTINUE LZE = .FALSE. 30 RETURN * * End of LZE. 
* END LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA ) * * Tests if selected elements in two arrays are equal. * * TYPE is 'GE', 'HE' or 'HP'. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER LDA, M, N CHARACTER*1 UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX*16 AA( LDA, * ), AS( LDA, * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL UPPER * .. Executable Statements .. UPPER = UPLO.EQ.'U' IF( TYPE.EQ.'GE' )THEN DO 20 J = 1, N DO 10 I = M + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 10 CONTINUE 20 CONTINUE ELSE IF( TYPE.EQ.'HE' )THEN DO 50 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 30 I = 1, IBEG - 1 IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 30 CONTINUE DO 40 I = IEND + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 40 CONTINUE 50 CONTINUE END IF * 60 CONTINUE LZERES = .TRUE. GO TO 80 70 CONTINUE LZERES = .FALSE. 80 RETURN * * End of LZERES. * END COMPLEX*16 FUNCTION ZBEG( RESET ) * * Generates complex numbers as pairs of random numbers uniformly * distributed between -0.5 and 0.5. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. LOGICAL RESET * .. Local Scalars .. INTEGER I, IC, J, MI, MJ * .. Save statement .. SAVE I, IC, J, MI, MJ * .. Intrinsic Functions .. INTRINSIC DCMPLX * .. Executable Statements .. IF( RESET )THEN * Initialize local variables. MI = 891 MJ = 457 I = 7 J = 7 IC = 0 RESET = .FALSE. END IF * * The sequence of values of I or J is bounded between 1 and 999. * If initial I or J = 1,2,3,6,7 or 9, the period will be 50. * If initial I or J = 4 or 8, the period will be 25. * If initial I or J = 5, the period will be 10. * IC is used to break up the period by skipping 1 value of I or J * in 6. * IC = IC + 1 10 I = I*MI J = J*MJ I = I - 1000*( I/1000 ) J = J - 1000*( J/1000 ) IF( IC.GE.5 )THEN IC = 0 GO TO 10 END IF ZBEG = DCMPLX( ( I - 500 )/1001.0D0, ( J - 500 )/1001.0D0 ) RETURN * * End of ZBEG. * END DOUBLE PRECISION FUNCTION DDIFF( X, Y ) * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * * .. Scalar Arguments .. DOUBLE PRECISION X, Y * .. Executable Statements .. DDIFF = X - Y RETURN * * End of DDIFF. * END SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) * * Tests whether XERBLA has detected an error when it should. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. * Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER INFOT, NOUT LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Executable Statements .. IF( .NOT.LERR )THEN WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT OK = .FALSE. END IF LERR = .FALSE. RETURN * 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', $ 'ETECTED BY ', A6, ' *****' ) * * End of CHKXER. * END SUBROUTINE XERBLA( SRNAME, INFO ) * * This is a special version of XERBLA to be used only as part of * the test program for testing error exits from the Level 2 BLAS * routines. * * XERBLA is an error handler for the Level 2 BLAS routines. * * It is called by the Level 2 BLAS routines if an input parameter is * invalid. * * Auxiliary routine for test program for Level 2 Blas. * * -- Written on 10-August-1987. 
* Richard Hanson, Sandia National Labs. * Jeremy Du Croz, NAG Central Office. * * .. Scalar Arguments .. INTEGER INFO CHARACTER*6 SRNAME * .. Scalars in Common .. INTEGER INFOT, NOUT LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUT, OK, LERR COMMON /SRNAMC/SRNAMT * .. Executable Statements .. LERR = .TRUE. IF( INFO.NE.INFOT )THEN IF( INFOT.NE.0 )THEN WRITE( NOUT, FMT = 9999 )INFO, INFOT ELSE WRITE( NOUT, FMT = 9997 )INFO END IF OK = .FALSE. END IF IF( SRNAME.NE.SRNAMT )THEN WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT OK = .FALSE. END IF RETURN * 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', $ ' OF ', I2, ' *******' ) 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', $ 'AD OF ', A6, ' *******' ) 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, $ ' *******' ) * * End of XERBLA * END OpenBLAS-0.2.20/test/zblat3.dat000066400000000000000000000020261313527062700160140ustar00rootroot00000000000000'ZBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE 6 UNIT NUMBER OF SUMMARY FILE 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. F LOGICAL FLAG, T TO STOP ON FAILURES. F LOGICAL FLAG, T TO TEST ERROR EXITS. 16.0 THRESHOLD VALUE OF TEST RATIO 6 NUMBER OF VALUES OF N 0 1 2 3 7 31 63 VALUES OF N 3 NUMBER OF VALUES OF ALPHA (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA 3 NUMBER OF VALUES OF BETA (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA ZGEMM T PUT F FOR NO TEST. SAME COLUMNS. ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. ZHERK T PUT F FOR NO TEST. SAME COLUMNS. ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. OpenBLAS-0.2.20/test/zblat3.f000066400000000000000000003770461313527062700155120ustar00rootroot00000000000000 PROGRAM ZBLAT3 * * Test program for the COMPLEX*16 Level 3 Blas. * * The program must be driven by a short data file. The first 14 records * of the file are read using list-directed input, the last 9 records * are read using the format ( A6, L2 ). An annotated example of a data * file can be obtained by deleting the first 3 characters from the * following 23 lines: * 'ZBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE * 6 UNIT NUMBER OF SUMMARY FILE * 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 16.0 THRESHOLD VALUE OF TEST RATIO * 6 NUMBER OF VALUES OF N * 0 1 2 3 5 9 VALUES OF N * 3 NUMBER OF VALUES OF ALPHA * (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA * 3 NUMBER OF VALUES OF BETA * (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA * ZGEMM T PUT F FOR NO TEST. SAME COLUMNS. * ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. * ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. * ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. * ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. * ZHERK T PUT F FOR NO TEST. SAME COLUMNS. * ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. * ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. * ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. * * See: * * Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. * A Set of Level 3 Basic Linear Algebra Subprograms. 
* * Technical Memorandum No.88 (Revision 1), Mathematics and * Computer Science Division, Argonne National Laboratory, 9700 * South Cass Avenue, Argonne, Illinois 60439, US. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 9 ) COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO, RHALF, RONE PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. DOUBLE PRECISION EPS, ERR, THRESH INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, $ TSTERR CHARACTER*1 TRANSA, TRANSB CHARACTER*6 SNAMET CHARACTER*32 SNAPS, SUMMRY * .. Local Arrays .. COMPLEX*16 AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), $ ALF( NALMAX ), AS( NMAX*NMAX ), $ BB( NMAX*NMAX ), BET( NBEMAX ), $ BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ W( 2*NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDMAX ) LOGICAL LTEST( NSUBS ) CHARACTER*6 SNAMES( NSUBS ) * .. External Functions .. DOUBLE PRECISION DDIFF LOGICAL LZE EXTERNAL DDIFF, LZE * .. External Subroutines .. EXTERNAL ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5, ZCHKE, ZMMCH * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR COMMON /SRNAMC/SRNAMT * .. Data statements .. DATA SNAMES/'ZGEMM ', 'ZHEMM ', 'ZSYMM ', 'ZTRMM ', $ 'ZTRSM ', 'ZHERK ', 'ZSYRK ', 'ZHER2K', $ 'ZSYR2K'/ * .. Executable Statements .. * * Read name and unit number for summary output file and open file. * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. * READ( NIN, FMT = * )SNAPS READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI REWI = REWI.AND.TRACE * Read the flag that directs stopping on any failure. READ( NIN, FMT = * )SFATAL * Read the flag that indicates whether error exits are to be tested. READ( NIN, FMT = * )TSTERR * Read the threshold value of the test ratio READ( NIN, FMT = * )THRESH * * Read and check the parameter values for the tests. * * Values of N READ( NIN, FMT = * )NIDIM IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN WRITE( NOUT, FMT = 9997 )'N', NIDMAX GO TO 220 END IF READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) DO 10 I = 1, NIDIM IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN WRITE( NOUT, FMT = 9996 )NMAX GO TO 220 END IF 10 CONTINUE * Values of ALPHA READ( NIN, FMT = * )NALF IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX GO TO 220 END IF READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) * Values of BETA READ( NIN, FMT = * )NBET IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX GO TO 220 END IF READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) * * Report values of parameters. 
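*     The values of N, ALPHA and BETA just read, and the threshold
*     THRESH, are echoed to the summary file so that the output
*     records the configuration used for the run.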
* WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) IF( .NOT.TSTERR )THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9984 ) END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )THRESH WRITE( NOUT, FMT = * ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 20 I = 1, NSUBS LTEST( I ) = .FALSE. 20 CONTINUE 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT DO 40 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET STOP 50 LTEST( I ) = LTESTT GO TO 30 * 60 CONTINUE CLOSE ( NIN ) * * Compute EPS (the machine precision). * EPS = RONE 70 CONTINUE IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) $ GO TO 80 EPS = RHALF*EPS GO TO 70 80 CONTINUE EPS = EPS + EPS WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of ZMMCH using exact data. * N = MIN( 32, NMAX ) DO 100 J = 1, N DO 90 I = 1, N AB( I, J ) = MAX( I - J + 1, 0 ) 90 CONTINUE AB( J, NMAX + 1 ) = J AB( 1, NMAX + J ) = J C( J, 1 ) = ZERO 100 CONTINUE DO 110 J = 1, N CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 110 CONTINUE * CC holds the exact result. On exit from ZMMCH CT holds * the result computed by ZMMCH. TRANSA = 'N' TRANSB = 'N' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 AB( 1, NMAX + J ) = N - J + 1 120 CONTINUE DO 130 J = 1, N CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - $ ( ( J + 1 )*J*( J - 1 ) )/3 130 CONTINUE TRANSA = 'C' TRANSB = 'N' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF * * Test each subroutine in turn. * DO 200 ISNUM = 1, NSUBS WRITE( NOUT, FMT = * ) IF( .NOT.LTEST( ISNUM ) )THEN * Subprogram is not to be tested. WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) ELSE SRNAMT = SNAMES( ISNUM ) * Test error exits. IF( TSTERR )THEN CALL ZCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) WRITE( NOUT, FMT = * ) END IF * Test computations. INFOT = 0 OK = .TRUE. FATAL = .FALSE. GO TO ( 140, 150, 150, 160, 160, 170, 170, $ 180, 180 )ISNUM * Test ZGEMM, 01. 140 CALL ZCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G ) GO TO 190 * Test ZHEMM, 02, ZSYMM, 03. 150 CALL ZCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G ) GO TO 190 * Test ZTRMM, 04, ZTRSM, 05. 
160 CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C ) GO TO 190 * Test ZHERK, 06, ZSYRK, 07. 170 CALL ZCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G ) GO TO 190 * Test ZHER2K, 08, ZSYR2K, 09. 180 CALL ZCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) GO TO 190 * 190 IF( FATAL.AND.SFATAL ) $ GO TO 210 END IF 200 CONTINUE WRITE( NOUT, FMT = 9986 ) GO TO 230 * 210 CONTINUE WRITE( NOUT, FMT = 9985 ) GO TO 230 * 220 CONTINUE WRITE( NOUT, FMT = 9991 ) * 230 CONTINUE IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) STOP * 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', $ 'S THAN', F8.2 ) 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 ) 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', $ 'THAN ', I2 ) 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) 9995 FORMAT( ' TESTS OF THE COMPLEX*16 LEVEL 3 BLAS', //' THE F', $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) 9994 FORMAT( ' FOR N ', 9I6 ) 9993 FORMAT( ' FOR ALPHA ', $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) 9992 FORMAT( ' FOR BETA ', $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', $ /' ******* TESTS ABANDONED *******' ) 9990 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', $ 'ESTS ABANDONED *******' ) 9989 FORMAT( ' ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', $ 'ATED WRONGLY.', /' ZMMCH WAS CALLED WITH TRANSA = ', A1, $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', $ '*******' ) 9988 FORMAT( A6, L2 ) 9987 FORMAT( 1X, A6, ' WAS NOT TESTED' ) 9986 FORMAT( /' END OF TESTS' ) 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) * * End of ZBLAT3. * END SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) * * Tests ZGEMM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS, BETA, BLS DOUBLE PRECISION ERR, ERRMAX INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, $ MA, MB, MS, N, NA, NARGS, NB, NC, NS LOGICAL NULL, RESET, SAME, TRANA, TRANB CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB CHARACTER*3 ICH * .. Local Arrays .. 
LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL ZGEMM, ZMAKE, ZMMCH * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'NTC'/ * .. Executable Statements .. * NARGS = 13 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 110 IM = 1, NIDIM M = IDIM( IM ) * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICA = 1, 3 TRANSA = ICH( ICA: ICA ) TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' * IF( TRANA )THEN MA = K NA = M ELSE MA = M NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. * CALL ZMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICB = 1, 3 TRANSB = ICH( ICB: ICB ) TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' * IF( TRANB )THEN MB = N NB = K ELSE MB = K NB = N END IF * Set LDB to 1 more than minimum value if room. LDB = MB IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 70 LBB = LDB*NB * * Generate the matrix B. * CALL ZMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, $ LDB, RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL ZMAKE( 'GE', ' ', ' ', M, N, C, NMAX, $ CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * TRANAS = TRANSA TRANBS = TRANSB MS = M NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB, $ BETA, LDC IF( REWI ) $ REWIND NTRA CALL ZGEMM( TRANSA, TRANSB, M, N, K, ALPHA, $ AA, LDA, BB, LDB, BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = TRANSA.EQ.TRANAS ISAME( 2 ) = TRANSB.EQ.TRANBS ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = KS.EQ.K ISAME( 6 ) = ALS.EQ.ALPHA ISAME( 7 ) = LZE( AS, AA, LAA ) ISAME( 8 ) = LDAS.EQ.LDA ISAME( 9 ) = LZE( BS, BB, LBB ) ISAME( 10 ) = LDBS.EQ.LDB ISAME( 11 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 12 ) = LZE( CS, CC, LCC ) ELSE ISAME( 12 ) = LZERES( 'GE', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 13 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report * and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL ZMMCH( TRANSA, TRANSB, M, N, K, $ ALPHA, A, NMAX, B, NMAX, BETA, $ C, NMAX, CT, G, CC, LDC, EPS, $ ERR, FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 120 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. 
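*     ERRMAX is the largest ratio produced by ZMMCH in the loops
*     above; ZGEMM is reported as passing only when this stays
*     below THRESH (16.0 in the distributed zblat3.dat).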
* IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K, $ ALPHA, LDA, LDB, BETA, LDC * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',''', A1, ''',', $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK1. * END SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) * * Tests ZHEMM and ZSYMM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS, BETA, BLS DOUBLE PRECISION ERR, ERRMAX INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, $ NARGS, NC, NS LOGICAL CONJ, LEFT, NULL, RESET, SAME CHARACTER*1 SIDE, SIDES, UPLO, UPLOS CHARACTER*2 ICHS, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL ZHEMM, ZMAKE, ZMMCH, ZSYMM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHS/'LR'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 2: 3 ).EQ.'HE' * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 100 IM = 1, NIDIM M = IDIM( IM ) * DO 90 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 90 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 90 LBB = LDB*N * * Generate the matrix B. * CALL ZMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, $ ZERO ) * DO 80 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' * IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. 
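*     LDA was set to one more than the minimum leading dimension
*     when NMAX leaves room; if even the minimum exceeds NMAX the
*     combination is skipped.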
IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * * Generate the hermitian or symmetric matrix A. * CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', NA, NA, A, NMAX, $ AA, LDA, RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL ZMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, $ LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO MS = M NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE, $ UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC IF( REWI ) $ REWIND NTRA IF( CONJ )THEN CALL ZHEMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, $ BB, LDB, BETA, CC, LDC ) ELSE CALL ZSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, $ BB, LDB, BETA, CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 110 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LZE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LZE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB ISAME( 10 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 11 ) = LZE( CS, CC, LCC ) ELSE ISAME( 11 ) = LZERES( 'GE', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 110 END IF * IF( .NOT.NULL )THEN * * Check the result. * IF( LEFT )THEN CALL ZMMCH( 'N', 'N', M, N, M, ALPHA, A, $ NMAX, B, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL ZMMCH( 'N', 'N', M, N, N, ALPHA, B, $ NMAX, A, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 120 * 110 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA, $ LDB, BETA, LDC * 120 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, $ ',', F4.1, '), C,', I3, ') .' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK2. * END SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, $ B, BB, BS, CT, G, C ) * * Tests ZTRMM and ZTRSM. * * Auxiliary routine for test program for Level 3 Blas. 
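*  ZTRSM results are checked by multiplying the computed solution back by op( A ) and comparing with alpha*B, so no matrix inverse is formed explicitly.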
* * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CT( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS DOUBLE PRECISION ERR, ERRMAX INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, $ NS LOGICAL LEFT, NULL, RESET, SAME CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, $ UPLOS CHARACTER*2 ICHD, ICHS, ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL ZMAKE, ZMMCH, ZTRMM, ZTRSM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ * .. Executable Statements .. * NARGS = 11 NC = 0 RESET = .TRUE. ERRMAX = RZERO * Set up zero matrix for ZMMCH. DO 20 J = 1, NMAX DO 10 I = 1, NMAX C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE * DO 140 IM = 1, NIDIM M = IDIM( IM ) * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 130 LBB = LDB*N NULL = M.LE.0.OR.N.LE.0 * DO 120 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 130 LAA = LDA*NA * DO 110 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * DO 100 ICT = 1, 3 TRANSA = ICHT( ICT: ICT ) * DO 90 ICD = 1, 2 DIAG = ICHD( ICD: ICD ) * DO 80 IA = 1, NALF ALPHA = ALF( IA ) * * Generate the matrix A. * CALL ZMAKE( 'TR', UPLO, DIAG, NA, NA, A, $ NMAX, AA, LDA, RESET, ZERO ) * * Generate the matrix B. * CALL ZMAKE( 'GE', ' ', ' ', M, N, B, NMAX, $ BB, LDB, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO TRANAS = TRANSA DIAGS = DIAG MS = M NS = N ALS = ALPHA DO 30 I = 1, LAA AS( I ) = AA( I ) 30 CONTINUE LDAS = LDA DO 40 I = 1, LBB BS( I ) = BB( I ) 40 CONTINUE LDBS = LDB * * Call the subroutine. * IF( SNAME( 4: 5 ).EQ.'MM' )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB IF( REWI ) $ REWIND NTRA CALL ZTRMM( SIDE, UPLO, TRANSA, DIAG, M, $ N, ALPHA, AA, LDA, BB, LDB ) ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB IF( REWI ) $ REWIND NTRA CALL ZTRSM( SIDE, UPLO, TRANSA, DIAG, M, $ N, ALPHA, AA, LDA, BB, LDB ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. 
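*                 B is the result matrix for ZTRMM and ZTRSM, so unless the call was trivial only the elements of BB outside the M by N result are checked for being unchanged.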
* ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = TRANAS.EQ.TRANSA ISAME( 4 ) = DIAGS.EQ.DIAG ISAME( 5 ) = MS.EQ.M ISAME( 6 ) = NS.EQ.N ISAME( 7 ) = ALS.EQ.ALPHA ISAME( 8 ) = LZE( AS, AA, LAA ) ISAME( 9 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 10 ) = LZE( BS, BB, LBB ) ELSE ISAME( 10 ) = LZERES( 'GE', ' ', M, N, BS, $ BB, LDB ) END IF ISAME( 11 ) = LDBS.EQ.LDB * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 50 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 50 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN IF( SNAME( 4: 5 ).EQ.'MM' )THEN * * Check the result. * IF( LEFT )THEN CALL ZMMCH( TRANSA, 'N', M, N, M, $ ALPHA, A, NMAX, B, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL ZMMCH( 'N', TRANSA, M, N, N, $ ALPHA, B, NMAX, A, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN * * Compute approximation to original * matrix. * DO 70 J = 1, N DO 60 I = 1, M C( I, J ) = BB( I + ( J - 1 )* $ LDB ) BB( I + ( J - 1 )*LDB ) = ALPHA* $ B( I, J ) 60 CONTINUE 70 CONTINUE * IF( LEFT )THEN CALL ZMMCH( TRANSA, 'N', M, N, M, $ ONE, A, NMAX, C, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) ELSE CALL ZMMCH( 'N', TRANSA, M, N, N, $ ONE, C, NMAX, A, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) END IF END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 150 END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * 140 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 160 * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M, $ N, ALPHA, LDA, LDB * 160 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A6, '(', 4( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', $ ' .' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK3. * END SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) * * Tests ZHERK and ZSYRK. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) DOUBLE PRECISION RONE, RZERO PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. 
COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS, BETA, BETS DOUBLE PRECISION ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, $ NARGS, NC, NS LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS CHARACTER*2 ICHT, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL ZHERK, ZMAKE, ZMMCH, ZSYRK * .. Intrinsic Functions .. INTRINSIC DCMPLX, MAX, DBLE * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHT/'NC'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 2: 3 ).EQ.'HE' * NARGS = 10 NC = 0 RESET = .TRUE. ERRMAX = RZERO RALS = RONE RBETS = RONE * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICT = 1, 2 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'C' IF( TRAN.AND..NOT.CONJ ) $ TRANS = 'T' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. * CALL ZMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 60 IA = 1, NALF ALPHA = ALF( IA ) IF( CONJ )THEN RALPHA = DBLE( ALPHA ) ALPHA = DCMPLX( RALPHA, RZERO ) END IF * DO 50 IB = 1, NBET BETA = BET( IB ) IF( CONJ )THEN RBETA = DBLE( BETA ) BETA = DCMPLX( RBETA, RZERO ) END IF NULL = N.LE.0 IF( CONJ ) $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. $ RZERO ).AND.RBETA.EQ.RONE ) * * Generate the matrix C. * CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C, $ NMAX, CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K IF( CONJ )THEN RALS = RALPHA ELSE ALS = ALPHA END IF DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA IF( CONJ )THEN RBETS = RBETA ELSE BETS = BETA END IF DO 20 I = 1, LCC CS( I ) = CC( I ) 20 CONTINUE LDCS = LDC * * Call the subroutine. * IF( CONJ )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, $ TRANS, N, K, RALPHA, LDA, RBETA, LDC IF( REWI ) $ REWIND NTRA CALL ZHERK( UPLO, TRANS, N, K, RALPHA, AA, $ LDA, RBETA, CC, LDC ) ELSE IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, $ TRANS, N, K, ALPHA, LDA, BETA, LDC IF( REWI ) $ REWIND NTRA CALL ZSYRK( UPLO, TRANS, N, K, ALPHA, AA, $ LDA, BETA, CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. 
* ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K IF( CONJ )THEN ISAME( 5 ) = RALS.EQ.RALPHA ELSE ISAME( 5 ) = ALS.EQ.ALPHA END IF ISAME( 6 ) = LZE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA IF( CONJ )THEN ISAME( 8 ) = RBETS.EQ.RBETA ELSE ISAME( 8 ) = BETS.EQ.BETA END IF IF( NULL )THEN ISAME( 9 ) = LZE( CS, CC, LCC ) ELSE ISAME( 9 ) = LZERES( SNAME( 2: 3 ), UPLO, N, $ N, CS, CC, LDC ) END IF ISAME( 10 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 30 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 30 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( CONJ )THEN TRANST = 'C' ELSE TRANST = 'T' END IF JC = 1 DO 40 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN CALL ZMMCH( TRANST, 'N', LJ, 1, K, $ ALPHA, A( 1, JJ ), NMAX, $ A( 1, J ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL ZMMCH( 'N', TRANST, LJ, 1, K, $ ALPHA, A( JJ, 1 ), NMAX, $ A( J, 1 ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 40 CONTINUE END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 110 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( CONJ )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, RALPHA, $ LDA, RBETA, LDC ELSE WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, $ LDA, BETA, LDC END IF * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', $ ' .' ) 9993 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, $ '), C,', I3, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK4. * END SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) * * Tests ZHER2K and ZSYR2K. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RONE, RZERO PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) * .. Scalar Arguments .. 
DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*6 SNAME * .. Array Arguments .. COMPLEX*16 AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ W( 2*NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS, BETA, BETS DOUBLE PRECISION ERR, ERRMAX, RBETA, RBETS INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS CHARACTER*2 ICHT, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL ZHER2K, ZMAKE, ZMMCH, ZSYR2K * .. Intrinsic Functions .. INTRINSIC DCMPLX, DCONJG, MAX, DBLE * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHT/'NC'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 2: 3 ).EQ.'HE' * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 130 LCC = LDC*N * DO 120 IK = 1, NIDIM K = IDIM( IK ) * DO 110 ICT = 1, 2 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'C' IF( TRAN.AND..NOT.CONJ ) $ TRANS = 'T' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 110 LAA = LDA*NA * * Generate the matrix A. * IF( TRAN )THEN CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, $ LDA, RESET, ZERO ) ELSE CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, $ RESET, ZERO ) END IF * * Generate the matrix B. * LDB = LDA LBB = LAA IF( TRAN )THEN CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), $ 2*NMAX, BB, LDB, RESET, ZERO ) ELSE CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), $ NMAX, BB, LDB, RESET, ZERO ) END IF * DO 100 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 90 IA = 1, NALF ALPHA = ALF( IA ) * DO 80 IB = 1, NBET BETA = BET( IB ) IF( CONJ )THEN RBETA = DBLE( BETA ) BETA = DCMPLX( RBETA, RZERO ) END IF NULL = N.LE.0 IF( CONJ ) $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. $ ZERO ).AND.RBETA.EQ.RONE ) * * Generate the matrix C. * CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C, $ NMAX, CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB IF( CONJ )THEN RBETS = RBETA ELSE BETS = BETA END IF DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( CONJ )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, $ TRANS, N, K, ALPHA, LDA, LDB, RBETA, LDC IF( REWI ) $ REWIND NTRA CALL ZHER2K( UPLO, TRANS, N, K, ALPHA, AA, $ LDA, BB, LDB, RBETA, CC, LDC ) ELSE IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC IF( REWI ) $ REWIND NTRA CALL ZSYR2K( UPLO, TRANS, N, K, ALPHA, AA, $ LDA, BB, LDB, BETA, CC, LDC ) END IF * * Check if error-exit was taken incorrectly. 
* IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LZE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LZE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB IF( CONJ )THEN ISAME( 10 ) = RBETS.EQ.RBETA ELSE ISAME( 10 ) = BETS.EQ.BETA END IF IF( NULL )THEN ISAME( 11 ) = LZE( CS, CC, LCC ) ELSE ISAME( 11 ) = LZERES( 'HE', UPLO, N, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( CONJ )THEN TRANST = 'C' ELSE TRANST = 'T' END IF JJAB = 1 JC = 1 DO 70 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN DO 50 I = 1, K W( I ) = ALPHA*AB( ( J - 1 )*2* $ NMAX + K + I ) IF( CONJ )THEN W( K + I ) = DCONJG( ALPHA )* $ AB( ( J - 1 )*2* $ NMAX + I ) ELSE W( K + I ) = ALPHA* $ AB( ( J - 1 )*2* $ NMAX + I ) END IF 50 CONTINUE CALL ZMMCH( TRANST, 'N', LJ, 1, 2*K, $ ONE, AB( JJAB ), 2*NMAX, W, $ 2*NMAX, BETA, C( JJ, J ), $ NMAX, CT, G, CC( JC ), LDC, $ EPS, ERR, FATAL, NOUT, $ .TRUE. ) ELSE DO 60 I = 1, K IF( CONJ )THEN W( I ) = ALPHA*DCONJG( AB( ( K + $ I - 1 )*NMAX + J ) ) W( K + I ) = DCONJG( ALPHA* $ AB( ( I - 1 )*NMAX + $ J ) ) ELSE W( I ) = ALPHA*AB( ( K + I - 1 )* $ NMAX + J ) W( K + I ) = ALPHA* $ AB( ( I - 1 )*NMAX + $ J ) END IF 60 CONTINUE CALL ZMMCH( 'N', 'N', LJ, 1, 2*K, ONE, $ AB( JJ ), NMAX, W, 2*NMAX, $ BETA, C( JJ, J ), NMAX, CT, $ G, CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 IF( TRAN ) $ JJAB = JJAB + 2*NMAX END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 140 70 CONTINUE END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 160 * 140 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( CONJ )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, $ LDA, LDB, RBETA, LDC ELSE WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, $ LDA, LDB, BETA, LDC END IF * 160 CONTINUE RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, $ ', C,', I3, ') .' ) 9993 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, $ ',', F4.1, '), C,', I3, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK5. 
* END SUBROUTINE ZCHKE( ISNUM, SRNAMT, NOUT ) * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. * ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Local Scalars .. COMPLEX*16 ALPHA, BETA DOUBLE PRECISION RALPHA, RBETA * .. Local Arrays .. COMPLEX*16 A( 2, 1 ), B( 2, 1 ), C( 2, 1 ) * .. External Subroutines .. EXTERNAL ZGEMM, ZHEMM, ZHER2K, ZHERK, CHKXER, ZSYMM, $ ZSYR2K, ZSYRK, ZTRMM, ZTRSM * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Executable Statements .. * OK is set to .FALSE. by the special version of XERBLA or by CHKXER * if anything is wrong. OK = .TRUE. * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, $ 90 )ISNUM 10 INFOT = 1 CALL ZGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 1 CALL ZGEMM( '/', 'C', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 1 CALL ZGEMM( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZGEMM( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZGEMM( 'C', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZGEMM( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZGEMM( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZGEMM( 'N', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZGEMM( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZGEMM( 'C', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZGEMM( 'C', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZGEMM( 'C', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZGEMM( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZGEMM( 'T', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZGEMM( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZGEMM( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZGEMM( 'N', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZGEMM( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZGEMM( 'C', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZGEMM( 'C', 
'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZGEMM( 'C', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZGEMM( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZGEMM( 'T', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZGEMM( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZGEMM( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZGEMM( 'N', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZGEMM( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZGEMM( 'C', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZGEMM( 'C', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZGEMM( 'C', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZGEMM( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZGEMM( 'T', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZGEMM( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZGEMM( 'N', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZGEMM( 'C', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZGEMM( 'C', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZGEMM( 'C', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZGEMM( 'T', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZGEMM( 'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGEMM( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGEMM( 'C', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGEMM( 'N', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGEMM( 'C', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGEMM( 'T', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGEMM( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL 
ZGEMM( 'C', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGEMM( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL ZGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL ZGEMM( 'N', 'C', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL ZGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL ZGEMM( 'C', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL ZGEMM( 'C', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL ZGEMM( 'C', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL ZGEMM( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL ZGEMM( 'T', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL ZGEMM( 'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 20 INFOT = 1 CALL ZHEMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZHEMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHEMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHEMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHEMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHEMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHEMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHEMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHEMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHEMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHEMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 
2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 30 INFOT = 1 CALL ZSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 40 INFOT = 1 CALL ZTRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZTRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZTRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZTRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 
CALL ZTRMM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'R', 'L', 'N', 
'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 50 INFOT = 1 CALL ZTRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZTRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZTRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZTRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'L', 'U', 'N', 'N', 0, -1, 
ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( 
SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 60 INFOT = 1 CALL ZHERK( '/', 'N', 0, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZHERK( 'U', 'T', 0, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHERK( 'U', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHERK( 'U', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHERK( 'L', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHERK( 'L', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHERK( 'U', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHERK( 'U', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHERK( 'L', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHERK( 'L', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHERK( 'U', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHERK( 'U', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHERK( 'L', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHERK( 'L', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZHERK( 'U', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZHERK( 'U', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZHERK( 'L', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZHERK( 'L', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 70 INFOT = 1 CALL ZSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZSYRK( 'U', 'C', 0, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYRK( 'L', 'T', 0, -1, 
ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 80 INFOT = 1 CALL ZHER2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZHER2K( 'U', 'T', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHER2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHER2K( 'U', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHER2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHER2K( 'L', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHER2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHER2K( 'U', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHER2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHER2K( 'L', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHER2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHER2K( 'U', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHER2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHER2K( 'L', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZHER2K( 'U', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZHER2K( 'L', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZHER2K( 'U', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZHER2K( 'L', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 
100 90 INFOT = 1 CALL ZSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZSYR2K( 'U', 'C', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYR2K( 'L', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYR2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYR2K( 'L', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) * 100 IF( OK )THEN WRITE( NOUT, FMT = 9999 )SRNAMT ELSE WRITE( NOUT, FMT = 9998 )SRNAMT END IF RETURN * 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', $ '**' ) * * End of ZCHKE. * END SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, $ TRANSL ) * * Generates values for an M by N matrix A. * Stores the values in the array AA in the data structure required * by the routine, with unwanted elements set to rogue value. * * TYPE is 'GE', 'HE', 'SY' or 'TR'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. 
COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) COMPLEX*16 ROGUE PARAMETER ( ROGUE = ( -1.0D10, 1.0D10 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) DOUBLE PRECISION RROGUE PARAMETER ( RROGUE = -1.0D10 ) * .. Scalar Arguments .. COMPLEX*16 TRANSL INTEGER LDA, M, N, NMAX LOGICAL RESET CHARACTER*1 DIAG, UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX*16 A( NMAX, * ), AA( * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J, JJ LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER * .. External Functions .. COMPLEX*16 ZBEG EXTERNAL ZBEG * .. Intrinsic Functions .. INTRINSIC DCMPLX, DCONJG, DBLE * .. Executable Statements .. GEN = TYPE.EQ.'GE' HER = TYPE.EQ.'HE' SYM = TYPE.EQ.'SY' TRI = TYPE.EQ.'TR' UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' UNIT = TRI.AND.DIAG.EQ.'U' * * Generate data in array A. * DO 20 J = 1, N DO 10 I = 1, M IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) $ THEN A( I, J ) = ZBEG( RESET ) + TRANSL IF( I.NE.J )THEN * Set some elements to zero IF( N.GT.3.AND.J.EQ.N/2 ) $ A( I, J ) = ZERO IF( HER )THEN A( J, I ) = DCONJG( A( I, J ) ) ELSE IF( SYM )THEN A( J, I ) = A( I, J ) ELSE IF( TRI )THEN A( J, I ) = ZERO END IF END IF END IF 10 CONTINUE IF( HER ) $ A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO ) IF( TRI ) $ A( J, J ) = A( J, J ) + ONE IF( UNIT ) $ A( J, J ) = ONE 20 CONTINUE * * Store elements in array AS in data structure required by routine. * IF( TYPE.EQ.'GE' )THEN DO 50 J = 1, N DO 30 I = 1, M AA( I + ( J - 1 )*LDA ) = A( I, J ) 30 CONTINUE DO 40 I = M + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 40 CONTINUE 50 CONTINUE ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN DO 90 J = 1, N IF( UPPER )THEN IBEG = 1 IF( UNIT )THEN IEND = J - 1 ELSE IEND = J END IF ELSE IF( UNIT )THEN IBEG = J + 1 ELSE IBEG = J END IF IEND = N END IF DO 60 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 60 CONTINUE DO 70 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I, J ) 70 CONTINUE DO 80 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 80 CONTINUE IF( HER )THEN JJ = J + ( J - 1 )*LDA AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) END IF 90 CONTINUE END IF RETURN * * End of ZMAKE. * END SUBROUTINE ZMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, $ NOUT, MV ) * * Checks the results of the computational tests. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO, RONE PARAMETER ( RZERO = 0.0D0, RONE = 1.0D0 ) * .. Scalar Arguments .. COMPLEX*16 ALPHA, BETA DOUBLE PRECISION EPS, ERR INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT LOGICAL FATAL, MV CHARACTER*1 TRANSA, TRANSB * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ), $ CC( LDCC, * ), CT( * ) DOUBLE PRECISION G( * ) * .. Local Scalars .. COMPLEX*16 CL DOUBLE PRECISION ERRI INTEGER I, J, K LOGICAL CTRANA, CTRANB, TRANA, TRANB * .. Intrinsic Functions .. INTRINSIC ABS, DIMAG, DCONJG, MAX, DBLE, SQRT * .. Statement Functions .. DOUBLE PRECISION ABS1 * .. Statement Function definitions .. ABS1( CL ) = ABS( DBLE( CL ) ) + ABS( DIMAG( CL ) ) * .. Executable Statements .. 
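*     The expected result is accumulated in-line, one column of C at
*     a time, in CT, together with a gauge vector G bounding the
*     magnitude of the terms that contribute to each element.  An
*     element of the computed result CC is accepted when it differs
*     from CT by less than roughly G*SQRT( EPS ); otherwise the
*     result is reported as less than half accurate and FATAL is set.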
TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' CTRANA = TRANSA.EQ.'C' CTRANB = TRANSB.EQ.'C' * * Compute expected result, one column at a time, in CT using data * in A, B and C. * Compute gauges in G. * DO 220 J = 1, N * DO 10 I = 1, M CT( I ) = ZERO G( I ) = RZERO 10 CONTINUE IF( .NOT.TRANA.AND..NOT.TRANB )THEN DO 30 K = 1, KK DO 20 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( K, J ) G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRANA.AND..NOT.TRANB )THEN IF( CTRANA )THEN DO 50 K = 1, KK DO 40 I = 1, M CT( I ) = CT( I ) + DCONJG( A( K, I ) )*B( K, J ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( K, J ) ) 40 CONTINUE 50 CONTINUE ELSE DO 70 K = 1, KK DO 60 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( K, J ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( K, J ) ) 60 CONTINUE 70 CONTINUE END IF ELSE IF( .NOT.TRANA.AND.TRANB )THEN IF( CTRANB )THEN DO 90 K = 1, KK DO 80 I = 1, M CT( I ) = CT( I ) + A( I, K )*DCONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( I, K ) )* $ ABS1( B( J, K ) ) 80 CONTINUE 90 CONTINUE ELSE DO 110 K = 1, KK DO 100 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( J, K ) G( I ) = G( I ) + ABS1( A( I, K ) )* $ ABS1( B( J, K ) ) 100 CONTINUE 110 CONTINUE END IF ELSE IF( TRANA.AND.TRANB )THEN IF( CTRANA )THEN IF( CTRANB )THEN DO 130 K = 1, KK DO 120 I = 1, M CT( I ) = CT( I ) + DCONJG( A( K, I ) )* $ DCONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 120 CONTINUE 130 CONTINUE ELSE DO 150 K = 1, KK DO 140 I = 1, M CT( I ) = CT( I ) + DCONJG( A( K, I ) )* $ B( J, K ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 140 CONTINUE 150 CONTINUE END IF ELSE IF( CTRANB )THEN DO 170 K = 1, KK DO 160 I = 1, M CT( I ) = CT( I ) + A( K, I )* $ DCONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 160 CONTINUE 170 CONTINUE ELSE DO 190 K = 1, KK DO 180 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( J, K ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 180 CONTINUE 190 CONTINUE END IF END IF END IF DO 200 I = 1, M CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) G( I ) = ABS1( ALPHA )*G( I ) + $ ABS1( BETA )*ABS1( C( I, J ) ) 200 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO DO 210 I = 1, M ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS IF( G( I ).NE.RZERO ) $ ERRI = ERRI/G( I ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ GO TO 230 210 CONTINUE * 220 CONTINUE * * If the loop completes, all results are at least half accurate. GO TO 250 * * Report fatal error. * 230 FATAL = .TRUE. WRITE( NOUT, FMT = 9999 ) DO 240 I = 1, M IF( MV )THEN WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) ELSE WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) END IF 240 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9997 )J * 250 CONTINUE RETURN * 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', $ 'F ACCURATE *******', /' EXPECTED RE', $ 'SULT COMPUTED RESULT' ) 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) * * End of ZMMCH. * END LOGICAL FUNCTION LZE( RI, RJ, LR ) * * Tests if two arrays are identical. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LR * .. Array Arguments .. COMPLEX*16 RI( * ), RJ( * ) * .. Local Scalars .. INTEGER I * .. 
Executable Statements .. DO 10 I = 1, LR IF( RI( I ).NE.RJ( I ) ) $ GO TO 20 10 CONTINUE LZE = .TRUE. GO TO 30 20 CONTINUE LZE = .FALSE. 30 RETURN * * End of LZE. * END LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA ) * * Tests if selected elements in two arrays are equal. * * TYPE is 'GE' or 'HE' or 'SY'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LDA, M, N CHARACTER*1 UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX*16 AA( LDA, * ), AS( LDA, * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL UPPER * .. Executable Statements .. UPPER = UPLO.EQ.'U' IF( TYPE.EQ.'GE' )THEN DO 20 J = 1, N DO 10 I = M + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 10 CONTINUE 20 CONTINUE ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY' )THEN DO 50 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 30 I = 1, IBEG - 1 IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 30 CONTINUE DO 40 I = IEND + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 40 CONTINUE 50 CONTINUE END IF * 60 CONTINUE LZERES = .TRUE. GO TO 80 70 CONTINUE LZERES = .FALSE. 80 RETURN * * End of LZERES. * END COMPLEX*16 FUNCTION ZBEG( RESET ) * * Generates complex numbers as pairs of random numbers uniformly * distributed between -0.5 and 0.5. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. LOGICAL RESET * .. Local Scalars .. INTEGER I, IC, J, MI, MJ * .. Save statement .. SAVE I, IC, J, MI, MJ * .. Intrinsic Functions .. INTRINSIC DCMPLX * .. Executable Statements .. IF( RESET )THEN * Initialize local variables. MI = 891 MJ = 457 I = 7 J = 7 IC = 0 RESET = .FALSE. END IF * * The sequence of values of I or J is bounded between 1 and 999. * If initial I or J = 1,2,3,6,7 or 9, the period will be 50. * If initial I or J = 4 or 8, the period will be 25. * If initial I or J = 5, the period will be 10. * IC is used to break up the period by skipping 1 value of I or J * in 6. * IC = IC + 1 10 I = I*MI J = J*MJ I = I - 1000*( I/1000 ) J = J - 1000*( J/1000 ) IF( IC.GE.5 )THEN IC = 0 GO TO 10 END IF ZBEG = DCMPLX( ( I - 500 )/1001.0D0, ( J - 500 )/1001.0D0 ) RETURN * * End of ZBEG. * END DOUBLE PRECISION FUNCTION DDIFF( X, Y ) * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. DOUBLE PRECISION X, Y * .. Executable Statements .. DDIFF = X - Y RETURN * * End of DDIFF. * END SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) * * Tests whether XERBLA has detected an error when it should. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER INFOT, NOUT LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Executable Statements .. 
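*     The stub XERBLA sets LERR to .TRUE. whenever it is called.  If
*     LERR is still .FALSE. here, the deliberately invalid call was
*     not trapped, so the failure is reported and OK is cleared.
*     LERR is then reset ready for the next test.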
IF( .NOT.LERR )THEN WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT OK = .FALSE. END IF LERR = .FALSE. RETURN * 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', $ 'ETECTED BY ', A6, ' *****' ) * * End of CHKXER. * END SUBROUTINE XERBLA( SRNAME, INFO ) * * This is a special version of XERBLA to be used only as part of * the test program for testing error exits from the Level 3 BLAS * routines. * * XERBLA is an error handler for the Level 3 BLAS routines. * * It is called by the Level 3 BLAS routines if an input parameter is * invalid. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER INFO CHARACTER*6 SRNAME * .. Scalars in Common .. INTEGER INFOT, NOUT LOGICAL LERR, OK CHARACTER*6 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUT, OK, LERR COMMON /SRNAMC/SRNAMT * .. Executable Statements .. LERR = .TRUE. IF( INFO.NE.INFOT )THEN IF( INFOT.NE.0 )THEN WRITE( NOUT, FMT = 9999 )INFO, INFOT ELSE WRITE( NOUT, FMT = 9997 )INFO END IF OK = .FALSE. END IF IF( SRNAME.NE.SRNAMT )THEN WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT OK = .FALSE. END IF RETURN * 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', $ ' OF ', I2, ' *******' ) 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', $ 'AD OF ', A6, ' *******' ) 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, $ ' *******' ) * * End of XERBLA * END OpenBLAS-0.2.20/test/zblat3_3m.dat000066400000000000000000000020421313527062700164110ustar00rootroot00000000000000'ZBLAT3_3M.SUMM' NAME OF SUMMARY OUTPUT FILE 6 UNIT NUMBER OF SUMMARY FILE 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. F LOGICAL FLAG, T TO STOP ON FAILURES. F LOGICAL FLAG, T TO TEST ERROR EXITS. 16.0 THRESHOLD VALUE OF TEST RATIO 6 NUMBER OF VALUES OF N 0 1 2 3 7 31 63 VALUES OF N 3 NUMBER OF VALUES OF ALPHA (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA 3 NUMBER OF VALUES OF BETA (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA ZGEMM3M T PUT F FOR NO TEST. SAME COLUMNS. ZHEMM F PUT F FOR NO TEST. SAME COLUMNS. ZSYMM F PUT F FOR NO TEST. SAME COLUMNS. ZTRMM F PUT F FOR NO TEST. SAME COLUMNS. ZTRSM F PUT F FOR NO TEST. SAME COLUMNS. ZHERK F PUT F FOR NO TEST. SAME COLUMNS. ZSYRK F PUT F FOR NO TEST. SAME COLUMNS. ZHER2K F PUT F FOR NO TEST. SAME COLUMNS. ZSYR2K F PUT F FOR NO TEST. SAME COLUMNS. OpenBLAS-0.2.20/test/zblat3_3m.f000066400000000000000000003773051313527062700161070ustar00rootroot00000000000000 PROGRAM ZBLAT3 * * Test program for the COMPLEX*16 Level 3 Blas. * * The program must be driven by a short data file. The first 14 records * of the file are read using list-directed input, the last 9 records * are read using the format ( A8, L2 ). An annotated example of a data * file can be obtained by deleting the first 3 characters from the * following 23 lines: * 'ZBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE * 6 UNIT NUMBER OF SUMMARY FILE * 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. * F LOGICAL FLAG, T TO STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. 
* 16.0 THRESHOLD VALUE OF TEST RATIO * 6 NUMBER OF VALUES OF N * 0 1 2 3 5 9 VALUES OF N * 3 NUMBER OF VALUES OF ALPHA * (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA * 3 NUMBER OF VALUES OF BETA * (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA * ZGEMM3M T PUT F FOR NO TEST. SAME COLUMNS. * ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. * ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. * ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. * ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. * ZHERK T PUT F FOR NO TEST. SAME COLUMNS. * ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. * ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. * ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. * * See: * * Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. * A Set of Level 3 Basic Linear Algebra Subprograms. * * Technical Memorandum No.88 (Revision 1), Mathematics and * Computer Science Division, Argonne National Laboratory, 9700 * South Cass Avenue, Argonne, Illinois 60439, US. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 9 ) COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO, RHALF, RONE PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. DOUBLE PRECISION EPS, ERR, THRESH INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, $ TSTERR CHARACTER*1 TRANSA, TRANSB CHARACTER*8 SNAMET CHARACTER*32 SNAPS, SUMMRY * .. Local Arrays .. COMPLEX*16 AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), $ ALF( NALMAX ), AS( NMAX*NMAX ), $ BB( NMAX*NMAX ), BET( NBEMAX ), $ BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ W( 2*NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDMAX ) LOGICAL LTEST( NSUBS ) CHARACTER*8 SNAMES( NSUBS ) * .. External Functions .. DOUBLE PRECISION DDIFF LOGICAL LZE EXTERNAL DDIFF, LZE * .. External Subroutines .. EXTERNAL ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5, ZCHKE, ZMMCH * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK CHARACTER*8 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR COMMON /SRNAMC/SRNAMT * .. Data statements .. DATA SNAMES/'ZGEMM3M ', 'ZHEMM ', 'ZSYMM ', $ 'ZTRMM ', $ 'ZTRSM ', 'ZHERK ', 'ZSYRK ', 'ZHER2K', $ 'ZSYR2K'/ * .. Executable Statements .. * * Read name and unit number for summary output file and open file. * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. * READ( NIN, FMT = * )SNAPS READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI REWI = REWI.AND.TRACE * Read the flag that directs stopping on any failure. READ( NIN, FMT = * )SFATAL * Read the flag that indicates whether error exits are to be tested. READ( NIN, FMT = * )TSTERR * Read the threshold value of the test ratio READ( NIN, FMT = * )THRESH * * Read and check the parameter values for the tests. 
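*     The number of N values must lie between 1 and NIDMAX and each
*     value between 0 and NMAX; the counts of ALPHA and BETA values
*     are checked against NALMAX and NBEMAX.  If any check fails,
*     the data file must be amended and the tests are abandoned.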
* * Values of N READ( NIN, FMT = * )NIDIM IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN WRITE( NOUT, FMT = 9997 )'N', NIDMAX GO TO 220 END IF READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) DO 10 I = 1, NIDIM IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN WRITE( NOUT, FMT = 9996 )NMAX GO TO 220 END IF 10 CONTINUE * Values of ALPHA READ( NIN, FMT = * )NALF IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX GO TO 220 END IF READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) * Values of BETA READ( NIN, FMT = * )NBET IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX GO TO 220 END IF READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) * * Report values of parameters. * WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) IF( .NOT.TSTERR )THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9984 ) END IF WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )THRESH WRITE( NOUT, FMT = * ) * * Read names of subroutines and flags which indicate * whether they are to be tested. * DO 20 I = 1, NSUBS LTEST( I ) = .FALSE. 20 CONTINUE 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT DO 40 I = 1, NSUBS IF( SNAMET.EQ.SNAMES( I ) ) $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET STOP 50 LTEST( I ) = LTESTT GO TO 30 * 60 CONTINUE CLOSE ( NIN ) * * Compute EPS (the machine precision). * EPS = RONE 70 CONTINUE IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) $ GO TO 80 EPS = RHALF*EPS GO TO 70 80 CONTINUE EPS = EPS + EPS WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of ZMMCH using exact data. * N = MIN( 32, NMAX ) DO 100 J = 1, N DO 90 I = 1, N AB( I, J ) = MAX( I - J + 1, 0 ) 90 CONTINUE AB( J, NMAX + 1 ) = J AB( 1, NMAX + J ) = J C( J, 1 ) = ZERO 100 CONTINUE DO 110 J = 1, N CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 110 CONTINUE * CC holds the exact result. On exit from ZMMCH CT holds * the result computed by ZMMCH. TRANSA = 'N' TRANSB = 'N' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 AB( 1, NMAX + J ) = N - J + 1 120 CONTINUE DO 130 J = 1, N CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - $ ( ( J + 1 )*J*( J - 1 ) )/3 130 CONTINUE TRANSA = 'C' TRANSB = 'N' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR STOP END IF * * Test each subroutine in turn. * DO 200 ISNUM = 1, NSUBS WRITE( NOUT, FMT = * ) IF( .NOT.LTEST( ISNUM ) )THEN * Subprogram is not to be tested. 
WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) ELSE SRNAMT = SNAMES( ISNUM ) * Test error exits. IF( TSTERR )THEN CALL ZCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) WRITE( NOUT, FMT = * ) END IF * Test computations. INFOT = 0 OK = .TRUE. FATAL = .FALSE. GO TO ( 140, 150, 150, 160, 160, 170, 170, $ 180, 180 )ISNUM * Test ZGEMM3M, 01. 140 CALL ZCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G ) GO TO 190 * Test ZHEMM, 02, ZSYMM, 03. 150 CALL ZCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G ) GO TO 190 * Test ZTRMM, 04, ZTRSM, 05. 160 CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C ) GO TO 190 * Test ZHERK, 06, ZSYRK, 07. 170 CALL ZCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, $ CC, CS, CT, G ) GO TO 190 * Test ZHER2K, 08, ZSYR2K, 09. 180 CALL ZCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) GO TO 190 * 190 IF( FATAL.AND.SFATAL ) $ GO TO 210 END IF 200 CONTINUE WRITE( NOUT, FMT = 9986 ) GO TO 230 * 210 CONTINUE WRITE( NOUT, FMT = 9985 ) GO TO 230 * 220 CONTINUE WRITE( NOUT, FMT = 9991 ) * 230 CONTINUE IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) STOP * 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', $ 'S THAN', F8.2 ) 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 ) 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', $ 'THAN ', I2 ) 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) 9995 FORMAT( ' TESTS OF THE COMPLEX*16 LEVEL 3 BLAS', //' THE F', $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) 9994 FORMAT( ' FOR N ', 9I6 ) 9993 FORMAT( ' FOR ALPHA ', $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) 9992 FORMAT( ' FOR BETA ', $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', $ /' ******* TESTS ABANDONED *******' ) 9990 FORMAT( ' SUBPROGRAM NAME ', A8, ' NOT RECOGNIZED', /' ******* T', $ 'ESTS ABANDONED *******' ) 9989 FORMAT( ' ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', $ 'ATED WRONGLY.', /' ZMMCH WAS CALLED WITH TRANSA = ', A1, $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', $ '*******' ) 9988 FORMAT( A8, L2 ) 9987 FORMAT( 1X, A8, ' WAS NOT TESTED' ) 9986 FORMAT( /' END OF TESTS' ) 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) * * End of ZBLAT3. * END SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) * * Tests ZGEMM3M. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. 
COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*8 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS, BETA, BLS DOUBLE PRECISION ERR, ERRMAX INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, $ MA, MB, MS, N, NA, NARGS, NB, NC, NS LOGICAL NULL, RESET, SAME, TRANA, TRANB CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB CHARACTER*3 ICH * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL ZGEMM3M, ZMAKE, ZMMCH * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICH/'NTC'/ * .. Executable Statements .. * NARGS = 13 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 110 IM = 1, NIDIM M = IDIM( IM ) * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICA = 1, 3 TRANSA = ICH( ICA: ICA ) TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' * IF( TRANA )THEN MA = K NA = M ELSE MA = M NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. * CALL ZMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICB = 1, 3 TRANSB = ICH( ICB: ICB ) TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' * IF( TRANB )THEN MB = N NB = K ELSE MB = K NB = N END IF * Set LDB to 1 more than minimum value if room. LDB = MB IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 70 LBB = LDB*NB * * Generate the matrix B. * CALL ZMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, $ LDB, RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL ZMAKE( 'GE', ' ', ' ', M, N, C, NMAX, $ CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * TRANAS = TRANSA TRANBS = TRANSB MS = M NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB, $ BETA, LDC IF( REWI ) $ REWIND NTRA CALL ZGEMM3M( TRANSA, TRANSB, M, N, K, ALPHA, $ AA, LDA, BB, LDB, BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. 
* ISAME( 1 ) = TRANSA.EQ.TRANAS ISAME( 2 ) = TRANSB.EQ.TRANBS ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = KS.EQ.K ISAME( 6 ) = ALS.EQ.ALPHA ISAME( 7 ) = LZE( AS, AA, LAA ) ISAME( 8 ) = LDAS.EQ.LDA ISAME( 9 ) = LZE( BS, BB, LBB ) ISAME( 10 ) = LDBS.EQ.LDB ISAME( 11 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 12 ) = LZE( CS, CC, LCC ) ELSE ISAME( 12 ) = LZERES( 'GE', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 13 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report * and return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result. * CALL ZMMCH( TRANSA, TRANSB, M, N, K, $ ALPHA, A, NMAX, B, NMAX, BETA, $ C, NMAX, CT, G, CC, LDC, EPS, $ ERR, FATAL, NOUT, .TRUE. ) ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 120 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K, $ ALPHA, LDA, LDB, BETA, LDC * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A8, '(''', A1, ''',''', A1, ''',', $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK1. * END SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) * * Tests ZHEMM and ZSYMM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*8 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS, BETA, BLS DOUBLE PRECISION ERR, ERRMAX INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, $ NARGS, NC, NS LOGICAL CONJ, LEFT, NULL, RESET, SAME CHARACTER*1 SIDE, SIDES, UPLO, UPLOS CHARACTER*2 ICHS, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL ZHEMM, ZMAKE, ZMMCH, ZSYMM * .. Intrinsic Functions .. 
INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHS/'LR'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 2: 3 ).EQ.'HE' * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 100 IM = 1, NIDIM M = IDIM( IM ) * DO 90 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = M IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 90 LCC = LDC*N NULL = N.LE.0.OR.M.LE.0 * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 90 LBB = LDB*N * * Generate the matrix B. * CALL ZMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, $ ZERO ) * DO 80 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' * IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * * Generate the hermitian or symmetric matrix A. * CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', NA, NA, A, NMAX, $ AA, LDA, RESET, ZERO ) * DO 60 IA = 1, NALF ALPHA = ALF( IA ) * DO 50 IB = 1, NBET BETA = BET( IB ) * * Generate the matrix C. * CALL ZMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, $ LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO MS = M NS = N ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB BLS = BETA DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE, $ UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC IF( REWI ) $ REWIND NTRA IF( CONJ )THEN CALL ZHEMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, $ BB, LDB, BETA, CC, LDC ) ELSE CALL ZSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, $ BB, LDB, BETA, CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 110 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = MS.EQ.M ISAME( 4 ) = NS.EQ.N ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LZE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LZE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB ISAME( 10 ) = BLS.EQ.BETA IF( NULL )THEN ISAME( 11 ) = LZE( CS, CC, LCC ) ELSE ISAME( 11 ) = LZERES( 'GE', ' ', M, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 110 END IF * IF( .NOT.NULL )THEN * * Check the result. * IF( LEFT )THEN CALL ZMMCH( 'N', 'N', M, N, M, ALPHA, A, $ NMAX, B, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL ZMMCH( 'N', 'N', M, N, N, ALPHA, B, $ NMAX, A, NMAX, BETA, C, NMAX, $ CT, G, CC, LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 120 * 110 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA, $ LDB, BETA, LDC * 120 CONTINUE RETURN * 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, $ ',', F4.1, '), C,', I3, ') .' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK2. * END SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, $ B, BB, BS, CT, G, C ) * * Tests ZTRMM and ZTRSM. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*8 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CT( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS DOUBLE PRECISION ERR, ERRMAX INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, $ NS LOGICAL LEFT, NULL, RESET, SAME CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, $ UPLOS CHARACTER*2 ICHD, ICHS, ICHU CHARACTER*3 ICHT * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL ZMAKE, ZMMCH, ZTRMM, ZTRSM * .. Intrinsic Functions .. INTRINSIC MAX * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ * .. Executable Statements .. * NARGS = 11 NC = 0 RESET = .TRUE. ERRMAX = RZERO * Set up zero matrix for ZMMCH. DO 20 J = 1, NMAX DO 10 I = 1, NMAX C( I, J ) = ZERO 10 CONTINUE 20 CONTINUE * DO 140 IM = 1, NIDIM M = IDIM( IM ) * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDB to 1 more than minimum value if room. LDB = M IF( LDB.LT.NMAX ) $ LDB = LDB + 1 * Skip tests if not enough room. IF( LDB.GT.NMAX ) $ GO TO 130 LBB = LDB*N NULL = M.LE.0.OR.N.LE.0 * DO 120 ICS = 1, 2 SIDE = ICHS( ICS: ICS ) LEFT = SIDE.EQ.'L' IF( LEFT )THEN NA = M ELSE NA = N END IF * Set LDA to 1 more than minimum value if room. LDA = NA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. 
IF( LDA.GT.NMAX ) $ GO TO 130 LAA = LDA*NA * DO 110 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) * DO 100 ICT = 1, 3 TRANSA = ICHT( ICT: ICT ) * DO 90 ICD = 1, 2 DIAG = ICHD( ICD: ICD ) * DO 80 IA = 1, NALF ALPHA = ALF( IA ) * * Generate the matrix A. * CALL ZMAKE( 'TR', UPLO, DIAG, NA, NA, A, $ NMAX, AA, LDA, RESET, ZERO ) * * Generate the matrix B. * CALL ZMAKE( 'GE', ' ', ' ', M, N, B, NMAX, $ BB, LDB, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the * subroutine. * SIDES = SIDE UPLOS = UPLO TRANAS = TRANSA DIAGS = DIAG MS = M NS = N ALS = ALPHA DO 30 I = 1, LAA AS( I ) = AA( I ) 30 CONTINUE LDAS = LDA DO 40 I = 1, LBB BS( I ) = BB( I ) 40 CONTINUE LDBS = LDB * * Call the subroutine. * IF( SNAME( 4: 5 ).EQ.'MM' )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB IF( REWI ) $ REWIND NTRA CALL ZTRMM( SIDE, UPLO, TRANSA, DIAG, M, $ N, ALPHA, AA, LDA, BB, LDB ) ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9995 )NC, SNAME, $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, $ LDA, LDB IF( REWI ) $ REWIND NTRA CALL ZTRSM( SIDE, UPLO, TRANSA, DIAG, M, $ N, ALPHA, AA, LDA, BB, LDB ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9994 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = SIDES.EQ.SIDE ISAME( 2 ) = UPLOS.EQ.UPLO ISAME( 3 ) = TRANAS.EQ.TRANSA ISAME( 4 ) = DIAGS.EQ.DIAG ISAME( 5 ) = MS.EQ.M ISAME( 6 ) = NS.EQ.N ISAME( 7 ) = ALS.EQ.ALPHA ISAME( 8 ) = LZE( AS, AA, LAA ) ISAME( 9 ) = LDAS.EQ.LDA IF( NULL )THEN ISAME( 10 ) = LZE( BS, BB, LBB ) ELSE ISAME( 10 ) = LZERES( 'GE', ' ', M, N, BS, $ BB, LDB ) END IF ISAME( 11 ) = LDBS.EQ.LDB * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 50 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 50 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN IF( SNAME( 4: 5 ).EQ.'MM' )THEN * * Check the result. * IF( LEFT )THEN CALL ZMMCH( TRANSA, 'N', M, N, M, $ ALPHA, A, NMAX, B, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL ZMMCH( 'N', TRANSA, M, N, N, $ ALPHA, B, NMAX, A, NMAX, $ ZERO, C, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN * * Compute approximation to original * matrix. * DO 70 J = 1, N DO 60 I = 1, M C( I, J ) = BB( I + ( J - 1 )* $ LDB ) BB( I + ( J - 1 )*LDB ) = ALPHA* $ B( I, J ) 60 CONTINUE 70 CONTINUE * IF( LEFT )THEN CALL ZMMCH( TRANSA, 'N', M, N, M, $ ONE, A, NMAX, C, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) ELSE CALL ZMMCH( 'N', TRANSA, M, N, N, $ ONE, C, NMAX, A, NMAX, $ ZERO, B, NMAX, CT, G, $ BB, LDB, EPS, ERR, $ FATAL, NOUT, .FALSE. ) END IF END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 150 END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * 140 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 160 * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M, $ N, ALPHA, LDA, LDB * 160 CONTINUE RETURN * 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( 1X, I6, ': ', A8, '(', 4( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', $ ' .' ) 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK3. * END SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) * * Tests ZHERK and ZSYRK. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) DOUBLE PRECISION RONE, RZERO PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*8 SNAME * .. Array Arguments .. COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), $ AS( NMAX*NMAX ), B( NMAX, NMAX ), $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), $ C( NMAX, NMAX ), CC( NMAX*NMAX ), $ CS( NMAX*NMAX ), CT( NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS, BETA, BETS DOUBLE PRECISION ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, $ NARGS, NC, NS LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS CHARACTER*2 ICHT, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL ZHERK, ZMAKE, ZMMCH, ZSYRK * .. Intrinsic Functions .. INTRINSIC DCMPLX, MAX, DBLE * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHT/'NC'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 2: 3 ).EQ.'HE' * NARGS = 10 NC = 0 RESET = .TRUE. ERRMAX = RZERO RALS = RONE RBETS = RONE * DO 100 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. IF( LDC.GT.NMAX ) $ GO TO 100 LCC = LDC*N * DO 90 IK = 1, NIDIM K = IDIM( IK ) * DO 80 ICT = 1, 2 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'C' IF( TRAN.AND..NOT.CONJ ) $ TRANS = 'T' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 80 LAA = LDA*NA * * Generate the matrix A. 
* CALL ZMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, $ RESET, ZERO ) * DO 70 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 60 IA = 1, NALF ALPHA = ALF( IA ) IF( CONJ )THEN RALPHA = DBLE( ALPHA ) ALPHA = DCMPLX( RALPHA, RZERO ) END IF * DO 50 IB = 1, NBET BETA = BET( IB ) IF( CONJ )THEN RBETA = DBLE( BETA ) BETA = DCMPLX( RBETA, RZERO ) END IF NULL = N.LE.0 IF( CONJ ) $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. $ RZERO ).AND.RBETA.EQ.RONE ) * * Generate the matrix C. * CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C, $ NMAX, CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K IF( CONJ )THEN RALS = RALPHA ELSE ALS = ALPHA END IF DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA IF( CONJ )THEN RBETS = RBETA ELSE BETS = BETA END IF DO 20 I = 1, LCC CS( I ) = CC( I ) 20 CONTINUE LDCS = LDC * * Call the subroutine. * IF( CONJ )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, $ TRANS, N, K, RALPHA, LDA, RBETA, LDC IF( REWI ) $ REWIND NTRA CALL ZHERK( UPLO, TRANS, N, K, RALPHA, AA, $ LDA, RBETA, CC, LDC ) ELSE IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, $ TRANS, N, K, ALPHA, LDA, BETA, LDC IF( REWI ) $ REWIND NTRA CALL ZSYRK( UPLO, TRANS, N, K, ALPHA, AA, $ LDA, BETA, CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 120 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K IF( CONJ )THEN ISAME( 5 ) = RALS.EQ.RALPHA ELSE ISAME( 5 ) = ALS.EQ.ALPHA END IF ISAME( 6 ) = LZE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA IF( CONJ )THEN ISAME( 8 ) = RBETS.EQ.RBETA ELSE ISAME( 8 ) = BETS.EQ.BETA END IF IF( NULL )THEN ISAME( 9 ) = LZE( CS, CC, LCC ) ELSE ISAME( 9 ) = LZERES( SNAME( 2: 3 ), UPLO, N, $ N, CS, CC, LDC ) END IF ISAME( 10 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 30 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 30 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 120 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( CONJ )THEN TRANST = 'C' ELSE TRANST = 'T' END IF JC = 1 DO 40 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN CALL ZMMCH( TRANST, 'N', LJ, 1, K, $ ALPHA, A( 1, JJ ), NMAX, $ A( 1, J ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) ELSE CALL ZMMCH( 'N', TRANST, LJ, 1, K, $ ALPHA, A( JJ, 1 ), NMAX, $ A( J, 1 ), NMAX, BETA, $ C( JJ, J ), NMAX, CT, G, $ CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 110 40 CONTINUE END IF * 50 CONTINUE * 60 CONTINUE * 70 CONTINUE * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * * Report result. 
* IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 130 * 110 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( CONJ )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, RALPHA, $ LDA, RBETA, LDC ELSE WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, $ LDA, BETA, LDC END IF * 130 CONTINUE RETURN * 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', $ ' .' ) 9993 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, $ '), C,', I3, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK4. * END SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) * * Tests ZHER2K and ZSYR2K. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RONE, RZERO PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) * .. Scalar Arguments .. DOUBLE PRECISION EPS, THRESH INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA LOGICAL FATAL, REWI, TRACE CHARACTER*8 SNAME * .. Array Arguments .. COMPLEX*16 AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), $ W( 2*NMAX ) DOUBLE PRECISION G( NMAX ) INTEGER IDIM( NIDIM ) * .. Local Scalars .. COMPLEX*16 ALPHA, ALS, BETA, BETS DOUBLE PRECISION ERR, ERRMAX, RBETA, RBETS INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS CHARACTER*2 ICHT, ICHU * .. Local Arrays .. LOGICAL ISAME( 13 ) * .. External Functions .. LOGICAL LZE, LZERES EXTERNAL LZE, LZERES * .. External Subroutines .. EXTERNAL ZHER2K, ZMAKE, ZMMCH, ZSYR2K * .. Intrinsic Functions .. INTRINSIC DCMPLX, DCONJG, MAX, DBLE * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Data statements .. DATA ICHT/'NC'/, ICHU/'UL'/ * .. Executable Statements .. CONJ = SNAME( 2: 3 ).EQ.'HE' * NARGS = 12 NC = 0 RESET = .TRUE. ERRMAX = RZERO * DO 130 IN = 1, NIDIM N = IDIM( IN ) * Set LDC to 1 more than minimum value if room. LDC = N IF( LDC.LT.NMAX ) $ LDC = LDC + 1 * Skip tests if not enough room. 
IF( LDC.GT.NMAX ) $ GO TO 130 LCC = LDC*N * DO 120 IK = 1, NIDIM K = IDIM( IK ) * DO 110 ICT = 1, 2 TRANS = ICHT( ICT: ICT ) TRAN = TRANS.EQ.'C' IF( TRAN.AND..NOT.CONJ ) $ TRANS = 'T' IF( TRAN )THEN MA = K NA = N ELSE MA = N NA = K END IF * Set LDA to 1 more than minimum value if room. LDA = MA IF( LDA.LT.NMAX ) $ LDA = LDA + 1 * Skip tests if not enough room. IF( LDA.GT.NMAX ) $ GO TO 110 LAA = LDA*NA * * Generate the matrix A. * IF( TRAN )THEN CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, $ LDA, RESET, ZERO ) ELSE CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, $ RESET, ZERO ) END IF * * Generate the matrix B. * LDB = LDA LBB = LAA IF( TRAN )THEN CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), $ 2*NMAX, BB, LDB, RESET, ZERO ) ELSE CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), $ NMAX, BB, LDB, RESET, ZERO ) END IF * DO 100 ICU = 1, 2 UPLO = ICHU( ICU: ICU ) UPPER = UPLO.EQ.'U' * DO 90 IA = 1, NALF ALPHA = ALF( IA ) * DO 80 IB = 1, NBET BETA = BET( IB ) IF( CONJ )THEN RBETA = DBLE( BETA ) BETA = DCMPLX( RBETA, RZERO ) END IF NULL = N.LE.0 IF( CONJ ) $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. $ ZERO ).AND.RBETA.EQ.RONE ) * * Generate the matrix C. * CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C, $ NMAX, CC, LDC, RESET, ZERO ) * NC = NC + 1 * * Save every datum before calling the subroutine. * UPLOS = UPLO TRANSS = TRANS NS = N KS = K ALS = ALPHA DO 10 I = 1, LAA AS( I ) = AA( I ) 10 CONTINUE LDAS = LDA DO 20 I = 1, LBB BS( I ) = BB( I ) 20 CONTINUE LDBS = LDB IF( CONJ )THEN RBETS = RBETA ELSE BETS = BETA END IF DO 30 I = 1, LCC CS( I ) = CC( I ) 30 CONTINUE LDCS = LDC * * Call the subroutine. * IF( CONJ )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, $ TRANS, N, K, ALPHA, LDA, LDB, RBETA, LDC IF( REWI ) $ REWIND NTRA CALL ZHER2K( UPLO, TRANS, N, K, ALPHA, AA, $ LDA, BB, LDB, RBETA, CC, LDC ) ELSE IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC IF( REWI ) $ REWIND NTRA CALL ZSYR2K( UPLO, TRANS, N, K, ALPHA, AA, $ LDA, BB, LDB, BETA, CC, LDC ) END IF * * Check if error-exit was taken incorrectly. * IF( .NOT.OK )THEN WRITE( NOUT, FMT = 9992 ) FATAL = .TRUE. GO TO 150 END IF * * See what data changed inside subroutines. * ISAME( 1 ) = UPLOS.EQ.UPLO ISAME( 2 ) = TRANSS.EQ.TRANS ISAME( 3 ) = NS.EQ.N ISAME( 4 ) = KS.EQ.K ISAME( 5 ) = ALS.EQ.ALPHA ISAME( 6 ) = LZE( AS, AA, LAA ) ISAME( 7 ) = LDAS.EQ.LDA ISAME( 8 ) = LZE( BS, BB, LBB ) ISAME( 9 ) = LDBS.EQ.LDB IF( CONJ )THEN ISAME( 10 ) = RBETS.EQ.RBETA ELSE ISAME( 10 ) = BETS.EQ.BETA END IF IF( NULL )THEN ISAME( 11 ) = LZE( CS, CC, LCC ) ELSE ISAME( 11 ) = LZERES( 'HE', UPLO, N, N, CS, $ CC, LDC ) END IF ISAME( 12 ) = LDCS.EQ.LDC * * If data was incorrectly changed, report and * return. * SAME = .TRUE. DO 40 I = 1, NARGS SAME = SAME.AND.ISAME( I ) IF( .NOT.ISAME( I ) ) $ WRITE( NOUT, FMT = 9998 )I 40 CONTINUE IF( .NOT.SAME )THEN FATAL = .TRUE. GO TO 150 END IF * IF( .NOT.NULL )THEN * * Check the result column by column. * IF( CONJ )THEN TRANST = 'C' ELSE TRANST = 'T' END IF JJAB = 1 JC = 1 DO 70 J = 1, N IF( UPPER )THEN JJ = 1 LJ = J ELSE JJ = J LJ = N - J + 1 END IF IF( TRAN )THEN DO 50 I = 1, K W( I ) = ALPHA*AB( ( J - 1 )*2* $ NMAX + K + I ) IF( CONJ )THEN W( K + I ) = DCONJG( ALPHA )* $ AB( ( J - 1 )*2* $ NMAX + I ) ELSE W( K + I ) = ALPHA* $ AB( ( J - 1 )*2* $ NMAX + I ) END IF 50 CONTINUE CALL ZMMCH( TRANST, 'N', LJ, 1, 2*K, $ ONE, AB( JJAB ), 2*NMAX, W, $ 2*NMAX, BETA, C( JJ, J ), $ NMAX, CT, G, CC( JC ), LDC, $ EPS, ERR, FATAL, NOUT, $ .TRUE. 
) ELSE DO 60 I = 1, K IF( CONJ )THEN W( I ) = ALPHA*DCONJG( AB( ( K + $ I - 1 )*NMAX + J ) ) W( K + I ) = DCONJG( ALPHA* $ AB( ( I - 1 )*NMAX + $ J ) ) ELSE W( I ) = ALPHA*AB( ( K + I - 1 )* $ NMAX + J ) W( K + I ) = ALPHA* $ AB( ( I - 1 )*NMAX + $ J ) END IF 60 CONTINUE CALL ZMMCH( 'N', 'N', LJ, 1, 2*K, ONE, $ AB( JJ ), NMAX, W, 2*NMAX, $ BETA, C( JJ, J ), NMAX, CT, $ G, CC( JC ), LDC, EPS, ERR, $ FATAL, NOUT, .TRUE. ) END IF IF( UPPER )THEN JC = JC + LDC ELSE JC = JC + LDC + 1 IF( TRAN ) $ JJAB = JJAB + 2*NMAX END IF ERRMAX = MAX( ERRMAX, ERR ) * If got really bad answer, report and * return. IF( FATAL ) $ GO TO 140 70 CONTINUE END IF * 80 CONTINUE * 90 CONTINUE * 100 CONTINUE * 110 CONTINUE * 120 CONTINUE * 130 CONTINUE * * Report result. * IF( ERRMAX.LT.THRESH )THEN WRITE( NOUT, FMT = 9999 )SNAME, NC ELSE WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX END IF GO TO 160 * 140 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9995 )J * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME IF( CONJ )THEN WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, $ LDA, LDB, RBETA, LDC ELSE WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, $ LDA, LDB, BETA, LDC END IF * 160 CONTINUE RETURN * 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', $ 'S)' ) 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', $ 'ANGED INCORRECTLY *******' ) 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, $ ' - SUSPECT *******' ) 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' ) 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) 9994 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, $ ', C,', I3, ') .' ) 9993 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, $ ',', F4.1, '), C,', I3, ') .' ) 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', $ '******' ) * * End of ZCHK5. * END SUBROUTINE ZCHKE( ISNUM, SRNAMT, NOUT ) * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. * ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*8 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK * .. Local Scalars .. COMPLEX*16 ALPHA, BETA DOUBLE PRECISION RALPHA, RBETA * .. Local Arrays .. COMPLEX*16 A( 2, 1 ), B( 2, 1 ), C( 2, 1 ) * .. External Subroutines .. EXTERNAL ZGEMM3M, ZHEMM, ZHER2K, ZHERK, CHKXER, ZSYMM, $ ZSYR2K, ZSYRK, ZTRMM, ZTRSM * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Executable Statements .. * OK is set to .FALSE. by the special version of XERBLA or by CHKXER * if anything is wrong. OK = .TRUE. * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. 
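*     Branch to the error-exit tests for the routine selected by
*     ISNUM: 1 = ZGEMM3M, 2 = ZHEMM, 3 = ZSYMM, 4 = ZTRMM, 5 = ZTRSM,
*     6 = ZHERK, 7 = ZSYRK, 8 = ZHER2K, 9 = ZSYR2K.  Each block sets
*     INFOT to the position of the argument being made invalid and
*     calls CHKXER after every deliberately bad call.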
GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, $ 90 )ISNUM 10 INFOT = 1 CALL ZGEMM3M( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 1 CALL ZGEMM3M( '/', 'C', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 1 CALL ZGEMM3M( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZGEMM3M( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZGEMM3M( 'C', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZGEMM3M( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZGEMM3M( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZGEMM3M( 'N', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZGEMM3M( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZGEMM3M( 'C', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZGEMM3M( 'C', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZGEMM3M( 'C', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZGEMM3M( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZGEMM3M( 'T', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZGEMM3M( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZGEMM3M( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZGEMM3M( 'N', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZGEMM3M( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZGEMM3M( 'C', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZGEMM3M( 'C', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZGEMM3M( 'C', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZGEMM3M( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZGEMM3M( 'T', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZGEMM3M( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZGEMM3M( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZGEMM3M( 'N', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZGEMM3M( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZGEMM3M( 'C', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZGEMM3M( 'C', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, 
OK ) INFOT = 5 CALL ZGEMM3M( 'C', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZGEMM3M( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZGEMM3M( 'T', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZGEMM3M( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZGEMM3M( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZGEMM3M( 'N', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZGEMM3M( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZGEMM3M( 'C', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZGEMM3M( 'C', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZGEMM3M( 'C', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZGEMM3M( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZGEMM3M( 'T', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZGEMM3M( 'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGEMM3M( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGEMM3M( 'C', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGEMM3M( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGEMM3M( 'N', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGEMM3M( 'C', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGEMM3M( 'T', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGEMM3M( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGEMM3M( 'C', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGEMM3M( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL ZGEMM3M( 'N', 'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL ZGEMM3M( 'N', 'C', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL ZGEMM3M( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL ZGEMM3M( 'C', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL ZGEMM3M( 'C', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL ZGEMM3M( 'C', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL ZGEMM3M( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL ZGEMM3M( 'T', 'C', 2, 0, 0, 
ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 13 CALL ZGEMM3M( 'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 20 INFOT = 1 CALL ZHEMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZHEMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHEMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHEMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHEMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHEMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHEMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHEMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHEMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHEMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHEMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 30 INFOT = 1 CALL ZSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, 
INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 40 INFOT = 1 CALL ZTRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZTRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZTRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZTRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL 
CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRMM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, 
LERR, OK ) INFOT = 11 CALL ZTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 50 INFOT = 1 CALL ZTRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZTRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZTRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZTRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZTRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) 
INFOT = 6 CALL ZTRSM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 6 CALL ZTRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZTRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZTRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 60 INFOT = 1 CALL ZHERK( '/', 'N', 0, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZHERK( 'U', 'T', 0, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHERK( 'U', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHERK( 'U', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHERK( 'L', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHERK( 'L', 'C', 
-1, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHERK( 'U', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHERK( 'U', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHERK( 'L', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHERK( 'L', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHERK( 'U', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHERK( 'U', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHERK( 'L', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHERK( 'L', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZHERK( 'U', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZHERK( 'U', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZHERK( 'L', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZHERK( 'L', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 70 INFOT = 1 CALL ZSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZSYRK( 'U', 'C', 0, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYRK( 'L', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 80 INFOT = 1 CALL ZHER2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL 
ZHER2K( 'U', 'T', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHER2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHER2K( 'U', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHER2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZHER2K( 'L', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHER2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHER2K( 'U', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHER2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZHER2K( 'L', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHER2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHER2K( 'U', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHER2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZHER2K( 'L', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZHER2K( 'U', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZHER2K( 'L', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZHER2K( 'U', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZHER2K( 'L', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) GO TO 100 90 INFOT = 1 CALL ZSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 2 CALL ZSYR2K( 'U', 'C', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 3 CALL ZSYR2K( 'L', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYR2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 4 CALL ZSYR2K( 'L', 'T', 0, 
-1, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 7 CALL ZSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 CALL ZSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) * 100 IF( OK )THEN WRITE( NOUT, FMT = 9999 )SRNAMT ELSE WRITE( NOUT, FMT = 9998 )SRNAMT END IF RETURN * 9999 FORMAT( ' ', A8, ' PASSED THE TESTS OF ERROR-EXITS' ) 9998 FORMAT( ' ******* ', A8, ' FAILED THE TESTS OF ERROR-EXITS *****', $ '**' ) * * End of ZCHKE. * END SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, $ TRANSL ) * * Generates values for an M by N matrix A. * Stores the values in the array AA in the data structure required * by the routine, with unwanted elements set to rogue value. * * TYPE is 'GE', 'HE', 'SY' or 'TR'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) COMPLEX*16 ROGUE PARAMETER ( ROGUE = ( -1.0D10, 1.0D10 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) DOUBLE PRECISION RROGUE PARAMETER ( RROGUE = -1.0D10 ) * .. Scalar Arguments .. COMPLEX*16 TRANSL INTEGER LDA, M, N, NMAX LOGICAL RESET CHARACTER*1 DIAG, UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX*16 A( NMAX, * ), AA( * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J, JJ LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER * .. External Functions .. COMPLEX*16 ZBEG EXTERNAL ZBEG * .. Intrinsic Functions .. INTRINSIC DCMPLX, DCONJG, DBLE * .. Executable Statements .. GEN = TYPE.EQ.'GE' HER = TYPE.EQ.'HE' SYM = TYPE.EQ.'SY' TRI = TYPE.EQ.'TR' UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' UNIT = TRI.AND.DIAG.EQ.'U' * * Generate data in array A. 
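*     Only the referenced triangle (or the full matrix for 'GE') is
*     filled with values from ZBEG shifted by TRANSL; one interior
*     column is zeroed when N is greater than 3, the opposite triangle
*     is conjugated for 'HE', copied for 'SY' or zeroed for 'TR', and
*     the diagonal is forced real for 'HE', shifted by one for 'TR',
*     and set to one when DIAG = 'U'.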
* DO 20 J = 1, N DO 10 I = 1, M IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) $ THEN A( I, J ) = ZBEG( RESET ) + TRANSL IF( I.NE.J )THEN * Set some elements to zero IF( N.GT.3.AND.J.EQ.N/2 ) $ A( I, J ) = ZERO IF( HER )THEN A( J, I ) = DCONJG( A( I, J ) ) ELSE IF( SYM )THEN A( J, I ) = A( I, J ) ELSE IF( TRI )THEN A( J, I ) = ZERO END IF END IF END IF 10 CONTINUE IF( HER ) $ A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO ) IF( TRI ) $ A( J, J ) = A( J, J ) + ONE IF( UNIT ) $ A( J, J ) = ONE 20 CONTINUE * * Store elements in array AS in data structure required by routine. * IF( TYPE.EQ.'GE' )THEN DO 50 J = 1, N DO 30 I = 1, M AA( I + ( J - 1 )*LDA ) = A( I, J ) 30 CONTINUE DO 40 I = M + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 40 CONTINUE 50 CONTINUE ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN DO 90 J = 1, N IF( UPPER )THEN IBEG = 1 IF( UNIT )THEN IEND = J - 1 ELSE IEND = J END IF ELSE IF( UNIT )THEN IBEG = J + 1 ELSE IBEG = J END IF IEND = N END IF DO 60 I = 1, IBEG - 1 AA( I + ( J - 1 )*LDA ) = ROGUE 60 CONTINUE DO 70 I = IBEG, IEND AA( I + ( J - 1 )*LDA ) = A( I, J ) 70 CONTINUE DO 80 I = IEND + 1, LDA AA( I + ( J - 1 )*LDA ) = ROGUE 80 CONTINUE IF( HER )THEN JJ = J + ( J - 1 )*LDA AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) END IF 90 CONTINUE END IF RETURN * * End of ZMAKE. * END SUBROUTINE ZMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, $ NOUT, MV ) * * Checks the results of the computational tests. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Parameters .. COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO, RONE PARAMETER ( RZERO = 0.0D0, RONE = 1.0D0 ) * .. Scalar Arguments .. COMPLEX*16 ALPHA, BETA DOUBLE PRECISION EPS, ERR INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT LOGICAL FATAL, MV CHARACTER*1 TRANSA, TRANSB * .. Array Arguments .. COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ), $ CC( LDCC, * ), CT( * ) DOUBLE PRECISION G( * ) * .. Local Scalars .. COMPLEX*16 CL DOUBLE PRECISION ERRI INTEGER I, J, K LOGICAL CTRANA, CTRANB, TRANA, TRANB * .. Intrinsic Functions .. INTRINSIC ABS, DIMAG, DCONJG, MAX, DBLE, SQRT * .. Statement Functions .. DOUBLE PRECISION ABS1 * .. Statement Function definitions .. ABS1( CL ) = ABS( DBLE( CL ) ) + ABS( DIMAG( CL ) ) * .. Executable Statements .. TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' CTRANA = TRANSA.EQ.'C' CTRANB = TRANSB.EQ.'C' * * Compute expected result, one column at a time, in CT using data * in A, B and C. * Compute gauges in G. 
* DO 220 J = 1, N * DO 10 I = 1, M CT( I ) = ZERO G( I ) = RZERO 10 CONTINUE IF( .NOT.TRANA.AND..NOT.TRANB )THEN DO 30 K = 1, KK DO 20 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( K, J ) G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) 20 CONTINUE 30 CONTINUE ELSE IF( TRANA.AND..NOT.TRANB )THEN IF( CTRANA )THEN DO 50 K = 1, KK DO 40 I = 1, M CT( I ) = CT( I ) + DCONJG( A( K, I ) )*B( K, J ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( K, J ) ) 40 CONTINUE 50 CONTINUE ELSE DO 70 K = 1, KK DO 60 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( K, J ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( K, J ) ) 60 CONTINUE 70 CONTINUE END IF ELSE IF( .NOT.TRANA.AND.TRANB )THEN IF( CTRANB )THEN DO 90 K = 1, KK DO 80 I = 1, M CT( I ) = CT( I ) + A( I, K )*DCONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( I, K ) )* $ ABS1( B( J, K ) ) 80 CONTINUE 90 CONTINUE ELSE DO 110 K = 1, KK DO 100 I = 1, M CT( I ) = CT( I ) + A( I, K )*B( J, K ) G( I ) = G( I ) + ABS1( A( I, K ) )* $ ABS1( B( J, K ) ) 100 CONTINUE 110 CONTINUE END IF ELSE IF( TRANA.AND.TRANB )THEN IF( CTRANA )THEN IF( CTRANB )THEN DO 130 K = 1, KK DO 120 I = 1, M CT( I ) = CT( I ) + DCONJG( A( K, I ) )* $ DCONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 120 CONTINUE 130 CONTINUE ELSE DO 150 K = 1, KK DO 140 I = 1, M CT( I ) = CT( I ) + DCONJG( A( K, I ) )* $ B( J, K ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 140 CONTINUE 150 CONTINUE END IF ELSE IF( CTRANB )THEN DO 170 K = 1, KK DO 160 I = 1, M CT( I ) = CT( I ) + A( K, I )* $ DCONJG( B( J, K ) ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 160 CONTINUE 170 CONTINUE ELSE DO 190 K = 1, KK DO 180 I = 1, M CT( I ) = CT( I ) + A( K, I )*B( J, K ) G( I ) = G( I ) + ABS1( A( K, I ) )* $ ABS1( B( J, K ) ) 180 CONTINUE 190 CONTINUE END IF END IF END IF DO 200 I = 1, M CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) G( I ) = ABS1( ALPHA )*G( I ) + $ ABS1( BETA )*ABS1( C( I, J ) ) 200 CONTINUE * * Compute the error ratio for this result. * ERR = ZERO DO 210 I = 1, M ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS IF( G( I ).NE.RZERO ) $ ERRI = ERRI/G( I ) ERR = MAX( ERR, ERRI ) IF( ERR*SQRT( EPS ).GE.RONE ) $ GO TO 230 210 CONTINUE * 220 CONTINUE * * If the loop completes, all results are at least half accurate. GO TO 250 * * Report fatal error. * 230 FATAL = .TRUE. WRITE( NOUT, FMT = 9999 ) DO 240 I = 1, M IF( MV )THEN WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) ELSE WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) END IF 240 CONTINUE IF( N.GT.1 ) $ WRITE( NOUT, FMT = 9997 )J * 250 CONTINUE RETURN * 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', $ 'F ACCURATE *******', /' EXPECTED RE', $ 'SULT COMPUTED RESULT' ) 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) * * End of ZMMCH. * END LOGICAL FUNCTION LZE( RI, RJ, LR ) * * Tests if two arrays are identical. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LR * .. Array Arguments .. COMPLEX*16 RI( * ), RJ( * ) * .. Local Scalars .. INTEGER I * .. Executable Statements .. DO 10 I = 1, LR IF( RI( I ).NE.RJ( I ) ) $ GO TO 20 10 CONTINUE LZE = .TRUE. GO TO 30 20 CONTINUE LZE = .FALSE. 30 RETURN * * End of LZE. 
* END LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA ) * * Tests if selected elements in two arrays are equal. * * TYPE is 'GE' or 'HE' or 'SY'. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER LDA, M, N CHARACTER*1 UPLO CHARACTER*2 TYPE * .. Array Arguments .. COMPLEX*16 AA( LDA, * ), AS( LDA, * ) * .. Local Scalars .. INTEGER I, IBEG, IEND, J LOGICAL UPPER * .. Executable Statements .. UPPER = UPLO.EQ.'U' IF( TYPE.EQ.'GE' )THEN DO 20 J = 1, N DO 10 I = M + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 10 CONTINUE 20 CONTINUE ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY' )THEN DO 50 J = 1, N IF( UPPER )THEN IBEG = 1 IEND = J ELSE IBEG = J IEND = N END IF DO 30 I = 1, IBEG - 1 IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 30 CONTINUE DO 40 I = IEND + 1, LDA IF( AA( I, J ).NE.AS( I, J ) ) $ GO TO 70 40 CONTINUE 50 CONTINUE END IF * 60 CONTINUE LZERES = .TRUE. GO TO 80 70 CONTINUE LZERES = .FALSE. 80 RETURN * * End of LZERES. * END COMPLEX*16 FUNCTION ZBEG( RESET ) * * Generates complex numbers as pairs of random numbers uniformly * distributed between -0.5 and 0.5. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. LOGICAL RESET * .. Local Scalars .. INTEGER I, IC, J, MI, MJ * .. Save statement .. SAVE I, IC, J, MI, MJ * .. Intrinsic Functions .. INTRINSIC DCMPLX * .. Executable Statements .. IF( RESET )THEN * Initialize local variables. MI = 891 MJ = 457 I = 7 J = 7 IC = 0 RESET = .FALSE. END IF * * The sequence of values of I or J is bounded between 1 and 999. * If initial I or J = 1,2,3,6,7 or 9, the period will be 50. * If initial I or J = 4 or 8, the period will be 25. * If initial I or J = 5, the period will be 10. * IC is used to break up the period by skipping 1 value of I or J * in 6. * IC = IC + 1 10 I = I*MI J = J*MJ I = I - 1000*( I/1000 ) J = J - 1000*( J/1000 ) IF( IC.GE.5 )THEN IC = 0 GO TO 10 END IF ZBEG = DCMPLX( ( I - 500 )/1001.0D0, ( J - 500 )/1001.0D0 ) RETURN * * End of ZBEG. * END DOUBLE PRECISION FUNCTION DDIFF( X, Y ) * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. DOUBLE PRECISION X, Y * .. Executable Statements .. DDIFF = X - Y RETURN * * End of DDIFF. * END SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) * * Tests whether XERBLA has detected an error when it should. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER INFOT, NOUT LOGICAL LERR, OK CHARACTER*8 SRNAMT * .. Executable Statements .. IF( .NOT.LERR )THEN WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT OK = .FALSE. END IF LERR = .FALSE. 
RETURN * 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', $ 'ETECTED BY ', A8, ' *****' ) * * End of CHKXER. * END SUBROUTINE XERBLA( SRNAME, INFO ) * * This is a special version of XERBLA to be used only as part of * the test program for testing error exits from the Level 3 BLAS * routines. * * XERBLA is an error handler for the Level 3 BLAS routines. * * It is called by the Level 3 BLAS routines if an input parameter is * invalid. * * Auxiliary routine for test program for Level 3 Blas. * * -- Written on 8-February-1989. * Jack Dongarra, Argonne National Laboratory. * Iain Duff, AERE Harwell. * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * * .. Scalar Arguments .. INTEGER INFO CHARACTER*8 SRNAME * .. Scalars in Common .. INTEGER INFOT, NOUT LOGICAL LERR, OK CHARACTER*8 SRNAMT * .. Common blocks .. COMMON /INFOC/INFOT, NOUT, OK, LERR COMMON /SRNAMC/SRNAMT * .. Executable Statements .. LERR = .TRUE. IF( INFO.NE.INFOT )THEN IF( INFOT.NE.0 )THEN WRITE( NOUT, FMT = 9999 )INFO, INFOT ELSE WRITE( NOUT, FMT = 9997 )INFO END IF OK = .FALSE. END IF IF( SRNAME.NE.SRNAMT )THEN WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT OK = .FALSE. END IF RETURN * 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', $ ' OF ', I2, ' *******' ) 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A8, ' INSTE', $ 'AD OF ', A8, ' *******' ) 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, $ ' *******' ) * * End of XERBLA * END OpenBLAS-0.2.20/utest/000077500000000000000000000000001313527062700143105ustar00rootroot00000000000000OpenBLAS-0.2.20/utest/CMakeLists.txt000066400000000000000000000024231313527062700170510ustar00rootroot00000000000000include_directories(${PROJECT_SOURCE_DIR}) set(OpenBLAS_utest_src utest_main.c test_amax.c ) if (NOT NO_LAPACK) set(OpenBLAS_utest_src ${OpenBLAS_utest_src} test_potrs.c ) endif() set(OpenBLAS_utest_bin openblas_utest) add_executable(${OpenBLAS_utest_bin} ${OpenBLAS_utest_src}) target_link_libraries(${OpenBLAS_utest_bin} ${OpenBLAS_LIBNAME}) if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") target_link_libraries(${OpenBLAS_utest_bin} m) endif() if (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore") set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES COMPILE_DEFINITIONS "_CRT_SECURE_NO_WARNINGS") endif() #Set output for utest set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG ) set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${CMAKE_CURRENT_BINARY_DIR}) endforeach() if (MSVC) add_custom_command(TARGET ${OpenBLAS_utest_bin} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/lib/$/${OpenBLAS_LIBNAME}.dll ${CMAKE_CURRENT_BINARY_DIR}/. ) endif() add_test(${OpenBLAS_utest_bin} ${CMAKE_CURRENT_BINARY_DIR}/${OpenBLAS_utest_bin})OpenBLAS-0.2.20/utest/Makefile000066400000000000000000000010121313527062700157420ustar00rootroot00000000000000UTEST_CHECK = 1 TOPDIR = .. 
UTESTBIN=openblas_utest .PHONY : all .NOTPARALLEL : all run_test $(UTESTBIN) include $(TOPDIR)/Makefile.system OBJS=utest_main.o test_amax.o #test_rot.o test_swap.o test_axpy.o test_dotu.o test_rotmg.o test_dsdot.o test_fork.o ifneq ($(NO_LAPACK), 1) OBJS += test_potrs.o endif all : run_test $(UTESTBIN): $(OBJS) $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) run_test: $(UTESTBIN) ifndef CROSS ./$(UTESTBIN) endif clean: -rm -f *.o $(UTESTBIN) libs: OpenBLAS-0.2.20/utest/ctest.h000066400000000000000000000530401313527062700156050ustar00rootroot00000000000000/* Copyright 2011-2016 Bas van den Berg * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef CTEST_H #define CTEST_H #if defined _WIN32 || defined __CYGWIN__ #ifndef WIN32 #define WIN32 #endif #endif #ifndef WIN32 #define WEAK __attribute__ ((weak)) #else #define WEAK #endif #include /* intmax_t, uintmax_t, PRI* */ #include /* size_t */ typedef void (*SetupFunc)(void*); typedef void (*TearDownFunc)(void*); typedef void (*RunWithDataFunc)(void*); struct ctest { const char* ssname; // suite name const char* ttname; // test name void (*run)(); int skip; void* data; SetupFunc setup; TearDownFunc teardown; struct ctest *next; unsigned int magic; }; #define __FNAME(sname, tname) __ctest_##sname##_##tname##_run #define __TNAME(sname, tname) __ctest_##sname##_##tname #define __PNAME(sname, tname) __ctest_##sname##_##tname##_pointer #ifdef __APPLE__ #define __CTEST_APPLE #endif #ifdef __MINGW32__ #undef CTEST_SEGFAULT #endif #if defined(_WIN32) && defined(_MSC_VER) #define __CTEST_MSVC #endif //config for MSVC compiler #ifdef __CTEST_MSVC #define __CTEST_NO_TIME #define CTEST_NO_COLORS #ifndef CTEST_ADD_TESTS_MANUALLY #pragma section(".ctest$a") #pragma section(".ctest$u") #pragma section(".ctest$z") #endif //clear this flag for msvc #ifdef CTEST_SEGFAULT #undef CTEST_SEGFAULT #endif #if _MSC_VER < 1900 #define snprintf _snprintf_s #endif #ifndef __cplusplus #define inline __inline #endif #endif #ifdef CTEST_NO_JMP #define __CTEST_NO_JMP #endif #define __CTEST_MAGIC (0xdeadbeef) #ifdef CTEST_ADD_TESTS_MANUALLY # define __Test_Section #else #ifdef __CTEST_APPLE #define __Test_Section __attribute__ ((used, section ("__DATA, .ctest"))) #elif defined (__CTEST_MSVC) #define __Test_Section __declspec( allocate(".ctest$u")) #else #define __Test_Section __attribute__ ((used, section (".ctest"))) #endif #endif #ifndef __CTEST_MSVC #define __CTEST_STRUCT(sname, tname, _skip, __data, __setup, __teardown) \ static struct ctest __TNAME(sname, tname) = { \ .ssname=#sname, \ .ttname=#tname, \ .run = __FNAME(sname, tname), \ .skip = _skip, \ .data = __data, \ .setup = (SetupFunc)__setup, \ .teardown = (TearDownFunc)__teardown, \ .next = NULL, \ .magic = __CTEST_MAGIC}; \ static void * __PNAME(sname, tname)[2] __Test_Section = {(void*)& __TNAME(sname,tname), (void*)__CTEST_MAGIC}; #else //for msvc #define __CTEST_STRUCT(sname, tname, _skip, __data, __setup, __teardown) \ static struct ctest __TNAME(sname, tname) = { \ #sname, \ #tname, \ 
__FNAME(sname, tname), \ _skip, \ __data, \ (SetupFunc)__setup, \ (TearDownFunc)__teardown, \ NULL, \ __CTEST_MAGIC}; \ __Test_Section static void * __PNAME(sname, tname)[2]= {(void*)& __TNAME(sname,tname), (void *)__CTEST_MAGIC}; #endif #define CTEST_DATA(sname) struct sname##_data #define CTEST_SETUP(sname) \ void WEAK sname##_setup(struct sname##_data* data) #define CTEST_TEARDOWN(sname) \ void WEAK sname##_teardown(struct sname##_data* data) #define __CTEST_INTERNAL(sname, tname, _skip) \ void __FNAME(sname, tname)(); \ __CTEST_STRUCT(sname, tname, _skip, NULL, NULL, NULL) \ void __FNAME(sname, tname)() #ifdef __CTEST_APPLE #define SETUP_FNAME(sname) NULL #define TEARDOWN_FNAME(sname) NULL #else #define SETUP_FNAME(sname) sname##_setup #define TEARDOWN_FNAME(sname) sname##_teardown #endif #define __CTEST2_INTERNAL(sname, tname, _skip) \ static struct sname##_data __ctest_##sname##_data; \ CTEST_SETUP(sname); \ CTEST_TEARDOWN(sname); \ void __FNAME(sname, tname)(struct sname##_data* data); \ __CTEST_STRUCT(sname, tname, _skip, &__ctest_##sname##_data, SETUP_FNAME(sname), TEARDOWN_FNAME(sname)) \ void __FNAME(sname, tname)(struct sname##_data* data) void CTEST_LOG(const char* fmt, ...); void CTEST_ERR(const char* fmt, ...); // doesn't return #define CTEST(sname, tname) __CTEST_INTERNAL(sname, tname, 0) #define CTEST_SKIP(sname, tname) __CTEST_INTERNAL(sname, tname, 1) #define CTEST2(sname, tname) __CTEST2_INTERNAL(sname, tname, 0) #define CTEST2_SKIP(sname, tname) __CTEST2_INTERNAL(sname, tname, 1) #ifdef CTEST_ADD_TESTS_MANUALLY void __ctest_addTest(struct ctest *); #define CTEST_ADD(sname, tname) do { \ extern struct ctest __TNAME(sname, tname); \ __ctest_addTest(&__TNAME(sname, tname)); \ } while (0) #define CTEST_ADD2(sname, tname) do { \ extern struct ctest __TNAME(sname, tname); \ __ctest_addTest(&__TNAME(sname, tname)); \ } while (0) #endif // CTEST_ADD_TESTS_MANUALLY void assert_str(const char* exp, const char* real, const char* caller, int line); #define ASSERT_STR(exp, real) assert_str(exp, real, __FILE__, __LINE__) void assert_data(const unsigned char* exp, size_t expsize, const unsigned char* real, size_t realsize, const char* caller, int line); #define ASSERT_DATA(exp, expsize, real, realsize) \ assert_data(exp, expsize, real, realsize, __FILE__, __LINE__) void assert_equal(intmax_t exp, intmax_t real, const char* caller, int line); #define ASSERT_EQUAL(exp, real) assert_equal(exp, real, __FILE__, __LINE__) void assert_equal_u(uintmax_t exp, uintmax_t real, const char* caller, int line); #define ASSERT_EQUAL_U(exp, real) assert_equal_u(exp, real, __FILE__, __LINE__) void assert_not_equal(intmax_t exp, intmax_t real, const char* caller, int line); #define ASSERT_NOT_EQUAL(exp, real) assert_not_equal(exp, real, __FILE__, __LINE__) void assert_not_equal_u(uintmax_t exp, uintmax_t real, const char* caller, int line); #define ASSERT_NOT_EQUAL_U(exp, real) assert_not_equal_u(exp, real, __FILE__, __LINE__) void assert_interval(intmax_t exp1, intmax_t exp2, intmax_t real, const char* caller, int line); #define ASSERT_INTERVAL(exp1, exp2, real) assert_interval(exp1, exp2, real, __FILE__, __LINE__) void assert_null(void* real, const char* caller, int line); #define ASSERT_NULL(real) assert_null((void*)real, __FILE__, __LINE__) void assert_not_null(const void* real, const char* caller, int line); #define ASSERT_NOT_NULL(real) assert_not_null(real, __FILE__, __LINE__) void assert_true(int real, const char* caller, int line); #define ASSERT_TRUE(real) assert_true(real, __FILE__, 
__LINE__) void assert_false(int real, const char* caller, int line); #define ASSERT_FALSE(real) assert_false(real, __FILE__, __LINE__) void assert_fail(const char* caller, int line); #define ASSERT_FAIL() assert_fail(__FILE__, __LINE__) /* If longjmp() is not available, integer flag will be used instead of jmp_buf. * * __CTEST_SETJMP() will clear the flag and return zero, and __CTEST_LONGJMP() * will set the flag to its argument. __CTEST_ERROR_CODE() will return that flag. * * If longjmp() is available, jmp_buf will be used as usual and __CTEST_ERROR_CODE() * will always return zero. * * You can check both __CTEST_SETJMP() and __CTEST_ERROR_CODE() return value * to detect error in a portable way. */ #ifdef __CTEST_NO_JMP # define __CTEST_JMPBUF int # define __CTEST_ERROR_CODE(_var) (_var) # define __CTEST_SETJMP(_var) (_var = 0) # define __CTEST_LONGJMP(_var, _err) (_var = _err) #else // !__CTEST_NO_JMP # define __CTEST_JMPBUF jmp_buf # define __CTEST_ERROR_CODE(_var) (0) # define __CTEST_SETJMP(_var) setjmp(_var) # define __CTEST_LONGJMP(_var, _err) longjmp(_var, _err) #endif // __CTEST_NO_JMP void assert_dbl_near(double exp, double real, double tol, const char* caller, int line); #define ASSERT_DBL_NEAR(exp, real) assert_dbl_near(exp, real, 1e-4, __FILE__, __LINE__) #define ASSERT_DBL_NEAR_TOL(exp, real, tol) assert_dbl_near(exp, real, tol, __FILE__, __LINE__) void assert_dbl_far(double exp, double real, double tol, const char* caller, int line); #define ASSERT_DBL_FAR(exp, real) assert_dbl_far(exp, real, 1e-4, __FILE__, __LINE__) #define ASSERT_DBL_FAR_TOL(exp, real, tol) assert_dbl_far(exp, real, tol, __FILE__, __LINE__) #ifdef CTEST_MAIN #ifndef __CTEST_NO_JMP #include #endif #include #include #include #ifndef __CTEST_NO_TIME #include #endif #include #ifdef __CTEST_MSVC #include #else #include #endif #include #ifdef __CTEST_APPLE #include #endif static size_t ctest_errorsize; static char* ctest_errormsg; #define MSG_SIZE 4096 static char ctest_errorbuffer[MSG_SIZE]; static __CTEST_JMPBUF ctest_err; static int color_output = 1; static const char* suite_name; static const char* test_name; typedef int (*filter_func)(struct ctest*); #define ANSI_BLACK "\033[0;30m" #define ANSI_RED "\033[0;31m" #define ANSI_GREEN "\033[0;32m" #define ANSI_YELLOW "\033[0;33m" #define ANSI_BLUE "\033[0;34m" #define ANSI_MAGENTA "\033[0;35m" #define ANSI_CYAN "\033[0;36m" #define ANSI_GREY "\033[0;37m" #define ANSI_DARKGREY "\033[01;30m" #define ANSI_BRED "\033[01;31m" #define ANSI_BGREEN "\033[01;32m" #define ANSI_BYELLOW "\033[01;33m" #define ANSI_BBLUE "\033[01;34m" #define ANSI_BMAGENTA "\033[01;35m" #define ANSI_BCYAN "\033[01;36m" #define ANSI_WHITE "\033[01;37m" #define ANSI_NORMAL "\033[0m" #ifdef __CTEST_MSVC #ifndef CTEST_ADD_TESTS_MANUALLY __declspec(allocate(".ctest$a")) struct ctest * ctest_win_begin; __declspec(allocate(".ctest$z")) struct ctest * ctest_win_end; #endif #endif static CTEST(suite, test) { } #define __CTEST_POINTER_NEXT(_test) (struct ctest **)((struct ctest **)(_test) + 2) #define __CTEST_POINTER_PREV(_test) (struct ctest **)((struct ctest **)(_test) - 2) /* First element of test list. */ static struct ctest * * __ctest_head_p = (struct ctest **)__PNAME(suite, test); #ifdef CTEST_ADD_TESTS_MANUALLY /* Last element of test list. */ static struct ctest *__ctest_tail = &__TNAME(suite, test); /* Add test to linked list manually. 
*/ void __ctest_addTest(struct ctest *test) { __ctest_tail->next = test; __ctest_tail = test; } #else // !CTEST_ADD_TESTS_MANUALLY #ifndef __CTEST_MSVC /* Add all tests to linked list automatically. */ static void __ctest_linkTests() { struct ctest ** test; struct ctest ** ctest_begin = (struct ctest **)__PNAME(suite, test); struct ctest ** ctest_end = (struct ctest **)__PNAME(suite, test); // find begin and end of section by comparing magics while (1) { struct ctest** t = __CTEST_POINTER_PREV(ctest_begin); if (t[0] == NULL) break; if (t[1] != (struct ctest*)__CTEST_MAGIC) break; ctest_begin = t; } while (1) { struct ctest** t = __CTEST_POINTER_NEXT(ctest_end); if (t[0] == NULL) break; if (t[1] != (struct ctest*)__CTEST_MAGIC) break; ctest_end = t; } ctest_end = __CTEST_POINTER_NEXT(ctest_end); // end after last one for (test = ctest_begin; test != ctest_end; test = __CTEST_POINTER_NEXT(test)) { struct ctest ** next_p = __CTEST_POINTER_NEXT(test); struct ctest * next; if (next_p == ctest_end) next = NULL; else next = next_p[0]; (*test)->next = next; } __ctest_head_p = ctest_begin; } #else //for msvc static void __ctest_linkTests() { struct ctest ** ctest_start = __ctest_head_p; struct ctest ** test; struct ctest * cur=ctest_start[0]; for(test=&ctest_win_begin; test!=&ctest_win_end; test++){ //check if(test[1] == (struct ctest*)__CTEST_MAGIC){ //skip the start if((test[0]) == ctest_start[0]) continue; cur->next = test[0]; cur=cur->next; cur->next=NULL; } } } #endif #endif inline static void vprint_errormsg(const char* const fmt, va_list ap) { // (v)snprintf returns the number that would have been written const int ret = vsnprintf(ctest_errormsg, ctest_errorsize, fmt, ap); if (ret < 0) { ctest_errormsg[0] = 0x00; } else { const size_t size = (size_t) ret; const size_t s = (ctest_errorsize <= size ? size -ctest_errorsize : size); // ctest_errorsize may overflow at this point ctest_errorsize -= s; ctest_errormsg += s; } } inline static void print_errormsg(const char* const fmt, ...) { va_list argp; va_start(argp, fmt); vprint_errormsg(fmt, argp); va_end(argp); } static void msg_start(const char* color, const char* title) { if (color_output) { print_errormsg("%s", color); } print_errormsg(" %s: ", title); } static void msg_end() { if (color_output) { print_errormsg(ANSI_NORMAL); } print_errormsg("\n"); } void CTEST_LOG(const char* fmt, ...) { va_list argp; msg_start(ANSI_BLUE, "LOG"); va_start(argp, fmt); vprint_errormsg(fmt, argp); va_end(argp); msg_end(); } void CTEST_ERR(const char* fmt, ...) 
{ va_list argp; msg_start(ANSI_YELLOW, "ERR"); va_start(argp, fmt); vprint_errormsg(fmt, argp); va_end(argp); msg_end(); __CTEST_LONGJMP(ctest_err, 1); } void assert_str(const char* exp, const char* real, const char* caller, int line) { if ((exp == NULL && real != NULL) || (exp != NULL && real == NULL) || (exp && real && strcmp(exp, real) != 0)) { CTEST_ERR("%s:%d expected '%s', got '%s'", caller, line, exp, real); } } void assert_data(const unsigned char* exp, size_t expsize, const unsigned char* real, size_t realsize, const char* caller, int line) { size_t i; if (expsize != realsize) { CTEST_ERR("%s:%d expected %" PRIuMAX " bytes, got %" PRIuMAX, caller, line, (uintmax_t) expsize, (uintmax_t) realsize); } for (i=0; i exp2) { CTEST_ERR("%s:%d expected %" PRIdMAX "-%" PRIdMAX ", got %" PRIdMAX, caller, line, exp1, exp2, real); } } void assert_dbl_near(double exp, double real, double tol, const char* caller, int line) { double diff = exp - real; double absdiff = diff; /* avoid using fabs and linking with a math lib */ if(diff < 0) { absdiff *= -1; } if (absdiff > tol) { CTEST_ERR("%s:%d expected %0.3e, got %0.3e (diff %0.3e, tol %0.3e)", caller, line, exp, real, diff, tol); } } void assert_dbl_far(double exp, double real, double tol, const char* caller, int line) { double diff = exp - real; double absdiff = diff; /* avoid using fabs and linking with a math lib */ if(diff < 0) { absdiff *= -1; } if (absdiff <= tol) { CTEST_ERR("%s:%d expected %0.3e, got %0.3e (diff %0.3e, tol %0.3e)", caller, line, exp, real, diff, tol); } } void assert_null(void* real, const char* caller, int line) { if ((real) != NULL) { CTEST_ERR("%s:%d should be NULL", caller, line); } } void assert_not_null(const void* real, const char* caller, int line) { if (real == NULL) { CTEST_ERR("%s:%d should not be NULL", caller, line); } } void assert_true(int real, const char* caller, int line) { if ((real) == 0) { CTEST_ERR("%s:%d should be true", caller, line); } } void assert_false(int real, const char* caller, int line) { if ((real) != 0) { CTEST_ERR("%s:%d should be false", caller, line); } } void assert_fail(const char* caller, int line) { CTEST_ERR("%s:%d shouldn't come here", caller, line); } static int suite_all(struct ctest* t) { (void) t; // fix unused parameter warning return 1; } static int suite_filter(struct ctest* t) { return strncmp(suite_name, t->ssname, strlen(suite_name)) == 0; } static int suite_test_filter(struct ctest* t) { int suit_match, test_match; suit_match=(strncmp(suite_name, t->ssname, strlen(suite_name)) == 0); test_match=(strncmp(test_name, t->ttname, strlen(test_name)) == 0); return (suit_match & test_match); } #ifndef __CTEST_NO_TIME static uint64_t getCurrentTime() { struct timeval now; gettimeofday(&now, NULL); uint64_t now64 = (uint64_t) now.tv_sec; now64 *= 1000000; now64 += ((uint64_t) now.tv_usec); return now64; } #endif static void color_print(const char* color, const char* text) { if (color_output) printf("%s%s"ANSI_NORMAL"\n", color, text); else printf("%s\n", text); } #ifdef __CTEST_APPLE static void *find_symbol(struct ctest *test, const char *fname) { size_t len = strlen(test->ssname) + 1 + strlen(fname); char *symbol_name = (char *) malloc(len + 1); memset(symbol_name, 0, len + 1); snprintf(symbol_name, len + 1, "%s_%s", test->ssname, fname); //fprintf(stderr, ">>>> dlsym: loading %s\n", symbol_name); void *symbol = dlsym(RTLD_DEFAULT, symbol_name); if (!symbol) { //fprintf(stderr, ">>>> ERROR: %s\n", dlerror()); } // returns NULL on error free(symbol_name); return symbol; } 
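/*
 * Editor's note -- illustrative sketch only, not part of the original
 * ctest.h: the block below shows how the CTEST/CTEST2 macros, the
 * CTEST_DATA/CTEST_SETUP/CTEST_TEARDOWN fixture hooks and the assertion
 * helpers defined above are typically consumed.  The file name
 * "example_test.c" and the suite/test names ("demo", "integers", "near")
 * are hypothetical.  It is wrapped in "#if 0" so the header itself is
 * unaffected.
 */
#if 0
/* example_test.c -- define CTEST_MAIN in exactly one translation unit so
 * that ctest_main() and the runner implementation above are emitted. */
#define CTEST_MAIN
#define CTEST_SEGFAULT                         /* optional: report SIGSEGV */
#include "ctest.h"

CTEST_DATA(demo) { double tol; };              /* per-suite fixture struct */
CTEST_SETUP(demo)    { data->tol = 1e-6; }     /* runs before each CTEST2  */
CTEST_TEARDOWN(demo) { (void)data; }           /* runs after each CTEST2   */

CTEST(demo, integers) {                        /* plain test, no fixture   */
    ASSERT_EQUAL(4, 2 + 2);
}

CTEST2(demo, near) {                           /* fixture-based test       */
    ASSERT_DBL_NEAR_TOL(1.0, 1.0 + 1e-9, data->tol);
}

int main(int argc, const char *argv[])
{
    /* "./example_test"            runs every registered test,
     * "./example_test demo"       keeps only the "demo" suite,
     * "./example_test demo near"  keeps a single test (prefix match).    */
    return ctest_main(argc, argv);
}
#endif /* illustrative example */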
#endif #ifdef CTEST_SEGFAULT #include static void sighandler(int signum) { char msg[128]; snprintf(msg, sizeof(msg), "[SIGNAL %d: %s]", signum, strsignal(signum)); color_print(ANSI_BRED, msg); fflush(stdout); /* "Unregister" the signal handler and send the signal back to the process * so it can terminate as expected */ signal(signum, SIG_DFL); kill(getpid(), signum); } #endif int ctest_main(int argc, const char *argv[]) { static int total = 0; static int num_ok = 0; static int num_fail = 0; static int num_skip = 0; static int index = 1; static filter_func filter = suite_all; const char* color = (num_fail) ? ANSI_BRED : ANSI_GREEN; char results[80]; static struct ctest* test; #ifdef CTEST_SEGFAULT signal(SIGSEGV, sighandler); #endif if (argc == 2) { suite_name = argv[1]; filter = suite_filter; }else if (argc == 3) { suite_name = argv[1]; test_name = argv[2]; filter = suite_test_filter; } #ifdef CTEST_NO_COLORS color_output = 0; #else color_output = isatty(1); #endif #ifndef __CTEST_NO_TIME uint64_t t1 = getCurrentTime(); #endif #ifndef CTEST_ADD_TESTS_MANUALLY __ctest_linkTests(); #endif for (test = *(__ctest_head_p); test != NULL; test=test->next) { if (test == &__ctest_suite_test) continue; if (filter(test)) total++; } for (test = *(__ctest_head_p); test != NULL; test=test->next) { if (test == &__ctest_suite_test) continue; if (filter(test)) { ctest_errorbuffer[0] = 0; ctest_errorsize = MSG_SIZE-1; ctest_errormsg = ctest_errorbuffer; printf("TEST %d/%d %s:%s ", index, total, test->ssname, test->ttname); fflush(stdout); if (test->skip) { color_print(ANSI_BYELLOW, "[SKIPPED]"); num_skip++; } else { int result = __CTEST_SETJMP(ctest_err); if (result == 0) { #ifdef __CTEST_APPLE if (!test->setup) { test->setup = (SetupFunc) find_symbol(test, "setup"); } if (!test->teardown) { test->teardown = (TearDownFunc) find_symbol(test, "teardown"); } #endif if (test->setup) test->setup(test->data); if (test->data) ((RunWithDataFunc)test->run)(test->data); else test->run(); if (test->teardown) test->teardown(test->data); // if we got here it's ok #ifdef CTEST_COLOR_OK color_print(ANSI_BGREEN, "[OK]"); #else printf("[OK]\n"); #endif num_ok++; } else { color_print(ANSI_BRED, "[FAIL]"); num_fail++; } if (ctest_errorsize != MSG_SIZE-1) printf("%s", ctest_errorbuffer); } index++; } } #ifndef __CTEST_NO_TIME uint64_t t2 = getCurrentTime(); #endif #ifndef __CTEST_NO_TIME sprintf(results, "RESULTS: %d tests (%d ok, %d failed, %d skipped) ran in %"PRIu64" ms", total, num_ok, num_fail, num_skip, (t2 - t1)/1000); #else sprintf(results, "RESULTS: %d tests (%d ok, %d failed, %d skipped)", total, num_ok, num_fail, num_skip); #endif color_print(color, results); return num_fail; } #endif #endif OpenBLAS-0.2.20/utest/openblas_utest.h000066400000000000000000000035601313527062700175140ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #ifndef _OPENBLAS_UTEST_H_ #define _OPENBLAS_UTEST_H_ #include #include "ctest.h" #include #include #define SINGLE_EPS 1e-04 #define DOUBLE_EPS 1e-13 #endif OpenBLAS-0.2.20/utest/test_amax.c000066400000000000000000000036711313527062700164500ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include "openblas_utest.h" CTEST(amax, samax){ blasint N=3, inc=1; float te_max=0.0, tr_max=0.0; float x[]={-1.1, 2.2, -3.3}; te_max=BLASFUNC(samax)(&N, x, &inc); tr_max=3.3; ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); } OpenBLAS-0.2.20/utest/test_axpy.c000066400000000000000000000073431313527062700165030ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include "common_utest.h" void test_daxpy_inc_0(void) { int i; int N=8,incX=0,incY=0; double a=0.25; double x1[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; double y1[]={2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0}; double x2[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; double y2[]={2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0}; //OpenBLAS BLASFUNC(daxpy)(&N,&a,x1,&incX,y1,&incY); //reference BLASFUNC_REF(daxpy)(&N,&a,x2,&incX,y2,&incY); for(i=0; i void test_zdotu_n_1(void) { int N=1,incX=1,incY=1; double x1[]={1.0,1.0}; double y1[]={1.0,2.0}; double x2[]={1.0,1.0}; double y2[]={1.0,2.0}; double _Complex result1=0.0; double _Complex result2=0.0; //OpenBLAS result1=BLASFUNC(zdotu)(&N,x1,&incX,y1,&incY); //reference result2=BLASFUNC_REF(zdotu)(&N,x2,&incX,y2,&incY); CU_ASSERT_DOUBLE_EQUAL(creal(result1), creal(result2), CHECK_EPS); CU_ASSERT_DOUBLE_EQUAL(cimag(result1), cimag(result2), CHECK_EPS); // printf("\%lf,%lf\n",creal(result1),cimag(result1)); } void test_zdotu_offset_1(void) { int N=1,incX=1,incY=1; double x1[]={1.0,2.0,3.0,4.0}; double y1[]={5.0,6.0,7.0,8.0}; double x2[]={1.0,2.0,3.0,4.0}; double y2[]={5.0,6.0,7.0,8.0}; double _Complex result1=0.0; double _Complex result2=0.0; //OpenBLAS result1=BLASFUNC(zdotu)(&N,x1+1,&incX,y1+1,&incY); //reference result2=BLASFUNC_REF(zdotu)(&N,x2+1,&incX,y2+1,&incY); CU_ASSERT_DOUBLE_EQUAL(creal(result1), creal(result2), CHECK_EPS); CU_ASSERT_DOUBLE_EQUAL(cimag(result1), cimag(result2), CHECK_EPS); // printf("\%lf,%lf\n",creal(result1),cimag(result1)); } OpenBLAS-0.2.20/utest/test_dsdot.c000066400000000000000000000037621313527062700166400ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include "common_utest.h" void test_dsdot_n_1() { float x= 0.172555164; float y= -0.0138700781; int incx=1; int incy=1; int n=1; double res1=0.0f, res2=0.0f; res1=BLASFUNC(dsdot)(&n, &x, &incx, &y, &incy); res2=BLASFUNC_REF(dsdot)(&n, &x, &incx, &y, &incy); CU_ASSERT_DOUBLE_EQUAL(res1, res2, CHECK_EPS); } OpenBLAS-0.2.20/utest/test_fork.c000066400000000000000000000101201313527062700164460ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ #ifndef OS_WINDOWS #include "common_utest.h" #include #include void* xmalloc(size_t n) { void* tmp; tmp = malloc(n); if (tmp == NULL) { fprintf(stderr, "You are about to die\n"); exit(1); } else { return tmp; } } void check_dgemm(double *a, double *b, double *result, double *expected, int n) { int i; cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, a, n, b, n, 0.0, result, n); for(i = 0; i < n * n; ++i) { CU_ASSERT_DOUBLE_EQUAL(expected[i], result[i], CHECK_EPS); } } void test_fork_safety(void) { int n = 1000; int i; double *a, *b, *c, *d; size_t n_bytes; pid_t fork_pid; pid_t fork_pid_nested; n_bytes = sizeof(*a) * n * n; a = xmalloc(n_bytes); b = xmalloc(n_bytes); c = xmalloc(n_bytes); d = xmalloc(n_bytes); // Put ones in a and b for(i = 0; i < n * n; ++i) { a[i] = 1; b[i] = 1; } // Compute a DGEMM product in the parent process prior to forking to // ensure that the OpenBLAS thread pool is initialized. cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, a, n, b, n, 0.0, c, n); fork_pid = fork(); if (fork_pid == -1) { CU_FAIL("Failed to fork process."); } else if (fork_pid == 0) { // Compute a DGEMM product in the child process to check that the // thread pool as been properly been reinitialized after the fork. check_dgemm(a, b, d, c, n); // Nested fork to check that the pthread_atfork protection can work // recursively fork_pid_nested = fork(); if (fork_pid_nested == -1) { CU_FAIL("Failed to fork process."); exit(1); } else if (fork_pid_nested == 0) { check_dgemm(a, b, d, c, n); exit(0); } else { check_dgemm(a, b, d, c, n); int child_status = 0; pid_t wait_pid = wait(&child_status); CU_ASSERT(wait_pid == fork_pid_nested); CU_ASSERT(WEXITSTATUS (child_status) == 0); exit(0); } } else { check_dgemm(a, b, d, c, n); // Wait for the child to finish and check the exit code. int child_status = 0; pid_t wait_pid = wait(&child_status); CU_ASSERT(wait_pid == fork_pid); CU_ASSERT(WEXITSTATUS (child_status) == 0); } } #endif OpenBLAS-0.2.20/utest/test_potrs.c000066400000000000000000000546721313527062700167000ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include "openblas_utest.h" /* void BLASFUNC(cpotrf)(char*, BLASINT*, complex float*, BLASINT*, BLASINT*); void BLASFUNC(zpotrs_(char*, BLASINT*, BLASINT*, complex double*, BLASINT*, complex double*, BLASINT*, BLASINT*); */ //https://github.com/xianyi/OpenBLAS/issues/695 CTEST(potrf, bug_695){ openblas_complex_float A1[100] = { openblas_make_complex_float(5.8525753, +0.0), openblas_make_complex_float(-0.79540455, -0.7066077), openblas_make_complex_float(0.98274714, -1.3824869), openblas_make_complex_float(2.619998, -1.8532984), openblas_make_complex_float(-1.8306153, +1.2336911), openblas_make_complex_float(0.32275113, -0.015575029), openblas_make_complex_float(2.1968813, -1.0640624), openblas_make_complex_float(0.27894387, -0.97911835), openblas_make_complex_float(3.0476584, -0.18548489), openblas_make_complex_float(0.3842994, -0.7050991), openblas_make_complex_float(-0.79540455, +0.7066077), openblas_make_complex_float(8.313246, +0.0), openblas_make_complex_float(-1.8076122, +0.8882447), openblas_make_complex_float(0.47806996, -0.48494184), openblas_make_complex_float(0.5096429, +0.5395974), openblas_make_complex_float(-0.7285097, +0.10360408), openblas_make_complex_float(-1.1760061, +2.7146957), openblas_make_complex_float(-0.4271084, -0.042899966), openblas_make_complex_float(-1.7228563, -2.8335886), openblas_make_complex_float(1.8942566, -0.6389735), openblas_make_complex_float(0.98274714, +1.3824869), openblas_make_complex_float(-1.8076122, -0.8882447), openblas_make_complex_float(9.367975, +0.0), openblas_make_complex_float(-0.1838578, -0.6468568), openblas_make_complex_float(-1.8338387, -0.7064959), openblas_make_complex_float(0.041852742, +0.6556877), openblas_make_complex_float(2.5673025, -1.9732997), openblas_make_complex_float(-1.1148382, +0.15693812), openblas_make_complex_float(2.4704504, +1.0389464), openblas_make_complex_float(1.0858271, +1.298006), openblas_make_complex_float(2.619998, +1.8532984), openblas_make_complex_float(0.47806996, +0.48494184), openblas_make_complex_float(-0.1838578, +0.6468568), openblas_make_complex_float(3.1117508, +0.0), openblas_make_complex_float(-1.956626, -0.22825956), openblas_make_complex_float(0.07081801, +0.31801307), openblas_make_complex_float(0.3698375, +0.5400855), openblas_make_complex_float(0.80686307, -1.5315914), openblas_make_complex_float(1.5649154, +1.6229297), openblas_make_complex_float(-0.112077385, -1.2014246), openblas_make_complex_float(-1.8306153, -1.2336911), openblas_make_complex_float(0.5096429, -0.5395974), openblas_make_complex_float(-1.8338387, +0.7064959), openblas_make_complex_float(-1.956626, +0.22825956), openblas_make_complex_float(3.6439795, +0.0), openblas_make_complex_float(-0.2594722, -0.48786148), openblas_make_complex_float(-0.47636223, +0.27821827), openblas_make_complex_float(-0.61608654, +2.01858), openblas_make_complex_float(-2.7767487, -1.7693765), openblas_make_complex_float(0.048102796, +0.9741874), 
openblas_make_complex_float(0.32275113, +0.015575029), openblas_make_complex_float(-0.7285097, -0.10360408), openblas_make_complex_float(0.041852742, -0.6556877), openblas_make_complex_float(0.07081801, -0.31801307), openblas_make_complex_float(-0.2594722, +0.48786148), openblas_make_complex_float(3.624376, +0.0), openblas_make_complex_float(-1.6697118, -0.4017511), openblas_make_complex_float(-1.4397877, +0.7550918), openblas_make_complex_float(-0.31456697, +1.0403451), openblas_make_complex_float(-0.31978557, -0.13701046), openblas_make_complex_float(2.1968813, +1.0640624), openblas_make_complex_float(-1.1760061, -2.7146957), openblas_make_complex_float(2.5673025, +1.9732997), openblas_make_complex_float(0.3698375, -0.5400855), openblas_make_complex_float(-0.47636223, -0.27821827), openblas_make_complex_float(-1.6697118, +0.4017511), openblas_make_complex_float(6.8273163, +0.0), openblas_make_complex_float(-0.10051322, -0.24303961), openblas_make_complex_float(1.4415971, -0.29750675), openblas_make_complex_float(1.221786, +0.85654986), openblas_make_complex_float(0.27894387, +0.97911835), openblas_make_complex_float(-0.4271084, +0.042899966), openblas_make_complex_float(-1.1148382, -0.15693812), openblas_make_complex_float(0.80686307, +1.5315914), openblas_make_complex_float(-0.61608654, -2.01858), openblas_make_complex_float(-1.4397877, -0.7550918), openblas_make_complex_float(-0.10051322, +0.24303961), openblas_make_complex_float(3.4057708, +0.0), openblas_make_complex_float(-0.5856801, +1.0203559), openblas_make_complex_float(0.7103452, -0.8422135), openblas_make_complex_float(3.0476584, +0.18548489), openblas_make_complex_float(-1.7228563, +2.8335886), openblas_make_complex_float(2.4704504, -1.0389464), openblas_make_complex_float(1.5649154, -1.6229297), openblas_make_complex_float(-2.7767487, +1.7693765), openblas_make_complex_float(-0.31456697, -1.0403451), openblas_make_complex_float(1.4415971, +0.29750675), openblas_make_complex_float(-0.5856801, -1.0203559), openblas_make_complex_float(7.005772, +0.0), openblas_make_complex_float(-0.9617417, +1.2486815), openblas_make_complex_float(0.3842994, +0.7050991), openblas_make_complex_float(1.8942566, +0.6389735), openblas_make_complex_float(1.0858271, -1.298006), openblas_make_complex_float(-0.112077385, +1.2014246), openblas_make_complex_float(0.048102796, -0.9741874), openblas_make_complex_float(-0.31978557, +0.13701046), openblas_make_complex_float(1.221786, -0.85654986), openblas_make_complex_float(0.7103452, +0.8422135), openblas_make_complex_float(-0.9617417, -1.2486815), openblas_make_complex_float(3.4629636, +0.0) }; char up = 'U'; blasint n=10; blasint info[1]; BLASFUNC(cpotrf)(&up, &n, (float*)(A1), &n, info); //printf("%g+%g*I\n", creal(A1[91]), cimag(A1[91])); openblas_complex_double A2[100] = { openblas_make_complex_double(3.0607147216796875, +0.0), openblas_make_complex_double(-0.5905849933624268, -0.29020825028419495), openblas_make_complex_double(0.321084201335907, +0.45168760418891907), openblas_make_complex_double(0.8387917876243591, -0.644718587398529), openblas_make_complex_double(-0.3642411530017853, +0.051274992525577545), openblas_make_complex_double(0.8071482181549072, +0.33944568037986755), openblas_make_complex_double(0.013674172572791576, +0.21422699093818665), openblas_make_complex_double(0.35476258397102356, +0.42408594489097595), openblas_make_complex_double(-0.5991537570953369, -0.23082709312438965), openblas_make_complex_double(-0.0600702166557312, -0.2113417387008667), 
openblas_make_complex_double(-0.7954045534133911, +0.7066076993942261), openblas_make_complex_double(2.807175397872925, +0.0), openblas_make_complex_double(-0.1691000759601593, +0.313548743724823), openblas_make_complex_double(-0.30911174416542053, +0.7447023987770081), openblas_make_complex_double(-0.22347848117351532, +0.03316075727343559), openblas_make_complex_double(-0.4088296890258789, -1.0214389562606812), openblas_make_complex_double(-0.2344931811094284, +0.08056317269802094), openblas_make_complex_double(0.793269693851471, -0.17507623136043549), openblas_make_complex_double(0.03163455054163933, +0.20559945702552795), openblas_make_complex_double(0.13581633567810059, -0.2110036462545395), openblas_make_complex_double(0.9827471375465393, +1.3824869394302368), openblas_make_complex_double(-1.8076121807098389, -0.8882446885108948), openblas_make_complex_double(2.3277781009674072, +0.0), openblas_make_complex_double(0.830405056476593, -0.19296252727508545), openblas_make_complex_double(0.1394239068031311, -0.5260677933692932), openblas_make_complex_double(1.239942193031311, -0.09915469586849213), openblas_make_complex_double(0.06731037050485611, -0.059320636093616486), openblas_make_complex_double(0.11507681757211685, -0.1984301060438156), openblas_make_complex_double(-0.6843825578689575, +0.4647614359855652), openblas_make_complex_double(1.213119387626648, -0.7757048010826111), openblas_make_complex_double(2.619997978210449, +1.8532984256744385), openblas_make_complex_double(0.4780699610710144, +0.48494184017181396), openblas_make_complex_double(-0.18385779857635498, +0.6468567848205566), openblas_make_complex_double(2.0811400413513184, +0.0), openblas_make_complex_double(-0.035075582563877106, +0.09732913225889206), openblas_make_complex_double(0.27337002754211426, -0.9032229781150818), openblas_make_complex_double(-0.8374675512313843, +0.0479498989880085), openblas_make_complex_double(0.6916252374649048, +0.45711082220077515), openblas_make_complex_double(0.1883818507194519, +0.06482727080583572), openblas_make_complex_double(-0.32384994626045227, +0.05857187137007713), openblas_make_complex_double(-1.8306152820587158, -1.2336910963058472), openblas_make_complex_double(0.5096428990364075, -0.5395973920822144), openblas_make_complex_double(-1.833838701248169, +0.7064958810806274), openblas_make_complex_double(-1.956626057624817, +0.22825956344604492), openblas_make_complex_double(1.706615924835205, +0.0), openblas_make_complex_double(-0.2895336151123047, +0.17579378187656403), openblas_make_complex_double(-0.923172116279602, -0.4530014097690582), openblas_make_complex_double(0.5040621757507324, -0.37026339769363403), openblas_make_complex_double(-0.2824432849884033, -1.0374568700790405), openblas_make_complex_double(0.1399831622838974, +0.4977008104324341), openblas_make_complex_double(0.32275113463401794, +0.015575028955936432), openblas_make_complex_double(-0.7285097241401672, -0.10360407829284668), openblas_make_complex_double(0.041852742433547974, -0.655687689781189), openblas_make_complex_double(0.07081800699234009, -0.318013072013855), openblas_make_complex_double(-0.25947219133377075, +0.4878614842891693), openblas_make_complex_double(1.5735365152359009, +0.0), openblas_make_complex_double(-0.2647853195667267, -0.26654252409935), openblas_make_complex_double(-0.6190430521965027, -0.24699924886226654), openblas_make_complex_double(-0.6288471221923828, +0.48154571652412415), openblas_make_complex_double(0.02446540631353855, -0.2611822783946991), 
openblas_make_complex_double(2.1968812942504883, +1.0640623569488525), openblas_make_complex_double(-1.1760060787200928, -2.714695692062378), openblas_make_complex_double(2.5673024654388428, +1.9732997417449951), openblas_make_complex_double(0.3698374927043915, -0.54008549451828), openblas_make_complex_double(-0.4763622283935547, -0.27821826934814453), openblas_make_complex_double(-1.6697118282318115, +0.4017511010169983), openblas_make_complex_double(1.2674795389175415, +0.0), openblas_make_complex_double(0.3079095482826233, -0.07258892804384232), openblas_make_complex_double(-0.5929520130157471, -0.038360968232154846), openblas_make_complex_double(0.04388086497783661, -0.025549031794071198), openblas_make_complex_double(0.27894386649131775, +0.9791183471679688), openblas_make_complex_double(-0.42710840702056885, +0.0428999662399292), openblas_make_complex_double(-1.1148382425308228, -0.1569381207227707), openblas_make_complex_double(0.8068630695343018, +1.5315914154052734), openblas_make_complex_double(-0.6160865426063538, -2.0185799598693848), openblas_make_complex_double(-1.439787745475769, -0.7550917863845825), openblas_make_complex_double(-0.10051321983337402, +0.24303960800170898), openblas_make_complex_double(0.9066106081008911, +0.0), openblas_make_complex_double(0.05315789580345154, -0.06136537343263626), openblas_make_complex_double(-0.21304509043693542, +0.6494344472885132), openblas_make_complex_double(3.0476584434509277, +0.1854848861694336), openblas_make_complex_double(-1.7228562831878662, +2.8335886001586914), openblas_make_complex_double(2.4704504013061523, -1.0389463901519775), openblas_make_complex_double(1.564915418624878, -1.6229296922683716), openblas_make_complex_double(-2.7767486572265625, +1.769376516342163), openblas_make_complex_double(-0.314566969871521, -1.0403450727462769), openblas_make_complex_double(1.4415971040725708, +0.29750674962997437), openblas_make_complex_double(-0.5856801271438599, -1.0203559398651123), openblas_make_complex_double(0.5668219923973083, +0.0), openblas_make_complex_double(0.033351436257362366, -0.07832501083612442), openblas_make_complex_double(0.3842993974685669, +0.7050991058349609), openblas_make_complex_double(1.894256591796875, +0.6389734745025635), openblas_make_complex_double(1.085827112197876, -1.2980060577392578), openblas_make_complex_double(-0.11207738518714905, +1.2014245986938477), openblas_make_complex_double(0.04810279607772827, -0.9741873741149902), openblas_make_complex_double(-0.31978556513786316, +0.13701045513153076), openblas_make_complex_double(1.2217860221862793, -0.856549859046936), openblas_make_complex_double(0.7103452086448669, +0.84221351146698), openblas_make_complex_double(-0.9617416858673096, -1.2486815452575684), openblas_make_complex_double(0.0756804421544075, +0.0) }; openblas_complex_double B[20] = { openblas_make_complex_double(-0.21782716937787788, -0.9222220085490986), openblas_make_complex_double(-0.7620356655676837, +0.15533508334193666), openblas_make_complex_double(-0.905011814118756, +0.2847570854574069), openblas_make_complex_double(-0.3451346708401685, +1.076948486041297), openblas_make_complex_double(0.25336108035924787, +0.975317836492159), openblas_make_complex_double(0.11192755545114, -0.1603741874112385), openblas_make_complex_double(-0.20604111555491242, +0.10570814584017311), openblas_make_complex_double(-1.0568488936791578, -0.06025820467086475), openblas_make_complex_double(-0.6650468984506477, -0.5000967284800251), openblas_make_complex_double(-1.0509472322215125, 
+0.5022165705328413), openblas_make_complex_double(-0.727775859267237, +0.50638268521728), openblas_make_complex_double(0.39947219167701153, -0.4576746001199889), openblas_make_complex_double(-0.7122162951294634, -0.630289556702497), openblas_make_complex_double(0.9870834574024372, -0.2825689605519449), openblas_make_complex_double(0.0628393808469436, -0.1253397353973715), openblas_make_complex_double(0.8439562576196216, +1.0850814110398734), openblas_make_complex_double(0.562377322638969, -0.2578030745663871), openblas_make_complex_double(0.12696236014017806, -0.09853584666755086), openblas_make_complex_double(-0.023682508769195098, +0.18093440285319276), openblas_make_complex_double(-0.7264975746431271, +0.31670415674097235) }; char lo = 'L'; blasint nrhs = 2; BLASFUNC(zpotrs)(&lo, &n, &nrhs, (double*)(A2), &n, (double*)(B), &n, info); // note that this is exactly equal to A1 openblas_complex_float A3[100] = { openblas_make_complex_float(5.8525753, +0.0), openblas_make_complex_float(-0.79540455, -0.7066077), openblas_make_complex_float(0.98274714, -1.3824869), openblas_make_complex_float(2.619998, -1.8532984), openblas_make_complex_float(-1.8306153, +1.2336911), openblas_make_complex_float(0.32275113, -0.015575029), openblas_make_complex_float(2.1968813, -1.0640624), openblas_make_complex_float(0.27894387, -0.97911835), openblas_make_complex_float(3.0476584, -0.18548489), openblas_make_complex_float(0.3842994, -0.7050991), openblas_make_complex_float(-0.79540455, +0.7066077), openblas_make_complex_float(8.313246, +0.0), openblas_make_complex_float(-1.8076122, +0.8882447), openblas_make_complex_float(0.47806996, -0.48494184), openblas_make_complex_float(0.5096429, +0.5395974), openblas_make_complex_float(-0.7285097, +0.10360408), openblas_make_complex_float(-1.1760061, +2.7146957), openblas_make_complex_float(-0.4271084, -0.042899966), openblas_make_complex_float(-1.7228563, -2.8335886), openblas_make_complex_float(1.8942566, -0.6389735), openblas_make_complex_float(0.98274714, +1.3824869), openblas_make_complex_float(-1.8076122, -0.8882447), openblas_make_complex_float(9.367975, +0.0), openblas_make_complex_float(-0.1838578, -0.6468568), openblas_make_complex_float(-1.8338387, -0.7064959), openblas_make_complex_float(0.041852742, +0.6556877), openblas_make_complex_float(2.5673025, -1.9732997), openblas_make_complex_float(-1.1148382, +0.15693812), openblas_make_complex_float(2.4704504, +1.0389464), openblas_make_complex_float(1.0858271, +1.298006), openblas_make_complex_float(2.619998, +1.8532984), openblas_make_complex_float(0.47806996, +0.48494184), openblas_make_complex_float(-0.1838578, +0.6468568), openblas_make_complex_float(3.1117508, +0.0), openblas_make_complex_float(-1.956626, -0.22825956), openblas_make_complex_float(0.07081801, +0.31801307), openblas_make_complex_float(0.3698375, +0.5400855), openblas_make_complex_float(0.80686307, -1.5315914), openblas_make_complex_float(1.5649154, +1.6229297), openblas_make_complex_float(-0.112077385, -1.2014246), openblas_make_complex_float(-1.8306153, -1.2336911), openblas_make_complex_float(0.5096429, -0.5395974), openblas_make_complex_float(-1.8338387, +0.7064959), openblas_make_complex_float(-1.956626, +0.22825956), openblas_make_complex_float(3.6439795, +0.0), openblas_make_complex_float(-0.2594722, -0.48786148), openblas_make_complex_float(-0.47636223, +0.27821827), openblas_make_complex_float(-0.61608654, +2.01858), openblas_make_complex_float(-2.7767487, -1.7693765), openblas_make_complex_float(0.048102796, +0.9741874), 
openblas_make_complex_float(0.32275113, +0.015575029), openblas_make_complex_float(-0.7285097, -0.10360408), openblas_make_complex_float(0.041852742, -0.6556877), openblas_make_complex_float(0.07081801, -0.31801307), openblas_make_complex_float(-0.2594722, +0.48786148), openblas_make_complex_float(3.624376, +0.0), openblas_make_complex_float(-1.6697118, -0.4017511), openblas_make_complex_float(-1.4397877, +0.7550918), openblas_make_complex_float(-0.31456697, +1.0403451), openblas_make_complex_float(-0.31978557, -0.13701046), openblas_make_complex_float(2.1968813, +1.0640624), openblas_make_complex_float(-1.1760061, -2.7146957), openblas_make_complex_float(2.5673025, +1.9732997), openblas_make_complex_float(0.3698375, -0.5400855), openblas_make_complex_float(-0.47636223, -0.27821827), openblas_make_complex_float(-1.6697118, +0.4017511), openblas_make_complex_float(6.8273163, +0.0), openblas_make_complex_float(-0.10051322, -0.24303961), openblas_make_complex_float(1.4415971, -0.29750675), openblas_make_complex_float(1.221786, +0.85654986), openblas_make_complex_float(0.27894387, +0.97911835), openblas_make_complex_float(-0.4271084, +0.042899966), openblas_make_complex_float(-1.1148382, -0.15693812), openblas_make_complex_float(0.80686307, +1.5315914), openblas_make_complex_float(-0.61608654, -2.01858), openblas_make_complex_float(-1.4397877, -0.7550918), openblas_make_complex_float(-0.10051322, +0.24303961), openblas_make_complex_float(3.4057708, +0.0), openblas_make_complex_float(-0.5856801, +1.0203559), openblas_make_complex_float(0.7103452, -0.8422135), openblas_make_complex_float(3.0476584, +0.18548489), openblas_make_complex_float(-1.7228563, +2.8335886), openblas_make_complex_float(2.4704504, -1.0389464), openblas_make_complex_float(1.5649154, -1.6229297), openblas_make_complex_float(-2.7767487, +1.7693765), openblas_make_complex_float(-0.31456697, -1.0403451), openblas_make_complex_float(1.4415971, +0.29750675), openblas_make_complex_float(-0.5856801, -1.0203559), openblas_make_complex_float(7.005772, +0.0), openblas_make_complex_float(-0.9617417, +1.2486815), openblas_make_complex_float(0.3842994, +0.7050991), openblas_make_complex_float(1.8942566, +0.6389735), openblas_make_complex_float(1.0858271, -1.298006), openblas_make_complex_float(-0.112077385, +1.2014246), openblas_make_complex_float(0.048102796, -0.9741874), openblas_make_complex_float(-0.31978557, +0.13701046), openblas_make_complex_float(1.221786, -0.85654986), openblas_make_complex_float(0.7103452, +0.8422135), openblas_make_complex_float(-0.9617417, -1.2486815), openblas_make_complex_float(3.4629636, +0.0) }; BLASFUNC(cpotrf)(&up, &n, (float*)(A3), &n, info); // printf("%g+%g*I\n", creal(A3[91]), cimag(A3[91])); if(isnan(CREAL(A3[91])) || isnan(CIMAG(A3[91]))) { CTEST_ERR("%s:%d got NaN", __FILE__, __LINE__); } } OpenBLAS-0.2.20/utest/test_rot.c000066400000000000000000000072071313527062700163250ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include "common_utest.h" void test_drot_inc_0(void) { int i=0; int N=4,incX=0,incY=0; double c=0.25,s=0.5; double x1[]={1.0,3.0,5.0,7.0}; double y1[]={2.0,4.0,6.0,8.0}; double x2[]={1.0,3.0,5.0,7.0}; double y2[]={2.0,4.0,6.0,8.0}; //OpenBLAS BLASFUNC(drot)(&N,x1,&incX,y1,&incY,&c,&s); //reference BLASFUNC_REF(drot)(&N,x2,&incX,y2,&incY,&c,&s); for(i=0; i #define CTEST_MAIN #define CTEST_SEGFAULT #include "openblas_utest.h" int main(int argc, const char ** argv){ int num_fail=0; num_fail=ctest_main(argc, argv); return num_fail; } OpenBLAS-0.2.20/version.h000066400000000000000000000105441313527062700150060ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #ifndef VERSION_H #define VERSION_H #define VERSION " OpenBLAS" #endif
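/*
 * Editor's note -- hedged usage sketch, not part of the original tree: the
 * VERSION string defined above is compiled into the library and appears in
 * the banner returned at run time by openblas_get_config().  The fragment
 * below is one way to inspect it; the file name "show_config.c" is
 * hypothetical, and it assumes the cblas.h header and libopenblas built
 * from this source tree.  It is wrapped in "#if 0" so version.h itself is
 * unaffected.
 */
#if 0
/* show_config.c: cc show_config.c -lopenblas */
#include <stdio.h>
#include <cblas.h>                    /* declares openblas_get_config() */

int main(void)
{
    /* Prints something like " OpenBLAS 0.2.20 ..." together with the
     * detected core and threading options, depending on the build.     */
    printf("%s\n", openblas_get_config());
    return 0;
}
#endif /* illustrative example */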